From 1680ac7a5ad578d7acf819912557ceea4b4a5a88 Mon Sep 17 00:00:00 2001
From: Dmitry Torokhov <dmitry.torokhov@gmail.com>
Date: Fri, 7 Jul 2023 16:18:29 -0700
Subject: Input: gameport - use IS_REACHABLE() instead of open-coding it

Replace an open-coded preprocessor conditional with an equivalent helper.

Reviewed-by: Randy Dunlap <rdunlap@infradead.org>
Link: https://lore.kernel.org/r/ZKYLLmsdCH0Gp7TO@google.com
Signed-off-by: Dmitry Torokhov <dmitry.torokhov@gmail.com>
---
 include/linux/gameport.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/gameport.h b/include/linux/gameport.h
index 0a221e768ea4..07e370113b2b 100644
--- a/include/linux/gameport.h
+++ b/include/linux/gameport.h
@@ -63,7 +63,7 @@ struct gameport_driver {
 int gameport_open(struct gameport *gameport, struct gameport_driver *drv, int mode);
 void gameport_close(struct gameport *gameport);
 
-#if defined(CONFIG_GAMEPORT) || (defined(MODULE) && defined(CONFIG_GAMEPORT_MODULE))
+#if IS_REACHABLE(CONFIG_GAMEPORT)
 
 void __gameport_register_port(struct gameport *gameport, struct module *owner);
 /* use a define to avoid include chaining to get THIS_MODULE */
-- 
cgit v1.2.3


From 8ce49c2a2aa53afde9a20a8ce02b069d3b262af0 Mon Sep 17 00:00:00 2001
From: Deepak Kumar Singh <quic_deesin@quicinc.com>
Date: Fri, 7 Jul 2023 03:11:36 +0530
Subject: rpmsg: core: Add signal API support

Some transports like Glink support the state notifications between
clients using flow control signals similar to serial protocol signals.
Local glink client drivers can send and receive flow control status
to glink clients running on remote processors.

Add APIs to support sending and receiving of flow control status by
rpmsg clients.

Signed-off-by: Deepak Kumar Singh <quic_deesin@quicinc.com>
Signed-off-by: Sarannya S <quic_sarannya@quicinc.com>
Acked-by: Arnaud Pouliquen <arnaud.pouliquen@foss.st.com>
Link: https://lore.kernel.org/r/1688679698-31274-2-git-send-email-quic_sarannya@quicinc.com
Signed-off-by: Bjorn Andersson <andersson@kernel.org>
---
 drivers/rpmsg/rpmsg_core.c     | 21 +++++++++++++++++++++
 drivers/rpmsg/rpmsg_internal.h |  2 ++
 include/linux/rpmsg.h          | 15 +++++++++++++++
 3 files changed, 38 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/rpmsg/rpmsg_core.c b/drivers/rpmsg/rpmsg_core.c
index 5039df757127..32b550c91d9f 100644
--- a/drivers/rpmsg/rpmsg_core.c
+++ b/drivers/rpmsg/rpmsg_core.c
@@ -330,6 +330,25 @@ int rpmsg_trysend_offchannel(struct rpmsg_endpoint *ept, u32 src, u32 dst,
 }
 EXPORT_SYMBOL(rpmsg_trysend_offchannel);
 
+/**
+ * rpmsg_set_flow_control() - request remote to pause/resume transmission
+ * @ept:	the rpmsg endpoint
+ * @pause:	pause transmission
+ * @dst:	destination address of the endpoint
+ *
+ * Return: 0 on success and an appropriate error value on failure.
+ */
+int rpmsg_set_flow_control(struct rpmsg_endpoint *ept, bool pause, u32 dst)
+{
+	if (WARN_ON(!ept))
+		return -EINVAL;
+	if (!ept->ops->set_flow_control)
+		return -EOPNOTSUPP;
+
+	return ept->ops->set_flow_control(ept, pause, dst);
+}
+EXPORT_SYMBOL_GPL(rpmsg_set_flow_control);
+
 /**
  * rpmsg_get_mtu() - get maximum transmission buffer size for sending message.
  * @ept: the rpmsg endpoint
@@ -539,6 +558,8 @@ static int rpmsg_dev_probe(struct device *dev)
 
 		rpdev->ept = ept;
 		rpdev->src = ept->addr;
+
+		ept->flow_cb = rpdrv->flowcontrol;
 	}
 
 	err = rpdrv->probe(rpdev);
diff --git a/drivers/rpmsg/rpmsg_internal.h b/drivers/rpmsg/rpmsg_internal.h
index 39b646d0d40d..b950d6f790a3 100644
--- a/drivers/rpmsg/rpmsg_internal.h
+++ b/drivers/rpmsg/rpmsg_internal.h
@@ -55,6 +55,7 @@ struct rpmsg_device_ops {
  * @trysendto:		see @rpmsg_trysendto(), optional
  * @trysend_offchannel:	see @rpmsg_trysend_offchannel(), optional
  * @poll:		see @rpmsg_poll(), optional
+ * @set_flow_control:	see @rpmsg_set_flow_control(), optional
  * @get_mtu:		see @rpmsg_get_mtu(), optional
  *
  * Indirection table for the operations that a rpmsg backend should implement.
@@ -75,6 +76,7 @@ struct rpmsg_endpoint_ops {
 			     void *data, int len);
 	__poll_t (*poll)(struct rpmsg_endpoint *ept, struct file *filp,
 			     poll_table *wait);
+	int (*set_flow_control)(struct rpmsg_endpoint *ept, bool pause, u32 dst);
 	ssize_t (*get_mtu)(struct rpmsg_endpoint *ept);
 };
 
diff --git a/include/linux/rpmsg.h b/include/linux/rpmsg.h
index 523c98b96cb4..90d8e4475f80 100644
--- a/include/linux/rpmsg.h
+++ b/include/linux/rpmsg.h
@@ -64,12 +64,14 @@ struct rpmsg_device {
 };
 
 typedef int (*rpmsg_rx_cb_t)(struct rpmsg_device *, void *, int, void *, u32);
+typedef int (*rpmsg_flowcontrol_cb_t)(struct rpmsg_device *, void *, bool);
 
 /**
  * struct rpmsg_endpoint - binds a local rpmsg address to its user
  * @rpdev: rpmsg channel device
  * @refcount: when this drops to zero, the ept is deallocated
  * @cb: rx callback handler
+ * @flow_cb: remote flow control callback handler
  * @cb_lock: must be taken before accessing/changing @cb
  * @addr: local rpmsg address
  * @priv: private data for the driver's use
@@ -92,6 +94,7 @@ struct rpmsg_endpoint {
 	struct rpmsg_device *rpdev;
 	struct kref refcount;
 	rpmsg_rx_cb_t cb;
+	rpmsg_flowcontrol_cb_t flow_cb;
 	struct mutex cb_lock;
 	u32 addr;
 	void *priv;
@@ -106,6 +109,7 @@ struct rpmsg_endpoint {
  * @probe: invoked when a matching rpmsg channel (i.e. device) is found
  * @remove: invoked when the rpmsg channel is removed
  * @callback: invoked when an inbound message is received on the channel
+ * @flowcontrol: invoked when remote side flow control request is received
  */
 struct rpmsg_driver {
 	struct device_driver drv;
@@ -113,6 +117,7 @@ struct rpmsg_driver {
 	int (*probe)(struct rpmsg_device *dev);
 	void (*remove)(struct rpmsg_device *dev);
 	int (*callback)(struct rpmsg_device *, void *, int, void *, u32);
+	int (*flowcontrol)(struct rpmsg_device *, void *, bool);
 };
 
 static inline u16 rpmsg16_to_cpu(struct rpmsg_device *rpdev, __rpmsg16 val)
@@ -192,6 +197,8 @@ __poll_t rpmsg_poll(struct rpmsg_endpoint *ept, struct file *filp,
 
 ssize_t rpmsg_get_mtu(struct rpmsg_endpoint *ept);
 
+int rpmsg_set_flow_control(struct rpmsg_endpoint *ept, bool pause, u32 dst);
+
 #else
 
 static inline int rpmsg_register_device_override(struct rpmsg_device *rpdev,
@@ -316,6 +323,14 @@ static inline ssize_t rpmsg_get_mtu(struct rpmsg_endpoint *ept)
 	return -ENXIO;
 }
 
+static inline int rpmsg_set_flow_control(struct rpmsg_endpoint *ept, bool pause, u32 dst)
+{
+	/* This shouldn't be possible */
+	WARN_ON(1);
+
+	return -ENXIO;
+}
+
 #endif /* IS_ENABLED(CONFIG_RPMSG) */
 
 /* use a macro to avoid include chaining to get THIS_MODULE */
-- 
cgit v1.2.3


From f247f08da0ce822de0d6b2feec811dd6d4d599ce Mon Sep 17 00:00:00 2001
From: Siddharth Gupta <sidgup@codeaurora.org>
Date: Fri, 24 Feb 2023 13:17:06 -0800
Subject: remoteproc: core: Export the rproc coredump APIs

The remoteproc coredump APIs are currently only part of the internal
remoteproc header. This prevents the remoteproc platform drivers from
using these APIs when needed. This change moves the rproc_coredump()
and rproc_coredump_cleanup() APIs to the linux header and marks them
as exported symbols.

Signed-off-by: Siddharth Gupta <sidgup@codeaurora.org>
Signed-off-by: Gokul krishna Krishnakumar <quic_gokukris@quicinc.com>
Link: https://lore.kernel.org/r/20230224211707.30916-2-quic_gokukris@quicinc.com
Signed-off-by: Bjorn Andersson <andersson@kernel.org>
---
 drivers/remoteproc/remoteproc_coredump.c | 2 ++
 drivers/remoteproc/remoteproc_internal.h | 4 ----
 include/linux/remoteproc.h               | 4 ++++
 3 files changed, 6 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/remoteproc/remoteproc_coredump.c b/drivers/remoteproc/remoteproc_coredump.c
index bc0e1603a7a3..6ede8c0c93ad 100644
--- a/drivers/remoteproc/remoteproc_coredump.c
+++ b/drivers/remoteproc/remoteproc_coredump.c
@@ -32,6 +32,7 @@ void rproc_coredump_cleanup(struct rproc *rproc)
 		kfree(entry);
 	}
 }
+EXPORT_SYMBOL_GPL(rproc_coredump_cleanup);
 
 /**
  * rproc_coredump_add_segment() - add segment of device memory to coredump
@@ -327,6 +328,7 @@ void rproc_coredump(struct rproc *rproc)
 	 */
 	wait_for_completion(&dump_state.dump_done);
 }
+EXPORT_SYMBOL_GPL(rproc_coredump);
 
 /**
  * rproc_coredump_using_sections() - perform coredump using section headers
diff --git a/drivers/remoteproc/remoteproc_internal.h b/drivers/remoteproc/remoteproc_internal.h
index d4dbb8d1d80c..f62a82d71dfa 100644
--- a/drivers/remoteproc/remoteproc_internal.h
+++ b/drivers/remoteproc/remoteproc_internal.h
@@ -76,10 +76,6 @@ extern struct class rproc_class;
 int rproc_init_sysfs(void);
 void rproc_exit_sysfs(void);
 
-/* from remoteproc_coredump.c */
-void rproc_coredump_cleanup(struct rproc *rproc);
-void rproc_coredump(struct rproc *rproc);
-
 #ifdef CONFIG_REMOTEPROC_CDEV
 void rproc_init_cdev(void);
 void rproc_exit_cdev(void);
diff --git a/include/linux/remoteproc.h b/include/linux/remoteproc.h
index fe8978eb69f1..b4795698d8c2 100644
--- a/include/linux/remoteproc.h
+++ b/include/linux/remoteproc.h
@@ -690,6 +690,10 @@ int rproc_detach(struct rproc *rproc);
 int rproc_set_firmware(struct rproc *rproc, const char *fw_name);
 void rproc_report_crash(struct rproc *rproc, enum rproc_crash_type type);
 void *rproc_da_to_va(struct rproc *rproc, u64 da, size_t len, bool *is_iomem);
+
+/* from remoteproc_coredump.c */
+void rproc_coredump_cleanup(struct rproc *rproc);
+void rproc_coredump(struct rproc *rproc);
 void rproc_coredump_using_sections(struct rproc *rproc);
 int rproc_coredump_add_segment(struct rproc *rproc, dma_addr_t da, size_t size);
 int rproc_coredump_add_custom_segment(struct rproc *rproc,
-- 
cgit v1.2.3


From b4f78ff746ec5274fffa92fa2a4dc531360b5016 Mon Sep 17 00:00:00 2001
From: Uwe Kleine-König <u.kleine-koenig@pengutronix.de>
Date: Fri, 14 Jul 2023 22:56:14 +0200
Subject: pwm: Use a consistent name for pwm_chip pointers in the core
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Most variables of type struct pwm_chip * are named "chip", there are
only three outliers called "pc". Change these three to "chip", too, for
consistency.

Signed-off-by: Uwe Kleine-König <u.kleine-koenig@pengutronix.de>
Signed-off-by: Thierry Reding <thierry.reding@gmail.com>
---
 drivers/pwm/core.c  | 28 ++++++++++++++--------------
 include/linux/pwm.h |  6 +++---
 2 files changed, 17 insertions(+), 17 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/pwm/core.c b/drivers/pwm/core.c
index 3dacceaef4a9..8c798753c016 100644
--- a/drivers/pwm/core.c
+++ b/drivers/pwm/core.c
@@ -127,28 +127,28 @@ static int pwm_device_request(struct pwm_device *pwm, const char *label)
 }
 
 struct pwm_device *
-of_pwm_xlate_with_flags(struct pwm_chip *pc, const struct of_phandle_args *args)
+of_pwm_xlate_with_flags(struct pwm_chip *chip, const struct of_phandle_args *args)
 {
 	struct pwm_device *pwm;
 
-	if (pc->of_pwm_n_cells < 2)
+	if (chip->of_pwm_n_cells < 2)
 		return ERR_PTR(-EINVAL);
 
 	/* flags in the third cell are optional */
 	if (args->args_count < 2)
 		return ERR_PTR(-EINVAL);
 
-	if (args->args[0] >= pc->npwm)
+	if (args->args[0] >= chip->npwm)
 		return ERR_PTR(-EINVAL);
 
-	pwm = pwm_request_from_chip(pc, args->args[0], NULL);
+	pwm = pwm_request_from_chip(chip, args->args[0], NULL);
 	if (IS_ERR(pwm))
 		return pwm;
 
 	pwm->args.period = args->args[1];
 	pwm->args.polarity = PWM_POLARITY_NORMAL;
 
-	if (pc->of_pwm_n_cells >= 3) {
+	if (chip->of_pwm_n_cells >= 3) {
 		if (args->args_count > 2 && args->args[2] & PWM_POLARITY_INVERTED)
 			pwm->args.polarity = PWM_POLARITY_INVERSED;
 	}
@@ -158,18 +158,18 @@ of_pwm_xlate_with_flags(struct pwm_chip *pc, const struct of_phandle_args *args)
 EXPORT_SYMBOL_GPL(of_pwm_xlate_with_flags);
 
 struct pwm_device *
-of_pwm_single_xlate(struct pwm_chip *pc, const struct of_phandle_args *args)
+of_pwm_single_xlate(struct pwm_chip *chip, const struct of_phandle_args *args)
 {
 	struct pwm_device *pwm;
 
-	if (pc->of_pwm_n_cells < 1)
+	if (chip->of_pwm_n_cells < 1)
 		return ERR_PTR(-EINVAL);
 
 	/* validate that one cell is specified, optionally with flags */
 	if (args->args_count != 1 && args->args_count != 2)
 		return ERR_PTR(-EINVAL);
 
-	pwm = pwm_request_from_chip(pc, 0, NULL);
+	pwm = pwm_request_from_chip(chip, 0, NULL);
 	if (IS_ERR(pwm))
 		return pwm;
 
@@ -692,7 +692,7 @@ static struct pwm_device *of_pwm_get(struct device *dev, struct device_node *np,
 	struct pwm_device *pwm = NULL;
 	struct of_phandle_args args;
 	struct device_link *dl;
-	struct pwm_chip *pc;
+	struct pwm_chip *chip;
 	int index = 0;
 	int err;
 
@@ -709,16 +709,16 @@ static struct pwm_device *of_pwm_get(struct device *dev, struct device_node *np,
 		return ERR_PTR(err);
 	}
 
-	pc = fwnode_to_pwmchip(of_fwnode_handle(args.np));
-	if (IS_ERR(pc)) {
-		if (PTR_ERR(pc) != -EPROBE_DEFER)
+	chip = fwnode_to_pwmchip(of_fwnode_handle(args.np));
+	if (IS_ERR(chip)) {
+		if (PTR_ERR(chip) != -EPROBE_DEFER)
 			pr_err("%s(): PWM chip not found\n", __func__);
 
-		pwm = ERR_CAST(pc);
+		pwm = ERR_CAST(chip);
 		goto put;
 	}
 
-	pwm = pc->of_xlate(pc, &args);
+	pwm = chip->of_xlate(chip, &args);
 	if (IS_ERR(pwm))
 		goto put;
 
diff --git a/include/linux/pwm.h b/include/linux/pwm.h
index 04ae1d9073a7..d2f9f690a9c1 100644
--- a/include/linux/pwm.h
+++ b/include/linux/pwm.h
@@ -298,7 +298,7 @@ struct pwm_chip {
 	int base;
 	unsigned int npwm;
 
-	struct pwm_device * (*of_xlate)(struct pwm_chip *pc,
+	struct pwm_device * (*of_xlate)(struct pwm_chip *chip,
 					const struct of_phandle_args *args);
 	unsigned int of_pwm_n_cells;
 
@@ -395,9 +395,9 @@ struct pwm_device *pwm_request_from_chip(struct pwm_chip *chip,
 					 unsigned int index,
 					 const char *label);
 
-struct pwm_device *of_pwm_xlate_with_flags(struct pwm_chip *pc,
+struct pwm_device *of_pwm_xlate_with_flags(struct pwm_chip *chip,
 		const struct of_phandle_args *args);
-struct pwm_device *of_pwm_single_xlate(struct pwm_chip *pc,
+struct pwm_device *of_pwm_single_xlate(struct pwm_chip *chip,
 				       const struct of_phandle_args *args);
 
 struct pwm_device *pwm_get(struct device *dev, const char *con_id);
-- 
cgit v1.2.3


From 9e70a5e109a4a23367810de09be826c52d27ee2f Mon Sep 17 00:00:00 2001
From: John Ogness <john.ogness@linutronix.de>
Date: Mon, 17 Jul 2023 21:52:06 +0206
Subject: printk: Add per-console suspended state

Currently the global @console_suspended is used to determine if
consoles are in a suspended state. Its primary purpose is to allow
usage of the console_lock when suspended without causing console
printing. It is synchronized by the console_lock.

Rather than relying on the console_lock to determine suspended
state, make it an official per-console state that is set within
console->flags. This allows the state to be queried via SRCU.

Remove @console_suspended. Console printing will still be avoided
when suspended because console_is_usable() returns false when
the new suspended flag is set for that console.

Signed-off-by: John Ogness <john.ogness@linutronix.de>
Reviewed-by: Sergey Senozhatsky <senozhatsky@chromium.org>
Reviewed-by: Petr Mladek <pmladek@suse.com>
Signed-off-by: Petr Mladek <pmladek@suse.com>
Link: https://lore.kernel.org/r/20230717194607.145135-7-john.ogness@linutronix.de
---
 include/linux/console.h |  3 ++
 kernel/printk/printk.c  | 74 +++++++++++++++++++++++++++++--------------------
 2 files changed, 47 insertions(+), 30 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/console.h b/include/linux/console.h
index d3195664baa5..7de11c763eb3 100644
--- a/include/linux/console.h
+++ b/include/linux/console.h
@@ -154,6 +154,8 @@ static inline int con_debug_leave(void)
  *			receiving the printk spam for obvious reasons.
  * @CON_EXTENDED:	The console supports the extended output format of
  *			/dev/kmesg which requires a larger output buffer.
+ * @CON_SUSPENDED:	Indicates if a console is suspended. If true, the
+ *			printing callbacks must not be called.
  */
 enum cons_flags {
 	CON_PRINTBUFFER		= BIT(0),
@@ -163,6 +165,7 @@ enum cons_flags {
 	CON_ANYTIME		= BIT(4),
 	CON_BRL			= BIT(5),
 	CON_EXTENDED		= BIT(6),
+	CON_SUSPENDED		= BIT(7),
 };
 
 /**
diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
index 6e853a1441a7..efe577477913 100644
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -86,7 +86,7 @@ EXPORT_SYMBOL(oops_in_progress);
 static DEFINE_MUTEX(console_mutex);
 
 /*
- * console_sem protects updates to console->seq and console_suspended,
+ * console_sem protects updates to console->seq
  * and also provides serialization for console printing.
  */
 static DEFINE_SEMAPHORE(console_sem);
@@ -359,7 +359,7 @@ static bool panic_in_progress(void)
  * paths in the console code where we end up in places I want
  * locked without the console semaphore held).
  */
-static int console_locked, console_suspended;
+static int console_locked;
 
 /*
  *	Array of consoles built from command line options (console=)
@@ -2549,22 +2549,46 @@ MODULE_PARM_DESC(console_no_auto_verbose, "Disable console loglevel raise to hig
  */
 void suspend_console(void)
 {
+	struct console *con;
+
 	if (!console_suspend_enabled)
 		return;
 	pr_info("Suspending console(s) (use no_console_suspend to debug)\n");
 	pr_flush(1000, true);
-	console_lock();
-	console_suspended = 1;
-	up_console_sem();
+
+	console_list_lock();
+	for_each_console(con)
+		console_srcu_write_flags(con, con->flags | CON_SUSPENDED);
+	console_list_unlock();
+
+	/*
+	 * Ensure that all SRCU list walks have completed. All printing
+	 * contexts must be able to see that they are suspended so that it
+	 * is guaranteed that all printing has stopped when this function
+	 * completes.
+	 */
+	synchronize_srcu(&console_srcu);
 }
 
 void resume_console(void)
 {
+	struct console *con;
+
 	if (!console_suspend_enabled)
 		return;
-	down_console_sem();
-	console_suspended = 0;
-	console_unlock();
+
+	console_list_lock();
+	for_each_console(con)
+		console_srcu_write_flags(con, con->flags & ~CON_SUSPENDED);
+	console_list_unlock();
+
+	/*
+	 * Ensure that all SRCU list walks have completed. All printing
+	 * contexts must be able to see they are no longer suspended so
+	 * that they are guaranteed to wake up and resume printing.
+	 */
+	synchronize_srcu(&console_srcu);
+
 	pr_flush(1000, true);
 }
 
@@ -2623,8 +2647,6 @@ void console_lock(void)
 		msleep(1000);
 
 	down_console_sem();
-	if (console_suspended)
-		return;
 	console_locked = 1;
 	console_may_schedule = 1;
 }
@@ -2645,10 +2667,6 @@ int console_trylock(void)
 		return 0;
 	if (down_trylock_console_sem())
 		return 0;
-	if (console_suspended) {
-		up_console_sem();
-		return 0;
-	}
 	console_locked = 1;
 	console_may_schedule = 0;
 	return 1;
@@ -2674,6 +2692,9 @@ static inline bool console_is_usable(struct console *con)
 	if (!(flags & CON_ENABLED))
 		return false;
 
+	if ((flags & CON_SUSPENDED))
+		return false;
+
 	if (!con->write)
 		return false;
 
@@ -2992,11 +3013,6 @@ void console_unlock(void)
 	bool flushed;
 	u64 next_seq;
 
-	if (console_suspended) {
-		up_console_sem();
-		return;
-	}
-
 	/*
 	 * Console drivers are called with interrupts disabled, so
 	 * @console_may_schedule should be cleared before; however, we may
@@ -3726,8 +3742,7 @@ static bool __pr_flush(struct console *con, int timeout_ms, bool reset_on_progre
 
 		/*
 		 * Hold the console_lock to guarantee safe access to
-		 * console->seq and to prevent changes to @console_suspended
-		 * until all consoles have been processed.
+		 * console->seq.
 		 */
 		console_lock();
 
@@ -3735,6 +3750,11 @@ static bool __pr_flush(struct console *con, int timeout_ms, bool reset_on_progre
 		for_each_console_srcu(c) {
 			if (con && con != c)
 				continue;
+			/*
+			 * If consoles are not usable, it cannot be expected
+			 * that they make forward progress, so only increment
+			 * @diff for usable consoles.
+			 */
 			if (!console_is_usable(c))
 				continue;
 			printk_seq = c->seq;
@@ -3743,18 +3763,12 @@ static bool __pr_flush(struct console *con, int timeout_ms, bool reset_on_progre
 		}
 		console_srcu_read_unlock(cookie);
 
-		/*
-		 * If consoles are suspended, it cannot be expected that they
-		 * make forward progress, so timeout immediately. @diff is
-		 * still used to return a valid flush status.
-		 */
-		if (console_suspended)
-			remaining = 0;
-		else if (diff != last_diff && reset_on_progress)
+		if (diff != last_diff && reset_on_progress)
 			remaining = timeout_ms;
 
 		console_unlock();
 
+		/* Note: @diff is 0 if there are no usable consoles. */
 		if (diff == 0 || remaining == 0)
 			break;
 
@@ -3788,7 +3802,7 @@ static bool __pr_flush(struct console *con, int timeout_ms, bool reset_on_progre
  * printer has been seen to make some forward progress.
  *
  * Context: Process context. May sleep while acquiring console lock.
- * Return: true if all enabled printers are caught up.
+ * Return: true if all usable printers are caught up.
  */
 static bool pr_flush(int timeout_ms, bool reset_on_progress)
 {
-- 
cgit v1.2.3


From 481461f5109919babbb393d6f68002936b8e2493 Mon Sep 17 00:00:00 2001
From: Masahiro Yamada <masahiroy@kernel.org>
Date: Sun, 16 Jul 2023 19:15:54 +0900
Subject: linux/export.h: make <linux/export.h> independent of CONFIG_MODULES

Currently, all files with EXPORT_SYMBOL() are rebuilt when CONFIG_MODULES
is flipped due to <linux/export.h> depending on CONFIG_MODULES.

Now that modpost can make a final decision about export symbols,
<linux/export.h> does not need to make EXPORT_SYMBOL() no-op.
Instead, modpost can skip emitting KSYMTAB when CONFIG_MODULES is unset.

This commit will reduce the number of recompilation when CONFIG_MODULES
is toggled.

Signed-off-by: Masahiro Yamada <masahiroy@kernel.org>
---
 include/linux/export.h   | 4 ++--
 scripts/Makefile.modpost | 1 +
 scripts/mod/modpost.c    | 8 ++++++--
 3 files changed, 9 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/export.h b/include/linux/export.h
index beed8387e0a4..9911508a9604 100644
--- a/include/linux/export.h
+++ b/include/linux/export.h
@@ -50,7 +50,7 @@ extern struct module __this_module;
 		__EXPORT_SYMBOL_REF(sym)	ASM_NL	\
 	.previous
 
-#if !defined(CONFIG_MODULES) || defined(__DISABLE_EXPORTS)
+#if defined(__DISABLE_EXPORTS)
 
 /*
  * Allow symbol exports to be disabled completely so that C code may
@@ -75,7 +75,7 @@ extern struct module __this_module;
 	__ADDRESSABLE(sym)					\
 	asm(__stringify(___EXPORT_SYMBOL(sym, license, ns)))
 
-#endif /* CONFIG_MODULES */
+#endif
 
 #ifdef DEFAULT_SYMBOL_NAMESPACE
 #define _EXPORT_SYMBOL(sym, license)	__EXPORT_SYMBOL(sym, license, __stringify(DEFAULT_SYMBOL_NAMESPACE))
diff --git a/scripts/Makefile.modpost b/scripts/Makefile.modpost
index 39472e834b63..739402f45509 100644
--- a/scripts/Makefile.modpost
+++ b/scripts/Makefile.modpost
@@ -41,6 +41,7 @@ include $(srctree)/scripts/Kbuild.include
 MODPOST = scripts/mod/modpost
 
 modpost-args =										\
+	$(if $(CONFIG_MODULES),-M)							\
 	$(if $(CONFIG_MODVERSIONS),-m)							\
 	$(if $(CONFIG_MODULE_SRCVERSION_ALL),-a)					\
 	$(if $(CONFIG_SECTION_MISMATCH_WARN_ONLY),,-E)					\
diff --git a/scripts/mod/modpost.c b/scripts/mod/modpost.c
index b29b29707f10..8227641dd087 100644
--- a/scripts/mod/modpost.c
+++ b/scripts/mod/modpost.c
@@ -24,6 +24,7 @@
 #include "../../include/linux/license.h"
 #include "../../include/linux/module_symbol.h"
 
+static bool module_enabled;
 /* Are we using CONFIG_MODVERSIONS? */
 static bool modversions;
 /* Is CONFIG_MODULE_SRCVERSION_ALL set? */
@@ -1242,7 +1243,7 @@ static void check_section_mismatch(struct module *mod, struct elf_info *elf,
 	const char *tosec = sec_name(elf, get_secindex(elf, sym));
 	const struct sectioncheck *mismatch;
 
-	if (elf->export_symbol_secndx == fsecndx) {
+	if (module_enabled && elf->export_symbol_secndx == fsecndx) {
 		check_export_symbol(mod, elf, faddr, tosec, sym);
 		return;
 	}
@@ -2272,7 +2273,7 @@ int main(int argc, char **argv)
 	LIST_HEAD(dump_lists);
 	struct dump_list *dl, *dl2;
 
-	while ((opt = getopt(argc, argv, "ei:mnT:to:au:WwENd:")) != -1) {
+	while ((opt = getopt(argc, argv, "ei:MmnT:to:au:WwENd:")) != -1) {
 		switch (opt) {
 		case 'e':
 			external_module = true;
@@ -2282,6 +2283,9 @@ int main(int argc, char **argv)
 			dl->file = optarg;
 			list_add_tail(&dl->list, &dump_lists);
 			break;
+		case 'M':
+			module_enabled = true;
+			break;
 		case 'm':
 			modversions = true;
 			break;
-- 
cgit v1.2.3


From db2d6038c5e795cab4f0a8d3e86b4f7e33338629 Mon Sep 17 00:00:00 2001
From: Benjamin Bara <benjamin.bara@skidata.com>
Date: Sat, 15 Jul 2023 09:53:25 +0200
Subject: kernel/reboot: Add device to sys_off_handler

If the dev is known (e.g. a devm-based sys_off_handler is used), it can
be passed to the handler's callback to have it available there.
Otherwise, cb_data might be set to the dev in most of the cases.

Reviewed-by: Dmitry Osipenko <dmitry.osipenko@collabora.com>
Signed-off-by: Benjamin Bara <benjamin.bara@skidata.com>
Link: https://lore.kernel.org/r/20230327-tegra-pmic-reboot-v7-3-18699d5dcd76@skidata.com
Signed-off-by: Lee Jones <lee@kernel.org>
---
 include/linux/reboot.h | 3 +++
 kernel/reboot.c        | 3 +++
 2 files changed, 6 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/reboot.h b/include/linux/reboot.h
index 2b6bb593be5b..c4cc3b89ced1 100644
--- a/include/linux/reboot.h
+++ b/include/linux/reboot.h
@@ -129,11 +129,14 @@ enum sys_off_mode {
  * @cb_data: User's callback data.
  * @cmd: Command string. Currently used only by the sys-off restart mode,
  *       NULL otherwise.
+ * @dev: Device of the sys-off handler. Only if known (devm_register_*),
+ *       NULL otherwise.
  */
 struct sys_off_data {
 	int mode;
 	void *cb_data;
 	const char *cmd;
+	struct device *dev;
 };
 
 struct sys_off_handler *
diff --git a/kernel/reboot.c b/kernel/reboot.c
index 6ebef11c8876..395a0ea3c7a8 100644
--- a/kernel/reboot.c
+++ b/kernel/reboot.c
@@ -55,6 +55,7 @@ struct sys_off_handler {
 	enum sys_off_mode mode;
 	bool blocking;
 	void *list;
+	struct device *dev;
 };
 
 /*
@@ -324,6 +325,7 @@ static int sys_off_notify(struct notifier_block *nb,
 	data.cb_data = handler->cb_data;
 	data.mode = mode;
 	data.cmd = cmd;
+	data.dev = handler->dev;
 
 	return handler->sys_off_cb(&data);
 }
@@ -511,6 +513,7 @@ int devm_register_sys_off_handler(struct device *dev,
 	handler = register_sys_off_handler(mode, priority, callback, cb_data);
 	if (IS_ERR(handler))
 		return PTR_ERR(handler);
+	handler->dev = dev;
 
 	return devm_add_action_or_reset(dev, devm_unregister_sys_off_handler,
 					handler);
-- 
cgit v1.2.3


From 687fe7dfb736b03ab820d172ea5dbfc1ec447135 Mon Sep 17 00:00:00 2001
From: Dmitry Torokhov <dmitry.torokhov@gmail.com>
Date: Sun, 23 Jul 2023 22:30:18 -0700
Subject: Input: tca6416-keypad - always expect proper IRQ number in i2c client

Remove option having i2c client contain raw gpio number instead of proper
IRQ number. There are no users of this facility in mainline and it will
allow cleaning up the driver code with regard to wakeup handling, etc.

Link: https://lore.kernel.org/r/20230724053024.352054-1-dmitry.torokhov@gmail.com
Signed-off-by: Dmitry Torokhov <dmitry.torokhov@gmail.com>
---
 drivers/input/keyboard/tca6416-keypad.c | 27 ++++++++++-----------------
 include/linux/tca6416_keypad.h          |  1 -
 2 files changed, 10 insertions(+), 18 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/input/keyboard/tca6416-keypad.c b/drivers/input/keyboard/tca6416-keypad.c
index 2f745cabf4f2..01bc0b881188 100644
--- a/drivers/input/keyboard/tca6416-keypad.c
+++ b/drivers/input/keyboard/tca6416-keypad.c
@@ -148,7 +148,7 @@ static int tca6416_keys_open(struct input_dev *dev)
 	if (chip->use_polling)
 		schedule_delayed_work(&chip->dwork, msecs_to_jiffies(100));
 	else
-		enable_irq(chip->irqnum);
+		enable_irq(chip->client->irq);
 
 	return 0;
 }
@@ -160,7 +160,7 @@ static void tca6416_keys_close(struct input_dev *dev)
 	if (chip->use_polling)
 		cancel_delayed_work_sync(&chip->dwork);
 	else
-		disable_irq(chip->irqnum);
+		disable_irq(chip->client->irq);
 }
 
 static int tca6416_setup_registers(struct tca6416_keypad_chip *chip)
@@ -266,12 +266,7 @@ static int tca6416_keypad_probe(struct i2c_client *client)
 		goto fail1;
 
 	if (!chip->use_polling) {
-		if (pdata->irq_is_gpio)
-			chip->irqnum = gpio_to_irq(client->irq);
-		else
-			chip->irqnum = client->irq;
-
-		error = request_threaded_irq(chip->irqnum, NULL,
+		error = request_threaded_irq(client->irq, NULL,
 					     tca6416_keys_isr,
 					     IRQF_TRIGGER_FALLING |
 					     IRQF_ONESHOT | IRQF_NO_AUTOEN,
@@ -279,7 +274,7 @@ static int tca6416_keypad_probe(struct i2c_client *client)
 		if (error) {
 			dev_dbg(&client->dev,
 				"Unable to claim irq %d; error %d\n",
-				chip->irqnum, error);
+				client->irq, error);
 			goto fail1;
 		}
 	}
@@ -298,8 +293,8 @@ static int tca6416_keypad_probe(struct i2c_client *client)
 
 fail2:
 	if (!chip->use_polling) {
-		free_irq(chip->irqnum, chip);
-		enable_irq(chip->irqnum);
+		free_irq(client->irq, chip);
+		enable_irq(client->irq);
 	}
 fail1:
 	input_free_device(input);
@@ -312,8 +307,8 @@ static void tca6416_keypad_remove(struct i2c_client *client)
 	struct tca6416_keypad_chip *chip = i2c_get_clientdata(client);
 
 	if (!chip->use_polling) {
-		free_irq(chip->irqnum, chip);
-		enable_irq(chip->irqnum);
+		free_irq(client->irq, chip);
+		enable_irq(client->irq);
 	}
 
 	input_unregister_device(chip->input);
@@ -323,10 +318,9 @@ static void tca6416_keypad_remove(struct i2c_client *client)
 static int tca6416_keypad_suspend(struct device *dev)
 {
 	struct i2c_client *client = to_i2c_client(dev);
-	struct tca6416_keypad_chip *chip = i2c_get_clientdata(client);
 
 	if (device_may_wakeup(dev))
-		enable_irq_wake(chip->irqnum);
+		enable_irq_wake(client->irq);
 
 	return 0;
 }
@@ -334,10 +328,9 @@ static int tca6416_keypad_suspend(struct device *dev)
 static int tca6416_keypad_resume(struct device *dev)
 {
 	struct i2c_client *client = to_i2c_client(dev);
-	struct tca6416_keypad_chip *chip = i2c_get_clientdata(client);
 
 	if (device_may_wakeup(dev))
-		disable_irq_wake(chip->irqnum);
+		disable_irq_wake(client->irq);
 
 	return 0;
 }
diff --git a/include/linux/tca6416_keypad.h b/include/linux/tca6416_keypad.h
index b0d36a9934cc..5cf6f6f82aa7 100644
--- a/include/linux/tca6416_keypad.h
+++ b/include/linux/tca6416_keypad.h
@@ -25,7 +25,6 @@ struct tca6416_keys_platform_data {
 	unsigned int rep:1;	/* enable input subsystem auto repeat */
 	uint16_t pinmask;
 	uint16_t invert;
-	int irq_is_gpio;
 	int use_polling;	/* use polling if Interrupt is not connected*/
 };
 #endif
-- 
cgit v1.2.3


From 63b93099359eca37c11e4d5db5ea2c3a375f6026 Mon Sep 17 00:00:00 2001
From: Sergey Shtylyov <s.shtylyov@omp.ru>
Date: Sat, 29 Jul 2023 23:17:46 +0300
Subject: ata: libata: fix parameter type of ata_deadline()

ata_deadline() passes its 'unsigned long timeout_msecs'  parameter verbatim
to msecs_to_jiffies() which takes just 'unsigned int' -- eliminate unneeded
implicit cast...

Signed-off-by: Sergey Shtylyov <s.shtylyov@omp.ru>
Signed-off-by: Damien Le Moal <dlemoal@kernel.org>
---
 include/linux/libata.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/libata.h b/include/linux/libata.h
index 820f7a3a2749..d9edb3d62a42 100644
--- a/include/linux/libata.h
+++ b/include/linux/libata.h
@@ -1876,7 +1876,7 @@ static inline int ata_check_ready(u8 status)
 }
 
 static inline unsigned long ata_deadline(unsigned long from_jiffies,
-					 unsigned long timeout_msecs)
+					 unsigned int timeout_msecs)
 {
 	return from_jiffies + msecs_to_jiffies(timeout_msecs);
 }
-- 
cgit v1.2.3


From 84abed36d7de7145d393f9d5d05b36717e0cb49d Mon Sep 17 00:00:00 2001
From: Sergey Shtylyov <s.shtylyov@omp.ru>
Date: Sat, 29 Jul 2023 23:17:47 +0300
Subject: ata: libata-core: fix parameter types of ata_wait_register()

ata_wait_register() passes its 'unsigned long {interval|timeout}' params
verbatim to ata_{msleep|deadline}() that just take 'unsigned int' param
for the time intervals in ms -- eliminate unneeded implicit casts...

Signed-off-by: Sergey Shtylyov <s.shtylyov@omp.ru>
Signed-off-by: Damien Le Moal <dlemoal@kernel.org>
---
 drivers/ata/libata-core.c | 2 +-
 include/linux/libata.h    | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/ata/libata-core.c b/drivers/ata/libata-core.c
index 04db0f2c683a..54cc342c0b4f 100644
--- a/drivers/ata/libata-core.c
+++ b/drivers/ata/libata-core.c
@@ -6684,7 +6684,7 @@ EXPORT_SYMBOL_GPL(ata_msleep);
  *	The final register value.
  */
 u32 ata_wait_register(struct ata_port *ap, void __iomem *reg, u32 mask, u32 val,
-		      unsigned long interval, unsigned long timeout)
+		      unsigned int interval, unsigned int timeout)
 {
 	unsigned long deadline;
 	u32 tmp;
diff --git a/include/linux/libata.h b/include/linux/libata.h
index d9edb3d62a42..4772f64af734 100644
--- a/include/linux/libata.h
+++ b/include/linux/libata.h
@@ -1116,7 +1116,7 @@ static inline void ata_sas_port_resume(struct ata_port *ap)
 extern int ata_ratelimit(void);
 extern void ata_msleep(struct ata_port *ap, unsigned int msecs);
 extern u32 ata_wait_register(struct ata_port *ap, void __iomem *reg, u32 mask,
-			u32 val, unsigned long interval, unsigned long timeout);
+			     u32 val, unsigned int interval, unsigned int timeout);
 extern int atapi_cmd_type(u8 opcode);
 extern unsigned int ata_pack_xfermask(unsigned int pio_mask,
 				      unsigned int mwdma_mask,
-- 
cgit v1.2.3


From d14d41cc5aaef138face9d5a145b460e2b63697a Mon Sep 17 00:00:00 2001
From: Sergey Shtylyov <s.shtylyov@omp.ru>
Date: Sat, 29 Jul 2023 23:17:49 +0300
Subject: ata: fix debounce timings type

sata_deb_timing_{hotplug|long|normal}[] store 'unsigned long' debounce
timeouts in ms, while sata_link_debounce() eventually uses those timeouts
by calling ata_{deadline|msleep}( which take just 'unsigned int'.  Change
the debounce timeout table element's type to 'unsigned int' -- all these
timeouts happily fit into 'unsigned int'...

Signed-off-by: Sergey Shtylyov <s.shtylyov@omp.ru>
Signed-off-by: Damien Le Moal <dlemoal@kernel.org>
---
 drivers/ata/ahci.c          |  2 +-
 drivers/ata/ahci_qoriq.c    |  2 +-
 drivers/ata/ahci_xgene.c    |  2 +-
 drivers/ata/libahci.c       |  2 +-
 drivers/ata/libata-core.c   |  4 ++--
 drivers/ata/libata-sata.c   | 16 ++++++++--------
 drivers/ata/libata-sff.c    |  2 +-
 drivers/ata/sata_highbank.c |  2 +-
 drivers/ata/sata_inic162x.c |  2 +-
 drivers/ata/sata_mv.c       |  2 +-
 drivers/ata/sata_nv.c       |  2 +-
 include/linux/libata.h      | 20 ++++++++++----------
 12 files changed, 29 insertions(+), 29 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/ata/ahci.c b/drivers/ata/ahci.c
index addba109406b..02503e903e4a 100644
--- a/drivers/ata/ahci.c
+++ b/drivers/ata/ahci.c
@@ -807,7 +807,7 @@ static int ahci_p5wdh_hardreset(struct ata_link *link, unsigned int *class,
 static int ahci_avn_hardreset(struct ata_link *link, unsigned int *class,
 			      unsigned long deadline)
 {
-	const unsigned long *timing = sata_ehc_deb_timing(&link->eh_context);
+	const unsigned int *timing = sata_ehc_deb_timing(&link->eh_context);
 	struct ata_port *ap = link->ap;
 	struct ahci_port_priv *pp = ap->private_data;
 	struct ahci_host_priv *hpriv = ap->host->private_data;
diff --git a/drivers/ata/ahci_qoriq.c b/drivers/ata/ahci_qoriq.c
index 7bb9ad40605e..b1a4e57578e2 100644
--- a/drivers/ata/ahci_qoriq.c
+++ b/drivers/ata/ahci_qoriq.c
@@ -88,7 +88,7 @@ MODULE_DEVICE_TABLE(acpi, ahci_qoriq_acpi_match);
 static int ahci_qoriq_hardreset(struct ata_link *link, unsigned int *class,
 			  unsigned long deadline)
 {
-	const unsigned long *timing = sata_ehc_deb_timing(&link->eh_context);
+	const unsigned int *timing = sata_ehc_deb_timing(&link->eh_context);
 	void __iomem *port_mmio = ahci_port_base(link->ap);
 	u32 px_cmd, px_is, px_val;
 	struct ata_port *ap = link->ap;
diff --git a/drivers/ata/ahci_xgene.c b/drivers/ata/ahci_xgene.c
index f5deaf648663..8e88c86a2a78 100644
--- a/drivers/ata/ahci_xgene.c
+++ b/drivers/ata/ahci_xgene.c
@@ -350,7 +350,7 @@ static void xgene_ahci_set_phy_cfg(struct xgene_ahci_context *ctx, int channel)
 static int xgene_ahci_do_hardreset(struct ata_link *link,
 				   unsigned long deadline, bool *online)
 {
-	const unsigned long *timing = sata_ehc_deb_timing(&link->eh_context);
+	const unsigned int *timing = sata_ehc_deb_timing(&link->eh_context);
 	struct ata_port *ap = link->ap;
 	struct ahci_host_priv *hpriv = ap->host->private_data;
 	struct xgene_ahci_context *ctx = hpriv->plat_data;
diff --git a/drivers/ata/libahci.c b/drivers/ata/libahci.c
index 06aec35f88f2..ad2bfcbff3bc 100644
--- a/drivers/ata/libahci.c
+++ b/drivers/ata/libahci.c
@@ -1587,7 +1587,7 @@ static int ahci_pmp_retry_softreset(struct ata_link *link, unsigned int *class,
 int ahci_do_hardreset(struct ata_link *link, unsigned int *class,
 		      unsigned long deadline, bool *online)
 {
-	const unsigned long *timing = sata_ehc_deb_timing(&link->eh_context);
+	const unsigned int *timing = sata_ehc_deb_timing(&link->eh_context);
 	struct ata_port *ap = link->ap;
 	struct ahci_port_priv *pp = ap->private_data;
 	struct ahci_host_priv *hpriv = ap->host->private_data;
diff --git a/drivers/ata/libata-core.c b/drivers/ata/libata-core.c
index 54cc342c0b4f..079ec8d0860f 100644
--- a/drivers/ata/libata-core.c
+++ b/drivers/ata/libata-core.c
@@ -3782,7 +3782,7 @@ int ata_std_prereset(struct ata_link *link, unsigned long deadline)
 {
 	struct ata_port *ap = link->ap;
 	struct ata_eh_context *ehc = &link->eh_context;
-	const unsigned long *timing = sata_ehc_deb_timing(ehc);
+	const unsigned int *timing = sata_ehc_deb_timing(ehc);
 	int rc;
 
 	/* if we're about to do hardreset, nothing more to do */
@@ -3824,7 +3824,7 @@ EXPORT_SYMBOL_GPL(ata_std_prereset);
 int sata_std_hardreset(struct ata_link *link, unsigned int *class,
 		       unsigned long deadline)
 {
-	const unsigned long *timing = sata_ehc_deb_timing(&link->eh_context);
+	const unsigned int *timing = sata_ehc_deb_timing(&link->eh_context);
 	bool online;
 	int rc;
 
diff --git a/drivers/ata/libata-sata.c b/drivers/ata/libata-sata.c
index 85e279a12f62..5d393432fa06 100644
--- a/drivers/ata/libata-sata.c
+++ b/drivers/ata/libata-sata.c
@@ -19,11 +19,11 @@
 #include "libata-transport.h"
 
 /* debounce timing parameters in msecs { interval, duration, timeout } */
-const unsigned long sata_deb_timing_normal[]		= {   5,  100, 2000 };
+const unsigned int sata_deb_timing_normal[]		= {   5,  100, 2000 };
 EXPORT_SYMBOL_GPL(sata_deb_timing_normal);
-const unsigned long sata_deb_timing_hotplug[]		= {  25,  500, 2000 };
+const unsigned int sata_deb_timing_hotplug[]		= {  25,  500, 2000 };
 EXPORT_SYMBOL_GPL(sata_deb_timing_hotplug);
-const unsigned long sata_deb_timing_long[]		= { 100, 2000, 5000 };
+const unsigned int sata_deb_timing_long[]		= { 100, 2000, 5000 };
 EXPORT_SYMBOL_GPL(sata_deb_timing_long);
 
 /**
@@ -232,11 +232,11 @@ EXPORT_SYMBOL_GPL(ata_tf_from_fis);
  *	RETURNS:
  *	0 on success, -errno on failure.
  */
-int sata_link_debounce(struct ata_link *link, const unsigned long *params,
+int sata_link_debounce(struct ata_link *link, const unsigned int *params,
 		       unsigned long deadline)
 {
-	unsigned long interval = params[0];
-	unsigned long duration = params[1];
+	unsigned int interval = params[0];
+	unsigned int duration = params[1];
 	unsigned long last_jiffies, t;
 	u32 last, cur;
 	int rc;
@@ -295,7 +295,7 @@ EXPORT_SYMBOL_GPL(sata_link_debounce);
  *	RETURNS:
  *	0 on success, -errno on failure.
  */
-int sata_link_resume(struct ata_link *link, const unsigned long *params,
+int sata_link_resume(struct ata_link *link, const unsigned int *params,
 		     unsigned long deadline)
 {
 	int tries = ATA_LINK_RESUME_TRIES;
@@ -528,7 +528,7 @@ EXPORT_SYMBOL_GPL(sata_set_spd);
  *	RETURNS:
  *	0 on success, -errno otherwise.
  */
-int sata_link_hardreset(struct ata_link *link, const unsigned long *timing,
+int sata_link_hardreset(struct ata_link *link, const unsigned int *timing,
 			unsigned long deadline,
 			bool *online, int (*check_ready)(struct ata_link *))
 {
diff --git a/drivers/ata/libata-sff.c b/drivers/ata/libata-sff.c
index 9d28badfe41d..ac55dfc2d85f 100644
--- a/drivers/ata/libata-sff.c
+++ b/drivers/ata/libata-sff.c
@@ -1971,7 +1971,7 @@ int sata_sff_hardreset(struct ata_link *link, unsigned int *class,
 		       unsigned long deadline)
 {
 	struct ata_eh_context *ehc = &link->eh_context;
-	const unsigned long *timing = sata_ehc_deb_timing(ehc);
+	const unsigned int *timing = sata_ehc_deb_timing(ehc);
 	bool online;
 	int rc;
 
diff --git a/drivers/ata/sata_highbank.c b/drivers/ata/sata_highbank.c
index 7a6d41b7a02d..63ef7bb073ce 100644
--- a/drivers/ata/sata_highbank.c
+++ b/drivers/ata/sata_highbank.c
@@ -385,7 +385,7 @@ static int highbank_initialize_phys(struct device *dev, void __iomem *addr)
 static int ahci_highbank_hardreset(struct ata_link *link, unsigned int *class,
 				unsigned long deadline)
 {
-	static const unsigned long timing[] = { 5, 100, 500};
+	static const unsigned int timing[] = { 5, 100, 500};
 	struct ata_port *ap = link->ap;
 	struct ahci_port_priv *pp = ap->private_data;
 	struct ahci_host_priv *hpriv = ap->host->private_data;
diff --git a/drivers/ata/sata_inic162x.c b/drivers/ata/sata_inic162x.c
index 2c8c78ed86c1..db9c255dc9f2 100644
--- a/drivers/ata/sata_inic162x.c
+++ b/drivers/ata/sata_inic162x.c
@@ -619,7 +619,7 @@ static int inic_hardreset(struct ata_link *link, unsigned int *class,
 	struct ata_port *ap = link->ap;
 	void __iomem *port_base = inic_port_base(ap);
 	void __iomem *idma_ctl = port_base + PORT_IDMA_CTL;
-	const unsigned long *timing = sata_ehc_deb_timing(&link->eh_context);
+	const unsigned int *timing = sata_ehc_deb_timing(&link->eh_context);
 	int rc;
 
 	/* hammer it into sane state */
diff --git a/drivers/ata/sata_mv.c b/drivers/ata/sata_mv.c
index d404e631d152..41c107e15c40 100644
--- a/drivers/ata/sata_mv.c
+++ b/drivers/ata/sata_mv.c
@@ -3633,7 +3633,7 @@ static int mv_hardreset(struct ata_link *link, unsigned int *class,
 
 	/* Workaround for errata FEr SATA#10 (part 2) */
 	do {
-		const unsigned long *timing =
+		const unsigned int *timing =
 				sata_ehc_deb_timing(&link->eh_context);
 
 		rc = sata_link_hardreset(link, timing, deadline + extra,
diff --git a/drivers/ata/sata_nv.c b/drivers/ata/sata_nv.c
index abf5651c87ab..0a0cee755bde 100644
--- a/drivers/ata/sata_nv.c
+++ b/drivers/ata/sata_nv.c
@@ -1529,7 +1529,7 @@ static int nv_hardreset(struct ata_link *link, unsigned int *class,
 		sata_link_hardreset(link, sata_deb_timing_hotplug, deadline,
 				    NULL, NULL);
 	else {
-		const unsigned long *timing = sata_ehc_deb_timing(ehc);
+		const unsigned int *timing = sata_ehc_deb_timing(ehc);
 		int rc;
 
 		if (!(ehc->i.flags & ATA_EHI_QUIET))
diff --git a/include/linux/libata.h b/include/linux/libata.h
index 4772f64af734..8d510fb00591 100644
--- a/include/linux/libata.h
+++ b/include/linux/libata.h
@@ -1166,11 +1166,11 @@ extern void ata_scsi_cmd_error_handler(struct Scsi_Host *host, struct ata_port *
  * SATA specific code - drivers/ata/libata-sata.c
  */
 #ifdef CONFIG_SATA_HOST
-extern const unsigned long sata_deb_timing_normal[];
-extern const unsigned long sata_deb_timing_hotplug[];
-extern const unsigned long sata_deb_timing_long[];
+extern const unsigned int sata_deb_timing_normal[];
+extern const unsigned int sata_deb_timing_hotplug[];
+extern const unsigned int sata_deb_timing_long[];
 
-static inline const unsigned long *
+static inline const unsigned int *
 sata_ehc_deb_timing(struct ata_eh_context *ehc)
 {
 	if (ehc->i.flags & ATA_EHI_HOTPLUGGED)
@@ -1185,14 +1185,14 @@ extern int sata_scr_write(struct ata_link *link, int reg, u32 val);
 extern int sata_scr_write_flush(struct ata_link *link, int reg, u32 val);
 extern int sata_set_spd(struct ata_link *link);
 extern int sata_link_hardreset(struct ata_link *link,
-			const unsigned long *timing, unsigned long deadline,
+			const unsigned int *timing, unsigned long deadline,
 			bool *online, int (*check_ready)(struct ata_link *));
-extern int sata_link_resume(struct ata_link *link, const unsigned long *params,
+extern int sata_link_resume(struct ata_link *link, const unsigned int *params,
 			    unsigned long deadline);
 extern int ata_eh_read_sense_success_ncq_log(struct ata_link *link);
 extern void ata_eh_analyze_ncq_error(struct ata_link *link);
 #else
-static inline const unsigned long *
+static inline const unsigned int *
 sata_ehc_deb_timing(struct ata_eh_context *ehc)
 {
 	return NULL;
@@ -1212,7 +1212,7 @@ static inline int sata_scr_write_flush(struct ata_link *link, int reg, u32 val)
 }
 static inline int sata_set_spd(struct ata_link *link) { return -EOPNOTSUPP; }
 static inline int sata_link_hardreset(struct ata_link *link,
-				      const unsigned long *timing,
+				      const unsigned int *timing,
 				      unsigned long deadline,
 				      bool *online,
 				      int (*check_ready)(struct ata_link *))
@@ -1222,7 +1222,7 @@ static inline int sata_link_hardreset(struct ata_link *link,
 	return -EOPNOTSUPP;
 }
 static inline int sata_link_resume(struct ata_link *link,
-				   const unsigned long *params,
+				   const unsigned int *params,
 				   unsigned long deadline)
 {
 	return -EOPNOTSUPP;
@@ -1234,7 +1234,7 @@ static inline int ata_eh_read_sense_success_ncq_log(struct ata_link *link)
 static inline void ata_eh_analyze_ncq_error(struct ata_link *link) { }
 #endif
 extern int sata_link_debounce(struct ata_link *link,
-			const unsigned long *params, unsigned long deadline);
+			      const unsigned int *params, unsigned long deadline);
 extern int sata_link_scr_lpm(struct ata_link *link, enum ata_lpm_policy policy,
 			     bool spm_wakeup);
 extern int ata_slave_link_init(struct ata_port *ap);
-- 
cgit v1.2.3


From ff8072d589dcff7c1f0345a6ec98b5fc1e9ee2a1 Mon Sep 17 00:00:00 2001
From: Hannes Reinecke <hare@suse.de>
Date: Mon, 31 Jul 2023 16:34:12 +0200
Subject: ata: libata: remove references to non-existing error_handler()

With commit 65a15d6560df ("scsi: ipr: Remove SATA support") all
libata drivers now have the error_handler() callback provided,
so we can stop checking for non-existing error_handler callback.

Signed-off-by: Hannes Reinecke <hare@suse.de>
[niklas: fixed review comments, rebased, solved conflicts during rebase,
fixed bug that unconditionally dumped all QCs, removed the now unused
function ata_dump_status(), removed the now unreachable failure paths in
atapi_qc_complete(), removed the non-EH function to request ATAPI sense]
Signed-off-by: Niklas Cassel <niklas.cassel@wdc.com>
Reviewed-by: John Garry <john.g.garry@oracle.com>
Reviewed-by: Jason Yan <yanaijie@huawei.com>
Reviewed-by: Martin K. Petersen <martin.petersen@oracle.com>
Signed-off-by: Damien Le Moal <dlemoal@kernel.org>
---
 drivers/ata/libata-core.c | 209 +++++++++++++++++++---------------------------
 drivers/ata/libata-eh.c   | 152 ++++++++++++++-------------------
 drivers/ata/libata-sata.c |   7 +-
 drivers/ata/libata-scsi.c | 161 +++--------------------------------
 drivers/ata/libata-sff.c  |  30 +++----
 include/linux/libata.h    |   2 +-
 6 files changed, 170 insertions(+), 391 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/ata/libata-core.c b/drivers/ata/libata-core.c
index 079ec8d0860f..cc59d3158e1d 100644
--- a/drivers/ata/libata-core.c
+++ b/drivers/ata/libata-core.c
@@ -1586,13 +1586,11 @@ static unsigned ata_exec_internal_sg(struct ata_device *dev,
 		}
 	}
 
-	if (ap->ops->error_handler)
-		ata_eh_release(ap);
+	ata_eh_release(ap);
 
 	rc = wait_for_completion_timeout(&wait, msecs_to_jiffies(timeout));
 
-	if (ap->ops->error_handler)
-		ata_eh_acquire(ap);
+	ata_eh_acquire(ap);
 
 	ata_sff_flush_pio_task(ap);
 
@@ -1607,10 +1605,7 @@ static unsigned ata_exec_internal_sg(struct ata_device *dev,
 		if (qc->flags & ATA_QCFLAG_ACTIVE) {
 			qc->err_mask |= AC_ERR_TIMEOUT;
 
-			if (ap->ops->error_handler)
-				ata_port_freeze(ap);
-			else
-				ata_qc_complete(qc);
+			ata_port_freeze(ap);
 
 			ata_dev_warn(dev, "qc timeout after %u msecs (cmd 0x%x)\n",
 				     timeout, command);
@@ -4874,126 +4869,103 @@ static void ata_verify_xfer(struct ata_queued_cmd *qc)
 void ata_qc_complete(struct ata_queued_cmd *qc)
 {
 	struct ata_port *ap = qc->ap;
+	struct ata_device *dev = qc->dev;
+	struct ata_eh_info *ehi = &dev->link->eh_info;
 
 	/* Trigger the LED (if available) */
 	ledtrig_disk_activity(!!(qc->tf.flags & ATA_TFLAG_WRITE));
 
-	/* XXX: New EH and old EH use different mechanisms to
-	 * synchronize EH with regular execution path.
-	 *
-	 * In new EH, a qc owned by EH is marked with ATA_QCFLAG_EH.
-	 * Normal execution path is responsible for not accessing a
-	 * qc owned by EH.  libata core enforces the rule by returning NULL
-	 * from ata_qc_from_tag() for qcs owned by EH.
+	/*
+	 * In order to synchronize EH with the regular execution path, a qc that
+	 * is owned by EH is marked with ATA_QCFLAG_EH.
 	 *
-	 * Old EH depends on ata_qc_complete() nullifying completion
-	 * requests if ATA_QCFLAG_EH_SCHEDULED is set.  Old EH does
-	 * not synchronize with interrupt handler.  Only PIO task is
-	 * taken care of.
+	 * The normal execution path is responsible for not accessing a qc owned
+	 * by EH.  libata core enforces the rule by returning NULL from
+	 * ata_qc_from_tag() for qcs owned by EH.
 	 */
-	if (ap->ops->error_handler) {
-		struct ata_device *dev = qc->dev;
-		struct ata_eh_info *ehi = &dev->link->eh_info;
-
-		if (unlikely(qc->err_mask))
-			qc->flags |= ATA_QCFLAG_EH;
+	if (unlikely(qc->err_mask))
+		qc->flags |= ATA_QCFLAG_EH;
 
-		/*
-		 * Finish internal commands without any further processing
-		 * and always with the result TF filled.
-		 */
-		if (unlikely(ata_tag_internal(qc->tag))) {
-			fill_result_tf(qc);
-			trace_ata_qc_complete_internal(qc);
-			__ata_qc_complete(qc);
-			return;
-		}
+	/*
+	 * Finish internal commands without any further processing and always
+	 * with the result TF filled.
+	 */
+	if (unlikely(ata_tag_internal(qc->tag))) {
+		fill_result_tf(qc);
+		trace_ata_qc_complete_internal(qc);
+		__ata_qc_complete(qc);
+		return;
+	}
 
-		/*
-		 * Non-internal qc has failed.  Fill the result TF and
-		 * summon EH.
-		 */
-		if (unlikely(qc->flags & ATA_QCFLAG_EH)) {
-			fill_result_tf(qc);
-			trace_ata_qc_complete_failed(qc);
-			ata_qc_schedule_eh(qc);
-			return;
-		}
+	/* Non-internal qc has failed.  Fill the result TF and summon EH. */
+	if (unlikely(qc->flags & ATA_QCFLAG_EH)) {
+		fill_result_tf(qc);
+		trace_ata_qc_complete_failed(qc);
+		ata_qc_schedule_eh(qc);
+		return;
+	}
 
-		WARN_ON_ONCE(ata_port_is_frozen(ap));
+	WARN_ON_ONCE(ata_port_is_frozen(ap));
 
-		/* read result TF if requested */
-		if (qc->flags & ATA_QCFLAG_RESULT_TF)
-			fill_result_tf(qc);
+	/* read result TF if requested */
+	if (qc->flags & ATA_QCFLAG_RESULT_TF)
+		fill_result_tf(qc);
 
-		trace_ata_qc_complete_done(qc);
+	trace_ata_qc_complete_done(qc);
 
+	/*
+	 * For CDL commands that completed without an error, check if we have
+	 * sense data (ATA_SENSE is set). If we do, then the command may have
+	 * been aborted by the device due to a limit timeout using the policy
+	 * 0xD. For these commands, invoke EH to get the command sense data.
+	 */
+	if (qc->result_tf.status & ATA_SENSE &&
+	    ((ata_is_ncq(qc->tf.protocol) &&
+	      dev->flags & ATA_DFLAG_CDL_ENABLED) ||
+	     (!ata_is_ncq(qc->tf.protocol) &&
+	      ata_id_sense_reporting_enabled(dev->id)))) {
 		/*
-		 * For CDL commands that completed without an error, check if
-		 * we have sense data (ATA_SENSE is set). If we do, then the
-		 * command may have been aborted by the device due to a limit
-		 * timeout using the policy 0xD. For these commands, invoke EH
-		 * to get the command sense data.
+		 * Tell SCSI EH to not overwrite scmd->result even if this
+		 * command is finished with result SAM_STAT_GOOD.
 		 */
-		if (qc->result_tf.status & ATA_SENSE &&
-		    ((ata_is_ncq(qc->tf.protocol) &&
-		      dev->flags & ATA_DFLAG_CDL_ENABLED) ||
-		     (!ata_is_ncq(qc->tf.protocol) &&
-		      ata_id_sense_reporting_enabled(dev->id)))) {
-			/*
-			 * Tell SCSI EH to not overwrite scmd->result even if
-			 * this command is finished with result SAM_STAT_GOOD.
-			 */
-			qc->scsicmd->flags |= SCMD_FORCE_EH_SUCCESS;
-			qc->flags |= ATA_QCFLAG_EH_SUCCESS_CMD;
-			ehi->dev_action[dev->devno] |= ATA_EH_GET_SUCCESS_SENSE;
-
-			/*
-			 * set pending so that ata_qc_schedule_eh() does not
-			 * trigger fast drain, and freeze the port.
-			 */
-			ap->pflags |= ATA_PFLAG_EH_PENDING;
-			ata_qc_schedule_eh(qc);
-			return;
-		}
+		qc->scsicmd->flags |= SCMD_FORCE_EH_SUCCESS;
+		qc->flags |= ATA_QCFLAG_EH_SUCCESS_CMD;
+		ehi->dev_action[dev->devno] |= ATA_EH_GET_SUCCESS_SENSE;
 
-		/* Some commands need post-processing after successful
-		 * completion.
+		/*
+		 * set pending so that ata_qc_schedule_eh() does not trigger
+		 * fast drain, and freeze the port.
 		 */
-		switch (qc->tf.command) {
-		case ATA_CMD_SET_FEATURES:
-			if (qc->tf.feature != SETFEATURES_WC_ON &&
-			    qc->tf.feature != SETFEATURES_WC_OFF &&
-			    qc->tf.feature != SETFEATURES_RA_ON &&
-			    qc->tf.feature != SETFEATURES_RA_OFF)
-				break;
-			fallthrough;
-		case ATA_CMD_INIT_DEV_PARAMS: /* CHS translation changed */
-		case ATA_CMD_SET_MULTI: /* multi_count changed */
-			/* revalidate device */
-			ehi->dev_action[dev->devno] |= ATA_EH_REVALIDATE;
-			ata_port_schedule_eh(ap);
-			break;
+		ap->pflags |= ATA_PFLAG_EH_PENDING;
+		ata_qc_schedule_eh(qc);
+		return;
+	}
 
-		case ATA_CMD_SLEEP:
-			dev->flags |= ATA_DFLAG_SLEEPING;
+	/* Some commands need post-processing after successful completion. */
+	switch (qc->tf.command) {
+	case ATA_CMD_SET_FEATURES:
+		if (qc->tf.feature != SETFEATURES_WC_ON &&
+		    qc->tf.feature != SETFEATURES_WC_OFF &&
+		    qc->tf.feature != SETFEATURES_RA_ON &&
+		    qc->tf.feature != SETFEATURES_RA_OFF)
 			break;
-		}
-
-		if (unlikely(dev->flags & ATA_DFLAG_DUBIOUS_XFER))
-			ata_verify_xfer(qc);
+		fallthrough;
+	case ATA_CMD_INIT_DEV_PARAMS: /* CHS translation changed */
+	case ATA_CMD_SET_MULTI: /* multi_count changed */
+		/* revalidate device */
+		ehi->dev_action[dev->devno] |= ATA_EH_REVALIDATE;
+		ata_port_schedule_eh(ap);
+		break;
 
-		__ata_qc_complete(qc);
-	} else {
-		if (qc->flags & ATA_QCFLAG_EH_SCHEDULED)
-			return;
+	case ATA_CMD_SLEEP:
+		dev->flags |= ATA_DFLAG_SLEEPING;
+		break;
+	}
 
-		/* read result TF if failed or requested */
-		if (qc->err_mask || qc->flags & ATA_QCFLAG_RESULT_TF)
-			fill_result_tf(qc);
+	if (unlikely(dev->flags & ATA_DFLAG_DUBIOUS_XFER))
+		ata_verify_xfer(qc);
 
-		__ata_qc_complete(qc);
-	}
+	__ata_qc_complete(qc);
 }
 EXPORT_SYMBOL_GPL(ata_qc_complete);
 
@@ -5039,11 +5011,8 @@ void ata_qc_issue(struct ata_queued_cmd *qc)
 	struct ata_link *link = qc->dev->link;
 	u8 prot = qc->tf.protocol;
 
-	/* Make sure only one non-NCQ command is outstanding.  The
-	 * check is skipped for old EH because it reuses active qc to
-	 * request ATAPI sense.
-	 */
-	WARN_ON_ONCE(ap->ops->error_handler && ata_tag_valid(link->active_tag));
+	/* Make sure only one non-NCQ command is outstanding. */
+	WARN_ON_ONCE(ata_tag_valid(link->active_tag));
 
 	if (ata_is_ncq(prot)) {
 		WARN_ON_ONCE(link->sactive & (1 << qc->hw_tag));
@@ -5917,15 +5886,9 @@ void __ata_port_probe(struct ata_port *ap)
 
 int ata_port_probe(struct ata_port *ap)
 {
-	int rc = 0;
-
-	if (ap->ops->error_handler) {
-		__ata_port_probe(ap);
-		ata_port_wait_eh(ap);
-	} else {
-		rc = ata_bus_probe(ap);
-	}
-	return rc;
+	__ata_port_probe(ap);
+	ata_port_wait_eh(ap);
+	return 0;
 }
 
 
@@ -6130,9 +6093,6 @@ static void ata_port_detach(struct ata_port *ap)
 	struct ata_link *link;
 	struct ata_device *dev;
 
-	if (!ap->ops->error_handler)
-		goto skip_eh;
-
 	/* tell EH we're leaving & flush EH */
 	spin_lock_irqsave(ap->lock, flags);
 	ap->pflags |= ATA_PFLAG_UNLOADING;
@@ -6148,7 +6108,6 @@ static void ata_port_detach(struct ata_port *ap)
 	cancel_delayed_work_sync(&ap->hotplug_task);
 	cancel_delayed_work_sync(&ap->scsi_rescan_task);
 
- skip_eh:
 	/* clean up zpodd on port removal */
 	ata_for_each_link(link, ap, HOST_FIRST) {
 		ata_for_each_dev(dev, link, ALL) {
diff --git a/drivers/ata/libata-eh.c b/drivers/ata/libata-eh.c
index 0e2acca36c10..159ba6ba19eb 100644
--- a/drivers/ata/libata-eh.c
+++ b/drivers/ata/libata-eh.c
@@ -571,13 +571,10 @@ void ata_scsi_cmd_error_handler(struct Scsi_Host *host, struct ata_port *ap,
 	/* make sure sff pio task is not running */
 	ata_sff_flush_pio_task(ap);
 
-	if (!ap->ops->error_handler)
-		return;
-
 	/* synchronize with host lock and sort out timeouts */
 
 	/*
-	 * For new EH, all qcs are finished in one of three ways -
+	 * For EH, all qcs are finished in one of three ways -
 	 * normal completion, error completion, and SCSI timeout.
 	 * Both completions can race against SCSI timeout.  When normal
 	 * completion wins, the qc never reaches EH.  When error
@@ -659,94 +656,87 @@ EXPORT_SYMBOL(ata_scsi_cmd_error_handler);
 void ata_scsi_port_error_handler(struct Scsi_Host *host, struct ata_port *ap)
 {
 	unsigned long flags;
+	struct ata_link *link;
 
-	/* invoke error handler */
-	if (ap->ops->error_handler) {
-		struct ata_link *link;
-
-		/* acquire EH ownership */
-		ata_eh_acquire(ap);
+	/* acquire EH ownership */
+	ata_eh_acquire(ap);
  repeat:
-		/* kill fast drain timer */
-		del_timer_sync(&ap->fastdrain_timer);
+	/* kill fast drain timer */
+	del_timer_sync(&ap->fastdrain_timer);
 
-		/* process port resume request */
-		ata_eh_handle_port_resume(ap);
+	/* process port resume request */
+	ata_eh_handle_port_resume(ap);
 
-		/* fetch & clear EH info */
-		spin_lock_irqsave(ap->lock, flags);
+	/* fetch & clear EH info */
+	spin_lock_irqsave(ap->lock, flags);
 
-		ata_for_each_link(link, ap, HOST_FIRST) {
-			struct ata_eh_context *ehc = &link->eh_context;
-			struct ata_device *dev;
+	ata_for_each_link(link, ap, HOST_FIRST) {
+		struct ata_eh_context *ehc = &link->eh_context;
+		struct ata_device *dev;
 
-			memset(&link->eh_context, 0, sizeof(link->eh_context));
-			link->eh_context.i = link->eh_info;
-			memset(&link->eh_info, 0, sizeof(link->eh_info));
+		memset(&link->eh_context, 0, sizeof(link->eh_context));
+		link->eh_context.i = link->eh_info;
+		memset(&link->eh_info, 0, sizeof(link->eh_info));
 
-			ata_for_each_dev(dev, link, ENABLED) {
-				int devno = dev->devno;
+		ata_for_each_dev(dev, link, ENABLED) {
+			int devno = dev->devno;
 
-				ehc->saved_xfer_mode[devno] = dev->xfer_mode;
-				if (ata_ncq_enabled(dev))
-					ehc->saved_ncq_enabled |= 1 << devno;
-			}
+			ehc->saved_xfer_mode[devno] = dev->xfer_mode;
+			if (ata_ncq_enabled(dev))
+				ehc->saved_ncq_enabled |= 1 << devno;
 		}
+	}
 
-		ap->pflags |= ATA_PFLAG_EH_IN_PROGRESS;
-		ap->pflags &= ~ATA_PFLAG_EH_PENDING;
-		ap->excl_link = NULL;	/* don't maintain exclusion over EH */
+	ap->pflags |= ATA_PFLAG_EH_IN_PROGRESS;
+	ap->pflags &= ~ATA_PFLAG_EH_PENDING;
+	ap->excl_link = NULL;	/* don't maintain exclusion over EH */
 
-		spin_unlock_irqrestore(ap->lock, flags);
+	spin_unlock_irqrestore(ap->lock, flags);
 
-		/* invoke EH, skip if unloading or suspended */
-		if (!(ap->pflags & (ATA_PFLAG_UNLOADING | ATA_PFLAG_SUSPENDED)))
-			ap->ops->error_handler(ap);
-		else {
-			/* if unloading, commence suicide */
-			if ((ap->pflags & ATA_PFLAG_UNLOADING) &&
-			    !(ap->pflags & ATA_PFLAG_UNLOADED))
-				ata_eh_unload(ap);
-			ata_eh_finish(ap);
-		}
+	/* invoke EH, skip if unloading or suspended */
+	if (!(ap->pflags & (ATA_PFLAG_UNLOADING | ATA_PFLAG_SUSPENDED)))
+		ap->ops->error_handler(ap);
+	else {
+		/* if unloading, commence suicide */
+		if ((ap->pflags & ATA_PFLAG_UNLOADING) &&
+		    !(ap->pflags & ATA_PFLAG_UNLOADED))
+			ata_eh_unload(ap);
+		ata_eh_finish(ap);
+	}
 
-		/* process port suspend request */
-		ata_eh_handle_port_suspend(ap);
+	/* process port suspend request */
+	ata_eh_handle_port_suspend(ap);
 
-		/* Exception might have happened after ->error_handler
-		 * recovered the port but before this point.  Repeat
-		 * EH in such case.
-		 */
-		spin_lock_irqsave(ap->lock, flags);
+	/*
+	 * Exception might have happened after ->error_handler recovered the
+	 * port but before this point.  Repeat EH in such case.
+	 */
+	spin_lock_irqsave(ap->lock, flags);
 
-		if (ap->pflags & ATA_PFLAG_EH_PENDING) {
-			if (--ap->eh_tries) {
-				spin_unlock_irqrestore(ap->lock, flags);
-				goto repeat;
-			}
-			ata_port_err(ap,
-				     "EH pending after %d tries, giving up\n",
-				     ATA_EH_MAX_TRIES);
-			ap->pflags &= ~ATA_PFLAG_EH_PENDING;
+	if (ap->pflags & ATA_PFLAG_EH_PENDING) {
+		if (--ap->eh_tries) {
+			spin_unlock_irqrestore(ap->lock, flags);
+			goto repeat;
 		}
+		ata_port_err(ap,
+			     "EH pending after %d tries, giving up\n",
+			     ATA_EH_MAX_TRIES);
+		ap->pflags &= ~ATA_PFLAG_EH_PENDING;
+	}
 
-		/* this run is complete, make sure EH info is clear */
-		ata_for_each_link(link, ap, HOST_FIRST)
-			memset(&link->eh_info, 0, sizeof(link->eh_info));
+	/* this run is complete, make sure EH info is clear */
+	ata_for_each_link(link, ap, HOST_FIRST)
+		memset(&link->eh_info, 0, sizeof(link->eh_info));
 
-		/* end eh (clear host_eh_scheduled) while holding
-		 * ap->lock such that if exception occurs after this
-		 * point but before EH completion, SCSI midlayer will
-		 * re-initiate EH.
-		 */
-		ap->ops->end_eh(ap);
+	/*
+	 * end eh (clear host_eh_scheduled) while holding ap->lock such that if
+	 * exception occurs after this point but before EH completion, SCSI
+	 * midlayer will re-initiate EH.
+	 */
+	ap->ops->end_eh(ap);
 
-		spin_unlock_irqrestore(ap->lock, flags);
-		ata_eh_release(ap);
-	} else {
-		WARN_ON(ata_qc_from_tag(ap, ap->link.active_tag) == NULL);
-		ap->ops->eng_timeout(ap);
-	}
+	spin_unlock_irqrestore(ap->lock, flags);
+	ata_eh_release(ap);
 
 	scsi_eh_flush_done_q(&ap->eh_done_q);
 
@@ -912,8 +902,6 @@ void ata_qc_schedule_eh(struct ata_queued_cmd *qc)
 {
 	struct ata_port *ap = qc->ap;
 
-	WARN_ON(!ap->ops->error_handler);
-
 	qc->flags |= ATA_QCFLAG_EH;
 	ata_eh_set_pending(ap, 1);
 
@@ -934,8 +922,6 @@ void ata_qc_schedule_eh(struct ata_queued_cmd *qc)
  */
 void ata_std_sched_eh(struct ata_port *ap)
 {
-	WARN_ON(!ap->ops->error_handler);
-
 	if (ap->pflags & ATA_PFLAG_INITIALIZING)
 		return;
 
@@ -989,8 +975,6 @@ static int ata_do_link_abort(struct ata_port *ap, struct ata_link *link)
 	struct ata_queued_cmd *qc;
 	int tag, nr_aborted = 0;
 
-	WARN_ON(!ap->ops->error_handler);
-
 	/* we're gonna abort all commands, no need for fast drain */
 	ata_eh_set_pending(ap, 0);
 
@@ -1065,8 +1049,6 @@ EXPORT_SYMBOL_GPL(ata_port_abort);
  */
 static void __ata_port_freeze(struct ata_port *ap)
 {
-	WARN_ON(!ap->ops->error_handler);
-
 	if (ap->ops->freeze)
 		ap->ops->freeze(ap);
 
@@ -1091,8 +1073,6 @@ static void __ata_port_freeze(struct ata_port *ap)
  */
 int ata_port_freeze(struct ata_port *ap)
 {
-	WARN_ON(!ap->ops->error_handler);
-
 	__ata_port_freeze(ap);
 
 	return ata_port_abort(ap);
@@ -1112,9 +1092,6 @@ void ata_eh_freeze_port(struct ata_port *ap)
 {
 	unsigned long flags;
 
-	if (!ap->ops->error_handler)
-		return;
-
 	spin_lock_irqsave(ap->lock, flags);
 	__ata_port_freeze(ap);
 	spin_unlock_irqrestore(ap->lock, flags);
@@ -1134,9 +1111,6 @@ void ata_eh_thaw_port(struct ata_port *ap)
 {
 	unsigned long flags;
 
-	if (!ap->ops->error_handler)
-		return;
-
 	spin_lock_irqsave(ap->lock, flags);
 
 	ap->pflags &= ~ATA_PFLAG_FROZEN;
diff --git a/drivers/ata/libata-sata.c b/drivers/ata/libata-sata.c
index 5d393432fa06..cf401a54c9e1 100644
--- a/drivers/ata/libata-sata.c
+++ b/drivers/ata/libata-sata.c
@@ -1158,12 +1158,7 @@ EXPORT_SYMBOL_GPL(ata_sas_port_alloc);
  */
 int ata_sas_port_start(struct ata_port *ap)
 {
-	/*
-	 * the port is marked as frozen at allocation time, but if we don't
-	 * have new eh, we won't thaw it
-	 */
-	if (!ap->ops->error_handler)
-		ap->pflags &= ~ATA_PFLAG_FROZEN;
+	/* the port is marked as frozen at allocation time */
 	return 0;
 }
 EXPORT_SYMBOL_GPL(ata_sas_port_start);
diff --git a/drivers/ata/libata-scsi.c b/drivers/ata/libata-scsi.c
index f39d603f05bc..c3f745a4e5ea 100644
--- a/drivers/ata/libata-scsi.c
+++ b/drivers/ata/libata-scsi.c
@@ -709,47 +709,6 @@ static void ata_qc_set_pc_nbytes(struct ata_queued_cmd *qc)
 	qc->nbytes = scsi_bufflen(scmd) + qc->extrabytes;
 }
 
-/**
- *	ata_dump_status - user friendly display of error info
- *	@ap: the port in question
- *	@tf: ptr to filled out taskfile
- *
- *	Decode and dump the ATA error/status registers for the user so
- *	that they have some idea what really happened at the non
- *	make-believe layer.
- *
- *	LOCKING:
- *	inherited from caller
- */
-static void ata_dump_status(struct ata_port *ap, struct ata_taskfile *tf)
-{
-	u8 stat = tf->status, err = tf->error;
-
-	if (stat & ATA_BUSY) {
-		ata_port_warn(ap, "status=0x%02x {Busy} ", stat);
-	} else {
-		ata_port_warn(ap, "status=0x%02x { %s%s%s%s%s%s%s} ", stat,
-			      stat & ATA_DRDY ? "DriveReady " : "",
-			      stat & ATA_DF ? "DeviceFault " : "",
-			      stat & ATA_DSC ? "SeekComplete " : "",
-			      stat & ATA_DRQ ? "DataRequest " : "",
-			      stat & ATA_CORR ? "CorrectedError " : "",
-			      stat & ATA_SENSE ? "Sense " : "",
-			      stat & ATA_ERR ? "Error " : "");
-		if (err)
-			ata_port_warn(ap, "error=0x%02x {%s%s%s%s%s%s", err,
-				      err & ATA_ABORTED ?
-				      "DriveStatusError " : "",
-				      err & ATA_ICRC ?
-				      (err & ATA_ABORTED ?
-				       "BadCRC " : "Sector ") : "",
-				      err & ATA_UNC ? "UncorrectableError " : "",
-				      err & ATA_IDNF ? "SectorIdNotFound " : "",
-				      err & ATA_TRK0NF ? "TrackZeroNotFound " : "",
-				      err & ATA_AMNF ? "AddrMarkNotFound " : "");
-	}
-}
-
 /**
  *	ata_to_sense_error - convert ATA error to SCSI error
  *	@id: ATA device number
@@ -758,7 +717,6 @@ static void ata_dump_status(struct ata_port *ap, struct ata_taskfile *tf)
  *	@sk: the sense key we'll fill out
  *	@asc: the additional sense code we'll fill out
  *	@ascq: the additional sense code qualifier we'll fill out
- *	@verbose: be verbose
  *
  *	Converts an ATA error into a SCSI error.  Fill out pointers to
  *	SK, ASC, and ASCQ bytes for later use in fixed or descriptor
@@ -768,7 +726,7 @@ static void ata_dump_status(struct ata_port *ap, struct ata_taskfile *tf)
  *	spin_lock_irqsave(host lock)
  */
 static void ata_to_sense_error(unsigned id, u8 drv_stat, u8 drv_err, u8 *sk,
-			       u8 *asc, u8 *ascq, int verbose)
+			       u8 *asc, u8 *ascq)
 {
 	int i;
 
@@ -847,7 +805,7 @@ static void ata_to_sense_error(unsigned id, u8 drv_stat, u8 drv_err, u8 *sk,
 				*sk = sense_table[i][1];
 				*asc = sense_table[i][2];
 				*ascq = sense_table[i][3];
-				goto translate_done;
+				return;
 			}
 		}
 	}
@@ -862,7 +820,7 @@ static void ata_to_sense_error(unsigned id, u8 drv_stat, u8 drv_err, u8 *sk,
 			*sk = stat_table[i][1];
 			*asc = stat_table[i][2];
 			*ascq = stat_table[i][3];
-			goto translate_done;
+			return;
 		}
 	}
 
@@ -873,12 +831,6 @@ static void ata_to_sense_error(unsigned id, u8 drv_stat, u8 drv_err, u8 *sk,
 	*sk = ABORTED_COMMAND;
 	*asc = 0x00;
 	*ascq = 0x00;
-
- translate_done:
-	if (verbose)
-		pr_err("ata%u: translated ATA stat/err 0x%02x/%02x to SCSI SK/ASC/ASCQ 0x%x/%02x/%02x\n",
-		       id, drv_stat, drv_err, *sk, *asc, *ascq);
-	return;
 }
 
 /*
@@ -904,7 +856,6 @@ static void ata_gen_passthru_sense(struct ata_queued_cmd *qc)
 	struct ata_taskfile *tf = &qc->result_tf;
 	unsigned char *sb = cmd->sense_buffer;
 	unsigned char *desc = sb + 8;
-	int verbose = qc->ap->ops->error_handler == NULL;
 	u8 sense_key, asc, ascq;
 
 	memset(sb, 0, SCSI_SENSE_BUFFERSIZE);
@@ -916,7 +867,7 @@ static void ata_gen_passthru_sense(struct ata_queued_cmd *qc)
 	if (qc->err_mask ||
 	    tf->status & (ATA_BUSY | ATA_DF | ATA_ERR | ATA_DRQ)) {
 		ata_to_sense_error(qc->ap->print_id, tf->status, tf->error,
-				   &sense_key, &asc, &ascq, verbose);
+				   &sense_key, &asc, &ascq);
 		ata_scsi_set_sense(qc->dev, cmd, sense_key, asc, ascq);
 	} else {
 		/*
@@ -999,7 +950,6 @@ static void ata_gen_ata_sense(struct ata_queued_cmd *qc)
 	struct scsi_cmnd *cmd = qc->scsicmd;
 	struct ata_taskfile *tf = &qc->result_tf;
 	unsigned char *sb = cmd->sense_buffer;
-	int verbose = qc->ap->ops->error_handler == NULL;
 	u64 block;
 	u8 sense_key, asc, ascq;
 
@@ -1017,7 +967,7 @@ static void ata_gen_ata_sense(struct ata_queued_cmd *qc)
 	if (qc->err_mask ||
 	    tf->status & (ATA_BUSY | ATA_DF | ATA_ERR | ATA_DRQ)) {
 		ata_to_sense_error(qc->ap->print_id, tf->status, tf->error,
-				   &sense_key, &asc, &ascq, verbose);
+				   &sense_key, &asc, &ascq);
 		ata_scsi_set_sense(dev, cmd, sense_key, asc, ascq);
 	} else {
 		/* Could not decode error */
@@ -1179,9 +1129,6 @@ void ata_scsi_slave_destroy(struct scsi_device *sdev)
 	unsigned long flags;
 	struct ata_device *dev;
 
-	if (!ap->ops->error_handler)
-		return;
-
 	spin_lock_irqsave(ap->lock, flags);
 	dev = __ata_scsi_find_dev(ap, sdev);
 	if (dev && dev->sdev) {
@@ -1668,7 +1615,6 @@ static void ata_qc_done(struct ata_queued_cmd *qc)
 
 static void ata_scsi_qc_complete(struct ata_queued_cmd *qc)
 {
-	struct ata_port *ap = qc->ap;
 	struct scsi_cmnd *cmd = qc->scsicmd;
 	u8 *cdb = cmd->cmnd;
 	int need_sense = (qc->err_mask != 0) &&
@@ -1692,9 +1638,6 @@ static void ata_scsi_qc_complete(struct ata_queued_cmd *qc)
 		/* Keep the SCSI ML and status byte, clear host byte. */
 		cmd->result &= 0x0000ffff;
 
-	if (need_sense && !ap->ops->error_handler)
-		ata_dump_status(ap, &qc->result_tf);
-
 	ata_qc_done(qc);
 }
 
@@ -2601,71 +2544,6 @@ static unsigned int ata_scsiop_report_luns(struct ata_scsi_args *args, u8 *rbuf)
 	return 0;
 }
 
-static void atapi_sense_complete(struct ata_queued_cmd *qc)
-{
-	if (qc->err_mask && ((qc->err_mask & AC_ERR_DEV) == 0)) {
-		/* FIXME: not quite right; we don't want the
-		 * translation of taskfile registers into
-		 * a sense descriptors, since that's only
-		 * correct for ATA, not ATAPI
-		 */
-		ata_gen_passthru_sense(qc);
-	}
-
-	ata_qc_done(qc);
-}
-
-/* is it pointless to prefer PIO for "safety reasons"? */
-static inline int ata_pio_use_silly(struct ata_port *ap)
-{
-	return (ap->flags & ATA_FLAG_PIO_DMA);
-}
-
-static void atapi_request_sense(struct ata_queued_cmd *qc)
-{
-	struct ata_port *ap = qc->ap;
-	struct scsi_cmnd *cmd = qc->scsicmd;
-
-	memset(cmd->sense_buffer, 0, SCSI_SENSE_BUFFERSIZE);
-
-#ifdef CONFIG_ATA_SFF
-	if (ap->ops->sff_tf_read)
-		ap->ops->sff_tf_read(ap, &qc->tf);
-#endif
-
-	/* fill these in, for the case where they are -not- overwritten */
-	cmd->sense_buffer[0] = 0x70;
-	cmd->sense_buffer[2] = qc->tf.error >> 4;
-
-	ata_qc_reinit(qc);
-
-	/* setup sg table and init transfer direction */
-	sg_init_one(&qc->sgent, cmd->sense_buffer, SCSI_SENSE_BUFFERSIZE);
-	ata_sg_init(qc, &qc->sgent, 1);
-	qc->dma_dir = DMA_FROM_DEVICE;
-
-	memset(&qc->cdb, 0, qc->dev->cdb_len);
-	qc->cdb[0] = REQUEST_SENSE;
-	qc->cdb[4] = SCSI_SENSE_BUFFERSIZE;
-
-	qc->tf.flags |= ATA_TFLAG_ISADDR | ATA_TFLAG_DEVICE;
-	qc->tf.command = ATA_CMD_PACKET;
-
-	if (ata_pio_use_silly(ap)) {
-		qc->tf.protocol = ATAPI_PROT_DMA;
-		qc->tf.feature |= ATAPI_PKT_DMA;
-	} else {
-		qc->tf.protocol = ATAPI_PROT_PIO;
-		qc->tf.lbam = SCSI_SENSE_BUFFERSIZE;
-		qc->tf.lbah = 0;
-	}
-	qc->nbytes = SCSI_SENSE_BUFFERSIZE;
-
-	qc->complete_fn = atapi_sense_complete;
-
-	ata_qc_issue(qc);
-}
-
 /*
  * ATAPI devices typically report zero for their SCSI version, and sometimes
  * deviate from the spec WRT response data format.  If SCSI version is
@@ -2691,9 +2569,8 @@ static void atapi_qc_complete(struct ata_queued_cmd *qc)
 	struct scsi_cmnd *cmd = qc->scsicmd;
 	unsigned int err_mask = qc->err_mask;
 
-	/* handle completion from new EH */
-	if (unlikely(qc->ap->ops->error_handler &&
-		     (err_mask || qc->flags & ATA_QCFLAG_SENSE_VALID))) {
+	/* handle completion from EH */
+	if (unlikely(err_mask || qc->flags & ATA_QCFLAG_SENSE_VALID)) {
 
 		if (!(qc->flags & ATA_QCFLAG_SENSE_VALID)) {
 			/* FIXME: not quite right; we don't want the
@@ -2725,23 +2602,10 @@ static void atapi_qc_complete(struct ata_queued_cmd *qc)
 		return;
 	}
 
-	/* successful completion or old EH failure path */
-	if (unlikely(err_mask & AC_ERR_DEV)) {
-		cmd->result = SAM_STAT_CHECK_CONDITION;
-		atapi_request_sense(qc);
-		return;
-	} else if (unlikely(err_mask)) {
-		/* FIXME: not quite right; we don't want the
-		 * translation of taskfile registers into
-		 * a sense descriptors, since that's only
-		 * correct for ATA, not ATAPI
-		 */
-		ata_gen_passthru_sense(qc);
-	} else {
-		if (cmd->cmnd[0] == INQUIRY && (cmd->cmnd[1] & 0x03) == 0)
-			atapi_fixup_inquiry(cmd);
-		cmd->result = SAM_STAT_GOOD;
-	}
+	/* successful completion path */
+	if (cmd->cmnd[0] == INQUIRY && (cmd->cmnd[1] & 0x03) == 0)
+		atapi_fixup_inquiry(cmd);
+	cmd->result = SAM_STAT_GOOD;
 
 	ata_qc_done(qc);
 }
@@ -4790,9 +4654,6 @@ int ata_scsi_user_scan(struct Scsi_Host *shost, unsigned int channel,
 	unsigned long flags;
 	int devno, rc = 0;
 
-	if (!ap->ops->error_handler)
-		return -EOPNOTSUPP;
-
 	if (lun != SCAN_WILD_CARD && lun)
 		return -EINVAL;
 
diff --git a/drivers/ata/libata-sff.c b/drivers/ata/libata-sff.c
index ac55dfc2d85f..8fcc622fcb3d 100644
--- a/drivers/ata/libata-sff.c
+++ b/drivers/ata/libata-sff.c
@@ -883,31 +883,21 @@ static void ata_hsm_qc_complete(struct ata_queued_cmd *qc, int in_wq)
 {
 	struct ata_port *ap = qc->ap;
 
-	if (ap->ops->error_handler) {
-		if (in_wq) {
-			/* EH might have kicked in while host lock is
-			 * released.
-			 */
-			qc = ata_qc_from_tag(ap, qc->tag);
-			if (qc) {
-				if (likely(!(qc->err_mask & AC_ERR_HSM))) {
-					ata_sff_irq_on(ap);
-					ata_qc_complete(qc);
-				} else
-					ata_port_freeze(ap);
-			}
-		} else {
-			if (likely(!(qc->err_mask & AC_ERR_HSM)))
+	if (in_wq) {
+		/* EH might have kicked in while host lock is released. */
+		qc = ata_qc_from_tag(ap, qc->tag);
+		if (qc) {
+			if (likely(!(qc->err_mask & AC_ERR_HSM))) {
+				ata_sff_irq_on(ap);
 				ata_qc_complete(qc);
-			else
+			} else
 				ata_port_freeze(ap);
 		}
 	} else {
-		if (in_wq) {
-			ata_sff_irq_on(ap);
-			ata_qc_complete(qc);
-		} else
+		if (likely(!(qc->err_mask & AC_ERR_HSM)))
 			ata_qc_complete(qc);
+		else
+			ata_port_freeze(ap);
 	}
 }
 
diff --git a/include/linux/libata.h b/include/linux/libata.h
index 8d510fb00591..d61e5465076e 100644
--- a/include/linux/libata.h
+++ b/include/linux/libata.h
@@ -1785,7 +1785,7 @@ static inline struct ata_queued_cmd *ata_qc_from_tag(struct ata_port *ap,
 {
 	struct ata_queued_cmd *qc = __ata_qc_from_tag(ap, tag);
 
-	if (unlikely(!qc) || !ap->ops->error_handler)
+	if (unlikely(!qc))
 		return qc;
 
 	if ((qc->flags & (ATA_QCFLAG_ACTIVE |
-- 
cgit v1.2.3


From 43aa43351bb551a7732c1b9f6e8ebb9a6f30b063 Mon Sep 17 00:00:00 2001
From: Hannes Reinecke <hare@suse.de>
Date: Mon, 31 Jul 2023 16:34:13 +0200
Subject: ata,scsi: remove ata_sas_port_{start,stop} callbacks

Callbacks are empty now, so remove them.

Also, remove the call to ap->ops->port_start() in ata_sas_port_init(),
as this would otherwise cause a NULL pointer dereference, now when the
callback is gone.

Signed-off-by: Hannes Reinecke <hare@suse.de>
[niklas: remove the call to ap->ops->port_start() in ata_sas_port_init()]
Signed-off-by: Niklas Cassel <niklas.cassel@wdc.com>
Reviewed-by: Jason Yan <yanaijie@huawei.com>
Reviewed-by: John Garry <john.g.garry@oracle.com>
Reviewed-by: Martin K. Petersen <martin.petersen@oracle.com>
Signed-off-by: Damien Le Moal <dlemoal@kernel.org>
---
 drivers/ata/libata-sata.c     | 38 --------------------------------------
 drivers/scsi/libsas/sas_ata.c |  2 --
 include/linux/libata.h        |  2 --
 3 files changed, 42 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/ata/libata-sata.c b/drivers/ata/libata-sata.c
index cf401a54c9e1..ce392b5b5930 100644
--- a/drivers/ata/libata-sata.c
+++ b/drivers/ata/libata-sata.c
@@ -1144,40 +1144,6 @@ struct ata_port *ata_sas_port_alloc(struct ata_host *host,
 }
 EXPORT_SYMBOL_GPL(ata_sas_port_alloc);
 
-/**
- *	ata_sas_port_start - Set port up for dma.
- *	@ap: Port to initialize
- *
- *	Called just after data structures for each port are
- *	initialized.
- *
- *	May be used as the port_start() entry in ata_port_operations.
- *
- *	LOCKING:
- *	Inherited from caller.
- */
-int ata_sas_port_start(struct ata_port *ap)
-{
-	/* the port is marked as frozen at allocation time */
-	return 0;
-}
-EXPORT_SYMBOL_GPL(ata_sas_port_start);
-
-/**
- *	ata_sas_port_stop - Undo ata_sas_port_start()
- *	@ap: Port to shut down
- *
- *	May be used as the port_stop() entry in ata_port_operations.
- *
- *	LOCKING:
- *	Inherited from caller.
- */
-
-void ata_sas_port_stop(struct ata_port *ap)
-{
-}
-EXPORT_SYMBOL_GPL(ata_sas_port_stop);
-
 /**
  * ata_sas_async_probe - simply schedule probing and return
  * @ap: Port to probe
@@ -1211,10 +1177,6 @@ EXPORT_SYMBOL_GPL(ata_sas_sync_probe);
 
 int ata_sas_port_init(struct ata_port *ap)
 {
-	int rc = ap->ops->port_start(ap);
-
-	if (rc)
-		return rc;
 	ap->print_id = atomic_inc_return(&ata_print_id);
 	return 0;
 }
diff --git a/drivers/scsi/libsas/sas_ata.c b/drivers/scsi/libsas/sas_ata.c
index 77714a495cbb..7ead1f1be97f 100644
--- a/drivers/scsi/libsas/sas_ata.c
+++ b/drivers/scsi/libsas/sas_ata.c
@@ -565,8 +565,6 @@ static struct ata_port_operations sas_sata_ops = {
 	.qc_prep		= ata_noop_qc_prep,
 	.qc_issue		= sas_ata_qc_issue,
 	.qc_fill_rtf		= sas_ata_qc_fill_rtf,
-	.port_start		= ata_sas_port_start,
-	.port_stop		= ata_sas_port_stop,
 	.set_dmamode		= sas_ata_set_dmamode,
 	.sched_eh		= sas_ata_sched_eh,
 	.end_eh			= sas_ata_end_eh,
diff --git a/include/linux/libata.h b/include/linux/libata.h
index d61e5465076e..f8f8b558a8be 100644
--- a/include/linux/libata.h
+++ b/include/linux/libata.h
@@ -1244,10 +1244,8 @@ extern struct ata_port *ata_sas_port_alloc(struct ata_host *,
 extern void ata_sas_async_probe(struct ata_port *ap);
 extern int ata_sas_sync_probe(struct ata_port *ap);
 extern int ata_sas_port_init(struct ata_port *);
-extern int ata_sas_port_start(struct ata_port *ap);
 extern int ata_sas_tport_add(struct device *parent, struct ata_port *ap);
 extern void ata_sas_tport_delete(struct ata_port *ap);
-extern void ata_sas_port_stop(struct ata_port *ap);
 extern int ata_sas_slave_configure(struct scsi_device *, struct ata_port *);
 extern int ata_sas_queuecmd(struct scsi_cmnd *cmd, struct ata_port *ap);
 extern void ata_tf_to_fis(const struct ata_taskfile *tf,
-- 
cgit v1.2.3


From 6c2fe21e08c28bbdb53ada668edea7df1aa9fb1e Mon Sep 17 00:00:00 2001
From: Hannes Reinecke <hare@suse.de>
Date: Mon, 31 Jul 2023 16:34:14 +0200
Subject: ata,scsi: remove ata_sas_port_destroy()

Is now a wrapper around kfree(), so call it directly.

Signed-off-by: Hannes Reinecke <hare@suse.de>
Signed-off-by: Niklas Cassel <niklas.cassel@wdc.com>
Reviewed-by: John Garry <john.g.garry@oracle.com>
Reviewed-by: Jason Yan <yanaijie@huawei.com>
Reviewed-by: Martin K. Petersen <martin.petersen@oracle.com>
Signed-off-by: Damien Le Moal <dlemoal@kernel.org>
---
 drivers/ata/libata-sata.c          | 14 --------------
 drivers/scsi/libsas/sas_ata.c      |  2 +-
 drivers/scsi/libsas/sas_discover.c |  2 +-
 include/linux/libata.h             |  1 -
 4 files changed, 2 insertions(+), 17 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/ata/libata-sata.c b/drivers/ata/libata-sata.c
index ce392b5b5930..c149a46fadaf 100644
--- a/drivers/ata/libata-sata.c
+++ b/drivers/ata/libata-sata.c
@@ -1194,20 +1194,6 @@ void ata_sas_tport_delete(struct ata_port *ap)
 }
 EXPORT_SYMBOL_GPL(ata_sas_tport_delete);
 
-/**
- *	ata_sas_port_destroy - Destroy a SATA port allocated by ata_sas_port_alloc
- *	@ap: SATA port to destroy
- *
- */
-
-void ata_sas_port_destroy(struct ata_port *ap)
-{
-	if (ap->ops->port_stop)
-		ap->ops->port_stop(ap);
-	kfree(ap);
-}
-EXPORT_SYMBOL_GPL(ata_sas_port_destroy);
-
 /**
  *	ata_sas_slave_configure - Default slave_config routine for libata devices
  *	@sdev: SCSI device to configure
diff --git a/drivers/scsi/libsas/sas_ata.c b/drivers/scsi/libsas/sas_ata.c
index 7ead1f1be97f..a2eb9a2191c0 100644
--- a/drivers/scsi/libsas/sas_ata.c
+++ b/drivers/scsi/libsas/sas_ata.c
@@ -619,7 +619,7 @@ int sas_ata_init(struct domain_device *found_dev)
 	return 0;
 
 destroy_port:
-	ata_sas_port_destroy(ap);
+	kfree(ap);
 free_host:
 	ata_host_put(ata_host);
 	return rc;
diff --git a/drivers/scsi/libsas/sas_discover.c b/drivers/scsi/libsas/sas_discover.c
index 8c6afe724944..07e18cdb85c7 100644
--- a/drivers/scsi/libsas/sas_discover.c
+++ b/drivers/scsi/libsas/sas_discover.c
@@ -301,7 +301,7 @@ void sas_free_device(struct kref *kref)
 
 	if (dev_is_sata(dev) && dev->sata_dev.ap) {
 		ata_sas_tport_delete(dev->sata_dev.ap);
-		ata_sas_port_destroy(dev->sata_dev.ap);
+		kfree(dev->sata_dev.ap);
 		ata_host_put(dev->sata_dev.ata_host);
 		dev->sata_dev.ata_host = NULL;
 		dev->sata_dev.ap = NULL;
diff --git a/include/linux/libata.h b/include/linux/libata.h
index f8f8b558a8be..1d1fd16a1492 100644
--- a/include/linux/libata.h
+++ b/include/linux/libata.h
@@ -1238,7 +1238,6 @@ extern int sata_link_debounce(struct ata_link *link,
 extern int sata_link_scr_lpm(struct ata_link *link, enum ata_lpm_policy policy,
 			     bool spm_wakeup);
 extern int ata_slave_link_init(struct ata_port *ap);
-extern void ata_sas_port_destroy(struct ata_port *);
 extern struct ata_port *ata_sas_port_alloc(struct ata_host *,
 					   struct ata_port_info *, struct Scsi_Host *);
 extern void ata_sas_async_probe(struct ata_port *ap);
-- 
cgit v1.2.3


From 8ac161ea2b3709b789c192de7da31a58aec9d7ee Mon Sep 17 00:00:00 2001
From: Hannes Reinecke <hare@suse.de>
Date: Mon, 31 Jul 2023 16:34:15 +0200
Subject: ata: libata-sata: remove ata_sas_sync_probe()

Unused.

Signed-off-by: Hannes Reinecke <hare@suse.de>
Signed-off-by: Niklas Cassel <niklas.cassel@wdc.com>
Reviewed-by: Jason Yan <yanaijie@huawei.com>
Reviewed-by: John Garry <john.g.garry@oracle.com>
Reviewed-by: Martin K. Petersen <martin.petersen@oracle.com>
Signed-off-by: Damien Le Moal <dlemoal@kernel.org>
---
 drivers/ata/libata-sata.c | 7 -------
 include/linux/libata.h    | 1 -
 2 files changed, 8 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/ata/libata-sata.c b/drivers/ata/libata-sata.c
index c149a46fadaf..04ab2bedcaa2 100644
--- a/drivers/ata/libata-sata.c
+++ b/drivers/ata/libata-sata.c
@@ -1157,13 +1157,6 @@ void ata_sas_async_probe(struct ata_port *ap)
 }
 EXPORT_SYMBOL_GPL(ata_sas_async_probe);
 
-int ata_sas_sync_probe(struct ata_port *ap)
-{
-	return ata_port_probe(ap);
-}
-EXPORT_SYMBOL_GPL(ata_sas_sync_probe);
-
-
 /**
  *	ata_sas_port_init - Initialize a SATA device
  *	@ap: SATA port to initialize
diff --git a/include/linux/libata.h b/include/linux/libata.h
index 1d1fd16a1492..d08dde8c8ad2 100644
--- a/include/linux/libata.h
+++ b/include/linux/libata.h
@@ -1241,7 +1241,6 @@ extern int ata_slave_link_init(struct ata_port *ap);
 extern struct ata_port *ata_sas_port_alloc(struct ata_host *,
 					   struct ata_port_info *, struct Scsi_Host *);
 extern void ata_sas_async_probe(struct ata_port *ap);
-extern int ata_sas_sync_probe(struct ata_port *ap);
 extern int ata_sas_port_init(struct ata_port *);
 extern int ata_sas_tport_add(struct device *parent, struct ata_port *ap);
 extern void ata_sas_tport_delete(struct ata_port *ap);
-- 
cgit v1.2.3


From a76f1b637ce92c2b8d5da912826079010b11f138 Mon Sep 17 00:00:00 2001
From: Hannes Reinecke <hare@suse.de>
Date: Mon, 31 Jul 2023 16:34:17 +0200
Subject: ata,scsi: cleanup __ata_port_probe()

Rename __ata_port_probe() to ata_port_probe() and drop the wrapper
ata_sas_async_probe().

Signed-off-by: Hannes Reinecke <hare@suse.de>
Signed-off-by: Niklas Cassel <niklas.cassel@wdc.com>
Reviewed-by: Jason Yan <yanaijie@huawei.com>
Reviewed-by: John Garry <john.g.garry@oracle.com>
Reviewed-by: Martin K. Petersen <martin.petersen@oracle.com>
Signed-off-by: Damien Le Moal <dlemoal@kernel.org>
---
 drivers/ata/libata-core.c             |  5 +++--
 drivers/ata/libata-sata.c             | 13 -------------
 drivers/ata/libata.h                  |  2 --
 drivers/scsi/hisi_sas/hisi_sas_main.c |  2 +-
 drivers/scsi/libsas/sas_ata.c         |  2 +-
 include/linux/libata.h                |  2 +-
 6 files changed, 6 insertions(+), 20 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/ata/libata-core.c b/drivers/ata/libata-core.c
index c6e624d27f46..fdd4804d0bf2 100644
--- a/drivers/ata/libata-core.c
+++ b/drivers/ata/libata-core.c
@@ -5865,7 +5865,7 @@ void ata_host_init(struct ata_host *host, struct device *dev,
 }
 EXPORT_SYMBOL_GPL(ata_host_init);
 
-void __ata_port_probe(struct ata_port *ap)
+void ata_port_probe(struct ata_port *ap)
 {
 	struct ata_eh_info *ehi = &ap->link.eh_info;
 	unsigned long flags;
@@ -5883,6 +5883,7 @@ void __ata_port_probe(struct ata_port *ap)
 
 	spin_unlock_irqrestore(ap->lock, flags);
 }
+EXPORT_SYMBOL_GPL(ata_port_probe);
 
 static void async_port_probe(void *data, async_cookie_t cookie)
 {
@@ -5898,7 +5899,7 @@ static void async_port_probe(void *data, async_cookie_t cookie)
 	if (!(ap->host->flags & ATA_HOST_PARALLEL_SCAN) && ap->port_no != 0)
 		async_synchronize_cookie(cookie);
 
-	__ata_port_probe(ap);
+	ata_port_probe(ap);
 	ata_port_wait_eh(ap);
 
 	/* in order to keep device order, we need to synchronize at this point */
diff --git a/drivers/ata/libata-sata.c b/drivers/ata/libata-sata.c
index 04ab2bedcaa2..eb81641d383a 100644
--- a/drivers/ata/libata-sata.c
+++ b/drivers/ata/libata-sata.c
@@ -1144,19 +1144,6 @@ struct ata_port *ata_sas_port_alloc(struct ata_host *host,
 }
 EXPORT_SYMBOL_GPL(ata_sas_port_alloc);
 
-/**
- * ata_sas_async_probe - simply schedule probing and return
- * @ap: Port to probe
- *
- * For batch scheduling of probe for sas attached ata devices, assumes
- * the port has already been through ata_sas_port_init()
- */
-void ata_sas_async_probe(struct ata_port *ap)
-{
-	__ata_port_probe(ap);
-}
-EXPORT_SYMBOL_GPL(ata_sas_async_probe);
-
 /**
  *	ata_sas_port_init - Initialize a SATA device
  *	@ap: SATA port to initialize
diff --git a/drivers/ata/libata.h b/drivers/ata/libata.h
index cf993885d2b2..1ec9b4427b84 100644
--- a/drivers/ata/libata.h
+++ b/drivers/ata/libata.h
@@ -78,8 +78,6 @@ extern int ata_task_ioctl(struct scsi_device *scsidev, void __user *arg);
 extern int ata_cmd_ioctl(struct scsi_device *scsidev, void __user *arg);
 extern struct ata_port *ata_port_alloc(struct ata_host *host);
 extern const char *sata_spd_string(unsigned int spd);
-extern int ata_port_probe(struct ata_port *ap);
-extern void __ata_port_probe(struct ata_port *ap);
 extern unsigned int ata_read_log_page(struct ata_device *dev, u8 log,
 				      u8 page, void *buf, unsigned int sectors);
 
diff --git a/drivers/scsi/hisi_sas/hisi_sas_main.c b/drivers/scsi/hisi_sas/hisi_sas_main.c
index 8f22ece957bd..b2f07c6f30e7 100644
--- a/drivers/scsi/hisi_sas/hisi_sas_main.c
+++ b/drivers/scsi/hisi_sas/hisi_sas_main.c
@@ -787,7 +787,7 @@ static int hisi_sas_init_device(struct domain_device *device)
 		 * However we don't need to issue a hard reset here for these
 		 * reasons:
 		 * a. When probing the device, libsas/libata already issues a
-		 * hard reset in sas_probe_sata() -> ata_sas_async_probe().
+		 * hard reset in sas_probe_sata() -> ata_port_probe().
 		 * Note that in hisi_sas_debug_I_T_nexus_reset() we take care
 		 * to issue a hard reset by checking the dev status (== INIT).
 		 * b. When resetting the controller, this is simply unnecessary.
diff --git a/drivers/scsi/libsas/sas_ata.c b/drivers/scsi/libsas/sas_ata.c
index a2eb9a2191c0..d6bb37b3974a 100644
--- a/drivers/scsi/libsas/sas_ata.c
+++ b/drivers/scsi/libsas/sas_ata.c
@@ -653,7 +653,7 @@ void sas_probe_sata(struct asd_sas_port *port)
 		if (!dev_is_sata(dev))
 			continue;
 
-		ata_sas_async_probe(dev->sata_dev.ap);
+		ata_port_probe(dev->sata_dev.ap);
 	}
 	mutex_unlock(&port->ha->disco_mutex);
 
diff --git a/include/linux/libata.h b/include/linux/libata.h
index d08dde8c8ad2..7468a330fc77 100644
--- a/include/linux/libata.h
+++ b/include/linux/libata.h
@@ -1240,7 +1240,7 @@ extern int sata_link_scr_lpm(struct ata_link *link, enum ata_lpm_policy policy,
 extern int ata_slave_link_init(struct ata_port *ap);
 extern struct ata_port *ata_sas_port_alloc(struct ata_host *,
 					   struct ata_port_info *, struct Scsi_Host *);
-extern void ata_sas_async_probe(struct ata_port *ap);
+extern void ata_port_probe(struct ata_port *ap);
 extern int ata_sas_port_init(struct ata_port *);
 extern int ata_sas_tport_add(struct device *parent, struct ata_port *ap);
 extern void ata_sas_tport_delete(struct ata_port *ap);
-- 
cgit v1.2.3


From 541528170a5cb1342c378dfd46b04dfe024dbc7a Mon Sep 17 00:00:00 2001
From: Niklas Cassel <niklas.cassel@wdc.com>
Date: Mon, 31 Jul 2023 16:34:18 +0200
Subject: ata,scsi: remove ata_sas_port_init()

ata_sas_port_init() now only contains a single initialization.

Move this single initialization to ata_sas_port_alloc(), since:
1) ata_sas_port_alloc() already initializes some of the struct members.
2) ata_sas_port_alloc() is only used by libsas.

Suggested-by: John Garry <john.g.garry@oracle.com>
Signed-off-by: Niklas Cassel <niklas.cassel@wdc.com>
Reviewed-by: John Garry <john.g.garry@oracle.com>
Reviewed-by: Martin K. Petersen <martin.petersen@oracle.com>
Signed-off-by: Damien Le Moal <dlemoal@kernel.org>
---
 drivers/ata/libata-sata.c     | 19 +------------------
 drivers/scsi/libsas/sas_ata.c |  3 ---
 include/linux/libata.h        |  1 -
 3 files changed, 1 insertion(+), 22 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/ata/libata-sata.c b/drivers/ata/libata-sata.c
index eb81641d383a..5d31c08be013 100644
--- a/drivers/ata/libata-sata.c
+++ b/drivers/ata/libata-sata.c
@@ -1139,29 +1139,12 @@ struct ata_port *ata_sas_port_alloc(struct ata_host *host,
 	ap->flags |= port_info->flags;
 	ap->ops = port_info->port_ops;
 	ap->cbl = ATA_CBL_SATA;
+	ap->print_id = atomic_inc_return(&ata_print_id);
 
 	return ap;
 }
 EXPORT_SYMBOL_GPL(ata_sas_port_alloc);
 
-/**
- *	ata_sas_port_init - Initialize a SATA device
- *	@ap: SATA port to initialize
- *
- *	LOCKING:
- *	PCI/etc. bus probe sem.
- *
- *	RETURNS:
- *	Zero on success, non-zero on error.
- */
-
-int ata_sas_port_init(struct ata_port *ap)
-{
-	ap->print_id = atomic_inc_return(&ata_print_id);
-	return 0;
-}
-EXPORT_SYMBOL_GPL(ata_sas_port_init);
-
 int ata_sas_tport_add(struct device *parent, struct ata_port *ap)
 {
 	return ata_tport_add(parent, ap);
diff --git a/drivers/scsi/libsas/sas_ata.c b/drivers/scsi/libsas/sas_ata.c
index d6bb37b3974a..cd16a1ac379d 100644
--- a/drivers/scsi/libsas/sas_ata.c
+++ b/drivers/scsi/libsas/sas_ata.c
@@ -605,9 +605,6 @@ int sas_ata_init(struct domain_device *found_dev)
 	ap->private_data = found_dev;
 	ap->cbl = ATA_CBL_SATA;
 	ap->scsi_host = shost;
-	rc = ata_sas_port_init(ap);
-	if (rc)
-		goto destroy_port;
 
 	rc = ata_sas_tport_add(ata_host->dev, ap);
 	if (rc)
diff --git a/include/linux/libata.h b/include/linux/libata.h
index 7468a330fc77..0980992c54c2 100644
--- a/include/linux/libata.h
+++ b/include/linux/libata.h
@@ -1241,7 +1241,6 @@ extern int ata_slave_link_init(struct ata_port *ap);
 extern struct ata_port *ata_sas_port_alloc(struct ata_host *,
 					   struct ata_port_info *, struct Scsi_Host *);
 extern void ata_port_probe(struct ata_port *ap);
-extern int ata_sas_port_init(struct ata_port *);
 extern int ata_sas_tport_add(struct device *parent, struct ata_port *ap);
 extern void ata_sas_tport_delete(struct ata_port *ap);
 extern int ata_sas_slave_configure(struct scsi_device *, struct ata_port *);
-- 
cgit v1.2.3


From 89329c7384ef56c407269157e30e781f55c3c4d2 Mon Sep 17 00:00:00 2001
From: Niklas Cassel <niklas.cassel@wdc.com>
Date: Mon, 31 Jul 2023 16:34:20 +0200
Subject: ata: libata-core: remove ata_bus_probe()

Remove ata_bus_probe() as it is unused.

Also, remove references to ata_bus_probe and port_disable in
Documentation/driver-api/libata.rst, as neither exist anymore.

Signed-off-by: Niklas Cassel <niklas.cassel@wdc.com>
Reviewed-by: John Garry <john.g.garry@oracle.com>
Reviewed-by: Jason Yan <yanaijie@huawei.com>
Reviewed-by: Martin K. Petersen <martin.petersen@oracle.com>
Signed-off-by: Damien Le Moal <dlemoal@kernel.org>
---
 Documentation/driver-api/libata.rst |  16 -----
 drivers/ata/libata-core.c           | 138 ------------------------------------
 drivers/ata/libata.h                |   1 -
 include/linux/libata.h              |   1 -
 4 files changed, 156 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/driver-api/libata.rst b/Documentation/driver-api/libata.rst
index 311af516a3fd..eecb8b81e185 100644
--- a/Documentation/driver-api/libata.rst
+++ b/Documentation/driver-api/libata.rst
@@ -32,22 +32,6 @@ register blocks.
 :c:type:`struct ata_port_operations <ata_port_operations>`
 ----------------------------------------------------------
 
-Disable ATA port
-~~~~~~~~~~~~~~~~
-
-::
-
-    void (*port_disable) (struct ata_port *);
-
-
-Called from :c:func:`ata_bus_probe` error path, as well as when unregistering
-from the SCSI module (rmmod, hot unplug). This function should do
-whatever needs to be done to take the port out of use. In most cases,
-:c:func:`ata_port_disable` can be used as this hook.
-
-Called from :c:func:`ata_bus_probe` on a failed probe. Called from
-:c:func:`ata_scsi_release`.
-
 Post-IDENTIFY device configuration
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
diff --git a/drivers/ata/libata-core.c b/drivers/ata/libata-core.c
index fdd4804d0bf2..988d1fc451be 100644
--- a/drivers/ata/libata-core.c
+++ b/drivers/ata/libata-core.c
@@ -3057,144 +3057,6 @@ int ata_cable_sata(struct ata_port *ap)
 }
 EXPORT_SYMBOL_GPL(ata_cable_sata);
 
-/**
- *	ata_bus_probe - Reset and probe ATA bus
- *	@ap: Bus to probe
- *
- *	Master ATA bus probing function.  Initiates a hardware-dependent
- *	bus reset, then attempts to identify any devices found on
- *	the bus.
- *
- *	LOCKING:
- *	PCI/etc. bus probe sem.
- *
- *	RETURNS:
- *	Zero on success, negative errno otherwise.
- */
-
-int ata_bus_probe(struct ata_port *ap)
-{
-	unsigned int classes[ATA_MAX_DEVICES];
-	int tries[ATA_MAX_DEVICES];
-	int rc;
-	struct ata_device *dev;
-
-	ata_for_each_dev(dev, &ap->link, ALL)
-		tries[dev->devno] = ATA_PROBE_MAX_TRIES;
-
- retry:
-	ata_for_each_dev(dev, &ap->link, ALL) {
-		/* If we issue an SRST then an ATA drive (not ATAPI)
-		 * may change configuration and be in PIO0 timing. If
-		 * we do a hard reset (or are coming from power on)
-		 * this is true for ATA or ATAPI. Until we've set a
-		 * suitable controller mode we should not touch the
-		 * bus as we may be talking too fast.
-		 */
-		dev->pio_mode = XFER_PIO_0;
-		dev->dma_mode = 0xff;
-
-		/* If the controller has a pio mode setup function
-		 * then use it to set the chipset to rights. Don't
-		 * touch the DMA setup as that will be dealt with when
-		 * configuring devices.
-		 */
-		if (ap->ops->set_piomode)
-			ap->ops->set_piomode(ap, dev);
-	}
-
-	/* reset and determine device classes */
-	ap->ops->phy_reset(ap);
-
-	ata_for_each_dev(dev, &ap->link, ALL) {
-		if (dev->class != ATA_DEV_UNKNOWN)
-			classes[dev->devno] = dev->class;
-		else
-			classes[dev->devno] = ATA_DEV_NONE;
-
-		dev->class = ATA_DEV_UNKNOWN;
-	}
-
-	/* read IDENTIFY page and configure devices. We have to do the identify
-	   specific sequence bass-ackwards so that PDIAG- is released by
-	   the slave device */
-
-	ata_for_each_dev(dev, &ap->link, ALL_REVERSE) {
-		if (tries[dev->devno])
-			dev->class = classes[dev->devno];
-
-		if (!ata_dev_enabled(dev))
-			continue;
-
-		rc = ata_dev_read_id(dev, &dev->class, ATA_READID_POSTRESET,
-				     dev->id);
-		if (rc)
-			goto fail;
-	}
-
-	/* Now ask for the cable type as PDIAG- should have been released */
-	if (ap->ops->cable_detect)
-		ap->cbl = ap->ops->cable_detect(ap);
-
-	/* We may have SATA bridge glue hiding here irrespective of
-	 * the reported cable types and sensed types.  When SATA
-	 * drives indicate we have a bridge, we don't know which end
-	 * of the link the bridge is which is a problem.
-	 */
-	ata_for_each_dev(dev, &ap->link, ENABLED)
-		if (ata_id_is_sata(dev->id))
-			ap->cbl = ATA_CBL_SATA;
-
-	/* After the identify sequence we can now set up the devices. We do
-	   this in the normal order so that the user doesn't get confused */
-
-	ata_for_each_dev(dev, &ap->link, ENABLED) {
-		ap->link.eh_context.i.flags |= ATA_EHI_PRINTINFO;
-		rc = ata_dev_configure(dev);
-		ap->link.eh_context.i.flags &= ~ATA_EHI_PRINTINFO;
-		if (rc)
-			goto fail;
-	}
-
-	/* configure transfer mode */
-	rc = ata_set_mode(&ap->link, &dev);
-	if (rc)
-		goto fail;
-
-	ata_for_each_dev(dev, &ap->link, ENABLED)
-		return 0;
-
-	return -ENODEV;
-
- fail:
-	tries[dev->devno]--;
-
-	switch (rc) {
-	case -EINVAL:
-		/* eeek, something went very wrong, give up */
-		tries[dev->devno] = 0;
-		break;
-
-	case -ENODEV:
-		/* give it just one more chance */
-		tries[dev->devno] = min(tries[dev->devno], 1);
-		fallthrough;
-	case -EIO:
-		if (tries[dev->devno] == 1) {
-			/* This is the last chance, better to slow
-			 * down than lose it.
-			 */
-			sata_down_spd_limit(&ap->link, 0);
-			ata_down_xfermask_limit(dev, ATA_DNXFER_PIO);
-		}
-	}
-
-	if (!tries[dev->devno])
-		ata_dev_disable(dev);
-
-	goto retry;
-}
-
 /**
  *	sata_print_link_status - Print SATA link status
  *	@link: SATA link to printk link status about
diff --git a/drivers/ata/libata.h b/drivers/ata/libata.h
index 1ec9b4427b84..6e7d352803bd 100644
--- a/drivers/ata/libata.h
+++ b/drivers/ata/libata.h
@@ -122,7 +122,6 @@ extern void ata_scsi_media_change_notify(struct ata_device *dev);
 extern void ata_scsi_hotplug(struct work_struct *work);
 extern void ata_schedule_scsi_eh(struct Scsi_Host *shost);
 extern void ata_scsi_dev_rescan(struct work_struct *work);
-extern int ata_bus_probe(struct ata_port *ap);
 extern int ata_scsi_user_scan(struct Scsi_Host *shost, unsigned int channel,
 			      unsigned int id, u64 lun);
 void ata_scsi_sdev_config(struct scsi_device *sdev);
diff --git a/include/linux/libata.h b/include/linux/libata.h
index 0980992c54c2..e176d832467d 100644
--- a/include/linux/libata.h
+++ b/include/linux/libata.h
@@ -344,7 +344,6 @@ enum {
 	ATA_LINK_RESUME_TRIES	= 5,
 
 	/* how hard are we gonna try to probe/recover devices */
-	ATA_PROBE_MAX_TRIES	= 3,
 	ATA_EH_DEV_TRIES	= 3,
 	ATA_EH_PMP_TRIES	= 5,
 	ATA_EH_PMP_LINK_TRIES	= 3,
-- 
cgit v1.2.3


From 6b4f165e0858016f7bb5f360966882a819519f07 Mon Sep 17 00:00:00 2001
From: Niklas Cassel <niklas.cassel@wdc.com>
Date: Mon, 31 Jul 2023 16:34:21 +0200
Subject: ata: libata: remove deprecated EH callbacks

Now that all libata drivers have migrated to use the error_handler
callback, remove the deprecated phy_reset and eng_timeout callbacks.

Also remove references to non-existent functions sata_phy_reset and
ata_qc_timeout from Documentation/driver-api/libata.rst.

Signed-off-by: Niklas Cassel <niklas.cassel@wdc.com>
Reviewed-by: John Garry <john.g.garry@oracle.com>
Reviewed-by: Sergey Shtylyov <s.shtylyov@omp.ru>
Reviewed-by: Jason Yan <yanaijie@huawei.com>
Reviewed-by: Martin K. Petersen <martin.petersen@oracle.com>
Signed-off-by: Damien Le Moal <dlemoal@kernel.org>
---
 Documentation/driver-api/libata.rst | 22 ++++++----------------
 drivers/ata/pata_sl82c105.c         |  3 +--
 include/linux/libata.h              |  6 ------
 3 files changed, 7 insertions(+), 24 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/driver-api/libata.rst b/Documentation/driver-api/libata.rst
index eecb8b81e185..5da27a749246 100644
--- a/Documentation/driver-api/libata.rst
+++ b/Documentation/driver-api/libata.rst
@@ -256,14 +256,6 @@ advanced drivers implement their own ``->qc_issue``.
 Exception and probe handling (EH)
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
-::
-
-    void (*eng_timeout) (struct ata_port *ap);
-    void (*phy_reset) (struct ata_port *ap);
-
-
-Deprecated. Use ``->error_handler()`` instead.
-
 ::
 
     void (*freeze) (struct ata_port *ap);
@@ -348,8 +340,7 @@ SATA phy read/write
                        u32 val);
 
 
-Read and write standard SATA phy registers. Currently only used if
-``->phy_reset`` hook called the :c:func:`sata_phy_reset` helper function.
+Read and write standard SATA phy registers.
 sc_reg is one of SCR_STATUS, SCR_CONTROL, SCR_ERROR, or SCR_ACTIVE.
 
 Init and shutdown
@@ -520,13 +511,12 @@ to return without deallocating the qc. This leads us to
 
 :c:func:`ata_scsi_error` is the current ``transportt->eh_strategy_handler()``
 for libata. As discussed above, this will be entered in two cases -
-timeout and ATAPI error completion. This function calls low level libata
-driver's :c:func:`eng_timeout` callback, the standard callback for which is
-:c:func:`ata_eng_timeout`. It checks if a qc is active and calls
-:c:func:`ata_qc_timeout` on the qc if so. Actual error handling occurs in
-:c:func:`ata_qc_timeout`.
+timeout and ATAPI error completion. This function will check if a qc is active
+and has not failed yet. Such a qc will be marked with AC_ERR_TIMEOUT such that
+EH will know to handle it later. Then it calls low level libata driver's
+:c:func:`error_handler` callback.
 
-If EH is invoked for timeout, :c:func:`ata_qc_timeout` stops BMDMA and
+When the :c:func:`error_handler` callback is invoked it stops BMDMA and
 completes the qc. Note that as we're currently in EH, we cannot call
 scsi_done. As described in SCSI EH doc, a recovered scmd should be
 either retried with :c:func:`scsi_queue_insert` or finished with
diff --git a/drivers/ata/pata_sl82c105.c b/drivers/ata/pata_sl82c105.c
index 3b62ea482f1a..93882e976ede 100644
--- a/drivers/ata/pata_sl82c105.c
+++ b/drivers/ata/pata_sl82c105.c
@@ -180,8 +180,7 @@ static void sl82c105_bmdma_start(struct ata_queued_cmd *qc)
  *	document.
  *
  *	This function is also called to turn off DMA when a timeout occurs
- *	during DMA operation. In both cases we need to reset the engine,
- *	so no actual eng_timeout handler is required.
+ *	during DMA operation. In both cases we need to reset the engine.
  *
  *	We assume bmdma_stop is always called if bmdma_start as called. If
  *	not then we may need to wrap qc_issue.
diff --git a/include/linux/libata.h b/include/linux/libata.h
index e176d832467d..52d58b13e5ee 100644
--- a/include/linux/libata.h
+++ b/include/linux/libata.h
@@ -975,12 +975,6 @@ struct ata_port_operations {
 	ssize_t (*transmit_led_message)(struct ata_port *ap, u32 state,
 					ssize_t size);
 
-	/*
-	 * Obsolete
-	 */
-	void (*phy_reset)(struct ata_port *ap);
-	void (*eng_timeout)(struct ata_port *ap);
-
 	/*
 	 * ->inherits must be the last field and all the preceding
 	 * fields must be pointers.
-- 
cgit v1.2.3


From 54e73cd52250adeba836cd3afef3658b48ae8dc9 Mon Sep 17 00:00:00 2001
From: Arnd Bergmann <arnd@arndb.de>
Date: Tue, 1 Aug 2023 12:58:15 +0200
Subject: virtio: Remove PM #ifdef guards to fix i2c driver

A cleanup in the virtio i2c caused a build failure:

drivers/i2c/busses/i2c-virtio.c:270:10: error: 'struct virtio_driver' has no member named 'freeze'
drivers/i2c/busses/i2c-virtio.c:271:10: error: 'struct virtio_driver' has no member named 'restore'

Change the structure definition to allow this cleanup to
be applied everywhere.

Fixes: 73d546c76235b ("i2c: virtio: Remove #ifdef guards for PM related functions")
Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Reviewed-by: Paul Cercueil <paul@crapouillou.net>
Reviewed-by: Andi Shyti <andi.shyti@kernel.org>
Link: https://lore.kernel.org/r/20230801105846.3708252-1-arnd@kernel.org
Signed-off-by: Andi Shyti <andi.shyti@kernel.org>
---
 include/linux/virtio.h | 2 --
 1 file changed, 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/virtio.h b/include/linux/virtio.h
index de6041deee37..7ed071b5ef07 100644
--- a/include/linux/virtio.h
+++ b/include/linux/virtio.h
@@ -184,10 +184,8 @@ struct virtio_driver {
 	void (*scan)(struct virtio_device *dev);
 	void (*remove)(struct virtio_device *dev);
 	void (*config_changed)(struct virtio_device *dev);
-#ifdef CONFIG_PM
 	int (*freeze)(struct virtio_device *dev);
 	int (*restore)(struct virtio_device *dev);
-#endif
 };
 
 static inline struct virtio_driver *drv_to_virtio(struct device_driver *drv)
-- 
cgit v1.2.3


From d890cfc25fe9421ffdff3a9ea678172addb36762 Mon Sep 17 00:00:00 2001
From: Linus Walleij <linus.walleij@linaro.org>
Date: Mon, 7 Aug 2023 09:31:06 +0200
Subject: rtc: ds2404: Convert to GPIO descriptors

This converts the DS2404 to use GPIO descriptors instead of
hard-coded global GPIO numbers.

The platform data can be deleted because there are no in-tree
users and it only contained GPIO numbers which are now
passed using descriptor tables (or device tree or ACPI).

The driver was rewritten to use a state container for the
device driver state (struct ds2404 *chip) and pass that
around instead of using a global singleton storage for the
GPIO handles.

When declaring GPIO descriptor tables or other hardware
descriptions for the RTC driver, implementers should take care
to flag the RESET line as active low, such as by using the
GPIOD_ACTIVE_LOW flag in the descriptor table.

Signed-off-by: Linus Walleij <linus.walleij@linaro.org>
Link: https://lore.kernel.org/r/20230807-descriptors-rtc-v1-1-ce0f9187576e@linaro.org
Signed-off-by: Alexandre Belloni <alexandre.belloni@bootlin.com>
---
 drivers/rtc/rtc-ds2404.c                 | 169 +++++++++++++------------------
 include/linux/platform_data/rtc-ds2404.h |  20 ----
 2 files changed, 69 insertions(+), 120 deletions(-)
 delete mode 100644 include/linux/platform_data/rtc-ds2404.h

(limited to 'include/linux')

diff --git a/drivers/rtc/rtc-ds2404.c b/drivers/rtc/rtc-ds2404.c
index 0480f592307e..3231fd9f61da 100644
--- a/drivers/rtc/rtc-ds2404.c
+++ b/drivers/rtc/rtc-ds2404.c
@@ -7,9 +7,8 @@
 #include <linux/rtc.h>
 #include <linux/types.h>
 #include <linux/bcd.h>
-#include <linux/platform_data/rtc-ds2404.h>
 #include <linux/delay.h>
-#include <linux/gpio.h>
+#include <linux/gpio/consumer.h>
 #include <linux/slab.h>
 
 #include <linux/io.h>
@@ -27,164 +26,140 @@
 #define DS2404_CLK	1
 #define DS2404_DQ	2
 
-struct ds2404_gpio {
-	const char *name;
-	unsigned int gpio;
-};
-
 struct ds2404 {
-	struct ds2404_gpio *gpio;
+	struct device *dev;
+	struct gpio_desc *rst_gpiod;
+	struct gpio_desc *clk_gpiod;
+	struct gpio_desc *dq_gpiod;
 	struct rtc_device *rtc;
 };
 
-static struct ds2404_gpio ds2404_gpio[] = {
-	{ "RTC RST", 0 },
-	{ "RTC CLK", 0 },
-	{ "RTC DQ", 0 },
-};
-
-static int ds2404_gpio_map(struct ds2404 *chip, struct platform_device *pdev,
-			  struct ds2404_platform_data *pdata)
+static int ds2404_gpio_map(struct ds2404 *chip, struct platform_device *pdev)
 {
-	int i, err;
-
-	ds2404_gpio[DS2404_RST].gpio = pdata->gpio_rst;
-	ds2404_gpio[DS2404_CLK].gpio = pdata->gpio_clk;
-	ds2404_gpio[DS2404_DQ].gpio = pdata->gpio_dq;
-
-	for (i = 0; i < ARRAY_SIZE(ds2404_gpio); i++) {
-		err = gpio_request(ds2404_gpio[i].gpio, ds2404_gpio[i].name);
-		if (err) {
-			dev_err(&pdev->dev, "error mapping gpio %s: %d\n",
-				ds2404_gpio[i].name, err);
-			goto err_request;
-		}
-		if (i != DS2404_DQ)
-			gpio_direction_output(ds2404_gpio[i].gpio, 1);
-	}
+	struct device *dev = &pdev->dev;
 
-	chip->gpio = ds2404_gpio;
-	return 0;
+	/* This will de-assert RESET, declare this GPIO as GPIOD_ACTIVE_LOW */
+	chip->rst_gpiod = devm_gpiod_get(dev, "rst", GPIOD_OUT_LOW);
+	if (IS_ERR(chip->rst_gpiod))
+		return PTR_ERR(chip->rst_gpiod);
 
-err_request:
-	while (--i >= 0)
-		gpio_free(ds2404_gpio[i].gpio);
-	return err;
-}
+	chip->clk_gpiod = devm_gpiod_get(dev, "clk", GPIOD_OUT_HIGH);
+	if (IS_ERR(chip->clk_gpiod))
+		return PTR_ERR(chip->clk_gpiod);
 
-static void ds2404_gpio_unmap(void *data)
-{
-	int i;
+	chip->dq_gpiod = devm_gpiod_get(dev, "dq", GPIOD_ASIS);
+	if (IS_ERR(chip->dq_gpiod))
+		return PTR_ERR(chip->dq_gpiod);
 
-	for (i = 0; i < ARRAY_SIZE(ds2404_gpio); i++)
-		gpio_free(ds2404_gpio[i].gpio);
+	return 0;
 }
 
-static void ds2404_reset(struct device *dev)
+static void ds2404_reset(struct ds2404 *chip)
 {
-	gpio_set_value(ds2404_gpio[DS2404_RST].gpio, 0);
+	gpiod_set_value(chip->rst_gpiod, 1);
 	udelay(1000);
-	gpio_set_value(ds2404_gpio[DS2404_RST].gpio, 1);
-	gpio_set_value(ds2404_gpio[DS2404_CLK].gpio, 0);
-	gpio_direction_output(ds2404_gpio[DS2404_DQ].gpio, 0);
+	gpiod_set_value(chip->rst_gpiod, 0);
+	gpiod_set_value(chip->clk_gpiod, 0);
+	gpiod_direction_output(chip->dq_gpiod, 0);
 	udelay(10);
 }
 
-static void ds2404_write_byte(struct device *dev, u8 byte)
+static void ds2404_write_byte(struct ds2404 *chip, u8 byte)
 {
 	int i;
 
-	gpio_direction_output(ds2404_gpio[DS2404_DQ].gpio, 1);
+	gpiod_direction_output(chip->dq_gpiod, 1);
 	for (i = 0; i < 8; i++) {
-		gpio_set_value(ds2404_gpio[DS2404_DQ].gpio, byte & (1 << i));
+		gpiod_set_value(chip->dq_gpiod, byte & (1 << i));
 		udelay(10);
-		gpio_set_value(ds2404_gpio[DS2404_CLK].gpio, 1);
+		gpiod_set_value(chip->clk_gpiod, 1);
 		udelay(10);
-		gpio_set_value(ds2404_gpio[DS2404_CLK].gpio, 0);
+		gpiod_set_value(chip->clk_gpiod, 0);
 		udelay(10);
 	}
 }
 
-static u8 ds2404_read_byte(struct device *dev)
+static u8 ds2404_read_byte(struct ds2404 *chip)
 {
 	int i;
 	u8 ret = 0;
 
-	gpio_direction_input(ds2404_gpio[DS2404_DQ].gpio);
+	gpiod_direction_input(chip->dq_gpiod);
 
 	for (i = 0; i < 8; i++) {
-		gpio_set_value(ds2404_gpio[DS2404_CLK].gpio, 0);
+		gpiod_set_value(chip->clk_gpiod, 0);
 		udelay(10);
-		if (gpio_get_value(ds2404_gpio[DS2404_DQ].gpio))
+		if (gpiod_get_value(chip->dq_gpiod))
 			ret |= 1 << i;
-		gpio_set_value(ds2404_gpio[DS2404_CLK].gpio, 1);
+		gpiod_set_value(chip->clk_gpiod, 1);
 		udelay(10);
 	}
 	return ret;
 }
 
-static void ds2404_read_memory(struct device *dev, u16 offset,
+static void ds2404_read_memory(struct ds2404 *chip, u16 offset,
 			       int length, u8 *out)
 {
-	ds2404_reset(dev);
-	ds2404_write_byte(dev, DS2404_READ_MEMORY_CMD);
-	ds2404_write_byte(dev, offset & 0xff);
-	ds2404_write_byte(dev, (offset >> 8) & 0xff);
+	ds2404_reset(chip);
+	ds2404_write_byte(chip, DS2404_READ_MEMORY_CMD);
+	ds2404_write_byte(chip, offset & 0xff);
+	ds2404_write_byte(chip, (offset >> 8) & 0xff);
 	while (length--)
-		*out++ = ds2404_read_byte(dev);
+		*out++ = ds2404_read_byte(chip);
 }
 
-static void ds2404_write_memory(struct device *dev, u16 offset,
+static void ds2404_write_memory(struct ds2404 *chip, u16 offset,
 				int length, u8 *out)
 {
 	int i;
 	u8 ta01, ta02, es;
 
-	ds2404_reset(dev);
-	ds2404_write_byte(dev, DS2404_WRITE_SCRATCHPAD_CMD);
-	ds2404_write_byte(dev, offset & 0xff);
-	ds2404_write_byte(dev, (offset >> 8) & 0xff);
+	ds2404_reset(chip);
+	ds2404_write_byte(chip, DS2404_WRITE_SCRATCHPAD_CMD);
+	ds2404_write_byte(chip, offset & 0xff);
+	ds2404_write_byte(chip, (offset >> 8) & 0xff);
 
 	for (i = 0; i < length; i++)
-		ds2404_write_byte(dev, out[i]);
+		ds2404_write_byte(chip, out[i]);
 
-	ds2404_reset(dev);
-	ds2404_write_byte(dev, DS2404_READ_SCRATCHPAD_CMD);
+	ds2404_reset(chip);
+	ds2404_write_byte(chip, DS2404_READ_SCRATCHPAD_CMD);
 
-	ta01 = ds2404_read_byte(dev);
-	ta02 = ds2404_read_byte(dev);
-	es = ds2404_read_byte(dev);
+	ta01 = ds2404_read_byte(chip);
+	ta02 = ds2404_read_byte(chip);
+	es = ds2404_read_byte(chip);
 
 	for (i = 0; i < length; i++) {
-		if (out[i] != ds2404_read_byte(dev)) {
-			dev_err(dev, "read invalid data\n");
+		if (out[i] != ds2404_read_byte(chip)) {
+			dev_err(chip->dev, "read invalid data\n");
 			return;
 		}
 	}
 
-	ds2404_reset(dev);
-	ds2404_write_byte(dev, DS2404_COPY_SCRATCHPAD_CMD);
-	ds2404_write_byte(dev, ta01);
-	ds2404_write_byte(dev, ta02);
-	ds2404_write_byte(dev, es);
+	ds2404_reset(chip);
+	ds2404_write_byte(chip, DS2404_COPY_SCRATCHPAD_CMD);
+	ds2404_write_byte(chip, ta01);
+	ds2404_write_byte(chip, ta02);
+	ds2404_write_byte(chip, es);
 
-	gpio_direction_input(ds2404_gpio[DS2404_DQ].gpio);
-	while (gpio_get_value(ds2404_gpio[DS2404_DQ].gpio))
+	while (gpiod_get_value(chip->dq_gpiod))
 		;
 }
 
-static void ds2404_enable_osc(struct device *dev)
+static void ds2404_enable_osc(struct ds2404 *chip)
 {
 	u8 in[1] = { 0x10 }; /* enable oscillator */
-	ds2404_write_memory(dev, 0x201, 1, in);
+
+	ds2404_write_memory(chip, 0x201, 1, in);
 }
 
 static int ds2404_read_time(struct device *dev, struct rtc_time *dt)
 {
+	struct ds2404 *chip = dev_get_drvdata(dev);
 	unsigned long time = 0;
 	__le32 hw_time = 0;
 
-	ds2404_read_memory(dev, 0x203, 4, (u8 *)&hw_time);
+	ds2404_read_memory(chip, 0x203, 4, (u8 *)&hw_time);
 	time = le32_to_cpu(hw_time);
 
 	rtc_time64_to_tm(time, dt);
@@ -193,8 +168,9 @@ static int ds2404_read_time(struct device *dev, struct rtc_time *dt)
 
 static int ds2404_set_time(struct device *dev, struct rtc_time *dt)
 {
+	struct ds2404 *chip = dev_get_drvdata(dev);
 	u32 time = cpu_to_le32(rtc_tm_to_time64(dt));
-	ds2404_write_memory(dev, 0x203, 4, (u8 *)&time);
+	ds2404_write_memory(chip, 0x203, 4, (u8 *)&time);
 	return 0;
 }
 
@@ -205,7 +181,6 @@ static const struct rtc_class_ops ds2404_rtc_ops = {
 
 static int rtc_probe(struct platform_device *pdev)
 {
-	struct ds2404_platform_data *pdata = dev_get_platdata(&pdev->dev);
 	struct ds2404 *chip;
 	int retval = -EBUSY;
 
@@ -213,22 +188,16 @@ static int rtc_probe(struct platform_device *pdev)
 	if (!chip)
 		return -ENOMEM;
 
+	chip->dev = &pdev->dev;
+
 	chip->rtc = devm_rtc_allocate_device(&pdev->dev);
 	if (IS_ERR(chip->rtc))
 		return PTR_ERR(chip->rtc);
 
-	retval = ds2404_gpio_map(chip, pdev, pdata);
+	retval = ds2404_gpio_map(chip, pdev);
 	if (retval)
 		return retval;
 
-	retval = devm_add_action_or_reset(&pdev->dev, ds2404_gpio_unmap, chip);
-	if (retval)
-		return retval;
-
-	dev_info(&pdev->dev, "using GPIOs RST:%d, CLK:%d, DQ:%d\n",
-		 chip->gpio[DS2404_RST].gpio, chip->gpio[DS2404_CLK].gpio,
-		 chip->gpio[DS2404_DQ].gpio);
-
 	platform_set_drvdata(pdev, chip);
 
 	chip->rtc->ops = &ds2404_rtc_ops;
@@ -238,7 +207,7 @@ static int rtc_probe(struct platform_device *pdev)
 	if (retval)
 		return retval;
 
-	ds2404_enable_osc(&pdev->dev);
+	ds2404_enable_osc(chip);
 	return 0;
 }
 
diff --git a/include/linux/platform_data/rtc-ds2404.h b/include/linux/platform_data/rtc-ds2404.h
deleted file mode 100644
index 22c53825528f..000000000000
--- a/include/linux/platform_data/rtc-ds2404.h
+++ /dev/null
@@ -1,20 +0,0 @@
-/*
- * ds2404.h - platform data structure for the DS2404 RTC.
- *
- * This file is subject to the terms and conditions of the GNU General Public
- * License.  See the file "COPYING" in the main directory of this archive
- * for more details.
- *
- * Copyright (C) 2012 Sven Schnelle <svens@stackframe.org>
- */
-
-#ifndef __LINUX_DS2404_H
-#define __LINUX_DS2404_H
-
-struct ds2404_platform_data {
-
-	unsigned int gpio_rst;
-	unsigned int gpio_clk;
-	unsigned int gpio_dq;
-};
-#endif
-- 
cgit v1.2.3


From afb48153220d35f330d0d979792920a31f7d9a81 Mon Sep 17 00:00:00 2001
From: Jean-Jacques Hiblot <jjhiblot@traphandler.com>
Date: Fri, 28 Jul 2023 17:37:28 +0200
Subject: leds: Provide devm_of_led_get_optional()

Add an optional variant of devm_of_led_get(). It behaves the same as
devm_of_led_get() except where the LED doesn't exist. In this case,
instead of returning -ENOENT, the function returns NULL.

Signed-off-by: Jean-Jacques Hiblot <jjhiblot@traphandler.com>
Reviewed-by: Andy Shevchenko <andy.shevchenko@gmail.com>
Link: https://lore.kernel.org/r/20230728153731.3742339-2-jjhiblot@traphandler.com
Signed-off-by: Lee Jones <lee@kernel.org>
---
 drivers/leds/led-class.c | 25 +++++++++++++++++++++++++
 include/linux/leds.h     |  2 ++
 2 files changed, 27 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/leds/led-class.c b/drivers/leds/led-class.c
index 4758da2b59cf..78068b06d009 100644
--- a/drivers/leds/led-class.c
+++ b/drivers/leds/led-class.c
@@ -402,6 +402,31 @@ void led_remove_lookup(struct led_lookup_data *led_lookup)
 }
 EXPORT_SYMBOL_GPL(led_remove_lookup);
 
+/**
+ * devm_of_led_get_optional - Resource-managed request of an optional LED device
+ * @dev:	LED consumer
+ * @index:	index of the LED to obtain in the consumer
+ *
+ * The device node of the device is parsed to find the requested LED device.
+ * The LED device returned from this function is automatically released
+ * on driver detach.
+ *
+ * @return a pointer to a LED device, ERR_PTR(errno) on failure and NULL if the
+ * led was not found.
+ */
+struct led_classdev *__must_check devm_of_led_get_optional(struct device *dev,
+							int index)
+{
+	struct led_classdev *led;
+
+	led = devm_of_led_get(dev, index);
+	if (IS_ERR(led) && PTR_ERR(led) == -ENOENT)
+		return NULL;
+
+	return led;
+}
+EXPORT_SYMBOL_GPL(devm_of_led_get_optional);
+
 static int led_classdev_next_name(const char *init_name, char *name,
 				  size_t len)
 {
diff --git a/include/linux/leds.h b/include/linux/leds.h
index 7d428100b42b..8740b4e47f88 100644
--- a/include/linux/leds.h
+++ b/include/linux/leds.h
@@ -313,6 +313,8 @@ extern struct led_classdev *of_led_get(struct device_node *np, int index);
 extern void led_put(struct led_classdev *led_cdev);
 struct led_classdev *__must_check devm_of_led_get(struct device *dev,
 						  int index);
+struct led_classdev *__must_check devm_of_led_get_optional(struct device *dev,
+						  int index);
 
 /**
  * led_blink_set - set blinking with software fallback
-- 
cgit v1.2.3


From c7d80059b086c4986cd994a1973ec7a5d75f8eea Mon Sep 17 00:00:00 2001
From: Jean-Jacques Hiblot <jjhiblot@traphandler.com>
Date: Fri, 28 Jul 2023 17:37:29 +0200
Subject: leds: class: Store the color index in struct led_classdev

Store the color of the LED so that it is not lost after the LED's
name has been composed. This color information can then be exposed to
the user space or used by the LED consumer.

Signed-off-by: Jean-Jacques Hiblot <jjhiblot@traphandler.com>
Link: https://lore.kernel.org/r/20230728153731.3742339-3-jjhiblot@traphandler.com
Signed-off-by: Lee Jones <lee@kernel.org>
---
 Documentation/ABI/testing/sysfs-class-led |  9 +++++++++
 drivers/leds/led-class.c                  | 21 +++++++++++++++++++++
 include/linux/leds.h                      |  1 +
 3 files changed, 31 insertions(+)

(limited to 'include/linux')

diff --git a/Documentation/ABI/testing/sysfs-class-led b/Documentation/ABI/testing/sysfs-class-led
index 2e24ac3bd7ef..b2ff0012c0f2 100644
--- a/Documentation/ABI/testing/sysfs-class-led
+++ b/Documentation/ABI/testing/sysfs-class-led
@@ -59,6 +59,15 @@ Description:
 		brightness. Reading this file when no hw brightness change
 		event has happened will return an ENODATA error.
 
+What:		/sys/class/leds/<led>/color
+Date:		June 2023
+KernelVersion:	6.5
+Description:
+		Color of the LED.
+
+		This is a read-only file. Reading this file returns the color
+		of the LED as a string (e.g: "red", "green", "multicolor").
+
 What:		/sys/class/leds/<led>/trigger
 Date:		March 2006
 KernelVersion:	2.6.17
diff --git a/drivers/leds/led-class.c b/drivers/leds/led-class.c
index 78068b06d009..4bcbd46ec75a 100644
--- a/drivers/leds/led-class.c
+++ b/drivers/leds/led-class.c
@@ -76,6 +76,19 @@ static ssize_t max_brightness_show(struct device *dev,
 }
 static DEVICE_ATTR_RO(max_brightness);
 
+static ssize_t color_show(struct device *dev,
+		struct device_attribute *attr, char *buf)
+{
+	const char *color_text = "invalid";
+	struct led_classdev *led_cdev = dev_get_drvdata(dev);
+
+	if (led_cdev->color < LED_COLOR_ID_MAX)
+		color_text = led_colors[led_cdev->color];
+
+	return sysfs_emit(buf, "%s\n", color_text);
+}
+static DEVICE_ATTR_RO(color);
+
 #ifdef CONFIG_LEDS_TRIGGERS
 static BIN_ATTR(trigger, 0644, led_trigger_read, led_trigger_write, 0);
 static struct bin_attribute *led_trigger_bin_attrs[] = {
@@ -90,6 +103,7 @@ static const struct attribute_group led_trigger_group = {
 static struct attribute *led_class_attrs[] = {
 	&dev_attr_brightness.attr,
 	&dev_attr_max_brightness.attr,
+	&dev_attr_color.attr,
 	NULL,
 };
 
@@ -486,6 +500,10 @@ int led_classdev_register_ext(struct device *parent,
 			fwnode_property_read_u32(init_data->fwnode,
 				"max-brightness",
 				&led_cdev->max_brightness);
+
+			if (fwnode_property_present(init_data->fwnode, "color"))
+				fwnode_property_read_u32(init_data->fwnode, "color",
+							 &led_cdev->color);
 		}
 	} else {
 		proposed_name = led_cdev->name;
@@ -495,6 +513,9 @@ int led_classdev_register_ext(struct device *parent,
 	if (ret < 0)
 		return ret;
 
+	if (led_cdev->color >= LED_COLOR_ID_MAX)
+		dev_warn(parent, "LED %s color identifier out of range\n", final_name);
+
 	mutex_init(&led_cdev->led_access);
 	mutex_lock(&led_cdev->led_access);
 	led_cdev->dev = device_create_with_groups(leds_class, parent, 0,
diff --git a/include/linux/leds.h b/include/linux/leds.h
index 8740b4e47f88..aa16dc2a8230 100644
--- a/include/linux/leds.h
+++ b/include/linux/leds.h
@@ -100,6 +100,7 @@ struct led_classdev {
 	const char		*name;
 	unsigned int brightness;
 	unsigned int max_brightness;
+	unsigned int color;
 	int			 flags;
 
 	/* Lower 16 bits reflect status */
-- 
cgit v1.2.3


From a1342c8027288e345cc5fd16c6800f9d4eb788ed Mon Sep 17 00:00:00 2001
From: David Matlack <dmatlack@google.com>
Date: Fri, 11 Aug 2023 04:51:14 +0000
Subject: KVM: Rename kvm_arch_flush_remote_tlb() to
 kvm_arch_flush_remote_tlbs()
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Rename kvm_arch_flush_remote_tlb() and the associated macro
__KVM_HAVE_ARCH_FLUSH_REMOTE_TLB to kvm_arch_flush_remote_tlbs() and
__KVM_HAVE_ARCH_FLUSH_REMOTE_TLBS respectively.

Making the name plural matches kvm_flush_remote_tlbs() and makes it more
clear that this function can affect more than one remote TLB.

No functional change intended.

Signed-off-by: David Matlack <dmatlack@google.com>
Signed-off-by: Raghavendra Rao Ananta <rananta@google.com>
Reviewed-by: Gavin Shan <gshan@redhat.com>
Reviewed-by: Philippe Mathieu-Daudé <philmd@linaro.org>
Reviewed-by: Shaoqin Huang <shahuang@redhat.com>
Acked-by: Sean Christopherson <seanjc@google.com>
Signed-off-by: Marc Zyngier <maz@kernel.org>
Link: https://lore.kernel.org/r/20230811045127.3308641-2-rananta@google.com
---
 arch/mips/include/asm/kvm_host.h | 4 ++--
 arch/mips/kvm/mips.c             | 2 +-
 arch/x86/include/asm/kvm_host.h  | 4 ++--
 include/linux/kvm_host.h         | 4 ++--
 virt/kvm/kvm_main.c              | 2 +-
 5 files changed, 8 insertions(+), 8 deletions(-)

(limited to 'include/linux')

diff --git a/arch/mips/include/asm/kvm_host.h b/arch/mips/include/asm/kvm_host.h
index 04cedf9f8811..9b0ad8f3bf32 100644
--- a/arch/mips/include/asm/kvm_host.h
+++ b/arch/mips/include/asm/kvm_host.h
@@ -896,7 +896,7 @@ static inline void kvm_arch_sched_in(struct kvm_vcpu *vcpu, int cpu) {}
 static inline void kvm_arch_vcpu_blocking(struct kvm_vcpu *vcpu) {}
 static inline void kvm_arch_vcpu_unblocking(struct kvm_vcpu *vcpu) {}
 
-#define __KVM_HAVE_ARCH_FLUSH_REMOTE_TLB
-int kvm_arch_flush_remote_tlb(struct kvm *kvm);
+#define __KVM_HAVE_ARCH_FLUSH_REMOTE_TLBS
+int kvm_arch_flush_remote_tlbs(struct kvm *kvm);
 
 #endif /* __MIPS_KVM_HOST_H__ */
diff --git a/arch/mips/kvm/mips.c b/arch/mips/kvm/mips.c
index aa5583a7b05b..4b7bc39a4173 100644
--- a/arch/mips/kvm/mips.c
+++ b/arch/mips/kvm/mips.c
@@ -981,7 +981,7 @@ void kvm_arch_sync_dirty_log(struct kvm *kvm, struct kvm_memory_slot *memslot)
 
 }
 
-int kvm_arch_flush_remote_tlb(struct kvm *kvm)
+int kvm_arch_flush_remote_tlbs(struct kvm *kvm)
 {
 	kvm_mips_callbacks->prepare_flush_shadow(kvm);
 	return 1;
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 28bd38303d70..a2d3cfc2eb75 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -1794,8 +1794,8 @@ static inline struct kvm *kvm_arch_alloc_vm(void)
 #define __KVM_HAVE_ARCH_VM_FREE
 void kvm_arch_free_vm(struct kvm *kvm);
 
-#define __KVM_HAVE_ARCH_FLUSH_REMOTE_TLB
-static inline int kvm_arch_flush_remote_tlb(struct kvm *kvm)
+#define __KVM_HAVE_ARCH_FLUSH_REMOTE_TLBS
+static inline int kvm_arch_flush_remote_tlbs(struct kvm *kvm)
 {
 	if (kvm_x86_ops.flush_remote_tlbs &&
 	    !static_call(kvm_x86_flush_remote_tlbs)(kvm))
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 9d3ac7720da9..e3f968b38ae9 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -1479,8 +1479,8 @@ static inline void kvm_arch_free_vm(struct kvm *kvm)
 }
 #endif
 
-#ifndef __KVM_HAVE_ARCH_FLUSH_REMOTE_TLB
-static inline int kvm_arch_flush_remote_tlb(struct kvm *kvm)
+#ifndef __KVM_HAVE_ARCH_FLUSH_REMOTE_TLBS
+static inline int kvm_arch_flush_remote_tlbs(struct kvm *kvm)
 {
 	return -ENOTSUPP;
 }
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index dfbaafbe3a00..70e5479797ac 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -361,7 +361,7 @@ void kvm_flush_remote_tlbs(struct kvm *kvm)
 	 * kvm_make_all_cpus_request() reads vcpu->mode. We reuse that
 	 * barrier here.
 	 */
-	if (!kvm_arch_flush_remote_tlb(kvm)
+	if (!kvm_arch_flush_remote_tlbs(kvm)
 	    || kvm_make_all_cpus_request(kvm, KVM_REQ_TLB_FLUSH))
 		++kvm->stat.generic.remote_tlb_flush;
 }
-- 
cgit v1.2.3


From cfb0c08e80120928dda1e951718be135abd49bae Mon Sep 17 00:00:00 2001
From: Raghavendra Rao Ananta <rananta@google.com>
Date: Fri, 11 Aug 2023 04:51:15 +0000
Subject: KVM: Declare kvm_arch_flush_remote_tlbs() globally

There's no reason for the architectures to declare
kvm_arch_flush_remote_tlbs() in their own headers. Hence to
avoid this duplication, make the declaration global, leaving
the architectures to define only __KVM_HAVE_ARCH_FLUSH_REMOTE_TLBS
as needed.

Signed-off-by: Raghavendra Rao Ananta <rananta@google.com>
Reviewed-by: Shaoqin Huang <shahuang@redhat.com>
Signed-off-by: Marc Zyngier <maz@kernel.org>
Link: https://lore.kernel.org/r/20230811045127.3308641-3-rananta@google.com
---
 arch/mips/include/asm/kvm_host.h | 1 -
 include/linux/kvm_host.h         | 2 ++
 2 files changed, 2 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/arch/mips/include/asm/kvm_host.h b/arch/mips/include/asm/kvm_host.h
index 9b0ad8f3bf32..54a85f1d4f2c 100644
--- a/arch/mips/include/asm/kvm_host.h
+++ b/arch/mips/include/asm/kvm_host.h
@@ -897,6 +897,5 @@ static inline void kvm_arch_vcpu_blocking(struct kvm_vcpu *vcpu) {}
 static inline void kvm_arch_vcpu_unblocking(struct kvm_vcpu *vcpu) {}
 
 #define __KVM_HAVE_ARCH_FLUSH_REMOTE_TLBS
-int kvm_arch_flush_remote_tlbs(struct kvm *kvm);
 
 #endif /* __MIPS_KVM_HOST_H__ */
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index e3f968b38ae9..ade5d4500c2c 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -1484,6 +1484,8 @@ static inline int kvm_arch_flush_remote_tlbs(struct kvm *kvm)
 {
 	return -ENOTSUPP;
 }
+#else
+int kvm_arch_flush_remote_tlbs(struct kvm *kvm);
 #endif
 
 #ifdef __KVM_HAVE_ARCH_NONCOHERENT_DMA
-- 
cgit v1.2.3


From d4788996051e3c07fadc6d9b214073fcf78810a8 Mon Sep 17 00:00:00 2001
From: David Matlack <dmatlack@google.com>
Date: Fri, 11 Aug 2023 04:51:18 +0000
Subject: KVM: Allow range-based TLB invalidation from common code

Make kvm_flush_remote_tlbs_range() visible in common code and create a
default implementation that just invalidates the whole TLB.

This paves the way for several future features/cleanups:

 - Introduction of range-based TLBI on ARM.
 - Eliminating kvm_arch_flush_remote_tlbs_memslot()
 - Moving the KVM/x86 TDP MMU to common code.

No functional change intended.

Signed-off-by: David Matlack <dmatlack@google.com>
Signed-off-by: Raghavendra Rao Ananta <rananta@google.com>
Reviewed-by: Gavin Shan <gshan@redhat.com>
Reviewed-by: Shaoqin Huang <shahuang@redhat.com>
Reviewed-by: Anup Patel <anup@brainfault.org>
Acked-by: Sean Christopherson <seanjc@google.com>
Signed-off-by: Marc Zyngier <maz@kernel.org>
Link: https://lore.kernel.org/r/20230811045127.3308641-6-rananta@google.com
---
 arch/x86/include/asm/kvm_host.h |  2 ++
 arch/x86/kvm/mmu/mmu.c          | 12 ++++--------
 arch/x86/kvm/mmu/mmu_internal.h |  3 ---
 include/linux/kvm_host.h        | 11 +++++++++++
 virt/kvm/kvm_main.c             | 13 +++++++++++++
 5 files changed, 30 insertions(+), 11 deletions(-)

(limited to 'include/linux')

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index a2d3cfc2eb75..b547d17c58f6 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -1804,6 +1804,8 @@ static inline int kvm_arch_flush_remote_tlbs(struct kvm *kvm)
 		return -ENOTSUPP;
 }
 
+#define __KVM_HAVE_ARCH_FLUSH_REMOTE_TLBS_RANGE
+
 #define kvm_arch_pmi_in_guest(vcpu) \
 	((vcpu) && (vcpu)->arch.handling_intr_from_guest)
 
diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index ec169f5c7dce..46ae672668e1 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -278,16 +278,12 @@ static inline bool kvm_available_flush_remote_tlbs_range(void)
 	return kvm_x86_ops.flush_remote_tlbs_range;
 }
 
-void kvm_flush_remote_tlbs_range(struct kvm *kvm, gfn_t start_gfn,
-				 gfn_t nr_pages)
+int kvm_arch_flush_remote_tlbs_range(struct kvm *kvm, gfn_t gfn, u64 nr_pages)
 {
-	int ret = -EOPNOTSUPP;
+	if (!kvm_x86_ops.flush_remote_tlbs_range)
+		return -EOPNOTSUPP;
 
-	if (kvm_x86_ops.flush_remote_tlbs_range)
-		ret = static_call(kvm_x86_flush_remote_tlbs_range)(kvm, start_gfn,
-								   nr_pages);
-	if (ret)
-		kvm_flush_remote_tlbs(kvm);
+	return static_call(kvm_x86_flush_remote_tlbs_range)(kvm, gfn, nr_pages);
 }
 
 static gfn_t kvm_mmu_page_get_gfn(struct kvm_mmu_page *sp, int index);
diff --git a/arch/x86/kvm/mmu/mmu_internal.h b/arch/x86/kvm/mmu/mmu_internal.h
index d39af5639ce9..86cb83bb3480 100644
--- a/arch/x86/kvm/mmu/mmu_internal.h
+++ b/arch/x86/kvm/mmu/mmu_internal.h
@@ -170,9 +170,6 @@ bool kvm_mmu_slot_gfn_write_protect(struct kvm *kvm,
 				    struct kvm_memory_slot *slot, u64 gfn,
 				    int min_level);
 
-void kvm_flush_remote_tlbs_range(struct kvm *kvm, gfn_t start_gfn,
-				 gfn_t nr_pages);
-
 /* Flush the given page (huge or not) of guest memory. */
 static inline void kvm_flush_remote_tlbs_gfn(struct kvm *kvm, gfn_t gfn, int level)
 {
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index ade5d4500c2c..89d2614e4b7a 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -1359,6 +1359,7 @@ int kvm_vcpu_yield_to(struct kvm_vcpu *target);
 void kvm_vcpu_on_spin(struct kvm_vcpu *vcpu, bool yield_to_kernel_mode);
 
 void kvm_flush_remote_tlbs(struct kvm *kvm);
+void kvm_flush_remote_tlbs_range(struct kvm *kvm, gfn_t gfn, u64 nr_pages);
 
 #ifdef KVM_ARCH_NR_OBJS_PER_MEMORY_CACHE
 int kvm_mmu_topup_memory_cache(struct kvm_mmu_memory_cache *mc, int min);
@@ -1488,6 +1489,16 @@ static inline int kvm_arch_flush_remote_tlbs(struct kvm *kvm)
 int kvm_arch_flush_remote_tlbs(struct kvm *kvm);
 #endif
 
+#ifndef __KVM_HAVE_ARCH_FLUSH_REMOTE_TLBS_RANGE
+static inline int kvm_arch_flush_remote_tlbs_range(struct kvm *kvm,
+						    gfn_t gfn, u64 nr_pages)
+{
+	return -EOPNOTSUPP;
+}
+#else
+int kvm_arch_flush_remote_tlbs_range(struct kvm *kvm, gfn_t gfn, u64 nr_pages);
+#endif
+
 #ifdef __KVM_HAVE_ARCH_NONCOHERENT_DMA
 void kvm_arch_register_noncoherent_dma(struct kvm *kvm);
 void kvm_arch_unregister_noncoherent_dma(struct kvm *kvm);
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index d6b050786155..26e91000f579 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -366,6 +366,19 @@ void kvm_flush_remote_tlbs(struct kvm *kvm)
 }
 EXPORT_SYMBOL_GPL(kvm_flush_remote_tlbs);
 
+void kvm_flush_remote_tlbs_range(struct kvm *kvm, gfn_t gfn, u64 nr_pages)
+{
+	if (!kvm_arch_flush_remote_tlbs_range(kvm, gfn, nr_pages))
+		return;
+
+	/*
+	 * Fall back to a flushing entire TLBs if the architecture range-based
+	 * TLB invalidation is unsupported or can't be performed for whatever
+	 * reason.
+	 */
+	kvm_flush_remote_tlbs(kvm);
+}
+
 static void kvm_flush_shadow_all(struct kvm *kvm)
 {
 	kvm_arch_flush_shadow_all(kvm);
-- 
cgit v1.2.3


From 619b5072443c05cf18c31b2c0320cdb42396d411 Mon Sep 17 00:00:00 2001
From: David Matlack <dmatlack@google.com>
Date: Fri, 11 Aug 2023 04:51:19 +0000
Subject: KVM: Move kvm_arch_flush_remote_tlbs_memslot() to common code

Move kvm_arch_flush_remote_tlbs_memslot() to common code and drop
"arch_" from the name. kvm_arch_flush_remote_tlbs_memslot() is just a
range-based TLB invalidation where the range is defined by the memslot.
Now that kvm_flush_remote_tlbs_range() can be called from common code we
can just use that and drop a bunch of duplicate code from the arch
directories.

Note this adds a lockdep assertion for slots_lock being held when
calling kvm_flush_remote_tlbs_memslot(), which was previously only
asserted on x86. MIPS has calls to kvm_flush_remote_tlbs_memslot(),
but they all hold the slots_lock, so the lockdep assertion continues to
hold true.

Also drop the CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT ifdef gating
kvm_flush_remote_tlbs_memslot(), since it is no longer necessary.

Signed-off-by: David Matlack <dmatlack@google.com>
Signed-off-by: Raghavendra Rao Ananta <rananta@google.com>
Reviewed-by: Gavin Shan <gshan@redhat.com>
Reviewed-by: Shaoqin Huang <shahuang@redhat.com>
Acked-by: Anup Patel <anup@brainfault.org>
Acked-by: Sean Christopherson <seanjc@google.com>
Signed-off-by: Marc Zyngier <maz@kernel.org>
Link: https://lore.kernel.org/r/20230811045127.3308641-7-rananta@google.com
---
 arch/arm64/kvm/arm.c     |  6 ------
 arch/mips/kvm/mips.c     | 10 ++--------
 arch/riscv/kvm/mmu.c     |  6 ------
 arch/x86/kvm/mmu/mmu.c   | 16 +---------------
 arch/x86/kvm/x86.c       |  2 +-
 include/linux/kvm_host.h |  7 +++----
 virt/kvm/kvm_main.c      | 18 ++++++++++++++++--
 7 files changed, 23 insertions(+), 42 deletions(-)

(limited to 'include/linux')

diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c
index 72dc53a75d1c..fd2af63d788d 100644
--- a/arch/arm64/kvm/arm.c
+++ b/arch/arm64/kvm/arm.c
@@ -1532,12 +1532,6 @@ void kvm_arch_sync_dirty_log(struct kvm *kvm, struct kvm_memory_slot *memslot)
 
 }
 
-void kvm_arch_flush_remote_tlbs_memslot(struct kvm *kvm,
-					const struct kvm_memory_slot *memslot)
-{
-	kvm_flush_remote_tlbs(kvm);
-}
-
 static int kvm_vm_ioctl_set_device_addr(struct kvm *kvm,
 					struct kvm_arm_device_addr *dev_addr)
 {
diff --git a/arch/mips/kvm/mips.c b/arch/mips/kvm/mips.c
index 4b7bc39a4173..231ac052b506 100644
--- a/arch/mips/kvm/mips.c
+++ b/arch/mips/kvm/mips.c
@@ -199,7 +199,7 @@ void kvm_arch_flush_shadow_memslot(struct kvm *kvm,
 	/* Flush slot from GPA */
 	kvm_mips_flush_gpa_pt(kvm, slot->base_gfn,
 			      slot->base_gfn + slot->npages - 1);
-	kvm_arch_flush_remote_tlbs_memslot(kvm, slot);
+	kvm_flush_remote_tlbs_memslot(kvm, slot);
 	spin_unlock(&kvm->mmu_lock);
 }
 
@@ -235,7 +235,7 @@ void kvm_arch_commit_memory_region(struct kvm *kvm,
 		needs_flush = kvm_mips_mkclean_gpa_pt(kvm, new->base_gfn,
 					new->base_gfn + new->npages - 1);
 		if (needs_flush)
-			kvm_arch_flush_remote_tlbs_memslot(kvm, new);
+			kvm_flush_remote_tlbs_memslot(kvm, new);
 		spin_unlock(&kvm->mmu_lock);
 	}
 }
@@ -987,12 +987,6 @@ int kvm_arch_flush_remote_tlbs(struct kvm *kvm)
 	return 1;
 }
 
-void kvm_arch_flush_remote_tlbs_memslot(struct kvm *kvm,
-					const struct kvm_memory_slot *memslot)
-{
-	kvm_flush_remote_tlbs(kvm);
-}
-
 int kvm_arch_vm_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg)
 {
 	int r;
diff --git a/arch/riscv/kvm/mmu.c b/arch/riscv/kvm/mmu.c
index f2eb47925806..97e129620686 100644
--- a/arch/riscv/kvm/mmu.c
+++ b/arch/riscv/kvm/mmu.c
@@ -406,12 +406,6 @@ void kvm_arch_sync_dirty_log(struct kvm *kvm, struct kvm_memory_slot *memslot)
 {
 }
 
-void kvm_arch_flush_remote_tlbs_memslot(struct kvm *kvm,
-					const struct kvm_memory_slot *memslot)
-{
-	kvm_flush_remote_tlbs(kvm);
-}
-
 void kvm_arch_free_memslot(struct kvm *kvm, struct kvm_memory_slot *free)
 {
 }
diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index 46ae672668e1..dbf3c6c2316c 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -6666,7 +6666,7 @@ static void kvm_rmap_zap_collapsible_sptes(struct kvm *kvm,
 	 */
 	if (walk_slot_rmaps(kvm, slot, kvm_mmu_zap_collapsible_spte,
 			    PG_LEVEL_4K, KVM_MAX_HUGEPAGE_LEVEL - 1, true))
-		kvm_arch_flush_remote_tlbs_memslot(kvm, slot);
+		kvm_flush_remote_tlbs_memslot(kvm, slot);
 }
 
 void kvm_mmu_zap_collapsible_sptes(struct kvm *kvm,
@@ -6685,20 +6685,6 @@ void kvm_mmu_zap_collapsible_sptes(struct kvm *kvm,
 	}
 }
 
-void kvm_arch_flush_remote_tlbs_memslot(struct kvm *kvm,
-					const struct kvm_memory_slot *memslot)
-{
-	/*
-	 * All current use cases for flushing the TLBs for a specific memslot
-	 * related to dirty logging, and many do the TLB flush out of mmu_lock.
-	 * The interaction between the various operations on memslot must be
-	 * serialized by slots_locks to ensure the TLB flush from one operation
-	 * is observed by any other operation on the same memslot.
-	 */
-	lockdep_assert_held(&kvm->slots_lock);
-	kvm_flush_remote_tlbs_range(kvm, memslot->base_gfn, memslot->npages);
-}
-
 void kvm_mmu_slot_leaf_clear_dirty(struct kvm *kvm,
 				   const struct kvm_memory_slot *memslot)
 {
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index a6b9bea62fb8..faeb2e307b36 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -12751,7 +12751,7 @@ static void kvm_mmu_slot_apply_flags(struct kvm *kvm,
 		 * See is_writable_pte() for more details (the case involving
 		 * access-tracked SPTEs is particularly relevant).
 		 */
-		kvm_arch_flush_remote_tlbs_memslot(kvm, new);
+		kvm_flush_remote_tlbs_memslot(kvm, new);
 	}
 }
 
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 89d2614e4b7a..394db2ce11e2 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -1360,6 +1360,8 @@ void kvm_vcpu_on_spin(struct kvm_vcpu *vcpu, bool yield_to_kernel_mode);
 
 void kvm_flush_remote_tlbs(struct kvm *kvm);
 void kvm_flush_remote_tlbs_range(struct kvm *kvm, gfn_t gfn, u64 nr_pages);
+void kvm_flush_remote_tlbs_memslot(struct kvm *kvm,
+				   const struct kvm_memory_slot *memslot);
 
 #ifdef KVM_ARCH_NR_OBJS_PER_MEMORY_CACHE
 int kvm_mmu_topup_memory_cache(struct kvm_mmu_memory_cache *mc, int min);
@@ -1388,10 +1390,7 @@ void kvm_arch_mmu_enable_log_dirty_pt_masked(struct kvm *kvm,
 					unsigned long mask);
 void kvm_arch_sync_dirty_log(struct kvm *kvm, struct kvm_memory_slot *memslot);
 
-#ifdef CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT
-void kvm_arch_flush_remote_tlbs_memslot(struct kvm *kvm,
-					const struct kvm_memory_slot *memslot);
-#else /* !CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT */
+#ifndef CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT
 int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, struct kvm_dirty_log *log);
 int kvm_get_dirty_log(struct kvm *kvm, struct kvm_dirty_log *log,
 		      int *is_dirty, struct kvm_memory_slot **memslot);
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 26e91000f579..5d4d2e051aa0 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -379,6 +379,20 @@ void kvm_flush_remote_tlbs_range(struct kvm *kvm, gfn_t gfn, u64 nr_pages)
 	kvm_flush_remote_tlbs(kvm);
 }
 
+void kvm_flush_remote_tlbs_memslot(struct kvm *kvm,
+				   const struct kvm_memory_slot *memslot)
+{
+	/*
+	 * All current use cases for flushing the TLBs for a specific memslot
+	 * are related to dirty logging, and many do the TLB flush out of
+	 * mmu_lock. The interaction between the various operations on memslot
+	 * must be serialized by slots_locks to ensure the TLB flush from one
+	 * operation is observed by any other operation on the same memslot.
+	 */
+	lockdep_assert_held(&kvm->slots_lock);
+	kvm_flush_remote_tlbs_range(kvm, memslot->base_gfn, memslot->npages);
+}
+
 static void kvm_flush_shadow_all(struct kvm *kvm)
 {
 	kvm_arch_flush_shadow_all(kvm);
@@ -2191,7 +2205,7 @@ static int kvm_get_dirty_log_protect(struct kvm *kvm, struct kvm_dirty_log *log)
 	}
 
 	if (flush)
-		kvm_arch_flush_remote_tlbs_memslot(kvm, memslot);
+		kvm_flush_remote_tlbs_memslot(kvm, memslot);
 
 	if (copy_to_user(log->dirty_bitmap, dirty_bitmap_buffer, n))
 		return -EFAULT;
@@ -2308,7 +2322,7 @@ static int kvm_clear_dirty_log_protect(struct kvm *kvm,
 	KVM_MMU_UNLOCK(kvm);
 
 	if (flush)
-		kvm_arch_flush_remote_tlbs_memslot(kvm, memslot);
+		kvm_flush_remote_tlbs_memslot(kvm, memslot);
 
 	return 0;
 }
-- 
cgit v1.2.3


From 3e1efe2b67d3d38116ec010968dbcd89d29e4561 Mon Sep 17 00:00:00 2001
From: Sean Christopherson <seanjc@google.com>
Date: Fri, 28 Jul 2023 17:41:44 -0700
Subject: KVM: Wrap kvm_{gfn,hva}_range.pte in a per-action union

Wrap kvm_{gfn,hva}_range.pte in a union so that future notifier events can
pass event specific information up and down the stack without needing to
constantly expand and churn the APIs.  Lockless aging of SPTEs will pass
around a bitmap, and support for memory attributes will pass around the
new attributes for the range.

Add a "KVM_NO_ARG" placeholder to simplify handling events without an
argument (creating a dummy union variable is midly annoying).

Opportunstically drop explicit zero-initialization of the "pte" field, as
omitting the field (now a union) has the same effect.

Cc: Yu Zhao <yuzhao@google.com>
Link: https://lore.kernel.org/all/CAOUHufagkd2Jk3_HrVoFFptRXM=hX2CV8f+M-dka-hJU4bP8kw@mail.gmail.com
Reviewed-by: Oliver Upton <oliver.upton@linux.dev>
Acked-by: Yu Zhao <yuzhao@google.com>
Link: https://lore.kernel.org/r/20230729004144.1054885-1-seanjc@google.com
Signed-off-by: Sean Christopherson <seanjc@google.com>
---
 arch/arm64/kvm/mmu.c       |  2 +-
 arch/mips/kvm/mmu.c        |  2 +-
 arch/riscv/kvm/mmu.c       |  2 +-
 arch/x86/kvm/mmu/mmu.c     |  2 +-
 arch/x86/kvm/mmu/tdp_mmu.c |  6 +++---
 include/linux/kvm_host.h   |  6 +++++-
 virt/kvm/kvm_main.c        | 19 ++++++++++---------
 7 files changed, 22 insertions(+), 17 deletions(-)

(limited to 'include/linux')

diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c
index 6db9ef288ec3..55f03a68f1cd 100644
--- a/arch/arm64/kvm/mmu.c
+++ b/arch/arm64/kvm/mmu.c
@@ -1721,7 +1721,7 @@ bool kvm_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range)
 
 bool kvm_set_spte_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
 {
-	kvm_pfn_t pfn = pte_pfn(range->pte);
+	kvm_pfn_t pfn = pte_pfn(range->arg.pte);
 
 	if (!kvm->arch.mmu.pgt)
 		return false;
diff --git a/arch/mips/kvm/mmu.c b/arch/mips/kvm/mmu.c
index e8c08988ed37..7b2ac1319d70 100644
--- a/arch/mips/kvm/mmu.c
+++ b/arch/mips/kvm/mmu.c
@@ -447,7 +447,7 @@ bool kvm_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range)
 bool kvm_set_spte_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
 {
 	gpa_t gpa = range->start << PAGE_SHIFT;
-	pte_t hva_pte = range->pte;
+	pte_t hva_pte = range->arg.pte;
 	pte_t *gpa_pte = kvm_mips_pte_for_gpa(kvm, NULL, gpa);
 	pte_t old_pte;
 
diff --git a/arch/riscv/kvm/mmu.c b/arch/riscv/kvm/mmu.c
index f2eb47925806..857f4312b0f8 100644
--- a/arch/riscv/kvm/mmu.c
+++ b/arch/riscv/kvm/mmu.c
@@ -559,7 +559,7 @@ bool kvm_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range)
 bool kvm_set_spte_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
 {
 	int ret;
-	kvm_pfn_t pfn = pte_pfn(range->pte);
+	kvm_pfn_t pfn = pte_pfn(range->arg.pte);
 
 	if (!kvm->arch.pgd)
 		return false;
diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index ec169f5c7dce..d72f2b20f430 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -1588,7 +1588,7 @@ static __always_inline bool kvm_handle_gfn_range(struct kvm *kvm,
 	for_each_slot_rmap_range(range->slot, PG_LEVEL_4K, KVM_MAX_HUGEPAGE_LEVEL,
 				 range->start, range->end - 1, &iterator)
 		ret |= handler(kvm, iterator.rmap, range->slot, iterator.gfn,
-			       iterator.level, range->pte);
+			       iterator.level, range->arg.pte);
 
 	return ret;
 }
diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c
index 512163d52194..6250bd3d20c1 100644
--- a/arch/x86/kvm/mmu/tdp_mmu.c
+++ b/arch/x86/kvm/mmu/tdp_mmu.c
@@ -1241,7 +1241,7 @@ static bool set_spte_gfn(struct kvm *kvm, struct tdp_iter *iter,
 	u64 new_spte;
 
 	/* Huge pages aren't expected to be modified without first being zapped. */
-	WARN_ON(pte_huge(range->pte) || range->start + 1 != range->end);
+	WARN_ON(pte_huge(range->arg.pte) || range->start + 1 != range->end);
 
 	if (iter->level != PG_LEVEL_4K ||
 	    !is_shadow_present_pte(iter->old_spte))
@@ -1255,9 +1255,9 @@ static bool set_spte_gfn(struct kvm *kvm, struct tdp_iter *iter,
 	 */
 	tdp_mmu_iter_set_spte(kvm, iter, 0);
 
-	if (!pte_write(range->pte)) {
+	if (!pte_write(range->arg.pte)) {
 		new_spte = kvm_mmu_changed_pte_notifier_make_spte(iter->old_spte,
-								  pte_pfn(range->pte));
+								  pte_pfn(range->arg.pte));
 
 		tdp_mmu_iter_set_spte(kvm, iter, new_spte);
 	}
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 9d3ac7720da9..9125d0ab642d 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -256,11 +256,15 @@ int kvm_async_pf_wakeup_all(struct kvm_vcpu *vcpu);
 #endif
 
 #ifdef KVM_ARCH_WANT_MMU_NOTIFIER
+union kvm_mmu_notifier_arg {
+	pte_t pte;
+};
+
 struct kvm_gfn_range {
 	struct kvm_memory_slot *slot;
 	gfn_t start;
 	gfn_t end;
-	pte_t pte;
+	union kvm_mmu_notifier_arg arg;
 	bool may_block;
 };
 bool kvm_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range);
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index dfbaafbe3a00..92c50dc159e8 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -526,7 +526,7 @@ typedef void (*on_unlock_fn_t)(struct kvm *kvm);
 struct kvm_hva_range {
 	unsigned long start;
 	unsigned long end;
-	pte_t pte;
+	union kvm_mmu_notifier_arg arg;
 	hva_handler_t handler;
 	on_lock_fn_t on_lock;
 	on_unlock_fn_t on_unlock;
@@ -547,6 +547,8 @@ static void kvm_null_fn(void)
 }
 #define IS_KVM_NULL_FN(fn) ((fn) == (void *)kvm_null_fn)
 
+static const union kvm_mmu_notifier_arg KVM_MMU_NOTIFIER_NO_ARG;
+
 /* Iterate over each memslot intersecting [start, last] (inclusive) range */
 #define kvm_for_each_memslot_in_hva_range(node, slots, start, last)	     \
 	for (node = interval_tree_iter_first(&slots->hva_tree, start, last); \
@@ -591,7 +593,7 @@ static __always_inline int __kvm_handle_hva_range(struct kvm *kvm,
 			 * bother making these conditional (to avoid writes on
 			 * the second or later invocation of the handler).
 			 */
-			gfn_range.pte = range->pte;
+			gfn_range.arg = range->arg;
 			gfn_range.may_block = range->may_block;
 
 			/*
@@ -632,14 +634,14 @@ static __always_inline int __kvm_handle_hva_range(struct kvm *kvm,
 static __always_inline int kvm_handle_hva_range(struct mmu_notifier *mn,
 						unsigned long start,
 						unsigned long end,
-						pte_t pte,
+						union kvm_mmu_notifier_arg arg,
 						hva_handler_t handler)
 {
 	struct kvm *kvm = mmu_notifier_to_kvm(mn);
 	const struct kvm_hva_range range = {
 		.start		= start,
 		.end		= end,
-		.pte		= pte,
+		.arg		= arg,
 		.handler	= handler,
 		.on_lock	= (void *)kvm_null_fn,
 		.on_unlock	= (void *)kvm_null_fn,
@@ -659,7 +661,6 @@ static __always_inline int kvm_handle_hva_range_no_flush(struct mmu_notifier *mn
 	const struct kvm_hva_range range = {
 		.start		= start,
 		.end		= end,
-		.pte		= __pte(0),
 		.handler	= handler,
 		.on_lock	= (void *)kvm_null_fn,
 		.on_unlock	= (void *)kvm_null_fn,
@@ -693,6 +694,7 @@ static void kvm_mmu_notifier_change_pte(struct mmu_notifier *mn,
 					pte_t pte)
 {
 	struct kvm *kvm = mmu_notifier_to_kvm(mn);
+	const union kvm_mmu_notifier_arg arg = { .pte = pte };
 
 	trace_kvm_set_spte_hva(address);
 
@@ -708,7 +710,7 @@ static void kvm_mmu_notifier_change_pte(struct mmu_notifier *mn,
 	if (!READ_ONCE(kvm->mmu_invalidate_in_progress))
 		return;
 
-	kvm_handle_hva_range(mn, address, address + 1, pte, kvm_change_spte_gfn);
+	kvm_handle_hva_range(mn, address, address + 1, arg, kvm_change_spte_gfn);
 }
 
 void kvm_mmu_invalidate_begin(struct kvm *kvm, unsigned long start,
@@ -747,7 +749,6 @@ static int kvm_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn,
 	const struct kvm_hva_range hva_range = {
 		.start		= range->start,
 		.end		= range->end,
-		.pte		= __pte(0),
 		.handler	= kvm_unmap_gfn_range,
 		.on_lock	= kvm_mmu_invalidate_begin,
 		.on_unlock	= kvm_arch_guest_memory_reclaimed,
@@ -812,7 +813,6 @@ static void kvm_mmu_notifier_invalidate_range_end(struct mmu_notifier *mn,
 	const struct kvm_hva_range hva_range = {
 		.start		= range->start,
 		.end		= range->end,
-		.pte		= __pte(0),
 		.handler	= (void *)kvm_null_fn,
 		.on_lock	= kvm_mmu_invalidate_end,
 		.on_unlock	= (void *)kvm_null_fn,
@@ -845,7 +845,8 @@ static int kvm_mmu_notifier_clear_flush_young(struct mmu_notifier *mn,
 {
 	trace_kvm_age_hva(start, end);
 
-	return kvm_handle_hva_range(mn, start, end, __pte(0), kvm_age_gfn);
+	return kvm_handle_hva_range(mn, start, end, KVM_MMU_NOTIFIER_NO_ARG,
+				    kvm_age_gfn);
 }
 
 static int kvm_mmu_notifier_clear_young(struct mmu_notifier *mn,
-- 
cgit v1.2.3


From 1f8403953f05af591ab72cf749b9b9b837ea9595 Mon Sep 17 00:00:00 2001
From: Yue Haibing <yuehaibing@huawei.com>
Date: Mon, 14 Aug 2023 22:03:39 +0800
Subject: KVM: Remove unused kvm_device_{get,put}() declarations

Commit 07f0a7bdec5c ("kvm: destroy emulated devices on VM exit") removed the
functions but not these declarations.

Signed-off-by: Yue Haibing <yuehaibing@huawei.com>
Link: https://lore.kernel.org/r/20230814140339.47732-1-yuehaibing@huawei.com
[sean: split to separate patch]
Signed-off-by: Sean Christopherson <seanjc@google.com>
---
 include/linux/kvm_host.h | 2 --
 1 file changed, 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 9125d0ab642d..a973a7cb45e0 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -2152,8 +2152,6 @@ struct kvm_device_ops {
 	int (*mmap)(struct kvm_device *dev, struct vm_area_struct *vma);
 };
 
-void kvm_device_get(struct kvm_device *dev);
-void kvm_device_put(struct kvm_device *dev);
 struct kvm_device *kvm_device_from_filp(struct file *filp);
 int kvm_register_device_ops(const struct kvm_device_ops *ops, u32 type);
 void kvm_unregister_device_ops(u32 type);
-- 
cgit v1.2.3


From 458933d33af2cb3663bd8c0080c1efd1f9483db4 Mon Sep 17 00:00:00 2001
From: Yue Haibing <yuehaibing@huawei.com>
Date: Mon, 14 Aug 2023 22:03:39 +0800
Subject: KVM: Remove unused kvm_make_cpus_request_mask() declaration

Commit 7ee30bc132c6 ("KVM: x86: deliver KVM IOAPIC scan request to target
vCPUs") declared but never implemented kvm_make_cpus_request_mask() as
kvm_make_vcpus_request_mask() already existed.

Note, KVM's APIs are painfully inconsistent, as the inclusive variant uses
"vcpus", whereas the exclusive/all variants use "cpus", which is likely
what led to the spurious declaration.  The "vcpus" terminology is more
correct, especially since the helpers will kick _physical_ CPUs by calling
kvm_kick_many_cpus().  But that's a cleanup for the future.

Signed-off-by: Yue Haibing <yuehaibing@huawei.com>
Link: https://lore.kernel.org/r/20230814140339.47732-1-yuehaibing@huawei.com
[sean: split to separate patch, call out inconsistent naming]
Signed-off-by: Sean Christopherson <seanjc@google.com>
---
 include/linux/kvm_host.h | 2 --
 1 file changed, 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index a973a7cb45e0..199698a5ffa6 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -190,8 +190,6 @@ bool kvm_make_vcpus_request_mask(struct kvm *kvm, unsigned int req,
 bool kvm_make_all_cpus_request(struct kvm *kvm, unsigned int req);
 bool kvm_make_all_cpus_request_except(struct kvm *kvm, unsigned int req,
 				      struct kvm_vcpu *except);
-bool kvm_make_cpus_request_mask(struct kvm *kvm, unsigned int req,
-				unsigned long *vcpu_bitmap);
 
 #define KVM_USERSPACE_IRQ_SOURCE_ID		0
 #define KVM_IRQFD_RESAMPLE_IRQ_SOURCE_ID	1
-- 
cgit v1.2.3


From 2459f4dfe5529f8b847f452473e2da08a8fc9fe5 Mon Sep 17 00:00:00 2001
From: Yangtao Li <frank.li@vivo.com>
Date: Thu, 6 Jul 2023 19:39:39 +0800
Subject: mfd: hi655x-pmic: Convert to devm_platform_ioremap_resource()

Use devm_platform_ioremap_resource() to simplify code.

Signed-off-by: Yangtao Li <frank.li@vivo.com>
Link: https://lore.kernel.org/r/20230706113939.1178-7-frank.li@vivo.com
Signed-off-by: Lee Jones <lee@kernel.org>
---
 drivers/mfd/hi655x-pmic.c       | 3 +--
 include/linux/mfd/hi655x-pmic.h | 1 -
 2 files changed, 1 insertion(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/mfd/hi655x-pmic.c b/drivers/mfd/hi655x-pmic.c
index a58e42ddcd0c..98ae40ee3f05 100644
--- a/drivers/mfd/hi655x-pmic.c
+++ b/drivers/mfd/hi655x-pmic.c
@@ -100,8 +100,7 @@ static int hi655x_pmic_probe(struct platform_device *pdev)
 		return -ENOMEM;
 	pmic->dev = dev;
 
-	pmic->res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
-	base = devm_ioremap_resource(dev, pmic->res);
+	base = devm_platform_ioremap_resource(pdev, 0);
 	if (IS_ERR(base))
 		return PTR_ERR(base);
 
diff --git a/include/linux/mfd/hi655x-pmic.h b/include/linux/mfd/hi655x-pmic.h
index 6a012784dd1b..194556851ccf 100644
--- a/include/linux/mfd/hi655x-pmic.h
+++ b/include/linux/mfd/hi655x-pmic.h
@@ -52,7 +52,6 @@
 #define OTMP_D1R_INT_MASK               BIT(OTMP_D1R_INT)
 
 struct hi655x_pmic {
-	struct resource *res;
 	struct device *dev;
 	struct regmap *regmap;
 	struct gpio_desc *gpio;
-- 
cgit v1.2.3


From 2dfe293bcde2d302c045d0cd536ccb15f86385d2 Mon Sep 17 00:00:00 2001
From: YueHaibing <yuehaibing@huawei.com>
Date: Thu, 20 Jul 2023 22:37:38 +0800
Subject: mfd: db8500-prcmu: Remove unused inline functions

Since commit b0e846248de5 ("mfd: db8500-prcmu: Remove dead code for a non-existing config")
these inline helpers also no need any more.

Signed-off-by: YueHaibing <yuehaibing@huawei.com>
Link: https://lore.kernel.org/r/20230720143738.13996-1-yuehaibing@huawei.com
Signed-off-by: Lee Jones <lee@kernel.org>
---
 include/linux/mfd/dbx500-prcmu.h | 21 ---------------------
 1 file changed, 21 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mfd/dbx500-prcmu.h b/include/linux/mfd/dbx500-prcmu.h
index e7a7e70fdb38..dd0fc891b228 100644
--- a/include/linux/mfd/dbx500-prcmu.h
+++ b/include/linux/mfd/dbx500-prcmu.h
@@ -556,16 +556,6 @@ static inline void prcmu_clear(unsigned int reg, u32 bits)
 #define PRCMU_QOS_ARM_OPP 3
 #define PRCMU_QOS_DEFAULT_VALUE -1
 
-static inline unsigned long prcmu_qos_get_cpufreq_opp_delay(void)
-{
-	return 0;
-}
-
-static inline int prcmu_qos_requirement(int prcmu_qos_class)
-{
-	return 0;
-}
-
 static inline int prcmu_qos_add_requirement(int prcmu_qos_class,
 					    char *name, s32 value)
 {
@@ -582,15 +572,4 @@ static inline void prcmu_qos_remove_requirement(int prcmu_qos_class, char *name)
 {
 }
 
-static inline int prcmu_qos_add_notifier(int prcmu_qos_class,
-					 struct notifier_block *notifier)
-{
-	return 0;
-}
-static inline int prcmu_qos_remove_notifier(int prcmu_qos_class,
-					    struct notifier_block *notifier)
-{
-	return 0;
-}
-
 #endif /* __MACH_PRCMU_H */
-- 
cgit v1.2.3


From 10d3340441bd0db857fc7fcb1733a800acf47a3d Mon Sep 17 00:00:00 2001
From: Arnd Bergmann <arnd@arndb.de>
Date: Wed, 19 Jul 2023 11:02:23 +0200
Subject: mfd: rz-mtu3: Link time dependencies

The new set of drivers for RZ/G2L MTU3a tries to enable compile-testing the
individual client drivers even when the MFD portion is disabled but gets it
wrong, causing a link failure when the core is in a loadable module but the
other drivers are built-in:

x86_64-linux-ld: drivers/pwm/pwm-rz-mtu3.o: in function `rz_mtu3_pwm_apply':
pwm-rz-mtu3.c:(.text+0x4bf): undefined reference to `rz_mtu3_8bit_ch_write'
x86_64-linux-ld: pwm-rz-mtu3.c:(.text+0x509): undefined reference to `rz_mtu3_disable'

arm-linux-gnueabi-ld: drivers/counter/rz-mtu3-cnt.o: in function `rz_mtu3_cascade_counts_enable_get':
rz-mtu3-cnt.c:(.text+0xbec): undefined reference to `rz_mtu3_shared_reg_read'

It seems better not to add the extra complexity here but instead just use
a normal hard dependency, so remove the #else portion in the header along
with the "|| COMPILE_TEST". This could also be fixed by having slightly more
elaborate Kconfig dependencies or using the cursed 'IS_REACHABLE()' helper,
but in practice it's already possible to compile-test all these drivers
by enabling the mtd portion.

Fixes: 254d3a727421c ("pwm: Add Renesas RZ/G2L MTU3a PWM driver")
Fixes: 0be8907359df4 ("counter: Add Renesas RZ/G2L MTU3a counter driver")
Fixes: 654c293e1687b ("mfd: Add Renesas RZ/G2L MTU3a core driver")
Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Acked-by: Thierry Reding <thierry.reding@gmail.com>
Reviewed-by: Biju Das <biju.das.jz@bp.renesas.com>
Link: https://lore.kernel.org/r/20230719090430.1925182-1-arnd@kernel.org
Signed-off-by: Lee Jones <lee@kernel.org>
---
 drivers/counter/Kconfig     |  2 +-
 drivers/pwm/Kconfig         |  2 +-
 include/linux/mfd/rz-mtu3.h | 66 ---------------------------------------------
 3 files changed, 2 insertions(+), 68 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/counter/Kconfig b/drivers/counter/Kconfig
index bca21df51168..62f118173355 100644
--- a/drivers/counter/Kconfig
+++ b/drivers/counter/Kconfig
@@ -92,7 +92,7 @@ config MICROCHIP_TCB_CAPTURE
 
 config RZ_MTU3_CNT
 	tristate "Renesas RZ/G2L MTU3a counter driver"
-	depends on RZ_MTU3 || COMPILE_TEST
+	depends on RZ_MTU3
 	help
 	  Enable support for MTU3a counter driver found on Renesas RZ/G2L alike
 	  SoCs. This IP supports both 16-bit and 32-bit phase counting mode
diff --git a/drivers/pwm/Kconfig b/drivers/pwm/Kconfig
index 6210babb0741..8ebcddf91f7b 100644
--- a/drivers/pwm/Kconfig
+++ b/drivers/pwm/Kconfig
@@ -505,7 +505,7 @@ config PWM_ROCKCHIP
 
 config PWM_RZ_MTU3
 	tristate "Renesas RZ/G2L MTU3a PWM Timer support"
-	depends on RZ_MTU3 || COMPILE_TEST
+	depends on RZ_MTU3
 	depends on HAS_IOMEM
 	help
 	  This driver exposes the MTU3a PWM Timer controller found in Renesas
diff --git a/include/linux/mfd/rz-mtu3.h b/include/linux/mfd/rz-mtu3.h
index c5173bc06270..8421d49500bf 100644
--- a/include/linux/mfd/rz-mtu3.h
+++ b/include/linux/mfd/rz-mtu3.h
@@ -151,7 +151,6 @@ struct rz_mtu3 {
 	void *priv_data;
 };
 
-#if IS_ENABLED(CONFIG_RZ_MTU3)
 static inline bool rz_mtu3_request_channel(struct rz_mtu3_channel *ch)
 {
 	mutex_lock(&ch->lock);
@@ -188,70 +187,5 @@ void rz_mtu3_32bit_ch_write(struct rz_mtu3_channel *ch, u16 off, u32 val);
 void rz_mtu3_shared_reg_write(struct rz_mtu3_channel *ch, u16 off, u16 val);
 void rz_mtu3_shared_reg_update_bit(struct rz_mtu3_channel *ch, u16 off,
 				   u16 pos, u8 val);
-#else
-static inline bool rz_mtu3_request_channel(struct rz_mtu3_channel *ch)
-{
-	return false;
-}
-
-static inline void rz_mtu3_release_channel(struct rz_mtu3_channel *ch)
-{
-}
-
-static inline bool rz_mtu3_is_enabled(struct rz_mtu3_channel *ch)
-{
-	return false;
-}
-
-static inline void rz_mtu3_disable(struct rz_mtu3_channel *ch)
-{
-}
-
-static inline int rz_mtu3_enable(struct rz_mtu3_channel *ch)
-{
-	return 0;
-}
-
-static inline u8 rz_mtu3_8bit_ch_read(struct rz_mtu3_channel *ch, u16 off)
-{
-	return 0;
-}
-
-static inline u16 rz_mtu3_16bit_ch_read(struct rz_mtu3_channel *ch, u16 off)
-{
-	return 0;
-}
-
-static inline u32 rz_mtu3_32bit_ch_read(struct rz_mtu3_channel *ch, u16 off)
-{
-	return 0;
-}
-
-static inline u16 rz_mtu3_shared_reg_read(struct rz_mtu3_channel *ch, u16 off)
-{
-	return 0;
-}
-
-static inline void rz_mtu3_8bit_ch_write(struct rz_mtu3_channel *ch, u16 off, u8 val)
-{
-}
-
-static inline void rz_mtu3_16bit_ch_write(struct rz_mtu3_channel *ch, u16 off, u16 val)
-{
-}
-
-static inline void rz_mtu3_32bit_ch_write(struct rz_mtu3_channel *ch, u16 off, u32 val)
-{
-}
-
-static inline void rz_mtu3_shared_reg_write(struct rz_mtu3_channel *ch, u16 off, u16 val)
-{
-}
-
-static inline void rz_mtu3_shared_reg_update_bit(struct rz_mtu3_channel *ch,
-						 u16 off, u16 pos, u8 val)
-{
-}
-#endif
 
 #endif /* __MFD_RZ_MTU3_H__ */
-- 
cgit v1.2.3


From e0d77323824054f15f3137e890f7addc48198a3e Mon Sep 17 00:00:00 2001
From: Yue Haibing <yuehaibing@huawei.com>
Date: Fri, 28 Jul 2023 21:27:09 +0800
Subject: mfd: max77686: Remove unused extern declarations

max77686_irq_init() and max77686_irq_exit() are not used since
commit 6f1c1e71d933 ("mfd: max77686: Convert to use regmap_irq").
And max77686_irq_resume() never be implemented since introduced in
commit dae8a969d512 ("mfd: Add Maxim 77686 driver").

Signed-off-by: Yue Haibing <yuehaibing@huawei.com>
Reviewed-by: Krzysztof Kozlowski <krzysztof.kozlowski@linaro.org>
Reviewed-by: Chanwoo Choi <cw00.choi@samsung.com>
Link: https://lore.kernel.org/r/20230728132709.27052-1-yuehaibing@huawei.com
Signed-off-by: Lee Jones <lee@kernel.org>
---
 include/linux/mfd/max77686-private.h | 4 ----
 1 file changed, 4 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mfd/max77686-private.h b/include/linux/mfd/max77686-private.h
index 3acceeedbaba..ea635d12a741 100644
--- a/include/linux/mfd/max77686-private.h
+++ b/include/linux/mfd/max77686-private.h
@@ -441,8 +441,4 @@ enum max77686_types {
 	TYPE_MAX77802,
 };
 
-extern int max77686_irq_init(struct max77686_dev *max77686);
-extern void max77686_irq_exit(struct max77686_dev *max77686);
-extern int max77686_irq_resume(struct max77686_dev *max77686);
-
 #endif /*  __LINUX_MFD_MAX77686_PRIV_H */
-- 
cgit v1.2.3


From 733e2e9a28e6fa109e51e0b77901552f69df0ef1 Mon Sep 17 00:00:00 2001
From: Yue Haibing <yuehaibing@huawei.com>
Date: Fri, 28 Jul 2023 21:24:39 +0800
Subject: mfd: ab8500: Remove unused extern declarations

commit d28f1db8187d ("mfd: Remove confusing ab8500-i2c file and merge into ab8500-core")
left behind this.

Signed-off-by: Yue Haibing <yuehaibing@huawei.com>
Link: https://lore.kernel.org/r/20230728132439.31568-1-yuehaibing@huawei.com
Signed-off-by: Lee Jones <lee@kernel.org>
---
 include/linux/mfd/abx500/ab8500.h | 4 ----
 1 file changed, 4 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mfd/abx500/ab8500.h b/include/linux/mfd/abx500/ab8500.h
index 302a330c5c84..09fb3c56e7d7 100644
--- a/include/linux/mfd/abx500/ab8500.h
+++ b/include/linux/mfd/abx500/ab8500.h
@@ -382,10 +382,6 @@ struct ab8500_platform_data {
 	struct ab8500_sysctrl_platform_data *sysctrl;
 };
 
-extern int ab8500_init(struct ab8500 *ab8500,
-				 enum ab8500_version version);
-extern int ab8500_exit(struct ab8500 *ab8500);
-
 extern int ab8500_suspend(struct ab8500 *ab8500);
 
 static inline int is_ab8500(struct ab8500 *ab)
-- 
cgit v1.2.3


From 54ab43a957bcb2643c13df5ab71de9dc3f72e5a6 Mon Sep 17 00:00:00 2001
From: Yue Haibing <yuehaibing@huawei.com>
Date: Fri, 28 Jul 2023 21:28:41 +0800
Subject: mfd: 88pm860x: Remove unused extern declarations

commit 260a127bfbeb ("mfd: 88pm860x-i2c: Purge unused functions")
left behind this.

Signed-off-by: Yue Haibing <yuehaibing@huawei.com>
Link: https://lore.kernel.org/r/20230728132841.10648-1-yuehaibing@huawei.com
Signed-off-by: Lee Jones <lee@kernel.org>
---
 include/linux/mfd/88pm860x.h | 6 ------
 1 file changed, 6 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mfd/88pm860x.h b/include/linux/mfd/88pm860x.h
index 473545a2c425..6fa21791fc85 100644
--- a/include/linux/mfd/88pm860x.h
+++ b/include/linux/mfd/88pm860x.h
@@ -472,13 +472,7 @@ extern int pm860x_bulk_read(struct i2c_client *, int, int, unsigned char *);
 extern int pm860x_bulk_write(struct i2c_client *, int, int, unsigned char *);
 extern int pm860x_set_bits(struct i2c_client *, int, unsigned char,
 			   unsigned char);
-extern int pm860x_page_reg_read(struct i2c_client *, int);
 extern int pm860x_page_reg_write(struct i2c_client *, int, unsigned char);
 extern int pm860x_page_bulk_read(struct i2c_client *, int, int,
 				 unsigned char *);
-extern int pm860x_page_bulk_write(struct i2c_client *, int, int,
-				  unsigned char *);
-extern int pm860x_page_set_bits(struct i2c_client *, int, unsigned char,
-				unsigned char);
-
 #endif /* __LINUX_MFD_88PM860X_H */
-- 
cgit v1.2.3


From 875386b98857822b77ac7f95bdf367b70af5b78c Mon Sep 17 00:00:00 2001
From: Manish Rangankar <mrangankar@marvell.com>
Date: Mon, 21 Aug 2023 18:30:37 +0530
Subject: scsi: qla2xxx: Add Unsolicited LS Request and Response Support for
 NVMe

Introduce infrastructure in the driver to support the processing of
unsolicited LS (Link Service) requests. This will involve the utilization
of a new pass-up of unsolicited FC-NVMe request IOCB interface. Unsolicited
requests will be submitted to the NVMe transport layer through
nvme_fc_rcv_ls_req(). Any received LS responses, which are sent using
xmt_ls_rsp(), will be forwarded to the firmware through the existing
Pass-Through IOCB interface, responsible for sending FC-NVMe Link Service
requests and responses.

Signed-off-by: Manish Rangankar <mrangankar@marvell.com>
Signed-off-by: Nilesh Javali <njavali@marvell.com>
Link: https://lore.kernel.org/r/20230821130045.34850-2-njavali@marvell.com
Reviewed-by: Himanshu Madhani <himanshu.madhani@oracle.com>
Signed-off-by: Martin K. Petersen <martin.petersen@oracle.com>
---
 drivers/scsi/qla2xxx/qla_dbg.c  |   5 +-
 drivers/scsi/qla2xxx/qla_dbg.h  |   1 +
 drivers/scsi/qla2xxx/qla_def.h  |  34 +++-
 drivers/scsi/qla2xxx/qla_gbl.h  |  14 +-
 drivers/scsi/qla2xxx/qla_init.c |   1 +
 drivers/scsi/qla2xxx/qla_iocb.c |  27 ++-
 drivers/scsi/qla2xxx/qla_isr.c  | 146 ++++++++++++++-
 drivers/scsi/qla2xxx/qla_nvme.c | 401 +++++++++++++++++++++++++++++++++++++++-
 drivers/scsi/qla2xxx/qla_nvme.h |  17 +-
 drivers/scsi/qla2xxx/qla_os.c   |  24 ++-
 include/linux/nvme-fc-driver.h  |   6 +-
 11 files changed, 642 insertions(+), 34 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/scsi/qla2xxx/qla_dbg.c b/drivers/scsi/qla2xxx/qla_dbg.c
index d7e8454304ce..4d104425146b 100644
--- a/drivers/scsi/qla2xxx/qla_dbg.c
+++ b/drivers/scsi/qla2xxx/qla_dbg.c
@@ -12,9 +12,8 @@
  * ----------------------------------------------------------------------
  * | Module Init and Probe        |       0x0199       |                |
  * | Mailbox commands             |       0x1206       | 0x11a5-0x11ff	|
- * | Device Discovery             |       0x2134       | 0x210e-0x2115  |
- * |                              |                    | 0x211c-0x2128  |
- * |                              |                    | 0x212c-0x2134  |
+ * | Device Discovery             |       0x2134       | 0x2112-0x2115  |
+ * |                              |                    | 0x2127-0x2128  |
  * | Queue Command and IO tracing |       0x3074       | 0x300b         |
  * |                              |                    | 0x3027-0x3028  |
  * |                              |                    | 0x303d-0x3041  |
diff --git a/drivers/scsi/qla2xxx/qla_dbg.h b/drivers/scsi/qla2xxx/qla_dbg.h
index 70482b55d240..54f0a412226f 100644
--- a/drivers/scsi/qla2xxx/qla_dbg.h
+++ b/drivers/scsi/qla2xxx/qla_dbg.h
@@ -368,6 +368,7 @@ ql_log_qp(uint32_t, struct qla_qpair *, int32_t, const char *fmt, ...);
 #define ql_dbg_tgt_tmr	0x00001000 /* Target mode task management */
 #define ql_dbg_tgt_dif  0x00000800 /* Target mode dif */
 #define ql_dbg_edif	0x00000400 /* edif and purex debug */
+#define ql_dbg_unsol	0x00000100 /* Unsolicited path debug */
 
 extern int qla27xx_dump_mpi_ram(struct qla_hw_data *, uint32_t, uint32_t *,
 	uint32_t, void **);
diff --git a/drivers/scsi/qla2xxx/qla_def.h b/drivers/scsi/qla2xxx/qla_def.h
index 97ecb2158c8a..2007d5bb5f9f 100644
--- a/drivers/scsi/qla2xxx/qla_def.h
+++ b/drivers/scsi/qla2xxx/qla_def.h
@@ -346,6 +346,12 @@ struct name_list_extended {
 	u8			sent;
 };
 
+struct qla_nvme_fc_rjt {
+	struct fcnvme_ls_rjt *c;
+	dma_addr_t  cdma;
+	u16 size;
+};
+
 struct els_reject {
 	struct fc_els_ls_rjt *c;
 	dma_addr_t  cdma;
@@ -503,6 +509,20 @@ struct ct_arg {
 	port_id_t	id;
 };
 
+struct qla_nvme_lsrjt_pt_arg {
+	struct fc_port *fcport;
+	u8 opcode;
+	u8 vp_idx;
+	u8 reason;
+	u8 explanation;
+	__le16 nport_handle;
+	u16 control_flags;
+	__le16 ox_id;
+	__le32 xchg_address;
+	u32 tx_byte_count, rx_byte_count;
+	dma_addr_t tx_addr, rx_addr;
+};
+
 /*
  * SRB extensions.
  */
@@ -611,13 +631,16 @@ struct srb_iocb {
 			void *desc;
 
 			/* These are only used with ls4 requests */
-			int cmd_len;
-			int rsp_len;
+			__le32 cmd_len;
+			__le32 rsp_len;
 			dma_addr_t cmd_dma;
 			dma_addr_t rsp_dma;
 			enum nvmefc_fcp_datadir dir;
 			uint32_t dl;
 			uint32_t timeout_sec;
+			__le32 exchange_address;
+			__le16 nport_handle;
+			__le16 ox_id;
 			struct	list_head   entry;
 		} nvme;
 		struct {
@@ -707,6 +730,10 @@ typedef struct srb {
 	struct fc_port *fcport;
 	struct scsi_qla_host *vha;
 	unsigned int start_timer:1;
+	unsigned int abort:1;
+	unsigned int aborted:1;
+	unsigned int completed:1;
+	unsigned int unsol_rsp:1;
 
 	uint32_t handle;
 	uint16_t flags;
@@ -2542,6 +2569,7 @@ enum rscn_addr_format {
 typedef struct fc_port {
 	struct list_head list;
 	struct scsi_qla_host *vha;
+	struct list_head unsol_ctx_head;
 
 	unsigned int conf_compl_supported:1;
 	unsigned int deleted:2;
@@ -4802,6 +4830,7 @@ struct qla_hw_data {
 	struct els_reject elsrej;
 	u8 edif_post_stop_cnt_down;
 	struct qla_vp_map *vp_map;
+	struct qla_nvme_fc_rjt lsrjt;
 };
 
 #define RX_ELS_SIZE (roundup(sizeof(struct enode) + ELS_MAX_PAYLOAD, SMP_CACHE_BYTES))
@@ -4834,6 +4863,7 @@ struct active_regions {
  * is variable) starting at "iocb".
  */
 struct purex_item {
+	void *purls_context;
 	struct list_head list;
 	struct scsi_qla_host *vha;
 	void (*process_item)(struct scsi_qla_host *vha,
diff --git a/drivers/scsi/qla2xxx/qla_gbl.h b/drivers/scsi/qla2xxx/qla_gbl.h
index 33fba9d62969..911e9adf41d3 100644
--- a/drivers/scsi/qla2xxx/qla_gbl.h
+++ b/drivers/scsi/qla2xxx/qla_gbl.h
@@ -611,7 +611,11 @@ qla2xxx_msix_rsp_q_hs(int irq, void *dev_id);
 fc_port_t *qla2x00_find_fcport_by_loopid(scsi_qla_host_t *, uint16_t);
 fc_port_t *qla2x00_find_fcport_by_wwpn(scsi_qla_host_t *, u8 *, u8);
 fc_port_t *qla2x00_find_fcport_by_nportid(scsi_qla_host_t *, port_id_t *, u8);
-void __qla_consume_iocb(struct scsi_qla_host *vha, void **pkt, struct rsp_que **rsp);
+void qla24xx_queue_purex_item(scsi_qla_host_t *, struct purex_item *,
+			      void (*process_item)(struct scsi_qla_host *,
+			      struct purex_item *));
+void __qla_consume_iocb(struct scsi_qla_host *, void **, struct rsp_que **);
+void qla2xxx_process_purls_iocb(void **pkt, struct rsp_que **rsp);
 
 /*
  * Global Function Prototypes in qla_sup.c source file.
@@ -674,9 +678,11 @@ extern int qla2xxx_get_vpd_field(scsi_qla_host_t *, char *, char *, size_t);
 extern void qla2xxx_flash_npiv_conf(scsi_qla_host_t *);
 extern int qla24xx_read_fcp_prio_cfg(scsi_qla_host_t *);
 extern int qla2x00_mailbox_passthru(struct bsg_job *bsg_job);
-int __qla_copy_purex_to_buffer(struct scsi_qla_host *vha, void **pkt,
-	struct rsp_que **rsp, u8 *buf, u32 buf_len);
-
+int qla2x00_sys_ld_info(struct bsg_job *bsg_job);
+int __qla_copy_purex_to_buffer(struct scsi_qla_host *, void **,
+	struct rsp_que **, u8 *, u32);
+struct purex_item *qla27xx_copy_multiple_pkt(struct scsi_qla_host *vha,
+	void **pkt, struct rsp_que **rsp, bool is_purls, bool byte_order);
 int qla_mailbox_passthru(scsi_qla_host_t *vha, uint16_t *mbx_in,
 			 uint16_t *mbx_out);
 
diff --git a/drivers/scsi/qla2xxx/qla_init.c b/drivers/scsi/qla2xxx/qla_init.c
index 53e76bbee915..27644957ae4e 100644
--- a/drivers/scsi/qla2xxx/qla_init.c
+++ b/drivers/scsi/qla2xxx/qla_init.c
@@ -5554,6 +5554,7 @@ qla2x00_alloc_fcport(scsi_qla_host_t *vha, gfp_t flags)
 	INIT_WORK(&fcport->reg_work, qla_register_fcport_fn);
 	INIT_LIST_HEAD(&fcport->gnl_entry);
 	INIT_LIST_HEAD(&fcport->list);
+	INIT_LIST_HEAD(&fcport->unsol_ctx_head);
 
 	INIT_LIST_HEAD(&fcport->sess_cmd_list);
 	spin_lock_init(&fcport->sess_cmd_lock);
diff --git a/drivers/scsi/qla2xxx/qla_iocb.c b/drivers/scsi/qla2xxx/qla_iocb.c
index d0b12c746a8a..7fbd917f6e1f 100644
--- a/drivers/scsi/qla2xxx/qla_iocb.c
+++ b/drivers/scsi/qla2xxx/qla_iocb.c
@@ -3771,21 +3771,28 @@ qla_nvme_ls(srb_t *sp, struct pt_ls4_request *cmd_pkt)
 	nvme = &sp->u.iocb_cmd;
 	cmd_pkt->entry_type = PT_LS4_REQUEST;
 	cmd_pkt->entry_count = 1;
-	cmd_pkt->control_flags = cpu_to_le16(CF_LS4_ORIGINATOR << CF_LS4_SHIFT);
-
 	cmd_pkt->timeout = cpu_to_le16(nvme->u.nvme.timeout_sec);
-	cmd_pkt->nport_handle = cpu_to_le16(sp->fcport->loop_id);
 	cmd_pkt->vp_index = sp->fcport->vha->vp_idx;
 
+	if (sp->unsol_rsp) {
+		cmd_pkt->control_flags =
+				cpu_to_le16(CF_LS4_RESPONDER << CF_LS4_SHIFT);
+		cmd_pkt->nport_handle = nvme->u.nvme.nport_handle;
+		cmd_pkt->exchange_address = nvme->u.nvme.exchange_address;
+	} else {
+		cmd_pkt->control_flags =
+				cpu_to_le16(CF_LS4_ORIGINATOR << CF_LS4_SHIFT);
+		cmd_pkt->nport_handle = cpu_to_le16(sp->fcport->loop_id);
+		cmd_pkt->rx_dseg_count = cpu_to_le16(1);
+		cmd_pkt->rx_byte_count = nvme->u.nvme.rsp_len;
+		cmd_pkt->dsd[1].length  = nvme->u.nvme.rsp_len;
+		put_unaligned_le64(nvme->u.nvme.rsp_dma, &cmd_pkt->dsd[1].address);
+	}
+
 	cmd_pkt->tx_dseg_count = cpu_to_le16(1);
-	cmd_pkt->tx_byte_count = cpu_to_le32(nvme->u.nvme.cmd_len);
-	cmd_pkt->dsd[0].length = cpu_to_le32(nvme->u.nvme.cmd_len);
+	cmd_pkt->tx_byte_count = nvme->u.nvme.cmd_len;
+	cmd_pkt->dsd[0].length = nvme->u.nvme.cmd_len;
 	put_unaligned_le64(nvme->u.nvme.cmd_dma, &cmd_pkt->dsd[0].address);
-
-	cmd_pkt->rx_dseg_count = cpu_to_le16(1);
-	cmd_pkt->rx_byte_count = cpu_to_le32(nvme->u.nvme.rsp_len);
-	cmd_pkt->dsd[1].length = cpu_to_le32(nvme->u.nvme.rsp_len);
-	put_unaligned_le64(nvme->u.nvme.rsp_dma, &cmd_pkt->dsd[1].address);
 }
 
 static void
diff --git a/drivers/scsi/qla2xxx/qla_isr.c b/drivers/scsi/qla2xxx/qla_isr.c
index 1f42a413b598..867025c89909 100644
--- a/drivers/scsi/qla2xxx/qla_isr.c
+++ b/drivers/scsi/qla2xxx/qla_isr.c
@@ -823,6 +823,135 @@ qla83xx_handle_8200_aen(scsi_qla_host_t *vha, uint16_t *mb)
 	}
 }
 
+/**
+ * qla27xx_copy_multiple_pkt() - Copy over purex/purls packets that can
+ * span over multiple IOCBs.
+ * @vha: SCSI driver HA context
+ * @pkt: ELS packet
+ * @rsp: Response queue
+ * @is_purls: True, for Unsolicited Received FC-NVMe LS rsp IOCB
+ *            false, for Unsolicited Received ELS IOCB
+ * @byte_order: True, to change the byte ordering of iocb payload
+ */
+struct purex_item *
+qla27xx_copy_multiple_pkt(struct scsi_qla_host *vha, void **pkt,
+			  struct rsp_que **rsp, bool is_purls,
+			  bool byte_order)
+{
+	struct purex_entry_24xx *purex = NULL;
+	struct pt_ls4_rx_unsol *purls = NULL;
+	struct rsp_que *rsp_q = *rsp;
+	sts_cont_entry_t *new_pkt;
+	uint16_t no_bytes = 0, total_bytes = 0, pending_bytes = 0;
+	uint16_t buffer_copy_offset = 0, payload_size = 0;
+	uint16_t entry_count, entry_count_remaining;
+	struct purex_item *item;
+	void *iocb_pkt = NULL;
+
+	if (is_purls) {
+		purls = *pkt;
+		total_bytes = (le16_to_cpu(purls->frame_size) & 0x0FFF) -
+			      PURX_ELS_HEADER_SIZE;
+		entry_count = entry_count_remaining = purls->entry_count;
+		payload_size = sizeof(purls->payload);
+	} else {
+		purex = *pkt;
+		total_bytes = (le16_to_cpu(purex->frame_size) & 0x0FFF) -
+			      PURX_ELS_HEADER_SIZE;
+		entry_count = entry_count_remaining = purex->entry_count;
+		payload_size = sizeof(purex->els_frame_payload);
+	}
+
+	pending_bytes = total_bytes;
+	no_bytes = (pending_bytes > payload_size) ? payload_size :
+		   pending_bytes;
+	ql_dbg(ql_dbg_async, vha, 0x509a,
+	       "%s LS, frame_size 0x%x, entry count %d\n",
+	       (is_purls ? "PURLS" : "FPIN"), total_bytes, entry_count);
+
+	item = qla24xx_alloc_purex_item(vha, total_bytes);
+	if (!item)
+		return item;
+
+	iocb_pkt = &item->iocb;
+
+	if (is_purls)
+		memcpy(iocb_pkt, &purls->payload[0], no_bytes);
+	else
+		memcpy(iocb_pkt, &purex->els_frame_payload[0], no_bytes);
+	buffer_copy_offset += no_bytes;
+	pending_bytes -= no_bytes;
+	--entry_count_remaining;
+
+	if (is_purls)
+		((response_t *)purls)->signature = RESPONSE_PROCESSED;
+	else
+		((response_t *)purex)->signature = RESPONSE_PROCESSED;
+	wmb();
+
+	do {
+		while ((total_bytes > 0) && (entry_count_remaining > 0)) {
+			if (rsp_q->ring_ptr->signature == RESPONSE_PROCESSED) {
+				ql_dbg(ql_dbg_async, vha, 0x5084,
+				       "Ran out of IOCBs, partial data 0x%x\n",
+				       buffer_copy_offset);
+				cpu_relax();
+				continue;
+			}
+
+			new_pkt = (sts_cont_entry_t *)rsp_q->ring_ptr;
+			*pkt = new_pkt;
+
+			if (new_pkt->entry_type != STATUS_CONT_TYPE) {
+				ql_log(ql_log_warn, vha, 0x507a,
+				       "Unexpected IOCB type, partial data 0x%x\n",
+				       buffer_copy_offset);
+				break;
+			}
+
+			rsp_q->ring_index++;
+			if (rsp_q->ring_index == rsp_q->length) {
+				rsp_q->ring_index = 0;
+				rsp_q->ring_ptr = rsp_q->ring;
+			} else {
+				rsp_q->ring_ptr++;
+			}
+			no_bytes = (pending_bytes > sizeof(new_pkt->data)) ?
+				sizeof(new_pkt->data) : pending_bytes;
+			if ((buffer_copy_offset + no_bytes) <= total_bytes) {
+				memcpy(((uint8_t *)iocb_pkt + buffer_copy_offset),
+				       new_pkt->data, no_bytes);
+				buffer_copy_offset += no_bytes;
+				pending_bytes -= no_bytes;
+				--entry_count_remaining;
+			} else {
+				ql_log(ql_log_warn, vha, 0x5044,
+				       "Attempt to copy more that we got, optimizing..%x\n",
+				       buffer_copy_offset);
+				memcpy(((uint8_t *)iocb_pkt + buffer_copy_offset),
+				       new_pkt->data,
+				       total_bytes - buffer_copy_offset);
+			}
+
+			((response_t *)new_pkt)->signature = RESPONSE_PROCESSED;
+			wmb();
+		}
+
+		if (pending_bytes != 0 || entry_count_remaining != 0) {
+			ql_log(ql_log_fatal, vha, 0x508b,
+			       "Dropping partial FPIN, underrun bytes = 0x%x, entry cnts 0x%x\n",
+			       total_bytes, entry_count_remaining);
+			qla24xx_free_purex_item(item);
+			return NULL;
+		}
+	} while (entry_count_remaining > 0);
+
+	if (byte_order)
+		host_to_fcp_swap((uint8_t *)&item->iocb, total_bytes);
+
+	return item;
+}
+
 int
 qla2x00_is_a_vp_did(scsi_qla_host_t *vha, uint32_t rscn_entry)
 {
@@ -958,7 +1087,7 @@ initialize_purex_header:
 	return item;
 }
 
-static void
+void
 qla24xx_queue_purex_item(scsi_qla_host_t *vha, struct purex_item *pkt,
 			 void (*process_item)(struct scsi_qla_host *vha,
 					      struct purex_item *pkt))
@@ -3811,6 +3940,7 @@ void qla24xx_process_response_queue(struct scsi_qla_host *vha,
 	struct qla_hw_data *ha = vha->hw;
 	struct purex_entry_24xx *purex_entry;
 	struct purex_item *pure_item;
+	struct pt_ls4_rx_unsol *p;
 	u16 rsp_in = 0, cur_ring_index;
 	int is_shadow_hba;
 
@@ -3983,7 +4113,19 @@ process_err:
 			qla28xx_sa_update_iocb_entry(vha, rsp->req,
 				(struct sa_update_28xx *)pkt);
 			break;
-
+		case PT_LS4_UNSOL:
+			p = (void *)pkt;
+			if (qla_chk_cont_iocb_avail(vha, rsp, (response_t *)pkt, rsp_in)) {
+				rsp->ring_ptr = (response_t *)pkt;
+				rsp->ring_index = cur_ring_index;
+
+				ql_dbg(ql_dbg_init, vha, 0x2124,
+				       "Defer processing UNSOL LS req opcode %#x...\n",
+				       p->payload[0]);
+				return;
+			}
+			qla2xxx_process_purls_iocb((void **)&pkt, &rsp);
+			break;
 		default:
 			/* Type Not Supported. */
 			ql_dbg(ql_dbg_async, vha, 0x5042,
diff --git a/drivers/scsi/qla2xxx/qla_nvme.c b/drivers/scsi/qla2xxx/qla_nvme.c
index 9941b38eac93..1a31e877e6cb 100644
--- a/drivers/scsi/qla2xxx/qla_nvme.c
+++ b/drivers/scsi/qla2xxx/qla_nvme.c
@@ -12,6 +12,26 @@
 #include <linux/blk-mq.h>
 
 static struct nvme_fc_port_template qla_nvme_fc_transport;
+static int qla_nvme_ls_reject_iocb(struct scsi_qla_host *vha,
+				   struct qla_qpair *qp,
+				   struct qla_nvme_lsrjt_pt_arg *a,
+				   bool is_xchg_terminate);
+
+struct qla_nvme_unsol_ctx {
+	struct list_head elem;
+	struct scsi_qla_host *vha;
+	struct fc_port *fcport;
+	struct srb *sp;
+	struct nvmefc_ls_rsp lsrsp;
+	struct nvmefc_ls_rsp *fd_rsp;
+	struct work_struct lsrsp_work;
+	struct work_struct abort_work;
+	__le32 exchange_address;
+	__le16 nport_handle;
+	__le16 ox_id;
+	int comp_status;
+	spinlock_t cmd_lock;
+};
 
 int qla_nvme_register_remote(struct scsi_qla_host *vha, struct fc_port *fcport)
 {
@@ -216,6 +236,55 @@ static void qla_nvme_sp_ls_done(srb_t *sp, int res)
 	schedule_work(&priv->ls_work);
 }
 
+static void qla_nvme_release_lsrsp_cmd_kref(struct kref *kref)
+{
+	struct srb *sp = container_of(kref, struct srb, cmd_kref);
+	struct qla_nvme_unsol_ctx *uctx = sp->priv;
+	struct nvmefc_ls_rsp *fd_rsp;
+	unsigned long flags;
+
+	if (!uctx) {
+		qla2x00_rel_sp(sp);
+		return;
+	}
+
+	spin_lock_irqsave(&uctx->cmd_lock, flags);
+	uctx->sp = NULL;
+	sp->priv = NULL;
+	spin_unlock_irqrestore(&uctx->cmd_lock, flags);
+
+	fd_rsp = uctx->fd_rsp;
+
+	list_del(&uctx->elem);
+
+	fd_rsp->done(fd_rsp);
+	kfree(uctx);
+	qla2x00_rel_sp(sp);
+}
+
+static void qla_nvme_lsrsp_complete(struct work_struct *work)
+{
+	struct qla_nvme_unsol_ctx *uctx =
+		container_of(work, struct qla_nvme_unsol_ctx, lsrsp_work);
+
+	kref_put(&uctx->sp->cmd_kref, qla_nvme_release_lsrsp_cmd_kref);
+}
+
+static void qla_nvme_sp_lsrsp_done(srb_t *sp, int res)
+{
+	struct qla_nvme_unsol_ctx *uctx = sp->priv;
+
+	if (WARN_ON_ONCE(kref_read(&sp->cmd_kref) == 0))
+		return;
+
+	if (res)
+		res = -EINVAL;
+
+	uctx->comp_status = res;
+	INIT_WORK(&uctx->lsrsp_work, qla_nvme_lsrsp_complete);
+	schedule_work(&uctx->lsrsp_work);
+}
+
 /* it assumed that QPair lock is held. */
 static void qla_nvme_sp_done(srb_t *sp, int res)
 {
@@ -288,6 +357,92 @@ out:
 	kref_put(&sp->cmd_kref, sp->put_fn);
 }
 
+static int qla_nvme_xmt_ls_rsp(struct nvme_fc_local_port *lport,
+			       struct nvme_fc_remote_port *rport,
+			       struct nvmefc_ls_rsp *fd_resp)
+{
+	struct qla_nvme_unsol_ctx *uctx = container_of(fd_resp,
+				struct qla_nvme_unsol_ctx, lsrsp);
+	struct qla_nvme_rport *qla_rport = rport->private;
+	fc_port_t *fcport = qla_rport->fcport;
+	struct scsi_qla_host *vha = uctx->vha;
+	struct qla_hw_data *ha = vha->hw;
+	struct qla_nvme_lsrjt_pt_arg a;
+	struct srb_iocb *nvme;
+	srb_t *sp;
+	int rval = QLA_FUNCTION_FAILED;
+	uint8_t cnt = 0;
+
+	if (!fcport || fcport->deleted)
+		goto out;
+
+	if (!ha->flags.fw_started)
+		goto out;
+
+	/* Alloc SRB structure */
+	sp = qla2x00_get_sp(vha, fcport, GFP_ATOMIC);
+	if (!sp)
+		goto out;
+
+	sp->type = SRB_NVME_LS;
+	sp->name = "nvme_ls";
+	sp->done = qla_nvme_sp_lsrsp_done;
+	sp->put_fn = qla_nvme_release_lsrsp_cmd_kref;
+	sp->priv = (void *)uctx;
+	sp->unsol_rsp = 1;
+	uctx->sp = sp;
+	spin_lock_init(&uctx->cmd_lock);
+	nvme = &sp->u.iocb_cmd;
+	uctx->fd_rsp = fd_resp;
+	nvme->u.nvme.desc = fd_resp;
+	nvme->u.nvme.dir = 0;
+	nvme->u.nvme.dl = 0;
+	nvme->u.nvme.timeout_sec = 0;
+	nvme->u.nvme.cmd_dma = fd_resp->rspdma;
+	nvme->u.nvme.cmd_len = fd_resp->rsplen;
+	nvme->u.nvme.rsp_len = 0;
+	nvme->u.nvme.rsp_dma = 0;
+	nvme->u.nvme.exchange_address = uctx->exchange_address;
+	nvme->u.nvme.nport_handle = uctx->nport_handle;
+	nvme->u.nvme.ox_id = uctx->ox_id;
+	dma_sync_single_for_device(&ha->pdev->dev, nvme->u.nvme.cmd_dma,
+				   le32_to_cpu(fd_resp->rsplen), DMA_TO_DEVICE);
+
+	ql_dbg(ql_dbg_unsol, vha, 0x2122,
+	       "Unsol lsreq portid=%06x %8phC exchange_address 0x%x ox_id 0x%x hdl 0x%x\n",
+	       fcport->d_id.b24, fcport->port_name, uctx->exchange_address,
+	       uctx->ox_id, uctx->nport_handle);
+retry:
+	rval = qla2x00_start_sp(sp);
+	switch (rval) {
+	case QLA_SUCCESS:
+		break;
+	case EAGAIN:
+		msleep(PURLS_MSLEEP_INTERVAL);
+		cnt++;
+		if (cnt < PURLS_RETRY_COUNT)
+			goto retry;
+
+		fallthrough;
+	default:
+		ql_dbg(ql_log_warn, vha, 0x2123,
+		       "Failed to xmit Unsol ls response = %d\n", rval);
+		rval = -EIO;
+		qla2x00_rel_sp(sp);
+		goto out;
+	}
+
+	return 0;
+out:
+	memset((void *)&a, 0, sizeof(a));
+	a.vp_idx = vha->vp_idx;
+	a.nport_handle = uctx->nport_handle;
+	a.xchg_address = uctx->exchange_address;
+	qla_nvme_ls_reject_iocb(vha, ha->base_qpair, &a, true);
+	kfree(uctx);
+	return rval;
+}
+
 static void qla_nvme_ls_abort(struct nvme_fc_local_port *lport,
     struct nvme_fc_remote_port *rport, struct nvmefc_ls_req *fd)
 {
@@ -355,7 +510,7 @@ static int qla_nvme_ls_req(struct nvme_fc_local_port *lport,
 	nvme->u.nvme.timeout_sec = fd->timeout;
 	nvme->u.nvme.cmd_dma = fd->rqstdma;
 	dma_sync_single_for_device(&ha->pdev->dev, nvme->u.nvme.cmd_dma,
-	    fd->rqstlen, DMA_TO_DEVICE);
+	    le32_to_cpu(fd->rqstlen), DMA_TO_DEVICE);
 
 	rval = qla2x00_start_sp(sp);
 	if (rval != QLA_SUCCESS) {
@@ -720,6 +875,7 @@ static struct nvme_fc_port_template qla_nvme_fc_transport = {
 	.ls_abort	= qla_nvme_ls_abort,
 	.fcp_io		= qla_nvme_post_cmd,
 	.fcp_abort	= qla_nvme_fcp_abort,
+	.xmt_ls_rsp	= qla_nvme_xmt_ls_rsp,
 	.map_queues	= qla_nvme_map_queues,
 	.max_hw_queues  = DEF_NVME_HW_QUEUES,
 	.max_sgl_segments = 1024,
@@ -924,3 +1080,246 @@ inline void qla_wait_nvme_release_cmd_kref(srb_t *orig_sp)
 		return;
 	kref_put(&orig_sp->cmd_kref, orig_sp->put_fn);
 }
+
+static void qla_nvme_fc_format_rjt(void *buf, u8 ls_cmd, u8 reason,
+				   u8 explanation, u8 vendor)
+{
+	struct fcnvme_ls_rjt *rjt = buf;
+
+	rjt->w0.ls_cmd = FCNVME_LSDESC_RQST;
+	rjt->desc_list_len = fcnvme_lsdesc_len(sizeof(struct fcnvme_ls_rjt));
+	rjt->rqst.desc_tag = cpu_to_be32(FCNVME_LSDESC_RQST);
+	rjt->rqst.desc_len =
+		fcnvme_lsdesc_len(sizeof(struct fcnvme_lsdesc_rqst));
+	rjt->rqst.w0.ls_cmd = ls_cmd;
+	rjt->rjt.desc_tag = cpu_to_be32(FCNVME_LSDESC_RJT);
+	rjt->rjt.desc_len = fcnvme_lsdesc_len(sizeof(struct fcnvme_lsdesc_rjt));
+	rjt->rjt.reason_code = reason;
+	rjt->rjt.reason_explanation = explanation;
+	rjt->rjt.vendor = vendor;
+}
+
+static void qla_nvme_lsrjt_pt_iocb(struct scsi_qla_host *vha,
+				   struct pt_ls4_request *lsrjt_iocb,
+				   struct qla_nvme_lsrjt_pt_arg *a)
+{
+	lsrjt_iocb->entry_type = PT_LS4_REQUEST;
+	lsrjt_iocb->entry_count = 1;
+	lsrjt_iocb->sys_define = 0;
+	lsrjt_iocb->entry_status = 0;
+	lsrjt_iocb->handle = QLA_SKIP_HANDLE;
+	lsrjt_iocb->nport_handle = a->nport_handle;
+	lsrjt_iocb->exchange_address = a->xchg_address;
+	lsrjt_iocb->vp_index = a->vp_idx;
+
+	lsrjt_iocb->control_flags = cpu_to_le16(a->control_flags);
+
+	put_unaligned_le64(a->tx_addr, &lsrjt_iocb->dsd[0].address);
+	lsrjt_iocb->dsd[0].length = cpu_to_le32(a->tx_byte_count);
+	lsrjt_iocb->tx_dseg_count = cpu_to_le16(1);
+	lsrjt_iocb->tx_byte_count = cpu_to_le32(a->tx_byte_count);
+
+	put_unaligned_le64(a->rx_addr, &lsrjt_iocb->dsd[1].address);
+	lsrjt_iocb->dsd[1].length = 0;
+	lsrjt_iocb->rx_dseg_count = 0;
+	lsrjt_iocb->rx_byte_count = 0;
+}
+
+static int
+qla_nvme_ls_reject_iocb(struct scsi_qla_host *vha, struct qla_qpair *qp,
+			struct qla_nvme_lsrjt_pt_arg *a, bool is_xchg_terminate)
+{
+	struct pt_ls4_request *lsrjt_iocb;
+
+	lsrjt_iocb = __qla2x00_alloc_iocbs(qp, NULL);
+	if (!lsrjt_iocb) {
+		ql_log(ql_log_warn, vha, 0x210e,
+		       "qla2x00_alloc_iocbs failed.\n");
+		return QLA_FUNCTION_FAILED;
+	}
+
+	if (!is_xchg_terminate) {
+		qla_nvme_fc_format_rjt((void *)vha->hw->lsrjt.c, a->opcode,
+				       a->reason, a->explanation, 0);
+
+		a->tx_byte_count = sizeof(struct fcnvme_ls_rjt);
+		a->tx_addr = vha->hw->lsrjt.cdma;
+		a->control_flags = CF_LS4_RESPONDER << CF_LS4_SHIFT;
+
+		ql_dbg(ql_dbg_unsol, vha, 0x211f,
+		       "Sending nvme fc ls reject ox_id %04x op %04x\n",
+		       a->ox_id, a->opcode);
+		ql_dump_buffer(ql_dbg_unsol + ql_dbg_verbose, vha, 0x210f,
+			       vha->hw->lsrjt.c, sizeof(*vha->hw->lsrjt.c));
+	} else {
+		a->tx_byte_count = 0;
+		a->control_flags = CF_LS4_RESPONDER_TERM << CF_LS4_SHIFT;
+		ql_dbg(ql_dbg_unsol, vha, 0x2110,
+		       "Terminate nvme ls xchg 0x%x\n", a->xchg_address);
+	}
+
+	qla_nvme_lsrjt_pt_iocb(vha, lsrjt_iocb, a);
+	/* flush iocb to mem before notifying hw doorbell */
+	wmb();
+	qla2x00_start_iocbs(vha, qp->req);
+	return 0;
+}
+
+/*
+ * qla2xxx_process_purls_pkt() - Pass-up Unsolicited
+ * Received FC-NVMe Link Service pkt to nvme_fc_rcv_ls_req().
+ * LLDD need to provide memory for response buffer, which
+ * will be used to reference the exchange corresponding
+ * to the LS when issuing an ls response. LLDD will have to free
+ * response buffer in lport->ops->xmt_ls_rsp().
+ *
+ * @vha: SCSI qla host
+ * @item: ptr to purex_item
+ */
+static void
+qla2xxx_process_purls_pkt(struct scsi_qla_host *vha, struct purex_item *item)
+{
+	struct qla_nvme_unsol_ctx *uctx = item->purls_context;
+	fc_port_t *fcport = uctx->fcport;
+	struct qla_nvme_lsrjt_pt_arg a;
+	int ret;
+
+	ret = nvme_fc_rcv_ls_req(fcport->nvme_remote_port, &uctx->lsrsp,
+				 &item->iocb, item->size);
+	if (ret) {
+		ql_dbg(ql_dbg_unsol, vha, 0x2125, "NVMe tranport ls_req failed\n");
+		memset((void *)&a, 0, sizeof(a));
+		a.vp_idx = vha->vp_idx;
+		a.nport_handle = uctx->nport_handle;
+		a.xchg_address = uctx->exchange_address;
+		qla_nvme_ls_reject_iocb(vha, vha->hw->base_qpair, &a, true);
+		list_del(&uctx->elem);
+		kfree(uctx);
+	}
+}
+
+static scsi_qla_host_t *
+qla2xxx_get_vha_from_vp_idx(struct qla_hw_data *ha, uint16_t vp_index)
+{
+	scsi_qla_host_t *base_vha, *vha, *tvp;
+	unsigned long flags;
+
+	base_vha = pci_get_drvdata(ha->pdev);
+
+	if (!vp_index && !ha->num_vhosts)
+		return base_vha;
+
+	spin_lock_irqsave(&ha->vport_slock, flags);
+	list_for_each_entry_safe(vha, tvp, &ha->vp_list, list) {
+		if (vha->vp_idx == vp_index) {
+			spin_unlock_irqrestore(&ha->vport_slock, flags);
+			return vha;
+		}
+	}
+	spin_unlock_irqrestore(&ha->vport_slock, flags);
+
+	return NULL;
+}
+
+void qla2xxx_process_purls_iocb(void **pkt, struct rsp_que **rsp)
+{
+	struct nvme_fc_remote_port *rport;
+	struct qla_nvme_rport *qla_rport;
+	struct qla_nvme_lsrjt_pt_arg a;
+	struct pt_ls4_rx_unsol *p = *pkt;
+	struct qla_nvme_unsol_ctx *uctx;
+	struct rsp_que *rsp_q = *rsp;
+	struct qla_hw_data *ha;
+	scsi_qla_host_t	*vha;
+	fc_port_t *fcport = NULL;
+	struct purex_item *item;
+	port_id_t d_id = {0};
+	port_id_t id = {0};
+	u8 *opcode;
+	bool xmt_reject = false;
+
+	ha = rsp_q->hw;
+
+	vha = qla2xxx_get_vha_from_vp_idx(ha, p->vp_index);
+	if (!vha) {
+		ql_log(ql_log_warn, NULL, 0x2110, "Invalid vp index %d\n", p->vp_index);
+		WARN_ON_ONCE(1);
+		return;
+	}
+
+	memset((void *)&a, 0, sizeof(a));
+	opcode = (u8 *)&p->payload[0];
+	a.opcode = opcode[3];
+	a.vp_idx = p->vp_index;
+	a.nport_handle = p->nport_handle;
+	a.ox_id = p->ox_id;
+	a.xchg_address = p->exchange_address;
+
+	id.b.domain = p->s_id.domain;
+	id.b.area   = p->s_id.area;
+	id.b.al_pa  = p->s_id.al_pa;
+	d_id.b.domain = p->d_id[2];
+	d_id.b.area   = p->d_id[1];
+	d_id.b.al_pa  = p->d_id[0];
+
+	fcport = qla2x00_find_fcport_by_nportid(vha, &id, 0);
+	if (!fcport) {
+		ql_dbg(ql_dbg_unsol, vha, 0x211e,
+		       "Failed to find sid=%06x did=%06x\n",
+		       id.b24, d_id.b24);
+		a.reason = FCNVME_RJT_RC_INV_ASSOC;
+		a.explanation = FCNVME_RJT_EXP_NONE;
+		xmt_reject = true;
+		goto out;
+	}
+	rport = fcport->nvme_remote_port;
+	qla_rport = rport->private;
+
+	item = qla27xx_copy_multiple_pkt(vha, pkt, rsp, true, false);
+	if (!item) {
+		a.reason = FCNVME_RJT_RC_LOGIC;
+		a.explanation = FCNVME_RJT_EXP_NONE;
+		xmt_reject = true;
+		goto out;
+	}
+
+	uctx = kzalloc(sizeof(*uctx), GFP_ATOMIC);
+	if (!uctx) {
+		ql_log(ql_log_info, vha, 0x2126, "Failed allocate memory\n");
+		a.reason = FCNVME_RJT_RC_LOGIC;
+		a.explanation = FCNVME_RJT_EXP_NONE;
+		xmt_reject = true;
+		kfree(item);
+		goto out;
+	}
+
+	uctx->vha = vha;
+	uctx->fcport = fcport;
+	uctx->exchange_address = p->exchange_address;
+	uctx->nport_handle = p->nport_handle;
+	uctx->ox_id = p->ox_id;
+	qla_rport->uctx = uctx;
+	INIT_LIST_HEAD(&uctx->elem);
+	list_add_tail(&uctx->elem, &fcport->unsol_ctx_head);
+	item->purls_context = (void *)uctx;
+
+	ql_dbg(ql_dbg_unsol, vha, 0x2121,
+	       "PURLS OP[%01x] size %d xchg addr 0x%x portid %06x\n",
+	       item->iocb.iocb[3], item->size, uctx->exchange_address,
+	       fcport->d_id.b24);
+	/* +48    0  1  2  3  4  5  6  7  8  9  A  B  C  D  E  F
+	 * ----- -----------------------------------------------
+	 * 0000: 00 00 00 05 28 00 00 00 07 00 00 00 08 00 00 00
+	 * 0010: ab ec 0f cc 00 00 8d 7d 05 00 00 00 10 00 00 00
+	 * 0020: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
+	 */
+	ql_dump_buffer(ql_dbg_unsol + ql_dbg_verbose, vha, 0x2120,
+		       &item->iocb, item->size);
+
+	qla24xx_queue_purex_item(vha, item, qla2xxx_process_purls_pkt);
+out:
+	if (xmt_reject) {
+		qla_nvme_ls_reject_iocb(vha, (*rsp)->qpair, &a, false);
+		__qla_consume_iocb(vha, pkt, rsp);
+	}
+}
diff --git a/drivers/scsi/qla2xxx/qla_nvme.h b/drivers/scsi/qla2xxx/qla_nvme.h
index d299478371b2..a253ac55171b 100644
--- a/drivers/scsi/qla2xxx/qla_nvme.h
+++ b/drivers/scsi/qla2xxx/qla_nvme.h
@@ -21,6 +21,7 @@
 #define Q2T_NVME_NUM_TAGS 2048
 #define QLA_MAX_FC_SEGMENTS 64
 
+struct qla_nvme_unsol_ctx;
 struct scsi_qla_host;
 struct qla_hw_data;
 struct req_que;
@@ -37,6 +38,7 @@ struct nvme_private {
 
 struct qla_nvme_rport {
 	struct fc_port *fcport;
+	struct qla_nvme_unsol_ctx *uctx;
 };
 
 #define COMMAND_NVME    0x88            /* Command Type FC-NVMe IOCB */
@@ -75,6 +77,9 @@ struct cmd_nvme {
 	struct dsd64 nvme_dsd;
 };
 
+#define PURLS_MSLEEP_INTERVAL	1
+#define PURLS_RETRY_COUNT	5
+
 #define PT_LS4_REQUEST 0x89	/* Link Service pass-through IOCB (request) */
 struct pt_ls4_request {
 	uint8_t entry_type;
@@ -118,21 +123,19 @@ struct pt_ls4_rx_unsol {
 	__le32	exchange_address;
 	uint8_t d_id[3];
 	uint8_t r_ctl;
-	be_id_t s_id;
+	le_id_t s_id;
 	uint8_t cs_ctl;
 	uint8_t f_ctl[3];
 	uint8_t type;
 	__le16	seq_cnt;
 	uint8_t df_ctl;
 	uint8_t seq_id;
-	__le16	rx_id;
-	__le16	ox_id;
-	__le32	param;
-	__le32	desc0;
+	__le16 rx_id;
+	__le16 ox_id;
+	__le32  desc0;
 #define PT_LS4_PAYLOAD_OFFSET 0x2c
 #define PT_LS4_FIRST_PACKET_LEN 20
-	__le32	desc_len;
-	__le32	payload[3];
+	__le32 payload[5];
 };
 
 /*
diff --git a/drivers/scsi/qla2xxx/qla_os.c b/drivers/scsi/qla2xxx/qla_os.c
index f31d0ead8919..aa492489a4b8 100644
--- a/drivers/scsi/qla2xxx/qla_os.c
+++ b/drivers/scsi/qla2xxx/qla_os.c
@@ -4457,8 +4457,9 @@ qla2x00_mem_alloc(struct qla_hw_data *ha, uint16_t req_len, uint16_t rsp_len,
 
 	ha->elsrej.size = sizeof(struct fc_els_ls_rjt) + 16;
 	ha->elsrej.c = dma_alloc_coherent(&ha->pdev->dev,
-	    ha->elsrej.size, &ha->elsrej.cdma, GFP_KERNEL);
-
+					  ha->elsrej.size,
+					  &ha->elsrej.cdma,
+					  GFP_KERNEL);
 	if (!ha->elsrej.c) {
 		ql_dbg_pci(ql_dbg_init, ha->pdev, 0xffff,
 		    "Alloc failed for els reject cmd.\n");
@@ -4467,8 +4468,21 @@ qla2x00_mem_alloc(struct qla_hw_data *ha, uint16_t req_len, uint16_t rsp_len,
 	ha->elsrej.c->er_cmd = ELS_LS_RJT;
 	ha->elsrej.c->er_reason = ELS_RJT_LOGIC;
 	ha->elsrej.c->er_explan = ELS_EXPL_UNAB_DATA;
+
+	ha->lsrjt.size = sizeof(struct fcnvme_ls_rjt);
+	ha->lsrjt.c = dma_alloc_coherent(&ha->pdev->dev, ha->lsrjt.size,
+			&ha->lsrjt.cdma, GFP_KERNEL);
+	if (!ha->lsrjt.c) {
+		ql_dbg_pci(ql_dbg_init, ha->pdev, 0xffff,
+			   "Alloc failed for nvme fc reject cmd.\n");
+		goto fail_lsrjt;
+	}
+
 	return 0;
 
+fail_lsrjt:
+	dma_free_coherent(&ha->pdev->dev, ha->elsrej.size,
+			  ha->elsrej.c, ha->elsrej.cdma);
 fail_elsrej:
 	dma_pool_destroy(ha->purex_dma_pool);
 fail_flt:
@@ -4998,6 +5012,12 @@ qla2x00_mem_free(struct qla_hw_data *ha)
 		ha->elsrej.c = NULL;
 	}
 
+	if (ha->lsrjt.c) {
+		dma_free_coherent(&ha->pdev->dev, ha->lsrjt.size, ha->lsrjt.c,
+				  ha->lsrjt.cdma);
+		ha->lsrjt.c = NULL;
+	}
+
 	ha->init_cb = NULL;
 	ha->init_cb_dma = 0;
 
diff --git a/include/linux/nvme-fc-driver.h b/include/linux/nvme-fc-driver.h
index 4109f1bd6128..f6ef8cf5d774 100644
--- a/include/linux/nvme-fc-driver.h
+++ b/include/linux/nvme-fc-driver.h
@@ -53,10 +53,10 @@
 struct nvmefc_ls_req {
 	void			*rqstaddr;
 	dma_addr_t		rqstdma;
-	u32			rqstlen;
+	__le32			rqstlen;
 	void			*rspaddr;
 	dma_addr_t		rspdma;
-	u32			rsplen;
+	__le32			rsplen;
 	u32			timeout;
 
 	void			*private;
@@ -120,7 +120,7 @@ struct nvmefc_ls_req {
 struct nvmefc_ls_rsp {
 	void		*rspbuf;
 	dma_addr_t	rspdma;
-	u16		rsplen;
+	__le32		rsplen;
 
 	void (*done)(struct nvmefc_ls_rsp *rsp);
 	void		*nvme_fc_private;	/* LLDD is not to access !! */
-- 
cgit v1.2.3


From 08b8a0440eeec83f8330349f829908858fd52d31 Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@kernel.org>
Date: Mon, 14 Mar 2022 15:47:18 -0400
Subject: libceph: add spinlock around osd->o_requests
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

In a later patch, we're going to need to search for a request in
the rbtree, but taking the o_mutex is inconvenient as we already
hold the con mutex at the point where we need it.

Add a new spinlock that we take when inserting and erasing entries from
the o_requests tree. Search of the rbtree can be done with either the
mutex or the spinlock, but insertion and removal requires both.

Signed-off-by: Jeff Layton <jlayton@kernel.org>
Reviewed-by: Xiubo Li <xiubli@redhat.com>
Reviewed-and-tested-by: Luís Henriques <lhenriques@suse.de>
Reviewed-by: Milind Changire <mchangir@redhat.com>
Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
---
 include/linux/ceph/osd_client.h | 8 +++++++-
 net/ceph/osd_client.c           | 5 +++++
 2 files changed, 12 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h
index fb6be72104df..92addef18738 100644
--- a/include/linux/ceph/osd_client.h
+++ b/include/linux/ceph/osd_client.h
@@ -29,7 +29,12 @@ typedef void (*ceph_osdc_callback_t)(struct ceph_osd_request *);
 
 #define CEPH_HOMELESS_OSD	-1
 
-/* a given osd we're communicating with */
+/*
+ * A given osd we're communicating with.
+ *
+ * Note that the o_requests tree can be searched while holding the "lock" mutex
+ * or the "o_requests_lock" spinlock. Insertion or removal requires both!
+ */
 struct ceph_osd {
 	refcount_t o_ref;
 	struct ceph_osd_client *o_osdc;
@@ -37,6 +42,7 @@ struct ceph_osd {
 	int o_incarnation;
 	struct rb_node o_node;
 	struct ceph_connection o_con;
+	spinlock_t o_requests_lock;
 	struct rb_root o_requests;
 	struct rb_root o_linger_requests;
 	struct rb_root o_backoff_mappings;
diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c
index 658a6f2320cf..6d7a430bfed8 100644
--- a/net/ceph/osd_client.c
+++ b/net/ceph/osd_client.c
@@ -1177,6 +1177,7 @@ static void osd_init(struct ceph_osd *osd)
 {
 	refcount_set(&osd->o_ref, 1);
 	RB_CLEAR_NODE(&osd->o_node);
+	spin_lock_init(&osd->o_requests_lock);
 	osd->o_requests = RB_ROOT;
 	osd->o_linger_requests = RB_ROOT;
 	osd->o_backoff_mappings = RB_ROOT;
@@ -1406,7 +1407,9 @@ static void link_request(struct ceph_osd *osd, struct ceph_osd_request *req)
 		atomic_inc(&osd->o_osdc->num_homeless);
 
 	get_osd(osd);
+	spin_lock(&osd->o_requests_lock);
 	insert_request(&osd->o_requests, req);
+	spin_unlock(&osd->o_requests_lock);
 	req->r_osd = osd;
 }
 
@@ -1418,7 +1421,9 @@ static void unlink_request(struct ceph_osd *osd, struct ceph_osd_request *req)
 	     req, req->r_tid);
 
 	req->r_osd = NULL;
+	spin_lock(&osd->o_requests_lock);
 	erase_request(&osd->o_requests, req);
+	spin_unlock(&osd->o_requests_lock);
 	put_osd(osd);
 
 	if (!osd_homeless(osd))
-- 
cgit v1.2.3


From a679e50f728648f7b2f3b349e082448abd388038 Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@kernel.org>
Date: Wed, 16 Mar 2022 15:23:00 -0400
Subject: libceph: define struct ceph_sparse_extent and add some helpers
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

When the OSD sends back a sparse read reply, it contains an array of
these structures. Define the structure and add a couple of helpers for
dealing with them.

Also add a place in struct ceph_osd_req_op to store the extent buffer,
and code to free it if it's populated when the req is torn down.

Signed-off-by: Jeff Layton <jlayton@kernel.org>
Reviewed-by: Xiubo Li <xiubli@redhat.com>
Reviewed-and-tested-by: Luís Henriques <lhenriques@suse.de>
Reviewed-by: Milind Changire <mchangir@redhat.com>
Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
---
 include/linux/ceph/osd_client.h | 43 ++++++++++++++++++++++++++++++++++++++++-
 net/ceph/osd_client.c           | 13 +++++++++++++
 2 files changed, 55 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h
index 92addef18738..05da1e755b7b 100644
--- a/include/linux/ceph/osd_client.h
+++ b/include/linux/ceph/osd_client.h
@@ -29,6 +29,17 @@ typedef void (*ceph_osdc_callback_t)(struct ceph_osd_request *);
 
 #define CEPH_HOMELESS_OSD	-1
 
+/*
+ * A single extent in a SPARSE_READ reply.
+ *
+ * Note that these come from the OSD as little-endian values. On BE arches,
+ * we convert them in-place after receipt.
+ */
+struct ceph_sparse_extent {
+	u64	off;
+	u64	len;
+} __packed;
+
 /*
  * A given osd we're communicating with.
  *
@@ -104,6 +115,8 @@ struct ceph_osd_req_op {
 			u64 offset, length;
 			u64 truncate_size;
 			u32 truncate_seq;
+			int sparse_ext_cnt;
+			struct ceph_sparse_extent *sparse_ext;
 			struct ceph_osd_data osd_data;
 		} extent;
 		struct {
@@ -510,6 +523,20 @@ extern struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *,
 				      u32 truncate_seq, u64 truncate_size,
 				      bool use_mempool);
 
+int __ceph_alloc_sparse_ext_map(struct ceph_osd_req_op *op, int cnt);
+
+/*
+ * How big an extent array should we preallocate for a sparse read? This is
+ * just a starting value.  If we get more than this back from the OSD, the
+ * receiver will reallocate.
+ */
+#define CEPH_SPARSE_EXT_ARRAY_INITIAL  16
+
+static inline int ceph_alloc_sparse_ext_map(struct ceph_osd_req_op *op)
+{
+	return __ceph_alloc_sparse_ext_map(op, CEPH_SPARSE_EXT_ARRAY_INITIAL);
+}
+
 extern void ceph_osdc_get_request(struct ceph_osd_request *req);
 extern void ceph_osdc_put_request(struct ceph_osd_request *req);
 
@@ -564,5 +591,19 @@ int ceph_osdc_list_watchers(struct ceph_osd_client *osdc,
 			    struct ceph_object_locator *oloc,
 			    struct ceph_watch_item **watchers,
 			    u32 *num_watchers);
-#endif
 
+/* Find offset into the buffer of the end of the extent map */
+static inline u64 ceph_sparse_ext_map_end(struct ceph_osd_req_op *op)
+{
+	struct ceph_sparse_extent *ext;
+
+	/* No extents? No data */
+	if (op->extent.sparse_ext_cnt == 0)
+		return 0;
+
+	ext = &op->extent.sparse_ext[op->extent.sparse_ext_cnt - 1];
+
+	return ext->off + ext->len - op->extent.offset;
+}
+
+#endif
diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c
index 6d7a430bfed8..3e03ae68722c 100644
--- a/net/ceph/osd_client.c
+++ b/net/ceph/osd_client.c
@@ -378,6 +378,7 @@ static void osd_req_op_data_release(struct ceph_osd_request *osd_req,
 	case CEPH_OSD_OP_READ:
 	case CEPH_OSD_OP_WRITE:
 	case CEPH_OSD_OP_WRITEFULL:
+		kfree(op->extent.sparse_ext);
 		ceph_osd_data_release(&op->extent.osd_data);
 		break;
 	case CEPH_OSD_OP_CALL:
@@ -1120,6 +1121,18 @@ fail:
 }
 EXPORT_SYMBOL(ceph_osdc_new_request);
 
+int __ceph_alloc_sparse_ext_map(struct ceph_osd_req_op *op, int cnt)
+{
+	op->extent.sparse_ext_cnt = cnt;
+	op->extent.sparse_ext = kmalloc_array(cnt,
+					      sizeof(*op->extent.sparse_ext),
+					      GFP_NOFS);
+	if (!op->extent.sparse_ext)
+		return -ENOMEM;
+	return 0;
+}
+EXPORT_SYMBOL(__ceph_alloc_sparse_ext_map);
+
 /*
  * We keep osd requests in an rbtree, sorted by ->r_tid.
  */
-- 
cgit v1.2.3


From ec3bc567eac12c557a2b99bd0b34b5dff12cab23 Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@kernel.org>
Date: Tue, 25 Jan 2022 08:26:31 -0500
Subject: libceph: new sparse_read op, support sparse reads on msgr2 crc
 codepath
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Add support for a new sparse_read ceph_connection operation. The idea is
that the client driver can define this operation use it to do special
handling for incoming reads.

The alloc_msg routine will look at the request and determine whether the
reply is expected to be sparse. If it is, then we'll dispatch to a
different set of state machine states that will repeatedly call the
driver's sparse_read op to get length and placement info for reading the
extent map, and the extents themselves.

This necessitates adding some new field to some other structs:

- The msg gets a new bool to track whether it's a sparse_read request.

- A new field is added to the cursor to track the amount remaining in the
current extent. This is used to cap the read from the socket into the
msg_data

- Handing a revoke with all of this is particularly difficult, so I've
added a new data_len_remain field to the v2 connection info, and then
use that to skip that much on a revoke. We may want to expand the use of
that to the normal read path as well, just for consistency's sake.

Signed-off-by: Jeff Layton <jlayton@kernel.org>
Reviewed-by: Xiubo Li <xiubli@redhat.com>
Reviewed-and-tested-by: Luís Henriques <lhenriques@suse.de>
Reviewed-by: Milind Changire <mchangir@redhat.com>
Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
---
 include/linux/ceph/messenger.h |  28 +++++++
 net/ceph/messenger.c           |   1 +
 net/ceph/messenger_v2.c        | 167 ++++++++++++++++++++++++++++++++++++++---
 3 files changed, 187 insertions(+), 9 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/ceph/messenger.h b/include/linux/ceph/messenger.h
index 99c1726be6ee..8a6938fa324e 100644
--- a/include/linux/ceph/messenger.h
+++ b/include/linux/ceph/messenger.h
@@ -17,6 +17,7 @@
 
 struct ceph_msg;
 struct ceph_connection;
+struct ceph_msg_data_cursor;
 
 /*
  * Ceph defines these callbacks for handling connection events.
@@ -70,6 +71,30 @@ struct ceph_connection_operations {
 				      int used_proto, int result,
 				      const int *allowed_protos, int proto_cnt,
 				      const int *allowed_modes, int mode_cnt);
+
+	/**
+	 * sparse_read: read sparse data
+	 * @con: connection we're reading from
+	 * @cursor: data cursor for reading extents
+	 * @buf: optional buffer to read into
+	 *
+	 * This should be called more than once, each time setting up to
+	 * receive an extent into the current cursor position, and zeroing
+	 * the holes between them.
+	 *
+	 * Returns amount of data to be read (in bytes), 0 if reading is
+	 * complete, or -errno if there was an error.
+	 *
+	 * If @buf is set on a >0 return, then the data should be read into
+	 * the provided buffer. Otherwise, it should be read into the cursor.
+	 *
+	 * The sparse read operation is expected to initialize the cursor
+	 * with a length covering up to the end of the last extent.
+	 */
+	int (*sparse_read)(struct ceph_connection *con,
+			   struct ceph_msg_data_cursor *cursor,
+			   char **buf);
+
 };
 
 /* use format string %s%lld */
@@ -207,6 +232,7 @@ struct ceph_msg_data_cursor {
 
 	struct ceph_msg_data	*data;		/* current data item */
 	size_t			resid;		/* bytes not yet consumed */
+	int			sr_resid;	/* residual sparse_read len */
 	bool			need_crc;	/* crc update needed */
 	union {
 #ifdef CONFIG_BLOCK
@@ -251,6 +277,7 @@ struct ceph_msg {
 	struct kref kref;
 	bool more_to_follow;
 	bool needs_out_seq;
+	bool sparse_read;
 	int front_alloc_len;
 
 	struct ceph_msgpool *pool;
@@ -395,6 +422,7 @@ struct ceph_connection_v2_info {
 
 	void *conn_bufs[16];
 	int conn_buf_cnt;
+	int data_len_remain;
 
 	struct kvec in_sign_kvecs[8];
 	struct kvec out_sign_kvecs[8];
diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c
index 5eb4898cccd4..2eb10d7518e8 100644
--- a/net/ceph/messenger.c
+++ b/net/ceph/messenger.c
@@ -1013,6 +1013,7 @@ void ceph_msg_data_cursor_init(struct ceph_msg_data_cursor *cursor,
 
 	cursor->total_resid = length;
 	cursor->data = msg->data;
+	cursor->sr_resid = 0;
 
 	__ceph_msg_data_cursor_init(cursor);
 }
diff --git a/net/ceph/messenger_v2.c b/net/ceph/messenger_v2.c
index 1df1d29dee92..17c9a858bfbd 100644
--- a/net/ceph/messenger_v2.c
+++ b/net/ceph/messenger_v2.c
@@ -52,14 +52,16 @@
 #define FRAME_LATE_STATUS_COMPLETE	0xe
 #define FRAME_LATE_STATUS_ABORTED_MASK	0xf
 
-#define IN_S_HANDLE_PREAMBLE		1
-#define IN_S_HANDLE_CONTROL		2
-#define IN_S_HANDLE_CONTROL_REMAINDER	3
-#define IN_S_PREPARE_READ_DATA		4
-#define IN_S_PREPARE_READ_DATA_CONT	5
-#define IN_S_PREPARE_READ_ENC_PAGE	6
-#define IN_S_HANDLE_EPILOGUE		7
-#define IN_S_FINISH_SKIP		8
+#define IN_S_HANDLE_PREAMBLE			1
+#define IN_S_HANDLE_CONTROL			2
+#define IN_S_HANDLE_CONTROL_REMAINDER		3
+#define IN_S_PREPARE_READ_DATA			4
+#define IN_S_PREPARE_READ_DATA_CONT		5
+#define IN_S_PREPARE_READ_ENC_PAGE		6
+#define IN_S_PREPARE_SPARSE_DATA		7
+#define IN_S_PREPARE_SPARSE_DATA_CONT		8
+#define IN_S_HANDLE_EPILOGUE			9
+#define IN_S_FINISH_SKIP			10
 
 #define OUT_S_QUEUE_DATA		1
 #define OUT_S_QUEUE_DATA_CONT		2
@@ -1825,6 +1827,123 @@ static void prepare_read_data_cont(struct ceph_connection *con)
 	con->v2.in_state = IN_S_HANDLE_EPILOGUE;
 }
 
+static int prepare_sparse_read_cont(struct ceph_connection *con)
+{
+	int ret;
+	struct bio_vec bv;
+	char *buf = NULL;
+	struct ceph_msg_data_cursor *cursor = &con->v2.in_cursor;
+
+	WARN_ON(con->v2.in_state != IN_S_PREPARE_SPARSE_DATA_CONT);
+
+	if (iov_iter_is_bvec(&con->v2.in_iter)) {
+		if (ceph_test_opt(from_msgr(con->msgr), RXBOUNCE)) {
+			con->in_data_crc = crc32c(con->in_data_crc,
+						  page_address(con->bounce_page),
+						  con->v2.in_bvec.bv_len);
+			get_bvec_at(cursor, &bv);
+			memcpy_to_page(bv.bv_page, bv.bv_offset,
+				       page_address(con->bounce_page),
+				       con->v2.in_bvec.bv_len);
+		} else {
+			con->in_data_crc = ceph_crc32c_page(con->in_data_crc,
+							    con->v2.in_bvec.bv_page,
+							    con->v2.in_bvec.bv_offset,
+							    con->v2.in_bvec.bv_len);
+		}
+
+		ceph_msg_data_advance(cursor, con->v2.in_bvec.bv_len);
+		cursor->sr_resid -= con->v2.in_bvec.bv_len;
+		dout("%s: advance by 0x%x sr_resid 0x%x\n", __func__,
+		     con->v2.in_bvec.bv_len, cursor->sr_resid);
+		WARN_ON_ONCE(cursor->sr_resid > cursor->total_resid);
+		if (cursor->sr_resid) {
+			get_bvec_at(cursor, &bv);
+			if (bv.bv_len > cursor->sr_resid)
+				bv.bv_len = cursor->sr_resid;
+			if (ceph_test_opt(from_msgr(con->msgr), RXBOUNCE)) {
+				bv.bv_page = con->bounce_page;
+				bv.bv_offset = 0;
+			}
+			set_in_bvec(con, &bv);
+			con->v2.data_len_remain -= bv.bv_len;
+			return 0;
+		}
+	} else if (iov_iter_is_kvec(&con->v2.in_iter)) {
+		/* On first call, we have no kvec so don't compute crc */
+		if (con->v2.in_kvec_cnt) {
+			WARN_ON_ONCE(con->v2.in_kvec_cnt > 1);
+			con->in_data_crc = crc32c(con->in_data_crc,
+						  con->v2.in_kvecs[0].iov_base,
+						  con->v2.in_kvecs[0].iov_len);
+		}
+	} else {
+		return -EIO;
+	}
+
+	/* get next extent */
+	ret = con->ops->sparse_read(con, cursor, &buf);
+	if (ret <= 0) {
+		if (ret < 0)
+			return ret;
+
+		reset_in_kvecs(con);
+		add_in_kvec(con, con->v2.in_buf, CEPH_EPILOGUE_PLAIN_LEN);
+		con->v2.in_state = IN_S_HANDLE_EPILOGUE;
+		return 0;
+	}
+
+	if (buf) {
+		/* receive into buffer */
+		reset_in_kvecs(con);
+		add_in_kvec(con, buf, ret);
+		con->v2.data_len_remain -= ret;
+		return 0;
+	}
+
+	if (ret > cursor->total_resid) {
+		pr_warn("%s: ret 0x%x total_resid 0x%zx resid 0x%zx\n",
+			__func__, ret, cursor->total_resid, cursor->resid);
+		return -EIO;
+	}
+	get_bvec_at(cursor, &bv);
+	if (bv.bv_len > cursor->sr_resid)
+		bv.bv_len = cursor->sr_resid;
+	if (ceph_test_opt(from_msgr(con->msgr), RXBOUNCE)) {
+		if (unlikely(!con->bounce_page)) {
+			con->bounce_page = alloc_page(GFP_NOIO);
+			if (!con->bounce_page) {
+				pr_err("failed to allocate bounce page\n");
+				return -ENOMEM;
+			}
+		}
+
+		bv.bv_page = con->bounce_page;
+		bv.bv_offset = 0;
+	}
+	set_in_bvec(con, &bv);
+	con->v2.data_len_remain -= ret;
+	return ret;
+}
+
+static int prepare_sparse_read_data(struct ceph_connection *con)
+{
+	struct ceph_msg *msg = con->in_msg;
+
+	dout("%s: starting sparse read\n", __func__);
+
+	if (WARN_ON_ONCE(!con->ops->sparse_read))
+		return -EOPNOTSUPP;
+
+	if (!con_secure(con))
+		con->in_data_crc = -1;
+
+	reset_in_kvecs(con);
+	con->v2.in_state = IN_S_PREPARE_SPARSE_DATA_CONT;
+	con->v2.data_len_remain = data_len(msg);
+	return prepare_sparse_read_cont(con);
+}
+
 static int prepare_read_tail_plain(struct ceph_connection *con)
 {
 	struct ceph_msg *msg = con->in_msg;
@@ -1845,7 +1964,10 @@ static int prepare_read_tail_plain(struct ceph_connection *con)
 	}
 
 	if (data_len(msg)) {
-		con->v2.in_state = IN_S_PREPARE_READ_DATA;
+		if (msg->sparse_read)
+			con->v2.in_state = IN_S_PREPARE_SPARSE_DATA;
+		else
+			con->v2.in_state = IN_S_PREPARE_READ_DATA;
 	} else {
 		add_in_kvec(con, con->v2.in_buf, CEPH_EPILOGUE_PLAIN_LEN);
 		con->v2.in_state = IN_S_HANDLE_EPILOGUE;
@@ -2898,6 +3020,12 @@ static int populate_in_iter(struct ceph_connection *con)
 			prepare_read_enc_page(con);
 			ret = 0;
 			break;
+		case IN_S_PREPARE_SPARSE_DATA:
+			ret = prepare_sparse_read_data(con);
+			break;
+		case IN_S_PREPARE_SPARSE_DATA_CONT:
+			ret = prepare_sparse_read_cont(con);
+			break;
 		case IN_S_HANDLE_EPILOGUE:
 			ret = handle_epilogue(con);
 			break;
@@ -3489,6 +3617,23 @@ static void revoke_at_prepare_read_enc_page(struct ceph_connection *con)
 	con->v2.in_state = IN_S_FINISH_SKIP;
 }
 
+static void revoke_at_prepare_sparse_data(struct ceph_connection *con)
+{
+	int resid;  /* current piece of data */
+	int remaining;
+
+	WARN_ON(con_secure(con));
+	WARN_ON(!data_len(con->in_msg));
+	WARN_ON(!iov_iter_is_bvec(&con->v2.in_iter));
+	resid = iov_iter_count(&con->v2.in_iter);
+	dout("%s con %p resid %d\n", __func__, con, resid);
+
+	remaining = CEPH_EPILOGUE_PLAIN_LEN + con->v2.data_len_remain;
+	con->v2.in_iter.count -= resid;
+	set_in_skip(con, resid + remaining);
+	con->v2.in_state = IN_S_FINISH_SKIP;
+}
+
 static void revoke_at_handle_epilogue(struct ceph_connection *con)
 {
 	int resid;
@@ -3505,6 +3650,7 @@ static void revoke_at_handle_epilogue(struct ceph_connection *con)
 void ceph_con_v2_revoke_incoming(struct ceph_connection *con)
 {
 	switch (con->v2.in_state) {
+	case IN_S_PREPARE_SPARSE_DATA:
 	case IN_S_PREPARE_READ_DATA:
 		revoke_at_prepare_read_data(con);
 		break;
@@ -3514,6 +3660,9 @@ void ceph_con_v2_revoke_incoming(struct ceph_connection *con)
 	case IN_S_PREPARE_READ_ENC_PAGE:
 		revoke_at_prepare_read_enc_page(con);
 		break;
+	case IN_S_PREPARE_SPARSE_DATA_CONT:
+		revoke_at_prepare_sparse_data(con);
+		break;
 	case IN_S_HANDLE_EPILOGUE:
 		revoke_at_handle_epilogue(con);
 		break;
-- 
cgit v1.2.3


From d396f89db39a2f259e2125ca43b4c31bb65afcad Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@kernel.org>
Date: Thu, 24 Mar 2022 13:33:06 -0400
Subject: libceph: add sparse read support to msgr1
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Add 2 new fields to ceph_connection_v1_info to track the necessary info
in sparse reads. Skip initializing the cursor for a sparse read.

Break out read_partial_message_section into a wrapper around a new
read_partial_message_chunk function that doesn't zero out the crc first.

Add new helper functions to drive receiving into the destinations
provided by the sparse_read state machine.

Signed-off-by: Jeff Layton <jlayton@kernel.org>
Reviewed-by: Xiubo Li <xiubli@redhat.com>
Reviewed-and-tested-by: Luís Henriques <lhenriques@suse.de>
Reviewed-by: Milind Changire <mchangir@redhat.com>
Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
---
 include/linux/ceph/messenger.h |  4 ++
 net/ceph/messenger_v1.c        | 98 ++++++++++++++++++++++++++++++++++++++----
 2 files changed, 94 insertions(+), 8 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/ceph/messenger.h b/include/linux/ceph/messenger.h
index 8a6938fa324e..9fd7255172ad 100644
--- a/include/linux/ceph/messenger.h
+++ b/include/linux/ceph/messenger.h
@@ -336,6 +336,10 @@ struct ceph_connection_v1_info {
 
 	int in_base_pos;     /* bytes read */
 
+	/* sparse reads */
+	struct kvec in_sr_kvec; /* current location to receive into */
+	u64 in_sr_len;		/* amount of data in this extent */
+
 	/* message in temps */
 	u8 in_tag;           /* protocol control byte */
 	struct ceph_msg_header in_hdr;
diff --git a/net/ceph/messenger_v1.c b/net/ceph/messenger_v1.c
index 3d57bb48a2b4..f9a50d7f0d20 100644
--- a/net/ceph/messenger_v1.c
+++ b/net/ceph/messenger_v1.c
@@ -159,9 +159,9 @@ static size_t sizeof_footer(struct ceph_connection *con)
 
 static void prepare_message_data(struct ceph_msg *msg, u32 data_len)
 {
-	/* Initialize data cursor */
-
-	ceph_msg_data_cursor_init(&msg->cursor, msg, data_len);
+	/* Initialize data cursor if it's not a sparse read */
+	if (!msg->sparse_read)
+		ceph_msg_data_cursor_init(&msg->cursor, msg, data_len);
 }
 
 /*
@@ -960,9 +960,9 @@ static void process_ack(struct ceph_connection *con)
 	prepare_read_tag(con);
 }
 
-static int read_partial_message_section(struct ceph_connection *con,
-					struct kvec *section,
-					unsigned int sec_len, u32 *crc)
+static int read_partial_message_chunk(struct ceph_connection *con,
+				      struct kvec *section,
+				      unsigned int sec_len, u32 *crc)
 {
 	int ret, left;
 
@@ -978,11 +978,91 @@ static int read_partial_message_section(struct ceph_connection *con,
 		section->iov_len += ret;
 	}
 	if (section->iov_len == sec_len)
-		*crc = crc32c(0, section->iov_base, section->iov_len);
+		*crc = crc32c(*crc, section->iov_base, section->iov_len);
 
 	return 1;
 }
 
+static inline int read_partial_message_section(struct ceph_connection *con,
+					       struct kvec *section,
+					       unsigned int sec_len, u32 *crc)
+{
+	*crc = 0;
+	return read_partial_message_chunk(con, section, sec_len, crc);
+}
+
+static int read_sparse_msg_extent(struct ceph_connection *con, u32 *crc)
+{
+	struct ceph_msg_data_cursor *cursor = &con->in_msg->cursor;
+	bool do_bounce = ceph_test_opt(from_msgr(con->msgr), RXBOUNCE);
+
+	if (do_bounce && unlikely(!con->bounce_page)) {
+		con->bounce_page = alloc_page(GFP_NOIO);
+		if (!con->bounce_page) {
+			pr_err("failed to allocate bounce page\n");
+			return -ENOMEM;
+		}
+	}
+
+	while (cursor->sr_resid > 0) {
+		struct page *page, *rpage;
+		size_t off, len;
+		int ret;
+
+		page = ceph_msg_data_next(cursor, &off, &len);
+		rpage = do_bounce ? con->bounce_page : page;
+
+		/* clamp to what remains in extent */
+		len = min_t(int, len, cursor->sr_resid);
+		ret = ceph_tcp_recvpage(con->sock, rpage, (int)off, len);
+		if (ret <= 0)
+			return ret;
+		*crc = ceph_crc32c_page(*crc, rpage, off, ret);
+		ceph_msg_data_advance(cursor, (size_t)ret);
+		cursor->sr_resid -= ret;
+		if (do_bounce)
+			memcpy_page(page, off, rpage, off, ret);
+	}
+	return 1;
+}
+
+static int read_sparse_msg_data(struct ceph_connection *con)
+{
+	struct ceph_msg_data_cursor *cursor = &con->in_msg->cursor;
+	bool do_datacrc = !ceph_test_opt(from_msgr(con->msgr), NOCRC);
+	u32 crc = 0;
+	int ret = 1;
+
+	if (do_datacrc)
+		crc = con->in_data_crc;
+
+	do {
+		if (con->v1.in_sr_kvec.iov_base)
+			ret = read_partial_message_chunk(con,
+							 &con->v1.in_sr_kvec,
+							 con->v1.in_sr_len,
+							 &crc);
+		else if (cursor->sr_resid > 0)
+			ret = read_sparse_msg_extent(con, &crc);
+
+		if (ret <= 0) {
+			if (do_datacrc)
+				con->in_data_crc = crc;
+			return ret;
+		}
+
+		memset(&con->v1.in_sr_kvec, 0, sizeof(con->v1.in_sr_kvec));
+		ret = con->ops->sparse_read(con, cursor,
+				(char **)&con->v1.in_sr_kvec.iov_base);
+		con->v1.in_sr_len = ret;
+	} while (ret > 0);
+
+	if (do_datacrc)
+		con->in_data_crc = crc;
+
+	return ret < 0 ? ret : 1;  /* must return > 0 to indicate success */
+}
+
 static int read_partial_msg_data(struct ceph_connection *con)
 {
 	struct ceph_msg_data_cursor *cursor = &con->in_msg->cursor;
@@ -1173,7 +1253,9 @@ static int read_partial_message(struct ceph_connection *con)
 		if (!m->num_data_items)
 			return -EIO;
 
-		if (ceph_test_opt(from_msgr(con->msgr), RXBOUNCE))
+		if (m->sparse_read)
+			ret = read_sparse_msg_data(con);
+		else if (ceph_test_opt(from_msgr(con->msgr), RXBOUNCE))
 			ret = read_partial_msg_data_bounce(con);
 		else
 			ret = read_partial_msg_data(con);
-- 
cgit v1.2.3


From f628d799972799023d32c2542bb2639eb8c4f84e Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@kernel.org>
Date: Fri, 11 Feb 2022 11:38:02 -0500
Subject: libceph: add sparse read support to OSD client
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Have get_reply check for the presence of sparse read ops in the
request and set the sparse_read boolean in the msg. That will queue the
messenger layer to use the sparse read codepath instead of the normal
data receive.

Add a new sparse_read operation for the OSD client, driven by its own
state machine. The messenger will repeatedly call the sparse_read
operation, and it will pass back the necessary info to set up to read
the next extent of data, while zero-filling the sparse regions.

The state machine will stop at the end of the last extent, and will
attach the extent map buffer to the ceph_osd_req_op so that the caller
can use it.

Signed-off-by: Jeff Layton <jlayton@kernel.org>
Reviewed-by: Xiubo Li <xiubli@redhat.com>
Reviewed-and-tested-by: Luís Henriques <lhenriques@suse.de>
Reviewed-by: Milind Changire <mchangir@redhat.com>
Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
---
 include/linux/ceph/osd_client.h |  32 +++++
 net/ceph/osd_client.c           | 257 +++++++++++++++++++++++++++++++++++++++-
 2 files changed, 285 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h
index 05da1e755b7b..bfa4813590da 100644
--- a/include/linux/ceph/osd_client.h
+++ b/include/linux/ceph/osd_client.h
@@ -40,6 +40,36 @@ struct ceph_sparse_extent {
 	u64	len;
 } __packed;
 
+/* Sparse read state machine state values */
+enum ceph_sparse_read_state {
+	CEPH_SPARSE_READ_HDR	= 0,
+	CEPH_SPARSE_READ_EXTENTS,
+	CEPH_SPARSE_READ_DATA_LEN,
+	CEPH_SPARSE_READ_DATA,
+};
+
+/*
+ * A SPARSE_READ reply is a 32-bit count of extents, followed by an array of
+ * 64-bit offset/length pairs, and then all of the actual file data
+ * concatenated after it (sans holes).
+ *
+ * Unfortunately, we don't know how long the extent array is until we've
+ * started reading the data section of the reply. The caller should send down
+ * a destination buffer for the array, but we'll alloc one if it's too small
+ * or if the caller doesn't.
+ */
+struct ceph_sparse_read {
+	enum ceph_sparse_read_state	sr_state;    /* state machine state */
+	u64				sr_req_off;  /* orig request offset */
+	u64				sr_req_len;  /* orig request length */
+	u64				sr_pos;      /* current pos in buffer */
+	int				sr_index;    /* current extent index */
+	__le32				sr_datalen;  /* length of actual data */
+	u32				sr_count;    /* extent count in reply */
+	int				sr_ext_len;  /* length of extent array */
+	struct ceph_sparse_extent	*sr_extent;  /* extent array */
+};
+
 /*
  * A given osd we're communicating with.
  *
@@ -48,6 +78,7 @@ struct ceph_sparse_extent {
  */
 struct ceph_osd {
 	refcount_t o_ref;
+	int o_sparse_op_idx;
 	struct ceph_osd_client *o_osdc;
 	int o_osd;
 	int o_incarnation;
@@ -63,6 +94,7 @@ struct ceph_osd {
 	unsigned long lru_ttl;
 	struct list_head o_keepalive_item;
 	struct mutex lock;
+	struct ceph_sparse_read	o_sparse_read;
 };
 
 #define CEPH_OSD_SLAB_OPS	2
diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c
index 3e03ae68722c..0aacbadcab06 100644
--- a/net/ceph/osd_client.c
+++ b/net/ceph/osd_client.c
@@ -376,6 +376,7 @@ static void osd_req_op_data_release(struct ceph_osd_request *osd_req,
 
 	switch (op->op) {
 	case CEPH_OSD_OP_READ:
+	case CEPH_OSD_OP_SPARSE_READ:
 	case CEPH_OSD_OP_WRITE:
 	case CEPH_OSD_OP_WRITEFULL:
 		kfree(op->extent.sparse_ext);
@@ -670,6 +671,7 @@ static void get_num_data_items(struct ceph_osd_request *req,
 		/* reply */
 		case CEPH_OSD_OP_STAT:
 		case CEPH_OSD_OP_READ:
+		case CEPH_OSD_OP_SPARSE_READ:
 		case CEPH_OSD_OP_LIST_WATCHERS:
 			*num_reply_data_items += 1;
 			break;
@@ -739,7 +741,7 @@ void osd_req_op_extent_init(struct ceph_osd_request *osd_req,
 
 	BUG_ON(opcode != CEPH_OSD_OP_READ && opcode != CEPH_OSD_OP_WRITE &&
 	       opcode != CEPH_OSD_OP_WRITEFULL && opcode != CEPH_OSD_OP_ZERO &&
-	       opcode != CEPH_OSD_OP_TRUNCATE);
+	       opcode != CEPH_OSD_OP_TRUNCATE && opcode != CEPH_OSD_OP_SPARSE_READ);
 
 	op->extent.offset = offset;
 	op->extent.length = length;
@@ -964,6 +966,7 @@ static u32 osd_req_encode_op(struct ceph_osd_op *dst,
 	case CEPH_OSD_OP_STAT:
 		break;
 	case CEPH_OSD_OP_READ:
+	case CEPH_OSD_OP_SPARSE_READ:
 	case CEPH_OSD_OP_WRITE:
 	case CEPH_OSD_OP_WRITEFULL:
 	case CEPH_OSD_OP_ZERO:
@@ -1060,7 +1063,8 @@ struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *osdc,
 
 	BUG_ON(opcode != CEPH_OSD_OP_READ && opcode != CEPH_OSD_OP_WRITE &&
 	       opcode != CEPH_OSD_OP_ZERO && opcode != CEPH_OSD_OP_TRUNCATE &&
-	       opcode != CEPH_OSD_OP_CREATE && opcode != CEPH_OSD_OP_DELETE);
+	       opcode != CEPH_OSD_OP_CREATE && opcode != CEPH_OSD_OP_DELETE &&
+	       opcode != CEPH_OSD_OP_SPARSE_READ);
 
 	req = ceph_osdc_alloc_request(osdc, snapc, num_ops, use_mempool,
 					GFP_NOFS);
@@ -1201,6 +1205,13 @@ static void osd_init(struct ceph_osd *osd)
 	mutex_init(&osd->lock);
 }
 
+static void ceph_init_sparse_read(struct ceph_sparse_read *sr)
+{
+	kfree(sr->sr_extent);
+	memset(sr, '\0', sizeof(*sr));
+	sr->sr_state = CEPH_SPARSE_READ_HDR;
+}
+
 static void osd_cleanup(struct ceph_osd *osd)
 {
 	WARN_ON(!RB_EMPTY_NODE(&osd->o_node));
@@ -1211,6 +1222,8 @@ static void osd_cleanup(struct ceph_osd *osd)
 	WARN_ON(!list_empty(&osd->o_osd_lru));
 	WARN_ON(!list_empty(&osd->o_keepalive_item));
 
+	ceph_init_sparse_read(&osd->o_sparse_read);
+
 	if (osd->o_auth.authorizer) {
 		WARN_ON(osd_homeless(osd));
 		ceph_auth_destroy_authorizer(osd->o_auth.authorizer);
@@ -1230,6 +1243,9 @@ static struct ceph_osd *create_osd(struct ceph_osd_client *osdc, int onum)
 	osd_init(osd);
 	osd->o_osdc = osdc;
 	osd->o_osd = onum;
+	osd->o_sparse_op_idx = -1;
+
+	ceph_init_sparse_read(&osd->o_sparse_read);
 
 	ceph_con_init(&osd->o_con, osd, &osd_con_ops, &osdc->client->msgr);
 
@@ -2034,6 +2050,7 @@ static void setup_request_data(struct ceph_osd_request *req)
 					       &op->raw_data_in);
 			break;
 		case CEPH_OSD_OP_READ:
+		case CEPH_OSD_OP_SPARSE_READ:
 			ceph_osdc_msg_data_add(reply_msg,
 					       &op->extent.osd_data);
 			break;
@@ -2453,8 +2470,10 @@ static void finish_request(struct ceph_osd_request *req)
 
 	req->r_end_latency = ktime_get();
 
-	if (req->r_osd)
+	if (req->r_osd) {
+		ceph_init_sparse_read(&req->r_osd->o_sparse_read);
 		unlink_request(req->r_osd, req);
+	}
 	atomic_dec(&osdc->num_requests);
 
 	/*
@@ -5366,6 +5385,24 @@ static void osd_dispatch(struct ceph_connection *con, struct ceph_msg *msg)
 	ceph_msg_put(msg);
 }
 
+/* How much sparse data was requested? */
+static u64 sparse_data_requested(struct ceph_osd_request *req)
+{
+	u64 len = 0;
+
+	if (req->r_flags & CEPH_OSD_FLAG_READ) {
+		int i;
+
+		for (i = 0; i < req->r_num_ops; ++i) {
+			struct ceph_osd_req_op *op = &req->r_ops[i];
+
+			if (op->op == CEPH_OSD_OP_SPARSE_READ)
+				len += op->extent.length;
+		}
+	}
+	return len;
+}
+
 /*
  * Lookup and return message for incoming reply.  Don't try to do
  * anything about a larger than preallocated data portion of the
@@ -5382,6 +5419,7 @@ static struct ceph_msg *get_reply(struct ceph_connection *con,
 	int front_len = le32_to_cpu(hdr->front_len);
 	int data_len = le32_to_cpu(hdr->data_len);
 	u64 tid = le64_to_cpu(hdr->tid);
+	u64 srlen;
 
 	down_read(&osdc->lock);
 	if (!osd_registered(osd)) {
@@ -5414,7 +5452,8 @@ static struct ceph_msg *get_reply(struct ceph_connection *con,
 		req->r_reply = m;
 	}
 
-	if (data_len > req->r_reply->data_length) {
+	srlen = sparse_data_requested(req);
+	if (!srlen && data_len > req->r_reply->data_length) {
 		pr_warn("%s osd%d tid %llu data %d > preallocated %zu, skipping\n",
 			__func__, osd->o_osd, req->r_tid, data_len,
 			req->r_reply->data_length);
@@ -5424,6 +5463,8 @@ static struct ceph_msg *get_reply(struct ceph_connection *con,
 	}
 
 	m = ceph_msg_get(req->r_reply);
+	m->sparse_read = (bool)srlen;
+
 	dout("get_reply tid %lld %p\n", tid, m);
 
 out_unlock_session:
@@ -5656,9 +5697,217 @@ static int osd_check_message_signature(struct ceph_msg *msg)
 	return ceph_auth_check_message_signature(auth, msg);
 }
 
+static void advance_cursor(struct ceph_msg_data_cursor *cursor, size_t len,
+			   bool zero)
+{
+	while (len) {
+		struct page *page;
+		size_t poff, plen;
+
+		page = ceph_msg_data_next(cursor, &poff, &plen);
+		if (plen > len)
+			plen = len;
+		if (zero)
+			zero_user_segment(page, poff, poff + plen);
+		len -= plen;
+		ceph_msg_data_advance(cursor, plen);
+	}
+}
+
+static int prep_next_sparse_read(struct ceph_connection *con,
+				 struct ceph_msg_data_cursor *cursor)
+{
+	struct ceph_osd *o = con->private;
+	struct ceph_sparse_read *sr = &o->o_sparse_read;
+	struct ceph_osd_request *req;
+	struct ceph_osd_req_op *op;
+
+	spin_lock(&o->o_requests_lock);
+	req = lookup_request(&o->o_requests, le64_to_cpu(con->in_msg->hdr.tid));
+	if (!req) {
+		spin_unlock(&o->o_requests_lock);
+		return -EBADR;
+	}
+
+	if (o->o_sparse_op_idx < 0) {
+		u64 srlen = sparse_data_requested(req);
+
+		dout("%s: [%d] starting new sparse read req. srlen=0x%llx\n",
+		     __func__, o->o_osd, srlen);
+		ceph_msg_data_cursor_init(cursor, con->in_msg, srlen);
+	} else {
+		u64 end;
+
+		op = &req->r_ops[o->o_sparse_op_idx];
+
+		WARN_ON_ONCE(op->extent.sparse_ext);
+
+		/* hand back buffer we took earlier */
+		op->extent.sparse_ext = sr->sr_extent;
+		sr->sr_extent = NULL;
+		op->extent.sparse_ext_cnt = sr->sr_count;
+		sr->sr_ext_len = 0;
+		dout("%s: [%d] completed extent array len %d cursor->resid %zd\n",
+		     __func__, o->o_osd, op->extent.sparse_ext_cnt, cursor->resid);
+		/* Advance to end of data for this operation */
+		end = ceph_sparse_ext_map_end(op);
+		if (end < sr->sr_req_len)
+			advance_cursor(cursor, sr->sr_req_len - end, false);
+	}
+
+	ceph_init_sparse_read(sr);
+
+	/* find next op in this request (if any) */
+	while (++o->o_sparse_op_idx < req->r_num_ops) {
+		op = &req->r_ops[o->o_sparse_op_idx];
+		if (op->op == CEPH_OSD_OP_SPARSE_READ)
+			goto found;
+	}
+
+	/* reset for next sparse read request */
+	spin_unlock(&o->o_requests_lock);
+	o->o_sparse_op_idx = -1;
+	return 0;
+found:
+	sr->sr_req_off = op->extent.offset;
+	sr->sr_req_len = op->extent.length;
+	sr->sr_pos = sr->sr_req_off;
+	dout("%s: [%d] new sparse read op at idx %d 0x%llx~0x%llx\n", __func__,
+	     o->o_osd, o->o_sparse_op_idx, sr->sr_req_off, sr->sr_req_len);
+
+	/* hand off request's sparse extent map buffer */
+	sr->sr_ext_len = op->extent.sparse_ext_cnt;
+	op->extent.sparse_ext_cnt = 0;
+	sr->sr_extent = op->extent.sparse_ext;
+	op->extent.sparse_ext = NULL;
+
+	spin_unlock(&o->o_requests_lock);
+	return 1;
+}
+
+#ifdef __BIG_ENDIAN
+static inline void convert_extent_map(struct ceph_sparse_read *sr)
+{
+	int i;
+
+	for (i = 0; i < sr->sr_count; i++) {
+		struct ceph_sparse_extent *ext = &sr->sr_extent[i];
+
+		ext->off = le64_to_cpu((__force __le64)ext->off);
+		ext->len = le64_to_cpu((__force __le64)ext->len);
+	}
+}
+#else
+static inline void convert_extent_map(struct ceph_sparse_read *sr)
+{
+}
+#endif
+
+#define MAX_EXTENTS 4096
+
+static int osd_sparse_read(struct ceph_connection *con,
+			   struct ceph_msg_data_cursor *cursor,
+			   char **pbuf)
+{
+	struct ceph_osd *o = con->private;
+	struct ceph_sparse_read *sr = &o->o_sparse_read;
+	u32 count = sr->sr_count;
+	u64 eoff, elen;
+	int ret;
+
+	switch (sr->sr_state) {
+	case CEPH_SPARSE_READ_HDR:
+next_op:
+		ret = prep_next_sparse_read(con, cursor);
+		if (ret <= 0)
+			return ret;
+
+		/* number of extents */
+		ret = sizeof(sr->sr_count);
+		*pbuf = (char *)&sr->sr_count;
+		sr->sr_state = CEPH_SPARSE_READ_EXTENTS;
+		break;
+	case CEPH_SPARSE_READ_EXTENTS:
+		/* Convert sr_count to host-endian */
+		count = le32_to_cpu((__force __le32)sr->sr_count);
+		sr->sr_count = count;
+		dout("[%d] got %u extents\n", o->o_osd, count);
+
+		if (count > 0) {
+			if (!sr->sr_extent || count > sr->sr_ext_len) {
+				/*
+				 * Apply a hard cap to the number of extents.
+				 * If we have more, assume something is wrong.
+				 */
+				if (count > MAX_EXTENTS) {
+					dout("%s: OSD returned 0x%x extents in a single reply!\n",
+					     __func__, count);
+					return -EREMOTEIO;
+				}
+
+				/* no extent array provided, or too short */
+				kfree(sr->sr_extent);
+				sr->sr_extent = kmalloc_array(count,
+							      sizeof(*sr->sr_extent),
+							      GFP_NOIO);
+				if (!sr->sr_extent)
+					return -ENOMEM;
+				sr->sr_ext_len = count;
+			}
+			ret = count * sizeof(*sr->sr_extent);
+			*pbuf = (char *)sr->sr_extent;
+			sr->sr_state = CEPH_SPARSE_READ_DATA_LEN;
+			break;
+		}
+		/* No extents? Read data len */
+		fallthrough;
+	case CEPH_SPARSE_READ_DATA_LEN:
+		convert_extent_map(sr);
+		ret = sizeof(sr->sr_datalen);
+		*pbuf = (char *)&sr->sr_datalen;
+		sr->sr_state = CEPH_SPARSE_READ_DATA;
+		break;
+	case CEPH_SPARSE_READ_DATA:
+		if (sr->sr_index >= count) {
+			sr->sr_state = CEPH_SPARSE_READ_HDR;
+			goto next_op;
+		}
+
+		eoff = sr->sr_extent[sr->sr_index].off;
+		elen = sr->sr_extent[sr->sr_index].len;
+
+		dout("[%d] ext %d off 0x%llx len 0x%llx\n",
+		     o->o_osd, sr->sr_index, eoff, elen);
+
+		if (elen > INT_MAX) {
+			dout("Sparse read extent length too long (0x%llx)\n",
+			     elen);
+			return -EREMOTEIO;
+		}
+
+		/* zero out anything from sr_pos to start of extent */
+		if (sr->sr_pos < eoff)
+			advance_cursor(cursor, eoff - sr->sr_pos, true);
+
+		/* Set position to end of extent */
+		sr->sr_pos = eoff + elen;
+
+		/* send back the new length and nullify the ptr */
+		cursor->sr_resid = elen;
+		ret = elen;
+		*pbuf = NULL;
+
+		/* Bump the array index */
+		++sr->sr_index;
+		break;
+	}
+	return ret;
+}
+
 static const struct ceph_connection_operations osd_con_ops = {
 	.get = osd_get_con,
 	.put = osd_put_con,
+	.sparse_read = osd_sparse_read,
 	.alloc_msg = osd_alloc_msg,
 	.dispatch = osd_dispatch,
 	.fault = osd_fault,
-- 
cgit v1.2.3


From dee0c5f834605ce9b384ee8b9c7032ffd8db4eca Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@kernel.org>
Date: Fri, 1 Jul 2022 06:30:12 -0400
Subject: libceph: add new iov_iter-based ceph_msg_data_type and
 ceph_osd_data_type
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Add an iov_iter to the unions in ceph_msg_data and ceph_msg_data_cursor.
Instead of requiring a list of pages or bvecs, we can just use an
iov_iter directly, and avoid extra allocations.

We assume that the pages represented by the iter are pinned such that
they shouldn't incur page faults, which is the case for the iov_iters
created by netfs.

While working on this, Al Viro informed me that he was going to change
iov_iter_get_pages to auto-advance the iterator as that pattern is more
or less required for ITER_PIPE anyway. We emulate that here for now by
advancing in the _next op and tracking that amount in the "lastlen"
field.

In the event that _next is called twice without an intervening
_advance, we revert the iov_iter by the remaining lastlen before
calling iov_iter_get_pages.

Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: David Howells <dhowells@redhat.com>
Signed-off-by: Jeff Layton <jlayton@kernel.org>
Reviewed-by: Xiubo Li <xiubli@redhat.com>
Reviewed-and-tested-by: Luís Henriques <lhenriques@suse.de>
Reviewed-by: Milind Changire <mchangir@redhat.com>
Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
---
 include/linux/ceph/messenger.h  |  8 +++++
 include/linux/ceph/osd_client.h |  4 +++
 net/ceph/messenger.c            | 77 +++++++++++++++++++++++++++++++++++++++++
 net/ceph/osd_client.c           | 27 +++++++++++++++
 4 files changed, 116 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/ceph/messenger.h b/include/linux/ceph/messenger.h
index 9fd7255172ad..2eaaabbe98cb 100644
--- a/include/linux/ceph/messenger.h
+++ b/include/linux/ceph/messenger.h
@@ -123,6 +123,7 @@ enum ceph_msg_data_type {
 	CEPH_MSG_DATA_BIO,	/* data source/destination is a bio list */
 #endif /* CONFIG_BLOCK */
 	CEPH_MSG_DATA_BVECS,	/* data source/destination is a bio_vec array */
+	CEPH_MSG_DATA_ITER,	/* data source/destination is an iov_iter */
 };
 
 #ifdef CONFIG_BLOCK
@@ -224,6 +225,7 @@ struct ceph_msg_data {
 			bool		own_pages;
 		};
 		struct ceph_pagelist	*pagelist;
+		struct iov_iter		iter;
 	};
 };
 
@@ -248,6 +250,10 @@ struct ceph_msg_data_cursor {
 			struct page	*page;		/* page from list */
 			size_t		offset;		/* bytes from list */
 		};
+		struct {
+			struct iov_iter		iov_iter;
+			unsigned int		lastlen;
+		};
 	};
 };
 
@@ -605,6 +611,8 @@ void ceph_msg_data_add_bio(struct ceph_msg *msg, struct ceph_bio_iter *bio_pos,
 #endif /* CONFIG_BLOCK */
 void ceph_msg_data_add_bvecs(struct ceph_msg *msg,
 			     struct ceph_bvec_iter *bvec_pos);
+void ceph_msg_data_add_iter(struct ceph_msg *msg,
+			    struct iov_iter *iter);
 
 struct ceph_msg *ceph_msg_new2(int type, int front_len, int max_data_items,
 			       gfp_t flags, bool can_fail);
diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h
index bfa4813590da..8f5d2b5bbba2 100644
--- a/include/linux/ceph/osd_client.h
+++ b/include/linux/ceph/osd_client.h
@@ -108,6 +108,7 @@ enum ceph_osd_data_type {
 	CEPH_OSD_DATA_TYPE_BIO,
 #endif /* CONFIG_BLOCK */
 	CEPH_OSD_DATA_TYPE_BVECS,
+	CEPH_OSD_DATA_TYPE_ITER,
 };
 
 struct ceph_osd_data {
@@ -131,6 +132,7 @@ struct ceph_osd_data {
 			struct ceph_bvec_iter	bvec_pos;
 			u32			num_bvecs;
 		};
+		struct iov_iter		iter;
 	};
 };
 
@@ -501,6 +503,8 @@ void osd_req_op_extent_osd_data_bvecs(struct ceph_osd_request *osd_req,
 void osd_req_op_extent_osd_data_bvec_pos(struct ceph_osd_request *osd_req,
 					 unsigned int which,
 					 struct ceph_bvec_iter *bvec_pos);
+void osd_req_op_extent_osd_iter(struct ceph_osd_request *osd_req,
+				unsigned int which, struct iov_iter *iter);
 
 extern void osd_req_op_cls_request_data_pagelist(struct ceph_osd_request *,
 					unsigned int which,
diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c
index 2eb10d7518e8..10a41cd9c523 100644
--- a/net/ceph/messenger.c
+++ b/net/ceph/messenger.c
@@ -969,6 +969,62 @@ static bool ceph_msg_data_pagelist_advance(struct ceph_msg_data_cursor *cursor,
 	return true;
 }
 
+static void ceph_msg_data_iter_cursor_init(struct ceph_msg_data_cursor *cursor,
+					   size_t length)
+{
+	struct ceph_msg_data *data = cursor->data;
+
+	cursor->iov_iter = data->iter;
+	cursor->lastlen = 0;
+	iov_iter_truncate(&cursor->iov_iter, length);
+	cursor->resid = iov_iter_count(&cursor->iov_iter);
+}
+
+static struct page *ceph_msg_data_iter_next(struct ceph_msg_data_cursor *cursor,
+					    size_t *page_offset, size_t *length)
+{
+	struct page *page;
+	ssize_t len;
+
+	if (cursor->lastlen)
+		iov_iter_revert(&cursor->iov_iter, cursor->lastlen);
+
+	len = iov_iter_get_pages2(&cursor->iov_iter, &page, PAGE_SIZE,
+				  1, page_offset);
+	BUG_ON(len < 0);
+
+	cursor->lastlen = len;
+
+	/*
+	 * FIXME: The assumption is that the pages represented by the iov_iter
+	 *	  are pinned, with the references held by the upper-level
+	 *	  callers, or by virtue of being under writeback. Eventually,
+	 *	  we'll get an iov_iter_get_pages2 variant that doesn't take
+	 *	  page refs. Until then, just put the page ref.
+	 */
+	VM_BUG_ON_PAGE(!PageWriteback(page) && page_count(page) < 2, page);
+	put_page(page);
+
+	*length = min_t(size_t, len, cursor->resid);
+	return page;
+}
+
+static bool ceph_msg_data_iter_advance(struct ceph_msg_data_cursor *cursor,
+				       size_t bytes)
+{
+	BUG_ON(bytes > cursor->resid);
+	cursor->resid -= bytes;
+
+	if (bytes < cursor->lastlen) {
+		cursor->lastlen -= bytes;
+	} else {
+		iov_iter_advance(&cursor->iov_iter, bytes - cursor->lastlen);
+		cursor->lastlen = 0;
+	}
+
+	return cursor->resid;
+}
+
 /*
  * Message data is handled (sent or received) in pieces, where each
  * piece resides on a single page.  The network layer might not
@@ -996,6 +1052,9 @@ static void __ceph_msg_data_cursor_init(struct ceph_msg_data_cursor *cursor)
 	case CEPH_MSG_DATA_BVECS:
 		ceph_msg_data_bvecs_cursor_init(cursor, length);
 		break;
+	case CEPH_MSG_DATA_ITER:
+		ceph_msg_data_iter_cursor_init(cursor, length);
+		break;
 	case CEPH_MSG_DATA_NONE:
 	default:
 		/* BUG(); */
@@ -1043,6 +1102,9 @@ struct page *ceph_msg_data_next(struct ceph_msg_data_cursor *cursor,
 	case CEPH_MSG_DATA_BVECS:
 		page = ceph_msg_data_bvecs_next(cursor, page_offset, length);
 		break;
+	case CEPH_MSG_DATA_ITER:
+		page = ceph_msg_data_iter_next(cursor, page_offset, length);
+		break;
 	case CEPH_MSG_DATA_NONE:
 	default:
 		page = NULL;
@@ -1081,6 +1143,9 @@ void ceph_msg_data_advance(struct ceph_msg_data_cursor *cursor, size_t bytes)
 	case CEPH_MSG_DATA_BVECS:
 		new_piece = ceph_msg_data_bvecs_advance(cursor, bytes);
 		break;
+	case CEPH_MSG_DATA_ITER:
+		new_piece = ceph_msg_data_iter_advance(cursor, bytes);
+		break;
 	case CEPH_MSG_DATA_NONE:
 	default:
 		BUG();
@@ -1880,6 +1945,18 @@ void ceph_msg_data_add_bvecs(struct ceph_msg *msg,
 }
 EXPORT_SYMBOL(ceph_msg_data_add_bvecs);
 
+void ceph_msg_data_add_iter(struct ceph_msg *msg,
+			    struct iov_iter *iter)
+{
+	struct ceph_msg_data *data;
+
+	data = ceph_msg_data_add(msg);
+	data->type = CEPH_MSG_DATA_ITER;
+	data->iter = *iter;
+
+	msg->data_length += iov_iter_count(&data->iter);
+}
+
 /*
  * construct a new message with given type, size
  * the new msg has a ref count of 1.
diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c
index 0aacbadcab06..684faf8553de 100644
--- a/net/ceph/osd_client.c
+++ b/net/ceph/osd_client.c
@@ -171,6 +171,13 @@ static void ceph_osd_data_bvecs_init(struct ceph_osd_data *osd_data,
 	osd_data->num_bvecs = num_bvecs;
 }
 
+static void ceph_osd_iter_init(struct ceph_osd_data *osd_data,
+			       struct iov_iter *iter)
+{
+	osd_data->type = CEPH_OSD_DATA_TYPE_ITER;
+	osd_data->iter = *iter;
+}
+
 static struct ceph_osd_data *
 osd_req_op_raw_data_in(struct ceph_osd_request *osd_req, unsigned int which)
 {
@@ -264,6 +271,22 @@ void osd_req_op_extent_osd_data_bvec_pos(struct ceph_osd_request *osd_req,
 }
 EXPORT_SYMBOL(osd_req_op_extent_osd_data_bvec_pos);
 
+/**
+ * osd_req_op_extent_osd_iter - Set up an operation with an iterator buffer
+ * @osd_req: The request to set up
+ * @which: Index of the operation in which to set the iter
+ * @iter: The buffer iterator
+ */
+void osd_req_op_extent_osd_iter(struct ceph_osd_request *osd_req,
+				unsigned int which, struct iov_iter *iter)
+{
+	struct ceph_osd_data *osd_data;
+
+	osd_data = osd_req_op_data(osd_req, which, extent, osd_data);
+	ceph_osd_iter_init(osd_data, iter);
+}
+EXPORT_SYMBOL(osd_req_op_extent_osd_iter);
+
 static void osd_req_op_cls_request_info_pagelist(
 			struct ceph_osd_request *osd_req,
 			unsigned int which, struct ceph_pagelist *pagelist)
@@ -346,6 +369,8 @@ static u64 ceph_osd_data_length(struct ceph_osd_data *osd_data)
 #endif /* CONFIG_BLOCK */
 	case CEPH_OSD_DATA_TYPE_BVECS:
 		return osd_data->bvec_pos.iter.bi_size;
+	case CEPH_OSD_DATA_TYPE_ITER:
+		return iov_iter_count(&osd_data->iter);
 	default:
 		WARN(true, "unrecognized data type %d\n", (int)osd_data->type);
 		return 0;
@@ -954,6 +979,8 @@ static void ceph_osdc_msg_data_add(struct ceph_msg *msg,
 #endif
 	} else if (osd_data->type == CEPH_OSD_DATA_TYPE_BVECS) {
 		ceph_msg_data_add_bvecs(msg, &osd_data->bvec_pos);
+	} else if (osd_data->type == CEPH_OSD_DATA_TYPE_ITER) {
+		ceph_msg_data_add_iter(msg, &osd_data->iter);
 	} else {
 		BUG_ON(osd_data->type != CEPH_OSD_DATA_TYPE_NONE);
 	}
-- 
cgit v1.2.3


From 2d332d5bc424404911540006a8bb450fbb96b178 Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@kernel.org>
Date: Mon, 27 Jul 2020 10:16:09 -0400
Subject: ceph: fscrypt_auth handling for ceph
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Most fscrypt-enabled filesystems store the crypto context in an xattr,
but that's problematic for ceph as xatts are governed by the XATTR cap,
but we really want the crypto context as part of the AUTH cap.

Because of this, the MDS has added two new inode metadata fields:
fscrypt_auth and fscrypt_file. The former is used to hold the crypto
context, and the latter is used to track the real file size.

Parse new fscrypt_auth and fscrypt_file fields in inode traces. For now,
we don't use fscrypt_file, but fscrypt_auth is used to hold the fscrypt
context.

Allow the client to use a setattr request for setting the fscrypt_auth
field. Since this is not a standard setattr request from the VFS, we add
a new field to __ceph_setattr that carries ceph-specific inode attrs.

Have the set_context op do a setattr that sets the fscrypt_auth value,
and get_context just return the contents of that field (since it should
always be available).

Signed-off-by: Jeff Layton <jlayton@kernel.org>
Reviewed-by: Xiubo Li <xiubli@redhat.com>
Reviewed-and-tested-by: Luís Henriques <lhenriques@suse.de>
Reviewed-by: Milind Changire <mchangir@redhat.com>
Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
---
 fs/ceph/Makefile             |   1 +
 fs/ceph/acl.c                |   4 +-
 fs/ceph/caps.c               |  78 ++++++++++++++++++++++++-----
 fs/ceph/crypto.c             |  77 +++++++++++++++++++++++++++++
 fs/ceph/crypto.h             |  36 ++++++++++++++
 fs/ceph/inode.c              |  64 +++++++++++++++++++++++-
 fs/ceph/mds_client.c         | 114 +++++++++++++++++++++++++++++++++++++++----
 fs/ceph/mds_client.h         |   7 +++
 fs/ceph/super.c              |   3 ++
 fs/ceph/super.h              |  15 +++++-
 include/linux/ceph/ceph_fs.h |  21 +++++---
 11 files changed, 385 insertions(+), 35 deletions(-)
 create mode 100644 fs/ceph/crypto.c
 create mode 100644 fs/ceph/crypto.h

(limited to 'include/linux')

diff --git a/fs/ceph/Makefile b/fs/ceph/Makefile
index 50c635dc7f71..1f77ca04c426 100644
--- a/fs/ceph/Makefile
+++ b/fs/ceph/Makefile
@@ -12,3 +12,4 @@ ceph-y := super.o inode.o dir.o file.o locks.o addr.o ioctl.o \
 
 ceph-$(CONFIG_CEPH_FSCACHE) += cache.o
 ceph-$(CONFIG_CEPH_FS_POSIX_ACL) += acl.o
+ceph-$(CONFIG_FS_ENCRYPTION) += crypto.o
diff --git a/fs/ceph/acl.c b/fs/ceph/acl.c
index 6945a938d396..8a56f979c7cb 100644
--- a/fs/ceph/acl.c
+++ b/fs/ceph/acl.c
@@ -140,7 +140,7 @@ int ceph_set_acl(struct mnt_idmap *idmap, struct dentry *dentry,
 		newattrs.ia_ctime = current_time(inode);
 		newattrs.ia_mode = new_mode;
 		newattrs.ia_valid = ATTR_MODE | ATTR_CTIME;
-		ret = __ceph_setattr(inode, &newattrs);
+		ret = __ceph_setattr(inode, &newattrs, NULL);
 		if (ret)
 			goto out_free;
 	}
@@ -151,7 +151,7 @@ int ceph_set_acl(struct mnt_idmap *idmap, struct dentry *dentry,
 			newattrs.ia_ctime = old_ctime;
 			newattrs.ia_mode = old_mode;
 			newattrs.ia_valid = ATTR_MODE | ATTR_CTIME;
-			__ceph_setattr(inode, &newattrs);
+			__ceph_setattr(inode, &newattrs, NULL);
 		}
 		goto out_free;
 	}
diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
index e2bb0d0072da..1c62ef339bc6 100644
--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c
@@ -14,6 +14,7 @@
 #include "super.h"
 #include "mds_client.h"
 #include "cache.h"
+#include "crypto.h"
 #include <linux/ceph/decode.h>
 #include <linux/ceph/messenger.h>
 
@@ -1216,15 +1217,12 @@ struct cap_msg_args {
 	umode_t			mode;
 	bool			inline_data;
 	bool			wake;
+	u32			fscrypt_auth_len;
+	u32			fscrypt_file_len;
+	u8			fscrypt_auth[sizeof(struct ceph_fscrypt_auth)]; // for context
+	u8			fscrypt_file[sizeof(u64)]; // for size
 };
 
-/*
- * cap struct size + flock buffer size + inline version + inline data size +
- * osd_epoch_barrier + oldest_flush_tid
- */
-#define CAP_MSG_SIZE (sizeof(struct ceph_mds_caps) + \
-		      4 + 8 + 4 + 4 + 8 + 4 + 4 + 4 + 8 + 8 + 4)
-
 /* Marshal up the cap msg to the MDS */
 static void encode_cap_msg(struct ceph_msg *msg, struct cap_msg_args *arg)
 {
@@ -1240,7 +1238,7 @@ static void encode_cap_msg(struct ceph_msg *msg, struct cap_msg_args *arg)
 	     arg->size, arg->max_size, arg->xattr_version,
 	     arg->xattr_buf ? (int)arg->xattr_buf->vec.iov_len : 0);
 
-	msg->hdr.version = cpu_to_le16(10);
+	msg->hdr.version = cpu_to_le16(12);
 	msg->hdr.tid = cpu_to_le64(arg->flush_tid);
 
 	fc = msg->front.iov_base;
@@ -1311,6 +1309,21 @@ static void encode_cap_msg(struct ceph_msg *msg, struct cap_msg_args *arg)
 
 	/* Advisory flags (version 10) */
 	ceph_encode_32(&p, arg->flags);
+
+	/* dirstats (version 11) - these are r/o on the client */
+	ceph_encode_64(&p, 0);
+	ceph_encode_64(&p, 0);
+
+#if IS_ENABLED(CONFIG_FS_ENCRYPTION)
+	/* fscrypt_auth and fscrypt_file (version 12) */
+	ceph_encode_32(&p, arg->fscrypt_auth_len);
+	ceph_encode_copy(&p, arg->fscrypt_auth, arg->fscrypt_auth_len);
+	ceph_encode_32(&p, arg->fscrypt_file_len);
+	ceph_encode_copy(&p, arg->fscrypt_file, arg->fscrypt_file_len);
+#else /* CONFIG_FS_ENCRYPTION */
+	ceph_encode_32(&p, 0);
+	ceph_encode_32(&p, 0);
+#endif /* CONFIG_FS_ENCRYPTION */
 }
 
 /*
@@ -1432,7 +1445,37 @@ static void __prep_cap(struct cap_msg_args *arg, struct ceph_cap *cap,
 		}
 	}
 	arg->flags = flags;
+#if IS_ENABLED(CONFIG_FS_ENCRYPTION)
+	if (ci->fscrypt_auth_len &&
+	    WARN_ON_ONCE(ci->fscrypt_auth_len > sizeof(struct ceph_fscrypt_auth))) {
+		/* Don't set this if it's too big */
+		arg->fscrypt_auth_len = 0;
+	} else {
+		arg->fscrypt_auth_len = ci->fscrypt_auth_len;
+		memcpy(arg->fscrypt_auth, ci->fscrypt_auth,
+		       min_t(size_t, ci->fscrypt_auth_len,
+			     sizeof(arg->fscrypt_auth)));
+	}
+	/* FIXME: use this to track "real" size */
+	arg->fscrypt_file_len = 0;
+#endif /* CONFIG_FS_ENCRYPTION */
+}
+
+#define CAP_MSG_FIXED_FIELDS (sizeof(struct ceph_mds_caps) + \
+		      4 + 8 + 4 + 4 + 8 + 4 + 4 + 4 + 8 + 8 + 4 + 8 + 8 + 4 + 4)
+
+#if IS_ENABLED(CONFIG_FS_ENCRYPTION)
+static inline int cap_msg_size(struct cap_msg_args *arg)
+{
+	return CAP_MSG_FIXED_FIELDS + arg->fscrypt_auth_len +
+			arg->fscrypt_file_len;
 }
+#else
+static inline int cap_msg_size(struct cap_msg_args *arg)
+{
+	return CAP_MSG_FIXED_FIELDS;
+}
+#endif /* CONFIG_FS_ENCRYPTION */
 
 /*
  * Send a cap msg on the given inode.
@@ -1444,7 +1487,8 @@ static void __send_cap(struct cap_msg_args *arg, struct ceph_inode_info *ci)
 	struct ceph_msg *msg;
 	struct inode *inode = &ci->netfs.inode;
 
-	msg = ceph_msg_new(CEPH_MSG_CLIENT_CAPS, CAP_MSG_SIZE, GFP_NOFS, false);
+	msg = ceph_msg_new(CEPH_MSG_CLIENT_CAPS, cap_msg_size(arg), GFP_NOFS,
+			   false);
 	if (!msg) {
 		pr_err("error allocating cap msg: ino (%llx.%llx) flushing %s tid %llu, requeuing cap.\n",
 		       ceph_vinop(inode), ceph_cap_string(arg->dirty),
@@ -1470,10 +1514,6 @@ static inline int __send_flush_snap(struct inode *inode,
 	struct cap_msg_args	arg;
 	struct ceph_msg		*msg;
 
-	msg = ceph_msg_new(CEPH_MSG_CLIENT_CAPS, CAP_MSG_SIZE, GFP_NOFS, false);
-	if (!msg)
-		return -ENOMEM;
-
 	arg.session = session;
 	arg.ino = ceph_vino(inode).ino;
 	arg.cid = 0;
@@ -1511,6 +1551,18 @@ static inline int __send_flush_snap(struct inode *inode,
 	arg.flags = 0;
 	arg.wake = false;
 
+	/*
+	 * No fscrypt_auth changes from a capsnap. It will need
+	 * to update fscrypt_file on size changes (TODO).
+	 */
+	arg.fscrypt_auth_len = 0;
+	arg.fscrypt_file_len = 0;
+
+	msg = ceph_msg_new(CEPH_MSG_CLIENT_CAPS, cap_msg_size(&arg),
+			   GFP_NOFS, false);
+	if (!msg)
+		return -ENOMEM;
+
 	encode_cap_msg(msg, &arg);
 	ceph_con_send(&arg.session->s_con, msg);
 	return 0;
diff --git a/fs/ceph/crypto.c b/fs/ceph/crypto.c
new file mode 100644
index 000000000000..b17a6ee16cf0
--- /dev/null
+++ b/fs/ceph/crypto.c
@@ -0,0 +1,77 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/ceph/ceph_debug.h>
+#include <linux/xattr.h>
+#include <linux/fscrypt.h>
+
+#include "super.h"
+#include "crypto.h"
+
+static int ceph_crypt_get_context(struct inode *inode, void *ctx, size_t len)
+{
+	struct ceph_inode_info *ci = ceph_inode(inode);
+	struct ceph_fscrypt_auth *cfa = (struct ceph_fscrypt_auth *)ci->fscrypt_auth;
+	u32 ctxlen;
+
+	/* Non existent or too short? */
+	if (!cfa || (ci->fscrypt_auth_len < (offsetof(struct ceph_fscrypt_auth, cfa_blob) + 1)))
+		return -ENOBUFS;
+
+	/* Some format we don't recognize? */
+	if (le32_to_cpu(cfa->cfa_version) != CEPH_FSCRYPT_AUTH_VERSION)
+		return -ENOBUFS;
+
+	ctxlen = le32_to_cpu(cfa->cfa_blob_len);
+	if (len < ctxlen)
+		return -ERANGE;
+
+	memcpy(ctx, cfa->cfa_blob, ctxlen);
+	return ctxlen;
+}
+
+static int ceph_crypt_set_context(struct inode *inode, const void *ctx,
+				  size_t len, void *fs_data)
+{
+	int ret;
+	struct iattr attr = { };
+	struct ceph_iattr cia = { };
+	struct ceph_fscrypt_auth *cfa;
+
+	WARN_ON_ONCE(fs_data);
+
+	if (len > FSCRYPT_SET_CONTEXT_MAX_SIZE)
+		return -EINVAL;
+
+	cfa = kzalloc(sizeof(*cfa), GFP_KERNEL);
+	if (!cfa)
+		return -ENOMEM;
+
+	cfa->cfa_version = cpu_to_le32(CEPH_FSCRYPT_AUTH_VERSION);
+	cfa->cfa_blob_len = cpu_to_le32(len);
+	memcpy(cfa->cfa_blob, ctx, len);
+
+	cia.fscrypt_auth = cfa;
+
+	ret = __ceph_setattr(inode, &attr, &cia);
+	if (ret == 0)
+		inode_set_flags(inode, S_ENCRYPTED, S_ENCRYPTED);
+	kfree(cia.fscrypt_auth);
+	return ret;
+}
+
+static bool ceph_crypt_empty_dir(struct inode *inode)
+{
+	struct ceph_inode_info *ci = ceph_inode(inode);
+
+	return ci->i_rsubdirs + ci->i_rfiles == 1;
+}
+
+static struct fscrypt_operations ceph_fscrypt_ops = {
+	.get_context		= ceph_crypt_get_context,
+	.set_context		= ceph_crypt_set_context,
+	.empty_dir		= ceph_crypt_empty_dir,
+};
+
+void ceph_fscrypt_set_ops(struct super_block *sb)
+{
+	fscrypt_set_ops(sb, &ceph_fscrypt_ops);
+}
diff --git a/fs/ceph/crypto.h b/fs/ceph/crypto.h
new file mode 100644
index 000000000000..6dca674f79b8
--- /dev/null
+++ b/fs/ceph/crypto.h
@@ -0,0 +1,36 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Ceph fscrypt functionality
+ */
+
+#ifndef _CEPH_CRYPTO_H
+#define _CEPH_CRYPTO_H
+
+#include <linux/fscrypt.h>
+
+struct ceph_fscrypt_auth {
+	__le32	cfa_version;
+	__le32	cfa_blob_len;
+	u8	cfa_blob[FSCRYPT_SET_CONTEXT_MAX_SIZE];
+} __packed;
+
+#define CEPH_FSCRYPT_AUTH_VERSION	1
+static inline u32 ceph_fscrypt_auth_len(struct ceph_fscrypt_auth *fa)
+{
+	u32 ctxsize = le32_to_cpu(fa->cfa_blob_len);
+
+	return offsetof(struct ceph_fscrypt_auth, cfa_blob) + ctxsize;
+}
+
+#ifdef CONFIG_FS_ENCRYPTION
+void ceph_fscrypt_set_ops(struct super_block *sb);
+
+#else /* CONFIG_FS_ENCRYPTION */
+
+static inline void ceph_fscrypt_set_ops(struct super_block *sb)
+{
+}
+
+#endif /* CONFIG_FS_ENCRYPTION */
+
+#endif
diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
index e23172024707..a3aa7870a6a2 100644
--- a/fs/ceph/inode.c
+++ b/fs/ceph/inode.c
@@ -14,10 +14,12 @@
 #include <linux/random.h>
 #include <linux/sort.h>
 #include <linux/iversion.h>
+#include <linux/fscrypt.h>
 
 #include "super.h"
 #include "mds_client.h"
 #include "cache.h"
+#include "crypto.h"
 #include <linux/ceph/decode.h>
 
 /*
@@ -617,6 +619,10 @@ struct inode *ceph_alloc_inode(struct super_block *sb)
 	INIT_WORK(&ci->i_work, ceph_inode_work);
 	ci->i_work_mask = 0;
 	memset(&ci->i_btime, '\0', sizeof(ci->i_btime));
+#ifdef CONFIG_FS_ENCRYPTION
+	ci->fscrypt_auth = NULL;
+	ci->fscrypt_auth_len = 0;
+#endif
 	return &ci->netfs.inode;
 }
 
@@ -625,6 +631,9 @@ void ceph_free_inode(struct inode *inode)
 	struct ceph_inode_info *ci = ceph_inode(inode);
 
 	kfree(ci->i_symlink);
+#ifdef CONFIG_FS_ENCRYPTION
+	kfree(ci->fscrypt_auth);
+#endif
 	kmem_cache_free(ceph_inode_cachep, ci);
 }
 
@@ -645,6 +654,7 @@ void ceph_evict_inode(struct inode *inode)
 	clear_inode(inode);
 
 	ceph_fscache_unregister_inode_cookie(ci);
+	fscrypt_put_encryption_info(inode);
 
 	__ceph_remove_caps(ci);
 
@@ -935,6 +945,17 @@ int ceph_fill_inode(struct inode *inode, struct page *locked_page,
 
 	__ceph_update_quota(ci, iinfo->max_bytes, iinfo->max_files);
 
+#ifdef CONFIG_FS_ENCRYPTION
+	if (iinfo->fscrypt_auth_len && (inode->i_state & I_NEW)) {
+		kfree(ci->fscrypt_auth);
+		ci->fscrypt_auth_len = iinfo->fscrypt_auth_len;
+		ci->fscrypt_auth = iinfo->fscrypt_auth;
+		iinfo->fscrypt_auth = NULL;
+		iinfo->fscrypt_auth_len = 0;
+		inode_set_flags(inode, S_ENCRYPTED, S_ENCRYPTED);
+	}
+#endif
+
 	if ((new_version || (new_issued & CEPH_CAP_AUTH_SHARED)) &&
 	    (issued & CEPH_CAP_AUTH_EXCL) == 0) {
 		inode->i_mode = mode;
@@ -2079,7 +2100,8 @@ static const struct inode_operations ceph_symlink_iops = {
 	.listxattr = ceph_listxattr,
 };
 
-int __ceph_setattr(struct inode *inode, struct iattr *attr)
+int __ceph_setattr(struct inode *inode, struct iattr *attr,
+		   struct ceph_iattr *cia)
 {
 	struct ceph_inode_info *ci = ceph_inode(inode);
 	unsigned int ia_valid = attr->ia_valid;
@@ -2119,6 +2141,43 @@ int __ceph_setattr(struct inode *inode, struct iattr *attr)
 	}
 
 	dout("setattr %p issued %s\n", inode, ceph_cap_string(issued));
+#if IS_ENABLED(CONFIG_FS_ENCRYPTION)
+	if (cia && cia->fscrypt_auth) {
+		u32 len = ceph_fscrypt_auth_len(cia->fscrypt_auth);
+
+		if (len > sizeof(*cia->fscrypt_auth)) {
+			err = -EINVAL;
+			spin_unlock(&ci->i_ceph_lock);
+			goto out;
+		}
+
+		dout("setattr %llx:%llx fscrypt_auth len %u to %u)\n",
+			ceph_vinop(inode), ci->fscrypt_auth_len, len);
+
+		/* It should never be re-set once set */
+		WARN_ON_ONCE(ci->fscrypt_auth);
+
+		if (issued & CEPH_CAP_AUTH_EXCL) {
+			dirtied |= CEPH_CAP_AUTH_EXCL;
+			kfree(ci->fscrypt_auth);
+			ci->fscrypt_auth = (u8 *)cia->fscrypt_auth;
+			ci->fscrypt_auth_len = len;
+		} else if ((issued & CEPH_CAP_AUTH_SHARED) == 0 ||
+			   ci->fscrypt_auth_len != len ||
+			   memcmp(ci->fscrypt_auth, cia->fscrypt_auth, len)) {
+			req->r_fscrypt_auth = cia->fscrypt_auth;
+			mask |= CEPH_SETATTR_FSCRYPT_AUTH;
+			release |= CEPH_CAP_AUTH_SHARED;
+		}
+		cia->fscrypt_auth = NULL;
+	}
+#else
+	if (cia && cia->fscrypt_auth) {
+		err = -EINVAL;
+		spin_unlock(&ci->i_ceph_lock);
+		goto out;
+	}
+#endif /* CONFIG_FS_ENCRYPTION */
 
 	if (ia_valid & ATTR_UID) {
 		dout("setattr %p uid %d -> %d\n", inode,
@@ -2282,6 +2341,7 @@ int __ceph_setattr(struct inode *inode, struct iattr *attr)
 		req->r_stamp = attr->ia_ctime;
 		err = ceph_mdsc_do_request(mdsc, NULL, req);
 	}
+out:
 	dout("setattr %p result=%d (%s locally, %d remote)\n", inode, err,
 	     ceph_cap_string(dirtied), mask);
 
@@ -2322,7 +2382,7 @@ int ceph_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
 	    ceph_quota_is_max_bytes_exceeded(inode, attr->ia_size))
 		return -EDQUOT;
 
-	err = __ceph_setattr(inode, attr);
+	err = __ceph_setattr(inode, attr, NULL);
 
 	if (err >= 0 && (attr->ia_valid & ATTR_MODE))
 		err = posix_acl_chmod(&nop_mnt_idmap, dentry, attr->ia_mode);
diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index 77b5e7b2e5dd..c3927dab5a3b 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -15,6 +15,7 @@
 
 #include "super.h"
 #include "mds_client.h"
+#include "crypto.h"
 
 #include <linux/ceph/ceph_features.h>
 #include <linux/ceph/messenger.h>
@@ -184,8 +185,54 @@ static int parse_reply_info_in(void **p, void *end,
 			info->rsnaps = 0;
 		}
 
+		if (struct_v >= 5) {
+			u32 alen;
+
+			ceph_decode_32_safe(p, end, alen, bad);
+
+			while (alen--) {
+				u32 len;
+
+				/* key */
+				ceph_decode_32_safe(p, end, len, bad);
+				ceph_decode_skip_n(p, end, len, bad);
+				/* value */
+				ceph_decode_32_safe(p, end, len, bad);
+				ceph_decode_skip_n(p, end, len, bad);
+			}
+		}
+
+		/* fscrypt flag -- ignore */
+		if (struct_v >= 6)
+			ceph_decode_skip_8(p, end, bad);
+
+		info->fscrypt_auth = NULL;
+		info->fscrypt_auth_len = 0;
+		info->fscrypt_file = NULL;
+		info->fscrypt_file_len = 0;
+		if (struct_v >= 7) {
+			ceph_decode_32_safe(p, end, info->fscrypt_auth_len, bad);
+			if (info->fscrypt_auth_len) {
+				info->fscrypt_auth = kmalloc(info->fscrypt_auth_len,
+							     GFP_KERNEL);
+				if (!info->fscrypt_auth)
+					return -ENOMEM;
+				ceph_decode_copy_safe(p, end, info->fscrypt_auth,
+						      info->fscrypt_auth_len, bad);
+			}
+			ceph_decode_32_safe(p, end, info->fscrypt_file_len, bad);
+			if (info->fscrypt_file_len) {
+				info->fscrypt_file = kmalloc(info->fscrypt_file_len,
+							     GFP_KERNEL);
+				if (!info->fscrypt_file)
+					return -ENOMEM;
+				ceph_decode_copy_safe(p, end, info->fscrypt_file,
+						      info->fscrypt_file_len, bad);
+			}
+		}
 		*p = end;
 	} else {
+		/* legacy (unversioned) struct */
 		if (features & CEPH_FEATURE_MDS_INLINE_DATA) {
 			ceph_decode_64_safe(p, end, info->inline_version, bad);
 			ceph_decode_32_safe(p, end, info->inline_len, bad);
@@ -651,8 +698,21 @@ out_bad:
 
 static void destroy_reply_info(struct ceph_mds_reply_info_parsed *info)
 {
+	int i;
+
+	kfree(info->diri.fscrypt_auth);
+	kfree(info->diri.fscrypt_file);
+	kfree(info->targeti.fscrypt_auth);
+	kfree(info->targeti.fscrypt_file);
 	if (!info->dir_entries)
 		return;
+
+	for (i = 0; i < info->dir_nr; i++) {
+		struct ceph_mds_reply_dir_entry *rde = info->dir_entries + i;
+
+		kfree(rde->inode.fscrypt_auth);
+		kfree(rde->inode.fscrypt_file);
+	}
 	free_pages((unsigned long)info->dir_entries, get_order(info->dir_buf_size));
 }
 
@@ -966,6 +1026,7 @@ void ceph_mdsc_release_request(struct kref *kref)
 	put_cred(req->r_cred);
 	if (req->r_pagelist)
 		ceph_pagelist_release(req->r_pagelist);
+	kfree(req->r_fscrypt_auth);
 	put_request_session(req);
 	ceph_unreserve_caps(req->r_mdsc, &req->r_caps_reservation);
 	WARN_ON_ONCE(!list_empty(&req->r_wait));
@@ -2543,8 +2604,8 @@ static int set_request_path_attr(struct inode *rinode, struct dentry *rdentry,
 	return r;
 }
 
-static void encode_timestamp_and_gids(void **p,
-				      const struct ceph_mds_request *req)
+static void encode_mclientrequest_tail(void **p,
+				       const struct ceph_mds_request *req)
 {
 	struct ceph_timespec ts;
 	int i;
@@ -2557,6 +2618,20 @@ static void encode_timestamp_and_gids(void **p,
 	for (i = 0; i < req->r_cred->group_info->ngroups; i++)
 		ceph_encode_64(p, from_kgid(&init_user_ns,
 					    req->r_cred->group_info->gid[i]));
+
+	/* v5: altname (TODO: skip for now) */
+	ceph_encode_32(p, 0);
+
+	/* v6: fscrypt_auth and fscrypt_file */
+	if (req->r_fscrypt_auth) {
+		u32 authlen = ceph_fscrypt_auth_len(req->r_fscrypt_auth);
+
+		ceph_encode_32(p, authlen);
+		ceph_encode_copy(p, req->r_fscrypt_auth, authlen);
+	} else {
+		ceph_encode_32(p, 0);
+	}
+	ceph_encode_32(p, 0); // fscrypt_file for now
 }
 
 /*
@@ -2605,12 +2680,14 @@ static struct ceph_msg *create_request_message(struct ceph_mds_session *session,
 		goto out_free1;
 	}
 
+	/* head */
 	len = legacy ? sizeof(*head) : sizeof(struct ceph_mds_request_head);
-	len += pathlen1 + pathlen2 + 2*(1 + sizeof(u32) + sizeof(u64)) +
-		sizeof(struct ceph_timespec);
-	len += sizeof(u32) + (sizeof(u64) * req->r_cred->group_info->ngroups);
 
-	/* calculate (max) length for cap releases */
+	/* filepaths */
+	len += 2 * (1 + sizeof(u32) + sizeof(u64));
+	len += pathlen1 + pathlen2;
+
+	/* cap releases */
 	len += sizeof(struct ceph_mds_request_release) *
 		(!!req->r_inode_drop + !!req->r_dentry_drop +
 		 !!req->r_old_inode_drop + !!req->r_old_dentry_drop);
@@ -2620,6 +2697,25 @@ static struct ceph_msg *create_request_message(struct ceph_mds_session *session,
 	if (req->r_old_dentry_drop)
 		len += pathlen2;
 
+	/* MClientRequest tail */
+
+	/* req->r_stamp */
+	len += sizeof(struct ceph_timespec);
+
+	/* gid list */
+	len += sizeof(u32) + (sizeof(u64) * req->r_cred->group_info->ngroups);
+
+	/* alternate name */
+	len += sizeof(u32);	// TODO
+
+	/* fscrypt_auth */
+	len += sizeof(u32); // fscrypt_auth
+	if (req->r_fscrypt_auth)
+		len += ceph_fscrypt_auth_len(req->r_fscrypt_auth);
+
+	/* fscrypt_file */
+	len += sizeof(u32);
+
 	msg = ceph_msg_new2(CEPH_MSG_CLIENT_REQUEST, len, 1, GFP_NOFS, false);
 	if (!msg) {
 		msg = ERR_PTR(-ENOMEM);
@@ -2639,7 +2735,7 @@ static struct ceph_msg *create_request_message(struct ceph_mds_session *session,
 	} else {
 		struct ceph_mds_request_head *new_head = msg->front.iov_base;
 
-		msg->hdr.version = cpu_to_le16(4);
+		msg->hdr.version = cpu_to_le16(6);
 		new_head->version = cpu_to_le16(CEPH_MDS_REQUEST_HEAD_VERSION);
 		head = (struct ceph_mds_request_head_old *)&new_head->oldest_client_tid;
 		p = msg->front.iov_base + sizeof(*new_head);
@@ -2690,7 +2786,7 @@ static struct ceph_msg *create_request_message(struct ceph_mds_session *session,
 
 	head->num_releases = cpu_to_le16(releases);
 
-	encode_timestamp_and_gids(&p, req);
+	encode_mclientrequest_tail(&p, req);
 
 	if (WARN_ON_ONCE(p > end)) {
 		ceph_msg_put(msg);
@@ -2820,7 +2916,7 @@ static int __prepare_send_request(struct ceph_mds_session *session,
 		rhead->num_releases = 0;
 
 		p = msg->front.iov_base + req->r_request_release_offset;
-		encode_timestamp_and_gids(&p, req);
+		encode_mclientrequest_tail(&p, req);
 
 		msg->front.iov_len = p - msg->front.iov_base;
 		msg->hdr.front_len = cpu_to_le32(msg->front.iov_len);
diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h
index 3c1aa99c1876..a2e85fb5aab1 100644
--- a/fs/ceph/mds_client.h
+++ b/fs/ceph/mds_client.h
@@ -86,6 +86,10 @@ struct ceph_mds_reply_info_in {
 	s32 dir_pin;
 	struct ceph_timespec btime;
 	struct ceph_timespec snap_btime;
+	u8 *fscrypt_auth;
+	u8 *fscrypt_file;
+	u32 fscrypt_auth_len;
+	u32 fscrypt_file_len;
 	u64 rsnaps;
 	u64 change_attr;
 };
@@ -278,6 +282,9 @@ struct ceph_mds_request {
 	struct mutex r_fill_mutex;
 
 	union ceph_mds_request_args r_args;
+
+	struct ceph_fscrypt_auth *r_fscrypt_auth;
+
 	int r_fmode;        /* file mode, if expecting cap */
 	int r_request_release_offset;
 	const struct cred *r_cred;
diff --git a/fs/ceph/super.c b/fs/ceph/super.c
index d6a1790f6923..c4ab2db85ef0 100644
--- a/fs/ceph/super.c
+++ b/fs/ceph/super.c
@@ -20,6 +20,7 @@
 #include "super.h"
 #include "mds_client.h"
 #include "cache.h"
+#include "crypto.h"
 
 #include <linux/ceph/ceph_features.h>
 #include <linux/ceph/decode.h>
@@ -1135,6 +1136,8 @@ static int ceph_set_super(struct super_block *s, struct fs_context *fc)
 	s->s_time_max = U32_MAX;
 	s->s_flags |= SB_NODIRATIME | SB_NOATIME;
 
+	ceph_fscrypt_set_ops(s);
+
 	ret = set_anon_super_fc(s, fc);
 	if (ret != 0)
 		fsc->sb = NULL;
diff --git a/fs/ceph/super.h b/fs/ceph/super.h
index eb621bb276bd..3a39a9b3bc33 100644
--- a/fs/ceph/super.h
+++ b/fs/ceph/super.h
@@ -450,6 +450,13 @@ struct ceph_inode_info {
 
 	struct work_struct i_work;
 	unsigned long  i_work_mask;
+
+#ifdef CONFIG_FS_ENCRYPTION
+	u32 fscrypt_auth_len;
+	u32 fscrypt_file_len;
+	u8 *fscrypt_auth;
+	u8 *fscrypt_file;
+#endif
 };
 
 struct ceph_netfs_request_data {
@@ -1073,7 +1080,13 @@ static inline int ceph_do_getattr(struct inode *inode, int mask, bool force)
 }
 extern int ceph_permission(struct mnt_idmap *idmap,
 			   struct inode *inode, int mask);
-extern int __ceph_setattr(struct inode *inode, struct iattr *attr);
+
+struct ceph_iattr {
+	struct ceph_fscrypt_auth	*fscrypt_auth;
+};
+
+extern int __ceph_setattr(struct inode *inode, struct iattr *attr,
+			  struct ceph_iattr *cia);
 extern int ceph_setattr(struct mnt_idmap *idmap,
 			struct dentry *dentry, struct iattr *attr);
 extern int ceph_getattr(struct mnt_idmap *idmap,
diff --git a/include/linux/ceph/ceph_fs.h b/include/linux/ceph/ceph_fs.h
index 49586ff26152..45f8ce61e103 100644
--- a/include/linux/ceph/ceph_fs.h
+++ b/include/linux/ceph/ceph_fs.h
@@ -359,14 +359,19 @@ enum {
 
 extern const char *ceph_mds_op_name(int op);
 
-
-#define CEPH_SETATTR_MODE   1
-#define CEPH_SETATTR_UID    2
-#define CEPH_SETATTR_GID    4
-#define CEPH_SETATTR_MTIME  8
-#define CEPH_SETATTR_ATIME 16
-#define CEPH_SETATTR_SIZE  32
-#define CEPH_SETATTR_CTIME 64
+#define CEPH_SETATTR_MODE              (1 << 0)
+#define CEPH_SETATTR_UID               (1 << 1)
+#define CEPH_SETATTR_GID               (1 << 2)
+#define CEPH_SETATTR_MTIME             (1 << 3)
+#define CEPH_SETATTR_ATIME             (1 << 4)
+#define CEPH_SETATTR_SIZE              (1 << 5)
+#define CEPH_SETATTR_CTIME             (1 << 6)
+#define CEPH_SETATTR_MTIME_NOW         (1 << 7)
+#define CEPH_SETATTR_ATIME_NOW         (1 << 8)
+#define CEPH_SETATTR_BTIME             (1 << 9)
+#define CEPH_SETATTR_KILL_SGUID        (1 << 10)
+#define CEPH_SETATTR_FSCRYPT_AUTH      (1 << 11)
+#define CEPH_SETATTR_FSCRYPT_FILE      (1 << 12)
 
 /*
  * Ceph setxattr request flags.
-- 
cgit v1.2.3


From 69dd3b3930f96b624228000921f417fb0919a6ab Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@kernel.org>
Date: Thu, 25 Aug 2022 09:31:05 -0400
Subject: libceph: add CEPH_OSD_OP_ASSERT_VER support
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

...and record the user_version in the reply in a new field in
ceph_osd_request, so we can populate the assert_ver appropriately.
Shuffle the fields a bit too so that the new field fits in an
existing hole on x86_64.

Signed-off-by: Jeff Layton <jlayton@kernel.org>
Reviewed-by: Xiubo Li <xiubli@redhat.com>
Reviewed-and-tested-by: Luís Henriques <lhenriques@suse.de>
Reviewed-by: Milind Changire <mchangir@redhat.com>
Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
---
 include/linux/ceph/osd_client.h | 6 +++++-
 include/linux/ceph/rados.h      | 4 ++++
 net/ceph/osd_client.c           | 5 +++++
 3 files changed, 14 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h
index 8f5d2b5bbba2..bf9823956758 100644
--- a/include/linux/ceph/osd_client.h
+++ b/include/linux/ceph/osd_client.h
@@ -198,6 +198,9 @@ struct ceph_osd_req_op {
 			u32 src_fadvise_flags;
 			struct ceph_osd_data osd_data;
 		} copy_from;
+		struct {
+			u64 ver;
+		} assert_ver;
 	};
 };
 
@@ -252,6 +255,7 @@ struct ceph_osd_request {
 	struct ceph_osd_client *r_osdc;
 	struct kref       r_kref;
 	bool              r_mempool;
+	bool		  r_linger;           /* don't resend on failure */
 	struct completion r_completion;       /* private to osd_client.c */
 	ceph_osdc_callback_t r_callback;
 
@@ -264,9 +268,9 @@ struct ceph_osd_request {
 	struct ceph_snap_context *r_snapc;    /* for writes */
 	struct timespec64 r_mtime;            /* ditto */
 	u64 r_data_offset;                    /* ditto */
-	bool r_linger;                        /* don't resend on failure */
 
 	/* internal */
+	u64 r_version;			      /* data version sent in reply */
 	unsigned long r_stamp;                /* jiffies, send or check time */
 	unsigned long r_start_stamp;          /* jiffies */
 	ktime_t r_start_latency;              /* ktime_t */
diff --git a/include/linux/ceph/rados.h b/include/linux/ceph/rados.h
index 43a7a1573b51..73c3efbec36c 100644
--- a/include/linux/ceph/rados.h
+++ b/include/linux/ceph/rados.h
@@ -523,6 +523,10 @@ struct ceph_osd_op {
 		struct {
 			__le64 cookie;
 		} __attribute__ ((packed)) notify;
+		struct {
+			__le64 unused;
+			__le64 ver;
+		} __attribute__ ((packed)) assert_ver;
 		struct {
 			__le64 offset, length;
 			__le64 src_offset;
diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c
index 684faf8553de..7f159e40cf9c 100644
--- a/net/ceph/osd_client.c
+++ b/net/ceph/osd_client.c
@@ -1048,6 +1048,10 @@ static u32 osd_req_encode_op(struct ceph_osd_op *dst,
 		dst->copy_from.src_fadvise_flags =
 			cpu_to_le32(src->copy_from.src_fadvise_flags);
 		break;
+	case CEPH_OSD_OP_ASSERT_VER:
+		dst->assert_ver.unused = cpu_to_le64(0);
+		dst->assert_ver.ver = cpu_to_le64(src->assert_ver.ver);
+		break;
 	default:
 		pr_err("unsupported osd opcode %s\n",
 			ceph_osd_op_name(src->op));
@@ -3859,6 +3863,7 @@ static void handle_reply(struct ceph_osd *osd, struct ceph_msg *msg)
 	 * one (type of) reply back.
 	 */
 	WARN_ON(!(m.flags & CEPH_OSD_FLAG_ONDISK));
+	req->r_version = m.user_version;
 	req->r_result = m.result ?: data_len;
 	finish_request(req);
 	mutex_unlock(&osd->lock);
-- 
cgit v1.2.3


From 781589e40ac5f929f58824c15448e1ba49c3ac32 Mon Sep 17 00:00:00 2001
From: Guenter Roeck <linux@roeck-us.net>
Date: Thu, 17 Aug 2023 15:55:31 -0700
Subject: rtc: Add support for limited alarm timer offsets

Some alarm timers are based on time offsets, not on absolute times.
In some situations, the amount of time that can be scheduled in the
future is limited. This may result in a refusal to suspend the system,
causing substantial battery drain.

Some RTC alarm drivers remedy the situation by setting the alarm time
to the maximum supported time if a request for an out-of-range timeout
is made. This is not really desirable since it may result in unexpected
early wakeups.

To reduce the impact of this problem, let RTC drivers report the maximum
supported alarm timer offset. The code setting alarm timers can then
decide if it wants to reject setting alarm timers to a larger value, if it
wants to implement recurring alarms until the actually requested alarm
time is met, or if it wants to accept the limited alarm time.

Only introduce the necessary variable into struct rtc_device.
Code to set and use the variable will follow with subsequent patches.

Cc: Brian Norris <briannorris@chromium.org>
Signed-off-by: Guenter Roeck <linux@roeck-us.net>
Link: https://lore.kernel.org/r/20230817225537.4053865-2-linux@roeck-us.net
Signed-off-by: Alexandre Belloni <alexandre.belloni@bootlin.com>
---
 include/linux/rtc.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/rtc.h b/include/linux/rtc.h
index 1fd9c6a21ebe..4c0bcbeb1f00 100644
--- a/include/linux/rtc.h
+++ b/include/linux/rtc.h
@@ -146,6 +146,7 @@ struct rtc_device {
 
 	time64_t range_min;
 	timeu64_t range_max;
+	timeu64_t alarm_offset_max;
 	time64_t start_secs;
 	time64_t offset_secs;
 	bool set_start_time;
-- 
cgit v1.2.3


From 35d8dbbb25add265a880ab0dc48a229f06b08325 Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rafael.j.wysocki@intel.com>
Date: Tue, 29 Aug 2023 20:46:31 +0200
Subject: thermal: core: Drop unused .get_trip_*() callbacks

After recent changes in the ACPI thermal driver and in the Intel DTS
IOSF thermal driver, all thermal zone drivers are expected to use trip
tables for initialization and none of them should implement
.get_trip_type(), .get_trip_temp() or .get_trip_hyst() callbacks, so
drop these callbacks entirely from the core.

Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/thermal/thermal_core.c |  2 +-
 drivers/thermal/thermal_trip.c | 24 +++---------------------
 include/linux/thermal.h        |  4 ----
 3 files changed, 4 insertions(+), 26 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/thermal/thermal_core.c b/drivers/thermal/thermal_core.c
index a59700593d32..0bdde1ab5d8b 100644
--- a/drivers/thermal/thermal_core.c
+++ b/drivers/thermal/thermal_core.c
@@ -1266,7 +1266,7 @@ thermal_zone_device_register_with_trips(const char *type, struct thermal_trip *t
 		return ERR_PTR(-EINVAL);
 	}
 
-	if (num_trips > 0 && (!ops->get_trip_type || !ops->get_trip_temp) && !trips)
+	if (num_trips > 0 && !trips)
 		return ERR_PTR(-EINVAL);
 
 	if (!thermal_class)
diff --git a/drivers/thermal/thermal_trip.c b/drivers/thermal/thermal_trip.c
index 53115cfdfd42..024e2e365a26 100644
--- a/drivers/thermal/thermal_trip.c
+++ b/drivers/thermal/thermal_trip.c
@@ -101,29 +101,11 @@ void __thermal_zone_set_trips(struct thermal_zone_device *tz)
 int __thermal_zone_get_trip(struct thermal_zone_device *tz, int trip_id,
 			    struct thermal_trip *trip)
 {
-	int ret;
-
-	if (!tz || trip_id < 0 || trip_id >= tz->num_trips || !trip)
+	if (!tz || !tz->trips || trip_id < 0 || trip_id >= tz->num_trips || !trip)
 		return -EINVAL;
 
-	if (tz->trips) {
-		*trip = tz->trips[trip_id];
-		return 0;
-	}
-
-	if (tz->ops->get_trip_hyst) {
-		ret = tz->ops->get_trip_hyst(tz, trip_id, &trip->hysteresis);
-		if (ret)
-			return ret;
-	} else {
-		trip->hysteresis = 0;
-	}
-
-	ret = tz->ops->get_trip_temp(tz, trip_id, &trip->temperature);
-	if (ret)
-		return ret;
-
-	return tz->ops->get_trip_type(tz, trip_id, &trip->type);
+	*trip = tz->trips[trip_id];
+	return 0;
 }
 EXPORT_SYMBOL_GPL(__thermal_zone_get_trip);
 
diff --git a/include/linux/thermal.h b/include/linux/thermal.h
index b449a46766f5..e6bd3b7c9eda 100644
--- a/include/linux/thermal.h
+++ b/include/linux/thermal.h
@@ -62,11 +62,7 @@ struct thermal_zone_device_ops {
 	int (*set_trips) (struct thermal_zone_device *, int, int);
 	int (*change_mode) (struct thermal_zone_device *,
 		enum thermal_device_mode);
-	int (*get_trip_type) (struct thermal_zone_device *, int,
-		enum thermal_trip_type *);
-	int (*get_trip_temp) (struct thermal_zone_device *, int, int *);
 	int (*set_trip_temp) (struct thermal_zone_device *, int, int);
-	int (*get_trip_hyst) (struct thermal_zone_device *, int, int *);
 	int (*set_trip_hyst) (struct thermal_zone_device *, int, int);
 	int (*get_crit_temp) (struct thermal_zone_device *, int *);
 	int (*set_emul_temp) (struct thermal_zone_device *, int);
-- 
cgit v1.2.3


From 8289d810ea85755a9d22f75785806cb34eecd5e5 Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rafael.j.wysocki@intel.com>
Date: Tue, 22 Aug 2023 13:40:06 +0200
Subject: thermal: core: Rework .get_trend() thermal zone callback

Passing a struct thermal_trip pointer instead of a trip index to the
.get_trend() thermal zone callback allows one of its 2 implementations,
the thermal_get_trend() function in the ACPI thermal driver, to be
simplified quite a bit, and the other implementation of it in the
ti-soc-thermal driver does not even use the relevant callback argument.

For this reason, change the .get_trend() thermal zone callback
definition and adjust the related code accordingly.

Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/acpi/thermal.c                             | 41 ++++++++++------------
 drivers/thermal/thermal_core.h                     |  2 +-
 drivers/thermal/thermal_helpers.c                  |  3 +-
 drivers/thermal/ti-soc-thermal/ti-thermal-common.c |  3 +-
 include/linux/thermal.h                            | 30 ++++++++--------
 5 files changed, 38 insertions(+), 41 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/acpi/thermal.c b/drivers/acpi/thermal.c
index 419590f41ed5..f14e68266ccd 100644
--- a/drivers/acpi/thermal.c
+++ b/drivers/acpi/thermal.c
@@ -492,26 +492,22 @@ static int thermal_get_temp(struct thermal_zone_device *thermal, int *temp)
 }
 
 static int thermal_get_trend(struct thermal_zone_device *thermal,
-			     int trip_index, enum thermal_trend *trend)
+			     struct thermal_trip *trip,
+			     enum thermal_trend *trend)
 {
 	struct acpi_thermal *tz = thermal_zone_device_priv(thermal);
 	struct acpi_thermal_trip *acpi_trip;
-	int t, i;
+	int t;
 
-	if (!tz || trip_index < 0)
+	if (!tz || !trip)
 		return -EINVAL;
 
-	if (tz->trips.critical.valid)
-		trip_index--;
-
-	if (tz->trips.hot.valid)
-		trip_index--;
-
-	if (trip_index < 0)
+	acpi_trip = trip->priv;
+	if (!acpi_trip || !acpi_trip->valid)
 		return -EINVAL;
 
-	acpi_trip = &tz->trips.passive.trip;
-	if (acpi_trip->valid && !trip_index--) {
+	switch (trip->type) {
+	case THERMAL_TRIP_PASSIVE:
 		t = tz->trips.passive.tc1 * (tz->temperature -
 						tz->last_temperature) +
 			tz->trips.passive.tc2 * (tz->temperature -
@@ -524,19 +520,18 @@ static int thermal_get_trend(struct thermal_zone_device *thermal,
 			*trend = THERMAL_TREND_STABLE;
 
 		return 0;
-	}
-
-	t = acpi_thermal_temp(tz, tz->temperature);
 
-	for (i = 0; i < ACPI_THERMAL_MAX_ACTIVE; i++) {
-		acpi_trip = &tz->trips.active[i].trip;
-		if (acpi_trip->valid && !trip_index--) {
-			if (t > acpi_thermal_temp(tz, acpi_trip->temperature)) {
-				*trend = THERMAL_TREND_RAISING;
-				return 0;
-			}
+	case THERMAL_TRIP_ACTIVE:
+		t = acpi_thermal_temp(tz, tz->temperature);
+		if (t <= trip->temperature)
 			break;
-		}
+
+		*trend = THERMAL_TREND_RAISING;
+
+		return 0;
+
+	default:
+		break;
 	}
 
 	return -EINVAL;
diff --git a/drivers/thermal/thermal_core.h b/drivers/thermal/thermal_core.h
index 04513f9fbfa1..de884bea28b6 100644
--- a/drivers/thermal/thermal_core.h
+++ b/drivers/thermal/thermal_core.h
@@ -70,7 +70,7 @@ static inline bool cdev_is_power_actor(struct thermal_cooling_device *cdev)
 void thermal_cdev_update(struct thermal_cooling_device *);
 void __thermal_cdev_update(struct thermal_cooling_device *cdev);
 
-int get_tz_trend(struct thermal_zone_device *tz, int trip);
+int get_tz_trend(struct thermal_zone_device *tz, int trip_index);
 
 struct thermal_instance *
 get_thermal_instance(struct thermal_zone_device *tz,
diff --git a/drivers/thermal/thermal_helpers.c b/drivers/thermal/thermal_helpers.c
index cfba0965a22d..4d66372c9629 100644
--- a/drivers/thermal/thermal_helpers.c
+++ b/drivers/thermal/thermal_helpers.c
@@ -22,8 +22,9 @@
 #include "thermal_core.h"
 #include "thermal_trace.h"
 
-int get_tz_trend(struct thermal_zone_device *tz, int trip)
+int get_tz_trend(struct thermal_zone_device *tz, int trip_index)
 {
+	struct thermal_trip *trip = tz->trips ? &tz->trips[trip_index] : NULL;
 	enum thermal_trend trend;
 
 	if (tz->emul_temperature || !tz->ops->get_trend ||
diff --git a/drivers/thermal/ti-soc-thermal/ti-thermal-common.c b/drivers/thermal/ti-soc-thermal/ti-thermal-common.c
index d414a4b7a94a..6ba2613627e1 100644
--- a/drivers/thermal/ti-soc-thermal/ti-thermal-common.c
+++ b/drivers/thermal/ti-soc-thermal/ti-thermal-common.c
@@ -109,7 +109,8 @@ static inline int __ti_thermal_get_temp(struct thermal_zone_device *tz, int *tem
 	return ret;
 }
 
-static int __ti_thermal_get_trend(struct thermal_zone_device *tz, int trip, enum thermal_trend *trend)
+static int __ti_thermal_get_trend(struct thermal_zone_device *tz,
+				  struct thermal_trip *trip, enum thermal_trend *trend)
 {
 	struct ti_thermal_data *data = thermal_zone_device_priv(tz);
 	struct ti_bandgap *bgp;
diff --git a/include/linux/thermal.h b/include/linux/thermal.h
index e6bd3b7c9eda..eb17495c8acc 100644
--- a/include/linux/thermal.h
+++ b/include/linux/thermal.h
@@ -53,6 +53,20 @@ enum thermal_notify_event {
 	THERMAL_EVENT_KEEP_ALIVE, /* Request for user space handler to respond */
 };
 
+/**
+ * struct thermal_trip - representation of a point in temperature domain
+ * @temperature: temperature value in miliCelsius
+ * @hysteresis: relative hysteresis in miliCelsius
+ * @type: trip point type
+ * @priv: pointer to driver data associated with this trip
+ */
+struct thermal_trip {
+	int temperature;
+	int hysteresis;
+	enum thermal_trip_type type;
+	void *priv;
+};
+
 struct thermal_zone_device_ops {
 	int (*bind) (struct thermal_zone_device *,
 		     struct thermal_cooling_device *);
@@ -66,26 +80,12 @@ struct thermal_zone_device_ops {
 	int (*set_trip_hyst) (struct thermal_zone_device *, int, int);
 	int (*get_crit_temp) (struct thermal_zone_device *, int *);
 	int (*set_emul_temp) (struct thermal_zone_device *, int);
-	int (*get_trend) (struct thermal_zone_device *, int,
+	int (*get_trend) (struct thermal_zone_device *, struct thermal_trip *,
 			  enum thermal_trend *);
 	void (*hot)(struct thermal_zone_device *);
 	void (*critical)(struct thermal_zone_device *);
 };
 
-/**
- * struct thermal_trip - representation of a point in temperature domain
- * @temperature: temperature value in miliCelsius
- * @hysteresis: relative hysteresis in miliCelsius
- * @type: trip point type
- * @priv: pointer to driver data associated with this trip
- */
-struct thermal_trip {
-	int temperature;
-	int hysteresis;
-	enum thermal_trip_type type;
-	void *priv;
-};
-
 struct thermal_cooling_device_ops {
 	int (*get_max_state) (struct thermal_cooling_device *, unsigned long *);
 	int (*get_cur_state) (struct thermal_cooling_device *, unsigned long *);
-- 
cgit v1.2.3


From 218a06a79d9a98a96ef46bb003d4d8adb0962056 Mon Sep 17 00:00:00 2001
From: Jie Zhan <zhanjie9@hisilicon.com>
Date: Tue, 22 Aug 2023 20:48:37 +0800
Subject: cpufreq: Support per-policy performance boost

The boost control currently applies to the whole system.  However, users
may prefer to boost a subset of cores in order to provide prioritized
performance to workloads running on the boosted cores.

Enable per-policy boost by adding a 'boost' sysfs interface under each
policy path.  This can be found at:

	/sys/devices/system/cpu/cpufreq/policy<*>/boost

Same to the global boost switch, writing 1/0 to the per-policy 'boost'
enables/disables boost on a cpufreq policy respectively.

The user view of global and per-policy boost controls should be:

1. Enabling global boost initially enables boost on all policies, and
per-policy boost can then be enabled or disabled individually, given that
the platform does support so.

2. Disabling global boost makes the per-policy boost interface illegal.

Signed-off-by: Jie Zhan <zhanjie9@hisilicon.com>
Reviewed-by: Wei Xu <xuwei5@hisilicon.com>
Acked-by: Viresh Kumar <viresh.kumar@linaro.org>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/cpufreq/cpufreq.c | 43 +++++++++++++++++++++++++++++++++++++++++++
 include/linux/cpufreq.h   |  3 +++
 2 files changed, 46 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c
index d9eb036245c3..60ed89000e82 100644
--- a/drivers/cpufreq/cpufreq.c
+++ b/drivers/cpufreq/cpufreq.c
@@ -86,6 +86,7 @@ static void cpufreq_governor_limits(struct cpufreq_policy *policy);
 static int cpufreq_set_policy(struct cpufreq_policy *policy,
 			      struct cpufreq_governor *new_gov,
 			      unsigned int new_pol);
+static bool cpufreq_boost_supported(void);
 
 /*
  * Two notifier lists: the "policy" list is involved in the
@@ -623,6 +624,40 @@ static ssize_t store_boost(struct kobject *kobj, struct kobj_attribute *attr,
 }
 define_one_global_rw(boost);
 
+static ssize_t show_local_boost(struct cpufreq_policy *policy, char *buf)
+{
+	return sysfs_emit(buf, "%d\n", policy->boost_enabled);
+}
+
+static ssize_t store_local_boost(struct cpufreq_policy *policy,
+				 const char *buf, size_t count)
+{
+	int ret, enable;
+
+	ret = kstrtoint(buf, 10, &enable);
+	if (ret || enable < 0 || enable > 1)
+		return -EINVAL;
+
+	if (!cpufreq_driver->boost_enabled)
+		return -EINVAL;
+
+	if (policy->boost_enabled == enable)
+		return count;
+
+	cpus_read_lock();
+	ret = cpufreq_driver->set_boost(policy, enable);
+	cpus_read_unlock();
+
+	if (ret)
+		return ret;
+
+	policy->boost_enabled = enable;
+
+	return count;
+}
+
+static struct freq_attr local_boost = __ATTR(boost, 0644, show_local_boost, store_local_boost);
+
 static struct cpufreq_governor *find_governor(const char *str_governor)
 {
 	struct cpufreq_governor *t;
@@ -1057,6 +1092,12 @@ static int cpufreq_add_dev_interface(struct cpufreq_policy *policy)
 			return ret;
 	}
 
+	if (cpufreq_boost_supported()) {
+		ret = sysfs_create_file(&policy->kobj, &local_boost.attr);
+		if (ret)
+			return ret;
+	}
+
 	return 0;
 }
 
@@ -2718,6 +2759,8 @@ int cpufreq_boost_trigger_state(int state)
 		ret = cpufreq_driver->set_boost(policy, state);
 		if (ret)
 			goto err_reset_state;
+
+		policy->boost_enabled = state;
 	}
 	cpus_read_unlock();
 
diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h
index 43b363a99215..71d186d6933a 100644
--- a/include/linux/cpufreq.h
+++ b/include/linux/cpufreq.h
@@ -141,6 +141,9 @@ struct cpufreq_policy {
 	 */
 	bool			dvfs_possible_from_any_cpu;
 
+	/* Per policy boost enabled flag. */
+	bool			boost_enabled;
+
 	 /* Cached frequency lookup from cpufreq_driver_resolve_freq. */
 	unsigned int cached_target_freq;
 	unsigned int cached_resolved_idx;
-- 
cgit v1.2.3


From 7e9be1124dbe7888907e82cab20164578e3f9ab7 Mon Sep 17 00:00:00 2001
From: Phil Sutter <phil@nwl.cc>
Date: Tue, 29 Aug 2023 19:51:57 +0200
Subject: netfilter: nf_tables: Audit log setelem reset

Since set element reset is not integrated into nf_tables' transaction
logic, an explicit log call is needed, similar to NFT_MSG_GETOBJ_RESET
handling.

For the sake of simplicity, catchall element reset will always generate
a dedicated log entry. This relieves nf_tables_dump_set() from having to
adjust the logged element count depending on whether a catchall element
was found or not.

Fixes: 079cd633219d7 ("netfilter: nf_tables: Introduce NFT_MSG_GETSETELEM_RESET")
Signed-off-by: Phil Sutter <phil@nwl.cc>
Reviewed-by: Richard Guy Briggs <rgb@redhat.com>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/linux/audit.h         |  1 +
 kernel/auditsc.c              |  1 +
 net/netfilter/nf_tables_api.c | 31 ++++++++++++++++++++++++++++---
 3 files changed, 30 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/audit.h b/include/linux/audit.h
index 6a3a9e122bb5..192bf03aacc5 100644
--- a/include/linux/audit.h
+++ b/include/linux/audit.h
@@ -117,6 +117,7 @@ enum audit_nfcfgop {
 	AUDIT_NFT_OP_OBJ_RESET,
 	AUDIT_NFT_OP_FLOWTABLE_REGISTER,
 	AUDIT_NFT_OP_FLOWTABLE_UNREGISTER,
+	AUDIT_NFT_OP_SETELEM_RESET,
 	AUDIT_NFT_OP_INVALID,
 };
 
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index addeed3df15d..38481e318197 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -143,6 +143,7 @@ static const struct audit_nfcfgop_tab audit_nfcfgs[] = {
 	{ AUDIT_NFT_OP_OBJ_RESET,		"nft_reset_obj"		   },
 	{ AUDIT_NFT_OP_FLOWTABLE_REGISTER,	"nft_register_flowtable"   },
 	{ AUDIT_NFT_OP_FLOWTABLE_UNREGISTER,	"nft_unregister_flowtable" },
+	{ AUDIT_NFT_OP_SETELEM_RESET,		"nft_reset_setelem"        },
 	{ AUDIT_NFT_OP_INVALID,			"nft_invalid"		   },
 };
 
diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c
index 41b826dff6f5..361e98e71692 100644
--- a/net/netfilter/nf_tables_api.c
+++ b/net/netfilter/nf_tables_api.c
@@ -102,6 +102,7 @@ static const u8 nft2audit_op[NFT_MSG_MAX] = { // enum nf_tables_msg_types
 	[NFT_MSG_NEWFLOWTABLE]	= AUDIT_NFT_OP_FLOWTABLE_REGISTER,
 	[NFT_MSG_GETFLOWTABLE]	= AUDIT_NFT_OP_INVALID,
 	[NFT_MSG_DELFLOWTABLE]	= AUDIT_NFT_OP_FLOWTABLE_UNREGISTER,
+	[NFT_MSG_GETSETELEM_RESET] = AUDIT_NFT_OP_SETELEM_RESET,
 };
 
 static void nft_validate_state_update(struct nft_table *table, u8 new_validate_state)
@@ -5624,13 +5625,25 @@ static int nf_tables_dump_setelem(const struct nft_ctx *ctx,
 	return nf_tables_fill_setelem(args->skb, set, elem, args->reset);
 }
 
+static void audit_log_nft_set_reset(const struct nft_table *table,
+				    unsigned int base_seq,
+				    unsigned int nentries)
+{
+	char *buf = kasprintf(GFP_ATOMIC, "%s:%u", table->name, base_seq);
+
+	audit_log_nfcfg(buf, table->family, nentries,
+			AUDIT_NFT_OP_SETELEM_RESET, GFP_ATOMIC);
+	kfree(buf);
+}
+
 struct nft_set_dump_ctx {
 	const struct nft_set	*set;
 	struct nft_ctx		ctx;
 };
 
 static int nft_set_catchall_dump(struct net *net, struct sk_buff *skb,
-				 const struct nft_set *set, bool reset)
+				 const struct nft_set *set, bool reset,
+				 unsigned int base_seq)
 {
 	struct nft_set_elem_catchall *catchall;
 	u8 genmask = nft_genmask_cur(net);
@@ -5646,6 +5659,8 @@ static int nft_set_catchall_dump(struct net *net, struct sk_buff *skb,
 
 		elem.priv = catchall->elem;
 		ret = nf_tables_fill_setelem(skb, set, &elem, reset);
+		if (reset && !ret)
+			audit_log_nft_set_reset(set->table, base_seq, 1);
 		break;
 	}
 
@@ -5725,12 +5740,17 @@ static int nf_tables_dump_set(struct sk_buff *skb, struct netlink_callback *cb)
 	set->ops->walk(&dump_ctx->ctx, set, &args.iter);
 
 	if (!args.iter.err && args.iter.count == cb->args[0])
-		args.iter.err = nft_set_catchall_dump(net, skb, set, reset);
+		args.iter.err = nft_set_catchall_dump(net, skb, set,
+						      reset, cb->seq);
 	rcu_read_unlock();
 
 	nla_nest_end(skb, nest);
 	nlmsg_end(skb, nlh);
 
+	if (reset && args.iter.count > args.iter.skip)
+		audit_log_nft_set_reset(table, cb->seq,
+					args.iter.count - args.iter.skip);
+
 	if (args.iter.err && args.iter.err != -EMSGSIZE)
 		return args.iter.err;
 	if (args.iter.count == cb->args[0])
@@ -5955,13 +5975,13 @@ static int nf_tables_getsetelem(struct sk_buff *skb,
 	struct netlink_ext_ack *extack = info->extack;
 	u8 genmask = nft_genmask_cur(info->net);
 	u8 family = info->nfmsg->nfgen_family;
+	int rem, err = 0, nelems = 0;
 	struct net *net = info->net;
 	struct nft_table *table;
 	struct nft_set *set;
 	struct nlattr *attr;
 	struct nft_ctx ctx;
 	bool reset = false;
-	int rem, err = 0;
 
 	table = nft_table_lookup(net, nla[NFTA_SET_ELEM_LIST_TABLE], family,
 				 genmask, 0);
@@ -6004,8 +6024,13 @@ static int nf_tables_getsetelem(struct sk_buff *skb,
 			NL_SET_BAD_ATTR(extack, attr);
 			break;
 		}
+		nelems++;
 	}
 
+	if (reset)
+		audit_log_nft_set_reset(table, nft_pernet(net)->base_seq,
+					nelems);
+
 	return err;
 }
 
-- 
cgit v1.2.3


From ea078ae9108e25fc881c84369f7c03931d22e555 Mon Sep 17 00:00:00 2001
From: Phil Sutter <phil@nwl.cc>
Date: Tue, 29 Aug 2023 19:51:58 +0200
Subject: netfilter: nf_tables: Audit log rule reset

Resetting rules' stateful data happens outside of the transaction logic,
so 'get' and 'dump' handlers have to emit audit log entries themselves.

Fixes: 8daa8fde3fc3f ("netfilter: nf_tables: Introduce NFT_MSG_GETRULE_RESET")
Signed-off-by: Phil Sutter <phil@nwl.cc>
Reviewed-by: Richard Guy Briggs <rgb@redhat.com>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/linux/audit.h         |  1 +
 kernel/auditsc.c              |  1 +
 net/netfilter/nf_tables_api.c | 18 ++++++++++++++++++
 3 files changed, 20 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/audit.h b/include/linux/audit.h
index 192bf03aacc5..51b1b7054a23 100644
--- a/include/linux/audit.h
+++ b/include/linux/audit.h
@@ -118,6 +118,7 @@ enum audit_nfcfgop {
 	AUDIT_NFT_OP_FLOWTABLE_REGISTER,
 	AUDIT_NFT_OP_FLOWTABLE_UNREGISTER,
 	AUDIT_NFT_OP_SETELEM_RESET,
+	AUDIT_NFT_OP_RULE_RESET,
 	AUDIT_NFT_OP_INVALID,
 };
 
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index 38481e318197..fc0c7c03eeab 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -144,6 +144,7 @@ static const struct audit_nfcfgop_tab audit_nfcfgs[] = {
 	{ AUDIT_NFT_OP_FLOWTABLE_REGISTER,	"nft_register_flowtable"   },
 	{ AUDIT_NFT_OP_FLOWTABLE_UNREGISTER,	"nft_unregister_flowtable" },
 	{ AUDIT_NFT_OP_SETELEM_RESET,		"nft_reset_setelem"        },
+	{ AUDIT_NFT_OP_RULE_RESET,		"nft_reset_rule"           },
 	{ AUDIT_NFT_OP_INVALID,			"nft_invalid"		   },
 };
 
diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c
index 361e98e71692..2c81cee858d6 100644
--- a/net/netfilter/nf_tables_api.c
+++ b/net/netfilter/nf_tables_api.c
@@ -3422,6 +3422,18 @@ err:
 	nfnetlink_set_err(ctx->net, ctx->portid, NFNLGRP_NFTABLES, -ENOBUFS);
 }
 
+static void audit_log_rule_reset(const struct nft_table *table,
+				 unsigned int base_seq,
+				 unsigned int nentries)
+{
+	char *buf = kasprintf(GFP_ATOMIC, "%s:%u",
+			      table->name, base_seq);
+
+	audit_log_nfcfg(buf, table->family, nentries,
+			AUDIT_NFT_OP_RULE_RESET, GFP_ATOMIC);
+	kfree(buf);
+}
+
 struct nft_rule_dump_ctx {
 	char *table;
 	char *chain;
@@ -3528,6 +3540,9 @@ static int nf_tables_dump_rules(struct sk_buff *skb,
 done:
 	rcu_read_unlock();
 
+	if (reset && idx > cb->args[0])
+		audit_log_rule_reset(table, cb->seq, idx - cb->args[0]);
+
 	cb->args[0] = idx;
 	return skb->len;
 }
@@ -3635,6 +3650,9 @@ static int nf_tables_getrule(struct sk_buff *skb, const struct nfnl_info *info,
 	if (err < 0)
 		goto err_fill_rule_info;
 
+	if (reset)
+		audit_log_rule_reset(table, nft_pernet(net)->base_seq, 1);
+
 	return nfnetlink_unicast(skb2, net, NETLINK_CB(skb).portid);
 
 err_fill_rule_info:
-- 
cgit v1.2.3


From 3af5ae22030cb59fab4fba35f5a2b62f47e14df9 Mon Sep 17 00:00:00 2001
From: Xiubo Li <xiubli@redhat.com>
Date: Tue, 25 Jul 2023 09:44:40 +0800
Subject: ceph: make members in struct ceph_mds_request_args_ext a union

In ceph mainline it will allow to set the btime in the setattr request
and just add a 'btime' member in the union 'ceph_mds_request_args' and
then bump up the header version to 4. That means the total size of union
'ceph_mds_request_args' will increase sizeof(struct ceph_timespec) bytes,
but in kclient it will increase the sizeof(setattr_ext) bytes for each
request.

Since the MDS will always depend on the header's vesion and front_len
members to decode the 'ceph_mds_request_head' struct, at the same time
kclient hasn't supported the 'btime' feature yet in setattr request,
so it's safe to do this change here.

This will save 48 bytes memories for each request.

Fixes: 4f1ddb1ea874 ("ceph: implement updated ceph_mds_request_head structure")
Signed-off-by: Xiubo Li <xiubli@redhat.com>
Reviewed-by: Milind Changire <mchangir@redhat.com>
Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
---
 include/linux/ceph/ceph_fs.h | 24 +++++++++++++-----------
 1 file changed, 13 insertions(+), 11 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/ceph/ceph_fs.h b/include/linux/ceph/ceph_fs.h
index 45f8ce61e103..ae44812eb6d4 100644
--- a/include/linux/ceph/ceph_fs.h
+++ b/include/linux/ceph/ceph_fs.h
@@ -467,17 +467,19 @@ union ceph_mds_request_args {
 } __attribute__ ((packed));
 
 union ceph_mds_request_args_ext {
-	union ceph_mds_request_args old;
-	struct {
-		__le32 mode;
-		__le32 uid;
-		__le32 gid;
-		struct ceph_timespec mtime;
-		struct ceph_timespec atime;
-		__le64 size, old_size;       /* old_size needed by truncate */
-		__le32 mask;                 /* CEPH_SETATTR_* */
-		struct ceph_timespec btime;
-	} __attribute__ ((packed)) setattr_ext;
+	union {
+		union ceph_mds_request_args old;
+		struct {
+			__le32 mode;
+			__le32 uid;
+			__le32 gid;
+			struct ceph_timespec mtime;
+			struct ceph_timespec atime;
+			__le64 size, old_size;       /* old_size needed by truncate */
+			__le32 mask;                 /* CEPH_SETATTR_* */
+			struct ceph_timespec btime;
+		} __attribute__ ((packed)) setattr_ext;
+	};
 };
 
 #define CEPH_MDS_FLAG_REPLAY		1 /* this is a replayed op */
-- 
cgit v1.2.3


From ce0d5bd3a6c176f9a3bf867624a07119dd4d0878 Mon Sep 17 00:00:00 2001
From: Xiubo Li <xiubli@redhat.com>
Date: Tue, 25 Jul 2023 17:51:59 +0800
Subject: ceph: make num_fwd and num_retry to __u32

The num_fwd in MClientRequestForward is int32_t, while the num_fwd
in ceph_mds_request_head is __u8. This is buggy when the num_fwd
is larger than 256 it will always be truncate to 0 again. But the
client couldn't recoginize this.

This will make them to __u32 instead. Because the old cephs will
directly copy the raw memories when decoding the reqeust's head,
so we need to make sure this kclient will be compatible with old
cephs. For newer cephs they will decode the requests depending
the version, which will be much simpler and easier to extend new
members.

Link: https://tracker.ceph.com/issues/62145
Signed-off-by: Xiubo Li <xiubli@redhat.com>
Reviewed-by: Alexander Mikhalitsyn <aleksandr.mikhalitsyn@canonical.com>
Reviewed-by: Milind Changire <mchangir@redhat.com>
Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
---
 fs/ceph/mds_client.c         | 189 +++++++++++++++++++++++--------------------
 fs/ceph/mds_client.h         |   4 +-
 include/linux/ceph/ceph_fs.h |  23 +++++-
 3 files changed, 126 insertions(+), 90 deletions(-)

(limited to 'include/linux')

diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index 04a881343e43..615db141b6c4 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -2840,6 +2840,18 @@ static void encode_mclientrequest_tail(void **p,
 	}
 }
 
+static struct ceph_mds_request_head_legacy *
+find_legacy_request_head(void *p, u64 features)
+{
+	bool legacy = !(features & CEPH_FEATURE_FS_BTIME);
+	struct ceph_mds_request_head_old *ohead;
+
+	if (legacy)
+		return (struct ceph_mds_request_head_legacy *)p;
+	ohead = (struct ceph_mds_request_head_old *)p;
+	return (struct ceph_mds_request_head_legacy *)&ohead->oldest_client_tid;
+}
+
 /*
  * called under mdsc->mutex
  */
@@ -2850,7 +2862,7 @@ static struct ceph_msg *create_request_message(struct ceph_mds_session *session,
 	int mds = session->s_mds;
 	struct ceph_mds_client *mdsc = session->s_mdsc;
 	struct ceph_msg *msg;
-	struct ceph_mds_request_head_old *head;
+	struct ceph_mds_request_head_legacy *lhead;
 	const char *path1 = NULL;
 	const char *path2 = NULL;
 	u64 ino1 = 0, ino2 = 0;
@@ -2862,6 +2874,8 @@ static struct ceph_msg *create_request_message(struct ceph_mds_session *session,
 	void *p, *end;
 	int ret;
 	bool legacy = !(session->s_con.peer_features & CEPH_FEATURE_FS_BTIME);
+	bool old_version = !test_bit(CEPHFS_FEATURE_32BITS_RETRY_FWD,
+				     &session->s_features);
 
 	ret = set_request_path_attr(req->r_inode, req->r_dentry,
 			      req->r_parent, req->r_path1, req->r_ino1.ino,
@@ -2893,7 +2907,19 @@ static struct ceph_msg *create_request_message(struct ceph_mds_session *session,
 		goto out_free2;
 	}
 
-	len = legacy ? sizeof(*head) : sizeof(struct ceph_mds_request_head);
+	/*
+	 * For old cephs without supporting the 32bit retry/fwd feature
+	 * it will copy the raw memories directly when decoding the
+	 * requests. While new cephs will decode the head depending the
+	 * version member, so we need to make sure it will be compatible
+	 * with them both.
+	 */
+	if (legacy)
+		len = sizeof(struct ceph_mds_request_head_legacy);
+	else if (old_version)
+		len = sizeof(struct ceph_mds_request_head_old);
+	else
+		len = sizeof(struct ceph_mds_request_head);
 
 	/* filepaths */
 	len += 2 * (1 + sizeof(u32) + sizeof(u64));
@@ -2938,33 +2964,40 @@ static struct ceph_msg *create_request_message(struct ceph_mds_session *session,
 
 	msg->hdr.tid = cpu_to_le64(req->r_tid);
 
+	lhead = find_legacy_request_head(msg->front.iov_base,
+					 session->s_con.peer_features);
+
 	/*
-	 * The old ceph_mds_request_head didn't contain a version field, and
+	 * The ceph_mds_request_head_legacy didn't contain a version field, and
 	 * one was added when we moved the message version from 3->4.
 	 */
 	if (legacy) {
 		msg->hdr.version = cpu_to_le16(3);
-		head = msg->front.iov_base;
-		p = msg->front.iov_base + sizeof(*head);
+		p = msg->front.iov_base + sizeof(*lhead);
+	} else if (old_version) {
+		struct ceph_mds_request_head_old *ohead = msg->front.iov_base;
+
+		msg->hdr.version = cpu_to_le16(4);
+		ohead->version = cpu_to_le16(1);
+		p = msg->front.iov_base + sizeof(*ohead);
 	} else {
-		struct ceph_mds_request_head *new_head = msg->front.iov_base;
+		struct ceph_mds_request_head *nhead = msg->front.iov_base;
 
 		msg->hdr.version = cpu_to_le16(6);
-		new_head->version = cpu_to_le16(CEPH_MDS_REQUEST_HEAD_VERSION);
-		head = (struct ceph_mds_request_head_old *)&new_head->oldest_client_tid;
-		p = msg->front.iov_base + sizeof(*new_head);
+		nhead->version = cpu_to_le16(CEPH_MDS_REQUEST_HEAD_VERSION);
+		p = msg->front.iov_base + sizeof(*nhead);
 	}
 
 	end = msg->front.iov_base + msg->front.iov_len;
 
-	head->mdsmap_epoch = cpu_to_le32(mdsc->mdsmap->m_epoch);
-	head->op = cpu_to_le32(req->r_op);
-	head->caller_uid = cpu_to_le32(from_kuid(&init_user_ns,
-						 req->r_cred->fsuid));
-	head->caller_gid = cpu_to_le32(from_kgid(&init_user_ns,
-						 req->r_cred->fsgid));
-	head->ino = cpu_to_le64(req->r_deleg_ino);
-	head->args = req->r_args;
+	lhead->mdsmap_epoch = cpu_to_le32(mdsc->mdsmap->m_epoch);
+	lhead->op = cpu_to_le32(req->r_op);
+	lhead->caller_uid = cpu_to_le32(from_kuid(&init_user_ns,
+						  req->r_cred->fsuid));
+	lhead->caller_gid = cpu_to_le32(from_kgid(&init_user_ns,
+						  req->r_cred->fsgid));
+	lhead->ino = cpu_to_le64(req->r_deleg_ino);
+	lhead->args = req->r_args;
 
 	ceph_encode_filepath(&p, end, ino1, path1);
 	ceph_encode_filepath(&p, end, ino2, path2);
@@ -3006,7 +3039,7 @@ static struct ceph_msg *create_request_message(struct ceph_mds_session *session,
 		p = msg->front.iov_base + req->r_request_release_offset;
 	}
 
-	head->num_releases = cpu_to_le16(releases);
+	lhead->num_releases = cpu_to_le16(releases);
 
 	encode_mclientrequest_tail(&p, req);
 
@@ -3057,18 +3090,6 @@ static void complete_request(struct ceph_mds_client *mdsc,
 	complete_all(&req->r_completion);
 }
 
-static struct ceph_mds_request_head_old *
-find_old_request_head(void *p, u64 features)
-{
-	bool legacy = !(features & CEPH_FEATURE_FS_BTIME);
-	struct ceph_mds_request_head *new_head;
-
-	if (legacy)
-		return (struct ceph_mds_request_head_old *)p;
-	new_head = (struct ceph_mds_request_head *)p;
-	return (struct ceph_mds_request_head_old *)&new_head->oldest_client_tid;
-}
-
 /*
  * called under mdsc->mutex
  */
@@ -3078,29 +3099,28 @@ static int __prepare_send_request(struct ceph_mds_session *session,
 {
 	int mds = session->s_mds;
 	struct ceph_mds_client *mdsc = session->s_mdsc;
-	struct ceph_mds_request_head_old *rhead;
+	struct ceph_mds_request_head_legacy *lhead;
+	struct ceph_mds_request_head *nhead;
 	struct ceph_msg *msg;
-	int flags = 0, max_retry;
+	int flags = 0, old_max_retry;
+	bool old_version = !test_bit(CEPHFS_FEATURE_32BITS_RETRY_FWD,
+				     &session->s_features);
 
 	/*
-	 * The type of 'r_attempts' in kernel 'ceph_mds_request'
-	 * is 'int', while in 'ceph_mds_request_head' the type of
-	 * 'num_retry' is '__u8'. So in case the request retries
-	 *  exceeding 256 times, the MDS will receive a incorrect
-	 *  retry seq.
-	 *
-	 * In this case it's ususally a bug in MDS and continue
-	 * retrying the request makes no sense.
-	 *
-	 * In future this could be fixed in ceph code, so avoid
-	 * using the hardcode here.
+	 * Avoid inifinite retrying after overflow. The client will
+	 * increase the retry count and if the MDS is old version,
+	 * so we limit to retry at most 256 times.
 	 */
-	max_retry = sizeof_field(struct ceph_mds_request_head, num_retry);
-	max_retry = 1 << (max_retry * BITS_PER_BYTE);
-	if (req->r_attempts >= max_retry) {
-		pr_warn_ratelimited("%s request tid %llu seq overflow\n",
-				    __func__, req->r_tid);
-		return -EMULTIHOP;
+	if (req->r_attempts) {
+	       old_max_retry = sizeof_field(struct ceph_mds_request_head_old,
+					    num_retry);
+	       old_max_retry = 1 << (old_max_retry * BITS_PER_BYTE);
+	       if ((old_version && req->r_attempts >= old_max_retry) ||
+		   ((uint32_t)req->r_attempts >= U32_MAX)) {
+			pr_warn_ratelimited("%s request tid %llu seq overflow\n",
+					    __func__, req->r_tid);
+			return -EMULTIHOP;
+	       }
 	}
 
 	req->r_attempts++;
@@ -3126,20 +3146,24 @@ static int __prepare_send_request(struct ceph_mds_session *session,
 		 * d_move mangles the src name.
 		 */
 		msg = req->r_request;
-		rhead = find_old_request_head(msg->front.iov_base,
-					      session->s_con.peer_features);
+		lhead = find_legacy_request_head(msg->front.iov_base,
+						 session->s_con.peer_features);
 
-		flags = le32_to_cpu(rhead->flags);
+		flags = le32_to_cpu(lhead->flags);
 		flags |= CEPH_MDS_FLAG_REPLAY;
-		rhead->flags = cpu_to_le32(flags);
+		lhead->flags = cpu_to_le32(flags);
 
 		if (req->r_target_inode)
-			rhead->ino = cpu_to_le64(ceph_ino(req->r_target_inode));
+			lhead->ino = cpu_to_le64(ceph_ino(req->r_target_inode));
 
-		rhead->num_retry = req->r_attempts - 1;
+		lhead->num_retry = req->r_attempts - 1;
+		if (!old_version) {
+			nhead = (struct ceph_mds_request_head*)msg->front.iov_base;
+			nhead->ext_num_retry = cpu_to_le32(req->r_attempts - 1);
+		}
 
 		/* remove cap/dentry releases from message */
-		rhead->num_releases = 0;
+		lhead->num_releases = 0;
 
 		p = msg->front.iov_base + req->r_request_release_offset;
 		encode_mclientrequest_tail(&p, req);
@@ -3160,18 +3184,23 @@ static int __prepare_send_request(struct ceph_mds_session *session,
 	}
 	req->r_request = msg;
 
-	rhead = find_old_request_head(msg->front.iov_base,
-				      session->s_con.peer_features);
-	rhead->oldest_client_tid = cpu_to_le64(__get_oldest_tid(mdsc));
+	lhead = find_legacy_request_head(msg->front.iov_base,
+					 session->s_con.peer_features);
+	lhead->oldest_client_tid = cpu_to_le64(__get_oldest_tid(mdsc));
 	if (test_bit(CEPH_MDS_R_GOT_UNSAFE, &req->r_req_flags))
 		flags |= CEPH_MDS_FLAG_REPLAY;
 	if (test_bit(CEPH_MDS_R_ASYNC, &req->r_req_flags))
 		flags |= CEPH_MDS_FLAG_ASYNC;
 	if (req->r_parent)
 		flags |= CEPH_MDS_FLAG_WANT_DENTRY;
-	rhead->flags = cpu_to_le32(flags);
-	rhead->num_fwd = req->r_num_fwd;
-	rhead->num_retry = req->r_attempts - 1;
+	lhead->flags = cpu_to_le32(flags);
+	lhead->num_fwd = req->r_num_fwd;
+	lhead->num_retry = req->r_attempts - 1;
+	if (!old_version) {
+		nhead = (struct ceph_mds_request_head*)msg->front.iov_base;
+		nhead->ext_num_fwd = cpu_to_le32(req->r_num_fwd);
+		nhead->ext_num_retry = cpu_to_le32(req->r_attempts - 1);
+	}
 
 	dout(" r_parent = %p\n", req->r_parent);
 	return 0;
@@ -3830,33 +3859,21 @@ static void handle_forward(struct ceph_mds_client *mdsc,
 	if (test_bit(CEPH_MDS_R_ABORTED, &req->r_req_flags)) {
 		dout("forward tid %llu aborted, unregistering\n", tid);
 		__unregister_request(mdsc, req);
-	} else if (fwd_seq <= req->r_num_fwd) {
+	} else if (fwd_seq <= req->r_num_fwd || (uint32_t)fwd_seq >= U32_MAX) {
 		/*
-		 * The type of 'num_fwd' in ceph 'MClientRequestForward'
-		 * is 'int32_t', while in 'ceph_mds_request_head' the
-		 * type is '__u8'. So in case the request bounces between
-		 * MDSes exceeding 256 times, the client will get stuck.
-		 *
-		 * In this case it's ususally a bug in MDS and continue
-		 * bouncing the request makes no sense.
+		 * Avoid inifinite retrying after overflow.
 		 *
-		 * In future this could be fixed in ceph code, so avoid
-		 * using the hardcode here.
+		 * The MDS will increase the fwd count and in client side
+		 * if the num_fwd is less than the one saved in request
+		 * that means the MDS is an old version and overflowed of
+		 * 8 bits.
 		 */
-		int max = sizeof_field(struct ceph_mds_request_head, num_fwd);
-		max = 1 << (max * BITS_PER_BYTE);
-		if (req->r_num_fwd >= max) {
-			mutex_lock(&req->r_fill_mutex);
-			req->r_err = -EMULTIHOP;
-			set_bit(CEPH_MDS_R_ABORTED, &req->r_req_flags);
-			mutex_unlock(&req->r_fill_mutex);
-			aborted = true;
-			pr_warn_ratelimited("forward tid %llu seq overflow\n",
-					    tid);
-		} else {
-			dout("forward tid %llu to mds%d - old seq %d <= %d\n",
-			     tid, next_mds, req->r_num_fwd, fwd_seq);
-		}
+		mutex_lock(&req->r_fill_mutex);
+		req->r_err = -EMULTIHOP;
+		set_bit(CEPH_MDS_R_ABORTED, &req->r_req_flags);
+		mutex_unlock(&req->r_fill_mutex);
+		aborted = true;
+		pr_warn_ratelimited("forward tid %llu seq overflow\n", tid);
 	} else {
 		/* resend. forward race not possible; mds would drop */
 		dout("forward tid %llu to mds%d (we resend)\n", tid, next_mds);
diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h
index 1fa0f78b7b79..5a3714bdd64a 100644
--- a/fs/ceph/mds_client.h
+++ b/fs/ceph/mds_client.h
@@ -32,8 +32,9 @@ enum ceph_feature_type {
 	CEPHFS_FEATURE_ALTERNATE_NAME,
 	CEPHFS_FEATURE_NOTIFY_SESSION_STATE,
 	CEPHFS_FEATURE_OP_GETVXATTR,
+	CEPHFS_FEATURE_32BITS_RETRY_FWD,
 
-	CEPHFS_FEATURE_MAX = CEPHFS_FEATURE_OP_GETVXATTR,
+	CEPHFS_FEATURE_MAX = CEPHFS_FEATURE_32BITS_RETRY_FWD,
 };
 
 #define CEPHFS_FEATURES_CLIENT_SUPPORTED {	\
@@ -47,6 +48,7 @@ enum ceph_feature_type {
 	CEPHFS_FEATURE_ALTERNATE_NAME,		\
 	CEPHFS_FEATURE_NOTIFY_SESSION_STATE,	\
 	CEPHFS_FEATURE_OP_GETVXATTR,		\
+	CEPHFS_FEATURE_32BITS_RETRY_FWD,	\
 }
 
 /*
diff --git a/include/linux/ceph/ceph_fs.h b/include/linux/ceph/ceph_fs.h
index ae44812eb6d4..5f2301ee88bc 100644
--- a/include/linux/ceph/ceph_fs.h
+++ b/include/linux/ceph/ceph_fs.h
@@ -486,7 +486,7 @@ union ceph_mds_request_args_ext {
 #define CEPH_MDS_FLAG_WANT_DENTRY	2 /* want dentry in reply */
 #define CEPH_MDS_FLAG_ASYNC		4 /* request is asynchronous */
 
-struct ceph_mds_request_head_old {
+struct ceph_mds_request_head_legacy {
 	__le64 oldest_client_tid;
 	__le32 mdsmap_epoch;           /* on client */
 	__le32 flags;                  /* CEPH_MDS_FLAG_* */
@@ -499,9 +499,9 @@ struct ceph_mds_request_head_old {
 	union ceph_mds_request_args args;
 } __attribute__ ((packed));
 
-#define CEPH_MDS_REQUEST_HEAD_VERSION  1
+#define CEPH_MDS_REQUEST_HEAD_VERSION  2
 
-struct ceph_mds_request_head {
+struct ceph_mds_request_head_old {
 	__le16 version;                /* struct version */
 	__le64 oldest_client_tid;
 	__le32 mdsmap_epoch;           /* on client */
@@ -515,6 +515,23 @@ struct ceph_mds_request_head {
 	union ceph_mds_request_args_ext args;
 } __attribute__ ((packed));
 
+struct ceph_mds_request_head {
+	__le16 version;                /* struct version */
+	__le64 oldest_client_tid;
+	__le32 mdsmap_epoch;           /* on client */
+	__le32 flags;                  /* CEPH_MDS_FLAG_* */
+	__u8 num_retry, num_fwd;       /* legacy count retry and fwd attempts */
+	__le16 num_releases;           /* # include cap/lease release records */
+	__le32 op;                     /* mds op code */
+	__le32 caller_uid, caller_gid;
+	__le64 ino;                    /* use this ino for openc, mkdir, mknod,
+					  etc. (if replaying) */
+	union ceph_mds_request_args_ext args;
+
+	__le32 ext_num_retry;          /* new count retry attempts */
+	__le32 ext_num_fwd;            /* new count fwd attempts */
+} __attribute__ ((packed));
+
 /* cap/lease release record */
 struct ceph_mds_request_release {
 	__le64 ino, cap_id;            /* ino and unique cap id */
-- 
cgit v1.2.3


From 52e322eda3d475614210efbc0f2793a1da9d367a Mon Sep 17 00:00:00 2001
From: Sean Christopherson <seanjc@google.com>
Date: Fri, 28 Jul 2023 17:47:22 -0700
Subject: KVM: x86/mmu: BUG() in rmap helpers iff
 CONFIG_BUG_ON_DATA_CORRUPTION=y

Introduce KVM_BUG_ON_DATA_CORRUPTION() and use it in the low-level rmap
helpers to convert the existing BUG()s to WARN_ON_ONCE() when the kernel
is built with CONFIG_BUG_ON_DATA_CORRUPTION=n, i.e. does NOT want to BUG()
on corruption of host kernel data structures.  Environments that don't
have infrastructure to automatically capture crash dumps, i.e. aren't
likely to enable CONFIG_BUG_ON_DATA_CORRUPTION=y, are typically better
served overall by WARN-and-continue behavior (for the kernel, the VM is
dead regardless), as a BUG() while holding mmu_lock all but guarantees
the _best_ case scenario is a panic().

Make the BUG()s conditional instead of removing/replacing them entirely as
there's a non-zero chance (though by no means a guarantee) that the damage
isn't contained to the target VM, e.g. if no rmap is found for a SPTE then
KVM may be double-zapping the SPTE, i.e. has already freed the memory the
SPTE pointed at and thus KVM is reading/writing memory that KVM no longer
owns.

Link: https://lore.kernel.org/all/20221129191237.31447-1-mizhang@google.com
Suggested-by: Mingwei Zhang <mizhang@google.com>
Cc: David Matlack <dmatlack@google.com>
Cc: Jim Mattson <jmattson@google.com>
Reviewed-by: Mingwei Zhang <mizhang@google.com>
Link: https://lore.kernel.org/r/20230729004722.1056172-13-seanjc@google.com
Signed-off-by: Sean Christopherson <seanjc@google.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/kvm/mmu/mmu.c   | 21 ++++++++++-----------
 include/linux/kvm_host.h | 19 +++++++++++++++++++
 2 files changed, 29 insertions(+), 11 deletions(-)

(limited to 'include/linux')

diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index f518dd569a14..0420944da242 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -973,7 +973,7 @@ static void pte_list_desc_remove_entry(struct kvm *kvm,
 	 * when adding an entry and the previous head is full, and heads are
 	 * removed (this flow) when they become empty.
 	 */
-	BUG_ON(j < 0);
+	KVM_BUG_ON_DATA_CORRUPTION(j < 0, kvm);
 
 	/*
 	 * Replace the to-be-freed SPTE with the last valid entry from the head
@@ -1004,14 +1004,13 @@ static void pte_list_remove(struct kvm *kvm, u64 *spte,
 	struct pte_list_desc *desc;
 	int i;
 
-	if (!rmap_head->val) {
-		pr_err("%s: %p 0->BUG\n", __func__, spte);
-		BUG();
-	} else if (!(rmap_head->val & 1)) {
-		if ((u64 *)rmap_head->val != spte) {
-			pr_err("%s:  %p 1->BUG\n", __func__, spte);
-			BUG();
-		}
+	if (KVM_BUG_ON_DATA_CORRUPTION(!rmap_head->val, kvm))
+		return;
+
+	if (!(rmap_head->val & 1)) {
+		if (KVM_BUG_ON_DATA_CORRUPTION((u64 *)rmap_head->val != spte, kvm))
+			return;
+
 		rmap_head->val = 0;
 	} else {
 		desc = (struct pte_list_desc *)(rmap_head->val & ~1ul);
@@ -1025,8 +1024,8 @@ static void pte_list_remove(struct kvm *kvm, u64 *spte,
 			}
 			desc = desc->more;
 		}
-		pr_err("%s: %p many->many\n", __func__, spte);
-		BUG();
+
+		KVM_BUG_ON_DATA_CORRUPTION(true, kvm);
 	}
 }
 
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 1b583f35547e..fb6c6109fdca 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -867,6 +867,25 @@ static inline void kvm_vm_bugged(struct kvm *kvm)
 	unlikely(__ret);					\
 })
 
+/*
+ * Note, "data corruption" refers to corruption of host kernel data structures,
+ * not guest data.  Guest data corruption, suspected or confirmed, that is tied
+ * and contained to a single VM should *never* BUG() and potentially panic the
+ * host, i.e. use this variant of KVM_BUG() if and only if a KVM data structure
+ * is corrupted and that corruption can have a cascading effect to other parts
+ * of the hosts and/or to other VMs.
+ */
+#define KVM_BUG_ON_DATA_CORRUPTION(cond, kvm)			\
+({								\
+	bool __ret = !!(cond);					\
+								\
+	if (IS_ENABLED(CONFIG_BUG_ON_DATA_CORRUPTION))		\
+		BUG_ON(__ret);					\
+	else if (WARN_ON_ONCE(__ret && !(kvm)->vm_bugged))	\
+		kvm_vm_bugged(kvm);				\
+	unlikely(__ret);					\
+})
+
 static inline void kvm_vcpu_srcu_read_lock(struct kvm_vcpu *vcpu)
 {
 #ifdef CONFIG_PROVE_RCU
-- 
cgit v1.2.3


From 6a86b5b5cd76d2734304a0173f5f01aa8aa2025e Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Tue, 29 Aug 2023 22:53:52 +0200
Subject: bpf: Annotate bpf_long_memcpy with data_race

syzbot reported a data race splat between two processes trying to
update the same BPF map value via syscall on different CPUs:

  BUG: KCSAN: data-race in bpf_percpu_array_update / bpf_percpu_array_update

  write to 0xffffe8fffe7425d8 of 8 bytes by task 8257 on cpu 1:
   bpf_long_memcpy include/linux/bpf.h:428 [inline]
   bpf_obj_memcpy include/linux/bpf.h:441 [inline]
   copy_map_value_long include/linux/bpf.h:464 [inline]
   bpf_percpu_array_update+0x3bb/0x500 kernel/bpf/arraymap.c:380
   bpf_map_update_value+0x190/0x370 kernel/bpf/syscall.c:175
   generic_map_update_batch+0x3ae/0x4f0 kernel/bpf/syscall.c:1749
   bpf_map_do_batch+0x2df/0x3d0 kernel/bpf/syscall.c:4648
   __sys_bpf+0x28a/0x780
   __do_sys_bpf kernel/bpf/syscall.c:5241 [inline]
   __se_sys_bpf kernel/bpf/syscall.c:5239 [inline]
   __x64_sys_bpf+0x43/0x50 kernel/bpf/syscall.c:5239
   do_syscall_x64 arch/x86/entry/common.c:50 [inline]
   do_syscall_64+0x41/0xc0 arch/x86/entry/common.c:80
   entry_SYSCALL_64_after_hwframe+0x63/0xcd

  write to 0xffffe8fffe7425d8 of 8 bytes by task 8268 on cpu 0:
   bpf_long_memcpy include/linux/bpf.h:428 [inline]
   bpf_obj_memcpy include/linux/bpf.h:441 [inline]
   copy_map_value_long include/linux/bpf.h:464 [inline]
   bpf_percpu_array_update+0x3bb/0x500 kernel/bpf/arraymap.c:380
   bpf_map_update_value+0x190/0x370 kernel/bpf/syscall.c:175
   generic_map_update_batch+0x3ae/0x4f0 kernel/bpf/syscall.c:1749
   bpf_map_do_batch+0x2df/0x3d0 kernel/bpf/syscall.c:4648
   __sys_bpf+0x28a/0x780
   __do_sys_bpf kernel/bpf/syscall.c:5241 [inline]
   __se_sys_bpf kernel/bpf/syscall.c:5239 [inline]
   __x64_sys_bpf+0x43/0x50 kernel/bpf/syscall.c:5239
   do_syscall_x64 arch/x86/entry/common.c:50 [inline]
   do_syscall_64+0x41/0xc0 arch/x86/entry/common.c:80
   entry_SYSCALL_64_after_hwframe+0x63/0xcd

  value changed: 0x0000000000000000 -> 0xfffffff000002788

The bpf_long_memcpy is used with 8-byte aligned pointers, power-of-8 size
and forced to use long read/writes to try to atomically copy long counters.
It is best-effort only and no barriers are here since it _will_ race with
concurrent updates from BPF programs. The bpf_long_memcpy() is called from
bpf(2) syscall. Marco suggested that the best way to make this known to
KCSAN would be to use data_race() annotation.

Reported-by: syzbot+97522333291430dd277f@syzkaller.appspotmail.com
Suggested-by: Marco Elver <elver@google.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Marco Elver <elver@google.com>
Link: https://lore.kernel.org/bpf/000000000000d87a7f06040c970c@google.com
Link: https://lore.kernel.org/bpf/57628f7a15e20d502247c3b55fceb1cb2b31f266.1693342186.git.daniel@iogearbox.net
---
 include/linux/bpf.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 12596af59c00..024e8b28c34b 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -438,7 +438,7 @@ static inline void bpf_long_memcpy(void *dst, const void *src, u32 size)
 
 	size /= sizeof(long);
 	while (size--)
-		*ldst++ = *lsrc++;
+		data_race(*ldst++ = *lsrc++);
 }
 
 /* copy everything but bpf_spin_lock, bpf_timer, and kptrs. There could be one of each. */
-- 
cgit v1.2.3


From 8423be8926aa82cd2e28bba5cc96ccb72c7ce6be Mon Sep 17 00:00:00 2001
From: Sriram Yagnaraman <sriram.yagnaraman@est.tech>
Date: Thu, 31 Aug 2023 10:03:31 +0200
Subject: ipv6: ignore dst hint for multipath routes

Route hints when the nexthop is part of a multipath group causes packets
in the same receive batch to be sent to the same nexthop irrespective of
the multipath hash of the packet. So, do not extract route hint for
packets whose destination is part of a multipath group.

A new SKB flag IP6SKB_MULTIPATH is introduced for this purpose, set the
flag when route is looked up in fib6_select_path() and use it in
ip6_can_use_hint() to check for the existence of the flag.

Fixes: 197dbf24e360 ("ipv6: introduce and uses route look hints for list input.")
Signed-off-by: Sriram Yagnaraman <sriram.yagnaraman@est.tech>
Reviewed-by: Ido Schimmel <idosch@nvidia.com>
Reviewed-by: David Ahern <dsahern@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/ipv6.h | 1 +
 net/ipv6/ip6_input.c | 3 ++-
 net/ipv6/route.c     | 3 +++
 3 files changed, 6 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/ipv6.h b/include/linux/ipv6.h
index 5883551b1ee8..af8a771a053c 100644
--- a/include/linux/ipv6.h
+++ b/include/linux/ipv6.h
@@ -147,6 +147,7 @@ struct inet6_skb_parm {
 #define IP6SKB_JUMBOGRAM      128
 #define IP6SKB_SEG6	      256
 #define IP6SKB_FAKEJUMBO      512
+#define IP6SKB_MULTIPATH      1024
 };
 
 #if defined(CONFIG_NET_L3_MASTER_DEV)
diff --git a/net/ipv6/ip6_input.c b/net/ipv6/ip6_input.c
index d94041bb4287..b8378814532c 100644
--- a/net/ipv6/ip6_input.c
+++ b/net/ipv6/ip6_input.c
@@ -99,7 +99,8 @@ static bool ip6_can_use_hint(const struct sk_buff *skb,
 static struct sk_buff *ip6_extract_route_hint(const struct net *net,
 					      struct sk_buff *skb)
 {
-	if (fib6_routes_require_src(net) || fib6_has_custom_rules(net))
+	if (fib6_routes_require_src(net) || fib6_has_custom_rules(net) ||
+	    IP6CB(skb)->flags & IP6SKB_MULTIPATH)
 		return NULL;
 
 	return skb;
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index 846aec8e0093..01d6d352850a 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -423,6 +423,9 @@ void fib6_select_path(const struct net *net, struct fib6_result *res,
 	if (match->nh && have_oif_match && res->nh)
 		return;
 
+	if (skb)
+		IP6CB(skb)->flags |= IP6SKB_MULTIPATH;
+
 	/* We might have already computed the hash for ICMPv6 errors. In such
 	 * case it will always be non-zero. Otherwise now is the time to do it.
 	 */
-- 
cgit v1.2.3


From 6ad11bc6ed37c6371e9e13619862709b6529b43a Mon Sep 17 00:00:00 2001
From: Baruch Siach <baruch@tkos.co.il>
Date: Thu, 24 Aug 2023 14:38:08 +0300
Subject: rmap: remove anon_vma_link() nommu stub

anon_vma_link() is unused since commit 5beb49305251 ("mm: change anon_vma
linking to fix multi-process server scalability issue").

Link: https://lkml.kernel.org/r/cdce9b00c9ab15f6d02eddf40dcad537d3e9676f.1692877089.git.baruch@tkos.co.il
Signed-off-by: Baruch Siach <baruch@tkos.co.il>
Reviewed-by: David Hildenbrand <david@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/rmap.h | 1 -
 1 file changed, 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/rmap.h b/include/linux/rmap.h
index a3825ce81102..51cc21ebb568 100644
--- a/include/linux/rmap.h
+++ b/include/linux/rmap.h
@@ -479,7 +479,6 @@ struct anon_vma *folio_lock_anon_vma_read(struct folio *folio,
 
 #define anon_vma_init()		do {} while (0)
 #define anon_vma_prepare(vma)	(0)
-#define anon_vma_link(vma)	do {} while (0)
 
 static inline int folio_referenced(struct folio *folio, int is_locked,
 				  struct mem_cgroup *memcg,
-- 
cgit v1.2.3


From 719c5e37e99d2fd588d1c994284d17650a66354c Mon Sep 17 00:00:00 2001
From: Oleksij Rempel <o.rempel@pengutronix.de>
Date: Fri, 1 Sep 2023 06:53:23 +0200
Subject: net: phy: micrel: Correct bit assignments for phy_device flags

Previously, the defines for phy_device flags in the Micrel driver were
ambiguous in their representation. They were intended to be bit masks
but were mistakenly defined as bit positions. This led to the following
issues:

- MICREL_KSZ8_P1_ERRATA, designated for KSZ88xx switches, overlapped
  with MICREL_PHY_FXEN and MICREL_PHY_50MHZ_CLK.
- Due to this overlap, the code path for MICREL_PHY_FXEN, tailored for
  the KSZ8041 PHY, was not executed for KSZ88xx PHYs.
- Similarly, the code associated with MICREL_PHY_50MHZ_CLK wasn't
  triggered for KSZ88xx.

To rectify this, all three flags have now been explicitly converted to
use the `BIT()` macro, ensuring they are defined as bit masks and
preventing potential overlaps in the future.

Fixes: 49011e0c1555 ("net: phy: micrel: ksz886x/ksz8081: add cabletest support")
Signed-off-by: Oleksij Rempel <o.rempel@pengutronix.de>
Reviewed-by: Russell King (Oracle) <rmk+kernel@armlinux.org.uk>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/micrel_phy.h | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/micrel_phy.h b/include/linux/micrel_phy.h
index 8bef1ab62bba..322d87255984 100644
--- a/include/linux/micrel_phy.h
+++ b/include/linux/micrel_phy.h
@@ -41,9 +41,9 @@
 #define	PHY_ID_KSZ9477		0x00221631
 
 /* struct phy_device dev_flags definitions */
-#define MICREL_PHY_50MHZ_CLK	0x00000001
-#define MICREL_PHY_FXEN		0x00000002
-#define MICREL_KSZ8_P1_ERRATA	0x00000003
+#define MICREL_PHY_50MHZ_CLK	BIT(0)
+#define MICREL_PHY_FXEN		BIT(1)
+#define MICREL_KSZ8_P1_ERRATA	BIT(2)
 
 #define MICREL_KSZ9021_EXTREG_CTRL	0xB
 #define MICREL_KSZ9021_EXTREG_DATA_WRITE	0xC
-- 
cgit v1.2.3


From 0be7592885d7b4c20595c388adc13930b653b847 Mon Sep 17 00:00:00 2001
From: Nilesh Javali <njavali@marvell.com>
Date: Thu, 31 Aug 2023 16:51:45 +0530
Subject: scsi: qla2xxx: Correct endianness for rqstlen and rsplen

rqstlen and rsplen were changed to __le32 to fix sparse warnings:

drivers/scsi/qla2xxx/qla_nvme.c:402:30: warning: incorrect type in assignment (different base types)
drivers/scsi/qla2xxx/qla_nvme.c:402:30:    expected restricted __le32 [usertype] cmd_len
drivers/scsi/qla2xxx/qla_nvme.c:402:30:    got unsigned short [usertype] rsplen
drivers/scsi/qla2xxx/qla_nvme.c:507:30: warning: incorrect type in assignment (different base types)
drivers/scsi/qla2xxx/qla_nvme.c:507:30:    expected restricted __le32 [usertype] cmd_len
drivers/scsi/qla2xxx/qla_nvme.c:507:30:    got unsigned int [usertype] rqstlen
drivers/scsi/qla2xxx/qla_nvme.c:508:30: warning: incorrect type in assignment (different base types)
drivers/scsi/qla2xxx/qla_nvme.c:508:30:    expected restricted __le32 [usertype] rsp_len
drivers/scsi/qla2xxx/qla_nvme.c:508:30:    got unsigned int [usertype] rsplen

Correct the endianness in qla2xxx driver thus avoiding changes in
nvme-fc-driver.h.

Fixes: 875386b98857 ("scsi: qla2xxx: Add Unsolicited LS Request and Response Support for NVMe")
Signed-off-by: Nilesh Javali <njavali@marvell.com>
Link: https://lore.kernel.org/r/20230831112146.32595-1-njavali@marvell.com
Signed-off-by: Martin K. Petersen <martin.petersen@oracle.com>
---
 drivers/scsi/qla2xxx/qla_nvme.c | 10 +++++-----
 include/linux/nvme-fc-driver.h  |  6 +++---
 2 files changed, 8 insertions(+), 8 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/scsi/qla2xxx/qla_nvme.c b/drivers/scsi/qla2xxx/qla_nvme.c
index db753d712991..a8ddf356e662 100644
--- a/drivers/scsi/qla2xxx/qla_nvme.c
+++ b/drivers/scsi/qla2xxx/qla_nvme.c
@@ -399,14 +399,14 @@ static int qla_nvme_xmt_ls_rsp(struct nvme_fc_local_port *lport,
 	nvme->u.nvme.dl = 0;
 	nvme->u.nvme.timeout_sec = 0;
 	nvme->u.nvme.cmd_dma = fd_resp->rspdma;
-	nvme->u.nvme.cmd_len = fd_resp->rsplen;
+	nvme->u.nvme.cmd_len = cpu_to_le32(fd_resp->rsplen);
 	nvme->u.nvme.rsp_len = 0;
 	nvme->u.nvme.rsp_dma = 0;
 	nvme->u.nvme.exchange_address = uctx->exchange_address;
 	nvme->u.nvme.nport_handle = uctx->nport_handle;
 	nvme->u.nvme.ox_id = uctx->ox_id;
 	dma_sync_single_for_device(&ha->pdev->dev, nvme->u.nvme.cmd_dma,
-				   le32_to_cpu(fd_resp->rsplen), DMA_TO_DEVICE);
+				   fd_resp->rsplen, DMA_TO_DEVICE);
 
 	ql_dbg(ql_dbg_unsol, vha, 0x2122,
 	       "Unsol lsreq portid=%06x %8phC exchange_address 0x%x ox_id 0x%x hdl 0x%x\n",
@@ -504,13 +504,13 @@ static int qla_nvme_ls_req(struct nvme_fc_local_port *lport,
 	nvme->u.nvme.desc = fd;
 	nvme->u.nvme.dir = 0;
 	nvme->u.nvme.dl = 0;
-	nvme->u.nvme.cmd_len = fd->rqstlen;
-	nvme->u.nvme.rsp_len = fd->rsplen;
+	nvme->u.nvme.cmd_len = cpu_to_le32(fd->rqstlen);
+	nvme->u.nvme.rsp_len = cpu_to_le32(fd->rsplen);
 	nvme->u.nvme.rsp_dma = fd->rspdma;
 	nvme->u.nvme.timeout_sec = fd->timeout;
 	nvme->u.nvme.cmd_dma = fd->rqstdma;
 	dma_sync_single_for_device(&ha->pdev->dev, nvme->u.nvme.cmd_dma,
-	    le32_to_cpu(fd->rqstlen), DMA_TO_DEVICE);
+	    fd->rqstlen, DMA_TO_DEVICE);
 
 	rval = qla2x00_start_sp(sp);
 	if (rval != QLA_SUCCESS) {
diff --git a/include/linux/nvme-fc-driver.h b/include/linux/nvme-fc-driver.h
index f6ef8cf5d774..4109f1bd6128 100644
--- a/include/linux/nvme-fc-driver.h
+++ b/include/linux/nvme-fc-driver.h
@@ -53,10 +53,10 @@
 struct nvmefc_ls_req {
 	void			*rqstaddr;
 	dma_addr_t		rqstdma;
-	__le32			rqstlen;
+	u32			rqstlen;
 	void			*rspaddr;
 	dma_addr_t		rspdma;
-	__le32			rsplen;
+	u32			rsplen;
 	u32			timeout;
 
 	void			*private;
@@ -120,7 +120,7 @@ struct nvmefc_ls_req {
 struct nvmefc_ls_rsp {
 	void		*rspbuf;
 	dma_addr_t	rspdma;
-	__le32		rsplen;
+	u16		rsplen;
 
 	void (*done)(struct nvmefc_ls_rsp *rsp);
 	void		*nvme_fc_private;	/* LLDD is not to access !! */
-- 
cgit v1.2.3


From cf60ce92358da29f3b1e24005f7b748584b82753 Mon Sep 17 00:00:00 2001
From: Pavel Pisa <pisa@cmp.felk.cvut.cz>
Date: Mon, 4 Sep 2023 12:00:02 +0200
Subject: of: overlay: Fix of_overlay_fdt_apply prototype when
 !CONFIG_OF_OVERLAY

The of_overlay_fdt_apply has been changed but when CONFIG_OF_OVERLAY
support is not configured then old stub prototype is declared
by of.h header.

Signed-off-by: Pavel Pisa <pisa@cmp.felk.cvut.cz>
Fixes: 47284862bfc7 ("of: overlay: Extend of_overlay_fdt_apply() to specify the target node")
Acked-by: Marc Kleine-Budde <mkl@pengutronix.de>
Link: https://lore.kernel.org/r/20230904100002.7913-1-pisa@cmp.felk.cvut.cz
Signed-off-by: Rob Herring <robh@kernel.org>
---
 include/linux/of.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/of.h b/include/linux/of.h
index ed679819c279..6a9ddf20e79a 100644
--- a/include/linux/of.h
+++ b/include/linux/of.h
@@ -1676,8 +1676,8 @@ int of_overlay_notifier_unregister(struct notifier_block *nb);
 
 #else
 
-static inline int of_overlay_fdt_apply(void *overlay_fdt, u32 overlay_fdt_size,
-				       int *ovcs_id)
+static inline int of_overlay_fdt_apply(const void *overlay_fdt, u32 overlay_fdt_size,
+				       int *ovcs_id, struct device_node *target_base)
 {
 	return -ENOTSUPP;
 }
-- 
cgit v1.2.3


From 9ffa7b92bc768609243d79fb0c0125b9704c7061 Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rafael.j.wysocki@intel.com>
Date: Wed, 30 Aug 2023 18:11:38 +0200
Subject: thermal: core: Clean up headers of thermal zone registration
 functions

For consistency, add a missing thermal_zone_device_register_with_trips()
stub for the CONFIG_THERMAL unset case, specify argument names in all of
the thermal zone registration and unregistration function headers and
make all of them use white space consistently.

No intentional functional impact.

Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 include/linux/thermal.h | 53 ++++++++++++++++++++++++++++++++++---------------
 1 file changed, 37 insertions(+), 16 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/thermal.h b/include/linux/thermal.h
index eb17495c8acc..a30643104f3f 100644
--- a/include/linux/thermal.h
+++ b/include/linux/thermal.h
@@ -300,16 +300,24 @@ int thermal_acpi_critical_trip_temp(struct acpi_device *adev, int *ret_temp);
 #endif
 
 #ifdef CONFIG_THERMAL
-struct thermal_zone_device *thermal_zone_device_register(const char *, int, int,
-		void *, struct thermal_zone_device_ops *,
-		const struct thermal_zone_params *, int, int);
-
-void thermal_zone_device_unregister(struct thermal_zone_device *);
-
-struct thermal_zone_device *
-thermal_zone_device_register_with_trips(const char *, struct thermal_trip *, int, int,
-					void *, struct thermal_zone_device_ops *,
-					const struct thermal_zone_params *, int, int);
+struct thermal_zone_device *thermal_zone_device_register(
+					const char *type,
+					int num_trips, int mask,
+					void *devdata,
+					struct thermal_zone_device_ops *ops,
+					const struct thermal_zone_params *tzp,
+					int passive_delay, int polling_delay);
+
+struct thermal_zone_device *thermal_zone_device_register_with_trips(
+					const char *type,
+					struct thermal_trip *trips,
+					int num_trips, int mask,
+					void *devdata,
+					struct thermal_zone_device_ops *ops,
+					const struct thermal_zone_params *tzp,
+					int passive_delay, int polling_delay);
+
+void thermal_zone_device_unregister(struct thermal_zone_device *tz);
 
 void *thermal_zone_device_priv(struct thermal_zone_device *tzd);
 const char *thermal_zone_device_type(struct thermal_zone_device *tzd);
@@ -351,14 +359,27 @@ int thermal_zone_device_disable(struct thermal_zone_device *tz);
 void thermal_zone_device_critical(struct thermal_zone_device *tz);
 #else
 static inline struct thermal_zone_device *thermal_zone_device_register(
-	const char *type, int trips, int mask, void *devdata,
-	struct thermal_zone_device_ops *ops,
-	const struct thermal_zone_params *tzp,
-	int passive_delay, int polling_delay)
+					const char *type,
+					int num_trips, int mask,
+					void *devdata,
+					struct thermal_zone_device_ops *ops,
+					const struct thermal_zone_params *tzp,
+					int passive_delay, int polling_delay)
+{ return ERR_PTR(-ENODEV); }
+
+static inline struct thermal_zone_device *thermal_zone_device_register_with_trips(
+					const char *type,
+					struct thermal_trip *trips,
+					int num_trips, int mask,
+					void *devdata,
+					struct thermal_zone_device_ops *ops,
+					const struct thermal_zone_params *tzp,
+					int passive_delay, int polling_delay)
 { return ERR_PTR(-ENODEV); }
-static inline void thermal_zone_device_unregister(
-	struct thermal_zone_device *tz)
+
+static inline void thermal_zone_device_unregister(struct thermal_zone_device *tz)
 { }
+
 static inline struct thermal_cooling_device *
 thermal_cooling_device_register(const char *type, void *devdata,
 	const struct thermal_cooling_device_ops *ops)
-- 
cgit v1.2.3


From d332db8fc1a2dfb4738281b1d6d4ed20115dd9d3 Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rafael.j.wysocki@intel.com>
Date: Wed, 30 Aug 2023 18:13:35 +0200
Subject: thermal: core: Add function for registering tripless thermal zones

Multiple callers of thermal_zone_device_register() don't pass any trips
to it and they might use a shortened argument list for that, so add
a special function with fewer arguments for this purpose.

Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/thermal/thermal_core.c | 11 +++++++++++
 include/linux/thermal.h        | 13 +++++++++++++
 2 files changed, 24 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/thermal/thermal_core.c b/drivers/thermal/thermal_core.c
index 0bdde1ab5d8b..33ebec043800 100644
--- a/drivers/thermal/thermal_core.c
+++ b/drivers/thermal/thermal_core.c
@@ -1400,6 +1400,17 @@ struct thermal_zone_device *thermal_zone_device_register(const char *type, int n
 }
 EXPORT_SYMBOL_GPL(thermal_zone_device_register);
 
+struct thermal_zone_device *thermal_tripless_zone_device_register(
+					const char *type,
+					void *devdata,
+					struct thermal_zone_device_ops *ops,
+					const struct thermal_zone_params *tzp)
+{
+	return thermal_zone_device_register_with_trips(type, NULL, 0, 0, devdata,
+						       ops, tzp, 0, 0);
+}
+EXPORT_SYMBOL_GPL(thermal_tripless_zone_device_register);
+
 void *thermal_zone_device_priv(struct thermal_zone_device *tzd)
 {
 	return tzd->devdata;
diff --git a/include/linux/thermal.h b/include/linux/thermal.h
index a30643104f3f..1a1433eb9b59 100644
--- a/include/linux/thermal.h
+++ b/include/linux/thermal.h
@@ -317,6 +317,12 @@ struct thermal_zone_device *thermal_zone_device_register_with_trips(
 					const struct thermal_zone_params *tzp,
 					int passive_delay, int polling_delay);
 
+struct thermal_zone_device *thermal_tripless_zone_device_register(
+					const char *type,
+					void *devdata,
+					struct thermal_zone_device_ops *ops,
+					const struct thermal_zone_params *tzp);
+
 void thermal_zone_device_unregister(struct thermal_zone_device *tz);
 
 void *thermal_zone_device_priv(struct thermal_zone_device *tzd);
@@ -377,6 +383,13 @@ static inline struct thermal_zone_device *thermal_zone_device_register_with_trip
 					int passive_delay, int polling_delay)
 { return ERR_PTR(-ENODEV); }
 
+static inline struct thermal_zone_device *thermal_tripless_zone_device_register(
+					const char *type,
+					void *devdata,
+					struct thermal_zone_device_ops *ops,
+					const struct thermal_zone_params *tzp)
+{ return ERR_PTR(-ENODEV); }
+
 static inline void thermal_zone_device_unregister(struct thermal_zone_device *tz)
 { }
 
-- 
cgit v1.2.3


From edd220b33f479cf9dcda0bfefb2cb8c5902e9885 Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rafael.j.wysocki@intel.com>
Date: Wed, 30 Aug 2023 18:16:29 +0200
Subject: thermal: core: Drop thermal_zone_device_register()

There are no more users of thermal_zone_device_register(), so drop it
from the core.

Note that thermal_zone_device_register_with_trips() may be renamed to
thermal_zone_device_register() in the future, but only after a grace
period allowing all of the possible work in progress that may be using
the latter to adjust.

Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/thermal/thermal_core.c | 11 -----------
 include/linux/thermal.h        | 17 -----------------
 2 files changed, 28 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/thermal/thermal_core.c b/drivers/thermal/thermal_core.c
index 33ebec043800..8717a3343512 100644
--- a/drivers/thermal/thermal_core.c
+++ b/drivers/thermal/thermal_core.c
@@ -1389,17 +1389,6 @@ free_tz:
 }
 EXPORT_SYMBOL_GPL(thermal_zone_device_register_with_trips);
 
-struct thermal_zone_device *thermal_zone_device_register(const char *type, int ntrips, int mask,
-							 void *devdata, struct thermal_zone_device_ops *ops,
-							 const struct thermal_zone_params *tzp, int passive_delay,
-							 int polling_delay)
-{
-	return thermal_zone_device_register_with_trips(type, NULL, ntrips, mask,
-						       devdata, ops, tzp,
-						       passive_delay, polling_delay);
-}
-EXPORT_SYMBOL_GPL(thermal_zone_device_register);
-
 struct thermal_zone_device *thermal_tripless_zone_device_register(
 					const char *type,
 					void *devdata,
diff --git a/include/linux/thermal.h b/include/linux/thermal.h
index 1a1433eb9b59..c99440aac1a1 100644
--- a/include/linux/thermal.h
+++ b/include/linux/thermal.h
@@ -300,14 +300,6 @@ int thermal_acpi_critical_trip_temp(struct acpi_device *adev, int *ret_temp);
 #endif
 
 #ifdef CONFIG_THERMAL
-struct thermal_zone_device *thermal_zone_device_register(
-					const char *type,
-					int num_trips, int mask,
-					void *devdata,
-					struct thermal_zone_device_ops *ops,
-					const struct thermal_zone_params *tzp,
-					int passive_delay, int polling_delay);
-
 struct thermal_zone_device *thermal_zone_device_register_with_trips(
 					const char *type,
 					struct thermal_trip *trips,
@@ -364,15 +356,6 @@ int thermal_zone_device_enable(struct thermal_zone_device *tz);
 int thermal_zone_device_disable(struct thermal_zone_device *tz);
 void thermal_zone_device_critical(struct thermal_zone_device *tz);
 #else
-static inline struct thermal_zone_device *thermal_zone_device_register(
-					const char *type,
-					int num_trips, int mask,
-					void *devdata,
-					struct thermal_zone_device_ops *ops,
-					const struct thermal_zone_params *tzp,
-					int passive_delay, int polling_delay)
-{ return ERR_PTR(-ENODEV); }
-
 static inline struct thermal_zone_device *thermal_zone_device_register_with_trips(
 					const char *type,
 					struct thermal_trip *trips,
-- 
cgit v1.2.3


From e7716c74e3882405f9eca16faa6cb1bf19995399 Mon Sep 17 00:00:00 2001
From: Philipp Stanner <pstanner@redhat.com>
Date: Mon, 21 Aug 2023 10:21:29 +0200
Subject: xarray: Document necessary flag in alloc functions

Adds a new line to the docstrings of functions wrapping __xa_alloc() and
__xa_alloc_cyclic(), informing about the necessity of flag XA_FLAGS_ALLOC
being set previously.

The documentation so far says that functions wrapping __xa_alloc() and
__xa_alloc_cyclic() are supposed to return either -ENOMEM or -EBUSY in
case of an error. If the xarray has been initialized without the flag
XA_FLAGS_ALLOC, however, they fail with a different, undocumented error
code.

As hinted at in Documentation/core-api/xarray.rst, wrappers around these
functions should only be invoked when the flag has been set. The
functions' documentation should reflect that as well.

Signed-off-by: Philipp Stanner <pstanner@redhat.com>
Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
---
 include/linux/xarray.h | 18 ++++++++++++++++++
 lib/xarray.c           |  6 ++++++
 2 files changed, 24 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/xarray.h b/include/linux/xarray.h
index 741703b45f61..cb571dfcf4b1 100644
--- a/include/linux/xarray.h
+++ b/include/linux/xarray.h
@@ -856,6 +856,9 @@ static inline int __must_check xa_insert_irq(struct xarray *xa,
  * stores the index into the @id pointer, then stores the entry at
  * that index.  A concurrent lookup will not see an uninitialised @id.
  *
+ * Must only be operated on an xarray initialized with flag XA_FLAGS_ALLOC set
+ * in xa_init_flags().
+ *
  * Context: Any context.  Takes and releases the xa_lock.  May sleep if
  * the @gfp flags permit.
  * Return: 0 on success, -ENOMEM if memory could not be allocated or
@@ -886,6 +889,9 @@ static inline __must_check int xa_alloc(struct xarray *xa, u32 *id,
  * stores the index into the @id pointer, then stores the entry at
  * that index.  A concurrent lookup will not see an uninitialised @id.
  *
+ * Must only be operated on an xarray initialized with flag XA_FLAGS_ALLOC set
+ * in xa_init_flags().
+ *
  * Context: Any context.  Takes and releases the xa_lock while
  * disabling softirqs.  May sleep if the @gfp flags permit.
  * Return: 0 on success, -ENOMEM if memory could not be allocated or
@@ -916,6 +922,9 @@ static inline int __must_check xa_alloc_bh(struct xarray *xa, u32 *id,
  * stores the index into the @id pointer, then stores the entry at
  * that index.  A concurrent lookup will not see an uninitialised @id.
  *
+ * Must only be operated on an xarray initialized with flag XA_FLAGS_ALLOC set
+ * in xa_init_flags().
+ *
  * Context: Process context.  Takes and releases the xa_lock while
  * disabling interrupts.  May sleep if the @gfp flags permit.
  * Return: 0 on success, -ENOMEM if memory could not be allocated or
@@ -949,6 +958,9 @@ static inline int __must_check xa_alloc_irq(struct xarray *xa, u32 *id,
  * The search for an empty entry will start at @next and will wrap
  * around if necessary.
  *
+ * Must only be operated on an xarray initialized with flag XA_FLAGS_ALLOC set
+ * in xa_init_flags().
+ *
  * Context: Any context.  Takes and releases the xa_lock.  May sleep if
  * the @gfp flags permit.
  * Return: 0 if the allocation succeeded without wrapping.  1 if the
@@ -983,6 +995,9 @@ static inline int xa_alloc_cyclic(struct xarray *xa, u32 *id, void *entry,
  * The search for an empty entry will start at @next and will wrap
  * around if necessary.
  *
+ * Must only be operated on an xarray initialized with flag XA_FLAGS_ALLOC set
+ * in xa_init_flags().
+ *
  * Context: Any context.  Takes and releases the xa_lock while
  * disabling softirqs.  May sleep if the @gfp flags permit.
  * Return: 0 if the allocation succeeded without wrapping.  1 if the
@@ -1017,6 +1032,9 @@ static inline int xa_alloc_cyclic_bh(struct xarray *xa, u32 *id, void *entry,
  * The search for an empty entry will start at @next and will wrap
  * around if necessary.
  *
+ * Must only be operated on an xarray initialized with flag XA_FLAGS_ALLOC set
+ * in xa_init_flags().
+ *
  * Context: Process context.  Takes and releases the xa_lock while
  * disabling interrupts.  May sleep if the @gfp flags permit.
  * Return: 0 if the allocation succeeded without wrapping.  1 if the
diff --git a/lib/xarray.c b/lib/xarray.c
index 142e36f9dfda..39f07bfc4dcc 100644
--- a/lib/xarray.c
+++ b/lib/xarray.c
@@ -1802,6 +1802,9 @@ EXPORT_SYMBOL(xa_get_order);
  * stores the index into the @id pointer, then stores the entry at
  * that index.  A concurrent lookup will not see an uninitialised @id.
  *
+ * Must only be operated on an xarray initialized with flag XA_FLAGS_ALLOC set
+ * in xa_init_flags().
+ *
  * Context: Any context.  Expects xa_lock to be held on entry.  May
  * release and reacquire xa_lock if @gfp flags permit.
  * Return: 0 on success, -ENOMEM if memory could not be allocated or
@@ -1850,6 +1853,9 @@ EXPORT_SYMBOL(__xa_alloc);
  * The search for an empty entry will start at @next and will wrap
  * around if necessary.
  *
+ * Must only be operated on an xarray initialized with flag XA_FLAGS_ALLOC set
+ * in xa_init_flags().
+ *
  * Context: Any context.  Expects xa_lock to be held on entry.  May
  * release and reacquire xa_lock if @gfp flags permit.
  * Return: 0 if the allocation succeeded without wrapping.  1 if the
-- 
cgit v1.2.3


From 39285e124edbc752331e98ace37cc141a6a3747a Mon Sep 17 00:00:00 2001
From: Taehee Yoo <ap420073@gmail.com>
Date: Tue, 5 Sep 2023 08:46:10 +0000
Subject: net: team: do not use dynamic lockdep key

team interface has used a dynamic lockdep key to avoid false-positive
lockdep deadlock detection. Virtual interfaces such as team usually
have their own lock for protecting private data.
These interfaces can be nested.
team0
  |
team1

Each interface's lock is actually different(team0->lock and team1->lock).
So,
mutex_lock(&team0->lock);
mutex_lock(&team1->lock);
mutex_unlock(&team1->lock);
mutex_unlock(&team0->lock);
The above case is absolutely safe. But lockdep warns about deadlock.
Because the lockdep understands these two locks are same. This is a
false-positive lockdep warning.

So, in order to avoid this problem, the team interfaces started to use
dynamic lockdep key. The false-positive problem was fixed, but it
introduced a new problem.

When the new team virtual interface is created, it registers a dynamic
lockdep key(creates dynamic lockdep key) and uses it. But there is the
limitation of the number of lockdep keys.
So, If so many team interfaces are created, it consumes all lockdep keys.
Then, the lockdep stops to work and warns about it.

In order to fix this problem, team interfaces use the subclass instead
of the dynamic key. So, when a new team interface is created, it doesn't
register(create) a new lockdep, but uses existed subclass key instead.
It is already used by the bonding interface for a similar case.

As the bonding interface does, the subclass variable is the same as
the 'dev->nested_level'. This variable indicates the depth in the stacked
interface graph.

The 'dev->nested_level' is protected by RTNL and RCU.
So, 'mutex_lock_nested()' for 'team->lock' requires RTNL or RCU.
In the current code, 'team->lock' is usually acquired under RTNL, there is
no problem with using 'dev->nested_level'.

The 'team_nl_team_get()' and The 'lb_stats_refresh()' functions acquire
'team->lock' without RTNL.
But these don't iterate their own ports nested so they don't need nested
lock.

Reproducer:
   for i in {0..1000}
   do
           ip link add team$i type team
           ip link add dummy$i master team$i type dummy
           ip link set dummy$i up
           ip link set team$i up
   done

Splat looks like:
   BUG: MAX_LOCKDEP_ENTRIES too low!
   turning off the locking correctness validator.
   Please attach the output of /proc/lock_stat to the bug report
   CPU: 0 PID: 4104 Comm: ip Not tainted 6.5.0-rc7+ #45
   Call Trace:
    <TASK>
   dump_stack_lvl+0x64/0xb0
   add_lock_to_list+0x30d/0x5e0
   check_prev_add+0x73a/0x23a0
   ...
   sock_def_readable+0xfe/0x4f0
   netlink_broadcast+0x76b/0xac0
   nlmsg_notify+0x69/0x1d0
   dev_open+0xed/0x130
   ...

Reported-by: syzbot+9bbbacfbf1e04d5221f7@syzkaller.appspotmail.com
Fixes: 369f61bee0f5 ("team: fix nested locking lockdep warning")
Signed-off-by: Taehee Yoo <ap420073@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/team/team.c                  | 111 +++++++++++++++----------------
 drivers/net/team/team_mode_loadbalance.c |   4 +-
 include/linux/if_team.h                  |  30 ++++++++-
 3 files changed, 85 insertions(+), 60 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/team/team.c b/drivers/net/team/team.c
index e8b94580194e..ad29122a5468 100644
--- a/drivers/net/team/team.c
+++ b/drivers/net/team/team.c
@@ -1135,8 +1135,8 @@ static int team_port_add(struct team *team, struct net_device *port_dev,
 			 struct netlink_ext_ack *extack)
 {
 	struct net_device *dev = team->dev;
-	struct team_port *port;
 	char *portname = port_dev->name;
+	struct team_port *port;
 	int err;
 
 	if (port_dev->flags & IFF_LOOPBACK) {
@@ -1203,18 +1203,31 @@ static int team_port_add(struct team *team, struct net_device *port_dev,
 
 	memcpy(port->orig.dev_addr, port_dev->dev_addr, port_dev->addr_len);
 
-	err = team_port_enter(team, port);
+	err = dev_open(port_dev, extack);
 	if (err) {
-		netdev_err(dev, "Device %s failed to enter team mode\n",
+		netdev_dbg(dev, "Device %s opening failed\n",
 			   portname);
-		goto err_port_enter;
+		goto err_dev_open;
 	}
 
-	err = dev_open(port_dev, extack);
+	err = team_upper_dev_link(team, port, extack);
 	if (err) {
-		netdev_dbg(dev, "Device %s opening failed\n",
+		netdev_err(dev, "Device %s failed to set upper link\n",
 			   portname);
-		goto err_dev_open;
+		goto err_set_upper_link;
+	}
+
+	/* lockdep subclass variable(dev->nested_level) was updated by
+	 * team_upper_dev_link().
+	 */
+	team_unlock(team);
+	team_lock(team);
+
+	err = team_port_enter(team, port);
+	if (err) {
+		netdev_err(dev, "Device %s failed to enter team mode\n",
+			   portname);
+		goto err_port_enter;
 	}
 
 	err = vlan_vids_add_by_dev(port_dev, dev);
@@ -1242,13 +1255,6 @@ static int team_port_add(struct team *team, struct net_device *port_dev,
 		goto err_handler_register;
 	}
 
-	err = team_upper_dev_link(team, port, extack);
-	if (err) {
-		netdev_err(dev, "Device %s failed to set upper link\n",
-			   portname);
-		goto err_set_upper_link;
-	}
-
 	err = __team_option_inst_add_port(team, port);
 	if (err) {
 		netdev_err(dev, "Device %s failed to add per-port options\n",
@@ -1295,9 +1301,6 @@ err_set_slave_promisc:
 	__team_option_inst_del_port(team, port);
 
 err_option_port_add:
-	team_upper_dev_unlink(team, port);
-
-err_set_upper_link:
 	netdev_rx_handler_unregister(port_dev);
 
 err_handler_register:
@@ -1307,13 +1310,16 @@ err_enable_netpoll:
 	vlan_vids_del_by_dev(port_dev, dev);
 
 err_vids_add:
+	team_port_leave(team, port);
+
+err_port_enter:
+	team_upper_dev_unlink(team, port);
+
+err_set_upper_link:
 	dev_close(port_dev);
 
 err_dev_open:
-	team_port_leave(team, port);
 	team_port_set_orig_dev_addr(port);
-
-err_port_enter:
 	dev_set_mtu(port_dev, port->orig.mtu);
 
 err_set_mtu:
@@ -1616,6 +1622,7 @@ static int team_init(struct net_device *dev)
 	int err;
 
 	team->dev = dev;
+	mutex_init(&team->lock);
 	team_set_no_mode(team);
 	team->notifier_ctx = false;
 
@@ -1643,8 +1650,6 @@ static int team_init(struct net_device *dev)
 		goto err_options_register;
 	netif_carrier_off(dev);
 
-	lockdep_register_key(&team->team_lock_key);
-	__mutex_init(&team->lock, "team->team_lock_key", &team->team_lock_key);
 	netdev_lockdep_set_classes(dev);
 
 	return 0;
@@ -1665,7 +1670,7 @@ static void team_uninit(struct net_device *dev)
 	struct team_port *port;
 	struct team_port *tmp;
 
-	mutex_lock(&team->lock);
+	team_lock(team);
 	list_for_each_entry_safe(port, tmp, &team->port_list, list)
 		team_port_del(team, port->dev);
 
@@ -1674,9 +1679,8 @@ static void team_uninit(struct net_device *dev)
 	team_mcast_rejoin_fini(team);
 	team_notify_peers_fini(team);
 	team_queue_override_fini(team);
-	mutex_unlock(&team->lock);
+	team_unlock(team);
 	netdev_change_features(dev);
-	lockdep_unregister_key(&team->team_lock_key);
 }
 
 static void team_destructor(struct net_device *dev)
@@ -1790,18 +1794,18 @@ static void team_set_rx_mode(struct net_device *dev)
 
 static int team_set_mac_address(struct net_device *dev, void *p)
 {
-	struct sockaddr *addr = p;
 	struct team *team = netdev_priv(dev);
+	struct sockaddr *addr = p;
 	struct team_port *port;
 
 	if (dev->type == ARPHRD_ETHER && !is_valid_ether_addr(addr->sa_data))
 		return -EADDRNOTAVAIL;
 	dev_addr_set(dev, addr->sa_data);
-	mutex_lock(&team->lock);
+	team_lock(team);
 	list_for_each_entry(port, &team->port_list, list)
 		if (team->ops.port_change_dev_addr)
 			team->ops.port_change_dev_addr(team, port);
-	mutex_unlock(&team->lock);
+	team_unlock(team);
 	return 0;
 }
 
@@ -1815,7 +1819,7 @@ static int team_change_mtu(struct net_device *dev, int new_mtu)
 	 * Alhough this is reader, it's guarded by team lock. It's not possible
 	 * to traverse list in reverse under rcu_read_lock
 	 */
-	mutex_lock(&team->lock);
+	team_lock(team);
 	team->port_mtu_change_allowed = true;
 	list_for_each_entry(port, &team->port_list, list) {
 		err = dev_set_mtu(port->dev, new_mtu);
@@ -1826,7 +1830,7 @@ static int team_change_mtu(struct net_device *dev, int new_mtu)
 		}
 	}
 	team->port_mtu_change_allowed = false;
-	mutex_unlock(&team->lock);
+	team_unlock(team);
 
 	dev->mtu = new_mtu;
 
@@ -1836,7 +1840,7 @@ unwind:
 	list_for_each_entry_continue_reverse(port, &team->port_list, list)
 		dev_set_mtu(port->dev, dev->mtu);
 	team->port_mtu_change_allowed = false;
-	mutex_unlock(&team->lock);
+	team_unlock(team);
 
 	return err;
 }
@@ -1890,20 +1894,20 @@ static int team_vlan_rx_add_vid(struct net_device *dev, __be16 proto, u16 vid)
 	 * Alhough this is reader, it's guarded by team lock. It's not possible
 	 * to traverse list in reverse under rcu_read_lock
 	 */
-	mutex_lock(&team->lock);
+	team_lock(team);
 	list_for_each_entry(port, &team->port_list, list) {
 		err = vlan_vid_add(port->dev, proto, vid);
 		if (err)
 			goto unwind;
 	}
-	mutex_unlock(&team->lock);
+	team_unlock(team);
 
 	return 0;
 
 unwind:
 	list_for_each_entry_continue_reverse(port, &team->port_list, list)
 		vlan_vid_del(port->dev, proto, vid);
-	mutex_unlock(&team->lock);
+	team_unlock(team);
 
 	return err;
 }
@@ -1913,10 +1917,10 @@ static int team_vlan_rx_kill_vid(struct net_device *dev, __be16 proto, u16 vid)
 	struct team *team = netdev_priv(dev);
 	struct team_port *port;
 
-	mutex_lock(&team->lock);
+	team_lock(team);
 	list_for_each_entry(port, &team->port_list, list)
 		vlan_vid_del(port->dev, proto, vid);
-	mutex_unlock(&team->lock);
+	team_unlock(team);
 
 	return 0;
 }
@@ -1938,9 +1942,9 @@ static void team_netpoll_cleanup(struct net_device *dev)
 {
 	struct team *team = netdev_priv(dev);
 
-	mutex_lock(&team->lock);
+	team_lock(team);
 	__team_netpoll_cleanup(team);
-	mutex_unlock(&team->lock);
+	team_unlock(team);
 }
 
 static int team_netpoll_setup(struct net_device *dev,
@@ -1950,7 +1954,7 @@ static int team_netpoll_setup(struct net_device *dev,
 	struct team_port *port;
 	int err = 0;
 
-	mutex_lock(&team->lock);
+	team_lock(team);
 	list_for_each_entry(port, &team->port_list, list) {
 		err = __team_port_enable_netpoll(port);
 		if (err) {
@@ -1958,7 +1962,7 @@ static int team_netpoll_setup(struct net_device *dev,
 			break;
 		}
 	}
-	mutex_unlock(&team->lock);
+	team_unlock(team);
 	return err;
 }
 #endif
@@ -1969,9 +1973,9 @@ static int team_add_slave(struct net_device *dev, struct net_device *port_dev,
 	struct team *team = netdev_priv(dev);
 	int err;
 
-	mutex_lock(&team->lock);
+	team_lock(team);
 	err = team_port_add(team, port_dev, extack);
-	mutex_unlock(&team->lock);
+	team_unlock(team);
 
 	if (!err)
 		netdev_change_features(dev);
@@ -1984,19 +1988,12 @@ static int team_del_slave(struct net_device *dev, struct net_device *port_dev)
 	struct team *team = netdev_priv(dev);
 	int err;
 
-	mutex_lock(&team->lock);
+	team_lock(team);
 	err = team_port_del(team, port_dev);
-	mutex_unlock(&team->lock);
-
-	if (err)
-		return err;
+	team_unlock(team);
 
-	if (netif_is_team_master(port_dev)) {
-		lockdep_unregister_key(&team->team_lock_key);
-		lockdep_register_key(&team->team_lock_key);
-		lockdep_set_class(&team->lock, &team->team_lock_key);
-	}
-	netdev_change_features(dev);
+	if (!err)
+		netdev_change_features(dev);
 
 	return err;
 }
@@ -2316,13 +2313,13 @@ static struct team *team_nl_team_get(struct genl_info *info)
 	}
 
 	team = netdev_priv(dev);
-	mutex_lock(&team->lock);
+	__team_lock(team);
 	return team;
 }
 
 static void team_nl_team_put(struct team *team)
 {
-	mutex_unlock(&team->lock);
+	team_unlock(team);
 	dev_put(team->dev);
 }
 
@@ -2984,9 +2981,9 @@ static void team_port_change_check(struct team_port *port, bool linkup)
 {
 	struct team *team = port->team;
 
-	mutex_lock(&team->lock);
+	team_lock(team);
 	__team_port_change_check(port, linkup);
-	mutex_unlock(&team->lock);
+	team_unlock(team);
 }
 
 
diff --git a/drivers/net/team/team_mode_loadbalance.c b/drivers/net/team/team_mode_loadbalance.c
index 00f8989c29c0..7bcc9d37447a 100644
--- a/drivers/net/team/team_mode_loadbalance.c
+++ b/drivers/net/team/team_mode_loadbalance.c
@@ -478,7 +478,7 @@ static void lb_stats_refresh(struct work_struct *work)
 	team = lb_priv_ex->team;
 	lb_priv = get_lb_priv(team);
 
-	if (!mutex_trylock(&team->lock)) {
+	if (!team_trylock(team)) {
 		schedule_delayed_work(&lb_priv_ex->stats.refresh_dw, 0);
 		return;
 	}
@@ -515,7 +515,7 @@ static void lb_stats_refresh(struct work_struct *work)
 	schedule_delayed_work(&lb_priv_ex->stats.refresh_dw,
 			      (lb_priv_ex->stats.refresh_interval * HZ) / 10);
 
-	mutex_unlock(&team->lock);
+	team_unlock(team);
 }
 
 static void lb_stats_refresh_interval_get(struct team *team,
diff --git a/include/linux/if_team.h b/include/linux/if_team.h
index 1b9b15a492fa..12d4447fc8ab 100644
--- a/include/linux/if_team.h
+++ b/include/linux/if_team.h
@@ -221,10 +221,38 @@ struct team {
 		atomic_t count_pending;
 		struct delayed_work dw;
 	} mcast_rejoin;
-	struct lock_class_key team_lock_key;
 	long mode_priv[TEAM_MODE_PRIV_LONGS];
 };
 
+static inline void __team_lock(struct team *team)
+{
+	mutex_lock(&team->lock);
+}
+
+static inline int team_trylock(struct team *team)
+{
+	return mutex_trylock(&team->lock);
+}
+
+#ifdef CONFIG_LOCKDEP
+static inline void team_lock(struct team *team)
+{
+	ASSERT_RTNL();
+	mutex_lock_nested(&team->lock, team->dev->nested_level);
+}
+
+#else
+static inline void team_lock(struct team *team)
+{
+	__team_lock(team);
+}
+#endif
+
+static inline void team_unlock(struct team *team)
+{
+	mutex_unlock(&team->lock);
+}
+
 static inline int team_dev_queue_xmit(struct team *team, struct team_port *port,
 				      struct sk_buff *skb)
 {
-- 
cgit v1.2.3


From 1a961e74d5abbea049588a3d74b759955b4ed9d5 Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <kuba@kernel.org>
Date: Tue, 5 Sep 2023 16:42:02 -0700
Subject: net: phylink: fix sphinx complaint about invalid literal

sphinx complains about the use of "%PHYLINK_PCS_NEG_*":

Documentation/networking/kapi:144: ./include/linux/phylink.h:601: WARNING: Inline literal start-string without end-string.
Documentation/networking/kapi:144: ./include/linux/phylink.h:633: WARNING: Inline literal start-string without end-string.

These are not valid symbols so drop the '%' prefix.

Alternatively we could use %PHYLINK_PCS_NEG_\* (escape the *)
or use normal literal ``PHYLINK_PCS_NEG_*`` but there is already
a handful of un-adorned DEFINE_* in this file.

Fixes: f99d471afa03 ("net: phylink: add PCS negotiation mode")
Reported-by: Stephen Rothwell <sfr@canb.auug.org.au>
Link: https://lore.kernel.org/all/20230626162908.2f149f98@canb.auug.org.au/
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
Reviewed-by: Bagas Sanjaya <bagasdotme@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/phylink.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/phylink.h b/include/linux/phylink.h
index 7d07f8736431..2b886ea654bb 100644
--- a/include/linux/phylink.h
+++ b/include/linux/phylink.h
@@ -600,7 +600,7 @@ void pcs_get_state(struct phylink_pcs *pcs,
  *
  * The %neg_mode argument should be tested via the phylink_mode_*() family of
  * functions, or for PCS that set pcs->neg_mode true, should be tested
- * against the %PHYLINK_PCS_NEG_* definitions.
+ * against the PHYLINK_PCS_NEG_* definitions.
  */
 int pcs_config(struct phylink_pcs *pcs, unsigned int neg_mode,
 	       phy_interface_t interface, const unsigned long *advertising,
@@ -630,7 +630,7 @@ void pcs_an_restart(struct phylink_pcs *pcs);
  *
  * The %mode argument should be tested via the phylink_mode_*() family of
  * functions, or for PCS that set pcs->neg_mode true, should be tested
- * against the %PHYLINK_PCS_NEG_* definitions.
+ * against the PHYLINK_PCS_NEG_* definitions.
  */
 void pcs_link_up(struct phylink_pcs *pcs, unsigned int neg_mode,
 		 phy_interface_t interface, int speed, int duplex);
-- 
cgit v1.2.3


From 8f3f06dfd6873135068ccf1a0b386308e8c4da38 Mon Sep 17 00:00:00 2001
From: WANG Xuerui <git@xen0n.name>
Date: Wed, 6 Sep 2023 22:53:55 +0800
Subject: raid6: Add LoongArch SIMD syndrome calculation

The algorithms work on 64 bytes at a time, which is the L1 cache line
size of all current and future LoongArch cores (that we care about), as
confirmed by Huacai. The code is based on the generic int.uc algorithm,
unrolled 4 times for LSX and 2 times for LASX. Further unrolling does
not meaningfully improve the performance according to experiments.

Performance numbers measured during system boot on a 3A5000 @ 2.5GHz:

> raid6: lasx     gen() 12726 MB/s
> raid6: lsx      gen() 10001 MB/s
> raid6: int64x8  gen()  2876 MB/s
> raid6: int64x4  gen()  3867 MB/s
> raid6: int64x2  gen()  2531 MB/s
> raid6: int64x1  gen()  1945 MB/s

Comparison of xor() speeds (from different boots but meaningful anyway):

> lasx:    11226 MB/s
> lsx:     6395 MB/s
> int64x4: 2147 MB/s

Performance as measured by raid6test:

> raid6: lasx     gen() 25109 MB/s
> raid6: lsx      gen() 13233 MB/s
> raid6: int64x8  gen()  4164 MB/s
> raid6: int64x4  gen()  6005 MB/s
> raid6: int64x2  gen()  5781 MB/s
> raid6: int64x1  gen()  4119 MB/s
> raid6: using algorithm lasx gen() 25109 MB/s
> raid6: .... xor() 14439 MB/s, rmw enabled

Acked-by: Song Liu <song@kernel.org>
Signed-off-by: WANG Xuerui <git@xen0n.name>
Signed-off-by: Huacai Chen <chenhuacai@loongson.cn>
---
 include/linux/raid/pq.h    |   2 +
 lib/raid6/Makefile         |   1 +
 lib/raid6/algos.c          |   8 +
 lib/raid6/loongarch.h      |  38 ++++
 lib/raid6/loongarch_simd.c | 422 +++++++++++++++++++++++++++++++++++++++++++++
 lib/raid6/test/Makefile    |  12 ++
 6 files changed, 483 insertions(+)
 create mode 100644 lib/raid6/loongarch.h
 create mode 100644 lib/raid6/loongarch_simd.c

(limited to 'include/linux')

diff --git a/include/linux/raid/pq.h b/include/linux/raid/pq.h
index f29aaaf2eb21..874447485848 100644
--- a/include/linux/raid/pq.h
+++ b/include/linux/raid/pq.h
@@ -108,6 +108,8 @@ extern const struct raid6_calls raid6_vpermxor1;
 extern const struct raid6_calls raid6_vpermxor2;
 extern const struct raid6_calls raid6_vpermxor4;
 extern const struct raid6_calls raid6_vpermxor8;
+extern const struct raid6_calls raid6_lsx;
+extern const struct raid6_calls raid6_lasx;
 
 struct raid6_recov_calls {
 	void (*data2)(int, size_t, int, int, void **);
diff --git a/lib/raid6/Makefile b/lib/raid6/Makefile
index 45e17619422b..2b9ebe105480 100644
--- a/lib/raid6/Makefile
+++ b/lib/raid6/Makefile
@@ -9,6 +9,7 @@ raid6_pq-$(CONFIG_ALTIVEC) += altivec1.o altivec2.o altivec4.o altivec8.o \
                               vpermxor1.o vpermxor2.o vpermxor4.o vpermxor8.o
 raid6_pq-$(CONFIG_KERNEL_MODE_NEON) += neon.o neon1.o neon2.o neon4.o neon8.o recov_neon.o recov_neon_inner.o
 raid6_pq-$(CONFIG_S390) += s390vx8.o recov_s390xc.o
+raid6_pq-$(CONFIG_LOONGARCH) += loongarch_simd.o
 
 hostprogs	+= mktables
 
diff --git a/lib/raid6/algos.c b/lib/raid6/algos.c
index a22a05c9af8a..739c7ebcae1a 100644
--- a/lib/raid6/algos.c
+++ b/lib/raid6/algos.c
@@ -73,6 +73,14 @@ const struct raid6_calls * const raid6_algos[] = {
 	&raid6_neonx2,
 	&raid6_neonx1,
 #endif
+#ifdef CONFIG_LOONGARCH
+#ifdef CONFIG_CPU_HAS_LASX
+	&raid6_lasx,
+#endif
+#ifdef CONFIG_CPU_HAS_LSX
+	&raid6_lsx,
+#endif
+#endif
 #if defined(__ia64__)
 	&raid6_intx32,
 	&raid6_intx16,
diff --git a/lib/raid6/loongarch.h b/lib/raid6/loongarch.h
new file mode 100644
index 000000000000..acfc33ce7056
--- /dev/null
+++ b/lib/raid6/loongarch.h
@@ -0,0 +1,38 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * Copyright (C) 2023 WANG Xuerui <git@xen0n.name>
+ *
+ * raid6/loongarch.h
+ *
+ * Definitions common to LoongArch RAID-6 code only
+ */
+
+#ifndef _LIB_RAID6_LOONGARCH_H
+#define _LIB_RAID6_LOONGARCH_H
+
+#ifdef __KERNEL__
+
+#include <asm/cpu-features.h>
+#include <asm/fpu.h>
+
+#else /* for user-space testing */
+
+#include <sys/auxv.h>
+
+/* have to supply these defines for glibc 2.37- and musl */
+#ifndef HWCAP_LOONGARCH_LSX
+#define HWCAP_LOONGARCH_LSX	(1 << 4)
+#endif
+#ifndef HWCAP_LOONGARCH_LASX
+#define HWCAP_LOONGARCH_LASX	(1 << 5)
+#endif
+
+#define kernel_fpu_begin()
+#define kernel_fpu_end()
+
+#define cpu_has_lsx	(getauxval(AT_HWCAP) & HWCAP_LOONGARCH_LSX)
+#define cpu_has_lasx	(getauxval(AT_HWCAP) & HWCAP_LOONGARCH_LASX)
+
+#endif /* __KERNEL__ */
+
+#endif /* _LIB_RAID6_LOONGARCH_H */
diff --git a/lib/raid6/loongarch_simd.c b/lib/raid6/loongarch_simd.c
new file mode 100644
index 000000000000..aa5d9f924ca3
--- /dev/null
+++ b/lib/raid6/loongarch_simd.c
@@ -0,0 +1,422 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * RAID6 syndrome calculations in LoongArch SIMD (LSX & LASX)
+ *
+ * Copyright 2023 WANG Xuerui <git@xen0n.name>
+ *
+ * Based on the generic RAID-6 code (int.uc):
+ *
+ * Copyright 2002-2004 H. Peter Anvin
+ */
+
+#include <linux/raid/pq.h>
+#include "loongarch.h"
+
+/*
+ * The vector algorithms are currently priority 0, which means the generic
+ * scalar algorithms are not being disabled if vector support is present.
+ * This is like the similar LoongArch RAID5 XOR code, with the main reason
+ * repeated here: it cannot be ruled out at this point of time, that some
+ * future (maybe reduced) models could run the vector algorithms slower than
+ * the scalar ones, maybe for errata or micro-op reasons. It may be
+ * appropriate to revisit this after one or two more uarch generations.
+ */
+
+#ifdef CONFIG_CPU_HAS_LSX
+#define NSIZE 16
+
+static int raid6_has_lsx(void)
+{
+	return cpu_has_lsx;
+}
+
+static void raid6_lsx_gen_syndrome(int disks, size_t bytes, void **ptrs)
+{
+	u8 **dptr = (u8 **)ptrs;
+	u8 *p, *q;
+	int d, z, z0;
+
+	z0 = disks - 3;		/* Highest data disk */
+	p = dptr[z0+1];		/* XOR parity */
+	q = dptr[z0+2];		/* RS syndrome */
+
+	kernel_fpu_begin();
+
+	/*
+	 * $vr0, $vr1, $vr2, $vr3: wp
+	 * $vr4, $vr5, $vr6, $vr7: wq
+	 * $vr8, $vr9, $vr10, $vr11: wd
+	 * $vr12, $vr13, $vr14, $vr15: w2
+	 * $vr16, $vr17, $vr18, $vr19: w1
+	 */
+	for (d = 0; d < bytes; d += NSIZE*4) {
+		/* wq$$ = wp$$ = *(unative_t *)&dptr[z0][d+$$*NSIZE]; */
+		asm volatile("vld $vr0, %0" : : "m"(dptr[z0][d+0*NSIZE]));
+		asm volatile("vld $vr1, %0" : : "m"(dptr[z0][d+1*NSIZE]));
+		asm volatile("vld $vr2, %0" : : "m"(dptr[z0][d+2*NSIZE]));
+		asm volatile("vld $vr3, %0" : : "m"(dptr[z0][d+3*NSIZE]));
+		asm volatile("vori.b $vr4, $vr0, 0");
+		asm volatile("vori.b $vr5, $vr1, 0");
+		asm volatile("vori.b $vr6, $vr2, 0");
+		asm volatile("vori.b $vr7, $vr3, 0");
+		for (z = z0-1; z >= 0; z--) {
+			/* wd$$ = *(unative_t *)&dptr[z][d+$$*NSIZE]; */
+			asm volatile("vld $vr8, %0" : : "m"(dptr[z][d+0*NSIZE]));
+			asm volatile("vld $vr9, %0" : : "m"(dptr[z][d+1*NSIZE]));
+			asm volatile("vld $vr10, %0" : : "m"(dptr[z][d+2*NSIZE]));
+			asm volatile("vld $vr11, %0" : : "m"(dptr[z][d+3*NSIZE]));
+			/* wp$$ ^= wd$$; */
+			asm volatile("vxor.v $vr0, $vr0, $vr8");
+			asm volatile("vxor.v $vr1, $vr1, $vr9");
+			asm volatile("vxor.v $vr2, $vr2, $vr10");
+			asm volatile("vxor.v $vr3, $vr3, $vr11");
+			/* w2$$ = MASK(wq$$); */
+			asm volatile("vslti.b $vr12, $vr4, 0");
+			asm volatile("vslti.b $vr13, $vr5, 0");
+			asm volatile("vslti.b $vr14, $vr6, 0");
+			asm volatile("vslti.b $vr15, $vr7, 0");
+			/* w1$$ = SHLBYTE(wq$$); */
+			asm volatile("vslli.b $vr16, $vr4, 1");
+			asm volatile("vslli.b $vr17, $vr5, 1");
+			asm volatile("vslli.b $vr18, $vr6, 1");
+			asm volatile("vslli.b $vr19, $vr7, 1");
+			/* w2$$ &= NBYTES(0x1d); */
+			asm volatile("vandi.b $vr12, $vr12, 0x1d");
+			asm volatile("vandi.b $vr13, $vr13, 0x1d");
+			asm volatile("vandi.b $vr14, $vr14, 0x1d");
+			asm volatile("vandi.b $vr15, $vr15, 0x1d");
+			/* w1$$ ^= w2$$; */
+			asm volatile("vxor.v $vr16, $vr16, $vr12");
+			asm volatile("vxor.v $vr17, $vr17, $vr13");
+			asm volatile("vxor.v $vr18, $vr18, $vr14");
+			asm volatile("vxor.v $vr19, $vr19, $vr15");
+			/* wq$$ = w1$$ ^ wd$$; */
+			asm volatile("vxor.v $vr4, $vr16, $vr8");
+			asm volatile("vxor.v $vr5, $vr17, $vr9");
+			asm volatile("vxor.v $vr6, $vr18, $vr10");
+			asm volatile("vxor.v $vr7, $vr19, $vr11");
+		}
+		/* *(unative_t *)&p[d+NSIZE*$$] = wp$$; */
+		asm volatile("vst $vr0, %0" : "=m"(p[d+NSIZE*0]));
+		asm volatile("vst $vr1, %0" : "=m"(p[d+NSIZE*1]));
+		asm volatile("vst $vr2, %0" : "=m"(p[d+NSIZE*2]));
+		asm volatile("vst $vr3, %0" : "=m"(p[d+NSIZE*3]));
+		/* *(unative_t *)&q[d+NSIZE*$$] = wq$$; */
+		asm volatile("vst $vr4, %0" : "=m"(q[d+NSIZE*0]));
+		asm volatile("vst $vr5, %0" : "=m"(q[d+NSIZE*1]));
+		asm volatile("vst $vr6, %0" : "=m"(q[d+NSIZE*2]));
+		asm volatile("vst $vr7, %0" : "=m"(q[d+NSIZE*3]));
+	}
+
+	kernel_fpu_end();
+}
+
+static void raid6_lsx_xor_syndrome(int disks, int start, int stop,
+				   size_t bytes, void **ptrs)
+{
+	u8 **dptr = (u8 **)ptrs;
+	u8 *p, *q;
+	int d, z, z0;
+
+	z0 = stop;		/* P/Q right side optimization */
+	p = dptr[disks-2];	/* XOR parity */
+	q = dptr[disks-1];	/* RS syndrome */
+
+	kernel_fpu_begin();
+
+	/*
+	 * $vr0, $vr1, $vr2, $vr3: wp
+	 * $vr4, $vr5, $vr6, $vr7: wq
+	 * $vr8, $vr9, $vr10, $vr11: wd
+	 * $vr12, $vr13, $vr14, $vr15: w2
+	 * $vr16, $vr17, $vr18, $vr19: w1
+	 */
+	for (d = 0; d < bytes; d += NSIZE*4) {
+		/* P/Q data pages */
+		/* wq$$ = wp$$ = *(unative_t *)&dptr[z0][d+$$*NSIZE]; */
+		asm volatile("vld $vr0, %0" : : "m"(dptr[z0][d+0*NSIZE]));
+		asm volatile("vld $vr1, %0" : : "m"(dptr[z0][d+1*NSIZE]));
+		asm volatile("vld $vr2, %0" : : "m"(dptr[z0][d+2*NSIZE]));
+		asm volatile("vld $vr3, %0" : : "m"(dptr[z0][d+3*NSIZE]));
+		asm volatile("vori.b $vr4, $vr0, 0");
+		asm volatile("vori.b $vr5, $vr1, 0");
+		asm volatile("vori.b $vr6, $vr2, 0");
+		asm volatile("vori.b $vr7, $vr3, 0");
+		for (z = z0-1; z >= start; z--) {
+			/* wd$$ = *(unative_t *)&dptr[z][d+$$*NSIZE]; */
+			asm volatile("vld $vr8, %0" : : "m"(dptr[z][d+0*NSIZE]));
+			asm volatile("vld $vr9, %0" : : "m"(dptr[z][d+1*NSIZE]));
+			asm volatile("vld $vr10, %0" : : "m"(dptr[z][d+2*NSIZE]));
+			asm volatile("vld $vr11, %0" : : "m"(dptr[z][d+3*NSIZE]));
+			/* wp$$ ^= wd$$; */
+			asm volatile("vxor.v $vr0, $vr0, $vr8");
+			asm volatile("vxor.v $vr1, $vr1, $vr9");
+			asm volatile("vxor.v $vr2, $vr2, $vr10");
+			asm volatile("vxor.v $vr3, $vr3, $vr11");
+			/* w2$$ = MASK(wq$$); */
+			asm volatile("vslti.b $vr12, $vr4, 0");
+			asm volatile("vslti.b $vr13, $vr5, 0");
+			asm volatile("vslti.b $vr14, $vr6, 0");
+			asm volatile("vslti.b $vr15, $vr7, 0");
+			/* w1$$ = SHLBYTE(wq$$); */
+			asm volatile("vslli.b $vr16, $vr4, 1");
+			asm volatile("vslli.b $vr17, $vr5, 1");
+			asm volatile("vslli.b $vr18, $vr6, 1");
+			asm volatile("vslli.b $vr19, $vr7, 1");
+			/* w2$$ &= NBYTES(0x1d); */
+			asm volatile("vandi.b $vr12, $vr12, 0x1d");
+			asm volatile("vandi.b $vr13, $vr13, 0x1d");
+			asm volatile("vandi.b $vr14, $vr14, 0x1d");
+			asm volatile("vandi.b $vr15, $vr15, 0x1d");
+			/* w1$$ ^= w2$$; */
+			asm volatile("vxor.v $vr16, $vr16, $vr12");
+			asm volatile("vxor.v $vr17, $vr17, $vr13");
+			asm volatile("vxor.v $vr18, $vr18, $vr14");
+			asm volatile("vxor.v $vr19, $vr19, $vr15");
+			/* wq$$ = w1$$ ^ wd$$; */
+			asm volatile("vxor.v $vr4, $vr16, $vr8");
+			asm volatile("vxor.v $vr5, $vr17, $vr9");
+			asm volatile("vxor.v $vr6, $vr18, $vr10");
+			asm volatile("vxor.v $vr7, $vr19, $vr11");
+		}
+
+		/* P/Q left side optimization */
+		for (z = start-1; z >= 0; z--) {
+			/* w2$$ = MASK(wq$$); */
+			asm volatile("vslti.b $vr12, $vr4, 0");
+			asm volatile("vslti.b $vr13, $vr5, 0");
+			asm volatile("vslti.b $vr14, $vr6, 0");
+			asm volatile("vslti.b $vr15, $vr7, 0");
+			/* w1$$ = SHLBYTE(wq$$); */
+			asm volatile("vslli.b $vr16, $vr4, 1");
+			asm volatile("vslli.b $vr17, $vr5, 1");
+			asm volatile("vslli.b $vr18, $vr6, 1");
+			asm volatile("vslli.b $vr19, $vr7, 1");
+			/* w2$$ &= NBYTES(0x1d); */
+			asm volatile("vandi.b $vr12, $vr12, 0x1d");
+			asm volatile("vandi.b $vr13, $vr13, 0x1d");
+			asm volatile("vandi.b $vr14, $vr14, 0x1d");
+			asm volatile("vandi.b $vr15, $vr15, 0x1d");
+			/* wq$$ = w1$$ ^ w2$$; */
+			asm volatile("vxor.v $vr4, $vr16, $vr12");
+			asm volatile("vxor.v $vr5, $vr17, $vr13");
+			asm volatile("vxor.v $vr6, $vr18, $vr14");
+			asm volatile("vxor.v $vr7, $vr19, $vr15");
+		}
+		/*
+		 * *(unative_t *)&p[d+NSIZE*$$] ^= wp$$;
+		 * *(unative_t *)&q[d+NSIZE*$$] ^= wq$$;
+		 */
+		asm volatile(
+			"vld $vr20, %0\n\t"
+			"vld $vr21, %1\n\t"
+			"vld $vr22, %2\n\t"
+			"vld $vr23, %3\n\t"
+			"vld $vr24, %4\n\t"
+			"vld $vr25, %5\n\t"
+			"vld $vr26, %6\n\t"
+			"vld $vr27, %7\n\t"
+			"vxor.v $vr20, $vr20, $vr0\n\t"
+			"vxor.v $vr21, $vr21, $vr1\n\t"
+			"vxor.v $vr22, $vr22, $vr2\n\t"
+			"vxor.v $vr23, $vr23, $vr3\n\t"
+			"vxor.v $vr24, $vr24, $vr4\n\t"
+			"vxor.v $vr25, $vr25, $vr5\n\t"
+			"vxor.v $vr26, $vr26, $vr6\n\t"
+			"vxor.v $vr27, $vr27, $vr7\n\t"
+			"vst $vr20, %0\n\t"
+			"vst $vr21, %1\n\t"
+			"vst $vr22, %2\n\t"
+			"vst $vr23, %3\n\t"
+			"vst $vr24, %4\n\t"
+			"vst $vr25, %5\n\t"
+			"vst $vr26, %6\n\t"
+			"vst $vr27, %7\n\t"
+			: "+m"(p[d+NSIZE*0]), "+m"(p[d+NSIZE*1]),
+			  "+m"(p[d+NSIZE*2]), "+m"(p[d+NSIZE*3]),
+			  "+m"(q[d+NSIZE*0]), "+m"(q[d+NSIZE*1]),
+			  "+m"(q[d+NSIZE*2]), "+m"(q[d+NSIZE*3])
+		);
+	}
+
+	kernel_fpu_end();
+}
+
+const struct raid6_calls raid6_lsx = {
+	raid6_lsx_gen_syndrome,
+	raid6_lsx_xor_syndrome,
+	raid6_has_lsx,
+	"lsx",
+	.priority = 0 /* see the comment near the top of the file for reason */
+};
+
+#undef NSIZE
+#endif /* CONFIG_CPU_HAS_LSX */
+
+#ifdef CONFIG_CPU_HAS_LASX
+#define NSIZE 32
+
+static int raid6_has_lasx(void)
+{
+	return cpu_has_lasx;
+}
+
+static void raid6_lasx_gen_syndrome(int disks, size_t bytes, void **ptrs)
+{
+	u8 **dptr = (u8 **)ptrs;
+	u8 *p, *q;
+	int d, z, z0;
+
+	z0 = disks - 3;		/* Highest data disk */
+	p = dptr[z0+1];		/* XOR parity */
+	q = dptr[z0+2];		/* RS syndrome */
+
+	kernel_fpu_begin();
+
+	/*
+	 * $xr0, $xr1: wp
+	 * $xr2, $xr3: wq
+	 * $xr4, $xr5: wd
+	 * $xr6, $xr7: w2
+	 * $xr8, $xr9: w1
+	 */
+	for (d = 0; d < bytes; d += NSIZE*2) {
+		/* wq$$ = wp$$ = *(unative_t *)&dptr[z0][d+$$*NSIZE]; */
+		asm volatile("xvld $xr0, %0" : : "m"(dptr[z0][d+0*NSIZE]));
+		asm volatile("xvld $xr1, %0" : : "m"(dptr[z0][d+1*NSIZE]));
+		asm volatile("xvori.b $xr2, $xr0, 0");
+		asm volatile("xvori.b $xr3, $xr1, 0");
+		for (z = z0-1; z >= 0; z--) {
+			/* wd$$ = *(unative_t *)&dptr[z][d+$$*NSIZE]; */
+			asm volatile("xvld $xr4, %0" : : "m"(dptr[z][d+0*NSIZE]));
+			asm volatile("xvld $xr5, %0" : : "m"(dptr[z][d+1*NSIZE]));
+			/* wp$$ ^= wd$$; */
+			asm volatile("xvxor.v $xr0, $xr0, $xr4");
+			asm volatile("xvxor.v $xr1, $xr1, $xr5");
+			/* w2$$ = MASK(wq$$); */
+			asm volatile("xvslti.b $xr6, $xr2, 0");
+			asm volatile("xvslti.b $xr7, $xr3, 0");
+			/* w1$$ = SHLBYTE(wq$$); */
+			asm volatile("xvslli.b $xr8, $xr2, 1");
+			asm volatile("xvslli.b $xr9, $xr3, 1");
+			/* w2$$ &= NBYTES(0x1d); */
+			asm volatile("xvandi.b $xr6, $xr6, 0x1d");
+			asm volatile("xvandi.b $xr7, $xr7, 0x1d");
+			/* w1$$ ^= w2$$; */
+			asm volatile("xvxor.v $xr8, $xr8, $xr6");
+			asm volatile("xvxor.v $xr9, $xr9, $xr7");
+			/* wq$$ = w1$$ ^ wd$$; */
+			asm volatile("xvxor.v $xr2, $xr8, $xr4");
+			asm volatile("xvxor.v $xr3, $xr9, $xr5");
+		}
+		/* *(unative_t *)&p[d+NSIZE*$$] = wp$$; */
+		asm volatile("xvst $xr0, %0" : "=m"(p[d+NSIZE*0]));
+		asm volatile("xvst $xr1, %0" : "=m"(p[d+NSIZE*1]));
+		/* *(unative_t *)&q[d+NSIZE*$$] = wq$$; */
+		asm volatile("xvst $xr2, %0" : "=m"(q[d+NSIZE*0]));
+		asm volatile("xvst $xr3, %0" : "=m"(q[d+NSIZE*1]));
+	}
+
+	kernel_fpu_end();
+}
+
+static void raid6_lasx_xor_syndrome(int disks, int start, int stop,
+				    size_t bytes, void **ptrs)
+{
+	u8 **dptr = (u8 **)ptrs;
+	u8 *p, *q;
+	int d, z, z0;
+
+	z0 = stop;		/* P/Q right side optimization */
+	p = dptr[disks-2];	/* XOR parity */
+	q = dptr[disks-1];	/* RS syndrome */
+
+	kernel_fpu_begin();
+
+	/*
+	 * $xr0, $xr1: wp
+	 * $xr2, $xr3: wq
+	 * $xr4, $xr5: wd
+	 * $xr6, $xr7: w2
+	 * $xr8, $xr9: w1
+	 */
+	for (d = 0; d < bytes; d += NSIZE*2) {
+		/* P/Q data pages */
+		/* wq$$ = wp$$ = *(unative_t *)&dptr[z0][d+$$*NSIZE]; */
+		asm volatile("xvld $xr0, %0" : : "m"(dptr[z0][d+0*NSIZE]));
+		asm volatile("xvld $xr1, %0" : : "m"(dptr[z0][d+1*NSIZE]));
+		asm volatile("xvori.b $xr2, $xr0, 0");
+		asm volatile("xvori.b $xr3, $xr1, 0");
+		for (z = z0-1; z >= start; z--) {
+			/* wd$$ = *(unative_t *)&dptr[z][d+$$*NSIZE]; */
+			asm volatile("xvld $xr4, %0" : : "m"(dptr[z][d+0*NSIZE]));
+			asm volatile("xvld $xr5, %0" : : "m"(dptr[z][d+1*NSIZE]));
+			/* wp$$ ^= wd$$; */
+			asm volatile("xvxor.v $xr0, $xr0, $xr4");
+			asm volatile("xvxor.v $xr1, $xr1, $xr5");
+			/* w2$$ = MASK(wq$$); */
+			asm volatile("xvslti.b $xr6, $xr2, 0");
+			asm volatile("xvslti.b $xr7, $xr3, 0");
+			/* w1$$ = SHLBYTE(wq$$); */
+			asm volatile("xvslli.b $xr8, $xr2, 1");
+			asm volatile("xvslli.b $xr9, $xr3, 1");
+			/* w2$$ &= NBYTES(0x1d); */
+			asm volatile("xvandi.b $xr6, $xr6, 0x1d");
+			asm volatile("xvandi.b $xr7, $xr7, 0x1d");
+			/* w1$$ ^= w2$$; */
+			asm volatile("xvxor.v $xr8, $xr8, $xr6");
+			asm volatile("xvxor.v $xr9, $xr9, $xr7");
+			/* wq$$ = w1$$ ^ wd$$; */
+			asm volatile("xvxor.v $xr2, $xr8, $xr4");
+			asm volatile("xvxor.v $xr3, $xr9, $xr5");
+		}
+
+		/* P/Q left side optimization */
+		for (z = start-1; z >= 0; z--) {
+			/* w2$$ = MASK(wq$$); */
+			asm volatile("xvslti.b $xr6, $xr2, 0");
+			asm volatile("xvslti.b $xr7, $xr3, 0");
+			/* w1$$ = SHLBYTE(wq$$); */
+			asm volatile("xvslli.b $xr8, $xr2, 1");
+			asm volatile("xvslli.b $xr9, $xr3, 1");
+			/* w2$$ &= NBYTES(0x1d); */
+			asm volatile("xvandi.b $xr6, $xr6, 0x1d");
+			asm volatile("xvandi.b $xr7, $xr7, 0x1d");
+			/* wq$$ = w1$$ ^ w2$$; */
+			asm volatile("xvxor.v $xr2, $xr8, $xr6");
+			asm volatile("xvxor.v $xr3, $xr9, $xr7");
+		}
+		/*
+		 * *(unative_t *)&p[d+NSIZE*$$] ^= wp$$;
+		 * *(unative_t *)&q[d+NSIZE*$$] ^= wq$$;
+		 */
+		asm volatile(
+			"xvld $xr10, %0\n\t"
+			"xvld $xr11, %1\n\t"
+			"xvld $xr12, %2\n\t"
+			"xvld $xr13, %3\n\t"
+			"xvxor.v $xr10, $xr10, $xr0\n\t"
+			"xvxor.v $xr11, $xr11, $xr1\n\t"
+			"xvxor.v $xr12, $xr12, $xr2\n\t"
+			"xvxor.v $xr13, $xr13, $xr3\n\t"
+			"xvst $xr10, %0\n\t"
+			"xvst $xr11, %1\n\t"
+			"xvst $xr12, %2\n\t"
+			"xvst $xr13, %3\n\t"
+			: "+m"(p[d+NSIZE*0]), "+m"(p[d+NSIZE*1]),
+			  "+m"(q[d+NSIZE*0]), "+m"(q[d+NSIZE*1])
+		);
+	}
+
+	kernel_fpu_end();
+}
+
+const struct raid6_calls raid6_lasx = {
+	raid6_lasx_gen_syndrome,
+	raid6_lasx_xor_syndrome,
+	raid6_has_lasx,
+	"lasx",
+	.priority = 0 /* see the comment near the top of the file for reason */
+};
+#undef NSIZE
+#endif /* CONFIG_CPU_HAS_LASX */
diff --git a/lib/raid6/test/Makefile b/lib/raid6/test/Makefile
index 1f693ea3b980..7b244bce32b3 100644
--- a/lib/raid6/test/Makefile
+++ b/lib/raid6/test/Makefile
@@ -41,6 +41,16 @@ ifeq ($(findstring ppc,$(ARCH)),ppc)
                          gcc -c -x c - >/dev/null && rm ./-.o && echo yes)
 endif
 
+ifeq ($(ARCH),loongarch64)
+        CFLAGS += -I../../../arch/loongarch/include -DCONFIG_LOONGARCH=1
+        CFLAGS += $(shell echo 'vld $$vr0, $$zero, 0' |         \
+                    gcc -c -x assembler - >/dev/null 2>&1 &&    \
+                    rm ./-.o && echo -DCONFIG_CPU_HAS_LSX=1)
+        CFLAGS += $(shell echo 'xvld $$xr0, $$zero, 0' |        \
+                    gcc -c -x assembler - >/dev/null 2>&1 &&    \
+                    rm ./-.o && echo -DCONFIG_CPU_HAS_LASX=1)
+endif
+
 ifeq ($(IS_X86),yes)
         OBJS   += mmx.o sse1.o sse2.o avx2.o recov_ssse3.o recov_avx2.o avx512.o recov_avx512.o
         CFLAGS += -DCONFIG_X86
@@ -54,6 +64,8 @@ else ifeq ($(HAS_ALTIVEC),yes)
         CFLAGS += -DCONFIG_ALTIVEC
         OBJS += altivec1.o altivec2.o altivec4.o altivec8.o \
                 vpermxor1.o vpermxor2.o vpermxor4.o vpermxor8.o
+else ifeq ($(ARCH),loongarch64)
+        OBJS += loongarch_simd.o
 endif
 
 .c.o:
-- 
cgit v1.2.3


From f2091321044d9fbcadb93dfc1c9cf23e563ea40c Mon Sep 17 00:00:00 2001
From: WANG Xuerui <git@xen0n.name>
Date: Wed, 6 Sep 2023 22:53:55 +0800
Subject: raid6: Add LoongArch SIMD recovery implementation

Similar to the syndrome calculation, the recovery algorithms also work
on 64 bytes at a time to align with the L1 cache line size of current
and future LoongArch cores (that we care about). Which means
unrolled-by-4 LSX and unrolled-by-2 LASX code.

The assembly is originally based on the x86 SSSE3/AVX2 ports, but
register allocation has been redone to take advantage of LSX/LASX's 32
vector registers, and instruction sequence has been optimized to suit
(e.g. LoongArch can perform per-byte srl and andi on vectors, but x86
cannot).

Performance numbers measured by instrumenting the raid6test code, on a
3A5000 system clocked at 2.5GHz:

> lasx  2data: 354.987 MiB/s
> lasx  datap: 350.430 MiB/s
> lsx   2data: 340.026 MiB/s
> lsx   datap: 337.318 MiB/s
> intx1 2data: 164.280 MiB/s
> intx1 datap: 187.966 MiB/s

Because recovery algorithms are chosen solely based on priority and
availability, lasx is marked as priority 2 and lsx priority 1. At least
for the current generation of LoongArch micro-architectures, LASX should
always be faster than LSX whenever supported, and have similar power
consumption characteristics (because the only known LASX-capable uarch,
the LA464, always compute the full 256-bit result for vector ops).

Acked-by: Song Liu <song@kernel.org>
Signed-off-by: WANG Xuerui <git@xen0n.name>
Signed-off-by: Huacai Chen <chenhuacai@loongson.cn>
---
 include/linux/raid/pq.h          |   2 +
 lib/raid6/Makefile               |   2 +-
 lib/raid6/algos.c                |   8 +
 lib/raid6/recov_loongarch_simd.c | 513 +++++++++++++++++++++++++++++++++++++++
 lib/raid6/test/Makefile          |   2 +-
 5 files changed, 525 insertions(+), 2 deletions(-)
 create mode 100644 lib/raid6/recov_loongarch_simd.c

(limited to 'include/linux')

diff --git a/include/linux/raid/pq.h b/include/linux/raid/pq.h
index 874447485848..006e18decfad 100644
--- a/include/linux/raid/pq.h
+++ b/include/linux/raid/pq.h
@@ -125,6 +125,8 @@ extern const struct raid6_recov_calls raid6_recov_avx2;
 extern const struct raid6_recov_calls raid6_recov_avx512;
 extern const struct raid6_recov_calls raid6_recov_s390xc;
 extern const struct raid6_recov_calls raid6_recov_neon;
+extern const struct raid6_recov_calls raid6_recov_lsx;
+extern const struct raid6_recov_calls raid6_recov_lasx;
 
 extern const struct raid6_calls raid6_neonx1;
 extern const struct raid6_calls raid6_neonx2;
diff --git a/lib/raid6/Makefile b/lib/raid6/Makefile
index 2b9ebe105480..035b0a4db476 100644
--- a/lib/raid6/Makefile
+++ b/lib/raid6/Makefile
@@ -9,7 +9,7 @@ raid6_pq-$(CONFIG_ALTIVEC) += altivec1.o altivec2.o altivec4.o altivec8.o \
                               vpermxor1.o vpermxor2.o vpermxor4.o vpermxor8.o
 raid6_pq-$(CONFIG_KERNEL_MODE_NEON) += neon.o neon1.o neon2.o neon4.o neon8.o recov_neon.o recov_neon_inner.o
 raid6_pq-$(CONFIG_S390) += s390vx8.o recov_s390xc.o
-raid6_pq-$(CONFIG_LOONGARCH) += loongarch_simd.o
+raid6_pq-$(CONFIG_LOONGARCH) += loongarch_simd.o recov_loongarch_simd.o
 
 hostprogs	+= mktables
 
diff --git a/lib/raid6/algos.c b/lib/raid6/algos.c
index 739c7ebcae1a..0ec534faf019 100644
--- a/lib/raid6/algos.c
+++ b/lib/raid6/algos.c
@@ -111,6 +111,14 @@ const struct raid6_recov_calls *const raid6_recov_algos[] = {
 #endif
 #if defined(CONFIG_KERNEL_MODE_NEON)
 	&raid6_recov_neon,
+#endif
+#ifdef CONFIG_LOONGARCH
+#ifdef CONFIG_CPU_HAS_LASX
+	&raid6_recov_lasx,
+#endif
+#ifdef CONFIG_CPU_HAS_LSX
+	&raid6_recov_lsx,
+#endif
 #endif
 	&raid6_recov_intx1,
 	NULL
diff --git a/lib/raid6/recov_loongarch_simd.c b/lib/raid6/recov_loongarch_simd.c
new file mode 100644
index 000000000000..94aeac85e6f7
--- /dev/null
+++ b/lib/raid6/recov_loongarch_simd.c
@@ -0,0 +1,513 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * RAID6 recovery algorithms in LoongArch SIMD (LSX & LASX)
+ *
+ * Copyright (C) 2023 WANG Xuerui <git@xen0n.name>
+ *
+ * Originally based on recov_avx2.c and recov_ssse3.c:
+ *
+ * Copyright (C) 2012 Intel Corporation
+ * Author: Jim Kukunas <james.t.kukunas@linux.intel.com>
+ */
+
+#include <linux/raid/pq.h>
+#include "loongarch.h"
+
+/*
+ * Unlike with the syndrome calculation algorithms, there's no boot-time
+ * selection of recovery algorithms by benchmarking, so we have to specify
+ * the priorities and hope the future cores will all have decent vector
+ * support (i.e. no LASX slower than LSX, or even scalar code).
+ */
+
+#ifdef CONFIG_CPU_HAS_LSX
+static int raid6_has_lsx(void)
+{
+	return cpu_has_lsx;
+}
+
+static void raid6_2data_recov_lsx(int disks, size_t bytes, int faila,
+				  int failb, void **ptrs)
+{
+	u8 *p, *q, *dp, *dq;
+	const u8 *pbmul;	/* P multiplier table for B data */
+	const u8 *qmul;		/* Q multiplier table (for both) */
+
+	p = (u8 *)ptrs[disks - 2];
+	q = (u8 *)ptrs[disks - 1];
+
+	/*
+	 * Compute syndrome with zero for the missing data pages
+	 * Use the dead data pages as temporary storage for
+	 * delta p and delta q
+	 */
+	dp = (u8 *)ptrs[faila];
+	ptrs[faila] = (void *)raid6_empty_zero_page;
+	ptrs[disks - 2] = dp;
+	dq = (u8 *)ptrs[failb];
+	ptrs[failb] = (void *)raid6_empty_zero_page;
+	ptrs[disks - 1] = dq;
+
+	raid6_call.gen_syndrome(disks, bytes, ptrs);
+
+	/* Restore pointer table */
+	ptrs[faila] = dp;
+	ptrs[failb] = dq;
+	ptrs[disks - 2] = p;
+	ptrs[disks - 1] = q;
+
+	/* Now, pick the proper data tables */
+	pbmul = raid6_vgfmul[raid6_gfexi[failb - faila]];
+	qmul  = raid6_vgfmul[raid6_gfinv[raid6_gfexp[faila] ^ raid6_gfexp[failb]]];
+
+	kernel_fpu_begin();
+
+	/*
+	 * vr20, vr21: qmul
+	 * vr22, vr23: pbmul
+	 */
+	asm volatile("vld $vr20, %0" : : "m" (qmul[0]));
+	asm volatile("vld $vr21, %0" : : "m" (qmul[16]));
+	asm volatile("vld $vr22, %0" : : "m" (pbmul[0]));
+	asm volatile("vld $vr23, %0" : : "m" (pbmul[16]));
+
+	while (bytes) {
+		/* vr4 - vr7: Q */
+		asm volatile("vld $vr4, %0" : : "m" (q[0]));
+		asm volatile("vld $vr5, %0" : : "m" (q[16]));
+		asm volatile("vld $vr6, %0" : : "m" (q[32]));
+		asm volatile("vld $vr7, %0" : : "m" (q[48]));
+		/*  vr4 - vr7: Q + Qxy */
+		asm volatile("vld $vr8, %0" : : "m" (dq[0]));
+		asm volatile("vld $vr9, %0" : : "m" (dq[16]));
+		asm volatile("vld $vr10, %0" : : "m" (dq[32]));
+		asm volatile("vld $vr11, %0" : : "m" (dq[48]));
+		asm volatile("vxor.v $vr4, $vr4, $vr8");
+		asm volatile("vxor.v $vr5, $vr5, $vr9");
+		asm volatile("vxor.v $vr6, $vr6, $vr10");
+		asm volatile("vxor.v $vr7, $vr7, $vr11");
+		/* vr0 - vr3: P */
+		asm volatile("vld $vr0, %0" : : "m" (p[0]));
+		asm volatile("vld $vr1, %0" : : "m" (p[16]));
+		asm volatile("vld $vr2, %0" : : "m" (p[32]));
+		asm volatile("vld $vr3, %0" : : "m" (p[48]));
+		/* vr0 - vr3: P + Pxy */
+		asm volatile("vld $vr8, %0" : : "m" (dp[0]));
+		asm volatile("vld $vr9, %0" : : "m" (dp[16]));
+		asm volatile("vld $vr10, %0" : : "m" (dp[32]));
+		asm volatile("vld $vr11, %0" : : "m" (dp[48]));
+		asm volatile("vxor.v $vr0, $vr0, $vr8");
+		asm volatile("vxor.v $vr1, $vr1, $vr9");
+		asm volatile("vxor.v $vr2, $vr2, $vr10");
+		asm volatile("vxor.v $vr3, $vr3, $vr11");
+
+		/* vr8 - vr11: higher 4 bits of each byte of (Q + Qxy) */
+		asm volatile("vsrli.b $vr8, $vr4, 4");
+		asm volatile("vsrli.b $vr9, $vr5, 4");
+		asm volatile("vsrli.b $vr10, $vr6, 4");
+		asm volatile("vsrli.b $vr11, $vr7, 4");
+		/* vr4 - vr7: lower 4 bits of each byte of (Q + Qxy) */
+		asm volatile("vandi.b $vr4, $vr4, 0x0f");
+		asm volatile("vandi.b $vr5, $vr5, 0x0f");
+		asm volatile("vandi.b $vr6, $vr6, 0x0f");
+		asm volatile("vandi.b $vr7, $vr7, 0x0f");
+		/* lookup from qmul[0] */
+		asm volatile("vshuf.b $vr4, $vr20, $vr20, $vr4");
+		asm volatile("vshuf.b $vr5, $vr20, $vr20, $vr5");
+		asm volatile("vshuf.b $vr6, $vr20, $vr20, $vr6");
+		asm volatile("vshuf.b $vr7, $vr20, $vr20, $vr7");
+		/* lookup from qmul[16] */
+		asm volatile("vshuf.b $vr8, $vr21, $vr21, $vr8");
+		asm volatile("vshuf.b $vr9, $vr21, $vr21, $vr9");
+		asm volatile("vshuf.b $vr10, $vr21, $vr21, $vr10");
+		asm volatile("vshuf.b $vr11, $vr21, $vr21, $vr11");
+		/* vr16 - vr19: B(Q + Qxy) */
+		asm volatile("vxor.v $vr16, $vr8, $vr4");
+		asm volatile("vxor.v $vr17, $vr9, $vr5");
+		asm volatile("vxor.v $vr18, $vr10, $vr6");
+		asm volatile("vxor.v $vr19, $vr11, $vr7");
+
+		/* vr4 - vr7: higher 4 bits of each byte of (P + Pxy) */
+		asm volatile("vsrli.b $vr4, $vr0, 4");
+		asm volatile("vsrli.b $vr5, $vr1, 4");
+		asm volatile("vsrli.b $vr6, $vr2, 4");
+		asm volatile("vsrli.b $vr7, $vr3, 4");
+		/* vr12 - vr15: lower 4 bits of each byte of (P + Pxy) */
+		asm volatile("vandi.b $vr12, $vr0, 0x0f");
+		asm volatile("vandi.b $vr13, $vr1, 0x0f");
+		asm volatile("vandi.b $vr14, $vr2, 0x0f");
+		asm volatile("vandi.b $vr15, $vr3, 0x0f");
+		/* lookup from pbmul[0] */
+		asm volatile("vshuf.b $vr12, $vr22, $vr22, $vr12");
+		asm volatile("vshuf.b $vr13, $vr22, $vr22, $vr13");
+		asm volatile("vshuf.b $vr14, $vr22, $vr22, $vr14");
+		asm volatile("vshuf.b $vr15, $vr22, $vr22, $vr15");
+		/* lookup from pbmul[16] */
+		asm volatile("vshuf.b $vr4, $vr23, $vr23, $vr4");
+		asm volatile("vshuf.b $vr5, $vr23, $vr23, $vr5");
+		asm volatile("vshuf.b $vr6, $vr23, $vr23, $vr6");
+		asm volatile("vshuf.b $vr7, $vr23, $vr23, $vr7");
+		/* vr4 - vr7: A(P + Pxy) */
+		asm volatile("vxor.v $vr4, $vr4, $vr12");
+		asm volatile("vxor.v $vr5, $vr5, $vr13");
+		asm volatile("vxor.v $vr6, $vr6, $vr14");
+		asm volatile("vxor.v $vr7, $vr7, $vr15");
+
+		/* vr4 - vr7: A(P + Pxy) + B(Q + Qxy) = Dx */
+		asm volatile("vxor.v $vr4, $vr4, $vr16");
+		asm volatile("vxor.v $vr5, $vr5, $vr17");
+		asm volatile("vxor.v $vr6, $vr6, $vr18");
+		asm volatile("vxor.v $vr7, $vr7, $vr19");
+		asm volatile("vst $vr4, %0" : "=m" (dq[0]));
+		asm volatile("vst $vr5, %0" : "=m" (dq[16]));
+		asm volatile("vst $vr6, %0" : "=m" (dq[32]));
+		asm volatile("vst $vr7, %0" : "=m" (dq[48]));
+
+		/* vr0 - vr3: P + Pxy + Dx = Dy */
+		asm volatile("vxor.v $vr0, $vr0, $vr4");
+		asm volatile("vxor.v $vr1, $vr1, $vr5");
+		asm volatile("vxor.v $vr2, $vr2, $vr6");
+		asm volatile("vxor.v $vr3, $vr3, $vr7");
+		asm volatile("vst $vr0, %0" : "=m" (dp[0]));
+		asm volatile("vst $vr1, %0" : "=m" (dp[16]));
+		asm volatile("vst $vr2, %0" : "=m" (dp[32]));
+		asm volatile("vst $vr3, %0" : "=m" (dp[48]));
+
+		bytes -= 64;
+		p += 64;
+		q += 64;
+		dp += 64;
+		dq += 64;
+	}
+
+	kernel_fpu_end();
+}
+
+static void raid6_datap_recov_lsx(int disks, size_t bytes, int faila,
+				  void **ptrs)
+{
+	u8 *p, *q, *dq;
+	const u8 *qmul;		/* Q multiplier table */
+
+	p = (u8 *)ptrs[disks - 2];
+	q = (u8 *)ptrs[disks - 1];
+
+	/*
+	 * Compute syndrome with zero for the missing data page
+	 * Use the dead data page as temporary storage for delta q
+	 */
+	dq = (u8 *)ptrs[faila];
+	ptrs[faila] = (void *)raid6_empty_zero_page;
+	ptrs[disks - 1] = dq;
+
+	raid6_call.gen_syndrome(disks, bytes, ptrs);
+
+	/* Restore pointer table */
+	ptrs[faila] = dq;
+	ptrs[disks - 1] = q;
+
+	/* Now, pick the proper data tables */
+	qmul  = raid6_vgfmul[raid6_gfinv[raid6_gfexp[faila]]];
+
+	kernel_fpu_begin();
+
+	/* vr22, vr23: qmul */
+	asm volatile("vld $vr22, %0" : : "m" (qmul[0]));
+	asm volatile("vld $vr23, %0" : : "m" (qmul[16]));
+
+	while (bytes) {
+		/* vr0 - vr3: P + Dx */
+		asm volatile("vld $vr0, %0" : : "m" (p[0]));
+		asm volatile("vld $vr1, %0" : : "m" (p[16]));
+		asm volatile("vld $vr2, %0" : : "m" (p[32]));
+		asm volatile("vld $vr3, %0" : : "m" (p[48]));
+		/* vr4 - vr7: Qx */
+		asm volatile("vld $vr4, %0" : : "m" (dq[0]));
+		asm volatile("vld $vr5, %0" : : "m" (dq[16]));
+		asm volatile("vld $vr6, %0" : : "m" (dq[32]));
+		asm volatile("vld $vr7, %0" : : "m" (dq[48]));
+		/* vr4 - vr7: Q + Qx */
+		asm volatile("vld $vr8, %0" : : "m" (q[0]));
+		asm volatile("vld $vr9, %0" : : "m" (q[16]));
+		asm volatile("vld $vr10, %0" : : "m" (q[32]));
+		asm volatile("vld $vr11, %0" : : "m" (q[48]));
+		asm volatile("vxor.v $vr4, $vr4, $vr8");
+		asm volatile("vxor.v $vr5, $vr5, $vr9");
+		asm volatile("vxor.v $vr6, $vr6, $vr10");
+		asm volatile("vxor.v $vr7, $vr7, $vr11");
+
+		/* vr8 - vr11: higher 4 bits of each byte of (Q + Qx) */
+		asm volatile("vsrli.b $vr8, $vr4, 4");
+		asm volatile("vsrli.b $vr9, $vr5, 4");
+		asm volatile("vsrli.b $vr10, $vr6, 4");
+		asm volatile("vsrli.b $vr11, $vr7, 4");
+		/* vr4 - vr7: lower 4 bits of each byte of (Q + Qx) */
+		asm volatile("vandi.b $vr4, $vr4, 0x0f");
+		asm volatile("vandi.b $vr5, $vr5, 0x0f");
+		asm volatile("vandi.b $vr6, $vr6, 0x0f");
+		asm volatile("vandi.b $vr7, $vr7, 0x0f");
+		/* lookup from qmul[0] */
+		asm volatile("vshuf.b $vr4, $vr22, $vr22, $vr4");
+		asm volatile("vshuf.b $vr5, $vr22, $vr22, $vr5");
+		asm volatile("vshuf.b $vr6, $vr22, $vr22, $vr6");
+		asm volatile("vshuf.b $vr7, $vr22, $vr22, $vr7");
+		/* lookup from qmul[16] */
+		asm volatile("vshuf.b $vr8, $vr23, $vr23, $vr8");
+		asm volatile("vshuf.b $vr9, $vr23, $vr23, $vr9");
+		asm volatile("vshuf.b $vr10, $vr23, $vr23, $vr10");
+		asm volatile("vshuf.b $vr11, $vr23, $vr23, $vr11");
+		/* vr4 - vr7: qmul(Q + Qx) = Dx */
+		asm volatile("vxor.v $vr4, $vr4, $vr8");
+		asm volatile("vxor.v $vr5, $vr5, $vr9");
+		asm volatile("vxor.v $vr6, $vr6, $vr10");
+		asm volatile("vxor.v $vr7, $vr7, $vr11");
+		asm volatile("vst $vr4, %0" : "=m" (dq[0]));
+		asm volatile("vst $vr5, %0" : "=m" (dq[16]));
+		asm volatile("vst $vr6, %0" : "=m" (dq[32]));
+		asm volatile("vst $vr7, %0" : "=m" (dq[48]));
+
+		/* vr0 - vr3: P + Dx + Dx = P */
+		asm volatile("vxor.v $vr0, $vr0, $vr4");
+		asm volatile("vxor.v $vr1, $vr1, $vr5");
+		asm volatile("vxor.v $vr2, $vr2, $vr6");
+		asm volatile("vxor.v $vr3, $vr3, $vr7");
+		asm volatile("vst $vr0, %0" : "=m" (p[0]));
+		asm volatile("vst $vr1, %0" : "=m" (p[16]));
+		asm volatile("vst $vr2, %0" : "=m" (p[32]));
+		asm volatile("vst $vr3, %0" : "=m" (p[48]));
+
+		bytes -= 64;
+		p += 64;
+		q += 64;
+		dq += 64;
+	}
+
+	kernel_fpu_end();
+}
+
+const struct raid6_recov_calls raid6_recov_lsx = {
+	.data2 = raid6_2data_recov_lsx,
+	.datap = raid6_datap_recov_lsx,
+	.valid = raid6_has_lsx,
+	.name = "lsx",
+	.priority = 1,
+};
+#endif /* CONFIG_CPU_HAS_LSX */
+
+#ifdef CONFIG_CPU_HAS_LASX
+static int raid6_has_lasx(void)
+{
+	return cpu_has_lasx;
+}
+
+static void raid6_2data_recov_lasx(int disks, size_t bytes, int faila,
+				   int failb, void **ptrs)
+{
+	u8 *p, *q, *dp, *dq;
+	const u8 *pbmul;	/* P multiplier table for B data */
+	const u8 *qmul;		/* Q multiplier table (for both) */
+
+	p = (u8 *)ptrs[disks - 2];
+	q = (u8 *)ptrs[disks - 1];
+
+	/*
+	 * Compute syndrome with zero for the missing data pages
+	 * Use the dead data pages as temporary storage for
+	 * delta p and delta q
+	 */
+	dp = (u8 *)ptrs[faila];
+	ptrs[faila] = (void *)raid6_empty_zero_page;
+	ptrs[disks - 2] = dp;
+	dq = (u8 *)ptrs[failb];
+	ptrs[failb] = (void *)raid6_empty_zero_page;
+	ptrs[disks - 1] = dq;
+
+	raid6_call.gen_syndrome(disks, bytes, ptrs);
+
+	/* Restore pointer table */
+	ptrs[faila] = dp;
+	ptrs[failb] = dq;
+	ptrs[disks - 2] = p;
+	ptrs[disks - 1] = q;
+
+	/* Now, pick the proper data tables */
+	pbmul = raid6_vgfmul[raid6_gfexi[failb - faila]];
+	qmul  = raid6_vgfmul[raid6_gfinv[raid6_gfexp[faila] ^ raid6_gfexp[failb]]];
+
+	kernel_fpu_begin();
+
+	/*
+	 * xr20, xr21: qmul
+	 * xr22, xr23: pbmul
+	 */
+	asm volatile("vld $vr20, %0" : : "m" (qmul[0]));
+	asm volatile("vld $vr21, %0" : : "m" (qmul[16]));
+	asm volatile("vld $vr22, %0" : : "m" (pbmul[0]));
+	asm volatile("vld $vr23, %0" : : "m" (pbmul[16]));
+	asm volatile("xvreplve0.q $xr20, $xr20");
+	asm volatile("xvreplve0.q $xr21, $xr21");
+	asm volatile("xvreplve0.q $xr22, $xr22");
+	asm volatile("xvreplve0.q $xr23, $xr23");
+
+	while (bytes) {
+		/* xr0, xr1: Q */
+		asm volatile("xvld $xr0, %0" : : "m" (q[0]));
+		asm volatile("xvld $xr1, %0" : : "m" (q[32]));
+		/* xr0, xr1: Q + Qxy */
+		asm volatile("xvld $xr4, %0" : : "m" (dq[0]));
+		asm volatile("xvld $xr5, %0" : : "m" (dq[32]));
+		asm volatile("xvxor.v $xr0, $xr0, $xr4");
+		asm volatile("xvxor.v $xr1, $xr1, $xr5");
+		/* xr2, xr3: P */
+		asm volatile("xvld $xr2, %0" : : "m" (p[0]));
+		asm volatile("xvld $xr3, %0" : : "m" (p[32]));
+		/* xr2, xr3: P + Pxy */
+		asm volatile("xvld $xr4, %0" : : "m" (dp[0]));
+		asm volatile("xvld $xr5, %0" : : "m" (dp[32]));
+		asm volatile("xvxor.v $xr2, $xr2, $xr4");
+		asm volatile("xvxor.v $xr3, $xr3, $xr5");
+
+		/* xr4, xr5: higher 4 bits of each byte of (Q + Qxy) */
+		asm volatile("xvsrli.b $xr4, $xr0, 4");
+		asm volatile("xvsrli.b $xr5, $xr1, 4");
+		/* xr0, xr1: lower 4 bits of each byte of (Q + Qxy) */
+		asm volatile("xvandi.b $xr0, $xr0, 0x0f");
+		asm volatile("xvandi.b $xr1, $xr1, 0x0f");
+		/* lookup from qmul[0] */
+		asm volatile("xvshuf.b $xr0, $xr20, $xr20, $xr0");
+		asm volatile("xvshuf.b $xr1, $xr20, $xr20, $xr1");
+		/* lookup from qmul[16] */
+		asm volatile("xvshuf.b $xr4, $xr21, $xr21, $xr4");
+		asm volatile("xvshuf.b $xr5, $xr21, $xr21, $xr5");
+		/* xr6, xr7: B(Q + Qxy) */
+		asm volatile("xvxor.v $xr6, $xr4, $xr0");
+		asm volatile("xvxor.v $xr7, $xr5, $xr1");
+
+		/* xr4, xr5: higher 4 bits of each byte of (P + Pxy) */
+		asm volatile("xvsrli.b $xr4, $xr2, 4");
+		asm volatile("xvsrli.b $xr5, $xr3, 4");
+		/* xr0, xr1: lower 4 bits of each byte of (P + Pxy) */
+		asm volatile("xvandi.b $xr0, $xr2, 0x0f");
+		asm volatile("xvandi.b $xr1, $xr3, 0x0f");
+		/* lookup from pbmul[0] */
+		asm volatile("xvshuf.b $xr0, $xr22, $xr22, $xr0");
+		asm volatile("xvshuf.b $xr1, $xr22, $xr22, $xr1");
+		/* lookup from pbmul[16] */
+		asm volatile("xvshuf.b $xr4, $xr23, $xr23, $xr4");
+		asm volatile("xvshuf.b $xr5, $xr23, $xr23, $xr5");
+		/* xr0, xr1: A(P + Pxy) */
+		asm volatile("xvxor.v $xr0, $xr0, $xr4");
+		asm volatile("xvxor.v $xr1, $xr1, $xr5");
+
+		/* xr0, xr1: A(P + Pxy) + B(Q + Qxy) = Dx */
+		asm volatile("xvxor.v $xr0, $xr0, $xr6");
+		asm volatile("xvxor.v $xr1, $xr1, $xr7");
+
+		/* xr2, xr3: P + Pxy + Dx = Dy */
+		asm volatile("xvxor.v $xr2, $xr2, $xr0");
+		asm volatile("xvxor.v $xr3, $xr3, $xr1");
+
+		asm volatile("xvst $xr0, %0" : "=m" (dq[0]));
+		asm volatile("xvst $xr1, %0" : "=m" (dq[32]));
+		asm volatile("xvst $xr2, %0" : "=m" (dp[0]));
+		asm volatile("xvst $xr3, %0" : "=m" (dp[32]));
+
+		bytes -= 64;
+		p += 64;
+		q += 64;
+		dp += 64;
+		dq += 64;
+	}
+
+	kernel_fpu_end();
+}
+
+static void raid6_datap_recov_lasx(int disks, size_t bytes, int faila,
+				   void **ptrs)
+{
+	u8 *p, *q, *dq;
+	const u8 *qmul;		/* Q multiplier table */
+
+	p = (u8 *)ptrs[disks - 2];
+	q = (u8 *)ptrs[disks - 1];
+
+	/*
+	 * Compute syndrome with zero for the missing data page
+	 * Use the dead data page as temporary storage for delta q
+	 */
+	dq = (u8 *)ptrs[faila];
+	ptrs[faila] = (void *)raid6_empty_zero_page;
+	ptrs[disks - 1] = dq;
+
+	raid6_call.gen_syndrome(disks, bytes, ptrs);
+
+	/* Restore pointer table */
+	ptrs[faila] = dq;
+	ptrs[disks - 1] = q;
+
+	/* Now, pick the proper data tables */
+	qmul  = raid6_vgfmul[raid6_gfinv[raid6_gfexp[faila]]];
+
+	kernel_fpu_begin();
+
+	/* xr22, xr23: qmul */
+	asm volatile("vld $vr22, %0" : : "m" (qmul[0]));
+	asm volatile("xvreplve0.q $xr22, $xr22");
+	asm volatile("vld $vr23, %0" : : "m" (qmul[16]));
+	asm volatile("xvreplve0.q $xr23, $xr23");
+
+	while (bytes) {
+		/* xr0, xr1: P + Dx */
+		asm volatile("xvld $xr0, %0" : : "m" (p[0]));
+		asm volatile("xvld $xr1, %0" : : "m" (p[32]));
+		/* xr2, xr3: Qx */
+		asm volatile("xvld $xr2, %0" : : "m" (dq[0]));
+		asm volatile("xvld $xr3, %0" : : "m" (dq[32]));
+		/* xr2, xr3: Q + Qx */
+		asm volatile("xvld $xr4, %0" : : "m" (q[0]));
+		asm volatile("xvld $xr5, %0" : : "m" (q[32]));
+		asm volatile("xvxor.v $xr2, $xr2, $xr4");
+		asm volatile("xvxor.v $xr3, $xr3, $xr5");
+
+		/* xr4, xr5: higher 4 bits of each byte of (Q + Qx) */
+		asm volatile("xvsrli.b $xr4, $xr2, 4");
+		asm volatile("xvsrli.b $xr5, $xr3, 4");
+		/* xr2, xr3: lower 4 bits of each byte of (Q + Qx) */
+		asm volatile("xvandi.b $xr2, $xr2, 0x0f");
+		asm volatile("xvandi.b $xr3, $xr3, 0x0f");
+		/* lookup from qmul[0] */
+		asm volatile("xvshuf.b $xr2, $xr22, $xr22, $xr2");
+		asm volatile("xvshuf.b $xr3, $xr22, $xr22, $xr3");
+		/* lookup from qmul[16] */
+		asm volatile("xvshuf.b $xr4, $xr23, $xr23, $xr4");
+		asm volatile("xvshuf.b $xr5, $xr23, $xr23, $xr5");
+		/* xr2, xr3: qmul(Q + Qx) = Dx */
+		asm volatile("xvxor.v $xr2, $xr2, $xr4");
+		asm volatile("xvxor.v $xr3, $xr3, $xr5");
+
+		/* xr0, xr1: P + Dx + Dx = P */
+		asm volatile("xvxor.v $xr0, $xr0, $xr2");
+		asm volatile("xvxor.v $xr1, $xr1, $xr3");
+
+		asm volatile("xvst $xr2, %0" : "=m" (dq[0]));
+		asm volatile("xvst $xr3, %0" : "=m" (dq[32]));
+		asm volatile("xvst $xr0, %0" : "=m" (p[0]));
+		asm volatile("xvst $xr1, %0" : "=m" (p[32]));
+
+		bytes -= 64;
+		p += 64;
+		q += 64;
+		dq += 64;
+	}
+
+	kernel_fpu_end();
+}
+
+const struct raid6_recov_calls raid6_recov_lasx = {
+	.data2 = raid6_2data_recov_lasx,
+	.datap = raid6_datap_recov_lasx,
+	.valid = raid6_has_lasx,
+	.name = "lasx",
+	.priority = 2,
+};
+#endif /* CONFIG_CPU_HAS_LASX */
diff --git a/lib/raid6/test/Makefile b/lib/raid6/test/Makefile
index 7b244bce32b3..2abe0076a636 100644
--- a/lib/raid6/test/Makefile
+++ b/lib/raid6/test/Makefile
@@ -65,7 +65,7 @@ else ifeq ($(HAS_ALTIVEC),yes)
         OBJS += altivec1.o altivec2.o altivec4.o altivec8.o \
                 vpermxor1.o vpermxor2.o vpermxor4.o vpermxor8.o
 else ifeq ($(ARCH),loongarch64)
-        OBJS += loongarch_simd.o
+        OBJS += loongarch_simd.o recov_loongarch_simd.o
 endif
 
 .c.o:
-- 
cgit v1.2.3


From 9b04c764af18a1dab6d48ca0671f70cdcccf90a2 Mon Sep 17 00:00:00 2001
From: Qing Zhang <zhangqing@loongson.cn>
Date: Wed, 6 Sep 2023 22:54:16 +0800
Subject: kasan: Add __HAVE_ARCH_SHADOW_MAP to support arch specific mapping

MIPS, LoongArch and some other architectures have many holes between
different segments and the valid address space (256T available) is
insufficient to map all these segments to kasan shadow memory with the
common formula provided by kasan core. So we need architecture specific
mapping formulas to ensure different segments are mapped individually,
and only limited space lengths of those specific segments are mapped to
shadow.

Therefore, when the incoming address is converted to a shadow, we need
to add a condition to determine whether it is valid.

Reviewed-by: Andrey Konovalov <andreyknvl@gmail.com>
Signed-off-by: Qing Zhang <zhangqing@loongson.cn>
Signed-off-by: Huacai Chen <chenhuacai@loongson.cn>
---
 include/linux/kasan.h | 2 ++
 mm/kasan/kasan.h      | 6 ++++++
 2 files changed, 8 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/kasan.h b/include/linux/kasan.h
index 819b6bc8ac08..3df5499f7936 100644
--- a/include/linux/kasan.h
+++ b/include/linux/kasan.h
@@ -54,11 +54,13 @@ extern p4d_t kasan_early_shadow_p4d[MAX_PTRS_PER_P4D];
 int kasan_populate_early_shadow(const void *shadow_start,
 				const void *shadow_end);
 
+#ifndef __HAVE_ARCH_SHADOW_MAP
 static inline void *kasan_mem_to_shadow(const void *addr)
 {
 	return (void *)((unsigned long)addr >> KASAN_SHADOW_SCALE_SHIFT)
 		+ KASAN_SHADOW_OFFSET;
 }
+#endif
 
 int kasan_add_zero_shadow(void *start, unsigned long size);
 void kasan_remove_zero_shadow(void *start, unsigned long size);
diff --git a/mm/kasan/kasan.h b/mm/kasan/kasan.h
index 2e973b36fe07..f70e3d7a602e 100644
--- a/mm/kasan/kasan.h
+++ b/mm/kasan/kasan.h
@@ -291,16 +291,22 @@ struct kasan_stack_ring {
 
 #if defined(CONFIG_KASAN_GENERIC) || defined(CONFIG_KASAN_SW_TAGS)
 
+#ifndef __HAVE_ARCH_SHADOW_MAP
 static inline const void *kasan_shadow_to_mem(const void *shadow_addr)
 {
 	return (void *)(((unsigned long)shadow_addr - KASAN_SHADOW_OFFSET)
 		<< KASAN_SHADOW_SCALE_SHIFT);
 }
+#endif
 
 static __always_inline bool addr_has_metadata(const void *addr)
 {
+#ifdef __HAVE_ARCH_SHADOW_MAP
+	return (kasan_mem_to_shadow((void *)addr) != NULL);
+#else
 	return (kasan_reset_tag(addr) >=
 		kasan_shadow_to_mem((void *)KASAN_SHADOW_START));
+#endif
 }
 
 /**
-- 
cgit v1.2.3


From 08c6d8bae48c2c28f7017d7b61b5d5a1518ceb39 Mon Sep 17 00:00:00 2001
From: Lukasz Majewski <lukma@denx.de>
Date: Tue, 5 Sep 2023 11:33:15 +0200
Subject: net: phy: Provide Module 4 KSZ9477 errata (DS80000754C)

The KSZ9477 errata points out (in 'Module 4') the link up/down problems
when EEE (Energy Efficient Ethernet) is enabled in the device to which
the KSZ9477 tries to auto negotiate.

The suggested workaround is to clear advertisement of EEE for PHYs in
this chip driver.

To avoid regressions with other switch ICs the new MICREL_NO_EEE flag
has been introduced.

Moreover, the in-register disablement of MMD_DEVICE_ID_EEE_ADV.MMD_EEE_ADV
MMD register is removed, as this code is both; now executed too late
(after previous rework of the PHY and DSA for KSZ switches) and not
required as setting all members of eee_broken_modes bit field prevents
the KSZ9477 from advertising EEE.

Fixes: 69d3b36ca045 ("net: dsa: microchip: enable EEE support") # for KSZ9477
Signed-off-by: Lukasz Majewski <lukma@denx.de>
Tested-by: Oleksij Rempel <o.rempel@pengutronix.de> # Confirmed disabled EEE with oscilloscope.
Reviewed-by: Oleksij Rempel <o.rempel@pengutronix.de>
Reviewed-by: Florian Fainelli <florian.fainelli@broadcom.com>
Link: https://lore.kernel.org/r/20230905093315.784052-1-lukma@denx.de
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 drivers/net/dsa/microchip/ksz_common.c | 16 +++++++++++++++-
 drivers/net/phy/micrel.c               |  9 ++++++---
 include/linux/micrel_phy.h             |  1 +
 3 files changed, 22 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/dsa/microchip/ksz_common.c b/drivers/net/dsa/microchip/ksz_common.c
index 6673122266b7..42db7679c360 100644
--- a/drivers/net/dsa/microchip/ksz_common.c
+++ b/drivers/net/dsa/microchip/ksz_common.c
@@ -2335,13 +2335,27 @@ static u32 ksz_get_phy_flags(struct dsa_switch *ds, int port)
 {
 	struct ksz_device *dev = ds->priv;
 
-	if (dev->chip_id == KSZ8830_CHIP_ID) {
+	switch (dev->chip_id) {
+	case KSZ8830_CHIP_ID:
 		/* Silicon Errata Sheet (DS80000830A):
 		 * Port 1 does not work with LinkMD Cable-Testing.
 		 * Port 1 does not respond to received PAUSE control frames.
 		 */
 		if (!port)
 			return MICREL_KSZ8_P1_ERRATA;
+		break;
+	case KSZ9477_CHIP_ID:
+		/* KSZ9477 Errata DS80000754C
+		 *
+		 * Module 4: Energy Efficient Ethernet (EEE) feature select must
+		 * be manually disabled
+		 *   The EEE feature is enabled by default, but it is not fully
+		 *   operational. It must be manually disabled through register
+		 *   controls. If not disabled, the PHY ports can auto-negotiate
+		 *   to enable EEE, and this feature can cause link drops when
+		 *   linked to another device supporting EEE.
+		 */
+		return MICREL_NO_EEE;
 	}
 
 	return 0;
diff --git a/drivers/net/phy/micrel.c b/drivers/net/phy/micrel.c
index b6d7981b2d1e..927d3d54658e 100644
--- a/drivers/net/phy/micrel.c
+++ b/drivers/net/phy/micrel.c
@@ -1800,9 +1800,6 @@ static const struct ksz9477_errata_write ksz9477_errata_writes[] = {
 	/* Transmit waveform amplitude can be improved (1000BASE-T, 100BASE-TX, 10BASE-Te) */
 	{0x1c, 0x04, 0x00d0},
 
-	/* Energy Efficient Ethernet (EEE) feature select must be manually disabled */
-	{0x07, 0x3c, 0x0000},
-
 	/* Register settings are required to meet data sheet supply current specifications */
 	{0x1c, 0x13, 0x6eff},
 	{0x1c, 0x14, 0xe6ff},
@@ -1847,6 +1844,12 @@ static int ksz9477_config_init(struct phy_device *phydev)
 			return err;
 	}
 
+	/* According to KSZ9477 Errata DS80000754C (Module 4) all EEE modes
+	 * in this switch shall be regarded as broken.
+	 */
+	if (phydev->dev_flags & MICREL_NO_EEE)
+		phydev->eee_broken_modes = -1;
+
 	err = genphy_restart_aneg(phydev);
 	if (err)
 		return err;
diff --git a/include/linux/micrel_phy.h b/include/linux/micrel_phy.h
index 322d87255984..4e27ca7c49de 100644
--- a/include/linux/micrel_phy.h
+++ b/include/linux/micrel_phy.h
@@ -44,6 +44,7 @@
 #define MICREL_PHY_50MHZ_CLK	BIT(0)
 #define MICREL_PHY_FXEN		BIT(1)
 #define MICREL_KSZ8_P1_ERRATA	BIT(2)
+#define MICREL_NO_EEE		BIT(3)
 
 #define MICREL_KSZ9021_EXTREG_CTRL	0xB
 #define MICREL_KSZ9021_EXTREG_DATA_WRITE	0xC
-- 
cgit v1.2.3


From 6afcf0fb92701487421aa73c692855aa70fbc796 Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <kuba@kernel.org>
Date: Thu, 7 Sep 2023 11:01:04 -0700
Subject: Revert "net: team: do not use dynamic lockdep key"

This reverts commit 39285e124edbc752331e98ace37cc141a6a3747a.

Looks like the change has unintended consequences in exposing
objects before they are initialized. Let's drop this patch
and try again in net-next.

Reported-by: syzbot+44ae022028805f4600fc@syzkaller.appspotmail.com
Fixes: 39285e124edb ("net: team: do not use dynamic lockdep key")
Link: https://lore.kernel.org/all/20230907103124.6adb7256@kernel.org/
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 drivers/net/team/team.c                  | 111 ++++++++++++++++---------------
 drivers/net/team/team_mode_loadbalance.c |   4 +-
 include/linux/if_team.h                  |  30 +--------
 3 files changed, 60 insertions(+), 85 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/team/team.c b/drivers/net/team/team.c
index ad29122a5468..e8b94580194e 100644
--- a/drivers/net/team/team.c
+++ b/drivers/net/team/team.c
@@ -1135,8 +1135,8 @@ static int team_port_add(struct team *team, struct net_device *port_dev,
 			 struct netlink_ext_ack *extack)
 {
 	struct net_device *dev = team->dev;
-	char *portname = port_dev->name;
 	struct team_port *port;
+	char *portname = port_dev->name;
 	int err;
 
 	if (port_dev->flags & IFF_LOOPBACK) {
@@ -1203,31 +1203,18 @@ static int team_port_add(struct team *team, struct net_device *port_dev,
 
 	memcpy(port->orig.dev_addr, port_dev->dev_addr, port_dev->addr_len);
 
-	err = dev_open(port_dev, extack);
-	if (err) {
-		netdev_dbg(dev, "Device %s opening failed\n",
-			   portname);
-		goto err_dev_open;
-	}
-
-	err = team_upper_dev_link(team, port, extack);
+	err = team_port_enter(team, port);
 	if (err) {
-		netdev_err(dev, "Device %s failed to set upper link\n",
+		netdev_err(dev, "Device %s failed to enter team mode\n",
 			   portname);
-		goto err_set_upper_link;
+		goto err_port_enter;
 	}
 
-	/* lockdep subclass variable(dev->nested_level) was updated by
-	 * team_upper_dev_link().
-	 */
-	team_unlock(team);
-	team_lock(team);
-
-	err = team_port_enter(team, port);
+	err = dev_open(port_dev, extack);
 	if (err) {
-		netdev_err(dev, "Device %s failed to enter team mode\n",
+		netdev_dbg(dev, "Device %s opening failed\n",
 			   portname);
-		goto err_port_enter;
+		goto err_dev_open;
 	}
 
 	err = vlan_vids_add_by_dev(port_dev, dev);
@@ -1255,6 +1242,13 @@ static int team_port_add(struct team *team, struct net_device *port_dev,
 		goto err_handler_register;
 	}
 
+	err = team_upper_dev_link(team, port, extack);
+	if (err) {
+		netdev_err(dev, "Device %s failed to set upper link\n",
+			   portname);
+		goto err_set_upper_link;
+	}
+
 	err = __team_option_inst_add_port(team, port);
 	if (err) {
 		netdev_err(dev, "Device %s failed to add per-port options\n",
@@ -1301,6 +1295,9 @@ err_set_slave_promisc:
 	__team_option_inst_del_port(team, port);
 
 err_option_port_add:
+	team_upper_dev_unlink(team, port);
+
+err_set_upper_link:
 	netdev_rx_handler_unregister(port_dev);
 
 err_handler_register:
@@ -1310,16 +1307,13 @@ err_enable_netpoll:
 	vlan_vids_del_by_dev(port_dev, dev);
 
 err_vids_add:
-	team_port_leave(team, port);
-
-err_port_enter:
-	team_upper_dev_unlink(team, port);
-
-err_set_upper_link:
 	dev_close(port_dev);
 
 err_dev_open:
+	team_port_leave(team, port);
 	team_port_set_orig_dev_addr(port);
+
+err_port_enter:
 	dev_set_mtu(port_dev, port->orig.mtu);
 
 err_set_mtu:
@@ -1622,7 +1616,6 @@ static int team_init(struct net_device *dev)
 	int err;
 
 	team->dev = dev;
-	mutex_init(&team->lock);
 	team_set_no_mode(team);
 	team->notifier_ctx = false;
 
@@ -1650,6 +1643,8 @@ static int team_init(struct net_device *dev)
 		goto err_options_register;
 	netif_carrier_off(dev);
 
+	lockdep_register_key(&team->team_lock_key);
+	__mutex_init(&team->lock, "team->team_lock_key", &team->team_lock_key);
 	netdev_lockdep_set_classes(dev);
 
 	return 0;
@@ -1670,7 +1665,7 @@ static void team_uninit(struct net_device *dev)
 	struct team_port *port;
 	struct team_port *tmp;
 
-	team_lock(team);
+	mutex_lock(&team->lock);
 	list_for_each_entry_safe(port, tmp, &team->port_list, list)
 		team_port_del(team, port->dev);
 
@@ -1679,8 +1674,9 @@ static void team_uninit(struct net_device *dev)
 	team_mcast_rejoin_fini(team);
 	team_notify_peers_fini(team);
 	team_queue_override_fini(team);
-	team_unlock(team);
+	mutex_unlock(&team->lock);
 	netdev_change_features(dev);
+	lockdep_unregister_key(&team->team_lock_key);
 }
 
 static void team_destructor(struct net_device *dev)
@@ -1794,18 +1790,18 @@ static void team_set_rx_mode(struct net_device *dev)
 
 static int team_set_mac_address(struct net_device *dev, void *p)
 {
-	struct team *team = netdev_priv(dev);
 	struct sockaddr *addr = p;
+	struct team *team = netdev_priv(dev);
 	struct team_port *port;
 
 	if (dev->type == ARPHRD_ETHER && !is_valid_ether_addr(addr->sa_data))
 		return -EADDRNOTAVAIL;
 	dev_addr_set(dev, addr->sa_data);
-	team_lock(team);
+	mutex_lock(&team->lock);
 	list_for_each_entry(port, &team->port_list, list)
 		if (team->ops.port_change_dev_addr)
 			team->ops.port_change_dev_addr(team, port);
-	team_unlock(team);
+	mutex_unlock(&team->lock);
 	return 0;
 }
 
@@ -1819,7 +1815,7 @@ static int team_change_mtu(struct net_device *dev, int new_mtu)
 	 * Alhough this is reader, it's guarded by team lock. It's not possible
 	 * to traverse list in reverse under rcu_read_lock
 	 */
-	team_lock(team);
+	mutex_lock(&team->lock);
 	team->port_mtu_change_allowed = true;
 	list_for_each_entry(port, &team->port_list, list) {
 		err = dev_set_mtu(port->dev, new_mtu);
@@ -1830,7 +1826,7 @@ static int team_change_mtu(struct net_device *dev, int new_mtu)
 		}
 	}
 	team->port_mtu_change_allowed = false;
-	team_unlock(team);
+	mutex_unlock(&team->lock);
 
 	dev->mtu = new_mtu;
 
@@ -1840,7 +1836,7 @@ unwind:
 	list_for_each_entry_continue_reverse(port, &team->port_list, list)
 		dev_set_mtu(port->dev, dev->mtu);
 	team->port_mtu_change_allowed = false;
-	team_unlock(team);
+	mutex_unlock(&team->lock);
 
 	return err;
 }
@@ -1894,20 +1890,20 @@ static int team_vlan_rx_add_vid(struct net_device *dev, __be16 proto, u16 vid)
 	 * Alhough this is reader, it's guarded by team lock. It's not possible
 	 * to traverse list in reverse under rcu_read_lock
 	 */
-	team_lock(team);
+	mutex_lock(&team->lock);
 	list_for_each_entry(port, &team->port_list, list) {
 		err = vlan_vid_add(port->dev, proto, vid);
 		if (err)
 			goto unwind;
 	}
-	team_unlock(team);
+	mutex_unlock(&team->lock);
 
 	return 0;
 
 unwind:
 	list_for_each_entry_continue_reverse(port, &team->port_list, list)
 		vlan_vid_del(port->dev, proto, vid);
-	team_unlock(team);
+	mutex_unlock(&team->lock);
 
 	return err;
 }
@@ -1917,10 +1913,10 @@ static int team_vlan_rx_kill_vid(struct net_device *dev, __be16 proto, u16 vid)
 	struct team *team = netdev_priv(dev);
 	struct team_port *port;
 
-	team_lock(team);
+	mutex_lock(&team->lock);
 	list_for_each_entry(port, &team->port_list, list)
 		vlan_vid_del(port->dev, proto, vid);
-	team_unlock(team);
+	mutex_unlock(&team->lock);
 
 	return 0;
 }
@@ -1942,9 +1938,9 @@ static void team_netpoll_cleanup(struct net_device *dev)
 {
 	struct team *team = netdev_priv(dev);
 
-	team_lock(team);
+	mutex_lock(&team->lock);
 	__team_netpoll_cleanup(team);
-	team_unlock(team);
+	mutex_unlock(&team->lock);
 }
 
 static int team_netpoll_setup(struct net_device *dev,
@@ -1954,7 +1950,7 @@ static int team_netpoll_setup(struct net_device *dev,
 	struct team_port *port;
 	int err = 0;
 
-	team_lock(team);
+	mutex_lock(&team->lock);
 	list_for_each_entry(port, &team->port_list, list) {
 		err = __team_port_enable_netpoll(port);
 		if (err) {
@@ -1962,7 +1958,7 @@ static int team_netpoll_setup(struct net_device *dev,
 			break;
 		}
 	}
-	team_unlock(team);
+	mutex_unlock(&team->lock);
 	return err;
 }
 #endif
@@ -1973,9 +1969,9 @@ static int team_add_slave(struct net_device *dev, struct net_device *port_dev,
 	struct team *team = netdev_priv(dev);
 	int err;
 
-	team_lock(team);
+	mutex_lock(&team->lock);
 	err = team_port_add(team, port_dev, extack);
-	team_unlock(team);
+	mutex_unlock(&team->lock);
 
 	if (!err)
 		netdev_change_features(dev);
@@ -1988,12 +1984,19 @@ static int team_del_slave(struct net_device *dev, struct net_device *port_dev)
 	struct team *team = netdev_priv(dev);
 	int err;
 
-	team_lock(team);
+	mutex_lock(&team->lock);
 	err = team_port_del(team, port_dev);
-	team_unlock(team);
+	mutex_unlock(&team->lock);
 
-	if (!err)
-		netdev_change_features(dev);
+	if (err)
+		return err;
+
+	if (netif_is_team_master(port_dev)) {
+		lockdep_unregister_key(&team->team_lock_key);
+		lockdep_register_key(&team->team_lock_key);
+		lockdep_set_class(&team->lock, &team->team_lock_key);
+	}
+	netdev_change_features(dev);
 
 	return err;
 }
@@ -2313,13 +2316,13 @@ static struct team *team_nl_team_get(struct genl_info *info)
 	}
 
 	team = netdev_priv(dev);
-	__team_lock(team);
+	mutex_lock(&team->lock);
 	return team;
 }
 
 static void team_nl_team_put(struct team *team)
 {
-	team_unlock(team);
+	mutex_unlock(&team->lock);
 	dev_put(team->dev);
 }
 
@@ -2981,9 +2984,9 @@ static void team_port_change_check(struct team_port *port, bool linkup)
 {
 	struct team *team = port->team;
 
-	team_lock(team);
+	mutex_lock(&team->lock);
 	__team_port_change_check(port, linkup);
-	team_unlock(team);
+	mutex_unlock(&team->lock);
 }
 
 
diff --git a/drivers/net/team/team_mode_loadbalance.c b/drivers/net/team/team_mode_loadbalance.c
index 7bcc9d37447a..00f8989c29c0 100644
--- a/drivers/net/team/team_mode_loadbalance.c
+++ b/drivers/net/team/team_mode_loadbalance.c
@@ -478,7 +478,7 @@ static void lb_stats_refresh(struct work_struct *work)
 	team = lb_priv_ex->team;
 	lb_priv = get_lb_priv(team);
 
-	if (!team_trylock(team)) {
+	if (!mutex_trylock(&team->lock)) {
 		schedule_delayed_work(&lb_priv_ex->stats.refresh_dw, 0);
 		return;
 	}
@@ -515,7 +515,7 @@ static void lb_stats_refresh(struct work_struct *work)
 	schedule_delayed_work(&lb_priv_ex->stats.refresh_dw,
 			      (lb_priv_ex->stats.refresh_interval * HZ) / 10);
 
-	team_unlock(team);
+	mutex_unlock(&team->lock);
 }
 
 static void lb_stats_refresh_interval_get(struct team *team,
diff --git a/include/linux/if_team.h b/include/linux/if_team.h
index 12d4447fc8ab..1b9b15a492fa 100644
--- a/include/linux/if_team.h
+++ b/include/linux/if_team.h
@@ -221,38 +221,10 @@ struct team {
 		atomic_t count_pending;
 		struct delayed_work dw;
 	} mcast_rejoin;
+	struct lock_class_key team_lock_key;
 	long mode_priv[TEAM_MODE_PRIV_LONGS];
 };
 
-static inline void __team_lock(struct team *team)
-{
-	mutex_lock(&team->lock);
-}
-
-static inline int team_trylock(struct team *team)
-{
-	return mutex_trylock(&team->lock);
-}
-
-#ifdef CONFIG_LOCKDEP
-static inline void team_lock(struct team *team)
-{
-	ASSERT_RTNL();
-	mutex_lock_nested(&team->lock, team->dev->nested_level);
-}
-
-#else
-static inline void team_lock(struct team *team)
-{
-	__team_lock(team);
-}
-#endif
-
-static inline void team_unlock(struct team *team)
-{
-	mutex_unlock(&team->lock);
-}
-
 static inline int team_dev_queue_xmit(struct team *team, struct team_port *port,
 				      struct sk_buff *skb)
 {
-- 
cgit v1.2.3


From f94cf2206b066bd6d761d3347fd35f77b828c376 Mon Sep 17 00:00:00 2001
From: "Matthew Wilcox (Oracle)" <willy@infradead.org>
Date: Thu, 7 Sep 2023 09:40:07 -0400
Subject: buffer: Make bh_offset() work for compound pages

If the buffer pointed to by the buffer_head is part of a compound page,
bh_offset() assumes that b_page is the precise page that contains
the data.  A recent change to jbd2 inadvertently violated that assumption.

By using page_size(), we support both b_page being set to the head page
(as page_size() will return the size of the entire folio) and the precise
page (as it will return PAGE_SIZE for a tail page).

Fixes: 8147c4c4546f ("jbd2: use a folio in jbd2_journal_write_metadata_buffer()")
Reported-by: Zorro Lang <zlang@kernel.org>
Tested-by: Ritesh Harjani (IBM) <ritesh.list@gmail.com>
Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
---
 include/linux/buffer_head.h | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h
index 6cb3e9af78c9..4ba242073adc 100644
--- a/include/linux/buffer_head.h
+++ b/include/linux/buffer_head.h
@@ -173,7 +173,10 @@ static __always_inline int buffer_uptodate(const struct buffer_head *bh)
 	return test_bit_acquire(BH_Uptodate, &bh->b_state);
 }
 
-#define bh_offset(bh)		((unsigned long)(bh)->b_data & ~PAGE_MASK)
+static inline unsigned long bh_offset(const struct buffer_head *bh)
+{
+	return (unsigned long)(bh)->b_data & (page_size(bh->b_page) - 1);
+}
 
 /* If we *know* page->private refers to buffer_heads */
 #define page_buffers(page)					\
-- 
cgit v1.2.3


From 41a5db8d8161457b121a03fde999ff6e00090ee2 Mon Sep 17 00:00:00 2001
From: Yonghong Song <yonghong.song@linux.dev>
Date: Sun, 27 Aug 2023 08:27:34 -0700
Subject: bpf: Add support for non-fix-size percpu mem allocation

This is needed for later percpu mem allocation when the
allocation is done by bpf program. For such cases, a global
bpf_global_percpu_ma is added where a flexible allocation
size is needed.

Signed-off-by: Yonghong Song <yonghong.song@linux.dev>
Link: https://lore.kernel.org/r/20230827152734.1995725-1-yonghong.song@linux.dev
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/bpf.h   |  4 ++--
 kernel/bpf/core.c     |  8 +++++---
 kernel/bpf/memalloc.c | 14 ++++++--------
 3 files changed, 13 insertions(+), 13 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 024e8b28c34b..440dd1f59a1c 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -55,8 +55,8 @@ struct cgroup;
 extern struct idr btf_idr;
 extern spinlock_t btf_idr_lock;
 extern struct kobject *btf_kobj;
-extern struct bpf_mem_alloc bpf_global_ma;
-extern bool bpf_global_ma_set;
+extern struct bpf_mem_alloc bpf_global_ma, bpf_global_percpu_ma;
+extern bool bpf_global_ma_set, bpf_global_percpu_ma_set;
 
 typedef u64 (*bpf_callback_t)(u64, u64, u64, u64, u64);
 typedef int (*bpf_iter_init_seq_priv_t)(void *private_data,
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index 0f8f036d8bd1..95599df82ee4 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -64,8 +64,8 @@
 #define OFF	insn->off
 #define IMM	insn->imm
 
-struct bpf_mem_alloc bpf_global_ma;
-bool bpf_global_ma_set;
+struct bpf_mem_alloc bpf_global_ma, bpf_global_percpu_ma;
+bool bpf_global_ma_set, bpf_global_percpu_ma_set;
 
 /* No hurry in this branch
  *
@@ -2921,7 +2921,9 @@ static int __init bpf_global_ma_init(void)
 
 	ret = bpf_mem_alloc_init(&bpf_global_ma, 0, false);
 	bpf_global_ma_set = !ret;
-	return ret;
+	ret = bpf_mem_alloc_init(&bpf_global_percpu_ma, 0, true);
+	bpf_global_percpu_ma_set = !ret;
+	return !bpf_global_ma_set || !bpf_global_percpu_ma_set;
 }
 late_initcall(bpf_global_ma_init);
 #endif
diff --git a/kernel/bpf/memalloc.c b/kernel/bpf/memalloc.c
index 9c49ae53deaf..cb60445de98a 100644
--- a/kernel/bpf/memalloc.c
+++ b/kernel/bpf/memalloc.c
@@ -499,15 +499,16 @@ int bpf_mem_alloc_init(struct bpf_mem_alloc *ma, int size, bool percpu)
 	struct obj_cgroup *objcg = NULL;
 	int cpu, i, unit_size, percpu_size = 0;
 
+	/* room for llist_node and per-cpu pointer */
+	if (percpu)
+		percpu_size = LLIST_NODE_SZ + sizeof(void *);
+
 	if (size) {
 		pc = __alloc_percpu_gfp(sizeof(*pc), 8, GFP_KERNEL);
 		if (!pc)
 			return -ENOMEM;
 
-		if (percpu)
-			/* room for llist_node and per-cpu pointer */
-			percpu_size = LLIST_NODE_SZ + sizeof(void *);
-		else
+		if (!percpu)
 			size += LLIST_NODE_SZ; /* room for llist_node */
 		unit_size = size;
 
@@ -527,10 +528,6 @@ int bpf_mem_alloc_init(struct bpf_mem_alloc *ma, int size, bool percpu)
 		return 0;
 	}
 
-	/* size == 0 && percpu is an invalid combination */
-	if (WARN_ON_ONCE(percpu))
-		return -EINVAL;
-
 	pcc = __alloc_percpu_gfp(sizeof(*cc), 8, GFP_KERNEL);
 	if (!pcc)
 		return -ENOMEM;
@@ -543,6 +540,7 @@ int bpf_mem_alloc_init(struct bpf_mem_alloc *ma, int size, bool percpu)
 			c = &cc->cache[i];
 			c->unit_size = sizes[i];
 			c->objcg = objcg;
+			c->percpu_size = percpu_size;
 			c->tgt = c;
 			prefill_mem_cache(c, cpu);
 		}
-- 
cgit v1.2.3


From 55db92f42fe4a4ef7b4c2b4960c6212c8512dd53 Mon Sep 17 00:00:00 2001
From: Yonghong Song <yonghong.song@linux.dev>
Date: Sun, 27 Aug 2023 08:27:39 -0700
Subject: bpf: Add BPF_KPTR_PERCPU as a field type

BPF_KPTR_PERCPU represents a percpu field type like below

  struct val_t {
    ... fields ...
  };
  struct t {
    ...
    struct val_t __percpu_kptr *percpu_data_ptr;
    ...
  };

where
  #define __percpu_kptr __attribute__((btf_type_tag("percpu_kptr")))

While BPF_KPTR_REF points to a trusted kernel object or a trusted
local object, BPF_KPTR_PERCPU points to a trusted local
percpu object.

This patch added basic support for BPF_KPTR_PERCPU
related to percpu_kptr field parsing, recording and free operations.
BPF_KPTR_PERCPU also supports the same map types
as BPF_KPTR_REF does.

Note that unlike a local kptr, it is possible that
a BPF_KTPR_PERCPU struct may not contain any
special fields like other kptr, bpf_spin_lock, bpf_list_head, etc.

Signed-off-by: Yonghong Song <yonghong.song@linux.dev>
Link: https://lore.kernel.org/r/20230827152739.1996391-1-yonghong.song@linux.dev
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/bpf.h  | 18 ++++++++++++------
 kernel/bpf/btf.c     |  5 +++++
 kernel/bpf/syscall.c |  4 ++++
 3 files changed, 21 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 440dd1f59a1c..87eeb3a46a1d 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -180,14 +180,15 @@ enum btf_field_type {
 	BPF_TIMER      = (1 << 1),
 	BPF_KPTR_UNREF = (1 << 2),
 	BPF_KPTR_REF   = (1 << 3),
-	BPF_KPTR       = BPF_KPTR_UNREF | BPF_KPTR_REF,
-	BPF_LIST_HEAD  = (1 << 4),
-	BPF_LIST_NODE  = (1 << 5),
-	BPF_RB_ROOT    = (1 << 6),
-	BPF_RB_NODE    = (1 << 7),
+	BPF_KPTR_PERCPU = (1 << 4),
+	BPF_KPTR       = BPF_KPTR_UNREF | BPF_KPTR_REF | BPF_KPTR_PERCPU,
+	BPF_LIST_HEAD  = (1 << 5),
+	BPF_LIST_NODE  = (1 << 6),
+	BPF_RB_ROOT    = (1 << 7),
+	BPF_RB_NODE    = (1 << 8),
 	BPF_GRAPH_NODE_OR_ROOT = BPF_LIST_NODE | BPF_LIST_HEAD |
 				 BPF_RB_NODE | BPF_RB_ROOT,
-	BPF_REFCOUNT   = (1 << 8),
+	BPF_REFCOUNT   = (1 << 9),
 };
 
 typedef void (*btf_dtor_kfunc_t)(void *);
@@ -300,6 +301,8 @@ static inline const char *btf_field_type_name(enum btf_field_type type)
 	case BPF_KPTR_UNREF:
 	case BPF_KPTR_REF:
 		return "kptr";
+	case BPF_KPTR_PERCPU:
+		return "percpu_kptr";
 	case BPF_LIST_HEAD:
 		return "bpf_list_head";
 	case BPF_LIST_NODE:
@@ -325,6 +328,7 @@ static inline u32 btf_field_type_size(enum btf_field_type type)
 		return sizeof(struct bpf_timer);
 	case BPF_KPTR_UNREF:
 	case BPF_KPTR_REF:
+	case BPF_KPTR_PERCPU:
 		return sizeof(u64);
 	case BPF_LIST_HEAD:
 		return sizeof(struct bpf_list_head);
@@ -351,6 +355,7 @@ static inline u32 btf_field_type_align(enum btf_field_type type)
 		return __alignof__(struct bpf_timer);
 	case BPF_KPTR_UNREF:
 	case BPF_KPTR_REF:
+	case BPF_KPTR_PERCPU:
 		return __alignof__(u64);
 	case BPF_LIST_HEAD:
 		return __alignof__(struct bpf_list_head);
@@ -389,6 +394,7 @@ static inline void bpf_obj_init_field(const struct btf_field *field, void *addr)
 	case BPF_TIMER:
 	case BPF_KPTR_UNREF:
 	case BPF_KPTR_REF:
+	case BPF_KPTR_PERCPU:
 		break;
 	default:
 		WARN_ON_ONCE(1);
diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
index 1095bbe29859..187b57276fec 100644
--- a/kernel/bpf/btf.c
+++ b/kernel/bpf/btf.c
@@ -3293,6 +3293,8 @@ static int btf_find_kptr(const struct btf *btf, const struct btf_type *t,
 		type = BPF_KPTR_UNREF;
 	else if (!strcmp("kptr", __btf_name_by_offset(btf, t->name_off)))
 		type = BPF_KPTR_REF;
+	else if (!strcmp("percpu_kptr", __btf_name_by_offset(btf, t->name_off)))
+		type = BPF_KPTR_PERCPU;
 	else
 		return -EINVAL;
 
@@ -3457,6 +3459,7 @@ static int btf_find_struct_field(const struct btf *btf,
 			break;
 		case BPF_KPTR_UNREF:
 		case BPF_KPTR_REF:
+		case BPF_KPTR_PERCPU:
 			ret = btf_find_kptr(btf, member_type, off, sz,
 					    idx < info_cnt ? &info[idx] : &tmp);
 			if (ret < 0)
@@ -3523,6 +3526,7 @@ static int btf_find_datasec_var(const struct btf *btf, const struct btf_type *t,
 			break;
 		case BPF_KPTR_UNREF:
 		case BPF_KPTR_REF:
+		case BPF_KPTR_PERCPU:
 			ret = btf_find_kptr(btf, var_type, off, sz,
 					    idx < info_cnt ? &info[idx] : &tmp);
 			if (ret < 0)
@@ -3783,6 +3787,7 @@ struct btf_record *btf_parse_fields(const struct btf *btf, const struct btf_type
 			break;
 		case BPF_KPTR_UNREF:
 		case BPF_KPTR_REF:
+		case BPF_KPTR_PERCPU:
 			ret = btf_parse_kptr(btf, &rec->fields[i], &info_arr[i]);
 			if (ret < 0)
 				goto end;
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index eb01c31ed591..6a692f3bea15 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -514,6 +514,7 @@ void btf_record_free(struct btf_record *rec)
 		switch (rec->fields[i].type) {
 		case BPF_KPTR_UNREF:
 		case BPF_KPTR_REF:
+		case BPF_KPTR_PERCPU:
 			if (rec->fields[i].kptr.module)
 				module_put(rec->fields[i].kptr.module);
 			btf_put(rec->fields[i].kptr.btf);
@@ -560,6 +561,7 @@ struct btf_record *btf_record_dup(const struct btf_record *rec)
 		switch (fields[i].type) {
 		case BPF_KPTR_UNREF:
 		case BPF_KPTR_REF:
+		case BPF_KPTR_PERCPU:
 			btf_get(fields[i].kptr.btf);
 			if (fields[i].kptr.module && !try_module_get(fields[i].kptr.module)) {
 				ret = -ENXIO;
@@ -650,6 +652,7 @@ void bpf_obj_free_fields(const struct btf_record *rec, void *obj)
 			WRITE_ONCE(*(u64 *)field_ptr, 0);
 			break;
 		case BPF_KPTR_REF:
+		case BPF_KPTR_PERCPU:
 			xchgd_field = (void *)xchg((unsigned long *)field_ptr, 0);
 			if (!xchgd_field)
 				break;
@@ -1045,6 +1048,7 @@ static int map_check_btf(struct bpf_map *map, const struct btf *btf,
 				break;
 			case BPF_KPTR_UNREF:
 			case BPF_KPTR_REF:
+			case BPF_KPTR_PERCPU:
 			case BPF_REFCOUNT:
 				if (map->map_type != BPF_MAP_TYPE_HASH &&
 				    map->map_type != BPF_MAP_TYPE_PERCPU_HASH &&
-- 
cgit v1.2.3


From 01cc55af93884f1ff5a883426e1924378dfcc62a Mon Sep 17 00:00:00 2001
From: Yonghong Song <yonghong.song@linux.dev>
Date: Sun, 27 Aug 2023 08:27:49 -0700
Subject: bpf: Add bpf_this_cpu_ptr/bpf_per_cpu_ptr support for allocated
 percpu obj

The bpf helpers bpf_this_cpu_ptr() and bpf_per_cpu_ptr() are re-purposed
for allocated percpu objects. For an allocated percpu obj,
the reg type is 'PTR_TO_BTF_ID | MEM_PERCPU | MEM_RCU'.

The return type for these two re-purposed helpera is
'PTR_TO_MEM | MEM_RCU | MEM_ALLOC'.
The MEM_ALLOC allows that the per-cpu data can be read and written.

Since the memory allocator bpf_mem_alloc() returns
a ptr to a percpu ptr for percpu data, the first argument
of bpf_this_cpu_ptr() and bpf_per_cpu_ptr() is patched
with a dereference before passing to the helper func.

Signed-off-by: Yonghong Song <yonghong.song@linux.dev>
Link: https://lore.kernel.org/r/20230827152749.1997202-1-yonghong.song@linux.dev
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/bpf_verifier.h |  1 +
 kernel/bpf/verifier.c        | 59 ++++++++++++++++++++++++++++++++++++++------
 2 files changed, 52 insertions(+), 8 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h
index b6e58dab8e27..a3236651ec64 100644
--- a/include/linux/bpf_verifier.h
+++ b/include/linux/bpf_verifier.h
@@ -480,6 +480,7 @@ struct bpf_insn_aux_data {
 	bool zext_dst; /* this insn zero extends dst reg */
 	bool storage_get_func_atomic; /* bpf_*_storage_get() with atomic memory alloc */
 	bool is_iter_next; /* bpf_iter_<type>_next() kfunc call */
+	bool call_with_percpu_alloc_ptr; /* {this,per}_cpu_ptr() with prog percpu alloc */
 	u8 alu_state; /* used in combination with alu_limit */
 
 	/* below fields are initialized once */
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 6c886ead18f6..6b7e7ca611f3 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -6221,7 +6221,7 @@ static int check_ptr_to_btf_access(struct bpf_verifier_env *env,
 		}
 
 		if (type_is_alloc(reg->type) && !type_is_non_owning_ref(reg->type) &&
-		    !reg->ref_obj_id) {
+		    !(reg->type & MEM_RCU) && !reg->ref_obj_id) {
 			verbose(env, "verifier internal error: ref_obj_id for allocated object must be non-zero\n");
 			return -EFAULT;
 		}
@@ -7765,6 +7765,7 @@ static const struct bpf_reg_types btf_ptr_types = {
 static const struct bpf_reg_types percpu_btf_ptr_types = {
 	.types = {
 		PTR_TO_BTF_ID | MEM_PERCPU,
+		PTR_TO_BTF_ID | MEM_PERCPU | MEM_RCU,
 		PTR_TO_BTF_ID | MEM_PERCPU | PTR_TRUSTED,
 	}
 };
@@ -7941,6 +7942,7 @@ found:
 		}
 		break;
 	case PTR_TO_BTF_ID | MEM_PERCPU:
+	case PTR_TO_BTF_ID | MEM_PERCPU | MEM_RCU:
 	case PTR_TO_BTF_ID | MEM_PERCPU | PTR_TRUSTED:
 		/* Handled by helper specific checks */
 		break;
@@ -9547,6 +9549,7 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn
 			     int *insn_idx_p)
 {
 	enum bpf_prog_type prog_type = resolve_prog_type(env->prog);
+	bool returns_cpu_specific_alloc_ptr = false;
 	const struct bpf_func_proto *fn = NULL;
 	enum bpf_return_type ret_type;
 	enum bpf_type_flag ret_flag;
@@ -9785,6 +9788,23 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn
 
 		break;
 	}
+	case BPF_FUNC_per_cpu_ptr:
+	case BPF_FUNC_this_cpu_ptr:
+	{
+		struct bpf_reg_state *reg = &regs[BPF_REG_1];
+		const struct btf_type *type;
+
+		if (reg->type & MEM_RCU) {
+			type = btf_type_by_id(reg->btf, reg->btf_id);
+			if (!type || !btf_type_is_struct(type)) {
+				verbose(env, "Helper has invalid btf/btf_id in R1\n");
+				return -EFAULT;
+			}
+			returns_cpu_specific_alloc_ptr = true;
+			env->insn_aux_data[insn_idx].call_with_percpu_alloc_ptr = true;
+		}
+		break;
+	}
 	case BPF_FUNC_user_ringbuf_drain:
 		err = __check_func_call(env, insn, insn_idx_p, meta.subprogno,
 					set_user_ringbuf_callback_state);
@@ -9874,14 +9894,18 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn
 			regs[BPF_REG_0].type = PTR_TO_MEM | ret_flag;
 			regs[BPF_REG_0].mem_size = tsize;
 		} else {
-			/* MEM_RDONLY may be carried from ret_flag, but it
-			 * doesn't apply on PTR_TO_BTF_ID. Fold it, otherwise
-			 * it will confuse the check of PTR_TO_BTF_ID in
-			 * check_mem_access().
-			 */
-			ret_flag &= ~MEM_RDONLY;
+			if (returns_cpu_specific_alloc_ptr) {
+				regs[BPF_REG_0].type = PTR_TO_BTF_ID | MEM_ALLOC | MEM_RCU;
+			} else {
+				/* MEM_RDONLY may be carried from ret_flag, but it
+				 * doesn't apply on PTR_TO_BTF_ID. Fold it, otherwise
+				 * it will confuse the check of PTR_TO_BTF_ID in
+				 * check_mem_access().
+				 */
+				ret_flag &= ~MEM_RDONLY;
+				regs[BPF_REG_0].type = PTR_TO_BTF_ID | ret_flag;
+			}
 
-			regs[BPF_REG_0].type = PTR_TO_BTF_ID | ret_flag;
 			regs[BPF_REG_0].btf = meta.ret_btf;
 			regs[BPF_REG_0].btf_id = meta.ret_btf_id;
 		}
@@ -18676,6 +18700,25 @@ static int do_misc_fixups(struct bpf_verifier_env *env)
 			goto patch_call_imm;
 		}
 
+		/* bpf_per_cpu_ptr() and bpf_this_cpu_ptr() */
+		if (env->insn_aux_data[i + delta].call_with_percpu_alloc_ptr) {
+			/* patch with 'r1 = *(u64 *)(r1 + 0)' since for percpu data,
+			 * bpf_mem_alloc() returns a ptr to the percpu data ptr.
+			 */
+			insn_buf[0] = BPF_LDX_MEM(BPF_DW, BPF_REG_1, BPF_REG_1, 0);
+			insn_buf[1] = *insn;
+			cnt = 2;
+
+			new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt);
+			if (!new_prog)
+				return -ENOMEM;
+
+			delta += cnt - 1;
+			env->prog = prog = new_prog;
+			insn = new_prog->insnsi + i + delta;
+			goto patch_call_imm;
+		}
+
 		/* BPF_EMIT_CALL() assumptions in some of the map_gen_lookup
 		 * and other inlining handlers are currently limited to 64 bit
 		 * only.
-- 
cgit v1.2.3


From 6fdac58c560e4d164eb8161987bee045147cabe4 Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (Google)" <rostedt@goodmis.org>
Date: Thu, 7 Sep 2023 22:19:12 -0400
Subject: tracing: Remove unused trace_event_file dir field

Now that eventfs structure is used to create the events directory via the
eventfs dynamically allocate code, the "dir" field of the trace_event_file
structure is no longer used. Remove it.

Link: https://lkml.kernel.org/r/20230908022001.580400115@goodmis.org

Cc: Masami Hiramatsu <mhiramat@kernel.org>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Ajay Kaher <akaher@vmware.com>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
 include/linux/trace_events.h |  1 -
 kernel/trace/trace_events.c  | 13 -------------
 2 files changed, 14 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h
index eb5c3add939b..12f875e9e69a 100644
--- a/include/linux/trace_events.h
+++ b/include/linux/trace_events.h
@@ -650,7 +650,6 @@ struct trace_event_file {
 	struct trace_event_call		*event_call;
 	struct event_filter __rcu	*filter;
 	struct eventfs_file             *ef;
-	struct dentry			*dir;
 	struct trace_array		*tr;
 	struct trace_subsystem_dir	*system;
 	struct list_head		triggers;
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 2af92177b765..065c63991858 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -992,19 +992,6 @@ static void remove_subsystem(struct trace_subsystem_dir *dir)
 
 static void remove_event_file_dir(struct trace_event_file *file)
 {
-	struct dentry *dir = file->dir;
-	struct dentry *child;
-
-	if (dir) {
-		spin_lock(&dir->d_lock);	/* probably unneeded */
-		list_for_each_entry(child, &dir->d_subdirs, d_child) {
-			if (d_really_is_positive(child))	/* probably unneeded */
-				d_inode(child)->i_private = NULL;
-		}
-		spin_unlock(&dir->d_lock);
-
-		tracefs_remove(dir);
-	}
 	eventfs_remove(file->ef);
 	list_del(&file->list);
 	remove_subsystem(file->system);
-- 
cgit v1.2.3


From 5d153cd128251aaedc8e9657f0a949ec94952055 Mon Sep 17 00:00:00 2001
From: Steve French <stfrench@microsoft.com>
Date: Fri, 8 Sep 2023 16:34:59 -0500
Subject: spnego: add missing OID to oid registry

Add missing OID to the registry. Some servers and clients (including
Windows) now request "NEGOEX - SPNEGEO Extended Negotiation Security")

See https://datatracker.ietf.org/doc/html/draft-zhu-negoex-02

Reviewed-by: Namjae Jeon <linkinjeon@kernel.org>
Signed-off-by: Steve French <stfrench@microsoft.com>
---
 include/linux/oid_registry.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/oid_registry.h b/include/linux/oid_registry.h
index 0f4a8903922a..f86a08ba0207 100644
--- a/include/linux/oid_registry.h
+++ b/include/linux/oid_registry.h
@@ -67,6 +67,7 @@ enum OID {
 	OID_msOutlookExpress,		/* 1.3.6.1.4.1.311.16.4 */
 
 	OID_ntlmssp,			/* 1.3.6.1.4.1.311.2.2.10 */
+	OID_negoex,			/* 1.3.6.1.4.1.311.2.2.30 */
 
 	OID_spnego,			/* 1.3.6.1.5.5.2 */
 
-- 
cgit v1.2.3


From 24e0e61db3cb86a66824531989f1df80e0939f26 Mon Sep 17 00:00:00 2001
From: Niklas Cassel <niklas.cassel@wdc.com>
Date: Mon, 4 Sep 2023 22:42:56 +0200
Subject: ata: libata: disallow dev-initiated LPM transitions to unsupported
 states
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

In AHCI 1.3.1, the register description for CAP.SSC:
"When cleared to ‘0’, software must not allow the HBA to initiate
transitions to the Slumber state via agressive link power management nor
the PxCMD.ICC field in each port, and the PxSCTL.IPM field in each port
must be programmed to disallow device initiated Slumber requests."

In AHCI 1.3.1, the register description for CAP.PSC:
"When cleared to ‘0’, software must not allow the HBA to initiate
transitions to the Partial state via agressive link power management nor
the PxCMD.ICC field in each port, and the PxSCTL.IPM field in each port
must be programmed to disallow device initiated Partial requests."

Ensure that we always set the corresponding bits in PxSCTL.IPM, such that
a device is not allowed to initiate transitions to power states which are
unsupported by the HBA.

DevSleep is always initiated by the HBA, however, for completeness, set the
corresponding bit in PxSCTL.IPM such that agressive link power management
cannot transition to DevSleep if DevSleep is not supported.

sata_link_scr_lpm() is used by libahci, ata_piix and libata-pmp.
However, only libahci has the ability to read the CAP/CAP2 register to see
if these features are supported. Therefore, in order to not introduce any
regressions on ata_piix or libata-pmp, create flags that indicate that the
respective feature is NOT supported. This way, the behavior for ata_piix
and libata-pmp should remain unchanged.

This change is based on a patch originally submitted by Runa Guo-oc.

Signed-off-by: Niklas Cassel <niklas.cassel@wdc.com>
Fixes: 1152b2617a6e ("libata: implement sata_link_scr_lpm() and make ata_dev_set_feature() global")
Cc: stable@vger.kernel.org
Signed-off-by: Damien Le Moal <dlemoal@kernel.org>
---
 drivers/ata/ahci.c        |  9 +++++++++
 drivers/ata/libata-sata.c | 19 ++++++++++++++++---
 include/linux/libata.h    |  4 ++++
 3 files changed, 29 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/ata/ahci.c b/drivers/ata/ahci.c
index abb5911c9d09..08745e7db820 100644
--- a/drivers/ata/ahci.c
+++ b/drivers/ata/ahci.c
@@ -1883,6 +1883,15 @@ static int ahci_init_one(struct pci_dev *pdev, const struct pci_device_id *ent)
 	else
 		dev_info(&pdev->dev, "SSS flag set, parallel bus scan disabled\n");
 
+	if (!(hpriv->cap & HOST_CAP_PART))
+		host->flags |= ATA_HOST_NO_PART;
+
+	if (!(hpriv->cap & HOST_CAP_SSC))
+		host->flags |= ATA_HOST_NO_SSC;
+
+	if (!(hpriv->cap2 & HOST_CAP2_SDS))
+		host->flags |= ATA_HOST_NO_DEVSLP;
+
 	if (pi.flags & ATA_FLAG_EM)
 		ahci_reset_em(host);
 
diff --git a/drivers/ata/libata-sata.c b/drivers/ata/libata-sata.c
index 5d31c08be013..a701e1538482 100644
--- a/drivers/ata/libata-sata.c
+++ b/drivers/ata/libata-sata.c
@@ -396,10 +396,23 @@ int sata_link_scr_lpm(struct ata_link *link, enum ata_lpm_policy policy,
 	case ATA_LPM_MED_POWER_WITH_DIPM:
 	case ATA_LPM_MIN_POWER_WITH_PARTIAL:
 	case ATA_LPM_MIN_POWER:
-		if (ata_link_nr_enabled(link) > 0)
-			/* no restrictions on LPM transitions */
+		if (ata_link_nr_enabled(link) > 0) {
+			/* assume no restrictions on LPM transitions */
 			scontrol &= ~(0x7 << 8);
-		else {
+
+			/*
+			 * If the controller does not support partial, slumber,
+			 * or devsleep, then disallow these transitions.
+			 */
+			if (link->ap->host->flags & ATA_HOST_NO_PART)
+				scontrol |= (0x1 << 8);
+
+			if (link->ap->host->flags & ATA_HOST_NO_SSC)
+				scontrol |= (0x2 << 8);
+
+			if (link->ap->host->flags & ATA_HOST_NO_DEVSLP)
+				scontrol |= (0x4 << 8);
+		} else {
 			/* empty port, power off */
 			scontrol &= ~0xf;
 			scontrol |= (0x1 << 2);
diff --git a/include/linux/libata.h b/include/linux/libata.h
index 52d58b13e5ee..bf4913f4d7ac 100644
--- a/include/linux/libata.h
+++ b/include/linux/libata.h
@@ -222,6 +222,10 @@ enum {
 	ATA_HOST_PARALLEL_SCAN	= (1 << 2),	/* Ports on this host can be scanned in parallel */
 	ATA_HOST_IGNORE_ATA	= (1 << 3),	/* Ignore ATA devices on this host. */
 
+	ATA_HOST_NO_PART	= (1 << 4), /* Host does not support partial */
+	ATA_HOST_NO_SSC		= (1 << 5), /* Host does not support slumber */
+	ATA_HOST_NO_DEVSLP	= (1 << 6), /* Host does not support devslp */
+
 	/* bits 24:31 of host->flags are reserved for LLD specific flags */
 
 	/* various lengths of time */
-- 
cgit v1.2.3


From 2f4d3e293392571e02b106c8b431b638bd029276 Mon Sep 17 00:00:00 2001
From: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Date: Fri, 1 Sep 2023 16:40:32 +0300
Subject: gpio: pca953x: Drop unused fields in struct pca953x_platform_data

New code should solely use firmware nodes for the specifics and
not any callbacks.

Signed-off-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Signed-off-by: Bartosz Golaszewski <bartosz.golaszewski@linaro.org>
---
 drivers/gpio/gpio-pca953x.c           | 37 ++++++++---------------------------
 include/linux/platform_data/pca953x.h | 13 ------------
 2 files changed, 8 insertions(+), 42 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/gpio/gpio-pca953x.c b/drivers/gpio/gpio-pca953x.c
index bdd50a78e414..02695abd0eb1 100644
--- a/drivers/gpio/gpio-pca953x.c
+++ b/drivers/gpio/gpio-pca953x.c
@@ -211,7 +211,6 @@ struct pca953x_chip {
 
 	struct i2c_client *client;
 	struct gpio_chip gpio_chip;
-	const char *const *names;
 	unsigned long driver_data;
 	struct regulator *regulator;
 
@@ -712,7 +711,6 @@ static void pca953x_setup_gpio(struct pca953x_chip *chip, int gpios)
 	gc->label = dev_name(&chip->client->dev);
 	gc->parent = &chip->client->dev;
 	gc->owner = THIS_MODULE;
-	gc->names = chip->names;
 }
 
 #ifdef CONFIG_GPIO_PCA953X_IRQ
@@ -998,7 +996,7 @@ static int pca953x_irq_setup(struct pca953x_chip *chip,
 }
 #endif
 
-static int device_pca95xx_init(struct pca953x_chip *chip, u32 invert)
+static int device_pca95xx_init(struct pca953x_chip *chip)
 {
 	DECLARE_BITMAP(val, MAX_LINE);
 	u8 regaddr;
@@ -1016,24 +1014,21 @@ static int device_pca95xx_init(struct pca953x_chip *chip, u32 invert)
 	if (ret)
 		goto out;
 
-	/* set platform specific polarity inversion */
-	if (invert)
-		bitmap_fill(val, MAX_LINE);
-	else
-		bitmap_zero(val, MAX_LINE);
+	/* clear polarity inversion */
+	bitmap_zero(val, MAX_LINE);
 
 	ret = pca953x_write_regs(chip, chip->regs->invert, val);
 out:
 	return ret;
 }
 
-static int device_pca957x_init(struct pca953x_chip *chip, u32 invert)
+static int device_pca957x_init(struct pca953x_chip *chip)
 {
 	DECLARE_BITMAP(val, MAX_LINE);
 	unsigned int i;
 	int ret;
 
-	ret = device_pca95xx_init(chip, invert);
+	ret = device_pca95xx_init(chip);
 	if (ret)
 		goto out;
 
@@ -1054,9 +1049,8 @@ static int pca953x_probe(struct i2c_client *client)
 {
 	struct pca953x_platform_data *pdata;
 	struct pca953x_chip *chip;
-	int irq_base = 0;
+	int irq_base;
 	int ret;
-	u32 invert = 0;
 	struct regulator *reg;
 	const struct regmap_config *regmap_config;
 
@@ -1068,8 +1062,6 @@ static int pca953x_probe(struct i2c_client *client)
 	if (pdata) {
 		irq_base = pdata->irq_base;
 		chip->gpio_start = pdata->gpio_base;
-		invert = pdata->invert;
-		chip->names = pdata->names;
 	} else {
 		struct gpio_desc *reset_gpio;
 
@@ -1158,10 +1150,10 @@ static int pca953x_probe(struct i2c_client *client)
 	 */
 	if (PCA_CHIP_TYPE(chip->driver_data) == PCA957X_TYPE) {
 		chip->regs = &pca957x_regs;
-		ret = device_pca957x_init(chip, invert);
+		ret = device_pca957x_init(chip);
 	} else {
 		chip->regs = &pca953x_regs;
-		ret = device_pca95xx_init(chip, invert);
+		ret = device_pca95xx_init(chip);
 	}
 	if (ret)
 		goto err_exit;
@@ -1174,13 +1166,6 @@ static int pca953x_probe(struct i2c_client *client)
 	if (ret)
 		goto err_exit;
 
-	if (pdata && pdata->setup) {
-		ret = pdata->setup(client, chip->gpio_chip.base,
-				   chip->gpio_chip.ngpio, pdata->context);
-		if (ret < 0)
-			dev_warn(&client->dev, "setup failed, %d\n", ret);
-	}
-
 	return 0;
 
 err_exit:
@@ -1190,14 +1175,8 @@ err_exit:
 
 static void pca953x_remove(struct i2c_client *client)
 {
-	struct pca953x_platform_data *pdata = dev_get_platdata(&client->dev);
 	struct pca953x_chip *chip = i2c_get_clientdata(client);
 
-	if (pdata && pdata->teardown) {
-		pdata->teardown(client, chip->gpio_chip.base,
-				chip->gpio_chip.ngpio, pdata->context);
-	}
-
 	regulator_disable(chip->regulator);
 }
 
diff --git a/include/linux/platform_data/pca953x.h b/include/linux/platform_data/pca953x.h
index 96c1a14ab365..3c3787c4d96c 100644
--- a/include/linux/platform_data/pca953x.h
+++ b/include/linux/platform_data/pca953x.h
@@ -11,21 +11,8 @@ struct pca953x_platform_data {
 	/* number of the first GPIO */
 	unsigned	gpio_base;
 
-	/* initial polarity inversion setting */
-	u32		invert;
-
 	/* interrupt base */
 	int		irq_base;
-
-	void		*context;	/* param to setup/teardown */
-
-	int		(*setup)(struct i2c_client *client,
-				unsigned gpio, unsigned ngpio,
-				void *context);
-	void		(*teardown)(struct i2c_client *client,
-				unsigned gpio, unsigned ngpio,
-				void *context);
-	const char	*const *names;
 };
 
 #endif /* _LINUX_PCA953X_H */
-- 
cgit v1.2.3


From cf8e8658100d4eae80ce9b21f7a81cb024dd5057 Mon Sep 17 00:00:00 2001
From: Ard Biesheuvel <ardb@kernel.org>
Date: Thu, 20 Oct 2022 15:54:33 +0200
Subject: arch: Remove Itanium (IA-64) architecture

The Itanium architecture is obsolete, and an informal survey [0] reveals
that any residual use of Itanium hardware in production is mostly HP-UX
or OpenVMS based. The use of Linux on Itanium appears to be limited to
enthusiasts that occasionally boot a fresh Linux kernel to see whether
things are still working as intended, and perhaps to churn out some
distro packages that are rarely used in practice.

None of the original companies behind Itanium still produce or support
any hardware or software for the architecture, and it is listed as
'Orphaned' in the MAINTAINERS file, as apparently, none of the engineers
that contributed on behalf of those companies (nor anyone else, for that
matter) have been willing to support or maintain the architecture
upstream or even be responsible for applying the odd fix. The Intel
firmware team removed all IA-64 support from the Tianocore/EDK2
reference implementation of EFI in 2018. (Itanium is the original
architecture for which EFI was developed, and the way Linux supports it
deviates significantly from other architectures.) Some distros, such as
Debian and Gentoo, still maintain [unofficial] ia64 ports, but many have
dropped support years ago.

While the argument is being made [1] that there is a 'for the common
good' angle to being able to build and run existing projects such as the
Grid Community Toolkit [2] on Itanium for interoperability testing, the
fact remains that none of those projects are known to be deployed on
Linux/ia64, and very few people actually have access to such a system in
the first place. Even if there were ways imaginable in which Linux/ia64
could be put to good use today, what matters is whether anyone is
actually doing that, and this does not appear to be the case.

There are no emulators widely available, and so boot testing Itanium is
generally infeasible for ordinary contributors. GCC still supports IA-64
but its compile farm [3] no longer has any IA-64 machines. GLIBC would
like to get rid of IA-64 [4] too because it would permit some overdue
code cleanups. In summary, the benefits to the ecosystem of having IA-64
be part of it are mostly theoretical, whereas the maintenance overhead
of keeping it supported is real.

So let's rip off the band aid, and remove the IA-64 arch code entirely.
This follows the timeline proposed by the Debian/ia64 maintainer [5],
which removes support in a controlled manner, leaving IA-64 in a known
good state in the most recent LTS release. Other projects will follow
once the kernel support is removed.

[0] https://lore.kernel.org/all/CAMj1kXFCMh_578jniKpUtx_j8ByHnt=s7S+yQ+vGbKt9ud7+kQ@mail.gmail.com/
[1] https://lore.kernel.org/all/0075883c-7c51-00f5-2c2d-5119c1820410@web.de/
[2] https://gridcf.org/gct-docs/latest/index.html
[3] https://cfarm.tetaneutral.net/machines/list/
[4] https://lore.kernel.org/all/87bkiilpc4.fsf@mid.deneb.enyo.de/
[5] https://lore.kernel.org/all/ff58a3e76e5102c94bb5946d99187b358def688a.camel@physik.fu-berlin.de/

Acked-by: Tony Luck <tony.luck@intel.com>
Signed-off-by: Ard Biesheuvel <ardb@kernel.org>
---
 Documentation/arch/ia64/aliasing.rst               |  246 ---
 Documentation/arch/ia64/efirtc.rst                 |  144 --
 Documentation/arch/ia64/err_inject.rst             | 1067 ---------
 Documentation/arch/ia64/features.rst               |    3 -
 Documentation/arch/ia64/fsys.rst                   |  303 ---
 Documentation/arch/ia64/ia64.rst                   |   49 -
 Documentation/arch/ia64/index.rst                  |   19 -
 Documentation/arch/ia64/irq-redir.rst              |   80 -
 Documentation/arch/ia64/mca.rst                    |  198 --
 Documentation/arch/ia64/serial.rst                 |  165 --
 Documentation/core-api/cpu_hotplug.rst             |    6 -
 MAINTAINERS                                        |   11 -
 arch/Kconfig                                       |    1 -
 arch/ia64/Kbuild                                   |    3 -
 arch/ia64/Kconfig                                  |  394 ----
 arch/ia64/Kconfig.debug                            |   55 -
 arch/ia64/Makefile                                 |   82 -
 arch/ia64/configs/bigsur_defconfig                 |  102 -
 arch/ia64/configs/generic_defconfig                |  206 --
 arch/ia64/configs/gensparse_defconfig              |  184 --
 arch/ia64/configs/tiger_defconfig                  |  169 --
 arch/ia64/configs/zx1_defconfig                    |  148 --
 arch/ia64/hp/common/Makefile                       |   10 -
 arch/ia64/hp/common/aml_nfw.c                      |  232 --
 arch/ia64/hp/common/sba_iommu.c                    | 2155 ------------------
 arch/ia64/include/asm/Kbuild                       |    6 -
 arch/ia64/include/asm/acenv.h                      |   49 -
 arch/ia64/include/asm/acpi-ext.h                   |   17 -
 arch/ia64/include/asm/acpi.h                       |  110 -
 arch/ia64/include/asm/asm-offsets.h                |    1 -
 arch/ia64/include/asm/asm-prototypes.h             |   30 -
 arch/ia64/include/asm/asmmacro.h                   |  136 --
 arch/ia64/include/asm/atomic.h                     |  216 --
 arch/ia64/include/asm/barrier.h                    |   79 -
 arch/ia64/include/asm/bitops.h                     |  453 ----
 arch/ia64/include/asm/bug.h                        |   19 -
 arch/ia64/include/asm/cache.h                      |   30 -
 arch/ia64/include/asm/cacheflush.h                 |   39 -
 arch/ia64/include/asm/checksum.h                   |   63 -
 arch/ia64/include/asm/clocksource.h                |   11 -
 arch/ia64/include/asm/cmpxchg.h                    |   33 -
 arch/ia64/include/asm/cpu.h                        |   23 -
 arch/ia64/include/asm/cputime.h                    |   21 -
 arch/ia64/include/asm/current.h                    |   18 -
 arch/ia64/include/asm/cyclone.h                    |   16 -
 arch/ia64/include/asm/delay.h                      |   89 -
 arch/ia64/include/asm/device.h                     |   14 -
 arch/ia64/include/asm/div64.h                      |    1 -
 arch/ia64/include/asm/dma-mapping.h                |   16 -
 arch/ia64/include/asm/dma.h                        |   17 -
 arch/ia64/include/asm/dmi.h                        |   15 -
 arch/ia64/include/asm/early_ioremap.h              |   11 -
 arch/ia64/include/asm/efi.h                        |   13 -
 arch/ia64/include/asm/elf.h                        |  233 --
 arch/ia64/include/asm/emergency-restart.h          |    6 -
 arch/ia64/include/asm/esi.h                        |   30 -
 arch/ia64/include/asm/exception.h                  |   23 -
 arch/ia64/include/asm/extable.h                    |   12 -
 arch/ia64/include/asm/fb.h                         |   43 -
 arch/ia64/include/asm/fpswa.h                      |   74 -
 arch/ia64/include/asm/ftrace.h                     |   28 -
 arch/ia64/include/asm/futex.h                      |  109 -
 arch/ia64/include/asm/gcc_intrin.h                 |   13 -
 arch/ia64/include/asm/hardirq.h                    |   27 -
 arch/ia64/include/asm/hugetlb.h                    |   34 -
 arch/ia64/include/asm/hw_irq.h                     |  167 --
 arch/ia64/include/asm/idle.h                       |    8 -
 arch/ia64/include/asm/intrinsics.h                 |   13 -
 arch/ia64/include/asm/io.h                         |  271 ---
 arch/ia64/include/asm/iommu.h                      |   22 -
 arch/ia64/include/asm/iosapic.h                    |  106 -
 arch/ia64/include/asm/irq.h                        |   37 -
 arch/ia64/include/asm/irq_regs.h                   |    1 -
 arch/ia64/include/asm/irq_remapping.h              |    5 -
 arch/ia64/include/asm/irqflags.h                   |   95 -
 arch/ia64/include/asm/kdebug.h                     |   45 -
 arch/ia64/include/asm/kexec.h                      |   46 -
 arch/ia64/include/asm/kprobes.h                    |  116 -
 arch/ia64/include/asm/kregs.h                      |  166 --
 arch/ia64/include/asm/libata-portmap.h             |    9 -
 arch/ia64/include/asm/linkage.h                    |   19 -
 arch/ia64/include/asm/local.h                      |    1 -
 arch/ia64/include/asm/mca.h                        |  185 --
 arch/ia64/include/asm/mca_asm.h                    |  245 ---
 arch/ia64/include/asm/meminit.h                    |   59 -
 arch/ia64/include/asm/mman.h                       |   18 -
 arch/ia64/include/asm/mmiowb.h                     |   17 -
 arch/ia64/include/asm/mmu.h                        |   14 -
 arch/ia64/include/asm/mmu_context.h                |  194 --
 arch/ia64/include/asm/mmzone.h                     |   35 -
 arch/ia64/include/asm/module.h                     |   35 -
 arch/ia64/include/asm/module.lds.h                 |   14 -
 arch/ia64/include/asm/msidef.h                     |   43 -
 arch/ia64/include/asm/native/inst.h                |  119 -
 arch/ia64/include/asm/native/irq.h                 |   20 -
 arch/ia64/include/asm/native/patchlist.h           |   24 -
 arch/ia64/include/asm/nodedata.h                   |   63 -
 arch/ia64/include/asm/numa.h                       |   83 -
 arch/ia64/include/asm/page.h                       |  208 --
 arch/ia64/include/asm/pal.h                        | 1827 ---------------
 arch/ia64/include/asm/param.h                      |   18 -
 arch/ia64/include/asm/parport.h                    |   20 -
 arch/ia64/include/asm/patch.h                      |   28 -
 arch/ia64/include/asm/pci.h                        |   66 -
 arch/ia64/include/asm/percpu.h                     |   53 -
 arch/ia64/include/asm/pgalloc.h                    |   64 -
 arch/ia64/include/asm/pgtable.h                    |  545 -----
 arch/ia64/include/asm/processor.h                  |  660 ------
 arch/ia64/include/asm/ptrace.h                     |  146 --
 arch/ia64/include/asm/sal.h                        |  919 --------
 arch/ia64/include/asm/sections.h                   |   33 -
 arch/ia64/include/asm/serial.h                     |   17 -
 arch/ia64/include/asm/shmparam.h                   |   13 -
 arch/ia64/include/asm/signal.h                     |   33 -
 arch/ia64/include/asm/smp.h                        |  103 -
 arch/ia64/include/asm/sn/intr.h                    |   15 -
 arch/ia64/include/asm/sn/sn_sal.h                  |  124 --
 arch/ia64/include/asm/sparsemem.h                  |   28 -
 arch/ia64/include/asm/spinlock.h                   |  265 ---
 arch/ia64/include/asm/spinlock_types.h             |   22 -
 arch/ia64/include/asm/string.h                     |   22 -
 arch/ia64/include/asm/switch_to.h                  |   71 -
 arch/ia64/include/asm/syscall.h                    |   65 -
 arch/ia64/include/asm/thread_info.h                |  131 --
 arch/ia64/include/asm/timex.h                      |   47 -
 arch/ia64/include/asm/tlb.h                        |   50 -
 arch/ia64/include/asm/tlbflush.h                   |  128 --
 arch/ia64/include/asm/topology.h                   |   56 -
 arch/ia64/include/asm/types.h                      |   32 -
 arch/ia64/include/asm/uaccess.h                    |  265 ---
 arch/ia64/include/asm/uncached.h                   |    9 -
 arch/ia64/include/asm/unistd.h                     |   38 -
 arch/ia64/include/asm/unwind.h                     |  234 --
 arch/ia64/include/asm/user.h                       |   53 -
 arch/ia64/include/asm/ustack.h                     |   12 -
 arch/ia64/include/asm/uv/uv.h                      |   30 -
 arch/ia64/include/asm/uv/uv_hub.h                  |  315 ---
 arch/ia64/include/asm/uv/uv_mmrs.h                 |  825 -------
 arch/ia64/include/asm/vermagic.h                   |   15 -
 arch/ia64/include/asm/vga.h                        |   26 -
 arch/ia64/include/asm/vmalloc.h                    |    4 -
 arch/ia64/include/asm/xor.h                        |   30 -
 arch/ia64/include/asm/xtp.h                        |   46 -
 arch/ia64/include/uapi/asm/Kbuild                  |    2 -
 arch/ia64/include/uapi/asm/auxvec.h                |   14 -
 arch/ia64/include/uapi/asm/bitsperlong.h           |    9 -
 arch/ia64/include/uapi/asm/break.h                 |   23 -
 arch/ia64/include/uapi/asm/byteorder.h             |    7 -
 arch/ia64/include/uapi/asm/cmpxchg.h               |  138 --
 arch/ia64/include/uapi/asm/fcntl.h                 |   15 -
 arch/ia64/include/uapi/asm/fpu.h                   |   67 -
 arch/ia64/include/uapi/asm/gcc_intrin.h            |  619 ------
 arch/ia64/include/uapi/asm/ia64regs.h              |  101 -
 arch/ia64/include/uapi/asm/intrinsics.h            |   82 -
 arch/ia64/include/uapi/asm/mman.h                  |   17 -
 arch/ia64/include/uapi/asm/param.h                 |   30 -
 arch/ia64/include/uapi/asm/posix_types.h           |    9 -
 arch/ia64/include/uapi/asm/ptrace.h                |  248 ---
 arch/ia64/include/uapi/asm/ptrace_offsets.h        |  269 ---
 arch/ia64/include/uapi/asm/resource.h              |    8 -
 arch/ia64/include/uapi/asm/rse.h                   |   67 -
 arch/ia64/include/uapi/asm/setup.h                 |   25 -
 arch/ia64/include/uapi/asm/sigcontext.h            |   71 -
 arch/ia64/include/uapi/asm/siginfo.h               |   28 -
 arch/ia64/include/uapi/asm/signal.h                |   98 -
 arch/ia64/include/uapi/asm/stat.h                  |   52 -
 arch/ia64/include/uapi/asm/statfs.h                |   21 -
 arch/ia64/include/uapi/asm/swab.h                  |   35 -
 arch/ia64/include/uapi/asm/types.h                 |   32 -
 arch/ia64/include/uapi/asm/ucontext.h              |   13 -
 arch/ia64/include/uapi/asm/unistd.h                |   22 -
 arch/ia64/include/uapi/asm/ustack.h                |   13 -
 arch/ia64/install.sh                               |   30 -
 arch/ia64/kernel/.gitignore                        |    3 -
 arch/ia64/kernel/Makefile                          |   46 -
 arch/ia64/kernel/Makefile.gate                     |   29 -
 arch/ia64/kernel/acpi-ext.c                        |  101 -
 arch/ia64/kernel/acpi.c                            |  913 --------
 arch/ia64/kernel/asm-offsets.c                     |  289 ---
 arch/ia64/kernel/audit.c                           |   63 -
 arch/ia64/kernel/brl_emu.c                         |  217 --
 arch/ia64/kernel/crash.c                           |  257 ---
 arch/ia64/kernel/crash_dump.c                      |   27 -
 arch/ia64/kernel/cyclone.c                         |  125 --
 arch/ia64/kernel/dma-mapping.c                     |    9 -
 arch/ia64/kernel/efi.c                             | 1360 ------------
 arch/ia64/kernel/efi_stub.S                        |   87 -
 arch/ia64/kernel/elfcore.c                         |   77 -
 arch/ia64/kernel/entry.S                           | 1427 ------------
 arch/ia64/kernel/entry.h                           |   83 -
 arch/ia64/kernel/err_inject.c                      |  273 ---
 arch/ia64/kernel/esi.c                             |  193 --
 arch/ia64/kernel/esi_stub.S                        |   99 -
 arch/ia64/kernel/fsys.S                            |  837 -------
 arch/ia64/kernel/fsyscall_gtod_data.h              |   30 -
 arch/ia64/kernel/ftrace.c                          |  196 --
 arch/ia64/kernel/gate-data.S                       |    3 -
 arch/ia64/kernel/gate.S                            |  380 ----
 arch/ia64/kernel/gate.lds.S                        |  108 -
 arch/ia64/kernel/head.S                            | 1167 ----------
 arch/ia64/kernel/iosapic.c                         | 1137 ----------
 arch/ia64/kernel/irq.c                             |  181 --
 arch/ia64/kernel/irq.h                             |    3 -
 arch/ia64/kernel/irq_ia64.c                        |  645 ------
 arch/ia64/kernel/irq_lsapic.c                      |   45 -
 arch/ia64/kernel/ivt.S                             | 1688 --------------
 arch/ia64/kernel/kprobes.c                         |  911 --------
 arch/ia64/kernel/machine_kexec.c                   |  163 --
 arch/ia64/kernel/mca.c                             | 2111 ------------------
 arch/ia64/kernel/mca_asm.S                         | 1123 ----------
 arch/ia64/kernel/mca_drv.c                         |  796 -------
 arch/ia64/kernel/mca_drv.h                         |  123 --
 arch/ia64/kernel/mca_drv_asm.S                     |   56 -
 arch/ia64/kernel/minstate.h                        |  251 ---
 arch/ia64/kernel/module.c                          |  959 --------
 arch/ia64/kernel/msi_ia64.c                        |  198 --
 arch/ia64/kernel/numa.c                            |   73 -
 arch/ia64/kernel/pal.S                             |  306 ---
 arch/ia64/kernel/palinfo.c                         |  942 --------
 arch/ia64/kernel/patch.c                           |  237 --
 arch/ia64/kernel/pci-dma.c                         |   33 -
 arch/ia64/kernel/perfmon_itanium.h                 |  116 -
 arch/ia64/kernel/process.c                         |  611 ------
 arch/ia64/kernel/ptrace.c                          | 2012 -----------------
 arch/ia64/kernel/relocate_kernel.S                 |  321 ---
 arch/ia64/kernel/sal.c                             |  400 ----
 arch/ia64/kernel/salinfo.c                         |  646 ------
 arch/ia64/kernel/setup.c                           | 1081 ---------
 arch/ia64/kernel/sigframe.h                        |   26 -
 arch/ia64/kernel/signal.c                          |  412 ----
 arch/ia64/kernel/smp.c                             |  335 ---
 arch/ia64/kernel/smpboot.c                         |  839 -------
 arch/ia64/kernel/stacktrace.c                      |   40 -
 arch/ia64/kernel/sys_ia64.c                        |  197 --
 arch/ia64/kernel/syscalls/Makefile                 |   32 -
 arch/ia64/kernel/syscalls/syscall.tbl              |  375 ----
 arch/ia64/kernel/time.c                            |  463 ----
 arch/ia64/kernel/topology.c                        |  410 ----
 arch/ia64/kernel/traps.c                           |  612 ------
 arch/ia64/kernel/unaligned.c                       | 1560 -------------
 arch/ia64/kernel/uncached.c                        |  273 ---
 arch/ia64/kernel/unwind.c                          | 2320 --------------------
 arch/ia64/kernel/unwind_decoder.c                  |  460 ----
 arch/ia64/kernel/unwind_i.h                        |  165 --
 arch/ia64/kernel/vmlinux.lds.S                     |  224 --
 arch/ia64/lib/Makefile                             |   48 -
 arch/ia64/lib/checksum.c                           |  102 -
 arch/ia64/lib/clear_page.S                         |   79 -
 arch/ia64/lib/clear_user.S                         |  212 --
 arch/ia64/lib/copy_page.S                          |  101 -
 arch/ia64/lib/copy_page_mck.S                      |  188 --
 arch/ia64/lib/copy_user.S                          |  613 ------
 arch/ia64/lib/csum_partial_copy.c                  |   98 -
 arch/ia64/lib/do_csum.S                            |  324 ---
 arch/ia64/lib/flush.S                              |  119 -
 arch/ia64/lib/idiv32.S                             |   86 -
 arch/ia64/lib/idiv64.S                             |   83 -
 arch/ia64/lib/io.c                                 |   51 -
 arch/ia64/lib/ip_fast_csum.S                       |  148 --
 arch/ia64/lib/memcpy.S                             |  304 ---
 arch/ia64/lib/memcpy_mck.S                         |  659 ------
 arch/ia64/lib/memset.S                             |  365 ---
 arch/ia64/lib/strlen.S                             |  195 --
 arch/ia64/lib/strncpy_from_user.S                  |   47 -
 arch/ia64/lib/strnlen_user.S                       |   48 -
 arch/ia64/lib/xor.S                                |  181 --
 arch/ia64/mm/Makefile                              |   11 -
 arch/ia64/mm/contig.c                              |  208 --
 arch/ia64/mm/discontig.c                           |  635 ------
 arch/ia64/mm/extable.c                             |   24 -
 arch/ia64/mm/fault.c                               |  251 ---
 arch/ia64/mm/hugetlbpage.c                         |  186 --
 arch/ia64/mm/init.c                                |  532 -----
 arch/ia64/mm/ioremap.c                             |   94 -
 arch/ia64/mm/numa.c                                |   80 -
 arch/ia64/mm/tlb.c                                 |  591 -----
 arch/ia64/pci/Makefile                             |    5 -
 arch/ia64/pci/fixup.c                              |   80 -
 arch/ia64/pci/pci.c                                |  576 -----
 arch/ia64/scripts/check-gas                        |   16 -
 arch/ia64/scripts/check-gas-asm.S                  |    2 -
 arch/ia64/scripts/check-model.c                    |    1 -
 arch/ia64/scripts/check-segrel.S                   |    5 -
 arch/ia64/scripts/check-segrel.lds                 |   13 -
 arch/ia64/scripts/check-serialize.S                |    2 -
 arch/ia64/scripts/check-text-align.S               |    7 -
 arch/ia64/scripts/toolchain-flags                  |   54 -
 arch/ia64/scripts/unwcheck.py                      |   65 -
 arch/ia64/uv/Makefile                              |   12 -
 arch/ia64/uv/kernel/Makefile                       |   12 -
 arch/ia64/uv/kernel/setup.c                        |  120 -
 drivers/acpi/Kconfig                               |    6 +-
 drivers/acpi/numa/Kconfig                          |    4 +-
 drivers/acpi/osl.c                                 |    2 +-
 drivers/char/Kconfig                               |    4 +-
 drivers/char/Makefile                              |    1 -
 drivers/char/agp/Kconfig                           |   16 +-
 drivers/char/agp/Makefile                          |    2 -
 drivers/char/agp/hp-agp.c                          |  550 -----
 drivers/char/agp/i460-agp.c                        |  659 ------
 drivers/char/hpet.c                                |   30 -
 drivers/char/hw_random/Kconfig                     |    2 +-
 drivers/char/mem.c                                 |   12 -
 drivers/char/mspec.c                               |  295 ---
 drivers/cpufreq/Kconfig                            |   11 -
 drivers/cpufreq/Makefile                           |    1 -
 drivers/cpufreq/ia64-acpi-cpufreq.c                |  353 ---
 drivers/firmware/Kconfig                           |   24 -
 drivers/firmware/Makefile                          |    1 -
 drivers/firmware/efi/Kconfig                       |    6 +-
 drivers/firmware/efi/efi.c                         |   13 +-
 drivers/firmware/pcdp.c                            |  135 --
 drivers/firmware/pcdp.h                            |  108 -
 drivers/gpu/drm/drm_ioc32.c                        |    4 +-
 drivers/input/serio/i8042.h                        |    2 +-
 drivers/iommu/Kconfig                              |    4 +-
 drivers/iommu/intel/Kconfig                        |    2 +-
 drivers/media/cec/platform/Kconfig                 |    2 +-
 drivers/misc/Kconfig                               |    2 +-
 drivers/misc/sgi-gru/gru.h                         |    4 +-
 drivers/misc/sgi-gru/gru_instructions.h            |   12 +-
 drivers/misc/sgi-gru/grufile.c                     |   72 -
 drivers/misc/sgi-gru/gruhandles.c                  |    6 -
 drivers/misc/sgi-gru/grumain.c                     |    4 -
 drivers/misc/sgi-xp/xp.h                           |    2 +-
 drivers/misc/sgi-xp/xp_uv.c                        |   24 -
 drivers/misc/sgi-xp/xpc_main.c                     |   31 -
 drivers/misc/sgi-xp/xpc_uv.c                       |   85 -
 drivers/net/ethernet/broadcom/tg3.c                |    2 +-
 drivers/net/ethernet/brocade/bna/bnad.h            |    1 -
 .../net/ethernet/qlogic/netxen/netxen_nic_main.c   |    2 -
 drivers/pci/vgaarb.c                               |    2 +-
 drivers/tty/serial/8250/Kconfig                    |    2 +-
 drivers/tty/vt/keyboard.c                          |    2 +-
 drivers/video/fbdev/Kconfig                        |    2 +-
 drivers/watchdog/Kconfig                           |    2 +-
 fs/Kconfig                                         |    2 +-
 fs/afs/main.c                                      |    2 -
 fs/xfs/xfs_ioctl32.h                               |    2 +-
 include/linux/acpi.h                               |    9 +-
 include/linux/efi.h                                |    7 -
 include/linux/mm.h                                 |    2 -
 include/linux/moduleparam.h                        |    2 +-
 include/trace/events/mmflags.h                     |    2 +-
 init/Kconfig                                       |    2 +-
 kernel/cpu.c                                       |    3 -
 kernel/fork.c                                      |    2 +-
 kernel/sched/core.c                                |   29 +-
 kernel/sysctl.c                                    |    9 -
 lib/Kconfig.debug                                  |    2 +-
 lib/decompress_unxz.c                              |    3 -
 lib/xz/Kconfig                                     |    5 -
 mm/mmap.c                                          |    6 +-
 tools/arch/ia64/include/asm/barrier.h              |   59 -
 tools/arch/ia64/include/uapi/asm/bitsperlong.h     |    9 -
 tools/arch/ia64/include/uapi/asm/mman.h            |    7 -
 usr/include/Makefile                               |    6 -
 357 files changed, 45 insertions(+), 64955 deletions(-)
 delete mode 100644 Documentation/arch/ia64/aliasing.rst
 delete mode 100644 Documentation/arch/ia64/efirtc.rst
 delete mode 100644 Documentation/arch/ia64/err_inject.rst
 delete mode 100644 Documentation/arch/ia64/features.rst
 delete mode 100644 Documentation/arch/ia64/fsys.rst
 delete mode 100644 Documentation/arch/ia64/ia64.rst
 delete mode 100644 Documentation/arch/ia64/index.rst
 delete mode 100644 Documentation/arch/ia64/irq-redir.rst
 delete mode 100644 Documentation/arch/ia64/mca.rst
 delete mode 100644 Documentation/arch/ia64/serial.rst
 delete mode 100644 arch/ia64/Kbuild
 delete mode 100644 arch/ia64/Kconfig
 delete mode 100644 arch/ia64/Kconfig.debug
 delete mode 100644 arch/ia64/Makefile
 delete mode 100644 arch/ia64/configs/bigsur_defconfig
 delete mode 100644 arch/ia64/configs/generic_defconfig
 delete mode 100644 arch/ia64/configs/gensparse_defconfig
 delete mode 100644 arch/ia64/configs/tiger_defconfig
 delete mode 100644 arch/ia64/configs/zx1_defconfig
 delete mode 100644 arch/ia64/hp/common/Makefile
 delete mode 100644 arch/ia64/hp/common/aml_nfw.c
 delete mode 100644 arch/ia64/hp/common/sba_iommu.c
 delete mode 100644 arch/ia64/include/asm/Kbuild
 delete mode 100644 arch/ia64/include/asm/acenv.h
 delete mode 100644 arch/ia64/include/asm/acpi-ext.h
 delete mode 100644 arch/ia64/include/asm/acpi.h
 delete mode 100644 arch/ia64/include/asm/asm-offsets.h
 delete mode 100644 arch/ia64/include/asm/asm-prototypes.h
 delete mode 100644 arch/ia64/include/asm/asmmacro.h
 delete mode 100644 arch/ia64/include/asm/atomic.h
 delete mode 100644 arch/ia64/include/asm/barrier.h
 delete mode 100644 arch/ia64/include/asm/bitops.h
 delete mode 100644 arch/ia64/include/asm/bug.h
 delete mode 100644 arch/ia64/include/asm/cache.h
 delete mode 100644 arch/ia64/include/asm/cacheflush.h
 delete mode 100644 arch/ia64/include/asm/checksum.h
 delete mode 100644 arch/ia64/include/asm/clocksource.h
 delete mode 100644 arch/ia64/include/asm/cmpxchg.h
 delete mode 100644 arch/ia64/include/asm/cpu.h
 delete mode 100644 arch/ia64/include/asm/cputime.h
 delete mode 100644 arch/ia64/include/asm/current.h
 delete mode 100644 arch/ia64/include/asm/cyclone.h
 delete mode 100644 arch/ia64/include/asm/delay.h
 delete mode 100644 arch/ia64/include/asm/device.h
 delete mode 100644 arch/ia64/include/asm/div64.h
 delete mode 100644 arch/ia64/include/asm/dma-mapping.h
 delete mode 100644 arch/ia64/include/asm/dma.h
 delete mode 100644 arch/ia64/include/asm/dmi.h
 delete mode 100644 arch/ia64/include/asm/early_ioremap.h
 delete mode 100644 arch/ia64/include/asm/efi.h
 delete mode 100644 arch/ia64/include/asm/elf.h
 delete mode 100644 arch/ia64/include/asm/emergency-restart.h
 delete mode 100644 arch/ia64/include/asm/esi.h
 delete mode 100644 arch/ia64/include/asm/exception.h
 delete mode 100644 arch/ia64/include/asm/extable.h
 delete mode 100644 arch/ia64/include/asm/fb.h
 delete mode 100644 arch/ia64/include/asm/fpswa.h
 delete mode 100644 arch/ia64/include/asm/ftrace.h
 delete mode 100644 arch/ia64/include/asm/futex.h
 delete mode 100644 arch/ia64/include/asm/gcc_intrin.h
 delete mode 100644 arch/ia64/include/asm/hardirq.h
 delete mode 100644 arch/ia64/include/asm/hugetlb.h
 delete mode 100644 arch/ia64/include/asm/hw_irq.h
 delete mode 100644 arch/ia64/include/asm/idle.h
 delete mode 100644 arch/ia64/include/asm/intrinsics.h
 delete mode 100644 arch/ia64/include/asm/io.h
 delete mode 100644 arch/ia64/include/asm/iommu.h
 delete mode 100644 arch/ia64/include/asm/iosapic.h
 delete mode 100644 arch/ia64/include/asm/irq.h
 delete mode 100644 arch/ia64/include/asm/irq_regs.h
 delete mode 100644 arch/ia64/include/asm/irq_remapping.h
 delete mode 100644 arch/ia64/include/asm/irqflags.h
 delete mode 100644 arch/ia64/include/asm/kdebug.h
 delete mode 100644 arch/ia64/include/asm/kexec.h
 delete mode 100644 arch/ia64/include/asm/kprobes.h
 delete mode 100644 arch/ia64/include/asm/kregs.h
 delete mode 100644 arch/ia64/include/asm/libata-portmap.h
 delete mode 100644 arch/ia64/include/asm/linkage.h
 delete mode 100644 arch/ia64/include/asm/local.h
 delete mode 100644 arch/ia64/include/asm/mca.h
 delete mode 100644 arch/ia64/include/asm/mca_asm.h
 delete mode 100644 arch/ia64/include/asm/meminit.h
 delete mode 100644 arch/ia64/include/asm/mman.h
 delete mode 100644 arch/ia64/include/asm/mmiowb.h
 delete mode 100644 arch/ia64/include/asm/mmu.h
 delete mode 100644 arch/ia64/include/asm/mmu_context.h
 delete mode 100644 arch/ia64/include/asm/mmzone.h
 delete mode 100644 arch/ia64/include/asm/module.h
 delete mode 100644 arch/ia64/include/asm/module.lds.h
 delete mode 100644 arch/ia64/include/asm/msidef.h
 delete mode 100644 arch/ia64/include/asm/native/inst.h
 delete mode 100644 arch/ia64/include/asm/native/irq.h
 delete mode 100644 arch/ia64/include/asm/native/patchlist.h
 delete mode 100644 arch/ia64/include/asm/nodedata.h
 delete mode 100644 arch/ia64/include/asm/numa.h
 delete mode 100644 arch/ia64/include/asm/page.h
 delete mode 100644 arch/ia64/include/asm/pal.h
 delete mode 100644 arch/ia64/include/asm/param.h
 delete mode 100644 arch/ia64/include/asm/parport.h
 delete mode 100644 arch/ia64/include/asm/patch.h
 delete mode 100644 arch/ia64/include/asm/pci.h
 delete mode 100644 arch/ia64/include/asm/percpu.h
 delete mode 100644 arch/ia64/include/asm/pgalloc.h
 delete mode 100644 arch/ia64/include/asm/pgtable.h
 delete mode 100644 arch/ia64/include/asm/processor.h
 delete mode 100644 arch/ia64/include/asm/ptrace.h
 delete mode 100644 arch/ia64/include/asm/sal.h
 delete mode 100644 arch/ia64/include/asm/sections.h
 delete mode 100644 arch/ia64/include/asm/serial.h
 delete mode 100644 arch/ia64/include/asm/shmparam.h
 delete mode 100644 arch/ia64/include/asm/signal.h
 delete mode 100644 arch/ia64/include/asm/smp.h
 delete mode 100644 arch/ia64/include/asm/sn/intr.h
 delete mode 100644 arch/ia64/include/asm/sn/sn_sal.h
 delete mode 100644 arch/ia64/include/asm/sparsemem.h
 delete mode 100644 arch/ia64/include/asm/spinlock.h
 delete mode 100644 arch/ia64/include/asm/spinlock_types.h
 delete mode 100644 arch/ia64/include/asm/string.h
 delete mode 100644 arch/ia64/include/asm/switch_to.h
 delete mode 100644 arch/ia64/include/asm/syscall.h
 delete mode 100644 arch/ia64/include/asm/thread_info.h
 delete mode 100644 arch/ia64/include/asm/timex.h
 delete mode 100644 arch/ia64/include/asm/tlb.h
 delete mode 100644 arch/ia64/include/asm/tlbflush.h
 delete mode 100644 arch/ia64/include/asm/topology.h
 delete mode 100644 arch/ia64/include/asm/types.h
 delete mode 100644 arch/ia64/include/asm/uaccess.h
 delete mode 100644 arch/ia64/include/asm/uncached.h
 delete mode 100644 arch/ia64/include/asm/unistd.h
 delete mode 100644 arch/ia64/include/asm/unwind.h
 delete mode 100644 arch/ia64/include/asm/user.h
 delete mode 100644 arch/ia64/include/asm/ustack.h
 delete mode 100644 arch/ia64/include/asm/uv/uv.h
 delete mode 100644 arch/ia64/include/asm/uv/uv_hub.h
 delete mode 100644 arch/ia64/include/asm/uv/uv_mmrs.h
 delete mode 100644 arch/ia64/include/asm/vermagic.h
 delete mode 100644 arch/ia64/include/asm/vga.h
 delete mode 100644 arch/ia64/include/asm/vmalloc.h
 delete mode 100644 arch/ia64/include/asm/xor.h
 delete mode 100644 arch/ia64/include/asm/xtp.h
 delete mode 100644 arch/ia64/include/uapi/asm/Kbuild
 delete mode 100644 arch/ia64/include/uapi/asm/auxvec.h
 delete mode 100644 arch/ia64/include/uapi/asm/bitsperlong.h
 delete mode 100644 arch/ia64/include/uapi/asm/break.h
 delete mode 100644 arch/ia64/include/uapi/asm/byteorder.h
 delete mode 100644 arch/ia64/include/uapi/asm/cmpxchg.h
 delete mode 100644 arch/ia64/include/uapi/asm/fcntl.h
 delete mode 100644 arch/ia64/include/uapi/asm/fpu.h
 delete mode 100644 arch/ia64/include/uapi/asm/gcc_intrin.h
 delete mode 100644 arch/ia64/include/uapi/asm/ia64regs.h
 delete mode 100644 arch/ia64/include/uapi/asm/intrinsics.h
 delete mode 100644 arch/ia64/include/uapi/asm/mman.h
 delete mode 100644 arch/ia64/include/uapi/asm/param.h
 delete mode 100644 arch/ia64/include/uapi/asm/posix_types.h
 delete mode 100644 arch/ia64/include/uapi/asm/ptrace.h
 delete mode 100644 arch/ia64/include/uapi/asm/ptrace_offsets.h
 delete mode 100644 arch/ia64/include/uapi/asm/resource.h
 delete mode 100644 arch/ia64/include/uapi/asm/rse.h
 delete mode 100644 arch/ia64/include/uapi/asm/setup.h
 delete mode 100644 arch/ia64/include/uapi/asm/sigcontext.h
 delete mode 100644 arch/ia64/include/uapi/asm/siginfo.h
 delete mode 100644 arch/ia64/include/uapi/asm/signal.h
 delete mode 100644 arch/ia64/include/uapi/asm/stat.h
 delete mode 100644 arch/ia64/include/uapi/asm/statfs.h
 delete mode 100644 arch/ia64/include/uapi/asm/swab.h
 delete mode 100644 arch/ia64/include/uapi/asm/types.h
 delete mode 100644 arch/ia64/include/uapi/asm/ucontext.h
 delete mode 100644 arch/ia64/include/uapi/asm/unistd.h
 delete mode 100644 arch/ia64/include/uapi/asm/ustack.h
 delete mode 100755 arch/ia64/install.sh
 delete mode 100644 arch/ia64/kernel/.gitignore
 delete mode 100644 arch/ia64/kernel/Makefile
 delete mode 100644 arch/ia64/kernel/Makefile.gate
 delete mode 100644 arch/ia64/kernel/acpi-ext.c
 delete mode 100644 arch/ia64/kernel/acpi.c
 delete mode 100644 arch/ia64/kernel/asm-offsets.c
 delete mode 100644 arch/ia64/kernel/audit.c
 delete mode 100644 arch/ia64/kernel/brl_emu.c
 delete mode 100644 arch/ia64/kernel/crash.c
 delete mode 100644 arch/ia64/kernel/crash_dump.c
 delete mode 100644 arch/ia64/kernel/cyclone.c
 delete mode 100644 arch/ia64/kernel/dma-mapping.c
 delete mode 100644 arch/ia64/kernel/efi.c
 delete mode 100644 arch/ia64/kernel/efi_stub.S
 delete mode 100644 arch/ia64/kernel/elfcore.c
 delete mode 100644 arch/ia64/kernel/entry.S
 delete mode 100644 arch/ia64/kernel/entry.h
 delete mode 100644 arch/ia64/kernel/err_inject.c
 delete mode 100644 arch/ia64/kernel/esi.c
 delete mode 100644 arch/ia64/kernel/esi_stub.S
 delete mode 100644 arch/ia64/kernel/fsys.S
 delete mode 100644 arch/ia64/kernel/fsyscall_gtod_data.h
 delete mode 100644 arch/ia64/kernel/ftrace.c
 delete mode 100644 arch/ia64/kernel/gate-data.S
 delete mode 100644 arch/ia64/kernel/gate.S
 delete mode 100644 arch/ia64/kernel/gate.lds.S
 delete mode 100644 arch/ia64/kernel/head.S
 delete mode 100644 arch/ia64/kernel/iosapic.c
 delete mode 100644 arch/ia64/kernel/irq.c
 delete mode 100644 arch/ia64/kernel/irq.h
 delete mode 100644 arch/ia64/kernel/irq_ia64.c
 delete mode 100644 arch/ia64/kernel/irq_lsapic.c
 delete mode 100644 arch/ia64/kernel/ivt.S
 delete mode 100644 arch/ia64/kernel/kprobes.c
 delete mode 100644 arch/ia64/kernel/machine_kexec.c
 delete mode 100644 arch/ia64/kernel/mca.c
 delete mode 100644 arch/ia64/kernel/mca_asm.S
 delete mode 100644 arch/ia64/kernel/mca_drv.c
 delete mode 100644 arch/ia64/kernel/mca_drv.h
 delete mode 100644 arch/ia64/kernel/mca_drv_asm.S
 delete mode 100644 arch/ia64/kernel/minstate.h
 delete mode 100644 arch/ia64/kernel/module.c
 delete mode 100644 arch/ia64/kernel/msi_ia64.c
 delete mode 100644 arch/ia64/kernel/numa.c
 delete mode 100644 arch/ia64/kernel/pal.S
 delete mode 100644 arch/ia64/kernel/palinfo.c
 delete mode 100644 arch/ia64/kernel/patch.c
 delete mode 100644 arch/ia64/kernel/pci-dma.c
 delete mode 100644 arch/ia64/kernel/perfmon_itanium.h
 delete mode 100644 arch/ia64/kernel/process.c
 delete mode 100644 arch/ia64/kernel/ptrace.c
 delete mode 100644 arch/ia64/kernel/relocate_kernel.S
 delete mode 100644 arch/ia64/kernel/sal.c
 delete mode 100644 arch/ia64/kernel/salinfo.c
 delete mode 100644 arch/ia64/kernel/setup.c
 delete mode 100644 arch/ia64/kernel/sigframe.h
 delete mode 100644 arch/ia64/kernel/signal.c
 delete mode 100644 arch/ia64/kernel/smp.c
 delete mode 100644 arch/ia64/kernel/smpboot.c
 delete mode 100644 arch/ia64/kernel/stacktrace.c
 delete mode 100644 arch/ia64/kernel/sys_ia64.c
 delete mode 100644 arch/ia64/kernel/syscalls/Makefile
 delete mode 100644 arch/ia64/kernel/syscalls/syscall.tbl
 delete mode 100644 arch/ia64/kernel/time.c
 delete mode 100644 arch/ia64/kernel/topology.c
 delete mode 100644 arch/ia64/kernel/traps.c
 delete mode 100644 arch/ia64/kernel/unaligned.c
 delete mode 100644 arch/ia64/kernel/uncached.c
 delete mode 100644 arch/ia64/kernel/unwind.c
 delete mode 100644 arch/ia64/kernel/unwind_decoder.c
 delete mode 100644 arch/ia64/kernel/unwind_i.h
 delete mode 100644 arch/ia64/kernel/vmlinux.lds.S
 delete mode 100644 arch/ia64/lib/Makefile
 delete mode 100644 arch/ia64/lib/checksum.c
 delete mode 100644 arch/ia64/lib/clear_page.S
 delete mode 100644 arch/ia64/lib/clear_user.S
 delete mode 100644 arch/ia64/lib/copy_page.S
 delete mode 100644 arch/ia64/lib/copy_page_mck.S
 delete mode 100644 arch/ia64/lib/copy_user.S
 delete mode 100644 arch/ia64/lib/csum_partial_copy.c
 delete mode 100644 arch/ia64/lib/do_csum.S
 delete mode 100644 arch/ia64/lib/flush.S
 delete mode 100644 arch/ia64/lib/idiv32.S
 delete mode 100644 arch/ia64/lib/idiv64.S
 delete mode 100644 arch/ia64/lib/io.c
 delete mode 100644 arch/ia64/lib/ip_fast_csum.S
 delete mode 100644 arch/ia64/lib/memcpy.S
 delete mode 100644 arch/ia64/lib/memcpy_mck.S
 delete mode 100644 arch/ia64/lib/memset.S
 delete mode 100644 arch/ia64/lib/strlen.S
 delete mode 100644 arch/ia64/lib/strncpy_from_user.S
 delete mode 100644 arch/ia64/lib/strnlen_user.S
 delete mode 100644 arch/ia64/lib/xor.S
 delete mode 100644 arch/ia64/mm/Makefile
 delete mode 100644 arch/ia64/mm/contig.c
 delete mode 100644 arch/ia64/mm/discontig.c
 delete mode 100644 arch/ia64/mm/extable.c
 delete mode 100644 arch/ia64/mm/fault.c
 delete mode 100644 arch/ia64/mm/hugetlbpage.c
 delete mode 100644 arch/ia64/mm/init.c
 delete mode 100644 arch/ia64/mm/ioremap.c
 delete mode 100644 arch/ia64/mm/numa.c
 delete mode 100644 arch/ia64/mm/tlb.c
 delete mode 100644 arch/ia64/pci/Makefile
 delete mode 100644 arch/ia64/pci/fixup.c
 delete mode 100644 arch/ia64/pci/pci.c
 delete mode 100755 arch/ia64/scripts/check-gas
 delete mode 100644 arch/ia64/scripts/check-gas-asm.S
 delete mode 100644 arch/ia64/scripts/check-model.c
 delete mode 100644 arch/ia64/scripts/check-segrel.S
 delete mode 100644 arch/ia64/scripts/check-segrel.lds
 delete mode 100644 arch/ia64/scripts/check-serialize.S
 delete mode 100644 arch/ia64/scripts/check-text-align.S
 delete mode 100755 arch/ia64/scripts/toolchain-flags
 delete mode 100644 arch/ia64/scripts/unwcheck.py
 delete mode 100644 arch/ia64/uv/Makefile
 delete mode 100644 arch/ia64/uv/kernel/Makefile
 delete mode 100644 arch/ia64/uv/kernel/setup.c
 delete mode 100644 drivers/char/agp/hp-agp.c
 delete mode 100644 drivers/char/agp/i460-agp.c
 delete mode 100644 drivers/char/mspec.c
 delete mode 100644 drivers/cpufreq/ia64-acpi-cpufreq.c
 delete mode 100644 drivers/firmware/pcdp.c
 delete mode 100644 drivers/firmware/pcdp.h
 delete mode 100644 tools/arch/ia64/include/asm/barrier.h
 delete mode 100644 tools/arch/ia64/include/uapi/asm/bitsperlong.h
 delete mode 100644 tools/arch/ia64/include/uapi/asm/mman.h

(limited to 'include/linux')

diff --git a/Documentation/arch/ia64/aliasing.rst b/Documentation/arch/ia64/aliasing.rst
deleted file mode 100644
index 36a1e1d4842b..000000000000
--- a/Documentation/arch/ia64/aliasing.rst
+++ /dev/null
@@ -1,246 +0,0 @@
-==================================
-Memory Attribute Aliasing on IA-64
-==================================
-
-Bjorn Helgaas <bjorn.helgaas@hp.com>
-
-May 4, 2006
-
-
-Memory Attributes
-=================
-
-    Itanium supports several attributes for virtual memory references.
-    The attribute is part of the virtual translation, i.e., it is
-    contained in the TLB entry.  The ones of most interest to the Linux
-    kernel are:
-
-	==		======================
-        WB		Write-back (cacheable)
-	UC		Uncacheable
-	WC		Write-coalescing
-	==		======================
-
-    System memory typically uses the WB attribute.  The UC attribute is
-    used for memory-mapped I/O devices.  The WC attribute is uncacheable
-    like UC is, but writes may be delayed and combined to increase
-    performance for things like frame buffers.
-
-    The Itanium architecture requires that we avoid accessing the same
-    page with both a cacheable mapping and an uncacheable mapping[1].
-
-    The design of the chipset determines which attributes are supported
-    on which regions of the address space.  For example, some chipsets
-    support either WB or UC access to main memory, while others support
-    only WB access.
-
-Memory Map
-==========
-
-    Platform firmware describes the physical memory map and the
-    supported attributes for each region.  At boot-time, the kernel uses
-    the EFI GetMemoryMap() interface.  ACPI can also describe memory
-    devices and the attributes they support, but Linux/ia64 currently
-    doesn't use this information.
-
-    The kernel uses the efi_memmap table returned from GetMemoryMap() to
-    learn the attributes supported by each region of physical address
-    space.  Unfortunately, this table does not completely describe the
-    address space because some machines omit some or all of the MMIO
-    regions from the map.
-
-    The kernel maintains another table, kern_memmap, which describes the
-    memory Linux is actually using and the attribute for each region.
-    This contains only system memory; it does not contain MMIO space.
-
-    The kern_memmap table typically contains only a subset of the system
-    memory described by the efi_memmap.  Linux/ia64 can't use all memory
-    in the system because of constraints imposed by the identity mapping
-    scheme.
-
-    The efi_memmap table is preserved unmodified because the original
-    boot-time information is required for kexec.
-
-Kernel Identity Mappings
-========================
-
-    Linux/ia64 identity mappings are done with large pages, currently
-    either 16MB or 64MB, referred to as "granules."  Cacheable mappings
-    are speculative[2], so the processor can read any location in the
-    page at any time, independent of the programmer's intentions.  This
-    means that to avoid attribute aliasing, Linux can create a cacheable
-    identity mapping only when the entire granule supports cacheable
-    access.
-
-    Therefore, kern_memmap contains only full granule-sized regions that
-    can referenced safely by an identity mapping.
-
-    Uncacheable mappings are not speculative, so the processor will
-    generate UC accesses only to locations explicitly referenced by
-    software.  This allows UC identity mappings to cover granules that
-    are only partially populated, or populated with a combination of UC
-    and WB regions.
-
-User Mappings
-=============
-
-    User mappings are typically done with 16K or 64K pages.  The smaller
-    page size allows more flexibility because only 16K or 64K has to be
-    homogeneous with respect to memory attributes.
-
-Potential Attribute Aliasing Cases
-==================================
-
-    There are several ways the kernel creates new mappings:
-
-mmap of /dev/mem
-----------------
-
-	This uses remap_pfn_range(), which creates user mappings.  These
-	mappings may be either WB or UC.  If the region being mapped
-	happens to be in kern_memmap, meaning that it may also be mapped
-	by a kernel identity mapping, the user mapping must use the same
-	attribute as the kernel mapping.
-
-	If the region is not in kern_memmap, the user mapping should use
-	an attribute reported as being supported in the EFI memory map.
-
-	Since the EFI memory map does not describe MMIO on some
-	machines, this should use an uncacheable mapping as a fallback.
-
-mmap of /sys/class/pci_bus/.../legacy_mem
------------------------------------------
-
-	This is very similar to mmap of /dev/mem, except that legacy_mem
-	only allows mmap of the one megabyte "legacy MMIO" area for a
-	specific PCI bus.  Typically this is the first megabyte of
-	physical address space, but it may be different on machines with
-	several VGA devices.
-
-	"X" uses this to access VGA frame buffers.  Using legacy_mem
-	rather than /dev/mem allows multiple instances of X to talk to
-	different VGA cards.
-
-	The /dev/mem mmap constraints apply.
-
-mmap of /proc/bus/pci/.../??.?
-------------------------------
-
-	This is an MMIO mmap of PCI functions, which additionally may or
-	may not be requested as using the WC attribute.
-
-	If WC is requested, and the region in kern_memmap is either WC
-	or UC, and the EFI memory map designates the region as WC, then
-	the WC mapping is allowed.
-
-	Otherwise, the user mapping must use the same attribute as the
-	kernel mapping.
-
-read/write of /dev/mem
-----------------------
-
-	This uses copy_from_user(), which implicitly uses a kernel
-	identity mapping.  This is obviously safe for things in
-	kern_memmap.
-
-	There may be corner cases of things that are not in kern_memmap,
-	but could be accessed this way.  For example, registers in MMIO
-	space are not in kern_memmap, but could be accessed with a UC
-	mapping.  This would not cause attribute aliasing.  But
-	registers typically can be accessed only with four-byte or
-	eight-byte accesses, and the copy_from_user() path doesn't allow
-	any control over the access size, so this would be dangerous.
-
-ioremap()
----------
-
-	This returns a mapping for use inside the kernel.
-
-	If the region is in kern_memmap, we should use the attribute
-	specified there.
-
-	If the EFI memory map reports that the entire granule supports
-	WB, we should use that (granules that are partially reserved
-	or occupied by firmware do not appear in kern_memmap).
-
-	If the granule contains non-WB memory, but we can cover the
-	region safely with kernel page table mappings, we can use
-	ioremap_page_range() as most other architectures do.
-
-	Failing all of the above, we have to fall back to a UC mapping.
-
-Past Problem Cases
-==================
-
-mmap of various MMIO regions from /dev/mem by "X" on Intel platforms
---------------------------------------------------------------------
-
-      The EFI memory map may not report these MMIO regions.
-
-      These must be allowed so that X will work.  This means that
-      when the EFI memory map is incomplete, every /dev/mem mmap must
-      succeed.  It may create either WB or UC user mappings, depending
-      on whether the region is in kern_memmap or the EFI memory map.
-
-mmap of 0x0-0x9FFFF /dev/mem by "hwinfo" on HP sx1000 with VGA enabled
-----------------------------------------------------------------------
-
-      The EFI memory map reports the following attributes:
-
-        =============== ======= ==================
-        0x00000-0x9FFFF WB only
-        0xA0000-0xBFFFF UC only (VGA frame buffer)
-        0xC0000-0xFFFFF WB only
-        =============== ======= ==================
-
-      This mmap is done with user pages, not kernel identity mappings,
-      so it is safe to use WB mappings.
-
-      The kernel VGA driver may ioremap the VGA frame buffer at 0xA0000,
-      which uses a granule-sized UC mapping.  This granule will cover some
-      WB-only memory, but since UC is non-speculative, the processor will
-      never generate an uncacheable reference to the WB-only areas unless
-      the driver explicitly touches them.
-
-mmap of 0x0-0xFFFFF legacy_mem by "X"
--------------------------------------
-
-      If the EFI memory map reports that the entire range supports the
-      same attributes, we can allow the mmap (and we will prefer WB if
-      supported, as is the case with HP sx[12]000 machines with VGA
-      disabled).
-
-      If EFI reports the range as partly WB and partly UC (as on sx[12]000
-      machines with VGA enabled), we must fail the mmap because there's no
-      safe attribute to use.
-
-      If EFI reports some of the range but not all (as on Intel firmware
-      that doesn't report the VGA frame buffer at all), we should fail the
-      mmap and force the user to map just the specific region of interest.
-
-mmap of 0xA0000-0xBFFFF legacy_mem by "X" on HP sx1000 with VGA disabled
-------------------------------------------------------------------------
-
-      The EFI memory map reports the following attributes::
-
-        0x00000-0xFFFFF WB only (no VGA MMIO hole)
-
-      This is a special case of the previous case, and the mmap should
-      fail for the same reason as above.
-
-read of /sys/devices/.../rom
-----------------------------
-
-      For VGA devices, this may cause an ioremap() of 0xC0000.  This
-      used to be done with a UC mapping, because the VGA frame buffer
-      at 0xA0000 prevents use of a WB granule.  The UC mapping causes
-      an MCA on HP sx[12]000 chipsets.
-
-      We should use WB page table mappings to avoid covering the VGA
-      frame buffer.
-
-Notes
-=====
-
-    [1] SDM rev 2.2, vol 2, sec 4.4.1.
-    [2] SDM rev 2.2, vol 2, sec 4.4.6.
diff --git a/Documentation/arch/ia64/efirtc.rst b/Documentation/arch/ia64/efirtc.rst
deleted file mode 100644
index fd8328408301..000000000000
--- a/Documentation/arch/ia64/efirtc.rst
+++ /dev/null
@@ -1,144 +0,0 @@
-==========================
-EFI Real Time Clock driver
-==========================
-
-S. Eranian <eranian@hpl.hp.com>
-
-March 2000
-
-1. Introduction
-===============
-
-This document describes the efirtc.c driver has provided for
-the IA-64 platform.
-
-The purpose of this driver is to supply an API for kernel and user applications
-to get access to the Time Service offered by EFI version 0.92.
-
-EFI provides 4 calls one can make once the OS is booted: GetTime(),
-SetTime(), GetWakeupTime(), SetWakeupTime() which are all supported by this
-driver. We describe those calls as well the design of the driver in the
-following sections.
-
-2. Design Decisions
-===================
-
-The original ideas was to provide a very simple driver to get access to,
-at first, the time of day service. This is required in order to access, in a
-portable way, the CMOS clock. A program like /sbin/hwclock uses such a clock
-to initialize the system view of the time during boot.
-
-Because we wanted to minimize the impact on existing user-level apps using
-the CMOS clock, we decided to expose an API that was very similar to the one
-used today with the legacy RTC driver (driver/char/rtc.c). However, because
-EFI provides a simpler services, not all ioctl() are available. Also
-new ioctl()s have been introduced for things that EFI provides but not the
-legacy.
-
-EFI uses a slightly different way of representing the time, noticeably
-the reference date is different. Year is the using the full 4-digit format.
-The Epoch is January 1st 1998. For backward compatibility reasons we don't
-expose this new way of representing time. Instead we use something very
-similar to the struct tm, i.e. struct rtc_time, as used by hwclock.
-One of the reasons for doing it this way is to allow for EFI to still evolve
-without necessarily impacting any of the user applications. The decoupling
-enables flexibility and permits writing wrapper code is ncase things change.
-
-The driver exposes two interfaces, one via the device file and a set of
-ioctl()s. The other is read-only via the /proc filesystem.
-
-As of today we don't offer a /proc/sys interface.
-
-To allow for a uniform interface between the legacy RTC and EFI time service,
-we have created the include/linux/rtc.h header file to contain only the
-"public" API of the two drivers.  The specifics of the legacy RTC are still
-in include/linux/mc146818rtc.h.
-
-
-3. Time of day service
-======================
-
-The part of the driver gives access to the time of day service of EFI.
-Two ioctl()s, compatible with the legacy RTC calls:
-
-	Read the CMOS clock::
-
-		ioctl(d, RTC_RD_TIME, &rtc);
-
-	Write the CMOS clock::
-
-		ioctl(d, RTC_SET_TIME, &rtc);
-
-The rtc is a pointer to a data structure defined in rtc.h which is close
-to a struct tm::
-
-  struct rtc_time {
-          int tm_sec;
-          int tm_min;
-          int tm_hour;
-          int tm_mday;
-          int tm_mon;
-          int tm_year;
-          int tm_wday;
-          int tm_yday;
-          int tm_isdst;
-  };
-
-The driver takes care of converting back an forth between the EFI time and
-this format.
-
-Those two ioctl()s can be exercised with the hwclock command:
-
-For reading::
-
-	# /sbin/hwclock --show
-	Mon Mar  6 15:32:32 2000  -0.910248 seconds
-
-For setting::
-
-	# /sbin/hwclock --systohc
-
-Root privileges are required to be able to set the time of day.
-
-4. Wakeup Alarm service
-=======================
-
-EFI provides an API by which one can program when a machine should wakeup,
-i.e. reboot. This is very different from the alarm provided by the legacy
-RTC which is some kind of interval timer alarm. For this reason we don't use
-the same ioctl()s to get access to the service. Instead we have
-introduced 2 news ioctl()s to the interface of an RTC.
-
-We have added 2 new ioctl()s that are specific to the EFI driver:
-
-	Read the current state of the alarm::
-
-		ioctl(d, RTC_WKALM_RD, &wkt)
-
-	Set the alarm or change its status::
-
-		ioctl(d, RTC_WKALM_SET, &wkt)
-
-The wkt structure encapsulates a struct rtc_time + 2 extra fields to get
-status information::
-
-  struct rtc_wkalrm {
-
-          unsigned char enabled; /* =1 if alarm is enabled */
-          unsigned char pending; /* =1 if alarm is pending  */
-
-          struct rtc_time time;
-  }
-
-As of today, none of the existing user-level apps supports this feature.
-However writing such a program should be hard by simply using those two
-ioctl().
-
-Root privileges are required to be able to set the alarm.
-
-5. References
-=============
-
-Checkout the following Web site for more information on EFI:
-
-http://developer.intel.com/technology/efi/
diff --git a/Documentation/arch/ia64/err_inject.rst b/Documentation/arch/ia64/err_inject.rst
deleted file mode 100644
index 900f71e93a29..000000000000
--- a/Documentation/arch/ia64/err_inject.rst
+++ /dev/null
@@ -1,1067 +0,0 @@
-========================================
-IPF Machine Check (MC) error inject tool
-========================================
-
-IPF Machine Check (MC) error inject tool is used to inject MC
-errors from Linux. The tool is a test bed for IPF MC work flow including
-hardware correctable error handling, OS recoverable error handling, MC
-event logging, etc.
-
-The tool includes two parts: a kernel driver and a user application
-sample. The driver provides interface to PAL to inject error
-and query error injection capabilities. The driver code is in
-arch/ia64/kernel/err_inject.c. The application sample (shown below)
-provides a combination of various errors and calls the driver's interface
-(sysfs interface) to inject errors or query error injection capabilities.
-
-The tool can be used to test Intel IPF machine MC handling capabilities.
-It's especially useful for people who can not access hardware MC injection
-tool to inject error. It's also very useful to integrate with other
-software test suits to do stressful testing on IPF.
-
-Below is a sample application as part of the whole tool. The sample
-can be used as a working test tool. Or it can be expanded to include
-more features. It also can be a integrated into a library or other user
-application to have more thorough test.
-
-The sample application takes err.conf as error configuration input. GCC
-compiles the code. After you install err_inject driver, you can run
-this sample application to inject errors.
-
-Errata: Itanium 2 Processors Specification Update lists some errata against
-the pal_mc_error_inject PAL procedure. The following err.conf has been tested
-on latest Montecito PAL.
-
-err.conf::
-
-  #This is configuration file for err_inject_tool.
-  #The format of the each line is:
-  #cpu, loop, interval, err_type_info, err_struct_info, err_data_buffer
-  #where
-  #	cpu: logical cpu number the error will be inject in.
-  #	loop: times the error will be injected.
-  #	interval: In second. every so often one error is injected.
-  #	err_type_info, err_struct_info: PAL parameters.
-  #
-  #Note: All values are hex w/o or w/ 0x prefix.
-
-
-  #On cpu2, inject only total 0x10 errors, interval 5 seconds
-  #corrected, data cache, hier-2, physical addr(assigned by tool code).
-  #working on Montecito latest PAL.
-  2, 10, 5, 4101, 95
-
-  #On cpu4, inject and consume total 0x10 errors, interval 5 seconds
-  #corrected, data cache, hier-2, physical addr(assigned by tool code).
-  #working on Montecito latest PAL.
-  4, 10, 5, 4109, 95
-
-  #On cpu15, inject and consume total 0x10 errors, interval 5 seconds
-  #recoverable, DTR0, hier-2.
-  #working on Montecito latest PAL.
-  0xf, 0x10, 5, 4249, 15
-
-The sample application source code:
-
-err_injection_tool.c::
-
-  /*
-   * This program is free software; you can redistribute it and/or modify
-   * it under the terms of the GNU General Public License as published by
-   * the Free Software Foundation; either version 2 of the License, or
-   * (at your option) any later version.
-   *
-   * This program is distributed in the hope that it will be useful, but
-   * WITHOUT ANY WARRANTY; without even the implied warranty of
-   * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
-   * NON INFRINGEMENT.  See the GNU General Public License for more
-   * details.
-   *
-   * You should have received a copy of the GNU General Public License
-   * along with this program; if not, write to the Free Software
-   * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
-   *
-   * Copyright (C) 2006 Intel Co
-   *	Fenghua Yu <fenghua.yu@intel.com>
-   *
-   */
-  #include <sys/types.h>
-  #include <sys/stat.h>
-  #include <fcntl.h>
-  #include <stdio.h>
-  #include <sched.h>
-  #include <unistd.h>
-  #include <stdlib.h>
-  #include <stdarg.h>
-  #include <string.h>
-  #include <errno.h>
-  #include <time.h>
-  #include <sys/ipc.h>
-  #include <sys/sem.h>
-  #include <sys/wait.h>
-  #include <sys/mman.h>
-  #include <sys/shm.h>
-
-  #define MAX_FN_SIZE 		256
-  #define MAX_BUF_SIZE 		256
-  #define DATA_BUF_SIZE 		256
-  #define NR_CPUS 		512
-  #define MAX_TASK_NUM		2048
-  #define MIN_INTERVAL		5	// seconds
-  #define	ERR_DATA_BUFFER_SIZE 	3	// Three 8-byte.
-  #define PARA_FIELD_NUM		5
-  #define MASK_SIZE		(NR_CPUS/64)
-  #define PATH_FORMAT "/sys/devices/system/cpu/cpu%d/err_inject/"
-
-  int sched_setaffinity(pid_t pid, unsigned int len, unsigned long *mask);
-
-  int verbose;
-  #define vbprintf if (verbose) printf
-
-  int log_info(int cpu, const char *fmt, ...)
-  {
-	FILE *log;
-	char fn[MAX_FN_SIZE];
-	char buf[MAX_BUF_SIZE];
-	va_list args;
-
-	sprintf(fn, "%d.log", cpu);
-	log=fopen(fn, "a+");
-	if (log==NULL) {
-		perror("Error open:");
-		return -1;
-	}
-
-	va_start(args, fmt);
-	vprintf(fmt, args);
-	memset(buf, 0, MAX_BUF_SIZE);
-	vsprintf(buf, fmt, args);
-	va_end(args);
-
-	fwrite(buf, sizeof(buf), 1, log);
-	fclose(log);
-
-	return 0;
-  }
-
-  typedef unsigned long u64;
-  typedef unsigned int  u32;
-
-  typedef union err_type_info_u {
-	struct {
-		u64	mode		: 3,	/* 0-2 */
-			err_inj		: 3,	/* 3-5 */
-			err_sev		: 2,	/* 6-7 */
-			err_struct	: 5,	/* 8-12 */
-			struct_hier	: 3,	/* 13-15 */
-			reserved	: 48;	/* 16-63 */
-	} err_type_info_u;
-	u64	err_type_info;
-  } err_type_info_t;
-
-  typedef union err_struct_info_u {
-	struct {
-		u64	siv		: 1,	/* 0	 */
-			c_t		: 2,	/* 1-2	 */
-			cl_p		: 3,	/* 3-5	 */
-			cl_id		: 3,	/* 6-8	 */
-			cl_dp		: 1,	/* 9	 */
-			reserved1	: 22,	/* 10-31 */
-			tiv		: 1,	/* 32	 */
-			trigger		: 4,	/* 33-36 */
-			trigger_pl 	: 3,	/* 37-39 */
-			reserved2 	: 24;	/* 40-63 */
-	} err_struct_info_cache;
-	struct {
-		u64	siv		: 1,	/* 0	 */
-			tt		: 2,	/* 1-2	 */
-			tc_tr		: 2,	/* 3-4	 */
-			tr_slot		: 8,	/* 5-12	 */
-			reserved1	: 19,	/* 13-31 */
-			tiv		: 1,	/* 32	 */
-			trigger		: 4,	/* 33-36 */
-			trigger_pl 	: 3,	/* 37-39 */
-			reserved2 	: 24;	/* 40-63 */
-	} err_struct_info_tlb;
-	struct {
-		u64	siv		: 1,	/* 0	 */
-			regfile_id	: 4,	/* 1-4	 */
-			reg_num		: 7,	/* 5-11	 */
-			reserved1	: 20,	/* 12-31 */
-			tiv		: 1,	/* 32	 */
-			trigger		: 4,	/* 33-36 */
-			trigger_pl 	: 3,	/* 37-39 */
-			reserved2 	: 24;	/* 40-63 */
-	} err_struct_info_register;
-	struct {
-		u64	reserved;
-	} err_struct_info_bus_processor_interconnect;
-	u64	err_struct_info;
-  } err_struct_info_t;
-
-  typedef union err_data_buffer_u {
-	struct {
-		u64	trigger_addr;		/* 0-63		*/
-		u64	inj_addr;		/* 64-127 	*/
-		u64	way		: 5,	/* 128-132	*/
-			index		: 20,	/* 133-152	*/
-					: 39;	/* 153-191	*/
-	} err_data_buffer_cache;
-	struct {
-		u64	trigger_addr;		/* 0-63		*/
-		u64	inj_addr;		/* 64-127 	*/
-		u64	way		: 5,	/* 128-132	*/
-			index		: 20,	/* 133-152	*/
-			reserved	: 39;	/* 153-191	*/
-	} err_data_buffer_tlb;
-	struct {
-		u64	trigger_addr;		/* 0-63		*/
-	} err_data_buffer_register;
-	struct {
-		u64	reserved;		/* 0-63		*/
-	} err_data_buffer_bus_processor_interconnect;
-	u64 err_data_buffer[ERR_DATA_BUFFER_SIZE];
-  } err_data_buffer_t;
-
-  typedef union capabilities_u {
-	struct {
-		u64	i		: 1,
-			d		: 1,
-			rv		: 1,
-			tag		: 1,
-			data		: 1,
-			mesi		: 1,
-			dp		: 1,
-			reserved1	: 3,
-			pa		: 1,
-			va		: 1,
-			wi		: 1,
-			reserved2	: 20,
-			trigger		: 1,
-			trigger_pl	: 1,
-			reserved3	: 30;
-	} capabilities_cache;
-	struct {
-		u64	d		: 1,
-			i		: 1,
-			rv		: 1,
-			tc		: 1,
-			tr		: 1,
-			reserved1	: 27,
-			trigger		: 1,
-			trigger_pl	: 1,
-			reserved2	: 30;
-	} capabilities_tlb;
-	struct {
-		u64	gr_b0		: 1,
-			gr_b1		: 1,
-			fr		: 1,
-			br		: 1,
-			pr		: 1,
-			ar		: 1,
-			cr		: 1,
-			rr		: 1,
-			pkr		: 1,
-			dbr		: 1,
-			ibr		: 1,
-			pmc		: 1,
-			pmd		: 1,
-			reserved1	: 3,
-			regnum		: 1,
-			reserved2	: 15,
-			trigger		: 1,
-			trigger_pl	: 1,
-			reserved3	: 30;
-	} capabilities_register;
-	struct {
-		u64	reserved;
-	} capabilities_bus_processor_interconnect;
-  } capabilities_t;
-
-  typedef struct resources_s {
-	u64	ibr0		: 1,
-		ibr2		: 1,
-		ibr4		: 1,
-		ibr6		: 1,
-		dbr0		: 1,
-		dbr2		: 1,
-		dbr4		: 1,
-		dbr6		: 1,
-		reserved	: 48;
-  } resources_t;
-
-
-  long get_page_size(void)
-  {
-	long page_size=sysconf(_SC_PAGESIZE);
-	return page_size;
-  }
-
-  #define PAGE_SIZE (get_page_size()==-1?0x4000:get_page_size())
-  #define SHM_SIZE (2*PAGE_SIZE*NR_CPUS)
-  #define SHM_VA 0x2000000100000000
-
-  int shmid;
-  void *shmaddr;
-
-  int create_shm(void)
-  {
-	key_t key;
-	char fn[MAX_FN_SIZE];
-
-	/* cpu0 is always existing */
-	sprintf(fn, PATH_FORMAT, 0);
-	if ((key = ftok(fn, 's')) == -1) {
-		perror("ftok");
-		return -1;
-	}
-
-	shmid = shmget(key, SHM_SIZE, 0644 | IPC_CREAT);
-	if (shmid == -1) {
-		if (errno==EEXIST) {
-			shmid = shmget(key, SHM_SIZE, 0);
-			if (shmid == -1) {
-				perror("shmget");
-				return -1;
-			}
-		}
-		else {
-			perror("shmget");
-			return -1;
-		}
-	}
-	vbprintf("shmid=%d", shmid);
-
-	/* connect to the segment: */
-	shmaddr = shmat(shmid, (void *)SHM_VA, 0);
-	if (shmaddr == (void*)-1) {
-		perror("shmat");
-		return -1;
-	}
-
-	memset(shmaddr, 0, SHM_SIZE);
-	mlock(shmaddr, SHM_SIZE);
-
-	return 0;
-  }
-
-  int free_shm()
-  {
-	munlock(shmaddr, SHM_SIZE);
-          shmdt(shmaddr);
-	semctl(shmid, 0, IPC_RMID);
-
-	return 0;
-  }
-
-  #ifdef _SEM_SEMUN_UNDEFINED
-  union semun
-  {
-	int val;
-	struct semid_ds *buf;
-	unsigned short int *array;
-	struct seminfo *__buf;
-  };
-  #endif
-
-  u32 mode=1; /* 1: physical mode; 2: virtual mode. */
-  int one_lock=1;
-  key_t key[NR_CPUS];
-  int semid[NR_CPUS];
-
-  int create_sem(int cpu)
-  {
-	union semun arg;
-	char fn[MAX_FN_SIZE];
-	int sid;
-
-	sprintf(fn, PATH_FORMAT, cpu);
-	sprintf(fn, "%s/%s", fn, "err_type_info");
-	if ((key[cpu] = ftok(fn, 'e')) == -1) {
-		perror("ftok");
-		return -1;
-	}
-
-	if (semid[cpu]!=0)
-		return 0;
-
-	/* clear old semaphore */
-	if ((sid = semget(key[cpu], 1, 0)) != -1)
-		semctl(sid, 0, IPC_RMID);
-
-	/* get one semaphore */
-	if ((semid[cpu] = semget(key[cpu], 1, IPC_CREAT | IPC_EXCL)) == -1) {
-		perror("semget");
-		printf("Please remove semaphore with key=0x%lx, then run the tool.\n",
-			(u64)key[cpu]);
-		return -1;
-	}
-
-	vbprintf("semid[%d]=0x%lx, key[%d]=%lx\n",cpu,(u64)semid[cpu],cpu,
-		(u64)key[cpu]);
-	/* initialize the semaphore to 1: */
-	arg.val = 1;
-	if (semctl(semid[cpu], 0, SETVAL, arg) == -1) {
-		perror("semctl");
-		return -1;
-	}
-
-	return 0;
-  }
-
-  static int lock(int cpu)
-  {
-	struct sembuf lock;
-
-	lock.sem_num = cpu;
-	lock.sem_op = 1;
-	semop(semid[cpu], &lock, 1);
-
-          return 0;
-  }
-
-  static int unlock(int cpu)
-  {
-	struct sembuf unlock;
-
-	unlock.sem_num = cpu;
-	unlock.sem_op = -1;
-	semop(semid[cpu], &unlock, 1);
-
-          return 0;
-  }
-
-  void free_sem(int cpu)
-  {
-	semctl(semid[cpu], 0, IPC_RMID);
-  }
-
-  int wr_multi(char *fn, unsigned long *data, int size)
-  {
-	int fd;
-	char buf[MAX_BUF_SIZE];
-	int ret;
-
-	if (size==1)
-		sprintf(buf, "%lx", *data);
-	else if (size==3)
-		sprintf(buf, "%lx,%lx,%lx", data[0], data[1], data[2]);
-	else {
-		fprintf(stderr,"write to file with wrong size!\n");
-		return -1;
-	}
-
-	fd=open(fn, O_RDWR);
-	if (!fd) {
-		perror("Error:");
-		return -1;
-	}
-	ret=write(fd, buf, sizeof(buf));
-	close(fd);
-	return ret;
-  }
-
-  int wr(char *fn, unsigned long data)
-  {
-	return wr_multi(fn, &data, 1);
-  }
-
-  int rd(char *fn, unsigned long *data)
-  {
-	int fd;
-	char buf[MAX_BUF_SIZE];
-
-	fd=open(fn, O_RDONLY);
-	if (fd<0) {
-		perror("Error:");
-		return -1;
-	}
-	read(fd, buf, MAX_BUF_SIZE);
-	*data=strtoul(buf, NULL, 16);
-	close(fd);
-	return 0;
-  }
-
-  int rd_status(char *path, int *status)
-  {
-	char fn[MAX_FN_SIZE];
-	sprintf(fn, "%s/status", path);
-	if (rd(fn, (u64*)status)<0) {
-		perror("status reading error.\n");
-		return -1;
-	}
-
-	return 0;
-  }
-
-  int rd_capabilities(char *path, u64 *capabilities)
-  {
-	char fn[MAX_FN_SIZE];
-	sprintf(fn, "%s/capabilities", path);
-	if (rd(fn, capabilities)<0) {
-		perror("capabilities reading error.\n");
-		return -1;
-	}
-
-	return 0;
-  }
-
-  int rd_all(char *path)
-  {
-	unsigned long err_type_info, err_struct_info, err_data_buffer;
-	int status;
-	unsigned long capabilities, resources;
-	char fn[MAX_FN_SIZE];
-
-	sprintf(fn, "%s/err_type_info", path);
-	if (rd(fn, &err_type_info)<0) {
-		perror("err_type_info reading error.\n");
-		return -1;
-	}
-	printf("err_type_info=%lx\n", err_type_info);
-
-	sprintf(fn, "%s/err_struct_info", path);
-	if (rd(fn, &err_struct_info)<0) {
-		perror("err_struct_info reading error.\n");
-		return -1;
-	}
-	printf("err_struct_info=%lx\n", err_struct_info);
-
-	sprintf(fn, "%s/err_data_buffer", path);
-	if (rd(fn, &err_data_buffer)<0) {
-		perror("err_data_buffer reading error.\n");
-		return -1;
-	}
-	printf("err_data_buffer=%lx\n", err_data_buffer);
-
-	sprintf(fn, "%s/status", path);
-	if (rd("status", (u64*)&status)<0) {
-		perror("status reading error.\n");
-		return -1;
-	}
-	printf("status=%d\n", status);
-
-	sprintf(fn, "%s/capabilities", path);
-	if (rd(fn,&capabilities)<0) {
-		perror("capabilities reading error.\n");
-		return -1;
-	}
-	printf("capabilities=%lx\n", capabilities);
-
-	sprintf(fn, "%s/resources", path);
-	if (rd(fn, &resources)<0) {
-		perror("resources reading error.\n");
-		return -1;
-	}
-	printf("resources=%lx\n", resources);
-
-	return 0;
-  }
-
-  int query_capabilities(char *path, err_type_info_t err_type_info,
-			u64 *capabilities)
-  {
-	char fn[MAX_FN_SIZE];
-	err_struct_info_t err_struct_info;
-	err_data_buffer_t err_data_buffer;
-
-	err_struct_info.err_struct_info=0;
-	memset(err_data_buffer.err_data_buffer, -1, ERR_DATA_BUFFER_SIZE*8);
-
-	sprintf(fn, "%s/err_type_info", path);
-	wr(fn, err_type_info.err_type_info);
-	sprintf(fn, "%s/err_struct_info", path);
-	wr(fn, 0x0);
-	sprintf(fn, "%s/err_data_buffer", path);
-	wr_multi(fn, err_data_buffer.err_data_buffer, ERR_DATA_BUFFER_SIZE);
-
-	// Fire pal_mc_error_inject procedure.
-	sprintf(fn, "%s/call_start", path);
-	wr(fn, mode);
-
-	if (rd_capabilities(path, capabilities)<0)
-		return -1;
-
-	return 0;
-  }
-
-  int query_all_capabilities()
-  {
-	int status;
-	err_type_info_t err_type_info;
-	int err_sev, err_struct, struct_hier;
-	int cap=0;
-	u64 capabilities;
-	char path[MAX_FN_SIZE];
-
-	err_type_info.err_type_info=0;			// Initial
-	err_type_info.err_type_info_u.mode=0;		// Query mode;
-	err_type_info.err_type_info_u.err_inj=0;
-
-	printf("All capabilities implemented in pal_mc_error_inject:\n");
-	sprintf(path, PATH_FORMAT ,0);
-	for (err_sev=0;err_sev<3;err_sev++)
-		for (err_struct=0;err_struct<5;err_struct++)
-			for (struct_hier=0;struct_hier<5;struct_hier++)
-	{
-		status=-1;
-		capabilities=0;
-		err_type_info.err_type_info_u.err_sev=err_sev;
-		err_type_info.err_type_info_u.err_struct=err_struct;
-		err_type_info.err_type_info_u.struct_hier=struct_hier;
-
-		if (query_capabilities(path, err_type_info, &capabilities)<0)
-			continue;
-
-		if (rd_status(path, &status)<0)
-			continue;
-
-		if (status==0) {
-			cap=1;
-			printf("For err_sev=%d, err_struct=%d, struct_hier=%d: ",
-				err_sev, err_struct, struct_hier);
-			printf("capabilities 0x%lx\n", capabilities);
-		}
-	}
-	if (!cap) {
-		printf("No capabilities supported.\n");
-		return 0;
-	}
-
-	return 0;
-  }
-
-  int err_inject(int cpu, char *path, err_type_info_t err_type_info,
-		err_struct_info_t err_struct_info,
-		err_data_buffer_t err_data_buffer)
-  {
-	int status;
-	char fn[MAX_FN_SIZE];
-
-	log_info(cpu, "err_type_info=%lx, err_struct_info=%lx, ",
-		err_type_info.err_type_info,
-		err_struct_info.err_struct_info);
-	log_info(cpu,"err_data_buffer=[%lx,%lx,%lx]\n",
-		err_data_buffer.err_data_buffer[0],
-		err_data_buffer.err_data_buffer[1],
-		err_data_buffer.err_data_buffer[2]);
-	sprintf(fn, "%s/err_type_info", path);
-	wr(fn, err_type_info.err_type_info);
-	sprintf(fn, "%s/err_struct_info", path);
-	wr(fn, err_struct_info.err_struct_info);
-	sprintf(fn, "%s/err_data_buffer", path);
-	wr_multi(fn, err_data_buffer.err_data_buffer, ERR_DATA_BUFFER_SIZE);
-
-	// Fire pal_mc_error_inject procedure.
-	sprintf(fn, "%s/call_start", path);
-	wr(fn,mode);
-
-	if (rd_status(path, &status)<0) {
-		vbprintf("fail: read status\n");
-		return -100;
-	}
-
-	if (status!=0) {
-		log_info(cpu, "fail: status=%d\n", status);
-		return status;
-	}
-
-	return status;
-  }
-
-  static int construct_data_buf(char *path, err_type_info_t err_type_info,
-		err_struct_info_t err_struct_info,
-		err_data_buffer_t *err_data_buffer,
-		void *va1)
-  {
-	char fn[MAX_FN_SIZE];
-	u64 virt_addr=0, phys_addr=0;
-
-	vbprintf("va1=%lx\n", (u64)va1);
-	memset(&err_data_buffer->err_data_buffer_cache, 0, ERR_DATA_BUFFER_SIZE*8);
-
-	switch (err_type_info.err_type_info_u.err_struct) {
-		case 1: // Cache
-			switch (err_struct_info.err_struct_info_cache.cl_id) {
-				case 1: //Virtual addr
-					err_data_buffer->err_data_buffer_cache.inj_addr=(u64)va1;
-					break;
-				case 2: //Phys addr
-					sprintf(fn, "%s/virtual_to_phys", path);
-					virt_addr=(u64)va1;
-					if (wr(fn,virt_addr)<0)
-						return -1;
-					rd(fn, &phys_addr);
-					err_data_buffer->err_data_buffer_cache.inj_addr=phys_addr;
-					break;
-				default:
-					printf("Not supported cl_id\n");
-					break;
-			}
-			break;
-		case 2: //  TLB
-			break;
-		case 3: //  Register file
-			break;
-		case 4: //  Bus/system interconnect
-		default:
-			printf("Not supported err_struct\n");
-			break;
-	}
-
-	return 0;
-  }
-
-  typedef struct {
-	u64 cpu;
-	u64 loop;
-	u64 interval;
-	u64 err_type_info;
-	u64 err_struct_info;
-	u64 err_data_buffer[ERR_DATA_BUFFER_SIZE];
-  } parameters_t;
-
-  parameters_t line_para;
-  int para;
-
-  static int empty_data_buffer(u64 *err_data_buffer)
-  {
-	int empty=1;
-	int i;
-
-	for (i=0;i<ERR_DATA_BUFFER_SIZE; i++)
-	   if (err_data_buffer[i]!=-1)
-		empty=0;
-
-	return empty;
-  }
-
-  int err_inj()
-  {
-	err_type_info_t err_type_info;
-	err_struct_info_t err_struct_info;
-	err_data_buffer_t err_data_buffer;
-	int count;
-	FILE *fp;
-	unsigned long cpu, loop, interval, err_type_info_conf, err_struct_info_conf;
-	u64 err_data_buffer_conf[ERR_DATA_BUFFER_SIZE];
-	int num;
-	int i;
-	char path[MAX_FN_SIZE];
-	parameters_t parameters[MAX_TASK_NUM]={};
-	pid_t child_pid[MAX_TASK_NUM];
-	time_t current_time;
-	int status;
-
-	if (!para) {
-	    fp=fopen("err.conf", "r");
-	    if (fp==NULL) {
-		perror("Error open err.conf");
-		return -1;
-	    }
-
-	    num=0;
-	    while (!feof(fp)) {
-		char buf[256];
-		memset(buf,0,256);
-		fgets(buf, 256, fp);
-		count=sscanf(buf, "%lx, %lx, %lx, %lx, %lx, %lx, %lx, %lx\n",
-				&cpu, &loop, &interval,&err_type_info_conf,
-				&err_struct_info_conf,
-				&err_data_buffer_conf[0],
-				&err_data_buffer_conf[1],
-				&err_data_buffer_conf[2]);
-		if (count!=PARA_FIELD_NUM+3) {
-			err_data_buffer_conf[0]=-1;
-			err_data_buffer_conf[1]=-1;
-			err_data_buffer_conf[2]=-1;
-			count=sscanf(buf, "%lx, %lx, %lx, %lx, %lx\n",
-				&cpu, &loop, &interval,&err_type_info_conf,
-				&err_struct_info_conf);
-			if (count!=PARA_FIELD_NUM)
-				continue;
-		}
-
-		parameters[num].cpu=cpu;
-		parameters[num].loop=loop;
-		parameters[num].interval= interval>MIN_INTERVAL
-					  ?interval:MIN_INTERVAL;
-		parameters[num].err_type_info=err_type_info_conf;
-		parameters[num].err_struct_info=err_struct_info_conf;
-		memcpy(parameters[num++].err_data_buffer,
-			err_data_buffer_conf,ERR_DATA_BUFFER_SIZE*8) ;
-
-		if (num>=MAX_TASK_NUM)
-			break;
-	    }
-	}
-	else {
-		parameters[0].cpu=line_para.cpu;
-		parameters[0].loop=line_para.loop;
-		parameters[0].interval= line_para.interval>MIN_INTERVAL
-					  ?line_para.interval:MIN_INTERVAL;
-		parameters[0].err_type_info=line_para.err_type_info;
-		parameters[0].err_struct_info=line_para.err_struct_info;
-		memcpy(parameters[0].err_data_buffer,
-			line_para.err_data_buffer,ERR_DATA_BUFFER_SIZE*8) ;
-
-		num=1;
-	}
-
-	/* Create semaphore: If one_lock, one semaphore for all processors.
-	   Otherwise, one semaphore for each processor. */
-	if (one_lock) {
-		if (create_sem(0)) {
-			printf("Can not create semaphore...exit\n");
-			free_sem(0);
-			return -1;
-		}
-	}
-	else {
-		for (i=0;i<num;i++) {
-		   if (create_sem(parameters[i].cpu)) {
-			printf("Can not create semaphore for cpu%d...exit\n",i);
-			free_sem(parameters[num].cpu);
-			return -1;
-		   }
-		}
-	}
-
-	/* Create a shm segment which will be used to inject/consume errors on.*/
-	if (create_shm()==-1) {
-		printf("Error to create shm...exit\n");
-		return -1;
-	}
-
-	for (i=0;i<num;i++) {
-		pid_t pid;
-
-		current_time=time(NULL);
-		log_info(parameters[i].cpu, "\nBegine at %s", ctime(&current_time));
-		log_info(parameters[i].cpu, "Configurations:\n");
-		log_info(parameters[i].cpu,"On cpu%ld: loop=%lx, interval=%lx(s)",
-			parameters[i].cpu,
-			parameters[i].loop,
-			parameters[i].interval);
-		log_info(parameters[i].cpu," err_type_info=%lx,err_struct_info=%lx\n",
-			parameters[i].err_type_info,
-			parameters[i].err_struct_info);
-
-		sprintf(path, PATH_FORMAT, (int)parameters[i].cpu);
-		err_type_info.err_type_info=parameters[i].err_type_info;
-		err_struct_info.err_struct_info=parameters[i].err_struct_info;
-		memcpy(err_data_buffer.err_data_buffer,
-			parameters[i].err_data_buffer,
-			ERR_DATA_BUFFER_SIZE*8);
-
-		pid=fork();
-		if (pid==0) {
-			unsigned long mask[MASK_SIZE];
-			int j, k;
-
-			void *va1, *va2;
-
-			/* Allocate two memory areas va1 and va2 in shm */
-			va1=shmaddr+parameters[i].cpu*PAGE_SIZE;
-			va2=shmaddr+parameters[i].cpu*PAGE_SIZE+PAGE_SIZE;
-
-			vbprintf("va1=%lx, va2=%lx\n", (u64)va1, (u64)va2);
-			memset(va1, 0x1, PAGE_SIZE);
-			memset(va2, 0x2, PAGE_SIZE);
-
-			if (empty_data_buffer(err_data_buffer.err_data_buffer))
-				/* If not specified yet, construct data buffer
-				 * with va1
-				 */
-				construct_data_buf(path, err_type_info,
-					err_struct_info, &err_data_buffer,va1);
-
-			for (j=0;j<MASK_SIZE;j++)
-				mask[j]=0;
-
-			cpu=parameters[i].cpu;
-			k = cpu%64;
-			j = cpu/64;
-			mask[j] = 1UL << k;
-
-			if (sched_setaffinity(0, MASK_SIZE*8, mask)==-1) {
-				perror("Error sched_setaffinity:");
-				return -1;
-			}
-
-			for (j=0; j<parameters[i].loop; j++) {
-				log_info(parameters[i].cpu,"Injection ");
-				log_info(parameters[i].cpu,"on cpu%ld: #%d/%ld ",
-
-					parameters[i].cpu,j+1, parameters[i].loop);
-
-				/* Hold the lock */
-				if (one_lock)
-					lock(0);
-				else
-				/* Hold lock on this cpu */
-					lock(parameters[i].cpu);
-
-				if ((status=err_inject(parameters[i].cpu,
-					   path, err_type_info,
-					   err_struct_info, err_data_buffer))
-					   ==0) {
-					/* consume the error for "inject only"*/
-					memcpy(va2, va1, PAGE_SIZE);
-					memcpy(va1, va2, PAGE_SIZE);
-					log_info(parameters[i].cpu,
-						"successful\n");
-				}
-				else {
-					log_info(parameters[i].cpu,"fail:");
-					log_info(parameters[i].cpu,
-						"status=%d\n", status);
-					unlock(parameters[i].cpu);
-					break;
-				}
-				if (one_lock)
-				/* Release the lock */
-					unlock(0);
-				/* Release lock on this cpu */
-				else
-					unlock(parameters[i].cpu);
-
-				if (j < parameters[i].loop-1)
-					sleep(parameters[i].interval);
-			}
-			current_time=time(NULL);
-			log_info(parameters[i].cpu, "Done at %s", ctime(&current_time));
-			return 0;
-		}
-		else if (pid<0) {
-			perror("Error fork:");
-			continue;
-		}
-		child_pid[i]=pid;
-	}
-	for (i=0;i<num;i++)
-		waitpid(child_pid[i], NULL, 0);
-
-	if (one_lock)
-		free_sem(0);
-	else
-		for (i=0;i<num;i++)
-			free_sem(parameters[i].cpu);
-
-	printf("All done.\n");
-
-	return 0;
-  }
-
-  void help()
-  {
-	printf("err_inject_tool:\n");
-	printf("\t-q: query all capabilities. default: off\n");
-	printf("\t-m: procedure mode. 1: physical 2: virtual. default: 1\n");
-	printf("\t-i: inject errors. default: off\n");
-	printf("\t-l: one lock per cpu. default: one lock for all\n");
-	printf("\t-e: error parameters:\n");
-	printf("\t\tcpu,loop,interval,err_type_info,err_struct_info[,err_data_buffer[0],err_data_buffer[1],err_data_buffer[2]]\n");
-	printf("\t\t   cpu: logical cpu number the error will be inject in.\n");
-	printf("\t\t   loop: times the error will be injected.\n");
-	printf("\t\t   interval: In second. every so often one error is injected.\n");
-	printf("\t\t   err_type_info, err_struct_info: PAL parameters.\n");
-	printf("\t\t   err_data_buffer: PAL parameter. Optional. If not present,\n");
-	printf("\t\t                    it's constructed by tool automatically. Be\n");
-	printf("\t\t                    careful to provide err_data_buffer and make\n");
-	printf("\t\t                    sure it's working with the environment.\n");
-	printf("\t    Note:no space between error parameters.\n");
-	printf("\t    default: Take error parameters from err.conf instead of command line.\n");
-	printf("\t-v: verbose. default: off\n");
-	printf("\t-h: help\n\n");
-	printf("The tool will take err.conf file as ");
-	printf("input to inject single or multiple errors ");
-	printf("on one or multiple cpus in parallel.\n");
-  }
-
-  int main(int argc, char **argv)
-  {
-	char c;
-	int do_err_inj=0;
-	int do_query_all=0;
-	int count;
-	u32 m;
-
-	/* Default one lock for all cpu's */
-	one_lock=1;
-	while ((c = getopt(argc, argv, "m:iqvhle:")) != EOF)
-		switch (c) {
-			case 'm':	/* Procedure mode. 1: phys 2: virt */
-				count=sscanf(optarg, "%x", &m);
-				if (count!=1 || (m!=1 && m!=2)) {
-					printf("Wrong mode number.\n");
-					help();
-					return -1;
-				}
-				mode=m;
-				break;
-			case 'i':	/* Inject errors */
-				do_err_inj=1;
-				break;
-			case 'q':	/* Query */
-				do_query_all=1;
-				break;
-			case 'v':	/* Verbose */
-				verbose=1;
-				break;
-			case 'l':	/* One lock per cpu */
-				one_lock=0;
-				break;
-			case 'e':	/* error arguments */
-				/* Take parameters:
-				 * #cpu, loop, interval, err_type_info, err_struct_info[, err_data_buffer]
-				 * err_data_buffer is optional. Recommend not to specify
-				 * err_data_buffer. Better to use tool to generate it.
-				 */
-				count=sscanf(optarg,
-					"%lx, %lx, %lx, %lx, %lx, %lx, %lx, %lx\n",
-					&line_para.cpu,
-					&line_para.loop,
-					&line_para.interval,
-					&line_para.err_type_info,
-					&line_para.err_struct_info,
-					&line_para.err_data_buffer[0],
-					&line_para.err_data_buffer[1],
-					&line_para.err_data_buffer[2]);
-				if (count!=PARA_FIELD_NUM+3) {
-				    line_para.err_data_buffer[0]=-1,
-				    line_para.err_data_buffer[1]=-1,
-				    line_para.err_data_buffer[2]=-1;
-				    count=sscanf(optarg, "%lx, %lx, %lx, %lx, %lx\n",
-						&line_para.cpu,
-						&line_para.loop,
-						&line_para.interval,
-						&line_para.err_type_info,
-						&line_para.err_struct_info);
-				    if (count!=PARA_FIELD_NUM) {
-					printf("Wrong error arguments.\n");
-					help();
-					return -1;
-				    }
-				}
-				para=1;
-				break;
-			continue;
-				break;
-			case 'h':
-				help();
-				return 0;
-			default:
-				break;
-		}
-
-	if (do_query_all)
-		query_all_capabilities();
-	if (do_err_inj)
-		err_inj();
-
-	if (!do_query_all &&  !do_err_inj)
-		help();
-
-	return 0;
-  }
diff --git a/Documentation/arch/ia64/features.rst b/Documentation/arch/ia64/features.rst
deleted file mode 100644
index d7226fdcf5f8..000000000000
--- a/Documentation/arch/ia64/features.rst
+++ /dev/null
@@ -1,3 +0,0 @@
-.. SPDX-License-Identifier: GPL-2.0
-
-.. kernel-feat:: $srctree/Documentation/features ia64
diff --git a/Documentation/arch/ia64/fsys.rst b/Documentation/arch/ia64/fsys.rst
deleted file mode 100644
index a702d2cc94b6..000000000000
--- a/Documentation/arch/ia64/fsys.rst
+++ /dev/null
@@ -1,303 +0,0 @@
-===================================
-Light-weight System Calls for IA-64
-===================================
-
-		        Started: 13-Jan-2003
-
-		    Last update: 27-Sep-2003
-
-	              David Mosberger-Tang
-		      <davidm@hpl.hp.com>
-
-Using the "epc" instruction effectively introduces a new mode of
-execution to the ia64 linux kernel.  We call this mode the
-"fsys-mode".  To recap, the normal states of execution are:
-
-  - kernel mode:
-	Both the register stack and the memory stack have been
-	switched over to kernel memory.  The user-level state is saved
-	in a pt-regs structure at the top of the kernel memory stack.
-
-  - user mode:
-	Both the register stack and the kernel stack are in
-	user memory.  The user-level state is contained in the
-	CPU registers.
-
-  - bank 0 interruption-handling mode:
-	This is the non-interruptible state which all
-	interruption-handlers start execution in.  The user-level
-	state remains in the CPU registers and some kernel state may
-	be stored in bank 0 of registers r16-r31.
-
-In contrast, fsys-mode has the following special properties:
-
-  - execution is at privilege level 0 (most-privileged)
-
-  - CPU registers may contain a mixture of user-level and kernel-level
-    state (it is the responsibility of the kernel to ensure that no
-    security-sensitive kernel-level state is leaked back to
-    user-level)
-
-  - execution is interruptible and preemptible (an fsys-mode handler
-    can disable interrupts and avoid all other interruption-sources
-    to avoid preemption)
-
-  - neither the memory-stack nor the register-stack can be trusted while
-    in fsys-mode (they point to the user-level stacks, which may
-    be invalid, or completely bogus addresses)
-
-In summary, fsys-mode is much more similar to running in user-mode
-than it is to running in kernel-mode.  Of course, given that the
-privilege level is at level 0, this means that fsys-mode requires some
-care (see below).
-
-
-How to tell fsys-mode
-=====================
-
-Linux operates in fsys-mode when (a) the privilege level is 0 (most
-privileged) and (b) the stacks have NOT been switched to kernel memory
-yet.  For convenience, the header file <asm-ia64/ptrace.h> provides
-three macros::
-
-	user_mode(regs)
-	user_stack(task,regs)
-	fsys_mode(task,regs)
-
-The "regs" argument is a pointer to a pt_regs structure.  The "task"
-argument is a pointer to the task structure to which the "regs"
-pointer belongs to.  user_mode() returns TRUE if the CPU state pointed
-to by "regs" was executing in user mode (privilege level 3).
-user_stack() returns TRUE if the state pointed to by "regs" was
-executing on the user-level stack(s).  Finally, fsys_mode() returns
-TRUE if the CPU state pointed to by "regs" was executing in fsys-mode.
-The fsys_mode() macro is equivalent to the expression::
-
-	!user_mode(regs) && user_stack(task,regs)
-
-How to write an fsyscall handler
-================================
-
-The file arch/ia64/kernel/fsys.S contains a table of fsyscall-handlers
-(fsyscall_table).  This table contains one entry for each system call.
-By default, a system call is handled by fsys_fallback_syscall().  This
-routine takes care of entering (full) kernel mode and calling the
-normal Linux system call handler.  For performance-critical system
-calls, it is possible to write a hand-tuned fsyscall_handler.  For
-example, fsys.S contains fsys_getpid(), which is a hand-tuned version
-of the getpid() system call.
-
-The entry and exit-state of an fsyscall handler is as follows:
-
-Machine state on entry to fsyscall handler
-------------------------------------------
-
-  ========= ===============================================================
-  r10	    0
-  r11	    saved ar.pfs (a user-level value)
-  r15	    system call number
-  r16	    "current" task pointer (in normal kernel-mode, this is in r13)
-  r32-r39   system call arguments
-  b6	    return address (a user-level value)
-  ar.pfs    previous frame-state (a user-level value)
-  PSR.be    cleared to zero (i.e., little-endian byte order is in effect)
-  -         all other registers may contain values passed in from user-mode
-  ========= ===============================================================
-
-Required machine state on exit to fsyscall handler
---------------------------------------------------
-
-  ========= ===========================================================
-  r11	    saved ar.pfs (as passed into the fsyscall handler)
-  r15	    system call number (as passed into the fsyscall handler)
-  r32-r39   system call arguments (as passed into the fsyscall handler)
-  b6	    return address (as passed into the fsyscall handler)
-  ar.pfs    previous frame-state (as passed into the fsyscall handler)
-  ========= ===========================================================
-
-Fsyscall handlers can execute with very little overhead, but with that
-speed comes a set of restrictions:
-
- * Fsyscall-handlers MUST check for any pending work in the flags
-   member of the thread-info structure and if any of the
-   TIF_ALLWORK_MASK flags are set, the handler needs to fall back on
-   doing a full system call (by calling fsys_fallback_syscall).
-
- * Fsyscall-handlers MUST preserve incoming arguments (r32-r39, r11,
-   r15, b6, and ar.pfs) because they will be needed in case of a
-   system call restart.  Of course, all "preserved" registers also
-   must be preserved, in accordance to the normal calling conventions.
-
- * Fsyscall-handlers MUST check argument registers for containing a
-   NaT value before using them in any way that could trigger a
-   NaT-consumption fault.  If a system call argument is found to
-   contain a NaT value, an fsyscall-handler may return immediately
-   with r8=EINVAL, r10=-1.
-
- * Fsyscall-handlers MUST NOT use the "alloc" instruction or perform
-   any other operation that would trigger mandatory RSE
-   (register-stack engine) traffic.
-
- * Fsyscall-handlers MUST NOT write to any stacked registers because
-   it is not safe to assume that user-level called a handler with the
-   proper number of arguments.
-
- * Fsyscall-handlers need to be careful when accessing per-CPU variables:
-   unless proper safe-guards are taken (e.g., interruptions are avoided),
-   execution may be pre-empted and resumed on another CPU at any given
-   time.
-
- * Fsyscall-handlers must be careful not to leak sensitive kernel'
-   information back to user-level.  In particular, before returning to
-   user-level, care needs to be taken to clear any scratch registers
-   that could contain sensitive information (note that the current
-   task pointer is not considered sensitive: it's already exposed
-   through ar.k6).
-
- * Fsyscall-handlers MUST NOT access user-memory without first
-   validating access-permission (this can be done typically via
-   probe.r.fault and/or probe.w.fault) and without guarding against
-   memory access exceptions (this can be done with the EX() macros
-   defined by asmmacro.h).
-
-The above restrictions may seem draconian, but remember that it's
-possible to trade off some of the restrictions by paying a slightly
-higher overhead.  For example, if an fsyscall-handler could benefit
-from the shadow register bank, it could temporarily disable PSR.i and
-PSR.ic, switch to bank 0 (bsw.0) and then use the shadow registers as
-needed.  In other words, following the above rules yields extremely
-fast system call execution (while fully preserving system call
-semantics), but there is also a lot of flexibility in handling more
-complicated cases.
-
-Signal handling
-===============
-
-The delivery of (asynchronous) signals must be delayed until fsys-mode
-is exited.  This is accomplished with the help of the lower-privilege
-transfer trap: arch/ia64/kernel/process.c:do_notify_resume_user()
-checks whether the interrupted task was in fsys-mode and, if so, sets
-PSR.lp and returns immediately.  When fsys-mode is exited via the
-"br.ret" instruction that lowers the privilege level, a trap will
-occur.  The trap handler clears PSR.lp again and returns immediately.
-The kernel exit path then checks for and delivers any pending signals.
-
-PSR Handling
-============
-
-The "epc" instruction doesn't change the contents of PSR at all.  This
-is in contrast to a regular interruption, which clears almost all
-bits.  Because of that, some care needs to be taken to ensure things
-work as expected.  The following discussion describes how each PSR bit
-is handled.
-
-======= =======================================================================
-PSR.be	Cleared when entering fsys-mode.  A srlz.d instruction is used
-	to ensure the CPU is in little-endian mode before the first
-	load/store instruction is executed.  PSR.be is normally NOT
-	restored upon return from an fsys-mode handler.  In other
-	words, user-level code must not rely on PSR.be being preserved
-	across a system call.
-PSR.up	Unchanged.
-PSR.ac	Unchanged.
-PSR.mfl Unchanged.  Note: fsys-mode handlers must not write-registers!
-PSR.mfh	Unchanged.  Note: fsys-mode handlers must not write-registers!
-PSR.ic	Unchanged.  Note: fsys-mode handlers can clear the bit, if needed.
-PSR.i	Unchanged.  Note: fsys-mode handlers can clear the bit, if needed.
-PSR.pk	Unchanged.
-PSR.dt	Unchanged.
-PSR.dfl	Unchanged.  Note: fsys-mode handlers must not write-registers!
-PSR.dfh	Unchanged.  Note: fsys-mode handlers must not write-registers!
-PSR.sp	Unchanged.
-PSR.pp	Unchanged.
-PSR.di	Unchanged.
-PSR.si	Unchanged.
-PSR.db	Unchanged.  The kernel prevents user-level from setting a hardware
-	breakpoint that triggers at any privilege level other than
-	3 (user-mode).
-PSR.lp	Unchanged.
-PSR.tb	Lazy redirect.  If a taken-branch trap occurs while in
-	fsys-mode, the trap-handler modifies the saved machine state
-	such that execution resumes in the gate page at
-	syscall_via_break(), with privilege level 3.  Note: the
-	taken branch would occur on the branch invoking the
-	fsyscall-handler, at which point, by definition, a syscall
-	restart is still safe.  If the system call number is invalid,
-	the fsys-mode handler will return directly to user-level.  This
-	return will trigger a taken-branch trap, but since the trap is
-	taken _after_ restoring the privilege level, the CPU has already
-	left fsys-mode, so no special treatment is needed.
-PSR.rt	Unchanged.
-PSR.cpl	Cleared to 0.
-PSR.is	Unchanged (guaranteed to be 0 on entry to the gate page).
-PSR.mc	Unchanged.
-PSR.it	Unchanged (guaranteed to be 1).
-PSR.id	Unchanged.  Note: the ia64 linux kernel never sets this bit.
-PSR.da	Unchanged.  Note: the ia64 linux kernel never sets this bit.
-PSR.dd	Unchanged.  Note: the ia64 linux kernel never sets this bit.
-PSR.ss	Lazy redirect.  If set, "epc" will cause a Single Step Trap to
-	be taken.  The trap handler then modifies the saved machine
-	state such that execution resumes in the gate page at
-	syscall_via_break(), with privilege level 3.
-PSR.ri	Unchanged.
-PSR.ed	Unchanged.  Note: This bit could only have an effect if an fsys-mode
-	handler performed a speculative load that gets NaTted.  If so, this
-	would be the normal & expected behavior, so no special treatment is
-	needed.
-PSR.bn	Unchanged.  Note: fsys-mode handlers may clear the bit, if needed.
-	Doing so requires clearing PSR.i and PSR.ic as well.
-PSR.ia	Unchanged.  Note: the ia64 linux kernel never sets this bit.
-======= =======================================================================
-
-Using fast system calls
-=======================
-
-To use fast system calls, userspace applications need simply call
-__kernel_syscall_via_epc().  For example
-
--- example fgettimeofday() call --
-
--- fgettimeofday.S --
-
-::
-
-  #include <asm/asmmacro.h>
-
-  GLOBAL_ENTRY(fgettimeofday)
-  .prologue
-  .save ar.pfs, r11
-  mov r11 = ar.pfs
-  .body
-
-  mov r2 = 0xa000000000020660;;  // gate address
-			       // found by inspection of System.map for the
-			       // __kernel_syscall_via_epc() function.  See
-			       // below for how to do this for real.
-
-  mov b7 = r2
-  mov r15 = 1087		       // gettimeofday syscall
-  ;;
-  br.call.sptk.many b6 = b7
-  ;;
-
-  .restore sp
-
-  mov ar.pfs = r11
-  br.ret.sptk.many rp;;	      // return to caller
-  END(fgettimeofday)
-
--- end fgettimeofday.S --
-
-In reality, getting the gate address is accomplished by two extra
-values passed via the ELF auxiliary vector (include/asm-ia64/elf.h)
-
- * AT_SYSINFO : is the address of __kernel_syscall_via_epc()
- * AT_SYSINFO_EHDR : is the address of the kernel gate ELF DSO
-
-The ELF DSO is a pre-linked library that is mapped in by the kernel at
-the gate page.  It is a proper ELF shared object so, with a dynamic
-loader that recognises the library, you should be able to make calls to
-the exported functions within it as with any other shared library.
-AT_SYSINFO points into the kernel DSO at the
-__kernel_syscall_via_epc() function for historical reasons (it was
-used before the kernel DSO) and as a convenience.
diff --git a/Documentation/arch/ia64/ia64.rst b/Documentation/arch/ia64/ia64.rst
deleted file mode 100644
index b725019a9492..000000000000
--- a/Documentation/arch/ia64/ia64.rst
+++ /dev/null
@@ -1,49 +0,0 @@
-===========================================
-Linux kernel release for the IA-64 Platform
-===========================================
-
-   These are the release notes for Linux since version 2.4 for IA-64
-   platform.  This document provides information specific to IA-64
-   ONLY, to get additional information about the Linux kernel also
-   read the original Linux README provided with the kernel.
-
-Installing the Kernel
-=====================
-
- - IA-64 kernel installation is the same as the other platforms, see
-   original README for details.
-
-
-Software Requirements
-=====================
-
-   Compiling and running this kernel requires an IA-64 compliant GCC
-   compiler.  And various software packages also compiled with an
-   IA-64 compliant GCC compiler.
-
-
-Configuring the Kernel
-======================
-
-   Configuration is the same, see original README for details.
-
-
-Compiling the Kernel:
-
- - Compiling this kernel doesn't differ from other platform so read
-   the original README for details BUT make sure you have an IA-64
-   compliant GCC compiler.
-
-IA-64 Specifics
-===============
-
- - General issues:
-
-    * Hardly any performance tuning has been done. Obvious targets
-      include the library routines (IP checksum, etc.). Less
-      obvious targets include making sure we don't flush the TLB
-      needlessly, etc.
-
-    * SMP locks cleanup/optimization
-
-    * IA32 support.  Currently experimental.  It mostly works.
diff --git a/Documentation/arch/ia64/index.rst b/Documentation/arch/ia64/index.rst
deleted file mode 100644
index 761f2154dfa2..000000000000
--- a/Documentation/arch/ia64/index.rst
+++ /dev/null
@@ -1,19 +0,0 @@
-.. SPDX-License-Identifier: GPL-2.0
-
-==================
-IA-64 Architecture
-==================
-
-.. toctree::
-   :maxdepth: 1
-
-   ia64
-   aliasing
-   efirtc
-   err_inject
-   fsys
-   irq-redir
-   mca
-   serial
-
-   features
diff --git a/Documentation/arch/ia64/irq-redir.rst b/Documentation/arch/ia64/irq-redir.rst
deleted file mode 100644
index 6bbbbe4f73ef..000000000000
--- a/Documentation/arch/ia64/irq-redir.rst
+++ /dev/null
@@ -1,80 +0,0 @@
-==============================
-IRQ affinity on IA64 platforms
-==============================
-
-07.01.2002, Erich Focht <efocht@ess.nec.de>
-
-
-By writing to /proc/irq/IRQ#/smp_affinity the interrupt routing can be
-controlled. The behavior on IA64 platforms is slightly different from
-that described in Documentation/core-api/irq/irq-affinity.rst for i386 systems.
-
-Because of the usage of SAPIC mode and physical destination mode the
-IRQ target is one particular CPU and cannot be a mask of several
-CPUs. Only the first non-zero bit is taken into account.
-
-
-Usage examples
-==============
-
-The target CPU has to be specified as a hexadecimal CPU mask. The
-first non-zero bit is the selected CPU. This format has been kept for
-compatibility reasons with i386.
-
-Set the delivery mode of interrupt 41 to fixed and route the
-interrupts to CPU #3 (logical CPU number) (2^3=0x08)::
-
-     echo "8" >/proc/irq/41/smp_affinity
-
-Set the default route for IRQ number 41 to CPU 6 in lowest priority
-delivery mode (redirectable)::
-
-     echo "r 40" >/proc/irq/41/smp_affinity
-
-The output of the command::
-
-     cat /proc/irq/IRQ#/smp_affinity
-
-gives the target CPU mask for the specified interrupt vector. If the CPU
-mask is preceded by the character "r", the interrupt is redirectable
-(i.e. lowest priority mode routing is used), otherwise its route is
-fixed.
-
-
-
-Initialization and default behavior
-===================================
-
-If the platform features IRQ redirection (info provided by SAL) all
-IO-SAPIC interrupts are initialized with CPU#0 as their default target
-and the routing is the so called "lowest priority mode" (actually
-fixed SAPIC mode with hint). The XTP chipset registers are used as hints
-for the IRQ routing. Currently in Linux XTP registers can have three
-values:
-
-	- minimal for an idle task,
-	- normal if any other task runs,
-	- maximal if the CPU is going to be switched off.
-
-The IRQ is routed to the CPU with lowest XTP register value, the
-search begins at the default CPU. Therefore most of the interrupts
-will be handled by CPU #0.
-
-If the platform doesn't feature interrupt redirection IOSAPIC fixed
-routing is used. The target CPUs are distributed in a round robin
-manner. IRQs will be routed only to the selected target CPUs. Check
-with::
-
-        cat /proc/interrupts
-
-
-
-Comments
-========
-
-On large (multi-node) systems it is recommended to route the IRQs to
-the node to which the corresponding device is connected.
-For systems like the NEC AzusA we get IRQ node-affinity for free. This
-is because usually the chipsets on each node redirect the interrupts
-only to their own CPUs (as they cannot see the XTP registers on the
-other nodes).
diff --git a/Documentation/arch/ia64/mca.rst b/Documentation/arch/ia64/mca.rst
deleted file mode 100644
index 08270bba44a4..000000000000
--- a/Documentation/arch/ia64/mca.rst
+++ /dev/null
@@ -1,198 +0,0 @@
-=============================================================
-An ad-hoc collection of notes on IA64 MCA and INIT processing
-=============================================================
-
-Feel free to update it with notes about any area that is not clear.
-
----
-
-MCA/INIT are completely asynchronous.  They can occur at any time, when
-the OS is in any state.  Including when one of the cpus is already
-holding a spinlock.  Trying to get any lock from MCA/INIT state is
-asking for deadlock.  Also the state of structures that are protected
-by locks is indeterminate, including linked lists.
-
----
-
-The complicated ia64 MCA process.  All of this is mandated by Intel's
-specification for ia64 SAL, error recovery and unwind, it is not as
-if we have a choice here.
-
-* MCA occurs on one cpu, usually due to a double bit memory error.
-  This is the monarch cpu.
-
-* SAL sends an MCA rendezvous interrupt (which is a normal interrupt)
-  to all the other cpus, the slaves.
-
-* Slave cpus that receive the MCA interrupt call down into SAL, they
-  end up spinning disabled while the MCA is being serviced.
-
-* If any slave cpu was already spinning disabled when the MCA occurred
-  then it cannot service the MCA interrupt.  SAL waits ~20 seconds then
-  sends an unmaskable INIT event to the slave cpus that have not
-  already rendezvoused.
-
-* Because MCA/INIT can be delivered at any time, including when the cpu
-  is down in PAL in physical mode, the registers at the time of the
-  event are _completely_ undefined.  In particular the MCA/INIT
-  handlers cannot rely on the thread pointer, PAL physical mode can
-  (and does) modify TP.  It is allowed to do that as long as it resets
-  TP on return.  However MCA/INIT events expose us to these PAL
-  internal TP changes.  Hence curr_task().
-
-* If an MCA/INIT event occurs while the kernel was running (not user
-  space) and the kernel has called PAL then the MCA/INIT handler cannot
-  assume that the kernel stack is in a fit state to be used.  Mainly
-  because PAL may or may not maintain the stack pointer internally.
-  Because the MCA/INIT handlers cannot trust the kernel stack, they
-  have to use their own, per-cpu stacks.  The MCA/INIT stacks are
-  preformatted with just enough task state to let the relevant handlers
-  do their job.
-
-* Unlike most other architectures, the ia64 struct task is embedded in
-  the kernel stack[1].  So switching to a new kernel stack means that
-  we switch to a new task as well.  Because various bits of the kernel
-  assume that current points into the struct task, switching to a new
-  stack also means a new value for current.
-
-* Once all slaves have rendezvoused and are spinning disabled, the
-  monarch is entered.  The monarch now tries to diagnose the problem
-  and decide if it can recover or not.
-
-* Part of the monarch's job is to look at the state of all the other
-  tasks.  The only way to do that on ia64 is to call the unwinder,
-  as mandated by Intel.
-
-* The starting point for the unwind depends on whether a task is
-  running or not.  That is, whether it is on a cpu or is blocked.  The
-  monarch has to determine whether or not a task is on a cpu before it
-  knows how to start unwinding it.  The tasks that received an MCA or
-  INIT event are no longer running, they have been converted to blocked
-  tasks.  But (and its a big but), the cpus that received the MCA
-  rendezvous interrupt are still running on their normal kernel stacks!
-
-* To distinguish between these two cases, the monarch must know which
-  tasks are on a cpu and which are not.  Hence each slave cpu that
-  switches to an MCA/INIT stack, registers its new stack using
-  set_curr_task(), so the monarch can tell that the _original_ task is
-  no longer running on that cpu.  That gives us a decent chance of
-  getting a valid backtrace of the _original_ task.
-
-* MCA/INIT can be nested, to a depth of 2 on any cpu.  In the case of a
-  nested error, we want diagnostics on the MCA/INIT handler that
-  failed, not on the task that was originally running.  Again this
-  requires set_curr_task() so the MCA/INIT handlers can register their
-  own stack as running on that cpu.  Then a recursive error gets a
-  trace of the failing handler's "task".
-
-[1]
-    My (Keith Owens) original design called for ia64 to separate its
-    struct task and the kernel stacks.  Then the MCA/INIT data would be
-    chained stacks like i386 interrupt stacks.  But that required
-    radical surgery on the rest of ia64, plus extra hard wired TLB
-    entries with its associated performance degradation.  David
-    Mosberger vetoed that approach.  Which meant that separate kernel
-    stacks meant separate "tasks" for the MCA/INIT handlers.
-
----
-
-INIT is less complicated than MCA.  Pressing the nmi button or using
-the equivalent command on the management console sends INIT to all
-cpus.  SAL picks one of the cpus as the monarch and the rest are
-slaves.  All the OS INIT handlers are entered at approximately the same
-time.  The OS monarch prints the state of all tasks and returns, after
-which the slaves return and the system resumes.
-
-At least that is what is supposed to happen.  Alas there are broken
-versions of SAL out there.  Some drive all the cpus as monarchs.  Some
-drive them all as slaves.  Some drive one cpu as monarch, wait for that
-cpu to return from the OS then drive the rest as slaves.  Some versions
-of SAL cannot even cope with returning from the OS, they spin inside
-SAL on resume.  The OS INIT code has workarounds for some of these
-broken SAL symptoms, but some simply cannot be fixed from the OS side.
-
----
-
-The scheduler hooks used by ia64 (curr_task, set_curr_task) are layer
-violations.  Unfortunately MCA/INIT start off as massive layer
-violations (can occur at _any_ time) and they build from there.
-
-At least ia64 makes an attempt at recovering from hardware errors, but
-it is a difficult problem because of the asynchronous nature of these
-errors.  When processing an unmaskable interrupt we sometimes need
-special code to cope with our inability to take any locks.
-
----
-
-How is ia64 MCA/INIT different from x86 NMI?
-
-* x86 NMI typically gets delivered to one cpu.  MCA/INIT gets sent to
-  all cpus.
-
-* x86 NMI cannot be nested.  MCA/INIT can be nested, to a depth of 2
-  per cpu.
-
-* x86 has a separate struct task which points to one of multiple kernel
-  stacks.  ia64 has the struct task embedded in the single kernel
-  stack, so switching stack means switching task.
-
-* x86 does not call the BIOS so the NMI handler does not have to worry
-  about any registers having changed.  MCA/INIT can occur while the cpu
-  is in PAL in physical mode, with undefined registers and an undefined
-  kernel stack.
-
-* i386 backtrace is not very sensitive to whether a process is running
-  or not.  ia64 unwind is very, very sensitive to whether a process is
-  running or not.
-
----
-
-What happens when MCA/INIT is delivered what a cpu is running user
-space code?
-
-The user mode registers are stored in the RSE area of the MCA/INIT on
-entry to the OS and are restored from there on return to SAL, so user
-mode registers are preserved across a recoverable MCA/INIT.  Since the
-OS has no idea what unwind data is available for the user space stack,
-MCA/INIT never tries to backtrace user space.  Which means that the OS
-does not bother making the user space process look like a blocked task,
-i.e. the OS does not copy pt_regs and switch_stack to the user space
-stack.  Also the OS has no idea how big the user space RSE and memory
-stacks are, which makes it too risky to copy the saved state to a user
-mode stack.
-
----
-
-How do we get a backtrace on the tasks that were running when MCA/INIT
-was delivered?
-
-mca.c:::ia64_mca_modify_original_stack().  That identifies and
-verifies the original kernel stack, copies the dirty registers from
-the MCA/INIT stack's RSE to the original stack's RSE, copies the
-skeleton struct pt_regs and switch_stack to the original stack, fills
-in the skeleton structures from the PAL minstate area and updates the
-original stack's thread.ksp.  That makes the original stack look
-exactly like any other blocked task, i.e. it now appears to be
-sleeping.  To get a backtrace, just start with thread.ksp for the
-original task and unwind like any other sleeping task.
-
----
-
-How do we identify the tasks that were running when MCA/INIT was
-delivered?
-
-If the previous task has been verified and converted to a blocked
-state, then sos->prev_task on the MCA/INIT stack is updated to point to
-the previous task.  You can look at that field in dumps or debuggers.
-To help distinguish between the handler and the original tasks,
-handlers have _TIF_MCA_INIT set in thread_info.flags.
-
-The sos data is always in the MCA/INIT handler stack, at offset
-MCA_SOS_OFFSET.  You can get that value from mca_asm.h or calculate it
-as KERNEL_STACK_SIZE - sizeof(struct pt_regs) - sizeof(struct
-ia64_sal_os_state), with 16 byte alignment for all structures.
-
-Also the comm field of the MCA/INIT task is modified to include the pid
-of the original task, for humans to use.  For example, a comm field of
-'MCA 12159' means that pid 12159 was running when the MCA was
-delivered.
diff --git a/Documentation/arch/ia64/serial.rst b/Documentation/arch/ia64/serial.rst
deleted file mode 100644
index 1de70c305a79..000000000000
--- a/Documentation/arch/ia64/serial.rst
+++ /dev/null
@@ -1,165 +0,0 @@
-==============
-Serial Devices
-==============
-
-Serial Device Naming
-====================
-
-    As of 2.6.10, serial devices on ia64 are named based on the
-    order of ACPI and PCI enumeration.  The first device in the
-    ACPI namespace (if any) becomes /dev/ttyS0, the second becomes
-    /dev/ttyS1, etc., and PCI devices are named sequentially
-    starting after the ACPI devices.
-
-    Prior to 2.6.10, there were confusing exceptions to this:
-
-	- Firmware on some machines (mostly from HP) provides an HCDP
-	  table[1] that tells the kernel about devices that can be used
-	  as a serial console.  If the user specified "console=ttyS0"
-	  or the EFI ConOut path contained only UART devices, the
-	  kernel registered the device described by the HCDP as
-	  /dev/ttyS0.
-
-	- If there was no HCDP, we assumed there were UARTs at the
-	  legacy COM port addresses (I/O ports 0x3f8 and 0x2f8), so
-	  the kernel registered those as /dev/ttyS0 and /dev/ttyS1.
-
-    Any additional ACPI or PCI devices were registered sequentially
-    after /dev/ttyS0 as they were discovered.
-
-    With an HCDP, device names changed depending on EFI configuration
-    and "console=" arguments.  Without an HCDP, device names didn't
-    change, but we registered devices that might not really exist.
-
-    For example, an HP rx1600 with a single built-in serial port
-    (described in the ACPI namespace) plus an MP[2] (a PCI device) has
-    these ports:
-
-      ==========  ==========     ============    ============   =======
-      Type        MMIO           pre-2.6.10      pre-2.6.10     2.6.10+
-		  address
-				 (EFI console    (EFI console
-                                 on builtin)     on MP port)
-      ==========  ==========     ============    ============   =======
-      builtin     0xff5e0000        ttyS0           ttyS1         ttyS0
-      MP UPS      0xf8031000        ttyS1           ttyS2         ttyS1
-      MP Console  0xf8030000        ttyS2           ttyS0         ttyS2
-      MP 2        0xf8030010        ttyS3           ttyS3         ttyS3
-      MP 3        0xf8030038        ttyS4           ttyS4         ttyS4
-      ==========  ==========     ============    ============   =======
-
-Console Selection
-=================
-
-    EFI knows what your console devices are, but it doesn't tell the
-    kernel quite enough to actually locate them.  The DIG64 HCDP
-    table[1] does tell the kernel where potential serial console
-    devices are, but not all firmware supplies it.  Also, EFI supports
-    multiple simultaneous consoles and doesn't tell the kernel which
-    should be the "primary" one.
-
-    So how do you tell Linux which console device to use?
-
-	- If your firmware supplies the HCDP, it is simplest to
-	  configure EFI with a single device (either a UART or a VGA
-	  card) as the console.  Then you don't need to tell Linux
-	  anything; the kernel will automatically use the EFI console.
-
-	  (This works only in 2.6.6 or later; prior to that you had
-	  to specify "console=ttyS0" to get a serial console.)
-
-	- Without an HCDP, Linux defaults to a VGA console unless you
-	  specify a "console=" argument.
-
-    NOTE: Don't assume that a serial console device will be /dev/ttyS0.
-    It might be ttyS1, ttyS2, etc.  Make sure you have the appropriate
-    entries in /etc/inittab (for getty) and /etc/securetty (to allow
-    root login).
-
-Early Serial Console
-====================
-
-    The kernel can't start using a serial console until it knows where
-    the device lives.  Normally this happens when the driver enumerates
-    all the serial devices, which can happen a minute or more after the
-    kernel starts booting.
-
-    2.6.10 and later kernels have an "early uart" driver that works
-    very early in the boot process.  The kernel will automatically use
-    this if the user supplies an argument like "console=uart,io,0x3f8",
-    or if the EFI console path contains only a UART device and the
-    firmware supplies an HCDP.
-
-Troubleshooting Serial Console Problems
-=======================================
-
-    No kernel output after elilo prints "Uncompressing Linux... done":
-
-	- You specified "console=ttyS0" but Linux changed the device
-	  to which ttyS0 refers.  Configure exactly one EFI console
-	  device[3] and remove the "console=" option.
-
-	- The EFI console path contains both a VGA device and a UART.
-	  EFI and elilo use both, but Linux defaults to VGA.  Remove
-	  the VGA device from the EFI console path[3].
-
-	- Multiple UARTs selected as EFI console devices.  EFI and
-	  elilo use all selected devices, but Linux uses only one.
-	  Make sure only one UART is selected in the EFI console
-	  path[3].
-
-	- You're connected to an HP MP port[2] but have a non-MP UART
-	  selected as EFI console device.  EFI uses the MP as a
-	  console device even when it isn't explicitly selected.
-	  Either move the console cable to the non-MP UART, or change
-	  the EFI console path[3] to the MP UART.
-
-    Long pause (60+ seconds) between "Uncompressing Linux... done" and
-    start of kernel output:
-
-	- No early console because you used "console=ttyS<n>".  Remove
-	  the "console=" option if your firmware supplies an HCDP.
-
-	- If you don't have an HCDP, the kernel doesn't know where
-	  your console lives until the driver discovers serial
-	  devices.  Use "console=uart,io,0x3f8" (or appropriate
-	  address for your machine).
-
-    Kernel and init script output works fine, but no "login:" prompt:
-
-	- Add getty entry to /etc/inittab for console tty.  Look for
-	  the "Adding console on ttyS<n>" message that tells you which
-	  device is the console.
-
-    "login:" prompt, but can't login as root:
-
-	- Add entry to /etc/securetty for console tty.
-
-    No ACPI serial devices found in 2.6.17 or later:
-
-	- Turn on CONFIG_PNP and CONFIG_PNPACPI.  Prior to 2.6.17, ACPI
-	  serial devices were discovered by 8250_acpi.  In 2.6.17,
-	  8250_acpi was replaced by the combination of 8250_pnp and
-	  CONFIG_PNPACPI.
-
-
-
-[1]
-    http://www.dig64.org/specifications/agreement
-    The table was originally defined as the "HCDP" for "Headless
-    Console/Debug Port."  The current version is the "PCDP" for
-    "Primary Console and Debug Port Devices."
-
-[2]
-    The HP MP (management processor) is a PCI device that provides
-    several UARTs.  One of the UARTs is often used as a console; the
-    EFI Boot Manager identifies it as "Acpi(HWP0002,700)/Pci(...)/Uart".
-    The external connection is usually a 25-pin connector, and a
-    special dongle converts that to three 9-pin connectors, one of
-    which is labelled "Console."
-
-[3]
-    EFI console devices are configured using the EFI Boot Manager
-    "Boot option maintenance" menu.  You may have to interrupt the
-    boot sequence to use this menu, and you will have to reset the
-    box after changing console configuration.
diff --git a/Documentation/core-api/cpu_hotplug.rst b/Documentation/core-api/cpu_hotplug.rst
index 9511e405aabd..dcb0e379e5e8 100644
--- a/Documentation/core-api/cpu_hotplug.rst
+++ b/Documentation/core-api/cpu_hotplug.rst
@@ -40,12 +40,6 @@ Command Line Switches
   supplied here is lower than the number of physically available CPUs, then
   those CPUs can not be brought online later.
 
-``additional_cpus=n``
-  Use this to limit hotpluggable CPUs. This option sets
-  ``cpu_possible_mask = cpu_present_mask + additional_cpus``
-
-  This option is limited to the IA64 architecture.
-
 ``possible_cpus=n``
   This option sets ``possible_cpus`` bits in ``cpu_possible_mask``.
 
diff --git a/MAINTAINERS b/MAINTAINERS
index 90f13281d297..aac200bb0aca 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -9935,12 +9935,6 @@ F:	Documentation/driver-api/i3c
 F:	drivers/i3c/
 F:	include/linux/i3c/
 
-IA64 (Itanium) PLATFORM
-L:	linux-ia64@vger.kernel.org
-S:	Orphan
-F:	Documentation/arch/ia64/
-F:	arch/ia64/
-
 IBM Operation Panel Input Driver
 M:	Eddie James <eajames@linux.ibm.com>
 L:	linux-input@vger.kernel.org
@@ -16269,11 +16263,6 @@ L:	linux-i2c@vger.kernel.org
 S:	Maintained
 F:	drivers/i2c/muxes/i2c-mux-pca9541.c
 
-PCDP - PRIMARY CONSOLE AND DEBUG PORT
-M:	Khalid Aziz <khalid@gonehiking.org>
-S:	Maintained
-F:	drivers/firmware/pcdp.*
-
 PCI DRIVER FOR AARDVARK (Marvell Armada 3700)
 M:	Thomas Petazzoni <thomas.petazzoni@bootlin.com>
 M:	Pali Rohár <pali@kernel.org>
diff --git a/arch/Kconfig b/arch/Kconfig
index 12d51495caec..f4b210ab0612 100644
--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -1088,7 +1088,6 @@ config HAVE_ARCH_COMPAT_MMAP_BASES
 config PAGE_SIZE_LESS_THAN_64KB
 	def_bool y
 	depends on !ARM64_64K_PAGES
-	depends on !IA64_PAGE_SIZE_64KB
 	depends on !PAGE_SIZE_64KB
 	depends on !PARISC_PAGE_SIZE_64KB
 	depends on PAGE_SIZE_LESS_THAN_256KB
diff --git a/arch/ia64/Kbuild b/arch/ia64/Kbuild
deleted file mode 100644
index e77cc76d228c..000000000000
--- a/arch/ia64/Kbuild
+++ /dev/null
@@ -1,3 +0,0 @@
-# SPDX-License-Identifier: GPL-2.0-only
-obj-y				+= kernel/ mm/
-obj-$(CONFIG_IA64_SGI_UV)	+= uv/
diff --git a/arch/ia64/Kconfig b/arch/ia64/Kconfig
deleted file mode 100644
index 53faa122b0f4..000000000000
--- a/arch/ia64/Kconfig
+++ /dev/null
@@ -1,394 +0,0 @@
-# SPDX-License-Identifier: GPL-2.0
-config PGTABLE_LEVELS
-	int "Page Table Levels" if !IA64_PAGE_SIZE_64KB
-	range 3 4 if !IA64_PAGE_SIZE_64KB
-	default 3
-
-menu "Processor type and features"
-
-config IA64
-	bool
-	select ARCH_BINFMT_ELF_EXTRA_PHDRS
-	select ARCH_HAS_CPU_FINALIZE_INIT
-	select ARCH_HAS_DMA_MARK_CLEAN
-	select ARCH_HAS_STRNCPY_FROM_USER
-	select ARCH_HAS_STRNLEN_USER
-	select ARCH_MIGHT_HAVE_PC_PARPORT
-	select ARCH_MIGHT_HAVE_PC_SERIO
-	select ACPI
-	select ACPI_NUMA if NUMA
-	select ARCH_ENABLE_MEMORY_HOTPLUG
-	select ARCH_ENABLE_MEMORY_HOTREMOVE
-	select ARCH_SUPPORTS_ACPI
-	select ACPI_SYSTEM_POWER_STATES_SUPPORT if ACPI
-	select ARCH_MIGHT_HAVE_ACPI_PDC if ACPI
-	select FORCE_PCI
-	select PCI_DOMAINS if PCI
-	select PCI_MSI
-	select PCI_SYSCALL if PCI
-	select HAS_IOPORT
-	select HAVE_ASM_MODVERSIONS
-	select HAVE_UNSTABLE_SCHED_CLOCK
-	select HAVE_EXIT_THREAD
-	select HAVE_KPROBES
-	select HAVE_KRETPROBES
-	select HAVE_FTRACE_MCOUNT_RECORD
-	select HAVE_DYNAMIC_FTRACE if (!ITANIUM)
-	select HAVE_FUNCTION_TRACER
-	select HAVE_SETUP_PER_CPU_AREA
-	select TTY
-	select HAVE_ARCH_TRACEHOOK
-	select HAVE_FUNCTION_DESCRIPTORS
-	select HAVE_VIRT_CPU_ACCOUNTING
-	select HUGETLB_PAGE_SIZE_VARIABLE if HUGETLB_PAGE
-	select GENERIC_IRQ_PROBE
-	select GENERIC_PENDING_IRQ if SMP
-	select GENERIC_IRQ_SHOW
-	select GENERIC_IRQ_LEGACY
-	select ARCH_HAVE_NMI_SAFE_CMPXCHG
-	select GENERIC_IOMAP
-	select GENERIC_IOREMAP
-	select GENERIC_SMP_IDLE_THREAD
-	select ARCH_TASK_STRUCT_ON_STACK
-	select ARCH_TASK_STRUCT_ALLOCATOR
-	select ARCH_THREAD_STACK_ALLOCATOR
-	select ARCH_CLOCKSOURCE_DATA
-	select GENERIC_TIME_VSYSCALL
-	select LEGACY_TIMER_TICK
-	select SWIOTLB
-	select SYSCTL_ARCH_UNALIGN_NO_WARN
-	select HAVE_MOD_ARCH_SPECIFIC
-	select MODULES_USE_ELF_RELA
-	select ARCH_USE_CMPXCHG_LOCKREF
-	select HAVE_ARCH_AUDITSYSCALL
-	select NEED_DMA_MAP_STATE
-	select NEED_SG_DMA_LENGTH
-	select NUMA if !FLATMEM
-	select PCI_MSI_ARCH_FALLBACKS if PCI_MSI
-	select ZONE_DMA32
-	select FUNCTION_ALIGNMENT_32B
-	default y
-	help
-	  The Itanium Processor Family is Intel's 64-bit successor to
-	  the 32-bit X86 line.  The IA-64 Linux project has a home
-	  page at <http://www.linuxia64.org/> and a mailing list at
-	  <linux-ia64@vger.kernel.org>.
-
-config 64BIT
-	bool
-	select ATA_NONSTANDARD if ATA
-	default y
-
-config MMU
-	bool
-	default y
-
-config STACKTRACE_SUPPORT
-	def_bool y
-
-config GENERIC_LOCKBREAK
-	def_bool n
-
-config GENERIC_CALIBRATE_DELAY
-	bool
-	default y
-
-config DMI
-	bool
-	default y
-	select DMI_SCAN_MACHINE_NON_EFI_FALLBACK
-
-config EFI
-	bool
-	select UCS2_STRING
-	default y
-
-config SCHED_OMIT_FRAME_POINTER
-	bool
-	default y
-
-config IA64_UNCACHED_ALLOCATOR
-	bool
-	select GENERIC_ALLOCATOR
-
-config ARCH_USES_PG_UNCACHED
-	def_bool y
-	depends on IA64_UNCACHED_ALLOCATOR
-
-config AUDIT_ARCH
-	bool
-	default y
-
-choice
-	prompt "Processor type"
-	default ITANIUM
-
-config ITANIUM
-	bool "Itanium"
-	help
-	  Select your IA-64 processor type.  The default is Itanium.
-	  This choice is safe for all IA-64 systems, but may not perform
-	  optimally on systems with, say, Itanium 2 or newer processors.
-
-config MCKINLEY
-	bool "Itanium 2"
-	help
-	  Select this to configure for an Itanium 2 (McKinley) processor.
-
-endchoice
-
-choice
-	prompt "Kernel page size"
-	default IA64_PAGE_SIZE_16KB
-
-config IA64_PAGE_SIZE_4KB
-	bool "4KB"
-	help
-	  This lets you select the page size of the kernel.  For best IA-64
-	  performance, a page size of 8KB or 16KB is recommended.  For best
-	  IA-32 compatibility, a page size of 4KB should be selected (the vast
-	  majority of IA-32 binaries work perfectly fine with a larger page
-	  size).  For Itanium 2 or newer systems, a page size of 64KB can also
-	  be selected.
-
-	  4KB                For best IA-32 compatibility
-	  8KB                For best IA-64 performance
-	  16KB               For best IA-64 performance
-	  64KB               Requires Itanium 2 or newer processor.
-
-	  If you don't know what to do, choose 16KB.
-
-config IA64_PAGE_SIZE_8KB
-	bool "8KB"
-
-config IA64_PAGE_SIZE_16KB
-	bool "16KB"
-
-config IA64_PAGE_SIZE_64KB
-	depends on !ITANIUM
-	bool "64KB"
-
-endchoice
-
-source "kernel/Kconfig.hz"
-
-config IA64_BRL_EMU
-	bool
-	depends on ITANIUM
-	default y
-
-# align cache-sensitive data to 128 bytes
-config IA64_L1_CACHE_SHIFT
-	int
-	default "7" if MCKINLEY
-	default "6" if ITANIUM
-
-config IA64_SGI_UV
-	bool "SGI-UV support"
-	help
-	  Selecting this option will add specific support for running on SGI
-	  UV based systems.  If you have an SGI UV system or are building a
-	  distro kernel, select this option.
-
-config IA64_HP_SBA_IOMMU
-	bool "HP SBA IOMMU support"
-	select DMA_OPS
-	default y
-	help
-	  Say Y here to add support for the SBA IOMMU found on HP zx1 and
-	  sx1000 systems.  If you're unsure, answer Y.
-
-config IA64_CYCLONE
-	bool "Cyclone (EXA) Time Source support"
-	help
-	  Say Y here to enable support for IBM EXA Cyclone time source.
-	  If you're unsure, answer N.
-
-config ARCH_FORCE_MAX_ORDER
-	int
-	default "16" if HUGETLB_PAGE
-	default "10"
-
-config SMP
-	bool "Symmetric multi-processing support"
-	help
-	  This enables support for systems with more than one CPU. If you have
-	  a system with only one CPU, say N.  If you have a system with more
-	  than one CPU, say Y.
-
-	  If you say N here, the kernel will run on single and multiprocessor
-	  systems, but will use only one CPU of a multiprocessor system.  If
-	  you say Y here, the kernel will run on many, but not all,
-	  single processor systems.  On a single processor system, the kernel
-	  will run faster if you say N here.
-
-	  See also the SMP-HOWTO available at
-	  <http://www.tldp.org/docs.html#howto>.
-
-	  If you don't know what to do here, say N.
-
-config NR_CPUS
-	int "Maximum number of CPUs (2-4096)"
-	range 2 4096
-	depends on SMP
-	default "4096"
-	help
-	  You should set this to the number of CPUs in your system, but
-	  keep in mind that a kernel compiled for, e.g., 2 CPUs will boot but
-	  only use 2 CPUs on a >2 CPU system.  Setting this to a value larger
-	  than 64 will cause the use of a CPU mask array, causing a small
-	  performance hit.
-
-config HOTPLUG_CPU
-	bool "Support for hot-pluggable CPUs"
-	depends on SMP
-	default n
-	help
-	  Say Y here to experiment with turning CPUs off and on.  CPUs
-	  can be controlled through /sys/devices/system/cpu/cpu#.
-	  Say N if you want to disable CPU hotplug.
-
-config SCHED_SMT
-	bool "SMT scheduler support"
-	depends on SMP
-	help
-	  Improves the CPU scheduler's decision making when dealing with
-	  Intel IA64 chips with MultiThreading at a cost of slightly increased
-	  overhead in some places. If unsure say N here.
-
-config PERMIT_BSP_REMOVE
-	bool "Support removal of Bootstrap Processor"
-	depends on HOTPLUG_CPU
-	default n
-	help
-	Say Y here if your platform SAL will support removal of BSP with HOTPLUG_CPU
-	support. 
-
-config FORCE_CPEI_RETARGET
-	bool "Force assumption that CPEI can be re-targeted"
-	depends on PERMIT_BSP_REMOVE
-	default n
-	help
-	Say Y if you need to force the assumption that CPEI can be re-targeted to
-	any cpu in the system. This hint is available via ACPI 3.0 specifications.
-	Tiger4 systems are capable of re-directing CPEI to any CPU other than BSP.
-	This option it useful to enable this feature on older BIOS's as well.
-	You can also enable this by using boot command line option force_cpei=1.
-
-config ARCH_SELECT_MEMORY_MODEL
-	def_bool y
-
-config ARCH_FLATMEM_ENABLE
-	def_bool y
-
-config ARCH_SPARSEMEM_ENABLE
-	def_bool y
-	select SPARSEMEM_VMEMMAP_ENABLE
-
-config ARCH_SPARSEMEM_DEFAULT
-	def_bool y
-	depends on ARCH_SPARSEMEM_ENABLE
-
-config NUMA
-	bool "NUMA support"
-	depends on !FLATMEM
-	select SMP
-	select USE_PERCPU_NUMA_NODE_ID
-	help
-	  Say Y to compile the kernel to support NUMA (Non-Uniform Memory
-	  Access).  This option is for configuring high-end multiprocessor
-	  server systems.  If in doubt, say N.
-
-config NODES_SHIFT
-	int "Max num nodes shift(3-10)"
-	range 3 10
-	default "10"
-	depends on NUMA
-	help
-	  This option specifies the maximum number of nodes in your SSI system.
-	  MAX_NUMNODES will be 2^(This value).
-	  If in doubt, use the default.
-
-config HAVE_ARCH_NODEDATA_EXTENSION
-	def_bool y
-	depends on NUMA
-
-config HAVE_MEMORYLESS_NODES
-	def_bool NUMA
-
-config ARCH_PROC_KCORE_TEXT
-	def_bool y
-	depends on PROC_KCORE
-
-config IA64_MCA_RECOVERY
-	bool "MCA recovery from errors other than TLB."
-
-config IA64_PALINFO
-	tristate "/proc/pal support"
-	help
-	  If you say Y here, you are able to get PAL (Processor Abstraction
-	  Layer) information in /proc/pal.  This contains useful information
-	  about the processors in your systems, such as cache and TLB sizes
-	  and the PAL firmware version in use.
-
-	  To use this option, you have to ensure that the "/proc file system
-	  support" (CONFIG_PROC_FS) is enabled, too.
-
-config IA64_MC_ERR_INJECT
-	tristate "MC error injection support"
-	help
-	  Adds support for MC error injection. If enabled, the kernel 
-	  will provide a sysfs interface for user applications to
-	  call MC error injection PAL procedures to inject various errors.
-	  This is a useful tool for MCA testing.
-
-	  If you're unsure, do not select this option.
-
-config IA64_ESI
-	bool "ESI (Extensible SAL Interface) support"
-	help
-	  If you say Y here, support is built into the kernel to
-	  make ESI calls.  ESI calls are used to support vendor-specific
-	  firmware extensions, such as the ability to inject memory-errors
-	  for test-purposes.  If you're unsure, say N.
-
-config IA64_HP_AML_NFW
-	bool "Support ACPI AML calls to native firmware"
-	help
-	  This driver installs a global ACPI Operation Region handler for
-	  region 0xA1.  AML methods can use this OpRegion to call arbitrary
-	  native firmware functions.  The driver installs the OpRegion
-	  handler if there is an HPQ5001 device or if the user supplies
-	  the "force" module parameter, e.g., with the "aml_nfw.force"
-	  kernel command line option.
-
-endmenu
-
-config ARCH_SUPPORTS_KEXEC
-	def_bool !SMP || HOTPLUG_CPU
-
-config ARCH_SUPPORTS_CRASH_DUMP
-	def_bool IA64_MCA_RECOVERY && (!SMP || HOTPLUG_CPU)
-
-menu "Power management and ACPI options"
-
-source "kernel/power/Kconfig"
-
-source "drivers/acpi/Kconfig"
-
-if PM
-menu "CPU Frequency scaling"
-source "drivers/cpufreq/Kconfig"
-endmenu
-endif
-
-endmenu
-
-config MSPEC
-	tristate "Memory special operations driver"
-	depends on IA64
-	select IA64_UNCACHED_ALLOCATOR
-	help
-	  If you have an ia64 and you want to enable memory special
-	  operations support (formerly known as fetchop), say Y here,
-	  otherwise say N.
diff --git a/arch/ia64/Kconfig.debug b/arch/ia64/Kconfig.debug
deleted file mode 100644
index 2ce008e2d164..000000000000
--- a/arch/ia64/Kconfig.debug
+++ /dev/null
@@ -1,55 +0,0 @@
-# SPDX-License-Identifier: GPL-2.0
-
-choice
-	prompt "Physical memory granularity"
-	default IA64_GRANULE_64MB
-
-config IA64_GRANULE_16MB
-	bool "16MB"
-	help
-	  IA-64 identity-mapped regions use a large page size called "granules".
-
-	  Select "16MB" for a small granule size.
-	  Select "64MB" for a large granule size.  This is the current default.
-
-config IA64_GRANULE_64MB
-	bool "64MB"
-	depends on BROKEN
-
-endchoice
-
-config IA64_PRINT_HAZARDS
-	bool "Print possible IA-64 dependency violations to console"
-	depends on DEBUG_KERNEL
-	help
-	  Selecting this option prints more information for Illegal Dependency
-	  Faults, that is, for Read-after-Write (RAW), Write-after-Write (WAW),
-	  or Write-after-Read (WAR) violations.  This option is ignored if you
-	  are compiling for an Itanium A step processor
-	  (CONFIG_ITANIUM_ASTEP_SPECIFIC).  If you're unsure, select Y.
-
-config DISABLE_VHPT
-	bool "Disable VHPT"
-	depends on DEBUG_KERNEL
-	help
-	  The Virtual Hash Page Table (VHPT) enhances virtual address
-	  translation performance.  Normally you want the VHPT active but you
-	  can select this option to disable the VHPT for debugging.  If you're
-	  unsure, answer N.
-
-config IA64_DEBUG_CMPXCHG
-	bool "Turn on compare-and-exchange bug checking (slow!)"
-	depends on DEBUG_KERNEL && PRINTK
-	help
-	  Selecting this option turns on bug checking for the IA-64
-	  compare-and-exchange instructions.  This is slow!  Itaniums
-	  from step B3 or later don't have this problem. If you're unsure,
-	  select N.
-
-config IA64_DEBUG_IRQ
-	bool "Turn on irq debug checks (slow!)"
-	depends on DEBUG_KERNEL
-	help
-	  Selecting this option turns on bug checking for the IA-64 irq_save
-	  and restore instructions.  It's useful for tracking down spinlock
-	  problems, but slow!  If you're unsure, select N.
diff --git a/arch/ia64/Makefile b/arch/ia64/Makefile
deleted file mode 100644
index d553ab7022fe..000000000000
--- a/arch/ia64/Makefile
+++ /dev/null
@@ -1,82 +0,0 @@
-#
-# ia64/Makefile
-#
-# This file is included by the global makefile so that you can add your own
-# architecture-specific flags and dependencies.
-#
-# This file is subject to the terms and conditions of the GNU General Public
-# License.  See the file "COPYING" in the main directory of this archive
-# for more details.
-#
-# Copyright (C) 1998-2004 by David Mosberger-Tang <davidm@hpl.hp.com>
-#
-
-KBUILD_DEFCONFIG := generic_defconfig
-
-NM := $(CROSS_COMPILE)nm -B
-
-CHECKFLAGS	+= -D__ia64=1 -D__ia64__=1 -D_LP64 -D__LP64__
-
-OBJCOPYFLAGS	:= --strip-all
-LDFLAGS_vmlinux	:= -static
-KBUILD_AFLAGS_KERNEL := -mconstant-gp
-EXTRA		:=
-
-cflags-y	:= -pipe $(EXTRA) -ffixed-r13 -mfixed-range=f12-f15,f32-f127 \
-		   -frename-registers -fno-optimize-sibling-calls
-KBUILD_CFLAGS_KERNEL := -mconstant-gp
-
-GAS_STATUS	= $(shell $(srctree)/arch/ia64/scripts/check-gas "$(CC)" "$(OBJDUMP)")
-KBUILD_CPPFLAGS += $(shell $(srctree)/arch/ia64/scripts/toolchain-flags "$(CC)" "$(OBJDUMP)" "$(READELF)")
-
-ifeq ($(GAS_STATUS),buggy)
-$(error Sorry, you need a newer version of the assember, one that is built from	\
-	a source-tree that post-dates 18-Dec-2002.  You can find a pre-compiled	\
-	static binary of such an assembler at:					\
-										\
-		ftp://ftp.hpl.hp.com/pub/linux-ia64/gas-030124.tar.gz)
-endif
-
-quiet_cmd_gzip = GZIP    $@
-cmd_gzip = cat $(real-prereqs) | $(KGZIP) -n -f -9 > $@
-
-quiet_cmd_objcopy = OBJCOPY $@
-cmd_objcopy = $(OBJCOPY) $(OBJCOPYFLAGS) $(OBJCOPYFLAGS_$(@F)) $< $@
-
-KBUILD_CFLAGS += $(cflags-y)
-
-libs-y				+= arch/ia64/lib/
-
-drivers-y			+= arch/ia64/pci/ arch/ia64/hp/common/
-
-PHONY += compressed check
-
-all: compressed unwcheck
-
-compressed: vmlinux.gz
-
-vmlinuz: vmlinux.gz
-
-vmlinux.gz: vmlinux.bin FORCE
-	$(call if_changed,gzip)
-
-vmlinux.bin: vmlinux FORCE
-	$(call if_changed,objcopy)
-
-unwcheck: vmlinux
-	-$(Q)READELF=$(READELF) $(PYTHON3) $(srctree)/arch/ia64/scripts/unwcheck.py $<
-
-archheaders:
-	$(Q)$(MAKE) $(build)=arch/ia64/kernel/syscalls all
-
-CLEAN_FILES += vmlinux.gz
-
-install: KBUILD_IMAGE := vmlinux.gz
-install:
-	$(call cmd,install)
-
-define archhelp
-  echo '* compressed	- Build compressed kernel image'
-  echo '  install	- Install compressed kernel image'
-  echo '* unwcheck	- Check vmlinux for invalid unwind info'
-endef
diff --git a/arch/ia64/configs/bigsur_defconfig b/arch/ia64/configs/bigsur_defconfig
deleted file mode 100644
index 7cb96db9a25d..000000000000
--- a/arch/ia64/configs/bigsur_defconfig
+++ /dev/null
@@ -1,102 +0,0 @@
-CONFIG_SYSVIPC=y
-CONFIG_POSIX_MQUEUE=y
-CONFIG_LOG_BUF_SHIFT=16
-CONFIG_PROFILING=y
-CONFIG_MODULES=y
-CONFIG_MODULE_UNLOAD=y
-CONFIG_PARTITION_ADVANCED=y
-CONFIG_SGI_PARTITION=y
-CONFIG_SMP=y
-CONFIG_NR_CPUS=2
-CONFIG_PREEMPT=y
-CONFIG_IA64_PALINFO=y
-CONFIG_BINFMT_MISC=m
-CONFIG_ACPI_BUTTON=m
-CONFIG_ACPI_FAN=m
-CONFIG_ACPI_PROCESSOR=m
-CONFIG_NET=y
-CONFIG_PACKET=y
-CONFIG_UNIX=y
-CONFIG_INET=y
-# CONFIG_IPV6 is not set
-CONFIG_BLK_DEV_LOOP=m
-CONFIG_BLK_DEV_NBD=m
-CONFIG_BLK_DEV_RAM=m
-CONFIG_ATA=m
-CONFIG_ATA_GENERIC=m
-CONFIG_ATA_PIIX=m
-CONFIG_SCSI=y
-CONFIG_BLK_DEV_SD=y
-CONFIG_SCSI_CONSTANTS=y
-CONFIG_SCSI_LOGGING=y
-CONFIG_SCSI_SPI_ATTRS=m
-CONFIG_SCSI_QLOGIC_1280=y
-CONFIG_MD=y
-CONFIG_BLK_DEV_MD=m
-CONFIG_MD_LINEAR=m
-CONFIG_MD_RAID0=m
-CONFIG_MD_RAID1=m
-CONFIG_MD_RAID10=m
-CONFIG_MD_MULTIPATH=m
-CONFIG_BLK_DEV_DM=m
-CONFIG_DM_CRYPT=m
-CONFIG_DM_SNAPSHOT=m
-CONFIG_DM_MIRROR=m
-CONFIG_DM_ZERO=m
-CONFIG_NETDEVICES=y
-CONFIG_DUMMY=y
-CONFIG_INPUT_EVDEV=y
-CONFIG_SERIAL_8250=y
-CONFIG_SERIAL_8250_CONSOLE=y
-CONFIG_SERIAL_8250_EXTENDED=y
-CONFIG_SERIAL_8250_SHARE_IRQ=y
-# CONFIG_HW_RANDOM is not set
-CONFIG_RTC_CLASS=y
-CONFIG_RTC_DRV_EFI=y
-CONFIG_I2C=y
-CONFIG_I2C_CHARDEV=y
-CONFIG_AGP=m
-CONFIG_AGP_I460=m
-CONFIG_DRM=m
-CONFIG_DRM_R128=m
-CONFIG_SOUND=m
-CONFIG_SND=m
-CONFIG_SND_SEQUENCER=m
-CONFIG_SND_MIXER_OSS=m
-CONFIG_SND_PCM_OSS=m
-CONFIG_SND_CS4281=m
-CONFIG_USB_HIDDEV=y
-CONFIG_USB=m
-CONFIG_USB_MON=m
-CONFIG_USB_UHCI_HCD=m
-CONFIG_USB_ACM=m
-CONFIG_USB_PRINTER=m
-CONFIG_USB_STORAGE=m
-CONFIG_EXT2_FS=y
-CONFIG_EXT3_FS=y
-CONFIG_XFS_FS=y
-CONFIG_XFS_QUOTA=y
-CONFIG_XFS_POSIX_ACL=y
-CONFIG_AUTOFS_FS=m
-CONFIG_ISO9660_FS=m
-CONFIG_JOLIET=y
-CONFIG_UDF_FS=m
-CONFIG_VFAT_FS=y
-CONFIG_PROC_KCORE=y
-CONFIG_TMPFS=y
-CONFIG_HUGETLBFS=y
-CONFIG_NFS_FS=m
-CONFIG_NFS_V4=m
-CONFIG_NFSD=m
-CONFIG_NFSD_V4=y
-CONFIG_CIFS=m
-CONFIG_CIFS_XATTR=y
-CONFIG_CIFS_POSIX=y
-CONFIG_NLS_CODEPAGE_437=y
-CONFIG_NLS_ISO8859_1=y
-CONFIG_NLS_UTF8=m
-CONFIG_MAGIC_SYSRQ=y
-CONFIG_DEBUG_KERNEL=y
-CONFIG_DEBUG_MUTEXES=y
-CONFIG_CRYPTO_MD5=y
-CONFIG_CRYPTO_DES=y
diff --git a/arch/ia64/configs/generic_defconfig b/arch/ia64/configs/generic_defconfig
deleted file mode 100644
index 4581240013dd..000000000000
--- a/arch/ia64/configs/generic_defconfig
+++ /dev/null
@@ -1,206 +0,0 @@
-CONFIG_SYSVIPC=y
-CONFIG_POSIX_MQUEUE=y
-CONFIG_IKCONFIG=y
-CONFIG_IKCONFIG_PROC=y
-CONFIG_LOG_BUF_SHIFT=20
-CONFIG_CGROUPS=y
-CONFIG_CPUSETS=y
-CONFIG_BLK_DEV_INITRD=y
-CONFIG_KALLSYMS_ALL=y
-CONFIG_MODULES=y
-CONFIG_MODULE_UNLOAD=y
-CONFIG_MODVERSIONS=y
-CONFIG_PARTITION_ADVANCED=y
-CONFIG_SGI_PARTITION=y
-CONFIG_MCKINLEY=y
-CONFIG_IA64_PAGE_SIZE_64KB=y
-CONFIG_IA64_CYCLONE=y
-CONFIG_SMP=y
-CONFIG_HOTPLUG_CPU=y
-CONFIG_IA64_MCA_RECOVERY=y
-CONFIG_IA64_PALINFO=y
-CONFIG_KEXEC=y
-CONFIG_CRASH_DUMP=y
-CONFIG_BINFMT_MISC=m
-CONFIG_ACPI_BUTTON=m
-CONFIG_ACPI_FAN=m
-CONFIG_ACPI_DOCK=y
-CONFIG_ACPI_PROCESSOR=m
-CONFIG_HOTPLUG_PCI=y
-CONFIG_HOTPLUG_PCI_ACPI=y
-CONFIG_NET=y
-CONFIG_PACKET=y
-CONFIG_UNIX=y
-CONFIG_INET=y
-CONFIG_IP_MULTICAST=y
-CONFIG_SYN_COOKIES=y
-# CONFIG_IPV6 is not set
-CONFIG_UEVENT_HELPER_PATH="/sbin/hotplug"
-CONFIG_CONNECTOR=y
-# CONFIG_PNP_DEBUG_MESSAGES is not set
-CONFIG_BLK_DEV_LOOP=m
-CONFIG_BLK_DEV_NBD=m
-CONFIG_BLK_DEV_RAM=y
-CONFIG_SGI_XP=m
-CONFIG_ATA=y
-CONFIG_ATA_GENERIC=y
-CONFIG_PATA_CMD64X=y
-CONFIG_ATA_PIIX=y
-CONFIG_BLK_DEV_SD=y
-CONFIG_CHR_DEV_ST=m
-CONFIG_BLK_DEV_SR=m
-CONFIG_CHR_DEV_SG=m
-CONFIG_SCSI_FC_ATTRS=y
-CONFIG_SCSI_SYM53C8XX_2=y
-CONFIG_SCSI_QLOGIC_1280=y
-CONFIG_SATA_VITESSE=y
-CONFIG_MD=y
-CONFIG_BLK_DEV_MD=m
-CONFIG_MD_LINEAR=m
-CONFIG_MD_RAID0=m
-CONFIG_MD_RAID1=m
-CONFIG_MD_MULTIPATH=m
-CONFIG_BLK_DEV_DM=m
-CONFIG_DM_CRYPT=m
-CONFIG_DM_SNAPSHOT=m
-CONFIG_DM_MIRROR=m
-CONFIG_DM_ZERO=m
-CONFIG_DM_MULTIPATH=m
-CONFIG_FUSION=y
-CONFIG_FUSION_SPI=y
-CONFIG_FUSION_FC=m
-CONFIG_FUSION_SAS=y
-CONFIG_NETDEVICES=y
-CONFIG_DUMMY=m
-CONFIG_NETCONSOLE=y
-CONFIG_TIGON3=y
-CONFIG_NET_TULIP=y
-CONFIG_TULIP=m
-CONFIG_E100=m
-CONFIG_E1000=y
-CONFIG_IGB=y
-# CONFIG_SERIO_SERPORT is not set
-CONFIG_GAMEPORT=m
-CONFIG_SERIAL_NONSTANDARD=y
-CONFIG_SERIAL_8250=y
-CONFIG_SERIAL_8250_CONSOLE=y
-CONFIG_SERIAL_8250_NR_UARTS=6
-CONFIG_SERIAL_8250_EXTENDED=y
-CONFIG_SERIAL_8250_SHARE_IRQ=y
-# CONFIG_HW_RANDOM is not set
-CONFIG_RTC_CLASS=y
-CONFIG_RTC_DRV_EFI=y
-CONFIG_HPET=y
-CONFIG_AGP=m
-CONFIG_AGP_I460=m
-CONFIG_AGP_HP_ZX1=m
-CONFIG_DRM=m
-CONFIG_DRM_TDFX=m
-CONFIG_DRM_R128=m
-CONFIG_DRM_RADEON=m
-CONFIG_DRM_MGA=m
-CONFIG_DRM_SIS=m
-CONFIG_SOUND=m
-CONFIG_SND=m
-CONFIG_SND_SEQUENCER=m
-CONFIG_SND_SEQ_DUMMY=m
-CONFIG_SND_MIXER_OSS=m
-CONFIG_SND_PCM_OSS=m
-CONFIG_SND_SEQUENCER_OSS=y
-CONFIG_SND_VERBOSE_PRINTK=y
-CONFIG_SND_DUMMY=m
-CONFIG_SND_VIRMIDI=m
-CONFIG_SND_MTPAV=m
-CONFIG_SND_SERIAL_U16550=m
-CONFIG_SND_MPU401=m
-CONFIG_SND_CS4281=m
-CONFIG_SND_CS46XX=m
-CONFIG_SND_EMU10K1=m
-CONFIG_SND_FM801=m
-CONFIG_HID_GYRATION=m
-CONFIG_HID_PANTHERLORD=m
-CONFIG_HID_PETALYNX=m
-CONFIG_HID_SAMSUNG=m
-CONFIG_HID_SONY=m
-CONFIG_HID_SUNPLUS=m
-CONFIG_USB=m
-CONFIG_USB_MON=m
-CONFIG_USB_EHCI_HCD=m
-CONFIG_USB_OHCI_HCD=m
-CONFIG_USB_UHCI_HCD=m
-CONFIG_USB_STORAGE=m
-CONFIG_INFINIBAND=m
-CONFIG_INFINIBAND_MTHCA=m
-CONFIG_INFINIBAND_IPOIB=m
-CONFIG_INTEL_IOMMU=y
-CONFIG_MSPEC=m
-CONFIG_EXT2_FS=y
-CONFIG_EXT2_FS_XATTR=y
-CONFIG_EXT2_FS_POSIX_ACL=y
-CONFIG_EXT2_FS_SECURITY=y
-CONFIG_EXT3_FS=y
-CONFIG_EXT3_FS_POSIX_ACL=y
-CONFIG_EXT3_FS_SECURITY=y
-CONFIG_REISERFS_FS=y
-CONFIG_REISERFS_FS_XATTR=y
-CONFIG_REISERFS_FS_POSIX_ACL=y
-CONFIG_REISERFS_FS_SECURITY=y
-CONFIG_XFS_FS=y
-CONFIG_AUTOFS_FS=m
-CONFIG_ISO9660_FS=m
-CONFIG_JOLIET=y
-CONFIG_UDF_FS=m
-CONFIG_VFAT_FS=y
-CONFIG_NTFS_FS=m
-CONFIG_PROC_KCORE=y
-CONFIG_TMPFS=y
-CONFIG_HUGETLBFS=y
-CONFIG_NFS_FS=m
-CONFIG_NFS_V4=m
-CONFIG_NFSD=m
-CONFIG_NFSD_V4=y
-CONFIG_CIFS=m
-CONFIG_NLS_CODEPAGE_437=y
-CONFIG_NLS_CODEPAGE_737=m
-CONFIG_NLS_CODEPAGE_775=m
-CONFIG_NLS_CODEPAGE_850=m
-CONFIG_NLS_CODEPAGE_852=m
-CONFIG_NLS_CODEPAGE_855=m
-CONFIG_NLS_CODEPAGE_857=m
-CONFIG_NLS_CODEPAGE_860=m
-CONFIG_NLS_CODEPAGE_861=m
-CONFIG_NLS_CODEPAGE_862=m
-CONFIG_NLS_CODEPAGE_863=m
-CONFIG_NLS_CODEPAGE_864=m
-CONFIG_NLS_CODEPAGE_865=m
-CONFIG_NLS_CODEPAGE_866=m
-CONFIG_NLS_CODEPAGE_869=m
-CONFIG_NLS_CODEPAGE_936=m
-CONFIG_NLS_CODEPAGE_950=m
-CONFIG_NLS_CODEPAGE_932=m
-CONFIG_NLS_CODEPAGE_949=m
-CONFIG_NLS_CODEPAGE_874=m
-CONFIG_NLS_ISO8859_8=m
-CONFIG_NLS_CODEPAGE_1250=m
-CONFIG_NLS_CODEPAGE_1251=m
-CONFIG_NLS_ISO8859_1=y
-CONFIG_NLS_ISO8859_2=m
-CONFIG_NLS_ISO8859_3=m
-CONFIG_NLS_ISO8859_4=m
-CONFIG_NLS_ISO8859_5=m
-CONFIG_NLS_ISO8859_6=m
-CONFIG_NLS_ISO8859_7=m
-CONFIG_NLS_ISO8859_9=m
-CONFIG_NLS_ISO8859_13=m
-CONFIG_NLS_ISO8859_14=m
-CONFIG_NLS_ISO8859_15=m
-CONFIG_NLS_KOI8_R=m
-CONFIG_NLS_KOI8_U=m
-CONFIG_NLS_UTF8=m
-CONFIG_MAGIC_SYSRQ=y
-CONFIG_DEBUG_KERNEL=y
-CONFIG_DEBUG_MUTEXES=y
-CONFIG_CRYPTO_PCBC=m
-CONFIG_CRYPTO_MD5=y
-# CONFIG_CRYPTO_ANSI_CPRNG is not set
-CONFIG_CRC_T10DIF=y
diff --git a/arch/ia64/configs/gensparse_defconfig b/arch/ia64/configs/gensparse_defconfig
deleted file mode 100644
index c9e806616544..000000000000
--- a/arch/ia64/configs/gensparse_defconfig
+++ /dev/null
@@ -1,184 +0,0 @@
-CONFIG_SYSVIPC=y
-CONFIG_POSIX_MQUEUE=y
-CONFIG_IKCONFIG=y
-CONFIG_IKCONFIG_PROC=y
-CONFIG_LOG_BUF_SHIFT=20
-CONFIG_BLK_DEV_INITRD=y
-CONFIG_KALLSYMS_ALL=y
-CONFIG_MODULES=y
-CONFIG_MODULE_UNLOAD=y
-CONFIG_MODVERSIONS=y
-CONFIG_PARTITION_ADVANCED=y
-CONFIG_SGI_PARTITION=y
-CONFIG_MCKINLEY=y
-CONFIG_IA64_CYCLONE=y
-CONFIG_SMP=y
-CONFIG_NR_CPUS=512
-CONFIG_HOTPLUG_CPU=y
-CONFIG_SPARSEMEM_MANUAL=y
-CONFIG_IA64_MCA_RECOVERY=y
-CONFIG_IA64_PALINFO=y
-CONFIG_BINFMT_MISC=m
-CONFIG_ACPI_BUTTON=m
-CONFIG_ACPI_FAN=m
-CONFIG_ACPI_PROCESSOR=m
-CONFIG_HOTPLUG_PCI=y
-CONFIG_NET=y
-CONFIG_PACKET=y
-CONFIG_UNIX=y
-CONFIG_INET=y
-CONFIG_IP_MULTICAST=y
-CONFIG_SYN_COOKIES=y
-# CONFIG_IPV6 is not set
-CONFIG_BLK_DEV_LOOP=m
-CONFIG_BLK_DEV_NBD=m
-CONFIG_BLK_DEV_RAM=y
-CONFIG_ATA=y
-CONFIG_ATA_GENERIC=y
-CONFIG_PATA_CMD64X=y
-CONFIG_ATA_PIIX=y
-CONFIG_SCSI=y
-CONFIG_BLK_DEV_SD=y
-CONFIG_CHR_DEV_ST=m
-CONFIG_BLK_DEV_SR=m
-CONFIG_CHR_DEV_SG=m
-CONFIG_SCSI_FC_ATTRS=y
-CONFIG_SCSI_SYM53C8XX_2=y
-CONFIG_SCSI_QLOGIC_1280=y
-CONFIG_MD=y
-CONFIG_BLK_DEV_MD=m
-CONFIG_MD_LINEAR=m
-CONFIG_MD_RAID0=m
-CONFIG_MD_RAID1=m
-CONFIG_MD_MULTIPATH=m
-CONFIG_BLK_DEV_DM=m
-CONFIG_DM_CRYPT=m
-CONFIG_DM_SNAPSHOT=m
-CONFIG_DM_MIRROR=m
-CONFIG_DM_ZERO=m
-CONFIG_DM_MULTIPATH=m
-CONFIG_FUSION=y
-CONFIG_FUSION_SPI=y
-CONFIG_FUSION_FC=m
-CONFIG_NETDEVICES=y
-CONFIG_DUMMY=m
-CONFIG_NETCONSOLE=y
-CONFIG_TIGON3=y
-CONFIG_NET_TULIP=y
-CONFIG_TULIP=m
-CONFIG_E100=m
-CONFIG_E1000=y
-# CONFIG_SERIO_SERPORT is not set
-CONFIG_GAMEPORT=m
-CONFIG_SERIAL_NONSTANDARD=y
-CONFIG_SERIAL_8250=y
-CONFIG_SERIAL_8250_CONSOLE=y
-CONFIG_SERIAL_8250_NR_UARTS=6
-CONFIG_SERIAL_8250_EXTENDED=y
-CONFIG_SERIAL_8250_SHARE_IRQ=y
-# CONFIG_HW_RANDOM is not set
-CONFIG_RTC_CLASS=y
-CONFIG_RTC_DRV_EFI=y
-CONFIG_HPET=y
-CONFIG_AGP=m
-CONFIG_AGP_I460=m
-CONFIG_AGP_HP_ZX1=m
-CONFIG_DRM=m
-CONFIG_DRM_TDFX=m
-CONFIG_DRM_R128=m
-CONFIG_DRM_RADEON=m
-CONFIG_DRM_MGA=m
-CONFIG_DRM_SIS=m
-CONFIG_SOUND=m
-CONFIG_SND=m
-CONFIG_SND_SEQUENCER=m
-CONFIG_SND_SEQ_DUMMY=m
-CONFIG_SND_MIXER_OSS=m
-CONFIG_SND_PCM_OSS=m
-CONFIG_SND_SEQUENCER_OSS=y
-CONFIG_SND_VERBOSE_PRINTK=y
-CONFIG_SND_DUMMY=m
-CONFIG_SND_VIRMIDI=m
-CONFIG_SND_MTPAV=m
-CONFIG_SND_SERIAL_U16550=m
-CONFIG_SND_MPU401=m
-CONFIG_SND_CS4281=m
-CONFIG_SND_CS46XX=m
-CONFIG_SND_EMU10K1=m
-CONFIG_SND_FM801=m
-CONFIG_USB=m
-CONFIG_USB_MON=m
-CONFIG_USB_EHCI_HCD=m
-CONFIG_USB_OHCI_HCD=m
-CONFIG_USB_UHCI_HCD=m
-CONFIG_USB_STORAGE=m
-CONFIG_INFINIBAND=m
-CONFIG_INFINIBAND_MTHCA=m
-CONFIG_INFINIBAND_IPOIB=m
-CONFIG_EXT2_FS=y
-CONFIG_EXT2_FS_XATTR=y
-CONFIG_EXT2_FS_POSIX_ACL=y
-CONFIG_EXT2_FS_SECURITY=y
-CONFIG_EXT3_FS=y
-CONFIG_EXT3_FS_POSIX_ACL=y
-CONFIG_EXT3_FS_SECURITY=y
-CONFIG_REISERFS_FS=y
-CONFIG_REISERFS_FS_XATTR=y
-CONFIG_REISERFS_FS_POSIX_ACL=y
-CONFIG_REISERFS_FS_SECURITY=y
-CONFIG_XFS_FS=y
-CONFIG_AUTOFS_FS=y
-CONFIG_ISO9660_FS=m
-CONFIG_JOLIET=y
-CONFIG_UDF_FS=m
-CONFIG_VFAT_FS=y
-CONFIG_NTFS_FS=m
-CONFIG_PROC_KCORE=y
-CONFIG_TMPFS=y
-CONFIG_HUGETLBFS=y
-CONFIG_NFS_FS=m
-CONFIG_NFS_V4=m
-CONFIG_NFSD=m
-CONFIG_NFSD_V4=y
-CONFIG_CIFS=m
-CONFIG_NLS_CODEPAGE_437=y
-CONFIG_NLS_CODEPAGE_737=m
-CONFIG_NLS_CODEPAGE_775=m
-CONFIG_NLS_CODEPAGE_850=m
-CONFIG_NLS_CODEPAGE_852=m
-CONFIG_NLS_CODEPAGE_855=m
-CONFIG_NLS_CODEPAGE_857=m
-CONFIG_NLS_CODEPAGE_860=m
-CONFIG_NLS_CODEPAGE_861=m
-CONFIG_NLS_CODEPAGE_862=m
-CONFIG_NLS_CODEPAGE_863=m
-CONFIG_NLS_CODEPAGE_864=m
-CONFIG_NLS_CODEPAGE_865=m
-CONFIG_NLS_CODEPAGE_866=m
-CONFIG_NLS_CODEPAGE_869=m
-CONFIG_NLS_CODEPAGE_936=m
-CONFIG_NLS_CODEPAGE_950=m
-CONFIG_NLS_CODEPAGE_932=m
-CONFIG_NLS_CODEPAGE_949=m
-CONFIG_NLS_CODEPAGE_874=m
-CONFIG_NLS_ISO8859_8=m
-CONFIG_NLS_CODEPAGE_1250=m
-CONFIG_NLS_CODEPAGE_1251=m
-CONFIG_NLS_ISO8859_1=y
-CONFIG_NLS_ISO8859_2=m
-CONFIG_NLS_ISO8859_3=m
-CONFIG_NLS_ISO8859_4=m
-CONFIG_NLS_ISO8859_5=m
-CONFIG_NLS_ISO8859_6=m
-CONFIG_NLS_ISO8859_7=m
-CONFIG_NLS_ISO8859_9=m
-CONFIG_NLS_ISO8859_13=m
-CONFIG_NLS_ISO8859_14=m
-CONFIG_NLS_ISO8859_15=m
-CONFIG_NLS_KOI8_R=m
-CONFIG_NLS_KOI8_U=m
-CONFIG_NLS_UTF8=m
-CONFIG_MAGIC_SYSRQ=y
-CONFIG_DEBUG_KERNEL=y
-CONFIG_DEBUG_MUTEXES=y
-CONFIG_CRYPTO_MD5=y
diff --git a/arch/ia64/configs/tiger_defconfig b/arch/ia64/configs/tiger_defconfig
deleted file mode 100644
index d7d8fb5c7b71..000000000000
--- a/arch/ia64/configs/tiger_defconfig
+++ /dev/null
@@ -1,169 +0,0 @@
-CONFIG_SYSVIPC=y
-CONFIG_POSIX_MQUEUE=y
-CONFIG_IKCONFIG=y
-CONFIG_IKCONFIG_PROC=y
-CONFIG_LOG_BUF_SHIFT=20
-CONFIG_BLK_DEV_INITRD=y
-CONFIG_KALLSYMS_ALL=y
-CONFIG_MODULES=y
-CONFIG_MODULE_UNLOAD=y
-CONFIG_MODVERSIONS=y
-CONFIG_MODULE_SRCVERSION_ALL=y
-# CONFIG_BLK_DEV_BSG is not set
-CONFIG_PARTITION_ADVANCED=y
-CONFIG_SGI_PARTITION=y
-CONFIG_MCKINLEY=y
-CONFIG_IA64_PAGE_SIZE_64KB=y
-CONFIG_IA64_CYCLONE=y
-CONFIG_SMP=y
-CONFIG_NR_CPUS=16
-CONFIG_HOTPLUG_CPU=y
-CONFIG_PERMIT_BSP_REMOVE=y
-CONFIG_FORCE_CPEI_RETARGET=y
-CONFIG_IA64_MCA_RECOVERY=y
-CONFIG_IA64_PALINFO=y
-CONFIG_KEXEC=y
-CONFIG_BINFMT_MISC=m
-CONFIG_ACPI_BUTTON=m
-CONFIG_ACPI_FAN=m
-CONFIG_ACPI_PROCESSOR=m
-CONFIG_HOTPLUG_PCI=y
-CONFIG_NET=y
-CONFIG_PACKET=y
-CONFIG_UNIX=y
-CONFIG_INET=y
-CONFIG_IP_MULTICAST=y
-CONFIG_SYN_COOKIES=y
-# CONFIG_IPV6 is not set
-CONFIG_BLK_DEV_LOOP=m
-CONFIG_BLK_DEV_NBD=m
-CONFIG_BLK_DEV_RAM=y
-CONFIG_ATA=y
-CONFIG_ATA_GENERIC=y
-CONFIG_PATA_CMD64X=y
-CONFIG_ATA_PIIX=y
-CONFIG_SCSI=y
-CONFIG_BLK_DEV_SD=y
-CONFIG_CHR_DEV_ST=m
-CONFIG_BLK_DEV_SR=m
-CONFIG_CHR_DEV_SG=m
-CONFIG_SCSI_FC_ATTRS=y
-CONFIG_SCSI_SYM53C8XX_2=y
-CONFIG_SCSI_QLOGIC_1280=y
-CONFIG_MD=y
-CONFIG_BLK_DEV_MD=m
-CONFIG_MD_LINEAR=m
-CONFIG_MD_RAID0=m
-CONFIG_MD_RAID1=m
-CONFIG_MD_MULTIPATH=m
-CONFIG_BLK_DEV_DM=m
-CONFIG_DM_CRYPT=m
-CONFIG_DM_SNAPSHOT=m
-CONFIG_DM_MIRROR=m
-CONFIG_DM_ZERO=m
-CONFIG_FUSION=y
-CONFIG_FUSION_SPI=y
-CONFIG_FUSION_FC=y
-CONFIG_FUSION_CTL=y
-CONFIG_NETDEVICES=y
-CONFIG_DUMMY=m
-CONFIG_NETCONSOLE=y
-CONFIG_TIGON3=y
-CONFIG_NET_TULIP=y
-CONFIG_TULIP=m
-CONFIG_E100=m
-CONFIG_E1000=y
-# CONFIG_SERIO_SERPORT is not set
-CONFIG_GAMEPORT=m
-CONFIG_SERIAL_NONSTANDARD=y
-CONFIG_SERIAL_8250=y
-CONFIG_SERIAL_8250_CONSOLE=y
-CONFIG_SERIAL_8250_NR_UARTS=6
-CONFIG_SERIAL_8250_EXTENDED=y
-CONFIG_SERIAL_8250_SHARE_IRQ=y
-# CONFIG_HW_RANDOM is not set
-CONFIG_RTC_CLASS=y
-CONFIG_RTC_DRV_EFI=y
-CONFIG_HPET=y
-CONFIG_AGP=m
-CONFIG_AGP_I460=m
-CONFIG_DRM=m
-CONFIG_DRM_TDFX=m
-CONFIG_DRM_R128=m
-CONFIG_DRM_RADEON=m
-CONFIG_DRM_MGA=m
-CONFIG_DRM_SIS=m
-CONFIG_USB=y
-CONFIG_USB_EHCI_HCD=m
-CONFIG_USB_OHCI_HCD=m
-CONFIG_USB_UHCI_HCD=y
-CONFIG_USB_STORAGE=m
-CONFIG_EXT2_FS=y
-CONFIG_EXT2_FS_XATTR=y
-CONFIG_EXT2_FS_POSIX_ACL=y
-CONFIG_EXT2_FS_SECURITY=y
-CONFIG_EXT3_FS=y
-CONFIG_EXT3_FS_POSIX_ACL=y
-CONFIG_EXT3_FS_SECURITY=y
-CONFIG_REISERFS_FS=y
-CONFIG_REISERFS_FS_XATTR=y
-CONFIG_REISERFS_FS_POSIX_ACL=y
-CONFIG_REISERFS_FS_SECURITY=y
-CONFIG_XFS_FS=y
-CONFIG_AUTOFS_FS=y
-CONFIG_ISO9660_FS=m
-CONFIG_JOLIET=y
-CONFIG_UDF_FS=m
-CONFIG_VFAT_FS=y
-CONFIG_NTFS_FS=m
-CONFIG_PROC_KCORE=y
-CONFIG_TMPFS=y
-CONFIG_HUGETLBFS=y
-CONFIG_NFS_FS=m
-CONFIG_NFS_V4=m
-CONFIG_NFSD=m
-CONFIG_NFSD_V4=y
-CONFIG_CIFS=m
-CONFIG_NLS_CODEPAGE_437=y
-CONFIG_NLS_CODEPAGE_737=m
-CONFIG_NLS_CODEPAGE_775=m
-CONFIG_NLS_CODEPAGE_850=m
-CONFIG_NLS_CODEPAGE_852=m
-CONFIG_NLS_CODEPAGE_855=m
-CONFIG_NLS_CODEPAGE_857=m
-CONFIG_NLS_CODEPAGE_860=m
-CONFIG_NLS_CODEPAGE_861=m
-CONFIG_NLS_CODEPAGE_862=m
-CONFIG_NLS_CODEPAGE_863=m
-CONFIG_NLS_CODEPAGE_864=m
-CONFIG_NLS_CODEPAGE_865=m
-CONFIG_NLS_CODEPAGE_866=m
-CONFIG_NLS_CODEPAGE_869=m
-CONFIG_NLS_CODEPAGE_936=m
-CONFIG_NLS_CODEPAGE_950=m
-CONFIG_NLS_CODEPAGE_932=m
-CONFIG_NLS_CODEPAGE_949=m
-CONFIG_NLS_CODEPAGE_874=m
-CONFIG_NLS_ISO8859_8=m
-CONFIG_NLS_CODEPAGE_1250=m
-CONFIG_NLS_CODEPAGE_1251=m
-CONFIG_NLS_ISO8859_1=y
-CONFIG_NLS_ISO8859_2=m
-CONFIG_NLS_ISO8859_3=m
-CONFIG_NLS_ISO8859_4=m
-CONFIG_NLS_ISO8859_5=m
-CONFIG_NLS_ISO8859_6=m
-CONFIG_NLS_ISO8859_7=m
-CONFIG_NLS_ISO8859_9=m
-CONFIG_NLS_ISO8859_13=m
-CONFIG_NLS_ISO8859_14=m
-CONFIG_NLS_ISO8859_15=m
-CONFIG_NLS_KOI8_R=m
-CONFIG_NLS_KOI8_U=m
-CONFIG_NLS_UTF8=m
-CONFIG_MAGIC_SYSRQ=y
-CONFIG_DEBUG_KERNEL=y
-CONFIG_DEBUG_MUTEXES=y
-CONFIG_IA64_GRANULE_16MB=y
-CONFIG_CRYPTO_PCBC=m
-CONFIG_CRYPTO_MD5=y
diff --git a/arch/ia64/configs/zx1_defconfig b/arch/ia64/configs/zx1_defconfig
deleted file mode 100644
index ed104550d0d5..000000000000
--- a/arch/ia64/configs/zx1_defconfig
+++ /dev/null
@@ -1,148 +0,0 @@
-CONFIG_SYSVIPC=y
-CONFIG_BSD_PROCESS_ACCT=y
-CONFIG_BLK_DEV_INITRD=y
-CONFIG_KPROBES=y
-CONFIG_MODULES=y
-CONFIG_PARTITION_ADVANCED=y
-CONFIG_MCKINLEY=y
-CONFIG_SMP=y
-CONFIG_NR_CPUS=16
-CONFIG_HOTPLUG_CPU=y
-CONFIG_FLATMEM_MANUAL=y
-CONFIG_IA64_MCA_RECOVERY=y
-CONFIG_IA64_PALINFO=y
-CONFIG_CRASH_DUMP=y
-CONFIG_BINFMT_MISC=y
-CONFIG_HOTPLUG_PCI=y
-CONFIG_HOTPLUG_PCI_ACPI=y
-CONFIG_NET=y
-CONFIG_PACKET=y
-CONFIG_UNIX=y
-CONFIG_INET=y
-CONFIG_IP_MULTICAST=y
-# CONFIG_IPV6 is not set
-CONFIG_NETFILTER=y
-CONFIG_BLK_DEV_LOOP=y
-CONFIG_BLK_DEV_RAM=y
-CONFIG_ATA=y
-CONFIG_ATA_GENERIC=y
-CONFIG_PATA_CMD64X=y
-CONFIG_SCSI=y
-CONFIG_BLK_DEV_SD=y
-CONFIG_CHR_DEV_ST=y
-CONFIG_BLK_DEV_SR=y
-CONFIG_CHR_DEV_SG=y
-CONFIG_SCSI_CONSTANTS=y
-CONFIG_SCSI_LOGGING=y
-CONFIG_SCSI_FC_ATTRS=y
-CONFIG_SCSI_SYM53C8XX_2=y
-CONFIG_SCSI_QLOGIC_1280=y
-CONFIG_FUSION=y
-CONFIG_FUSION_SPI=y
-CONFIG_FUSION_FC=y
-CONFIG_FUSION_CTL=m
-CONFIG_NETDEVICES=y
-CONFIG_DUMMY=y
-CONFIG_TIGON3=y
-CONFIG_NET_TULIP=y
-CONFIG_TULIP=y
-CONFIG_TULIP_MWI=y
-CONFIG_TULIP_MMIO=y
-CONFIG_TULIP_NAPI=y
-CONFIG_TULIP_NAPI_HW_MITIGATION=y
-CONFIG_E100=y
-CONFIG_E1000=y
-CONFIG_INPUT_JOYDEV=y
-CONFIG_INPUT_EVDEV=y
-# CONFIG_INPUT_KEYBOARD is not set
-# CONFIG_INPUT_MOUSE is not set
-# CONFIG_SERIO_I8042 is not set
-# CONFIG_SERIO_SERPORT is not set
-CONFIG_SERIAL_8250=y
-CONFIG_SERIAL_8250_CONSOLE=y
-CONFIG_SERIAL_8250_NR_UARTS=8
-CONFIG_SERIAL_8250_EXTENDED=y
-CONFIG_SERIAL_8250_SHARE_IRQ=y
-# CONFIG_HW_RANDOM is not set
-CONFIG_RTC_CLASS=y
-CONFIG_RTC_DRV_EFI=y
-CONFIG_I2C_CHARDEV=y
-CONFIG_AGP=y
-CONFIG_AGP_HP_ZX1=y
-CONFIG_DRM=y
-CONFIG_DRM_RADEON=y
-CONFIG_FB_RADEON=y
-CONFIG_FB_RADEON_DEBUG=y
-CONFIG_LOGO=y
-# CONFIG_LOGO_LINUX_MONO is not set
-# CONFIG_LOGO_LINUX_VGA16 is not set
-CONFIG_SOUND=y
-CONFIG_SND=y
-CONFIG_SND_SEQUENCER=y
-CONFIG_SND_MIXER_OSS=y
-CONFIG_SND_PCM_OSS=y
-CONFIG_SND_SEQUENCER_OSS=y
-CONFIG_SND_FM801=y
-CONFIG_USB_HIDDEV=y
-CONFIG_USB=y
-CONFIG_USB_MON=y
-CONFIG_USB_EHCI_HCD=y
-CONFIG_USB_OHCI_HCD=y
-CONFIG_USB_UHCI_HCD=y
-CONFIG_USB_STORAGE=y
-CONFIG_EXT2_FS=y
-CONFIG_EXT2_FS_XATTR=y
-CONFIG_EXT3_FS=y
-CONFIG_ISO9660_FS=y
-CONFIG_JOLIET=y
-CONFIG_UDF_FS=y
-CONFIG_MSDOS_FS=y
-CONFIG_VFAT_FS=y
-CONFIG_PROC_KCORE=y
-CONFIG_TMPFS=y
-CONFIG_HUGETLBFS=y
-CONFIG_NFS_FS=y
-CONFIG_NFS_V4=y
-CONFIG_NFSD=y
-CONFIG_NLS_CODEPAGE_437=y
-CONFIG_NLS_CODEPAGE_737=y
-CONFIG_NLS_CODEPAGE_775=y
-CONFIG_NLS_CODEPAGE_850=y
-CONFIG_NLS_CODEPAGE_852=y
-CONFIG_NLS_CODEPAGE_855=y
-CONFIG_NLS_CODEPAGE_857=y
-CONFIG_NLS_CODEPAGE_860=y
-CONFIG_NLS_CODEPAGE_861=y
-CONFIG_NLS_CODEPAGE_862=y
-CONFIG_NLS_CODEPAGE_863=y
-CONFIG_NLS_CODEPAGE_864=y
-CONFIG_NLS_CODEPAGE_865=y
-CONFIG_NLS_CODEPAGE_866=y
-CONFIG_NLS_CODEPAGE_869=y
-CONFIG_NLS_CODEPAGE_936=y
-CONFIG_NLS_CODEPAGE_950=y
-CONFIG_NLS_CODEPAGE_932=y
-CONFIG_NLS_CODEPAGE_949=y
-CONFIG_NLS_CODEPAGE_874=y
-CONFIG_NLS_ISO8859_8=y
-CONFIG_NLS_CODEPAGE_1251=y
-CONFIG_NLS_ISO8859_1=y
-CONFIG_NLS_ISO8859_2=y
-CONFIG_NLS_ISO8859_3=y
-CONFIG_NLS_ISO8859_4=y
-CONFIG_NLS_ISO8859_5=y
-CONFIG_NLS_ISO8859_6=y
-CONFIG_NLS_ISO8859_7=y
-CONFIG_NLS_ISO8859_9=y
-CONFIG_NLS_ISO8859_13=y
-CONFIG_NLS_ISO8859_14=y
-CONFIG_NLS_ISO8859_15=y
-CONFIG_NLS_KOI8_R=y
-CONFIG_NLS_KOI8_U=y
-CONFIG_NLS_UTF8=y
-CONFIG_MAGIC_SYSRQ=y
-CONFIG_DEBUG_KERNEL=y
-CONFIG_DEBUG_MUTEXES=y
-CONFIG_IA64_PRINT_HAZARDS=y
-CONFIG_CRYPTO_ECB=m
-CONFIG_CRYPTO_PCBC=m
diff --git a/arch/ia64/hp/common/Makefile b/arch/ia64/hp/common/Makefile
deleted file mode 100644
index 11a56ed38229..000000000000
--- a/arch/ia64/hp/common/Makefile
+++ /dev/null
@@ -1,10 +0,0 @@
-# SPDX-License-Identifier: GPL-2.0-only
-#
-# ia64/platform/hp/common/Makefile
-#
-# Copyright (C) 2002 Hewlett Packard
-# Copyright (C) Alex Williamson (alex_williamson@hp.com)
-#
-
-obj-$(CONFIG_IA64_HP_SBA_IOMMU) += sba_iommu.o
-obj-$(CONFIG_IA64_HP_AML_NFW) += aml_nfw.o
diff --git a/arch/ia64/hp/common/aml_nfw.c b/arch/ia64/hp/common/aml_nfw.c
deleted file mode 100644
index 901df49461a0..000000000000
--- a/arch/ia64/hp/common/aml_nfw.c
+++ /dev/null
@@ -1,232 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * OpRegion handler to allow AML to call native firmware
- *
- * (c) Copyright 2007 Hewlett-Packard Development Company, L.P.
- *	Bjorn Helgaas <bjorn.helgaas@hp.com>
- *
- * This driver implements HP Open Source Review Board proposal 1842,
- * which was approved on 9/20/2006.
- *
- * For technical documentation, see the HP SPPA Firmware EAS, Appendix F.
- *
- * ACPI does not define a mechanism for AML methods to call native firmware
- * interfaces such as PAL or SAL.  This OpRegion handler adds such a mechanism.
- * After the handler is installed, an AML method can call native firmware by
- * storing the arguments and firmware entry point to specific offsets in the
- * OpRegion.  When AML reads the "return value" offset from the OpRegion, this
- * handler loads up the arguments, makes the firmware call, and returns the
- * result.
- */
-
-#include <linux/module.h>
-#include <linux/acpi.h>
-#include <asm/sal.h>
-
-MODULE_AUTHOR("Bjorn Helgaas <bjorn.helgaas@hp.com>");
-MODULE_LICENSE("GPL");
-MODULE_DESCRIPTION("ACPI opregion handler for native firmware calls");
-
-static bool force_register;
-module_param_named(force, force_register, bool, 0);
-MODULE_PARM_DESC(force, "Install opregion handler even without HPQ5001 device");
-
-#define AML_NFW_SPACE		0xA1
-
-struct ia64_pdesc {
-	void *ip;
-	void *gp;
-};
-
-/*
- * N.B.  The layout of this structure is defined in the HP SPPA FW EAS, and
- *	 the member offsets are embedded in AML methods.
- */
-struct ia64_nfw_context {
-	u64 arg[8];
-	struct ia64_sal_retval ret;
-	u64 ip;
-	u64 gp;
-	u64 pad[2];
-};
-
-static void *virt_map(u64 address)
-{
-	if (address & (1UL << 63))
-		return (void *) (__IA64_UNCACHED_OFFSET | address);
-
-	return __va(address);
-}
-
-static void aml_nfw_execute(struct ia64_nfw_context *c)
-{
-	struct ia64_pdesc virt_entry;
-	ia64_sal_handler entry;
-
-	virt_entry.ip = virt_map(c->ip);
-	virt_entry.gp = virt_map(c->gp);
-
-	entry = (ia64_sal_handler) &virt_entry;
-
-	IA64_FW_CALL(entry, c->ret,
-		     c->arg[0], c->arg[1], c->arg[2], c->arg[3],
-		     c->arg[4], c->arg[5], c->arg[6], c->arg[7]);
-}
-
-static void aml_nfw_read_arg(u8 *offset, u32 bit_width, u64 *value)
-{
-	switch (bit_width) {
-	case 8:
-		*value = *(u8 *)offset;
-		break;
-	case 16:
-		*value = *(u16 *)offset;
-		break;
-	case 32:
-		*value = *(u32 *)offset;
-		break;
-	case 64:
-		*value = *(u64 *)offset;
-		break;
-	}
-}
-
-static void aml_nfw_write_arg(u8 *offset, u32 bit_width, u64 *value)
-{
-	switch (bit_width) {
-	case 8:
-		*(u8 *) offset = *value;
-		break;
-	case 16:
-		*(u16 *) offset = *value;
-		break;
-	case 32:
-		*(u32 *) offset = *value;
-		break;
-	case 64:
-		*(u64 *) offset = *value;
-		break;
-	}
-}
-
-static acpi_status aml_nfw_handler(u32 function, acpi_physical_address address,
-	u32 bit_width, u64 *value, void *handler_context,
-	void *region_context)
-{
-	struct ia64_nfw_context *context = handler_context;
-	u8 *offset = (u8 *) context + address;
-
-	if (bit_width !=  8 && bit_width != 16 &&
-	    bit_width != 32 && bit_width != 64)
-		return AE_BAD_PARAMETER;
-
-	if (address + (bit_width >> 3) > sizeof(struct ia64_nfw_context))
-		return AE_BAD_PARAMETER;
-
-	switch (function) {
-	case ACPI_READ:
-		if (address == offsetof(struct ia64_nfw_context, ret))
-			aml_nfw_execute(context);
-		aml_nfw_read_arg(offset, bit_width, value);
-		break;
-	case ACPI_WRITE:
-		aml_nfw_write_arg(offset, bit_width, value);
-		break;
-	}
-
-	return AE_OK;
-}
-
-static struct ia64_nfw_context global_context;
-static int global_handler_registered;
-
-static int aml_nfw_add_global_handler(void)
-{
-	acpi_status status;
-
-	if (global_handler_registered)
-		return 0;
-
-	status = acpi_install_address_space_handler(ACPI_ROOT_OBJECT,
-		AML_NFW_SPACE, aml_nfw_handler, NULL, &global_context);
-	if (ACPI_FAILURE(status))
-		return -ENODEV;
-
-	global_handler_registered = 1;
-	printk(KERN_INFO "Global 0x%02X opregion handler registered\n",
-		AML_NFW_SPACE);
-	return 0;
-}
-
-static int aml_nfw_remove_global_handler(void)
-{
-	acpi_status status;
-
-	if (!global_handler_registered)
-		return 0;
-
-	status = acpi_remove_address_space_handler(ACPI_ROOT_OBJECT,
-		AML_NFW_SPACE, aml_nfw_handler);
-	if (ACPI_FAILURE(status))
-		return -ENODEV;
-
-	global_handler_registered = 0;
-	printk(KERN_INFO "Global 0x%02X opregion handler removed\n",
-		AML_NFW_SPACE);
-	return 0;
-}
-
-static int aml_nfw_add(struct acpi_device *device)
-{
-	/*
-	 * We would normally allocate a new context structure and install
-	 * the address space handler for the specific device we found.
-	 * But the HP-UX implementation shares a single global context
-	 * and always puts the handler at the root, so we'll do the same.
-	 */
-	return aml_nfw_add_global_handler();
-}
-
-static void aml_nfw_remove(struct acpi_device *device)
-{
-	aml_nfw_remove_global_handler();
-}
-
-static const struct acpi_device_id aml_nfw_ids[] = {
-	{"HPQ5001", 0},
-	{"", 0}
-};
-
-static struct acpi_driver acpi_aml_nfw_driver = {
-	.name = "native firmware",
-	.ids = aml_nfw_ids,
-	.ops = {
-		.add = aml_nfw_add,
-		.remove = aml_nfw_remove,
-		},
-};
-
-static int __init aml_nfw_init(void)
-{
-	int result;
-
-	if (force_register)
-		aml_nfw_add_global_handler();
-
-	result = acpi_bus_register_driver(&acpi_aml_nfw_driver);
-	if (result < 0) {
-		aml_nfw_remove_global_handler();
-		return result;
-	}
-
-	return 0;
-}
-
-static void __exit aml_nfw_exit(void)
-{
-	acpi_bus_unregister_driver(&acpi_aml_nfw_driver);
-	aml_nfw_remove_global_handler();
-}
-
-module_init(aml_nfw_init);
-module_exit(aml_nfw_exit);
diff --git a/arch/ia64/hp/common/sba_iommu.c b/arch/ia64/hp/common/sba_iommu.c
deleted file mode 100644
index c4d477e8bcd4..000000000000
--- a/arch/ia64/hp/common/sba_iommu.c
+++ /dev/null
@@ -1,2155 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-or-later
-/*
-**  IA64 System Bus Adapter (SBA) I/O MMU manager
-**
-**	(c) Copyright 2002-2005 Alex Williamson
-**	(c) Copyright 2002-2003 Grant Grundler
-**	(c) Copyright 2002-2005 Hewlett-Packard Company
-**
-**	Portions (c) 2000 Grant Grundler (from parisc I/O MMU code)
-**	Portions (c) 1999 Dave S. Miller (from sparc64 I/O MMU code)
-**
-**
-**
-** This module initializes the IOC (I/O Controller) found on HP
-** McKinley machines and their successors.
-**
-*/
-
-#include <linux/types.h>
-#include <linux/kernel.h>
-#include <linux/module.h>
-#include <linux/spinlock.h>
-#include <linux/slab.h>
-#include <linux/init.h>
-#include <linux/mm.h>
-#include <linux/string.h>
-#include <linux/pci.h>
-#include <linux/proc_fs.h>
-#include <linux/seq_file.h>
-#include <linux/acpi.h>
-#include <linux/efi.h>
-#include <linux/nodemask.h>
-#include <linux/bitops.h>         /* hweight64() */
-#include <linux/crash_dump.h>
-#include <linux/iommu-helper.h>
-#include <linux/dma-map-ops.h>
-#include <linux/prefetch.h>
-#include <linux/swiotlb.h>
-
-#include <asm/delay.h>		/* ia64_get_itc() */
-#include <asm/io.h>
-#include <asm/page.h>		/* PAGE_OFFSET */
-#include <asm/dma.h>
-
-#include <asm/acpi-ext.h>
-
-#define PFX "IOC: "
-
-/*
-** Enabling timing search of the pdir resource map.  Output in /proc.
-** Disabled by default to optimize performance.
-*/
-#undef PDIR_SEARCH_TIMING
-
-/*
-** This option allows cards capable of 64bit DMA to bypass the IOMMU.  If
-** not defined, all DMA will be 32bit and go through the TLB.
-** There's potentially a conflict in the bio merge code with us
-** advertising an iommu, but then bypassing it.  Since I/O MMU bypassing
-** appears to give more performance than bio-level virtual merging, we'll
-** do the former for now.  NOTE: BYPASS_SG also needs to be undef'd to
-** completely restrict DMA to the IOMMU.
-*/
-#define ALLOW_IOV_BYPASS
-
-/*
-** This option specifically allows/disallows bypassing scatterlists with
-** multiple entries.  Coalescing these entries can allow better DMA streaming
-** and in some cases shows better performance than entirely bypassing the
-** IOMMU.  Performance increase on the order of 1-2% sequential output/input
-** using bonnie++ on a RAID0 MD device (sym2 & mpt).
-*/
-#undef ALLOW_IOV_BYPASS_SG
-
-/*
-** If a device prefetches beyond the end of a valid pdir entry, it will cause
-** a hard failure, ie. MCA.  Version 3.0 and later of the zx1 LBA should
-** disconnect on 4k boundaries and prevent such issues.  If the device is
-** particularly aggressive, this option will keep the entire pdir valid such
-** that prefetching will hit a valid address.  This could severely impact
-** error containment, and is therefore off by default.  The page that is
-** used for spill-over is poisoned, so that should help debugging somewhat.
-*/
-#undef FULL_VALID_PDIR
-
-#define ENABLE_MARK_CLEAN
-
-/*
-** The number of debug flags is a clue - this code is fragile.  NOTE: since
-** tightening the use of res_lock the resource bitmap and actual pdir are no
-** longer guaranteed to stay in sync.  The sanity checking code isn't going to
-** like that.
-*/
-#undef DEBUG_SBA_INIT
-#undef DEBUG_SBA_RUN
-#undef DEBUG_SBA_RUN_SG
-#undef DEBUG_SBA_RESOURCE
-#undef ASSERT_PDIR_SANITY
-#undef DEBUG_LARGE_SG_ENTRIES
-#undef DEBUG_BYPASS
-
-#if defined(FULL_VALID_PDIR) && defined(ASSERT_PDIR_SANITY)
-#error FULL_VALID_PDIR and ASSERT_PDIR_SANITY are mutually exclusive
-#endif
-
-#define SBA_INLINE	__inline__
-/* #define SBA_INLINE */
-
-#ifdef DEBUG_SBA_INIT
-#define DBG_INIT(x...)	printk(x)
-#else
-#define DBG_INIT(x...)
-#endif
-
-#ifdef DEBUG_SBA_RUN
-#define DBG_RUN(x...)	printk(x)
-#else
-#define DBG_RUN(x...)
-#endif
-
-#ifdef DEBUG_SBA_RUN_SG
-#define DBG_RUN_SG(x...)	printk(x)
-#else
-#define DBG_RUN_SG(x...)
-#endif
-
-
-#ifdef DEBUG_SBA_RESOURCE
-#define DBG_RES(x...)	printk(x)
-#else
-#define DBG_RES(x...)
-#endif
-
-#ifdef DEBUG_BYPASS
-#define DBG_BYPASS(x...)	printk(x)
-#else
-#define DBG_BYPASS(x...)
-#endif
-
-#ifdef ASSERT_PDIR_SANITY
-#define ASSERT(expr) \
-        if(!(expr)) { \
-                printk( "\n" __FILE__ ":%d: Assertion " #expr " failed!\n",__LINE__); \
-                panic(#expr); \
-        }
-#else
-#define ASSERT(expr)
-#endif
-
-/*
-** The number of pdir entries to "free" before issuing
-** a read to PCOM register to flush out PCOM writes.
-** Interacts with allocation granularity (ie 4 or 8 entries
-** allocated and free'd/purged at a time might make this
-** less interesting).
-*/
-#define DELAYED_RESOURCE_CNT	64
-
-#define PCI_DEVICE_ID_HP_SX2000_IOC	0x12ec
-
-#define ZX1_IOC_ID	((PCI_DEVICE_ID_HP_ZX1_IOC << 16) | PCI_VENDOR_ID_HP)
-#define ZX2_IOC_ID	((PCI_DEVICE_ID_HP_ZX2_IOC << 16) | PCI_VENDOR_ID_HP)
-#define REO_IOC_ID	((PCI_DEVICE_ID_HP_REO_IOC << 16) | PCI_VENDOR_ID_HP)
-#define SX1000_IOC_ID	((PCI_DEVICE_ID_HP_SX1000_IOC << 16) | PCI_VENDOR_ID_HP)
-#define SX2000_IOC_ID	((PCI_DEVICE_ID_HP_SX2000_IOC << 16) | PCI_VENDOR_ID_HP)
-
-#define ZX1_IOC_OFFSET	0x1000	/* ACPI reports SBA, we want IOC */
-
-#define IOC_FUNC_ID	0x000
-#define IOC_FCLASS	0x008	/* function class, bist, header, rev... */
-#define IOC_IBASE	0x300	/* IO TLB */
-#define IOC_IMASK	0x308
-#define IOC_PCOM	0x310
-#define IOC_TCNFG	0x318
-#define IOC_PDIR_BASE	0x320
-
-#define IOC_ROPE0_CFG	0x500
-#define   IOC_ROPE_AO	  0x10	/* Allow "Relaxed Ordering" */
-
-
-/* AGP GART driver looks for this */
-#define ZX1_SBA_IOMMU_COOKIE	0x0000badbadc0ffeeUL
-
-/*
-** The zx1 IOC supports 4/8/16/64KB page sizes (see TCNFG register)
-**
-** Some IOCs (sx1000) can run at the above pages sizes, but are
-** really only supported using the IOC at a 4k page size.
-**
-** iovp_size could only be greater than PAGE_SIZE if we are
-** confident the drivers really only touch the next physical
-** page iff that driver instance owns it.
-*/
-static unsigned long iovp_size;
-static unsigned long iovp_shift;
-static unsigned long iovp_mask;
-
-struct ioc {
-	void __iomem	*ioc_hpa;	/* I/O MMU base address */
-	char		*res_map;	/* resource map, bit == pdir entry */
-	u64		*pdir_base;	/* physical base address */
-	unsigned long	ibase;		/* pdir IOV Space base */
-	unsigned long	imask;		/* pdir IOV Space mask */
-
-	unsigned long	*res_hint;	/* next avail IOVP - circular search */
-	unsigned long	dma_mask;
-	spinlock_t	res_lock;	/* protects the resource bitmap, but must be held when */
-					/* clearing pdir to prevent races with allocations. */
-	unsigned int	res_bitshift;	/* from the RIGHT! */
-	unsigned int	res_size;	/* size of resource map in bytes */
-#ifdef CONFIG_NUMA
-	unsigned int	node;		/* node where this IOC lives */
-#endif
-#if DELAYED_RESOURCE_CNT > 0
-	spinlock_t	saved_lock;	/* may want to try to get this on a separate cacheline */
-					/* than res_lock for bigger systems. */
-	int		saved_cnt;
-	struct sba_dma_pair {
-		dma_addr_t	iova;
-		size_t		size;
-	} saved[DELAYED_RESOURCE_CNT];
-#endif
-
-#ifdef PDIR_SEARCH_TIMING
-#define SBA_SEARCH_SAMPLE	0x100
-	unsigned long avg_search[SBA_SEARCH_SAMPLE];
-	unsigned long avg_idx;	/* current index into avg_search */
-#endif
-
-	/* Stuff we don't need in performance path */
-	struct ioc	*next;		/* list of IOC's in system */
-	acpi_handle	handle;		/* for multiple IOC's */
-	const char 	*name;
-	unsigned int	func_id;
-	unsigned int	rev;		/* HW revision of chip */
-	u32		iov_size;
-	unsigned int	pdir_size;	/* in bytes, determined by IOV Space size */
-	struct pci_dev	*sac_only_dev;
-};
-
-static struct ioc *ioc_list, *ioc_found;
-static int reserve_sba_gart = 1;
-
-static SBA_INLINE void sba_mark_invalid(struct ioc *, dma_addr_t, size_t);
-static SBA_INLINE void sba_free_range(struct ioc *, dma_addr_t, size_t);
-
-#define sba_sg_address(sg)	sg_virt((sg))
-
-#ifdef FULL_VALID_PDIR
-static u64 prefetch_spill_page;
-#endif
-
-#define GET_IOC(dev)	((dev_is_pci(dev))						\
-			 ? ((struct ioc *) PCI_CONTROLLER(to_pci_dev(dev))->iommu) : NULL)
-
-/*
-** DMA_CHUNK_SIZE is used by the SCSI mid-layer to break up
-** (or rather not merge) DMAs into manageable chunks.
-** On parisc, this is more of the software/tuning constraint
-** rather than the HW. I/O MMU allocation algorithms can be
-** faster with smaller sizes (to some degree).
-*/
-#define DMA_CHUNK_SIZE  (BITS_PER_LONG*iovp_size)
-
-#define ROUNDUP(x,y) ((x + ((y)-1)) & ~((y)-1))
-
-/************************************
-** SBA register read and write support
-**
-** BE WARNED: register writes are posted.
-**  (ie follow writes which must reach HW with a read)
-**
-*/
-#define READ_REG(addr)       __raw_readq(addr)
-#define WRITE_REG(val, addr) __raw_writeq(val, addr)
-
-#ifdef DEBUG_SBA_INIT
-
-/**
- * sba_dump_tlb - debugging only - print IOMMU operating parameters
- * @hpa: base address of the IOMMU
- *
- * Print the size/location of the IO MMU PDIR.
- */
-static void
-sba_dump_tlb(char *hpa)
-{
-	DBG_INIT("IO TLB at 0x%p\n", (void *)hpa);
-	DBG_INIT("IOC_IBASE    : %016lx\n", READ_REG(hpa+IOC_IBASE));
-	DBG_INIT("IOC_IMASK    : %016lx\n", READ_REG(hpa+IOC_IMASK));
-	DBG_INIT("IOC_TCNFG    : %016lx\n", READ_REG(hpa+IOC_TCNFG));
-	DBG_INIT("IOC_PDIR_BASE: %016lx\n", READ_REG(hpa+IOC_PDIR_BASE));
-	DBG_INIT("\n");
-}
-#endif
-
-
-#ifdef ASSERT_PDIR_SANITY
-
-/**
- * sba_dump_pdir_entry - debugging only - print one IOMMU PDIR entry
- * @ioc: IO MMU structure which owns the pdir we are interested in.
- * @msg: text to print ont the output line.
- * @pide: pdir index.
- *
- * Print one entry of the IO MMU PDIR in human readable form.
- */
-static void
-sba_dump_pdir_entry(struct ioc *ioc, char *msg, uint pide)
-{
-	/* start printing from lowest pde in rval */
-	u64 *ptr = &ioc->pdir_base[pide  & ~(BITS_PER_LONG - 1)];
-	unsigned long *rptr = (unsigned long *) &ioc->res_map[(pide >>3) & -sizeof(unsigned long)];
-	uint rcnt;
-
-	printk(KERN_DEBUG "SBA: %s rp %p bit %d rval 0x%lx\n",
-		 msg, rptr, pide & (BITS_PER_LONG - 1), *rptr);
-
-	rcnt = 0;
-	while (rcnt < BITS_PER_LONG) {
-		printk(KERN_DEBUG "%s %2d %p %016Lx\n",
-		       (rcnt == (pide & (BITS_PER_LONG - 1)))
-		       ? "    -->" : "       ",
-		       rcnt, ptr, (unsigned long long) *ptr );
-		rcnt++;
-		ptr++;
-	}
-	printk(KERN_DEBUG "%s", msg);
-}
-
-
-/**
- * sba_check_pdir - debugging only - consistency checker
- * @ioc: IO MMU structure which owns the pdir we are interested in.
- * @msg: text to print ont the output line.
- *
- * Verify the resource map and pdir state is consistent
- */
-static int
-sba_check_pdir(struct ioc *ioc, char *msg)
-{
-	u64 *rptr_end = (u64 *) &(ioc->res_map[ioc->res_size]);
-	u64 *rptr = (u64 *) ioc->res_map;	/* resource map ptr */
-	u64 *pptr = ioc->pdir_base;	/* pdir ptr */
-	uint pide = 0;
-
-	while (rptr < rptr_end) {
-		u64 rval;
-		int rcnt; /* number of bits we might check */
-
-		rval = *rptr;
-		rcnt = 64;
-
-		while (rcnt) {
-			/* Get last byte and highest bit from that */
-			u32 pde = ((u32)((*pptr >> (63)) & 0x1));
-			if ((rval & 0x1) ^ pde)
-			{
-				/*
-				** BUMMER!  -- res_map != pdir --
-				** Dump rval and matching pdir entries
-				*/
-				sba_dump_pdir_entry(ioc, msg, pide);
-				return(1);
-			}
-			rcnt--;
-			rval >>= 1;	/* try the next bit */
-			pptr++;
-			pide++;
-		}
-		rptr++;	/* look at next word of res_map */
-	}
-	/* It'd be nice if we always got here :^) */
-	return 0;
-}
-
-
-/**
- * sba_dump_sg - debugging only - print Scatter-Gather list
- * @ioc: IO MMU structure which owns the pdir we are interested in.
- * @startsg: head of the SG list
- * @nents: number of entries in SG list
- *
- * print the SG list so we can verify it's correct by hand.
- */
-static void
-sba_dump_sg( struct ioc *ioc, struct scatterlist *startsg, int nents)
-{
-	while (nents-- > 0) {
-		printk(KERN_DEBUG " %d : DMA %08lx/%05x CPU %p\n", nents,
-		       startsg->dma_address, startsg->dma_length,
-		       sba_sg_address(startsg));
-		startsg = sg_next(startsg);
-	}
-}
-
-static void
-sba_check_sg( struct ioc *ioc, struct scatterlist *startsg, int nents)
-{
-	struct scatterlist *the_sg = startsg;
-	int the_nents = nents;
-
-	while (the_nents-- > 0) {
-		if (sba_sg_address(the_sg) == 0x0UL)
-			sba_dump_sg(NULL, startsg, nents);
-		the_sg = sg_next(the_sg);
-	}
-}
-
-#endif /* ASSERT_PDIR_SANITY */
-
-
-
-
-/**************************************************************
-*
-*   I/O Pdir Resource Management
-*
-*   Bits set in the resource map are in use.
-*   Each bit can represent a number of pages.
-*   LSbs represent lower addresses (IOVA's).
-*
-***************************************************************/
-#define PAGES_PER_RANGE 1	/* could increase this to 4 or 8 if needed */
-
-/* Convert from IOVP to IOVA and vice versa. */
-#define SBA_IOVA(ioc,iovp,offset) ((ioc->ibase) | (iovp) | (offset))
-#define SBA_IOVP(ioc,iova) ((iova) & ~(ioc->ibase))
-
-#define PDIR_ENTRY_SIZE	sizeof(u64)
-
-#define PDIR_INDEX(iovp)   ((iovp)>>iovp_shift)
-
-#define RESMAP_MASK(n)    ~(~0UL << (n))
-#define RESMAP_IDX_MASK   (sizeof(unsigned long) - 1)
-
-
-/**
- * For most cases the normal get_order is sufficient, however it limits us
- * to PAGE_SIZE being the minimum mapping alignment and TC flush granularity.
- * It only incurs about 1 clock cycle to use this one with the static variable
- * and makes the code more intuitive.
- */
-static SBA_INLINE int
-get_iovp_order (unsigned long size)
-{
-	long double d = size - 1;
-	long order;
-
-	order = ia64_getf_exp(d);
-	order = order - iovp_shift - 0xffff + 1;
-	if (order < 0)
-		order = 0;
-	return order;
-}
-
-static unsigned long ptr_to_pide(struct ioc *ioc, unsigned long *res_ptr,
-				 unsigned int bitshiftcnt)
-{
-	return (((unsigned long)res_ptr - (unsigned long)ioc->res_map) << 3)
-		+ bitshiftcnt;
-}
-
-/**
- * sba_search_bitmap - find free space in IO PDIR resource bitmap
- * @ioc: IO MMU structure which owns the pdir we are interested in.
- * @bits_wanted: number of entries we need.
- * @use_hint: use res_hint to indicate where to start looking
- *
- * Find consecutive free bits in resource bitmap.
- * Each bit represents one entry in the IO Pdir.
- * Cool perf optimization: search for log2(size) bits at a time.
- */
-static SBA_INLINE unsigned long
-sba_search_bitmap(struct ioc *ioc, struct device *dev,
-		  unsigned long bits_wanted, int use_hint)
-{
-	unsigned long *res_ptr;
-	unsigned long *res_end = (unsigned long *) &(ioc->res_map[ioc->res_size]);
-	unsigned long flags, pide = ~0UL, tpide;
-	unsigned long boundary_size;
-	unsigned long shift;
-	int ret;
-
-	ASSERT(((unsigned long) ioc->res_hint & (sizeof(unsigned long) - 1UL)) == 0);
-	ASSERT(res_ptr < res_end);
-
-	boundary_size = dma_get_seg_boundary_nr_pages(dev, iovp_shift);
-
-	BUG_ON(ioc->ibase & ~iovp_mask);
-	shift = ioc->ibase >> iovp_shift;
-
-	spin_lock_irqsave(&ioc->res_lock, flags);
-
-	/* Allow caller to force a search through the entire resource space */
-	if (likely(use_hint)) {
-		res_ptr = ioc->res_hint;
-	} else {
-		res_ptr = (ulong *)ioc->res_map;
-		ioc->res_bitshift = 0;
-	}
-
-	/*
-	 * N.B.  REO/Grande defect AR2305 can cause TLB fetch timeouts
-	 * if a TLB entry is purged while in use.  sba_mark_invalid()
-	 * purges IOTLB entries in power-of-two sizes, so we also
-	 * allocate IOVA space in power-of-two sizes.
-	 */
-	bits_wanted = 1UL << get_iovp_order(bits_wanted << iovp_shift);
-
-	if (likely(bits_wanted == 1)) {
-		unsigned int bitshiftcnt;
-		for(; res_ptr < res_end ; res_ptr++) {
-			if (likely(*res_ptr != ~0UL)) {
-				bitshiftcnt = ffz(*res_ptr);
-				*res_ptr |= (1UL << bitshiftcnt);
-				pide = ptr_to_pide(ioc, res_ptr, bitshiftcnt);
-				ioc->res_bitshift = bitshiftcnt + bits_wanted;
-				goto found_it;
-			}
-		}
-		goto not_found;
-
-	}
-	
-	if (likely(bits_wanted <= BITS_PER_LONG/2)) {
-		/*
-		** Search the resource bit map on well-aligned values.
-		** "o" is the alignment.
-		** We need the alignment to invalidate I/O TLB using
-		** SBA HW features in the unmap path.
-		*/
-		unsigned long o = 1 << get_iovp_order(bits_wanted << iovp_shift);
-		uint bitshiftcnt = ROUNDUP(ioc->res_bitshift, o);
-		unsigned long mask, base_mask;
-
-		base_mask = RESMAP_MASK(bits_wanted);
-		mask = base_mask << bitshiftcnt;
-
-		DBG_RES("%s() o %ld %p", __func__, o, res_ptr);
-		for(; res_ptr < res_end ; res_ptr++)
-		{ 
-			DBG_RES("    %p %lx %lx\n", res_ptr, mask, *res_ptr);
-			ASSERT(0 != mask);
-			for (; mask ; mask <<= o, bitshiftcnt += o) {
-				tpide = ptr_to_pide(ioc, res_ptr, bitshiftcnt);
-				ret = iommu_is_span_boundary(tpide, bits_wanted,
-							     shift,
-							     boundary_size);
-				if ((0 == ((*res_ptr) & mask)) && !ret) {
-					*res_ptr |= mask;     /* mark resources busy! */
-					pide = tpide;
-					ioc->res_bitshift = bitshiftcnt + bits_wanted;
-					goto found_it;
-				}
-			}
-
-			bitshiftcnt = 0;
-			mask = base_mask;
-
-		}
-
-	} else {
-		int qwords, bits, i;
-		unsigned long *end;
-
-		qwords = bits_wanted >> 6; /* /64 */
-		bits = bits_wanted - (qwords * BITS_PER_LONG);
-
-		end = res_end - qwords;
-
-		for (; res_ptr < end; res_ptr++) {
-			tpide = ptr_to_pide(ioc, res_ptr, 0);
-			ret = iommu_is_span_boundary(tpide, bits_wanted,
-						     shift, boundary_size);
-			if (ret)
-				goto next_ptr;
-			for (i = 0 ; i < qwords ; i++) {
-				if (res_ptr[i] != 0)
-					goto next_ptr;
-			}
-			if (bits && res_ptr[i] && (__ffs(res_ptr[i]) < bits))
-				continue;
-
-			/* Found it, mark it */
-			for (i = 0 ; i < qwords ; i++)
-				res_ptr[i] = ~0UL;
-			res_ptr[i] |= RESMAP_MASK(bits);
-
-			pide = tpide;
-			res_ptr += qwords;
-			ioc->res_bitshift = bits;
-			goto found_it;
-next_ptr:
-			;
-		}
-	}
-
-not_found:
-	prefetch(ioc->res_map);
-	ioc->res_hint = (unsigned long *) ioc->res_map;
-	ioc->res_bitshift = 0;
-	spin_unlock_irqrestore(&ioc->res_lock, flags);
-	return (pide);
-
-found_it:
-	ioc->res_hint = res_ptr;
-	spin_unlock_irqrestore(&ioc->res_lock, flags);
-	return (pide);
-}
-
-
-/**
- * sba_alloc_range - find free bits and mark them in IO PDIR resource bitmap
- * @ioc: IO MMU structure which owns the pdir we are interested in.
- * @size: number of bytes to create a mapping for
- *
- * Given a size, find consecutive unmarked and then mark those bits in the
- * resource bit map.
- */
-static int
-sba_alloc_range(struct ioc *ioc, struct device *dev, size_t size)
-{
-	unsigned int pages_needed = size >> iovp_shift;
-#ifdef PDIR_SEARCH_TIMING
-	unsigned long itc_start;
-#endif
-	unsigned long pide;
-
-	ASSERT(pages_needed);
-	ASSERT(0 == (size & ~iovp_mask));
-
-#ifdef PDIR_SEARCH_TIMING
-	itc_start = ia64_get_itc();
-#endif
-	/*
-	** "seek and ye shall find"...praying never hurts either...
-	*/
-	pide = sba_search_bitmap(ioc, dev, pages_needed, 1);
-	if (unlikely(pide >= (ioc->res_size << 3))) {
-		pide = sba_search_bitmap(ioc, dev, pages_needed, 0);
-		if (unlikely(pide >= (ioc->res_size << 3))) {
-#if DELAYED_RESOURCE_CNT > 0
-			unsigned long flags;
-
-			/*
-			** With delayed resource freeing, we can give this one more shot.  We're
-			** getting close to being in trouble here, so do what we can to make this
-			** one count.
-			*/
-			spin_lock_irqsave(&ioc->saved_lock, flags);
-			if (ioc->saved_cnt > 0) {
-				struct sba_dma_pair *d;
-				int cnt = ioc->saved_cnt;
-
-				d = &(ioc->saved[ioc->saved_cnt - 1]);
-
-				spin_lock(&ioc->res_lock);
-				while (cnt--) {
-					sba_mark_invalid(ioc, d->iova, d->size);
-					sba_free_range(ioc, d->iova, d->size);
-					d--;
-				}
-				ioc->saved_cnt = 0;
-				READ_REG(ioc->ioc_hpa+IOC_PCOM);	/* flush purges */
-				spin_unlock(&ioc->res_lock);
-			}
-			spin_unlock_irqrestore(&ioc->saved_lock, flags);
-
-			pide = sba_search_bitmap(ioc, dev, pages_needed, 0);
-			if (unlikely(pide >= (ioc->res_size << 3))) {
-				printk(KERN_WARNING "%s: I/O MMU @ %p is"
-				       "out of mapping resources, %u %u %lx\n",
-				       __func__, ioc->ioc_hpa, ioc->res_size,
-				       pages_needed, dma_get_seg_boundary(dev));
-				return -1;
-			}
-#else
-			printk(KERN_WARNING "%s: I/O MMU @ %p is"
-			       "out of mapping resources, %u %u %lx\n",
-			       __func__, ioc->ioc_hpa, ioc->res_size,
-			       pages_needed, dma_get_seg_boundary(dev));
-			return -1;
-#endif
-		}
-	}
-
-#ifdef PDIR_SEARCH_TIMING
-	ioc->avg_search[ioc->avg_idx++] = (ia64_get_itc() - itc_start) / pages_needed;
-	ioc->avg_idx &= SBA_SEARCH_SAMPLE - 1;
-#endif
-
-	prefetchw(&(ioc->pdir_base[pide]));
-
-#ifdef ASSERT_PDIR_SANITY
-	/* verify the first enable bit is clear */
-	if(0x00 != ((u8 *) ioc->pdir_base)[pide*PDIR_ENTRY_SIZE + 7]) {
-		sba_dump_pdir_entry(ioc, "sba_search_bitmap() botched it?", pide);
-	}
-#endif
-
-	DBG_RES("%s(%x) %d -> %lx hint %x/%x\n",
-		__func__, size, pages_needed, pide,
-		(uint) ((unsigned long) ioc->res_hint - (unsigned long) ioc->res_map),
-		ioc->res_bitshift );
-
-	return (pide);
-}
-
-
-/**
- * sba_free_range - unmark bits in IO PDIR resource bitmap
- * @ioc: IO MMU structure which owns the pdir we are interested in.
- * @iova: IO virtual address which was previously allocated.
- * @size: number of bytes to create a mapping for
- *
- * clear bits in the ioc's resource map
- */
-static SBA_INLINE void
-sba_free_range(struct ioc *ioc, dma_addr_t iova, size_t size)
-{
-	unsigned long iovp = SBA_IOVP(ioc, iova);
-	unsigned int pide = PDIR_INDEX(iovp);
-	unsigned int ridx = pide >> 3;	/* convert bit to byte address */
-	unsigned long *res_ptr = (unsigned long *) &((ioc)->res_map[ridx & ~RESMAP_IDX_MASK]);
-	int bits_not_wanted = size >> iovp_shift;
-	unsigned long m;
-
-	/* Round up to power-of-two size: see AR2305 note above */
-	bits_not_wanted = 1UL << get_iovp_order(bits_not_wanted << iovp_shift);
-	for (; bits_not_wanted > 0 ; res_ptr++) {
-		
-		if (unlikely(bits_not_wanted > BITS_PER_LONG)) {
-
-			/* these mappings start 64bit aligned */
-			*res_ptr = 0UL;
-			bits_not_wanted -= BITS_PER_LONG;
-			pide += BITS_PER_LONG;
-
-		} else {
-
-			/* 3-bits "bit" address plus 2 (or 3) bits for "byte" == bit in word */
-			m = RESMAP_MASK(bits_not_wanted) << (pide & (BITS_PER_LONG - 1));
-			bits_not_wanted = 0;
-
-			DBG_RES("%s( ,%x,%x) %x/%lx %x %p %lx\n", __func__, (uint) iova, size,
-			        bits_not_wanted, m, pide, res_ptr, *res_ptr);
-
-			ASSERT(m != 0);
-			ASSERT(bits_not_wanted);
-			ASSERT((*res_ptr & m) == m); /* verify same bits are set */
-			*res_ptr &= ~m;
-		}
-	}
-}
-
-
-/**************************************************************
-*
-*   "Dynamic DMA Mapping" support (aka "Coherent I/O")
-*
-***************************************************************/
-
-/**
- * sba_io_pdir_entry - fill in one IO PDIR entry
- * @pdir_ptr:  pointer to IO PDIR entry
- * @vba: Virtual CPU address of buffer to map
- *
- * SBA Mapping Routine
- *
- * Given a virtual address (vba, arg1) sba_io_pdir_entry()
- * loads the I/O PDIR entry pointed to by pdir_ptr (arg0).
- * Each IO Pdir entry consists of 8 bytes as shown below
- * (LSB == bit 0):
- *
- *  63                    40                                 11    7        0
- * +-+---------------------+----------------------------------+----+--------+
- * |V|        U            |            PPN[39:12]            | U  |   FF   |
- * +-+---------------------+----------------------------------+----+--------+
- *
- *  V  == Valid Bit
- *  U  == Unused
- * PPN == Physical Page Number
- *
- * The physical address fields are filled with the results of virt_to_phys()
- * on the vba.
- */
-
-#if 1
-#define sba_io_pdir_entry(pdir_ptr, vba) *pdir_ptr = ((vba & ~0xE000000000000FFFULL)	\
-						      | 0x8000000000000000ULL)
-#else
-void SBA_INLINE
-sba_io_pdir_entry(u64 *pdir_ptr, unsigned long vba)
-{
-	*pdir_ptr = ((vba & ~0xE000000000000FFFULL) | 0x80000000000000FFULL);
-}
-#endif
-
-#ifdef ENABLE_MARK_CLEAN
-/*
- * Since DMA is i-cache coherent, any (complete) pages that were written via
- * DMA can be marked as "clean" so that lazy_mmu_prot_update() doesn't have to
- * flush them when they get mapped into an executable vm-area.
- */
-static void mark_clean(void *addr, size_t size)
-{
-	struct folio *folio = virt_to_folio(addr);
-	ssize_t left = size;
-	size_t offset = offset_in_folio(folio, addr);
-
-	if (offset) {
-		left -= folio_size(folio) - offset;
-		if (left <= 0)
-			return;
-		folio = folio_next(folio);
-	}
-
-	while (left >= folio_size(folio)) {
-		left -= folio_size(folio);
-		set_bit(PG_arch_1, &folio->flags);
-		if (!left)
-			break;
-		folio = folio_next(folio);
-	}
-}
-#endif
-
-/**
- * sba_mark_invalid - invalidate one or more IO PDIR entries
- * @ioc: IO MMU structure which owns the pdir we are interested in.
- * @iova:  IO Virtual Address mapped earlier
- * @byte_cnt:  number of bytes this mapping covers.
- *
- * Marking the IO PDIR entry(ies) as Invalid and invalidate
- * corresponding IO TLB entry. The PCOM (Purge Command Register)
- * is to purge stale entries in the IO TLB when unmapping entries.
- *
- * The PCOM register supports purging of multiple pages, with a minium
- * of 1 page and a maximum of 2GB. Hardware requires the address be
- * aligned to the size of the range being purged. The size of the range
- * must be a power of 2. The "Cool perf optimization" in the
- * allocation routine helps keep that true.
- */
-static SBA_INLINE void
-sba_mark_invalid(struct ioc *ioc, dma_addr_t iova, size_t byte_cnt)
-{
-	u32 iovp = (u32) SBA_IOVP(ioc,iova);
-
-	int off = PDIR_INDEX(iovp);
-
-	/* Must be non-zero and rounded up */
-	ASSERT(byte_cnt > 0);
-	ASSERT(0 == (byte_cnt & ~iovp_mask));
-
-#ifdef ASSERT_PDIR_SANITY
-	/* Assert first pdir entry is set */
-	if (!(ioc->pdir_base[off] >> 60)) {
-		sba_dump_pdir_entry(ioc,"sba_mark_invalid()", PDIR_INDEX(iovp));
-	}
-#endif
-
-	if (byte_cnt <= iovp_size)
-	{
-		ASSERT(off < ioc->pdir_size);
-
-		iovp |= iovp_shift;     /* set "size" field for PCOM */
-
-#ifndef FULL_VALID_PDIR
-		/*
-		** clear I/O PDIR entry "valid" bit
-		** Do NOT clear the rest - save it for debugging.
-		** We should only clear bits that have previously
-		** been enabled.
-		*/
-		ioc->pdir_base[off] &= ~(0x80000000000000FFULL);
-#else
-		/*
-  		** If we want to maintain the PDIR as valid, put in
-		** the spill page so devices prefetching won't
-		** cause a hard fail.
-		*/
-		ioc->pdir_base[off] = (0x80000000000000FFULL | prefetch_spill_page);
-#endif
-	} else {
-		u32 t = get_iovp_order(byte_cnt) + iovp_shift;
-
-		iovp |= t;
-		ASSERT(t <= 31);   /* 2GB! Max value of "size" field */
-
-		do {
-			/* verify this pdir entry is enabled */
-			ASSERT(ioc->pdir_base[off]  >> 63);
-#ifndef FULL_VALID_PDIR
-			/* clear I/O Pdir entry "valid" bit first */
-			ioc->pdir_base[off] &= ~(0x80000000000000FFULL);
-#else
-			ioc->pdir_base[off] = (0x80000000000000FFULL | prefetch_spill_page);
-#endif
-			off++;
-			byte_cnt -= iovp_size;
-		} while (byte_cnt > 0);
-	}
-
-	WRITE_REG(iovp | ioc->ibase, ioc->ioc_hpa+IOC_PCOM);
-}
-
-/**
- * sba_map_page - map one buffer and return IOVA for DMA
- * @dev: instance of PCI owned by the driver that's asking.
- * @page: page to map
- * @poff: offset into page
- * @size: number of bytes to map
- * @dir: dma direction
- * @attrs: optional dma attributes
- *
- * See Documentation/core-api/dma-api-howto.rst
- */
-static dma_addr_t sba_map_page(struct device *dev, struct page *page,
-			       unsigned long poff, size_t size,
-			       enum dma_data_direction dir,
-			       unsigned long attrs)
-{
-	struct ioc *ioc;
-	void *addr = page_address(page) + poff;
-	dma_addr_t iovp;
-	dma_addr_t offset;
-	u64 *pdir_start;
-	int pide;
-#ifdef ASSERT_PDIR_SANITY
-	unsigned long flags;
-#endif
-#ifdef ALLOW_IOV_BYPASS
-	unsigned long pci_addr = virt_to_phys(addr);
-#endif
-
-#ifdef ALLOW_IOV_BYPASS
-	ASSERT(to_pci_dev(dev)->dma_mask);
-	/*
- 	** Check if the PCI device can DMA to ptr... if so, just return ptr
- 	*/
-	if (likely((pci_addr & ~to_pci_dev(dev)->dma_mask) == 0)) {
-		/*
- 		** Device is bit capable of DMA'ing to the buffer...
-		** just return the PCI address of ptr
- 		*/
-		DBG_BYPASS("sba_map_page() bypass mask/addr: "
-			   "0x%lx/0x%lx\n",
-		           to_pci_dev(dev)->dma_mask, pci_addr);
-		return pci_addr;
-	}
-#endif
-	ioc = GET_IOC(dev);
-	ASSERT(ioc);
-
-	prefetch(ioc->res_hint);
-
-	ASSERT(size > 0);
-	ASSERT(size <= DMA_CHUNK_SIZE);
-
-	/* save offset bits */
-	offset = ((dma_addr_t) (long) addr) & ~iovp_mask;
-
-	/* round up to nearest iovp_size */
-	size = (size + offset + ~iovp_mask) & iovp_mask;
-
-#ifdef ASSERT_PDIR_SANITY
-	spin_lock_irqsave(&ioc->res_lock, flags);
-	if (sba_check_pdir(ioc,"Check before sba_map_page()"))
-		panic("Sanity check failed");
-	spin_unlock_irqrestore(&ioc->res_lock, flags);
-#endif
-
-	pide = sba_alloc_range(ioc, dev, size);
-	if (pide < 0)
-		return DMA_MAPPING_ERROR;
-
-	iovp = (dma_addr_t) pide << iovp_shift;
-
-	DBG_RUN("%s() 0x%p -> 0x%lx\n", __func__, addr, (long) iovp | offset);
-
-	pdir_start = &(ioc->pdir_base[pide]);
-
-	while (size > 0) {
-		ASSERT(((u8 *)pdir_start)[7] == 0); /* verify availability */
-		sba_io_pdir_entry(pdir_start, (unsigned long) addr);
-
-		DBG_RUN("     pdir 0x%p %lx\n", pdir_start, *pdir_start);
-
-		addr += iovp_size;
-		size -= iovp_size;
-		pdir_start++;
-	}
-	/* force pdir update */
-	wmb();
-
-	/* form complete address */
-#ifdef ASSERT_PDIR_SANITY
-	spin_lock_irqsave(&ioc->res_lock, flags);
-	sba_check_pdir(ioc,"Check after sba_map_page()");
-	spin_unlock_irqrestore(&ioc->res_lock, flags);
-#endif
-	return SBA_IOVA(ioc, iovp, offset);
-}
-
-#ifdef ENABLE_MARK_CLEAN
-static SBA_INLINE void
-sba_mark_clean(struct ioc *ioc, dma_addr_t iova, size_t size)
-{
-	u32	iovp = (u32) SBA_IOVP(ioc,iova);
-	int	off = PDIR_INDEX(iovp);
-	void	*addr;
-
-	if (size <= iovp_size) {
-		addr = phys_to_virt(ioc->pdir_base[off] &
-		                    ~0xE000000000000FFFULL);
-		mark_clean(addr, size);
-	} else {
-		do {
-			addr = phys_to_virt(ioc->pdir_base[off] &
-			                    ~0xE000000000000FFFULL);
-			mark_clean(addr, min(size, iovp_size));
-			off++;
-			size -= iovp_size;
-		} while (size > 0);
-	}
-}
-#endif
-
-/**
- * sba_unmap_page - unmap one IOVA and free resources
- * @dev: instance of PCI owned by the driver that's asking.
- * @iova:  IOVA of driver buffer previously mapped.
- * @size:  number of bytes mapped in driver buffer.
- * @dir:  R/W or both.
- * @attrs: optional dma attributes
- *
- * See Documentation/core-api/dma-api-howto.rst
- */
-static void sba_unmap_page(struct device *dev, dma_addr_t iova, size_t size,
-			   enum dma_data_direction dir, unsigned long attrs)
-{
-	struct ioc *ioc;
-#if DELAYED_RESOURCE_CNT > 0
-	struct sba_dma_pair *d;
-#endif
-	unsigned long flags;
-	dma_addr_t offset;
-
-	ioc = GET_IOC(dev);
-	ASSERT(ioc);
-
-#ifdef ALLOW_IOV_BYPASS
-	if (likely((iova & ioc->imask) != ioc->ibase)) {
-		/*
-		** Address does not fall w/in IOVA, must be bypassing
-		*/
-		DBG_BYPASS("sba_unmap_page() bypass addr: 0x%lx\n",
-			   iova);
-
-#ifdef ENABLE_MARK_CLEAN
-		if (dir == DMA_FROM_DEVICE) {
-			mark_clean(phys_to_virt(iova), size);
-		}
-#endif
-		return;
-	}
-#endif
-	offset = iova & ~iovp_mask;
-
-	DBG_RUN("%s() iovp 0x%lx/%x\n", __func__, (long) iova, size);
-
-	iova ^= offset;        /* clear offset bits */
-	size += offset;
-	size = ROUNDUP(size, iovp_size);
-
-#ifdef ENABLE_MARK_CLEAN
-	if (dir == DMA_FROM_DEVICE)
-		sba_mark_clean(ioc, iova, size);
-#endif
-
-#if DELAYED_RESOURCE_CNT > 0
-	spin_lock_irqsave(&ioc->saved_lock, flags);
-	d = &(ioc->saved[ioc->saved_cnt]);
-	d->iova = iova;
-	d->size = size;
-	if (unlikely(++(ioc->saved_cnt) >= DELAYED_RESOURCE_CNT)) {
-		int cnt = ioc->saved_cnt;
-		spin_lock(&ioc->res_lock);
-		while (cnt--) {
-			sba_mark_invalid(ioc, d->iova, d->size);
-			sba_free_range(ioc, d->iova, d->size);
-			d--;
-		}
-		ioc->saved_cnt = 0;
-		READ_REG(ioc->ioc_hpa+IOC_PCOM);	/* flush purges */
-		spin_unlock(&ioc->res_lock);
-	}
-	spin_unlock_irqrestore(&ioc->saved_lock, flags);
-#else /* DELAYED_RESOURCE_CNT == 0 */
-	spin_lock_irqsave(&ioc->res_lock, flags);
-	sba_mark_invalid(ioc, iova, size);
-	sba_free_range(ioc, iova, size);
-	READ_REG(ioc->ioc_hpa+IOC_PCOM);	/* flush purges */
-	spin_unlock_irqrestore(&ioc->res_lock, flags);
-#endif /* DELAYED_RESOURCE_CNT == 0 */
-}
-
-/**
- * sba_alloc_coherent - allocate/map shared mem for DMA
- * @dev: instance of PCI owned by the driver that's asking.
- * @size:  number of bytes mapped in driver buffer.
- * @dma_handle:  IOVA of new buffer.
- *
- * See Documentation/core-api/dma-api-howto.rst
- */
-static void *
-sba_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_handle,
-		   gfp_t flags, unsigned long attrs)
-{
-	struct page *page;
-	struct ioc *ioc;
-	int node = -1;
-	void *addr;
-
-	ioc = GET_IOC(dev);
-	ASSERT(ioc);
-#ifdef CONFIG_NUMA
-	node = ioc->node;
-#endif
-
-	page = alloc_pages_node(node, flags, get_order(size));
-	if (unlikely(!page))
-		return NULL;
-
-	addr = page_address(page);
-	memset(addr, 0, size);
-	*dma_handle = page_to_phys(page);
-
-#ifdef ALLOW_IOV_BYPASS
-	ASSERT(dev->coherent_dma_mask);
-	/*
- 	** Check if the PCI device can DMA to ptr... if so, just return ptr
- 	*/
-	if (likely((*dma_handle & ~dev->coherent_dma_mask) == 0)) {
-		DBG_BYPASS("sba_alloc_coherent() bypass mask/addr: 0x%lx/0x%lx\n",
-		           dev->coherent_dma_mask, *dma_handle);
-
-		return addr;
-	}
-#endif
-
-	/*
-	 * If device can't bypass or bypass is disabled, pass the 32bit fake
-	 * device to map single to get an iova mapping.
-	 */
-	*dma_handle = sba_map_page(&ioc->sac_only_dev->dev, page, 0, size,
-			DMA_BIDIRECTIONAL, 0);
-	if (dma_mapping_error(dev, *dma_handle))
-		return NULL;
-	return addr;
-}
-
-
-/**
- * sba_free_coherent - free/unmap shared mem for DMA
- * @dev: instance of PCI owned by the driver that's asking.
- * @size:  number of bytes mapped in driver buffer.
- * @vaddr:  virtual address IOVA of "consistent" buffer.
- * @dma_handler:  IO virtual address of "consistent" buffer.
- *
- * See Documentation/core-api/dma-api-howto.rst
- */
-static void sba_free_coherent(struct device *dev, size_t size, void *vaddr,
-			      dma_addr_t dma_handle, unsigned long attrs)
-{
-	sba_unmap_page(dev, dma_handle, size, 0, 0);
-	free_pages((unsigned long) vaddr, get_order(size));
-}
-
-
-/*
-** Since 0 is a valid pdir_base index value, can't use that
-** to determine if a value is valid or not. Use a flag to indicate
-** the SG list entry contains a valid pdir index.
-*/
-#define PIDE_FLAG 0x1UL
-
-#ifdef DEBUG_LARGE_SG_ENTRIES
-int dump_run_sg = 0;
-#endif
-
-
-/**
- * sba_fill_pdir - write allocated SG entries into IO PDIR
- * @ioc: IO MMU structure which owns the pdir we are interested in.
- * @startsg:  list of IOVA/size pairs
- * @nents: number of entries in startsg list
- *
- * Take preprocessed SG list and write corresponding entries
- * in the IO PDIR.
- */
-
-static SBA_INLINE int
-sba_fill_pdir(
-	struct ioc *ioc,
-	struct scatterlist *startsg,
-	int nents)
-{
-	struct scatterlist *dma_sg = startsg;	/* pointer to current DMA */
-	int n_mappings = 0;
-	u64 *pdirp = NULL;
-	unsigned long dma_offset = 0;
-
-	while (nents-- > 0) {
-		int     cnt = startsg->dma_length;
-		startsg->dma_length = 0;
-
-#ifdef DEBUG_LARGE_SG_ENTRIES
-		if (dump_run_sg)
-			printk(" %2d : %08lx/%05x %p\n",
-				nents, startsg->dma_address, cnt,
-				sba_sg_address(startsg));
-#else
-		DBG_RUN_SG(" %d : %08lx/%05x %p\n",
-				nents, startsg->dma_address, cnt,
-				sba_sg_address(startsg));
-#endif
-		/*
-		** Look for the start of a new DMA stream
-		*/
-		if (startsg->dma_address & PIDE_FLAG) {
-			u32 pide = startsg->dma_address & ~PIDE_FLAG;
-			dma_offset = (unsigned long) pide & ~iovp_mask;
-			startsg->dma_address = 0;
-			if (n_mappings)
-				dma_sg = sg_next(dma_sg);
-			dma_sg->dma_address = pide | ioc->ibase;
-			pdirp = &(ioc->pdir_base[pide >> iovp_shift]);
-			n_mappings++;
-		}
-
-		/*
-		** Look for a VCONTIG chunk
-		*/
-		if (cnt) {
-			unsigned long vaddr = (unsigned long) sba_sg_address(startsg);
-			ASSERT(pdirp);
-
-			/* Since multiple Vcontig blocks could make up
-			** one DMA stream, *add* cnt to dma_len.
-			*/
-			dma_sg->dma_length += cnt;
-			cnt += dma_offset;
-			dma_offset=0;	/* only want offset on first chunk */
-			cnt = ROUNDUP(cnt, iovp_size);
-			do {
-				sba_io_pdir_entry(pdirp, vaddr);
-				vaddr += iovp_size;
-				cnt -= iovp_size;
-				pdirp++;
-			} while (cnt > 0);
-		}
-		startsg = sg_next(startsg);
-	}
-	/* force pdir update */
-	wmb();
-
-#ifdef DEBUG_LARGE_SG_ENTRIES
-	dump_run_sg = 0;
-#endif
-	return(n_mappings);
-}
-
-
-/*
-** Two address ranges are DMA contiguous *iff* "end of prev" and
-** "start of next" are both on an IOV page boundary.
-**
-** (shift left is a quick trick to mask off upper bits)
-*/
-#define DMA_CONTIG(__X, __Y) \
-	(((((unsigned long) __X) | ((unsigned long) __Y)) << (BITS_PER_LONG - iovp_shift)) == 0UL)
-
-
-/**
- * sba_coalesce_chunks - preprocess the SG list
- * @ioc: IO MMU structure which owns the pdir we are interested in.
- * @startsg:  list of IOVA/size pairs
- * @nents: number of entries in startsg list
- *
- * First pass is to walk the SG list and determine where the breaks are
- * in the DMA stream. Allocates PDIR entries but does not fill them.
- * Returns the number of DMA chunks.
- *
- * Doing the fill separate from the coalescing/allocation keeps the
- * code simpler. Future enhancement could make one pass through
- * the sglist do both.
- */
-static SBA_INLINE int
-sba_coalesce_chunks(struct ioc *ioc, struct device *dev,
-	struct scatterlist *startsg,
-	int nents)
-{
-	struct scatterlist *vcontig_sg;    /* VCONTIG chunk head */
-	unsigned long vcontig_len;         /* len of VCONTIG chunk */
-	unsigned long vcontig_end;
-	struct scatterlist *dma_sg;        /* next DMA stream head */
-	unsigned long dma_offset, dma_len; /* start/len of DMA stream */
-	int n_mappings = 0;
-	unsigned int max_seg_size = dma_get_max_seg_size(dev);
-	int idx;
-
-	while (nents > 0) {
-		unsigned long vaddr = (unsigned long) sba_sg_address(startsg);
-
-		/*
-		** Prepare for first/next DMA stream
-		*/
-		dma_sg = vcontig_sg = startsg;
-		dma_len = vcontig_len = vcontig_end = startsg->length;
-		vcontig_end +=  vaddr;
-		dma_offset = vaddr & ~iovp_mask;
-
-		/* PARANOID: clear entries */
-		startsg->dma_address = startsg->dma_length = 0;
-
-		/*
-		** This loop terminates one iteration "early" since
-		** it's always looking one "ahead".
-		*/
-		while (--nents > 0) {
-			unsigned long vaddr;	/* tmp */
-
-			startsg = sg_next(startsg);
-
-			/* PARANOID */
-			startsg->dma_address = startsg->dma_length = 0;
-
-			/* catch brokenness in SCSI layer */
-			ASSERT(startsg->length <= DMA_CHUNK_SIZE);
-
-			/*
-			** First make sure current dma stream won't
-			** exceed DMA_CHUNK_SIZE if we coalesce the
-			** next entry.
-			*/
-			if (((dma_len + dma_offset + startsg->length + ~iovp_mask) & iovp_mask)
-			    > DMA_CHUNK_SIZE)
-				break;
-
-			if (dma_len + startsg->length > max_seg_size)
-				break;
-
-			/*
-			** Then look for virtually contiguous blocks.
-			**
-			** append the next transaction?
-			*/
-			vaddr = (unsigned long) sba_sg_address(startsg);
-			if  (vcontig_end == vaddr)
-			{
-				vcontig_len += startsg->length;
-				vcontig_end += startsg->length;
-				dma_len     += startsg->length;
-				continue;
-			}
-
-#ifdef DEBUG_LARGE_SG_ENTRIES
-			dump_run_sg = (vcontig_len > iovp_size);
-#endif
-
-			/*
-			** Not virtually contiguous.
-			** Terminate prev chunk.
-			** Start a new chunk.
-			**
-			** Once we start a new VCONTIG chunk, dma_offset
-			** can't change. And we need the offset from the first
-			** chunk - not the last one. Ergo Successive chunks
-			** must start on page boundaries and dove tail
-			** with it's predecessor.
-			*/
-			vcontig_sg->dma_length = vcontig_len;
-
-			vcontig_sg = startsg;
-			vcontig_len = startsg->length;
-
-			/*
-			** 3) do the entries end/start on page boundaries?
-			**    Don't update vcontig_end until we've checked.
-			*/
-			if (DMA_CONTIG(vcontig_end, vaddr))
-			{
-				vcontig_end = vcontig_len + vaddr;
-				dma_len += vcontig_len;
-				continue;
-			} else {
-				break;
-			}
-		}
-
-		/*
-		** End of DMA Stream
-		** Terminate last VCONTIG block.
-		** Allocate space for DMA stream.
-		*/
-		vcontig_sg->dma_length = vcontig_len;
-		dma_len = (dma_len + dma_offset + ~iovp_mask) & iovp_mask;
-		ASSERT(dma_len <= DMA_CHUNK_SIZE);
-		idx = sba_alloc_range(ioc, dev, dma_len);
-		if (idx < 0) {
-			dma_sg->dma_length = 0;
-			return -1;
-		}
-		dma_sg->dma_address = (dma_addr_t)(PIDE_FLAG | (idx << iovp_shift)
-						   | dma_offset);
-		n_mappings++;
-	}
-
-	return n_mappings;
-}
-
-static void sba_unmap_sg_attrs(struct device *dev, struct scatterlist *sglist,
-			       int nents, enum dma_data_direction dir,
-			       unsigned long attrs);
-/**
- * sba_map_sg - map Scatter/Gather list
- * @dev: instance of PCI owned by the driver that's asking.
- * @sglist:  array of buffer/length pairs
- * @nents:  number of entries in list
- * @dir:  R/W or both.
- * @attrs: optional dma attributes
- *
- * See Documentation/core-api/dma-api-howto.rst
- */
-static int sba_map_sg_attrs(struct device *dev, struct scatterlist *sglist,
-			    int nents, enum dma_data_direction dir,
-			    unsigned long attrs)
-{
-	struct ioc *ioc;
-	int coalesced, filled = 0;
-#ifdef ASSERT_PDIR_SANITY
-	unsigned long flags;
-#endif
-#ifdef ALLOW_IOV_BYPASS_SG
-	struct scatterlist *sg;
-#endif
-
-	DBG_RUN_SG("%s() START %d entries\n", __func__, nents);
-	ioc = GET_IOC(dev);
-	ASSERT(ioc);
-
-#ifdef ALLOW_IOV_BYPASS_SG
-	ASSERT(to_pci_dev(dev)->dma_mask);
-	if (likely((ioc->dma_mask & ~to_pci_dev(dev)->dma_mask) == 0)) {
-		for_each_sg(sglist, sg, nents, filled) {
-			sg->dma_length = sg->length;
-			sg->dma_address = virt_to_phys(sba_sg_address(sg));
-		}
-		return filled;
-	}
-#endif
-	/* Fast path single entry scatterlists. */
-	if (nents == 1) {
-		sglist->dma_length = sglist->length;
-		sglist->dma_address = sba_map_page(dev, sg_page(sglist),
-				sglist->offset, sglist->length, dir, attrs);
-		if (dma_mapping_error(dev, sglist->dma_address))
-			return -EIO;
-		return 1;
-	}
-
-#ifdef ASSERT_PDIR_SANITY
-	spin_lock_irqsave(&ioc->res_lock, flags);
-	if (sba_check_pdir(ioc,"Check before sba_map_sg_attrs()"))
-	{
-		sba_dump_sg(ioc, sglist, nents);
-		panic("Check before sba_map_sg_attrs()");
-	}
-	spin_unlock_irqrestore(&ioc->res_lock, flags);
-#endif
-
-	prefetch(ioc->res_hint);
-
-	/*
-	** First coalesce the chunks and allocate I/O pdir space
-	**
-	** If this is one DMA stream, we can properly map using the
-	** correct virtual address associated with each DMA page.
-	** w/o this association, we wouldn't have coherent DMA!
-	** Access to the virtual address is what forces a two pass algorithm.
-	*/
-	coalesced = sba_coalesce_chunks(ioc, dev, sglist, nents);
-	if (coalesced < 0) {
-		sba_unmap_sg_attrs(dev, sglist, nents, dir, attrs);
-		return -ENOMEM;
-	}
-
-	/*
-	** Program the I/O Pdir
-	**
-	** map the virtual addresses to the I/O Pdir
-	** o dma_address will contain the pdir index
-	** o dma_len will contain the number of bytes to map
-	** o address contains the virtual address.
-	*/
-	filled = sba_fill_pdir(ioc, sglist, nents);
-
-#ifdef ASSERT_PDIR_SANITY
-	spin_lock_irqsave(&ioc->res_lock, flags);
-	if (sba_check_pdir(ioc,"Check after sba_map_sg_attrs()"))
-	{
-		sba_dump_sg(ioc, sglist, nents);
-		panic("Check after sba_map_sg_attrs()\n");
-	}
-	spin_unlock_irqrestore(&ioc->res_lock, flags);
-#endif
-
-	ASSERT(coalesced == filled);
-	DBG_RUN_SG("%s() DONE %d mappings\n", __func__, filled);
-
-	return filled;
-}
-
-/**
- * sba_unmap_sg_attrs - unmap Scatter/Gather list
- * @dev: instance of PCI owned by the driver that's asking.
- * @sglist:  array of buffer/length pairs
- * @nents:  number of entries in list
- * @dir:  R/W or both.
- * @attrs: optional dma attributes
- *
- * See Documentation/core-api/dma-api-howto.rst
- */
-static void sba_unmap_sg_attrs(struct device *dev, struct scatterlist *sglist,
-			       int nents, enum dma_data_direction dir,
-			       unsigned long attrs)
-{
-#ifdef ASSERT_PDIR_SANITY
-	struct ioc *ioc;
-	unsigned long flags;
-#endif
-
-	DBG_RUN_SG("%s() START %d entries,  %p,%x\n",
-		   __func__, nents, sba_sg_address(sglist), sglist->length);
-
-#ifdef ASSERT_PDIR_SANITY
-	ioc = GET_IOC(dev);
-	ASSERT(ioc);
-
-	spin_lock_irqsave(&ioc->res_lock, flags);
-	sba_check_pdir(ioc,"Check before sba_unmap_sg_attrs()");
-	spin_unlock_irqrestore(&ioc->res_lock, flags);
-#endif
-
-	while (nents && sglist->dma_length) {
-
-		sba_unmap_page(dev, sglist->dma_address, sglist->dma_length,
-			       dir, attrs);
-		sglist = sg_next(sglist);
-		nents--;
-	}
-
-	DBG_RUN_SG("%s() DONE (nents %d)\n", __func__,  nents);
-
-#ifdef ASSERT_PDIR_SANITY
-	spin_lock_irqsave(&ioc->res_lock, flags);
-	sba_check_pdir(ioc,"Check after sba_unmap_sg_attrs()");
-	spin_unlock_irqrestore(&ioc->res_lock, flags);
-#endif
-
-}
-
-/**************************************************************
-*
-*   Initialization and claim
-*
-***************************************************************/
-
-static void
-ioc_iova_init(struct ioc *ioc)
-{
-	int tcnfg;
-	int agp_found = 0;
-	struct pci_dev *device = NULL;
-#ifdef FULL_VALID_PDIR
-	unsigned long index;
-#endif
-
-	/*
-	** Firmware programs the base and size of a "safe IOVA space"
-	** (one that doesn't overlap memory or LMMIO space) in the
-	** IBASE and IMASK registers.
-	*/
-	ioc->ibase = READ_REG(ioc->ioc_hpa + IOC_IBASE) & ~0x1UL;
-	ioc->imask = READ_REG(ioc->ioc_hpa + IOC_IMASK) | 0xFFFFFFFF00000000UL;
-
-	ioc->iov_size = ~ioc->imask + 1;
-
-	DBG_INIT("%s() hpa %p IOV base 0x%lx mask 0x%lx (%dMB)\n",
-		__func__, ioc->ioc_hpa, ioc->ibase, ioc->imask,
-		ioc->iov_size >> 20);
-
-	switch (iovp_size) {
-		case  4*1024: tcnfg = 0; break;
-		case  8*1024: tcnfg = 1; break;
-		case 16*1024: tcnfg = 2; break;
-		case 64*1024: tcnfg = 3; break;
-		default:
-			panic(PFX "Unsupported IOTLB page size %ldK",
-				iovp_size >> 10);
-			break;
-	}
-	WRITE_REG(tcnfg, ioc->ioc_hpa + IOC_TCNFG);
-
-	ioc->pdir_size = (ioc->iov_size / iovp_size) * PDIR_ENTRY_SIZE;
-	ioc->pdir_base = (void *) __get_free_pages(GFP_KERNEL,
-						   get_order(ioc->pdir_size));
-	if (!ioc->pdir_base)
-		panic(PFX "Couldn't allocate I/O Page Table\n");
-
-	memset(ioc->pdir_base, 0, ioc->pdir_size);
-
-	DBG_INIT("%s() IOV page size %ldK pdir %p size %x\n", __func__,
-		iovp_size >> 10, ioc->pdir_base, ioc->pdir_size);
-
-	ASSERT(ALIGN((unsigned long) ioc->pdir_base, 4*1024) == (unsigned long) ioc->pdir_base);
-	WRITE_REG(virt_to_phys(ioc->pdir_base), ioc->ioc_hpa + IOC_PDIR_BASE);
-
-	/*
-	** If an AGP device is present, only use half of the IOV space
-	** for PCI DMA.  Unfortunately we can't know ahead of time
-	** whether GART support will actually be used, for now we
-	** can just key on an AGP device found in the system.
-	** We program the next pdir index after we stop w/ a key for
-	** the GART code to handshake on.
-	*/
-	for_each_pci_dev(device)	
-		agp_found |= pci_find_capability(device, PCI_CAP_ID_AGP);
-
-	if (agp_found && reserve_sba_gart) {
-		printk(KERN_INFO PFX "reserving %dMb of IOVA space at 0x%lx for agpgart\n",
-		      ioc->iov_size/2 >> 20, ioc->ibase + ioc->iov_size/2);
-		ioc->pdir_size /= 2;
-		((u64 *)ioc->pdir_base)[PDIR_INDEX(ioc->iov_size/2)] = ZX1_SBA_IOMMU_COOKIE;
-	}
-#ifdef FULL_VALID_PDIR
-	/*
-  	** Check to see if the spill page has been allocated, we don't need more than
-	** one across multiple SBAs.
-	*/
-	if (!prefetch_spill_page) {
-		char *spill_poison = "SBAIOMMU POISON";
-		int poison_size = 16;
-		void *poison_addr, *addr;
-
-		addr = (void *)__get_free_pages(GFP_KERNEL, get_order(iovp_size));
-		if (!addr)
-			panic(PFX "Couldn't allocate PDIR spill page\n");
-
-		poison_addr = addr;
-		for ( ; (u64) poison_addr < addr + iovp_size; poison_addr += poison_size)
-			memcpy(poison_addr, spill_poison, poison_size);
-
-		prefetch_spill_page = virt_to_phys(addr);
-
-		DBG_INIT("%s() prefetch spill addr: 0x%lx\n", __func__, prefetch_spill_page);
-	}
-	/*
-  	** Set all the PDIR entries valid w/ the spill page as the target
-	*/
-	for (index = 0 ; index < (ioc->pdir_size / PDIR_ENTRY_SIZE) ; index++)
-		((u64 *)ioc->pdir_base)[index] = (0x80000000000000FF | prefetch_spill_page);
-#endif
-
-	/* Clear I/O TLB of any possible entries */
-	WRITE_REG(ioc->ibase | (get_iovp_order(ioc->iov_size) + iovp_shift), ioc->ioc_hpa + IOC_PCOM);
-	READ_REG(ioc->ioc_hpa + IOC_PCOM);
-
-	/* Enable IOVA translation */
-	WRITE_REG(ioc->ibase | 1, ioc->ioc_hpa + IOC_IBASE);
-	READ_REG(ioc->ioc_hpa + IOC_IBASE);
-}
-
-static void __init
-ioc_resource_init(struct ioc *ioc)
-{
-	spin_lock_init(&ioc->res_lock);
-#if DELAYED_RESOURCE_CNT > 0
-	spin_lock_init(&ioc->saved_lock);
-#endif
-
-	/* resource map size dictated by pdir_size */
-	ioc->res_size = ioc->pdir_size / PDIR_ENTRY_SIZE; /* entries */
-	ioc->res_size >>= 3;  /* convert bit count to byte count */
-	DBG_INIT("%s() res_size 0x%x\n", __func__, ioc->res_size);
-
-	ioc->res_map = (char *) __get_free_pages(GFP_KERNEL,
-						 get_order(ioc->res_size));
-	if (!ioc->res_map)
-		panic(PFX "Couldn't allocate resource map\n");
-
-	memset(ioc->res_map, 0, ioc->res_size);
-	/* next available IOVP - circular search */
-	ioc->res_hint = (unsigned long *) ioc->res_map;
-
-#ifdef ASSERT_PDIR_SANITY
-	/* Mark first bit busy - ie no IOVA 0 */
-	ioc->res_map[0] = 0x1;
-	ioc->pdir_base[0] = 0x8000000000000000ULL | ZX1_SBA_IOMMU_COOKIE;
-#endif
-#ifdef FULL_VALID_PDIR
-	/* Mark the last resource used so we don't prefetch beyond IOVA space */
-	ioc->res_map[ioc->res_size - 1] |= 0x80UL; /* res_map is chars */
-	ioc->pdir_base[(ioc->pdir_size / PDIR_ENTRY_SIZE) - 1] = (0x80000000000000FF
-							      | prefetch_spill_page);
-#endif
-
-	DBG_INIT("%s() res_map %x %p\n", __func__,
-		 ioc->res_size, (void *) ioc->res_map);
-}
-
-static void __init
-ioc_sac_init(struct ioc *ioc)
-{
-	struct pci_dev *sac = NULL;
-	struct pci_controller *controller = NULL;
-
-	/*
-	 * pci_alloc_coherent() must return a DMA address which is
-	 * SAC (single address cycle) addressable, so allocate a
-	 * pseudo-device to enforce that.
-	 */
-	sac = kzalloc(sizeof(*sac), GFP_KERNEL);
-	if (!sac)
-		panic(PFX "Couldn't allocate struct pci_dev");
-
-	controller = kzalloc(sizeof(*controller), GFP_KERNEL);
-	if (!controller)
-		panic(PFX "Couldn't allocate struct pci_controller");
-
-	controller->iommu = ioc;
-	sac->sysdata = controller;
-	sac->dma_mask = 0xFFFFFFFFUL;
-	sac->dev.bus = &pci_bus_type;
-	ioc->sac_only_dev = sac;
-}
-
-static void __init
-ioc_zx1_init(struct ioc *ioc)
-{
-	unsigned long rope_config;
-	unsigned int i;
-
-	if (ioc->rev < 0x20)
-		panic(PFX "IOC 2.0 or later required for IOMMU support\n");
-
-	/* 38 bit memory controller + extra bit for range displaced by MMIO */
-	ioc->dma_mask = (0x1UL << 39) - 1;
-
-	/*
-	** Clear ROPE(N)_CONFIG AO bit.
-	** Disables "NT Ordering" (~= !"Relaxed Ordering")
-	** Overrides bit 1 in DMA Hint Sets.
-	** Improves netperf UDP_STREAM by ~10% for tg3 on bcm5701.
-	*/
-	for (i=0; i<(8*8); i+=8) {
-		rope_config = READ_REG(ioc->ioc_hpa + IOC_ROPE0_CFG + i);
-		rope_config &= ~IOC_ROPE_AO;
-		WRITE_REG(rope_config, ioc->ioc_hpa + IOC_ROPE0_CFG + i);
-	}
-}
-
-typedef void (initfunc)(struct ioc *);
-
-struct ioc_iommu {
-	u32 func_id;
-	char *name;
-	initfunc *init;
-};
-
-static struct ioc_iommu ioc_iommu_info[] __initdata = {
-	{ ZX1_IOC_ID, "zx1", ioc_zx1_init },
-	{ ZX2_IOC_ID, "zx2", NULL },
-	{ SX1000_IOC_ID, "sx1000", NULL },
-	{ SX2000_IOC_ID, "sx2000", NULL },
-};
-
-static void __init ioc_init(unsigned long hpa, struct ioc *ioc)
-{
-	struct ioc_iommu *info;
-
-	ioc->next = ioc_list;
-	ioc_list = ioc;
-
-	ioc->ioc_hpa = ioremap(hpa, 0x1000);
-
-	ioc->func_id = READ_REG(ioc->ioc_hpa + IOC_FUNC_ID);
-	ioc->rev = READ_REG(ioc->ioc_hpa + IOC_FCLASS) & 0xFFUL;
-	ioc->dma_mask = 0xFFFFFFFFFFFFFFFFUL;	/* conservative */
-
-	for (info = ioc_iommu_info; info < ioc_iommu_info + ARRAY_SIZE(ioc_iommu_info); info++) {
-		if (ioc->func_id == info->func_id) {
-			ioc->name = info->name;
-			if (info->init)
-				(info->init)(ioc);
-		}
-	}
-
-	iovp_size = (1 << iovp_shift);
-	iovp_mask = ~(iovp_size - 1);
-
-	DBG_INIT("%s: PAGE_SIZE %ldK, iovp_size %ldK\n", __func__,
-		PAGE_SIZE >> 10, iovp_size >> 10);
-
-	if (!ioc->name) {
-		ioc->name = kmalloc(24, GFP_KERNEL);
-		if (ioc->name)
-			sprintf((char *) ioc->name, "Unknown (%04x:%04x)",
-				ioc->func_id & 0xFFFF, (ioc->func_id >> 16) & 0xFFFF);
-		else
-			ioc->name = "Unknown";
-	}
-
-	ioc_iova_init(ioc);
-	ioc_resource_init(ioc);
-	ioc_sac_init(ioc);
-
-	printk(KERN_INFO PFX
-		"%s %d.%d HPA 0x%lx IOVA space %dMb at 0x%lx\n",
-		ioc->name, (ioc->rev >> 4) & 0xF, ioc->rev & 0xF,
-		hpa, ioc->iov_size >> 20, ioc->ibase);
-}
-
-
-
-/**************************************************************************
-**
-**   SBA initialization code (HW and SW)
-**
-**   o identify SBA chip itself
-**   o FIXME: initialize DMA hints for reasonable defaults
-**
-**************************************************************************/
-
-#ifdef CONFIG_PROC_FS
-static void *
-ioc_start(struct seq_file *s, loff_t *pos)
-{
-	struct ioc *ioc;
-	loff_t n = *pos;
-
-	for (ioc = ioc_list; ioc; ioc = ioc->next)
-		if (!n--)
-			return ioc;
-
-	return NULL;
-}
-
-static void *
-ioc_next(struct seq_file *s, void *v, loff_t *pos)
-{
-	struct ioc *ioc = v;
-
-	++*pos;
-	return ioc->next;
-}
-
-static void
-ioc_stop(struct seq_file *s, void *v)
-{
-}
-
-static int
-ioc_show(struct seq_file *s, void *v)
-{
-	struct ioc *ioc = v;
-	unsigned long *res_ptr = (unsigned long *)ioc->res_map;
-	int i, used = 0;
-
-	seq_printf(s, "Hewlett Packard %s IOC rev %d.%d\n",
-		ioc->name, ((ioc->rev >> 4) & 0xF), (ioc->rev & 0xF));
-#ifdef CONFIG_NUMA
-	if (ioc->node != NUMA_NO_NODE)
-		seq_printf(s, "NUMA node       : %d\n", ioc->node);
-#endif
-	seq_printf(s, "IOVA size       : %ld MB\n", ((ioc->pdir_size >> 3) * iovp_size)/(1024*1024));
-	seq_printf(s, "IOVA page size  : %ld kb\n", iovp_size/1024);
-
-	for (i = 0; i < (ioc->res_size / sizeof(unsigned long)); ++i, ++res_ptr)
-		used += hweight64(*res_ptr);
-
-	seq_printf(s, "PDIR size       : %d entries\n", ioc->pdir_size >> 3);
-	seq_printf(s, "PDIR used       : %d entries\n", used);
-
-#ifdef PDIR_SEARCH_TIMING
-	{
-		unsigned long i = 0, avg = 0, min, max;
-		min = max = ioc->avg_search[0];
-		for (i = 0; i < SBA_SEARCH_SAMPLE; i++) {
-			avg += ioc->avg_search[i];
-			if (ioc->avg_search[i] > max) max = ioc->avg_search[i];
-			if (ioc->avg_search[i] < min) min = ioc->avg_search[i];
-		}
-		avg /= SBA_SEARCH_SAMPLE;
-		seq_printf(s, "Bitmap search   : %ld/%ld/%ld (min/avg/max CPU Cycles/IOVA page)\n",
-		           min, avg, max);
-	}
-#endif
-#ifndef ALLOW_IOV_BYPASS
-	 seq_printf(s, "IOVA bypass disabled\n");
-#endif
-	return 0;
-}
-
-static const struct seq_operations ioc_seq_ops = {
-	.start = ioc_start,
-	.next  = ioc_next,
-	.stop  = ioc_stop,
-	.show  = ioc_show
-};
-
-static void __init
-ioc_proc_init(void)
-{
-	struct proc_dir_entry *dir;
-
-	dir = proc_mkdir("bus/mckinley", NULL);
-	if (!dir)
-		return;
-
-	proc_create_seq(ioc_list->name, 0, dir, &ioc_seq_ops);
-}
-#endif
-
-static void
-sba_connect_bus(struct pci_bus *bus)
-{
-	acpi_handle handle, parent;
-	acpi_status status;
-	struct ioc *ioc;
-
-	if (!PCI_CONTROLLER(bus))
-		panic(PFX "no sysdata on bus %d!\n", bus->number);
-
-	if (PCI_CONTROLLER(bus)->iommu)
-		return;
-
-	handle = acpi_device_handle(PCI_CONTROLLER(bus)->companion);
-	if (!handle)
-		return;
-
-	/*
-	 * The IOC scope encloses PCI root bridges in the ACPI
-	 * namespace, so work our way out until we find an IOC we
-	 * claimed previously.
-	 */
-	do {
-		for (ioc = ioc_list; ioc; ioc = ioc->next)
-			if (ioc->handle == handle) {
-				PCI_CONTROLLER(bus)->iommu = ioc;
-				return;
-			}
-
-		status = acpi_get_parent(handle, &parent);
-		handle = parent;
-	} while (ACPI_SUCCESS(status));
-
-	printk(KERN_WARNING "No IOC for PCI Bus %04x:%02x in ACPI\n", pci_domain_nr(bus), bus->number);
-}
-
-static void __init
-sba_map_ioc_to_node(struct ioc *ioc, acpi_handle handle)
-{
-#ifdef CONFIG_NUMA
-	unsigned int node;
-
-	node = acpi_get_node(handle);
-	if (node != NUMA_NO_NODE && !node_online(node))
-		node = NUMA_NO_NODE;
-
-	ioc->node = node;
-#endif
-}
-
-static void __init acpi_sba_ioc_add(struct ioc *ioc)
-{
-	acpi_handle handle = ioc->handle;
-	acpi_status status;
-	u64 hpa, length;
-	struct acpi_device_info *adi;
-
-	ioc_found = ioc->next;
-	status = hp_acpi_csr_space(handle, &hpa, &length);
-	if (ACPI_FAILURE(status))
-		goto err;
-
-	status = acpi_get_object_info(handle, &adi);
-	if (ACPI_FAILURE(status))
-		goto err;
-
-	/*
-	 * For HWP0001, only SBA appears in ACPI namespace.  It encloses the PCI
-	 * root bridges, and its CSR space includes the IOC function.
-	 */
-	if (strncmp("HWP0001", adi->hardware_id.string, 7) == 0) {
-		hpa += ZX1_IOC_OFFSET;
-		/* zx1 based systems default to kernel page size iommu pages */
-		if (!iovp_shift)
-			iovp_shift = min(PAGE_SHIFT, 16);
-	}
-	kfree(adi);
-
-	/*
-	 * default anything not caught above or specified on cmdline to 4k
-	 * iommu page size
-	 */
-	if (!iovp_shift)
-		iovp_shift = 12;
-
-	ioc_init(hpa, ioc);
-	/* setup NUMA node association */
-	sba_map_ioc_to_node(ioc, handle);
-	return;
-
- err:
-	kfree(ioc);
-}
-
-static const struct acpi_device_id hp_ioc_iommu_device_ids[] = {
-	{"HWP0001", 0},
-	{"HWP0004", 0},
-	{"", 0},
-};
-
-static int acpi_sba_ioc_attach(struct acpi_device *device,
-			       const struct acpi_device_id *not_used)
-{
-	struct ioc *ioc;
-
-	ioc = kzalloc(sizeof(*ioc), GFP_KERNEL);
-	if (!ioc)
-		return -ENOMEM;
-
-	ioc->next = ioc_found;
-	ioc_found = ioc;
-	ioc->handle = device->handle;
-	return 1;
-}
-
-
-static struct acpi_scan_handler acpi_sba_ioc_handler = {
-	.ids	= hp_ioc_iommu_device_ids,
-	.attach	= acpi_sba_ioc_attach,
-};
-
-static int __init acpi_sba_ioc_init_acpi(void)
-{
-	return acpi_scan_add_handler(&acpi_sba_ioc_handler);
-}
-/* This has to run before acpi_scan_init(). */
-arch_initcall(acpi_sba_ioc_init_acpi);
-
-static int sba_dma_supported (struct device *dev, u64 mask)
-{
-	/* make sure it's at least 32bit capable */
-	return ((mask & 0xFFFFFFFFUL) == 0xFFFFFFFFUL);
-}
-
-static const struct dma_map_ops sba_dma_ops = {
-	.alloc			= sba_alloc_coherent,
-	.free			= sba_free_coherent,
-	.map_page		= sba_map_page,
-	.unmap_page		= sba_unmap_page,
-	.map_sg			= sba_map_sg_attrs,
-	.unmap_sg		= sba_unmap_sg_attrs,
-	.dma_supported		= sba_dma_supported,
-	.mmap			= dma_common_mmap,
-	.get_sgtable		= dma_common_get_sgtable,
-	.alloc_pages		= dma_common_alloc_pages,
-	.free_pages		= dma_common_free_pages,
-};
-
-static int __init
-sba_init(void)
-{
-	/*
-	 * If we are booting a kdump kernel, the sba_iommu will cause devices
-	 * that were not shutdown properly to MCA as soon as they are turned
-	 * back on.  Our only option for a successful kdump kernel boot is to
-	 * use swiotlb.
-	 */
-	if (is_kdump_kernel())
-		return 0;
-
-	/*
-	 * ioc_found should be populated by the acpi_sba_ioc_handler's .attach()
-	 * routine, but that only happens if acpi_scan_init() has already run.
-	 */
-	while (ioc_found)
-		acpi_sba_ioc_add(ioc_found);
-
-	if (!ioc_list)
-		return 0;
-
-	{
-		struct pci_bus *b = NULL;
-		while ((b = pci_find_next_bus(b)) != NULL)
-			sba_connect_bus(b);
-	}
-
-	/* no need for swiotlb with the iommu */
-	swiotlb_exit();
-	dma_ops = &sba_dma_ops;
-
-#ifdef CONFIG_PROC_FS
-	ioc_proc_init();
-#endif
-	return 0;
-}
-
-subsys_initcall(sba_init); /* must be initialized after ACPI etc., but before any drivers... */
-
-static int __init
-nosbagart(char *str)
-{
-	reserve_sba_gart = 0;
-	return 1;
-}
-
-__setup("nosbagart", nosbagart);
-
-static int __init
-sba_page_override(char *str)
-{
-	unsigned long page_size;
-
-	page_size = memparse(str, &str);
-	switch (page_size) {
-		case 4096:
-		case 8192:
-		case 16384:
-		case 65536:
-			iovp_shift = ffs(page_size) - 1;
-			break;
-		default:
-			printk("%s: unknown/unsupported iommu page size %ld\n",
-			       __func__, page_size);
-	}
-
-	return 1;
-}
-
-__setup("sbapagesize=",sba_page_override);
diff --git a/arch/ia64/include/asm/Kbuild b/arch/ia64/include/asm/Kbuild
deleted file mode 100644
index aefae2efde9f..000000000000
--- a/arch/ia64/include/asm/Kbuild
+++ /dev/null
@@ -1,6 +0,0 @@
-# SPDX-License-Identifier: GPL-2.0
-generated-y += syscall_table.h
-generic-y += agp.h
-generic-y += kvm_para.h
-generic-y += mcs_spinlock.h
-generic-y += vtime.h
diff --git a/arch/ia64/include/asm/acenv.h b/arch/ia64/include/asm/acenv.h
deleted file mode 100644
index 9d673cd4c2ad..000000000000
--- a/arch/ia64/include/asm/acenv.h
+++ /dev/null
@@ -1,49 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-only */
-/*
- * IA64 specific ACPICA environments and implementation
- *
- * Copyright (C) 2014, Intel Corporation
- *   Author: Lv Zheng <lv.zheng@intel.com>
- */
-
-#ifndef _ASM_IA64_ACENV_H
-#define _ASM_IA64_ACENV_H
-
-#include <asm/intrinsics.h>
-
-#define COMPILER_DEPENDENT_INT64	long
-#define COMPILER_DEPENDENT_UINT64	unsigned long
-
-/* Asm macros */
-
-static inline int
-ia64_acpi_acquire_global_lock(unsigned int *lock)
-{
-	unsigned int old, new, val;
-	do {
-		old = *lock;
-		new = (((old & ~0x3) + 2) + ((old >> 1) & 0x1));
-		val = ia64_cmpxchg4_acq(lock, new, old);
-	} while (unlikely (val != old));
-	return (new < 3) ? -1 : 0;
-}
-
-static inline int
-ia64_acpi_release_global_lock(unsigned int *lock)
-{
-	unsigned int old, new, val;
-	do {
-		old = *lock;
-		new = old & ~0x3;
-		val = ia64_cmpxchg4_acq(lock, new, old);
-	} while (unlikely (val != old));
-	return old & 0x1;
-}
-
-#define ACPI_ACQUIRE_GLOBAL_LOCK(facs, Acq)				\
-	((Acq) = ia64_acpi_acquire_global_lock(&facs->global_lock))
-
-#define ACPI_RELEASE_GLOBAL_LOCK(facs, Acq)				\
-	((Acq) = ia64_acpi_release_global_lock(&facs->global_lock))
-
-#endif /* _ASM_IA64_ACENV_H */
diff --git a/arch/ia64/include/asm/acpi-ext.h b/arch/ia64/include/asm/acpi-ext.h
deleted file mode 100644
index eaa57583d151..000000000000
--- a/arch/ia64/include/asm/acpi-ext.h
+++ /dev/null
@@ -1,17 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-only */
-/*
- * (c) Copyright 2003, 2006 Hewlett-Packard Development Company, L.P.
- *	Alex Williamson <alex.williamson@hp.com>
- *	Bjorn Helgaas <bjorn.helgaas@hp.com>
- *
- * Vendor specific extensions to ACPI.
- */
-
-#ifndef _ASM_IA64_ACPI_EXT_H
-#define _ASM_IA64_ACPI_EXT_H
-
-#include <linux/types.h>
-
-extern acpi_status hp_acpi_csr_space (acpi_handle, u64 *base, u64 *length);
-
-#endif /* _ASM_IA64_ACPI_EXT_H */
diff --git a/arch/ia64/include/asm/acpi.h b/arch/ia64/include/asm/acpi.h
deleted file mode 100644
index 58500a964238..000000000000
--- a/arch/ia64/include/asm/acpi.h
+++ /dev/null
@@ -1,110 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-or-later */
-/*
- *  Copyright (C) 1999 VA Linux Systems
- *  Copyright (C) 1999 Walt Drummond <drummond@valinux.com>
- *  Copyright (C) 2000,2001 J.I. Lee <jung-ik.lee@intel.com>
- *  Copyright (C) 2001,2002 Paul Diefenbaugh <paul.s.diefenbaugh@intel.com>
- */
-
-#ifndef _ASM_ACPI_H
-#define _ASM_ACPI_H
-
-#ifdef __KERNEL__
-
-#include <acpi/proc_cap_intel.h>
-
-#include <linux/init.h>
-#include <linux/numa.h>
-#include <asm/numa.h>
-
-
-extern int acpi_lapic;
-#define acpi_disabled 0	/* ACPI always enabled on IA64 */
-#define acpi_noirq 0	/* ACPI always enabled on IA64 */
-#define acpi_pci_disabled 0 /* ACPI PCI always enabled on IA64 */
-#define acpi_strict 1	/* no ACPI spec workarounds on IA64 */
-
-static inline bool acpi_has_cpu_in_madt(void)
-{
-	return !!acpi_lapic;
-}
-
-#define acpi_processor_cstate_check(x) (x) /* no idle limits on IA64 :) */
-static inline void disable_acpi(void) { }
-
-int acpi_request_vector (u32 int_type);
-int acpi_gsi_to_irq (u32 gsi, unsigned int *irq);
-
-/* Low-level suspend routine. */
-extern int acpi_suspend_lowlevel(void);
-
-static inline unsigned long acpi_get_wakeup_address(void)
-{
-	return 0;
-}
-
-/*
- * Record the cpei override flag and current logical cpu. This is
- * useful for CPU removal.
- */
-extern unsigned int can_cpei_retarget(void);
-extern unsigned int is_cpu_cpei_target(unsigned int cpu);
-extern void set_cpei_target_cpu(unsigned int cpu);
-extern unsigned int get_cpei_target_cpu(void);
-extern void prefill_possible_map(void);
-#ifdef CONFIG_ACPI_HOTPLUG_CPU
-extern int additional_cpus;
-#else
-#define additional_cpus 0
-#endif
-
-#ifdef CONFIG_ACPI_NUMA
-#if MAX_NUMNODES > 256
-#define MAX_PXM_DOMAINS MAX_NUMNODES
-#else
-#define MAX_PXM_DOMAINS (256)
-#endif
-extern int pxm_to_nid_map[MAX_PXM_DOMAINS];
-extern int __initdata nid_to_pxm_map[MAX_NUMNODES];
-#endif
-
-static inline bool arch_has_acpi_pdc(void) { return true; }
-static inline void arch_acpi_set_proc_cap_bits(u32 *cap)
-{
-	*cap |= ACPI_PROC_CAP_EST_CAPABILITY_SMP;
-}
-
-#ifdef CONFIG_ACPI_NUMA
-extern cpumask_t early_cpu_possible_map;
-#define for_each_possible_early_cpu(cpu)  \
-	for_each_cpu((cpu), &early_cpu_possible_map)
-
-static inline void per_cpu_scan_finalize(int min_cpus, int reserve_cpus)
-{
-	int low_cpu, high_cpu;
-	int cpu;
-	int next_nid = 0;
-
-	low_cpu = cpumask_weight(&early_cpu_possible_map);
-
-	high_cpu = max(low_cpu, min_cpus);
-	high_cpu = min(high_cpu + reserve_cpus, NR_CPUS);
-
-	for (cpu = low_cpu; cpu < high_cpu; cpu++) {
-		cpumask_set_cpu(cpu, &early_cpu_possible_map);
-		if (node_cpuid[cpu].nid == NUMA_NO_NODE) {
-			node_cpuid[cpu].nid = next_nid;
-			next_nid++;
-			if (next_nid >= num_online_nodes())
-				next_nid = 0;
-		}
-	}
-}
-
-extern void acpi_numa_fixup(void);
-
-#endif /* CONFIG_ACPI_NUMA */
-
-#endif /*__KERNEL__*/
-
-#endif /*_ASM_ACPI_H*/
diff --git a/arch/ia64/include/asm/asm-offsets.h b/arch/ia64/include/asm/asm-offsets.h
deleted file mode 100644
index d370ee36a182..000000000000
--- a/arch/ia64/include/asm/asm-offsets.h
+++ /dev/null
@@ -1 +0,0 @@
-#include <generated/asm-offsets.h>
diff --git a/arch/ia64/include/asm/asm-prototypes.h b/arch/ia64/include/asm/asm-prototypes.h
deleted file mode 100644
index a96689447a74..000000000000
--- a/arch/ia64/include/asm/asm-prototypes.h
+++ /dev/null
@@ -1,30 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _ASM_IA64_ASM_PROTOTYPES_H
-#define _ASM_IA64_ASM_PROTOTYPES_H
-
-#include <asm/cacheflush.h>
-#include <asm/checksum.h>
-#include <asm/esi.h>
-#include <asm/ftrace.h>
-#include <asm/page.h>
-#include <asm/pal.h>
-#include <asm/string.h>
-#include <linux/uaccess.h>
-#include <asm/unwind.h>
-#include <asm/xor.h>
-
-extern const char ia64_ivt[];
-
-signed int __divsi3(signed int, unsigned int);
-signed int __modsi3(signed int, unsigned int);
-
-signed long long __divdi3(signed long long, unsigned long long);
-signed long long __moddi3(signed long long, unsigned long long);
-
-unsigned int __udivsi3(unsigned int, unsigned int);
-unsigned int __umodsi3(unsigned int, unsigned int);
-
-unsigned long long __udivdi3(unsigned long long, unsigned long long);
-unsigned long long __umoddi3(unsigned long long, unsigned long long);
-
-#endif /* _ASM_IA64_ASM_PROTOTYPES_H */
diff --git a/arch/ia64/include/asm/asmmacro.h b/arch/ia64/include/asm/asmmacro.h
deleted file mode 100644
index 52619c517f09..000000000000
--- a/arch/ia64/include/asm/asmmacro.h
+++ /dev/null
@@ -1,136 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _ASM_IA64_ASMMACRO_H
-#define _ASM_IA64_ASMMACRO_H
-
-/*
- * Copyright (C) 2000-2001, 2003-2004 Hewlett-Packard Co
- *	David Mosberger-Tang <davidm@hpl.hp.com>
- */
-
-
-#define ENTRY(name)				\
-	.align 32;				\
-	.proc name;				\
-name:
-
-#define ENTRY_MIN_ALIGN(name)			\
-	.align 16;				\
-	.proc name;				\
-name:
-
-#define GLOBAL_ENTRY(name)			\
-	.global name;				\
-	ENTRY(name)
-
-#define END(name)				\
-	.endp name
-
-/*
- * Helper macros to make unwind directives more readable:
- */
-
-/* prologue_gr: */
-#define ASM_UNW_PRLG_RP			0x8
-#define ASM_UNW_PRLG_PFS		0x4
-#define ASM_UNW_PRLG_PSP		0x2
-#define ASM_UNW_PRLG_PR			0x1
-#define ASM_UNW_PRLG_GRSAVE(ninputs)	(32+(ninputs))
-
-/*
- * Helper macros for accessing user memory.
- *
- * When adding any new .section/.previous entries here, make sure to
- * also add it to the DISCARD section in arch/ia64/kernel/gate.lds.S or
- * unpleasant things will happen.
- */
-
-	.section "__ex_table", "a"		// declare section & section attributes
-	.previous
-
-# define EX(y,x...)				\
-	.xdata4 "__ex_table", 99f-., y-.;	\
-  [99:]	x
-# define EXCLR(y,x...)				\
-	.xdata4 "__ex_table", 99f-., y-.+4;	\
-  [99:]	x
-
-/*
- * Tag MCA recoverable instruction ranges.
- */
-
-	.section "__mca_table", "a"		// declare section & section attributes
-	.previous
-
-# define MCA_RECOVER_RANGE(y)			\
-	.xdata4 "__mca_table", y-., 99f-.;	\
-  [99:]
-
-/*
- * Mark instructions that need a load of a virtual address patched to be
- * a load of a physical address.  We use this either in critical performance
- * path (ivt.S - TLB miss processing) or in places where it might not be
- * safe to use a "tpa" instruction (mca_asm.S - error recovery).
- */
-	.section ".data..patch.vtop", "a"	// declare section & section attributes
-	.previous
-
-#define	LOAD_PHYSICAL(pr, reg, obj)		\
-[1:](pr)movl reg = obj;				\
-	.xdata4 ".data..patch.vtop", 1b-.
-
-/*
- * For now, we always put in the McKinley E9 workaround.  On CPUs that don't need it,
- * we'll patch out the work-around bundles with NOPs, so their impact is minimal.
- */
-#define DO_MCKINLEY_E9_WORKAROUND
-
-#ifdef DO_MCKINLEY_E9_WORKAROUND
-	.section ".data..patch.mckinley_e9", "a"
-	.previous
-/* workaround for Itanium 2 Errata 9: */
-# define FSYS_RETURN					\
-	.xdata4 ".data..patch.mckinley_e9", 1f-.;	\
-1:{ .mib;						\
-	nop.m 0;					\
-	mov r16=ar.pfs;					\
-	br.call.sptk.many b7=2f;;			\
-  };							\
-2:{ .mib;						\
-	nop.m 0;					\
-	mov ar.pfs=r16;					\
-	br.ret.sptk.many b6;;				\
-  }
-#else
-# define FSYS_RETURN	br.ret.sptk.many b6
-#endif
-
-/*
- * If physical stack register size is different from DEF_NUM_STACK_REG,
- * dynamically patch the kernel for correct size.
- */
-	.section ".data..patch.phys_stack_reg", "a"
-	.previous
-#define LOAD_PHYS_STACK_REG_SIZE(reg)			\
-[1:]	adds reg=IA64_NUM_PHYS_STACK_REG*8+8,r0;	\
-	.xdata4 ".data..patch.phys_stack_reg", 1b-.
-
-/*
- * Up until early 2004, use of .align within a function caused bad unwind info.
- * TEXT_ALIGN(n) expands into ".align n" if a fixed GAS is available or into nothing
- * otherwise.
- */
-#ifdef HAVE_WORKING_TEXT_ALIGN
-# define TEXT_ALIGN(n)	.align n
-#else
-# define TEXT_ALIGN(n)
-#endif
-
-#ifdef HAVE_SERIALIZE_DIRECTIVE
-# define dv_serialize_data		.serialize.data
-# define dv_serialize_instruction	.serialize.instruction
-#else
-# define dv_serialize_data
-# define dv_serialize_instruction
-#endif
-
-#endif /* _ASM_IA64_ASMMACRO_H */
diff --git a/arch/ia64/include/asm/atomic.h b/arch/ia64/include/asm/atomic.h
deleted file mode 100644
index 6540a628d257..000000000000
--- a/arch/ia64/include/asm/atomic.h
+++ /dev/null
@@ -1,216 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _ASM_IA64_ATOMIC_H
-#define _ASM_IA64_ATOMIC_H
-
-/*
- * Atomic operations that C can't guarantee us.  Useful for
- * resource counting etc..
- *
- * NOTE: don't mess with the types below!  The "unsigned long" and
- * "int" types were carefully placed so as to ensure proper operation
- * of the macros.
- *
- * Copyright (C) 1998, 1999, 2002-2003 Hewlett-Packard Co
- *	David Mosberger-Tang <davidm@hpl.hp.com>
- */
-#include <linux/types.h>
-
-#include <asm/intrinsics.h>
-#include <asm/barrier.h>
-
-
-#define ATOMIC64_INIT(i)	{ (i) }
-
-#define arch_atomic_read(v)	READ_ONCE((v)->counter)
-#define arch_atomic64_read(v)	READ_ONCE((v)->counter)
-
-#define arch_atomic_set(v,i)	WRITE_ONCE(((v)->counter), (i))
-#define arch_atomic64_set(v,i)	WRITE_ONCE(((v)->counter), (i))
-
-#define ATOMIC_OP(op, c_op)						\
-static __inline__ int							\
-ia64_atomic_##op (int i, atomic_t *v)					\
-{									\
-	__s32 old, new;							\
-	CMPXCHG_BUGCHECK_DECL						\
-									\
-	do {								\
-		CMPXCHG_BUGCHECK(v);					\
-		old = arch_atomic_read(v);				\
-		new = old c_op i;					\
-	} while (ia64_cmpxchg(acq, v, old, new, sizeof(atomic_t)) != old); \
-	return new;							\
-}
-
-#define ATOMIC_FETCH_OP(op, c_op)					\
-static __inline__ int							\
-ia64_atomic_fetch_##op (int i, atomic_t *v)				\
-{									\
-	__s32 old, new;							\
-	CMPXCHG_BUGCHECK_DECL						\
-									\
-	do {								\
-		CMPXCHG_BUGCHECK(v);					\
-		old = arch_atomic_read(v);				\
-		new = old c_op i;					\
-	} while (ia64_cmpxchg(acq, v, old, new, sizeof(atomic_t)) != old); \
-	return old;							\
-}
-
-#define ATOMIC_OPS(op, c_op)						\
-	ATOMIC_OP(op, c_op)						\
-	ATOMIC_FETCH_OP(op, c_op)
-
-ATOMIC_OPS(add, +)
-ATOMIC_OPS(sub, -)
-
-#ifdef __OPTIMIZE__
-#define __ia64_atomic_const(i)						\
-	static const int __ia64_atomic_p = __builtin_constant_p(i) ?	\
-		((i) == 1 || (i) == 4 || (i) == 8 || (i) == 16 ||	\
-		 (i) == -1 || (i) == -4 || (i) == -8 || (i) == -16) : 0;\
-	__ia64_atomic_p
-#else
-#define __ia64_atomic_const(i)	0
-#endif
-
-#define arch_atomic_add_return(i,v)					\
-({									\
-	int __ia64_aar_i = (i);						\
-	__ia64_atomic_const(i)						\
-		? ia64_fetch_and_add(__ia64_aar_i, &(v)->counter)	\
-		: ia64_atomic_add(__ia64_aar_i, v);			\
-})
-
-#define arch_atomic_sub_return(i,v)					\
-({									\
-	int __ia64_asr_i = (i);						\
-	__ia64_atomic_const(i)						\
-		? ia64_fetch_and_add(-__ia64_asr_i, &(v)->counter)	\
-		: ia64_atomic_sub(__ia64_asr_i, v);			\
-})
-
-#define arch_atomic_fetch_add(i,v)					\
-({									\
-	int __ia64_aar_i = (i);						\
-	__ia64_atomic_const(i)						\
-		? ia64_fetchadd(__ia64_aar_i, &(v)->counter, acq)	\
-		: ia64_atomic_fetch_add(__ia64_aar_i, v);		\
-})
-
-#define arch_atomic_fetch_sub(i,v)					\
-({									\
-	int __ia64_asr_i = (i);						\
-	__ia64_atomic_const(i)						\
-		? ia64_fetchadd(-__ia64_asr_i, &(v)->counter, acq)	\
-		: ia64_atomic_fetch_sub(__ia64_asr_i, v);		\
-})
-
-ATOMIC_FETCH_OP(and, &)
-ATOMIC_FETCH_OP(or, |)
-ATOMIC_FETCH_OP(xor, ^)
-
-#define arch_atomic_and(i,v)	(void)ia64_atomic_fetch_and(i,v)
-#define arch_atomic_or(i,v)	(void)ia64_atomic_fetch_or(i,v)
-#define arch_atomic_xor(i,v)	(void)ia64_atomic_fetch_xor(i,v)
-
-#define arch_atomic_fetch_and(i,v)	ia64_atomic_fetch_and(i,v)
-#define arch_atomic_fetch_or(i,v)	ia64_atomic_fetch_or(i,v)
-#define arch_atomic_fetch_xor(i,v)	ia64_atomic_fetch_xor(i,v)
-
-#undef ATOMIC_OPS
-#undef ATOMIC_FETCH_OP
-#undef ATOMIC_OP
-
-#define ATOMIC64_OP(op, c_op)						\
-static __inline__ s64							\
-ia64_atomic64_##op (s64 i, atomic64_t *v)				\
-{									\
-	s64 old, new;							\
-	CMPXCHG_BUGCHECK_DECL						\
-									\
-	do {								\
-		CMPXCHG_BUGCHECK(v);					\
-		old = arch_atomic64_read(v);				\
-		new = old c_op i;					\
-	} while (ia64_cmpxchg(acq, v, old, new, sizeof(atomic64_t)) != old); \
-	return new;							\
-}
-
-#define ATOMIC64_FETCH_OP(op, c_op)					\
-static __inline__ s64							\
-ia64_atomic64_fetch_##op (s64 i, atomic64_t *v)				\
-{									\
-	s64 old, new;							\
-	CMPXCHG_BUGCHECK_DECL						\
-									\
-	do {								\
-		CMPXCHG_BUGCHECK(v);					\
-		old = arch_atomic64_read(v);				\
-		new = old c_op i;					\
-	} while (ia64_cmpxchg(acq, v, old, new, sizeof(atomic64_t)) != old); \
-	return old;							\
-}
-
-#define ATOMIC64_OPS(op, c_op)						\
-	ATOMIC64_OP(op, c_op)						\
-	ATOMIC64_FETCH_OP(op, c_op)
-
-ATOMIC64_OPS(add, +)
-ATOMIC64_OPS(sub, -)
-
-#define arch_atomic64_add_return(i,v)					\
-({									\
-	s64 __ia64_aar_i = (i);						\
-	__ia64_atomic_const(i)						\
-		? ia64_fetch_and_add(__ia64_aar_i, &(v)->counter)	\
-		: ia64_atomic64_add(__ia64_aar_i, v);			\
-})
-
-#define arch_atomic64_sub_return(i,v)					\
-({									\
-	s64 __ia64_asr_i = (i);						\
-	__ia64_atomic_const(i)						\
-		? ia64_fetch_and_add(-__ia64_asr_i, &(v)->counter)	\
-		: ia64_atomic64_sub(__ia64_asr_i, v);			\
-})
-
-#define arch_atomic64_fetch_add(i,v)					\
-({									\
-	s64 __ia64_aar_i = (i);						\
-	__ia64_atomic_const(i)						\
-		? ia64_fetchadd(__ia64_aar_i, &(v)->counter, acq)	\
-		: ia64_atomic64_fetch_add(__ia64_aar_i, v);		\
-})
-
-#define arch_atomic64_fetch_sub(i,v)					\
-({									\
-	s64 __ia64_asr_i = (i);						\
-	__ia64_atomic_const(i)						\
-		? ia64_fetchadd(-__ia64_asr_i, &(v)->counter, acq)	\
-		: ia64_atomic64_fetch_sub(__ia64_asr_i, v);		\
-})
-
-ATOMIC64_FETCH_OP(and, &)
-ATOMIC64_FETCH_OP(or, |)
-ATOMIC64_FETCH_OP(xor, ^)
-
-#define arch_atomic64_and(i,v)	(void)ia64_atomic64_fetch_and(i,v)
-#define arch_atomic64_or(i,v)	(void)ia64_atomic64_fetch_or(i,v)
-#define arch_atomic64_xor(i,v)	(void)ia64_atomic64_fetch_xor(i,v)
-
-#define arch_atomic64_fetch_and(i,v)	ia64_atomic64_fetch_and(i,v)
-#define arch_atomic64_fetch_or(i,v)	ia64_atomic64_fetch_or(i,v)
-#define arch_atomic64_fetch_xor(i,v)	ia64_atomic64_fetch_xor(i,v)
-
-#undef ATOMIC64_OPS
-#undef ATOMIC64_FETCH_OP
-#undef ATOMIC64_OP
-
-#define arch_atomic_add(i,v)		(void)arch_atomic_add_return((i), (v))
-#define arch_atomic_sub(i,v)		(void)arch_atomic_sub_return((i), (v))
-
-#define arch_atomic64_add(i,v)		(void)arch_atomic64_add_return((i), (v))
-#define arch_atomic64_sub(i,v)		(void)arch_atomic64_sub_return((i), (v))
-
-#endif /* _ASM_IA64_ATOMIC_H */
diff --git a/arch/ia64/include/asm/barrier.h b/arch/ia64/include/asm/barrier.h
deleted file mode 100644
index 751cdd353446..000000000000
--- a/arch/ia64/include/asm/barrier.h
+++ /dev/null
@@ -1,79 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-/*
- * Memory barrier definitions.  This is based on information published
- * in the Processor Abstraction Layer and the System Abstraction Layer
- * manual.
- *
- * Copyright (C) 1998-2003 Hewlett-Packard Co
- *	David Mosberger-Tang <davidm@hpl.hp.com>
- * Copyright (C) 1999 Asit Mallick <asit.k.mallick@intel.com>
- * Copyright (C) 1999 Don Dugger <don.dugger@intel.com>
- */
-#ifndef _ASM_IA64_BARRIER_H
-#define _ASM_IA64_BARRIER_H
-
-#include <linux/compiler.h>
-
-/*
- * Macros to force memory ordering.  In these descriptions, "previous"
- * and "subsequent" refer to program order; "visible" means that all
- * architecturally visible effects of a memory access have occurred
- * (at a minimum, this means the memory has been read or written).
- *
- *   wmb():	Guarantees that all preceding stores to memory-
- *		like regions are visible before any subsequent
- *		stores and that all following stores will be
- *		visible only after all previous stores.
- *   rmb():	Like wmb(), but for reads.
- *   mb():	wmb()/rmb() combo, i.e., all previous memory
- *		accesses are visible before all subsequent
- *		accesses and vice versa.  This is also known as
- *		a "fence."
- *
- * Note: "mb()" and its variants cannot be used as a fence to order
- * accesses to memory mapped I/O registers.  For that, mf.a needs to
- * be used.  However, we don't want to always use mf.a because (a)
- * it's (presumably) much slower than mf and (b) mf.a is supported for
- * sequential memory pages only.
- */
-#define mb()		ia64_mf()
-#define rmb()		mb()
-#define wmb()		mb()
-
-#define dma_rmb()	mb()
-#define dma_wmb()	mb()
-
-# define __smp_mb()	mb()
-
-#define __smp_mb__before_atomic()	barrier()
-#define __smp_mb__after_atomic()	barrier()
-
-/*
- * IA64 GCC turns volatile stores into st.rel and volatile loads into ld.acq no
- * need for asm trickery!
- */
-
-#define __smp_store_release(p, v)						\
-do {									\
-	compiletime_assert_atomic_type(*p);				\
-	barrier();							\
-	WRITE_ONCE(*p, v);						\
-} while (0)
-
-#define __smp_load_acquire(p)						\
-({									\
-	typeof(*p) ___p1 = READ_ONCE(*p);				\
-	compiletime_assert_atomic_type(*p);				\
-	barrier();							\
-	___p1;								\
-})
-
-/*
- * The group barrier in front of the rsm & ssm are necessary to ensure
- * that none of the previous instructions in the same group are
- * affected by the rsm/ssm.
- */
-
-#include <asm-generic/barrier.h>
-
-#endif /* _ASM_IA64_BARRIER_H */
diff --git a/arch/ia64/include/asm/bitops.h b/arch/ia64/include/asm/bitops.h
deleted file mode 100644
index 1accb7842f58..000000000000
--- a/arch/ia64/include/asm/bitops.h
+++ /dev/null
@@ -1,453 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _ASM_IA64_BITOPS_H
-#define _ASM_IA64_BITOPS_H
-
-/*
- * Copyright (C) 1998-2003 Hewlett-Packard Co
- *	David Mosberger-Tang <davidm@hpl.hp.com>
- *
- * 02/06/02 find_next_bit() and find_first_bit() added from Erich Focht's ia64
- * O(1) scheduler patch
- */
-
-#ifndef _LINUX_BITOPS_H
-#error only <linux/bitops.h> can be included directly
-#endif
-
-#include <linux/compiler.h>
-#include <linux/types.h>
-#include <asm/intrinsics.h>
-#include <asm/barrier.h>
-
-/**
- * set_bit - Atomically set a bit in memory
- * @nr: the bit to set
- * @addr: the address to start counting from
- *
- * This function is atomic and may not be reordered.  See __set_bit()
- * if you do not require the atomic guarantees.
- * Note that @nr may be almost arbitrarily large; this function is not
- * restricted to acting on a single-word quantity.
- *
- * The address must be (at least) "long" aligned.
- * Note that there are driver (e.g., eepro100) which use these operations to
- * operate on hw-defined data-structures, so we can't easily change these
- * operations to force a bigger alignment.
- *
- * bit 0 is the LSB of addr; bit 32 is the LSB of (addr+1).
- */
-static __inline__ void
-set_bit (int nr, volatile void *addr)
-{
-	__u32 bit, old, new;
-	volatile __u32 *m;
-	CMPXCHG_BUGCHECK_DECL
-
-	m = (volatile __u32 *) addr + (nr >> 5);
-	bit = 1 << (nr & 31);
-	do {
-		CMPXCHG_BUGCHECK(m);
-		old = *m;
-		new = old | bit;
-	} while (cmpxchg_acq(m, old, new) != old);
-}
-
-/**
- * arch___set_bit - Set a bit in memory
- * @nr: the bit to set
- * @addr: the address to start counting from
- *
- * Unlike set_bit(), this function is non-atomic and may be reordered.
- * If it's called on the same region of memory simultaneously, the effect
- * may be that only one operation succeeds.
- */
-static __always_inline void
-arch___set_bit(unsigned long nr, volatile unsigned long *addr)
-{
-	*((__u32 *) addr + (nr >> 5)) |= (1 << (nr & 31));
-}
-
-/**
- * clear_bit - Clears a bit in memory
- * @nr: Bit to clear
- * @addr: Address to start counting from
- *
- * clear_bit() is atomic and may not be reordered.  However, it does
- * not contain a memory barrier, so if it is used for locking purposes,
- * you should call smp_mb__before_atomic() and/or smp_mb__after_atomic()
- * in order to ensure changes are visible on other processors.
- */
-static __inline__ void
-clear_bit (int nr, volatile void *addr)
-{
-	__u32 mask, old, new;
-	volatile __u32 *m;
-	CMPXCHG_BUGCHECK_DECL
-
-	m = (volatile __u32 *) addr + (nr >> 5);
-	mask = ~(1 << (nr & 31));
-	do {
-		CMPXCHG_BUGCHECK(m);
-		old = *m;
-		new = old & mask;
-	} while (cmpxchg_acq(m, old, new) != old);
-}
-
-/**
- * clear_bit_unlock - Clears a bit in memory with release
- * @nr: Bit to clear
- * @addr: Address to start counting from
- *
- * clear_bit_unlock() is atomic and may not be reordered.  It does
- * contain a memory barrier suitable for unlock type operations.
- */
-static __inline__ void
-clear_bit_unlock (int nr, volatile void *addr)
-{
-	__u32 mask, old, new;
-	volatile __u32 *m;
-	CMPXCHG_BUGCHECK_DECL
-
-	m = (volatile __u32 *) addr + (nr >> 5);
-	mask = ~(1 << (nr & 31));
-	do {
-		CMPXCHG_BUGCHECK(m);
-		old = *m;
-		new = old & mask;
-	} while (cmpxchg_rel(m, old, new) != old);
-}
-
-/**
- * __clear_bit_unlock - Non-atomically clears a bit in memory with release
- * @nr: Bit to clear
- * @addr: Address to start counting from
- *
- * Similarly to clear_bit_unlock, the implementation uses a store
- * with release semantics. See also arch_spin_unlock().
- */
-static __inline__ void
-__clear_bit_unlock(int nr, void *addr)
-{
-	__u32 * const m = (__u32 *) addr + (nr >> 5);
-	__u32 const new = *m & ~(1 << (nr & 31));
-
-	ia64_st4_rel_nta(m, new);
-}
-
-/**
- * arch___clear_bit - Clears a bit in memory (non-atomic version)
- * @nr: the bit to clear
- * @addr: the address to start counting from
- *
- * Unlike clear_bit(), this function is non-atomic and may be reordered.
- * If it's called on the same region of memory simultaneously, the effect
- * may be that only one operation succeeds.
- */
-static __always_inline void
-arch___clear_bit(unsigned long nr, volatile unsigned long *addr)
-{
-	*((__u32 *) addr + (nr >> 5)) &= ~(1 << (nr & 31));
-}
-
-/**
- * change_bit - Toggle a bit in memory
- * @nr: Bit to toggle
- * @addr: Address to start counting from
- *
- * change_bit() is atomic and may not be reordered.
- * Note that @nr may be almost arbitrarily large; this function is not
- * restricted to acting on a single-word quantity.
- */
-static __inline__ void
-change_bit (int nr, volatile void *addr)
-{
-	__u32 bit, old, new;
-	volatile __u32 *m;
-	CMPXCHG_BUGCHECK_DECL
-
-	m = (volatile __u32 *) addr + (nr >> 5);
-	bit = (1 << (nr & 31));
-	do {
-		CMPXCHG_BUGCHECK(m);
-		old = *m;
-		new = old ^ bit;
-	} while (cmpxchg_acq(m, old, new) != old);
-}
-
-/**
- * arch___change_bit - Toggle a bit in memory
- * @nr: the bit to toggle
- * @addr: the address to start counting from
- *
- * Unlike change_bit(), this function is non-atomic and may be reordered.
- * If it's called on the same region of memory simultaneously, the effect
- * may be that only one operation succeeds.
- */
-static __always_inline void
-arch___change_bit(unsigned long nr, volatile unsigned long *addr)
-{
-	*((__u32 *) addr + (nr >> 5)) ^= (1 << (nr & 31));
-}
-
-/**
- * test_and_set_bit - Set a bit and return its old value
- * @nr: Bit to set
- * @addr: Address to count from
- *
- * This operation is atomic and cannot be reordered.  
- * It also implies the acquisition side of the memory barrier.
- */
-static __inline__ int
-test_and_set_bit (int nr, volatile void *addr)
-{
-	__u32 bit, old, new;
-	volatile __u32 *m;
-	CMPXCHG_BUGCHECK_DECL
-
-	m = (volatile __u32 *) addr + (nr >> 5);
-	bit = 1 << (nr & 31);
-	do {
-		CMPXCHG_BUGCHECK(m);
-		old = *m;
-		new = old | bit;
-	} while (cmpxchg_acq(m, old, new) != old);
-	return (old & bit) != 0;
-}
-
-/**
- * test_and_set_bit_lock - Set a bit and return its old value for lock
- * @nr: Bit to set
- * @addr: Address to count from
- *
- * This is the same as test_and_set_bit on ia64
- */
-#define test_and_set_bit_lock test_and_set_bit
-
-/**
- * arch___test_and_set_bit - Set a bit and return its old value
- * @nr: Bit to set
- * @addr: Address to count from
- *
- * This operation is non-atomic and can be reordered.  
- * If two examples of this operation race, one can appear to succeed
- * but actually fail.  You must protect multiple accesses with a lock.
- */
-static __always_inline bool
-arch___test_and_set_bit(unsigned long nr, volatile unsigned long *addr)
-{
-	__u32 *p = (__u32 *) addr + (nr >> 5);
-	__u32 m = 1 << (nr & 31);
-	int oldbitset = (*p & m) != 0;
-
-	*p |= m;
-	return oldbitset;
-}
-
-/**
- * test_and_clear_bit - Clear a bit and return its old value
- * @nr: Bit to clear
- * @addr: Address to count from
- *
- * This operation is atomic and cannot be reordered.  
- * It also implies the acquisition side of the memory barrier.
- */
-static __inline__ int
-test_and_clear_bit (int nr, volatile void *addr)
-{
-	__u32 mask, old, new;
-	volatile __u32 *m;
-	CMPXCHG_BUGCHECK_DECL
-
-	m = (volatile __u32 *) addr + (nr >> 5);
-	mask = ~(1 << (nr & 31));
-	do {
-		CMPXCHG_BUGCHECK(m);
-		old = *m;
-		new = old & mask;
-	} while (cmpxchg_acq(m, old, new) != old);
-	return (old & ~mask) != 0;
-}
-
-/**
- * arch___test_and_clear_bit - Clear a bit and return its old value
- * @nr: Bit to clear
- * @addr: Address to count from
- *
- * This operation is non-atomic and can be reordered.  
- * If two examples of this operation race, one can appear to succeed
- * but actually fail.  You must protect multiple accesses with a lock.
- */
-static __always_inline bool
-arch___test_and_clear_bit(unsigned long nr, volatile unsigned long *addr)
-{
-	__u32 *p = (__u32 *) addr + (nr >> 5);
-	__u32 m = 1 << (nr & 31);
-	int oldbitset = (*p & m) != 0;
-
-	*p &= ~m;
-	return oldbitset;
-}
-
-/**
- * test_and_change_bit - Change a bit and return its old value
- * @nr: Bit to change
- * @addr: Address to count from
- *
- * This operation is atomic and cannot be reordered.  
- * It also implies the acquisition side of the memory barrier.
- */
-static __inline__ int
-test_and_change_bit (int nr, volatile void *addr)
-{
-	__u32 bit, old, new;
-	volatile __u32 *m;
-	CMPXCHG_BUGCHECK_DECL
-
-	m = (volatile __u32 *) addr + (nr >> 5);
-	bit = (1 << (nr & 31));
-	do {
-		CMPXCHG_BUGCHECK(m);
-		old = *m;
-		new = old ^ bit;
-	} while (cmpxchg_acq(m, old, new) != old);
-	return (old & bit) != 0;
-}
-
-/**
- * arch___test_and_change_bit - Change a bit and return its old value
- * @nr: Bit to change
- * @addr: Address to count from
- *
- * This operation is non-atomic and can be reordered.
- */
-static __always_inline bool
-arch___test_and_change_bit(unsigned long nr, volatile unsigned long *addr)
-{
-	__u32 old, bit = (1 << (nr & 31));
-	__u32 *m = (__u32 *) addr + (nr >> 5);
-
-	old = *m;
-	*m = old ^ bit;
-	return (old & bit) != 0;
-}
-
-#define arch_test_bit generic_test_bit
-#define arch_test_bit_acquire generic_test_bit_acquire
-
-/**
- * ffz - find the first zero bit in a long word
- * @x: The long word to find the bit in
- *
- * Returns the bit-number (0..63) of the first (least significant) zero bit.
- * Undefined if no zero exists, so code should check against ~0UL first...
- */
-static inline unsigned long
-ffz (unsigned long x)
-{
-	unsigned long result;
-
-	result = ia64_popcnt(x & (~x - 1));
-	return result;
-}
-
-/**
- * __ffs - find first bit in word.
- * @x: The word to search
- *
- * Undefined if no bit exists, so code should check against 0 first.
- */
-static __inline__ unsigned long
-__ffs (unsigned long x)
-{
-	unsigned long result;
-
-	result = ia64_popcnt((x-1) & ~x);
-	return result;
-}
-
-#ifdef __KERNEL__
-
-/*
- * Return bit number of last (most-significant) bit set.  Undefined
- * for x==0.  Bits are numbered from 0..63 (e.g., ia64_fls(9) == 3).
- */
-static inline unsigned long
-ia64_fls (unsigned long x)
-{
-	long double d = x;
-	long exp;
-
-	exp = ia64_getf_exp(d);
-	return exp - 0xffff;
-}
-
-/*
- * Find the last (most significant) bit set.  Returns 0 for x==0 and
- * bits are numbered from 1..32 (e.g., fls(9) == 4).
- */
-static inline int fls(unsigned int t)
-{
-	unsigned long x = t & 0xffffffffu;
-
-	if (!x)
-		return 0;
-	x |= x >> 1;
-	x |= x >> 2;
-	x |= x >> 4;
-	x |= x >> 8;
-	x |= x >> 16;
-	return ia64_popcnt(x);
-}
-
-/*
- * Find the last (most significant) bit set.  Undefined for x==0.
- * Bits are numbered from 0..63 (e.g., __fls(9) == 3).
- */
-static inline unsigned long
-__fls (unsigned long x)
-{
-	x |= x >> 1;
-	x |= x >> 2;
-	x |= x >> 4;
-	x |= x >> 8;
-	x |= x >> 16;
-	x |= x >> 32;
-	return ia64_popcnt(x) - 1;
-}
-
-#include <asm-generic/bitops/fls64.h>
-
-#include <asm-generic/bitops/builtin-ffs.h>
-
-/*
- * hweightN: returns the hamming weight (i.e. the number
- * of bits set) of a N-bit word
- */
-static __inline__ unsigned long __arch_hweight64(unsigned long x)
-{
-	unsigned long result;
-	result = ia64_popcnt(x);
-	return result;
-}
-
-#define __arch_hweight32(x) ((unsigned int) __arch_hweight64((x) & 0xfffffffful))
-#define __arch_hweight16(x) ((unsigned int) __arch_hweight64((x) & 0xfffful))
-#define __arch_hweight8(x)  ((unsigned int) __arch_hweight64((x) & 0xfful))
-
-#include <asm-generic/bitops/const_hweight.h>
-
-#endif /* __KERNEL__ */
-
-#ifdef __KERNEL__
-
-#include <asm-generic/bitops/non-instrumented-non-atomic.h>
-
-#include <asm-generic/bitops/le.h>
-
-#include <asm-generic/bitops/ext2-atomic-setbit.h>
-
-#include <asm-generic/bitops/sched.h>
-
-#endif /* __KERNEL__ */
-
-#endif /* _ASM_IA64_BITOPS_H */
diff --git a/arch/ia64/include/asm/bug.h b/arch/ia64/include/asm/bug.h
deleted file mode 100644
index 66b37a532765..000000000000
--- a/arch/ia64/include/asm/bug.h
+++ /dev/null
@@ -1,19 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _ASM_IA64_BUG_H
-#define _ASM_IA64_BUG_H
-
-#ifdef CONFIG_BUG
-#define ia64_abort()	__builtin_trap()
-#define BUG() do {						\
-	printk("kernel BUG at %s:%d!\n", __FILE__, __LINE__);	\
-	barrier_before_unreachable();				\
-	ia64_abort();						\
-} while (0)
-
-/* should this BUG be made generic? */
-#define HAVE_ARCH_BUG
-#endif
-
-#include <asm-generic/bug.h>
-
-#endif
diff --git a/arch/ia64/include/asm/cache.h b/arch/ia64/include/asm/cache.h
deleted file mode 100644
index 2f1c70647068..000000000000
--- a/arch/ia64/include/asm/cache.h
+++ /dev/null
@@ -1,30 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _ASM_IA64_CACHE_H
-#define _ASM_IA64_CACHE_H
-
-
-/*
- * Copyright (C) 1998-2000 Hewlett-Packard Co
- *	David Mosberger-Tang <davidm@hpl.hp.com>
- */
-
-/* Bytes per L1 (data) cache line.  */
-#define L1_CACHE_SHIFT		CONFIG_IA64_L1_CACHE_SHIFT
-#define L1_CACHE_BYTES		(1 << L1_CACHE_SHIFT)
-
-#ifdef CONFIG_SMP
-# define SMP_CACHE_SHIFT	L1_CACHE_SHIFT
-# define SMP_CACHE_BYTES	L1_CACHE_BYTES
-#else
-  /*
-   * The "aligned" directive can only _increase_ alignment, so this is
-   * safe and provides an easy way to avoid wasting space on a
-   * uni-processor:
-   */
-# define SMP_CACHE_SHIFT	3
-# define SMP_CACHE_BYTES	(1 << 3)
-#endif
-
-#define __read_mostly __section(".data..read_mostly")
-
-#endif /* _ASM_IA64_CACHE_H */
diff --git a/arch/ia64/include/asm/cacheflush.h b/arch/ia64/include/asm/cacheflush.h
deleted file mode 100644
index eac493fa9e0d..000000000000
--- a/arch/ia64/include/asm/cacheflush.h
+++ /dev/null
@@ -1,39 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _ASM_IA64_CACHEFLUSH_H
-#define _ASM_IA64_CACHEFLUSH_H
-
-/*
- * Copyright (C) 2002 Hewlett-Packard Co
- *	David Mosberger-Tang <davidm@hpl.hp.com>
- */
-
-#include <linux/page-flags.h>
-#include <linux/bitops.h>
-
-#include <asm/page.h>
-
-#define ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE 1
-static inline void flush_dcache_folio(struct folio *folio)
-{
-	clear_bit(PG_arch_1, &folio->flags);
-}
-#define flush_dcache_folio flush_dcache_folio
-
-static inline void flush_dcache_page(struct page *page)
-{
-	flush_dcache_folio(page_folio(page));
-}
-
-extern void flush_icache_range(unsigned long start, unsigned long end);
-#define flush_icache_range flush_icache_range
-extern void clflush_cache_range(void *addr, int size);
-
-#define flush_icache_user_page(vma, page, user_addr, len)					\
-do {												\
-	unsigned long _addr = (unsigned long) page_address(page) + ((user_addr) & ~PAGE_MASK);	\
-	flush_icache_range(_addr, _addr + (len));						\
-} while (0)
-
-#include <asm-generic/cacheflush.h>
-
-#endif /* _ASM_IA64_CACHEFLUSH_H */
diff --git a/arch/ia64/include/asm/checksum.h b/arch/ia64/include/asm/checksum.h
deleted file mode 100644
index f3026213aa32..000000000000
--- a/arch/ia64/include/asm/checksum.h
+++ /dev/null
@@ -1,63 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _ASM_IA64_CHECKSUM_H
-#define _ASM_IA64_CHECKSUM_H
-
-/*
- * Modified 1998, 1999
- *	David Mosberger-Tang <davidm@hpl.hp.com>, Hewlett-Packard Co
- */
-
-/*
- * This is a version of ip_compute_csum() optimized for IP headers,
- * which always checksum on 4 octet boundaries.
- */
-extern __sum16 ip_fast_csum(const void *iph, unsigned int ihl);
-
-/*
- * Computes the checksum of the TCP/UDP pseudo-header returns a 16-bit
- * checksum, already complemented
- */
-extern __sum16 csum_tcpudp_magic(__be32 saddr, __be32 daddr,
-				 __u32 len, __u8 proto, __wsum sum);
-
-extern __wsum csum_tcpudp_nofold(__be32 saddr, __be32 daddr,
-				 __u32 len, __u8 proto, __wsum sum);
-
-/*
- * Computes the checksum of a memory block at buff, length len,
- * and adds in "sum" (32-bit)
- *
- * returns a 32-bit number suitable for feeding into itself
- * or csum_tcpudp_magic
- *
- * this function must be called with even lengths, except
- * for the last fragment, which may be odd
- *
- * it's best to have buff aligned on a 32-bit boundary
- */
-extern __wsum csum_partial(const void *buff, int len, __wsum sum);
-
-/*
- * This routine is used for miscellaneous IP-like checksums, mainly in
- * icmp.c
- */
-extern __sum16 ip_compute_csum(const void *buff, int len);
-
-/*
- * Fold a partial checksum without adding pseudo headers.
- */
-static inline __sum16 csum_fold(__wsum csum)
-{
-	u32 sum = (__force u32)csum;
-	sum = (sum & 0xffff) + (sum >> 16);
-	sum = (sum & 0xffff) + (sum >> 16);
-	return (__force __sum16)~sum;
-}
-
-#define _HAVE_ARCH_IPV6_CSUM	1
-struct in6_addr;
-extern __sum16 csum_ipv6_magic(const struct in6_addr *saddr,
-			       const struct in6_addr *daddr,
-			       __u32 len, __u8 proto, __wsum csum);
-
-#endif /* _ASM_IA64_CHECKSUM_H */
diff --git a/arch/ia64/include/asm/clocksource.h b/arch/ia64/include/asm/clocksource.h
deleted file mode 100644
index 71a517751afa..000000000000
--- a/arch/ia64/include/asm/clocksource.h
+++ /dev/null
@@ -1,11 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-/* IA64-specific clocksource additions */
-
-#ifndef _ASM_IA64_CLOCKSOURCE_H
-#define _ASM_IA64_CLOCKSOURCE_H
-
-struct arch_clocksource_data {
-	void *fsys_mmio;        /* used by fsyscall asm code */
-};
-
-#endif /* _ASM_IA64_CLOCKSOURCE_H */
diff --git a/arch/ia64/include/asm/cmpxchg.h b/arch/ia64/include/asm/cmpxchg.h
deleted file mode 100644
index d85ee1a0a227..000000000000
--- a/arch/ia64/include/asm/cmpxchg.h
+++ /dev/null
@@ -1,33 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _ASM_IA64_CMPXCHG_H
-#define _ASM_IA64_CMPXCHG_H
-
-#include <uapi/asm/cmpxchg.h>
-
-#define arch_xchg(ptr, x)	\
-({(__typeof__(*(ptr))) __arch_xchg((unsigned long) (x), (ptr), sizeof(*(ptr)));})
-
-#define arch_cmpxchg(ptr, o, n)		cmpxchg_acq((ptr), (o), (n))
-#define arch_cmpxchg64(ptr, o, n)	cmpxchg_acq((ptr), (o), (n))
-
-#define arch_cmpxchg_local		arch_cmpxchg
-#define arch_cmpxchg64_local		arch_cmpxchg64
-
-#ifdef CONFIG_IA64_DEBUG_CMPXCHG
-# define CMPXCHG_BUGCHECK_DECL	int _cmpxchg_bugcheck_count = 128;
-# define CMPXCHG_BUGCHECK(v)						\
-do {									\
-	if (_cmpxchg_bugcheck_count-- <= 0) {				\
-		void *ip;						\
-		extern int _printk(const char *fmt, ...);		\
-		ip = (void *) ia64_getreg(_IA64_REG_IP);		\
-		_printk("CMPXCHG_BUGCHECK: stuck at %p on word %p\n", ip, (v));\
-		break;							\
-	}								\
-} while (0)
-#else /* !CONFIG_IA64_DEBUG_CMPXCHG */
-# define CMPXCHG_BUGCHECK_DECL
-# define CMPXCHG_BUGCHECK(v)
-#endif /* !CONFIG_IA64_DEBUG_CMPXCHG */
-
-#endif /* _ASM_IA64_CMPXCHG_H */
diff --git a/arch/ia64/include/asm/cpu.h b/arch/ia64/include/asm/cpu.h
deleted file mode 100644
index db125df9e088..000000000000
--- a/arch/ia64/include/asm/cpu.h
+++ /dev/null
@@ -1,23 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _ASM_IA64_CPU_H_
-#define _ASM_IA64_CPU_H_
-
-#include <linux/device.h>
-#include <linux/cpu.h>
-#include <linux/topology.h>
-#include <linux/percpu.h>
-
-struct ia64_cpu {
-	struct cpu cpu;
-};
-
-DECLARE_PER_CPU(struct ia64_cpu, cpu_devices);
-
-DECLARE_PER_CPU(int, cpu_state);
-
-#ifdef CONFIG_HOTPLUG_CPU
-extern int arch_register_cpu(int num);
-extern void arch_unregister_cpu(int);
-#endif
-
-#endif /* _ASM_IA64_CPU_H_ */
diff --git a/arch/ia64/include/asm/cputime.h b/arch/ia64/include/asm/cputime.h
deleted file mode 100644
index 7f28c3564d5d..000000000000
--- a/arch/ia64/include/asm/cputime.h
+++ /dev/null
@@ -1,21 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-or-later */
-/*
- * Definitions for measuring cputime on ia64 machines.
- *
- * Based on <asm-powerpc/cputime.h>.
- *
- * Copyright (C) 2007 FUJITSU LIMITED
- * Copyright (C) 2007 Hidetoshi Seto <seto.hidetoshi@jp.fujitsu.com>
- *
- * If we have CONFIG_VIRT_CPU_ACCOUNTING_NATIVE, we measure cpu time in nsec.
- * Otherwise we measure cpu time in jiffies using the generic definitions.
- */
-
-#ifndef __IA64_CPUTIME_H
-#define __IA64_CPUTIME_H
-
-#ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
-extern void arch_vtime_task_switch(struct task_struct *tsk);
-#endif /* CONFIG_VIRT_CPU_ACCOUNTING_NATIVE */
-
-#endif /* __IA64_CPUTIME_H */
diff --git a/arch/ia64/include/asm/current.h b/arch/ia64/include/asm/current.h
deleted file mode 100644
index 86fbcc88dff2..000000000000
--- a/arch/ia64/include/asm/current.h
+++ /dev/null
@@ -1,18 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _ASM_IA64_CURRENT_H
-#define _ASM_IA64_CURRENT_H
-
-/*
- * Modified 1998-2000
- *	David Mosberger-Tang <davidm@hpl.hp.com>, Hewlett-Packard Co
- */
-
-#include <asm/intrinsics.h>
-
-/*
- * In kernel mode, thread pointer (r13) is used to point to the current task
- * structure.
- */
-#define current	((struct task_struct *) ia64_getreg(_IA64_REG_TP))
-
-#endif /* _ASM_IA64_CURRENT_H */
diff --git a/arch/ia64/include/asm/cyclone.h b/arch/ia64/include/asm/cyclone.h
deleted file mode 100644
index a481393647e9..000000000000
--- a/arch/ia64/include/asm/cyclone.h
+++ /dev/null
@@ -1,16 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef ASM_IA64_CYCLONE_H
-#define ASM_IA64_CYCLONE_H
-
-#ifdef	CONFIG_IA64_CYCLONE
-extern int use_cyclone;
-extern void __init cyclone_setup(void);
-#else	/* CONFIG_IA64_CYCLONE */
-#define use_cyclone 0
-static inline void cyclone_setup(void)
-{
-	printk(KERN_ERR "Cyclone Counter: System not configured"
-					" w/ CONFIG_IA64_CYCLONE.\n");
-}
-#endif	/* CONFIG_IA64_CYCLONE */
-#endif	/* !ASM_IA64_CYCLONE_H */
diff --git a/arch/ia64/include/asm/delay.h b/arch/ia64/include/asm/delay.h
deleted file mode 100644
index 0227ac586107..000000000000
--- a/arch/ia64/include/asm/delay.h
+++ /dev/null
@@ -1,89 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _ASM_IA64_DELAY_H
-#define _ASM_IA64_DELAY_H
-
-/*
- * Delay routines using a pre-computed "cycles/usec" value.
- *
- * Copyright (C) 1998, 1999 Hewlett-Packard Co
- *	David Mosberger-Tang <davidm@hpl.hp.com>
- * Copyright (C) 1999 VA Linux Systems
- * Copyright (C) 1999 Walt Drummond <drummond@valinux.com>
- * Copyright (C) 1999 Asit Mallick <asit.k.mallick@intel.com>
- * Copyright (C) 1999 Don Dugger <don.dugger@intel.com>
- */
-
-#include <linux/kernel.h>
-#include <linux/sched.h>
-#include <linux/compiler.h>
-
-#include <asm/intrinsics.h>
-#include <asm/processor.h>
-
-static __inline__ void
-ia64_set_itm (unsigned long val)
-{
-	ia64_setreg(_IA64_REG_CR_ITM, val);
-	ia64_srlz_d();
-}
-
-static __inline__ unsigned long
-ia64_get_itm (void)
-{
-	unsigned long result;
-
-	result = ia64_getreg(_IA64_REG_CR_ITM);
-	ia64_srlz_d();
-	return result;
-}
-
-static __inline__ void
-ia64_set_itv (unsigned long val)
-{
-	ia64_setreg(_IA64_REG_CR_ITV, val);
-	ia64_srlz_d();
-}
-
-static __inline__ unsigned long
-ia64_get_itv (void)
-{
-	return ia64_getreg(_IA64_REG_CR_ITV);
-}
-
-static __inline__ void
-ia64_set_itc (unsigned long val)
-{
-	ia64_setreg(_IA64_REG_AR_ITC, val);
-	ia64_srlz_d();
-}
-
-static __inline__ unsigned long
-ia64_get_itc (void)
-{
-	unsigned long result;
-
-	result = ia64_getreg(_IA64_REG_AR_ITC);
-	ia64_barrier();
-#ifdef CONFIG_ITANIUM
-	while (unlikely((__s32) result == -1)) {
-		result = ia64_getreg(_IA64_REG_AR_ITC);
-		ia64_barrier();
-	}
-#endif
-	return result;
-}
-
-extern void ia64_delay_loop (unsigned long loops);
-
-static __inline__ void
-__delay (unsigned long loops)
-{
-	if (unlikely(loops < 1))
-		return;
-
-	ia64_delay_loop (loops - 1);
-}
-
-extern void udelay (unsigned long usecs);
-
-#endif /* _ASM_IA64_DELAY_H */
diff --git a/arch/ia64/include/asm/device.h b/arch/ia64/include/asm/device.h
deleted file mode 100644
index 918b198cd5bb..000000000000
--- a/arch/ia64/include/asm/device.h
+++ /dev/null
@@ -1,14 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-only */
-/*
- * Arch specific extensions to struct device
- */
-#ifndef _ASM_IA64_DEVICE_H
-#define _ASM_IA64_DEVICE_H
-
-struct dev_archdata {
-};
-
-struct pdev_archdata {
-};
-
-#endif /* _ASM_IA64_DEVICE_H */
diff --git a/arch/ia64/include/asm/div64.h b/arch/ia64/include/asm/div64.h
deleted file mode 100644
index 6cd978cefb28..000000000000
--- a/arch/ia64/include/asm/div64.h
+++ /dev/null
@@ -1 +0,0 @@
-#include <asm-generic/div64.h>
diff --git a/arch/ia64/include/asm/dma-mapping.h b/arch/ia64/include/asm/dma-mapping.h
deleted file mode 100644
index af6fa8e1597c..000000000000
--- a/arch/ia64/include/asm/dma-mapping.h
+++ /dev/null
@@ -1,16 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _ASM_IA64_DMA_MAPPING_H
-#define _ASM_IA64_DMA_MAPPING_H
-
-/*
- * Copyright (C) 2003-2004 Hewlett-Packard Co
- *	David Mosberger-Tang <davidm@hpl.hp.com>
- */
-extern const struct dma_map_ops *dma_ops;
-
-static inline const struct dma_map_ops *get_arch_dma_ops(void)
-{
-	return dma_ops;
-}
-
-#endif /* _ASM_IA64_DMA_MAPPING_H */
diff --git a/arch/ia64/include/asm/dma.h b/arch/ia64/include/asm/dma.h
deleted file mode 100644
index eaed2626ffda..000000000000
--- a/arch/ia64/include/asm/dma.h
+++ /dev/null
@@ -1,17 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _ASM_IA64_DMA_H
-#define _ASM_IA64_DMA_H
-
-/*
- * Copyright (C) 1998-2002 Hewlett-Packard Co
- *	David Mosberger-Tang <davidm@hpl.hp.com>
- */
-
-
-#include <asm/io.h>		/* need byte IO */
-
-extern unsigned long MAX_DMA_ADDRESS;
-
-#define free_dma(x)
-
-#endif /* _ASM_IA64_DMA_H */
diff --git a/arch/ia64/include/asm/dmi.h b/arch/ia64/include/asm/dmi.h
deleted file mode 100644
index ecd9e0a0f5f9..000000000000
--- a/arch/ia64/include/asm/dmi.h
+++ /dev/null
@@ -1,15 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _ASM_DMI_H
-#define _ASM_DMI_H 1
-
-#include <linux/slab.h>
-#include <asm/io.h>
-
-/* Use normal IO mappings for DMI */
-#define dmi_early_remap		ioremap
-#define dmi_early_unmap(x, l)	iounmap(x)
-#define dmi_remap		ioremap
-#define dmi_unmap		iounmap
-#define dmi_alloc(l)		kzalloc(l, GFP_ATOMIC)
-
-#endif
diff --git a/arch/ia64/include/asm/early_ioremap.h b/arch/ia64/include/asm/early_ioremap.h
deleted file mode 100644
index 934191b1e2e3..000000000000
--- a/arch/ia64/include/asm/early_ioremap.h
+++ /dev/null
@@ -1,11 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _ASM_IA64_EARLY_IOREMAP_H
-#define _ASM_IA64_EARLY_IOREMAP_H
-
-extern void __iomem * early_ioremap (unsigned long phys_addr, unsigned long size);
-#define early_memremap(phys_addr, size)        early_ioremap(phys_addr, size)
-
-extern void early_iounmap (volatile void __iomem *addr, unsigned long size);
-#define early_memunmap(addr, size)             early_iounmap(addr, size)
-
-#endif
diff --git a/arch/ia64/include/asm/efi.h b/arch/ia64/include/asm/efi.h
deleted file mode 100644
index 6a4a50d8f19a..000000000000
--- a/arch/ia64/include/asm/efi.h
+++ /dev/null
@@ -1,13 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _ASM_EFI_H
-#define _ASM_EFI_H
-
-typedef int (*efi_freemem_callback_t) (u64 start, u64 end, void *arg);
-
-void *efi_get_pal_addr(void);
-void efi_map_pal_code(void);
-void efi_memmap_walk(efi_freemem_callback_t, void *);
-void efi_memmap_walk_uc(efi_freemem_callback_t, void *);
-void efi_gettimeofday(struct timespec64 *ts);
-
-#endif
diff --git a/arch/ia64/include/asm/elf.h b/arch/ia64/include/asm/elf.h
deleted file mode 100644
index 2ef5f9966ad1..000000000000
--- a/arch/ia64/include/asm/elf.h
+++ /dev/null
@@ -1,233 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _ASM_IA64_ELF_H
-#define _ASM_IA64_ELF_H
-
-/*
- * ELF-specific definitions.
- *
- * Copyright (C) 1998-1999, 2002-2004 Hewlett-Packard Co
- *	David Mosberger-Tang <davidm@hpl.hp.com>
- */
-
-
-#include <asm/fpu.h>
-#include <asm/page.h>
-#include <asm/auxvec.h>
-
-/*
- * This is used to ensure we don't load something for the wrong architecture.
- */
-#define elf_check_arch(x) ((x)->e_machine == EM_IA_64)
-
-/*
- * These are used to set parameters in the core dumps.
- */
-#define ELF_CLASS	ELFCLASS64
-#define ELF_DATA	ELFDATA2LSB
-#define ELF_ARCH	EM_IA_64
-
-#define CORE_DUMP_USE_REGSET
-
-/* Least-significant four bits of ELF header's e_flags are OS-specific.  The bits are
-   interpreted as follows by Linux: */
-#define EF_IA_64_LINUX_EXECUTABLE_STACK	0x1	/* is stack (& heap) executable by default? */
-
-#define ELF_EXEC_PAGESIZE	PAGE_SIZE
-
-/*
- * This is the location that an ET_DYN program is loaded if exec'ed.
- * Typical use of this is to invoke "./ld.so someprog" to test out a
- * new version of the loader.  We need to make sure that it is out of
- * the way of the program that it will "exec", and that there is
- * sufficient room for the brk.
- */
-#define ELF_ET_DYN_BASE		(TASK_UNMAPPED_BASE + 0x800000000UL)
-
-#define PT_IA_64_UNWIND		0x70000001
-
-/* IA-64 relocations: */
-#define R_IA64_NONE		0x00	/* none */
-#define R_IA64_IMM14		0x21	/* symbol + addend, add imm14 */
-#define R_IA64_IMM22		0x22	/* symbol + addend, add imm22 */
-#define R_IA64_IMM64		0x23	/* symbol + addend, mov imm64 */
-#define R_IA64_DIR32MSB		0x24	/* symbol + addend, data4 MSB */
-#define R_IA64_DIR32LSB		0x25	/* symbol + addend, data4 LSB */
-#define R_IA64_DIR64MSB		0x26	/* symbol + addend, data8 MSB */
-#define R_IA64_DIR64LSB		0x27	/* symbol + addend, data8 LSB */
-#define R_IA64_GPREL22		0x2a	/* @gprel(sym+add), add imm22 */
-#define R_IA64_GPREL64I		0x2b	/* @gprel(sym+add), mov imm64 */
-#define R_IA64_GPREL32MSB	0x2c	/* @gprel(sym+add), data4 MSB */
-#define R_IA64_GPREL32LSB	0x2d	/* @gprel(sym+add), data4 LSB */
-#define R_IA64_GPREL64MSB	0x2e	/* @gprel(sym+add), data8 MSB */
-#define R_IA64_GPREL64LSB	0x2f	/* @gprel(sym+add), data8 LSB */
-#define R_IA64_LTOFF22		0x32	/* @ltoff(sym+add), add imm22 */
-#define R_IA64_LTOFF64I		0x33	/* @ltoff(sym+add), mov imm64 */
-#define R_IA64_PLTOFF22		0x3a	/* @pltoff(sym+add), add imm22 */
-#define R_IA64_PLTOFF64I	0x3b	/* @pltoff(sym+add), mov imm64 */
-#define R_IA64_PLTOFF64MSB	0x3e	/* @pltoff(sym+add), data8 MSB */
-#define R_IA64_PLTOFF64LSB	0x3f	/* @pltoff(sym+add), data8 LSB */
-#define R_IA64_FPTR64I		0x43	/* @fptr(sym+add), mov imm64 */
-#define R_IA64_FPTR32MSB	0x44	/* @fptr(sym+add), data4 MSB */
-#define R_IA64_FPTR32LSB	0x45	/* @fptr(sym+add), data4 LSB */
-#define R_IA64_FPTR64MSB	0x46	/* @fptr(sym+add), data8 MSB */
-#define R_IA64_FPTR64LSB	0x47	/* @fptr(sym+add), data8 LSB */
-#define R_IA64_PCREL60B		0x48	/* @pcrel(sym+add), brl */
-#define R_IA64_PCREL21B		0x49	/* @pcrel(sym+add), ptb, call */
-#define R_IA64_PCREL21M		0x4a	/* @pcrel(sym+add), chk.s */
-#define R_IA64_PCREL21F		0x4b	/* @pcrel(sym+add), fchkf */
-#define R_IA64_PCREL32MSB	0x4c	/* @pcrel(sym+add), data4 MSB */
-#define R_IA64_PCREL32LSB	0x4d	/* @pcrel(sym+add), data4 LSB */
-#define R_IA64_PCREL64MSB	0x4e	/* @pcrel(sym+add), data8 MSB */
-#define R_IA64_PCREL64LSB	0x4f	/* @pcrel(sym+add), data8 LSB */
-#define R_IA64_LTOFF_FPTR22	0x52	/* @ltoff(@fptr(s+a)), imm22 */
-#define R_IA64_LTOFF_FPTR64I	0x53	/* @ltoff(@fptr(s+a)), imm64 */
-#define R_IA64_LTOFF_FPTR32MSB	0x54	/* @ltoff(@fptr(s+a)), 4 MSB */
-#define R_IA64_LTOFF_FPTR32LSB	0x55	/* @ltoff(@fptr(s+a)), 4 LSB */
-#define R_IA64_LTOFF_FPTR64MSB	0x56	/* @ltoff(@fptr(s+a)), 8 MSB */
-#define R_IA64_LTOFF_FPTR64LSB	0x57	/* @ltoff(@fptr(s+a)), 8 LSB */
-#define R_IA64_SEGREL32MSB	0x5c	/* @segrel(sym+add), data4 MSB */
-#define R_IA64_SEGREL32LSB	0x5d	/* @segrel(sym+add), data4 LSB */
-#define R_IA64_SEGREL64MSB	0x5e	/* @segrel(sym+add), data8 MSB */
-#define R_IA64_SEGREL64LSB	0x5f	/* @segrel(sym+add), data8 LSB */
-#define R_IA64_SECREL32MSB	0x64	/* @secrel(sym+add), data4 MSB */
-#define R_IA64_SECREL32LSB	0x65	/* @secrel(sym+add), data4 LSB */
-#define R_IA64_SECREL64MSB	0x66	/* @secrel(sym+add), data8 MSB */
-#define R_IA64_SECREL64LSB	0x67	/* @secrel(sym+add), data8 LSB */
-#define R_IA64_REL32MSB		0x6c	/* data 4 + REL */
-#define R_IA64_REL32LSB		0x6d	/* data 4 + REL */
-#define R_IA64_REL64MSB		0x6e	/* data 8 + REL */
-#define R_IA64_REL64LSB		0x6f	/* data 8 + REL */
-#define R_IA64_LTV32MSB		0x74	/* symbol + addend, data4 MSB */
-#define R_IA64_LTV32LSB		0x75	/* symbol + addend, data4 LSB */
-#define R_IA64_LTV64MSB		0x76	/* symbol + addend, data8 MSB */
-#define R_IA64_LTV64LSB		0x77	/* symbol + addend, data8 LSB */
-#define R_IA64_PCREL21BI	0x79	/* @pcrel(sym+add), ptb, call */
-#define R_IA64_PCREL22		0x7a	/* @pcrel(sym+add), imm22 */
-#define R_IA64_PCREL64I		0x7b	/* @pcrel(sym+add), imm64 */
-#define R_IA64_IPLTMSB		0x80	/* dynamic reloc, imported PLT, MSB */
-#define R_IA64_IPLTLSB		0x81	/* dynamic reloc, imported PLT, LSB */
-#define R_IA64_COPY		0x84	/* dynamic reloc, data copy */
-#define R_IA64_SUB		0x85	/* -symbol + addend, add imm22 */
-#define R_IA64_LTOFF22X		0x86	/* LTOFF22, relaxable.  */
-#define R_IA64_LDXMOV		0x87	/* Use of LTOFF22X.  */
-#define R_IA64_TPREL14		0x91	/* @tprel(sym+add), add imm14 */
-#define R_IA64_TPREL22		0x92	/* @tprel(sym+add), add imm22 */
-#define R_IA64_TPREL64I		0x93	/* @tprel(sym+add), add imm64 */
-#define R_IA64_TPREL64MSB	0x96	/* @tprel(sym+add), data8 MSB */
-#define R_IA64_TPREL64LSB	0x97	/* @tprel(sym+add), data8 LSB */
-#define R_IA64_LTOFF_TPREL22	0x9a	/* @ltoff(@tprel(s+a)), add imm22 */
-#define R_IA64_DTPMOD64MSB	0xa6	/* @dtpmod(sym+add), data8 MSB */
-#define R_IA64_DTPMOD64LSB	0xa7	/* @dtpmod(sym+add), data8 LSB */
-#define R_IA64_LTOFF_DTPMOD22	0xaa	/* @ltoff(@dtpmod(s+a)), imm22 */
-#define R_IA64_DTPREL14		0xb1	/* @dtprel(sym+add), imm14 */
-#define R_IA64_DTPREL22		0xb2	/* @dtprel(sym+add), imm22 */
-#define R_IA64_DTPREL64I	0xb3	/* @dtprel(sym+add), imm64 */
-#define R_IA64_DTPREL32MSB	0xb4	/* @dtprel(sym+add), data4 MSB */
-#define R_IA64_DTPREL32LSB	0xb5	/* @dtprel(sym+add), data4 LSB */
-#define R_IA64_DTPREL64MSB	0xb6	/* @dtprel(sym+add), data8 MSB */
-#define R_IA64_DTPREL64LSB	0xb7	/* @dtprel(sym+add), data8 LSB */
-#define R_IA64_LTOFF_DTPREL22	0xba	/* @ltoff(@dtprel(s+a)), imm22 */
-
-/* IA-64 specific section flags: */
-#define SHF_IA_64_SHORT		0x10000000	/* section near gp */
-
-/*
- * We use (abuse?) this macro to insert the (empty) vm_area that is
- * used to map the register backing store.  I don't see any better
- * place to do this, but we should discuss this with Linus once we can
- * talk to him...
- */
-extern void ia64_init_addr_space (void);
-#define ELF_PLAT_INIT(_r, load_addr)	ia64_init_addr_space()
-
-/* ELF register definitions.  This is needed for core dump support.  */
-
-/*
- * elf_gregset_t contains the application-level state in the following order:
- *	r0-r31
- *	NaT bits (for r0-r31; bit N == 1 iff rN is a NaT)
- *	predicate registers (p0-p63)
- *	b0-b7
- *	ip cfm psr
- *	ar.rsc ar.bsp ar.bspstore ar.rnat
- *	ar.ccv ar.unat ar.fpsr ar.pfs ar.lc ar.ec ar.csd ar.ssd
- */
-#define ELF_NGREG	128	/* we really need just 72 but let's leave some headroom... */
-#define ELF_NFPREG	128	/* f0 and f1 could be omitted, but so what... */
-
-/* elf_gregset_t register offsets */
-#define ELF_GR_0_OFFSET     0
-#define ELF_NAT_OFFSET     (32 * sizeof(elf_greg_t))
-#define ELF_PR_OFFSET      (33 * sizeof(elf_greg_t))
-#define ELF_BR_0_OFFSET    (34 * sizeof(elf_greg_t))
-#define ELF_CR_IIP_OFFSET  (42 * sizeof(elf_greg_t))
-#define ELF_CFM_OFFSET     (43 * sizeof(elf_greg_t))
-#define ELF_CR_IPSR_OFFSET (44 * sizeof(elf_greg_t))
-#define ELF_GR_OFFSET(i)   (ELF_GR_0_OFFSET + i * sizeof(elf_greg_t))
-#define ELF_BR_OFFSET(i)   (ELF_BR_0_OFFSET + i * sizeof(elf_greg_t))
-#define ELF_AR_RSC_OFFSET  (45 * sizeof(elf_greg_t))
-#define ELF_AR_BSP_OFFSET  (46 * sizeof(elf_greg_t))
-#define ELF_AR_BSPSTORE_OFFSET (47 * sizeof(elf_greg_t))
-#define ELF_AR_RNAT_OFFSET (48 * sizeof(elf_greg_t))
-#define ELF_AR_CCV_OFFSET  (49 * sizeof(elf_greg_t))
-#define ELF_AR_UNAT_OFFSET (50 * sizeof(elf_greg_t))
-#define ELF_AR_FPSR_OFFSET (51 * sizeof(elf_greg_t))
-#define ELF_AR_PFS_OFFSET  (52 * sizeof(elf_greg_t))
-#define ELF_AR_LC_OFFSET   (53 * sizeof(elf_greg_t))
-#define ELF_AR_EC_OFFSET   (54 * sizeof(elf_greg_t))
-#define ELF_AR_CSD_OFFSET  (55 * sizeof(elf_greg_t))
-#define ELF_AR_SSD_OFFSET  (56 * sizeof(elf_greg_t))
-#define ELF_AR_END_OFFSET  (57 * sizeof(elf_greg_t))
-
-typedef unsigned long elf_greg_t;
-typedef elf_greg_t elf_gregset_t[ELF_NGREG];
-
-typedef struct ia64_fpreg elf_fpreg_t;
-typedef elf_fpreg_t elf_fpregset_t[ELF_NFPREG];
-
-
-
-struct pt_regs;	/* forward declaration... */
-extern void ia64_elf_core_copy_regs (struct pt_regs *src, elf_gregset_t dst);
-#define ELF_CORE_COPY_REGS(_dest,_regs)	ia64_elf_core_copy_regs(_regs, _dest);
-
-/* This macro yields a bitmask that programs can use to figure out
-   what instruction set this CPU supports.  */
-#define ELF_HWCAP 	0
-
-/* This macro yields a string that ld.so will use to load
-   implementation specific libraries for optimization.  Not terribly
-   relevant until we have real hardware to play with... */
-#define ELF_PLATFORM	NULL
-
-#define elf_read_implies_exec(ex, executable_stack)					\
-	((executable_stack!=EXSTACK_DISABLE_X) && ((ex).e_flags & EF_IA_64_LINUX_EXECUTABLE_STACK) != 0)
-
-struct task_struct;
-
-#define GATE_EHDR	((const struct elfhdr *) GATE_ADDR)
-
-/* update AT_VECTOR_SIZE_ARCH if the number of NEW_AUX_ENT entries changes */
-#define ARCH_DLINFO								\
-do {										\
-	extern char __kernel_syscall_via_epc[];					\
-	NEW_AUX_ENT(AT_SYSINFO, (unsigned long) __kernel_syscall_via_epc);	\
-	NEW_AUX_ENT(AT_SYSINFO_EHDR, (unsigned long) GATE_EHDR);		\
-} while (0)
-
-/*
- * format for entries in the Global Offset Table
- */
-struct got_entry {
-	uint64_t val;
-};
-
-/*
- * Layout of the Function Descriptor
- */
-struct fdesc {
-	uint64_t addr;
-	uint64_t gp;
-};
-
-#endif /* _ASM_IA64_ELF_H */
diff --git a/arch/ia64/include/asm/emergency-restart.h b/arch/ia64/include/asm/emergency-restart.h
deleted file mode 100644
index 108d8c48e42e..000000000000
--- a/arch/ia64/include/asm/emergency-restart.h
+++ /dev/null
@@ -1,6 +0,0 @@
-#ifndef _ASM_EMERGENCY_RESTART_H
-#define _ASM_EMERGENCY_RESTART_H
-
-#include <asm-generic/emergency-restart.h>
-
-#endif /* _ASM_EMERGENCY_RESTART_H */
diff --git a/arch/ia64/include/asm/esi.h b/arch/ia64/include/asm/esi.h
deleted file mode 100644
index 56d1310af06e..000000000000
--- a/arch/ia64/include/asm/esi.h
+++ /dev/null
@@ -1,30 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-/*
- * ESI service calls.
- *
- * Copyright (c) Copyright 2005-2006 Hewlett-Packard Development Company, L.P.
- * 	Alex Williamson <alex.williamson@hp.com>
- */
-#ifndef esi_h
-#define esi_h
-
-#include <linux/efi.h>
-
-#define ESI_QUERY			0x00000001
-#define ESI_OPEN_HANDLE			0x02000000
-#define ESI_CLOSE_HANDLE		0x02000001
-
-enum esi_proc_type {
-	ESI_PROC_SERIALIZED,	/* calls need to be serialized */
-	ESI_PROC_MP_SAFE,	/* MP-safe, but not reentrant */
-	ESI_PROC_REENTRANT	/* MP-safe and reentrant */
-};
-
-extern struct ia64_sal_retval esi_call_phys (void *, u64 *);
-extern int ia64_esi_call(efi_guid_t, struct ia64_sal_retval *,
-			 enum esi_proc_type,
-			 u64, u64, u64, u64, u64, u64, u64, u64);
-extern int ia64_esi_call_phys(efi_guid_t, struct ia64_sal_retval *, u64, u64,
-                              u64, u64, u64, u64, u64, u64);
-
-#endif /* esi_h */
diff --git a/arch/ia64/include/asm/exception.h b/arch/ia64/include/asm/exception.h
deleted file mode 100644
index 1d5df8116a31..000000000000
--- a/arch/ia64/include/asm/exception.h
+++ /dev/null
@@ -1,23 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-only */
-#ifndef __ASM_EXCEPTION_H
-#define __ASM_EXCEPTION_H
-
-struct pt_regs;
-struct exception_table_entry;
-
-extern void ia64_handle_exception(struct pt_regs *regs,
-				  const struct exception_table_entry *e);
-
-#define ia64_done_with_exception(regs)					  \
-({									  \
-	int __ex_ret = 0;						  \
-	const struct exception_table_entry *e;				  \
-	e = search_exception_tables((regs)->cr_iip + ia64_psr(regs)->ri); \
-	if (e) {							  \
-		ia64_handle_exception(regs, e);				  \
-		__ex_ret = 1;						  \
-	}								  \
-	__ex_ret;							  \
-})
-
-#endif	/* __ASM_EXCEPTION_H */
diff --git a/arch/ia64/include/asm/extable.h b/arch/ia64/include/asm/extable.h
deleted file mode 100644
index 83eac6aa0639..000000000000
--- a/arch/ia64/include/asm/extable.h
+++ /dev/null
@@ -1,12 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _ASM_IA64_EXTABLE_H
-#define _ASM_IA64_EXTABLE_H
-
-#define ARCH_HAS_RELATIVE_EXTABLE
-
-struct exception_table_entry {
-	int insn;	/* location-relative address of insn this fixup is for */
-	int fixup;	/* location-relative continuation addr.; if bit 2 is set, r9 is set to 0 */
-};
-
-#endif
diff --git a/arch/ia64/include/asm/fb.h b/arch/ia64/include/asm/fb.h
deleted file mode 100644
index 1717b26fd423..000000000000
--- a/arch/ia64/include/asm/fb.h
+++ /dev/null
@@ -1,43 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _ASM_FB_H_
-#define _ASM_FB_H_
-
-#include <linux/compiler.h>
-#include <linux/efi.h>
-#include <linux/string.h>
-
-#include <asm/page.h>
-
-struct file;
-
-static inline void fb_pgprotect(struct file *file, struct vm_area_struct *vma,
-				unsigned long off)
-{
-	if (efi_range_is_wc(vma->vm_start, vma->vm_end - vma->vm_start))
-		vma->vm_page_prot = pgprot_writecombine(vma->vm_page_prot);
-	else
-		vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
-}
-#define fb_pgprotect fb_pgprotect
-
-static inline void fb_memcpy_fromio(void *to, const volatile void __iomem *from, size_t n)
-{
-	memcpy(to, (void __force *)from, n);
-}
-#define fb_memcpy_fromio fb_memcpy_fromio
-
-static inline void fb_memcpy_toio(volatile void __iomem *to, const void *from, size_t n)
-{
-	memcpy((void __force *)to, from, n);
-}
-#define fb_memcpy_toio fb_memcpy_toio
-
-static inline void fb_memset_io(volatile void __iomem *addr, int c, size_t n)
-{
-	memset((void __force *)addr, c, n);
-}
-#define fb_memset fb_memset_io
-
-#include <asm-generic/fb.h>
-
-#endif /* _ASM_FB_H_ */
diff --git a/arch/ia64/include/asm/fpswa.h b/arch/ia64/include/asm/fpswa.h
deleted file mode 100644
index 2a0c23728b26..000000000000
--- a/arch/ia64/include/asm/fpswa.h
+++ /dev/null
@@ -1,74 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _ASM_IA64_FPSWA_H
-#define _ASM_IA64_FPSWA_H
-
-/*
- * Floating-point Software Assist
- *
- * Copyright (C) 1999 Intel Corporation.
- * Copyright (C) 1999 Asit Mallick <asit.k.mallick@intel.com>
- * Copyright (C) 1999 Goutham Rao <goutham.rao@intel.com>
- */
-
-typedef struct {
-	/* 4 * 128 bits */
-	unsigned long fp_lp[4*2];
-} fp_state_low_preserved_t;
-
-typedef struct {
-	/* 10 * 128 bits */
-	unsigned long fp_lv[10 * 2];
-} fp_state_low_volatile_t;
-
-typedef	struct {
-	/* 16 * 128 bits */
-	unsigned long fp_hp[16 * 2];
-} fp_state_high_preserved_t;
-
-typedef struct {
-	/* 96 * 128 bits */
-	unsigned long fp_hv[96 * 2];
-} fp_state_high_volatile_t;
-
-/**
- * floating point state to be passed to the FP emulation library by
- * the trap/fault handler
- */
-typedef struct {
-	unsigned long			bitmask_low64;
-	unsigned long			bitmask_high64;
-	fp_state_low_preserved_t	*fp_state_low_preserved;
-	fp_state_low_volatile_t		*fp_state_low_volatile;
-	fp_state_high_preserved_t	*fp_state_high_preserved;
-	fp_state_high_volatile_t	*fp_state_high_volatile;
-} fp_state_t;
-
-typedef struct {
-	unsigned long status;
-	unsigned long err0;
-	unsigned long err1;
-	unsigned long err2;
-} fpswa_ret_t;
-
-/**
- * function header for the Floating Point software assist
- * library. This function is invoked by the Floating point software
- * assist trap/fault handler.
- */
-typedef fpswa_ret_t (*efi_fpswa_t) (unsigned long trap_type, void *bundle, unsigned long *ipsr,
-				    unsigned long *fsr, unsigned long *isr, unsigned long *preds,
-				    unsigned long *ifs, fp_state_t *fp_state);
-
-/**
- * This is the FPSWA library interface as defined by EFI.  We need to pass a 
- * pointer to the interface itself on a call to the assist library
- */
-typedef struct {
-	unsigned int	 revision;
-	unsigned int	 reserved;
-	efi_fpswa_t	 fpswa;
-} fpswa_interface_t;
-
-extern fpswa_interface_t *fpswa_interface;
-
-#endif /* _ASM_IA64_FPSWA_H */
diff --git a/arch/ia64/include/asm/ftrace.h b/arch/ia64/include/asm/ftrace.h
deleted file mode 100644
index a07a8e575453..000000000000
--- a/arch/ia64/include/asm/ftrace.h
+++ /dev/null
@@ -1,28 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _ASM_IA64_FTRACE_H
-#define _ASM_IA64_FTRACE_H
-
-#ifdef CONFIG_FUNCTION_TRACER
-#define MCOUNT_INSN_SIZE        32 /* sizeof mcount call */
-
-#ifndef __ASSEMBLY__
-extern void _mcount(unsigned long pfs, unsigned long r1, unsigned long b0, unsigned long r0);
-#define mcount _mcount
-
-/* In IA64, MCOUNT_ADDR is set in link time, so it's not a constant at compile time */
-#define MCOUNT_ADDR (((struct fnptr *)mcount)->ip)
-#define FTRACE_ADDR (((struct fnptr *)ftrace_caller)->ip)
-
-static inline unsigned long ftrace_call_adjust(unsigned long addr)
-{
-	/* second bundle, insn 2 */
-	return addr - 0x12;
-}
-
-struct dyn_arch_ftrace {
-};
-#endif
-
-#endif /* CONFIG_FUNCTION_TRACER */
-
-#endif /* _ASM_IA64_FTRACE_H */
diff --git a/arch/ia64/include/asm/futex.h b/arch/ia64/include/asm/futex.h
deleted file mode 100644
index 1db26b432d8c..000000000000
--- a/arch/ia64/include/asm/futex.h
+++ /dev/null
@@ -1,109 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _ASM_FUTEX_H
-#define _ASM_FUTEX_H
-
-#include <linux/futex.h>
-#include <linux/uaccess.h>
-#include <asm/errno.h>
-
-#define __futex_atomic_op1(insn, ret, oldval, uaddr, oparg) \
-do {									\
-	register unsigned long r8 __asm ("r8") = 0;			\
-	__asm__ __volatile__(						\
-		"	mf;;					\n"	\
-		"[1:] "	insn ";;				\n"	\
-		"	.xdata4 \"__ex_table\", 1b-., 2f-.	\n"	\
-		"[2:]"							\
-		: "+r" (r8), "=r" (oldval)				\
-		: "r" (uaddr), "r" (oparg)				\
-		: "memory");						\
-	ret = r8;							\
-} while (0)
-
-#define __futex_atomic_op2(insn, ret, oldval, uaddr, oparg) \
-do {									\
-	register unsigned long r8 __asm ("r8") = 0;			\
-	int val, newval;						\
-	do {								\
-		__asm__ __volatile__(					\
-			"	mf;;				  \n"	\
-			"[1:]	ld4 %3=[%4];;			  \n"	\
-			"	mov %2=%3			  \n"	\
-				insn	";;			  \n"	\
-			"	mov ar.ccv=%2;;			  \n"	\
-			"[2:]	cmpxchg4.acq %1=[%4],%3,ar.ccv;;  \n"	\
-			"	.xdata4 \"__ex_table\", 1b-., 3f-.\n"	\
-			"	.xdata4 \"__ex_table\", 2b-., 3f-.\n"	\
-			"[3:]"						\
-			: "+r" (r8), "=r" (val), "=&r" (oldval),	\
-			   "=&r" (newval)				\
-			: "r" (uaddr), "r" (oparg)			\
-			: "memory");					\
-		if (unlikely (r8))					\
-			break;						\
-	} while (unlikely (val != oldval));				\
-	ret = r8;							\
-} while (0)
-
-static inline int
-arch_futex_atomic_op_inuser(int op, int oparg, int *oval, u32 __user *uaddr)
-{
-	int oldval = 0, ret;
-
-	if (!access_ok(uaddr, sizeof(u32)))
-		return -EFAULT;
-
-	switch (op) {
-	case FUTEX_OP_SET:
-		__futex_atomic_op1("xchg4 %1=[%2],%3", ret, oldval, uaddr,
-				   oparg);
-		break;
-	case FUTEX_OP_ADD:
-		__futex_atomic_op2("add %3=%3,%5", ret, oldval, uaddr, oparg);
-		break;
-	case FUTEX_OP_OR:
-		__futex_atomic_op2("or %3=%3,%5", ret, oldval, uaddr, oparg);
-		break;
-	case FUTEX_OP_ANDN:
-		__futex_atomic_op2("and %3=%3,%5", ret, oldval, uaddr,
-				   ~oparg);
-		break;
-	case FUTEX_OP_XOR:
-		__futex_atomic_op2("xor %3=%3,%5", ret, oldval, uaddr, oparg);
-		break;
-	default:
-		ret = -ENOSYS;
-	}
-
-	if (!ret)
-		*oval = oldval;
-
-	return ret;
-}
-
-static inline int
-futex_atomic_cmpxchg_inatomic(u32 *uval, u32 __user *uaddr,
-			      u32 oldval, u32 newval)
-{
-	if (!access_ok(uaddr, sizeof(u32)))
-		return -EFAULT;
-
-	{
-		register unsigned long r8 __asm ("r8") = 0;
-		unsigned long prev;
-		__asm__ __volatile__(
-			"	mf;;					\n"
-			"	mov ar.ccv=%4;;				\n"
-			"[1:]	cmpxchg4.acq %1=[%2],%3,ar.ccv		\n"
-			"	.xdata4 \"__ex_table\", 1b-., 2f-.	\n"
-			"[2:]"
-			: "+r" (r8), "=&r" (prev)
-			: "r" (uaddr), "r" (newval),
-			  "rO" ((long) (unsigned) oldval)
-			: "memory");
-		*uval = prev;
-		return r8;
-	}
-}
-
-#endif /* _ASM_FUTEX_H */
diff --git a/arch/ia64/include/asm/gcc_intrin.h b/arch/ia64/include/asm/gcc_intrin.h
deleted file mode 100644
index 83f230b23867..000000000000
--- a/arch/ia64/include/asm/gcc_intrin.h
+++ /dev/null
@@ -1,13 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-/*
- *
- * Copyright (C) 2002,2003 Jun Nakajima <jun.nakajima@intel.com>
- * Copyright (C) 2002,2003 Suresh Siddha <suresh.b.siddha@intel.com>
- */
-#ifndef _ASM_IA64_GCC_INTRIN_H
-#define _ASM_IA64_GCC_INTRIN_H
-
-#include <uapi/asm/gcc_intrin.h>
-
-register unsigned long ia64_r13 asm ("r13") __used;
-#endif /* _ASM_IA64_GCC_INTRIN_H */
diff --git a/arch/ia64/include/asm/hardirq.h b/arch/ia64/include/asm/hardirq.h
deleted file mode 100644
index ccde7c2ba00f..000000000000
--- a/arch/ia64/include/asm/hardirq.h
+++ /dev/null
@@ -1,27 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _ASM_IA64_HARDIRQ_H
-#define _ASM_IA64_HARDIRQ_H
-
-/*
- * Modified 1998-2002, 2004 Hewlett-Packard Co
- *	David Mosberger-Tang <davidm@hpl.hp.com>
- */
-
-/*
- * No irq_cpustat_t for IA-64.  The data is held in the per-CPU data structure.
- */
-
-#define __ARCH_IRQ_STAT	1
-
-#define local_softirq_pending_ref	ia64_cpu_info.softirq_pending
-
-#include <linux/threads.h>
-#include <linux/irq.h>
-
-#include <asm/processor.h>
-
-extern void __iomem *ipi_base_addr;
-
-void ack_bad_irq(unsigned int irq);
-
-#endif /* _ASM_IA64_HARDIRQ_H */
diff --git a/arch/ia64/include/asm/hugetlb.h b/arch/ia64/include/asm/hugetlb.h
deleted file mode 100644
index 026ead47cd53..000000000000
--- a/arch/ia64/include/asm/hugetlb.h
+++ /dev/null
@@ -1,34 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _ASM_IA64_HUGETLB_H
-#define _ASM_IA64_HUGETLB_H
-
-#include <asm/page.h>
-
-#define __HAVE_ARCH_HUGETLB_FREE_PGD_RANGE
-void hugetlb_free_pgd_range(struct mmu_gather *tlb, unsigned long addr,
-			    unsigned long end, unsigned long floor,
-			    unsigned long ceiling);
-
-#define __HAVE_ARCH_PREPARE_HUGEPAGE_RANGE
-int prepare_hugepage_range(struct file *file,
-			unsigned long addr, unsigned long len);
-
-static inline int is_hugepage_only_range(struct mm_struct *mm,
-					 unsigned long addr,
-					 unsigned long len)
-{
-	return (REGION_NUMBER(addr) == RGN_HPAGE ||
-		REGION_NUMBER((addr)+(len)-1) == RGN_HPAGE);
-}
-#define is_hugepage_only_range is_hugepage_only_range
-
-#define __HAVE_ARCH_HUGE_PTEP_CLEAR_FLUSH
-static inline pte_t huge_ptep_clear_flush(struct vm_area_struct *vma,
-					  unsigned long addr, pte_t *ptep)
-{
-	return *ptep;
-}
-
-#include <asm-generic/hugetlb.h>
-
-#endif /* _ASM_IA64_HUGETLB_H */
diff --git a/arch/ia64/include/asm/hw_irq.h b/arch/ia64/include/asm/hw_irq.h
deleted file mode 100644
index 5d267132f8cb..000000000000
--- a/arch/ia64/include/asm/hw_irq.h
+++ /dev/null
@@ -1,167 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _ASM_IA64_HW_IRQ_H
-#define _ASM_IA64_HW_IRQ_H
-
-/*
- * Copyright (C) 2001-2003 Hewlett-Packard Co
- *	David Mosberger-Tang <davidm@hpl.hp.com>
- */
-
-#include <linux/interrupt.h>
-#include <linux/sched.h>
-#include <linux/types.h>
-#include <linux/profile.h>
-
-#include <asm/ptrace.h>
-#include <asm/smp.h>
-
-typedef u8 ia64_vector;
-
-/*
- * 0 special
- *
- * 1,3-14 are reserved from firmware
- *
- * 16-255 (vectored external interrupts) are available
- *
- * 15 spurious interrupt (see IVR)
- *
- * 16 lowest priority, 255 highest priority
- *
- * 15 classes of 16 interrupts each.
- */
-#define IA64_MIN_VECTORED_IRQ		 16
-#define IA64_MAX_VECTORED_IRQ		255
-#define IA64_NUM_VECTORS		256
-
-#define AUTO_ASSIGN			-1
-
-#define IA64_SPURIOUS_INT_VECTOR	0x0f
-
-/*
- * Vectors 0x10-0x1f are used for low priority interrupts, e.g. CMCI.
- */
-#define IA64_CPEP_VECTOR		0x1c	/* corrected platform error polling vector */
-#define IA64_CMCP_VECTOR		0x1d	/* corrected machine-check polling vector */
-#define IA64_CPE_VECTOR			0x1e	/* corrected platform error interrupt vector */
-#define IA64_CMC_VECTOR			0x1f	/* corrected machine-check interrupt vector */
-/*
- * Vectors 0x20-0x2f are reserved for legacy ISA IRQs.
- * Use vectors 0x30-0xe7 as the default device vector range for ia64.
- * Platforms may choose to reduce this range in platform_irq_setup, but the
- * platform range must fall within
- *	[IA64_DEF_FIRST_DEVICE_VECTOR..IA64_DEF_LAST_DEVICE_VECTOR]
- */
-extern int ia64_first_device_vector;
-extern int ia64_last_device_vector;
-
-#ifdef CONFIG_SMP
-/* Reserve the lower priority vector than device vectors for "move IRQ" IPI */
-#define IA64_IRQ_MOVE_VECTOR		0x30	/* "move IRQ" IPI */
-#define IA64_DEF_FIRST_DEVICE_VECTOR	0x31
-#else
-#define IA64_DEF_FIRST_DEVICE_VECTOR	0x30
-#endif
-#define IA64_DEF_LAST_DEVICE_VECTOR	0xe7
-#define IA64_FIRST_DEVICE_VECTOR	ia64_first_device_vector
-#define IA64_LAST_DEVICE_VECTOR		ia64_last_device_vector
-#define IA64_MAX_DEVICE_VECTORS		(IA64_DEF_LAST_DEVICE_VECTOR - IA64_DEF_FIRST_DEVICE_VECTOR + 1)
-#define IA64_NUM_DEVICE_VECTORS		(IA64_LAST_DEVICE_VECTOR - IA64_FIRST_DEVICE_VECTOR + 1)
-
-#define IA64_MCA_RENDEZ_VECTOR		0xe8	/* MCA rendez interrupt */
-#define IA64_TIMER_VECTOR		0xef	/* use highest-prio group 15 interrupt for timer */
-#define	IA64_MCA_WAKEUP_VECTOR		0xf0	/* MCA wakeup (must be >MCA_RENDEZ_VECTOR) */
-#define IA64_IPI_LOCAL_TLB_FLUSH	0xfc	/* SMP flush local TLB */
-#define IA64_IPI_RESCHEDULE		0xfd	/* SMP reschedule */
-#define IA64_IPI_VECTOR			0xfe	/* inter-processor interrupt vector */
-
-/* Used for encoding redirected irqs */
-
-#define IA64_IRQ_REDIRECTED		(1 << 31)
-
-/* IA64 inter-cpu interrupt related definitions */
-
-#define IA64_IPI_DEFAULT_BASE_ADDR	0xfee00000
-
-/* Delivery modes for inter-cpu interrupts */
-enum {
-        IA64_IPI_DM_INT =       0x0,    /* pend an external interrupt */
-        IA64_IPI_DM_PMI =       0x2,    /* pend a PMI */
-        IA64_IPI_DM_NMI =       0x4,    /* pend an NMI (vector 2) */
-        IA64_IPI_DM_INIT =      0x5,    /* pend an INIT interrupt */
-        IA64_IPI_DM_EXTINT =    0x7,    /* pend an 8259-compatible interrupt. */
-};
-
-extern __u8 isa_irq_to_vector_map[16];
-#define isa_irq_to_vector(x)	isa_irq_to_vector_map[(x)]
-
-struct irq_cfg {
-	ia64_vector vector;
-	cpumask_t domain;
-	cpumask_t old_domain;
-	unsigned move_cleanup_count;
-	u8 move_in_progress : 1;
-};
-extern spinlock_t vector_lock;
-extern struct irq_cfg irq_cfg[NR_IRQS];
-#define irq_to_domain(x)	irq_cfg[(x)].domain
-DECLARE_PER_CPU(int[IA64_NUM_VECTORS], vector_irq);
-
-extern struct irq_chip irq_type_ia64_lsapic;	/* CPU-internal interrupt controller */
-
-#define ia64_register_ipi	ia64_native_register_ipi
-#define assign_irq_vector	ia64_native_assign_irq_vector
-#define free_irq_vector		ia64_native_free_irq_vector
-#define ia64_resend_irq		ia64_native_resend_irq
-
-extern void ia64_native_register_ipi(void);
-extern int bind_irq_vector(int irq, int vector, cpumask_t domain);
-extern int ia64_native_assign_irq_vector (int irq);	/* allocate a free vector */
-extern void ia64_native_free_irq_vector (int vector);
-extern int reserve_irq_vector (int vector);
-extern void __setup_vector_irq(int cpu);
-extern void ia64_send_ipi (int cpu, int vector, int delivery_mode, int redirect);
-extern void destroy_and_reserve_irq (unsigned int irq);
-
-#ifdef CONFIG_SMP
-extern int irq_prepare_move(int irq, int cpu);
-extern void irq_complete_move(unsigned int irq);
-#else
-static inline int irq_prepare_move(int irq, int cpu) { return 0; }
-static inline void irq_complete_move(unsigned int irq) {}
-#endif
-
-static inline void ia64_native_resend_irq(unsigned int vector)
-{
-	ia64_send_ipi(smp_processor_id(), vector, IA64_IPI_DM_INT, 0);
-}
-
-/*
- * Next follows the irq descriptor interface.  On IA-64, each CPU supports 256 interrupt
- * vectors.  On smaller systems, there is a one-to-one correspondence between interrupt
- * vectors and the Linux irq numbers.  However, larger systems may have multiple interrupt
- * domains meaning that the translation from vector number to irq number depends on the
- * interrupt domain that a CPU belongs to.  This API abstracts such platform-dependent
- * differences and provides a uniform means to translate between vector and irq numbers
- * and to obtain the irq descriptor for a given irq number.
- */
-
-/* Extract the IA-64 vector that corresponds to IRQ.  */
-static inline ia64_vector
-irq_to_vector (int irq)
-{
-	return irq_cfg[irq].vector;
-}
-
-/*
- * Convert the local IA-64 vector to the corresponding irq number.  This translation is
- * done in the context of the interrupt domain that the currently executing CPU belongs
- * to.
- */
-static inline unsigned int
-local_vector_to_irq (ia64_vector vec)
-{
-	return __this_cpu_read(vector_irq[vec]);
-}
-
-#endif /* _ASM_IA64_HW_IRQ_H */
diff --git a/arch/ia64/include/asm/idle.h b/arch/ia64/include/asm/idle.h
deleted file mode 100644
index 97c55b97e0ba..000000000000
--- a/arch/ia64/include/asm/idle.h
+++ /dev/null
@@ -1,8 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _ASM_IA64_IDLE_H
-#define _ASM_IA64_IDLE_H
-
-static inline void enter_idle(void) { }
-static inline void exit_idle(void) { }
-
-#endif /* _ASM_IA64_IDLE_H */
diff --git a/arch/ia64/include/asm/intrinsics.h b/arch/ia64/include/asm/intrinsics.h
deleted file mode 100644
index 035b17fe12ef..000000000000
--- a/arch/ia64/include/asm/intrinsics.h
+++ /dev/null
@@ -1,13 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-/*
- * Compiler-dependent intrinsics.
- *
- * Copyright (C) 2002-2003 Hewlett-Packard Co
- *	David Mosberger-Tang <davidm@hpl.hp.com>
- */
-#ifndef _ASM_IA64_INTRINSICS_H
-#define _ASM_IA64_INTRINSICS_H
-
-#include <uapi/asm/intrinsics.h>
-
-#endif /* _ASM_IA64_INTRINSICS_H */
diff --git a/arch/ia64/include/asm/io.h b/arch/ia64/include/asm/io.h
deleted file mode 100644
index eedc0afa8cad..000000000000
--- a/arch/ia64/include/asm/io.h
+++ /dev/null
@@ -1,271 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _ASM_IA64_IO_H
-#define _ASM_IA64_IO_H
-
-/*
- * This file contains the definitions for the emulated IO instructions
- * inb/inw/inl/outb/outw/outl and the "string versions" of the same
- * (insb/insw/insl/outsb/outsw/outsl). You can also use "pausing"
- * versions of the single-IO instructions (inb_p/inw_p/..).
- *
- * This file is not meant to be obfuscating: it's just complicated to
- * (a) handle it all in a way that makes gcc able to optimize it as
- * well as possible and (b) trying to avoid writing the same thing
- * over and over again with slight variations and possibly making a
- * mistake somewhere.
- *
- * Copyright (C) 1998-2003 Hewlett-Packard Co
- *	David Mosberger-Tang <davidm@hpl.hp.com>
- * Copyright (C) 1999 Asit Mallick <asit.k.mallick@intel.com>
- * Copyright (C) 1999 Don Dugger <don.dugger@intel.com>
- */
-
-#include <asm/unaligned.h>
-#include <asm/early_ioremap.h>
-
-#define __IA64_UNCACHED_OFFSET	RGN_BASE(RGN_UNCACHED)
-
-/*
- * The legacy I/O space defined by the ia64 architecture supports only 65536 ports, but
- * large machines may have multiple other I/O spaces so we can't place any a priori limit
- * on IO_SPACE_LIMIT.  These additional spaces are described in ACPI.
- */
-#define IO_SPACE_LIMIT		0xffffffffffffffffUL
-
-#define MAX_IO_SPACES_BITS		8
-#define MAX_IO_SPACES			(1UL << MAX_IO_SPACES_BITS)
-#define IO_SPACE_BITS			24
-#define IO_SPACE_SIZE			(1UL << IO_SPACE_BITS)
-
-#define IO_SPACE_NR(port)		((port) >> IO_SPACE_BITS)
-#define IO_SPACE_BASE(space)		((space) << IO_SPACE_BITS)
-#define IO_SPACE_PORT(port)		((port) & (IO_SPACE_SIZE - 1))
-
-#define IO_SPACE_SPARSE_ENCODING(p)	((((p) >> 2) << 12) | ((p) & 0xfff))
-
-struct io_space {
-	unsigned long mmio_base;	/* base in MMIO space */
-	int sparse;
-};
-
-extern struct io_space io_space[];
-extern unsigned int num_io_spaces;
-
-# ifdef __KERNEL__
-
-/*
- * All MMIO iomem cookies are in region 6; anything less is a PIO cookie:
- *	0xCxxxxxxxxxxxxxxx	MMIO cookie (return from ioremap)
- *	0x000000001SPPPPPP	PIO cookie (S=space number, P..P=port)
- *
- * ioread/writeX() uses the leading 1 in PIO cookies (PIO_OFFSET) to catch
- * code that uses bare port numbers without the prerequisite pci_iomap().
- */
-#define PIO_OFFSET		(1UL << (MAX_IO_SPACES_BITS + IO_SPACE_BITS))
-#define PIO_MASK		(PIO_OFFSET - 1)
-#define PIO_RESERVED		__IA64_UNCACHED_OFFSET
-#define HAVE_ARCH_PIO_SIZE
-
-#include <asm/intrinsics.h>
-#include <asm/page.h>
-#include <asm-generic/iomap.h>
-
-/*
- * Change virtual addresses to physical addresses and vv.
- */
-static inline unsigned long
-virt_to_phys (volatile void *address)
-{
-	return (unsigned long) address - PAGE_OFFSET;
-}
-#define virt_to_phys virt_to_phys
-
-static inline void*
-phys_to_virt (unsigned long address)
-{
-	return (void *) (address + PAGE_OFFSET);
-}
-#define phys_to_virt phys_to_virt
-
-#define ARCH_HAS_VALID_PHYS_ADDR_RANGE
-extern u64 kern_mem_attribute (unsigned long phys_addr, unsigned long size);
-extern int valid_phys_addr_range (phys_addr_t addr, size_t count); /* efi.c */
-extern int valid_mmap_phys_addr_range (unsigned long pfn, size_t count);
-
-# endif /* KERNEL */
-
-/*
- * Memory fence w/accept.  This should never be used in code that is
- * not IA-64 specific.
- */
-#define __ia64_mf_a()	ia64_mfa()
-
-static inline void*
-__ia64_mk_io_addr (unsigned long port)
-{
-	struct io_space *space;
-	unsigned long offset;
-
-	space = &io_space[IO_SPACE_NR(port)];
-	port = IO_SPACE_PORT(port);
-	if (space->sparse)
-		offset = IO_SPACE_SPARSE_ENCODING(port);
-	else
-		offset = port;
-
-	return (void *) (space->mmio_base | offset);
-}
-
-/*
- * For the in/out routines, we need to do "mf.a" _after_ doing the I/O access to ensure
- * that the access has completed before executing other I/O accesses.  Since we're doing
- * the accesses through an uncachable (UC) translation, the CPU will execute them in
- * program order.  However, we still need to tell the compiler not to shuffle them around
- * during optimization, which is why we use "volatile" pointers.
- */
-
-#define inb inb
-static inline unsigned int inb(unsigned long port)
-{
-	volatile unsigned char *addr = __ia64_mk_io_addr(port);
-	unsigned char ret;
-
-	ret = *addr;
-	__ia64_mf_a();
-	return ret;
-}
-
-#define inw inw
-static inline unsigned int inw(unsigned long port)
-{
-	volatile unsigned short *addr = __ia64_mk_io_addr(port);
-	unsigned short ret;
-
-	ret = *addr;
-	__ia64_mf_a();
-	return ret;
-}
-
-#define inl inl
-static inline unsigned int inl(unsigned long port)
-{
-	volatile unsigned int *addr = __ia64_mk_io_addr(port);
-	unsigned int ret;
-
-	ret = *addr;
-	__ia64_mf_a();
-	return ret;
-}
-
-#define outb outb
-static inline void outb(unsigned char val, unsigned long port)
-{
-	volatile unsigned char *addr = __ia64_mk_io_addr(port);
-
-	*addr = val;
-	__ia64_mf_a();
-}
-
-#define outw outw
-static inline void outw(unsigned short val, unsigned long port)
-{
-	volatile unsigned short *addr = __ia64_mk_io_addr(port);
-
-	*addr = val;
-	__ia64_mf_a();
-}
-
-#define outl outl
-static inline void outl(unsigned int val, unsigned long port)
-{
-	volatile unsigned int *addr = __ia64_mk_io_addr(port);
-
-	*addr = val;
-	__ia64_mf_a();
-}
-
-#define insb insb
-static inline void insb(unsigned long port, void *dst, unsigned long count)
-{
-	unsigned char *dp = dst;
-
-	while (count--)
-		*dp++ = inb(port);
-}
-
-#define insw insw
-static inline void insw(unsigned long port, void *dst, unsigned long count)
-{
-	unsigned short *dp = dst;
-
-	while (count--)
-		put_unaligned(inw(port), dp++);
-}
-
-#define insl insl
-static inline void insl(unsigned long port, void *dst, unsigned long count)
-{
-	unsigned int *dp = dst;
-
-	while (count--)
-		put_unaligned(inl(port), dp++);
-}
-
-#define outsb outsb
-static inline void outsb(unsigned long port, const void *src,
-		unsigned long count)
-{
-	const unsigned char *sp = src;
-
-	while (count--)
-		outb(*sp++, port);
-}
-
-#define outsw outsw
-static inline void outsw(unsigned long port, const void *src,
-		unsigned long count)
-{
-	const unsigned short *sp = src;
-
-	while (count--)
-		outw(get_unaligned(sp++), port);
-}
-
-#define outsl outsl
-static inline void outsl(unsigned long port, const void *src,
-		unsigned long count)
-{
-	const unsigned int *sp = src;
-
-	while (count--)
-		outl(get_unaligned(sp++), port);
-}
-
-# ifdef __KERNEL__
-
-#define _PAGE_IOREMAP pgprot_val(PAGE_KERNEL)
-
-extern void __iomem * ioremap_uc(unsigned long offset, unsigned long size);
-
-#define ioremap_prot ioremap_prot
-#define ioremap_cache ioremap
-#define ioremap_uc ioremap_uc
-#define iounmap iounmap
-
-/*
- * String version of IO memory access ops:
- */
-extern void memcpy_fromio(void *dst, const volatile void __iomem *src, long n);
-extern void memcpy_toio(volatile void __iomem *dst, const void *src, long n);
-extern void memset_io(volatile void __iomem *s, int c, long n);
-
-#define memcpy_fromio memcpy_fromio
-#define memcpy_toio memcpy_toio
-#define memset_io memset_io
-#define xlate_dev_mem_ptr xlate_dev_mem_ptr
-#include <asm-generic/io.h>
-#undef PCI_IOBASE
-
-# endif /* __KERNEL__ */
-
-#endif /* _ASM_IA64_IO_H */
diff --git a/arch/ia64/include/asm/iommu.h b/arch/ia64/include/asm/iommu.h
deleted file mode 100644
index eb0db20c9d4c..000000000000
--- a/arch/ia64/include/asm/iommu.h
+++ /dev/null
@@ -1,22 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _ASM_IA64_IOMMU_H
-#define _ASM_IA64_IOMMU_H 1
-
-#include <linux/acpi.h>
-
-/* 10 seconds */
-#define DMAR_OPERATION_TIMEOUT (((cycles_t) local_cpu_data->itc_freq)*10)
-
-extern void no_iommu_init(void);
-#ifdef	CONFIG_INTEL_IOMMU
-extern int force_iommu, no_iommu;
-extern int iommu_detected;
-
-static inline int __init
-arch_rmrr_sanity_check(struct acpi_dmar_reserved_memory *rmrr) { return 0; }
-#else
-#define no_iommu		(1)
-#define iommu_detected		(0)
-#endif
-
-#endif
diff --git a/arch/ia64/include/asm/iosapic.h b/arch/ia64/include/asm/iosapic.h
deleted file mode 100644
index a91aeb413e17..000000000000
--- a/arch/ia64/include/asm/iosapic.h
+++ /dev/null
@@ -1,106 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef __ASM_IA64_IOSAPIC_H
-#define __ASM_IA64_IOSAPIC_H
-
-#define	IOSAPIC_REG_SELECT	0x0
-#define	IOSAPIC_WINDOW		0x10
-#define	IOSAPIC_EOI		0x40
-
-#define	IOSAPIC_VERSION		0x1
-
-/*
- * Redirection table entry
- */
-#define	IOSAPIC_RTE_LOW(i)	(0x10+i*2)
-#define	IOSAPIC_RTE_HIGH(i)	(0x11+i*2)
-
-#define	IOSAPIC_DEST_SHIFT		16
-
-/*
- * Delivery mode
- */
-#define	IOSAPIC_DELIVERY_SHIFT		8
-#define	IOSAPIC_FIXED			0x0
-#define	IOSAPIC_LOWEST_PRIORITY	0x1
-#define	IOSAPIC_PMI			0x2
-#define	IOSAPIC_NMI			0x4
-#define	IOSAPIC_INIT			0x5
-#define	IOSAPIC_EXTINT			0x7
-
-/*
- * Interrupt polarity
- */
-#define	IOSAPIC_POLARITY_SHIFT		13
-#define	IOSAPIC_POL_HIGH		0
-#define	IOSAPIC_POL_LOW		1
-
-/*
- * Trigger mode
- */
-#define	IOSAPIC_TRIGGER_SHIFT		15
-#define	IOSAPIC_EDGE			0
-#define	IOSAPIC_LEVEL			1
-
-/*
- * Mask bit
- */
-
-#define	IOSAPIC_MASK_SHIFT		16
-#define	IOSAPIC_MASK			(1<<IOSAPIC_MASK_SHIFT)
-
-#define IOSAPIC_VECTOR_MASK		0xffffff00
-
-#ifndef __ASSEMBLY__
-
-#define NR_IOSAPICS			256
-
-#define iosapic_pcat_compat_init	ia64_native_iosapic_pcat_compat_init
-#define __iosapic_read			__ia64_native_iosapic_read
-#define __iosapic_write			__ia64_native_iosapic_write
-#define iosapic_get_irq_chip		ia64_native_iosapic_get_irq_chip
-
-extern void __init ia64_native_iosapic_pcat_compat_init(void);
-extern struct irq_chip *ia64_native_iosapic_get_irq_chip(unsigned long trigger);
-
-static inline unsigned int
-__ia64_native_iosapic_read(char __iomem *iosapic, unsigned int reg)
-{
-	writel(reg, iosapic + IOSAPIC_REG_SELECT);
-	return readl(iosapic + IOSAPIC_WINDOW);
-}
-
-static inline void
-__ia64_native_iosapic_write(char __iomem *iosapic, unsigned int reg, u32 val)
-{
-	writel(reg, iosapic + IOSAPIC_REG_SELECT);
-	writel(val, iosapic + IOSAPIC_WINDOW);
-}
-
-static inline void iosapic_eoi(char __iomem *iosapic, u32 vector)
-{
-	writel(vector, iosapic + IOSAPIC_EOI);
-}
-
-extern void __init iosapic_system_init (int pcat_compat);
-extern int iosapic_init (unsigned long address, unsigned int gsi_base);
-extern int iosapic_remove (unsigned int gsi_base);
-extern int gsi_to_irq (unsigned int gsi);
-extern int iosapic_register_intr (unsigned int gsi, unsigned long polarity,
-				  unsigned long trigger);
-extern void iosapic_unregister_intr (unsigned int irq);
-extern void iosapic_override_isa_irq (unsigned int isa_irq, unsigned int gsi,
-				      unsigned long polarity,
-				      unsigned long trigger);
-extern int __init iosapic_register_platform_intr (u32 int_type,
-					   unsigned int gsi,
-					   int pmi_vector,
-					   u16 eid, u16 id,
-					   unsigned long polarity,
-					   unsigned long trigger);
-
-#ifdef CONFIG_NUMA
-extern void map_iosapic_to_node (unsigned int, int);
-#endif
-
-# endif /* !__ASSEMBLY__ */
-#endif /* __ASM_IA64_IOSAPIC_H */
diff --git a/arch/ia64/include/asm/irq.h b/arch/ia64/include/asm/irq.h
deleted file mode 100644
index 0eccf33dfe8b..000000000000
--- a/arch/ia64/include/asm/irq.h
+++ /dev/null
@@ -1,37 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _ASM_IA64_IRQ_H
-#define _ASM_IA64_IRQ_H
-
-/*
- * Copyright (C) 1999-2000, 2002 Hewlett-Packard Co
- *	David Mosberger-Tang <davidm@hpl.hp.com>
- *	Stephane Eranian <eranian@hpl.hp.com>
- *
- * 11/24/98	S.Eranian 	updated TIMER_IRQ and irq_canonicalize
- * 01/20/99	S.Eranian	added keyboard interrupt
- * 02/29/00     D.Mosberger	moved most things into hw_irq.h
- */
-
-#include <linux/types.h>
-#include <linux/cpumask.h>
-#include <asm/native/irq.h>
-
-#define NR_IRQS		IA64_NATIVE_NR_IRQS
-
-static __inline__ int
-irq_canonicalize (int irq)
-{
-	/*
-	 * We do the legacy thing here of pretending that irqs < 16
-	 * are 8259 irqs.  This really shouldn't be necessary at all,
-	 * but we keep it here as serial.c still uses it...
-	 */
-	return ((irq == 2) ? 9 : irq);
-}
-
-extern void set_irq_affinity_info (unsigned int irq, int dest, int redir);
-
-int create_irq(void);
-void destroy_irq(unsigned int irq);
-
-#endif /* _ASM_IA64_IRQ_H */
diff --git a/arch/ia64/include/asm/irq_regs.h b/arch/ia64/include/asm/irq_regs.h
deleted file mode 100644
index 3dd9c0b70270..000000000000
--- a/arch/ia64/include/asm/irq_regs.h
+++ /dev/null
@@ -1 +0,0 @@
-#include <asm-generic/irq_regs.h>
diff --git a/arch/ia64/include/asm/irq_remapping.h b/arch/ia64/include/asm/irq_remapping.h
deleted file mode 100644
index 547a6e87018c..000000000000
--- a/arch/ia64/include/asm/irq_remapping.h
+++ /dev/null
@@ -1,5 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef __IA64_INTR_REMAPPING_H
-#define __IA64_INTR_REMAPPING_H
-#define irq_remapping_enabled 0
-#endif
diff --git a/arch/ia64/include/asm/irqflags.h b/arch/ia64/include/asm/irqflags.h
deleted file mode 100644
index 1dc30f12e545..000000000000
--- a/arch/ia64/include/asm/irqflags.h
+++ /dev/null
@@ -1,95 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-/*
- * IRQ flags defines.
- *
- * Copyright (C) 1998-2003 Hewlett-Packard Co
- *	David Mosberger-Tang <davidm@hpl.hp.com>
- * Copyright (C) 1999 Asit Mallick <asit.k.mallick@intel.com>
- * Copyright (C) 1999 Don Dugger <don.dugger@intel.com>
- */
-
-#ifndef _ASM_IA64_IRQFLAGS_H
-#define _ASM_IA64_IRQFLAGS_H
-
-#include <asm/pal.h>
-#include <asm/kregs.h>
-
-#ifdef CONFIG_IA64_DEBUG_IRQ
-extern unsigned long last_cli_ip;
-static inline void arch_maybe_save_ip(unsigned long flags)
-{
-	if (flags & IA64_PSR_I)
-		last_cli_ip = ia64_getreg(_IA64_REG_IP);
-}
-#else
-#define arch_maybe_save_ip(flags) do {} while (0)
-#endif
-
-/*
- * - clearing psr.i is implicitly serialized (visible by next insn)
- * - setting psr.i requires data serialization
- * - we need a stop-bit before reading PSR because we sometimes
- *   write a floating-point register right before reading the PSR
- *   and that writes to PSR.mfl
- */
-
-static inline unsigned long arch_local_save_flags(void)
-{
-	ia64_stop();
-	return ia64_getreg(_IA64_REG_PSR);
-}
-
-static inline unsigned long arch_local_irq_save(void)
-{
-	unsigned long flags = arch_local_save_flags();
-
-	ia64_stop();
-	ia64_rsm(IA64_PSR_I);
-	arch_maybe_save_ip(flags);
-	return flags;
-}
-
-static inline void arch_local_irq_disable(void)
-{
-#ifdef CONFIG_IA64_DEBUG_IRQ
-	arch_local_irq_save();
-#else
-	ia64_stop();
-	ia64_rsm(IA64_PSR_I);
-#endif
-}
-
-static inline void arch_local_irq_enable(void)
-{
-	ia64_stop();
-	ia64_ssm(IA64_PSR_I);
-	ia64_srlz_d();
-}
-
-static inline void arch_local_irq_restore(unsigned long flags)
-{
-#ifdef CONFIG_IA64_DEBUG_IRQ
-	unsigned long old_psr = arch_local_save_flags();
-#endif
-	ia64_intrin_local_irq_restore(flags & IA64_PSR_I);
-	arch_maybe_save_ip(old_psr & ~flags);
-}
-
-static inline bool arch_irqs_disabled_flags(unsigned long flags)
-{
-	return (flags & IA64_PSR_I) == 0;
-}
-
-static inline bool arch_irqs_disabled(void)
-{
-	return arch_irqs_disabled_flags(arch_local_save_flags());
-}
-
-static inline void arch_safe_halt(void)
-{
-	arch_local_irq_enable();
-	ia64_pal_halt_light();	/* PAL_HALT_LIGHT */
-}
-
-
-#endif /* _ASM_IA64_IRQFLAGS_H */
diff --git a/arch/ia64/include/asm/kdebug.h b/arch/ia64/include/asm/kdebug.h
deleted file mode 100644
index 4f7e6dc974bc..000000000000
--- a/arch/ia64/include/asm/kdebug.h
+++ /dev/null
@@ -1,45 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-or-later */
-#ifndef _IA64_KDEBUG_H
-#define _IA64_KDEBUG_H 1
-/*
- *
- * Copyright (C) Intel Corporation, 2005
- *
- * 2005-Apr     Rusty Lynch <rusty.lynch@intel.com> and Anil S Keshavamurthy
- *              <anil.s.keshavamurthy@intel.com> adopted from
- *              include/asm-x86_64/kdebug.h
- *
- * 2005-Oct	Keith Owens <kaos@sgi.com>.  Expand notify_die to cover more
- *		events.
- */
-
-enum die_val {
-	DIE_BREAK = 1,
-	DIE_FAULT,
-	DIE_OOPS,
-	DIE_MACHINE_HALT,
-	DIE_MACHINE_RESTART,
-	DIE_MCA_MONARCH_ENTER,
-	DIE_MCA_MONARCH_PROCESS,
-	DIE_MCA_MONARCH_LEAVE,
-	DIE_MCA_SLAVE_ENTER,
-	DIE_MCA_SLAVE_PROCESS,
-	DIE_MCA_SLAVE_LEAVE,
-	DIE_MCA_RENDZVOUS_ENTER,
-	DIE_MCA_RENDZVOUS_PROCESS,
-	DIE_MCA_RENDZVOUS_LEAVE,
-	DIE_MCA_NEW_TIMEOUT,
-	DIE_INIT_ENTER,
-	DIE_INIT_MONARCH_ENTER,
-	DIE_INIT_MONARCH_PROCESS,
-	DIE_INIT_MONARCH_LEAVE,
-	DIE_INIT_SLAVE_ENTER,
-	DIE_INIT_SLAVE_PROCESS,
-	DIE_INIT_SLAVE_LEAVE,
-	DIE_KDEBUG_ENTER,
-	DIE_KDEBUG_LEAVE,
-	DIE_KDUMP_ENTER,
-	DIE_KDUMP_LEAVE,
-};
-
-#endif
diff --git a/arch/ia64/include/asm/kexec.h b/arch/ia64/include/asm/kexec.h
deleted file mode 100644
index 294b1e1ebd2d..000000000000
--- a/arch/ia64/include/asm/kexec.h
+++ /dev/null
@@ -1,46 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _ASM_IA64_KEXEC_H
-#define _ASM_IA64_KEXEC_H
-
-#include <asm/setup.h>
-
-/* Maximum physical address we can use pages from */
-#define KEXEC_SOURCE_MEMORY_LIMIT (-1UL)
-/* Maximum address we can reach in physical address mode */
-#define KEXEC_DESTINATION_MEMORY_LIMIT (-1UL)
-/* Maximum address we can use for the control code buffer */
-#define KEXEC_CONTROL_MEMORY_LIMIT TASK_SIZE
-
-#define KEXEC_CONTROL_PAGE_SIZE (8192 + 8192 + 4096)
-
-/* The native architecture */
-#define KEXEC_ARCH KEXEC_ARCH_IA_64
-
-#define kexec_flush_icache_page(page) do { \
-                unsigned long page_addr = (unsigned long)page_address(page); \
-                flush_icache_range(page_addr, page_addr + PAGE_SIZE); \
-        } while(0)
-
-extern struct kimage *ia64_kimage;
-extern const unsigned int relocate_new_kernel_size;
-extern void relocate_new_kernel(unsigned long, unsigned long,
-		struct ia64_boot_param *, unsigned long);
-static inline void
-crash_setup_regs(struct pt_regs *newregs, struct pt_regs *oldregs)
-{
-}
-extern struct resource efi_memmap_res;
-extern struct resource boot_param_res;
-extern void kdump_smp_send_stop(void);
-extern void kdump_smp_send_init(void);
-extern void kexec_disable_iosapic(void);
-extern void crash_save_this_cpu(void);
-struct rsvd_region;
-extern unsigned long kdump_find_rsvd_region(unsigned long size,
-		struct rsvd_region *rsvd_regions, int n);
-extern void kdump_cpu_freeze(struct unw_frame_info *info, void *arg);
-extern int kdump_status[];
-extern atomic_t kdump_cpu_freezed;
-extern atomic_t kdump_in_progress;
-
-#endif /* _ASM_IA64_KEXEC_H */
diff --git a/arch/ia64/include/asm/kprobes.h b/arch/ia64/include/asm/kprobes.h
deleted file mode 100644
index 9e956768946c..000000000000
--- a/arch/ia64/include/asm/kprobes.h
+++ /dev/null
@@ -1,116 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-or-later */
-#ifndef _ASM_KPROBES_H
-#define _ASM_KPROBES_H
-/*
- *  Kernel Probes (KProbes)
- *
- * Copyright (C) IBM Corporation, 2002, 2004
- * Copyright (C) Intel Corporation, 2005
- *
- * 2005-Apr     Rusty Lynch <rusty.lynch@intel.com> and Anil S Keshavamurthy
- *              <anil.s.keshavamurthy@intel.com> adapted from i386
- */
-#include <asm-generic/kprobes.h>
-#include <asm/break.h>
-
-#define BREAK_INST	(long)(__IA64_BREAK_KPROBE << 6)
-
-#ifdef CONFIG_KPROBES
-
-#include <linux/types.h>
-#include <linux/ptrace.h>
-#include <linux/percpu.h>
-
-#define __ARCH_WANT_KPROBES_INSN_SLOT
-#define MAX_INSN_SIZE   2	/* last half is for kprobe-booster */
-#define NOP_M_INST	(long)(1<<27)
-#define BRL_INST(i1, i2) ((long)((0xcL << 37) |	/* brl */ \
-				(0x1L << 12) |	/* many */ \
-				(((i1) & 1) << 36) | ((i2) << 13))) /* imm */
-
-typedef union cmp_inst {
-	struct {
-	unsigned long long qp : 6;
-	unsigned long long p1 : 6;
-	unsigned long long c  : 1;
-	unsigned long long r2 : 7;
-	unsigned long long r3 : 7;
-	unsigned long long p2 : 6;
-	unsigned long long ta : 1;
-	unsigned long long x2 : 2;
-	unsigned long long tb : 1;
-	unsigned long long opcode : 4;
-	unsigned long long reserved : 23;
-	}f;
-	unsigned long long l;
-} cmp_inst_t;
-
-struct kprobe;
-
-typedef struct _bundle {
-	struct {
-		unsigned long long template : 5;
-		unsigned long long slot0 : 41;
-		unsigned long long slot1_p0 : 64-46;
-	} quad0;
-	struct {
-		unsigned long long slot1_p1 : 41 - (64-46);
-		unsigned long long slot2 : 41;
-	} quad1;
-} __attribute__((__aligned__(16)))  bundle_t;
-
-struct prev_kprobe {
-	struct kprobe *kp;
-	unsigned long status;
-};
-
-#define	MAX_PARAM_RSE_SIZE	(0x60+0x60/0x3f)
-/* per-cpu kprobe control block */
-#define ARCH_PREV_KPROBE_SZ 2
-struct kprobe_ctlblk {
-	unsigned long kprobe_status;
-	unsigned long *bsp;
-	unsigned long cfm;
-	atomic_t prev_kprobe_index;
-	struct prev_kprobe prev_kprobe[ARCH_PREV_KPROBE_SZ];
-};
-
-#define kretprobe_blacklist_size 0
-
-#define SLOT0_OPCODE_SHIFT	(37)
-#define SLOT1_p1_OPCODE_SHIFT	(37 - (64-46))
-#define SLOT2_OPCODE_SHIFT 	(37)
-
-#define INDIRECT_CALL_OPCODE		(1)
-#define IP_RELATIVE_CALL_OPCODE		(5)
-#define IP_RELATIVE_BRANCH_OPCODE	(4)
-#define IP_RELATIVE_PREDICT_OPCODE	(7)
-#define LONG_BRANCH_OPCODE		(0xC)
-#define LONG_CALL_OPCODE		(0xD)
-#define flush_insn_slot(p)		do { } while (0)
-
-typedef struct kprobe_opcode {
-	bundle_t bundle;
-} kprobe_opcode_t;
-
-/* Architecture specific copy of original instruction*/
-struct arch_specific_insn {
-	/* copy of the instruction to be emulated */
-	kprobe_opcode_t *insn;
- #define INST_FLAG_FIX_RELATIVE_IP_ADDR		1
- #define INST_FLAG_FIX_BRANCH_REG		2
- #define INST_FLAG_BREAK_INST			4
- #define INST_FLAG_BOOSTABLE			8
- 	unsigned long inst_flag;
- 	unsigned short target_br_reg;
-	unsigned short slot;
-};
-
-extern int kprobe_fault_handler(struct pt_regs *regs, int trapnr);
-extern int kprobe_exceptions_notify(struct notifier_block *self,
-				    unsigned long val, void *data);
-
-extern void arch_remove_kprobe(struct kprobe *p);
-
-#endif /* CONFIG_KPROBES */
-#endif /* _ASM_KPROBES_H */
diff --git a/arch/ia64/include/asm/kregs.h b/arch/ia64/include/asm/kregs.h
deleted file mode 100644
index 44113b75e4eb..000000000000
--- a/arch/ia64/include/asm/kregs.h
+++ /dev/null
@@ -1,166 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _ASM_IA64_KREGS_H
-#define _ASM_IA64_KREGS_H
-
-/*
- * Copyright (C) 2001-2002 Hewlett-Packard Co
- *	David Mosberger-Tang <davidm@hpl.hp.com>
- */
-/*
- * This file defines the kernel register usage convention used by Linux/ia64.
- */
-
-/*
- * Kernel registers:
- */
-#define IA64_KR_IO_BASE		0	/* ar.k0: legacy I/O base address */
-#define IA64_KR_TSSD		1	/* ar.k1: IVE uses this as the TSSD */
-#define IA64_KR_PER_CPU_DATA	3	/* ar.k3: physical per-CPU base */
-#define IA64_KR_CURRENT_STACK	4	/* ar.k4: what's mapped in IA64_TR_CURRENT_STACK */
-#define IA64_KR_FPU_OWNER	5	/* ar.k5: fpu-owner (UP only, at the moment) */
-#define IA64_KR_CURRENT		6	/* ar.k6: "current" task pointer */
-#define IA64_KR_PT_BASE		7	/* ar.k7: page table base address (physical) */
-
-#define _IA64_KR_PASTE(x,y)	x##y
-#define _IA64_KR_PREFIX(n)	_IA64_KR_PASTE(ar.k, n)
-#define IA64_KR(n)		_IA64_KR_PREFIX(IA64_KR_##n)
-
-/*
- * Translation registers:
- */
-#define IA64_TR_KERNEL		0	/* itr0, dtr0: maps kernel image (code & data) */
-#define IA64_TR_PALCODE		1	/* itr1: maps PALcode as required by EFI */
-#define IA64_TR_CURRENT_STACK	1	/* dtr1: maps kernel's memory- & register-stacks */
-
-#define IA64_TR_ALLOC_BASE	2 	/* itr&dtr: Base of dynamic TR resource*/
-#define IA64_TR_ALLOC_MAX	64 	/* Max number for dynamic use*/
-
-/* Processor status register bits: */
-#define IA64_PSR_BE_BIT		1
-#define IA64_PSR_UP_BIT		2
-#define IA64_PSR_AC_BIT		3
-#define IA64_PSR_MFL_BIT	4
-#define IA64_PSR_MFH_BIT	5
-#define IA64_PSR_IC_BIT		13
-#define IA64_PSR_I_BIT		14
-#define IA64_PSR_PK_BIT		15
-#define IA64_PSR_DT_BIT		17
-#define IA64_PSR_DFL_BIT	18
-#define IA64_PSR_DFH_BIT	19
-#define IA64_PSR_SP_BIT		20
-#define IA64_PSR_PP_BIT		21
-#define IA64_PSR_DI_BIT		22
-#define IA64_PSR_SI_BIT		23
-#define IA64_PSR_DB_BIT		24
-#define IA64_PSR_LP_BIT		25
-#define IA64_PSR_TB_BIT		26
-#define IA64_PSR_RT_BIT		27
-/* The following are not affected by save_flags()/restore_flags(): */
-#define IA64_PSR_CPL0_BIT	32
-#define IA64_PSR_CPL1_BIT	33
-#define IA64_PSR_IS_BIT		34
-#define IA64_PSR_MC_BIT		35
-#define IA64_PSR_IT_BIT		36
-#define IA64_PSR_ID_BIT		37
-#define IA64_PSR_DA_BIT		38
-#define IA64_PSR_DD_BIT		39
-#define IA64_PSR_SS_BIT		40
-#define IA64_PSR_RI_BIT		41
-#define IA64_PSR_ED_BIT		43
-#define IA64_PSR_BN_BIT		44
-#define IA64_PSR_IA_BIT		45
-
-/* A mask of PSR bits that we generally don't want to inherit across a clone2() or an
-   execve().  Only list flags here that need to be cleared/set for BOTH clone2() and
-   execve().  */
-#define IA64_PSR_BITS_TO_CLEAR	(IA64_PSR_MFL | IA64_PSR_MFH | IA64_PSR_DB | IA64_PSR_LP | \
-				 IA64_PSR_TB  | IA64_PSR_ID  | IA64_PSR_DA | IA64_PSR_DD | \
-				 IA64_PSR_SS  | IA64_PSR_ED  | IA64_PSR_IA)
-#define IA64_PSR_BITS_TO_SET	(IA64_PSR_DFH | IA64_PSR_SP)
-
-#define IA64_PSR_BE	(__IA64_UL(1) << IA64_PSR_BE_BIT)
-#define IA64_PSR_UP	(__IA64_UL(1) << IA64_PSR_UP_BIT)
-#define IA64_PSR_AC	(__IA64_UL(1) << IA64_PSR_AC_BIT)
-#define IA64_PSR_MFL	(__IA64_UL(1) << IA64_PSR_MFL_BIT)
-#define IA64_PSR_MFH	(__IA64_UL(1) << IA64_PSR_MFH_BIT)
-#define IA64_PSR_IC	(__IA64_UL(1) << IA64_PSR_IC_BIT)
-#define IA64_PSR_I	(__IA64_UL(1) << IA64_PSR_I_BIT)
-#define IA64_PSR_PK	(__IA64_UL(1) << IA64_PSR_PK_BIT)
-#define IA64_PSR_DT	(__IA64_UL(1) << IA64_PSR_DT_BIT)
-#define IA64_PSR_DFL	(__IA64_UL(1) << IA64_PSR_DFL_BIT)
-#define IA64_PSR_DFH	(__IA64_UL(1) << IA64_PSR_DFH_BIT)
-#define IA64_PSR_SP	(__IA64_UL(1) << IA64_PSR_SP_BIT)
-#define IA64_PSR_PP	(__IA64_UL(1) << IA64_PSR_PP_BIT)
-#define IA64_PSR_DI	(__IA64_UL(1) << IA64_PSR_DI_BIT)
-#define IA64_PSR_SI	(__IA64_UL(1) << IA64_PSR_SI_BIT)
-#define IA64_PSR_DB	(__IA64_UL(1) << IA64_PSR_DB_BIT)
-#define IA64_PSR_LP	(__IA64_UL(1) << IA64_PSR_LP_BIT)
-#define IA64_PSR_TB	(__IA64_UL(1) << IA64_PSR_TB_BIT)
-#define IA64_PSR_RT	(__IA64_UL(1) << IA64_PSR_RT_BIT)
-/* The following are not affected by save_flags()/restore_flags(): */
-#define IA64_PSR_CPL	(__IA64_UL(3) << IA64_PSR_CPL0_BIT)
-#define IA64_PSR_IS	(__IA64_UL(1) << IA64_PSR_IS_BIT)
-#define IA64_PSR_MC	(__IA64_UL(1) << IA64_PSR_MC_BIT)
-#define IA64_PSR_IT	(__IA64_UL(1) << IA64_PSR_IT_BIT)
-#define IA64_PSR_ID	(__IA64_UL(1) << IA64_PSR_ID_BIT)
-#define IA64_PSR_DA	(__IA64_UL(1) << IA64_PSR_DA_BIT)
-#define IA64_PSR_DD	(__IA64_UL(1) << IA64_PSR_DD_BIT)
-#define IA64_PSR_SS	(__IA64_UL(1) << IA64_PSR_SS_BIT)
-#define IA64_PSR_RI	(__IA64_UL(3) << IA64_PSR_RI_BIT)
-#define IA64_PSR_ED	(__IA64_UL(1) << IA64_PSR_ED_BIT)
-#define IA64_PSR_BN	(__IA64_UL(1) << IA64_PSR_BN_BIT)
-#define IA64_PSR_IA	(__IA64_UL(1) << IA64_PSR_IA_BIT)
-
-/* User mask bits: */
-#define IA64_PSR_UM	(IA64_PSR_BE | IA64_PSR_UP | IA64_PSR_AC | IA64_PSR_MFL | IA64_PSR_MFH)
-
-/* Default Control Register */
-#define IA64_DCR_PP_BIT		 0	/* privileged performance monitor default */
-#define IA64_DCR_BE_BIT		 1	/* big-endian default */
-#define IA64_DCR_LC_BIT		 2	/* ia32 lock-check enable */
-#define IA64_DCR_DM_BIT		 8	/* defer TLB miss faults */
-#define IA64_DCR_DP_BIT		 9	/* defer page-not-present faults */
-#define IA64_DCR_DK_BIT		10	/* defer key miss faults */
-#define IA64_DCR_DX_BIT		11	/* defer key permission faults */
-#define IA64_DCR_DR_BIT		12	/* defer access right faults */
-#define IA64_DCR_DA_BIT		13	/* defer access bit faults */
-#define IA64_DCR_DD_BIT		14	/* defer debug faults */
-
-#define IA64_DCR_PP	(__IA64_UL(1) << IA64_DCR_PP_BIT)
-#define IA64_DCR_BE	(__IA64_UL(1) << IA64_DCR_BE_BIT)
-#define IA64_DCR_LC	(__IA64_UL(1) << IA64_DCR_LC_BIT)
-#define IA64_DCR_DM	(__IA64_UL(1) << IA64_DCR_DM_BIT)
-#define IA64_DCR_DP	(__IA64_UL(1) << IA64_DCR_DP_BIT)
-#define IA64_DCR_DK	(__IA64_UL(1) << IA64_DCR_DK_BIT)
-#define IA64_DCR_DX	(__IA64_UL(1) << IA64_DCR_DX_BIT)
-#define IA64_DCR_DR	(__IA64_UL(1) << IA64_DCR_DR_BIT)
-#define IA64_DCR_DA	(__IA64_UL(1) << IA64_DCR_DA_BIT)
-#define IA64_DCR_DD	(__IA64_UL(1) << IA64_DCR_DD_BIT)
-
-/* Interrupt Status Register */
-#define IA64_ISR_X_BIT		32	/* execute access */
-#define IA64_ISR_W_BIT		33	/* write access */
-#define IA64_ISR_R_BIT		34	/* read access */
-#define IA64_ISR_NA_BIT		35	/* non-access */
-#define IA64_ISR_SP_BIT		36	/* speculative load exception */
-#define IA64_ISR_RS_BIT		37	/* mandatory register-stack exception */
-#define IA64_ISR_IR_BIT		38	/* invalid register frame exception */
-#define IA64_ISR_CODE_MASK	0xf
-
-#define IA64_ISR_X	(__IA64_UL(1) << IA64_ISR_X_BIT)
-#define IA64_ISR_W	(__IA64_UL(1) << IA64_ISR_W_BIT)
-#define IA64_ISR_R	(__IA64_UL(1) << IA64_ISR_R_BIT)
-#define IA64_ISR_NA	(__IA64_UL(1) << IA64_ISR_NA_BIT)
-#define IA64_ISR_SP	(__IA64_UL(1) << IA64_ISR_SP_BIT)
-#define IA64_ISR_RS	(__IA64_UL(1) << IA64_ISR_RS_BIT)
-#define IA64_ISR_IR	(__IA64_UL(1) << IA64_ISR_IR_BIT)
-
-/* ISR code field for non-access instructions */
-#define IA64_ISR_CODE_TPA	0
-#define IA64_ISR_CODE_FC	1
-#define IA64_ISR_CODE_PROBE	2
-#define IA64_ISR_CODE_TAK	3
-#define IA64_ISR_CODE_LFETCH	4
-#define IA64_ISR_CODE_PROBEF	5
-
-#endif /* _ASM_IA64_kREGS_H */
diff --git a/arch/ia64/include/asm/libata-portmap.h b/arch/ia64/include/asm/libata-portmap.h
deleted file mode 100644
index 757f84e5dc6e..000000000000
--- a/arch/ia64/include/asm/libata-portmap.h
+++ /dev/null
@@ -1,9 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef __ASM_IA64_LIBATA_PORTMAP_H
-#define __ASM_IA64_LIBATA_PORTMAP_H
-
-#define ATA_PRIMARY_IRQ(dev)	isa_irq_to_vector(14)
-
-#define ATA_SECONDARY_IRQ(dev)	isa_irq_to_vector(15)
-
-#endif
diff --git a/arch/ia64/include/asm/linkage.h b/arch/ia64/include/asm/linkage.h
deleted file mode 100644
index 5178af560925..000000000000
--- a/arch/ia64/include/asm/linkage.h
+++ /dev/null
@@ -1,19 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef __ASM_LINKAGE_H
-#define __ASM_LINKAGE_H
-
-#ifndef __ASSEMBLY__
-
-#define asmlinkage CPP_ASMLINKAGE __attribute__((syscall_linkage))
-
-#else
-
-#include <asm/asmmacro.h>
-
-#endif
-
-#define cond_syscall(x) asm(".weak\t" #x "#\n" #x "#\t=\tsys_ni_syscall#")
-#define SYSCALL_ALIAS(alias, name)					\
-	asm ( #alias "# = " #name "#\n\t.globl " #alias "#")
-
-#endif
diff --git a/arch/ia64/include/asm/local.h b/arch/ia64/include/asm/local.h
deleted file mode 100644
index c11c530f74d0..000000000000
--- a/arch/ia64/include/asm/local.h
+++ /dev/null
@@ -1 +0,0 @@
-#include <asm-generic/local.h>
diff --git a/arch/ia64/include/asm/mca.h b/arch/ia64/include/asm/mca.h
deleted file mode 100644
index 05805249296c..000000000000
--- a/arch/ia64/include/asm/mca.h
+++ /dev/null
@@ -1,185 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-/*
- * File:	mca.h
- * Purpose:	Machine check handling specific defines
- *
- * Copyright (C) 1999, 2004 Silicon Graphics, Inc.
- * Copyright (C) Vijay Chander <vijay@engr.sgi.com>
- * Copyright (C) Srinivasa Thirumalachar <sprasad@engr.sgi.com>
- * Copyright (C) Russ Anderson <rja@sgi.com>
- */
-
-#ifndef _ASM_IA64_MCA_H
-#define _ASM_IA64_MCA_H
-
-#if !defined(__ASSEMBLY__)
-
-#include <linux/percpu.h>
-#include <linux/threads.h>
-#include <linux/types.h>
-#include <asm/ptrace.h>
-
-#define IA64_MCA_RENDEZ_TIMEOUT		(20 * 1000)	/* value in milliseconds - 20 seconds */
-
-typedef struct ia64_fptr {
-	unsigned long fp;
-	unsigned long gp;
-} ia64_fptr_t;
-
-typedef union cmcv_reg_u {
-	u64	cmcv_regval;
-	struct	{
-		u64	cmcr_vector		: 8;
-		u64	cmcr_reserved1		: 4;
-		u64	cmcr_ignored1		: 1;
-		u64	cmcr_reserved2		: 3;
-		u64	cmcr_mask		: 1;
-		u64	cmcr_ignored2		: 47;
-	} cmcv_reg_s;
-
-} cmcv_reg_t;
-
-#define cmcv_mask		cmcv_reg_s.cmcr_mask
-#define cmcv_vector		cmcv_reg_s.cmcr_vector
-
-enum {
-	IA64_MCA_RENDEZ_CHECKIN_NOTDONE	=	0x0,
-	IA64_MCA_RENDEZ_CHECKIN_DONE	=	0x1,
-	IA64_MCA_RENDEZ_CHECKIN_INIT	=	0x2,
-	IA64_MCA_RENDEZ_CHECKIN_CONCURRENT_MCA	=	0x3,
-};
-
-/* Information maintained by the MC infrastructure */
-typedef struct ia64_mc_info_s {
-	u64		imi_mca_handler;
-	size_t		imi_mca_handler_size;
-	u64		imi_monarch_init_handler;
-	size_t		imi_monarch_init_handler_size;
-	u64		imi_slave_init_handler;
-	size_t		imi_slave_init_handler_size;
-	u8		imi_rendez_checkin[NR_CPUS];
-
-} ia64_mc_info_t;
-
-/* Handover state from SAL to OS and vice versa, for both MCA and INIT events.
- * Besides the handover state, it also contains some saved registers from the
- * time of the event.
- * Note: mca_asm.S depends on the precise layout of this structure.
- */
-
-struct ia64_sal_os_state {
-
-	/* SAL to OS */
-	unsigned long		os_gp;			/* GP of the os registered with the SAL, physical */
-	unsigned long		pal_proc;		/* PAL_PROC entry point, physical */
-	unsigned long		sal_proc;		/* SAL_PROC entry point, physical */
-	unsigned long		rv_rc;			/* MCA - Rendezvous state, INIT - reason code */
-	unsigned long		proc_state_param;	/* from R18 */
-	unsigned long		monarch;		/* 1 for a monarch event, 0 for a slave */
-
-	/* common */
-	unsigned long		sal_ra;			/* Return address in SAL, physical */
-	unsigned long		sal_gp;			/* GP of the SAL - physical */
-	struct pal_min_state_area *pal_min_state;	/* from R17.  physical in asm, virtual in C */
-	/* Previous values of IA64_KR(CURRENT) and IA64_KR(CURRENT_STACK).
-	 * Note: if the MCA/INIT recovery code wants to resume to a new context
-	 * then it must change these values to reflect the new kernel stack.
-	 */
-	unsigned long		prev_IA64_KR_CURRENT;	/* previous value of IA64_KR(CURRENT) */
-	unsigned long		prev_IA64_KR_CURRENT_STACK;
-	struct task_struct	*prev_task;		/* previous task, NULL if it is not useful */
-	/* Some interrupt registers are not saved in minstate, pt_regs or
-	 * switch_stack.  Because MCA/INIT can occur when interrupts are
-	 * disabled, we need to save the additional interrupt registers over
-	 * MCA/INIT and resume.
-	 */
-	unsigned long		isr;
-	unsigned long		ifa;
-	unsigned long		itir;
-	unsigned long		iipa;
-	unsigned long		iim;
-	unsigned long		iha;
-
-	/* OS to SAL */
-	unsigned long		os_status;		/* OS status to SAL, enum below */
-	unsigned long		context;		/* 0 if return to same context
-							   1 if return to new context */
-
-	/* I-resources */
-	unsigned long		iip;
-	unsigned long		ipsr;
-	unsigned long		ifs;
-};
-
-enum {
-	IA64_MCA_CORRECTED	=	0x0,	/* Error has been corrected by OS_MCA */
-	IA64_MCA_WARM_BOOT	=	-1,	/* Warm boot of the system need from SAL */
-	IA64_MCA_COLD_BOOT	=	-2,	/* Cold boot of the system need from SAL */
-	IA64_MCA_HALT		=	-3	/* System to be halted by SAL */
-};
-
-enum {
-	IA64_INIT_RESUME	=	0x0,	/* Resume after return from INIT */
-	IA64_INIT_WARM_BOOT	=	-1,	/* Warm boot of the system need from SAL */
-};
-
-enum {
-	IA64_MCA_SAME_CONTEXT	=	0x0,	/* SAL to return to same context */
-	IA64_MCA_NEW_CONTEXT	=	-1	/* SAL to return to new context */
-};
-
-/* Per-CPU MCA state that is too big for normal per-CPU variables.  */
-
-struct ia64_mca_cpu {
-	u64 mca_stack[KERNEL_STACK_SIZE/8];
-	u64 init_stack[KERNEL_STACK_SIZE/8];
-};
-
-/* Array of physical addresses of each CPU's MCA area.  */
-extern unsigned long __per_cpu_mca[NR_CPUS];
-
-extern int cpe_vector;
-extern int ia64_cpe_irq;
-extern void ia64_mca_init(void);
-extern void ia64_mca_irq_init(void);
-extern void ia64_mca_cpu_init(void *);
-extern void ia64_os_mca_dispatch(void);
-extern void ia64_os_mca_dispatch_end(void);
-extern void ia64_mca_ucmc_handler(struct pt_regs *, struct ia64_sal_os_state *);
-extern void ia64_init_handler(struct pt_regs *,
-			      struct switch_stack *,
-			      struct ia64_sal_os_state *);
-extern void ia64_os_init_on_kdump(void);
-extern void ia64_monarch_init_handler(void);
-extern void ia64_slave_init_handler(void);
-extern void ia64_mca_cmc_vector_setup(void);
-extern int  ia64_reg_MCA_extension(int (*fn)(void *, struct ia64_sal_os_state *));
-extern void ia64_unreg_MCA_extension(void);
-extern unsigned long ia64_get_rnat(unsigned long *);
-extern void ia64_set_psr_mc(void);
-extern void ia64_mca_printk(const char * fmt, ...)
-	 __attribute__ ((format (printf, 1, 2)));
-
-struct ia64_mca_notify_die {
-	struct ia64_sal_os_state *sos;
-	int *monarch_cpu;
-	int *data;
-};
-
-DECLARE_PER_CPU(u64, ia64_mca_pal_base);
-
-#else	/* __ASSEMBLY__ */
-
-#define IA64_MCA_CORRECTED	0x0	/* Error has been corrected by OS_MCA */
-#define IA64_MCA_WARM_BOOT	-1	/* Warm boot of the system need from SAL */
-#define IA64_MCA_COLD_BOOT	-2	/* Cold boot of the system need from SAL */
-#define IA64_MCA_HALT		-3	/* System to be halted by SAL */
-
-#define IA64_INIT_RESUME	0x0	/* Resume after return from INIT */
-#define IA64_INIT_WARM_BOOT	-1	/* Warm boot of the system need from SAL */
-
-#define IA64_MCA_SAME_CONTEXT	0x0	/* SAL to return to same context */
-#define IA64_MCA_NEW_CONTEXT	-1	/* SAL to return to new context */
-
-#endif /* !__ASSEMBLY__ */
-#endif /* _ASM_IA64_MCA_H */
diff --git a/arch/ia64/include/asm/mca_asm.h b/arch/ia64/include/asm/mca_asm.h
deleted file mode 100644
index e3ab1f41f1c3..000000000000
--- a/arch/ia64/include/asm/mca_asm.h
+++ /dev/null
@@ -1,245 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-/*
- * File:	mca_asm.h
- * Purpose:	Machine check handling specific defines
- *
- * Copyright (C) 1999 Silicon Graphics, Inc.
- * Copyright (C) Vijay Chander <vijay@engr.sgi.com>
- * Copyright (C) Srinivasa Thirumalachar <sprasad@engr.sgi.com>
- * Copyright (C) 2000 Hewlett-Packard Co.
- * Copyright (C) 2000 David Mosberger-Tang <davidm@hpl.hp.com>
- * Copyright (C) 2002 Intel Corp.
- * Copyright (C) 2002 Jenna Hall <jenna.s.hall@intel.com>
- * Copyright (C) 2005 Silicon Graphics, Inc
- * Copyright (C) 2005 Keith Owens <kaos@sgi.com>
- */
-#ifndef _ASM_IA64_MCA_ASM_H
-#define _ASM_IA64_MCA_ASM_H
-
-#include <asm/percpu.h>
-
-#define PSR_IC		13
-#define PSR_I		14
-#define	PSR_DT		17
-#define PSR_RT		27
-#define PSR_MC		35
-#define PSR_IT		36
-#define PSR_BN		44
-
-/*
- * This macro converts a instruction virtual address to a physical address
- * Right now for simulation purposes the virtual addresses are
- * direct mapped to physical addresses.
- *	1. Lop off bits 61 thru 63 in the virtual address
- */
-#define INST_VA_TO_PA(addr)							\
-	dep	addr	= 0, addr, 61, 3
-/*
- * This macro converts a data virtual address to a physical address
- * Right now for simulation purposes the virtual addresses are
- * direct mapped to physical addresses.
- *	1. Lop off bits 61 thru 63 in the virtual address
- */
-#define DATA_VA_TO_PA(addr)							\
-	tpa	addr	= addr
-/*
- * This macro converts a data physical address to a virtual address
- * Right now for simulation purposes the virtual addresses are
- * direct mapped to physical addresses.
- *	1. Put 0x7 in bits 61 thru 63.
- */
-#define DATA_PA_TO_VA(addr,temp)							\
-	mov	temp	= 0x7	;;							\
-	dep	addr	= temp, addr, 61, 3
-
-#define GET_THIS_PADDR(reg, var)		\
-	mov	reg = IA64_KR(PER_CPU_DATA);;	\
-        addl	reg = THIS_CPU(var), reg
-
-/*
- * This macro jumps to the instruction at the given virtual address
- * and starts execution in physical mode with all the address
- * translations turned off.
- *	1.	Save the current psr
- *	2.	Make sure that all the upper 32 bits are off
- *
- *	3.	Clear the interrupt enable and interrupt state collection bits
- *		in the psr before updating the ipsr and iip.
- *
- *	4.	Turn off the instruction, data and rse translation bits of the psr
- *		and store the new value into ipsr
- *		Also make sure that the interrupts are disabled.
- *		Ensure that we are in little endian mode.
- *		[psr.{rt, it, dt, i, be} = 0]
- *
- *	5.	Get the physical address corresponding to the virtual address
- *		of the next instruction bundle and put it in iip.
- *		(Using magic numbers 24 and 40 in the deposint instruction since
- *		 the IA64_SDK code directly maps to lower 24bits as physical address
- *		 from a virtual address).
- *
- *	6.	Do an rfi to move the values from ipsr to psr and iip to ip.
- */
-#define  PHYSICAL_MODE_ENTER(temp1, temp2, start_addr, old_psr)				\
-	mov	old_psr = psr;								\
-	;;										\
-	dep	old_psr = 0, old_psr, 32, 32;						\
-											\
-	mov	ar.rsc = 0 ;								\
-	;;										\
-	srlz.d;										\
-	mov	temp2 = ar.bspstore;							\
-	;;										\
-	DATA_VA_TO_PA(temp2);								\
-	;;										\
-	mov	temp1 = ar.rnat;							\
-	;;										\
-	mov	ar.bspstore = temp2;							\
-	;;										\
-	mov	ar.rnat = temp1;							\
-	mov	temp1 = psr;								\
-	mov	temp2 = psr;								\
-	;;										\
-											\
-	dep	temp2 = 0, temp2, PSR_IC, 2;						\
-	;;										\
-	mov	psr.l = temp2;								\
-	;;										\
-	srlz.d;										\
-	dep	temp1 = 0, temp1, 32, 32;						\
-	;;										\
-	dep	temp1 = 0, temp1, PSR_IT, 1;						\
-	;;										\
-	dep	temp1 = 0, temp1, PSR_DT, 1;						\
-	;;										\
-	dep	temp1 = 0, temp1, PSR_RT, 1;						\
-	;;										\
-	dep	temp1 = 0, temp1, PSR_I, 1;						\
-	;;										\
-	dep	temp1 = 0, temp1, PSR_IC, 1;						\
-	;;										\
-	dep	temp1 = -1, temp1, PSR_MC, 1;						\
-	;;										\
-	mov	cr.ipsr = temp1;							\
-	;;										\
-	LOAD_PHYSICAL(p0, temp2, start_addr);						\
-	;;										\
-	mov	cr.iip = temp2;								\
-	mov	cr.ifs = r0;								\
-	DATA_VA_TO_PA(sp);								\
-	DATA_VA_TO_PA(gp);								\
-	;;										\
-	srlz.i;										\
-	;;										\
-	nop	1;									\
-	nop	2;									\
-	nop	1;									\
-	nop	2;									\
-	rfi;										\
-	;;
-
-/*
- * This macro jumps to the instruction at the given virtual address
- * and starts execution in virtual mode with all the address
- * translations turned on.
- *	1.	Get the old saved psr
- *
- *	2.	Clear the interrupt state collection bit in the current psr.
- *
- *	3.	Set the instruction translation bit back in the old psr
- *		Note we have to do this since we are right now saving only the
- *		lower 32-bits of old psr.(Also the old psr has the data and
- *		rse translation bits on)
- *
- *	4.	Set ipsr to this old_psr with "it" bit set and "bn" = 1.
- *
- *	5.	Reset the current thread pointer (r13).
- *
- *	6.	Set iip to the virtual address of the next instruction bundle.
- *
- *	7.	Do an rfi to move ipsr to psr and iip to ip.
- */
-
-#define VIRTUAL_MODE_ENTER(temp1, temp2, start_addr, old_psr)	\
-	mov	temp2 = psr;					\
-	;;							\
-	mov	old_psr = temp2;				\
-	;;							\
-	dep	temp2 = 0, temp2, PSR_IC, 2;			\
-	;;							\
-	mov	psr.l = temp2;					\
-	mov	ar.rsc = 0;					\
-	;;							\
-	srlz.d;							\
-	mov	r13 = ar.k6;					\
-	mov	temp2 = ar.bspstore;				\
-	;;							\
-	DATA_PA_TO_VA(temp2,temp1);				\
-	;;							\
-	mov	temp1 = ar.rnat;				\
-	;;							\
-	mov	ar.bspstore = temp2;				\
-	;;							\
-	mov	ar.rnat = temp1;				\
-	;;							\
-	mov	temp1 = old_psr;				\
-	;;							\
-	mov	temp2 = 1;					\
-	;;							\
-	dep	temp1 = temp2, temp1, PSR_IC, 1;		\
-	;;							\
-	dep	temp1 = temp2, temp1, PSR_IT, 1;		\
-	;;							\
-	dep	temp1 = temp2, temp1, PSR_DT, 1;		\
-	;;							\
-	dep	temp1 = temp2, temp1, PSR_RT, 1;		\
-	;;							\
-	dep	temp1 = temp2, temp1, PSR_BN, 1;		\
-	;;							\
-								\
-	mov     cr.ipsr = temp1;				\
-	movl	temp2 = start_addr;				\
-	;;							\
-	mov	cr.iip = temp2;					\
-	movl	gp = __gp					\
-	;;							\
-	DATA_PA_TO_VA(sp, temp1);				\
-	srlz.i;							\
-	;;							\
-	nop	1;						\
-	nop	2;						\
-	nop	1;						\
-	rfi							\
-	;;
-
-/*
- * The MCA and INIT stacks in struct ia64_mca_cpu look like normal kernel
- * stacks, except that the SAL/OS state and a switch_stack are stored near the
- * top of the MCA/INIT stack.  To support concurrent entry to MCA or INIT, as
- * well as MCA over INIT, each event needs its own SAL/OS state.  All entries
- * are 16 byte aligned.
- *
- *      +---------------------------+
- *      |          pt_regs          |
- *      +---------------------------+
- *      |        switch_stack       |
- *      +---------------------------+
- *      |        SAL/OS state       |
- *      +---------------------------+
- *      |    16 byte scratch area   |
- *      +---------------------------+ <-------- SP at start of C MCA handler
- *      |           .....           |
- *      +---------------------------+
- *      | RBS for MCA/INIT handler  |
- *      +---------------------------+
- *      | struct task for MCA/INIT  |
- *      +---------------------------+ <-------- Bottom of MCA/INIT stack
- */
-
-#define ALIGN16(x)			((x)&~15)
-#define MCA_PT_REGS_OFFSET		ALIGN16(KERNEL_STACK_SIZE-IA64_PT_REGS_SIZE)
-#define MCA_SWITCH_STACK_OFFSET		ALIGN16(MCA_PT_REGS_OFFSET-IA64_SWITCH_STACK_SIZE)
-#define MCA_SOS_OFFSET			ALIGN16(MCA_SWITCH_STACK_OFFSET-IA64_SAL_OS_STATE_SIZE)
-#define MCA_SP_OFFSET			ALIGN16(MCA_SOS_OFFSET-16)
-
-#endif /* _ASM_IA64_MCA_ASM_H */
diff --git a/arch/ia64/include/asm/meminit.h b/arch/ia64/include/asm/meminit.h
deleted file mode 100644
index f1d5bf2ba847..000000000000
--- a/arch/ia64/include/asm/meminit.h
+++ /dev/null
@@ -1,59 +0,0 @@
-#ifndef meminit_h
-#define meminit_h
-
-/*
- * This file is subject to the terms and conditions of the GNU General Public
- * License.  See the file "COPYING" in the main directory of this archive
- * for more details.
- */
-
-
-/*
- * Entries defined so far:
- * 	- boot param structure itself
- * 	- memory map
- * 	- initrd (optional)
- * 	- command line string
- * 	- kernel code & data
- * 	- crash dumping code reserved region
- * 	- Kernel memory map built from EFI memory map
- * 	- ELF core header
- *
- * More could be added if necessary
- */
-#define IA64_MAX_RSVD_REGIONS 9
-
-struct rsvd_region {
-	u64 start;	/* virtual address of beginning of element */
-	u64 end;	/* virtual address of end of element + 1 */
-};
-
-extern struct rsvd_region rsvd_region[IA64_MAX_RSVD_REGIONS + 1];
-
-extern void find_memory (void);
-extern void reserve_memory (void);
-extern void find_initrd (void);
-extern int filter_rsvd_memory (u64 start, u64 end, void *arg);
-extern int filter_memory (u64 start, u64 end, void *arg);
-extern unsigned long efi_memmap_init(u64 *s, u64 *e);
-extern int find_max_min_low_pfn (u64, u64, void *);
-
-extern unsigned long vmcore_find_descriptor_size(unsigned long address);
-
-/*
- * For rounding an address to the next IA64_GRANULE_SIZE or order
- */
-#define GRANULEROUNDDOWN(n)	((n) & ~(IA64_GRANULE_SIZE-1))
-#define GRANULEROUNDUP(n)	(((n)+IA64_GRANULE_SIZE-1) & ~(IA64_GRANULE_SIZE-1))
-
-#ifdef CONFIG_NUMA
-  extern void call_pernode_memory (unsigned long start, unsigned long len, void *func);
-#else
-# define call_pernode_memory(start, len, func)	(*func)(start, len, 0)
-#endif
-
-#define IGNORE_PFN0	1	/* XXX fix me: ignore pfn 0 until TLB miss handler is updated... */
-
-extern int register_active_ranges(u64 start, u64 len, int nid);
-
-#endif /* meminit_h */
diff --git a/arch/ia64/include/asm/mman.h b/arch/ia64/include/asm/mman.h
deleted file mode 100644
index 15cf100add0e..000000000000
--- a/arch/ia64/include/asm/mman.h
+++ /dev/null
@@ -1,18 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-/*
- * Based on <asm-i386/mman.h>.
- *
- * Modified 1998-2000, 2002
- *	David Mosberger-Tang <davidm@hpl.hp.com>, Hewlett-Packard Co
- */
-#ifndef _ASM_IA64_MMAN_H
-#define _ASM_IA64_MMAN_H
-
-#include <uapi/asm/mman.h>
-
-#ifndef __ASSEMBLY__
-#define arch_mmap_check	ia64_mmap_check
-int ia64_mmap_check(unsigned long addr, unsigned long len,
-		unsigned long flags);
-#endif
-#endif /* _ASM_IA64_MMAN_H */
diff --git a/arch/ia64/include/asm/mmiowb.h b/arch/ia64/include/asm/mmiowb.h
deleted file mode 100644
index d67aab4ea3b4..000000000000
--- a/arch/ia64/include/asm/mmiowb.h
+++ /dev/null
@@ -1,17 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-
-#ifndef _ASM_IA64_MMIOWB_H
-#define _ASM_IA64_MMIOWB_H
-
-/**
- * mmiowb - I/O write barrier
- *
- * Ensure ordering of I/O space writes.  This will make sure that writes
- * following the barrier will arrive after all previous writes.  For most
- * ia64 platforms, this is a simple 'mf.a' instruction.
- */
-#define mmiowb()	ia64_mfa()
-
-#include <asm-generic/mmiowb.h>
-
-#endif	/* _ASM_IA64_MMIOWB_H */
diff --git a/arch/ia64/include/asm/mmu.h b/arch/ia64/include/asm/mmu.h
deleted file mode 100644
index f75f44f531c2..000000000000
--- a/arch/ia64/include/asm/mmu.h
+++ /dev/null
@@ -1,14 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef __MMU_H
-#define __MMU_H
-
-/*
- * Type for a context number.  We declare it volatile to ensure proper
- * ordering when it's accessed outside of spinlock'd critical sections
- * (e.g., as done in activate_mm() and init_new_context()).
- */
-typedef volatile unsigned long mm_context_t;
-
-typedef unsigned long nv_mm_context_t;
-
-#endif
diff --git a/arch/ia64/include/asm/mmu_context.h b/arch/ia64/include/asm/mmu_context.h
deleted file mode 100644
index 06257e355d00..000000000000
--- a/arch/ia64/include/asm/mmu_context.h
+++ /dev/null
@@ -1,194 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _ASM_IA64_MMU_CONTEXT_H
-#define _ASM_IA64_MMU_CONTEXT_H
-
-/*
- * Copyright (C) 1998-2002 Hewlett-Packard Co
- *	David Mosberger-Tang <davidm@hpl.hp.com>
- */
-
-/*
- * Routines to manage the allocation of task context numbers.  Task context
- * numbers are used to reduce or eliminate the need to perform TLB flushes
- * due to context switches.  Context numbers are implemented using ia-64
- * region ids.  Since the IA-64 TLB does not consider the region number when
- * performing a TLB lookup, we need to assign a unique region id to each
- * region in a process.  We use the least significant three bits in aregion
- * id for this purpose.
- */
-
-#define IA64_REGION_ID_KERNEL	0 /* the kernel's region id (tlb.c depends on this being 0) */
-
-#define ia64_rid(ctx,addr)	(((ctx) << 3) | (addr >> 61))
-
-# include <asm/page.h>
-# ifndef __ASSEMBLY__
-
-#include <linux/compiler.h>
-#include <linux/percpu.h>
-#include <linux/sched.h>
-#include <linux/mm_types.h>
-#include <linux/spinlock.h>
-
-#include <asm/processor.h>
-#include <asm-generic/mm_hooks.h>
-
-struct ia64_ctx {
-	spinlock_t lock;
-	unsigned int next;	/* next context number to use */
-	unsigned int limit;     /* available free range */
-	unsigned int max_ctx;   /* max. context value supported by all CPUs */
-				/* call wrap_mmu_context when next >= max */
-	unsigned long *bitmap;  /* bitmap size is max_ctx+1 */
-	unsigned long *flushmap;/* pending rid to be flushed */
-};
-
-extern struct ia64_ctx ia64_ctx;
-DECLARE_PER_CPU(u8, ia64_need_tlb_flush);
-
-extern void mmu_context_init (void);
-extern void wrap_mmu_context (struct mm_struct *mm);
-
-/*
- * When the context counter wraps around all TLBs need to be flushed because
- * an old context number might have been reused. This is signalled by the
- * ia64_need_tlb_flush per-CPU variable, which is checked in the routine
- * below. Called by activate_mm(). <efocht@ess.nec.de>
- */
-static inline void
-delayed_tlb_flush (void)
-{
-	extern void local_flush_tlb_all (void);
-	unsigned long flags;
-
-	if (unlikely(__ia64_per_cpu_var(ia64_need_tlb_flush))) {
-		spin_lock_irqsave(&ia64_ctx.lock, flags);
-		if (__ia64_per_cpu_var(ia64_need_tlb_flush)) {
-			local_flush_tlb_all();
-			__ia64_per_cpu_var(ia64_need_tlb_flush) = 0;
-		}
-		spin_unlock_irqrestore(&ia64_ctx.lock, flags);
-	}
-}
-
-static inline nv_mm_context_t
-get_mmu_context (struct mm_struct *mm)
-{
-	unsigned long flags;
-	nv_mm_context_t context = mm->context;
-
-	if (likely(context))
-		goto out;
-
-	spin_lock_irqsave(&ia64_ctx.lock, flags);
-	/* re-check, now that we've got the lock: */
-	context = mm->context;
-	if (context == 0) {
-		cpumask_clear(mm_cpumask(mm));
-		if (ia64_ctx.next >= ia64_ctx.limit) {
-			ia64_ctx.next = find_next_zero_bit(ia64_ctx.bitmap,
-					ia64_ctx.max_ctx, ia64_ctx.next);
-			ia64_ctx.limit = find_next_bit(ia64_ctx.bitmap,
-					ia64_ctx.max_ctx, ia64_ctx.next);
-			if (ia64_ctx.next >= ia64_ctx.max_ctx)
-				wrap_mmu_context(mm);
-		}
-		mm->context = context = ia64_ctx.next++;
-		__set_bit(context, ia64_ctx.bitmap);
-	}
-	spin_unlock_irqrestore(&ia64_ctx.lock, flags);
-out:
-	/*
-	 * Ensure we're not starting to use "context" before any old
-	 * uses of it are gone from our TLB.
-	 */
-	delayed_tlb_flush();
-
-	return context;
-}
-
-/*
- * Initialize context number to some sane value.  MM is guaranteed to be a
- * brand-new address-space, so no TLB flushing is needed, ever.
- */
-#define init_new_context init_new_context
-static inline int
-init_new_context (struct task_struct *p, struct mm_struct *mm)
-{
-	mm->context = 0;
-	return 0;
-}
-
-static inline void
-reload_context (nv_mm_context_t context)
-{
-	unsigned long rid;
-	unsigned long rid_incr = 0;
-	unsigned long rr0, rr1, rr2, rr3, rr4;
-
-#ifdef CONFIG_HUGETLB_PAGE
-	unsigned long old_rr4;
-	old_rr4 = ia64_get_rr(RGN_BASE(RGN_HPAGE));
-#endif
-	rid = context << 3;	/* make space for encoding the region number */
-	rid_incr = 1 << 8;
-
-	/* encode the region id, preferred page size, and VHPT enable bit: */
-	rr0 = (rid << 8) | (PAGE_SHIFT << 2) | 1;
-	rr1 = rr0 + 1*rid_incr;
-	rr2 = rr0 + 2*rid_incr;
-	rr3 = rr0 + 3*rid_incr;
-	rr4 = rr0 + 4*rid_incr;
-#ifdef  CONFIG_HUGETLB_PAGE
-	rr4 = (rr4 & (~(0xfcUL))) | (old_rr4 & 0xfc);
-
-#  if RGN_HPAGE != 4
-#    error "reload_context assumes RGN_HPAGE is 4"
-#  endif
-#endif
-
-	ia64_set_rr0_to_rr4(rr0, rr1, rr2, rr3, rr4);
-	ia64_srlz_i();			/* srlz.i implies srlz.d */
-}
-
-/*
- * Must be called with preemption off
- */
-static inline void
-activate_context (struct mm_struct *mm)
-{
-	nv_mm_context_t context;
-
-	do {
-		context = get_mmu_context(mm);
-		if (!cpumask_test_cpu(smp_processor_id(), mm_cpumask(mm)))
-			cpumask_set_cpu(smp_processor_id(), mm_cpumask(mm));
-		reload_context(context);
-		/*
-		 * in the unlikely event of a TLB-flush by another thread,
-		 * redo the load.
-		 */
-	} while (unlikely(context != mm->context));
-}
-
-/*
- * Switch from address space PREV to address space NEXT.
- */
-#define activate_mm activate_mm
-static inline void
-activate_mm (struct mm_struct *prev, struct mm_struct *next)
-{
-	/*
-	 * We may get interrupts here, but that's OK because interrupt
-	 * handlers cannot touch user-space.
-	 */
-	ia64_set_kr(IA64_KR_PT_BASE, __pa(next->pgd));
-	activate_context(next);
-}
-
-#define switch_mm(prev_mm,next_mm,next_task)	activate_mm(prev_mm, next_mm)
-
-#include <asm-generic/mmu_context.h>
-
-# endif /* ! __ASSEMBLY__ */
-#endif /* _ASM_IA64_MMU_CONTEXT_H */
diff --git a/arch/ia64/include/asm/mmzone.h b/arch/ia64/include/asm/mmzone.h
deleted file mode 100644
index 767201f66c93..000000000000
--- a/arch/ia64/include/asm/mmzone.h
+++ /dev/null
@@ -1,35 +0,0 @@
-/*
- * This file is subject to the terms and conditions of the GNU General Public
- * License.  See the file "COPYING" in the main directory of this archive
- * for more details.
- *
- * Copyright (c) 2000,2003 Silicon Graphics, Inc.  All rights reserved.
- * Copyright (c) 2002 NEC Corp.
- * Copyright (c) 2002 Erich Focht <efocht@ess.nec.de>
- * Copyright (c) 2002 Kimio Suganuma <k-suganuma@da.jp.nec.com>
- */
-#ifndef _ASM_IA64_MMZONE_H
-#define _ASM_IA64_MMZONE_H
-
-#include <linux/numa.h>
-#include <asm/page.h>
-#include <asm/meminit.h>
-
-#ifdef CONFIG_NUMA
-
-static inline int pfn_to_nid(unsigned long pfn)
-{
-	extern int paddr_to_nid(unsigned long);
-	int nid = paddr_to_nid(pfn << PAGE_SHIFT);
-	if (nid < 0)
-		return 0;
-	else
-		return nid;
-}
-
-#define MAX_PHYSNODE_ID		2048
-#endif /* CONFIG_NUMA */
-
-#define NR_NODE_MEMBLKS		(MAX_NUMNODES * 4)
-
-#endif /* _ASM_IA64_MMZONE_H */
diff --git a/arch/ia64/include/asm/module.h b/arch/ia64/include/asm/module.h
deleted file mode 100644
index 7271b9c5fc76..000000000000
--- a/arch/ia64/include/asm/module.h
+++ /dev/null
@@ -1,35 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _ASM_IA64_MODULE_H
-#define _ASM_IA64_MODULE_H
-
-#include <asm-generic/module.h>
-
-/*
- * IA-64-specific support for kernel module loader.
- *
- * Copyright (C) 2003 Hewlett-Packard Co
- *	David Mosberger-Tang <davidm@hpl.hp.com>
- */
-
-struct elf64_shdr;			/* forward declration */
-
-struct mod_arch_specific {
-	/* Used only at module load time. */
-	struct elf64_shdr *core_plt;	/* core PLT section */
-	struct elf64_shdr *init_plt;	/* init PLT section */
-	struct elf64_shdr *got;		/* global offset table */
-	struct elf64_shdr *opd;		/* official procedure descriptors */
-	struct elf64_shdr *unwind;	/* unwind-table section */
-	unsigned long gp;		/* global-pointer for module */
-	unsigned int next_got_entry;	/* index of next available got entry */
-
-	/* Used at module run and cleanup time. */
-	void *core_unw_table;		/* core unwind-table cookie returned by unwinder */
-	void *init_unw_table;		/* init unwind-table cookie returned by unwinder */
-	void *opd_addr;			/* symbolize uses .opd to get to actual function */
-	unsigned long opd_size;
-};
-
-#define ARCH_SHF_SMALL	SHF_IA_64_SHORT
-
-#endif /* _ASM_IA64_MODULE_H */
diff --git a/arch/ia64/include/asm/module.lds.h b/arch/ia64/include/asm/module.lds.h
deleted file mode 100644
index eff68f362793..000000000000
--- a/arch/ia64/include/asm/module.lds.h
+++ /dev/null
@@ -1,14 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-SECTIONS {
-	/* Group unwind sections into a single section: */
-	.IA_64.unwind_info : { *(.IA_64.unwind_info*) }
-	.IA_64.unwind : { *(.IA_64.unwind*) }
-	/*
-	 * Create place-holder sections to hold the PLTs, GOT, and
-	 * official procedure-descriptors (.opd).
-	 */
-	.core.plt : { BYTE(0) }
-	.init.plt : { BYTE(0) }
-	.got : { BYTE(0) }
-	.opd : { BYTE(0) }
-}
diff --git a/arch/ia64/include/asm/msidef.h b/arch/ia64/include/asm/msidef.h
deleted file mode 100644
index 18d0e4226748..000000000000
--- a/arch/ia64/include/asm/msidef.h
+++ /dev/null
@@ -1,43 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _IA64_MSI_DEF_H
-#define _IA64_MSI_DEF_H
-
-/*
- * Shifts for APIC-based data
- */
-
-#define     MSI_DATA_VECTOR_SHIFT	0
-#define	    MSI_DATA_VECTOR(v)		(((u8)v) << MSI_DATA_VECTOR_SHIFT)
-#define     MSI_DATA_VECTOR_MASK	0xffffff00
-
-#define     MSI_DATA_DELIVERY_MODE_SHIFT	8
-#define     MSI_DATA_DELIVERY_FIXED	(0 << MSI_DATA_DELIVERY_MODE_SHIFT)
-#define     MSI_DATA_DELIVERY_LOWPRI	(1 << MSI_DATA_DELIVERY_MODE_SHIFT)
-
-#define     MSI_DATA_LEVEL_SHIFT	14
-#define     MSI_DATA_LEVEL_DEASSERT	(0 << MSI_DATA_LEVEL_SHIFT)
-#define     MSI_DATA_LEVEL_ASSERT	(1 << MSI_DATA_LEVEL_SHIFT)
-
-#define     MSI_DATA_TRIGGER_SHIFT	15
-#define     MSI_DATA_TRIGGER_EDGE	(0 << MSI_DATA_TRIGGER_SHIFT)
-#define     MSI_DATA_TRIGGER_LEVEL	(1 << MSI_DATA_TRIGGER_SHIFT)
-
-/*
- * Shift/mask fields for APIC-based bus address
- */
-
-#define     MSI_ADDR_DEST_ID_SHIFT	4
-#define     MSI_ADDR_HEADER		0xfee00000
-
-#define     MSI_ADDR_DEST_ID_MASK	0xfff0000f
-#define     MSI_ADDR_DEST_ID_CPU(cpu)	((cpu) << MSI_ADDR_DEST_ID_SHIFT)
-
-#define     MSI_ADDR_DEST_MODE_SHIFT	2
-#define     MSI_ADDR_DEST_MODE_PHYS	(0 << MSI_ADDR_DEST_MODE_SHIFT)
-#define	    MSI_ADDR_DEST_MODE_LOGIC	(1 << MSI_ADDR_DEST_MODE_SHIFT)
-
-#define     MSI_ADDR_REDIRECTION_SHIFT	3
-#define     MSI_ADDR_REDIRECTION_CPU	(0 << MSI_ADDR_REDIRECTION_SHIFT)
-#define     MSI_ADDR_REDIRECTION_LOWPRI	(1 << MSI_ADDR_REDIRECTION_SHIFT)
-
-#endif/* _IA64_MSI_DEF_H */
diff --git a/arch/ia64/include/asm/native/inst.h b/arch/ia64/include/asm/native/inst.h
deleted file mode 100644
index e08662396029..000000000000
--- a/arch/ia64/include/asm/native/inst.h
+++ /dev/null
@@ -1,119 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-or-later */
-/******************************************************************************
- * arch/ia64/include/asm/native/inst.h
- *
- * Copyright (c) 2008 Isaku Yamahata <yamahata at valinux co jp>
- *                    VA Linux Systems Japan K.K.
- */
-
-#define DO_SAVE_MIN		IA64_NATIVE_DO_SAVE_MIN
-
-#define MOV_FROM_IFA(reg)	\
-	mov reg = cr.ifa
-
-#define MOV_FROM_ITIR(reg)	\
-	mov reg = cr.itir
-
-#define MOV_FROM_ISR(reg)	\
-	mov reg = cr.isr
-
-#define MOV_FROM_IHA(reg)	\
-	mov reg = cr.iha
-
-#define MOV_FROM_IPSR(pred, reg)	\
-(pred)	mov reg = cr.ipsr
-
-#define MOV_FROM_IIM(reg)	\
-	mov reg = cr.iim
-
-#define MOV_FROM_IIP(reg)	\
-	mov reg = cr.iip
-
-#define MOV_FROM_IVR(reg, clob)	\
-	mov reg = cr.ivr
-
-#define MOV_FROM_PSR(pred, reg, clob)	\
-(pred)	mov reg = psr
-
-#define MOV_FROM_ITC(pred, pred_clob, reg, clob)	\
-(pred)	mov reg = ar.itc
-
-#define MOV_TO_IFA(reg, clob)	\
-	mov cr.ifa = reg
-
-#define MOV_TO_ITIR(pred, reg, clob)	\
-(pred)	mov cr.itir = reg
-
-#define MOV_TO_IHA(pred, reg, clob)	\
-(pred)	mov cr.iha = reg
-
-#define MOV_TO_IPSR(pred, reg, clob)		\
-(pred)	mov cr.ipsr = reg
-
-#define MOV_TO_IFS(pred, reg, clob)	\
-(pred)	mov cr.ifs = reg
-
-#define MOV_TO_IIP(reg, clob)	\
-	mov cr.iip = reg
-
-#define MOV_TO_KR(kr, reg, clob0, clob1)	\
-	mov IA64_KR(kr) = reg
-
-#define ITC_I(pred, reg, clob)	\
-(pred)	itc.i reg
-
-#define ITC_D(pred, reg, clob)	\
-(pred)	itc.d reg
-
-#define ITC_I_AND_D(pred_i, pred_d, reg, clob)	\
-(pred_i) itc.i reg;				\
-(pred_d) itc.d reg
-
-#define THASH(pred, reg0, reg1, clob)		\
-(pred)	thash reg0 = reg1
-
-#define SSM_PSR_IC_AND_DEFAULT_BITS_AND_SRLZ_I(clob0, clob1)		\
-	ssm psr.ic | PSR_DEFAULT_BITS					\
-	;;								\
-	srlz.i /* guarantee that interruption collectin is on */	\
-	;;
-
-#define SSM_PSR_IC_AND_SRLZ_D(clob0, clob1)	\
-	ssm psr.ic				\
-	;;					\
-	srlz.d
-
-#define RSM_PSR_IC(clob)	\
-	rsm psr.ic
-
-#define SSM_PSR_I(pred, pred_clob, clob)	\
-(pred)	ssm psr.i
-
-#define RSM_PSR_I(pred, clob0, clob1)	\
-(pred)	rsm psr.i
-
-#define RSM_PSR_I_IC(clob0, clob1, clob2)	\
-	rsm psr.i | psr.ic
-
-#define RSM_PSR_DT		\
-	rsm psr.dt
-
-#define RSM_PSR_BE_I(clob0, clob1)	\
-	rsm psr.be | psr.i
-
-#define SSM_PSR_DT_AND_SRLZ_I	\
-	ssm psr.dt		\
-	;;			\
-	srlz.i
-
-#define BSW_0(clob0, clob1, clob2)	\
-	bsw.0
-
-#define BSW_1(clob0, clob1)	\
-	bsw.1
-
-#define COVER	\
-	cover
-
-#define RFI	\
-	rfi
diff --git a/arch/ia64/include/asm/native/irq.h b/arch/ia64/include/asm/native/irq.h
deleted file mode 100644
index aa74915f8aa2..000000000000
--- a/arch/ia64/include/asm/native/irq.h
+++ /dev/null
@@ -1,20 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-or-later */
-/******************************************************************************
- * arch/ia64/include/asm/native/irq.h
- *
- * Copyright (c) 2008 Isaku Yamahata <yamahata at valinux co jp>
- *                    VA Linux Systems Japan K.K.
- */
-
-#ifndef _ASM_IA64_NATIVE_IRQ_H
-#define _ASM_IA64_NATIVE_IRQ_H
-
-#define NR_VECTORS	256
-
-#if (NR_VECTORS + 32 * NR_CPUS) < 1024
-#define IA64_NATIVE_NR_IRQS (NR_VECTORS + 32 * NR_CPUS)
-#else
-#define IA64_NATIVE_NR_IRQS 1024
-#endif
-
-#endif /* _ASM_IA64_NATIVE_IRQ_H */
diff --git a/arch/ia64/include/asm/native/patchlist.h b/arch/ia64/include/asm/native/patchlist.h
deleted file mode 100644
index f13e7675758c..000000000000
--- a/arch/ia64/include/asm/native/patchlist.h
+++ /dev/null
@@ -1,24 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-or-later */
-/******************************************************************************
- * arch/ia64/include/asm/native/inst.h
- *
- * Copyright (c) 2008 Isaku Yamahata <yamahata at valinux co jp>
- *                    VA Linux Systems Japan K.K.
- */
-
-#define __paravirt_start_gate_fsyscall_patchlist		\
-	__ia64_native_start_gate_fsyscall_patchlist
-#define __paravirt_end_gate_fsyscall_patchlist			\
-	__ia64_native_end_gate_fsyscall_patchlist
-#define __paravirt_start_gate_brl_fsys_bubble_down_patchlist	\
-	__ia64_native_start_gate_brl_fsys_bubble_down_patchlist
-#define __paravirt_end_gate_brl_fsys_bubble_down_patchlist	\
-	__ia64_native_end_gate_brl_fsys_bubble_down_patchlist
-#define __paravirt_start_gate_vtop_patchlist			\
-	__ia64_native_start_gate_vtop_patchlist
-#define __paravirt_end_gate_vtop_patchlist			\
-	__ia64_native_end_gate_vtop_patchlist
-#define __paravirt_start_gate_mckinley_e9_patchlist		\
-	__ia64_native_start_gate_mckinley_e9_patchlist
-#define __paravirt_end_gate_mckinley_e9_patchlist		\
-	__ia64_native_end_gate_mckinley_e9_patchlist
diff --git a/arch/ia64/include/asm/nodedata.h b/arch/ia64/include/asm/nodedata.h
deleted file mode 100644
index 2fb337b0e9b7..000000000000
--- a/arch/ia64/include/asm/nodedata.h
+++ /dev/null
@@ -1,63 +0,0 @@
-/*
- * This file is subject to the terms and conditions of the GNU General Public
- * License.  See the file "COPYING" in the main directory of this archive
- * for more details.
- *
- * Copyright (c) 2000 Silicon Graphics, Inc.  All rights reserved.
- * Copyright (c) 2002 NEC Corp.
- * Copyright (c) 2002 Erich Focht <efocht@ess.nec.de>
- * Copyright (c) 2002 Kimio Suganuma <k-suganuma@da.jp.nec.com>
- */
-#ifndef _ASM_IA64_NODEDATA_H
-#define _ASM_IA64_NODEDATA_H
-
-#include <linux/numa.h>
-
-#include <asm/percpu.h>
-#include <asm/mmzone.h>
-
-#ifdef CONFIG_NUMA
-
-/*
- * Node Data. One of these structures is located on each node of a NUMA system.
- */
-
-struct pglist_data;
-struct ia64_node_data {
-	short			active_cpu_count;
-	short			node;
-	struct pglist_data	*pg_data_ptrs[MAX_NUMNODES];
-};
-
-
-/*
- * Return a pointer to the node_data structure for the executing cpu.
- */
-#define local_node_data		(local_cpu_data->node_data)
-
-/*
- * Given a node id, return a pointer to the pg_data_t for the node.
- *
- * NODE_DATA 	- should be used in all code not related to system
- *		  initialization. It uses pernode data structures to minimize
- *		  offnode memory references. However, these structure are not 
- *		  present during boot. This macro can be used once cpu_init
- *		  completes.
- */
-#define NODE_DATA(nid)		(local_node_data->pg_data_ptrs[nid])
-
-/*
- * LOCAL_DATA_ADDR - This is to calculate the address of other node's
- *		     "local_node_data" at hot-plug phase. The local_node_data
- *		     is pointed by per_cpu_page. Kernel usually use it for
- *		     just executing cpu. However, when new node is hot-added,
- *		     the addresses of local data for other nodes are necessary
- *		     to update all of them.
- */
-#define LOCAL_DATA_ADDR(pgdat)  			\
-	((struct ia64_node_data *)((u64)(pgdat) + 	\
-				   L1_CACHE_ALIGN(sizeof(struct pglist_data))))
-
-#endif /* CONFIG_NUMA */
-
-#endif /* _ASM_IA64_NODEDATA_H */
diff --git a/arch/ia64/include/asm/numa.h b/arch/ia64/include/asm/numa.h
deleted file mode 100644
index c5c253cb9bd6..000000000000
--- a/arch/ia64/include/asm/numa.h
+++ /dev/null
@@ -1,83 +0,0 @@
-/*
- * This file is subject to the terms and conditions of the GNU General Public
- * License.  See the file "COPYING" in the main directory of this archive
- * for more details.
- *
- * This file contains NUMA specific prototypes and definitions.
- *
- * 2002/08/05 Erich Focht <efocht@ess.nec.de>
- *
- */
-#ifndef _ASM_IA64_NUMA_H
-#define _ASM_IA64_NUMA_H
-
-
-#ifdef CONFIG_NUMA
-
-#include <linux/cache.h>
-#include <linux/cpumask.h>
-#include <linux/numa.h>
-#include <linux/smp.h>
-#include <linux/threads.h>
-
-#include <asm/mmzone.h>
-
-extern u16 cpu_to_node_map[NR_CPUS] __cacheline_aligned;
-extern cpumask_t node_to_cpu_mask[MAX_NUMNODES] __cacheline_aligned;
-extern pg_data_t *pgdat_list[MAX_NUMNODES];
-
-/* Stuff below this line could be architecture independent */
-
-extern int num_node_memblks;		/* total number of memory chunks */
-
-/*
- * List of node memory chunks. Filled when parsing SRAT table to
- * obtain information about memory nodes.
-*/
-
-struct node_memblk_s {
-	unsigned long start_paddr;
-	unsigned long size;
-	int nid;		/* which logical node contains this chunk? */
-	int bank;		/* which mem bank on this node */
-};
-
-struct node_cpuid_s {
-	u16	phys_id;	/* id << 8 | eid */
-	int	nid;		/* logical node containing this CPU */
-};
-
-extern struct node_memblk_s node_memblk[NR_NODE_MEMBLKS];
-extern struct node_cpuid_s node_cpuid[NR_CPUS];
-
-/*
- * ACPI 2.0 SLIT (System Locality Information Table)
- * http://devresource.hp.com/devresource/Docs/TechPapers/IA64/slit.pdf
- *
- * This is a matrix with "distances" between nodes, they should be
- * proportional to the memory access latency ratios.
- */
-
-extern u8 numa_slit[MAX_NUMNODES * MAX_NUMNODES];
-#define slit_distance(from,to) (numa_slit[(from) * MAX_NUMNODES + (to)])
-extern int __node_distance(int from, int to);
-#define node_distance(from,to) __node_distance(from, to)
-
-extern int paddr_to_nid(unsigned long paddr);
-
-#define local_nodeid (cpu_to_node_map[smp_processor_id()])
-
-#define numa_off     0
-
-extern void map_cpu_to_node(int cpu, int nid);
-extern void unmap_cpu_from_node(int cpu, int nid);
-extern void numa_clear_node(int cpu);
-
-#else /* !CONFIG_NUMA */
-#define map_cpu_to_node(cpu, nid)	do{}while(0)
-#define unmap_cpu_from_node(cpu, nid)	do{}while(0)
-#define paddr_to_nid(addr)	0
-#define numa_clear_node(cpu)	do { } while (0)
-#endif /* CONFIG_NUMA */
-
-#endif /* _ASM_IA64_NUMA_H */
diff --git a/arch/ia64/include/asm/page.h b/arch/ia64/include/asm/page.h
deleted file mode 100644
index 310b09c3342d..000000000000
--- a/arch/ia64/include/asm/page.h
+++ /dev/null
@@ -1,208 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _ASM_IA64_PAGE_H
-#define _ASM_IA64_PAGE_H
-/*
- * Pagetable related stuff.
- *
- * Copyright (C) 1998, 1999, 2002 Hewlett-Packard Co
- *	David Mosberger-Tang <davidm@hpl.hp.com>
- */
-
-#include <asm/intrinsics.h>
-#include <asm/types.h>
-
-/*
- * The top three bits of an IA64 address are its Region Number.
- * Different regions are assigned to different purposes.
- */
-#define RGN_SHIFT	(61)
-#define RGN_BASE(r)	(__IA64_UL_CONST(r)<<RGN_SHIFT)
-#define RGN_BITS	(RGN_BASE(-1))
-
-#define RGN_KERNEL	7	/* Identity mapped region */
-#define RGN_UNCACHED    6	/* Identity mapped I/O region */
-#define RGN_GATE	5	/* Gate page, Kernel text, etc */
-#define RGN_HPAGE	4	/* For Huge TLB pages */
-
-/*
- * PAGE_SHIFT determines the actual kernel page size.
- */
-#if defined(CONFIG_IA64_PAGE_SIZE_4KB)
-# define PAGE_SHIFT	12
-#elif defined(CONFIG_IA64_PAGE_SIZE_8KB)
-# define PAGE_SHIFT	13
-#elif defined(CONFIG_IA64_PAGE_SIZE_16KB)
-# define PAGE_SHIFT	14
-#elif defined(CONFIG_IA64_PAGE_SIZE_64KB)
-# define PAGE_SHIFT	16
-#else
-# error Unsupported page size!
-#endif
-
-#define PAGE_SIZE		(__IA64_UL_CONST(1) << PAGE_SHIFT)
-#define PAGE_MASK		(~(PAGE_SIZE - 1))
-
-#define PERCPU_PAGE_SHIFT	18	/* log2() of max. size of per-CPU area */
-#define PERCPU_PAGE_SIZE	(__IA64_UL_CONST(1) << PERCPU_PAGE_SHIFT)
-
-
-#ifdef CONFIG_HUGETLB_PAGE
-# define HPAGE_REGION_BASE	RGN_BASE(RGN_HPAGE)
-# define HPAGE_SHIFT		hpage_shift
-# define HPAGE_SHIFT_DEFAULT	28	/* check ia64 SDM for architecture supported size */
-# define HPAGE_SIZE		(__IA64_UL_CONST(1) << HPAGE_SHIFT)
-# define HPAGE_MASK		(~(HPAGE_SIZE - 1))
-
-# define HAVE_ARCH_HUGETLB_UNMAPPED_AREA
-#endif /* CONFIG_HUGETLB_PAGE */
-
-#ifdef __ASSEMBLY__
-# define __pa(x)		((x) - PAGE_OFFSET)
-# define __va(x)		((x) + PAGE_OFFSET)
-#else /* !__ASSEMBLY */
-#  define STRICT_MM_TYPECHECKS
-
-extern void clear_page (void *page);
-extern void copy_page (void *to, void *from);
-
-/*
- * clear_user_page() and copy_user_page() can't be inline functions because
- * flush_dcache_page() can't be defined until later...
- */
-#define clear_user_page(addr, vaddr, page)	\
-do {						\
-	clear_page(addr);			\
-	flush_dcache_page(page);		\
-} while (0)
-
-#define copy_user_page(to, from, vaddr, page)	\
-do {						\
-	copy_page((to), (from));		\
-	flush_dcache_page(page);		\
-} while (0)
-
-
-#define vma_alloc_zeroed_movable_folio(vma, vaddr)			\
-({									\
-	struct folio *folio = vma_alloc_folio(				\
-		GFP_HIGHUSER_MOVABLE | __GFP_ZERO, 0, vma, vaddr, false); \
-	if (folio)							\
-		flush_dcache_folio(folio);				\
-	folio;								\
-})
-
-#define virt_addr_valid(kaddr)	pfn_valid(__pa(kaddr) >> PAGE_SHIFT)
-
-#include <asm-generic/memory_model.h>
-
-#define page_to_phys(page)	(page_to_pfn(page) << PAGE_SHIFT)
-#define virt_to_page(kaddr)	pfn_to_page(__pa(kaddr) >> PAGE_SHIFT)
-#define pfn_to_kaddr(pfn)	__va((pfn) << PAGE_SHIFT)
-
-typedef union ia64_va {
-	struct {
-		unsigned long off : 61;		/* intra-region offset */
-		unsigned long reg :  3;		/* region number */
-	} f;
-	unsigned long l;
-	void *p;
-} ia64_va;
-
-/*
- * Note: These macros depend on the fact that PAGE_OFFSET has all
- * region bits set to 1 and all other bits set to zero.  They are
- * expressed in this way to ensure they result in a single "dep"
- * instruction.
- */
-#define __pa(x)		({ia64_va _v; _v.l = (long) (x); _v.f.reg = 0; _v.l;})
-#define __va(x)		({ia64_va _v; _v.l = (long) (x); _v.f.reg = -1; _v.p;})
-
-#define REGION_NUMBER(x)	({ia64_va _v; _v.l = (long) (x); _v.f.reg;})
-#define REGION_OFFSET(x)	({ia64_va _v; _v.l = (long) (x); _v.f.off;})
-
-#ifdef CONFIG_HUGETLB_PAGE
-# define htlbpage_to_page(x)	(((unsigned long) REGION_NUMBER(x) << 61)			\
-				 | (REGION_OFFSET(x) >> (HPAGE_SHIFT-PAGE_SHIFT)))
-# define HUGETLB_PAGE_ORDER	(HPAGE_SHIFT - PAGE_SHIFT)
-extern unsigned int hpage_shift;
-#endif
-
-static __inline__ int
-get_order (unsigned long size)
-{
-	long double d = size - 1;
-	long order;
-
-	order = ia64_getf_exp(d);
-	order = order - PAGE_SHIFT - 0xffff + 1;
-	if (order < 0)
-		order = 0;
-	return order;
-}
-
-#endif /* !__ASSEMBLY__ */
-
-#ifdef STRICT_MM_TYPECHECKS
-  /*
-   * These are used to make use of C type-checking..
-   */
-  typedef struct { unsigned long pte; } pte_t;
-  typedef struct { unsigned long pmd; } pmd_t;
-#if CONFIG_PGTABLE_LEVELS == 4
-  typedef struct { unsigned long pud; } pud_t;
-#endif
-  typedef struct { unsigned long pgd; } pgd_t;
-  typedef struct { unsigned long pgprot; } pgprot_t;
-  typedef struct page *pgtable_t;
-
-# define pte_val(x)	((x).pte)
-# define pmd_val(x)	((x).pmd)
-#if CONFIG_PGTABLE_LEVELS == 4
-# define pud_val(x)	((x).pud)
-#endif
-# define pgd_val(x)	((x).pgd)
-# define pgprot_val(x)	((x).pgprot)
-
-# define __pte(x)	((pte_t) { (x) } )
-# define __pmd(x)	((pmd_t) { (x) } )
-# define __pgprot(x)	((pgprot_t) { (x) } )
-
-#else /* !STRICT_MM_TYPECHECKS */
-  /*
-   * .. while these make it easier on the compiler
-   */
-# ifndef __ASSEMBLY__
-    typedef unsigned long pte_t;
-    typedef unsigned long pmd_t;
-    typedef unsigned long pgd_t;
-    typedef unsigned long pgprot_t;
-    typedef struct page *pgtable_t;
-# endif
-
-# define pte_val(x)	(x)
-# define pmd_val(x)	(x)
-# define pgd_val(x)	(x)
-# define pgprot_val(x)	(x)
-
-# define __pte(x)	(x)
-# define __pgd(x)	(x)
-# define __pgprot(x)	(x)
-#endif /* !STRICT_MM_TYPECHECKS */
-
-#define PAGE_OFFSET			RGN_BASE(RGN_KERNEL)
-
-#define VM_DATA_DEFAULT_FLAGS	VM_DATA_FLAGS_TSK_EXEC
-
-#define GATE_ADDR		RGN_BASE(RGN_GATE)
-
-/*
- * 0xa000000000000000+2*PERCPU_PAGE_SIZE
- * - 0xa000000000000000+3*PERCPU_PAGE_SIZE remain unmapped (guard page)
- */
-#define KERNEL_START		 (GATE_ADDR+__IA64_UL_CONST(0x100000000))
-#define PERCPU_ADDR		(-PERCPU_PAGE_SIZE)
-#define LOAD_OFFSET		(KERNEL_START - KERNEL_TR_PAGE_SIZE)
-
-#define __HAVE_ARCH_GATE_AREA	1
-
-#endif /* _ASM_IA64_PAGE_H */
diff --git a/arch/ia64/include/asm/pal.h b/arch/ia64/include/asm/pal.h
deleted file mode 100644
index e6b652f9e45e..000000000000
--- a/arch/ia64/include/asm/pal.h
+++ /dev/null
@@ -1,1827 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _ASM_IA64_PAL_H
-#define _ASM_IA64_PAL_H
-
-/*
- * Processor Abstraction Layer definitions.
- *
- * This is based on Intel IA-64 Architecture Software Developer's Manual rev 1.0
- * chapter 11 IA-64 Processor Abstraction Layer
- *
- * Copyright (C) 1998-2001 Hewlett-Packard Co
- *	David Mosberger-Tang <davidm@hpl.hp.com>
- *	Stephane Eranian <eranian@hpl.hp.com>
- * Copyright (C) 1999 VA Linux Systems
- * Copyright (C) 1999 Walt Drummond <drummond@valinux.com>
- * Copyright (C) 1999 Srinivasa Prasad Thirumalachar <sprasad@sprasad.engr.sgi.com>
- * Copyright (C) 2008 Silicon Graphics, Inc. (SGI)
- *
- * 99/10/01	davidm	Make sure we pass zero for reserved parameters.
- * 00/03/07	davidm	Updated pal_cache_flush() to be in sync with PAL v2.6.
- * 00/03/23     cfleck  Modified processor min-state save area to match updated PAL & SAL info
- * 00/05/24     eranian Updated to latest PAL spec, fix structures bugs, added
- * 00/05/25	eranian Support for stack calls, and static physical calls
- * 00/06/18	eranian Support for stacked physical calls
- * 06/10/26	rja	Support for Intel Itanium Architecture Software Developer's
- *			Manual Rev 2.2 (Jan 2006)
- */
-
-/*
- * Note that some of these calls use a static-register only calling
- * convention which has nothing to do with the regular calling
- * convention.
- */
-#define PAL_CACHE_FLUSH		1	/* flush i/d cache */
-#define PAL_CACHE_INFO		2	/* get detailed i/d cache info */
-#define PAL_CACHE_INIT		3	/* initialize i/d cache */
-#define PAL_CACHE_SUMMARY	4	/* get summary of cache hierarchy */
-#define PAL_MEM_ATTRIB		5	/* list supported memory attributes */
-#define PAL_PTCE_INFO		6	/* purge TLB info */
-#define PAL_VM_INFO		7	/* return supported virtual memory features */
-#define PAL_VM_SUMMARY		8	/* return summary on supported vm features */
-#define PAL_BUS_GET_FEATURES	9	/* return processor bus interface features settings */
-#define PAL_BUS_SET_FEATURES	10	/* set processor bus features */
-#define PAL_DEBUG_INFO		11	/* get number of debug registers */
-#define PAL_FIXED_ADDR		12	/* get fixed component of processors's directed address */
-#define PAL_FREQ_BASE		13	/* base frequency of the platform */
-#define PAL_FREQ_RATIOS		14	/* ratio of processor, bus and ITC frequency */
-#define PAL_PERF_MON_INFO	15	/* return performance monitor info */
-#define PAL_PLATFORM_ADDR	16	/* set processor interrupt block and IO port space addr */
-#define PAL_PROC_GET_FEATURES	17	/* get configurable processor features & settings */
-#define PAL_PROC_SET_FEATURES	18	/* enable/disable configurable processor features */
-#define PAL_RSE_INFO		19	/* return rse information */
-#define PAL_VERSION		20	/* return version of PAL code */
-#define PAL_MC_CLEAR_LOG	21	/* clear all processor log info */
-#define PAL_MC_DRAIN		22	/* drain operations which could result in an MCA */
-#define PAL_MC_EXPECTED		23	/* set/reset expected MCA indicator */
-#define PAL_MC_DYNAMIC_STATE	24	/* get processor dynamic state */
-#define PAL_MC_ERROR_INFO	25	/* get processor MCA info and static state */
-#define PAL_MC_RESUME		26	/* Return to interrupted process */
-#define PAL_MC_REGISTER_MEM	27	/* Register memory for PAL to use during MCAs and inits */
-#define PAL_HALT		28	/* enter the low power HALT state */
-#define PAL_HALT_LIGHT		29	/* enter the low power light halt state*/
-#define PAL_COPY_INFO		30	/* returns info needed to relocate PAL */
-#define PAL_CACHE_LINE_INIT	31	/* init tags & data of cache line */
-#define PAL_PMI_ENTRYPOINT	32	/* register PMI memory entry points with the processor */
-#define PAL_ENTER_IA_32_ENV	33	/* enter IA-32 system environment */
-#define PAL_VM_PAGE_SIZE	34	/* return vm TC and page walker page sizes */
-
-#define PAL_MEM_FOR_TEST	37	/* get amount of memory needed for late processor test */
-#define PAL_CACHE_PROT_INFO	38	/* get i/d cache protection info */
-#define PAL_REGISTER_INFO	39	/* return AR and CR register information*/
-#define PAL_SHUTDOWN		40	/* enter processor shutdown state */
-#define PAL_PREFETCH_VISIBILITY	41	/* Make Processor Prefetches Visible */
-#define PAL_LOGICAL_TO_PHYSICAL 42	/* returns information on logical to physical processor mapping */
-#define PAL_CACHE_SHARED_INFO	43	/* returns information on caches shared by logical processor */
-#define PAL_GET_HW_POLICY	48	/* Get current hardware resource sharing policy */
-#define PAL_SET_HW_POLICY	49	/* Set current hardware resource sharing policy */
-#define PAL_VP_INFO		50	/* Information about virtual processor features */
-#define PAL_MC_HW_TRACKING	51	/* Hardware tracking status */
-
-#define PAL_COPY_PAL		256	/* relocate PAL procedures and PAL PMI */
-#define PAL_HALT_INFO		257	/* return the low power capabilities of processor */
-#define PAL_TEST_PROC		258	/* perform late processor self-test */
-#define PAL_CACHE_READ		259	/* read tag & data of cacheline for diagnostic testing */
-#define PAL_CACHE_WRITE		260	/* write tag & data of cacheline for diagnostic testing */
-#define PAL_VM_TR_READ		261	/* read contents of translation register */
-#define PAL_GET_PSTATE		262	/* get the current P-state */
-#define PAL_SET_PSTATE		263	/* set the P-state */
-#define PAL_BRAND_INFO		274	/* Processor branding information */
-
-#define PAL_GET_PSTATE_TYPE_LASTSET	0
-#define PAL_GET_PSTATE_TYPE_AVGANDRESET	1
-#define PAL_GET_PSTATE_TYPE_AVGNORESET	2
-#define PAL_GET_PSTATE_TYPE_INSTANT	3
-
-#define PAL_MC_ERROR_INJECT	276	/* Injects processor error or returns injection capabilities */
-
-#ifndef __ASSEMBLY__
-
-#include <linux/types.h>
-#include <asm/fpu.h>
-#include <asm/intrinsics.h>
-
-/*
- * Data types needed to pass information into PAL procedures and
- * interpret information returned by them.
- */
-
-/* Return status from the PAL procedure */
-typedef s64				pal_status_t;
-
-#define PAL_STATUS_SUCCESS		0	/* No error */
-#define PAL_STATUS_UNIMPLEMENTED	(-1)	/* Unimplemented procedure */
-#define PAL_STATUS_EINVAL		(-2)	/* Invalid argument */
-#define PAL_STATUS_ERROR		(-3)	/* Error */
-#define PAL_STATUS_CACHE_INIT_FAIL	(-4)	/* Could not initialize the
-						 * specified level and type of
-						 * cache without sideeffects
-						 * and "restrict" was 1
-						 */
-#define PAL_STATUS_REQUIRES_MEMORY	(-9)	/* Call requires PAL memory buffer */
-
-/* Processor cache level in the hierarchy */
-typedef u64				pal_cache_level_t;
-#define PAL_CACHE_LEVEL_L0		0	/* L0 */
-#define PAL_CACHE_LEVEL_L1		1	/* L1 */
-#define PAL_CACHE_LEVEL_L2		2	/* L2 */
-
-
-/* Processor cache type at a particular level in the hierarchy */
-
-typedef u64				pal_cache_type_t;
-#define PAL_CACHE_TYPE_INSTRUCTION	1	/* Instruction cache */
-#define PAL_CACHE_TYPE_DATA		2	/* Data or unified cache */
-#define PAL_CACHE_TYPE_INSTRUCTION_DATA	3	/* Both Data & Instruction */
-
-
-#define PAL_CACHE_FLUSH_INVALIDATE	1	/* Invalidate clean lines */
-#define PAL_CACHE_FLUSH_CHK_INTRS	2	/* check for interrupts/mc while flushing */
-
-/* Processor cache line size in bytes  */
-typedef int				pal_cache_line_size_t;
-
-/* Processor cache line state */
-typedef u64				pal_cache_line_state_t;
-#define PAL_CACHE_LINE_STATE_INVALID	0	/* Invalid */
-#define PAL_CACHE_LINE_STATE_SHARED	1	/* Shared */
-#define PAL_CACHE_LINE_STATE_EXCLUSIVE	2	/* Exclusive */
-#define PAL_CACHE_LINE_STATE_MODIFIED	3	/* Modified */
-
-typedef struct pal_freq_ratio {
-	u32 den, num;		/* numerator & denominator */
-} itc_ratio, proc_ratio;
-
-typedef	union  pal_cache_config_info_1_s {
-	struct {
-		u64		u		: 1,	/* 0 Unified cache ? */
-				at		: 2,	/* 2-1 Cache mem attr*/
-				reserved	: 5,	/* 7-3 Reserved */
-				associativity	: 8,	/* 16-8 Associativity*/
-				line_size	: 8,	/* 23-17 Line size */
-				stride		: 8,	/* 31-24 Stride */
-				store_latency	: 8,	/*39-32 Store latency*/
-				load_latency	: 8,	/* 47-40 Load latency*/
-				store_hints	: 8,	/* 55-48 Store hints*/
-				load_hints	: 8;	/* 63-56 Load hints */
-	} pcci1_bits;
-	u64			pcci1_data;
-} pal_cache_config_info_1_t;
-
-typedef	union  pal_cache_config_info_2_s {
-	struct {
-		u32		cache_size;		/*cache size in bytes*/
-
-
-		u32		alias_boundary	: 8,	/* 39-32 aliased addr
-							 * separation for max
-							 * performance.
-							 */
-				tag_ls_bit	: 8,	/* 47-40 LSb of addr*/
-				tag_ms_bit	: 8,	/* 55-48 MSb of addr*/
-				reserved	: 8;	/* 63-56 Reserved */
-	} pcci2_bits;
-	u64			pcci2_data;
-} pal_cache_config_info_2_t;
-
-
-typedef struct pal_cache_config_info_s {
-	pal_status_t			pcci_status;
-	pal_cache_config_info_1_t	pcci_info_1;
-	pal_cache_config_info_2_t	pcci_info_2;
-	u64				pcci_reserved;
-} pal_cache_config_info_t;
-
-#define pcci_ld_hints		pcci_info_1.pcci1_bits.load_hints
-#define pcci_st_hints		pcci_info_1.pcci1_bits.store_hints
-#define pcci_ld_latency		pcci_info_1.pcci1_bits.load_latency
-#define pcci_st_latency		pcci_info_1.pcci1_bits.store_latency
-#define pcci_stride		pcci_info_1.pcci1_bits.stride
-#define pcci_line_size		pcci_info_1.pcci1_bits.line_size
-#define pcci_assoc		pcci_info_1.pcci1_bits.associativity
-#define pcci_cache_attr		pcci_info_1.pcci1_bits.at
-#define pcci_unified		pcci_info_1.pcci1_bits.u
-#define pcci_tag_msb		pcci_info_2.pcci2_bits.tag_ms_bit
-#define pcci_tag_lsb		pcci_info_2.pcci2_bits.tag_ls_bit
-#define pcci_alias_boundary	pcci_info_2.pcci2_bits.alias_boundary
-#define pcci_cache_size		pcci_info_2.pcci2_bits.cache_size
-
-
-
-/* Possible values for cache attributes */
-
-#define PAL_CACHE_ATTR_WT		0	/* Write through cache */
-#define PAL_CACHE_ATTR_WB		1	/* Write back cache */
-#define PAL_CACHE_ATTR_WT_OR_WB		2	/* Either write thru or write
-						 * back depending on TLB
-						 * memory attributes
-						 */
-
-
-/* Possible values for cache hints */
-
-#define PAL_CACHE_HINT_TEMP_1		0	/* Temporal level 1 */
-#define PAL_CACHE_HINT_NTEMP_1		1	/* Non-temporal level 1 */
-#define PAL_CACHE_HINT_NTEMP_ALL	3	/* Non-temporal all levels */
-
-/* Processor cache protection  information */
-typedef union pal_cache_protection_element_u {
-	u32			pcpi_data;
-	struct {
-		u32		data_bits	: 8, /* # data bits covered by
-						      * each unit of protection
-						      */
-
-				tagprot_lsb	: 6, /* Least -do- */
-				tagprot_msb	: 6, /* Most Sig. tag address
-						      * bit that this
-						      * protection covers.
-						      */
-				prot_bits	: 6, /* # of protection bits */
-				method		: 4, /* Protection method */
-				t_d		: 2; /* Indicates which part
-						      * of the cache this
-						      * protection encoding
-						      * applies.
-						      */
-	} pcp_info;
-} pal_cache_protection_element_t;
-
-#define pcpi_cache_prot_part	pcp_info.t_d
-#define pcpi_prot_method	pcp_info.method
-#define pcpi_prot_bits		pcp_info.prot_bits
-#define pcpi_tagprot_msb	pcp_info.tagprot_msb
-#define pcpi_tagprot_lsb	pcp_info.tagprot_lsb
-#define pcpi_data_bits		pcp_info.data_bits
-
-/* Processor cache part encodings */
-#define PAL_CACHE_PROT_PART_DATA	0	/* Data protection  */
-#define PAL_CACHE_PROT_PART_TAG		1	/* Tag  protection */
-#define PAL_CACHE_PROT_PART_TAG_DATA	2	/* Tag+data protection (tag is
-						 * more significant )
-						 */
-#define PAL_CACHE_PROT_PART_DATA_TAG	3	/* Data+tag protection (data is
-						 * more significant )
-						 */
-#define PAL_CACHE_PROT_PART_MAX		6
-
-
-typedef struct pal_cache_protection_info_s {
-	pal_status_t			pcpi_status;
-	pal_cache_protection_element_t	pcp_info[PAL_CACHE_PROT_PART_MAX];
-} pal_cache_protection_info_t;
-
-
-/* Processor cache protection method encodings */
-#define PAL_CACHE_PROT_METHOD_NONE		0	/* No protection */
-#define PAL_CACHE_PROT_METHOD_ODD_PARITY	1	/* Odd parity */
-#define PAL_CACHE_PROT_METHOD_EVEN_PARITY	2	/* Even parity */
-#define PAL_CACHE_PROT_METHOD_ECC		3	/* ECC protection */
-
-
-/* Processor cache line identification in the hierarchy */
-typedef union pal_cache_line_id_u {
-	u64			pclid_data;
-	struct {
-		u64		cache_type	: 8,	/* 7-0 cache type */
-				level		: 8,	/* 15-8 level of the
-							 * cache in the
-							 * hierarchy.
-							 */
-				way		: 8,	/* 23-16 way in the set
-							 */
-				part		: 8,	/* 31-24 part of the
-							 * cache
-							 */
-				reserved	: 32;	/* 63-32 is reserved*/
-	} pclid_info_read;
-	struct {
-		u64		cache_type	: 8,	/* 7-0 cache type */
-				level		: 8,	/* 15-8 level of the
-							 * cache in the
-							 * hierarchy.
-							 */
-				way		: 8,	/* 23-16 way in the set
-							 */
-				part		: 8,	/* 31-24 part of the
-							 * cache
-							 */
-				mesi		: 8,	/* 39-32 cache line
-							 * state
-							 */
-				start		: 8,	/* 47-40 lsb of data to
-							 * invert
-							 */
-				length		: 8,	/* 55-48 #bits to
-							 * invert
-							 */
-				trigger		: 8;	/* 63-56 Trigger error
-							 * by doing a load
-							 * after the write
-							 */
-
-	} pclid_info_write;
-} pal_cache_line_id_u_t;
-
-#define pclid_read_part		pclid_info_read.part
-#define pclid_read_way		pclid_info_read.way
-#define pclid_read_level	pclid_info_read.level
-#define pclid_read_cache_type	pclid_info_read.cache_type
-
-#define pclid_write_trigger	pclid_info_write.trigger
-#define pclid_write_length	pclid_info_write.length
-#define pclid_write_start	pclid_info_write.start
-#define pclid_write_mesi	pclid_info_write.mesi
-#define pclid_write_part	pclid_info_write.part
-#define pclid_write_way		pclid_info_write.way
-#define pclid_write_level	pclid_info_write.level
-#define pclid_write_cache_type	pclid_info_write.cache_type
-
-/* Processor cache line part encodings */
-#define PAL_CACHE_LINE_ID_PART_DATA		0	/* Data */
-#define PAL_CACHE_LINE_ID_PART_TAG		1	/* Tag */
-#define PAL_CACHE_LINE_ID_PART_DATA_PROT	2	/* Data protection */
-#define PAL_CACHE_LINE_ID_PART_TAG_PROT		3	/* Tag protection */
-#define PAL_CACHE_LINE_ID_PART_DATA_TAG_PROT	4	/* Data+tag
-							 * protection
-							 */
-typedef struct pal_cache_line_info_s {
-	pal_status_t		pcli_status;		/* Return status of the read cache line
-							 * info call.
-							 */
-	u64			pcli_data;		/* 64-bit data, tag, protection bits .. */
-	u64			pcli_data_len;		/* data length in bits */
-	pal_cache_line_state_t	pcli_cache_line_state;	/* mesi state */
-
-} pal_cache_line_info_t;
-
-
-/* Machine Check related crap */
-
-/* Pending event status bits  */
-typedef u64					pal_mc_pending_events_t;
-
-#define PAL_MC_PENDING_MCA			(1 << 0)
-#define PAL_MC_PENDING_INIT			(1 << 1)
-
-/* Error information type */
-typedef u64					pal_mc_info_index_t;
-
-#define PAL_MC_INFO_PROCESSOR			0	/* Processor */
-#define PAL_MC_INFO_CACHE_CHECK			1	/* Cache check */
-#define PAL_MC_INFO_TLB_CHECK			2	/* Tlb check */
-#define PAL_MC_INFO_BUS_CHECK			3	/* Bus check */
-#define PAL_MC_INFO_REQ_ADDR			4	/* Requestor address */
-#define PAL_MC_INFO_RESP_ADDR			5	/* Responder address */
-#define PAL_MC_INFO_TARGET_ADDR			6	/* Target address */
-#define PAL_MC_INFO_IMPL_DEP			7	/* Implementation
-							 * dependent
-							 */
-
-#define PAL_TLB_CHECK_OP_PURGE			8
-
-typedef struct pal_process_state_info_s {
-	u64		reserved1	: 2,
-			rz		: 1,	/* PAL_CHECK processor
-						 * rendezvous
-						 * successful.
-						 */
-
-			ra		: 1,	/* PAL_CHECK attempted
-						 * a rendezvous.
-						 */
-			me		: 1,	/* Distinct multiple
-						 * errors occurred
-						 */
-
-			mn		: 1,	/* Min. state save
-						 * area has been
-						 * registered with PAL
-						 */
-
-			sy		: 1,	/* Storage integrity
-						 * synched
-						 */
-
-
-			co		: 1,	/* Continuable */
-			ci		: 1,	/* MC isolated */
-			us		: 1,	/* Uncontained storage
-						 * damage.
-						 */
-
-
-			hd		: 1,	/* Non-essential hw
-						 * lost (no loss of
-						 * functionality)
-						 * causing the
-						 * processor to run in
-						 * degraded mode.
-						 */
-
-			tl		: 1,	/* 1 => MC occurred
-						 * after an instr was
-						 * executed but before
-						 * the trap that
-						 * resulted from instr
-						 * execution was
-						 * generated.
-						 * (Trap Lost )
-						 */
-			mi		: 1,	/* More information available
-						 * call PAL_MC_ERROR_INFO
-						 */
-			pi		: 1,	/* Precise instruction pointer */
-			pm		: 1,	/* Precise min-state save area */
-
-			dy		: 1,	/* Processor dynamic
-						 * state valid
-						 */
-
-
-			in		: 1,	/* 0 = MC, 1 = INIT */
-			rs		: 1,	/* RSE valid */
-			cm		: 1,	/* MC corrected */
-			ex		: 1,	/* MC is expected */
-			cr		: 1,	/* Control regs valid*/
-			pc		: 1,	/* Perf cntrs valid */
-			dr		: 1,	/* Debug regs valid */
-			tr		: 1,	/* Translation regs
-						 * valid
-						 */
-			rr		: 1,	/* Region regs valid */
-			ar		: 1,	/* App regs valid */
-			br		: 1,	/* Branch regs valid */
-			pr		: 1,	/* Predicate registers
-						 * valid
-						 */
-
-			fp		: 1,	/* fp registers valid*/
-			b1		: 1,	/* Preserved bank one
-						 * general registers
-						 * are valid
-						 */
-			b0		: 1,	/* Preserved bank zero
-						 * general registers
-						 * are valid
-						 */
-			gr		: 1,	/* General registers
-						 * are valid
-						 * (excl. banked regs)
-						 */
-			dsize		: 16,	/* size of dynamic
-						 * state returned
-						 * by the processor
-						 */
-
-			se		: 1,	/* Shared error.  MCA in a
-						   shared structure */
-			reserved2	: 10,
-			cc		: 1,	/* Cache check */
-			tc		: 1,	/* TLB check */
-			bc		: 1,	/* Bus check */
-			rc		: 1,	/* Register file check */
-			uc		: 1;	/* Uarch check */
-
-} pal_processor_state_info_t;
-
-typedef struct pal_cache_check_info_s {
-	u64		op		: 4,	/* Type of cache
-						 * operation that
-						 * caused the machine
-						 * check.
-						 */
-			level		: 2,	/* Cache level */
-			reserved1	: 2,
-			dl		: 1,	/* Failure in data part
-						 * of cache line
-						 */
-			tl		: 1,	/* Failure in tag part
-						 * of cache line
-						 */
-			dc		: 1,	/* Failure in dcache */
-			ic		: 1,	/* Failure in icache */
-			mesi		: 3,	/* Cache line state */
-			mv		: 1,	/* mesi valid */
-			way		: 5,	/* Way in which the
-						 * error occurred
-						 */
-			wiv		: 1,	/* Way field valid */
-			reserved2	: 1,
-			dp		: 1,	/* Data poisoned on MBE */
-			reserved3	: 6,
-			hlth		: 2,	/* Health indicator */
-
-			index		: 20,	/* Cache line index */
-			reserved4	: 2,
-
-			is		: 1,	/* instruction set (1 == ia32) */
-			iv		: 1,	/* instruction set field valid */
-			pl		: 2,	/* privilege level */
-			pv		: 1,	/* privilege level field valid */
-			mcc		: 1,	/* Machine check corrected */
-			tv		: 1,	/* Target address
-						 * structure is valid
-						 */
-			rq		: 1,	/* Requester identifier
-						 * structure is valid
-						 */
-			rp		: 1,	/* Responder identifier
-						 * structure is valid
-						 */
-			pi		: 1;	/* Precise instruction pointer
-						 * structure is valid
-						 */
-} pal_cache_check_info_t;
-
-typedef struct pal_tlb_check_info_s {
-
-	u64		tr_slot		: 8,	/* Slot# of TR where
-						 * error occurred
-						 */
-			trv		: 1,	/* tr_slot field is valid */
-			reserved1	: 1,
-			level		: 2,	/* TLB level where failure occurred */
-			reserved2	: 4,
-			dtr		: 1,	/* Fail in data TR */
-			itr		: 1,	/* Fail in inst TR */
-			dtc		: 1,	/* Fail in data TC */
-			itc		: 1,	/* Fail in inst. TC */
-			op		: 4,	/* Cache operation */
-			reserved3	: 6,
-			hlth		: 2,	/* Health indicator */
-			reserved4	: 22,
-
-			is		: 1,	/* instruction set (1 == ia32) */
-			iv		: 1,	/* instruction set field valid */
-			pl		: 2,	/* privilege level */
-			pv		: 1,	/* privilege level field valid */
-			mcc		: 1,	/* Machine check corrected */
-			tv		: 1,	/* Target address
-						 * structure is valid
-						 */
-			rq		: 1,	/* Requester identifier
-						 * structure is valid
-						 */
-			rp		: 1,	/* Responder identifier
-						 * structure is valid
-						 */
-			pi		: 1;	/* Precise instruction pointer
-						 * structure is valid
-						 */
-} pal_tlb_check_info_t;
-
-typedef struct pal_bus_check_info_s {
-	u64		size		: 5,	/* Xaction size */
-			ib		: 1,	/* Internal bus error */
-			eb		: 1,	/* External bus error */
-			cc		: 1,	/* Error occurred
-						 * during cache-cache
-						 * transfer.
-						 */
-			type		: 8,	/* Bus xaction type*/
-			sev		: 5,	/* Bus error severity*/
-			hier		: 2,	/* Bus hierarchy level */
-			dp		: 1,	/* Data poisoned on MBE */
-			bsi		: 8,	/* Bus error status
-						 * info
-						 */
-			reserved2	: 22,
-
-			is		: 1,	/* instruction set (1 == ia32) */
-			iv		: 1,	/* instruction set field valid */
-			pl		: 2,	/* privilege level */
-			pv		: 1,	/* privilege level field valid */
-			mcc		: 1,	/* Machine check corrected */
-			tv		: 1,	/* Target address
-						 * structure is valid
-						 */
-			rq		: 1,	/* Requester identifier
-						 * structure is valid
-						 */
-			rp		: 1,	/* Responder identifier
-						 * structure is valid
-						 */
-			pi		: 1;	/* Precise instruction pointer
-						 * structure is valid
-						 */
-} pal_bus_check_info_t;
-
-typedef struct pal_reg_file_check_info_s {
-	u64		id		: 4,	/* Register file identifier */
-			op		: 4,	/* Type of register
-						 * operation that
-						 * caused the machine
-						 * check.
-						 */
-			reg_num		: 7,	/* Register number */
-			rnv		: 1,	/* reg_num valid */
-			reserved2	: 38,
-
-			is		: 1,	/* instruction set (1 == ia32) */
-			iv		: 1,	/* instruction set field valid */
-			pl		: 2,	/* privilege level */
-			pv		: 1,	/* privilege level field valid */
-			mcc		: 1,	/* Machine check corrected */
-			reserved3	: 3,
-			pi		: 1;	/* Precise instruction pointer
-						 * structure is valid
-						 */
-} pal_reg_file_check_info_t;
-
-typedef struct pal_uarch_check_info_s {
-	u64		sid		: 5,	/* Structure identification */
-			level		: 3,	/* Level of failure */
-			array_id	: 4,	/* Array identification */
-			op		: 4,	/* Type of
-						 * operation that
-						 * caused the machine
-						 * check.
-						 */
-			way		: 6,	/* Way of structure */
-			wv		: 1,	/* way valid */
-			xv		: 1,	/* index valid */
-			reserved1	: 6,
-			hlth		: 2,	/* Health indicator */
-			index		: 8,	/* Index or set of the uarch
-						 * structure that failed.
-						 */
-			reserved2	: 24,
-
-			is		: 1,	/* instruction set (1 == ia32) */
-			iv		: 1,	/* instruction set field valid */
-			pl		: 2,	/* privilege level */
-			pv		: 1,	/* privilege level field valid */
-			mcc		: 1,	/* Machine check corrected */
-			tv		: 1,	/* Target address
-						 * structure is valid
-						 */
-			rq		: 1,	/* Requester identifier
-						 * structure is valid
-						 */
-			rp		: 1,	/* Responder identifier
-						 * structure is valid
-						 */
-			pi		: 1;	/* Precise instruction pointer
-						 * structure is valid
-						 */
-} pal_uarch_check_info_t;
-
-typedef union pal_mc_error_info_u {
-	u64				pmei_data;
-	pal_processor_state_info_t	pme_processor;
-	pal_cache_check_info_t		pme_cache;
-	pal_tlb_check_info_t		pme_tlb;
-	pal_bus_check_info_t		pme_bus;
-	pal_reg_file_check_info_t	pme_reg_file;
-	pal_uarch_check_info_t		pme_uarch;
-} pal_mc_error_info_t;
-
-#define pmci_proc_unknown_check			pme_processor.uc
-#define pmci_proc_bus_check			pme_processor.bc
-#define pmci_proc_tlb_check			pme_processor.tc
-#define pmci_proc_cache_check			pme_processor.cc
-#define pmci_proc_dynamic_state_size		pme_processor.dsize
-#define pmci_proc_gpr_valid			pme_processor.gr
-#define pmci_proc_preserved_bank0_gpr_valid	pme_processor.b0
-#define pmci_proc_preserved_bank1_gpr_valid	pme_processor.b1
-#define pmci_proc_fp_valid			pme_processor.fp
-#define pmci_proc_predicate_regs_valid		pme_processor.pr
-#define pmci_proc_branch_regs_valid		pme_processor.br
-#define pmci_proc_app_regs_valid		pme_processor.ar
-#define pmci_proc_region_regs_valid		pme_processor.rr
-#define pmci_proc_translation_regs_valid	pme_processor.tr
-#define pmci_proc_debug_regs_valid		pme_processor.dr
-#define pmci_proc_perf_counters_valid		pme_processor.pc
-#define pmci_proc_control_regs_valid		pme_processor.cr
-#define pmci_proc_machine_check_expected	pme_processor.ex
-#define pmci_proc_machine_check_corrected	pme_processor.cm
-#define pmci_proc_rse_valid			pme_processor.rs
-#define pmci_proc_machine_check_or_init		pme_processor.in
-#define pmci_proc_dynamic_state_valid		pme_processor.dy
-#define pmci_proc_operation			pme_processor.op
-#define pmci_proc_trap_lost			pme_processor.tl
-#define pmci_proc_hardware_damage		pme_processor.hd
-#define pmci_proc_uncontained_storage_damage	pme_processor.us
-#define pmci_proc_machine_check_isolated	pme_processor.ci
-#define pmci_proc_continuable			pme_processor.co
-#define pmci_proc_storage_intergrity_synced	pme_processor.sy
-#define pmci_proc_min_state_save_area_regd	pme_processor.mn
-#define	pmci_proc_distinct_multiple_errors	pme_processor.me
-#define pmci_proc_pal_attempted_rendezvous	pme_processor.ra
-#define pmci_proc_pal_rendezvous_complete	pme_processor.rz
-
-
-#define pmci_cache_level			pme_cache.level
-#define pmci_cache_line_state			pme_cache.mesi
-#define pmci_cache_line_state_valid		pme_cache.mv
-#define pmci_cache_line_index			pme_cache.index
-#define pmci_cache_instr_cache_fail		pme_cache.ic
-#define pmci_cache_data_cache_fail		pme_cache.dc
-#define pmci_cache_line_tag_fail		pme_cache.tl
-#define pmci_cache_line_data_fail		pme_cache.dl
-#define pmci_cache_operation			pme_cache.op
-#define pmci_cache_way_valid			pme_cache.wv
-#define pmci_cache_target_address_valid		pme_cache.tv
-#define pmci_cache_way				pme_cache.way
-#define pmci_cache_mc				pme_cache.mc
-
-#define pmci_tlb_instr_translation_cache_fail	pme_tlb.itc
-#define pmci_tlb_data_translation_cache_fail	pme_tlb.dtc
-#define pmci_tlb_instr_translation_reg_fail	pme_tlb.itr
-#define pmci_tlb_data_translation_reg_fail	pme_tlb.dtr
-#define pmci_tlb_translation_reg_slot		pme_tlb.tr_slot
-#define pmci_tlb_mc				pme_tlb.mc
-
-#define pmci_bus_status_info			pme_bus.bsi
-#define pmci_bus_req_address_valid		pme_bus.rq
-#define pmci_bus_resp_address_valid		pme_bus.rp
-#define pmci_bus_target_address_valid		pme_bus.tv
-#define pmci_bus_error_severity			pme_bus.sev
-#define pmci_bus_transaction_type		pme_bus.type
-#define pmci_bus_cache_cache_transfer		pme_bus.cc
-#define pmci_bus_transaction_size		pme_bus.size
-#define pmci_bus_internal_error			pme_bus.ib
-#define pmci_bus_external_error			pme_bus.eb
-#define pmci_bus_mc				pme_bus.mc
-
-/*
- * NOTE: this min_state_save area struct only includes the 1KB
- * architectural state save area.  The other 3 KB is scratch space
- * for PAL.
- */
-
-struct pal_min_state_area {
-	u64	pmsa_nat_bits;		/* nat bits for saved GRs  */
-	u64	pmsa_gr[15];		/* GR1	- GR15		   */
-	u64	pmsa_bank0_gr[16];	/* GR16 - GR31		   */
-	u64	pmsa_bank1_gr[16];	/* GR16 - GR31		   */
-	u64	pmsa_pr;		/* predicate registers	   */
-	u64	pmsa_br0;		/* branch register 0	   */
-	u64	pmsa_rsc;		/* ar.rsc		   */
-	u64	pmsa_iip;		/* cr.iip		   */
-	u64	pmsa_ipsr;		/* cr.ipsr		   */
-	u64	pmsa_ifs;		/* cr.ifs		   */
-	u64	pmsa_xip;		/* previous iip		   */
-	u64	pmsa_xpsr;		/* previous psr		   */
-	u64	pmsa_xfs;		/* previous ifs		   */
-	u64	pmsa_br1;		/* branch register 1	   */
-	u64	pmsa_reserved[70];	/* pal_min_state_area should total to 1KB */
-};
-
-
-struct ia64_pal_retval {
-	/*
-	 * A zero status value indicates call completed without error.
-	 * A negative status value indicates reason of call failure.
-	 * A positive status value indicates success but an
-	 * informational value should be printed (e.g., "reboot for
-	 * change to take effect").
-	 */
-	s64 status;
-	u64 v0;
-	u64 v1;
-	u64 v2;
-};
-
-/*
- * Note: Currently unused PAL arguments are generally labeled
- * "reserved" so the value specified in the PAL documentation
- * (generally 0) MUST be passed.  Reserved parameters are not optional
- * parameters.
- */
-extern struct ia64_pal_retval ia64_pal_call_static (u64, u64, u64, u64);
-extern struct ia64_pal_retval ia64_pal_call_stacked (u64, u64, u64, u64);
-extern struct ia64_pal_retval ia64_pal_call_phys_static (u64, u64, u64, u64);
-extern struct ia64_pal_retval ia64_pal_call_phys_stacked (u64, u64, u64, u64);
-extern void ia64_save_scratch_fpregs (struct ia64_fpreg *);
-extern void ia64_load_scratch_fpregs (struct ia64_fpreg *);
-
-#define PAL_CALL(iprv,a0,a1,a2,a3) do {			\
-	struct ia64_fpreg fr[6];			\
-	ia64_save_scratch_fpregs(fr);			\
-	iprv = ia64_pal_call_static(a0, a1, a2, a3);	\
-	ia64_load_scratch_fpregs(fr);			\
-} while (0)
-
-#define PAL_CALL_STK(iprv,a0,a1,a2,a3) do {		\
-	struct ia64_fpreg fr[6];			\
-	ia64_save_scratch_fpregs(fr);			\
-	iprv = ia64_pal_call_stacked(a0, a1, a2, a3);	\
-	ia64_load_scratch_fpregs(fr);			\
-} while (0)
-
-#define PAL_CALL_PHYS(iprv,a0,a1,a2,a3) do {			\
-	struct ia64_fpreg fr[6];				\
-	ia64_save_scratch_fpregs(fr);				\
-	iprv = ia64_pal_call_phys_static(a0, a1, a2, a3);	\
-	ia64_load_scratch_fpregs(fr);				\
-} while (0)
-
-#define PAL_CALL_PHYS_STK(iprv,a0,a1,a2,a3) do {		\
-	struct ia64_fpreg fr[6];				\
-	ia64_save_scratch_fpregs(fr);				\
-	iprv = ia64_pal_call_phys_stacked(a0, a1, a2, a3);	\
-	ia64_load_scratch_fpregs(fr);				\
-} while (0)
-
-typedef int (*ia64_pal_handler) (u64, ...);
-extern ia64_pal_handler ia64_pal;
-extern void ia64_pal_handler_init (void *);
-
-extern ia64_pal_handler ia64_pal;
-
-extern pal_cache_config_info_t		l0d_cache_config_info;
-extern pal_cache_config_info_t		l0i_cache_config_info;
-extern pal_cache_config_info_t		l1_cache_config_info;
-extern pal_cache_config_info_t		l2_cache_config_info;
-
-extern pal_cache_protection_info_t	l0d_cache_protection_info;
-extern pal_cache_protection_info_t	l0i_cache_protection_info;
-extern pal_cache_protection_info_t	l1_cache_protection_info;
-extern pal_cache_protection_info_t	l2_cache_protection_info;
-
-extern pal_cache_config_info_t		pal_cache_config_info_get(pal_cache_level_t,
-								  pal_cache_type_t);
-
-extern pal_cache_protection_info_t	pal_cache_protection_info_get(pal_cache_level_t,
-								      pal_cache_type_t);
-
-
-extern void				pal_error(int);
-
-
-/* Useful wrappers for the current list of pal procedures */
-
-typedef union pal_bus_features_u {
-	u64	pal_bus_features_val;
-	struct {
-		u64	pbf_reserved1				:	29;
-		u64	pbf_req_bus_parking			:	1;
-		u64	pbf_bus_lock_mask			:	1;
-		u64	pbf_enable_half_xfer_rate		:	1;
-		u64	pbf_reserved2				:	20;
-		u64	pbf_enable_shared_line_replace		:	1;
-		u64	pbf_enable_exclusive_line_replace	:	1;
-		u64	pbf_disable_xaction_queueing		:	1;
-		u64	pbf_disable_resp_err_check		:	1;
-		u64	pbf_disable_berr_check			:	1;
-		u64	pbf_disable_bus_req_internal_err_signal	:	1;
-		u64	pbf_disable_bus_req_berr_signal		:	1;
-		u64	pbf_disable_bus_init_event_check	:	1;
-		u64	pbf_disable_bus_init_event_signal	:	1;
-		u64	pbf_disable_bus_addr_err_check		:	1;
-		u64	pbf_disable_bus_addr_err_signal		:	1;
-		u64	pbf_disable_bus_data_err_check		:	1;
-	} pal_bus_features_s;
-} pal_bus_features_u_t;
-
-extern void pal_bus_features_print (u64);
-
-/* Provide information about configurable processor bus features */
-static inline s64
-ia64_pal_bus_get_features (pal_bus_features_u_t *features_avail,
-			   pal_bus_features_u_t *features_status,
-			   pal_bus_features_u_t *features_control)
-{
-	struct ia64_pal_retval iprv;
-	PAL_CALL_PHYS(iprv, PAL_BUS_GET_FEATURES, 0, 0, 0);
-	if (features_avail)
-		features_avail->pal_bus_features_val = iprv.v0;
-	if (features_status)
-		features_status->pal_bus_features_val = iprv.v1;
-	if (features_control)
-		features_control->pal_bus_features_val = iprv.v2;
-	return iprv.status;
-}
-
-/* Enables/disables specific processor bus features */
-static inline s64
-ia64_pal_bus_set_features (pal_bus_features_u_t feature_select)
-{
-	struct ia64_pal_retval iprv;
-	PAL_CALL_PHYS(iprv, PAL_BUS_SET_FEATURES, feature_select.pal_bus_features_val, 0, 0);
-	return iprv.status;
-}
-
-/* Get detailed cache information */
-static inline s64
-ia64_pal_cache_config_info (u64 cache_level, u64 cache_type, pal_cache_config_info_t *conf)
-{
-	struct ia64_pal_retval iprv;
-
-	PAL_CALL(iprv, PAL_CACHE_INFO, cache_level, cache_type, 0);
-
-	if (iprv.status == 0) {
-		conf->pcci_status                 = iprv.status;
-		conf->pcci_info_1.pcci1_data      = iprv.v0;
-		conf->pcci_info_2.pcci2_data      = iprv.v1;
-		conf->pcci_reserved               = iprv.v2;
-	}
-	return iprv.status;
-
-}
-
-/* Get detailed cche protection information */
-static inline s64
-ia64_pal_cache_prot_info (u64 cache_level, u64 cache_type, pal_cache_protection_info_t *prot)
-{
-	struct ia64_pal_retval iprv;
-
-	PAL_CALL(iprv, PAL_CACHE_PROT_INFO, cache_level, cache_type, 0);
-
-	if (iprv.status == 0) {
-		prot->pcpi_status           = iprv.status;
-		prot->pcp_info[0].pcpi_data = iprv.v0 & 0xffffffff;
-		prot->pcp_info[1].pcpi_data = iprv.v0 >> 32;
-		prot->pcp_info[2].pcpi_data = iprv.v1 & 0xffffffff;
-		prot->pcp_info[3].pcpi_data = iprv.v1 >> 32;
-		prot->pcp_info[4].pcpi_data = iprv.v2 & 0xffffffff;
-		prot->pcp_info[5].pcpi_data = iprv.v2 >> 32;
-	}
-	return iprv.status;
-}
-
-/*
- * Flush the processor instruction or data caches.  *PROGRESS must be
- * initialized to zero before calling this for the first time..
- */
-static inline s64
-ia64_pal_cache_flush (u64 cache_type, u64 invalidate, u64 *progress, u64 *vector)
-{
-	struct ia64_pal_retval iprv;
-	PAL_CALL(iprv, PAL_CACHE_FLUSH, cache_type, invalidate, *progress);
-	if (vector)
-		*vector = iprv.v0;
-	*progress = iprv.v1;
-	return iprv.status;
-}
-
-
-/* Initialize the processor controlled caches */
-static inline s64
-ia64_pal_cache_init (u64 level, u64 cache_type, u64 rest)
-{
-	struct ia64_pal_retval iprv;
-	PAL_CALL(iprv, PAL_CACHE_INIT, level, cache_type, rest);
-	return iprv.status;
-}
-
-/* Initialize the tags and data of a data or unified cache line of
- * processor controlled cache to known values without the availability
- * of backing memory.
- */
-static inline s64
-ia64_pal_cache_line_init (u64 physical_addr, u64 data_value)
-{
-	struct ia64_pal_retval iprv;
-	PAL_CALL(iprv, PAL_CACHE_LINE_INIT, physical_addr, data_value, 0);
-	return iprv.status;
-}
-
-
-/* Read the data and tag of a processor controlled cache line for diags */
-static inline s64
-ia64_pal_cache_read (pal_cache_line_id_u_t line_id, u64 physical_addr)
-{
-	struct ia64_pal_retval iprv;
-	PAL_CALL_PHYS_STK(iprv, PAL_CACHE_READ, line_id.pclid_data,
-				physical_addr, 0);
-	return iprv.status;
-}
-
-/* Return summary information about the hierarchy of caches controlled by the processor */
-static inline long ia64_pal_cache_summary(unsigned long *cache_levels,
-						unsigned long *unique_caches)
-{
-	struct ia64_pal_retval iprv;
-	PAL_CALL(iprv, PAL_CACHE_SUMMARY, 0, 0, 0);
-	if (cache_levels)
-		*cache_levels = iprv.v0;
-	if (unique_caches)
-		*unique_caches = iprv.v1;
-	return iprv.status;
-}
-
-/* Write the data and tag of a processor-controlled cache line for diags */
-static inline s64
-ia64_pal_cache_write (pal_cache_line_id_u_t line_id, u64 physical_addr, u64 data)
-{
-	struct ia64_pal_retval iprv;
-	PAL_CALL_PHYS_STK(iprv, PAL_CACHE_WRITE, line_id.pclid_data,
-				physical_addr, data);
-	return iprv.status;
-}
-
-
-/* Return the parameters needed to copy relocatable PAL procedures from ROM to memory */
-static inline s64
-ia64_pal_copy_info (u64 copy_type, u64 num_procs, u64 num_iopics,
-		    u64 *buffer_size, u64 *buffer_align)
-{
-	struct ia64_pal_retval iprv;
-	PAL_CALL(iprv, PAL_COPY_INFO, copy_type, num_procs, num_iopics);
-	if (buffer_size)
-		*buffer_size = iprv.v0;
-	if (buffer_align)
-		*buffer_align = iprv.v1;
-	return iprv.status;
-}
-
-/* Copy relocatable PAL procedures from ROM to memory */
-static inline s64
-ia64_pal_copy_pal (u64 target_addr, u64 alloc_size, u64 processor, u64 *pal_proc_offset)
-{
-	struct ia64_pal_retval iprv;
-	PAL_CALL(iprv, PAL_COPY_PAL, target_addr, alloc_size, processor);
-	if (pal_proc_offset)
-		*pal_proc_offset = iprv.v0;
-	return iprv.status;
-}
-
-/* Return the number of instruction and data debug register pairs */
-static inline long ia64_pal_debug_info(unsigned long *inst_regs,
-						unsigned long *data_regs)
-{
-	struct ia64_pal_retval iprv;
-	PAL_CALL(iprv, PAL_DEBUG_INFO, 0, 0, 0);
-	if (inst_regs)
-		*inst_regs = iprv.v0;
-	if (data_regs)
-		*data_regs = iprv.v1;
-
-	return iprv.status;
-}
-
-#ifdef TBD
-/* Switch from IA64-system environment to IA-32 system environment */
-static inline s64
-ia64_pal_enter_ia32_env (ia32_env1, ia32_env2, ia32_env3)
-{
-	struct ia64_pal_retval iprv;
-	PAL_CALL(iprv, PAL_ENTER_IA_32_ENV, ia32_env1, ia32_env2, ia32_env3);
-	return iprv.status;
-}
-#endif
-
-/* Get unique geographical address of this processor on its bus */
-static inline s64
-ia64_pal_fixed_addr (u64 *global_unique_addr)
-{
-	struct ia64_pal_retval iprv;
-	PAL_CALL(iprv, PAL_FIXED_ADDR, 0, 0, 0);
-	if (global_unique_addr)
-		*global_unique_addr = iprv.v0;
-	return iprv.status;
-}
-
-/* Get base frequency of the platform if generated by the processor */
-static inline long ia64_pal_freq_base(unsigned long *platform_base_freq)
-{
-	struct ia64_pal_retval iprv;
-	PAL_CALL(iprv, PAL_FREQ_BASE, 0, 0, 0);
-	if (platform_base_freq)
-		*platform_base_freq = iprv.v0;
-	return iprv.status;
-}
-
-/*
- * Get the ratios for processor frequency, bus frequency and interval timer to
- * the base frequency of the platform
- */
-static inline s64
-ia64_pal_freq_ratios (struct pal_freq_ratio *proc_ratio, struct pal_freq_ratio *bus_ratio,
-		      struct pal_freq_ratio *itc_ratio)
-{
-	struct ia64_pal_retval iprv;
-	PAL_CALL(iprv, PAL_FREQ_RATIOS, 0, 0, 0);
-	if (proc_ratio)
-		*(u64 *)proc_ratio = iprv.v0;
-	if (bus_ratio)
-		*(u64 *)bus_ratio = iprv.v1;
-	if (itc_ratio)
-		*(u64 *)itc_ratio = iprv.v2;
-	return iprv.status;
-}
-
-/*
- * Get the current hardware resource sharing policy of the processor
- */
-static inline s64
-ia64_pal_get_hw_policy (u64 proc_num, u64 *cur_policy, u64 *num_impacted,
-			u64 *la)
-{
-	struct ia64_pal_retval iprv;
-	PAL_CALL(iprv, PAL_GET_HW_POLICY, proc_num, 0, 0);
-	if (cur_policy)
-		*cur_policy = iprv.v0;
-	if (num_impacted)
-		*num_impacted = iprv.v1;
-	if (la)
-		*la = iprv.v2;
-	return iprv.status;
-}
-
-/* Make the processor enter HALT or one of the implementation dependent low
- * power states where prefetching and execution are suspended and cache and
- * TLB coherency is not maintained.
- */
-static inline s64
-ia64_pal_halt (u64 halt_state)
-{
-	struct ia64_pal_retval iprv;
-	PAL_CALL(iprv, PAL_HALT, halt_state, 0, 0);
-	return iprv.status;
-}
-
-typedef union pal_power_mgmt_info_u {
-	u64			ppmi_data;
-	struct {
-	       u64		exit_latency		: 16,
-				entry_latency		: 16,
-				power_consumption	: 28,
-				im			: 1,
-				co			: 1,
-				reserved		: 2;
-	} pal_power_mgmt_info_s;
-} pal_power_mgmt_info_u_t;
-
-/* Return information about processor's optional power management capabilities. */
-static inline s64
-ia64_pal_halt_info (pal_power_mgmt_info_u_t *power_buf)
-{
-	struct ia64_pal_retval iprv;
-	PAL_CALL_STK(iprv, PAL_HALT_INFO, (unsigned long) power_buf, 0, 0);
-	return iprv.status;
-}
-
-/* Get the current P-state information */
-static inline s64
-ia64_pal_get_pstate (u64 *pstate_index, unsigned long type)
-{
-	struct ia64_pal_retval iprv;
-	PAL_CALL_STK(iprv, PAL_GET_PSTATE, type, 0, 0);
-	*pstate_index = iprv.v0;
-	return iprv.status;
-}
-
-/* Set the P-state */
-static inline s64
-ia64_pal_set_pstate (u64 pstate_index)
-{
-	struct ia64_pal_retval iprv;
-	PAL_CALL_STK(iprv, PAL_SET_PSTATE, pstate_index, 0, 0);
-	return iprv.status;
-}
-
-/* Processor branding information*/
-static inline s64
-ia64_pal_get_brand_info (char *brand_info)
-{
-	struct ia64_pal_retval iprv;
-	PAL_CALL_STK(iprv, PAL_BRAND_INFO, 0, (u64)brand_info, 0);
-	return iprv.status;
-}
-
-/* Cause the processor to enter LIGHT HALT state, where prefetching and execution are
- * suspended, but cache and TLB coherency is maintained.
- */
-static inline s64
-ia64_pal_halt_light (void)
-{
-	struct ia64_pal_retval iprv;
-	PAL_CALL(iprv, PAL_HALT_LIGHT, 0, 0, 0);
-	return iprv.status;
-}
-
-/* Clear all the processor error logging   registers and reset the indicator that allows
- * the error logging registers to be written. This procedure also checks the pending
- * machine check bit and pending INIT bit and reports their states.
- */
-static inline s64
-ia64_pal_mc_clear_log (u64 *pending_vector)
-{
-	struct ia64_pal_retval iprv;
-	PAL_CALL(iprv, PAL_MC_CLEAR_LOG, 0, 0, 0);
-	if (pending_vector)
-		*pending_vector = iprv.v0;
-	return iprv.status;
-}
-
-/* Ensure that all outstanding transactions in a processor are completed or that any
- * MCA due to thes outstanding transaction is taken.
- */
-static inline s64
-ia64_pal_mc_drain (void)
-{
-	struct ia64_pal_retval iprv;
-	PAL_CALL(iprv, PAL_MC_DRAIN, 0, 0, 0);
-	return iprv.status;
-}
-
-/* Return the machine check dynamic processor state */
-static inline s64
-ia64_pal_mc_dynamic_state (u64 info_type, u64 dy_buffer, u64 *size)
-{
-	struct ia64_pal_retval iprv;
-	PAL_CALL(iprv, PAL_MC_DYNAMIC_STATE, info_type, dy_buffer, 0);
-	if (size)
-		*size = iprv.v0;
-	return iprv.status;
-}
-
-/* Return processor machine check information */
-static inline s64
-ia64_pal_mc_error_info (u64 info_index, u64 type_index, u64 *size, u64 *error_info)
-{
-	struct ia64_pal_retval iprv;
-	PAL_CALL(iprv, PAL_MC_ERROR_INFO, info_index, type_index, 0);
-	if (size)
-		*size = iprv.v0;
-	if (error_info)
-		*error_info = iprv.v1;
-	return iprv.status;
-}
-
-/* Injects the requested processor error or returns info on
- * supported injection capabilities for current processor implementation
- */
-static inline s64
-ia64_pal_mc_error_inject_phys (u64 err_type_info, u64 err_struct_info,
-			u64 err_data_buffer, u64 *capabilities, u64 *resources)
-{
-	struct ia64_pal_retval iprv;
-	PAL_CALL_PHYS_STK(iprv, PAL_MC_ERROR_INJECT, err_type_info,
-			  err_struct_info, err_data_buffer);
-	if (capabilities)
-		*capabilities= iprv.v0;
-	if (resources)
-		*resources= iprv.v1;
-	return iprv.status;
-}
-
-static inline s64
-ia64_pal_mc_error_inject_virt (u64 err_type_info, u64 err_struct_info,
-			u64 err_data_buffer, u64 *capabilities, u64 *resources)
-{
-	struct ia64_pal_retval iprv;
-	PAL_CALL_STK(iprv, PAL_MC_ERROR_INJECT, err_type_info,
-			  err_struct_info, err_data_buffer);
-	if (capabilities)
-		*capabilities= iprv.v0;
-	if (resources)
-		*resources= iprv.v1;
-	return iprv.status;
-}
-
-/* Inform PALE_CHECK whether a machine check is expected so that PALE_CHECK willnot
- * attempt to correct any expected machine checks.
- */
-static inline s64
-ia64_pal_mc_expected (u64 expected, u64 *previous)
-{
-	struct ia64_pal_retval iprv;
-	PAL_CALL(iprv, PAL_MC_EXPECTED, expected, 0, 0);
-	if (previous)
-		*previous = iprv.v0;
-	return iprv.status;
-}
-
-typedef union pal_hw_tracking_u {
-	u64			pht_data;
-	struct {
-		u64		itc	:4,	/* Instruction cache tracking */
-				dct	:4,	/* Date cache tracking */
-				itt	:4,	/* Instruction TLB tracking */
-				ddt	:4,	/* Data TLB tracking */
-				reserved:48;
-	} pal_hw_tracking_s;
-} pal_hw_tracking_u_t;
-
-/*
- * Hardware tracking status.
- */
-static inline s64
-ia64_pal_mc_hw_tracking (u64 *status)
-{
-	struct ia64_pal_retval iprv;
-	PAL_CALL(iprv, PAL_MC_HW_TRACKING, 0, 0, 0);
-	if (status)
-		*status = iprv.v0;
-	return iprv.status;
-}
-
-/* Register a platform dependent location with PAL to which it can save
- * minimal processor state in the event of a machine check or initialization
- * event.
- */
-static inline s64
-ia64_pal_mc_register_mem (u64 physical_addr, u64 size, u64 *req_size)
-{
-	struct ia64_pal_retval iprv;
-	PAL_CALL(iprv, PAL_MC_REGISTER_MEM, physical_addr, size, 0);
-	if (req_size)
-		*req_size = iprv.v0;
-	return iprv.status;
-}
-
-/* Restore minimal architectural processor state, set CMC interrupt if necessary
- * and resume execution
- */
-static inline s64
-ia64_pal_mc_resume (u64 set_cmci, u64 save_ptr)
-{
-	struct ia64_pal_retval iprv;
-	PAL_CALL(iprv, PAL_MC_RESUME, set_cmci, save_ptr, 0);
-	return iprv.status;
-}
-
-/* Return the memory attributes implemented by the processor */
-static inline s64
-ia64_pal_mem_attrib (u64 *mem_attrib)
-{
-	struct ia64_pal_retval iprv;
-	PAL_CALL(iprv, PAL_MEM_ATTRIB, 0, 0, 0);
-	if (mem_attrib)
-		*mem_attrib = iprv.v0 & 0xff;
-	return iprv.status;
-}
-
-/* Return the amount of memory needed for second phase of processor
- * self-test and the required alignment of memory.
- */
-static inline s64
-ia64_pal_mem_for_test (u64 *bytes_needed, u64 *alignment)
-{
-	struct ia64_pal_retval iprv;
-	PAL_CALL(iprv, PAL_MEM_FOR_TEST, 0, 0, 0);
-	if (bytes_needed)
-		*bytes_needed = iprv.v0;
-	if (alignment)
-		*alignment = iprv.v1;
-	return iprv.status;
-}
-
-typedef union pal_perf_mon_info_u {
-	u64			  ppmi_data;
-	struct {
-	       u64		generic		: 8,
-				width		: 8,
-				cycles		: 8,
-				retired		: 8,
-				reserved	: 32;
-	} pal_perf_mon_info_s;
-} pal_perf_mon_info_u_t;
-
-/* Return the performance monitor information about what can be counted
- * and how to configure the monitors to count the desired events.
- */
-static inline s64
-ia64_pal_perf_mon_info (u64 *pm_buffer, pal_perf_mon_info_u_t *pm_info)
-{
-	struct ia64_pal_retval iprv;
-	PAL_CALL(iprv, PAL_PERF_MON_INFO, (unsigned long) pm_buffer, 0, 0);
-	if (pm_info)
-		pm_info->ppmi_data = iprv.v0;
-	return iprv.status;
-}
-
-/* Specifies the physical address of the processor interrupt block
- * and I/O port space.
- */
-static inline s64
-ia64_pal_platform_addr (u64 type, u64 physical_addr)
-{
-	struct ia64_pal_retval iprv;
-	PAL_CALL(iprv, PAL_PLATFORM_ADDR, type, physical_addr, 0);
-	return iprv.status;
-}
-
-/* Set the SAL PMI entrypoint in memory */
-static inline s64
-ia64_pal_pmi_entrypoint (u64 sal_pmi_entry_addr)
-{
-	struct ia64_pal_retval iprv;
-	PAL_CALL(iprv, PAL_PMI_ENTRYPOINT, sal_pmi_entry_addr, 0, 0);
-	return iprv.status;
-}
-
-struct pal_features_s;
-/* Provide information about configurable processor features */
-static inline s64
-ia64_pal_proc_get_features (u64 *features_avail,
-			    u64 *features_status,
-			    u64 *features_control,
-			    u64 features_set)
-{
-	struct ia64_pal_retval iprv;
-	PAL_CALL_PHYS(iprv, PAL_PROC_GET_FEATURES, 0, features_set, 0);
-	if (iprv.status == 0) {
-		*features_avail   = iprv.v0;
-		*features_status  = iprv.v1;
-		*features_control = iprv.v2;
-	}
-	return iprv.status;
-}
-
-/* Enable/disable processor dependent features */
-static inline s64
-ia64_pal_proc_set_features (u64 feature_select)
-{
-	struct ia64_pal_retval iprv;
-	PAL_CALL_PHYS(iprv, PAL_PROC_SET_FEATURES, feature_select, 0, 0);
-	return iprv.status;
-}
-
-/*
- * Put everything in a struct so we avoid the global offset table whenever
- * possible.
- */
-typedef struct ia64_ptce_info_s {
-	unsigned long	base;
-	u32		count[2];
-	u32		stride[2];
-} ia64_ptce_info_t;
-
-/* Return the information required for the architected loop used to purge
- * (initialize) the entire TC
- */
-static inline s64
-ia64_get_ptce (ia64_ptce_info_t *ptce)
-{
-	struct ia64_pal_retval iprv;
-
-	if (!ptce)
-		return -1;
-
-	PAL_CALL(iprv, PAL_PTCE_INFO, 0, 0, 0);
-	if (iprv.status == 0) {
-		ptce->base = iprv.v0;
-		ptce->count[0] = iprv.v1 >> 32;
-		ptce->count[1] = iprv.v1 & 0xffffffff;
-		ptce->stride[0] = iprv.v2 >> 32;
-		ptce->stride[1] = iprv.v2 & 0xffffffff;
-	}
-	return iprv.status;
-}
-
-/* Return info about implemented application and control registers. */
-static inline s64
-ia64_pal_register_info (u64 info_request, u64 *reg_info_1, u64 *reg_info_2)
-{
-	struct ia64_pal_retval iprv;
-	PAL_CALL(iprv, PAL_REGISTER_INFO, info_request, 0, 0);
-	if (reg_info_1)
-		*reg_info_1 = iprv.v0;
-	if (reg_info_2)
-		*reg_info_2 = iprv.v1;
-	return iprv.status;
-}
-
-typedef union pal_hints_u {
-	unsigned long		ph_data;
-	struct {
-	       unsigned long	si		: 1,
-				li		: 1,
-				reserved	: 62;
-	} pal_hints_s;
-} pal_hints_u_t;
-
-/* Return information about the register stack and RSE for this processor
- * implementation.
- */
-static inline long ia64_pal_rse_info(unsigned long *num_phys_stacked,
-							pal_hints_u_t *hints)
-{
-	struct ia64_pal_retval iprv;
-	PAL_CALL(iprv, PAL_RSE_INFO, 0, 0, 0);
-	if (num_phys_stacked)
-		*num_phys_stacked = iprv.v0;
-	if (hints)
-		hints->ph_data = iprv.v1;
-	return iprv.status;
-}
-
-/*
- * Set the current hardware resource sharing policy of the processor
- */
-static inline s64
-ia64_pal_set_hw_policy (u64 policy)
-{
-	struct ia64_pal_retval iprv;
-	PAL_CALL(iprv, PAL_SET_HW_POLICY, policy, 0, 0);
-	return iprv.status;
-}
-
-/* Cause the processor to enter	SHUTDOWN state, where prefetching and execution are
- * suspended, but cause cache and TLB coherency to be maintained.
- * This is usually called in IA-32 mode.
- */
-static inline s64
-ia64_pal_shutdown (void)
-{
-	struct ia64_pal_retval iprv;
-	PAL_CALL(iprv, PAL_SHUTDOWN, 0, 0, 0);
-	return iprv.status;
-}
-
-/* Perform the second phase of processor self-test. */
-static inline s64
-ia64_pal_test_proc (u64 test_addr, u64 test_size, u64 attributes, u64 *self_test_state)
-{
-	struct ia64_pal_retval iprv;
-	PAL_CALL(iprv, PAL_TEST_PROC, test_addr, test_size, attributes);
-	if (self_test_state)
-		*self_test_state = iprv.v0;
-	return iprv.status;
-}
-
-typedef union  pal_version_u {
-	u64	pal_version_val;
-	struct {
-		u64	pv_pal_b_rev		:	8;
-		u64	pv_pal_b_model		:	8;
-		u64	pv_reserved1		:	8;
-		u64	pv_pal_vendor		:	8;
-		u64	pv_pal_a_rev		:	8;
-		u64	pv_pal_a_model		:	8;
-		u64	pv_reserved2		:	16;
-	} pal_version_s;
-} pal_version_u_t;
-
-
-/*
- * Return PAL version information.  While the documentation states that
- * PAL_VERSION can be called in either physical or virtual mode, some
- * implementations only allow physical calls.  We don't call it very often,
- * so the overhead isn't worth eliminating.
- */
-static inline s64
-ia64_pal_version (pal_version_u_t *pal_min_version, pal_version_u_t *pal_cur_version)
-{
-	struct ia64_pal_retval iprv;
-	PAL_CALL_PHYS(iprv, PAL_VERSION, 0, 0, 0);
-	if (pal_min_version)
-		pal_min_version->pal_version_val = iprv.v0;
-
-	if (pal_cur_version)
-		pal_cur_version->pal_version_val = iprv.v1;
-
-	return iprv.status;
-}
-
-typedef union pal_tc_info_u {
-	u64			pti_val;
-	struct {
-	       u64		num_sets	:	8,
-				associativity	:	8,
-				num_entries	:	16,
-				pf		:	1,
-				unified		:	1,
-				reduce_tr	:	1,
-				reserved	:	29;
-	} pal_tc_info_s;
-} pal_tc_info_u_t;
-
-#define tc_reduce_tr		pal_tc_info_s.reduce_tr
-#define tc_unified		pal_tc_info_s.unified
-#define tc_pf			pal_tc_info_s.pf
-#define tc_num_entries		pal_tc_info_s.num_entries
-#define tc_associativity	pal_tc_info_s.associativity
-#define tc_num_sets		pal_tc_info_s.num_sets
-
-
-/* Return information about the virtual memory characteristics of the processor
- * implementation.
- */
-static inline s64
-ia64_pal_vm_info (u64 tc_level, u64 tc_type,  pal_tc_info_u_t *tc_info, u64 *tc_pages)
-{
-	struct ia64_pal_retval iprv;
-	PAL_CALL(iprv, PAL_VM_INFO, tc_level, tc_type, 0);
-	if (tc_info)
-		tc_info->pti_val = iprv.v0;
-	if (tc_pages)
-		*tc_pages = iprv.v1;
-	return iprv.status;
-}
-
-/* Get page size information about the virtual memory characteristics of the processor
- * implementation.
- */
-static inline s64 ia64_pal_vm_page_size(u64 *tr_pages, u64 *vw_pages)
-{
-	struct ia64_pal_retval iprv;
-	PAL_CALL(iprv, PAL_VM_PAGE_SIZE, 0, 0, 0);
-	if (tr_pages)
-		*tr_pages = iprv.v0;
-	if (vw_pages)
-		*vw_pages = iprv.v1;
-	return iprv.status;
-}
-
-typedef union pal_vm_info_1_u {
-	u64			pvi1_val;
-	struct {
-		u64		vw		: 1,
-				phys_add_size	: 7,
-				key_size	: 8,
-				max_pkr		: 8,
-				hash_tag_id	: 8,
-				max_dtr_entry	: 8,
-				max_itr_entry	: 8,
-				max_unique_tcs	: 8,
-				num_tc_levels	: 8;
-	} pal_vm_info_1_s;
-} pal_vm_info_1_u_t;
-
-#define PAL_MAX_PURGES		0xFFFF		/* all ones is means unlimited */
-
-typedef union pal_vm_info_2_u {
-	u64			pvi2_val;
-	struct {
-		u64		impl_va_msb	: 8,
-				rid_size	: 8,
-				max_purges	: 16,
-				reserved	: 32;
-	} pal_vm_info_2_s;
-} pal_vm_info_2_u_t;
-
-/* Get summary information about the virtual memory characteristics of the processor
- * implementation.
- */
-static inline s64
-ia64_pal_vm_summary (pal_vm_info_1_u_t *vm_info_1, pal_vm_info_2_u_t *vm_info_2)
-{
-	struct ia64_pal_retval iprv;
-	PAL_CALL(iprv, PAL_VM_SUMMARY, 0, 0, 0);
-	if (vm_info_1)
-		vm_info_1->pvi1_val = iprv.v0;
-	if (vm_info_2)
-		vm_info_2->pvi2_val = iprv.v1;
-	return iprv.status;
-}
-
-typedef union pal_vp_info_u {
-	u64			pvi_val;
-	struct {
-		u64		index:		48,	/* virtual feature set info */
-				vmm_id:		16;	/* feature set id */
-	} pal_vp_info_s;
-} pal_vp_info_u_t;
-
-/*
- * Returns information about virtual processor features
- */
-static inline s64
-ia64_pal_vp_info (u64 feature_set, u64 vp_buffer, u64 *vp_info, u64 *vmm_id)
-{
-	struct ia64_pal_retval iprv;
-	PAL_CALL(iprv, PAL_VP_INFO, feature_set, vp_buffer, 0);
-	if (vp_info)
-		*vp_info = iprv.v0;
-	if (vmm_id)
-		*vmm_id = iprv.v1;
-	return iprv.status;
-}
-
-typedef union pal_itr_valid_u {
-	u64			piv_val;
-	struct {
-	       u64		access_rights_valid	: 1,
-				priv_level_valid	: 1,
-				dirty_bit_valid		: 1,
-				mem_attr_valid		: 1,
-				reserved		: 60;
-	} pal_tr_valid_s;
-} pal_tr_valid_u_t;
-
-/* Read a translation register */
-static inline s64
-ia64_pal_tr_read (u64 reg_num, u64 tr_type, u64 *tr_buffer, pal_tr_valid_u_t *tr_valid)
-{
-	struct ia64_pal_retval iprv;
-	PAL_CALL_PHYS_STK(iprv, PAL_VM_TR_READ, reg_num, tr_type,(u64)ia64_tpa(tr_buffer));
-	if (tr_valid)
-		tr_valid->piv_val = iprv.v0;
-	return iprv.status;
-}
-
-/*
- * PAL_PREFETCH_VISIBILITY transaction types
- */
-#define PAL_VISIBILITY_VIRTUAL		0
-#define PAL_VISIBILITY_PHYSICAL		1
-
-/*
- * PAL_PREFETCH_VISIBILITY return codes
- */
-#define PAL_VISIBILITY_OK		1
-#define PAL_VISIBILITY_OK_REMOTE_NEEDED	0
-#define PAL_VISIBILITY_INVAL_ARG	-2
-#define PAL_VISIBILITY_ERROR		-3
-
-static inline s64
-ia64_pal_prefetch_visibility (s64 trans_type)
-{
-	struct ia64_pal_retval iprv;
-	PAL_CALL(iprv, PAL_PREFETCH_VISIBILITY, trans_type, 0, 0);
-	return iprv.status;
-}
-
-/* data structure for getting information on logical to physical mappings */
-typedef union pal_log_overview_u {
-	struct {
-		u64	num_log		:16,	/* Total number of logical
-						 * processors on this die
-						 */
-			tpc		:8,	/* Threads per core */
-			reserved3	:8,	/* Reserved */
-			cpp		:8,	/* Cores per processor */
-			reserved2	:8,	/* Reserved */
-			ppid		:8,	/* Physical processor ID */
-			reserved1	:8;	/* Reserved */
-	} overview_bits;
-	u64 overview_data;
-} pal_log_overview_t;
-
-typedef union pal_proc_n_log_info1_u{
-	struct {
-		u64	tid		:16,	/* Thread id */
-			reserved2	:16,	/* Reserved */
-			cid		:16,	/* Core id */
-			reserved1	:16;	/* Reserved */
-	} ppli1_bits;
-	u64	ppli1_data;
-} pal_proc_n_log_info1_t;
-
-typedef union pal_proc_n_log_info2_u {
-	struct {
-		u64	la		:16,	/* Logical address */
-			reserved	:48;	/* Reserved */
-	} ppli2_bits;
-	u64	ppli2_data;
-} pal_proc_n_log_info2_t;
-
-typedef struct pal_logical_to_physical_s
-{
-	pal_log_overview_t overview;
-	pal_proc_n_log_info1_t ppli1;
-	pal_proc_n_log_info2_t ppli2;
-} pal_logical_to_physical_t;
-
-#define overview_num_log	overview.overview_bits.num_log
-#define overview_tpc		overview.overview_bits.tpc
-#define overview_cpp		overview.overview_bits.cpp
-#define overview_ppid		overview.overview_bits.ppid
-#define log1_tid		ppli1.ppli1_bits.tid
-#define log1_cid		ppli1.ppli1_bits.cid
-#define log2_la			ppli2.ppli2_bits.la
-
-/* Get information on logical to physical processor mappings. */
-static inline s64
-ia64_pal_logical_to_phys(u64 proc_number, pal_logical_to_physical_t *mapping)
-{
-	struct ia64_pal_retval iprv;
-
-	PAL_CALL(iprv, PAL_LOGICAL_TO_PHYSICAL, proc_number, 0, 0);
-
-	if (iprv.status == PAL_STATUS_SUCCESS)
-	{
-		mapping->overview.overview_data = iprv.v0;
-		mapping->ppli1.ppli1_data = iprv.v1;
-		mapping->ppli2.ppli2_data = iprv.v2;
-	}
-
-	return iprv.status;
-}
-
-typedef struct pal_cache_shared_info_s
-{
-	u64 num_shared;
-	pal_proc_n_log_info1_t ppli1;
-	pal_proc_n_log_info2_t ppli2;
-} pal_cache_shared_info_t;
-
-/* Get information on logical to physical processor mappings. */
-static inline s64
-ia64_pal_cache_shared_info(u64 level,
-		u64 type,
-		u64 proc_number,
-		pal_cache_shared_info_t *info)
-{
-	struct ia64_pal_retval iprv;
-
-	PAL_CALL(iprv, PAL_CACHE_SHARED_INFO, level, type, proc_number);
-
-	if (iprv.status == PAL_STATUS_SUCCESS) {
-		info->num_shared = iprv.v0;
-		info->ppli1.ppli1_data = iprv.v1;
-		info->ppli2.ppli2_data = iprv.v2;
-	}
-
-	return iprv.status;
-}
-#endif /* __ASSEMBLY__ */
-
-#endif /* _ASM_IA64_PAL_H */
diff --git a/arch/ia64/include/asm/param.h b/arch/ia64/include/asm/param.h
deleted file mode 100644
index f0b786227c40..000000000000
--- a/arch/ia64/include/asm/param.h
+++ /dev/null
@@ -1,18 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-/*
- * Fundamental kernel parameters.
- *
- * Based on <asm-i386/param.h>.
- *
- * Modified 1998, 1999, 2002-2003
- *	David Mosberger-Tang <davidm@hpl.hp.com>, Hewlett-Packard Co
- */
-#ifndef _ASM_IA64_PARAM_H
-#define _ASM_IA64_PARAM_H
-
-#include <uapi/asm/param.h>
-
-# define HZ		CONFIG_HZ
-# define USER_HZ	HZ
-# define CLOCKS_PER_SEC	HZ	/* frequency at which times() counts */
-#endif /* _ASM_IA64_PARAM_H */
diff --git a/arch/ia64/include/asm/parport.h b/arch/ia64/include/asm/parport.h
deleted file mode 100644
index 360ca9bf2f6f..000000000000
--- a/arch/ia64/include/asm/parport.h
+++ /dev/null
@@ -1,20 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-/*
- * parport.h: platform-specific PC-style parport initialisation
- *
- * Copyright (C) 1999, 2000  Tim Waugh <tim@cyberelk.demon.co.uk>
- *
- * This file should only be included by drivers/parport/parport_pc.c.
- */
-
-#ifndef _ASM_IA64_PARPORT_H
-#define _ASM_IA64_PARPORT_H 1
-
-static int parport_pc_find_isa_ports(int autoirq, int autodma);
-
-static int parport_pc_find_nonpci_ports(int autoirq, int autodma)
-{
-	return parport_pc_find_isa_ports(autoirq, autodma);
-}
-
-#endif /* _ASM_IA64_PARPORT_H */
diff --git a/arch/ia64/include/asm/patch.h b/arch/ia64/include/asm/patch.h
deleted file mode 100644
index bd487ed22bf5..000000000000
--- a/arch/ia64/include/asm/patch.h
+++ /dev/null
@@ -1,28 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _ASM_IA64_PATCH_H
-#define _ASM_IA64_PATCH_H
-
-/*
- * Copyright (C) 2003 Hewlett-Packard Co
- *	David Mosberger-Tang <davidm@hpl.hp.com>
- *
- * There are a number of reasons for patching instructions.  Rather than duplicating code
- * all over the place, we put the common stuff here.  Reasons for patching: in-kernel
- * module-loader, virtual-to-physical patch-list, McKinley Errata 9 workaround, and gate
- * shared library.  Undoubtedly, some of these reasons will disappear and others will
- * be added over time.
- */
-#include <linux/elf.h>
-#include <linux/types.h>
-
-extern void ia64_patch (u64 insn_addr, u64 mask, u64 val);	/* patch any insn slot */
-extern void ia64_patch_imm64 (u64 insn_addr, u64 val);		/* patch "movl" w/abs. value*/
-extern void ia64_patch_imm60 (u64 insn_addr, u64 val);		/* patch "brl" w/ip-rel value */
-
-extern void ia64_patch_mckinley_e9 (unsigned long start, unsigned long end);
-extern void ia64_patch_vtop (unsigned long start, unsigned long end);
-extern void ia64_patch_phys_stack_reg(unsigned long val);
-extern void ia64_patch_rse (unsigned long start, unsigned long end);
-extern void ia64_patch_gate (void);
-
-#endif /* _ASM_IA64_PATCH_H */
diff --git a/arch/ia64/include/asm/pci.h b/arch/ia64/include/asm/pci.h
deleted file mode 100644
index fa8f545c24c9..000000000000
--- a/arch/ia64/include/asm/pci.h
+++ /dev/null
@@ -1,66 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _ASM_IA64_PCI_H
-#define _ASM_IA64_PCI_H
-
-#include <linux/mm.h>
-#include <linux/slab.h>
-#include <linux/spinlock.h>
-#include <linux/string.h>
-#include <linux/types.h>
-#include <linux/scatterlist.h>
-
-#include <asm/io.h>
-#include <asm/hw_irq.h>
-
-struct pci_vector_struct {
-	__u16 segment;	/* PCI Segment number */
-	__u16 bus;	/* PCI Bus number */
-	__u32 pci_id;	/* ACPI split 16 bits device, 16 bits function (see section 6.1.1) */
-	__u8 pin;	/* PCI PIN (0 = A, 1 = B, 2 = C, 3 = D) */
-	__u32 irq;	/* IRQ assigned */
-};
-
-/*
- * Can be used to override the logic in pci_scan_bus for skipping already-configured bus
- * numbers - to be used for buggy BIOSes or architectures with incomplete PCI setup by the
- * loader.
- */
-#define pcibios_assign_all_busses()     0
-
-#define PCIBIOS_MIN_IO		0x1000
-#define PCIBIOS_MIN_MEM		0x10000000
-
-#define HAVE_PCI_MMAP
-#define ARCH_GENERIC_PCI_MMAP_RESOURCE
-#define arch_can_pci_mmap_wc()	1
-
-#define HAVE_PCI_LEGACY
-extern int pci_mmap_legacy_page_range(struct pci_bus *bus,
-				      struct vm_area_struct *vma,
-				      enum pci_mmap_state mmap_state);
-
-char *pci_get_legacy_mem(struct pci_bus *bus);
-int pci_legacy_read(struct pci_bus *bus, u16 port, u32 *val, u8 size);
-int pci_legacy_write(struct pci_bus *bus, u16 port, u32 val, u8 size);
-
-struct pci_controller {
-	struct acpi_device *companion;
-	void *iommu;
-	int segment;
-	int node;		/* nearest node with memory or NUMA_NO_NODE for global allocation */
-
-	void *platform_data;
-};
-
-
-#define PCI_CONTROLLER(busdev) ((struct pci_controller *) busdev->sysdata)
-#define pci_domain_nr(busdev)    (PCI_CONTROLLER(busdev)->segment)
-
-extern struct pci_ops pci_root_ops;
-
-static inline int pci_proc_domain(struct pci_bus *bus)
-{
-	return (pci_domain_nr(bus) != 0);
-}
-
-#endif /* _ASM_IA64_PCI_H */
diff --git a/arch/ia64/include/asm/percpu.h b/arch/ia64/include/asm/percpu.h
deleted file mode 100644
index f357b9bb3576..000000000000
--- a/arch/ia64/include/asm/percpu.h
+++ /dev/null
@@ -1,53 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _ASM_IA64_PERCPU_H
-#define _ASM_IA64_PERCPU_H
-
-/*
- * Copyright (C) 2002-2003 Hewlett-Packard Co
- *	David Mosberger-Tang <davidm@hpl.hp.com>
- */
-
-#ifdef __ASSEMBLY__
-# define THIS_CPU(var)	(var)  /* use this to mark accesses to per-CPU variables... */
-#else /* !__ASSEMBLY__ */
-
-
-#include <linux/threads.h>
-
-#ifdef CONFIG_SMP
-
-#ifdef HAVE_MODEL_SMALL_ATTRIBUTE
-# define PER_CPU_ATTRIBUTES	__attribute__((__model__ (__small__)))
-#endif
-
-#define __my_cpu_offset	__ia64_per_cpu_var(local_per_cpu_offset)
-
-extern void *per_cpu_init(void);
-
-#else /* ! SMP */
-
-#define per_cpu_init()				(__phys_per_cpu_start)
-
-#endif	/* SMP */
-
-#define PER_CPU_BASE_SECTION ".data..percpu"
-
-/*
- * Be extremely careful when taking the address of this variable!  Due to virtual
- * remapping, it is different from the canonical address returned by this_cpu_ptr(&var)!
- * On the positive side, using __ia64_per_cpu_var() instead of this_cpu_ptr() is slightly
- * more efficient.
- */
-#define __ia64_per_cpu_var(var) (*({					\
-	__verify_pcpu_ptr(&(var));					\
-	((typeof(var) __kernel __force *)&(var));			\
-}))
-
-#include <asm-generic/percpu.h>
-
-/* Equal to __per_cpu_offset[smp_processor_id()], but faster to access: */
-DECLARE_PER_CPU(unsigned long, local_per_cpu_offset);
-
-#endif /* !__ASSEMBLY__ */
-
-#endif /* _ASM_IA64_PERCPU_H */
diff --git a/arch/ia64/include/asm/pgalloc.h b/arch/ia64/include/asm/pgalloc.h
deleted file mode 100644
index 0fb2b6291d58..000000000000
--- a/arch/ia64/include/asm/pgalloc.h
+++ /dev/null
@@ -1,64 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _ASM_IA64_PGALLOC_H
-#define _ASM_IA64_PGALLOC_H
-
-/*
- * This file contains the functions and defines necessary to allocate
- * page tables.
- *
- * This hopefully works with any (fixed) ia-64 page-size, as defined
- * in <asm/page.h> (currently 8192).
- *
- * Copyright (C) 1998-2001 Hewlett-Packard Co
- *	David Mosberger-Tang <davidm@hpl.hp.com>
- * Copyright (C) 2000, Goutham Rao <goutham.rao@intel.com>
- */
-
-
-#include <linux/compiler.h>
-#include <linux/mm.h>
-#include <linux/page-flags.h>
-#include <linux/threads.h>
-
-#include <asm-generic/pgalloc.h>
-
-#include <asm/mmu_context.h>
-
-static inline pgd_t *pgd_alloc(struct mm_struct *mm)
-{
-	return (pgd_t *)__get_free_page(GFP_KERNEL | __GFP_ZERO);
-}
-
-#if CONFIG_PGTABLE_LEVELS == 4
-static inline void
-p4d_populate(struct mm_struct *mm, p4d_t * p4d_entry, pud_t * pud)
-{
-	p4d_val(*p4d_entry) = __pa(pud);
-}
-
-#define __pud_free_tlb(tlb, pud, address)	pud_free((tlb)->mm, pud)
-#endif /* CONFIG_PGTABLE_LEVELS == 4 */
-
-static inline void
-pud_populate(struct mm_struct *mm, pud_t * pud_entry, pmd_t * pmd)
-{
-	pud_val(*pud_entry) = __pa(pmd);
-}
-
-#define __pmd_free_tlb(tlb, pmd, address)	pmd_free((tlb)->mm, pmd)
-
-static inline void
-pmd_populate(struct mm_struct *mm, pmd_t * pmd_entry, pgtable_t pte)
-{
-	pmd_val(*pmd_entry) = page_to_phys(pte);
-}
-
-static inline void
-pmd_populate_kernel(struct mm_struct *mm, pmd_t * pmd_entry, pte_t * pte)
-{
-	pmd_val(*pmd_entry) = __pa(pte);
-}
-
-#define __pte_free_tlb(tlb, pte, address)	pte_free((tlb)->mm, pte)
-
-#endif				/* _ASM_IA64_PGALLOC_H */
diff --git a/arch/ia64/include/asm/pgtable.h b/arch/ia64/include/asm/pgtable.h
deleted file mode 100644
index 9be2d2ba6016..000000000000
--- a/arch/ia64/include/asm/pgtable.h
+++ /dev/null
@@ -1,545 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _ASM_IA64_PGTABLE_H
-#define _ASM_IA64_PGTABLE_H
-
-/*
- * This file contains the functions and defines necessary to modify and use
- * the IA-64 page table tree.
- *
- * This hopefully works with any (fixed) IA-64 page-size, as defined
- * in <asm/page.h>.
- *
- * Copyright (C) 1998-2005 Hewlett-Packard Co
- *	David Mosberger-Tang <davidm@hpl.hp.com>
- */
-
-
-#include <asm/mman.h>
-#include <asm/page.h>
-#include <asm/processor.h>
-#include <asm/types.h>
-
-#define IA64_MAX_PHYS_BITS	50	/* max. number of physical address bits (architected) */
-
-/*
- * First, define the various bits in a PTE.  Note that the PTE format
- * matches the VHPT short format, the firt doubleword of the VHPD long
- * format, and the first doubleword of the TLB insertion format.
- */
-#define _PAGE_P_BIT		0
-#define _PAGE_A_BIT		5
-#define _PAGE_D_BIT		6
-
-#define _PAGE_P			(1 << _PAGE_P_BIT)	/* page present bit */
-#define _PAGE_MA_WB		(0x0 <<  2)	/* write back memory attribute */
-#define _PAGE_MA_UC		(0x4 <<  2)	/* uncacheable memory attribute */
-#define _PAGE_MA_UCE		(0x5 <<  2)	/* UC exported attribute */
-#define _PAGE_MA_WC		(0x6 <<  2)	/* write coalescing memory attribute */
-#define _PAGE_MA_NAT		(0x7 <<  2)	/* not-a-thing attribute */
-#define _PAGE_MA_MASK		(0x7 <<  2)
-#define _PAGE_PL_0		(0 <<  7)	/* privilege level 0 (kernel) */
-#define _PAGE_PL_1		(1 <<  7)	/* privilege level 1 (unused) */
-#define _PAGE_PL_2		(2 <<  7)	/* privilege level 2 (unused) */
-#define _PAGE_PL_3		(3 <<  7)	/* privilege level 3 (user) */
-#define _PAGE_PL_MASK		(3 <<  7)
-#define _PAGE_AR_R		(0 <<  9)	/* read only */
-#define _PAGE_AR_RX		(1 <<  9)	/* read & execute */
-#define _PAGE_AR_RW		(2 <<  9)	/* read & write */
-#define _PAGE_AR_RWX		(3 <<  9)	/* read, write & execute */
-#define _PAGE_AR_R_RW		(4 <<  9)	/* read / read & write */
-#define _PAGE_AR_RX_RWX		(5 <<  9)	/* read & exec / read, write & exec */
-#define _PAGE_AR_RWX_RW		(6 <<  9)	/* read, write & exec / read & write */
-#define _PAGE_AR_X_RX		(7 <<  9)	/* exec & promote / read & exec */
-#define _PAGE_AR_MASK		(7 <<  9)
-#define _PAGE_AR_SHIFT		9
-#define _PAGE_A			(1 << _PAGE_A_BIT)	/* page accessed bit */
-#define _PAGE_D			(1 << _PAGE_D_BIT)	/* page dirty bit */
-#define _PAGE_PPN_MASK		(((__IA64_UL(1) << IA64_MAX_PHYS_BITS) - 1) & ~0xfffUL)
-#define _PAGE_ED		(__IA64_UL(1) << 52)	/* exception deferral */
-#define _PAGE_PROTNONE		(__IA64_UL(1) << 63)
-
-/* We borrow bit 7 to store the exclusive marker in swap PTEs. */
-#define _PAGE_SWP_EXCLUSIVE	(1 << 7)
-
-#define _PFN_MASK		_PAGE_PPN_MASK
-/* Mask of bits which may be changed by pte_modify(); the odd bits are there for _PAGE_PROTNONE */
-#define _PAGE_CHG_MASK	(_PAGE_P | _PAGE_PROTNONE | _PAGE_PL_MASK | _PAGE_AR_MASK | _PAGE_ED)
-
-#define _PAGE_SIZE_4K	12
-#define _PAGE_SIZE_8K	13
-#define _PAGE_SIZE_16K	14
-#define _PAGE_SIZE_64K	16
-#define _PAGE_SIZE_256K	18
-#define _PAGE_SIZE_1M	20
-#define _PAGE_SIZE_4M	22
-#define _PAGE_SIZE_16M	24
-#define _PAGE_SIZE_64M	26
-#define _PAGE_SIZE_256M	28
-#define _PAGE_SIZE_1G	30
-#define _PAGE_SIZE_4G	32
-
-#define __ACCESS_BITS		_PAGE_ED | _PAGE_A | _PAGE_P | _PAGE_MA_WB
-#define __DIRTY_BITS_NO_ED	_PAGE_A | _PAGE_P | _PAGE_D | _PAGE_MA_WB
-#define __DIRTY_BITS		_PAGE_ED | __DIRTY_BITS_NO_ED
-
-/*
- * How many pointers will a page table level hold expressed in shift
- */
-#define PTRS_PER_PTD_SHIFT	(PAGE_SHIFT-3)
-
-/*
- * Definitions for fourth level:
- */
-#define PTRS_PER_PTE	(__IA64_UL(1) << (PTRS_PER_PTD_SHIFT))
-
-/*
- * Definitions for third level:
- *
- * PMD_SHIFT determines the size of the area a third-level page table
- * can map.
- */
-#define PMD_SHIFT	(PAGE_SHIFT + (PTRS_PER_PTD_SHIFT))
-#define PMD_SIZE	(1UL << PMD_SHIFT)
-#define PMD_MASK	(~(PMD_SIZE-1))
-#define PTRS_PER_PMD	(1UL << (PTRS_PER_PTD_SHIFT))
-
-#if CONFIG_PGTABLE_LEVELS == 4
-/*
- * Definitions for second level:
- *
- * PUD_SHIFT determines the size of the area a second-level page table
- * can map.
- */
-#define PUD_SHIFT	(PMD_SHIFT + (PTRS_PER_PTD_SHIFT))
-#define PUD_SIZE	(1UL << PUD_SHIFT)
-#define PUD_MASK	(~(PUD_SIZE-1))
-#define PTRS_PER_PUD	(1UL << (PTRS_PER_PTD_SHIFT))
-#endif
-
-/*
- * Definitions for first level:
- *
- * PGDIR_SHIFT determines what a first-level page table entry can map.
- */
-#if CONFIG_PGTABLE_LEVELS == 4
-#define PGDIR_SHIFT		(PUD_SHIFT + (PTRS_PER_PTD_SHIFT))
-#else
-#define PGDIR_SHIFT		(PMD_SHIFT + (PTRS_PER_PTD_SHIFT))
-#endif
-#define PGDIR_SIZE		(__IA64_UL(1) << PGDIR_SHIFT)
-#define PGDIR_MASK		(~(PGDIR_SIZE-1))
-#define PTRS_PER_PGD_SHIFT	PTRS_PER_PTD_SHIFT
-#define PTRS_PER_PGD		(1UL << PTRS_PER_PGD_SHIFT)
-#define USER_PTRS_PER_PGD	(5*PTRS_PER_PGD/8)	/* regions 0-4 are user regions */
-
-/*
- * All the normal masks have the "page accessed" bits on, as any time
- * they are used, the page is accessed. They are cleared only by the
- * page-out routines.
- */
-#define PAGE_NONE	__pgprot(_PAGE_PROTNONE | _PAGE_A)
-#define PAGE_SHARED	__pgprot(__ACCESS_BITS | _PAGE_PL_3 | _PAGE_AR_RW)
-#define PAGE_READONLY	__pgprot(__ACCESS_BITS | _PAGE_PL_3 | _PAGE_AR_R)
-#define PAGE_COPY	__pgprot(__ACCESS_BITS | _PAGE_PL_3 | _PAGE_AR_R)
-#define PAGE_COPY_EXEC	__pgprot(__ACCESS_BITS | _PAGE_PL_3 | _PAGE_AR_RX)
-#define PAGE_GATE	__pgprot(__ACCESS_BITS | _PAGE_PL_0 | _PAGE_AR_X_RX)
-#define PAGE_KERNEL	__pgprot(__DIRTY_BITS  | _PAGE_PL_0 | _PAGE_AR_RWX)
-#define PAGE_KERNELRX	__pgprot(__ACCESS_BITS | _PAGE_PL_0 | _PAGE_AR_RX)
-#define PAGE_KERNEL_UC	__pgprot(__DIRTY_BITS  | _PAGE_PL_0 | _PAGE_AR_RWX | \
-				 _PAGE_MA_UC)
-
-# ifndef __ASSEMBLY__
-
-#include <linux/sched/mm.h>	/* for mm_struct */
-#include <linux/bitops.h>
-#include <asm/cacheflush.h>
-#include <asm/mmu_context.h>
-
-/*
- * Next come the mappings that determine how mmap() protection bits
- * (PROT_EXEC, PROT_READ, PROT_WRITE, PROT_NONE) get implemented.  The
- * _P version gets used for a private shared memory segment, the _S
- * version gets used for a shared memory segment with MAP_SHARED on.
- * In a private shared memory segment, we do a copy-on-write if a task
- * attempts to write to the page.
- */
-	/* xwr */
-#define pgd_ERROR(e)	printk("%s:%d: bad pgd %016lx.\n", __FILE__, __LINE__, pgd_val(e))
-#if CONFIG_PGTABLE_LEVELS == 4
-#define pud_ERROR(e)	printk("%s:%d: bad pud %016lx.\n", __FILE__, __LINE__, pud_val(e))
-#endif
-#define pmd_ERROR(e)	printk("%s:%d: bad pmd %016lx.\n", __FILE__, __LINE__, pmd_val(e))
-#define pte_ERROR(e)	printk("%s:%d: bad pte %016lx.\n", __FILE__, __LINE__, pte_val(e))
-
-
-/*
- * Some definitions to translate between mem_map, PTEs, and page addresses:
- */
-
-
-/* Quick test to see if ADDR is a (potentially) valid physical address. */
-static inline long
-ia64_phys_addr_valid (unsigned long addr)
-{
-	return (addr & (local_cpu_data->unimpl_pa_mask)) == 0;
-}
-
-/*
- * Now come the defines and routines to manage and access the three-level
- * page table.
- */
-
-
-#define VMALLOC_START		(RGN_BASE(RGN_GATE) + 0x200000000UL)
-#if defined(CONFIG_SPARSEMEM) && defined(CONFIG_SPARSEMEM_VMEMMAP)
-/* SPARSEMEM_VMEMMAP uses half of vmalloc... */
-# define VMALLOC_END		(RGN_BASE(RGN_GATE) + (1UL << (4*PAGE_SHIFT - 10)))
-# define vmemmap		((struct page *)VMALLOC_END)
-#else
-# define VMALLOC_END		(RGN_BASE(RGN_GATE) + (1UL << (4*PAGE_SHIFT - 9)))
-#endif
-
-/* fs/proc/kcore.c */
-#define	kc_vaddr_to_offset(v) ((v) - RGN_BASE(RGN_GATE))
-#define	kc_offset_to_vaddr(o) ((o) + RGN_BASE(RGN_GATE))
-
-#define RGN_MAP_SHIFT (PGDIR_SHIFT + PTRS_PER_PGD_SHIFT - 3)
-#define RGN_MAP_LIMIT	((1UL << RGN_MAP_SHIFT) - PAGE_SIZE)	/* per region addr limit */
-
-#define PFN_PTE_SHIFT	PAGE_SHIFT
-/*
- * Conversion functions: convert page frame number (pfn) and a protection value to a page
- * table entry (pte).
- */
-#define pfn_pte(pfn, pgprot) \
-({ pte_t __pte; pte_val(__pte) = ((pfn) << PAGE_SHIFT) | pgprot_val(pgprot); __pte; })
-
-/* Extract pfn from pte.  */
-#define pte_pfn(_pte)		((pte_val(_pte) & _PFN_MASK) >> PAGE_SHIFT)
-
-#define mk_pte(page, pgprot)	pfn_pte(page_to_pfn(page), (pgprot))
-
-/* This takes a physical page address that is used by the remapping functions */
-#define mk_pte_phys(physpage, pgprot) \
-({ pte_t __pte; pte_val(__pte) = physpage + pgprot_val(pgprot); __pte; })
-
-#define pte_modify(_pte, newprot) \
-	(__pte((pte_val(_pte) & ~_PAGE_CHG_MASK) | (pgprot_val(newprot) & _PAGE_CHG_MASK)))
-
-#define pte_none(pte) 			(!pte_val(pte))
-#define pte_present(pte)		(pte_val(pte) & (_PAGE_P | _PAGE_PROTNONE))
-#define pte_clear(mm,addr,pte)		(pte_val(*(pte)) = 0UL)
-/* pte_page() returns the "struct page *" corresponding to the PTE: */
-#define pte_page(pte)			virt_to_page(((pte_val(pte) & _PFN_MASK) + PAGE_OFFSET))
-
-#define pmd_none(pmd)			(!pmd_val(pmd))
-#define pmd_bad(pmd)			(!ia64_phys_addr_valid(pmd_val(pmd)))
-#define pmd_present(pmd)		(pmd_val(pmd) != 0UL)
-#define pmd_clear(pmdp)			(pmd_val(*(pmdp)) = 0UL)
-#define pmd_page_vaddr(pmd)		((unsigned long) __va(pmd_val(pmd) & _PFN_MASK))
-#define pmd_pfn(pmd)			((pmd_val(pmd) & _PFN_MASK) >> PAGE_SHIFT)
-#define pmd_page(pmd)			virt_to_page((pmd_val(pmd) + PAGE_OFFSET))
-
-#define pud_none(pud)			(!pud_val(pud))
-#define pud_bad(pud)			(!ia64_phys_addr_valid(pud_val(pud)))
-#define pud_present(pud)		(pud_val(pud) != 0UL)
-#define pud_clear(pudp)			(pud_val(*(pudp)) = 0UL)
-#define pud_pgtable(pud)		((pmd_t *) __va(pud_val(pud) & _PFN_MASK))
-#define pud_page(pud)			virt_to_page((pud_val(pud) + PAGE_OFFSET))
-
-#if CONFIG_PGTABLE_LEVELS == 4
-#define p4d_none(p4d)			(!p4d_val(p4d))
-#define p4d_bad(p4d)			(!ia64_phys_addr_valid(p4d_val(p4d)))
-#define p4d_present(p4d)		(p4d_val(p4d) != 0UL)
-#define p4d_clear(p4dp)			(p4d_val(*(p4dp)) = 0UL)
-#define p4d_pgtable(p4d)		((pud_t *) __va(p4d_val(p4d) & _PFN_MASK))
-#define p4d_page(p4d)			virt_to_page((p4d_val(p4d) + PAGE_OFFSET))
-#endif
-
-/*
- * The following have defined behavior only work if pte_present() is true.
- */
-#define pte_write(pte)	((unsigned) (((pte_val(pte) & _PAGE_AR_MASK) >> _PAGE_AR_SHIFT) - 2) <= 4)
-#define pte_exec(pte)		((pte_val(pte) & _PAGE_AR_RX) != 0)
-#define pte_dirty(pte)		((pte_val(pte) & _PAGE_D) != 0)
-#define pte_young(pte)		((pte_val(pte) & _PAGE_A) != 0)
-
-/*
- * Note: we convert AR_RWX to AR_RX and AR_RW to AR_R by clearing the 2nd bit in the
- * access rights:
- */
-#define pte_wrprotect(pte)	(__pte(pte_val(pte) & ~_PAGE_AR_RW))
-#define pte_mkwrite_novma(pte)	(__pte(pte_val(pte) | _PAGE_AR_RW))
-#define pte_mkold(pte)		(__pte(pte_val(pte) & ~_PAGE_A))
-#define pte_mkyoung(pte)	(__pte(pte_val(pte) | _PAGE_A))
-#define pte_mkclean(pte)	(__pte(pte_val(pte) & ~_PAGE_D))
-#define pte_mkdirty(pte)	(__pte(pte_val(pte) | _PAGE_D))
-#define pte_mkhuge(pte)		(__pte(pte_val(pte)))
-
-/*
- * Because ia64's Icache and Dcache is not coherent (on a cpu), we need to
- * sync icache and dcache when we insert *new* executable page.
- *  __ia64_sync_icache_dcache() check Pg_arch_1 bit and flush icache
- * if necessary.
- *
- *  set_pte() is also called by the kernel, but we can expect that the kernel
- *  flushes icache explicitly if necessary.
- */
-#define pte_present_exec_user(pte)\
-	((pte_val(pte) & (_PAGE_P | _PAGE_PL_MASK | _PAGE_AR_RX)) == \
-		(_PAGE_P | _PAGE_PL_3 | _PAGE_AR_RX))
-
-extern void __ia64_sync_icache_dcache(pte_t pteval);
-static inline void set_pte(pte_t *ptep, pte_t pteval)
-{
-	/* page is present && page is user  && page is executable
-	 * && (page swapin or new page or page migration
-	 *	|| copy_on_write with page copying.)
-	 */
-	if (pte_present_exec_user(pteval) &&
-	    (!pte_present(*ptep) ||
-		pte_pfn(*ptep) != pte_pfn(pteval)))
-		/* load_module() calles flush_icache_range() explicitly*/
-		__ia64_sync_icache_dcache(pteval);
-	*ptep = pteval;
-}
-
-/*
- * Make page protection values cacheable, uncacheable, or write-
- * combining.  Note that "protection" is really a misnomer here as the
- * protection value contains the memory attribute bits, dirty bits, and
- * various other bits as well.
- */
-#define pgprot_cacheable(prot)		__pgprot((pgprot_val(prot) & ~_PAGE_MA_MASK) | _PAGE_MA_WB)
-#define pgprot_noncached(prot)		__pgprot((pgprot_val(prot) & ~_PAGE_MA_MASK) | _PAGE_MA_UC)
-#define pgprot_writecombine(prot)	__pgprot((pgprot_val(prot) & ~_PAGE_MA_MASK) | _PAGE_MA_WC)
-
-struct file;
-extern pgprot_t phys_mem_access_prot(struct file *file, unsigned long pfn,
-				     unsigned long size, pgprot_t vma_prot);
-#define __HAVE_PHYS_MEM_ACCESS_PROT
-
-static inline unsigned long
-pgd_index (unsigned long address)
-{
-	unsigned long region = address >> 61;
-	unsigned long l1index = (address >> PGDIR_SHIFT) & ((PTRS_PER_PGD >> 3) - 1);
-
-	return (region << (PAGE_SHIFT - 6)) | l1index;
-}
-#define pgd_index pgd_index
-
-/*
- * In the kernel's mapped region we know everything is in region number 5, so
- * as an optimisation its PGD already points to the area for that region.
- * However, this also means that we cannot use pgd_index() and we must
- * never add the region here.
- */
-#define pgd_offset_k(addr) \
-	(init_mm.pgd + (((addr) >> PGDIR_SHIFT) & (PTRS_PER_PGD - 1)))
-
-/* Look up a pgd entry in the gate area.  On IA-64, the gate-area
-   resides in the kernel-mapped segment, hence we use pgd_offset_k()
-   here.  */
-#define pgd_offset_gate(mm, addr)	pgd_offset_k(addr)
-
-/* atomic versions of the some PTE manipulations: */
-
-static inline int
-ptep_test_and_clear_young (struct vm_area_struct *vma, unsigned long addr, pte_t *ptep)
-{
-#ifdef CONFIG_SMP
-	if (!pte_young(*ptep))
-		return 0;
-	return test_and_clear_bit(_PAGE_A_BIT, ptep);
-#else
-	pte_t pte = *ptep;
-	if (!pte_young(pte))
-		return 0;
-	set_pte_at(vma->vm_mm, addr, ptep, pte_mkold(pte));
-	return 1;
-#endif
-}
-
-static inline pte_t
-ptep_get_and_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
-{
-#ifdef CONFIG_SMP
-	return __pte(xchg((long *) ptep, 0));
-#else
-	pte_t pte = *ptep;
-	pte_clear(mm, addr, ptep);
-	return pte;
-#endif
-}
-
-static inline void
-ptep_set_wrprotect(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
-{
-#ifdef CONFIG_SMP
-	unsigned long new, old;
-
-	do {
-		old = pte_val(*ptep);
-		new = pte_val(pte_wrprotect(__pte (old)));
-	} while (cmpxchg((unsigned long *) ptep, old, new) != old);
-#else
-	pte_t old_pte = *ptep;
-	set_pte_at(mm, addr, ptep, pte_wrprotect(old_pte));
-#endif
-}
-
-static inline int
-pte_same (pte_t a, pte_t b)
-{
-	return pte_val(a) == pte_val(b);
-}
-
-#define update_mmu_cache_range(vmf, vma, address, ptep, nr) do { } while (0)
-#define update_mmu_cache(vma, address, ptep) do { } while (0)
-
-extern pgd_t swapper_pg_dir[PTRS_PER_PGD];
-extern void paging_init (void);
-
-/*
- * Encode/decode swap entries and swap PTEs. Swap PTEs are all PTEs that
- * are !pte_none() && !pte_present().
- *
- * Note: The macros below rely on the fact that MAX_SWAPFILES_SHIFT <= number of
- *	 bits in the swap-type field of the swap pte.  It would be nice to
- *	 enforce that, but we can't easily include <linux/swap.h> here.
- *	 (Of course, better still would be to define MAX_SWAPFILES_SHIFT here...).
- *
- * Format of swap pte:
- *	bit   0   : present bit (must be zero)
- *	bits  1- 6: swap type
- *	bit   7   : exclusive marker
- *	bits  8-62: swap offset
- *	bit  63   : _PAGE_PROTNONE bit
- */
-#define __swp_type(entry)		(((entry).val >> 1) & 0x3f)
-#define __swp_offset(entry)		(((entry).val << 1) >> 9)
-#define __swp_entry(type, offset)	((swp_entry_t) { ((type & 0x3f) << 1) | \
-							 ((long) (offset) << 8) })
-#define __pte_to_swp_entry(pte)		((swp_entry_t) { pte_val(pte) })
-#define __swp_entry_to_pte(x)		((pte_t) { (x).val })
-
-static inline int pte_swp_exclusive(pte_t pte)
-{
-	return pte_val(pte) & _PAGE_SWP_EXCLUSIVE;
-}
-
-static inline pte_t pte_swp_mkexclusive(pte_t pte)
-{
-	pte_val(pte) |= _PAGE_SWP_EXCLUSIVE;
-	return pte;
-}
-
-static inline pte_t pte_swp_clear_exclusive(pte_t pte)
-{
-	pte_val(pte) &= ~_PAGE_SWP_EXCLUSIVE;
-	return pte;
-}
-
-/*
- * ZERO_PAGE is a global shared page that is always zero: used
- * for zero-mapped memory areas etc..
- */
-extern unsigned long empty_zero_page[PAGE_SIZE/sizeof(unsigned long)];
-extern struct page *zero_page_memmap_ptr;
-#define ZERO_PAGE(vaddr) (zero_page_memmap_ptr)
-
-/* We provide our own get_unmapped_area to cope with VA holes for userland */
-#define HAVE_ARCH_UNMAPPED_AREA
-
-#ifdef CONFIG_HUGETLB_PAGE
-#define HUGETLB_PGDIR_SHIFT	(HPAGE_SHIFT + 2*(PAGE_SHIFT-3))
-#define HUGETLB_PGDIR_SIZE	(__IA64_UL(1) << HUGETLB_PGDIR_SHIFT)
-#define HUGETLB_PGDIR_MASK	(~(HUGETLB_PGDIR_SIZE-1))
-#endif
-
-
-#define __HAVE_ARCH_PTEP_SET_ACCESS_FLAGS
-/*
- * Update PTEP with ENTRY, which is guaranteed to be a less
- * restrictive PTE.  That is, ENTRY may have the ACCESSED, DIRTY, and
- * WRITABLE bits turned on, when the value at PTEP did not.  The
- * WRITABLE bit may only be turned if SAFELY_WRITABLE is TRUE.
- *
- * SAFELY_WRITABLE is TRUE if we can update the value at PTEP without
- * having to worry about races.  On SMP machines, there are only two
- * cases where this is true:
- *
- *	(1) *PTEP has the PRESENT bit turned OFF
- *	(2) ENTRY has the DIRTY bit turned ON
- *
- * On ia64, we could implement this routine with a cmpxchg()-loop
- * which ORs in the _PAGE_A/_PAGE_D bit if they're set in ENTRY.
- * However, like on x86, we can get a more streamlined version by
- * observing that it is OK to drop ACCESSED bit updates when
- * SAFELY_WRITABLE is FALSE.  Besides being rare, all that would do is
- * result in an extra Access-bit fault, which would then turn on the
- * ACCESSED bit in the low-level fault handler (iaccess_bit or
- * daccess_bit in ivt.S).
- */
-#ifdef CONFIG_SMP
-# define ptep_set_access_flags(__vma, __addr, __ptep, __entry, __safely_writable) \
-({									\
-	int __changed = !pte_same(*(__ptep), __entry);			\
-	if (__changed && __safely_writable) {				\
-		set_pte(__ptep, __entry);				\
-		flush_tlb_page(__vma, __addr);				\
-	}								\
-	__changed;							\
-})
-#else
-# define ptep_set_access_flags(__vma, __addr, __ptep, __entry, __safely_writable) \
-({									\
-	int __changed = !pte_same(*(__ptep), __entry);			\
-	if (__changed) {						\
-		set_pte_at((__vma)->vm_mm, (__addr), __ptep, __entry);	\
-		flush_tlb_page(__vma, __addr);				\
-	}								\
-	__changed;							\
-})
-#endif
-# endif /* !__ASSEMBLY__ */
-
-/*
- * Identity-mapped regions use a large page size.  We'll call such large pages
- * "granules".  If you can think of a better name that's unambiguous, let me
- * know...
- */
-#if defined(CONFIG_IA64_GRANULE_64MB)
-# define IA64_GRANULE_SHIFT	_PAGE_SIZE_64M
-#elif defined(CONFIG_IA64_GRANULE_16MB)
-# define IA64_GRANULE_SHIFT	_PAGE_SIZE_16M
-#endif
-#define IA64_GRANULE_SIZE	(1 << IA64_GRANULE_SHIFT)
-/*
- * log2() of the page size we use to map the kernel image (IA64_TR_KERNEL):
- */
-#define KERNEL_TR_PAGE_SHIFT	_PAGE_SIZE_64M
-#define KERNEL_TR_PAGE_SIZE	(1 << KERNEL_TR_PAGE_SHIFT)
-
-/* These tell get_user_pages() that the first gate page is accessible from user-level.  */
-#define FIXADDR_USER_START	GATE_ADDR
-#ifdef HAVE_BUGGY_SEGREL
-# define FIXADDR_USER_END	(GATE_ADDR + 2*PAGE_SIZE)
-#else
-# define FIXADDR_USER_END	(GATE_ADDR + 2*PERCPU_PAGE_SIZE)
-#endif
-
-#define __HAVE_ARCH_PTEP_TEST_AND_CLEAR_YOUNG
-#define __HAVE_ARCH_PTEP_GET_AND_CLEAR
-#define __HAVE_ARCH_PTEP_SET_WRPROTECT
-#define __HAVE_ARCH_PTE_SAME
-#define __HAVE_ARCH_PGD_OFFSET_GATE
-
-
-#if CONFIG_PGTABLE_LEVELS == 3
-#include <asm-generic/pgtable-nopud.h>
-#endif
-#include <asm-generic/pgtable-nop4d.h>
-
-#endif /* _ASM_IA64_PGTABLE_H */
diff --git a/arch/ia64/include/asm/processor.h b/arch/ia64/include/asm/processor.h
deleted file mode 100644
index 47e3801b526a..000000000000
--- a/arch/ia64/include/asm/processor.h
+++ /dev/null
@@ -1,660 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _ASM_IA64_PROCESSOR_H
-#define _ASM_IA64_PROCESSOR_H
-
-/*
- * Copyright (C) 1998-2004 Hewlett-Packard Co
- *	David Mosberger-Tang <davidm@hpl.hp.com>
- *	Stephane Eranian <eranian@hpl.hp.com>
- * Copyright (C) 1999 Asit Mallick <asit.k.mallick@intel.com>
- * Copyright (C) 1999 Don Dugger <don.dugger@intel.com>
- *
- * 11/24/98	S.Eranian	added ia64_set_iva()
- * 12/03/99	D. Mosberger	implement thread_saved_pc() via kernel unwind API
- * 06/16/00	A. Mallick	added csd/ssd/tssd for ia32 support
- */
-
-
-#include <asm/intrinsics.h>
-#include <asm/kregs.h>
-#include <asm/ptrace.h>
-#include <asm/ustack.h>
-
-#define IA64_NUM_PHYS_STACK_REG	96
-#define IA64_NUM_DBG_REGS	8
-
-#define DEFAULT_MAP_BASE	__IA64_UL_CONST(0x2000000000000000)
-#define DEFAULT_TASK_SIZE	__IA64_UL_CONST(0xa000000000000000)
-
-/*
- * TASK_SIZE really is a mis-named.  It really is the maximum user
- * space address (plus one).  On IA-64, there are five regions of 2TB
- * each (assuming 8KB page size), for a total of 8TB of user virtual
- * address space.
- */
-#define TASK_SIZE       	DEFAULT_TASK_SIZE
-
-/*
- * This decides where the kernel will search for a free chunk of vm
- * space during mmap's.
- */
-#define TASK_UNMAPPED_BASE	(current->thread.map_base)
-
-#define IA64_THREAD_FPH_VALID	(__IA64_UL(1) << 0)	/* floating-point high state valid? */
-#define IA64_THREAD_DBG_VALID	(__IA64_UL(1) << 1)	/* debug registers valid? */
-#define IA64_THREAD_PM_VALID	(__IA64_UL(1) << 2)	/* performance registers valid? */
-#define IA64_THREAD_UAC_NOPRINT	(__IA64_UL(1) << 3)	/* don't log unaligned accesses */
-#define IA64_THREAD_UAC_SIGBUS	(__IA64_UL(1) << 4)	/* generate SIGBUS on unaligned acc. */
-#define IA64_THREAD_MIGRATION	(__IA64_UL(1) << 5)	/* require migration
-							   sync at ctx sw */
-#define IA64_THREAD_FPEMU_NOPRINT (__IA64_UL(1) << 6)	/* don't log any fpswa faults */
-#define IA64_THREAD_FPEMU_SIGFPE  (__IA64_UL(1) << 7)	/* send a SIGFPE for fpswa faults */
-
-#define IA64_THREAD_UAC_SHIFT	3
-#define IA64_THREAD_UAC_MASK	(IA64_THREAD_UAC_NOPRINT | IA64_THREAD_UAC_SIGBUS)
-#define IA64_THREAD_FPEMU_SHIFT	6
-#define IA64_THREAD_FPEMU_MASK	(IA64_THREAD_FPEMU_NOPRINT | IA64_THREAD_FPEMU_SIGFPE)
-
-
-/*
- * This shift should be large enough to be able to represent 1000000000/itc_freq with good
- * accuracy while being small enough to fit 10*1000000000<<IA64_NSEC_PER_CYC_SHIFT in 64 bits
- * (this will give enough slack to represent 10 seconds worth of time as a scaled number).
- */
-#define IA64_NSEC_PER_CYC_SHIFT	30
-
-#ifndef __ASSEMBLY__
-
-#include <linux/cache.h>
-#include <linux/compiler.h>
-#include <linux/threads.h>
-#include <linux/types.h>
-#include <linux/bitops.h>
-
-#include <asm/fpu.h>
-#include <asm/page.h>
-#include <asm/percpu.h>
-#include <asm/rse.h>
-#include <asm/unwind.h>
-#include <linux/atomic.h>
-#ifdef CONFIG_NUMA
-#include <asm/nodedata.h>
-#endif
-
-/* like above but expressed as bitfields for more efficient access: */
-struct ia64_psr {
-	__u64 reserved0 : 1;
-	__u64 be : 1;
-	__u64 up : 1;
-	__u64 ac : 1;
-	__u64 mfl : 1;
-	__u64 mfh : 1;
-	__u64 reserved1 : 7;
-	__u64 ic : 1;
-	__u64 i : 1;
-	__u64 pk : 1;
-	__u64 reserved2 : 1;
-	__u64 dt : 1;
-	__u64 dfl : 1;
-	__u64 dfh : 1;
-	__u64 sp : 1;
-	__u64 pp : 1;
-	__u64 di : 1;
-	__u64 si : 1;
-	__u64 db : 1;
-	__u64 lp : 1;
-	__u64 tb : 1;
-	__u64 rt : 1;
-	__u64 reserved3 : 4;
-	__u64 cpl : 2;
-	__u64 is : 1;
-	__u64 mc : 1;
-	__u64 it : 1;
-	__u64 id : 1;
-	__u64 da : 1;
-	__u64 dd : 1;
-	__u64 ss : 1;
-	__u64 ri : 2;
-	__u64 ed : 1;
-	__u64 bn : 1;
-	__u64 reserved4 : 19;
-};
-
-union ia64_isr {
-	__u64  val;
-	struct {
-		__u64 code : 16;
-		__u64 vector : 8;
-		__u64 reserved1 : 8;
-		__u64 x : 1;
-		__u64 w : 1;
-		__u64 r : 1;
-		__u64 na : 1;
-		__u64 sp : 1;
-		__u64 rs : 1;
-		__u64 ir : 1;
-		__u64 ni : 1;
-		__u64 so : 1;
-		__u64 ei : 2;
-		__u64 ed : 1;
-		__u64 reserved2 : 20;
-	};
-};
-
-union ia64_lid {
-	__u64 val;
-	struct {
-		__u64  rv  : 16;
-		__u64  eid : 8;
-		__u64  id  : 8;
-		__u64  ig  : 32;
-	};
-};
-
-union ia64_tpr {
-	__u64 val;
-	struct {
-		__u64 ig0 : 4;
-		__u64 mic : 4;
-		__u64 rsv : 8;
-		__u64 mmi : 1;
-		__u64 ig1 : 47;
-	};
-};
-
-union ia64_itir {
-	__u64 val;
-	struct {
-		__u64 rv3  :  2; /* 0-1 */
-		__u64 ps   :  6; /* 2-7 */
-		__u64 key  : 24; /* 8-31 */
-		__u64 rv4  : 32; /* 32-63 */
-	};
-};
-
-union  ia64_rr {
-	__u64 val;
-	struct {
-		__u64  ve	:  1;  /* enable hw walker */
-		__u64  reserved0:  1;  /* reserved */
-		__u64  ps	:  6;  /* log page size */
-		__u64  rid	: 24;  /* region id */
-		__u64  reserved1: 32;  /* reserved */
-	};
-};
-
-/*
- * CPU type, hardware bug flags, and per-CPU state.  Frequently used
- * state comes earlier:
- */
-struct cpuinfo_ia64 {
-	unsigned int softirq_pending;
-	unsigned long itm_delta;	/* # of clock cycles between clock ticks */
-	unsigned long itm_next;		/* interval timer mask value to use for next clock tick */
-	unsigned long nsec_per_cyc;	/* (1000000000<<IA64_NSEC_PER_CYC_SHIFT)/itc_freq */
-	unsigned long unimpl_va_mask;	/* mask of unimplemented virtual address bits (from PAL) */
-	unsigned long unimpl_pa_mask;	/* mask of unimplemented physical address bits (from PAL) */
-	unsigned long itc_freq;		/* frequency of ITC counter */
-	unsigned long proc_freq;	/* frequency of processor */
-	unsigned long cyc_per_usec;	/* itc_freq/1000000 */
-	unsigned long ptce_base;
-	unsigned int ptce_count[2];
-	unsigned int ptce_stride[2];
-	struct task_struct *ksoftirqd;	/* kernel softirq daemon for this CPU */
-
-#ifdef CONFIG_SMP
-	unsigned long loops_per_jiffy;
-	int cpu;
-	unsigned int socket_id;	/* physical processor socket id */
-	unsigned short core_id;	/* core id */
-	unsigned short thread_id; /* thread id */
-	unsigned short num_log;	/* Total number of logical processors on
-				 * this socket that were successfully booted */
-	unsigned char cores_per_socket;	/* Cores per processor socket */
-	unsigned char threads_per_core;	/* Threads per core */
-#endif
-
-	/* CPUID-derived information: */
-	unsigned long ppn;
-	unsigned long features;
-	unsigned char number;
-	unsigned char revision;
-	unsigned char model;
-	unsigned char family;
-	unsigned char archrev;
-	char vendor[16];
-	char *model_name;
-
-#ifdef CONFIG_NUMA
-	struct ia64_node_data *node_data;
-#endif
-};
-
-DECLARE_PER_CPU(struct cpuinfo_ia64, ia64_cpu_info);
-
-/*
- * The "local" data variable.  It refers to the per-CPU data of the currently executing
- * CPU, much like "current" points to the per-task data of the currently executing task.
- * Do not use the address of local_cpu_data, since it will be different from
- * cpu_data(smp_processor_id())!
- */
-#define local_cpu_data		(&__ia64_per_cpu_var(ia64_cpu_info))
-#define cpu_data(cpu)		(&per_cpu(ia64_cpu_info, cpu))
-
-extern void print_cpu_info (struct cpuinfo_ia64 *);
-
-#define SET_UNALIGN_CTL(task,value)								\
-({												\
-	(task)->thread.flags = (((task)->thread.flags & ~IA64_THREAD_UAC_MASK)			\
-				| (((value) << IA64_THREAD_UAC_SHIFT) & IA64_THREAD_UAC_MASK));	\
-	0;											\
-})
-#define GET_UNALIGN_CTL(task,addr)								\
-({												\
-	put_user(((task)->thread.flags & IA64_THREAD_UAC_MASK) >> IA64_THREAD_UAC_SHIFT,	\
-		 (int __user *) (addr));							\
-})
-
-#define SET_FPEMU_CTL(task,value)								\
-({												\
-	(task)->thread.flags = (((task)->thread.flags & ~IA64_THREAD_FPEMU_MASK)		\
-			  | (((value) << IA64_THREAD_FPEMU_SHIFT) & IA64_THREAD_FPEMU_MASK));	\
-	0;											\
-})
-#define GET_FPEMU_CTL(task,addr)								\
-({												\
-	put_user(((task)->thread.flags & IA64_THREAD_FPEMU_MASK) >> IA64_THREAD_FPEMU_SHIFT,	\
-		 (int __user *) (addr));							\
-})
-
-struct thread_struct {
-	__u32 flags;			/* various thread flags (see IA64_THREAD_*) */
-	/* writing on_ustack is performance-critical, so it's worth spending 8 bits on it... */
-	__u8 on_ustack;			/* executing on user-stacks? */
-	__u8 pad[3];
-	__u64 ksp;			/* kernel stack pointer */
-	__u64 map_base;			/* base address for get_unmapped_area() */
-	__u64 rbs_bot;			/* the base address for the RBS */
-	int last_fph_cpu;		/* CPU that may hold the contents of f32-f127 */
-	unsigned long dbr[IA64_NUM_DBG_REGS];
-	unsigned long ibr[IA64_NUM_DBG_REGS];
-	struct ia64_fpreg fph[96];	/* saved/loaded on demand */
-};
-
-#define INIT_THREAD {						\
-	.flags =	0,					\
-	.on_ustack =	0,					\
-	.ksp =		0,					\
-	.map_base =	DEFAULT_MAP_BASE,			\
-	.rbs_bot =	STACK_TOP - DEFAULT_USER_STACK_SIZE,	\
-	.last_fph_cpu =  -1,					\
-	.dbr =		{0, },					\
-	.ibr =		{0, },					\
-	.fph =		{{{{0}}}, }				\
-}
-
-#define start_thread(regs,new_ip,new_sp) do {							\
-	regs->cr_ipsr = ((regs->cr_ipsr | (IA64_PSR_BITS_TO_SET | IA64_PSR_CPL))		\
-			 & ~(IA64_PSR_BITS_TO_CLEAR | IA64_PSR_RI | IA64_PSR_IS));		\
-	regs->cr_iip = new_ip;									\
-	regs->ar_rsc = 0xf;		/* eager mode, privilege level 3 */			\
-	regs->ar_rnat = 0;									\
-	regs->ar_bspstore = current->thread.rbs_bot;						\
-	regs->ar_fpsr = FPSR_DEFAULT;								\
-	regs->loadrs = 0;									\
-	regs->r8 = get_dumpable(current->mm);	/* set "don't zap registers" flag */		\
-	regs->r12 = new_sp - 16;	/* allocate 16 byte scratch area */			\
-	if (unlikely(get_dumpable(current->mm) != SUID_DUMP_USER)) {	\
-		/*										\
-		 * Zap scratch regs to avoid leaking bits between processes with different	\
-		 * uid/privileges.								\
-		 */										\
-		regs->ar_pfs = 0; regs->b0 = 0; regs->pr = 0;					\
-		regs->r1 = 0; regs->r9  = 0; regs->r11 = 0; regs->r13 = 0; regs->r15 = 0;	\
-	}											\
-} while (0)
-
-/* Forward declarations, a strange C thing... */
-struct mm_struct;
-struct task_struct;
-
-/* Get wait channel for task P.  */
-extern unsigned long __get_wchan (struct task_struct *p);
-
-/* Return instruction pointer of blocked task TSK.  */
-#define KSTK_EIP(tsk)					\
-  ({							\
-	struct pt_regs *_regs = task_pt_regs(tsk);	\
-	_regs->cr_iip + ia64_psr(_regs)->ri;		\
-  })
-
-/* Return stack pointer of blocked task TSK.  */
-#define KSTK_ESP(tsk)  ((tsk)->thread.ksp)
-
-extern void ia64_getreg_unknown_kr (void);
-extern void ia64_setreg_unknown_kr (void);
-
-#define ia64_get_kr(regnum)					\
-({								\
-	unsigned long r = 0;					\
-								\
-	switch (regnum) {					\
-	    case 0: r = ia64_getreg(_IA64_REG_AR_KR0); break;	\
-	    case 1: r = ia64_getreg(_IA64_REG_AR_KR1); break;	\
-	    case 2: r = ia64_getreg(_IA64_REG_AR_KR2); break;	\
-	    case 3: r = ia64_getreg(_IA64_REG_AR_KR3); break;	\
-	    case 4: r = ia64_getreg(_IA64_REG_AR_KR4); break;	\
-	    case 5: r = ia64_getreg(_IA64_REG_AR_KR5); break;	\
-	    case 6: r = ia64_getreg(_IA64_REG_AR_KR6); break;	\
-	    case 7: r = ia64_getreg(_IA64_REG_AR_KR7); break;	\
-	    default: ia64_getreg_unknown_kr(); break;		\
-	}							\
-	r;							\
-})
-
-#define ia64_set_kr(regnum, r) 					\
-({								\
-	switch (regnum) {					\
-	    case 0: ia64_setreg(_IA64_REG_AR_KR0, r); break;	\
-	    case 1: ia64_setreg(_IA64_REG_AR_KR1, r); break;	\
-	    case 2: ia64_setreg(_IA64_REG_AR_KR2, r); break;	\
-	    case 3: ia64_setreg(_IA64_REG_AR_KR3, r); break;	\
-	    case 4: ia64_setreg(_IA64_REG_AR_KR4, r); break;	\
-	    case 5: ia64_setreg(_IA64_REG_AR_KR5, r); break;	\
-	    case 6: ia64_setreg(_IA64_REG_AR_KR6, r); break;	\
-	    case 7: ia64_setreg(_IA64_REG_AR_KR7, r); break;	\
-	    default: ia64_setreg_unknown_kr(); break;		\
-	}							\
-})
-
-/*
- * The following three macros can't be inline functions because we don't have struct
- * task_struct at this point.
- */
-
-/*
- * Return TRUE if task T owns the fph partition of the CPU we're running on.
- * Must be called from code that has preemption disabled.
- */
-#define ia64_is_local_fpu_owner(t)								\
-({												\
-	struct task_struct *__ia64_islfo_task = (t);						\
-	(__ia64_islfo_task->thread.last_fph_cpu == smp_processor_id()				\
-	 && __ia64_islfo_task == (struct task_struct *) ia64_get_kr(IA64_KR_FPU_OWNER));	\
-})
-
-/*
- * Mark task T as owning the fph partition of the CPU we're running on.
- * Must be called from code that has preemption disabled.
- */
-#define ia64_set_local_fpu_owner(t) do {						\
-	struct task_struct *__ia64_slfo_task = (t);					\
-	__ia64_slfo_task->thread.last_fph_cpu = smp_processor_id();			\
-	ia64_set_kr(IA64_KR_FPU_OWNER, (unsigned long) __ia64_slfo_task);		\
-} while (0)
-
-/* Mark the fph partition of task T as being invalid on all CPUs.  */
-#define ia64_drop_fpu(t)	((t)->thread.last_fph_cpu = -1)
-
-extern void __ia64_init_fpu (void);
-extern void __ia64_save_fpu (struct ia64_fpreg *fph);
-extern void __ia64_load_fpu (struct ia64_fpreg *fph);
-extern void ia64_save_debug_regs (unsigned long *save_area);
-extern void ia64_load_debug_regs (unsigned long *save_area);
-
-#define ia64_fph_enable()	do { ia64_rsm(IA64_PSR_DFH); ia64_srlz_d(); } while (0)
-#define ia64_fph_disable()	do { ia64_ssm(IA64_PSR_DFH); ia64_srlz_d(); } while (0)
-
-/* load fp 0.0 into fph */
-static inline void
-ia64_init_fpu (void) {
-	ia64_fph_enable();
-	__ia64_init_fpu();
-	ia64_fph_disable();
-}
-
-/* save f32-f127 at FPH */
-static inline void
-ia64_save_fpu (struct ia64_fpreg *fph) {
-	ia64_fph_enable();
-	__ia64_save_fpu(fph);
-	ia64_fph_disable();
-}
-
-/* load f32-f127 from FPH */
-static inline void
-ia64_load_fpu (struct ia64_fpreg *fph) {
-	ia64_fph_enable();
-	__ia64_load_fpu(fph);
-	ia64_fph_disable();
-}
-
-static inline __u64
-ia64_clear_ic (void)
-{
-	__u64 psr;
-	psr = ia64_getreg(_IA64_REG_PSR);
-	ia64_stop();
-	ia64_rsm(IA64_PSR_I | IA64_PSR_IC);
-	ia64_srlz_i();
-	return psr;
-}
-
-/*
- * Restore the psr.
- */
-static inline void
-ia64_set_psr (__u64 psr)
-{
-	ia64_stop();
-	ia64_setreg(_IA64_REG_PSR_L, psr);
-	ia64_srlz_i();
-}
-
-/*
- * Insert a translation into an instruction and/or data translation
- * register.
- */
-static inline void
-ia64_itr (__u64 target_mask, __u64 tr_num,
-	  __u64 vmaddr, __u64 pte,
-	  __u64 log_page_size)
-{
-	ia64_setreg(_IA64_REG_CR_ITIR, (log_page_size << 2));
-	ia64_setreg(_IA64_REG_CR_IFA, vmaddr);
-	ia64_stop();
-	if (target_mask & 0x1)
-		ia64_itri(tr_num, pte);
-	if (target_mask & 0x2)
-		ia64_itrd(tr_num, pte);
-}
-
-/*
- * Insert a translation into the instruction and/or data translation
- * cache.
- */
-static inline void
-ia64_itc (__u64 target_mask, __u64 vmaddr, __u64 pte,
-	  __u64 log_page_size)
-{
-	ia64_setreg(_IA64_REG_CR_ITIR, (log_page_size << 2));
-	ia64_setreg(_IA64_REG_CR_IFA, vmaddr);
-	ia64_stop();
-	/* as per EAS2.6, itc must be the last instruction in an instruction group */
-	if (target_mask & 0x1)
-		ia64_itci(pte);
-	if (target_mask & 0x2)
-		ia64_itcd(pte);
-}
-
-/*
- * Purge a range of addresses from instruction and/or data translation
- * register(s).
- */
-static inline void
-ia64_ptr (__u64 target_mask, __u64 vmaddr, __u64 log_size)
-{
-	if (target_mask & 0x1)
-		ia64_ptri(vmaddr, (log_size << 2));
-	if (target_mask & 0x2)
-		ia64_ptrd(vmaddr, (log_size << 2));
-}
-
-/* Set the interrupt vector address.  The address must be suitably aligned (32KB).  */
-static inline void
-ia64_set_iva (void *ivt_addr)
-{
-	ia64_setreg(_IA64_REG_CR_IVA, (__u64) ivt_addr);
-	ia64_srlz_i();
-}
-
-/* Set the page table address and control bits.  */
-static inline void
-ia64_set_pta (__u64 pta)
-{
-	/* Note: srlz.i implies srlz.d */
-	ia64_setreg(_IA64_REG_CR_PTA, pta);
-	ia64_srlz_i();
-}
-
-static inline void
-ia64_eoi (void)
-{
-	ia64_setreg(_IA64_REG_CR_EOI, 0);
-	ia64_srlz_d();
-}
-
-#define cpu_relax()	ia64_hint(ia64_hint_pause)
-
-static inline int
-ia64_get_irr(unsigned int vector)
-{
-	unsigned int reg = vector / 64;
-	unsigned int bit = vector % 64;
-	unsigned long irr;
-
-	switch (reg) {
-	case 0: irr = ia64_getreg(_IA64_REG_CR_IRR0); break;
-	case 1: irr = ia64_getreg(_IA64_REG_CR_IRR1); break;
-	case 2: irr = ia64_getreg(_IA64_REG_CR_IRR2); break;
-	case 3: irr = ia64_getreg(_IA64_REG_CR_IRR3); break;
-	}
-
-	return test_bit(bit, &irr);
-}
-
-static inline void
-ia64_set_lrr0 (unsigned long val)
-{
-	ia64_setreg(_IA64_REG_CR_LRR0, val);
-	ia64_srlz_d();
-}
-
-static inline void
-ia64_set_lrr1 (unsigned long val)
-{
-	ia64_setreg(_IA64_REG_CR_LRR1, val);
-	ia64_srlz_d();
-}
-
-
-/*
- * Given the address to which a spill occurred, return the unat bit
- * number that corresponds to this address.
- */
-static inline __u64
-ia64_unat_pos (void *spill_addr)
-{
-	return ((__u64) spill_addr >> 3) & 0x3f;
-}
-
-/*
- * Set the NaT bit of an integer register which was spilled at address
- * SPILL_ADDR.  UNAT is the mask to be updated.
- */
-static inline void
-ia64_set_unat (__u64 *unat, void *spill_addr, unsigned long nat)
-{
-	__u64 bit = ia64_unat_pos(spill_addr);
-	__u64 mask = 1UL << bit;
-
-	*unat = (*unat & ~mask) | (nat << bit);
-}
-
-static inline __u64
-ia64_get_ivr (void)
-{
-	__u64 r;
-	ia64_srlz_d();
-	r = ia64_getreg(_IA64_REG_CR_IVR);
-	ia64_srlz_d();
-	return r;
-}
-
-static inline void
-ia64_set_dbr (__u64 regnum, __u64 value)
-{
-	__ia64_set_dbr(regnum, value);
-#ifdef CONFIG_ITANIUM
-	ia64_srlz_d();
-#endif
-}
-
-static inline __u64
-ia64_get_dbr (__u64 regnum)
-{
-	__u64 retval;
-
-	retval = __ia64_get_dbr(regnum);
-#ifdef CONFIG_ITANIUM
-	ia64_srlz_d();
-#endif
-	return retval;
-}
-
-static inline __u64
-ia64_rotr (__u64 w, __u64 n)
-{
-	return (w >> n) | (w << (64 - n));
-}
-
-#define ia64_rotl(w,n)	ia64_rotr((w), (64) - (n))
-
-/*
- * Take a mapped kernel address and return the equivalent address
- * in the region 7 identity mapped virtual area.
- */
-static inline void *
-ia64_imva (void *addr)
-{
-	void *result;
-	result = (void *) ia64_tpa(addr);
-	return __va(result);
-}
-
-#define ARCH_HAS_PREFETCH
-#define ARCH_HAS_PREFETCHW
-#define PREFETCH_STRIDE			L1_CACHE_BYTES
-
-static inline void
-prefetch (const void *x)
-{
-	 ia64_lfetch(ia64_lfhint_none, x);
-}
-
-static inline void
-prefetchw (const void *x)
-{
-	ia64_lfetch_excl(ia64_lfhint_none, x);
-}
-
-extern unsigned long boot_option_idle_override;
-
-enum idle_boot_override {IDLE_NO_OVERRIDE=0, IDLE_HALT, IDLE_FORCE_MWAIT,
-			 IDLE_NOMWAIT, IDLE_POLL};
-
-void default_idle(void);
-
-#endif /* !__ASSEMBLY__ */
-
-#endif /* _ASM_IA64_PROCESSOR_H */
diff --git a/arch/ia64/include/asm/ptrace.h b/arch/ia64/include/asm/ptrace.h
deleted file mode 100644
index 402874489890..000000000000
--- a/arch/ia64/include/asm/ptrace.h
+++ /dev/null
@@ -1,146 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-/*
- * Copyright (C) 1998-2004 Hewlett-Packard Co
- *	David Mosberger-Tang <davidm@hpl.hp.com>
- *	Stephane Eranian <eranian@hpl.hp.com>
- * Copyright (C) 2003 Intel Co
- *	Suresh Siddha <suresh.b.siddha@intel.com>
- *	Fenghua Yu <fenghua.yu@intel.com>
- *	Arun Sharma <arun.sharma@intel.com>
- *
- * 12/07/98	S. Eranian	added pt_regs & switch_stack
- * 12/21/98	D. Mosberger	updated to match latest code
- *  6/17/99	D. Mosberger	added second unat member to "struct switch_stack"
- *
- */
-#ifndef _ASM_IA64_PTRACE_H
-#define _ASM_IA64_PTRACE_H
-
-#ifndef ASM_OFFSETS_C
-#include <asm/asm-offsets.h>
-#endif
-#include <uapi/asm/ptrace.h>
-
-/*
- * Base-2 logarithm of number of pages to allocate per task structure
- * (including register backing store and memory stack):
- */
-#if defined(CONFIG_IA64_PAGE_SIZE_4KB)
-# define KERNEL_STACK_SIZE_ORDER		3
-#elif defined(CONFIG_IA64_PAGE_SIZE_8KB)
-# define KERNEL_STACK_SIZE_ORDER		2
-#elif defined(CONFIG_IA64_PAGE_SIZE_16KB)
-# define KERNEL_STACK_SIZE_ORDER		1
-#else
-# define KERNEL_STACK_SIZE_ORDER		0
-#endif
-
-#define IA64_RBS_OFFSET			((IA64_TASK_SIZE + IA64_THREAD_INFO_SIZE + 31) & ~31)
-#define IA64_STK_OFFSET			((1 << KERNEL_STACK_SIZE_ORDER)*PAGE_SIZE)
-
-#define KERNEL_STACK_SIZE		IA64_STK_OFFSET
-
-#ifndef __ASSEMBLY__
-
-#include <asm/current.h>
-#include <asm/page.h>
-
-/*
- * We use the ia64_psr(regs)->ri to determine which of the three
- * instructions in bundle (16 bytes) took the sample. Generate
- * the canonical representation by adding to instruction pointer.
- */
-# define instruction_pointer(regs) ((regs)->cr_iip + ia64_psr(regs)->ri)
-# define instruction_pointer_set(regs, val)	\
-({						\
-	ia64_psr(regs)->ri = (val & 0xf);	\
-	regs->cr_iip = (val & ~0xfULL);		\
-})
-
-static inline unsigned long user_stack_pointer(struct pt_regs *regs)
-{
-	return regs->r12;
-}
-
-static inline int is_syscall_success(struct pt_regs *regs)
-{
-	return regs->r10 != -1;
-}
-
-static inline long regs_return_value(struct pt_regs *regs)
-{
-	if (is_syscall_success(regs))
-		return regs->r8;
-	else
-		return -regs->r8;
-}
-
-/* Conserve space in histogram by encoding slot bits in address
- * bits 2 and 3 rather than bits 0 and 1.
- */
-#define profile_pc(regs)						\
-({									\
-	unsigned long __ip = instruction_pointer(regs);			\
-	(__ip & ~3UL) + ((__ip & 3UL) << 2);				\
-})
-
-  /* given a pointer to a task_struct, return the user's pt_regs */
-# define task_pt_regs(t)		(((struct pt_regs *) ((char *) (t) + IA64_STK_OFFSET)) - 1)
-# define ia64_psr(regs)			((struct ia64_psr *) &(regs)->cr_ipsr)
-# define user_mode(regs)		(((struct ia64_psr *) &(regs)->cr_ipsr)->cpl != 0)
-# define user_stack(task,regs)	((long) regs - (long) task == IA64_STK_OFFSET - sizeof(*regs))
-# define fsys_mode(task,regs)					\
-  ({								\
-	  struct task_struct *_task = (task);			\
-	  struct pt_regs *_regs = (regs);			\
-	  !user_mode(_regs) && user_stack(_task, _regs);	\
-  })
-
-  /*
-   * System call handlers that, upon successful completion, need to return a negative value
-   * should call force_successful_syscall_return() right before returning.  On architectures
-   * where the syscall convention provides for a separate error flag (e.g., alpha, ia64,
-   * ppc{,64}, sparc{,64}, possibly others), this macro can be used to ensure that the error
-   * flag will not get set.  On architectures which do not support a separate error flag,
-   * the macro is a no-op and the spurious error condition needs to be filtered out by some
-   * other means (e.g., in user-level, by passing an extra argument to the syscall handler,
-   * or something along those lines).
-   *
-   * On ia64, we can clear the user's pt_regs->r8 to force a successful syscall.
-   */
-# define force_successful_syscall_return()	(task_pt_regs(current)->r8 = 0)
-
-  struct task_struct;			/* forward decl */
-  struct unw_frame_info;		/* forward decl */
-
-  extern unsigned long ia64_get_user_rbs_end (struct task_struct *, struct pt_regs *,
-					      unsigned long *);
-  extern long ia64_peek (struct task_struct *, struct switch_stack *, unsigned long,
-			 unsigned long, long *);
-  extern long ia64_poke (struct task_struct *, struct switch_stack *, unsigned long,
-			 unsigned long, long);
-  extern void ia64_flush_fph (struct task_struct *);
-  extern void ia64_sync_fph (struct task_struct *);
-  extern void ia64_sync_krbs(void);
-  extern long ia64_sync_user_rbs (struct task_struct *, struct switch_stack *,
-				  unsigned long, unsigned long);
-
-  /* get nat bits for scratch registers such that bit N==1 iff scratch register rN is a NaT */
-  extern unsigned long ia64_get_scratch_nat_bits (struct pt_regs *pt, unsigned long scratch_unat);
-  /* put nat bits for scratch registers such that scratch register rN is a NaT iff bit N==1 */
-  extern unsigned long ia64_put_scratch_nat_bits (struct pt_regs *pt, unsigned long nat);
-
-  extern void ia64_increment_ip (struct pt_regs *pt);
-  extern void ia64_decrement_ip (struct pt_regs *pt);
-
-  extern void ia64_ptrace_stop(void);
-  #define arch_ptrace_stop() \
-	ia64_ptrace_stop()
-  #define arch_ptrace_stop_needed() \
-	(!test_thread_flag(TIF_RESTORE_RSE))
-
-  #define arch_has_single_step()  (1)
-  #define arch_has_block_step()   (1)
-
-#endif /* !__ASSEMBLY__ */
-#endif /* _ASM_IA64_PTRACE_H */
diff --git a/arch/ia64/include/asm/sal.h b/arch/ia64/include/asm/sal.h
deleted file mode 100644
index 22749a201e92..000000000000
--- a/arch/ia64/include/asm/sal.h
+++ /dev/null
@@ -1,919 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _ASM_IA64_SAL_H
-#define _ASM_IA64_SAL_H
-
-/*
- * System Abstraction Layer definitions.
- *
- * This is based on version 2.5 of the manual "IA-64 System
- * Abstraction Layer".
- *
- * Copyright (C) 2001 Intel
- * Copyright (C) 2002 Jenna Hall <jenna.s.hall@intel.com>
- * Copyright (C) 2001 Fred Lewis <frederick.v.lewis@intel.com>
- * Copyright (C) 1998, 1999, 2001, 2003 Hewlett-Packard Co
- *	David Mosberger-Tang <davidm@hpl.hp.com>
- * Copyright (C) 1999 Srinivasa Prasad Thirumalachar <sprasad@sprasad.engr.sgi.com>
- *
- * 02/01/04 J. Hall Updated Error Record Structures to conform to July 2001
- *		    revision of the SAL spec.
- * 01/01/03 fvlewis Updated Error Record Structures to conform with Nov. 2000
- *                  revision of the SAL spec.
- * 99/09/29 davidm	Updated for SAL 2.6.
- * 00/03/29 cfleck      Updated SAL Error Logging info for processor (SAL 2.6)
- *                      (plus examples of platform error info structures from smariset @ Intel)
- */
-
-#define IA64_SAL_PLATFORM_FEATURE_BUS_LOCK_BIT		0
-#define IA64_SAL_PLATFORM_FEATURE_IRQ_REDIR_HINT_BIT	1
-#define IA64_SAL_PLATFORM_FEATURE_IPI_REDIR_HINT_BIT	2
-#define IA64_SAL_PLATFORM_FEATURE_ITC_DRIFT_BIT	 	3
-
-#define IA64_SAL_PLATFORM_FEATURE_BUS_LOCK	  (1<<IA64_SAL_PLATFORM_FEATURE_BUS_LOCK_BIT)
-#define IA64_SAL_PLATFORM_FEATURE_IRQ_REDIR_HINT (1<<IA64_SAL_PLATFORM_FEATURE_IRQ_REDIR_HINT_BIT)
-#define IA64_SAL_PLATFORM_FEATURE_IPI_REDIR_HINT (1<<IA64_SAL_PLATFORM_FEATURE_IPI_REDIR_HINT_BIT)
-#define IA64_SAL_PLATFORM_FEATURE_ITC_DRIFT	  (1<<IA64_SAL_PLATFORM_FEATURE_ITC_DRIFT_BIT)
-
-#ifndef __ASSEMBLY__
-
-#include <linux/bcd.h>
-#include <linux/spinlock.h>
-#include <linux/efi.h>
-
-#include <asm/pal.h>
-#include <asm/fpu.h>
-
-extern unsigned long sal_systab_phys;
-extern spinlock_t sal_lock;
-
-/* SAL spec _requires_ eight args for each call. */
-#define __IA64_FW_CALL(entry,result,a0,a1,a2,a3,a4,a5,a6,a7)	\
-	result = (*entry)(a0,a1,a2,a3,a4,a5,a6,a7)
-
-# define IA64_FW_CALL(entry,result,args...) do {		\
-	unsigned long __ia64_sc_flags;				\
-	struct ia64_fpreg __ia64_sc_fr[6];			\
-	ia64_save_scratch_fpregs(__ia64_sc_fr);			\
-	spin_lock_irqsave(&sal_lock, __ia64_sc_flags);		\
-	__IA64_FW_CALL(entry, result, args);			\
-	spin_unlock_irqrestore(&sal_lock, __ia64_sc_flags);	\
-	ia64_load_scratch_fpregs(__ia64_sc_fr);			\
-} while (0)
-
-# define SAL_CALL(result,args...)			\
-	IA64_FW_CALL(ia64_sal, result, args);
-
-# define SAL_CALL_NOLOCK(result,args...) do {		\
-	unsigned long __ia64_scn_flags;			\
-	struct ia64_fpreg __ia64_scn_fr[6];		\
-	ia64_save_scratch_fpregs(__ia64_scn_fr);	\
-	local_irq_save(__ia64_scn_flags);		\
-	__IA64_FW_CALL(ia64_sal, result, args);		\
-	local_irq_restore(__ia64_scn_flags);		\
-	ia64_load_scratch_fpregs(__ia64_scn_fr);	\
-} while (0)
-
-# define SAL_CALL_REENTRANT(result,args...) do {	\
-	struct ia64_fpreg __ia64_scs_fr[6];		\
-	ia64_save_scratch_fpregs(__ia64_scs_fr);	\
-	preempt_disable();				\
-	__IA64_FW_CALL(ia64_sal, result, args);		\
-	preempt_enable();				\
-	ia64_load_scratch_fpregs(__ia64_scs_fr);	\
-} while (0)
-
-#define SAL_SET_VECTORS			0x01000000
-#define SAL_GET_STATE_INFO		0x01000001
-#define SAL_GET_STATE_INFO_SIZE		0x01000002
-#define SAL_CLEAR_STATE_INFO		0x01000003
-#define SAL_MC_RENDEZ			0x01000004
-#define SAL_MC_SET_PARAMS		0x01000005
-#define SAL_REGISTER_PHYSICAL_ADDR	0x01000006
-
-#define SAL_CACHE_FLUSH			0x01000008
-#define SAL_CACHE_INIT			0x01000009
-#define SAL_PCI_CONFIG_READ		0x01000010
-#define SAL_PCI_CONFIG_WRITE		0x01000011
-#define SAL_FREQ_BASE			0x01000012
-#define SAL_PHYSICAL_ID_INFO		0x01000013
-
-#define SAL_UPDATE_PAL			0x01000020
-
-struct ia64_sal_retval {
-	/*
-	 * A zero status value indicates call completed without error.
-	 * A negative status value indicates reason of call failure.
-	 * A positive status value indicates success but an
-	 * informational value should be printed (e.g., "reboot for
-	 * change to take effect").
-	 */
-	long status;
-	unsigned long v0;
-	unsigned long v1;
-	unsigned long v2;
-};
-
-typedef struct ia64_sal_retval (*ia64_sal_handler) (u64, ...);
-
-enum {
-	SAL_FREQ_BASE_PLATFORM = 0,
-	SAL_FREQ_BASE_INTERVAL_TIMER = 1,
-	SAL_FREQ_BASE_REALTIME_CLOCK = 2
-};
-
-/*
- * The SAL system table is followed by a variable number of variable
- * length descriptors.  The structure of these descriptors follows
- * below.
- * The defininition follows SAL specs from July 2000
- */
-struct ia64_sal_systab {
-	u8 signature[4];	/* should be "SST_" */
-	u32 size;		/* size of this table in bytes */
-	u8 sal_rev_minor;
-	u8 sal_rev_major;
-	u16 entry_count;	/* # of entries in variable portion */
-	u8 checksum;
-	u8 reserved1[7];
-	u8 sal_a_rev_minor;
-	u8 sal_a_rev_major;
-	u8 sal_b_rev_minor;
-	u8 sal_b_rev_major;
-	/* oem_id & product_id: terminating NUL is missing if string is exactly 32 bytes long. */
-	u8 oem_id[32];
-	u8 product_id[32];	/* ASCII product id  */
-	u8 reserved2[8];
-};
-
-enum sal_systab_entry_type {
-	SAL_DESC_ENTRY_POINT = 0,
-	SAL_DESC_MEMORY = 1,
-	SAL_DESC_PLATFORM_FEATURE = 2,
-	SAL_DESC_TR = 3,
-	SAL_DESC_PTC = 4,
-	SAL_DESC_AP_WAKEUP = 5
-};
-
-/*
- * Entry type:	Size:
- *	0	48
- *	1	32
- *	2	16
- *	3	32
- *	4	16
- *	5	16
- */
-#define SAL_DESC_SIZE(type)	"\060\040\020\040\020\020"[(unsigned) type]
-
-typedef struct ia64_sal_desc_entry_point {
-	u8 type;
-	u8 reserved1[7];
-	u64 pal_proc;
-	u64 sal_proc;
-	u64 gp;
-	u8 reserved2[16];
-}ia64_sal_desc_entry_point_t;
-
-typedef struct ia64_sal_desc_memory {
-	u8 type;
-	u8 used_by_sal;	/* needs to be mapped for SAL? */
-	u8 mem_attr;		/* current memory attribute setting */
-	u8 access_rights;	/* access rights set up by SAL */
-	u8 mem_attr_mask;	/* mask of supported memory attributes */
-	u8 reserved1;
-	u8 mem_type;		/* memory type */
-	u8 mem_usage;		/* memory usage */
-	u64 addr;		/* physical address of memory */
-	u32 length;	/* length (multiple of 4KB pages) */
-	u32 reserved2;
-	u8 oem_reserved[8];
-} ia64_sal_desc_memory_t;
-
-typedef struct ia64_sal_desc_platform_feature {
-	u8 type;
-	u8 feature_mask;
-	u8 reserved1[14];
-} ia64_sal_desc_platform_feature_t;
-
-typedef struct ia64_sal_desc_tr {
-	u8 type;
-	u8 tr_type;		/* 0 == instruction, 1 == data */
-	u8 regnum;		/* translation register number */
-	u8 reserved1[5];
-	u64 addr;		/* virtual address of area covered */
-	u64 page_size;		/* encoded page size */
-	u8 reserved2[8];
-} ia64_sal_desc_tr_t;
-
-typedef struct ia64_sal_desc_ptc {
-	u8 type;
-	u8 reserved1[3];
-	u32 num_domains;	/* # of coherence domains */
-	u64 domain_info;	/* physical address of domain info table */
-} ia64_sal_desc_ptc_t;
-
-typedef struct ia64_sal_ptc_domain_info {
-	u64 proc_count;		/* number of processors in domain */
-	u64 proc_list;		/* physical address of LID array */
-} ia64_sal_ptc_domain_info_t;
-
-typedef struct ia64_sal_ptc_domain_proc_entry {
-	u64 id  : 8;		/* id of processor */
-	u64 eid : 8;		/* eid of processor */
-} ia64_sal_ptc_domain_proc_entry_t;
-
-
-#define IA64_SAL_AP_EXTERNAL_INT 0
-
-typedef struct ia64_sal_desc_ap_wakeup {
-	u8 type;
-	u8 mechanism;		/* 0 == external interrupt */
-	u8 reserved1[6];
-	u64 vector;		/* interrupt vector in range 0x10-0xff */
-} ia64_sal_desc_ap_wakeup_t ;
-
-extern ia64_sal_handler ia64_sal;
-extern struct ia64_sal_desc_ptc *ia64_ptc_domain_info;
-
-extern unsigned short sal_revision;	/* supported SAL spec revision */
-extern unsigned short sal_version;	/* SAL version; OEM dependent */
-#define SAL_VERSION_CODE(major, minor) ((bin2bcd(major) << 8) | bin2bcd(minor))
-
-extern const char *ia64_sal_strerror (long status);
-extern void ia64_sal_init (struct ia64_sal_systab *sal_systab);
-
-/* SAL information type encodings */
-enum {
-	SAL_INFO_TYPE_MCA  = 0,		/* Machine check abort information */
-        SAL_INFO_TYPE_INIT = 1,		/* Init information */
-        SAL_INFO_TYPE_CMC  = 2,		/* Corrected machine check information */
-        SAL_INFO_TYPE_CPE  = 3		/* Corrected platform error information */
-};
-
-/* Encodings for machine check parameter types */
-enum {
-	SAL_MC_PARAM_RENDEZ_INT    = 1,	/* Rendezvous interrupt */
-	SAL_MC_PARAM_RENDEZ_WAKEUP = 2,	/* Wakeup */
-	SAL_MC_PARAM_CPE_INT	   = 3	/* Corrected Platform Error Int */
-};
-
-/* Encodings for rendezvous mechanisms */
-enum {
-	SAL_MC_PARAM_MECHANISM_INT = 1,	/* Use interrupt */
-	SAL_MC_PARAM_MECHANISM_MEM = 2	/* Use memory synchronization variable*/
-};
-
-/* Encodings for vectors which can be registered by the OS with SAL */
-enum {
-	SAL_VECTOR_OS_MCA	  = 0,
-	SAL_VECTOR_OS_INIT	  = 1,
-	SAL_VECTOR_OS_BOOT_RENDEZ = 2
-};
-
-/* Encodings for mca_opt parameter sent to SAL_MC_SET_PARAMS */
-#define	SAL_MC_PARAM_RZ_ALWAYS		0x1
-#define	SAL_MC_PARAM_BINIT_ESCALATE	0x10
-
-/*
- * Definition of the SAL Error Log from the SAL spec
- */
-
-/* SAL Error Record Section GUID Definitions */
-#define SAL_PROC_DEV_ERR_SECT_GUID  \
-    EFI_GUID(0xe429faf1, 0x3cb7, 0x11d4, 0xbc, 0xa7, 0x0, 0x80, 0xc7, 0x3c, 0x88, 0x81)
-#define SAL_PLAT_MEM_DEV_ERR_SECT_GUID  \
-    EFI_GUID(0xe429faf2, 0x3cb7, 0x11d4, 0xbc, 0xa7, 0x0, 0x80, 0xc7, 0x3c, 0x88, 0x81)
-#define SAL_PLAT_SEL_DEV_ERR_SECT_GUID  \
-    EFI_GUID(0xe429faf3, 0x3cb7, 0x11d4, 0xbc, 0xa7, 0x0, 0x80, 0xc7, 0x3c, 0x88, 0x81)
-#define SAL_PLAT_PCI_BUS_ERR_SECT_GUID  \
-    EFI_GUID(0xe429faf4, 0x3cb7, 0x11d4, 0xbc, 0xa7, 0x0, 0x80, 0xc7, 0x3c, 0x88, 0x81)
-#define SAL_PLAT_SMBIOS_DEV_ERR_SECT_GUID  \
-    EFI_GUID(0xe429faf5, 0x3cb7, 0x11d4, 0xbc, 0xa7, 0x0, 0x80, 0xc7, 0x3c, 0x88, 0x81)
-#define SAL_PLAT_PCI_COMP_ERR_SECT_GUID  \
-    EFI_GUID(0xe429faf6, 0x3cb7, 0x11d4, 0xbc, 0xa7, 0x0, 0x80, 0xc7, 0x3c, 0x88, 0x81)
-#define SAL_PLAT_SPECIFIC_ERR_SECT_GUID  \
-    EFI_GUID(0xe429faf7, 0x3cb7, 0x11d4, 0xbc, 0xa7, 0x0, 0x80, 0xc7, 0x3c, 0x88, 0x81)
-#define SAL_PLAT_HOST_CTLR_ERR_SECT_GUID  \
-    EFI_GUID(0xe429faf8, 0x3cb7, 0x11d4, 0xbc, 0xa7, 0x0, 0x80, 0xc7, 0x3c, 0x88, 0x81)
-#define SAL_PLAT_BUS_ERR_SECT_GUID  \
-    EFI_GUID(0xe429faf9, 0x3cb7, 0x11d4, 0xbc, 0xa7, 0x0, 0x80, 0xc7, 0x3c, 0x88, 0x81)
-#define PROCESSOR_ABSTRACTION_LAYER_OVERWRITE_GUID \
-    EFI_GUID(0x6cb0a200, 0x893a, 0x11da, 0x96, 0xd2, 0x0, 0x10, 0x83, 0xff, \
-		0xca, 0x4d)
-
-#define MAX_CACHE_ERRORS	6
-#define MAX_TLB_ERRORS		6
-#define MAX_BUS_ERRORS		1
-
-/* Definition of version  according to SAL spec for logging purposes */
-typedef struct sal_log_revision {
-	u8 minor;		/* BCD (0..99) */
-	u8 major;		/* BCD (0..99) */
-} sal_log_revision_t;
-
-/* Definition of timestamp according to SAL spec for logging purposes */
-typedef struct sal_log_timestamp {
-	u8 slh_second;		/* Second (0..59) */
-	u8 slh_minute;		/* Minute (0..59) */
-	u8 slh_hour;		/* Hour (0..23) */
-	u8 slh_reserved;
-	u8 slh_day;		/* Day (1..31) */
-	u8 slh_month;		/* Month (1..12) */
-	u8 slh_year;		/* Year (00..99) */
-	u8 slh_century;		/* Century (19, 20, 21, ...) */
-} sal_log_timestamp_t;
-
-/* Definition of log record  header structures */
-typedef struct sal_log_record_header {
-	u64 id;				/* Unique monotonically increasing ID */
-	sal_log_revision_t revision;	/* Major and Minor revision of header */
-	u8 severity;			/* Error Severity */
-	u8 validation_bits;		/* 0: platform_guid, 1: !timestamp */
-	u32 len;			/* Length of this error log in bytes */
-	sal_log_timestamp_t timestamp;	/* Timestamp */
-	efi_guid_t platform_guid;	/* Unique OEM Platform ID */
-} sal_log_record_header_t;
-
-#define sal_log_severity_recoverable	0
-#define sal_log_severity_fatal		1
-#define sal_log_severity_corrected	2
-
-/*
- * Error Recovery Info (ERI) bit decode.  From SAL Spec section B.2.2 Table B-3
- * Error Section Error_Recovery_Info Field Definition.
- */
-#define ERI_NOT_VALID		0x0	/* Error Recovery Field is not valid */
-#define ERI_NOT_ACCESSIBLE	0x30	/* Resource not accessible */
-#define ERI_CONTAINMENT_WARN	0x22	/* Corrupt data propagated */
-#define ERI_UNCORRECTED_ERROR	0x20	/* Uncorrected error */
-#define ERI_COMPONENT_RESET	0x24	/* Component must be reset */
-#define ERI_CORR_ERROR_LOG	0x21	/* Corrected error, needs logging */
-#define ERI_CORR_ERROR_THRESH	0x29	/* Corrected error threshold exceeded */
-
-/* Definition of log section header structures */
-typedef struct sal_log_sec_header {
-    efi_guid_t guid;			/* Unique Section ID */
-    sal_log_revision_t revision;	/* Major and Minor revision of Section */
-    u8 error_recovery_info;		/* Platform error recovery status */
-    u8 reserved;
-    u32 len;				/* Section length */
-} sal_log_section_hdr_t;
-
-typedef struct sal_log_mod_error_info {
-	struct {
-		u64 check_info              : 1,
-		    requestor_identifier    : 1,
-		    responder_identifier    : 1,
-		    target_identifier       : 1,
-		    precise_ip              : 1,
-		    reserved                : 59;
-	} valid;
-	u64 check_info;
-	u64 requestor_identifier;
-	u64 responder_identifier;
-	u64 target_identifier;
-	u64 precise_ip;
-} sal_log_mod_error_info_t;
-
-typedef struct sal_processor_static_info {
-	struct {
-		u64 minstate        : 1,
-		    br              : 1,
-		    cr              : 1,
-		    ar              : 1,
-		    rr              : 1,
-		    fr              : 1,
-		    reserved        : 58;
-	} valid;
-	struct pal_min_state_area min_state_area;
-	u64 br[8];
-	u64 cr[128];
-	u64 ar[128];
-	u64 rr[8];
-	struct ia64_fpreg __attribute__ ((packed)) fr[128];
-} sal_processor_static_info_t;
-
-struct sal_cpuid_info {
-	u64 regs[5];
-	u64 reserved;
-};
-
-typedef struct sal_log_processor_info {
-	sal_log_section_hdr_t header;
-	struct {
-		u64 proc_error_map      : 1,
-		    proc_state_param    : 1,
-		    proc_cr_lid         : 1,
-		    psi_static_struct   : 1,
-		    num_cache_check     : 4,
-		    num_tlb_check       : 4,
-		    num_bus_check       : 4,
-		    num_reg_file_check  : 4,
-		    num_ms_check        : 4,
-		    cpuid_info          : 1,
-		    reserved1           : 39;
-	} valid;
-	u64 proc_error_map;
-	u64 proc_state_parameter;
-	u64 proc_cr_lid;
-	/*
-	 * The rest of this structure consists of variable-length arrays, which can't be
-	 * expressed in C.
-	 */
-	sal_log_mod_error_info_t info[];
-	/*
-	 * This is what the rest looked like if C supported variable-length arrays:
-	 *
-	 * sal_log_mod_error_info_t cache_check_info[.valid.num_cache_check];
-	 * sal_log_mod_error_info_t tlb_check_info[.valid.num_tlb_check];
-	 * sal_log_mod_error_info_t bus_check_info[.valid.num_bus_check];
-	 * sal_log_mod_error_info_t reg_file_check_info[.valid.num_reg_file_check];
-	 * sal_log_mod_error_info_t ms_check_info[.valid.num_ms_check];
-	 * struct sal_cpuid_info cpuid_info;
-	 * sal_processor_static_info_t processor_static_info;
-	 */
-} sal_log_processor_info_t;
-
-/* Given a sal_log_processor_info_t pointer, return a pointer to the processor_static_info: */
-#define SAL_LPI_PSI_INFO(l)									\
-({	sal_log_processor_info_t *_l = (l);							\
-	((sal_processor_static_info_t *)							\
-	 ((char *) _l->info + ((_l->valid.num_cache_check + _l->valid.num_tlb_check		\
-				+ _l->valid.num_bus_check + _l->valid.num_reg_file_check	\
-				+ _l->valid.num_ms_check) * sizeof(sal_log_mod_error_info_t)	\
-			       + sizeof(struct sal_cpuid_info))));				\
-})
-
-/* platform error log structures */
-
-typedef struct sal_log_mem_dev_err_info {
-	sal_log_section_hdr_t header;
-	struct {
-		u64 error_status    : 1,
-		    physical_addr   : 1,
-		    addr_mask       : 1,
-		    node            : 1,
-		    card            : 1,
-		    module          : 1,
-		    bank            : 1,
-		    device          : 1,
-		    row             : 1,
-		    column          : 1,
-		    bit_position    : 1,
-		    requestor_id    : 1,
-		    responder_id    : 1,
-		    target_id       : 1,
-		    bus_spec_data   : 1,
-		    oem_id          : 1,
-		    oem_data        : 1,
-		    reserved        : 47;
-	} valid;
-	u64 error_status;
-	u64 physical_addr;
-	u64 addr_mask;
-	u16 node;
-	u16 card;
-	u16 module;
-	u16 bank;
-	u16 device;
-	u16 row;
-	u16 column;
-	u16 bit_position;
-	u64 requestor_id;
-	u64 responder_id;
-	u64 target_id;
-	u64 bus_spec_data;
-	u8 oem_id[16];
-	u8 oem_data[1];			/* Variable length data */
-} sal_log_mem_dev_err_info_t;
-
-typedef struct sal_log_sel_dev_err_info {
-	sal_log_section_hdr_t header;
-	struct {
-		u64 record_id       : 1,
-		    record_type     : 1,
-		    generator_id    : 1,
-		    evm_rev         : 1,
-		    sensor_type     : 1,
-		    sensor_num      : 1,
-		    event_dir       : 1,
-		    event_data1     : 1,
-		    event_data2     : 1,
-		    event_data3     : 1,
-		    reserved        : 54;
-	} valid;
-	u16 record_id;
-	u8 record_type;
-	u8 timestamp[4];
-	u16 generator_id;
-	u8 evm_rev;
-	u8 sensor_type;
-	u8 sensor_num;
-	u8 event_dir;
-	u8 event_data1;
-	u8 event_data2;
-	u8 event_data3;
-} sal_log_sel_dev_err_info_t;
-
-typedef struct sal_log_pci_bus_err_info {
-	sal_log_section_hdr_t header;
-	struct {
-		u64 err_status      : 1,
-		    err_type        : 1,
-		    bus_id          : 1,
-		    bus_address     : 1,
-		    bus_data        : 1,
-		    bus_cmd         : 1,
-		    requestor_id    : 1,
-		    responder_id    : 1,
-		    target_id       : 1,
-		    oem_data        : 1,
-		    reserved        : 54;
-	} valid;
-	u64 err_status;
-	u16 err_type;
-	u16 bus_id;
-	u32 reserved;
-	u64 bus_address;
-	u64 bus_data;
-	u64 bus_cmd;
-	u64 requestor_id;
-	u64 responder_id;
-	u64 target_id;
-	u8 oem_data[1];			/* Variable length data */
-} sal_log_pci_bus_err_info_t;
-
-typedef struct sal_log_smbios_dev_err_info {
-	sal_log_section_hdr_t header;
-	struct {
-		u64 event_type      : 1,
-		    length          : 1,
-		    time_stamp      : 1,
-		    data            : 1,
-		    reserved1       : 60;
-	} valid;
-	u8 event_type;
-	u8 length;
-	u8 time_stamp[6];
-	u8 data[1];			/* data of variable length, length == slsmb_length */
-} sal_log_smbios_dev_err_info_t;
-
-typedef struct sal_log_pci_comp_err_info {
-	sal_log_section_hdr_t header;
-	struct {
-		u64 err_status      : 1,
-		    comp_info       : 1,
-		    num_mem_regs    : 1,
-		    num_io_regs     : 1,
-		    reg_data_pairs  : 1,
-		    oem_data        : 1,
-		    reserved        : 58;
-	} valid;
-	u64 err_status;
-	struct {
-		u16 vendor_id;
-		u16 device_id;
-		u8 class_code[3];
-		u8 func_num;
-		u8 dev_num;
-		u8 bus_num;
-		u8 seg_num;
-		u8 reserved[5];
-	} comp_info;
-	u32 num_mem_regs;
-	u32 num_io_regs;
-	u64 reg_data_pairs[1];
-	/*
-	 * array of address/data register pairs is num_mem_regs + num_io_regs elements
-	 * long.  Each array element consists of a u64 address followed by a u64 data
-	 * value.  The oem_data array immediately follows the reg_data_pairs array
-	 */
-	u8 oem_data[1];			/* Variable length data */
-} sal_log_pci_comp_err_info_t;
-
-typedef struct sal_log_plat_specific_err_info {
-	sal_log_section_hdr_t header;
-	struct {
-		u64 err_status      : 1,
-		    guid            : 1,
-		    oem_data        : 1,
-		    reserved        : 61;
-	} valid;
-	u64 err_status;
-	efi_guid_t guid;
-	u8 oem_data[1];			/* platform specific variable length data */
-} sal_log_plat_specific_err_info_t;
-
-typedef struct sal_log_host_ctlr_err_info {
-	sal_log_section_hdr_t header;
-	struct {
-		u64 err_status      : 1,
-		    requestor_id    : 1,
-		    responder_id    : 1,
-		    target_id       : 1,
-		    bus_spec_data   : 1,
-		    oem_data        : 1,
-		    reserved        : 58;
-	} valid;
-	u64 err_status;
-	u64 requestor_id;
-	u64 responder_id;
-	u64 target_id;
-	u64 bus_spec_data;
-	u8 oem_data[1];			/* Variable length OEM data */
-} sal_log_host_ctlr_err_info_t;
-
-typedef struct sal_log_plat_bus_err_info {
-	sal_log_section_hdr_t header;
-	struct {
-		u64 err_status      : 1,
-		    requestor_id    : 1,
-		    responder_id    : 1,
-		    target_id       : 1,
-		    bus_spec_data   : 1,
-		    oem_data        : 1,
-		    reserved        : 58;
-	} valid;
-	u64 err_status;
-	u64 requestor_id;
-	u64 responder_id;
-	u64 target_id;
-	u64 bus_spec_data;
-	u8 oem_data[1];			/* Variable length OEM data */
-} sal_log_plat_bus_err_info_t;
-
-/* Overall platform error section structure */
-typedef union sal_log_platform_err_info {
-	sal_log_mem_dev_err_info_t mem_dev_err;
-	sal_log_sel_dev_err_info_t sel_dev_err;
-	sal_log_pci_bus_err_info_t pci_bus_err;
-	sal_log_smbios_dev_err_info_t smbios_dev_err;
-	sal_log_pci_comp_err_info_t pci_comp_err;
-	sal_log_plat_specific_err_info_t plat_specific_err;
-	sal_log_host_ctlr_err_info_t host_ctlr_err;
-	sal_log_plat_bus_err_info_t plat_bus_err;
-} sal_log_platform_err_info_t;
-
-/* SAL log over-all, multi-section error record structure (processor+platform) */
-typedef struct err_rec {
-	sal_log_record_header_t sal_elog_header;
-	sal_log_processor_info_t proc_err;
-	sal_log_platform_err_info_t plat_err;
-	u8 oem_data_pad[1024];
-} ia64_err_rec_t;
-
-/*
- * Now define a couple of inline functions for improved type checking
- * and convenience.
- */
-
-extern s64 ia64_sal_cache_flush (u64 cache_type);
-extern void __init check_sal_cache_flush (void);
-
-/* Initialize all the processor and platform level instruction and data caches */
-static inline s64
-ia64_sal_cache_init (void)
-{
-	struct ia64_sal_retval isrv;
-	SAL_CALL(isrv, SAL_CACHE_INIT, 0, 0, 0, 0, 0, 0, 0);
-	return isrv.status;
-}
-
-/*
- * Clear the processor and platform information logged by SAL with respect to the machine
- * state at the time of MCA's, INITs, CMCs, or CPEs.
- */
-static inline s64
-ia64_sal_clear_state_info (u64 sal_info_type)
-{
-	struct ia64_sal_retval isrv;
-	SAL_CALL_REENTRANT(isrv, SAL_CLEAR_STATE_INFO, sal_info_type, 0,
-	              0, 0, 0, 0, 0);
-	return isrv.status;
-}
-
-
-/* Get the processor and platform information logged by SAL with respect to the machine
- * state at the time of the MCAs, INITs, CMCs, or CPEs.
- */
-static inline u64
-ia64_sal_get_state_info (u64 sal_info_type, u64 *sal_info)
-{
-	struct ia64_sal_retval isrv;
-	SAL_CALL_REENTRANT(isrv, SAL_GET_STATE_INFO, sal_info_type, 0,
-	              sal_info, 0, 0, 0, 0);
-	if (isrv.status)
-		return 0;
-
-	return isrv.v0;
-}
-
-/*
- * Get the maximum size of the information logged by SAL with respect to the machine state
- * at the time of MCAs, INITs, CMCs, or CPEs.
- */
-static inline u64
-ia64_sal_get_state_info_size (u64 sal_info_type)
-{
-	struct ia64_sal_retval isrv;
-	SAL_CALL_REENTRANT(isrv, SAL_GET_STATE_INFO_SIZE, sal_info_type, 0,
-	              0, 0, 0, 0, 0);
-	if (isrv.status)
-		return 0;
-	return isrv.v0;
-}
-
-/*
- * Causes the processor to go into a spin loop within SAL where SAL awaits a wakeup from
- * the monarch processor.  Must not lock, because it will not return on any cpu until the
- * monarch processor sends a wake up.
- */
-static inline s64
-ia64_sal_mc_rendez (void)
-{
-	struct ia64_sal_retval isrv;
-	SAL_CALL_NOLOCK(isrv, SAL_MC_RENDEZ, 0, 0, 0, 0, 0, 0, 0);
-	return isrv.status;
-}
-
-/*
- * Allow the OS to specify the interrupt number to be used by SAL to interrupt OS during
- * the machine check rendezvous sequence as well as the mechanism to wake up the
- * non-monarch processor at the end of machine check processing.
- * Returns the complete ia64_sal_retval because some calls return more than just a status
- * value.
- */
-static inline struct ia64_sal_retval
-ia64_sal_mc_set_params (u64 param_type, u64 i_or_m, u64 i_or_m_val, u64 timeout, u64 rz_always)
-{
-	struct ia64_sal_retval isrv;
-	SAL_CALL(isrv, SAL_MC_SET_PARAMS, param_type, i_or_m, i_or_m_val,
-		 timeout, rz_always, 0, 0);
-	return isrv;
-}
-
-/* Read from PCI configuration space */
-static inline s64
-ia64_sal_pci_config_read (u64 pci_config_addr, int type, u64 size, u64 *value)
-{
-	struct ia64_sal_retval isrv;
-	SAL_CALL(isrv, SAL_PCI_CONFIG_READ, pci_config_addr, size, type, 0, 0, 0, 0);
-	if (value)
-		*value = isrv.v0;
-	return isrv.status;
-}
-
-/* Write to PCI configuration space */
-static inline s64
-ia64_sal_pci_config_write (u64 pci_config_addr, int type, u64 size, u64 value)
-{
-	struct ia64_sal_retval isrv;
-	SAL_CALL(isrv, SAL_PCI_CONFIG_WRITE, pci_config_addr, size, value,
-	         type, 0, 0, 0);
-	return isrv.status;
-}
-
-/*
- * Register physical addresses of locations needed by SAL when SAL procedures are invoked
- * in virtual mode.
- */
-static inline s64
-ia64_sal_register_physical_addr (u64 phys_entry, u64 phys_addr)
-{
-	struct ia64_sal_retval isrv;
-	SAL_CALL(isrv, SAL_REGISTER_PHYSICAL_ADDR, phys_entry, phys_addr,
-	         0, 0, 0, 0, 0);
-	return isrv.status;
-}
-
-/*
- * Register software dependent code locations within SAL. These locations are handlers or
- * entry points where SAL will pass control for the specified event. These event handlers
- * are for the bott rendezvous, MCAs and INIT scenarios.
- */
-static inline s64
-ia64_sal_set_vectors (u64 vector_type,
-		      u64 handler_addr1, u64 gp1, u64 handler_len1,
-		      u64 handler_addr2, u64 gp2, u64 handler_len2)
-{
-	struct ia64_sal_retval isrv;
-	SAL_CALL(isrv, SAL_SET_VECTORS, vector_type,
-			handler_addr1, gp1, handler_len1,
-			handler_addr2, gp2, handler_len2);
-
-	return isrv.status;
-}
-
-/* Update the contents of PAL block in the non-volatile storage device */
-static inline s64
-ia64_sal_update_pal (u64 param_buf, u64 scratch_buf, u64 scratch_buf_size,
-		     u64 *error_code, u64 *scratch_buf_size_needed)
-{
-	struct ia64_sal_retval isrv;
-	SAL_CALL(isrv, SAL_UPDATE_PAL, param_buf, scratch_buf, scratch_buf_size,
-	         0, 0, 0, 0);
-	if (error_code)
-		*error_code = isrv.v0;
-	if (scratch_buf_size_needed)
-		*scratch_buf_size_needed = isrv.v1;
-	return isrv.status;
-}
-
-/* Get physical processor die mapping in the platform. */
-static inline s64
-ia64_sal_physical_id_info(u16 *splid)
-{
-	struct ia64_sal_retval isrv;
-
-	if (sal_revision < SAL_VERSION_CODE(3,2))
-		return -1;
-
-	SAL_CALL(isrv, SAL_PHYSICAL_ID_INFO, 0, 0, 0, 0, 0, 0, 0);
-	if (splid)
-		*splid = isrv.v0;
-	return isrv.status;
-}
-
-extern unsigned long sal_platform_features;
-
-extern int (*salinfo_platform_oemdata)(const u8 *, u8 **, u64 *);
-
-struct sal_ret_values {
-	long r8; long r9; long r10; long r11;
-};
-
-#define IA64_SAL_OEMFUNC_MIN		0x02000000
-#define IA64_SAL_OEMFUNC_MAX		0x03ffffff
-
-extern int ia64_sal_oemcall(struct ia64_sal_retval *, u64, u64, u64, u64, u64,
-			    u64, u64, u64);
-extern int ia64_sal_oemcall_nolock(struct ia64_sal_retval *, u64, u64, u64,
-				   u64, u64, u64, u64, u64);
-extern int ia64_sal_oemcall_reentrant(struct ia64_sal_retval *, u64, u64, u64,
-				      u64, u64, u64, u64, u64);
-extern long
-ia64_sal_freq_base (unsigned long which, unsigned long *ticks_per_second,
-		    unsigned long *drift_info);
-#ifdef CONFIG_HOTPLUG_CPU
-/*
- * System Abstraction Layer Specification
- * Section 3.2.5.1: OS_BOOT_RENDEZ to SAL return State.
- * Note: region regs are stored first in head.S _start. Hence they must
- * stay up front.
- */
-struct sal_to_os_boot {
-	u64 rr[8];		/* Region Registers */
-	u64 br[6];		/* br0:
-				 * return addr into SAL boot rendez routine */
-	u64 gr1;		/* SAL:GP */
-	u64 gr12;		/* SAL:SP */
-	u64 gr13;		/* SAL: Task Pointer */
-	u64 fpsr;
-	u64 pfs;
-	u64 rnat;
-	u64 unat;
-	u64 bspstore;
-	u64 dcr;		/* Default Control Register */
-	u64 iva;
-	u64 pta;
-	u64 itv;
-	u64 pmv;
-	u64 cmcv;
-	u64 lrr[2];
-	u64 gr[4];
-	u64 pr;			/* Predicate registers */
-	u64 lc;			/* Loop Count */
-	struct ia64_fpreg fp[20];
-};
-
-/*
- * Global array allocated for NR_CPUS at boot time
- */
-extern struct sal_to_os_boot sal_boot_rendez_state[NR_CPUS];
-
-extern void ia64_jump_to_sal(struct sal_to_os_boot *);
-#endif
-
-extern void ia64_sal_handler_init(void *entry_point, void *gpval);
-
-#define PALO_MAX_TLB_PURGES	0xFFFF
-#define PALO_SIG	"PALO"
-
-struct palo_table {
-	u8  signature[4];	/* Should be "PALO" */
-	u32 length;
-	u8  minor_revision;
-	u8  major_revision;
-	u8  checksum;
-	u8  reserved1[5];
-	u16 max_tlb_purges;
-	u8  reserved2[6];
-};
-
-#define NPTCG_FROM_PAL			0
-#define NPTCG_FROM_PALO			1
-#define NPTCG_FROM_KERNEL_PARAMETER	2
-
-#endif /* __ASSEMBLY__ */
-
-#endif /* _ASM_IA64_SAL_H */
diff --git a/arch/ia64/include/asm/sections.h b/arch/ia64/include/asm/sections.h
deleted file mode 100644
index 8e0875cf6071..000000000000
--- a/arch/ia64/include/asm/sections.h
+++ /dev/null
@@ -1,33 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _ASM_IA64_SECTIONS_H
-#define _ASM_IA64_SECTIONS_H
-
-/*
- * Copyright (C) 1998-2003 Hewlett-Packard Co
- *	David Mosberger-Tang <davidm@hpl.hp.com>
- */
-
-#include <linux/elf.h>
-#include <linux/uaccess.h>
-
-typedef struct fdesc func_desc_t;
-
-#include <asm-generic/sections.h>
-
-extern char __phys_per_cpu_start[];
-#ifdef	CONFIG_SMP
-extern char __cpu0_per_cpu[];
-#endif
-extern char __start___vtop_patchlist[], __end___vtop_patchlist[];
-extern char __start___rse_patchlist[], __end___rse_patchlist[];
-extern char __start___mckinley_e9_bundles[], __end___mckinley_e9_bundles[];
-extern char __start___phys_stack_reg_patchlist[], __end___phys_stack_reg_patchlist[];
-extern char __start_gate_section[];
-extern char __start_gate_mckinley_e9_patchlist[], __end_gate_mckinley_e9_patchlist[];
-extern char __start_gate_vtop_patchlist[], __end_gate_vtop_patchlist[];
-extern char __start_gate_fsyscall_patchlist[], __end_gate_fsyscall_patchlist[];
-extern char __start_gate_brl_fsys_bubble_down_patchlist[], __end_gate_brl_fsys_bubble_down_patchlist[];
-extern char __start_unwind[], __end_unwind[];
-extern char __start_ivt_text[], __end_ivt_text[];
-
-#endif /* _ASM_IA64_SECTIONS_H */
diff --git a/arch/ia64/include/asm/serial.h b/arch/ia64/include/asm/serial.h
deleted file mode 100644
index 068be11583df..000000000000
--- a/arch/ia64/include/asm/serial.h
+++ /dev/null
@@ -1,17 +0,0 @@
-/*
- * Derived from the i386 version.
- */
-
-/*
- * This assumes you have a 1.8432 MHz clock for your UART.
- *
- * It'd be nice if someone built a serial card with a 24.576 MHz
- * clock, since the 16550A is capable of handling a top speed of 1.5
- * megabits/second; but this requires the faster clock.
- */
-#define BASE_BAUD ( 1843200 / 16 )
-
-/*
- * All legacy serial ports should be enumerated via ACPI namespace, so
- * we need not list them here.
- */
diff --git a/arch/ia64/include/asm/shmparam.h b/arch/ia64/include/asm/shmparam.h
deleted file mode 100644
index 43bd8324ab71..000000000000
--- a/arch/ia64/include/asm/shmparam.h
+++ /dev/null
@@ -1,13 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _ASM_IA64_SHMPARAM_H
-#define _ASM_IA64_SHMPARAM_H
-
-/*
- * SHMLBA controls minimum alignment at which shared memory segments
- * get attached.  The IA-64 architecture says that there may be a
- * performance degradation when there are virtual aliases within 1MB.
- * To reduce the chance of this, we set SHMLBA to 1MB. --davidm 00/12/20
- */
-#define	SHMLBA	(1024*1024)
-
-#endif /* _ASM_IA64_SHMPARAM_H */
diff --git a/arch/ia64/include/asm/signal.h b/arch/ia64/include/asm/signal.h
deleted file mode 100644
index 80f067f9b3ce..000000000000
--- a/arch/ia64/include/asm/signal.h
+++ /dev/null
@@ -1,33 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-/*
- * Modified 1998-2001, 2003
- *	David Mosberger-Tang <davidm@hpl.hp.com>, Hewlett-Packard Co
- *
- * Unfortunately, this file is being included by bits/signal.h in
- * glibc-2.x.  Hence the #ifdef __KERNEL__ ugliness.
- */
-#ifndef _ASM_IA64_SIGNAL_H
-#define _ASM_IA64_SIGNAL_H
-
-#include <uapi/asm/signal.h>
-
-
-#define _NSIG		64
-#define _NSIG_BPW	64
-#define _NSIG_WORDS	(_NSIG / _NSIG_BPW)
-
-# ifndef __ASSEMBLY__
-
-/* Most things should be clean enough to redefine this at will, if care
-   is taken to make libc match.  */
-
-typedef unsigned long old_sigset_t;
-
-typedef struct {
-	unsigned long sig[_NSIG_WORDS];
-} sigset_t;
-
-#  include <asm/sigcontext.h>
-
-# endif /* !__ASSEMBLY__ */
-#endif /* _ASM_IA64_SIGNAL_H */
diff --git a/arch/ia64/include/asm/smp.h b/arch/ia64/include/asm/smp.h
deleted file mode 100644
index aa92234c0142..000000000000
--- a/arch/ia64/include/asm/smp.h
+++ /dev/null
@@ -1,103 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-/*
- * SMP Support
- *
- * Copyright (C) 1999 VA Linux Systems
- * Copyright (C) 1999 Walt Drummond <drummond@valinux.com>
- * (c) Copyright 2001-2003, 2005 Hewlett-Packard Development Company, L.P.
- *	David Mosberger-Tang <davidm@hpl.hp.com>
- *	Bjorn Helgaas <bjorn.helgaas@hp.com>
- */
-#ifndef _ASM_IA64_SMP_H
-#define _ASM_IA64_SMP_H
-
-#include <linux/init.h>
-#include <linux/threads.h>
-#include <linux/kernel.h>
-#include <linux/cpumask.h>
-#include <linux/bitops.h>
-#include <linux/irqreturn.h>
-
-#include <asm/param.h>
-#include <asm/processor.h>
-#include <asm/ptrace.h>
-
-static inline unsigned int
-ia64_get_lid (void)
-{
-	union {
-		struct {
-			unsigned long reserved : 16;
-			unsigned long eid : 8;
-			unsigned long id : 8;
-			unsigned long ignored : 32;
-		} f;
-		unsigned long bits;
-	} lid;
-
-	lid.bits = ia64_getreg(_IA64_REG_CR_LID);
-	return lid.f.id << 8 | lid.f.eid;
-}
-
-#define hard_smp_processor_id()		ia64_get_lid()
-
-#ifdef CONFIG_SMP
-
-#define raw_smp_processor_id() (current_thread_info()->cpu)
-
-extern struct smp_boot_data {
-	int cpu_count;
-	int cpu_phys_id[NR_CPUS];
-} smp_boot_data __initdata;
-
-extern char no_int_routing;
-
-extern cpumask_t cpu_core_map[NR_CPUS];
-DECLARE_PER_CPU_SHARED_ALIGNED(cpumask_t, cpu_sibling_map);
-extern int smp_num_siblings;
-extern void __iomem *ipi_base_addr;
-
-extern volatile int ia64_cpu_to_sapicid[];
-#define cpu_physical_id(i)	ia64_cpu_to_sapicid[i]
-
-extern unsigned long ap_wakeup_vector;
-
-/*
- * Function to map hard smp processor id to logical id.  Slow, so don't use this in
- * performance-critical code.
- */
-static inline int
-cpu_logical_id (int cpuid)
-{
-	int i;
-
-	for (i = 0; i < NR_CPUS; ++i)
-		if (cpu_physical_id(i) == cpuid)
-			break;
-	return i;
-}
-
-/* Upping and downing of CPUs */
-extern int __cpu_disable (void);
-extern void __cpu_die (unsigned int cpu);
-extern void cpu_die (void) __attribute__ ((noreturn));
-extern void __init smp_build_cpu_map(void);
-
-extern void __init init_smp_config (void);
-extern void smp_do_timer (struct pt_regs *regs);
-
-extern irqreturn_t handle_IPI(int irq, void *dev_id);
-extern void smp_send_reschedule (int cpu);
-extern void identify_siblings (struct cpuinfo_ia64 *);
-extern int is_multithreading_enabled(void);
-
-extern void arch_send_call_function_single_ipi(int cpu);
-extern void arch_send_call_function_ipi_mask(const struct cpumask *mask);
-
-#else /* CONFIG_SMP */
-
-#define cpu_logical_id(i)		0
-#define cpu_physical_id(i)		ia64_get_lid()
-
-#endif /* CONFIG_SMP */
-#endif /* _ASM_IA64_SMP_H */
diff --git a/arch/ia64/include/asm/sn/intr.h b/arch/ia64/include/asm/sn/intr.h
deleted file mode 100644
index 3885a77b21df..000000000000
--- a/arch/ia64/include/asm/sn/intr.h
+++ /dev/null
@@ -1,15 +0,0 @@
-/*
- * This file is subject to the terms and conditions of the GNU General Public
- * License.  See the file "COPYING" in the main directory of this archive
- * for more details.
- *
- * Copyright (C) 1992 - 1997, 2000-2006 Silicon Graphics, Inc. All rights reserved.
- */
-
-#ifndef _ASM_IA64_SN_INTR_H
-#define _ASM_IA64_SN_INTR_H
-
-#define SGI_XPC_ACTIVATE	0x30
-#define SGI_XPC_NOTIFY		0xe7
-
-#endif /* _ASM_IA64_SN_INTR_H */
diff --git a/arch/ia64/include/asm/sn/sn_sal.h b/arch/ia64/include/asm/sn/sn_sal.h
deleted file mode 100644
index d437aa43343b..000000000000
--- a/arch/ia64/include/asm/sn/sn_sal.h
+++ /dev/null
@@ -1,124 +0,0 @@
-#ifndef _ASM_IA64_SN_SN_SAL_H
-#define _ASM_IA64_SN_SN_SAL_H
-
-/*
- * System Abstraction Layer definitions for IA64
- *
- * This file is subject to the terms and conditions of the GNU General Public
- * License.  See the file "COPYING" in the main directory of this archive
- * for more details.
- *
- * Copyright (c) 2000-2006 Silicon Graphics, Inc.  All rights reserved.
- */
-
-#include <linux/types.h>
-#include <asm/sal.h>
-
-// SGI Specific Calls
-#define  SN_SAL_GET_PARTITION_ADDR		   0x02000009
-#define  SN_SAL_MEMPROTECT                         0x0200003e
-
-#define  SN_SAL_WATCHLIST_ALLOC			   0x02000070
-#define  SN_SAL_WATCHLIST_FREE			   0x02000071
-
-/*
- * SAL Error Codes
- */
-#define SALRET_MORE_PASSES	1
-#define SALRET_OK		0
-#define SALRET_NOT_IMPLEMENTED	(-1)
-#define SALRET_INVALID_ARG	(-2)
-#define SALRET_ERROR		(-3)
-
-/*
- * Returns the physical address of the partition's reserved page through
- * an iterative number of calls.
- *
- * On first call, 'cookie' and 'len' should be set to 0, and 'addr'
- * set to the nasid of the partition whose reserved page's address is
- * being sought.
- * On subsequent calls, pass the values, that were passed back on the
- * previous call.
- *
- * While the return status equals SALRET_MORE_PASSES, keep calling
- * this function after first copying 'len' bytes starting at 'addr'
- * into 'buf'. Once the return status equals SALRET_OK, 'addr' will
- * be the physical address of the partition's reserved page. If the
- * return status equals neither of these, an error as occurred.
- */
-static inline s64
-sn_partition_reserved_page_pa(u64 buf, u64 *cookie, u64 *addr, u64 *len)
-{
-	struct ia64_sal_retval rv;
-	ia64_sal_oemcall_reentrant(&rv, SN_SAL_GET_PARTITION_ADDR, *cookie,
-				   *addr, buf, *len, 0, 0, 0);
-	*cookie = rv.v0;
-	*addr = rv.v1;
-	*len = rv.v2;
-	return rv.status;
-}
-
-/*
- * Change memory access protections for a physical address range.
- * nasid_array is not used on Altix, but may be in future architectures.
- * Available memory protection access classes are defined after the function.
- */
-static inline int
-sn_change_memprotect(u64 paddr, u64 len, u64 perms, u64 *nasid_array)
-{
-	struct ia64_sal_retval ret_stuff;
-
-	ia64_sal_oemcall_nolock(&ret_stuff, SN_SAL_MEMPROTECT, paddr, len,
-				(u64)nasid_array, perms, 0, 0, 0);
-	return ret_stuff.status;
-}
-#define SN_MEMPROT_ACCESS_CLASS_0		0x14a080
-#define SN_MEMPROT_ACCESS_CLASS_1		0x2520c2
-#define SN_MEMPROT_ACCESS_CLASS_2		0x14a1ca
-#define SN_MEMPROT_ACCESS_CLASS_3		0x14a290
-#define SN_MEMPROT_ACCESS_CLASS_6		0x084080
-#define SN_MEMPROT_ACCESS_CLASS_7		0x021080
-
-union sn_watchlist_u {
-	u64     val;
-	struct {
-		u64	blade	: 16,
-			size	: 32,
-			filler	: 16;
-	};
-};
-
-static inline int
-sn_mq_watchlist_alloc(int blade, void *mq, unsigned int mq_size,
-				unsigned long *intr_mmr_offset)
-{
-	struct ia64_sal_retval rv;
-	unsigned long addr;
-	union sn_watchlist_u size_blade;
-	int watchlist;
-
-	addr = (unsigned long)mq;
-	size_blade.size = mq_size;
-	size_blade.blade = blade;
-
-	/*
-	 * bios returns watchlist number or negative error number.
-	 */
-	ia64_sal_oemcall_nolock(&rv, SN_SAL_WATCHLIST_ALLOC, addr,
-			size_blade.val, (u64)intr_mmr_offset,
-			(u64)&watchlist, 0, 0, 0);
-	if (rv.status < 0)
-		return rv.status;
-
-	return watchlist;
-}
-
-static inline int
-sn_mq_watchlist_free(int blade, int watchlist_num)
-{
-	struct ia64_sal_retval rv;
-	ia64_sal_oemcall_nolock(&rv, SN_SAL_WATCHLIST_FREE, blade,
-			watchlist_num, 0, 0, 0, 0, 0);
-	return rv.status;
-}
-#endif /* _ASM_IA64_SN_SN_SAL_H */
diff --git a/arch/ia64/include/asm/sparsemem.h b/arch/ia64/include/asm/sparsemem.h
deleted file mode 100644
index a58f8b466d96..000000000000
--- a/arch/ia64/include/asm/sparsemem.h
+++ /dev/null
@@ -1,28 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _ASM_IA64_SPARSEMEM_H
-#define _ASM_IA64_SPARSEMEM_H
-
-#ifdef CONFIG_SPARSEMEM
-#include <asm/page.h>
-/*
- * SECTION_SIZE_BITS            2^N: how big each section will be
- * MAX_PHYSMEM_BITS             2^N: how much memory we can have in that space
- */
-
-#define SECTION_SIZE_BITS	(30)
-#define MAX_PHYSMEM_BITS	(50)
-#ifdef CONFIG_ARCH_FORCE_MAX_ORDER
-#if (CONFIG_ARCH_FORCE_MAX_ORDER + PAGE_SHIFT > SECTION_SIZE_BITS)
-#undef SECTION_SIZE_BITS
-#define SECTION_SIZE_BITS (CONFIG_ARCH_FORCE_MAX_ORDER + PAGE_SHIFT)
-#endif
-#endif
-
-#endif /* CONFIG_SPARSEMEM */
-
-#ifdef CONFIG_MEMORY_HOTPLUG
-int memory_add_physaddr_to_nid(u64 addr);
-#define memory_add_physaddr_to_nid memory_add_physaddr_to_nid
-#endif
-
-#endif /* _ASM_IA64_SPARSEMEM_H */
diff --git a/arch/ia64/include/asm/spinlock.h b/arch/ia64/include/asm/spinlock.h
deleted file mode 100644
index 0e5c1ad3239c..000000000000
--- a/arch/ia64/include/asm/spinlock.h
+++ /dev/null
@@ -1,265 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _ASM_IA64_SPINLOCK_H
-#define _ASM_IA64_SPINLOCK_H
-
-/*
- * Copyright (C) 1998-2003 Hewlett-Packard Co
- *	David Mosberger-Tang <davidm@hpl.hp.com>
- * Copyright (C) 1999 Walt Drummond <drummond@valinux.com>
- *
- * This file is used for SMP configurations only.
- */
-
-#include <linux/compiler.h>
-#include <linux/kernel.h>
-#include <linux/bitops.h>
-
-#include <linux/atomic.h>
-#include <asm/intrinsics.h>
-#include <asm/barrier.h>
-#include <asm/processor.h>
-
-#define arch_spin_lock_init(x)			((x)->lock = 0)
-
-/*
- * Ticket locks are conceptually two parts, one indicating the current head of
- * the queue, and the other indicating the current tail. The lock is acquired
- * by atomically noting the tail and incrementing it by one (thus adding
- * ourself to the queue and noting our position), then waiting until the head
- * becomes equal to the initial value of the tail.
- * The pad bits in the middle are used to prevent the next_ticket number
- * overflowing into the now_serving number.
- *
- *   31             17  16    15  14                    0
- *  +----------------------------------------------------+
- *  |  now_serving     | padding |   next_ticket         |
- *  +----------------------------------------------------+
- */
-
-#define TICKET_SHIFT	17
-#define TICKET_BITS	15
-#define	TICKET_MASK	((1 << TICKET_BITS) - 1)
-
-static __always_inline void __ticket_spin_lock(arch_spinlock_t *lock)
-{
-	int	*p = (int *)&lock->lock, ticket, serve;
-
-	ticket = ia64_fetchadd(1, p, acq);
-
-	if (!(((ticket >> TICKET_SHIFT) ^ ticket) & TICKET_MASK))
-		return;
-
-	ia64_invala();
-
-	for (;;) {
-		asm volatile ("ld4.c.nc %0=[%1]" : "=r"(serve) : "r"(p) : "memory");
-
-		if (!(((serve >> TICKET_SHIFT) ^ ticket) & TICKET_MASK))
-			return;
-		cpu_relax();
-	}
-}
-
-static __always_inline int __ticket_spin_trylock(arch_spinlock_t *lock)
-{
-	int tmp = READ_ONCE(lock->lock);
-
-	if (!(((tmp >> TICKET_SHIFT) ^ tmp) & TICKET_MASK))
-		return ia64_cmpxchg(acq, &lock->lock, tmp, tmp + 1, sizeof (tmp)) == tmp;
-	return 0;
-}
-
-static __always_inline void __ticket_spin_unlock(arch_spinlock_t *lock)
-{
-	unsigned short	*p = (unsigned short *)&lock->lock + 1, tmp;
-
-	/* This could be optimised with ARCH_HAS_MMIOWB */
-	mmiowb();
-	asm volatile ("ld2.bias %0=[%1]" : "=r"(tmp) : "r"(p));
-	WRITE_ONCE(*p, (tmp + 2) & ~1);
-}
-
-static inline int __ticket_spin_is_locked(arch_spinlock_t *lock)
-{
-	long tmp = READ_ONCE(lock->lock);
-
-	return !!(((tmp >> TICKET_SHIFT) ^ tmp) & TICKET_MASK);
-}
-
-static inline int __ticket_spin_is_contended(arch_spinlock_t *lock)
-{
-	long tmp = READ_ONCE(lock->lock);
-
-	return ((tmp - (tmp >> TICKET_SHIFT)) & TICKET_MASK) > 1;
-}
-
-static __always_inline int arch_spin_value_unlocked(arch_spinlock_t lock)
-{
-	return !(((lock.lock >> TICKET_SHIFT) ^ lock.lock) & TICKET_MASK);
-}
-
-static inline int arch_spin_is_locked(arch_spinlock_t *lock)
-{
-	return __ticket_spin_is_locked(lock);
-}
-
-static inline int arch_spin_is_contended(arch_spinlock_t *lock)
-{
-	return __ticket_spin_is_contended(lock);
-}
-#define arch_spin_is_contended	arch_spin_is_contended
-
-static __always_inline void arch_spin_lock(arch_spinlock_t *lock)
-{
-	__ticket_spin_lock(lock);
-}
-
-static __always_inline int arch_spin_trylock(arch_spinlock_t *lock)
-{
-	return __ticket_spin_trylock(lock);
-}
-
-static __always_inline void arch_spin_unlock(arch_spinlock_t *lock)
-{
-	__ticket_spin_unlock(lock);
-}
-
-#ifdef ASM_SUPPORTED
-
-static __always_inline void
-arch_read_lock(arch_rwlock_t *lock)
-{
-	unsigned long flags = 0;
-
-	__asm__ __volatile__ (
-		"tbit.nz p6, p0 = %1,%2\n"
-		"br.few 3f\n"
-		"1:\n"
-		"fetchadd4.rel r2 = [%0], -1;;\n"
-		"(p6) ssm psr.i\n"
-		"2:\n"
-		"hint @pause\n"
-		"ld4 r2 = [%0];;\n"
-		"cmp4.lt p7,p0 = r2, r0\n"
-		"(p7) br.cond.spnt.few 2b\n"
-		"(p6) rsm psr.i\n"
-		";;\n"
-		"3:\n"
-		"fetchadd4.acq r2 = [%0], 1;;\n"
-		"cmp4.lt p7,p0 = r2, r0\n"
-		"(p7) br.cond.spnt.few 1b\n"
-		: : "r"(lock), "r"(flags), "i"(IA64_PSR_I_BIT)
-		: "p6", "p7", "r2", "memory");
-}
-
-#else /* !ASM_SUPPORTED */
-
-#define arch_read_lock(rw)								\
-do {											\
-	arch_rwlock_t *__read_lock_ptr = (rw);						\
-											\
-	while (unlikely(ia64_fetchadd(1, (int *) __read_lock_ptr, acq) < 0)) {		\
-		ia64_fetchadd(-1, (int *) __read_lock_ptr, rel);			\
-		while (*(volatile int *)__read_lock_ptr < 0)				\
-			cpu_relax();							\
-	}										\
-} while (0)
-
-#endif /* !ASM_SUPPORTED */
-
-#define arch_read_unlock(rw)					\
-do {								\
-	arch_rwlock_t *__read_lock_ptr = (rw);			\
-	ia64_fetchadd(-1, (int *) __read_lock_ptr, rel);	\
-} while (0)
-
-#ifdef ASM_SUPPORTED
-
-static __always_inline void
-arch_write_lock(arch_rwlock_t *lock)
-{
-	unsigned long flags = 0;
-
-	__asm__ __volatile__ (
-		"tbit.nz p6, p0 = %1, %2\n"
-		"mov ar.ccv = r0\n"
-		"dep r29 = -1, r0, 31, 1\n"
-		"br.few 3f;;\n"
-		"1:\n"
-		"(p6) ssm psr.i\n"
-		"2:\n"
-		"hint @pause\n"
-		"ld4 r2 = [%0];;\n"
-		"cmp4.eq p0,p7 = r0, r2\n"
-		"(p7) br.cond.spnt.few 2b\n"
-		"(p6) rsm psr.i\n"
-		";;\n"
-		"3:\n"
-		"cmpxchg4.acq r2 = [%0], r29, ar.ccv;;\n"
-		"cmp4.eq p0,p7 = r0, r2\n"
-		"(p7) br.cond.spnt.few 1b;;\n"
-		: : "r"(lock), "r"(flags), "i"(IA64_PSR_I_BIT)
-		: "ar.ccv", "p6", "p7", "r2", "r29", "memory");
-}
-
-#define arch_write_trylock(rw)							\
-({										\
-	register long result;							\
-										\
-	__asm__ __volatile__ (							\
-		"mov ar.ccv = r0\n"						\
-		"dep r29 = -1, r0, 31, 1;;\n"					\
-		"cmpxchg4.acq %0 = [%1], r29, ar.ccv\n"				\
-		: "=r"(result) : "r"(rw) : "ar.ccv", "r29", "memory");		\
-	(result == 0);								\
-})
-
-static inline void arch_write_unlock(arch_rwlock_t *x)
-{
-	u8 *y = (u8 *)x;
-	barrier();
-	asm volatile ("st1.rel.nta [%0] = r0\n\t" :: "r"(y+3) : "memory" );
-}
-
-#else /* !ASM_SUPPORTED */
-
-#define arch_write_lock(l)								\
-({											\
-	__u64 ia64_val, ia64_set_val = ia64_dep_mi(-1, 0, 31, 1);			\
-	__u32 *ia64_write_lock_ptr = (__u32 *) (l);					\
-	do {										\
-		while (*ia64_write_lock_ptr)						\
-			ia64_barrier();							\
-		ia64_val = ia64_cmpxchg4_acq(ia64_write_lock_ptr, ia64_set_val, 0);	\
-	} while (ia64_val);								\
-})
-
-#define arch_write_trylock(rw)						\
-({									\
-	__u64 ia64_val;							\
-	__u64 ia64_set_val = ia64_dep_mi(-1, 0, 31,1);			\
-	ia64_val = ia64_cmpxchg4_acq((__u32 *)(rw), ia64_set_val, 0);	\
-	(ia64_val == 0);						\
-})
-
-static inline void arch_write_unlock(arch_rwlock_t *x)
-{
-	barrier();
-	x->write_lock = 0;
-}
-
-#endif /* !ASM_SUPPORTED */
-
-static inline int arch_read_trylock(arch_rwlock_t *x)
-{
-	union {
-		arch_rwlock_t lock;
-		__u32 word;
-	} old, new;
-	old.lock = new.lock = *x;
-	old.lock.write_lock = new.lock.write_lock = 0;
-	++new.lock.read_counter;
-	return (u32)ia64_cmpxchg4_acq((__u32 *)(x), new.word, old.word) == old.word;
-}
-
-#endif /*  _ASM_IA64_SPINLOCK_H */
diff --git a/arch/ia64/include/asm/spinlock_types.h b/arch/ia64/include/asm/spinlock_types.h
deleted file mode 100644
index 14b8a161c165..000000000000
--- a/arch/ia64/include/asm/spinlock_types.h
+++ /dev/null
@@ -1,22 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _ASM_IA64_SPINLOCK_TYPES_H
-#define _ASM_IA64_SPINLOCK_TYPES_H
-
-#ifndef __LINUX_SPINLOCK_TYPES_RAW_H
-# error "please don't include this file directly"
-#endif
-
-typedef struct {
-	volatile unsigned int lock;
-} arch_spinlock_t;
-
-#define __ARCH_SPIN_LOCK_UNLOCKED	{ 0 }
-
-typedef struct {
-	volatile unsigned int read_counter	: 31;
-	volatile unsigned int write_lock	:  1;
-} arch_rwlock_t;
-
-#define __ARCH_RW_LOCK_UNLOCKED		{ 0, 0 }
-
-#endif
diff --git a/arch/ia64/include/asm/string.h b/arch/ia64/include/asm/string.h
deleted file mode 100644
index 8b84df0dbfad..000000000000
--- a/arch/ia64/include/asm/string.h
+++ /dev/null
@@ -1,22 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _ASM_IA64_STRING_H
-#define _ASM_IA64_STRING_H
-
-/*
- * Here is where we want to put optimized versions of the string
- * routines.
- *
- * Copyright (C) 1998-2000, 2002 Hewlett-Packard Co
- *	David Mosberger-Tang <davidm@hpl.hp.com>
- */
-
-
-#define __HAVE_ARCH_STRLEN	1 /* see arch/ia64/lib/strlen.S */
-#define __HAVE_ARCH_MEMSET	1 /* see arch/ia64/lib/memset.S */
-#define __HAVE_ARCH_MEMCPY	1 /* see arch/ia64/lib/memcpy.S */
-
-extern __kernel_size_t strlen (const char *);
-extern void *memcpy (void *, const void *, __kernel_size_t);
-extern void *memset (void *, int, __kernel_size_t);
-
-#endif /* _ASM_IA64_STRING_H */
diff --git a/arch/ia64/include/asm/switch_to.h b/arch/ia64/include/asm/switch_to.h
deleted file mode 100644
index a5a4e09468fa..000000000000
--- a/arch/ia64/include/asm/switch_to.h
+++ /dev/null
@@ -1,71 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-/*
- * Low-level task switching. This is based on information published in
- * the Processor Abstraction Layer and the System Abstraction Layer
- * manual.
- *
- * Copyright (C) 1998-2003 Hewlett-Packard Co
- *	David Mosberger-Tang <davidm@hpl.hp.com>
- * Copyright (C) 1999 Asit Mallick <asit.k.mallick@intel.com>
- * Copyright (C) 1999 Don Dugger <don.dugger@intel.com>
- */
-#ifndef _ASM_IA64_SWITCH_TO_H
-#define _ASM_IA64_SWITCH_TO_H
-
-#include <linux/percpu.h>
-
-struct task_struct;
-
-/*
- * Context switch from one thread to another.  If the two threads have
- * different address spaces, schedule() has already taken care of
- * switching to the new address space by calling switch_mm().
- *
- * Disabling access to the fph partition and the debug-register
- * context switch MUST be done before calling ia64_switch_to() since a
- * newly created thread returns directly to
- * ia64_ret_from_syscall_clear_r8.
- */
-extern struct task_struct *ia64_switch_to (void *next_task);
-
-extern void ia64_save_extra (struct task_struct *task);
-extern void ia64_load_extra (struct task_struct *task);
-
-#define IA64_HAS_EXTRA_STATE(t)							\
-	((t)->thread.flags & (IA64_THREAD_DBG_VALID|IA64_THREAD_PM_VALID))
-
-#define __switch_to(prev,next,last) do {							 \
-	if (IA64_HAS_EXTRA_STATE(prev))								 \
-		ia64_save_extra(prev);								 \
-	if (IA64_HAS_EXTRA_STATE(next))								 \
-		ia64_load_extra(next);								 \
-	ia64_psr(task_pt_regs(next))->dfh = !ia64_is_local_fpu_owner(next);			 \
-	(last) = ia64_switch_to((next));							 \
-} while (0)
-
-#ifdef CONFIG_SMP
-/*
- * In the SMP case, we save the fph state when context-switching away from a thread that
- * modified fph.  This way, when the thread gets scheduled on another CPU, the CPU can
- * pick up the state from task->thread.fph, avoiding the complication of having to fetch
- * the latest fph state from another CPU.  In other words: eager save, lazy restore.
- */
-# define switch_to(prev,next,last) do {						\
-	if (ia64_psr(task_pt_regs(prev))->mfh && ia64_is_local_fpu_owner(prev)) {				\
-		ia64_psr(task_pt_regs(prev))->mfh = 0;			\
-		(prev)->thread.flags |= IA64_THREAD_FPH_VALID;			\
-		__ia64_save_fpu((prev)->thread.fph);				\
-	}									\
-	__switch_to(prev, next, last);						\
-	/* "next" in old context is "current" in new context */			\
-	if (unlikely((current->thread.flags & IA64_THREAD_MIGRATION) &&	       \
-		     (task_cpu(current) !=				       \
-		      		      task_thread_info(current)->last_cpu))) { \
-		task_thread_info(current)->last_cpu = task_cpu(current);       \
-	}								       \
-} while (0)
-#else
-# define switch_to(prev,next,last)	__switch_to(prev, next, last)
-#endif
-
-#endif /* _ASM_IA64_SWITCH_TO_H */
diff --git a/arch/ia64/include/asm/syscall.h b/arch/ia64/include/asm/syscall.h
deleted file mode 100644
index 2b02a3fb862a..000000000000
--- a/arch/ia64/include/asm/syscall.h
+++ /dev/null
@@ -1,65 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-only */
-/*
- * Access to user system call parameters and results
- *
- * Copyright (C) 2008 Intel Corp.  Shaohua Li <shaohua.li@intel.com>
- *
- * See asm-generic/syscall.h for descriptions of what we must do here.
- */
-
-#ifndef _ASM_SYSCALL_H
-#define _ASM_SYSCALL_H	1
-
-#include <uapi/linux/audit.h>
-#include <linux/sched.h>
-#include <linux/err.h>
-
-static inline long syscall_get_nr(struct task_struct *task,
-				  struct pt_regs *regs)
-{
-	if ((long)regs->cr_ifs < 0) /* Not a syscall */
-		return -1;
-
-	return regs->r15;
-}
-
-static inline void syscall_rollback(struct task_struct *task,
-				    struct pt_regs *regs)
-{
-	/* do nothing */
-}
-
-static inline long syscall_get_error(struct task_struct *task,
-				     struct pt_regs *regs)
-{
-	return regs->r10 == -1 ? -regs->r8:0;
-}
-
-static inline long syscall_get_return_value(struct task_struct *task,
-					    struct pt_regs *regs)
-{
-	return regs->r8;
-}
-
-static inline void syscall_set_return_value(struct task_struct *task,
-					    struct pt_regs *regs,
-					    int error, long val)
-{
-	if (error) {
-		/* error < 0, but ia64 uses > 0 return value */
-		regs->r8 = -error;
-		regs->r10 = -1;
-	} else {
-		regs->r8 = val;
-		regs->r10 = 0;
-	}
-}
-
-extern void syscall_get_arguments(struct task_struct *task,
-	struct pt_regs *regs, unsigned long *args);
-
-static inline int syscall_get_arch(struct task_struct *task)
-{
-	return AUDIT_ARCH_IA64;
-}
-#endif	/* _ASM_SYSCALL_H */
diff --git a/arch/ia64/include/asm/thread_info.h b/arch/ia64/include/asm/thread_info.h
deleted file mode 100644
index 21b257117e0a..000000000000
--- a/arch/ia64/include/asm/thread_info.h
+++ /dev/null
@@ -1,131 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-/*
- * Copyright (C) 2002-2003 Hewlett-Packard Co
- *	David Mosberger-Tang <davidm@hpl.hp.com>
- */
-#ifndef _ASM_IA64_THREAD_INFO_H
-#define _ASM_IA64_THREAD_INFO_H
-
-#ifndef ASM_OFFSETS_C
-#include <asm/asm-offsets.h>
-#endif
-#include <asm/processor.h>
-#include <asm/ptrace.h>
-
-#define THREAD_SIZE			KERNEL_STACK_SIZE
-
-#ifndef __ASSEMBLY__
-
-/*
- * On IA-64, we want to keep the task structure and kernel stack together, so they can be
- * mapped by a single TLB entry and so they can be addressed by the "current" pointer
- * without having to do pointer masking.
- */
-struct thread_info {
-	struct task_struct *task;	/* XXX not really needed, except for dup_task_struct() */
-	__u32 flags;			/* thread_info flags (see TIF_*) */
-	__u32 cpu;			/* current CPU */
-	__u32 last_cpu;			/* Last CPU thread ran on */
-	__u32 status;			/* Thread synchronous flags */
-	int preempt_count;		/* 0=premptable, <0=BUG; will also serve as bh-counter */
-#ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
-	__u64 utime;
-	__u64 stime;
-	__u64 gtime;
-	__u64 hardirq_time;
-	__u64 softirq_time;
-	__u64 idle_time;
-	__u64 ac_stamp;
-	__u64 ac_leave;
-	__u64 ac_stime;
-	__u64 ac_utime;
-#endif
-};
-
-#define INIT_THREAD_INFO(tsk)			\
-{						\
-	.task		= &tsk,			\
-	.flags		= 0,			\
-	.cpu		= 0,			\
-	.preempt_count	= INIT_PREEMPT_COUNT,	\
-}
-
-#ifndef ASM_OFFSETS_C
-/* how to get the thread information struct from C */
-#define current_thread_info()	((struct thread_info *) ((char *) current + IA64_TASK_SIZE))
-#define arch_alloc_thread_stack_node(tsk, node)	\
-		((unsigned long *) ((char *) (tsk) + IA64_TASK_SIZE))
-#define task_thread_info(tsk)	((struct thread_info *) ((char *) (tsk) + IA64_TASK_SIZE))
-#else
-#define current_thread_info()	((struct thread_info *) 0)
-#define arch_alloc_thread_stack_node(tsk, node)	((unsigned long *) 0)
-#define task_thread_info(tsk)	((struct thread_info *) 0)
-#endif
-#define arch_free_thread_stack(tsk)	/* nothing */
-#define task_stack_page(tsk)	((void *)(tsk))
-
-#define __HAVE_THREAD_FUNCTIONS
-#ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
-#define setup_thread_stack(p, org)			\
-	*task_thread_info(p) = *task_thread_info(org);	\
-	task_thread_info(p)->ac_stime = 0;		\
-	task_thread_info(p)->ac_utime = 0;		\
-	task_thread_info(p)->task = (p);
-#else
-#define setup_thread_stack(p, org) \
-	*task_thread_info(p) = *task_thread_info(org); \
-	task_thread_info(p)->task = (p);
-#endif
-#define end_of_stack(p) (unsigned long *)((void *)(p) + IA64_RBS_OFFSET)
-
-#define alloc_task_struct_node(node)						\
-({										\
-	struct page *page = alloc_pages_node(node, GFP_KERNEL | __GFP_COMP,	\
-					     KERNEL_STACK_SIZE_ORDER);		\
-	struct task_struct *ret = page ? page_address(page) : NULL;		\
-										\
-	ret;									\
-})
-#define free_task_struct(tsk)	free_pages((unsigned long) (tsk), KERNEL_STACK_SIZE_ORDER)
-
-#endif /* !__ASSEMBLY */
-
-/*
- * thread information flags
- * - these are process state flags that various assembly files may need to access
- * - pending work-to-be-done flags are in least-significant 16 bits, other flags
- *   in top 16 bits
- */
-#define TIF_SIGPENDING		0	/* signal pending */
-#define TIF_NEED_RESCHED	1	/* rescheduling necessary */
-#define TIF_SYSCALL_TRACE	2	/* syscall trace active */
-#define TIF_SYSCALL_AUDIT	3	/* syscall auditing active */
-#define TIF_SINGLESTEP		4	/* restore singlestep on return to user mode */
-#define TIF_NOTIFY_SIGNAL	5	/* signal notification exist */
-#define TIF_NOTIFY_RESUME	6	/* resumption notification requested */
-#define TIF_MEMDIE		17	/* is terminating due to OOM killer */
-#define TIF_MCA_INIT		18	/* this task is processing MCA or INIT */
-#define TIF_DB_DISABLED		19	/* debug trap disabled for fsyscall */
-#define TIF_RESTORE_RSE		21	/* user RBS is newer than kernel RBS */
-#define TIF_POLLING_NRFLAG	22	/* idle is polling for TIF_NEED_RESCHED */
-
-#define _TIF_SYSCALL_TRACE	(1 << TIF_SYSCALL_TRACE)
-#define _TIF_SYSCALL_AUDIT	(1 << TIF_SYSCALL_AUDIT)
-#define _TIF_SINGLESTEP		(1 << TIF_SINGLESTEP)
-#define _TIF_SYSCALL_TRACEAUDIT	(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SINGLESTEP)
-#define _TIF_NOTIFY_RESUME	(1 << TIF_NOTIFY_RESUME)
-#define _TIF_NOTIFY_SIGNAL	(1 << TIF_NOTIFY_SIGNAL)
-#define _TIF_SIGPENDING		(1 << TIF_SIGPENDING)
-#define _TIF_NEED_RESCHED	(1 << TIF_NEED_RESCHED)
-#define _TIF_MCA_INIT		(1 << TIF_MCA_INIT)
-#define _TIF_DB_DISABLED	(1 << TIF_DB_DISABLED)
-#define _TIF_RESTORE_RSE	(1 << TIF_RESTORE_RSE)
-#define _TIF_POLLING_NRFLAG	(1 << TIF_POLLING_NRFLAG)
-
-/* "work to do on user-return" bits */
-#define TIF_ALLWORK_MASK	(_TIF_SIGPENDING|_TIF_NOTIFY_RESUME|_TIF_SYSCALL_AUDIT|\
-				 _TIF_NEED_RESCHED|_TIF_SYSCALL_TRACE|_TIF_NOTIFY_SIGNAL)
-/* like TIF_ALLWORK_BITS but sans TIF_SYSCALL_TRACE or TIF_SYSCALL_AUDIT */
-#define TIF_WORK_MASK		(TIF_ALLWORK_MASK&~(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT))
-
-#endif /* _ASM_IA64_THREAD_INFO_H */
diff --git a/arch/ia64/include/asm/timex.h b/arch/ia64/include/asm/timex.h
deleted file mode 100644
index 7ccc077a60be..000000000000
--- a/arch/ia64/include/asm/timex.h
+++ /dev/null
@@ -1,47 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _ASM_IA64_TIMEX_H
-#define _ASM_IA64_TIMEX_H
-
-/*
- * Copyright (C) 1998-2001, 2003 Hewlett-Packard Co
- *	David Mosberger-Tang <davidm@hpl.hp.com>
- */
-/*
- * 2001/01/18 davidm	Removed CLOCK_TICK_RATE.  It makes no sense on IA-64.
- *			Also removed cacheflush_time as it's entirely unused.
- */
-
-#include <asm/intrinsics.h>
-#include <asm/processor.h>
-
-typedef unsigned long cycles_t;
-
-extern void (*ia64_udelay)(unsigned long usecs);
-
-/*
- * For performance reasons, we don't want to define CLOCK_TICK_TRATE as
- * local_cpu_data->itc_rate.  Fortunately, we don't have to, either: according to George
- * Anzinger, 1/CLOCK_TICK_RATE is taken as the resolution of the timer clock.  The time
- * calculation assumes that you will use enough of these so that your tick size <= 1/HZ.
- * If the calculation shows that your CLOCK_TICK_RATE can not supply exactly 1/HZ ticks,
- * the actual value is calculated and used to update the wall clock each jiffie.  Setting
- * the CLOCK_TICK_RATE to x*HZ insures that the calculation will find no errors.  Hence we
- * pick a multiple of HZ which gives us a (totally virtual) CLOCK_TICK_RATE of about
- * 100MHz.
- */
-#define CLOCK_TICK_RATE		(HZ * 100000UL)
-
-static inline cycles_t
-get_cycles (void)
-{
-	cycles_t ret;
-
-	ret = ia64_getreg(_IA64_REG_AR_ITC);
-	return ret;
-}
-#define get_cycles get_cycles
-
-extern void ia64_cpu_local_tick (void);
-extern unsigned long long ia64_native_sched_clock (void);
-
-#endif /* _ASM_IA64_TIMEX_H */
diff --git a/arch/ia64/include/asm/tlb.h b/arch/ia64/include/asm/tlb.h
deleted file mode 100644
index a15fe0809aae..000000000000
--- a/arch/ia64/include/asm/tlb.h
+++ /dev/null
@@ -1,50 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _ASM_IA64_TLB_H
-#define _ASM_IA64_TLB_H
-/*
- * Based on <asm-generic/tlb.h>.
- *
- * Copyright (C) 2002-2003 Hewlett-Packard Co
- *	David Mosberger-Tang <davidm@hpl.hp.com>
- */
-/*
- * Removing a translation from a page table (including TLB-shootdown) is a four-step
- * procedure:
- *
- *	(1) Flush (virtual) caches --- ensures virtual memory is coherent with kernel memory
- *	    (this is a no-op on ia64).
- *	(2) Clear the relevant portions of the page-table
- *	(3) Flush the TLBs --- ensures that stale content is gone from CPU TLBs
- *	(4) Release the pages that were freed up in step (2).
- *
- * Note that the ordering of these steps is crucial to avoid races on MP machines.
- *
- * The Linux kernel defines several platform-specific hooks for TLB-shootdown.  When
- * unmapping a portion of the virtual address space, these hooks are called according to
- * the following template:
- *
- *	tlb <- tlb_gather_mmu(mm);			// start unmap for address space MM
- *	{
- *	  for each vma that needs a shootdown do {
- *	    tlb_start_vma(tlb, vma);
- *	      for each page-table-entry PTE that needs to be removed do {
- *		tlb_remove_tlb_entry(tlb, pte, address);
- *		if (pte refers to a normal page) {
- *		  tlb_remove_page(tlb, page);
- *		}
- *	      }
- *	    tlb_end_vma(tlb, vma);
- *	  }
- *	}
- *	tlb_finish_mmu(tlb);				// finish unmap for address space MM
- */
-#include <linux/mm.h>
-#include <linux/pagemap.h>
-#include <linux/swap.h>
-
-#include <asm/processor.h>
-#include <asm/tlbflush.h>
-
-#include <asm-generic/tlb.h>
-
-#endif /* _ASM_IA64_TLB_H */
diff --git a/arch/ia64/include/asm/tlbflush.h b/arch/ia64/include/asm/tlbflush.h
deleted file mode 100644
index ceac10c4d6e2..000000000000
--- a/arch/ia64/include/asm/tlbflush.h
+++ /dev/null
@@ -1,128 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _ASM_IA64_TLBFLUSH_H
-#define _ASM_IA64_TLBFLUSH_H
-
-/*
- * Copyright (C) 2002 Hewlett-Packard Co
- *	David Mosberger-Tang <davidm@hpl.hp.com>
- */
-
-
-#include <linux/mm.h>
-
-#include <asm/intrinsics.h>
-#include <asm/mmu_context.h>
-#include <asm/page.h>
-
-struct ia64_tr_entry {
-	u64 ifa;
-	u64 itir;
-	u64 pte;
-	u64 rr;
-}; /*Record for tr entry!*/
-
-extern int ia64_itr_entry(u64 target_mask, u64 va, u64 pte, u64 log_size);
-extern void ia64_ptr_entry(u64 target_mask, int slot);
-extern struct ia64_tr_entry *ia64_idtrs[NR_CPUS];
-
-/*
- region register macros
-*/
-#define RR_TO_VE(val)   (((val) >> 0) & 0x0000000000000001)
-#define RR_VE(val)     (((val) & 0x0000000000000001) << 0)
-#define RR_VE_MASK     0x0000000000000001L
-#define RR_VE_SHIFT    0
-#define RR_TO_PS(val)  (((val) >> 2) & 0x000000000000003f)
-#define RR_PS(val)     (((val) & 0x000000000000003f) << 2)
-#define RR_PS_MASK     0x00000000000000fcL
-#define RR_PS_SHIFT    2
-#define RR_RID_MASK    0x00000000ffffff00L
-#define RR_TO_RID(val)         ((val >> 8) & 0xffffff)
-
-/*
- * Now for some TLB flushing routines.  This is the kind of stuff that
- * can be very expensive, so try to avoid them whenever possible.
- */
-extern void setup_ptcg_sem(int max_purges, int from_palo);
-
-/*
- * Flush everything (kernel mapping may also have changed due to
- * vmalloc/vfree).
- */
-extern void local_flush_tlb_all (void);
-
-#ifdef CONFIG_SMP
-  extern void smp_flush_tlb_all (void);
-  extern void smp_flush_tlb_mm (struct mm_struct *mm);
-  extern void smp_flush_tlb_cpumask (cpumask_t xcpumask);
-# define flush_tlb_all()	smp_flush_tlb_all()
-#else
-# define flush_tlb_all()	local_flush_tlb_all()
-# define smp_flush_tlb_cpumask(m) local_flush_tlb_all()
-#endif
-
-static inline void
-local_finish_flush_tlb_mm (struct mm_struct *mm)
-{
-	if (mm == current->active_mm)
-		activate_context(mm);
-}
-
-/*
- * Flush a specified user mapping.  This is called, e.g., as a result of fork() and
- * exit().  fork() ends up here because the copy-on-write mechanism needs to write-protect
- * the PTEs of the parent task.
- */
-static inline void
-flush_tlb_mm (struct mm_struct *mm)
-{
-	if (!mm)
-		return;
-
-	set_bit(mm->context, ia64_ctx.flushmap);
-	mm->context = 0;
-
-	if (atomic_read(&mm->mm_users) == 0)
-		return;		/* happens as a result of exit_mmap() */
-
-#ifdef CONFIG_SMP
-	smp_flush_tlb_mm(mm);
-#else
-	local_finish_flush_tlb_mm(mm);
-#endif
-}
-
-extern void flush_tlb_range (struct vm_area_struct *vma, unsigned long start, unsigned long end);
-
-/*
- * Page-granular tlb flush.
- */
-static inline void
-flush_tlb_page (struct vm_area_struct *vma, unsigned long addr)
-{
-#ifdef CONFIG_SMP
-	flush_tlb_range(vma, (addr & PAGE_MASK), (addr & PAGE_MASK) + PAGE_SIZE);
-#else
-	if (vma->vm_mm == current->active_mm)
-		ia64_ptcl(addr, (PAGE_SHIFT << 2));
-	else
-		vma->vm_mm->context = 0;
-#endif
-}
-
-/*
- * Flush the local TLB. Invoked from another cpu using an IPI.
- */
-#ifdef CONFIG_SMP
-void smp_local_flush_tlb(void);
-#else
-#define smp_local_flush_tlb()
-#endif
-
-static inline void flush_tlb_kernel_range(unsigned long start,
-					  unsigned long end)
-{
-	flush_tlb_all();	/* XXX fix me */
-}
-
-#endif /* _ASM_IA64_TLBFLUSH_H */
diff --git a/arch/ia64/include/asm/topology.h b/arch/ia64/include/asm/topology.h
deleted file mode 100644
index 43567240b0d6..000000000000
--- a/arch/ia64/include/asm/topology.h
+++ /dev/null
@@ -1,56 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-or-later */
-/*
- * Copyright (C) 2002, Erich Focht, NEC
- *
- * All rights reserved.
- */
-#ifndef _ASM_IA64_TOPOLOGY_H
-#define _ASM_IA64_TOPOLOGY_H
-
-#include <asm/acpi.h>
-#include <asm/numa.h>
-#include <asm/smp.h>
-
-#ifdef CONFIG_NUMA
-
-/* Nodes w/o CPUs are preferred for memory allocations, see build_zonelists */
-#define PENALTY_FOR_NODE_WITH_CPUS 255
-
-/*
- * Nodes within this distance are eligible for reclaim by zone_reclaim() when
- * zone_reclaim_mode is enabled.
- */
-#define RECLAIM_DISTANCE 15
-
-/*
- * Returns a bitmask of CPUs on Node 'node'.
- */
-#define cpumask_of_node(node) ((node) == -1 ?				\
-			       cpu_all_mask :				\
-			       &node_to_cpu_mask[node])
-
-/*
- * Determines the node for a given pci bus
- */
-#define pcibus_to_node(bus) PCI_CONTROLLER(bus)->node
-
-void build_cpu_to_node_map(void);
-
-#endif /* CONFIG_NUMA */
-
-#ifdef CONFIG_SMP
-#define topology_physical_package_id(cpu)	(cpu_data(cpu)->socket_id)
-#define topology_core_id(cpu)			(cpu_data(cpu)->core_id)
-#define topology_core_cpumask(cpu)		(&cpu_core_map[cpu])
-#define topology_sibling_cpumask(cpu)		(&per_cpu(cpu_sibling_map, cpu))
-#endif
-
-extern void arch_fix_phys_package_id(int num, u32 slot);
-
-#define cpumask_of_pcibus(bus)	(pcibus_to_node(bus) == -1 ?		\
-				 cpu_all_mask :				\
-				 cpumask_of_node(pcibus_to_node(bus)))
-
-#include <asm-generic/topology.h>
-
-#endif /* _ASM_IA64_TOPOLOGY_H */
diff --git a/arch/ia64/include/asm/types.h b/arch/ia64/include/asm/types.h
deleted file mode 100644
index 5ddc7703de99..000000000000
--- a/arch/ia64/include/asm/types.h
+++ /dev/null
@@ -1,32 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-/*
- * This file is never included by application software unless explicitly
- * requested (e.g., via linux/types.h) in which case the application is
- * Linux specific so (user-) name space pollution is not a major issue.
- * However, for interoperability, libraries still need to be careful to
- * avoid naming clashes.
- *
- * Based on <asm-alpha/types.h>.
- *
- * Modified 1998-2000, 2002
- *	David Mosberger-Tang <davidm@hpl.hp.com>, Hewlett-Packard Co
- */
-#ifndef _ASM_IA64_TYPES_H
-#define _ASM_IA64_TYPES_H
-
-#include <asm-generic/int-ll64.h>
-#include <uapi/asm/types.h>
-
-#ifdef __ASSEMBLY__
-#else
-/*
- * These aren't exported outside the kernel to avoid name space clashes
- */
-
-struct fnptr {
-	unsigned long ip;
-	unsigned long gp;
-};
-
-#endif /* !__ASSEMBLY__ */
-#endif /* _ASM_IA64_TYPES_H */
diff --git a/arch/ia64/include/asm/uaccess.h b/arch/ia64/include/asm/uaccess.h
deleted file mode 100644
index 60adadeb3e9e..000000000000
--- a/arch/ia64/include/asm/uaccess.h
+++ /dev/null
@@ -1,265 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _ASM_IA64_UACCESS_H
-#define _ASM_IA64_UACCESS_H
-
-/*
- * This file defines various macros to transfer memory areas across
- * the user/kernel boundary.  This needs to be done carefully because
- * this code is executed in kernel mode and uses user-specified
- * addresses.  Thus, we need to be careful not to let the user to
- * trick us into accessing kernel memory that would normally be
- * inaccessible.  This code is also fairly performance sensitive,
- * so we want to spend as little time doing safety checks as
- * possible.
- *
- * To make matters a bit more interesting, these macros sometimes also
- * called from within the kernel itself, in which case the address
- * validity check must be skipped.  The get_fs() macro tells us what
- * to do: if get_fs()==USER_DS, checking is performed, if
- * get_fs()==KERNEL_DS, checking is bypassed.
- *
- * Note that even if the memory area specified by the user is in a
- * valid address range, it is still possible that we'll get a page
- * fault while accessing it.  This is handled by filling out an
- * exception handler fixup entry for each instruction that has the
- * potential to fault.  When such a fault occurs, the page fault
- * handler checks to see whether the faulting instruction has a fixup
- * associated and, if so, sets r8 to -EFAULT and clears r9 to 0 and
- * then resumes execution at the continuation point.
- *
- * Based on <asm-alpha/uaccess.h>.
- *
- * Copyright (C) 1998, 1999, 2001-2004 Hewlett-Packard Co
- *	David Mosberger-Tang <davidm@hpl.hp.com>
- */
-
-#include <linux/compiler.h>
-#include <linux/page-flags.h>
-
-#include <asm/intrinsics.h>
-#include <linux/pgtable.h>
-#include <asm/io.h>
-#include <asm/extable.h>
-
-/*
- * When accessing user memory, we need to make sure the entire area really is
- * in user-level space.  We also need to make sure that the address doesn't
- * point inside the virtually mapped linear page table.
- */
-static inline int __access_ok(const void __user *p, unsigned long size)
-{
-	unsigned long limit = TASK_SIZE;
-	unsigned long addr = (unsigned long)p;
-
-	return likely((size <= limit) && (addr <= (limit - size)) &&
-		 likely(REGION_OFFSET(addr) < RGN_MAP_LIMIT));
-}
-#define __access_ok __access_ok
-#include <asm-generic/access_ok.h>
-
-/*
- * These are the main single-value transfer routines.  They automatically
- * use the right size if we just have the right pointer type.
- *
- * Careful to not
- * (a) re-use the arguments for side effects (sizeof/typeof is ok)
- * (b) require any knowledge of processes at this stage
- */
-#define put_user(x, ptr)	__put_user_check((__typeof__(*(ptr))) (x), (ptr), sizeof(*(ptr)))
-#define get_user(x, ptr)	__get_user_check((x), (ptr), sizeof(*(ptr)))
-
-/*
- * The "__xxx" versions do not do address space checking, useful when
- * doing multiple accesses to the same area (the programmer has to do the
- * checks by hand with "access_ok()")
- */
-#define __put_user(x, ptr)	__put_user_nocheck((__typeof__(*(ptr))) (x), (ptr), sizeof(*(ptr)))
-#define __get_user(x, ptr)	__get_user_nocheck((x), (ptr), sizeof(*(ptr)))
-
-#ifdef ASM_SUPPORTED
-  struct __large_struct { unsigned long buf[100]; };
-# define __m(x) (*(struct __large_struct __user *)(x))
-
-/* We need to declare the __ex_table section before we can use it in .xdata.  */
-asm (".section \"__ex_table\", \"a\"\n\t.previous");
-
-# define __get_user_size(val, addr, n, err)							\
-do {												\
-	register long __gu_r8 asm ("r8") = 0;							\
-	register long __gu_r9 asm ("r9");							\
-	asm ("\n[1:]\tld"#n" %0=%2%P2\t// %0 and %1 get overwritten by exception handler\n"	\
-	     "\t.xdata4 \"__ex_table\", 1b-., 1f-.+4\n"						\
-	     "[1:]"										\
-	     : "=r"(__gu_r9), "=r"(__gu_r8) : "m"(__m(addr)), "1"(__gu_r8));			\
-	(err) = __gu_r8;									\
-	(val) = __gu_r9;									\
-} while (0)
-
-/*
- * The "__put_user_size()" macro tells gcc it reads from memory instead of writing it.  This
- * is because they do not write to any memory gcc knows about, so there are no aliasing
- * issues.
- */
-# define __put_user_size(val, addr, n, err)							\
-do {												\
-	register long __pu_r8 asm ("r8") = 0;							\
-	asm volatile ("\n[1:]\tst"#n" %1=%r2%P1\t// %0 gets overwritten by exception handler\n"	\
-		      "\t.xdata4 \"__ex_table\", 1b-., 1f-.\n"					\
-		      "[1:]"									\
-		      : "=r"(__pu_r8) : "m"(__m(addr)), "rO"(val), "0"(__pu_r8));		\
-	(err) = __pu_r8;									\
-} while (0)
-
-#else /* !ASM_SUPPORTED */
-# define RELOC_TYPE	2	/* ip-rel */
-# define __get_user_size(val, addr, n, err)				\
-do {									\
-	__ld_user("__ex_table", (unsigned long) addr, n, RELOC_TYPE);	\
-	(err) = ia64_getreg(_IA64_REG_R8);				\
-	(val) = ia64_getreg(_IA64_REG_R9);				\
-} while (0)
-# define __put_user_size(val, addr, n, err)				\
-do {									\
-	__st_user("__ex_table", (unsigned long) addr, n, RELOC_TYPE,	\
-		  (__force unsigned long) (val));			\
-	(err) = ia64_getreg(_IA64_REG_R8);				\
-} while (0)
-#endif /* !ASM_SUPPORTED */
-
-extern void __get_user_unknown (void);
-
-/*
- * Evaluating arguments X, PTR, SIZE, and SEGMENT may involve subroutine-calls, which
- * could clobber r8 and r9 (among others).  Thus, be careful not to evaluate it while
- * using r8/r9.
- */
-#define __do_get_user(check, x, ptr, size)						\
-({											\
-	const __typeof__(*(ptr)) __user *__gu_ptr = (ptr);				\
-	__typeof__ (size) __gu_size = (size);						\
-	long __gu_err = -EFAULT;							\
-	unsigned long __gu_val = 0;							\
-	if (!check || __access_ok(__gu_ptr, size))					\
-		switch (__gu_size) {							\
-		      case 1: __get_user_size(__gu_val, __gu_ptr, 1, __gu_err); break;	\
-		      case 2: __get_user_size(__gu_val, __gu_ptr, 2, __gu_err); break;	\
-		      case 4: __get_user_size(__gu_val, __gu_ptr, 4, __gu_err); break;	\
-		      case 8: __get_user_size(__gu_val, __gu_ptr, 8, __gu_err); break;	\
-		      default: __get_user_unknown(); break;				\
-		}									\
-	(x) = (__force __typeof__(*(__gu_ptr))) __gu_val;				\
-	__gu_err;									\
-})
-
-#define __get_user_nocheck(x, ptr, size)	__do_get_user(0, x, ptr, size)
-#define __get_user_check(x, ptr, size)	__do_get_user(1, x, ptr, size)
-
-extern void __put_user_unknown (void);
-
-/*
- * Evaluating arguments X, PTR, SIZE, and SEGMENT may involve subroutine-calls, which
- * could clobber r8 (among others).  Thus, be careful not to evaluate them while using r8.
- */
-#define __do_put_user(check, x, ptr, size)						\
-({											\
-	__typeof__ (x) __pu_x = (x);							\
-	__typeof__ (*(ptr)) __user *__pu_ptr = (ptr);					\
-	__typeof__ (size) __pu_size = (size);						\
-	long __pu_err = -EFAULT;							\
-											\
-	if (!check || __access_ok(__pu_ptr, __pu_size))					\
-		switch (__pu_size) {							\
-		      case 1: __put_user_size(__pu_x, __pu_ptr, 1, __pu_err); break;	\
-		      case 2: __put_user_size(__pu_x, __pu_ptr, 2, __pu_err); break;	\
-		      case 4: __put_user_size(__pu_x, __pu_ptr, 4, __pu_err); break;	\
-		      case 8: __put_user_size(__pu_x, __pu_ptr, 8, __pu_err); break;	\
-		      default: __put_user_unknown(); break;				\
-		}									\
-	__pu_err;									\
-})
-
-#define __put_user_nocheck(x, ptr, size)	__do_put_user(0, x, ptr, size)
-#define __put_user_check(x, ptr, size)	__do_put_user(1, x, ptr, size)
-
-/*
- * Complex access routines
- */
-extern unsigned long __must_check __copy_user (void __user *to, const void __user *from,
-					       unsigned long count);
-
-static inline unsigned long
-raw_copy_to_user(void __user *to, const void *from, unsigned long count)
-{
-	return __copy_user(to, (__force void __user *) from, count);
-}
-
-static inline unsigned long
-raw_copy_from_user(void *to, const void __user *from, unsigned long count)
-{
-	return __copy_user((__force void __user *) to, from, count);
-}
-
-#define INLINE_COPY_FROM_USER
-#define INLINE_COPY_TO_USER
-
-extern unsigned long __do_clear_user (void __user *, unsigned long);
-
-#define __clear_user(to, n)		__do_clear_user(to, n)
-
-#define clear_user(to, n)					\
-({								\
-	unsigned long __cu_len = (n);				\
-	if (__access_ok(to, __cu_len))				\
-		__cu_len = __do_clear_user(to, __cu_len);	\
-	__cu_len;						\
-})
-
-
-/*
- * Returns: -EFAULT if exception before terminator, N if the entire buffer filled, else
- * strlen.
- */
-extern long __must_check __strncpy_from_user (char *to, const char __user *from, long to_len);
-
-#define strncpy_from_user(to, from, n)					\
-({									\
-	const char __user * __sfu_from = (from);			\
-	long __sfu_ret = -EFAULT;					\
-	if (__access_ok(__sfu_from, 0))					\
-		__sfu_ret = __strncpy_from_user((to), __sfu_from, (n));	\
-	__sfu_ret;							\
-})
-
-/*
- * Returns: 0 if exception before NUL or reaching the supplied limit
- * (N), a value greater than N if the limit would be exceeded, else
- * strlen.
- */
-extern unsigned long __strnlen_user (const char __user *, long);
-
-#define strnlen_user(str, len)					\
-({								\
-	const char __user *__su_str = (str);			\
-	unsigned long __su_ret = 0;				\
-	if (__access_ok(__su_str, 0))				\
-		__su_ret = __strnlen_user(__su_str, len);	\
-	__su_ret;						\
-})
-
-#define ARCH_HAS_TRANSLATE_MEM_PTR	1
-static __inline__ void *
-xlate_dev_mem_ptr(phys_addr_t p)
-{
-	struct page *page;
-	void *ptr;
-
-	page = pfn_to_page(p >> PAGE_SHIFT);
-	if (PageUncached(page))
-		ptr = (void *)p + __IA64_UNCACHED_OFFSET;
-	else
-		ptr = __va(p);
-
-	return ptr;
-}
-
-#endif /* _ASM_IA64_UACCESS_H */
diff --git a/arch/ia64/include/asm/uncached.h b/arch/ia64/include/asm/uncached.h
deleted file mode 100644
index 98f447fc77b7..000000000000
--- a/arch/ia64/include/asm/uncached.h
+++ /dev/null
@@ -1,9 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-only */
-/*
- * Copyright (C) 2001-2008 Silicon Graphics, Inc.  All rights reserved.
- *
- * Prototypes for the uncached page allocator
- */
-
-extern unsigned long uncached_alloc_page(int starting_nid, int n_pages);
-extern void uncached_free_page(unsigned long uc_addr, int n_pages);
diff --git a/arch/ia64/include/asm/unistd.h b/arch/ia64/include/asm/unistd.h
deleted file mode 100644
index 9ba6110b10b9..000000000000
--- a/arch/ia64/include/asm/unistd.h
+++ /dev/null
@@ -1,38 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-/*
- * IA-64 Linux syscall numbers and inline-functions.
- *
- * Copyright (C) 1998-2005 Hewlett-Packard Co
- *	David Mosberger-Tang <davidm@hpl.hp.com>
- */
-#ifndef _ASM_IA64_UNISTD_H
-#define _ASM_IA64_UNISTD_H
-
-#include <uapi/asm/unistd.h>
-
-#define NR_syscalls		__NR_syscalls /* length of syscall table */
-
-#define __ARCH_WANT_NEW_STAT
-#define __ARCH_WANT_SYS_UTIME
-
-#if !defined(__ASSEMBLY__) && !defined(ASSEMBLER)
-
-#include <linux/types.h>
-#include <linux/linkage.h>
-#include <linux/compiler.h>
-
-extern long __ia64_syscall (long a0, long a1, long a2, long a3, long a4, long nr);
-
-asmlinkage unsigned long sys_mmap(
-				unsigned long addr, unsigned long len,
-				int prot, int flags,
-				int fd, long off);
-asmlinkage unsigned long sys_mmap2(
-				unsigned long addr, unsigned long len,
-				int prot, int flags,
-				int fd, long pgoff);
-struct pt_regs;
-asmlinkage long sys_ia64_pipe(void);
-
-#endif /* !__ASSEMBLY__ */
-#endif /* _ASM_IA64_UNISTD_H */
diff --git a/arch/ia64/include/asm/unwind.h b/arch/ia64/include/asm/unwind.h
deleted file mode 100644
index c5bd4b3e3a36..000000000000
--- a/arch/ia64/include/asm/unwind.h
+++ /dev/null
@@ -1,234 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _ASM_IA64_UNWIND_H
-#define _ASM_IA64_UNWIND_H
-
-/*
- * Copyright (C) 1999-2000, 2003 Hewlett-Packard Co
- *	David Mosberger-Tang <davidm@hpl.hp.com>
- *
- * A simple API for unwinding kernel stacks.  This is used for
- * debugging and error reporting purposes.  The kernel doesn't need
- * full-blown stack unwinding with all the bells and whitles, so there
- * is not much point in implementing the full IA-64 unwind API (though
- * it would of course be possible to implement the kernel API on top
- * of it).
- */
-
-struct task_struct;	/* forward declaration */
-struct switch_stack;	/* forward declaration */
-
-enum unw_application_register {
-	UNW_AR_BSP,
-	UNW_AR_BSPSTORE,
-	UNW_AR_PFS,
-	UNW_AR_RNAT,
-	UNW_AR_UNAT,
-	UNW_AR_LC,
-	UNW_AR_EC,
-	UNW_AR_FPSR,
-	UNW_AR_RSC,
-	UNW_AR_CCV,
-	UNW_AR_CSD,
-	UNW_AR_SSD
-};
-
-/*
- * The following declarations are private to the unwind
- * implementation:
- */
-
-struct unw_stack {
-	unsigned long limit;
-	unsigned long top;
-};
-
-#define UNW_FLAG_INTERRUPT_FRAME	(1UL << 0)
-
-/*
- * No user of this module should every access this structure directly
- * as it is subject to change.  It is declared here solely so we can
- * use automatic variables.
- */
-struct unw_frame_info {
-	struct unw_stack regstk;
-	struct unw_stack memstk;
-	unsigned int flags;
-	short hint;
-	short prev_script;
-
-	/* current frame info: */
-	unsigned long bsp;		/* backing store pointer value */
-	unsigned long sp;		/* stack pointer value */
-	unsigned long psp;		/* previous sp value */
-	unsigned long ip;		/* instruction pointer value */
-	unsigned long pr;		/* current predicate values */
-	unsigned long *cfm_loc;		/* cfm save location (or NULL) */
-	unsigned long pt;		/* struct pt_regs location */
-
-	struct task_struct *task;
-	struct switch_stack *sw;
-
-	/* preserved state: */
-	unsigned long *bsp_loc;		/* previous bsp save location */
-	unsigned long *bspstore_loc;
-	unsigned long *pfs_loc;
-	unsigned long *rnat_loc;
-	unsigned long *rp_loc;
-	unsigned long *pri_unat_loc;
-	unsigned long *unat_loc;
-	unsigned long *pr_loc;
-	unsigned long *lc_loc;
-	unsigned long *fpsr_loc;
-	struct unw_ireg {
-		unsigned long *loc;
-		struct unw_ireg_nat {
-			unsigned long type : 3;		/* enum unw_nat_type */
-			signed long off : 61;		/* NaT word is at loc+nat.off */
-		} nat;
-	} r4, r5, r6, r7;
-	unsigned long *b1_loc, *b2_loc, *b3_loc, *b4_loc, *b5_loc;
-	struct ia64_fpreg *f2_loc, *f3_loc, *f4_loc, *f5_loc, *fr_loc[16];
-};
-
-/*
- * The official API follows below:
- */
-
-struct unw_table_entry {
-	u64 start_offset;
-	u64 end_offset;
-	u64 info_offset;
-};
-
-/*
- * Initialize unwind support.
- */
-extern void unw_init (void);
-
-extern void *unw_add_unwind_table (const char *name, unsigned long segment_base, unsigned long gp,
-				   const void *table_start, const void *table_end);
-
-extern void unw_remove_unwind_table (void *handle);
-
-/*
- * Prepare to unwind blocked task t.
- */
-extern void unw_init_from_blocked_task (struct unw_frame_info *info, struct task_struct *t);
-
-extern void unw_init_frame_info (struct unw_frame_info *info, struct task_struct *t,
-				 struct switch_stack *sw);
-
-/*
- * Prepare to unwind the currently running thread.
- */
-extern void unw_init_running (void (*callback)(struct unw_frame_info *info, void *arg), void *arg);
-
-/*
- * Unwind to previous to frame.  Returns 0 if successful, negative
- * number in case of an error.
- */
-extern int unw_unwind (struct unw_frame_info *info);
-
-/*
- * Unwind until the return pointer is in user-land (or until an error
- * occurs).  Returns 0 if successful, negative number in case of
- * error.
- */
-extern int unw_unwind_to_user (struct unw_frame_info *info);
-
-#define unw_is_intr_frame(info)	(((info)->flags & UNW_FLAG_INTERRUPT_FRAME) != 0)
-
-static inline int
-unw_get_ip (struct unw_frame_info *info, unsigned long *valp)
-{
-	*valp = (info)->ip;
-	return 0;
-}
-
-static inline int
-unw_get_sp (struct unw_frame_info *info, unsigned long *valp)
-{
-	*valp = (info)->sp;
-	return 0;
-}
-
-static inline int
-unw_get_psp (struct unw_frame_info *info, unsigned long *valp)
-{
-	*valp = (info)->psp;
-	return 0;
-}
-
-static inline int
-unw_get_bsp (struct unw_frame_info *info, unsigned long *valp)
-{
-	*valp = (info)->bsp;
-	return 0;
-}
-
-static inline int
-unw_get_cfm (struct unw_frame_info *info, unsigned long *valp)
-{
-	*valp = *(info)->cfm_loc;
-	return 0;
-}
-
-static inline int
-unw_set_cfm (struct unw_frame_info *info, unsigned long val)
-{
-	*(info)->cfm_loc = val;
-	return 0;
-}
-
-static inline int
-unw_get_rp (struct unw_frame_info *info, unsigned long *val)
-{
-	if (!info->rp_loc)
-		return -1;
-	*val = *info->rp_loc;
-	return 0;
-}
-
-extern int unw_access_gr (struct unw_frame_info *, int, unsigned long *, char *, int);
-extern int unw_access_br (struct unw_frame_info *, int, unsigned long *, int);
-extern int unw_access_fr (struct unw_frame_info *, int, struct ia64_fpreg *, int);
-extern int unw_access_ar (struct unw_frame_info *, int, unsigned long *, int);
-extern int unw_access_pr (struct unw_frame_info *, unsigned long *, int);
-
-static inline int
-unw_set_gr (struct unw_frame_info *i, int n, unsigned long v, char nat)
-{
-	return unw_access_gr(i, n, &v, &nat, 1);
-}
-
-static inline int
-unw_set_br (struct unw_frame_info *i, int n, unsigned long v)
-{
-	return unw_access_br(i, n, &v, 1);
-}
-
-static inline int
-unw_set_fr (struct unw_frame_info *i, int n, struct ia64_fpreg v)
-{
-	return unw_access_fr(i, n, &v, 1);
-}
-
-static inline int
-unw_set_ar (struct unw_frame_info *i, int n, unsigned long v)
-{
-	return unw_access_ar(i, n, &v, 1);
-}
-
-static inline int
-unw_set_pr (struct unw_frame_info *i, unsigned long v)
-{
-	return unw_access_pr(i, &v, 1);
-}
-
-#define unw_get_gr(i,n,v,nat)	unw_access_gr(i,n,v,nat,0)
-#define unw_get_br(i,n,v)	unw_access_br(i,n,v,0)
-#define unw_get_fr(i,n,v)	unw_access_fr(i,n,v,0)
-#define unw_get_ar(i,n,v)	unw_access_ar(i,n,v,0)
-#define unw_get_pr(i,v)		unw_access_pr(i,v,0)
-
-#endif /* _ASM_UNWIND_H */
diff --git a/arch/ia64/include/asm/user.h b/arch/ia64/include/asm/user.h
deleted file mode 100644
index ec03d3ab8715..000000000000
--- a/arch/ia64/include/asm/user.h
+++ /dev/null
@@ -1,53 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _ASM_IA64_USER_H
-#define _ASM_IA64_USER_H
-
-/*
- * Core file format: The core file is written in such a way that gdb
- * can understand it and provide useful information to the user (under
- * linux we use the `trad-core' bfd).  The file contents are as
- * follows:
- *
- *  upage: 1 page consisting of a user struct that tells gdb
- *	what is present in the file.  Directly after this is a
- *	copy of the task_struct, which is currently not used by gdb,
- *	but it may come in handy at some point.  All of the registers
- *	are stored as part of the upage.  The upage should always be
- *	only one page long.
- *  data: The data segment follows next.  We use current->end_text to
- *	current->brk to pick up all of the user variables, plus any memory
- *	that may have been sbrk'ed.  No attempt is made to determine if a
- *	page is demand-zero or if a page is totally unused, we just cover
- *	the entire range.  All of the addresses are rounded in such a way
- *	that an integral number of pages is written.
- *  stack: We need the stack information in order to get a meaningful
- *	backtrace.  We need to write the data from usp to
- *	current->start_stack, so we round each of these in order to be able
- *	to write an integer number of pages.
- *
- * Modified 1998, 1999, 2001
- *	David Mosberger-Tang <davidm@hpl.hp.com>, Hewlett-Packard Co
- */
-
-#include <linux/ptrace.h>
-#include <linux/types.h>
-
-#include <asm/page.h>
-
-#define EF_SIZE		3072	/* XXX fix me */
-
-struct user {
-	unsigned long	regs[EF_SIZE/8+32];	/* integer and fp regs */
-	size_t		u_tsize;		/* text size (pages) */
-	size_t		u_dsize;		/* data size (pages) */
-	size_t		u_ssize;		/* stack size (pages) */
-	unsigned long	start_code;		/* text starting address */
-	unsigned long	start_data;		/* data starting address */
-	unsigned long	start_stack;		/* stack starting address */
-	long int	signal;			/* signal causing core dump */
-	unsigned long	u_ar0;			/* help gdb find registers */
-	unsigned long	magic;			/* identifies a core file */
-	char		u_comm[32];		/* user command name */
-};
-
-#endif /* _ASM_IA64_USER_H */
diff --git a/arch/ia64/include/asm/ustack.h b/arch/ia64/include/asm/ustack.h
deleted file mode 100644
index 112d40a0fec2..000000000000
--- a/arch/ia64/include/asm/ustack.h
+++ /dev/null
@@ -1,12 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _ASM_IA64_USTACK_H
-#define _ASM_IA64_USTACK_H
-
-#include <asm/page.h>
-#include <uapi/asm/ustack.h>
-
-/* The absolute hard limit for stack size is 1/2 of the mappable space in the region */
-#define MAX_USER_STACK_SIZE	(RGN_MAP_LIMIT/2)
-#define STACK_TOP		(0x6000000000000000UL + RGN_MAP_LIMIT)
-#define STACK_TOP_MAX		STACK_TOP
-#endif /* _ASM_IA64_USTACK_H */
diff --git a/arch/ia64/include/asm/uv/uv.h b/arch/ia64/include/asm/uv/uv.h
deleted file mode 100644
index 48d4526bf4cd..000000000000
--- a/arch/ia64/include/asm/uv/uv.h
+++ /dev/null
@@ -1,30 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _ASM_IA64_UV_UV_H
-#define _ASM_IA64_UV_UV_H
-
-#ifdef CONFIG_IA64_SGI_UV
-extern bool ia64_is_uv;
-
-static inline int is_uv_system(void)
-{
-	return ia64_is_uv;
-}
-
-void __init uv_probe_system_type(void);
-void __init uv_setup(char **cmdline_p);
-#else /* CONFIG_IA64_SGI_UV */
-static inline int is_uv_system(void)
-{
-	return false;
-}
-
-static inline void __init uv_probe_system_type(void)
-{
-}
-
-static inline void __init uv_setup(char **cmdline_p)
-{
-}
-#endif /* CONFIG_IA64_SGI_UV */
-
-#endif	/* _ASM_IA64_UV_UV_H */
diff --git a/arch/ia64/include/asm/uv/uv_hub.h b/arch/ia64/include/asm/uv/uv_hub.h
deleted file mode 100644
index 809ddb6896db..000000000000
--- a/arch/ia64/include/asm/uv/uv_hub.h
+++ /dev/null
@@ -1,315 +0,0 @@
-/*
- * This file is subject to the terms and conditions of the GNU General Public
- * License.  See the file "COPYING" in the main directory of this archive
- * for more details.
- *
- * SGI UV architectural definitions
- *
- * Copyright (C) 2008 Silicon Graphics, Inc. All rights reserved.
- */
-
-#ifndef __ASM_IA64_UV_HUB_H__
-#define __ASM_IA64_UV_HUB_H__
-
-#include <linux/numa.h>
-#include <linux/percpu.h>
-#include <asm/types.h>
-#include <asm/percpu.h>
-
-
-/*
- * Addressing Terminology
- *
- *	M       - The low M bits of a physical address represent the offset
- *		  into the blade local memory. RAM memory on a blade is physically
- *		  contiguous (although various IO spaces may punch holes in
- *		  it)..
- *
- * 	N	- Number of bits in the node portion of a socket physical
- * 		  address.
- *
- * 	NASID   - network ID of a router, Mbrick or Cbrick. Nasid values of
- * 	 	  routers always have low bit of 1, C/MBricks have low bit
- * 		  equal to 0. Most addressing macros that target UV hub chips
- * 		  right shift the NASID by 1 to exclude the always-zero bit.
- * 		  NASIDs contain up to 15 bits.
- *
- *	GNODE   - NASID right shifted by 1 bit. Most mmrs contain gnodes instead
- *		  of nasids.
- *
- * 	PNODE   - the low N bits of the GNODE. The PNODE is the most useful variant
- * 		  of the nasid for socket usage.
- *
- *
- *  NumaLink Global Physical Address Format:
- *  +--------------------------------+---------------------+
- *  |00..000|      GNODE             |      NodeOffset     |
- *  +--------------------------------+---------------------+
- *          |<-------53 - M bits --->|<--------M bits ----->
- *
- *	M - number of node offset bits (35 .. 40)
- *
- *
- *  Memory/UV-HUB Processor Socket Address Format:
- *  +----------------+---------------+---------------------+
- *  |00..000000000000|   PNODE       |      NodeOffset     |
- *  +----------------+---------------+---------------------+
- *                   <--- N bits --->|<--------M bits ----->
- *
- *	M - number of node offset bits (35 .. 40)
- *	N - number of PNODE bits (0 .. 10)
- *
- *		Note: M + N cannot currently exceed 44 (x86_64) or 46 (IA64).
- *		The actual values are configuration dependent and are set at
- *		boot time. M & N values are set by the hardware/BIOS at boot.
- */
-
-
-/*
- * Maximum number of bricks in all partitions and in all coherency domains.
- * This is the total number of bricks accessible in the numalink fabric. It
- * includes all C & M bricks. Routers are NOT included.
- *
- * This value is also the value of the maximum number of non-router NASIDs
- * in the numalink fabric.
- *
- * NOTE: a brick may contain 1 or 2 OS nodes. Don't get these confused.
- */
-#define UV_MAX_NUMALINK_BLADES	16384
-
-/*
- * Maximum number of C/Mbricks within a software SSI (hardware may support
- * more).
- */
-#define UV_MAX_SSI_BLADES	1
-
-/*
- * The largest possible NASID of a C or M brick (+ 2)
- */
-#define UV_MAX_NASID_VALUE	(UV_MAX_NUMALINK_NODES * 2)
-
-/*
- * The following defines attributes of the HUB chip. These attributes are
- * frequently referenced and are kept in the per-cpu data areas of each cpu.
- * They are kept together in a struct to minimize cache misses.
- */
-struct uv_hub_info_s {
-	unsigned long	global_mmr_base;
-	unsigned long	gpa_mask;
-	unsigned long	gnode_upper;
-	unsigned long	lowmem_remap_top;
-	unsigned long	lowmem_remap_base;
-	unsigned short	pnode;
-	unsigned short	pnode_mask;
-	unsigned short	coherency_domain_number;
-	unsigned short	numa_blade_id;
-	unsigned char	blade_processor_id;
-	unsigned char	m_val;
-	unsigned char	n_val;
-};
-DECLARE_PER_CPU(struct uv_hub_info_s, __uv_hub_info);
-#define uv_hub_info 		this_cpu_ptr(&__uv_hub_info)
-#define uv_cpu_hub_info(cpu)	(&per_cpu(__uv_hub_info, cpu))
-
-/*
- * Local & Global MMR space macros.
- * 	Note: macros are intended to be used ONLY by inline functions
- * 	in this file - not by other kernel code.
- * 		n -  NASID (full 15-bit global nasid)
- * 		g -  GNODE (full 15-bit global nasid, right shifted 1)
- * 		p -  PNODE (local part of nsids, right shifted 1)
- */
-#define UV_NASID_TO_PNODE(n)		(((n) >> 1) & uv_hub_info->pnode_mask)
-#define UV_PNODE_TO_NASID(p)		(((p) << 1) | uv_hub_info->gnode_upper)
-
-#define UV_LOCAL_MMR_BASE		0xf4000000UL
-#define UV_GLOBAL_MMR32_BASE		0xf8000000UL
-#define UV_GLOBAL_MMR64_BASE		(uv_hub_info->global_mmr_base)
-
-#define UV_GLOBAL_MMR32_PNODE_SHIFT	15
-#define UV_GLOBAL_MMR64_PNODE_SHIFT	26
-
-#define UV_GLOBAL_MMR32_PNODE_BITS(p)	((p) << (UV_GLOBAL_MMR32_PNODE_SHIFT))
-
-#define UV_GLOBAL_MMR64_PNODE_BITS(p)					\
-	((unsigned long)(p) << UV_GLOBAL_MMR64_PNODE_SHIFT)
-
-/*
- * Macros for converting between kernel virtual addresses, socket local physical
- * addresses, and UV global physical addresses.
- * 	Note: use the standard __pa() & __va() macros for converting
- * 	      between socket virtual and socket physical addresses.
- */
-
-/* socket phys RAM --> UV global physical address */
-static inline unsigned long uv_soc_phys_ram_to_gpa(unsigned long paddr)
-{
-	if (paddr < uv_hub_info->lowmem_remap_top)
-		paddr += uv_hub_info->lowmem_remap_base;
-	return paddr | uv_hub_info->gnode_upper;
-}
-
-
-/* socket virtual --> UV global physical address */
-static inline unsigned long uv_gpa(void *v)
-{
-	return __pa(v) | uv_hub_info->gnode_upper;
-}
-
-/* socket virtual --> UV global physical address */
-static inline void *uv_vgpa(void *v)
-{
-	return (void *)uv_gpa(v);
-}
-
-/* UV global physical address --> socket virtual */
-static inline void *uv_va(unsigned long gpa)
-{
-	return __va(gpa & uv_hub_info->gpa_mask);
-}
-
-/* pnode, offset --> socket virtual */
-static inline void *uv_pnode_offset_to_vaddr(int pnode, unsigned long offset)
-{
-	return __va(((unsigned long)pnode << uv_hub_info->m_val) | offset);
-}
-
-
-/*
- * Access global MMRs using the low memory MMR32 space. This region supports
- * faster MMR access but not all MMRs are accessible in this space.
- */
-static inline unsigned long *uv_global_mmr32_address(int pnode,
-				unsigned long offset)
-{
-	return __va(UV_GLOBAL_MMR32_BASE |
-		       UV_GLOBAL_MMR32_PNODE_BITS(pnode) | offset);
-}
-
-static inline void uv_write_global_mmr32(int pnode, unsigned long offset,
-				 unsigned long val)
-{
-	*uv_global_mmr32_address(pnode, offset) = val;
-}
-
-static inline unsigned long uv_read_global_mmr32(int pnode,
-						 unsigned long offset)
-{
-	return *uv_global_mmr32_address(pnode, offset);
-}
-
-/*
- * Access Global MMR space using the MMR space located at the top of physical
- * memory.
- */
-static inline unsigned long *uv_global_mmr64_address(int pnode,
-				unsigned long offset)
-{
-	return __va(UV_GLOBAL_MMR64_BASE |
-		    UV_GLOBAL_MMR64_PNODE_BITS(pnode) | offset);
-}
-
-static inline void uv_write_global_mmr64(int pnode, unsigned long offset,
-				unsigned long val)
-{
-	*uv_global_mmr64_address(pnode, offset) = val;
-}
-
-static inline unsigned long uv_read_global_mmr64(int pnode,
-						 unsigned long offset)
-{
-	return *uv_global_mmr64_address(pnode, offset);
-}
-
-/*
- * Access hub local MMRs. Faster than using global space but only local MMRs
- * are accessible.
- */
-static inline unsigned long *uv_local_mmr_address(unsigned long offset)
-{
-	return __va(UV_LOCAL_MMR_BASE | offset);
-}
-
-static inline unsigned long uv_read_local_mmr(unsigned long offset)
-{
-	return *uv_local_mmr_address(offset);
-}
-
-static inline void uv_write_local_mmr(unsigned long offset, unsigned long val)
-{
-	*uv_local_mmr_address(offset) = val;
-}
-
-/*
- * Structures and definitions for converting between cpu, node, pnode, and blade
- * numbers.
- */
-
-/* Blade-local cpu number of current cpu. Numbered 0 .. <# cpus on the blade> */
-static inline int uv_blade_processor_id(void)
-{
-	return smp_processor_id();
-}
-
-/* Blade number of current cpu. Numnbered 0 .. <#blades -1> */
-static inline int uv_numa_blade_id(void)
-{
-	return 0;
-}
-
-/* Convert a cpu number to the UV blade number */
-static inline int uv_cpu_to_blade_id(int cpu)
-{
-	return 0;
-}
-
-/* Convert linux node number to the UV blade number */
-static inline int uv_node_to_blade_id(int nid)
-{
-	return 0;
-}
-
-/* Convert a blade id to the PNODE of the blade */
-static inline int uv_blade_to_pnode(int bid)
-{
-	return 0;
-}
-
-/* Determine the number of possible cpus on a blade */
-static inline int uv_blade_nr_possible_cpus(int bid)
-{
-	return num_possible_cpus();
-}
-
-/* Determine the number of online cpus on a blade */
-static inline int uv_blade_nr_online_cpus(int bid)
-{
-	return num_online_cpus();
-}
-
-/* Convert a cpu id to the PNODE of the blade containing the cpu */
-static inline int uv_cpu_to_pnode(int cpu)
-{
-	return 0;
-}
-
-/* Convert a linux node number to the PNODE of the blade */
-static inline int uv_node_to_pnode(int nid)
-{
-	return 0;
-}
-
-/* Maximum possible number of blades */
-static inline int uv_num_possible_blades(void)
-{
-	return 1;
-}
-
-static inline void uv_hub_send_ipi(int pnode, int apicid, int vector)
-{
-	/* not currently needed on ia64 */
-}
-
-
-#endif /* __ASM_IA64_UV_HUB__ */
-
diff --git a/arch/ia64/include/asm/uv/uv_mmrs.h b/arch/ia64/include/asm/uv/uv_mmrs.h
deleted file mode 100644
index fe0b8f05e1a8..000000000000
--- a/arch/ia64/include/asm/uv/uv_mmrs.h
+++ /dev/null
@@ -1,825 +0,0 @@
-/*
- * This file is subject to the terms and conditions of the GNU General Public
- * License.  See the file "COPYING" in the main directory of this archive
- * for more details.
- *
- * SGI UV MMR definitions
- *
- * Copyright (C) 2007-2008 Silicon Graphics, Inc. All rights reserved.
- */
-
-#ifndef _ASM_IA64_UV_UV_MMRS_H
-#define _ASM_IA64_UV_UV_MMRS_H
-
-#define UV_MMR_ENABLE		(1UL << 63)
-
-/* ========================================================================= */
-/*                           UVH_BAU_DATA_CONFIG                             */
-/* ========================================================================= */
-#define UVH_BAU_DATA_CONFIG 0x61680UL
-#define UVH_BAU_DATA_CONFIG_32 0x0438
-
-#define UVH_BAU_DATA_CONFIG_VECTOR_SHFT 0
-#define UVH_BAU_DATA_CONFIG_VECTOR_MASK 0x00000000000000ffUL
-#define UVH_BAU_DATA_CONFIG_DM_SHFT 8
-#define UVH_BAU_DATA_CONFIG_DM_MASK 0x0000000000000700UL
-#define UVH_BAU_DATA_CONFIG_DESTMODE_SHFT 11
-#define UVH_BAU_DATA_CONFIG_DESTMODE_MASK 0x0000000000000800UL
-#define UVH_BAU_DATA_CONFIG_STATUS_SHFT 12
-#define UVH_BAU_DATA_CONFIG_STATUS_MASK 0x0000000000001000UL
-#define UVH_BAU_DATA_CONFIG_P_SHFT 13
-#define UVH_BAU_DATA_CONFIG_P_MASK 0x0000000000002000UL
-#define UVH_BAU_DATA_CONFIG_T_SHFT 15
-#define UVH_BAU_DATA_CONFIG_T_MASK 0x0000000000008000UL
-#define UVH_BAU_DATA_CONFIG_M_SHFT 16
-#define UVH_BAU_DATA_CONFIG_M_MASK 0x0000000000010000UL
-#define UVH_BAU_DATA_CONFIG_APIC_ID_SHFT 32
-#define UVH_BAU_DATA_CONFIG_APIC_ID_MASK 0xffffffff00000000UL
-
-union uvh_bau_data_config_u {
-    unsigned long	v;
-    struct uvh_bau_data_config_s {
-	unsigned long	vector_  :  8;  /* RW */
-	unsigned long	dm       :  3;  /* RW */
-	unsigned long	destmode :  1;  /* RW */
-	unsigned long	status   :  1;  /* RO */
-	unsigned long	p        :  1;  /* RO */
-	unsigned long	rsvd_14  :  1;  /*    */
-	unsigned long	t        :  1;  /* RO */
-	unsigned long	m        :  1;  /* RW */
-	unsigned long	rsvd_17_31: 15;  /*    */
-	unsigned long	apic_id  : 32;  /* RW */
-    } s;
-};
-
-/* ========================================================================= */
-/*                           UVH_EVENT_OCCURRED0                             */
-/* ========================================================================= */
-#define UVH_EVENT_OCCURRED0 0x70000UL
-#define UVH_EVENT_OCCURRED0_32 0x005e8
-
-#define UVH_EVENT_OCCURRED0_LB_HCERR_SHFT 0
-#define UVH_EVENT_OCCURRED0_LB_HCERR_MASK 0x0000000000000001UL
-#define UVH_EVENT_OCCURRED0_GR0_HCERR_SHFT 1
-#define UVH_EVENT_OCCURRED0_GR0_HCERR_MASK 0x0000000000000002UL
-#define UVH_EVENT_OCCURRED0_GR1_HCERR_SHFT 2
-#define UVH_EVENT_OCCURRED0_GR1_HCERR_MASK 0x0000000000000004UL
-#define UVH_EVENT_OCCURRED0_LH_HCERR_SHFT 3
-#define UVH_EVENT_OCCURRED0_LH_HCERR_MASK 0x0000000000000008UL
-#define UVH_EVENT_OCCURRED0_RH_HCERR_SHFT 4
-#define UVH_EVENT_OCCURRED0_RH_HCERR_MASK 0x0000000000000010UL
-#define UVH_EVENT_OCCURRED0_XN_HCERR_SHFT 5
-#define UVH_EVENT_OCCURRED0_XN_HCERR_MASK 0x0000000000000020UL
-#define UVH_EVENT_OCCURRED0_SI_HCERR_SHFT 6
-#define UVH_EVENT_OCCURRED0_SI_HCERR_MASK 0x0000000000000040UL
-#define UVH_EVENT_OCCURRED0_LB_AOERR0_SHFT 7
-#define UVH_EVENT_OCCURRED0_LB_AOERR0_MASK 0x0000000000000080UL
-#define UVH_EVENT_OCCURRED0_GR0_AOERR0_SHFT 8
-#define UVH_EVENT_OCCURRED0_GR0_AOERR0_MASK 0x0000000000000100UL
-#define UVH_EVENT_OCCURRED0_GR1_AOERR0_SHFT 9
-#define UVH_EVENT_OCCURRED0_GR1_AOERR0_MASK 0x0000000000000200UL
-#define UVH_EVENT_OCCURRED0_LH_AOERR0_SHFT 10
-#define UVH_EVENT_OCCURRED0_LH_AOERR0_MASK 0x0000000000000400UL
-#define UVH_EVENT_OCCURRED0_RH_AOERR0_SHFT 11
-#define UVH_EVENT_OCCURRED0_RH_AOERR0_MASK 0x0000000000000800UL
-#define UVH_EVENT_OCCURRED0_XN_AOERR0_SHFT 12
-#define UVH_EVENT_OCCURRED0_XN_AOERR0_MASK 0x0000000000001000UL
-#define UVH_EVENT_OCCURRED0_SI_AOERR0_SHFT 13
-#define UVH_EVENT_OCCURRED0_SI_AOERR0_MASK 0x0000000000002000UL
-#define UVH_EVENT_OCCURRED0_LB_AOERR1_SHFT 14
-#define UVH_EVENT_OCCURRED0_LB_AOERR1_MASK 0x0000000000004000UL
-#define UVH_EVENT_OCCURRED0_GR0_AOERR1_SHFT 15
-#define UVH_EVENT_OCCURRED0_GR0_AOERR1_MASK 0x0000000000008000UL
-#define UVH_EVENT_OCCURRED0_GR1_AOERR1_SHFT 16
-#define UVH_EVENT_OCCURRED0_GR1_AOERR1_MASK 0x0000000000010000UL
-#define UVH_EVENT_OCCURRED0_LH_AOERR1_SHFT 17
-#define UVH_EVENT_OCCURRED0_LH_AOERR1_MASK 0x0000000000020000UL
-#define UVH_EVENT_OCCURRED0_RH_AOERR1_SHFT 18
-#define UVH_EVENT_OCCURRED0_RH_AOERR1_MASK 0x0000000000040000UL
-#define UVH_EVENT_OCCURRED0_XN_AOERR1_SHFT 19
-#define UVH_EVENT_OCCURRED0_XN_AOERR1_MASK 0x0000000000080000UL
-#define UVH_EVENT_OCCURRED0_SI_AOERR1_SHFT 20
-#define UVH_EVENT_OCCURRED0_SI_AOERR1_MASK 0x0000000000100000UL
-#define UVH_EVENT_OCCURRED0_RH_VPI_INT_SHFT 21
-#define UVH_EVENT_OCCURRED0_RH_VPI_INT_MASK 0x0000000000200000UL
-#define UVH_EVENT_OCCURRED0_SYSTEM_SHUTDOWN_INT_SHFT 22
-#define UVH_EVENT_OCCURRED0_SYSTEM_SHUTDOWN_INT_MASK 0x0000000000400000UL
-#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_0_SHFT 23
-#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_0_MASK 0x0000000000800000UL
-#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_1_SHFT 24
-#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_1_MASK 0x0000000001000000UL
-#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_2_SHFT 25
-#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_2_MASK 0x0000000002000000UL
-#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_3_SHFT 26
-#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_3_MASK 0x0000000004000000UL
-#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_4_SHFT 27
-#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_4_MASK 0x0000000008000000UL
-#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_5_SHFT 28
-#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_5_MASK 0x0000000010000000UL
-#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_6_SHFT 29
-#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_6_MASK 0x0000000020000000UL
-#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_7_SHFT 30
-#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_7_MASK 0x0000000040000000UL
-#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_8_SHFT 31
-#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_8_MASK 0x0000000080000000UL
-#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_9_SHFT 32
-#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_9_MASK 0x0000000100000000UL
-#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_10_SHFT 33
-#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_10_MASK 0x0000000200000000UL
-#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_11_SHFT 34
-#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_11_MASK 0x0000000400000000UL
-#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_12_SHFT 35
-#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_12_MASK 0x0000000800000000UL
-#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_13_SHFT 36
-#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_13_MASK 0x0000001000000000UL
-#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_14_SHFT 37
-#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_14_MASK 0x0000002000000000UL
-#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_15_SHFT 38
-#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_15_MASK 0x0000004000000000UL
-#define UVH_EVENT_OCCURRED0_L1_NMI_INT_SHFT 39
-#define UVH_EVENT_OCCURRED0_L1_NMI_INT_MASK 0x0000008000000000UL
-#define UVH_EVENT_OCCURRED0_STOP_CLOCK_SHFT 40
-#define UVH_EVENT_OCCURRED0_STOP_CLOCK_MASK 0x0000010000000000UL
-#define UVH_EVENT_OCCURRED0_ASIC_TO_L1_SHFT 41
-#define UVH_EVENT_OCCURRED0_ASIC_TO_L1_MASK 0x0000020000000000UL
-#define UVH_EVENT_OCCURRED0_L1_TO_ASIC_SHFT 42
-#define UVH_EVENT_OCCURRED0_L1_TO_ASIC_MASK 0x0000040000000000UL
-#define UVH_EVENT_OCCURRED0_LTC_INT_SHFT 43
-#define UVH_EVENT_OCCURRED0_LTC_INT_MASK 0x0000080000000000UL
-#define UVH_EVENT_OCCURRED0_LA_SEQ_TRIGGER_SHFT 44
-#define UVH_EVENT_OCCURRED0_LA_SEQ_TRIGGER_MASK 0x0000100000000000UL
-#define UVH_EVENT_OCCURRED0_IPI_INT_SHFT 45
-#define UVH_EVENT_OCCURRED0_IPI_INT_MASK 0x0000200000000000UL
-#define UVH_EVENT_OCCURRED0_EXTIO_INT0_SHFT 46
-#define UVH_EVENT_OCCURRED0_EXTIO_INT0_MASK 0x0000400000000000UL
-#define UVH_EVENT_OCCURRED0_EXTIO_INT1_SHFT 47
-#define UVH_EVENT_OCCURRED0_EXTIO_INT1_MASK 0x0000800000000000UL
-#define UVH_EVENT_OCCURRED0_EXTIO_INT2_SHFT 48
-#define UVH_EVENT_OCCURRED0_EXTIO_INT2_MASK 0x0001000000000000UL
-#define UVH_EVENT_OCCURRED0_EXTIO_INT3_SHFT 49
-#define UVH_EVENT_OCCURRED0_EXTIO_INT3_MASK 0x0002000000000000UL
-#define UVH_EVENT_OCCURRED0_PROFILE_INT_SHFT 50
-#define UVH_EVENT_OCCURRED0_PROFILE_INT_MASK 0x0004000000000000UL
-#define UVH_EVENT_OCCURRED0_RTC0_SHFT 51
-#define UVH_EVENT_OCCURRED0_RTC0_MASK 0x0008000000000000UL
-#define UVH_EVENT_OCCURRED0_RTC1_SHFT 52
-#define UVH_EVENT_OCCURRED0_RTC1_MASK 0x0010000000000000UL
-#define UVH_EVENT_OCCURRED0_RTC2_SHFT 53
-#define UVH_EVENT_OCCURRED0_RTC2_MASK 0x0020000000000000UL
-#define UVH_EVENT_OCCURRED0_RTC3_SHFT 54
-#define UVH_EVENT_OCCURRED0_RTC3_MASK 0x0040000000000000UL
-#define UVH_EVENT_OCCURRED0_BAU_DATA_SHFT 55
-#define UVH_EVENT_OCCURRED0_BAU_DATA_MASK 0x0080000000000000UL
-#define UVH_EVENT_OCCURRED0_POWER_MANAGEMENT_REQ_SHFT 56
-#define UVH_EVENT_OCCURRED0_POWER_MANAGEMENT_REQ_MASK 0x0100000000000000UL
-union uvh_event_occurred0_u {
-    unsigned long	v;
-    struct uvh_event_occurred0_s {
-	unsigned long	lb_hcerr             :  1;  /* RW, W1C */
-	unsigned long	gr0_hcerr            :  1;  /* RW, W1C */
-	unsigned long	gr1_hcerr            :  1;  /* RW, W1C */
-	unsigned long	lh_hcerr             :  1;  /* RW, W1C */
-	unsigned long	rh_hcerr             :  1;  /* RW, W1C */
-	unsigned long	xn_hcerr             :  1;  /* RW, W1C */
-	unsigned long	si_hcerr             :  1;  /* RW, W1C */
-	unsigned long	lb_aoerr0            :  1;  /* RW, W1C */
-	unsigned long	gr0_aoerr0           :  1;  /* RW, W1C */
-	unsigned long	gr1_aoerr0           :  1;  /* RW, W1C */
-	unsigned long	lh_aoerr0            :  1;  /* RW, W1C */
-	unsigned long	rh_aoerr0            :  1;  /* RW, W1C */
-	unsigned long	xn_aoerr0            :  1;  /* RW, W1C */
-	unsigned long	si_aoerr0            :  1;  /* RW, W1C */
-	unsigned long	lb_aoerr1            :  1;  /* RW, W1C */
-	unsigned long	gr0_aoerr1           :  1;  /* RW, W1C */
-	unsigned long	gr1_aoerr1           :  1;  /* RW, W1C */
-	unsigned long	lh_aoerr1            :  1;  /* RW, W1C */
-	unsigned long	rh_aoerr1            :  1;  /* RW, W1C */
-	unsigned long	xn_aoerr1            :  1;  /* RW, W1C */
-	unsigned long	si_aoerr1            :  1;  /* RW, W1C */
-	unsigned long	rh_vpi_int           :  1;  /* RW, W1C */
-	unsigned long	system_shutdown_int  :  1;  /* RW, W1C */
-	unsigned long	lb_irq_int_0         :  1;  /* RW, W1C */
-	unsigned long	lb_irq_int_1         :  1;  /* RW, W1C */
-	unsigned long	lb_irq_int_2         :  1;  /* RW, W1C */
-	unsigned long	lb_irq_int_3         :  1;  /* RW, W1C */
-	unsigned long	lb_irq_int_4         :  1;  /* RW, W1C */
-	unsigned long	lb_irq_int_5         :  1;  /* RW, W1C */
-	unsigned long	lb_irq_int_6         :  1;  /* RW, W1C */
-	unsigned long	lb_irq_int_7         :  1;  /* RW, W1C */
-	unsigned long	lb_irq_int_8         :  1;  /* RW, W1C */
-	unsigned long	lb_irq_int_9         :  1;  /* RW, W1C */
-	unsigned long	lb_irq_int_10        :  1;  /* RW, W1C */
-	unsigned long	lb_irq_int_11        :  1;  /* RW, W1C */
-	unsigned long	lb_irq_int_12        :  1;  /* RW, W1C */
-	unsigned long	lb_irq_int_13        :  1;  /* RW, W1C */
-	unsigned long	lb_irq_int_14        :  1;  /* RW, W1C */
-	unsigned long	lb_irq_int_15        :  1;  /* RW, W1C */
-	unsigned long	l1_nmi_int           :  1;  /* RW, W1C */
-	unsigned long	stop_clock           :  1;  /* RW, W1C */
-	unsigned long	asic_to_l1           :  1;  /* RW, W1C */
-	unsigned long	l1_to_asic           :  1;  /* RW, W1C */
-	unsigned long	ltc_int              :  1;  /* RW, W1C */
-	unsigned long	la_seq_trigger       :  1;  /* RW, W1C */
-	unsigned long	ipi_int              :  1;  /* RW, W1C */
-	unsigned long	extio_int0           :  1;  /* RW, W1C */
-	unsigned long	extio_int1           :  1;  /* RW, W1C */
-	unsigned long	extio_int2           :  1;  /* RW, W1C */
-	unsigned long	extio_int3           :  1;  /* RW, W1C */
-	unsigned long	profile_int          :  1;  /* RW, W1C */
-	unsigned long	rtc0                 :  1;  /* RW, W1C */
-	unsigned long	rtc1                 :  1;  /* RW, W1C */
-	unsigned long	rtc2                 :  1;  /* RW, W1C */
-	unsigned long	rtc3                 :  1;  /* RW, W1C */
-	unsigned long	bau_data             :  1;  /* RW, W1C */
-	unsigned long	power_management_req :  1;  /* RW, W1C */
-	unsigned long	rsvd_57_63           :  7;  /*    */
-    } s;
-};
-
-/* ========================================================================= */
-/*                        UVH_EVENT_OCCURRED0_ALIAS                          */
-/* ========================================================================= */
-#define UVH_EVENT_OCCURRED0_ALIAS 0x0000000000070008UL
-#define UVH_EVENT_OCCURRED0_ALIAS_32 0x005f0
-
-/* ========================================================================= */
-/*                         UVH_GR0_TLB_INT0_CONFIG                           */
-/* ========================================================================= */
-#define UVH_GR0_TLB_INT0_CONFIG 0x61b00UL
-
-#define UVH_GR0_TLB_INT0_CONFIG_VECTOR_SHFT 0
-#define UVH_GR0_TLB_INT0_CONFIG_VECTOR_MASK 0x00000000000000ffUL
-#define UVH_GR0_TLB_INT0_CONFIG_DM_SHFT 8
-#define UVH_GR0_TLB_INT0_CONFIG_DM_MASK 0x0000000000000700UL
-#define UVH_GR0_TLB_INT0_CONFIG_DESTMODE_SHFT 11
-#define UVH_GR0_TLB_INT0_CONFIG_DESTMODE_MASK 0x0000000000000800UL
-#define UVH_GR0_TLB_INT0_CONFIG_STATUS_SHFT 12
-#define UVH_GR0_TLB_INT0_CONFIG_STATUS_MASK 0x0000000000001000UL
-#define UVH_GR0_TLB_INT0_CONFIG_P_SHFT 13
-#define UVH_GR0_TLB_INT0_CONFIG_P_MASK 0x0000000000002000UL
-#define UVH_GR0_TLB_INT0_CONFIG_T_SHFT 15
-#define UVH_GR0_TLB_INT0_CONFIG_T_MASK 0x0000000000008000UL
-#define UVH_GR0_TLB_INT0_CONFIG_M_SHFT 16
-#define UVH_GR0_TLB_INT0_CONFIG_M_MASK 0x0000000000010000UL
-#define UVH_GR0_TLB_INT0_CONFIG_APIC_ID_SHFT 32
-#define UVH_GR0_TLB_INT0_CONFIG_APIC_ID_MASK 0xffffffff00000000UL
-
-union uvh_gr0_tlb_int0_config_u {
-    unsigned long	v;
-    struct uvh_gr0_tlb_int0_config_s {
-	unsigned long	vector_  :  8;  /* RW */
-	unsigned long	dm       :  3;  /* RW */
-	unsigned long	destmode :  1;  /* RW */
-	unsigned long	status   :  1;  /* RO */
-	unsigned long	p        :  1;  /* RO */
-	unsigned long	rsvd_14  :  1;  /*    */
-	unsigned long	t        :  1;  /* RO */
-	unsigned long	m        :  1;  /* RW */
-	unsigned long	rsvd_17_31: 15;  /*    */
-	unsigned long	apic_id  : 32;  /* RW */
-    } s;
-};
-
-/* ========================================================================= */
-/*                         UVH_GR0_TLB_INT1_CONFIG                           */
-/* ========================================================================= */
-#define UVH_GR0_TLB_INT1_CONFIG 0x61b40UL
-
-#define UVH_GR0_TLB_INT1_CONFIG_VECTOR_SHFT 0
-#define UVH_GR0_TLB_INT1_CONFIG_VECTOR_MASK 0x00000000000000ffUL
-#define UVH_GR0_TLB_INT1_CONFIG_DM_SHFT 8
-#define UVH_GR0_TLB_INT1_CONFIG_DM_MASK 0x0000000000000700UL
-#define UVH_GR0_TLB_INT1_CONFIG_DESTMODE_SHFT 11
-#define UVH_GR0_TLB_INT1_CONFIG_DESTMODE_MASK 0x0000000000000800UL
-#define UVH_GR0_TLB_INT1_CONFIG_STATUS_SHFT 12
-#define UVH_GR0_TLB_INT1_CONFIG_STATUS_MASK 0x0000000000001000UL
-#define UVH_GR0_TLB_INT1_CONFIG_P_SHFT 13
-#define UVH_GR0_TLB_INT1_CONFIG_P_MASK 0x0000000000002000UL
-#define UVH_GR0_TLB_INT1_CONFIG_T_SHFT 15
-#define UVH_GR0_TLB_INT1_CONFIG_T_MASK 0x0000000000008000UL
-#define UVH_GR0_TLB_INT1_CONFIG_M_SHFT 16
-#define UVH_GR0_TLB_INT1_CONFIG_M_MASK 0x0000000000010000UL
-#define UVH_GR0_TLB_INT1_CONFIG_APIC_ID_SHFT 32
-#define UVH_GR0_TLB_INT1_CONFIG_APIC_ID_MASK 0xffffffff00000000UL
-
-union uvh_gr0_tlb_int1_config_u {
-    unsigned long	v;
-    struct uvh_gr0_tlb_int1_config_s {
-	unsigned long	vector_  :  8;  /* RW */
-	unsigned long	dm       :  3;  /* RW */
-	unsigned long	destmode :  1;  /* RW */
-	unsigned long	status   :  1;  /* RO */
-	unsigned long	p        :  1;  /* RO */
-	unsigned long	rsvd_14  :  1;  /*    */
-	unsigned long	t        :  1;  /* RO */
-	unsigned long	m        :  1;  /* RW */
-	unsigned long	rsvd_17_31: 15;  /*    */
-	unsigned long	apic_id  : 32;  /* RW */
-    } s;
-};
-
-/* ========================================================================= */
-/*                         UVH_GR1_TLB_INT0_CONFIG                           */
-/* ========================================================================= */
-#define UVH_GR1_TLB_INT0_CONFIG 0x61f00UL
-
-#define UVH_GR1_TLB_INT0_CONFIG_VECTOR_SHFT 0
-#define UVH_GR1_TLB_INT0_CONFIG_VECTOR_MASK 0x00000000000000ffUL
-#define UVH_GR1_TLB_INT0_CONFIG_DM_SHFT 8
-#define UVH_GR1_TLB_INT0_CONFIG_DM_MASK 0x0000000000000700UL
-#define UVH_GR1_TLB_INT0_CONFIG_DESTMODE_SHFT 11
-#define UVH_GR1_TLB_INT0_CONFIG_DESTMODE_MASK 0x0000000000000800UL
-#define UVH_GR1_TLB_INT0_CONFIG_STATUS_SHFT 12
-#define UVH_GR1_TLB_INT0_CONFIG_STATUS_MASK 0x0000000000001000UL
-#define UVH_GR1_TLB_INT0_CONFIG_P_SHFT 13
-#define UVH_GR1_TLB_INT0_CONFIG_P_MASK 0x0000000000002000UL
-#define UVH_GR1_TLB_INT0_CONFIG_T_SHFT 15
-#define UVH_GR1_TLB_INT0_CONFIG_T_MASK 0x0000000000008000UL
-#define UVH_GR1_TLB_INT0_CONFIG_M_SHFT 16
-#define UVH_GR1_TLB_INT0_CONFIG_M_MASK 0x0000000000010000UL
-#define UVH_GR1_TLB_INT0_CONFIG_APIC_ID_SHFT 32
-#define UVH_GR1_TLB_INT0_CONFIG_APIC_ID_MASK 0xffffffff00000000UL
-
-union uvh_gr1_tlb_int0_config_u {
-    unsigned long	v;
-    struct uvh_gr1_tlb_int0_config_s {
-	unsigned long	vector_  :  8;  /* RW */
-	unsigned long	dm       :  3;  /* RW */
-	unsigned long	destmode :  1;  /* RW */
-	unsigned long	status   :  1;  /* RO */
-	unsigned long	p        :  1;  /* RO */
-	unsigned long	rsvd_14  :  1;  /*    */
-	unsigned long	t        :  1;  /* RO */
-	unsigned long	m        :  1;  /* RW */
-	unsigned long	rsvd_17_31: 15;  /*    */
-	unsigned long	apic_id  : 32;  /* RW */
-    } s;
-};
-
-/* ========================================================================= */
-/*                         UVH_GR1_TLB_INT1_CONFIG                           */
-/* ========================================================================= */
-#define UVH_GR1_TLB_INT1_CONFIG 0x61f40UL
-
-#define UVH_GR1_TLB_INT1_CONFIG_VECTOR_SHFT 0
-#define UVH_GR1_TLB_INT1_CONFIG_VECTOR_MASK 0x00000000000000ffUL
-#define UVH_GR1_TLB_INT1_CONFIG_DM_SHFT 8
-#define UVH_GR1_TLB_INT1_CONFIG_DM_MASK 0x0000000000000700UL
-#define UVH_GR1_TLB_INT1_CONFIG_DESTMODE_SHFT 11
-#define UVH_GR1_TLB_INT1_CONFIG_DESTMODE_MASK 0x0000000000000800UL
-#define UVH_GR1_TLB_INT1_CONFIG_STATUS_SHFT 12
-#define UVH_GR1_TLB_INT1_CONFIG_STATUS_MASK 0x0000000000001000UL
-#define UVH_GR1_TLB_INT1_CONFIG_P_SHFT 13
-#define UVH_GR1_TLB_INT1_CONFIG_P_MASK 0x0000000000002000UL
-#define UVH_GR1_TLB_INT1_CONFIG_T_SHFT 15
-#define UVH_GR1_TLB_INT1_CONFIG_T_MASK 0x0000000000008000UL
-#define UVH_GR1_TLB_INT1_CONFIG_M_SHFT 16
-#define UVH_GR1_TLB_INT1_CONFIG_M_MASK 0x0000000000010000UL
-#define UVH_GR1_TLB_INT1_CONFIG_APIC_ID_SHFT 32
-#define UVH_GR1_TLB_INT1_CONFIG_APIC_ID_MASK 0xffffffff00000000UL
-
-union uvh_gr1_tlb_int1_config_u {
-    unsigned long	v;
-    struct uvh_gr1_tlb_int1_config_s {
-	unsigned long	vector_  :  8;  /* RW */
-	unsigned long	dm       :  3;  /* RW */
-	unsigned long	destmode :  1;  /* RW */
-	unsigned long	status   :  1;  /* RO */
-	unsigned long	p        :  1;  /* RO */
-	unsigned long	rsvd_14  :  1;  /*    */
-	unsigned long	t        :  1;  /* RO */
-	unsigned long	m        :  1;  /* RW */
-	unsigned long	rsvd_17_31: 15;  /*    */
-	unsigned long	apic_id  : 32;  /* RW */
-    } s;
-};
-
-/* ========================================================================= */
-/*                               UVH_INT_CMPB                                */
-/* ========================================================================= */
-#define UVH_INT_CMPB 0x22080UL
-
-#define UVH_INT_CMPB_REAL_TIME_CMPB_SHFT 0
-#define UVH_INT_CMPB_REAL_TIME_CMPB_MASK 0x00ffffffffffffffUL
-
-union uvh_int_cmpb_u {
-    unsigned long	v;
-    struct uvh_int_cmpb_s {
-	unsigned long	real_time_cmpb : 56;  /* RW */
-	unsigned long	rsvd_56_63     :  8;  /*    */
-    } s;
-};
-
-/* ========================================================================= */
-/*                               UVH_INT_CMPC                                */
-/* ========================================================================= */
-#define UVH_INT_CMPC 0x22100UL
-
-#define UVH_INT_CMPC_REAL_TIME_CMPC_SHFT 0
-#define UVH_INT_CMPC_REAL_TIME_CMPC_MASK 0x00ffffffffffffffUL
-
-union uvh_int_cmpc_u {
-    unsigned long	v;
-    struct uvh_int_cmpc_s {
-	unsigned long	real_time_cmpc : 56;  /* RW */
-	unsigned long	rsvd_56_63     :  8;  /*    */
-    } s;
-};
-
-/* ========================================================================= */
-/*                               UVH_INT_CMPD                                */
-/* ========================================================================= */
-#define UVH_INT_CMPD 0x22180UL
-
-#define UVH_INT_CMPD_REAL_TIME_CMPD_SHFT 0
-#define UVH_INT_CMPD_REAL_TIME_CMPD_MASK 0x00ffffffffffffffUL
-
-union uvh_int_cmpd_u {
-    unsigned long	v;
-    struct uvh_int_cmpd_s {
-	unsigned long	real_time_cmpd : 56;  /* RW */
-	unsigned long	rsvd_56_63     :  8;  /*    */
-    } s;
-};
-
-/* ========================================================================= */
-/*                               UVH_NODE_ID                                 */
-/* ========================================================================= */
-#define UVH_NODE_ID 0x0UL
-
-#define UVH_NODE_ID_FORCE1_SHFT 0
-#define UVH_NODE_ID_FORCE1_MASK 0x0000000000000001UL
-#define UVH_NODE_ID_MANUFACTURER_SHFT 1
-#define UVH_NODE_ID_MANUFACTURER_MASK 0x0000000000000ffeUL
-#define UVH_NODE_ID_PART_NUMBER_SHFT 12
-#define UVH_NODE_ID_PART_NUMBER_MASK 0x000000000ffff000UL
-#define UVH_NODE_ID_REVISION_SHFT 28
-#define UVH_NODE_ID_REVISION_MASK 0x00000000f0000000UL
-#define UVH_NODE_ID_NODE_ID_SHFT 32
-#define UVH_NODE_ID_NODE_ID_MASK 0x00007fff00000000UL
-#define UVH_NODE_ID_NODES_PER_BIT_SHFT 48
-#define UVH_NODE_ID_NODES_PER_BIT_MASK 0x007f000000000000UL
-#define UVH_NODE_ID_NI_PORT_SHFT 56
-#define UVH_NODE_ID_NI_PORT_MASK 0x0f00000000000000UL
-
-union uvh_node_id_u {
-    unsigned long	v;
-    struct uvh_node_id_s {
-	unsigned long	force1        :  1;  /* RO */
-	unsigned long	manufacturer  : 11;  /* RO */
-	unsigned long	part_number   : 16;  /* RO */
-	unsigned long	revision      :  4;  /* RO */
-	unsigned long	node_id       : 15;  /* RW */
-	unsigned long	rsvd_47       :  1;  /*    */
-	unsigned long	nodes_per_bit :  7;  /* RW */
-	unsigned long	rsvd_55       :  1;  /*    */
-	unsigned long	ni_port       :  4;  /* RO */
-	unsigned long	rsvd_60_63    :  4;  /*    */
-    } s;
-};
-
-/* ========================================================================= */
-/*                UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_0_MMR                  */
-/* ========================================================================= */
-#define UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_0_MMR 0x16000d0UL
-
-#define UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_0_MMR_DEST_BASE_SHFT 24
-#define UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_0_MMR_DEST_BASE_MASK 0x00003fffff000000UL
-
-union uvh_rh_gam_alias210_redirect_config_0_mmr_u {
-    unsigned long	v;
-    struct uvh_rh_gam_alias210_redirect_config_0_mmr_s {
-	unsigned long	rsvd_0_23 : 24;  /*    */
-	unsigned long	dest_base : 22;  /* RW */
-	unsigned long	rsvd_46_63: 18;  /*    */
-    } s;
-};
-
-/* ========================================================================= */
-/*                UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_1_MMR                  */
-/* ========================================================================= */
-#define UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_1_MMR 0x16000e0UL
-
-#define UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_1_MMR_DEST_BASE_SHFT 24
-#define UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_1_MMR_DEST_BASE_MASK 0x00003fffff000000UL
-
-union uvh_rh_gam_alias210_redirect_config_1_mmr_u {
-    unsigned long	v;
-    struct uvh_rh_gam_alias210_redirect_config_1_mmr_s {
-	unsigned long	rsvd_0_23 : 24;  /*    */
-	unsigned long	dest_base : 22;  /* RW */
-	unsigned long	rsvd_46_63: 18;  /*    */
-    } s;
-};
-
-/* ========================================================================= */
-/*                UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_2_MMR                  */
-/* ========================================================================= */
-#define UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_2_MMR 0x16000f0UL
-
-#define UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_2_MMR_DEST_BASE_SHFT 24
-#define UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_2_MMR_DEST_BASE_MASK 0x00003fffff000000UL
-
-union uvh_rh_gam_alias210_redirect_config_2_mmr_u {
-    unsigned long	v;
-    struct uvh_rh_gam_alias210_redirect_config_2_mmr_s {
-	unsigned long	rsvd_0_23 : 24;  /*    */
-	unsigned long	dest_base : 22;  /* RW */
-	unsigned long	rsvd_46_63: 18;  /*    */
-    } s;
-};
-
-/* ========================================================================= */
-/*                    UVH_RH_GAM_GRU_OVERLAY_CONFIG_MMR                      */
-/* ========================================================================= */
-#define UVH_RH_GAM_GRU_OVERLAY_CONFIG_MMR 0x1600010UL
-
-#define UVH_RH_GAM_GRU_OVERLAY_CONFIG_MMR_BASE_SHFT 28
-#define UVH_RH_GAM_GRU_OVERLAY_CONFIG_MMR_BASE_MASK 0x00003ffff0000000UL
-#define UVH_RH_GAM_GRU_OVERLAY_CONFIG_MMR_GR4_SHFT 48
-#define UVH_RH_GAM_GRU_OVERLAY_CONFIG_MMR_GR4_MASK 0x0001000000000000UL
-#define UVH_RH_GAM_GRU_OVERLAY_CONFIG_MMR_N_GRU_SHFT 52
-#define UVH_RH_GAM_GRU_OVERLAY_CONFIG_MMR_N_GRU_MASK 0x00f0000000000000UL
-#define UVH_RH_GAM_GRU_OVERLAY_CONFIG_MMR_ENABLE_SHFT 63
-#define UVH_RH_GAM_GRU_OVERLAY_CONFIG_MMR_ENABLE_MASK 0x8000000000000000UL
-
-union uvh_rh_gam_gru_overlay_config_mmr_u {
-    unsigned long	v;
-    struct uvh_rh_gam_gru_overlay_config_mmr_s {
-	unsigned long	rsvd_0_27: 28;  /*    */
-	unsigned long	base   : 18;  /* RW */
-	unsigned long	rsvd_46_47:  2;  /*    */
-	unsigned long	gr4    :  1;  /* RW */
-	unsigned long	rsvd_49_51:  3;  /*    */
-	unsigned long	n_gru  :  4;  /* RW */
-	unsigned long	rsvd_56_62:  7;  /*    */
-	unsigned long	enable :  1;  /* RW */
-    } s;
-};
-
-/* ========================================================================= */
-/*                    UVH_RH_GAM_MMR_OVERLAY_CONFIG_MMR                      */
-/* ========================================================================= */
-#define UVH_RH_GAM_MMR_OVERLAY_CONFIG_MMR 0x1600028UL
-
-#define UVH_RH_GAM_MMR_OVERLAY_CONFIG_MMR_BASE_SHFT 26
-#define UVH_RH_GAM_MMR_OVERLAY_CONFIG_MMR_BASE_MASK 0x00003ffffc000000UL
-#define UVH_RH_GAM_MMR_OVERLAY_CONFIG_MMR_DUAL_HUB_SHFT 46
-#define UVH_RH_GAM_MMR_OVERLAY_CONFIG_MMR_DUAL_HUB_MASK 0x0000400000000000UL
-#define UVH_RH_GAM_MMR_OVERLAY_CONFIG_MMR_ENABLE_SHFT 63
-#define UVH_RH_GAM_MMR_OVERLAY_CONFIG_MMR_ENABLE_MASK 0x8000000000000000UL
-
-union uvh_rh_gam_mmr_overlay_config_mmr_u {
-    unsigned long	v;
-    struct uvh_rh_gam_mmr_overlay_config_mmr_s {
-	unsigned long	rsvd_0_25: 26;  /*    */
-	unsigned long	base     : 20;  /* RW */
-	unsigned long	dual_hub :  1;  /* RW */
-	unsigned long	rsvd_47_62: 16;  /*    */
-	unsigned long	enable   :  1;  /* RW */
-    } s;
-};
-
-/* ========================================================================= */
-/*                                 UVH_RTC                                   */
-/* ========================================================================= */
-#define UVH_RTC 0x340000UL
-
-#define UVH_RTC_REAL_TIME_CLOCK_SHFT 0
-#define UVH_RTC_REAL_TIME_CLOCK_MASK 0x00ffffffffffffffUL
-
-union uvh_rtc_u {
-    unsigned long	v;
-    struct uvh_rtc_s {
-	unsigned long	real_time_clock : 56;  /* RW */
-	unsigned long	rsvd_56_63      :  8;  /*    */
-    } s;
-};
-
-/* ========================================================================= */
-/*                           UVH_RTC1_INT_CONFIG                             */
-/* ========================================================================= */
-#define UVH_RTC1_INT_CONFIG 0x615c0UL
-
-#define UVH_RTC1_INT_CONFIG_VECTOR_SHFT 0
-#define UVH_RTC1_INT_CONFIG_VECTOR_MASK 0x00000000000000ffUL
-#define UVH_RTC1_INT_CONFIG_DM_SHFT 8
-#define UVH_RTC1_INT_CONFIG_DM_MASK 0x0000000000000700UL
-#define UVH_RTC1_INT_CONFIG_DESTMODE_SHFT 11
-#define UVH_RTC1_INT_CONFIG_DESTMODE_MASK 0x0000000000000800UL
-#define UVH_RTC1_INT_CONFIG_STATUS_SHFT 12
-#define UVH_RTC1_INT_CONFIG_STATUS_MASK 0x0000000000001000UL
-#define UVH_RTC1_INT_CONFIG_P_SHFT 13
-#define UVH_RTC1_INT_CONFIG_P_MASK 0x0000000000002000UL
-#define UVH_RTC1_INT_CONFIG_T_SHFT 15
-#define UVH_RTC1_INT_CONFIG_T_MASK 0x0000000000008000UL
-#define UVH_RTC1_INT_CONFIG_M_SHFT 16
-#define UVH_RTC1_INT_CONFIG_M_MASK 0x0000000000010000UL
-#define UVH_RTC1_INT_CONFIG_APIC_ID_SHFT 32
-#define UVH_RTC1_INT_CONFIG_APIC_ID_MASK 0xffffffff00000000UL
-
-union uvh_rtc1_int_config_u {
-    unsigned long	v;
-    struct uvh_rtc1_int_config_s {
-	unsigned long	vector_  :  8;  /* RW */
-	unsigned long	dm       :  3;  /* RW */
-	unsigned long	destmode :  1;  /* RW */
-	unsigned long	status   :  1;  /* RO */
-	unsigned long	p        :  1;  /* RO */
-	unsigned long	rsvd_14  :  1;  /*    */
-	unsigned long	t        :  1;  /* RO */
-	unsigned long	m        :  1;  /* RW */
-	unsigned long	rsvd_17_31: 15;  /*    */
-	unsigned long	apic_id  : 32;  /* RW */
-    } s;
-};
-
-/* ========================================================================= */
-/*                           UVH_RTC2_INT_CONFIG                             */
-/* ========================================================================= */
-#define UVH_RTC2_INT_CONFIG 0x61600UL
-
-#define UVH_RTC2_INT_CONFIG_VECTOR_SHFT 0
-#define UVH_RTC2_INT_CONFIG_VECTOR_MASK 0x00000000000000ffUL
-#define UVH_RTC2_INT_CONFIG_DM_SHFT 8
-#define UVH_RTC2_INT_CONFIG_DM_MASK 0x0000000000000700UL
-#define UVH_RTC2_INT_CONFIG_DESTMODE_SHFT 11
-#define UVH_RTC2_INT_CONFIG_DESTMODE_MASK 0x0000000000000800UL
-#define UVH_RTC2_INT_CONFIG_STATUS_SHFT 12
-#define UVH_RTC2_INT_CONFIG_STATUS_MASK 0x0000000000001000UL
-#define UVH_RTC2_INT_CONFIG_P_SHFT 13
-#define UVH_RTC2_INT_CONFIG_P_MASK 0x0000000000002000UL
-#define UVH_RTC2_INT_CONFIG_T_SHFT 15
-#define UVH_RTC2_INT_CONFIG_T_MASK 0x0000000000008000UL
-#define UVH_RTC2_INT_CONFIG_M_SHFT 16
-#define UVH_RTC2_INT_CONFIG_M_MASK 0x0000000000010000UL
-#define UVH_RTC2_INT_CONFIG_APIC_ID_SHFT 32
-#define UVH_RTC2_INT_CONFIG_APIC_ID_MASK 0xffffffff00000000UL
-
-union uvh_rtc2_int_config_u {
-    unsigned long	v;
-    struct uvh_rtc2_int_config_s {
-	unsigned long	vector_  :  8;  /* RW */
-	unsigned long	dm       :  3;  /* RW */
-	unsigned long	destmode :  1;  /* RW */
-	unsigned long	status   :  1;  /* RO */
-	unsigned long	p        :  1;  /* RO */
-	unsigned long	rsvd_14  :  1;  /*    */
-	unsigned long	t        :  1;  /* RO */
-	unsigned long	m        :  1;  /* RW */
-	unsigned long	rsvd_17_31: 15;  /*    */
-	unsigned long	apic_id  : 32;  /* RW */
-    } s;
-};
-
-/* ========================================================================= */
-/*                           UVH_RTC3_INT_CONFIG                             */
-/* ========================================================================= */
-#define UVH_RTC3_INT_CONFIG 0x61640UL
-
-#define UVH_RTC3_INT_CONFIG_VECTOR_SHFT 0
-#define UVH_RTC3_INT_CONFIG_VECTOR_MASK 0x00000000000000ffUL
-#define UVH_RTC3_INT_CONFIG_DM_SHFT 8
-#define UVH_RTC3_INT_CONFIG_DM_MASK 0x0000000000000700UL
-#define UVH_RTC3_INT_CONFIG_DESTMODE_SHFT 11
-#define UVH_RTC3_INT_CONFIG_DESTMODE_MASK 0x0000000000000800UL
-#define UVH_RTC3_INT_CONFIG_STATUS_SHFT 12
-#define UVH_RTC3_INT_CONFIG_STATUS_MASK 0x0000000000001000UL
-#define UVH_RTC3_INT_CONFIG_P_SHFT 13
-#define UVH_RTC3_INT_CONFIG_P_MASK 0x0000000000002000UL
-#define UVH_RTC3_INT_CONFIG_T_SHFT 15
-#define UVH_RTC3_INT_CONFIG_T_MASK 0x0000000000008000UL
-#define UVH_RTC3_INT_CONFIG_M_SHFT 16
-#define UVH_RTC3_INT_CONFIG_M_MASK 0x0000000000010000UL
-#define UVH_RTC3_INT_CONFIG_APIC_ID_SHFT 32
-#define UVH_RTC3_INT_CONFIG_APIC_ID_MASK 0xffffffff00000000UL
-
-union uvh_rtc3_int_config_u {
-    unsigned long	v;
-    struct uvh_rtc3_int_config_s {
-	unsigned long	vector_  :  8;  /* RW */
-	unsigned long	dm       :  3;  /* RW */
-	unsigned long	destmode :  1;  /* RW */
-	unsigned long	status   :  1;  /* RO */
-	unsigned long	p        :  1;  /* RO */
-	unsigned long	rsvd_14  :  1;  /*    */
-	unsigned long	t        :  1;  /* RO */
-	unsigned long	m        :  1;  /* RW */
-	unsigned long	rsvd_17_31: 15;  /*    */
-	unsigned long	apic_id  : 32;  /* RW */
-    } s;
-};
-
-/* ========================================================================= */
-/*                            UVH_RTC_INC_RATIO                              */
-/* ========================================================================= */
-#define UVH_RTC_INC_RATIO 0x350000UL
-
-#define UVH_RTC_INC_RATIO_FRACTION_SHFT 0
-#define UVH_RTC_INC_RATIO_FRACTION_MASK 0x00000000000fffffUL
-#define UVH_RTC_INC_RATIO_RATIO_SHFT 20
-#define UVH_RTC_INC_RATIO_RATIO_MASK 0x0000000000700000UL
-
-union uvh_rtc_inc_ratio_u {
-    unsigned long	v;
-    struct uvh_rtc_inc_ratio_s {
-	unsigned long	fraction : 20;  /* RW */
-	unsigned long	ratio    :  3;  /* RW */
-	unsigned long	rsvd_23_63: 41;  /*    */
-    } s;
-};
-
-/* ========================================================================= */
-/*                          UVH_SI_ADDR_MAP_CONFIG                           */
-/* ========================================================================= */
-#define UVH_SI_ADDR_MAP_CONFIG 0xc80000UL
-
-#define UVH_SI_ADDR_MAP_CONFIG_M_SKT_SHFT 0
-#define UVH_SI_ADDR_MAP_CONFIG_M_SKT_MASK 0x000000000000003fUL
-#define UVH_SI_ADDR_MAP_CONFIG_N_SKT_SHFT 8
-#define UVH_SI_ADDR_MAP_CONFIG_N_SKT_MASK 0x0000000000000f00UL
-
-union uvh_si_addr_map_config_u {
-    unsigned long	v;
-    struct uvh_si_addr_map_config_s {
-	unsigned long	m_skt :  6;  /* RW */
-	unsigned long	rsvd_6_7:  2;  /*    */
-	unsigned long	n_skt :  4;  /* RW */
-	unsigned long	rsvd_12_63: 52;  /*    */
-    } s;
-};
-
-/* ========================================================================= */
-/*                       UVH_SI_ALIAS0_OVERLAY_CONFIG                        */
-/* ========================================================================= */
-#define UVH_SI_ALIAS0_OVERLAY_CONFIG 0xc80008UL
-
-#define UVH_SI_ALIAS0_OVERLAY_CONFIG_BASE_SHFT 24
-#define UVH_SI_ALIAS0_OVERLAY_CONFIG_BASE_MASK 0x00000000ff000000UL
-#define UVH_SI_ALIAS0_OVERLAY_CONFIG_M_ALIAS_SHFT 48
-#define UVH_SI_ALIAS0_OVERLAY_CONFIG_M_ALIAS_MASK 0x001f000000000000UL
-#define UVH_SI_ALIAS0_OVERLAY_CONFIG_ENABLE_SHFT 63
-#define UVH_SI_ALIAS0_OVERLAY_CONFIG_ENABLE_MASK 0x8000000000000000UL
-
-union uvh_si_alias0_overlay_config_u {
-    unsigned long	v;
-    struct uvh_si_alias0_overlay_config_s {
-	unsigned long	rsvd_0_23: 24;  /*    */
-	unsigned long	base    :  8;  /* RW */
-	unsigned long	rsvd_32_47: 16;  /*    */
-	unsigned long	m_alias :  5;  /* RW */
-	unsigned long	rsvd_53_62: 10;  /*    */
-	unsigned long	enable  :  1;  /* RW */
-    } s;
-};
-
-/* ========================================================================= */
-/*                       UVH_SI_ALIAS1_OVERLAY_CONFIG                        */
-/* ========================================================================= */
-#define UVH_SI_ALIAS1_OVERLAY_CONFIG 0xc80010UL
-
-#define UVH_SI_ALIAS1_OVERLAY_CONFIG_BASE_SHFT 24
-#define UVH_SI_ALIAS1_OVERLAY_CONFIG_BASE_MASK 0x00000000ff000000UL
-#define UVH_SI_ALIAS1_OVERLAY_CONFIG_M_ALIAS_SHFT 48
-#define UVH_SI_ALIAS1_OVERLAY_CONFIG_M_ALIAS_MASK 0x001f000000000000UL
-#define UVH_SI_ALIAS1_OVERLAY_CONFIG_ENABLE_SHFT 63
-#define UVH_SI_ALIAS1_OVERLAY_CONFIG_ENABLE_MASK 0x8000000000000000UL
-
-union uvh_si_alias1_overlay_config_u {
-    unsigned long	v;
-    struct uvh_si_alias1_overlay_config_s {
-	unsigned long	rsvd_0_23: 24;  /*    */
-	unsigned long	base    :  8;  /* RW */
-	unsigned long	rsvd_32_47: 16;  /*    */
-	unsigned long	m_alias :  5;  /* RW */
-	unsigned long	rsvd_53_62: 10;  /*    */
-	unsigned long	enable  :  1;  /* RW */
-    } s;
-};
-
-/* ========================================================================= */
-/*                       UVH_SI_ALIAS2_OVERLAY_CONFIG                        */
-/* ========================================================================= */
-#define UVH_SI_ALIAS2_OVERLAY_CONFIG 0xc80018UL
-
-#define UVH_SI_ALIAS2_OVERLAY_CONFIG_BASE_SHFT 24
-#define UVH_SI_ALIAS2_OVERLAY_CONFIG_BASE_MASK 0x00000000ff000000UL
-#define UVH_SI_ALIAS2_OVERLAY_CONFIG_M_ALIAS_SHFT 48
-#define UVH_SI_ALIAS2_OVERLAY_CONFIG_M_ALIAS_MASK 0x001f000000000000UL
-#define UVH_SI_ALIAS2_OVERLAY_CONFIG_ENABLE_SHFT 63
-#define UVH_SI_ALIAS2_OVERLAY_CONFIG_ENABLE_MASK 0x8000000000000000UL
-
-union uvh_si_alias2_overlay_config_u {
-    unsigned long	v;
-    struct uvh_si_alias2_overlay_config_s {
-	unsigned long	rsvd_0_23: 24;  /*    */
-	unsigned long	base    :  8;  /* RW */
-	unsigned long	rsvd_32_47: 16;  /*    */
-	unsigned long	m_alias :  5;  /* RW */
-	unsigned long	rsvd_53_62: 10;  /*    */
-	unsigned long	enable  :  1;  /* RW */
-    } s;
-};
-
-
-#endif /* _ASM_IA64_UV_UV_MMRS_H */
diff --git a/arch/ia64/include/asm/vermagic.h b/arch/ia64/include/asm/vermagic.h
deleted file mode 100644
index 29c7424f4c25..000000000000
--- a/arch/ia64/include/asm/vermagic.h
+++ /dev/null
@@ -1,15 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-/*
- * Copyright (C) 2003 Hewlett-Packard Co
- *	David Mosberger-Tang <davidm@hpl.hp.com>
- */
-
-#ifndef _ASM_VERMAGIC_H
-#define _ASM_VERMAGIC_H
-
-#include <linux/stringify.h>
-
-#define MODULE_ARCH_VERMAGIC	"ia64" \
-	"gcc-" __stringify(__GNUC__) "." __stringify(__GNUC_MINOR__)
-
-#endif /* _ASM_VERMAGIC_H */
diff --git a/arch/ia64/include/asm/vga.h b/arch/ia64/include/asm/vga.h
deleted file mode 100644
index 64ce0b971a0a..000000000000
--- a/arch/ia64/include/asm/vga.h
+++ /dev/null
@@ -1,26 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-/*
- *	Access to VGA videoram
- *
- *	(c) 1998 Martin Mares <mj@ucw.cz>
- *	(c) 1999 Asit Mallick <asit.k.mallick@intel.com>
- *	(c) 1999 Don Dugger <don.dugger@intel.com>
- */
-
-#ifndef __ASM_IA64_VGA_H_
-#define __ASM_IA64_VGA_H_
-
-/*
- * On the PC, we can just recalculate addresses and then access the
- * videoram directly without any black magic.
- */
-
-extern unsigned long vga_console_iobase;
-extern unsigned long vga_console_membase;
-
-#define VGA_MAP_MEM(x,s)	((unsigned long) ioremap(vga_console_membase + (x), s))
-
-#define vga_readb(x)	(*(x))
-#define vga_writeb(x,y)	(*(y) = (x))
-
-#endif /* __ASM_IA64_VGA_H_ */
diff --git a/arch/ia64/include/asm/vmalloc.h b/arch/ia64/include/asm/vmalloc.h
deleted file mode 100644
index a2b51141ad28..000000000000
--- a/arch/ia64/include/asm/vmalloc.h
+++ /dev/null
@@ -1,4 +0,0 @@
-#ifndef _ASM_IA64_VMALLOC_H
-#define _ASM_IA64_VMALLOC_H
-
-#endif /* _ASM_IA64_VMALLOC_H */
diff --git a/arch/ia64/include/asm/xor.h b/arch/ia64/include/asm/xor.h
deleted file mode 100644
index 6785f70d3208..000000000000
--- a/arch/ia64/include/asm/xor.h
+++ /dev/null
@@ -1,30 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-or-later */
-/*
- * Optimized RAID-5 checksumming functions for IA-64.
- */
-
-
-extern void xor_ia64_2(unsigned long bytes, unsigned long * __restrict p1,
-		       const unsigned long * __restrict p2);
-extern void xor_ia64_3(unsigned long bytes, unsigned long * __restrict p1,
-		       const unsigned long * __restrict p2,
-		       const unsigned long * __restrict p3);
-extern void xor_ia64_4(unsigned long bytes, unsigned long * __restrict p1,
-		       const unsigned long * __restrict p2,
-		       const unsigned long * __restrict p3,
-		       const unsigned long * __restrict p4);
-extern void xor_ia64_5(unsigned long bytes, unsigned long * __restrict p1,
-		       const unsigned long * __restrict p2,
-		       const unsigned long * __restrict p3,
-		       const unsigned long * __restrict p4,
-		       const unsigned long * __restrict p5);
-
-static struct xor_block_template xor_block_ia64 = {
-	.name =	"ia64",
-	.do_2 =	xor_ia64_2,
-	.do_3 =	xor_ia64_3,
-	.do_4 =	xor_ia64_4,
-	.do_5 =	xor_ia64_5,
-};
-
-#define XOR_TRY_TEMPLATES	xor_speed(&xor_block_ia64)
diff --git a/arch/ia64/include/asm/xtp.h b/arch/ia64/include/asm/xtp.h
deleted file mode 100644
index 5bf1d70ad860..000000000000
--- a/arch/ia64/include/asm/xtp.h
+++ /dev/null
@@ -1,46 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _ASM_IA64_XTP_H
-#define _ASM_IA64_XTP_H
-
-#include <asm/io.h>
-
-#ifdef CONFIG_SMP
-
-#define XTP_OFFSET		0x1e0008
-
-#define SMP_IRQ_REDIRECTION	(1 << 0)
-#define SMP_IPI_REDIRECTION	(1 << 1)
-
-extern unsigned char smp_int_redirect;
-
-/*
- * XTP control functions:
- *	min_xtp   : route all interrupts to this CPU
- *	normal_xtp: nominal XTP value
- *	max_xtp   : never deliver interrupts to this CPU.
- */
-
-static inline void
-min_xtp (void)
-{
-	if (smp_int_redirect & SMP_IRQ_REDIRECTION)
-		writeb(0x00, ipi_base_addr + XTP_OFFSET); /* XTP to min */
-}
-
-static inline void
-normal_xtp (void)
-{
-	if (smp_int_redirect & SMP_IRQ_REDIRECTION)
-		writeb(0x08, ipi_base_addr + XTP_OFFSET); /* XTP normal */
-}
-
-static inline void
-max_xtp (void)
-{
-	if (smp_int_redirect & SMP_IRQ_REDIRECTION)
-		writeb(0x0f, ipi_base_addr + XTP_OFFSET); /* Set XTP to max */
-}
-
-#endif /* CONFIG_SMP */
-
-#endif /* _ASM_IA64_XTP_Hy */
diff --git a/arch/ia64/include/uapi/asm/Kbuild b/arch/ia64/include/uapi/asm/Kbuild
deleted file mode 100644
index 3a1341e3535a..000000000000
--- a/arch/ia64/include/uapi/asm/Kbuild
+++ /dev/null
@@ -1,2 +0,0 @@
-# SPDX-License-Identifier: GPL-2.0
-generated-y += unistd_64.h
diff --git a/arch/ia64/include/uapi/asm/auxvec.h b/arch/ia64/include/uapi/asm/auxvec.h
deleted file mode 100644
index 09969a5d2e0a..000000000000
--- a/arch/ia64/include/uapi/asm/auxvec.h
+++ /dev/null
@@ -1,14 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
-#ifndef _ASM_IA64_AUXVEC_H
-#define _ASM_IA64_AUXVEC_H
-
-/*
- * Architecture-neutral AT_ values are in the range 0-17.  Leave some room for more of
- * them, start the architecture-specific ones at 32.
- */
-#define AT_SYSINFO	32
-#define AT_SYSINFO_EHDR	33
-
-#define AT_VECTOR_SIZE_ARCH 2 /* entries in ARCH_DLINFO */
-
-#endif /* _ASM_IA64_AUXVEC_H */
diff --git a/arch/ia64/include/uapi/asm/bitsperlong.h b/arch/ia64/include/uapi/asm/bitsperlong.h
deleted file mode 100644
index 1146d55563db..000000000000
--- a/arch/ia64/include/uapi/asm/bitsperlong.h
+++ /dev/null
@@ -1,9 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
-#ifndef __ASM_IA64_BITSPERLONG_H
-#define __ASM_IA64_BITSPERLONG_H
-
-#define __BITS_PER_LONG 64
-
-#include <asm-generic/bitsperlong.h>
-
-#endif /* __ASM_IA64_BITSPERLONG_H */
diff --git a/arch/ia64/include/uapi/asm/break.h b/arch/ia64/include/uapi/asm/break.h
deleted file mode 100644
index 4ca110f0a94b..000000000000
--- a/arch/ia64/include/uapi/asm/break.h
+++ /dev/null
@@ -1,23 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
-#ifndef _ASM_IA64_BREAK_H
-#define _ASM_IA64_BREAK_H
-
-/*
- * IA-64 Linux break numbers.
- *
- * Copyright (C) 1999 Hewlett-Packard Co
- * Copyright (C) 1999 David Mosberger-Tang <davidm@hpl.hp.com>
- */
-
-/*
- * OS-specific debug break numbers:
- */
-#define __IA64_BREAK_KDB		0x80100
-#define __IA64_BREAK_KPROBE		0x81000 /* .. 0x81fff */
-
-/*
- * OS-specific break numbers:
- */
-#define __IA64_BREAK_SYSCALL		0x100000
-
-#endif /* _ASM_IA64_BREAK_H */
diff --git a/arch/ia64/include/uapi/asm/byteorder.h b/arch/ia64/include/uapi/asm/byteorder.h
deleted file mode 100644
index f85d0faaaf34..000000000000
--- a/arch/ia64/include/uapi/asm/byteorder.h
+++ /dev/null
@@ -1,7 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
-#ifndef _ASM_IA64_BYTEORDER_H
-#define _ASM_IA64_BYTEORDER_H
-
-#include <linux/byteorder/little_endian.h>
-
-#endif /* _ASM_IA64_BYTEORDER_H */
diff --git a/arch/ia64/include/uapi/asm/cmpxchg.h b/arch/ia64/include/uapi/asm/cmpxchg.h
deleted file mode 100644
index a59b5de6eec6..000000000000
--- a/arch/ia64/include/uapi/asm/cmpxchg.h
+++ /dev/null
@@ -1,138 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
-#ifndef _UAPI_ASM_IA64_CMPXCHG_H
-#define _UAPI_ASM_IA64_CMPXCHG_H
-
-/*
- * Compare/Exchange, forked from asm/intrinsics.h
- * which was:
- *
- *	Copyright (C) 2002-2003 Hewlett-Packard Co
- *	David Mosberger-Tang <davidm@hpl.hp.com>
- */
-
-#ifndef __ASSEMBLY__
-
-#include <linux/types.h>
-/* include compiler specific intrinsics */
-#include <asm/ia64regs.h>
-#include <asm/gcc_intrin.h>
-
-/*
- * This function doesn't exist, so you'll get a linker error if
- * something tries to do an invalid xchg().
- */
-extern void ia64_xchg_called_with_bad_pointer(void);
-
-#define __arch_xchg(x, ptr, size)					\
-({									\
-	unsigned long __xchg_result;					\
-									\
-	switch (size) {							\
-	case 1:								\
-		__xchg_result = ia64_xchg1((__u8 __force *)ptr, x);	\
-		break;							\
-									\
-	case 2:								\
-		__xchg_result = ia64_xchg2((__u16 __force *)ptr, x);	\
-		break;							\
-									\
-	case 4:								\
-		__xchg_result = ia64_xchg4((__u32 __force *)ptr, x);	\
-		break;							\
-									\
-	case 8:								\
-		__xchg_result = ia64_xchg8((__u64 __force *)ptr, x);	\
-		break;							\
-	default:							\
-		ia64_xchg_called_with_bad_pointer();			\
-	}								\
-	(__typeof__ (*(ptr)) __force) __xchg_result;			\
-})
-
-#ifndef __KERNEL__
-#define xchg(ptr, x)							\
-({(__typeof__(*(ptr))) __arch_xchg((unsigned long) (x), (ptr), sizeof(*(ptr)));})
-#endif
-
-/*
- * Atomic compare and exchange.  Compare OLD with MEM, if identical,
- * store NEW in MEM.  Return the initial value in MEM.  Success is
- * indicated by comparing RETURN with OLD.
- */
-
-/*
- * This function doesn't exist, so you'll get a linker error
- * if something tries to do an invalid cmpxchg().
- */
-extern long ia64_cmpxchg_called_with_bad_pointer(void);
-
-#define ia64_cmpxchg(sem, ptr, old, new, size)				\
-({									\
-	__u64 _o_, _r_;							\
-									\
-	switch (size) {							\
-	case 1:								\
-		_o_ = (__u8) (long __force) (old);			\
-		break;							\
-	case 2:								\
-		_o_ = (__u16) (long __force) (old);			\
-		break;							\
-	case 4:								\
-		_o_ = (__u32) (long __force) (old);			\
-		break;							\
-	case 8:								\
-		_o_ = (__u64) (long __force) (old);			\
-		break;							\
-	default:							\
-		break;							\
-	}								\
-	switch (size) {							\
-	case 1:								\
-		_r_ = ia64_cmpxchg1_##sem((__u8 __force *) ptr, new, _o_);	\
-		break;							\
-									\
-	case 2:								\
-		_r_ = ia64_cmpxchg2_##sem((__u16 __force *) ptr, new, _o_);	\
-		break;							\
-									\
-	case 4:								\
-		_r_ = ia64_cmpxchg4_##sem((__u32 __force *) ptr, new, _o_);	\
-		break;							\
-									\
-	case 8:								\
-		_r_ = ia64_cmpxchg8_##sem((__u64 __force *) ptr, new, _o_);	\
-		break;							\
-									\
-	default:							\
-		_r_ = ia64_cmpxchg_called_with_bad_pointer();		\
-		break;							\
-	}								\
-	(__typeof__(old) __force) _r_;					\
-})
-
-#define cmpxchg_acq(ptr, o, n)	\
-	ia64_cmpxchg(acq, (ptr), (o), (n), sizeof(*(ptr)))
-#define cmpxchg_rel(ptr, o, n)	\
-	ia64_cmpxchg(rel, (ptr), (o), (n), sizeof(*(ptr)))
-
-/*
- * Worse still - early processor implementations actually just ignored
- * the acquire/release and did a full fence all the time.  Unfortunately
- * this meant a lot of badly written code that used .acq when they really
- * wanted .rel became legacy out in the wild - so when we made a cpu
- * that strictly did the .acq or .rel ... all that code started breaking - so
- * we had to back-pedal and keep the "legacy" behavior of a full fence :-(
- */
-
-#ifndef __KERNEL__
-/* for compatibility with other platforms: */
-#define cmpxchg(ptr, o, n)	cmpxchg_acq((ptr), (o), (n))
-#define cmpxchg64(ptr, o, n)	cmpxchg_acq((ptr), (o), (n))
-
-#define cmpxchg_local		cmpxchg
-#define cmpxchg64_local		cmpxchg64
-#endif
-
-#endif /* !__ASSEMBLY__ */
-
-#endif /* _UAPI_ASM_IA64_CMPXCHG_H */
diff --git a/arch/ia64/include/uapi/asm/fcntl.h b/arch/ia64/include/uapi/asm/fcntl.h
deleted file mode 100644
index 7b95523efe5a..000000000000
--- a/arch/ia64/include/uapi/asm/fcntl.h
+++ /dev/null
@@ -1,15 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
-#ifndef _ASM_IA64_FCNTL_H
-#define _ASM_IA64_FCNTL_H
-/*
- * Modified 1998-2000
- *	David Mosberger-Tang <davidm@hpl.hp.com>, Hewlett-Packard Co.
- */
-
-#define force_o_largefile()	\
-		(personality(current->personality) != PER_LINUX32)
-
-#include <linux/personality.h>
-#include <asm-generic/fcntl.h>
-
-#endif /* _ASM_IA64_FCNTL_H */
diff --git a/arch/ia64/include/uapi/asm/fpu.h b/arch/ia64/include/uapi/asm/fpu.h
deleted file mode 100644
index 0df392982ce8..000000000000
--- a/arch/ia64/include/uapi/asm/fpu.h
+++ /dev/null
@@ -1,67 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
-#ifndef _ASM_IA64_FPU_H
-#define _ASM_IA64_FPU_H
-
-/*
- * Copyright (C) 1998, 1999, 2002, 2003 Hewlett-Packard Co
- *	David Mosberger-Tang <davidm@hpl.hp.com>
- */
-
-#include <linux/types.h>
-
-/* floating point status register: */
-#define FPSR_TRAP_VD	(1 << 0)	/* invalid op trap disabled */
-#define FPSR_TRAP_DD	(1 << 1)	/* denormal trap disabled */
-#define FPSR_TRAP_ZD	(1 << 2)	/* zero-divide trap disabled */
-#define FPSR_TRAP_OD	(1 << 3)	/* overflow trap disabled */
-#define FPSR_TRAP_UD	(1 << 4)	/* underflow trap disabled */
-#define FPSR_TRAP_ID	(1 << 5)	/* inexact trap disabled */
-#define FPSR_S0(x)	((x) <<  6)
-#define FPSR_S1(x)	((x) << 19)
-#define FPSR_S2(x)	(__IA64_UL(x) << 32)
-#define FPSR_S3(x)	(__IA64_UL(x) << 45)
-
-/* floating-point status field controls: */
-#define FPSF_FTZ	(1 << 0)		/* flush-to-zero */
-#define FPSF_WRE	(1 << 1)		/* widest-range exponent */
-#define FPSF_PC(x)	(((x) & 0x3) << 2)	/* precision control */
-#define FPSF_RC(x)	(((x) & 0x3) << 4)	/* rounding control */
-#define FPSF_TD		(1 << 6)		/* trap disabled */
-
-/* floating-point status field flags: */
-#define FPSF_V		(1 <<  7)		/* invalid operation flag */
-#define FPSF_D		(1 <<  8)		/* denormal/unnormal operand flag */
-#define FPSF_Z		(1 <<  9)		/* zero divide (IEEE) flag */
-#define FPSF_O		(1 << 10)		/* overflow (IEEE) flag */
-#define FPSF_U		(1 << 11)		/* underflow (IEEE) flag */
-#define FPSF_I		(1 << 12)		/* inexact (IEEE) flag) */
-
-/* floating-point rounding control: */
-#define FPRC_NEAREST	0x0
-#define FPRC_NEGINF	0x1
-#define FPRC_POSINF	0x2
-#define FPRC_TRUNC	0x3
-
-#define FPSF_DEFAULT	(FPSF_PC (0x3) | FPSF_RC (FPRC_NEAREST))
-
-/* This default value is the same as HP-UX uses.  Don't change it
-   without a very good reason.  */
-#define FPSR_DEFAULT	(FPSR_TRAP_VD | FPSR_TRAP_DD | FPSR_TRAP_ZD	\
-			 | FPSR_TRAP_OD | FPSR_TRAP_UD | FPSR_TRAP_ID	\
-			 | FPSR_S0 (FPSF_DEFAULT)			\
-			 | FPSR_S1 (FPSF_DEFAULT | FPSF_TD | FPSF_WRE)	\
-			 | FPSR_S2 (FPSF_DEFAULT | FPSF_TD)		\
-			 | FPSR_S3 (FPSF_DEFAULT | FPSF_TD))
-
-# ifndef __ASSEMBLY__
-
-struct ia64_fpreg {
-	union {
-		unsigned long bits[2];
-		long double __dummy;	/* force 16-byte alignment */
-	} u;
-};
-
-# endif /* __ASSEMBLY__ */
-
-#endif /* _ASM_IA64_FPU_H */
diff --git a/arch/ia64/include/uapi/asm/gcc_intrin.h b/arch/ia64/include/uapi/asm/gcc_intrin.h
deleted file mode 100644
index ecfa3eadb217..000000000000
--- a/arch/ia64/include/uapi/asm/gcc_intrin.h
+++ /dev/null
@@ -1,619 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
-/*
- *
- * Copyright (C) 2002,2003 Jun Nakajima <jun.nakajima@intel.com>
- * Copyright (C) 2002,2003 Suresh Siddha <suresh.b.siddha@intel.com>
- */
-#ifndef _UAPI_ASM_IA64_GCC_INTRIN_H
-#define _UAPI_ASM_IA64_GCC_INTRIN_H
-
-#include <linux/types.h>
-#include <linux/compiler.h>
-
-/* define this macro to get some asm stmts included in 'c' files */
-#define ASM_SUPPORTED
-
-/* Optimization barrier */
-/* The "volatile" is due to gcc bugs */
-#define ia64_barrier()	asm volatile ("":::"memory")
-
-#define ia64_stop()	asm volatile (";;"::)
-
-#define ia64_invala_gr(regnum)	asm volatile ("invala.e r%0" :: "i"(regnum))
-
-#define ia64_invala_fr(regnum)	asm volatile ("invala.e f%0" :: "i"(regnum))
-
-#define ia64_flushrs() asm volatile ("flushrs;;":::"memory")
-
-#define ia64_loadrs() asm volatile ("loadrs;;":::"memory")
-
-extern void ia64_bad_param_for_setreg (void);
-extern void ia64_bad_param_for_getreg (void);
-
-
-#define ia64_setreg(regnum, val)						\
-({										\
-	switch (regnum) {							\
-	    case _IA64_REG_PSR_L:						\
-		    asm volatile ("mov psr.l=%0" :: "r"(val) : "memory");	\
-		    break;							\
-	    case _IA64_REG_AR_KR0 ... _IA64_REG_AR_EC:				\
-		    asm volatile ("mov ar%0=%1" ::				\
-		    			  "i" (regnum - _IA64_REG_AR_KR0),	\
-					  "r"(val): "memory");			\
-		    break;							\
-	    case _IA64_REG_CR_DCR ... _IA64_REG_CR_LRR1:			\
-		    asm volatile ("mov cr%0=%1" ::				\
-				          "i" (regnum - _IA64_REG_CR_DCR),	\
-					  "r"(val): "memory" );			\
-		    break;							\
-	    case _IA64_REG_SP:							\
-		    asm volatile ("mov r12=%0" ::				\
-			    		  "r"(val): "memory");			\
-		    break;							\
-	    case _IA64_REG_GP:							\
-		    asm volatile ("mov gp=%0" :: "r"(val) : "memory");		\
-		break;								\
-	    default:								\
-		    ia64_bad_param_for_setreg();				\
-		    break;							\
-	}									\
-})
-
-#define ia64_getreg(regnum)							\
-({										\
-	__u64 ia64_intri_res;							\
-										\
-	switch (regnum) {							\
-	case _IA64_REG_GP:							\
-		asm volatile ("mov %0=gp" : "=r"(ia64_intri_res));		\
-		break;								\
-	case _IA64_REG_IP:							\
-		asm volatile ("mov %0=ip" : "=r"(ia64_intri_res));		\
-		break;								\
-	case _IA64_REG_PSR:							\
-		asm volatile ("mov %0=psr" : "=r"(ia64_intri_res));		\
-		break;								\
-	case _IA64_REG_TP:	/* for current() */				\
-		ia64_intri_res = ia64_r13;					\
-		break;								\
-	case _IA64_REG_AR_KR0 ... _IA64_REG_AR_EC:				\
-		asm volatile ("mov %0=ar%1" : "=r" (ia64_intri_res)		\
-				      : "i"(regnum - _IA64_REG_AR_KR0));	\
-		break;								\
-	case _IA64_REG_CR_DCR ... _IA64_REG_CR_LRR1:				\
-		asm volatile ("mov %0=cr%1" : "=r" (ia64_intri_res)		\
-				      : "i" (regnum - _IA64_REG_CR_DCR));	\
-		break;								\
-	case _IA64_REG_SP:							\
-		asm volatile ("mov %0=sp" : "=r" (ia64_intri_res));		\
-		break;								\
-	default:								\
-		ia64_bad_param_for_getreg();					\
-		break;								\
-	}									\
-	ia64_intri_res;								\
-})
-
-#define ia64_hint_pause 0
-
-#define ia64_hint(mode)						\
-({								\
-	switch (mode) {						\
-	case ia64_hint_pause:					\
-		asm volatile ("hint @pause" ::: "memory");	\
-		break;						\
-	}							\
-})
-
-
-/* Integer values for mux1 instruction */
-#define ia64_mux1_brcst 0
-#define ia64_mux1_mix   8
-#define ia64_mux1_shuf  9
-#define ia64_mux1_alt  10
-#define ia64_mux1_rev  11
-
-#define ia64_mux1(x, mode)							\
-({										\
-	__u64 ia64_intri_res;							\
-										\
-	switch (mode) {								\
-	case ia64_mux1_brcst:							\
-		asm ("mux1 %0=%1,@brcst" : "=r" (ia64_intri_res) : "r" (x));	\
-		break;								\
-	case ia64_mux1_mix:							\
-		asm ("mux1 %0=%1,@mix" : "=r" (ia64_intri_res) : "r" (x));	\
-		break;								\
-	case ia64_mux1_shuf:							\
-		asm ("mux1 %0=%1,@shuf" : "=r" (ia64_intri_res) : "r" (x));	\
-		break;								\
-	case ia64_mux1_alt:							\
-		asm ("mux1 %0=%1,@alt" : "=r" (ia64_intri_res) : "r" (x));	\
-		break;								\
-	case ia64_mux1_rev:							\
-		asm ("mux1 %0=%1,@rev" : "=r" (ia64_intri_res) : "r" (x));	\
-		break;								\
-	}									\
-	ia64_intri_res;								\
-})
-
-#if __GNUC__ >= 4 || (__GNUC__ == 3 && __GNUC_MINOR__ >= 4)
-# define ia64_popcnt(x)		__builtin_popcountl(x)
-#else
-# define ia64_popcnt(x)						\
-  ({								\
-	__u64 ia64_intri_res;					\
-	asm ("popcnt %0=%1" : "=r" (ia64_intri_res) : "r" (x));	\
-								\
-	ia64_intri_res;						\
-  })
-#endif
-
-#define ia64_getf_exp(x)					\
-({								\
-	long ia64_intri_res;					\
-								\
-	asm ("getf.exp %0=%1" : "=r"(ia64_intri_res) : "f"(x));	\
-								\
-	ia64_intri_res;						\
-})
-
-#define ia64_shrp(a, b, count)								\
-({											\
-	__u64 ia64_intri_res;								\
-	asm ("shrp %0=%1,%2,%3" : "=r"(ia64_intri_res) : "r"(a), "r"(b), "i"(count));	\
-	ia64_intri_res;									\
-})
-
-#define ia64_ldfs(regnum, x)					\
-({								\
-	register double __f__ asm ("f"#regnum);			\
-	asm volatile ("ldfs %0=[%1]" :"=f"(__f__): "r"(x));	\
-})
-
-#define ia64_ldfd(regnum, x)					\
-({								\
-	register double __f__ asm ("f"#regnum);			\
-	asm volatile ("ldfd %0=[%1]" :"=f"(__f__): "r"(x));	\
-})
-
-#define ia64_ldfe(regnum, x)					\
-({								\
-	register double __f__ asm ("f"#regnum);			\
-	asm volatile ("ldfe %0=[%1]" :"=f"(__f__): "r"(x));	\
-})
-
-#define ia64_ldf8(regnum, x)					\
-({								\
-	register double __f__ asm ("f"#regnum);			\
-	asm volatile ("ldf8 %0=[%1]" :"=f"(__f__): "r"(x));	\
-})
-
-#define ia64_ldf_fill(regnum, x)				\
-({								\
-	register double __f__ asm ("f"#regnum);			\
-	asm volatile ("ldf.fill %0=[%1]" :"=f"(__f__): "r"(x));	\
-})
-
-#define ia64_st4_rel_nta(m, val)					\
-({									\
-	asm volatile ("st4.rel.nta [%0] = %1\n\t" :: "r"(m), "r"(val));	\
-})
-
-#define ia64_stfs(x, regnum)						\
-({									\
-	register double __f__ asm ("f"#regnum);				\
-	asm volatile ("stfs [%0]=%1" :: "r"(x), "f"(__f__) : "memory");	\
-})
-
-#define ia64_stfd(x, regnum)						\
-({									\
-	register double __f__ asm ("f"#regnum);				\
-	asm volatile ("stfd [%0]=%1" :: "r"(x), "f"(__f__) : "memory");	\
-})
-
-#define ia64_stfe(x, regnum)						\
-({									\
-	register double __f__ asm ("f"#regnum);				\
-	asm volatile ("stfe [%0]=%1" :: "r"(x), "f"(__f__) : "memory");	\
-})
-
-#define ia64_stf8(x, regnum)						\
-({									\
-	register double __f__ asm ("f"#regnum);				\
-	asm volatile ("stf8 [%0]=%1" :: "r"(x), "f"(__f__) : "memory");	\
-})
-
-#define ia64_stf_spill(x, regnum)						\
-({										\
-	register double __f__ asm ("f"#regnum);					\
-	asm volatile ("stf.spill [%0]=%1" :: "r"(x), "f"(__f__) : "memory");	\
-})
-
-#define ia64_fetchadd4_acq(p, inc)						\
-({										\
-										\
-	__u64 ia64_intri_res;							\
-	asm volatile ("fetchadd4.acq %0=[%1],%2"				\
-				: "=r"(ia64_intri_res) : "r"(p), "i" (inc)	\
-				: "memory");					\
-										\
-	ia64_intri_res;								\
-})
-
-#define ia64_fetchadd4_rel(p, inc)						\
-({										\
-	__u64 ia64_intri_res;							\
-	asm volatile ("fetchadd4.rel %0=[%1],%2"				\
-				: "=r"(ia64_intri_res) : "r"(p), "i" (inc)	\
-				: "memory");					\
-										\
-	ia64_intri_res;								\
-})
-
-#define ia64_fetchadd8_acq(p, inc)						\
-({										\
-										\
-	__u64 ia64_intri_res;							\
-	asm volatile ("fetchadd8.acq %0=[%1],%2"				\
-				: "=r"(ia64_intri_res) : "r"(p), "i" (inc)	\
-				: "memory");					\
-										\
-	ia64_intri_res;								\
-})
-
-#define ia64_fetchadd8_rel(p, inc)						\
-({										\
-	__u64 ia64_intri_res;							\
-	asm volatile ("fetchadd8.rel %0=[%1],%2"				\
-				: "=r"(ia64_intri_res) : "r"(p), "i" (inc)	\
-				: "memory");					\
-										\
-	ia64_intri_res;								\
-})
-
-#define ia64_xchg1(ptr,x)							\
-({										\
-	__u64 ia64_intri_res;							\
-	asm volatile ("xchg1 %0=[%1],%2"					\
-		      : "=r" (ia64_intri_res) : "r" (ptr), "r" (x) : "memory");	\
-	ia64_intri_res;								\
-})
-
-#define ia64_xchg2(ptr,x)						\
-({									\
-	__u64 ia64_intri_res;						\
-	asm volatile ("xchg2 %0=[%1],%2" : "=r" (ia64_intri_res)	\
-		      : "r" (ptr), "r" (x) : "memory");			\
-	ia64_intri_res;							\
-})
-
-#define ia64_xchg4(ptr,x)						\
-({									\
-	__u64 ia64_intri_res;						\
-	asm volatile ("xchg4 %0=[%1],%2" : "=r" (ia64_intri_res)	\
-		      : "r" (ptr), "r" (x) : "memory");			\
-	ia64_intri_res;							\
-})
-
-#define ia64_xchg8(ptr,x)						\
-({									\
-	__u64 ia64_intri_res;						\
-	asm volatile ("xchg8 %0=[%1],%2" : "=r" (ia64_intri_res)	\
-		      : "r" (ptr), "r" (x) : "memory");			\
-	ia64_intri_res;							\
-})
-
-#define ia64_cmpxchg1_acq(ptr, new, old)						\
-({											\
-	__u64 ia64_intri_res;								\
-	asm volatile ("mov ar.ccv=%0;;" :: "rO"(old));					\
-	asm volatile ("cmpxchg1.acq %0=[%1],%2,ar.ccv":					\
-			      "=r"(ia64_intri_res) : "r"(ptr), "r"(new) : "memory");	\
-	ia64_intri_res;									\
-})
-
-#define ia64_cmpxchg1_rel(ptr, new, old)						\
-({											\
-	__u64 ia64_intri_res;								\
-	asm volatile ("mov ar.ccv=%0;;" :: "rO"(old));					\
-	asm volatile ("cmpxchg1.rel %0=[%1],%2,ar.ccv":					\
-			      "=r"(ia64_intri_res) : "r"(ptr), "r"(new) : "memory");	\
-	ia64_intri_res;									\
-})
-
-#define ia64_cmpxchg2_acq(ptr, new, old)						\
-({											\
-	__u64 ia64_intri_res;								\
-	asm volatile ("mov ar.ccv=%0;;" :: "rO"(old));					\
-	asm volatile ("cmpxchg2.acq %0=[%1],%2,ar.ccv":					\
-			      "=r"(ia64_intri_res) : "r"(ptr), "r"(new) : "memory");	\
-	ia64_intri_res;									\
-})
-
-#define ia64_cmpxchg2_rel(ptr, new, old)						\
-({											\
-	__u64 ia64_intri_res;								\
-	asm volatile ("mov ar.ccv=%0;;" :: "rO"(old));					\
-											\
-	asm volatile ("cmpxchg2.rel %0=[%1],%2,ar.ccv":					\
-			      "=r"(ia64_intri_res) : "r"(ptr), "r"(new) : "memory");	\
-	ia64_intri_res;									\
-})
-
-#define ia64_cmpxchg4_acq(ptr, new, old)						\
-({											\
-	__u64 ia64_intri_res;								\
-	asm volatile ("mov ar.ccv=%0;;" :: "rO"(old));					\
-	asm volatile ("cmpxchg4.acq %0=[%1],%2,ar.ccv":					\
-			      "=r"(ia64_intri_res) : "r"(ptr), "r"(new) : "memory");	\
-	ia64_intri_res;									\
-})
-
-#define ia64_cmpxchg4_rel(ptr, new, old)						\
-({											\
-	__u64 ia64_intri_res;								\
-	asm volatile ("mov ar.ccv=%0;;" :: "rO"(old));					\
-	asm volatile ("cmpxchg4.rel %0=[%1],%2,ar.ccv":					\
-			      "=r"(ia64_intri_res) : "r"(ptr), "r"(new) : "memory");	\
-	ia64_intri_res;									\
-})
-
-#define ia64_cmpxchg8_acq(ptr, new, old)						\
-({											\
-	__u64 ia64_intri_res;								\
-	asm volatile ("mov ar.ccv=%0;;" :: "rO"(old));					\
-	asm volatile ("cmpxchg8.acq %0=[%1],%2,ar.ccv":					\
-			      "=r"(ia64_intri_res) : "r"(ptr), "r"(new) : "memory");	\
-	ia64_intri_res;									\
-})
-
-#define ia64_cmpxchg8_rel(ptr, new, old)						\
-({											\
-	__u64 ia64_intri_res;								\
-	asm volatile ("mov ar.ccv=%0;;" :: "rO"(old));					\
-											\
-	asm volatile ("cmpxchg8.rel %0=[%1],%2,ar.ccv":					\
-			      "=r"(ia64_intri_res) : "r"(ptr), "r"(new) : "memory");	\
-	ia64_intri_res;									\
-})
-
-#define ia64_mf()	asm volatile ("mf" ::: "memory")
-#define ia64_mfa()	asm volatile ("mf.a" ::: "memory")
-
-#define ia64_invala() asm volatile ("invala" ::: "memory")
-
-#define ia64_thash(addr)							\
-({										\
-	unsigned long ia64_intri_res;						\
-	asm volatile ("thash %0=%1" : "=r"(ia64_intri_res) : "r" (addr));	\
-	ia64_intri_res;								\
-})
-
-#define ia64_srlz_i()	asm volatile (";; srlz.i ;;" ::: "memory")
-#define ia64_srlz_d()	asm volatile (";; srlz.d" ::: "memory");
-
-#ifdef HAVE_SERIALIZE_DIRECTIVE
-# define ia64_dv_serialize_data()		asm volatile (".serialize.data");
-# define ia64_dv_serialize_instruction()	asm volatile (".serialize.instruction");
-#else
-# define ia64_dv_serialize_data()
-# define ia64_dv_serialize_instruction()
-#endif
-
-#define ia64_nop(x)	asm volatile ("nop %0"::"i"(x));
-
-#define ia64_itci(addr)	asm volatile ("itc.i %0;;" :: "r"(addr) : "memory")
-
-#define ia64_itcd(addr)	asm volatile ("itc.d %0;;" :: "r"(addr) : "memory")
-
-
-#define ia64_itri(trnum, addr) asm volatile ("itr.i itr[%0]=%1"				\
-					     :: "r"(trnum), "r"(addr) : "memory")
-
-#define ia64_itrd(trnum, addr) asm volatile ("itr.d dtr[%0]=%1"				\
-					     :: "r"(trnum), "r"(addr) : "memory")
-
-#define ia64_tpa(addr)								\
-({										\
-	unsigned long ia64_pa;							\
-	asm volatile ("tpa %0 = %1" : "=r"(ia64_pa) : "r"(addr) : "memory");	\
-	ia64_pa;								\
-})
-
-#define __ia64_set_dbr(index, val)						\
-	asm volatile ("mov dbr[%0]=%1" :: "r"(index), "r"(val) : "memory")
-
-#define ia64_set_ibr(index, val)						\
-	asm volatile ("mov ibr[%0]=%1" :: "r"(index), "r"(val) : "memory")
-
-#define ia64_set_pkr(index, val)						\
-	asm volatile ("mov pkr[%0]=%1" :: "r"(index), "r"(val) : "memory")
-
-#define ia64_set_pmc(index, val)						\
-	asm volatile ("mov pmc[%0]=%1" :: "r"(index), "r"(val) : "memory")
-
-#define ia64_set_pmd(index, val)						\
-	asm volatile ("mov pmd[%0]=%1" :: "r"(index), "r"(val) : "memory")
-
-#define ia64_set_rr(index, val)							\
-	asm volatile ("mov rr[%0]=%1" :: "r"(index), "r"(val) : "memory");
-
-#define ia64_get_cpuid(index)								\
-({											\
-	unsigned long ia64_intri_res;							\
-	asm volatile ("mov %0=cpuid[%r1]" : "=r"(ia64_intri_res) : "rO"(index));	\
-	ia64_intri_res;									\
-})
-
-#define __ia64_get_dbr(index)							\
-({										\
-	unsigned long ia64_intri_res;						\
-	asm volatile ("mov %0=dbr[%1]" : "=r"(ia64_intri_res) : "r"(index));	\
-	ia64_intri_res;								\
-})
-
-#define ia64_get_ibr(index)							\
-({										\
-	unsigned long ia64_intri_res;						\
-	asm volatile ("mov %0=ibr[%1]" : "=r"(ia64_intri_res) : "r"(index));	\
-	ia64_intri_res;								\
-})
-
-#define ia64_get_pkr(index)							\
-({										\
-	unsigned long ia64_intri_res;						\
-	asm volatile ("mov %0=pkr[%1]" : "=r"(ia64_intri_res) : "r"(index));	\
-	ia64_intri_res;								\
-})
-
-#define ia64_get_pmc(index)							\
-({										\
-	unsigned long ia64_intri_res;						\
-	asm volatile ("mov %0=pmc[%1]" : "=r"(ia64_intri_res) : "r"(index));	\
-	ia64_intri_res;								\
-})
-
-
-#define ia64_get_pmd(index)							\
-({										\
-	unsigned long ia64_intri_res;						\
-	asm volatile ("mov %0=pmd[%1]" : "=r"(ia64_intri_res) : "r"(index));	\
-	ia64_intri_res;								\
-})
-
-#define ia64_get_rr(index)							\
-({										\
-	unsigned long ia64_intri_res;						\
-	asm volatile ("mov %0=rr[%1]" : "=r"(ia64_intri_res) : "r" (index));	\
-	ia64_intri_res;								\
-})
-
-#define ia64_fc(addr)	asm volatile ("fc %0" :: "r"(addr) : "memory")
-
-
-#define ia64_sync_i()	asm volatile (";; sync.i" ::: "memory")
-
-#define ia64_ssm(mask)	asm volatile ("ssm %0":: "i"((mask)) : "memory")
-#define ia64_rsm(mask)	asm volatile ("rsm %0":: "i"((mask)) : "memory")
-#define ia64_sum(mask)	asm volatile ("sum %0":: "i"((mask)) : "memory")
-#define ia64_rum(mask)	asm volatile ("rum %0":: "i"((mask)) : "memory")
-
-#define ia64_ptce(addr)	asm volatile ("ptc.e %0" :: "r"(addr))
-
-#define ia64_ptcga(addr, size)							\
-do {										\
-	asm volatile ("ptc.ga %0,%1" :: "r"(addr), "r"(size) : "memory");	\
-	ia64_dv_serialize_data();						\
-} while (0)
-
-#define ia64_ptcl(addr, size)							\
-do {										\
-	asm volatile ("ptc.l %0,%1" :: "r"(addr), "r"(size) : "memory");	\
-	ia64_dv_serialize_data();						\
-} while (0)
-
-#define ia64_ptri(addr, size)						\
-	asm volatile ("ptr.i %0,%1" :: "r"(addr), "r"(size) : "memory")
-
-#define ia64_ptrd(addr, size)						\
-	asm volatile ("ptr.d %0,%1" :: "r"(addr), "r"(size) : "memory")
-
-#define ia64_ttag(addr)							\
-({									  \
-	__u64 ia64_intri_res;						   \
-	asm volatile ("ttag %0=%1" : "=r"(ia64_intri_res) : "r" (addr));   \
-	ia64_intri_res;							 \
-})
-
-
-/* Values for lfhint in ia64_lfetch and ia64_lfetch_fault */
-
-#define ia64_lfhint_none   0
-#define ia64_lfhint_nt1    1
-#define ia64_lfhint_nt2    2
-#define ia64_lfhint_nta    3
-
-#define ia64_lfetch(lfhint, y)					\
-({								\
-        switch (lfhint) {					\
-        case ia64_lfhint_none:					\
-                asm volatile ("lfetch [%0]" : : "r"(y));	\
-                break;						\
-        case ia64_lfhint_nt1:					\
-                asm volatile ("lfetch.nt1 [%0]" : : "r"(y));	\
-                break;						\
-        case ia64_lfhint_nt2:					\
-                asm volatile ("lfetch.nt2 [%0]" : : "r"(y));	\
-                break;						\
-        case ia64_lfhint_nta:					\
-                asm volatile ("lfetch.nta [%0]" : : "r"(y));	\
-                break;						\
-        }							\
-})
-
-#define ia64_lfetch_excl(lfhint, y)					\
-({									\
-        switch (lfhint) {						\
-        case ia64_lfhint_none:						\
-                asm volatile ("lfetch.excl [%0]" :: "r"(y));		\
-                break;							\
-        case ia64_lfhint_nt1:						\
-                asm volatile ("lfetch.excl.nt1 [%0]" :: "r"(y));	\
-                break;							\
-        case ia64_lfhint_nt2:						\
-                asm volatile ("lfetch.excl.nt2 [%0]" :: "r"(y));	\
-                break;							\
-        case ia64_lfhint_nta:						\
-                asm volatile ("lfetch.excl.nta [%0]" :: "r"(y));	\
-                break;							\
-        }								\
-})
-
-#define ia64_lfetch_fault(lfhint, y)					\
-({									\
-        switch (lfhint) {						\
-        case ia64_lfhint_none:						\
-                asm volatile ("lfetch.fault [%0]" : : "r"(y));		\
-                break;							\
-        case ia64_lfhint_nt1:						\
-                asm volatile ("lfetch.fault.nt1 [%0]" : : "r"(y));	\
-                break;							\
-        case ia64_lfhint_nt2:						\
-                asm volatile ("lfetch.fault.nt2 [%0]" : : "r"(y));	\
-                break;							\
-        case ia64_lfhint_nta:						\
-                asm volatile ("lfetch.fault.nta [%0]" : : "r"(y));	\
-                break;							\
-        }								\
-})
-
-#define ia64_lfetch_fault_excl(lfhint, y)				\
-({									\
-        switch (lfhint) {						\
-        case ia64_lfhint_none:						\
-                asm volatile ("lfetch.fault.excl [%0]" :: "r"(y));	\
-                break;							\
-        case ia64_lfhint_nt1:						\
-                asm volatile ("lfetch.fault.excl.nt1 [%0]" :: "r"(y));	\
-                break;							\
-        case ia64_lfhint_nt2:						\
-                asm volatile ("lfetch.fault.excl.nt2 [%0]" :: "r"(y));	\
-                break;							\
-        case ia64_lfhint_nta:						\
-                asm volatile ("lfetch.fault.excl.nta [%0]" :: "r"(y));	\
-                break;							\
-        }								\
-})
-
-#define ia64_intrin_local_irq_restore(x)			\
-do {								\
-	asm volatile (";;   cmp.ne p6,p7=%0,r0;;"		\
-		      "(p6) ssm psr.i;"				\
-		      "(p7) rsm psr.i;;"			\
-		      "(p6) srlz.d"				\
-		      :: "r"((x)) : "p6", "p7", "memory");	\
-} while (0)
-
-#endif /* _UAPI_ASM_IA64_GCC_INTRIN_H */
diff --git a/arch/ia64/include/uapi/asm/ia64regs.h b/arch/ia64/include/uapi/asm/ia64regs.h
deleted file mode 100644
index d7d10cec8b9f..000000000000
--- a/arch/ia64/include/uapi/asm/ia64regs.h
+++ /dev/null
@@ -1,101 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
-/*
- * Copyright (C) 2002,2003 Intel Corp.
- *      Jun Nakajima <jun.nakajima@intel.com>
- *      Suresh Siddha <suresh.b.siddha@intel.com>
- */
-
-#ifndef _ASM_IA64_IA64REGS_H
-#define _ASM_IA64_IA64REGS_H
-
-/*
- * Register Names for getreg() and setreg().
- *
- * The "magic" numbers happen to match the values used by the Intel compiler's
- * getreg()/setreg() intrinsics.
- */
-
-/* Special Registers */
-
-#define _IA64_REG_IP		1016	/* getreg only */
-#define _IA64_REG_PSR		1019
-#define _IA64_REG_PSR_L		1019
-
-/* General Integer Registers */
-
-#define _IA64_REG_GP		1025	/* R1 */
-#define _IA64_REG_R8		1032	/* R8 */
-#define _IA64_REG_R9		1033	/* R9 */
-#define _IA64_REG_SP		1036	/* R12 */
-#define _IA64_REG_TP		1037	/* R13 */
-
-/* Application Registers */
-
-#define _IA64_REG_AR_KR0	3072
-#define _IA64_REG_AR_KR1	3073
-#define _IA64_REG_AR_KR2	3074
-#define _IA64_REG_AR_KR3	3075
-#define _IA64_REG_AR_KR4	3076
-#define _IA64_REG_AR_KR5	3077
-#define _IA64_REG_AR_KR6	3078
-#define _IA64_REG_AR_KR7	3079
-#define _IA64_REG_AR_RSC	3088
-#define _IA64_REG_AR_BSP	3089
-#define _IA64_REG_AR_BSPSTORE	3090
-#define _IA64_REG_AR_RNAT	3091
-#define _IA64_REG_AR_FCR	3093
-#define _IA64_REG_AR_EFLAG	3096
-#define _IA64_REG_AR_CSD	3097
-#define _IA64_REG_AR_SSD	3098
-#define _IA64_REG_AR_CFLAG	3099
-#define _IA64_REG_AR_FSR	3100
-#define _IA64_REG_AR_FIR	3101
-#define _IA64_REG_AR_FDR	3102
-#define _IA64_REG_AR_CCV	3104
-#define _IA64_REG_AR_UNAT	3108
-#define _IA64_REG_AR_FPSR	3112
-#define _IA64_REG_AR_ITC	3116
-#define _IA64_REG_AR_PFS	3136
-#define _IA64_REG_AR_LC		3137
-#define _IA64_REG_AR_EC		3138
-
-/* Control Registers */
-
-#define _IA64_REG_CR_DCR	4096
-#define _IA64_REG_CR_ITM	4097
-#define _IA64_REG_CR_IVA	4098
-#define _IA64_REG_CR_PTA	4104
-#define _IA64_REG_CR_IPSR	4112
-#define _IA64_REG_CR_ISR	4113
-#define _IA64_REG_CR_IIP	4115
-#define _IA64_REG_CR_IFA	4116
-#define _IA64_REG_CR_ITIR	4117
-#define _IA64_REG_CR_IIPA	4118
-#define _IA64_REG_CR_IFS	4119
-#define _IA64_REG_CR_IIM	4120
-#define _IA64_REG_CR_IHA	4121
-#define _IA64_REG_CR_LID	4160
-#define _IA64_REG_CR_IVR	4161	/* getreg only */
-#define _IA64_REG_CR_TPR	4162
-#define _IA64_REG_CR_EOI	4163
-#define _IA64_REG_CR_IRR0	4164	/* getreg only */
-#define _IA64_REG_CR_IRR1	4165	/* getreg only */
-#define _IA64_REG_CR_IRR2	4166	/* getreg only */
-#define _IA64_REG_CR_IRR3	4167	/* getreg only */
-#define _IA64_REG_CR_ITV	4168
-#define _IA64_REG_CR_PMV	4169
-#define _IA64_REG_CR_CMCV	4170
-#define _IA64_REG_CR_LRR0	4176
-#define _IA64_REG_CR_LRR1	4177
-
-/* Indirect Registers for getindreg() and setindreg() */
-
-#define _IA64_REG_INDR_CPUID	9000	/* getindreg only */
-#define _IA64_REG_INDR_DBR	9001
-#define _IA64_REG_INDR_IBR	9002
-#define _IA64_REG_INDR_PKR	9003
-#define _IA64_REG_INDR_PMC	9004
-#define _IA64_REG_INDR_PMD	9005
-#define _IA64_REG_INDR_RR	9006
-
-#endif /* _ASM_IA64_IA64REGS_H */
diff --git a/arch/ia64/include/uapi/asm/intrinsics.h b/arch/ia64/include/uapi/asm/intrinsics.h
deleted file mode 100644
index 63f27c4ec739..000000000000
--- a/arch/ia64/include/uapi/asm/intrinsics.h
+++ /dev/null
@@ -1,82 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
-/*
- * Compiler-dependent intrinsics.
- *
- * Copyright (C) 2002-2003 Hewlett-Packard Co
- *	David Mosberger-Tang <davidm@hpl.hp.com>
- */
-#ifndef _UAPI_ASM_IA64_INTRINSICS_H
-#define _UAPI_ASM_IA64_INTRINSICS_H
-
-
-#ifndef __ASSEMBLY__
-
-#include <linux/types.h>
-/* include compiler specific intrinsics */
-#include <asm/ia64regs.h>
-#include <asm/gcc_intrin.h>
-#include <asm/cmpxchg.h>
-
-#define ia64_set_rr0_to_rr4(val0, val1, val2, val3, val4)		\
-do {									\
-	ia64_set_rr(0x0000000000000000UL, (val0));			\
-	ia64_set_rr(0x2000000000000000UL, (val1));			\
-	ia64_set_rr(0x4000000000000000UL, (val2));			\
-	ia64_set_rr(0x6000000000000000UL, (val3));			\
-	ia64_set_rr(0x8000000000000000UL, (val4));			\
-} while (0)
-
-/*
- * Force an unresolved reference if someone tries to use
- * ia64_fetch_and_add() with a bad value.
- */
-extern unsigned long __bad_size_for_ia64_fetch_and_add (void);
-extern unsigned long __bad_increment_for_ia64_fetch_and_add (void);
-
-#define IA64_FETCHADD(tmp,v,n,sz,sem)						\
-({										\
-	switch (sz) {								\
-	      case 4:								\
-	        tmp = ia64_fetchadd4_##sem((unsigned int *) v, n);		\
-		break;								\
-										\
-	      case 8:								\
-	        tmp = ia64_fetchadd8_##sem((unsigned long *) v, n);		\
-		break;								\
-										\
-	      default:								\
-		__bad_size_for_ia64_fetch_and_add();				\
-	}									\
-})
-
-#define ia64_fetchadd(i,v,sem)								\
-({											\
-	__u64 _tmp;									\
-	volatile __typeof__(*(v)) *_v = (v);						\
-	/* Can't use a switch () here: gcc isn't always smart enough for that... */	\
-	if ((i) == -16)									\
-		IA64_FETCHADD(_tmp, _v, -16, sizeof(*(v)), sem);			\
-	else if ((i) == -8)								\
-		IA64_FETCHADD(_tmp, _v, -8, sizeof(*(v)), sem);				\
-	else if ((i) == -4)								\
-		IA64_FETCHADD(_tmp, _v, -4, sizeof(*(v)), sem);				\
-	else if ((i) == -1)								\
-		IA64_FETCHADD(_tmp, _v, -1, sizeof(*(v)), sem);				\
-	else if ((i) == 1)								\
-		IA64_FETCHADD(_tmp, _v, 1, sizeof(*(v)), sem);				\
-	else if ((i) == 4)								\
-		IA64_FETCHADD(_tmp, _v, 4, sizeof(*(v)), sem);				\
-	else if ((i) == 8)								\
-		IA64_FETCHADD(_tmp, _v, 8, sizeof(*(v)), sem);				\
-	else if ((i) == 16)								\
-		IA64_FETCHADD(_tmp, _v, 16, sizeof(*(v)), sem);				\
-	else										\
-		_tmp = __bad_increment_for_ia64_fetch_and_add();			\
-	(__typeof__(*(v))) (_tmp);	/* return old value */				\
-})
-
-#define ia64_fetch_and_add(i,v)	(ia64_fetchadd(i, v, rel) + (i)) /* return new value */
-
-#endif
-
-#endif /* _UAPI_ASM_IA64_INTRINSICS_H */
diff --git a/arch/ia64/include/uapi/asm/mman.h b/arch/ia64/include/uapi/asm/mman.h
deleted file mode 100644
index ce0cc3d7509e..000000000000
--- a/arch/ia64/include/uapi/asm/mman.h
+++ /dev/null
@@ -1,17 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
-/*
- * Based on <asm-i386/mman.h>.
- *
- * Modified 1998-2000, 2002
- *	David Mosberger-Tang <davidm@hpl.hp.com>, Hewlett-Packard Co
- */
-#ifndef _UAPI_ASM_IA64_MMAN_H
-#define _UAPI_ASM_IA64_MMAN_H
-
-
-#include <asm-generic/mman.h>
-
-#define MAP_GROWSUP	0x0200		/* register stack-like segment */
-
-
-#endif /* _UAPI_ASM_IA64_MMAN_H */
diff --git a/arch/ia64/include/uapi/asm/param.h b/arch/ia64/include/uapi/asm/param.h
deleted file mode 100644
index 123ab45940b4..000000000000
--- a/arch/ia64/include/uapi/asm/param.h
+++ /dev/null
@@ -1,30 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
-/*
- * Fundamental kernel parameters.
- *
- * Based on <asm-i386/param.h>.
- *
- * Modified 1998, 1999, 2002-2003
- *	David Mosberger-Tang <davidm@hpl.hp.com>, Hewlett-Packard Co
- */
-#ifndef _UAPI_ASM_IA64_PARAM_H
-#define _UAPI_ASM_IA64_PARAM_H
-
-
-#define EXEC_PAGESIZE	65536
-
-#ifndef NOGROUP
-# define NOGROUP	(-1)
-#endif
-
-#define MAXHOSTNAMELEN	64	/* max length of hostname */
-
-#ifndef __KERNEL__
-   /*
-    * Technically, this is wrong, but some old apps still refer to it.  The proper way to
-    * get the HZ value is via sysconf(_SC_CLK_TCK).
-    */
-# define HZ 1024
-#endif
-
-#endif /* _UAPI_ASM_IA64_PARAM_H */
diff --git a/arch/ia64/include/uapi/asm/posix_types.h b/arch/ia64/include/uapi/asm/posix_types.h
deleted file mode 100644
index bded40f7defe..000000000000
--- a/arch/ia64/include/uapi/asm/posix_types.h
+++ /dev/null
@@ -1,9 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
-#ifndef _ASM_IA64_POSIX_TYPES_H
-#define _ASM_IA64_POSIX_TYPES_H
-
-typedef unsigned long	__kernel_sigset_t;	/* at least 32 bits */
-
-#include <asm-generic/posix_types.h>
-
-#endif /* _ASM_IA64_POSIX_TYPES_H */
diff --git a/arch/ia64/include/uapi/asm/ptrace.h b/arch/ia64/include/uapi/asm/ptrace.h
deleted file mode 100644
index f52655b44414..000000000000
--- a/arch/ia64/include/uapi/asm/ptrace.h
+++ /dev/null
@@ -1,248 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
-/*
- * Copyright (C) 1998-2004 Hewlett-Packard Co
- *	David Mosberger-Tang <davidm@hpl.hp.com>
- *	Stephane Eranian <eranian@hpl.hp.com>
- * Copyright (C) 2003 Intel Co
- *	Suresh Siddha <suresh.b.siddha@intel.com>
- *	Fenghua Yu <fenghua.yu@intel.com>
- *	Arun Sharma <arun.sharma@intel.com>
- *
- * 12/07/98	S. Eranian	added pt_regs & switch_stack
- * 12/21/98	D. Mosberger	updated to match latest code
- *  6/17/99	D. Mosberger	added second unat member to "struct switch_stack"
- *
- */
-#ifndef _UAPI_ASM_IA64_PTRACE_H
-#define _UAPI_ASM_IA64_PTRACE_H
-
-/*
- * When a user process is blocked, its state looks as follows:
- *
- *            +----------------------+	-------	IA64_STK_OFFSET
- *     	      |			     |	 ^
- *            | struct pt_regs       |	 |
- *	      |			     |	 |
- *            +----------------------+	 |
- *	      |			     |	 |
- *     	      |	   memory stack	     |	 |
- *	      |	(growing downwards)  |	 |
- *	      //.....................//	 |
- *					 |
- *	      //.....................//	 |
- *	      |			     |	 |
- *            +----------------------+	 |
- *            | struct switch_stack  |	 |
- *	      |			     |	 |
- *	      +----------------------+	 |
- *	      |			     |	 |
- *	      //.....................//	 |
- *					 |
- *	      //.....................//	 |
- *	      |			     |	 |
- *	      |	 register stack	     |	 |
- *	      |	(growing upwards)    |	 |
- *            |			     |	 |
- *	      +----------------------+	 |  ---	IA64_RBS_OFFSET
- *            |  struct thread_info  |	 |  ^
- *	      +----------------------+	 |  |
- *	      |			     |	 |  |
- *            |  struct task_struct  |	 |  |
- * current -> |			     |   |  |
- *	      +----------------------+ -------
- *
- * Note that ar.ec is not saved explicitly in pt_reg or switch_stack.
- * This is because ar.ec is saved as part of ar.pfs.
- */
-
-
-#include <asm/fpu.h>
-
-
-#ifndef __ASSEMBLY__
-
-/*
- * This struct defines the way the registers are saved on system
- * calls.
- *
- * We don't save all floating point register because the kernel
- * is compiled to use only a very small subset, so the other are
- * untouched.
- *
- * THIS STRUCTURE MUST BE A MULTIPLE 16-BYTE IN SIZE
- * (because the memory stack pointer MUST ALWAYS be aligned this way)
- *
- */
-struct pt_regs {
-	/* The following registers are saved by SAVE_MIN: */
-	unsigned long b6;		/* scratch */
-	unsigned long b7;		/* scratch */
-
-	unsigned long ar_csd;           /* used by cmp8xchg16 (scratch) */
-	unsigned long ar_ssd;           /* reserved for future use (scratch) */
-
-	unsigned long r8;		/* scratch (return value register 0) */
-	unsigned long r9;		/* scratch (return value register 1) */
-	unsigned long r10;		/* scratch (return value register 2) */
-	unsigned long r11;		/* scratch (return value register 3) */
-
-	unsigned long cr_ipsr;		/* interrupted task's psr */
-	unsigned long cr_iip;		/* interrupted task's instruction pointer */
-	/*
-	 * interrupted task's function state; if bit 63 is cleared, it
-	 * contains syscall's ar.pfs.pfm:
-	 */
-	unsigned long cr_ifs;
-
-	unsigned long ar_unat;		/* interrupted task's NaT register (preserved) */
-	unsigned long ar_pfs;		/* prev function state  */
-	unsigned long ar_rsc;		/* RSE configuration */
-	/* The following two are valid only if cr_ipsr.cpl > 0 || ti->flags & _TIF_MCA_INIT */
-	unsigned long ar_rnat;		/* RSE NaT */
-	unsigned long ar_bspstore;	/* RSE bspstore */
-
-	unsigned long pr;		/* 64 predicate registers (1 bit each) */
-	unsigned long b0;		/* return pointer (bp) */
-	unsigned long loadrs;		/* size of dirty partition << 16 */
-
-	unsigned long r1;		/* the gp pointer */
-	unsigned long r12;		/* interrupted task's memory stack pointer */
-	unsigned long r13;		/* thread pointer */
-
-	unsigned long ar_fpsr;		/* floating point status (preserved) */
-	unsigned long r15;		/* scratch */
-
-	/* The remaining registers are NOT saved for system calls.  */
-
-	unsigned long r14;		/* scratch */
-	unsigned long r2;		/* scratch */
-	unsigned long r3;		/* scratch */
-
-	/* The following registers are saved by SAVE_REST: */
-	unsigned long r16;		/* scratch */
-	unsigned long r17;		/* scratch */
-	unsigned long r18;		/* scratch */
-	unsigned long r19;		/* scratch */
-	unsigned long r20;		/* scratch */
-	unsigned long r21;		/* scratch */
-	unsigned long r22;		/* scratch */
-	unsigned long r23;		/* scratch */
-	unsigned long r24;		/* scratch */
-	unsigned long r25;		/* scratch */
-	unsigned long r26;		/* scratch */
-	unsigned long r27;		/* scratch */
-	unsigned long r28;		/* scratch */
-	unsigned long r29;		/* scratch */
-	unsigned long r30;		/* scratch */
-	unsigned long r31;		/* scratch */
-
-	unsigned long ar_ccv;		/* compare/exchange value (scratch) */
-
-	/*
-	 * Floating point registers that the kernel considers scratch:
-	 */
-	struct ia64_fpreg f6;		/* scratch */
-	struct ia64_fpreg f7;		/* scratch */
-	struct ia64_fpreg f8;		/* scratch */
-	struct ia64_fpreg f9;		/* scratch */
-	struct ia64_fpreg f10;		/* scratch */
-	struct ia64_fpreg f11;		/* scratch */
-};
-
-/*
- * This structure contains the addition registers that need to
- * preserved across a context switch.  This generally consists of
- * "preserved" registers.
- */
-struct switch_stack {
-	unsigned long caller_unat;	/* user NaT collection register (preserved) */
-	unsigned long ar_fpsr;		/* floating-point status register */
-
-	struct ia64_fpreg f2;		/* preserved */
-	struct ia64_fpreg f3;		/* preserved */
-	struct ia64_fpreg f4;		/* preserved */
-	struct ia64_fpreg f5;		/* preserved */
-
-	struct ia64_fpreg f12;		/* scratch, but untouched by kernel */
-	struct ia64_fpreg f13;		/* scratch, but untouched by kernel */
-	struct ia64_fpreg f14;		/* scratch, but untouched by kernel */
-	struct ia64_fpreg f15;		/* scratch, but untouched by kernel */
-	struct ia64_fpreg f16;		/* preserved */
-	struct ia64_fpreg f17;		/* preserved */
-	struct ia64_fpreg f18;		/* preserved */
-	struct ia64_fpreg f19;		/* preserved */
-	struct ia64_fpreg f20;		/* preserved */
-	struct ia64_fpreg f21;		/* preserved */
-	struct ia64_fpreg f22;		/* preserved */
-	struct ia64_fpreg f23;		/* preserved */
-	struct ia64_fpreg f24;		/* preserved */
-	struct ia64_fpreg f25;		/* preserved */
-	struct ia64_fpreg f26;		/* preserved */
-	struct ia64_fpreg f27;		/* preserved */
-	struct ia64_fpreg f28;		/* preserved */
-	struct ia64_fpreg f29;		/* preserved */
-	struct ia64_fpreg f30;		/* preserved */
-	struct ia64_fpreg f31;		/* preserved */
-
-	unsigned long r4;		/* preserved */
-	unsigned long r5;		/* preserved */
-	unsigned long r6;		/* preserved */
-	unsigned long r7;		/* preserved */
-
-	unsigned long b0;		/* so we can force a direct return in copy_thread */
-	unsigned long b1;
-	unsigned long b2;
-	unsigned long b3;
-	unsigned long b4;
-	unsigned long b5;
-
-	unsigned long ar_pfs;		/* previous function state */
-	unsigned long ar_lc;		/* loop counter (preserved) */
-	unsigned long ar_unat;		/* NaT bits for r4-r7 */
-	unsigned long ar_rnat;		/* RSE NaT collection register */
-	unsigned long ar_bspstore;	/* RSE dirty base (preserved) */
-	unsigned long pr;		/* 64 predicate registers (1 bit each) */
-};
-
-
-/* pt_all_user_regs is used for PTRACE_GETREGS PTRACE_SETREGS */
-struct pt_all_user_regs {
-	unsigned long nat;
-	unsigned long cr_iip;
-	unsigned long cfm;
-	unsigned long cr_ipsr;
-	unsigned long pr;
-
-	unsigned long gr[32];
-	unsigned long br[8];
-	unsigned long ar[128];
-	struct ia64_fpreg fr[128];
-};
-
-#endif /* !__ASSEMBLY__ */
-
-/* indices to application-registers array in pt_all_user_regs */
-#define PT_AUR_RSC	16
-#define PT_AUR_BSP	17
-#define PT_AUR_BSPSTORE	18
-#define PT_AUR_RNAT	19
-#define PT_AUR_CCV	32
-#define PT_AUR_UNAT	36
-#define PT_AUR_FPSR	40
-#define PT_AUR_PFS	64
-#define PT_AUR_LC	65
-#define PT_AUR_EC	66
-
-/*
- * The numbers chosen here are somewhat arbitrary but absolutely MUST
- * not overlap with any of the number assigned in <linux/ptrace.h>.
- */
-#define PTRACE_SINGLEBLOCK	12	/* resume execution until next branch */
-#define PTRACE_OLD_GETSIGINFO	13	/* (replaced by PTRACE_GETSIGINFO in <linux/ptrace.h>)  */
-#define PTRACE_OLD_SETSIGINFO	14	/* (replaced by PTRACE_SETSIGINFO in <linux/ptrace.h>)  */
-#define PTRACE_GETREGS		18	/* get all registers (pt_all_user_regs) in one shot */
-#define PTRACE_SETREGS		19	/* set all registers (pt_all_user_regs) in one shot */
-
-#define PTRACE_OLDSETOPTIONS	21
-
-#endif /* _UAPI_ASM_IA64_PTRACE_H */
diff --git a/arch/ia64/include/uapi/asm/ptrace_offsets.h b/arch/ia64/include/uapi/asm/ptrace_offsets.h
deleted file mode 100644
index 2847c18139ef..000000000000
--- a/arch/ia64/include/uapi/asm/ptrace_offsets.h
+++ /dev/null
@@ -1,269 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
-#ifndef _ASM_IA64_PTRACE_OFFSETS_H
-#define _ASM_IA64_PTRACE_OFFSETS_H
-
-/*
- * Copyright (C) 1999, 2003 Hewlett-Packard Co
- *	David Mosberger-Tang <davidm@hpl.hp.com>
- */
-/*
- * The "uarea" that can be accessed via PEEKUSER and POKEUSER is a
- * virtual structure that would have the following definition:
- *
- *	struct uarea {
- *		struct ia64_fpreg fph[96];		// f32-f127
- *		unsigned long nat_bits;
- *		unsigned long empty1;
- *		struct ia64_fpreg f2;			// f2-f5
- *			:
- *		struct ia64_fpreg f5;
- *		struct ia64_fpreg f10;			// f10-f31
- *			:
- *		struct ia64_fpreg f31;
- *		unsigned long r4;			// r4-r7
- *			:
- *		unsigned long r7;
- *		unsigned long b1;			// b1-b5
- *			:
- *		unsigned long b5;
- *		unsigned long ar_ec;
- *		unsigned long ar_lc;
- *		unsigned long empty2[5];
- *		unsigned long cr_ipsr;
- *		unsigned long cr_iip;
- *		unsigned long cfm;
- *		unsigned long ar_unat;
- *		unsigned long ar_pfs;
- *		unsigned long ar_rsc;
- *		unsigned long ar_rnat;
- *		unsigned long ar_bspstore;
- *		unsigned long pr;
- *		unsigned long b6;
- *		unsigned long ar_bsp;
- *		unsigned long r1;
- *		unsigned long r2;
- *		unsigned long r3;
- *		unsigned long r12;
- *		unsigned long r13;
- *		unsigned long r14;
- *		unsigned long r15;
- *		unsigned long r8;
- *		unsigned long r9;
- *		unsigned long r10;
- *		unsigned long r11;
- *		unsigned long r16;
- *			:
- *		unsigned long r31;
- *		unsigned long ar_ccv;
- *		unsigned long ar_fpsr;
- *		unsigned long b0;
- *		unsigned long b7;
- *		unsigned long f6;
- *		unsigned long f7;
- *		unsigned long f8;
- *		unsigned long f9;
- *		unsigned long ar_csd;
- *		unsigned long ar_ssd;
- *		unsigned long rsvd1[710];
- *		unsigned long dbr[8];
- *		unsigned long rsvd2[504];
- *		unsigned long ibr[8];
- *		unsigned long rsvd3[504];
- *		unsigned long pmd[4];
- *	}
- */
-
-/* fph: */
-#define PT_F32			0x0000
-#define PT_F33			0x0010
-#define PT_F34			0x0020
-#define PT_F35			0x0030
-#define PT_F36			0x0040
-#define PT_F37			0x0050
-#define PT_F38			0x0060
-#define PT_F39			0x0070
-#define PT_F40			0x0080
-#define PT_F41			0x0090
-#define PT_F42			0x00a0
-#define PT_F43			0x00b0
-#define PT_F44			0x00c0
-#define PT_F45			0x00d0
-#define PT_F46			0x00e0
-#define PT_F47			0x00f0
-#define PT_F48			0x0100
-#define PT_F49			0x0110
-#define PT_F50			0x0120
-#define PT_F51			0x0130
-#define PT_F52			0x0140
-#define PT_F53			0x0150
-#define PT_F54			0x0160
-#define PT_F55			0x0170
-#define PT_F56			0x0180
-#define PT_F57			0x0190
-#define PT_F58			0x01a0
-#define PT_F59			0x01b0
-#define PT_F60			0x01c0
-#define PT_F61			0x01d0
-#define PT_F62			0x01e0
-#define PT_F63			0x01f0
-#define PT_F64			0x0200
-#define PT_F65			0x0210
-#define PT_F66			0x0220
-#define PT_F67			0x0230
-#define PT_F68			0x0240
-#define PT_F69			0x0250
-#define PT_F70			0x0260
-#define PT_F71			0x0270
-#define PT_F72			0x0280
-#define PT_F73			0x0290
-#define PT_F74			0x02a0
-#define PT_F75			0x02b0
-#define PT_F76			0x02c0
-#define PT_F77			0x02d0
-#define PT_F78			0x02e0
-#define PT_F79			0x02f0
-#define PT_F80			0x0300
-#define PT_F81			0x0310
-#define PT_F82			0x0320
-#define PT_F83			0x0330
-#define PT_F84			0x0340
-#define PT_F85			0x0350
-#define PT_F86			0x0360
-#define PT_F87			0x0370
-#define PT_F88			0x0380
-#define PT_F89			0x0390
-#define PT_F90			0x03a0
-#define PT_F91			0x03b0
-#define PT_F92			0x03c0
-#define PT_F93			0x03d0
-#define PT_F94			0x03e0
-#define PT_F95			0x03f0
-#define PT_F96			0x0400
-#define PT_F97			0x0410
-#define PT_F98			0x0420
-#define PT_F99			0x0430
-#define PT_F100			0x0440
-#define PT_F101			0x0450
-#define PT_F102			0x0460
-#define PT_F103			0x0470
-#define PT_F104			0x0480
-#define PT_F105			0x0490
-#define PT_F106			0x04a0
-#define PT_F107			0x04b0
-#define PT_F108			0x04c0
-#define PT_F109			0x04d0
-#define PT_F110			0x04e0
-#define PT_F111			0x04f0
-#define PT_F112			0x0500
-#define PT_F113			0x0510
-#define PT_F114			0x0520
-#define PT_F115			0x0530
-#define PT_F116			0x0540
-#define PT_F117			0x0550
-#define PT_F118			0x0560
-#define PT_F119			0x0570
-#define PT_F120			0x0580
-#define PT_F121			0x0590
-#define PT_F122			0x05a0
-#define PT_F123			0x05b0
-#define PT_F124			0x05c0
-#define PT_F125			0x05d0
-#define PT_F126			0x05e0
-#define PT_F127			0x05f0
-
-#define PT_NAT_BITS		0x0600
-
-#define PT_F2			0x0610
-#define PT_F3			0x0620
-#define PT_F4			0x0630
-#define PT_F5			0x0640
-#define PT_F10			0x0650
-#define PT_F11			0x0660
-#define PT_F12			0x0670
-#define PT_F13			0x0680
-#define PT_F14			0x0690
-#define PT_F15			0x06a0
-#define PT_F16			0x06b0
-#define PT_F17			0x06c0
-#define PT_F18			0x06d0
-#define PT_F19			0x06e0
-#define PT_F20			0x06f0
-#define PT_F21			0x0700
-#define PT_F22			0x0710
-#define PT_F23			0x0720
-#define PT_F24			0x0730
-#define PT_F25			0x0740
-#define PT_F26			0x0750
-#define PT_F27			0x0760
-#define PT_F28			0x0770
-#define PT_F29			0x0780
-#define PT_F30			0x0790
-#define PT_F31			0x07a0
-#define PT_R4			0x07b0
-#define PT_R5			0x07b8
-#define PT_R6			0x07c0
-#define PT_R7			0x07c8
-
-#define PT_B1			0x07d8
-#define PT_B2			0x07e0
-#define PT_B3			0x07e8
-#define PT_B4			0x07f0
-#define PT_B5			0x07f8
-
-#define PT_AR_EC		0x0800
-#define PT_AR_LC		0x0808
-
-#define PT_CR_IPSR		0x0830
-#define PT_CR_IIP		0x0838
-#define PT_CFM			0x0840
-#define PT_AR_UNAT		0x0848
-#define PT_AR_PFS		0x0850
-#define PT_AR_RSC		0x0858
-#define PT_AR_RNAT		0x0860
-#define PT_AR_BSPSTORE		0x0868
-#define PT_PR			0x0870
-#define PT_B6			0x0878
-#define PT_AR_BSP		0x0880	/* note: this points to the *end* of the backing store! */
-#define PT_R1			0x0888
-#define PT_R2			0x0890
-#define PT_R3			0x0898
-#define PT_R12			0x08a0
-#define PT_R13			0x08a8
-#define PT_R14			0x08b0
-#define PT_R15			0x08b8
-#define PT_R8 			0x08c0
-#define PT_R9			0x08c8
-#define PT_R10			0x08d0
-#define PT_R11			0x08d8
-#define PT_R16			0x08e0
-#define PT_R17			0x08e8
-#define PT_R18			0x08f0
-#define PT_R19			0x08f8
-#define PT_R20			0x0900
-#define PT_R21			0x0908
-#define PT_R22			0x0910
-#define PT_R23			0x0918
-#define PT_R24			0x0920
-#define PT_R25			0x0928
-#define PT_R26			0x0930
-#define PT_R27			0x0938
-#define PT_R28			0x0940
-#define PT_R29			0x0948
-#define PT_R30			0x0950
-#define PT_R31			0x0958
-#define PT_AR_CCV		0x0960
-#define PT_AR_FPSR		0x0968
-#define PT_B0			0x0970
-#define PT_B7			0x0978
-#define PT_F6			0x0980
-#define PT_F7			0x0990
-#define PT_F8			0x09a0
-#define PT_F9			0x09b0
-#define PT_AR_CSD		0x09c0
-#define PT_AR_SSD		0x09c8
-
-#define PT_DBR			0x2000	/* data breakpoint registers */
-#define PT_IBR			0x3000	/* instruction breakpoint registers */
-#define PT_PMD			0x4000	/* performance monitoring counters */
-
-#endif /* _ASM_IA64_PTRACE_OFFSETS_H */
diff --git a/arch/ia64/include/uapi/asm/resource.h b/arch/ia64/include/uapi/asm/resource.h
deleted file mode 100644
index d488d2b22ac4..000000000000
--- a/arch/ia64/include/uapi/asm/resource.h
+++ /dev/null
@@ -1,8 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
-#ifndef _ASM_IA64_RESOURCE_H
-#define _ASM_IA64_RESOURCE_H
-
-#include <asm/ustack.h>
-#include <asm-generic/resource.h>
-
-#endif /* _ASM_IA64_RESOURCE_H */
diff --git a/arch/ia64/include/uapi/asm/rse.h b/arch/ia64/include/uapi/asm/rse.h
deleted file mode 100644
index 6d260af571c5..000000000000
--- a/arch/ia64/include/uapi/asm/rse.h
+++ /dev/null
@@ -1,67 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
-#ifndef _ASM_IA64_RSE_H
-#define _ASM_IA64_RSE_H
-
-/*
- * Copyright (C) 1998, 1999 Hewlett-Packard Co
- * Copyright (C) 1998, 1999 David Mosberger-Tang <davidm@hpl.hp.com>
- *
- * Register stack engine related helper functions.  This file may be
- * used in applications, so be careful about the name-space and give
- * some consideration to non-GNU C compilers (though __inline__ is
- * fine).
- */
-
-static __inline__ unsigned long
-ia64_rse_slot_num (unsigned long *addr)
-{
-	return (((unsigned long) addr) >> 3) & 0x3f;
-}
-
-/*
- * Return TRUE if ADDR is the address of an RNAT slot.
- */
-static __inline__ unsigned long
-ia64_rse_is_rnat_slot (unsigned long *addr)
-{
-	return ia64_rse_slot_num(addr) == 0x3f;
-}
-
-/*
- * Returns the address of the RNAT slot that covers the slot at
- * address SLOT_ADDR.
- */
-static __inline__ unsigned long *
-ia64_rse_rnat_addr (unsigned long *slot_addr)
-{
-	return (unsigned long *) ((unsigned long) slot_addr | (0x3f << 3));
-}
-
-/*
- * Calculate the number of registers in the dirty partition starting at BSPSTORE and
- * ending at BSP.  This isn't simply (BSP-BSPSTORE)/8 because every 64th slot stores
- * ar.rnat.
- */
-static __inline__ unsigned long
-ia64_rse_num_regs (unsigned long *bspstore, unsigned long *bsp)
-{
-	unsigned long slots = (bsp - bspstore);
-
-	return slots - (ia64_rse_slot_num(bspstore) + slots)/0x40;
-}
-
-/*
- * The inverse of the above: given bspstore and the number of
- * registers, calculate ar.bsp.
- */
-static __inline__ unsigned long *
-ia64_rse_skip_regs (unsigned long *addr, long num_regs)
-{
-	long delta = ia64_rse_slot_num(addr) + num_regs;
-
-	if (num_regs < 0)
-		delta -= 0x3e;
-	return addr + num_regs + delta/0x3f;
-}
-
-#endif /* _ASM_IA64_RSE_H */
diff --git a/arch/ia64/include/uapi/asm/setup.h b/arch/ia64/include/uapi/asm/setup.h
deleted file mode 100644
index 8d13ce8fb03a..000000000000
--- a/arch/ia64/include/uapi/asm/setup.h
+++ /dev/null
@@ -1,25 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
-#ifndef __IA64_SETUP_H
-#define __IA64_SETUP_H
-
-#define COMMAND_LINE_SIZE	2048
-
-extern struct ia64_boot_param {
-	__u64 command_line;		/* physical address of command line arguments */
-	__u64 efi_systab;		/* physical address of EFI system table */
-	__u64 efi_memmap;		/* physical address of EFI memory map */
-	__u64 efi_memmap_size;		/* size of EFI memory map */
-	__u64 efi_memdesc_size;		/* size of an EFI memory map descriptor */
-	__u32 efi_memdesc_version;	/* memory descriptor version */
-	struct {
-		__u16 num_cols;	/* number of columns on console output device */
-		__u16 num_rows;	/* number of rows on console output device */
-		__u16 orig_x;	/* cursor's x position */
-		__u16 orig_y;	/* cursor's y position */
-	} console_info;
-	__u64 fpswa;		/* physical address of the fpswa interface */
-	__u64 initrd_start;
-	__u64 initrd_size;
-} *ia64_boot_param;
-
-#endif
diff --git a/arch/ia64/include/uapi/asm/sigcontext.h b/arch/ia64/include/uapi/asm/sigcontext.h
deleted file mode 100644
index 1bb6f0f2bd73..000000000000
--- a/arch/ia64/include/uapi/asm/sigcontext.h
+++ /dev/null
@@ -1,71 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
-#ifndef _ASM_IA64_SIGCONTEXT_H
-#define _ASM_IA64_SIGCONTEXT_H
-
-/*
- * Copyright (C) 1998, 1999, 2001 Hewlett-Packard Co
- * Copyright (C) 1998, 1999, 2001 David Mosberger-Tang <davidm@hpl.hp.com>
- */
-
-#include <asm/fpu.h>
-
-#define IA64_SC_FLAG_ONSTACK_BIT		0	/* is handler running on signal stack? */
-#define IA64_SC_FLAG_IN_SYSCALL_BIT		1	/* did signal interrupt a syscall? */
-#define IA64_SC_FLAG_FPH_VALID_BIT		2	/* is state in f[32]-f[127] valid? */
-
-#define IA64_SC_FLAG_ONSTACK		(1 << IA64_SC_FLAG_ONSTACK_BIT)
-#define IA64_SC_FLAG_IN_SYSCALL		(1 << IA64_SC_FLAG_IN_SYSCALL_BIT)
-#define IA64_SC_FLAG_FPH_VALID		(1 << IA64_SC_FLAG_FPH_VALID_BIT)
-
-# ifndef __ASSEMBLY__
-
-/*
- * Note on handling of register backing store: sc_ar_bsp contains the address that would
- * be found in ar.bsp after executing a "cover" instruction the context in which the
- * signal was raised.  If signal delivery required switching to an alternate signal stack
- * (sc_rbs_base is not NULL), the "dirty" partition (as it would exist after executing the
- * imaginary "cover" instruction) is backed by the *alternate* signal stack, not the
- * original one.  In this case, sc_rbs_base contains the base address of the new register
- * backing store.  The number of registers in the dirty partition can be calculated as:
- *
- *   ndirty = ia64_rse_num_regs(sc_rbs_base, sc_rbs_base + (sc_loadrs >> 16))
- *
- */
-
-struct sigcontext {
-	unsigned long		sc_flags;	/* see manifest constants above */
-	unsigned long		sc_nat;		/* bit i == 1 iff scratch reg gr[i] is a NaT */
-	stack_t			sc_stack;	/* previously active stack */
-
-	unsigned long		sc_ip;		/* instruction pointer */
-	unsigned long		sc_cfm;		/* current frame marker */
-	unsigned long		sc_um;		/* user mask bits */
-	unsigned long		sc_ar_rsc;	/* register stack configuration register */
-	unsigned long		sc_ar_bsp;	/* backing store pointer */
-	unsigned long		sc_ar_rnat;	/* RSE NaT collection register */
-	unsigned long		sc_ar_ccv;	/* compare and exchange compare value register */
-	unsigned long		sc_ar_unat;	/* ar.unat of interrupted context */
-	unsigned long		sc_ar_fpsr;	/* floating-point status register */
-	unsigned long		sc_ar_pfs;	/* previous function state */
-	unsigned long		sc_ar_lc;	/* loop count register */
-	unsigned long		sc_pr;		/* predicate registers */
-	unsigned long		sc_br[8];	/* branch registers */
-	/* Note: sc_gr[0] is used as the "uc_link" member of ucontext_t */
-	unsigned long		sc_gr[32];	/* general registers (static partition) */
-	struct ia64_fpreg	sc_fr[128];	/* floating-point registers */
-
-	unsigned long		sc_rbs_base;	/* NULL or new base of sighandler's rbs */
-	unsigned long		sc_loadrs;	/* see description above */
-
-	unsigned long		sc_ar25;	/* cmp8xchg16 uses this */
-	unsigned long		sc_ar26;	/* rsvd for scratch use */
-	unsigned long		sc_rsvd[12];	/* reserved for future use */
-	/*
-	 * The mask must come last so we can increase _NSIG_WORDS
-	 * without breaking binary compatibility.
-	 */
-	sigset_t		sc_mask;	/* signal mask to restore after handler returns */
-};
-
-# endif /* __ASSEMBLY__ */
-#endif /* _ASM_IA64_SIGCONTEXT_H */
diff --git a/arch/ia64/include/uapi/asm/siginfo.h b/arch/ia64/include/uapi/asm/siginfo.h
deleted file mode 100644
index 796af1ccaa7e..000000000000
--- a/arch/ia64/include/uapi/asm/siginfo.h
+++ /dev/null
@@ -1,28 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
-/*
- * Based on <asm-i386/siginfo.h>.
- *
- * Modified 1998-2002
- *	David Mosberger-Tang <davidm@hpl.hp.com>, Hewlett-Packard Co
- */
-#ifndef _UAPI_ASM_IA64_SIGINFO_H
-#define _UAPI_ASM_IA64_SIGINFO_H
-
-
-#include <asm-generic/siginfo.h>
-
-#define si_imm		_sifields._sigfault._imm	/* as per UNIX SysV ABI spec */
-#define si_flags	_sifields._sigfault._flags
-/*
- * si_isr is valid for SIGILL, SIGFPE, SIGSEGV, SIGBUS, and SIGTRAP provided that
- * si_code is non-zero and __ISR_VALID is set in si_flags.
- */
-#define si_isr		_sifields._sigfault._isr
-
-/*
- * Flag values for si_flags:
- */
-#define __ISR_VALID_BIT	0
-#define __ISR_VALID	(1 << __ISR_VALID_BIT)
-
-#endif /* _UAPI_ASM_IA64_SIGINFO_H */
diff --git a/arch/ia64/include/uapi/asm/signal.h b/arch/ia64/include/uapi/asm/signal.h
deleted file mode 100644
index 63d574e802a2..000000000000
--- a/arch/ia64/include/uapi/asm/signal.h
+++ /dev/null
@@ -1,98 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
-/*
- * Modified 1998-2001, 2003
- *	David Mosberger-Tang <davidm@hpl.hp.com>, Hewlett-Packard Co
- *
- * Unfortunately, this file is being included by bits/signal.h in
- * glibc-2.x.  Hence the #ifdef __KERNEL__ ugliness.
- */
-#ifndef _UAPI_ASM_IA64_SIGNAL_H
-#define _UAPI_ASM_IA64_SIGNAL_H
-
-
-#define SIGHUP		 1
-#define SIGINT		 2
-#define SIGQUIT		 3
-#define SIGILL		 4
-#define SIGTRAP		 5
-#define SIGABRT		 6
-#define SIGIOT		 6
-#define SIGBUS		 7
-#define SIGFPE		 8
-#define SIGKILL		 9
-#define SIGUSR1		10
-#define SIGSEGV		11
-#define SIGUSR2		12
-#define SIGPIPE		13
-#define SIGALRM		14
-#define SIGTERM		15
-#define SIGSTKFLT	16
-#define SIGCHLD		17
-#define SIGCONT		18
-#define SIGSTOP		19
-#define SIGTSTP		20
-#define SIGTTIN		21
-#define SIGTTOU		22
-#define SIGURG		23
-#define SIGXCPU		24
-#define SIGXFSZ		25
-#define SIGVTALRM	26
-#define SIGPROF		27
-#define SIGWINCH	28
-#define SIGIO		29
-#define SIGPOLL		SIGIO
-/*
-#define SIGLOST		29
-*/
-#define SIGPWR		30
-#define SIGSYS		31
-/* signal 31 is no longer "unused", but the SIGUNUSED macro remains for backwards compatibility */
-#define	SIGUNUSED	31
-
-/* These should not be considered constants from userland.  */
-#define SIGRTMIN	32
-#define SIGRTMAX	_NSIG
-
-#define SA_RESTORER	0x04000000
-
-/*
- * The minimum stack size needs to be fairly large because we want to
- * be sure that an app compiled for today's CPUs will continue to run
- * on all future CPU models.  The CPU model matters because the signal
- * frame needs to have space for the complete machine state, including
- * all physical stacked registers.  The number of physical stacked
- * registers is CPU model dependent, but given that the width of
- * ar.rsc.loadrs is 14 bits, we can assume that they'll never take up
- * more than 16KB of space.
- */
-#if 1
-  /*
-   * This is a stupid typo: the value was _meant_ to be 131072 (0x20000), but I typed it
-   * in wrong. ;-(  To preserve backwards compatibility, we leave the kernel at the
-   * incorrect value and fix libc only.
-   */
-# define MINSIGSTKSZ	131027	/* min. stack size for sigaltstack() */
-#else
-# define MINSIGSTKSZ	131072	/* min. stack size for sigaltstack() */
-#endif
-#define SIGSTKSZ	262144	/* default stack size for sigaltstack() */
-
-
-#include <asm-generic/signal-defs.h>
-
-# ifndef __ASSEMBLY__
-
-#  include <linux/types.h>
-
-/* Avoid too many header ordering problems.  */
-struct siginfo;
-
-typedef struct sigaltstack {
-	void __user *ss_sp;
-	int ss_flags;
-	__kernel_size_t ss_size;
-} stack_t;
-
-
-# endif /* !__ASSEMBLY__ */
-#endif /* _UAPI_ASM_IA64_SIGNAL_H */
diff --git a/arch/ia64/include/uapi/asm/stat.h b/arch/ia64/include/uapi/asm/stat.h
deleted file mode 100644
index 3265ed5aac0f..000000000000
--- a/arch/ia64/include/uapi/asm/stat.h
+++ /dev/null
@@ -1,52 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
-#ifndef _ASM_IA64_STAT_H
-#define _ASM_IA64_STAT_H
-
-/*
- * Modified 1998, 1999
- *	David Mosberger-Tang <davidm@hpl.hp.com>, Hewlett-Packard Co
- */
-
-struct stat {
-	unsigned long	st_dev;
-	unsigned long	st_ino;
-	unsigned long	st_nlink;
-	unsigned int	st_mode;
-	unsigned int	st_uid;
-	unsigned int	st_gid;
-	unsigned int	__pad0;
-	unsigned long	st_rdev;
-	unsigned long	st_size;
-	unsigned long	st_atime;
-	unsigned long	st_atime_nsec;
-	unsigned long	st_mtime;
-	unsigned long	st_mtime_nsec;
-	unsigned long	st_ctime;
-	unsigned long	st_ctime_nsec;
-	unsigned long	st_blksize;
-	long		st_blocks;
-	unsigned long	__unused[3];
-};
-
-#define STAT_HAVE_NSEC 1
-
-struct ia64_oldstat {
-	unsigned int	st_dev;
-	unsigned int	st_ino;
-	unsigned int	st_mode;
-	unsigned int	st_nlink;
-	unsigned int	st_uid;
-	unsigned int	st_gid;
-	unsigned int	st_rdev;
-	unsigned int	__pad1;
-	unsigned long	st_size;
-	unsigned long	st_atime;
-	unsigned long	st_mtime;
-	unsigned long	st_ctime;
-	unsigned int	st_blksize;
-	int		st_blocks;
-	unsigned int	__unused1;
-	unsigned int	__unused2;
-};
-
-#endif /* _ASM_IA64_STAT_H */
diff --git a/arch/ia64/include/uapi/asm/statfs.h b/arch/ia64/include/uapi/asm/statfs.h
deleted file mode 100644
index de3bae4f137d..000000000000
--- a/arch/ia64/include/uapi/asm/statfs.h
+++ /dev/null
@@ -1,21 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
-#ifndef _ASM_IA64_STATFS_H
-#define _ASM_IA64_STATFS_H
-
-/*
- * Based on <asm-i386/statfs.h>.
- *
- * Modified 1998, 1999, 2003
- *	David Mosberger-Tang <davidm@hpl.hp.com>, Hewlett-Packard Co
- */
-
-/*
- * We need compat_statfs64 to be packed, because the i386 ABI won't
- * add padding at the end to bring it to a multiple of 8 bytes, but
- * the IA64 ABI will.
- */
-#define ARCH_PACK_COMPAT_STATFS64 __attribute__((packed,aligned(4)))
-
-#include <asm-generic/statfs.h>
-
-#endif /* _ASM_IA64_STATFS_H */
diff --git a/arch/ia64/include/uapi/asm/swab.h b/arch/ia64/include/uapi/asm/swab.h
deleted file mode 100644
index 79f3fef1a05e..000000000000
--- a/arch/ia64/include/uapi/asm/swab.h
+++ /dev/null
@@ -1,35 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
-#ifndef _ASM_IA64_SWAB_H
-#define _ASM_IA64_SWAB_H
-
-/*
- * Modified 1998, 1999
- *	David Mosberger-Tang <davidm@hpl.hp.com>, Hewlett-Packard Co.
- */
-
-#include <linux/types.h>
-#include <asm/intrinsics.h>
-#include <linux/compiler.h>
-
-static __inline__ __attribute_const__ __u64 __arch_swab64(__u64 x)
-{
-	__u64 result;
-
-	result = ia64_mux1(x, ia64_mux1_rev);
-	return result;
-}
-#define __arch_swab64 __arch_swab64
-
-static __inline__ __attribute_const__ __u32 __arch_swab32(__u32 x)
-{
-	return __arch_swab64(x) >> 32;
-}
-#define __arch_swab32 __arch_swab32
-
-static __inline__ __attribute_const__ __u16 __arch_swab16(__u16 x)
-{
-	return __arch_swab64(x) >> 48;
-}
-#define __arch_swab16 __arch_swab16
-
-#endif /* _ASM_IA64_SWAB_H */
diff --git a/arch/ia64/include/uapi/asm/types.h b/arch/ia64/include/uapi/asm/types.h
deleted file mode 100644
index 2000de474be6..000000000000
--- a/arch/ia64/include/uapi/asm/types.h
+++ /dev/null
@@ -1,32 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
-/*
- * This file is never included by application software unless explicitly
- * requested (e.g., via linux/types.h) in which case the application is
- * Linux specific so (user-) name space pollution is not a major issue.
- * However, for interoperability, libraries still need to be careful to
- * avoid naming clashes.
- *
- * Based on <asm-alpha/types.h>.
- *
- * Modified 1998-2000, 2002
- *	David Mosberger-Tang <davidm@hpl.hp.com>, Hewlett-Packard Co
- */
-#ifndef _UAPI_ASM_IA64_TYPES_H
-#define _UAPI_ASM_IA64_TYPES_H
-
-
-#ifndef __KERNEL__
-#include <asm-generic/int-l64.h>
-#endif
-
-#ifdef __ASSEMBLY__
-# define __IA64_UL(x)		(x)
-# define __IA64_UL_CONST(x)	x
-
-#else
-# define __IA64_UL(x)		((unsigned long)(x))
-# define __IA64_UL_CONST(x)	x##UL
-
-#endif /* !__ASSEMBLY__ */
-
-#endif /* _UAPI_ASM_IA64_TYPES_H */
diff --git a/arch/ia64/include/uapi/asm/ucontext.h b/arch/ia64/include/uapi/asm/ucontext.h
deleted file mode 100644
index 46f51e535e04..000000000000
--- a/arch/ia64/include/uapi/asm/ucontext.h
+++ /dev/null
@@ -1,13 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
-#ifndef _ASM_IA64_UCONTEXT_H
-#define _ASM_IA64_UCONTEXT_H
-
-struct ucontext {
-	struct sigcontext uc_mcontext;
-};
-
-#define uc_link		uc_mcontext.sc_gr[0]	/* wrong type; nobody cares */
-#define uc_sigmask	uc_mcontext.sc_sigmask
-#define uc_stack	uc_mcontext.sc_stack
-
-#endif /* _ASM_IA64_UCONTEXT_H */
diff --git a/arch/ia64/include/uapi/asm/unistd.h b/arch/ia64/include/uapi/asm/unistd.h
deleted file mode 100644
index 013e0bcacc39..000000000000
--- a/arch/ia64/include/uapi/asm/unistd.h
+++ /dev/null
@@ -1,22 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
-/*
- * IA-64 Linux syscall numbers and inline-functions.
- *
- * Copyright (C) 1998-2005 Hewlett-Packard Co
- *	David Mosberger-Tang <davidm@hpl.hp.com>
- */
-#ifndef _UAPI_ASM_IA64_UNISTD_H
-#define _UAPI_ASM_IA64_UNISTD_H
-
-
-#include <asm/break.h>
-
-#define __BREAK_SYSCALL	__IA64_BREAK_SYSCALL
-
-#define __NR_Linux      1024
-
-#define __NR_umount __NR_umount2
-
-#include <asm/unistd_64.h>
-
-#endif /* _UAPI_ASM_IA64_UNISTD_H */
diff --git a/arch/ia64/include/uapi/asm/ustack.h b/arch/ia64/include/uapi/asm/ustack.h
deleted file mode 100644
index 703cc5f546ff..000000000000
--- a/arch/ia64/include/uapi/asm/ustack.h
+++ /dev/null
@@ -1,13 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
-#ifndef _UAPI_ASM_IA64_USTACK_H
-#define _UAPI_ASM_IA64_USTACK_H
-
-/*
- * Constants for the user stack size
- */
-
-
-/* Make a default stack size of 2GiB */
-#define DEFAULT_USER_STACK_SIZE	(1UL << 31)
-
-#endif /* _UAPI_ASM_IA64_USTACK_H */
diff --git a/arch/ia64/install.sh b/arch/ia64/install.sh
deleted file mode 100755
index 2d4b66a9f362..000000000000
--- a/arch/ia64/install.sh
+++ /dev/null
@@ -1,30 +0,0 @@
-#!/bin/sh
-#
-# This file is subject to the terms and conditions of the GNU General Public
-# License.  See the file "COPYING" in the main directory of this archive
-# for more details.
-#
-# Copyright (C) 1995 by Linus Torvalds
-#
-# Adapted from code in arch/i386/boot/Makefile by H. Peter Anvin
-#
-# "make install" script for ia64 architecture
-#
-# Arguments:
-#   $1 - kernel version
-#   $2 - kernel image file
-#   $3 - kernel map file
-#   $4 - default install path (blank if root directory)
-
-if [ -f $4/vmlinuz ]; then
-	mv $4/vmlinuz $4/vmlinuz.old
-fi
-
-if [ -f $4/System.map ]; then
-	mv $4/System.map $4/System.old
-fi
-
-cat $2 > $4/vmlinuz
-cp $3 $4/System.map
-
-test -x /usr/sbin/elilo && /usr/sbin/elilo
diff --git a/arch/ia64/kernel/.gitignore b/arch/ia64/kernel/.gitignore
deleted file mode 100644
index 0374827206e7..000000000000
--- a/arch/ia64/kernel/.gitignore
+++ /dev/null
@@ -1,3 +0,0 @@
-# SPDX-License-Identifier: GPL-2.0-only
-gate.lds
-vmlinux.lds
diff --git a/arch/ia64/kernel/Makefile b/arch/ia64/kernel/Makefile
deleted file mode 100644
index d7e1cabee2ec..000000000000
--- a/arch/ia64/kernel/Makefile
+++ /dev/null
@@ -1,46 +0,0 @@
-# SPDX-License-Identifier: GPL-2.0
-#
-# Makefile for the linux kernel.
-#
-
-ifdef CONFIG_DYNAMIC_FTRACE
-CFLAGS_REMOVE_ftrace.o = -pg
-endif
-
-extra-y	:= vmlinux.lds
-
-obj-y := head.o entry.o efi.o efi_stub.o gate-data.o fsys.o irq.o irq_ia64.o	\
-	 irq_lsapic.o ivt.o pal.o patch.o process.o ptrace.o sal.o		\
-	 salinfo.o setup.o signal.o sys_ia64.o time.o traps.o unaligned.o \
-	 unwind.o mca.o mca_asm.o topology.o dma-mapping.o iosapic.o acpi.o \
-	 acpi-ext.o
-
-obj-$(CONFIG_IA64_BRL_EMU)	+= brl_emu.o
-
-obj-$(CONFIG_IA64_PALINFO)	+= palinfo.o
-obj-$(CONFIG_MODULES)		+= module.o
-obj-$(CONFIG_SMP)		+= smp.o smpboot.o
-obj-$(CONFIG_NUMA)		+= numa.o
-obj-$(CONFIG_IA64_CYCLONE)	+= cyclone.o
-obj-$(CONFIG_IA64_MCA_RECOVERY)	+= mca_recovery.o
-obj-$(CONFIG_KPROBES)		+= kprobes.o
-obj-$(CONFIG_DYNAMIC_FTRACE)	+= ftrace.o
-obj-$(CONFIG_KEXEC)		+= machine_kexec.o relocate_kernel.o crash.o
-obj-$(CONFIG_CRASH_DUMP)	+= crash_dump.o
-obj-$(CONFIG_IA64_UNCACHED_ALLOCATOR)	+= uncached.o
-obj-$(CONFIG_AUDIT)		+= audit.o
-obj-y				+= msi_ia64.o
-mca_recovery-y			+= mca_drv.o mca_drv_asm.o
-obj-$(CONFIG_IA64_MC_ERR_INJECT)+= err_inject.o
-obj-$(CONFIG_STACKTRACE)	+= stacktrace.o
-
-obj-$(CONFIG_IA64_ESI)		+= esi.o esi_stub.o # must be in kernel proper
-obj-$(CONFIG_INTEL_IOMMU)	+= pci-dma.o
-
-obj-$(CONFIG_ELF_CORE)		+= elfcore.o
-
-# fp_emulate() expects f2-f5,f16-f31 to contain the user-level state.
-CFLAGS_traps.o  += -mfixed-range=f2-f5,f16-f31
-
-# The gate DSO image is built using a special linker script.
-include $(srctree)/$(src)/Makefile.gate
diff --git a/arch/ia64/kernel/Makefile.gate b/arch/ia64/kernel/Makefile.gate
deleted file mode 100644
index 846867bff6d6..000000000000
--- a/arch/ia64/kernel/Makefile.gate
+++ /dev/null
@@ -1,29 +0,0 @@
-# SPDX-License-Identifier: GPL-2.0
-# The gate DSO image is built using a special linker script.
-
-targets += gate.so gate.lds gate.o gate-dummy.o
-
-obj-y += gate-syms.o
-
-CPPFLAGS_gate.lds := -P -C -U$(ARCH)
-
-quiet_cmd_gate = GATE    $@
-      cmd_gate = $(CC) -nostdlib $(GATECFLAGS_$(@F)) -Wl,-T,$(filter-out FORCE,$^) -o $@
-
-GATECFLAGS_gate.so = -shared -s -Wl,-soname=linux-gate.so.1 \
-		     -Wl,--hash-style=sysv
-$(obj)/gate.so: $(obj)/gate.lds $(obj)/gate.o FORCE
-	$(call if_changed,gate)
-
-GATECFLAGS_gate-dummy.o = -r
-$(obj)/gate-dummy.o: $(obj)/gate.lds $(obj)/gate.o FORCE
-	$(call if_changed,gate)
-
-LDFLAGS_gate-syms.o := -r -R
-$(obj)/gate-syms.o: $(obj)/gate-dummy.o FORCE
-	$(call if_changed,ld)
-
-# gate-data.o contains the gate DSO image as data in section .data..gate.
-# We must build gate.so before we can assemble it.
-# Note: kbuild does not track this dependency due to usage of .incbin
-$(obj)/gate-data.o: $(obj)/gate.so
diff --git a/arch/ia64/kernel/acpi-ext.c b/arch/ia64/kernel/acpi-ext.c
deleted file mode 100644
index 42cd21480833..000000000000
--- a/arch/ia64/kernel/acpi-ext.c
+++ /dev/null
@@ -1,101 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * (c) Copyright 2003, 2006 Hewlett-Packard Development Company, L.P.
- *	Alex Williamson <alex.williamson@hp.com>
- *	Bjorn Helgaas <bjorn.helgaas@hp.com>
- */
-
-#include <linux/module.h>
-#include <linux/types.h>
-#include <linux/slab.h>
-#include <linux/acpi.h>
-
-#include <asm/acpi-ext.h>
-
-/*
- * Device CSRs that do not appear in PCI config space should be described
- * via ACPI.  This would normally be done with Address Space Descriptors
- * marked as "consumer-only," but old versions of Windows and Linux ignore
- * the producer/consumer flag, so HP invented a vendor-defined resource to
- * describe the location and size of CSR space.
- */
-
-struct acpi_vendor_uuid hp_ccsr_uuid = {
-	.subtype = 2,
-	.data = { 0xf9, 0xad, 0xe9, 0x69, 0x4f, 0x92, 0x5f, 0xab, 0xf6, 0x4a,
-	    0x24, 0xd2, 0x01, 0x37, 0x0e, 0xad },
-};
-
-static acpi_status hp_ccsr_locate(acpi_handle obj, u64 *base, u64 *length)
-{
-	acpi_status status;
-	struct acpi_buffer buffer = { ACPI_ALLOCATE_BUFFER, NULL };
-	struct acpi_resource *resource;
-	struct acpi_resource_vendor_typed *vendor;
-
-	status = acpi_get_vendor_resource(obj, METHOD_NAME__CRS, &hp_ccsr_uuid,
-		&buffer);
-
-	resource = buffer.pointer;
-	vendor = &resource->data.vendor_typed;
-
-	if (ACPI_FAILURE(status) || vendor->byte_length < 16) {
-		status = AE_NOT_FOUND;
-		goto exit;
-	}
-
-	memcpy(base, vendor->byte_data, sizeof(*base));
-	memcpy(length, vendor->byte_data + 8, sizeof(*length));
-
-  exit:
-	kfree(buffer.pointer);
-	return status;
-}
-
-struct csr_space {
-	u64	base;
-	u64	length;
-};
-
-static acpi_status find_csr_space(struct acpi_resource *resource, void *data)
-{
-	struct csr_space *space = data;
-	struct acpi_resource_address64 addr;
-	acpi_status status;
-
-	status = acpi_resource_to_address64(resource, &addr);
-	if (ACPI_SUCCESS(status) &&
-	    addr.resource_type == ACPI_MEMORY_RANGE &&
-	    addr.address.address_length &&
-	    addr.producer_consumer == ACPI_CONSUMER) {
-		space->base = addr.address.minimum;
-		space->length = addr.address.address_length;
-		return AE_CTRL_TERMINATE;
-	}
-	return AE_OK;		/* keep looking */
-}
-
-static acpi_status hp_crs_locate(acpi_handle obj, u64 *base, u64 *length)
-{
-	struct csr_space space = { 0, 0 };
-
-	acpi_walk_resources(obj, METHOD_NAME__CRS, find_csr_space, &space);
-	if (!space.length)
-		return AE_NOT_FOUND;
-
-	*base = space.base;
-	*length = space.length;
-	return AE_OK;
-}
-
-acpi_status hp_acpi_csr_space(acpi_handle obj, u64 *csr_base, u64 *csr_length)
-{
-	acpi_status status;
-
-	status = hp_ccsr_locate(obj, csr_base, csr_length);
-	if (ACPI_SUCCESS(status))
-		return status;
-
-	return hp_crs_locate(obj, csr_base, csr_length);
-}
-EXPORT_SYMBOL(hp_acpi_csr_space);
diff --git a/arch/ia64/kernel/acpi.c b/arch/ia64/kernel/acpi.c
deleted file mode 100644
index 41e8fe55cd98..000000000000
--- a/arch/ia64/kernel/acpi.c
+++ /dev/null
@@ -1,913 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-or-later
-/*
- *  acpi.c - Architecture-Specific Low-Level ACPI Support
- *
- *  Copyright (C) 1999 VA Linux Systems
- *  Copyright (C) 1999,2000 Walt Drummond <drummond@valinux.com>
- *  Copyright (C) 2000, 2002-2003 Hewlett-Packard Co.
- *	David Mosberger-Tang <davidm@hpl.hp.com>
- *  Copyright (C) 2000 Intel Corp.
- *  Copyright (C) 2000,2001 J.I. Lee <jung-ik.lee@intel.com>
- *  Copyright (C) 2001 Paul Diefenbaugh <paul.s.diefenbaugh@intel.com>
- *  Copyright (C) 2001 Jenna Hall <jenna.s.hall@intel.com>
- *  Copyright (C) 2001 Takayoshi Kochi <t-kochi@bq.jp.nec.com>
- *  Copyright (C) 2002 Erich Focht <efocht@ess.nec.de>
- *  Copyright (C) 2004 Ashok Raj <ashok.raj@intel.com>
- */
-
-#include <linux/module.h>
-#include <linux/init.h>
-#include <linux/kernel.h>
-#include <linux/sched.h>
-#include <linux/smp.h>
-#include <linux/string.h>
-#include <linux/types.h>
-#include <linux/irq.h>
-#include <linux/acpi.h>
-#include <linux/efi.h>
-#include <linux/mmzone.h>
-#include <linux/nodemask.h>
-#include <linux/slab.h>
-#include <acpi/processor.h>
-#include <asm/io.h>
-#include <asm/iosapic.h>
-#include <asm/page.h>
-#include <asm/numa.h>
-#include <asm/sal.h>
-#include <asm/cyclone.h>
-
-#define PREFIX			"ACPI: "
-
-int acpi_lapic;
-unsigned int acpi_cpei_override;
-unsigned int acpi_cpei_phys_cpuid;
-
-#define ACPI_MAX_PLATFORM_INTERRUPTS	256
-
-/* Array to record platform interrupt vectors for generic interrupt routing. */
-int platform_intr_list[ACPI_MAX_PLATFORM_INTERRUPTS] = {
-	[0 ... ACPI_MAX_PLATFORM_INTERRUPTS - 1] = -1
-};
-
-enum acpi_irq_model_id acpi_irq_model = ACPI_IRQ_MODEL_IOSAPIC;
-
-/*
- * Interrupt routing API for device drivers.  Provides interrupt vector for
- * a generic platform event.  Currently only CPEI is implemented.
- */
-int acpi_request_vector(u32 int_type)
-{
-	int vector = -1;
-
-	if (int_type < ACPI_MAX_PLATFORM_INTERRUPTS) {
-		/* corrected platform error interrupt */
-		vector = platform_intr_list[int_type];
-	} else
-		printk(KERN_ERR
-		       "acpi_request_vector(): invalid interrupt type\n");
-	return vector;
-}
-
-void __init __iomem *__acpi_map_table(unsigned long phys, unsigned long size)
-{
-	return __va(phys);
-}
-
-void __init __acpi_unmap_table(void __iomem *map, unsigned long size)
-{
-}
-
-/* --------------------------------------------------------------------------
-                            Boot-time Table Parsing
-   -------------------------------------------------------------------------- */
-
-static int available_cpus __initdata;
-struct acpi_table_madt *acpi_madt __initdata;
-static u8 has_8259;
-
-static int __init
-acpi_parse_lapic_addr_ovr(union acpi_subtable_headers * header,
-			  const unsigned long end)
-{
-	struct acpi_madt_local_apic_override *lapic;
-
-	lapic = (struct acpi_madt_local_apic_override *)header;
-
-	if (BAD_MADT_ENTRY(lapic, end))
-		return -EINVAL;
-
-	if (lapic->address) {
-		iounmap(ipi_base_addr);
-		ipi_base_addr = ioremap(lapic->address, 0);
-	}
-	return 0;
-}
-
-static int __init
-acpi_parse_lsapic(union acpi_subtable_headers *header, const unsigned long end)
-{
-	struct acpi_madt_local_sapic *lsapic;
-
-	lsapic = (struct acpi_madt_local_sapic *)header;
-
-	/*Skip BAD_MADT_ENTRY check, as lsapic size could vary */
-
-	if (lsapic->lapic_flags & ACPI_MADT_ENABLED) {
-#ifdef CONFIG_SMP
-		smp_boot_data.cpu_phys_id[available_cpus] =
-		    (lsapic->id << 8) | lsapic->eid;
-#endif
-		++available_cpus;
-	}
-
-	total_cpus++;
-	return 0;
-}
-
-static int __init
-acpi_parse_lapic_nmi(union acpi_subtable_headers * header, const unsigned long end)
-{
-	struct acpi_madt_local_apic_nmi *lacpi_nmi;
-
-	lacpi_nmi = (struct acpi_madt_local_apic_nmi *)header;
-
-	if (BAD_MADT_ENTRY(lacpi_nmi, end))
-		return -EINVAL;
-
-	/* TBD: Support lapic_nmi entries */
-	return 0;
-}
-
-static int __init
-acpi_parse_iosapic(union acpi_subtable_headers * header, const unsigned long end)
-{
-	struct acpi_madt_io_sapic *iosapic;
-
-	iosapic = (struct acpi_madt_io_sapic *)header;
-
-	if (BAD_MADT_ENTRY(iosapic, end))
-		return -EINVAL;
-
-	return iosapic_init(iosapic->address, iosapic->global_irq_base);
-}
-
-static unsigned int __initdata acpi_madt_rev;
-
-static int __init
-acpi_parse_plat_int_src(union acpi_subtable_headers * header,
-			const unsigned long end)
-{
-	struct acpi_madt_interrupt_source *plintsrc;
-	int vector;
-
-	plintsrc = (struct acpi_madt_interrupt_source *)header;
-
-	if (BAD_MADT_ENTRY(plintsrc, end))
-		return -EINVAL;
-
-	/*
-	 * Get vector assignment for this interrupt, set attributes,
-	 * and program the IOSAPIC routing table.
-	 */
-	vector = iosapic_register_platform_intr(plintsrc->type,
-						plintsrc->global_irq,
-						plintsrc->io_sapic_vector,
-						plintsrc->eid,
-						plintsrc->id,
-						((plintsrc->inti_flags & ACPI_MADT_POLARITY_MASK) ==
-						 ACPI_MADT_POLARITY_ACTIVE_HIGH) ?
-						IOSAPIC_POL_HIGH : IOSAPIC_POL_LOW,
-						((plintsrc->inti_flags & ACPI_MADT_TRIGGER_MASK) ==
-						 ACPI_MADT_TRIGGER_EDGE) ?
-						IOSAPIC_EDGE : IOSAPIC_LEVEL);
-
-	platform_intr_list[plintsrc->type] = vector;
-	if (acpi_madt_rev > 1) {
-		acpi_cpei_override = plintsrc->flags & ACPI_MADT_CPEI_OVERRIDE;
-	}
-
-	/*
-	 * Save the physical id, so we can check when its being removed
-	 */
-	acpi_cpei_phys_cpuid = ((plintsrc->id << 8) | (plintsrc->eid)) & 0xffff;
-
-	return 0;
-}
-
-#ifdef CONFIG_HOTPLUG_CPU
-unsigned int can_cpei_retarget(void)
-{
-	extern int cpe_vector;
-	extern unsigned int force_cpei_retarget;
-
-	/*
-	 * Only if CPEI is supported and the override flag
-	 * is present, otherwise return that its re-targettable
-	 * if we are in polling mode.
-	 */
-	if (cpe_vector > 0) {
-		if (acpi_cpei_override || force_cpei_retarget)
-			return 1;
-		else
-			return 0;
-	}
-	return 1;
-}
-
-unsigned int is_cpu_cpei_target(unsigned int cpu)
-{
-	unsigned int logical_id;
-
-	logical_id = cpu_logical_id(acpi_cpei_phys_cpuid);
-
-	if (logical_id == cpu)
-		return 1;
-	else
-		return 0;
-}
-
-void set_cpei_target_cpu(unsigned int cpu)
-{
-	acpi_cpei_phys_cpuid = cpu_physical_id(cpu);
-}
-#endif
-
-unsigned int get_cpei_target_cpu(void)
-{
-	return acpi_cpei_phys_cpuid;
-}
-
-static int __init
-acpi_parse_int_src_ovr(union acpi_subtable_headers * header,
-		       const unsigned long end)
-{
-	struct acpi_madt_interrupt_override *p;
-
-	p = (struct acpi_madt_interrupt_override *)header;
-
-	if (BAD_MADT_ENTRY(p, end))
-		return -EINVAL;
-
-	iosapic_override_isa_irq(p->source_irq, p->global_irq,
-				 ((p->inti_flags & ACPI_MADT_POLARITY_MASK) ==
-				  ACPI_MADT_POLARITY_ACTIVE_LOW) ?
-				 IOSAPIC_POL_LOW : IOSAPIC_POL_HIGH,
-				 ((p->inti_flags & ACPI_MADT_TRIGGER_MASK) ==
-				 ACPI_MADT_TRIGGER_LEVEL) ?
-				 IOSAPIC_LEVEL : IOSAPIC_EDGE);
-	return 0;
-}
-
-static int __init
-acpi_parse_nmi_src(union acpi_subtable_headers * header, const unsigned long end)
-{
-	struct acpi_madt_nmi_source *nmi_src;
-
-	nmi_src = (struct acpi_madt_nmi_source *)header;
-
-	if (BAD_MADT_ENTRY(nmi_src, end))
-		return -EINVAL;
-
-	/* TBD: Support nimsrc entries */
-	return 0;
-}
-
-static void __init acpi_madt_oem_check(char *oem_id, char *oem_table_id)
-{
-	if (!strncmp(oem_id, "IBM", 3) && (!strncmp(oem_table_id, "SERMOW", 6))) {
-
-		/*
-		 * Unfortunately ITC_DRIFT is not yet part of the
-		 * official SAL spec, so the ITC_DRIFT bit is not
-		 * set by the BIOS on this hardware.
-		 */
-		sal_platform_features |= IA64_SAL_PLATFORM_FEATURE_ITC_DRIFT;
-
-		cyclone_setup();
-	}
-}
-
-static int __init acpi_parse_madt(struct acpi_table_header *table)
-{
-	acpi_madt = (struct acpi_table_madt *)table;
-
-	acpi_madt_rev = acpi_madt->header.revision;
-
-	/* remember the value for reference after free_initmem() */
-#ifdef CONFIG_ITANIUM
-	has_8259 = 1;		/* Firmware on old Itanium systems is broken */
-#else
-	has_8259 = acpi_madt->flags & ACPI_MADT_PCAT_COMPAT;
-#endif
-	iosapic_system_init(has_8259);
-
-	/* Get base address of IPI Message Block */
-
-	if (acpi_madt->address)
-		ipi_base_addr = ioremap(acpi_madt->address, 0);
-
-	printk(KERN_INFO PREFIX "Local APIC address %p\n", ipi_base_addr);
-
-	acpi_madt_oem_check(acpi_madt->header.oem_id,
-			    acpi_madt->header.oem_table_id);
-
-	return 0;
-}
-
-#ifdef CONFIG_ACPI_NUMA
-
-#undef SLIT_DEBUG
-
-#define PXM_FLAG_LEN ((MAX_PXM_DOMAINS + 1)/32)
-
-static int __initdata srat_num_cpus;	/* number of cpus */
-static u32 pxm_flag[PXM_FLAG_LEN];
-#define pxm_bit_set(bit)	(set_bit(bit,(void *)pxm_flag))
-#define pxm_bit_test(bit)	(test_bit(bit,(void *)pxm_flag))
-static struct acpi_table_slit __initdata *slit_table;
-cpumask_t early_cpu_possible_map = CPU_MASK_NONE;
-
-static int __init
-get_processor_proximity_domain(struct acpi_srat_cpu_affinity *pa)
-{
-	int pxm;
-
-	pxm = pa->proximity_domain_lo;
-	if (acpi_srat_revision >= 2)
-		pxm += pa->proximity_domain_hi[0] << 8;
-	return pxm;
-}
-
-static int __init
-get_memory_proximity_domain(struct acpi_srat_mem_affinity *ma)
-{
-	int pxm;
-
-	pxm = ma->proximity_domain;
-	if (acpi_srat_revision <= 1)
-		pxm &= 0xff;
-
-	return pxm;
-}
-
-/*
- * ACPI 2.0 SLIT (System Locality Information Table)
- * http://devresource.hp.com/devresource/Docs/TechPapers/IA64/slit.pdf
- */
-void __init acpi_numa_slit_init(struct acpi_table_slit *slit)
-{
-	u32 len;
-
-	len = sizeof(struct acpi_table_header) + 8
-	    + slit->locality_count * slit->locality_count;
-	if (slit->header.length != len) {
-		printk(KERN_ERR
-		       "ACPI 2.0 SLIT: size mismatch: %d expected, %d actual\n",
-		       len, slit->header.length);
-		return;
-	}
-	slit_table = slit;
-}
-
-void __init
-acpi_numa_processor_affinity_init(struct acpi_srat_cpu_affinity *pa)
-{
-	int pxm;
-
-	if (!(pa->flags & ACPI_SRAT_CPU_ENABLED))
-		return;
-
-	if (srat_num_cpus >= ARRAY_SIZE(node_cpuid)) {
-		printk_once(KERN_WARNING
-			    "node_cpuid[%ld] is too small, may not be able to use all cpus\n",
-			    ARRAY_SIZE(node_cpuid));
-		return;
-	}
-	pxm = get_processor_proximity_domain(pa);
-
-	/* record this node in proximity bitmap */
-	pxm_bit_set(pxm);
-
-	node_cpuid[srat_num_cpus].phys_id =
-	    (pa->apic_id << 8) | (pa->local_sapic_eid);
-	/* nid should be overridden as logical node id later */
-	node_cpuid[srat_num_cpus].nid = pxm;
-	cpumask_set_cpu(srat_num_cpus, &early_cpu_possible_map);
-	srat_num_cpus++;
-}
-
-int __init
-acpi_numa_memory_affinity_init(struct acpi_srat_mem_affinity *ma)
-{
-	unsigned long paddr, size;
-	int pxm;
-	struct node_memblk_s *p, *q, *pend;
-
-	pxm = get_memory_proximity_domain(ma);
-
-	/* fill node memory chunk structure */
-	paddr = ma->base_address;
-	size = ma->length;
-
-	/* Ignore disabled entries */
-	if (!(ma->flags & ACPI_SRAT_MEM_ENABLED))
-		return -1;
-
-	if (num_node_memblks >= NR_NODE_MEMBLKS) {
-		pr_err("NUMA: too many memblk ranges\n");
-		return -EINVAL;
-	}
-
-	/* record this node in proximity bitmap */
-	pxm_bit_set(pxm);
-
-	/* Insertion sort based on base address */
-	pend = &node_memblk[num_node_memblks];
-	for (p = &node_memblk[0]; p < pend; p++) {
-		if (paddr < p->start_paddr)
-			break;
-	}
-	if (p < pend) {
-		for (q = pend - 1; q >= p; q--)
-			*(q + 1) = *q;
-	}
-	p->start_paddr = paddr;
-	p->size = size;
-	p->nid = pxm;
-	num_node_memblks++;
-	return 0;
-}
-
-void __init acpi_numa_fixup(void)
-{
-	int i, j, node_from, node_to;
-
-	/* If there's no SRAT, fix the phys_id and mark node 0 online */
-	if (srat_num_cpus == 0) {
-		node_set_online(0);
-		node_cpuid[0].phys_id = hard_smp_processor_id();
-		slit_distance(0, 0) = LOCAL_DISTANCE;
-		goto out;
-	}
-
-	/*
-	 * MCD - This can probably be dropped now.  No need for pxm ID to node ID
-	 * mapping with sparse node numbering iff MAX_PXM_DOMAINS <= MAX_NUMNODES.
-	 */
-	nodes_clear(node_online_map);
-	for (i = 0; i < MAX_PXM_DOMAINS; i++) {
-		if (pxm_bit_test(i)) {
-			int nid = acpi_map_pxm_to_node(i);
-			node_set_online(nid);
-		}
-	}
-
-	/* set logical node id in memory chunk structure */
-	for (i = 0; i < num_node_memblks; i++)
-		node_memblk[i].nid = pxm_to_node(node_memblk[i].nid);
-
-	/* assign memory bank numbers for each chunk on each node */
-	for_each_online_node(i) {
-		int bank;
-
-		bank = 0;
-		for (j = 0; j < num_node_memblks; j++)
-			if (node_memblk[j].nid == i)
-				node_memblk[j].bank = bank++;
-	}
-
-	/* set logical node id in cpu structure */
-	for_each_possible_early_cpu(i)
-		node_cpuid[i].nid = pxm_to_node(node_cpuid[i].nid);
-
-	printk(KERN_INFO "Number of logical nodes in system = %d\n",
-	       num_online_nodes());
-	printk(KERN_INFO "Number of memory chunks in system = %d\n",
-	       num_node_memblks);
-
-	if (!slit_table) {
-		for (i = 0; i < MAX_NUMNODES; i++)
-			for (j = 0; j < MAX_NUMNODES; j++)
-				slit_distance(i, j) = i == j ?
-					LOCAL_DISTANCE : REMOTE_DISTANCE;
-		goto out;
-	}
-
-	memset(numa_slit, -1, sizeof(numa_slit));
-	for (i = 0; i < slit_table->locality_count; i++) {
-		if (!pxm_bit_test(i))
-			continue;
-		node_from = pxm_to_node(i);
-		for (j = 0; j < slit_table->locality_count; j++) {
-			if (!pxm_bit_test(j))
-				continue;
-			node_to = pxm_to_node(j);
-			slit_distance(node_from, node_to) =
-			    slit_table->entry[i * slit_table->locality_count + j];
-		}
-	}
-
-#ifdef SLIT_DEBUG
-	printk("ACPI 2.0 SLIT locality table:\n");
-	for_each_online_node(i) {
-		for_each_online_node(j)
-		    printk("%03d ", node_distance(i, j));
-		printk("\n");
-	}
-#endif
-out:
-	node_possible_map = node_online_map;
-}
-#endif				/* CONFIG_ACPI_NUMA */
-
-/*
- * success: return IRQ number (>=0)
- * failure: return < 0
- */
-int acpi_register_gsi(struct device *dev, u32 gsi, int triggering, int polarity)
-{
-	if (acpi_irq_model == ACPI_IRQ_MODEL_PLATFORM)
-		return gsi;
-
-	if (has_8259 && gsi < 16)
-		return isa_irq_to_vector(gsi);
-
-	return iosapic_register_intr(gsi,
-				     (polarity ==
-				      ACPI_ACTIVE_HIGH) ? IOSAPIC_POL_HIGH :
-				     IOSAPIC_POL_LOW,
-				     (triggering ==
-				      ACPI_EDGE_SENSITIVE) ? IOSAPIC_EDGE :
-				     IOSAPIC_LEVEL);
-}
-EXPORT_SYMBOL_GPL(acpi_register_gsi);
-
-void acpi_unregister_gsi(u32 gsi)
-{
-	if (acpi_irq_model == ACPI_IRQ_MODEL_PLATFORM)
-		return;
-
-	if (has_8259 && gsi < 16)
-		return;
-
-	iosapic_unregister_intr(gsi);
-}
-EXPORT_SYMBOL_GPL(acpi_unregister_gsi);
-
-static int __init acpi_parse_fadt(struct acpi_table_header *table)
-{
-	struct acpi_table_header *fadt_header;
-	struct acpi_table_fadt *fadt;
-
-	fadt_header = (struct acpi_table_header *)table;
-	if (fadt_header->revision != 3)
-		return -ENODEV;	/* Only deal with ACPI 2.0 FADT */
-
-	fadt = (struct acpi_table_fadt *)fadt_header;
-
-	acpi_register_gsi(NULL, fadt->sci_interrupt, ACPI_LEVEL_SENSITIVE,
-				 ACPI_ACTIVE_LOW);
-	return 0;
-}
-
-int __init early_acpi_boot_init(void)
-{
-	int ret;
-
-	/*
-	 * do a partial walk of MADT to determine how many CPUs
-	 * we have including offline CPUs
-	 */
-	if (acpi_table_parse(ACPI_SIG_MADT, acpi_parse_madt)) {
-		printk(KERN_ERR PREFIX "Can't find MADT\n");
-		return 0;
-	}
-
-	ret = acpi_table_parse_madt(ACPI_MADT_TYPE_LOCAL_SAPIC,
-		acpi_parse_lsapic, NR_CPUS);
-	if (ret < 1)
-		printk(KERN_ERR PREFIX
-		       "Error parsing MADT - no LAPIC entries\n");
-	else
-		acpi_lapic = 1;
-
-#ifdef CONFIG_SMP
-	if (available_cpus == 0) {
-		printk(KERN_INFO "ACPI: Found 0 CPUS; assuming 1\n");
-		printk(KERN_INFO "CPU 0 (0x%04x)", hard_smp_processor_id());
-		smp_boot_data.cpu_phys_id[available_cpus] =
-		    hard_smp_processor_id();
-		available_cpus = 1;	/* We've got at least one of these, no? */
-	}
-	smp_boot_data.cpu_count = available_cpus;
-#endif
-	/* Make boot-up look pretty */
-	printk(KERN_INFO "%d CPUs available, %d CPUs total\n", available_cpus,
-	       total_cpus);
-
-	return 0;
-}
-
-int __init acpi_boot_init(void)
-{
-
-	/*
-	 * MADT
-	 * ----
-	 * Parse the Multiple APIC Description Table (MADT), if exists.
-	 * Note that this table provides platform SMP configuration
-	 * information -- the successor to MPS tables.
-	 */
-
-	if (acpi_table_parse(ACPI_SIG_MADT, acpi_parse_madt)) {
-		printk(KERN_ERR PREFIX "Can't find MADT\n");
-		goto skip_madt;
-	}
-
-	/* Local APIC */
-
-	if (acpi_table_parse_madt
-	    (ACPI_MADT_TYPE_LOCAL_APIC_OVERRIDE, acpi_parse_lapic_addr_ovr, 0) < 0)
-		printk(KERN_ERR PREFIX
-		       "Error parsing LAPIC address override entry\n");
-
-	if (acpi_table_parse_madt(ACPI_MADT_TYPE_LOCAL_APIC_NMI, acpi_parse_lapic_nmi, 0)
-	    < 0)
-		printk(KERN_ERR PREFIX "Error parsing LAPIC NMI entry\n");
-
-	/* I/O APIC */
-
-	if (acpi_table_parse_madt
-	    (ACPI_MADT_TYPE_IO_SAPIC, acpi_parse_iosapic, NR_IOSAPICS) < 1) {
-		printk(KERN_ERR PREFIX
-		       "Error parsing MADT - no IOSAPIC entries\n");
-	}
-
-	/* System-Level Interrupt Routing */
-
-	if (acpi_table_parse_madt
-	    (ACPI_MADT_TYPE_INTERRUPT_SOURCE, acpi_parse_plat_int_src,
-	     ACPI_MAX_PLATFORM_INTERRUPTS) < 0)
-		printk(KERN_ERR PREFIX
-		       "Error parsing platform interrupt source entry\n");
-
-	if (acpi_table_parse_madt
-	    (ACPI_MADT_TYPE_INTERRUPT_OVERRIDE, acpi_parse_int_src_ovr, 0) < 0)
-		printk(KERN_ERR PREFIX
-		       "Error parsing interrupt source overrides entry\n");
-
-	if (acpi_table_parse_madt(ACPI_MADT_TYPE_NMI_SOURCE, acpi_parse_nmi_src, 0) < 0)
-		printk(KERN_ERR PREFIX "Error parsing NMI SRC entry\n");
-      skip_madt:
-
-	/*
-	 * FADT says whether a legacy keyboard controller is present.
-	 * The FADT also contains an SCI_INT line, by which the system
-	 * gets interrupts such as power and sleep buttons.  If it's not
-	 * on a Legacy interrupt, it needs to be setup.
-	 */
-	if (acpi_table_parse(ACPI_SIG_FADT, acpi_parse_fadt))
-		printk(KERN_ERR PREFIX "Can't find FADT\n");
-
-#ifdef CONFIG_ACPI_NUMA
-#ifdef CONFIG_SMP
-	if (srat_num_cpus == 0) {
-		int cpu, i = 1;
-		for (cpu = 0; cpu < smp_boot_data.cpu_count; cpu++)
-			if (smp_boot_data.cpu_phys_id[cpu] !=
-			    hard_smp_processor_id())
-				node_cpuid[i++].phys_id =
-				    smp_boot_data.cpu_phys_id[cpu];
-	}
-#endif
-	build_cpu_to_node_map();
-#endif
-	return 0;
-}
-
-int acpi_gsi_to_irq(u32 gsi, unsigned int *irq)
-{
-	int tmp;
-
-	if (has_8259 && gsi < 16)
-		*irq = isa_irq_to_vector(gsi);
-	else {
-		tmp = gsi_to_irq(gsi);
-		if (tmp == -1)
-			return -1;
-		*irq = tmp;
-	}
-	return 0;
-}
-
-int acpi_isa_irq_to_gsi(unsigned isa_irq, u32 *gsi)
-{
-	if (isa_irq >= 16)
-		return -1;
-	*gsi = isa_irq;
-	return 0;
-}
-
-/*
- *  ACPI based hotplug CPU support
- */
-#ifdef CONFIG_ACPI_HOTPLUG_CPU
-int acpi_map_cpu2node(acpi_handle handle, int cpu, int physid)
-{
-#ifdef CONFIG_ACPI_NUMA
-	/*
-	 * We don't have cpu-only-node hotadd. But if the system equips
-	 * SRAT table, pxm is already found and node is ready.
-  	 * So, just pxm_to_nid(pxm) is OK.
-	 * This code here is for the system which doesn't have full SRAT
-  	 * table for possible cpus.
-	 */
-	node_cpuid[cpu].phys_id = physid;
-	node_cpuid[cpu].nid = acpi_get_node(handle);
-#endif
-	return 0;
-}
-
-int additional_cpus __initdata = -1;
-
-static __init int setup_additional_cpus(char *s)
-{
-	if (s)
-		additional_cpus = simple_strtol(s, NULL, 0);
-
-	return 0;
-}
-
-early_param("additional_cpus", setup_additional_cpus);
-
-/*
- * cpu_possible_mask should be static, it cannot change as CPUs
- * are onlined, or offlined. The reason is per-cpu data-structures
- * are allocated by some modules at init time, and dont expect to
- * do this dynamically on cpu arrival/departure.
- * cpu_present_mask on the other hand can change dynamically.
- * In case when cpu_hotplug is not compiled, then we resort to current
- * behaviour, which is cpu_possible == cpu_present.
- * - Ashok Raj
- *
- * Three ways to find out the number of additional hotplug CPUs:
- * - If the BIOS specified disabled CPUs in ACPI/mptables use that.
- * - The user can overwrite it with additional_cpus=NUM
- * - Otherwise don't reserve additional CPUs.
- */
-__init void prefill_possible_map(void)
-{
-	int i;
-	int possible, disabled_cpus;
-
-	disabled_cpus = total_cpus - available_cpus;
-
- 	if (additional_cpus == -1) {
- 		if (disabled_cpus > 0)
-			additional_cpus = disabled_cpus;
- 		else
-			additional_cpus = 0;
- 	}
-
-	possible = available_cpus + additional_cpus;
-
-	if (possible > nr_cpu_ids)
-		possible = nr_cpu_ids;
-
-	printk(KERN_INFO "SMP: Allowing %d CPUs, %d hotplug CPUs\n",
-		possible, max((possible - available_cpus), 0));
-
-	for (i = 0; i < possible; i++)
-		set_cpu_possible(i, true);
-}
-
-static int _acpi_map_lsapic(acpi_handle handle, int physid, int *pcpu)
-{
-	int cpu;
-
-	cpu = cpumask_first_zero(cpu_present_mask);
-	if (cpu >= nr_cpu_ids)
-		return -EINVAL;
-
-	acpi_map_cpu2node(handle, cpu, physid);
-
-	set_cpu_present(cpu, true);
-	ia64_cpu_to_sapicid[cpu] = physid;
-
-	acpi_processor_set_pdc(handle);
-
-	*pcpu = cpu;
-	return (0);
-}
-
-/* wrapper to silence section mismatch warning */
-int __ref acpi_map_cpu(acpi_handle handle, phys_cpuid_t physid, u32 acpi_id,
-		       int *pcpu)
-{
-	return _acpi_map_lsapic(handle, physid, pcpu);
-}
-EXPORT_SYMBOL(acpi_map_cpu);
-
-int acpi_unmap_cpu(int cpu)
-{
-	ia64_cpu_to_sapicid[cpu] = -1;
-	set_cpu_present(cpu, false);
-
-#ifdef CONFIG_ACPI_NUMA
-	/* NUMA specific cleanup's */
-#endif
-
-	return (0);
-}
-EXPORT_SYMBOL(acpi_unmap_cpu);
-#endif				/* CONFIG_ACPI_HOTPLUG_CPU */
-
-#ifdef CONFIG_ACPI_NUMA
-static acpi_status acpi_map_iosapic(acpi_handle handle, u32 depth,
-				    void *context, void **ret)
-{
-	struct acpi_buffer buffer = { ACPI_ALLOCATE_BUFFER, NULL };
-	union acpi_object *obj;
-	struct acpi_madt_io_sapic *iosapic;
-	unsigned int gsi_base;
-	int node;
-
-	/* Only care about objects w/ a method that returns the MADT */
-	if (ACPI_FAILURE(acpi_evaluate_object(handle, "_MAT", NULL, &buffer)))
-		return AE_OK;
-
-	if (!buffer.length || !buffer.pointer)
-		return AE_OK;
-
-	obj = buffer.pointer;
-	if (obj->type != ACPI_TYPE_BUFFER ||
-	    obj->buffer.length < sizeof(*iosapic)) {
-		kfree(buffer.pointer);
-		return AE_OK;
-	}
-
-	iosapic = (struct acpi_madt_io_sapic *)obj->buffer.pointer;
-
-	if (iosapic->header.type != ACPI_MADT_TYPE_IO_SAPIC) {
-		kfree(buffer.pointer);
-		return AE_OK;
-	}
-
-	gsi_base = iosapic->global_irq_base;
-
-	kfree(buffer.pointer);
-
-	/* OK, it's an IOSAPIC MADT entry; associate it with a node */
-	node = acpi_get_node(handle);
-	if (node == NUMA_NO_NODE || !node_online(node) ||
-	    cpumask_empty(cpumask_of_node(node)))
-		return AE_OK;
-
-	/* We know a gsi to node mapping! */
-	map_iosapic_to_node(gsi_base, node);
-	return AE_OK;
-}
-
-static int __init
-acpi_map_iosapics (void)
-{
-	acpi_get_devices(NULL, acpi_map_iosapic, NULL, NULL);
-	return 0;
-}
-
-fs_initcall(acpi_map_iosapics);
-#endif				/* CONFIG_ACPI_NUMA */
-
-int __ref acpi_register_ioapic(acpi_handle handle, u64 phys_addr, u32 gsi_base)
-{
-	int err;
-
-	if ((err = iosapic_init(phys_addr, gsi_base)))
-		return err;
-
-#ifdef CONFIG_ACPI_NUMA
-	acpi_map_iosapic(handle, 0, NULL, NULL);
-#endif				/* CONFIG_ACPI_NUMA */
-
-	return 0;
-}
-
-EXPORT_SYMBOL(acpi_register_ioapic);
-
-int acpi_unregister_ioapic(acpi_handle handle, u32 gsi_base)
-{
-	return iosapic_remove(gsi_base);
-}
-
-EXPORT_SYMBOL(acpi_unregister_ioapic);
-
-/*
- * acpi_suspend_lowlevel() - save kernel state and suspend.
- *
- * TBD when IA64 starts to support suspend...
- */
-int acpi_suspend_lowlevel(void) { return 0; }
-
-void acpi_proc_quirk_mwait_check(void)
-{
-}
diff --git a/arch/ia64/kernel/asm-offsets.c b/arch/ia64/kernel/asm-offsets.c
deleted file mode 100644
index be3b90fef2e9..000000000000
--- a/arch/ia64/kernel/asm-offsets.c
+++ /dev/null
@@ -1,289 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * Generate definitions needed by assembly language modules.
- * This code generates raw asm output which is post-processed
- * to extract and format the required data.
- */
-
-#define ASM_OFFSETS_C 1
-
-#include <linux/sched/signal.h>
-#include <linux/pid.h>
-#include <linux/clocksource.h>
-#include <linux/kbuild.h>
-#include <asm/processor.h>
-#include <asm/ptrace.h>
-#include <asm/siginfo.h>
-#include <asm/sigcontext.h>
-#include <asm/mca.h>
-
-#include "../kernel/sigframe.h"
-#include "../kernel/fsyscall_gtod_data.h"
-
-void foo(void)
-{
-	DEFINE(IA64_TASK_SIZE, sizeof (struct task_struct));
-	DEFINE(IA64_THREAD_INFO_SIZE, sizeof (struct thread_info));
-	DEFINE(IA64_PT_REGS_SIZE, sizeof (struct pt_regs));
-	DEFINE(IA64_SWITCH_STACK_SIZE, sizeof (struct switch_stack));
-	DEFINE(IA64_SIGINFO_SIZE, sizeof (struct siginfo));
-	DEFINE(IA64_CPU_SIZE, sizeof (struct cpuinfo_ia64));
-	DEFINE(SIGFRAME_SIZE, sizeof (struct sigframe));
-	DEFINE(UNW_FRAME_INFO_SIZE, sizeof (struct unw_frame_info));
-
-	BUILD_BUG_ON(sizeof(struct upid) != 16);
-	DEFINE(IA64_UPID_SHIFT, 4);
-
-	BLANK();
-
-	DEFINE(TI_FLAGS, offsetof(struct thread_info, flags));
-	DEFINE(TI_CPU, offsetof(struct thread_info, cpu));
-	DEFINE(TI_PRE_COUNT, offsetof(struct thread_info, preempt_count));
-#ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
-	DEFINE(TI_AC_STAMP, offsetof(struct thread_info, ac_stamp));
-	DEFINE(TI_AC_LEAVE, offsetof(struct thread_info, ac_leave));
-	DEFINE(TI_AC_STIME, offsetof(struct thread_info, ac_stime));
-	DEFINE(TI_AC_UTIME, offsetof(struct thread_info, ac_utime));
-#endif
-
-	BLANK();
-
-	DEFINE(IA64_TASK_BLOCKED_OFFSET,offsetof (struct task_struct, blocked));
-	DEFINE(IA64_TASK_CLEAR_CHILD_TID_OFFSET,offsetof (struct task_struct, clear_child_tid));
-	DEFINE(IA64_TASK_THREAD_PID_OFFSET,offsetof (struct task_struct, thread_pid));
-	DEFINE(IA64_PID_LEVEL_OFFSET, offsetof (struct pid, level));
-	DEFINE(IA64_PID_UPID_OFFSET, offsetof (struct pid, numbers[0]));
-	DEFINE(IA64_TASK_PENDING_OFFSET,offsetof (struct task_struct, pending));
-	DEFINE(IA64_TASK_PID_OFFSET, offsetof (struct task_struct, pid));
-	DEFINE(IA64_TASK_REAL_PARENT_OFFSET, offsetof (struct task_struct, real_parent));
-	DEFINE(IA64_TASK_SIGNAL_OFFSET,offsetof (struct task_struct, signal));
-	DEFINE(IA64_TASK_TGID_OFFSET, offsetof (struct task_struct, tgid));
-	DEFINE(IA64_TASK_THREAD_KSP_OFFSET, offsetof (struct task_struct, thread.ksp));
-	DEFINE(IA64_TASK_THREAD_ON_USTACK_OFFSET, offsetof (struct task_struct, thread.on_ustack));
-
-	BLANK();
-
-
-	DEFINE(IA64_SIGNAL_GROUP_STOP_COUNT_OFFSET,offsetof (struct signal_struct,
-							     group_stop_count));
-	DEFINE(IA64_SIGNAL_SHARED_PENDING_OFFSET,offsetof (struct signal_struct, shared_pending));
-	DEFINE(IA64_SIGNAL_PIDS_TGID_OFFSET, offsetof (struct signal_struct, pids[PIDTYPE_TGID]));
-
-	BLANK();
-
-	DEFINE(IA64_PT_REGS_B6_OFFSET, offsetof (struct pt_regs, b6));
-	DEFINE(IA64_PT_REGS_B7_OFFSET, offsetof (struct pt_regs, b7));
-	DEFINE(IA64_PT_REGS_AR_CSD_OFFSET, offsetof (struct pt_regs, ar_csd));
-	DEFINE(IA64_PT_REGS_AR_SSD_OFFSET, offsetof (struct pt_regs, ar_ssd));
-	DEFINE(IA64_PT_REGS_R8_OFFSET, offsetof (struct pt_regs, r8));
-	DEFINE(IA64_PT_REGS_R9_OFFSET, offsetof (struct pt_regs, r9));
-	DEFINE(IA64_PT_REGS_R10_OFFSET, offsetof (struct pt_regs, r10));
-	DEFINE(IA64_PT_REGS_R11_OFFSET, offsetof (struct pt_regs, r11));
-	DEFINE(IA64_PT_REGS_CR_IPSR_OFFSET, offsetof (struct pt_regs, cr_ipsr));
-	DEFINE(IA64_PT_REGS_CR_IIP_OFFSET, offsetof (struct pt_regs, cr_iip));
-	DEFINE(IA64_PT_REGS_CR_IFS_OFFSET, offsetof (struct pt_regs, cr_ifs));
-	DEFINE(IA64_PT_REGS_AR_UNAT_OFFSET, offsetof (struct pt_regs, ar_unat));
-	DEFINE(IA64_PT_REGS_AR_PFS_OFFSET, offsetof (struct pt_regs, ar_pfs));
-	DEFINE(IA64_PT_REGS_AR_RSC_OFFSET, offsetof (struct pt_regs, ar_rsc));
-	DEFINE(IA64_PT_REGS_AR_RNAT_OFFSET, offsetof (struct pt_regs, ar_rnat));
-
-	DEFINE(IA64_PT_REGS_AR_BSPSTORE_OFFSET, offsetof (struct pt_regs, ar_bspstore));
-	DEFINE(IA64_PT_REGS_PR_OFFSET, offsetof (struct pt_regs, pr));
-	DEFINE(IA64_PT_REGS_B0_OFFSET, offsetof (struct pt_regs, b0));
-	DEFINE(IA64_PT_REGS_LOADRS_OFFSET, offsetof (struct pt_regs, loadrs));
-	DEFINE(IA64_PT_REGS_R1_OFFSET, offsetof (struct pt_regs, r1));
-	DEFINE(IA64_PT_REGS_R12_OFFSET, offsetof (struct pt_regs, r12));
-	DEFINE(IA64_PT_REGS_R13_OFFSET, offsetof (struct pt_regs, r13));
-	DEFINE(IA64_PT_REGS_AR_FPSR_OFFSET, offsetof (struct pt_regs, ar_fpsr));
-	DEFINE(IA64_PT_REGS_R15_OFFSET, offsetof (struct pt_regs, r15));
-	DEFINE(IA64_PT_REGS_R14_OFFSET, offsetof (struct pt_regs, r14));
-	DEFINE(IA64_PT_REGS_R2_OFFSET, offsetof (struct pt_regs, r2));
-	DEFINE(IA64_PT_REGS_R3_OFFSET, offsetof (struct pt_regs, r3));
-	DEFINE(IA64_PT_REGS_R16_OFFSET, offsetof (struct pt_regs, r16));
-	DEFINE(IA64_PT_REGS_R17_OFFSET, offsetof (struct pt_regs, r17));
-	DEFINE(IA64_PT_REGS_R18_OFFSET, offsetof (struct pt_regs, r18));
-	DEFINE(IA64_PT_REGS_R19_OFFSET, offsetof (struct pt_regs, r19));
-	DEFINE(IA64_PT_REGS_R20_OFFSET, offsetof (struct pt_regs, r20));
-	DEFINE(IA64_PT_REGS_R21_OFFSET, offsetof (struct pt_regs, r21));
-	DEFINE(IA64_PT_REGS_R22_OFFSET, offsetof (struct pt_regs, r22));
-	DEFINE(IA64_PT_REGS_R23_OFFSET, offsetof (struct pt_regs, r23));
-	DEFINE(IA64_PT_REGS_R24_OFFSET, offsetof (struct pt_regs, r24));
-	DEFINE(IA64_PT_REGS_R25_OFFSET, offsetof (struct pt_regs, r25));
-	DEFINE(IA64_PT_REGS_R26_OFFSET, offsetof (struct pt_regs, r26));
-	DEFINE(IA64_PT_REGS_R27_OFFSET, offsetof (struct pt_regs, r27));
-	DEFINE(IA64_PT_REGS_R28_OFFSET, offsetof (struct pt_regs, r28));
-	DEFINE(IA64_PT_REGS_R29_OFFSET, offsetof (struct pt_regs, r29));
-	DEFINE(IA64_PT_REGS_R30_OFFSET, offsetof (struct pt_regs, r30));
-	DEFINE(IA64_PT_REGS_R31_OFFSET, offsetof (struct pt_regs, r31));
-	DEFINE(IA64_PT_REGS_AR_CCV_OFFSET, offsetof (struct pt_regs, ar_ccv));
-	DEFINE(IA64_PT_REGS_F6_OFFSET, offsetof (struct pt_regs, f6));
-	DEFINE(IA64_PT_REGS_F7_OFFSET, offsetof (struct pt_regs, f7));
-	DEFINE(IA64_PT_REGS_F8_OFFSET, offsetof (struct pt_regs, f8));
-	DEFINE(IA64_PT_REGS_F9_OFFSET, offsetof (struct pt_regs, f9));
-	DEFINE(IA64_PT_REGS_F10_OFFSET, offsetof (struct pt_regs, f10));
-	DEFINE(IA64_PT_REGS_F11_OFFSET, offsetof (struct pt_regs, f11));
-
-	BLANK();
-
-	DEFINE(IA64_SWITCH_STACK_CALLER_UNAT_OFFSET, offsetof (struct switch_stack, caller_unat));
-	DEFINE(IA64_SWITCH_STACK_AR_FPSR_OFFSET, offsetof (struct switch_stack, ar_fpsr));
-	DEFINE(IA64_SWITCH_STACK_F2_OFFSET, offsetof (struct switch_stack, f2));
-	DEFINE(IA64_SWITCH_STACK_F3_OFFSET, offsetof (struct switch_stack, f3));
-	DEFINE(IA64_SWITCH_STACK_F4_OFFSET, offsetof (struct switch_stack, f4));
-	DEFINE(IA64_SWITCH_STACK_F5_OFFSET, offsetof (struct switch_stack, f5));
-	DEFINE(IA64_SWITCH_STACK_F12_OFFSET, offsetof (struct switch_stack, f12));
-	DEFINE(IA64_SWITCH_STACK_F13_OFFSET, offsetof (struct switch_stack, f13));
-	DEFINE(IA64_SWITCH_STACK_F14_OFFSET, offsetof (struct switch_stack, f14));
-	DEFINE(IA64_SWITCH_STACK_F15_OFFSET, offsetof (struct switch_stack, f15));
-	DEFINE(IA64_SWITCH_STACK_F16_OFFSET, offsetof (struct switch_stack, f16));
-	DEFINE(IA64_SWITCH_STACK_F17_OFFSET, offsetof (struct switch_stack, f17));
-	DEFINE(IA64_SWITCH_STACK_F18_OFFSET, offsetof (struct switch_stack, f18));
-	DEFINE(IA64_SWITCH_STACK_F19_OFFSET, offsetof (struct switch_stack, f19));
-	DEFINE(IA64_SWITCH_STACK_F20_OFFSET, offsetof (struct switch_stack, f20));
-	DEFINE(IA64_SWITCH_STACK_F21_OFFSET, offsetof (struct switch_stack, f21));
-	DEFINE(IA64_SWITCH_STACK_F22_OFFSET, offsetof (struct switch_stack, f22));
-	DEFINE(IA64_SWITCH_STACK_F23_OFFSET, offsetof (struct switch_stack, f23));
-	DEFINE(IA64_SWITCH_STACK_F24_OFFSET, offsetof (struct switch_stack, f24));
-	DEFINE(IA64_SWITCH_STACK_F25_OFFSET, offsetof (struct switch_stack, f25));
-	DEFINE(IA64_SWITCH_STACK_F26_OFFSET, offsetof (struct switch_stack, f26));
-	DEFINE(IA64_SWITCH_STACK_F27_OFFSET, offsetof (struct switch_stack, f27));
-	DEFINE(IA64_SWITCH_STACK_F28_OFFSET, offsetof (struct switch_stack, f28));
-	DEFINE(IA64_SWITCH_STACK_F29_OFFSET, offsetof (struct switch_stack, f29));
-	DEFINE(IA64_SWITCH_STACK_F30_OFFSET, offsetof (struct switch_stack, f30));
-	DEFINE(IA64_SWITCH_STACK_F31_OFFSET, offsetof (struct switch_stack, f31));
-	DEFINE(IA64_SWITCH_STACK_R4_OFFSET, offsetof (struct switch_stack, r4));
-	DEFINE(IA64_SWITCH_STACK_R5_OFFSET, offsetof (struct switch_stack, r5));
-	DEFINE(IA64_SWITCH_STACK_R6_OFFSET, offsetof (struct switch_stack, r6));
-	DEFINE(IA64_SWITCH_STACK_R7_OFFSET, offsetof (struct switch_stack, r7));
-	DEFINE(IA64_SWITCH_STACK_B0_OFFSET, offsetof (struct switch_stack, b0));
-	DEFINE(IA64_SWITCH_STACK_B1_OFFSET, offsetof (struct switch_stack, b1));
-	DEFINE(IA64_SWITCH_STACK_B2_OFFSET, offsetof (struct switch_stack, b2));
-	DEFINE(IA64_SWITCH_STACK_B3_OFFSET, offsetof (struct switch_stack, b3));
-	DEFINE(IA64_SWITCH_STACK_B4_OFFSET, offsetof (struct switch_stack, b4));
-	DEFINE(IA64_SWITCH_STACK_B5_OFFSET, offsetof (struct switch_stack, b5));
-	DEFINE(IA64_SWITCH_STACK_AR_PFS_OFFSET, offsetof (struct switch_stack, ar_pfs));
-	DEFINE(IA64_SWITCH_STACK_AR_LC_OFFSET, offsetof (struct switch_stack, ar_lc));
-	DEFINE(IA64_SWITCH_STACK_AR_UNAT_OFFSET, offsetof (struct switch_stack, ar_unat));
-	DEFINE(IA64_SWITCH_STACK_AR_RNAT_OFFSET, offsetof (struct switch_stack, ar_rnat));
-	DEFINE(IA64_SWITCH_STACK_AR_BSPSTORE_OFFSET, offsetof (struct switch_stack, ar_bspstore));
-	DEFINE(IA64_SWITCH_STACK_PR_OFFSET, offsetof (struct switch_stack, pr));
-
-	BLANK();
-
-	DEFINE(IA64_SIGCONTEXT_IP_OFFSET, offsetof (struct sigcontext, sc_ip));
-	DEFINE(IA64_SIGCONTEXT_AR_BSP_OFFSET, offsetof (struct sigcontext, sc_ar_bsp));
-	DEFINE(IA64_SIGCONTEXT_AR_FPSR_OFFSET, offsetof (struct sigcontext, sc_ar_fpsr));
-	DEFINE(IA64_SIGCONTEXT_AR_RNAT_OFFSET, offsetof (struct sigcontext, sc_ar_rnat));
-	DEFINE(IA64_SIGCONTEXT_AR_UNAT_OFFSET, offsetof (struct sigcontext, sc_ar_unat));
-	DEFINE(IA64_SIGCONTEXT_B0_OFFSET, offsetof (struct sigcontext, sc_br[0]));
-	DEFINE(IA64_SIGCONTEXT_CFM_OFFSET, offsetof (struct sigcontext, sc_cfm));
-	DEFINE(IA64_SIGCONTEXT_FLAGS_OFFSET, offsetof (struct sigcontext, sc_flags));
-	DEFINE(IA64_SIGCONTEXT_FR6_OFFSET, offsetof (struct sigcontext, sc_fr[6]));
-	DEFINE(IA64_SIGCONTEXT_PR_OFFSET, offsetof (struct sigcontext, sc_pr));
-	DEFINE(IA64_SIGCONTEXT_R12_OFFSET, offsetof (struct sigcontext, sc_gr[12]));
-	DEFINE(IA64_SIGCONTEXT_RBS_BASE_OFFSET,offsetof (struct sigcontext, sc_rbs_base));
-	DEFINE(IA64_SIGCONTEXT_LOADRS_OFFSET, offsetof (struct sigcontext, sc_loadrs));
-
-	BLANK();
-
-	DEFINE(IA64_SIGPENDING_SIGNAL_OFFSET, offsetof (struct sigpending, signal));
-
-	BLANK();
-
-	DEFINE(IA64_SIGFRAME_ARG0_OFFSET, offsetof (struct sigframe, arg0));
-	DEFINE(IA64_SIGFRAME_ARG1_OFFSET, offsetof (struct sigframe, arg1));
-	DEFINE(IA64_SIGFRAME_ARG2_OFFSET, offsetof (struct sigframe, arg2));
-	DEFINE(IA64_SIGFRAME_HANDLER_OFFSET, offsetof (struct sigframe, handler));
-	DEFINE(IA64_SIGFRAME_SIGCONTEXT_OFFSET, offsetof (struct sigframe, sc));
-	BLANK();
-    /* for assembly files which can't include sched.h: */
-	DEFINE(IA64_CLONE_VFORK, CLONE_VFORK);
-	DEFINE(IA64_CLONE_VM, CLONE_VM);
-
-	BLANK();
-	DEFINE(IA64_CPUINFO_NSEC_PER_CYC_OFFSET,
-	       offsetof (struct cpuinfo_ia64, nsec_per_cyc));
-	DEFINE(IA64_CPUINFO_PTCE_BASE_OFFSET,
-	       offsetof (struct cpuinfo_ia64, ptce_base));
-	DEFINE(IA64_CPUINFO_PTCE_COUNT_OFFSET,
-	       offsetof (struct cpuinfo_ia64, ptce_count));
-	DEFINE(IA64_CPUINFO_PTCE_STRIDE_OFFSET,
-	       offsetof (struct cpuinfo_ia64, ptce_stride));
-	BLANK();
-	DEFINE(IA64_TIMESPEC_TV_NSEC_OFFSET,
-	       offsetof (struct __kernel_old_timespec, tv_nsec));
-	DEFINE(IA64_TIME_SN_SPEC_SNSEC_OFFSET,
-	       offsetof (struct time_sn_spec, snsec));
-
-	DEFINE(CLONE_SETTLS_BIT, 19);
-#if CLONE_SETTLS != (1<<19)
-# error "CLONE_SETTLS_BIT incorrect, please fix"
-#endif
-
-	BLANK();
-	DEFINE(IA64_MCA_CPU_MCA_STACK_OFFSET,
-	       offsetof (struct ia64_mca_cpu, mca_stack));
-	DEFINE(IA64_MCA_CPU_INIT_STACK_OFFSET,
-	       offsetof (struct ia64_mca_cpu, init_stack));
-	BLANK();
-	DEFINE(IA64_SAL_OS_STATE_OS_GP_OFFSET,
-	       offsetof (struct ia64_sal_os_state, os_gp));
-	DEFINE(IA64_SAL_OS_STATE_PROC_STATE_PARAM_OFFSET,
-	       offsetof (struct ia64_sal_os_state, proc_state_param));
-	DEFINE(IA64_SAL_OS_STATE_SAL_RA_OFFSET,
-	       offsetof (struct ia64_sal_os_state, sal_ra));
-	DEFINE(IA64_SAL_OS_STATE_SAL_GP_OFFSET,
-	       offsetof (struct ia64_sal_os_state, sal_gp));
-	DEFINE(IA64_SAL_OS_STATE_PAL_MIN_STATE_OFFSET,
-	       offsetof (struct ia64_sal_os_state, pal_min_state));
-	DEFINE(IA64_SAL_OS_STATE_OS_STATUS_OFFSET,
-	       offsetof (struct ia64_sal_os_state, os_status));
-	DEFINE(IA64_SAL_OS_STATE_CONTEXT_OFFSET,
-	       offsetof (struct ia64_sal_os_state, context));
-	DEFINE(IA64_SAL_OS_STATE_SIZE,
-	       sizeof (struct ia64_sal_os_state));
-	BLANK();
-
-	DEFINE(IA64_PMSA_GR_OFFSET,
-	       offsetof(struct pal_min_state_area, pmsa_gr));
-	DEFINE(IA64_PMSA_BANK1_GR_OFFSET,
-	       offsetof(struct pal_min_state_area, pmsa_bank1_gr));
-	DEFINE(IA64_PMSA_PR_OFFSET,
-	       offsetof(struct pal_min_state_area, pmsa_pr));
-	DEFINE(IA64_PMSA_BR0_OFFSET,
-	       offsetof(struct pal_min_state_area, pmsa_br0));
-	DEFINE(IA64_PMSA_RSC_OFFSET,
-	       offsetof(struct pal_min_state_area, pmsa_rsc));
-	DEFINE(IA64_PMSA_IIP_OFFSET,
-	       offsetof(struct pal_min_state_area, pmsa_iip));
-	DEFINE(IA64_PMSA_IPSR_OFFSET,
-	       offsetof(struct pal_min_state_area, pmsa_ipsr));
-	DEFINE(IA64_PMSA_IFS_OFFSET,
-	       offsetof(struct pal_min_state_area, pmsa_ifs));
-	DEFINE(IA64_PMSA_XIP_OFFSET,
-	       offsetof(struct pal_min_state_area, pmsa_xip));
-	BLANK();
-
-	/* used by fsys_gettimeofday in arch/ia64/kernel/fsys.S */
-	DEFINE(IA64_GTOD_SEQ_OFFSET,
-	       offsetof (struct fsyscall_gtod_data_t, seq));
-	DEFINE(IA64_GTOD_WALL_TIME_OFFSET,
-		offsetof (struct fsyscall_gtod_data_t, wall_time));
-	DEFINE(IA64_GTOD_MONO_TIME_OFFSET,
-		offsetof (struct fsyscall_gtod_data_t, monotonic_time));
-	DEFINE(IA64_CLKSRC_MASK_OFFSET,
-		offsetof (struct fsyscall_gtod_data_t, clk_mask));
-	DEFINE(IA64_CLKSRC_MULT_OFFSET,
-		offsetof (struct fsyscall_gtod_data_t, clk_mult));
-	DEFINE(IA64_CLKSRC_SHIFT_OFFSET,
-		offsetof (struct fsyscall_gtod_data_t, clk_shift));
-	DEFINE(IA64_CLKSRC_MMIO_OFFSET,
-		offsetof (struct fsyscall_gtod_data_t, clk_fsys_mmio));
-	DEFINE(IA64_CLKSRC_CYCLE_LAST_OFFSET,
-		offsetof (struct fsyscall_gtod_data_t, clk_cycle_last));
-	DEFINE(IA64_ITC_JITTER_OFFSET,
-		offsetof (struct itc_jitter_data_t, itc_jitter));
-	DEFINE(IA64_ITC_LASTCYCLE_OFFSET,
-		offsetof (struct itc_jitter_data_t, itc_lastcycle));
-
-}
diff --git a/arch/ia64/kernel/audit.c b/arch/ia64/kernel/audit.c
deleted file mode 100644
index ec61f20ca61f..000000000000
--- a/arch/ia64/kernel/audit.c
+++ /dev/null
@@ -1,63 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-#include <linux/init.h>
-#include <linux/types.h>
-#include <linux/audit.h>
-#include <asm/unistd.h>
-
-static unsigned dir_class[] = {
-#include <asm-generic/audit_dir_write.h>
-~0U
-};
-
-static unsigned read_class[] = {
-#include <asm-generic/audit_read.h>
-~0U
-};
-
-static unsigned write_class[] = {
-#include <asm-generic/audit_write.h>
-~0U
-};
-
-static unsigned chattr_class[] = {
-#include <asm-generic/audit_change_attr.h>
-~0U
-};
-
-static unsigned signal_class[] = {
-#include <asm-generic/audit_signal.h>
-~0U
-};
-
-int audit_classify_arch(int arch)
-{
-	return 0;
-}
-
-int audit_classify_syscall(int abi, unsigned syscall)
-{
-	switch(syscall) {
-	case __NR_open:
-		return AUDITSC_OPEN;
-	case __NR_openat:
-		return AUDITSC_OPENAT;
-	case __NR_execve:
-		return AUDITSC_EXECVE;
-	case __NR_openat2:
-		return AUDITSC_OPENAT2;
-	default:
-		return AUDITSC_NATIVE;
-	}
-}
-
-static int __init audit_classes_init(void)
-{
-	audit_register_class(AUDIT_CLASS_WRITE, write_class);
-	audit_register_class(AUDIT_CLASS_READ, read_class);
-	audit_register_class(AUDIT_CLASS_DIR_WRITE, dir_class);
-	audit_register_class(AUDIT_CLASS_CHATTR, chattr_class);
-	audit_register_class(AUDIT_CLASS_SIGNAL, signal_class);
-	return 0;
-}
-
-__initcall(audit_classes_init);
diff --git a/arch/ia64/kernel/brl_emu.c b/arch/ia64/kernel/brl_emu.c
deleted file mode 100644
index 782c481d7052..000000000000
--- a/arch/ia64/kernel/brl_emu.c
+++ /dev/null
@@ -1,217 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- *  Emulation of the "brl" instruction for IA64 processors that
- *  don't support it in hardware.
- *  Author: Stephan Zeisset, Intel Corp. <Stephan.Zeisset@intel.com>
- *
- *    02/22/02	D. Mosberger	Clear si_flgs, si_isr, and si_imm to avoid
- *				leaking kernel bits.
- */
-
-#include <linux/kernel.h>
-#include <linux/sched/signal.h>
-#include <linux/uaccess.h>
-#include <asm/processor.h>
-
-extern char ia64_set_b1, ia64_set_b2, ia64_set_b3, ia64_set_b4, ia64_set_b5;
-
-struct illegal_op_return {
-	unsigned long fkt, arg1, arg2, arg3;
-};
-
-/*
- *  The unimplemented bits of a virtual address must be set
- *  to the value of the most significant implemented bit.
- *  unimpl_va_mask includes all unimplemented bits and
- *  the most significant implemented bit, so the result
- *  of an and operation with the mask must be all 0's
- *  or all 1's for the address to be valid.
- */
-#define unimplemented_virtual_address(va) (						\
-	((va) & local_cpu_data->unimpl_va_mask) != 0 &&					\
-	((va) & local_cpu_data->unimpl_va_mask) != local_cpu_data->unimpl_va_mask	\
-)
-
-/*
- *  The unimplemented bits of a physical address must be 0.
- *  unimpl_pa_mask includes all unimplemented bits, so the result
- *  of an and operation with the mask must be all 0's for the
- *  address to be valid.
- */
-#define unimplemented_physical_address(pa) (		\
-	((pa) & local_cpu_data->unimpl_pa_mask) != 0	\
-)
-
-/*
- *  Handle an illegal operation fault that was caused by an
- *  unimplemented "brl" instruction.
- *  If we are not successful (e.g because the illegal operation
- *  wasn't caused by a "brl" after all), we return -1.
- *  If we are successful, we return either 0 or the address
- *  of a "fixup" function for manipulating preserved register
- *  state.
- */
-
-struct illegal_op_return
-ia64_emulate_brl (struct pt_regs *regs, unsigned long ar_ec)
-{
-	unsigned long bundle[2];
-	unsigned long opcode, btype, qp, offset, cpl;
-	unsigned long next_ip;
-	struct illegal_op_return rv;
-	long tmp_taken, unimplemented_address;
-
-	rv.fkt = (unsigned long) -1;
-
-	/*
-	 *  Decode the instruction bundle.
-	 */
-
-	if (copy_from_user(bundle, (void *) (regs->cr_iip), sizeof(bundle)))
-		return rv;
-
-	next_ip = (unsigned long) regs->cr_iip + 16;
-
-	/* "brl" must be in slot 2. */
-	if (ia64_psr(regs)->ri != 1) return rv;
-
-	/* Must be "mlx" template */
-	if ((bundle[0] & 0x1e) != 0x4) return rv;
-
-	opcode = (bundle[1] >> 60);
-	btype = ((bundle[1] >> 29) & 0x7);
-	qp = ((bundle[1] >> 23) & 0x3f);
-	offset = ((bundle[1] & 0x0800000000000000L) << 4)
-		| ((bundle[1] & 0x00fffff000000000L) >> 32)
-		| ((bundle[1] & 0x00000000007fffffL) << 40)
-		| ((bundle[0] & 0xffff000000000000L) >> 24);
-
-	tmp_taken = regs->pr & (1L << qp);
-
-	switch(opcode) {
-
-		case 0xC:
-			/*
-			 *  Long Branch.
-			 */
-			if (btype != 0) return rv;
-			rv.fkt = 0;
-			if (!(tmp_taken)) {
-				/*
-				 *  Qualifying predicate is 0.
-				 *  Skip instruction.
-				 */
-				regs->cr_iip = next_ip;
-				ia64_psr(regs)->ri = 0;
-				return rv;
-			}
-			break;
-
-		case 0xD:
-			/*
-			 *  Long Call.
-			 */
-			rv.fkt = 0;
-			if (!(tmp_taken)) {
-				/*
-				 *  Qualifying predicate is 0.
-				 *  Skip instruction.
-				 */
-				regs->cr_iip = next_ip;
-				ia64_psr(regs)->ri = 0;
-				return rv;
-			}
-
-			/*
-			 *  BR[btype] = IP+16
-			 */
-			switch(btype) {
-				case 0:
-					regs->b0 = next_ip;
-					break;
-				case 1:
-					rv.fkt = (unsigned long) &ia64_set_b1;
-					break;
-				case 2:
-					rv.fkt = (unsigned long) &ia64_set_b2;
-					break;
-				case 3:
-					rv.fkt = (unsigned long) &ia64_set_b3;
-					break;
-				case 4:
-					rv.fkt = (unsigned long) &ia64_set_b4;
-					break;
-				case 5:
-					rv.fkt = (unsigned long) &ia64_set_b5;
-					break;
-				case 6:
-					regs->b6 = next_ip;
-					break;
-				case 7:
-					regs->b7 = next_ip;
-					break;
-			}
-			rv.arg1 = next_ip;
-
-			/*
-			 *  AR[PFS].pfm = CFM
-			 *  AR[PFS].pec = AR[EC]
-			 *  AR[PFS].ppl = PSR.cpl
-			 */
-			cpl = ia64_psr(regs)->cpl;
-			regs->ar_pfs = ((regs->cr_ifs & 0x3fffffffff)
-					| (ar_ec << 52) | (cpl << 62));
-
-			/*
-			 *  CFM.sof -= CFM.sol
-			 *  CFM.sol = 0
-			 *  CFM.sor = 0
-			 *  CFM.rrb.gr = 0
-			 *  CFM.rrb.fr = 0
-			 *  CFM.rrb.pr = 0
-			 */
-			regs->cr_ifs = ((regs->cr_ifs & 0xffffffc00000007f)
-					- ((regs->cr_ifs >> 7) & 0x7f));
-
-			break;
-
-		default:
-			/*
-			 *  Unknown opcode.
-			 */
-			return rv;
-
-	}
-
-	regs->cr_iip += offset;
-	ia64_psr(regs)->ri = 0;
-
-	if (ia64_psr(regs)->it == 0)
-		unimplemented_address = unimplemented_physical_address(regs->cr_iip);
-	else
-		unimplemented_address = unimplemented_virtual_address(regs->cr_iip);
-
-	if (unimplemented_address) {
-		/*
-		 *  The target address contains unimplemented bits.
-		 */
-		printk(KERN_DEBUG "Woah! Unimplemented Instruction Address Trap!\n");
-		force_sig_fault(SIGILL, ILL_BADIADDR, (void __user *)NULL,
-				0, 0, 0);
-	} else if (ia64_psr(regs)->tb) {
-		/*
-		 *  Branch Tracing is enabled.
-		 *  Force a taken branch signal.
-		 */
-		force_sig_fault(SIGTRAP, TRAP_BRANCH, (void __user *)NULL,
-				0, 0, 0);
-	} else if (ia64_psr(regs)->ss) {
-		/*
-		 *  Single Step is enabled.
-		 *  Force a trace signal.
-		 */
-		force_sig_fault(SIGTRAP, TRAP_TRACE, (void __user *)NULL,
-				0, 0, 0);
-	}
-	return rv;
-}
diff --git a/arch/ia64/kernel/crash.c b/arch/ia64/kernel/crash.c
deleted file mode 100644
index 88b3ce3e66cd..000000000000
--- a/arch/ia64/kernel/crash.c
+++ /dev/null
@@ -1,257 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * arch/ia64/kernel/crash.c
- *
- * Architecture specific (ia64) functions for kexec based crash dumps.
- *
- * Created by: Khalid Aziz <khalid.aziz@hp.com>
- * Copyright (C) 2005 Hewlett-Packard Development Company, L.P.
- * Copyright (C) 2005 Intel Corp	Zou Nan hai <nanhai.zou@intel.com>
- *
- */
-#include <linux/smp.h>
-#include <linux/delay.h>
-#include <linux/crash_dump.h>
-#include <linux/memblock.h>
-#include <linux/kexec.h>
-#include <linux/elfcore.h>
-#include <linux/reboot.h>
-#include <linux/sysctl.h>
-#include <linux/init.h>
-#include <linux/kdebug.h>
-
-#include <asm/mca.h>
-
-int kdump_status[NR_CPUS];
-static atomic_t kdump_cpu_frozen;
-atomic_t kdump_in_progress;
-static int kdump_freeze_monarch;
-static int kdump_on_init = 1;
-static int kdump_on_fatal_mca = 1;
-
-extern void ia64_dump_cpu_regs(void *);
-
-static DEFINE_PER_CPU(struct elf_prstatus, elf_prstatus);
-
-void
-crash_save_this_cpu(void)
-{
-	void *buf;
-	unsigned long cfm, sof, sol;
-
-	int cpu = smp_processor_id();
-	struct elf_prstatus *prstatus = &per_cpu(elf_prstatus, cpu);
-
-	elf_greg_t *dst = (elf_greg_t *)&(prstatus->pr_reg);
-	memset(prstatus, 0, sizeof(*prstatus));
-	prstatus->common.pr_pid = current->pid;
-
-	ia64_dump_cpu_regs(dst);
-	cfm = dst[43];
-	sol = (cfm >> 7) & 0x7f;
-	sof = cfm & 0x7f;
-	dst[46] = (unsigned long)ia64_rse_skip_regs((unsigned long *)dst[46],
-			sof - sol);
-
-	buf = (u64 *) per_cpu_ptr(crash_notes, cpu);
-	if (!buf)
-		return;
-	buf = append_elf_note(buf, KEXEC_CORE_NOTE_NAME, NT_PRSTATUS, prstatus,
-			sizeof(*prstatus));
-	final_note(buf);
-}
-
-#ifdef CONFIG_SMP
-static int
-kdump_wait_cpu_freeze(void)
-{
-	int cpu_num = num_online_cpus() - 1;
-	int timeout = 1000;
-	while(timeout-- > 0) {
-		if (atomic_read(&kdump_cpu_frozen) == cpu_num)
-			return 0;
-		udelay(1000);
-	}
-	return 1;
-}
-#endif
-
-void
-machine_crash_shutdown(struct pt_regs *pt)
-{
-	/* This function is only called after the system
-	 * has paniced or is otherwise in a critical state.
-	 * The minimum amount of code to allow a kexec'd kernel
-	 * to run successfully needs to happen here.
-	 *
-	 * In practice this means shooting down the other cpus in
-	 * an SMP system.
-	 */
-	kexec_disable_iosapic();
-#ifdef CONFIG_SMP
-	/*
-	 * If kdump_on_init is set and an INIT is asserted here, kdump will
-	 * be started again via INIT monarch.
-	 */
-	local_irq_disable();
-	ia64_set_psr_mc();	/* mask MCA/INIT */
-	if (atomic_inc_return(&kdump_in_progress) != 1)
-		unw_init_running(kdump_cpu_freeze, NULL);
-
-	/*
-	 * Now this cpu is ready for kdump.
-	 * Stop all others by IPI or INIT.  They could receive INIT from
-	 * outside and might be INIT monarch, but only thing they have to
-	 * do is falling into kdump_cpu_freeze().
-	 *
-	 * If an INIT is asserted here:
-	 * - All receivers might be slaves, since some of cpus could already
-	 *   be frozen and INIT might be masked on monarch.  In this case,
-	 *   all slaves will be frozen soon since kdump_in_progress will let
-	 *   them into DIE_INIT_SLAVE_LEAVE.
-	 * - One might be a monarch, but INIT rendezvous will fail since
-	 *   at least this cpu already have INIT masked so it never join
-	 *   to the rendezvous.  In this case, all slaves and monarch will
-	 *   be frozen soon with no wait since the INIT rendezvous is skipped
-	 *   by kdump_in_progress.
-	 */
-	kdump_smp_send_stop();
-	/* not all cpu response to IPI, send INIT to freeze them */
-	if (kdump_wait_cpu_freeze()) {
-		kdump_smp_send_init();
-		/* wait again, don't go ahead if possible */
-		kdump_wait_cpu_freeze();
-	}
-#endif
-}
-
-static void
-machine_kdump_on_init(void)
-{
-	crash_save_vmcoreinfo();
-	local_irq_disable();
-	kexec_disable_iosapic();
-	machine_kexec(ia64_kimage);
-}
-
-void
-kdump_cpu_freeze(struct unw_frame_info *info, void *arg)
-{
-	int cpuid;
-
-	local_irq_disable();
-	cpuid = smp_processor_id();
-	crash_save_this_cpu();
-	current->thread.ksp = (__u64)info->sw - 16;
-
-	ia64_set_psr_mc();	/* mask MCA/INIT and stop reentrance */
-
-	atomic_inc(&kdump_cpu_frozen);
-	kdump_status[cpuid] = 1;
-	mb();
-	for (;;)
-		cpu_relax();
-}
-
-static int
-kdump_init_notifier(struct notifier_block *self, unsigned long val, void *data)
-{
-	struct ia64_mca_notify_die *nd;
-	struct die_args *args = data;
-
-	if (atomic_read(&kdump_in_progress)) {
-		switch (val) {
-		case DIE_INIT_MONARCH_LEAVE:
-			if (!kdump_freeze_monarch)
-				break;
-			fallthrough;
-		case DIE_INIT_SLAVE_LEAVE:
-		case DIE_INIT_MONARCH_ENTER:
-		case DIE_MCA_RENDZVOUS_LEAVE:
-			unw_init_running(kdump_cpu_freeze, NULL);
-			break;
-		}
-	}
-
-	if (!kdump_on_init && !kdump_on_fatal_mca)
-		return NOTIFY_DONE;
-
-	if (!ia64_kimage) {
-		if (val == DIE_INIT_MONARCH_LEAVE)
-			ia64_mca_printk(KERN_NOTICE
-					"%s: kdump not configured\n",
-					__func__);
-		return NOTIFY_DONE;
-	}
-
-	if (val != DIE_INIT_MONARCH_LEAVE &&
-	    val != DIE_INIT_MONARCH_PROCESS &&
-	    val != DIE_MCA_MONARCH_LEAVE)
-		return NOTIFY_DONE;
-
-	nd = (struct ia64_mca_notify_die *)args->err;
-
-	switch (val) {
-	case DIE_INIT_MONARCH_PROCESS:
-		/* Reason code 1 means machine check rendezvous*/
-		if (kdump_on_init && (nd->sos->rv_rc != 1)) {
-			if (atomic_inc_return(&kdump_in_progress) != 1)
-				kdump_freeze_monarch = 1;
-		}
-		break;
-	case DIE_INIT_MONARCH_LEAVE:
-		/* Reason code 1 means machine check rendezvous*/
-		if (kdump_on_init && (nd->sos->rv_rc != 1))
-			machine_kdump_on_init();
-		break;
-	case DIE_MCA_MONARCH_LEAVE:
-		/* *(nd->data) indicate if MCA is recoverable */
-		if (kdump_on_fatal_mca && !(*(nd->data))) {
-			if (atomic_inc_return(&kdump_in_progress) == 1)
-				machine_kdump_on_init();
-			/* We got fatal MCA while kdump!? No way!! */
-		}
-		break;
-	}
-	return NOTIFY_DONE;
-}
-
-#ifdef CONFIG_SYSCTL
-static struct ctl_table kdump_ctl_table[] = {
-	{
-		.procname = "kdump_on_init",
-		.data = &kdump_on_init,
-		.maxlen = sizeof(int),
-		.mode = 0644,
-		.proc_handler = proc_dointvec,
-	},
-	{
-		.procname = "kdump_on_fatal_mca",
-		.data = &kdump_on_fatal_mca,
-		.maxlen = sizeof(int),
-		.mode = 0644,
-		.proc_handler = proc_dointvec,
-	},
-	{ }
-};
-#endif
-
-static int
-machine_crash_setup(void)
-{
-	/* be notified before default_monarch_init_process */
-	static struct notifier_block kdump_init_notifier_nb = {
-		.notifier_call = kdump_init_notifier,
-		.priority = 1,
-	};
-	int ret;
-	if((ret = register_die_notifier(&kdump_init_notifier_nb)) != 0)
-		return ret;
-#ifdef CONFIG_SYSCTL
-	register_sysctl("kernel", kdump_ctl_table);
-#endif
-	return 0;
-}
-
-__initcall(machine_crash_setup);
-
diff --git a/arch/ia64/kernel/crash_dump.c b/arch/ia64/kernel/crash_dump.c
deleted file mode 100644
index 4ef68e2aa757..000000000000
--- a/arch/ia64/kernel/crash_dump.c
+++ /dev/null
@@ -1,27 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- *	kernel/crash_dump.c - Memory preserving reboot related code.
- *
- *	Created by: Simon Horman <horms@verge.net.au>
- *	Original code moved from kernel/crash.c
- *	Original code comment copied from the i386 version of this file
- */
-
-#include <linux/errno.h>
-#include <linux/types.h>
-#include <linux/crash_dump.h>
-#include <linux/uio.h>
-#include <asm/page.h>
-
-ssize_t copy_oldmem_page(struct iov_iter *iter, unsigned long pfn,
-		size_t csize, unsigned long offset)
-{
-	void  *vaddr;
-
-	if (!csize)
-		return 0;
-	vaddr = __va(pfn<<PAGE_SHIFT);
-	csize = copy_to_iter(vaddr + offset, csize, iter);
-	return csize;
-}
-
diff --git a/arch/ia64/kernel/cyclone.c b/arch/ia64/kernel/cyclone.c
deleted file mode 100644
index 258d7b70c0f3..000000000000
--- a/arch/ia64/kernel/cyclone.c
+++ /dev/null
@@ -1,125 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-#include <linux/module.h>
-#include <linux/smp.h>
-#include <linux/time.h>
-#include <linux/errno.h>
-#include <linux/timex.h>
-#include <linux/clocksource.h>
-#include <linux/io.h>
-
-/* IBM Summit (EXA) Cyclone counter code*/
-#define CYCLONE_CBAR_ADDR 0xFEB00CD0
-#define CYCLONE_PMCC_OFFSET 0x51A0
-#define CYCLONE_MPMC_OFFSET 0x51D0
-#define CYCLONE_MPCS_OFFSET 0x51A8
-#define CYCLONE_TIMER_FREQ 100000000
-
-int use_cyclone;
-void __init cyclone_setup(void)
-{
-	use_cyclone = 1;
-}
-
-static void __iomem *cyclone_mc;
-
-static u64 read_cyclone(struct clocksource *cs)
-{
-	return (u64)readq((void __iomem *)cyclone_mc);
-}
-
-static struct clocksource clocksource_cyclone = {
-        .name           = "cyclone",
-        .rating         = 300,
-        .read           = read_cyclone,
-        .mask           = (1LL << 40) - 1,
-        .flags          = CLOCK_SOURCE_IS_CONTINUOUS,
-};
-
-int __init init_cyclone_clock(void)
-{
-	u64 __iomem *reg;
-	u64 base;	/* saved cyclone base address */
-	u64 offset;	/* offset from pageaddr to cyclone_timer register */
-	int i;
-	u32 __iomem *cyclone_timer;	/* Cyclone MPMC0 register */
-
-	if (!use_cyclone)
-		return 0;
-
-	printk(KERN_INFO "Summit chipset: Starting Cyclone Counter.\n");
-
-	/* find base address */
-	offset = (CYCLONE_CBAR_ADDR);
-	reg = ioremap(offset, sizeof(u64));
-	if(!reg){
-		printk(KERN_ERR "Summit chipset: Could not find valid CBAR"
-				" register.\n");
-		use_cyclone = 0;
-		return -ENODEV;
-	}
-	base = readq(reg);
-	iounmap(reg);
-	if(!base){
-		printk(KERN_ERR "Summit chipset: Could not find valid CBAR"
-				" value.\n");
-		use_cyclone = 0;
-		return -ENODEV;
-	}
-
-	/* setup PMCC */
-	offset = (base + CYCLONE_PMCC_OFFSET);
-	reg = ioremap(offset, sizeof(u64));
-	if(!reg){
-		printk(KERN_ERR "Summit chipset: Could not find valid PMCC"
-				" register.\n");
-		use_cyclone = 0;
-		return -ENODEV;
-	}
-	writel(0x00000001,reg);
-	iounmap(reg);
-
-	/* setup MPCS */
-	offset = (base + CYCLONE_MPCS_OFFSET);
-	reg = ioremap(offset, sizeof(u64));
-	if(!reg){
-		printk(KERN_ERR "Summit chipset: Could not find valid MPCS"
-				" register.\n");
-		use_cyclone = 0;
-		return -ENODEV;
-	}
-	writel(0x00000001,reg);
-	iounmap(reg);
-
-	/* map in cyclone_timer */
-	offset = (base + CYCLONE_MPMC_OFFSET);
-	cyclone_timer = ioremap(offset, sizeof(u32));
-	if(!cyclone_timer){
-		printk(KERN_ERR "Summit chipset: Could not find valid MPMC"
-				" register.\n");
-		use_cyclone = 0;
-		return -ENODEV;
-	}
-
-	/*quick test to make sure its ticking*/
-	for(i=0; i<3; i++){
-		u32 old = readl(cyclone_timer);
-		int stall = 100;
-		while(stall--) barrier();
-		if(readl(cyclone_timer) == old){
-			printk(KERN_ERR "Summit chipset: Counter not counting!"
-					" DISABLED\n");
-			iounmap(cyclone_timer);
-			cyclone_timer = NULL;
-			use_cyclone = 0;
-			return -ENODEV;
-		}
-	}
-	/* initialize last tick */
-	cyclone_mc = cyclone_timer;
-	clocksource_cyclone.archdata.fsys_mmio = cyclone_timer;
-	clocksource_register_hz(&clocksource_cyclone, CYCLONE_TIMER_FREQ);
-
-	return 0;
-}
-
-__initcall(init_cyclone_clock);
diff --git a/arch/ia64/kernel/dma-mapping.c b/arch/ia64/kernel/dma-mapping.c
deleted file mode 100644
index cd0c166bfbc2..000000000000
--- a/arch/ia64/kernel/dma-mapping.c
+++ /dev/null
@@ -1,9 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-#include <linux/dma-map-ops.h>
-#include <linux/export.h>
-
-/* Set this to 1 if there is a HW IOMMU in the system */
-int iommu_detected __read_mostly;
-
-const struct dma_map_ops *dma_ops;
-EXPORT_SYMBOL(dma_ops);
diff --git a/arch/ia64/kernel/efi.c b/arch/ia64/kernel/efi.c
deleted file mode 100644
index 033f5aead88a..000000000000
--- a/arch/ia64/kernel/efi.c
+++ /dev/null
@@ -1,1360 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * Extensible Firmware Interface
- *
- * Based on Extensible Firmware Interface Specification version 0.9
- * April 30, 1999
- *
- * Copyright (C) 1999 VA Linux Systems
- * Copyright (C) 1999 Walt Drummond <drummond@valinux.com>
- * Copyright (C) 1999-2003 Hewlett-Packard Co.
- *	David Mosberger-Tang <davidm@hpl.hp.com>
- *	Stephane Eranian <eranian@hpl.hp.com>
- * (c) Copyright 2006 Hewlett-Packard Development Company, L.P.
- *	Bjorn Helgaas <bjorn.helgaas@hp.com>
- *
- * All EFI Runtime Services are not implemented yet as EFI only
- * supports physical mode addressing on SoftSDV. This is to be fixed
- * in a future version.  --drummond 1999-07-20
- *
- * Implemented EFI runtime services and virtual mode calls.  --davidm
- *
- * Goutham Rao: <goutham.rao@intel.com>
- *	Skip non-WB memory and ignore empty memory ranges.
- */
-#include <linux/module.h>
-#include <linux/memblock.h>
-#include <linux/crash_dump.h>
-#include <linux/kernel.h>
-#include <linux/init.h>
-#include <linux/types.h>
-#include <linux/slab.h>
-#include <linux/time.h>
-#include <linux/efi.h>
-#include <linux/kexec.h>
-#include <linux/mm.h>
-
-#include <asm/efi.h>
-#include <asm/io.h>
-#include <asm/kregs.h>
-#include <asm/meminit.h>
-#include <asm/processor.h>
-#include <asm/mca.h>
-#include <asm/sal.h>
-#include <asm/setup.h>
-#include <asm/tlbflush.h>
-
-#define EFI_DEBUG	0
-
-#define ESI_TABLE_GUID					\
-    EFI_GUID(0x43EA58DC, 0xCF28, 0x4b06, 0xB3,		\
-	     0x91, 0xB7, 0x50, 0x59, 0x34, 0x2B, 0xD4)
-
-static unsigned long mps_phys = EFI_INVALID_TABLE_ADDR;
-static __initdata unsigned long palo_phys;
-
-unsigned long __initdata esi_phys = EFI_INVALID_TABLE_ADDR;
-unsigned long hcdp_phys = EFI_INVALID_TABLE_ADDR;
-unsigned long sal_systab_phys = EFI_INVALID_TABLE_ADDR;
-
-static const efi_config_table_type_t arch_tables[] __initconst = {
-	{ESI_TABLE_GUID,				&esi_phys,		"ESI"		},
-	{HCDP_TABLE_GUID,				&hcdp_phys,		"HCDP"		},
-	{MPS_TABLE_GUID,				&mps_phys,		"MPS"		},
-	{PROCESSOR_ABSTRACTION_LAYER_OVERWRITE_GUID,	&palo_phys,		"PALO"		},
-	{SAL_SYSTEM_TABLE_GUID,				&sal_systab_phys,	"SALsystab"	},
-	{},
-};
-
-extern efi_status_t efi_call_phys (void *, ...);
-
-static efi_runtime_services_t *runtime;
-static u64 mem_limit = ~0UL, max_addr = ~0UL, min_addr = 0UL;
-
-#define efi_call_virt(f, args...)	(*(f))(args)
-
-#define STUB_GET_TIME(prefix, adjust_arg)				       \
-static efi_status_t							       \
-prefix##_get_time (efi_time_t *tm, efi_time_cap_t *tc)			       \
-{									       \
-	struct ia64_fpreg fr[6];					       \
-	efi_time_cap_t *atc = NULL;					       \
-	efi_status_t ret;						       \
-									       \
-	if (tc)								       \
-		atc = adjust_arg(tc);					       \
-	ia64_save_scratch_fpregs(fr);					       \
-	ret = efi_call_##prefix((efi_get_time_t *) __va(runtime->get_time),    \
-				adjust_arg(tm), atc);			       \
-	ia64_load_scratch_fpregs(fr);					       \
-	return ret;							       \
-}
-
-#define STUB_SET_TIME(prefix, adjust_arg)				       \
-static efi_status_t							       \
-prefix##_set_time (efi_time_t *tm)					       \
-{									       \
-	struct ia64_fpreg fr[6];					       \
-	efi_status_t ret;						       \
-									       \
-	ia64_save_scratch_fpregs(fr);					       \
-	ret = efi_call_##prefix((efi_set_time_t *) __va(runtime->set_time),    \
-				adjust_arg(tm));			       \
-	ia64_load_scratch_fpregs(fr);					       \
-	return ret;							       \
-}
-
-#define STUB_GET_WAKEUP_TIME(prefix, adjust_arg)			       \
-static efi_status_t							       \
-prefix##_get_wakeup_time (efi_bool_t *enabled, efi_bool_t *pending,	       \
-			  efi_time_t *tm)				       \
-{									       \
-	struct ia64_fpreg fr[6];					       \
-	efi_status_t ret;						       \
-									       \
-	ia64_save_scratch_fpregs(fr);					       \
-	ret = efi_call_##prefix(					       \
-		(efi_get_wakeup_time_t *) __va(runtime->get_wakeup_time),      \
-		adjust_arg(enabled), adjust_arg(pending), adjust_arg(tm));     \
-	ia64_load_scratch_fpregs(fr);					       \
-	return ret;							       \
-}
-
-#define STUB_SET_WAKEUP_TIME(prefix, adjust_arg)			       \
-static efi_status_t							       \
-prefix##_set_wakeup_time (efi_bool_t enabled, efi_time_t *tm)		       \
-{									       \
-	struct ia64_fpreg fr[6];					       \
-	efi_time_t *atm = NULL;						       \
-	efi_status_t ret;						       \
-									       \
-	if (tm)								       \
-		atm = adjust_arg(tm);					       \
-	ia64_save_scratch_fpregs(fr);					       \
-	ret = efi_call_##prefix(					       \
-		(efi_set_wakeup_time_t *) __va(runtime->set_wakeup_time),      \
-		enabled, atm);						       \
-	ia64_load_scratch_fpregs(fr);					       \
-	return ret;							       \
-}
-
-#define STUB_GET_VARIABLE(prefix, adjust_arg)				       \
-static efi_status_t							       \
-prefix##_get_variable (efi_char16_t *name, efi_guid_t *vendor, u32 *attr,      \
-		       unsigned long *data_size, void *data)		       \
-{									       \
-	struct ia64_fpreg fr[6];					       \
-	u32 *aattr = NULL;						       \
-	efi_status_t ret;						       \
-									       \
-	if (attr)							       \
-		aattr = adjust_arg(attr);				       \
-	ia64_save_scratch_fpregs(fr);					       \
-	ret = efi_call_##prefix(					       \
-		(efi_get_variable_t *) __va(runtime->get_variable),	       \
-		adjust_arg(name), adjust_arg(vendor), aattr,		       \
-		adjust_arg(data_size), adjust_arg(data));		       \
-	ia64_load_scratch_fpregs(fr);					       \
-	return ret;							       \
-}
-
-#define STUB_GET_NEXT_VARIABLE(prefix, adjust_arg)			       \
-static efi_status_t							       \
-prefix##_get_next_variable (unsigned long *name_size, efi_char16_t *name,      \
-			    efi_guid_t *vendor)				       \
-{									       \
-	struct ia64_fpreg fr[6];					       \
-	efi_status_t ret;						       \
-									       \
-	ia64_save_scratch_fpregs(fr);					       \
-	ret = efi_call_##prefix(					       \
-		(efi_get_next_variable_t *) __va(runtime->get_next_variable),  \
-		adjust_arg(name_size), adjust_arg(name), adjust_arg(vendor));  \
-	ia64_load_scratch_fpregs(fr);					       \
-	return ret;							       \
-}
-
-#define STUB_SET_VARIABLE(prefix, adjust_arg)				       \
-static efi_status_t							       \
-prefix##_set_variable (efi_char16_t *name, efi_guid_t *vendor,		       \
-		       u32 attr, unsigned long data_size,		       \
-		       void *data)					       \
-{									       \
-	struct ia64_fpreg fr[6];					       \
-	efi_status_t ret;						       \
-									       \
-	ia64_save_scratch_fpregs(fr);					       \
-	ret = efi_call_##prefix(					       \
-		(efi_set_variable_t *) __va(runtime->set_variable),	       \
-		adjust_arg(name), adjust_arg(vendor), attr, data_size,	       \
-		adjust_arg(data));					       \
-	ia64_load_scratch_fpregs(fr);					       \
-	return ret;							       \
-}
-
-#define STUB_GET_NEXT_HIGH_MONO_COUNT(prefix, adjust_arg)		       \
-static efi_status_t							       \
-prefix##_get_next_high_mono_count (u32 *count)				       \
-{									       \
-	struct ia64_fpreg fr[6];					       \
-	efi_status_t ret;						       \
-									       \
-	ia64_save_scratch_fpregs(fr);					       \
-	ret = efi_call_##prefix((efi_get_next_high_mono_count_t *)	       \
-				__va(runtime->get_next_high_mono_count),       \
-				adjust_arg(count));			       \
-	ia64_load_scratch_fpregs(fr);					       \
-	return ret;							       \
-}
-
-#define STUB_RESET_SYSTEM(prefix, adjust_arg)				       \
-static void								       \
-prefix##_reset_system (int reset_type, efi_status_t status,		       \
-		       unsigned long data_size, efi_char16_t *data)	       \
-{									       \
-	struct ia64_fpreg fr[6];					       \
-	efi_char16_t *adata = NULL;					       \
-									       \
-	if (data)							       \
-		adata = adjust_arg(data);				       \
-									       \
-	ia64_save_scratch_fpregs(fr);					       \
-	efi_call_##prefix(						       \
-		(efi_reset_system_t *) __va(runtime->reset_system),	       \
-		reset_type, status, data_size, adata);			       \
-	/* should not return, but just in case... */			       \
-	ia64_load_scratch_fpregs(fr);					       \
-}
-
-#define phys_ptr(arg)	((__typeof__(arg)) ia64_tpa(arg))
-
-STUB_GET_TIME(phys, phys_ptr)
-STUB_SET_TIME(phys, phys_ptr)
-STUB_GET_WAKEUP_TIME(phys, phys_ptr)
-STUB_SET_WAKEUP_TIME(phys, phys_ptr)
-STUB_GET_VARIABLE(phys, phys_ptr)
-STUB_GET_NEXT_VARIABLE(phys, phys_ptr)
-STUB_SET_VARIABLE(phys, phys_ptr)
-STUB_GET_NEXT_HIGH_MONO_COUNT(phys, phys_ptr)
-STUB_RESET_SYSTEM(phys, phys_ptr)
-
-#define id(arg)	arg
-
-STUB_GET_TIME(virt, id)
-STUB_SET_TIME(virt, id)
-STUB_GET_WAKEUP_TIME(virt, id)
-STUB_SET_WAKEUP_TIME(virt, id)
-STUB_GET_VARIABLE(virt, id)
-STUB_GET_NEXT_VARIABLE(virt, id)
-STUB_SET_VARIABLE(virt, id)
-STUB_GET_NEXT_HIGH_MONO_COUNT(virt, id)
-STUB_RESET_SYSTEM(virt, id)
-
-void
-efi_gettimeofday (struct timespec64 *ts)
-{
-	efi_time_t tm;
-
-	if ((*efi.get_time)(&tm, NULL) != EFI_SUCCESS) {
-		memset(ts, 0, sizeof(*ts));
-		return;
-	}
-
-	ts->tv_sec = mktime64(tm.year, tm.month, tm.day,
-			    tm.hour, tm.minute, tm.second);
-	ts->tv_nsec = tm.nanosecond;
-}
-
-static int
-is_memory_available (efi_memory_desc_t *md)
-{
-	if (!(md->attribute & EFI_MEMORY_WB))
-		return 0;
-
-	switch (md->type) {
-	      case EFI_LOADER_CODE:
-	      case EFI_LOADER_DATA:
-	      case EFI_BOOT_SERVICES_CODE:
-	      case EFI_BOOT_SERVICES_DATA:
-	      case EFI_CONVENTIONAL_MEMORY:
-		return 1;
-	}
-	return 0;
-}
-
-typedef struct kern_memdesc {
-	u64 attribute;
-	u64 start;
-	u64 num_pages;
-} kern_memdesc_t;
-
-static kern_memdesc_t *kern_memmap;
-
-#define efi_md_size(md)	(md->num_pages << EFI_PAGE_SHIFT)
-
-static inline u64
-kmd_end(kern_memdesc_t *kmd)
-{
-	return (kmd->start + (kmd->num_pages << EFI_PAGE_SHIFT));
-}
-
-static inline u64
-efi_md_end(efi_memory_desc_t *md)
-{
-	return (md->phys_addr + efi_md_size(md));
-}
-
-static inline int
-efi_wb(efi_memory_desc_t *md)
-{
-	return (md->attribute & EFI_MEMORY_WB);
-}
-
-static inline int
-efi_uc(efi_memory_desc_t *md)
-{
-	return (md->attribute & EFI_MEMORY_UC);
-}
-
-static void
-walk (efi_freemem_callback_t callback, void *arg, u64 attr)
-{
-	kern_memdesc_t *k;
-	u64 start, end, voff;
-
-	voff = (attr == EFI_MEMORY_WB) ? PAGE_OFFSET : __IA64_UNCACHED_OFFSET;
-	for (k = kern_memmap; k->start != ~0UL; k++) {
-		if (k->attribute != attr)
-			continue;
-		start = PAGE_ALIGN(k->start);
-		end = (k->start + (k->num_pages << EFI_PAGE_SHIFT)) & PAGE_MASK;
-		if (start < end)
-			if ((*callback)(start + voff, end + voff, arg) < 0)
-				return;
-	}
-}
-
-/*
- * Walk the EFI memory map and call CALLBACK once for each EFI memory
- * descriptor that has memory that is available for OS use.
- */
-void
-efi_memmap_walk (efi_freemem_callback_t callback, void *arg)
-{
-	walk(callback, arg, EFI_MEMORY_WB);
-}
-
-/*
- * Walk the EFI memory map and call CALLBACK once for each EFI memory
- * descriptor that has memory that is available for uncached allocator.
- */
-void
-efi_memmap_walk_uc (efi_freemem_callback_t callback, void *arg)
-{
-	walk(callback, arg, EFI_MEMORY_UC);
-}
-
-/*
- * Look for the PAL_CODE region reported by EFI and map it using an
- * ITR to enable safe PAL calls in virtual mode.  See IA-64 Processor
- * Abstraction Layer chapter 11 in ADAG
- */
-void *
-efi_get_pal_addr (void)
-{
-	void *efi_map_start, *efi_map_end, *p;
-	efi_memory_desc_t *md;
-	u64 efi_desc_size;
-	int pal_code_count = 0;
-	u64 vaddr, mask;
-
-	efi_map_start = __va(ia64_boot_param->efi_memmap);
-	efi_map_end   = efi_map_start + ia64_boot_param->efi_memmap_size;
-	efi_desc_size = ia64_boot_param->efi_memdesc_size;
-
-	for (p = efi_map_start; p < efi_map_end; p += efi_desc_size) {
-		md = p;
-		if (md->type != EFI_PAL_CODE)
-			continue;
-
-		if (++pal_code_count > 1) {
-			printk(KERN_ERR "Too many EFI Pal Code memory ranges, "
-			       "dropped @ %llx\n", md->phys_addr);
-			continue;
-		}
-		/*
-		 * The only ITLB entry in region 7 that is used is the one
-		 * installed by __start().  That entry covers a 64MB range.
-		 */
-		mask  = ~((1 << KERNEL_TR_PAGE_SHIFT) - 1);
-		vaddr = PAGE_OFFSET + md->phys_addr;
-
-		/*
-		 * We must check that the PAL mapping won't overlap with the
-		 * kernel mapping.
-		 *
-		 * PAL code is guaranteed to be aligned on a power of 2 between
-		 * 4k and 256KB and that only one ITR is needed to map it. This
-		 * implies that the PAL code is always aligned on its size,
-		 * i.e., the closest matching page size supported by the TLB.
-		 * Therefore PAL code is guaranteed never to cross a 64MB unless
-		 * it is bigger than 64MB (very unlikely!).  So for now the
-		 * following test is enough to determine whether or not we need
-		 * a dedicated ITR for the PAL code.
-		 */
-		if ((vaddr & mask) == (KERNEL_START & mask)) {
-			printk(KERN_INFO "%s: no need to install ITR for PAL code\n",
-			       __func__);
-			continue;
-		}
-
-		if (efi_md_size(md) > IA64_GRANULE_SIZE)
-			panic("Whoa!  PAL code size bigger than a granule!");
-
-#if EFI_DEBUG
-		mask  = ~((1 << IA64_GRANULE_SHIFT) - 1);
-
-		printk(KERN_INFO "CPU %d: mapping PAL code "
-			"[0x%llx-0x%llx) into [0x%llx-0x%llx)\n",
-			smp_processor_id(), md->phys_addr,
-			md->phys_addr + efi_md_size(md),
-			vaddr & mask, (vaddr & mask) + IA64_GRANULE_SIZE);
-#endif
-		return __va(md->phys_addr);
-	}
-	printk(KERN_WARNING "%s: no PAL-code memory-descriptor found\n",
-	       __func__);
-	return NULL;
-}
-
-
-static u8 __init palo_checksum(u8 *buffer, u32 length)
-{
-	u8 sum = 0;
-	u8 *end = buffer + length;
-
-	while (buffer < end)
-		sum = (u8) (sum + *(buffer++));
-
-	return sum;
-}
-
-/*
- * Parse and handle PALO table which is published at:
- * http://www.dig64.org/home/DIG64_PALO_R1_0.pdf
- */
-static void __init handle_palo(unsigned long phys_addr)
-{
-	struct palo_table *palo = __va(phys_addr);
-	u8  checksum;
-
-	if (strncmp(palo->signature, PALO_SIG, sizeof(PALO_SIG) - 1)) {
-		printk(KERN_INFO "PALO signature incorrect.\n");
-		return;
-	}
-
-	checksum = palo_checksum((u8 *)palo, palo->length);
-	if (checksum) {
-		printk(KERN_INFO "PALO checksum incorrect.\n");
-		return;
-	}
-
-	setup_ptcg_sem(palo->max_tlb_purges, NPTCG_FROM_PALO);
-}
-
-void
-efi_map_pal_code (void)
-{
-	void *pal_vaddr = efi_get_pal_addr ();
-	u64 psr;
-
-	if (!pal_vaddr)
-		return;
-
-	/*
-	 * Cannot write to CRx with PSR.ic=1
-	 */
-	psr = ia64_clear_ic();
-	ia64_itr(0x1, IA64_TR_PALCODE,
-		 GRANULEROUNDDOWN((unsigned long) pal_vaddr),
-		 pte_val(pfn_pte(__pa(pal_vaddr) >> PAGE_SHIFT, PAGE_KERNEL)),
-		 IA64_GRANULE_SHIFT);
-	ia64_set_psr(psr);		/* restore psr */
-}
-
-void __init
-efi_init (void)
-{
-	const efi_system_table_t *efi_systab;
-	void *efi_map_start, *efi_map_end;
-	u64 efi_desc_size;
-	char *cp;
-
-	set_bit(EFI_BOOT, &efi.flags);
-	set_bit(EFI_64BIT, &efi.flags);
-
-	/*
-	 * It's too early to be able to use the standard kernel command line
-	 * support...
-	 */
-	for (cp = boot_command_line; *cp; ) {
-		if (memcmp(cp, "mem=", 4) == 0) {
-			mem_limit = memparse(cp + 4, &cp);
-		} else if (memcmp(cp, "max_addr=", 9) == 0) {
-			max_addr = GRANULEROUNDDOWN(memparse(cp + 9, &cp));
-		} else if (memcmp(cp, "min_addr=", 9) == 0) {
-			min_addr = GRANULEROUNDDOWN(memparse(cp + 9, &cp));
-		} else {
-			while (*cp != ' ' && *cp)
-				++cp;
-			while (*cp == ' ')
-				++cp;
-		}
-	}
-	if (min_addr != 0UL)
-		printk(KERN_INFO "Ignoring memory below %lluMB\n",
-		       min_addr >> 20);
-	if (max_addr != ~0UL)
-		printk(KERN_INFO "Ignoring memory above %lluMB\n",
-		       max_addr >> 20);
-
-	efi_systab = __va(ia64_boot_param->efi_systab);
-
-	/*
-	 * Verify the EFI Table
-	 */
-	if (efi_systab == NULL)
-		panic("Whoa! Can't find EFI system table.\n");
-	if (efi_systab_check_header(&efi_systab->hdr))
-		panic("Whoa! EFI system table signature incorrect\n");
-
-	efi_systab_report_header(&efi_systab->hdr, efi_systab->fw_vendor);
-
-	palo_phys      = EFI_INVALID_TABLE_ADDR;
-
-	if (efi_config_parse_tables(__va(efi_systab->tables),
-				    efi_systab->nr_tables,
-				    arch_tables) != 0)
-		return;
-
-	if (palo_phys != EFI_INVALID_TABLE_ADDR)
-		handle_palo(palo_phys);
-
-	runtime = __va(efi_systab->runtime);
-	efi.get_time = phys_get_time;
-	efi.set_time = phys_set_time;
-	efi.get_wakeup_time = phys_get_wakeup_time;
-	efi.set_wakeup_time = phys_set_wakeup_time;
-	efi.get_variable = phys_get_variable;
-	efi.get_next_variable = phys_get_next_variable;
-	efi.set_variable = phys_set_variable;
-	efi.get_next_high_mono_count = phys_get_next_high_mono_count;
-	efi.reset_system = phys_reset_system;
-
-	efi_map_start = __va(ia64_boot_param->efi_memmap);
-	efi_map_end   = efi_map_start + ia64_boot_param->efi_memmap_size;
-	efi_desc_size = ia64_boot_param->efi_memdesc_size;
-
-#if EFI_DEBUG
-	/* print EFI memory map: */
-	{
-		efi_memory_desc_t *md;
-		void *p;
-		unsigned int i;
-
-		for (i = 0, p = efi_map_start; p < efi_map_end;
-		     ++i, p += efi_desc_size)
-		{
-			const char *unit;
-			unsigned long size;
-			char buf[64];
-
-			md = p;
-			size = md->num_pages << EFI_PAGE_SHIFT;
-
-			if ((size >> 40) > 0) {
-				size >>= 40;
-				unit = "TB";
-			} else if ((size >> 30) > 0) {
-				size >>= 30;
-				unit = "GB";
-			} else if ((size >> 20) > 0) {
-				size >>= 20;
-				unit = "MB";
-			} else {
-				size >>= 10;
-				unit = "KB";
-			}
-
-			printk("mem%02d: %s "
-			       "range=[0x%016llx-0x%016llx) (%4lu%s)\n",
-			       i, efi_md_typeattr_format(buf, sizeof(buf), md),
-			       md->phys_addr,
-			       md->phys_addr + efi_md_size(md), size, unit);
-		}
-	}
-#endif
-
-	efi_map_pal_code();
-	efi_enter_virtual_mode();
-}
-
-void
-efi_enter_virtual_mode (void)
-{
-	void *efi_map_start, *efi_map_end, *p;
-	efi_memory_desc_t *md;
-	efi_status_t status;
-	u64 efi_desc_size;
-
-	efi_map_start = __va(ia64_boot_param->efi_memmap);
-	efi_map_end   = efi_map_start + ia64_boot_param->efi_memmap_size;
-	efi_desc_size = ia64_boot_param->efi_memdesc_size;
-
-	for (p = efi_map_start; p < efi_map_end; p += efi_desc_size) {
-		md = p;
-		if (md->attribute & EFI_MEMORY_RUNTIME) {
-			/*
-			 * Some descriptors have multiple bits set, so the
-			 * order of the tests is relevant.
-			 */
-			if (md->attribute & EFI_MEMORY_WB) {
-				md->virt_addr = (u64) __va(md->phys_addr);
-			} else if (md->attribute & EFI_MEMORY_UC) {
-				md->virt_addr = (u64) ioremap(md->phys_addr, 0);
-			} else if (md->attribute & EFI_MEMORY_WC) {
-#if 0
-				md->virt_addr = ia64_remap(md->phys_addr,
-							   (_PAGE_A |
-							    _PAGE_P |
-							    _PAGE_D |
-							    _PAGE_MA_WC |
-							    _PAGE_PL_0 |
-							    _PAGE_AR_RW));
-#else
-				printk(KERN_INFO "EFI_MEMORY_WC mapping\n");
-				md->virt_addr = (u64) ioremap(md->phys_addr, 0);
-#endif
-			} else if (md->attribute & EFI_MEMORY_WT) {
-#if 0
-				md->virt_addr = ia64_remap(md->phys_addr,
-							   (_PAGE_A |
-							    _PAGE_P |
-							    _PAGE_D |
-							    _PAGE_MA_WT |
-							    _PAGE_PL_0 |
-							    _PAGE_AR_RW));
-#else
-				printk(KERN_INFO "EFI_MEMORY_WT mapping\n");
-				md->virt_addr = (u64) ioremap(md->phys_addr, 0);
-#endif
-			}
-		}
-	}
-
-	status = efi_call_phys(__va(runtime->set_virtual_address_map),
-			       ia64_boot_param->efi_memmap_size,
-			       efi_desc_size,
-			       ia64_boot_param->efi_memdesc_version,
-			       ia64_boot_param->efi_memmap);
-	if (status != EFI_SUCCESS) {
-		printk(KERN_WARNING "warning: unable to switch EFI into "
-		       "virtual mode (status=%lu)\n", status);
-		return;
-	}
-
-	set_bit(EFI_RUNTIME_SERVICES, &efi.flags);
-
-	/*
-	 * Now that EFI is in virtual mode, we call the EFI functions more
-	 * efficiently:
-	 */
-	efi.get_time = virt_get_time;
-	efi.set_time = virt_set_time;
-	efi.get_wakeup_time = virt_get_wakeup_time;
-	efi.set_wakeup_time = virt_set_wakeup_time;
-	efi.get_variable = virt_get_variable;
-	efi.get_next_variable = virt_get_next_variable;
-	efi.set_variable = virt_set_variable;
-	efi.get_next_high_mono_count = virt_get_next_high_mono_count;
-	efi.reset_system = virt_reset_system;
-}
-
-/*
- * Walk the EFI memory map looking for the I/O port range.  There can only be
- * one entry of this type, other I/O port ranges should be described via ACPI.
- */
-u64
-efi_get_iobase (void)
-{
-	void *efi_map_start, *efi_map_end, *p;
-	efi_memory_desc_t *md;
-	u64 efi_desc_size;
-
-	efi_map_start = __va(ia64_boot_param->efi_memmap);
-	efi_map_end   = efi_map_start + ia64_boot_param->efi_memmap_size;
-	efi_desc_size = ia64_boot_param->efi_memdesc_size;
-
-	for (p = efi_map_start; p < efi_map_end; p += efi_desc_size) {
-		md = p;
-		if (md->type == EFI_MEMORY_MAPPED_IO_PORT_SPACE) {
-			if (md->attribute & EFI_MEMORY_UC)
-				return md->phys_addr;
-		}
-	}
-	return 0;
-}
-
-static struct kern_memdesc *
-kern_memory_descriptor (unsigned long phys_addr)
-{
-	struct kern_memdesc *md;
-
-	for (md = kern_memmap; md->start != ~0UL; md++) {
-		if (phys_addr - md->start < (md->num_pages << EFI_PAGE_SHIFT))
-			 return md;
-	}
-	return NULL;
-}
-
-static efi_memory_desc_t *
-efi_memory_descriptor (unsigned long phys_addr)
-{
-	void *efi_map_start, *efi_map_end, *p;
-	efi_memory_desc_t *md;
-	u64 efi_desc_size;
-
-	efi_map_start = __va(ia64_boot_param->efi_memmap);
-	efi_map_end   = efi_map_start + ia64_boot_param->efi_memmap_size;
-	efi_desc_size = ia64_boot_param->efi_memdesc_size;
-
-	for (p = efi_map_start; p < efi_map_end; p += efi_desc_size) {
-		md = p;
-
-		if (phys_addr - md->phys_addr < efi_md_size(md))
-			 return md;
-	}
-	return NULL;
-}
-
-static int
-efi_memmap_intersects (unsigned long phys_addr, unsigned long size)
-{
-	void *efi_map_start, *efi_map_end, *p;
-	efi_memory_desc_t *md;
-	u64 efi_desc_size;
-	unsigned long end;
-
-	efi_map_start = __va(ia64_boot_param->efi_memmap);
-	efi_map_end   = efi_map_start + ia64_boot_param->efi_memmap_size;
-	efi_desc_size = ia64_boot_param->efi_memdesc_size;
-
-	end = phys_addr + size;
-
-	for (p = efi_map_start; p < efi_map_end; p += efi_desc_size) {
-		md = p;
-		if (md->phys_addr < end && efi_md_end(md) > phys_addr)
-			return 1;
-	}
-	return 0;
-}
-
-int
-efi_mem_type (unsigned long phys_addr)
-{
-	efi_memory_desc_t *md = efi_memory_descriptor(phys_addr);
-
-	if (md)
-		return md->type;
-	return -EINVAL;
-}
-
-u64
-efi_mem_attributes (unsigned long phys_addr)
-{
-	efi_memory_desc_t *md = efi_memory_descriptor(phys_addr);
-
-	if (md)
-		return md->attribute;
-	return 0;
-}
-EXPORT_SYMBOL(efi_mem_attributes);
-
-u64
-efi_mem_attribute (unsigned long phys_addr, unsigned long size)
-{
-	unsigned long end = phys_addr + size;
-	efi_memory_desc_t *md = efi_memory_descriptor(phys_addr);
-	u64 attr;
-
-	if (!md)
-		return 0;
-
-	/*
-	 * EFI_MEMORY_RUNTIME is not a memory attribute; it just tells
-	 * the kernel that firmware needs this region mapped.
-	 */
-	attr = md->attribute & ~EFI_MEMORY_RUNTIME;
-	do {
-		unsigned long md_end = efi_md_end(md);
-
-		if (end <= md_end)
-			return attr;
-
-		md = efi_memory_descriptor(md_end);
-		if (!md || (md->attribute & ~EFI_MEMORY_RUNTIME) != attr)
-			return 0;
-	} while (md);
-	return 0;	/* never reached */
-}
-
-u64
-kern_mem_attribute (unsigned long phys_addr, unsigned long size)
-{
-	unsigned long end = phys_addr + size;
-	struct kern_memdesc *md;
-	u64 attr;
-
-	/*
-	 * This is a hack for ioremap calls before we set up kern_memmap.
-	 * Maybe we should do efi_memmap_init() earlier instead.
-	 */
-	if (!kern_memmap) {
-		attr = efi_mem_attribute(phys_addr, size);
-		if (attr & EFI_MEMORY_WB)
-			return EFI_MEMORY_WB;
-		return 0;
-	}
-
-	md = kern_memory_descriptor(phys_addr);
-	if (!md)
-		return 0;
-
-	attr = md->attribute;
-	do {
-		unsigned long md_end = kmd_end(md);
-
-		if (end <= md_end)
-			return attr;
-
-		md = kern_memory_descriptor(md_end);
-		if (!md || md->attribute != attr)
-			return 0;
-	} while (md);
-	return 0;	/* never reached */
-}
-
-int
-valid_phys_addr_range (phys_addr_t phys_addr, unsigned long size)
-{
-	u64 attr;
-
-	/*
-	 * /dev/mem reads and writes use copy_to_user(), which implicitly
-	 * uses a granule-sized kernel identity mapping.  It's really
-	 * only safe to do this for regions in kern_memmap.  For more
-	 * details, see Documentation/arch/ia64/aliasing.rst.
-	 */
-	attr = kern_mem_attribute(phys_addr, size);
-	if (attr & EFI_MEMORY_WB || attr & EFI_MEMORY_UC)
-		return 1;
-	return 0;
-}
-
-int
-valid_mmap_phys_addr_range (unsigned long pfn, unsigned long size)
-{
-	unsigned long phys_addr = pfn << PAGE_SHIFT;
-	u64 attr;
-
-	attr = efi_mem_attribute(phys_addr, size);
-
-	/*
-	 * /dev/mem mmap uses normal user pages, so we don't need the entire
-	 * granule, but the entire region we're mapping must support the same
-	 * attribute.
-	 */
-	if (attr & EFI_MEMORY_WB || attr & EFI_MEMORY_UC)
-		return 1;
-
-	/*
-	 * Intel firmware doesn't tell us about all the MMIO regions, so
-	 * in general we have to allow mmap requests.  But if EFI *does*
-	 * tell us about anything inside this region, we should deny it.
-	 * The user can always map a smaller region to avoid the overlap.
-	 */
-	if (efi_memmap_intersects(phys_addr, size))
-		return 0;
-
-	return 1;
-}
-
-pgprot_t
-phys_mem_access_prot(struct file *file, unsigned long pfn, unsigned long size,
-		     pgprot_t vma_prot)
-{
-	unsigned long phys_addr = pfn << PAGE_SHIFT;
-	u64 attr;
-
-	/*
-	 * For /dev/mem mmap, we use user mappings, but if the region is
-	 * in kern_memmap (and hence may be covered by a kernel mapping),
-	 * we must use the same attribute as the kernel mapping.
-	 */
-	attr = kern_mem_attribute(phys_addr, size);
-	if (attr & EFI_MEMORY_WB)
-		return pgprot_cacheable(vma_prot);
-	else if (attr & EFI_MEMORY_UC)
-		return pgprot_noncached(vma_prot);
-
-	/*
-	 * Some chipsets don't support UC access to memory.  If
-	 * WB is supported, we prefer that.
-	 */
-	if (efi_mem_attribute(phys_addr, size) & EFI_MEMORY_WB)
-		return pgprot_cacheable(vma_prot);
-
-	return pgprot_noncached(vma_prot);
-}
-
-int __init
-efi_uart_console_only(void)
-{
-	efi_status_t status;
-	char *s, name[] = "ConOut";
-	efi_guid_t guid = EFI_GLOBAL_VARIABLE_GUID;
-	efi_char16_t *utf16, name_utf16[32];
-	unsigned char data[1024];
-	unsigned long size = sizeof(data);
-	struct efi_generic_dev_path *hdr, *end_addr;
-	int uart = 0;
-
-	/* Convert to UTF-16 */
-	utf16 = name_utf16;
-	s = name;
-	while (*s)
-		*utf16++ = *s++ & 0x7f;
-	*utf16 = 0;
-
-	status = efi.get_variable(name_utf16, &guid, NULL, &size, data);
-	if (status != EFI_SUCCESS) {
-		printk(KERN_ERR "No EFI %s variable?\n", name);
-		return 0;
-	}
-
-	hdr = (struct efi_generic_dev_path *) data;
-	end_addr = (struct efi_generic_dev_path *) ((u8 *) data + size);
-	while (hdr < end_addr) {
-		if (hdr->type == EFI_DEV_MSG &&
-		    hdr->sub_type == EFI_DEV_MSG_UART)
-			uart = 1;
-		else if (hdr->type == EFI_DEV_END_PATH ||
-			  hdr->type == EFI_DEV_END_PATH2) {
-			if (!uart)
-				return 0;
-			if (hdr->sub_type == EFI_DEV_END_ENTIRE)
-				return 1;
-			uart = 0;
-		}
-		hdr = (struct efi_generic_dev_path *)((u8 *) hdr + hdr->length);
-	}
-	printk(KERN_ERR "Malformed %s value\n", name);
-	return 0;
-}
-
-/*
- * Look for the first granule aligned memory descriptor memory
- * that is big enough to hold EFI memory map. Make sure this
- * descriptor is at least granule sized so it does not get trimmed
- */
-struct kern_memdesc *
-find_memmap_space (void)
-{
-	u64	contig_low=0, contig_high=0;
-	u64	as = 0, ae;
-	void *efi_map_start, *efi_map_end, *p, *q;
-	efi_memory_desc_t *md, *pmd = NULL, *check_md;
-	u64	space_needed, efi_desc_size;
-	unsigned long total_mem = 0;
-
-	efi_map_start = __va(ia64_boot_param->efi_memmap);
-	efi_map_end   = efi_map_start + ia64_boot_param->efi_memmap_size;
-	efi_desc_size = ia64_boot_param->efi_memdesc_size;
-
-	/*
-	 * Worst case: we need 3 kernel descriptors for each efi descriptor
-	 * (if every entry has a WB part in the middle, and UC head and tail),
-	 * plus one for the end marker.
-	 */
-	space_needed = sizeof(kern_memdesc_t) *
-		(3 * (ia64_boot_param->efi_memmap_size/efi_desc_size) + 1);
-
-	for (p = efi_map_start; p < efi_map_end; pmd = md, p += efi_desc_size) {
-		md = p;
-		if (!efi_wb(md)) {
-			continue;
-		}
-		if (pmd == NULL || !efi_wb(pmd) ||
-		    efi_md_end(pmd) != md->phys_addr) {
-			contig_low = GRANULEROUNDUP(md->phys_addr);
-			contig_high = efi_md_end(md);
-			for (q = p + efi_desc_size; q < efi_map_end;
-			     q += efi_desc_size) {
-				check_md = q;
-				if (!efi_wb(check_md))
-					break;
-				if (contig_high != check_md->phys_addr)
-					break;
-				contig_high = efi_md_end(check_md);
-			}
-			contig_high = GRANULEROUNDDOWN(contig_high);
-		}
-		if (!is_memory_available(md) || md->type == EFI_LOADER_DATA)
-			continue;
-
-		/* Round ends inward to granule boundaries */
-		as = max(contig_low, md->phys_addr);
-		ae = min(contig_high, efi_md_end(md));
-
-		/* keep within max_addr= and min_addr= command line arg */
-		as = max(as, min_addr);
-		ae = min(ae, max_addr);
-		if (ae <= as)
-			continue;
-
-		/* avoid going over mem= command line arg */
-		if (total_mem + (ae - as) > mem_limit)
-			ae -= total_mem + (ae - as) - mem_limit;
-
-		if (ae <= as)
-			continue;
-
-		if (ae - as > space_needed)
-			break;
-	}
-	if (p >= efi_map_end)
-		panic("Can't allocate space for kernel memory descriptors");
-
-	return __va(as);
-}
-
-/*
- * Walk the EFI memory map and gather all memory available for kernel
- * to use.  We can allocate partial granules only if the unavailable
- * parts exist, and are WB.
- */
-unsigned long
-efi_memmap_init(u64 *s, u64 *e)
-{
-	struct kern_memdesc *k, *prev = NULL;
-	u64	contig_low=0, contig_high=0;
-	u64	as, ae, lim;
-	void *efi_map_start, *efi_map_end, *p, *q;
-	efi_memory_desc_t *md, *pmd = NULL, *check_md;
-	u64	efi_desc_size;
-	unsigned long total_mem = 0;
-
-	k = kern_memmap = find_memmap_space();
-
-	efi_map_start = __va(ia64_boot_param->efi_memmap);
-	efi_map_end   = efi_map_start + ia64_boot_param->efi_memmap_size;
-	efi_desc_size = ia64_boot_param->efi_memdesc_size;
-
-	for (p = efi_map_start; p < efi_map_end; pmd = md, p += efi_desc_size) {
-		md = p;
-		if (!efi_wb(md)) {
-			if (efi_uc(md) &&
-			    (md->type == EFI_CONVENTIONAL_MEMORY ||
-			     md->type == EFI_BOOT_SERVICES_DATA)) {
-				k->attribute = EFI_MEMORY_UC;
-				k->start = md->phys_addr;
-				k->num_pages = md->num_pages;
-				k++;
-			}
-			continue;
-		}
-		if (pmd == NULL || !efi_wb(pmd) ||
-		    efi_md_end(pmd) != md->phys_addr) {
-			contig_low = GRANULEROUNDUP(md->phys_addr);
-			contig_high = efi_md_end(md);
-			for (q = p + efi_desc_size; q < efi_map_end;
-			     q += efi_desc_size) {
-				check_md = q;
-				if (!efi_wb(check_md))
-					break;
-				if (contig_high != check_md->phys_addr)
-					break;
-				contig_high = efi_md_end(check_md);
-			}
-			contig_high = GRANULEROUNDDOWN(contig_high);
-		}
-		if (!is_memory_available(md))
-			continue;
-
-		/*
-		 * Round ends inward to granule boundaries
-		 * Give trimmings to uncached allocator
-		 */
-		if (md->phys_addr < contig_low) {
-			lim = min(efi_md_end(md), contig_low);
-			if (efi_uc(md)) {
-				if (k > kern_memmap &&
-				    (k-1)->attribute == EFI_MEMORY_UC &&
-				    kmd_end(k-1) == md->phys_addr) {
-					(k-1)->num_pages +=
-						(lim - md->phys_addr)
-						>> EFI_PAGE_SHIFT;
-				} else {
-					k->attribute = EFI_MEMORY_UC;
-					k->start = md->phys_addr;
-					k->num_pages = (lim - md->phys_addr)
-						>> EFI_PAGE_SHIFT;
-					k++;
-				}
-			}
-			as = contig_low;
-		} else
-			as = md->phys_addr;
-
-		if (efi_md_end(md) > contig_high) {
-			lim = max(md->phys_addr, contig_high);
-			if (efi_uc(md)) {
-				if (lim == md->phys_addr && k > kern_memmap &&
-				    (k-1)->attribute == EFI_MEMORY_UC &&
-				    kmd_end(k-1) == md->phys_addr) {
-					(k-1)->num_pages += md->num_pages;
-				} else {
-					k->attribute = EFI_MEMORY_UC;
-					k->start = lim;
-					k->num_pages = (efi_md_end(md) - lim)
-						>> EFI_PAGE_SHIFT;
-					k++;
-				}
-			}
-			ae = contig_high;
-		} else
-			ae = efi_md_end(md);
-
-		/* keep within max_addr= and min_addr= command line arg */
-		as = max(as, min_addr);
-		ae = min(ae, max_addr);
-		if (ae <= as)
-			continue;
-
-		/* avoid going over mem= command line arg */
-		if (total_mem + (ae - as) > mem_limit)
-			ae -= total_mem + (ae - as) - mem_limit;
-
-		if (ae <= as)
-			continue;
-		if (prev && kmd_end(prev) == md->phys_addr) {
-			prev->num_pages += (ae - as) >> EFI_PAGE_SHIFT;
-			total_mem += ae - as;
-			continue;
-		}
-		k->attribute = EFI_MEMORY_WB;
-		k->start = as;
-		k->num_pages = (ae - as) >> EFI_PAGE_SHIFT;
-		total_mem += ae - as;
-		prev = k++;
-	}
-	k->start = ~0L; /* end-marker */
-
-	/* reserve the memory we are using for kern_memmap */
-	*s = (u64)kern_memmap;
-	*e = (u64)++k;
-
-	return total_mem;
-}
-
-void
-efi_initialize_iomem_resources(struct resource *code_resource,
-			       struct resource *data_resource,
-			       struct resource *bss_resource)
-{
-	struct resource *res;
-	void *efi_map_start, *efi_map_end, *p;
-	efi_memory_desc_t *md;
-	u64 efi_desc_size;
-	char *name;
-	unsigned long flags, desc;
-
-	efi_map_start = __va(ia64_boot_param->efi_memmap);
-	efi_map_end   = efi_map_start + ia64_boot_param->efi_memmap_size;
-	efi_desc_size = ia64_boot_param->efi_memdesc_size;
-
-	res = NULL;
-
-	for (p = efi_map_start; p < efi_map_end; p += efi_desc_size) {
-		md = p;
-
-		if (md->num_pages == 0) /* should not happen */
-			continue;
-
-		flags = IORESOURCE_MEM | IORESOURCE_BUSY;
-		desc = IORES_DESC_NONE;
-
-		switch (md->type) {
-
-			case EFI_MEMORY_MAPPED_IO:
-			case EFI_MEMORY_MAPPED_IO_PORT_SPACE:
-				continue;
-
-			case EFI_LOADER_CODE:
-			case EFI_LOADER_DATA:
-			case EFI_BOOT_SERVICES_DATA:
-			case EFI_BOOT_SERVICES_CODE:
-			case EFI_CONVENTIONAL_MEMORY:
-				if (md->attribute & EFI_MEMORY_WP) {
-					name = "System ROM";
-					flags |= IORESOURCE_READONLY;
-				} else if (md->attribute == EFI_MEMORY_UC) {
-					name = "Uncached RAM";
-				} else {
-					name = "System RAM";
-					flags |= IORESOURCE_SYSRAM;
-				}
-				break;
-
-			case EFI_ACPI_MEMORY_NVS:
-				name = "ACPI Non-volatile Storage";
-				desc = IORES_DESC_ACPI_NV_STORAGE;
-				break;
-
-			case EFI_UNUSABLE_MEMORY:
-				name = "reserved";
-				flags |= IORESOURCE_DISABLED;
-				break;
-
-			case EFI_PERSISTENT_MEMORY:
-				name = "Persistent Memory";
-				desc = IORES_DESC_PERSISTENT_MEMORY;
-				break;
-
-			case EFI_RESERVED_TYPE:
-			case EFI_RUNTIME_SERVICES_CODE:
-			case EFI_RUNTIME_SERVICES_DATA:
-			case EFI_ACPI_RECLAIM_MEMORY:
-			default:
-				name = "reserved";
-				break;
-		}
-
-		if ((res = kzalloc(sizeof(struct resource),
-				   GFP_KERNEL)) == NULL) {
-			printk(KERN_ERR
-			       "failed to allocate resource for iomem\n");
-			return;
-		}
-
-		res->name = name;
-		res->start = md->phys_addr;
-		res->end = md->phys_addr + efi_md_size(md) - 1;
-		res->flags = flags;
-		res->desc = desc;
-
-		if (insert_resource(&iomem_resource, res) < 0)
-			kfree(res);
-		else {
-			/*
-			 * We don't know which region contains
-			 * kernel data so we try it repeatedly and
-			 * let the resource manager test it.
-			 */
-			insert_resource(res, code_resource);
-			insert_resource(res, data_resource);
-			insert_resource(res, bss_resource);
-#ifdef CONFIG_KEXEC
-                        insert_resource(res, &efi_memmap_res);
-                        insert_resource(res, &boot_param_res);
-			if (crashk_res.end > crashk_res.start)
-				insert_resource(res, &crashk_res);
-#endif
-		}
-	}
-}
-
-#ifdef CONFIG_KEXEC
-/* find a block of memory aligned to 64M exclude reserved regions
-   rsvd_regions are sorted
- */
-unsigned long __init
-kdump_find_rsvd_region (unsigned long size, struct rsvd_region *r, int n)
-{
-	int i;
-	u64 start, end;
-	u64 alignment = 1UL << _PAGE_SIZE_64M;
-	void *efi_map_start, *efi_map_end, *p;
-	efi_memory_desc_t *md;
-	u64 efi_desc_size;
-
-	efi_map_start = __va(ia64_boot_param->efi_memmap);
-	efi_map_end   = efi_map_start + ia64_boot_param->efi_memmap_size;
-	efi_desc_size = ia64_boot_param->efi_memdesc_size;
-
-	for (p = efi_map_start; p < efi_map_end; p += efi_desc_size) {
-		md = p;
-		if (!efi_wb(md))
-			continue;
-		start = ALIGN(md->phys_addr, alignment);
-		end = efi_md_end(md);
-		for (i = 0; i < n; i++) {
-			if (__pa(r[i].start) >= start && __pa(r[i].end) < end) {
-				if (__pa(r[i].start) > start + size)
-					return start;
-				start = ALIGN(__pa(r[i].end), alignment);
-				if (i < n-1 &&
-				    __pa(r[i+1].start) < start + size)
-					continue;
-				else
-					break;
-			}
-		}
-		if (end > start + size)
-			return start;
-	}
-
-	printk(KERN_WARNING
-	       "Cannot reserve 0x%lx byte of memory for crashdump\n", size);
-	return ~0UL;
-}
-#endif
-
-#ifdef CONFIG_CRASH_DUMP
-/* locate the size find a the descriptor at a certain address */
-unsigned long __init
-vmcore_find_descriptor_size (unsigned long address)
-{
-	void *efi_map_start, *efi_map_end, *p;
-	efi_memory_desc_t *md;
-	u64 efi_desc_size;
-	unsigned long ret = 0;
-
-	efi_map_start = __va(ia64_boot_param->efi_memmap);
-	efi_map_end   = efi_map_start + ia64_boot_param->efi_memmap_size;
-	efi_desc_size = ia64_boot_param->efi_memdesc_size;
-
-	for (p = efi_map_start; p < efi_map_end; p += efi_desc_size) {
-		md = p;
-		if (efi_wb(md) && md->type == EFI_LOADER_DATA
-		    && md->phys_addr == address) {
-			ret = efi_md_size(md);
-			break;
-		}
-	}
-
-	if (ret == 0)
-		printk(KERN_WARNING "Cannot locate EFI vmcore descriptor\n");
-
-	return ret;
-}
-#endif
-
-char *efi_systab_show_arch(char *str)
-{
-	if (mps_phys != EFI_INVALID_TABLE_ADDR)
-		str += sprintf(str, "MPS=0x%lx\n", mps_phys);
-	if (hcdp_phys != EFI_INVALID_TABLE_ADDR)
-		str += sprintf(str, "HCDP=0x%lx\n", hcdp_phys);
-	return str;
-}
diff --git a/arch/ia64/kernel/efi_stub.S b/arch/ia64/kernel/efi_stub.S
deleted file mode 100644
index 1fd61b78fb29..000000000000
--- a/arch/ia64/kernel/efi_stub.S
+++ /dev/null
@@ -1,87 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-/*
- * EFI call stub.
- *
- * Copyright (C) 1999-2001 Hewlett-Packard Co
- *	David Mosberger <davidm@hpl.hp.com>
- *
- * This stub allows us to make EFI calls in physical mode with interrupts
- * turned off.  We need this because we can't call SetVirtualMap() until
- * the kernel has booted far enough to allow allocation of struct vm_area_struct
- * entries (which we would need to map stuff with memory attributes other
- * than uncached or writeback...).  Since the GetTime() service gets called
- * earlier than that, we need to be able to make physical mode EFI calls from
- * the kernel.
- */
-
-/*
- * PSR settings as per SAL spec (Chapter 8 in the "IA-64 System
- * Abstraction Layer Specification", revision 2.6e).  Note that
- * psr.dfl and psr.dfh MUST be cleared, despite what this manual says.
- * Otherwise, SAL dies whenever it's trying to do an IA-32 BIOS call
- * (the br.ia instruction fails unless psr.dfl and psr.dfh are
- * cleared).  Fortunately, SAL promises not to touch the floating
- * point regs, so at least we don't have to save f2-f127.
- */
-#define PSR_BITS_TO_CLEAR						\
-	(IA64_PSR_I | IA64_PSR_IT | IA64_PSR_DT | IA64_PSR_RT |		\
-	 IA64_PSR_DD | IA64_PSR_SS | IA64_PSR_RI | IA64_PSR_ED |	\
-	 IA64_PSR_DFL | IA64_PSR_DFH)
-
-#define PSR_BITS_TO_SET							\
-	(IA64_PSR_BN)
-
-#include <asm/processor.h>
-#include <asm/asmmacro.h>
-
-/*
- * Inputs:
- *	in0 = address of function descriptor of EFI routine to call
- *	in1..in7 = arguments to routine
- *
- * Outputs:
- *	r8 = EFI_STATUS returned by called function
- */
-
-GLOBAL_ENTRY(efi_call_phys)
-	.prologue ASM_UNW_PRLG_RP|ASM_UNW_PRLG_PFS, ASM_UNW_PRLG_GRSAVE(8)
-	alloc loc1=ar.pfs,8,7,7,0
-	ld8 r2=[in0],8			// load EFI function's entry point
-	mov loc0=rp
-	.body
-	;;
-	mov loc2=gp			// save global pointer
-	mov loc4=ar.rsc			// save RSE configuration
-	mov ar.rsc=0			// put RSE in enforced lazy, LE mode
-	;;
-	ld8 gp=[in0]			// load EFI function's global pointer
-	movl r16=PSR_BITS_TO_CLEAR
-	mov loc3=psr			// save processor status word
-	movl r17=PSR_BITS_TO_SET
-	;;
-	or loc3=loc3,r17
-	mov b6=r2
-	;;
-	andcm r16=loc3,r16	// get psr with IT, DT, and RT bits cleared
-	br.call.sptk.many rp=ia64_switch_mode_phys
-.ret0:	mov out4=in5
-	mov out0=in1
-	mov out1=in2
-	mov out2=in3
-	mov out3=in4
-	mov out5=in6
-	mov out6=in7
-	mov loc5=r19
-	mov loc6=r20
-	br.call.sptk.many rp=b6		// call the EFI function
-.ret1:	mov ar.rsc=0			// put RSE in enforced lazy, LE mode
-	mov r16=loc3
-	mov r19=loc5
-	mov r20=loc6
-	br.call.sptk.many rp=ia64_switch_mode_virt // return to virtual mode
-.ret2:	mov ar.rsc=loc4			// restore RSE configuration
-	mov ar.pfs=loc1
-	mov rp=loc0
-	mov gp=loc2
-	br.ret.sptk.many rp
-END(efi_call_phys)
diff --git a/arch/ia64/kernel/elfcore.c b/arch/ia64/kernel/elfcore.c
deleted file mode 100644
index 8895df121540..000000000000
--- a/arch/ia64/kernel/elfcore.c
+++ /dev/null
@@ -1,77 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-#include <linux/elf.h>
-#include <linux/coredump.h>
-#include <linux/fs.h>
-#include <linux/mm.h>
-
-#include <asm/elf.h>
-
-
-Elf64_Half elf_core_extra_phdrs(struct coredump_params *cprm)
-{
-	return GATE_EHDR->e_phnum;
-}
-
-int elf_core_write_extra_phdrs(struct coredump_params *cprm, loff_t offset)
-{
-	const struct elf_phdr *const gate_phdrs =
-		(const struct elf_phdr *) (GATE_ADDR + GATE_EHDR->e_phoff);
-	int i;
-	Elf64_Off ofs = 0;
-
-	for (i = 0; i < GATE_EHDR->e_phnum; ++i) {
-		struct elf_phdr phdr = gate_phdrs[i];
-
-		if (phdr.p_type == PT_LOAD) {
-			phdr.p_memsz = PAGE_ALIGN(phdr.p_memsz);
-			phdr.p_filesz = phdr.p_memsz;
-			if (ofs == 0) {
-				ofs = phdr.p_offset = offset;
-				offset += phdr.p_filesz;
-			} else {
-				phdr.p_offset = ofs;
-			}
-		} else {
-			phdr.p_offset += ofs;
-		}
-		phdr.p_paddr = 0; /* match other core phdrs */
-		if (!dump_emit(cprm, &phdr, sizeof(phdr)))
-			return 0;
-	}
-	return 1;
-}
-
-int elf_core_write_extra_data(struct coredump_params *cprm)
-{
-	const struct elf_phdr *const gate_phdrs =
-		(const struct elf_phdr *) (GATE_ADDR + GATE_EHDR->e_phoff);
-	int i;
-
-	for (i = 0; i < GATE_EHDR->e_phnum; ++i) {
-		if (gate_phdrs[i].p_type == PT_LOAD) {
-			void *addr = (void *)gate_phdrs[i].p_vaddr;
-			size_t memsz = PAGE_ALIGN(gate_phdrs[i].p_memsz);
-
-			if (!dump_emit(cprm, addr, memsz))
-				return 0;
-			break;
-		}
-	}
-	return 1;
-}
-
-size_t elf_core_extra_data_size(struct coredump_params *cprm)
-{
-	const struct elf_phdr *const gate_phdrs =
-		(const struct elf_phdr *) (GATE_ADDR + GATE_EHDR->e_phoff);
-	int i;
-	size_t size = 0;
-
-	for (i = 0; i < GATE_EHDR->e_phnum; ++i) {
-		if (gate_phdrs[i].p_type == PT_LOAD) {
-			size += PAGE_ALIGN(gate_phdrs[i].p_memsz);
-			break;
-		}
-	}
-	return size;
-}
diff --git a/arch/ia64/kernel/entry.S b/arch/ia64/kernel/entry.S
deleted file mode 100644
index ac06d44b9b27..000000000000
--- a/arch/ia64/kernel/entry.S
+++ /dev/null
@@ -1,1427 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-/*
- * arch/ia64/kernel/entry.S
- *
- * Kernel entry points.
- *
- * Copyright (C) 1998-2003, 2005 Hewlett-Packard Co
- *	David Mosberger-Tang <davidm@hpl.hp.com>
- * Copyright (C) 1999, 2002-2003
- *	Asit Mallick <Asit.K.Mallick@intel.com>
- * 	Don Dugger <Don.Dugger@intel.com>
- *	Suresh Siddha <suresh.b.siddha@intel.com>
- *	Fenghua Yu <fenghua.yu@intel.com>
- * Copyright (C) 1999 VA Linux Systems
- * Copyright (C) 1999 Walt Drummond <drummond@valinux.com>
- */
-/*
- * ia64_switch_to now places correct virtual mapping in in TR2 for
- * kernel stack. This allows us to handle interrupts without changing
- * to physical mode.
- *
- * Jonathan Nicklin	<nicklin@missioncriticallinux.com>
- * Patrick O'Rourke	<orourke@missioncriticallinux.com>
- * 11/07/2000
- */
-/*
- * Copyright (c) 2008 Isaku Yamahata <yamahata at valinux co jp>
- *                    VA Linux Systems Japan K.K.
- *                    pv_ops.
- */
-/*
- * Global (preserved) predicate usage on syscall entry/exit path:
- *
- *	pKStk:		See entry.h.
- *	pUStk:		See entry.h.
- *	pSys:		See entry.h.
- *	pNonSys:	!pSys
- */
-
-#include <linux/export.h>
-#include <linux/pgtable.h>
-#include <asm/asmmacro.h>
-#include <asm/cache.h>
-#include <asm/errno.h>
-#include <asm/kregs.h>
-#include <asm/asm-offsets.h>
-#include <asm/percpu.h>
-#include <asm/processor.h>
-#include <asm/thread_info.h>
-#include <asm/unistd.h>
-#include <asm/ftrace.h>
-
-#include "minstate.h"
-
-	/*
-	 * execve() is special because in case of success, we need to
-	 * setup a null register window frame.
-	 */
-ENTRY(ia64_execve)
-	/*
-	 * Allocate 8 input registers since ptrace() may clobber them
-	 */
-	.prologue ASM_UNW_PRLG_RP|ASM_UNW_PRLG_PFS, ASM_UNW_PRLG_GRSAVE(8)
-	alloc loc1=ar.pfs,8,2,3,0
-	mov loc0=rp
-	.body
-	mov out0=in0			// filename
-	;;				// stop bit between alloc and call
-	mov out1=in1			// argv
-	mov out2=in2			// envp
-	br.call.sptk.many rp=sys_execve
-.ret0:
-	cmp4.ge p6,p7=r8,r0
-	mov ar.pfs=loc1			// restore ar.pfs
-	sxt4 r8=r8			// return 64-bit result
-	;;
-	stf.spill [sp]=f0
-	mov rp=loc0
-(p6)	mov ar.pfs=r0			// clear ar.pfs on success
-(p7)	br.ret.sptk.many rp
-
-	/*
-	 * In theory, we'd have to zap this state only to prevent leaking of
-	 * security sensitive state (e.g., if current->mm->dumpable is zero).  However,
-	 * this executes in less than 20 cycles even on Itanium, so it's not worth
-	 * optimizing for...).
-	 */
-	mov ar.unat=0; 		mov ar.lc=0
-	mov r4=0;		mov f2=f0;		mov b1=r0
-	mov r5=0;		mov f3=f0;		mov b2=r0
-	mov r6=0;		mov f4=f0;		mov b3=r0
-	mov r7=0;		mov f5=f0;		mov b4=r0
-	ldf.fill f12=[sp];	mov f13=f0;		mov b5=r0
-	ldf.fill f14=[sp];	ldf.fill f15=[sp];	mov f16=f0
-	ldf.fill f17=[sp];	ldf.fill f18=[sp];	mov f19=f0
-	ldf.fill f20=[sp];	ldf.fill f21=[sp];	mov f22=f0
-	ldf.fill f23=[sp];	ldf.fill f24=[sp];	mov f25=f0
-	ldf.fill f26=[sp];	ldf.fill f27=[sp];	mov f28=f0
-	ldf.fill f29=[sp];	ldf.fill f30=[sp];	mov f31=f0
-	br.ret.sptk.many rp
-END(ia64_execve)
-
-/*
- * sys_clone2(u64 flags, u64 ustack_base, u64 ustack_size, u64 parent_tidptr, u64 child_tidptr,
- *	      u64 tls)
- */
-GLOBAL_ENTRY(sys_clone2)
-	/*
-	 * Allocate 8 input registers since ptrace() may clobber them
-	 */
-	.prologue ASM_UNW_PRLG_RP|ASM_UNW_PRLG_PFS, ASM_UNW_PRLG_GRSAVE(8)
-	alloc r16=ar.pfs,8,2,6,0
-	DO_SAVE_SWITCH_STACK
-	mov loc0=rp
-	mov loc1=r16                             // save ar.pfs across ia64_clone
-	.body
-	mov out0=in0
-	mov out1=in1
-	mov out2=in2
-	mov out3=in3
-	mov out4=in4
-	mov out5=in5
-	br.call.sptk.many rp=ia64_clone
-.ret1:	.restore sp
-	adds sp=IA64_SWITCH_STACK_SIZE,sp	// pop the switch stack
-	mov ar.pfs=loc1
-	mov rp=loc0
-	br.ret.sptk.many rp
-END(sys_clone2)
-
-/*
- * sys_clone(u64 flags, u64 ustack_base, u64 parent_tidptr, u64 child_tidptr, u64 tls)
- *	Deprecated.  Use sys_clone2() instead.
- */
-GLOBAL_ENTRY(sys_clone)
-	/*
-	 * Allocate 8 input registers since ptrace() may clobber them
-	 */
-	.prologue ASM_UNW_PRLG_RP|ASM_UNW_PRLG_PFS, ASM_UNW_PRLG_GRSAVE(8)
-	alloc r16=ar.pfs,8,2,6,0
-	DO_SAVE_SWITCH_STACK
-	mov loc0=rp
-	mov loc1=r16                             // save ar.pfs across ia64_clone
-	.body
-	mov out0=in0
-	mov out1=in1
-	mov out2=16				// stacksize (compensates for 16-byte scratch area)
-	mov out3=in3
-	mov out4=in4
-	mov out5=in5
-	br.call.sptk.many rp=ia64_clone
-.ret2:	.restore sp
-	adds sp=IA64_SWITCH_STACK_SIZE,sp	// pop the switch stack
-	mov ar.pfs=loc1
-	mov rp=loc0
-	br.ret.sptk.many rp
-END(sys_clone)
-
-/*
- * prev_task <- ia64_switch_to(struct task_struct *next)
- *	With Ingo's new scheduler, interrupts are disabled when this routine gets
- *	called.  The code starting at .map relies on this.  The rest of the code
- *	doesn't care about the interrupt masking status.
- */
-GLOBAL_ENTRY(ia64_switch_to)
-	.prologue
-	alloc r16=ar.pfs,1,0,0,0
-	DO_SAVE_SWITCH_STACK
-	.body
-
-	adds r22=IA64_TASK_THREAD_KSP_OFFSET,r13
-	movl r25=init_task
-	mov r27=IA64_KR(CURRENT_STACK)
-	adds r21=IA64_TASK_THREAD_KSP_OFFSET,in0
-	dep r20=0,in0,61,3		// physical address of "next"
-	;;
-	st8 [r22]=sp			// save kernel stack pointer of old task
-	shr.u r26=r20,IA64_GRANULE_SHIFT
-	cmp.eq p7,p6=r25,in0
-	;;
-	/*
-	 * If we've already mapped this task's page, we can skip doing it again.
-	 */
-(p6)	cmp.eq p7,p6=r26,r27
-(p6)	br.cond.dpnt .map
-	;;
-.done:
-	ld8 sp=[r21]			// load kernel stack pointer of new task
-	MOV_TO_KR(CURRENT, in0, r8, r9)		// update "current" application register
-	mov r8=r13			// return pointer to previously running task
-	mov r13=in0			// set "current" pointer
-	;;
-	DO_LOAD_SWITCH_STACK
-
-#ifdef CONFIG_SMP
-	sync.i				// ensure "fc"s done by this CPU are visible on other CPUs
-#endif
-	br.ret.sptk.many rp		// boogie on out in new context
-
-.map:
-	RSM_PSR_IC(r25)			// interrupts (psr.i) are already disabled here
-	movl r25=PAGE_KERNEL
-	;;
-	srlz.d
-	or r23=r25,r20			// construct PA | page properties
-	mov r25=IA64_GRANULE_SHIFT<<2
-	;;
-	MOV_TO_ITIR(p0, r25, r8)
-	MOV_TO_IFA(in0, r8)		// VA of next task...
-	;;
-	mov r25=IA64_TR_CURRENT_STACK
-	MOV_TO_KR(CURRENT_STACK, r26, r8, r9)	// remember last page we mapped...
-	;;
-	itr.d dtr[r25]=r23		// wire in new mapping...
-	SSM_PSR_IC_AND_SRLZ_D(r8, r9)	// reenable the psr.ic bit
-	br.cond.sptk .done
-END(ia64_switch_to)
-
-/*
- * Note that interrupts are enabled during save_switch_stack and load_switch_stack.  This
- * means that we may get an interrupt with "sp" pointing to the new kernel stack while
- * ar.bspstore is still pointing to the old kernel backing store area.  Since ar.rsc,
- * ar.rnat, ar.bsp, and ar.bspstore are all preserved by interrupts, this is not a
- * problem.  Also, we don't need to specify unwind information for preserved registers
- * that are not modified in save_switch_stack as the right unwind information is already
- * specified at the call-site of save_switch_stack.
- */
-
-/*
- * save_switch_stack:
- *	- r16 holds ar.pfs
- *	- b7 holds address to return to
- *	- rp (b0) holds return address to save
- */
-GLOBAL_ENTRY(save_switch_stack)
-	.prologue
-	.altrp b7
-	flushrs			// flush dirty regs to backing store (must be first in insn group)
-	.save @priunat,r17
-	mov r17=ar.unat		// preserve caller's
-	.body
-#ifdef CONFIG_ITANIUM
-	adds r2=16+128,sp
-	adds r3=16+64,sp
-	adds r14=SW(R4)+16,sp
-	;;
-	st8.spill [r14]=r4,16		// spill r4
-	lfetch.fault.excl.nt1 [r3],128
-	;;
-	lfetch.fault.excl.nt1 [r2],128
-	lfetch.fault.excl.nt1 [r3],128
-	;;
-	lfetch.fault.excl [r2]
-	lfetch.fault.excl [r3]
-	adds r15=SW(R5)+16,sp
-#else
-	add r2=16+3*128,sp
-	add r3=16,sp
-	add r14=SW(R4)+16,sp
-	;;
-	st8.spill [r14]=r4,SW(R6)-SW(R4)	// spill r4 and prefetch offset 0x1c0
-	lfetch.fault.excl.nt1 [r3],128	//		prefetch offset 0x010
-	;;
-	lfetch.fault.excl.nt1 [r3],128	//		prefetch offset 0x090
-	lfetch.fault.excl.nt1 [r2],128	//		prefetch offset 0x190
-	;;
-	lfetch.fault.excl.nt1 [r3]	//		prefetch offset 0x110
-	lfetch.fault.excl.nt1 [r2]	//		prefetch offset 0x210
-	adds r15=SW(R5)+16,sp
-#endif
-	;;
-	st8.spill [r15]=r5,SW(R7)-SW(R5)	// spill r5
-	mov.m ar.rsc=0			// put RSE in mode: enforced lazy, little endian, pl 0
-	add r2=SW(F2)+16,sp		// r2 = &sw->f2
-	;;
-	st8.spill [r14]=r6,SW(B0)-SW(R6)	// spill r6
-	mov.m r18=ar.fpsr		// preserve fpsr
-	add r3=SW(F3)+16,sp		// r3 = &sw->f3
-	;;
-	stf.spill [r2]=f2,32
-	mov.m r19=ar.rnat
-	mov r21=b0
-
-	stf.spill [r3]=f3,32
-	st8.spill [r15]=r7,SW(B2)-SW(R7)	// spill r7
-	mov r22=b1
-	;;
-	// since we're done with the spills, read and save ar.unat:
-	mov.m r29=ar.unat
-	mov.m r20=ar.bspstore
-	mov r23=b2
-	stf.spill [r2]=f4,32
-	stf.spill [r3]=f5,32
-	mov r24=b3
-	;;
-	st8 [r14]=r21,SW(B1)-SW(B0)		// save b0
-	st8 [r15]=r23,SW(B3)-SW(B2)		// save b2
-	mov r25=b4
-	mov r26=b5
-	;;
-	st8 [r14]=r22,SW(B4)-SW(B1)		// save b1
-	st8 [r15]=r24,SW(AR_PFS)-SW(B3)		// save b3
-	mov r21=ar.lc		// I-unit
-	stf.spill [r2]=f12,32
-	stf.spill [r3]=f13,32
-	;;
-	st8 [r14]=r25,SW(B5)-SW(B4)		// save b4
-	st8 [r15]=r16,SW(AR_LC)-SW(AR_PFS)	// save ar.pfs
-	stf.spill [r2]=f14,32
-	stf.spill [r3]=f15,32
-	;;
-	st8 [r14]=r26				// save b5
-	st8 [r15]=r21				// save ar.lc
-	stf.spill [r2]=f16,32
-	stf.spill [r3]=f17,32
-	;;
-	stf.spill [r2]=f18,32
-	stf.spill [r3]=f19,32
-	;;
-	stf.spill [r2]=f20,32
-	stf.spill [r3]=f21,32
-	;;
-	stf.spill [r2]=f22,32
-	stf.spill [r3]=f23,32
-	;;
-	stf.spill [r2]=f24,32
-	stf.spill [r3]=f25,32
-	;;
-	stf.spill [r2]=f26,32
-	stf.spill [r3]=f27,32
-	;;
-	stf.spill [r2]=f28,32
-	stf.spill [r3]=f29,32
-	;;
-	stf.spill [r2]=f30,SW(AR_UNAT)-SW(F30)
-	stf.spill [r3]=f31,SW(PR)-SW(F31)
-	add r14=SW(CALLER_UNAT)+16,sp
-	;;
-	st8 [r2]=r29,SW(AR_RNAT)-SW(AR_UNAT)	// save ar.unat
-	st8 [r14]=r17,SW(AR_FPSR)-SW(CALLER_UNAT) // save caller_unat
-	mov r21=pr
-	;;
-	st8 [r2]=r19,SW(AR_BSPSTORE)-SW(AR_RNAT) // save ar.rnat
-	st8 [r3]=r21				// save predicate registers
-	;;
-	st8 [r2]=r20				// save ar.bspstore
-	st8 [r14]=r18				// save fpsr
-	mov ar.rsc=3		// put RSE back into eager mode, pl 0
-	br.cond.sptk.many b7
-END(save_switch_stack)
-
-/*
- * load_switch_stack:
- *	- "invala" MUST be done at call site (normally in DO_LOAD_SWITCH_STACK)
- *	- b7 holds address to return to
- *	- must not touch r8-r11
- */
-GLOBAL_ENTRY(load_switch_stack)
-	.prologue
-	.altrp b7
-
-	.body
-	lfetch.fault.nt1 [sp]
-	adds r2=SW(AR_BSPSTORE)+16,sp
-	adds r3=SW(AR_UNAT)+16,sp
-	mov ar.rsc=0						// put RSE into enforced lazy mode
-	adds r14=SW(CALLER_UNAT)+16,sp
-	adds r15=SW(AR_FPSR)+16,sp
-	;;
-	ld8 r27=[r2],(SW(B0)-SW(AR_BSPSTORE))	// bspstore
-	ld8 r29=[r3],(SW(B1)-SW(AR_UNAT))	// unat
-	;;
-	ld8 r21=[r2],16		// restore b0
-	ld8 r22=[r3],16		// restore b1
-	;;
-	ld8 r23=[r2],16		// restore b2
-	ld8 r24=[r3],16		// restore b3
-	;;
-	ld8 r25=[r2],16		// restore b4
-	ld8 r26=[r3],16		// restore b5
-	;;
-	ld8 r16=[r2],(SW(PR)-SW(AR_PFS))	// ar.pfs
-	ld8 r17=[r3],(SW(AR_RNAT)-SW(AR_LC))	// ar.lc
-	;;
-	ld8 r28=[r2]		// restore pr
-	ld8 r30=[r3]		// restore rnat
-	;;
-	ld8 r18=[r14],16	// restore caller's unat
-	ld8 r19=[r15],24	// restore fpsr
-	;;
-	ldf.fill f2=[r14],32
-	ldf.fill f3=[r15],32
-	;;
-	ldf.fill f4=[r14],32
-	ldf.fill f5=[r15],32
-	;;
-	ldf.fill f12=[r14],32
-	ldf.fill f13=[r15],32
-	;;
-	ldf.fill f14=[r14],32
-	ldf.fill f15=[r15],32
-	;;
-	ldf.fill f16=[r14],32
-	ldf.fill f17=[r15],32
-	;;
-	ldf.fill f18=[r14],32
-	ldf.fill f19=[r15],32
-	mov b0=r21
-	;;
-	ldf.fill f20=[r14],32
-	ldf.fill f21=[r15],32
-	mov b1=r22
-	;;
-	ldf.fill f22=[r14],32
-	ldf.fill f23=[r15],32
-	mov b2=r23
-	;;
-	mov ar.bspstore=r27
-	mov ar.unat=r29		// establish unat holding the NaT bits for r4-r7
-	mov b3=r24
-	;;
-	ldf.fill f24=[r14],32
-	ldf.fill f25=[r15],32
-	mov b4=r25
-	;;
-	ldf.fill f26=[r14],32
-	ldf.fill f27=[r15],32
-	mov b5=r26
-	;;
-	ldf.fill f28=[r14],32
-	ldf.fill f29=[r15],32
-	mov ar.pfs=r16
-	;;
-	ldf.fill f30=[r14],32
-	ldf.fill f31=[r15],24
-	mov ar.lc=r17
-	;;
-	ld8.fill r4=[r14],16
-	ld8.fill r5=[r15],16
-	mov pr=r28,-1
-	;;
-	ld8.fill r6=[r14],16
-	ld8.fill r7=[r15],16
-
-	mov ar.unat=r18				// restore caller's unat
-	mov ar.rnat=r30				// must restore after bspstore but before rsc!
-	mov ar.fpsr=r19				// restore fpsr
-	mov ar.rsc=3				// put RSE back into eager mode, pl 0
-	br.cond.sptk.many b7
-END(load_switch_stack)
-
-	/*
-	 * Invoke a system call, but do some tracing before and after the call.
-	 * We MUST preserve the current register frame throughout this routine
-	 * because some system calls (such as ia64_execve) directly
-	 * manipulate ar.pfs.
-	 */
-GLOBAL_ENTRY(ia64_trace_syscall)
-	PT_REGS_UNWIND_INFO(0)
-	/*
-	 * We need to preserve the scratch registers f6-f11 in case the system
-	 * call is sigreturn.
-	 */
-	adds r16=PT(F6)+16,sp
-	adds r17=PT(F7)+16,sp
-	;;
- 	stf.spill [r16]=f6,32
- 	stf.spill [r17]=f7,32
-	;;
- 	stf.spill [r16]=f8,32
- 	stf.spill [r17]=f9,32
-	;;
- 	stf.spill [r16]=f10
- 	stf.spill [r17]=f11
-	br.call.sptk.many rp=syscall_trace_enter // give parent a chance to catch syscall args
-	cmp.lt p6,p0=r8,r0			// check tracehook
-	adds r2=PT(R8)+16,sp			// r2 = &pt_regs.r8
-	adds r3=PT(R10)+16,sp			// r3 = &pt_regs.r10
-	mov r10=0
-(p6)	br.cond.sptk strace_error		// syscall failed ->
-	adds r16=PT(F6)+16,sp
-	adds r17=PT(F7)+16,sp
-	;;
-	ldf.fill f6=[r16],32
-	ldf.fill f7=[r17],32
-	;;
-	ldf.fill f8=[r16],32
-	ldf.fill f9=[r17],32
-	;;
-	ldf.fill f10=[r16]
-	ldf.fill f11=[r17]
-	// the syscall number may have changed, so re-load it and re-calculate the
-	// syscall entry-point:
-	adds r15=PT(R15)+16,sp			// r15 = &pt_regs.r15 (syscall #)
-	;;
-	ld8 r15=[r15]
-	mov r3=NR_syscalls - 1
-	;;
-	adds r15=-1024,r15
-	movl r16=sys_call_table
-	;;
-	shladd r20=r15,3,r16			// r20 = sys_call_table + 8*(syscall-1024)
-	cmp.leu p6,p7=r15,r3
-	;;
-(p6)	ld8 r20=[r20]				// load address of syscall entry point
-(p7)	movl r20=sys_ni_syscall
-	;;
-	mov b6=r20
-	br.call.sptk.many rp=b6			// do the syscall
-.strace_check_retval:
-	cmp.lt p6,p0=r8,r0			// syscall failed?
-	adds r2=PT(R8)+16,sp			// r2 = &pt_regs.r8
-	adds r3=PT(R10)+16,sp			// r3 = &pt_regs.r10
-	mov r10=0
-(p6)	br.cond.sptk strace_error		// syscall failed ->
-	;;					// avoid RAW on r10
-.strace_save_retval:
-.mem.offset 0,0; st8.spill [r2]=r8		// store return value in slot for r8
-.mem.offset 8,0; st8.spill [r3]=r10		// clear error indication in slot for r10
-	br.call.sptk.many rp=syscall_trace_leave // give parent a chance to catch return value
-.ret3:
-(pUStk)	cmp.eq.unc p6,p0=r0,r0			// p6 <- pUStk
-(pUStk)	rsm psr.i				// disable interrupts
-	br.cond.sptk ia64_work_pending_syscall_end
-
-strace_error:
-	ld8 r3=[r2]				// load pt_regs.r8
-	sub r9=0,r8				// negate return value to get errno value
-	;;
-	cmp.ne p6,p0=r3,r0			// is pt_regs.r8!=0?
-	adds r3=16,r2				// r3=&pt_regs.r10
-	;;
-(p6)	mov r10=-1
-(p6)	mov r8=r9
-	br.cond.sptk .strace_save_retval
-END(ia64_trace_syscall)
-
-	/*
-	 * When traced and returning from sigreturn, we invoke syscall_trace but then
-	 * go straight to ia64_leave_kernel rather than ia64_leave_syscall.
-	 */
-GLOBAL_ENTRY(ia64_strace_leave_kernel)
-	PT_REGS_UNWIND_INFO(0)
-{	/*
-	 * Some versions of gas generate bad unwind info if the first instruction of a
-	 * procedure doesn't go into the first slot of a bundle.  This is a workaround.
-	 */
-	nop.m 0
-	nop.i 0
-	br.call.sptk.many rp=syscall_trace_leave // give parent a chance to catch return value
-}
-.ret4:	br.cond.sptk ia64_leave_kernel
-END(ia64_strace_leave_kernel)
-
-ENTRY(call_payload)
-	.prologue ASM_UNW_PRLG_RP|ASM_UNW_PRLG_PFS, ASM_UNW_PRLG_GRSAVE(0)
-	/* call the kernel_thread payload; fn is in r4, arg - in r5 */
-	alloc loc1=ar.pfs,0,3,1,0
-	mov loc0=rp
-	mov loc2=gp
-	mov out0=r5		// arg
-	ld8 r14 = [r4], 8	// fn.address
-	;;
-	mov b6 = r14
-	ld8 gp = [r4]		// fn.gp
-	;;
-	br.call.sptk.many rp=b6	// fn(arg)
-.ret12:	mov gp=loc2
-	mov rp=loc0
-	mov ar.pfs=loc1
-	/* ... and if it has returned, we are going to userland */
-	cmp.ne pKStk,pUStk=r0,r0
-	br.ret.sptk.many rp
-END(call_payload)
-
-GLOBAL_ENTRY(ia64_ret_from_clone)
-	PT_REGS_UNWIND_INFO(0)
-{	/*
-	 * Some versions of gas generate bad unwind info if the first instruction of a
-	 * procedure doesn't go into the first slot of a bundle.  This is a workaround.
-	 */
-	nop.m 0
-	nop.i 0
-	/*
-	 * We need to call schedule_tail() to complete the scheduling process.
-	 * Called by ia64_switch_to() after ia64_clone()->copy_thread().  r8 contains the
-	 * address of the previously executing task.
-	 */
-	br.call.sptk.many rp=ia64_invoke_schedule_tail
-}
-.ret8:
-(pKStk)	br.call.sptk.many rp=call_payload
-	adds r2=TI_FLAGS+IA64_TASK_SIZE,r13
-	;;
-	ld4 r2=[r2]
-	;;
-	mov r8=0
-	and r2=_TIF_SYSCALL_TRACEAUDIT,r2
-	;;
-	cmp.ne p6,p0=r2,r0
-(p6)	br.cond.spnt .strace_check_retval
-	;;					// added stop bits to prevent r8 dependency
-END(ia64_ret_from_clone)
-	// fall through
-GLOBAL_ENTRY(ia64_ret_from_syscall)
-	PT_REGS_UNWIND_INFO(0)
-	cmp.ge p6,p7=r8,r0			// syscall executed successfully?
-	adds r2=PT(R8)+16,sp			// r2 = &pt_regs.r8
-	mov r10=r0				// clear error indication in r10
-(p7)	br.cond.spnt handle_syscall_error	// handle potential syscall failure
-END(ia64_ret_from_syscall)
-	// fall through
-
-/*
- * ia64_leave_syscall(): Same as ia64_leave_kernel, except that it doesn't
- *	need to switch to bank 0 and doesn't restore the scratch registers.
- *	To avoid leaking kernel bits, the scratch registers are set to
- *	the following known-to-be-safe values:
- *
- *		  r1: restored (global pointer)
- *		  r2: cleared
- *		  r3: 1 (when returning to user-level)
- *	      r8-r11: restored (syscall return value(s))
- *		 r12: restored (user-level stack pointer)
- *		 r13: restored (user-level thread pointer)
- *		 r14: set to __kernel_syscall_via_epc
- *		 r15: restored (syscall #)
- *	     r16-r17: cleared
- *		 r18: user-level b6
- *		 r19: cleared
- *		 r20: user-level ar.fpsr
- *		 r21: user-level b0
- *		 r22: cleared
- *		 r23: user-level ar.bspstore
- *		 r24: user-level ar.rnat
- *		 r25: user-level ar.unat
- *		 r26: user-level ar.pfs
- *		 r27: user-level ar.rsc
- *		 r28: user-level ip
- *		 r29: user-level psr
- *		 r30: user-level cfm
- *		 r31: user-level pr
- *	      f6-f11: cleared
- *		  pr: restored (user-level pr)
- *		  b0: restored (user-level rp)
- *	          b6: restored
- *		  b7: set to __kernel_syscall_via_epc
- *	     ar.unat: restored (user-level ar.unat)
- *	      ar.pfs: restored (user-level ar.pfs)
- *	      ar.rsc: restored (user-level ar.rsc)
- *	     ar.rnat: restored (user-level ar.rnat)
- *	 ar.bspstore: restored (user-level ar.bspstore)
- *	     ar.fpsr: restored (user-level ar.fpsr)
- *	      ar.ccv: cleared
- *	      ar.csd: cleared
- *	      ar.ssd: cleared
- */
-GLOBAL_ENTRY(ia64_leave_syscall)
-	PT_REGS_UNWIND_INFO(0)
-	/*
-	 * work.need_resched etc. mustn't get changed by this CPU before it returns to
-	 * user- or fsys-mode, hence we disable interrupts early on.
-	 *
-	 * p6 controls whether current_thread_info()->flags needs to be check for
-	 * extra work.  We always check for extra work when returning to user-level.
-	 * With CONFIG_PREEMPTION, we also check for extra work when the preempt_count
-	 * is 0.  After extra work processing has been completed, execution
-	 * resumes at ia64_work_processed_syscall with p6 set to 1 if the extra-work-check
-	 * needs to be redone.
-	 */
-#ifdef CONFIG_PREEMPTION
-	RSM_PSR_I(p0, r2, r18)			// disable interrupts
-	cmp.eq pLvSys,p0=r0,r0			// pLvSys=1: leave from syscall
-(pKStk) adds r20=TI_PRE_COUNT+IA64_TASK_SIZE,r13
-	;;
-	.pred.rel.mutex pUStk,pKStk
-(pKStk) ld4 r21=[r20]			// r21 <- preempt_count
-(pUStk)	mov r21=0			// r21 <- 0
-	;;
-	cmp.eq p6,p0=r21,r0		// p6 <- pUStk || (preempt_count == 0)
-#else /* !CONFIG_PREEMPTION */
-	RSM_PSR_I(pUStk, r2, r18)
-	cmp.eq pLvSys,p0=r0,r0		// pLvSys=1: leave from syscall
-(pUStk)	cmp.eq.unc p6,p0=r0,r0		// p6 <- pUStk
-#endif
-.global ia64_work_processed_syscall;
-ia64_work_processed_syscall:
-#ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
-	adds r2=PT(LOADRS)+16,r12
-	MOV_FROM_ITC(pUStk, p9, r22, r19)	// fetch time at leave
-	adds r18=TI_FLAGS+IA64_TASK_SIZE,r13
-	;;
-(p6)	ld4 r31=[r18]				// load current_thread_info()->flags
-	ld8 r19=[r2],PT(B6)-PT(LOADRS)		// load ar.rsc value for "loadrs"
-	adds r3=PT(AR_BSPSTORE)+16,r12		// deferred
-	;;
-#else
-	adds r2=PT(LOADRS)+16,r12
-	adds r3=PT(AR_BSPSTORE)+16,r12
-	adds r18=TI_FLAGS+IA64_TASK_SIZE,r13
-	;;
-(p6)	ld4 r31=[r18]				// load current_thread_info()->flags
-	ld8 r19=[r2],PT(B6)-PT(LOADRS)		// load ar.rsc value for "loadrs"
-	nop.i 0
-	;;
-#endif
-	mov r16=ar.bsp				// M2  get existing backing store pointer
-	ld8 r18=[r2],PT(R9)-PT(B6)		// load b6
-(p6)	and r15=TIF_WORK_MASK,r31		// any work other than TIF_SYSCALL_TRACE?
-	;;
-	ld8 r23=[r3],PT(R11)-PT(AR_BSPSTORE)	// load ar.bspstore (may be garbage)
-(p6)	cmp4.ne.unc p6,p0=r15, r0		// any special work pending?
-(p6)	br.cond.spnt .work_pending_syscall
-	;;
-	// start restoring the state saved on the kernel stack (struct pt_regs):
-	ld8 r9=[r2],PT(CR_IPSR)-PT(R9)
-	ld8 r11=[r3],PT(CR_IIP)-PT(R11)
-(pNonSys) break 0		//      bug check: we shouldn't be here if pNonSys is TRUE!
-	;;
-	invala			// M0|1 invalidate ALAT
-	RSM_PSR_I_IC(r28, r29, r30)	// M2   turn off interrupts and interruption collection
-	cmp.eq p9,p0=r0,r0	// A    set p9 to indicate that we should restore cr.ifs
-
-	ld8 r29=[r2],16		// M0|1 load cr.ipsr
-	ld8 r28=[r3],16		// M0|1 load cr.iip
-#ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
-(pUStk) add r14=TI_AC_LEAVE+IA64_TASK_SIZE,r13
-	;;
-	ld8 r30=[r2],16		// M0|1 load cr.ifs
-	ld8 r25=[r3],16		// M0|1 load ar.unat
-(pUStk) add r15=IA64_TASK_THREAD_ON_USTACK_OFFSET,r13
-	;;
-#else
-	mov r22=r0		// A    clear r22
-	;;
-	ld8 r30=[r2],16		// M0|1 load cr.ifs
-	ld8 r25=[r3],16		// M0|1 load ar.unat
-(pUStk) add r14=IA64_TASK_THREAD_ON_USTACK_OFFSET,r13
-	;;
-#endif
-	ld8 r26=[r2],PT(B0)-PT(AR_PFS)	// M0|1 load ar.pfs
-	MOV_FROM_PSR(pKStk, r22, r21)	// M2   read PSR now that interrupts are disabled
-	nop 0
-	;;
-	ld8 r21=[r2],PT(AR_RNAT)-PT(B0) // M0|1 load b0
-	ld8 r27=[r3],PT(PR)-PT(AR_RSC)	// M0|1 load ar.rsc
-	mov f6=f0			// F    clear f6
-	;;
-	ld8 r24=[r2],PT(AR_FPSR)-PT(AR_RNAT)	// M0|1 load ar.rnat (may be garbage)
-	ld8 r31=[r3],PT(R1)-PT(PR)		// M0|1 load predicates
-	mov f7=f0				// F    clear f7
-	;;
-	ld8 r20=[r2],PT(R12)-PT(AR_FPSR)	// M0|1 load ar.fpsr
-	ld8.fill r1=[r3],16			// M0|1 load r1
-(pUStk) mov r17=1				// A
-	;;
-#ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
-(pUStk) st1 [r15]=r17				// M2|3
-#else
-(pUStk) st1 [r14]=r17				// M2|3
-#endif
-	ld8.fill r13=[r3],16			// M0|1
-	mov f8=f0				// F    clear f8
-	;;
-	ld8.fill r12=[r2]			// M0|1 restore r12 (sp)
-	ld8.fill r15=[r3]			// M0|1 restore r15
-	mov b6=r18				// I0   restore b6
-
-	LOAD_PHYS_STACK_REG_SIZE(r17)
-	mov f9=f0					// F    clear f9
-(pKStk) br.cond.dpnt.many skip_rbs_switch		// B
-
-	srlz.d				// M0   ensure interruption collection is off (for cover)
-	shr.u r18=r19,16		// I0|1 get byte size of existing "dirty" partition
-	COVER				// B    add current frame into dirty partition & set cr.ifs
-	;;
-#ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
-	mov r19=ar.bsp			// M2   get new backing store pointer
-	st8 [r14]=r22			// M	save time at leave
-	mov f10=f0			// F    clear f10
-
-	mov r22=r0			// A	clear r22
-	movl r14=__kernel_syscall_via_epc // X
-	;;
-#else
-	mov r19=ar.bsp			// M2   get new backing store pointer
-	mov f10=f0			// F    clear f10
-
-	nop.m 0
-	movl r14=__kernel_syscall_via_epc // X
-	;;
-#endif
-	mov.m ar.csd=r0			// M2   clear ar.csd
-	mov.m ar.ccv=r0			// M2   clear ar.ccv
-	mov b7=r14			// I0   clear b7 (hint with __kernel_syscall_via_epc)
-
-	mov.m ar.ssd=r0			// M2   clear ar.ssd
-	mov f11=f0			// F    clear f11
-	br.cond.sptk.many rbs_switch	// B
-END(ia64_leave_syscall)
-
-GLOBAL_ENTRY(ia64_leave_kernel)
-	PT_REGS_UNWIND_INFO(0)
-	/*
-	 * work.need_resched etc. mustn't get changed by this CPU before it returns to
-	 * user- or fsys-mode, hence we disable interrupts early on.
-	 *
-	 * p6 controls whether current_thread_info()->flags needs to be check for
-	 * extra work.  We always check for extra work when returning to user-level.
-	 * With CONFIG_PREEMPTION, we also check for extra work when the preempt_count
-	 * is 0.  After extra work processing has been completed, execution
-	 * resumes at .work_processed_syscall with p6 set to 1 if the extra-work-check
-	 * needs to be redone.
-	 */
-#ifdef CONFIG_PREEMPTION
-	RSM_PSR_I(p0, r17, r31)			// disable interrupts
-	cmp.eq p0,pLvSys=r0,r0			// pLvSys=0: leave from kernel
-(pKStk)	adds r20=TI_PRE_COUNT+IA64_TASK_SIZE,r13
-	;;
-	.pred.rel.mutex pUStk,pKStk
-(pKStk)	ld4 r21=[r20]			// r21 <- preempt_count
-(pUStk)	mov r21=0			// r21 <- 0
-	;;
-	cmp.eq p6,p0=r21,r0		// p6 <- pUStk || (preempt_count == 0)
-#else
-	RSM_PSR_I(pUStk, r17, r31)
-	cmp.eq p0,pLvSys=r0,r0		// pLvSys=0: leave from kernel
-(pUStk)	cmp.eq.unc p6,p0=r0,r0		// p6 <- pUStk
-#endif
-.work_processed_kernel:
-	adds r17=TI_FLAGS+IA64_TASK_SIZE,r13
-	;;
-(p6)	ld4 r31=[r17]				// load current_thread_info()->flags
-	adds r21=PT(PR)+16,r12
-	;;
-
-	lfetch [r21],PT(CR_IPSR)-PT(PR)
-	adds r2=PT(B6)+16,r12
-	adds r3=PT(R16)+16,r12
-	;;
-	lfetch [r21]
-	ld8 r28=[r2],8		// load b6
-	adds r29=PT(R24)+16,r12
-
-	ld8.fill r16=[r3],PT(AR_CSD)-PT(R16)
-	adds r30=PT(AR_CCV)+16,r12
-(p6)	and r19=TIF_WORK_MASK,r31		// any work other than TIF_SYSCALL_TRACE?
-	;;
-	ld8.fill r24=[r29]
-	ld8 r15=[r30]		// load ar.ccv
-(p6)	cmp4.ne.unc p6,p0=r19, r0		// any special work pending?
-	;;
-	ld8 r29=[r2],16		// load b7
-	ld8 r30=[r3],16		// load ar.csd
-(p6)	br.cond.spnt .work_pending
-	;;
-	ld8 r31=[r2],16		// load ar.ssd
-	ld8.fill r8=[r3],16
-	;;
-	ld8.fill r9=[r2],16
-	ld8.fill r10=[r3],PT(R17)-PT(R10)
-	;;
-	ld8.fill r11=[r2],PT(R18)-PT(R11)
-	ld8.fill r17=[r3],16
-	;;
-	ld8.fill r18=[r2],16
-	ld8.fill r19=[r3],16
-	;;
-	ld8.fill r20=[r2],16
-	ld8.fill r21=[r3],16
-	mov ar.csd=r30
-	mov ar.ssd=r31
-	;;
-	RSM_PSR_I_IC(r23, r22, r25)	// initiate turning off of interrupt and interruption collection
-	invala			// invalidate ALAT
-	;;
-	ld8.fill r22=[r2],24
-	ld8.fill r23=[r3],24
-	mov b6=r28
-	;;
-	ld8.fill r25=[r2],16
-	ld8.fill r26=[r3],16
-	mov b7=r29
-	;;
-	ld8.fill r27=[r2],16
-	ld8.fill r28=[r3],16
-	;;
-	ld8.fill r29=[r2],16
-	ld8.fill r30=[r3],24
-	;;
-	ld8.fill r31=[r2],PT(F9)-PT(R31)
-	adds r3=PT(F10)-PT(F6),r3
-	;;
-	ldf.fill f9=[r2],PT(F6)-PT(F9)
-	ldf.fill f10=[r3],PT(F8)-PT(F10)
-	;;
-	ldf.fill f6=[r2],PT(F7)-PT(F6)
-	;;
-	ldf.fill f7=[r2],PT(F11)-PT(F7)
-	ldf.fill f8=[r3],32
-	;;
-	srlz.d	// ensure that inter. collection is off (VHPT is don't care, since text is pinned)
-	mov ar.ccv=r15
-	;;
-	ldf.fill f11=[r2]
-	BSW_0(r2, r3, r15)	// switch back to bank 0 (no stop bit required beforehand...)
-	;;
-(pUStk)	mov r18=IA64_KR(CURRENT)// M2 (12 cycle read latency)
-	adds r16=PT(CR_IPSR)+16,r12
-	adds r17=PT(CR_IIP)+16,r12
-
-#ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
-	.pred.rel.mutex pUStk,pKStk
-	MOV_FROM_PSR(pKStk, r22, r29)	// M2 read PSR now that interrupts are disabled
-	MOV_FROM_ITC(pUStk, p9, r22, r29)	// M  fetch time at leave
-	nop.i 0
-	;;
-#else
-	MOV_FROM_PSR(pKStk, r22, r29)	// M2 read PSR now that interrupts are disabled
-	nop.i 0
-	nop.i 0
-	;;
-#endif
-	ld8 r29=[r16],16	// load cr.ipsr
-	ld8 r28=[r17],16	// load cr.iip
-	;;
-	ld8 r30=[r16],16	// load cr.ifs
-	ld8 r25=[r17],16	// load ar.unat
-	;;
-	ld8 r26=[r16],16	// load ar.pfs
-	ld8 r27=[r17],16	// load ar.rsc
-	cmp.eq p9,p0=r0,r0	// set p9 to indicate that we should restore cr.ifs
-	;;
-	ld8 r24=[r16],16	// load ar.rnat (may be garbage)
-	ld8 r23=[r17],16	// load ar.bspstore (may be garbage)
-	;;
-	ld8 r31=[r16],16	// load predicates
-	ld8 r21=[r17],16	// load b0
-	;;
-	ld8 r19=[r16],16	// load ar.rsc value for "loadrs"
-	ld8.fill r1=[r17],16	// load r1
-	;;
-	ld8.fill r12=[r16],16
-	ld8.fill r13=[r17],16
-#ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
-(pUStk)	adds r3=TI_AC_LEAVE+IA64_TASK_SIZE,r18
-#else
-(pUStk)	adds r18=IA64_TASK_THREAD_ON_USTACK_OFFSET,r18
-#endif
-	;;
-	ld8 r20=[r16],16	// ar.fpsr
-	ld8.fill r15=[r17],16
-#ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
-(pUStk)	adds r18=IA64_TASK_THREAD_ON_USTACK_OFFSET,r18	// deferred
-#endif
-	;;
-	ld8.fill r14=[r16],16
-	ld8.fill r2=[r17]
-(pUStk)	mov r17=1
-	;;
-#ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
-	//  mmi_ :  ld8 st1 shr;;         mmi_ : st8 st1 shr;;
-	//  mib  :  mov add br        ->  mib  : ld8 add br
-	//  bbb_ :  br  nop cover;;       mbb_ : mov br  cover;;
-	//
-	//  no one require bsp in r16 if (pKStk) branch is selected.
-(pUStk)	st8 [r3]=r22		// save time at leave
-(pUStk)	st1 [r18]=r17		// restore current->thread.on_ustack
-	shr.u r18=r19,16	// get byte size of existing "dirty" partition
-	;;
-	ld8.fill r3=[r16]	// deferred
-	LOAD_PHYS_STACK_REG_SIZE(r17)
-(pKStk)	br.cond.dpnt skip_rbs_switch
-	mov r16=ar.bsp		// get existing backing store pointer
-#else
-	ld8.fill r3=[r16]
-(pUStk)	st1 [r18]=r17		// restore current->thread.on_ustack
-	shr.u r18=r19,16	// get byte size of existing "dirty" partition
-	;;
-	mov r16=ar.bsp		// get existing backing store pointer
-	LOAD_PHYS_STACK_REG_SIZE(r17)
-(pKStk)	br.cond.dpnt skip_rbs_switch
-#endif
-
-	/*
-	 * Restore user backing store.
-	 *
-	 * NOTE: alloc, loadrs, and cover can't be predicated.
-	 */
-(pNonSys) br.cond.dpnt dont_preserve_current_frame
-	COVER				// add current frame into dirty partition and set cr.ifs
-	;;
-	mov r19=ar.bsp			// get new backing store pointer
-rbs_switch:
-	sub r16=r16,r18			// krbs = old bsp - size of dirty partition
-	cmp.ne p9,p0=r0,r0		// clear p9 to skip restore of cr.ifs
-	;;
-	sub r19=r19,r16			// calculate total byte size of dirty partition
-	add r18=64,r18			// don't force in0-in7 into memory...
-	;;
-	shl r19=r19,16			// shift size of dirty partition into loadrs position
-	;;
-dont_preserve_current_frame:
-	/*
-	 * To prevent leaking bits between the kernel and user-space,
-	 * we must clear the stacked registers in the "invalid" partition here.
-	 * Not pretty, but at least it's fast (3.34 registers/cycle on Itanium,
-	 * 5 registers/cycle on McKinley).
-	 */
-#	define pRecurse	p6
-#	define pReturn	p7
-#ifdef CONFIG_ITANIUM
-#	define Nregs	10
-#else
-#	define Nregs	14
-#endif
-	alloc loc0=ar.pfs,2,Nregs-2,2,0
-	shr.u loc1=r18,9		// RNaTslots <= floor(dirtySize / (64*8))
-	sub r17=r17,r18			// r17 = (physStackedSize + 8) - dirtySize
-	;;
-	mov ar.rsc=r19			// load ar.rsc to be used for "loadrs"
-	shladd in0=loc1,3,r17
-	mov in1=0
-	;;
-	TEXT_ALIGN(32)
-rse_clear_invalid:
-#ifdef CONFIG_ITANIUM
-	// cycle 0
- { .mii
-	alloc loc0=ar.pfs,2,Nregs-2,2,0
-	cmp.lt pRecurse,p0=Nregs*8,in0	// if more than Nregs regs left to clear, (re)curse
-	add out0=-Nregs*8,in0
-}{ .mfb
-	add out1=1,in1			// increment recursion count
-	nop.f 0
-	nop.b 0				// can't do br.call here because of alloc (WAW on CFM)
-	;;
-}{ .mfi	// cycle 1
-	mov loc1=0
-	nop.f 0
-	mov loc2=0
-}{ .mib
-	mov loc3=0
-	mov loc4=0
-(pRecurse) br.call.sptk.many b0=rse_clear_invalid
-
-}{ .mfi	// cycle 2
-	mov loc5=0
-	nop.f 0
-	cmp.ne pReturn,p0=r0,in1	// if recursion count != 0, we need to do a br.ret
-}{ .mib
-	mov loc6=0
-	mov loc7=0
-(pReturn) br.ret.sptk.many b0
-}
-#else /* !CONFIG_ITANIUM */
-	alloc loc0=ar.pfs,2,Nregs-2,2,0
-	cmp.lt pRecurse,p0=Nregs*8,in0	// if more than Nregs regs left to clear, (re)curse
-	add out0=-Nregs*8,in0
-	add out1=1,in1			// increment recursion count
-	mov loc1=0
-	mov loc2=0
-	;;
-	mov loc3=0
-	mov loc4=0
-	mov loc5=0
-	mov loc6=0
-	mov loc7=0
-(pRecurse) br.call.dptk.few b0=rse_clear_invalid
-	;;
-	mov loc8=0
-	mov loc9=0
-	cmp.ne pReturn,p0=r0,in1	// if recursion count != 0, we need to do a br.ret
-	mov loc10=0
-	mov loc11=0
-(pReturn) br.ret.dptk.many b0
-#endif /* !CONFIG_ITANIUM */
-#	undef pRecurse
-#	undef pReturn
-	;;
-	alloc r17=ar.pfs,0,0,0,0	// drop current register frame
-	;;
-	loadrs
-	;;
-skip_rbs_switch:
-	mov ar.unat=r25		// M2
-(pKStk)	extr.u r22=r22,21,1	// I0 extract current value of psr.pp from r22
-(pLvSys)mov r19=r0		// A  clear r19 for leave_syscall, no-op otherwise
-	;;
-(pUStk)	mov ar.bspstore=r23	// M2
-(pKStk)	dep r29=r22,r29,21,1	// I0 update ipsr.pp with psr.pp
-(pLvSys)mov r16=r0		// A  clear r16 for leave_syscall, no-op otherwise
-	;;
-	MOV_TO_IPSR(p0, r29, r25)	// M2
-	mov ar.pfs=r26		// I0
-(pLvSys)mov r17=r0		// A  clear r17 for leave_syscall, no-op otherwise
-
-	MOV_TO_IFS(p9, r30, r25)// M2
-	mov b0=r21		// I0
-(pLvSys)mov r18=r0		// A  clear r18 for leave_syscall, no-op otherwise
-
-	mov ar.fpsr=r20		// M2
-	MOV_TO_IIP(r28, r25)	// M2
-	nop 0
-	;;
-(pUStk)	mov ar.rnat=r24		// M2 must happen with RSE in lazy mode
-	nop 0
-(pLvSys)mov r2=r0
-
-	mov ar.rsc=r27		// M2
-	mov pr=r31,-1		// I0
-	RFI			// B
-
-	/*
-	 * On entry:
-	 *	r20 = &current->thread_info->pre_count (if CONFIG_PREEMPTION)
-	 *	r31 = current->thread_info->flags
-	 * On exit:
-	 *	p6 = TRUE if work-pending-check needs to be redone
-	 *
-	 * Interrupts are disabled on entry, reenabled depend on work, and
-	 * disabled on exit.
-	 */
-.work_pending_syscall:
-	add r2=-8,r2
-	add r3=-8,r3
-	;;
-	st8 [r2]=r8
-	st8 [r3]=r10
-.work_pending:
-	tbit.z p6,p0=r31,TIF_NEED_RESCHED	// is resched not needed?
-(p6)	br.cond.sptk.few .notify
-	br.call.spnt.many rp=preempt_schedule_irq
-.ret9:	cmp.eq p6,p0=r0,r0	// p6 <- 1 (re-check)
-(pLvSys)br.cond.sptk.few  ia64_work_pending_syscall_end
-	br.cond.sptk.many .work_processed_kernel
-
-.notify:
-(pUStk)	br.call.spnt.many rp=notify_resume_user
-.ret10:	cmp.ne p6,p0=r0,r0	// p6 <- 0 (don't re-check)
-(pLvSys)br.cond.sptk.few  ia64_work_pending_syscall_end
-	br.cond.sptk.many .work_processed_kernel
-
-.global ia64_work_pending_syscall_end;
-ia64_work_pending_syscall_end:
-	adds r2=PT(R8)+16,r12
-	adds r3=PT(R10)+16,r12
-	;;
-	ld8 r8=[r2]
-	ld8 r10=[r3]
-	br.cond.sptk.many ia64_work_processed_syscall
-END(ia64_leave_kernel)
-
-ENTRY(handle_syscall_error)
-	/*
-	 * Some system calls (e.g., ptrace, mmap) can return arbitrary values which could
-	 * lead us to mistake a negative return value as a failed syscall.  Those syscall
-	 * must deposit a non-zero value in pt_regs.r8 to indicate an error.  If
-	 * pt_regs.r8 is zero, we assume that the call completed successfully.
-	 */
-	PT_REGS_UNWIND_INFO(0)
-	ld8 r3=[r2]		// load pt_regs.r8
-	;;
-	cmp.eq p6,p7=r3,r0	// is pt_regs.r8==0?
-	;;
-(p7)	mov r10=-1
-(p7)	sub r8=0,r8		// negate return value to get errno
-	br.cond.sptk ia64_leave_syscall
-END(handle_syscall_error)
-
-	/*
-	 * Invoke schedule_tail(task) while preserving in0-in7, which may be needed
-	 * in case a system call gets restarted.
-	 */
-GLOBAL_ENTRY(ia64_invoke_schedule_tail)
-	.prologue ASM_UNW_PRLG_RP|ASM_UNW_PRLG_PFS, ASM_UNW_PRLG_GRSAVE(8)
-	alloc loc1=ar.pfs,8,2,1,0
-	mov loc0=rp
-	mov out0=r8				// Address of previous task
-	;;
-	br.call.sptk.many rp=schedule_tail
-.ret11:	mov ar.pfs=loc1
-	mov rp=loc0
-	br.ret.sptk.many rp
-END(ia64_invoke_schedule_tail)
-
-	/*
-	 * Setup stack and call do_notify_resume_user(), keeping interrupts
-	 * disabled.
-	 *
-	 * Note that pSys and pNonSys need to be set up by the caller.
-	 * We declare 8 input registers so the system call args get preserved,
-	 * in case we need to restart a system call.
-	 */
-GLOBAL_ENTRY(notify_resume_user)
-	.prologue ASM_UNW_PRLG_RP|ASM_UNW_PRLG_PFS, ASM_UNW_PRLG_GRSAVE(8)
-	alloc loc1=ar.pfs,8,2,3,0 // preserve all eight input regs in case of syscall restart!
-	mov r9=ar.unat
-	mov loc0=rp				// save return address
-	mov out0=0				// there is no "oldset"
-	adds out1=8,sp				// out1=&sigscratch->ar_pfs
-(pSys)	mov out2=1				// out2==1 => we're in a syscall
-	;;
-(pNonSys) mov out2=0				// out2==0 => not a syscall
-	.fframe 16
-	.spillsp ar.unat, 16
-	st8 [sp]=r9,-16				// allocate space for ar.unat and save it
-	st8 [out1]=loc1,-8			// save ar.pfs, out1=&sigscratch
-	.body
-	br.call.sptk.many rp=do_notify_resume_user
-.ret15:	.restore sp
-	adds sp=16,sp				// pop scratch stack space
-	;;
-	ld8 r9=[sp]				// load new unat from sigscratch->scratch_unat
-	mov rp=loc0
-	;;
-	mov ar.unat=r9
-	mov ar.pfs=loc1
-	br.ret.sptk.many rp
-END(notify_resume_user)
-
-ENTRY(sys_rt_sigreturn)
-	PT_REGS_UNWIND_INFO(0)
-	/*
-	 * Allocate 8 input registers since ptrace() may clobber them
-	 */
-	alloc r2=ar.pfs,8,0,1,0
-	.prologue
-	PT_REGS_SAVES(16)
-	adds sp=-16,sp
-	.body
-	cmp.eq pNonSys,pSys=r0,r0		// sigreturn isn't a normal syscall...
-	;;
-	/*
-	 * leave_kernel() restores f6-f11 from pt_regs, but since the streamlined
-	 * syscall-entry path does not save them we save them here instead.  Note: we
-	 * don't need to save any other registers that are not saved by the stream-lined
-	 * syscall path, because restore_sigcontext() restores them.
-	 */
-	adds r16=PT(F6)+32,sp
-	adds r17=PT(F7)+32,sp
-	;;
- 	stf.spill [r16]=f6,32
- 	stf.spill [r17]=f7,32
-	;;
- 	stf.spill [r16]=f8,32
- 	stf.spill [r17]=f9,32
-	;;
- 	stf.spill [r16]=f10
- 	stf.spill [r17]=f11
-	adds out0=16,sp				// out0 = &sigscratch
-	br.call.sptk.many rp=ia64_rt_sigreturn
-.ret19:	.restore sp,0
-	adds sp=16,sp
-	;;
-	ld8 r9=[sp]				// load new ar.unat
-	mov.sptk b7=r8,ia64_leave_kernel
-	;;
-	mov ar.unat=r9
-	br.many b7
-END(sys_rt_sigreturn)
-
-GLOBAL_ENTRY(ia64_prepare_handle_unaligned)
-	.prologue
-	/*
-	 * r16 = fake ar.pfs, we simply need to make sure privilege is still 0
-	 */
-	mov r16=r0
-	DO_SAVE_SWITCH_STACK
-	br.call.sptk.many rp=ia64_handle_unaligned	// stack frame setup in ivt
-.ret21:	.body
-	DO_LOAD_SWITCH_STACK
-	br.cond.sptk.many rp				// goes to ia64_leave_kernel
-END(ia64_prepare_handle_unaligned)
-
-	//
-	// unw_init_running(void (*callback)(info, arg), void *arg)
-	//
-#	define EXTRA_FRAME_SIZE	((UNW_FRAME_INFO_SIZE+15)&~15)
-
-GLOBAL_ENTRY(unw_init_running)
-	.prologue ASM_UNW_PRLG_RP|ASM_UNW_PRLG_PFS, ASM_UNW_PRLG_GRSAVE(2)
-	alloc loc1=ar.pfs,2,3,3,0
-	;;
-	ld8 loc2=[in0],8
-	mov loc0=rp
-	mov r16=loc1
-	DO_SAVE_SWITCH_STACK
-	.body
-
-	.prologue ASM_UNW_PRLG_RP|ASM_UNW_PRLG_PFS, ASM_UNW_PRLG_GRSAVE(2)
-	.fframe IA64_SWITCH_STACK_SIZE+EXTRA_FRAME_SIZE
-	SWITCH_STACK_SAVES(EXTRA_FRAME_SIZE)
-	adds sp=-EXTRA_FRAME_SIZE,sp
-	.body
-	;;
-	adds out0=16,sp				// &info
-	mov out1=r13				// current
-	adds out2=16+EXTRA_FRAME_SIZE,sp	// &switch_stack
-	br.call.sptk.many rp=unw_init_frame_info
-1:	adds out0=16,sp				// &info
-	mov b6=loc2
-	mov loc2=gp				// save gp across indirect function call
-	;;
-	ld8 gp=[in0]
-	mov out1=in1				// arg
-	br.call.sptk.many rp=b6			// invoke the callback function
-1:	mov gp=loc2				// restore gp
-
-	// For now, we don't allow changing registers from within
-	// unw_init_running; if we ever want to allow that, we'd
-	// have to do a load_switch_stack here:
-	.restore sp
-	adds sp=IA64_SWITCH_STACK_SIZE+EXTRA_FRAME_SIZE,sp
-
-	mov ar.pfs=loc1
-	mov rp=loc0
-	br.ret.sptk.many rp
-END(unw_init_running)
-EXPORT_SYMBOL(unw_init_running)
-
-#ifdef CONFIG_FUNCTION_TRACER
-#ifdef CONFIG_DYNAMIC_FTRACE
-GLOBAL_ENTRY(_mcount)
-	br ftrace_stub
-END(_mcount)
-EXPORT_SYMBOL(_mcount)
-
-.here:
-	br.ret.sptk.many b0
-
-GLOBAL_ENTRY(ftrace_caller)
-	alloc out0 = ar.pfs, 8, 0, 4, 0
-	mov out3 = r0
-	;;
-	mov out2 = b0
-	add r3 = 0x20, r3
-	mov out1 = r1;
-	br.call.sptk.many b0 = ftrace_patch_gp
-	//this might be called from module, so we must patch gp
-ftrace_patch_gp:
-	movl gp=__gp
-	mov b0 = r3
-	;;
-.global ftrace_call;
-ftrace_call:
-{
-	.mlx
-	nop.m 0x0
-	movl r3 = .here;;
-}
-	alloc loc0 = ar.pfs, 4, 4, 2, 0
-	;;
-	mov loc1 = b0
-	mov out0 = b0
-	mov loc2 = r8
-	mov loc3 = r15
-	;;
-	adds out0 = -MCOUNT_INSN_SIZE, out0
-	mov out1 = in2
-	mov b6 = r3
-
-	br.call.sptk.many b0 = b6
-	;;
-	mov ar.pfs = loc0
-	mov b0 = loc1
-	mov r8 = loc2
-	mov r15 = loc3
-	br ftrace_stub
-	;;
-END(ftrace_caller)
-
-#else
-GLOBAL_ENTRY(_mcount)
-	movl r2 = ftrace_stub
-	movl r3 = ftrace_trace_function;;
-	ld8 r3 = [r3];;
-	ld8 r3 = [r3];;
-	cmp.eq p7,p0 = r2, r3
-(p7)	br.sptk.many ftrace_stub
-	;;
-
-	alloc loc0 = ar.pfs, 4, 4, 2, 0
-	;;
-	mov loc1 = b0
-	mov out0 = b0
-	mov loc2 = r8
-	mov loc3 = r15
-	;;
-	adds out0 = -MCOUNT_INSN_SIZE, out0
-	mov out1 = in2
-	mov b6 = r3
-
-	br.call.sptk.many b0 = b6
-	;;
-	mov ar.pfs = loc0
-	mov b0 = loc1
-	mov r8 = loc2
-	mov r15 = loc3
-	br ftrace_stub
-	;;
-END(_mcount)
-#endif
-
-GLOBAL_ENTRY(ftrace_stub)
-	mov r3 = b0
-	movl r2 = _mcount_ret_helper
-	;;
-	mov b6 = r2
-	mov b7 = r3
-	br.ret.sptk.many b6
-
-_mcount_ret_helper:
-	mov b0 = r42
-	mov r1 = r41
-	mov ar.pfs = r40
-	br b7
-END(ftrace_stub)
-
-#endif /* CONFIG_FUNCTION_TRACER */
-
-#define __SYSCALL(nr, entry) data8 entry
-	.rodata
-	.align 8
-	.globl sys_call_table
-sys_call_table:
-#include <asm/syscall_table.h>
diff --git a/arch/ia64/kernel/entry.h b/arch/ia64/kernel/entry.h
deleted file mode 100644
index 6463dc316263..000000000000
--- a/arch/ia64/kernel/entry.h
+++ /dev/null
@@ -1,83 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-
-/*
- * Preserved registers that are shared between code in ivt.S and
- * entry.S.  Be careful not to step on these!
- */
-#define PRED_LEAVE_SYSCALL	1 /* TRUE iff leave from syscall */
-#define PRED_KERNEL_STACK	2 /* returning to kernel-stacks? */
-#define PRED_USER_STACK		3 /* returning to user-stacks? */
-#define PRED_SYSCALL		4 /* inside a system call? */
-#define PRED_NON_SYSCALL	5 /* complement of PRED_SYSCALL */
-
-#ifdef __ASSEMBLY__
-# define PASTE2(x,y)	x##y
-# define PASTE(x,y)	PASTE2(x,y)
-
-# define pLvSys		PASTE(p,PRED_LEAVE_SYSCALL)
-# define pKStk		PASTE(p,PRED_KERNEL_STACK)
-# define pUStk		PASTE(p,PRED_USER_STACK)
-# define pSys		PASTE(p,PRED_SYSCALL)
-# define pNonSys	PASTE(p,PRED_NON_SYSCALL)
-#endif
-
-#define PT(f)		(IA64_PT_REGS_##f##_OFFSET)
-#define SW(f)		(IA64_SWITCH_STACK_##f##_OFFSET)
-#define SOS(f)		(IA64_SAL_OS_STATE_##f##_OFFSET)
-
-#define PT_REGS_SAVES(off)			\
-	.unwabi 3, 'i';				\
-	.fframe IA64_PT_REGS_SIZE+16+(off);	\
-	.spillsp rp, PT(CR_IIP)+16+(off);	\
-	.spillsp ar.pfs, PT(CR_IFS)+16+(off);	\
-	.spillsp ar.unat, PT(AR_UNAT)+16+(off);	\
-	.spillsp ar.fpsr, PT(AR_FPSR)+16+(off);	\
-	.spillsp pr, PT(PR)+16+(off);
-
-#define PT_REGS_UNWIND_INFO(off)		\
-	.prologue;				\
-	PT_REGS_SAVES(off);			\
-	.body
-
-#define SWITCH_STACK_SAVES(off)							\
-	.savesp ar.unat,SW(CALLER_UNAT)+16+(off);				\
-	.savesp ar.fpsr,SW(AR_FPSR)+16+(off);					\
-	.spillsp f2,SW(F2)+16+(off); .spillsp f3,SW(F3)+16+(off);		\
-	.spillsp f4,SW(F4)+16+(off); .spillsp f5,SW(F5)+16+(off);		\
-	.spillsp f16,SW(F16)+16+(off); .spillsp f17,SW(F17)+16+(off);		\
-	.spillsp f18,SW(F18)+16+(off); .spillsp f19,SW(F19)+16+(off);		\
-	.spillsp f20,SW(F20)+16+(off); .spillsp f21,SW(F21)+16+(off);		\
-	.spillsp f22,SW(F22)+16+(off); .spillsp f23,SW(F23)+16+(off);		\
-	.spillsp f24,SW(F24)+16+(off); .spillsp f25,SW(F25)+16+(off);		\
-	.spillsp f26,SW(F26)+16+(off); .spillsp f27,SW(F27)+16+(off);		\
-	.spillsp f28,SW(F28)+16+(off); .spillsp f29,SW(F29)+16+(off);		\
-	.spillsp f30,SW(F30)+16+(off); .spillsp f31,SW(F31)+16+(off);		\
-	.spillsp r4,SW(R4)+16+(off); .spillsp r5,SW(R5)+16+(off);		\
-	.spillsp r6,SW(R6)+16+(off); .spillsp r7,SW(R7)+16+(off);		\
-	.spillsp b0,SW(B0)+16+(off); .spillsp b1,SW(B1)+16+(off);		\
-	.spillsp b2,SW(B2)+16+(off); .spillsp b3,SW(B3)+16+(off);		\
-	.spillsp b4,SW(B4)+16+(off); .spillsp b5,SW(B5)+16+(off);		\
-	.spillsp ar.pfs,SW(AR_PFS)+16+(off); .spillsp ar.lc,SW(AR_LC)+16+(off);	\
-	.spillsp @priunat,SW(AR_UNAT)+16+(off);					\
-	.spillsp ar.rnat,SW(AR_RNAT)+16+(off);					\
-	.spillsp ar.bspstore,SW(AR_BSPSTORE)+16+(off);				\
-	.spillsp pr,SW(PR)+16+(off)
-
-#define DO_SAVE_SWITCH_STACK			\
-	movl r28=1f;				\
-	;;					\
-	.fframe IA64_SWITCH_STACK_SIZE;		\
-	adds sp=-IA64_SWITCH_STACK_SIZE,sp;	\
-	mov.ret.sptk b7=r28,1f;			\
-	SWITCH_STACK_SAVES(0);			\
-	br.cond.sptk.many save_switch_stack;	\
-1:
-
-#define DO_LOAD_SWITCH_STACK			\
-	movl r28=1f;				\
-	;;					\
-	invala;					\
-	mov.ret.sptk b7=r28,1f;			\
-	br.cond.sptk.many load_switch_stack;	\
-1:	.restore sp;				\
-	adds sp=IA64_SWITCH_STACK_SIZE,sp
diff --git a/arch/ia64/kernel/err_inject.c b/arch/ia64/kernel/err_inject.c
deleted file mode 100644
index dd5bfed52031..000000000000
--- a/arch/ia64/kernel/err_inject.c
+++ /dev/null
@@ -1,273 +0,0 @@
-/*
- * err_inject.c -
- *	1.) Inject errors to a processor.
- *	2.) Query error injection capabilities.
- * This driver along with user space code can be acting as an error
- * injection tool.
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
- * NON INFRINGEMENT.  See the GNU General Public License for more
- * details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
- *
- * Written by: Fenghua Yu <fenghua.yu@intel.com>, Intel Corporation
- * Copyright (C) 2006, Intel Corp.  All rights reserved.
- *
- */
-#include <linux/device.h>
-#include <linux/init.h>
-#include <linux/mm.h>
-#include <linux/cpu.h>
-#include <linux/module.h>
-
-#define ERR_INJ_DEBUG
-
-#define ERR_DATA_BUFFER_SIZE 3 		// Three 8-byte;
-
-#define define_one_ro(name) 						\
-static DEVICE_ATTR(name, 0444, show_##name, NULL)
-
-#define define_one_rw(name) 						\
-static DEVICE_ATTR(name, 0644, show_##name, store_##name)
-
-static u64 call_start[NR_CPUS];
-static u64 phys_addr[NR_CPUS];
-static u64 err_type_info[NR_CPUS];
-static u64 err_struct_info[NR_CPUS];
-static struct {
-	u64 data1;
-	u64 data2;
-	u64 data3;
-} __attribute__((__aligned__(16))) err_data_buffer[NR_CPUS];
-static s64 status[NR_CPUS];
-static u64 capabilities[NR_CPUS];
-static u64 resources[NR_CPUS];
-
-#define show(name) 							\
-static ssize_t 								\
-show_##name(struct device *dev, struct device_attribute *attr,	\
-		char *buf)						\
-{									\
-	u32 cpu=dev->id;						\
-	return sprintf(buf, "%llx\n", name[cpu]);			\
-}
-
-#define store(name)							\
-static ssize_t 								\
-store_##name(struct device *dev, struct device_attribute *attr,	\
-					const char *buf, size_t size)	\
-{									\
-	unsigned int cpu=dev->id;					\
-	name[cpu] = simple_strtoull(buf, NULL, 16);			\
-	return size;							\
-}
-
-show(call_start)
-
-/* It's user's responsibility to call the PAL procedure on a specific
- * processor. The cpu number in driver is only used for storing data.
- */
-static ssize_t
-store_call_start(struct device *dev, struct device_attribute *attr,
-		const char *buf, size_t size)
-{
-	unsigned int cpu=dev->id;
-	unsigned long call_start = simple_strtoull(buf, NULL, 16);
-
-#ifdef ERR_INJ_DEBUG
-	printk(KERN_DEBUG "pal_mc_err_inject for cpu%d:\n", cpu);
-	printk(KERN_DEBUG "err_type_info=%llx,\n", err_type_info[cpu]);
-	printk(KERN_DEBUG "err_struct_info=%llx,\n", err_struct_info[cpu]);
-	printk(KERN_DEBUG "err_data_buffer=%llx, %llx, %llx.\n",
-			  err_data_buffer[cpu].data1,
-			  err_data_buffer[cpu].data2,
-			  err_data_buffer[cpu].data3);
-#endif
-	switch (call_start) {
-	    case 0: /* Do nothing. */
-		break;
-	    case 1: /* Call pal_mc_error_inject in physical mode. */
-		status[cpu]=ia64_pal_mc_error_inject_phys(err_type_info[cpu],
-					err_struct_info[cpu],
-					ia64_tpa(&err_data_buffer[cpu]),
-					&capabilities[cpu],
-			 		&resources[cpu]);
-		break;
-	    case 2: /* Call pal_mc_error_inject in virtual mode. */
-		status[cpu]=ia64_pal_mc_error_inject_virt(err_type_info[cpu],
-					err_struct_info[cpu],
-					ia64_tpa(&err_data_buffer[cpu]),
-					&capabilities[cpu],
-			 		&resources[cpu]);
-		break;
-	    default:
-		status[cpu] = -EINVAL;
-		break;
-	}
-
-#ifdef ERR_INJ_DEBUG
-	printk(KERN_DEBUG "Returns: status=%d,\n", (int)status[cpu]);
-	printk(KERN_DEBUG "capabilities=%llx,\n", capabilities[cpu]);
-	printk(KERN_DEBUG "resources=%llx\n", resources[cpu]);
-#endif
-	return size;
-}
-
-show(err_type_info)
-store(err_type_info)
-
-static ssize_t
-show_virtual_to_phys(struct device *dev, struct device_attribute *attr,
-			char *buf)
-{
-	unsigned int cpu=dev->id;
-	return sprintf(buf, "%llx\n", phys_addr[cpu]);
-}
-
-static ssize_t
-store_virtual_to_phys(struct device *dev, struct device_attribute *attr,
-			const char *buf, size_t size)
-{
-	unsigned int cpu=dev->id;
-	u64 virt_addr=simple_strtoull(buf, NULL, 16);
-	int ret;
-
-	ret = get_user_pages_fast(virt_addr, 1, FOLL_WRITE, NULL);
-	if (ret<=0) {
-#ifdef ERR_INJ_DEBUG
-		printk("Virtual address %llx is not existing.\n", virt_addr);
-#endif
-		return -EINVAL;
-	}
-
-	phys_addr[cpu] = ia64_tpa(virt_addr);
-	return size;
-}
-
-show(err_struct_info)
-store(err_struct_info)
-
-static ssize_t
-show_err_data_buffer(struct device *dev,
-			struct device_attribute *attr, char *buf)
-{
-	unsigned int cpu=dev->id;
-
-	return sprintf(buf, "%llx, %llx, %llx\n",
-			err_data_buffer[cpu].data1,
-			err_data_buffer[cpu].data2,
-			err_data_buffer[cpu].data3);
-}
-
-static ssize_t
-store_err_data_buffer(struct device *dev,
-			struct device_attribute *attr,
-			const char *buf, size_t size)
-{
-	unsigned int cpu=dev->id;
-	int ret;
-
-#ifdef ERR_INJ_DEBUG
-	printk("write err_data_buffer=[%llx,%llx,%llx] on cpu%d\n",
-		 err_data_buffer[cpu].data1,
-		 err_data_buffer[cpu].data2,
-		 err_data_buffer[cpu].data3,
-		 cpu);
-#endif
-	ret = sscanf(buf, "%llx, %llx, %llx",
-			&err_data_buffer[cpu].data1,
-			&err_data_buffer[cpu].data2,
-			&err_data_buffer[cpu].data3);
-	if (ret!=ERR_DATA_BUFFER_SIZE)
-		return -EINVAL;
-
-	return size;
-}
-
-show(status)
-show(capabilities)
-show(resources)
-
-define_one_rw(call_start);
-define_one_rw(err_type_info);
-define_one_rw(err_struct_info);
-define_one_rw(err_data_buffer);
-define_one_rw(virtual_to_phys);
-define_one_ro(status);
-define_one_ro(capabilities);
-define_one_ro(resources);
-
-static struct attribute *default_attrs[] = {
-	&dev_attr_call_start.attr,
-	&dev_attr_virtual_to_phys.attr,
-	&dev_attr_err_type_info.attr,
-	&dev_attr_err_struct_info.attr,
-	&dev_attr_err_data_buffer.attr,
-	&dev_attr_status.attr,
-	&dev_attr_capabilities.attr,
-	&dev_attr_resources.attr,
-	NULL
-};
-
-static struct attribute_group err_inject_attr_group = {
-	.attrs = default_attrs,
-	.name = "err_inject"
-};
-/* Add/Remove err_inject interface for CPU device */
-static int err_inject_add_dev(unsigned int cpu)
-{
-	struct device *sys_dev = get_cpu_device(cpu);
-
-	return sysfs_create_group(&sys_dev->kobj, &err_inject_attr_group);
-}
-
-static int err_inject_remove_dev(unsigned int cpu)
-{
-	struct device *sys_dev = get_cpu_device(cpu);
-
-	sysfs_remove_group(&sys_dev->kobj, &err_inject_attr_group);
-	return 0;
-}
-
-static enum cpuhp_state hp_online;
-
-static int __init err_inject_init(void)
-{
-	int ret;
-#ifdef ERR_INJ_DEBUG
-	printk(KERN_INFO "Enter error injection driver.\n");
-#endif
-
-	ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "ia64/err_inj:online",
-				err_inject_add_dev, err_inject_remove_dev);
-	if (ret >= 0) {
-		hp_online = ret;
-		ret = 0;
-	}
-	return ret;
-}
-
-static void __exit err_inject_exit(void)
-{
-#ifdef ERR_INJ_DEBUG
-	printk(KERN_INFO "Exit error injection driver.\n");
-#endif
-	cpuhp_remove_state(hp_online);
-}
-
-module_init(err_inject_init);
-module_exit(err_inject_exit);
-
-MODULE_AUTHOR("Fenghua Yu <fenghua.yu@intel.com>");
-MODULE_DESCRIPTION("MC error injection kernel sysfs interface");
-MODULE_LICENSE("GPL");
diff --git a/arch/ia64/kernel/esi.c b/arch/ia64/kernel/esi.c
deleted file mode 100644
index 4df57c93e0a8..000000000000
--- a/arch/ia64/kernel/esi.c
+++ /dev/null
@@ -1,193 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * Extensible SAL Interface (ESI) support routines.
- *
- * Copyright (C) 2006 Hewlett-Packard Co
- * 	Alex Williamson <alex.williamson@hp.com>
- */
-#include <linux/kernel.h>
-#include <linux/init.h>
-#include <linux/module.h>
-#include <linux/string.h>
-
-#include <asm/esi.h>
-#include <asm/sal.h>
-
-MODULE_AUTHOR("Alex Williamson <alex.williamson@hp.com>");
-MODULE_DESCRIPTION("Extensible SAL Interface (ESI) support");
-MODULE_LICENSE("GPL");
-
-#define MODULE_NAME	"esi"
-
-enum esi_systab_entry_type {
-	ESI_DESC_ENTRY_POINT = 0
-};
-
-/*
- * Entry type:	Size:
- *	0	48
- */
-#define ESI_DESC_SIZE(type)	"\060"[(unsigned) (type)]
-
-typedef struct ia64_esi_desc_entry_point {
-	u8 type;
-	u8 reserved1[15];
-	u64 esi_proc;
-	u64 gp;
-	efi_guid_t guid;
-} ia64_esi_desc_entry_point_t;
-
-struct pdesc {
-	void *addr;
-	void *gp;
-};
-
-static struct ia64_sal_systab *esi_systab;
-
-extern unsigned long esi_phys;
-
-static int __init esi_init (void)
-{
-	struct ia64_sal_systab *systab;
-	char *p;
-	int i;
-
-	if (esi_phys == EFI_INVALID_TABLE_ADDR)
-		return -ENODEV;
-
-	systab = __va(esi_phys);
-
-	if (strncmp(systab->signature, "ESIT", 4) != 0) {
-		printk(KERN_ERR "bad signature in ESI system table!");
-		return -ENODEV;
-	}
-
-	p = (char *) (systab + 1);
-	for (i = 0; i < systab->entry_count; i++) {
-		/*
-		 * The first byte of each entry type contains the type
-		 * descriptor.
-		 */
-		switch (*p) {
-		      case ESI_DESC_ENTRY_POINT:
-			break;
-		      default:
-			printk(KERN_WARNING "Unknown table type %d found in "
-			       "ESI table, ignoring rest of table\n", *p);
-			return -ENODEV;
-		}
-
-		p += ESI_DESC_SIZE(*p);
-	}
-
-	esi_systab = systab;
-	return 0;
-}
-
-
-int ia64_esi_call (efi_guid_t guid, struct ia64_sal_retval *isrvp,
-		   enum esi_proc_type proc_type, u64 func,
-		   u64 arg1, u64 arg2, u64 arg3, u64 arg4, u64 arg5, u64 arg6,
-		   u64 arg7)
-{
-	struct ia64_fpreg fr[6];
-	unsigned long flags = 0;
-	int i;
-	char *p;
-
-	if (!esi_systab)
-		return -1;
-
-	p = (char *) (esi_systab + 1);
-	for (i = 0; i < esi_systab->entry_count; i++) {
-		if (*p == ESI_DESC_ENTRY_POINT) {
-			ia64_esi_desc_entry_point_t *esi = (void *)p;
-			if (!efi_guidcmp(guid, esi->guid)) {
-				ia64_sal_handler esi_proc;
-				struct pdesc pdesc;
-
-				pdesc.addr = __va(esi->esi_proc);
-				pdesc.gp = __va(esi->gp);
-
-				esi_proc = (ia64_sal_handler) &pdesc;
-
-				ia64_save_scratch_fpregs(fr);
-				if (proc_type == ESI_PROC_SERIALIZED)
-					spin_lock_irqsave(&sal_lock, flags);
-				else if (proc_type == ESI_PROC_MP_SAFE)
-					local_irq_save(flags);
-				else
-					preempt_disable();
-				*isrvp = (*esi_proc)(func, arg1, arg2, arg3,
-						     arg4, arg5, arg6, arg7);
-				if (proc_type == ESI_PROC_SERIALIZED)
-					spin_unlock_irqrestore(&sal_lock,
-							       flags);
-				else if (proc_type == ESI_PROC_MP_SAFE)
-					local_irq_restore(flags);
-				else
-					preempt_enable();
-				ia64_load_scratch_fpregs(fr);
-				return 0;
-			}
-		}
-		p += ESI_DESC_SIZE(*p);
-	}
-	return -1;
-}
-EXPORT_SYMBOL_GPL(ia64_esi_call);
-
-int ia64_esi_call_phys (efi_guid_t guid, struct ia64_sal_retval *isrvp,
-			u64 func, u64 arg1, u64 arg2, u64 arg3, u64 arg4,
-			u64 arg5, u64 arg6, u64 arg7)
-{
-	struct ia64_fpreg fr[6];
-	unsigned long flags;
-	u64 esi_params[8];
-	char *p;
-	int i;
-
-	if (!esi_systab)
-		return -1;
-
-	p = (char *) (esi_systab + 1);
-	for (i = 0; i < esi_systab->entry_count; i++) {
-		if (*p == ESI_DESC_ENTRY_POINT) {
-			ia64_esi_desc_entry_point_t *esi = (void *)p;
-			if (!efi_guidcmp(guid, esi->guid)) {
-				ia64_sal_handler esi_proc;
-				struct pdesc pdesc;
-
-				pdesc.addr = (void *)esi->esi_proc;
-				pdesc.gp = (void *)esi->gp;
-
-				esi_proc = (ia64_sal_handler) &pdesc;
-
-				esi_params[0] = func;
-				esi_params[1] = arg1;
-				esi_params[2] = arg2;
-				esi_params[3] = arg3;
-				esi_params[4] = arg4;
-				esi_params[5] = arg5;
-				esi_params[6] = arg6;
-				esi_params[7] = arg7;
-				ia64_save_scratch_fpregs(fr);
-				spin_lock_irqsave(&sal_lock, flags);
-				*isrvp = esi_call_phys(esi_proc, esi_params);
-				spin_unlock_irqrestore(&sal_lock, flags);
-				ia64_load_scratch_fpregs(fr);
-				return 0;
-			}
-		}
-		p += ESI_DESC_SIZE(*p);
-	}
-	return -1;
-}
-EXPORT_SYMBOL_GPL(ia64_esi_call_phys);
-
-static void __exit esi_exit (void)
-{
-}
-
-module_init(esi_init);
-module_exit(esi_exit);	/* makes module removable... */
diff --git a/arch/ia64/kernel/esi_stub.S b/arch/ia64/kernel/esi_stub.S
deleted file mode 100644
index 9928c5b2957c..000000000000
--- a/arch/ia64/kernel/esi_stub.S
+++ /dev/null
@@ -1,99 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-only */
-/*
- * ESI call stub.
- *
- * Copyright (C) 2005 Hewlett-Packard Co
- *	Alex Williamson <alex.williamson@hp.com>
- *
- * Based on EFI call stub by David Mosberger.  The stub is virtually
- * identical to the one for EFI phys-mode calls, except that ESI
- * calls may have up to 8 arguments, so they get passed to this routine
- * through memory.
- *
- * This stub allows us to make ESI calls in physical mode with interrupts
- * turned off.  ESI calls may not support calling from virtual mode.
- *
- * Google for "Extensible SAL specification" for a document describing the
- * ESI standard.
- */
-
-/*
- * PSR settings as per SAL spec (Chapter 8 in the "IA-64 System
- * Abstraction Layer Specification", revision 2.6e).  Note that
- * psr.dfl and psr.dfh MUST be cleared, despite what this manual says.
- * Otherwise, SAL dies whenever it's trying to do an IA-32 BIOS call
- * (the br.ia instruction fails unless psr.dfl and psr.dfh are
- * cleared).  Fortunately, SAL promises not to touch the floating
- * point regs, so at least we don't have to save f2-f127.
- */
-#define PSR_BITS_TO_CLEAR						\
-	(IA64_PSR_I | IA64_PSR_IT | IA64_PSR_DT | IA64_PSR_RT |		\
-	 IA64_PSR_DD | IA64_PSR_SS | IA64_PSR_RI | IA64_PSR_ED |	\
-	 IA64_PSR_DFL | IA64_PSR_DFH)
-
-#define PSR_BITS_TO_SET							\
-	(IA64_PSR_BN)
-
-#include <linux/export.h>
-#include <asm/processor.h>
-#include <asm/asmmacro.h>
-
-/*
- * Inputs:
- *	in0 = address of function descriptor of ESI routine to call
- *	in1 = address of array of ESI parameters
- *
- * Outputs:
- *	r8 = result returned by called function
- */
-GLOBAL_ENTRY(esi_call_phys)
-	.prologue ASM_UNW_PRLG_RP|ASM_UNW_PRLG_PFS, ASM_UNW_PRLG_GRSAVE(2)
-	alloc loc1=ar.pfs,2,7,8,0
-	ld8 r2=[in0],8			// load ESI function's entry point
-	mov loc0=rp
-	.body
-	;;
-	ld8 out0=[in1],8		// ESI params loaded from array
-	;;				// passing all as inputs doesn't work
-	ld8 out1=[in1],8
-	;;
-	ld8 out2=[in1],8
-	;;
-	ld8 out3=[in1],8
-	;;
-	ld8 out4=[in1],8
-	;;
-	ld8 out5=[in1],8
-	;;
-	ld8 out6=[in1],8
-	;;
-	ld8 out7=[in1]
-	mov loc2=gp			// save global pointer
-	mov loc4=ar.rsc			// save RSE configuration
-	mov ar.rsc=0			// put RSE in enforced lazy, LE mode
-	;;
-	ld8 gp=[in0]			// load ESI function's global pointer
-	movl r16=PSR_BITS_TO_CLEAR
-	mov loc3=psr			// save processor status word
-	movl r17=PSR_BITS_TO_SET
-	;;
-	or loc3=loc3,r17
-	mov b6=r2
-	;;
-	andcm r16=loc3,r16	// get psr with IT, DT, and RT bits cleared
-	br.call.sptk.many rp=ia64_switch_mode_phys
-.ret0:	mov loc5=r19			// old ar.bsp
-	mov loc6=r20			// old sp
-	br.call.sptk.many rp=b6		// call the ESI function
-.ret1:	mov ar.rsc=0			// put RSE in enforced lazy, LE mode
-	mov r16=loc3			// save virtual mode psr
-	mov r19=loc5			// save virtual mode bspstore
-	mov r20=loc6			// save virtual mode sp
-	br.call.sptk.many rp=ia64_switch_mode_virt // return to virtual mode
-.ret2:	mov ar.rsc=loc4			// restore RSE configuration
-	mov ar.pfs=loc1
-	mov rp=loc0
-	mov gp=loc2
-	br.ret.sptk.many rp
-END(esi_call_phys)
-EXPORT_SYMBOL_GPL(esi_call_phys)
diff --git a/arch/ia64/kernel/fsys.S b/arch/ia64/kernel/fsys.S
deleted file mode 100644
index cc4733e9990a..000000000000
--- a/arch/ia64/kernel/fsys.S
+++ /dev/null
@@ -1,837 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-/*
- * This file contains the light-weight system call handlers (fsyscall-handlers).
- *
- * Copyright (C) 2003 Hewlett-Packard Co
- * 	David Mosberger-Tang <davidm@hpl.hp.com>
- *
- * 25-Sep-03 davidm	Implement fsys_rt_sigprocmask().
- * 18-Feb-03 louisk	Implement fsys_gettimeofday().
- * 28-Feb-03 davidm	Fixed several bugs in fsys_gettimeofday().  Tuned it some more,
- *			probably broke it along the way... ;-)
- * 13-Jul-04 clameter   Implement fsys_clock_gettime and revise fsys_gettimeofday to make
- *                      it capable of using memory based clocks without falling back to C code.
- * 08-Feb-07 Fenghua Yu Implement fsys_getcpu.
- *
- */
-
-#include <asm/asmmacro.h>
-#include <asm/errno.h>
-#include <asm/asm-offsets.h>
-#include <asm/percpu.h>
-#include <asm/thread_info.h>
-#include <asm/sal.h>
-#include <asm/signal.h>
-#include <asm/unistd.h>
-
-#include "entry.h"
-#include <asm/native/inst.h>
-
-/*
- * See Documentation/arch/ia64/fsys.rst for details on fsyscalls.
- *
- * On entry to an fsyscall handler:
- *   r10	= 0 (i.e., defaults to "successful syscall return")
- *   r11	= saved ar.pfs (a user-level value)
- *   r15	= system call number
- *   r16	= "current" task pointer (in normal kernel-mode, this is in r13)
- *   r32-r39	= system call arguments
- *   b6		= return address (a user-level value)
- *   ar.pfs	= previous frame-state (a user-level value)
- *   PSR.be	= cleared to zero (i.e., little-endian byte order is in effect)
- *   all other registers may contain values passed in from user-mode
- *
- * On return from an fsyscall handler:
- *   r11	= saved ar.pfs (as passed into the fsyscall handler)
- *   r15	= system call number (as passed into the fsyscall handler)
- *   r32-r39	= system call arguments (as passed into the fsyscall handler)
- *   b6		= return address (as passed into the fsyscall handler)
- *   ar.pfs	= previous frame-state (as passed into the fsyscall handler)
- */
-
-ENTRY(fsys_ni_syscall)
-	.prologue
-	.altrp b6
-	.body
-	mov r8=ENOSYS
-	mov r10=-1
-	FSYS_RETURN
-END(fsys_ni_syscall)
-
-ENTRY(fsys_getpid)
-	.prologue
-	.altrp b6
-	.body
-	add r17=IA64_TASK_SIGNAL_OFFSET,r16
-	;;
-	ld8 r17=[r17]				// r17 = current->signal
-	add r9=TI_FLAGS+IA64_TASK_SIZE,r16
-	;;
-	ld4 r9=[r9]
-	add r17=IA64_SIGNAL_PIDS_TGID_OFFSET,r17
-	;;
-	and r9=TIF_ALLWORK_MASK,r9
-	ld8 r17=[r17]				// r17 = current->signal->pids[PIDTYPE_TGID]
-	;;
-	add r8=IA64_PID_LEVEL_OFFSET,r17
-	;;
-	ld4 r8=[r8]				// r8 = pid->level
-	add r17=IA64_PID_UPID_OFFSET,r17	// r17 = &pid->numbers[0]
-	;;
-	shl r8=r8,IA64_UPID_SHIFT
-	;;
-	add r17=r17,r8				// r17 = &pid->numbers[pid->level]
-	;;
-	ld4 r8=[r17]				// r8 = pid->numbers[pid->level].nr
-	;;
-	mov r17=0
-	;;
-	cmp.ne p8,p0=0,r9
-(p8)	br.spnt.many fsys_fallback_syscall
-	FSYS_RETURN
-END(fsys_getpid)
-
-ENTRY(fsys_set_tid_address)
-	.prologue
-	.altrp b6
-	.body
-	add r9=TI_FLAGS+IA64_TASK_SIZE,r16
-	add r17=IA64_TASK_THREAD_PID_OFFSET,r16
-	;;
-	ld4 r9=[r9]
-	tnat.z p6,p7=r32		// check argument register for being NaT
-	ld8 r17=[r17]				// r17 = current->thread_pid
-	;;
-	and r9=TIF_ALLWORK_MASK,r9
-	add r8=IA64_PID_LEVEL_OFFSET,r17
-	add r18=IA64_TASK_CLEAR_CHILD_TID_OFFSET,r16
-	;;
-	ld4 r8=[r8]				// r8 = pid->level
-	add r17=IA64_PID_UPID_OFFSET,r17	// r17 = &pid->numbers[0]
-	;;
-	shl r8=r8,IA64_UPID_SHIFT
-	;;
-	add r17=r17,r8				// r17 = &pid->numbers[pid->level]
-	;;
-	ld4 r8=[r17]				// r8 = pid->numbers[pid->level].nr
-	;;
-	cmp.ne p8,p0=0,r9
-	mov r17=-1
-	;;
-(p6)	st8 [r18]=r32
-(p7)	st8 [r18]=r17
-(p8)	br.spnt.many fsys_fallback_syscall
-	;;
-	mov r17=0			// i must not leak kernel bits...
-	mov r18=0			// i must not leak kernel bits...
-	FSYS_RETURN
-END(fsys_set_tid_address)
-
-#if IA64_GTOD_SEQ_OFFSET !=0
-#error fsys_gettimeofday incompatible with changes to struct fsyscall_gtod_data_t
-#endif
-#if IA64_ITC_JITTER_OFFSET !=0
-#error fsys_gettimeofday incompatible with changes to struct itc_jitter_data_t
-#endif
-#define CLOCK_REALTIME 0
-#define CLOCK_MONOTONIC 1
-#define CLOCK_DIVIDE_BY_1000 0x4000
-#define CLOCK_ADD_MONOTONIC 0x8000
-
-ENTRY(fsys_gettimeofday)
-	.prologue
-	.altrp b6
-	.body
-	mov r31 = r32
-	tnat.nz p6,p0 = r33		// guard against NaT argument
-(p6)    br.cond.spnt.few .fail_einval
-	mov r30 = CLOCK_DIVIDE_BY_1000
-	;;
-.gettime:
-	// Register map
-	// Incoming r31 = pointer to address where to place result
-	//          r30 = flags determining how time is processed
-	// r2,r3 = temp r4-r7 preserved
-	// r8 = result nanoseconds
-	// r9 = result seconds
-	// r10 = temporary storage for clock difference
-	// r11 = preserved: saved ar.pfs
-	// r12 = preserved: memory stack
-	// r13 = preserved: thread pointer
-	// r14 = address of mask / mask value
-	// r15 = preserved: system call number
-	// r16 = preserved: current task pointer
-	// r17 = (not used)
-	// r18 = (not used)
-	// r19 = address of itc_lastcycle
-	// r20 = struct fsyscall_gtod_data (= address of gtod_lock.sequence)
-	// r21 = address of mmio_ptr
-	// r22 = address of wall_time or monotonic_time
-	// r23 = address of shift / value
-	// r24 = address mult factor / cycle_last value
-	// r25 = itc_lastcycle value
-	// r26 = address clocksource cycle_last
-	// r27 = (not used)
-	// r28 = sequence number at the beginning of critical section
-	// r29 = address of itc_jitter
-	// r30 = time processing flags / memory address
-	// r31 = pointer to result
-	// Predicates
-	// p6,p7 short term use
-	// p8 = timesource ar.itc
-	// p9 = timesource mmio64
-	// p10 = timesource mmio32 - not used
-	// p11 = timesource not to be handled by asm code
-	// p12 = memory time source ( = p9 | p10) - not used
-	// p13 = do cmpxchg with itc_lastcycle
-	// p14 = Divide by 1000
-	// p15 = Add monotonic
-	//
-	// Note that instructions are optimized for McKinley. McKinley can
-	// process two bundles simultaneously and therefore we continuously
-	// try to feed the CPU two bundles and then a stop.
-
-	add r2 = TI_FLAGS+IA64_TASK_SIZE,r16
-	tnat.nz p6,p0 = r31		// guard against Nat argument
-(p6)	br.cond.spnt.few .fail_einval
-	movl r20 = fsyscall_gtod_data // load fsyscall gettimeofday data address
-	;;
-	ld4 r2 = [r2]			// process work pending flags
-	movl r29 = itc_jitter_data	// itc_jitter
-	add r22 = IA64_GTOD_WALL_TIME_OFFSET,r20	// wall_time
-	add r21 = IA64_CLKSRC_MMIO_OFFSET,r20
-	mov pr = r30,0xc000	// Set predicates according to function
-	;;
-	and r2 = TIF_ALLWORK_MASK,r2
-	add r19 = IA64_ITC_LASTCYCLE_OFFSET,r29
-(p15)	add r22 = IA64_GTOD_MONO_TIME_OFFSET,r20	// monotonic_time
-	;;
-	add r26 = IA64_CLKSRC_CYCLE_LAST_OFFSET,r20	// clksrc_cycle_last
-	cmp.ne p6, p0 = 0, r2	// Fallback if work is scheduled
-(p6)	br.cond.spnt.many fsys_fallback_syscall
-	;;
-	// Begin critical section
-.time_redo:
-	ld4.acq r28 = [r20]	// gtod_lock.sequence, Must take first
-	;;
-	and r28 = ~1,r28	// And make sequence even to force retry if odd
-	;;
-	ld8 r30 = [r21]		// clocksource->mmio_ptr
-	add r24 = IA64_CLKSRC_MULT_OFFSET,r20
-	ld4 r2 = [r29]		// itc_jitter value
-	add r23 = IA64_CLKSRC_SHIFT_OFFSET,r20
-	add r14 = IA64_CLKSRC_MASK_OFFSET,r20
-	;;
-	ld4 r3 = [r24]		// clocksource mult value
-	ld8 r14 = [r14]         // clocksource mask value
-	cmp.eq p8,p9 = 0,r30	// use cpu timer if no mmio_ptr
-	;;
-	setf.sig f7 = r3	// Setup for mult scaling of counter
-(p8)	cmp.ne p13,p0 = r2,r0	// need itc_jitter compensation, set p13
-	ld4 r23 = [r23]		// clocksource shift value
-	ld8 r24 = [r26]		// get clksrc_cycle_last value
-(p9)	cmp.eq p13,p0 = 0,r30	// if mmio_ptr, clear p13 jitter control
-	;;
-	.pred.rel.mutex p8,p9
-	MOV_FROM_ITC(p8, p6, r2, r10)	// CPU_TIMER. 36 clocks latency!!!
-(p9)	ld8 r2 = [r30]		// MMIO_TIMER. Could also have latency issues..
-(p13)	ld8 r25 = [r19]		// get itc_lastcycle value
-	ld8 r9 = [r22],IA64_TIME_SN_SPEC_SNSEC_OFFSET	// sec
-	;;
-	ld8 r8 = [r22],-IA64_TIME_SN_SPEC_SNSEC_OFFSET	// snsec
-(p13)	sub r3 = r25,r2		// Diff needed before comparison (thanks davidm)
-	;;
-(p13)	cmp.gt.unc p6,p7 = r3,r0 // check if it is less than last. p6,p7 cleared
-	sub r10 = r2,r24	// current_cycle - last_cycle
-	;;
-(p6)	sub r10 = r25,r24	// time we got was less than last_cycle
-(p7)	mov ar.ccv = r25	// more than last_cycle. Prep for cmpxchg
-	;;
-(p7)	cmpxchg8.rel r3 = [r19],r2,ar.ccv
-	;;
-(p7)	cmp.ne p7,p0 = r25,r3	// if cmpxchg not successful
-	;;
-(p7)	sub r10 = r3,r24	// then use new last_cycle instead
-	;;
-	and r10 = r10,r14	// Apply mask
-	;;
-	setf.sig f8 = r10
-	nop.i 123
-	;;
-	// fault check takes 5 cycles and we have spare time
-EX(.fail_efault, probe.w.fault r31, 3)
-	xmpy.l f8 = f8,f7	// nsec_per_cyc*(counter-last_counter)
-	;;
-	getf.sig r2 = f8
-	mf
-	;;
-	ld4 r10 = [r20]		// gtod_lock.sequence
-	add r8 = r8,r2		// Add xtime.nsecs
-	;;
-	shr.u r8 = r8,r23	// shift by factor
-	cmp4.ne p7,p0 = r28,r10
-(p7)	br.cond.dpnt.few .time_redo	// sequence number changed, redo
-	// End critical section.
-	// Now r8=tv->tv_nsec and r9=tv->tv_sec
-	mov r10 = r0
-	movl r2 = 1000000000
-	add r23 = IA64_TIMESPEC_TV_NSEC_OFFSET, r31
-(p14)	movl r3 = 2361183241434822607	// Prep for / 1000 hack
-	;;
-.time_normalize:
-	mov r21 = r8
-	cmp.ge p6,p0 = r8,r2
-(p14)	shr.u r20 = r8, 3 // We can repeat this if necessary just wasting time
-	;;
-(p14)	setf.sig f8 = r20
-(p6)	sub r8 = r8,r2
-(p6)	add r9 = 1,r9		// two nops before the branch.
-(p14)	setf.sig f7 = r3	// Chances for repeats are 1 in 10000 for gettod
-(p6)	br.cond.dpnt.few .time_normalize
-	;;
-	// Divided by 8 though shift. Now divide by 125
-	// The compiler was able to do that with a multiply
-	// and a shift and we do the same
-EX(.fail_efault, probe.w.fault r23, 3)	// This also costs 5 cycles
-(p14)	xmpy.hu f8 = f8, f7		// xmpy has 5 cycles latency so use it
-	;;
-(p14)	getf.sig r2 = f8
-	;;
-	mov r8 = r0
-(p14)	shr.u r21 = r2, 4
-	;;
-EX(.fail_efault, st8 [r31] = r9)
-EX(.fail_efault, st8 [r23] = r21)
-	FSYS_RETURN
-.fail_einval:
-	mov r8 = EINVAL
-	mov r10 = -1
-	FSYS_RETURN
-.fail_efault:
-	mov r8 = EFAULT
-	mov r10 = -1
-	FSYS_RETURN
-END(fsys_gettimeofday)
-
-ENTRY(fsys_clock_gettime)
-	.prologue
-	.altrp b6
-	.body
-	cmp4.ltu p6, p0 = CLOCK_MONOTONIC, r32
-	// Fallback if this is not CLOCK_REALTIME or CLOCK_MONOTONIC
-(p6)	br.spnt.few fsys_fallback_syscall
-	mov r31 = r33
-	shl r30 = r32,15
-	br.many .gettime
-END(fsys_clock_gettime)
-
-/*
- * fsys_getcpu doesn't use the third parameter in this implementation. It reads
- * current_thread_info()->cpu and corresponding node in cpu_to_node_map.
- */
-ENTRY(fsys_getcpu)
-	.prologue
-	.altrp b6
-	.body
-	;;
-	add r2=TI_FLAGS+IA64_TASK_SIZE,r16
-	tnat.nz p6,p0 = r32			// guard against NaT argument
-	add r3=TI_CPU+IA64_TASK_SIZE,r16
-	;;
-	ld4 r3=[r3]				// M r3 = thread_info->cpu
-	ld4 r2=[r2]				// M r2 = thread_info->flags
-(p6)    br.cond.spnt.few .fail_einval		// B
-	;;
-	tnat.nz p7,p0 = r33			// I guard against NaT argument
-(p7)    br.cond.spnt.few .fail_einval		// B
-	;;
-	cmp.ne p6,p0=r32,r0
-	cmp.ne p7,p0=r33,r0
-	;;
-#ifdef CONFIG_NUMA
-	movl r17=cpu_to_node_map
-	;;
-EX(.fail_efault, (p6) probe.w.fault r32, 3)		// M This takes 5 cycles
-EX(.fail_efault, (p7) probe.w.fault r33, 3)		// M This takes 5 cycles
-	shladd r18=r3,1,r17
-	;;
-	ld2 r20=[r18]				// r20 = cpu_to_node_map[cpu]
-	and r2 = TIF_ALLWORK_MASK,r2
-	;;
-	cmp.ne p8,p0=0,r2
-(p8)	br.spnt.many fsys_fallback_syscall
-	;;
-	;;
-EX(.fail_efault, (p6) st4 [r32] = r3)
-EX(.fail_efault, (p7) st2 [r33] = r20)
-	mov r8=0
-	;;
-#else
-EX(.fail_efault, (p6) probe.w.fault r32, 3)		// M This takes 5 cycles
-EX(.fail_efault, (p7) probe.w.fault r33, 3)		// M This takes 5 cycles
-	and r2 = TIF_ALLWORK_MASK,r2
-	;;
-	cmp.ne p8,p0=0,r2
-(p8)	br.spnt.many fsys_fallback_syscall
-	;;
-EX(.fail_efault, (p6) st4 [r32] = r3)
-EX(.fail_efault, (p7) st2 [r33] = r0)
-	mov r8=0
-	;;
-#endif
-	FSYS_RETURN
-END(fsys_getcpu)
-
-ENTRY(fsys_fallback_syscall)
-	.prologue
-	.altrp b6
-	.body
-	/*
-	 * We only get here from light-weight syscall handlers.  Thus, we already
-	 * know that r15 contains a valid syscall number.  No need to re-check.
-	 */
-	adds r17=-1024,r15
-	movl r14=sys_call_table
-	;;
-	RSM_PSR_I(p0, r26, r27)
-	shladd r18=r17,3,r14
-	;;
-	ld8 r18=[r18]				// load normal (heavy-weight) syscall entry-point
-	MOV_FROM_PSR(p0, r29, r26)		// read psr (12 cyc load latency)
-	mov r27=ar.rsc
-	mov r21=ar.fpsr
-	mov r26=ar.pfs
-END(fsys_fallback_syscall)
-	/* FALL THROUGH */
-GLOBAL_ENTRY(fsys_bubble_down)
-	.prologue
-	.altrp b6
-	.body
-	/*
-	 * We get here for syscalls that don't have a lightweight
-	 * handler.  For those, we need to bubble down into the kernel
-	 * and that requires setting up a minimal pt_regs structure,
-	 * and initializing the CPU state more or less as if an
-	 * interruption had occurred.  To make syscall-restarts work,
-	 * we setup pt_regs such that cr_iip points to the second
-	 * instruction in syscall_via_break.  Decrementing the IP
-	 * hence will restart the syscall via break and not
-	 * decrementing IP will return us to the caller, as usual.
-	 * Note that we preserve the value of psr.pp rather than
-	 * initializing it from dcr.pp.  This makes it possible to
-	 * distinguish fsyscall execution from other privileged
-	 * execution.
-	 *
-	 * On entry:
-	 *	- normal fsyscall handler register usage, except
-	 *	  that we also have:
-	 *	- r18: address of syscall entry point
-	 *	- r21: ar.fpsr
-	 *	- r26: ar.pfs
-	 *	- r27: ar.rsc
-	 *	- r29: psr
-	 *
-	 * We used to clear some PSR bits here but that requires slow
-	 * serialization.  Fortunately, that isn't really necessary.
-	 * The rationale is as follows: we used to clear bits
-	 * ~PSR_PRESERVED_BITS in PSR.L.  Since
-	 * PSR_PRESERVED_BITS==PSR.{UP,MFL,MFH,PK,DT,PP,SP,RT,IC}, we
-	 * ended up clearing PSR.{BE,AC,I,DFL,DFH,DI,DB,SI,TB}.
-	 * However,
-	 *
-	 * PSR.BE : already is turned off in __kernel_syscall_via_epc()
-	 * PSR.AC : don't care (kernel normally turns PSR.AC on)
-	 * PSR.I  : already turned off by the time fsys_bubble_down gets
-	 *	    invoked
-	 * PSR.DFL: always 0 (kernel never turns it on)
-	 * PSR.DFH: don't care --- kernel never touches f32-f127 on its own
-	 *	    initiative
-	 * PSR.DI : always 0 (kernel never turns it on)
-	 * PSR.SI : always 0 (kernel never turns it on)
-	 * PSR.DB : don't care --- kernel never enables kernel-level
-	 *	    breakpoints
-	 * PSR.TB : must be 0 already; if it wasn't zero on entry to
-	 *          __kernel_syscall_via_epc, the branch to fsys_bubble_down
-	 *          will trigger a taken branch; the taken-trap-handler then
-	 *          converts the syscall into a break-based system-call.
-	 */
-	/*
-	 * Reading psr.l gives us only bits 0-31, psr.it, and psr.mc.
-	 * The rest we have to synthesize.
-	 */
-#	define PSR_ONE_BITS		((3 << IA64_PSR_CPL0_BIT)	\
-					 | (0x1 << IA64_PSR_RI_BIT)	\
-					 | IA64_PSR_BN | IA64_PSR_I)
-
-	invala					// M0|1
-	movl r14=ia64_ret_from_syscall		// X
-
-	nop.m 0
-	movl r28=__kernel_syscall_via_break	// X	create cr.iip
-	;;
-
-	mov r2=r16				// A    get task addr to addl-addressable register
-	adds r16=IA64_TASK_THREAD_ON_USTACK_OFFSET,r16 // A
-	mov r31=pr				// I0   save pr (2 cyc)
-	;;
-	st1 [r16]=r0				// M2|3 clear current->thread.on_ustack flag
-	addl r22=IA64_RBS_OFFSET,r2		// A    compute base of RBS
-	add r3=TI_FLAGS+IA64_TASK_SIZE,r2	// A
-	;;
-	ld4 r3=[r3]				// M0|1 r3 = current_thread_info()->flags
-	lfetch.fault.excl.nt1 [r22]		// M0|1 prefetch register backing-store
-	nop.i 0
-	;;
-	mov ar.rsc=0				// M2   set enforced lazy mode, pl 0, LE, loadrs=0
-#ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
-	MOV_FROM_ITC(p0, p6, r30, r23)		// M    get cycle for accounting
-#else
-	nop.m 0
-#endif
-	nop.i 0
-	;;
-	mov r23=ar.bspstore			// M2 (12 cyc) save ar.bspstore
-	mov.m r24=ar.rnat			// M2 (5 cyc) read ar.rnat (dual-issues!)
-	nop.i 0
-	;;
-	mov ar.bspstore=r22			// M2 (6 cyc) switch to kernel RBS
-	movl r8=PSR_ONE_BITS			// X
-	;;
-	mov r25=ar.unat				// M2 (5 cyc) save ar.unat
-	mov r19=b6				// I0   save b6 (2 cyc)
-	mov r20=r1				// A    save caller's gp in r20
-	;;
-	or r29=r8,r29				// A    construct cr.ipsr value to save
-	mov b6=r18				// I0   copy syscall entry-point to b6 (7 cyc)
-	addl r1=IA64_STK_OFFSET-IA64_PT_REGS_SIZE,r2 // A compute base of memory stack
-
-	mov r18=ar.bsp				// M2   save (kernel) ar.bsp (12 cyc)
-	cmp.ne pKStk,pUStk=r0,r0		// A    set pKStk <- 0, pUStk <- 1
-	br.call.sptk.many b7=ia64_syscall_setup	// B
-	;;
-#ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
-	// mov.m r30=ar.itc is called in advance
-	add r16=TI_AC_STAMP+IA64_TASK_SIZE,r2
-	add r17=TI_AC_LEAVE+IA64_TASK_SIZE,r2
-	;;
-	ld8 r18=[r16],TI_AC_STIME-TI_AC_STAMP	// time at last check in kernel
-	ld8 r19=[r17],TI_AC_UTIME-TI_AC_LEAVE	// time at leave kernel
-	;;
-	ld8 r20=[r16],TI_AC_STAMP-TI_AC_STIME	// cumulated stime
-	ld8 r21=[r17]				// cumulated utime
-	sub r22=r19,r18				// stime before leave kernel
-	;;
-	st8 [r16]=r30,TI_AC_STIME-TI_AC_STAMP	// update stamp
-	sub r18=r30,r19				// elapsed time in user mode
-	;;
-	add r20=r20,r22				// sum stime
-	add r21=r21,r18				// sum utime
-	;;
-	st8 [r16]=r20				// update stime
-	st8 [r17]=r21				// update utime
-	;;
-#endif
-	mov ar.rsc=0x3				// M2   set eager mode, pl 0, LE, loadrs=0
-	mov rp=r14				// I0   set the real return addr
-	and r3=_TIF_SYSCALL_TRACEAUDIT,r3	// A
-	;;
-	SSM_PSR_I(p0, p6, r22)			// M2   we're on kernel stacks now, reenable irqs
-	cmp.eq p8,p0=r3,r0			// A
-(p10)	br.cond.spnt.many ia64_ret_from_syscall	// B    return if bad call-frame or r15 is a NaT
-
-	nop.m 0
-(p8)	br.call.sptk.many b6=b6			// B    (ignore return address)
-	br.cond.spnt ia64_trace_syscall		// B
-END(fsys_bubble_down)
-
-	.rodata
-	.align 8
-	.globl fsyscall_table
-
-	data8 fsys_bubble_down
-fsyscall_table:
-	data8 fsys_ni_syscall
-	data8 0				// exit			// 1025
-	data8 0				// read
-	data8 0				// write
-	data8 0				// open
-	data8 0				// close
-	data8 0				// creat		// 1030
-	data8 0				// link
-	data8 0				// unlink
-	data8 0				// execve
-	data8 0				// chdir
-	data8 0				// fchdir		// 1035
-	data8 0				// utimes
-	data8 0				// mknod
-	data8 0				// chmod
-	data8 0				// chown
-	data8 0				// lseek		// 1040
-	data8 fsys_getpid		// getpid
-	data8 0				// getppid
-	data8 0				// mount
-	data8 0				// umount
-	data8 0				// setuid		// 1045
-	data8 0				// getuid
-	data8 0				// geteuid
-	data8 0				// ptrace
-	data8 0				// access
-	data8 0				// sync			// 1050
-	data8 0				// fsync
-	data8 0				// fdatasync
-	data8 0				// kill
-	data8 0				// rename
-	data8 0				// mkdir		// 1055
-	data8 0				// rmdir
-	data8 0				// dup
-	data8 0				// pipe
-	data8 0				// times
-	data8 0				// brk			// 1060
-	data8 0				// setgid
-	data8 0				// getgid
-	data8 0				// getegid
-	data8 0				// acct
-	data8 0				// ioctl		// 1065
-	data8 0				// fcntl
-	data8 0				// umask
-	data8 0				// chroot
-	data8 0				// ustat
-	data8 0				// dup2			// 1070
-	data8 0				// setreuid
-	data8 0				// setregid
-	data8 0				// getresuid
-	data8 0				// setresuid
-	data8 0				// getresgid		// 1075
-	data8 0				// setresgid
-	data8 0				// getgroups
-	data8 0				// setgroups
-	data8 0				// getpgid
-	data8 0				// setpgid		// 1080
-	data8 0				// setsid
-	data8 0				// getsid
-	data8 0				// sethostname
-	data8 0				// setrlimit
-	data8 0				// getrlimit		// 1085
-	data8 0				// getrusage
-	data8 fsys_gettimeofday		// gettimeofday
-	data8 0				// settimeofday
-	data8 0				// select
-	data8 0				// poll			// 1090
-	data8 0				// symlink
-	data8 0				// readlink
-	data8 0				// uselib
-	data8 0				// swapon
-	data8 0				// swapoff		// 1095
-	data8 0				// reboot
-	data8 0				// truncate
-	data8 0				// ftruncate
-	data8 0				// fchmod
-	data8 0				// fchown		// 1100
-	data8 0				// getpriority
-	data8 0				// setpriority
-	data8 0				// statfs
-	data8 0				// fstatfs
-	data8 0				// gettid		// 1105
-	data8 0				// semget
-	data8 0				// semop
-	data8 0				// semctl
-	data8 0				// msgget
-	data8 0				// msgsnd		// 1110
-	data8 0				// msgrcv
-	data8 0				// msgctl
-	data8 0				// shmget
-	data8 0				// shmat
-	data8 0				// shmdt		// 1115
-	data8 0				// shmctl
-	data8 0				// syslog
-	data8 0				// setitimer
-	data8 0				// getitimer
-	data8 0					 		// 1120
-	data8 0
-	data8 0
-	data8 0				// vhangup
-	data8 0				// lchown
-	data8 0				// remap_file_pages	// 1125
-	data8 0				// wait4
-	data8 0				// sysinfo
-	data8 0				// clone
-	data8 0				// setdomainname
-	data8 0				// newuname		// 1130
-	data8 0				// adjtimex
-	data8 0
-	data8 0				// init_module
-	data8 0				// delete_module
-	data8 0							// 1135
-	data8 0
-	data8 0				// quotactl
-	data8 0				// bdflush
-	data8 0				// sysfs
-	data8 0				// personality		// 1140
-	data8 0				// afs_syscall
-	data8 0				// setfsuid
-	data8 0				// setfsgid
-	data8 0				// getdents
-	data8 0				// flock		// 1145
-	data8 0				// readv
-	data8 0				// writev
-	data8 0				// pread64
-	data8 0				// pwrite64
-	data8 0				// sysctl		// 1150
-	data8 0				// mmap
-	data8 0				// munmap
-	data8 0				// mlock
-	data8 0				// mlockall
-	data8 0				// mprotect		// 1155
-	data8 0				// mremap
-	data8 0				// msync
-	data8 0				// munlock
-	data8 0				// munlockall
-	data8 0				// sched_getparam	// 1160
-	data8 0				// sched_setparam
-	data8 0				// sched_getscheduler
-	data8 0				// sched_setscheduler
-	data8 0				// sched_yield
-	data8 0				// sched_get_priority_max	// 1165
-	data8 0				// sched_get_priority_min
-	data8 0				// sched_rr_get_interval
-	data8 0				// nanosleep
-	data8 0				// nfsservctl
-	data8 0				// prctl		// 1170
-	data8 0				// getpagesize
-	data8 0				// mmap2
-	data8 0				// pciconfig_read
-	data8 0				// pciconfig_write
-	data8 0				// perfmonctl		// 1175
-	data8 0				// sigaltstack
-	data8 0				// rt_sigaction
-	data8 0				// rt_sigpending
-	data8 0				// rt_sigprocmask
-	data8 0				// rt_sigqueueinfo	// 1180
-	data8 0				// rt_sigreturn
-	data8 0				// rt_sigsuspend
-	data8 0				// rt_sigtimedwait
-	data8 0				// getcwd
-	data8 0				// capget		// 1185
-	data8 0				// capset
-	data8 0				// sendfile
-	data8 0
-	data8 0
-	data8 0				// socket		// 1190
-	data8 0				// bind
-	data8 0				// connect
-	data8 0				// listen
-	data8 0				// accept
-	data8 0				// getsockname		// 1195
-	data8 0				// getpeername
-	data8 0				// socketpair
-	data8 0				// send
-	data8 0				// sendto
-	data8 0				// recv			// 1200
-	data8 0				// recvfrom
-	data8 0				// shutdown
-	data8 0				// setsockopt
-	data8 0				// getsockopt
-	data8 0				// sendmsg		// 1205
-	data8 0				// recvmsg
-	data8 0				// pivot_root
-	data8 0				// mincore
-	data8 0				// madvise
-	data8 0				// newstat		// 1210
-	data8 0				// newlstat
-	data8 0				// newfstat
-	data8 0				// clone2
-	data8 0				// getdents64
-	data8 0				// getunwind		// 1215
-	data8 0				// readahead
-	data8 0				// setxattr
-	data8 0				// lsetxattr
-	data8 0				// fsetxattr
-	data8 0				// getxattr		// 1220
-	data8 0				// lgetxattr
-	data8 0				// fgetxattr
-	data8 0				// listxattr
-	data8 0				// llistxattr
-	data8 0				// flistxattr		// 1225
-	data8 0				// removexattr
-	data8 0				// lremovexattr
-	data8 0				// fremovexattr
-	data8 0				// tkill
-	data8 0				// futex		// 1230
-	data8 0				// sched_setaffinity
-	data8 0				// sched_getaffinity
-	data8 fsys_set_tid_address	// set_tid_address
-	data8 0				// fadvise64_64
-	data8 0				// tgkill		// 1235
-	data8 0				// exit_group
-	data8 0				// lookup_dcookie
-	data8 0				// io_setup
-	data8 0				// io_destroy
-	data8 0				// io_getevents		// 1240
-	data8 0				// io_submit
-	data8 0				// io_cancel
-	data8 0				// epoll_create
-	data8 0				// epoll_ctl
-	data8 0				// epoll_wait		// 1245
-	data8 0				// restart_syscall
-	data8 0				// semtimedop
-	data8 0				// timer_create
-	data8 0				// timer_settime
-	data8 0				// timer_gettime 	// 1250
-	data8 0				// timer_getoverrun
-	data8 0				// timer_delete
-	data8 0				// clock_settime
-	data8 fsys_clock_gettime	// clock_gettime
-	data8 0				// clock_getres		// 1255
-	data8 0				// clock_nanosleep
-	data8 0				// fstatfs64
-	data8 0				// statfs64
-	data8 0				// mbind
-	data8 0				// get_mempolicy	// 1260
-	data8 0				// set_mempolicy
-	data8 0				// mq_open
-	data8 0				// mq_unlink
-	data8 0				// mq_timedsend
-	data8 0				// mq_timedreceive	// 1265
-	data8 0				// mq_notify
-	data8 0				// mq_getsetattr
-	data8 0				// kexec_load
-	data8 0				// vserver
-	data8 0				// waitid		// 1270
-	data8 0				// add_key
-	data8 0				// request_key
-	data8 0				// keyctl
-	data8 0				// ioprio_set
-	data8 0				// ioprio_get		// 1275
-	data8 0				// move_pages
-	data8 0				// inotify_init
-	data8 0				// inotify_add_watch
-	data8 0				// inotify_rm_watch
-	data8 0				// migrate_pages	// 1280
-	data8 0				// openat
-	data8 0				// mkdirat
-	data8 0				// mknodat
-	data8 0				// fchownat
-	data8 0				// futimesat		// 1285
-	data8 0				// newfstatat
-	data8 0				// unlinkat
-	data8 0				// renameat
-	data8 0				// linkat
-	data8 0				// symlinkat		// 1290
-	data8 0				// readlinkat
-	data8 0				// fchmodat
-	data8 0				// faccessat
-	data8 0
-	data8 0							// 1295
-	data8 0				// unshare
-	data8 0				// splice
-	data8 0				// set_robust_list
-	data8 0				// get_robust_list
-	data8 0				// sync_file_range	// 1300
-	data8 0				// tee
-	data8 0				// vmsplice
-	data8 0
-	data8 fsys_getcpu		// getcpu		// 1304
-
-	// fill in zeros for the remaining entries
-	.zero:
-	.space fsyscall_table + 8*NR_syscalls - .zero, 0
diff --git a/arch/ia64/kernel/fsyscall_gtod_data.h b/arch/ia64/kernel/fsyscall_gtod_data.h
deleted file mode 100644
index cc2861445965..000000000000
--- a/arch/ia64/kernel/fsyscall_gtod_data.h
+++ /dev/null
@@ -1,30 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-/*
- * (c) Copyright 2007 Hewlett-Packard Development Company, L.P.
- *        Contributed by Peter Keilty <peter.keilty@hp.com>
- *
- * fsyscall gettimeofday data
- */
-
-/* like timespec, but includes "shifted nanoseconds" */
-struct time_sn_spec {
-	u64	sec;
-	u64	snsec;
-};
-
-struct fsyscall_gtod_data_t {
-	seqcount_t	seq;
-	struct time_sn_spec wall_time;
-	struct time_sn_spec monotonic_time;
-	u64		clk_mask;
-	u32		clk_mult;
-	u32		clk_shift;
-	void		*clk_fsys_mmio;
-	u64		clk_cycle_last;
-} ____cacheline_aligned;
-
-struct itc_jitter_data_t {
-	int		itc_jitter;
-	u64		itc_lastcycle;
-} ____cacheline_aligned;
-
diff --git a/arch/ia64/kernel/ftrace.c b/arch/ia64/kernel/ftrace.c
deleted file mode 100644
index d6360fd404ab..000000000000
--- a/arch/ia64/kernel/ftrace.c
+++ /dev/null
@@ -1,196 +0,0 @@
-/*
- * Dynamic function tracing support.
- *
- * Copyright (C) 2008 Shaohua Li <shaohua.li@intel.com>
- *
- * For licencing details, see COPYING.
- *
- * Defines low-level handling of mcount calls when the kernel
- * is compiled with the -pg flag. When using dynamic ftrace, the
- * mcount call-sites get patched lazily with NOP till they are
- * enabled. All code mutation routines here take effect atomically.
- */
-
-#include <linux/uaccess.h>
-#include <linux/ftrace.h>
-
-#include <asm/cacheflush.h>
-#include <asm/patch.h>
-
-/* In IA64, each function will be added below two bundles with -pg option */
-static unsigned char __attribute__((aligned(8)))
-ftrace_orig_code[MCOUNT_INSN_SIZE] = {
-	0x02, 0x40, 0x31, 0x10, 0x80, 0x05, /* alloc r40=ar.pfs,12,8,0 */
-	0xb0, 0x02, 0x00, 0x00, 0x42, 0x40, /* mov r43=r0;; */
-	0x05, 0x00, 0xc4, 0x00,             /* mov r42=b0 */
-	0x11, 0x48, 0x01, 0x02, 0x00, 0x21, /* mov r41=r1 */
-	0x00, 0x00, 0x00, 0x02, 0x00, 0x00, /* nop.i 0x0 */
-	0x08, 0x00, 0x00, 0x50              /* br.call.sptk.many b0 = _mcount;; */
-};
-
-struct ftrace_orig_insn {
-	u64 dummy1, dummy2, dummy3;
-	u64 dummy4:64-41+13;
-	u64 imm20:20;
-	u64 dummy5:3;
-	u64 sign:1;
-	u64 dummy6:4;
-};
-
-/* mcount stub will be converted below for nop */
-static unsigned char ftrace_nop_code[MCOUNT_INSN_SIZE] = {
-	0x00, 0x00, 0x00, 0x00, 0x01, 0x00, /* [MII] nop.m 0x0 */
-	0x30, 0x00, 0x00, 0x60, 0x00, 0x00, /* mov r3=ip */
-	0x00, 0x00, 0x04, 0x00,             /* nop.i 0x0 */
-	0x05, 0x00, 0x00, 0x00, 0x01, 0x00, /* [MLX] nop.m 0x0 */
-	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* nop.x 0x0;; */
-	0x00, 0x00, 0x04, 0x00
-};
-
-static unsigned char *ftrace_nop_replace(void)
-{
-	return ftrace_nop_code;
-}
-
-/*
- * mcount stub will be converted below for call
- * Note: Just the last instruction is changed against nop
- * */
-static unsigned char __attribute__((aligned(8)))
-ftrace_call_code[MCOUNT_INSN_SIZE] = {
-	0x00, 0x00, 0x00, 0x00, 0x01, 0x00, /* [MII] nop.m 0x0 */
-	0x30, 0x00, 0x00, 0x60, 0x00, 0x00, /* mov r3=ip */
-	0x00, 0x00, 0x04, 0x00,             /* nop.i 0x0 */
-	0x05, 0x00, 0x00, 0x00, 0x01, 0x00, /* [MLX] nop.m 0x0 */
-	0xff, 0xff, 0xff, 0xff, 0x7f, 0x00, /* brl.many .;;*/
-	0xf8, 0xff, 0xff, 0xc8
-};
-
-struct ftrace_call_insn {
-	u64 dummy1, dummy2;
-	u64 dummy3:48;
-	u64 imm39_l:16;
-	u64 imm39_h:23;
-	u64 dummy4:13;
-	u64 imm20:20;
-	u64 dummy5:3;
-	u64 i:1;
-	u64 dummy6:4;
-};
-
-static unsigned char *ftrace_call_replace(unsigned long ip, unsigned long addr)
-{
-	struct ftrace_call_insn *code = (void *)ftrace_call_code;
-	unsigned long offset = addr - (ip + 0x10);
-
-	code->imm39_l = offset >> 24;
-	code->imm39_h = offset >> 40;
-	code->imm20 = offset >> 4;
-	code->i = offset >> 63;
-	return ftrace_call_code;
-}
-
-static int
-ftrace_modify_code(unsigned long ip, unsigned char *old_code,
-		   unsigned char *new_code, int do_check)
-{
-	unsigned char replaced[MCOUNT_INSN_SIZE];
-
-	/*
-	 * Note:
-	 * We are paranoid about modifying text, as if a bug was to happen, it
-	 * could cause us to read or write to someplace that could cause harm.
-	 * Carefully read and modify the code with probe_kernel_*(), and make
-	 * sure what we read is what we expected it to be before modifying it.
-	 */
-
-	if (!do_check)
-		goto skip_check;
-
-	/* read the text we want to modify */
-	if (copy_from_kernel_nofault(replaced, (void *)ip, MCOUNT_INSN_SIZE))
-		return -EFAULT;
-
-	/* Make sure it is what we expect it to be */
-	if (memcmp(replaced, old_code, MCOUNT_INSN_SIZE) != 0)
-		return -EINVAL;
-
-skip_check:
-	/* replace the text with the new text */
-	if (copy_to_kernel_nofault(((void *)ip), new_code, MCOUNT_INSN_SIZE))
-		return -EPERM;
-	flush_icache_range(ip, ip + MCOUNT_INSN_SIZE);
-
-	return 0;
-}
-
-static int ftrace_make_nop_check(struct dyn_ftrace *rec, unsigned long addr)
-{
-	unsigned char __attribute__((aligned(8))) replaced[MCOUNT_INSN_SIZE];
-	unsigned long ip = rec->ip;
-
-	if (copy_from_kernel_nofault(replaced, (void *)ip, MCOUNT_INSN_SIZE))
-		return -EFAULT;
-	if (rec->flags & FTRACE_FL_CONVERTED) {
-		struct ftrace_call_insn *call_insn, *tmp_call;
-
-		call_insn = (void *)ftrace_call_code;
-		tmp_call = (void *)replaced;
-		call_insn->imm39_l = tmp_call->imm39_l;
-		call_insn->imm39_h = tmp_call->imm39_h;
-		call_insn->imm20 = tmp_call->imm20;
-		call_insn->i = tmp_call->i;
-		if (memcmp(replaced, ftrace_call_code, MCOUNT_INSN_SIZE) != 0)
-			return -EINVAL;
-		return 0;
-	} else {
-		struct ftrace_orig_insn *call_insn, *tmp_call;
-
-		call_insn = (void *)ftrace_orig_code;
-		tmp_call = (void *)replaced;
-		call_insn->sign = tmp_call->sign;
-		call_insn->imm20 = tmp_call->imm20;
-		if (memcmp(replaced, ftrace_orig_code, MCOUNT_INSN_SIZE) != 0)
-			return -EINVAL;
-		return 0;
-	}
-}
-
-int ftrace_make_nop(struct module *mod,
-		    struct dyn_ftrace *rec, unsigned long addr)
-{
-	int ret;
-	char *new;
-
-	ret = ftrace_make_nop_check(rec, addr);
-	if (ret)
-		return ret;
-	new = ftrace_nop_replace();
-	return ftrace_modify_code(rec->ip, NULL, new, 0);
-}
-
-int ftrace_make_call(struct dyn_ftrace *rec, unsigned long addr)
-{
-	unsigned long ip = rec->ip;
-	unsigned char *old, *new;
-
-	old=  ftrace_nop_replace();
-	new = ftrace_call_replace(ip, addr);
-	return ftrace_modify_code(ip, old, new, 1);
-}
-
-/* in IA64, _mcount can't directly call ftrace_stub. Only jump is ok */
-int ftrace_update_ftrace_func(ftrace_func_t func)
-{
-	unsigned long ip;
-	unsigned long addr = ((struct fnptr *)ftrace_call)->ip;
-
-	if (func == ftrace_stub)
-		return 0;
-	ip = ((struct fnptr *)func)->ip;
-
-	ia64_patch_imm64(addr + 2, ip);
-
-	flush_icache_range(addr, addr + 16);
-	return 0;
-}
diff --git a/arch/ia64/kernel/gate-data.S b/arch/ia64/kernel/gate-data.S
deleted file mode 100644
index b3ef1c72e132..000000000000
--- a/arch/ia64/kernel/gate-data.S
+++ /dev/null
@@ -1,3 +0,0 @@
-	.section .data..gate, "aw"
-
-	.incbin "arch/ia64/kernel/gate.so"
diff --git a/arch/ia64/kernel/gate.S b/arch/ia64/kernel/gate.S
deleted file mode 100644
index 9f235cd551ab..000000000000
--- a/arch/ia64/kernel/gate.S
+++ /dev/null
@@ -1,380 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-/*
- * This file contains the code that gets mapped at the upper end of each task's text
- * region.  For now, it contains the signal trampoline code only.
- *
- * Copyright (C) 1999-2003 Hewlett-Packard Co
- * 	David Mosberger-Tang <davidm@hpl.hp.com>
- */
-
-
-#include <asm/asmmacro.h>
-#include <asm/errno.h>
-#include <asm/asm-offsets.h>
-#include <asm/sigcontext.h>
-#include <asm/unistd.h>
-#include <asm/kregs.h>
-#include <asm/page.h>
-#include <asm/native/inst.h>
-
-/*
- * We can't easily refer to symbols inside the kernel.  To avoid full runtime relocation,
- * complications with the linker (which likes to create PLT stubs for branches
- * to targets outside the shared object) and to avoid multi-phase kernel builds, we
- * simply create minimalistic "patch lists" in special ELF sections.
- */
-	.section ".data..patch.fsyscall_table", "a"
-	.previous
-#define LOAD_FSYSCALL_TABLE(reg)			\
-[1:]	movl reg=0;					\
-	.xdata4 ".data..patch.fsyscall_table", 1b-.
-
-	.section ".data..patch.brl_fsys_bubble_down", "a"
-	.previous
-#define BRL_COND_FSYS_BUBBLE_DOWN(pr)			\
-[1:](pr)brl.cond.sptk 0;				\
-	;;						\
-	.xdata4 ".data..patch.brl_fsys_bubble_down", 1b-.
-
-GLOBAL_ENTRY(__kernel_syscall_via_break)
-	.prologue
-	.altrp b6
-	.body
-	/*
-	 * Note: for (fast) syscall restart to work, the break instruction must be
-	 *	 the first one in the bundle addressed by syscall_via_break.
-	 */
-{ .mib
-	break 0x100000
-	nop.i 0
-	br.ret.sptk.many b6
-}
-END(__kernel_syscall_via_break)
-
-#	define ARG0_OFF		(16 + IA64_SIGFRAME_ARG0_OFFSET)
-#	define ARG1_OFF		(16 + IA64_SIGFRAME_ARG1_OFFSET)
-#	define ARG2_OFF		(16 + IA64_SIGFRAME_ARG2_OFFSET)
-#	define SIGHANDLER_OFF	(16 + IA64_SIGFRAME_HANDLER_OFFSET)
-#	define SIGCONTEXT_OFF	(16 + IA64_SIGFRAME_SIGCONTEXT_OFFSET)
-
-#	define FLAGS_OFF	IA64_SIGCONTEXT_FLAGS_OFFSET
-#	define CFM_OFF		IA64_SIGCONTEXT_CFM_OFFSET
-#	define FR6_OFF		IA64_SIGCONTEXT_FR6_OFFSET
-#	define BSP_OFF		IA64_SIGCONTEXT_AR_BSP_OFFSET
-#	define RNAT_OFF		IA64_SIGCONTEXT_AR_RNAT_OFFSET
-#	define UNAT_OFF		IA64_SIGCONTEXT_AR_UNAT_OFFSET
-#	define FPSR_OFF		IA64_SIGCONTEXT_AR_FPSR_OFFSET
-#	define PR_OFF		IA64_SIGCONTEXT_PR_OFFSET
-#	define RP_OFF		IA64_SIGCONTEXT_IP_OFFSET
-#	define SP_OFF		IA64_SIGCONTEXT_R12_OFFSET
-#	define RBS_BASE_OFF	IA64_SIGCONTEXT_RBS_BASE_OFFSET
-#	define LOADRS_OFF	IA64_SIGCONTEXT_LOADRS_OFFSET
-#	define base0		r2
-#	define base1		r3
-	/*
-	 * When we get here, the memory stack looks like this:
-	 *
-	 *   +===============================+
-       	 *   |				     |
-       	 *   //	    struct sigframe          //
-       	 *   |				     |
-	 *   +-------------------------------+ <-- sp+16
-	 *   |      16 byte of scratch       |
-	 *   |            space              |
-	 *   +-------------------------------+ <-- sp
-	 *
-	 * The register stack looks _exactly_ the way it looked at the time the signal
-	 * occurred.  In other words, we're treading on a potential mine-field: each
-	 * incoming general register may be a NaT value (including sp, in which case the
-	 * process ends up dying with a SIGSEGV).
-	 *
-	 * The first thing need to do is a cover to get the registers onto the backing
-	 * store.  Once that is done, we invoke the signal handler which may modify some
-	 * of the machine state.  After returning from the signal handler, we return
-	 * control to the previous context by executing a sigreturn system call.  A signal
-	 * handler may call the rt_sigreturn() function to directly return to a given
-	 * sigcontext.  However, the user-level sigreturn() needs to do much more than
-	 * calling the rt_sigreturn() system call as it needs to unwind the stack to
-	 * restore preserved registers that may have been saved on the signal handler's
-	 * call stack.
-	 */
-
-#define SIGTRAMP_SAVES										\
-	.unwabi 3, 's';		/* mark this as a sigtramp handler (saves scratch regs) */	\
-	.unwabi @svr4, 's'; /* backwards compatibility with old unwinders (remove in v2.7) */	\
-	.savesp ar.unat, UNAT_OFF+SIGCONTEXT_OFF;						\
-	.savesp ar.fpsr, FPSR_OFF+SIGCONTEXT_OFF;						\
-	.savesp pr, PR_OFF+SIGCONTEXT_OFF;     							\
-	.savesp rp, RP_OFF+SIGCONTEXT_OFF;							\
-	.savesp ar.pfs, CFM_OFF+SIGCONTEXT_OFF;							\
-	.vframesp SP_OFF+SIGCONTEXT_OFF
-
-GLOBAL_ENTRY(__kernel_sigtramp)
-	// describe the state that is active when we get here:
-	.prologue
-	SIGTRAMP_SAVES
-	.body
-
-	.label_state 1
-
-	adds base0=SIGHANDLER_OFF,sp
-	adds base1=RBS_BASE_OFF+SIGCONTEXT_OFF,sp
-	br.call.sptk.many rp=1f
-1:
-	ld8 r17=[base0],(ARG0_OFF-SIGHANDLER_OFF)	// get pointer to signal handler's plabel
-	ld8 r15=[base1]					// get address of new RBS base (or NULL)
-	cover				// push args in interrupted frame onto backing store
-	;;
-	cmp.ne p1,p0=r15,r0		// do we need to switch rbs? (note: pr is saved by kernel)
-	mov.m r9=ar.bsp			// fetch ar.bsp
-	.spillsp.p p1, ar.rnat, RNAT_OFF+SIGCONTEXT_OFF
-(p1)	br.cond.spnt setup_rbs		// yup -> (clobbers p8, r14-r16, and r18-r20)
-back_from_setup_rbs:
-	alloc r8=ar.pfs,0,0,3,0
-	ld8 out0=[base0],16		// load arg0 (signum)
-	adds base1=(ARG1_OFF-(RBS_BASE_OFF+SIGCONTEXT_OFF)),base1
-	;;
-	ld8 out1=[base1]		// load arg1 (siginfop)
-	ld8 r10=[r17],8			// get signal handler entry point
-	;;
-	ld8 out2=[base0]		// load arg2 (sigcontextp)
-	ld8 gp=[r17]			// get signal handler's global pointer
-	adds base0=(BSP_OFF+SIGCONTEXT_OFF),sp
-	;;
-	.spillsp ar.bsp, BSP_OFF+SIGCONTEXT_OFF
-	st8 [base0]=r9			// save sc_ar_bsp
-	adds base0=(FR6_OFF+SIGCONTEXT_OFF),sp
-	adds base1=(FR6_OFF+16+SIGCONTEXT_OFF),sp
-	;;
-	stf.spill [base0]=f6,32
-	stf.spill [base1]=f7,32
-	;;
-	stf.spill [base0]=f8,32
-	stf.spill [base1]=f9,32
-	mov b6=r10
-	;;
-	stf.spill [base0]=f10,32
-	stf.spill [base1]=f11,32
-	;;
-	stf.spill [base0]=f12,32
-	stf.spill [base1]=f13,32
-	;;
-	stf.spill [base0]=f14,32
-	stf.spill [base1]=f15,32
-	br.call.sptk.many rp=b6			// call the signal handler
-.ret0:	adds base0=(BSP_OFF+SIGCONTEXT_OFF),sp
-	;;
-	ld8 r15=[base0]				// fetch sc_ar_bsp
-	mov r14=ar.bsp
-	;;
-	cmp.ne p1,p0=r14,r15			// do we need to restore the rbs?
-(p1)	br.cond.spnt restore_rbs		// yup -> (clobbers r14-r18, f6 & f7)
-	;;
-back_from_restore_rbs:
-	adds base0=(FR6_OFF+SIGCONTEXT_OFF),sp
-	adds base1=(FR6_OFF+16+SIGCONTEXT_OFF),sp
-	;;
-	ldf.fill f6=[base0],32
-	ldf.fill f7=[base1],32
-	;;
-	ldf.fill f8=[base0],32
-	ldf.fill f9=[base1],32
-	;;
-	ldf.fill f10=[base0],32
-	ldf.fill f11=[base1],32
-	;;
-	ldf.fill f12=[base0],32
-	ldf.fill f13=[base1],32
-	;;
-	ldf.fill f14=[base0],32
-	ldf.fill f15=[base1],32
-	mov r15=__NR_rt_sigreturn
-	.restore sp				// pop .prologue
-	break __BREAK_SYSCALL
-
-	.prologue
-	SIGTRAMP_SAVES
-setup_rbs:
-	mov ar.rsc=0				// put RSE into enforced lazy mode
-	;;
-	.save ar.rnat, r19
-	mov r19=ar.rnat				// save RNaT before switching backing store area
-	adds r14=(RNAT_OFF+SIGCONTEXT_OFF),sp
-
-	mov r18=ar.bspstore
-	mov ar.bspstore=r15			// switch over to new register backing store area
-	;;
-
-	.spillsp ar.rnat, RNAT_OFF+SIGCONTEXT_OFF
-	st8 [r14]=r19				// save sc_ar_rnat
-	.body
-	mov.m r16=ar.bsp			// sc_loadrs <- (new bsp - new bspstore) << 16
-	adds r14=(LOADRS_OFF+SIGCONTEXT_OFF),sp
-	;;
-	invala
-	sub r15=r16,r15
-	extr.u r20=r18,3,6
-	;;
-	mov ar.rsc=0xf				// set RSE into eager mode, pl 3
-	cmp.eq p8,p0=63,r20
-	shl r15=r15,16
-	;;
-	st8 [r14]=r15				// save sc_loadrs
-(p8)	st8 [r18]=r19		// if bspstore points at RNaT slot, store RNaT there now
-	.restore sp				// pop .prologue
-	br.cond.sptk back_from_setup_rbs
-
-	.prologue
-	SIGTRAMP_SAVES
-	.spillsp ar.rnat, RNAT_OFF+SIGCONTEXT_OFF
-	.body
-restore_rbs:
-	// On input:
-	//	r14 = bsp1 (bsp at the time of return from signal handler)
-	//	r15 = bsp0 (bsp at the time the signal occurred)
-	//
-	// Here, we need to calculate bspstore0, the value that ar.bspstore needs
-	// to be set to, based on bsp0 and the size of the dirty partition on
-	// the alternate stack (sc_loadrs >> 16).  This can be done with the
-	// following algorithm:
-	//
-	//  bspstore0 = rse_skip_regs(bsp0, -rse_num_regs(bsp1 - (loadrs >> 19), bsp1));
-	//
-	// This is what the code below does.
-	//
-	alloc r2=ar.pfs,0,0,0,0			// alloc null frame
-	adds r16=(LOADRS_OFF+SIGCONTEXT_OFF),sp
-	adds r18=(RNAT_OFF+SIGCONTEXT_OFF),sp
-	;;
-	ld8 r17=[r16]
-	ld8 r16=[r18]			// get new rnat
-	extr.u r18=r15,3,6	// r18 <- rse_slot_num(bsp0)
-	;;
-	mov ar.rsc=r17			// put RSE into enforced lazy mode
-	shr.u r17=r17,16
-	;;
-	sub r14=r14,r17		// r14 (bspstore1) <- bsp1 - (sc_loadrs >> 16)
-	shr.u r17=r17,3		// r17 <- (sc_loadrs >> 19)
-	;;
-	loadrs			// restore dirty partition
-	extr.u r14=r14,3,6	// r14 <- rse_slot_num(bspstore1)
-	;;
-	add r14=r14,r17		// r14 <- rse_slot_num(bspstore1) + (sc_loadrs >> 19)
-	;;
-	shr.u r14=r14,6		// r14 <- (rse_slot_num(bspstore1) + (sc_loadrs >> 19))/0x40
-	;;
-	sub r14=r14,r17		// r14 <- -rse_num_regs(bspstore1, bsp1)
-	movl r17=0x8208208208208209
-	;;
-	add r18=r18,r14		// r18 (delta) <- rse_slot_num(bsp0) - rse_num_regs(bspstore1,bsp1)
-	setf.sig f7=r17
-	cmp.lt p7,p0=r14,r0	// p7 <- (r14 < 0)?
-	;;
-(p7)	adds r18=-62,r18	// delta -= 62
-	;;
-	setf.sig f6=r18
-	;;
-	xmpy.h f6=f6,f7
-	;;
-	getf.sig r17=f6
-	;;
-	add r17=r17,r18
-	shr r18=r18,63
-	;;
-	shr r17=r17,5
-	;;
-	sub r17=r17,r18		// r17 = delta/63
-	;;
-	add r17=r14,r17		// r17 <- delta/63 - rse_num_regs(bspstore1, bsp1)
-	;;
-	shladd r15=r17,3,r15	// r15 <- bsp0 + 8*(delta/63 - rse_num_regs(bspstore1, bsp1))
-	;;
-	mov ar.bspstore=r15			// switch back to old register backing store area
-	;;
-	mov ar.rnat=r16				// restore RNaT
-	mov ar.rsc=0xf				// (will be restored later on from sc_ar_rsc)
-	// invala not necessary as that will happen when returning to user-mode
-	br.cond.sptk back_from_restore_rbs
-END(__kernel_sigtramp)
-
-/*
- * On entry:
- *	r11 = saved ar.pfs
- *	r15 = system call #
- *	b0  = saved return address
- *	b6  = return address
- * On exit:
- *	r11 = saved ar.pfs
- *	r15 = system call #
- *	b0  = saved return address
- *	all other "scratch" registers:	undefined
- *	all "preserved" registers:	same as on entry
- */
-
-GLOBAL_ENTRY(__kernel_syscall_via_epc)
-	.prologue
-	.altrp b6
-	.body
-{
-	/*
-	 * Note: the kernel cannot assume that the first two instructions in this
-	 * bundle get executed.  The remaining code must be safe even if
-	 * they do not get executed.
-	 */
-	adds r17=-1024,r15			// A
-	mov r10=0				// A    default to successful syscall execution
-	epc					// B	causes split-issue
-}
-	;;
-	RSM_PSR_BE_I(r20, r22)			// M2 (5 cyc to srlz.d)
-	LOAD_FSYSCALL_TABLE(r14)		// X
-	;;
-	mov r16=IA64_KR(CURRENT)		// M2 (12 cyc)
-	shladd r18=r17,3,r14			// A
-	mov r19=NR_syscalls-1			// A
-	;;
-	lfetch [r18]				// M0|1
-	MOV_FROM_PSR(p0, r29, r8)		// M2 (12 cyc)
-	// If r17 is a NaT, p6 will be zero
-	cmp.geu p6,p7=r19,r17			// A    (sysnr > 0 && sysnr < 1024+NR_syscalls)?
-	;;
-	mov r21=ar.fpsr				// M2 (12 cyc)
-	tnat.nz p10,p9=r15			// I0
-	mov.i r26=ar.pfs			// I0 (would stall anyhow due to srlz.d...)
-	;;
-	srlz.d					// M0 (forces split-issue) ensure PSR.BE==0
-(p6)	ld8 r18=[r18]				// M0|1
-	nop.i 0
-	;;
-	nop.m 0
-(p6)	tbit.z.unc p8,p0=r18,0			// I0 (dual-issues with "mov b7=r18"!)
-	nop.i 0
-	;;
-	SSM_PSR_I(p8, p14, r25)
-(p6)	mov b7=r18				// I0
-(p8)	br.dptk.many b7				// B
-
-	mov r27=ar.rsc				// M2 (12 cyc)
-/*
- * brl.cond doesn't work as intended because the linker would convert this branch
- * into a branch to a PLT.  Perhaps there will be a way to avoid this with some
- * future version of the linker.  In the meantime, we just use an indirect branch
- * instead.
- */
-#ifdef CONFIG_ITANIUM
-(p6)	add r14=-8,r14				// r14 <- addr of fsys_bubble_down entry
-	;;
-(p6)	ld8 r14=[r14]				// r14 <- fsys_bubble_down
-	;;
-(p6)	mov b7=r14
-(p6)	br.sptk.many b7
-#else
-	BRL_COND_FSYS_BUBBLE_DOWN(p6)
-#endif
-	SSM_PSR_I(p0, p14, r10)
-	mov r10=-1
-(p10)	mov r8=EINVAL
-(p9)	mov r8=ENOSYS
-	FSYS_RETURN
-
-END(__kernel_syscall_via_epc)
diff --git a/arch/ia64/kernel/gate.lds.S b/arch/ia64/kernel/gate.lds.S
deleted file mode 100644
index 461c7e69d465..000000000000
--- a/arch/ia64/kernel/gate.lds.S
+++ /dev/null
@@ -1,108 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-/*
- * Linker script for gate DSO.  The gate pages are an ELF shared object
- * prelinked to its virtual address, with only one read-only segment and
- * one execute-only segment (both fit in one page).  This script controls
- * its layout.
- */
-
-#include <asm/page.h>
-
-SECTIONS
-{
-	. = GATE_ADDR + SIZEOF_HEADERS;
-
-	.hash			: { *(.hash) }		:readable
-	.gnu.hash		: { *(.gnu.hash) }
-	.dynsym			: { *(.dynsym) }
-	.dynstr			: { *(.dynstr) }
-	.gnu.version		: { *(.gnu.version) }
-	.gnu.version_d		: { *(.gnu.version_d) }
-	.gnu.version_r		: { *(.gnu.version_r) }
-
-	.note			: { *(.note*) }		:readable	:note
-
-	.dynamic		: { *(.dynamic) }	:readable	:dynamic
-
-	/*
-	 * This linker script is used both with -r and with -shared.  For
-	 * the layouts to match, we need to skip more than enough space for
-	 * the dynamic symbol table et al.  If this amount is insufficient,
-	 * ld -shared will barf.  Just increase it here.
-	 */
-	. = GATE_ADDR + 0x600;
-
-	.data..patch		: {
-		__start_gate_mckinley_e9_patchlist = .;
-		*(.data..patch.mckinley_e9)
-		__end_gate_mckinley_e9_patchlist = .;
-
-		__start_gate_vtop_patchlist = .;
-		*(.data..patch.vtop)
-		__end_gate_vtop_patchlist = .;
-
-		__start_gate_fsyscall_patchlist = .;
-		*(.data..patch.fsyscall_table)
-		__end_gate_fsyscall_patchlist = .;
-
-		__start_gate_brl_fsys_bubble_down_patchlist = .;
-		*(.data..patch.brl_fsys_bubble_down)
-		__end_gate_brl_fsys_bubble_down_patchlist = .;
-	}						:readable
-
-	.IA_64.unwind_info	: { *(.IA_64.unwind_info*) }
-	.IA_64.unwind		: { *(.IA_64.unwind*) }	:readable	:unwind
-#ifdef HAVE_BUGGY_SEGREL
-	.text (GATE_ADDR + PAGE_SIZE) : { *(.text) *(.text.*) }	:readable
-#else
-	. = ALIGN(PERCPU_PAGE_SIZE) + (. & (PERCPU_PAGE_SIZE - 1));
-	.text			: { *(.text) *(.text.*) }	:epc
-#endif
-
-	/DISCARD/		: {
-		*(.got.plt) *(.got)
-		*(.data .data.* .gnu.linkonce.d.*)
-		*(.dynbss)
-		*(.bss .bss.* .gnu.linkonce.b.*)
-		*(__ex_table)
-		*(__mca_table)
-	}
-}
-
-/*
- * ld does not recognize this name token; use the constant.
- */
-#define	PT_IA_64_UNWIND	0x70000001
-
-/*
- * We must supply the ELF program headers explicitly to get just one
- * PT_LOAD segment, and set the flags explicitly to make segments read-only.
- */
-PHDRS
-{
-	readable	PT_LOAD	FILEHDR	PHDRS	FLAGS(4);	/* PF_R */
-#ifndef HAVE_BUGGY_SEGREL
-	epc		PT_LOAD	FILEHDR PHDRS	FLAGS(1);	/* PF_X */
-#endif
-	dynamic		PT_DYNAMIC		FLAGS(4);	/* PF_R */
-	note		PT_NOTE			FLAGS(4);	/* PF_R */
-	unwind		PT_IA_64_UNWIND;
-}
-
-/*
- * This controls what symbols we export from the DSO.
- */
-VERSION
-{
-	LINUX_2.5 {
-	global:
-		__kernel_syscall_via_break;
-		__kernel_syscall_via_epc;
-		__kernel_sigtramp;
-
-	local: *;
-	};
-}
-
-/* The ELF entry point can be used to set the AT_SYSINFO value.  */
-ENTRY(__kernel_syscall_via_epc)
diff --git a/arch/ia64/kernel/head.S b/arch/ia64/kernel/head.S
deleted file mode 100644
index 85c8a57da402..000000000000
--- a/arch/ia64/kernel/head.S
+++ /dev/null
@@ -1,1167 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-/*
- * Here is where the ball gets rolling as far as the kernel is concerned.
- * When control is transferred to _start, the bootload has already
- * loaded us to the correct address.  All that's left to do here is
- * to set up the kernel's global pointer and jump to the kernel
- * entry point.
- *
- * Copyright (C) 1998-2001, 2003, 2005 Hewlett-Packard Co
- *	David Mosberger-Tang <davidm@hpl.hp.com>
- *	Stephane Eranian <eranian@hpl.hp.com>
- * Copyright (C) 1999 VA Linux Systems
- * Copyright (C) 1999 Walt Drummond <drummond@valinux.com>
- * Copyright (C) 1999 Intel Corp.
- * Copyright (C) 1999 Asit Mallick <Asit.K.Mallick@intel.com>
- * Copyright (C) 1999 Don Dugger <Don.Dugger@intel.com>
- * Copyright (C) 2002 Fenghua Yu <fenghua.yu@intel.com>
- *   -Optimize __ia64_save_fpu() and __ia64_load_fpu() for Itanium 2.
- * Copyright (C) 2004 Ashok Raj <ashok.raj@intel.com>
- *   Support for CPU Hotplug
- */
-
-#include <linux/export.h>
-#include <linux/pgtable.h>
-#include <asm/asmmacro.h>
-#include <asm/fpu.h>
-#include <asm/kregs.h>
-#include <asm/mmu_context.h>
-#include <asm/asm-offsets.h>
-#include <asm/pal.h>
-#include <asm/processor.h>
-#include <asm/ptrace.h>
-#include <asm/mca_asm.h>
-#include <linux/init.h>
-#include <linux/linkage.h>
-
-#ifdef CONFIG_HOTPLUG_CPU
-#define SAL_PSR_BITS_TO_SET				\
-	(IA64_PSR_AC | IA64_PSR_BN | IA64_PSR_MFH | IA64_PSR_MFL)
-
-#define SAVE_FROM_REG(src, ptr, dest)	\
-	mov dest=src;;						\
-	st8 [ptr]=dest,0x08
-
-#define RESTORE_REG(reg, ptr, _tmp)		\
-	ld8 _tmp=[ptr],0x08;;				\
-	mov reg=_tmp
-
-#define SAVE_BREAK_REGS(ptr, _idx, _breg, _dest)\
-	mov ar.lc=IA64_NUM_DBG_REGS-1;; 			\
-	mov _idx=0;; 								\
-1: 												\
-	SAVE_FROM_REG(_breg[_idx], ptr, _dest);;	\
-	add _idx=1,_idx;;							\
-	br.cloop.sptk.many 1b
-
-#define RESTORE_BREAK_REGS(ptr, _idx, _breg, _tmp, _lbl)\
-	mov ar.lc=IA64_NUM_DBG_REGS-1;;			\
-	mov _idx=0;;							\
-_lbl:  RESTORE_REG(_breg[_idx], ptr, _tmp);;	\
-	add _idx=1, _idx;;						\
-	br.cloop.sptk.many _lbl
-
-#define SAVE_ONE_RR(num, _reg, _tmp) \
-	movl _tmp=(num<<61);;	\
-	mov _reg=rr[_tmp]
-
-#define SAVE_REGION_REGS(_tmp, _r0, _r1, _r2, _r3, _r4, _r5, _r6, _r7) \
-	SAVE_ONE_RR(0,_r0, _tmp);; \
-	SAVE_ONE_RR(1,_r1, _tmp);; \
-	SAVE_ONE_RR(2,_r2, _tmp);; \
-	SAVE_ONE_RR(3,_r3, _tmp);; \
-	SAVE_ONE_RR(4,_r4, _tmp);; \
-	SAVE_ONE_RR(5,_r5, _tmp);; \
-	SAVE_ONE_RR(6,_r6, _tmp);; \
-	SAVE_ONE_RR(7,_r7, _tmp);;
-
-#define STORE_REGION_REGS(ptr, _r0, _r1, _r2, _r3, _r4, _r5, _r6, _r7) \
-	st8 [ptr]=_r0, 8;; \
-	st8 [ptr]=_r1, 8;; \
-	st8 [ptr]=_r2, 8;; \
-	st8 [ptr]=_r3, 8;; \
-	st8 [ptr]=_r4, 8;; \
-	st8 [ptr]=_r5, 8;; \
-	st8 [ptr]=_r6, 8;; \
-	st8 [ptr]=_r7, 8;;
-
-#define RESTORE_REGION_REGS(ptr, _idx1, _idx2, _tmp) \
-	mov		ar.lc=0x08-1;;						\
-	movl	_idx1=0x00;;						\
-RestRR:											\
-	dep.z	_idx2=_idx1,61,3;;					\
-	ld8		_tmp=[ptr],8;;						\
-	mov		rr[_idx2]=_tmp;;					\
-	srlz.d;;									\
-	add		_idx1=1,_idx1;;						\
-	br.cloop.sptk.few	RestRR
-
-#define SET_AREA_FOR_BOOTING_CPU(reg1, reg2) \
-	movl reg1=sal_state_for_booting_cpu;;	\
-	ld8 reg2=[reg1];;
-
-/*
- * Adjust region registers saved before starting to save
- * break regs and rest of the states that need to be preserved.
- */
-#define SAL_TO_OS_BOOT_HANDOFF_STATE_SAVE(_reg1,_reg2,_pred)  \
-	SAVE_FROM_REG(b0,_reg1,_reg2);;						\
-	SAVE_FROM_REG(b1,_reg1,_reg2);;						\
-	SAVE_FROM_REG(b2,_reg1,_reg2);;						\
-	SAVE_FROM_REG(b3,_reg1,_reg2);;						\
-	SAVE_FROM_REG(b4,_reg1,_reg2);;						\
-	SAVE_FROM_REG(b5,_reg1,_reg2);;						\
-	st8 [_reg1]=r1,0x08;;								\
-	st8 [_reg1]=r12,0x08;;								\
-	st8 [_reg1]=r13,0x08;;								\
-	SAVE_FROM_REG(ar.fpsr,_reg1,_reg2);;				\
-	SAVE_FROM_REG(ar.pfs,_reg1,_reg2);;					\
-	SAVE_FROM_REG(ar.rnat,_reg1,_reg2);;				\
-	SAVE_FROM_REG(ar.unat,_reg1,_reg2);;				\
-	SAVE_FROM_REG(ar.bspstore,_reg1,_reg2);;			\
-	SAVE_FROM_REG(cr.dcr,_reg1,_reg2);;					\
-	SAVE_FROM_REG(cr.iva,_reg1,_reg2);;					\
-	SAVE_FROM_REG(cr.pta,_reg1,_reg2);;					\
-	SAVE_FROM_REG(cr.itv,_reg1,_reg2);;					\
-	SAVE_FROM_REG(cr.pmv,_reg1,_reg2);;					\
-	SAVE_FROM_REG(cr.cmcv,_reg1,_reg2);;				\
-	SAVE_FROM_REG(cr.lrr0,_reg1,_reg2);;				\
-	SAVE_FROM_REG(cr.lrr1,_reg1,_reg2);;				\
-	st8 [_reg1]=r4,0x08;;								\
-	st8 [_reg1]=r5,0x08;;								\
-	st8 [_reg1]=r6,0x08;;								\
-	st8 [_reg1]=r7,0x08;;								\
-	st8 [_reg1]=_pred,0x08;;							\
-	SAVE_FROM_REG(ar.lc, _reg1, _reg2);;				\
-	stf.spill.nta [_reg1]=f2,16;;						\
-	stf.spill.nta [_reg1]=f3,16;;						\
-	stf.spill.nta [_reg1]=f4,16;;						\
-	stf.spill.nta [_reg1]=f5,16;;						\
-	stf.spill.nta [_reg1]=f16,16;;						\
-	stf.spill.nta [_reg1]=f17,16;;						\
-	stf.spill.nta [_reg1]=f18,16;;						\
-	stf.spill.nta [_reg1]=f19,16;;						\
-	stf.spill.nta [_reg1]=f20,16;;						\
-	stf.spill.nta [_reg1]=f21,16;;						\
-	stf.spill.nta [_reg1]=f22,16;;						\
-	stf.spill.nta [_reg1]=f23,16;;						\
-	stf.spill.nta [_reg1]=f24,16;;						\
-	stf.spill.nta [_reg1]=f25,16;;						\
-	stf.spill.nta [_reg1]=f26,16;;						\
-	stf.spill.nta [_reg1]=f27,16;;						\
-	stf.spill.nta [_reg1]=f28,16;;						\
-	stf.spill.nta [_reg1]=f29,16;;						\
-	stf.spill.nta [_reg1]=f30,16;;						\
-	stf.spill.nta [_reg1]=f31,16;;
-
-#else
-#define SET_AREA_FOR_BOOTING_CPU(a1, a2)
-#define SAL_TO_OS_BOOT_HANDOFF_STATE_SAVE(a1,a2, a3)
-#define SAVE_REGION_REGS(_tmp, _r0, _r1, _r2, _r3, _r4, _r5, _r6, _r7)
-#define STORE_REGION_REGS(ptr, _r0, _r1, _r2, _r3, _r4, _r5, _r6, _r7)
-#endif
-
-#define SET_ONE_RR(num, pgsize, _tmp1, _tmp2, vhpt) \
-	movl _tmp1=(num << 61);;	\
-	mov _tmp2=((ia64_rid(IA64_REGION_ID_KERNEL, (num<<61)) << 8) | (pgsize << 2) | vhpt);; \
-	mov rr[_tmp1]=_tmp2
-
-	__PAGE_ALIGNED_DATA
-
-	.global empty_zero_page
-EXPORT_SYMBOL_GPL(empty_zero_page)
-empty_zero_page:
-	.skip PAGE_SIZE
-
-	.global swapper_pg_dir
-swapper_pg_dir:
-	.skip PAGE_SIZE
-
-	.rodata
-halt_msg:
-	stringz "Halting kernel\n"
-
-	__REF
-
-	.global start_ap
-
-	/*
-	 * Start the kernel.  When the bootloader passes control to _start(), r28
-	 * points to the address of the boot parameter area.  Execution reaches
-	 * here in physical mode.
-	 */
-GLOBAL_ENTRY(_start)
-start_ap:
-	.prologue
-	.save rp, r0		// terminate unwind chain with a NULL rp
-	.body
-
-	rsm psr.i | psr.ic
-	;;
-	srlz.i
-	;;
- {
-	flushrs				// must be first insn in group
-	srlz.i
- }
-	;;
-	/*
-	 * Save the region registers, predicate before they get clobbered
-	 */
-	SAVE_REGION_REGS(r2, r8,r9,r10,r11,r12,r13,r14,r15);
-	mov r25=pr;;
-
-	/*
-	 * Initialize kernel region registers:
-	 *	rr[0]: VHPT enabled, page size = PAGE_SHIFT
-	 *	rr[1]: VHPT enabled, page size = PAGE_SHIFT
-	 *	rr[2]: VHPT enabled, page size = PAGE_SHIFT
-	 *	rr[3]: VHPT enabled, page size = PAGE_SHIFT
-	 *	rr[4]: VHPT enabled, page size = PAGE_SHIFT
-	 *	rr[5]: VHPT enabled, page size = PAGE_SHIFT
-	 *	rr[6]: VHPT disabled, page size = IA64_GRANULE_SHIFT
-	 *	rr[7]: VHPT disabled, page size = IA64_GRANULE_SHIFT
-	 * We initialize all of them to prevent inadvertently assuming
-	 * something about the state of address translation early in boot.
-	 */
-	SET_ONE_RR(0, PAGE_SHIFT, r2, r16, 1);;
-	SET_ONE_RR(1, PAGE_SHIFT, r2, r16, 1);;
-	SET_ONE_RR(2, PAGE_SHIFT, r2, r16, 1);;
-	SET_ONE_RR(3, PAGE_SHIFT, r2, r16, 1);;
-	SET_ONE_RR(4, PAGE_SHIFT, r2, r16, 1);;
-	SET_ONE_RR(5, PAGE_SHIFT, r2, r16, 1);;
-	SET_ONE_RR(6, IA64_GRANULE_SHIFT, r2, r16, 0);;
-	SET_ONE_RR(7, IA64_GRANULE_SHIFT, r2, r16, 0);;
-	/*
-	 * Now pin mappings into the TLB for kernel text and data
-	 */
-	mov r18=KERNEL_TR_PAGE_SHIFT<<2
-	movl r17=KERNEL_START
-	;;
-	mov cr.itir=r18
-	mov cr.ifa=r17
-	mov r16=IA64_TR_KERNEL
-	mov r3=ip
-	movl r18=PAGE_KERNEL
-	;;
-	dep r2=0,r3,0,KERNEL_TR_PAGE_SHIFT
-	;;
-	or r18=r2,r18
-	;;
-	srlz.i
-	;;
-	itr.i itr[r16]=r18
-	;;
-	itr.d dtr[r16]=r18
-	;;
-	srlz.i
-
-	/*
-	 * Switch into virtual mode:
-	 */
-	movl r16=(IA64_PSR_IT|IA64_PSR_IC|IA64_PSR_DT|IA64_PSR_RT|IA64_PSR_DFH|IA64_PSR_BN \
-		  |IA64_PSR_DI)
-	;;
-	mov cr.ipsr=r16
-	movl r17=1f
-	;;
-	mov cr.iip=r17
-	mov cr.ifs=r0
-	;;
-	rfi
-	;;
-1:	// now we are in virtual mode
-
-	SET_AREA_FOR_BOOTING_CPU(r2, r16);
-
-	STORE_REGION_REGS(r16, r8,r9,r10,r11,r12,r13,r14,r15);
-	SAL_TO_OS_BOOT_HANDOFF_STATE_SAVE(r16,r17,r25)
-	;;
-
-	// set IVT entry point---can't access I/O ports without it
-	movl r3=ia64_ivt
-	;;
-	mov cr.iva=r3
-	movl r2=FPSR_DEFAULT
-	;;
-	srlz.i
-	movl gp=__gp
-
-	mov ar.fpsr=r2
-	;;
-
-#define isAP	p2	// are we an Application Processor?
-#define isBP	p3	// are we the Bootstrap Processor?
-
-#ifdef CONFIG_SMP
-	/*
-	 * Find the init_task for the currently booting CPU.  At poweron, and in
-	 * UP mode, task_for_booting_cpu is NULL.
-	 */
-	movl r3=task_for_booting_cpu
- 	;;
-	ld8 r3=[r3]
-	movl r2=init_task
-	;;
-	cmp.eq isBP,isAP=r3,r0
-	;;
-(isAP)	mov r2=r3
-#else
-	movl r2=init_task
-	cmp.eq isBP,isAP=r0,r0
-#endif
-	;;
-	tpa r3=r2		// r3 == phys addr of task struct
-	mov r16=-1
-(isBP)	br.cond.dpnt .load_current // BP stack is on region 5 --- no need to map it
-
-	// load mapping for stack (virtaddr in r2, physaddr in r3)
-	rsm psr.ic
-	movl r17=PAGE_KERNEL
-	;;
-	srlz.d
-	dep r18=0,r3,0,12
-	;;
-	or r18=r17,r18
-	dep r2=-1,r3,61,3	// IMVA of task
-	;;
-	mov r17=rr[r2]
-	shr.u r16=r3,IA64_GRANULE_SHIFT
-	;;
-	dep r17=0,r17,8,24
-	;;
-	mov cr.itir=r17
-	mov cr.ifa=r2
-
-	mov r19=IA64_TR_CURRENT_STACK
-	;;
-	itr.d dtr[r19]=r18
-	;;
-	ssm psr.ic
-	srlz.d
-  	;;
-
-.load_current:
-	// load the "current" pointer (r13) and ar.k6 with the current task
-	mov IA64_KR(CURRENT)=r2		// virtual address
-	mov IA64_KR(CURRENT_STACK)=r16
-	mov r13=r2
-	/*
-	 * Reserve space at the top of the stack for "struct pt_regs".  Kernel
-	 * threads don't store interesting values in that structure, but the space
-	 * still needs to be there because time-critical stuff such as the context
-	 * switching can be implemented more efficiently (for example, __switch_to()
-	 * always sets the psr.dfh bit of the task it is switching to).
-	 */
-
-	addl r12=IA64_STK_OFFSET-IA64_PT_REGS_SIZE-16,r2
-	addl r2=IA64_RBS_OFFSET,r2	// initialize the RSE
-	mov ar.rsc=0		// place RSE in enforced lazy mode
-	;;
-	loadrs			// clear the dirty partition
-	movl r19=__phys_per_cpu_start
-	mov r18=PERCPU_PAGE_SIZE
-	;;
-#ifndef CONFIG_SMP
-	add r19=r19,r18
-	;;
-#else
-(isAP)	br.few 2f
-	movl r20=__cpu0_per_cpu
-	;;
-	shr.u r18=r18,3
-1:
-	ld8 r21=[r19],8;;
-	st8[r20]=r21,8
-	adds r18=-1,r18;;
-	cmp4.lt p7,p6=0,r18
-(p7)	br.cond.dptk.few 1b
-	mov r19=r20
-	;;
-2:
-#endif
-	tpa r19=r19
-	;;
-	.pred.rel.mutex isBP,isAP
-(isBP)	mov IA64_KR(PER_CPU_DATA)=r19	// per-CPU base for cpu0
-(isAP)	mov IA64_KR(PER_CPU_DATA)=r0	// clear physical per-CPU base
-	;;
-	mov ar.bspstore=r2	// establish the new RSE stack
-	;;
-	mov ar.rsc=0x3		// place RSE in eager mode
-
-(isBP)	dep r28=-1,r28,61,3	// make address virtual
-(isBP)	movl r2=ia64_boot_param
-	;;
-(isBP)	st8 [r2]=r28		// save the address of the boot param area passed by the bootloader
-
-#ifdef CONFIG_SMP
-(isAP)	br.call.sptk.many rp=start_secondary
-.ret0:
-(isAP)	br.cond.sptk self
-#endif
-
-	// This is executed by the bootstrap processor (bsp) only:
-
-	br.call.sptk.many rp=start_kernel
-.ret2:	addl r3=@ltoff(halt_msg),gp
-	;;
-	alloc r2=ar.pfs,8,0,2,0
-	;;
-	ld8 out0=[r3]
-	br.call.sptk.many b0=console_print
-
-self:	hint @pause
-	br.sptk.many self		// endless loop
-END(_start)
-
-	.text
-
-GLOBAL_ENTRY(ia64_save_debug_regs)
-	alloc r16=ar.pfs,1,0,0,0
-	mov r20=ar.lc			// preserve ar.lc
-	mov ar.lc=IA64_NUM_DBG_REGS-1
-	mov r18=0
-	add r19=IA64_NUM_DBG_REGS*8,in0
-	;;
-1:	mov r16=dbr[r18]
-#ifdef CONFIG_ITANIUM
-	;;
-	srlz.d
-#endif
-	mov r17=ibr[r18]
-	add r18=1,r18
-	;;
-	st8.nta [in0]=r16,8
-	st8.nta [r19]=r17,8
-	br.cloop.sptk.many 1b
-	;;
-	mov ar.lc=r20			// restore ar.lc
-	br.ret.sptk.many rp
-END(ia64_save_debug_regs)
-
-GLOBAL_ENTRY(ia64_load_debug_regs)
-	alloc r16=ar.pfs,1,0,0,0
-	lfetch.nta [in0]
-	mov r20=ar.lc			// preserve ar.lc
-	add r19=IA64_NUM_DBG_REGS*8,in0
-	mov ar.lc=IA64_NUM_DBG_REGS-1
-	mov r18=-1
-	;;
-1:	ld8.nta r16=[in0],8
-	ld8.nta r17=[r19],8
-	add r18=1,r18
-	;;
-	mov dbr[r18]=r16
-#ifdef CONFIG_ITANIUM
-	;;
-	srlz.d				// Errata 132 (NoFix status)
-#endif
-	mov ibr[r18]=r17
-	br.cloop.sptk.many 1b
-	;;
-	mov ar.lc=r20			// restore ar.lc
-	br.ret.sptk.many rp
-END(ia64_load_debug_regs)
-
-GLOBAL_ENTRY(__ia64_save_fpu)
-	alloc r2=ar.pfs,1,4,0,0
-	adds loc0=96*16-16,in0
-	adds loc1=96*16-16-128,in0
-	;;
-	stf.spill.nta [loc0]=f127,-256
-	stf.spill.nta [loc1]=f119,-256
-	;;
-	stf.spill.nta [loc0]=f111,-256
-	stf.spill.nta [loc1]=f103,-256
-	;;
-	stf.spill.nta [loc0]=f95,-256
-	stf.spill.nta [loc1]=f87,-256
-	;;
-	stf.spill.nta [loc0]=f79,-256
-	stf.spill.nta [loc1]=f71,-256
-	;;
-	stf.spill.nta [loc0]=f63,-256
-	stf.spill.nta [loc1]=f55,-256
-	adds loc2=96*16-32,in0
-	;;
-	stf.spill.nta [loc0]=f47,-256
-	stf.spill.nta [loc1]=f39,-256
-	adds loc3=96*16-32-128,in0
-	;;
-	stf.spill.nta [loc2]=f126,-256
-	stf.spill.nta [loc3]=f118,-256
-	;;
-	stf.spill.nta [loc2]=f110,-256
-	stf.spill.nta [loc3]=f102,-256
-	;;
-	stf.spill.nta [loc2]=f94,-256
-	stf.spill.nta [loc3]=f86,-256
-	;;
-	stf.spill.nta [loc2]=f78,-256
-	stf.spill.nta [loc3]=f70,-256
-	;;
-	stf.spill.nta [loc2]=f62,-256
-	stf.spill.nta [loc3]=f54,-256
-	adds loc0=96*16-48,in0
-	;;
-	stf.spill.nta [loc2]=f46,-256
-	stf.spill.nta [loc3]=f38,-256
-	adds loc1=96*16-48-128,in0
-	;;
-	stf.spill.nta [loc0]=f125,-256
-	stf.spill.nta [loc1]=f117,-256
-	;;
-	stf.spill.nta [loc0]=f109,-256
-	stf.spill.nta [loc1]=f101,-256
-	;;
-	stf.spill.nta [loc0]=f93,-256
-	stf.spill.nta [loc1]=f85,-256
-	;;
-	stf.spill.nta [loc0]=f77,-256
-	stf.spill.nta [loc1]=f69,-256
-	;;
-	stf.spill.nta [loc0]=f61,-256
-	stf.spill.nta [loc1]=f53,-256
-	adds loc2=96*16-64,in0
-	;;
-	stf.spill.nta [loc0]=f45,-256
-	stf.spill.nta [loc1]=f37,-256
-	adds loc3=96*16-64-128,in0
-	;;
-	stf.spill.nta [loc2]=f124,-256
-	stf.spill.nta [loc3]=f116,-256
-	;;
-	stf.spill.nta [loc2]=f108,-256
-	stf.spill.nta [loc3]=f100,-256
-	;;
-	stf.spill.nta [loc2]=f92,-256
-	stf.spill.nta [loc3]=f84,-256
-	;;
-	stf.spill.nta [loc2]=f76,-256
-	stf.spill.nta [loc3]=f68,-256
-	;;
-	stf.spill.nta [loc2]=f60,-256
-	stf.spill.nta [loc3]=f52,-256
-	adds loc0=96*16-80,in0
-	;;
-	stf.spill.nta [loc2]=f44,-256
-	stf.spill.nta [loc3]=f36,-256
-	adds loc1=96*16-80-128,in0
-	;;
-	stf.spill.nta [loc0]=f123,-256
-	stf.spill.nta [loc1]=f115,-256
-	;;
-	stf.spill.nta [loc0]=f107,-256
-	stf.spill.nta [loc1]=f99,-256
-	;;
-	stf.spill.nta [loc0]=f91,-256
-	stf.spill.nta [loc1]=f83,-256
-	;;
-	stf.spill.nta [loc0]=f75,-256
-	stf.spill.nta [loc1]=f67,-256
-	;;
-	stf.spill.nta [loc0]=f59,-256
-	stf.spill.nta [loc1]=f51,-256
-	adds loc2=96*16-96,in0
-	;;
-	stf.spill.nta [loc0]=f43,-256
-	stf.spill.nta [loc1]=f35,-256
-	adds loc3=96*16-96-128,in0
-	;;
-	stf.spill.nta [loc2]=f122,-256
-	stf.spill.nta [loc3]=f114,-256
-	;;
-	stf.spill.nta [loc2]=f106,-256
-	stf.spill.nta [loc3]=f98,-256
-	;;
-	stf.spill.nta [loc2]=f90,-256
-	stf.spill.nta [loc3]=f82,-256
-	;;
-	stf.spill.nta [loc2]=f74,-256
-	stf.spill.nta [loc3]=f66,-256
-	;;
-	stf.spill.nta [loc2]=f58,-256
-	stf.spill.nta [loc3]=f50,-256
-	adds loc0=96*16-112,in0
-	;;
-	stf.spill.nta [loc2]=f42,-256
-	stf.spill.nta [loc3]=f34,-256
-	adds loc1=96*16-112-128,in0
-	;;
-	stf.spill.nta [loc0]=f121,-256
-	stf.spill.nta [loc1]=f113,-256
-	;;
-	stf.spill.nta [loc0]=f105,-256
-	stf.spill.nta [loc1]=f97,-256
-	;;
-	stf.spill.nta [loc0]=f89,-256
-	stf.spill.nta [loc1]=f81,-256
-	;;
-	stf.spill.nta [loc0]=f73,-256
-	stf.spill.nta [loc1]=f65,-256
-	;;
-	stf.spill.nta [loc0]=f57,-256
-	stf.spill.nta [loc1]=f49,-256
-	adds loc2=96*16-128,in0
-	;;
-	stf.spill.nta [loc0]=f41,-256
-	stf.spill.nta [loc1]=f33,-256
-	adds loc3=96*16-128-128,in0
-	;;
-	stf.spill.nta [loc2]=f120,-256
-	stf.spill.nta [loc3]=f112,-256
-	;;
-	stf.spill.nta [loc2]=f104,-256
-	stf.spill.nta [loc3]=f96,-256
-	;;
-	stf.spill.nta [loc2]=f88,-256
-	stf.spill.nta [loc3]=f80,-256
-	;;
-	stf.spill.nta [loc2]=f72,-256
-	stf.spill.nta [loc3]=f64,-256
-	;;
-	stf.spill.nta [loc2]=f56,-256
-	stf.spill.nta [loc3]=f48,-256
-	;;
-	stf.spill.nta [loc2]=f40
-	stf.spill.nta [loc3]=f32
-	br.ret.sptk.many rp
-END(__ia64_save_fpu)
-
-GLOBAL_ENTRY(__ia64_load_fpu)
-	alloc r2=ar.pfs,1,2,0,0
-	adds r3=128,in0
-	adds r14=256,in0
-	adds r15=384,in0
-	mov loc0=512
-	mov loc1=-1024+16
-	;;
-	ldf.fill.nta f32=[in0],loc0
-	ldf.fill.nta f40=[ r3],loc0
-	ldf.fill.nta f48=[r14],loc0
-	ldf.fill.nta f56=[r15],loc0
-	;;
-	ldf.fill.nta f64=[in0],loc0
-	ldf.fill.nta f72=[ r3],loc0
-	ldf.fill.nta f80=[r14],loc0
-	ldf.fill.nta f88=[r15],loc0
-	;;
-	ldf.fill.nta f96=[in0],loc1
-	ldf.fill.nta f104=[ r3],loc1
-	ldf.fill.nta f112=[r14],loc1
-	ldf.fill.nta f120=[r15],loc1
-	;;
-	ldf.fill.nta f33=[in0],loc0
-	ldf.fill.nta f41=[ r3],loc0
-	ldf.fill.nta f49=[r14],loc0
-	ldf.fill.nta f57=[r15],loc0
-	;;
-	ldf.fill.nta f65=[in0],loc0
-	ldf.fill.nta f73=[ r3],loc0
-	ldf.fill.nta f81=[r14],loc0
-	ldf.fill.nta f89=[r15],loc0
-	;;
-	ldf.fill.nta f97=[in0],loc1
-	ldf.fill.nta f105=[ r3],loc1
-	ldf.fill.nta f113=[r14],loc1
-	ldf.fill.nta f121=[r15],loc1
-	;;
-	ldf.fill.nta f34=[in0],loc0
-	ldf.fill.nta f42=[ r3],loc0
-	ldf.fill.nta f50=[r14],loc0
-	ldf.fill.nta f58=[r15],loc0
-	;;
-	ldf.fill.nta f66=[in0],loc0
-	ldf.fill.nta f74=[ r3],loc0
-	ldf.fill.nta f82=[r14],loc0
-	ldf.fill.nta f90=[r15],loc0
-	;;
-	ldf.fill.nta f98=[in0],loc1
-	ldf.fill.nta f106=[ r3],loc1
-	ldf.fill.nta f114=[r14],loc1
-	ldf.fill.nta f122=[r15],loc1
-	;;
-	ldf.fill.nta f35=[in0],loc0
-	ldf.fill.nta f43=[ r3],loc0
-	ldf.fill.nta f51=[r14],loc0
-	ldf.fill.nta f59=[r15],loc0
-	;;
-	ldf.fill.nta f67=[in0],loc0
-	ldf.fill.nta f75=[ r3],loc0
-	ldf.fill.nta f83=[r14],loc0
-	ldf.fill.nta f91=[r15],loc0
-	;;
-	ldf.fill.nta f99=[in0],loc1
-	ldf.fill.nta f107=[ r3],loc1
-	ldf.fill.nta f115=[r14],loc1
-	ldf.fill.nta f123=[r15],loc1
-	;;
-	ldf.fill.nta f36=[in0],loc0
-	ldf.fill.nta f44=[ r3],loc0
-	ldf.fill.nta f52=[r14],loc0
-	ldf.fill.nta f60=[r15],loc0
-	;;
-	ldf.fill.nta f68=[in0],loc0
-	ldf.fill.nta f76=[ r3],loc0
-	ldf.fill.nta f84=[r14],loc0
-	ldf.fill.nta f92=[r15],loc0
-	;;
-	ldf.fill.nta f100=[in0],loc1
-	ldf.fill.nta f108=[ r3],loc1
-	ldf.fill.nta f116=[r14],loc1
-	ldf.fill.nta f124=[r15],loc1
-	;;
-	ldf.fill.nta f37=[in0],loc0
-	ldf.fill.nta f45=[ r3],loc0
-	ldf.fill.nta f53=[r14],loc0
-	ldf.fill.nta f61=[r15],loc0
-	;;
-	ldf.fill.nta f69=[in0],loc0
-	ldf.fill.nta f77=[ r3],loc0
-	ldf.fill.nta f85=[r14],loc0
-	ldf.fill.nta f93=[r15],loc0
-	;;
-	ldf.fill.nta f101=[in0],loc1
-	ldf.fill.nta f109=[ r3],loc1
-	ldf.fill.nta f117=[r14],loc1
-	ldf.fill.nta f125=[r15],loc1
-	;;
-	ldf.fill.nta f38 =[in0],loc0
-	ldf.fill.nta f46 =[ r3],loc0
-	ldf.fill.nta f54 =[r14],loc0
-	ldf.fill.nta f62 =[r15],loc0
-	;;
-	ldf.fill.nta f70 =[in0],loc0
-	ldf.fill.nta f78 =[ r3],loc0
-	ldf.fill.nta f86 =[r14],loc0
-	ldf.fill.nta f94 =[r15],loc0
-	;;
-	ldf.fill.nta f102=[in0],loc1
-	ldf.fill.nta f110=[ r3],loc1
-	ldf.fill.nta f118=[r14],loc1
-	ldf.fill.nta f126=[r15],loc1
-	;;
-	ldf.fill.nta f39 =[in0],loc0
-	ldf.fill.nta f47 =[ r3],loc0
-	ldf.fill.nta f55 =[r14],loc0
-	ldf.fill.nta f63 =[r15],loc0
-	;;
-	ldf.fill.nta f71 =[in0],loc0
-	ldf.fill.nta f79 =[ r3],loc0
-	ldf.fill.nta f87 =[r14],loc0
-	ldf.fill.nta f95 =[r15],loc0
-	;;
-	ldf.fill.nta f103=[in0]
-	ldf.fill.nta f111=[ r3]
-	ldf.fill.nta f119=[r14]
-	ldf.fill.nta f127=[r15]
-	br.ret.sptk.many rp
-END(__ia64_load_fpu)
-
-GLOBAL_ENTRY(__ia64_init_fpu)
-	stf.spill [sp]=f0		// M3
-	mov	 f32=f0			// F
-	nop.b	 0
-
-	ldfps	 f33,f34=[sp]		// M0
-	ldfps	 f35,f36=[sp]		// M1
-	mov      f37=f0			// F
-	;;
-
-	setf.s	 f38=r0			// M2
-	setf.s	 f39=r0			// M3
-	mov      f40=f0			// F
-
-	ldfps	 f41,f42=[sp]		// M0
-	ldfps	 f43,f44=[sp]		// M1
-	mov      f45=f0			// F
-
-	setf.s	 f46=r0			// M2
-	setf.s	 f47=r0			// M3
-	mov      f48=f0			// F
-
-	ldfps	 f49,f50=[sp]		// M0
-	ldfps	 f51,f52=[sp]		// M1
-	mov      f53=f0			// F
-
-	setf.s	 f54=r0			// M2
-	setf.s	 f55=r0			// M3
-	mov      f56=f0			// F
-
-	ldfps	 f57,f58=[sp]		// M0
-	ldfps	 f59,f60=[sp]		// M1
-	mov      f61=f0			// F
-
-	setf.s	 f62=r0			// M2
-	setf.s	 f63=r0			// M3
-	mov      f64=f0			// F
-
-	ldfps	 f65,f66=[sp]		// M0
-	ldfps	 f67,f68=[sp]		// M1
-	mov      f69=f0			// F
-
-	setf.s	 f70=r0			// M2
-	setf.s	 f71=r0			// M3
-	mov      f72=f0			// F
-
-	ldfps	 f73,f74=[sp]		// M0
-	ldfps	 f75,f76=[sp]		// M1
-	mov      f77=f0			// F
-
-	setf.s	 f78=r0			// M2
-	setf.s	 f79=r0			// M3
-	mov      f80=f0			// F
-
-	ldfps	 f81,f82=[sp]		// M0
-	ldfps	 f83,f84=[sp]		// M1
-	mov      f85=f0			// F
-
-	setf.s	 f86=r0			// M2
-	setf.s	 f87=r0			// M3
-	mov      f88=f0			// F
-
-	/*
-	 * When the instructions are cached, it would be faster to initialize
-	 * the remaining registers with simply mov instructions (F-unit).
-	 * This gets the time down to ~29 cycles.  However, this would use up
-	 * 33 bundles, whereas continuing with the above pattern yields
-	 * 10 bundles and ~30 cycles.
-	 */
-
-	ldfps	 f89,f90=[sp]		// M0
-	ldfps	 f91,f92=[sp]		// M1
-	mov      f93=f0			// F
-
-	setf.s	 f94=r0			// M2
-	setf.s	 f95=r0			// M3
-	mov      f96=f0			// F
-
-	ldfps	 f97,f98=[sp]		// M0
-	ldfps	 f99,f100=[sp]		// M1
-	mov      f101=f0		// F
-
-	setf.s	 f102=r0		// M2
-	setf.s	 f103=r0		// M3
-	mov      f104=f0		// F
-
-	ldfps	 f105,f106=[sp]		// M0
-	ldfps	 f107,f108=[sp]		// M1
-	mov      f109=f0		// F
-
-	setf.s	 f110=r0		// M2
-	setf.s	 f111=r0		// M3
-	mov      f112=f0		// F
-
-	ldfps	 f113,f114=[sp]		// M0
-	ldfps	 f115,f116=[sp]		// M1
-	mov      f117=f0		// F
-
-	setf.s	 f118=r0		// M2
-	setf.s	 f119=r0		// M3
-	mov      f120=f0		// F
-
-	ldfps	 f121,f122=[sp]		// M0
-	ldfps	 f123,f124=[sp]		// M1
-	mov      f125=f0		// F
-
-	setf.s	 f126=r0		// M2
-	setf.s	 f127=r0		// M3
-	br.ret.sptk.many rp		// F
-END(__ia64_init_fpu)
-
-/*
- * Switch execution mode from virtual to physical
- *
- * Inputs:
- *	r16 = new psr to establish
- * Output:
- *	r19 = old virtual address of ar.bsp
- *	r20 = old virtual address of sp
- *
- * Note: RSE must already be in enforced lazy mode
- */
-GLOBAL_ENTRY(ia64_switch_mode_phys)
- {
-	rsm psr.i | psr.ic		// disable interrupts and interrupt collection
-	mov r15=ip
- }
-	;;
- {
-	flushrs				// must be first insn in group
-	srlz.i
- }
-	;;
-	mov cr.ipsr=r16			// set new PSR
-	add r3=1f-ia64_switch_mode_phys,r15
-
-	mov r19=ar.bsp
-	mov r20=sp
-	mov r14=rp			// get return address into a general register
-	;;
-
-	// going to physical mode, use tpa to translate virt->phys
-	tpa r17=r19
-	tpa r3=r3
-	tpa sp=sp
-	tpa r14=r14
-	;;
-
-	mov r18=ar.rnat			// save ar.rnat
-	mov ar.bspstore=r17		// this steps on ar.rnat
-	mov cr.iip=r3
-	mov cr.ifs=r0
-	;;
-	mov ar.rnat=r18			// restore ar.rnat
-	rfi				// must be last insn in group
-	;;
-1:	mov rp=r14
-	br.ret.sptk.many rp
-END(ia64_switch_mode_phys)
-
-/*
- * Switch execution mode from physical to virtual
- *
- * Inputs:
- *	r16 = new psr to establish
- *	r19 = new bspstore to establish
- *	r20 = new sp to establish
- *
- * Note: RSE must already be in enforced lazy mode
- */
-GLOBAL_ENTRY(ia64_switch_mode_virt)
- {
-	rsm psr.i | psr.ic		// disable interrupts and interrupt collection
-	mov r15=ip
- }
-	;;
- {
-	flushrs				// must be first insn in group
-	srlz.i
- }
-	;;
-	mov cr.ipsr=r16			// set new PSR
-	add r3=1f-ia64_switch_mode_virt,r15
-
-	mov r14=rp			// get return address into a general register
-	;;
-
-	// going to virtual
-	//   - for code addresses, set upper bits of addr to KERNEL_START
-	//   - for stack addresses, copy from input argument
-	movl r18=KERNEL_START
-	dep r3=0,r3,KERNEL_TR_PAGE_SHIFT,64-KERNEL_TR_PAGE_SHIFT
-	dep r14=0,r14,KERNEL_TR_PAGE_SHIFT,64-KERNEL_TR_PAGE_SHIFT
-	mov sp=r20
-	;;
-	or r3=r3,r18
-	or r14=r14,r18
-	;;
-
-	mov r18=ar.rnat			// save ar.rnat
-	mov ar.bspstore=r19		// this steps on ar.rnat
-	mov cr.iip=r3
-	mov cr.ifs=r0
-	;;
-	mov ar.rnat=r18			// restore ar.rnat
-	rfi				// must be last insn in group
-	;;
-1:	mov rp=r14
-	br.ret.sptk.many rp
-END(ia64_switch_mode_virt)
-
-GLOBAL_ENTRY(ia64_delay_loop)
-	.prologue
-{	nop 0			// work around GAS unwind info generation bug...
-	.save ar.lc,r2
-	mov r2=ar.lc
-	.body
-	;;
-	mov ar.lc=r32
-}
-	;;
-	// force loop to be 32-byte aligned (GAS bug means we cannot use .align
-	// inside function body without corrupting unwind info).
-{	nop 0 }
-1:	br.cloop.sptk.few 1b
-	;;
-	mov ar.lc=r2
-	br.ret.sptk.many rp
-END(ia64_delay_loop)
-
-/*
- * Return a CPU-local timestamp in nano-seconds.  This timestamp is
- * NOT synchronized across CPUs its return value must never be
- * compared against the values returned on another CPU.  The usage in
- * kernel/sched/core.c ensures that.
- *
- * The return-value of sched_clock() is NOT supposed to wrap-around.
- * If it did, it would cause some scheduling hiccups (at the worst).
- * Fortunately, with a 64-bit cycle-counter ticking at 100GHz, even
- * that would happen only once every 5+ years.
- *
- * The code below basically calculates:
- *
- *   (ia64_get_itc() * local_cpu_data->nsec_per_cyc) >> IA64_NSEC_PER_CYC_SHIFT
- *
- * except that the multiplication and the shift are done with 128-bit
- * intermediate precision so that we can produce a full 64-bit result.
- */
-GLOBAL_ENTRY(ia64_native_sched_clock)
-	addl r8=THIS_CPU(ia64_cpu_info) + IA64_CPUINFO_NSEC_PER_CYC_OFFSET,r0
-	mov.m r9=ar.itc		// fetch cycle-counter				(35 cyc)
-	;;
-	ldf8 f8=[r8]
-	;;
-	setf.sig f9=r9		// certain to stall, so issue it _after_ ldf8...
-	;;
-	xmpy.lu f10=f9,f8	// calculate low 64 bits of 128-bit product	(4 cyc)
-	xmpy.hu f11=f9,f8	// calculate high 64 bits of 128-bit product
-	;;
-	getf.sig r8=f10		//						(5 cyc)
-	getf.sig r9=f11
-	;;
-	shrp r8=r9,r8,IA64_NSEC_PER_CYC_SHIFT
-	br.ret.sptk.many rp
-END(ia64_native_sched_clock)
-
-#ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
-GLOBAL_ENTRY(cycle_to_nsec)
-	alloc r16=ar.pfs,1,0,0,0
-	addl r8=THIS_CPU(ia64_cpu_info) + IA64_CPUINFO_NSEC_PER_CYC_OFFSET,r0
-	;;
-	ldf8 f8=[r8]
-	;;
-	setf.sig f9=r32
-	;;
-	xmpy.lu f10=f9,f8	// calculate low 64 bits of 128-bit product	(4 cyc)
-	xmpy.hu f11=f9,f8	// calculate high 64 bits of 128-bit product
-	;;
-	getf.sig r8=f10		//						(5 cyc)
-	getf.sig r9=f11
-	;;
-	shrp r8=r9,r8,IA64_NSEC_PER_CYC_SHIFT
-	br.ret.sptk.many rp
-END(cycle_to_nsec)
-#endif /* CONFIG_VIRT_CPU_ACCOUNTING_NATIVE */
-
-#ifdef CONFIG_IA64_BRL_EMU
-
-/*
- *  Assembly routines used by brl_emu.c to set preserved register state.
- */
-
-#define SET_REG(reg)				\
- GLOBAL_ENTRY(ia64_set_##reg);			\
-	alloc r16=ar.pfs,1,0,0,0;		\
-	mov reg=r32;				\
-	;;					\
-	br.ret.sptk.many rp;			\
- END(ia64_set_##reg)
-
-SET_REG(b1);
-SET_REG(b2);
-SET_REG(b3);
-SET_REG(b4);
-SET_REG(b5);
-
-#endif /* CONFIG_IA64_BRL_EMU */
-
-#ifdef CONFIG_SMP
-
-#ifdef CONFIG_HOTPLUG_CPU
-GLOBAL_ENTRY(ia64_jump_to_sal)
-	alloc r16=ar.pfs,1,0,0,0;;
-	rsm psr.i  | psr.ic
-{
-	flushrs
-	srlz.i
-}
-	tpa r25=in0
-	movl r18=tlb_purge_done;;
-	DATA_VA_TO_PA(r18);;
-	mov b1=r18 	// Return location
-	movl r18=ia64_do_tlb_purge;;
-	DATA_VA_TO_PA(r18);;
-	mov b2=r18 	// doing tlb_flush work
-	mov ar.rsc=0  // Put RSE  in enforced lazy, LE mode
-	movl r17=1f;;
-	DATA_VA_TO_PA(r17);;
-	mov cr.iip=r17
-	movl r16=SAL_PSR_BITS_TO_SET;;
-	mov cr.ipsr=r16
-	mov cr.ifs=r0;;
-	rfi;;			// note: this unmask MCA/INIT (psr.mc)
-1:
-	/*
-	 * Invalidate all TLB data/inst
-	 */
-	br.sptk.many b2;; // jump to tlb purge code
-
-tlb_purge_done:
-	RESTORE_REGION_REGS(r25, r17,r18,r19);;
-	RESTORE_REG(b0, r25, r17);;
-	RESTORE_REG(b1, r25, r17);;
-	RESTORE_REG(b2, r25, r17);;
-	RESTORE_REG(b3, r25, r17);;
-	RESTORE_REG(b4, r25, r17);;
-	RESTORE_REG(b5, r25, r17);;
-	ld8 r1=[r25],0x08;;
-	ld8 r12=[r25],0x08;;
-	ld8 r13=[r25],0x08;;
-	RESTORE_REG(ar.fpsr, r25, r17);;
-	RESTORE_REG(ar.pfs, r25, r17);;
-	RESTORE_REG(ar.rnat, r25, r17);;
-	RESTORE_REG(ar.unat, r25, r17);;
-	RESTORE_REG(ar.bspstore, r25, r17);;
-	RESTORE_REG(cr.dcr, r25, r17);;
-	RESTORE_REG(cr.iva, r25, r17);;
-	RESTORE_REG(cr.pta, r25, r17);;
-	srlz.d;;	// required not to violate RAW dependency
-	RESTORE_REG(cr.itv, r25, r17);;
-	RESTORE_REG(cr.pmv, r25, r17);;
-	RESTORE_REG(cr.cmcv, r25, r17);;
-	RESTORE_REG(cr.lrr0, r25, r17);;
-	RESTORE_REG(cr.lrr1, r25, r17);;
-	ld8 r4=[r25],0x08;;
-	ld8 r5=[r25],0x08;;
-	ld8 r6=[r25],0x08;;
-	ld8 r7=[r25],0x08;;
-	ld8 r17=[r25],0x08;;
-	mov pr=r17,-1;;
-	RESTORE_REG(ar.lc, r25, r17);;
-	/*
-	 * Now Restore floating point regs
-	 */
-	ldf.fill.nta f2=[r25],16;;
-	ldf.fill.nta f3=[r25],16;;
-	ldf.fill.nta f4=[r25],16;;
-	ldf.fill.nta f5=[r25],16;;
-	ldf.fill.nta f16=[r25],16;;
-	ldf.fill.nta f17=[r25],16;;
-	ldf.fill.nta f18=[r25],16;;
-	ldf.fill.nta f19=[r25],16;;
-	ldf.fill.nta f20=[r25],16;;
-	ldf.fill.nta f21=[r25],16;;
-	ldf.fill.nta f22=[r25],16;;
-	ldf.fill.nta f23=[r25],16;;
-	ldf.fill.nta f24=[r25],16;;
-	ldf.fill.nta f25=[r25],16;;
-	ldf.fill.nta f26=[r25],16;;
-	ldf.fill.nta f27=[r25],16;;
-	ldf.fill.nta f28=[r25],16;;
-	ldf.fill.nta f29=[r25],16;;
-	ldf.fill.nta f30=[r25],16;;
-	ldf.fill.nta f31=[r25],16;;
-
-	/*
-	 * Now that we have done all the register restores
-	 * we are now ready for the big DIVE to SAL Land
-	 */
-	ssm psr.ic;;
-	srlz.d;;
-	br.ret.sptk.many b0;;
-END(ia64_jump_to_sal)
-#endif /* CONFIG_HOTPLUG_CPU */
-
-#endif /* CONFIG_SMP */
diff --git a/arch/ia64/kernel/iosapic.c b/arch/ia64/kernel/iosapic.c
deleted file mode 100644
index 99300850abc1..000000000000
--- a/arch/ia64/kernel/iosapic.c
+++ /dev/null
@@ -1,1137 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * I/O SAPIC support.
- *
- * Copyright (C) 1999 Intel Corp.
- * Copyright (C) 1999 Asit Mallick <asit.k.mallick@intel.com>
- * Copyright (C) 2000-2002 J.I. Lee <jung-ik.lee@intel.com>
- * Copyright (C) 1999-2000, 2002-2003 Hewlett-Packard Co.
- *	David Mosberger-Tang <davidm@hpl.hp.com>
- * Copyright (C) 1999 VA Linux Systems
- * Copyright (C) 1999,2000 Walt Drummond <drummond@valinux.com>
- *
- * 00/04/19	D. Mosberger	Rewritten to mirror more closely the x86 I/O
- *				APIC code.  In particular, we now have separate
- *				handlers for edge and level triggered
- *				interrupts.
- * 00/10/27	Asit Mallick, Goutham Rao <goutham.rao@intel.com> IRQ vector
- *				allocation PCI to vector mapping, shared PCI
- *				interrupts.
- * 00/10/27	D. Mosberger	Document things a bit more to make them more
- *				understandable.  Clean up much of the old
- *				IOSAPIC cruft.
- * 01/07/27	J.I. Lee	PCI irq routing, Platform/Legacy interrupts
- *				and fixes for ACPI S5(SoftOff) support.
- * 02/01/23	J.I. Lee	iosapic pgm fixes for PCI irq routing from _PRT
- * 02/01/07     E. Focht        <efocht@ess.nec.de> Redirectable interrupt
- *				vectors in iosapic_set_affinity(),
- *				initializations for /proc/irq/#/smp_affinity
- * 02/04/02	P. Diefenbaugh	Cleaned up ACPI PCI IRQ routing.
- * 02/04/18	J.I. Lee	bug fix in iosapic_init_pci_irq
- * 02/04/30	J.I. Lee	bug fix in find_iosapic to fix ACPI PCI IRQ to
- *				IOSAPIC mapping error
- * 02/07/29	T. Kochi	Allocate interrupt vectors dynamically
- * 02/08/04	T. Kochi	Cleaned up terminology (irq, global system
- *				interrupt, vector, etc.)
- * 02/09/20	D. Mosberger	Simplified by taking advantage of ACPI's
- *				pci_irq code.
- * 03/02/19	B. Helgaas	Make pcat_compat system-wide, not per-IOSAPIC.
- *				Remove iosapic_address & gsi_base from
- *				external interfaces.  Rationalize
- *				__init/__devinit attributes.
- * 04/12/04 Ashok Raj	<ashok.raj@intel.com> Intel Corporation 2004
- *				Updated to work with irq migration necessary
- *				for CPU Hotplug
- */
-/*
- * Here is what the interrupt logic between a PCI device and the kernel looks
- * like:
- *
- * (1) A PCI device raises one of the four interrupt pins (INTA, INTB, INTC,
- *     INTD).  The device is uniquely identified by its bus-, and slot-number
- *     (the function number does not matter here because all functions share
- *     the same interrupt lines).
- *
- * (2) The motherboard routes the interrupt line to a pin on a IOSAPIC
- *     controller.  Multiple interrupt lines may have to share the same
- *     IOSAPIC pin (if they're level triggered and use the same polarity).
- *     Each interrupt line has a unique Global System Interrupt (GSI) number
- *     which can be calculated as the sum of the controller's base GSI number
- *     and the IOSAPIC pin number to which the line connects.
- *
- * (3) The IOSAPIC uses an internal routing table entries (RTEs) to map the
- * IOSAPIC pin into the IA-64 interrupt vector.  This interrupt vector is then
- * sent to the CPU.
- *
- * (4) The kernel recognizes an interrupt as an IRQ.  The IRQ interface is
- *     used as architecture-independent interrupt handling mechanism in Linux.
- *     As an IRQ is a number, we have to have
- *     IA-64 interrupt vector number <-> IRQ number mapping.  On smaller
- *     systems, we use one-to-one mapping between IA-64 vector and IRQ.
- *
- * To sum up, there are three levels of mappings involved:
- *
- *	PCI pin -> global system interrupt (GSI) -> IA-64 vector <-> IRQ
- *
- * Note: The term "IRQ" is loosely used everywhere in Linux kernel to
- * describe interrupts.  Now we use "IRQ" only for Linux IRQ's.  ISA IRQ
- * (isa_irq) is the only exception in this source code.
- */
-
-#include <linux/acpi.h>
-#include <linux/init.h>
-#include <linux/irq.h>
-#include <linux/kernel.h>
-#include <linux/list.h>
-#include <linux/pci.h>
-#include <linux/slab.h>
-#include <linux/smp.h>
-#include <linux/string.h>
-#include <linux/memblock.h>
-
-#include <asm/delay.h>
-#include <asm/hw_irq.h>
-#include <asm/io.h>
-#include <asm/iosapic.h>
-#include <asm/processor.h>
-#include <asm/ptrace.h>
-#include <asm/xtp.h>
-
-#undef DEBUG_INTERRUPT_ROUTING
-
-#ifdef DEBUG_INTERRUPT_ROUTING
-#define DBG(fmt...)	printk(fmt)
-#else
-#define DBG(fmt...)
-#endif
-
-static DEFINE_SPINLOCK(iosapic_lock);
-
-/*
- * These tables map IA-64 vectors to the IOSAPIC pin that generates this
- * vector.
- */
-
-#define NO_REF_RTE	0
-
-static struct iosapic {
-	char __iomem	*addr;		/* base address of IOSAPIC */
-	unsigned int	gsi_base;	/* GSI base */
-	unsigned short	num_rte;	/* # of RTEs on this IOSAPIC */
-	int		rtes_inuse;	/* # of RTEs in use on this IOSAPIC */
-#ifdef CONFIG_NUMA
-	unsigned short	node;		/* numa node association via pxm */
-#endif
-	spinlock_t	lock;		/* lock for indirect reg access */
-} iosapic_lists[NR_IOSAPICS];
-
-struct iosapic_rte_info {
-	struct list_head rte_list;	/* RTEs sharing the same vector */
-	char		rte_index;	/* IOSAPIC RTE index */
-	int		refcnt;		/* reference counter */
-	struct iosapic	*iosapic;
-} ____cacheline_aligned;
-
-static struct iosapic_intr_info {
-	struct list_head rtes;		/* RTEs using this vector (empty =>
-					 * not an IOSAPIC interrupt) */
-	int		count;		/* # of registered RTEs */
-	u32		low32;		/* current value of low word of
-					 * Redirection table entry */
-	unsigned int	dest;		/* destination CPU physical ID */
-	unsigned char	dmode	: 3;	/* delivery mode (see iosapic.h) */
-	unsigned char 	polarity: 1;	/* interrupt polarity
-					 * (see iosapic.h) */
-	unsigned char	trigger	: 1;	/* trigger mode (see iosapic.h) */
-} iosapic_intr_info[NR_IRQS];
-
-static unsigned char pcat_compat;	/* 8259 compatibility flag */
-
-static inline void
-iosapic_write(struct iosapic *iosapic, unsigned int reg, u32 val)
-{
-	unsigned long flags;
-
-	spin_lock_irqsave(&iosapic->lock, flags);
-	__iosapic_write(iosapic->addr, reg, val);
-	spin_unlock_irqrestore(&iosapic->lock, flags);
-}
-
-/*
- * Find an IOSAPIC associated with a GSI
- */
-static inline int
-find_iosapic (unsigned int gsi)
-{
-	int i;
-
-	for (i = 0; i < NR_IOSAPICS; i++) {
-		if ((unsigned) (gsi - iosapic_lists[i].gsi_base) <
-		    iosapic_lists[i].num_rte)
-			return i;
-	}
-
-	return -1;
-}
-
-static inline int __gsi_to_irq(unsigned int gsi)
-{
-	int irq;
-	struct iosapic_intr_info *info;
-	struct iosapic_rte_info *rte;
-
-	for (irq = 0; irq < NR_IRQS; irq++) {
-		info = &iosapic_intr_info[irq];
-		list_for_each_entry(rte, &info->rtes, rte_list)
-			if (rte->iosapic->gsi_base + rte->rte_index == gsi)
-				return irq;
-	}
-	return -1;
-}
-
-int
-gsi_to_irq (unsigned int gsi)
-{
-	unsigned long flags;
-	int irq;
-
-	spin_lock_irqsave(&iosapic_lock, flags);
-	irq = __gsi_to_irq(gsi);
-	spin_unlock_irqrestore(&iosapic_lock, flags);
-	return irq;
-}
-
-static struct iosapic_rte_info *find_rte(unsigned int irq, unsigned int gsi)
-{
-	struct iosapic_rte_info *rte;
-
-	list_for_each_entry(rte, &iosapic_intr_info[irq].rtes, rte_list)
-		if (rte->iosapic->gsi_base + rte->rte_index == gsi)
-			return rte;
-	return NULL;
-}
-
-static void
-set_rte (unsigned int gsi, unsigned int irq, unsigned int dest, int mask)
-{
-	unsigned long pol, trigger, dmode;
-	u32 low32, high32;
-	int rte_index;
-	char redir;
-	struct iosapic_rte_info *rte;
-	ia64_vector vector = irq_to_vector(irq);
-
-	DBG(KERN_DEBUG"IOSAPIC: routing vector %d to 0x%x\n", vector, dest);
-
-	rte = find_rte(irq, gsi);
-	if (!rte)
-		return;		/* not an IOSAPIC interrupt */
-
-	rte_index = rte->rte_index;
-	pol     = iosapic_intr_info[irq].polarity;
-	trigger = iosapic_intr_info[irq].trigger;
-	dmode   = iosapic_intr_info[irq].dmode;
-
-	redir = (dmode == IOSAPIC_LOWEST_PRIORITY) ? 1 : 0;
-
-#ifdef CONFIG_SMP
-	set_irq_affinity_info(irq, (int)(dest & 0xffff), redir);
-#endif
-
-	low32 = ((pol << IOSAPIC_POLARITY_SHIFT) |
-		 (trigger << IOSAPIC_TRIGGER_SHIFT) |
-		 (dmode << IOSAPIC_DELIVERY_SHIFT) |
-		 ((mask ? 1 : 0) << IOSAPIC_MASK_SHIFT) |
-		 vector);
-
-	/* dest contains both id and eid */
-	high32 = (dest << IOSAPIC_DEST_SHIFT);
-
-	iosapic_write(rte->iosapic, IOSAPIC_RTE_HIGH(rte_index), high32);
-	iosapic_write(rte->iosapic, IOSAPIC_RTE_LOW(rte_index), low32);
-	iosapic_intr_info[irq].low32 = low32;
-	iosapic_intr_info[irq].dest = dest;
-}
-
-static void
-iosapic_nop (struct irq_data *data)
-{
-	/* do nothing... */
-}
-
-
-#ifdef CONFIG_KEXEC
-void
-kexec_disable_iosapic(void)
-{
-	struct iosapic_intr_info *info;
-	struct iosapic_rte_info *rte;
-	ia64_vector vec;
-	int irq;
-
-	for (irq = 0; irq < NR_IRQS; irq++) {
-		info = &iosapic_intr_info[irq];
-		vec = irq_to_vector(irq);
-		list_for_each_entry(rte, &info->rtes,
-				rte_list) {
-			iosapic_write(rte->iosapic,
-					IOSAPIC_RTE_LOW(rte->rte_index),
-					IOSAPIC_MASK|vec);
-			iosapic_eoi(rte->iosapic->addr, vec);
-		}
-	}
-}
-#endif
-
-static void
-mask_irq (struct irq_data *data)
-{
-	unsigned int irq = data->irq;
-	u32 low32;
-	int rte_index;
-	struct iosapic_rte_info *rte;
-
-	if (!iosapic_intr_info[irq].count)
-		return;			/* not an IOSAPIC interrupt! */
-
-	/* set only the mask bit */
-	low32 = iosapic_intr_info[irq].low32 |= IOSAPIC_MASK;
-	list_for_each_entry(rte, &iosapic_intr_info[irq].rtes, rte_list) {
-		rte_index = rte->rte_index;
-		iosapic_write(rte->iosapic, IOSAPIC_RTE_LOW(rte_index), low32);
-	}
-}
-
-static void
-unmask_irq (struct irq_data *data)
-{
-	unsigned int irq = data->irq;
-	u32 low32;
-	int rte_index;
-	struct iosapic_rte_info *rte;
-
-	if (!iosapic_intr_info[irq].count)
-		return;			/* not an IOSAPIC interrupt! */
-
-	low32 = iosapic_intr_info[irq].low32 &= ~IOSAPIC_MASK;
-	list_for_each_entry(rte, &iosapic_intr_info[irq].rtes, rte_list) {
-		rte_index = rte->rte_index;
-		iosapic_write(rte->iosapic, IOSAPIC_RTE_LOW(rte_index), low32);
-	}
-}
-
-
-static int
-iosapic_set_affinity(struct irq_data *data, const struct cpumask *mask,
-		     bool force)
-{
-#ifdef CONFIG_SMP
-	unsigned int irq = data->irq;
-	u32 high32, low32;
-	int cpu, dest, rte_index;
-	int redir = (irq & IA64_IRQ_REDIRECTED) ? 1 : 0;
-	struct iosapic_rte_info *rte;
-	struct iosapic *iosapic;
-
-	irq &= (~IA64_IRQ_REDIRECTED);
-
-	cpu = cpumask_first_and(cpu_online_mask, mask);
-	if (cpu >= nr_cpu_ids)
-		return -1;
-
-	if (irq_prepare_move(irq, cpu))
-		return -1;
-
-	dest = cpu_physical_id(cpu);
-
-	if (!iosapic_intr_info[irq].count)
-		return -1;			/* not an IOSAPIC interrupt */
-
-	set_irq_affinity_info(irq, dest, redir);
-
-	/* dest contains both id and eid */
-	high32 = dest << IOSAPIC_DEST_SHIFT;
-
-	low32 = iosapic_intr_info[irq].low32 & ~(7 << IOSAPIC_DELIVERY_SHIFT);
-	if (redir)
-		/* change delivery mode to lowest priority */
-		low32 |= (IOSAPIC_LOWEST_PRIORITY << IOSAPIC_DELIVERY_SHIFT);
-	else
-		/* change delivery mode to fixed */
-		low32 |= (IOSAPIC_FIXED << IOSAPIC_DELIVERY_SHIFT);
-	low32 &= IOSAPIC_VECTOR_MASK;
-	low32 |= irq_to_vector(irq);
-
-	iosapic_intr_info[irq].low32 = low32;
-	iosapic_intr_info[irq].dest = dest;
-	list_for_each_entry(rte, &iosapic_intr_info[irq].rtes, rte_list) {
-		iosapic = rte->iosapic;
-		rte_index = rte->rte_index;
-		iosapic_write(iosapic, IOSAPIC_RTE_HIGH(rte_index), high32);
-		iosapic_write(iosapic, IOSAPIC_RTE_LOW(rte_index), low32);
-	}
-
-#endif
-	return 0;
-}
-
-/*
- * Handlers for level-triggered interrupts.
- */
-
-static unsigned int
-iosapic_startup_level_irq (struct irq_data *data)
-{
-	unmask_irq(data);
-	return 0;
-}
-
-static void
-iosapic_unmask_level_irq (struct irq_data *data)
-{
-	unsigned int irq = data->irq;
-	ia64_vector vec = irq_to_vector(irq);
-	struct iosapic_rte_info *rte;
-	int do_unmask_irq = 0;
-
-	irq_complete_move(irq);
-	if (unlikely(irqd_is_setaffinity_pending(data))) {
-		do_unmask_irq = 1;
-		mask_irq(data);
-	} else
-		unmask_irq(data);
-
-	list_for_each_entry(rte, &iosapic_intr_info[irq].rtes, rte_list)
-		iosapic_eoi(rte->iosapic->addr, vec);
-
-	if (unlikely(do_unmask_irq)) {
-		irq_move_masked_irq(data);
-		unmask_irq(data);
-	}
-}
-
-#define iosapic_shutdown_level_irq	mask_irq
-#define iosapic_enable_level_irq	unmask_irq
-#define iosapic_disable_level_irq	mask_irq
-#define iosapic_ack_level_irq		iosapic_nop
-
-static struct irq_chip irq_type_iosapic_level = {
-	.name =			"IO-SAPIC-level",
-	.irq_startup =		iosapic_startup_level_irq,
-	.irq_shutdown =		iosapic_shutdown_level_irq,
-	.irq_enable =		iosapic_enable_level_irq,
-	.irq_disable =		iosapic_disable_level_irq,
-	.irq_ack =		iosapic_ack_level_irq,
-	.irq_mask =		mask_irq,
-	.irq_unmask =		iosapic_unmask_level_irq,
-	.irq_set_affinity =	iosapic_set_affinity
-};
-
-/*
- * Handlers for edge-triggered interrupts.
- */
-
-static unsigned int
-iosapic_startup_edge_irq (struct irq_data *data)
-{
-	unmask_irq(data);
-	/*
-	 * IOSAPIC simply drops interrupts pended while the
-	 * corresponding pin was masked, so we can't know if an
-	 * interrupt is pending already.  Let's hope not...
-	 */
-	return 0;
-}
-
-static void
-iosapic_ack_edge_irq (struct irq_data *data)
-{
-	irq_complete_move(data->irq);
-	irq_move_irq(data);
-}
-
-#define iosapic_enable_edge_irq		unmask_irq
-#define iosapic_disable_edge_irq	iosapic_nop
-
-static struct irq_chip irq_type_iosapic_edge = {
-	.name =			"IO-SAPIC-edge",
-	.irq_startup =		iosapic_startup_edge_irq,
-	.irq_shutdown =		iosapic_disable_edge_irq,
-	.irq_enable =		iosapic_enable_edge_irq,
-	.irq_disable =		iosapic_disable_edge_irq,
-	.irq_ack =		iosapic_ack_edge_irq,
-	.irq_mask =		mask_irq,
-	.irq_unmask =		unmask_irq,
-	.irq_set_affinity =	iosapic_set_affinity
-};
-
-static unsigned int
-iosapic_version (char __iomem *addr)
-{
-	/*
-	 * IOSAPIC Version Register return 32 bit structure like:
-	 * {
-	 *	unsigned int version   : 8;
-	 *	unsigned int reserved1 : 8;
-	 *	unsigned int max_redir : 8;
-	 *	unsigned int reserved2 : 8;
-	 * }
-	 */
-	return __iosapic_read(addr, IOSAPIC_VERSION);
-}
-
-static int iosapic_find_sharable_irq(unsigned long trigger, unsigned long pol)
-{
-	int i, irq = -ENOSPC, min_count = -1;
-	struct iosapic_intr_info *info;
-
-	/*
-	 * shared vectors for edge-triggered interrupts are not
-	 * supported yet
-	 */
-	if (trigger == IOSAPIC_EDGE)
-		return -EINVAL;
-
-	for (i = 0; i < NR_IRQS; i++) {
-		info = &iosapic_intr_info[i];
-		if (info->trigger == trigger && info->polarity == pol &&
-		    (info->dmode == IOSAPIC_FIXED ||
-		     info->dmode == IOSAPIC_LOWEST_PRIORITY) &&
-		    can_request_irq(i, IRQF_SHARED)) {
-			if (min_count == -1 || info->count < min_count) {
-				irq = i;
-				min_count = info->count;
-			}
-		}
-	}
-	return irq;
-}
-
-/*
- * if the given vector is already owned by other,
- *  assign a new vector for the other and make the vector available
- */
-static void __init
-iosapic_reassign_vector (int irq)
-{
-	int new_irq;
-
-	if (iosapic_intr_info[irq].count) {
-		new_irq = create_irq();
-		if (new_irq < 0)
-			panic("%s: out of interrupt vectors!\n", __func__);
-		printk(KERN_INFO "Reassigning vector %d to %d\n",
-		       irq_to_vector(irq), irq_to_vector(new_irq));
-		memcpy(&iosapic_intr_info[new_irq], &iosapic_intr_info[irq],
-		       sizeof(struct iosapic_intr_info));
-		INIT_LIST_HEAD(&iosapic_intr_info[new_irq].rtes);
-		list_move(iosapic_intr_info[irq].rtes.next,
-			  &iosapic_intr_info[new_irq].rtes);
-		memset(&iosapic_intr_info[irq], 0,
-		       sizeof(struct iosapic_intr_info));
-		iosapic_intr_info[irq].low32 = IOSAPIC_MASK;
-		INIT_LIST_HEAD(&iosapic_intr_info[irq].rtes);
-	}
-}
-
-static inline int irq_is_shared (int irq)
-{
-	return (iosapic_intr_info[irq].count > 1);
-}
-
-struct irq_chip*
-ia64_native_iosapic_get_irq_chip(unsigned long trigger)
-{
-	if (trigger == IOSAPIC_EDGE)
-		return &irq_type_iosapic_edge;
-	else
-		return &irq_type_iosapic_level;
-}
-
-static int
-register_intr (unsigned int gsi, int irq, unsigned char delivery,
-	       unsigned long polarity, unsigned long trigger)
-{
-	struct irq_chip *chip, *irq_type;
-	int index;
-	struct iosapic_rte_info *rte;
-
-	index = find_iosapic(gsi);
-	if (index < 0) {
-		printk(KERN_WARNING "%s: No IOSAPIC for GSI %u\n",
-		       __func__, gsi);
-		return -ENODEV;
-	}
-
-	rte = find_rte(irq, gsi);
-	if (!rte) {
-		rte = kzalloc(sizeof (*rte), GFP_ATOMIC);
-		if (!rte) {
-			printk(KERN_WARNING "%s: cannot allocate memory\n",
-			       __func__);
-			return -ENOMEM;
-		}
-
-		rte->iosapic	= &iosapic_lists[index];
-		rte->rte_index	= gsi - rte->iosapic->gsi_base;
-		rte->refcnt++;
-		list_add_tail(&rte->rte_list, &iosapic_intr_info[irq].rtes);
-		iosapic_intr_info[irq].count++;
-		iosapic_lists[index].rtes_inuse++;
-	}
-	else if (rte->refcnt == NO_REF_RTE) {
-		struct iosapic_intr_info *info = &iosapic_intr_info[irq];
-		if (info->count > 0 &&
-		    (info->trigger != trigger || info->polarity != polarity)){
-			printk (KERN_WARNING
-				"%s: cannot override the interrupt\n",
-				__func__);
-			return -EINVAL;
-		}
-		rte->refcnt++;
-		iosapic_intr_info[irq].count++;
-		iosapic_lists[index].rtes_inuse++;
-	}
-
-	iosapic_intr_info[irq].polarity = polarity;
-	iosapic_intr_info[irq].dmode    = delivery;
-	iosapic_intr_info[irq].trigger  = trigger;
-
-	irq_type = iosapic_get_irq_chip(trigger);
-
-	chip = irq_get_chip(irq);
-	if (irq_type != NULL && chip != irq_type) {
-		if (chip != &no_irq_chip)
-			printk(KERN_WARNING
-			       "%s: changing vector %d from %s to %s\n",
-			       __func__, irq_to_vector(irq),
-			       chip->name, irq_type->name);
-		chip = irq_type;
-	}
-	irq_set_chip_handler_name_locked(irq_get_irq_data(irq), chip,
-		trigger == IOSAPIC_EDGE ? handle_edge_irq : handle_level_irq,
-		NULL);
-	return 0;
-}
-
-static unsigned int
-get_target_cpu (unsigned int gsi, int irq)
-{
-#ifdef CONFIG_SMP
-	static int cpu = -1;
-	extern int cpe_vector;
-	cpumask_t domain = irq_to_domain(irq);
-
-	/*
-	 * In case of vector shared by multiple RTEs, all RTEs that
-	 * share the vector need to use the same destination CPU.
-	 */
-	if (iosapic_intr_info[irq].count)
-		return iosapic_intr_info[irq].dest;
-
-	/*
-	 * If the platform supports redirection via XTP, let it
-	 * distribute interrupts.
-	 */
-	if (smp_int_redirect & SMP_IRQ_REDIRECTION)
-		return cpu_physical_id(smp_processor_id());
-
-	/*
-	 * Some interrupts (ACPI SCI, for instance) are registered
-	 * before the BSP is marked as online.
-	 */
-	if (!cpu_online(smp_processor_id()))
-		return cpu_physical_id(smp_processor_id());
-
-	if (cpe_vector > 0 && irq_to_vector(irq) == IA64_CPEP_VECTOR)
-		return get_cpei_target_cpu();
-
-#ifdef CONFIG_NUMA
-	{
-		int num_cpus, cpu_index, iosapic_index, numa_cpu, i = 0;
-		const struct cpumask *cpu_mask;
-
-		iosapic_index = find_iosapic(gsi);
-		if (iosapic_index < 0 ||
-		    iosapic_lists[iosapic_index].node == MAX_NUMNODES)
-			goto skip_numa_setup;
-
-		cpu_mask = cpumask_of_node(iosapic_lists[iosapic_index].node);
-		num_cpus = 0;
-		for_each_cpu_and(numa_cpu, cpu_mask, &domain) {
-			if (cpu_online(numa_cpu))
-				num_cpus++;
-		}
-
-		if (!num_cpus)
-			goto skip_numa_setup;
-
-		/* Use irq assignment to distribute across cpus in node */
-		cpu_index = irq % num_cpus;
-
-		for_each_cpu_and(numa_cpu, cpu_mask, &domain)
-			if (cpu_online(numa_cpu) && i++ >= cpu_index)
-				break;
-
-		if (numa_cpu < nr_cpu_ids)
-			return cpu_physical_id(numa_cpu);
-	}
-skip_numa_setup:
-#endif
-	/*
-	 * Otherwise, round-robin interrupt vectors across all the
-	 * processors.  (It'd be nice if we could be smarter in the
-	 * case of NUMA.)
-	 */
-	do {
-		if (++cpu >= nr_cpu_ids)
-			cpu = 0;
-	} while (!cpu_online(cpu) || !cpumask_test_cpu(cpu, &domain));
-
-	return cpu_physical_id(cpu);
-#else  /* CONFIG_SMP */
-	return cpu_physical_id(smp_processor_id());
-#endif
-}
-
-static inline unsigned char choose_dmode(void)
-{
-#ifdef CONFIG_SMP
-	if (smp_int_redirect & SMP_IRQ_REDIRECTION)
-		return IOSAPIC_LOWEST_PRIORITY;
-#endif
-	return IOSAPIC_FIXED;
-}
-
-/*
- * ACPI can describe IOSAPIC interrupts via static tables and namespace
- * methods.  This provides an interface to register those interrupts and
- * program the IOSAPIC RTE.
- */
-int
-iosapic_register_intr (unsigned int gsi,
-		       unsigned long polarity, unsigned long trigger)
-{
-	int irq, mask = 1, err;
-	unsigned int dest;
-	unsigned long flags;
-	struct iosapic_rte_info *rte;
-	u32 low32;
-	unsigned char dmode;
-	struct irq_desc *desc;
-
-	/*
-	 * If this GSI has already been registered (i.e., it's a
-	 * shared interrupt, or we lost a race to register it),
-	 * don't touch the RTE.
-	 */
-	spin_lock_irqsave(&iosapic_lock, flags);
-	irq = __gsi_to_irq(gsi);
-	if (irq > 0) {
-		rte = find_rte(irq, gsi);
-		if(iosapic_intr_info[irq].count == 0) {
-			assign_irq_vector(irq);
-			irq_init_desc(irq);
-		} else if (rte->refcnt != NO_REF_RTE) {
-			rte->refcnt++;
-			goto unlock_iosapic_lock;
-		}
-	} else
-		irq = create_irq();
-
-	/* If vector is running out, we try to find a sharable vector */
-	if (irq < 0) {
-		irq = iosapic_find_sharable_irq(trigger, polarity);
-		if (irq < 0)
-			goto unlock_iosapic_lock;
-	}
-
-	desc = irq_to_desc(irq);
-	raw_spin_lock(&desc->lock);
-	dest = get_target_cpu(gsi, irq);
-	dmode = choose_dmode();
-	err = register_intr(gsi, irq, dmode, polarity, trigger);
-	if (err < 0) {
-		raw_spin_unlock(&desc->lock);
-		irq = err;
-		goto unlock_iosapic_lock;
-	}
-
-	/*
-	 * If the vector is shared and already unmasked for other
-	 * interrupt sources, don't mask it.
-	 */
-	low32 = iosapic_intr_info[irq].low32;
-	if (irq_is_shared(irq) && !(low32 & IOSAPIC_MASK))
-		mask = 0;
-	set_rte(gsi, irq, dest, mask);
-
-	printk(KERN_INFO "GSI %u (%s, %s) -> CPU %d (0x%04x) vector %d\n",
-	       gsi, (trigger == IOSAPIC_EDGE ? "edge" : "level"),
-	       (polarity == IOSAPIC_POL_HIGH ? "high" : "low"),
-	       cpu_logical_id(dest), dest, irq_to_vector(irq));
-
-	raw_spin_unlock(&desc->lock);
- unlock_iosapic_lock:
-	spin_unlock_irqrestore(&iosapic_lock, flags);
-	return irq;
-}
-
-void
-iosapic_unregister_intr (unsigned int gsi)
-{
-	unsigned long flags;
-	int irq, index;
-	u32 low32;
-	unsigned long trigger, polarity;
-	unsigned int dest;
-	struct iosapic_rte_info *rte;
-
-	/*
-	 * If the irq associated with the gsi is not found,
-	 * iosapic_unregister_intr() is unbalanced. We need to check
-	 * this again after getting locks.
-	 */
-	irq = gsi_to_irq(gsi);
-	if (irq < 0) {
-		printk(KERN_ERR "iosapic_unregister_intr(%u) unbalanced\n",
-		       gsi);
-		WARN_ON(1);
-		return;
-	}
-
-	spin_lock_irqsave(&iosapic_lock, flags);
-	if ((rte = find_rte(irq, gsi)) == NULL) {
-		printk(KERN_ERR "iosapic_unregister_intr(%u) unbalanced\n",
-		       gsi);
-		WARN_ON(1);
-		goto out;
-	}
-
-	if (--rte->refcnt > 0)
-		goto out;
-
-	rte->refcnt = NO_REF_RTE;
-
-	/* Mask the interrupt */
-	low32 = iosapic_intr_info[irq].low32 | IOSAPIC_MASK;
-	iosapic_write(rte->iosapic, IOSAPIC_RTE_LOW(rte->rte_index), low32);
-
-	iosapic_intr_info[irq].count--;
-	index = find_iosapic(gsi);
-	iosapic_lists[index].rtes_inuse--;
-	WARN_ON(iosapic_lists[index].rtes_inuse < 0);
-
-	trigger  = iosapic_intr_info[irq].trigger;
-	polarity = iosapic_intr_info[irq].polarity;
-	dest     = iosapic_intr_info[irq].dest;
-	printk(KERN_INFO
-	       "GSI %u (%s, %s) -> CPU %d (0x%04x) vector %d unregistered\n",
-	       gsi, (trigger == IOSAPIC_EDGE ? "edge" : "level"),
-	       (polarity == IOSAPIC_POL_HIGH ? "high" : "low"),
-	       cpu_logical_id(dest), dest, irq_to_vector(irq));
-
-	if (iosapic_intr_info[irq].count == 0) {
-#ifdef CONFIG_SMP
-		/* Clear affinity */
-		irq_data_update_affinity(irq_get_irq_data(irq), cpu_all_mask);
-#endif
-		/* Clear the interrupt information */
-		iosapic_intr_info[irq].dest = 0;
-		iosapic_intr_info[irq].dmode = 0;
-		iosapic_intr_info[irq].polarity = 0;
-		iosapic_intr_info[irq].trigger = 0;
-		iosapic_intr_info[irq].low32 |= IOSAPIC_MASK;
-
-		/* Destroy and reserve IRQ */
-		destroy_and_reserve_irq(irq);
-	}
- out:
-	spin_unlock_irqrestore(&iosapic_lock, flags);
-}
-
-/*
- * ACPI calls this when it finds an entry for a platform interrupt.
- */
-int __init
-iosapic_register_platform_intr (u32 int_type, unsigned int gsi,
-				int iosapic_vector, u16 eid, u16 id,
-				unsigned long polarity, unsigned long trigger)
-{
-	static const char * const name[] = {"unknown", "PMI", "INIT", "CPEI"};
-	unsigned char delivery;
-	int irq, vector, mask = 0;
-	unsigned int dest = ((id << 8) | eid) & 0xffff;
-
-	switch (int_type) {
-	      case ACPI_INTERRUPT_PMI:
-		irq = vector = iosapic_vector;
-		bind_irq_vector(irq, vector, CPU_MASK_ALL);
-		/*
-		 * since PMI vector is alloc'd by FW(ACPI) not by kernel,
-		 * we need to make sure the vector is available
-		 */
-		iosapic_reassign_vector(irq);
-		delivery = IOSAPIC_PMI;
-		break;
-	      case ACPI_INTERRUPT_INIT:
-		irq = create_irq();
-		if (irq < 0)
-			panic("%s: out of interrupt vectors!\n", __func__);
-		vector = irq_to_vector(irq);
-		delivery = IOSAPIC_INIT;
-		break;
-	      case ACPI_INTERRUPT_CPEI:
-		irq = vector = IA64_CPE_VECTOR;
-		BUG_ON(bind_irq_vector(irq, vector, CPU_MASK_ALL));
-		delivery = IOSAPIC_FIXED;
-		mask = 1;
-		break;
-	      default:
-		printk(KERN_ERR "%s: invalid int type 0x%x\n", __func__,
-		       int_type);
-		return -1;
-	}
-
-	register_intr(gsi, irq, delivery, polarity, trigger);
-
-	printk(KERN_INFO
-	       "PLATFORM int %s (0x%x): GSI %u (%s, %s) -> CPU %d (0x%04x)"
-	       " vector %d\n",
-	       int_type < ARRAY_SIZE(name) ? name[int_type] : "unknown",
-	       int_type, gsi, (trigger == IOSAPIC_EDGE ? "edge" : "level"),
-	       (polarity == IOSAPIC_POL_HIGH ? "high" : "low"),
-	       cpu_logical_id(dest), dest, vector);
-
-	set_rte(gsi, irq, dest, mask);
-	return vector;
-}
-
-/*
- * ACPI calls this when it finds an entry for a legacy ISA IRQ override.
- */
-void iosapic_override_isa_irq(unsigned int isa_irq, unsigned int gsi,
-			      unsigned long polarity, unsigned long trigger)
-{
-	int vector, irq;
-	unsigned int dest = cpu_physical_id(smp_processor_id());
-	unsigned char dmode;
-
-	irq = vector = isa_irq_to_vector(isa_irq);
-	BUG_ON(bind_irq_vector(irq, vector, CPU_MASK_ALL));
-	dmode = choose_dmode();
-	register_intr(gsi, irq, dmode, polarity, trigger);
-
-	DBG("ISA: IRQ %u -> GSI %u (%s,%s) -> CPU %d (0x%04x) vector %d\n",
-	    isa_irq, gsi, trigger == IOSAPIC_EDGE ? "edge" : "level",
-	    polarity == IOSAPIC_POL_HIGH ? "high" : "low",
-	    cpu_logical_id(dest), dest, vector);
-
-	set_rte(gsi, irq, dest, 1);
-}
-
-void __init
-ia64_native_iosapic_pcat_compat_init(void)
-{
-	if (pcat_compat) {
-		/*
-		 * Disable the compatibility mode interrupts (8259 style),
-		 * needs IN/OUT support enabled.
-		 */
-		printk(KERN_INFO
-		       "%s: Disabling PC-AT compatible 8259 interrupts\n",
-		       __func__);
-		outb(0xff, 0xA1);
-		outb(0xff, 0x21);
-	}
-}
-
-void __init
-iosapic_system_init (int system_pcat_compat)
-{
-	int irq;
-
-	for (irq = 0; irq < NR_IRQS; ++irq) {
-		iosapic_intr_info[irq].low32 = IOSAPIC_MASK;
-		/* mark as unused */
-		INIT_LIST_HEAD(&iosapic_intr_info[irq].rtes);
-
-		iosapic_intr_info[irq].count = 0;
-	}
-
-	pcat_compat = system_pcat_compat;
-	if (pcat_compat)
-		iosapic_pcat_compat_init();
-}
-
-static inline int
-iosapic_alloc (void)
-{
-	int index;
-
-	for (index = 0; index < NR_IOSAPICS; index++)
-		if (!iosapic_lists[index].addr)
-			return index;
-
-	printk(KERN_WARNING "%s: failed to allocate iosapic\n", __func__);
-	return -1;
-}
-
-static inline void
-iosapic_free (int index)
-{
-	memset(&iosapic_lists[index], 0, sizeof(iosapic_lists[0]));
-}
-
-static inline int
-iosapic_check_gsi_range (unsigned int gsi_base, unsigned int ver)
-{
-	int index;
-	unsigned int gsi_end, base, end;
-
-	/* check gsi range */
-	gsi_end = gsi_base + ((ver >> 16) & 0xff);
-	for (index = 0; index < NR_IOSAPICS; index++) {
-		if (!iosapic_lists[index].addr)
-			continue;
-
-		base = iosapic_lists[index].gsi_base;
-		end  = base + iosapic_lists[index].num_rte - 1;
-
-		if (gsi_end < base || end < gsi_base)
-			continue; /* OK */
-
-		return -EBUSY;
-	}
-	return 0;
-}
-
-static int
-iosapic_delete_rte(unsigned int irq, unsigned int gsi)
-{
-	struct iosapic_rte_info *rte, *temp;
-
-	list_for_each_entry_safe(rte, temp, &iosapic_intr_info[irq].rtes,
-								rte_list) {
-		if (rte->iosapic->gsi_base + rte->rte_index == gsi) {
-			if (rte->refcnt)
-				return -EBUSY;
-
-			list_del(&rte->rte_list);
-			kfree(rte);
-			return 0;
-		}
-	}
-
-	return -EINVAL;
-}
-
-int iosapic_init(unsigned long phys_addr, unsigned int gsi_base)
-{
-	int num_rte, err, index;
-	unsigned int isa_irq, ver;
-	char __iomem *addr;
-	unsigned long flags;
-
-	spin_lock_irqsave(&iosapic_lock, flags);
-	index = find_iosapic(gsi_base);
-	if (index >= 0) {
-		spin_unlock_irqrestore(&iosapic_lock, flags);
-		return -EBUSY;
-	}
-
-	addr = ioremap(phys_addr, 0);
-	if (addr == NULL) {
-		spin_unlock_irqrestore(&iosapic_lock, flags);
-		return -ENOMEM;
-	}
-	ver = iosapic_version(addr);
-	if ((err = iosapic_check_gsi_range(gsi_base, ver))) {
-		iounmap(addr);
-		spin_unlock_irqrestore(&iosapic_lock, flags);
-		return err;
-	}
-
-	/*
-	 * The MAX_REDIR register holds the highest input pin number
-	 * (starting from 0).  We add 1 so that we can use it for
-	 * number of pins (= RTEs)
-	 */
-	num_rte = ((ver >> 16) & 0xff) + 1;
-
-	index = iosapic_alloc();
-	iosapic_lists[index].addr = addr;
-	iosapic_lists[index].gsi_base = gsi_base;
-	iosapic_lists[index].num_rte = num_rte;
-#ifdef CONFIG_NUMA
-	iosapic_lists[index].node = MAX_NUMNODES;
-#endif
-	spin_lock_init(&iosapic_lists[index].lock);
-	spin_unlock_irqrestore(&iosapic_lock, flags);
-
-	if ((gsi_base == 0) && pcat_compat) {
-		/*
-		 * Map the legacy ISA devices into the IOSAPIC data.  Some of
-		 * these may get reprogrammed later on with data from the ACPI
-		 * Interrupt Source Override table.
-		 */
-		for (isa_irq = 0; isa_irq < 16; ++isa_irq)
-			iosapic_override_isa_irq(isa_irq, isa_irq,
-						 IOSAPIC_POL_HIGH,
-						 IOSAPIC_EDGE);
-	}
-	return 0;
-}
-
-int iosapic_remove(unsigned int gsi_base)
-{
-	int i, irq, index, err = 0;
-	unsigned long flags;
-
-	spin_lock_irqsave(&iosapic_lock, flags);
-	index = find_iosapic(gsi_base);
-	if (index < 0) {
-		printk(KERN_WARNING "%s: No IOSAPIC for GSI base %u\n",
-		       __func__, gsi_base);
-		goto out;
-	}
-
-	if (iosapic_lists[index].rtes_inuse) {
-		err = -EBUSY;
-		printk(KERN_WARNING "%s: IOSAPIC for GSI base %u is busy\n",
-		       __func__, gsi_base);
-		goto out;
-	}
-
-	for (i = gsi_base; i < gsi_base + iosapic_lists[index].num_rte; i++) {
-		irq = __gsi_to_irq(i);
-		if (irq < 0)
-			continue;
-
-		err = iosapic_delete_rte(irq, i);
-		if (err)
-			goto out;
-	}
-
-	iounmap(iosapic_lists[index].addr);
-	iosapic_free(index);
- out:
-	spin_unlock_irqrestore(&iosapic_lock, flags);
-	return err;
-}
-
-#ifdef CONFIG_NUMA
-void map_iosapic_to_node(unsigned int gsi_base, int node)
-{
-	int index;
-
-	index = find_iosapic(gsi_base);
-	if (index < 0) {
-		printk(KERN_WARNING "%s: No IOSAPIC for GSI %u\n",
-		       __func__, gsi_base);
-		return;
-	}
-	iosapic_lists[index].node = node;
-	return;
-}
-#endif
diff --git a/arch/ia64/kernel/irq.c b/arch/ia64/kernel/irq.c
deleted file mode 100644
index 275b9ea58c64..000000000000
--- a/arch/ia64/kernel/irq.c
+++ /dev/null
@@ -1,181 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- *	linux/arch/ia64/kernel/irq.c
- *
- *	Copyright (C) 1992, 1998 Linus Torvalds, Ingo Molnar
- *
- * This file contains the code used by various IRQ handling routines:
- * asking for different IRQs should be done through these routines
- * instead of just grabbing them. Thus setups with different IRQ numbers
- * shouldn't result in any weird surprises, and installing new handlers
- * should be easier.
- *
- * Copyright (C) Ashok Raj<ashok.raj@intel.com>, Intel Corporation 2004
- *
- * 4/14/2004: Added code to handle cpu migration and do safe irq
- *			migration without losing interrupts for iosapic
- *			architecture.
- */
-
-#include <asm/delay.h>
-#include <linux/uaccess.h>
-#include <linux/module.h>
-#include <linux/seq_file.h>
-#include <linux/interrupt.h>
-#include <linux/kernel_stat.h>
-
-#include <asm/mca.h>
-#include <asm/xtp.h>
-
-/*
- * 'what should we do if we get a hw irq event on an illegal vector'.
- * each architecture has to answer this themselves.
- */
-void ack_bad_irq(unsigned int irq)
-{
-	printk(KERN_ERR "Unexpected irq vector 0x%x on CPU %u!\n", irq, smp_processor_id());
-}
-
-/*
- * Interrupt statistics:
- */
-
-atomic_t irq_err_count;
-
-/*
- * /proc/interrupts printing:
- */
-int arch_show_interrupts(struct seq_file *p, int prec)
-{
-	seq_printf(p, "ERR: %10u\n", atomic_read(&irq_err_count));
-	return 0;
-}
-
-#ifdef CONFIG_SMP
-static char irq_redir [NR_IRQS]; // = { [0 ... NR_IRQS-1] = 1 };
-
-void set_irq_affinity_info (unsigned int irq, int hwid, int redir)
-{
-	if (irq < NR_IRQS) {
-		irq_data_update_affinity(irq_get_irq_data(irq),
-					 cpumask_of(cpu_logical_id(hwid)));
-		irq_redir[irq] = (char) (redir & 0xff);
-	}
-}
-#endif /* CONFIG_SMP */
-
-int __init arch_early_irq_init(void)
-{
-	ia64_mca_irq_init();
-	return 0;
-}
-
-#ifdef CONFIG_HOTPLUG_CPU
-unsigned int vectors_in_migration[NR_IRQS];
-
-/*
- * Since cpu_online_mask is already updated, we just need to check for
- * affinity that has zeros
- */
-static void migrate_irqs(void)
-{
-	int 		irq, new_cpu;
-
-	for (irq=0; irq < NR_IRQS; irq++) {
-		struct irq_desc *desc = irq_to_desc(irq);
-		struct irq_data *data = irq_desc_get_irq_data(desc);
-		struct irq_chip *chip = irq_data_get_irq_chip(data);
-
-		if (irqd_irq_disabled(data))
-			continue;
-
-		/*
-		 * No handling for now.
-		 * TBD: Implement a disable function so we can now
-		 * tell CPU not to respond to these local intr sources.
-		 * such as ITV,CPEI,MCA etc.
-		 */
-		if (irqd_is_per_cpu(data))
-			continue;
-
-		if (cpumask_any_and(irq_data_get_affinity_mask(data),
-				    cpu_online_mask) >= nr_cpu_ids) {
-			/*
-			 * Save it for phase 2 processing
-			 */
-			vectors_in_migration[irq] = irq;
-
-			new_cpu = cpumask_any(cpu_online_mask);
-
-			/*
-			 * Al three are essential, currently WARN_ON.. maybe panic?
-			 */
-			if (chip && chip->irq_disable &&
-				chip->irq_enable && chip->irq_set_affinity) {
-				chip->irq_disable(data);
-				chip->irq_set_affinity(data,
-						       cpumask_of(new_cpu), false);
-				chip->irq_enable(data);
-			} else {
-				WARN_ON((!chip || !chip->irq_disable ||
-					 !chip->irq_enable ||
-					 !chip->irq_set_affinity));
-			}
-		}
-	}
-}
-
-void fixup_irqs(void)
-{
-	unsigned int irq;
-	extern void ia64_process_pending_intr(void);
-	extern volatile int time_keeper_id;
-
-	/* Mask ITV to disable timer */
-	ia64_set_itv(1 << 16);
-
-	/*
-	 * Find a new timesync master
-	 */
-	if (smp_processor_id() == time_keeper_id) {
-		time_keeper_id = cpumask_first(cpu_online_mask);
-		printk ("CPU %d is now promoted to time-keeper master\n", time_keeper_id);
-	}
-
-	/*
-	 * Phase 1: Locate IRQs bound to this cpu and
-	 * relocate them for cpu removal.
-	 */
-	migrate_irqs();
-
-	/*
-	 * Phase 2: Perform interrupt processing for all entries reported in
-	 * local APIC.
-	 */
-	ia64_process_pending_intr();
-
-	/*
-	 * Phase 3: Now handle any interrupts not captured in local APIC.
-	 * This is to account for cases that device interrupted during the time the
-	 * rte was being disabled and re-programmed.
-	 */
-	for (irq=0; irq < NR_IRQS; irq++) {
-		if (vectors_in_migration[irq]) {
-			struct pt_regs *old_regs = set_irq_regs(NULL);
-
-			vectors_in_migration[irq]=0;
-			generic_handle_irq(irq);
-			set_irq_regs(old_regs);
-		}
-	}
-
-	/*
-	 * Now let processor die. We do irq disable and max_xtp() to
-	 * ensure there is no more interrupts routed to this processor.
-	 * But the local timer interrupt can have 1 pending which we
-	 * take care in timer_interrupt().
-	 */
-	max_xtp();
-	local_irq_disable();
-}
-#endif
diff --git a/arch/ia64/kernel/irq.h b/arch/ia64/kernel/irq.h
deleted file mode 100644
index 4d16f3cbeb1d..000000000000
--- a/arch/ia64/kernel/irq.h
+++ /dev/null
@@ -1,3 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-extern void register_percpu_irq(ia64_vector vec, irq_handler_t handler,
-				unsigned long flags, const char *name);
diff --git a/arch/ia64/kernel/irq_ia64.c b/arch/ia64/kernel/irq_ia64.c
deleted file mode 100644
index 46e33c5cb53d..000000000000
--- a/arch/ia64/kernel/irq_ia64.c
+++ /dev/null
@@ -1,645 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * linux/arch/ia64/kernel/irq_ia64.c
- *
- * Copyright (C) 1998-2001 Hewlett-Packard Co
- *	Stephane Eranian <eranian@hpl.hp.com>
- *	David Mosberger-Tang <davidm@hpl.hp.com>
- *
- *  6/10/99: Updated to bring in sync with x86 version to facilitate
- *	     support for SMP and different interrupt controllers.
- *
- * 09/15/00 Goutham Rao <goutham.rao@intel.com> Implemented pci_irq_to_vector
- *                      PCI to vector allocation routine.
- * 04/14/2004 Ashok Raj <ashok.raj@intel.com>
- *						Added CPU Hotplug handling for IPF.
- */
-
-#include <linux/module.h>
-#include <linux/pgtable.h>
-
-#include <linux/jiffies.h>
-#include <linux/errno.h>
-#include <linux/init.h>
-#include <linux/interrupt.h>
-#include <linux/ioport.h>
-#include <linux/kernel_stat.h>
-#include <linux/ptrace.h>
-#include <linux/signal.h>
-#include <linux/smp.h>
-#include <linux/threads.h>
-#include <linux/bitops.h>
-#include <linux/irq.h>
-#include <linux/ratelimit.h>
-#include <linux/acpi.h>
-#include <linux/sched.h>
-
-#include <asm/delay.h>
-#include <asm/intrinsics.h>
-#include <asm/io.h>
-#include <asm/hw_irq.h>
-#include <asm/tlbflush.h>
-
-#define IRQ_DEBUG	0
-
-#define IRQ_VECTOR_UNASSIGNED	(0)
-
-#define IRQ_UNUSED		(0)
-#define IRQ_USED		(1)
-#define IRQ_RSVD		(2)
-
-int ia64_first_device_vector = IA64_DEF_FIRST_DEVICE_VECTOR;
-int ia64_last_device_vector = IA64_DEF_LAST_DEVICE_VECTOR;
-
-/* default base addr of IPI table */
-void __iomem *ipi_base_addr = ((void __iomem *)
-			       (__IA64_UNCACHED_OFFSET | IA64_IPI_DEFAULT_BASE_ADDR));
-
-static cpumask_t vector_allocation_domain(int cpu);
-
-/*
- * Legacy IRQ to IA-64 vector translation table.
- */
-__u8 isa_irq_to_vector_map[16] = {
-	/* 8259 IRQ translation, first 16 entries */
-	0x2f, 0x20, 0x2e, 0x2d, 0x2c, 0x2b, 0x2a, 0x29,
-	0x28, 0x27, 0x26, 0x25, 0x24, 0x23, 0x22, 0x21
-};
-EXPORT_SYMBOL(isa_irq_to_vector_map);
-
-DEFINE_SPINLOCK(vector_lock);
-
-struct irq_cfg irq_cfg[NR_IRQS] __read_mostly = {
-	[0 ... NR_IRQS - 1] = {
-		.vector = IRQ_VECTOR_UNASSIGNED,
-		.domain = CPU_MASK_NONE
-	}
-};
-
-DEFINE_PER_CPU(int[IA64_NUM_VECTORS], vector_irq) = {
-	[0 ... IA64_NUM_VECTORS - 1] = -1
-};
-
-static cpumask_t vector_table[IA64_NUM_VECTORS] = {
-	[0 ... IA64_NUM_VECTORS - 1] = CPU_MASK_NONE
-};
-
-static int irq_status[NR_IRQS] = {
-	[0 ... NR_IRQS -1] = IRQ_UNUSED
-};
-
-static inline int find_unassigned_irq(void)
-{
-	int irq;
-
-	for (irq = IA64_FIRST_DEVICE_VECTOR; irq < NR_IRQS; irq++)
-		if (irq_status[irq] == IRQ_UNUSED)
-			return irq;
-	return -ENOSPC;
-}
-
-static inline int find_unassigned_vector(cpumask_t domain)
-{
-	cpumask_t mask;
-	int pos, vector;
-
-	cpumask_and(&mask, &domain, cpu_online_mask);
-	if (cpumask_empty(&mask))
-		return -EINVAL;
-
-	for (pos = 0; pos < IA64_NUM_DEVICE_VECTORS; pos++) {
-		vector = IA64_FIRST_DEVICE_VECTOR + pos;
-		cpumask_and(&mask, &domain, &vector_table[vector]);
-		if (!cpumask_empty(&mask))
-			continue;
-		return vector;
-	}
-	return -ENOSPC;
-}
-
-static int __bind_irq_vector(int irq, int vector, cpumask_t domain)
-{
-	cpumask_t mask;
-	int cpu;
-	struct irq_cfg *cfg = &irq_cfg[irq];
-
-	BUG_ON((unsigned)irq >= NR_IRQS);
-	BUG_ON((unsigned)vector >= IA64_NUM_VECTORS);
-
-	cpumask_and(&mask, &domain, cpu_online_mask);
-	if (cpumask_empty(&mask))
-		return -EINVAL;
-	if ((cfg->vector == vector) && cpumask_equal(&cfg->domain, &domain))
-		return 0;
-	if (cfg->vector != IRQ_VECTOR_UNASSIGNED)
-		return -EBUSY;
-	for_each_cpu(cpu, &mask)
-		per_cpu(vector_irq, cpu)[vector] = irq;
-	cfg->vector = vector;
-	cfg->domain = domain;
-	irq_status[irq] = IRQ_USED;
-	cpumask_or(&vector_table[vector], &vector_table[vector], &domain);
-	return 0;
-}
-
-int bind_irq_vector(int irq, int vector, cpumask_t domain)
-{
-	unsigned long flags;
-	int ret;
-
-	spin_lock_irqsave(&vector_lock, flags);
-	ret = __bind_irq_vector(irq, vector, domain);
-	spin_unlock_irqrestore(&vector_lock, flags);
-	return ret;
-}
-
-static void __clear_irq_vector(int irq)
-{
-	int vector, cpu;
-	cpumask_t domain;
-	struct irq_cfg *cfg = &irq_cfg[irq];
-
-	BUG_ON((unsigned)irq >= NR_IRQS);
-	BUG_ON(cfg->vector == IRQ_VECTOR_UNASSIGNED);
-	vector = cfg->vector;
-	domain = cfg->domain;
-	for_each_cpu_and(cpu, &cfg->domain, cpu_online_mask)
-		per_cpu(vector_irq, cpu)[vector] = -1;
-	cfg->vector = IRQ_VECTOR_UNASSIGNED;
-	cfg->domain = CPU_MASK_NONE;
-	irq_status[irq] = IRQ_UNUSED;
-	cpumask_andnot(&vector_table[vector], &vector_table[vector], &domain);
-}
-
-static void clear_irq_vector(int irq)
-{
-	unsigned long flags;
-
-	spin_lock_irqsave(&vector_lock, flags);
-	__clear_irq_vector(irq);
-	spin_unlock_irqrestore(&vector_lock, flags);
-}
-
-int
-ia64_native_assign_irq_vector (int irq)
-{
-	unsigned long flags;
-	int vector, cpu;
-	cpumask_t domain = CPU_MASK_NONE;
-
-	vector = -ENOSPC;
-
-	spin_lock_irqsave(&vector_lock, flags);
-	for_each_online_cpu(cpu) {
-		domain = vector_allocation_domain(cpu);
-		vector = find_unassigned_vector(domain);
-		if (vector >= 0)
-			break;
-	}
-	if (vector < 0)
-		goto out;
-	if (irq == AUTO_ASSIGN)
-		irq = vector;
-	BUG_ON(__bind_irq_vector(irq, vector, domain));
- out:
-	spin_unlock_irqrestore(&vector_lock, flags);
-	return vector;
-}
-
-void
-ia64_native_free_irq_vector (int vector)
-{
-	if (vector < IA64_FIRST_DEVICE_VECTOR ||
-	    vector > IA64_LAST_DEVICE_VECTOR)
-		return;
-	clear_irq_vector(vector);
-}
-
-int
-reserve_irq_vector (int vector)
-{
-	if (vector < IA64_FIRST_DEVICE_VECTOR ||
-	    vector > IA64_LAST_DEVICE_VECTOR)
-		return -EINVAL;
-	return !!bind_irq_vector(vector, vector, CPU_MASK_ALL);
-}
-
-/*
- * Initialize vector_irq on a new cpu. This function must be called
- * with vector_lock held.
- */
-void __setup_vector_irq(int cpu)
-{
-	int irq, vector;
-
-	/* Clear vector_irq */
-	for (vector = 0; vector < IA64_NUM_VECTORS; ++vector)
-		per_cpu(vector_irq, cpu)[vector] = -1;
-	/* Mark the inuse vectors */
-	for (irq = 0; irq < NR_IRQS; ++irq) {
-		if (!cpumask_test_cpu(cpu, &irq_cfg[irq].domain))
-			continue;
-		vector = irq_to_vector(irq);
-		per_cpu(vector_irq, cpu)[vector] = irq;
-	}
-}
-
-#ifdef CONFIG_SMP
-
-static enum vector_domain_type {
-	VECTOR_DOMAIN_NONE,
-	VECTOR_DOMAIN_PERCPU
-} vector_domain_type = VECTOR_DOMAIN_NONE;
-
-static cpumask_t vector_allocation_domain(int cpu)
-{
-	if (vector_domain_type == VECTOR_DOMAIN_PERCPU)
-		return *cpumask_of(cpu);
-	return CPU_MASK_ALL;
-}
-
-static int __irq_prepare_move(int irq, int cpu)
-{
-	struct irq_cfg *cfg = &irq_cfg[irq];
-	int vector;
-	cpumask_t domain;
-
-	if (cfg->move_in_progress || cfg->move_cleanup_count)
-		return -EBUSY;
-	if (cfg->vector == IRQ_VECTOR_UNASSIGNED || !cpu_online(cpu))
-		return -EINVAL;
-	if (cpumask_test_cpu(cpu, &cfg->domain))
-		return 0;
-	domain = vector_allocation_domain(cpu);
-	vector = find_unassigned_vector(domain);
-	if (vector < 0)
-		return -ENOSPC;
-	cfg->move_in_progress = 1;
-	cfg->old_domain = cfg->domain;
-	cfg->vector = IRQ_VECTOR_UNASSIGNED;
-	cfg->domain = CPU_MASK_NONE;
-	BUG_ON(__bind_irq_vector(irq, vector, domain));
-	return 0;
-}
-
-int irq_prepare_move(int irq, int cpu)
-{
-	unsigned long flags;
-	int ret;
-
-	spin_lock_irqsave(&vector_lock, flags);
-	ret = __irq_prepare_move(irq, cpu);
-	spin_unlock_irqrestore(&vector_lock, flags);
-	return ret;
-}
-
-void irq_complete_move(unsigned irq)
-{
-	struct irq_cfg *cfg = &irq_cfg[irq];
-	cpumask_t cleanup_mask;
-	int i;
-
-	if (likely(!cfg->move_in_progress))
-		return;
-
-	if (unlikely(cpumask_test_cpu(smp_processor_id(), &cfg->old_domain)))
-		return;
-
-	cpumask_and(&cleanup_mask, &cfg->old_domain, cpu_online_mask);
-	cfg->move_cleanup_count = cpumask_weight(&cleanup_mask);
-	for_each_cpu(i, &cleanup_mask)
-		ia64_send_ipi(i, IA64_IRQ_MOVE_VECTOR, IA64_IPI_DM_INT, 0);
-	cfg->move_in_progress = 0;
-}
-
-static irqreturn_t smp_irq_move_cleanup_interrupt(int irq, void *dev_id)
-{
-	int me = smp_processor_id();
-	ia64_vector vector;
-	unsigned long flags;
-
-	for (vector = IA64_FIRST_DEVICE_VECTOR;
-	     vector < IA64_LAST_DEVICE_VECTOR; vector++) {
-		int irq;
-		struct irq_desc *desc;
-		struct irq_cfg *cfg;
-		irq = __this_cpu_read(vector_irq[vector]);
-		if (irq < 0)
-			continue;
-
-		desc = irq_to_desc(irq);
-		cfg = irq_cfg + irq;
-		raw_spin_lock(&desc->lock);
-		if (!cfg->move_cleanup_count)
-			goto unlock;
-
-		if (!cpumask_test_cpu(me, &cfg->old_domain))
-			goto unlock;
-
-		spin_lock_irqsave(&vector_lock, flags);
-		__this_cpu_write(vector_irq[vector], -1);
-		cpumask_clear_cpu(me, &vector_table[vector]);
-		spin_unlock_irqrestore(&vector_lock, flags);
-		cfg->move_cleanup_count--;
-	unlock:
-		raw_spin_unlock(&desc->lock);
-	}
-	return IRQ_HANDLED;
-}
-
-static int __init parse_vector_domain(char *arg)
-{
-	if (!arg)
-		return -EINVAL;
-	if (!strcmp(arg, "percpu")) {
-		vector_domain_type = VECTOR_DOMAIN_PERCPU;
-		no_int_routing = 1;
-	}
-	return 0;
-}
-early_param("vector", parse_vector_domain);
-#else
-static cpumask_t vector_allocation_domain(int cpu)
-{
-	return CPU_MASK_ALL;
-}
-#endif
-
-
-void destroy_and_reserve_irq(unsigned int irq)
-{
-	unsigned long flags;
-
-	irq_init_desc(irq);
-	spin_lock_irqsave(&vector_lock, flags);
-	__clear_irq_vector(irq);
-	irq_status[irq] = IRQ_RSVD;
-	spin_unlock_irqrestore(&vector_lock, flags);
-}
-
-/*
- * Dynamic irq allocate and deallocation for MSI
- */
-int create_irq(void)
-{
-	unsigned long flags;
-	int irq, vector, cpu;
-	cpumask_t domain = CPU_MASK_NONE;
-
-	irq = vector = -ENOSPC;
-	spin_lock_irqsave(&vector_lock, flags);
-	for_each_online_cpu(cpu) {
-		domain = vector_allocation_domain(cpu);
-		vector = find_unassigned_vector(domain);
-		if (vector >= 0)
-			break;
-	}
-	if (vector < 0)
-		goto out;
-	irq = find_unassigned_irq();
-	if (irq < 0)
-		goto out;
-	BUG_ON(__bind_irq_vector(irq, vector, domain));
- out:
-	spin_unlock_irqrestore(&vector_lock, flags);
-	if (irq >= 0)
-		irq_init_desc(irq);
-	return irq;
-}
-
-void destroy_irq(unsigned int irq)
-{
-	irq_init_desc(irq);
-	clear_irq_vector(irq);
-}
-
-#ifdef CONFIG_SMP
-#	define IS_RESCHEDULE(vec)	(vec == IA64_IPI_RESCHEDULE)
-#	define IS_LOCAL_TLB_FLUSH(vec)	(vec == IA64_IPI_LOCAL_TLB_FLUSH)
-#else
-#	define IS_RESCHEDULE(vec)	(0)
-#	define IS_LOCAL_TLB_FLUSH(vec)	(0)
-#endif
-/*
- * That's where the IVT branches when we get an external
- * interrupt. This branches to the correct hardware IRQ handler via
- * function ptr.
- */
-void
-ia64_handle_irq (ia64_vector vector, struct pt_regs *regs)
-{
-	struct pt_regs *old_regs = set_irq_regs(regs);
-	unsigned long saved_tpr;
-
-#if IRQ_DEBUG
-	{
-		unsigned long bsp, sp;
-
-		/*
-		 * Note: if the interrupt happened while executing in
-		 * the context switch routine (ia64_switch_to), we may
-		 * get a spurious stack overflow here.  This is
-		 * because the register and the memory stack are not
-		 * switched atomically.
-		 */
-		bsp = ia64_getreg(_IA64_REG_AR_BSP);
-		sp = ia64_getreg(_IA64_REG_SP);
-
-		if ((sp - bsp) < 1024) {
-			static DEFINE_RATELIMIT_STATE(ratelimit, 5 * HZ, 5);
-
-			if (__ratelimit(&ratelimit)) {
-				printk("ia64_handle_irq: DANGER: less than "
-				       "1KB of free stack space!!\n"
-				       "(bsp=0x%lx, sp=%lx)\n", bsp, sp);
-			}
-		}
-	}
-#endif /* IRQ_DEBUG */
-
-	/*
-	 * Always set TPR to limit maximum interrupt nesting depth to
-	 * 16 (without this, it would be ~240, which could easily lead
-	 * to kernel stack overflows).
-	 */
-	irq_enter();
-	saved_tpr = ia64_getreg(_IA64_REG_CR_TPR);
-	ia64_srlz_d();
-	while (vector != IA64_SPURIOUS_INT_VECTOR) {
-		int irq = local_vector_to_irq(vector);
-
-		if (unlikely(IS_LOCAL_TLB_FLUSH(vector))) {
-			smp_local_flush_tlb();
-			kstat_incr_irq_this_cpu(irq);
-		} else if (unlikely(IS_RESCHEDULE(vector))) {
-			scheduler_ipi();
-			kstat_incr_irq_this_cpu(irq);
-		} else {
-			ia64_setreg(_IA64_REG_CR_TPR, vector);
-			ia64_srlz_d();
-
-			if (unlikely(irq < 0)) {
-				printk(KERN_ERR "%s: Unexpected interrupt "
-				       "vector %d on CPU %d is not mapped "
-				       "to any IRQ!\n", __func__, vector,
-				       smp_processor_id());
-			} else
-				generic_handle_irq(irq);
-
-			/*
-			 * Disable interrupts and send EOI:
-			 */
-			local_irq_disable();
-			ia64_setreg(_IA64_REG_CR_TPR, saved_tpr);
-		}
-		ia64_eoi();
-		vector = ia64_get_ivr();
-	}
-	/*
-	 * This must be done *after* the ia64_eoi().  For example, the keyboard softirq
-	 * handler needs to be able to wait for further keyboard interrupts, which can't
-	 * come through until ia64_eoi() has been done.
-	 */
-	irq_exit();
-	set_irq_regs(old_regs);
-}
-
-#ifdef CONFIG_HOTPLUG_CPU
-/*
- * This function emulates a interrupt processing when a cpu is about to be
- * brought down.
- */
-void ia64_process_pending_intr(void)
-{
-	ia64_vector vector;
-	unsigned long saved_tpr;
-	extern unsigned int vectors_in_migration[NR_IRQS];
-
-	vector = ia64_get_ivr();
-
-	irq_enter();
-	saved_tpr = ia64_getreg(_IA64_REG_CR_TPR);
-	ia64_srlz_d();
-
-	 /*
-	  * Perform normal interrupt style processing
-	  */
-	while (vector != IA64_SPURIOUS_INT_VECTOR) {
-		int irq = local_vector_to_irq(vector);
-
-		if (unlikely(IS_LOCAL_TLB_FLUSH(vector))) {
-			smp_local_flush_tlb();
-			kstat_incr_irq_this_cpu(irq);
-		} else if (unlikely(IS_RESCHEDULE(vector))) {
-			kstat_incr_irq_this_cpu(irq);
-		} else {
-			struct pt_regs *old_regs = set_irq_regs(NULL);
-
-			ia64_setreg(_IA64_REG_CR_TPR, vector);
-			ia64_srlz_d();
-
-			/*
-			 * Now try calling normal ia64_handle_irq as it would have got called
-			 * from a real intr handler. Try passing null for pt_regs, hopefully
-			 * it will work. I hope it works!.
-			 * Probably could shared code.
-			 */
-			if (unlikely(irq < 0)) {
-				printk(KERN_ERR "%s: Unexpected interrupt "
-				       "vector %d on CPU %d not being mapped "
-				       "to any IRQ!!\n", __func__, vector,
-				       smp_processor_id());
-			} else {
-				vectors_in_migration[irq]=0;
-				generic_handle_irq(irq);
-			}
-			set_irq_regs(old_regs);
-
-			/*
-			 * Disable interrupts and send EOI
-			 */
-			local_irq_disable();
-			ia64_setreg(_IA64_REG_CR_TPR, saved_tpr);
-		}
-		ia64_eoi();
-		vector = ia64_get_ivr();
-	}
-	irq_exit();
-}
-#endif
-
-
-#ifdef CONFIG_SMP
-
-static irqreturn_t dummy_handler (int irq, void *dev_id)
-{
-	BUG();
-	return IRQ_NONE;
-}
-
-/*
- * KVM uses this interrupt to force a cpu out of guest mode
- */
-
-#endif
-
-void
-register_percpu_irq(ia64_vector vec, irq_handler_t handler, unsigned long flags,
-		    const char *name)
-{
-	unsigned int irq;
-
-	irq = vec;
-	BUG_ON(bind_irq_vector(irq, vec, CPU_MASK_ALL));
-	irq_set_status_flags(irq, IRQ_PER_CPU);
-	irq_set_chip(irq, &irq_type_ia64_lsapic);
-	if (handler)
-		if (request_irq(irq, handler, flags, name, NULL))
-			pr_err("Failed to request irq %u (%s)\n", irq, name);
-	irq_set_handler(irq, handle_percpu_irq);
-}
-
-void __init
-ia64_native_register_ipi(void)
-{
-#ifdef CONFIG_SMP
-	register_percpu_irq(IA64_IPI_VECTOR, handle_IPI, 0, "IPI");
-	register_percpu_irq(IA64_IPI_RESCHEDULE, dummy_handler, 0, "resched");
-	register_percpu_irq(IA64_IPI_LOCAL_TLB_FLUSH, dummy_handler, 0,
-			    "tlb_flush");
-#endif
-}
-
-void __init
-init_IRQ (void)
-{
-	acpi_boot_init();
-	ia64_register_ipi();
-	register_percpu_irq(IA64_SPURIOUS_INT_VECTOR, NULL, 0, NULL);
-#ifdef CONFIG_SMP
-	if (vector_domain_type != VECTOR_DOMAIN_NONE) {
-		register_percpu_irq(IA64_IRQ_MOVE_VECTOR,
-				    smp_irq_move_cleanup_interrupt, 0,
-				    "irq_move");
-	}
-#endif
-}
-
-void
-ia64_send_ipi (int cpu, int vector, int delivery_mode, int redirect)
-{
-	void __iomem *ipi_addr;
-	unsigned long ipi_data;
-	unsigned long phys_cpu_id;
-
-	phys_cpu_id = cpu_physical_id(cpu);
-
-	/*
-	 * cpu number is in 8bit ID and 8bit EID
-	 */
-
-	ipi_data = (delivery_mode << 8) | (vector & 0xff);
-	ipi_addr = ipi_base_addr + ((phys_cpu_id << 4) | ((redirect & 1) << 3));
-
-	writeq(ipi_data, ipi_addr);
-}
diff --git a/arch/ia64/kernel/irq_lsapic.c b/arch/ia64/kernel/irq_lsapic.c
deleted file mode 100644
index 23bf4499a75d..000000000000
--- a/arch/ia64/kernel/irq_lsapic.c
+++ /dev/null
@@ -1,45 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * LSAPIC Interrupt Controller
- *
- * This takes care of interrupts that are generated by the CPU's
- * internal Streamlined Advanced Programmable Interrupt Controller
- * (LSAPIC), such as the ITC and IPI interrupts.
-    *
- * Copyright (C) 1999 VA Linux Systems
- * Copyright (C) 1999 Walt Drummond <drummond@valinux.com>
- * Copyright (C) 2000 Hewlett-Packard Co
- * Copyright (C) 2000 David Mosberger-Tang <davidm@hpl.hp.com>
- */
-
-#include <linux/sched.h>
-#include <linux/irq.h>
-
-static unsigned int
-lsapic_noop_startup (struct irq_data *data)
-{
-	return 0;
-}
-
-static void
-lsapic_noop (struct irq_data *data)
-{
-	/* nothing to do... */
-}
-
-static int lsapic_retrigger(struct irq_data *data)
-{
-	ia64_resend_irq(data->irq);
-
-	return 1;
-}
-
-struct irq_chip irq_type_ia64_lsapic = {
-	.name =			"LSAPIC",
-	.irq_startup =		lsapic_noop_startup,
-	.irq_shutdown =		lsapic_noop,
-	.irq_enable =		lsapic_noop,
-	.irq_disable =		lsapic_noop,
-	.irq_ack =		lsapic_noop,
-	.irq_retrigger =	lsapic_retrigger,
-};
diff --git a/arch/ia64/kernel/ivt.S b/arch/ia64/kernel/ivt.S
deleted file mode 100644
index da90c49df628..000000000000
--- a/arch/ia64/kernel/ivt.S
+++ /dev/null
@@ -1,1688 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-/*
- * arch/ia64/kernel/ivt.S
- *
- * Copyright (C) 1998-2001, 2003, 2005 Hewlett-Packard Co
- *	Stephane Eranian <eranian@hpl.hp.com>
- *	David Mosberger <davidm@hpl.hp.com>
- * Copyright (C) 2000, 2002-2003 Intel Co
- *	Asit Mallick <asit.k.mallick@intel.com>
- *      Suresh Siddha <suresh.b.siddha@intel.com>
- *      Kenneth Chen <kenneth.w.chen@intel.com>
- *      Fenghua Yu <fenghua.yu@intel.com>
- *
- * 00/08/23 Asit Mallick <asit.k.mallick@intel.com> TLB handling for SMP
- * 00/12/20 David Mosberger-Tang <davidm@hpl.hp.com> DTLB/ITLB handler now uses virtual PT.
- *
- * Copyright (C) 2005 Hewlett-Packard Co
- *	Dan Magenheimer <dan.magenheimer@hp.com>
- *      Xen paravirtualization
- * Copyright (c) 2008 Isaku Yamahata <yamahata at valinux co jp>
- *                    VA Linux Systems Japan K.K.
- *                    pv_ops.
- *      Yaozu (Eddie) Dong <eddie.dong@intel.com>
- */
-/*
- * This file defines the interruption vector table used by the CPU.
- * It does not include one entry per possible cause of interruption.
- *
- * The first 20 entries of the table contain 64 bundles each while the
- * remaining 48 entries contain only 16 bundles each.
- *
- * The 64 bundles are used to allow inlining the whole handler for critical
- * interruptions like TLB misses.
- *
- *  For each entry, the comment is as follows:
- *
- *		// 0x1c00 Entry 7 (size 64 bundles) Data Key Miss (12,51)
- *  entry offset ----/     /         /                  /          /
- *  entry number ---------/         /                  /          /
- *  size of the entry -------------/                  /          /
- *  vector name -------------------------------------/          /
- *  interruptions triggering this vector ----------------------/
- *
- * The table is 32KB in size and must be aligned on 32KB boundary.
- * (The CPU ignores the 15 lower bits of the address)
- *
- * Table is based upon EAS2.6 (Oct 1999)
- */
-
-#include <linux/export.h>
-#include <linux/pgtable.h>
-#include <asm/asmmacro.h>
-#include <asm/break.h>
-#include <asm/kregs.h>
-#include <asm/asm-offsets.h>
-#include <asm/processor.h>
-#include <asm/ptrace.h>
-#include <asm/thread_info.h>
-#include <asm/unistd.h>
-#include <asm/errno.h>
-
-#if 0
-# define PSR_DEFAULT_BITS	psr.ac
-#else
-# define PSR_DEFAULT_BITS	0
-#endif
-
-#if 0
-  /*
-   * This lets you track the last eight faults that occurred on the CPU.  Make sure ar.k2 isn't
-   * needed for something else before enabling this...
-   */
-# define DBG_FAULT(i)	mov r16=ar.k2;;	shl r16=r16,8;;	add r16=(i),r16;;mov ar.k2=r16
-#else
-# define DBG_FAULT(i)
-#endif
-
-#include "minstate.h"
-
-#define FAULT(n)									\
-	mov r31=pr;									\
-	mov r19=n;;			/* prepare to save predicates */		\
-	br.sptk.many dispatch_to_fault_handler
-
-	.section .text..ivt,"ax"
-
-	.align 32768	// align on 32KB boundary
-	.global ia64_ivt
-	EXPORT_SYMBOL(ia64_ivt)
-ia64_ivt:
-/////////////////////////////////////////////////////////////////////////////////////////
-// 0x0000 Entry 0 (size 64 bundles) VHPT Translation (8,20,47)
-ENTRY(vhpt_miss)
-	DBG_FAULT(0)
-	/*
-	 * The VHPT vector is invoked when the TLB entry for the virtual page table
-	 * is missing.  This happens only as a result of a previous
-	 * (the "original") TLB miss, which may either be caused by an instruction
-	 * fetch or a data access (or non-access).
-	 *
-	 * What we do here is normal TLB miss handing for the _original_ miss,
-	 * followed by inserting the TLB entry for the virtual page table page
-	 * that the VHPT walker was attempting to access.  The latter gets
-	 * inserted as long as page table entry above pte level have valid
-	 * mappings for the faulting address.  The TLB entry for the original
-	 * miss gets inserted only if the pte entry indicates that the page is
-	 * present.
-	 *
-	 * do_page_fault gets invoked in the following cases:
-	 *	- the faulting virtual address uses unimplemented address bits
-	 *	- the faulting virtual address has no valid page table mapping
-	 */
-	MOV_FROM_IFA(r16)			// get address that caused the TLB miss
-#ifdef CONFIG_HUGETLB_PAGE
-	movl r18=PAGE_SHIFT
-	MOV_FROM_ITIR(r25)
-#endif
-	;;
-	RSM_PSR_DT				// use physical addressing for data
-	mov r31=pr				// save the predicate registers
-	mov r19=IA64_KR(PT_BASE)		// get page table base address
-	shl r21=r16,3				// shift bit 60 into sign bit
-	shr.u r17=r16,61			// get the region number into r17
-	;;
-	shr.u r22=r21,3
-#ifdef CONFIG_HUGETLB_PAGE
-	extr.u r26=r25,2,6
-	;;
-	cmp.ne p8,p0=r18,r26
-	sub r27=r26,r18
-	;;
-(p8)	dep r25=r18,r25,2,6
-(p8)	shr r22=r22,r27
-#endif
-	;;
-	cmp.eq p6,p7=5,r17			// is IFA pointing into to region 5?
-	shr.u r18=r22,PGDIR_SHIFT		// get bottom portion of pgd index bit
-	;;
-(p7)	dep r17=r17,r19,(PAGE_SHIFT-3),3	// put region number bits in place
-
-	srlz.d
-	LOAD_PHYSICAL(p6, r19, swapper_pg_dir)	// region 5 is rooted at swapper_pg_dir
-
-	.pred.rel "mutex", p6, p7
-(p6)	shr.u r21=r21,PGDIR_SHIFT+PAGE_SHIFT
-(p7)	shr.u r21=r21,PGDIR_SHIFT+PAGE_SHIFT-3
-	;;
-(p6)	dep r17=r18,r19,3,(PAGE_SHIFT-3)	// r17=pgd_offset for region 5
-(p7)	dep r17=r18,r17,3,(PAGE_SHIFT-6)	// r17=pgd_offset for region[0-4]
-	cmp.eq p7,p6=0,r21			// unused address bits all zeroes?
-#if CONFIG_PGTABLE_LEVELS == 4
-	shr.u r28=r22,PUD_SHIFT			// shift pud index into position
-#else
-	shr.u r18=r22,PMD_SHIFT			// shift pmd index into position
-#endif
-	;;
-	ld8 r17=[r17]				// get *pgd (may be 0)
-	;;
-(p7)	cmp.eq p6,p7=r17,r0			// was pgd_present(*pgd) == NULL?
-#if CONFIG_PGTABLE_LEVELS == 4
-	dep r28=r28,r17,3,(PAGE_SHIFT-3)	// r28=pud_offset(pgd,addr)
-	;;
-	shr.u r18=r22,PMD_SHIFT			// shift pmd index into position
-(p7)	ld8 r29=[r28]				// get *pud (may be 0)
-	;;
-(p7)	cmp.eq.or.andcm p6,p7=r29,r0		// was pud_present(*pud) == NULL?
-	dep r17=r18,r29,3,(PAGE_SHIFT-3)	// r17=pmd_offset(pud,addr)
-#else
-	dep r17=r18,r17,3,(PAGE_SHIFT-3)	// r17=pmd_offset(pgd,addr)
-#endif
-	;;
-(p7)	ld8 r20=[r17]				// get *pmd (may be 0)
-	shr.u r19=r22,PAGE_SHIFT		// shift pte index into position
-	;;
-(p7)	cmp.eq.or.andcm p6,p7=r20,r0		// was pmd_present(*pmd) == NULL?
-	dep r21=r19,r20,3,(PAGE_SHIFT-3)	// r21=pte_offset(pmd,addr)
-	;;
-(p7)	ld8 r18=[r21]				// read *pte
-	MOV_FROM_ISR(r19)			// cr.isr bit 32 tells us if this is an insn miss
-	;;
-(p7)	tbit.z p6,p7=r18,_PAGE_P_BIT		// page present bit cleared?
-	MOV_FROM_IHA(r22)			// get the VHPT address that caused the TLB miss
-	;;					// avoid RAW on p7
-(p7)	tbit.nz.unc p10,p11=r19,32		// is it an instruction TLB miss?
-	dep r23=0,r20,0,PAGE_SHIFT		// clear low bits to get page address
-	;;
-	ITC_I_AND_D(p10, p11, r18, r24)		// insert the instruction TLB entry and
-						// insert the data TLB entry
-(p6)	br.cond.spnt.many page_fault		// handle bad address/page not present (page fault)
-	MOV_TO_IFA(r22, r24)
-
-#ifdef CONFIG_HUGETLB_PAGE
-	MOV_TO_ITIR(p8, r25, r24)		// change to default page-size for VHPT
-#endif
-
-	/*
-	 * Now compute and insert the TLB entry for the virtual page table.  We never
-	 * execute in a page table page so there is no need to set the exception deferral
-	 * bit.
-	 */
-	adds r24=__DIRTY_BITS_NO_ED|_PAGE_PL_0|_PAGE_AR_RW,r23
-	;;
-	ITC_D(p7, r24, r25)
-	;;
-#ifdef CONFIG_SMP
-	/*
-	 * Tell the assemblers dependency-violation checker that the above "itc" instructions
-	 * cannot possibly affect the following loads:
-	 */
-	dv_serialize_data
-
-	/*
-	 * Re-check pagetable entry.  If they changed, we may have received a ptc.g
-	 * between reading the pagetable and the "itc".  If so, flush the entry we
-	 * inserted and retry.  At this point, we have:
-	 *
-	 * r28 = equivalent of pud_offset(pgd, ifa)
-	 * r17 = equivalent of pmd_offset(pud, ifa)
-	 * r21 = equivalent of pte_offset(pmd, ifa)
-	 *
-	 * r29 = *pud
-	 * r20 = *pmd
-	 * r18 = *pte
-	 */
-	ld8 r25=[r21]				// read *pte again
-	ld8 r26=[r17]				// read *pmd again
-#if CONFIG_PGTABLE_LEVELS == 4
-	ld8 r19=[r28]				// read *pud again
-#endif
-	cmp.ne p6,p7=r0,r0
-	;;
-	cmp.ne.or.andcm p6,p7=r26,r20		// did *pmd change
-#if CONFIG_PGTABLE_LEVELS == 4
-	cmp.ne.or.andcm p6,p7=r19,r29		// did *pud change
-#endif
-	mov r27=PAGE_SHIFT<<2
-	;;
-(p6)	ptc.l r22,r27				// purge PTE page translation
-(p7)	cmp.ne.or.andcm p6,p7=r25,r18		// did *pte change
-	;;
-(p6)	ptc.l r16,r27				// purge translation
-#endif
-
-	mov pr=r31,-1				// restore predicate registers
-	RFI
-END(vhpt_miss)
-
-	.org ia64_ivt+0x400
-/////////////////////////////////////////////////////////////////////////////////////////
-// 0x0400 Entry 1 (size 64 bundles) ITLB (21)
-ENTRY(itlb_miss)
-	DBG_FAULT(1)
-	/*
-	 * The ITLB handler accesses the PTE via the virtually mapped linear
-	 * page table.  If a nested TLB miss occurs, we switch into physical
-	 * mode, walk the page table, and then re-execute the PTE read and
-	 * go on normally after that.
-	 */
-	MOV_FROM_IFA(r16)			// get virtual address
-	mov r29=b0				// save b0
-	mov r31=pr				// save predicates
-.itlb_fault:
-	MOV_FROM_IHA(r17)			// get virtual address of PTE
-	movl r30=1f				// load nested fault continuation point
-	;;
-1:	ld8 r18=[r17]				// read *pte
-	;;
-	mov b0=r29
-	tbit.z p6,p0=r18,_PAGE_P_BIT		// page present bit cleared?
-(p6)	br.cond.spnt page_fault
-	;;
-	ITC_I(p0, r18, r19)
-	;;
-#ifdef CONFIG_SMP
-	/*
-	 * Tell the assemblers dependency-violation checker that the above "itc" instructions
-	 * cannot possibly affect the following loads:
-	 */
-	dv_serialize_data
-
-	ld8 r19=[r17]				// read *pte again and see if same
-	mov r20=PAGE_SHIFT<<2			// setup page size for purge
-	;;
-	cmp.ne p7,p0=r18,r19
-	;;
-(p7)	ptc.l r16,r20
-#endif
-	mov pr=r31,-1
-	RFI
-END(itlb_miss)
-
-	.org ia64_ivt+0x0800
-/////////////////////////////////////////////////////////////////////////////////////////
-// 0x0800 Entry 2 (size 64 bundles) DTLB (9,48)
-ENTRY(dtlb_miss)
-	DBG_FAULT(2)
-	/*
-	 * The DTLB handler accesses the PTE via the virtually mapped linear
-	 * page table.  If a nested TLB miss occurs, we switch into physical
-	 * mode, walk the page table, and then re-execute the PTE read and
-	 * go on normally after that.
-	 */
-	MOV_FROM_IFA(r16)			// get virtual address
-	mov r29=b0				// save b0
-	mov r31=pr				// save predicates
-dtlb_fault:
-	MOV_FROM_IHA(r17)			// get virtual address of PTE
-	movl r30=1f				// load nested fault continuation point
-	;;
-1:	ld8 r18=[r17]				// read *pte
-	;;
-	mov b0=r29
-	tbit.z p6,p0=r18,_PAGE_P_BIT		// page present bit cleared?
-(p6)	br.cond.spnt page_fault
-	;;
-	ITC_D(p0, r18, r19)
-	;;
-#ifdef CONFIG_SMP
-	/*
-	 * Tell the assemblers dependency-violation checker that the above "itc" instructions
-	 * cannot possibly affect the following loads:
-	 */
-	dv_serialize_data
-
-	ld8 r19=[r17]				// read *pte again and see if same
-	mov r20=PAGE_SHIFT<<2			// setup page size for purge
-	;;
-	cmp.ne p7,p0=r18,r19
-	;;
-(p7)	ptc.l r16,r20
-#endif
-	mov pr=r31,-1
-	RFI
-END(dtlb_miss)
-
-	.org ia64_ivt+0x0c00
-/////////////////////////////////////////////////////////////////////////////////////////
-// 0x0c00 Entry 3 (size 64 bundles) Alt ITLB (19)
-ENTRY(alt_itlb_miss)
-	DBG_FAULT(3)
-	MOV_FROM_IFA(r16)	// get address that caused the TLB miss
-	movl r17=PAGE_KERNEL
-	MOV_FROM_IPSR(p0, r21)
-	movl r19=(((1 << IA64_MAX_PHYS_BITS) - 1) & ~0xfff)
-	mov r31=pr
-	;;
-#ifdef CONFIG_DISABLE_VHPT
-	shr.u r22=r16,61			// get the region number into r21
-	;;
-	cmp.gt p8,p0=6,r22			// user mode
-	;;
-	THASH(p8, r17, r16, r23)
-	;;
-	MOV_TO_IHA(p8, r17, r23)
-(p8)	mov r29=b0				// save b0
-(p8)	br.cond.dptk .itlb_fault
-#endif
-	extr.u r23=r21,IA64_PSR_CPL0_BIT,2	// extract psr.cpl
-	and r19=r19,r16		// clear ed, reserved bits, and PTE control bits
-	shr.u r18=r16,57	// move address bit 61 to bit 4
-	;;
-	andcm r18=0x10,r18	// bit 4=~address-bit(61)
-	cmp.ne p8,p0=r0,r23	// psr.cpl != 0?
-	or r19=r17,r19		// insert PTE control bits into r19
-	;;
-	or r19=r19,r18		// set bit 4 (uncached) if the access was to region 6
-(p8)	br.cond.spnt page_fault
-	;;
-	ITC_I(p0, r19, r18)	// insert the TLB entry
-	mov pr=r31,-1
-	RFI
-END(alt_itlb_miss)
-
-	.org ia64_ivt+0x1000
-/////////////////////////////////////////////////////////////////////////////////////////
-// 0x1000 Entry 4 (size 64 bundles) Alt DTLB (7,46)
-ENTRY(alt_dtlb_miss)
-	DBG_FAULT(4)
-	MOV_FROM_IFA(r16)	// get address that caused the TLB miss
-	movl r17=PAGE_KERNEL
-	MOV_FROM_ISR(r20)
-	movl r19=(((1 << IA64_MAX_PHYS_BITS) - 1) & ~0xfff)
-	MOV_FROM_IPSR(p0, r21)
-	mov r31=pr
-	mov r24=PERCPU_ADDR
-	;;
-#ifdef CONFIG_DISABLE_VHPT
-	shr.u r22=r16,61			// get the region number into r21
-	;;
-	cmp.gt p8,p0=6,r22			// access to region 0-5
-	;;
-	THASH(p8, r17, r16, r25)
-	;;
-	MOV_TO_IHA(p8, r17, r25)
-(p8)	mov r29=b0				// save b0
-(p8)	br.cond.dptk dtlb_fault
-#endif
-	cmp.ge p10,p11=r16,r24			// access to per_cpu_data?
-	tbit.z p12,p0=r16,61			// access to region 6?
-	mov r25=PERCPU_PAGE_SHIFT << 2
-	mov r26=PERCPU_PAGE_SIZE
-	nop.m 0
-	nop.b 0
-	;;
-(p10)	mov r19=IA64_KR(PER_CPU_DATA)
-(p11)	and r19=r19,r16				// clear non-ppn fields
-	extr.u r23=r21,IA64_PSR_CPL0_BIT,2	// extract psr.cpl
-	and r22=IA64_ISR_CODE_MASK,r20		// get the isr.code field
-	tbit.nz p6,p7=r20,IA64_ISR_SP_BIT	// is speculation bit on?
-	tbit.nz p9,p0=r20,IA64_ISR_NA_BIT	// is non-access bit on?
-	;;
-(p10)	sub r19=r19,r26
-	MOV_TO_ITIR(p10, r25, r24)
-	cmp.ne p8,p0=r0,r23
-(p9)	cmp.eq.or.andcm p6,p7=IA64_ISR_CODE_LFETCH,r22	// check isr.code field
-(p12)	dep r17=-1,r17,4,1			// set ma=UC for region 6 addr
-(p8)	br.cond.spnt page_fault
-
-	dep r21=-1,r21,IA64_PSR_ED_BIT,1
-	;;
-	or r19=r19,r17		// insert PTE control bits into r19
-	MOV_TO_IPSR(p6, r21, r24)
-	;;
-	ITC_D(p7, r19, r18)	// insert the TLB entry
-	mov pr=r31,-1
-	RFI
-END(alt_dtlb_miss)
-
-	.org ia64_ivt+0x1400
-/////////////////////////////////////////////////////////////////////////////////////////
-// 0x1400 Entry 5 (size 64 bundles) Data nested TLB (6,45)
-ENTRY(nested_dtlb_miss)
-	/*
-	 * In the absence of kernel bugs, we get here when the virtually mapped linear
-	 * page table is accessed non-speculatively (e.g., in the Dirty-bit, Instruction
-	 * Access-bit, or Data Access-bit faults).  If the DTLB entry for the virtual page
-	 * table is missing, a nested TLB miss fault is triggered and control is
-	 * transferred to this point.  When this happens, we lookup the pte for the
-	 * faulting address by walking the page table in physical mode and return to the
-	 * continuation point passed in register r30 (or call page_fault if the address is
-	 * not mapped).
-	 *
-	 * Input:	r16:	faulting address
-	 *		r29:	saved b0
-	 *		r30:	continuation address
-	 *		r31:	saved pr
-	 *
-	 * Output:	r17:	physical address of PTE of faulting address
-	 *		r29:	saved b0
-	 *		r30:	continuation address
-	 *		r31:	saved pr
-	 *
-	 * Clobbered:	b0, r18, r19, r21, r22, psr.dt (cleared)
-	 */
-	RSM_PSR_DT				// switch to using physical data addressing
-	mov r19=IA64_KR(PT_BASE)		// get the page table base address
-	shl r21=r16,3				// shift bit 60 into sign bit
-	MOV_FROM_ITIR(r18)
-	;;
-	shr.u r17=r16,61			// get the region number into r17
-	extr.u r18=r18,2,6			// get the faulting page size
-	;;
-	cmp.eq p6,p7=5,r17			// is faulting address in region 5?
-	add r22=-PAGE_SHIFT,r18			// adjustment for hugetlb address
-	add r18=PGDIR_SHIFT-PAGE_SHIFT,r18
-	;;
-	shr.u r22=r16,r22
-	shr.u r18=r16,r18
-(p7)	dep r17=r17,r19,(PAGE_SHIFT-3),3	// put region number bits in place
-
-	srlz.d
-	LOAD_PHYSICAL(p6, r19, swapper_pg_dir)	// region 5 is rooted at swapper_pg_dir
-
-	.pred.rel "mutex", p6, p7
-(p6)	shr.u r21=r21,PGDIR_SHIFT+PAGE_SHIFT
-(p7)	shr.u r21=r21,PGDIR_SHIFT+PAGE_SHIFT-3
-	;;
-(p6)	dep r17=r18,r19,3,(PAGE_SHIFT-3)	// r17=pgd_offset for region 5
-(p7)	dep r17=r18,r17,3,(PAGE_SHIFT-6)	// r17=pgd_offset for region[0-4]
-	cmp.eq p7,p6=0,r21			// unused address bits all zeroes?
-#if CONFIG_PGTABLE_LEVELS == 4
-	shr.u r18=r22,PUD_SHIFT			// shift pud index into position
-#else
-	shr.u r18=r22,PMD_SHIFT			// shift pmd index into position
-#endif
-	;;
-	ld8 r17=[r17]				// get *pgd (may be 0)
-	;;
-(p7)	cmp.eq p6,p7=r17,r0			// was pgd_present(*pgd) == NULL?
-	dep r17=r18,r17,3,(PAGE_SHIFT-3)	// r17=p[u|m]d_offset(pgd,addr)
-	;;
-#if CONFIG_PGTABLE_LEVELS == 4
-(p7)	ld8 r17=[r17]				// get *pud (may be 0)
-	shr.u r18=r22,PMD_SHIFT			// shift pmd index into position
-	;;
-(p7)	cmp.eq.or.andcm p6,p7=r17,r0		// was pud_present(*pud) == NULL?
-	dep r17=r18,r17,3,(PAGE_SHIFT-3)	// r17=pmd_offset(pud,addr)
-	;;
-#endif
-(p7)	ld8 r17=[r17]				// get *pmd (may be 0)
-	shr.u r19=r22,PAGE_SHIFT		// shift pte index into position
-	;;
-(p7)	cmp.eq.or.andcm p6,p7=r17,r0		// was pmd_present(*pmd) == NULL?
-	dep r17=r19,r17,3,(PAGE_SHIFT-3)	// r17=pte_offset(pmd,addr);
-(p6)	br.cond.spnt page_fault
-	mov b0=r30
-	br.sptk.many b0				// return to continuation point
-END(nested_dtlb_miss)
-
-	.org ia64_ivt+0x1800
-/////////////////////////////////////////////////////////////////////////////////////////
-// 0x1800 Entry 6 (size 64 bundles) Instruction Key Miss (24)
-ENTRY(ikey_miss)
-	DBG_FAULT(6)
-	FAULT(6)
-END(ikey_miss)
-
-	.org ia64_ivt+0x1c00
-/////////////////////////////////////////////////////////////////////////////////////////
-// 0x1c00 Entry 7 (size 64 bundles) Data Key Miss (12,51)
-ENTRY(dkey_miss)
-	DBG_FAULT(7)
-	FAULT(7)
-END(dkey_miss)
-
-	.org ia64_ivt+0x2000
-/////////////////////////////////////////////////////////////////////////////////////////
-// 0x2000 Entry 8 (size 64 bundles) Dirty-bit (54)
-ENTRY(dirty_bit)
-	DBG_FAULT(8)
-	/*
-	 * What we do here is to simply turn on the dirty bit in the PTE.  We need to
-	 * update both the page-table and the TLB entry.  To efficiently access the PTE,
-	 * we address it through the virtual page table.  Most likely, the TLB entry for
-	 * the relevant virtual page table page is still present in the TLB so we can
-	 * normally do this without additional TLB misses.  In case the necessary virtual
-	 * page table TLB entry isn't present, we take a nested TLB miss hit where we look
-	 * up the physical address of the L3 PTE and then continue at label 1 below.
-	 */
-	MOV_FROM_IFA(r16)			// get the address that caused the fault
-	movl r30=1f				// load continuation point in case of nested fault
-	;;
-	THASH(p0, r17, r16, r18)		// compute virtual address of L3 PTE
-	mov r29=b0				// save b0 in case of nested fault
-	mov r31=pr				// save pr
-#ifdef CONFIG_SMP
-	mov r28=ar.ccv				// save ar.ccv
-	;;
-1:	ld8 r18=[r17]
-	;;					// avoid RAW on r18
-	mov ar.ccv=r18				// set compare value for cmpxchg
-	or r25=_PAGE_D|_PAGE_A,r18		// set the dirty and accessed bits
-	tbit.z p7,p6 = r18,_PAGE_P_BIT		// Check present bit
-	;;
-(p6)	cmpxchg8.acq r26=[r17],r25,ar.ccv	// Only update if page is present
-	mov r24=PAGE_SHIFT<<2
-	;;
-(p6)	cmp.eq p6,p7=r26,r18			// Only compare if page is present
-	;;
-	ITC_D(p6, r25, r18)			// install updated PTE
-	;;
-	/*
-	 * Tell the assemblers dependency-violation checker that the above "itc" instructions
-	 * cannot possibly affect the following loads:
-	 */
-	dv_serialize_data
-
-	ld8 r18=[r17]				// read PTE again
-	;;
-	cmp.eq p6,p7=r18,r25			// is it same as the newly installed
-	;;
-(p7)	ptc.l r16,r24
-	mov b0=r29				// restore b0
-	mov ar.ccv=r28
-#else
-	;;
-1:	ld8 r18=[r17]
-	;;					// avoid RAW on r18
-	or r18=_PAGE_D|_PAGE_A,r18		// set the dirty and accessed bits
-	mov b0=r29				// restore b0
-	;;
-	st8 [r17]=r18				// store back updated PTE
-	ITC_D(p0, r18, r16)			// install updated PTE
-#endif
-	mov pr=r31,-1				// restore pr
-	RFI
-END(dirty_bit)
-
-	.org ia64_ivt+0x2400
-/////////////////////////////////////////////////////////////////////////////////////////
-// 0x2400 Entry 9 (size 64 bundles) Instruction Access-bit (27)
-ENTRY(iaccess_bit)
-	DBG_FAULT(9)
-	// Like Entry 8, except for instruction access
-	MOV_FROM_IFA(r16)			// get the address that caused the fault
-	movl r30=1f				// load continuation point in case of nested fault
-	mov r31=pr				// save predicates
-#ifdef CONFIG_ITANIUM
-	/*
-	 * Erratum 10 (IFA may contain incorrect address) has "NoFix" status.
-	 */
-	MOV_FROM_IPSR(p0, r17)
-	;;
-	MOV_FROM_IIP(r18)
-	tbit.z p6,p0=r17,IA64_PSR_IS_BIT	// IA64 instruction set?
-	;;
-(p6)	mov r16=r18				// if so, use cr.iip instead of cr.ifa
-#endif /* CONFIG_ITANIUM */
-	;;
-	THASH(p0, r17, r16, r18)		// compute virtual address of L3 PTE
-	mov r29=b0				// save b0 in case of nested fault)
-#ifdef CONFIG_SMP
-	mov r28=ar.ccv				// save ar.ccv
-	;;
-1:	ld8 r18=[r17]
-	;;
-	mov ar.ccv=r18				// set compare value for cmpxchg
-	or r25=_PAGE_A,r18			// set the accessed bit
-	tbit.z p7,p6 = r18,_PAGE_P_BIT	 	// Check present bit
-	;;
-(p6)	cmpxchg8.acq r26=[r17],r25,ar.ccv	// Only if page present
-	mov r24=PAGE_SHIFT<<2
-	;;
-(p6)	cmp.eq p6,p7=r26,r18			// Only if page present
-	;;
-	ITC_I(p6, r25, r26)			// install updated PTE
-	;;
-	/*
-	 * Tell the assemblers dependency-violation checker that the above "itc" instructions
-	 * cannot possibly affect the following loads:
-	 */
-	dv_serialize_data
-
-	ld8 r18=[r17]				// read PTE again
-	;;
-	cmp.eq p6,p7=r18,r25			// is it same as the newly installed
-	;;
-(p7)	ptc.l r16,r24
-	mov b0=r29				// restore b0
-	mov ar.ccv=r28
-#else /* !CONFIG_SMP */
-	;;
-1:	ld8 r18=[r17]
-	;;
-	or r18=_PAGE_A,r18			// set the accessed bit
-	mov b0=r29				// restore b0
-	;;
-	st8 [r17]=r18				// store back updated PTE
-	ITC_I(p0, r18, r16)			// install updated PTE
-#endif /* !CONFIG_SMP */
-	mov pr=r31,-1
-	RFI
-END(iaccess_bit)
-
-	.org ia64_ivt+0x2800
-/////////////////////////////////////////////////////////////////////////////////////////
-// 0x2800 Entry 10 (size 64 bundles) Data Access-bit (15,55)
-ENTRY(daccess_bit)
-	DBG_FAULT(10)
-	// Like Entry 8, except for data access
-	MOV_FROM_IFA(r16)			// get the address that caused the fault
-	movl r30=1f				// load continuation point in case of nested fault
-	;;
-	THASH(p0, r17, r16, r18)		// compute virtual address of L3 PTE
-	mov r31=pr
-	mov r29=b0				// save b0 in case of nested fault)
-#ifdef CONFIG_SMP
-	mov r28=ar.ccv				// save ar.ccv
-	;;
-1:	ld8 r18=[r17]
-	;;					// avoid RAW on r18
-	mov ar.ccv=r18				// set compare value for cmpxchg
-	or r25=_PAGE_A,r18			// set the dirty bit
-	tbit.z p7,p6 = r18,_PAGE_P_BIT		// Check present bit
-	;;
-(p6)	cmpxchg8.acq r26=[r17],r25,ar.ccv	// Only if page is present
-	mov r24=PAGE_SHIFT<<2
-	;;
-(p6)	cmp.eq p6,p7=r26,r18			// Only if page is present
-	;;
-	ITC_D(p6, r25, r26)			// install updated PTE
-	/*
-	 * Tell the assemblers dependency-violation checker that the above "itc" instructions
-	 * cannot possibly affect the following loads:
-	 */
-	dv_serialize_data
-	;;
-	ld8 r18=[r17]				// read PTE again
-	;;
-	cmp.eq p6,p7=r18,r25			// is it same as the newly installed
-	;;
-(p7)	ptc.l r16,r24
-	mov ar.ccv=r28
-#else
-	;;
-1:	ld8 r18=[r17]
-	;;					// avoid RAW on r18
-	or r18=_PAGE_A,r18			// set the accessed bit
-	;;
-	st8 [r17]=r18				// store back updated PTE
-	ITC_D(p0, r18, r16)			// install updated PTE
-#endif
-	mov b0=r29				// restore b0
-	mov pr=r31,-1
-	RFI
-END(daccess_bit)
-
-	.org ia64_ivt+0x2c00
-/////////////////////////////////////////////////////////////////////////////////////////
-// 0x2c00 Entry 11 (size 64 bundles) Break instruction (33)
-ENTRY(break_fault)
-	/*
-	 * The streamlined system call entry/exit paths only save/restore the initial part
-	 * of pt_regs.  This implies that the callers of system-calls must adhere to the
-	 * normal procedure calling conventions.
-	 *
-	 *   Registers to be saved & restored:
-	 *	CR registers: cr.ipsr, cr.iip, cr.ifs
-	 *	AR registers: ar.unat, ar.pfs, ar.rsc, ar.rnat, ar.bspstore, ar.fpsr
-	 * 	others: pr, b0, b6, loadrs, r1, r11, r12, r13, r15
-	 *   Registers to be restored only:
-	 * 	r8-r11: output value from the system call.
-	 *
-	 * During system call exit, scratch registers (including r15) are modified/cleared
-	 * to prevent leaking bits from kernel to user level.
-	 */
-	DBG_FAULT(11)
-	mov.m r16=IA64_KR(CURRENT)		// M2 r16 <- current task (12 cyc)
-	MOV_FROM_IPSR(p0, r29)			// M2 (12 cyc)
-	mov r31=pr				// I0 (2 cyc)
-
-	MOV_FROM_IIM(r17)			// M2 (2 cyc)
-	mov.m r27=ar.rsc			// M2 (12 cyc)
-	mov r18=__IA64_BREAK_SYSCALL		// A
-
-	mov.m ar.rsc=0				// M2
-	mov.m r21=ar.fpsr			// M2 (12 cyc)
-	mov r19=b6				// I0 (2 cyc)
-	;;
-	mov.m r23=ar.bspstore			// M2 (12 cyc)
-	mov.m r24=ar.rnat			// M2 (5 cyc)
-	mov.i r26=ar.pfs			// I0 (2 cyc)
-
-	invala					// M0|1
-	nop.m 0					// M
-	mov r20=r1				// A			save r1
-
-	nop.m 0
-	movl r30=sys_call_table			// X
-
-	MOV_FROM_IIP(r28)			// M2 (2 cyc)
-	cmp.eq p0,p7=r18,r17			// I0 is this a system call?
-(p7)	br.cond.spnt non_syscall		// B  no ->
-	//
-	// From this point on, we are definitely on the syscall-path
-	// and we can use (non-banked) scratch registers.
-	//
-///////////////////////////////////////////////////////////////////////
-	mov r1=r16				// A    move task-pointer to "addl"-addressable reg
-	mov r2=r16				// A    setup r2 for ia64_syscall_setup
-	add r9=TI_FLAGS+IA64_TASK_SIZE,r16	// A	r9 = &current_thread_info()->flags
-
-	adds r16=IA64_TASK_THREAD_ON_USTACK_OFFSET,r16
-	adds r15=-1024,r15			// A    subtract 1024 from syscall number
-	mov r3=NR_syscalls - 1
-	;;
-	ld1.bias r17=[r16]			// M0|1 r17 = current->thread.on_ustack flag
-	ld4 r9=[r9]				// M0|1 r9 = current_thread_info()->flags
-	extr.u r8=r29,41,2			// I0   extract ei field from cr.ipsr
-
-	shladd r30=r15,3,r30			// A    r30 = sys_call_table + 8*(syscall-1024)
-	addl r22=IA64_RBS_OFFSET,r1		// A    compute base of RBS
-	cmp.leu p6,p7=r15,r3			// A    syscall number in range?
-	;;
-
-	lfetch.fault.excl.nt1 [r22]		// M0|1 prefetch RBS
-(p6)	ld8 r30=[r30]				// M0|1 load address of syscall entry point
-	tnat.nz.or p7,p0=r15			// I0	is syscall nr a NaT?
-
-	mov.m ar.bspstore=r22			// M2   switch to kernel RBS
-	cmp.eq p8,p9=2,r8			// A    isr.ei==2?
-	;;
-
-(p8)	mov r8=0				// A    clear ei to 0
-(p7)	movl r30=sys_ni_syscall			// X
-
-(p8)	adds r28=16,r28				// A    switch cr.iip to next bundle
-(p9)	adds r8=1,r8				// A    increment ei to next slot
-#ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
-	;;
-	mov b6=r30				// I0   setup syscall handler branch reg early
-#else
-	nop.i 0
-	;;
-#endif
-
-	mov.m r25=ar.unat			// M2 (5 cyc)
-	dep r29=r8,r29,41,2			// I0   insert new ei into cr.ipsr
-	adds r15=1024,r15			// A    restore original syscall number
-	//
-	// If any of the above loads miss in L1D, we'll stall here until
-	// the data arrives.
-	//
-///////////////////////////////////////////////////////////////////////
-	st1 [r16]=r0				// M2|3 clear current->thread.on_ustack flag
-#ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
-	MOV_FROM_ITC(p0, p14, r30, r18)		// M    get cycle for accounting
-#else
-	mov b6=r30				// I0   setup syscall handler branch reg early
-#endif
-	cmp.eq pKStk,pUStk=r0,r17		// A    were we on kernel stacks already?
-
-	and r9=_TIF_SYSCALL_TRACEAUDIT,r9	// A    mask trace or audit
-	mov r18=ar.bsp				// M2 (12 cyc)
-(pKStk)	br.cond.spnt .break_fixup		// B	we're already in kernel-mode -- fix up RBS
-	;;
-.back_from_break_fixup:
-(pUStk)	addl r1=IA64_STK_OFFSET-IA64_PT_REGS_SIZE,r1 // A    compute base of memory stack
-	cmp.eq p14,p0=r9,r0			// A    are syscalls being traced/audited?
-	br.call.sptk.many b7=ia64_syscall_setup	// B
-1:
-#ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
-	// mov.m r30=ar.itc is called in advance, and r13 is current
-	add r16=TI_AC_STAMP+IA64_TASK_SIZE,r13	// A
-	add r17=TI_AC_LEAVE+IA64_TASK_SIZE,r13	// A
-(pKStk)	br.cond.spnt .skip_accounting		// B	unlikely skip
-	;;
-	ld8 r18=[r16],TI_AC_STIME-TI_AC_STAMP	// M  get last stamp
-	ld8 r19=[r17],TI_AC_UTIME-TI_AC_LEAVE	// M  time at leave
-	;;
-	ld8 r20=[r16],TI_AC_STAMP-TI_AC_STIME	// M  cumulated stime
-	ld8 r21=[r17]				// M  cumulated utime
-	sub r22=r19,r18				// A  stime before leave
-	;;
-	st8 [r16]=r30,TI_AC_STIME-TI_AC_STAMP	// M  update stamp
-	sub r18=r30,r19				// A  elapsed time in user
-	;;
-	add r20=r20,r22				// A  sum stime
-	add r21=r21,r18				// A  sum utime
-	;;
-	st8 [r16]=r20				// M  update stime
-	st8 [r17]=r21				// M  update utime
-	;;
-.skip_accounting:
-#endif
-	mov ar.rsc=0x3				// M2   set eager mode, pl 0, LE, loadrs=0
-	nop 0
-	BSW_1(r2, r14)				// B (6 cyc) regs are saved, switch to bank 1
-	;;
-
-	SSM_PSR_IC_AND_DEFAULT_BITS_AND_SRLZ_I(r3, r16)	// M2	now it's safe to re-enable intr.-collection
-						// M0   ensure interruption collection is on
-	movl r3=ia64_ret_from_syscall		// X
-	;;
-	mov rp=r3				// I0   set the real return addr
-(p10)	br.cond.spnt.many ia64_ret_from_syscall	// B    return if bad call-frame or r15 is a NaT
-
-	SSM_PSR_I(p15, p15, r16)		// M2   restore psr.i
-(p14)	br.call.sptk.many b6=b6			// B    invoke syscall-handker (ignore return addr)
-	br.cond.spnt.many ia64_trace_syscall	// B	do syscall-tracing thingamagic
-	// NOT REACHED
-///////////////////////////////////////////////////////////////////////
-	// On entry, we optimistically assumed that we're coming from user-space.
-	// For the rare cases where a system-call is done from within the kernel,
-	// we fix things up at this point:
-.break_fixup:
-	add r1=-IA64_PT_REGS_SIZE,sp		// A    allocate space for pt_regs structure
-	mov ar.rnat=r24				// M2	restore kernel's AR.RNAT
-	;;
-	mov ar.bspstore=r23			// M2	restore kernel's AR.BSPSTORE
-	br.cond.sptk .back_from_break_fixup
-END(break_fault)
-
-	.org ia64_ivt+0x3000
-/////////////////////////////////////////////////////////////////////////////////////////
-// 0x3000 Entry 12 (size 64 bundles) External Interrupt (4)
-ENTRY(interrupt)
-	/* interrupt handler has become too big to fit this area. */
-	br.sptk.many __interrupt
-END(interrupt)
-
-	.org ia64_ivt+0x3400
-/////////////////////////////////////////////////////////////////////////////////////////
-// 0x3400 Entry 13 (size 64 bundles) Reserved
-	DBG_FAULT(13)
-	FAULT(13)
-
-	.org ia64_ivt+0x3800
-/////////////////////////////////////////////////////////////////////////////////////////
-// 0x3800 Entry 14 (size 64 bundles) Reserved
-	DBG_FAULT(14)
-	FAULT(14)
-
-	/*
-	 * There is no particular reason for this code to be here, other than that
-	 * there happens to be space here that would go unused otherwise.  If this
-	 * fault ever gets "unreserved", simply moved the following code to a more
-	 * suitable spot...
-	 *
-	 * ia64_syscall_setup() is a separate subroutine so that it can
-	 *	allocate stacked registers so it can safely demine any
-	 *	potential NaT values from the input registers.
-	 *
-	 * On entry:
-	 *	- executing on bank 0 or bank 1 register set (doesn't matter)
-	 *	-  r1: stack pointer
-	 *	-  r2: current task pointer
-	 *	-  r3: preserved
-	 *	- r11: original contents (saved ar.pfs to be saved)
-	 *	- r12: original contents (sp to be saved)
-	 *	- r13: original contents (tp to be saved)
-	 *	- r15: original contents (syscall # to be saved)
-	 *	- r18: saved bsp (after switching to kernel stack)
-	 *	- r19: saved b6
-	 *	- r20: saved r1 (gp)
-	 *	- r21: saved ar.fpsr
-	 *	- r22: kernel's register backing store base (krbs_base)
-	 *	- r23: saved ar.bspstore
-	 *	- r24: saved ar.rnat
-	 *	- r25: saved ar.unat
-	 *	- r26: saved ar.pfs
-	 *	- r27: saved ar.rsc
-	 *	- r28: saved cr.iip
-	 *	- r29: saved cr.ipsr
-	 *	- r30: ar.itc for accounting (don't touch)
-	 *	- r31: saved pr
-	 *	-  b0: original contents (to be saved)
-	 * On exit:
-	 *	-  p10: TRUE if syscall is invoked with more than 8 out
-	 *		registers or r15's Nat is true
-	 *	-  r1: kernel's gp
-	 *	-  r3: preserved (same as on entry)
-	 *	-  r8: -EINVAL if p10 is true
-	 *	- r12: points to kernel stack
-	 *	- r13: points to current task
-	 *	- r14: preserved (same as on entry)
-	 *	- p13: preserved
-	 *	- p15: TRUE if interrupts need to be re-enabled
-	 *	- ar.fpsr: set to kernel settings
-	 *	-  b6: preserved (same as on entry)
-	 */
-GLOBAL_ENTRY(ia64_syscall_setup)
-#if PT(B6) != 0
-# error This code assumes that b6 is the first field in pt_regs.
-#endif
-	st8 [r1]=r19				// save b6
-	add r16=PT(CR_IPSR),r1			// initialize first base pointer
-	add r17=PT(R11),r1			// initialize second base pointer
-	;;
-	alloc r19=ar.pfs,8,0,0,0		// ensure in0-in7 are writable
-	st8 [r16]=r29,PT(AR_PFS)-PT(CR_IPSR)	// save cr.ipsr
-	tnat.nz p8,p0=in0
-
-	st8.spill [r17]=r11,PT(CR_IIP)-PT(R11)	// save r11
-	tnat.nz p9,p0=in1
-(pKStk)	mov r18=r0				// make sure r18 isn't NaT
-	;;
-
-	st8 [r16]=r26,PT(CR_IFS)-PT(AR_PFS)	// save ar.pfs
-	st8 [r17]=r28,PT(AR_UNAT)-PT(CR_IIP)	// save cr.iip
-	mov r28=b0				// save b0 (2 cyc)
-	;;
-
-	st8 [r17]=r25,PT(AR_RSC)-PT(AR_UNAT)	// save ar.unat
-	dep r19=0,r19,38,26			// clear all bits but 0..37 [I0]
-(p8)	mov in0=-1
-	;;
-
-	st8 [r16]=r19,PT(AR_RNAT)-PT(CR_IFS)	// store ar.pfs.pfm in cr.ifs
-	extr.u r11=r19,7,7	// I0		// get sol of ar.pfs
-	and r8=0x7f,r19		// A		// get sof of ar.pfs
-
-	st8 [r17]=r27,PT(AR_BSPSTORE)-PT(AR_RSC)// save ar.rsc
-	tbit.nz p15,p0=r29,IA64_PSR_I_BIT // I0
-(p9)	mov in1=-1
-	;;
-
-(pUStk) sub r18=r18,r22				// r18=RSE.ndirty*8
-	tnat.nz p10,p0=in2
-	add r11=8,r11
-	;;
-(pKStk) adds r16=PT(PR)-PT(AR_RNAT),r16		// skip over ar_rnat field
-(pKStk) adds r17=PT(B0)-PT(AR_BSPSTORE),r17	// skip over ar_bspstore field
-	tnat.nz p11,p0=in3
-	;;
-(p10)	mov in2=-1
-	tnat.nz p12,p0=in4				// [I0]
-(p11)	mov in3=-1
-	;;
-(pUStk) st8 [r16]=r24,PT(PR)-PT(AR_RNAT)	// save ar.rnat
-(pUStk) st8 [r17]=r23,PT(B0)-PT(AR_BSPSTORE)	// save ar.bspstore
-	shl r18=r18,16				// compute ar.rsc to be used for "loadrs"
-	;;
-	st8 [r16]=r31,PT(LOADRS)-PT(PR)		// save predicates
-	st8 [r17]=r28,PT(R1)-PT(B0)		// save b0
-	tnat.nz p13,p0=in5				// [I0]
-	;;
-	st8 [r16]=r18,PT(R12)-PT(LOADRS)	// save ar.rsc value for "loadrs"
-	st8.spill [r17]=r20,PT(R13)-PT(R1)	// save original r1
-(p12)	mov in4=-1
-	;;
-
-.mem.offset 0,0; st8.spill [r16]=r12,PT(AR_FPSR)-PT(R12)	// save r12
-.mem.offset 8,0; st8.spill [r17]=r13,PT(R15)-PT(R13)		// save r13
-(p13)	mov in5=-1
-	;;
-	st8 [r16]=r21,PT(R8)-PT(AR_FPSR)	// save ar.fpsr
-	tnat.nz p13,p0=in6
-	cmp.lt p10,p9=r11,r8	// frame size can't be more than local+8
-	;;
-	mov r8=1
-(p9)	tnat.nz p10,p0=r15
-	adds r12=-16,r1		// switch to kernel memory stack (with 16 bytes of scratch)
-
-	st8.spill [r17]=r15			// save r15
-	tnat.nz p8,p0=in7
-	nop.i 0
-
-	mov r13=r2				// establish `current'
-	movl r1=__gp				// establish kernel global pointer
-	;;
-	st8 [r16]=r8		// ensure pt_regs.r8 != 0 (see handle_syscall_error)
-(p13)	mov in6=-1
-(p8)	mov in7=-1
-
-	cmp.eq pSys,pNonSys=r0,r0		// set pSys=1, pNonSys=0
-	movl r17=FPSR_DEFAULT
-	;;
-	mov.m ar.fpsr=r17			// set ar.fpsr to kernel default value
-(p10)	mov r8=-EINVAL
-	br.ret.sptk.many b7
-END(ia64_syscall_setup)
-
-	.org ia64_ivt+0x3c00
-/////////////////////////////////////////////////////////////////////////////////////////
-// 0x3c00 Entry 15 (size 64 bundles) Reserved
-	DBG_FAULT(15)
-	FAULT(15)
-
-	.org ia64_ivt+0x4000
-/////////////////////////////////////////////////////////////////////////////////////////
-// 0x4000 Entry 16 (size 64 bundles) Reserved
-	DBG_FAULT(16)
-	FAULT(16)
-
-#if defined(CONFIG_VIRT_CPU_ACCOUNTING_NATIVE)
-	/*
-	 * There is no particular reason for this code to be here, other than
-	 * that there happens to be space here that would go unused otherwise.
-	 * If this fault ever gets "unreserved", simply moved the following
-	 * code to a more suitable spot...
-	 *
-	 * account_sys_enter is called from SAVE_MIN* macros if accounting is
-	 * enabled and if the macro is entered from user mode.
-	 */
-GLOBAL_ENTRY(account_sys_enter)
-	// mov.m r20=ar.itc is called in advance, and r13 is current
-	add r16=TI_AC_STAMP+IA64_TASK_SIZE,r13
-	add r17=TI_AC_LEAVE+IA64_TASK_SIZE,r13
-	;;
-	ld8 r18=[r16],TI_AC_STIME-TI_AC_STAMP	// time at last check in kernel
-	ld8 r19=[r17],TI_AC_UTIME-TI_AC_LEAVE	// time at left from kernel
-        ;;
-	ld8 r23=[r16],TI_AC_STAMP-TI_AC_STIME	// cumulated stime
-	ld8 r21=[r17]				// cumulated utime
-	sub r22=r19,r18				// stime before leave kernel
-	;;
-	st8 [r16]=r20,TI_AC_STIME-TI_AC_STAMP	// update stamp
-	sub r18=r20,r19				// elapsed time in user mode
-	;;
-	add r23=r23,r22				// sum stime
-	add r21=r21,r18				// sum utime
-	;;
-	st8 [r16]=r23				// update stime
-	st8 [r17]=r21				// update utime
-	;;
-	br.ret.sptk.many rp
-END(account_sys_enter)
-#endif
-
-	.org ia64_ivt+0x4400
-/////////////////////////////////////////////////////////////////////////////////////////
-// 0x4400 Entry 17 (size 64 bundles) Reserved
-	DBG_FAULT(17)
-	FAULT(17)
-
-	.org ia64_ivt+0x4800
-/////////////////////////////////////////////////////////////////////////////////////////
-// 0x4800 Entry 18 (size 64 bundles) Reserved
-	DBG_FAULT(18)
-	FAULT(18)
-
-	.org ia64_ivt+0x4c00
-/////////////////////////////////////////////////////////////////////////////////////////
-// 0x4c00 Entry 19 (size 64 bundles) Reserved
-	DBG_FAULT(19)
-	FAULT(19)
-
-//
-// --- End of long entries, Beginning of short entries
-//
-
-	.org ia64_ivt+0x5000
-/////////////////////////////////////////////////////////////////////////////////////////
-// 0x5000 Entry 20 (size 16 bundles) Page Not Present (10,22,49)
-ENTRY(page_not_present)
-	DBG_FAULT(20)
-	MOV_FROM_IFA(r16)
-	RSM_PSR_DT
-	/*
-	 * The Linux page fault handler doesn't expect non-present pages to be in
-	 * the TLB.  Flush the existing entry now, so we meet that expectation.
-	 */
-	mov r17=PAGE_SHIFT<<2
-	;;
-	ptc.l r16,r17
-	;;
-	mov r31=pr
-	srlz.d
-	br.sptk.many page_fault
-END(page_not_present)
-
-	.org ia64_ivt+0x5100
-/////////////////////////////////////////////////////////////////////////////////////////
-// 0x5100 Entry 21 (size 16 bundles) Key Permission (13,25,52)
-ENTRY(key_permission)
-	DBG_FAULT(21)
-	MOV_FROM_IFA(r16)
-	RSM_PSR_DT
-	mov r31=pr
-	;;
-	srlz.d
-	br.sptk.many page_fault
-END(key_permission)
-
-	.org ia64_ivt+0x5200
-/////////////////////////////////////////////////////////////////////////////////////////
-// 0x5200 Entry 22 (size 16 bundles) Instruction Access Rights (26)
-ENTRY(iaccess_rights)
-	DBG_FAULT(22)
-	MOV_FROM_IFA(r16)
-	RSM_PSR_DT
-	mov r31=pr
-	;;
-	srlz.d
-	br.sptk.many page_fault
-END(iaccess_rights)
-
-	.org ia64_ivt+0x5300
-/////////////////////////////////////////////////////////////////////////////////////////
-// 0x5300 Entry 23 (size 16 bundles) Data Access Rights (14,53)
-ENTRY(daccess_rights)
-	DBG_FAULT(23)
-	MOV_FROM_IFA(r16)
-	RSM_PSR_DT
-	mov r31=pr
-	;;
-	srlz.d
-	br.sptk.many page_fault
-END(daccess_rights)
-
-	.org ia64_ivt+0x5400
-/////////////////////////////////////////////////////////////////////////////////////////
-// 0x5400 Entry 24 (size 16 bundles) General Exception (5,32,34,36,38,39)
-ENTRY(general_exception)
-	DBG_FAULT(24)
-	MOV_FROM_ISR(r16)
-	mov r31=pr
-	;;
-	cmp4.eq p6,p0=0,r16
-(p6)	br.sptk.many dispatch_illegal_op_fault
-	;;
-	mov r19=24		// fault number
-	br.sptk.many dispatch_to_fault_handler
-END(general_exception)
-
-	.org ia64_ivt+0x5500
-/////////////////////////////////////////////////////////////////////////////////////////
-// 0x5500 Entry 25 (size 16 bundles) Disabled FP-Register (35)
-ENTRY(disabled_fp_reg)
-	DBG_FAULT(25)
-	rsm psr.dfh		// ensure we can access fph
-	;;
-	srlz.d
-	mov r31=pr
-	mov r19=25
-	br.sptk.many dispatch_to_fault_handler
-END(disabled_fp_reg)
-
-	.org ia64_ivt+0x5600
-/////////////////////////////////////////////////////////////////////////////////////////
-// 0x5600 Entry 26 (size 16 bundles) Nat Consumption (11,23,37,50)
-ENTRY(nat_consumption)
-	DBG_FAULT(26)
-
-	MOV_FROM_IPSR(p0, r16)
-	MOV_FROM_ISR(r17)
-	mov r31=pr				// save PR
-	;;
-	and r18=0xf,r17				// r18 = cr.ipsr.code{3:0}
-	tbit.z p6,p0=r17,IA64_ISR_NA_BIT
-	;;
-	cmp.ne.or p6,p0=IA64_ISR_CODE_LFETCH,r18
-	dep r16=-1,r16,IA64_PSR_ED_BIT,1
-(p6)	br.cond.spnt 1f		// branch if (cr.ispr.na == 0 || cr.ipsr.code{3:0} != LFETCH)
-	;;
-	MOV_TO_IPSR(p0, r16, r18)
-	mov pr=r31,-1
-	;;
-	RFI
-
-1:	mov pr=r31,-1
-	;;
-	FAULT(26)
-END(nat_consumption)
-
-	.org ia64_ivt+0x5700
-/////////////////////////////////////////////////////////////////////////////////////////
-// 0x5700 Entry 27 (size 16 bundles) Speculation (40)
-ENTRY(speculation_vector)
-	DBG_FAULT(27)
-	/*
-	 * A [f]chk.[as] instruction needs to take the branch to the recovery code but
-	 * this part of the architecture is not implemented in hardware on some CPUs, such
-	 * as Itanium.  Thus, in general we need to emulate the behavior.  IIM contains
-	 * the relative target (not yet sign extended).  So after sign extending it we
-	 * simply add it to IIP.  We also need to reset the EI field of the IPSR to zero,
-	 * i.e., the slot to restart into.
-	 *
-	 * cr.imm contains zero_ext(imm21)
-	 */
-	MOV_FROM_IIM(r18)
-	;;
-	MOV_FROM_IIP(r17)
-	shl r18=r18,43			// put sign bit in position (43=64-21)
-	;;
-
-	MOV_FROM_IPSR(p0, r16)
-	shr r18=r18,39			// sign extend (39=43-4)
-	;;
-
-	add r17=r17,r18			// now add the offset
-	;;
-	MOV_TO_IIP(r17, r19)
-	dep r16=0,r16,41,2		// clear EI
-	;;
-
-	MOV_TO_IPSR(p0, r16, r19)
-	;;
-
-	RFI
-END(speculation_vector)
-
-	.org ia64_ivt+0x5800
-/////////////////////////////////////////////////////////////////////////////////////////
-// 0x5800 Entry 28 (size 16 bundles) Reserved
-	DBG_FAULT(28)
-	FAULT(28)
-
-	.org ia64_ivt+0x5900
-/////////////////////////////////////////////////////////////////////////////////////////
-// 0x5900 Entry 29 (size 16 bundles) Debug (16,28,56)
-ENTRY(debug_vector)
-	DBG_FAULT(29)
-	FAULT(29)
-END(debug_vector)
-
-	.org ia64_ivt+0x5a00
-/////////////////////////////////////////////////////////////////////////////////////////
-// 0x5a00 Entry 30 (size 16 bundles) Unaligned Reference (57)
-ENTRY(unaligned_access)
-	DBG_FAULT(30)
-	mov r31=pr		// prepare to save predicates
-	;;
-	br.sptk.many dispatch_unaligned_handler
-END(unaligned_access)
-
-	.org ia64_ivt+0x5b00
-/////////////////////////////////////////////////////////////////////////////////////////
-// 0x5b00 Entry 31 (size 16 bundles) Unsupported Data Reference (57)
-ENTRY(unsupported_data_reference)
-	DBG_FAULT(31)
-	FAULT(31)
-END(unsupported_data_reference)
-
-	.org ia64_ivt+0x5c00
-/////////////////////////////////////////////////////////////////////////////////////////
-// 0x5c00 Entry 32 (size 16 bundles) Floating-Point Fault (64)
-ENTRY(floating_point_fault)
-	DBG_FAULT(32)
-	FAULT(32)
-END(floating_point_fault)
-
-	.org ia64_ivt+0x5d00
-/////////////////////////////////////////////////////////////////////////////////////////
-// 0x5d00 Entry 33 (size 16 bundles) Floating Point Trap (66)
-ENTRY(floating_point_trap)
-	DBG_FAULT(33)
-	FAULT(33)
-END(floating_point_trap)
-
-	.org ia64_ivt+0x5e00
-/////////////////////////////////////////////////////////////////////////////////////////
-// 0x5e00 Entry 34 (size 16 bundles) Lower Privilege Transfer Trap (66)
-ENTRY(lower_privilege_trap)
-	DBG_FAULT(34)
-	FAULT(34)
-END(lower_privilege_trap)
-
-	.org ia64_ivt+0x5f00
-/////////////////////////////////////////////////////////////////////////////////////////
-// 0x5f00 Entry 35 (size 16 bundles) Taken Branch Trap (68)
-ENTRY(taken_branch_trap)
-	DBG_FAULT(35)
-	FAULT(35)
-END(taken_branch_trap)
-
-	.org ia64_ivt+0x6000
-/////////////////////////////////////////////////////////////////////////////////////////
-// 0x6000 Entry 36 (size 16 bundles) Single Step Trap (69)
-ENTRY(single_step_trap)
-	DBG_FAULT(36)
-	FAULT(36)
-END(single_step_trap)
-
-	.org ia64_ivt+0x6100
-/////////////////////////////////////////////////////////////////////////////////////////
-// 0x6100 Entry 37 (size 16 bundles) Reserved
-	DBG_FAULT(37)
-	FAULT(37)
-
-	.org ia64_ivt+0x6200
-/////////////////////////////////////////////////////////////////////////////////////////
-// 0x6200 Entry 38 (size 16 bundles) Reserved
-	DBG_FAULT(38)
-	FAULT(38)
-
-	.org ia64_ivt+0x6300
-/////////////////////////////////////////////////////////////////////////////////////////
-// 0x6300 Entry 39 (size 16 bundles) Reserved
-	DBG_FAULT(39)
-	FAULT(39)
-
-	.org ia64_ivt+0x6400
-/////////////////////////////////////////////////////////////////////////////////////////
-// 0x6400 Entry 40 (size 16 bundles) Reserved
-	DBG_FAULT(40)
-	FAULT(40)
-
-	.org ia64_ivt+0x6500
-/////////////////////////////////////////////////////////////////////////////////////////
-// 0x6500 Entry 41 (size 16 bundles) Reserved
-	DBG_FAULT(41)
-	FAULT(41)
-
-	.org ia64_ivt+0x6600
-/////////////////////////////////////////////////////////////////////////////////////////
-// 0x6600 Entry 42 (size 16 bundles) Reserved
-	DBG_FAULT(42)
-	FAULT(42)
-
-	.org ia64_ivt+0x6700
-/////////////////////////////////////////////////////////////////////////////////////////
-// 0x6700 Entry 43 (size 16 bundles) Reserved
-	DBG_FAULT(43)
-	FAULT(43)
-
-	.org ia64_ivt+0x6800
-/////////////////////////////////////////////////////////////////////////////////////////
-// 0x6800 Entry 44 (size 16 bundles) Reserved
-	DBG_FAULT(44)
-	FAULT(44)
-
-	.org ia64_ivt+0x6900
-/////////////////////////////////////////////////////////////////////////////////////////
-// 0x6900 Entry 45 (size 16 bundles) IA-32 Exeception (17,18,29,41,42,43,44,58,60,61,62,72,73,75,76,77)
-ENTRY(ia32_exception)
-	DBG_FAULT(45)
-	FAULT(45)
-END(ia32_exception)
-
-	.org ia64_ivt+0x6a00
-/////////////////////////////////////////////////////////////////////////////////////////
-// 0x6a00 Entry 46 (size 16 bundles) IA-32 Intercept  (30,31,59,70,71)
-ENTRY(ia32_intercept)
-	DBG_FAULT(46)
-	FAULT(46)
-END(ia32_intercept)
-
-	.org ia64_ivt+0x6b00
-/////////////////////////////////////////////////////////////////////////////////////////
-// 0x6b00 Entry 47 (size 16 bundles) IA-32 Interrupt  (74)
-ENTRY(ia32_interrupt)
-	DBG_FAULT(47)
-	FAULT(47)
-END(ia32_interrupt)
-
-	.org ia64_ivt+0x6c00
-/////////////////////////////////////////////////////////////////////////////////////////
-// 0x6c00 Entry 48 (size 16 bundles) Reserved
-	DBG_FAULT(48)
-	FAULT(48)
-
-	.org ia64_ivt+0x6d00
-/////////////////////////////////////////////////////////////////////////////////////////
-// 0x6d00 Entry 49 (size 16 bundles) Reserved
-	DBG_FAULT(49)
-	FAULT(49)
-
-	.org ia64_ivt+0x6e00
-/////////////////////////////////////////////////////////////////////////////////////////
-// 0x6e00 Entry 50 (size 16 bundles) Reserved
-	DBG_FAULT(50)
-	FAULT(50)
-
-	.org ia64_ivt+0x6f00
-/////////////////////////////////////////////////////////////////////////////////////////
-// 0x6f00 Entry 51 (size 16 bundles) Reserved
-	DBG_FAULT(51)
-	FAULT(51)
-
-	.org ia64_ivt+0x7000
-/////////////////////////////////////////////////////////////////////////////////////////
-// 0x7000 Entry 52 (size 16 bundles) Reserved
-	DBG_FAULT(52)
-	FAULT(52)
-
-	.org ia64_ivt+0x7100
-/////////////////////////////////////////////////////////////////////////////////////////
-// 0x7100 Entry 53 (size 16 bundles) Reserved
-	DBG_FAULT(53)
-	FAULT(53)
-
-	.org ia64_ivt+0x7200
-/////////////////////////////////////////////////////////////////////////////////////////
-// 0x7200 Entry 54 (size 16 bundles) Reserved
-	DBG_FAULT(54)
-	FAULT(54)
-
-	.org ia64_ivt+0x7300
-/////////////////////////////////////////////////////////////////////////////////////////
-// 0x7300 Entry 55 (size 16 bundles) Reserved
-	DBG_FAULT(55)
-	FAULT(55)
-
-	.org ia64_ivt+0x7400
-/////////////////////////////////////////////////////////////////////////////////////////
-// 0x7400 Entry 56 (size 16 bundles) Reserved
-	DBG_FAULT(56)
-	FAULT(56)
-
-	.org ia64_ivt+0x7500
-/////////////////////////////////////////////////////////////////////////////////////////
-// 0x7500 Entry 57 (size 16 bundles) Reserved
-	DBG_FAULT(57)
-	FAULT(57)
-
-	.org ia64_ivt+0x7600
-/////////////////////////////////////////////////////////////////////////////////////////
-// 0x7600 Entry 58 (size 16 bundles) Reserved
-	DBG_FAULT(58)
-	FAULT(58)
-
-	.org ia64_ivt+0x7700
-/////////////////////////////////////////////////////////////////////////////////////////
-// 0x7700 Entry 59 (size 16 bundles) Reserved
-	DBG_FAULT(59)
-	FAULT(59)
-
-	.org ia64_ivt+0x7800
-/////////////////////////////////////////////////////////////////////////////////////////
-// 0x7800 Entry 60 (size 16 bundles) Reserved
-	DBG_FAULT(60)
-	FAULT(60)
-
-	.org ia64_ivt+0x7900
-/////////////////////////////////////////////////////////////////////////////////////////
-// 0x7900 Entry 61 (size 16 bundles) Reserved
-	DBG_FAULT(61)
-	FAULT(61)
-
-	.org ia64_ivt+0x7a00
-/////////////////////////////////////////////////////////////////////////////////////////
-// 0x7a00 Entry 62 (size 16 bundles) Reserved
-	DBG_FAULT(62)
-	FAULT(62)
-
-	.org ia64_ivt+0x7b00
-/////////////////////////////////////////////////////////////////////////////////////////
-// 0x7b00 Entry 63 (size 16 bundles) Reserved
-	DBG_FAULT(63)
-	FAULT(63)
-
-	.org ia64_ivt+0x7c00
-/////////////////////////////////////////////////////////////////////////////////////////
-// 0x7c00 Entry 64 (size 16 bundles) Reserved
-	DBG_FAULT(64)
-	FAULT(64)
-
-	.org ia64_ivt+0x7d00
-/////////////////////////////////////////////////////////////////////////////////////////
-// 0x7d00 Entry 65 (size 16 bundles) Reserved
-	DBG_FAULT(65)
-	FAULT(65)
-
-	.org ia64_ivt+0x7e00
-/////////////////////////////////////////////////////////////////////////////////////////
-// 0x7e00 Entry 66 (size 16 bundles) Reserved
-	DBG_FAULT(66)
-	FAULT(66)
-
-	.org ia64_ivt+0x7f00
-/////////////////////////////////////////////////////////////////////////////////////////
-// 0x7f00 Entry 67 (size 16 bundles) Reserved
-	DBG_FAULT(67)
-	FAULT(67)
-
-	//-----------------------------------------------------------------------------------
-	// call do_page_fault (predicates are in r31, psr.dt may be off, r16 is faulting address)
-ENTRY(page_fault)
-	SSM_PSR_DT_AND_SRLZ_I
-	;;
-	SAVE_MIN_WITH_COVER
-	alloc r15=ar.pfs,0,0,3,0
-	MOV_FROM_IFA(out0)
-	MOV_FROM_ISR(out1)
-	SSM_PSR_IC_AND_DEFAULT_BITS_AND_SRLZ_I(r14, r3)
-	adds r3=8,r2				// set up second base pointer
-	SSM_PSR_I(p15, p15, r14)		// restore psr.i
-	movl r14=ia64_leave_kernel
-	;;
-	SAVE_REST
-	mov rp=r14
-	;;
-	adds out2=16,r12			// out2 = pointer to pt_regs
-	br.call.sptk.many b6=ia64_do_page_fault	// ignore return address
-END(page_fault)
-
-ENTRY(non_syscall)
-	mov ar.rsc=r27			// restore ar.rsc before SAVE_MIN_WITH_COVER
-	;;
-	SAVE_MIN_WITH_COVER
-
-	// There is no particular reason for this code to be here, other than that
-	// there happens to be space here that would go unused otherwise.  If this
-	// fault ever gets "unreserved", simply moved the following code to a more
-	// suitable spot...
-
-	alloc r14=ar.pfs,0,0,2,0
-	MOV_FROM_IIM(out0)
-	add out1=16,sp
-	adds r3=8,r2			// set up second base pointer for SAVE_REST
-
-	SSM_PSR_IC_AND_DEFAULT_BITS_AND_SRLZ_I(r15, r24)
-					// guarantee that interruption collection is on
-	SSM_PSR_I(p15, p15, r15)	// restore psr.i
-	movl r15=ia64_leave_kernel
-	;;
-	SAVE_REST
-	mov rp=r15
-	;;
-	br.call.sptk.many b6=ia64_bad_break	// avoid WAW on CFM and ignore return addr
-END(non_syscall)
-
-ENTRY(__interrupt)
-	DBG_FAULT(12)
-	mov r31=pr		// prepare to save predicates
-	;;
-	SAVE_MIN_WITH_COVER	// uses r31; defines r2 and r3
-	SSM_PSR_IC_AND_DEFAULT_BITS_AND_SRLZ_I(r3, r14)
-				// ensure everybody knows psr.ic is back on
-	adds r3=8,r2		// set up second base pointer for SAVE_REST
-	;;
-	SAVE_REST
-	;;
-	MCA_RECOVER_RANGE(interrupt)
-	alloc r14=ar.pfs,0,0,2,0 // must be first in an insn group
-	MOV_FROM_IVR(out0, r8)	// pass cr.ivr as first arg
-	add out1=16,sp		// pass pointer to pt_regs as second arg
-	;;
-	srlz.d			// make sure we see the effect of cr.ivr
-	movl r14=ia64_leave_kernel
-	;;
-	mov rp=r14
-	br.call.sptk.many b6=ia64_handle_irq
-END(__interrupt)
-
-	/*
-	 * There is no particular reason for this code to be here, other than that
-	 * there happens to be space here that would go unused otherwise.  If this
-	 * fault ever gets "unreserved", simply moved the following code to a more
-	 * suitable spot...
-	 */
-
-ENTRY(dispatch_unaligned_handler)
-	SAVE_MIN_WITH_COVER
-	;;
-	alloc r14=ar.pfs,0,0,2,0		// now it's safe (must be first in insn group!)
-	MOV_FROM_IFA(out0)
-	adds out1=16,sp
-
-	SSM_PSR_IC_AND_DEFAULT_BITS_AND_SRLZ_I(r3, r24)
-						// guarantee that interruption collection is on
-	SSM_PSR_I(p15, p15, r3)			// restore psr.i
-	adds r3=8,r2				// set up second base pointer
-	;;
-	SAVE_REST
-	movl r14=ia64_leave_kernel
-	;;
-	mov rp=r14
-	br.sptk.many ia64_prepare_handle_unaligned
-END(dispatch_unaligned_handler)
-
-	/*
-	 * There is no particular reason for this code to be here, other than that
-	 * there happens to be space here that would go unused otherwise.  If this
-	 * fault ever gets "unreserved", simply moved the following code to a more
-	 * suitable spot...
-	 */
-
-ENTRY(dispatch_to_fault_handler)
-	/*
-	 * Input:
-	 *	psr.ic:	off
-	 *	r19:	fault vector number (e.g., 24 for General Exception)
-	 *	r31:	contains saved predicates (pr)
-	 */
-	SAVE_MIN_WITH_COVER_R19
-	alloc r14=ar.pfs,0,0,5,0
-	MOV_FROM_ISR(out1)
-	MOV_FROM_IFA(out2)
-	MOV_FROM_IIM(out3)
-	MOV_FROM_ITIR(out4)
-	;;
-	SSM_PSR_IC_AND_DEFAULT_BITS_AND_SRLZ_I(r3, out0)
-						// guarantee that interruption collection is on
-	mov out0=r15
-	;;
-	SSM_PSR_I(p15, p15, r3)			// restore psr.i
-	adds r3=8,r2				// set up second base pointer for SAVE_REST
-	;;
-	SAVE_REST
-	movl r14=ia64_leave_kernel
-	;;
-	mov rp=r14
-	br.call.sptk.many b6=ia64_fault
-END(dispatch_to_fault_handler)
-
-	/*
-	 * Squatting in this space ...
-	 *
-	 * This special case dispatcher for illegal operation faults allows preserved
-	 * registers to be modified through a callback function (asm only) that is handed
-	 * back from the fault handler in r8. Up to three arguments can be passed to the
-	 * callback function by returning an aggregate with the callback as its first
-	 * element, followed by the arguments.
-	 */
-ENTRY(dispatch_illegal_op_fault)
-	.prologue
-	.body
-	SAVE_MIN_WITH_COVER
-	SSM_PSR_IC_AND_DEFAULT_BITS_AND_SRLZ_I(r3, r24)
-				// guarantee that interruption collection is on
-	;;
-	SSM_PSR_I(p15, p15, r3)	// restore psr.i
-	adds r3=8,r2	// set up second base pointer for SAVE_REST
-	;;
-	alloc r14=ar.pfs,0,0,1,0	// must be first in insn group
-	mov out0=ar.ec
-	;;
-	SAVE_REST
-	PT_REGS_UNWIND_INFO(0)
-	;;
-	br.call.sptk.many rp=ia64_illegal_op_fault
-.ret0:	;;
-	alloc r14=ar.pfs,0,0,3,0	// must be first in insn group
-	mov out0=r9
-	mov out1=r10
-	mov out2=r11
-	movl r15=ia64_leave_kernel
-	;;
-	mov rp=r15
-	mov b6=r8
-	;;
-	cmp.ne p6,p0=0,r8
-(p6)	br.call.dpnt.many b6=b6		// call returns to ia64_leave_kernel
-	br.sptk.many ia64_leave_kernel
-END(dispatch_illegal_op_fault)
diff --git a/arch/ia64/kernel/kprobes.c b/arch/ia64/kernel/kprobes.c
deleted file mode 100644
index ca34e51e84b4..000000000000
--- a/arch/ia64/kernel/kprobes.c
+++ /dev/null
@@ -1,911 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-or-later
-/*
- *  Kernel Probes (KProbes)
- *  arch/ia64/kernel/kprobes.c
- *
- * Copyright (C) IBM Corporation, 2002, 2004
- * Copyright (C) Intel Corporation, 2005
- *
- * 2005-Apr     Rusty Lynch <rusty.lynch@intel.com> and Anil S Keshavamurthy
- *              <anil.s.keshavamurthy@intel.com> adapted from i386
- */
-
-#include <linux/kprobes.h>
-#include <linux/ptrace.h>
-#include <linux/string.h>
-#include <linux/slab.h>
-#include <linux/preempt.h>
-#include <linux/extable.h>
-#include <linux/kdebug.h>
-#include <linux/pgtable.h>
-
-#include <asm/sections.h>
-#include <asm/exception.h>
-
-DEFINE_PER_CPU(struct kprobe *, current_kprobe) = NULL;
-DEFINE_PER_CPU(struct kprobe_ctlblk, kprobe_ctlblk);
-
-struct kretprobe_blackpoint kretprobe_blacklist[] = {{NULL, NULL}};
-
-enum instruction_type {A, I, M, F, B, L, X, u};
-static enum instruction_type bundle_encoding[32][3] = {
-	[0x00] = { M, I, I },
-	[0x01] = { M, I, I },
-	[0x02] = { M, I, I },
-	[0x03] = { M, I, I },
-	[0x04] = { M, L, X },
-	[0x05] = { M, L, X },
-	[0x06] = { u, u, u },
-	[0x07] = { u, u, u },
-	[0x08] = { M, M, I },
-	[0x09] = { M, M, I },
-	[0x0A] = { M, M, I },
-	[0x0B] = { M, M, I },
-	[0x0C] = { M, F, I },
-	[0x0D] = { M, F, I },
-	[0x0E] = { M, M, F },
-	[0x0F] = { M, M, F },
-	[0x10] = { M, I, B },
-	[0x11] = { M, I, B },
-	[0x12] = { M, B, B },
-	[0x13] = { M, B, B },
-	[0x14] = { u, u, u },
-	[0x15] = { u, u, u },
-	[0x16] = { B, B, B },
-	[0x17] = { B, B, B },
-	[0x18] = { M, M, B },
-	[0x19] = { M, M, B },
-	[0x1A] = { u, u, u },
-	[0x1B] = { u, u, u },
-	[0x1C] = { M, F, B },
-	[0x1D] = { M, F, B },
-	[0x1E] = { u, u, u },
-	[0x1F] = { u, u, u },
-};
-
-/* Insert a long branch code */
-static void __kprobes set_brl_inst(void *from, void *to)
-{
-	s64 rel = ((s64) to - (s64) from) >> 4;
-	bundle_t *brl;
-	brl = (bundle_t *) ((u64) from & ~0xf);
-	brl->quad0.template = 0x05;	/* [MLX](stop) */
-	brl->quad0.slot0 = NOP_M_INST;	/* nop.m 0x0 */
-	brl->quad0.slot1_p0 = ((rel >> 20) & 0x7fffffffff) << 2;
-	brl->quad1.slot1_p1 = (((rel >> 20) & 0x7fffffffff) << 2) >> (64 - 46);
-	/* brl.cond.sptk.many.clr rel<<4 (qp=0) */
-	brl->quad1.slot2 = BRL_INST(rel >> 59, rel & 0xfffff);
-}
-
-/*
- * In this function we check to see if the instruction
- * is IP relative instruction and update the kprobe
- * inst flag accordingly
- */
-static void __kprobes update_kprobe_inst_flag(uint template, uint  slot,
-					      uint major_opcode,
-					      unsigned long kprobe_inst,
-					      struct kprobe *p)
-{
-	p->ainsn.inst_flag = 0;
-	p->ainsn.target_br_reg = 0;
-	p->ainsn.slot = slot;
-
-	/* Check for Break instruction
-	 * Bits 37:40 Major opcode to be zero
-	 * Bits 27:32 X6 to be zero
-	 * Bits 32:35 X3 to be zero
-	 */
-	if ((!major_opcode) && (!((kprobe_inst >> 27) & 0x1FF)) ) {
-		/* is a break instruction */
-	 	p->ainsn.inst_flag |= INST_FLAG_BREAK_INST;
-		return;
-	}
-
-	if (bundle_encoding[template][slot] == B) {
-		switch (major_opcode) {
-		  case INDIRECT_CALL_OPCODE:
-	 		p->ainsn.inst_flag |= INST_FLAG_FIX_BRANCH_REG;
-			p->ainsn.target_br_reg = ((kprobe_inst >> 6) & 0x7);
-			break;
-		  case IP_RELATIVE_PREDICT_OPCODE:
-		  case IP_RELATIVE_BRANCH_OPCODE:
-			p->ainsn.inst_flag |= INST_FLAG_FIX_RELATIVE_IP_ADDR;
-			break;
-		  case IP_RELATIVE_CALL_OPCODE:
-			p->ainsn.inst_flag |= INST_FLAG_FIX_RELATIVE_IP_ADDR;
-			p->ainsn.inst_flag |= INST_FLAG_FIX_BRANCH_REG;
-			p->ainsn.target_br_reg = ((kprobe_inst >> 6) & 0x7);
-			break;
-		}
-	} else if (bundle_encoding[template][slot] == X) {
-		switch (major_opcode) {
-		  case LONG_CALL_OPCODE:
-			p->ainsn.inst_flag |= INST_FLAG_FIX_BRANCH_REG;
-			p->ainsn.target_br_reg = ((kprobe_inst >> 6) & 0x7);
-		  break;
-		}
-	}
-	return;
-}
-
-/*
- * In this function we check to see if the instruction
- * (qp) cmpx.crel.ctype p1,p2=r2,r3
- * on which we are inserting kprobe is cmp instruction
- * with ctype as unc.
- */
-static uint __kprobes is_cmp_ctype_unc_inst(uint template, uint slot,
-					    uint major_opcode,
-					    unsigned long kprobe_inst)
-{
-	cmp_inst_t cmp_inst;
-	uint ctype_unc = 0;
-
-	if (!((bundle_encoding[template][slot] == I) ||
-		(bundle_encoding[template][slot] == M)))
-		goto out;
-
-	if (!((major_opcode == 0xC) || (major_opcode == 0xD) ||
-		(major_opcode == 0xE)))
-		goto out;
-
-	cmp_inst.l = kprobe_inst;
-	if ((cmp_inst.f.x2 == 0) || (cmp_inst.f.x2 == 1)) {
-		/* Integer compare - Register Register (A6 type)*/
-		if ((cmp_inst.f.tb == 0) && (cmp_inst.f.ta == 0)
-				&&(cmp_inst.f.c == 1))
-			ctype_unc = 1;
-	} else if ((cmp_inst.f.x2 == 2)||(cmp_inst.f.x2 == 3)) {
-		/* Integer compare - Immediate Register (A8 type)*/
-		if ((cmp_inst.f.ta == 0) &&(cmp_inst.f.c == 1))
-			ctype_unc = 1;
-	}
-out:
-	return ctype_unc;
-}
-
-/*
- * In this function we check to see if the instruction
- * on which we are inserting kprobe is supported.
- * Returns qp value if supported
- * Returns -EINVAL if unsupported
- */
-static int __kprobes unsupported_inst(uint template, uint  slot,
-				      uint major_opcode,
-				      unsigned long kprobe_inst,
-				      unsigned long addr)
-{
-	int qp;
-
-	qp = kprobe_inst & 0x3f;
-	if (is_cmp_ctype_unc_inst(template, slot, major_opcode, kprobe_inst)) {
-		if (slot == 1 && qp)  {
-			printk(KERN_WARNING "Kprobes on cmp unc "
-					"instruction on slot 1 at <0x%lx> "
-					"is not supported\n", addr);
-			return -EINVAL;
-
-		}
-		qp = 0;
-	}
-	else if (bundle_encoding[template][slot] == I) {
-		if (major_opcode == 0) {
-			/*
-			 * Check for Integer speculation instruction
-			 * - Bit 33-35 to be equal to 0x1
-			 */
-			if (((kprobe_inst >> 33) & 0x7) == 1) {
-				printk(KERN_WARNING
-					"Kprobes on speculation inst at <0x%lx> not supported\n",
-						addr);
-				return -EINVAL;
-			}
-			/*
-			 * IP relative mov instruction
-			 *  - Bit 27-35 to be equal to 0x30
-			 */
-			if (((kprobe_inst >> 27) & 0x1FF) == 0x30) {
-				printk(KERN_WARNING
-					"Kprobes on \"mov r1=ip\" at <0x%lx> not supported\n",
-						addr);
-				return -EINVAL;
-
-			}
-		}
-		else if ((major_opcode == 5) &&	!(kprobe_inst & (0xFUl << 33)) &&
-				(kprobe_inst & (0x1UL << 12))) {
-			/* test bit instructions, tbit,tnat,tf
-			 * bit 33-36 to be equal to 0
-			 * bit 12 to be equal to 1
-			 */
-			if (slot == 1 && qp) {
-				printk(KERN_WARNING "Kprobes on test bit "
-						"instruction on slot at <0x%lx> "
-						"is not supported\n", addr);
-				return -EINVAL;
-			}
-			qp = 0;
-		}
-	}
-	else if (bundle_encoding[template][slot] == B) {
-		if (major_opcode == 7) {
-			/* IP-Relative Predict major code is 7 */
-			printk(KERN_WARNING "Kprobes on IP-Relative"
-					"Predict is not supported\n");
-			return -EINVAL;
-		}
-		else if (major_opcode == 2) {
-			/* Indirect Predict, major code is 2
-			 * bit 27-32 to be equal to 10 or 11
-			 */
-			int x6=(kprobe_inst >> 27) & 0x3F;
-			if ((x6 == 0x10) || (x6 == 0x11)) {
-				printk(KERN_WARNING "Kprobes on "
-					"Indirect Predict is not supported\n");
-				return -EINVAL;
-			}
-		}
-	}
-	/* kernel does not use float instruction, here for safety kprobe
-	 * will judge whether it is fcmp/flass/float approximation instruction
-	 */
-	else if (unlikely(bundle_encoding[template][slot] == F)) {
-		if ((major_opcode == 4 || major_opcode == 5) &&
-				(kprobe_inst  & (0x1 << 12))) {
-			/* fcmp/fclass unc instruction */
-			if (slot == 1 && qp) {
-				printk(KERN_WARNING "Kprobes on fcmp/fclass "
-					"instruction on slot at <0x%lx> "
-					"is not supported\n", addr);
-				return -EINVAL;
-
-			}
-			qp = 0;
-		}
-		if ((major_opcode == 0 || major_opcode == 1) &&
-			(kprobe_inst & (0x1UL << 33))) {
-			/* float Approximation instruction */
-			if (slot == 1 && qp) {
-				printk(KERN_WARNING "Kprobes on float Approx "
-					"instr at <0x%lx> is not supported\n",
-						addr);
-				return -EINVAL;
-			}
-			qp = 0;
-		}
-	}
-	return qp;
-}
-
-/*
- * In this function we override the bundle with
- * the break instruction at the given slot.
- */
-static void __kprobes prepare_break_inst(uint template, uint  slot,
-					 uint major_opcode,
-					 unsigned long kprobe_inst,
-					 struct kprobe *p,
-					 int qp)
-{
-	unsigned long break_inst = BREAK_INST;
-	bundle_t *bundle = &p->opcode.bundle;
-
-	/*
-	 * Copy the original kprobe_inst qualifying predicate(qp)
-	 * to the break instruction
-	 */
-	break_inst |= qp;
-
-	switch (slot) {
-	  case 0:
-		bundle->quad0.slot0 = break_inst;
-		break;
-	  case 1:
-		bundle->quad0.slot1_p0 = break_inst;
-		bundle->quad1.slot1_p1 = break_inst >> (64-46);
-		break;
-	  case 2:
-		bundle->quad1.slot2 = break_inst;
-		break;
-	}
-
-	/*
-	 * Update the instruction flag, so that we can
-	 * emulate the instruction properly after we
-	 * single step on original instruction
-	 */
-	update_kprobe_inst_flag(template, slot, major_opcode, kprobe_inst, p);
-}
-
-static void __kprobes get_kprobe_inst(bundle_t *bundle, uint slot,
-	       	unsigned long *kprobe_inst, uint *major_opcode)
-{
-	unsigned long kprobe_inst_p0, kprobe_inst_p1;
-	unsigned int template;
-
-	template = bundle->quad0.template;
-
-	switch (slot) {
-	  case 0:
-		*major_opcode = (bundle->quad0.slot0 >> SLOT0_OPCODE_SHIFT);
-		*kprobe_inst = bundle->quad0.slot0;
-		  break;
-	  case 1:
-		*major_opcode = (bundle->quad1.slot1_p1 >> SLOT1_p1_OPCODE_SHIFT);
-		kprobe_inst_p0 = bundle->quad0.slot1_p0;
-		kprobe_inst_p1 = bundle->quad1.slot1_p1;
-		*kprobe_inst = kprobe_inst_p0 | (kprobe_inst_p1 << (64-46));
-		break;
-	  case 2:
-		*major_opcode = (bundle->quad1.slot2 >> SLOT2_OPCODE_SHIFT);
-		*kprobe_inst = bundle->quad1.slot2;
-		break;
-	}
-}
-
-/* Returns non-zero if the addr is in the Interrupt Vector Table */
-static int __kprobes in_ivt_functions(unsigned long addr)
-{
-	return (addr >= (unsigned long)__start_ivt_text
-		&& addr < (unsigned long)__end_ivt_text);
-}
-
-static int __kprobes valid_kprobe_addr(int template, int slot,
-				       unsigned long addr)
-{
-	if ((slot > 2) || ((bundle_encoding[template][1] == L) && slot > 1)) {
-		printk(KERN_WARNING "Attempting to insert unaligned kprobe "
-				"at 0x%lx\n", addr);
-		return -EINVAL;
-	}
-
-	if (in_ivt_functions(addr)) {
-		printk(KERN_WARNING "Kprobes can't be inserted inside "
-				"IVT functions at 0x%lx\n", addr);
-		return -EINVAL;
-	}
-
-	return 0;
-}
-
-static void __kprobes save_previous_kprobe(struct kprobe_ctlblk *kcb)
-{
-	unsigned int i;
-	i = atomic_add_return(1, &kcb->prev_kprobe_index);
-	kcb->prev_kprobe[i-1].kp = kprobe_running();
-	kcb->prev_kprobe[i-1].status = kcb->kprobe_status;
-}
-
-static void __kprobes restore_previous_kprobe(struct kprobe_ctlblk *kcb)
-{
-	unsigned int i;
-	i = atomic_read(&kcb->prev_kprobe_index);
-	__this_cpu_write(current_kprobe, kcb->prev_kprobe[i-1].kp);
-	kcb->kprobe_status = kcb->prev_kprobe[i-1].status;
-	atomic_sub(1, &kcb->prev_kprobe_index);
-}
-
-static void __kprobes set_current_kprobe(struct kprobe *p,
-			struct kprobe_ctlblk *kcb)
-{
-	__this_cpu_write(current_kprobe, p);
-}
-
-void __kretprobe_trampoline(void)
-{
-}
-
-int __kprobes trampoline_probe_handler(struct kprobe *p, struct pt_regs *regs)
-{
-	regs->cr_iip = __kretprobe_trampoline_handler(regs, NULL);
-	/*
-	 * By returning a non-zero value, we are telling
-	 * kprobe_handler() that we don't want the post_handler
-	 * to run (and have re-enabled preemption)
-	 */
-	return 1;
-}
-
-void __kprobes arch_prepare_kretprobe(struct kretprobe_instance *ri,
-				      struct pt_regs *regs)
-{
-	ri->ret_addr = (kprobe_opcode_t *)regs->b0;
-	ri->fp = NULL;
-
-	/* Replace the return addr with trampoline addr */
-	regs->b0 = (unsigned long)dereference_function_descriptor(__kretprobe_trampoline);
-}
-
-/* Check the instruction in the slot is break */
-static int __kprobes __is_ia64_break_inst(bundle_t *bundle, uint slot)
-{
-	unsigned int major_opcode;
-	unsigned int template = bundle->quad0.template;
-	unsigned long kprobe_inst;
-
-	/* Move to slot 2, if bundle is MLX type and kprobe slot is 1 */
-	if (slot == 1 && bundle_encoding[template][1] == L)
-		slot++;
-
-	/* Get Kprobe probe instruction at given slot*/
-	get_kprobe_inst(bundle, slot, &kprobe_inst, &major_opcode);
-
-	/* For break instruction,
-	 * Bits 37:40 Major opcode to be zero
-	 * Bits 27:32 X6 to be zero
-	 * Bits 32:35 X3 to be zero
-	 */
-	if (major_opcode || ((kprobe_inst >> 27) & 0x1FF)) {
-		/* Not a break instruction */
-		return 0;
-	}
-
-	/* Is a break instruction */
-	return 1;
-}
-
-/*
- * In this function, we check whether the target bundle modifies IP or
- * it triggers an exception. If so, it cannot be boostable.
- */
-static int __kprobes can_boost(bundle_t *bundle, uint slot,
-			       unsigned long bundle_addr)
-{
-	unsigned int template = bundle->quad0.template;
-
-	do {
-		if (search_exception_tables(bundle_addr + slot) ||
-		    __is_ia64_break_inst(bundle, slot))
-			return 0;	/* exception may occur in this bundle*/
-	} while ((++slot) < 3);
-	template &= 0x1e;
-	if (template >= 0x10 /* including B unit */ ||
-	    template == 0x04 /* including X unit */ ||
-	    template == 0x06) /* undefined */
-		return 0;
-
-	return 1;
-}
-
-/* Prepare long jump bundle and disables other boosters if need */
-static void __kprobes prepare_booster(struct kprobe *p)
-{
-	unsigned long addr = (unsigned long)p->addr & ~0xFULL;
-	unsigned int slot = (unsigned long)p->addr & 0xf;
-	struct kprobe *other_kp;
-
-	if (can_boost(&p->ainsn.insn[0].bundle, slot, addr)) {
-		set_brl_inst(&p->ainsn.insn[1].bundle, (bundle_t *)addr + 1);
-		p->ainsn.inst_flag |= INST_FLAG_BOOSTABLE;
-	}
-
-	/* disables boosters in previous slots */
-	for (; addr < (unsigned long)p->addr; addr++) {
-		other_kp = get_kprobe((void *)addr);
-		if (other_kp)
-			other_kp->ainsn.inst_flag &= ~INST_FLAG_BOOSTABLE;
-	}
-}
-
-int __kprobes arch_prepare_kprobe(struct kprobe *p)
-{
-	unsigned long addr = (unsigned long) p->addr;
-	unsigned long *kprobe_addr = (unsigned long *)(addr & ~0xFULL);
-	unsigned long kprobe_inst=0;
-	unsigned int slot = addr & 0xf, template, major_opcode = 0;
-	bundle_t *bundle;
-	int qp;
-
-	bundle = &((kprobe_opcode_t *)kprobe_addr)->bundle;
-	template = bundle->quad0.template;
-
-	if(valid_kprobe_addr(template, slot, addr))
-		return -EINVAL;
-
-	/* Move to slot 2, if bundle is MLX type and kprobe slot is 1 */
-	if (slot == 1 && bundle_encoding[template][1] == L)
-		slot++;
-
-	/* Get kprobe_inst and major_opcode from the bundle */
-	get_kprobe_inst(bundle, slot, &kprobe_inst, &major_opcode);
-
-	qp = unsupported_inst(template, slot, major_opcode, kprobe_inst, addr);
-	if (qp < 0)
-		return -EINVAL;
-
-	p->ainsn.insn = get_insn_slot();
-	if (!p->ainsn.insn)
-		return -ENOMEM;
-	memcpy(&p->opcode, kprobe_addr, sizeof(kprobe_opcode_t));
-	memcpy(p->ainsn.insn, kprobe_addr, sizeof(kprobe_opcode_t));
-
-	prepare_break_inst(template, slot, major_opcode, kprobe_inst, p, qp);
-
-	prepare_booster(p);
-
-	return 0;
-}
-
-void __kprobes arch_arm_kprobe(struct kprobe *p)
-{
-	unsigned long arm_addr;
-	bundle_t *src, *dest;
-
-	arm_addr = ((unsigned long)p->addr) & ~0xFUL;
-	dest = &((kprobe_opcode_t *)arm_addr)->bundle;
-	src = &p->opcode.bundle;
-
-	flush_icache_range((unsigned long)p->ainsn.insn,
-			   (unsigned long)p->ainsn.insn +
-			   sizeof(kprobe_opcode_t) * MAX_INSN_SIZE);
-
-	switch (p->ainsn.slot) {
-		case 0:
-			dest->quad0.slot0 = src->quad0.slot0;
-			break;
-		case 1:
-			dest->quad1.slot1_p1 = src->quad1.slot1_p1;
-			break;
-		case 2:
-			dest->quad1.slot2 = src->quad1.slot2;
-			break;
-	}
-	flush_icache_range(arm_addr, arm_addr + sizeof(kprobe_opcode_t));
-}
-
-void __kprobes arch_disarm_kprobe(struct kprobe *p)
-{
-	unsigned long arm_addr;
-	bundle_t *src, *dest;
-
-	arm_addr = ((unsigned long)p->addr) & ~0xFUL;
-	dest = &((kprobe_opcode_t *)arm_addr)->bundle;
-	/* p->ainsn.insn contains the original unaltered kprobe_opcode_t */
-	src = &p->ainsn.insn->bundle;
-	switch (p->ainsn.slot) {
-		case 0:
-			dest->quad0.slot0 = src->quad0.slot0;
-			break;
-		case 1:
-			dest->quad1.slot1_p1 = src->quad1.slot1_p1;
-			break;
-		case 2:
-			dest->quad1.slot2 = src->quad1.slot2;
-			break;
-	}
-	flush_icache_range(arm_addr, arm_addr + sizeof(kprobe_opcode_t));
-}
-
-void __kprobes arch_remove_kprobe(struct kprobe *p)
-{
-	if (p->ainsn.insn) {
-		free_insn_slot(p->ainsn.insn,
-			       p->ainsn.inst_flag & INST_FLAG_BOOSTABLE);
-		p->ainsn.insn = NULL;
-	}
-}
-/*
- * We are resuming execution after a single step fault, so the pt_regs
- * structure reflects the register state after we executed the instruction
- * located in the kprobe (p->ainsn.insn->bundle).  We still need to adjust
- * the ip to point back to the original stack address. To set the IP address
- * to original stack address, handle the case where we need to fixup the
- * relative IP address and/or fixup branch register.
- */
-static void __kprobes resume_execution(struct kprobe *p, struct pt_regs *regs)
-{
-	unsigned long bundle_addr = (unsigned long) (&p->ainsn.insn->bundle);
-	unsigned long resume_addr = (unsigned long)p->addr & ~0xFULL;
-	unsigned long template;
-	int slot = ((unsigned long)p->addr & 0xf);
-
-	template = p->ainsn.insn->bundle.quad0.template;
-
-	if (slot == 1 && bundle_encoding[template][1] == L)
-		slot = 2;
-
-	if (p->ainsn.inst_flag & ~INST_FLAG_BOOSTABLE) {
-
-		if (p->ainsn.inst_flag & INST_FLAG_FIX_RELATIVE_IP_ADDR) {
-			/* Fix relative IP address */
-			regs->cr_iip = (regs->cr_iip - bundle_addr) +
-					resume_addr;
-		}
-
-		if (p->ainsn.inst_flag & INST_FLAG_FIX_BRANCH_REG) {
-		/*
-		 * Fix target branch register, software convention is
-		 * to use either b0 or b6 or b7, so just checking
-		 * only those registers
-		 */
-			switch (p->ainsn.target_br_reg) {
-			case 0:
-				if ((regs->b0 == bundle_addr) ||
-					(regs->b0 == bundle_addr + 0x10)) {
-					regs->b0 = (regs->b0 - bundle_addr) +
-						resume_addr;
-				}
-				break;
-			case 6:
-				if ((regs->b6 == bundle_addr) ||
-					(regs->b6 == bundle_addr + 0x10)) {
-					regs->b6 = (regs->b6 - bundle_addr) +
-						resume_addr;
-				}
-				break;
-			case 7:
-				if ((regs->b7 == bundle_addr) ||
-					(regs->b7 == bundle_addr + 0x10)) {
-					regs->b7 = (regs->b7 - bundle_addr) +
-						resume_addr;
-				}
-				break;
-			} /* end switch */
-		}
-		goto turn_ss_off;
-	}
-
-	if (slot == 2) {
-		if (regs->cr_iip == bundle_addr + 0x10) {
-			regs->cr_iip = resume_addr + 0x10;
-		}
-	} else {
-		if (regs->cr_iip == bundle_addr) {
-			regs->cr_iip = resume_addr;
-		}
-	}
-
-turn_ss_off:
-	/* Turn off Single Step bit */
-	ia64_psr(regs)->ss = 0;
-}
-
-static void __kprobes prepare_ss(struct kprobe *p, struct pt_regs *regs)
-{
-	unsigned long bundle_addr = (unsigned long) &p->ainsn.insn->bundle;
-	unsigned long slot = (unsigned long)p->addr & 0xf;
-
-	/* single step inline if break instruction */
-	if (p->ainsn.inst_flag == INST_FLAG_BREAK_INST)
-		regs->cr_iip = (unsigned long)p->addr & ~0xFULL;
-	else
-		regs->cr_iip = bundle_addr & ~0xFULL;
-
-	if (slot > 2)
-		slot = 0;
-
-	ia64_psr(regs)->ri = slot;
-
-	/* turn on single stepping */
-	ia64_psr(regs)->ss = 1;
-}
-
-static int __kprobes is_ia64_break_inst(struct pt_regs *regs)
-{
-	unsigned int slot = ia64_psr(regs)->ri;
-	unsigned long *kprobe_addr = (unsigned long *)regs->cr_iip;
-	bundle_t bundle;
-
-	memcpy(&bundle, kprobe_addr, sizeof(bundle_t));
-
-	return __is_ia64_break_inst(&bundle, slot);
-}
-
-static int __kprobes pre_kprobes_handler(struct die_args *args)
-{
-	struct kprobe *p;
-	int ret = 0;
-	struct pt_regs *regs = args->regs;
-	kprobe_opcode_t *addr = (kprobe_opcode_t *)instruction_pointer(regs);
-	struct kprobe_ctlblk *kcb;
-
-	/*
-	 * We don't want to be preempted for the entire
-	 * duration of kprobe processing
-	 */
-	preempt_disable();
-	kcb = get_kprobe_ctlblk();
-
-	/* Handle recursion cases */
-	if (kprobe_running()) {
-		p = get_kprobe(addr);
-		if (p) {
-			if ((kcb->kprobe_status == KPROBE_HIT_SS) &&
-	 		     (p->ainsn.inst_flag == INST_FLAG_BREAK_INST)) {
-				ia64_psr(regs)->ss = 0;
-				goto no_kprobe;
-			}
-			/* We have reentered the pre_kprobe_handler(), since
-			 * another probe was hit while within the handler.
-			 * We here save the original kprobes variables and
-			 * just single step on the instruction of the new probe
-			 * without calling any user handlers.
-			 */
-			save_previous_kprobe(kcb);
-			set_current_kprobe(p, kcb);
-			kprobes_inc_nmissed_count(p);
-			prepare_ss(p, regs);
-			kcb->kprobe_status = KPROBE_REENTER;
-			return 1;
-		} else if (!is_ia64_break_inst(regs)) {
-			/* The breakpoint instruction was removed by
-			 * another cpu right after we hit, no further
-			 * handling of this interrupt is appropriate
-			 */
-			ret = 1;
-			goto no_kprobe;
-		} else {
-			/* Not our break */
-			goto no_kprobe;
-		}
-	}
-
-	p = get_kprobe(addr);
-	if (!p) {
-		if (!is_ia64_break_inst(regs)) {
-			/*
-			 * The breakpoint instruction was removed right
-			 * after we hit it.  Another cpu has removed
-			 * either a probepoint or a debugger breakpoint
-			 * at this address.  In either case, no further
-			 * handling of this interrupt is appropriate.
-			 */
-			ret = 1;
-
-		}
-
-		/* Not one of our break, let kernel handle it */
-		goto no_kprobe;
-	}
-
-	set_current_kprobe(p, kcb);
-	kcb->kprobe_status = KPROBE_HIT_ACTIVE;
-
-	if (p->pre_handler && p->pre_handler(p, regs)) {
-		reset_current_kprobe();
-		preempt_enable_no_resched();
-		return 1;
-	}
-
-#if !defined(CONFIG_PREEMPTION)
-	if (p->ainsn.inst_flag == INST_FLAG_BOOSTABLE && !p->post_handler) {
-		/* Boost up -- we can execute copied instructions directly */
-		ia64_psr(regs)->ri = p->ainsn.slot;
-		regs->cr_iip = (unsigned long)&p->ainsn.insn->bundle & ~0xFULL;
-		/* turn single stepping off */
-		ia64_psr(regs)->ss = 0;
-
-		reset_current_kprobe();
-		preempt_enable_no_resched();
-		return 1;
-	}
-#endif
-	prepare_ss(p, regs);
-	kcb->kprobe_status = KPROBE_HIT_SS;
-	return 1;
-
-no_kprobe:
-	preempt_enable_no_resched();
-	return ret;
-}
-
-static int __kprobes post_kprobes_handler(struct pt_regs *regs)
-{
-	struct kprobe *cur = kprobe_running();
-	struct kprobe_ctlblk *kcb = get_kprobe_ctlblk();
-
-	if (!cur)
-		return 0;
-
-	if ((kcb->kprobe_status != KPROBE_REENTER) && cur->post_handler) {
-		kcb->kprobe_status = KPROBE_HIT_SSDONE;
-		cur->post_handler(cur, regs, 0);
-	}
-
-	resume_execution(cur, regs);
-
-	/*Restore back the original saved kprobes variables and continue. */
-	if (kcb->kprobe_status == KPROBE_REENTER) {
-		restore_previous_kprobe(kcb);
-		goto out;
-	}
-	reset_current_kprobe();
-
-out:
-	preempt_enable_no_resched();
-	return 1;
-}
-
-int __kprobes kprobe_fault_handler(struct pt_regs *regs, int trapnr)
-{
-	struct kprobe *cur = kprobe_running();
-	struct kprobe_ctlblk *kcb = get_kprobe_ctlblk();
-
-
-	switch(kcb->kprobe_status) {
-	case KPROBE_HIT_SS:
-	case KPROBE_REENTER:
-		/*
-		 * We are here because the instruction being single
-		 * stepped caused a page fault. We reset the current
-		 * kprobe and the instruction pointer points back to
-		 * the probe address and allow the page fault handler
-		 * to continue as a normal page fault.
-		 */
-		regs->cr_iip = ((unsigned long)cur->addr) & ~0xFULL;
-		ia64_psr(regs)->ri = ((unsigned long)cur->addr) & 0xf;
-		if (kcb->kprobe_status == KPROBE_REENTER)
-			restore_previous_kprobe(kcb);
-		else
-			reset_current_kprobe();
-		preempt_enable_no_resched();
-		break;
-	case KPROBE_HIT_ACTIVE:
-	case KPROBE_HIT_SSDONE:
-		/*
-		 * In case the user-specified fault handler returned
-		 * zero, try to fix up.
-		 */
-		if (ia64_done_with_exception(regs))
-			return 1;
-
-		/*
-		 * Let ia64_do_page_fault() fix it.
-		 */
-		break;
-	default:
-		break;
-	}
-
-	return 0;
-}
-
-int __kprobes kprobe_exceptions_notify(struct notifier_block *self,
-				       unsigned long val, void *data)
-{
-	struct die_args *args = (struct die_args *)data;
-	int ret = NOTIFY_DONE;
-
-	if (args->regs && user_mode(args->regs))
-		return ret;
-
-	switch(val) {
-	case DIE_BREAK:
-		/* err is break number from ia64_bad_break() */
-		if ((args->err >> 12) == (__IA64_BREAK_KPROBE >> 12)
-			|| args->err == 0)
-			if (pre_kprobes_handler(args))
-				ret = NOTIFY_STOP;
-		break;
-	case DIE_FAULT:
-		/* err is vector number from ia64_fault() */
-		if (args->err == 36)
-			if (post_kprobes_handler(args->regs))
-				ret = NOTIFY_STOP;
-		break;
-	default:
-		break;
-	}
-	return ret;
-}
-
-static struct kprobe trampoline_p = {
-	.pre_handler = trampoline_probe_handler
-};
-
-int __init arch_init_kprobes(void)
-{
-	trampoline_p.addr =
-		dereference_function_descriptor(__kretprobe_trampoline);
-	return register_kprobe(&trampoline_p);
-}
-
-int __kprobes arch_trampoline_kprobe(struct kprobe *p)
-{
-	if (p->addr ==
-		dereference_function_descriptor(__kretprobe_trampoline))
-		return 1;
-
-	return 0;
-}
diff --git a/arch/ia64/kernel/machine_kexec.c b/arch/ia64/kernel/machine_kexec.c
deleted file mode 100644
index 4db9ca144fa5..000000000000
--- a/arch/ia64/kernel/machine_kexec.c
+++ /dev/null
@@ -1,163 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * arch/ia64/kernel/machine_kexec.c
- *
- * Handle transition of Linux booting another kernel
- * Copyright (C) 2005 Hewlett-Packard Development Comapny, L.P.
- * Copyright (C) 2005 Khalid Aziz <khalid.aziz@hp.com>
- * Copyright (C) 2006 Intel Corp, Zou Nan hai <nanhai.zou@intel.com>
- */
-
-#include <linux/mm.h>
-#include <linux/kexec.h>
-#include <linux/cpu.h>
-#include <linux/irq.h>
-#include <linux/efi.h>
-#include <linux/numa.h>
-#include <linux/mmzone.h>
-
-#include <asm/efi.h>
-#include <asm/numa.h>
-#include <asm/mmu_context.h>
-#include <asm/setup.h>
-#include <asm/delay.h>
-#include <asm/meminit.h>
-#include <asm/processor.h>
-#include <asm/sal.h>
-#include <asm/mca.h>
-
-typedef void (*relocate_new_kernel_t)(
-					unsigned long indirection_page,
-					unsigned long start_address,
-					struct ia64_boot_param *boot_param,
-					unsigned long pal_addr) __noreturn;
-
-struct kimage *ia64_kimage;
-
-struct resource efi_memmap_res = {
-        .name  = "EFI Memory Map",
-        .start = 0,
-        .end   = 0,
-        .flags = IORESOURCE_BUSY | IORESOURCE_MEM
-};
-
-struct resource boot_param_res = {
-        .name  = "Boot parameter",
-        .start = 0,
-        .end   = 0,
-        .flags = IORESOURCE_BUSY | IORESOURCE_MEM
-};
-
-
-/*
- * Do what every setup is needed on image and the
- * reboot code buffer to allow us to avoid allocations
- * later.
- */
-int machine_kexec_prepare(struct kimage *image)
-{
-	void *control_code_buffer;
-	const unsigned long *func;
-
-	func = (unsigned long *)&relocate_new_kernel;
-	/* Pre-load control code buffer to minimize work in kexec path */
-	control_code_buffer = page_address(image->control_code_page);
-	memcpy((void *)control_code_buffer, (const void *)func[0],
-			relocate_new_kernel_size);
-	flush_icache_range((unsigned long)control_code_buffer,
-			(unsigned long)control_code_buffer + relocate_new_kernel_size);
-	ia64_kimage = image;
-
-	return 0;
-}
-
-void machine_kexec_cleanup(struct kimage *image)
-{
-}
-
-/*
- * Do not allocate memory (or fail in any way) in machine_kexec().
- * We are past the point of no return, committed to rebooting now.
- */
-static void ia64_machine_kexec(struct unw_frame_info *info, void *arg)
-{
-	struct kimage *image = arg;
-	relocate_new_kernel_t rnk;
-	void *pal_addr = efi_get_pal_addr();
-	unsigned long code_addr;
-	int ii;
-	u64 fp, gp;
-	ia64_fptr_t *init_handler = (ia64_fptr_t *)ia64_os_init_on_kdump;
-
-	BUG_ON(!image);
-	code_addr = (unsigned long)page_address(image->control_code_page);
-	if (image->type == KEXEC_TYPE_CRASH) {
-		crash_save_this_cpu();
-		current->thread.ksp = (__u64)info->sw - 16;
-
-		/* Register noop init handler */
-		fp = ia64_tpa(init_handler->fp);
-		gp = ia64_tpa(ia64_getreg(_IA64_REG_GP));
-		ia64_sal_set_vectors(SAL_VECTOR_OS_INIT, fp, gp, 0, fp, gp, 0);
-	} else {
-		/* Unregister init handlers of current kernel */
-		ia64_sal_set_vectors(SAL_VECTOR_OS_INIT, 0, 0, 0, 0, 0, 0);
-	}
-
-	/* Unregister mca handler - No more recovery on current kernel */
-	ia64_sal_set_vectors(SAL_VECTOR_OS_MCA, 0, 0, 0, 0, 0, 0);
-
-	/* Interrupts aren't acceptable while we reboot */
-	local_irq_disable();
-
-	/* Mask CMC and Performance Monitor interrupts */
-	ia64_setreg(_IA64_REG_CR_PMV, 1 << 16);
-	ia64_setreg(_IA64_REG_CR_CMCV, 1 << 16);
-
-	/* Mask ITV and Local Redirect Registers */
-	ia64_set_itv(1 << 16);
-	ia64_set_lrr0(1 << 16);
-	ia64_set_lrr1(1 << 16);
-
-	/* terminate possible nested in-service interrupts */
-	for (ii = 0; ii < 16; ii++)
-		ia64_eoi();
-
-	/* unmask TPR and clear any pending interrupts */
-	ia64_setreg(_IA64_REG_CR_TPR, 0);
-	ia64_srlz_d();
-	while (ia64_get_ivr() != IA64_SPURIOUS_INT_VECTOR)
-		ia64_eoi();
-	rnk = (relocate_new_kernel_t)&code_addr;
-	(*rnk)(image->head, image->start, ia64_boot_param,
-		     GRANULEROUNDDOWN((unsigned long) pal_addr));
-	BUG();
-}
-
-void machine_kexec(struct kimage *image)
-{
-	BUG_ON(!image);
-	unw_init_running(ia64_machine_kexec, image);
-	for(;;);
-}
-
-void arch_crash_save_vmcoreinfo(void)
-{
-#if defined(CONFIG_SPARSEMEM)
-	VMCOREINFO_SYMBOL(pgdat_list);
-	VMCOREINFO_LENGTH(pgdat_list, MAX_NUMNODES);
-#endif
-#ifdef CONFIG_NUMA
-	VMCOREINFO_SYMBOL(node_memblk);
-	VMCOREINFO_LENGTH(node_memblk, NR_NODE_MEMBLKS);
-	VMCOREINFO_STRUCT_SIZE(node_memblk_s);
-	VMCOREINFO_OFFSET(node_memblk_s, start_paddr);
-	VMCOREINFO_OFFSET(node_memblk_s, size);
-#endif
-#if CONFIG_PGTABLE_LEVELS == 3
-	VMCOREINFO_CONFIG(PGTABLE_3);
-#elif CONFIG_PGTABLE_LEVELS == 4
-	VMCOREINFO_CONFIG(PGTABLE_4);
-#endif
-}
-
diff --git a/arch/ia64/kernel/mca.c b/arch/ia64/kernel/mca.c
deleted file mode 100644
index 2671688d349a..000000000000
--- a/arch/ia64/kernel/mca.c
+++ /dev/null
@@ -1,2111 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * File:	mca.c
- * Purpose:	Generic MCA handling layer
- *
- * Copyright (C) 2003 Hewlett-Packard Co
- *	David Mosberger-Tang <davidm@hpl.hp.com>
- *
- * Copyright (C) 2002 Dell Inc.
- * Copyright (C) Matt Domsch <Matt_Domsch@dell.com>
- *
- * Copyright (C) 2002 Intel
- * Copyright (C) Jenna Hall <jenna.s.hall@intel.com>
- *
- * Copyright (C) 2001 Intel
- * Copyright (C) Fred Lewis <frederick.v.lewis@intel.com>
- *
- * Copyright (C) 2000 Intel
- * Copyright (C) Chuck Fleckenstein <cfleck@co.intel.com>
- *
- * Copyright (C) 1999, 2004-2008 Silicon Graphics, Inc.
- * Copyright (C) Vijay Chander <vijay@engr.sgi.com>
- *
- * Copyright (C) 2006 FUJITSU LIMITED
- * Copyright (C) Hidetoshi Seto <seto.hidetoshi@jp.fujitsu.com>
- *
- * 2000-03-29 Chuck Fleckenstein <cfleck@co.intel.com>
- *	      Fixed PAL/SAL update issues, began MCA bug fixes, logging issues,
- *	      added min save state dump, added INIT handler.
- *
- * 2001-01-03 Fred Lewis <frederick.v.lewis@intel.com>
- *	      Added setup of CMCI and CPEI IRQs, logging of corrected platform
- *	      errors, completed code for logging of corrected & uncorrected
- *	      machine check errors, and updated for conformance with Nov. 2000
- *	      revision of the SAL 3.0 spec.
- *
- * 2002-01-04 Jenna Hall <jenna.s.hall@intel.com>
- *	      Aligned MCA stack to 16 bytes, added platform vs. CPU error flag,
- *	      set SAL default return values, changed error record structure to
- *	      linked list, added init call to sal_get_state_info_size().
- *
- * 2002-03-25 Matt Domsch <Matt_Domsch@dell.com>
- *	      GUID cleanups.
- *
- * 2003-04-15 David Mosberger-Tang <davidm@hpl.hp.com>
- *	      Added INIT backtrace support.
- *
- * 2003-12-08 Keith Owens <kaos@sgi.com>
- *	      smp_call_function() must not be called from interrupt context
- *	      (can deadlock on tasklist_lock).
- *	      Use keventd to call smp_call_function().
- *
- * 2004-02-01 Keith Owens <kaos@sgi.com>
- *	      Avoid deadlock when using printk() for MCA and INIT records.
- *	      Delete all record printing code, moved to salinfo_decode in user
- *	      space.  Mark variables and functions static where possible.
- *	      Delete dead variables and functions.  Reorder to remove the need
- *	      for forward declarations and to consolidate related code.
- *
- * 2005-08-12 Keith Owens <kaos@sgi.com>
- *	      Convert MCA/INIT handlers to use per event stacks and SAL/OS
- *	      state.
- *
- * 2005-10-07 Keith Owens <kaos@sgi.com>
- *	      Add notify_die() hooks.
- *
- * 2006-09-15 Hidetoshi Seto <seto.hidetoshi@jp.fujitsu.com>
- *	      Add printing support for MCA/INIT.
- *
- * 2007-04-27 Russ Anderson <rja@sgi.com>
- *	      Support multiple cpus going through OS_MCA in the same event.
- */
-#include <linux/jiffies.h>
-#include <linux/types.h>
-#include <linux/init.h>
-#include <linux/sched/signal.h>
-#include <linux/sched/debug.h>
-#include <linux/sched/task.h>
-#include <linux/interrupt.h>
-#include <linux/irq.h>
-#include <linux/memblock.h>
-#include <linux/acpi.h>
-#include <linux/timer.h>
-#include <linux/module.h>
-#include <linux/kernel.h>
-#include <linux/smp.h>
-#include <linux/workqueue.h>
-#include <linux/cpumask.h>
-#include <linux/kdebug.h>
-#include <linux/cpu.h>
-#include <linux/gfp.h>
-
-#include <asm/delay.h>
-#include <asm/efi.h>
-#include <asm/meminit.h>
-#include <asm/page.h>
-#include <asm/ptrace.h>
-#include <asm/sal.h>
-#include <asm/mca.h>
-#include <asm/mca_asm.h>
-#include <asm/kexec.h>
-
-#include <asm/irq.h>
-#include <asm/hw_irq.h>
-#include <asm/tlb.h>
-
-#include "mca_drv.h"
-#include "entry.h"
-#include "irq.h"
-
-#if defined(IA64_MCA_DEBUG_INFO)
-# define IA64_MCA_DEBUG(fmt...) printk(fmt)
-#else
-# define IA64_MCA_DEBUG(fmt...) do {} while (0)
-#endif
-
-#define NOTIFY_INIT(event, regs, arg, spin)				\
-do {									\
-	if ((notify_die((event), "INIT", (regs), (arg), 0, 0)		\
-			== NOTIFY_STOP) && ((spin) == 1))		\
-		ia64_mca_spin(__func__);				\
-} while (0)
-
-#define NOTIFY_MCA(event, regs, arg, spin)				\
-do {									\
-	if ((notify_die((event), "MCA", (regs), (arg), 0, 0)		\
-			== NOTIFY_STOP) && ((spin) == 1))		\
-		ia64_mca_spin(__func__);				\
-} while (0)
-
-/* Used by mca_asm.S */
-DEFINE_PER_CPU(u64, ia64_mca_data); /* == __per_cpu_mca[smp_processor_id()] */
-DEFINE_PER_CPU(u64, ia64_mca_per_cpu_pte); /* PTE to map per-CPU area */
-DEFINE_PER_CPU(u64, ia64_mca_pal_pte);	    /* PTE to map PAL code */
-DEFINE_PER_CPU(u64, ia64_mca_pal_base);    /* vaddr PAL code granule */
-DEFINE_PER_CPU(u64, ia64_mca_tr_reload);   /* Flag for TR reload */
-
-unsigned long __per_cpu_mca[NR_CPUS];
-
-/* In mca_asm.S */
-extern void			ia64_os_init_dispatch_monarch (void);
-extern void			ia64_os_init_dispatch_slave (void);
-
-static int monarch_cpu = -1;
-
-static ia64_mc_info_t		ia64_mc_info;
-
-#define MAX_CPE_POLL_INTERVAL (15*60*HZ) /* 15 minutes */
-#define MIN_CPE_POLL_INTERVAL (2*60*HZ)  /* 2 minutes */
-#define CMC_POLL_INTERVAL     (1*60*HZ)  /* 1 minute */
-#define CPE_HISTORY_LENGTH    5
-#define CMC_HISTORY_LENGTH    5
-
-static struct timer_list cpe_poll_timer;
-static struct timer_list cmc_poll_timer;
-/*
- * This variable tells whether we are currently in polling mode.
- * Start with this in the wrong state so we won't play w/ timers
- * before the system is ready.
- */
-static int cmc_polling_enabled = 1;
-
-/*
- * Clearing this variable prevents CPE polling from getting activated
- * in mca_late_init.  Use it if your system doesn't provide a CPEI,
- * but encounters problems retrieving CPE logs.  This should only be
- * necessary for debugging.
- */
-static int cpe_poll_enabled = 1;
-
-extern void salinfo_log_wakeup(int type, u8 *buffer, u64 size, int irqsafe);
-
-static int mca_init __initdata;
-
-/*
- * limited & delayed printing support for MCA/INIT handler
- */
-
-#define mprintk(fmt...) ia64_mca_printk(fmt)
-
-#define MLOGBUF_SIZE (512+256*NR_CPUS)
-#define MLOGBUF_MSGMAX 256
-static char mlogbuf[MLOGBUF_SIZE];
-static DEFINE_SPINLOCK(mlogbuf_wlock);	/* mca context only */
-static DEFINE_SPINLOCK(mlogbuf_rlock);	/* normal context only */
-static unsigned long mlogbuf_start;
-static unsigned long mlogbuf_end;
-static unsigned int mlogbuf_finished = 0;
-static unsigned long mlogbuf_timestamp = 0;
-
-static int loglevel_save = -1;
-#define BREAK_LOGLEVEL(__console_loglevel)		\
-	oops_in_progress = 1;				\
-	if (loglevel_save < 0)				\
-		loglevel_save = __console_loglevel;	\
-	__console_loglevel = 15;
-
-#define RESTORE_LOGLEVEL(__console_loglevel)		\
-	if (loglevel_save >= 0) {			\
-		__console_loglevel = loglevel_save;	\
-		loglevel_save = -1;			\
-	}						\
-	mlogbuf_finished = 0;				\
-	oops_in_progress = 0;
-
-/*
- * Push messages into buffer, print them later if not urgent.
- */
-void ia64_mca_printk(const char *fmt, ...)
-{
-	va_list args;
-	int printed_len;
-	char temp_buf[MLOGBUF_MSGMAX];
-	char *p;
-
-	va_start(args, fmt);
-	printed_len = vscnprintf(temp_buf, sizeof(temp_buf), fmt, args);
-	va_end(args);
-
-	/* Copy the output into mlogbuf */
-	if (oops_in_progress) {
-		/* mlogbuf was abandoned, use printk directly instead. */
-		printk("%s", temp_buf);
-	} else {
-		spin_lock(&mlogbuf_wlock);
-		for (p = temp_buf; *p; p++) {
-			unsigned long next = (mlogbuf_end + 1) % MLOGBUF_SIZE;
-			if (next != mlogbuf_start) {
-				mlogbuf[mlogbuf_end] = *p;
-				mlogbuf_end = next;
-			} else {
-				/* buffer full */
-				break;
-			}
-		}
-		mlogbuf[mlogbuf_end] = '\0';
-		spin_unlock(&mlogbuf_wlock);
-	}
-}
-EXPORT_SYMBOL(ia64_mca_printk);
-
-/*
- * Print buffered messages.
- *  NOTE: call this after returning normal context. (ex. from salinfod)
- */
-void ia64_mlogbuf_dump(void)
-{
-	char temp_buf[MLOGBUF_MSGMAX];
-	char *p;
-	unsigned long index;
-	unsigned long flags;
-	unsigned int printed_len;
-
-	/* Get output from mlogbuf */
-	while (mlogbuf_start != mlogbuf_end) {
-		temp_buf[0] = '\0';
-		p = temp_buf;
-		printed_len = 0;
-
-		spin_lock_irqsave(&mlogbuf_rlock, flags);
-
-		index = mlogbuf_start;
-		while (index != mlogbuf_end) {
-			*p = mlogbuf[index];
-			index = (index + 1) % MLOGBUF_SIZE;
-			if (!*p)
-				break;
-			p++;
-			if (++printed_len >= MLOGBUF_MSGMAX - 1)
-				break;
-		}
-		*p = '\0';
-		if (temp_buf[0])
-			printk("%s", temp_buf);
-		mlogbuf_start = index;
-
-		mlogbuf_timestamp = 0;
-		spin_unlock_irqrestore(&mlogbuf_rlock, flags);
-	}
-}
-EXPORT_SYMBOL(ia64_mlogbuf_dump);
-
-/*
- * Call this if system is going to down or if immediate flushing messages to
- * console is required. (ex. recovery was failed, crash dump is going to be
- * invoked, long-wait rendezvous etc.)
- *  NOTE: this should be called from monarch.
- */
-static void ia64_mlogbuf_finish(int wait)
-{
-	BREAK_LOGLEVEL(console_loglevel);
-
-	ia64_mlogbuf_dump();
-	printk(KERN_EMERG "mlogbuf_finish: printing switched to urgent mode, "
-		"MCA/INIT might be dodgy or fail.\n");
-
-	if (!wait)
-		return;
-
-	/* wait for console */
-	printk("Delaying for 5 seconds...\n");
-	udelay(5*1000000);
-
-	mlogbuf_finished = 1;
-}
-
-/*
- * Print buffered messages from INIT context.
- */
-static void ia64_mlogbuf_dump_from_init(void)
-{
-	if (mlogbuf_finished)
-		return;
-
-	if (mlogbuf_timestamp &&
-			time_before(jiffies, mlogbuf_timestamp + 30 * HZ)) {
-		printk(KERN_ERR "INIT: mlogbuf_dump is interrupted by INIT "
-			" and the system seems to be messed up.\n");
-		ia64_mlogbuf_finish(0);
-		return;
-	}
-
-	if (!spin_trylock(&mlogbuf_rlock)) {
-		printk(KERN_ERR "INIT: mlogbuf_dump is interrupted by INIT. "
-			"Generated messages other than stack dump will be "
-			"buffered to mlogbuf and will be printed later.\n");
-		printk(KERN_ERR "INIT: If messages would not printed after "
-			"this INIT, wait 30sec and assert INIT again.\n");
-		if (!mlogbuf_timestamp)
-			mlogbuf_timestamp = jiffies;
-		return;
-	}
-	spin_unlock(&mlogbuf_rlock);
-	ia64_mlogbuf_dump();
-}
-
-static inline void
-ia64_mca_spin(const char *func)
-{
-	if (monarch_cpu == smp_processor_id())
-		ia64_mlogbuf_finish(0);
-	mprintk(KERN_EMERG "%s: spinning here, not returning to SAL\n", func);
-	while (1)
-		cpu_relax();
-}
-/*
- * IA64_MCA log support
- */
-#define IA64_MAX_LOGS		2	/* Double-buffering for nested MCAs */
-#define IA64_MAX_LOG_TYPES      4   /* MCA, INIT, CMC, CPE */
-
-typedef struct ia64_state_log_s
-{
-	spinlock_t	isl_lock;
-	int		isl_index;
-	unsigned long	isl_count;
-	ia64_err_rec_t  *isl_log[IA64_MAX_LOGS]; /* need space to store header + error log */
-} ia64_state_log_t;
-
-static ia64_state_log_t ia64_state_log[IA64_MAX_LOG_TYPES];
-
-#define IA64_LOG_LOCK_INIT(it) spin_lock_init(&ia64_state_log[it].isl_lock)
-#define IA64_LOG_LOCK(it)      spin_lock_irqsave(&ia64_state_log[it].isl_lock, s)
-#define IA64_LOG_UNLOCK(it)    spin_unlock_irqrestore(&ia64_state_log[it].isl_lock,s)
-#define IA64_LOG_NEXT_INDEX(it)    ia64_state_log[it].isl_index
-#define IA64_LOG_CURR_INDEX(it)    1 - ia64_state_log[it].isl_index
-#define IA64_LOG_INDEX_INC(it) \
-    {ia64_state_log[it].isl_index = 1 - ia64_state_log[it].isl_index; \
-    ia64_state_log[it].isl_count++;}
-#define IA64_LOG_INDEX_DEC(it) \
-    ia64_state_log[it].isl_index = 1 - ia64_state_log[it].isl_index
-#define IA64_LOG_NEXT_BUFFER(it)   (void *)((ia64_state_log[it].isl_log[IA64_LOG_NEXT_INDEX(it)]))
-#define IA64_LOG_CURR_BUFFER(it)   (void *)((ia64_state_log[it].isl_log[IA64_LOG_CURR_INDEX(it)]))
-#define IA64_LOG_COUNT(it)         ia64_state_log[it].isl_count
-
-static inline void ia64_log_allocate(int it, u64 size)
-{
-	ia64_state_log[it].isl_log[IA64_LOG_CURR_INDEX(it)] =
-		(ia64_err_rec_t *)memblock_alloc(size, SMP_CACHE_BYTES);
-	if (!ia64_state_log[it].isl_log[IA64_LOG_CURR_INDEX(it)])
-		panic("%s: Failed to allocate %llu bytes\n", __func__, size);
-
-	ia64_state_log[it].isl_log[IA64_LOG_NEXT_INDEX(it)] =
-		(ia64_err_rec_t *)memblock_alloc(size, SMP_CACHE_BYTES);
-	if (!ia64_state_log[it].isl_log[IA64_LOG_NEXT_INDEX(it)])
-		panic("%s: Failed to allocate %llu bytes\n", __func__, size);
-}
-
-/*
- * ia64_log_init
- *	Reset the OS ia64 log buffer
- * Inputs   :   info_type   (SAL_INFO_TYPE_{MCA,INIT,CMC,CPE})
- * Outputs	:	None
- */
-static void __init
-ia64_log_init(int sal_info_type)
-{
-	u64	max_size = 0;
-
-	IA64_LOG_NEXT_INDEX(sal_info_type) = 0;
-	IA64_LOG_LOCK_INIT(sal_info_type);
-
-	// SAL will tell us the maximum size of any error record of this type
-	max_size = ia64_sal_get_state_info_size(sal_info_type);
-	if (!max_size)
-		/* alloc_bootmem() doesn't like zero-sized allocations! */
-		return;
-
-	// set up OS data structures to hold error info
-	ia64_log_allocate(sal_info_type, max_size);
-}
-
-/*
- * ia64_log_get
- *
- *	Get the current MCA log from SAL and copy it into the OS log buffer.
- *
- *  Inputs  :   info_type   (SAL_INFO_TYPE_{MCA,INIT,CMC,CPE})
- *              irq_safe    whether you can use printk at this point
- *  Outputs :   size        (total record length)
- *              *buffer     (ptr to error record)
- *
- */
-static u64
-ia64_log_get(int sal_info_type, u8 **buffer, int irq_safe)
-{
-	sal_log_record_header_t     *log_buffer;
-	u64                         total_len = 0;
-	unsigned long               s;
-
-	IA64_LOG_LOCK(sal_info_type);
-
-	/* Get the process state information */
-	log_buffer = IA64_LOG_NEXT_BUFFER(sal_info_type);
-
-	total_len = ia64_sal_get_state_info(sal_info_type, (u64 *)log_buffer);
-
-	if (total_len) {
-		IA64_LOG_INDEX_INC(sal_info_type);
-		IA64_LOG_UNLOCK(sal_info_type);
-		if (irq_safe) {
-			IA64_MCA_DEBUG("%s: SAL error record type %d retrieved. Record length = %ld\n",
-				       __func__, sal_info_type, total_len);
-		}
-		*buffer = (u8 *) log_buffer;
-		return total_len;
-	} else {
-		IA64_LOG_UNLOCK(sal_info_type);
-		return 0;
-	}
-}
-
-/*
- *  ia64_mca_log_sal_error_record
- *
- *  This function retrieves a specified error record type from SAL
- *  and wakes up any processes waiting for error records.
- *
- *  Inputs  :   sal_info_type   (Type of error record MCA/CMC/CPE)
- *              FIXME: remove MCA and irq_safe.
- */
-static void
-ia64_mca_log_sal_error_record(int sal_info_type)
-{
-	u8 *buffer;
-	sal_log_record_header_t *rh;
-	u64 size;
-	int irq_safe = sal_info_type != SAL_INFO_TYPE_MCA;
-#ifdef IA64_MCA_DEBUG_INFO
-	static const char * const rec_name[] = { "MCA", "INIT", "CMC", "CPE" };
-#endif
-
-	size = ia64_log_get(sal_info_type, &buffer, irq_safe);
-	if (!size)
-		return;
-
-	salinfo_log_wakeup(sal_info_type, buffer, size, irq_safe);
-
-	if (irq_safe)
-		IA64_MCA_DEBUG("CPU %d: SAL log contains %s error record\n",
-			smp_processor_id(),
-			sal_info_type < ARRAY_SIZE(rec_name) ? rec_name[sal_info_type] : "UNKNOWN");
-
-	/* Clear logs from corrected errors in case there's no user-level logger */
-	rh = (sal_log_record_header_t *)buffer;
-	if (rh->severity == sal_log_severity_corrected)
-		ia64_sal_clear_state_info(sal_info_type);
-}
-
-/*
- * search_mca_table
- *  See if the MCA surfaced in an instruction range
- *  that has been tagged as recoverable.
- *
- *  Inputs
- *	first	First address range to check
- *	last	Last address range to check
- *	ip	Instruction pointer, address we are looking for
- *
- * Return value:
- *      1 on Success (in the table)/ 0 on Failure (not in the  table)
- */
-int
-search_mca_table (const struct mca_table_entry *first,
-                const struct mca_table_entry *last,
-                unsigned long ip)
-{
-        const struct mca_table_entry *curr;
-        u64 curr_start, curr_end;
-
-        curr = first;
-        while (curr <= last) {
-                curr_start = (u64) &curr->start_addr + curr->start_addr;
-                curr_end = (u64) &curr->end_addr + curr->end_addr;
-
-                if ((ip >= curr_start) && (ip <= curr_end)) {
-                        return 1;
-                }
-                curr++;
-        }
-        return 0;
-}
-
-/* Given an address, look for it in the mca tables. */
-int mca_recover_range(unsigned long addr)
-{
-	extern struct mca_table_entry __start___mca_table[];
-	extern struct mca_table_entry __stop___mca_table[];
-
-	return search_mca_table(__start___mca_table, __stop___mca_table-1, addr);
-}
-EXPORT_SYMBOL_GPL(mca_recover_range);
-
-int cpe_vector = -1;
-int ia64_cpe_irq = -1;
-
-static irqreturn_t
-ia64_mca_cpe_int_handler (int cpe_irq, void *arg)
-{
-	static unsigned long	cpe_history[CPE_HISTORY_LENGTH];
-	static int		index;
-	static DEFINE_SPINLOCK(cpe_history_lock);
-
-	IA64_MCA_DEBUG("%s: received interrupt vector = %#x on CPU %d\n",
-		       __func__, cpe_irq, smp_processor_id());
-
-	/* SAL spec states this should run w/ interrupts enabled */
-	local_irq_enable();
-
-	spin_lock(&cpe_history_lock);
-	if (!cpe_poll_enabled && cpe_vector >= 0) {
-
-		int i, count = 1; /* we know 1 happened now */
-		unsigned long now = jiffies;
-
-		for (i = 0; i < CPE_HISTORY_LENGTH; i++) {
-			if (now - cpe_history[i] <= HZ)
-				count++;
-		}
-
-		IA64_MCA_DEBUG(KERN_INFO "CPE threshold %d/%d\n", count, CPE_HISTORY_LENGTH);
-		if (count >= CPE_HISTORY_LENGTH) {
-
-			cpe_poll_enabled = 1;
-			spin_unlock(&cpe_history_lock);
-			disable_irq_nosync(local_vector_to_irq(IA64_CPE_VECTOR));
-
-			/*
-			 * Corrected errors will still be corrected, but
-			 * make sure there's a log somewhere that indicates
-			 * something is generating more than we can handle.
-			 */
-			printk(KERN_WARNING "WARNING: Switching to polling CPE handler; error records may be lost\n");
-
-			mod_timer(&cpe_poll_timer, jiffies + MIN_CPE_POLL_INTERVAL);
-
-			/* lock already released, get out now */
-			goto out;
-		} else {
-			cpe_history[index++] = now;
-			if (index == CPE_HISTORY_LENGTH)
-				index = 0;
-		}
-	}
-	spin_unlock(&cpe_history_lock);
-out:
-	/* Get the CPE error record and log it */
-	ia64_mca_log_sal_error_record(SAL_INFO_TYPE_CPE);
-
-	local_irq_disable();
-
-	return IRQ_HANDLED;
-}
-
-/*
- * ia64_mca_register_cpev
- *
- *  Register the corrected platform error vector with SAL.
- *
- *  Inputs
- *      cpev        Corrected Platform Error Vector number
- *
- *  Outputs
- *      None
- */
-void
-ia64_mca_register_cpev (int cpev)
-{
-	/* Register the CPE interrupt vector with SAL */
-	struct ia64_sal_retval isrv;
-
-	isrv = ia64_sal_mc_set_params(SAL_MC_PARAM_CPE_INT, SAL_MC_PARAM_MECHANISM_INT, cpev, 0, 0);
-	if (isrv.status) {
-		printk(KERN_ERR "Failed to register Corrected Platform "
-		       "Error interrupt vector with SAL (status %ld)\n", isrv.status);
-		return;
-	}
-
-	IA64_MCA_DEBUG("%s: corrected platform error "
-		       "vector %#x registered\n", __func__, cpev);
-}
-
-/*
- * ia64_mca_cmc_vector_setup
- *
- *  Setup the corrected machine check vector register in the processor.
- *  (The interrupt is masked on boot. ia64_mca_late_init unmask this.)
- *  This function is invoked on a per-processor basis.
- *
- * Inputs
- *      None
- *
- * Outputs
- *	None
- */
-void
-ia64_mca_cmc_vector_setup (void)
-{
-	cmcv_reg_t	cmcv;
-
-	cmcv.cmcv_regval	= 0;
-	cmcv.cmcv_mask		= 1;        /* Mask/disable interrupt at first */
-	cmcv.cmcv_vector	= IA64_CMC_VECTOR;
-	ia64_setreg(_IA64_REG_CR_CMCV, cmcv.cmcv_regval);
-
-	IA64_MCA_DEBUG("%s: CPU %d corrected machine check vector %#x registered.\n",
-		       __func__, smp_processor_id(), IA64_CMC_VECTOR);
-
-	IA64_MCA_DEBUG("%s: CPU %d CMCV = %#016lx\n",
-		       __func__, smp_processor_id(), ia64_getreg(_IA64_REG_CR_CMCV));
-}
-
-/*
- * ia64_mca_cmc_vector_disable
- *
- *  Mask the corrected machine check vector register in the processor.
- *  This function is invoked on a per-processor basis.
- *
- * Inputs
- *      dummy(unused)
- *
- * Outputs
- *	None
- */
-static void
-ia64_mca_cmc_vector_disable (void *dummy)
-{
-	cmcv_reg_t	cmcv;
-
-	cmcv.cmcv_regval = ia64_getreg(_IA64_REG_CR_CMCV);
-
-	cmcv.cmcv_mask = 1; /* Mask/disable interrupt */
-	ia64_setreg(_IA64_REG_CR_CMCV, cmcv.cmcv_regval);
-
-	IA64_MCA_DEBUG("%s: CPU %d corrected machine check vector %#x disabled.\n",
-		       __func__, smp_processor_id(), cmcv.cmcv_vector);
-}
-
-/*
- * ia64_mca_cmc_vector_enable
- *
- *  Unmask the corrected machine check vector register in the processor.
- *  This function is invoked on a per-processor basis.
- *
- * Inputs
- *      dummy(unused)
- *
- * Outputs
- *	None
- */
-static void
-ia64_mca_cmc_vector_enable (void *dummy)
-{
-	cmcv_reg_t	cmcv;
-
-	cmcv.cmcv_regval = ia64_getreg(_IA64_REG_CR_CMCV);
-
-	cmcv.cmcv_mask = 0; /* Unmask/enable interrupt */
-	ia64_setreg(_IA64_REG_CR_CMCV, cmcv.cmcv_regval);
-
-	IA64_MCA_DEBUG("%s: CPU %d corrected machine check vector %#x enabled.\n",
-		       __func__, smp_processor_id(), cmcv.cmcv_vector);
-}
-
-/*
- * ia64_mca_cmc_vector_disable_keventd
- *
- * Called via keventd (smp_call_function() is not safe in interrupt context) to
- * disable the cmc interrupt vector.
- */
-static void
-ia64_mca_cmc_vector_disable_keventd(struct work_struct *unused)
-{
-	on_each_cpu(ia64_mca_cmc_vector_disable, NULL, 0);
-}
-
-/*
- * ia64_mca_cmc_vector_enable_keventd
- *
- * Called via keventd (smp_call_function() is not safe in interrupt context) to
- * enable the cmc interrupt vector.
- */
-static void
-ia64_mca_cmc_vector_enable_keventd(struct work_struct *unused)
-{
-	on_each_cpu(ia64_mca_cmc_vector_enable, NULL, 0);
-}
-
-/*
- * ia64_mca_wakeup
- *
- *	Send an inter-cpu interrupt to wake-up a particular cpu.
- *
- *  Inputs  :   cpuid
- *  Outputs :   None
- */
-static void
-ia64_mca_wakeup(int cpu)
-{
-	ia64_send_ipi(cpu, IA64_MCA_WAKEUP_VECTOR, IA64_IPI_DM_INT, 0);
-}
-
-/*
- * ia64_mca_wakeup_all
- *
- *	Wakeup all the slave cpus which have rendez'ed previously.
- *
- *  Inputs  :   None
- *  Outputs :   None
- */
-static void
-ia64_mca_wakeup_all(void)
-{
-	int cpu;
-
-	/* Clear the Rendez checkin flag for all cpus */
-	for_each_online_cpu(cpu) {
-		if (ia64_mc_info.imi_rendez_checkin[cpu] == IA64_MCA_RENDEZ_CHECKIN_DONE)
-			ia64_mca_wakeup(cpu);
-	}
-
-}
-
-/*
- * ia64_mca_rendez_interrupt_handler
- *
- *	This is handler used to put slave processors into spinloop
- *	while the monarch processor does the mca handling and later
- *	wake each slave up once the monarch is done.  The state
- *	IA64_MCA_RENDEZ_CHECKIN_DONE indicates the cpu is rendez'ed
- *	in SAL.  The state IA64_MCA_RENDEZ_CHECKIN_NOTDONE indicates
- *	the cpu has come out of OS rendezvous.
- *
- *  Inputs  :   None
- *  Outputs :   None
- */
-static irqreturn_t
-ia64_mca_rendez_int_handler(int rendez_irq, void *arg)
-{
-	unsigned long flags;
-	int cpu = smp_processor_id();
-	struct ia64_mca_notify_die nd =
-		{ .sos = NULL, .monarch_cpu = &monarch_cpu };
-
-	/* Mask all interrupts */
-	local_irq_save(flags);
-
-	NOTIFY_MCA(DIE_MCA_RENDZVOUS_ENTER, get_irq_regs(), (long)&nd, 1);
-
-	ia64_mc_info.imi_rendez_checkin[cpu] = IA64_MCA_RENDEZ_CHECKIN_DONE;
-	/* Register with the SAL monarch that the slave has
-	 * reached SAL
-	 */
-	ia64_sal_mc_rendez();
-
-	NOTIFY_MCA(DIE_MCA_RENDZVOUS_PROCESS, get_irq_regs(), (long)&nd, 1);
-
-	/* Wait for the monarch cpu to exit. */
-	while (monarch_cpu != -1)
-	       cpu_relax();	/* spin until monarch leaves */
-
-	NOTIFY_MCA(DIE_MCA_RENDZVOUS_LEAVE, get_irq_regs(), (long)&nd, 1);
-
-	ia64_mc_info.imi_rendez_checkin[cpu] = IA64_MCA_RENDEZ_CHECKIN_NOTDONE;
-	/* Enable all interrupts */
-	local_irq_restore(flags);
-	return IRQ_HANDLED;
-}
-
-/*
- * ia64_mca_wakeup_int_handler
- *
- *	The interrupt handler for processing the inter-cpu interrupt to the
- *	slave cpu which was spinning in the rendez loop.
- *	Since this spinning is done by turning off the interrupts and
- *	polling on the wakeup-interrupt bit in the IRR, there is
- *	nothing useful to be done in the handler.
- *
- *  Inputs  :   wakeup_irq  (Wakeup-interrupt bit)
- *	arg		(Interrupt handler specific argument)
- *  Outputs :   None
- *
- */
-static irqreturn_t
-ia64_mca_wakeup_int_handler(int wakeup_irq, void *arg)
-{
-	return IRQ_HANDLED;
-}
-
-/* Function pointer for extra MCA recovery */
-int (*ia64_mca_ucmc_extension)
-	(void*,struct ia64_sal_os_state*)
-	= NULL;
-
-int
-ia64_reg_MCA_extension(int (*fn)(void *, struct ia64_sal_os_state *))
-{
-	if (ia64_mca_ucmc_extension)
-		return 1;
-
-	ia64_mca_ucmc_extension = fn;
-	return 0;
-}
-
-void
-ia64_unreg_MCA_extension(void)
-{
-	if (ia64_mca_ucmc_extension)
-		ia64_mca_ucmc_extension = NULL;
-}
-
-EXPORT_SYMBOL(ia64_reg_MCA_extension);
-EXPORT_SYMBOL(ia64_unreg_MCA_extension);
-
-
-static inline void
-copy_reg(const u64 *fr, u64 fnat, unsigned long *tr, unsigned long *tnat)
-{
-	u64 fslot, tslot, nat;
-	*tr = *fr;
-	fslot = ((unsigned long)fr >> 3) & 63;
-	tslot = ((unsigned long)tr >> 3) & 63;
-	*tnat &= ~(1UL << tslot);
-	nat = (fnat >> fslot) & 1;
-	*tnat |= (nat << tslot);
-}
-
-/* Change the comm field on the MCA/INT task to include the pid that
- * was interrupted, it makes for easier debugging.  If that pid was 0
- * (swapper or nested MCA/INIT) then use the start of the previous comm
- * field suffixed with its cpu.
- */
-
-static void
-ia64_mca_modify_comm(const struct task_struct *previous_current)
-{
-	char *p, comm[sizeof(current->comm)];
-	if (previous_current->pid)
-		snprintf(comm, sizeof(comm), "%s %d",
-			current->comm, previous_current->pid);
-	else {
-		int l;
-		if ((p = strchr(previous_current->comm, ' ')))
-			l = p - previous_current->comm;
-		else
-			l = strlen(previous_current->comm);
-		snprintf(comm, sizeof(comm), "%s %*s %d",
-			current->comm, l, previous_current->comm,
-			task_thread_info(previous_current)->cpu);
-	}
-	memcpy(current->comm, comm, sizeof(current->comm));
-}
-
-static void
-finish_pt_regs(struct pt_regs *regs, struct ia64_sal_os_state *sos,
-		unsigned long *nat)
-{
-	const struct pal_min_state_area *ms = sos->pal_min_state;
-	const u64 *bank;
-
-	/* If ipsr.ic then use pmsa_{iip,ipsr,ifs}, else use
-	 * pmsa_{xip,xpsr,xfs}
-	 */
-	if (ia64_psr(regs)->ic) {
-		regs->cr_iip = ms->pmsa_iip;
-		regs->cr_ipsr = ms->pmsa_ipsr;
-		regs->cr_ifs = ms->pmsa_ifs;
-	} else {
-		regs->cr_iip = ms->pmsa_xip;
-		regs->cr_ipsr = ms->pmsa_xpsr;
-		regs->cr_ifs = ms->pmsa_xfs;
-
-		sos->iip = ms->pmsa_iip;
-		sos->ipsr = ms->pmsa_ipsr;
-		sos->ifs = ms->pmsa_ifs;
-	}
-	regs->pr = ms->pmsa_pr;
-	regs->b0 = ms->pmsa_br0;
-	regs->ar_rsc = ms->pmsa_rsc;
-	copy_reg(&ms->pmsa_gr[1-1], ms->pmsa_nat_bits, &regs->r1, nat);
-	copy_reg(&ms->pmsa_gr[2-1], ms->pmsa_nat_bits, &regs->r2, nat);
-	copy_reg(&ms->pmsa_gr[3-1], ms->pmsa_nat_bits, &regs->r3, nat);
-	copy_reg(&ms->pmsa_gr[8-1], ms->pmsa_nat_bits, &regs->r8, nat);
-	copy_reg(&ms->pmsa_gr[9-1], ms->pmsa_nat_bits, &regs->r9, nat);
-	copy_reg(&ms->pmsa_gr[10-1], ms->pmsa_nat_bits, &regs->r10, nat);
-	copy_reg(&ms->pmsa_gr[11-1], ms->pmsa_nat_bits, &regs->r11, nat);
-	copy_reg(&ms->pmsa_gr[12-1], ms->pmsa_nat_bits, &regs->r12, nat);
-	copy_reg(&ms->pmsa_gr[13-1], ms->pmsa_nat_bits, &regs->r13, nat);
-	copy_reg(&ms->pmsa_gr[14-1], ms->pmsa_nat_bits, &regs->r14, nat);
-	copy_reg(&ms->pmsa_gr[15-1], ms->pmsa_nat_bits, &regs->r15, nat);
-	if (ia64_psr(regs)->bn)
-		bank = ms->pmsa_bank1_gr;
-	else
-		bank = ms->pmsa_bank0_gr;
-	copy_reg(&bank[16-16], ms->pmsa_nat_bits, &regs->r16, nat);
-	copy_reg(&bank[17-16], ms->pmsa_nat_bits, &regs->r17, nat);
-	copy_reg(&bank[18-16], ms->pmsa_nat_bits, &regs->r18, nat);
-	copy_reg(&bank[19-16], ms->pmsa_nat_bits, &regs->r19, nat);
-	copy_reg(&bank[20-16], ms->pmsa_nat_bits, &regs->r20, nat);
-	copy_reg(&bank[21-16], ms->pmsa_nat_bits, &regs->r21, nat);
-	copy_reg(&bank[22-16], ms->pmsa_nat_bits, &regs->r22, nat);
-	copy_reg(&bank[23-16], ms->pmsa_nat_bits, &regs->r23, nat);
-	copy_reg(&bank[24-16], ms->pmsa_nat_bits, &regs->r24, nat);
-	copy_reg(&bank[25-16], ms->pmsa_nat_bits, &regs->r25, nat);
-	copy_reg(&bank[26-16], ms->pmsa_nat_bits, &regs->r26, nat);
-	copy_reg(&bank[27-16], ms->pmsa_nat_bits, &regs->r27, nat);
-	copy_reg(&bank[28-16], ms->pmsa_nat_bits, &regs->r28, nat);
-	copy_reg(&bank[29-16], ms->pmsa_nat_bits, &regs->r29, nat);
-	copy_reg(&bank[30-16], ms->pmsa_nat_bits, &regs->r30, nat);
-	copy_reg(&bank[31-16], ms->pmsa_nat_bits, &regs->r31, nat);
-}
-
-/* On entry to this routine, we are running on the per cpu stack, see
- * mca_asm.h.  The original stack has not been touched by this event.  Some of
- * the original stack's registers will be in the RBS on this stack.  This stack
- * also contains a partial pt_regs and switch_stack, the rest of the data is in
- * PAL minstate.
- *
- * The first thing to do is modify the original stack to look like a blocked
- * task so we can run backtrace on the original task.  Also mark the per cpu
- * stack as current to ensure that we use the correct task state, it also means
- * that we can do backtrace on the MCA/INIT handler code itself.
- */
-
-static struct task_struct *
-ia64_mca_modify_original_stack(struct pt_regs *regs,
-		const struct switch_stack *sw,
-		struct ia64_sal_os_state *sos,
-		const char *type)
-{
-	char *p;
-	ia64_va va;
-	extern char ia64_leave_kernel[];	/* Need asm address, not function descriptor */
-	const struct pal_min_state_area *ms = sos->pal_min_state;
-	struct task_struct *previous_current;
-	struct pt_regs *old_regs;
-	struct switch_stack *old_sw;
-	unsigned size = sizeof(struct pt_regs) +
-			sizeof(struct switch_stack) + 16;
-	unsigned long *old_bspstore, *old_bsp;
-	unsigned long *new_bspstore, *new_bsp;
-	unsigned long old_unat, old_rnat, new_rnat, nat;
-	u64 slots, loadrs = regs->loadrs;
-	u64 r12 = ms->pmsa_gr[12-1], r13 = ms->pmsa_gr[13-1];
-	u64 ar_bspstore = regs->ar_bspstore;
-	u64 ar_bsp = regs->ar_bspstore + (loadrs >> 16);
-	const char *msg;
-	int cpu = smp_processor_id();
-
-	previous_current = curr_task(cpu);
-	ia64_set_curr_task(cpu, current);
-	if ((p = strchr(current->comm, ' ')))
-		*p = '\0';
-
-	/* Best effort attempt to cope with MCA/INIT delivered while in
-	 * physical mode.
-	 */
-	regs->cr_ipsr = ms->pmsa_ipsr;
-	if (ia64_psr(regs)->dt == 0) {
-		va.l = r12;
-		if (va.f.reg == 0) {
-			va.f.reg = 7;
-			r12 = va.l;
-		}
-		va.l = r13;
-		if (va.f.reg == 0) {
-			va.f.reg = 7;
-			r13 = va.l;
-		}
-	}
-	if (ia64_psr(regs)->rt == 0) {
-		va.l = ar_bspstore;
-		if (va.f.reg == 0) {
-			va.f.reg = 7;
-			ar_bspstore = va.l;
-		}
-		va.l = ar_bsp;
-		if (va.f.reg == 0) {
-			va.f.reg = 7;
-			ar_bsp = va.l;
-		}
-	}
-
-	/* mca_asm.S ia64_old_stack() cannot assume that the dirty registers
-	 * have been copied to the old stack, the old stack may fail the
-	 * validation tests below.  So ia64_old_stack() must restore the dirty
-	 * registers from the new stack.  The old and new bspstore probably
-	 * have different alignments, so loadrs calculated on the old bsp
-	 * cannot be used to restore from the new bsp.  Calculate a suitable
-	 * loadrs for the new stack and save it in the new pt_regs, where
-	 * ia64_old_stack() can get it.
-	 */
-	old_bspstore = (unsigned long *)ar_bspstore;
-	old_bsp = (unsigned long *)ar_bsp;
-	slots = ia64_rse_num_regs(old_bspstore, old_bsp);
-	new_bspstore = (unsigned long *)((u64)current + IA64_RBS_OFFSET);
-	new_bsp = ia64_rse_skip_regs(new_bspstore, slots);
-	regs->loadrs = (new_bsp - new_bspstore) * 8 << 16;
-
-	/* Verify the previous stack state before we change it */
-	if (user_mode(regs)) {
-		msg = "occurred in user space";
-		/* previous_current is guaranteed to be valid when the task was
-		 * in user space, so ...
-		 */
-		ia64_mca_modify_comm(previous_current);
-		goto no_mod;
-	}
-
-	if (r13 != sos->prev_IA64_KR_CURRENT) {
-		msg = "inconsistent previous current and r13";
-		goto no_mod;
-	}
-
-	if (!mca_recover_range(ms->pmsa_iip)) {
-		if ((r12 - r13) >= KERNEL_STACK_SIZE) {
-			msg = "inconsistent r12 and r13";
-			goto no_mod;
-		}
-		if ((ar_bspstore - r13) >= KERNEL_STACK_SIZE) {
-			msg = "inconsistent ar.bspstore and r13";
-			goto no_mod;
-		}
-		va.p = old_bspstore;
-		if (va.f.reg < 5) {
-			msg = "old_bspstore is in the wrong region";
-			goto no_mod;
-		}
-		if ((ar_bsp - r13) >= KERNEL_STACK_SIZE) {
-			msg = "inconsistent ar.bsp and r13";
-			goto no_mod;
-		}
-		size += (ia64_rse_skip_regs(old_bspstore, slots) - old_bspstore) * 8;
-		if (ar_bspstore + size > r12) {
-			msg = "no room for blocked state";
-			goto no_mod;
-		}
-	}
-
-	ia64_mca_modify_comm(previous_current);
-
-	/* Make the original task look blocked.  First stack a struct pt_regs,
-	 * describing the state at the time of interrupt.  mca_asm.S built a
-	 * partial pt_regs, copy it and fill in the blanks using minstate.
-	 */
-	p = (char *)r12 - sizeof(*regs);
-	old_regs = (struct pt_regs *)p;
-	memcpy(old_regs, regs, sizeof(*regs));
-	old_regs->loadrs = loadrs;
-	old_unat = old_regs->ar_unat;
-	finish_pt_regs(old_regs, sos, &old_unat);
-
-	/* Next stack a struct switch_stack.  mca_asm.S built a partial
-	 * switch_stack, copy it and fill in the blanks using pt_regs and
-	 * minstate.
-	 *
-	 * In the synthesized switch_stack, b0 points to ia64_leave_kernel,
-	 * ar.pfs is set to 0.
-	 *
-	 * unwind.c::unw_unwind() does special processing for interrupt frames.
-	 * It checks if the PRED_NON_SYSCALL predicate is set, if the predicate
-	 * is clear then unw_unwind() does _not_ adjust bsp over pt_regs.  Not
-	 * that this is documented, of course.  Set PRED_NON_SYSCALL in the
-	 * switch_stack on the original stack so it will unwind correctly when
-	 * unwind.c reads pt_regs.
-	 *
-	 * thread.ksp is updated to point to the synthesized switch_stack.
-	 */
-	p -= sizeof(struct switch_stack);
-	old_sw = (struct switch_stack *)p;
-	memcpy(old_sw, sw, sizeof(*sw));
-	old_sw->caller_unat = old_unat;
-	old_sw->ar_fpsr = old_regs->ar_fpsr;
-	copy_reg(&ms->pmsa_gr[4-1], ms->pmsa_nat_bits, &old_sw->r4, &old_unat);
-	copy_reg(&ms->pmsa_gr[5-1], ms->pmsa_nat_bits, &old_sw->r5, &old_unat);
-	copy_reg(&ms->pmsa_gr[6-1], ms->pmsa_nat_bits, &old_sw->r6, &old_unat);
-	copy_reg(&ms->pmsa_gr[7-1], ms->pmsa_nat_bits, &old_sw->r7, &old_unat);
-	old_sw->b0 = (u64)ia64_leave_kernel;
-	old_sw->b1 = ms->pmsa_br1;
-	old_sw->ar_pfs = 0;
-	old_sw->ar_unat = old_unat;
-	old_sw->pr = old_regs->pr | (1UL << PRED_NON_SYSCALL);
-	previous_current->thread.ksp = (u64)p - 16;
-
-	/* Finally copy the original stack's registers back to its RBS.
-	 * Registers from ar.bspstore through ar.bsp at the time of the event
-	 * are in the current RBS, copy them back to the original stack.  The
-	 * copy must be done register by register because the original bspstore
-	 * and the current one have different alignments, so the saved RNAT
-	 * data occurs at different places.
-	 *
-	 * mca_asm does cover, so the old_bsp already includes all registers at
-	 * the time of MCA/INIT.  It also does flushrs, so all registers before
-	 * this function have been written to backing store on the MCA/INIT
-	 * stack.
-	 */
-	new_rnat = ia64_get_rnat(ia64_rse_rnat_addr(new_bspstore));
-	old_rnat = regs->ar_rnat;
-	while (slots--) {
-		if (ia64_rse_is_rnat_slot(new_bspstore)) {
-			new_rnat = ia64_get_rnat(new_bspstore++);
-		}
-		if (ia64_rse_is_rnat_slot(old_bspstore)) {
-			*old_bspstore++ = old_rnat;
-			old_rnat = 0;
-		}
-		nat = (new_rnat >> ia64_rse_slot_num(new_bspstore)) & 1UL;
-		old_rnat &= ~(1UL << ia64_rse_slot_num(old_bspstore));
-		old_rnat |= (nat << ia64_rse_slot_num(old_bspstore));
-		*old_bspstore++ = *new_bspstore++;
-	}
-	old_sw->ar_bspstore = (unsigned long)old_bspstore;
-	old_sw->ar_rnat = old_rnat;
-
-	sos->prev_task = previous_current;
-	return previous_current;
-
-no_mod:
-	mprintk(KERN_INFO "cpu %d, %s %s, original stack not modified\n",
-			smp_processor_id(), type, msg);
-	old_unat = regs->ar_unat;
-	finish_pt_regs(regs, sos, &old_unat);
-	return previous_current;
-}
-
-/* The monarch/slave interaction is based on monarch_cpu and requires that all
- * slaves have entered rendezvous before the monarch leaves.  If any cpu has
- * not entered rendezvous yet then wait a bit.  The assumption is that any
- * slave that has not rendezvoused after a reasonable time is never going to do
- * so.  In this context, slave includes cpus that respond to the MCA rendezvous
- * interrupt, as well as cpus that receive the INIT slave event.
- */
-
-static void
-ia64_wait_for_slaves(int monarch, const char *type)
-{
-	int c, i , wait;
-
-	/*
-	 * wait 5 seconds total for slaves (arbitrary)
-	 */
-	for (i = 0; i < 5000; i++) {
-		wait = 0;
-		for_each_online_cpu(c) {
-			if (c == monarch)
-				continue;
-			if (ia64_mc_info.imi_rendez_checkin[c]
-					== IA64_MCA_RENDEZ_CHECKIN_NOTDONE) {
-				udelay(1000);		/* short wait */
-				wait = 1;
-				break;
-			}
-		}
-		if (!wait)
-			goto all_in;
-	}
-
-	/*
-	 * Maybe slave(s) dead. Print buffered messages immediately.
-	 */
-	ia64_mlogbuf_finish(0);
-	mprintk(KERN_INFO "OS %s slave did not rendezvous on cpu", type);
-	for_each_online_cpu(c) {
-		if (c == monarch)
-			continue;
-		if (ia64_mc_info.imi_rendez_checkin[c] == IA64_MCA_RENDEZ_CHECKIN_NOTDONE)
-			mprintk(" %d", c);
-	}
-	mprintk("\n");
-	return;
-
-all_in:
-	mprintk(KERN_INFO "All OS %s slaves have reached rendezvous\n", type);
-	return;
-}
-
-/*  mca_insert_tr
- *
- *  Switch rid when TR reload and needed!
- *  iord: 1: itr, 2: itr;
- *
-*/
-static void mca_insert_tr(u64 iord)
-{
-
-	int i;
-	u64 old_rr;
-	struct ia64_tr_entry *p;
-	unsigned long psr;
-	int cpu = smp_processor_id();
-
-	if (!ia64_idtrs[cpu])
-		return;
-
-	psr = ia64_clear_ic();
-	for (i = IA64_TR_ALLOC_BASE; i < IA64_TR_ALLOC_MAX; i++) {
-		p = ia64_idtrs[cpu] + (iord - 1) * IA64_TR_ALLOC_MAX;
-		if (p->pte & 0x1) {
-			old_rr = ia64_get_rr(p->ifa);
-			if (old_rr != p->rr) {
-				ia64_set_rr(p->ifa, p->rr);
-				ia64_srlz_d();
-			}
-			ia64_ptr(iord, p->ifa, p->itir >> 2);
-			ia64_srlz_i();
-			if (iord & 0x1) {
-				ia64_itr(0x1, i, p->ifa, p->pte, p->itir >> 2);
-				ia64_srlz_i();
-			}
-			if (iord & 0x2) {
-				ia64_itr(0x2, i, p->ifa, p->pte, p->itir >> 2);
-				ia64_srlz_i();
-			}
-			if (old_rr != p->rr) {
-				ia64_set_rr(p->ifa, old_rr);
-				ia64_srlz_d();
-			}
-		}
-	}
-	ia64_set_psr(psr);
-}
-
-/*
- * ia64_mca_handler
- *
- *	This is uncorrectable machine check handler called from OS_MCA
- *	dispatch code which is in turn called from SAL_CHECK().
- *	This is the place where the core of OS MCA handling is done.
- *	Right now the logs are extracted and displayed in a well-defined
- *	format. This handler code is supposed to be run only on the
- *	monarch processor. Once the monarch is done with MCA handling
- *	further MCA logging is enabled by clearing logs.
- *	Monarch also has the duty of sending wakeup-IPIs to pull the
- *	slave processors out of rendezvous spinloop.
- *
- *	If multiple processors call into OS_MCA, the first will become
- *	the monarch.  Subsequent cpus will be recorded in the mca_cpu
- *	bitmask.  After the first monarch has processed its MCA, it
- *	will wake up the next cpu in the mca_cpu bitmask and then go
- *	into the rendezvous loop.  When all processors have serviced
- *	their MCA, the last monarch frees up the rest of the processors.
- */
-void
-ia64_mca_handler(struct pt_regs *regs, struct switch_stack *sw,
-		 struct ia64_sal_os_state *sos)
-{
-	int recover, cpu = smp_processor_id();
-	struct task_struct *previous_current;
-	struct ia64_mca_notify_die nd =
-		{ .sos = sos, .monarch_cpu = &monarch_cpu, .data = &recover };
-	static atomic_t mca_count;
-	static cpumask_t mca_cpu;
-
-	if (atomic_add_return(1, &mca_count) == 1) {
-		monarch_cpu = cpu;
-		sos->monarch = 1;
-	} else {
-		cpumask_set_cpu(cpu, &mca_cpu);
-		sos->monarch = 0;
-	}
-	mprintk(KERN_INFO "Entered OS MCA handler. PSP=%lx cpu=%d "
-		"monarch=%ld\n", sos->proc_state_param, cpu, sos->monarch);
-
-	previous_current = ia64_mca_modify_original_stack(regs, sw, sos, "MCA");
-
-	NOTIFY_MCA(DIE_MCA_MONARCH_ENTER, regs, (long)&nd, 1);
-
-	ia64_mc_info.imi_rendez_checkin[cpu] = IA64_MCA_RENDEZ_CHECKIN_CONCURRENT_MCA;
-	if (sos->monarch) {
-		ia64_wait_for_slaves(cpu, "MCA");
-
-		/* Wakeup all the processors which are spinning in the
-		 * rendezvous loop.  They will leave SAL, then spin in the OS
-		 * with interrupts disabled until this monarch cpu leaves the
-		 * MCA handler.  That gets control back to the OS so we can
-		 * backtrace the other cpus, backtrace when spinning in SAL
-		 * does not work.
-		 */
-		ia64_mca_wakeup_all();
-	} else {
-		while (cpumask_test_cpu(cpu, &mca_cpu))
-			cpu_relax();	/* spin until monarch wakes us */
-	}
-
-	NOTIFY_MCA(DIE_MCA_MONARCH_PROCESS, regs, (long)&nd, 1);
-
-	/* Get the MCA error record and log it */
-	ia64_mca_log_sal_error_record(SAL_INFO_TYPE_MCA);
-
-	/* MCA error recovery */
-	recover = (ia64_mca_ucmc_extension
-		&& ia64_mca_ucmc_extension(
-			IA64_LOG_CURR_BUFFER(SAL_INFO_TYPE_MCA),
-			sos));
-
-	if (recover) {
-		sal_log_record_header_t *rh = IA64_LOG_CURR_BUFFER(SAL_INFO_TYPE_MCA);
-		rh->severity = sal_log_severity_corrected;
-		ia64_sal_clear_state_info(SAL_INFO_TYPE_MCA);
-		sos->os_status = IA64_MCA_CORRECTED;
-	} else {
-		/* Dump buffered message to console */
-		ia64_mlogbuf_finish(1);
-	}
-
-	if (__this_cpu_read(ia64_mca_tr_reload)) {
-		mca_insert_tr(0x1); /*Reload dynamic itrs*/
-		mca_insert_tr(0x2); /*Reload dynamic itrs*/
-	}
-
-	NOTIFY_MCA(DIE_MCA_MONARCH_LEAVE, regs, (long)&nd, 1);
-
-	if (atomic_dec_return(&mca_count) > 0) {
-		int i;
-
-		/* wake up the next monarch cpu,
-		 * and put this cpu in the rendez loop.
-		 */
-		for_each_online_cpu(i) {
-			if (cpumask_test_cpu(i, &mca_cpu)) {
-				monarch_cpu = i;
-				cpumask_clear_cpu(i, &mca_cpu);	/* wake next cpu */
-				while (monarch_cpu != -1)
-					cpu_relax();	/* spin until last cpu leaves */
-				ia64_set_curr_task(cpu, previous_current);
-				ia64_mc_info.imi_rendez_checkin[cpu]
-						= IA64_MCA_RENDEZ_CHECKIN_NOTDONE;
-				return;
-			}
-		}
-	}
-	ia64_set_curr_task(cpu, previous_current);
-	ia64_mc_info.imi_rendez_checkin[cpu] = IA64_MCA_RENDEZ_CHECKIN_NOTDONE;
-	monarch_cpu = -1;	/* This frees the slaves and previous monarchs */
-}
-
-static DECLARE_WORK(cmc_disable_work, ia64_mca_cmc_vector_disable_keventd);
-static DECLARE_WORK(cmc_enable_work, ia64_mca_cmc_vector_enable_keventd);
-
-/*
- * ia64_mca_cmc_int_handler
- *
- *  This is corrected machine check interrupt handler.
- *	Right now the logs are extracted and displayed in a well-defined
- *	format.
- *
- * Inputs
- *      interrupt number
- *      client data arg ptr
- *
- * Outputs
- *	None
- */
-static irqreturn_t
-ia64_mca_cmc_int_handler(int cmc_irq, void *arg)
-{
-	static unsigned long	cmc_history[CMC_HISTORY_LENGTH];
-	static int		index;
-	static DEFINE_SPINLOCK(cmc_history_lock);
-
-	IA64_MCA_DEBUG("%s: received interrupt vector = %#x on CPU %d\n",
-		       __func__, cmc_irq, smp_processor_id());
-
-	/* SAL spec states this should run w/ interrupts enabled */
-	local_irq_enable();
-
-	spin_lock(&cmc_history_lock);
-	if (!cmc_polling_enabled) {
-		int i, count = 1; /* we know 1 happened now */
-		unsigned long now = jiffies;
-
-		for (i = 0; i < CMC_HISTORY_LENGTH; i++) {
-			if (now - cmc_history[i] <= HZ)
-				count++;
-		}
-
-		IA64_MCA_DEBUG(KERN_INFO "CMC threshold %d/%d\n", count, CMC_HISTORY_LENGTH);
-		if (count >= CMC_HISTORY_LENGTH) {
-
-			cmc_polling_enabled = 1;
-			spin_unlock(&cmc_history_lock);
-			/* If we're being hit with CMC interrupts, we won't
-			 * ever execute the schedule_work() below.  Need to
-			 * disable CMC interrupts on this processor now.
-			 */
-			ia64_mca_cmc_vector_disable(NULL);
-			schedule_work(&cmc_disable_work);
-
-			/*
-			 * Corrected errors will still be corrected, but
-			 * make sure there's a log somewhere that indicates
-			 * something is generating more than we can handle.
-			 */
-			printk(KERN_WARNING "WARNING: Switching to polling CMC handler; error records may be lost\n");
-
-			mod_timer(&cmc_poll_timer, jiffies + CMC_POLL_INTERVAL);
-
-			/* lock already released, get out now */
-			goto out;
-		} else {
-			cmc_history[index++] = now;
-			if (index == CMC_HISTORY_LENGTH)
-				index = 0;
-		}
-	}
-	spin_unlock(&cmc_history_lock);
-out:
-	/* Get the CMC error record and log it */
-	ia64_mca_log_sal_error_record(SAL_INFO_TYPE_CMC);
-
-	local_irq_disable();
-
-	return IRQ_HANDLED;
-}
-
-/*
- *  ia64_mca_cmc_int_caller
- *
- * 	Triggered by sw interrupt from CMC polling routine.  Calls
- * 	real interrupt handler and either triggers a sw interrupt
- * 	on the next cpu or does cleanup at the end.
- *
- * Inputs
- *	interrupt number
- *	client data arg ptr
- * Outputs
- * 	handled
- */
-static irqreturn_t
-ia64_mca_cmc_int_caller(int cmc_irq, void *arg)
-{
-	static int start_count = -1;
-	unsigned int cpuid;
-
-	cpuid = smp_processor_id();
-
-	/* If first cpu, update count */
-	if (start_count == -1)
-		start_count = IA64_LOG_COUNT(SAL_INFO_TYPE_CMC);
-
-	ia64_mca_cmc_int_handler(cmc_irq, arg);
-
-	cpuid = cpumask_next(cpuid+1, cpu_online_mask);
-
-	if (cpuid < nr_cpu_ids) {
-		ia64_send_ipi(cpuid, IA64_CMCP_VECTOR, IA64_IPI_DM_INT, 0);
-	} else {
-		/* If no log record, switch out of polling mode */
-		if (start_count == IA64_LOG_COUNT(SAL_INFO_TYPE_CMC)) {
-
-			printk(KERN_WARNING "Returning to interrupt driven CMC handler\n");
-			schedule_work(&cmc_enable_work);
-			cmc_polling_enabled = 0;
-
-		} else {
-
-			mod_timer(&cmc_poll_timer, jiffies + CMC_POLL_INTERVAL);
-		}
-
-		start_count = -1;
-	}
-
-	return IRQ_HANDLED;
-}
-
-/*
- *  ia64_mca_cmc_poll
- *
- *	Poll for Corrected Machine Checks (CMCs)
- *
- * Inputs   :   dummy(unused)
- * Outputs  :   None
- *
- */
-static void
-ia64_mca_cmc_poll (struct timer_list *unused)
-{
-	/* Trigger a CMC interrupt cascade  */
-	ia64_send_ipi(cpumask_first(cpu_online_mask), IA64_CMCP_VECTOR,
-							IA64_IPI_DM_INT, 0);
-}
-
-/*
- *  ia64_mca_cpe_int_caller
- *
- * 	Triggered by sw interrupt from CPE polling routine.  Calls
- * 	real interrupt handler and either triggers a sw interrupt
- * 	on the next cpu or does cleanup at the end.
- *
- * Inputs
- *	interrupt number
- *	client data arg ptr
- * Outputs
- * 	handled
- */
-static irqreturn_t
-ia64_mca_cpe_int_caller(int cpe_irq, void *arg)
-{
-	static int start_count = -1;
-	static int poll_time = MIN_CPE_POLL_INTERVAL;
-	unsigned int cpuid;
-
-	cpuid = smp_processor_id();
-
-	/* If first cpu, update count */
-	if (start_count == -1)
-		start_count = IA64_LOG_COUNT(SAL_INFO_TYPE_CPE);
-
-	ia64_mca_cpe_int_handler(cpe_irq, arg);
-
-	cpuid = cpumask_next(cpuid+1, cpu_online_mask);
-
-	if (cpuid < NR_CPUS) {
-		ia64_send_ipi(cpuid, IA64_CPEP_VECTOR, IA64_IPI_DM_INT, 0);
-	} else {
-		/*
-		 * If a log was recorded, increase our polling frequency,
-		 * otherwise, backoff or return to interrupt mode.
-		 */
-		if (start_count != IA64_LOG_COUNT(SAL_INFO_TYPE_CPE)) {
-			poll_time = max(MIN_CPE_POLL_INTERVAL, poll_time / 2);
-		} else if (cpe_vector < 0) {
-			poll_time = min(MAX_CPE_POLL_INTERVAL, poll_time * 2);
-		} else {
-			poll_time = MIN_CPE_POLL_INTERVAL;
-
-			printk(KERN_WARNING "Returning to interrupt driven CPE handler\n");
-			enable_irq(local_vector_to_irq(IA64_CPE_VECTOR));
-			cpe_poll_enabled = 0;
-		}
-
-		if (cpe_poll_enabled)
-			mod_timer(&cpe_poll_timer, jiffies + poll_time);
-		start_count = -1;
-	}
-
-	return IRQ_HANDLED;
-}
-
-/*
- *  ia64_mca_cpe_poll
- *
- *	Poll for Corrected Platform Errors (CPEs), trigger interrupt
- *	on first cpu, from there it will trickle through all the cpus.
- *
- * Inputs   :   dummy(unused)
- * Outputs  :   None
- *
- */
-static void
-ia64_mca_cpe_poll (struct timer_list *unused)
-{
-	/* Trigger a CPE interrupt cascade  */
-	ia64_send_ipi(cpumask_first(cpu_online_mask), IA64_CPEP_VECTOR,
-							IA64_IPI_DM_INT, 0);
-}
-
-static int
-default_monarch_init_process(struct notifier_block *self, unsigned long val, void *data)
-{
-	int c;
-	struct task_struct *g, *t;
-	if (val != DIE_INIT_MONARCH_PROCESS)
-		return NOTIFY_DONE;
-#ifdef CONFIG_KEXEC
-	if (atomic_read(&kdump_in_progress))
-		return NOTIFY_DONE;
-#endif
-
-	/*
-	 * FIXME: mlogbuf will brim over with INIT stack dumps.
-	 * To enable show_stack from INIT, we use oops_in_progress which should
-	 * be used in real oops. This would cause something wrong after INIT.
-	 */
-	BREAK_LOGLEVEL(console_loglevel);
-	ia64_mlogbuf_dump_from_init();
-
-	printk(KERN_ERR "Processes interrupted by INIT -");
-	for_each_online_cpu(c) {
-		struct ia64_sal_os_state *s;
-		t = __va(__per_cpu_mca[c] + IA64_MCA_CPU_INIT_STACK_OFFSET);
-		s = (struct ia64_sal_os_state *)((char *)t + MCA_SOS_OFFSET);
-		g = s->prev_task;
-		if (g) {
-			if (g->pid)
-				printk(" %d", g->pid);
-			else
-				printk(" %d (cpu %d task 0x%p)", g->pid, task_cpu(g), g);
-		}
-	}
-	printk("\n\n");
-	if (read_trylock(&tasklist_lock)) {
-		for_each_process_thread(g, t) {
-			printk("\nBacktrace of pid %d (%s)\n", t->pid, t->comm);
-			show_stack(t, NULL, KERN_DEFAULT);
-		}
-		read_unlock(&tasklist_lock);
-	}
-	/* FIXME: This will not restore zapped printk locks. */
-	RESTORE_LOGLEVEL(console_loglevel);
-	return NOTIFY_DONE;
-}
-
-/*
- * C portion of the OS INIT handler
- *
- * Called from ia64_os_init_dispatch
- *
- * Inputs: pointer to pt_regs where processor info was saved.  SAL/OS state for
- * this event.  This code is used for both monarch and slave INIT events, see
- * sos->monarch.
- *
- * All INIT events switch to the INIT stack and change the previous process to
- * blocked status.  If one of the INIT events is the monarch then we are
- * probably processing the nmi button/command.  Use the monarch cpu to dump all
- * the processes.  The slave INIT events all spin until the monarch cpu
- * returns.  We can also get INIT slave events for MCA, in which case the MCA
- * process is the monarch.
- */
-
-void
-ia64_init_handler(struct pt_regs *regs, struct switch_stack *sw,
-		  struct ia64_sal_os_state *sos)
-{
-	static atomic_t slaves;
-	static atomic_t monarchs;
-	struct task_struct *previous_current;
-	int cpu = smp_processor_id();
-	struct ia64_mca_notify_die nd =
-		{ .sos = sos, .monarch_cpu = &monarch_cpu };
-
-	NOTIFY_INIT(DIE_INIT_ENTER, regs, (long)&nd, 0);
-
-	mprintk(KERN_INFO "Entered OS INIT handler. PSP=%lx cpu=%d monarch=%ld\n",
-		sos->proc_state_param, cpu, sos->monarch);
-	salinfo_log_wakeup(SAL_INFO_TYPE_INIT, NULL, 0, 0);
-
-	previous_current = ia64_mca_modify_original_stack(regs, sw, sos, "INIT");
-	sos->os_status = IA64_INIT_RESUME;
-
-	/* FIXME: Workaround for broken proms that drive all INIT events as
-	 * slaves.  The last slave that enters is promoted to be a monarch.
-	 * Remove this code in September 2006, that gives platforms a year to
-	 * fix their proms and get their customers updated.
-	 */
-	if (!sos->monarch && atomic_add_return(1, &slaves) == num_online_cpus()) {
-		mprintk(KERN_WARNING "%s: Promoting cpu %d to monarch.\n",
-		        __func__, cpu);
-		atomic_dec(&slaves);
-		sos->monarch = 1;
-	}
-
-	/* FIXME: Workaround for broken proms that drive all INIT events as
-	 * monarchs.  Second and subsequent monarchs are demoted to slaves.
-	 * Remove this code in September 2006, that gives platforms a year to
-	 * fix their proms and get their customers updated.
-	 */
-	if (sos->monarch && atomic_add_return(1, &monarchs) > 1) {
-		mprintk(KERN_WARNING "%s: Demoting cpu %d to slave.\n",
-			       __func__, cpu);
-		atomic_dec(&monarchs);
-		sos->monarch = 0;
-	}
-
-	if (!sos->monarch) {
-		ia64_mc_info.imi_rendez_checkin[cpu] = IA64_MCA_RENDEZ_CHECKIN_INIT;
-
-#ifdef CONFIG_KEXEC
-		while (monarch_cpu == -1 && !atomic_read(&kdump_in_progress))
-			udelay(1000);
-#else
-		while (monarch_cpu == -1)
-			cpu_relax();	/* spin until monarch enters */
-#endif
-
-		NOTIFY_INIT(DIE_INIT_SLAVE_ENTER, regs, (long)&nd, 1);
-		NOTIFY_INIT(DIE_INIT_SLAVE_PROCESS, regs, (long)&nd, 1);
-
-#ifdef CONFIG_KEXEC
-		while (monarch_cpu != -1 && !atomic_read(&kdump_in_progress))
-			udelay(1000);
-#else
-		while (monarch_cpu != -1)
-			cpu_relax();	/* spin until monarch leaves */
-#endif
-
-		NOTIFY_INIT(DIE_INIT_SLAVE_LEAVE, regs, (long)&nd, 1);
-
-		mprintk("Slave on cpu %d returning to normal service.\n", cpu);
-		ia64_set_curr_task(cpu, previous_current);
-		ia64_mc_info.imi_rendez_checkin[cpu] = IA64_MCA_RENDEZ_CHECKIN_NOTDONE;
-		atomic_dec(&slaves);
-		return;
-	}
-
-	monarch_cpu = cpu;
-	NOTIFY_INIT(DIE_INIT_MONARCH_ENTER, regs, (long)&nd, 1);
-
-	/*
-	 * Wait for a bit.  On some machines (e.g., HP's zx2000 and zx6000, INIT can be
-	 * generated via the BMC's command-line interface, but since the console is on the
-	 * same serial line, the user will need some time to switch out of the BMC before
-	 * the dump begins.
-	 */
-	mprintk("Delaying for 5 seconds...\n");
-	udelay(5*1000000);
-	ia64_wait_for_slaves(cpu, "INIT");
-	/* If nobody intercepts DIE_INIT_MONARCH_PROCESS then we drop through
-	 * to default_monarch_init_process() above and just print all the
-	 * tasks.
-	 */
-	NOTIFY_INIT(DIE_INIT_MONARCH_PROCESS, regs, (long)&nd, 1);
-	NOTIFY_INIT(DIE_INIT_MONARCH_LEAVE, regs, (long)&nd, 1);
-
-	mprintk("\nINIT dump complete.  Monarch on cpu %d returning to normal service.\n", cpu);
-	atomic_dec(&monarchs);
-	ia64_set_curr_task(cpu, previous_current);
-	monarch_cpu = -1;
-	return;
-}
-
-static int __init
-ia64_mca_disable_cpe_polling(char *str)
-{
-	cpe_poll_enabled = 0;
-	return 1;
-}
-
-__setup("disable_cpe_poll", ia64_mca_disable_cpe_polling);
-
-/* Minimal format of the MCA/INIT stacks.  The pseudo processes that run on
- * these stacks can never sleep, they cannot return from the kernel to user
- * space, they do not appear in a normal ps listing.  So there is no need to
- * format most of the fields.
- */
-
-static void
-format_mca_init_stack(void *mca_data, unsigned long offset,
-		const char *type, int cpu)
-{
-	struct task_struct *p = (struct task_struct *)((char *)mca_data + offset);
-	struct thread_info *ti;
-	memset(p, 0, KERNEL_STACK_SIZE);
-	ti = task_thread_info(p);
-	ti->flags = _TIF_MCA_INIT;
-	ti->preempt_count = 1;
-	ti->task = p;
-	ti->cpu = cpu;
-	p->stack = ti;
-	p->__state = TASK_UNINTERRUPTIBLE;
-	cpumask_set_cpu(cpu, &p->cpus_mask);
-	INIT_LIST_HEAD(&p->tasks);
-	p->parent = p->real_parent = p->group_leader = p;
-	INIT_LIST_HEAD(&p->children);
-	INIT_LIST_HEAD(&p->sibling);
-	strscpy(p->comm, type, sizeof(p->comm)-1);
-}
-
-/* Caller prevents this from being called after init */
-static void * __ref mca_bootmem(void)
-{
-	return memblock_alloc(sizeof(struct ia64_mca_cpu), KERNEL_STACK_SIZE);
-}
-
-/* Do per-CPU MCA-related initialization.  */
-void
-ia64_mca_cpu_init(void *cpu_data)
-{
-	void *pal_vaddr;
-	void *data;
-	long sz = sizeof(struct ia64_mca_cpu);
-	int cpu = smp_processor_id();
-	static int first_time = 1;
-
-	/*
-	 * Structure will already be allocated if cpu has been online,
-	 * then offlined.
-	 */
-	if (__per_cpu_mca[cpu]) {
-		data = __va(__per_cpu_mca[cpu]);
-	} else {
-		if (first_time) {
-			data = mca_bootmem();
-			first_time = 0;
-		} else
-			data = (void *)__get_free_pages(GFP_ATOMIC,
-							get_order(sz));
-		if (!data)
-			panic("Could not allocate MCA memory for cpu %d\n",
-					cpu);
-	}
-	format_mca_init_stack(data, offsetof(struct ia64_mca_cpu, mca_stack),
-		"MCA", cpu);
-	format_mca_init_stack(data, offsetof(struct ia64_mca_cpu, init_stack),
-		"INIT", cpu);
-	__this_cpu_write(ia64_mca_data, (__per_cpu_mca[cpu] = __pa(data)));
-
-	/*
-	 * Stash away a copy of the PTE needed to map the per-CPU page.
-	 * We may need it during MCA recovery.
-	 */
-	__this_cpu_write(ia64_mca_per_cpu_pte,
-		pte_val(mk_pte_phys(__pa(cpu_data), PAGE_KERNEL)));
-
-	/*
-	 * Also, stash away a copy of the PAL address and the PTE
-	 * needed to map it.
-	 */
-	pal_vaddr = efi_get_pal_addr();
-	if (!pal_vaddr)
-		return;
-	__this_cpu_write(ia64_mca_pal_base,
-		GRANULEROUNDDOWN((unsigned long) pal_vaddr));
-	__this_cpu_write(ia64_mca_pal_pte, pte_val(mk_pte_phys(__pa(pal_vaddr),
-							      PAGE_KERNEL)));
-}
-
-static int ia64_mca_cpu_online(unsigned int cpu)
-{
-	unsigned long flags;
-
-	local_irq_save(flags);
-	if (!cmc_polling_enabled)
-		ia64_mca_cmc_vector_enable(NULL);
-	local_irq_restore(flags);
-	return 0;
-}
-
-/*
- * ia64_mca_init
- *
- *  Do all the system level mca specific initialization.
- *
- *	1. Register spinloop and wakeup request interrupt vectors
- *
- *	2. Register OS_MCA handler entry point
- *
- *	3. Register OS_INIT handler entry point
- *
- *  4. Initialize MCA/CMC/INIT related log buffers maintained by the OS.
- *
- *  Note that this initialization is done very early before some kernel
- *  services are available.
- *
- *  Inputs  :   None
- *
- *  Outputs :   None
- */
-void __init
-ia64_mca_init(void)
-{
-	ia64_fptr_t *init_hldlr_ptr_monarch = (ia64_fptr_t *)ia64_os_init_dispatch_monarch;
-	ia64_fptr_t *init_hldlr_ptr_slave = (ia64_fptr_t *)ia64_os_init_dispatch_slave;
-	ia64_fptr_t *mca_hldlr_ptr = (ia64_fptr_t *)ia64_os_mca_dispatch;
-	int i;
-	long rc;
-	struct ia64_sal_retval isrv;
-	unsigned long timeout = IA64_MCA_RENDEZ_TIMEOUT; /* platform specific */
-	static struct notifier_block default_init_monarch_nb = {
-		.notifier_call = default_monarch_init_process,
-		.priority = 0/* we need to notified last */
-	};
-
-	IA64_MCA_DEBUG("%s: begin\n", __func__);
-
-	/* Clear the Rendez checkin flag for all cpus */
-	for(i = 0 ; i < NR_CPUS; i++)
-		ia64_mc_info.imi_rendez_checkin[i] = IA64_MCA_RENDEZ_CHECKIN_NOTDONE;
-
-	/*
-	 * Register the rendezvous spinloop and wakeup mechanism with SAL
-	 */
-
-	/* Register the rendezvous interrupt vector with SAL */
-	while (1) {
-		isrv = ia64_sal_mc_set_params(SAL_MC_PARAM_RENDEZ_INT,
-					      SAL_MC_PARAM_MECHANISM_INT,
-					      IA64_MCA_RENDEZ_VECTOR,
-					      timeout,
-					      SAL_MC_PARAM_RZ_ALWAYS);
-		rc = isrv.status;
-		if (rc == 0)
-			break;
-		if (rc == -2) {
-			printk(KERN_INFO "Increasing MCA rendezvous timeout from "
-				"%ld to %ld milliseconds\n", timeout, isrv.v0);
-			timeout = isrv.v0;
-			NOTIFY_MCA(DIE_MCA_NEW_TIMEOUT, NULL, timeout, 0);
-			continue;
-		}
-		printk(KERN_ERR "Failed to register rendezvous interrupt "
-		       "with SAL (status %ld)\n", rc);
-		return;
-	}
-
-	/* Register the wakeup interrupt vector with SAL */
-	isrv = ia64_sal_mc_set_params(SAL_MC_PARAM_RENDEZ_WAKEUP,
-				      SAL_MC_PARAM_MECHANISM_INT,
-				      IA64_MCA_WAKEUP_VECTOR,
-				      0, 0);
-	rc = isrv.status;
-	if (rc) {
-		printk(KERN_ERR "Failed to register wakeup interrupt with SAL "
-		       "(status %ld)\n", rc);
-		return;
-	}
-
-	IA64_MCA_DEBUG("%s: registered MCA rendezvous spinloop and wakeup mech.\n", __func__);
-
-	ia64_mc_info.imi_mca_handler        = ia64_tpa(mca_hldlr_ptr->fp);
-	/*
-	 * XXX - disable SAL checksum by setting size to 0; should be
-	 *	ia64_tpa(ia64_os_mca_dispatch_end) - ia64_tpa(ia64_os_mca_dispatch);
-	 */
-	ia64_mc_info.imi_mca_handler_size	= 0;
-
-	/* Register the os mca handler with SAL */
-	if ((rc = ia64_sal_set_vectors(SAL_VECTOR_OS_MCA,
-				       ia64_mc_info.imi_mca_handler,
-				       ia64_tpa(mca_hldlr_ptr->gp),
-				       ia64_mc_info.imi_mca_handler_size,
-				       0, 0, 0)))
-	{
-		printk(KERN_ERR "Failed to register OS MCA handler with SAL "
-		       "(status %ld)\n", rc);
-		return;
-	}
-
-	IA64_MCA_DEBUG("%s: registered OS MCA handler with SAL at 0x%lx, gp = 0x%lx\n", __func__,
-		       ia64_mc_info.imi_mca_handler, ia64_tpa(mca_hldlr_ptr->gp));
-
-	/*
-	 * XXX - disable SAL checksum by setting size to 0, should be
-	 * size of the actual init handler in mca_asm.S.
-	 */
-	ia64_mc_info.imi_monarch_init_handler		= ia64_tpa(init_hldlr_ptr_monarch->fp);
-	ia64_mc_info.imi_monarch_init_handler_size	= 0;
-	ia64_mc_info.imi_slave_init_handler		= ia64_tpa(init_hldlr_ptr_slave->fp);
-	ia64_mc_info.imi_slave_init_handler_size	= 0;
-
-	IA64_MCA_DEBUG("%s: OS INIT handler at %lx\n", __func__,
-		       ia64_mc_info.imi_monarch_init_handler);
-
-	/* Register the os init handler with SAL */
-	if ((rc = ia64_sal_set_vectors(SAL_VECTOR_OS_INIT,
-				       ia64_mc_info.imi_monarch_init_handler,
-				       ia64_tpa(ia64_getreg(_IA64_REG_GP)),
-				       ia64_mc_info.imi_monarch_init_handler_size,
-				       ia64_mc_info.imi_slave_init_handler,
-				       ia64_tpa(ia64_getreg(_IA64_REG_GP)),
-				       ia64_mc_info.imi_slave_init_handler_size)))
-	{
-		printk(KERN_ERR "Failed to register m/s INIT handlers with SAL "
-		       "(status %ld)\n", rc);
-		return;
-	}
-	if (register_die_notifier(&default_init_monarch_nb)) {
-		printk(KERN_ERR "Failed to register default monarch INIT process\n");
-		return;
-	}
-
-	IA64_MCA_DEBUG("%s: registered OS INIT handler with SAL\n", __func__);
-
-	/* Initialize the areas set aside by the OS to buffer the
-	 * platform/processor error states for MCA/INIT/CMC
-	 * handling.
-	 */
-	ia64_log_init(SAL_INFO_TYPE_MCA);
-	ia64_log_init(SAL_INFO_TYPE_INIT);
-	ia64_log_init(SAL_INFO_TYPE_CMC);
-	ia64_log_init(SAL_INFO_TYPE_CPE);
-
-	mca_init = 1;
-	printk(KERN_INFO "MCA related initialization done\n");
-}
-
-
-/*
- * These pieces cannot be done in ia64_mca_init() because it is called before
- * early_irq_init() which would wipe out our percpu irq registrations. But we
- * cannot leave them until ia64_mca_late_init() because by then all the other
- * processors have been brought online and have set their own CMC vectors to
- * point at a non-existant action. Called from arch_early_irq_init().
- */
-void __init ia64_mca_irq_init(void)
-{
-	/*
-	 *  Configure the CMCI/P vector and handler. Interrupts for CMC are
-	 *  per-processor, so AP CMC interrupts are setup in smp_callin() (smpboot.c).
-	 */
-	register_percpu_irq(IA64_CMC_VECTOR, ia64_mca_cmc_int_handler, 0,
-			    "cmc_hndlr");
-	register_percpu_irq(IA64_CMCP_VECTOR, ia64_mca_cmc_int_caller, 0,
-			    "cmc_poll");
-	ia64_mca_cmc_vector_setup();       /* Setup vector on BSP */
-
-	/* Setup the MCA rendezvous interrupt vector */
-	register_percpu_irq(IA64_MCA_RENDEZ_VECTOR, ia64_mca_rendez_int_handler,
-			    0, "mca_rdzv");
-
-	/* Setup the MCA wakeup interrupt vector */
-	register_percpu_irq(IA64_MCA_WAKEUP_VECTOR, ia64_mca_wakeup_int_handler,
-			    0, "mca_wkup");
-
-	/* Setup the CPEI/P handler */
-	register_percpu_irq(IA64_CPEP_VECTOR, ia64_mca_cpe_int_caller, 0,
-			    "cpe_poll");
-}
-
-/*
- * ia64_mca_late_init
- *
- *	Opportunity to setup things that require initialization later
- *	than ia64_mca_init.  Setup a timer to poll for CPEs if the
- *	platform doesn't support an interrupt driven mechanism.
- *
- *  Inputs  :   None
- *  Outputs :   Status
- */
-static int __init
-ia64_mca_late_init(void)
-{
-	if (!mca_init)
-		return 0;
-
-	/* Setup the CMCI/P vector and handler */
-	timer_setup(&cmc_poll_timer, ia64_mca_cmc_poll, 0);
-
-	/* Unmask/enable the vector */
-	cmc_polling_enabled = 0;
-	cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "ia64/mca:online",
-			  ia64_mca_cpu_online, NULL);
-	IA64_MCA_DEBUG("%s: CMCI/P setup and enabled.\n", __func__);
-
-	/* Setup the CPEI/P vector and handler */
-	cpe_vector = acpi_request_vector(ACPI_INTERRUPT_CPEI);
-	timer_setup(&cpe_poll_timer, ia64_mca_cpe_poll, 0);
-
-	{
-		unsigned int irq;
-
-		if (cpe_vector >= 0) {
-			/* If platform supports CPEI, enable the irq. */
-			irq = local_vector_to_irq(cpe_vector);
-			if (irq > 0) {
-				cpe_poll_enabled = 0;
-				irq_set_status_flags(irq, IRQ_PER_CPU);
-				if (request_irq(irq, ia64_mca_cpe_int_handler,
-						0, "cpe_hndlr", NULL))
-					pr_err("Failed to register cpe_hndlr interrupt\n");
-				ia64_cpe_irq = irq;
-				ia64_mca_register_cpev(cpe_vector);
-				IA64_MCA_DEBUG("%s: CPEI/P setup and enabled.\n",
-					__func__);
-				return 0;
-			}
-			printk(KERN_ERR "%s: Failed to find irq for CPE "
-					"interrupt handler, vector %d\n",
-					__func__, cpe_vector);
-		}
-		/* If platform doesn't support CPEI, get the timer going. */
-		if (cpe_poll_enabled) {
-			ia64_mca_cpe_poll(0UL);
-			IA64_MCA_DEBUG("%s: CPEP setup and enabled.\n", __func__);
-		}
-	}
-
-	return 0;
-}
-
-device_initcall(ia64_mca_late_init);
diff --git a/arch/ia64/kernel/mca_asm.S b/arch/ia64/kernel/mca_asm.S
deleted file mode 100644
index 0d6b8cf9d1d0..000000000000
--- a/arch/ia64/kernel/mca_asm.S
+++ /dev/null
@@ -1,1123 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-/*
- * File:	mca_asm.S
- * Purpose:	assembly portion of the IA64 MCA handling
- *
- * Mods by cfleck to integrate into kernel build
- *
- * 2000-03-15 David Mosberger-Tang <davidm@hpl.hp.com>
- *		Added various stop bits to get a clean compile
- *
- * 2000-03-29 Chuck Fleckenstein <cfleck@co.intel.com>
- *		Added code to save INIT handoff state in pt_regs format,
- *		switch to temp kstack, switch modes, jump to C INIT handler
- *
- * 2002-01-04 J.Hall <jenna.s.hall@intel.com>
- *		Before entering virtual mode code:
- *		 1. Check for TLB CPU error
- *		 2. Restore current thread pointer to kr6
- *		 3. Move stack ptr 16 bytes to conform to C calling convention
- *
- * 2004-11-12 Russ Anderson <rja@sgi.com>
- *		Added per cpu MCA/INIT stack save areas.
- *
- * 2005-12-08 Keith Owens <kaos@sgi.com>
- *		Use per cpu MCA/INIT stacks for all data.
- */
-#include <linux/threads.h>
-#include <linux/pgtable.h>
-
-#include <asm/asmmacro.h>
-#include <asm/processor.h>
-#include <asm/mca_asm.h>
-#include <asm/mca.h>
-
-#include "entry.h"
-
-#define GET_IA64_MCA_DATA(reg)						\
-	GET_THIS_PADDR(reg, ia64_mca_data)				\
-	;;								\
-	ld8 reg=[reg]
-
-	.global ia64_do_tlb_purge
-	.global ia64_os_mca_dispatch
-	.global ia64_os_init_on_kdump
-	.global ia64_os_init_dispatch_monarch
-	.global ia64_os_init_dispatch_slave
-
-	.text
-	.align 16
-
-//StartMain////////////////////////////////////////////////////////////////////
-
-/*
- * Just the TLB purge part is moved to a separate function
- * so we can re-use the code for cpu hotplug code as well
- * Caller should now setup b1, so we can branch once the
- * tlb flush is complete.
- */
-
-ia64_do_tlb_purge:
-#define O(member)	IA64_CPUINFO_##member##_OFFSET
-
-	GET_THIS_PADDR(r2, ia64_cpu_info) // load phys addr of cpu_info into r2
-	;;
-	addl r17=O(PTCE_STRIDE),r2
-	addl r2=O(PTCE_BASE),r2
-	;;
-	ld8 r18=[r2],(O(PTCE_COUNT)-O(PTCE_BASE));;	// r18=ptce_base
-	ld4 r19=[r2],4					// r19=ptce_count[0]
-	ld4 r21=[r17],4					// r21=ptce_stride[0]
-	;;
-	ld4 r20=[r2]					// r20=ptce_count[1]
-	ld4 r22=[r17]					// r22=ptce_stride[1]
-	mov r24=0
-	;;
-	adds r20=-1,r20
-	;;
-#undef O
-
-2:
-	cmp.ltu p6,p7=r24,r19
-(p7)	br.cond.dpnt.few 4f
-	mov ar.lc=r20
-3:
-	ptc.e r18
-	;;
-	add r18=r22,r18
-	br.cloop.sptk.few 3b
-	;;
-	add r18=r21,r18
-	add r24=1,r24
-	;;
-	br.sptk.few 2b
-4:
-	srlz.i 			// srlz.i implies srlz.d
-	;;
-
-        // Now purge addresses formerly mapped by TR registers
-	// 1. Purge ITR&DTR for kernel.
-	movl r16=KERNEL_START
-	mov r18=KERNEL_TR_PAGE_SHIFT<<2
-	;;
-	ptr.i r16, r18
-	ptr.d r16, r18
-	;;
-	srlz.i
-	;;
-	srlz.d
-	;;
-	// 3. Purge ITR for PAL code.
-	GET_THIS_PADDR(r2, ia64_mca_pal_base)
-	;;
-	ld8 r16=[r2]
-	mov r18=IA64_GRANULE_SHIFT<<2
-	;;
-	ptr.i r16,r18
-	;;
-	srlz.i
-	;;
-	// 4. Purge DTR for stack.
-	mov r16=IA64_KR(CURRENT_STACK)
-	;;
-	shl r16=r16,IA64_GRANULE_SHIFT
-	movl r19=PAGE_OFFSET
-	;;
-	add r16=r19,r16
-	mov r18=IA64_GRANULE_SHIFT<<2
-	;;
-	ptr.d r16,r18
-	;;
-	srlz.i
-	;;
-	// Now branch away to caller.
-	br.sptk.many b1
-	;;
-
-//EndMain//////////////////////////////////////////////////////////////////////
-
-//StartMain////////////////////////////////////////////////////////////////////
-
-ia64_os_mca_dispatch:
-	mov r3=IA64_MCA_CPU_MCA_STACK_OFFSET	// use the MCA stack
-	LOAD_PHYSICAL(p0,r2,1f)			// return address
-	mov r19=1				// All MCA events are treated as monarch (for now)
-	br.sptk ia64_state_save			// save the state that is not in minstate
-1:
-
-	GET_IA64_MCA_DATA(r2)
-	// Using MCA stack, struct ia64_sal_os_state, variable proc_state_param
-	;;
-	add r3=IA64_MCA_CPU_MCA_STACK_OFFSET+MCA_SOS_OFFSET+SOS(PROC_STATE_PARAM), r2
-	;;
-	ld8 r18=[r3]				// Get processor state parameter on existing PALE_CHECK.
-	;;
-	tbit.nz p6,p7=r18,60
-(p7)	br.spnt done_tlb_purge_and_reload
-
-	// The following code purges TC and TR entries. Then reload all TC entries.
-	// Purge percpu data TC entries.
-begin_tlb_purge_and_reload:
-	movl r18=ia64_reload_tr;;
-	LOAD_PHYSICAL(p0,r18,ia64_reload_tr);;
-	mov b1=r18;;
-	br.sptk.many ia64_do_tlb_purge;;
-
-ia64_reload_tr:
-	// Finally reload the TR registers.
-	// 1. Reload DTR/ITR registers for kernel.
-	mov r18=KERNEL_TR_PAGE_SHIFT<<2
-	movl r17=KERNEL_START
-	;;
-	mov cr.itir=r18
-	mov cr.ifa=r17
-        mov r16=IA64_TR_KERNEL
-	mov r19=ip
-	movl r18=PAGE_KERNEL
-	;;
-        dep r17=0,r19,0, KERNEL_TR_PAGE_SHIFT
-	;;
-	or r18=r17,r18
-	;;
-        itr.i itr[r16]=r18
-	;;
-        itr.d dtr[r16]=r18
-        ;;
-	srlz.i
-	srlz.d
-	;;
-	// 3. Reload ITR for PAL code.
-	GET_THIS_PADDR(r2, ia64_mca_pal_pte)
-	;;
-	ld8 r18=[r2]			// load PAL PTE
-	;;
-	GET_THIS_PADDR(r2, ia64_mca_pal_base)
-	;;
-	ld8 r16=[r2]			// load PAL vaddr
-	mov r19=IA64_GRANULE_SHIFT<<2
-	;;
-	mov cr.itir=r19
-	mov cr.ifa=r16
-	mov r20=IA64_TR_PALCODE
-	;;
-	itr.i itr[r20]=r18
-	;;
-	srlz.i
-	;;
-	// 4. Reload DTR for stack.
-	mov r16=IA64_KR(CURRENT_STACK)
-	;;
-	shl r16=r16,IA64_GRANULE_SHIFT
-	movl r19=PAGE_OFFSET
-	;;
-	add r18=r19,r16
-	movl r20=PAGE_KERNEL
-	;;
-	add r16=r20,r16
-	mov r19=IA64_GRANULE_SHIFT<<2
-	;;
-	mov cr.itir=r19
-	mov cr.ifa=r18
-	mov r20=IA64_TR_CURRENT_STACK
-	;;
-	itr.d dtr[r20]=r16
-	GET_THIS_PADDR(r2, ia64_mca_tr_reload)
-	mov r18 = 1
-	;;
-	srlz.d
-	;;
-	st8 [r2] =r18
-	;;
-
-done_tlb_purge_and_reload:
-
-	// switch to per cpu MCA stack
-	mov r3=IA64_MCA_CPU_MCA_STACK_OFFSET	// use the MCA stack
-	LOAD_PHYSICAL(p0,r2,1f)			// return address
-	br.sptk ia64_new_stack
-1:
-
-	// everything saved, now we can set the kernel registers
-	mov r3=IA64_MCA_CPU_MCA_STACK_OFFSET	// use the MCA stack
-	LOAD_PHYSICAL(p0,r2,1f)			// return address
-	br.sptk ia64_set_kernel_registers
-1:
-
-	// This must be done in physical mode
-	GET_IA64_MCA_DATA(r2)
-	;;
-	mov r7=r2
-
-        // Enter virtual mode from physical mode
-	VIRTUAL_MODE_ENTER(r2, r3, ia64_os_mca_virtual_begin, r4)
-
-	// This code returns to SAL via SOS r2, in general SAL has no unwind
-	// data.  To get a clean termination when backtracing the C MCA/INIT
-	// handler, set a dummy return address of 0 in this routine.  That
-	// requires that ia64_os_mca_virtual_begin be a global function.
-ENTRY(ia64_os_mca_virtual_begin)
-	.prologue
-	.save rp,r0
-	.body
-
-	mov ar.rsc=3				// set eager mode for C handler
-	mov r2=r7				// see GET_IA64_MCA_DATA above
-	;;
-
-	// Call virtual mode handler
-	alloc r14=ar.pfs,0,0,3,0
-	;;
-	DATA_PA_TO_VA(r2,r7)
-	;;
-	add out0=IA64_MCA_CPU_MCA_STACK_OFFSET+MCA_PT_REGS_OFFSET, r2
-	add out1=IA64_MCA_CPU_MCA_STACK_OFFSET+MCA_SWITCH_STACK_OFFSET, r2
-	add out2=IA64_MCA_CPU_MCA_STACK_OFFSET+MCA_SOS_OFFSET, r2
-	br.call.sptk.many    b0=ia64_mca_handler
-
-	// Revert back to physical mode before going back to SAL
-	PHYSICAL_MODE_ENTER(r2, r3, ia64_os_mca_virtual_end, r4)
-ia64_os_mca_virtual_end:
-
-END(ia64_os_mca_virtual_begin)
-
-	// switch back to previous stack
-	alloc r14=ar.pfs,0,0,0,0		// remove the MCA handler frame
-	mov r3=IA64_MCA_CPU_MCA_STACK_OFFSET	// use the MCA stack
-	LOAD_PHYSICAL(p0,r2,1f)			// return address
-	br.sptk ia64_old_stack
-1:
-
-	mov r3=IA64_MCA_CPU_MCA_STACK_OFFSET	// use the MCA stack
-	LOAD_PHYSICAL(p0,r2,1f)			// return address
-	br.sptk ia64_state_restore		// restore the SAL state
-1:
-
-	mov		b0=r12			// SAL_CHECK return address
-
-	br		b0
-
-//EndMain//////////////////////////////////////////////////////////////////////
-
-//StartMain////////////////////////////////////////////////////////////////////
-
-//
-// NOP init handler for kdump.  In panic situation, we may receive INIT
-// while kernel transition.  Since we initialize registers on leave from
-// current kernel, no longer monarch/slave handlers of current kernel in
-// virtual mode are called safely.
-// We can unregister these init handlers from SAL, however then the INIT
-// will result in warmboot by SAL and we cannot retrieve the crashdump.
-// Therefore register this NOP function to SAL, to prevent entering virtual
-// mode and resulting warmboot by SAL.
-//
-ia64_os_init_on_kdump:
-	mov		r8=r0		// IA64_INIT_RESUME
-	mov             r9=r10		// SAL_GP
-	mov		r22=r17		// *minstate
-	;;
-	mov		r10=r0		// return to same context
-	mov		b0=r12		// SAL_CHECK return address
-	br		b0
-
-//
-// SAL to OS entry point for INIT on all processors.  This has been defined for
-// registration purposes with SAL as a part of ia64_mca_init.  Monarch and
-// slave INIT have identical processing, except for the value of the
-// sos->monarch flag in r19.
-//
-
-ia64_os_init_dispatch_monarch:
-	mov r19=1				// Bow, bow, ye lower middle classes!
-	br.sptk ia64_os_init_dispatch
-
-ia64_os_init_dispatch_slave:
-	mov r19=0				// <igor>yeth, mathter</igor>
-
-ia64_os_init_dispatch:
-
-	mov r3=IA64_MCA_CPU_INIT_STACK_OFFSET	// use the INIT stack
-	LOAD_PHYSICAL(p0,r2,1f)			// return address
-	br.sptk ia64_state_save			// save the state that is not in minstate
-1:
-
-	// switch to per cpu INIT stack
-	mov r3=IA64_MCA_CPU_INIT_STACK_OFFSET	// use the INIT stack
-	LOAD_PHYSICAL(p0,r2,1f)			// return address
-	br.sptk ia64_new_stack
-1:
-
-	// everything saved, now we can set the kernel registers
-	mov r3=IA64_MCA_CPU_INIT_STACK_OFFSET	// use the INIT stack
-	LOAD_PHYSICAL(p0,r2,1f)			// return address
-	br.sptk ia64_set_kernel_registers
-1:
-
-	// This must be done in physical mode
-	GET_IA64_MCA_DATA(r2)
-	;;
-	mov r7=r2
-
-        // Enter virtual mode from physical mode
-	VIRTUAL_MODE_ENTER(r2, r3, ia64_os_init_virtual_begin, r4)
-
-	// This code returns to SAL via SOS r2, in general SAL has no unwind
-	// data.  To get a clean termination when backtracing the C MCA/INIT
-	// handler, set a dummy return address of 0 in this routine.  That
-	// requires that ia64_os_init_virtual_begin be a global function.
-ENTRY(ia64_os_init_virtual_begin)
-	.prologue
-	.save rp,r0
-	.body
-
-	mov ar.rsc=3				// set eager mode for C handler
-	mov r2=r7				// see GET_IA64_MCA_DATA above
-	;;
-
-	// Call virtual mode handler
-	alloc r14=ar.pfs,0,0,3,0
-	;;
-	DATA_PA_TO_VA(r2,r7)
-	;;
-	add out0=IA64_MCA_CPU_INIT_STACK_OFFSET+MCA_PT_REGS_OFFSET, r2
-	add out1=IA64_MCA_CPU_INIT_STACK_OFFSET+MCA_SWITCH_STACK_OFFSET, r2
-	add out2=IA64_MCA_CPU_INIT_STACK_OFFSET+MCA_SOS_OFFSET, r2
-	br.call.sptk.many    b0=ia64_init_handler
-
-	// Revert back to physical mode before going back to SAL
-	PHYSICAL_MODE_ENTER(r2, r3, ia64_os_init_virtual_end, r4)
-ia64_os_init_virtual_end:
-
-END(ia64_os_init_virtual_begin)
-
-	mov r3=IA64_MCA_CPU_INIT_STACK_OFFSET	// use the INIT stack
-	LOAD_PHYSICAL(p0,r2,1f)			// return address
-	br.sptk ia64_state_restore		// restore the SAL state
-1:
-
-	// switch back to previous stack
-	alloc r14=ar.pfs,0,0,0,0		// remove the INIT handler frame
-	mov r3=IA64_MCA_CPU_INIT_STACK_OFFSET	// use the INIT stack
-	LOAD_PHYSICAL(p0,r2,1f)			// return address
-	br.sptk ia64_old_stack
-1:
-
-	mov		b0=r12			// SAL_CHECK return address
-	br		b0
-
-//EndMain//////////////////////////////////////////////////////////////////////
-
-// common defines for the stubs
-#define	ms		r4
-#define	regs		r5
-#define	temp1		r2	/* careful, it overlaps with input registers */
-#define	temp2		r3	/* careful, it overlaps with input registers */
-#define	temp3		r7
-#define	temp4		r14
-
-
-//++
-// Name:
-//	ia64_state_save()
-//
-// Stub Description:
-//
-//	Save the state that is not in minstate.  This is sensitive to the layout of
-//	struct ia64_sal_os_state in mca.h.
-//
-//	r2 contains the return address, r3 contains either
-//	IA64_MCA_CPU_MCA_STACK_OFFSET or IA64_MCA_CPU_INIT_STACK_OFFSET.
-//
-//	The OS to SAL section of struct ia64_sal_os_state is set to a default
-//	value of cold boot (MCA) or warm boot (INIT) and return to the same
-//	context.  ia64_sal_os_state is also used to hold some registers that
-//	need to be saved and restored across the stack switches.
-//
-//	Most input registers to this stub come from PAL/SAL
-//	r1  os gp, physical
-//	r8  pal_proc entry point
-//	r9  sal_proc entry point
-//	r10 sal gp
-//	r11 MCA - rendevzous state, INIT - reason code
-//	r12 sal return address
-//	r17 pal min_state
-//	r18 processor state parameter
-//	r19 monarch flag, set by the caller of this routine
-//
-//	In addition to the SAL to OS state, this routine saves all the
-//	registers that appear in struct pt_regs and struct switch_stack,
-//	excluding those that are already in the PAL minstate area.  This
-//	results in a partial pt_regs and switch_stack, the C code copies the
-//	remaining registers from PAL minstate to pt_regs and switch_stack.  The
-//	resulting structures contain all the state of the original process when
-//	MCA/INIT occurred.
-//
-//--
-
-ia64_state_save:
-	add regs=MCA_SOS_OFFSET, r3
-	add ms=MCA_SOS_OFFSET+8, r3
-	mov b0=r2		// save return address
-	cmp.eq p1,p2=IA64_MCA_CPU_MCA_STACK_OFFSET, r3
-	;;
-	GET_IA64_MCA_DATA(temp2)
-	;;
-	add temp1=temp2, regs	// struct ia64_sal_os_state on MCA or INIT stack
-	add temp2=temp2, ms	// struct ia64_sal_os_state+8 on MCA or INIT stack
-	;;
-	mov regs=temp1		// save the start of sos
-	st8 [temp1]=r1,16	// os_gp
-	st8 [temp2]=r8,16	// pal_proc
-	;;
-	st8 [temp1]=r9,16	// sal_proc
-	st8 [temp2]=r11,16	// rv_rc
-	mov r11=cr.iipa
-	;;
-	st8 [temp1]=r18		// proc_state_param
-	st8 [temp2]=r19		// monarch
-	mov r6=IA64_KR(CURRENT)
-	add temp1=SOS(SAL_RA), regs
-	add temp2=SOS(SAL_GP), regs
-	;;
-	st8 [temp1]=r12,16	// sal_ra
-	st8 [temp2]=r10,16	// sal_gp
-	mov r12=cr.isr
-	;;
-	st8 [temp1]=r17,16	// pal_min_state
-	st8 [temp2]=r6,16	// prev_IA64_KR_CURRENT
-	mov r6=IA64_KR(CURRENT_STACK)
-	;;
-	st8 [temp1]=r6,16	// prev_IA64_KR_CURRENT_STACK
-	st8 [temp2]=r0,16	// prev_task, starts off as NULL
-	mov r6=cr.ifa
-	;;
-	st8 [temp1]=r12,16	// cr.isr
-	st8 [temp2]=r6,16	// cr.ifa
-	mov r12=cr.itir
-	;;
-	st8 [temp1]=r12,16	// cr.itir
-	st8 [temp2]=r11,16	// cr.iipa
-	mov r12=cr.iim
-	;;
-	st8 [temp1]=r12		// cr.iim
-(p1)	mov r12=IA64_MCA_COLD_BOOT
-(p2)	mov r12=IA64_INIT_WARM_BOOT
-	mov r6=cr.iha
-	add temp1=SOS(OS_STATUS), regs
-	;;
-	st8 [temp2]=r6		// cr.iha
-	add temp2=SOS(CONTEXT), regs
-	st8 [temp1]=r12		// os_status, default is cold boot
-	mov r6=IA64_MCA_SAME_CONTEXT
-	;;
-	st8 [temp2]=r6		// context, default is same context
-
-	// Save the pt_regs data that is not in minstate.  The previous code
-	// left regs at sos.
-	add regs=MCA_PT_REGS_OFFSET-MCA_SOS_OFFSET, regs
-	;;
-	add temp1=PT(B6), regs
-	mov temp3=b6
-	mov temp4=b7
-	add temp2=PT(B7), regs
-	;;
-	st8 [temp1]=temp3,PT(AR_CSD)-PT(B6)		// save b6
-	st8 [temp2]=temp4,PT(AR_SSD)-PT(B7)		// save b7
-	mov temp3=ar.csd
-	mov temp4=ar.ssd
-	cover						// must be last in group
-	;;
-	st8 [temp1]=temp3,PT(AR_UNAT)-PT(AR_CSD)	// save ar.csd
-	st8 [temp2]=temp4,PT(AR_PFS)-PT(AR_SSD)		// save ar.ssd
-	mov temp3=ar.unat
-	mov temp4=ar.pfs
-	;;
-	st8 [temp1]=temp3,PT(AR_RNAT)-PT(AR_UNAT)	// save ar.unat
-	st8 [temp2]=temp4,PT(AR_BSPSTORE)-PT(AR_PFS)	// save ar.pfs
-	mov temp3=ar.rnat
-	mov temp4=ar.bspstore
-	;;
-	st8 [temp1]=temp3,PT(LOADRS)-PT(AR_RNAT)	// save ar.rnat
-	st8 [temp2]=temp4,PT(AR_FPSR)-PT(AR_BSPSTORE)	// save ar.bspstore
-	mov temp3=ar.bsp
-	;;
-	sub temp3=temp3, temp4	// ar.bsp - ar.bspstore
-	mov temp4=ar.fpsr
-	;;
-	shl temp3=temp3,16	// compute ar.rsc to be used for "loadrs"
-	;;
-	st8 [temp1]=temp3,PT(AR_CCV)-PT(LOADRS)		// save loadrs
-	st8 [temp2]=temp4,PT(F6)-PT(AR_FPSR)		// save ar.fpsr
-	mov temp3=ar.ccv
-	;;
-	st8 [temp1]=temp3,PT(F7)-PT(AR_CCV)		// save ar.ccv
-	stf.spill [temp2]=f6,PT(F8)-PT(F6)
-	;;
-	stf.spill [temp1]=f7,PT(F9)-PT(F7)
-	stf.spill [temp2]=f8,PT(F10)-PT(F8)
-	;;
-	stf.spill [temp1]=f9,PT(F11)-PT(F9)
-	stf.spill [temp2]=f10
-	;;
-	stf.spill [temp1]=f11
-
-	// Save the switch_stack data that is not in minstate nor pt_regs.  The
-	// previous code left regs at pt_regs.
-	add regs=MCA_SWITCH_STACK_OFFSET-MCA_PT_REGS_OFFSET, regs
-	;;
-	add temp1=SW(F2), regs
-	add temp2=SW(F3), regs
-	;;
-	stf.spill [temp1]=f2,32
-	stf.spill [temp2]=f3,32
-	;;
-	stf.spill [temp1]=f4,32
-	stf.spill [temp2]=f5,32
-	;;
-	stf.spill [temp1]=f12,32
-	stf.spill [temp2]=f13,32
-	;;
-	stf.spill [temp1]=f14,32
-	stf.spill [temp2]=f15,32
-	;;
-	stf.spill [temp1]=f16,32
-	stf.spill [temp2]=f17,32
-	;;
-	stf.spill [temp1]=f18,32
-	stf.spill [temp2]=f19,32
-	;;
-	stf.spill [temp1]=f20,32
-	stf.spill [temp2]=f21,32
-	;;
-	stf.spill [temp1]=f22,32
-	stf.spill [temp2]=f23,32
-	;;
-	stf.spill [temp1]=f24,32
-	stf.spill [temp2]=f25,32
-	;;
-	stf.spill [temp1]=f26,32
-	stf.spill [temp2]=f27,32
-	;;
-	stf.spill [temp1]=f28,32
-	stf.spill [temp2]=f29,32
-	;;
-	stf.spill [temp1]=f30,SW(B2)-SW(F30)
-	stf.spill [temp2]=f31,SW(B3)-SW(F31)
-	mov temp3=b2
-	mov temp4=b3
-	;;
-	st8 [temp1]=temp3,16	// save b2
-	st8 [temp2]=temp4,16	// save b3
-	mov temp3=b4
-	mov temp4=b5
-	;;
-	st8 [temp1]=temp3,SW(AR_LC)-SW(B4)	// save b4
-	st8 [temp2]=temp4	// save b5
-	mov temp3=ar.lc
-	;;
-	st8 [temp1]=temp3	// save ar.lc
-
-	// FIXME: Some proms are incorrectly accessing the minstate area as
-	// cached data.  The C code uses region 6, uncached virtual.  Ensure
-	// that there is no cache data lying around for the first 1K of the
-	// minstate area.
-	// Remove this code in September 2006, that gives platforms a year to
-	// fix their proms and get their customers updated.
-
-	add r1=32*1,r17
-	add r2=32*2,r17
-	add r3=32*3,r17
-	add r4=32*4,r17
-	add r5=32*5,r17
-	add r6=32*6,r17
-	add r7=32*7,r17
-	;;
-	fc r17
-	fc r1
-	fc r2
-	fc r3
-	fc r4
-	fc r5
-	fc r6
-	fc r7
-	add r17=32*8,r17
-	add r1=32*8,r1
-	add r2=32*8,r2
-	add r3=32*8,r3
-	add r4=32*8,r4
-	add r5=32*8,r5
-	add r6=32*8,r6
-	add r7=32*8,r7
-	;;
-	fc r17
-	fc r1
-	fc r2
-	fc r3
-	fc r4
-	fc r5
-	fc r6
-	fc r7
-	add r17=32*8,r17
-	add r1=32*8,r1
-	add r2=32*8,r2
-	add r3=32*8,r3
-	add r4=32*8,r4
-	add r5=32*8,r5
-	add r6=32*8,r6
-	add r7=32*8,r7
-	;;
-	fc r17
-	fc r1
-	fc r2
-	fc r3
-	fc r4
-	fc r5
-	fc r6
-	fc r7
-	add r17=32*8,r17
-	add r1=32*8,r1
-	add r2=32*8,r2
-	add r3=32*8,r3
-	add r4=32*8,r4
-	add r5=32*8,r5
-	add r6=32*8,r6
-	add r7=32*8,r7
-	;;
-	fc r17
-	fc r1
-	fc r2
-	fc r3
-	fc r4
-	fc r5
-	fc r6
-	fc r7
-
-	br.sptk b0
-
-//EndStub//////////////////////////////////////////////////////////////////////
-
-
-//++
-// Name:
-//	ia64_state_restore()
-//
-// Stub Description:
-//
-//	Restore the SAL/OS state.  This is sensitive to the layout of struct
-//	ia64_sal_os_state in mca.h.
-//
-//	r2 contains the return address, r3 contains either
-//	IA64_MCA_CPU_MCA_STACK_OFFSET or IA64_MCA_CPU_INIT_STACK_OFFSET.
-//
-//	In addition to the SAL to OS state, this routine restores all the
-//	registers that appear in struct pt_regs and struct switch_stack,
-//	excluding those in the PAL minstate area.
-//
-//--
-
-ia64_state_restore:
-	// Restore the switch_stack data that is not in minstate nor pt_regs.
-	add regs=MCA_SWITCH_STACK_OFFSET, r3
-	mov b0=r2		// save return address
-	;;
-	GET_IA64_MCA_DATA(temp2)
-	;;
-	add regs=temp2, regs
-	;;
-	add temp1=SW(F2), regs
-	add temp2=SW(F3), regs
-	;;
-	ldf.fill f2=[temp1],32
-	ldf.fill f3=[temp2],32
-	;;
-	ldf.fill f4=[temp1],32
-	ldf.fill f5=[temp2],32
-	;;
-	ldf.fill f12=[temp1],32
-	ldf.fill f13=[temp2],32
-	;;
-	ldf.fill f14=[temp1],32
-	ldf.fill f15=[temp2],32
-	;;
-	ldf.fill f16=[temp1],32
-	ldf.fill f17=[temp2],32
-	;;
-	ldf.fill f18=[temp1],32
-	ldf.fill f19=[temp2],32
-	;;
-	ldf.fill f20=[temp1],32
-	ldf.fill f21=[temp2],32
-	;;
-	ldf.fill f22=[temp1],32
-	ldf.fill f23=[temp2],32
-	;;
-	ldf.fill f24=[temp1],32
-	ldf.fill f25=[temp2],32
-	;;
-	ldf.fill f26=[temp1],32
-	ldf.fill f27=[temp2],32
-	;;
-	ldf.fill f28=[temp1],32
-	ldf.fill f29=[temp2],32
-	;;
-	ldf.fill f30=[temp1],SW(B2)-SW(F30)
-	ldf.fill f31=[temp2],SW(B3)-SW(F31)
-	;;
-	ld8 temp3=[temp1],16	// restore b2
-	ld8 temp4=[temp2],16	// restore b3
-	;;
-	mov b2=temp3
-	mov b3=temp4
-	ld8 temp3=[temp1],SW(AR_LC)-SW(B4)	// restore b4
-	ld8 temp4=[temp2]	// restore b5
-	;;
-	mov b4=temp3
-	mov b5=temp4
-	ld8 temp3=[temp1]	// restore ar.lc
-	;;
-	mov ar.lc=temp3
-
-	// Restore the pt_regs data that is not in minstate.  The previous code
-	// left regs at switch_stack.
-	add regs=MCA_PT_REGS_OFFSET-MCA_SWITCH_STACK_OFFSET, regs
-	;;
-	add temp1=PT(B6), regs
-	add temp2=PT(B7), regs
-	;;
-	ld8 temp3=[temp1],PT(AR_CSD)-PT(B6)		// restore b6
-	ld8 temp4=[temp2],PT(AR_SSD)-PT(B7)		// restore b7
-	;;
-	mov b6=temp3
-	mov b7=temp4
-	ld8 temp3=[temp1],PT(AR_UNAT)-PT(AR_CSD)	// restore ar.csd
-	ld8 temp4=[temp2],PT(AR_PFS)-PT(AR_SSD)		// restore ar.ssd
-	;;
-	mov ar.csd=temp3
-	mov ar.ssd=temp4
-	ld8 temp3=[temp1]				// restore ar.unat
-	add temp1=PT(AR_CCV)-PT(AR_UNAT), temp1
-	ld8 temp4=[temp2],PT(AR_FPSR)-PT(AR_PFS)	// restore ar.pfs
-	;;
-	mov ar.unat=temp3
-	mov ar.pfs=temp4
-	// ar.rnat, ar.bspstore, loadrs are restore in ia64_old_stack.
-	ld8 temp3=[temp1],PT(F6)-PT(AR_CCV)		// restore ar.ccv
-	ld8 temp4=[temp2],PT(F7)-PT(AR_FPSR)		// restore ar.fpsr
-	;;
-	mov ar.ccv=temp3
-	mov ar.fpsr=temp4
-	ldf.fill f6=[temp1],PT(F8)-PT(F6)
-	ldf.fill f7=[temp2],PT(F9)-PT(F7)
-	;;
-	ldf.fill f8=[temp1],PT(F10)-PT(F8)
-	ldf.fill f9=[temp2],PT(F11)-PT(F9)
-	;;
-	ldf.fill f10=[temp1]
-	ldf.fill f11=[temp2]
-
-	// Restore the SAL to OS state. The previous code left regs at pt_regs.
-	add regs=MCA_SOS_OFFSET-MCA_PT_REGS_OFFSET, regs
-	;;
-	add temp1=SOS(SAL_RA), regs
-	add temp2=SOS(SAL_GP), regs
-	;;
-	ld8 r12=[temp1],16	// sal_ra
-	ld8 r9=[temp2],16	// sal_gp
-	;;
-	ld8 r22=[temp1],16	// pal_min_state, virtual
-	ld8 r13=[temp2],16	// prev_IA64_KR_CURRENT
-	;;
-	ld8 r16=[temp1],16	// prev_IA64_KR_CURRENT_STACK
-	ld8 r20=[temp2],16	// prev_task
-	;;
-	ld8 temp3=[temp1],16	// cr.isr
-	ld8 temp4=[temp2],16	// cr.ifa
-	;;
-	mov cr.isr=temp3
-	mov cr.ifa=temp4
-	ld8 temp3=[temp1],16	// cr.itir
-	ld8 temp4=[temp2],16	// cr.iipa
-	;;
-	mov cr.itir=temp3
-	mov cr.iipa=temp4
-	ld8 temp3=[temp1]	// cr.iim
-	ld8 temp4=[temp2]		// cr.iha
-	add temp1=SOS(OS_STATUS), regs
-	add temp2=SOS(CONTEXT), regs
-	;;
-	mov cr.iim=temp3
-	mov cr.iha=temp4
-	dep r22=0,r22,62,1	// pal_min_state, physical, uncached
-	mov IA64_KR(CURRENT)=r13
-	ld8 r8=[temp1]		// os_status
-	ld8 r10=[temp2]		// context
-
-	/* Wire IA64_TR_CURRENT_STACK to the stack that we are resuming to.  To
-	 * avoid any dependencies on the algorithm in ia64_switch_to(), just
-	 * purge any existing CURRENT_STACK mapping and insert the new one.
-	 *
-	 * r16 contains prev_IA64_KR_CURRENT_STACK, r13 contains
-	 * prev_IA64_KR_CURRENT, these values may have been changed by the C
-	 * code.  Do not use r8, r9, r10, r22, they contain values ready for
-	 * the return to SAL.
-	 */
-
-	mov r15=IA64_KR(CURRENT_STACK)		// physical granule mapped by IA64_TR_CURRENT_STACK
-	;;
-	shl r15=r15,IA64_GRANULE_SHIFT
-	;;
-	dep r15=-1,r15,61,3			// virtual granule
-	mov r18=IA64_GRANULE_SHIFT<<2		// for cr.itir.ps
-	;;
-	ptr.d r15,r18
-	;;
-	srlz.d
-
-	extr.u r19=r13,61,3			// r13 = prev_IA64_KR_CURRENT
-	shl r20=r16,IA64_GRANULE_SHIFT		// r16 = prev_IA64_KR_CURRENT_STACK
-	movl r21=PAGE_KERNEL			// page properties
-	;;
-	mov IA64_KR(CURRENT_STACK)=r16
-	cmp.ne p6,p0=RGN_KERNEL,r19		// new stack is in the kernel region?
-	or r21=r20,r21				// construct PA | page properties
-(p6)	br.spnt 1f				// the dreaded cpu 0 idle task in region 5:(
-	;;
-	mov cr.itir=r18
-	mov cr.ifa=r13
-	mov r20=IA64_TR_CURRENT_STACK
-	;;
-	itr.d dtr[r20]=r21
-	;;
-	srlz.d
-1:
-
-	br.sptk b0
-
-//EndStub//////////////////////////////////////////////////////////////////////
-
-
-//++
-// Name:
-//	ia64_new_stack()
-//
-// Stub Description:
-//
-//	Switch to the MCA/INIT stack.
-//
-//	r2 contains the return address, r3 contains either
-//	IA64_MCA_CPU_MCA_STACK_OFFSET or IA64_MCA_CPU_INIT_STACK_OFFSET.
-//
-//	On entry RBS is still on the original stack, this routine switches RBS
-//	to use the MCA/INIT stack.
-//
-//	On entry, sos->pal_min_state is physical, on exit it is virtual.
-//
-//--
-
-ia64_new_stack:
-	add regs=MCA_PT_REGS_OFFSET, r3
-	add temp2=MCA_SOS_OFFSET+SOS(PAL_MIN_STATE), r3
-	mov b0=r2			// save return address
-	GET_IA64_MCA_DATA(temp1)
-	invala
-	;;
-	add temp2=temp2, temp1		// struct ia64_sal_os_state.pal_min_state on MCA or INIT stack
-	add regs=regs, temp1		// struct pt_regs on MCA or INIT stack
-	;;
-	// Address of minstate area provided by PAL is physical, uncacheable.
-	// Convert to Linux virtual address in region 6 for C code.
-	ld8 ms=[temp2]			// pal_min_state, physical
-	;;
-	dep temp1=-1,ms,62,2		// set region 6
-	mov temp3=IA64_RBS_OFFSET-MCA_PT_REGS_OFFSET
-	;;
-	st8 [temp2]=temp1		// pal_min_state, virtual
-
-	add temp4=temp3, regs		// start of bspstore on new stack
-	;;
-	mov ar.bspstore=temp4		// switch RBS to MCA/INIT stack
-	;;
-	flushrs				// must be first in group
-	br.sptk b0
-
-//EndStub//////////////////////////////////////////////////////////////////////
-
-
-//++
-// Name:
-//	ia64_old_stack()
-//
-// Stub Description:
-//
-//	Switch to the old stack.
-//
-//	r2 contains the return address, r3 contains either
-//	IA64_MCA_CPU_MCA_STACK_OFFSET or IA64_MCA_CPU_INIT_STACK_OFFSET.
-//
-//	On entry, pal_min_state is virtual, on exit it is physical.
-//
-//	On entry RBS is on the MCA/INIT stack, this routine switches RBS
-//	back to the previous stack.
-//
-//	The psr is set to all zeroes.  SAL return requires either all zeroes or
-//	just psr.mc set.  Leaving psr.mc off allows INIT to be issued if this
-//	code does not perform correctly.
-//
-//	The dirty registers at the time of the event were flushed to the
-//	MCA/INIT stack in ia64_pt_regs_save().  Restore the dirty registers
-//	before reverting to the previous bspstore.
-//--
-
-ia64_old_stack:
-	add regs=MCA_PT_REGS_OFFSET, r3
-	mov b0=r2			// save return address
-	GET_IA64_MCA_DATA(temp2)
-	LOAD_PHYSICAL(p0,temp1,1f)
-	;;
-	mov cr.ipsr=r0
-	mov cr.ifs=r0
-	mov cr.iip=temp1
-	;;
-	invala
-	rfi
-1:
-
-	add regs=regs, temp2		// struct pt_regs on MCA or INIT stack
-	;;
-	add temp1=PT(LOADRS), regs
-	;;
-	ld8 temp2=[temp1],PT(AR_BSPSTORE)-PT(LOADRS)	// restore loadrs
-	;;
-	ld8 temp3=[temp1],PT(AR_RNAT)-PT(AR_BSPSTORE)	// restore ar.bspstore
-	mov ar.rsc=temp2
-	;;
-	loadrs
-	ld8 temp4=[temp1]		// restore ar.rnat
-	;;
-	mov ar.bspstore=temp3		// back to old stack
-	;;
-	mov ar.rnat=temp4
-	;;
-
-	br.sptk b0
-
-//EndStub//////////////////////////////////////////////////////////////////////
-
-
-//++
-// Name:
-//	ia64_set_kernel_registers()
-//
-// Stub Description:
-//
-//	Set the registers that are required by the C code in order to run on an
-//	MCA/INIT stack.
-//
-//	r2 contains the return address, r3 contains either
-//	IA64_MCA_CPU_MCA_STACK_OFFSET or IA64_MCA_CPU_INIT_STACK_OFFSET.
-//
-//--
-
-ia64_set_kernel_registers:
-	add temp3=MCA_SP_OFFSET, r3
-	mov b0=r2		// save return address
-	GET_IA64_MCA_DATA(temp1)
-	;;
-	add r12=temp1, temp3	// kernel stack pointer on MCA/INIT stack
-	add r13=temp1, r3	// set current to start of MCA/INIT stack
-	add r20=temp1, r3	// physical start of MCA/INIT stack
-	;;
-	DATA_PA_TO_VA(r12,temp2)
-	DATA_PA_TO_VA(r13,temp3)
-	;;
-	mov IA64_KR(CURRENT)=r13
-
-	/* Wire IA64_TR_CURRENT_STACK to the MCA/INIT handler stack.  To avoid
-	 * any dependencies on the algorithm in ia64_switch_to(), just purge
-	 * any existing CURRENT_STACK mapping and insert the new one.
-	 */
-
-	mov r16=IA64_KR(CURRENT_STACK)		// physical granule mapped by IA64_TR_CURRENT_STACK
-	;;
-	shl r16=r16,IA64_GRANULE_SHIFT
-	;;
-	dep r16=-1,r16,61,3			// virtual granule
-	mov r18=IA64_GRANULE_SHIFT<<2		// for cr.itir.ps
-	;;
-	ptr.d r16,r18
-	;;
-	srlz.d
-
-	shr.u r16=r20,IA64_GRANULE_SHIFT	// r20 = physical start of MCA/INIT stack
-	movl r21=PAGE_KERNEL			// page properties
-	;;
-	mov IA64_KR(CURRENT_STACK)=r16
-	or r21=r20,r21				// construct PA | page properties
-	;;
-	mov cr.itir=r18
-	mov cr.ifa=r13
-	mov r20=IA64_TR_CURRENT_STACK
-
-	movl r17=FPSR_DEFAULT
-	;;
-	mov.m ar.fpsr=r17			// set ar.fpsr to kernel default value
-	;;
-	itr.d dtr[r20]=r21
-	;;
-	srlz.d
-
-	br.sptk b0
-
-//EndStub//////////////////////////////////////////////////////////////////////
-
-#undef	ms
-#undef	regs
-#undef	temp1
-#undef	temp2
-#undef	temp3
-#undef	temp4
-
-
-// Support function for mca.c, it is here to avoid using inline asm.  Given the
-// address of an rnat slot, if that address is below the current ar.bspstore
-// then return the contents of that slot, otherwise return the contents of
-// ar.rnat.
-GLOBAL_ENTRY(ia64_get_rnat)
-	alloc r14=ar.pfs,1,0,0,0
-	mov ar.rsc=0
-	;;
-	mov r14=ar.bspstore
-	;;
-	cmp.lt p6,p7=in0,r14
-	;;
-(p6)	ld8 r8=[in0]
-(p7)	mov r8=ar.rnat
-	mov ar.rsc=3
-	br.ret.sptk.many rp
-END(ia64_get_rnat)
-
-
-// void ia64_set_psr_mc(void)
-//
-// Set psr.mc bit to mask MCA/INIT.
-GLOBAL_ENTRY(ia64_set_psr_mc)
-	rsm psr.i | psr.ic		// disable interrupts
-	;;
-	srlz.d
-	;;
-	mov r14 = psr			// get psr{36:35,31:0}
-	movl r15 = 1f
-	;;
-	dep r14 = -1, r14, PSR_MC, 1	// set psr.mc
-	;;
-	dep r14 = -1, r14, PSR_IC, 1	// set psr.ic
-	;;
-	dep r14 = -1, r14, PSR_BN, 1	// keep bank1 in use
-	;;
-	mov cr.ipsr = r14
-	mov cr.ifs = r0
-	mov cr.iip = r15
-	;;
-	rfi
-1:
-	br.ret.sptk.many rp
-END(ia64_set_psr_mc)
diff --git a/arch/ia64/kernel/mca_drv.c b/arch/ia64/kernel/mca_drv.c
deleted file mode 100644
index 23c203639a96..000000000000
--- a/arch/ia64/kernel/mca_drv.c
+++ /dev/null
@@ -1,796 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * File:	mca_drv.c
- * Purpose:	Generic MCA handling layer
- *
- * Copyright (C) 2004 FUJITSU LIMITED
- * Copyright (C) 2004 Hidetoshi Seto <seto.hidetoshi@jp.fujitsu.com>
- * Copyright (C) 2005 Silicon Graphics, Inc
- * Copyright (C) 2005 Keith Owens <kaos@sgi.com>
- * Copyright (C) 2006 Russ Anderson <rja@sgi.com>
- */
-#include <linux/types.h>
-#include <linux/init.h>
-#include <linux/sched.h>
-#include <linux/interrupt.h>
-#include <linux/irq.h>
-#include <linux/kallsyms.h>
-#include <linux/memblock.h>
-#include <linux/acpi.h>
-#include <linux/timer.h>
-#include <linux/module.h>
-#include <linux/kernel.h>
-#include <linux/smp.h>
-#include <linux/workqueue.h>
-#include <linux/mm.h>
-#include <linux/slab.h>
-
-#include <asm/delay.h>
-#include <asm/page.h>
-#include <asm/ptrace.h>
-#include <asm/sal.h>
-#include <asm/mca.h>
-
-#include <asm/irq.h>
-#include <asm/hw_irq.h>
-
-#include "mca_drv.h"
-
-/* max size of SAL error record (default) */
-static int sal_rec_max = 10000;
-
-/* from mca_drv_asm.S */
-extern void *mca_handler_bhhook(void);
-
-static DEFINE_SPINLOCK(mca_bh_lock);
-
-typedef enum {
-	MCA_IS_LOCAL  = 0,
-	MCA_IS_GLOBAL = 1
-} mca_type_t;
-
-#define MAX_PAGE_ISOLATE 1024
-
-static struct page *page_isolate[MAX_PAGE_ISOLATE];
-static int num_page_isolate = 0;
-
-typedef enum {
-	ISOLATE_NG,
-	ISOLATE_OK,
-	ISOLATE_NONE
-} isolate_status_t;
-
-typedef enum {
-	MCA_NOT_RECOVERED = 0,
-	MCA_RECOVERED	  = 1
-} recovery_status_t;
-
-/*
- *  This pool keeps pointers to the section part of SAL error record
- */
-static struct {
-	slidx_list_t *buffer; /* section pointer list pool */
-	int	     cur_idx; /* Current index of section pointer list pool */
-	int	     max_idx; /* Maximum index of section pointer list pool */
-} slidx_pool;
-
-static int
-fatal_mca(const char *fmt, ...)
-{
-	va_list args;
-	char buf[256];
-
-	va_start(args, fmt);
-	vsnprintf(buf, sizeof(buf), fmt, args);
-	va_end(args);
-	ia64_mca_printk(KERN_ALERT "MCA: %s\n", buf);
-
-	return MCA_NOT_RECOVERED;
-}
-
-static int
-mca_recovered(const char *fmt, ...)
-{
-	va_list args;
-	char buf[256];
-
-	va_start(args, fmt);
-	vsnprintf(buf, sizeof(buf), fmt, args);
-	va_end(args);
-	ia64_mca_printk(KERN_INFO "MCA: %s\n", buf);
-
-	return MCA_RECOVERED;
-}
-
-/**
- * mca_page_isolate - isolate a poisoned page in order not to use it later
- * @paddr:	poisoned memory location
- *
- * Return value:
- *	one of isolate_status_t, ISOLATE_OK/NG/NONE.
- */
-
-static isolate_status_t
-mca_page_isolate(unsigned long paddr)
-{
-	int i;
-	struct page *p;
-
-	/* whether physical address is valid or not */
-	if (!ia64_phys_addr_valid(paddr))
-		return ISOLATE_NONE;
-
-	if (!pfn_valid(paddr >> PAGE_SHIFT))
-		return ISOLATE_NONE;
-
-	/* convert physical address to physical page number */
-	p = pfn_to_page(paddr>>PAGE_SHIFT);
-
-	/* check whether a page number have been already registered or not */
-	for (i = 0; i < num_page_isolate; i++)
-		if (page_isolate[i] == p)
-			return ISOLATE_OK; /* already listed */
-
-	/* limitation check */
-	if (num_page_isolate == MAX_PAGE_ISOLATE)
-		return ISOLATE_NG;
-
-	/* kick pages having attribute 'SLAB' or 'Reserved' */
-	if (PageSlab(p) || PageReserved(p))
-		return ISOLATE_NG;
-
-	/* add attribute 'Reserved' and register the page */
-	get_page(p);
-	SetPageReserved(p);
-	page_isolate[num_page_isolate++] = p;
-
-	return ISOLATE_OK;
-}
-
-/**
- * mca_hanlder_bh - Kill the process which occurred memory read error
- * @paddr:	poisoned address received from MCA Handler
- */
-
-void
-mca_handler_bh(unsigned long paddr, void *iip, unsigned long ipsr)
-{
-	ia64_mlogbuf_dump();
-	printk(KERN_ERR "OS_MCA: process [cpu %d, pid: %d, uid: %d, "
-		"iip: %p, psr: 0x%lx,paddr: 0x%lx](%s) encounters MCA.\n",
-	       raw_smp_processor_id(), current->pid,
-		from_kuid(&init_user_ns, current_uid()),
-		iip, ipsr, paddr, current->comm);
-
-	spin_lock(&mca_bh_lock);
-	switch (mca_page_isolate(paddr)) {
-	case ISOLATE_OK:
-		printk(KERN_DEBUG "Page isolation: ( %lx ) success.\n", paddr);
-		break;
-	case ISOLATE_NG:
-		printk(KERN_CRIT "Page isolation: ( %lx ) failure.\n", paddr);
-		break;
-	default:
-		break;
-	}
-	spin_unlock(&mca_bh_lock);
-
-	/* This process is about to be killed itself */
-	make_task_dead(SIGKILL);
-}
-
-/**
- * mca_make_peidx - Make index of processor error section
- * @slpi:	pointer to record of processor error section
- * @peidx:	pointer to index of processor error section
- */
-
-static void
-mca_make_peidx(sal_log_processor_info_t *slpi, peidx_table_t *peidx)
-{
-	/*
-	 * calculate the start address of
-	 *   "struct cpuid_info" and "sal_processor_static_info_t".
-	 */
-	u64 total_check_num = slpi->valid.num_cache_check
-				+ slpi->valid.num_tlb_check
-				+ slpi->valid.num_bus_check
-				+ slpi->valid.num_reg_file_check
-				+ slpi->valid.num_ms_check;
-	u64 head_size =	sizeof(sal_log_mod_error_info_t) * total_check_num
-			+ sizeof(sal_log_processor_info_t);
-	u64 mid_size  = slpi->valid.cpuid_info * sizeof(struct sal_cpuid_info);
-
-	peidx_head(peidx)   = slpi;
-	peidx_mid(peidx)    = (struct sal_cpuid_info *)
-		(slpi->valid.cpuid_info ? ((char*)slpi + head_size) : NULL);
-	peidx_bottom(peidx) = (sal_processor_static_info_t *)
-		(slpi->valid.psi_static_struct ?
-			((char*)slpi + head_size + mid_size) : NULL);
-}
-
-/**
- * mca_make_slidx -  Make index of SAL error record
- * @buffer:	pointer to SAL error record
- * @slidx:	pointer to index of SAL error record
- *
- * Return value:
- *	1 if record has platform error / 0 if not
- */
-#define LOG_INDEX_ADD_SECT_PTR(sect, ptr) \
-	{slidx_list_t *hl = &slidx_pool.buffer[slidx_pool.cur_idx]; \
-	hl->hdr = ptr; \
-	list_add(&hl->list, &(sect)); \
-	slidx_pool.cur_idx = (slidx_pool.cur_idx + 1)%slidx_pool.max_idx; }
-
-static int
-mca_make_slidx(void *buffer, slidx_table_t *slidx)
-{
-	int platform_err = 0;
-	int record_len = ((sal_log_record_header_t*)buffer)->len;
-	u32 ercd_pos;
-	int sects;
-	sal_log_section_hdr_t *sp;
-
-	/*
-	 * Initialize index referring current record
-	 */
-	INIT_LIST_HEAD(&(slidx->proc_err));
-	INIT_LIST_HEAD(&(slidx->mem_dev_err));
-	INIT_LIST_HEAD(&(slidx->sel_dev_err));
-	INIT_LIST_HEAD(&(slidx->pci_bus_err));
-	INIT_LIST_HEAD(&(slidx->smbios_dev_err));
-	INIT_LIST_HEAD(&(slidx->pci_comp_err));
-	INIT_LIST_HEAD(&(slidx->plat_specific_err));
-	INIT_LIST_HEAD(&(slidx->host_ctlr_err));
-	INIT_LIST_HEAD(&(slidx->plat_bus_err));
-	INIT_LIST_HEAD(&(slidx->unsupported));
-
-	/*
-	 * Extract a Record Header
-	 */
-	slidx->header = buffer;
-
-	/*
-	 * Extract each section records
-	 * (arranged from "int ia64_log_platform_info_print()")
-	 */
-	for (ercd_pos = sizeof(sal_log_record_header_t), sects = 0;
-		ercd_pos < record_len; ercd_pos += sp->len, sects++) {
-		sp = (sal_log_section_hdr_t *)((char*)buffer + ercd_pos);
-		if (!efi_guidcmp(sp->guid, SAL_PROC_DEV_ERR_SECT_GUID)) {
-			LOG_INDEX_ADD_SECT_PTR(slidx->proc_err, sp);
-		} else if (!efi_guidcmp(sp->guid,
-				SAL_PLAT_MEM_DEV_ERR_SECT_GUID)) {
-			platform_err = 1;
-			LOG_INDEX_ADD_SECT_PTR(slidx->mem_dev_err, sp);
-		} else if (!efi_guidcmp(sp->guid,
-				SAL_PLAT_SEL_DEV_ERR_SECT_GUID)) {
-			platform_err = 1;
-			LOG_INDEX_ADD_SECT_PTR(slidx->sel_dev_err, sp);
-		} else if (!efi_guidcmp(sp->guid,
-				SAL_PLAT_PCI_BUS_ERR_SECT_GUID)) {
-			platform_err = 1;
-			LOG_INDEX_ADD_SECT_PTR(slidx->pci_bus_err, sp);
-		} else if (!efi_guidcmp(sp->guid,
-				SAL_PLAT_SMBIOS_DEV_ERR_SECT_GUID)) {
-			platform_err = 1;
-			LOG_INDEX_ADD_SECT_PTR(slidx->smbios_dev_err, sp);
-		} else if (!efi_guidcmp(sp->guid,
-				SAL_PLAT_PCI_COMP_ERR_SECT_GUID)) {
-			platform_err = 1;
-			LOG_INDEX_ADD_SECT_PTR(slidx->pci_comp_err, sp);
-		} else if (!efi_guidcmp(sp->guid,
-				SAL_PLAT_SPECIFIC_ERR_SECT_GUID)) {
-			platform_err = 1;
-			LOG_INDEX_ADD_SECT_PTR(slidx->plat_specific_err, sp);
-		} else if (!efi_guidcmp(sp->guid,
-				SAL_PLAT_HOST_CTLR_ERR_SECT_GUID)) {
-			platform_err = 1;
-			LOG_INDEX_ADD_SECT_PTR(slidx->host_ctlr_err, sp);
-		} else if (!efi_guidcmp(sp->guid,
-				SAL_PLAT_BUS_ERR_SECT_GUID)) {
-			platform_err = 1;
-			LOG_INDEX_ADD_SECT_PTR(slidx->plat_bus_err, sp);
-		} else {
-			LOG_INDEX_ADD_SECT_PTR(slidx->unsupported, sp);
-		}
-	}
-	slidx->n_sections = sects;
-
-	return platform_err;
-}
-
-/**
- * init_record_index_pools - Initialize pool of lists for SAL record index
- *
- * Return value:
- *	0 on Success / -ENOMEM on Failure
- */
-static int
-init_record_index_pools(void)
-{
-	int i;
-	int rec_max_size;  /* Maximum size of SAL error records */
-	int sect_min_size; /* Minimum size of SAL error sections */
-	/* minimum size table of each section */
-	static int sal_log_sect_min_sizes[] = {
-		sizeof(sal_log_processor_info_t)
-		+ sizeof(sal_processor_static_info_t),
-		sizeof(sal_log_mem_dev_err_info_t),
-		sizeof(sal_log_sel_dev_err_info_t),
-		sizeof(sal_log_pci_bus_err_info_t),
-		sizeof(sal_log_smbios_dev_err_info_t),
-		sizeof(sal_log_pci_comp_err_info_t),
-		sizeof(sal_log_plat_specific_err_info_t),
-		sizeof(sal_log_host_ctlr_err_info_t),
-		sizeof(sal_log_plat_bus_err_info_t),
-	};
-
-	/*
-	 * MCA handler cannot allocate new memory on flight,
-	 * so we preallocate enough memory to handle a SAL record.
-	 *
-	 * Initialize a handling set of slidx_pool:
-	 *   1. Pick up the max size of SAL error records
-	 *   2. Pick up the min size of SAL error sections
-	 *   3. Allocate the pool as enough to 2 SAL records
-	 *     (now we can estimate the maxinum of section in a record.)
-	 */
-
-	/* - 1 - */
-	rec_max_size = sal_rec_max;
-
-	/* - 2 - */
-	sect_min_size = sal_log_sect_min_sizes[0];
-	for (i = 1; i < ARRAY_SIZE(sal_log_sect_min_sizes); i++)
-		if (sect_min_size > sal_log_sect_min_sizes[i])
-			sect_min_size = sal_log_sect_min_sizes[i];
-
-	/* - 3 - */
-	slidx_pool.max_idx = (rec_max_size/sect_min_size) * 2 + 1;
-	slidx_pool.buffer =
-		kmalloc_array(slidx_pool.max_idx, sizeof(slidx_list_t),
-			      GFP_KERNEL);
-
-	return slidx_pool.buffer ? 0 : -ENOMEM;
-}
-
-
-/*****************************************************************************
- * Recovery functions                                                        *
- *****************************************************************************/
-
-/**
- * is_mca_global - Check whether this MCA is global or not
- * @peidx:	pointer of index of processor error section
- * @pbci:	pointer to pal_bus_check_info_t
- * @sos:	pointer to hand off struct between SAL and OS
- *
- * Return value:
- *	MCA_IS_LOCAL / MCA_IS_GLOBAL
- */
-
-static mca_type_t
-is_mca_global(peidx_table_t *peidx, pal_bus_check_info_t *pbci,
-	      struct ia64_sal_os_state *sos)
-{
-	pal_processor_state_info_t *psp =
-		(pal_processor_state_info_t*)peidx_psp(peidx);
-
-	/*
-	 * PAL can request a rendezvous, if the MCA has a global scope.
-	 * If "rz_always" flag is set, SAL requests MCA rendezvous
-	 * in spite of global MCA.
-	 * Therefore it is local MCA when rendezvous has not been requested.
-	 * Failed to rendezvous, the system must be down.
-	 */
-	switch (sos->rv_rc) {
-		case -1: /* SAL rendezvous unsuccessful */
-			return MCA_IS_GLOBAL;
-		case  0: /* SAL rendezvous not required */
-			return MCA_IS_LOCAL;
-		case  1: /* SAL rendezvous successful int */
-		case  2: /* SAL rendezvous successful int with init */
-		default:
-			break;
-	}
-
-	/*
-	 * If One or more Cache/TLB/Reg_File/Uarch_Check is here,
-	 * it would be a local MCA. (i.e. processor internal error)
-	 */
-	if (psp->tc || psp->cc || psp->rc || psp->uc)
-		return MCA_IS_LOCAL;
-	
-	/*
-	 * Bus_Check structure with Bus_Check.ib (internal bus error) flag set
-	 * would be a global MCA. (e.g. a system bus address parity error)
-	 */
-	if (!pbci || pbci->ib)
-		return MCA_IS_GLOBAL;
-
-	/*
-	 * Bus_Check structure with Bus_Check.eb (external bus error) flag set
-	 * could be either a local MCA or a global MCA.
-	 *
-	 * Referring Bus_Check.bsi:
-	 *   0: Unknown/unclassified
-	 *   1: BERR#
-	 *   2: BINIT#
-	 *   3: Hard Fail
-	 * (FIXME: Are these SGI specific or generic bsi values?)
-	 */
-	if (pbci->eb)
-		switch (pbci->bsi) {
-			case 0:
-				/* e.g. a load from poisoned memory */
-				return MCA_IS_LOCAL;
-			case 1:
-			case 2:
-			case 3:
-				return MCA_IS_GLOBAL;
-		}
-
-	return MCA_IS_GLOBAL;
-}
-
-/**
- * get_target_identifier - Get the valid Cache or Bus check target identifier.
- * @peidx:	pointer of index of processor error section
- *
- * Return value:
- *	target address on Success / 0 on Failure
- */
-static u64
-get_target_identifier(peidx_table_t *peidx)
-{
-	u64 target_address = 0;
-	sal_log_mod_error_info_t *smei;
-	pal_cache_check_info_t *pcci;
-	int i, level = 9;
-
-	/*
-	 * Look through the cache checks for a valid target identifier
-	 * If more than one valid target identifier, return the one
-	 * with the lowest cache level.
-	 */
-	for (i = 0; i < peidx_cache_check_num(peidx); i++) {
-		smei = (sal_log_mod_error_info_t *)peidx_cache_check(peidx, i);
-		if (smei->valid.target_identifier && smei->target_identifier) {
-			pcci = (pal_cache_check_info_t *)&(smei->check_info);
-			if (!target_address || (pcci->level < level)) {
-				target_address = smei->target_identifier;
-				level = pcci->level;
-				continue;
-			}
-		}
-	}
-	if (target_address)
-		return target_address;
-
-	/*
-	 * Look at the bus check for a valid target identifier
-	 */
-	smei = peidx_bus_check(peidx, 0);
-	if (smei && smei->valid.target_identifier)
-		return smei->target_identifier;
-
-	return 0;
-}
-
-/**
- * recover_from_read_error - Try to recover the errors which type are "read"s.
- * @slidx:	pointer of index of SAL error record
- * @peidx:	pointer of index of processor error section
- * @pbci:	pointer of pal_bus_check_info
- * @sos:	pointer to hand off struct between SAL and OS
- *
- * Return value:
- *	1 on Success / 0 on Failure
- */
-
-static int
-recover_from_read_error(slidx_table_t *slidx,
-			peidx_table_t *peidx, pal_bus_check_info_t *pbci,
-			struct ia64_sal_os_state *sos)
-{
-	u64 target_identifier;
-	struct pal_min_state_area *pmsa;
-	struct ia64_psr *psr1, *psr2;
-	ia64_fptr_t *mca_hdlr_bh = (ia64_fptr_t*)mca_handler_bhhook;
-
-	/* Is target address valid? */
-	target_identifier = get_target_identifier(peidx);
-	if (!target_identifier)
-		return fatal_mca("target address not valid");
-
-	/*
-	 * cpu read or memory-mapped io read
-	 *
-	 *    offending process  affected process  OS MCA do
-	 *     kernel mode        kernel mode       down system
-	 *     kernel mode        user   mode       kill the process
-	 *     user   mode        kernel mode       down system (*)
-	 *     user   mode        user   mode       kill the process
-	 *
-	 * (*) You could terminate offending user-mode process
-	 *    if (pbci->pv && pbci->pl != 0) *and* if you sure
-	 *    the process not have any locks of kernel.
-	 */
-
-	/* Is minstate valid? */
-	if (!peidx_bottom(peidx) || !(peidx_bottom(peidx)->valid.minstate))
-		return fatal_mca("minstate not valid");
-	psr1 =(struct ia64_psr *)&(peidx_minstate_area(peidx)->pmsa_ipsr);
-	psr2 =(struct ia64_psr *)&(peidx_minstate_area(peidx)->pmsa_xpsr);
-
-	/*
-	 *  Check the privilege level of interrupted context.
-	 *   If it is user-mode, then terminate affected process.
-	 */
-
-	pmsa = sos->pal_min_state;
-	if (psr1->cpl != 0 ||
-	   ((psr2->cpl != 0) && mca_recover_range(pmsa->pmsa_iip))) {
-		/*
-		 *  setup for resume to bottom half of MCA,
-		 * "mca_handler_bhhook"
-		 */
-		/* pass to bhhook as argument (gr8, ...) */
-		pmsa->pmsa_gr[8-1] = target_identifier;
-		pmsa->pmsa_gr[9-1] = pmsa->pmsa_iip;
-		pmsa->pmsa_gr[10-1] = pmsa->pmsa_ipsr;
-		/* set interrupted return address (but no use) */
-		pmsa->pmsa_br0 = pmsa->pmsa_iip;
-		/* change resume address to bottom half */
-		pmsa->pmsa_iip = mca_hdlr_bh->fp;
-		pmsa->pmsa_gr[1-1] = mca_hdlr_bh->gp;
-		/* set cpl with kernel mode */
-		psr2 = (struct ia64_psr *)&pmsa->pmsa_ipsr;
-		psr2->cpl = 0;
-		psr2->ri  = 0;
-		psr2->bn  = 1;
-		psr2->i  = 0;
-
-		return mca_recovered("user memory corruption. "
-				"kill affected process - recovered.");
-	}
-
-	return fatal_mca("kernel context not recovered, iip 0x%lx\n",
-			 pmsa->pmsa_iip);
-}
-
-/**
- * recover_from_platform_error - Recover from platform error.
- * @slidx:	pointer of index of SAL error record
- * @peidx:	pointer of index of processor error section
- * @pbci:	pointer of pal_bus_check_info
- * @sos:	pointer to hand off struct between SAL and OS
- *
- * Return value:
- *	1 on Success / 0 on Failure
- */
-
-static int
-recover_from_platform_error(slidx_table_t *slidx, peidx_table_t *peidx,
-			    pal_bus_check_info_t *pbci,
-			    struct ia64_sal_os_state *sos)
-{
-	int status = 0;
-	pal_processor_state_info_t *psp =
-		(pal_processor_state_info_t*)peidx_psp(peidx);
-
-	if (psp->bc && pbci->eb && pbci->bsi == 0) {
-		switch(pbci->type) {
-		case 1: /* partial read */
-		case 3: /* full line(cpu) read */
-		case 9: /* I/O space read */
-			status = recover_from_read_error(slidx, peidx, pbci,
-							 sos);
-			break;
-		case 0: /* unknown */
-		case 2: /* partial write */
-		case 4: /* full line write */
-		case 5: /* implicit or explicit write-back operation */
-		case 6: /* snoop probe */
-		case 7: /* incoming or outgoing ptc.g */
-		case 8: /* write coalescing transactions */
-		case 10: /* I/O space write */
-		case 11: /* inter-processor interrupt message(IPI) */
-		case 12: /* interrupt acknowledge or
-				external task priority cycle */
-		default:
-			break;
-		}
-	} else if (psp->cc && !psp->bc) {	/* Cache error */
-		status = recover_from_read_error(slidx, peidx, pbci, sos);
-	}
-
-	return status;
-}
-
-/*
- * recover_from_tlb_check
- * @peidx:	pointer of index of processor error section
- *
- * Return value:
- *	1 on Success / 0 on Failure
- */
-static int
-recover_from_tlb_check(peidx_table_t *peidx)
-{
-	sal_log_mod_error_info_t *smei;
-	pal_tlb_check_info_t *ptci;
-
-	smei = (sal_log_mod_error_info_t *)peidx_tlb_check(peidx, 0);
-	ptci = (pal_tlb_check_info_t *)&(smei->check_info);
-
-	/*
-	 * Look for signature of a duplicate TLB DTC entry, which is
-	 * a SW bug and always fatal.
-	 */
-	if (ptci->op == PAL_TLB_CHECK_OP_PURGE
-	    && !(ptci->itr || ptci->dtc || ptci->itc))
-		return fatal_mca("Duplicate TLB entry");
-
-	return mca_recovered("TLB check recovered");
-}
-
-/**
- * recover_from_processor_error
- * @platform:	whether there are some platform error section or not
- * @slidx:	pointer of index of SAL error record
- * @peidx:	pointer of index of processor error section
- * @pbci:	pointer of pal_bus_check_info
- * @sos:	pointer to hand off struct between SAL and OS
- *
- * Return value:
- *	1 on Success / 0 on Failure
- */
-
-static int
-recover_from_processor_error(int platform, slidx_table_t *slidx,
-			     peidx_table_t *peidx, pal_bus_check_info_t *pbci,
-			     struct ia64_sal_os_state *sos)
-{
-	pal_processor_state_info_t *psp =
-		(pal_processor_state_info_t*)peidx_psp(peidx);
-
-	/*
-	 * Processor recovery status must key off of the PAL recovery
-	 * status in the Processor State Parameter.
-	 */
-
-	/*
-	 * The machine check is corrected.
-	 */
-	if (psp->cm == 1)
-		return mca_recovered("machine check is already corrected.");
-
-	/*
-	 * The error was not contained.  Software must be reset.
-	 */
-	if (psp->us || psp->ci == 0)
-		return fatal_mca("error not contained");
-
-	/*
-	 * Look for recoverable TLB check
-	 */
-	if (psp->tc && !(psp->cc || psp->bc || psp->rc || psp->uc))
-		return recover_from_tlb_check(peidx);
-
-	/*
-	 * The cache check and bus check bits have four possible states
-	 *   cc bc
-	 *    1  1	Memory error, attempt recovery
-	 *    1  0	Cache error, attempt recovery
-	 *    0  1	I/O error, attempt recovery
-	 *    0  0	Other error type, not recovered
-	 */
-	if (psp->cc == 0 && (psp->bc == 0 || pbci == NULL))
-		return fatal_mca("No cache or bus check");
-
-	/*
-	 * Cannot handle more than one bus check.
-	 */
-	if (peidx_bus_check_num(peidx) > 1)
-		return fatal_mca("Too many bus checks");
-
-	if (pbci->ib)
-		return fatal_mca("Internal Bus error");
-	if (pbci->eb && pbci->bsi > 0)
-		return fatal_mca("External bus check fatal status");
-
-	/*
-	 * This is a local MCA and estimated as a recoverable error.
-	 */
-	if (platform)
-		return recover_from_platform_error(slidx, peidx, pbci, sos);
-
-	/*
-	 * On account of strange SAL error record, we cannot recover.
-	 */
-	return fatal_mca("Strange SAL record");
-}
-
-/**
- * mca_try_to_recover - Try to recover from MCA
- * @rec:	pointer to a SAL error record
- * @sos:	pointer to hand off struct between SAL and OS
- *
- * Return value:
- *	1 on Success / 0 on Failure
- */
-
-static int
-mca_try_to_recover(void *rec, struct ia64_sal_os_state *sos)
-{
-	int platform_err;
-	int n_proc_err;
-	slidx_table_t slidx;
-	peidx_table_t peidx;
-	pal_bus_check_info_t pbci;
-
-	/* Make index of SAL error record */
-	platform_err = mca_make_slidx(rec, &slidx);
-
-	/* Count processor error sections */
-	n_proc_err = slidx_count(&slidx, proc_err);
-
-	 /* Now, OS can recover when there is one processor error section */
-	if (n_proc_err > 1)
-		return fatal_mca("Too Many Errors");
-	else if (n_proc_err == 0)
-		/* Weird SAL record ... We can't do anything */
-		return fatal_mca("Weird SAL record");
-
-	/* Make index of processor error section */
-	mca_make_peidx((sal_log_processor_info_t*)
-		slidx_first_entry(&slidx.proc_err)->hdr, &peidx);
-
-	/* Extract Processor BUS_CHECK[0] */
-	*((u64*)&pbci) = peidx_check_info(&peidx, bus_check, 0);
-
-	/* Check whether MCA is global or not */
-	if (is_mca_global(&peidx, &pbci, sos))
-		return fatal_mca("global MCA");
-	
-	/* Try to recover a processor error */
-	return recover_from_processor_error(platform_err, &slidx, &peidx,
-					    &pbci, sos);
-}
-
-/*
- * =============================================================================
- */
-
-int __init mca_external_handler_init(void)
-{
-	if (init_record_index_pools())
-		return -ENOMEM;
-
-	/* register external mca handlers */
-	if (ia64_reg_MCA_extension(mca_try_to_recover)) {	
-		printk(KERN_ERR "ia64_reg_MCA_extension failed.\n");
-		kfree(slidx_pool.buffer);
-		return -EFAULT;
-	}
-	return 0;
-}
-
-void __exit mca_external_handler_exit(void)
-{
-	/* unregister external mca handlers */
-	ia64_unreg_MCA_extension();
-	kfree(slidx_pool.buffer);
-}
-
-module_init(mca_external_handler_init);
-module_exit(mca_external_handler_exit);
-
-module_param(sal_rec_max, int, 0644);
-MODULE_PARM_DESC(sal_rec_max, "Max size of SAL error record");
-
-MODULE_DESCRIPTION("ia64 platform dependent mca handler driver");
-MODULE_LICENSE("GPL");
diff --git a/arch/ia64/kernel/mca_drv.h b/arch/ia64/kernel/mca_drv.h
deleted file mode 100644
index 45bc4e3ae14f..000000000000
--- a/arch/ia64/kernel/mca_drv.h
+++ /dev/null
@@ -1,123 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-/*
- * File:	mca_drv.h
- * Purpose:	Define helpers for Generic MCA handling
- *
- * Copyright (C) 2004 FUJITSU LIMITED
- * Copyright (C) 2004 Hidetoshi Seto <seto.hidetoshi@jp.fujitsu.com>
- */
-/*
- * Processor error section:
- *
- *  +-sal_log_processor_info_t *info-------------+
- *  | sal_log_section_hdr_t header;              |
- *  | ...                                        |
- *  | sal_log_mod_error_info_t info[0];          |
- *  +-+----------------+-------------------------+
- *    | CACHE_CHECK    |  ^ num_cache_check v
- *    +----------------+
- *    | TLB_CHECK      |  ^ num_tlb_check v
- *    +----------------+
- *    | BUS_CHECK      |  ^ num_bus_check v
- *    +----------------+
- *    | REG_FILE_CHECK |  ^ num_reg_file_check v
- *    +----------------+
- *    | MS_CHECK       |  ^ num_ms_check v
- *  +-struct cpuid_info *id----------------------+
- *  | regs[5];                                   |
- *  | reserved;                                  |
- *  +-sal_processor_static_info_t *regs----------+
- *  | valid;                                     |
- *  | ...                                        |
- *  | fr[128];                                   |
- *  +--------------------------------------------+
- */
-
-/* peidx: index of processor error section */
-typedef struct peidx_table {
-	sal_log_processor_info_t        *info;
-	struct sal_cpuid_info           *id;
-	sal_processor_static_info_t     *regs;
-} peidx_table_t;
-
-#define peidx_head(p)   (((p)->info))
-#define peidx_mid(p)    (((p)->id))
-#define peidx_bottom(p) (((p)->regs))
-
-#define peidx_psp(p)           (&(peidx_head(p)->proc_state_parameter))
-#define peidx_field_valid(p)   (&(peidx_head(p)->valid))
-#define peidx_minstate_area(p) (&(peidx_bottom(p)->min_state_area))
-
-#define peidx_cache_check_num(p)    (peidx_head(p)->valid.num_cache_check)
-#define peidx_tlb_check_num(p)      (peidx_head(p)->valid.num_tlb_check)
-#define peidx_bus_check_num(p)      (peidx_head(p)->valid.num_bus_check)
-#define peidx_reg_file_check_num(p) (peidx_head(p)->valid.num_reg_file_check)
-#define peidx_ms_check_num(p)       (peidx_head(p)->valid.num_ms_check)
-
-#define peidx_cache_check_idx(p, n)    (n)
-#define peidx_tlb_check_idx(p, n)      (peidx_cache_check_idx(p, peidx_cache_check_num(p)) + n)
-#define peidx_bus_check_idx(p, n)      (peidx_tlb_check_idx(p, peidx_tlb_check_num(p)) + n)
-#define peidx_reg_file_check_idx(p, n) (peidx_bus_check_idx(p, peidx_bus_check_num(p)) + n)
-#define peidx_ms_check_idx(p, n)       (peidx_reg_file_check_idx(p, peidx_reg_file_check_num(p)) + n)
-
-#define peidx_mod_error_info(p, name, n) \
-({	int __idx = peidx_##name##_idx(p, n); \
-	sal_log_mod_error_info_t *__ret = NULL; \
-	if (peidx_##name##_num(p) > n) /*BUG*/ \
-		__ret = &(peidx_head(p)->info[__idx]); \
-	__ret; })
-
-#define peidx_cache_check(p, n)    peidx_mod_error_info(p, cache_check, n)
-#define peidx_tlb_check(p, n)      peidx_mod_error_info(p, tlb_check, n)
-#define peidx_bus_check(p, n)      peidx_mod_error_info(p, bus_check, n)
-#define peidx_reg_file_check(p, n) peidx_mod_error_info(p, reg_file_check, n)
-#define peidx_ms_check(p, n)       peidx_mod_error_info(p, ms_check, n)
-
-#define peidx_check_info(proc, name, n) \
-({ \
-	sal_log_mod_error_info_t *__info = peidx_mod_error_info(proc, name, n);\
-	u64 __temp = __info && __info->valid.check_info \
-		? __info->check_info : 0; \
-	__temp; })
-
-/* slidx: index of SAL log error record */
-
-typedef struct slidx_list {
-	struct list_head list;
-	sal_log_section_hdr_t *hdr;
-} slidx_list_t;
-
-typedef struct slidx_table {
-	sal_log_record_header_t *header;
-	int n_sections;			/* # of section headers */
-	struct list_head proc_err;
-	struct list_head mem_dev_err;
-	struct list_head sel_dev_err;
-	struct list_head pci_bus_err;
-	struct list_head smbios_dev_err;
-	struct list_head pci_comp_err;
-	struct list_head plat_specific_err;
-	struct list_head host_ctlr_err;
-	struct list_head plat_bus_err;
-	struct list_head unsupported;	/* list of unsupported sections */
-} slidx_table_t;
-
-#define slidx_foreach_entry(pos, head) \
-	list_for_each_entry(pos, head, list)
-#define slidx_first_entry(head) \
-	(((head)->next != (head)) ? list_entry((head)->next, typeof(slidx_list_t), list) : NULL)
-#define slidx_count(slidx, sec) \
-({	int __count = 0; \
-	slidx_list_t *__pos; \
-	slidx_foreach_entry(__pos, &((slidx)->sec)) { __count++; }\
-	__count; })
-
-struct mca_table_entry {
-	int start_addr;	/* location-relative starting address of MCA recoverable range */
-	int end_addr;	/* location-relative ending address of MCA recoverable range */
-};
-
-extern const struct mca_table_entry *search_mca_tables (unsigned long addr);
-extern int mca_recover_range(unsigned long);
-extern void ia64_mlogbuf_dump(void);
-
diff --git a/arch/ia64/kernel/mca_drv_asm.S b/arch/ia64/kernel/mca_drv_asm.S
deleted file mode 100644
index 4428f57bee73..000000000000
--- a/arch/ia64/kernel/mca_drv_asm.S
+++ /dev/null
@@ -1,56 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-/*
- * File:        mca_drv_asm.S
- * Purpose:     Assembly portion of Generic MCA handling
- *
- * Copyright (C) 2004 FUJITSU LIMITED
- * Copyright (C) 2004 Hidetoshi Seto <seto.hidetoshi@jp.fujitsu.com>
- */
-#include <linux/threads.h>
-
-#include <asm/asmmacro.h>
-#include <asm/processor.h>
-#include <asm/ptrace.h>
-
-GLOBAL_ENTRY(mca_handler_bhhook)
-	invala				// clear RSE ?
-	cover
-	;;
-	clrrrb
-	;;						
-	alloc	r16=ar.pfs,0,2,3,0	// make a new frame
-	mov	ar.rsc=0
-	mov	r13=IA64_KR(CURRENT)	// current task pointer
-	;;
-	mov	r2=r13
-	;;
-	addl	r22=IA64_RBS_OFFSET,r2
-	;;
-	mov	ar.bspstore=r22
-	addl	sp=IA64_STK_OFFSET-IA64_PT_REGS_SIZE,r2
-	;;
-	adds	r2=IA64_TASK_THREAD_ON_USTACK_OFFSET,r13
-	;;
-	st1	[r2]=r0		// clear current->thread.on_ustack flag
-	mov	loc0=r16
-	movl	loc1=mca_handler_bh	// recovery C function
-	;;
-	mov	out0=r8			// poisoned address
-	mov	out1=r9			// iip
-	mov	out2=r10		// psr
-	mov	b6=loc1
-	;;
-	mov	loc1=rp
-	ssm	psr.ic
-	;;
-	srlz.i
-	;;
-	ssm	psr.i
-	br.call.sptk.many rp=b6		// does not return ...
-	;;
-	mov	ar.pfs=loc0
-	mov 	rp=loc1
-	;;
-	mov	r8=r0
-	br.ret.sptk.many rp
-END(mca_handler_bhhook)
diff --git a/arch/ia64/kernel/minstate.h b/arch/ia64/kernel/minstate.h
deleted file mode 100644
index d6eab2a1084d..000000000000
--- a/arch/ia64/kernel/minstate.h
+++ /dev/null
@@ -1,251 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-
-#include <asm/cache.h>
-
-#include "entry.h"
-#include <asm/native/inst.h>
-
-#ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
-/* read ar.itc in advance, and use it before leaving bank 0 */
-#define ACCOUNT_GET_STAMP				\
-(pUStk) mov.m r20=ar.itc;
-#define ACCOUNT_SYS_ENTER				\
-(pUStk) br.call.spnt rp=account_sys_enter		\
-	;;
-#else
-#define ACCOUNT_GET_STAMP
-#define ACCOUNT_SYS_ENTER
-#endif
-
-.section ".data..patch.rse", "a"
-.previous
-
-/*
- * DO_SAVE_MIN switches to the kernel stacks (if necessary) and saves
- * the minimum state necessary that allows us to turn psr.ic back
- * on.
- *
- * Assumed state upon entry:
- *	psr.ic: off
- *	r31:	contains saved predicates (pr)
- *
- * Upon exit, the state is as follows:
- *	psr.ic: off
- *	 r2 = points to &pt_regs.r16
- *	 r8 = contents of ar.ccv
- *	 r9 = contents of ar.csd
- *	r10 = contents of ar.ssd
- *	r11 = FPSR_DEFAULT
- *	r12 = kernel sp (kernel virtual address)
- *	r13 = points to current task_struct (kernel virtual address)
- *	p15 = TRUE if psr.i is set in cr.ipsr
- *	predicate registers (other than p2, p3, and p15), b6, r3, r14, r15:
- *		preserved
- *
- * Note that psr.ic is NOT turned on by this macro.  This is so that
- * we can pass interruption state as arguments to a handler.
- */
-#define IA64_NATIVE_DO_SAVE_MIN(__COVER,SAVE_IFS,EXTRA,WORKAROUND)				\
-	mov r16=IA64_KR(CURRENT);	/* M */							\
-	mov r27=ar.rsc;			/* M */							\
-	mov r20=r1;			/* A */							\
-	mov r25=ar.unat;		/* M */							\
-	MOV_FROM_IPSR(p0,r29);		/* M */							\
-	mov r26=ar.pfs;			/* I */							\
-	MOV_FROM_IIP(r28);			/* M */						\
-	mov r21=ar.fpsr;		/* M */							\
-	__COVER;				/* B;; (or nothing) */				\
-	;;											\
-	adds r16=IA64_TASK_THREAD_ON_USTACK_OFFSET,r16;						\
-	;;											\
-	ld1 r17=[r16];				/* load current->thread.on_ustack flag */	\
-	st1 [r16]=r0;				/* clear current->thread.on_ustack flag */	\
-	adds r1=-IA64_TASK_THREAD_ON_USTACK_OFFSET,r16						\
-	/* switch from user to kernel RBS: */							\
-	;;											\
-	invala;				/* M */							\
-	SAVE_IFS;										\
-	cmp.eq pKStk,pUStk=r0,r17;		/* are we in kernel mode already? */		\
-	;;											\
-(pUStk)	mov ar.rsc=0;		/* set enforced lazy mode, pl 0, little-endian, loadrs=0 */	\
-	;;											\
-(pUStk)	mov.m r24=ar.rnat;									\
-(pUStk)	addl r22=IA64_RBS_OFFSET,r1;			/* compute base of RBS */		\
-(pKStk) mov r1=sp;					/* get sp  */				\
-	;;											\
-(pUStk) lfetch.fault.excl.nt1 [r22];								\
-(pUStk)	addl r1=IA64_STK_OFFSET-IA64_PT_REGS_SIZE,r1;	/* compute base of memory stack */	\
-(pUStk)	mov r23=ar.bspstore;				/* save ar.bspstore */			\
-	;;											\
-(pUStk)	mov ar.bspstore=r22;				/* switch to kernel RBS */		\
-(pKStk) addl r1=-IA64_PT_REGS_SIZE,r1;			/* if in kernel mode, use sp (r12) */	\
-	;;											\
-(pUStk)	mov r18=ar.bsp;										\
-(pUStk)	mov ar.rsc=0x3;		/* set eager mode, pl 0, little-endian, loadrs=0 */		\
-	adds r17=2*L1_CACHE_BYTES,r1;		/* really: biggest cache-line size */		\
-	adds r16=PT(CR_IPSR),r1;								\
-	;;											\
-	lfetch.fault.excl.nt1 [r17],L1_CACHE_BYTES;						\
-	st8 [r16]=r29;		/* save cr.ipsr */						\
-	;;											\
-	lfetch.fault.excl.nt1 [r17];								\
-	tbit.nz p15,p0=r29,IA64_PSR_I_BIT;							\
-	mov r29=b0										\
-	;;											\
-	WORKAROUND;										\
-	adds r16=PT(R8),r1;	/* initialize first base pointer */				\
-	adds r17=PT(R9),r1;	/* initialize second base pointer */				\
-(pKStk)	mov r18=r0;		/* make sure r18 isn't NaT */					\
-	;;											\
-.mem.offset 0,0; st8.spill [r16]=r8,16;								\
-.mem.offset 8,0; st8.spill [r17]=r9,16;								\
-        ;;											\
-.mem.offset 0,0; st8.spill [r16]=r10,24;							\
-.mem.offset 8,0; st8.spill [r17]=r11,24;							\
-        ;;											\
-	st8 [r16]=r28,16;	/* save cr.iip */						\
-	st8 [r17]=r30,16;	/* save cr.ifs */						\
-(pUStk)	sub r18=r18,r22;	/* r18=RSE.ndirty*8 */						\
-	mov r8=ar.ccv;										\
-	mov r9=ar.csd;										\
-	mov r10=ar.ssd;										\
-	movl r11=FPSR_DEFAULT;   /* L-unit */							\
-	;;											\
-	st8 [r16]=r25,16;	/* save ar.unat */						\
-	st8 [r17]=r26,16;	/* save ar.pfs */						\
-	shl r18=r18,16;		/* compute ar.rsc to be used for "loadrs" */			\
-	;;											\
-	st8 [r16]=r27,16;	/* save ar.rsc */						\
-(pUStk)	st8 [r17]=r24,16;	/* save ar.rnat */						\
-(pKStk)	adds r17=16,r17;	/* skip over ar_rnat field */					\
-	;;			/* avoid RAW on r16 & r17 */					\
-(pUStk)	st8 [r16]=r23,16;	/* save ar.bspstore */						\
-	st8 [r17]=r31,16;	/* save predicates */						\
-(pKStk)	adds r16=16,r16;	/* skip over ar_bspstore field */				\
-	;;											\
-	st8 [r16]=r29,16;	/* save b0 */							\
-	st8 [r17]=r18,16;	/* save ar.rsc value for "loadrs" */				\
-	cmp.eq pNonSys,pSys=r0,r0	/* initialize pSys=0, pNonSys=1 */			\
-	;;											\
-.mem.offset 0,0; st8.spill [r16]=r20,16;	/* save original r1 */				\
-.mem.offset 8,0; st8.spill [r17]=r12,16;							\
-	adds r12=-16,r1;	/* switch to kernel memory stack (with 16 bytes of scratch) */	\
-	;;											\
-.mem.offset 0,0; st8.spill [r16]=r13,16;							\
-.mem.offset 8,0; st8.spill [r17]=r21,16;	/* save ar.fpsr */				\
-	mov r13=IA64_KR(CURRENT);	/* establish `current' */				\
-	;;											\
-.mem.offset 0,0; st8.spill [r16]=r15,16;							\
-.mem.offset 8,0; st8.spill [r17]=r14,16;							\
-	;;											\
-.mem.offset 0,0; st8.spill [r16]=r2,16;								\
-.mem.offset 8,0; st8.spill [r17]=r3,16;								\
-	ACCOUNT_GET_STAMP									\
-	adds r2=IA64_PT_REGS_R16_OFFSET,r1;							\
-	;;											\
-	EXTRA;											\
-	movl r1=__gp;		/* establish kernel global pointer */				\
-	;;											\
-	ACCOUNT_SYS_ENTER									\
-	bsw.1;			/* switch back to bank 1 (must be last in insn group) */	\
-	;;
-
-/*
- * SAVE_REST saves the remainder of pt_regs (with psr.ic on).
- *
- * Assumed state upon entry:
- *	psr.ic: on
- *	r2:	points to &pt_regs.r16
- *	r3:	points to &pt_regs.r17
- *	r8:	contents of ar.ccv
- *	r9:	contents of ar.csd
- *	r10:	contents of ar.ssd
- *	r11:	FPSR_DEFAULT
- *
- * Registers r14 and r15 are guaranteed not to be touched by SAVE_REST.
- */
-#define SAVE_REST				\
-.mem.offset 0,0; st8.spill [r2]=r16,16;		\
-.mem.offset 8,0; st8.spill [r3]=r17,16;		\
-	;;					\
-.mem.offset 0,0; st8.spill [r2]=r18,16;		\
-.mem.offset 8,0; st8.spill [r3]=r19,16;		\
-	;;					\
-.mem.offset 0,0; st8.spill [r2]=r20,16;		\
-.mem.offset 8,0; st8.spill [r3]=r21,16;		\
-	mov r18=b6;				\
-	;;					\
-.mem.offset 0,0; st8.spill [r2]=r22,16;		\
-.mem.offset 8,0; st8.spill [r3]=r23,16;		\
-	mov r19=b7;				\
-	;;					\
-.mem.offset 0,0; st8.spill [r2]=r24,16;		\
-.mem.offset 8,0; st8.spill [r3]=r25,16;		\
-	;;					\
-.mem.offset 0,0; st8.spill [r2]=r26,16;		\
-.mem.offset 8,0; st8.spill [r3]=r27,16;		\
-	;;					\
-.mem.offset 0,0; st8.spill [r2]=r28,16;		\
-.mem.offset 8,0; st8.spill [r3]=r29,16;		\
-	;;					\
-.mem.offset 0,0; st8.spill [r2]=r30,16;		\
-.mem.offset 8,0; st8.spill [r3]=r31,32;		\
-	;;					\
-	mov ar.fpsr=r11;	/* M-unit */	\
-	st8 [r2]=r8,8;		/* ar.ccv */	\
-	adds r24=PT(B6)-PT(F7),r3;		\
-	;;					\
-	stf.spill [r2]=f6,32;			\
-	stf.spill [r3]=f7,32;			\
-	;;					\
-	stf.spill [r2]=f8,32;			\
-	stf.spill [r3]=f9,32;			\
-	;;					\
-	stf.spill [r2]=f10;			\
-	stf.spill [r3]=f11;			\
-	adds r25=PT(B7)-PT(F11),r3;		\
-	;;					\
-	st8 [r24]=r18,16;       /* b6 */	\
-	st8 [r25]=r19,16;       /* b7 */	\
-	;;					\
-	st8 [r24]=r9;        	/* ar.csd */	\
-	st8 [r25]=r10;      	/* ar.ssd */	\
-	;;
-
-#define RSE_WORKAROUND				\
-(pUStk) extr.u r17=r18,3,6;			\
-(pUStk)	sub r16=r18,r22;			\
-[1:](pKStk)	br.cond.sptk.many 1f;		\
-	.xdata4 ".data..patch.rse",1b-.		\
-	;;					\
-	cmp.ge p6,p7 = 33,r17;			\
-	;;					\
-(p6)	mov r17=0x310;				\
-(p7)	mov r17=0x308;				\
-	;;					\
-	cmp.leu p1,p0=r16,r17;			\
-(p1)	br.cond.sptk.many 1f;			\
-	dep.z r17=r26,0,62;			\
-	movl r16=2f;				\
-	;;					\
-	mov ar.pfs=r17;				\
-	dep r27=r0,r27,16,14;			\
-	mov b0=r16;				\
-	;;					\
-	br.ret.sptk b0;				\
-	;;					\
-2:						\
-	mov ar.rsc=r0				\
-	;;					\
-	flushrs;				\
-	;;					\
-	mov ar.bspstore=r22			\
-	;;					\
-	mov r18=ar.bsp;				\
-	;;					\
-1:						\
-	.pred.rel "mutex", pKStk, pUStk
-
-#define SAVE_MIN_WITH_COVER	DO_SAVE_MIN(COVER, mov r30=cr.ifs, , RSE_WORKAROUND)
-#define SAVE_MIN_WITH_COVER_R19	DO_SAVE_MIN(COVER, mov r30=cr.ifs, mov r15=r19, RSE_WORKAROUND)
-#define SAVE_MIN			DO_SAVE_MIN(     , mov r30=r0, , )
diff --git a/arch/ia64/kernel/module.c b/arch/ia64/kernel/module.c
deleted file mode 100644
index 3661135da9d9..000000000000
--- a/arch/ia64/kernel/module.c
+++ /dev/null
@@ -1,959 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * IA-64-specific support for kernel module loader.
- *
- * Copyright (C) 2003 Hewlett-Packard Co
- *	David Mosberger-Tang <davidm@hpl.hp.com>
- *
- * Loosely based on patch by Rusty Russell.
- */
-
-/* relocs tested so far:
-
-   DIR64LSB
-   FPTR64LSB
-   GPREL22
-   LDXMOV
-   LDXMOV
-   LTOFF22
-   LTOFF22X
-   LTOFF22X
-   LTOFF_FPTR22
-   PCREL21B	(for br.call only; br.cond is not supported out of modules!)
-   PCREL60B	(for brl.cond only; brl.call is not supported for modules!)
-   PCREL64LSB
-   SECREL32LSB
-   SEGREL64LSB
- */
-
-
-#include <linux/kernel.h>
-#include <linux/sched.h>
-#include <linux/elf.h>
-#include <linux/moduleloader.h>
-#include <linux/string.h>
-#include <linux/vmalloc.h>
-
-#include <asm/patch.h>
-#include <asm/unaligned.h>
-#include <asm/sections.h>
-
-#define ARCH_MODULE_DEBUG 0
-
-#if ARCH_MODULE_DEBUG
-# define DEBUGP printk
-# define inline
-#else
-# define DEBUGP(fmt , a...)
-#endif
-
-#ifdef CONFIG_ITANIUM
-# define USE_BRL	0
-#else
-# define USE_BRL	1
-#endif
-
-#define MAX_LTOFF	((uint64_t) (1 << 22))	/* max. allowable linkage-table offset */
-
-/* Define some relocation helper macros/types: */
-
-#define FORMAT_SHIFT	0
-#define FORMAT_BITS	3
-#define FORMAT_MASK	((1 << FORMAT_BITS) - 1)
-#define VALUE_SHIFT	3
-#define VALUE_BITS	5
-#define VALUE_MASK	((1 << VALUE_BITS) - 1)
-
-enum reloc_target_format {
-	/* direct encoded formats: */
-	RF_NONE = 0,
-	RF_INSN14 = 1,
-	RF_INSN22 = 2,
-	RF_INSN64 = 3,
-	RF_32MSB = 4,
-	RF_32LSB = 5,
-	RF_64MSB = 6,
-	RF_64LSB = 7,
-
-	/* formats that cannot be directly decoded: */
-	RF_INSN60,
-	RF_INSN21B,	/* imm21 form 1 */
-	RF_INSN21M,	/* imm21 form 2 */
-	RF_INSN21F	/* imm21 form 3 */
-};
-
-enum reloc_value_formula {
-	RV_DIRECT = 4,		/* S + A */
-	RV_GPREL = 5,		/* @gprel(S + A) */
-	RV_LTREL = 6,		/* @ltoff(S + A) */
-	RV_PLTREL = 7,		/* @pltoff(S + A) */
-	RV_FPTR = 8,		/* @fptr(S + A) */
-	RV_PCREL = 9,		/* S + A - P */
-	RV_LTREL_FPTR = 10,	/* @ltoff(@fptr(S + A)) */
-	RV_SEGREL = 11,		/* @segrel(S + A) */
-	RV_SECREL = 12,		/* @secrel(S + A) */
-	RV_BDREL = 13,		/* BD + A */
-	RV_LTV = 14,		/* S + A (like RV_DIRECT, except frozen at static link-time) */
-	RV_PCREL2 = 15,		/* S + A - P */
-	RV_SPECIAL = 16,	/* various (see below) */
-	RV_RSVD17 = 17,
-	RV_TPREL = 18,		/* @tprel(S + A) */
-	RV_LTREL_TPREL = 19,	/* @ltoff(@tprel(S + A)) */
-	RV_DTPMOD = 20,		/* @dtpmod(S + A) */
-	RV_LTREL_DTPMOD = 21,	/* @ltoff(@dtpmod(S + A)) */
-	RV_DTPREL = 22,		/* @dtprel(S + A) */
-	RV_LTREL_DTPREL = 23,	/* @ltoff(@dtprel(S + A)) */
-	RV_RSVD24 = 24,
-	RV_RSVD25 = 25,
-	RV_RSVD26 = 26,
-	RV_RSVD27 = 27
-	/* 28-31 reserved for implementation-specific purposes.  */
-};
-
-#define N(reloc)	[R_IA64_##reloc] = #reloc
-
-static const char *reloc_name[256] = {
-	N(NONE),		N(IMM14),		N(IMM22),		N(IMM64),
-	N(DIR32MSB),		N(DIR32LSB),		N(DIR64MSB),		N(DIR64LSB),
-	N(GPREL22),		N(GPREL64I),		N(GPREL32MSB),		N(GPREL32LSB),
-	N(GPREL64MSB),		N(GPREL64LSB),		N(LTOFF22),		N(LTOFF64I),
-	N(PLTOFF22),		N(PLTOFF64I),		N(PLTOFF64MSB),		N(PLTOFF64LSB),
-	N(FPTR64I),		N(FPTR32MSB),		N(FPTR32LSB),		N(FPTR64MSB),
-	N(FPTR64LSB),		N(PCREL60B),		N(PCREL21B),		N(PCREL21M),
-	N(PCREL21F),		N(PCREL32MSB),		N(PCREL32LSB),		N(PCREL64MSB),
-	N(PCREL64LSB),		N(LTOFF_FPTR22),	N(LTOFF_FPTR64I),	N(LTOFF_FPTR32MSB),
-	N(LTOFF_FPTR32LSB),	N(LTOFF_FPTR64MSB),	N(LTOFF_FPTR64LSB),	N(SEGREL32MSB),
-	N(SEGREL32LSB),		N(SEGREL64MSB),		N(SEGREL64LSB),		N(SECREL32MSB),
-	N(SECREL32LSB),		N(SECREL64MSB),		N(SECREL64LSB),		N(REL32MSB),
-	N(REL32LSB),		N(REL64MSB),		N(REL64LSB),		N(LTV32MSB),
-	N(LTV32LSB),		N(LTV64MSB),		N(LTV64LSB),		N(PCREL21BI),
-	N(PCREL22),		N(PCREL64I),		N(IPLTMSB),		N(IPLTLSB),
-	N(COPY),		N(LTOFF22X),		N(LDXMOV),		N(TPREL14),
-	N(TPREL22),		N(TPREL64I),		N(TPREL64MSB),		N(TPREL64LSB),
-	N(LTOFF_TPREL22),	N(DTPMOD64MSB),		N(DTPMOD64LSB),		N(LTOFF_DTPMOD22),
-	N(DTPREL14),		N(DTPREL22),		N(DTPREL64I),		N(DTPREL32MSB),
-	N(DTPREL32LSB),		N(DTPREL64MSB),		N(DTPREL64LSB),		N(LTOFF_DTPREL22)
-};
-
-#undef N
-
-/* Opaque struct for insns, to protect against derefs. */
-struct insn;
-
-static inline uint64_t
-bundle (const struct insn *insn)
-{
-	return (uint64_t) insn & ~0xfUL;
-}
-
-static inline int
-slot (const struct insn *insn)
-{
-	return (uint64_t) insn & 0x3;
-}
-
-static int
-apply_imm64 (struct module *mod, struct insn *insn, uint64_t val)
-{
-	if (slot(insn) != 1 && slot(insn) != 2) {
-		printk(KERN_ERR "%s: invalid slot number %d for IMM64\n",
-		       mod->name, slot(insn));
-		return 0;
-	}
-	ia64_patch_imm64((u64) insn, val);
-	return 1;
-}
-
-static int
-apply_imm60 (struct module *mod, struct insn *insn, uint64_t val)
-{
-	if (slot(insn) != 1 && slot(insn) != 2) {
-		printk(KERN_ERR "%s: invalid slot number %d for IMM60\n",
-		       mod->name, slot(insn));
-		return 0;
-	}
-	if (val + ((uint64_t) 1 << 59) >= (1UL << 60)) {
-		printk(KERN_ERR "%s: value %ld out of IMM60 range\n",
-			mod->name, (long) val);
-		return 0;
-	}
-	ia64_patch_imm60((u64) insn, val);
-	return 1;
-}
-
-static int
-apply_imm22 (struct module *mod, struct insn *insn, uint64_t val)
-{
-	if (val + (1 << 21) >= (1 << 22)) {
-		printk(KERN_ERR "%s: value %li out of IMM22 range\n",
-			mod->name, (long)val);
-		return 0;
-	}
-	ia64_patch((u64) insn, 0x01fffcfe000UL, (  ((val & 0x200000UL) << 15) /* bit 21 -> 36 */
-					         | ((val & 0x1f0000UL) <<  6) /* bit 16 -> 22 */
-					         | ((val & 0x00ff80UL) << 20) /* bit  7 -> 27 */
-					         | ((val & 0x00007fUL) << 13) /* bit  0 -> 13 */));
-	return 1;
-}
-
-static int
-apply_imm21b (struct module *mod, struct insn *insn, uint64_t val)
-{
-	if (val + (1 << 20) >= (1 << 21)) {
-		printk(KERN_ERR "%s: value %li out of IMM21b range\n",
-			mod->name, (long)val);
-		return 0;
-	}
-	ia64_patch((u64) insn, 0x11ffffe000UL, (  ((val & 0x100000UL) << 16) /* bit 20 -> 36 */
-					        | ((val & 0x0fffffUL) << 13) /* bit  0 -> 13 */));
-	return 1;
-}
-
-#if USE_BRL
-
-struct plt_entry {
-	/* Three instruction bundles in PLT. */
- 	unsigned char bundle[2][16];
-};
-
-static const struct plt_entry ia64_plt_template = {
-	{
-		{
-			0x04, 0x00, 0x00, 0x00, 0x01, 0x00, /* [MLX] nop.m 0 */
-			0x00, 0x00, 0x00, 0x00, 0x00, 0x20, /*	     movl gp=TARGET_GP */
-			0x00, 0x00, 0x00, 0x60
-		},
-		{
-			0x05, 0x00, 0x00, 0x00, 0x01, 0x00, /* [MLX] nop.m 0 */
-			0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /*	     brl.many gp=TARGET_GP */
-			0x08, 0x00, 0x00, 0xc0
-		}
-	}
-};
-
-static int
-patch_plt (struct module *mod, struct plt_entry *plt, long target_ip, unsigned long target_gp)
-{
-	if (apply_imm64(mod, (struct insn *) (plt->bundle[0] + 2), target_gp)
-	    && apply_imm60(mod, (struct insn *) (plt->bundle[1] + 2),
-			   (target_ip - (int64_t) plt->bundle[1]) / 16))
-		return 1;
-	return 0;
-}
-
-unsigned long
-plt_target (struct plt_entry *plt)
-{
-	uint64_t b0, b1, *b = (uint64_t *) plt->bundle[1];
-	long off;
-
-	b0 = b[0]; b1 = b[1];
-	off = (  ((b1 & 0x00fffff000000000UL) >> 36)		/* imm20b -> bit 0 */
-	       | ((b0 >> 48) << 20) | ((b1 & 0x7fffffUL) << 36)	/* imm39 -> bit 20 */
-	       | ((b1 & 0x0800000000000000UL) << 0));		/* i -> bit 59 */
-	return (long) plt->bundle[1] + 16*off;
-}
-
-#else /* !USE_BRL */
-
-struct plt_entry {
-	/* Three instruction bundles in PLT. */
- 	unsigned char bundle[3][16];
-};
-
-static const struct plt_entry ia64_plt_template = {
-	{
-		{
-			0x05, 0x00, 0x00, 0x00, 0x01, 0x00, /* [MLX] nop.m 0 */
-			0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /*	     movl r16=TARGET_IP */
-			0x02, 0x00, 0x00, 0x60
-		},
-		{
-			0x04, 0x00, 0x00, 0x00, 0x01, 0x00, /* [MLX] nop.m 0 */
-			0x00, 0x00, 0x00, 0x00, 0x00, 0x20, /*	     movl gp=TARGET_GP */
-			0x00, 0x00, 0x00, 0x60
-		},
-		{
-			0x11, 0x00, 0x00, 0x00, 0x01, 0x00, /* [MIB] nop.m 0 */
-			0x60, 0x80, 0x04, 0x80, 0x03, 0x00, /*	     mov b6=r16 */
-			0x60, 0x00, 0x80, 0x00		    /*	     br.few b6 */
-		}
-	}
-};
-
-static int
-patch_plt (struct module *mod, struct plt_entry *plt, long target_ip, unsigned long target_gp)
-{
-	if (apply_imm64(mod, (struct insn *) (plt->bundle[0] + 2), target_ip)
-	    && apply_imm64(mod, (struct insn *) (plt->bundle[1] + 2), target_gp))
-		return 1;
-	return 0;
-}
-
-unsigned long
-plt_target (struct plt_entry *plt)
-{
-	uint64_t b0, b1, *b = (uint64_t *) plt->bundle[0];
-
-	b0 = b[0]; b1 = b[1];
-	return (  ((b1 & 0x000007f000000000) >> 36)		/* imm7b -> bit 0 */
-		| ((b1 & 0x07fc000000000000) >> 43)		/* imm9d -> bit 7 */
-		| ((b1 & 0x0003e00000000000) >> 29)		/* imm5c -> bit 16 */
-		| ((b1 & 0x0000100000000000) >> 23)		/* ic -> bit 21 */
-		| ((b0 >> 46) << 22) | ((b1 & 0x7fffff) << 40)	/* imm41 -> bit 22 */
-		| ((b1 & 0x0800000000000000) <<  4));		/* i -> bit 63 */
-}
-
-#endif /* !USE_BRL */
-
-void
-module_arch_freeing_init (struct module *mod)
-{
-	if (mod->arch.init_unw_table) {
-		unw_remove_unwind_table(mod->arch.init_unw_table);
-		mod->arch.init_unw_table = NULL;
-	}
-}
-
-/* Have we already seen one of these relocations? */
-/* FIXME: we could look in other sections, too --RR */
-static int
-duplicate_reloc (const Elf64_Rela *rela, unsigned int num)
-{
-	unsigned int i;
-
-	for (i = 0; i < num; i++) {
-		if (rela[i].r_info == rela[num].r_info && rela[i].r_addend == rela[num].r_addend)
-			return 1;
-	}
-	return 0;
-}
-
-/* Count how many GOT entries we may need */
-static unsigned int
-count_gots (const Elf64_Rela *rela, unsigned int num)
-{
-	unsigned int i, ret = 0;
-
-	/* Sure, this is order(n^2), but it's usually short, and not
-           time critical */
-	for (i = 0; i < num; i++) {
-		switch (ELF64_R_TYPE(rela[i].r_info)) {
-		      case R_IA64_LTOFF22:
-		      case R_IA64_LTOFF22X:
-		      case R_IA64_LTOFF64I:
-		      case R_IA64_LTOFF_FPTR22:
-		      case R_IA64_LTOFF_FPTR64I:
-		      case R_IA64_LTOFF_FPTR32MSB:
-		      case R_IA64_LTOFF_FPTR32LSB:
-		      case R_IA64_LTOFF_FPTR64MSB:
-		      case R_IA64_LTOFF_FPTR64LSB:
-			if (!duplicate_reloc(rela, i))
-				ret++;
-			break;
-		}
-	}
-	return ret;
-}
-
-/* Count how many PLT entries we may need */
-static unsigned int
-count_plts (const Elf64_Rela *rela, unsigned int num)
-{
-	unsigned int i, ret = 0;
-
-	/* Sure, this is order(n^2), but it's usually short, and not
-           time critical */
-	for (i = 0; i < num; i++) {
-		switch (ELF64_R_TYPE(rela[i].r_info)) {
-		      case R_IA64_PCREL21B:
-		      case R_IA64_PLTOFF22:
-		      case R_IA64_PLTOFF64I:
-		      case R_IA64_PLTOFF64MSB:
-		      case R_IA64_PLTOFF64LSB:
-		      case R_IA64_IPLTMSB:
-		      case R_IA64_IPLTLSB:
-			if (!duplicate_reloc(rela, i))
-				ret++;
-			break;
-		}
-	}
-	return ret;
-}
-
-/* We need to create an function-descriptors for any internal function
-   which is referenced. */
-static unsigned int
-count_fdescs (const Elf64_Rela *rela, unsigned int num)
-{
-	unsigned int i, ret = 0;
-
-	/* Sure, this is order(n^2), but it's usually short, and not time critical.  */
-	for (i = 0; i < num; i++) {
-		switch (ELF64_R_TYPE(rela[i].r_info)) {
-		      case R_IA64_FPTR64I:
-		      case R_IA64_FPTR32LSB:
-		      case R_IA64_FPTR32MSB:
-		      case R_IA64_FPTR64LSB:
-		      case R_IA64_FPTR64MSB:
-		      case R_IA64_LTOFF_FPTR22:
-		      case R_IA64_LTOFF_FPTR32LSB:
-		      case R_IA64_LTOFF_FPTR32MSB:
-		      case R_IA64_LTOFF_FPTR64I:
-		      case R_IA64_LTOFF_FPTR64LSB:
-		      case R_IA64_LTOFF_FPTR64MSB:
-		      case R_IA64_IPLTMSB:
-		      case R_IA64_IPLTLSB:
-			/*
-			 * Jumps to static functions sometimes go straight to their
-			 * offset.  Of course, that may not be possible if the jump is
-			 * from init -> core or vice. versa, so we need to generate an
-			 * FDESC (and PLT etc) for that.
-			 */
-		      case R_IA64_PCREL21B:
-			if (!duplicate_reloc(rela, i))
-				ret++;
-			break;
-		}
-	}
-	return ret;
-}
-
-int
-module_frob_arch_sections (Elf_Ehdr *ehdr, Elf_Shdr *sechdrs, char *secstrings,
-			   struct module *mod)
-{
-	unsigned long core_plts = 0, init_plts = 0, gots = 0, fdescs = 0;
-	Elf64_Shdr *s, *sechdrs_end = sechdrs + ehdr->e_shnum;
-
-	/*
-	 * To store the PLTs and function-descriptors, we expand the .text section for
-	 * core module-code and the .init.text section for initialization code.
-	 */
-	for (s = sechdrs; s < sechdrs_end; ++s)
-		if (strcmp(".core.plt", secstrings + s->sh_name) == 0)
-			mod->arch.core_plt = s;
-		else if (strcmp(".init.plt", secstrings + s->sh_name) == 0)
-			mod->arch.init_plt = s;
-		else if (strcmp(".got", secstrings + s->sh_name) == 0)
-			mod->arch.got = s;
-		else if (strcmp(".opd", secstrings + s->sh_name) == 0)
-			mod->arch.opd = s;
-		else if (strcmp(".IA_64.unwind", secstrings + s->sh_name) == 0)
-			mod->arch.unwind = s;
-
-	if (!mod->arch.core_plt || !mod->arch.init_plt || !mod->arch.got || !mod->arch.opd) {
-		printk(KERN_ERR "%s: sections missing\n", mod->name);
-		return -ENOEXEC;
-	}
-
-	/* GOT and PLTs can occur in any relocated section... */
-	for (s = sechdrs + 1; s < sechdrs_end; ++s) {
-		const Elf64_Rela *rels = (void *)ehdr + s->sh_offset;
-		unsigned long numrels = s->sh_size/sizeof(Elf64_Rela);
-
-		if (s->sh_type != SHT_RELA)
-			continue;
-
-		gots += count_gots(rels, numrels);
-		fdescs += count_fdescs(rels, numrels);
-		if (strstr(secstrings + s->sh_name, ".init"))
-			init_plts += count_plts(rels, numrels);
-		else
-			core_plts += count_plts(rels, numrels);
-	}
-
-	mod->arch.core_plt->sh_type = SHT_NOBITS;
-	mod->arch.core_plt->sh_flags = SHF_EXECINSTR | SHF_ALLOC;
-	mod->arch.core_plt->sh_addralign = 16;
-	mod->arch.core_plt->sh_size = core_plts * sizeof(struct plt_entry);
-	mod->arch.init_plt->sh_type = SHT_NOBITS;
-	mod->arch.init_plt->sh_flags = SHF_EXECINSTR | SHF_ALLOC;
-	mod->arch.init_plt->sh_addralign = 16;
-	mod->arch.init_plt->sh_size = init_plts * sizeof(struct plt_entry);
-	mod->arch.got->sh_type = SHT_NOBITS;
-	mod->arch.got->sh_flags = ARCH_SHF_SMALL | SHF_ALLOC;
-	mod->arch.got->sh_addralign = 8;
-	mod->arch.got->sh_size = gots * sizeof(struct got_entry);
-	mod->arch.opd->sh_type = SHT_NOBITS;
-	mod->arch.opd->sh_flags = SHF_ALLOC;
-	mod->arch.opd->sh_addralign = 8;
-	mod->arch.opd->sh_size = fdescs * sizeof(struct fdesc);
-	DEBUGP("%s: core.plt=%lx, init.plt=%lx, got=%lx, fdesc=%lx\n",
-	       __func__, mod->arch.core_plt->sh_size, mod->arch.init_plt->sh_size,
-	       mod->arch.got->sh_size, mod->arch.opd->sh_size);
-	return 0;
-}
-
-static inline bool
-in_init (const struct module *mod, uint64_t addr)
-{
-	return within_module_init(addr, mod);
-}
-
-static inline bool
-in_core (const struct module *mod, uint64_t addr)
-{
-	return within_module_core(addr, mod);
-}
-
-static inline bool
-is_internal (const struct module *mod, uint64_t value)
-{
-	return in_init(mod, value) || in_core(mod, value);
-}
-
-/*
- * Get gp-relative offset for the linkage-table entry of VALUE.
- */
-static uint64_t
-get_ltoff (struct module *mod, uint64_t value, int *okp)
-{
-	struct got_entry *got, *e;
-
-	if (!*okp)
-		return 0;
-
-	got = (void *) mod->arch.got->sh_addr;
-	for (e = got; e < got + mod->arch.next_got_entry; ++e)
-		if (e->val == value)
-			goto found;
-
-	/* Not enough GOT entries? */
-	BUG_ON(e >= (struct got_entry *) (mod->arch.got->sh_addr + mod->arch.got->sh_size));
-
-	e->val = value;
-	++mod->arch.next_got_entry;
-  found:
-	return (uint64_t) e - mod->arch.gp;
-}
-
-static inline int
-gp_addressable (struct module *mod, uint64_t value)
-{
-	return value - mod->arch.gp + MAX_LTOFF/2 < MAX_LTOFF;
-}
-
-/* Get PC-relative PLT entry for this value.  Returns 0 on failure. */
-static uint64_t
-get_plt (struct module *mod, const struct insn *insn, uint64_t value, int *okp)
-{
-	struct plt_entry *plt, *plt_end;
-	uint64_t target_ip, target_gp;
-
-	if (!*okp)
-		return 0;
-
-	if (in_init(mod, (uint64_t) insn)) {
-		plt = (void *) mod->arch.init_plt->sh_addr;
-		plt_end = (void *) plt + mod->arch.init_plt->sh_size;
-	} else {
-		plt = (void *) mod->arch.core_plt->sh_addr;
-		plt_end = (void *) plt + mod->arch.core_plt->sh_size;
-	}
-
-	/* "value" is a pointer to a function-descriptor; fetch the target ip/gp from it: */
-	target_ip = ((uint64_t *) value)[0];
-	target_gp = ((uint64_t *) value)[1];
-
-	/* Look for existing PLT entry. */
-	while (plt->bundle[0][0]) {
-		if (plt_target(plt) == target_ip)
-			goto found;
-		if (++plt >= plt_end)
-			BUG();
-	}
-	*plt = ia64_plt_template;
-	if (!patch_plt(mod, plt, target_ip, target_gp)) {
-		*okp = 0;
-		return 0;
-	}
-#if ARCH_MODULE_DEBUG
-	if (plt_target(plt) != target_ip) {
-		printk("%s: mistargeted PLT: wanted %lx, got %lx\n",
-		       __func__, target_ip, plt_target(plt));
-		*okp = 0;
-		return 0;
-	}
-#endif
-  found:
-	return (uint64_t) plt;
-}
-
-/* Get function descriptor for VALUE. */
-static uint64_t
-get_fdesc (struct module *mod, uint64_t value, int *okp)
-{
-	struct fdesc *fdesc = (void *) mod->arch.opd->sh_addr;
-
-	if (!*okp)
-		return 0;
-
-	if (!value) {
-		printk(KERN_ERR "%s: fdesc for zero requested!\n", mod->name);
-		return 0;
-	}
-
-	if (!is_internal(mod, value))
-		/*
-		 * If it's not a module-local entry-point, "value" already points to a
-		 * function-descriptor.
-		 */
-		return value;
-
-	/* Look for existing function descriptor. */
-	while (fdesc->addr) {
-		if (fdesc->addr == value)
-			return (uint64_t)fdesc;
-		if ((uint64_t) ++fdesc >= mod->arch.opd->sh_addr + mod->arch.opd->sh_size)
-			BUG();
-	}
-
-	/* Create new one */
-	fdesc->addr = value;
-	fdesc->gp = mod->arch.gp;
-	return (uint64_t) fdesc;
-}
-
-static inline int
-do_reloc (struct module *mod, uint8_t r_type, Elf64_Sym *sym, uint64_t addend,
-	  Elf64_Shdr *sec, void *location)
-{
-	enum reloc_target_format format = (r_type >> FORMAT_SHIFT) & FORMAT_MASK;
-	enum reloc_value_formula formula = (r_type >> VALUE_SHIFT) & VALUE_MASK;
-	uint64_t val;
-	int ok = 1;
-
-	val = sym->st_value + addend;
-
-	switch (formula) {
-	      case RV_SEGREL:	/* segment base is arbitrarily chosen to be 0 for kernel modules */
-	      case RV_DIRECT:
-		break;
-
-	      case RV_GPREL:	  val -= mod->arch.gp; break;
-	      case RV_LTREL:	  val = get_ltoff(mod, val, &ok); break;
-	      case RV_PLTREL:	  val = get_plt(mod, location, val, &ok); break;
-	      case RV_FPTR:	  val = get_fdesc(mod, val, &ok); break;
-	      case RV_SECREL:	  val -= sec->sh_addr; break;
-	      case RV_LTREL_FPTR: val = get_ltoff(mod, get_fdesc(mod, val, &ok), &ok); break;
-
-	      case RV_PCREL:
-		switch (r_type) {
-		      case R_IA64_PCREL21B:
-			if ((in_init(mod, val) && in_core(mod, (uint64_t)location)) ||
-			    (in_core(mod, val) && in_init(mod, (uint64_t)location))) {
-				/*
-				 * Init section may have been allocated far away from core,
-				 * if the branch won't reach, then allocate a plt for it.
-				 */
-				uint64_t delta = ((int64_t)val - (int64_t)location) / 16;
-				if (delta + (1 << 20) >= (1 << 21)) {
-					val = get_fdesc(mod, val, &ok);
-					val = get_plt(mod, location, val, &ok);
-				}
-			} else if (!is_internal(mod, val))
-				val = get_plt(mod, location, val, &ok);
-			fallthrough;
-		      default:
-			val -= bundle(location);
-			break;
-
-		      case R_IA64_PCREL32MSB:
-		      case R_IA64_PCREL32LSB:
-		      case R_IA64_PCREL64MSB:
-		      case R_IA64_PCREL64LSB:
-			val -= (uint64_t) location;
-			break;
-
-		}
-		switch (r_type) {
-		      case R_IA64_PCREL60B: format = RF_INSN60; break;
-		      case R_IA64_PCREL21B: format = RF_INSN21B; break;
-		      case R_IA64_PCREL21M: format = RF_INSN21M; break;
-		      case R_IA64_PCREL21F: format = RF_INSN21F; break;
-		      default: break;
-		}
-		break;
-
-	      case RV_BDREL:
-		val -= (uint64_t) (in_init(mod, val) ? mod->mem[MOD_INIT_TEXT].base :
-				   mod->mem[MOD_TEXT].base);
-		break;
-
-	      case RV_LTV:
-		/* can link-time value relocs happen here?  */
-		BUG();
-		break;
-
-	      case RV_PCREL2:
-		if (r_type == R_IA64_PCREL21BI) {
-			if (!is_internal(mod, val)) {
-				printk(KERN_ERR "%s: %s reloc against "
-					"non-local symbol (%lx)\n", __func__,
-					reloc_name[r_type], (unsigned long)val);
-				return -ENOEXEC;
-			}
-			format = RF_INSN21B;
-		}
-		val -= bundle(location);
-		break;
-
-	      case RV_SPECIAL:
-		switch (r_type) {
-		      case R_IA64_IPLTMSB:
-		      case R_IA64_IPLTLSB:
-			val = get_fdesc(mod, get_plt(mod, location, val, &ok), &ok);
-			format = RF_64LSB;
-			if (r_type == R_IA64_IPLTMSB)
-				format = RF_64MSB;
-			break;
-
-		      case R_IA64_SUB:
-			val = addend - sym->st_value;
-			format = RF_INSN64;
-			break;
-
-		      case R_IA64_LTOFF22X:
-			if (gp_addressable(mod, val))
-				val -= mod->arch.gp;
-			else
-				val = get_ltoff(mod, val, &ok);
-			format = RF_INSN22;
-			break;
-
-		      case R_IA64_LDXMOV:
-			if (gp_addressable(mod, val)) {
-				/* turn "ld8" into "mov": */
-				DEBUGP("%s: patching ld8 at %p to mov\n", __func__, location);
-				ia64_patch((u64) location, 0x1fff80fe000UL, 0x10000000000UL);
-			}
-			return 0;
-
-		      default:
-			if (reloc_name[r_type])
-				printk(KERN_ERR "%s: special reloc %s not supported",
-				       mod->name, reloc_name[r_type]);
-			else
-				printk(KERN_ERR "%s: unknown special reloc %x\n",
-				       mod->name, r_type);
-			return -ENOEXEC;
-		}
-		break;
-
-	      case RV_TPREL:
-	      case RV_LTREL_TPREL:
-	      case RV_DTPMOD:
-	      case RV_LTREL_DTPMOD:
-	      case RV_DTPREL:
-	      case RV_LTREL_DTPREL:
-		printk(KERN_ERR "%s: %s reloc not supported\n",
-		       mod->name, reloc_name[r_type] ? reloc_name[r_type] : "?");
-		return -ENOEXEC;
-
-	      default:
-		printk(KERN_ERR "%s: unknown reloc %x\n", mod->name, r_type);
-		return -ENOEXEC;
-	}
-
-	if (!ok)
-		return -ENOEXEC;
-
-	DEBUGP("%s: [%p]<-%016lx = %s(%lx)\n", __func__, location, val,
-	       reloc_name[r_type] ? reloc_name[r_type] : "?", sym->st_value + addend);
-
-	switch (format) {
-	      case RF_INSN21B:	ok = apply_imm21b(mod, location, (int64_t) val / 16); break;
-	      case RF_INSN22:	ok = apply_imm22(mod, location, val); break;
-	      case RF_INSN64:	ok = apply_imm64(mod, location, val); break;
-	      case RF_INSN60:	ok = apply_imm60(mod, location, (int64_t) val / 16); break;
-	      case RF_32LSB:	put_unaligned(val, (uint32_t *) location); break;
-	      case RF_64LSB:	put_unaligned(val, (uint64_t *) location); break;
-	      case RF_32MSB:	/* ia64 Linux is little-endian... */
-	      case RF_64MSB:	/* ia64 Linux is little-endian... */
-	      case RF_INSN14:	/* must be within-module, i.e., resolved by "ld -r" */
-	      case RF_INSN21M:	/* must be within-module, i.e., resolved by "ld -r" */
-	      case RF_INSN21F:	/* must be within-module, i.e., resolved by "ld -r" */
-		printk(KERN_ERR "%s: format %u needed by %s reloc is not supported\n",
-		       mod->name, format, reloc_name[r_type] ? reloc_name[r_type] : "?");
-		return -ENOEXEC;
-
-	      default:
-		printk(KERN_ERR "%s: relocation %s resulted in unknown format %u\n",
-		       mod->name, reloc_name[r_type] ? reloc_name[r_type] : "?", format);
-		return -ENOEXEC;
-	}
-	return ok ? 0 : -ENOEXEC;
-}
-
-int
-apply_relocate_add (Elf64_Shdr *sechdrs, const char *strtab, unsigned int symindex,
-		    unsigned int relsec, struct module *mod)
-{
-	unsigned int i, n = sechdrs[relsec].sh_size / sizeof(Elf64_Rela);
-	Elf64_Rela *rela = (void *) sechdrs[relsec].sh_addr;
-	Elf64_Shdr *target_sec;
-	int ret;
-
-	DEBUGP("%s: applying section %u (%u relocs) to %u\n", __func__,
-	       relsec, n, sechdrs[relsec].sh_info);
-
-	target_sec = sechdrs + sechdrs[relsec].sh_info;
-
-	if (target_sec->sh_entsize == ~0UL)
-		/*
-		 * If target section wasn't allocated, we don't need to relocate it.
-		 * Happens, e.g., for debug sections.
-		 */
-		return 0;
-
-	if (!mod->arch.gp) {
-		/*
-		 * XXX Should have an arch-hook for running this after final section
-		 *     addresses have been selected...
-		 */
-		uint64_t gp;
-		struct module_memory *mod_mem;
-
-		mod_mem = &mod->mem[MOD_DATA];
-		if (mod_mem->size > MAX_LTOFF)
-			/*
-			 * This takes advantage of fact that SHF_ARCH_SMALL gets allocated
-			 * at the end of the module.
-			 */
-			gp = mod_mem->size - MAX_LTOFF / 2;
-		else
-			gp = mod_mem->size / 2;
-		gp = (uint64_t) mod_mem->base + ((gp + 7) & -8);
-		mod->arch.gp = gp;
-		DEBUGP("%s: placing gp at 0x%lx\n", __func__, gp);
-	}
-
-	for (i = 0; i < n; i++) {
-		ret = do_reloc(mod, ELF64_R_TYPE(rela[i].r_info),
-			       ((Elf64_Sym *) sechdrs[symindex].sh_addr
-				+ ELF64_R_SYM(rela[i].r_info)),
-			       rela[i].r_addend, target_sec,
-			       (void *) target_sec->sh_addr + rela[i].r_offset);
-		if (ret < 0)
-			return ret;
-	}
-	return 0;
-}
-
-/*
- * Modules contain a single unwind table which covers both the core and the init text
- * sections but since the two are not contiguous, we need to split this table up such that
- * we can register (and unregister) each "segment" separately.  Fortunately, this sounds
- * more complicated than it really is.
- */
-static void
-register_unwind_table (struct module *mod)
-{
-	struct unw_table_entry *start = (void *) mod->arch.unwind->sh_addr;
-	struct unw_table_entry *end = start + mod->arch.unwind->sh_size / sizeof (*start);
-	struct unw_table_entry *e1, *e2, *core, *init;
-	unsigned long num_init = 0, num_core = 0;
-
-	/* First, count how many init and core unwind-table entries there are.  */
-	for (e1 = start; e1 < end; ++e1)
-		if (in_init(mod, e1->start_offset))
-			++num_init;
-		else
-			++num_core;
-	/*
-	 * Second, sort the table such that all unwind-table entries for the init and core
-	 * text sections are nicely separated.  We do this with a stupid bubble sort
-	 * (unwind tables don't get ridiculously huge).
-	 */
-	for (e1 = start; e1 < end; ++e1) {
-		for (e2 = e1 + 1; e2 < end; ++e2) {
-			if (e2->start_offset < e1->start_offset) {
-				swap(*e1, *e2);
-			}
-		}
-	}
-	/*
-	 * Third, locate the init and core segments in the unwind table:
-	 */
-	if (in_init(mod, start->start_offset)) {
-		init = start;
-		core = start + num_init;
-	} else {
-		core = start;
-		init = start + num_core;
-	}
-
-	DEBUGP("%s: name=%s, gp=%lx, num_init=%lu, num_core=%lu\n", __func__,
-	       mod->name, mod->arch.gp, num_init, num_core);
-
-	/*
-	 * Fourth, register both tables (if not empty).
-	 */
-	if (num_core > 0) {
-		mod->arch.core_unw_table = unw_add_unwind_table(mod->name, 0, mod->arch.gp,
-								core, core + num_core);
-		DEBUGP("%s:  core: handle=%p [%p-%p)\n", __func__,
-		       mod->arch.core_unw_table, core, core + num_core);
-	}
-	if (num_init > 0) {
-		mod->arch.init_unw_table = unw_add_unwind_table(mod->name, 0, mod->arch.gp,
-								init, init + num_init);
-		DEBUGP("%s:  init: handle=%p [%p-%p)\n", __func__,
-		       mod->arch.init_unw_table, init, init + num_init);
-	}
-}
-
-int
-module_finalize (const Elf_Ehdr *hdr, const Elf_Shdr *sechdrs, struct module *mod)
-{
-	struct mod_arch_specific *mas = &mod->arch;
-
-	DEBUGP("%s: init: entry=%p\n", __func__, mod->init);
-	if (mas->unwind)
-		register_unwind_table(mod);
-
-	/*
-	 * ".opd" was already relocated to the final destination. Store
-	 * it's address for use in symbolizer.
-	 */
-	mas->opd_addr = (void *)mas->opd->sh_addr;
-	mas->opd_size = mas->opd->sh_size;
-
-	/*
-	 * Module relocation was already done at this point. Section
-	 * headers are about to be deleted. Wipe out load-time context.
-	 */
-	mas->core_plt = NULL;
-	mas->init_plt = NULL;
-	mas->got = NULL;
-	mas->opd = NULL;
-	mas->unwind = NULL;
-	mas->gp = 0;
-	mas->next_got_entry = 0;
-
-	return 0;
-}
-
-void
-module_arch_cleanup (struct module *mod)
-{
-	if (mod->arch.init_unw_table) {
-		unw_remove_unwind_table(mod->arch.init_unw_table);
-		mod->arch.init_unw_table = NULL;
-	}
-	if (mod->arch.core_unw_table) {
-		unw_remove_unwind_table(mod->arch.core_unw_table);
-		mod->arch.core_unw_table = NULL;
-	}
-}
-
-void *dereference_module_function_descriptor(struct module *mod, void *ptr)
-{
-	struct mod_arch_specific *mas = &mod->arch;
-
-	if (ptr < mas->opd_addr || ptr >= mas->opd_addr + mas->opd_size)
-		return ptr;
-
-	return dereference_function_descriptor(ptr);
-}
diff --git a/arch/ia64/kernel/msi_ia64.c b/arch/ia64/kernel/msi_ia64.c
deleted file mode 100644
index 025e5133c860..000000000000
--- a/arch/ia64/kernel/msi_ia64.c
+++ /dev/null
@@ -1,198 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * MSI hooks for standard x86 apic
- */
-
-#include <linux/pci.h>
-#include <linux/irq.h>
-#include <linux/msi.h>
-#include <linux/dmar.h>
-#include <asm/smp.h>
-#include <asm/msidef.h>
-
-static struct irq_chip	ia64_msi_chip;
-
-#ifdef CONFIG_SMP
-static int ia64_set_msi_irq_affinity(struct irq_data *idata,
-				     const cpumask_t *cpu_mask, bool force)
-{
-	struct msi_msg msg;
-	u32 addr, data;
-	int cpu = cpumask_first_and(cpu_mask, cpu_online_mask);
-	unsigned int irq = idata->irq;
-
-	if (irq_prepare_move(irq, cpu))
-		return -1;
-
-	__get_cached_msi_msg(irq_data_get_msi_desc(idata), &msg);
-
-	addr = msg.address_lo;
-	addr &= MSI_ADDR_DEST_ID_MASK;
-	addr |= MSI_ADDR_DEST_ID_CPU(cpu_physical_id(cpu));
-	msg.address_lo = addr;
-
-	data = msg.data;
-	data &= MSI_DATA_VECTOR_MASK;
-	data |= MSI_DATA_VECTOR(irq_to_vector(irq));
-	msg.data = data;
-
-	pci_write_msi_msg(irq, &msg);
-	irq_data_update_affinity(idata, cpumask_of(cpu));
-
-	return 0;
-}
-#endif /* CONFIG_SMP */
-
-int arch_setup_msi_irq(struct pci_dev *pdev, struct msi_desc *desc)
-{
-	struct msi_msg	msg;
-	unsigned long	dest_phys_id;
-	int	irq, vector;
-
-	irq = create_irq();
-	if (irq < 0)
-		return irq;
-
-	irq_set_msi_desc(irq, desc);
-	dest_phys_id = cpu_physical_id(cpumask_any_and(&(irq_to_domain(irq)),
-						       cpu_online_mask));
-	vector = irq_to_vector(irq);
-
-	msg.address_hi = 0;
-	msg.address_lo =
-		MSI_ADDR_HEADER |
-		MSI_ADDR_DEST_MODE_PHYS |
-		MSI_ADDR_REDIRECTION_CPU |
-		MSI_ADDR_DEST_ID_CPU(dest_phys_id);
-
-	msg.data =
-		MSI_DATA_TRIGGER_EDGE |
-		MSI_DATA_LEVEL_ASSERT |
-		MSI_DATA_DELIVERY_FIXED |
-		MSI_DATA_VECTOR(vector);
-
-	pci_write_msi_msg(irq, &msg);
-	irq_set_chip_and_handler(irq, &ia64_msi_chip, handle_edge_irq);
-
-	return 0;
-}
-
-void arch_teardown_msi_irq(unsigned int irq)
-{
-	destroy_irq(irq);
-}
-
-static void ia64_ack_msi_irq(struct irq_data *data)
-{
-	irq_complete_move(data->irq);
-	irq_move_irq(data);
-	ia64_eoi();
-}
-
-static int ia64_msi_retrigger_irq(struct irq_data *data)
-{
-	unsigned int vector = irq_to_vector(data->irq);
-	ia64_resend_irq(vector);
-
-	return 1;
-}
-
-/*
- * Generic ops used on most IA64 platforms.
- */
-static struct irq_chip ia64_msi_chip = {
-	.name			= "PCI-MSI",
-	.irq_mask		= pci_msi_mask_irq,
-	.irq_unmask		= pci_msi_unmask_irq,
-	.irq_ack		= ia64_ack_msi_irq,
-#ifdef CONFIG_SMP
-	.irq_set_affinity	= ia64_set_msi_irq_affinity,
-#endif
-	.irq_retrigger		= ia64_msi_retrigger_irq,
-};
-
-#ifdef CONFIG_INTEL_IOMMU
-#ifdef CONFIG_SMP
-static int dmar_msi_set_affinity(struct irq_data *data,
-				 const struct cpumask *mask, bool force)
-{
-	unsigned int irq = data->irq;
-	struct irq_cfg *cfg = irq_cfg + irq;
-	struct msi_msg msg;
-	int cpu = cpumask_first_and(mask, cpu_online_mask);
-
-	if (irq_prepare_move(irq, cpu))
-		return -1;
-
-	dmar_msi_read(irq, &msg);
-
-	msg.data &= ~MSI_DATA_VECTOR_MASK;
-	msg.data |= MSI_DATA_VECTOR(cfg->vector);
-	msg.address_lo &= ~MSI_ADDR_DEST_ID_MASK;
-	msg.address_lo |= MSI_ADDR_DEST_ID_CPU(cpu_physical_id(cpu));
-
-	dmar_msi_write(irq, &msg);
-	irq_data_update_affinity(data, mask);
-
-	return 0;
-}
-#endif /* CONFIG_SMP */
-
-static struct irq_chip dmar_msi_type = {
-	.name = "DMAR_MSI",
-	.irq_unmask = dmar_msi_unmask,
-	.irq_mask = dmar_msi_mask,
-	.irq_ack = ia64_ack_msi_irq,
-#ifdef CONFIG_SMP
-	.irq_set_affinity = dmar_msi_set_affinity,
-#endif
-	.irq_retrigger = ia64_msi_retrigger_irq,
-};
-
-static void
-msi_compose_msg(struct pci_dev *pdev, unsigned int irq, struct msi_msg *msg)
-{
-	struct irq_cfg *cfg = irq_cfg + irq;
-	unsigned dest;
-
-	dest = cpu_physical_id(cpumask_first_and(&(irq_to_domain(irq)),
-						 cpu_online_mask));
-
-	msg->address_hi = 0;
-	msg->address_lo =
-		MSI_ADDR_HEADER |
-		MSI_ADDR_DEST_MODE_PHYS |
-		MSI_ADDR_REDIRECTION_CPU |
-		MSI_ADDR_DEST_ID_CPU(dest);
-
-	msg->data =
-		MSI_DATA_TRIGGER_EDGE |
-		MSI_DATA_LEVEL_ASSERT |
-		MSI_DATA_DELIVERY_FIXED |
-		MSI_DATA_VECTOR(cfg->vector);
-}
-
-int dmar_alloc_hwirq(int id, int node, void *arg)
-{
-	int irq;
-	struct msi_msg msg;
-
-	irq = create_irq();
-	if (irq > 0) {
-		irq_set_handler_data(irq, arg);
-		irq_set_chip_and_handler_name(irq, &dmar_msi_type,
-					      handle_edge_irq, "edge");
-		msi_compose_msg(NULL, irq, &msg);
-		dmar_msi_write(irq, &msg);
-	}
-
-	return irq;
-}
-
-void dmar_free_hwirq(int irq)
-{
-	irq_set_handler_data(irq, NULL);
-	destroy_irq(irq);
-}
-#endif /* CONFIG_INTEL_IOMMU */
-
diff --git a/arch/ia64/kernel/numa.c b/arch/ia64/kernel/numa.c
deleted file mode 100644
index 8a959f20662d..000000000000
--- a/arch/ia64/kernel/numa.c
+++ /dev/null
@@ -1,73 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-or-later
-/*
- *
- * ia64 kernel NUMA specific stuff
- *
- * Copyright (C) 2002 Erich Focht <efocht@ess.nec.de>
- * Copyright (C) 2004 Silicon Graphics, Inc.
- *   Jesse Barnes <jbarnes@sgi.com>
- */
-#include <linux/topology.h>
-#include <linux/module.h>
-#include <asm/processor.h>
-#include <asm/smp.h>
-
-u16 cpu_to_node_map[NR_CPUS] __cacheline_aligned;
-EXPORT_SYMBOL(cpu_to_node_map);
-
-cpumask_t node_to_cpu_mask[MAX_NUMNODES] __cacheline_aligned;
-EXPORT_SYMBOL(node_to_cpu_mask);
-
-void map_cpu_to_node(int cpu, int nid)
-{
-	int oldnid;
-	if (nid < 0) { /* just initialize by zero */
-		cpu_to_node_map[cpu] = 0;
-		return;
-	}
-	/* sanity check first */
-	oldnid = cpu_to_node_map[cpu];
-	if (cpumask_test_cpu(cpu, &node_to_cpu_mask[oldnid])) {
-		return; /* nothing to do */
-	}
-	/* we don't have cpu-driven node hot add yet...
-	   In usual case, node is created from SRAT at boot time. */
-	if (!node_online(nid))
-		nid = first_online_node;
-	cpu_to_node_map[cpu] = nid;
-	cpumask_set_cpu(cpu, &node_to_cpu_mask[nid]);
-	return;
-}
-
-void unmap_cpu_from_node(int cpu, int nid)
-{
-	WARN_ON(!cpumask_test_cpu(cpu, &node_to_cpu_mask[nid]));
-	WARN_ON(cpu_to_node_map[cpu] != nid);
-	cpu_to_node_map[cpu] = 0;
-	cpumask_clear_cpu(cpu, &node_to_cpu_mask[nid]);
-}
-
-
-/**
- * build_cpu_to_node_map - setup cpu to node and node to cpumask arrays
- *
- * Build cpu to node mapping and initialize the per node cpu masks using
- * info from the node_cpuid array handed to us by ACPI.
- */
-void __init build_cpu_to_node_map(void)
-{
-	int cpu, i, node;
-
-	for(node=0; node < MAX_NUMNODES; node++)
-		cpumask_clear(&node_to_cpu_mask[node]);
-
-	for_each_possible_early_cpu(cpu) {
-		node = NUMA_NO_NODE;
-		for (i = 0; i < NR_CPUS; ++i)
-			if (cpu_physical_id(cpu) == node_cpuid[i].phys_id) {
-				node = node_cpuid[i].nid;
-				break;
-			}
-		map_cpu_to_node(cpu, node);
-	}
-}
diff --git a/arch/ia64/kernel/pal.S b/arch/ia64/kernel/pal.S
deleted file mode 100644
index fb6db6966f70..000000000000
--- a/arch/ia64/kernel/pal.S
+++ /dev/null
@@ -1,306 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-/*
- * PAL Firmware support
- * IA-64 Processor Programmers Reference Vol 2
- *
- * Copyright (C) 1999 Don Dugger <don.dugger@intel.com>
- * Copyright (C) 1999 Walt Drummond <drummond@valinux.com>
- * Copyright (C) 1999-2001, 2003 Hewlett-Packard Co
- *	David Mosberger <davidm@hpl.hp.com>
- *	Stephane Eranian <eranian@hpl.hp.com>
- *
- * 05/22/2000 eranian Added support for stacked register calls
- * 05/24/2000 eranian Added support for physical mode static calls
- */
-
-#include <linux/export.h>
-#include <asm/asmmacro.h>
-#include <asm/processor.h>
-
-	.data
-pal_entry_point:
-	data8 ia64_pal_default_handler
-	.text
-
-/*
- * Set the PAL entry point address.  This could be written in C code, but we
- * do it here to keep it all in one module (besides, it's so trivial that it's
- * not a big deal).
- *
- * in0		Address of the PAL entry point (text address, NOT a function
- *		descriptor).
- */
-GLOBAL_ENTRY(ia64_pal_handler_init)
-	alloc r3=ar.pfs,1,0,0,0
-	movl r2=pal_entry_point
-	;;
-	st8 [r2]=in0
-	br.ret.sptk.many rp
-END(ia64_pal_handler_init)
-
-/*
- * Default PAL call handler.  This needs to be coded in assembly because it
- * uses the static calling convention, i.e., the RSE may not be used and
- * calls are done via "br.cond" (not "br.call").
- */
-GLOBAL_ENTRY(ia64_pal_default_handler)
-	mov r8=-1
-	br.cond.sptk.many rp
-END(ia64_pal_default_handler)
-
-/*
- * Make a PAL call using the static calling convention.
- *
- * in0         Index of PAL service
- * in1 - in3   Remaining PAL arguments
- */
-GLOBAL_ENTRY(ia64_pal_call_static)
-	.prologue ASM_UNW_PRLG_RP|ASM_UNW_PRLG_PFS, ASM_UNW_PRLG_GRSAVE(4)
-	alloc loc1 = ar.pfs,4,5,0,0
-	movl loc2 = pal_entry_point
-1:	{
-	  mov r28 = in0
-	  mov r29 = in1
-	  mov r8 = ip
-	}
-	;;
-	ld8 loc2 = [loc2]		// loc2 <- entry point
-	adds r8 = 1f-1b,r8
-	mov loc4=ar.rsc			// save RSE configuration
-	;;
-	mov ar.rsc=0			// put RSE in enforced lazy, LE mode
-	mov loc3 = psr
-	mov loc0 = rp
-	.body
-	mov r30 = in2
-
-	mov r31 = in3
-	mov b7 = loc2
-
-	rsm psr.i
-	;;
-	mov rp = r8
-	br.cond.sptk.many b7
-1:	mov psr.l = loc3
-	mov ar.rsc = loc4		// restore RSE configuration
-	mov ar.pfs = loc1
-	mov rp = loc0
-	;;
-	srlz.d				// serialize restoration of psr.l
-	br.ret.sptk.many b0
-END(ia64_pal_call_static)
-EXPORT_SYMBOL(ia64_pal_call_static)
-
-/*
- * Make a PAL call using the stacked registers calling convention.
- *
- * Inputs:
- *	in0         Index of PAL service
- *	in2 - in3   Remaining PAL arguments
- */
-GLOBAL_ENTRY(ia64_pal_call_stacked)
-	.prologue ASM_UNW_PRLG_RP|ASM_UNW_PRLG_PFS, ASM_UNW_PRLG_GRSAVE(4)
-	alloc loc1 = ar.pfs,4,4,4,0
-	movl loc2 = pal_entry_point
-
-	mov r28  = in0			// Index MUST be copied to r28
-	mov out0 = in0			// AND in0 of PAL function
-	mov loc0 = rp
-	.body
-	;;
-	ld8 loc2 = [loc2]		// loc2 <- entry point
-	mov out1 = in1
-	mov out2 = in2
-	mov out3 = in3
-	mov loc3 = psr
-	;;
-	rsm psr.i
-	mov b7 = loc2
-	;;
-	br.call.sptk.many rp=b7		// now make the call
-.ret0:	mov psr.l  = loc3
-	mov ar.pfs = loc1
-	mov rp = loc0
-	;;
-	srlz.d				// serialize restoration of psr.l
-	br.ret.sptk.many b0
-END(ia64_pal_call_stacked)
-EXPORT_SYMBOL(ia64_pal_call_stacked)
-
-/*
- * Make a physical mode PAL call using the static registers calling convention.
- *
- * Inputs:
- *	in0         Index of PAL service
- *	in2 - in3   Remaining PAL arguments
- *
- * PSR_LP, PSR_TB, PSR_ID, PSR_DA are never set by the kernel.
- * So we don't need to clear them.
- */
-#define PAL_PSR_BITS_TO_CLEAR						      \
-	(IA64_PSR_I | IA64_PSR_IT | IA64_PSR_DT  | IA64_PSR_DB | IA64_PSR_RT |\
-	 IA64_PSR_DD | IA64_PSR_SS | IA64_PSR_RI | IA64_PSR_ED |	      \
-	 IA64_PSR_DFL | IA64_PSR_DFH)
-
-#define PAL_PSR_BITS_TO_SET						      \
-	(IA64_PSR_BN)
-
-
-GLOBAL_ENTRY(ia64_pal_call_phys_static)
-	.prologue ASM_UNW_PRLG_RP|ASM_UNW_PRLG_PFS, ASM_UNW_PRLG_GRSAVE(4)
-	alloc loc1 = ar.pfs,4,7,0,0
-	movl loc2 = pal_entry_point
-1:	{
-	  mov r28  = in0		// copy procedure index
-	  mov r8   = ip			// save ip to compute branch
-	  mov loc0 = rp			// save rp
-	}
-	.body
-	;;
-	ld8 loc2 = [loc2]		// loc2 <- entry point
-	mov r29  = in1			// first argument
-	mov r30  = in2			// copy arg2
-	mov r31  = in3			// copy arg3
-	;;
-	mov loc3 = psr			// save psr
-	adds r8  = 1f-1b,r8		// calculate return address for call
-	;;
-	mov loc4=ar.rsc			// save RSE configuration
-	dep.z loc2=loc2,0,61		// convert pal entry point to physical
-	tpa r8=r8			// convert rp to physical
-	;;
-	mov b7 = loc2			// install target to branch reg
-	mov ar.rsc=0			// put RSE in enforced lazy, LE mode
-	movl r16=PAL_PSR_BITS_TO_CLEAR
-	movl r17=PAL_PSR_BITS_TO_SET
-	;;
-	or loc3=loc3,r17		// add in psr the bits to set
-	;;
-	andcm r16=loc3,r16		// removes bits to clear from psr
-	br.call.sptk.many rp=ia64_switch_mode_phys
-	mov rp = r8			// install return address (physical)
-	mov loc5 = r19
-	mov loc6 = r20
-	br.cond.sptk.many b7
-1:
-	mov ar.rsc=0			// put RSE in enforced lazy, LE mode
-	mov r16=loc3			// r16= original psr
-	mov r19=loc5
-	mov r20=loc6
-	br.call.sptk.many rp=ia64_switch_mode_virt // return to virtual mode
-	mov psr.l = loc3		// restore init PSR
-
-	mov ar.pfs = loc1
-	mov rp = loc0
-	;;
-	mov ar.rsc=loc4			// restore RSE configuration
-	srlz.d				// serialize restoration of psr.l
-	br.ret.sptk.many b0
-END(ia64_pal_call_phys_static)
-EXPORT_SYMBOL(ia64_pal_call_phys_static)
-
-/*
- * Make a PAL call using the stacked registers in physical mode.
- *
- * Inputs:
- *	in0         Index of PAL service
- *	in2 - in3   Remaining PAL arguments
- */
-GLOBAL_ENTRY(ia64_pal_call_phys_stacked)
-	.prologue ASM_UNW_PRLG_RP|ASM_UNW_PRLG_PFS, ASM_UNW_PRLG_GRSAVE(5)
-	alloc	loc1 = ar.pfs,5,7,4,0
-	movl	loc2 = pal_entry_point
-1:	{
-	  mov r28  = in0		// copy procedure index
-	  mov loc0 = rp			// save rp
-	}
-	.body
-	;;
-	ld8 loc2 = [loc2]		// loc2 <- entry point
-	mov loc3 = psr			// save psr
-	;;
-	mov loc4=ar.rsc			// save RSE configuration
-	dep.z loc2=loc2,0,61		// convert pal entry point to physical
-	;;
-	mov ar.rsc=0			// put RSE in enforced lazy, LE mode
-	movl r16=PAL_PSR_BITS_TO_CLEAR
-	movl r17=PAL_PSR_BITS_TO_SET
-	;;
-	or loc3=loc3,r17		// add in psr the bits to set
-	mov b7 = loc2			// install target to branch reg
-	;;
-	andcm r16=loc3,r16		// removes bits to clear from psr
-	br.call.sptk.many rp=ia64_switch_mode_phys
-
-	mov out0 = in0			// first argument
-	mov out1 = in1			// copy arg2
-	mov out2 = in2			// copy arg3
-	mov out3 = in3			// copy arg3
-	mov loc5 = r19
-	mov loc6 = r20
-
-	br.call.sptk.many rp=b7		// now make the call
-
-	mov ar.rsc=0			// put RSE in enforced lazy, LE mode
-	mov r16=loc3			// r16= original psr
-	mov r19=loc5
-	mov r20=loc6
-	br.call.sptk.many rp=ia64_switch_mode_virt // return to virtual mode
-
-	mov psr.l  = loc3		// restore init PSR
-	mov ar.pfs = loc1
-	mov rp = loc0
-	;;
-	mov ar.rsc=loc4			// restore RSE configuration
-	srlz.d				// serialize restoration of psr.l
-	br.ret.sptk.many b0
-END(ia64_pal_call_phys_stacked)
-EXPORT_SYMBOL(ia64_pal_call_phys_stacked)
-
-/*
- * Save scratch fp scratch regs which aren't saved in pt_regs already
- * (fp10-fp15).
- *
- * NOTE: We need to do this since firmware (SAL and PAL) may use any of the
- * scratch regs fp-low partition.
- *
- * Inputs:
- *      in0	Address of stack storage for fp regs
- */
-GLOBAL_ENTRY(ia64_save_scratch_fpregs)
-	alloc r3=ar.pfs,1,0,0,0
-	add r2=16,in0
-	;;
-	stf.spill [in0] = f10,32
-	stf.spill [r2]  = f11,32
-	;;
-	stf.spill [in0] = f12,32
-	stf.spill [r2]  = f13,32
-	;;
-	stf.spill [in0] = f14,32
-	stf.spill [r2]  = f15,32
-	br.ret.sptk.many rp
-END(ia64_save_scratch_fpregs)
-EXPORT_SYMBOL(ia64_save_scratch_fpregs)
-
-/*
- * Load scratch fp scratch regs (fp10-fp15)
- *
- * Inputs:
- *      in0	Address of stack storage for fp regs
- */
-GLOBAL_ENTRY(ia64_load_scratch_fpregs)
-	alloc r3=ar.pfs,1,0,0,0
-	add r2=16,in0
-	;;
-	ldf.fill  f10 = [in0],32
-	ldf.fill  f11 = [r2],32
-	;;
-	ldf.fill  f12 = [in0],32
-	ldf.fill  f13 = [r2],32
-	;;
-	ldf.fill  f14 = [in0],32
-	ldf.fill  f15 = [r2],32
-	br.ret.sptk.many rp
-END(ia64_load_scratch_fpregs)
-EXPORT_SYMBOL(ia64_load_scratch_fpregs)
diff --git a/arch/ia64/kernel/palinfo.c b/arch/ia64/kernel/palinfo.c
deleted file mode 100644
index b9ae093bfe37..000000000000
--- a/arch/ia64/kernel/palinfo.c
+++ /dev/null
@@ -1,942 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * palinfo.c
- *
- * Prints processor specific information reported by PAL.
- * This code is based on specification of PAL as of the
- * Intel IA-64 Architecture Software Developer's Manual v1.0.
- *
- *
- * Copyright (C) 2000-2001, 2003 Hewlett-Packard Co
- *	Stephane Eranian <eranian@hpl.hp.com>
- * Copyright (C) 2004 Intel Corporation
- *  Ashok Raj <ashok.raj@intel.com>
- *
- * 05/26/2000	S.Eranian	initial release
- * 08/21/2000	S.Eranian	updated to July 2000 PAL specs
- * 02/05/2001   S.Eranian	fixed module support
- * 10/23/2001	S.Eranian	updated pal_perf_mon_info bug fixes
- * 03/24/2004	Ashok Raj	updated to work with CPU Hotplug
- * 10/26/2006   Russ Anderson	updated processor features to rev 2.2 spec
- */
-#include <linux/types.h>
-#include <linux/errno.h>
-#include <linux/init.h>
-#include <linux/proc_fs.h>
-#include <linux/seq_file.h>
-#include <linux/mm.h>
-#include <linux/module.h>
-#include <linux/efi.h>
-#include <linux/notifier.h>
-#include <linux/cpu.h>
-#include <linux/cpumask.h>
-
-#include <asm/pal.h>
-#include <asm/sal.h>
-#include <asm/page.h>
-#include <asm/processor.h>
-#include <linux/smp.h>
-
-MODULE_AUTHOR("Stephane Eranian <eranian@hpl.hp.com>");
-MODULE_DESCRIPTION("/proc interface to IA-64 PAL");
-MODULE_LICENSE("GPL");
-
-#define PALINFO_VERSION "0.5"
-
-typedef int (*palinfo_func_t)(struct seq_file *);
-
-typedef struct {
-	const char		*name;		/* name of the proc entry */
-	palinfo_func_t		proc_read;	/* function to call for reading */
-	struct proc_dir_entry	*entry;		/* registered entry (removal) */
-} palinfo_entry_t;
-
-
-/*
- *  A bunch of string array to get pretty printing
- */
-
-static const char *cache_types[] = {
-	"",			/* not used */
-	"Instruction",
-	"Data",
-	"Data/Instruction"	/* unified */
-};
-
-static const char *cache_mattrib[]={
-	"WriteThrough",
-	"WriteBack",
-	"",		/* reserved */
-	""		/* reserved */
-};
-
-static const char *cache_st_hints[]={
-	"Temporal, level 1",
-	"Reserved",
-	"Reserved",
-	"Non-temporal, all levels",
-	"Reserved",
-	"Reserved",
-	"Reserved",
-	"Reserved"
-};
-
-static const char *cache_ld_hints[]={
-	"Temporal, level 1",
-	"Non-temporal, level 1",
-	"Reserved",
-	"Non-temporal, all levels",
-	"Reserved",
-	"Reserved",
-	"Reserved",
-	"Reserved"
-};
-
-static const char *rse_hints[]={
-	"enforced lazy",
-	"eager stores",
-	"eager loads",
-	"eager loads and stores"
-};
-
-#define RSE_HINTS_COUNT ARRAY_SIZE(rse_hints)
-
-static const char *mem_attrib[]={
-	"WB",		/* 000 */
-	"SW",		/* 001 */
-	"010",		/* 010 */
-	"011",		/* 011 */
-	"UC",		/* 100 */
-	"UCE",		/* 101 */
-	"WC",		/* 110 */
-	"NaTPage"	/* 111 */
-};
-
-/*
- * Take a 64bit vector and produces a string such that
- * if bit n is set then 2^n in clear text is generated. The adjustment
- * to the right unit is also done.
- *
- * Input:
- *	- a pointer to a buffer to hold the string
- *	- a 64-bit vector
- * Output:
- *	- a pointer to the end of the buffer
- *
- */
-static void bitvector_process(struct seq_file *m, u64 vector)
-{
-	int i,j;
-	static const char *units[]={ "", "K", "M", "G", "T" };
-
-	for (i=0, j=0; i < 64; i++ , j=i/10) {
-		if (vector & 0x1)
-			seq_printf(m, "%d%s ", 1 << (i-j*10), units[j]);
-		vector >>= 1;
-	}
-}
-
-/*
- * Take a 64bit vector and produces a string such that
- * if bit n is set then register n is present. The function
- * takes into account consecutive registers and prints out ranges.
- *
- * Input:
- *	- a pointer to a buffer to hold the string
- *	- a 64-bit vector
- * Ouput:
- *	- a pointer to the end of the buffer
- *
- */
-static void bitregister_process(struct seq_file *m, u64 *reg_info, int max)
-{
-	int i, begin, skip = 0;
-	u64 value = reg_info[0];
-
-	value >>= i = begin = ffs(value) - 1;
-
-	for(; i < max; i++ ) {
-
-		if (i != 0 && (i%64) == 0) value = *++reg_info;
-
-		if ((value & 0x1) == 0 && skip == 0) {
-			if (begin  <= i - 2)
-				seq_printf(m, "%d-%d ", begin, i-1);
-			else
-				seq_printf(m, "%d ", i-1);
-			skip  = 1;
-			begin = -1;
-		} else if ((value & 0x1) && skip == 1) {
-			skip = 0;
-			begin = i;
-		}
-		value >>=1;
-	}
-	if (begin > -1) {
-		if (begin < 127)
-			seq_printf(m, "%d-127", begin);
-		else
-			seq_puts(m, "127");
-	}
-}
-
-static int power_info(struct seq_file *m)
-{
-	s64 status;
-	u64 halt_info_buffer[8];
-	pal_power_mgmt_info_u_t *halt_info =(pal_power_mgmt_info_u_t *)halt_info_buffer;
-	int i;
-
-	status = ia64_pal_halt_info(halt_info);
-	if (status != 0) return 0;
-
-	for (i=0; i < 8 ; i++ ) {
-		if (halt_info[i].pal_power_mgmt_info_s.im == 1) {
-			seq_printf(m,
-				   "Power level %d:\n"
-				   "\tentry_latency       : %d cycles\n"
-				   "\texit_latency        : %d cycles\n"
-				   "\tpower consumption   : %d mW\n"
-				   "\tCache+TLB coherency : %s\n", i,
-				   halt_info[i].pal_power_mgmt_info_s.entry_latency,
-				   halt_info[i].pal_power_mgmt_info_s.exit_latency,
-				   halt_info[i].pal_power_mgmt_info_s.power_consumption,
-				   halt_info[i].pal_power_mgmt_info_s.co ? "Yes" : "No");
-		} else {
-			seq_printf(m,"Power level %d: not implemented\n", i);
-		}
-	}
-	return 0;
-}
-
-static int cache_info(struct seq_file *m)
-{
-	unsigned long i, levels, unique_caches;
-	pal_cache_config_info_t cci;
-	int j, k;
-	long status;
-
-	if ((status = ia64_pal_cache_summary(&levels, &unique_caches)) != 0) {
-		printk(KERN_ERR "ia64_pal_cache_summary=%ld\n", status);
-		return 0;
-	}
-
-	seq_printf(m, "Cache levels  : %ld\nUnique caches : %ld\n\n",
-		   levels, unique_caches);
-
-	for (i=0; i < levels; i++) {
-		for (j=2; j >0 ; j--) {
-			/* even without unification some level may not be present */
-			if ((status=ia64_pal_cache_config_info(i,j, &cci)) != 0)
-				continue;
-
-			seq_printf(m,
-				   "%s Cache level %lu:\n"
-				   "\tSize           : %u bytes\n"
-				   "\tAttributes     : ",
-				   cache_types[j+cci.pcci_unified], i+1,
-				   cci.pcci_cache_size);
-
-			if (cci.pcci_unified)
-				seq_puts(m, "Unified ");
-
-			seq_printf(m, "%s\n", cache_mattrib[cci.pcci_cache_attr]);
-
-			seq_printf(m,
-				   "\tAssociativity  : %d\n"
-				   "\tLine size      : %d bytes\n"
-				   "\tStride         : %d bytes\n",
-				   cci.pcci_assoc,
-				   1<<cci.pcci_line_size,
-				   1<<cci.pcci_stride);
-			if (j == 1)
-				seq_puts(m, "\tStore latency  : N/A\n");
-			else
-				seq_printf(m, "\tStore latency  : %d cycle(s)\n",
-					   cci.pcci_st_latency);
-
-			seq_printf(m,
-				   "\tLoad latency   : %d cycle(s)\n"
-				   "\tStore hints    : ", cci.pcci_ld_latency);
-
-			for(k=0; k < 8; k++ ) {
-				if ( cci.pcci_st_hints & 0x1)
-					seq_printf(m, "[%s]", cache_st_hints[k]);
-				cci.pcci_st_hints >>=1;
-			}
-			seq_puts(m, "\n\tLoad hints     : ");
-
-			for(k=0; k < 8; k++ ) {
-				if (cci.pcci_ld_hints & 0x1)
-					seq_printf(m, "[%s]", cache_ld_hints[k]);
-				cci.pcci_ld_hints >>=1;
-			}
-			seq_printf(m,
-				   "\n\tAlias boundary : %d byte(s)\n"
-				   "\tTag LSB        : %d\n"
-				   "\tTag MSB        : %d\n",
-				   1<<cci.pcci_alias_boundary, cci.pcci_tag_lsb,
-				   cci.pcci_tag_msb);
-
-			/* when unified, data(j=2) is enough */
-			if (cci.pcci_unified)
-				break;
-		}
-	}
-	return 0;
-}
-
-
-static int vm_info(struct seq_file *m)
-{
-	u64 tr_pages =0, vw_pages=0, tc_pages;
-	u64 attrib;
-	pal_vm_info_1_u_t vm_info_1;
-	pal_vm_info_2_u_t vm_info_2;
-	pal_tc_info_u_t	tc_info;
-	ia64_ptce_info_t ptce;
-	const char *sep;
-	int i, j;
-	long status;
-
-	if ((status = ia64_pal_vm_summary(&vm_info_1, &vm_info_2)) !=0) {
-		printk(KERN_ERR "ia64_pal_vm_summary=%ld\n", status);
-	} else {
-
-		seq_printf(m,
-		     "Physical Address Space         : %d bits\n"
-		     "Virtual Address Space          : %d bits\n"
-		     "Protection Key Registers(PKR)  : %d\n"
-		     "Implemented bits in PKR.key    : %d\n"
-		     "Hash Tag ID                    : 0x%x\n"
-		     "Size of RR.rid                 : %d\n"
-		     "Max Purges                     : ",
-		     vm_info_1.pal_vm_info_1_s.phys_add_size,
-		     vm_info_2.pal_vm_info_2_s.impl_va_msb+1,
-		     vm_info_1.pal_vm_info_1_s.max_pkr+1,
-		     vm_info_1.pal_vm_info_1_s.key_size,
-		     vm_info_1.pal_vm_info_1_s.hash_tag_id,
-		     vm_info_2.pal_vm_info_2_s.rid_size);
-		if (vm_info_2.pal_vm_info_2_s.max_purges == PAL_MAX_PURGES)
-			seq_puts(m, "unlimited\n");
-		else
-			seq_printf(m, "%d\n",
-		     		vm_info_2.pal_vm_info_2_s.max_purges ?
-				vm_info_2.pal_vm_info_2_s.max_purges : 1);
-	}
-
-	if (ia64_pal_mem_attrib(&attrib) == 0) {
-		seq_puts(m, "Supported memory attributes    : ");
-		sep = "";
-		for (i = 0; i < 8; i++) {
-			if (attrib & (1 << i)) {
-				seq_printf(m, "%s%s", sep, mem_attrib[i]);
-				sep = ", ";
-			}
-		}
-		seq_putc(m, '\n');
-	}
-
-	if ((status = ia64_pal_vm_page_size(&tr_pages, &vw_pages)) !=0) {
-		printk(KERN_ERR "ia64_pal_vm_page_size=%ld\n", status);
-	} else {
-
-		seq_printf(m,
-			   "\nTLB walker                     : %simplemented\n"
-			   "Number of DTR                  : %d\n"
-			   "Number of ITR                  : %d\n"
-			   "TLB insertable page sizes      : ",
-			   vm_info_1.pal_vm_info_1_s.vw ? "" : "not ",
-			   vm_info_1.pal_vm_info_1_s.max_dtr_entry+1,
-			   vm_info_1.pal_vm_info_1_s.max_itr_entry+1);
-
-		bitvector_process(m, tr_pages);
-
-		seq_puts(m, "\nTLB purgeable page sizes       : ");
-
-		bitvector_process(m, vw_pages);
-	}
-
-	if ((status = ia64_get_ptce(&ptce)) != 0) {
-		printk(KERN_ERR "ia64_get_ptce=%ld\n", status);
-	} else {
-		seq_printf(m,
-		     "\nPurge base address             : 0x%016lx\n"
-		     "Purge outer loop count         : %d\n"
-		     "Purge inner loop count         : %d\n"
-		     "Purge outer loop stride        : %d\n"
-		     "Purge inner loop stride        : %d\n",
-		     ptce.base, ptce.count[0], ptce.count[1],
-		     ptce.stride[0], ptce.stride[1]);
-
-		seq_printf(m,
-		     "TC Levels                      : %d\n"
-		     "Unique TC(s)                   : %d\n",
-		     vm_info_1.pal_vm_info_1_s.num_tc_levels,
-		     vm_info_1.pal_vm_info_1_s.max_unique_tcs);
-
-		for(i=0; i < vm_info_1.pal_vm_info_1_s.num_tc_levels; i++) {
-			for (j=2; j>0 ; j--) {
-				tc_pages = 0; /* just in case */
-
-				/* even without unification, some levels may not be present */
-				if ((status=ia64_pal_vm_info(i,j, &tc_info, &tc_pages)) != 0)
-					continue;
-
-				seq_printf(m,
-				     "\n%s Translation Cache Level %d:\n"
-				     "\tHash sets           : %d\n"
-				     "\tAssociativity       : %d\n"
-				     "\tNumber of entries   : %d\n"
-				     "\tFlags               : ",
-				     cache_types[j+tc_info.tc_unified], i+1,
-				     tc_info.tc_num_sets,
-				     tc_info.tc_associativity,
-				     tc_info.tc_num_entries);
-
-				if (tc_info.tc_pf)
-					seq_puts(m, "PreferredPageSizeOptimized ");
-				if (tc_info.tc_unified)
-					seq_puts(m, "Unified ");
-				if (tc_info.tc_reduce_tr)
-					seq_puts(m, "TCReduction");
-
-				seq_puts(m, "\n\tSupported page sizes: ");
-
-				bitvector_process(m, tc_pages);
-
-				/* when unified date (j=2) is enough */
-				if (tc_info.tc_unified)
-					break;
-			}
-		}
-	}
-
-	seq_putc(m, '\n');
-	return 0;
-}
-
-
-static int register_info(struct seq_file *m)
-{
-	u64 reg_info[2];
-	u64 info;
-	unsigned long phys_stacked;
-	pal_hints_u_t hints;
-	unsigned long iregs, dregs;
-	static const char * const info_type[] = {
-		"Implemented AR(s)",
-		"AR(s) with read side-effects",
-		"Implemented CR(s)",
-		"CR(s) with read side-effects",
-	};
-
-	for(info=0; info < 4; info++) {
-		if (ia64_pal_register_info(info, &reg_info[0], &reg_info[1]) != 0)
-			return 0;
-		seq_printf(m, "%-32s : ", info_type[info]);
-		bitregister_process(m, reg_info, 128);
-		seq_putc(m, '\n');
-	}
-
-	if (ia64_pal_rse_info(&phys_stacked, &hints) == 0)
-		seq_printf(m,
-			   "RSE stacked physical registers   : %ld\n"
-			   "RSE load/store hints             : %ld (%s)\n",
-			   phys_stacked, hints.ph_data,
-			   hints.ph_data < RSE_HINTS_COUNT ? rse_hints[hints.ph_data]: "(??)");
-
-	if (ia64_pal_debug_info(&iregs, &dregs))
-		return 0;
-
-	seq_printf(m,
-		   "Instruction debug register pairs : %ld\n"
-		   "Data debug register pairs        : %ld\n", iregs, dregs);
-
-	return 0;
-}
-
-static const char *const proc_features_0[]={		/* Feature set 0 */
-	NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,
-	NULL,NULL,NULL,NULL,NULL,NULL,NULL, NULL,NULL,
-	NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,
-	NULL,NULL,NULL,NULL,NULL, NULL,NULL,NULL,NULL,
-	"Unimplemented instruction address fault",
-	"INIT, PMI, and LINT pins",
-	"Simple unimplemented instr addresses",
-	"Variable P-state performance",
-	"Virtual machine features implemented",
-	"XIP,XPSR,XFS implemented",
-	"XR1-XR3 implemented",
-	"Disable dynamic predicate prediction",
-	"Disable processor physical number",
-	"Disable dynamic data cache prefetch",
-	"Disable dynamic inst cache prefetch",
-	"Disable dynamic branch prediction",
-	NULL, NULL, NULL, NULL,
-	"Disable P-states",
-	"Enable MCA on Data Poisoning",
-	"Enable vmsw instruction",
-	"Enable extern environmental notification",
-	"Disable BINIT on processor time-out",
-	"Disable dynamic power management (DPM)",
-	"Disable coherency",
-	"Disable cache",
-	"Enable CMCI promotion",
-	"Enable MCA to BINIT promotion",
-	"Enable MCA promotion",
-	"Enable BERR promotion"
-};
-
-static const char *const proc_features_16[]={		/* Feature set 16 */
-	"Disable ETM",
-	"Enable ETM",
-	"Enable MCA on half-way timer",
-	"Enable snoop WC",
-	NULL,
-	"Enable Fast Deferral",
-	"Disable MCA on memory aliasing",
-	"Enable RSB",
-	NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
-	"DP system processor",
-	"Low Voltage",
-	"HT supported",
-	NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
-	NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
-	NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
-	NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
-	NULL, NULL, NULL, NULL, NULL
-};
-
-static const char *const *const proc_features[]={
-	proc_features_0,
-	NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
-	NULL, NULL, NULL, NULL,
-	proc_features_16,
-	NULL, NULL, NULL, NULL,
-};
-
-static void feature_set_info(struct seq_file *m, u64 avail, u64 status, u64 control,
-			     unsigned long set)
-{
-	const char *const *vf, *const *v;
-	int i;
-
-	vf = v = proc_features[set];
-	for(i=0; i < 64; i++, avail >>=1, status >>=1, control >>=1) {
-
-		if (!(control))		/* No remaining bits set */
-			break;
-		if (!(avail & 0x1))	/* Print only bits that are available */
-			continue;
-		if (vf)
-			v = vf + i;
-		if ( v && *v ) {
-			seq_printf(m, "%-40s : %s %s\n", *v,
-				avail & 0x1 ? (status & 0x1 ?
-					      "On " : "Off"): "",
-				avail & 0x1 ? (control & 0x1 ?
-						"Ctrl" : "NoCtrl"): "");
-		} else {
-			seq_printf(m, "Feature set %2ld bit %2d\t\t\t"
-					" : %s %s\n",
-				set, i,
-				avail & 0x1 ? (status & 0x1 ?
-						"On " : "Off"): "",
-				avail & 0x1 ? (control & 0x1 ?
-						"Ctrl" : "NoCtrl"): "");
-		}
-	}
-}
-
-static int processor_info(struct seq_file *m)
-{
-	u64 avail=1, status=1, control=1, feature_set=0;
-	s64 ret;
-
-	do {
-		ret = ia64_pal_proc_get_features(&avail, &status, &control,
-						feature_set);
-		if (ret < 0)
-			return 0;
-
-		if (ret == 1) {
-			feature_set++;
-			continue;
-		}
-
-		feature_set_info(m, avail, status, control, feature_set);
-		feature_set++;
-	} while(1);
-
-	return 0;
-}
-
-static const char *const bus_features[]={
-	NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,
-	NULL,NULL,NULL,NULL,NULL,NULL,NULL, NULL,NULL,
-	NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,
-	NULL,NULL,
-	"Request  Bus Parking",
-	"Bus Lock Mask",
-	"Enable Half Transfer",
-	NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
-	NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
-	NULL, NULL, NULL, NULL,
-	"Enable Cache Line Repl. Shared",
-	"Enable Cache Line Repl. Exclusive",
-	"Disable Transaction Queuing",
-	"Disable Response Error Checking",
-	"Disable Bus Error Checking",
-	"Disable Bus Requester Internal Error Signalling",
-	"Disable Bus Requester Error Signalling",
-	"Disable Bus Initialization Event Checking",
-	"Disable Bus Initialization Event Signalling",
-	"Disable Bus Address Error Checking",
-	"Disable Bus Address Error Signalling",
-	"Disable Bus Data Error Checking"
-};
-
-
-static int bus_info(struct seq_file *m)
-{
-	const char *const *v = bus_features;
-	pal_bus_features_u_t av, st, ct;
-	u64 avail, status, control;
-	int i;
-	s64 ret;
-
-	if ((ret=ia64_pal_bus_get_features(&av, &st, &ct)) != 0)
-		return 0;
-
-	avail   = av.pal_bus_features_val;
-	status  = st.pal_bus_features_val;
-	control = ct.pal_bus_features_val;
-
-	for(i=0; i < 64; i++, v++, avail >>=1, status >>=1, control >>=1) {
-		if ( ! *v )
-			continue;
-		seq_printf(m, "%-48s : %s%s %s\n", *v,
-			   avail & 0x1 ? "" : "NotImpl",
-			   avail & 0x1 ? (status  & 0x1 ? "On" : "Off"): "",
-			   avail & 0x1 ? (control & 0x1 ? "Ctrl" : "NoCtrl"): "");
-	}
-	return 0;
-}
-
-static int version_info(struct seq_file *m)
-{
-	pal_version_u_t min_ver, cur_ver;
-
-	if (ia64_pal_version(&min_ver, &cur_ver) != 0)
-		return 0;
-
-	seq_printf(m,
-		   "PAL_vendor : 0x%02x (min=0x%02x)\n"
-		   "PAL_A      : %02x.%02x (min=%02x.%02x)\n"
-		   "PAL_B      : %02x.%02x (min=%02x.%02x)\n",
-		   cur_ver.pal_version_s.pv_pal_vendor,
-		   min_ver.pal_version_s.pv_pal_vendor,
-		   cur_ver.pal_version_s.pv_pal_a_model,
-		   cur_ver.pal_version_s.pv_pal_a_rev,
-		   min_ver.pal_version_s.pv_pal_a_model,
-		   min_ver.pal_version_s.pv_pal_a_rev,
-		   cur_ver.pal_version_s.pv_pal_b_model,
-		   cur_ver.pal_version_s.pv_pal_b_rev,
-		   min_ver.pal_version_s.pv_pal_b_model,
-		   min_ver.pal_version_s.pv_pal_b_rev);
-	return 0;
-}
-
-static int frequency_info(struct seq_file *m)
-{
-	struct pal_freq_ratio proc, itc, bus;
-	unsigned long base;
-
-	if (ia64_pal_freq_base(&base) == -1)
-		seq_puts(m, "Output clock            : not implemented\n");
-	else
-		seq_printf(m, "Output clock            : %ld ticks/s\n", base);
-
-	if (ia64_pal_freq_ratios(&proc, &bus, &itc) != 0) return 0;
-
-	seq_printf(m,
-		     "Processor/Clock ratio   : %d/%d\n"
-		     "Bus/Clock ratio         : %d/%d\n"
-		     "ITC/Clock ratio         : %d/%d\n",
-		     proc.num, proc.den, bus.num, bus.den, itc.num, itc.den);
-	return 0;
-}
-
-static int tr_info(struct seq_file *m)
-{
-	long status;
-	pal_tr_valid_u_t tr_valid;
-	u64 tr_buffer[4];
-	pal_vm_info_1_u_t vm_info_1;
-	pal_vm_info_2_u_t vm_info_2;
-	unsigned long i, j;
-	unsigned long max[3], pgm;
-	struct ifa_reg {
-		unsigned long valid:1;
-		unsigned long ig:11;
-		unsigned long vpn:52;
-	} *ifa_reg;
-	struct itir_reg {
-		unsigned long rv1:2;
-		unsigned long ps:6;
-		unsigned long key:24;
-		unsigned long rv2:32;
-	} *itir_reg;
-	struct gr_reg {
-		unsigned long p:1;
-		unsigned long rv1:1;
-		unsigned long ma:3;
-		unsigned long a:1;
-		unsigned long d:1;
-		unsigned long pl:2;
-		unsigned long ar:3;
-		unsigned long ppn:38;
-		unsigned long rv2:2;
-		unsigned long ed:1;
-		unsigned long ig:11;
-	} *gr_reg;
-	struct rid_reg {
-		unsigned long ig1:1;
-		unsigned long rv1:1;
-		unsigned long ig2:6;
-		unsigned long rid:24;
-		unsigned long rv2:32;
-	} *rid_reg;
-
-	if ((status = ia64_pal_vm_summary(&vm_info_1, &vm_info_2)) !=0) {
-		printk(KERN_ERR "ia64_pal_vm_summary=%ld\n", status);
-		return 0;
-	}
-	max[0] = vm_info_1.pal_vm_info_1_s.max_itr_entry+1;
-	max[1] = vm_info_1.pal_vm_info_1_s.max_dtr_entry+1;
-
-	for (i=0; i < 2; i++ ) {
-		for (j=0; j < max[i]; j++) {
-
-		status = ia64_pal_tr_read(j, i, tr_buffer, &tr_valid);
-		if (status != 0) {
-			printk(KERN_ERR "palinfo: pal call failed on tr[%lu:%lu]=%ld\n",
-			       i, j, status);
-			continue;
-		}
-
-		ifa_reg  = (struct ifa_reg *)&tr_buffer[2];
-
-		if (ifa_reg->valid == 0)
-			continue;
-
-		gr_reg   = (struct gr_reg *)tr_buffer;
-		itir_reg = (struct itir_reg *)&tr_buffer[1];
-		rid_reg  = (struct rid_reg *)&tr_buffer[3];
-
-		pgm	 = -1 << (itir_reg->ps - 12);
-		seq_printf(m,
-			   "%cTR%lu: av=%d pv=%d dv=%d mv=%d\n"
-			   "\tppn  : 0x%lx\n"
-			   "\tvpn  : 0x%lx\n"
-			   "\tps   : ",
-			   "ID"[i], j,
-			   tr_valid.pal_tr_valid_s.access_rights_valid,
-			   tr_valid.pal_tr_valid_s.priv_level_valid,
-			   tr_valid.pal_tr_valid_s.dirty_bit_valid,
-			   tr_valid.pal_tr_valid_s.mem_attr_valid,
-			   (gr_reg->ppn & pgm)<< 12, (ifa_reg->vpn & pgm)<< 12);
-
-		bitvector_process(m, 1<< itir_reg->ps);
-
-		seq_printf(m,
-			   "\n\tpl   : %d\n"
-			   "\tar   : %d\n"
-			   "\trid  : %x\n"
-			   "\tp    : %d\n"
-			   "\tma   : %d\n"
-			   "\td    : %d\n",
-			   gr_reg->pl, gr_reg->ar, rid_reg->rid, gr_reg->p, gr_reg->ma,
-			   gr_reg->d);
-		}
-	}
-	return 0;
-}
-
-
-
-/*
- * List {name,function} pairs for every entry in /proc/palinfo/cpu*
- */
-static const palinfo_entry_t palinfo_entries[]={
-	{ "version_info",	version_info, },
-	{ "vm_info",		vm_info, },
-	{ "cache_info",		cache_info, },
-	{ "power_info",		power_info, },
-	{ "register_info",	register_info, },
-	{ "processor_info",	processor_info, },
-	{ "frequency_info",	frequency_info, },
-	{ "bus_info",		bus_info },
-	{ "tr_info",		tr_info, }
-};
-
-#define NR_PALINFO_ENTRIES	(int) ARRAY_SIZE(palinfo_entries)
-
-static struct proc_dir_entry *palinfo_dir;
-
-/*
- * This data structure is used to pass which cpu,function is being requested
- * It must fit in a 64bit quantity to be passed to the proc callback routine
- *
- * In SMP mode, when we get a request for another CPU, we must call that
- * other CPU using IPI and wait for the result before returning.
- */
-typedef union {
-	u64 value;
-	struct {
-		unsigned	req_cpu: 32;	/* for which CPU this info is */
-		unsigned	func_id: 32;	/* which function is requested */
-	} pal_func_cpu;
-} pal_func_cpu_u_t;
-
-#define req_cpu	pal_func_cpu.req_cpu
-#define func_id pal_func_cpu.func_id
-
-#ifdef CONFIG_SMP
-
-/*
- * used to hold information about final function to call
- */
-typedef struct {
-	palinfo_func_t	func;	/* pointer to function to call */
-	struct seq_file *m;	/* buffer to store results */
-	int		ret;	/* return value from call */
-} palinfo_smp_data_t;
-
-
-/*
- * this function does the actual final call and he called
- * from the smp code, i.e., this is the palinfo callback routine
- */
-static void
-palinfo_smp_call(void *info)
-{
-	palinfo_smp_data_t *data = (palinfo_smp_data_t *)info;
-	data->ret = (*data->func)(data->m);
-}
-
-/*
- * function called to trigger the IPI, we need to access a remote CPU
- * Return:
- *	0 : error or nothing to output
- *	otherwise how many bytes in the "page" buffer were written
- */
-static
-int palinfo_handle_smp(struct seq_file *m, pal_func_cpu_u_t *f)
-{
-	palinfo_smp_data_t ptr;
-	int ret;
-
-	ptr.func = palinfo_entries[f->func_id].proc_read;
-	ptr.m = m;
-	ptr.ret  = 0; /* just in case */
-
-
-	/* will send IPI to other CPU and wait for completion of remote call */
-	if ((ret=smp_call_function_single(f->req_cpu, palinfo_smp_call, &ptr, 1))) {
-		printk(KERN_ERR "palinfo: remote CPU call from %d to %d on function %d: "
-		       "error %d\n", smp_processor_id(), f->req_cpu, f->func_id, ret);
-		return 0;
-	}
-	return ptr.ret;
-}
-#else /* ! CONFIG_SMP */
-static
-int palinfo_handle_smp(struct seq_file *m, pal_func_cpu_u_t *f)
-{
-	printk(KERN_ERR "palinfo: should not be called with non SMP kernel\n");
-	return 0;
-}
-#endif /* CONFIG_SMP */
-
-/*
- * Entry point routine: all calls go through this function
- */
-static int proc_palinfo_show(struct seq_file *m, void *v)
-{
-	pal_func_cpu_u_t *f = (pal_func_cpu_u_t *)&m->private;
-
-	/*
-	 * in SMP mode, we may need to call another CPU to get correct
-	 * information. PAL, by definition, is processor specific
-	 */
-	if (f->req_cpu == get_cpu())
-		(*palinfo_entries[f->func_id].proc_read)(m);
-	else
-		palinfo_handle_smp(m, f);
-
-	put_cpu();
-	return 0;
-}
-
-static int palinfo_add_proc(unsigned int cpu)
-{
-	pal_func_cpu_u_t f;
-	struct proc_dir_entry *cpu_dir;
-	int j;
-	char cpustr[3+4+1];	/* cpu numbers are up to 4095 on itanic */
-	sprintf(cpustr, "cpu%d", cpu);
-
-	cpu_dir = proc_mkdir(cpustr, palinfo_dir);
-	if (!cpu_dir)
-		return -EINVAL;
-
-	f.req_cpu = cpu;
-
-	for (j=0; j < NR_PALINFO_ENTRIES; j++) {
-		f.func_id = j;
-		proc_create_single_data(palinfo_entries[j].name, 0, cpu_dir,
-				proc_palinfo_show, (void *)f.value);
-	}
-	return 0;
-}
-
-static int palinfo_del_proc(unsigned int hcpu)
-{
-	char cpustr[3+4+1];	/* cpu numbers are up to 4095 on itanic */
-
-	sprintf(cpustr, "cpu%d", hcpu);
-	remove_proc_subtree(cpustr, palinfo_dir);
-	return 0;
-}
-
-static enum cpuhp_state hp_online;
-
-static int __init palinfo_init(void)
-{
-	int i = 0;
-
-	printk(KERN_INFO "PAL Information Facility v%s\n", PALINFO_VERSION);
-	palinfo_dir = proc_mkdir("pal", NULL);
-	if (!palinfo_dir)
-		return -ENOMEM;
-
-	i = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "ia64/palinfo:online",
-			      palinfo_add_proc, palinfo_del_proc);
-	if (i < 0) {
-		remove_proc_subtree("pal", NULL);
-		return i;
-	}
-	hp_online = i;
-	return 0;
-}
-
-static void __exit palinfo_exit(void)
-{
-	cpuhp_remove_state(hp_online);
-	remove_proc_subtree("pal", NULL);
-}
-
-module_init(palinfo_init);
-module_exit(palinfo_exit);
diff --git a/arch/ia64/kernel/patch.c b/arch/ia64/kernel/patch.c
deleted file mode 100644
index 7f21a8c57ed7..000000000000
--- a/arch/ia64/kernel/patch.c
+++ /dev/null
@@ -1,237 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * Instruction-patching support.
- *
- * Copyright (C) 2003 Hewlett-Packard Co
- *	David Mosberger-Tang <davidm@hpl.hp.com>
- */
-#include <linux/init.h>
-#include <linux/string.h>
-
-#include <asm/patch.h>
-#include <asm/processor.h>
-#include <asm/sections.h>
-#include <asm/unistd.h>
-
-/*
- * This was adapted from code written by Tony Luck:
- *
- * The 64-bit value in a "movl reg=value" is scattered between the two words of the bundle
- * like this:
- *
- * 6  6         5         4         3         2         1
- * 3210987654321098765432109876543210987654321098765432109876543210
- * ABBBBBBBBBBBBBBBBBBBBBBBCCCCCCCCCCCCCCCCCCDEEEEEFFFFFFFFFGGGGGGG
- *
- * CCCCCCCCCCCCCCCCCCxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx
- * xxxxAFFFFFFFFFEEEEEDxGGGGGGGxxxxxxxxxxxxxBBBBBBBBBBBBBBBBBBBBBBB
- */
-static u64
-get_imm64 (u64 insn_addr)
-{
-	u64 *p = (u64 *) (insn_addr & -16);	/* mask out slot number */
-
-	return ( (p[1] & 0x0800000000000000UL) << 4)  | /*A*/
-		((p[1] & 0x00000000007fffffUL) << 40) | /*B*/
-		((p[0] & 0xffffc00000000000UL) >> 24) | /*C*/
-		((p[1] & 0x0000100000000000UL) >> 23) | /*D*/
-		((p[1] & 0x0003e00000000000UL) >> 29) | /*E*/
-		((p[1] & 0x07fc000000000000UL) >> 43) | /*F*/
-		((p[1] & 0x000007f000000000UL) >> 36);  /*G*/
-}
-
-/* Patch instruction with "val" where "mask" has 1 bits. */
-void
-ia64_patch (u64 insn_addr, u64 mask, u64 val)
-{
-	u64 m0, m1, v0, v1, b0, b1, *b = (u64 *) (insn_addr & -16);
-#	define insn_mask ((1UL << 41) - 1)
-	unsigned long shift;
-
-	b0 = b[0]; b1 = b[1];
-	shift = 5 + 41 * (insn_addr % 16); /* 5 bits of template, then 3 x 41-bit instructions */
-	if (shift >= 64) {
-		m1 = mask << (shift - 64);
-		v1 = val << (shift - 64);
-	} else {
-		m0 = mask << shift; m1 = mask >> (64 - shift);
-		v0 = val  << shift; v1 = val >> (64 - shift);
-		b[0] = (b0 & ~m0) | (v0 & m0);
-	}
-	b[1] = (b1 & ~m1) | (v1 & m1);
-}
-
-void
-ia64_patch_imm64 (u64 insn_addr, u64 val)
-{
-	/* The assembler may generate offset pointing to either slot 1
-	   or slot 2 for a long (2-slot) instruction, occupying slots 1
-	   and 2.  */
-  	insn_addr &= -16UL;
-	ia64_patch(insn_addr + 2,
-		   0x01fffefe000UL, (  ((val & 0x8000000000000000UL) >> 27) /* bit 63 -> 36 */
-				     | ((val & 0x0000000000200000UL) <<  0) /* bit 21 -> 21 */
-				     | ((val & 0x00000000001f0000UL) <<  6) /* bit 16 -> 22 */
-				     | ((val & 0x000000000000ff80UL) << 20) /* bit  7 -> 27 */
-				     | ((val & 0x000000000000007fUL) << 13) /* bit  0 -> 13 */));
-	ia64_patch(insn_addr + 1, 0x1ffffffffffUL, val >> 22);
-}
-
-void
-ia64_patch_imm60 (u64 insn_addr, u64 val)
-{
-	/* The assembler may generate offset pointing to either slot 1
-	   or slot 2 for a long (2-slot) instruction, occupying slots 1
-	   and 2.  */
-  	insn_addr &= -16UL;
-	ia64_patch(insn_addr + 2,
-		   0x011ffffe000UL, (  ((val & 0x0800000000000000UL) >> 23) /* bit 59 -> 36 */
-				     | ((val & 0x00000000000fffffUL) << 13) /* bit  0 -> 13 */));
-	ia64_patch(insn_addr + 1, 0x1fffffffffcUL, val >> 18);
-}
-
-/*
- * We need sometimes to load the physical address of a kernel
- * object.  Often we can convert the virtual address to physical
- * at execution time, but sometimes (either for performance reasons
- * or during error recovery) we cannot to this.  Patch the marked
- * bundles to load the physical address.
- */
-void __init
-ia64_patch_vtop (unsigned long start, unsigned long end)
-{
-	s32 *offp = (s32 *) start;
-	u64 ip;
-
-	while (offp < (s32 *) end) {
-		ip = (u64) offp + *offp;
-
-		/* replace virtual address with corresponding physical address: */
-		ia64_patch_imm64(ip, ia64_tpa(get_imm64(ip)));
-		ia64_fc((void *) ip);
-		++offp;
-	}
-	ia64_sync_i();
-	ia64_srlz_i();
-}
-
-/*
- * Disable the RSE workaround by turning the conditional branch
- * that we tagged in each place the workaround was used into an
- * unconditional branch.
- */
-void __init
-ia64_patch_rse (unsigned long start, unsigned long end)
-{
-	s32 *offp = (s32 *) start;
-	u64 ip, *b;
-
-	while (offp < (s32 *) end) {
-		ip = (u64) offp + *offp;
-
-		b = (u64 *)(ip & -16);
-		b[1] &= ~0xf800000L;
-		ia64_fc((void *) ip);
-		++offp;
-	}
-	ia64_sync_i();
-	ia64_srlz_i();
-}
-
-void __init
-ia64_patch_mckinley_e9 (unsigned long start, unsigned long end)
-{
-	static int first_time = 1;
-	int need_workaround;
-	s32 *offp = (s32 *) start;
-	u64 *wp;
-
-	need_workaround = (local_cpu_data->family == 0x1f && local_cpu_data->model == 0);
-
-	if (first_time) {
-		first_time = 0;
-		if (need_workaround)
-			printk(KERN_INFO "Leaving McKinley Errata 9 workaround enabled\n");
-	}
-	if (need_workaround)
-		return;
-
-	while (offp < (s32 *) end) {
-		wp = (u64 *) ia64_imva((char *) offp + *offp);
-		wp[0] = 0x0000000100000011UL; /* nop.m 0; nop.i 0; br.ret.sptk.many b6 */
-		wp[1] = 0x0084006880000200UL;
-		wp[2] = 0x0000000100000000UL; /* nop.m 0; nop.i 0; nop.i 0 */
-		wp[3] = 0x0004000000000200UL;
-		ia64_fc(wp); ia64_fc(wp + 2);
-		++offp;
-	}
-	ia64_sync_i();
-	ia64_srlz_i();
-}
-
-static void __init
-patch_fsyscall_table (unsigned long start, unsigned long end)
-{
-	extern unsigned long fsyscall_table[NR_syscalls];
-	s32 *offp = (s32 *) start;
-	u64 ip;
-
-	while (offp < (s32 *) end) {
-		ip = (u64) ia64_imva((char *) offp + *offp);
-		ia64_patch_imm64(ip, (u64) fsyscall_table);
-		ia64_fc((void *) ip);
-		++offp;
-	}
-	ia64_sync_i();
-	ia64_srlz_i();
-}
-
-static void __init
-patch_brl_fsys_bubble_down (unsigned long start, unsigned long end)
-{
-	extern char fsys_bubble_down[];
-	s32 *offp = (s32 *) start;
-	u64 ip;
-
-	while (offp < (s32 *) end) {
-		ip = (u64) offp + *offp;
-		ia64_patch_imm60((u64) ia64_imva((void *) ip),
-				 (u64) (fsys_bubble_down - (ip & -16)) / 16);
-		ia64_fc((void *) ip);
-		++offp;
-	}
-	ia64_sync_i();
-	ia64_srlz_i();
-}
-
-void __init
-ia64_patch_gate (void)
-{
-#	define START(name)	((unsigned long) __start_gate_##name##_patchlist)
-#	define END(name)	((unsigned long)__end_gate_##name##_patchlist)
-
-	patch_fsyscall_table(START(fsyscall), END(fsyscall));
-	patch_brl_fsys_bubble_down(START(brl_fsys_bubble_down), END(brl_fsys_bubble_down));
-	ia64_patch_vtop(START(vtop), END(vtop));
-	ia64_patch_mckinley_e9(START(mckinley_e9), END(mckinley_e9));
-}
-
-void ia64_patch_phys_stack_reg(unsigned long val)
-{
-	s32 * offp = (s32 *) __start___phys_stack_reg_patchlist;
-	s32 * end = (s32 *) __end___phys_stack_reg_patchlist;
-	u64 ip, mask, imm;
-
-	/* see instruction format A4: adds r1 = imm13, r3 */
-	mask = (0x3fUL << 27) | (0x7f << 13);
-	imm = (((val >> 7) & 0x3f) << 27) | (val & 0x7f) << 13;
-
-	while (offp < end) {
-		ip = (u64) offp + *offp;
-		ia64_patch(ip, mask, imm);
-		ia64_fc((void *)ip);
-		++offp;
-	}
-	ia64_sync_i();
-	ia64_srlz_i();
-}
diff --git a/arch/ia64/kernel/pci-dma.c b/arch/ia64/kernel/pci-dma.c
deleted file mode 100644
index c90221733c6b..000000000000
--- a/arch/ia64/kernel/pci-dma.c
+++ /dev/null
@@ -1,33 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * Dynamic DMA mapping support.
- */
-
-#include <linux/types.h>
-#include <linux/mm.h>
-#include <linux/string.h>
-#include <linux/pci.h>
-#include <linux/module.h>
-#include <linux/dmar.h>
-#include <asm/iommu.h>
-#include <linux/dma-mapping.h>
-#include <linux/kernel.h>
-#include <asm/page.h>
-
-int no_iommu __read_mostly;
-#ifdef CONFIG_IOMMU_DEBUG
-int force_iommu __read_mostly = 1;
-#else
-int force_iommu __read_mostly;
-#endif
-
-static int __init pci_iommu_init(void)
-{
-	if (iommu_detected)
-		intel_iommu_init();
-
-	return 0;
-}
-
-/* Must execute after PCI subsystem */
-fs_initcall(pci_iommu_init);
diff --git a/arch/ia64/kernel/perfmon_itanium.h b/arch/ia64/kernel/perfmon_itanium.h
deleted file mode 100644
index dbd04028aafa..000000000000
--- a/arch/ia64/kernel/perfmon_itanium.h
+++ /dev/null
@@ -1,116 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-/*
- * This file contains the Itanium PMU register description tables
- * and pmc checker.
- *
- * Copyright (C) 2002-2003  Hewlett Packard Co
- *               Stephane Eranian <eranian@hpl.hp.com>
- */
-static int pfm_ita_pmc_check(struct task_struct *task, pfm_context_t *ctx, unsigned int cnum, unsigned long *val, struct pt_regs *regs);
-
-static pfm_reg_desc_t pfm_ita_pmc_desc[PMU_MAX_PMCS]={
-/* pmc0  */ { PFM_REG_CONTROL , 0, 0x1UL, -1UL, NULL, NULL, {0UL,0UL, 0UL, 0UL}, {0UL,0UL, 0UL, 0UL}},
-/* pmc1  */ { PFM_REG_CONTROL , 0, 0x0UL, -1UL, NULL, NULL, {0UL,0UL, 0UL, 0UL}, {0UL,0UL, 0UL, 0UL}},
-/* pmc2  */ { PFM_REG_CONTROL , 0, 0x0UL, -1UL, NULL, NULL, {0UL,0UL, 0UL, 0UL}, {0UL,0UL, 0UL, 0UL}},
-/* pmc3  */ { PFM_REG_CONTROL , 0, 0x0UL, -1UL, NULL, NULL, {0UL,0UL, 0UL, 0UL}, {0UL,0UL, 0UL, 0UL}},
-/* pmc4  */ { PFM_REG_COUNTING, 6, 0x0UL, -1UL, NULL, NULL, {RDEP(4),0UL, 0UL, 0UL}, {0UL,0UL, 0UL, 0UL}},
-/* pmc5  */ { PFM_REG_COUNTING, 6, 0x0UL, -1UL, NULL, NULL, {RDEP(5),0UL, 0UL, 0UL}, {0UL,0UL, 0UL, 0UL}},
-/* pmc6  */ { PFM_REG_COUNTING, 6, 0x0UL, -1UL, NULL, NULL, {RDEP(6),0UL, 0UL, 0UL}, {0UL,0UL, 0UL, 0UL}},
-/* pmc7  */ { PFM_REG_COUNTING, 6, 0x0UL, -1UL, NULL, NULL, {RDEP(7),0UL, 0UL, 0UL}, {0UL,0UL, 0UL, 0UL}},
-/* pmc8  */ { PFM_REG_CONFIG  , 0, 0xf00000003ffffff8UL, -1UL, NULL, NULL, {0UL,0UL, 0UL, 0UL}, {0UL,0UL, 0UL, 0UL}},
-/* pmc9  */ { PFM_REG_CONFIG  , 0, 0xf00000003ffffff8UL, -1UL, NULL, NULL, {0UL,0UL, 0UL, 0UL}, {0UL,0UL, 0UL, 0UL}},
-/* pmc10 */ { PFM_REG_MONITOR , 6, 0x0UL, -1UL, NULL, NULL, {RDEP(0)|RDEP(1),0UL, 0UL, 0UL}, {0UL,0UL, 0UL, 0UL}},
-/* pmc11 */ { PFM_REG_MONITOR , 6, 0x0000000010000000UL, -1UL, NULL, pfm_ita_pmc_check, {RDEP(2)|RDEP(3)|RDEP(17),0UL, 0UL, 0UL}, {0UL,0UL, 0UL, 0UL}},
-/* pmc12 */ { PFM_REG_MONITOR , 6, 0x0UL, -1UL, NULL, NULL, {RDEP(8)|RDEP(9)|RDEP(10)|RDEP(11)|RDEP(12)|RDEP(13)|RDEP(14)|RDEP(15)|RDEP(16),0UL, 0UL, 0UL}, {0UL,0UL, 0UL, 0UL}},
-/* pmc13 */ { PFM_REG_CONFIG  , 0, 0x0003ffff00000001UL, -1UL, NULL, pfm_ita_pmc_check, {0UL,0UL, 0UL, 0UL}, {0UL,0UL, 0UL, 0UL}},
-	    { PFM_REG_END     , 0, 0x0UL, -1UL, NULL, NULL, {0,}, {0,}}, /* end marker */
-};
-
-static pfm_reg_desc_t pfm_ita_pmd_desc[PMU_MAX_PMDS]={
-/* pmd0  */ { PFM_REG_BUFFER  , 0, 0UL, -1UL, NULL, NULL, {RDEP(1),0UL, 0UL, 0UL}, {RDEP(10),0UL, 0UL, 0UL}},
-/* pmd1  */ { PFM_REG_BUFFER  , 0, 0UL, -1UL, NULL, NULL, {RDEP(0),0UL, 0UL, 0UL}, {RDEP(10),0UL, 0UL, 0UL}},
-/* pmd2  */ { PFM_REG_BUFFER  , 0, 0UL, -1UL, NULL, NULL, {RDEP(3)|RDEP(17),0UL, 0UL, 0UL}, {RDEP(11),0UL, 0UL, 0UL}},
-/* pmd3  */ { PFM_REG_BUFFER  , 0, 0UL, -1UL, NULL, NULL, {RDEP(2)|RDEP(17),0UL, 0UL, 0UL}, {RDEP(11),0UL, 0UL, 0UL}},
-/* pmd4  */ { PFM_REG_COUNTING, 0, 0UL, -1UL, NULL, NULL, {0UL,0UL, 0UL, 0UL}, {RDEP(4),0UL, 0UL, 0UL}},
-/* pmd5  */ { PFM_REG_COUNTING, 0, 0UL, -1UL, NULL, NULL, {0UL,0UL, 0UL, 0UL}, {RDEP(5),0UL, 0UL, 0UL}},
-/* pmd6  */ { PFM_REG_COUNTING, 0, 0UL, -1UL, NULL, NULL, {0UL,0UL, 0UL, 0UL}, {RDEP(6),0UL, 0UL, 0UL}},
-/* pmd7  */ { PFM_REG_COUNTING, 0, 0UL, -1UL, NULL, NULL, {0UL,0UL, 0UL, 0UL}, {RDEP(7),0UL, 0UL, 0UL}},
-/* pmd8  */ { PFM_REG_BUFFER  , 0, 0UL, -1UL, NULL, NULL, {RDEP(9)|RDEP(10)|RDEP(11)|RDEP(12)|RDEP(13)|RDEP(14)|RDEP(15)|RDEP(16),0UL, 0UL, 0UL}, {RDEP(12),0UL, 0UL, 0UL}},
-/* pmd9  */ { PFM_REG_BUFFER  , 0, 0UL, -1UL, NULL, NULL, {RDEP(8)|RDEP(10)|RDEP(11)|RDEP(12)|RDEP(13)|RDEP(14)|RDEP(15)|RDEP(16),0UL, 0UL, 0UL}, {RDEP(12),0UL, 0UL, 0UL}},
-/* pmd10 */ { PFM_REG_BUFFER  , 0, 0UL, -1UL, NULL, NULL, {RDEP(8)|RDEP(9)|RDEP(11)|RDEP(12)|RDEP(13)|RDEP(14)|RDEP(15)|RDEP(16),0UL, 0UL, 0UL}, {RDEP(12),0UL, 0UL, 0UL}},
-/* pmd11 */ { PFM_REG_BUFFER  , 0, 0UL, -1UL, NULL, NULL, {RDEP(8)|RDEP(9)|RDEP(10)|RDEP(12)|RDEP(13)|RDEP(14)|RDEP(15)|RDEP(16),0UL, 0UL, 0UL}, {RDEP(12),0UL, 0UL, 0UL}},
-/* pmd12 */ { PFM_REG_BUFFER  , 0, 0UL, -1UL, NULL, NULL, {RDEP(8)|RDEP(9)|RDEP(10)|RDEP(11)|RDEP(13)|RDEP(14)|RDEP(15)|RDEP(16),0UL, 0UL, 0UL}, {RDEP(12),0UL, 0UL, 0UL}},
-/* pmd13 */ { PFM_REG_BUFFER  , 0, 0UL, -1UL, NULL, NULL, {RDEP(8)|RDEP(9)|RDEP(10)|RDEP(11)|RDEP(12)|RDEP(14)|RDEP(15)|RDEP(16),0UL, 0UL, 0UL}, {RDEP(12),0UL, 0UL, 0UL}},
-/* pmd14 */ { PFM_REG_BUFFER  , 0, 0UL, -1UL, NULL, NULL, {RDEP(8)|RDEP(9)|RDEP(10)|RDEP(11)|RDEP(12)|RDEP(13)|RDEP(15)|RDEP(16),0UL, 0UL, 0UL}, {RDEP(12),0UL, 0UL, 0UL}},
-/* pmd15 */ { PFM_REG_BUFFER  , 0, 0UL, -1UL, NULL, NULL, {RDEP(8)|RDEP(9)|RDEP(10)|RDEP(11)|RDEP(12)|RDEP(13)|RDEP(14)|RDEP(16),0UL, 0UL, 0UL}, {RDEP(12),0UL, 0UL, 0UL}},
-/* pmd16 */ { PFM_REG_BUFFER  , 0, 0UL, -1UL, NULL, NULL, {RDEP(8)|RDEP(9)|RDEP(10)|RDEP(11)|RDEP(12)|RDEP(13)|RDEP(14)|RDEP(15),0UL, 0UL, 0UL}, {RDEP(12),0UL, 0UL, 0UL}},
-/* pmd17 */ { PFM_REG_BUFFER  , 0, 0UL, -1UL, NULL, NULL, {RDEP(2)|RDEP(3),0UL, 0UL, 0UL}, {RDEP(11),0UL, 0UL, 0UL}},
-	    { PFM_REG_END     , 0, 0UL, -1UL, NULL, NULL, {0,}, {0,}}, /* end marker */
-};
-
-static int
-pfm_ita_pmc_check(struct task_struct *task, pfm_context_t *ctx, unsigned int cnum, unsigned long *val, struct pt_regs *regs)
-{
-	int ret;
-	int is_loaded;
-
-	/* sanitfy check */
-	if (ctx == NULL) return -EINVAL;
-
-	is_loaded = ctx->ctx_state == PFM_CTX_LOADED || ctx->ctx_state == PFM_CTX_MASKED;
-
-	/*
-	 * we must clear the (instruction) debug registers if pmc13.ta bit is cleared
-	 * before they are written (fl_using_dbreg==0) to avoid picking up stale information.
-	 */
-	if (cnum == 13 && is_loaded && ((*val & 0x1) == 0UL) && ctx->ctx_fl_using_dbreg == 0) {
-
-		DPRINT(("pmc[%d]=0x%lx has active pmc13.ta cleared, clearing ibr\n", cnum, *val));
-
-		/* don't mix debug with perfmon */
-		if (task && (task->thread.flags & IA64_THREAD_DBG_VALID) != 0) return -EINVAL;
-
-		/*
-		 * a count of 0 will mark the debug registers as in use and also
-		 * ensure that they are properly cleared.
-		 */
-		ret = pfm_write_ibr_dbr(1, ctx, NULL, 0, regs);
-		if (ret) return ret;
-	}
-
-	/*
-	 * we must clear the (data) debug registers if pmc11.pt bit is cleared
-	 * before they are written (fl_using_dbreg==0) to avoid picking up stale information.
-	 */
-	if (cnum == 11 && is_loaded && ((*val >> 28)& 0x1) == 0 && ctx->ctx_fl_using_dbreg == 0) {
-
-		DPRINT(("pmc[%d]=0x%lx has active pmc11.pt cleared, clearing dbr\n", cnum, *val));
-
-		/* don't mix debug with perfmon */
-		if (task && (task->thread.flags & IA64_THREAD_DBG_VALID) != 0) return -EINVAL;
-
-		/*
-		 * a count of 0 will mark the debug registers as in use and also
-		 * ensure that they are properly cleared.
-		 */
-		ret = pfm_write_ibr_dbr(0, ctx, NULL, 0, regs);
-		if (ret) return ret;
-	}
-	return 0;
-}
-
-/*
- * impl_pmcs, impl_pmds are computed at runtime to minimize errors!
- */
-static pmu_config_t pmu_conf_ita={
-	.pmu_name      = "Itanium",
-	.pmu_family    = 0x7,
-	.ovfl_val      = (1UL << 32) - 1,
-	.pmd_desc      = pfm_ita_pmd_desc,
-	.pmc_desc      = pfm_ita_pmc_desc,
-	.num_ibrs      = 8,
-	.num_dbrs      = 8,
-	.use_rr_dbregs = 1, /* debug register are use for range retrictions */
-};
-
-
diff --git a/arch/ia64/kernel/process.c b/arch/ia64/kernel/process.c
deleted file mode 100644
index 9a5cd9fad3a9..000000000000
--- a/arch/ia64/kernel/process.c
+++ /dev/null
@@ -1,611 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * Architecture-specific setup.
- *
- * Copyright (C) 1998-2003 Hewlett-Packard Co
- *	David Mosberger-Tang <davidm@hpl.hp.com>
- * 04/11/17 Ashok Raj	<ashok.raj@intel.com> Added CPU Hotplug Support
- *
- * 2005-10-07 Keith Owens <kaos@sgi.com>
- *	      Add notify_die() hooks.
- */
-#include <linux/cpu.h>
-#include <linux/pm.h>
-#include <linux/elf.h>
-#include <linux/errno.h>
-#include <linux/kernel.h>
-#include <linux/mm.h>
-#include <linux/slab.h>
-#include <linux/module.h>
-#include <linux/notifier.h>
-#include <linux/personality.h>
-#include <linux/reboot.h>
-#include <linux/sched.h>
-#include <linux/sched/debug.h>
-#include <linux/sched/hotplug.h>
-#include <linux/sched/task.h>
-#include <linux/sched/task_stack.h>
-#include <linux/stddef.h>
-#include <linux/thread_info.h>
-#include <linux/unistd.h>
-#include <linux/efi.h>
-#include <linux/interrupt.h>
-#include <linux/delay.h>
-#include <linux/kdebug.h>
-#include <linux/utsname.h>
-#include <linux/resume_user_mode.h>
-#include <linux/rcupdate.h>
-
-#include <asm/cpu.h>
-#include <asm/delay.h>
-#include <asm/elf.h>
-#include <asm/irq.h>
-#include <asm/kexec.h>
-#include <asm/processor.h>
-#include <asm/sal.h>
-#include <asm/switch_to.h>
-#include <asm/tlbflush.h>
-#include <linux/uaccess.h>
-#include <asm/unwind.h>
-#include <asm/user.h>
-#include <asm/xtp.h>
-
-#include "entry.h"
-
-#include "sigframe.h"
-
-void (*ia64_mark_idle)(int);
-
-unsigned long boot_option_idle_override = IDLE_NO_OVERRIDE;
-EXPORT_SYMBOL(boot_option_idle_override);
-void (*pm_power_off) (void);
-EXPORT_SYMBOL(pm_power_off);
-
-static void
-ia64_do_show_stack (struct unw_frame_info *info, void *arg)
-{
-	unsigned long ip, sp, bsp;
-	const char *loglvl = arg;
-
-	printk("%s\nCall Trace:\n", loglvl);
-	do {
-		unw_get_ip(info, &ip);
-		if (ip == 0)
-			break;
-
-		unw_get_sp(info, &sp);
-		unw_get_bsp(info, &bsp);
-		printk("%s [<%016lx>] %pS\n"
-			 "                                sp=%016lx bsp=%016lx\n",
-			 loglvl, ip, (void *)ip, sp, bsp);
-	} while (unw_unwind(info) >= 0);
-}
-
-void
-show_stack (struct task_struct *task, unsigned long *sp, const char *loglvl)
-{
-	if (!task)
-		unw_init_running(ia64_do_show_stack, (void *)loglvl);
-	else {
-		struct unw_frame_info info;
-
-		unw_init_from_blocked_task(&info, task);
-		ia64_do_show_stack(&info, (void *)loglvl);
-	}
-}
-
-void
-show_regs (struct pt_regs *regs)
-{
-	unsigned long ip = regs->cr_iip + ia64_psr(regs)->ri;
-
-	print_modules();
-	printk("\n");
-	show_regs_print_info(KERN_DEFAULT);
-	printk("psr : %016lx ifs : %016lx ip  : [<%016lx>]    %s (%s)\n",
-	       regs->cr_ipsr, regs->cr_ifs, ip, print_tainted(),
-	       init_utsname()->release);
-	printk("ip is at %pS\n", (void *)ip);
-	printk("unat: %016lx pfs : %016lx rsc : %016lx\n",
-	       regs->ar_unat, regs->ar_pfs, regs->ar_rsc);
-	printk("rnat: %016lx bsps: %016lx pr  : %016lx\n",
-	       regs->ar_rnat, regs->ar_bspstore, regs->pr);
-	printk("ldrs: %016lx ccv : %016lx fpsr: %016lx\n",
-	       regs->loadrs, regs->ar_ccv, regs->ar_fpsr);
-	printk("csd : %016lx ssd : %016lx\n", regs->ar_csd, regs->ar_ssd);
-	printk("b0  : %016lx b6  : %016lx b7  : %016lx\n", regs->b0, regs->b6, regs->b7);
-	printk("f6  : %05lx%016lx f7  : %05lx%016lx\n",
-	       regs->f6.u.bits[1], regs->f6.u.bits[0],
-	       regs->f7.u.bits[1], regs->f7.u.bits[0]);
-	printk("f8  : %05lx%016lx f9  : %05lx%016lx\n",
-	       regs->f8.u.bits[1], regs->f8.u.bits[0],
-	       regs->f9.u.bits[1], regs->f9.u.bits[0]);
-	printk("f10 : %05lx%016lx f11 : %05lx%016lx\n",
-	       regs->f10.u.bits[1], regs->f10.u.bits[0],
-	       regs->f11.u.bits[1], regs->f11.u.bits[0]);
-
-	printk("r1  : %016lx r2  : %016lx r3  : %016lx\n", regs->r1, regs->r2, regs->r3);
-	printk("r8  : %016lx r9  : %016lx r10 : %016lx\n", regs->r8, regs->r9, regs->r10);
-	printk("r11 : %016lx r12 : %016lx r13 : %016lx\n", regs->r11, regs->r12, regs->r13);
-	printk("r14 : %016lx r15 : %016lx r16 : %016lx\n", regs->r14, regs->r15, regs->r16);
-	printk("r17 : %016lx r18 : %016lx r19 : %016lx\n", regs->r17, regs->r18, regs->r19);
-	printk("r20 : %016lx r21 : %016lx r22 : %016lx\n", regs->r20, regs->r21, regs->r22);
-	printk("r23 : %016lx r24 : %016lx r25 : %016lx\n", regs->r23, regs->r24, regs->r25);
-	printk("r26 : %016lx r27 : %016lx r28 : %016lx\n", regs->r26, regs->r27, regs->r28);
-	printk("r29 : %016lx r30 : %016lx r31 : %016lx\n", regs->r29, regs->r30, regs->r31);
-
-	if (user_mode(regs)) {
-		/* print the stacked registers */
-		unsigned long val, *bsp, ndirty;
-		int i, sof, is_nat = 0;
-
-		sof = regs->cr_ifs & 0x7f;	/* size of frame */
-		ndirty = (regs->loadrs >> 19);
-		bsp = ia64_rse_skip_regs((unsigned long *) regs->ar_bspstore, ndirty);
-		for (i = 0; i < sof; ++i) {
-			get_user(val, (unsigned long __user *) ia64_rse_skip_regs(bsp, i));
-			printk("r%-3u:%c%016lx%s", 32 + i, is_nat ? '*' : ' ', val,
-			       ((i == sof - 1) || (i % 3) == 2) ? "\n" : " ");
-		}
-	} else
-		show_stack(NULL, NULL, KERN_DEFAULT);
-}
-
-/* local support for deprecated console_print */
-void
-console_print(const char *s)
-{
-	printk(KERN_EMERG "%s", s);
-}
-
-void
-do_notify_resume_user(sigset_t *unused, struct sigscratch *scr, long in_syscall)
-{
-	if (fsys_mode(current, &scr->pt)) {
-		/*
-		 * defer signal-handling etc. until we return to
-		 * privilege-level 0.
-		 */
-		if (!ia64_psr(&scr->pt)->lp)
-			ia64_psr(&scr->pt)->lp = 1;
-		return;
-	}
-
-	/* deal with pending signal delivery */
-	if (test_thread_flag(TIF_SIGPENDING) ||
-	    test_thread_flag(TIF_NOTIFY_SIGNAL)) {
-		local_irq_enable();	/* force interrupt enable */
-		ia64_do_signal(scr, in_syscall);
-	}
-
-	if (test_thread_flag(TIF_NOTIFY_RESUME)) {
-		local_irq_enable();	/* force interrupt enable */
-		resume_user_mode_work(&scr->pt);
-	}
-
-	/* copy user rbs to kernel rbs */
-	if (unlikely(test_thread_flag(TIF_RESTORE_RSE))) {
-		local_irq_enable();	/* force interrupt enable */
-		ia64_sync_krbs();
-	}
-
-	local_irq_disable();	/* force interrupt disable */
-}
-
-static int __init nohalt_setup(char * str)
-{
-	cpu_idle_poll_ctrl(true);
-	return 1;
-}
-__setup("nohalt", nohalt_setup);
-
-#ifdef CONFIG_HOTPLUG_CPU
-/* We don't actually take CPU down, just spin without interrupts. */
-static inline void __noreturn play_dead(void)
-{
-	unsigned int this_cpu = smp_processor_id();
-
-	/* Ack it */
-	__this_cpu_write(cpu_state, CPU_DEAD);
-
-	max_xtp();
-	local_irq_disable();
-	idle_task_exit();
-	ia64_jump_to_sal(&sal_boot_rendez_state[this_cpu]);
-	/*
-	 * The above is a point of no-return, the processor is
-	 * expected to be in SAL loop now.
-	 */
-	BUG();
-}
-#else
-static inline void __noreturn play_dead(void)
-{
-	BUG();
-}
-#endif /* CONFIG_HOTPLUG_CPU */
-
-void __noreturn arch_cpu_idle_dead(void)
-{
-	play_dead();
-}
-
-void arch_cpu_idle(void)
-{
-	void (*mark_idle)(int) = ia64_mark_idle;
-
-#ifdef CONFIG_SMP
-	min_xtp();
-#endif
-	rmb();
-	if (mark_idle)
-		(*mark_idle)(1);
-
-	raw_safe_halt();
-	raw_local_irq_disable();
-
-	if (mark_idle)
-		(*mark_idle)(0);
-#ifdef CONFIG_SMP
-	normal_xtp();
-#endif
-}
-
-void
-ia64_save_extra (struct task_struct *task)
-{
-	if ((task->thread.flags & IA64_THREAD_DBG_VALID) != 0)
-		ia64_save_debug_regs(&task->thread.dbr[0]);
-}
-
-void
-ia64_load_extra (struct task_struct *task)
-{
-	if ((task->thread.flags & IA64_THREAD_DBG_VALID) != 0)
-		ia64_load_debug_regs(&task->thread.dbr[0]);
-}
-
-/*
- * Copy the state of an ia-64 thread.
- *
- * We get here through the following  call chain:
- *
- *	from user-level:	from kernel:
- *
- *	<clone syscall>	        <some kernel call frames>
- *	sys_clone		   :
- *	kernel_clone		kernel_clone
- *	copy_thread		copy_thread
- *
- * This means that the stack layout is as follows:
- *
- *	+---------------------+ (highest addr)
- *	|   struct pt_regs    |
- *	+---------------------+
- *	| struct switch_stack |
- *	+---------------------+
- *	|                     |
- *	|    memory stack     |
- *	|                     | <-- sp (lowest addr)
- *	+---------------------+
- *
- * Observe that we copy the unat values that are in pt_regs and switch_stack.  Spilling an
- * integer to address X causes bit N in ar.unat to be set to the NaT bit of the register,
- * with N=(X & 0x1ff)/8.  Thus, copying the unat value preserves the NaT bits ONLY if the
- * pt_regs structure in the parent is congruent to that of the child, modulo 512.  Since
- * the stack is page aligned and the page size is at least 4KB, this is always the case,
- * so there is nothing to worry about.
- */
-int
-copy_thread(struct task_struct *p, const struct kernel_clone_args *args)
-{
-	unsigned long clone_flags = args->flags;
-	unsigned long user_stack_base = args->stack;
-	unsigned long user_stack_size = args->stack_size;
-	unsigned long tls = args->tls;
-	extern char ia64_ret_from_clone;
-	struct switch_stack *child_stack, *stack;
-	unsigned long rbs, child_rbs, rbs_size;
-	struct pt_regs *child_ptregs;
-	struct pt_regs *regs = current_pt_regs();
-	int retval = 0;
-
-	child_ptregs = (struct pt_regs *) ((unsigned long) p + IA64_STK_OFFSET) - 1;
-	child_stack = (struct switch_stack *) child_ptregs - 1;
-
-	rbs = (unsigned long) current + IA64_RBS_OFFSET;
-	child_rbs = (unsigned long) p + IA64_RBS_OFFSET;
-
-	/* copy parts of thread_struct: */
-	p->thread.ksp = (unsigned long) child_stack - 16;
-
-	/*
-	 * NOTE: The calling convention considers all floating point
-	 * registers in the high partition (fph) to be scratch.  Since
-	 * the only way to get to this point is through a system call,
-	 * we know that the values in fph are all dead.  Hence, there
-	 * is no need to inherit the fph state from the parent to the
-	 * child and all we have to do is to make sure that
-	 * IA64_THREAD_FPH_VALID is cleared in the child.
-	 *
-	 * XXX We could push this optimization a bit further by
-	 * clearing IA64_THREAD_FPH_VALID on ANY system call.
-	 * However, it's not clear this is worth doing.  Also, it
-	 * would be a slight deviation from the normal Linux system
-	 * call behavior where scratch registers are preserved across
-	 * system calls (unless used by the system call itself).
-	 */
-#	define THREAD_FLAGS_TO_CLEAR	(IA64_THREAD_FPH_VALID | IA64_THREAD_DBG_VALID \
-					 | IA64_THREAD_PM_VALID)
-#	define THREAD_FLAGS_TO_SET	0
-	p->thread.flags = ((current->thread.flags & ~THREAD_FLAGS_TO_CLEAR)
-			   | THREAD_FLAGS_TO_SET);
-
-	ia64_drop_fpu(p);	/* don't pick up stale state from a CPU's fph */
-
-	if (unlikely(args->fn)) {
-		if (unlikely(args->idle)) {
-			/* fork_idle() called us */
-			return 0;
-		}
-		memset(child_stack, 0, sizeof(*child_ptregs) + sizeof(*child_stack));
-		child_stack->r4 = (unsigned long) args->fn;
-		child_stack->r5 = (unsigned long) args->fn_arg;
-		/*
-		 * Preserve PSR bits, except for bits 32-34 and 37-45,
-		 * which we can't read.
-		 */
-		child_ptregs->cr_ipsr = ia64_getreg(_IA64_REG_PSR) | IA64_PSR_BN;
-		/* mark as valid, empty frame */
-		child_ptregs->cr_ifs = 1UL << 63;
-		child_stack->ar_fpsr = child_ptregs->ar_fpsr
-			= ia64_getreg(_IA64_REG_AR_FPSR);
-		child_stack->pr = (1 << PRED_KERNEL_STACK);
-		child_stack->ar_bspstore = child_rbs;
-		child_stack->b0 = (unsigned long) &ia64_ret_from_clone;
-
-		/* stop some PSR bits from being inherited.
-		 * the psr.up/psr.pp bits must be cleared on fork but inherited on execve()
-		 * therefore we must specify them explicitly here and not include them in
-		 * IA64_PSR_BITS_TO_CLEAR.
-		 */
-		child_ptregs->cr_ipsr = ((child_ptregs->cr_ipsr | IA64_PSR_BITS_TO_SET)
-				 & ~(IA64_PSR_BITS_TO_CLEAR | IA64_PSR_PP | IA64_PSR_UP));
-
-		return 0;
-	}
-	stack = ((struct switch_stack *) regs) - 1;
-	/* copy parent's switch_stack & pt_regs to child: */
-	memcpy(child_stack, stack, sizeof(*child_ptregs) + sizeof(*child_stack));
-
-	/* copy the parent's register backing store to the child: */
-	rbs_size = stack->ar_bspstore - rbs;
-	memcpy((void *) child_rbs, (void *) rbs, rbs_size);
-	if (clone_flags & CLONE_SETTLS)
-		child_ptregs->r13 = tls;
-	if (user_stack_base) {
-		child_ptregs->r12 = user_stack_base + user_stack_size - 16;
-		child_ptregs->ar_bspstore = user_stack_base;
-		child_ptregs->ar_rnat = 0;
-		child_ptregs->loadrs = 0;
-	}
-	child_stack->ar_bspstore = child_rbs + rbs_size;
-	child_stack->b0 = (unsigned long) &ia64_ret_from_clone;
-
-	/* stop some PSR bits from being inherited.
-	 * the psr.up/psr.pp bits must be cleared on fork but inherited on execve()
-	 * therefore we must specify them explicitly here and not include them in
-	 * IA64_PSR_BITS_TO_CLEAR.
-	 */
-	child_ptregs->cr_ipsr = ((child_ptregs->cr_ipsr | IA64_PSR_BITS_TO_SET)
-				 & ~(IA64_PSR_BITS_TO_CLEAR | IA64_PSR_PP | IA64_PSR_UP));
-	return retval;
-}
-
-asmlinkage long ia64_clone(unsigned long clone_flags, unsigned long stack_start,
-			   unsigned long stack_size, unsigned long parent_tidptr,
-			   unsigned long child_tidptr, unsigned long tls)
-{
-	struct kernel_clone_args args = {
-		.flags		= (lower_32_bits(clone_flags) & ~CSIGNAL),
-		.pidfd		= (int __user *)parent_tidptr,
-		.child_tid	= (int __user *)child_tidptr,
-		.parent_tid	= (int __user *)parent_tidptr,
-		.exit_signal	= (lower_32_bits(clone_flags) & CSIGNAL),
-		.stack		= stack_start,
-		.stack_size	= stack_size,
-		.tls		= tls,
-	};
-
-	return kernel_clone(&args);
-}
-
-static void
-do_copy_task_regs (struct task_struct *task, struct unw_frame_info *info, void *arg)
-{
-	unsigned long mask, sp, nat_bits = 0, ar_rnat, urbs_end, cfm;
-	unsigned long ip;
-	elf_greg_t *dst = arg;
-	struct pt_regs *pt;
-	char nat;
-	int i;
-
-	memset(dst, 0, sizeof(elf_gregset_t));	/* don't leak any kernel bits to user-level */
-
-	if (unw_unwind_to_user(info) < 0)
-		return;
-
-	unw_get_sp(info, &sp);
-	pt = (struct pt_regs *) (sp + 16);
-
-	urbs_end = ia64_get_user_rbs_end(task, pt, &cfm);
-
-	if (ia64_sync_user_rbs(task, info->sw, pt->ar_bspstore, urbs_end) < 0)
-		return;
-
-	ia64_peek(task, info->sw, urbs_end, (long) ia64_rse_rnat_addr((long *) urbs_end),
-		  &ar_rnat);
-
-	/*
-	 * coredump format:
-	 *	r0-r31
-	 *	NaT bits (for r0-r31; bit N == 1 iff rN is a NaT)
-	 *	predicate registers (p0-p63)
-	 *	b0-b7
-	 *	ip cfm user-mask
-	 *	ar.rsc ar.bsp ar.bspstore ar.rnat
-	 *	ar.ccv ar.unat ar.fpsr ar.pfs ar.lc ar.ec
-	 */
-
-	/* r0 is zero */
-	for (i = 1, mask = (1UL << i); i < 32; ++i) {
-		unw_get_gr(info, i, &dst[i], &nat);
-		if (nat)
-			nat_bits |= mask;
-		mask <<= 1;
-	}
-	dst[32] = nat_bits;
-	unw_get_pr(info, &dst[33]);
-
-	for (i = 0; i < 8; ++i)
-		unw_get_br(info, i, &dst[34 + i]);
-
-	unw_get_rp(info, &ip);
-	dst[42] = ip + ia64_psr(pt)->ri;
-	dst[43] = cfm;
-	dst[44] = pt->cr_ipsr & IA64_PSR_UM;
-
-	unw_get_ar(info, UNW_AR_RSC, &dst[45]);
-	/*
-	 * For bsp and bspstore, unw_get_ar() would return the kernel
-	 * addresses, but we need the user-level addresses instead:
-	 */
-	dst[46] = urbs_end;	/* note: by convention PT_AR_BSP points to the end of the urbs! */
-	dst[47] = pt->ar_bspstore;
-	dst[48] = ar_rnat;
-	unw_get_ar(info, UNW_AR_CCV, &dst[49]);
-	unw_get_ar(info, UNW_AR_UNAT, &dst[50]);
-	unw_get_ar(info, UNW_AR_FPSR, &dst[51]);
-	dst[52] = pt->ar_pfs;	/* UNW_AR_PFS is == to pt->cr_ifs for interrupt frames */
-	unw_get_ar(info, UNW_AR_LC, &dst[53]);
-	unw_get_ar(info, UNW_AR_EC, &dst[54]);
-	unw_get_ar(info, UNW_AR_CSD, &dst[55]);
-	unw_get_ar(info, UNW_AR_SSD, &dst[56]);
-}
-
-static void
-do_copy_regs (struct unw_frame_info *info, void *arg)
-{
-	do_copy_task_regs(current, info, arg);
-}
-
-void
-ia64_elf_core_copy_regs (struct pt_regs *pt, elf_gregset_t dst)
-{
-	unw_init_running(do_copy_regs, dst);
-}
-
-/*
- * Flush thread state.  This is called when a thread does an execve().
- */
-void
-flush_thread (void)
-{
-	/* drop floating-point and debug-register state if it exists: */
-	current->thread.flags &= ~(IA64_THREAD_FPH_VALID | IA64_THREAD_DBG_VALID);
-	ia64_drop_fpu(current);
-}
-
-/*
- * Clean up state associated with a thread.  This is called when
- * the thread calls exit().
- */
-void
-exit_thread (struct task_struct *tsk)
-{
-
-	ia64_drop_fpu(tsk);
-}
-
-unsigned long
-__get_wchan (struct task_struct *p)
-{
-	struct unw_frame_info info;
-	unsigned long ip;
-	int count = 0;
-
-	/*
-	 * Note: p may not be a blocked task (it could be current or
-	 * another process running on some other CPU.  Rather than
-	 * trying to determine if p is really blocked, we just assume
-	 * it's blocked and rely on the unwind routines to fail
-	 * gracefully if the process wasn't really blocked after all.
-	 * --davidm 99/12/15
-	 */
-	unw_init_from_blocked_task(&info, p);
-	do {
-		if (task_is_running(p))
-			return 0;
-		if (unw_unwind(&info) < 0)
-			return 0;
-		unw_get_ip(&info, &ip);
-		if (!in_sched_functions(ip))
-			return ip;
-	} while (count++ < 16);
-	return 0;
-}
-
-void
-cpu_halt (void)
-{
-	pal_power_mgmt_info_u_t power_info[8];
-	unsigned long min_power;
-	int i, min_power_state;
-
-	if (ia64_pal_halt_info(power_info) != 0)
-		return;
-
-	min_power_state = 0;
-	min_power = power_info[0].pal_power_mgmt_info_s.power_consumption;
-	for (i = 1; i < 8; ++i)
-		if (power_info[i].pal_power_mgmt_info_s.im
-		    && power_info[i].pal_power_mgmt_info_s.power_consumption < min_power) {
-			min_power = power_info[i].pal_power_mgmt_info_s.power_consumption;
-			min_power_state = i;
-		}
-
-	while (1)
-		ia64_pal_halt(min_power_state);
-}
-
-void machine_shutdown(void)
-{
-	smp_shutdown_nonboot_cpus(reboot_cpu);
-
-#ifdef CONFIG_KEXEC
-	kexec_disable_iosapic();
-#endif
-}
-
-void
-machine_restart (char *restart_cmd)
-{
-	(void) notify_die(DIE_MACHINE_RESTART, restart_cmd, NULL, 0, 0, 0);
-	efi_reboot(REBOOT_WARM, NULL);
-}
-
-void
-machine_halt (void)
-{
-	(void) notify_die(DIE_MACHINE_HALT, "", NULL, 0, 0, 0);
-	cpu_halt();
-}
-
-void
-machine_power_off (void)
-{
-	do_kernel_power_off();
-	machine_halt();
-}
-
-EXPORT_SYMBOL(ia64_delay_loop);
diff --git a/arch/ia64/kernel/ptrace.c b/arch/ia64/kernel/ptrace.c
deleted file mode 100644
index 4c41912c550f..000000000000
--- a/arch/ia64/kernel/ptrace.c
+++ /dev/null
@@ -1,2012 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * Kernel support for the ptrace() and syscall tracing interfaces.
- *
- * Copyright (C) 1999-2005 Hewlett-Packard Co
- *	David Mosberger-Tang <davidm@hpl.hp.com>
- * Copyright (C) 2006 Intel Co
- *  2006-08-12	- IA64 Native Utrace implementation support added by
- *	Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>
- *
- * Derived from the x86 and Alpha versions.
- */
-#include <linux/kernel.h>
-#include <linux/sched.h>
-#include <linux/sched/task.h>
-#include <linux/sched/task_stack.h>
-#include <linux/mm.h>
-#include <linux/errno.h>
-#include <linux/ptrace.h>
-#include <linux/user.h>
-#include <linux/security.h>
-#include <linux/audit.h>
-#include <linux/signal.h>
-#include <linux/regset.h>
-#include <linux/elf.h>
-#include <linux/resume_user_mode.h>
-
-#include <asm/processor.h>
-#include <asm/ptrace_offsets.h>
-#include <asm/rse.h>
-#include <linux/uaccess.h>
-#include <asm/unwind.h>
-
-#include "entry.h"
-
-/*
- * Bits in the PSR that we allow ptrace() to change:
- *	be, up, ac, mfl, mfh (the user mask; five bits total)
- *	db (debug breakpoint fault; one bit)
- *	id (instruction debug fault disable; one bit)
- *	dd (data debug fault disable; one bit)
- *	ri (restart instruction; two bits)
- *	is (instruction set; one bit)
- */
-#define IPSR_MASK (IA64_PSR_UM | IA64_PSR_DB | IA64_PSR_IS	\
-		   | IA64_PSR_ID | IA64_PSR_DD | IA64_PSR_RI)
-
-#define MASK(nbits)	((1UL << (nbits)) - 1)	/* mask with NBITS bits set */
-#define PFM_MASK	MASK(38)
-
-#define PTRACE_DEBUG	0
-
-#if PTRACE_DEBUG
-# define dprintk(format...)	printk(format)
-# define inline
-#else
-# define dprintk(format...)
-#endif
-
-/* Return TRUE if PT was created due to kernel-entry via a system-call.  */
-
-static inline int
-in_syscall (struct pt_regs *pt)
-{
-	return (long) pt->cr_ifs >= 0;
-}
-
-/*
- * Collect the NaT bits for r1-r31 from scratch_unat and return a NaT
- * bitset where bit i is set iff the NaT bit of register i is set.
- */
-unsigned long
-ia64_get_scratch_nat_bits (struct pt_regs *pt, unsigned long scratch_unat)
-{
-#	define GET_BITS(first, last, unat)				\
-	({								\
-		unsigned long bit = ia64_unat_pos(&pt->r##first);	\
-		unsigned long nbits = (last - first + 1);		\
-		unsigned long mask = MASK(nbits) << first;		\
-		unsigned long dist;					\
-		if (bit < first)					\
-			dist = 64 + bit - first;			\
-		else							\
-			dist = bit - first;				\
-		ia64_rotr(unat, dist) & mask;				\
-	})
-	unsigned long val;
-
-	/*
-	 * Registers that are stored consecutively in struct pt_regs
-	 * can be handled in parallel.  If the register order in
-	 * struct_pt_regs changes, this code MUST be updated.
-	 */
-	val  = GET_BITS( 1,  1, scratch_unat);
-	val |= GET_BITS( 2,  3, scratch_unat);
-	val |= GET_BITS(12, 13, scratch_unat);
-	val |= GET_BITS(14, 14, scratch_unat);
-	val |= GET_BITS(15, 15, scratch_unat);
-	val |= GET_BITS( 8, 11, scratch_unat);
-	val |= GET_BITS(16, 31, scratch_unat);
-	return val;
-
-#	undef GET_BITS
-}
-
-/*
- * Set the NaT bits for the scratch registers according to NAT and
- * return the resulting unat (assuming the scratch registers are
- * stored in PT).
- */
-unsigned long
-ia64_put_scratch_nat_bits (struct pt_regs *pt, unsigned long nat)
-{
-#	define PUT_BITS(first, last, nat)				\
-	({								\
-		unsigned long bit = ia64_unat_pos(&pt->r##first);	\
-		unsigned long nbits = (last - first + 1);		\
-		unsigned long mask = MASK(nbits) << first;		\
-		long dist;						\
-		if (bit < first)					\
-			dist = 64 + bit - first;			\
-		else							\
-			dist = bit - first;				\
-		ia64_rotl(nat & mask, dist);				\
-	})
-	unsigned long scratch_unat;
-
-	/*
-	 * Registers that are stored consecutively in struct pt_regs
-	 * can be handled in parallel.  If the register order in
-	 * struct_pt_regs changes, this code MUST be updated.
-	 */
-	scratch_unat  = PUT_BITS( 1,  1, nat);
-	scratch_unat |= PUT_BITS( 2,  3, nat);
-	scratch_unat |= PUT_BITS(12, 13, nat);
-	scratch_unat |= PUT_BITS(14, 14, nat);
-	scratch_unat |= PUT_BITS(15, 15, nat);
-	scratch_unat |= PUT_BITS( 8, 11, nat);
-	scratch_unat |= PUT_BITS(16, 31, nat);
-
-	return scratch_unat;
-
-#	undef PUT_BITS
-}
-
-#define IA64_MLX_TEMPLATE	0x2
-#define IA64_MOVL_OPCODE	6
-
-void
-ia64_increment_ip (struct pt_regs *regs)
-{
-	unsigned long w0, ri = ia64_psr(regs)->ri + 1;
-
-	if (ri > 2) {
-		ri = 0;
-		regs->cr_iip += 16;
-	} else if (ri == 2) {
-		get_user(w0, (char __user *) regs->cr_iip + 0);
-		if (((w0 >> 1) & 0xf) == IA64_MLX_TEMPLATE) {
-			/*
-			 * rfi'ing to slot 2 of an MLX bundle causes
-			 * an illegal operation fault.  We don't want
-			 * that to happen...
-			 */
-			ri = 0;
-			regs->cr_iip += 16;
-		}
-	}
-	ia64_psr(regs)->ri = ri;
-}
-
-void
-ia64_decrement_ip (struct pt_regs *regs)
-{
-	unsigned long w0, ri = ia64_psr(regs)->ri - 1;
-
-	if (ia64_psr(regs)->ri == 0) {
-		regs->cr_iip -= 16;
-		ri = 2;
-		get_user(w0, (char __user *) regs->cr_iip + 0);
-		if (((w0 >> 1) & 0xf) == IA64_MLX_TEMPLATE) {
-			/*
-			 * rfi'ing to slot 2 of an MLX bundle causes
-			 * an illegal operation fault.  We don't want
-			 * that to happen...
-			 */
-			ri = 1;
-		}
-	}
-	ia64_psr(regs)->ri = ri;
-}
-
-/*
- * This routine is used to read an rnat bits that are stored on the
- * kernel backing store.  Since, in general, the alignment of the user
- * and kernel are different, this is not completely trivial.  In
- * essence, we need to construct the user RNAT based on up to two
- * kernel RNAT values and/or the RNAT value saved in the child's
- * pt_regs.
- *
- * user rbs
- *
- * +--------+ <-- lowest address
- * | slot62 |
- * +--------+
- * |  rnat  | 0x....1f8
- * +--------+
- * | slot00 | \
- * +--------+ |
- * | slot01 | > child_regs->ar_rnat
- * +--------+ |
- * | slot02 | /				kernel rbs
- * +--------+				+--------+
- *	    <- child_regs->ar_bspstore	| slot61 | <-- krbs
- * +- - - - +				+--------+
- *					| slot62 |
- * +- - - - +				+--------+
- *					|  rnat	 |
- * +- - - - +				+--------+
- *   vrnat				| slot00 |
- * +- - - - +				+--------+
- *					=	 =
- *					+--------+
- *					| slot00 | \
- *					+--------+ |
- *					| slot01 | > child_stack->ar_rnat
- *					+--------+ |
- *					| slot02 | /
- *					+--------+
- *						  <--- child_stack->ar_bspstore
- *
- * The way to think of this code is as follows: bit 0 in the user rnat
- * corresponds to some bit N (0 <= N <= 62) in one of the kernel rnat
- * value.  The kernel rnat value holding this bit is stored in
- * variable rnat0.  rnat1 is loaded with the kernel rnat value that
- * form the upper bits of the user rnat value.
- *
- * Boundary cases:
- *
- * o when reading the rnat "below" the first rnat slot on the kernel
- *   backing store, rnat0/rnat1 are set to 0 and the low order bits are
- *   merged in from pt->ar_rnat.
- *
- * o when reading the rnat "above" the last rnat slot on the kernel
- *   backing store, rnat0/rnat1 gets its value from sw->ar_rnat.
- */
-static unsigned long
-get_rnat (struct task_struct *task, struct switch_stack *sw,
-	  unsigned long *krbs, unsigned long *urnat_addr,
-	  unsigned long *urbs_end)
-{
-	unsigned long rnat0 = 0, rnat1 = 0, urnat = 0, *slot0_kaddr;
-	unsigned long umask = 0, mask, m;
-	unsigned long *kbsp, *ubspstore, *rnat0_kaddr, *rnat1_kaddr, shift;
-	long num_regs, nbits;
-	struct pt_regs *pt;
-
-	pt = task_pt_regs(task);
-	kbsp = (unsigned long *) sw->ar_bspstore;
-	ubspstore = (unsigned long *) pt->ar_bspstore;
-
-	if (urbs_end < urnat_addr)
-		nbits = ia64_rse_num_regs(urnat_addr - 63, urbs_end);
-	else
-		nbits = 63;
-	mask = MASK(nbits);
-	/*
-	 * First, figure out which bit number slot 0 in user-land maps
-	 * to in the kernel rnat.  Do this by figuring out how many
-	 * register slots we're beyond the user's backingstore and
-	 * then computing the equivalent address in kernel space.
-	 */
-	num_regs = ia64_rse_num_regs(ubspstore, urnat_addr + 1);
-	slot0_kaddr = ia64_rse_skip_regs(krbs, num_regs);
-	shift = ia64_rse_slot_num(slot0_kaddr);
-	rnat1_kaddr = ia64_rse_rnat_addr(slot0_kaddr);
-	rnat0_kaddr = rnat1_kaddr - 64;
-
-	if (ubspstore + 63 > urnat_addr) {
-		/* some bits need to be merged in from pt->ar_rnat */
-		umask = MASK(ia64_rse_slot_num(ubspstore)) & mask;
-		urnat = (pt->ar_rnat & umask);
-		mask &= ~umask;
-		if (!mask)
-			return urnat;
-	}
-
-	m = mask << shift;
-	if (rnat0_kaddr >= kbsp)
-		rnat0 = sw->ar_rnat;
-	else if (rnat0_kaddr > krbs)
-		rnat0 = *rnat0_kaddr;
-	urnat |= (rnat0 & m) >> shift;
-
-	m = mask >> (63 - shift);
-	if (rnat1_kaddr >= kbsp)
-		rnat1 = sw->ar_rnat;
-	else if (rnat1_kaddr > krbs)
-		rnat1 = *rnat1_kaddr;
-	urnat |= (rnat1 & m) << (63 - shift);
-	return urnat;
-}
-
-/*
- * The reverse of get_rnat.
- */
-static void
-put_rnat (struct task_struct *task, struct switch_stack *sw,
-	  unsigned long *krbs, unsigned long *urnat_addr, unsigned long urnat,
-	  unsigned long *urbs_end)
-{
-	unsigned long rnat0 = 0, rnat1 = 0, *slot0_kaddr, umask = 0, mask, m;
-	unsigned long *kbsp, *ubspstore, *rnat0_kaddr, *rnat1_kaddr, shift;
-	long num_regs, nbits;
-	struct pt_regs *pt;
-	unsigned long cfm, *urbs_kargs;
-
-	pt = task_pt_regs(task);
-	kbsp = (unsigned long *) sw->ar_bspstore;
-	ubspstore = (unsigned long *) pt->ar_bspstore;
-
-	urbs_kargs = urbs_end;
-	if (in_syscall(pt)) {
-		/*
-		 * If entered via syscall, don't allow user to set rnat bits
-		 * for syscall args.
-		 */
-		cfm = pt->cr_ifs;
-		urbs_kargs = ia64_rse_skip_regs(urbs_end, -(cfm & 0x7f));
-	}
-
-	if (urbs_kargs >= urnat_addr)
-		nbits = 63;
-	else {
-		if ((urnat_addr - 63) >= urbs_kargs)
-			return;
-		nbits = ia64_rse_num_regs(urnat_addr - 63, urbs_kargs);
-	}
-	mask = MASK(nbits);
-
-	/*
-	 * First, figure out which bit number slot 0 in user-land maps
-	 * to in the kernel rnat.  Do this by figuring out how many
-	 * register slots we're beyond the user's backingstore and
-	 * then computing the equivalent address in kernel space.
-	 */
-	num_regs = ia64_rse_num_regs(ubspstore, urnat_addr + 1);
-	slot0_kaddr = ia64_rse_skip_regs(krbs, num_regs);
-	shift = ia64_rse_slot_num(slot0_kaddr);
-	rnat1_kaddr = ia64_rse_rnat_addr(slot0_kaddr);
-	rnat0_kaddr = rnat1_kaddr - 64;
-
-	if (ubspstore + 63 > urnat_addr) {
-		/* some bits need to be place in pt->ar_rnat: */
-		umask = MASK(ia64_rse_slot_num(ubspstore)) & mask;
-		pt->ar_rnat = (pt->ar_rnat & ~umask) | (urnat & umask);
-		mask &= ~umask;
-		if (!mask)
-			return;
-	}
-	/*
-	 * Note: Section 11.1 of the EAS guarantees that bit 63 of an
-	 * rnat slot is ignored. so we don't have to clear it here.
-	 */
-	rnat0 = (urnat << shift);
-	m = mask << shift;
-	if (rnat0_kaddr >= kbsp)
-		sw->ar_rnat = (sw->ar_rnat & ~m) | (rnat0 & m);
-	else if (rnat0_kaddr > krbs)
-		*rnat0_kaddr = ((*rnat0_kaddr & ~m) | (rnat0 & m));
-
-	rnat1 = (urnat >> (63 - shift));
-	m = mask >> (63 - shift);
-	if (rnat1_kaddr >= kbsp)
-		sw->ar_rnat = (sw->ar_rnat & ~m) | (rnat1 & m);
-	else if (rnat1_kaddr > krbs)
-		*rnat1_kaddr = ((*rnat1_kaddr & ~m) | (rnat1 & m));
-}
-
-static inline int
-on_kernel_rbs (unsigned long addr, unsigned long bspstore,
-	       unsigned long urbs_end)
-{
-	unsigned long *rnat_addr = ia64_rse_rnat_addr((unsigned long *)
-						      urbs_end);
-	return (addr >= bspstore && addr <= (unsigned long) rnat_addr);
-}
-
-/*
- * Read a word from the user-level backing store of task CHILD.  ADDR
- * is the user-level address to read the word from, VAL a pointer to
- * the return value, and USER_BSP gives the end of the user-level
- * backing store (i.e., it's the address that would be in ar.bsp after
- * the user executed a "cover" instruction).
- *
- * This routine takes care of accessing the kernel register backing
- * store for those registers that got spilled there.  It also takes
- * care of calculating the appropriate RNaT collection words.
- */
-long
-ia64_peek (struct task_struct *child, struct switch_stack *child_stack,
-	   unsigned long user_rbs_end, unsigned long addr, long *val)
-{
-	unsigned long *bspstore, *krbs, regnum, *laddr, *urbs_end, *rnat_addr;
-	struct pt_regs *child_regs;
-	size_t copied;
-	long ret;
-
-	urbs_end = (long *) user_rbs_end;
-	laddr = (unsigned long *) addr;
-	child_regs = task_pt_regs(child);
-	bspstore = (unsigned long *) child_regs->ar_bspstore;
-	krbs = (unsigned long *) child + IA64_RBS_OFFSET/8;
-	if (on_kernel_rbs(addr, (unsigned long) bspstore,
-			  (unsigned long) urbs_end))
-	{
-		/*
-		 * Attempt to read the RBS in an area that's actually
-		 * on the kernel RBS => read the corresponding bits in
-		 * the kernel RBS.
-		 */
-		rnat_addr = ia64_rse_rnat_addr(laddr);
-		ret = get_rnat(child, child_stack, krbs, rnat_addr, urbs_end);
-
-		if (laddr == rnat_addr) {
-			/* return NaT collection word itself */
-			*val = ret;
-			return 0;
-		}
-
-		if (((1UL << ia64_rse_slot_num(laddr)) & ret) != 0) {
-			/*
-			 * It is implementation dependent whether the
-			 * data portion of a NaT value gets saved on a
-			 * st8.spill or RSE spill (e.g., see EAS 2.6,
-			 * 4.4.4.6 Register Spill and Fill).  To get
-			 * consistent behavior across all possible
-			 * IA-64 implementations, we return zero in
-			 * this case.
-			 */
-			*val = 0;
-			return 0;
-		}
-
-		if (laddr < urbs_end) {
-			/*
-			 * The desired word is on the kernel RBS and
-			 * is not a NaT.
-			 */
-			regnum = ia64_rse_num_regs(bspstore, laddr);
-			*val = *ia64_rse_skip_regs(krbs, regnum);
-			return 0;
-		}
-	}
-	copied = access_process_vm(child, addr, &ret, sizeof(ret), FOLL_FORCE);
-	if (copied != sizeof(ret))
-		return -EIO;
-	*val = ret;
-	return 0;
-}
-
-long
-ia64_poke (struct task_struct *child, struct switch_stack *child_stack,
-	   unsigned long user_rbs_end, unsigned long addr, long val)
-{
-	unsigned long *bspstore, *krbs, regnum, *laddr;
-	unsigned long *urbs_end = (long *) user_rbs_end;
-	struct pt_regs *child_regs;
-
-	laddr = (unsigned long *) addr;
-	child_regs = task_pt_regs(child);
-	bspstore = (unsigned long *) child_regs->ar_bspstore;
-	krbs = (unsigned long *) child + IA64_RBS_OFFSET/8;
-	if (on_kernel_rbs(addr, (unsigned long) bspstore,
-			  (unsigned long) urbs_end))
-	{
-		/*
-		 * Attempt to write the RBS in an area that's actually
-		 * on the kernel RBS => write the corresponding bits
-		 * in the kernel RBS.
-		 */
-		if (ia64_rse_is_rnat_slot(laddr))
-			put_rnat(child, child_stack, krbs, laddr, val,
-				 urbs_end);
-		else {
-			if (laddr < urbs_end) {
-				regnum = ia64_rse_num_regs(bspstore, laddr);
-				*ia64_rse_skip_regs(krbs, regnum) = val;
-			}
-		}
-	} else if (access_process_vm(child, addr, &val, sizeof(val),
-				FOLL_FORCE | FOLL_WRITE)
-		   != sizeof(val))
-		return -EIO;
-	return 0;
-}
-
-/*
- * Calculate the address of the end of the user-level register backing
- * store.  This is the address that would have been stored in ar.bsp
- * if the user had executed a "cover" instruction right before
- * entering the kernel.  If CFMP is not NULL, it is used to return the
- * "current frame mask" that was active at the time the kernel was
- * entered.
- */
-unsigned long
-ia64_get_user_rbs_end (struct task_struct *child, struct pt_regs *pt,
-		       unsigned long *cfmp)
-{
-	unsigned long *krbs, *bspstore, cfm = pt->cr_ifs;
-	long ndirty;
-
-	krbs = (unsigned long *) child + IA64_RBS_OFFSET/8;
-	bspstore = (unsigned long *) pt->ar_bspstore;
-	ndirty = ia64_rse_num_regs(krbs, krbs + (pt->loadrs >> 19));
-
-	if (in_syscall(pt))
-		ndirty += (cfm & 0x7f);
-	else
-		cfm &= ~(1UL << 63);	/* clear valid bit */
-
-	if (cfmp)
-		*cfmp = cfm;
-	return (unsigned long) ia64_rse_skip_regs(bspstore, ndirty);
-}
-
-/*
- * Synchronize (i.e, write) the RSE backing store living in kernel
- * space to the VM of the CHILD task.  SW and PT are the pointers to
- * the switch_stack and pt_regs structures, respectively.
- * USER_RBS_END is the user-level address at which the backing store
- * ends.
- */
-long
-ia64_sync_user_rbs (struct task_struct *child, struct switch_stack *sw,
-		    unsigned long user_rbs_start, unsigned long user_rbs_end)
-{
-	unsigned long addr, val;
-	long ret;
-
-	/* now copy word for word from kernel rbs to user rbs: */
-	for (addr = user_rbs_start; addr < user_rbs_end; addr += 8) {
-		ret = ia64_peek(child, sw, user_rbs_end, addr, &val);
-		if (ret < 0)
-			return ret;
-		if (access_process_vm(child, addr, &val, sizeof(val),
-				FOLL_FORCE | FOLL_WRITE)
-		    != sizeof(val))
-			return -EIO;
-	}
-	return 0;
-}
-
-static long
-ia64_sync_kernel_rbs (struct task_struct *child, struct switch_stack *sw,
-		unsigned long user_rbs_start, unsigned long user_rbs_end)
-{
-	unsigned long addr, val;
-	long ret;
-
-	/* now copy word for word from user rbs to kernel rbs: */
-	for (addr = user_rbs_start; addr < user_rbs_end; addr += 8) {
-		if (access_process_vm(child, addr, &val, sizeof(val),
-				FOLL_FORCE)
-				!= sizeof(val))
-			return -EIO;
-
-		ret = ia64_poke(child, sw, user_rbs_end, addr, val);
-		if (ret < 0)
-			return ret;
-	}
-	return 0;
-}
-
-typedef long (*syncfunc_t)(struct task_struct *, struct switch_stack *,
-			    unsigned long, unsigned long);
-
-static void do_sync_rbs(struct unw_frame_info *info, void *arg)
-{
-	struct pt_regs *pt;
-	unsigned long urbs_end;
-	syncfunc_t fn = arg;
-
-	if (unw_unwind_to_user(info) < 0)
-		return;
-	pt = task_pt_regs(info->task);
-	urbs_end = ia64_get_user_rbs_end(info->task, pt, NULL);
-
-	fn(info->task, info->sw, pt->ar_bspstore, urbs_end);
-}
-
-/*
- * when a thread is stopped (ptraced), debugger might change thread's user
- * stack (change memory directly), and we must avoid the RSE stored in kernel
- * to override user stack (user space's RSE is newer than kernel's in the
- * case). To workaround the issue, we copy kernel RSE to user RSE before the
- * task is stopped, so user RSE has updated data.  we then copy user RSE to
- * kernel after the task is resummed from traced stop and kernel will use the
- * newer RSE to return to user. TIF_RESTORE_RSE is the flag to indicate we need
- * synchronize user RSE to kernel.
- */
-void ia64_ptrace_stop(void)
-{
-	if (test_and_set_tsk_thread_flag(current, TIF_RESTORE_RSE))
-		return;
-	set_notify_resume(current);
-	unw_init_running(do_sync_rbs, ia64_sync_user_rbs);
-}
-
-/*
- * This is called to read back the register backing store.
- */
-void ia64_sync_krbs(void)
-{
-	clear_tsk_thread_flag(current, TIF_RESTORE_RSE);
-
-	unw_init_running(do_sync_rbs, ia64_sync_kernel_rbs);
-}
-
-/*
- * Write f32-f127 back to task->thread.fph if it has been modified.
- */
-inline void
-ia64_flush_fph (struct task_struct *task)
-{
-	struct ia64_psr *psr = ia64_psr(task_pt_regs(task));
-
-	/*
-	 * Prevent migrating this task while
-	 * we're fiddling with the FPU state
-	 */
-	preempt_disable();
-	if (ia64_is_local_fpu_owner(task) && psr->mfh) {
-		psr->mfh = 0;
-		task->thread.flags |= IA64_THREAD_FPH_VALID;
-		ia64_save_fpu(&task->thread.fph[0]);
-	}
-	preempt_enable();
-}
-
-/*
- * Sync the fph state of the task so that it can be manipulated
- * through thread.fph.  If necessary, f32-f127 are written back to
- * thread.fph or, if the fph state hasn't been used before, thread.fph
- * is cleared to zeroes.  Also, access to f32-f127 is disabled to
- * ensure that the task picks up the state from thread.fph when it
- * executes again.
- */
-void
-ia64_sync_fph (struct task_struct *task)
-{
-	struct ia64_psr *psr = ia64_psr(task_pt_regs(task));
-
-	ia64_flush_fph(task);
-	if (!(task->thread.flags & IA64_THREAD_FPH_VALID)) {
-		task->thread.flags |= IA64_THREAD_FPH_VALID;
-		memset(&task->thread.fph, 0, sizeof(task->thread.fph));
-	}
-	ia64_drop_fpu(task);
-	psr->dfh = 1;
-}
-
-/*
- * Change the machine-state of CHILD such that it will return via the normal
- * kernel exit-path, rather than the syscall-exit path.
- */
-static void
-convert_to_non_syscall (struct task_struct *child, struct pt_regs  *pt,
-			unsigned long cfm)
-{
-	struct unw_frame_info info, prev_info;
-	unsigned long ip, sp, pr;
-
-	unw_init_from_blocked_task(&info, child);
-	while (1) {
-		prev_info = info;
-		if (unw_unwind(&info) < 0)
-			return;
-
-		unw_get_sp(&info, &sp);
-		if ((long)((unsigned long)child + IA64_STK_OFFSET - sp)
-		    < IA64_PT_REGS_SIZE) {
-			dprintk("ptrace.%s: ran off the top of the kernel "
-				"stack\n", __func__);
-			return;
-		}
-		if (unw_get_pr (&prev_info, &pr) < 0) {
-			unw_get_rp(&prev_info, &ip);
-			dprintk("ptrace.%s: failed to read "
-				"predicate register (ip=0x%lx)\n",
-				__func__, ip);
-			return;
-		}
-		if (unw_is_intr_frame(&info)
-		    && (pr & (1UL << PRED_USER_STACK)))
-			break;
-	}
-
-	/*
-	 * Note: at the time of this call, the target task is blocked
-	 * in notify_resume_user() and by clearling PRED_LEAVE_SYSCALL
-	 * (aka, "pLvSys") we redirect execution from
-	 * .work_pending_syscall_end to .work_processed_kernel.
-	 */
-	unw_get_pr(&prev_info, &pr);
-	pr &= ~((1UL << PRED_SYSCALL) | (1UL << PRED_LEAVE_SYSCALL));
-	pr |=  (1UL << PRED_NON_SYSCALL);
-	unw_set_pr(&prev_info, pr);
-
-	pt->cr_ifs = (1UL << 63) | cfm;
-	/*
-	 * Clear the memory that is NOT written on syscall-entry to
-	 * ensure we do not leak kernel-state to user when execution
-	 * resumes.
-	 */
-	pt->r2 = 0;
-	pt->r3 = 0;
-	pt->r14 = 0;
-	memset(&pt->r16, 0, 16*8);	/* clear r16-r31 */
-	memset(&pt->f6, 0, 6*16);	/* clear f6-f11 */
-	pt->b7 = 0;
-	pt->ar_ccv = 0;
-	pt->ar_csd = 0;
-	pt->ar_ssd = 0;
-}
-
-static int
-access_nat_bits (struct task_struct *child, struct pt_regs *pt,
-		 struct unw_frame_info *info,
-		 unsigned long *data, int write_access)
-{
-	unsigned long regnum, nat_bits, scratch_unat, dummy = 0;
-	char nat = 0;
-
-	if (write_access) {
-		nat_bits = *data;
-		scratch_unat = ia64_put_scratch_nat_bits(pt, nat_bits);
-		if (unw_set_ar(info, UNW_AR_UNAT, scratch_unat) < 0) {
-			dprintk("ptrace: failed to set ar.unat\n");
-			return -1;
-		}
-		for (regnum = 4; regnum <= 7; ++regnum) {
-			unw_get_gr(info, regnum, &dummy, &nat);
-			unw_set_gr(info, regnum, dummy,
-				   (nat_bits >> regnum) & 1);
-		}
-	} else {
-		if (unw_get_ar(info, UNW_AR_UNAT, &scratch_unat) < 0) {
-			dprintk("ptrace: failed to read ar.unat\n");
-			return -1;
-		}
-		nat_bits = ia64_get_scratch_nat_bits(pt, scratch_unat);
-		for (regnum = 4; regnum <= 7; ++regnum) {
-			unw_get_gr(info, regnum, &dummy, &nat);
-			nat_bits |= (nat != 0) << regnum;
-		}
-		*data = nat_bits;
-	}
-	return 0;
-}
-
-static int
-access_elf_reg(struct task_struct *target, struct unw_frame_info *info,
-		unsigned long addr, unsigned long *data, int write_access);
-
-static long
-ptrace_getregs (struct task_struct *child, struct pt_all_user_regs __user *ppr)
-{
-	unsigned long psr, ec, lc, rnat, bsp, cfm, nat_bits, val;
-	struct unw_frame_info info;
-	struct ia64_fpreg fpval;
-	struct switch_stack *sw;
-	struct pt_regs *pt;
-	long ret, retval = 0;
-	char nat = 0;
-	int i;
-
-	if (!access_ok(ppr, sizeof(struct pt_all_user_regs)))
-		return -EIO;
-
-	pt = task_pt_regs(child);
-	sw = (struct switch_stack *) (child->thread.ksp + 16);
-	unw_init_from_blocked_task(&info, child);
-	if (unw_unwind_to_user(&info) < 0) {
-		return -EIO;
-	}
-
-	if (((unsigned long) ppr & 0x7) != 0) {
-		dprintk("ptrace:unaligned register address %p\n", ppr);
-		return -EIO;
-	}
-
-	if (access_elf_reg(child, &info, ELF_CR_IPSR_OFFSET, &psr, 0) < 0 ||
-	    access_elf_reg(child, &info, ELF_AR_EC_OFFSET, &ec, 0) < 0 ||
-	    access_elf_reg(child, &info, ELF_AR_LC_OFFSET, &lc, 0) < 0 ||
-	    access_elf_reg(child, &info, ELF_AR_RNAT_OFFSET, &rnat, 0) < 0 ||
-	    access_elf_reg(child, &info, ELF_AR_BSP_OFFSET, &bsp, 0) < 0 ||
-	    access_elf_reg(child, &info, ELF_CFM_OFFSET, &cfm, 0) < 0 ||
-	    access_elf_reg(child, &info, ELF_NAT_OFFSET, &nat_bits, 0) < 0)
-		return -EIO;
-
-	/* control regs */
-
-	retval |= __put_user(pt->cr_iip, &ppr->cr_iip);
-	retval |= __put_user(psr, &ppr->cr_ipsr);
-
-	/* app regs */
-
-	retval |= __put_user(pt->ar_pfs, &ppr->ar[PT_AUR_PFS]);
-	retval |= __put_user(pt->ar_rsc, &ppr->ar[PT_AUR_RSC]);
-	retval |= __put_user(pt->ar_bspstore, &ppr->ar[PT_AUR_BSPSTORE]);
-	retval |= __put_user(pt->ar_unat, &ppr->ar[PT_AUR_UNAT]);
-	retval |= __put_user(pt->ar_ccv, &ppr->ar[PT_AUR_CCV]);
-	retval |= __put_user(pt->ar_fpsr, &ppr->ar[PT_AUR_FPSR]);
-
-	retval |= __put_user(ec, &ppr->ar[PT_AUR_EC]);
-	retval |= __put_user(lc, &ppr->ar[PT_AUR_LC]);
-	retval |= __put_user(rnat, &ppr->ar[PT_AUR_RNAT]);
-	retval |= __put_user(bsp, &ppr->ar[PT_AUR_BSP]);
-	retval |= __put_user(cfm, &ppr->cfm);
-
-	/* gr1-gr3 */
-
-	retval |= __copy_to_user(&ppr->gr[1], &pt->r1, sizeof(long));
-	retval |= __copy_to_user(&ppr->gr[2], &pt->r2, sizeof(long) *2);
-
-	/* gr4-gr7 */
-
-	for (i = 4; i < 8; i++) {
-		if (unw_access_gr(&info, i, &val, &nat, 0) < 0)
-			return -EIO;
-		retval |= __put_user(val, &ppr->gr[i]);
-	}
-
-	/* gr8-gr11 */
-
-	retval |= __copy_to_user(&ppr->gr[8], &pt->r8, sizeof(long) * 4);
-
-	/* gr12-gr15 */
-
-	retval |= __copy_to_user(&ppr->gr[12], &pt->r12, sizeof(long) * 2);
-	retval |= __copy_to_user(&ppr->gr[14], &pt->r14, sizeof(long));
-	retval |= __copy_to_user(&ppr->gr[15], &pt->r15, sizeof(long));
-
-	/* gr16-gr31 */
-
-	retval |= __copy_to_user(&ppr->gr[16], &pt->r16, sizeof(long) * 16);
-
-	/* b0 */
-
-	retval |= __put_user(pt->b0, &ppr->br[0]);
-
-	/* b1-b5 */
-
-	for (i = 1; i < 6; i++) {
-		if (unw_access_br(&info, i, &val, 0) < 0)
-			return -EIO;
-		__put_user(val, &ppr->br[i]);
-	}
-
-	/* b6-b7 */
-
-	retval |= __put_user(pt->b6, &ppr->br[6]);
-	retval |= __put_user(pt->b7, &ppr->br[7]);
-
-	/* fr2-fr5 */
-
-	for (i = 2; i < 6; i++) {
-		if (unw_get_fr(&info, i, &fpval) < 0)
-			return -EIO;
-		retval |= __copy_to_user(&ppr->fr[i], &fpval, sizeof (fpval));
-	}
-
-	/* fr6-fr11 */
-
-	retval |= __copy_to_user(&ppr->fr[6], &pt->f6,
-				 sizeof(struct ia64_fpreg) * 6);
-
-	/* fp scratch regs(12-15) */
-
-	retval |= __copy_to_user(&ppr->fr[12], &sw->f12,
-				 sizeof(struct ia64_fpreg) * 4);
-
-	/* fr16-fr31 */
-
-	for (i = 16; i < 32; i++) {
-		if (unw_get_fr(&info, i, &fpval) < 0)
-			return -EIO;
-		retval |= __copy_to_user(&ppr->fr[i], &fpval, sizeof (fpval));
-	}
-
-	/* fph */
-
-	ia64_flush_fph(child);
-	retval |= __copy_to_user(&ppr->fr[32], &child->thread.fph,
-				 sizeof(ppr->fr[32]) * 96);
-
-	/*  preds */
-
-	retval |= __put_user(pt->pr, &ppr->pr);
-
-	/* nat bits */
-
-	retval |= __put_user(nat_bits, &ppr->nat);
-
-	ret = retval ? -EIO : 0;
-	return ret;
-}
-
-static long
-ptrace_setregs (struct task_struct *child, struct pt_all_user_regs __user *ppr)
-{
-	unsigned long psr, rsc, ec, lc, rnat, bsp, cfm, nat_bits, val = 0;
-	struct unw_frame_info info;
-	struct switch_stack *sw;
-	struct ia64_fpreg fpval;
-	struct pt_regs *pt;
-	long retval = 0;
-	int i;
-
-	memset(&fpval, 0, sizeof(fpval));
-
-	if (!access_ok(ppr, sizeof(struct pt_all_user_regs)))
-		return -EIO;
-
-	pt = task_pt_regs(child);
-	sw = (struct switch_stack *) (child->thread.ksp + 16);
-	unw_init_from_blocked_task(&info, child);
-	if (unw_unwind_to_user(&info) < 0) {
-		return -EIO;
-	}
-
-	if (((unsigned long) ppr & 0x7) != 0) {
-		dprintk("ptrace:unaligned register address %p\n", ppr);
-		return -EIO;
-	}
-
-	/* control regs */
-
-	retval |= __get_user(pt->cr_iip, &ppr->cr_iip);
-	retval |= __get_user(psr, &ppr->cr_ipsr);
-
-	/* app regs */
-
-	retval |= __get_user(pt->ar_pfs, &ppr->ar[PT_AUR_PFS]);
-	retval |= __get_user(rsc, &ppr->ar[PT_AUR_RSC]);
-	retval |= __get_user(pt->ar_bspstore, &ppr->ar[PT_AUR_BSPSTORE]);
-	retval |= __get_user(pt->ar_unat, &ppr->ar[PT_AUR_UNAT]);
-	retval |= __get_user(pt->ar_ccv, &ppr->ar[PT_AUR_CCV]);
-	retval |= __get_user(pt->ar_fpsr, &ppr->ar[PT_AUR_FPSR]);
-
-	retval |= __get_user(ec, &ppr->ar[PT_AUR_EC]);
-	retval |= __get_user(lc, &ppr->ar[PT_AUR_LC]);
-	retval |= __get_user(rnat, &ppr->ar[PT_AUR_RNAT]);
-	retval |= __get_user(bsp, &ppr->ar[PT_AUR_BSP]);
-	retval |= __get_user(cfm, &ppr->cfm);
-
-	/* gr1-gr3 */
-
-	retval |= __copy_from_user(&pt->r1, &ppr->gr[1], sizeof(long));
-	retval |= __copy_from_user(&pt->r2, &ppr->gr[2], sizeof(long) * 2);
-
-	/* gr4-gr7 */
-
-	for (i = 4; i < 8; i++) {
-		retval |= __get_user(val, &ppr->gr[i]);
-		/* NaT bit will be set via PT_NAT_BITS: */
-		if (unw_set_gr(&info, i, val, 0) < 0)
-			return -EIO;
-	}
-
-	/* gr8-gr11 */
-
-	retval |= __copy_from_user(&pt->r8, &ppr->gr[8], sizeof(long) * 4);
-
-	/* gr12-gr15 */
-
-	retval |= __copy_from_user(&pt->r12, &ppr->gr[12], sizeof(long) * 2);
-	retval |= __copy_from_user(&pt->r14, &ppr->gr[14], sizeof(long));
-	retval |= __copy_from_user(&pt->r15, &ppr->gr[15], sizeof(long));
-
-	/* gr16-gr31 */
-
-	retval |= __copy_from_user(&pt->r16, &ppr->gr[16], sizeof(long) * 16);
-
-	/* b0 */
-
-	retval |= __get_user(pt->b0, &ppr->br[0]);
-
-	/* b1-b5 */
-
-	for (i = 1; i < 6; i++) {
-		retval |= __get_user(val, &ppr->br[i]);
-		unw_set_br(&info, i, val);
-	}
-
-	/* b6-b7 */
-
-	retval |= __get_user(pt->b6, &ppr->br[6]);
-	retval |= __get_user(pt->b7, &ppr->br[7]);
-
-	/* fr2-fr5 */
-
-	for (i = 2; i < 6; i++) {
-		retval |= __copy_from_user(&fpval, &ppr->fr[i], sizeof(fpval));
-		if (unw_set_fr(&info, i, fpval) < 0)
-			return -EIO;
-	}
-
-	/* fr6-fr11 */
-
-	retval |= __copy_from_user(&pt->f6, &ppr->fr[6],
-				   sizeof(ppr->fr[6]) * 6);
-
-	/* fp scratch regs(12-15) */
-
-	retval |= __copy_from_user(&sw->f12, &ppr->fr[12],
-				   sizeof(ppr->fr[12]) * 4);
-
-	/* fr16-fr31 */
-
-	for (i = 16; i < 32; i++) {
-		retval |= __copy_from_user(&fpval, &ppr->fr[i],
-					   sizeof(fpval));
-		if (unw_set_fr(&info, i, fpval) < 0)
-			return -EIO;
-	}
-
-	/* fph */
-
-	ia64_sync_fph(child);
-	retval |= __copy_from_user(&child->thread.fph, &ppr->fr[32],
-				   sizeof(ppr->fr[32]) * 96);
-
-	/* preds */
-
-	retval |= __get_user(pt->pr, &ppr->pr);
-
-	/* nat bits */
-
-	retval |= __get_user(nat_bits, &ppr->nat);
-
-	retval |= access_elf_reg(child, &info, ELF_CR_IPSR_OFFSET, &psr, 1);
-	retval |= access_elf_reg(child, &info, ELF_AR_RSC_OFFSET, &rsc, 1);
-	retval |= access_elf_reg(child, &info, ELF_AR_EC_OFFSET, &ec, 1);
-	retval |= access_elf_reg(child, &info, ELF_AR_LC_OFFSET, &lc, 1);
-	retval |= access_elf_reg(child, &info, ELF_AR_RNAT_OFFSET, &rnat, 1);
-	retval |= access_elf_reg(child, &info, ELF_AR_BSP_OFFSET, &bsp, 1);
-	retval |= access_elf_reg(child, &info, ELF_CFM_OFFSET, &cfm, 1);
-	retval |= access_elf_reg(child, &info, ELF_NAT_OFFSET, &nat_bits, 1);
-
-	return retval ? -EIO : 0;
-}
-
-void
-user_enable_single_step (struct task_struct *child)
-{
-	struct ia64_psr *child_psr = ia64_psr(task_pt_regs(child));
-
-	set_tsk_thread_flag(child, TIF_SINGLESTEP);
-	child_psr->ss = 1;
-}
-
-void
-user_enable_block_step (struct task_struct *child)
-{
-	struct ia64_psr *child_psr = ia64_psr(task_pt_regs(child));
-
-	set_tsk_thread_flag(child, TIF_SINGLESTEP);
-	child_psr->tb = 1;
-}
-
-void
-user_disable_single_step (struct task_struct *child)
-{
-	struct ia64_psr *child_psr = ia64_psr(task_pt_regs(child));
-
-	/* make sure the single step/taken-branch trap bits are not set: */
-	clear_tsk_thread_flag(child, TIF_SINGLESTEP);
-	child_psr->ss = 0;
-	child_psr->tb = 0;
-}
-
-/*
- * Called by kernel/ptrace.c when detaching..
- *
- * Make sure the single step bit is not set.
- */
-void
-ptrace_disable (struct task_struct *child)
-{
-	user_disable_single_step(child);
-}
-
-static int
-access_uarea (struct task_struct *child, unsigned long addr,
-	      unsigned long *data, int write_access);
-
-long
-arch_ptrace (struct task_struct *child, long request,
-	     unsigned long addr, unsigned long data)
-{
-	switch (request) {
-	case PTRACE_PEEKTEXT:
-	case PTRACE_PEEKDATA:
-		/* read word at location addr */
-		if (ptrace_access_vm(child, addr, &data, sizeof(data),
-				FOLL_FORCE)
-		    != sizeof(data))
-			return -EIO;
-		/* ensure return value is not mistaken for error code */
-		force_successful_syscall_return();
-		return data;
-
-	/* PTRACE_POKETEXT and PTRACE_POKEDATA is handled
-	 * by the generic ptrace_request().
-	 */
-
-	case PTRACE_PEEKUSR:
-		/* read the word at addr in the USER area */
-		if (access_uarea(child, addr, &data, 0) < 0)
-			return -EIO;
-		/* ensure return value is not mistaken for error code */
-		force_successful_syscall_return();
-		return data;
-
-	case PTRACE_POKEUSR:
-		/* write the word at addr in the USER area */
-		if (access_uarea(child, addr, &data, 1) < 0)
-			return -EIO;
-		return 0;
-
-	case PTRACE_OLD_GETSIGINFO:
-		/* for backwards-compatibility */
-		return ptrace_request(child, PTRACE_GETSIGINFO, addr, data);
-
-	case PTRACE_OLD_SETSIGINFO:
-		/* for backwards-compatibility */
-		return ptrace_request(child, PTRACE_SETSIGINFO, addr, data);
-
-	case PTRACE_GETREGS:
-		return ptrace_getregs(child,
-				      (struct pt_all_user_regs __user *) data);
-
-	case PTRACE_SETREGS:
-		return ptrace_setregs(child,
-				      (struct pt_all_user_regs __user *) data);
-
-	default:
-		return ptrace_request(child, request, addr, data);
-	}
-}
-
-
-/* "asmlinkage" so the input arguments are preserved... */
-
-asmlinkage long
-syscall_trace_enter (long arg0, long arg1, long arg2, long arg3,
-		     long arg4, long arg5, long arg6, long arg7,
-		     struct pt_regs regs)
-{
-	if (test_thread_flag(TIF_SYSCALL_TRACE))
-		if (ptrace_report_syscall_entry(&regs))
-			return -ENOSYS;
-
-	/* copy user rbs to kernel rbs */
-	if (test_thread_flag(TIF_RESTORE_RSE))
-		ia64_sync_krbs();
-
-
-	audit_syscall_entry(regs.r15, arg0, arg1, arg2, arg3);
-
-	return 0;
-}
-
-/* "asmlinkage" so the input arguments are preserved... */
-
-asmlinkage void
-syscall_trace_leave (long arg0, long arg1, long arg2, long arg3,
-		     long arg4, long arg5, long arg6, long arg7,
-		     struct pt_regs regs)
-{
-	int step;
-
-	audit_syscall_exit(&regs);
-
-	step = test_thread_flag(TIF_SINGLESTEP);
-	if (step || test_thread_flag(TIF_SYSCALL_TRACE))
-		ptrace_report_syscall_exit(&regs, step);
-
-	/* copy user rbs to kernel rbs */
-	if (test_thread_flag(TIF_RESTORE_RSE))
-		ia64_sync_krbs();
-}
-
-/* Utrace implementation starts here */
-struct regset_get {
-	void *kbuf;
-	void __user *ubuf;
-};
-
-struct regset_set {
-	const void *kbuf;
-	const void __user *ubuf;
-};
-
-struct regset_getset {
-	struct task_struct *target;
-	const struct user_regset *regset;
-	union {
-		struct regset_get get;
-		struct regset_set set;
-	} u;
-	unsigned int pos;
-	unsigned int count;
-	int ret;
-};
-
-static const ptrdiff_t pt_offsets[32] =
-{
-#define R(n) offsetof(struct pt_regs, r##n)
-	[0] = -1, R(1), R(2), R(3),
-	[4] = -1, [5] = -1, [6] = -1, [7] = -1,
-	R(8), R(9), R(10), R(11), R(12), R(13), R(14), R(15),
-	R(16), R(17), R(18), R(19), R(20), R(21), R(22), R(23),
-	R(24), R(25), R(26), R(27), R(28), R(29), R(30), R(31),
-#undef R
-};
-
-static int
-access_elf_gpreg(struct task_struct *target, struct unw_frame_info *info,
-		unsigned long addr, unsigned long *data, int write_access)
-{
-	struct pt_regs *pt = task_pt_regs(target);
-	unsigned reg = addr / sizeof(unsigned long);
-	ptrdiff_t d = pt_offsets[reg];
-
-	if (d >= 0) {
-		unsigned long *ptr = (void *)pt + d;
-		if (write_access)
-			*ptr = *data;
-		else
-			*data = *ptr;
-		return 0;
-	} else {
-		char nat = 0;
-		if (write_access) {
-			/* read NaT bit first: */
-			unsigned long dummy;
-			int ret = unw_get_gr(info, reg, &dummy, &nat);
-			if (ret < 0)
-				return ret;
-		}
-		return unw_access_gr(info, reg, data, &nat, write_access);
-	}
-}
-
-static int
-access_elf_breg(struct task_struct *target, struct unw_frame_info *info,
-		unsigned long addr, unsigned long *data, int write_access)
-{
-	struct pt_regs *pt;
-	unsigned long *ptr = NULL;
-
-	pt = task_pt_regs(target);
-	switch (addr) {
-	case ELF_BR_OFFSET(0):
-		ptr = &pt->b0;
-		break;
-	case ELF_BR_OFFSET(1) ... ELF_BR_OFFSET(5):
-		return unw_access_br(info, (addr - ELF_BR_OFFSET(0))/8,
-				     data, write_access);
-	case ELF_BR_OFFSET(6):
-		ptr = &pt->b6;
-		break;
-	case ELF_BR_OFFSET(7):
-		ptr = &pt->b7;
-	}
-	if (write_access)
-		*ptr = *data;
-	else
-		*data = *ptr;
-	return 0;
-}
-
-static int
-access_elf_areg(struct task_struct *target, struct unw_frame_info *info,
-		unsigned long addr, unsigned long *data, int write_access)
-{
-	struct pt_regs *pt;
-	unsigned long cfm, urbs_end;
-	unsigned long *ptr = NULL;
-
-	pt = task_pt_regs(target);
-	if (addr >= ELF_AR_RSC_OFFSET && addr <= ELF_AR_SSD_OFFSET) {
-		switch (addr) {
-		case ELF_AR_RSC_OFFSET:
-			/* force PL3 */
-			if (write_access)
-				pt->ar_rsc = *data | (3 << 2);
-			else
-				*data = pt->ar_rsc;
-			return 0;
-		case ELF_AR_BSP_OFFSET:
-			/*
-			 * By convention, we use PT_AR_BSP to refer to
-			 * the end of the user-level backing store.
-			 * Use ia64_rse_skip_regs(PT_AR_BSP, -CFM.sof)
-			 * to get the real value of ar.bsp at the time
-			 * the kernel was entered.
-			 *
-			 * Furthermore, when changing the contents of
-			 * PT_AR_BSP (or PT_CFM) while the task is
-			 * blocked in a system call, convert the state
-			 * so that the non-system-call exit
-			 * path is used.  This ensures that the proper
-			 * state will be picked up when resuming
-			 * execution.  However, it *also* means that
-			 * once we write PT_AR_BSP/PT_CFM, it won't be
-			 * possible to modify the syscall arguments of
-			 * the pending system call any longer.  This
-			 * shouldn't be an issue because modifying
-			 * PT_AR_BSP/PT_CFM generally implies that
-			 * we're either abandoning the pending system
-			 * call or that we defer it's re-execution
-			 * (e.g., due to GDB doing an inferior
-			 * function call).
-			 */
-			urbs_end = ia64_get_user_rbs_end(target, pt, &cfm);
-			if (write_access) {
-				if (*data != urbs_end) {
-					if (in_syscall(pt))
-						convert_to_non_syscall(target,
-								       pt,
-								       cfm);
-					/*
-					 * Simulate user-level write
-					 * of ar.bsp:
-					 */
-					pt->loadrs = 0;
-					pt->ar_bspstore = *data;
-				}
-			} else
-				*data = urbs_end;
-			return 0;
-		case ELF_AR_BSPSTORE_OFFSET:
-			ptr = &pt->ar_bspstore;
-			break;
-		case ELF_AR_RNAT_OFFSET:
-			ptr = &pt->ar_rnat;
-			break;
-		case ELF_AR_CCV_OFFSET:
-			ptr = &pt->ar_ccv;
-			break;
-		case ELF_AR_UNAT_OFFSET:
-			ptr = &pt->ar_unat;
-			break;
-		case ELF_AR_FPSR_OFFSET:
-			ptr = &pt->ar_fpsr;
-			break;
-		case ELF_AR_PFS_OFFSET:
-			ptr = &pt->ar_pfs;
-			break;
-		case ELF_AR_LC_OFFSET:
-			return unw_access_ar(info, UNW_AR_LC, data,
-					     write_access);
-		case ELF_AR_EC_OFFSET:
-			return unw_access_ar(info, UNW_AR_EC, data,
-					     write_access);
-		case ELF_AR_CSD_OFFSET:
-			ptr = &pt->ar_csd;
-			break;
-		case ELF_AR_SSD_OFFSET:
-			ptr = &pt->ar_ssd;
-		}
-	} else if (addr >= ELF_CR_IIP_OFFSET && addr <= ELF_CR_IPSR_OFFSET) {
-		switch (addr) {
-		case ELF_CR_IIP_OFFSET:
-			ptr = &pt->cr_iip;
-			break;
-		case ELF_CFM_OFFSET:
-			urbs_end = ia64_get_user_rbs_end(target, pt, &cfm);
-			if (write_access) {
-				if (((cfm ^ *data) & PFM_MASK) != 0) {
-					if (in_syscall(pt))
-						convert_to_non_syscall(target,
-								       pt,
-								       cfm);
-					pt->cr_ifs = ((pt->cr_ifs & ~PFM_MASK)
-						      | (*data & PFM_MASK));
-				}
-			} else
-				*data = cfm;
-			return 0;
-		case ELF_CR_IPSR_OFFSET:
-			if (write_access) {
-				unsigned long tmp = *data;
-				/* psr.ri==3 is a reserved value: SDM 2:25 */
-				if ((tmp & IA64_PSR_RI) == IA64_PSR_RI)
-					tmp &= ~IA64_PSR_RI;
-				pt->cr_ipsr = ((tmp & IPSR_MASK)
-					       | (pt->cr_ipsr & ~IPSR_MASK));
-			} else
-				*data = (pt->cr_ipsr & IPSR_MASK);
-			return 0;
-		}
-	} else if (addr == ELF_NAT_OFFSET)
-		return access_nat_bits(target, pt, info,
-				       data, write_access);
-	else if (addr == ELF_PR_OFFSET)
-		ptr = &pt->pr;
-	else
-		return -1;
-
-	if (write_access)
-		*ptr = *data;
-	else
-		*data = *ptr;
-
-	return 0;
-}
-
-static int
-access_elf_reg(struct task_struct *target, struct unw_frame_info *info,
-		unsigned long addr, unsigned long *data, int write_access)
-{
-	if (addr >= ELF_GR_OFFSET(1) && addr <= ELF_GR_OFFSET(31))
-		return access_elf_gpreg(target, info, addr, data, write_access);
-	else if (addr >= ELF_BR_OFFSET(0) && addr <= ELF_BR_OFFSET(7))
-		return access_elf_breg(target, info, addr, data, write_access);
-	else
-		return access_elf_areg(target, info, addr, data, write_access);
-}
-
-struct regset_membuf {
-	struct membuf to;
-	int ret;
-};
-
-static void do_gpregs_get(struct unw_frame_info *info, void *arg)
-{
-	struct regset_membuf *dst = arg;
-	struct membuf to = dst->to;
-	unsigned int n;
-	elf_greg_t reg;
-
-	if (unw_unwind_to_user(info) < 0)
-		return;
-
-	/*
-	 * coredump format:
-	 *      r0-r31
-	 *      NaT bits (for r0-r31; bit N == 1 iff rN is a NaT)
-	 *      predicate registers (p0-p63)
-	 *      b0-b7
-	 *      ip cfm user-mask
-	 *      ar.rsc ar.bsp ar.bspstore ar.rnat
-	 *      ar.ccv ar.unat ar.fpsr ar.pfs ar.lc ar.ec
-	 */
-
-
-	/* Skip r0 */
-	membuf_zero(&to, 8);
-	for (n = 8; to.left && n < ELF_AR_END_OFFSET; n += 8) {
-		if (access_elf_reg(info->task, info, n, &reg, 0) < 0) {
-			dst->ret = -EIO;
-			return;
-		}
-		membuf_store(&to, reg);
-	}
-}
-
-static void do_gpregs_set(struct unw_frame_info *info, void *arg)
-{
-	struct regset_getset *dst = arg;
-
-	if (unw_unwind_to_user(info) < 0)
-		return;
-
-	if (!dst->count)
-		return;
-	/* Skip r0 */
-	if (dst->pos < ELF_GR_OFFSET(1)) {
-		user_regset_copyin_ignore(&dst->pos, &dst->count,
-					  &dst->u.set.kbuf, &dst->u.set.ubuf,
-					  0, ELF_GR_OFFSET(1));
-		dst->ret = 0;
-	}
-
-	while (dst->count && dst->pos < ELF_AR_END_OFFSET) {
-		unsigned int n, from, to;
-		elf_greg_t tmp[16];
-
-		from = dst->pos;
-		to = from + sizeof(tmp);
-		if (to > ELF_AR_END_OFFSET)
-			to = ELF_AR_END_OFFSET;
-		/* get up to 16 values */
-		dst->ret = user_regset_copyin(&dst->pos, &dst->count,
-				&dst->u.set.kbuf, &dst->u.set.ubuf, tmp,
-				from, to);
-		if (dst->ret)
-			return;
-		/* now copy them into registers */
-		for (n = 0; from < dst->pos; from += sizeof(elf_greg_t), n++)
-			if (access_elf_reg(dst->target, info, from,
-						&tmp[n], 1) < 0) {
-				dst->ret = -EIO;
-				return;
-			}
-	}
-}
-
-#define ELF_FP_OFFSET(i)	(i * sizeof(elf_fpreg_t))
-
-static void do_fpregs_get(struct unw_frame_info *info, void *arg)
-{
-	struct task_struct *task = info->task;
-	struct regset_membuf *dst = arg;
-	struct membuf to = dst->to;
-	elf_fpreg_t reg;
-	unsigned int n;
-
-	if (unw_unwind_to_user(info) < 0)
-		return;
-
-	/* Skip pos 0 and 1 */
-	membuf_zero(&to, 2 * sizeof(elf_fpreg_t));
-
-	/* fr2-fr31 */
-	for (n = 2; to.left && n < 32; n++) {
-		if (unw_get_fr(info, n, &reg)) {
-			dst->ret = -EIO;
-			return;
-		}
-		membuf_write(&to, &reg, sizeof(reg));
-	}
-
-	/* fph */
-	if (!to.left)
-		return;
-
-	ia64_flush_fph(task);
-	if (task->thread.flags & IA64_THREAD_FPH_VALID)
-		membuf_write(&to, &task->thread.fph, 96 * sizeof(reg));
-	else
-		membuf_zero(&to, 96 * sizeof(reg));
-}
-
-static void do_fpregs_set(struct unw_frame_info *info, void *arg)
-{
-	struct regset_getset *dst = arg;
-	elf_fpreg_t fpreg, tmp[30];
-	int index, start, end;
-
-	if (unw_unwind_to_user(info) < 0)
-		return;
-
-	/* Skip pos 0 and 1 */
-	if (dst->count > 0 && dst->pos < ELF_FP_OFFSET(2)) {
-		user_regset_copyin_ignore(&dst->pos, &dst->count,
-					  &dst->u.set.kbuf, &dst->u.set.ubuf,
-					  0, ELF_FP_OFFSET(2));
-		dst->ret = 0;
-		if (dst->count == 0)
-			return;
-	}
-
-	/* fr2-fr31 */
-	if (dst->count > 0 && dst->pos < ELF_FP_OFFSET(32)) {
-		start = dst->pos;
-		end = min(((unsigned int)ELF_FP_OFFSET(32)),
-			 dst->pos + dst->count);
-		dst->ret = user_regset_copyin(&dst->pos, &dst->count,
-				&dst->u.set.kbuf, &dst->u.set.ubuf, tmp,
-				ELF_FP_OFFSET(2), ELF_FP_OFFSET(32));
-		if (dst->ret)
-			return;
-
-		if (start & 0xF) { /* only write high part */
-			if (unw_get_fr(info, start / sizeof(elf_fpreg_t),
-					 &fpreg)) {
-				dst->ret = -EIO;
-				return;
-			}
-			tmp[start / sizeof(elf_fpreg_t) - 2].u.bits[0]
-				= fpreg.u.bits[0];
-			start &= ~0xFUL;
-		}
-		if (end & 0xF) { /* only write low part */
-			if (unw_get_fr(info, end / sizeof(elf_fpreg_t),
-					&fpreg)) {
-				dst->ret = -EIO;
-				return;
-			}
-			tmp[end / sizeof(elf_fpreg_t) - 2].u.bits[1]
-				= fpreg.u.bits[1];
-			end = (end + 0xF) & ~0xFUL;
-		}
-
-		for ( ;	start < end ; start += sizeof(elf_fpreg_t)) {
-			index = start / sizeof(elf_fpreg_t);
-			if (unw_set_fr(info, index, tmp[index - 2])) {
-				dst->ret = -EIO;
-				return;
-			}
-		}
-		if (dst->ret || dst->count == 0)
-			return;
-	}
-
-	/* fph */
-	if (dst->count > 0 && dst->pos < ELF_FP_OFFSET(128)) {
-		ia64_sync_fph(dst->target);
-		dst->ret = user_regset_copyin(&dst->pos, &dst->count,
-						&dst->u.set.kbuf,
-						&dst->u.set.ubuf,
-						&dst->target->thread.fph,
-						ELF_FP_OFFSET(32), -1);
-	}
-}
-
-static void
-unwind_and_call(void (*call)(struct unw_frame_info *, void *),
-	       struct task_struct *target, void *data)
-{
-	if (target == current)
-		unw_init_running(call, data);
-	else {
-		struct unw_frame_info info;
-		memset(&info, 0, sizeof(info));
-		unw_init_from_blocked_task(&info, target);
-		(*call)(&info, data);
-	}
-}
-
-static int
-do_regset_call(void (*call)(struct unw_frame_info *, void *),
-	       struct task_struct *target,
-	       const struct user_regset *regset,
-	       unsigned int pos, unsigned int count,
-	       const void *kbuf, const void __user *ubuf)
-{
-	struct regset_getset info = { .target = target, .regset = regset,
-				 .pos = pos, .count = count,
-				 .u.set = { .kbuf = kbuf, .ubuf = ubuf },
-				 .ret = 0 };
-	unwind_and_call(call, target, &info);
-	return info.ret;
-}
-
-static int
-gpregs_get(struct task_struct *target,
-	   const struct user_regset *regset,
-	   struct membuf to)
-{
-	struct regset_membuf info = {.to = to};
-	unwind_and_call(do_gpregs_get, target, &info);
-	return info.ret;
-}
-
-static int gpregs_set(struct task_struct *target,
-		const struct user_regset *regset,
-		unsigned int pos, unsigned int count,
-		const void *kbuf, const void __user *ubuf)
-{
-	return do_regset_call(do_gpregs_set, target, regset, pos, count,
-		kbuf, ubuf);
-}
-
-static void do_gpregs_writeback(struct unw_frame_info *info, void *arg)
-{
-	do_sync_rbs(info, ia64_sync_user_rbs);
-}
-
-/*
- * This is called to write back the register backing store.
- * ptrace does this before it stops, so that a tracer reading the user
- * memory after the thread stops will get the current register data.
- */
-static int
-gpregs_writeback(struct task_struct *target,
-		 const struct user_regset *regset,
-		 int now)
-{
-	if (test_and_set_tsk_thread_flag(target, TIF_RESTORE_RSE))
-		return 0;
-	set_notify_resume(target);
-	return do_regset_call(do_gpregs_writeback, target, regset, 0, 0,
-		NULL, NULL);
-}
-
-static int
-fpregs_active(struct task_struct *target, const struct user_regset *regset)
-{
-	return (target->thread.flags & IA64_THREAD_FPH_VALID) ? 128 : 32;
-}
-
-static int fpregs_get(struct task_struct *target,
-		const struct user_regset *regset,
-		struct membuf to)
-{
-	struct regset_membuf info = {.to = to};
-	unwind_and_call(do_fpregs_get, target, &info);
-	return info.ret;
-}
-
-static int fpregs_set(struct task_struct *target,
-		const struct user_regset *regset,
-		unsigned int pos, unsigned int count,
-		const void *kbuf, const void __user *ubuf)
-{
-	return do_regset_call(do_fpregs_set, target, regset, pos, count,
-		kbuf, ubuf);
-}
-
-static int
-access_uarea(struct task_struct *child, unsigned long addr,
-	      unsigned long *data, int write_access)
-{
-	unsigned int pos = -1; /* an invalid value */
-	unsigned long *ptr, regnum;
-
-	if ((addr & 0x7) != 0) {
-		dprintk("ptrace: unaligned register address 0x%lx\n", addr);
-		return -1;
-	}
-	if ((addr >= PT_NAT_BITS + 8 && addr < PT_F2) ||
-		(addr >= PT_R7 + 8 && addr < PT_B1) ||
-		(addr >= PT_AR_LC + 8 && addr < PT_CR_IPSR) ||
-		(addr >= PT_AR_SSD + 8 && addr < PT_DBR)) {
-		dprintk("ptrace: rejecting access to register "
-					"address 0x%lx\n", addr);
-		return -1;
-	}
-
-	switch (addr) {
-	case PT_F32 ... (PT_F127 + 15):
-		pos = addr - PT_F32 + ELF_FP_OFFSET(32);
-		break;
-	case PT_F2 ... (PT_F5 + 15):
-		pos = addr - PT_F2 + ELF_FP_OFFSET(2);
-		break;
-	case PT_F10 ... (PT_F31 + 15):
-		pos = addr - PT_F10 + ELF_FP_OFFSET(10);
-		break;
-	case PT_F6 ... (PT_F9 + 15):
-		pos = addr - PT_F6 + ELF_FP_OFFSET(6);
-		break;
-	}
-
-	if (pos != -1) {
-		unsigned reg = pos / sizeof(elf_fpreg_t);
-		int which_half = (pos / sizeof(unsigned long)) & 1;
-
-		if (reg < 32) { /* fr2-fr31 */
-			struct unw_frame_info info;
-			elf_fpreg_t fpreg;
-
-			memset(&info, 0, sizeof(info));
-			unw_init_from_blocked_task(&info, child);
-			if (unw_unwind_to_user(&info) < 0)
-				return 0;
-
-			if (unw_get_fr(&info, reg, &fpreg))
-				return -1;
-			if (write_access) {
-				fpreg.u.bits[which_half] = *data;
-				if (unw_set_fr(&info, reg, fpreg))
-					return -1;
-			} else {
-				*data = fpreg.u.bits[which_half];
-			}
-		} else { /* fph */
-			elf_fpreg_t *p = &child->thread.fph[reg - 32];
-			unsigned long *bits = &p->u.bits[which_half];
-
-			ia64_sync_fph(child);
-			if (write_access)
-				*bits = *data;
-			else if (child->thread.flags & IA64_THREAD_FPH_VALID)
-				*data = *bits;
-			else
-				*data = 0;
-		}
-		return 0;
-	}
-
-	switch (addr) {
-	case PT_NAT_BITS:
-		pos = ELF_NAT_OFFSET;
-		break;
-	case PT_R4 ... PT_R7:
-		pos = addr - PT_R4 + ELF_GR_OFFSET(4);
-		break;
-	case PT_B1 ... PT_B5:
-		pos = addr - PT_B1 + ELF_BR_OFFSET(1);
-		break;
-	case PT_AR_EC:
-		pos = ELF_AR_EC_OFFSET;
-		break;
-	case PT_AR_LC:
-		pos = ELF_AR_LC_OFFSET;
-		break;
-	case PT_CR_IPSR:
-		pos = ELF_CR_IPSR_OFFSET;
-		break;
-	case PT_CR_IIP:
-		pos = ELF_CR_IIP_OFFSET;
-		break;
-	case PT_CFM:
-		pos = ELF_CFM_OFFSET;
-		break;
-	case PT_AR_UNAT:
-		pos = ELF_AR_UNAT_OFFSET;
-		break;
-	case PT_AR_PFS:
-		pos = ELF_AR_PFS_OFFSET;
-		break;
-	case PT_AR_RSC:
-		pos = ELF_AR_RSC_OFFSET;
-		break;
-	case PT_AR_RNAT:
-		pos = ELF_AR_RNAT_OFFSET;
-		break;
-	case PT_AR_BSPSTORE:
-		pos = ELF_AR_BSPSTORE_OFFSET;
-		break;
-	case PT_PR:
-		pos = ELF_PR_OFFSET;
-		break;
-	case PT_B6:
-		pos = ELF_BR_OFFSET(6);
-		break;
-	case PT_AR_BSP:
-		pos = ELF_AR_BSP_OFFSET;
-		break;
-	case PT_R1 ... PT_R3:
-		pos = addr - PT_R1 + ELF_GR_OFFSET(1);
-		break;
-	case PT_R12 ... PT_R15:
-		pos = addr - PT_R12 + ELF_GR_OFFSET(12);
-		break;
-	case PT_R8 ... PT_R11:
-		pos = addr - PT_R8 + ELF_GR_OFFSET(8);
-		break;
-	case PT_R16 ... PT_R31:
-		pos = addr - PT_R16 + ELF_GR_OFFSET(16);
-		break;
-	case PT_AR_CCV:
-		pos = ELF_AR_CCV_OFFSET;
-		break;
-	case PT_AR_FPSR:
-		pos = ELF_AR_FPSR_OFFSET;
-		break;
-	case PT_B0:
-		pos = ELF_BR_OFFSET(0);
-		break;
-	case PT_B7:
-		pos = ELF_BR_OFFSET(7);
-		break;
-	case PT_AR_CSD:
-		pos = ELF_AR_CSD_OFFSET;
-		break;
-	case PT_AR_SSD:
-		pos = ELF_AR_SSD_OFFSET;
-		break;
-	}
-
-	if (pos != -1) {
-		struct unw_frame_info info;
-
-		memset(&info, 0, sizeof(info));
-		unw_init_from_blocked_task(&info, child);
-		if (unw_unwind_to_user(&info) < 0)
-			return 0;
-
-		return access_elf_reg(child, &info, pos, data, write_access);
-	}
-
-	/* access debug registers */
-	if (addr >= PT_IBR) {
-		regnum = (addr - PT_IBR) >> 3;
-		ptr = &child->thread.ibr[0];
-	} else {
-		regnum = (addr - PT_DBR) >> 3;
-		ptr = &child->thread.dbr[0];
-	}
-
-	if (regnum >= 8) {
-		dprintk("ptrace: rejecting access to register "
-				"address 0x%lx\n", addr);
-		return -1;
-	}
-
-	if (!(child->thread.flags & IA64_THREAD_DBG_VALID)) {
-		child->thread.flags |= IA64_THREAD_DBG_VALID;
-		memset(child->thread.dbr, 0,
-				sizeof(child->thread.dbr));
-		memset(child->thread.ibr, 0,
-				sizeof(child->thread.ibr));
-	}
-
-	ptr += regnum;
-
-	if ((regnum & 1) && write_access) {
-		/* don't let the user set kernel-level breakpoints: */
-		*ptr = *data & ~(7UL << 56);
-		return 0;
-	}
-	if (write_access)
-		*ptr = *data;
-	else
-		*data = *ptr;
-	return 0;
-}
-
-static const struct user_regset native_regsets[] = {
-	{
-		.core_note_type = NT_PRSTATUS,
-		.n = ELF_NGREG,
-		.size = sizeof(elf_greg_t), .align = sizeof(elf_greg_t),
-		.regset_get = gpregs_get, .set = gpregs_set,
-		.writeback = gpregs_writeback
-	},
-	{
-		.core_note_type = NT_PRFPREG,
-		.n = ELF_NFPREG,
-		.size = sizeof(elf_fpreg_t), .align = sizeof(elf_fpreg_t),
-		.regset_get = fpregs_get, .set = fpregs_set, .active = fpregs_active
-	},
-};
-
-static const struct user_regset_view user_ia64_view = {
-	.name = "ia64",
-	.e_machine = EM_IA_64,
-	.regsets = native_regsets, .n = ARRAY_SIZE(native_regsets)
-};
-
-const struct user_regset_view *task_user_regset_view(struct task_struct *tsk)
-{
-	return &user_ia64_view;
-}
-
-struct syscall_get_args {
-	unsigned int i;
-	unsigned int n;
-	unsigned long *args;
-	struct pt_regs *regs;
-};
-
-static void syscall_get_args_cb(struct unw_frame_info *info, void *data)
-{
-	struct syscall_get_args *args = data;
-	struct pt_regs *pt = args->regs;
-	unsigned long *krbs, cfm, ndirty, nlocals, nouts;
-	int i, count;
-
-	if (unw_unwind_to_user(info) < 0)
-		return;
-
-	/*
-	 * We get here via a few paths:
-	 * - break instruction: cfm is shared with caller.
-	 *   syscall args are in out= regs, locals are non-empty.
-	 * - epsinstruction: cfm is set by br.call
-	 *   locals don't exist.
-	 *
-	 * For both cases arguments are reachable in cfm.sof - cfm.sol.
-	 * CFM: [ ... | sor: 17..14 | sol : 13..7 | sof : 6..0 ]
-	 */
-	cfm = pt->cr_ifs;
-	nlocals = (cfm >> 7) & 0x7f; /* aka sol */
-	nouts = (cfm & 0x7f) - nlocals; /* aka sof - sol */
-	krbs = (unsigned long *)info->task + IA64_RBS_OFFSET/8;
-	ndirty = ia64_rse_num_regs(krbs, krbs + (pt->loadrs >> 19));
-
-	count = 0;
-	if (in_syscall(pt))
-		count = min_t(int, args->n, nouts);
-
-	/* Iterate over outs. */
-	for (i = 0; i < count; i++) {
-		int j = ndirty + nlocals + i + args->i;
-		args->args[i] = *ia64_rse_skip_regs(krbs, j);
-	}
-
-	while (i < args->n) {
-		args->args[i] = 0;
-		i++;
-	}
-}
-
-void syscall_get_arguments(struct task_struct *task,
-	struct pt_regs *regs, unsigned long *args)
-{
-	struct syscall_get_args data = {
-		.i = 0,
-		.n = 6,
-		.args = args,
-		.regs = regs,
-	};
-
-	if (task == current)
-		unw_init_running(syscall_get_args_cb, &data);
-	else {
-		struct unw_frame_info ufi;
-		memset(&ufi, 0, sizeof(ufi));
-		unw_init_from_blocked_task(&ufi, task);
-		syscall_get_args_cb(&ufi, &data);
-	}
-}
diff --git a/arch/ia64/kernel/relocate_kernel.S b/arch/ia64/kernel/relocate_kernel.S
deleted file mode 100644
index 527a7b896a6e..000000000000
--- a/arch/ia64/kernel/relocate_kernel.S
+++ /dev/null
@@ -1,321 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-only */
-/*
- * arch/ia64/kernel/relocate_kernel.S
- *
- * Relocate kexec'able kernel and start it
- *
- * Copyright (C) 2005 Hewlett-Packard Development Company, L.P.
- * Copyright (C) 2005 Khalid Aziz  <khalid.aziz@hp.com>
- * Copyright (C) 2005 Intel Corp,  Zou Nan hai <nanhai.zou@intel.com>
- */
-#include <linux/pgtable.h>
-#include <asm/asmmacro.h>
-#include <asm/kregs.h>
-#include <asm/page.h>
-#include <asm/mca_asm.h>
-
-       /* Must be relocatable PIC code callable as a C function
-        */
-GLOBAL_ENTRY(relocate_new_kernel)
-	.prologue
-	alloc r31=ar.pfs,4,0,0,0
-        .body
-.reloc_entry:
-{
-	rsm psr.i| psr.ic
-	mov r2=ip
-}
-	;;
-{
-        flushrs                         // must be first insn in group
-        srlz.i
-}
-	;;
-	dep r2=0,r2,61,3		//to physical address
-	;;
-	//first switch to physical mode
-	add r3=1f-.reloc_entry, r2
-	movl r16 = IA64_PSR_AC|IA64_PSR_BN|IA64_PSR_IC
-	mov ar.rsc=0	          	// put RSE in enforced lazy mode
-	;;
-	add sp=(memory_stack_end - 16 - .reloc_entry),r2
-	add r8=(register_stack - .reloc_entry),r2
-	;;
-	mov r18=ar.rnat
-	mov ar.bspstore=r8
-	;;
-        mov cr.ipsr=r16
-        mov cr.iip=r3
-        mov cr.ifs=r0
-	srlz.i
-	;;
-	mov ar.rnat=r18
-	rfi				// note: this unmask MCA/INIT (psr.mc)
-	;;
-1:
-	//physical mode code begin
-	mov b6=in1
-	dep r28=0,in2,61,3	//to physical address
-
-	// purge all TC entries
-#define O(member)       IA64_CPUINFO_##member##_OFFSET
-        GET_THIS_PADDR(r2, ia64_cpu_info) // load phys addr of cpu_info into r2
-        ;;
-        addl r17=O(PTCE_STRIDE),r2
-        addl r2=O(PTCE_BASE),r2
-        ;;
-        ld8 r18=[r2],(O(PTCE_COUNT)-O(PTCE_BASE));;    	// r18=ptce_base
-        ld4 r19=[r2],4                                  // r19=ptce_count[0]
-        ld4 r21=[r17],4                                 // r21=ptce_stride[0]
-        ;;
-        ld4 r20=[r2]                                    // r20=ptce_count[1]
-        ld4 r22=[r17]                                   // r22=ptce_stride[1]
-        mov r24=r0
-        ;;
-        adds r20=-1,r20
-        ;;
-#undef O
-2:
-        cmp.ltu p6,p7=r24,r19
-(p7)    br.cond.dpnt.few 4f
-        mov ar.lc=r20
-3:
-        ptc.e r18
-        ;;
-        add r18=r22,r18
-        br.cloop.sptk.few 3b
-        ;;
-        add r18=r21,r18
-        add r24=1,r24
-        ;;
-        br.sptk.few 2b
-4:
-        srlz.i
-        ;;
-	// purge TR entry for kernel text and data
-        movl r16=KERNEL_START
-        mov r18=KERNEL_TR_PAGE_SHIFT<<2
-        ;;
-        ptr.i r16, r18
-        ptr.d r16, r18
-        ;;
-        srlz.i
-        ;;
-
-        // purge TR entry for pal code
-        mov r16=in3
-        mov r18=IA64_GRANULE_SHIFT<<2
-        ;;
-        ptr.i r16,r18
-        ;;
-        srlz.i
-	;;
-
-        // purge TR entry for stack
-        mov r16=IA64_KR(CURRENT_STACK)
-        ;;
-        shl r16=r16,IA64_GRANULE_SHIFT
-        movl r19=PAGE_OFFSET
-        ;;
-        add r16=r19,r16
-        mov r18=IA64_GRANULE_SHIFT<<2
-        ;;
-        ptr.d r16,r18
-        ;;
-        srlz.i
-	;;
-
-	//copy segments
-	movl r16=PAGE_MASK
-        mov  r30=in0                    // in0 is page_list
-        br.sptk.few .dest_page
-	;;
-.loop:
-	ld8  r30=[in0], 8;;
-.dest_page:
-	tbit.z p0, p6=r30, 0;;    	// 0x1 dest page
-(p6)	and r17=r30, r16
-(p6)	br.cond.sptk.few .loop;;
-
-	tbit.z p0, p6=r30, 1;;		// 0x2 indirect page
-(p6)	and in0=r30, r16
-(p6)	br.cond.sptk.few .loop;;
-
-	tbit.z p0, p6=r30, 2;;		// 0x4 end flag
-(p6)	br.cond.sptk.few .end_loop;;
-
-	tbit.z p6, p0=r30, 3;;		// 0x8 source page
-(p6)	br.cond.sptk.few .loop
-
-	and r18=r30, r16
-
-	// simple copy page, may optimize later
-	movl r14=PAGE_SIZE/8 - 1;;
-	mov ar.lc=r14;;
-1:
-	ld8 r14=[r18], 8;;
-	st8 [r17]=r14;;
-	fc.i r17
-	add r17=8, r17
-	br.ctop.sptk.few 1b
-	br.sptk.few .loop
-	;;
-
-.end_loop:
-	sync.i			// for fc.i
-	;;
-	srlz.i
-	;;
-	srlz.d
-	;;
-	br.call.sptk.many b0=b6;;
-
-.align  32
-memory_stack:
-	.fill           8192, 1, 0
-memory_stack_end:
-register_stack:
-	.fill           8192, 1, 0
-register_stack_end:
-relocate_new_kernel_end:
-END(relocate_new_kernel)
-
-.global relocate_new_kernel_size
-relocate_new_kernel_size:
-	data8	relocate_new_kernel_end - relocate_new_kernel
-
-GLOBAL_ENTRY(ia64_dump_cpu_regs)
-        .prologue
-        alloc loc0=ar.pfs,1,2,0,0
-        .body
-        mov     ar.rsc=0                // put RSE in enforced lazy mode
-        add     loc1=4*8, in0           // save r4 and r5 first
-        ;;
-{
-        flushrs                         // flush dirty regs to backing store
-        srlz.i
-}
-        st8 [loc1]=r4, 8
-        ;;
-        st8 [loc1]=r5, 8
-        ;;
-        add loc1=32*8, in0
-        mov r4=ar.rnat
-        ;;
-        st8 [in0]=r0, 8			// r0
-        st8 [loc1]=r4, 8		// rnat
-        mov r5=pr
-        ;;
-        st8 [in0]=r1, 8			// r1
-        st8 [loc1]=r5, 8		// pr
-        mov r4=b0
-        ;;
-        st8 [in0]=r2, 8			// r2
-        st8 [loc1]=r4, 8		// b0
-        mov r5=b1;
-        ;;
-        st8 [in0]=r3, 24		// r3
-        st8 [loc1]=r5, 8		// b1
-        mov r4=b2
-        ;;
-        st8 [in0]=r6, 8			// r6
-        st8 [loc1]=r4, 8		// b2
-	mov r5=b3
-        ;;
-        st8 [in0]=r7, 8			// r7
-        st8 [loc1]=r5, 8		// b3
-        mov r4=b4
-        ;;
-        st8 [in0]=r8, 8			// r8
-        st8 [loc1]=r4, 8		// b4
-        mov r5=b5
-        ;;
-        st8 [in0]=r9, 8			// r9
-        st8 [loc1]=r5, 8		// b5
-        mov r4=b6
-        ;;
-        st8 [in0]=r10, 8		// r10
-        st8 [loc1]=r5, 8		// b6
-        mov r5=b7
-        ;;
-        st8 [in0]=r11, 8		// r11
-        st8 [loc1]=r5, 8		// b7
-        mov r4=b0
-        ;;
-        st8 [in0]=r12, 8		// r12
-        st8 [loc1]=r4, 8		// ip
-        mov r5=loc0
-	;;
-        st8 [in0]=r13, 8		// r13
-        extr.u r5=r5, 0, 38		// ar.pfs.pfm
-	mov r4=r0			// user mask
-        ;;
-        st8 [in0]=r14, 8		// r14
-        st8 [loc1]=r5, 8		// cfm
-        ;;
-        st8 [in0]=r15, 8		// r15
-        st8 [loc1]=r4, 8        	// user mask
-	mov r5=ar.rsc
-        ;;
-        st8 [in0]=r16, 8		// r16
-        st8 [loc1]=r5, 8        	// ar.rsc
-        mov r4=ar.bsp
-        ;;
-        st8 [in0]=r17, 8		// r17
-        st8 [loc1]=r4, 8        	// ar.bsp
-        mov r5=ar.bspstore
-        ;;
-        st8 [in0]=r18, 8		// r18
-        st8 [loc1]=r5, 8        	// ar.bspstore
-        mov r4=ar.rnat
-        ;;
-        st8 [in0]=r19, 8		// r19
-        st8 [loc1]=r4, 8        	// ar.rnat
-        mov r5=ar.ccv
-        ;;
-        st8 [in0]=r20, 8		// r20
-	st8 [loc1]=r5, 8        	// ar.ccv
-        mov r4=ar.unat
-        ;;
-        st8 [in0]=r21, 8		// r21
-        st8 [loc1]=r4, 8        	// ar.unat
-        mov r5 = ar.fpsr
-        ;;
-        st8 [in0]=r22, 8		// r22
-        st8 [loc1]=r5, 8        	// ar.fpsr
-        mov r4 = ar.unat
-        ;;
-        st8 [in0]=r23, 8		// r23
-        st8 [loc1]=r4, 8        	// unat
-        mov r5 = ar.fpsr
-        ;;
-        st8 [in0]=r24, 8		// r24
-        st8 [loc1]=r5, 8        	// fpsr
-        mov r4 = ar.pfs
-        ;;
-        st8 [in0]=r25, 8		// r25
-        st8 [loc1]=r4, 8        	// ar.pfs
-        mov r5 = ar.lc
-        ;;
-        st8 [in0]=r26, 8		// r26
-        st8 [loc1]=r5, 8        	// ar.lc
-        mov r4 = ar.ec
-        ;;
-        st8 [in0]=r27, 8		// r27
-        st8 [loc1]=r4, 8        	// ar.ec
-        mov r5 = ar.csd
-        ;;
-        st8 [in0]=r28, 8		// r28
-        st8 [loc1]=r5, 8        	// ar.csd
-        mov r4 = ar.ssd
-        ;;
-        st8 [in0]=r29, 8		// r29
-        st8 [loc1]=r4, 8        	// ar.ssd
-        ;;
-        st8 [in0]=r30, 8		// r30
-        ;;
-	st8 [in0]=r31, 8		// r31
-        mov ar.pfs=loc0
-        ;;
-        br.ret.sptk.many rp
-END(ia64_dump_cpu_regs)
diff --git a/arch/ia64/kernel/sal.c b/arch/ia64/kernel/sal.c
deleted file mode 100644
index e4f0705c0282..000000000000
--- a/arch/ia64/kernel/sal.c
+++ /dev/null
@@ -1,400 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * System Abstraction Layer (SAL) interface routines.
- *
- * Copyright (C) 1998, 1999, 2001, 2003 Hewlett-Packard Co
- *	David Mosberger-Tang <davidm@hpl.hp.com>
- * Copyright (C) 1999 VA Linux Systems
- * Copyright (C) 1999 Walt Drummond <drummond@valinux.com>
- */
-
-#include <linux/kernel.h>
-#include <linux/init.h>
-#include <linux/module.h>
-#include <linux/spinlock.h>
-#include <linux/string.h>
-
-#include <asm/delay.h>
-#include <asm/page.h>
-#include <asm/sal.h>
-#include <asm/pal.h>
-#include <asm/xtp.h>
-
- __cacheline_aligned DEFINE_SPINLOCK(sal_lock);
-unsigned long sal_platform_features;
-
-unsigned short sal_revision;
-unsigned short sal_version;
-
-#define SAL_MAJOR(x) ((x) >> 8)
-#define SAL_MINOR(x) ((x) & 0xff)
-
-static struct {
-	void *addr;	/* function entry point */
-	void *gpval;	/* gp value to use */
-} pdesc;
-
-static long
-default_handler (void)
-{
-	return -1;
-}
-
-ia64_sal_handler ia64_sal = (ia64_sal_handler) default_handler;
-ia64_sal_desc_ptc_t *ia64_ptc_domain_info;
-
-const char *
-ia64_sal_strerror (long status)
-{
-	const char *str;
-	switch (status) {
-	      case 0: str = "Call completed without error"; break;
-	      case 1: str = "Effect a warm boot of the system to complete "
-			      "the update"; break;
-	      case -1: str = "Not implemented"; break;
-	      case -2: str = "Invalid argument"; break;
-	      case -3: str = "Call completed with error"; break;
-	      case -4: str = "Virtual address not registered"; break;
-	      case -5: str = "No information available"; break;
-	      case -6: str = "Insufficient space to add the entry"; break;
-	      case -7: str = "Invalid entry_addr value"; break;
-	      case -8: str = "Invalid interrupt vector"; break;
-	      case -9: str = "Requested memory not available"; break;
-	      case -10: str = "Unable to write to the NVM device"; break;
-	      case -11: str = "Invalid partition type specified"; break;
-	      case -12: str = "Invalid NVM_Object id specified"; break;
-	      case -13: str = "NVM_Object already has the maximum number "
-				"of partitions"; break;
-	      case -14: str = "Insufficient space in partition for the "
-				"requested write sub-function"; break;
-	      case -15: str = "Insufficient data buffer space for the "
-				"requested read record sub-function"; break;
-	      case -16: str = "Scratch buffer required for the write/delete "
-				"sub-function"; break;
-	      case -17: str = "Insufficient space in the NVM_Object for the "
-				"requested create sub-function"; break;
-	      case -18: str = "Invalid value specified in the partition_rec "
-				"argument"; break;
-	      case -19: str = "Record oriented I/O not supported for this "
-				"partition"; break;
-	      case -20: str = "Bad format of record to be written or "
-				"required keyword variable not "
-				"specified"; break;
-	      default: str = "Unknown SAL status code"; break;
-	}
-	return str;
-}
-
-void __init
-ia64_sal_handler_init (void *entry_point, void *gpval)
-{
-	/* fill in the SAL procedure descriptor and point ia64_sal to it: */
-	pdesc.addr = entry_point;
-	pdesc.gpval = gpval;
-	ia64_sal = (ia64_sal_handler) &pdesc;
-}
-
-static void __init
-check_versions (struct ia64_sal_systab *systab)
-{
-	sal_revision = (systab->sal_rev_major << 8) | systab->sal_rev_minor;
-	sal_version = (systab->sal_b_rev_major << 8) | systab->sal_b_rev_minor;
-
-	/* Check for broken firmware */
-	if ((sal_revision == SAL_VERSION_CODE(49, 29))
-	    && (sal_version == SAL_VERSION_CODE(49, 29)))
-	{
-		/*
-		 * Old firmware for zx2000 prototypes have this weird version number,
-		 * reset it to something sane.
-		 */
-		sal_revision = SAL_VERSION_CODE(2, 8);
-		sal_version = SAL_VERSION_CODE(0, 0);
-	}
-}
-
-static void __init
-sal_desc_entry_point (void *p)
-{
-	struct ia64_sal_desc_entry_point *ep = p;
-	ia64_pal_handler_init(__va(ep->pal_proc));
-	ia64_sal_handler_init(__va(ep->sal_proc), __va(ep->gp));
-}
-
-#ifdef CONFIG_SMP
-static void __init
-set_smp_redirect (int flag)
-{
-#ifndef CONFIG_HOTPLUG_CPU
-	if (no_int_routing)
-		smp_int_redirect &= ~flag;
-	else
-		smp_int_redirect |= flag;
-#else
-	/*
-	 * For CPU Hotplug we dont want to do any chipset supported
-	 * interrupt redirection. The reason is this would require that
-	 * All interrupts be stopped and hard bind the irq to a cpu.
-	 * Later when the interrupt is fired we need to set the redir hint
-	 * on again in the vector. This is cumbersome for something that the
-	 * user mode irq balancer will solve anyways.
-	 */
-	no_int_routing=1;
-	smp_int_redirect &= ~flag;
-#endif
-}
-#else
-#define set_smp_redirect(flag)	do { } while (0)
-#endif
-
-static void __init
-sal_desc_platform_feature (void *p)
-{
-	struct ia64_sal_desc_platform_feature *pf = p;
-	sal_platform_features = pf->feature_mask;
-
-	printk(KERN_INFO "SAL Platform features:");
-	if (!sal_platform_features) {
-		printk(" None\n");
-		return;
-	}
-
-	if (sal_platform_features & IA64_SAL_PLATFORM_FEATURE_BUS_LOCK)
-		printk(" BusLock");
-	if (sal_platform_features & IA64_SAL_PLATFORM_FEATURE_IRQ_REDIR_HINT) {
-		printk(" IRQ_Redirection");
-		set_smp_redirect(SMP_IRQ_REDIRECTION);
-	}
-	if (sal_platform_features & IA64_SAL_PLATFORM_FEATURE_IPI_REDIR_HINT) {
-		printk(" IPI_Redirection");
-		set_smp_redirect(SMP_IPI_REDIRECTION);
-	}
-	if (sal_platform_features & IA64_SAL_PLATFORM_FEATURE_ITC_DRIFT)
-		printk(" ITC_Drift");
-	printk("\n");
-}
-
-#ifdef CONFIG_SMP
-static void __init
-sal_desc_ap_wakeup (void *p)
-{
-	struct ia64_sal_desc_ap_wakeup *ap = p;
-
-	switch (ap->mechanism) {
-	case IA64_SAL_AP_EXTERNAL_INT:
-		ap_wakeup_vector = ap->vector;
-		printk(KERN_INFO "SAL: AP wakeup using external interrupt "
-				"vector 0x%lx\n", ap_wakeup_vector);
-		break;
-	default:
-		printk(KERN_ERR "SAL: AP wakeup mechanism unsupported!\n");
-		break;
-	}
-}
-
-static void __init
-chk_nointroute_opt(void)
-{
-	char *cp;
-
-	for (cp = boot_command_line; *cp; ) {
-		if (memcmp(cp, "nointroute", 10) == 0) {
-			no_int_routing = 1;
-			printk ("no_int_routing on\n");
-			break;
-		} else {
-			while (*cp != ' ' && *cp)
-				++cp;
-			while (*cp == ' ')
-				++cp;
-		}
-	}
-}
-
-#else
-static void __init sal_desc_ap_wakeup(void *p) { }
-#endif
-
-/*
- * HP rx5670 firmware polls for interrupts during SAL_CACHE_FLUSH by reading
- * cr.ivr, but it never writes cr.eoi.  This leaves any interrupt marked as
- * "in-service" and masks other interrupts of equal or lower priority.
- *
- * HP internal defect reports: F1859, F2775, F3031.
- */
-static int sal_cache_flush_drops_interrupts;
-
-static int __init
-force_pal_cache_flush(char *str)
-{
-	sal_cache_flush_drops_interrupts = 1;
-	return 0;
-}
-early_param("force_pal_cache_flush", force_pal_cache_flush);
-
-void __init
-check_sal_cache_flush (void)
-{
-	unsigned long flags;
-	int cpu;
-	u64 vector, cache_type = 3;
-	struct ia64_sal_retval isrv;
-
-	if (sal_cache_flush_drops_interrupts)
-		return;
-
-	cpu = get_cpu();
-	local_irq_save(flags);
-
-	/*
-	 * Send ourselves a timer interrupt, wait until it's reported, and see
-	 * if SAL_CACHE_FLUSH drops it.
-	 */
-	ia64_send_ipi(cpu, IA64_TIMER_VECTOR, IA64_IPI_DM_INT, 0);
-
-	while (!ia64_get_irr(IA64_TIMER_VECTOR))
-		cpu_relax();
-
-	SAL_CALL(isrv, SAL_CACHE_FLUSH, cache_type, 0, 0, 0, 0, 0, 0);
-
-	if (isrv.status)
-		printk(KERN_ERR "SAL_CAL_FLUSH failed with %ld\n", isrv.status);
-
-	if (ia64_get_irr(IA64_TIMER_VECTOR)) {
-		vector = ia64_get_ivr();
-		ia64_eoi();
-		WARN_ON(vector != IA64_TIMER_VECTOR);
-	} else {
-		sal_cache_flush_drops_interrupts = 1;
-		printk(KERN_ERR "SAL: SAL_CACHE_FLUSH drops interrupts; "
-			"PAL_CACHE_FLUSH will be used instead\n");
-		ia64_eoi();
-	}
-
-	local_irq_restore(flags);
-	put_cpu();
-}
-
-s64
-ia64_sal_cache_flush (u64 cache_type)
-{
-	struct ia64_sal_retval isrv;
-
-	if (sal_cache_flush_drops_interrupts) {
-		unsigned long flags;
-		u64 progress;
-		s64 rc;
-
-		progress = 0;
-		local_irq_save(flags);
-		rc = ia64_pal_cache_flush(cache_type,
-			PAL_CACHE_FLUSH_INVALIDATE, &progress, NULL);
-		local_irq_restore(flags);
-		return rc;
-	}
-
-	SAL_CALL(isrv, SAL_CACHE_FLUSH, cache_type, 0, 0, 0, 0, 0, 0);
-	return isrv.status;
-}
-EXPORT_SYMBOL_GPL(ia64_sal_cache_flush);
-
-void __init
-ia64_sal_init (struct ia64_sal_systab *systab)
-{
-	char *p;
-	int i;
-
-	if (!systab) {
-		printk(KERN_WARNING "Hmm, no SAL System Table.\n");
-		return;
-	}
-
-	if (strncmp(systab->signature, "SST_", 4) != 0)
-		printk(KERN_ERR "bad signature in system table!");
-
-	check_versions(systab);
-#ifdef CONFIG_SMP
-	chk_nointroute_opt();
-#endif
-
-	/* revisions are coded in BCD, so %x does the job for us */
-	printk(KERN_INFO "SAL %x.%x: %.32s %.32s%sversion %x.%x\n",
-			SAL_MAJOR(sal_revision), SAL_MINOR(sal_revision),
-			systab->oem_id, systab->product_id,
-			systab->product_id[0] ? " " : "",
-			SAL_MAJOR(sal_version), SAL_MINOR(sal_version));
-
-	p = (char *) (systab + 1);
-	for (i = 0; i < systab->entry_count; i++) {
-		/*
-		 * The first byte of each entry type contains the type
-		 * descriptor.
-		 */
-		switch (*p) {
-		case SAL_DESC_ENTRY_POINT:
-			sal_desc_entry_point(p);
-			break;
-		case SAL_DESC_PLATFORM_FEATURE:
-			sal_desc_platform_feature(p);
-			break;
-		case SAL_DESC_PTC:
-			ia64_ptc_domain_info = (ia64_sal_desc_ptc_t *)p;
-			break;
-		case SAL_DESC_AP_WAKEUP:
-			sal_desc_ap_wakeup(p);
-			break;
-		}
-		p += SAL_DESC_SIZE(*p);
-	}
-
-}
-
-int
-ia64_sal_oemcall(struct ia64_sal_retval *isrvp, u64 oemfunc, u64 arg1,
-		 u64 arg2, u64 arg3, u64 arg4, u64 arg5, u64 arg6, u64 arg7)
-{
-	if (oemfunc < IA64_SAL_OEMFUNC_MIN || oemfunc > IA64_SAL_OEMFUNC_MAX)
-		return -1;
-	SAL_CALL(*isrvp, oemfunc, arg1, arg2, arg3, arg4, arg5, arg6, arg7);
-	return 0;
-}
-EXPORT_SYMBOL(ia64_sal_oemcall);
-
-int
-ia64_sal_oemcall_nolock(struct ia64_sal_retval *isrvp, u64 oemfunc, u64 arg1,
-			u64 arg2, u64 arg3, u64 arg4, u64 arg5, u64 arg6,
-			u64 arg7)
-{
-	if (oemfunc < IA64_SAL_OEMFUNC_MIN || oemfunc > IA64_SAL_OEMFUNC_MAX)
-		return -1;
-	SAL_CALL_NOLOCK(*isrvp, oemfunc, arg1, arg2, arg3, arg4, arg5, arg6,
-			arg7);
-	return 0;
-}
-EXPORT_SYMBOL(ia64_sal_oemcall_nolock);
-
-int
-ia64_sal_oemcall_reentrant(struct ia64_sal_retval *isrvp, u64 oemfunc,
-			   u64 arg1, u64 arg2, u64 arg3, u64 arg4, u64 arg5,
-			   u64 arg6, u64 arg7)
-{
-	if (oemfunc < IA64_SAL_OEMFUNC_MIN || oemfunc > IA64_SAL_OEMFUNC_MAX)
-		return -1;
-	SAL_CALL_REENTRANT(*isrvp, oemfunc, arg1, arg2, arg3, arg4, arg5, arg6,
-			   arg7);
-	return 0;
-}
-EXPORT_SYMBOL(ia64_sal_oemcall_reentrant);
-
-long
-ia64_sal_freq_base (unsigned long which, unsigned long *ticks_per_second,
-		    unsigned long *drift_info)
-{
-	struct ia64_sal_retval isrv;
-
-	SAL_CALL(isrv, SAL_FREQ_BASE, which, 0, 0, 0, 0, 0, 0);
-	*ticks_per_second = isrv.v0;
-	*drift_info = isrv.v1;
-	return isrv.status;
-}
-EXPORT_SYMBOL_GPL(ia64_sal_freq_base);
diff --git a/arch/ia64/kernel/salinfo.c b/arch/ia64/kernel/salinfo.c
deleted file mode 100644
index 03b632c56899..000000000000
--- a/arch/ia64/kernel/salinfo.c
+++ /dev/null
@@ -1,646 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * salinfo.c
- *
- * Creates entries in /proc/sal for various system features.
- *
- * Copyright (c) 2003, 2006 Silicon Graphics, Inc.  All rights reserved.
- * Copyright (c) 2003 Hewlett-Packard Co
- *	Bjorn Helgaas <bjorn.helgaas@hp.com>
- *
- * 10/30/2001	jbarnes@sgi.com		copied much of Stephane's palinfo
- *					code to create this file
- * Oct 23 2003	kaos@sgi.com
- *   Replace IPI with set_cpus_allowed() to read a record from the required cpu.
- *   Redesign salinfo log processing to separate interrupt and user space
- *   contexts.
- *   Cache the record across multi-block reads from user space.
- *   Support > 64 cpus.
- *   Delete module_exit and MOD_INC/DEC_COUNT, salinfo cannot be a module.
- *
- * Jan 28 2004	kaos@sgi.com
- *   Periodically check for outstanding MCA or INIT records.
- *
- * Dec  5 2004	kaos@sgi.com
- *   Standardize which records are cleared automatically.
- *
- * Aug 18 2005	kaos@sgi.com
- *   mca.c may not pass a buffer, a NULL buffer just indicates that a new
- *   record is available in SAL.
- *   Replace some NR_CPUS by cpus_online, for hotplug cpu.
- *
- * Jan  5 2006        kaos@sgi.com
- *   Handle hotplug cpus coming online.
- *   Handle hotplug cpus going offline while they still have outstanding records.
- *   Use the cpu_* macros consistently.
- *   Replace the counting semaphore with a mutex and a test if the cpumask is non-empty.
- *   Modify the locking to make the test for "work to do" an atomic operation.
- */
-
-#include <linux/capability.h>
-#include <linux/cpu.h>
-#include <linux/types.h>
-#include <linux/proc_fs.h>
-#include <linux/seq_file.h>
-#include <linux/module.h>
-#include <linux/smp.h>
-#include <linux/timer.h>
-#include <linux/vmalloc.h>
-#include <linux/semaphore.h>
-
-#include <asm/sal.h>
-#include <linux/uaccess.h>
-
-MODULE_AUTHOR("Jesse Barnes <jbarnes@sgi.com>");
-MODULE_DESCRIPTION("/proc interface to IA-64 SAL features");
-MODULE_LICENSE("GPL");
-
-typedef struct {
-	const char		*name;		/* name of the proc entry */
-	unsigned long           feature;        /* feature bit */
-	struct proc_dir_entry	*entry;		/* registered entry (removal) */
-} salinfo_entry_t;
-
-/*
- * List {name,feature} pairs for every entry in /proc/sal/<feature>
- * that this module exports
- */
-static const salinfo_entry_t salinfo_entries[]={
-	{ "bus_lock",           IA64_SAL_PLATFORM_FEATURE_BUS_LOCK, },
-	{ "irq_redirection",	IA64_SAL_PLATFORM_FEATURE_IRQ_REDIR_HINT, },
-	{ "ipi_redirection",	IA64_SAL_PLATFORM_FEATURE_IPI_REDIR_HINT, },
-	{ "itc_drift",		IA64_SAL_PLATFORM_FEATURE_ITC_DRIFT, },
-};
-
-#define NR_SALINFO_ENTRIES ARRAY_SIZE(salinfo_entries)
-
-static char *salinfo_log_name[] = {
-	"mca",
-	"init",
-	"cmc",
-	"cpe",
-};
-
-static struct proc_dir_entry *salinfo_proc_entries[
-	ARRAY_SIZE(salinfo_entries) +			/* /proc/sal/bus_lock */
-	ARRAY_SIZE(salinfo_log_name) +			/* /proc/sal/{mca,...} */
-	(2 * ARRAY_SIZE(salinfo_log_name)) +		/* /proc/sal/mca/{event,data} */
-	1];						/* /proc/sal */
-
-/* Some records we get ourselves, some are accessed as saved data in buffers
- * that are owned by mca.c.
- */
-struct salinfo_data_saved {
-	u8*			buffer;
-	u64			size;
-	u64			id;
-	int			cpu;
-};
-
-/* State transitions.  Actions are :-
- *   Write "read <cpunum>" to the data file.
- *   Write "clear <cpunum>" to the data file.
- *   Write "oemdata <cpunum> <offset> to the data file.
- *   Read from the data file.
- *   Close the data file.
- *
- * Start state is NO_DATA.
- *
- * NO_DATA
- *    write "read <cpunum>" -> NO_DATA or LOG_RECORD.
- *    write "clear <cpunum>" -> NO_DATA or LOG_RECORD.
- *    write "oemdata <cpunum> <offset> -> return -EINVAL.
- *    read data -> return EOF.
- *    close -> unchanged.  Free record areas.
- *
- * LOG_RECORD
- *    write "read <cpunum>" -> NO_DATA or LOG_RECORD.
- *    write "clear <cpunum>" -> NO_DATA or LOG_RECORD.
- *    write "oemdata <cpunum> <offset> -> format the oem data, goto OEMDATA.
- *    read data -> return the INIT/MCA/CMC/CPE record.
- *    close -> unchanged.  Keep record areas.
- *
- * OEMDATA
- *    write "read <cpunum>" -> NO_DATA or LOG_RECORD.
- *    write "clear <cpunum>" -> NO_DATA or LOG_RECORD.
- *    write "oemdata <cpunum> <offset> -> format the oem data, goto OEMDATA.
- *    read data -> return the formatted oemdata.
- *    close -> unchanged.  Keep record areas.
- *
- * Closing the data file does not change the state.  This allows shell scripts
- * to manipulate salinfo data, each shell redirection opens the file, does one
- * action then closes it again.  The record areas are only freed at close when
- * the state is NO_DATA.
- */
-enum salinfo_state {
-	STATE_NO_DATA,
-	STATE_LOG_RECORD,
-	STATE_OEMDATA,
-};
-
-struct salinfo_data {
-	cpumask_t		cpu_event;	/* which cpus have outstanding events */
-	wait_queue_head_t	read_wait;
-	u8			*log_buffer;
-	u64			log_size;
-	u8			*oemdata;	/* decoded oem data */
-	u64			oemdata_size;
-	int			open;		/* single-open to prevent races */
-	u8			type;
-	u8			saved_num;	/* using a saved record? */
-	enum salinfo_state	state :8;	/* processing state */
-	u8			padding;
-	int			cpu_check;	/* next CPU to check */
-	struct salinfo_data_saved data_saved[5];/* save last 5 records from mca.c, must be < 255 */
-};
-
-static struct salinfo_data salinfo_data[ARRAY_SIZE(salinfo_log_name)];
-
-static DEFINE_SPINLOCK(data_lock);
-static DEFINE_SPINLOCK(data_saved_lock);
-
-/** salinfo_platform_oemdata - optional callback to decode oemdata from an error
- * record.
- * @sect_header: pointer to the start of the section to decode.
- * @oemdata: returns vmalloc area containing the decoded output.
- * @oemdata_size: returns length of decoded output (strlen).
- *
- * Description: If user space asks for oem data to be decoded by the kernel
- * and/or prom and the platform has set salinfo_platform_oemdata to the address
- * of a platform specific routine then call that routine.  salinfo_platform_oemdata
- * vmalloc's and formats its output area, returning the address of the text
- * and its strlen.  Returns 0 for success, -ve for error.  The callback is
- * invoked on the cpu that generated the error record.
- */
-int (*salinfo_platform_oemdata)(const u8 *sect_header, u8 **oemdata, u64 *oemdata_size);
-
-struct salinfo_platform_oemdata_parms {
-	const u8 *efi_guid;
-	u8 **oemdata;
-	u64 *oemdata_size;
-};
-
-static long
-salinfo_platform_oemdata_cpu(void *context)
-{
-	struct salinfo_platform_oemdata_parms *parms = context;
-
-	return salinfo_platform_oemdata(parms->efi_guid, parms->oemdata, parms->oemdata_size);
-}
-
-static void
-shift1_data_saved (struct salinfo_data *data, int shift)
-{
-	memcpy(data->data_saved+shift, data->data_saved+shift+1,
-	       (ARRAY_SIZE(data->data_saved) - (shift+1)) * sizeof(data->data_saved[0]));
-	memset(data->data_saved + ARRAY_SIZE(data->data_saved) - 1, 0,
-	       sizeof(data->data_saved[0]));
-}
-
-/* This routine is invoked in interrupt context.  Note: mca.c enables
- * interrupts before calling this code for CMC/CPE.  MCA and INIT events are
- * not irq safe, do not call any routines that use spinlocks, they may deadlock.
- * MCA and INIT records are recorded, a timer event will look for any
- * outstanding events and wake up the user space code.
- *
- * The buffer passed from mca.c points to the output from ia64_log_get. This is
- * a persistent buffer but its contents can change between the interrupt and
- * when user space processes the record.  Save the record id to identify
- * changes.  If the buffer is NULL then just update the bitmap.
- */
-void
-salinfo_log_wakeup(int type, u8 *buffer, u64 size, int irqsafe)
-{
-	struct salinfo_data *data = salinfo_data + type;
-	struct salinfo_data_saved *data_saved;
-	unsigned long flags = 0;
-	int i;
-	int saved_size = ARRAY_SIZE(data->data_saved);
-
-	BUG_ON(type >= ARRAY_SIZE(salinfo_log_name));
-
-	if (irqsafe)
-		spin_lock_irqsave(&data_saved_lock, flags);
-	if (buffer) {
-		for (i = 0, data_saved = data->data_saved; i < saved_size; ++i, ++data_saved) {
-			if (!data_saved->buffer)
-				break;
-		}
-		if (i == saved_size) {
-			if (!data->saved_num) {
-				shift1_data_saved(data, 0);
-				data_saved = data->data_saved + saved_size - 1;
-			} else
-				data_saved = NULL;
-		}
-		if (data_saved) {
-			data_saved->cpu = smp_processor_id();
-			data_saved->id = ((sal_log_record_header_t *)buffer)->id;
-			data_saved->size = size;
-			data_saved->buffer = buffer;
-		}
-	}
-	cpumask_set_cpu(smp_processor_id(), &data->cpu_event);
-	if (irqsafe) {
-		wake_up_interruptible(&data->read_wait);
-		spin_unlock_irqrestore(&data_saved_lock, flags);
-	}
-}
-
-/* Check for outstanding MCA/INIT records every minute (arbitrary) */
-#define SALINFO_TIMER_DELAY (60*HZ)
-static struct timer_list salinfo_timer;
-extern void ia64_mlogbuf_dump(void);
-
-static void
-salinfo_timeout_check(struct salinfo_data *data)
-{
-	if (!data->open)
-		return;
-	if (!cpumask_empty(&data->cpu_event))
-		wake_up_interruptible(&data->read_wait);
-}
-
-static void
-salinfo_timeout(struct timer_list *unused)
-{
-	ia64_mlogbuf_dump();
-	salinfo_timeout_check(salinfo_data + SAL_INFO_TYPE_MCA);
-	salinfo_timeout_check(salinfo_data + SAL_INFO_TYPE_INIT);
-	salinfo_timer.expires = jiffies + SALINFO_TIMER_DELAY;
-	add_timer(&salinfo_timer);
-}
-
-static int
-salinfo_event_open(struct inode *inode, struct file *file)
-{
-	if (!capable(CAP_SYS_ADMIN))
-		return -EPERM;
-	return 0;
-}
-
-static ssize_t
-salinfo_event_read(struct file *file, char __user *buffer, size_t count, loff_t *ppos)
-{
-	struct salinfo_data *data = pde_data(file_inode(file));
-	char cmd[32];
-	size_t size;
-	int i, n, cpu = -1;
-
-retry:
-	if (cpumask_empty(&data->cpu_event)) {
-		if (file->f_flags & O_NONBLOCK)
-			return -EAGAIN;
-		if (wait_event_interruptible(data->read_wait,
-					     !cpumask_empty(&data->cpu_event)))
-			return -EINTR;
-	}
-
-	n = data->cpu_check;
-	for (i = 0; i < nr_cpu_ids; i++) {
-		if (cpumask_test_cpu(n, &data->cpu_event)) {
-			if (!cpu_online(n)) {
-				cpumask_clear_cpu(n, &data->cpu_event);
-				continue;
-			}
-			cpu = n;
-			break;
-		}
-		if (++n == nr_cpu_ids)
-			n = 0;
-	}
-
-	if (cpu == -1)
-		goto retry;
-
-	ia64_mlogbuf_dump();
-
-	/* for next read, start checking at next CPU */
-	data->cpu_check = cpu;
-	if (++data->cpu_check == nr_cpu_ids)
-		data->cpu_check = 0;
-
-	snprintf(cmd, sizeof(cmd), "read %d\n", cpu);
-
-	size = strlen(cmd);
-	if (size > count)
-		size = count;
-	if (copy_to_user(buffer, cmd, size))
-		return -EFAULT;
-
-	return size;
-}
-
-static const struct proc_ops salinfo_event_proc_ops = {
-	.proc_open	= salinfo_event_open,
-	.proc_read	= salinfo_event_read,
-	.proc_lseek	= noop_llseek,
-};
-
-static int
-salinfo_log_open(struct inode *inode, struct file *file)
-{
-	struct salinfo_data *data = pde_data(inode);
-
-	if (!capable(CAP_SYS_ADMIN))
-		return -EPERM;
-
-	spin_lock(&data_lock);
-	if (data->open) {
-		spin_unlock(&data_lock);
-		return -EBUSY;
-	}
-	data->open = 1;
-	spin_unlock(&data_lock);
-
-	if (data->state == STATE_NO_DATA &&
-	    !(data->log_buffer = vmalloc(ia64_sal_get_state_info_size(data->type)))) {
-		data->open = 0;
-		return -ENOMEM;
-	}
-
-	return 0;
-}
-
-static int
-salinfo_log_release(struct inode *inode, struct file *file)
-{
-	struct salinfo_data *data = pde_data(inode);
-
-	if (data->state == STATE_NO_DATA) {
-		vfree(data->log_buffer);
-		vfree(data->oemdata);
-		data->log_buffer = NULL;
-		data->oemdata = NULL;
-	}
-	spin_lock(&data_lock);
-	data->open = 0;
-	spin_unlock(&data_lock);
-	return 0;
-}
-
-static long
-salinfo_log_read_cpu(void *context)
-{
-	struct salinfo_data *data = context;
-	sal_log_record_header_t *rh;
-	data->log_size = ia64_sal_get_state_info(data->type, (u64 *) data->log_buffer);
-	rh = (sal_log_record_header_t *)(data->log_buffer);
-	/* Clear corrected errors as they are read from SAL */
-	if (rh->severity == sal_log_severity_corrected)
-		ia64_sal_clear_state_info(data->type);
-	return 0;
-}
-
-static void
-salinfo_log_new_read(int cpu, struct salinfo_data *data)
-{
-	struct salinfo_data_saved *data_saved;
-	unsigned long flags;
-	int i;
-	int saved_size = ARRAY_SIZE(data->data_saved);
-
-	data->saved_num = 0;
-	spin_lock_irqsave(&data_saved_lock, flags);
-retry:
-	for (i = 0, data_saved = data->data_saved; i < saved_size; ++i, ++data_saved) {
-		if (data_saved->buffer && data_saved->cpu == cpu) {
-			sal_log_record_header_t *rh = (sal_log_record_header_t *)(data_saved->buffer);
-			data->log_size = data_saved->size;
-			memcpy(data->log_buffer, rh, data->log_size);
-			barrier();	/* id check must not be moved */
-			if (rh->id == data_saved->id) {
-				data->saved_num = i+1;
-				break;
-			}
-			/* saved record changed by mca.c since interrupt, discard it */
-			shift1_data_saved(data, i);
-			goto retry;
-		}
-	}
-	spin_unlock_irqrestore(&data_saved_lock, flags);
-
-	if (!data->saved_num)
-		work_on_cpu_safe(cpu, salinfo_log_read_cpu, data);
-	if (!data->log_size) {
-		data->state = STATE_NO_DATA;
-		cpumask_clear_cpu(cpu, &data->cpu_event);
-	} else {
-		data->state = STATE_LOG_RECORD;
-	}
-}
-
-static ssize_t
-salinfo_log_read(struct file *file, char __user *buffer, size_t count, loff_t *ppos)
-{
-	struct salinfo_data *data = pde_data(file_inode(file));
-	u8 *buf;
-	u64 bufsize;
-
-	if (data->state == STATE_LOG_RECORD) {
-		buf = data->log_buffer;
-		bufsize = data->log_size;
-	} else if (data->state == STATE_OEMDATA) {
-		buf = data->oemdata;
-		bufsize = data->oemdata_size;
-	} else {
-		buf = NULL;
-		bufsize = 0;
-	}
-	return simple_read_from_buffer(buffer, count, ppos, buf, bufsize);
-}
-
-static long
-salinfo_log_clear_cpu(void *context)
-{
-	struct salinfo_data *data = context;
-
-	ia64_sal_clear_state_info(data->type);
-	return 0;
-}
-
-static int
-salinfo_log_clear(struct salinfo_data *data, int cpu)
-{
-	sal_log_record_header_t *rh;
-	unsigned long flags;
-	spin_lock_irqsave(&data_saved_lock, flags);
-	data->state = STATE_NO_DATA;
-	if (!cpumask_test_cpu(cpu, &data->cpu_event)) {
-		spin_unlock_irqrestore(&data_saved_lock, flags);
-		return 0;
-	}
-	cpumask_clear_cpu(cpu, &data->cpu_event);
-	if (data->saved_num) {
-		shift1_data_saved(data, data->saved_num - 1);
-		data->saved_num = 0;
-	}
-	spin_unlock_irqrestore(&data_saved_lock, flags);
-	rh = (sal_log_record_header_t *)(data->log_buffer);
-	/* Corrected errors have already been cleared from SAL */
-	if (rh->severity != sal_log_severity_corrected)
-		work_on_cpu_safe(cpu, salinfo_log_clear_cpu, data);
-	/* clearing a record may make a new record visible */
-	salinfo_log_new_read(cpu, data);
-	if (data->state == STATE_LOG_RECORD) {
-		spin_lock_irqsave(&data_saved_lock, flags);
-		cpumask_set_cpu(cpu, &data->cpu_event);
-		wake_up_interruptible(&data->read_wait);
-		spin_unlock_irqrestore(&data_saved_lock, flags);
-	}
-	return 0;
-}
-
-static ssize_t
-salinfo_log_write(struct file *file, const char __user *buffer, size_t count, loff_t *ppos)
-{
-	struct salinfo_data *data = pde_data(file_inode(file));
-	char cmd[32];
-	size_t size;
-	u32 offset;
-	int cpu;
-
-	size = sizeof(cmd);
-	if (count < size)
-		size = count;
-	if (copy_from_user(cmd, buffer, size))
-		return -EFAULT;
-
-	if (sscanf(cmd, "read %d", &cpu) == 1) {
-		salinfo_log_new_read(cpu, data);
-	} else if (sscanf(cmd, "clear %d", &cpu) == 1) {
-		int ret;
-		if ((ret = salinfo_log_clear(data, cpu)))
-			count = ret;
-	} else if (sscanf(cmd, "oemdata %d %d", &cpu, &offset) == 2) {
-		if (data->state != STATE_LOG_RECORD && data->state != STATE_OEMDATA)
-			return -EINVAL;
-		if (offset > data->log_size - sizeof(efi_guid_t))
-			return -EINVAL;
-		data->state = STATE_OEMDATA;
-		if (salinfo_platform_oemdata) {
-			struct salinfo_platform_oemdata_parms parms = {
-				.efi_guid = data->log_buffer + offset,
-				.oemdata = &data->oemdata,
-				.oemdata_size = &data->oemdata_size
-			};
-			count = work_on_cpu_safe(cpu, salinfo_platform_oemdata_cpu,
-						 &parms);
-		} else
-			data->oemdata_size = 0;
-	} else
-		return -EINVAL;
-
-	return count;
-}
-
-static const struct proc_ops salinfo_data_proc_ops = {
-	.proc_open	= salinfo_log_open,
-	.proc_release	= salinfo_log_release,
-	.proc_read	= salinfo_log_read,
-	.proc_write	= salinfo_log_write,
-	.proc_lseek	= default_llseek,
-};
-
-static int salinfo_cpu_online(unsigned int cpu)
-{
-	unsigned int i, end = ARRAY_SIZE(salinfo_data);
-	struct salinfo_data *data;
-
-	spin_lock_irq(&data_saved_lock);
-	for (i = 0, data = salinfo_data; i < end; ++i, ++data) {
-		cpumask_set_cpu(cpu, &data->cpu_event);
-		wake_up_interruptible(&data->read_wait);
-	}
-	spin_unlock_irq(&data_saved_lock);
-	return 0;
-}
-
-static int salinfo_cpu_pre_down(unsigned int cpu)
-{
-	unsigned int i, end = ARRAY_SIZE(salinfo_data);
-	struct salinfo_data *data;
-
-	spin_lock_irq(&data_saved_lock);
-	for (i = 0, data = salinfo_data; i < end; ++i, ++data) {
-		struct salinfo_data_saved *data_saved;
-		int j = ARRAY_SIZE(data->data_saved) - 1;
-
-		for (data_saved = data->data_saved + j; j >= 0;
-		     --j, --data_saved) {
-			if (data_saved->buffer && data_saved->cpu == cpu)
-				shift1_data_saved(data, j);
-		}
-		cpumask_clear_cpu(cpu, &data->cpu_event);
-	}
-	spin_unlock_irq(&data_saved_lock);
-	return 0;
-}
-
-/*
- * 'data' contains an integer that corresponds to the feature we're
- * testing
- */
-static int __maybe_unused proc_salinfo_show(struct seq_file *m, void *v)
-{
-	unsigned long data = (unsigned long)v;
-	seq_puts(m, (sal_platform_features & data) ? "1\n" : "0\n");
-	return 0;
-}
-
-static int __init
-salinfo_init(void)
-{
-	struct proc_dir_entry *salinfo_dir; /* /proc/sal dir entry */
-	struct proc_dir_entry **sdir = salinfo_proc_entries; /* keeps track of every entry */
-	struct proc_dir_entry *dir, *entry;
-	struct salinfo_data *data;
-	int i;
-
-	salinfo_dir = proc_mkdir("sal", NULL);
-	if (!salinfo_dir)
-		return 0;
-
-	for (i=0; i < NR_SALINFO_ENTRIES; i++) {
-		/* pass the feature bit in question as misc data */
-		*sdir++ = proc_create_single_data(salinfo_entries[i].name, 0,
-				salinfo_dir, proc_salinfo_show,
-				(void *)salinfo_entries[i].feature);
-	}
-
-	for (i = 0; i < ARRAY_SIZE(salinfo_log_name); i++) {
-		data = salinfo_data + i;
-		data->type = i;
-		init_waitqueue_head(&data->read_wait);
-		dir = proc_mkdir(salinfo_log_name[i], salinfo_dir);
-		if (!dir)
-			continue;
-
-		entry = proc_create_data("event", S_IRUSR, dir,
-					 &salinfo_event_proc_ops, data);
-		if (!entry)
-			continue;
-		*sdir++ = entry;
-
-		entry = proc_create_data("data", S_IRUSR | S_IWUSR, dir,
-					 &salinfo_data_proc_ops, data);
-		if (!entry)
-			continue;
-		*sdir++ = entry;
-
-		*sdir++ = dir;
-	}
-
-	*sdir++ = salinfo_dir;
-
-	timer_setup(&salinfo_timer, salinfo_timeout, 0);
-	salinfo_timer.expires = jiffies + SALINFO_TIMER_DELAY;
-	add_timer(&salinfo_timer);
-
-	i = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "ia64/salinfo:online",
-			      salinfo_cpu_online, salinfo_cpu_pre_down);
-	WARN_ON(i < 0);
-	return 0;
-}
-
-module_init(salinfo_init);
diff --git a/arch/ia64/kernel/setup.c b/arch/ia64/kernel/setup.c
deleted file mode 100644
index 5a55ac82c13a..000000000000
--- a/arch/ia64/kernel/setup.c
+++ /dev/null
@@ -1,1081 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * Architecture-specific setup.
- *
- * Copyright (C) 1998-2001, 2003-2004 Hewlett-Packard Co
- *	David Mosberger-Tang <davidm@hpl.hp.com>
- *	Stephane Eranian <eranian@hpl.hp.com>
- * Copyright (C) 2000, 2004 Intel Corp
- * 	Rohit Seth <rohit.seth@intel.com>
- * 	Suresh Siddha <suresh.b.siddha@intel.com>
- * 	Gordon Jin <gordon.jin@intel.com>
- * Copyright (C) 1999 VA Linux Systems
- * Copyright (C) 1999 Walt Drummond <drummond@valinux.com>
- *
- * 12/26/04 S.Siddha, G.Jin, R.Seth
- *			Add multi-threading and multi-core detection
- * 11/12/01 D.Mosberger Convert get_cpuinfo() to seq_file based show_cpuinfo().
- * 04/04/00 D.Mosberger renamed cpu_initialized to cpu_online_map
- * 03/31/00 R.Seth	cpu_initialized and current->processor fixes
- * 02/04/00 D.Mosberger	some more get_cpuinfo fixes...
- * 02/01/00 R.Seth	fixed get_cpuinfo for SMP
- * 01/07/99 S.Eranian	added the support for command line argument
- * 06/24/99 W.Drummond	added boot_cpu_data.
- * 05/28/05 Z. Menyhart	Dynamic stride size for "flush_icache_range()"
- */
-#include <linux/module.h>
-#include <linux/init.h>
-#include <linux/pgtable.h>
-
-#include <linux/acpi.h>
-#include <linux/console.h>
-#include <linux/delay.h>
-#include <linux/cpu.h>
-#include <linux/kdev_t.h>
-#include <linux/kernel.h>
-#include <linux/memblock.h>
-#include <linux/reboot.h>
-#include <linux/sched/mm.h>
-#include <linux/sched/clock.h>
-#include <linux/sched/task_stack.h>
-#include <linux/seq_file.h>
-#include <linux/string.h>
-#include <linux/threads.h>
-#include <linux/screen_info.h>
-#include <linux/dmi.h>
-#include <linux/root_dev.h>
-#include <linux/serial.h>
-#include <linux/serial_core.h>
-#include <linux/efi.h>
-#include <linux/initrd.h>
-#include <linux/pm.h>
-#include <linux/cpufreq.h>
-#include <linux/kexec.h>
-#include <linux/crash_dump.h>
-
-#include <asm/mca.h>
-#include <asm/meminit.h>
-#include <asm/page.h>
-#include <asm/patch.h>
-#include <asm/processor.h>
-#include <asm/sal.h>
-#include <asm/sections.h>
-#include <asm/setup.h>
-#include <asm/smp.h>
-#include <asm/tlbflush.h>
-#include <asm/unistd.h>
-#include <asm/uv/uv.h>
-#include <asm/xtp.h>
-
-#if defined(CONFIG_SMP) && (IA64_CPU_SIZE > PAGE_SIZE)
-# error "struct cpuinfo_ia64 too big!"
-#endif
-
-char ia64_platform_name[64];
-
-#ifdef CONFIG_SMP
-unsigned long __per_cpu_offset[NR_CPUS];
-EXPORT_SYMBOL(__per_cpu_offset);
-#endif
-
-DEFINE_PER_CPU(struct cpuinfo_ia64, ia64_cpu_info);
-EXPORT_SYMBOL(ia64_cpu_info);
-DEFINE_PER_CPU(unsigned long, local_per_cpu_offset);
-#ifdef CONFIG_SMP
-EXPORT_SYMBOL(local_per_cpu_offset);
-#endif
-unsigned long ia64_cycles_per_usec;
-struct ia64_boot_param *ia64_boot_param;
-struct screen_info screen_info;
-unsigned long vga_console_iobase;
-unsigned long vga_console_membase;
-
-static struct resource data_resource = {
-	.name	= "Kernel data",
-	.flags	= IORESOURCE_BUSY | IORESOURCE_SYSTEM_RAM
-};
-
-static struct resource code_resource = {
-	.name	= "Kernel code",
-	.flags	= IORESOURCE_BUSY | IORESOURCE_SYSTEM_RAM
-};
-
-static struct resource bss_resource = {
-	.name	= "Kernel bss",
-	.flags	= IORESOURCE_BUSY | IORESOURCE_SYSTEM_RAM
-};
-
-unsigned long ia64_max_cacheline_size;
-
-unsigned long ia64_iobase;	/* virtual address for I/O accesses */
-EXPORT_SYMBOL(ia64_iobase);
-struct io_space io_space[MAX_IO_SPACES];
-EXPORT_SYMBOL(io_space);
-unsigned int num_io_spaces;
-
-/*
- * "flush_icache_range()" needs to know what processor dependent stride size to use
- * when it makes i-cache(s) coherent with d-caches.
- */
-#define	I_CACHE_STRIDE_SHIFT	5	/* Safest way to go: 32 bytes by 32 bytes */
-unsigned long ia64_i_cache_stride_shift = ~0;
-/*
- * "clflush_cache_range()" needs to know what processor dependent stride size to
- * use when it flushes cache lines including both d-cache and i-cache.
- */
-/* Safest way to go: 32 bytes by 32 bytes */
-#define	CACHE_STRIDE_SHIFT	5
-unsigned long ia64_cache_stride_shift = ~0;
-
-/*
- * We use a special marker for the end of memory and it uses the extra (+1) slot
- */
-struct rsvd_region rsvd_region[IA64_MAX_RSVD_REGIONS + 1] __initdata;
-static int num_rsvd_regions __initdata;
-
-
-/*
- * Filter incoming memory segments based on the primitive map created from the boot
- * parameters. Segments contained in the map are removed from the memory ranges. A
- * caller-specified function is called with the memory ranges that remain after filtering.
- * This routine does not assume the incoming segments are sorted.
- */
-int __init
-filter_rsvd_memory (u64 start, u64 end, void *arg)
-{
-	u64 range_start, range_end, prev_start;
-	void (*func)(unsigned long, unsigned long, int);
-	int i;
-
-#if IGNORE_PFN0
-	if (start == PAGE_OFFSET) {
-		printk(KERN_WARNING "warning: skipping physical page 0\n");
-		start += PAGE_SIZE;
-		if (start >= end) return 0;
-	}
-#endif
-	/*
-	 * lowest possible address(walker uses virtual)
-	 */
-	prev_start = PAGE_OFFSET;
-	func = arg;
-
-	for (i = 0; i < num_rsvd_regions; ++i) {
-		range_start = max(start, prev_start);
-		range_end   = min(end, rsvd_region[i].start);
-
-		if (range_start < range_end)
-			call_pernode_memory(__pa(range_start), range_end - range_start, func);
-
-		/* nothing more available in this segment */
-		if (range_end == end) return 0;
-
-		prev_start = rsvd_region[i].end;
-	}
-	/* end of memory marker allows full processing inside loop body */
-	return 0;
-}
-
-/*
- * Similar to "filter_rsvd_memory()", but the reserved memory ranges
- * are not filtered out.
- */
-int __init
-filter_memory(u64 start, u64 end, void *arg)
-{
-	void (*func)(unsigned long, unsigned long, int);
-
-#if IGNORE_PFN0
-	if (start == PAGE_OFFSET) {
-		printk(KERN_WARNING "warning: skipping physical page 0\n");
-		start += PAGE_SIZE;
-		if (start >= end)
-			return 0;
-	}
-#endif
-	func = arg;
-	if (start < end)
-		call_pernode_memory(__pa(start), end - start, func);
-	return 0;
-}
-
-static void __init
-sort_regions (struct rsvd_region *rsvd_region, int max)
-{
-	int j;
-
-	/* simple bubble sorting */
-	while (max--) {
-		for (j = 0; j < max; ++j) {
-			if (rsvd_region[j].start > rsvd_region[j+1].start) {
-				swap(rsvd_region[j], rsvd_region[j + 1]);
-			}
-		}
-	}
-}
-
-/* merge overlaps */
-static int __init
-merge_regions (struct rsvd_region *rsvd_region, int max)
-{
-	int i;
-	for (i = 1; i < max; ++i) {
-		if (rsvd_region[i].start >= rsvd_region[i-1].end)
-			continue;
-		if (rsvd_region[i].end > rsvd_region[i-1].end)
-			rsvd_region[i-1].end = rsvd_region[i].end;
-		--max;
-		memmove(&rsvd_region[i], &rsvd_region[i+1],
-			(max - i) * sizeof(struct rsvd_region));
-	}
-	return max;
-}
-
-/*
- * Request address space for all standard resources
- */
-static int __init register_memory(void)
-{
-	code_resource.start = ia64_tpa(_text);
-	code_resource.end   = ia64_tpa(_etext) - 1;
-	data_resource.start = ia64_tpa(_etext);
-	data_resource.end   = ia64_tpa(_edata) - 1;
-	bss_resource.start  = ia64_tpa(__bss_start);
-	bss_resource.end    = ia64_tpa(_end) - 1;
-	efi_initialize_iomem_resources(&code_resource, &data_resource,
-			&bss_resource);
-
-	return 0;
-}
-
-__initcall(register_memory);
-
-
-#ifdef CONFIG_KEXEC
-
-/*
- * This function checks if the reserved crashkernel is allowed on the specific
- * IA64 machine flavour. Machines without an IO TLB use swiotlb and require
- * some memory below 4 GB (i.e. in 32 bit area), see the implementation of
- * kernel/dma/swiotlb.c. The hpzx1 architecture has an IO TLB but cannot use that
- * in kdump case. See the comment in sba_init() in sba_iommu.c.
- *
- * So, the only machvec that really supports loading the kdump kernel
- * over 4 GB is "uv".
- */
-static int __init check_crashkernel_memory(unsigned long pbase, size_t size)
-{
-	if (is_uv_system())
-		return 1;
-	else
-		return pbase < (1UL << 32);
-}
-
-static void __init setup_crashkernel(unsigned long total, int *n)
-{
-	unsigned long long base = 0, size = 0;
-	int ret;
-
-	ret = parse_crashkernel(boot_command_line, total,
-			&size, &base);
-	if (ret == 0 && size > 0) {
-		if (!base) {
-			sort_regions(rsvd_region, *n);
-			*n = merge_regions(rsvd_region, *n);
-			base = kdump_find_rsvd_region(size,
-					rsvd_region, *n);
-		}
-
-		if (!check_crashkernel_memory(base, size)) {
-			pr_warn("crashkernel: There would be kdump memory "
-				"at %ld GB but this is unusable because it "
-				"must\nbe below 4 GB. Change the memory "
-				"configuration of the machine.\n",
-				(unsigned long)(base >> 30));
-			return;
-		}
-
-		if (base != ~0UL) {
-			printk(KERN_INFO "Reserving %ldMB of memory at %ldMB "
-					"for crashkernel (System RAM: %ldMB)\n",
-					(unsigned long)(size >> 20),
-					(unsigned long)(base >> 20),
-					(unsigned long)(total >> 20));
-			rsvd_region[*n].start =
-				(unsigned long)__va(base);
-			rsvd_region[*n].end =
-				(unsigned long)__va(base + size);
-			(*n)++;
-			crashk_res.start = base;
-			crashk_res.end = base + size - 1;
-		}
-	}
-	efi_memmap_res.start = ia64_boot_param->efi_memmap;
-	efi_memmap_res.end = efi_memmap_res.start +
-		ia64_boot_param->efi_memmap_size;
-	boot_param_res.start = __pa(ia64_boot_param);
-	boot_param_res.end = boot_param_res.start +
-		sizeof(*ia64_boot_param);
-}
-#else
-static inline void __init setup_crashkernel(unsigned long total, int *n)
-{}
-#endif
-
-#ifdef CONFIG_CRASH_DUMP
-static int __init reserve_elfcorehdr(u64 *start, u64 *end)
-{
-	u64 length;
-
-	/* We get the address using the kernel command line,
-	 * but the size is extracted from the EFI tables.
-	 * Both address and size are required for reservation
-	 * to work properly.
-	 */
-
-	if (!is_vmcore_usable())
-		return -EINVAL;
-
-	if ((length = vmcore_find_descriptor_size(elfcorehdr_addr)) == 0) {
-		vmcore_unusable();
-		return -EINVAL;
-	}
-
-	*start = (unsigned long)__va(elfcorehdr_addr);
-	*end = *start + length;
-	return 0;
-}
-#endif /* CONFIG_CRASH_DUMP */
-
-/**
- * reserve_memory - setup reserved memory areas
- *
- * Setup the reserved memory areas set aside for the boot parameters,
- * initrd, etc.  There are currently %IA64_MAX_RSVD_REGIONS defined,
- * see arch/ia64/include/asm/meminit.h if you need to define more.
- */
-void __init
-reserve_memory (void)
-{
-	int n = 0;
-	unsigned long total_memory;
-
-	/*
-	 * none of the entries in this table overlap
-	 */
-	rsvd_region[n].start = (unsigned long) ia64_boot_param;
-	rsvd_region[n].end   = rsvd_region[n].start + sizeof(*ia64_boot_param);
-	n++;
-
-	rsvd_region[n].start = (unsigned long) __va(ia64_boot_param->efi_memmap);
-	rsvd_region[n].end   = rsvd_region[n].start + ia64_boot_param->efi_memmap_size;
-	n++;
-
-	rsvd_region[n].start = (unsigned long) __va(ia64_boot_param->command_line);
-	rsvd_region[n].end   = (rsvd_region[n].start
-				+ strlen(__va(ia64_boot_param->command_line)) + 1);
-	n++;
-
-	rsvd_region[n].start = (unsigned long) ia64_imva((void *)KERNEL_START);
-	rsvd_region[n].end   = (unsigned long) ia64_imva(_end);
-	n++;
-
-#ifdef CONFIG_BLK_DEV_INITRD
-	if (ia64_boot_param->initrd_start) {
-		rsvd_region[n].start = (unsigned long)__va(ia64_boot_param->initrd_start);
-		rsvd_region[n].end   = rsvd_region[n].start + ia64_boot_param->initrd_size;
-		n++;
-	}
-#endif
-
-#ifdef CONFIG_CRASH_DUMP
-	if (reserve_elfcorehdr(&rsvd_region[n].start,
-			       &rsvd_region[n].end) == 0)
-		n++;
-#endif
-
-	total_memory = efi_memmap_init(&rsvd_region[n].start, &rsvd_region[n].end);
-	n++;
-
-	setup_crashkernel(total_memory, &n);
-
-	/* end of memory marker */
-	rsvd_region[n].start = ~0UL;
-	rsvd_region[n].end   = ~0UL;
-	n++;
-
-	num_rsvd_regions = n;
-	BUG_ON(IA64_MAX_RSVD_REGIONS + 1 < n);
-
-	sort_regions(rsvd_region, num_rsvd_regions);
-	num_rsvd_regions = merge_regions(rsvd_region, num_rsvd_regions);
-
-	/* reserve all regions except the end of memory marker with memblock */
-	for (n = 0; n < num_rsvd_regions - 1; n++) {
-		struct rsvd_region *region = &rsvd_region[n];
-		phys_addr_t addr = __pa(region->start);
-		phys_addr_t size = region->end - region->start;
-
-		memblock_reserve(addr, size);
-	}
-}
-
-/**
- * find_initrd - get initrd parameters from the boot parameter structure
- *
- * Grab the initrd start and end from the boot parameter struct given us by
- * the boot loader.
- */
-void __init
-find_initrd (void)
-{
-#ifdef CONFIG_BLK_DEV_INITRD
-	if (ia64_boot_param->initrd_start) {
-		initrd_start = (unsigned long)__va(ia64_boot_param->initrd_start);
-		initrd_end   = initrd_start+ia64_boot_param->initrd_size;
-
-		printk(KERN_INFO "Initial ramdisk at: 0x%lx (%llu bytes)\n",
-		       initrd_start, ia64_boot_param->initrd_size);
-	}
-#endif
-}
-
-static void __init
-io_port_init (void)
-{
-	unsigned long phys_iobase;
-
-	/*
-	 * Set `iobase' based on the EFI memory map or, failing that, the
-	 * value firmware left in ar.k0.
-	 *
-	 * Note that in ia32 mode, IN/OUT instructions use ar.k0 to compute
-	 * the port's virtual address, so ia32_load_state() loads it with a
-	 * user virtual address.  But in ia64 mode, glibc uses the
-	 * *physical* address in ar.k0 to mmap the appropriate area from
-	 * /dev/mem, and the inX()/outX() interfaces use MMIO.  In both
-	 * cases, user-mode can only use the legacy 0-64K I/O port space.
-	 *
-	 * ar.k0 is not involved in kernel I/O port accesses, which can use
-	 * any of the I/O port spaces and are done via MMIO using the
-	 * virtual mmio_base from the appropriate io_space[].
-	 */
-	phys_iobase = efi_get_iobase();
-	if (!phys_iobase) {
-		phys_iobase = ia64_get_kr(IA64_KR_IO_BASE);
-		printk(KERN_INFO "No I/O port range found in EFI memory map, "
-			"falling back to AR.KR0 (0x%lx)\n", phys_iobase);
-	}
-	ia64_iobase = (unsigned long) ioremap(phys_iobase, 0);
-	ia64_set_kr(IA64_KR_IO_BASE, __pa(ia64_iobase));
-
-	/* setup legacy IO port space */
-	io_space[0].mmio_base = ia64_iobase;
-	io_space[0].sparse = 1;
-	num_io_spaces = 1;
-}
-
-/**
- * early_console_setup - setup debugging console
- *
- * Consoles started here require little enough setup that we can start using
- * them very early in the boot process, either right after the machine
- * vector initialization, or even before if the drivers can detect their hw.
- *
- * Returns non-zero if a console couldn't be setup.
- */
-static inline int __init
-early_console_setup (char *cmdline)
-{
-#ifdef CONFIG_EFI_PCDP
-	if (!efi_setup_pcdp_console(cmdline))
-		return 0;
-#endif
-	return -1;
-}
-
-static void __init
-screen_info_setup(void)
-{
-	unsigned int orig_x, orig_y, num_cols, num_rows, font_height;
-
-	memset(&screen_info, 0, sizeof(screen_info));
-
-	if (!ia64_boot_param->console_info.num_rows ||
-	    !ia64_boot_param->console_info.num_cols) {
-		printk(KERN_WARNING "invalid screen-info, guessing 80x25\n");
-		orig_x = 0;
-		orig_y = 0;
-		num_cols = 80;
-		num_rows = 25;
-		font_height = 16;
-	} else {
-		orig_x = ia64_boot_param->console_info.orig_x;
-		orig_y = ia64_boot_param->console_info.orig_y;
-		num_cols = ia64_boot_param->console_info.num_cols;
-		num_rows = ia64_boot_param->console_info.num_rows;
-		font_height = 400 / num_rows;
-	}
-
-	screen_info.orig_x = orig_x;
-	screen_info.orig_y = orig_y;
-	screen_info.orig_video_cols  = num_cols;
-	screen_info.orig_video_lines = num_rows;
-	screen_info.orig_video_points = font_height;
-	screen_info.orig_video_mode = 3;	/* XXX fake */
-	screen_info.orig_video_isVGA = 1;	/* XXX fake */
-	screen_info.orig_video_ega_bx = 3;	/* XXX fake */
-}
-
-static inline void
-mark_bsp_online (void)
-{
-#ifdef CONFIG_SMP
-	/* If we register an early console, allow CPU 0 to printk */
-	set_cpu_online(smp_processor_id(), true);
-#endif
-}
-
-static __initdata int nomca;
-static __init int setup_nomca(char *s)
-{
-	nomca = 1;
-	return 0;
-}
-early_param("nomca", setup_nomca);
-
-void __init
-setup_arch (char **cmdline_p)
-{
-	unw_init();
-
-	ia64_patch_vtop((u64) __start___vtop_patchlist, (u64) __end___vtop_patchlist);
-
-	*cmdline_p = __va(ia64_boot_param->command_line);
-	strscpy(boot_command_line, *cmdline_p, COMMAND_LINE_SIZE);
-
-	efi_init();
-	io_port_init();
-
-	uv_probe_system_type();
-	parse_early_param();
-
-	if (early_console_setup(*cmdline_p) == 0)
-		mark_bsp_online();
-
-	/* Initialize the ACPI boot-time table parser */
-	acpi_table_init();
-	early_acpi_boot_init();
-#ifdef CONFIG_ACPI_NUMA
-	acpi_numa_init();
-	acpi_numa_fixup();
-#ifdef CONFIG_ACPI_HOTPLUG_CPU
-	prefill_possible_map();
-#endif
-	per_cpu_scan_finalize((cpumask_empty(&early_cpu_possible_map) ?
-		32 : cpumask_weight(&early_cpu_possible_map)),
-		additional_cpus > 0 ? additional_cpus : 0);
-#endif /* CONFIG_ACPI_NUMA */
-
-#ifdef CONFIG_SMP
-	smp_build_cpu_map();
-#endif
-	find_memory();
-
-	/* process SAL system table: */
-	ia64_sal_init(__va(sal_systab_phys));
-
-#ifdef CONFIG_ITANIUM
-	ia64_patch_rse((u64) __start___rse_patchlist, (u64) __end___rse_patchlist);
-#else
-	{
-		unsigned long num_phys_stacked;
-
-		if (ia64_pal_rse_info(&num_phys_stacked, 0) == 0 && num_phys_stacked > 96)
-			ia64_patch_rse((u64) __start___rse_patchlist, (u64) __end___rse_patchlist);
-	}
-#endif
-
-#ifdef CONFIG_SMP
-	cpu_physical_id(0) = hard_smp_processor_id();
-#endif
-
-	cpu_init();	/* initialize the bootstrap CPU */
-	mmu_context_init();	/* initialize context_id bitmap */
-
-#ifdef CONFIG_VT
-	if (!conswitchp) {
-# if defined(CONFIG_VGA_CONSOLE)
-		/*
-		 * Non-legacy systems may route legacy VGA MMIO range to system
-		 * memory.  vga_con probes the MMIO hole, so memory looks like
-		 * a VGA device to it.  The EFI memory map can tell us if it's
-		 * memory so we can avoid this problem.
-		 */
-		if (efi_mem_type(0xA0000) != EFI_CONVENTIONAL_MEMORY)
-			conswitchp = &vga_con;
-# endif
-	}
-#endif
-
-	/* enable IA-64 Machine Check Abort Handling unless disabled */
-	if (!nomca)
-		ia64_mca_init();
-
-	/*
-	 * Default to /dev/sda2.  This assumes that the EFI partition
-	 * is physical disk 1 partition 1 and the Linux root disk is
-	 * physical disk 1 partition 2.
-	 */
-	ROOT_DEV = MKDEV(SCSI_DISK0_MAJOR, 2);
-
-	if (is_uv_system())
-		uv_setup(cmdline_p);
-#ifdef CONFIG_SMP
-	else
-		init_smp_config();
-#endif
-
-	screen_info_setup();
-	paging_init();
-
-	clear_sched_clock_stable();
-}
-
-/*
- * Display cpu info for all CPUs.
- */
-static int
-show_cpuinfo (struct seq_file *m, void *v)
-{
-#ifdef CONFIG_SMP
-#	define lpj	c->loops_per_jiffy
-#	define cpunum	c->cpu
-#else
-#	define lpj	loops_per_jiffy
-#	define cpunum	0
-#endif
-	static struct {
-		unsigned long mask;
-		const char *feature_name;
-	} feature_bits[] = {
-		{ 1UL << 0, "branchlong" },
-		{ 1UL << 1, "spontaneous deferral"},
-		{ 1UL << 2, "16-byte atomic ops" }
-	};
-	char features[128], *cp, *sep;
-	struct cpuinfo_ia64 *c = v;
-	unsigned long mask;
-	unsigned long proc_freq;
-	int i, size;
-
-	mask = c->features;
-
-	/* build the feature string: */
-	memcpy(features, "standard", 9);
-	cp = features;
-	size = sizeof(features);
-	sep = "";
-	for (i = 0; i < ARRAY_SIZE(feature_bits) && size > 1; ++i) {
-		if (mask & feature_bits[i].mask) {
-			cp += snprintf(cp, size, "%s%s", sep,
-				       feature_bits[i].feature_name),
-			sep = ", ";
-			mask &= ~feature_bits[i].mask;
-			size = sizeof(features) - (cp - features);
-		}
-	}
-	if (mask && size > 1) {
-		/* print unknown features as a hex value */
-		snprintf(cp, size, "%s0x%lx", sep, mask);
-	}
-
-	proc_freq = cpufreq_quick_get(cpunum);
-	if (!proc_freq)
-		proc_freq = c->proc_freq / 1000;
-
-	seq_printf(m,
-		   "processor  : %d\n"
-		   "vendor     : %s\n"
-		   "arch       : IA-64\n"
-		   "family     : %u\n"
-		   "model      : %u\n"
-		   "model name : %s\n"
-		   "revision   : %u\n"
-		   "archrev    : %u\n"
-		   "features   : %s\n"
-		   "cpu number : %lu\n"
-		   "cpu regs   : %u\n"
-		   "cpu MHz    : %lu.%03lu\n"
-		   "itc MHz    : %lu.%06lu\n"
-		   "BogoMIPS   : %lu.%02lu\n",
-		   cpunum, c->vendor, c->family, c->model,
-		   c->model_name, c->revision, c->archrev,
-		   features, c->ppn, c->number,
-		   proc_freq / 1000, proc_freq % 1000,
-		   c->itc_freq / 1000000, c->itc_freq % 1000000,
-		   lpj*HZ/500000, (lpj*HZ/5000) % 100);
-#ifdef CONFIG_SMP
-	seq_printf(m, "siblings   : %u\n",
-		   cpumask_weight(&cpu_core_map[cpunum]));
-	if (c->socket_id != -1)
-		seq_printf(m, "physical id: %u\n", c->socket_id);
-	if (c->threads_per_core > 1 || c->cores_per_socket > 1)
-		seq_printf(m,
-			   "core id    : %u\n"
-			   "thread id  : %u\n",
-			   c->core_id, c->thread_id);
-#endif
-	seq_printf(m,"\n");
-
-	return 0;
-}
-
-static void *
-c_start (struct seq_file *m, loff_t *pos)
-{
-#ifdef CONFIG_SMP
-	while (*pos < nr_cpu_ids && !cpu_online(*pos))
-		++*pos;
-#endif
-	return *pos < nr_cpu_ids ? cpu_data(*pos) : NULL;
-}
-
-static void *
-c_next (struct seq_file *m, void *v, loff_t *pos)
-{
-	++*pos;
-	return c_start(m, pos);
-}
-
-static void
-c_stop (struct seq_file *m, void *v)
-{
-}
-
-const struct seq_operations cpuinfo_op = {
-	.start =	c_start,
-	.next =		c_next,
-	.stop =		c_stop,
-	.show =		show_cpuinfo
-};
-
-#define MAX_BRANDS	8
-static char brandname[MAX_BRANDS][128];
-
-static char *
-get_model_name(__u8 family, __u8 model)
-{
-	static int overflow;
-	char brand[128];
-	int i;
-
-	memcpy(brand, "Unknown", 8);
-	if (ia64_pal_get_brand_info(brand)) {
-		if (family == 0x7)
-			memcpy(brand, "Merced", 7);
-		else if (family == 0x1f) switch (model) {
-			case 0: memcpy(brand, "McKinley", 9); break;
-			case 1: memcpy(brand, "Madison", 8); break;
-			case 2: memcpy(brand, "Madison up to 9M cache", 23); break;
-		}
-	}
-	for (i = 0; i < MAX_BRANDS; i++)
-		if (strcmp(brandname[i], brand) == 0)
-			return brandname[i];
-	for (i = 0; i < MAX_BRANDS; i++)
-		if (brandname[i][0] == '\0')
-			return strcpy(brandname[i], brand);
-	if (overflow++ == 0)
-		printk(KERN_ERR
-		       "%s: Table overflow. Some processor model information will be missing\n",
-		       __func__);
-	return "Unknown";
-}
-
-static void
-identify_cpu (struct cpuinfo_ia64 *c)
-{
-	union {
-		unsigned long bits[5];
-		struct {
-			/* id 0 & 1: */
-			char vendor[16];
-
-			/* id 2 */
-			u64 ppn;		/* processor serial number */
-
-			/* id 3: */
-			unsigned number		:  8;
-			unsigned revision	:  8;
-			unsigned model		:  8;
-			unsigned family		:  8;
-			unsigned archrev	:  8;
-			unsigned reserved	: 24;
-
-			/* id 4: */
-			u64 features;
-		} field;
-	} cpuid;
-	pal_vm_info_1_u_t vm1;
-	pal_vm_info_2_u_t vm2;
-	pal_status_t status;
-	unsigned long impl_va_msb = 50, phys_addr_size = 44;	/* Itanium defaults */
-	int i;
-	for (i = 0; i < 5; ++i)
-		cpuid.bits[i] = ia64_get_cpuid(i);
-
-	memcpy(c->vendor, cpuid.field.vendor, 16);
-#ifdef CONFIG_SMP
-	c->cpu = smp_processor_id();
-
-	/* below default values will be overwritten  by identify_siblings() 
-	 * for Multi-Threading/Multi-Core capable CPUs
-	 */
-	c->threads_per_core = c->cores_per_socket = c->num_log = 1;
-	c->socket_id = -1;
-
-	identify_siblings(c);
-
-	if (c->threads_per_core > smp_num_siblings)
-		smp_num_siblings = c->threads_per_core;
-#endif
-	c->ppn = cpuid.field.ppn;
-	c->number = cpuid.field.number;
-	c->revision = cpuid.field.revision;
-	c->model = cpuid.field.model;
-	c->family = cpuid.field.family;
-	c->archrev = cpuid.field.archrev;
-	c->features = cpuid.field.features;
-	c->model_name = get_model_name(c->family, c->model);
-
-	status = ia64_pal_vm_summary(&vm1, &vm2);
-	if (status == PAL_STATUS_SUCCESS) {
-		impl_va_msb = vm2.pal_vm_info_2_s.impl_va_msb;
-		phys_addr_size = vm1.pal_vm_info_1_s.phys_add_size;
-	}
-	c->unimpl_va_mask = ~((7L<<61) | ((1L << (impl_va_msb + 1)) - 1));
-	c->unimpl_pa_mask = ~((1L<<63) | ((1L << phys_addr_size) - 1));
-}
-
-/*
- * Do the following calculations:
- *
- * 1. the max. cache line size.
- * 2. the minimum of the i-cache stride sizes for "flush_icache_range()".
- * 3. the minimum of the cache stride sizes for "clflush_cache_range()".
- */
-static void
-get_cache_info(void)
-{
-	unsigned long line_size, max = 1;
-	unsigned long l, levels, unique_caches;
-	pal_cache_config_info_t cci;
-	long status;
-
-        status = ia64_pal_cache_summary(&levels, &unique_caches);
-        if (status != 0) {
-                printk(KERN_ERR "%s: ia64_pal_cache_summary() failed (status=%ld)\n",
-                       __func__, status);
-                max = SMP_CACHE_BYTES;
-		/* Safest setup for "flush_icache_range()" */
-		ia64_i_cache_stride_shift = I_CACHE_STRIDE_SHIFT;
-		/* Safest setup for "clflush_cache_range()" */
-		ia64_cache_stride_shift = CACHE_STRIDE_SHIFT;
-		goto out;
-        }
-
-	for (l = 0; l < levels; ++l) {
-		/* cache_type (data_or_unified)=2 */
-		status = ia64_pal_cache_config_info(l, 2, &cci);
-		if (status != 0) {
-			printk(KERN_ERR "%s: ia64_pal_cache_config_info"
-				"(l=%lu, 2) failed (status=%ld)\n",
-				__func__, l, status);
-			max = SMP_CACHE_BYTES;
-			/* The safest setup for "flush_icache_range()" */
-			cci.pcci_stride = I_CACHE_STRIDE_SHIFT;
-			/* The safest setup for "clflush_cache_range()" */
-			ia64_cache_stride_shift = CACHE_STRIDE_SHIFT;
-			cci.pcci_unified = 1;
-		} else {
-			if (cci.pcci_stride < ia64_cache_stride_shift)
-				ia64_cache_stride_shift = cci.pcci_stride;
-
-			line_size = 1 << cci.pcci_line_size;
-			if (line_size > max)
-				max = line_size;
-		}
-
-		if (!cci.pcci_unified) {
-			/* cache_type (instruction)=1*/
-			status = ia64_pal_cache_config_info(l, 1, &cci);
-			if (status != 0) {
-				printk(KERN_ERR "%s: ia64_pal_cache_config_info"
-					"(l=%lu, 1) failed (status=%ld)\n",
-					__func__, l, status);
-				/* The safest setup for flush_icache_range() */
-				cci.pcci_stride = I_CACHE_STRIDE_SHIFT;
-			}
-		}
-		if (cci.pcci_stride < ia64_i_cache_stride_shift)
-			ia64_i_cache_stride_shift = cci.pcci_stride;
-	}
-  out:
-	if (max > ia64_max_cacheline_size)
-		ia64_max_cacheline_size = max;
-}
-
-/*
- * cpu_init() initializes state that is per-CPU.  This function acts
- * as a 'CPU state barrier', nothing should get across.
- */
-void
-cpu_init (void)
-{
-	extern void ia64_mmu_init(void *);
-	static unsigned long max_num_phys_stacked = IA64_NUM_PHYS_STACK_REG;
-	unsigned long num_phys_stacked;
-	pal_vm_info_2_u_t vmi;
-	unsigned int max_ctx;
-	struct cpuinfo_ia64 *cpu_info;
-	void *cpu_data;
-
-	cpu_data = per_cpu_init();
-#ifdef CONFIG_SMP
-	/*
-	 * insert boot cpu into sibling and core mapes
-	 * (must be done after per_cpu area is setup)
-	 */
-	if (smp_processor_id() == 0) {
-		cpumask_set_cpu(0, &per_cpu(cpu_sibling_map, 0));
-		cpumask_set_cpu(0, &cpu_core_map[0]);
-	} else {
-		/*
-		 * Set ar.k3 so that assembly code in MCA handler can compute
-		 * physical addresses of per cpu variables with a simple:
-		 *   phys = ar.k3 + &per_cpu_var
-		 * and the alt-dtlb-miss handler can set per-cpu mapping into
-		 * the TLB when needed. head.S already did this for cpu0.
-		 */
-		ia64_set_kr(IA64_KR_PER_CPU_DATA,
-			    ia64_tpa(cpu_data) - (long) __per_cpu_start);
-	}
-#endif
-
-	get_cache_info();
-
-	/*
-	 * We can't pass "local_cpu_data" to identify_cpu() because we haven't called
-	 * ia64_mmu_init() yet.  And we can't call ia64_mmu_init() first because it
-	 * depends on the data returned by identify_cpu().  We break the dependency by
-	 * accessing cpu_data() through the canonical per-CPU address.
-	 */
-	cpu_info = cpu_data + ((char *) &__ia64_per_cpu_var(ia64_cpu_info) - __per_cpu_start);
-	identify_cpu(cpu_info);
-
-#ifdef CONFIG_MCKINLEY
-	{
-#		define FEATURE_SET 16
-		struct ia64_pal_retval iprv;
-
-		if (cpu_info->family == 0x1f) {
-			PAL_CALL_PHYS(iprv, PAL_PROC_GET_FEATURES, 0, FEATURE_SET, 0);
-			if ((iprv.status == 0) && (iprv.v0 & 0x80) && (iprv.v2 & 0x80))
-				PAL_CALL_PHYS(iprv, PAL_PROC_SET_FEATURES,
-				              (iprv.v1 | 0x80), FEATURE_SET, 0);
-		}
-	}
-#endif
-
-	/* Clear the stack memory reserved for pt_regs: */
-	memset(task_pt_regs(current), 0, sizeof(struct pt_regs));
-
-	ia64_set_kr(IA64_KR_FPU_OWNER, 0);
-
-	/*
-	 * Initialize the page-table base register to a global
-	 * directory with all zeroes.  This ensure that we can handle
-	 * TLB-misses to user address-space even before we created the
-	 * first user address-space.  This may happen, e.g., due to
-	 * aggressive use of lfetch.fault.
-	 */
-	ia64_set_kr(IA64_KR_PT_BASE, __pa(ia64_imva(empty_zero_page)));
-
-	/*
-	 * Initialize default control register to defer speculative faults except
-	 * for those arising from TLB misses, which are not deferred.  The
-	 * kernel MUST NOT depend on a particular setting of these bits (in other words,
-	 * the kernel must have recovery code for all speculative accesses).  Turn on
-	 * dcr.lc as per recommendation by the architecture team.  Most IA-32 apps
-	 * shouldn't be affected by this (moral: keep your ia32 locks aligned and you'll
-	 * be fine).
-	 */
-	ia64_setreg(_IA64_REG_CR_DCR,  (  IA64_DCR_DP | IA64_DCR_DK | IA64_DCR_DX | IA64_DCR_DR
-					| IA64_DCR_DA | IA64_DCR_DD | IA64_DCR_LC));
-	mmgrab(&init_mm);
-	current->active_mm = &init_mm;
-	BUG_ON(current->mm);
-
-	ia64_mmu_init(ia64_imva(cpu_data));
-	ia64_mca_cpu_init(ia64_imva(cpu_data));
-
-	/* Clear ITC to eliminate sched_clock() overflows in human time.  */
-	ia64_set_itc(0);
-
-	/* disable all local interrupt sources: */
-	ia64_set_itv(1 << 16);
-	ia64_set_lrr0(1 << 16);
-	ia64_set_lrr1(1 << 16);
-	ia64_setreg(_IA64_REG_CR_PMV, 1 << 16);
-	ia64_setreg(_IA64_REG_CR_CMCV, 1 << 16);
-
-	/* clear TPR & XTP to enable all interrupt classes: */
-	ia64_setreg(_IA64_REG_CR_TPR, 0);
-
-	/* Clear any pending interrupts left by SAL/EFI */
-	while (ia64_get_ivr() != IA64_SPURIOUS_INT_VECTOR)
-		ia64_eoi();
-
-#ifdef CONFIG_SMP
-	normal_xtp();
-#endif
-
-	/* set ia64_ctx.max_rid to the maximum RID that is supported by all CPUs: */
-	if (ia64_pal_vm_summary(NULL, &vmi) == 0) {
-		max_ctx = (1U << (vmi.pal_vm_info_2_s.rid_size - 3)) - 1;
-		setup_ptcg_sem(vmi.pal_vm_info_2_s.max_purges, NPTCG_FROM_PAL);
-	} else {
-		printk(KERN_WARNING "cpu_init: PAL VM summary failed, assuming 18 RID bits\n");
-		max_ctx = (1U << 15) - 1;	/* use architected minimum */
-	}
-	while (max_ctx < ia64_ctx.max_ctx) {
-		unsigned int old = ia64_ctx.max_ctx;
-		if (cmpxchg(&ia64_ctx.max_ctx, old, max_ctx) == old)
-			break;
-	}
-
-	if (ia64_pal_rse_info(&num_phys_stacked, NULL) != 0) {
-		printk(KERN_WARNING "cpu_init: PAL RSE info failed; assuming 96 physical "
-		       "stacked regs\n");
-		num_phys_stacked = 96;
-	}
-	/* size of physical stacked register partition plus 8 bytes: */
-	if (num_phys_stacked > max_num_phys_stacked) {
-		ia64_patch_phys_stack_reg(num_phys_stacked*8 + 8);
-		max_num_phys_stacked = num_phys_stacked;
-	}
-}
-
-void __init arch_cpu_finalize_init(void)
-{
-	ia64_patch_mckinley_e9((unsigned long) __start___mckinley_e9_bundles,
-			       (unsigned long) __end___mckinley_e9_bundles);
-}
-
-static int __init run_dmi_scan(void)
-{
-	dmi_setup();
-	return 0;
-}
-core_initcall(run_dmi_scan);
diff --git a/arch/ia64/kernel/sigframe.h b/arch/ia64/kernel/sigframe.h
deleted file mode 100644
index 58a36ce6c26e..000000000000
--- a/arch/ia64/kernel/sigframe.h
+++ /dev/null
@@ -1,26 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-struct sigscratch {
-	unsigned long scratch_unat;	/* ar.unat for the general registers saved in pt */
-	unsigned long ar_pfs;		/* for syscalls, the user-level function-state  */
-	struct pt_regs pt;
-};
-
-struct sigframe {
-	/*
-	 * Place signal handler args where user-level unwinder can find them easily.
-	 * DO NOT MOVE THESE.  They are part of the IA-64 Linux ABI and there is
-	 * user-level code that depends on their presence!
-	 */
-	unsigned long arg0;		/* signum */
-	unsigned long arg1;		/* siginfo pointer */
-	unsigned long arg2;		/* sigcontext pointer */
-	/*
-	 * End of architected state.
-	 */
-
-	void __user *handler;		/* pointer to the plabel of the signal handler */
-	struct siginfo info;
-	struct sigcontext sc;
-};
-
-extern void ia64_do_signal (struct sigscratch *, long);
diff --git a/arch/ia64/kernel/signal.c b/arch/ia64/kernel/signal.c
deleted file mode 100644
index 51cf6a7ec158..000000000000
--- a/arch/ia64/kernel/signal.c
+++ /dev/null
@@ -1,412 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * Architecture-specific signal handling support.
- *
- * Copyright (C) 1999-2004 Hewlett-Packard Co
- *	David Mosberger-Tang <davidm@hpl.hp.com>
- *
- * Derived from i386 and Alpha versions.
- */
-
-#include <linux/errno.h>
-#include <linux/kernel.h>
-#include <linux/mm.h>
-#include <linux/ptrace.h>
-#include <linux/sched.h>
-#include <linux/signal.h>
-#include <linux/smp.h>
-#include <linux/stddef.h>
-#include <linux/tty.h>
-#include <linux/binfmts.h>
-#include <linux/unistd.h>
-#include <linux/wait.h>
-
-#include <asm/intrinsics.h>
-#include <linux/uaccess.h>
-#include <asm/rse.h>
-#include <asm/sigcontext.h>
-
-#include "sigframe.h"
-
-#define DEBUG_SIG	0
-#define STACK_ALIGN	16		/* minimal alignment for stack pointer */
-
-#if _NSIG_WORDS > 1
-# define PUT_SIGSET(k,u)	__copy_to_user((u)->sig, (k)->sig, sizeof(sigset_t))
-# define GET_SIGSET(k,u)	__copy_from_user((k)->sig, (u)->sig, sizeof(sigset_t))
-#else
-# define PUT_SIGSET(k,u)	__put_user((k)->sig[0], &(u)->sig[0])
-# define GET_SIGSET(k,u)	__get_user((k)->sig[0], &(u)->sig[0])
-#endif
-
-static long
-restore_sigcontext (struct sigcontext __user *sc, struct sigscratch *scr)
-{
-	unsigned long ip, flags, nat, um, cfm, rsc;
-	long err;
-
-	/* Always make any pending restarted system calls return -EINTR */
-	current->restart_block.fn = do_no_restart_syscall;
-
-	/* restore scratch that always needs gets updated during signal delivery: */
-	err  = __get_user(flags, &sc->sc_flags);
-	err |= __get_user(nat, &sc->sc_nat);
-	err |= __get_user(ip, &sc->sc_ip);			/* instruction pointer */
-	err |= __get_user(cfm, &sc->sc_cfm);
-	err |= __get_user(um, &sc->sc_um);			/* user mask */
-	err |= __get_user(rsc, &sc->sc_ar_rsc);
-	err |= __get_user(scr->pt.ar_unat, &sc->sc_ar_unat);
-	err |= __get_user(scr->pt.ar_fpsr, &sc->sc_ar_fpsr);
-	err |= __get_user(scr->pt.ar_pfs, &sc->sc_ar_pfs);
-	err |= __get_user(scr->pt.pr, &sc->sc_pr);		/* predicates */
-	err |= __get_user(scr->pt.b0, &sc->sc_br[0]);		/* b0 (rp) */
-	err |= __get_user(scr->pt.b6, &sc->sc_br[6]);		/* b6 */
-	err |= __copy_from_user(&scr->pt.r1, &sc->sc_gr[1], 8);	/* r1 */
-	err |= __copy_from_user(&scr->pt.r8, &sc->sc_gr[8], 4*8);	/* r8-r11 */
-	err |= __copy_from_user(&scr->pt.r12, &sc->sc_gr[12], 2*8);	/* r12-r13 */
-	err |= __copy_from_user(&scr->pt.r15, &sc->sc_gr[15], 8);	/* r15 */
-
-	scr->pt.cr_ifs = cfm | (1UL << 63);
-	scr->pt.ar_rsc = rsc | (3 << 2); /* force PL3 */
-
-	/* establish new instruction pointer: */
-	scr->pt.cr_iip = ip & ~0x3UL;
-	ia64_psr(&scr->pt)->ri = ip & 0x3;
-	scr->pt.cr_ipsr = (scr->pt.cr_ipsr & ~IA64_PSR_UM) | (um & IA64_PSR_UM);
-
-	scr->scratch_unat = ia64_put_scratch_nat_bits(&scr->pt, nat);
-
-	if (!(flags & IA64_SC_FLAG_IN_SYSCALL)) {
-		/* Restore most scratch-state only when not in syscall. */
-		err |= __get_user(scr->pt.ar_ccv, &sc->sc_ar_ccv);		/* ar.ccv */
-		err |= __get_user(scr->pt.b7, &sc->sc_br[7]);			/* b7 */
-		err |= __get_user(scr->pt.r14, &sc->sc_gr[14]);			/* r14 */
-		err |= __copy_from_user(&scr->pt.ar_csd, &sc->sc_ar25, 2*8); /* ar.csd & ar.ssd */
-		err |= __copy_from_user(&scr->pt.r2, &sc->sc_gr[2], 2*8);	/* r2-r3 */
-		err |= __copy_from_user(&scr->pt.r16, &sc->sc_gr[16], 16*8);	/* r16-r31 */
-	}
-
-	if ((flags & IA64_SC_FLAG_FPH_VALID) != 0) {
-		struct ia64_psr *psr = ia64_psr(&scr->pt);
-
-		err |= __copy_from_user(current->thread.fph, &sc->sc_fr[32], 96*16);
-		psr->mfh = 0;	/* drop signal handler's fph contents... */
-		preempt_disable();
-		if (psr->dfh)
-			ia64_drop_fpu(current);
-		else {
-			/* We already own the local fph, otherwise psr->dfh wouldn't be 0.  */
-			__ia64_load_fpu(current->thread.fph);
-			ia64_set_local_fpu_owner(current);
-		}
-		preempt_enable();
-	}
-	return err;
-}
-
-long
-ia64_rt_sigreturn (struct sigscratch *scr)
-{
-	extern char ia64_strace_leave_kernel, ia64_leave_kernel;
-	struct sigcontext __user *sc;
-	sigset_t set;
-	long retval;
-
-	sc = &((struct sigframe __user *) (scr->pt.r12 + 16))->sc;
-
-	/*
-	 * When we return to the previously executing context, r8 and r10 have already
-	 * been setup the way we want them.  Indeed, if the signal wasn't delivered while
-	 * in a system call, we must not touch r8 or r10 as otherwise user-level state
-	 * could be corrupted.
-	 */
-	retval = (long) &ia64_leave_kernel;
-	if (test_thread_flag(TIF_SYSCALL_TRACE)
-	    || test_thread_flag(TIF_SYSCALL_AUDIT))
-		/*
-		 * strace expects to be notified after sigreturn returns even though the
-		 * context to which we return may not be in the middle of a syscall.
-		 * Thus, the return-value that strace displays for sigreturn is
-		 * meaningless.
-		 */
-		retval = (long) &ia64_strace_leave_kernel;
-
-	if (!access_ok(sc, sizeof(*sc)))
-		goto give_sigsegv;
-
-	if (GET_SIGSET(&set, &sc->sc_mask))
-		goto give_sigsegv;
-
-	set_current_blocked(&set);
-
-	if (restore_sigcontext(sc, scr))
-		goto give_sigsegv;
-
-#if DEBUG_SIG
-	printk("SIG return (%s:%d): sp=%lx ip=%lx\n",
-	       current->comm, current->pid, scr->pt.r12, scr->pt.cr_iip);
-#endif
-	if (restore_altstack(&sc->sc_stack))
-		goto give_sigsegv;
-	return retval;
-
-  give_sigsegv:
-	force_sig(SIGSEGV);
-	return retval;
-}
-
-/*
- * This does just the minimum required setup of sigcontext.
- * Specifically, it only installs data that is either not knowable at
- * the user-level or that gets modified before execution in the
- * trampoline starts.  Everything else is done at the user-level.
- */
-static long
-setup_sigcontext (struct sigcontext __user *sc, sigset_t *mask, struct sigscratch *scr)
-{
-	unsigned long flags = 0, ifs, cfm, nat;
-	long err = 0;
-
-	ifs = scr->pt.cr_ifs;
-
-	if (on_sig_stack((unsigned long) sc))
-		flags |= IA64_SC_FLAG_ONSTACK;
-	if ((ifs & (1UL << 63)) == 0)
-		/* if cr_ifs doesn't have the valid bit set, we got here through a syscall */
-		flags |= IA64_SC_FLAG_IN_SYSCALL;
-	cfm = ifs & ((1UL << 38) - 1);
-	ia64_flush_fph(current);
-	if ((current->thread.flags & IA64_THREAD_FPH_VALID)) {
-		flags |= IA64_SC_FLAG_FPH_VALID;
-		err = __copy_to_user(&sc->sc_fr[32], current->thread.fph, 96*16);
-	}
-
-	nat = ia64_get_scratch_nat_bits(&scr->pt, scr->scratch_unat);
-
-	err |= __put_user(flags, &sc->sc_flags);
-	err |= __put_user(nat, &sc->sc_nat);
-	err |= PUT_SIGSET(mask, &sc->sc_mask);
-	err |= __put_user(cfm, &sc->sc_cfm);
-	err |= __put_user(scr->pt.cr_ipsr & IA64_PSR_UM, &sc->sc_um);
-	err |= __put_user(scr->pt.ar_rsc, &sc->sc_ar_rsc);
-	err |= __put_user(scr->pt.ar_unat, &sc->sc_ar_unat);		/* ar.unat */
-	err |= __put_user(scr->pt.ar_fpsr, &sc->sc_ar_fpsr);		/* ar.fpsr */
-	err |= __put_user(scr->pt.ar_pfs, &sc->sc_ar_pfs);
-	err |= __put_user(scr->pt.pr, &sc->sc_pr);			/* predicates */
-	err |= __put_user(scr->pt.b0, &sc->sc_br[0]);			/* b0 (rp) */
-	err |= __put_user(scr->pt.b6, &sc->sc_br[6]);			/* b6 */
-	err |= __copy_to_user(&sc->sc_gr[1], &scr->pt.r1, 8);		/* r1 */
-	err |= __copy_to_user(&sc->sc_gr[8], &scr->pt.r8, 4*8);		/* r8-r11 */
-	err |= __copy_to_user(&sc->sc_gr[12], &scr->pt.r12, 2*8);	/* r12-r13 */
-	err |= __copy_to_user(&sc->sc_gr[15], &scr->pt.r15, 8);		/* r15 */
-	err |= __put_user(scr->pt.cr_iip + ia64_psr(&scr->pt)->ri, &sc->sc_ip);
-
-	if (!(flags & IA64_SC_FLAG_IN_SYSCALL)) {
-		/* Copy scratch regs to sigcontext if the signal didn't interrupt a syscall. */
-		err |= __put_user(scr->pt.ar_ccv, &sc->sc_ar_ccv);		/* ar.ccv */
-		err |= __put_user(scr->pt.b7, &sc->sc_br[7]);			/* b7 */
-		err |= __put_user(scr->pt.r14, &sc->sc_gr[14]);			/* r14 */
-		err |= __copy_to_user(&sc->sc_ar25, &scr->pt.ar_csd, 2*8); /* ar.csd & ar.ssd */
-		err |= __copy_to_user(&sc->sc_gr[2], &scr->pt.r2, 2*8);		/* r2-r3 */
-		err |= __copy_to_user(&sc->sc_gr[16], &scr->pt.r16, 16*8);	/* r16-r31 */
-	}
-	return err;
-}
-
-/*
- * Check whether the register-backing store is already on the signal stack.
- */
-static inline int
-rbs_on_sig_stack (unsigned long bsp)
-{
-	return (bsp - current->sas_ss_sp < current->sas_ss_size);
-}
-
-static long
-setup_frame(struct ksignal *ksig, sigset_t *set, struct sigscratch *scr)
-{
-	extern char __kernel_sigtramp[];
-	unsigned long tramp_addr, new_rbs = 0, new_sp;
-	struct sigframe __user *frame;
-	long err;
-
-	new_sp = scr->pt.r12;
-	tramp_addr = (unsigned long) __kernel_sigtramp;
-	if (ksig->ka.sa.sa_flags & SA_ONSTACK) {
-		int onstack = sas_ss_flags(new_sp);
-
-		if (onstack == 0) {
-			new_sp = current->sas_ss_sp + current->sas_ss_size;
-			/*
-			 * We need to check for the register stack being on the
-			 * signal stack separately, because it's switched
-			 * separately (memory stack is switched in the kernel,
-			 * register stack is switched in the signal trampoline).
-			 */
-			if (!rbs_on_sig_stack(scr->pt.ar_bspstore))
-				new_rbs = ALIGN(current->sas_ss_sp,
-						sizeof(long));
-		} else if (onstack == SS_ONSTACK) {
-			unsigned long check_sp;
-
-			/*
-			 * If we are on the alternate signal stack and would
-			 * overflow it, don't. Return an always-bogus address
-			 * instead so we will die with SIGSEGV.
-			 */
-			check_sp = (new_sp - sizeof(*frame)) & -STACK_ALIGN;
-			if (!likely(on_sig_stack(check_sp))) {
-				force_sigsegv(ksig->sig);
-				return 1;
-			}
-		}
-	}
-	frame = (void __user *) ((new_sp - sizeof(*frame)) & -STACK_ALIGN);
-
-	if (!access_ok(frame, sizeof(*frame))) {
-		force_sigsegv(ksig->sig);
-		return 1;
-	}
-
-	err  = __put_user(ksig->sig, &frame->arg0);
-	err |= __put_user(&frame->info, &frame->arg1);
-	err |= __put_user(&frame->sc, &frame->arg2);
-	err |= __put_user(new_rbs, &frame->sc.sc_rbs_base);
-	err |= __put_user(0, &frame->sc.sc_loadrs);	/* initialize to zero */
-	err |= __put_user(ksig->ka.sa.sa_handler, &frame->handler);
-
-	err |= copy_siginfo_to_user(&frame->info, &ksig->info);
-
-	err |= __save_altstack(&frame->sc.sc_stack, scr->pt.r12);
-	err |= setup_sigcontext(&frame->sc, set, scr);
-
-	if (unlikely(err)) {
-		force_sigsegv(ksig->sig);
-		return 1;
-	}
-
-	scr->pt.r12 = (unsigned long) frame - 16;	/* new stack pointer */
-	scr->pt.ar_fpsr = FPSR_DEFAULT;			/* reset fpsr for signal handler */
-	scr->pt.cr_iip = tramp_addr;
-	ia64_psr(&scr->pt)->ri = 0;			/* start executing in first slot */
-	ia64_psr(&scr->pt)->be = 0;			/* force little-endian byte-order */
-	/*
-	 * Force the interruption function mask to zero.  This has no effect when a
-	 * system-call got interrupted by a signal (since, in that case, scr->pt_cr_ifs is
-	 * ignored), but it has the desirable effect of making it possible to deliver a
-	 * signal with an incomplete register frame (which happens when a mandatory RSE
-	 * load faults).  Furthermore, it has no negative effect on the getting the user's
-	 * dirty partition preserved, because that's governed by scr->pt.loadrs.
-	 */
-	scr->pt.cr_ifs = (1UL << 63);
-
-	/*
-	 * Note: this affects only the NaT bits of the scratch regs (the ones saved in
-	 * pt_regs), which is exactly what we want.
-	 */
-	scr->scratch_unat = 0; /* ensure NaT bits of r12 is clear */
-
-#if DEBUG_SIG
-	printk("SIG deliver (%s:%d): sig=%d sp=%lx ip=%lx handler=%p\n",
-	       current->comm, current->pid, ksig->sig, scr->pt.r12, frame->sc.sc_ip, frame->handler);
-#endif
-	return 0;
-}
-
-static long
-handle_signal (struct ksignal *ksig, struct sigscratch *scr)
-{
-	int ret = setup_frame(ksig, sigmask_to_save(), scr);
-
-	if (!ret)
-		signal_setup_done(ret, ksig, test_thread_flag(TIF_SINGLESTEP));
-
-	return ret;
-}
-
-/*
- * Note that `init' is a special process: it doesn't get signals it doesn't want to
- * handle.  Thus you cannot kill init even with a SIGKILL even by mistake.
- */
-void
-ia64_do_signal (struct sigscratch *scr, long in_syscall)
-{
-	long restart = in_syscall;
-	long errno = scr->pt.r8;
-	struct ksignal ksig;
-
-	/*
-	 * This only loops in the rare cases of handle_signal() failing, in which case we
-	 * need to push through a forced SIGSEGV.
-	 */
-	while (1) {
-		if (!get_signal(&ksig))
-			break;
-
-		/*
-		 * get_signal() may have run a debugger (via notify_parent())
-		 * and the debugger may have modified the state (e.g., to arrange for an
-		 * inferior call), thus it's important to check for restarting _after_
-		 * get_signal().
-		 */
-		if ((long) scr->pt.r10 != -1)
-			/*
-			 * A system calls has to be restarted only if one of the error codes
-			 * ERESTARTNOHAND, ERESTARTSYS, or ERESTARTNOINTR is returned.  If r10
-			 * isn't -1 then r8 doesn't hold an error code and we don't need to
-			 * restart the syscall, so we can clear the "restart" flag here.
-			 */
-			restart = 0;
-
-		if (ksig.sig <= 0)
-			break;
-
-		if (unlikely(restart)) {
-			switch (errno) {
-			case ERESTART_RESTARTBLOCK:
-			case ERESTARTNOHAND:
-				scr->pt.r8 = EINTR;
-				/* note: scr->pt.r10 is already -1 */
-				break;
-			case ERESTARTSYS:
-				if ((ksig.ka.sa.sa_flags & SA_RESTART) == 0) {
-					scr->pt.r8 = EINTR;
-					/* note: scr->pt.r10 is already -1 */
-					break;
-				}
-				fallthrough;
-			case ERESTARTNOINTR:
-				ia64_decrement_ip(&scr->pt);
-				restart = 0; /* don't restart twice if handle_signal() fails... */
-			}
-		}
-
-		/*
-		 * Whee!  Actually deliver the signal.  If the delivery failed, we need to
-		 * continue to iterate in this loop so we can deliver the SIGSEGV...
-		 */
-		if (handle_signal(&ksig, scr))
-			return;
-	}
-
-	/* Did we come from a system call? */
-	if (restart) {
-		/* Restart the system call - no handlers present */
-		if (errno == ERESTARTNOHAND || errno == ERESTARTSYS || errno == ERESTARTNOINTR
-		    || errno == ERESTART_RESTARTBLOCK)
-		{
-			/*
-			 * Note: the syscall number is in r15 which is saved in
-			 * pt_regs so all we need to do here is adjust ip so that
-			 * the "break" instruction gets re-executed.
-			 */
-			ia64_decrement_ip(&scr->pt);
-			if (errno == ERESTART_RESTARTBLOCK)
-				scr->pt.r15 = __NR_restart_syscall;
-		}
-	}
-
-	/* if there's no signal to deliver, we just put the saved sigmask
-	 * back */
-	restore_saved_sigmask();
-}
diff --git a/arch/ia64/kernel/smp.c b/arch/ia64/kernel/smp.c
deleted file mode 100644
index ea4f009a232b..000000000000
--- a/arch/ia64/kernel/smp.c
+++ /dev/null
@@ -1,335 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * SMP Support
- *
- * Copyright (C) 1999 Walt Drummond <drummond@valinux.com>
- * Copyright (C) 1999, 2001, 2003 David Mosberger-Tang <davidm@hpl.hp.com>
- *
- * Lots of stuff stolen from arch/alpha/kernel/smp.c
- *
- * 01/05/16 Rohit Seth <rohit.seth@intel.com>  IA64-SMP functions. Reorganized
- * the existing code (on the lines of x86 port).
- * 00/09/11 David Mosberger <davidm@hpl.hp.com> Do loops_per_jiffy
- * calibration on each CPU.
- * 00/08/23 Asit Mallick <asit.k.mallick@intel.com> fixed logical processor id
- * 00/03/31 Rohit Seth <rohit.seth@intel.com>	Fixes for Bootstrap Processor
- * & cpu_online_map now gets done here (instead of setup.c)
- * 99/10/05 davidm	Update to bring it in sync with new command-line processing
- *  scheme.
- * 10/13/00 Goutham Rao <goutham.rao@intel.com> Updated smp_call_function and
- *		smp_call_function_single to resend IPI on timeouts
- */
-#include <linux/module.h>
-#include <linux/kernel.h>
-#include <linux/sched.h>
-#include <linux/init.h>
-#include <linux/interrupt.h>
-#include <linux/smp.h>
-#include <linux/kernel_stat.h>
-#include <linux/mm.h>
-#include <linux/cache.h>
-#include <linux/delay.h>
-#include <linux/efi.h>
-#include <linux/bitops.h>
-#include <linux/kexec.h>
-
-#include <linux/atomic.h>
-#include <asm/current.h>
-#include <asm/delay.h>
-#include <asm/io.h>
-#include <asm/irq.h>
-#include <asm/page.h>
-#include <asm/processor.h>
-#include <asm/ptrace.h>
-#include <asm/sal.h>
-#include <asm/tlbflush.h>
-#include <asm/unistd.h>
-#include <asm/mca.h>
-#include <asm/xtp.h>
-
-/*
- * Note: alignment of 4 entries/cacheline was empirically determined
- * to be a good tradeoff between hot cachelines & spreading the array
- * across too many cacheline.
- */
-static struct local_tlb_flush_counts {
-	unsigned int count;
-} __attribute__((__aligned__(32))) local_tlb_flush_counts[NR_CPUS];
-
-static DEFINE_PER_CPU_SHARED_ALIGNED(unsigned short [NR_CPUS],
-				     shadow_flush_counts);
-
-#define IPI_CALL_FUNC		0
-#define IPI_CPU_STOP		1
-#define IPI_CALL_FUNC_SINGLE	2
-#define IPI_KDUMP_CPU_STOP	3
-
-/* This needs to be cacheline aligned because it is written to by *other* CPUs.  */
-static DEFINE_PER_CPU_SHARED_ALIGNED(unsigned long, ipi_operation);
-
-extern void cpu_halt (void);
-
-static void
-stop_this_cpu(void)
-{
-	/*
-	 * Remove this CPU:
-	 */
-	set_cpu_online(smp_processor_id(), false);
-	max_xtp();
-	local_irq_disable();
-	cpu_halt();
-}
-
-void
-cpu_die(void)
-{
-	max_xtp();
-	local_irq_disable();
-	cpu_halt();
-	/* Should never be here */
-	BUG();
-	for (;;);
-}
-
-irqreturn_t
-handle_IPI (int irq, void *dev_id)
-{
-	int this_cpu = get_cpu();
-	unsigned long *pending_ipis = &__ia64_per_cpu_var(ipi_operation);
-	unsigned long ops;
-
-	mb();	/* Order interrupt and bit testing. */
-	while ((ops = xchg(pending_ipis, 0)) != 0) {
-		mb();	/* Order bit clearing and data access. */
-		do {
-			unsigned long which;
-
-			which = ffz(~ops);
-			ops &= ~(1 << which);
-
-			switch (which) {
-			case IPI_CPU_STOP:
-				stop_this_cpu();
-				break;
-			case IPI_CALL_FUNC:
-				generic_smp_call_function_interrupt();
-				break;
-			case IPI_CALL_FUNC_SINGLE:
-				generic_smp_call_function_single_interrupt();
-				break;
-#ifdef CONFIG_KEXEC
-			case IPI_KDUMP_CPU_STOP:
-				unw_init_running(kdump_cpu_freeze, NULL);
-				break;
-#endif
-			default:
-				printk(KERN_CRIT "Unknown IPI on CPU %d: %lu\n",
-						this_cpu, which);
-				break;
-			}
-		} while (ops);
-		mb();	/* Order data access and bit testing. */
-	}
-	put_cpu();
-	return IRQ_HANDLED;
-}
-
-
-
-/*
- * Called with preemption disabled.
- */
-static inline void
-send_IPI_single (int dest_cpu, int op)
-{
-	set_bit(op, &per_cpu(ipi_operation, dest_cpu));
-	ia64_send_ipi(dest_cpu, IA64_IPI_VECTOR, IA64_IPI_DM_INT, 0);
-}
-
-/*
- * Called with preemption disabled.
- */
-static inline void
-send_IPI_allbutself (int op)
-{
-	unsigned int i;
-
-	for_each_online_cpu(i) {
-		if (i != smp_processor_id())
-			send_IPI_single(i, op);
-	}
-}
-
-/*
- * Called with preemption disabled.
- */
-static inline void
-send_IPI_mask(const struct cpumask *mask, int op)
-{
-	unsigned int cpu;
-
-	for_each_cpu(cpu, mask) {
-			send_IPI_single(cpu, op);
-	}
-}
-
-/*
- * Called with preemption disabled.
- */
-static inline void
-send_IPI_all (int op)
-{
-	int i;
-
-	for_each_online_cpu(i) {
-		send_IPI_single(i, op);
-	}
-}
-
-/*
- * Called with preemption disabled.
- */
-static inline void
-send_IPI_self (int op)
-{
-	send_IPI_single(smp_processor_id(), op);
-}
-
-#ifdef CONFIG_KEXEC
-void
-kdump_smp_send_stop(void)
-{
- 	send_IPI_allbutself(IPI_KDUMP_CPU_STOP);
-}
-
-void
-kdump_smp_send_init(void)
-{
-	unsigned int cpu, self_cpu;
-	self_cpu = smp_processor_id();
-	for_each_online_cpu(cpu) {
-		if (cpu != self_cpu) {
-			if(kdump_status[cpu] == 0)
-				ia64_send_ipi(cpu, 0, IA64_IPI_DM_INIT, 0);
-		}
-	}
-}
-#endif
-/*
- * Called with preemption disabled.
- */
-void
-arch_smp_send_reschedule (int cpu)
-{
-	ia64_send_ipi(cpu, IA64_IPI_RESCHEDULE, IA64_IPI_DM_INT, 0);
-}
-EXPORT_SYMBOL_GPL(arch_smp_send_reschedule);
-
-/*
- * Called with preemption disabled.
- */
-static void
-smp_send_local_flush_tlb (int cpu)
-{
-	ia64_send_ipi(cpu, IA64_IPI_LOCAL_TLB_FLUSH, IA64_IPI_DM_INT, 0);
-}
-
-void
-smp_local_flush_tlb(void)
-{
-	/*
-	 * Use atomic ops. Otherwise, the load/increment/store sequence from
-	 * a "++" operation can have the line stolen between the load & store.
-	 * The overhead of the atomic op in negligible in this case & offers
-	 * significant benefit for the brief periods where lots of cpus
-	 * are simultaneously flushing TLBs.
-	 */
-	ia64_fetchadd(1, &local_tlb_flush_counts[smp_processor_id()].count, acq);
-	local_flush_tlb_all();
-}
-
-#define FLUSH_DELAY	5 /* Usec backoff to eliminate excessive cacheline bouncing */
-
-void
-smp_flush_tlb_cpumask(cpumask_t xcpumask)
-{
-	unsigned short *counts = __ia64_per_cpu_var(shadow_flush_counts);
-	cpumask_t cpumask = xcpumask;
-	int mycpu, cpu, flush_mycpu = 0;
-
-	preempt_disable();
-	mycpu = smp_processor_id();
-
-	for_each_cpu(cpu, &cpumask)
-		counts[cpu] = local_tlb_flush_counts[cpu].count & 0xffff;
-
-	mb();
-	for_each_cpu(cpu, &cpumask) {
-		if (cpu == mycpu)
-			flush_mycpu = 1;
-		else
-			smp_send_local_flush_tlb(cpu);
-	}
-
-	if (flush_mycpu)
-		smp_local_flush_tlb();
-
-	for_each_cpu(cpu, &cpumask)
-		while(counts[cpu] == (local_tlb_flush_counts[cpu].count & 0xffff))
-			udelay(FLUSH_DELAY);
-
-	preempt_enable();
-}
-
-void
-smp_flush_tlb_all (void)
-{
-	on_each_cpu((void (*)(void *))local_flush_tlb_all, NULL, 1);
-}
-
-void
-smp_flush_tlb_mm (struct mm_struct *mm)
-{
-	cpumask_var_t cpus;
-	preempt_disable();
-	/* this happens for the common case of a single-threaded fork():  */
-	if (likely(mm == current->active_mm && atomic_read(&mm->mm_users) == 1))
-	{
-		local_finish_flush_tlb_mm(mm);
-		preempt_enable();
-		return;
-	}
-	if (!alloc_cpumask_var(&cpus, GFP_ATOMIC)) {
-		smp_call_function((void (*)(void *))local_finish_flush_tlb_mm,
-			mm, 1);
-	} else {
-		cpumask_copy(cpus, mm_cpumask(mm));
-		smp_call_function_many(cpus,
-			(void (*)(void *))local_finish_flush_tlb_mm, mm, 1);
-		free_cpumask_var(cpus);
-	}
-	local_irq_disable();
-	local_finish_flush_tlb_mm(mm);
-	local_irq_enable();
-	preempt_enable();
-}
-
-void arch_send_call_function_single_ipi(int cpu)
-{
-	send_IPI_single(cpu, IPI_CALL_FUNC_SINGLE);
-}
-
-void arch_send_call_function_ipi_mask(const struct cpumask *mask)
-{
-	send_IPI_mask(mask, IPI_CALL_FUNC);
-}
-
-/*
- * this function calls the 'stop' function on all other CPUs in the system.
- */
-void
-smp_send_stop (void)
-{
-	send_IPI_allbutself(IPI_CPU_STOP);
-}
diff --git a/arch/ia64/kernel/smpboot.c b/arch/ia64/kernel/smpboot.c
deleted file mode 100644
index d0e935cf2093..000000000000
--- a/arch/ia64/kernel/smpboot.c
+++ /dev/null
@@ -1,839 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * SMP boot-related support
- *
- * Copyright (C) 1998-2003, 2005 Hewlett-Packard Co
- *	David Mosberger-Tang <davidm@hpl.hp.com>
- * Copyright (C) 2001, 2004-2005 Intel Corp
- * 	Rohit Seth <rohit.seth@intel.com>
- * 	Suresh Siddha <suresh.b.siddha@intel.com>
- * 	Gordon Jin <gordon.jin@intel.com>
- *	Ashok Raj  <ashok.raj@intel.com>
- *
- * 01/05/16 Rohit Seth <rohit.seth@intel.com>	Moved SMP booting functions from smp.c to here.
- * 01/04/27 David Mosberger <davidm@hpl.hp.com>	Added ITC synching code.
- * 02/07/31 David Mosberger <davidm@hpl.hp.com>	Switch over to hotplug-CPU boot-sequence.
- *						smp_boot_cpus()/smp_commence() is replaced by
- *						smp_prepare_cpus()/__cpu_up()/smp_cpus_done().
- * 04/06/21 Ashok Raj		<ashok.raj@intel.com> Added CPU Hotplug Support
- * 04/12/26 Jin Gordon <gordon.jin@intel.com>
- * 04/12/26 Rohit Seth <rohit.seth@intel.com>
- *						Add multi-threading and multi-core detection
- * 05/01/30 Suresh Siddha <suresh.b.siddha@intel.com>
- *						Setup cpu_sibling_map and cpu_core_map
- */
-
-#include <linux/module.h>
-#include <linux/acpi.h>
-#include <linux/memblock.h>
-#include <linux/cpu.h>
-#include <linux/delay.h>
-#include <linux/init.h>
-#include <linux/interrupt.h>
-#include <linux/irq.h>
-#include <linux/kernel.h>
-#include <linux/kernel_stat.h>
-#include <linux/mm.h>
-#include <linux/notifier.h>
-#include <linux/smp.h>
-#include <linux/spinlock.h>
-#include <linux/efi.h>
-#include <linux/percpu.h>
-#include <linux/bitops.h>
-
-#include <linux/atomic.h>
-#include <asm/cache.h>
-#include <asm/current.h>
-#include <asm/delay.h>
-#include <asm/efi.h>
-#include <asm/io.h>
-#include <asm/irq.h>
-#include <asm/mca.h>
-#include <asm/page.h>
-#include <asm/processor.h>
-#include <asm/ptrace.h>
-#include <asm/sal.h>
-#include <asm/tlbflush.h>
-#include <asm/unistd.h>
-
-#define SMP_DEBUG 0
-
-#if SMP_DEBUG
-#define Dprintk(x...)  printk(x)
-#else
-#define Dprintk(x...)
-#endif
-
-#ifdef CONFIG_HOTPLUG_CPU
-#ifdef CONFIG_PERMIT_BSP_REMOVE
-#define bsp_remove_ok	1
-#else
-#define bsp_remove_ok	0
-#endif
-
-/*
- * Global array allocated for NR_CPUS at boot time
- */
-struct sal_to_os_boot sal_boot_rendez_state[NR_CPUS];
-
-/*
- * start_ap in head.S uses this to store current booting cpu
- * info.
- */
-struct sal_to_os_boot *sal_state_for_booting_cpu = &sal_boot_rendez_state[0];
-
-#define set_brendez_area(x) (sal_state_for_booting_cpu = &sal_boot_rendez_state[(x)]);
-
-#else
-#define set_brendez_area(x)
-#endif
-
-
-/*
- * ITC synchronization related stuff:
- */
-#define MASTER	(0)
-#define SLAVE	(SMP_CACHE_BYTES/8)
-
-#define NUM_ROUNDS	64	/* magic value */
-#define NUM_ITERS	5	/* likewise */
-
-static DEFINE_SPINLOCK(itc_sync_lock);
-static volatile unsigned long go[SLAVE + 1];
-
-#define DEBUG_ITC_SYNC	0
-
-extern void start_ap (void);
-extern unsigned long ia64_iobase;
-
-struct task_struct *task_for_booting_cpu;
-
-/*
- * State for each CPU
- */
-DEFINE_PER_CPU(int, cpu_state);
-
-cpumask_t cpu_core_map[NR_CPUS] __cacheline_aligned;
-EXPORT_SYMBOL(cpu_core_map);
-DEFINE_PER_CPU_SHARED_ALIGNED(cpumask_t, cpu_sibling_map);
-EXPORT_PER_CPU_SYMBOL(cpu_sibling_map);
-
-int smp_num_siblings = 1;
-
-/* which logical CPU number maps to which CPU (physical APIC ID) */
-volatile int ia64_cpu_to_sapicid[NR_CPUS];
-EXPORT_SYMBOL(ia64_cpu_to_sapicid);
-
-static cpumask_t cpu_callin_map;
-
-struct smp_boot_data smp_boot_data __initdata;
-
-unsigned long ap_wakeup_vector = -1; /* External Int use to wakeup APs */
-
-char __initdata no_int_routing;
-
-unsigned char smp_int_redirect; /* are INT and IPI redirectable by the chipset? */
-
-#ifdef CONFIG_FORCE_CPEI_RETARGET
-#define CPEI_OVERRIDE_DEFAULT	(1)
-#else
-#define CPEI_OVERRIDE_DEFAULT	(0)
-#endif
-
-unsigned int force_cpei_retarget = CPEI_OVERRIDE_DEFAULT;
-
-static int __init
-cmdl_force_cpei(char *str)
-{
-	int value=0;
-
-	get_option (&str, &value);
-	force_cpei_retarget = value;
-
-	return 1;
-}
-
-__setup("force_cpei=", cmdl_force_cpei);
-
-static int __init
-nointroute (char *str)
-{
-	no_int_routing = 1;
-	printk ("no_int_routing on\n");
-	return 1;
-}
-
-__setup("nointroute", nointroute);
-
-static void fix_b0_for_bsp(void)
-{
-#ifdef CONFIG_HOTPLUG_CPU
-	int cpuid;
-	static int fix_bsp_b0 = 1;
-
-	cpuid = smp_processor_id();
-
-	/*
-	 * Cache the b0 value on the first AP that comes up
-	 */
-	if (!(fix_bsp_b0 && cpuid))
-		return;
-
-	sal_boot_rendez_state[0].br[0] = sal_boot_rendez_state[cpuid].br[0];
-	printk ("Fixed BSP b0 value from CPU %d\n", cpuid);
-
-	fix_bsp_b0 = 0;
-#endif
-}
-
-void
-sync_master (void *arg)
-{
-	unsigned long flags, i;
-
-	go[MASTER] = 0;
-
-	local_irq_save(flags);
-	{
-		for (i = 0; i < NUM_ROUNDS*NUM_ITERS; ++i) {
-			while (!go[MASTER])
-				cpu_relax();
-			go[MASTER] = 0;
-			go[SLAVE] = ia64_get_itc();
-		}
-	}
-	local_irq_restore(flags);
-}
-
-/*
- * Return the number of cycles by which our itc differs from the itc on the master
- * (time-keeper) CPU.  A positive number indicates our itc is ahead of the master,
- * negative that it is behind.
- */
-static inline long
-get_delta (long *rt, long *master)
-{
-	unsigned long best_t0 = 0, best_t1 = ~0UL, best_tm = 0;
-	unsigned long tcenter, t0, t1, tm;
-	long i;
-
-	for (i = 0; i < NUM_ITERS; ++i) {
-		t0 = ia64_get_itc();
-		go[MASTER] = 1;
-		while (!(tm = go[SLAVE]))
-			cpu_relax();
-		go[SLAVE] = 0;
-		t1 = ia64_get_itc();
-
-		if (t1 - t0 < best_t1 - best_t0)
-			best_t0 = t0, best_t1 = t1, best_tm = tm;
-	}
-
-	*rt = best_t1 - best_t0;
-	*master = best_tm - best_t0;
-
-	/* average best_t0 and best_t1 without overflow: */
-	tcenter = (best_t0/2 + best_t1/2);
-	if (best_t0 % 2 + best_t1 % 2 == 2)
-		++tcenter;
-	return tcenter - best_tm;
-}
-
-/*
- * Synchronize ar.itc of the current (slave) CPU with the ar.itc of the MASTER CPU
- * (normally the time-keeper CPU).  We use a closed loop to eliminate the possibility of
- * unaccounted-for errors (such as getting a machine check in the middle of a calibration
- * step).  The basic idea is for the slave to ask the master what itc value it has and to
- * read its own itc before and after the master responds.  Each iteration gives us three
- * timestamps:
- *
- *	slave		master
- *
- *	t0 ---\
- *             ---\
- *		   --->
- *			tm
- *		   /---
- *	       /---
- *	t1 <---
- *
- *
- * The goal is to adjust the slave's ar.itc such that tm falls exactly half-way between t0
- * and t1.  If we achieve this, the clocks are synchronized provided the interconnect
- * between the slave and the master is symmetric.  Even if the interconnect were
- * asymmetric, we would still know that the synchronization error is smaller than the
- * roundtrip latency (t0 - t1).
- *
- * When the interconnect is quiet and symmetric, this lets us synchronize the itc to
- * within one or two cycles.  However, we can only *guarantee* that the synchronization is
- * accurate to within a round-trip time, which is typically in the range of several
- * hundred cycles (e.g., ~500 cycles).  In practice, this means that the itc's are usually
- * almost perfectly synchronized, but we shouldn't assume that the accuracy is much better
- * than half a micro second or so.
- */
-void
-ia64_sync_itc (unsigned int master)
-{
-	long i, delta, adj, adjust_latency = 0, done = 0;
-	unsigned long flags, rt, master_time_stamp, bound;
-#if DEBUG_ITC_SYNC
-	struct {
-		long rt;	/* roundtrip time */
-		long master;	/* master's timestamp */
-		long diff;	/* difference between midpoint and master's timestamp */
-		long lat;	/* estimate of itc adjustment latency */
-	} t[NUM_ROUNDS];
-#endif
-
-	/*
-	 * Make sure local timer ticks are disabled while we sync.  If
-	 * they were enabled, we'd have to worry about nasty issues
-	 * like setting the ITC ahead of (or a long time before) the
-	 * next scheduled tick.
-	 */
-	BUG_ON((ia64_get_itv() & (1 << 16)) == 0);
-
-	go[MASTER] = 1;
-
-	if (smp_call_function_single(master, sync_master, NULL, 0) < 0) {
-		printk(KERN_ERR "sync_itc: failed to get attention of CPU %u!\n", master);
-		return;
-	}
-
-	while (go[MASTER])
-		cpu_relax();	/* wait for master to be ready */
-
-	spin_lock_irqsave(&itc_sync_lock, flags);
-	{
-		for (i = 0; i < NUM_ROUNDS; ++i) {
-			delta = get_delta(&rt, &master_time_stamp);
-			if (delta == 0) {
-				done = 1;	/* let's lock on to this... */
-				bound = rt;
-			}
-
-			if (!done) {
-				if (i > 0) {
-					adjust_latency += -delta;
-					adj = -delta + adjust_latency/4;
-				} else
-					adj = -delta;
-
-				ia64_set_itc(ia64_get_itc() + adj);
-			}
-#if DEBUG_ITC_SYNC
-			t[i].rt = rt;
-			t[i].master = master_time_stamp;
-			t[i].diff = delta;
-			t[i].lat = adjust_latency/4;
-#endif
-		}
-	}
-	spin_unlock_irqrestore(&itc_sync_lock, flags);
-
-#if DEBUG_ITC_SYNC
-	for (i = 0; i < NUM_ROUNDS; ++i)
-		printk("rt=%5ld master=%5ld diff=%5ld adjlat=%5ld\n",
-		       t[i].rt, t[i].master, t[i].diff, t[i].lat);
-#endif
-
-	printk(KERN_INFO "CPU %d: synchronized ITC with CPU %u (last diff %ld cycles, "
-	       "maxerr %lu cycles)\n", smp_processor_id(), master, delta, rt);
-}
-
-/*
- * Ideally sets up per-cpu profiling hooks.  Doesn't do much now...
- */
-static inline void smp_setup_percpu_timer(void)
-{
-}
-
-static void
-smp_callin (void)
-{
-	int cpuid, phys_id, itc_master;
-	struct cpuinfo_ia64 *last_cpuinfo, *this_cpuinfo;
-	extern void ia64_init_itm(void);
-	extern volatile int time_keeper_id;
-
-	cpuid = smp_processor_id();
-	phys_id = hard_smp_processor_id();
-	itc_master = time_keeper_id;
-
-	if (cpu_online(cpuid)) {
-		printk(KERN_ERR "huh, phys CPU#0x%x, CPU#0x%x already present??\n",
-		       phys_id, cpuid);
-		BUG();
-	}
-
-	fix_b0_for_bsp();
-
-	/*
-	 * numa_node_id() works after this.
-	 */
-	set_numa_node(cpu_to_node_map[cpuid]);
-	set_numa_mem(local_memory_node(cpu_to_node_map[cpuid]));
-
-	spin_lock(&vector_lock);
-	/* Setup the per cpu irq handling data structures */
-	__setup_vector_irq(cpuid);
-	notify_cpu_starting(cpuid);
-	set_cpu_online(cpuid, true);
-	per_cpu(cpu_state, cpuid) = CPU_ONLINE;
-	spin_unlock(&vector_lock);
-
-	smp_setup_percpu_timer();
-
-	ia64_mca_cmc_vector_setup();	/* Setup vector on AP */
-
-	local_irq_enable();
-
-	if (!(sal_platform_features & IA64_SAL_PLATFORM_FEATURE_ITC_DRIFT)) {
-		/*
-		 * Synchronize the ITC with the BP.  Need to do this after irqs are
-		 * enabled because ia64_sync_itc() calls smp_call_function_single(), which
-		 * calls spin_unlock_bh(), which calls spin_unlock_bh(), which calls
-		 * local_bh_enable(), which bugs out if irqs are not enabled...
-		 */
-		Dprintk("Going to syncup ITC with ITC Master.\n");
-		ia64_sync_itc(itc_master);
-	}
-
-	/*
-	 * Get our bogomips.
-	 */
-	ia64_init_itm();
-
-	/*
-	 * Delay calibration can be skipped if new processor is identical to the
-	 * previous processor.
-	 */
-	last_cpuinfo = cpu_data(cpuid - 1);
-	this_cpuinfo = local_cpu_data;
-	if (last_cpuinfo->itc_freq != this_cpuinfo->itc_freq ||
-	    last_cpuinfo->proc_freq != this_cpuinfo->proc_freq ||
-	    last_cpuinfo->features != this_cpuinfo->features ||
-	    last_cpuinfo->revision != this_cpuinfo->revision ||
-	    last_cpuinfo->family != this_cpuinfo->family ||
-	    last_cpuinfo->archrev != this_cpuinfo->archrev ||
-	    last_cpuinfo->model != this_cpuinfo->model)
-		calibrate_delay();
-	local_cpu_data->loops_per_jiffy = loops_per_jiffy;
-
-	/*
-	 * Allow the master to continue.
-	 */
-	cpumask_set_cpu(cpuid, &cpu_callin_map);
-	Dprintk("Stack on CPU %d at about %p\n",cpuid, &cpuid);
-}
-
-
-/*
- * Activate a secondary processor.  head.S calls this.
- */
-int
-start_secondary (void *unused)
-{
-	/* Early console may use I/O ports */
-	ia64_set_kr(IA64_KR_IO_BASE, __pa(ia64_iobase));
-#ifndef CONFIG_PRINTK_TIME
-	Dprintk("start_secondary: starting CPU 0x%x\n", hard_smp_processor_id());
-#endif
-	efi_map_pal_code();
-	cpu_init();
-	smp_callin();
-
-	cpu_startup_entry(CPUHP_AP_ONLINE_IDLE);
-	return 0;
-}
-
-static int
-do_boot_cpu (int sapicid, int cpu, struct task_struct *idle)
-{
-	int timeout;
-
-	task_for_booting_cpu = idle;
-	Dprintk("Sending wakeup vector %lu to AP 0x%x/0x%x.\n", ap_wakeup_vector, cpu, sapicid);
-
-	set_brendez_area(cpu);
-	ia64_send_ipi(cpu, ap_wakeup_vector, IA64_IPI_DM_INT, 0);
-
-	/*
-	 * Wait 10s total for the AP to start
-	 */
-	Dprintk("Waiting on callin_map ...");
-	for (timeout = 0; timeout < 100000; timeout++) {
-		if (cpumask_test_cpu(cpu, &cpu_callin_map))
-			break;  /* It has booted */
-		barrier(); /* Make sure we re-read cpu_callin_map */
-		udelay(100);
-	}
-	Dprintk("\n");
-
-	if (!cpumask_test_cpu(cpu, &cpu_callin_map)) {
-		printk(KERN_ERR "Processor 0x%x/0x%x is stuck.\n", cpu, sapicid);
-		ia64_cpu_to_sapicid[cpu] = -1;
-		set_cpu_online(cpu, false);  /* was set in smp_callin() */
-		return -EINVAL;
-	}
-	return 0;
-}
-
-static int __init
-decay (char *str)
-{
-	int ticks;
-	get_option (&str, &ticks);
-	return 1;
-}
-
-__setup("decay=", decay);
-
-/*
- * Initialize the logical CPU number to SAPICID mapping
- */
-void __init
-smp_build_cpu_map (void)
-{
-	int sapicid, cpu, i;
-	int boot_cpu_id = hard_smp_processor_id();
-
-	for (cpu = 0; cpu < NR_CPUS; cpu++) {
-		ia64_cpu_to_sapicid[cpu] = -1;
-	}
-
-	ia64_cpu_to_sapicid[0] = boot_cpu_id;
-	init_cpu_present(cpumask_of(0));
-	set_cpu_possible(0, true);
-	for (cpu = 1, i = 0; i < smp_boot_data.cpu_count; i++) {
-		sapicid = smp_boot_data.cpu_phys_id[i];
-		if (sapicid == boot_cpu_id)
-			continue;
-		set_cpu_present(cpu, true);
-		set_cpu_possible(cpu, true);
-		ia64_cpu_to_sapicid[cpu] = sapicid;
-		cpu++;
-	}
-}
-
-/*
- * Cycle through the APs sending Wakeup IPIs to boot each.
- */
-void __init
-smp_prepare_cpus (unsigned int max_cpus)
-{
-	int boot_cpu_id = hard_smp_processor_id();
-
-	/*
-	 * Initialize the per-CPU profiling counter/multiplier
-	 */
-
-	smp_setup_percpu_timer();
-
-	cpumask_set_cpu(0, &cpu_callin_map);
-
-	local_cpu_data->loops_per_jiffy = loops_per_jiffy;
-	ia64_cpu_to_sapicid[0] = boot_cpu_id;
-
-	printk(KERN_INFO "Boot processor id 0x%x/0x%x\n", 0, boot_cpu_id);
-
-	current_thread_info()->cpu = 0;
-
-	/*
-	 * If SMP should be disabled, then really disable it!
-	 */
-	if (!max_cpus) {
-		printk(KERN_INFO "SMP mode deactivated.\n");
-		init_cpu_online(cpumask_of(0));
-		init_cpu_present(cpumask_of(0));
-		init_cpu_possible(cpumask_of(0));
-		return;
-	}
-}
-
-void smp_prepare_boot_cpu(void)
-{
-	set_cpu_online(smp_processor_id(), true);
-	cpumask_set_cpu(smp_processor_id(), &cpu_callin_map);
-	set_numa_node(cpu_to_node_map[smp_processor_id()]);
-	per_cpu(cpu_state, smp_processor_id()) = CPU_ONLINE;
-}
-
-#ifdef CONFIG_HOTPLUG_CPU
-static inline void
-clear_cpu_sibling_map(int cpu)
-{
-	int i;
-
-	for_each_cpu(i, &per_cpu(cpu_sibling_map, cpu))
-		cpumask_clear_cpu(cpu, &per_cpu(cpu_sibling_map, i));
-	for_each_cpu(i, &cpu_core_map[cpu])
-		cpumask_clear_cpu(cpu, &cpu_core_map[i]);
-
-	per_cpu(cpu_sibling_map, cpu) = cpu_core_map[cpu] = CPU_MASK_NONE;
-}
-
-static void
-remove_siblinginfo(int cpu)
-{
-	if (cpu_data(cpu)->threads_per_core == 1 &&
-	    cpu_data(cpu)->cores_per_socket == 1) {
-		cpumask_clear_cpu(cpu, &cpu_core_map[cpu]);
-		cpumask_clear_cpu(cpu, &per_cpu(cpu_sibling_map, cpu));
-		return;
-	}
-
-	/* remove it from all sibling map's */
-	clear_cpu_sibling_map(cpu);
-}
-
-extern void fixup_irqs(void);
-
-int migrate_platform_irqs(unsigned int cpu)
-{
-	int new_cpei_cpu;
-	struct irq_data *data = NULL;
-	const struct cpumask *mask;
-	int 		retval = 0;
-
-	/*
-	 * dont permit CPEI target to removed.
-	 */
-	if (cpe_vector > 0 && is_cpu_cpei_target(cpu)) {
-		printk ("CPU (%d) is CPEI Target\n", cpu);
-		if (can_cpei_retarget()) {
-			/*
-			 * Now re-target the CPEI to a different processor
-			 */
-			new_cpei_cpu = cpumask_any(cpu_online_mask);
-			mask = cpumask_of(new_cpei_cpu);
-			set_cpei_target_cpu(new_cpei_cpu);
-			data = irq_get_irq_data(ia64_cpe_irq);
-			/*
-			 * Switch for now, immediately, we need to do fake intr
-			 * as other interrupts, but need to study CPEI behaviour with
-			 * polling before making changes.
-			 */
-			if (data && data->chip) {
-				data->chip->irq_disable(data);
-				data->chip->irq_set_affinity(data, mask, false);
-				data->chip->irq_enable(data);
-				printk ("Re-targeting CPEI to cpu %d\n", new_cpei_cpu);
-			}
-		}
-		if (!data) {
-			printk ("Unable to retarget CPEI, offline cpu [%d] failed\n", cpu);
-			retval = -EBUSY;
-		}
-	}
-	return retval;
-}
-
-/* must be called with cpucontrol mutex held */
-int __cpu_disable(void)
-{
-	int cpu = smp_processor_id();
-
-	/*
-	 * dont permit boot processor for now
-	 */
-	if (cpu == 0 && !bsp_remove_ok) {
-		printk ("Your platform does not support removal of BSP\n");
-		return (-EBUSY);
-	}
-
-	set_cpu_online(cpu, false);
-
-	if (migrate_platform_irqs(cpu)) {
-		set_cpu_online(cpu, true);
-		return -EBUSY;
-	}
-
-	remove_siblinginfo(cpu);
-	fixup_irqs();
-	local_flush_tlb_all();
-	cpumask_clear_cpu(cpu, &cpu_callin_map);
-	return 0;
-}
-
-void __cpu_die(unsigned int cpu)
-{
-	unsigned int i;
-
-	for (i = 0; i < 100; i++) {
-		/* They ack this in play_dead by setting CPU_DEAD */
-		if (per_cpu(cpu_state, cpu) == CPU_DEAD)
-		{
-			printk ("CPU %d is now offline\n", cpu);
-			return;
-		}
-		msleep(100);
-	}
- 	printk(KERN_ERR "CPU %u didn't die...\n", cpu);
-}
-#endif /* CONFIG_HOTPLUG_CPU */
-
-void
-smp_cpus_done (unsigned int dummy)
-{
-	int cpu;
-	unsigned long bogosum = 0;
-
-	/*
-	 * Allow the user to impress friends.
-	 */
-
-	for_each_online_cpu(cpu) {
-		bogosum += cpu_data(cpu)->loops_per_jiffy;
-	}
-
-	printk(KERN_INFO "Total of %d processors activated (%lu.%02lu BogoMIPS).\n",
-	       (int)num_online_cpus(), bogosum/(500000/HZ), (bogosum/(5000/HZ))%100);
-}
-
-static inline void set_cpu_sibling_map(int cpu)
-{
-	int i;
-
-	for_each_online_cpu(i) {
-		if ((cpu_data(cpu)->socket_id == cpu_data(i)->socket_id)) {
-			cpumask_set_cpu(i, &cpu_core_map[cpu]);
-			cpumask_set_cpu(cpu, &cpu_core_map[i]);
-			if (cpu_data(cpu)->core_id == cpu_data(i)->core_id) {
-				cpumask_set_cpu(i,
-						&per_cpu(cpu_sibling_map, cpu));
-				cpumask_set_cpu(cpu,
-						&per_cpu(cpu_sibling_map, i));
-			}
-		}
-	}
-}
-
-int
-__cpu_up(unsigned int cpu, struct task_struct *tidle)
-{
-	int ret;
-	int sapicid;
-
-	sapicid = ia64_cpu_to_sapicid[cpu];
-	if (sapicid == -1)
-		return -EINVAL;
-
-	/*
-	 * Already booted cpu? not valid anymore since we dont
-	 * do idle loop tightspin anymore.
-	 */
-	if (cpumask_test_cpu(cpu, &cpu_callin_map))
-		return -EINVAL;
-
-	per_cpu(cpu_state, cpu) = CPU_UP_PREPARE;
-	/* Processor goes to start_secondary(), sets online flag */
-	ret = do_boot_cpu(sapicid, cpu, tidle);
-	if (ret < 0)
-		return ret;
-
-	if (cpu_data(cpu)->threads_per_core == 1 &&
-	    cpu_data(cpu)->cores_per_socket == 1) {
-		cpumask_set_cpu(cpu, &per_cpu(cpu_sibling_map, cpu));
-		cpumask_set_cpu(cpu, &cpu_core_map[cpu]);
-		return 0;
-	}
-
-	set_cpu_sibling_map(cpu);
-
-	return 0;
-}
-
-/*
- * Assume that CPUs have been discovered by some platform-dependent interface.  For
- * SoftSDV/Lion, that would be ACPI.
- *
- * Setup of the IPI irq handler is done in irq.c:init_IRQ_SMP().
- */
-void __init
-init_smp_config(void)
-{
-	struct fptr {
-		unsigned long fp;
-		unsigned long gp;
-	} *ap_startup;
-	long sal_ret;
-
-	/* Tell SAL where to drop the APs.  */
-	ap_startup = (struct fptr *) start_ap;
-	sal_ret = ia64_sal_set_vectors(SAL_VECTOR_OS_BOOT_RENDEZ,
-				       ia64_tpa(ap_startup->fp), ia64_tpa(ap_startup->gp), 0, 0, 0, 0);
-	if (sal_ret < 0)
-		printk(KERN_ERR "SMP: Can't set SAL AP Boot Rendezvous: %s\n",
-		       ia64_sal_strerror(sal_ret));
-}
-
-/*
- * identify_siblings(cpu) gets called from identify_cpu. This populates the 
- * information related to logical execution units in per_cpu_data structure.
- */
-void identify_siblings(struct cpuinfo_ia64 *c)
-{
-	long status;
-	u16 pltid;
-	pal_logical_to_physical_t info;
-
-	status = ia64_pal_logical_to_phys(-1, &info);
-	if (status != PAL_STATUS_SUCCESS) {
-		if (status != PAL_STATUS_UNIMPLEMENTED) {
-			printk(KERN_ERR
-				"ia64_pal_logical_to_phys failed with %ld\n",
-				status);
-			return;
-		}
-
-		info.overview_ppid = 0;
-		info.overview_cpp  = 1;
-		info.overview_tpc  = 1;
-	}
-
-	status = ia64_sal_physical_id_info(&pltid);
-	if (status != PAL_STATUS_SUCCESS) {
-		if (status != PAL_STATUS_UNIMPLEMENTED)
-			printk(KERN_ERR
-				"ia64_sal_pltid failed with %ld\n",
-				status);
-		return;
-	}
-
-	c->socket_id =  (pltid << 8) | info.overview_ppid;
-
-	if (info.overview_cpp == 1 && info.overview_tpc == 1)
-		return;
-
-	c->cores_per_socket = info.overview_cpp;
-	c->threads_per_core = info.overview_tpc;
-	c->num_log = info.overview_num_log;
-
-	c->core_id = info.log1_cid;
-	c->thread_id = info.log1_tid;
-}
-
-/*
- * returns non zero, if multi-threading is enabled
- * on at least one physical package. Due to hotplug cpu
- * and (maxcpus=), all threads may not necessarily be enabled
- * even though the processor supports multi-threading.
- */
-int is_multithreading_enabled(void)
-{
-	int i, j;
-
-	for_each_present_cpu(i) {
-		for_each_present_cpu(j) {
-			if (j == i)
-				continue;
-			if ((cpu_data(j)->socket_id == cpu_data(i)->socket_id)) {
-				if (cpu_data(j)->core_id == cpu_data(i)->core_id)
-					return 1;
-			}
-		}
-	}
-	return 0;
-}
-EXPORT_SYMBOL_GPL(is_multithreading_enabled);
diff --git a/arch/ia64/kernel/stacktrace.c b/arch/ia64/kernel/stacktrace.c
deleted file mode 100644
index 6e583a6bd2f6..000000000000
--- a/arch/ia64/kernel/stacktrace.c
+++ /dev/null
@@ -1,40 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * arch/ia64/kernel/stacktrace.c
- *
- * Stack trace management functions
- *
- */
-#include <linux/sched.h>
-#include <linux/stacktrace.h>
-#include <linux/module.h>
-
-static void
-ia64_do_save_stack(struct unw_frame_info *info, void *arg)
-{
-	struct stack_trace *trace = arg;
-	unsigned long ip;
-	int skip = trace->skip;
-
-	trace->nr_entries = 0;
-	do {
-		unw_get_ip(info, &ip);
-		if (ip == 0)
-			break;
-		if (skip == 0) {
-			trace->entries[trace->nr_entries++] = ip;
-			if (trace->nr_entries == trace->max_entries)
-				break;
-		} else
-			skip--;
-	} while (unw_unwind(info) >= 0);
-}
-
-/*
- * Save stack-backtrace addresses into a stack_trace buffer.
- */
-void save_stack_trace(struct stack_trace *trace)
-{
-	unw_init_running(ia64_do_save_stack, trace);
-}
-EXPORT_SYMBOL(save_stack_trace);
diff --git a/arch/ia64/kernel/sys_ia64.c b/arch/ia64/kernel/sys_ia64.c
deleted file mode 100644
index eb561cc93632..000000000000
--- a/arch/ia64/kernel/sys_ia64.c
+++ /dev/null
@@ -1,197 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * This file contains various system calls that have different calling
- * conventions on different platforms.
- *
- * Copyright (C) 1999-2000, 2002-2003, 2005 Hewlett-Packard Co
- *	David Mosberger-Tang <davidm@hpl.hp.com>
- */
-#include <linux/errno.h>
-#include <linux/fs.h>
-#include <linux/mm.h>
-#include <linux/mman.h>
-#include <linux/sched.h>
-#include <linux/sched/mm.h>
-#include <linux/sched/task_stack.h>
-#include <linux/shm.h>
-#include <linux/file.h>		/* doh, must come after sched.h... */
-#include <linux/smp.h>
-#include <linux/syscalls.h>
-#include <linux/highuid.h>
-#include <linux/hugetlb.h>
-
-#include <asm/shmparam.h>
-#include <linux/uaccess.h>
-
-unsigned long
-arch_get_unmapped_area (struct file *filp, unsigned long addr, unsigned long len,
-			unsigned long pgoff, unsigned long flags)
-{
-	long map_shared = (flags & MAP_SHARED);
-	unsigned long align_mask = 0;
-	struct mm_struct *mm = current->mm;
-	struct vm_unmapped_area_info info;
-
-	if (len > RGN_MAP_LIMIT)
-		return -ENOMEM;
-
-	/* handle fixed mapping: prevent overlap with huge pages */
-	if (flags & MAP_FIXED) {
-		if (is_hugepage_only_range(mm, addr, len))
-			return -EINVAL;
-		return addr;
-	}
-
-#ifdef CONFIG_HUGETLB_PAGE
-	if (REGION_NUMBER(addr) == RGN_HPAGE)
-		addr = 0;
-#endif
-	if (!addr)
-		addr = TASK_UNMAPPED_BASE;
-
-	if (map_shared && (TASK_SIZE > 0xfffffffful))
-		/*
-		 * For 64-bit tasks, align shared segments to 1MB to avoid potential
-		 * performance penalty due to virtual aliasing (see ASDM).  For 32-bit
-		 * tasks, we prefer to avoid exhausting the address space too quickly by
-		 * limiting alignment to a single page.
-		 */
-		align_mask = PAGE_MASK & (SHMLBA - 1);
-
-	info.flags = 0;
-	info.length = len;
-	info.low_limit = addr;
-	info.high_limit = TASK_SIZE;
-	info.align_mask = align_mask;
-	info.align_offset = pgoff << PAGE_SHIFT;
-	return vm_unmapped_area(&info);
-}
-
-asmlinkage long
-ia64_getpriority (int which, int who)
-{
-	long prio;
-
-	prio = sys_getpriority(which, who);
-	if (prio >= 0) {
-		force_successful_syscall_return();
-		prio = 20 - prio;
-	}
-	return prio;
-}
-
-/* XXX obsolete, but leave it here until the old libc is gone... */
-asmlinkage unsigned long
-sys_getpagesize (void)
-{
-	return PAGE_SIZE;
-}
-
-asmlinkage unsigned long
-ia64_brk (unsigned long brk)
-{
-	unsigned long retval = sys_brk(brk);
-	force_successful_syscall_return();
-	return retval;
-}
-
-/*
- * On IA-64, we return the two file descriptors in ret0 and ret1 (r8
- * and r9) as this is faster than doing a copy_to_user().
- */
-asmlinkage long
-sys_ia64_pipe (void)
-{
-	struct pt_regs *regs = task_pt_regs(current);
-	int fd[2];
-	int retval;
-
-	retval = do_pipe_flags(fd, 0);
-	if (retval)
-		goto out;
-	retval = fd[0];
-	regs->r9 = fd[1];
-  out:
-	return retval;
-}
-
-int ia64_mmap_check(unsigned long addr, unsigned long len,
-		unsigned long flags)
-{
-	unsigned long roff;
-
-	/*
-	 * Don't permit mappings into unmapped space, the virtual page table
-	 * of a region, or across a region boundary.  Note: RGN_MAP_LIMIT is
-	 * equal to 2^n-PAGE_SIZE (for some integer n <= 61) and len > 0.
-	 */
-	roff = REGION_OFFSET(addr);
-	if ((len > RGN_MAP_LIMIT) || (roff > (RGN_MAP_LIMIT - len)))
-		return -EINVAL;
-	return 0;
-}
-
-/*
- * mmap2() is like mmap() except that the offset is expressed in units
- * of PAGE_SIZE (instead of bytes).  This allows to mmap2() (pieces
- * of) files that are larger than the address space of the CPU.
- */
-asmlinkage unsigned long
-sys_mmap2 (unsigned long addr, unsigned long len, int prot, int flags, int fd, long pgoff)
-{
-	addr = ksys_mmap_pgoff(addr, len, prot, flags, fd, pgoff);
-	if (!IS_ERR_VALUE(addr))
-		force_successful_syscall_return();
-	return addr;
-}
-
-asmlinkage unsigned long
-sys_mmap (unsigned long addr, unsigned long len, int prot, int flags, int fd, long off)
-{
-	if (offset_in_page(off) != 0)
-		return -EINVAL;
-
-	addr = ksys_mmap_pgoff(addr, len, prot, flags, fd, off >> PAGE_SHIFT);
-	if (!IS_ERR_VALUE(addr))
-		force_successful_syscall_return();
-	return addr;
-}
-
-asmlinkage unsigned long
-ia64_mremap (unsigned long addr, unsigned long old_len, unsigned long new_len, unsigned long flags,
-	     unsigned long new_addr)
-{
-	addr = sys_mremap(addr, old_len, new_len, flags, new_addr);
-	if (!IS_ERR_VALUE(addr))
-		force_successful_syscall_return();
-	return addr;
-}
-
-asmlinkage long
-ia64_clock_getres(const clockid_t which_clock, struct __kernel_timespec __user *tp)
-{
-	struct timespec64 rtn_tp;
-	s64 tick_ns;
-
-	/*
-	 * ia64's clock_gettime() syscall is implemented as a vdso call
-	 * fsys_clock_gettime(). Currently it handles only
-	 * CLOCK_REALTIME and CLOCK_MONOTONIC. Both are based on
-	 * 'ar.itc' counter which gets incremented at a constant
-	 * frequency. It's usually 400MHz, ~2.5x times slower than CPU
-	 * clock frequency. Which is almost a 1ns hrtimer, but not quite.
-	 *
-	 * Let's special-case these timers to report correct precision
-	 * based on ITC frequency and not HZ frequency for supported
-	 * clocks.
-	 */
-	switch (which_clock) {
-	case CLOCK_REALTIME:
-	case CLOCK_MONOTONIC:
-		tick_ns = DIV_ROUND_UP(NSEC_PER_SEC, local_cpu_data->itc_freq);
-		rtn_tp = ns_to_timespec64(tick_ns);
-		return put_timespec64(&rtn_tp, tp);
-	}
-
-	return sys_clock_getres(which_clock, tp);
-}
diff --git a/arch/ia64/kernel/syscalls/Makefile b/arch/ia64/kernel/syscalls/Makefile
deleted file mode 100644
index d009f927a048..000000000000
--- a/arch/ia64/kernel/syscalls/Makefile
+++ /dev/null
@@ -1,32 +0,0 @@
-# SPDX-License-Identifier: GPL-2.0
-kapi := arch/$(SRCARCH)/include/generated/asm
-uapi := arch/$(SRCARCH)/include/generated/uapi/asm
-
-$(shell mkdir -p $(uapi) $(kapi))
-
-syscall := $(src)/syscall.tbl
-syshdr := $(srctree)/scripts/syscallhdr.sh
-systbl := $(srctree)/scripts/syscalltbl.sh
-
-quiet_cmd_syshdr = SYSHDR  $@
-      cmd_syshdr = $(CONFIG_SHELL) $(syshdr) --emit-nr --offset __NR_Linux $< $@
-
-quiet_cmd_systbl = SYSTBL  $@
-      cmd_systbl = $(CONFIG_SHELL) $(systbl) $< $@
-
-$(uapi)/unistd_64.h: $(syscall) $(syshdr) FORCE
-	$(call if_changed,syshdr)
-
-$(kapi)/syscall_table.h: $(syscall) $(systbl) FORCE
-	$(call if_changed,systbl)
-
-uapisyshdr-y		+= unistd_64.h
-kapisyshdr-y		+= syscall_table.h
-
-uapisyshdr-y	:= $(addprefix $(uapi)/, $(uapisyshdr-y))
-kapisyshdr-y	:= $(addprefix $(kapi)/, $(kapisyshdr-y))
-targets		+= $(addprefix ../../../../, $(uapisyshdr-y) $(kapisyshdr-y))
-
-PHONY += all
-all: $(uapisyshdr-y) $(kapisyshdr-y)
-	@:
diff --git a/arch/ia64/kernel/syscalls/syscall.tbl b/arch/ia64/kernel/syscalls/syscall.tbl
deleted file mode 100644
index 83d8609aec03..000000000000
--- a/arch/ia64/kernel/syscalls/syscall.tbl
+++ /dev/null
@@ -1,375 +0,0 @@
-# SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note
-#
-# Linux system call numbers and entry vectors for ia64
-#
-# The format is:
-# <number> <abi> <name> <entry point>
-#
-# Add 1024 to <number> will get the actual system call number
-#
-# The <abi> is always "common" for this file
-#
-0	common	ni_syscall			sys_ni_syscall
-1	common	exit				sys_exit
-2	common	read				sys_read
-3	common	write				sys_write
-4	common	open				sys_open
-5	common	close				sys_close
-6	common	creat				sys_creat
-7	common	link				sys_link
-8	common	unlink				sys_unlink
-9	common	execve				ia64_execve
-10	common	chdir				sys_chdir
-11	common	fchdir				sys_fchdir
-12	common	utimes				sys_utimes
-13	common	mknod				sys_mknod
-14	common	chmod				sys_chmod
-15	common	chown				sys_chown
-16	common	lseek				sys_lseek
-17	common	getpid				sys_getpid
-18	common	getppid				sys_getppid
-19	common	mount				sys_mount
-20	common	umount2				sys_umount
-21	common	setuid				sys_setuid
-22	common	getuid				sys_getuid
-23	common	geteuid				sys_geteuid
-24	common	ptrace				sys_ptrace
-25	common	access				sys_access
-26	common	sync				sys_sync
-27	common	fsync				sys_fsync
-28	common	fdatasync			sys_fdatasync
-29	common	kill				sys_kill
-30	common	rename				sys_rename
-31	common	mkdir				sys_mkdir
-32	common	rmdir				sys_rmdir
-33	common	dup				sys_dup
-34	common	pipe				sys_ia64_pipe
-35	common	times				sys_times
-36	common	brk				ia64_brk
-37	common	setgid				sys_setgid
-38	common	getgid				sys_getgid
-39	common	getegid				sys_getegid
-40	common	acct				sys_acct
-41	common	ioctl				sys_ioctl
-42	common	fcntl				sys_fcntl
-43	common	umask				sys_umask
-44	common	chroot				sys_chroot
-45	common	ustat				sys_ustat
-46	common	dup2				sys_dup2
-47	common	setreuid			sys_setreuid
-48	common	setregid			sys_setregid
-49	common	getresuid			sys_getresuid
-50	common	setresuid			sys_setresuid
-51	common	getresgid			sys_getresgid
-52	common	setresgid			sys_setresgid
-53	common	getgroups			sys_getgroups
-54	common	setgroups			sys_setgroups
-55	common	getpgid				sys_getpgid
-56	common	setpgid				sys_setpgid
-57	common	setsid				sys_setsid
-58	common	getsid				sys_getsid
-59	common	sethostname			sys_sethostname
-60	common	setrlimit			sys_setrlimit
-61	common	getrlimit			sys_getrlimit
-62	common	getrusage			sys_getrusage
-63	common	gettimeofday			sys_gettimeofday
-64	common	settimeofday			sys_settimeofday
-65	common	select				sys_select
-66	common	poll				sys_poll
-67	common	symlink				sys_symlink
-68	common	readlink			sys_readlink
-69	common	uselib				sys_uselib
-70	common	swapon				sys_swapon
-71	common	swapoff				sys_swapoff
-72	common	reboot				sys_reboot
-73	common	truncate			sys_truncate
-74	common	ftruncate			sys_ftruncate
-75	common	fchmod				sys_fchmod
-76	common	fchown				sys_fchown
-77	common	getpriority			ia64_getpriority
-78	common	setpriority			sys_setpriority
-79	common	statfs				sys_statfs
-80	common	fstatfs				sys_fstatfs
-81	common	gettid				sys_gettid
-82	common	semget				sys_semget
-83	common	semop				sys_semop
-84	common	semctl				sys_semctl
-85	common	msgget				sys_msgget
-86	common	msgsnd				sys_msgsnd
-87	common	msgrcv				sys_msgrcv
-88	common	msgctl				sys_msgctl
-89	common	shmget				sys_shmget
-90	common	shmat				sys_shmat
-91	common	shmdt				sys_shmdt
-92	common	shmctl				sys_shmctl
-93	common	syslog				sys_syslog
-94	common	setitimer			sys_setitimer
-95	common	getitimer			sys_getitimer
-# 1120 was old_stat
-# 1121 was old_lstat
-# 1122 was old_fstat
-99	common	vhangup				sys_vhangup
-100	common	lchown				sys_lchown
-101	common	remap_file_pages		sys_remap_file_pages
-102	common	wait4				sys_wait4
-103	common	sysinfo				sys_sysinfo
-104	common	clone				sys_clone
-105	common	setdomainname			sys_setdomainname
-106	common	uname				sys_newuname
-107	common	adjtimex			sys_adjtimex
-# 1132 was create_module
-109	common	init_module			sys_init_module
-110	common	delete_module			sys_delete_module
-# 1135 was get_kernel_syms
-# 1136 was query_module
-113	common	quotactl			sys_quotactl
-114	common	bdflush				sys_ni_syscall
-115	common	sysfs				sys_sysfs
-116	common	personality			sys_personality
-117	common	afs_syscall			sys_ni_syscall
-118	common	setfsuid			sys_setfsuid
-119	common	setfsgid			sys_setfsgid
-120	common	getdents			sys_getdents
-121	common	flock				sys_flock
-122	common	readv				sys_readv
-123	common	writev				sys_writev
-124	common	pread64				sys_pread64
-125	common	pwrite64			sys_pwrite64
-126	common	_sysctl				sys_ni_syscall
-127	common	mmap				sys_mmap
-128	common	munmap				sys_munmap
-129	common	mlock				sys_mlock
-130	common	mlockall			sys_mlockall
-131	common	mprotect			sys_mprotect
-132	common	mremap				ia64_mremap
-133	common	msync				sys_msync
-134	common	munlock				sys_munlock
-135	common	munlockall			sys_munlockall
-136	common	sched_getparam			sys_sched_getparam
-137	common	sched_setparam			sys_sched_setparam
-138	common	sched_getscheduler		sys_sched_getscheduler
-139	common	sched_setscheduler		sys_sched_setscheduler
-140	common	sched_yield			sys_sched_yield
-141	common	sched_get_priority_max		sys_sched_get_priority_max
-142	common	sched_get_priority_min		sys_sched_get_priority_min
-143	common	sched_rr_get_interval		sys_sched_rr_get_interval
-144	common	nanosleep			sys_nanosleep
-145	common	nfsservctl			sys_ni_syscall
-146	common	prctl				sys_prctl
-147	common	old_getpagesize			sys_getpagesize
-148	common	mmap2				sys_mmap2
-149	common	pciconfig_read			sys_pciconfig_read
-150	common	pciconfig_write			sys_pciconfig_write
-151	common	perfmonctl			sys_ni_syscall
-152	common	sigaltstack			sys_sigaltstack
-153	common	rt_sigaction			sys_rt_sigaction
-154	common	rt_sigpending			sys_rt_sigpending
-155	common	rt_sigprocmask			sys_rt_sigprocmask
-156	common	rt_sigqueueinfo			sys_rt_sigqueueinfo
-157	common	rt_sigreturn			sys_rt_sigreturn
-158	common	rt_sigsuspend			sys_rt_sigsuspend
-159	common	rt_sigtimedwait			sys_rt_sigtimedwait
-160	common	getcwd				sys_getcwd
-161	common	capget				sys_capget
-162	common	capset				sys_capset
-163	common	sendfile			sys_sendfile64
-164	common	getpmsg				sys_ni_syscall
-165	common	putpmsg				sys_ni_syscall
-166	common	socket				sys_socket
-167	common	bind				sys_bind
-168	common	connect				sys_connect
-169	common	listen				sys_listen
-170	common	accept				sys_accept
-171	common	getsockname			sys_getsockname
-172	common	getpeername			sys_getpeername
-173	common	socketpair			sys_socketpair
-174	common	send				sys_send
-175	common	sendto				sys_sendto
-176	common	recv				sys_recv
-177	common	recvfrom			sys_recvfrom
-178	common	shutdown			sys_shutdown
-179	common	setsockopt			sys_setsockopt
-180	common	getsockopt			sys_getsockopt
-181	common	sendmsg				sys_sendmsg
-182	common	recvmsg				sys_recvmsg
-183	common	pivot_root			sys_pivot_root
-184	common	mincore				sys_mincore
-185	common	madvise				sys_madvise
-186	common	stat				sys_newstat
-187	common	lstat				sys_newlstat
-188	common	fstat				sys_newfstat
-189	common	clone2				sys_clone2
-190	common	getdents64			sys_getdents64
-191	common	getunwind			sys_getunwind
-192	common	readahead			sys_readahead
-193	common	setxattr			sys_setxattr
-194	common	lsetxattr			sys_lsetxattr
-195	common	fsetxattr			sys_fsetxattr
-196	common	getxattr			sys_getxattr
-197	common	lgetxattr			sys_lgetxattr
-198	common	fgetxattr			sys_fgetxattr
-199	common	listxattr			sys_listxattr
-200	common	llistxattr			sys_llistxattr
-201	common	flistxattr			sys_flistxattr
-202	common	removexattr			sys_removexattr
-203	common	lremovexattr			sys_lremovexattr
-204	common	fremovexattr			sys_fremovexattr
-205	common	tkill				sys_tkill
-206	common	futex				sys_futex
-207	common	sched_setaffinity		sys_sched_setaffinity
-208	common	sched_getaffinity		sys_sched_getaffinity
-209	common	set_tid_address			sys_set_tid_address
-210	common	fadvise64			sys_fadvise64_64
-211	common	tgkill				sys_tgkill
-212	common	exit_group			sys_exit_group
-213	common	lookup_dcookie			sys_lookup_dcookie
-214	common	io_setup			sys_io_setup
-215	common	io_destroy			sys_io_destroy
-216	common	io_getevents			sys_io_getevents
-217	common	io_submit			sys_io_submit
-218	common	io_cancel			sys_io_cancel
-219	common	epoll_create			sys_epoll_create
-220	common	epoll_ctl			sys_epoll_ctl
-221	common	epoll_wait			sys_epoll_wait
-222	common	restart_syscall			sys_restart_syscall
-223	common	semtimedop			sys_semtimedop
-224	common	timer_create			sys_timer_create
-225	common	timer_settime			sys_timer_settime
-226	common	timer_gettime			sys_timer_gettime
-227	common	timer_getoverrun		sys_timer_getoverrun
-228	common	timer_delete			sys_timer_delete
-229	common	clock_settime			sys_clock_settime
-230	common	clock_gettime			sys_clock_gettime
-231	common	clock_getres			ia64_clock_getres
-232	common	clock_nanosleep			sys_clock_nanosleep
-233	common	fstatfs64			sys_fstatfs64
-234	common	statfs64			sys_statfs64
-235	common	mbind				sys_mbind
-236	common	get_mempolicy			sys_get_mempolicy
-237	common	set_mempolicy			sys_set_mempolicy
-238	common	mq_open				sys_mq_open
-239	common	mq_unlink			sys_mq_unlink
-240	common	mq_timedsend			sys_mq_timedsend
-241	common	mq_timedreceive			sys_mq_timedreceive
-242	common	mq_notify			sys_mq_notify
-243	common	mq_getsetattr			sys_mq_getsetattr
-244	common	kexec_load			sys_kexec_load
-245	common	vserver				sys_ni_syscall
-246	common	waitid				sys_waitid
-247	common	add_key				sys_add_key
-248	common	request_key			sys_request_key
-249	common	keyctl				sys_keyctl
-250	common	ioprio_set			sys_ioprio_set
-251	common	ioprio_get			sys_ioprio_get
-252	common	move_pages			sys_move_pages
-253	common	inotify_init			sys_inotify_init
-254	common	inotify_add_watch		sys_inotify_add_watch
-255	common	inotify_rm_watch		sys_inotify_rm_watch
-256	common	migrate_pages			sys_migrate_pages
-257	common	openat				sys_openat
-258	common	mkdirat				sys_mkdirat
-259	common	mknodat				sys_mknodat
-260	common	fchownat			sys_fchownat
-261	common	futimesat			sys_futimesat
-262	common	newfstatat			sys_newfstatat
-263	common	unlinkat			sys_unlinkat
-264	common	renameat			sys_renameat
-265	common	linkat				sys_linkat
-266	common	symlinkat			sys_symlinkat
-267	common	readlinkat			sys_readlinkat
-268	common	fchmodat			sys_fchmodat
-269	common	faccessat			sys_faccessat
-270	common	pselect6			sys_pselect6
-271	common	ppoll				sys_ppoll
-272	common	unshare				sys_unshare
-273	common	splice				sys_splice
-274	common	set_robust_list			sys_set_robust_list
-275	common	get_robust_list			sys_get_robust_list
-276	common	sync_file_range			sys_sync_file_range
-277	common	tee				sys_tee
-278	common	vmsplice			sys_vmsplice
-279	common	fallocate			sys_fallocate
-280	common	getcpu				sys_getcpu
-281	common	epoll_pwait			sys_epoll_pwait
-282	common	utimensat			sys_utimensat
-283	common	signalfd			sys_signalfd
-284	common	timerfd				sys_ni_syscall
-285	common	eventfd				sys_eventfd
-286	common	timerfd_create			sys_timerfd_create
-287	common	timerfd_settime			sys_timerfd_settime
-288	common	timerfd_gettime			sys_timerfd_gettime
-289	common	signalfd4			sys_signalfd4
-290	common	eventfd2			sys_eventfd2
-291	common	epoll_create1			sys_epoll_create1
-292	common	dup3				sys_dup3
-293	common	pipe2				sys_pipe2
-294	common	inotify_init1			sys_inotify_init1
-295	common	preadv				sys_preadv
-296	common	pwritev				sys_pwritev
-297	common	rt_tgsigqueueinfo		sys_rt_tgsigqueueinfo
-298	common	recvmmsg			sys_recvmmsg
-299	common	fanotify_init			sys_fanotify_init
-300	common	fanotify_mark			sys_fanotify_mark
-301	common	prlimit64			sys_prlimit64
-302	common	name_to_handle_at		sys_name_to_handle_at
-303	common	open_by_handle_at		sys_open_by_handle_at
-304	common	clock_adjtime			sys_clock_adjtime
-305	common	syncfs				sys_syncfs
-306	common	setns				sys_setns
-307	common	sendmmsg			sys_sendmmsg
-308	common	process_vm_readv		sys_process_vm_readv
-309	common	process_vm_writev		sys_process_vm_writev
-310	common	accept4				sys_accept4
-311	common	finit_module			sys_finit_module
-312	common	sched_setattr			sys_sched_setattr
-313	common	sched_getattr			sys_sched_getattr
-314	common	renameat2			sys_renameat2
-315	common	getrandom			sys_getrandom
-316	common	memfd_create			sys_memfd_create
-317	common	bpf				sys_bpf
-318	common	execveat			sys_execveat
-319	common	userfaultfd			sys_userfaultfd
-320	common	membarrier			sys_membarrier
-321	common	kcmp				sys_kcmp
-322	common	mlock2				sys_mlock2
-323	common	copy_file_range			sys_copy_file_range
-324	common	preadv2				sys_preadv2
-325	common	pwritev2			sys_pwritev2
-326	common	statx				sys_statx
-327	common	io_pgetevents			sys_io_pgetevents
-328	common	perf_event_open			sys_perf_event_open
-329	common	seccomp				sys_seccomp
-330	common	pkey_mprotect			sys_pkey_mprotect
-331	common	pkey_alloc			sys_pkey_alloc
-332	common	pkey_free			sys_pkey_free
-333	common	rseq				sys_rseq
-# 334 through 423 are reserved to sync up with other architectures
-424	common	pidfd_send_signal		sys_pidfd_send_signal
-425	common	io_uring_setup			sys_io_uring_setup
-426	common	io_uring_enter			sys_io_uring_enter
-427	common	io_uring_register		sys_io_uring_register
-428	common	open_tree			sys_open_tree
-429	common	move_mount			sys_move_mount
-430	common	fsopen				sys_fsopen
-431	common	fsconfig			sys_fsconfig
-432	common	fsmount				sys_fsmount
-433	common	fspick				sys_fspick
-434	common	pidfd_open			sys_pidfd_open
-# 435 reserved for clone3
-436	common	close_range			sys_close_range
-437	common	openat2				sys_openat2
-438	common	pidfd_getfd			sys_pidfd_getfd
-439	common	faccessat2			sys_faccessat2
-440	common	process_madvise			sys_process_madvise
-441	common	epoll_pwait2			sys_epoll_pwait2
-442	common	mount_setattr			sys_mount_setattr
-443	common	quotactl_fd			sys_quotactl_fd
-444	common	landlock_create_ruleset		sys_landlock_create_ruleset
-445	common	landlock_add_rule		sys_landlock_add_rule
-446	common	landlock_restrict_self		sys_landlock_restrict_self
-# 447 reserved for memfd_secret
-448	common	process_mrelease		sys_process_mrelease
-449	common  futex_waitv                     sys_futex_waitv
-450	common	set_mempolicy_home_node		sys_set_mempolicy_home_node
-451	common	cachestat			sys_cachestat
-452	common	fchmodat2			sys_fchmodat2
diff --git a/arch/ia64/kernel/time.c b/arch/ia64/kernel/time.c
deleted file mode 100644
index 83ef044b63ef..000000000000
--- a/arch/ia64/kernel/time.c
+++ /dev/null
@@ -1,463 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * linux/arch/ia64/kernel/time.c
- *
- * Copyright (C) 1998-2003 Hewlett-Packard Co
- *	Stephane Eranian <eranian@hpl.hp.com>
- *	David Mosberger <davidm@hpl.hp.com>
- * Copyright (C) 1999 Don Dugger <don.dugger@intel.com>
- * Copyright (C) 1999-2000 VA Linux Systems
- * Copyright (C) 1999-2000 Walt Drummond <drummond@valinux.com>
- */
-
-#include <linux/cpu.h>
-#include <linux/init.h>
-#include <linux/kernel.h>
-#include <linux/module.h>
-#include <linux/profile.h>
-#include <linux/sched.h>
-#include <linux/time.h>
-#include <linux/nmi.h>
-#include <linux/interrupt.h>
-#include <linux/efi.h>
-#include <linux/timex.h>
-#include <linux/timekeeper_internal.h>
-#include <linux/platform_device.h>
-#include <linux/sched/cputime.h>
-
-#include <asm/cputime.h>
-#include <asm/delay.h>
-#include <asm/efi.h>
-#include <asm/hw_irq.h>
-#include <asm/ptrace.h>
-#include <asm/sal.h>
-#include <asm/sections.h>
-
-#include "fsyscall_gtod_data.h"
-#include "irq.h"
-
-static u64 itc_get_cycles(struct clocksource *cs);
-
-struct fsyscall_gtod_data_t fsyscall_gtod_data;
-
-struct itc_jitter_data_t itc_jitter_data;
-
-volatile int time_keeper_id = 0; /* smp_processor_id() of time-keeper */
-
-#ifdef CONFIG_IA64_DEBUG_IRQ
-
-unsigned long last_cli_ip;
-EXPORT_SYMBOL(last_cli_ip);
-
-#endif
-
-static struct clocksource clocksource_itc = {
-	.name           = "itc",
-	.rating         = 350,
-	.read           = itc_get_cycles,
-	.mask           = CLOCKSOURCE_MASK(64),
-	.flags          = CLOCK_SOURCE_IS_CONTINUOUS,
-};
-static struct clocksource *itc_clocksource;
-
-#ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
-
-#include <linux/kernel_stat.h>
-
-extern u64 cycle_to_nsec(u64 cyc);
-
-void vtime_flush(struct task_struct *tsk)
-{
-	struct thread_info *ti = task_thread_info(tsk);
-	u64 delta;
-
-	if (ti->utime)
-		account_user_time(tsk, cycle_to_nsec(ti->utime));
-
-	if (ti->gtime)
-		account_guest_time(tsk, cycle_to_nsec(ti->gtime));
-
-	if (ti->idle_time)
-		account_idle_time(cycle_to_nsec(ti->idle_time));
-
-	if (ti->stime) {
-		delta = cycle_to_nsec(ti->stime);
-		account_system_index_time(tsk, delta, CPUTIME_SYSTEM);
-	}
-
-	if (ti->hardirq_time) {
-		delta = cycle_to_nsec(ti->hardirq_time);
-		account_system_index_time(tsk, delta, CPUTIME_IRQ);
-	}
-
-	if (ti->softirq_time) {
-		delta = cycle_to_nsec(ti->softirq_time);
-		account_system_index_time(tsk, delta, CPUTIME_SOFTIRQ);
-	}
-
-	ti->utime = 0;
-	ti->gtime = 0;
-	ti->idle_time = 0;
-	ti->stime = 0;
-	ti->hardirq_time = 0;
-	ti->softirq_time = 0;
-}
-
-/*
- * Called from the context switch with interrupts disabled, to charge all
- * accumulated times to the current process, and to prepare accounting on
- * the next process.
- */
-void arch_vtime_task_switch(struct task_struct *prev)
-{
-	struct thread_info *pi = task_thread_info(prev);
-	struct thread_info *ni = task_thread_info(current);
-
-	ni->ac_stamp = pi->ac_stamp;
-	ni->ac_stime = ni->ac_utime = 0;
-}
-
-/*
- * Account time for a transition between system, hard irq or soft irq state.
- * Note that this function is called with interrupts enabled.
- */
-static __u64 vtime_delta(struct task_struct *tsk)
-{
-	struct thread_info *ti = task_thread_info(tsk);
-	__u64 now, delta_stime;
-
-	WARN_ON_ONCE(!irqs_disabled());
-
-	now = ia64_get_itc();
-	delta_stime = now - ti->ac_stamp;
-	ti->ac_stamp = now;
-
-	return delta_stime;
-}
-
-void vtime_account_kernel(struct task_struct *tsk)
-{
-	struct thread_info *ti = task_thread_info(tsk);
-	__u64 stime = vtime_delta(tsk);
-
-	if (tsk->flags & PF_VCPU)
-		ti->gtime += stime;
-	else
-		ti->stime += stime;
-}
-EXPORT_SYMBOL_GPL(vtime_account_kernel);
-
-void vtime_account_idle(struct task_struct *tsk)
-{
-	struct thread_info *ti = task_thread_info(tsk);
-
-	ti->idle_time += vtime_delta(tsk);
-}
-
-void vtime_account_softirq(struct task_struct *tsk)
-{
-	struct thread_info *ti = task_thread_info(tsk);
-
-	ti->softirq_time += vtime_delta(tsk);
-}
-
-void vtime_account_hardirq(struct task_struct *tsk)
-{
-	struct thread_info *ti = task_thread_info(tsk);
-
-	ti->hardirq_time += vtime_delta(tsk);
-}
-
-#endif /* CONFIG_VIRT_CPU_ACCOUNTING_NATIVE */
-
-static irqreturn_t
-timer_interrupt (int irq, void *dev_id)
-{
-	unsigned long new_itm;
-
-	if (cpu_is_offline(smp_processor_id())) {
-		return IRQ_HANDLED;
-	}
-
-	new_itm = local_cpu_data->itm_next;
-
-	if (!time_after(ia64_get_itc(), new_itm))
-		printk(KERN_ERR "Oops: timer tick before it's due (itc=%lx,itm=%lx)\n",
-		       ia64_get_itc(), new_itm);
-
-	while (1) {
-		new_itm += local_cpu_data->itm_delta;
-
-		legacy_timer_tick(smp_processor_id() == time_keeper_id);
-
-		local_cpu_data->itm_next = new_itm;
-
-		if (time_after(new_itm, ia64_get_itc()))
-			break;
-
-		/*
-		 * Allow IPIs to interrupt the timer loop.
-		 */
-		local_irq_enable();
-		local_irq_disable();
-	}
-
-	do {
-		/*
-		 * If we're too close to the next clock tick for
-		 * comfort, we increase the safety margin by
-		 * intentionally dropping the next tick(s).  We do NOT
-		 * update itm.next because that would force us to call
-		 * xtime_update() which in turn would let our clock run
-		 * too fast (with the potentially devastating effect
-		 * of losing monotony of time).
-		 */
-		while (!time_after(new_itm, ia64_get_itc() + local_cpu_data->itm_delta/2))
-			new_itm += local_cpu_data->itm_delta;
-		ia64_set_itm(new_itm);
-		/* double check, in case we got hit by a (slow) PMI: */
-	} while (time_after_eq(ia64_get_itc(), new_itm));
-	return IRQ_HANDLED;
-}
-
-/*
- * Encapsulate access to the itm structure for SMP.
- */
-void
-ia64_cpu_local_tick (void)
-{
-	int cpu = smp_processor_id();
-	unsigned long shift = 0, delta;
-
-	/* arrange for the cycle counter to generate a timer interrupt: */
-	ia64_set_itv(IA64_TIMER_VECTOR);
-
-	delta = local_cpu_data->itm_delta;
-	/*
-	 * Stagger the timer tick for each CPU so they don't occur all at (almost) the
-	 * same time:
-	 */
-	if (cpu) {
-		unsigned long hi = 1UL << ia64_fls(cpu);
-		shift = (2*(cpu - hi) + 1) * delta/hi/2;
-	}
-	local_cpu_data->itm_next = ia64_get_itc() + delta + shift;
-	ia64_set_itm(local_cpu_data->itm_next);
-}
-
-static int nojitter;
-
-static int __init nojitter_setup(char *str)
-{
-	nojitter = 1;
-	printk("Jitter checking for ITC timers disabled\n");
-	return 1;
-}
-
-__setup("nojitter", nojitter_setup);
-
-
-void ia64_init_itm(void)
-{
-	unsigned long platform_base_freq, itc_freq;
-	struct pal_freq_ratio itc_ratio, proc_ratio;
-	long status, platform_base_drift, itc_drift;
-
-	/*
-	 * According to SAL v2.6, we need to use a SAL call to determine the platform base
-	 * frequency and then a PAL call to determine the frequency ratio between the ITC
-	 * and the base frequency.
-	 */
-	status = ia64_sal_freq_base(SAL_FREQ_BASE_PLATFORM,
-				    &platform_base_freq, &platform_base_drift);
-	if (status != 0) {
-		printk(KERN_ERR "SAL_FREQ_BASE_PLATFORM failed: %s\n", ia64_sal_strerror(status));
-	} else {
-		status = ia64_pal_freq_ratios(&proc_ratio, NULL, &itc_ratio);
-		if (status != 0)
-			printk(KERN_ERR "PAL_FREQ_RATIOS failed with status=%ld\n", status);
-	}
-	if (status != 0) {
-		/* invent "random" values */
-		printk(KERN_ERR
-		       "SAL/PAL failed to obtain frequency info---inventing reasonable values\n");
-		platform_base_freq = 100000000;
-		platform_base_drift = -1;	/* no drift info */
-		itc_ratio.num = 3;
-		itc_ratio.den = 1;
-	}
-	if (platform_base_freq < 40000000) {
-		printk(KERN_ERR "Platform base frequency %lu bogus---resetting to 75MHz!\n",
-		       platform_base_freq);
-		platform_base_freq = 75000000;
-		platform_base_drift = -1;
-	}
-	if (!proc_ratio.den)
-		proc_ratio.den = 1;	/* avoid division by zero */
-	if (!itc_ratio.den)
-		itc_ratio.den = 1;	/* avoid division by zero */
-
-	itc_freq = (platform_base_freq*itc_ratio.num)/itc_ratio.den;
-
-	local_cpu_data->itm_delta = (itc_freq + HZ/2) / HZ;
-	printk(KERN_DEBUG "CPU %d: base freq=%lu.%03luMHz, ITC ratio=%u/%u, "
-	       "ITC freq=%lu.%03luMHz", smp_processor_id(),
-	       platform_base_freq / 1000000, (platform_base_freq / 1000) % 1000,
-	       itc_ratio.num, itc_ratio.den, itc_freq / 1000000, (itc_freq / 1000) % 1000);
-
-	if (platform_base_drift != -1) {
-		itc_drift = platform_base_drift*itc_ratio.num/itc_ratio.den;
-		printk("+/-%ldppm\n", itc_drift);
-	} else {
-		itc_drift = -1;
-		printk("\n");
-	}
-
-	local_cpu_data->proc_freq = (platform_base_freq*proc_ratio.num)/proc_ratio.den;
-	local_cpu_data->itc_freq = itc_freq;
-	local_cpu_data->cyc_per_usec = (itc_freq + USEC_PER_SEC/2) / USEC_PER_SEC;
-	local_cpu_data->nsec_per_cyc = ((NSEC_PER_SEC<<IA64_NSEC_PER_CYC_SHIFT)
-					+ itc_freq/2)/itc_freq;
-
-	if (!(sal_platform_features & IA64_SAL_PLATFORM_FEATURE_ITC_DRIFT)) {
-#ifdef CONFIG_SMP
-		/* On IA64 in an SMP configuration ITCs are never accurately synchronized.
-		 * Jitter compensation requires a cmpxchg which may limit
-		 * the scalability of the syscalls for retrieving time.
-		 * The ITC synchronization is usually successful to within a few
-		 * ITC ticks but this is not a sure thing. If you need to improve
-		 * timer performance in SMP situations then boot the kernel with the
-		 * "nojitter" option. However, doing so may result in time fluctuating (maybe
-		 * even going backward) if the ITC offsets between the individual CPUs
-		 * are too large.
-		 */
-		if (!nojitter)
-			itc_jitter_data.itc_jitter = 1;
-#endif
-	} else
-		/*
-		 * ITC is drifty and we have not synchronized the ITCs in smpboot.c.
-		 * ITC values may fluctuate significantly between processors.
-		 * Clock should not be used for hrtimers. Mark itc as only
-		 * useful for boot and testing.
-		 *
-		 * Note that jitter compensation is off! There is no point of
-		 * synchronizing ITCs since they may be large differentials
-		 * that change over time.
-		 *
-		 * The only way to fix this would be to repeatedly sync the
-		 * ITCs. Until that time we have to avoid ITC.
-		 */
-		clocksource_itc.rating = 50;
-
-	/* avoid softlock up message when cpu is unplug and plugged again. */
-	touch_softlockup_watchdog();
-
-	/* Setup the CPU local timer tick */
-	ia64_cpu_local_tick();
-
-	if (!itc_clocksource) {
-		clocksource_register_hz(&clocksource_itc,
-						local_cpu_data->itc_freq);
-		itc_clocksource = &clocksource_itc;
-	}
-}
-
-static u64 itc_get_cycles(struct clocksource *cs)
-{
-	unsigned long lcycle, now, ret;
-
-	if (!itc_jitter_data.itc_jitter)
-		return get_cycles();
-
-	lcycle = itc_jitter_data.itc_lastcycle;
-	now = get_cycles();
-	if (lcycle && time_after(lcycle, now))
-		return lcycle;
-
-	/*
-	 * Keep track of the last timer value returned.
-	 * In an SMP environment, you could lose out in contention of
-	 * cmpxchg. If so, your cmpxchg returns new value which the
-	 * winner of contention updated to. Use the new value instead.
-	 */
-	ret = cmpxchg(&itc_jitter_data.itc_lastcycle, lcycle, now);
-	if (unlikely(ret != lcycle))
-		return ret;
-
-	return now;
-}
-
-void read_persistent_clock64(struct timespec64 *ts)
-{
-	efi_gettimeofday(ts);
-}
-
-void __init
-time_init (void)
-{
-	register_percpu_irq(IA64_TIMER_VECTOR, timer_interrupt, IRQF_IRQPOLL,
-			    "timer");
-	ia64_init_itm();
-}
-
-/*
- * Generic udelay assumes that if preemption is allowed and the thread
- * migrates to another CPU, that the ITC values are synchronized across
- * all CPUs.
- */
-static void
-ia64_itc_udelay (unsigned long usecs)
-{
-	unsigned long start = ia64_get_itc();
-	unsigned long end = start + usecs*local_cpu_data->cyc_per_usec;
-
-	while (time_before(ia64_get_itc(), end))
-		cpu_relax();
-}
-
-void (*ia64_udelay)(unsigned long usecs) = &ia64_itc_udelay;
-
-void
-udelay (unsigned long usecs)
-{
-	(*ia64_udelay)(usecs);
-}
-EXPORT_SYMBOL(udelay);
-
-/* IA64 doesn't cache the timezone */
-void update_vsyscall_tz(void)
-{
-}
-
-void update_vsyscall(struct timekeeper *tk)
-{
-	write_seqcount_begin(&fsyscall_gtod_data.seq);
-
-	/* copy vsyscall data */
-	fsyscall_gtod_data.clk_mask = tk->tkr_mono.mask;
-	fsyscall_gtod_data.clk_mult = tk->tkr_mono.mult;
-	fsyscall_gtod_data.clk_shift = tk->tkr_mono.shift;
-	fsyscall_gtod_data.clk_fsys_mmio = tk->tkr_mono.clock->archdata.fsys_mmio;
-	fsyscall_gtod_data.clk_cycle_last = tk->tkr_mono.cycle_last;
-
-	fsyscall_gtod_data.wall_time.sec = tk->xtime_sec;
-	fsyscall_gtod_data.wall_time.snsec = tk->tkr_mono.xtime_nsec;
-
-	fsyscall_gtod_data.monotonic_time.sec = tk->xtime_sec
-					      + tk->wall_to_monotonic.tv_sec;
-	fsyscall_gtod_data.monotonic_time.snsec = tk->tkr_mono.xtime_nsec
-						+ ((u64)tk->wall_to_monotonic.tv_nsec
-							<< tk->tkr_mono.shift);
-
-	/* normalize */
-	while (fsyscall_gtod_data.monotonic_time.snsec >=
-					(((u64)NSEC_PER_SEC) << tk->tkr_mono.shift)) {
-		fsyscall_gtod_data.monotonic_time.snsec -=
-					((u64)NSEC_PER_SEC) << tk->tkr_mono.shift;
-		fsyscall_gtod_data.monotonic_time.sec++;
-	}
-
-	write_seqcount_end(&fsyscall_gtod_data.seq);
-}
-
diff --git a/arch/ia64/kernel/topology.c b/arch/ia64/kernel/topology.c
deleted file mode 100644
index 94a848b06f15..000000000000
--- a/arch/ia64/kernel/topology.c
+++ /dev/null
@@ -1,410 +0,0 @@
-/*
- * This file is subject to the terms and conditions of the GNU General Public
- * License.  See the file "COPYING" in the main directory of this archive
- * for more details.
- *
- * This file contains NUMA specific variables and functions which are used on
- * NUMA machines with contiguous memory.
- * 		2002/08/07 Erich Focht <efocht@ess.nec.de>
- * Populate cpu entries in sysfs for non-numa systems as well
- *  	Intel Corporation - Ashok Raj
- * 02/27/2006 Zhang, Yanmin
- *	Populate cpu cache entries in sysfs for cpu cache info
- */
-
-#include <linux/cpu.h>
-#include <linux/kernel.h>
-#include <linux/mm.h>
-#include <linux/node.h>
-#include <linux/slab.h>
-#include <linux/init.h>
-#include <linux/memblock.h>
-#include <linux/nodemask.h>
-#include <linux/notifier.h>
-#include <linux/export.h>
-#include <asm/mmzone.h>
-#include <asm/numa.h>
-#include <asm/cpu.h>
-
-static struct ia64_cpu *sysfs_cpus;
-
-void arch_fix_phys_package_id(int num, u32 slot)
-{
-#ifdef CONFIG_SMP
-	if (cpu_data(num)->socket_id == -1)
-		cpu_data(num)->socket_id = slot;
-#endif
-}
-EXPORT_SYMBOL_GPL(arch_fix_phys_package_id);
-
-
-#ifdef CONFIG_HOTPLUG_CPU
-int __ref arch_register_cpu(int num)
-{
-	/*
-	 * If CPEI can be re-targeted or if this is not
-	 * CPEI target, then it is hotpluggable
-	 */
-	if (can_cpei_retarget() || !is_cpu_cpei_target(num))
-		sysfs_cpus[num].cpu.hotpluggable = 1;
-	map_cpu_to_node(num, node_cpuid[num].nid);
-	return register_cpu(&sysfs_cpus[num].cpu, num);
-}
-EXPORT_SYMBOL(arch_register_cpu);
-
-void __ref arch_unregister_cpu(int num)
-{
-	unregister_cpu(&sysfs_cpus[num].cpu);
-	unmap_cpu_from_node(num, cpu_to_node(num));
-}
-EXPORT_SYMBOL(arch_unregister_cpu);
-#else
-static int __init arch_register_cpu(int num)
-{
-	return register_cpu(&sysfs_cpus[num].cpu, num);
-}
-#endif /*CONFIG_HOTPLUG_CPU*/
-
-
-static int __init topology_init(void)
-{
-	int i, err = 0;
-
-	sysfs_cpus = kcalloc(NR_CPUS, sizeof(struct ia64_cpu), GFP_KERNEL);
-	if (!sysfs_cpus)
-		panic("kzalloc in topology_init failed - NR_CPUS too big?");
-
-	for_each_present_cpu(i) {
-		if((err = arch_register_cpu(i)))
-			goto out;
-	}
-out:
-	return err;
-}
-
-subsys_initcall(topology_init);
-
-
-/*
- * Export cpu cache information through sysfs
- */
-
-/*
- *  A bunch of string array to get pretty printing
- */
-static const char *cache_types[] = {
-	"",			/* not used */
-	"Instruction",
-	"Data",
-	"Unified"	/* unified */
-};
-
-static const char *cache_mattrib[]={
-	"WriteThrough",
-	"WriteBack",
-	"",		/* reserved */
-	""		/* reserved */
-};
-
-struct cache_info {
-	pal_cache_config_info_t	cci;
-	cpumask_t shared_cpu_map;
-	int level;
-	int type;
-	struct kobject kobj;
-};
-
-struct cpu_cache_info {
-	struct cache_info *cache_leaves;
-	int	num_cache_leaves;
-	struct kobject kobj;
-};
-
-static struct cpu_cache_info	all_cpu_cache_info[NR_CPUS];
-#define LEAF_KOBJECT_PTR(x,y)    (&all_cpu_cache_info[x].cache_leaves[y])
-
-#ifdef CONFIG_SMP
-static void cache_shared_cpu_map_setup(unsigned int cpu,
-		struct cache_info * this_leaf)
-{
-	pal_cache_shared_info_t	csi;
-	int num_shared, i = 0;
-	unsigned int j;
-
-	if (cpu_data(cpu)->threads_per_core <= 1 &&
-		cpu_data(cpu)->cores_per_socket <= 1) {
-		cpumask_set_cpu(cpu, &this_leaf->shared_cpu_map);
-		return;
-	}
-
-	if (ia64_pal_cache_shared_info(this_leaf->level,
-					this_leaf->type,
-					0,
-					&csi) != PAL_STATUS_SUCCESS)
-		return;
-
-	num_shared = (int) csi.num_shared;
-	do {
-		for_each_possible_cpu(j)
-			if (cpu_data(cpu)->socket_id == cpu_data(j)->socket_id
-				&& cpu_data(j)->core_id == csi.log1_cid
-				&& cpu_data(j)->thread_id == csi.log1_tid)
-				cpumask_set_cpu(j, &this_leaf->shared_cpu_map);
-
-		i++;
-	} while (i < num_shared &&
-		ia64_pal_cache_shared_info(this_leaf->level,
-				this_leaf->type,
-				i,
-				&csi) == PAL_STATUS_SUCCESS);
-}
-#else
-static void cache_shared_cpu_map_setup(unsigned int cpu,
-		struct cache_info * this_leaf)
-{
-	cpumask_set_cpu(cpu, &this_leaf->shared_cpu_map);
-	return;
-}
-#endif
-
-static ssize_t show_coherency_line_size(struct cache_info *this_leaf,
-					char *buf)
-{
-	return sprintf(buf, "%u\n", 1 << this_leaf->cci.pcci_line_size);
-}
-
-static ssize_t show_ways_of_associativity(struct cache_info *this_leaf,
-					char *buf)
-{
-	return sprintf(buf, "%u\n", this_leaf->cci.pcci_assoc);
-}
-
-static ssize_t show_attributes(struct cache_info *this_leaf, char *buf)
-{
-	return sprintf(buf,
-			"%s\n",
-			cache_mattrib[this_leaf->cci.pcci_cache_attr]);
-}
-
-static ssize_t show_size(struct cache_info *this_leaf, char *buf)
-{
-	return sprintf(buf, "%uK\n", this_leaf->cci.pcci_cache_size / 1024);
-}
-
-static ssize_t show_number_of_sets(struct cache_info *this_leaf, char *buf)
-{
-	unsigned number_of_sets = this_leaf->cci.pcci_cache_size;
-	number_of_sets /= this_leaf->cci.pcci_assoc;
-	number_of_sets /= 1 << this_leaf->cci.pcci_line_size;
-
-	return sprintf(buf, "%u\n", number_of_sets);
-}
-
-static ssize_t show_shared_cpu_map(struct cache_info *this_leaf, char *buf)
-{
-	cpumask_t shared_cpu_map;
-
-	cpumask_and(&shared_cpu_map,
-				&this_leaf->shared_cpu_map, cpu_online_mask);
-	return scnprintf(buf, PAGE_SIZE, "%*pb\n",
-			 cpumask_pr_args(&shared_cpu_map));
-}
-
-static ssize_t show_type(struct cache_info *this_leaf, char *buf)
-{
-	int type = this_leaf->type + this_leaf->cci.pcci_unified;
-	return sprintf(buf, "%s\n", cache_types[type]);
-}
-
-static ssize_t show_level(struct cache_info *this_leaf, char *buf)
-{
-	return sprintf(buf, "%u\n", this_leaf->level);
-}
-
-struct cache_attr {
-	struct attribute attr;
-	ssize_t (*show)(struct cache_info *, char *);
-	ssize_t (*store)(struct cache_info *, const char *, size_t count);
-};
-
-#ifdef define_one_ro
-	#undef define_one_ro
-#endif
-#define define_one_ro(_name) \
-	static struct cache_attr _name = \
-__ATTR(_name, 0444, show_##_name, NULL)
-
-define_one_ro(level);
-define_one_ro(type);
-define_one_ro(coherency_line_size);
-define_one_ro(ways_of_associativity);
-define_one_ro(size);
-define_one_ro(number_of_sets);
-define_one_ro(shared_cpu_map);
-define_one_ro(attributes);
-
-static struct attribute * cache_default_attrs[] = {
-	&type.attr,
-	&level.attr,
-	&coherency_line_size.attr,
-	&ways_of_associativity.attr,
-	&attributes.attr,
-	&size.attr,
-	&number_of_sets.attr,
-	&shared_cpu_map.attr,
-	NULL
-};
-ATTRIBUTE_GROUPS(cache_default);
-
-#define to_object(k) container_of(k, struct cache_info, kobj)
-#define to_attr(a) container_of(a, struct cache_attr, attr)
-
-static ssize_t ia64_cache_show(struct kobject * kobj, struct attribute * attr, char * buf)
-{
-	struct cache_attr *fattr = to_attr(attr);
-	struct cache_info *this_leaf = to_object(kobj);
-	ssize_t ret;
-
-	ret = fattr->show ? fattr->show(this_leaf, buf) : 0;
-	return ret;
-}
-
-static const struct sysfs_ops cache_sysfs_ops = {
-	.show   = ia64_cache_show
-};
-
-static struct kobj_type cache_ktype = {
-	.sysfs_ops	= &cache_sysfs_ops,
-	.default_groups	= cache_default_groups,
-};
-
-static struct kobj_type cache_ktype_percpu_entry = {
-	.sysfs_ops	= &cache_sysfs_ops,
-};
-
-static void cpu_cache_sysfs_exit(unsigned int cpu)
-{
-	kfree(all_cpu_cache_info[cpu].cache_leaves);
-	all_cpu_cache_info[cpu].cache_leaves = NULL;
-	all_cpu_cache_info[cpu].num_cache_leaves = 0;
-	memset(&all_cpu_cache_info[cpu].kobj, 0, sizeof(struct kobject));
-	return;
-}
-
-static int cpu_cache_sysfs_init(unsigned int cpu)
-{
-	unsigned long i, levels, unique_caches;
-	pal_cache_config_info_t cci;
-	int j;
-	long status;
-	struct cache_info *this_cache;
-	int num_cache_leaves = 0;
-
-	if ((status = ia64_pal_cache_summary(&levels, &unique_caches)) != 0) {
-		printk(KERN_ERR "ia64_pal_cache_summary=%ld\n", status);
-		return -1;
-	}
-
-	this_cache=kcalloc(unique_caches, sizeof(struct cache_info),
-			   GFP_KERNEL);
-	if (this_cache == NULL)
-		return -ENOMEM;
-
-	for (i=0; i < levels; i++) {
-		for (j=2; j >0 ; j--) {
-			if ((status=ia64_pal_cache_config_info(i,j, &cci)) !=
-					PAL_STATUS_SUCCESS)
-				continue;
-
-			this_cache[num_cache_leaves].cci = cci;
-			this_cache[num_cache_leaves].level = i + 1;
-			this_cache[num_cache_leaves].type = j;
-
-			cache_shared_cpu_map_setup(cpu,
-					&this_cache[num_cache_leaves]);
-			num_cache_leaves ++;
-		}
-	}
-
-	all_cpu_cache_info[cpu].cache_leaves = this_cache;
-	all_cpu_cache_info[cpu].num_cache_leaves = num_cache_leaves;
-
-	memset(&all_cpu_cache_info[cpu].kobj, 0, sizeof(struct kobject));
-
-	return 0;
-}
-
-/* Add cache interface for CPU device */
-static int cache_add_dev(unsigned int cpu)
-{
-	struct device *sys_dev = get_cpu_device(cpu);
-	unsigned long i, j;
-	struct cache_info *this_object;
-	int retval = 0;
-
-	if (all_cpu_cache_info[cpu].kobj.parent)
-		return 0;
-
-
-	retval = cpu_cache_sysfs_init(cpu);
-	if (unlikely(retval < 0))
-		return retval;
-
-	retval = kobject_init_and_add(&all_cpu_cache_info[cpu].kobj,
-				      &cache_ktype_percpu_entry, &sys_dev->kobj,
-				      "%s", "cache");
-	if (unlikely(retval < 0)) {
-		cpu_cache_sysfs_exit(cpu);
-		return retval;
-	}
-
-	for (i = 0; i < all_cpu_cache_info[cpu].num_cache_leaves; i++) {
-		this_object = LEAF_KOBJECT_PTR(cpu,i);
-		retval = kobject_init_and_add(&(this_object->kobj),
-					      &cache_ktype,
-					      &all_cpu_cache_info[cpu].kobj,
-					      "index%1lu", i);
-		if (unlikely(retval)) {
-			for (j = 0; j < i; j++) {
-				kobject_put(&(LEAF_KOBJECT_PTR(cpu,j)->kobj));
-			}
-			kobject_put(&all_cpu_cache_info[cpu].kobj);
-			cpu_cache_sysfs_exit(cpu);
-			return retval;
-		}
-		kobject_uevent(&(this_object->kobj), KOBJ_ADD);
-	}
-	kobject_uevent(&all_cpu_cache_info[cpu].kobj, KOBJ_ADD);
-	return retval;
-}
-
-/* Remove cache interface for CPU device */
-static int cache_remove_dev(unsigned int cpu)
-{
-	unsigned long i;
-
-	for (i = 0; i < all_cpu_cache_info[cpu].num_cache_leaves; i++)
-		kobject_put(&(LEAF_KOBJECT_PTR(cpu,i)->kobj));
-
-	if (all_cpu_cache_info[cpu].kobj.parent) {
-		kobject_put(&all_cpu_cache_info[cpu].kobj);
-		memset(&all_cpu_cache_info[cpu].kobj,
-			0,
-			sizeof(struct kobject));
-	}
-
-	cpu_cache_sysfs_exit(cpu);
-
-	return 0;
-}
-
-static int __init cache_sysfs_init(void)
-{
-	int ret;
-
-	ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "ia64/topology:online",
-				cache_add_dev, cache_remove_dev);
-	WARN_ON(ret < 0);
-	return 0;
-}
-device_initcall(cache_sysfs_init);
diff --git a/arch/ia64/kernel/traps.c b/arch/ia64/kernel/traps.c
deleted file mode 100644
index 53735b1d1be3..000000000000
--- a/arch/ia64/kernel/traps.c
+++ /dev/null
@@ -1,612 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * Architecture-specific trap handling.
- *
- * Copyright (C) 1998-2003 Hewlett-Packard Co
- *	David Mosberger-Tang <davidm@hpl.hp.com>
- *
- * 05/12/00 grao <goutham.rao@intel.com> : added isr in siginfo for SIGFPE
- */
-
-#include <linux/kernel.h>
-#include <linux/init.h>
-#include <linux/sched/signal.h>
-#include <linux/sched/debug.h>
-#include <linux/tty.h>
-#include <linux/vt_kern.h>		/* For unblank_screen() */
-#include <linux/export.h>
-#include <linux/extable.h>
-#include <linux/hardirq.h>
-#include <linux/kprobes.h>
-#include <linux/delay.h>		/* for ssleep() */
-#include <linux/kdebug.h>
-#include <linux/uaccess.h>
-
-#include <asm/fpswa.h>
-#include <asm/intrinsics.h>
-#include <asm/processor.h>
-#include <asm/exception.h>
-#include <asm/setup.h>
-
-fpswa_interface_t *fpswa_interface;
-EXPORT_SYMBOL(fpswa_interface);
-
-void __init
-trap_init (void)
-{
-	if (ia64_boot_param->fpswa)
-		/* FPSWA fixup: make the interface pointer a kernel virtual address: */
-		fpswa_interface = __va(ia64_boot_param->fpswa);
-}
-
-int
-die (const char *str, struct pt_regs *regs, long err)
-{
-	static struct {
-		spinlock_t lock;
-		u32 lock_owner;
-		int lock_owner_depth;
-	} die = {
-		.lock =	__SPIN_LOCK_UNLOCKED(die.lock),
-		.lock_owner = -1,
-		.lock_owner_depth = 0
-	};
-	static int die_counter;
-	int cpu = get_cpu();
-
-	if (die.lock_owner != cpu) {
-		console_verbose();
-		spin_lock_irq(&die.lock);
-		die.lock_owner = cpu;
-		die.lock_owner_depth = 0;
-		bust_spinlocks(1);
-	}
-	put_cpu();
-
-	if (++die.lock_owner_depth < 3) {
-		printk("%s[%d]: %s %ld [%d]\n",
-		current->comm, task_pid_nr(current), str, err, ++die_counter);
-		if (notify_die(DIE_OOPS, str, regs, err, 255, SIGSEGV)
-	            != NOTIFY_STOP)
-			show_regs(regs);
-		else
-			regs = NULL;
-  	} else
-		printk(KERN_ERR "Recursive die() failure, output suppressed\n");
-
-	bust_spinlocks(0);
-	die.lock_owner = -1;
-	add_taint(TAINT_DIE, LOCKDEP_NOW_UNRELIABLE);
-	spin_unlock_irq(&die.lock);
-
-	if (!regs)
-		return 1;
-
-	if (panic_on_oops)
-		panic("Fatal exception");
-
-	make_task_dead(SIGSEGV);
-	return 0;
-}
-
-int
-die_if_kernel (char *str, struct pt_regs *regs, long err)
-{
-	if (!user_mode(regs))
-		return die(str, regs, err);
-	return 0;
-}
-
-void
-__kprobes ia64_bad_break (unsigned long break_num, struct pt_regs *regs)
-{
-	int sig, code;
-
-	switch (break_num) {
-	      case 0: /* unknown error (used by GCC for __builtin_abort()) */
-		if (notify_die(DIE_BREAK, "break 0", regs, break_num, TRAP_BRKPT, SIGTRAP)
-			       	== NOTIFY_STOP)
-			return;
-		if (die_if_kernel("bugcheck!", regs, break_num))
-			return;
-		sig = SIGILL; code = ILL_ILLOPC;
-		break;
-
-	      case 1: /* integer divide by zero */
-		sig = SIGFPE; code = FPE_INTDIV;
-		break;
-
-	      case 2: /* integer overflow */
-		sig = SIGFPE; code = FPE_INTOVF;
-		break;
-
-	      case 3: /* range check/bounds check */
-		sig = SIGFPE; code = FPE_FLTSUB;
-		break;
-
-	      case 4: /* null pointer dereference */
-		sig = SIGSEGV; code = SEGV_MAPERR;
-		break;
-
-	      case 5: /* misaligned data */
-		sig = SIGSEGV; code = BUS_ADRALN;
-		break;
-
-	      case 6: /* decimal overflow */
-		sig = SIGFPE; code = __FPE_DECOVF;
-		break;
-
-	      case 7: /* decimal divide by zero */
-		sig = SIGFPE; code = __FPE_DECDIV;
-		break;
-
-	      case 8: /* packed decimal error */
-		sig = SIGFPE; code = __FPE_DECERR;
-		break;
-
-	      case 9: /* invalid ASCII digit */
-		sig = SIGFPE; code = __FPE_INVASC;
-		break;
-
-	      case 10: /* invalid decimal digit */
-		sig = SIGFPE; code = __FPE_INVDEC;
-		break;
-
-	      case 11: /* paragraph stack overflow */
-		sig = SIGSEGV; code = __SEGV_PSTKOVF;
-		break;
-
-	      case 0x3f000 ... 0x3ffff:	/* bundle-update in progress */
-		sig = SIGILL; code = __ILL_BNDMOD;
-		break;
-
-	      default:
-		if ((break_num < 0x40000 || break_num > 0x100000)
-		    && die_if_kernel("Bad break", regs, break_num))
-			return;
-
-		if (break_num < 0x80000) {
-			sig = SIGILL; code = __ILL_BREAK;
-		} else {
-			if (notify_die(DIE_BREAK, "bad break", regs, break_num, TRAP_BRKPT, SIGTRAP)
-					== NOTIFY_STOP)
-				return;
-			sig = SIGTRAP; code = TRAP_BRKPT;
-		}
-	}
-	force_sig_fault(sig, code,
-			(void __user *) (regs->cr_iip + ia64_psr(regs)->ri),
-			break_num, 0 /* clear __ISR_VALID */, 0);
-}
-
-/*
- * disabled_fph_fault() is called when a user-level process attempts to access f32..f127
- * and it doesn't own the fp-high register partition.  When this happens, we save the
- * current fph partition in the task_struct of the fpu-owner (if necessary) and then load
- * the fp-high partition of the current task (if necessary).  Note that the kernel has
- * access to fph by the time we get here, as the IVT's "Disabled FP-Register" handler takes
- * care of clearing psr.dfh.
- */
-static inline void
-disabled_fph_fault (struct pt_regs *regs)
-{
-	struct ia64_psr *psr = ia64_psr(regs);
-
-	/* first, grant user-level access to fph partition: */
-	psr->dfh = 0;
-
-	/*
-	 * Make sure that no other task gets in on this processor
-	 * while we're claiming the FPU
-	 */
-	preempt_disable();
-#ifndef CONFIG_SMP
-	{
-		struct task_struct *fpu_owner
-			= (struct task_struct *)ia64_get_kr(IA64_KR_FPU_OWNER);
-
-		if (ia64_is_local_fpu_owner(current)) {
-			preempt_enable_no_resched();
-			return;
-		}
-
-		if (fpu_owner)
-			ia64_flush_fph(fpu_owner);
-	}
-#endif /* !CONFIG_SMP */
-	ia64_set_local_fpu_owner(current);
-	if ((current->thread.flags & IA64_THREAD_FPH_VALID) != 0) {
-		__ia64_load_fpu(current->thread.fph);
-		psr->mfh = 0;
-	} else {
-		__ia64_init_fpu();
-		/*
-		 * Set mfh because the state in thread.fph does not match the state in
-		 * the fph partition.
-		 */
-		psr->mfh = 1;
-	}
-	preempt_enable_no_resched();
-}
-
-static inline int
-fp_emulate (int fp_fault, void *bundle, long *ipsr, long *fpsr, long *isr, long *pr, long *ifs,
-	    struct pt_regs *regs)
-{
-	fp_state_t fp_state;
-	fpswa_ret_t ret;
-
-	if (!fpswa_interface)
-		return -1;
-
-	memset(&fp_state, 0, sizeof(fp_state_t));
-
-	/*
-	 * compute fp_state.  only FP registers f6 - f11 are used by the
-	 * kernel, so set those bits in the mask and set the low volatile
-	 * pointer to point to these registers.
-	 */
-	fp_state.bitmask_low64 = 0xfc0;  /* bit6..bit11 */
-
-	fp_state.fp_state_low_volatile = (fp_state_low_volatile_t *) &regs->f6;
-	/*
-	 * unsigned long (*EFI_FPSWA) (
-	 *      unsigned long    trap_type,
-	 *	void             *Bundle,
-	 *	unsigned long    *pipsr,
-	 *	unsigned long    *pfsr,
-	 *	unsigned long    *pisr,
-	 *	unsigned long    *ppreds,
-	 *	unsigned long    *pifs,
-	 *	void             *fp_state);
-	 */
-	ret = (*fpswa_interface->fpswa)((unsigned long) fp_fault, bundle,
-					(unsigned long *) ipsr, (unsigned long *) fpsr,
-					(unsigned long *) isr, (unsigned long *) pr,
-					(unsigned long *) ifs, &fp_state);
-
-	return ret.status;
-}
-
-struct fpu_swa_msg {
-	unsigned long count;
-	unsigned long time;
-};
-static DEFINE_PER_CPU(struct fpu_swa_msg, cpulast);
-DECLARE_PER_CPU(struct fpu_swa_msg, cpulast);
-static struct fpu_swa_msg last __cacheline_aligned;
-
-
-/*
- * Handle floating-point assist faults and traps.
- */
-static int
-handle_fpu_swa (int fp_fault, struct pt_regs *regs, unsigned long isr)
-{
-	long exception, bundle[2];
-	unsigned long fault_ip;
-
-	fault_ip = regs->cr_iip;
-	if (!fp_fault && (ia64_psr(regs)->ri == 0))
-		fault_ip -= 16;
-	if (copy_from_user(bundle, (void __user *) fault_ip, sizeof(bundle)))
-		return -1;
-
-	if (!(current->thread.flags & IA64_THREAD_FPEMU_NOPRINT))  {
-		unsigned long count, current_jiffies = jiffies;
-		struct fpu_swa_msg *cp = this_cpu_ptr(&cpulast);
-
-		if (unlikely(current_jiffies > cp->time))
-			cp->count = 0;
-		if (unlikely(cp->count < 5)) {
-			cp->count++;
-			cp->time = current_jiffies + 5 * HZ;
-
-			/* minimize races by grabbing a copy of count BEFORE checking last.time. */
-			count = last.count;
-			barrier();
-
-			/*
-			 * Lower 4 bits are used as a count. Upper bits are a sequence
-			 * number that is updated when count is reset. The cmpxchg will
-			 * fail is seqno has changed. This minimizes multiple cpus
-			 * resetting the count.
-			 */
-			if (current_jiffies > last.time)
-				(void) cmpxchg_acq(&last.count, count, 16 + (count & ~15));
-
-			/* used fetchadd to atomically update the count */
-			if ((last.count & 15) < 5 && (ia64_fetchadd(1, &last.count, acq) & 15) < 5) {
-				last.time = current_jiffies + 5 * HZ;
-				printk(KERN_WARNING
-		       			"%s(%d): floating-point assist fault at ip %016lx, isr %016lx\n",
-		       			current->comm, task_pid_nr(current), regs->cr_iip + ia64_psr(regs)->ri, isr);
-			}
-		}
-	}
-
-	exception = fp_emulate(fp_fault, bundle, &regs->cr_ipsr, &regs->ar_fpsr, &isr, &regs->pr,
-			       &regs->cr_ifs, regs);
-	if (fp_fault) {
-		if (exception == 0) {
-			/* emulation was successful */
-			ia64_increment_ip(regs);
-		} else if (exception == -1) {
-			printk(KERN_ERR "handle_fpu_swa: fp_emulate() returned -1\n");
-			return -1;
-		} else {
-			/* is next instruction a trap? */
-			int si_code;
-
-			if (exception & 2) {
-				ia64_increment_ip(regs);
-			}
-			si_code = FPE_FLTUNK;	/* default code */
-			if (isr & 0x11) {
-				si_code = FPE_FLTINV;
-			} else if (isr & 0x22) {
-				/* denormal operand gets the same si_code as underflow 
-				* see arch/i386/kernel/traps.c:math_error()  */
-				si_code = FPE_FLTUND;
-			} else if (isr & 0x44) {
-				si_code = FPE_FLTDIV;
-			}
-			force_sig_fault(SIGFPE, si_code,
-					(void __user *) (regs->cr_iip + ia64_psr(regs)->ri),
-					0, __ISR_VALID, isr);
-		}
-	} else {
-		if (exception == -1) {
-			printk(KERN_ERR "handle_fpu_swa: fp_emulate() returned -1\n");
-			return -1;
-		} else if (exception != 0) {
-			/* raise exception */
-			int si_code;
-
-			si_code = FPE_FLTUNK;	/* default code */
-			if (isr & 0x880) {
-				si_code = FPE_FLTOVF;
-			} else if (isr & 0x1100) {
-				si_code = FPE_FLTUND;
-			} else if (isr & 0x2200) {
-				si_code = FPE_FLTRES;
-			}
-			force_sig_fault(SIGFPE, si_code,
-					(void __user *) (regs->cr_iip + ia64_psr(regs)->ri),
-					0, __ISR_VALID, isr);
-		}
-	}
-	return 0;
-}
-
-struct illegal_op_return {
-	unsigned long fkt, arg1, arg2, arg3;
-};
-
-struct illegal_op_return
-ia64_illegal_op_fault (unsigned long ec, long arg1, long arg2, long arg3,
-		       long arg4, long arg5, long arg6, long arg7,
-		       struct pt_regs regs)
-{
-	struct illegal_op_return rv;
-	char buf[128];
-
-#ifdef CONFIG_IA64_BRL_EMU
-	{
-		extern struct illegal_op_return ia64_emulate_brl (struct pt_regs *, unsigned long);
-
-		rv = ia64_emulate_brl(&regs, ec);
-		if (rv.fkt != (unsigned long) -1)
-			return rv;
-	}
-#endif
-
-	sprintf(buf, "IA-64 Illegal operation fault");
-	rv.fkt = 0;
-	if (die_if_kernel(buf, &regs, 0))
-		return rv;
-
-	force_sig_fault(SIGILL, ILL_ILLOPC,
-			(void __user *) (regs.cr_iip + ia64_psr(&regs)->ri),
-			0, 0, 0);
-	return rv;
-}
-
-void __kprobes
-ia64_fault (unsigned long vector, unsigned long isr, unsigned long ifa,
-	    unsigned long iim, unsigned long itir, long arg5, long arg6,
-	    long arg7, struct pt_regs regs)
-{
-	unsigned long code, error = isr, iip;
-	char buf[128];
-	int result, sig, si_code;
-	static const char *reason[] = {
-		"IA-64 Illegal Operation fault",
-		"IA-64 Privileged Operation fault",
-		"IA-64 Privileged Register fault",
-		"IA-64 Reserved Register/Field fault",
-		"Disabled Instruction Set Transition fault",
-		"Unknown fault 5", "Unknown fault 6", "Unknown fault 7", "Illegal Hazard fault",
-		"Unknown fault 9", "Unknown fault 10", "Unknown fault 11", "Unknown fault 12",
-		"Unknown fault 13", "Unknown fault 14", "Unknown fault 15"
-	};
-
-	if ((isr & IA64_ISR_NA) && ((isr & IA64_ISR_CODE_MASK) == IA64_ISR_CODE_LFETCH)) {
-		/*
-		 * This fault was due to lfetch.fault, set "ed" bit in the psr to cancel
-		 * the lfetch.
-		 */
-		ia64_psr(&regs)->ed = 1;
-		return;
-	}
-
-	iip = regs.cr_iip + ia64_psr(&regs)->ri;
-
-	switch (vector) {
-	      case 24: /* General Exception */
-		code = (isr >> 4) & 0xf;
-		sprintf(buf, "General Exception: %s%s", reason[code],
-			(code == 3) ? ((isr & (1UL << 37))
-				       ? " (RSE access)" : " (data access)") : "");
-		if (code == 8) {
-# ifdef CONFIG_IA64_PRINT_HAZARDS
-			printk("%s[%d]: possible hazard @ ip=%016lx (pr = %016lx)\n",
-			       current->comm, task_pid_nr(current),
-			       regs.cr_iip + ia64_psr(&regs)->ri, regs.pr);
-# endif
-			return;
-		}
-		break;
-
-	      case 25: /* Disabled FP-Register */
-		if (isr & 2) {
-			disabled_fph_fault(&regs);
-			return;
-		}
-		sprintf(buf, "Disabled FPL fault---not supposed to happen!");
-		break;
-
-	      case 26: /* NaT Consumption */
-		if (user_mode(&regs)) {
-			void __user *addr;
-
-			if (((isr >> 4) & 0xf) == 2) {
-				/* NaT page consumption */
-				sig = SIGSEGV;
-				code = SEGV_ACCERR;
-				addr = (void __user *) ifa;
-			} else {
-				/* register NaT consumption */
-				sig = SIGILL;
-				code = ILL_ILLOPN;
-				addr = (void __user *) (regs.cr_iip
-							+ ia64_psr(&regs)->ri);
-			}
-			force_sig_fault(sig, code, addr,
-					vector, __ISR_VALID, isr);
-			return;
-		} else if (ia64_done_with_exception(&regs))
-			return;
-		sprintf(buf, "NaT consumption");
-		break;
-
-	      case 31: /* Unsupported Data Reference */
-		if (user_mode(&regs)) {
-			force_sig_fault(SIGILL, ILL_ILLOPN, (void __user *) iip,
-					vector, __ISR_VALID, isr);
-			return;
-		}
-		sprintf(buf, "Unsupported data reference");
-		break;
-
-	      case 29: /* Debug */
-	      case 35: /* Taken Branch Trap */
-	      case 36: /* Single Step Trap */
-		if (fsys_mode(current, &regs)) {
-			extern char __kernel_syscall_via_break[];
-			/*
-			 * Got a trap in fsys-mode: Taken Branch Trap
-			 * and Single Step trap need special handling;
-			 * Debug trap is ignored (we disable it here
-			 * and re-enable it in the lower-privilege trap).
-			 */
-			if (unlikely(vector == 29)) {
-				set_thread_flag(TIF_DB_DISABLED);
-				ia64_psr(&regs)->db = 0;
-				ia64_psr(&regs)->lp = 1;
-				return;
-			}
-			/* re-do the system call via break 0x100000: */
-			regs.cr_iip = (unsigned long) __kernel_syscall_via_break;
-			ia64_psr(&regs)->ri = 0;
-			ia64_psr(&regs)->cpl = 3;
-			return;
-		}
-		switch (vector) {
-		      default:
-		      case 29:
-			si_code = TRAP_HWBKPT;
-#ifdef CONFIG_ITANIUM
-			/*
-			 * Erratum 10 (IFA may contain incorrect address) now has
-			 * "NoFix" status.  There are no plans for fixing this.
-			 */
-			if (ia64_psr(&regs)->is == 0)
-			  ifa = regs.cr_iip;
-#endif
-			break;
-		      case 35: si_code = TRAP_BRANCH; ifa = 0; break;
-		      case 36: si_code = TRAP_TRACE; ifa = 0; break;
-		}
-		if (notify_die(DIE_FAULT, "ia64_fault", &regs, vector, si_code, SIGTRAP)
-			       	== NOTIFY_STOP)
-			return;
-		force_sig_fault(SIGTRAP, si_code, (void __user *) ifa,
-				0, __ISR_VALID, isr);
-		return;
-
-	      case 32: /* fp fault */
-	      case 33: /* fp trap */
-		result = handle_fpu_swa((vector == 32) ? 1 : 0, &regs, isr);
-		if ((result < 0) || (current->thread.flags & IA64_THREAD_FPEMU_SIGFPE)) {
-			force_sig_fault(SIGFPE, FPE_FLTINV, (void __user *) iip,
-					0, __ISR_VALID, isr);
-		}
-		return;
-
-	      case 34:
-		if (isr & 0x2) {
-			/* Lower-Privilege Transfer Trap */
-
-			/* If we disabled debug traps during an fsyscall,
-			 * re-enable them here.
-			 */
-			if (test_thread_flag(TIF_DB_DISABLED)) {
-				clear_thread_flag(TIF_DB_DISABLED);
-				ia64_psr(&regs)->db = 1;
-			}
-
-			/*
-			 * Just clear PSR.lp and then return immediately:
-			 * all the interesting work (e.g., signal delivery)
-			 * is done in the kernel exit path.
-			 */
-			ia64_psr(&regs)->lp = 0;
-			return;
-		} else {
-			/* Unimplemented Instr. Address Trap */
-			if (user_mode(&regs)) {
-				force_sig_fault(SIGILL, ILL_BADIADDR,
-						(void __user *) iip,
-						0, 0, 0);
-				return;
-			}
-			sprintf(buf, "Unimplemented Instruction Address fault");
-		}
-		break;
-
-	      case 45:
-		printk(KERN_ERR "Unexpected IA-32 exception (Trap 45)\n");
-		printk(KERN_ERR "  iip - 0x%lx, ifa - 0x%lx, isr - 0x%lx\n",
-		       iip, ifa, isr);
-		force_sig(SIGSEGV);
-		return;
-
-	      case 46:
-		printk(KERN_ERR "Unexpected IA-32 intercept trap (Trap 46)\n");
-		printk(KERN_ERR "  iip - 0x%lx, ifa - 0x%lx, isr - 0x%lx, iim - 0x%lx\n",
-		       iip, ifa, isr, iim);
-		force_sig(SIGSEGV);
-		return;
-
-	      case 47:
-		sprintf(buf, "IA-32 Interruption Fault (int 0x%lx)", isr >> 16);
-		break;
-
-	      default:
-		sprintf(buf, "Fault %lu", vector);
-		break;
-	}
-	if (!die_if_kernel(buf, &regs, error))
-		force_sig(SIGILL);
-}
diff --git a/arch/ia64/kernel/unaligned.c b/arch/ia64/kernel/unaligned.c
deleted file mode 100644
index 0acb5a0cd7ab..000000000000
--- a/arch/ia64/kernel/unaligned.c
+++ /dev/null
@@ -1,1560 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * Architecture-specific unaligned trap handling.
- *
- * Copyright (C) 1999-2002, 2004 Hewlett-Packard Co
- *	Stephane Eranian <eranian@hpl.hp.com>
- *	David Mosberger-Tang <davidm@hpl.hp.com>
- *
- * 2002/12/09   Fix rotating register handling (off-by-1 error, missing fr-rotation).  Fix
- *		get_rse_reg() to not leak kernel bits to user-level (reading an out-of-frame
- *		stacked register returns an undefined value; it does NOT trigger a
- *		"rsvd register fault").
- * 2001/10/11	Fix unaligned access to rotating registers in s/w pipelined loops.
- * 2001/08/13	Correct size of extended floats (float_fsz) from 16 to 10 bytes.
- * 2001/01/17	Add support emulation of unaligned kernel accesses.
- */
-#include <linux/jiffies.h>
-#include <linux/kernel.h>
-#include <linux/sched/signal.h>
-#include <linux/tty.h>
-#include <linux/extable.h>
-#include <linux/ratelimit.h>
-#include <linux/uaccess.h>
-
-#include <asm/intrinsics.h>
-#include <asm/processor.h>
-#include <asm/rse.h>
-#include <asm/exception.h>
-#include <asm/unaligned.h>
-
-extern int die_if_kernel(char *str, struct pt_regs *regs, long err);
-
-#undef DEBUG_UNALIGNED_TRAP
-
-#ifdef DEBUG_UNALIGNED_TRAP
-# define DPRINT(a...)	do { printk("%s %u: ", __func__, __LINE__); printk (a); } while (0)
-# define DDUMP(str,vp,len)	dump(str, vp, len)
-
-static void
-dump (const char *str, void *vp, size_t len)
-{
-	unsigned char *cp = vp;
-	int i;
-
-	printk("%s", str);
-	for (i = 0; i < len; ++i)
-		printk (" %02x", *cp++);
-	printk("\n");
-}
-#else
-# define DPRINT(a...)
-# define DDUMP(str,vp,len)
-#endif
-
-#define IA64_FIRST_STACKED_GR	32
-#define IA64_FIRST_ROTATING_FR	32
-#define SIGN_EXT9		0xffffffffffffff00ul
-
-/*
- *  sysctl settable hook which tells the kernel whether to honor the
- *  IA64_THREAD_UAC_NOPRINT prctl.  Because this is user settable, we want
- *  to allow the super user to enable/disable this for security reasons
- *  (i.e. don't allow attacker to fill up logs with unaligned accesses).
- */
-int no_unaligned_warning;
-int unaligned_dump_stack;
-
-/*
- * For M-unit:
- *
- *  opcode |   m  |   x6    |
- * --------|------|---------|
- * [40-37] | [36] | [35:30] |
- * --------|------|---------|
- *     4   |   1  |    6    | = 11 bits
- * --------------------------
- * However bits [31:30] are not directly useful to distinguish between
- * load/store so we can use [35:32] instead, which gives the following
- * mask ([40:32]) using 9 bits. The 'e' comes from the fact that we defer
- * checking the m-bit until later in the load/store emulation.
- */
-#define IA64_OPCODE_MASK	0x1ef
-#define IA64_OPCODE_SHIFT	32
-
-/*
- * Table C-28 Integer Load/Store
- *
- * We ignore [35:32]= 0x6, 0x7, 0xE, 0xF
- *
- * ld8.fill, st8.fill  MUST be aligned because the RNATs are based on
- * the address (bits [8:3]), so we must failed.
- */
-#define LD_OP            0x080
-#define LDS_OP           0x081
-#define LDA_OP           0x082
-#define LDSA_OP          0x083
-#define LDBIAS_OP        0x084
-#define LDACQ_OP         0x085
-/* 0x086, 0x087 are not relevant */
-#define LDCCLR_OP        0x088
-#define LDCNC_OP         0x089
-#define LDCCLRACQ_OP     0x08a
-#define ST_OP            0x08c
-#define STREL_OP         0x08d
-/* 0x08e,0x8f are not relevant */
-
-/*
- * Table C-29 Integer Load +Reg
- *
- * we use the ld->m (bit [36:36]) field to determine whether or not we have
- * a load/store of this form.
- */
-
-/*
- * Table C-30 Integer Load/Store +Imm
- *
- * We ignore [35:32]= 0x6, 0x7, 0xE, 0xF
- *
- * ld8.fill, st8.fill  must be aligned because the Nat register are based on
- * the address, so we must fail and the program must be fixed.
- */
-#define LD_IMM_OP            0x0a0
-#define LDS_IMM_OP           0x0a1
-#define LDA_IMM_OP           0x0a2
-#define LDSA_IMM_OP          0x0a3
-#define LDBIAS_IMM_OP        0x0a4
-#define LDACQ_IMM_OP         0x0a5
-/* 0x0a6, 0xa7 are not relevant */
-#define LDCCLR_IMM_OP        0x0a8
-#define LDCNC_IMM_OP         0x0a9
-#define LDCCLRACQ_IMM_OP     0x0aa
-#define ST_IMM_OP            0x0ac
-#define STREL_IMM_OP         0x0ad
-/* 0x0ae,0xaf are not relevant */
-
-/*
- * Table C-32 Floating-point Load/Store
- */
-#define LDF_OP           0x0c0
-#define LDFS_OP          0x0c1
-#define LDFA_OP          0x0c2
-#define LDFSA_OP         0x0c3
-/* 0x0c6 is irrelevant */
-#define LDFCCLR_OP       0x0c8
-#define LDFCNC_OP        0x0c9
-/* 0x0cb is irrelevant  */
-#define STF_OP           0x0cc
-
-/*
- * Table C-33 Floating-point Load +Reg
- *
- * we use the ld->m (bit [36:36]) field to determine whether or not we have
- * a load/store of this form.
- */
-
-/*
- * Table C-34 Floating-point Load/Store +Imm
- */
-#define LDF_IMM_OP       0x0e0
-#define LDFS_IMM_OP      0x0e1
-#define LDFA_IMM_OP      0x0e2
-#define LDFSA_IMM_OP     0x0e3
-/* 0x0e6 is irrelevant */
-#define LDFCCLR_IMM_OP   0x0e8
-#define LDFCNC_IMM_OP    0x0e9
-#define STF_IMM_OP       0x0ec
-
-typedef struct {
-	unsigned long	 qp:6;	/* [0:5]   */
-	unsigned long    r1:7;	/* [6:12]  */
-	unsigned long   imm:7;	/* [13:19] */
-	unsigned long    r3:7;	/* [20:26] */
-	unsigned long     x:1;  /* [27:27] */
-	unsigned long  hint:2;	/* [28:29] */
-	unsigned long x6_sz:2;	/* [30:31] */
-	unsigned long x6_op:4;	/* [32:35], x6 = x6_sz|x6_op */
-	unsigned long     m:1;	/* [36:36] */
-	unsigned long    op:4;	/* [37:40] */
-	unsigned long   pad:23; /* [41:63] */
-} load_store_t;
-
-
-typedef enum {
-	UPD_IMMEDIATE,	/* ldXZ r1=[r3],imm(9) */
-	UPD_REG		/* ldXZ r1=[r3],r2     */
-} update_t;
-
-/*
- * We use tables to keep track of the offsets of registers in the saved state.
- * This way we save having big switch/case statements.
- *
- * We use bit 0 to indicate switch_stack or pt_regs.
- * The offset is simply shifted by 1 bit.
- * A 2-byte value should be enough to hold any kind of offset
- *
- * In case the calling convention changes (and thus pt_regs/switch_stack)
- * simply use RSW instead of RPT or vice-versa.
- */
-
-#define RPO(x)	((size_t) &((struct pt_regs *)0)->x)
-#define RSO(x)	((size_t) &((struct switch_stack *)0)->x)
-
-#define RPT(x)		(RPO(x) << 1)
-#define RSW(x)		(1| RSO(x)<<1)
-
-#define GR_OFFS(x)	(gr_info[x]>>1)
-#define GR_IN_SW(x)	(gr_info[x] & 0x1)
-
-#define FR_OFFS(x)	(fr_info[x]>>1)
-#define FR_IN_SW(x)	(fr_info[x] & 0x1)
-
-static u16 gr_info[32]={
-	0,			/* r0 is read-only : WE SHOULD NEVER GET THIS */
-
-	RPT(r1), RPT(r2), RPT(r3),
-
-	RSW(r4), RSW(r5), RSW(r6), RSW(r7),
-
-	RPT(r8), RPT(r9), RPT(r10), RPT(r11),
-	RPT(r12), RPT(r13), RPT(r14), RPT(r15),
-
-	RPT(r16), RPT(r17), RPT(r18), RPT(r19),
-	RPT(r20), RPT(r21), RPT(r22), RPT(r23),
-	RPT(r24), RPT(r25), RPT(r26), RPT(r27),
-	RPT(r28), RPT(r29), RPT(r30), RPT(r31)
-};
-
-static u16 fr_info[32]={
-	0,			/* constant : WE SHOULD NEVER GET THIS */
-	0,			/* constant : WE SHOULD NEVER GET THIS */
-
-	RSW(f2), RSW(f3), RSW(f4), RSW(f5),
-
-	RPT(f6), RPT(f7), RPT(f8), RPT(f9),
-	RPT(f10), RPT(f11),
-
-	RSW(f12), RSW(f13), RSW(f14),
-	RSW(f15), RSW(f16), RSW(f17), RSW(f18), RSW(f19),
-	RSW(f20), RSW(f21), RSW(f22), RSW(f23), RSW(f24),
-	RSW(f25), RSW(f26), RSW(f27), RSW(f28), RSW(f29),
-	RSW(f30), RSW(f31)
-};
-
-/* Invalidate ALAT entry for integer register REGNO.  */
-static void
-invala_gr (int regno)
-{
-#	define F(reg)	case reg: ia64_invala_gr(reg); break
-
-	switch (regno) {
-		F(  0); F(  1); F(  2); F(  3); F(  4); F(  5); F(  6); F(  7);
-		F(  8); F(  9); F( 10); F( 11); F( 12); F( 13); F( 14); F( 15);
-		F( 16); F( 17); F( 18); F( 19); F( 20); F( 21); F( 22); F( 23);
-		F( 24); F( 25); F( 26); F( 27); F( 28); F( 29); F( 30); F( 31);
-		F( 32); F( 33); F( 34); F( 35); F( 36); F( 37); F( 38); F( 39);
-		F( 40); F( 41); F( 42); F( 43); F( 44); F( 45); F( 46); F( 47);
-		F( 48); F( 49); F( 50); F( 51); F( 52); F( 53); F( 54); F( 55);
-		F( 56); F( 57); F( 58); F( 59); F( 60); F( 61); F( 62); F( 63);
-		F( 64); F( 65); F( 66); F( 67); F( 68); F( 69); F( 70); F( 71);
-		F( 72); F( 73); F( 74); F( 75); F( 76); F( 77); F( 78); F( 79);
-		F( 80); F( 81); F( 82); F( 83); F( 84); F( 85); F( 86); F( 87);
-		F( 88); F( 89); F( 90); F( 91); F( 92); F( 93); F( 94); F( 95);
-		F( 96); F( 97); F( 98); F( 99); F(100); F(101); F(102); F(103);
-		F(104); F(105); F(106); F(107); F(108); F(109); F(110); F(111);
-		F(112); F(113); F(114); F(115); F(116); F(117); F(118); F(119);
-		F(120); F(121); F(122); F(123); F(124); F(125); F(126); F(127);
-	}
-#	undef F
-}
-
-/* Invalidate ALAT entry for floating-point register REGNO.  */
-static void
-invala_fr (int regno)
-{
-#	define F(reg)	case reg: ia64_invala_fr(reg); break
-
-	switch (regno) {
-		F(  0); F(  1); F(  2); F(  3); F(  4); F(  5); F(  6); F(  7);
-		F(  8); F(  9); F( 10); F( 11); F( 12); F( 13); F( 14); F( 15);
-		F( 16); F( 17); F( 18); F( 19); F( 20); F( 21); F( 22); F( 23);
-		F( 24); F( 25); F( 26); F( 27); F( 28); F( 29); F( 30); F( 31);
-		F( 32); F( 33); F( 34); F( 35); F( 36); F( 37); F( 38); F( 39);
-		F( 40); F( 41); F( 42); F( 43); F( 44); F( 45); F( 46); F( 47);
-		F( 48); F( 49); F( 50); F( 51); F( 52); F( 53); F( 54); F( 55);
-		F( 56); F( 57); F( 58); F( 59); F( 60); F( 61); F( 62); F( 63);
-		F( 64); F( 65); F( 66); F( 67); F( 68); F( 69); F( 70); F( 71);
-		F( 72); F( 73); F( 74); F( 75); F( 76); F( 77); F( 78); F( 79);
-		F( 80); F( 81); F( 82); F( 83); F( 84); F( 85); F( 86); F( 87);
-		F( 88); F( 89); F( 90); F( 91); F( 92); F( 93); F( 94); F( 95);
-		F( 96); F( 97); F( 98); F( 99); F(100); F(101); F(102); F(103);
-		F(104); F(105); F(106); F(107); F(108); F(109); F(110); F(111);
-		F(112); F(113); F(114); F(115); F(116); F(117); F(118); F(119);
-		F(120); F(121); F(122); F(123); F(124); F(125); F(126); F(127);
-	}
-#	undef F
-}
-
-static inline unsigned long
-rotate_reg (unsigned long sor, unsigned long rrb, unsigned long reg)
-{
-	reg += rrb;
-	if (reg >= sor)
-		reg -= sor;
-	return reg;
-}
-
-static void
-set_rse_reg (struct pt_regs *regs, unsigned long r1, unsigned long val, int nat)
-{
-	struct switch_stack *sw = (struct switch_stack *) regs - 1;
-	unsigned long *bsp, *bspstore, *addr, *rnat_addr, *ubs_end;
-	unsigned long *kbs = (void *) current + IA64_RBS_OFFSET;
-	unsigned long rnats, nat_mask;
-	unsigned long on_kbs;
-	long sof = (regs->cr_ifs) & 0x7f;
-	long sor = 8 * ((regs->cr_ifs >> 14) & 0xf);
-	long rrb_gr = (regs->cr_ifs >> 18) & 0x7f;
-	long ridx = r1 - 32;
-
-	if (ridx >= sof) {
-		/* this should never happen, as the "rsvd register fault" has higher priority */
-		DPRINT("ignoring write to r%lu; only %lu registers are allocated!\n", r1, sof);
-		return;
-	}
-
-	if (ridx < sor)
-		ridx = rotate_reg(sor, rrb_gr, ridx);
-
-	DPRINT("r%lu, sw.bspstore=%lx pt.bspstore=%lx sof=%ld sol=%ld ridx=%ld\n",
-	       r1, sw->ar_bspstore, regs->ar_bspstore, sof, (regs->cr_ifs >> 7) & 0x7f, ridx);
-
-	on_kbs = ia64_rse_num_regs(kbs, (unsigned long *) sw->ar_bspstore);
-	addr = ia64_rse_skip_regs((unsigned long *) sw->ar_bspstore, -sof + ridx);
-	if (addr >= kbs) {
-		/* the register is on the kernel backing store: easy... */
-		rnat_addr = ia64_rse_rnat_addr(addr);
-		if ((unsigned long) rnat_addr >= sw->ar_bspstore)
-			rnat_addr = &sw->ar_rnat;
-		nat_mask = 1UL << ia64_rse_slot_num(addr);
-
-		*addr = val;
-		if (nat)
-			*rnat_addr |=  nat_mask;
-		else
-			*rnat_addr &= ~nat_mask;
-		return;
-	}
-
-	if (!user_stack(current, regs)) {
-		DPRINT("ignoring kernel write to r%lu; register isn't on the kernel RBS!", r1);
-		return;
-	}
-
-	bspstore = (unsigned long *)regs->ar_bspstore;
-	ubs_end = ia64_rse_skip_regs(bspstore, on_kbs);
-	bsp     = ia64_rse_skip_regs(ubs_end, -sof);
-	addr    = ia64_rse_skip_regs(bsp, ridx);
-
-	DPRINT("ubs_end=%p bsp=%p addr=%p\n", (void *) ubs_end, (void *) bsp, (void *) addr);
-
-	ia64_poke(current, sw, (unsigned long) ubs_end, (unsigned long) addr, val);
-
-	rnat_addr = ia64_rse_rnat_addr(addr);
-
-	ia64_peek(current, sw, (unsigned long) ubs_end, (unsigned long) rnat_addr, &rnats);
-	DPRINT("rnat @%p = 0x%lx nat=%d old nat=%ld\n",
-	       (void *) rnat_addr, rnats, nat, (rnats >> ia64_rse_slot_num(addr)) & 1);
-
-	nat_mask = 1UL << ia64_rse_slot_num(addr);
-	if (nat)
-		rnats |=  nat_mask;
-	else
-		rnats &= ~nat_mask;
-	ia64_poke(current, sw, (unsigned long) ubs_end, (unsigned long) rnat_addr, rnats);
-
-	DPRINT("rnat changed to @%p = 0x%lx\n", (void *) rnat_addr, rnats);
-}
-
-
-static void
-get_rse_reg (struct pt_regs *regs, unsigned long r1, unsigned long *val, int *nat)
-{
-	struct switch_stack *sw = (struct switch_stack *) regs - 1;
-	unsigned long *bsp, *addr, *rnat_addr, *ubs_end, *bspstore;
-	unsigned long *kbs = (void *) current + IA64_RBS_OFFSET;
-	unsigned long rnats, nat_mask;
-	unsigned long on_kbs;
-	long sof = (regs->cr_ifs) & 0x7f;
-	long sor = 8 * ((regs->cr_ifs >> 14) & 0xf);
-	long rrb_gr = (regs->cr_ifs >> 18) & 0x7f;
-	long ridx = r1 - 32;
-
-	if (ridx >= sof) {
-		/* read of out-of-frame register returns an undefined value; 0 in our case.  */
-		DPRINT("ignoring read from r%lu; only %lu registers are allocated!\n", r1, sof);
-		goto fail;
-	}
-
-	if (ridx < sor)
-		ridx = rotate_reg(sor, rrb_gr, ridx);
-
-	DPRINT("r%lu, sw.bspstore=%lx pt.bspstore=%lx sof=%ld sol=%ld ridx=%ld\n",
-	       r1, sw->ar_bspstore, regs->ar_bspstore, sof, (regs->cr_ifs >> 7) & 0x7f, ridx);
-
-	on_kbs = ia64_rse_num_regs(kbs, (unsigned long *) sw->ar_bspstore);
-	addr = ia64_rse_skip_regs((unsigned long *) sw->ar_bspstore, -sof + ridx);
-	if (addr >= kbs) {
-		/* the register is on the kernel backing store: easy... */
-		*val = *addr;
-		if (nat) {
-			rnat_addr = ia64_rse_rnat_addr(addr);
-			if ((unsigned long) rnat_addr >= sw->ar_bspstore)
-				rnat_addr = &sw->ar_rnat;
-			nat_mask = 1UL << ia64_rse_slot_num(addr);
-			*nat = (*rnat_addr & nat_mask) != 0;
-		}
-		return;
-	}
-
-	if (!user_stack(current, regs)) {
-		DPRINT("ignoring kernel read of r%lu; register isn't on the RBS!", r1);
-		goto fail;
-	}
-
-	bspstore = (unsigned long *)regs->ar_bspstore;
-	ubs_end = ia64_rse_skip_regs(bspstore, on_kbs);
-	bsp     = ia64_rse_skip_regs(ubs_end, -sof);
-	addr    = ia64_rse_skip_regs(bsp, ridx);
-
-	DPRINT("ubs_end=%p bsp=%p addr=%p\n", (void *) ubs_end, (void *) bsp, (void *) addr);
-
-	ia64_peek(current, sw, (unsigned long) ubs_end, (unsigned long) addr, val);
-
-	if (nat) {
-		rnat_addr = ia64_rse_rnat_addr(addr);
-		nat_mask = 1UL << ia64_rse_slot_num(addr);
-
-		DPRINT("rnat @%p = 0x%lx\n", (void *) rnat_addr, rnats);
-
-		ia64_peek(current, sw, (unsigned long) ubs_end, (unsigned long) rnat_addr, &rnats);
-		*nat = (rnats & nat_mask) != 0;
-	}
-	return;
-
-  fail:
-	*val = 0;
-	if (nat)
-		*nat = 0;
-	return;
-}
-
-
-static void
-setreg (unsigned long regnum, unsigned long val, int nat, struct pt_regs *regs)
-{
-	struct switch_stack *sw = (struct switch_stack *) regs - 1;
-	unsigned long addr;
-	unsigned long bitmask;
-	unsigned long *unat;
-
-	/*
-	 * First takes care of stacked registers
-	 */
-	if (regnum >= IA64_FIRST_STACKED_GR) {
-		set_rse_reg(regs, regnum, val, nat);
-		return;
-	}
-
-	/*
-	 * Using r0 as a target raises a General Exception fault which has higher priority
-	 * than the Unaligned Reference fault.
-	 */
-
-	/*
-	 * Now look at registers in [0-31] range and init correct UNAT
-	 */
-	if (GR_IN_SW(regnum)) {
-		addr = (unsigned long)sw;
-		unat = &sw->ar_unat;
-	} else {
-		addr = (unsigned long)regs;
-		unat = &sw->caller_unat;
-	}
-	DPRINT("tmp_base=%lx switch_stack=%s offset=%d\n",
-	       addr, unat==&sw->ar_unat ? "yes":"no", GR_OFFS(regnum));
-	/*
-	 * add offset from base of struct
-	 * and do it !
-	 */
-	addr += GR_OFFS(regnum);
-
-	*(unsigned long *)addr = val;
-
-	/*
-	 * We need to clear the corresponding UNAT bit to fully emulate the load
-	 * UNAT bit_pos = GR[r3]{8:3} form EAS-2.4
-	 */
-	bitmask   = 1UL << (addr >> 3 & 0x3f);
-	DPRINT("*0x%lx=0x%lx NaT=%d prev_unat @%p=%lx\n", addr, val, nat, (void *) unat, *unat);
-	if (nat) {
-		*unat |= bitmask;
-	} else {
-		*unat &= ~bitmask;
-	}
-	DPRINT("*0x%lx=0x%lx NaT=%d new unat: %p=%lx\n", addr, val, nat, (void *) unat,*unat);
-}
-
-/*
- * Return the (rotated) index for floating point register REGNUM (REGNUM must be in the
- * range from 32-127, result is in the range from 0-95.
- */
-static inline unsigned long
-fph_index (struct pt_regs *regs, long regnum)
-{
-	unsigned long rrb_fr = (regs->cr_ifs >> 25) & 0x7f;
-	return rotate_reg(96, rrb_fr, (regnum - IA64_FIRST_ROTATING_FR));
-}
-
-static void
-setfpreg (unsigned long regnum, struct ia64_fpreg *fpval, struct pt_regs *regs)
-{
-	struct switch_stack *sw = (struct switch_stack *)regs - 1;
-	unsigned long addr;
-
-	/*
-	 * From EAS-2.5: FPDisableFault has higher priority than Unaligned
-	 * Fault. Thus, when we get here, we know the partition is enabled.
-	 * To update f32-f127, there are three choices:
-	 *
-	 *	(1) save f32-f127 to thread.fph and update the values there
-	 *	(2) use a gigantic switch statement to directly access the registers
-	 *	(3) generate code on the fly to update the desired register
-	 *
-	 * For now, we are using approach (1).
-	 */
-	if (regnum >= IA64_FIRST_ROTATING_FR) {
-		ia64_sync_fph(current);
-		current->thread.fph[fph_index(regs, regnum)] = *fpval;
-	} else {
-		/*
-		 * pt_regs or switch_stack ?
-		 */
-		if (FR_IN_SW(regnum)) {
-			addr = (unsigned long)sw;
-		} else {
-			addr = (unsigned long)regs;
-		}
-
-		DPRINT("tmp_base=%lx offset=%d\n", addr, FR_OFFS(regnum));
-
-		addr += FR_OFFS(regnum);
-		*(struct ia64_fpreg *)addr = *fpval;
-
-		/*
-		 * mark the low partition as being used now
-		 *
-		 * It is highly unlikely that this bit is not already set, but
-		 * let's do it for safety.
-		 */
-		regs->cr_ipsr |= IA64_PSR_MFL;
-	}
-}
-
-/*
- * Those 2 inline functions generate the spilled versions of the constant floating point
- * registers which can be used with stfX
- */
-static inline void
-float_spill_f0 (struct ia64_fpreg *final)
-{
-	ia64_stf_spill(final, 0);
-}
-
-static inline void
-float_spill_f1 (struct ia64_fpreg *final)
-{
-	ia64_stf_spill(final, 1);
-}
-
-static void
-getfpreg (unsigned long regnum, struct ia64_fpreg *fpval, struct pt_regs *regs)
-{
-	struct switch_stack *sw = (struct switch_stack *) regs - 1;
-	unsigned long addr;
-
-	/*
-	 * From EAS-2.5: FPDisableFault has higher priority than
-	 * Unaligned Fault. Thus, when we get here, we know the partition is
-	 * enabled.
-	 *
-	 * When regnum > 31, the register is still live and we need to force a save
-	 * to current->thread.fph to get access to it.  See discussion in setfpreg()
-	 * for reasons and other ways of doing this.
-	 */
-	if (regnum >= IA64_FIRST_ROTATING_FR) {
-		ia64_flush_fph(current);
-		*fpval = current->thread.fph[fph_index(regs, regnum)];
-	} else {
-		/*
-		 * f0 = 0.0, f1= 1.0. Those registers are constant and are thus
-		 * not saved, we must generate their spilled form on the fly
-		 */
-		switch(regnum) {
-		case 0:
-			float_spill_f0(fpval);
-			break;
-		case 1:
-			float_spill_f1(fpval);
-			break;
-		default:
-			/*
-			 * pt_regs or switch_stack ?
-			 */
-			addr =  FR_IN_SW(regnum) ? (unsigned long)sw
-						 : (unsigned long)regs;
-
-			DPRINT("is_sw=%d tmp_base=%lx offset=0x%x\n",
-			       FR_IN_SW(regnum), addr, FR_OFFS(regnum));
-
-			addr  += FR_OFFS(regnum);
-			*fpval = *(struct ia64_fpreg *)addr;
-		}
-	}
-}
-
-
-static void
-getreg (unsigned long regnum, unsigned long *val, int *nat, struct pt_regs *regs)
-{
-	struct switch_stack *sw = (struct switch_stack *) regs - 1;
-	unsigned long addr, *unat;
-
-	if (regnum >= IA64_FIRST_STACKED_GR) {
-		get_rse_reg(regs, regnum, val, nat);
-		return;
-	}
-
-	/*
-	 * take care of r0 (read-only always evaluate to 0)
-	 */
-	if (regnum == 0) {
-		*val = 0;
-		if (nat)
-			*nat = 0;
-		return;
-	}
-
-	/*
-	 * Now look at registers in [0-31] range and init correct UNAT
-	 */
-	if (GR_IN_SW(regnum)) {
-		addr = (unsigned long)sw;
-		unat = &sw->ar_unat;
-	} else {
-		addr = (unsigned long)regs;
-		unat = &sw->caller_unat;
-	}
-
-	DPRINT("addr_base=%lx offset=0x%x\n", addr,  GR_OFFS(regnum));
-
-	addr += GR_OFFS(regnum);
-
-	*val  = *(unsigned long *)addr;
-
-	/*
-	 * do it only when requested
-	 */
-	if (nat)
-		*nat  = (*unat >> (addr >> 3 & 0x3f)) & 0x1UL;
-}
-
-static void
-emulate_load_updates (update_t type, load_store_t ld, struct pt_regs *regs, unsigned long ifa)
-{
-	/*
-	 * IMPORTANT:
-	 * Given the way we handle unaligned speculative loads, we should
-	 * not get to this point in the code but we keep this sanity check,
-	 * just in case.
-	 */
-	if (ld.x6_op == 1 || ld.x6_op == 3) {
-		printk(KERN_ERR "%s: register update on speculative load, error\n", __func__);
-		if (die_if_kernel("unaligned reference on speculative load with register update\n",
-				  regs, 30))
-			return;
-	}
-
-
-	/*
-	 * at this point, we know that the base register to update is valid i.e.,
-	 * it's not r0
-	 */
-	if (type == UPD_IMMEDIATE) {
-		unsigned long imm;
-
-		/*
-		 * Load +Imm: ldXZ r1=[r3],imm(9)
-		 *
-		 *
-		 * form imm9: [13:19] contain the first 7 bits
-		 */
-		imm = ld.x << 7 | ld.imm;
-
-		/*
-		 * sign extend (1+8bits) if m set
-		 */
-		if (ld.m) imm |= SIGN_EXT9;
-
-		/*
-		 * ifa == r3 and we know that the NaT bit on r3 was clear so
-		 * we can directly use ifa.
-		 */
-		ifa += imm;
-
-		setreg(ld.r3, ifa, 0, regs);
-
-		DPRINT("ld.x=%d ld.m=%d imm=%ld r3=0x%lx\n", ld.x, ld.m, imm, ifa);
-
-	} else if (ld.m) {
-		unsigned long r2;
-		int nat_r2;
-
-		/*
-		 * Load +Reg Opcode: ldXZ r1=[r3],r2
-		 *
-		 * Note: that we update r3 even in the case of ldfX.a
-		 * (where the load does not happen)
-		 *
-		 * The way the load algorithm works, we know that r3 does not
-		 * have its NaT bit set (would have gotten NaT consumption
-		 * before getting the unaligned fault). So we can use ifa
-		 * which equals r3 at this point.
-		 *
-		 * IMPORTANT:
-		 * The above statement holds ONLY because we know that we
-		 * never reach this code when trying to do a ldX.s.
-		 * If we ever make it to here on an ldfX.s then
-		 */
-		getreg(ld.imm, &r2, &nat_r2, regs);
-
-		ifa += r2;
-
-		/*
-		 * propagate Nat r2 -> r3
-		 */
-		setreg(ld.r3, ifa, nat_r2, regs);
-
-		DPRINT("imm=%d r2=%ld r3=0x%lx nat_r2=%d\n",ld.imm, r2, ifa, nat_r2);
-	}
-}
-
-static int emulate_store(unsigned long ifa, void *val, int len, bool kernel_mode)
-{
-	if (kernel_mode)
-		return copy_to_kernel_nofault((void *)ifa, val, len);
-
-	return copy_to_user((void __user *)ifa, val, len);
-}
-
-static int emulate_load(void *val, unsigned long ifa, int len, bool kernel_mode)
-{
-	if (kernel_mode)
-	       return copy_from_kernel_nofault(val, (void *)ifa, len);
-
-	return copy_from_user(val, (void __user *)ifa, len);
-}
-
-static int
-emulate_load_int (unsigned long ifa, load_store_t ld, struct pt_regs *regs,
-		  bool kernel_mode)
-{
-	unsigned int len = 1 << ld.x6_sz;
-	unsigned long val = 0;
-
-	/*
-	 * r0, as target, doesn't need to be checked because Illegal Instruction
-	 * faults have higher priority than unaligned faults.
-	 *
-	 * r0 cannot be found as the base as it would never generate an
-	 * unaligned reference.
-	 */
-
-	/*
-	 * ldX.a we will emulate load and also invalidate the ALAT entry.
-	 * See comment below for explanation on how we handle ldX.a
-	 */
-
-	if (len != 2 && len != 4 && len != 8) {
-		DPRINT("unknown size: x6=%d\n", ld.x6_sz);
-		return -1;
-	}
-	/* this assumes little-endian byte-order: */
-	if (emulate_load(&val, ifa, len, kernel_mode))
-		return -1;
-	setreg(ld.r1, val, 0, regs);
-
-	/*
-	 * check for updates on any kind of loads
-	 */
-	if (ld.op == 0x5 || ld.m)
-		emulate_load_updates(ld.op == 0x5 ? UPD_IMMEDIATE: UPD_REG, ld, regs, ifa);
-
-	/*
-	 * handling of various loads (based on EAS2.4):
-	 *
-	 * ldX.acq (ordered load):
-	 *	- acquire semantics would have been used, so force fence instead.
-	 *
-	 * ldX.c.clr (check load and clear):
-	 *	- if we get to this handler, it's because the entry was not in the ALAT.
-	 *	  Therefore the operation reverts to a normal load
-	 *
-	 * ldX.c.nc (check load no clear):
-	 *	- same as previous one
-	 *
-	 * ldX.c.clr.acq (ordered check load and clear):
-	 *	- same as above for c.clr part. The load needs to have acquire semantics. So
-	 *	  we use the fence semantics which is stronger and thus ensures correctness.
-	 *
-	 * ldX.a (advanced load):
-	 *	- suppose ldX.a r1=[r3]. If we get to the unaligned trap it's because the
-	 *	  address doesn't match requested size alignment. This means that we would
-	 *	  possibly need more than one load to get the result.
-	 *
-	 *	  The load part can be handled just like a normal load, however the difficult
-	 *	  part is to get the right thing into the ALAT. The critical piece of information
-	 *	  in the base address of the load & size. To do that, a ld.a must be executed,
-	 *	  clearly any address can be pushed into the table by using ld1.a r1=[r3]. Now
-	 *	  if we use the same target register, we will be okay for the check.a instruction.
-	 *	  If we look at the store, basically a stX [r3]=r1 checks the ALAT  for any entry
-	 *	  which would overlap within [r3,r3+X] (the size of the load was store in the
-	 *	  ALAT). If such an entry is found the entry is invalidated. But this is not good
-	 *	  enough, take the following example:
-	 *		r3=3
-	 *		ld4.a r1=[r3]
-	 *
-	 *	  Could be emulated by doing:
-	 *		ld1.a r1=[r3],1
-	 *		store to temporary;
-	 *		ld1.a r1=[r3],1
-	 *		store & shift to temporary;
-	 *		ld1.a r1=[r3],1
-	 *		store & shift to temporary;
-	 *		ld1.a r1=[r3]
-	 *		store & shift to temporary;
-	 *		r1=temporary
-	 *
-	 *	  So in this case, you would get the right value is r1 but the wrong info in
-	 *	  the ALAT.  Notice that you could do it in reverse to finish with address 3
-	 *	  but you would still get the size wrong.  To get the size right, one needs to
-	 *	  execute exactly the same kind of load. You could do it from a aligned
-	 *	  temporary location, but you would get the address wrong.
-	 *
-	 *	  So no matter what, it is not possible to emulate an advanced load
-	 *	  correctly. But is that really critical ?
-	 *
-	 *	  We will always convert ld.a into a normal load with ALAT invalidated.  This
-	 *	  will enable compiler to do optimization where certain code path after ld.a
-	 *	  is not required to have ld.c/chk.a, e.g., code path with no intervening stores.
-	 *
-	 *	  If there is a store after the advanced load, one must either do a ld.c.* or
-	 *	  chk.a.* to reuse the value stored in the ALAT. Both can "fail" (meaning no
-	 *	  entry found in ALAT), and that's perfectly ok because:
-	 *
-	 *		- ld.c.*, if the entry is not present a  normal load is executed
-	 *		- chk.a.*, if the entry is not present, execution jumps to recovery code
-	 *
-	 *	  In either case, the load can be potentially retried in another form.
-	 *
-	 *	  ALAT must be invalidated for the register (so that chk.a or ld.c don't pick
-	 *	  up a stale entry later). The register base update MUST also be performed.
-	 */
-
-	/*
-	 * when the load has the .acq completer then
-	 * use ordering fence.
-	 */
-	if (ld.x6_op == 0x5 || ld.x6_op == 0xa)
-		mb();
-
-	/*
-	 * invalidate ALAT entry in case of advanced load
-	 */
-	if (ld.x6_op == 0x2)
-		invala_gr(ld.r1);
-
-	return 0;
-}
-
-static int
-emulate_store_int (unsigned long ifa, load_store_t ld, struct pt_regs *regs,
-		   bool kernel_mode)
-{
-	unsigned long r2;
-	unsigned int len = 1 << ld.x6_sz;
-
-	/*
-	 * if we get to this handler, Nat bits on both r3 and r2 have already
-	 * been checked. so we don't need to do it
-	 *
-	 * extract the value to be stored
-	 */
-	getreg(ld.imm, &r2, NULL, regs);
-
-	/*
-	 * we rely on the macros in unaligned.h for now i.e.,
-	 * we let the compiler figure out how to read memory gracefully.
-	 *
-	 * We need this switch/case because the way the inline function
-	 * works. The code is optimized by the compiler and looks like
-	 * a single switch/case.
-	 */
-	DPRINT("st%d [%lx]=%lx\n", len, ifa, r2);
-
-	if (len != 2 && len != 4 && len != 8) {
-		DPRINT("unknown size: x6=%d\n", ld.x6_sz);
-		return -1;
-	}
-
-	/* this assumes little-endian byte-order: */
-	if (emulate_store(ifa, &r2, len, kernel_mode))
-		return -1;
-
-	/*
-	 * stX [r3]=r2,imm(9)
-	 *
-	 * NOTE:
-	 * ld.r3 can never be r0, because r0 would not generate an
-	 * unaligned access.
-	 */
-	if (ld.op == 0x5) {
-		unsigned long imm;
-
-		/*
-		 * form imm9: [12:6] contain first 7bits
-		 */
-		imm = ld.x << 7 | ld.r1;
-		/*
-		 * sign extend (8bits) if m set
-		 */
-		if (ld.m) imm |= SIGN_EXT9;
-		/*
-		 * ifa == r3 (NaT is necessarily cleared)
-		 */
-		ifa += imm;
-
-		DPRINT("imm=%lx r3=%lx\n", imm, ifa);
-
-		setreg(ld.r3, ifa, 0, regs);
-	}
-	/*
-	 * we don't have alat_invalidate_multiple() so we need
-	 * to do the complete flush :-<<
-	 */
-	ia64_invala();
-
-	/*
-	 * stX.rel: use fence instead of release
-	 */
-	if (ld.x6_op == 0xd)
-		mb();
-
-	return 0;
-}
-
-/*
- * floating point operations sizes in bytes
- */
-static const unsigned char float_fsz[4]={
-	10, /* extended precision (e) */
-	8,  /* integer (8)            */
-	4,  /* single precision (s)   */
-	8   /* double precision (d)   */
-};
-
-static inline void
-mem2float_extended (struct ia64_fpreg *init, struct ia64_fpreg *final)
-{
-	ia64_ldfe(6, init);
-	ia64_stop();
-	ia64_stf_spill(final, 6);
-}
-
-static inline void
-mem2float_integer (struct ia64_fpreg *init, struct ia64_fpreg *final)
-{
-	ia64_ldf8(6, init);
-	ia64_stop();
-	ia64_stf_spill(final, 6);
-}
-
-static inline void
-mem2float_single (struct ia64_fpreg *init, struct ia64_fpreg *final)
-{
-	ia64_ldfs(6, init);
-	ia64_stop();
-	ia64_stf_spill(final, 6);
-}
-
-static inline void
-mem2float_double (struct ia64_fpreg *init, struct ia64_fpreg *final)
-{
-	ia64_ldfd(6, init);
-	ia64_stop();
-	ia64_stf_spill(final, 6);
-}
-
-static inline void
-float2mem_extended (struct ia64_fpreg *init, struct ia64_fpreg *final)
-{
-	ia64_ldf_fill(6, init);
-	ia64_stop();
-	ia64_stfe(final, 6);
-}
-
-static inline void
-float2mem_integer (struct ia64_fpreg *init, struct ia64_fpreg *final)
-{
-	ia64_ldf_fill(6, init);
-	ia64_stop();
-	ia64_stf8(final, 6);
-}
-
-static inline void
-float2mem_single (struct ia64_fpreg *init, struct ia64_fpreg *final)
-{
-	ia64_ldf_fill(6, init);
-	ia64_stop();
-	ia64_stfs(final, 6);
-}
-
-static inline void
-float2mem_double (struct ia64_fpreg *init, struct ia64_fpreg *final)
-{
-	ia64_ldf_fill(6, init);
-	ia64_stop();
-	ia64_stfd(final, 6);
-}
-
-static int
-emulate_load_floatpair (unsigned long ifa, load_store_t ld, struct pt_regs *regs, bool kernel_mode)
-{
-	struct ia64_fpreg fpr_init[2];
-	struct ia64_fpreg fpr_final[2];
-	unsigned long len = float_fsz[ld.x6_sz];
-
-	/*
-	 * fr0 & fr1 don't need to be checked because Illegal Instruction faults have
-	 * higher priority than unaligned faults.
-	 *
-	 * r0 cannot be found as the base as it would never generate an unaligned
-	 * reference.
-	 */
-
-	/*
-	 * make sure we get clean buffers
-	 */
-	memset(&fpr_init, 0, sizeof(fpr_init));
-	memset(&fpr_final, 0, sizeof(fpr_final));
-
-	/*
-	 * ldfpX.a: we don't try to emulate anything but we must
-	 * invalidate the ALAT entry and execute updates, if any.
-	 */
-	if (ld.x6_op != 0x2) {
-		/*
-		 * This assumes little-endian byte-order.  Note that there is no "ldfpe"
-		 * instruction:
-		 */
-		if (emulate_load(&fpr_init[0], ifa, len, kernel_mode)
-		    || emulate_load(&fpr_init[1], (ifa + len), len, kernel_mode))
-			return -1;
-
-		DPRINT("ld.r1=%d ld.imm=%d x6_sz=%d\n", ld.r1, ld.imm, ld.x6_sz);
-		DDUMP("frp_init =", &fpr_init, 2*len);
-		/*
-		 * XXX fixme
-		 * Could optimize inlines by using ldfpX & 2 spills
-		 */
-		switch( ld.x6_sz ) {
-			case 0:
-				mem2float_extended(&fpr_init[0], &fpr_final[0]);
-				mem2float_extended(&fpr_init[1], &fpr_final[1]);
-				break;
-			case 1:
-				mem2float_integer(&fpr_init[0], &fpr_final[0]);
-				mem2float_integer(&fpr_init[1], &fpr_final[1]);
-				break;
-			case 2:
-				mem2float_single(&fpr_init[0], &fpr_final[0]);
-				mem2float_single(&fpr_init[1], &fpr_final[1]);
-				break;
-			case 3:
-				mem2float_double(&fpr_init[0], &fpr_final[0]);
-				mem2float_double(&fpr_init[1], &fpr_final[1]);
-				break;
-		}
-		DDUMP("fpr_final =", &fpr_final, 2*len);
-		/*
-		 * XXX fixme
-		 *
-		 * A possible optimization would be to drop fpr_final and directly
-		 * use the storage from the saved context i.e., the actual final
-		 * destination (pt_regs, switch_stack or thread structure).
-		 */
-		setfpreg(ld.r1, &fpr_final[0], regs);
-		setfpreg(ld.imm, &fpr_final[1], regs);
-	}
-
-	/*
-	 * Check for updates: only immediate updates are available for this
-	 * instruction.
-	 */
-	if (ld.m) {
-		/*
-		 * the immediate is implicit given the ldsz of the operation:
-		 * single: 8 (2x4) and for  all others it's 16 (2x8)
-		 */
-		ifa += len<<1;
-
-		/*
-		 * IMPORTANT:
-		 * the fact that we force the NaT of r3 to zero is ONLY valid
-		 * as long as we don't come here with a ldfpX.s.
-		 * For this reason we keep this sanity check
-		 */
-		if (ld.x6_op == 1 || ld.x6_op == 3)
-			printk(KERN_ERR "%s: register update on speculative load pair, error\n",
-			       __func__);
-
-		setreg(ld.r3, ifa, 0, regs);
-	}
-
-	/*
-	 * Invalidate ALAT entries, if any, for both registers.
-	 */
-	if (ld.x6_op == 0x2) {
-		invala_fr(ld.r1);
-		invala_fr(ld.imm);
-	}
-	return 0;
-}
-
-
-static int
-emulate_load_float (unsigned long ifa, load_store_t ld, struct pt_regs *regs,
-	            bool kernel_mode)
-{
-	struct ia64_fpreg fpr_init;
-	struct ia64_fpreg fpr_final;
-	unsigned long len = float_fsz[ld.x6_sz];
-
-	/*
-	 * fr0 & fr1 don't need to be checked because Illegal Instruction
-	 * faults have higher priority than unaligned faults.
-	 *
-	 * r0 cannot be found as the base as it would never generate an
-	 * unaligned reference.
-	 */
-
-	/*
-	 * make sure we get clean buffers
-	 */
-	memset(&fpr_init,0, sizeof(fpr_init));
-	memset(&fpr_final,0, sizeof(fpr_final));
-
-	/*
-	 * ldfX.a we don't try to emulate anything but we must
-	 * invalidate the ALAT entry.
-	 * See comments in ldX for descriptions on how the various loads are handled.
-	 */
-	if (ld.x6_op != 0x2) {
-		if (emulate_load(&fpr_init, ifa, len, kernel_mode))
-			return -1;
-
-		DPRINT("ld.r1=%d x6_sz=%d\n", ld.r1, ld.x6_sz);
-		DDUMP("fpr_init =", &fpr_init, len);
-		/*
-		 * we only do something for x6_op={0,8,9}
-		 */
-		switch( ld.x6_sz ) {
-			case 0:
-				mem2float_extended(&fpr_init, &fpr_final);
-				break;
-			case 1:
-				mem2float_integer(&fpr_init, &fpr_final);
-				break;
-			case 2:
-				mem2float_single(&fpr_init, &fpr_final);
-				break;
-			case 3:
-				mem2float_double(&fpr_init, &fpr_final);
-				break;
-		}
-		DDUMP("fpr_final =", &fpr_final, len);
-		/*
-		 * XXX fixme
-		 *
-		 * A possible optimization would be to drop fpr_final and directly
-		 * use the storage from the saved context i.e., the actual final
-		 * destination (pt_regs, switch_stack or thread structure).
-		 */
-		setfpreg(ld.r1, &fpr_final, regs);
-	}
-
-	/*
-	 * check for updates on any loads
-	 */
-	if (ld.op == 0x7 || ld.m)
-		emulate_load_updates(ld.op == 0x7 ? UPD_IMMEDIATE: UPD_REG, ld, regs, ifa);
-
-	/*
-	 * invalidate ALAT entry in case of advanced floating point loads
-	 */
-	if (ld.x6_op == 0x2)
-		invala_fr(ld.r1);
-
-	return 0;
-}
-
-
-static int
-emulate_store_float (unsigned long ifa, load_store_t ld, struct pt_regs *regs,
-		     bool kernel_mode)
-{
-	struct ia64_fpreg fpr_init;
-	struct ia64_fpreg fpr_final;
-	unsigned long len = float_fsz[ld.x6_sz];
-
-	/*
-	 * make sure we get clean buffers
-	 */
-	memset(&fpr_init,0, sizeof(fpr_init));
-	memset(&fpr_final,0, sizeof(fpr_final));
-
-	/*
-	 * if we get to this handler, Nat bits on both r3 and r2 have already
-	 * been checked. so we don't need to do it
-	 *
-	 * extract the value to be stored
-	 */
-	getfpreg(ld.imm, &fpr_init, regs);
-	/*
-	 * during this step, we extract the spilled registers from the saved
-	 * context i.e., we refill. Then we store (no spill) to temporary
-	 * aligned location
-	 */
-	switch( ld.x6_sz ) {
-		case 0:
-			float2mem_extended(&fpr_init, &fpr_final);
-			break;
-		case 1:
-			float2mem_integer(&fpr_init, &fpr_final);
-			break;
-		case 2:
-			float2mem_single(&fpr_init, &fpr_final);
-			break;
-		case 3:
-			float2mem_double(&fpr_init, &fpr_final);
-			break;
-	}
-	DPRINT("ld.r1=%d x6_sz=%d\n", ld.r1, ld.x6_sz);
-	DDUMP("fpr_init =", &fpr_init, len);
-	DDUMP("fpr_final =", &fpr_final, len);
-
-	if (emulate_store(ifa, &fpr_final, len, kernel_mode))
-		return -1;
-
-	/*
-	 * stfX [r3]=r2,imm(9)
-	 *
-	 * NOTE:
-	 * ld.r3 can never be r0, because r0 would not generate an
-	 * unaligned access.
-	 */
-	if (ld.op == 0x7) {
-		unsigned long imm;
-
-		/*
-		 * form imm9: [12:6] contain first 7bits
-		 */
-		imm = ld.x << 7 | ld.r1;
-		/*
-		 * sign extend (8bits) if m set
-		 */
-		if (ld.m)
-			imm |= SIGN_EXT9;
-		/*
-		 * ifa == r3 (NaT is necessarily cleared)
-		 */
-		ifa += imm;
-
-		DPRINT("imm=%lx r3=%lx\n", imm, ifa);
-
-		setreg(ld.r3, ifa, 0, regs);
-	}
-	/*
-	 * we don't have alat_invalidate_multiple() so we need
-	 * to do the complete flush :-<<
-	 */
-	ia64_invala();
-
-	return 0;
-}
-
-/*
- * Make sure we log the unaligned access, so that user/sysadmin can notice it and
- * eventually fix the program.  However, we don't want to do that for every access so we
- * pace it with jiffies.
- */
-static DEFINE_RATELIMIT_STATE(logging_rate_limit, 5 * HZ, 5);
-
-void
-ia64_handle_unaligned (unsigned long ifa, struct pt_regs *regs)
-{
-	struct ia64_psr *ipsr = ia64_psr(regs);
-	unsigned long bundle[2];
-	unsigned long opcode;
-	const struct exception_table_entry *eh = NULL;
-	union {
-		unsigned long l;
-		load_store_t insn;
-	} u;
-	int ret = -1;
-	bool kernel_mode = false;
-
-	if (ia64_psr(regs)->be) {
-		/* we don't support big-endian accesses */
-		if (die_if_kernel("big-endian unaligned accesses are not supported", regs, 0))
-			return;
-		goto force_sigbus;
-	}
-
-	/*
-	 * Treat kernel accesses for which there is an exception handler entry the same as
-	 * user-level unaligned accesses.  Otherwise, a clever program could trick this
-	 * handler into reading an arbitrary kernel addresses...
-	 */
-	if (!user_mode(regs))
-		eh = search_exception_tables(regs->cr_iip + ia64_psr(regs)->ri);
-	if (user_mode(regs) || eh) {
-		if ((current->thread.flags & IA64_THREAD_UAC_SIGBUS) != 0)
-			goto force_sigbus;
-
-		if (!no_unaligned_warning &&
-		    !(current->thread.flags & IA64_THREAD_UAC_NOPRINT) &&
-		    __ratelimit(&logging_rate_limit))
-		{
-			char buf[200];	/* comm[] is at most 16 bytes... */
-			size_t len;
-
-			len = sprintf(buf, "%s(%d): unaligned access to 0x%016lx, "
-				      "ip=0x%016lx\n\r", current->comm,
-				      task_pid_nr(current),
-				      ifa, regs->cr_iip + ipsr->ri);
-			/*
-			 * Don't call tty_write_message() if we're in the kernel; we might
-			 * be holding locks...
-			 */
-			if (user_mode(regs)) {
-				struct tty_struct *tty = get_current_tty();
-				tty_write_message(tty, buf);
-				tty_kref_put(tty);
-			}
-			buf[len-1] = '\0';	/* drop '\r' */
-			/* watch for command names containing %s */
-			printk(KERN_WARNING "%s", buf);
-		} else {
-			if (no_unaligned_warning) {
-				printk_once(KERN_WARNING "%s(%d) encountered an "
-				       "unaligned exception which required\n"
-				       "kernel assistance, which degrades "
-				       "the performance of the application.\n"
-				       "Unaligned exception warnings have "
-				       "been disabled by the system "
-				       "administrator\n"
-				       "echo 0 > /proc/sys/kernel/ignore-"
-				       "unaligned-usertrap to re-enable\n",
-				       current->comm, task_pid_nr(current));
-			}
-		}
-	} else {
-		if (__ratelimit(&logging_rate_limit)) {
-			printk(KERN_WARNING "kernel unaligned access to 0x%016lx, ip=0x%016lx\n",
-			       ifa, regs->cr_iip + ipsr->ri);
-			if (unaligned_dump_stack)
-				dump_stack();
-		}
-		kernel_mode = true;
-	}
-
-	DPRINT("iip=%lx ifa=%lx isr=%lx (ei=%d, sp=%d)\n",
-	       regs->cr_iip, ifa, regs->cr_ipsr, ipsr->ri, ipsr->it);
-
-	if (emulate_load(bundle, regs->cr_iip, 16, kernel_mode))
-		goto failure;
-
-	/*
-	 * extract the instruction from the bundle given the slot number
-	 */
-	switch (ipsr->ri) {
-	      default:
-	      case 0: u.l = (bundle[0] >>  5); break;
-	      case 1: u.l = (bundle[0] >> 46) | (bundle[1] << 18); break;
-	      case 2: u.l = (bundle[1] >> 23); break;
-	}
-	opcode = (u.l >> IA64_OPCODE_SHIFT) & IA64_OPCODE_MASK;
-
-	DPRINT("opcode=%lx ld.qp=%d ld.r1=%d ld.imm=%d ld.r3=%d ld.x=%d ld.hint=%d "
-	       "ld.x6=0x%x ld.m=%d ld.op=%d\n", opcode, u.insn.qp, u.insn.r1, u.insn.imm,
-	       u.insn.r3, u.insn.x, u.insn.hint, u.insn.x6_sz, u.insn.m, u.insn.op);
-
-	/*
-	 * IMPORTANT:
-	 * Notice that the switch statement DOES not cover all possible instructions
-	 * that DO generate unaligned references. This is made on purpose because for some
-	 * instructions it DOES NOT make sense to try and emulate the access. Sometimes it
-	 * is WRONG to try and emulate. Here is a list of instruction we don't emulate i.e.,
-	 * the program will get a signal and die:
-	 *
-	 *	load/store:
-	 *		- ldX.spill
-	 *		- stX.spill
-	 *	Reason: RNATs are based on addresses
-	 *		- ld16
-	 *		- st16
-	 *	Reason: ld16 and st16 are supposed to occur in a single
-	 *		memory op
-	 *
-	 *	synchronization:
-	 *		- cmpxchg
-	 *		- fetchadd
-	 *		- xchg
-	 *	Reason: ATOMIC operations cannot be emulated properly using multiple
-	 *	        instructions.
-	 *
-	 *	speculative loads:
-	 *		- ldX.sZ
-	 *	Reason: side effects, code must be ready to deal with failure so simpler
-	 *		to let the load fail.
-	 * ---------------------------------------------------------------------------------
-	 * XXX fixme
-	 *
-	 * I would like to get rid of this switch case and do something
-	 * more elegant.
-	 */
-	switch (opcode) {
-	      case LDS_OP:
-	      case LDSA_OP:
-		if (u.insn.x)
-			/* oops, really a semaphore op (cmpxchg, etc) */
-			goto failure;
-		fallthrough;
-	      case LDS_IMM_OP:
-	      case LDSA_IMM_OP:
-	      case LDFS_OP:
-	      case LDFSA_OP:
-	      case LDFS_IMM_OP:
-		/*
-		 * The instruction will be retried with deferred exceptions turned on, and
-		 * we should get Nat bit installed
-		 *
-		 * IMPORTANT: When PSR_ED is set, the register & immediate update forms
-		 * are actually executed even though the operation failed. So we don't
-		 * need to take care of this.
-		 */
-		DPRINT("forcing PSR_ED\n");
-		regs->cr_ipsr |= IA64_PSR_ED;
-		goto done;
-
-	      case LD_OP:
-	      case LDA_OP:
-	      case LDBIAS_OP:
-	      case LDACQ_OP:
-	      case LDCCLR_OP:
-	      case LDCNC_OP:
-	      case LDCCLRACQ_OP:
-		if (u.insn.x)
-			/* oops, really a semaphore op (cmpxchg, etc) */
-			goto failure;
-		fallthrough;
-	      case LD_IMM_OP:
-	      case LDA_IMM_OP:
-	      case LDBIAS_IMM_OP:
-	      case LDACQ_IMM_OP:
-	      case LDCCLR_IMM_OP:
-	      case LDCNC_IMM_OP:
-	      case LDCCLRACQ_IMM_OP:
-		ret = emulate_load_int(ifa, u.insn, regs, kernel_mode);
-		break;
-
-	      case ST_OP:
-	      case STREL_OP:
-		if (u.insn.x)
-			/* oops, really a semaphore op (cmpxchg, etc) */
-			goto failure;
-		fallthrough;
-	      case ST_IMM_OP:
-	      case STREL_IMM_OP:
-		ret = emulate_store_int(ifa, u.insn, regs, kernel_mode);
-		break;
-
-	      case LDF_OP:
-	      case LDFA_OP:
-	      case LDFCCLR_OP:
-	      case LDFCNC_OP:
-		if (u.insn.x)
-			ret = emulate_load_floatpair(ifa, u.insn, regs, kernel_mode);
-		else
-			ret = emulate_load_float(ifa, u.insn, regs, kernel_mode);
-		break;
-
-	      case LDF_IMM_OP:
-	      case LDFA_IMM_OP:
-	      case LDFCCLR_IMM_OP:
-	      case LDFCNC_IMM_OP:
-		ret = emulate_load_float(ifa, u.insn, regs, kernel_mode);
-		break;
-
-	      case STF_OP:
-	      case STF_IMM_OP:
-		ret = emulate_store_float(ifa, u.insn, regs, kernel_mode);
-		break;
-
-	      default:
-		goto failure;
-	}
-	DPRINT("ret=%d\n", ret);
-	if (ret)
-		goto failure;
-
-	if (ipsr->ri == 2)
-		/*
-		 * given today's architecture this case is not likely to happen because a
-		 * memory access instruction (M) can never be in the last slot of a
-		 * bundle. But let's keep it for now.
-		 */
-		regs->cr_iip += 16;
-	ipsr->ri = (ipsr->ri + 1) & 0x3;
-
-	DPRINT("ipsr->ri=%d iip=%lx\n", ipsr->ri, regs->cr_iip);
-  done:
-	return;
-
-  failure:
-	/* something went wrong... */
-	if (!user_mode(regs)) {
-		if (eh) {
-			ia64_handle_exception(regs, eh);
-			goto done;
-		}
-		if (die_if_kernel("error during unaligned kernel access\n", regs, ret))
-			return;
-		/* NOT_REACHED */
-	}
-  force_sigbus:
-	force_sig_fault(SIGBUS, BUS_ADRALN, (void __user *) ifa,
-			0, 0, 0);
-	goto done;
-}
diff --git a/arch/ia64/kernel/uncached.c b/arch/ia64/kernel/uncached.c
deleted file mode 100644
index a0fec82c56b8..000000000000
--- a/arch/ia64/kernel/uncached.c
+++ /dev/null
@@ -1,273 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * Copyright (C) 2001-2008 Silicon Graphics, Inc.  All rights reserved.
- *
- * A simple uncached page allocator using the generic allocator. This
- * allocator first utilizes the spare (spill) pages found in the EFI
- * memmap and will then start converting cached pages to uncached ones
- * at a granule at a time. Node awareness is implemented by having a
- * pool of pages per node.
- */
-
-#include <linux/types.h>
-#include <linux/kernel.h>
-#include <linux/module.h>
-#include <linux/init.h>
-#include <linux/errno.h>
-#include <linux/string.h>
-#include <linux/efi.h>
-#include <linux/nmi.h>
-#include <linux/genalloc.h>
-#include <linux/gfp.h>
-#include <linux/pgtable.h>
-#include <asm/efi.h>
-#include <asm/page.h>
-#include <asm/pal.h>
-#include <linux/atomic.h>
-#include <asm/tlbflush.h>
-
-struct uncached_pool {
-	struct gen_pool *pool;
-	struct mutex add_chunk_mutex;	/* serialize adding a converted chunk */
-	int nchunks_added;		/* #of converted chunks added to pool */
-	atomic_t status;		/* smp called function's return status*/
-};
-
-#define MAX_CONVERTED_CHUNKS_PER_NODE	2
-
-struct uncached_pool uncached_pools[MAX_NUMNODES];
-
-
-static void uncached_ipi_visibility(void *data)
-{
-	int status;
-	struct uncached_pool *uc_pool = (struct uncached_pool *)data;
-
-	status = ia64_pal_prefetch_visibility(PAL_VISIBILITY_PHYSICAL);
-	if ((status != PAL_VISIBILITY_OK) &&
-	    (status != PAL_VISIBILITY_OK_REMOTE_NEEDED))
-		atomic_inc(&uc_pool->status);
-}
-
-
-static void uncached_ipi_mc_drain(void *data)
-{
-	int status;
-	struct uncached_pool *uc_pool = (struct uncached_pool *)data;
-
-	status = ia64_pal_mc_drain();
-	if (status != PAL_STATUS_SUCCESS)
-		atomic_inc(&uc_pool->status);
-}
-
-
-/*
- * Add a new chunk of uncached memory pages to the specified pool.
- *
- * @pool: pool to add new chunk of uncached memory to
- * @nid: node id of node to allocate memory from, or -1
- *
- * This is accomplished by first allocating a granule of cached memory pages
- * and then converting them to uncached memory pages.
- */
-static int uncached_add_chunk(struct uncached_pool *uc_pool, int nid)
-{
-	struct page *page;
-	int status, i, nchunks_added = uc_pool->nchunks_added;
-	unsigned long c_addr, uc_addr;
-
-	if (mutex_lock_interruptible(&uc_pool->add_chunk_mutex) != 0)
-		return -1;	/* interrupted by a signal */
-
-	if (uc_pool->nchunks_added > nchunks_added) {
-		/* someone added a new chunk while we were waiting */
-		mutex_unlock(&uc_pool->add_chunk_mutex);
-		return 0;
-	}
-
-	if (uc_pool->nchunks_added >= MAX_CONVERTED_CHUNKS_PER_NODE) {
-		mutex_unlock(&uc_pool->add_chunk_mutex);
-		return -1;
-	}
-
-	/* attempt to allocate a granule's worth of cached memory pages */
-
-	page = __alloc_pages_node(nid,
-				GFP_KERNEL | __GFP_ZERO | __GFP_THISNODE,
-				IA64_GRANULE_SHIFT-PAGE_SHIFT);
-	if (!page) {
-		mutex_unlock(&uc_pool->add_chunk_mutex);
-		return -1;
-	}
-
-	/* convert the memory pages from cached to uncached */
-
-	c_addr = (unsigned long)page_address(page);
-	uc_addr = c_addr - PAGE_OFFSET + __IA64_UNCACHED_OFFSET;
-
-	/*
-	 * There's a small race here where it's possible for someone to
-	 * access the page through /dev/mem halfway through the conversion
-	 * to uncached - not sure it's really worth bothering about
-	 */
-	for (i = 0; i < (IA64_GRANULE_SIZE / PAGE_SIZE); i++)
-		SetPageUncached(&page[i]);
-
-	flush_tlb_kernel_range(uc_addr, uc_addr + IA64_GRANULE_SIZE);
-
-	status = ia64_pal_prefetch_visibility(PAL_VISIBILITY_PHYSICAL);
-	if (status == PAL_VISIBILITY_OK_REMOTE_NEEDED) {
-		atomic_set(&uc_pool->status, 0);
-		smp_call_function(uncached_ipi_visibility, uc_pool, 1);
-		if (atomic_read(&uc_pool->status))
-			goto failed;
-	} else if (status != PAL_VISIBILITY_OK)
-		goto failed;
-
-	preempt_disable();
-
-	flush_icache_range(uc_addr, uc_addr + IA64_GRANULE_SIZE);
-
-	/* flush the just introduced uncached translation from the TLB */
-	local_flush_tlb_all();
-
-	preempt_enable();
-
-	status = ia64_pal_mc_drain();
-	if (status != PAL_STATUS_SUCCESS)
-		goto failed;
-	atomic_set(&uc_pool->status, 0);
-	smp_call_function(uncached_ipi_mc_drain, uc_pool, 1);
-	if (atomic_read(&uc_pool->status))
-		goto failed;
-
-	/*
-	 * The chunk of memory pages has been converted to uncached so now we
-	 * can add it to the pool.
-	 */
-	status = gen_pool_add(uc_pool->pool, uc_addr, IA64_GRANULE_SIZE, nid);
-	if (status)
-		goto failed;
-
-	uc_pool->nchunks_added++;
-	mutex_unlock(&uc_pool->add_chunk_mutex);
-	return 0;
-
-	/* failed to convert or add the chunk so give it back to the kernel */
-failed:
-	for (i = 0; i < (IA64_GRANULE_SIZE / PAGE_SIZE); i++)
-		ClearPageUncached(&page[i]);
-
-	free_pages(c_addr, IA64_GRANULE_SHIFT-PAGE_SHIFT);
-	mutex_unlock(&uc_pool->add_chunk_mutex);
-	return -1;
-}
-
-
-/*
- * uncached_alloc_page
- *
- * @starting_nid: node id of node to start with, or -1
- * @n_pages: number of contiguous pages to allocate
- *
- * Allocate the specified number of contiguous uncached pages on the
- * requested node. If not enough contiguous uncached pages are available
- * on the requested node, roundrobin starting with the next higher node.
- */
-unsigned long uncached_alloc_page(int starting_nid, int n_pages)
-{
-	unsigned long uc_addr;
-	struct uncached_pool *uc_pool;
-	int nid;
-
-	if (unlikely(starting_nid >= MAX_NUMNODES))
-		return 0;
-
-	if (starting_nid < 0)
-		starting_nid = numa_node_id();
-	nid = starting_nid;
-
-	do {
-		if (!node_state(nid, N_HIGH_MEMORY))
-			continue;
-		uc_pool = &uncached_pools[nid];
-		if (uc_pool->pool == NULL)
-			continue;
-		do {
-			uc_addr = gen_pool_alloc(uc_pool->pool,
-						 n_pages * PAGE_SIZE);
-			if (uc_addr != 0)
-				return uc_addr;
-		} while (uncached_add_chunk(uc_pool, nid) == 0);
-
-	} while ((nid = (nid + 1) % MAX_NUMNODES) != starting_nid);
-
-	return 0;
-}
-EXPORT_SYMBOL(uncached_alloc_page);
-
-
-/*
- * uncached_free_page
- *
- * @uc_addr: uncached address of first page to free
- * @n_pages: number of contiguous pages to free
- *
- * Free the specified number of uncached pages.
- */
-void uncached_free_page(unsigned long uc_addr, int n_pages)
-{
-	int nid = paddr_to_nid(uc_addr - __IA64_UNCACHED_OFFSET);
-	struct gen_pool *pool = uncached_pools[nid].pool;
-
-	if (unlikely(pool == NULL))
-		return;
-
-	if ((uc_addr & (0XFUL << 60)) != __IA64_UNCACHED_OFFSET)
-		panic("uncached_free_page invalid address %lx\n", uc_addr);
-
-	gen_pool_free(pool, uc_addr, n_pages * PAGE_SIZE);
-}
-EXPORT_SYMBOL(uncached_free_page);
-
-
-/*
- * uncached_build_memmap,
- *
- * @uc_start: uncached starting address of a chunk of uncached memory
- * @uc_end: uncached ending address of a chunk of uncached memory
- * @arg: ignored, (NULL argument passed in on call to efi_memmap_walk_uc())
- *
- * Called at boot time to build a map of pages that can be used for
- * memory special operations.
- */
-static int __init uncached_build_memmap(u64 uc_start, u64 uc_end, void *arg)
-{
-	int nid = paddr_to_nid(uc_start - __IA64_UNCACHED_OFFSET);
-	struct gen_pool *pool = uncached_pools[nid].pool;
-	size_t size = uc_end - uc_start;
-
-	touch_softlockup_watchdog();
-
-	if (pool != NULL) {
-		memset((char *)uc_start, 0, size);
-		(void) gen_pool_add(pool, uc_start, size, nid);
-	}
-	return 0;
-}
-
-
-static int __init uncached_init(void)
-{
-	int nid;
-
-	for_each_online_node(nid) {
-		uncached_pools[nid].pool = gen_pool_create(PAGE_SHIFT, nid);
-		mutex_init(&uncached_pools[nid].add_chunk_mutex);
-	}
-
-	efi_memmap_walk_uc(uncached_build_memmap, NULL);
-	return 0;
-}
-
-__initcall(uncached_init);
diff --git a/arch/ia64/kernel/unwind.c b/arch/ia64/kernel/unwind.c
deleted file mode 100644
index 6bd64c35e691..000000000000
--- a/arch/ia64/kernel/unwind.c
+++ /dev/null
@@ -1,2320 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * Copyright (C) 1999-2004 Hewlett-Packard Co
- *	David Mosberger-Tang <davidm@hpl.hp.com>
- * Copyright (C) 2003 Fenghua Yu <fenghua.yu@intel.com>
- * 	- Change pt_regs_off() to make it less dependent on pt_regs structure.
- */
-/*
- * This file implements call frame unwind support for the Linux
- * kernel.  Parsing and processing the unwind information is
- * time-consuming, so this implementation translates the unwind
- * descriptors into unwind scripts.  These scripts are very simple
- * (basically a sequence of assignments) and efficient to execute.
- * They are cached for later re-use.  Each script is specific for a
- * given instruction pointer address and the set of predicate values
- * that the script depends on (most unwind descriptors are
- * unconditional and scripts often do not depend on predicates at
- * all).  This code is based on the unwind conventions described in
- * the "IA-64 Software Conventions and Runtime Architecture" manual.
- *
- * SMP conventions:
- *	o updates to the global unwind data (in structure "unw") are serialized
- *	  by the unw.lock spinlock
- *	o each unwind script has its own read-write lock; a thread must acquire
- *	  a read lock before executing a script and must acquire a write lock
- *	  before modifying a script
- *	o if both the unw.lock spinlock and a script's read-write lock must be
- *	  acquired, then the read-write lock must be acquired first.
- */
-#include <linux/module.h>
-#include <linux/memblock.h>
-#include <linux/elf.h>
-#include <linux/kernel.h>
-#include <linux/sched.h>
-#include <linux/slab.h>
-
-#include <asm/unwind.h>
-
-#include <asm/delay.h>
-#include <asm/page.h>
-#include <asm/ptrace.h>
-#include <asm/ptrace_offsets.h>
-#include <asm/rse.h>
-#include <asm/sections.h>
-#include <linux/uaccess.h>
-
-#include "entry.h"
-#include "unwind_i.h"
-
-#define UNW_LOG_CACHE_SIZE	7	/* each unw_script is ~256 bytes in size */
-#define UNW_CACHE_SIZE		(1 << UNW_LOG_CACHE_SIZE)
-
-#define UNW_LOG_HASH_SIZE	(UNW_LOG_CACHE_SIZE + 1)
-#define UNW_HASH_SIZE		(1 << UNW_LOG_HASH_SIZE)
-
-#define UNW_STATS	0	/* WARNING: this disabled interrupts for long time-spans!! */
-
-#ifdef UNW_DEBUG
-  static unsigned int unw_debug_level = UNW_DEBUG;
-#  define UNW_DEBUG_ON(n)	unw_debug_level >= n
-   /* Do not code a printk level, not all debug lines end in newline */
-#  define UNW_DPRINT(n, ...)  if (UNW_DEBUG_ON(n)) printk(__VA_ARGS__)
-#  undef inline
-#  define inline
-#else /* !UNW_DEBUG */
-#  define UNW_DEBUG_ON(n)  0
-#  define UNW_DPRINT(n, ...)
-#endif /* UNW_DEBUG */
-
-#if UNW_STATS
-# define STAT(x...)	x
-#else
-# define STAT(x...)
-#endif
-
-#define alloc_reg_state()	kmalloc(sizeof(struct unw_reg_state), GFP_ATOMIC)
-#define free_reg_state(usr)	kfree(usr)
-#define alloc_labeled_state()	kmalloc(sizeof(struct unw_labeled_state), GFP_ATOMIC)
-#define free_labeled_state(usr)	kfree(usr)
-
-typedef unsigned long unw_word;
-typedef unsigned char unw_hash_index_t;
-
-static struct {
-	spinlock_t lock;			/* spinlock for unwind data */
-
-	/* list of unwind tables (one per load-module) */
-	struct unw_table *tables;
-
-	unsigned long r0;			/* constant 0 for r0 */
-
-	/* table of registers that prologues can save (and order in which they're saved): */
-	const unsigned char save_order[8];
-
-	/* maps a preserved register index (preg_index) to corresponding switch_stack offset: */
-	unsigned short sw_off[sizeof(struct unw_frame_info) / 8];
-
-	unsigned short lru_head;		/* index of lead-recently used script */
-	unsigned short lru_tail;		/* index of most-recently used script */
-
-	/* index into unw_frame_info for preserved register i */
-	unsigned short preg_index[UNW_NUM_REGS];
-
-	short pt_regs_offsets[32];
-
-	/* unwind table for the kernel: */
-	struct unw_table kernel_table;
-
-	/* unwind table describing the gate page (kernel code that is mapped into user space): */
-	size_t gate_table_size;
-	unsigned long *gate_table;
-
-	/* hash table that maps instruction pointer to script index: */
-	unsigned short hash[UNW_HASH_SIZE];
-
-	/* script cache: */
-	struct unw_script cache[UNW_CACHE_SIZE];
-
-# ifdef UNW_DEBUG
-	const char *preg_name[UNW_NUM_REGS];
-# endif
-# if UNW_STATS
-	struct {
-		struct {
-			int lookups;
-			int hinted_hits;
-			int normal_hits;
-			int collision_chain_traversals;
-		} cache;
-		struct {
-			unsigned long build_time;
-			unsigned long run_time;
-			unsigned long parse_time;
-			int builds;
-			int news;
-			int collisions;
-			int runs;
-		} script;
-		struct {
-			unsigned long init_time;
-			unsigned long unwind_time;
-			int inits;
-			int unwinds;
-		} api;
-	} stat;
-# endif
-} unw = {
-	.tables = &unw.kernel_table,
-	.lock = __SPIN_LOCK_UNLOCKED(unw.lock),
-	.save_order = {
-		UNW_REG_RP, UNW_REG_PFS, UNW_REG_PSP, UNW_REG_PR,
-		UNW_REG_UNAT, UNW_REG_LC, UNW_REG_FPSR, UNW_REG_PRI_UNAT_GR
-	},
-	.preg_index = {
-		offsetof(struct unw_frame_info, pri_unat_loc)/8,	/* PRI_UNAT_GR */
-		offsetof(struct unw_frame_info, pri_unat_loc)/8,	/* PRI_UNAT_MEM */
-		offsetof(struct unw_frame_info, bsp_loc)/8,
-		offsetof(struct unw_frame_info, bspstore_loc)/8,
-		offsetof(struct unw_frame_info, pfs_loc)/8,
-		offsetof(struct unw_frame_info, rnat_loc)/8,
-		offsetof(struct unw_frame_info, psp)/8,
-		offsetof(struct unw_frame_info, rp_loc)/8,
-		offsetof(struct unw_frame_info, r4)/8,
-		offsetof(struct unw_frame_info, r5)/8,
-		offsetof(struct unw_frame_info, r6)/8,
-		offsetof(struct unw_frame_info, r7)/8,
-		offsetof(struct unw_frame_info, unat_loc)/8,
-		offsetof(struct unw_frame_info, pr_loc)/8,
-		offsetof(struct unw_frame_info, lc_loc)/8,
-		offsetof(struct unw_frame_info, fpsr_loc)/8,
-		offsetof(struct unw_frame_info, b1_loc)/8,
-		offsetof(struct unw_frame_info, b2_loc)/8,
-		offsetof(struct unw_frame_info, b3_loc)/8,
-		offsetof(struct unw_frame_info, b4_loc)/8,
-		offsetof(struct unw_frame_info, b5_loc)/8,
-		offsetof(struct unw_frame_info, f2_loc)/8,
-		offsetof(struct unw_frame_info, f3_loc)/8,
-		offsetof(struct unw_frame_info, f4_loc)/8,
-		offsetof(struct unw_frame_info, f5_loc)/8,
-		offsetof(struct unw_frame_info, fr_loc[16 - 16])/8,
-		offsetof(struct unw_frame_info, fr_loc[17 - 16])/8,
-		offsetof(struct unw_frame_info, fr_loc[18 - 16])/8,
-		offsetof(struct unw_frame_info, fr_loc[19 - 16])/8,
-		offsetof(struct unw_frame_info, fr_loc[20 - 16])/8,
-		offsetof(struct unw_frame_info, fr_loc[21 - 16])/8,
-		offsetof(struct unw_frame_info, fr_loc[22 - 16])/8,
-		offsetof(struct unw_frame_info, fr_loc[23 - 16])/8,
-		offsetof(struct unw_frame_info, fr_loc[24 - 16])/8,
-		offsetof(struct unw_frame_info, fr_loc[25 - 16])/8,
-		offsetof(struct unw_frame_info, fr_loc[26 - 16])/8,
-		offsetof(struct unw_frame_info, fr_loc[27 - 16])/8,
-		offsetof(struct unw_frame_info, fr_loc[28 - 16])/8,
-		offsetof(struct unw_frame_info, fr_loc[29 - 16])/8,
-		offsetof(struct unw_frame_info, fr_loc[30 - 16])/8,
-		offsetof(struct unw_frame_info, fr_loc[31 - 16])/8,
-	},
-	.pt_regs_offsets = {
-		[0] = -1,
-		offsetof(struct pt_regs,  r1),
-		offsetof(struct pt_regs,  r2),
-		offsetof(struct pt_regs,  r3),
-		[4] = -1, [5] = -1, [6] = -1, [7] = -1,
-		offsetof(struct pt_regs,  r8),
-		offsetof(struct pt_regs,  r9),
-		offsetof(struct pt_regs, r10),
-		offsetof(struct pt_regs, r11),
-		offsetof(struct pt_regs, r12),
-		offsetof(struct pt_regs, r13),
-		offsetof(struct pt_regs, r14),
-		offsetof(struct pt_regs, r15),
-		offsetof(struct pt_regs, r16),
-		offsetof(struct pt_regs, r17),
-		offsetof(struct pt_regs, r18),
-		offsetof(struct pt_regs, r19),
-		offsetof(struct pt_regs, r20),
-		offsetof(struct pt_regs, r21),
-		offsetof(struct pt_regs, r22),
-		offsetof(struct pt_regs, r23),
-		offsetof(struct pt_regs, r24),
-		offsetof(struct pt_regs, r25),
-		offsetof(struct pt_regs, r26),
-		offsetof(struct pt_regs, r27),
-		offsetof(struct pt_regs, r28),
-		offsetof(struct pt_regs, r29),
-		offsetof(struct pt_regs, r30),
-		offsetof(struct pt_regs, r31),
-	},
-	.hash = { [0 ... UNW_HASH_SIZE - 1] = -1 },
-#ifdef UNW_DEBUG
-	.preg_name = {
-		"pri_unat_gr", "pri_unat_mem", "bsp", "bspstore", "ar.pfs", "ar.rnat", "psp", "rp",
-		"r4", "r5", "r6", "r7",
-		"ar.unat", "pr", "ar.lc", "ar.fpsr",
-		"b1", "b2", "b3", "b4", "b5",
-		"f2", "f3", "f4", "f5",
-		"f16", "f17", "f18", "f19", "f20", "f21", "f22", "f23",
-		"f24", "f25", "f26", "f27", "f28", "f29", "f30", "f31"
-	}
-#endif
-};
-
-static inline int
-read_only (void *addr)
-{
-	return (unsigned long) ((char *) addr - (char *) &unw.r0) < sizeof(unw.r0);
-}
-
-/*
- * Returns offset of rREG in struct pt_regs.
- */
-static inline unsigned long
-pt_regs_off (unsigned long reg)
-{
-	short off = -1;
-
-	if (reg < ARRAY_SIZE(unw.pt_regs_offsets))
-		off = unw.pt_regs_offsets[reg];
-
-	if (off < 0) {
-		UNW_DPRINT(0, "unwind.%s: bad scratch reg r%lu\n", __func__, reg);
-		off = 0;
-	}
-	return (unsigned long) off;
-}
-
-static inline struct pt_regs *
-get_scratch_regs (struct unw_frame_info *info)
-{
-	if (!info->pt) {
-		/* This should not happen with valid unwind info.  */
-		UNW_DPRINT(0, "unwind.%s: bad unwind info: resetting info->pt\n", __func__);
-		if (info->flags & UNW_FLAG_INTERRUPT_FRAME)
-			info->pt = (unsigned long) ((struct pt_regs *) info->psp - 1);
-		else
-			info->pt = info->sp - 16;
-	}
-	UNW_DPRINT(3, "unwind.%s: sp 0x%lx pt 0x%lx\n", __func__, info->sp, info->pt);
-	return (struct pt_regs *) info->pt;
-}
-
-/* Unwind accessors.  */
-
-int
-unw_access_gr (struct unw_frame_info *info, int regnum, unsigned long *val, char *nat, int write)
-{
-	unsigned long *addr, *nat_addr, nat_mask = 0, dummy_nat;
-	struct unw_ireg *ireg;
-	struct pt_regs *pt;
-
-	if ((unsigned) regnum - 1 >= 127) {
-		if (regnum == 0 && !write) {
-			*val = 0;	/* read r0 always returns 0 */
-			*nat = 0;
-			return 0;
-		}
-		UNW_DPRINT(0, "unwind.%s: trying to access non-existent r%u\n",
-			   __func__, regnum);
-		return -1;
-	}
-
-	if (regnum < 32) {
-		if (regnum >= 4 && regnum <= 7) {
-			/* access a preserved register */
-			ireg = &info->r4 + (regnum - 4);
-			addr = ireg->loc;
-			if (addr) {
-				nat_addr = addr + ireg->nat.off;
-				switch (ireg->nat.type) {
-				      case UNW_NAT_VAL:
-					/* simulate getf.sig/setf.sig */
-					if (write) {
-						if (*nat) {
-							/* write NaTVal and be done with it */
-							addr[0] = 0;
-							addr[1] = 0x1fffe;
-							return 0;
-						}
-						addr[1] = 0x1003e;
-					} else {
-						if (addr[0] == 0 && addr[1] == 0x1ffe) {
-							/* return NaT and be done with it */
-							*val = 0;
-							*nat = 1;
-							return 0;
-						}
-					}
-					fallthrough;
-				      case UNW_NAT_NONE:
-					dummy_nat = 0;
-					nat_addr = &dummy_nat;
-					break;
-
-				      case UNW_NAT_MEMSTK:
-					nat_mask = (1UL << ((long) addr & 0x1f8)/8);
-					break;
-
-				      case UNW_NAT_REGSTK:
-					nat_addr = ia64_rse_rnat_addr(addr);
-					if ((unsigned long) addr < info->regstk.limit
-					    || (unsigned long) addr >= info->regstk.top)
-					{
-						UNW_DPRINT(0, "unwind.%s: %p outside of regstk "
-							"[0x%lx-0x%lx)\n",
-							__func__, (void *) addr,
-							info->regstk.limit,
-							info->regstk.top);
-						return -1;
-					}
-					if ((unsigned long) nat_addr >= info->regstk.top)
-						nat_addr = &info->sw->ar_rnat;
-					nat_mask = (1UL << ia64_rse_slot_num(addr));
-					break;
-				}
-			} else {
-				addr = &info->sw->r4 + (regnum - 4);
-				nat_addr = &info->sw->ar_unat;
-				nat_mask = (1UL << ((long) addr & 0x1f8)/8);
-			}
-		} else {
-			/* access a scratch register */
-			pt = get_scratch_regs(info);
-			addr = (unsigned long *) ((unsigned long)pt + pt_regs_off(regnum));
-			if (info->pri_unat_loc)
-				nat_addr = info->pri_unat_loc;
-			else
-				nat_addr = &info->sw->caller_unat;
-			nat_mask = (1UL << ((long) addr & 0x1f8)/8);
-		}
-	} else {
-		/* access a stacked register */
-		addr = ia64_rse_skip_regs((unsigned long *) info->bsp, regnum - 32);
-		nat_addr = ia64_rse_rnat_addr(addr);
-		if ((unsigned long) addr < info->regstk.limit
-		    || (unsigned long) addr >= info->regstk.top)
-		{
-			UNW_DPRINT(0, "unwind.%s: ignoring attempt to access register outside "
-				   "of rbs\n",  __func__);
-			return -1;
-		}
-		if ((unsigned long) nat_addr >= info->regstk.top)
-			nat_addr = &info->sw->ar_rnat;
-		nat_mask = (1UL << ia64_rse_slot_num(addr));
-	}
-
-	if (write) {
-		if (read_only(addr)) {
-			UNW_DPRINT(0, "unwind.%s: ignoring attempt to write read-only location\n",
-				__func__);
-		} else {
-			*addr = *val;
-			if (*nat)
-				*nat_addr |= nat_mask;
-			else
-				*nat_addr &= ~nat_mask;
-		}
-	} else {
-		if ((*nat_addr & nat_mask) == 0) {
-			*val = *addr;
-			*nat = 0;
-		} else {
-			*val = 0;	/* if register is a NaT, *addr may contain kernel data! */
-			*nat = 1;
-		}
-	}
-	return 0;
-}
-EXPORT_SYMBOL(unw_access_gr);
-
-int
-unw_access_br (struct unw_frame_info *info, int regnum, unsigned long *val, int write)
-{
-	unsigned long *addr;
-	struct pt_regs *pt;
-
-	switch (regnum) {
-		/* scratch: */
-	      case 0: pt = get_scratch_regs(info); addr = &pt->b0; break;
-	      case 6: pt = get_scratch_regs(info); addr = &pt->b6; break;
-	      case 7: pt = get_scratch_regs(info); addr = &pt->b7; break;
-
-		/* preserved: */
-	      case 1: case 2: case 3: case 4: case 5:
-		addr = *(&info->b1_loc + (regnum - 1));
-		if (!addr)
-			addr = &info->sw->b1 + (regnum - 1);
-		break;
-
-	      default:
-		UNW_DPRINT(0, "unwind.%s: trying to access non-existent b%u\n",
-			   __func__, regnum);
-		return -1;
-	}
-	if (write)
-		if (read_only(addr)) {
-			UNW_DPRINT(0, "unwind.%s: ignoring attempt to write read-only location\n",
-				__func__);
-		} else
-			*addr = *val;
-	else
-		*val = *addr;
-	return 0;
-}
-EXPORT_SYMBOL(unw_access_br);
-
-int
-unw_access_fr (struct unw_frame_info *info, int regnum, struct ia64_fpreg *val, int write)
-{
-	struct ia64_fpreg *addr = NULL;
-	struct pt_regs *pt;
-
-	if ((unsigned) (regnum - 2) >= 126) {
-		UNW_DPRINT(0, "unwind.%s: trying to access non-existent f%u\n",
-			   __func__, regnum);
-		return -1;
-	}
-
-	if (regnum <= 5) {
-		addr = *(&info->f2_loc + (regnum - 2));
-		if (!addr)
-			addr = &info->sw->f2 + (regnum - 2);
-	} else if (regnum <= 15) {
-		if (regnum <= 11) {
-			pt = get_scratch_regs(info);
-			addr = &pt->f6  + (regnum - 6);
-		}
-		else
-			addr = &info->sw->f12 + (regnum - 12);
-	} else if (regnum <= 31) {
-		addr = info->fr_loc[regnum - 16];
-		if (!addr)
-			addr = &info->sw->f16 + (regnum - 16);
-	} else {
-		struct task_struct *t = info->task;
-
-		if (write)
-			ia64_sync_fph(t);
-		else
-			ia64_flush_fph(t);
-		addr = t->thread.fph + (regnum - 32);
-	}
-
-	if (write)
-		if (read_only(addr)) {
-			UNW_DPRINT(0, "unwind.%s: ignoring attempt to write read-only location\n",
-				__func__);
-		} else
-			*addr = *val;
-	else
-		*val = *addr;
-	return 0;
-}
-EXPORT_SYMBOL(unw_access_fr);
-
-int
-unw_access_ar (struct unw_frame_info *info, int regnum, unsigned long *val, int write)
-{
-	unsigned long *addr;
-	struct pt_regs *pt;
-
-	switch (regnum) {
-	      case UNW_AR_BSP:
-		addr = info->bsp_loc;
-		if (!addr)
-			addr = &info->sw->ar_bspstore;
-		break;
-
-	      case UNW_AR_BSPSTORE:
-		addr = info->bspstore_loc;
-		if (!addr)
-			addr = &info->sw->ar_bspstore;
-		break;
-
-	      case UNW_AR_PFS:
-		addr = info->pfs_loc;
-		if (!addr)
-			addr = &info->sw->ar_pfs;
-		break;
-
-	      case UNW_AR_RNAT:
-		addr = info->rnat_loc;
-		if (!addr)
-			addr = &info->sw->ar_rnat;
-		break;
-
-	      case UNW_AR_UNAT:
-		addr = info->unat_loc;
-		if (!addr)
-			addr = &info->sw->caller_unat;
-		break;
-
-	      case UNW_AR_LC:
-		addr = info->lc_loc;
-		if (!addr)
-			addr = &info->sw->ar_lc;
-		break;
-
-	      case UNW_AR_EC:
-		if (!info->cfm_loc)
-			return -1;
-		if (write)
-			*info->cfm_loc =
-				(*info->cfm_loc & ~(0x3fUL << 52)) | ((*val & 0x3f) << 52);
-		else
-			*val = (*info->cfm_loc >> 52) & 0x3f;
-		return 0;
-
-	      case UNW_AR_FPSR:
-		addr = info->fpsr_loc;
-		if (!addr)
-			addr = &info->sw->ar_fpsr;
-		break;
-
-	      case UNW_AR_RSC:
-		pt = get_scratch_regs(info);
-		addr = &pt->ar_rsc;
-		break;
-
-	      case UNW_AR_CCV:
-		pt = get_scratch_regs(info);
-		addr = &pt->ar_ccv;
-		break;
-
-	      case UNW_AR_CSD:
-		pt = get_scratch_regs(info);
-		addr = &pt->ar_csd;
-		break;
-
-	      case UNW_AR_SSD:
-		pt = get_scratch_regs(info);
-		addr = &pt->ar_ssd;
-		break;
-
-	      default:
-		UNW_DPRINT(0, "unwind.%s: trying to access non-existent ar%u\n",
-			   __func__, regnum);
-		return -1;
-	}
-
-	if (write) {
-		if (read_only(addr)) {
-			UNW_DPRINT(0, "unwind.%s: ignoring attempt to write read-only location\n",
-				__func__);
-		} else
-			*addr = *val;
-	} else
-		*val = *addr;
-	return 0;
-}
-EXPORT_SYMBOL(unw_access_ar);
-
-int
-unw_access_pr (struct unw_frame_info *info, unsigned long *val, int write)
-{
-	unsigned long *addr;
-
-	addr = info->pr_loc;
-	if (!addr)
-		addr = &info->sw->pr;
-
-	if (write) {
-		if (read_only(addr)) {
-			UNW_DPRINT(0, "unwind.%s: ignoring attempt to write read-only location\n",
-				__func__);
-		} else
-			*addr = *val;
-	} else
-		*val = *addr;
-	return 0;
-}
-EXPORT_SYMBOL(unw_access_pr);
-
-
-/* Routines to manipulate the state stack.  */
-
-static inline void
-push (struct unw_state_record *sr)
-{
-	struct unw_reg_state *rs;
-
-	rs = alloc_reg_state();
-	if (!rs) {
-		printk(KERN_ERR "unwind: cannot stack reg state!\n");
-		return;
-	}
-	memcpy(rs, &sr->curr, sizeof(*rs));
-	sr->curr.next = rs;
-}
-
-static void
-pop (struct unw_state_record *sr)
-{
-	struct unw_reg_state *rs = sr->curr.next;
-
-	if (!rs) {
-		printk(KERN_ERR "unwind: stack underflow!\n");
-		return;
-	}
-	memcpy(&sr->curr, rs, sizeof(*rs));
-	free_reg_state(rs);
-}
-
-/* Make a copy of the state stack.  Non-recursive to avoid stack overflows.  */
-static struct unw_reg_state *
-dup_state_stack (struct unw_reg_state *rs)
-{
-	struct unw_reg_state *copy, *prev = NULL, *first = NULL;
-
-	while (rs) {
-		copy = alloc_reg_state();
-		if (!copy) {
-			printk(KERN_ERR "unwind.dup_state_stack: out of memory\n");
-			return NULL;
-		}
-		memcpy(copy, rs, sizeof(*copy));
-		if (first)
-			prev->next = copy;
-		else
-			first = copy;
-		rs = rs->next;
-		prev = copy;
-	}
-	return first;
-}
-
-/* Free all stacked register states (but not RS itself).  */
-static void
-free_state_stack (struct unw_reg_state *rs)
-{
-	struct unw_reg_state *p, *next;
-
-	for (p = rs->next; p != NULL; p = next) {
-		next = p->next;
-		free_reg_state(p);
-	}
-	rs->next = NULL;
-}
-
-/* Unwind decoder routines */
-
-static enum unw_register_index __attribute_const__
-decode_abreg (unsigned char abreg, int memory)
-{
-	switch (abreg) {
-	      case 0x04 ... 0x07: return UNW_REG_R4 + (abreg - 0x04);
-	      case 0x22 ... 0x25: return UNW_REG_F2 + (abreg - 0x22);
-	      case 0x30 ... 0x3f: return UNW_REG_F16 + (abreg - 0x30);
-	      case 0x41 ... 0x45: return UNW_REG_B1 + (abreg - 0x41);
-	      case 0x60: return UNW_REG_PR;
-	      case 0x61: return UNW_REG_PSP;
-	      case 0x62: return memory ? UNW_REG_PRI_UNAT_MEM : UNW_REG_PRI_UNAT_GR;
-	      case 0x63: return UNW_REG_RP;
-	      case 0x64: return UNW_REG_BSP;
-	      case 0x65: return UNW_REG_BSPSTORE;
-	      case 0x66: return UNW_REG_RNAT;
-	      case 0x67: return UNW_REG_UNAT;
-	      case 0x68: return UNW_REG_FPSR;
-	      case 0x69: return UNW_REG_PFS;
-	      case 0x6a: return UNW_REG_LC;
-	      default:
-		break;
-	}
-	UNW_DPRINT(0, "unwind.%s: bad abreg=0x%x\n", __func__, abreg);
-	return UNW_REG_LC;
-}
-
-static void
-set_reg (struct unw_reg_info *reg, enum unw_where where, int when, unsigned long val)
-{
-	reg->val = val;
-	reg->where = where;
-	if (reg->when == UNW_WHEN_NEVER)
-		reg->when = when;
-}
-
-static void
-alloc_spill_area (unsigned long *offp, unsigned long regsize,
-		  struct unw_reg_info *lo, struct unw_reg_info *hi)
-{
-	struct unw_reg_info *reg;
-
-	for (reg = hi; reg >= lo; --reg) {
-		if (reg->where == UNW_WHERE_SPILL_HOME) {
-			reg->where = UNW_WHERE_PSPREL;
-			*offp -= regsize;
-			reg->val = *offp;
-		}
-	}
-}
-
-static inline void
-spill_next_when (struct unw_reg_info **regp, struct unw_reg_info *lim, unw_word t)
-{
-	struct unw_reg_info *reg;
-
-	for (reg = *regp; reg <= lim; ++reg) {
-		if (reg->where == UNW_WHERE_SPILL_HOME) {
-			reg->when = t;
-			*regp = reg + 1;
-			return;
-		}
-	}
-	UNW_DPRINT(0, "unwind.%s: excess spill!\n",  __func__);
-}
-
-static inline void
-finish_prologue (struct unw_state_record *sr)
-{
-	struct unw_reg_info *reg;
-	unsigned long off;
-	int i;
-
-	/*
-	 * First, resolve implicit register save locations (see Section "11.4.2.3 Rules
-	 * for Using Unwind Descriptors", rule 3):
-	 */
-	for (i = 0; i < (int) ARRAY_SIZE(unw.save_order); ++i) {
-		reg = sr->curr.reg + unw.save_order[i];
-		if (reg->where == UNW_WHERE_GR_SAVE) {
-			reg->where = UNW_WHERE_GR;
-			reg->val = sr->gr_save_loc++;
-		}
-	}
-
-	/*
-	 * Next, compute when the fp, general, and branch registers get
-	 * saved.  This must come before alloc_spill_area() because
-	 * we need to know which registers are spilled to their home
-	 * locations.
-	 */
-	if (sr->imask) {
-		unsigned char kind, mask = 0, *cp = sr->imask;
-		int t;
-		static const unsigned char limit[3] = {
-			UNW_REG_F31, UNW_REG_R7, UNW_REG_B5
-		};
-		struct unw_reg_info *(regs[3]);
-
-		regs[0] = sr->curr.reg + UNW_REG_F2;
-		regs[1] = sr->curr.reg + UNW_REG_R4;
-		regs[2] = sr->curr.reg + UNW_REG_B1;
-
-		for (t = 0; t < sr->region_len; ++t) {
-			if ((t & 3) == 0)
-				mask = *cp++;
-			kind = (mask >> 2*(3-(t & 3))) & 3;
-			if (kind > 0)
-				spill_next_when(&regs[kind - 1], sr->curr.reg + limit[kind - 1],
-						sr->region_start + t);
-		}
-	}
-	/*
-	 * Next, lay out the memory stack spill area:
-	 */
-	if (sr->any_spills) {
-		off = sr->spill_offset;
-		alloc_spill_area(&off, 16, sr->curr.reg + UNW_REG_F2, sr->curr.reg + UNW_REG_F31);
-		alloc_spill_area(&off,  8, sr->curr.reg + UNW_REG_B1, sr->curr.reg + UNW_REG_B5);
-		alloc_spill_area(&off,  8, sr->curr.reg + UNW_REG_R4, sr->curr.reg + UNW_REG_R7);
-	}
-}
-
-/*
- * Region header descriptors.
- */
-
-static void
-desc_prologue (int body, unw_word rlen, unsigned char mask, unsigned char grsave,
-	       struct unw_state_record *sr)
-{
-	int i, region_start;
-
-	if (!(sr->in_body || sr->first_region))
-		finish_prologue(sr);
-	sr->first_region = 0;
-
-	/* check if we're done: */
-	if (sr->when_target < sr->region_start + sr->region_len) {
-		sr->done = 1;
-		return;
-	}
-
-	region_start = sr->region_start + sr->region_len;
-
-	for (i = 0; i < sr->epilogue_count; ++i)
-		pop(sr);
-	sr->epilogue_count = 0;
-	sr->epilogue_start = UNW_WHEN_NEVER;
-
-	sr->region_start = region_start;
-	sr->region_len = rlen;
-	sr->in_body = body;
-
-	if (!body) {
-		push(sr);
-
-		for (i = 0; i < 4; ++i) {
-			if (mask & 0x8)
-				set_reg(sr->curr.reg + unw.save_order[i], UNW_WHERE_GR,
-					sr->region_start + sr->region_len - 1, grsave++);
-			mask <<= 1;
-		}
-		sr->gr_save_loc = grsave;
-		sr->any_spills = 0;
-		sr->imask = NULL;
-		sr->spill_offset = 0x10;	/* default to psp+16 */
-	}
-}
-
-/*
- * Prologue descriptors.
- */
-
-static inline void
-desc_abi (unsigned char abi, unsigned char context, struct unw_state_record *sr)
-{
-	if (abi == 3 && context == 'i') {
-		sr->flags |= UNW_FLAG_INTERRUPT_FRAME;
-		UNW_DPRINT(3, "unwind.%s: interrupt frame\n",  __func__);
-	}
-	else
-		UNW_DPRINT(0, "unwind%s: ignoring unwabi(abi=0x%x,context=0x%x)\n",
-				__func__, abi, context);
-}
-
-static inline void
-desc_br_gr (unsigned char brmask, unsigned char gr, struct unw_state_record *sr)
-{
-	int i;
-
-	for (i = 0; i < 5; ++i) {
-		if (brmask & 1)
-			set_reg(sr->curr.reg + UNW_REG_B1 + i, UNW_WHERE_GR,
-				sr->region_start + sr->region_len - 1, gr++);
-		brmask >>= 1;
-	}
-}
-
-static inline void
-desc_br_mem (unsigned char brmask, struct unw_state_record *sr)
-{
-	int i;
-
-	for (i = 0; i < 5; ++i) {
-		if (brmask & 1) {
-			set_reg(sr->curr.reg + UNW_REG_B1 + i, UNW_WHERE_SPILL_HOME,
-				sr->region_start + sr->region_len - 1, 0);
-			sr->any_spills = 1;
-		}
-		brmask >>= 1;
-	}
-}
-
-static inline void
-desc_frgr_mem (unsigned char grmask, unw_word frmask, struct unw_state_record *sr)
-{
-	int i;
-
-	for (i = 0; i < 4; ++i) {
-		if ((grmask & 1) != 0) {
-			set_reg(sr->curr.reg + UNW_REG_R4 + i, UNW_WHERE_SPILL_HOME,
-				sr->region_start + sr->region_len - 1, 0);
-			sr->any_spills = 1;
-		}
-		grmask >>= 1;
-	}
-	for (i = 0; i < 20; ++i) {
-		if ((frmask & 1) != 0) {
-			int base = (i < 4) ? UNW_REG_F2 : UNW_REG_F16 - 4;
-			set_reg(sr->curr.reg + base + i, UNW_WHERE_SPILL_HOME,
-				sr->region_start + sr->region_len - 1, 0);
-			sr->any_spills = 1;
-		}
-		frmask >>= 1;
-	}
-}
-
-static inline void
-desc_fr_mem (unsigned char frmask, struct unw_state_record *sr)
-{
-	int i;
-
-	for (i = 0; i < 4; ++i) {
-		if ((frmask & 1) != 0) {
-			set_reg(sr->curr.reg + UNW_REG_F2 + i, UNW_WHERE_SPILL_HOME,
-				sr->region_start + sr->region_len - 1, 0);
-			sr->any_spills = 1;
-		}
-		frmask >>= 1;
-	}
-}
-
-static inline void
-desc_gr_gr (unsigned char grmask, unsigned char gr, struct unw_state_record *sr)
-{
-	int i;
-
-	for (i = 0; i < 4; ++i) {
-		if ((grmask & 1) != 0)
-			set_reg(sr->curr.reg + UNW_REG_R4 + i, UNW_WHERE_GR,
-				sr->region_start + sr->region_len - 1, gr++);
-		grmask >>= 1;
-	}
-}
-
-static inline void
-desc_gr_mem (unsigned char grmask, struct unw_state_record *sr)
-{
-	int i;
-
-	for (i = 0; i < 4; ++i) {
-		if ((grmask & 1) != 0) {
-			set_reg(sr->curr.reg + UNW_REG_R4 + i, UNW_WHERE_SPILL_HOME,
-				sr->region_start + sr->region_len - 1, 0);
-			sr->any_spills = 1;
-		}
-		grmask >>= 1;
-	}
-}
-
-static inline void
-desc_mem_stack_f (unw_word t, unw_word size, struct unw_state_record *sr)
-{
-	set_reg(sr->curr.reg + UNW_REG_PSP, UNW_WHERE_NONE,
-		sr->region_start + min_t(int, t, sr->region_len - 1), 16*size);
-}
-
-static inline void
-desc_mem_stack_v (unw_word t, struct unw_state_record *sr)
-{
-	sr->curr.reg[UNW_REG_PSP].when = sr->region_start + min_t(int, t, sr->region_len - 1);
-}
-
-static inline void
-desc_reg_gr (unsigned char reg, unsigned char dst, struct unw_state_record *sr)
-{
-	set_reg(sr->curr.reg + reg, UNW_WHERE_GR, sr->region_start + sr->region_len - 1, dst);
-}
-
-static inline void
-desc_reg_psprel (unsigned char reg, unw_word pspoff, struct unw_state_record *sr)
-{
-	set_reg(sr->curr.reg + reg, UNW_WHERE_PSPREL, sr->region_start + sr->region_len - 1,
-		0x10 - 4*pspoff);
-}
-
-static inline void
-desc_reg_sprel (unsigned char reg, unw_word spoff, struct unw_state_record *sr)
-{
-	set_reg(sr->curr.reg + reg, UNW_WHERE_SPREL, sr->region_start + sr->region_len - 1,
-		4*spoff);
-}
-
-static inline void
-desc_rp_br (unsigned char dst, struct unw_state_record *sr)
-{
-	sr->return_link_reg = dst;
-}
-
-static inline void
-desc_reg_when (unsigned char regnum, unw_word t, struct unw_state_record *sr)
-{
-	struct unw_reg_info *reg = sr->curr.reg + regnum;
-
-	if (reg->where == UNW_WHERE_NONE)
-		reg->where = UNW_WHERE_GR_SAVE;
-	reg->when = sr->region_start + min_t(int, t, sr->region_len - 1);
-}
-
-static inline void
-desc_spill_base (unw_word pspoff, struct unw_state_record *sr)
-{
-	sr->spill_offset = 0x10 - 4*pspoff;
-}
-
-static inline unsigned char *
-desc_spill_mask (unsigned char *imaskp, struct unw_state_record *sr)
-{
-	sr->imask = imaskp;
-	return imaskp + (2*sr->region_len + 7)/8;
-}
-
-/*
- * Body descriptors.
- */
-static inline void
-desc_epilogue (unw_word t, unw_word ecount, struct unw_state_record *sr)
-{
-	sr->epilogue_start = sr->region_start + sr->region_len - 1 - t;
-	sr->epilogue_count = ecount + 1;
-}
-
-static inline void
-desc_copy_state (unw_word label, struct unw_state_record *sr)
-{
-	struct unw_labeled_state *ls;
-
-	for (ls = sr->labeled_states; ls; ls = ls->next) {
-		if (ls->label == label) {
-			free_state_stack(&sr->curr);
-			memcpy(&sr->curr, &ls->saved_state, sizeof(sr->curr));
-			sr->curr.next = dup_state_stack(ls->saved_state.next);
-			return;
-		}
-	}
-	printk(KERN_ERR "unwind: failed to find state labeled 0x%lx\n", label);
-}
-
-static inline void
-desc_label_state (unw_word label, struct unw_state_record *sr)
-{
-	struct unw_labeled_state *ls;
-
-	ls = alloc_labeled_state();
-	if (!ls) {
-		printk(KERN_ERR "unwind.desc_label_state(): out of memory\n");
-		return;
-	}
-	ls->label = label;
-	memcpy(&ls->saved_state, &sr->curr, sizeof(ls->saved_state));
-	ls->saved_state.next = dup_state_stack(sr->curr.next);
-
-	/* insert into list of labeled states: */
-	ls->next = sr->labeled_states;
-	sr->labeled_states = ls;
-}
-
-/*
- * General descriptors.
- */
-
-static inline int
-desc_is_active (unsigned char qp, unw_word t, struct unw_state_record *sr)
-{
-	if (sr->when_target <= sr->region_start + min_t(int, t, sr->region_len - 1))
-		return 0;
-	if (qp > 0) {
-		if ((sr->pr_val & (1UL << qp)) == 0)
-			return 0;
-		sr->pr_mask |= (1UL << qp);
-	}
-	return 1;
-}
-
-static inline void
-desc_restore_p (unsigned char qp, unw_word t, unsigned char abreg, struct unw_state_record *sr)
-{
-	struct unw_reg_info *r;
-
-	if (!desc_is_active(qp, t, sr))
-		return;
-
-	r = sr->curr.reg + decode_abreg(abreg, 0);
-	r->where = UNW_WHERE_NONE;
-	r->when = UNW_WHEN_NEVER;
-	r->val = 0;
-}
-
-static inline void
-desc_spill_reg_p (unsigned char qp, unw_word t, unsigned char abreg, unsigned char x,
-		     unsigned char ytreg, struct unw_state_record *sr)
-{
-	enum unw_where where = UNW_WHERE_GR;
-	struct unw_reg_info *r;
-
-	if (!desc_is_active(qp, t, sr))
-		return;
-
-	if (x)
-		where = UNW_WHERE_BR;
-	else if (ytreg & 0x80)
-		where = UNW_WHERE_FR;
-
-	r = sr->curr.reg + decode_abreg(abreg, 0);
-	r->where = where;
-	r->when = sr->region_start + min_t(int, t, sr->region_len - 1);
-	r->val = (ytreg & 0x7f);
-}
-
-static inline void
-desc_spill_psprel_p (unsigned char qp, unw_word t, unsigned char abreg, unw_word pspoff,
-		     struct unw_state_record *sr)
-{
-	struct unw_reg_info *r;
-
-	if (!desc_is_active(qp, t, sr))
-		return;
-
-	r = sr->curr.reg + decode_abreg(abreg, 1);
-	r->where = UNW_WHERE_PSPREL;
-	r->when = sr->region_start + min_t(int, t, sr->region_len - 1);
-	r->val = 0x10 - 4*pspoff;
-}
-
-static inline void
-desc_spill_sprel_p (unsigned char qp, unw_word t, unsigned char abreg, unw_word spoff,
-		       struct unw_state_record *sr)
-{
-	struct unw_reg_info *r;
-
-	if (!desc_is_active(qp, t, sr))
-		return;
-
-	r = sr->curr.reg + decode_abreg(abreg, 1);
-	r->where = UNW_WHERE_SPREL;
-	r->when = sr->region_start + min_t(int, t, sr->region_len - 1);
-	r->val = 4*spoff;
-}
-
-#define UNW_DEC_BAD_CODE(code)			printk(KERN_ERR "unwind: unknown code 0x%02x\n", \
-						       code);
-
-/*
- * region headers:
- */
-#define UNW_DEC_PROLOGUE_GR(fmt,r,m,gr,arg)	desc_prologue(0,r,m,gr,arg)
-#define UNW_DEC_PROLOGUE(fmt,b,r,arg)		desc_prologue(b,r,0,32,arg)
-/*
- * prologue descriptors:
- */
-#define UNW_DEC_ABI(fmt,a,c,arg)		desc_abi(a,c,arg)
-#define UNW_DEC_BR_GR(fmt,b,g,arg)		desc_br_gr(b,g,arg)
-#define UNW_DEC_BR_MEM(fmt,b,arg)		desc_br_mem(b,arg)
-#define UNW_DEC_FRGR_MEM(fmt,g,f,arg)		desc_frgr_mem(g,f,arg)
-#define UNW_DEC_FR_MEM(fmt,f,arg)		desc_fr_mem(f,arg)
-#define UNW_DEC_GR_GR(fmt,m,g,arg)		desc_gr_gr(m,g,arg)
-#define UNW_DEC_GR_MEM(fmt,m,arg)		desc_gr_mem(m,arg)
-#define UNW_DEC_MEM_STACK_F(fmt,t,s,arg)	desc_mem_stack_f(t,s,arg)
-#define UNW_DEC_MEM_STACK_V(fmt,t,arg)		desc_mem_stack_v(t,arg)
-#define UNW_DEC_REG_GR(fmt,r,d,arg)		desc_reg_gr(r,d,arg)
-#define UNW_DEC_REG_PSPREL(fmt,r,o,arg)		desc_reg_psprel(r,o,arg)
-#define UNW_DEC_REG_SPREL(fmt,r,o,arg)		desc_reg_sprel(r,o,arg)
-#define UNW_DEC_REG_WHEN(fmt,r,t,arg)		desc_reg_when(r,t,arg)
-#define UNW_DEC_PRIUNAT_WHEN_GR(fmt,t,arg)	desc_reg_when(UNW_REG_PRI_UNAT_GR,t,arg)
-#define UNW_DEC_PRIUNAT_WHEN_MEM(fmt,t,arg)	desc_reg_when(UNW_REG_PRI_UNAT_MEM,t,arg)
-#define UNW_DEC_PRIUNAT_GR(fmt,r,arg)		desc_reg_gr(UNW_REG_PRI_UNAT_GR,r,arg)
-#define UNW_DEC_PRIUNAT_PSPREL(fmt,o,arg)	desc_reg_psprel(UNW_REG_PRI_UNAT_MEM,o,arg)
-#define UNW_DEC_PRIUNAT_SPREL(fmt,o,arg)	desc_reg_sprel(UNW_REG_PRI_UNAT_MEM,o,arg)
-#define UNW_DEC_RP_BR(fmt,d,arg)		desc_rp_br(d,arg)
-#define UNW_DEC_SPILL_BASE(fmt,o,arg)		desc_spill_base(o,arg)
-#define UNW_DEC_SPILL_MASK(fmt,m,arg)		(m = desc_spill_mask(m,arg))
-/*
- * body descriptors:
- */
-#define UNW_DEC_EPILOGUE(fmt,t,c,arg)		desc_epilogue(t,c,arg)
-#define UNW_DEC_COPY_STATE(fmt,l,arg)		desc_copy_state(l,arg)
-#define UNW_DEC_LABEL_STATE(fmt,l,arg)		desc_label_state(l,arg)
-/*
- * general unwind descriptors:
- */
-#define UNW_DEC_SPILL_REG_P(f,p,t,a,x,y,arg)	desc_spill_reg_p(p,t,a,x,y,arg)
-#define UNW_DEC_SPILL_REG(f,t,a,x,y,arg)	desc_spill_reg_p(0,t,a,x,y,arg)
-#define UNW_DEC_SPILL_PSPREL_P(f,p,t,a,o,arg)	desc_spill_psprel_p(p,t,a,o,arg)
-#define UNW_DEC_SPILL_PSPREL(f,t,a,o,arg)	desc_spill_psprel_p(0,t,a,o,arg)
-#define UNW_DEC_SPILL_SPREL_P(f,p,t,a,o,arg)	desc_spill_sprel_p(p,t,a,o,arg)
-#define UNW_DEC_SPILL_SPREL(f,t,a,o,arg)	desc_spill_sprel_p(0,t,a,o,arg)
-#define UNW_DEC_RESTORE_P(f,p,t,a,arg)		desc_restore_p(p,t,a,arg)
-#define UNW_DEC_RESTORE(f,t,a,arg)		desc_restore_p(0,t,a,arg)
-
-#include "unwind_decoder.c"
-
-
-/* Unwind scripts. */
-
-static inline unw_hash_index_t
-hash (unsigned long ip)
-{
-	/* magic number = ((sqrt(5)-1)/2)*2^64 */
-	static const unsigned long hashmagic = 0x9e3779b97f4a7c16UL;
-
-	return (ip >> 4) * hashmagic >> (64 - UNW_LOG_HASH_SIZE);
-}
-
-static inline long
-cache_match (struct unw_script *script, unsigned long ip, unsigned long pr)
-{
-	read_lock(&script->lock);
-	if (ip == script->ip && ((pr ^ script->pr_val) & script->pr_mask) == 0)
-		/* keep the read lock... */
-		return 1;
-	read_unlock(&script->lock);
-	return 0;
-}
-
-static inline struct unw_script *
-script_lookup (struct unw_frame_info *info)
-{
-	struct unw_script *script = unw.cache + info->hint;
-	unsigned short index;
-	unsigned long ip, pr;
-
-	if (UNW_DEBUG_ON(0))
-		return NULL;	/* Always regenerate scripts in debug mode */
-
-	STAT(++unw.stat.cache.lookups);
-
-	ip = info->ip;
-	pr = info->pr;
-
-	if (cache_match(script, ip, pr)) {
-		STAT(++unw.stat.cache.hinted_hits);
-		return script;
-	}
-
-	index = unw.hash[hash(ip)];
-	if (index >= UNW_CACHE_SIZE)
-		return NULL;
-
-	script = unw.cache + index;
-	while (1) {
-		if (cache_match(script, ip, pr)) {
-			/* update hint; no locking required as single-word writes are atomic */
-			STAT(++unw.stat.cache.normal_hits);
-			unw.cache[info->prev_script].hint = script - unw.cache;
-			return script;
-		}
-		if (script->coll_chain >= UNW_HASH_SIZE)
-			return NULL;
-		script = unw.cache + script->coll_chain;
-		STAT(++unw.stat.cache.collision_chain_traversals);
-	}
-}
-
-/*
- * On returning, a write lock for the SCRIPT is still being held.
- */
-static inline struct unw_script *
-script_new (unsigned long ip)
-{
-	struct unw_script *script, *prev, *tmp;
-	unw_hash_index_t index;
-	unsigned short head;
-
-	STAT(++unw.stat.script.news);
-
-	/*
-	 * Can't (easily) use cmpxchg() here because of ABA problem
-	 * that is intrinsic in cmpxchg()...
-	 */
-	head = unw.lru_head;
-	script = unw.cache + head;
-	unw.lru_head = script->lru_chain;
-
-	/*
-	 * We'd deadlock here if we interrupted a thread that is holding a read lock on
-	 * script->lock.  Thus, if the write_trylock() fails, we simply bail out.  The
-	 * alternative would be to disable interrupts whenever we hold a read-lock, but
-	 * that seems silly.
-	 */
-	if (!write_trylock(&script->lock))
-		return NULL;
-
-	/* re-insert script at the tail of the LRU chain: */
-	unw.cache[unw.lru_tail].lru_chain = head;
-	unw.lru_tail = head;
-
-	/* remove the old script from the hash table (if it's there): */
-	if (script->ip) {
-		index = hash(script->ip);
-		tmp = unw.cache + unw.hash[index];
-		prev = NULL;
-		while (1) {
-			if (tmp == script) {
-				if (prev)
-					prev->coll_chain = tmp->coll_chain;
-				else
-					unw.hash[index] = tmp->coll_chain;
-				break;
-			} else
-				prev = tmp;
-			if (tmp->coll_chain >= UNW_CACHE_SIZE)
-			/* old script wasn't in the hash-table */
-				break;
-			tmp = unw.cache + tmp->coll_chain;
-		}
-	}
-
-	/* enter new script in the hash table */
-	index = hash(ip);
-	script->coll_chain = unw.hash[index];
-	unw.hash[index] = script - unw.cache;
-
-	script->ip = ip;	/* set new IP while we're holding the locks */
-
-	STAT(if (script->coll_chain < UNW_CACHE_SIZE) ++unw.stat.script.collisions);
-
-	script->flags = 0;
-	script->hint = 0;
-	script->count = 0;
-	return script;
-}
-
-static void
-script_finalize (struct unw_script *script, struct unw_state_record *sr)
-{
-	script->pr_mask = sr->pr_mask;
-	script->pr_val = sr->pr_val;
-	/*
-	 * We could down-grade our write-lock on script->lock here but
-	 * the rwlock API doesn't offer atomic lock downgrading, so
-	 * we'll just keep the write-lock and release it later when
-	 * we're done using the script.
-	 */
-}
-
-static inline void
-script_emit (struct unw_script *script, struct unw_insn insn)
-{
-	if (script->count >= UNW_MAX_SCRIPT_LEN) {
-		UNW_DPRINT(0, "unwind.%s: script exceeds maximum size of %u instructions!\n",
-			__func__, UNW_MAX_SCRIPT_LEN);
-		return;
-	}
-	script->insn[script->count++] = insn;
-}
-
-static inline void
-emit_nat_info (struct unw_state_record *sr, int i, struct unw_script *script)
-{
-	struct unw_reg_info *r = sr->curr.reg + i;
-	enum unw_insn_opcode opc;
-	struct unw_insn insn;
-	unsigned long val = 0;
-
-	switch (r->where) {
-	      case UNW_WHERE_GR:
-		if (r->val >= 32) {
-			/* register got spilled to a stacked register */
-			opc = UNW_INSN_SETNAT_TYPE;
-			val = UNW_NAT_REGSTK;
-		} else
-			/* register got spilled to a scratch register */
-			opc = UNW_INSN_SETNAT_MEMSTK;
-		break;
-
-	      case UNW_WHERE_FR:
-		opc = UNW_INSN_SETNAT_TYPE;
-		val = UNW_NAT_VAL;
-		break;
-
-	      case UNW_WHERE_BR:
-		opc = UNW_INSN_SETNAT_TYPE;
-		val = UNW_NAT_NONE;
-		break;
-
-	      case UNW_WHERE_PSPREL:
-	      case UNW_WHERE_SPREL:
-		opc = UNW_INSN_SETNAT_MEMSTK;
-		break;
-
-	      default:
-		UNW_DPRINT(0, "unwind.%s: don't know how to emit nat info for where = %u\n",
-			   __func__, r->where);
-		return;
-	}
-	insn.opc = opc;
-	insn.dst = unw.preg_index[i];
-	insn.val = val;
-	script_emit(script, insn);
-}
-
-static void
-compile_reg (struct unw_state_record *sr, int i, struct unw_script *script)
-{
-	struct unw_reg_info *r = sr->curr.reg + i;
-	enum unw_insn_opcode opc;
-	unsigned long val, rval;
-	struct unw_insn insn;
-	long need_nat_info;
-
-	if (r->where == UNW_WHERE_NONE || r->when >= sr->when_target)
-		return;
-
-	opc = UNW_INSN_MOVE;
-	val = rval = r->val;
-	need_nat_info = (i >= UNW_REG_R4 && i <= UNW_REG_R7);
-
-	switch (r->where) {
-	      case UNW_WHERE_GR:
-		if (rval >= 32) {
-			opc = UNW_INSN_MOVE_STACKED;
-			val = rval - 32;
-		} else if (rval >= 4 && rval <= 7) {
-			if (need_nat_info) {
-				opc = UNW_INSN_MOVE2;
-				need_nat_info = 0;
-			}
-			val = unw.preg_index[UNW_REG_R4 + (rval - 4)];
-		} else if (rval == 0) {
-			opc = UNW_INSN_MOVE_CONST;
-			val = 0;
-		} else {
-			/* register got spilled to a scratch register */
-			opc = UNW_INSN_MOVE_SCRATCH;
-			val = pt_regs_off(rval);
-		}
-		break;
-
-	      case UNW_WHERE_FR:
-		if (rval <= 5)
-			val = unw.preg_index[UNW_REG_F2  + (rval -  2)];
-		else if (rval >= 16 && rval <= 31)
-			val = unw.preg_index[UNW_REG_F16 + (rval - 16)];
-		else {
-			opc = UNW_INSN_MOVE_SCRATCH;
-			if (rval <= 11)
-				val = offsetof(struct pt_regs, f6) + 16*(rval - 6);
-			else
-				UNW_DPRINT(0, "unwind.%s: kernel may not touch f%lu\n",
-					   __func__, rval);
-		}
-		break;
-
-	      case UNW_WHERE_BR:
-		if (rval >= 1 && rval <= 5)
-			val = unw.preg_index[UNW_REG_B1 + (rval - 1)];
-		else {
-			opc = UNW_INSN_MOVE_SCRATCH;
-			if (rval == 0)
-				val = offsetof(struct pt_regs, b0);
-			else if (rval == 6)
-				val = offsetof(struct pt_regs, b6);
-			else
-				val = offsetof(struct pt_regs, b7);
-		}
-		break;
-
-	      case UNW_WHERE_SPREL:
-		opc = UNW_INSN_ADD_SP;
-		break;
-
-	      case UNW_WHERE_PSPREL:
-		opc = UNW_INSN_ADD_PSP;
-		break;
-
-	      default:
-		UNW_DPRINT(0, "unwind%s: register %u has unexpected `where' value of %u\n",
-			   __func__, i, r->where);
-		break;
-	}
-	insn.opc = opc;
-	insn.dst = unw.preg_index[i];
-	insn.val = val;
-	script_emit(script, insn);
-	if (need_nat_info)
-		emit_nat_info(sr, i, script);
-
-	if (i == UNW_REG_PSP) {
-		/*
-		 * info->psp must contain the _value_ of the previous
-		 * sp, not it's save location.  We get this by
-		 * dereferencing the value we just stored in
-		 * info->psp:
-		 */
-		insn.opc = UNW_INSN_LOAD;
-		insn.dst = insn.val = unw.preg_index[UNW_REG_PSP];
-		script_emit(script, insn);
-	}
-}
-
-static inline const struct unw_table_entry *
-lookup (struct unw_table *table, unsigned long rel_ip)
-{
-	const struct unw_table_entry *e = NULL;
-	unsigned long lo, hi, mid;
-
-	/* do a binary search for right entry: */
-	for (lo = 0, hi = table->length; lo < hi; ) {
-		mid = (lo + hi) / 2;
-		e = &table->array[mid];
-		if (rel_ip < e->start_offset)
-			hi = mid;
-		else if (rel_ip >= e->end_offset)
-			lo = mid + 1;
-		else
-			break;
-	}
-	if (rel_ip < e->start_offset || rel_ip >= e->end_offset)
-		return NULL;
-	return e;
-}
-
-/*
- * Build an unwind script that unwinds from state OLD_STATE to the
- * entrypoint of the function that called OLD_STATE.
- */
-static inline struct unw_script *
-build_script (struct unw_frame_info *info)
-{
-	const struct unw_table_entry *e = NULL;
-	struct unw_script *script = NULL;
-	struct unw_labeled_state *ls, *next;
-	unsigned long ip = info->ip;
-	struct unw_state_record sr;
-	struct unw_table *table, *prev;
-	struct unw_reg_info *r;
-	struct unw_insn insn;
-	u8 *dp, *desc_end;
-	u64 hdr;
-	int i;
-	STAT(unsigned long start, parse_start;)
-
-	STAT(++unw.stat.script.builds; start = ia64_get_itc());
-
-	/* build state record */
-	memset(&sr, 0, sizeof(sr));
-	for (r = sr.curr.reg; r < sr.curr.reg + UNW_NUM_REGS; ++r)
-		r->when = UNW_WHEN_NEVER;
-	sr.pr_val = info->pr;
-
-	UNW_DPRINT(3, "unwind.%s: ip 0x%lx\n", __func__, ip);
-	script = script_new(ip);
-	if (!script) {
-		UNW_DPRINT(0, "unwind.%s: failed to create unwind script\n",  __func__);
-		STAT(unw.stat.script.build_time += ia64_get_itc() - start);
-		return NULL;
-	}
-	unw.cache[info->prev_script].hint = script - unw.cache;
-
-	/* search the kernels and the modules' unwind tables for IP: */
-
-	STAT(parse_start = ia64_get_itc());
-
-	prev = NULL;
-	for (table = unw.tables; table; table = table->next) {
-		if (ip >= table->start && ip < table->end) {
-			/*
-			 * Leave the kernel unwind table at the very front,
-			 * lest moving it breaks some assumption elsewhere.
-			 * Otherwise, move the matching table to the second
-			 * position in the list so that traversals can benefit
-			 * from commonality in backtrace paths.
-			 */
-			if (prev && prev != unw.tables) {
-				/* unw is safe - we're already spinlocked */
-				prev->next = table->next;
-				table->next = unw.tables->next;
-				unw.tables->next = table;
-			}
-			e = lookup(table, ip - table->segment_base);
-			break;
-		}
-		prev = table;
-	}
-	if (!e) {
-		/* no info, return default unwinder (leaf proc, no mem stack, no saved regs)  */
-		UNW_DPRINT(1, "unwind.%s: no unwind info for ip=0x%lx (prev ip=0x%lx)\n",
-			__func__, ip, unw.cache[info->prev_script].ip);
-		sr.curr.reg[UNW_REG_RP].where = UNW_WHERE_BR;
-		sr.curr.reg[UNW_REG_RP].when = -1;
-		sr.curr.reg[UNW_REG_RP].val = 0;
-		compile_reg(&sr, UNW_REG_RP, script);
-		script_finalize(script, &sr);
-		STAT(unw.stat.script.parse_time += ia64_get_itc() - parse_start);
-		STAT(unw.stat.script.build_time += ia64_get_itc() - start);
-		return script;
-	}
-
-	sr.when_target = (3*((ip & ~0xfUL) - (table->segment_base + e->start_offset))/16
-			  + (ip & 0xfUL));
-	hdr = *(u64 *) (table->segment_base + e->info_offset);
-	dp =   (u8 *)  (table->segment_base + e->info_offset + 8);
-	desc_end = dp + 8*UNW_LENGTH(hdr);
-
-	while (!sr.done && dp < desc_end)
-		dp = unw_decode(dp, sr.in_body, &sr);
-
-	if (sr.when_target > sr.epilogue_start) {
-		/*
-		 * sp has been restored and all values on the memory stack below
-		 * psp also have been restored.
-		 */
-		sr.curr.reg[UNW_REG_PSP].val = 0;
-		sr.curr.reg[UNW_REG_PSP].where = UNW_WHERE_NONE;
-		sr.curr.reg[UNW_REG_PSP].when = UNW_WHEN_NEVER;
-		for (r = sr.curr.reg; r < sr.curr.reg + UNW_NUM_REGS; ++r)
-			if ((r->where == UNW_WHERE_PSPREL && r->val <= 0x10)
-			    || r->where == UNW_WHERE_SPREL)
-			{
-				r->val = 0;
-				r->where = UNW_WHERE_NONE;
-				r->when = UNW_WHEN_NEVER;
-			}
-	}
-
-	script->flags = sr.flags;
-
-	/*
-	 * If RP did't get saved, generate entry for the return link
-	 * register.
-	 */
-	if (sr.curr.reg[UNW_REG_RP].when >= sr.when_target) {
-		sr.curr.reg[UNW_REG_RP].where = UNW_WHERE_BR;
-		sr.curr.reg[UNW_REG_RP].when = -1;
-		sr.curr.reg[UNW_REG_RP].val = sr.return_link_reg;
-		UNW_DPRINT(1, "unwind.%s: using default for rp at ip=0x%lx where=%d val=0x%lx\n",
-			   __func__, ip, sr.curr.reg[UNW_REG_RP].where,
-			   sr.curr.reg[UNW_REG_RP].val);
-	}
-
-#ifdef UNW_DEBUG
-	UNW_DPRINT(1, "unwind.%s: state record for func 0x%lx, t=%u:\n",
-		__func__, table->segment_base + e->start_offset, sr.when_target);
-	for (r = sr.curr.reg; r < sr.curr.reg + UNW_NUM_REGS; ++r) {
-		if (r->where != UNW_WHERE_NONE || r->when != UNW_WHEN_NEVER) {
-			UNW_DPRINT(1, "  %s <- ", unw.preg_name[r - sr.curr.reg]);
-			switch (r->where) {
-			      case UNW_WHERE_GR:     UNW_DPRINT(1, "r%lu", r->val); break;
-			      case UNW_WHERE_FR:     UNW_DPRINT(1, "f%lu", r->val); break;
-			      case UNW_WHERE_BR:     UNW_DPRINT(1, "b%lu", r->val); break;
-			      case UNW_WHERE_SPREL:  UNW_DPRINT(1, "[sp+0x%lx]", r->val); break;
-			      case UNW_WHERE_PSPREL: UNW_DPRINT(1, "[psp+0x%lx]", r->val); break;
-			      case UNW_WHERE_NONE:
-				UNW_DPRINT(1, "%s+0x%lx", unw.preg_name[r - sr.curr.reg], r->val);
-				break;
-
-			      default:
-				UNW_DPRINT(1, "BADWHERE(%d)", r->where);
-				break;
-			}
-			UNW_DPRINT(1, "\t\t%d\n", r->when);
-		}
-	}
-#endif
-
-	STAT(unw.stat.script.parse_time += ia64_get_itc() - parse_start);
-
-	/* translate state record into unwinder instructions: */
-
-	/*
-	 * First, set psp if we're dealing with a fixed-size frame;
-	 * subsequent instructions may depend on this value.
-	 */
-	if (sr.when_target > sr.curr.reg[UNW_REG_PSP].when
-	    && (sr.curr.reg[UNW_REG_PSP].where == UNW_WHERE_NONE)
-	    && sr.curr.reg[UNW_REG_PSP].val != 0) {
-		/* new psp is sp plus frame size */
-		insn.opc = UNW_INSN_ADD;
-		insn.dst = offsetof(struct unw_frame_info, psp)/8;
-		insn.val = sr.curr.reg[UNW_REG_PSP].val;	/* frame size */
-		script_emit(script, insn);
-	}
-
-	/* determine where the primary UNaT is: */
-	if (sr.when_target < sr.curr.reg[UNW_REG_PRI_UNAT_GR].when)
-		i = UNW_REG_PRI_UNAT_MEM;
-	else if (sr.when_target < sr.curr.reg[UNW_REG_PRI_UNAT_MEM].when)
-		i = UNW_REG_PRI_UNAT_GR;
-	else if (sr.curr.reg[UNW_REG_PRI_UNAT_MEM].when > sr.curr.reg[UNW_REG_PRI_UNAT_GR].when)
-		i = UNW_REG_PRI_UNAT_MEM;
-	else
-		i = UNW_REG_PRI_UNAT_GR;
-
-	compile_reg(&sr, i, script);
-
-	for (i = UNW_REG_BSP; i < UNW_NUM_REGS; ++i)
-		compile_reg(&sr, i, script);
-
-	/* free labeled register states & stack: */
-
-	STAT(parse_start = ia64_get_itc());
-	for (ls = sr.labeled_states; ls; ls = next) {
-		next = ls->next;
-		free_state_stack(&ls->saved_state);
-		free_labeled_state(ls);
-	}
-	free_state_stack(&sr.curr);
-	STAT(unw.stat.script.parse_time += ia64_get_itc() - parse_start);
-
-	script_finalize(script, &sr);
-	STAT(unw.stat.script.build_time += ia64_get_itc() - start);
-	return script;
-}
-
-/*
- * Apply the unwinding actions represented by OPS and update SR to
- * reflect the state that existed upon entry to the function that this
- * unwinder represents.
- */
-static inline void
-run_script (struct unw_script *script, struct unw_frame_info *state)
-{
-	struct unw_insn *ip, *limit, next_insn;
-	unsigned long opc, dst, val, off;
-	unsigned long *s = (unsigned long *) state;
-	STAT(unsigned long start;)
-
-	STAT(++unw.stat.script.runs; start = ia64_get_itc());
-	state->flags = script->flags;
-	ip = script->insn;
-	limit = script->insn + script->count;
-	next_insn = *ip;
-
-	while (ip++ < limit) {
-		opc = next_insn.opc;
-		dst = next_insn.dst;
-		val = next_insn.val;
-		next_insn = *ip;
-
-	  redo:
-		switch (opc) {
-		      case UNW_INSN_ADD:
-			s[dst] += val;
-			break;
-
-		      case UNW_INSN_MOVE2:
-			if (!s[val])
-				goto lazy_init;
-			s[dst+1] = s[val+1];
-			s[dst] = s[val];
-			break;
-
-		      case UNW_INSN_MOVE:
-			if (!s[val])
-				goto lazy_init;
-			s[dst] = s[val];
-			break;
-
-		      case UNW_INSN_MOVE_SCRATCH:
-			if (state->pt) {
-				s[dst] = (unsigned long) get_scratch_regs(state) + val;
-			} else {
-				s[dst] = 0;
-				UNW_DPRINT(0, "unwind.%s: no state->pt, dst=%ld, val=%ld\n",
-					   __func__, dst, val);
-			}
-			break;
-
-		      case UNW_INSN_MOVE_CONST:
-			if (val == 0)
-				s[dst] = (unsigned long) &unw.r0;
-			else {
-				s[dst] = 0;
-				UNW_DPRINT(0, "unwind.%s: UNW_INSN_MOVE_CONST bad val=%ld\n",
-					   __func__, val);
-			}
-			break;
-
-
-		      case UNW_INSN_MOVE_STACKED:
-			s[dst] = (unsigned long) ia64_rse_skip_regs((unsigned long *)state->bsp,
-								    val);
-			break;
-
-		      case UNW_INSN_ADD_PSP:
-			s[dst] = state->psp + val;
-			break;
-
-		      case UNW_INSN_ADD_SP:
-			s[dst] = state->sp + val;
-			break;
-
-		      case UNW_INSN_SETNAT_MEMSTK:
-			if (!state->pri_unat_loc)
-				state->pri_unat_loc = &state->sw->caller_unat;
-			/* register off. is a multiple of 8, so the least 3 bits (type) are 0 */
-			s[dst+1] = ((unsigned long) state->pri_unat_loc - s[dst]) | UNW_NAT_MEMSTK;
-			break;
-
-		      case UNW_INSN_SETNAT_TYPE:
-			s[dst+1] = val;
-			break;
-
-		      case UNW_INSN_LOAD:
-#ifdef UNW_DEBUG
-			if ((s[val] & (local_cpu_data->unimpl_va_mask | 0x7)) != 0
-			    || s[val] < TASK_SIZE)
-			{
-				UNW_DPRINT(0, "unwind.%s: rejecting bad psp=0x%lx\n",
-					   __func__, s[val]);
-				break;
-			}
-#endif
-			s[dst] = *(unsigned long *) s[val];
-			break;
-		}
-	}
-	STAT(unw.stat.script.run_time += ia64_get_itc() - start);
-	return;
-
-  lazy_init:
-	off = unw.sw_off[val];
-	s[val] = (unsigned long) state->sw + off;
-	if (off >= offsetof(struct switch_stack, r4) && off <= offsetof(struct switch_stack, r7))
-		/*
-		 * We're initializing a general register: init NaT info, too.  Note that
-		 * the offset is a multiple of 8 which gives us the 3 bits needed for
-		 * the type field.
-		 */
-		s[val+1] = (offsetof(struct switch_stack, ar_unat) - off) | UNW_NAT_MEMSTK;
-	goto redo;
-}
-
-static int
-find_save_locs (struct unw_frame_info *info)
-{
-	int have_write_lock = 0;
-	struct unw_script *scr;
-	unsigned long flags = 0;
-
-	if ((info->ip & (local_cpu_data->unimpl_va_mask | 0xf)) || info->ip < TASK_SIZE) {
-		/* don't let obviously bad addresses pollute the cache */
-		/* FIXME: should really be level 0 but it occurs too often. KAO */
-		UNW_DPRINT(1, "unwind.%s: rejecting bad ip=0x%lx\n", __func__, info->ip);
-		info->rp_loc = NULL;
-		return -1;
-	}
-
-	scr = script_lookup(info);
-	if (!scr) {
-		spin_lock_irqsave(&unw.lock, flags);
-		scr = build_script(info);
-		if (!scr) {
-			spin_unlock_irqrestore(&unw.lock, flags);
-			UNW_DPRINT(0,
-				   "unwind.%s: failed to locate/build unwind script for ip %lx\n",
-				   __func__, info->ip);
-			return -1;
-		}
-		have_write_lock = 1;
-	}
-	info->hint = scr->hint;
-	info->prev_script = scr - unw.cache;
-
-	run_script(scr, info);
-
-	if (have_write_lock) {
-		write_unlock(&scr->lock);
-		spin_unlock_irqrestore(&unw.lock, flags);
-	} else
-		read_unlock(&scr->lock);
-	return 0;
-}
-
-static int
-unw_valid(const struct unw_frame_info *info, unsigned long* p)
-{
-	unsigned long loc = (unsigned long)p;
-	return (loc >= info->regstk.limit && loc < info->regstk.top) ||
-	       (loc >= info->memstk.top && loc < info->memstk.limit);
-}
-
-int
-unw_unwind (struct unw_frame_info *info)
-{
-	unsigned long prev_ip, prev_sp, prev_bsp;
-	unsigned long ip, pr, num_regs;
-	STAT(unsigned long start, flags;)
-	int retval;
-
-	STAT(local_irq_save(flags); ++unw.stat.api.unwinds; start = ia64_get_itc());
-
-	prev_ip = info->ip;
-	prev_sp = info->sp;
-	prev_bsp = info->bsp;
-
-	/* validate the return IP pointer */
-	if (!unw_valid(info, info->rp_loc)) {
-		/* FIXME: should really be level 0 but it occurs too often. KAO */
-		UNW_DPRINT(1, "unwind.%s: failed to locate return link (ip=0x%lx)!\n",
-			   __func__, info->ip);
-		STAT(unw.stat.api.unwind_time += ia64_get_itc() - start; local_irq_restore(flags));
-		return -1;
-	}
-	/* restore the ip */
-	ip = info->ip = *info->rp_loc;
-	if (ip < GATE_ADDR) {
-		UNW_DPRINT(2, "unwind.%s: reached user-space (ip=0x%lx)\n", __func__, ip);
-		STAT(unw.stat.api.unwind_time += ia64_get_itc() - start; local_irq_restore(flags));
-		return -1;
-	}
-
-	/* validate the previous stack frame pointer */
-	if (!unw_valid(info, info->pfs_loc)) {
-		UNW_DPRINT(0, "unwind.%s: failed to locate ar.pfs!\n", __func__);
-		STAT(unw.stat.api.unwind_time += ia64_get_itc() - start; local_irq_restore(flags));
-		return -1;
-	}
-	/* restore the cfm: */
-	info->cfm_loc = info->pfs_loc;
-
-	/* restore the bsp: */
-	pr = info->pr;
-	num_regs = 0;
-	if ((info->flags & UNW_FLAG_INTERRUPT_FRAME)) {
-		info->pt = info->sp + 16;
-		if ((pr & (1UL << PRED_NON_SYSCALL)) != 0)
-			num_regs = *info->cfm_loc & 0x7f;		/* size of frame */
-		info->pfs_loc =
-			(unsigned long *) (info->pt + offsetof(struct pt_regs, ar_pfs));
-		UNW_DPRINT(3, "unwind.%s: interrupt_frame pt 0x%lx\n", __func__, info->pt);
-	} else
-		num_regs = (*info->cfm_loc >> 7) & 0x7f;	/* size of locals */
-	info->bsp = (unsigned long) ia64_rse_skip_regs((unsigned long *) info->bsp, -num_regs);
-	if (info->bsp < info->regstk.limit || info->bsp > info->regstk.top) {
-		UNW_DPRINT(0, "unwind.%s: bsp (0x%lx) out of range [0x%lx-0x%lx]\n",
-			__func__, info->bsp, info->regstk.limit, info->regstk.top);
-		STAT(unw.stat.api.unwind_time += ia64_get_itc() - start; local_irq_restore(flags));
-		return -1;
-	}
-
-	/* restore the sp: */
-	info->sp = info->psp;
-	if (info->sp < info->memstk.top || info->sp > info->memstk.limit) {
-		UNW_DPRINT(0, "unwind.%s: sp (0x%lx) out of range [0x%lx-0x%lx]\n",
-			__func__, info->sp, info->memstk.top, info->memstk.limit);
-		STAT(unw.stat.api.unwind_time += ia64_get_itc() - start; local_irq_restore(flags));
-		return -1;
-	}
-
-	if (info->ip == prev_ip && info->sp == prev_sp && info->bsp == prev_bsp) {
-		UNW_DPRINT(0, "unwind.%s: ip, sp, bsp unchanged; stopping here (ip=0x%lx)\n",
-			   __func__, ip);
-		STAT(unw.stat.api.unwind_time += ia64_get_itc() - start; local_irq_restore(flags));
-		return -1;
-	}
-
-	/* as we unwind, the saved ar.unat becomes the primary unat: */
-	info->pri_unat_loc = info->unat_loc;
-
-	/* finally, restore the predicates: */
-	unw_get_pr(info, &info->pr);
-
-	retval = find_save_locs(info);
-	STAT(unw.stat.api.unwind_time += ia64_get_itc() - start; local_irq_restore(flags));
-	return retval;
-}
-EXPORT_SYMBOL(unw_unwind);
-
-int
-unw_unwind_to_user (struct unw_frame_info *info)
-{
-	unsigned long ip, sp, pr = info->pr;
-
-	do {
-		unw_get_sp(info, &sp);
-		if ((long)((unsigned long)info->task + IA64_STK_OFFSET - sp)
-		    < IA64_PT_REGS_SIZE) {
-			UNW_DPRINT(0, "unwind.%s: ran off the top of the kernel stack\n",
-				   __func__);
-			break;
-		}
-		if (unw_is_intr_frame(info) &&
-		    (pr & (1UL << PRED_USER_STACK)))
-			return 0;
-		if (unw_get_pr (info, &pr) < 0) {
-			unw_get_rp(info, &ip);
-			UNW_DPRINT(0, "unwind.%s: failed to read "
-				   "predicate register (ip=0x%lx)\n",
-				__func__, ip);
-			return -1;
-		}
-	} while (unw_unwind(info) >= 0);
-	unw_get_ip(info, &ip);
-	UNW_DPRINT(0, "unwind.%s: failed to unwind to user-level (ip=0x%lx)\n",
-		   __func__, ip);
-	return -1;
-}
-EXPORT_SYMBOL(unw_unwind_to_user);
-
-static void
-init_frame_info (struct unw_frame_info *info, struct task_struct *t,
-		 struct switch_stack *sw, unsigned long stktop)
-{
-	unsigned long rbslimit, rbstop, stklimit;
-	STAT(unsigned long start, flags;)
-
-	STAT(local_irq_save(flags); ++unw.stat.api.inits; start = ia64_get_itc());
-
-	/*
-	 * Subtle stuff here: we _could_ unwind through the switch_stack frame but we
-	 * don't want to do that because it would be slow as each preserved register would
-	 * have to be processed.  Instead, what we do here is zero out the frame info and
-	 * start the unwind process at the function that created the switch_stack frame.
-	 * When a preserved value in switch_stack needs to be accessed, run_script() will
-	 * initialize the appropriate pointer on demand.
-	 */
-	memset(info, 0, sizeof(*info));
-
-	rbslimit = (unsigned long) t + IA64_RBS_OFFSET;
-	stklimit = (unsigned long) t + IA64_STK_OFFSET;
-
-	rbstop   = sw->ar_bspstore;
-	if (rbstop > stklimit || rbstop < rbslimit)
-		rbstop = rbslimit;
-
-	if (stktop <= rbstop)
-		stktop = rbstop;
-	if (stktop > stklimit)
-		stktop = stklimit;
-
-	info->regstk.limit = rbslimit;
-	info->regstk.top   = rbstop;
-	info->memstk.limit = stklimit;
-	info->memstk.top   = stktop;
-	info->task = t;
-	info->sw  = sw;
-	info->sp = info->psp = stktop;
-	info->pr = sw->pr;
-	UNW_DPRINT(3, "unwind.%s:\n"
-		   "  task   0x%lx\n"
-		   "  rbs = [0x%lx-0x%lx)\n"
-		   "  stk = [0x%lx-0x%lx)\n"
-		   "  pr     0x%lx\n"
-		   "  sw     0x%lx\n"
-		   "  sp     0x%lx\n",
-		   __func__, (unsigned long) t, rbslimit, rbstop, stktop, stklimit,
-		   info->pr, (unsigned long) info->sw, info->sp);
-	STAT(unw.stat.api.init_time += ia64_get_itc() - start; local_irq_restore(flags));
-}
-
-void
-unw_init_frame_info (struct unw_frame_info *info, struct task_struct *t, struct switch_stack *sw)
-{
-	unsigned long sol;
-
-	init_frame_info(info, t, sw, (unsigned long) (sw + 1) - 16);
-	info->cfm_loc = &sw->ar_pfs;
-	sol = (*info->cfm_loc >> 7) & 0x7f;
-	info->bsp = (unsigned long) ia64_rse_skip_regs((unsigned long *) info->regstk.top, -sol);
-	info->ip = sw->b0;
-	UNW_DPRINT(3, "unwind.%s:\n"
-		   "  bsp    0x%lx\n"
-		   "  sol    0x%lx\n"
-		   "  ip     0x%lx\n",
-		   __func__, info->bsp, sol, info->ip);
-	find_save_locs(info);
-}
-
-EXPORT_SYMBOL(unw_init_frame_info);
-
-void
-unw_init_from_blocked_task (struct unw_frame_info *info, struct task_struct *t)
-{
-	struct switch_stack *sw = (struct switch_stack *) (t->thread.ksp + 16);
-
-	UNW_DPRINT(1, "unwind.%s\n", __func__);
-	unw_init_frame_info(info, t, sw);
-}
-EXPORT_SYMBOL(unw_init_from_blocked_task);
-
-static void
-init_unwind_table (struct unw_table *table, const char *name, unsigned long segment_base,
-		   unsigned long gp, const void *table_start, const void *table_end)
-{
-	const struct unw_table_entry *start = table_start, *end = table_end;
-
-	table->name = name;
-	table->segment_base = segment_base;
-	table->gp = gp;
-	table->start = segment_base + start[0].start_offset;
-	table->end = segment_base + end[-1].end_offset;
-	table->array = start;
-	table->length = end - start;
-}
-
-void *
-unw_add_unwind_table (const char *name, unsigned long segment_base, unsigned long gp,
-		      const void *table_start, const void *table_end)
-{
-	const struct unw_table_entry *start = table_start, *end = table_end;
-	struct unw_table *table;
-	unsigned long flags;
-
-	if (end - start <= 0) {
-		UNW_DPRINT(0, "unwind.%s: ignoring attempt to insert empty unwind table\n",
-			   __func__);
-		return NULL;
-	}
-
-	table = kmalloc(sizeof(*table), GFP_USER);
-	if (!table)
-		return NULL;
-
-	init_unwind_table(table, name, segment_base, gp, table_start, table_end);
-
-	spin_lock_irqsave(&unw.lock, flags);
-	{
-		/* keep kernel unwind table at the front (it's searched most commonly): */
-		table->next = unw.tables->next;
-		unw.tables->next = table;
-	}
-	spin_unlock_irqrestore(&unw.lock, flags);
-
-	return table;
-}
-
-void
-unw_remove_unwind_table (void *handle)
-{
-	struct unw_table *table, *prev;
-	struct unw_script *tmp;
-	unsigned long flags;
-	long index;
-
-	if (!handle) {
-		UNW_DPRINT(0, "unwind.%s: ignoring attempt to remove non-existent unwind table\n",
-			   __func__);
-		return;
-	}
-
-	table = handle;
-	if (table == &unw.kernel_table) {
-		UNW_DPRINT(0, "unwind.%s: sorry, freeing the kernel's unwind table is a "
-			   "no-can-do!\n", __func__);
-		return;
-	}
-
-	spin_lock_irqsave(&unw.lock, flags);
-	{
-		/* first, delete the table: */
-
-		for (prev = (struct unw_table *) &unw.tables; prev; prev = prev->next)
-			if (prev->next == table)
-				break;
-		if (!prev) {
-			UNW_DPRINT(0, "unwind.%s: failed to find unwind table %p\n",
-				   __func__, (void *) table);
-			spin_unlock_irqrestore(&unw.lock, flags);
-			return;
-		}
-		prev->next = table->next;
-	}
-	spin_unlock_irqrestore(&unw.lock, flags);
-
-	/* next, remove hash table entries for this table */
-
-	for (index = 0; index < UNW_HASH_SIZE; ++index) {
-		tmp = unw.cache + unw.hash[index];
-		if (unw.hash[index] >= UNW_CACHE_SIZE
-		    || tmp->ip < table->start || tmp->ip >= table->end)
-			continue;
-
-		write_lock(&tmp->lock);
-		{
-			if (tmp->ip >= table->start && tmp->ip < table->end) {
-				unw.hash[index] = tmp->coll_chain;
-				tmp->ip = 0;
-			}
-		}
-		write_unlock(&tmp->lock);
-	}
-
-	kfree(table);
-}
-
-static int __init
-create_gate_table (void)
-{
-	const struct unw_table_entry *entry, *start, *end;
-	unsigned long *lp, segbase = GATE_ADDR;
-	size_t info_size, size;
-	char *info;
-	Elf64_Phdr *punw = NULL, *phdr = (Elf64_Phdr *) (GATE_ADDR + GATE_EHDR->e_phoff);
-	int i;
-
-	for (i = 0; i < GATE_EHDR->e_phnum; ++i, ++phdr)
-		if (phdr->p_type == PT_IA_64_UNWIND) {
-			punw = phdr;
-			break;
-		}
-
-	if (!punw) {
-		printk("%s: failed to find gate DSO's unwind table!\n", __func__);
-		return 0;
-	}
-
-	start = (const struct unw_table_entry *) punw->p_vaddr;
-	end = (struct unw_table_entry *) ((char *) start + punw->p_memsz);
-	size  = 0;
-
-	unw_add_unwind_table("linux-gate.so", segbase, 0, start, end);
-
-	for (entry = start; entry < end; ++entry)
-		size += 3*8 + 8 + 8*UNW_LENGTH(*(u64 *) (segbase + entry->info_offset));
-	size += 8;	/* reserve space for "end of table" marker */
-
-	unw.gate_table = kmalloc(size, GFP_KERNEL);
-	if (!unw.gate_table) {
-		unw.gate_table_size = 0;
-		printk(KERN_ERR "%s: unable to create unwind data for gate page!\n", __func__);
-		return 0;
-	}
-	unw.gate_table_size = size;
-
-	lp = unw.gate_table;
-	info = (char *) unw.gate_table + size;
-
-	for (entry = start; entry < end; ++entry, lp += 3) {
-		info_size = 8 + 8*UNW_LENGTH(*(u64 *) (segbase + entry->info_offset));
-		info -= info_size;
-		memcpy(info, (char *) segbase + entry->info_offset, info_size);
-
-		lp[0] = segbase + entry->start_offset;		/* start */
-		lp[1] = segbase + entry->end_offset;		/* end */
-		lp[2] = info - (char *) unw.gate_table;		/* info */
-	}
-	*lp = 0;	/* end-of-table marker */
-	return 0;
-}
-
-__initcall(create_gate_table);
-
-void __init
-unw_init (void)
-{
-	extern char __gp[];
-	extern void unw_hash_index_t_is_too_narrow (void);
-	long i, off;
-
-	if (8*sizeof(unw_hash_index_t) < UNW_LOG_HASH_SIZE)
-		unw_hash_index_t_is_too_narrow();
-
-	unw.sw_off[unw.preg_index[UNW_REG_PRI_UNAT_GR]] = SW(CALLER_UNAT);
-	unw.sw_off[unw.preg_index[UNW_REG_BSPSTORE]] = SW(AR_BSPSTORE);
-	unw.sw_off[unw.preg_index[UNW_REG_PFS]] = SW(AR_PFS);
-	unw.sw_off[unw.preg_index[UNW_REG_RP]] = SW(B0);
-	unw.sw_off[unw.preg_index[UNW_REG_UNAT]] = SW(CALLER_UNAT);
-	unw.sw_off[unw.preg_index[UNW_REG_PR]] = SW(PR);
-	unw.sw_off[unw.preg_index[UNW_REG_LC]] = SW(AR_LC);
-	unw.sw_off[unw.preg_index[UNW_REG_FPSR]] = SW(AR_FPSR);
-	for (i = UNW_REG_R4, off = SW(R4); i <= UNW_REG_R7; ++i, off += 8)
-		unw.sw_off[unw.preg_index[i]] = off;
-	for (i = UNW_REG_B1, off = SW(B1); i <= UNW_REG_B5; ++i, off += 8)
-		unw.sw_off[unw.preg_index[i]] = off;
-	for (i = UNW_REG_F2, off = SW(F2); i <= UNW_REG_F5; ++i, off += 16)
-		unw.sw_off[unw.preg_index[i]] = off;
-	for (i = UNW_REG_F16, off = SW(F16); i <= UNW_REG_F31; ++i, off += 16)
-		unw.sw_off[unw.preg_index[i]] = off;
-
-	for (i = 0; i < UNW_CACHE_SIZE; ++i) {
-		if (i > 0)
-			unw.cache[i].lru_chain = (i - 1);
-		unw.cache[i].coll_chain = -1;
-		rwlock_init(&unw.cache[i].lock);
-	}
-	unw.lru_head = UNW_CACHE_SIZE - 1;
-	unw.lru_tail = 0;
-
-	init_unwind_table(&unw.kernel_table, "kernel", KERNEL_START, (unsigned long) __gp,
-			  __start_unwind, __end_unwind);
-}
-
-/*
- * DEPRECATED DEPRECATED DEPRECATED DEPRECATED DEPRECATED DEPRECATED DEPRECATED
- *
- *	This system call has been deprecated.  The new and improved way to get
- *	at the kernel's unwind info is via the gate DSO.  The address of the
- *	ELF header for this DSO is passed to user-level via AT_SYSINFO_EHDR.
- *
- * DEPRECATED DEPRECATED DEPRECATED DEPRECATED DEPRECATED DEPRECATED DEPRECATED
- *
- * This system call copies the unwind data into the buffer pointed to by BUF and returns
- * the size of the unwind data.  If BUF_SIZE is smaller than the size of the unwind data
- * or if BUF is NULL, nothing is copied, but the system call still returns the size of the
- * unwind data.
- *
- * The first portion of the unwind data contains an unwind table and rest contains the
- * associated unwind info (in no particular order).  The unwind table consists of a table
- * of entries of the form:
- *
- *	u64 start;	(64-bit address of start of function)
- *	u64 end;	(64-bit address of start of function)
- *	u64 info;	(BUF-relative offset to unwind info)
- *
- * The end of the unwind table is indicated by an entry with a START address of zero.
- *
- * Please see the IA-64 Software Conventions and Runtime Architecture manual for details
- * on the format of the unwind info.
- *
- * ERRORS
- *	EFAULT	BUF points outside your accessible address space.
- */
-asmlinkage long
-sys_getunwind (void __user *buf, size_t buf_size)
-{
-	if (buf && buf_size >= unw.gate_table_size)
-		if (copy_to_user(buf, unw.gate_table, unw.gate_table_size) != 0)
-			return -EFAULT;
-	return unw.gate_table_size;
-}
diff --git a/arch/ia64/kernel/unwind_decoder.c b/arch/ia64/kernel/unwind_decoder.c
deleted file mode 100644
index 83f54f7929b5..000000000000
--- a/arch/ia64/kernel/unwind_decoder.c
+++ /dev/null
@@ -1,460 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * Copyright (C) 2000 Hewlett-Packard Co
- * Copyright (C) 2000 David Mosberger-Tang <davidm@hpl.hp.com>
- *
- * Generic IA-64 unwind info decoder.
- *
- * This file is used both by the Linux kernel and objdump.  Please keep
- * the two copies of this file in sync.
- *
- * You need to customize the decoder by defining the following
- * macros/constants before including this file:
- *
- *  Types:
- *	unw_word	Unsigned integer type with at least 64 bits 
- *
- *  Register names:
- *	UNW_REG_BSP
- *	UNW_REG_BSPSTORE
- *	UNW_REG_FPSR
- *	UNW_REG_LC
- *	UNW_REG_PFS
- *	UNW_REG_PR
- *	UNW_REG_RNAT
- *	UNW_REG_PSP
- *	UNW_REG_RP
- *	UNW_REG_UNAT
- *
- *  Decoder action macros:
- *	UNW_DEC_BAD_CODE(code)
- *	UNW_DEC_ABI(fmt,abi,context,arg)
- *	UNW_DEC_BR_GR(fmt,brmask,gr,arg)
- *	UNW_DEC_BR_MEM(fmt,brmask,arg)
- *	UNW_DEC_COPY_STATE(fmt,label,arg)
- *	UNW_DEC_EPILOGUE(fmt,t,ecount,arg)
- *	UNW_DEC_FRGR_MEM(fmt,grmask,frmask,arg)
- *	UNW_DEC_FR_MEM(fmt,frmask,arg)
- *	UNW_DEC_GR_GR(fmt,grmask,gr,arg)
- *	UNW_DEC_GR_MEM(fmt,grmask,arg)
- *	UNW_DEC_LABEL_STATE(fmt,label,arg)
- *	UNW_DEC_MEM_STACK_F(fmt,t,size,arg)
- *	UNW_DEC_MEM_STACK_V(fmt,t,arg)
- *	UNW_DEC_PRIUNAT_GR(fmt,r,arg)
- *	UNW_DEC_PRIUNAT_WHEN_GR(fmt,t,arg)
- *	UNW_DEC_PRIUNAT_WHEN_MEM(fmt,t,arg)
- *	UNW_DEC_PRIUNAT_WHEN_PSPREL(fmt,pspoff,arg)
- *	UNW_DEC_PRIUNAT_WHEN_SPREL(fmt,spoff,arg)
- *	UNW_DEC_PROLOGUE(fmt,body,rlen,arg)
- *	UNW_DEC_PROLOGUE_GR(fmt,rlen,mask,grsave,arg)
- *	UNW_DEC_REG_PSPREL(fmt,reg,pspoff,arg)
- *	UNW_DEC_REG_REG(fmt,src,dst,arg)
- *	UNW_DEC_REG_SPREL(fmt,reg,spoff,arg)
- *	UNW_DEC_REG_WHEN(fmt,reg,t,arg)
- *	UNW_DEC_RESTORE(fmt,t,abreg,arg)
- *	UNW_DEC_RESTORE_P(fmt,qp,t,abreg,arg)
- *	UNW_DEC_SPILL_BASE(fmt,pspoff,arg)
- *	UNW_DEC_SPILL_MASK(fmt,imaskp,arg)
- *	UNW_DEC_SPILL_PSPREL(fmt,t,abreg,pspoff,arg)
- *	UNW_DEC_SPILL_PSPREL_P(fmt,qp,t,abreg,pspoff,arg)
- *	UNW_DEC_SPILL_REG(fmt,t,abreg,x,ytreg,arg)
- *	UNW_DEC_SPILL_REG_P(fmt,qp,t,abreg,x,ytreg,arg)
- *	UNW_DEC_SPILL_SPREL(fmt,t,abreg,spoff,arg)
- *	UNW_DEC_SPILL_SPREL_P(fmt,qp,t,abreg,pspoff,arg)
- */
-
-static unw_word
-unw_decode_uleb128 (unsigned char **dpp)
-{
-  unsigned shift = 0;
-  unw_word byte, result = 0;
-  unsigned char *bp = *dpp;
-
-  while (1)
-    {
-      byte = *bp++;
-      result |= (byte & 0x7f) << shift;
-      if ((byte & 0x80) == 0)
-	break;
-      shift += 7;
-    }
-  *dpp = bp;
-  return result;
-}
-
-static unsigned char *
-unw_decode_x1 (unsigned char *dp, unsigned char code, void *arg)
-{
-  unsigned char byte1, abreg;
-  unw_word t, off;
-
-  byte1 = *dp++;
-  t = unw_decode_uleb128 (&dp);
-  off = unw_decode_uleb128 (&dp);
-  abreg = (byte1 & 0x7f);
-  if (byte1 & 0x80)
-	  UNW_DEC_SPILL_SPREL(X1, t, abreg, off, arg);
-  else
-	  UNW_DEC_SPILL_PSPREL(X1, t, abreg, off, arg);
-  return dp;
-}
-
-static unsigned char *
-unw_decode_x2 (unsigned char *dp, unsigned char code, void *arg)
-{
-  unsigned char byte1, byte2, abreg, x, ytreg;
-  unw_word t;
-
-  byte1 = *dp++; byte2 = *dp++;
-  t = unw_decode_uleb128 (&dp);
-  abreg = (byte1 & 0x7f);
-  ytreg = byte2;
-  x = (byte1 >> 7) & 1;
-  if ((byte1 & 0x80) == 0 && ytreg == 0)
-    UNW_DEC_RESTORE(X2, t, abreg, arg);
-  else
-    UNW_DEC_SPILL_REG(X2, t, abreg, x, ytreg, arg);
-  return dp;
-}
-
-static unsigned char *
-unw_decode_x3 (unsigned char *dp, unsigned char code, void *arg)
-{
-  unsigned char byte1, byte2, abreg, qp;
-  unw_word t, off;
-
-  byte1 = *dp++; byte2 = *dp++;
-  t = unw_decode_uleb128 (&dp);
-  off = unw_decode_uleb128 (&dp);
-
-  qp = (byte1 & 0x3f);
-  abreg = (byte2 & 0x7f);
-
-  if (byte1 & 0x80)
-    UNW_DEC_SPILL_SPREL_P(X3, qp, t, abreg, off, arg);
-  else
-    UNW_DEC_SPILL_PSPREL_P(X3, qp, t, abreg, off, arg);
-  return dp;
-}
-
-static unsigned char *
-unw_decode_x4 (unsigned char *dp, unsigned char code, void *arg)
-{
-  unsigned char byte1, byte2, byte3, qp, abreg, x, ytreg;
-  unw_word t;
-
-  byte1 = *dp++; byte2 = *dp++; byte3 = *dp++;
-  t = unw_decode_uleb128 (&dp);
-
-  qp = (byte1 & 0x3f);
-  abreg = (byte2 & 0x7f);
-  x = (byte2 >> 7) & 1;
-  ytreg = byte3;
-
-  if ((byte2 & 0x80) == 0 && byte3 == 0)
-    UNW_DEC_RESTORE_P(X4, qp, t, abreg, arg);
-  else
-    UNW_DEC_SPILL_REG_P(X4, qp, t, abreg, x, ytreg, arg);
-  return dp;
-}
-
-static unsigned char *
-unw_decode_r1 (unsigned char *dp, unsigned char code, void *arg)
-{
-  int body = (code & 0x20) != 0;
-  unw_word rlen;
-
-  rlen = (code & 0x1f);
-  UNW_DEC_PROLOGUE(R1, body, rlen, arg);
-  return dp;
-}
-
-static unsigned char *
-unw_decode_r2 (unsigned char *dp, unsigned char code, void *arg)
-{
-  unsigned char byte1, mask, grsave;
-  unw_word rlen;
-
-  byte1 = *dp++;
-
-  mask = ((code & 0x7) << 1) | ((byte1 >> 7) & 1);
-  grsave = (byte1 & 0x7f);
-  rlen = unw_decode_uleb128 (&dp);
-  UNW_DEC_PROLOGUE_GR(R2, rlen, mask, grsave, arg);
-  return dp;
-}
-
-static unsigned char *
-unw_decode_r3 (unsigned char *dp, unsigned char code, void *arg)
-{
-  unw_word rlen;
-
-  rlen = unw_decode_uleb128 (&dp);
-  UNW_DEC_PROLOGUE(R3, ((code & 0x3) == 1), rlen, arg);
-  return dp;
-}
-
-static unsigned char *
-unw_decode_p1 (unsigned char *dp, unsigned char code, void *arg)
-{
-  unsigned char brmask = (code & 0x1f);
-
-  UNW_DEC_BR_MEM(P1, brmask, arg);
-  return dp;
-}
-
-static unsigned char *
-unw_decode_p2_p5 (unsigned char *dp, unsigned char code, void *arg)
-{
-  if ((code & 0x10) == 0)
-    {
-      unsigned char byte1 = *dp++;
-
-      UNW_DEC_BR_GR(P2, ((code & 0xf) << 1) | ((byte1 >> 7) & 1),
-		    (byte1 & 0x7f), arg);
-    }
-  else if ((code & 0x08) == 0)
-    {
-      unsigned char byte1 = *dp++, r, dst;
-
-      r = ((code & 0x7) << 1) | ((byte1 >> 7) & 1);
-      dst = (byte1 & 0x7f);
-      switch (r)
-	{
-	case 0: UNW_DEC_REG_GR(P3, UNW_REG_PSP, dst, arg); break;
-	case 1: UNW_DEC_REG_GR(P3, UNW_REG_RP, dst, arg); break;
-	case 2: UNW_DEC_REG_GR(P3, UNW_REG_PFS, dst, arg); break;
-	case 3: UNW_DEC_REG_GR(P3, UNW_REG_PR, dst, arg); break;
-	case 4: UNW_DEC_REG_GR(P3, UNW_REG_UNAT, dst, arg); break;
-	case 5: UNW_DEC_REG_GR(P3, UNW_REG_LC, dst, arg); break;
-	case 6: UNW_DEC_RP_BR(P3, dst, arg); break;
-	case 7: UNW_DEC_REG_GR(P3, UNW_REG_RNAT, dst, arg); break;
-	case 8: UNW_DEC_REG_GR(P3, UNW_REG_BSP, dst, arg); break;
-	case 9: UNW_DEC_REG_GR(P3, UNW_REG_BSPSTORE, dst, arg); break;
-	case 10: UNW_DEC_REG_GR(P3, UNW_REG_FPSR, dst, arg); break;
-	case 11: UNW_DEC_PRIUNAT_GR(P3, dst, arg); break;
-	default: UNW_DEC_BAD_CODE(r); break;
-	}
-    }
-  else if ((code & 0x7) == 0)
-    UNW_DEC_SPILL_MASK(P4, dp, arg);
-  else if ((code & 0x7) == 1)
-    {
-      unw_word grmask, frmask, byte1, byte2, byte3;
-
-      byte1 = *dp++; byte2 = *dp++; byte3 = *dp++;
-      grmask = ((byte1 >> 4) & 0xf);
-      frmask = ((byte1 & 0xf) << 16) | (byte2 << 8) | byte3;
-      UNW_DEC_FRGR_MEM(P5, grmask, frmask, arg);
-    }
-  else
-    UNW_DEC_BAD_CODE(code);
-  return dp;
-}
-
-static unsigned char *
-unw_decode_p6 (unsigned char *dp, unsigned char code, void *arg)
-{
-  int gregs = (code & 0x10) != 0;
-  unsigned char mask = (code & 0x0f);
-
-  if (gregs)
-    UNW_DEC_GR_MEM(P6, mask, arg);
-  else
-    UNW_DEC_FR_MEM(P6, mask, arg);
-  return dp;
-}
-
-static unsigned char *
-unw_decode_p7_p10 (unsigned char *dp, unsigned char code, void *arg)
-{
-  unsigned char r, byte1, byte2;
-  unw_word t, size;
-
-  if ((code & 0x10) == 0)
-    {
-      r = (code & 0xf);
-      t = unw_decode_uleb128 (&dp);
-      switch (r)
-	{
-	case 0:
-	  size = unw_decode_uleb128 (&dp);
-	  UNW_DEC_MEM_STACK_F(P7, t, size, arg);
-	  break;
-
-	case 1: UNW_DEC_MEM_STACK_V(P7, t, arg); break;
-	case 2: UNW_DEC_SPILL_BASE(P7, t, arg); break;
-	case 3: UNW_DEC_REG_SPREL(P7, UNW_REG_PSP, t, arg); break;
-	case 4: UNW_DEC_REG_WHEN(P7, UNW_REG_RP, t, arg); break;
-	case 5: UNW_DEC_REG_PSPREL(P7, UNW_REG_RP, t, arg); break;
-	case 6: UNW_DEC_REG_WHEN(P7, UNW_REG_PFS, t, arg); break;
-	case 7: UNW_DEC_REG_PSPREL(P7, UNW_REG_PFS, t, arg); break;
-	case 8: UNW_DEC_REG_WHEN(P7, UNW_REG_PR, t, arg); break;
-	case 9: UNW_DEC_REG_PSPREL(P7, UNW_REG_PR, t, arg); break;
-	case 10: UNW_DEC_REG_WHEN(P7, UNW_REG_LC, t, arg); break;
-	case 11: UNW_DEC_REG_PSPREL(P7, UNW_REG_LC, t, arg); break;
-	case 12: UNW_DEC_REG_WHEN(P7, UNW_REG_UNAT, t, arg); break;
-	case 13: UNW_DEC_REG_PSPREL(P7, UNW_REG_UNAT, t, arg); break;
-	case 14: UNW_DEC_REG_WHEN(P7, UNW_REG_FPSR, t, arg); break;
-	case 15: UNW_DEC_REG_PSPREL(P7, UNW_REG_FPSR, t, arg); break;
-	default: UNW_DEC_BAD_CODE(r); break;
-	}
-    }
-  else
-    {
-      switch (code & 0xf)
-	{
-	case 0x0: /* p8 */
-	  {
-	    r = *dp++;
-	    t = unw_decode_uleb128 (&dp);
-	    switch (r)
-	      {
-	      case  1: UNW_DEC_REG_SPREL(P8, UNW_REG_RP, t, arg); break;
-	      case  2: UNW_DEC_REG_SPREL(P8, UNW_REG_PFS, t, arg); break;
-	      case  3: UNW_DEC_REG_SPREL(P8, UNW_REG_PR, t, arg); break;
-	      case  4: UNW_DEC_REG_SPREL(P8, UNW_REG_LC, t, arg); break;
-	      case  5: UNW_DEC_REG_SPREL(P8, UNW_REG_UNAT, t, arg); break;
-	      case  6: UNW_DEC_REG_SPREL(P8, UNW_REG_FPSR, t, arg); break;
-	      case  7: UNW_DEC_REG_WHEN(P8, UNW_REG_BSP, t, arg); break;
-	      case  8: UNW_DEC_REG_PSPREL(P8, UNW_REG_BSP, t, arg); break;
-	      case  9: UNW_DEC_REG_SPREL(P8, UNW_REG_BSP, t, arg); break;
-	      case 10: UNW_DEC_REG_WHEN(P8, UNW_REG_BSPSTORE, t, arg); break;
-	      case 11: UNW_DEC_REG_PSPREL(P8, UNW_REG_BSPSTORE, t, arg); break;
-	      case 12: UNW_DEC_REG_SPREL(P8, UNW_REG_BSPSTORE, t, arg); break;
-	      case 13: UNW_DEC_REG_WHEN(P8, UNW_REG_RNAT, t, arg); break;
-	      case 14: UNW_DEC_REG_PSPREL(P8, UNW_REG_RNAT, t, arg); break;
-	      case 15: UNW_DEC_REG_SPREL(P8, UNW_REG_RNAT, t, arg); break;
-	      case 16: UNW_DEC_PRIUNAT_WHEN_GR(P8, t, arg); break;
-	      case 17: UNW_DEC_PRIUNAT_PSPREL(P8, t, arg); break;
-	      case 18: UNW_DEC_PRIUNAT_SPREL(P8, t, arg); break;
-	      case 19: UNW_DEC_PRIUNAT_WHEN_MEM(P8, t, arg); break;
-	      default: UNW_DEC_BAD_CODE(r); break;
-	    }
-	  }
-	  break;
-
-	case 0x1:
-	  byte1 = *dp++; byte2 = *dp++;
-	  UNW_DEC_GR_GR(P9, (byte1 & 0xf), (byte2 & 0x7f), arg);
-	  break;
-
-	case 0xf: /* p10 */
-	  byte1 = *dp++; byte2 = *dp++;
-	  UNW_DEC_ABI(P10, byte1, byte2, arg);
-	  break;
-
-	case 0x9:
-	  return unw_decode_x1 (dp, code, arg);
-
-	case 0xa:
-	  return unw_decode_x2 (dp, code, arg);
-
-	case 0xb:
-	  return unw_decode_x3 (dp, code, arg);
-
-	case 0xc:
-	  return unw_decode_x4 (dp, code, arg);
-
-	default:
-	  UNW_DEC_BAD_CODE(code);
-	  break;
-	}
-    }
-  return dp;
-}
-
-static unsigned char *
-unw_decode_b1 (unsigned char *dp, unsigned char code, void *arg)
-{
-  unw_word label = (code & 0x1f);
-
-  if ((code & 0x20) != 0)
-    UNW_DEC_COPY_STATE(B1, label, arg);
-  else
-    UNW_DEC_LABEL_STATE(B1, label, arg);
-  return dp;
-}
-
-static unsigned char *
-unw_decode_b2 (unsigned char *dp, unsigned char code, void *arg)
-{
-  unw_word t;
-
-  t = unw_decode_uleb128 (&dp);
-  UNW_DEC_EPILOGUE(B2, t, (code & 0x1f), arg);
-  return dp;
-}
-
-static unsigned char *
-unw_decode_b3_x4 (unsigned char *dp, unsigned char code, void *arg)
-{
-  unw_word t, ecount, label;
-
-  if ((code & 0x10) == 0)
-    {
-      t = unw_decode_uleb128 (&dp);
-      ecount = unw_decode_uleb128 (&dp);
-      UNW_DEC_EPILOGUE(B3, t, ecount, arg);
-    }
-  else if ((code & 0x07) == 0)
-    {
-      label = unw_decode_uleb128 (&dp);
-      if ((code & 0x08) != 0)
-	UNW_DEC_COPY_STATE(B4, label, arg);
-      else
-	UNW_DEC_LABEL_STATE(B4, label, arg);
-    }
-  else
-    switch (code & 0x7)
-      {
-      case 1: return unw_decode_x1 (dp, code, arg);
-      case 2: return unw_decode_x2 (dp, code, arg);
-      case 3: return unw_decode_x3 (dp, code, arg);
-      case 4: return unw_decode_x4 (dp, code, arg);
-      default: UNW_DEC_BAD_CODE(code); break;
-      }
-  return dp;
-}
-
-typedef unsigned char *(*unw_decoder) (unsigned char *, unsigned char, void *);
-
-static unw_decoder unw_decode_table[2][8] =
-{
-  /* prologue table: */
-  {
-    unw_decode_r1,	/* 0 */
-    unw_decode_r1,
-    unw_decode_r2,
-    unw_decode_r3,
-    unw_decode_p1,	/* 4 */
-    unw_decode_p2_p5,
-    unw_decode_p6,
-    unw_decode_p7_p10
-  },
-  {
-    unw_decode_r1,	/* 0 */
-    unw_decode_r1,
-    unw_decode_r2,
-    unw_decode_r3,
-    unw_decode_b1,	/* 4 */
-    unw_decode_b1,
-    unw_decode_b2,
-    unw_decode_b3_x4
-  }
-};
-
-/*
- * Decode one descriptor and return address of next descriptor.
- */
-static inline unsigned char *
-unw_decode (unsigned char *dp, int inside_body, void *arg)
-{
-  unw_decoder decoder;
-  unsigned char code;
-
-  code = *dp++;
-  decoder = unw_decode_table[inside_body][code >> 5];
-  dp = (*decoder) (dp, code, arg);
-  return dp;
-}
diff --git a/arch/ia64/kernel/unwind_i.h b/arch/ia64/kernel/unwind_i.h
deleted file mode 100644
index 1dd57ba44327..000000000000
--- a/arch/ia64/kernel/unwind_i.h
+++ /dev/null
@@ -1,165 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-/*
- * Copyright (C) 2000, 2002-2003 Hewlett-Packard Co
- *	David Mosberger-Tang <davidm@hpl.hp.com>
- *
- * Kernel unwind support.
- */
-
-#define UNW_VER(x)		((x) >> 48)
-#define UNW_FLAG_MASK		0x0000ffff00000000
-#define UNW_FLAG_OSMASK		0x0000f00000000000
-#define UNW_FLAG_EHANDLER(x)	((x) & 0x0000000100000000L)
-#define UNW_FLAG_UHANDLER(x)	((x) & 0x0000000200000000L)
-#define UNW_LENGTH(x)		((x) & 0x00000000ffffffffL)
-
-enum unw_register_index {
-	/* primary unat: */
-	UNW_REG_PRI_UNAT_GR,
-	UNW_REG_PRI_UNAT_MEM,
-
-	/* register stack */
-	UNW_REG_BSP,					/* register stack pointer */
-	UNW_REG_BSPSTORE,
-	UNW_REG_PFS,					/* previous function state */
-	UNW_REG_RNAT,
-	/* memory stack */
-	UNW_REG_PSP,					/* previous memory stack pointer */
-	/* return pointer: */
-	UNW_REG_RP,
-
-	/* preserved registers: */
-	UNW_REG_R4, UNW_REG_R5, UNW_REG_R6, UNW_REG_R7,
-	UNW_REG_UNAT, UNW_REG_PR, UNW_REG_LC, UNW_REG_FPSR,
-	UNW_REG_B1, UNW_REG_B2, UNW_REG_B3, UNW_REG_B4, UNW_REG_B5,
-	UNW_REG_F2, UNW_REG_F3, UNW_REG_F4, UNW_REG_F5,
-	UNW_REG_F16, UNW_REG_F17, UNW_REG_F18, UNW_REG_F19,
-	UNW_REG_F20, UNW_REG_F21, UNW_REG_F22, UNW_REG_F23,
-	UNW_REG_F24, UNW_REG_F25, UNW_REG_F26, UNW_REG_F27,
-	UNW_REG_F28, UNW_REG_F29, UNW_REG_F30, UNW_REG_F31,
-	UNW_NUM_REGS
-};
-
-struct unw_info_block {
-	u64 header;
-	u64 desc[];		/* unwind descriptors */
-	/* personality routine and language-specific data follow behind descriptors */
-};
-
-struct unw_table {
-	struct unw_table *next;		/* must be first member! */
-	const char *name;
-	unsigned long gp;		/* global pointer for this load-module */
-	unsigned long segment_base;	/* base for offsets in the unwind table entries */
-	unsigned long start;
-	unsigned long end;
-	const struct unw_table_entry *array;
-	unsigned long length;
-};
-
-enum unw_where {
-	UNW_WHERE_NONE,			/* register isn't saved at all */
-	UNW_WHERE_GR,			/* register is saved in a general register */
-	UNW_WHERE_FR,			/* register is saved in a floating-point register */
-	UNW_WHERE_BR,			/* register is saved in a branch register */
-	UNW_WHERE_SPREL,		/* register is saved on memstack (sp-relative) */
-	UNW_WHERE_PSPREL,		/* register is saved on memstack (psp-relative) */
-	/*
-	 * At the end of each prologue these locations get resolved to
-	 * UNW_WHERE_PSPREL and UNW_WHERE_GR, respectively:
-	 */
-	UNW_WHERE_SPILL_HOME,		/* register is saved in its spill home */
-	UNW_WHERE_GR_SAVE		/* register is saved in next general register */
-};
-
-#define UNW_WHEN_NEVER	0x7fffffff
-
-struct unw_reg_info {
-	unsigned long val;		/* save location: register number or offset */
-	enum unw_where where;		/* where the register gets saved */
-	int when;			/* when the register gets saved */
-};
-
-struct unw_reg_state {
-	struct unw_reg_state *next;		/* next (outer) element on state stack */
-	struct unw_reg_info reg[UNW_NUM_REGS];	/* register save locations */
-};
-
-struct unw_labeled_state {
-	struct unw_labeled_state *next;		/* next labeled state (or NULL) */
-	unsigned long label;			/* label for this state */
-	struct unw_reg_state saved_state;
-};
-
-struct unw_state_record {
-	unsigned int first_region : 1;	/* is this the first region? */
-	unsigned int done : 1;		/* are we done scanning descriptors? */
-	unsigned int any_spills : 1;	/* got any register spills? */
-	unsigned int in_body : 1;	/* are we inside a body (as opposed to a prologue)? */
-	unsigned long flags;		/* see UNW_FLAG_* in unwind.h */
-
-	u8 *imask;			/* imask of spill_mask record or NULL */
-	unsigned long pr_val;		/* predicate values */
-	unsigned long pr_mask;		/* predicate mask */
-	long spill_offset;		/* psp-relative offset for spill base */
-	int region_start;
-	int region_len;
-	int epilogue_start;
-	int epilogue_count;
-	int when_target;
-
-	u8 gr_save_loc;			/* next general register to use for saving a register */
-	u8 return_link_reg;		/* branch register in which the return link is passed */
-
-	struct unw_labeled_state *labeled_states;	/* list of all labeled states */
-	struct unw_reg_state curr;	/* current state */
-};
-
-enum unw_nat_type {
-	UNW_NAT_NONE,		/* NaT not represented */
-	UNW_NAT_VAL,		/* NaT represented by NaT value (fp reg) */
-	UNW_NAT_MEMSTK,		/* NaT value is in unat word at offset OFF  */
-	UNW_NAT_REGSTK		/* NaT is in rnat */
-};
-
-enum unw_insn_opcode {
-	UNW_INSN_ADD,			/* s[dst] += val */
-	UNW_INSN_ADD_PSP,		/* s[dst] = (s.psp + val) */
-	UNW_INSN_ADD_SP,		/* s[dst] = (s.sp + val) */
-	UNW_INSN_MOVE,			/* s[dst] = s[val] */
-	UNW_INSN_MOVE2,			/* s[dst] = s[val]; s[dst+1] = s[val+1] */
-	UNW_INSN_MOVE_STACKED,		/* s[dst] = ia64_rse_skip(*s.bsp, val) */
-	UNW_INSN_SETNAT_MEMSTK,		/* s[dst+1].nat.type = MEMSTK;
-					   s[dst+1].nat.off = *s.pri_unat - s[dst] */
-	UNW_INSN_SETNAT_TYPE,		/* s[dst+1].nat.type = val */
-	UNW_INSN_LOAD,			/* s[dst] = *s[val] */
-	UNW_INSN_MOVE_SCRATCH,		/* s[dst] = scratch reg "val" */
-	UNW_INSN_MOVE_CONST,            /* s[dst] = constant reg "val" */
-};
-
-struct unw_insn {
-	unsigned int opc	:  4;
-	unsigned int dst	:  9;
-	signed int val		: 19;
-};
-
-/*
- * Preserved general static registers (r4-r7) give rise to two script
- * instructions; everything else yields at most one instruction; at
- * the end of the script, the psp gets popped, accounting for one more
- * instruction.
- */
-#define UNW_MAX_SCRIPT_LEN	(UNW_NUM_REGS + 5)
-
-struct unw_script {
-	unsigned long ip;		/* ip this script is for */
-	unsigned long pr_mask;		/* mask of predicates script depends on */
-	unsigned long pr_val;		/* predicate values this script is for */
-	rwlock_t lock;
-	unsigned int flags;		/* see UNW_FLAG_* in unwind.h */
-	unsigned short lru_chain;	/* used for least-recently-used chain */
-	unsigned short coll_chain;	/* used for hash collisions */
-	unsigned short hint;		/* hint for next script to try (or -1) */
-	unsigned short count;		/* number of instructions in script */
-	struct unw_insn insn[UNW_MAX_SCRIPT_LEN];
-};
diff --git a/arch/ia64/kernel/vmlinux.lds.S b/arch/ia64/kernel/vmlinux.lds.S
deleted file mode 100644
index 53dfde161c8a..000000000000
--- a/arch/ia64/kernel/vmlinux.lds.S
+++ /dev/null
@@ -1,224 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-
-#include <linux/pgtable.h>
-#include <asm/cache.h>
-#include <asm/ptrace.h>
-#include <asm/thread_info.h>
-
-#define EMITS_PT_NOTE
-#define RO_EXCEPTION_TABLE_ALIGN	16
-
-#include <asm-generic/vmlinux.lds.h>
-
-OUTPUT_FORMAT("elf64-ia64-little")
-OUTPUT_ARCH(ia64)
-ENTRY(phys_start)
-jiffies = jiffies_64;
-
-PHDRS {
-	text   PT_LOAD;
-	percpu PT_LOAD;
-	data   PT_LOAD;
-	note   PT_NOTE;
-	unwind 0x70000001; /* PT_IA_64_UNWIND, but ld doesn't match the name */
-}
-
-SECTIONS {
-	/*
-	 * unwind exit sections must be discarded before
-	 * the rest of the sections get included.
-	 */
-	/DISCARD/ : {
-		*(.IA_64.unwind.exit.text)
-		*(.IA_64.unwind_info.exit.text)
-		*(.comment)
-		*(.note)
-	}
-
-	v = PAGE_OFFSET; /* this symbol is here to make debugging easier... */
-	phys_start = _start - LOAD_OFFSET;
-
-	code : {
-	} :text
-	. = KERNEL_START;
-
-	_text = .;
-	_stext = .;
-
-	.text : AT(ADDR(.text) - LOAD_OFFSET) {
-		__start_ivt_text = .;
-		*(.text..ivt)
-		__end_ivt_text = .;
-		TEXT_TEXT
-		SCHED_TEXT
-		LOCK_TEXT
-		KPROBES_TEXT
-		IRQENTRY_TEXT
-		SOFTIRQENTRY_TEXT
-		*(.gnu.linkonce.t*)
-	}
-
-	.text2 : AT(ADDR(.text2) - LOAD_OFFSET)	{
-		*(.text2)
-	}
-
-#ifdef CONFIG_SMP
-	.text..lock : AT(ADDR(.text..lock) - LOAD_OFFSET) {
-		*(.text..lock)
-	}
-#endif
-	_etext = .;
-
-	/*
-	 * Read-only data
-	 */
-
-	/* MCA table */
-	. = ALIGN(16);
-	__mca_table : AT(ADDR(__mca_table) - LOAD_OFFSET) {
-		__start___mca_table = .;
-		*(__mca_table)
-		__stop___mca_table = .;
-	}
-
-	.data..patch.phys_stack_reg : AT(ADDR(.data..patch.phys_stack_reg) - LOAD_OFFSET) {
-		__start___phys_stack_reg_patchlist = .;
-		*(.data..patch.phys_stack_reg)
-		__end___phys_stack_reg_patchlist = .;
-	}
-
-	/*
-	 * Global data
-	 */
-	_data = .;
-
-	/* Unwind info & table: */
-	. = ALIGN(8);
-	.IA_64.unwind_info : AT(ADDR(.IA_64.unwind_info) - LOAD_OFFSET) {
-		*(.IA_64.unwind_info*)
-	}
-	.IA_64.unwind : AT(ADDR(.IA_64.unwind) - LOAD_OFFSET) {
-		__start_unwind = .;
-		*(.IA_64.unwind*)
-		__end_unwind = .;
-	} :text :unwind
-	code_continues2 : {
-	} :text
-
-	RO_DATA(4096)
-
-	.opd : AT(ADDR(.opd) - LOAD_OFFSET) {
-		__start_opd = .;
-		*(.opd)
-		__end_opd = .;
-	}
-
-	/*
-	 * Initialization code and data:
-	 */
-	. = ALIGN(PAGE_SIZE);
-	__init_begin = .;
-
-	INIT_TEXT_SECTION(PAGE_SIZE)
-	INIT_DATA_SECTION(16)
-
-	.data..patch.vtop : AT(ADDR(.data..patch.vtop) - LOAD_OFFSET) {
-		__start___vtop_patchlist = .;
-		*(.data..patch.vtop)
-		__end___vtop_patchlist = .;
-	}
-
-	.data..patch.rse : AT(ADDR(.data..patch.rse) - LOAD_OFFSET) {
-		__start___rse_patchlist = .;
-		*(.data..patch.rse)
-		__end___rse_patchlist = .;
-	}
-
-	.data..patch.mckinley_e9 : AT(ADDR(.data..patch.mckinley_e9) - LOAD_OFFSET) {
-		__start___mckinley_e9_bundles = .;
-		*(.data..patch.mckinley_e9)
-		__end___mckinley_e9_bundles = .;
-	}
-
-#ifdef	CONFIG_SMP
-	. = ALIGN(PERCPU_PAGE_SIZE);
-	__cpu0_per_cpu = .;
-	. = . + PERCPU_PAGE_SIZE;   /* cpu0 per-cpu space */
-#endif
-
-	. = ALIGN(PAGE_SIZE);
-	__init_end = .;
-
-	.data..page_aligned : AT(ADDR(.data..page_aligned) - LOAD_OFFSET) {
-		PAGE_ALIGNED_DATA(PAGE_SIZE)
-		. = ALIGN(PAGE_SIZE);
-		__start_gate_section = .;
-		*(.data..gate)
-		__stop_gate_section = .;
-	}
-	/*
-	 * make sure the gate page doesn't expose
-	 * kernel data
-	 */
-	. = ALIGN(PAGE_SIZE);
-
-	/* Per-cpu data: */
-	. = ALIGN(PERCPU_PAGE_SIZE);
-	PERCPU_VADDR(SMP_CACHE_BYTES, PERCPU_ADDR, :percpu)
-	__phys_per_cpu_start = __per_cpu_load;
-	/*
-	 * ensure percpu data fits
-	 * into percpu page size
-	 */
-	. = __phys_per_cpu_start + PERCPU_PAGE_SIZE;
-
-	data : {
-	} :data
-	.data : AT(ADDR(.data) - LOAD_OFFSET) {
-		_sdata  =  .;
-		INIT_TASK_DATA(PAGE_SIZE)
-		CACHELINE_ALIGNED_DATA(SMP_CACHE_BYTES)
-		READ_MOSTLY_DATA(SMP_CACHE_BYTES)
-		DATA_DATA
-		*(.data1)
-		*(.gnu.linkonce.d*)
-		CONSTRUCTORS
-	}
-
-	BUG_TABLE
-
-	. = ALIGN(16);	/* gp must be 16-byte aligned for exc. table */
-	.got : AT(ADDR(.got) - LOAD_OFFSET) {
-		*(.got.plt)
-		*(.got)
-	}
-	__gp = ADDR(.got) + 0x200000;
-
-	/*
-	 * We want the small data sections together,
-	 * so single-instruction offsets can access
-	 * them all, and initialized data all before
-	 * uninitialized, so we can shorten the
-	 * on-disk segment size.
-	 */
-	.sdata : AT(ADDR(.sdata) - LOAD_OFFSET) {
-		*(.sdata)
-		*(.sdata1)
-		*(.srdata)
-	}
-	_edata  =  .;
-
-	BSS_SECTION(0, 0, 0)
-
-	_end = .;
-
-	code : {
-	} :text
-
-	STABS_DEBUG
-	DWARF_DEBUG
-	ELF_DETAILS
-
-	/* Default discards */
-	DISCARDS
-}
diff --git a/arch/ia64/lib/Makefile b/arch/ia64/lib/Makefile
deleted file mode 100644
index 081fcba01dc0..000000000000
--- a/arch/ia64/lib/Makefile
+++ /dev/null
@@ -1,48 +0,0 @@
-# SPDX-License-Identifier: GPL-2.0
-#
-# Makefile for ia64-specific library routines..
-#
-
-lib-y := io.o __divsi3.o __udivsi3.o __modsi3.o __umodsi3.o		\
-	__divdi3.o __udivdi3.o __moddi3.o __umoddi3.o			\
-	checksum.o clear_page.o csum_partial_copy.o			\
-	clear_user.o strncpy_from_user.o strnlen_user.o			\
-	flush.o ip_fast_csum.o do_csum.o				\
-	memset.o strlen.o xor.o
-
-lib-$(CONFIG_ITANIUM)	+= copy_page.o copy_user.o memcpy.o
-lib-$(CONFIG_MCKINLEY)	+= copy_page_mck.o memcpy_mck.o
-
-AFLAGS___divdi3.o	=
-AFLAGS___udivdi3.o	= -DUNSIGNED
-AFLAGS___moddi3.o	= 	     -DMODULO
-AFLAGS___umoddi3.o	= -DUNSIGNED -DMODULO
-
-AFLAGS___divsi3.o	=
-AFLAGS___udivsi3.o	= -DUNSIGNED
-AFLAGS___modsi3.o	=	     -DMODULO
-AFLAGS___umodsi3.o	= -DUNSIGNED -DMODULO
-
-$(obj)/__divdi3.o: $(src)/idiv64.S FORCE
-	$(call if_changed_rule,as_o_S)
-
-$(obj)/__udivdi3.o: $(src)/idiv64.S FORCE
-	$(call if_changed_rule,as_o_S)
-
-$(obj)/__moddi3.o: $(src)/idiv64.S FORCE
-	$(call if_changed_rule,as_o_S)
-
-$(obj)/__umoddi3.o: $(src)/idiv64.S FORCE
-	$(call if_changed_rule,as_o_S)
-
-$(obj)/__divsi3.o: $(src)/idiv32.S FORCE
-	$(call if_changed_rule,as_o_S)
-
-$(obj)/__udivsi3.o: $(src)/idiv32.S FORCE
-	$(call if_changed_rule,as_o_S)
-
-$(obj)/__modsi3.o: $(src)/idiv32.S FORCE
-	$(call if_changed_rule,as_o_S)
-
-$(obj)/__umodsi3.o: $(src)/idiv32.S FORCE
-	$(call if_changed_rule,as_o_S)
diff --git a/arch/ia64/lib/checksum.c b/arch/ia64/lib/checksum.c
deleted file mode 100644
index d26517fe3500..000000000000
--- a/arch/ia64/lib/checksum.c
+++ /dev/null
@@ -1,102 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * Network checksum routines
- *
- * Copyright (C) 1999, 2003 Hewlett-Packard Co
- *	Stephane Eranian <eranian@hpl.hp.com>
- *
- * Most of the code coming from arch/alpha/lib/checksum.c
- *
- * This file contains network checksum routines that are better done
- * in an architecture-specific manner due to speed..
- */
-
-#include <linux/module.h>
-#include <linux/string.h>
-
-#include <asm/byteorder.h>
-
-static inline unsigned short
-from64to16 (unsigned long x)
-{
-	/* add up 32-bit words for 33 bits */
-	x = (x & 0xffffffff) + (x >> 32);
-	/* add up 16-bit and 17-bit words for 17+c bits */
-	x = (x & 0xffff) + (x >> 16);
-	/* add up 16-bit and 2-bit for 16+c bit */
-	x = (x & 0xffff) + (x >> 16);
-	/* add up carry.. */
-	x = (x & 0xffff) + (x >> 16);
-	return x;
-}
-
-/*
- * computes the checksum of the TCP/UDP pseudo-header
- * returns a 16-bit checksum, already complemented.
- */
-__sum16
-csum_tcpudp_magic(__be32 saddr, __be32 daddr, __u32 len,
-		  __u8 proto, __wsum sum)
-{
-	return (__force __sum16)~from64to16(
-		(__force u64)saddr + (__force u64)daddr +
-		(__force u64)sum + ((len + proto) << 8));
-}
-
-EXPORT_SYMBOL(csum_tcpudp_magic);
-
-__wsum
-csum_tcpudp_nofold(__be32 saddr, __be32 daddr, __u32 len,
-		   __u8 proto, __wsum sum)
-{
-	unsigned long result;
-
-	result = (__force u64)saddr + (__force u64)daddr +
-		 (__force u64)sum + ((len + proto) << 8);
-
-	/* Fold down to 32-bits so we don't lose in the typedef-less network stack.  */
-	/* 64 to 33 */
-	result = (result & 0xffffffff) + (result >> 32);
-	/* 33 to 32 */
-	result = (result & 0xffffffff) + (result >> 32);
-	return (__force __wsum)result;
-}
-EXPORT_SYMBOL(csum_tcpudp_nofold);
-
-extern unsigned long do_csum (const unsigned char *, long);
-
-/*
- * computes the checksum of a memory block at buff, length len,
- * and adds in "sum" (32-bit)
- *
- * returns a 32-bit number suitable for feeding into itself
- * or csum_tcpudp_magic
- *
- * this function must be called with even lengths, except
- * for the last fragment, which may be odd
- *
- * it's best to have buff aligned on a 32-bit boundary
- */
-__wsum csum_partial(const void *buff, int len, __wsum sum)
-{
-	u64 result = do_csum(buff, len);
-
-	/* add in old sum, and carry.. */
-	result += (__force u32)sum;
-	/* 32+c bits -> 32 bits */
-	result = (result & 0xffffffff) + (result >> 32);
-	return (__force __wsum)result;
-}
-
-EXPORT_SYMBOL(csum_partial);
-
-/*
- * this routine is used for miscellaneous IP-like checksums, mainly
- * in icmp.c
- */
-__sum16 ip_compute_csum (const void *buff, int len)
-{
-	return (__force __sum16)~do_csum(buff,len);
-}
-
-EXPORT_SYMBOL(ip_compute_csum);
diff --git a/arch/ia64/lib/clear_page.S b/arch/ia64/lib/clear_page.S
deleted file mode 100644
index ba0dd2538fa5..000000000000
--- a/arch/ia64/lib/clear_page.S
+++ /dev/null
@@ -1,79 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-/*
- * Copyright (C) 1999-2002 Hewlett-Packard Co
- *	Stephane Eranian <eranian@hpl.hp.com>
- *	David Mosberger-Tang <davidm@hpl.hp.com>
- * Copyright (C) 2002 Ken Chen <kenneth.w.chen@intel.com>
- *
- * 1/06/01 davidm	Tuned for Itanium.
- * 2/12/02 kchen	Tuned for both Itanium and McKinley
- * 3/08/02 davidm	Some more tweaking
- */
-
-#include <linux/export.h>
-#include <asm/asmmacro.h>
-#include <asm/page.h>
-
-#ifdef CONFIG_ITANIUM
-# define L3_LINE_SIZE	64	// Itanium L3 line size
-# define PREFETCH_LINES	9	// magic number
-#else
-# define L3_LINE_SIZE	128	// McKinley L3 line size
-# define PREFETCH_LINES	12	// magic number
-#endif
-
-#define saved_lc	r2
-#define dst_fetch	r3
-#define dst1		r8
-#define dst2		r9
-#define dst3		r10
-#define dst4		r11
-
-#define dst_last	r31
-
-GLOBAL_ENTRY(clear_page)
-	.prologue
-	.regstk 1,0,0,0
-	mov r16 = PAGE_SIZE/L3_LINE_SIZE-1	// main loop count, -1=repeat/until
-	.save ar.lc, saved_lc
-	mov saved_lc = ar.lc
-
-	.body
-	mov ar.lc = (PREFETCH_LINES - 1)
-	mov dst_fetch = in0
-	adds dst1 = 16, in0
-	adds dst2 = 32, in0
-	;;
-.fetch:	stf.spill.nta [dst_fetch] = f0, L3_LINE_SIZE
-	adds dst3 = 48, in0		// executing this multiple times is harmless
-	br.cloop.sptk.few .fetch
-	;;
-	addl dst_last = (PAGE_SIZE - PREFETCH_LINES*L3_LINE_SIZE), dst_fetch
-	mov ar.lc = r16			// one L3 line per iteration
-	adds dst4 = 64, in0
-	;;
-#ifdef CONFIG_ITANIUM
-	// Optimized for Itanium
-1:	stf.spill.nta [dst1] = f0, 64
-	stf.spill.nta [dst2] = f0, 64
-	cmp.lt p8,p0=dst_fetch, dst_last
-	;;
-#else
-	// Optimized for McKinley
-1:	stf.spill.nta [dst1] = f0, 64
-	stf.spill.nta [dst2] = f0, 64
-	stf.spill.nta [dst3] = f0, 64
-	stf.spill.nta [dst4] = f0, 128
-	cmp.lt p8,p0=dst_fetch, dst_last
-	;;
-	stf.spill.nta [dst1] = f0, 64
-	stf.spill.nta [dst2] = f0, 64
-#endif
-	stf.spill.nta [dst3] = f0, 64
-(p8)	stf.spill.nta [dst_fetch] = f0, L3_LINE_SIZE
-	br.cloop.sptk.few 1b
-	;;
-	mov ar.lc = saved_lc		// restore lc
-	br.ret.sptk.many rp
-END(clear_page)
-EXPORT_SYMBOL(clear_page)
diff --git a/arch/ia64/lib/clear_user.S b/arch/ia64/lib/clear_user.S
deleted file mode 100644
index 1d9e45ccf8e5..000000000000
--- a/arch/ia64/lib/clear_user.S
+++ /dev/null
@@ -1,212 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-/*
- * This routine clears to zero a linear memory buffer in user space.
- *
- * Inputs:
- *	in0:	address of buffer
- *	in1:	length of buffer in bytes
- * Outputs:
- *	r8:	number of bytes that didn't get cleared due to a fault
- *
- * Copyright (C) 1998, 1999, 2001 Hewlett-Packard Co
- *	Stephane Eranian <eranian@hpl.hp.com>
- */
-
-#include <linux/export.h>
-#include <asm/asmmacro.h>
-
-//
-// arguments
-//
-#define buf		r32
-#define len		r33
-
-//
-// local registers
-//
-#define cnt		r16
-#define buf2		r17
-#define saved_lc	r18
-#define saved_pfs	r19
-#define tmp		r20
-#define len2		r21
-#define len3		r22
-
-//
-// Theory of operations:
-//	- we check whether or not the buffer is small, i.e., less than 17
-//	  in which case we do the byte by byte loop.
-//
-//	- Otherwise we go progressively from 1 byte store to 8byte store in
-//	  the head part, the body is a 16byte store loop and we finish we the
-//	  tail for the last 15 bytes.
-//	  The good point about this breakdown is that the long buffer handling
-//	  contains only 2 branches.
-//
-//	The reason for not using shifting & masking for both the head and the
-//	tail is to stay semantically correct. This routine is not supposed
-//	to write bytes outside of the buffer. While most of the time this would
-//	be ok, we can't tolerate a mistake. A classical example is the case
-//	of multithreaded code were to the extra bytes touched is actually owned
-//	by another thread which runs concurrently to ours. Another, less likely,
-//	example is with device drivers where reading an I/O mapped location may
-//	have side effects (same thing for writing).
-//
-
-GLOBAL_ENTRY(__do_clear_user)
-	.prologue
-	.save ar.pfs, saved_pfs
-	alloc	saved_pfs=ar.pfs,2,0,0,0
-	cmp.eq p6,p0=r0,len		// check for zero length
-	.save ar.lc, saved_lc
-	mov saved_lc=ar.lc		// preserve ar.lc (slow)
-	.body
-	;;				// avoid WAW on CFM
-	adds tmp=-1,len			// br.ctop is repeat/until
-	mov ret0=len			// return value is length at this point
-(p6)	br.ret.spnt.many rp
-	;;
-	cmp.lt p6,p0=16,len		// if len > 16 then long memset
-	mov ar.lc=tmp			// initialize lc for small count
-(p6)	br.cond.dptk .long_do_clear
-	;;				// WAR on ar.lc
-	//
-	// worst case 16 iterations, avg 8 iterations
-	//
-	// We could have played with the predicates to use the extra
-	// M slot for 2 stores/iteration but the cost the initialization
-	// the various counters compared to how long the loop is supposed
-	// to last on average does not make this solution viable.
-	//
-1:
-	EX( .Lexit1, st1 [buf]=r0,1 )
-	adds len=-1,len			// countdown length using len
-	br.cloop.dptk 1b
-	;;				// avoid RAW on ar.lc
-	//
-	// .Lexit4: comes from byte by byte loop
-	//	    len contains bytes left
-.Lexit1:
-	mov ret0=len			// faster than using ar.lc
-	mov ar.lc=saved_lc
-	br.ret.sptk.many rp		// end of short clear_user
-
-
-	//
-	// At this point we know we have more than 16 bytes to copy
-	// so we focus on alignment (no branches required)
-	//
-	// The use of len/len2 for countdown of the number of bytes left
-	// instead of ret0 is due to the fact that the exception code
-	// changes the values of r8.
-	//
-.long_do_clear:
-	tbit.nz p6,p0=buf,0		// odd alignment (for long_do_clear)
-	;;
-	EX( .Lexit3, (p6) st1 [buf]=r0,1 )	// 1-byte aligned
-(p6)	adds len=-1,len;;		// sync because buf is modified
-	tbit.nz p6,p0=buf,1
-	;;
-	EX( .Lexit3, (p6) st2 [buf]=r0,2 )	// 2-byte aligned
-(p6)	adds len=-2,len;;
-	tbit.nz p6,p0=buf,2
-	;;
-	EX( .Lexit3, (p6) st4 [buf]=r0,4 )	// 4-byte aligned
-(p6)	adds len=-4,len;;
-	tbit.nz p6,p0=buf,3
-	;;
-	EX( .Lexit3, (p6) st8 [buf]=r0,8 )	// 8-byte aligned
-(p6)	adds len=-8,len;;
-	shr.u cnt=len,4		// number of 128-bit (2x64bit) words
-	;;
-	cmp.eq p6,p0=r0,cnt
-	adds tmp=-1,cnt
-(p6)	br.cond.dpnt .dotail		// we have less than 16 bytes left
-	;;
-	adds buf2=8,buf			// setup second base pointer
-	mov ar.lc=tmp
-	;;
-
-	//
-	// 16bytes/iteration core loop
-	//
-	// The second store can never generate a fault because
-	// we come into the loop only when we are 16-byte aligned.
-	// This means that if we cross a page then it will always be
-	// in the first store and never in the second.
-	//
-	//
-	// We need to keep track of the remaining length. A possible (optimistic)
-	// way would be to use ar.lc and derive how many byte were left by
-	// doing : left= 16*ar.lc + 16.  this would avoid the addition at
-	// every iteration.
-	// However we need to keep the synchronization point. A template
-	// M;;MB does not exist and thus we can keep the addition at no
-	// extra cycle cost (use a nop slot anyway). It also simplifies the
-	// (unlikely)  error recovery code
-	//
-
-2:	EX(.Lexit3, st8 [buf]=r0,16 )
-	;;				// needed to get len correct when error
-	st8 [buf2]=r0,16
-	adds len=-16,len
-	br.cloop.dptk 2b
-	;;
-	mov ar.lc=saved_lc
-	//
-	// tail correction based on len only
-	//
-	// We alternate the use of len3,len2 to allow parallelism and correct
-	// error handling. We also reuse p6/p7 to return correct value.
-	// The addition of len2/len3 does not cost anything more compared to
-	// the regular memset as we had empty slots.
-	//
-.dotail:
-	mov len2=len			// for parallelization of error handling
-	mov len3=len
-	tbit.nz p6,p0=len,3
-	;;
-	EX( .Lexit2, (p6) st8 [buf]=r0,8 )	// at least 8 bytes
-(p6)	adds len3=-8,len2
-	tbit.nz p7,p6=len,2
-	;;
-	EX( .Lexit2, (p7) st4 [buf]=r0,4 )	// at least 4 bytes
-(p7)	adds len2=-4,len3
-	tbit.nz p6,p7=len,1
-	;;
-	EX( .Lexit2, (p6) st2 [buf]=r0,2 )	// at least 2 bytes
-(p6)	adds len3=-2,len2
-	tbit.nz p7,p6=len,0
-	;;
-	EX( .Lexit2, (p7) st1 [buf]=r0 )	// only 1 byte left
-	mov ret0=r0				// success
-	br.ret.sptk.many rp			// end of most likely path
-
-	//
-	// Outlined error handling code
-	//
-
-	//
-	// .Lexit3: comes from core loop, need restore pr/lc
-	//	    len contains bytes left
-	//
-	//
-	// .Lexit2:
-	//	if p6 -> coming from st8 or st2 : len2 contains what's left
-	//	if p7 -> coming from st4 or st1 : len3 contains what's left
-	// We must restore lc/pr even though might not have been used.
-.Lexit2:
-	.pred.rel "mutex", p6, p7
-(p6)	mov len=len2
-(p7)	mov len=len3
-	;;
-	//
-	// .Lexit4: comes from head, need not restore pr/lc
-	//	    len contains bytes left
-	//
-.Lexit3:
-	mov ret0=len
-	mov ar.lc=saved_lc
-	br.ret.sptk.many rp
-END(__do_clear_user)
-EXPORT_SYMBOL(__do_clear_user)
diff --git a/arch/ia64/lib/copy_page.S b/arch/ia64/lib/copy_page.S
deleted file mode 100644
index c0a0e6b2af00..000000000000
--- a/arch/ia64/lib/copy_page.S
+++ /dev/null
@@ -1,101 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-/*
- *
- * Optimized version of the standard copy_page() function
- *
- * Inputs:
- *	in0:	address of target page
- *	in1:	address of source page
- * Output:
- *	no return value
- *
- * Copyright (C) 1999, 2001 Hewlett-Packard Co
- *	Stephane Eranian <eranian@hpl.hp.com>
- *	David Mosberger <davidm@hpl.hp.com>
- *
- * 4/06/01 davidm	Tuned to make it perform well both for cached and uncached copies.
- */
-#include <linux/export.h>
-#include <asm/asmmacro.h>
-#include <asm/page.h>
-
-#define PIPE_DEPTH	3
-#define EPI		p[PIPE_DEPTH-1]
-
-#define lcount		r16
-#define saved_pr	r17
-#define saved_lc	r18
-#define saved_pfs	r19
-#define src1		r20
-#define src2		r21
-#define tgt1		r22
-#define tgt2		r23
-#define srcf		r24
-#define tgtf		r25
-#define tgt_last	r26
-
-#define Nrot		((8*PIPE_DEPTH+7)&~7)
-
-GLOBAL_ENTRY(copy_page)
-	.prologue
-	.save ar.pfs, saved_pfs
-	alloc saved_pfs=ar.pfs,3,Nrot-3,0,Nrot
-
-	.rotr t1[PIPE_DEPTH], t2[PIPE_DEPTH], t3[PIPE_DEPTH], t4[PIPE_DEPTH], \
-	      t5[PIPE_DEPTH], t6[PIPE_DEPTH], t7[PIPE_DEPTH], t8[PIPE_DEPTH]
-	.rotp p[PIPE_DEPTH]
-
-	.save ar.lc, saved_lc
-	mov saved_lc=ar.lc
-	mov ar.ec=PIPE_DEPTH
-
-	mov lcount=PAGE_SIZE/64-1
-	.save pr, saved_pr
-	mov saved_pr=pr
-	mov pr.rot=1<<16
-
-	.body
-
-	mov src1=in1
-	adds src2=8,in1
-	mov tgt_last = PAGE_SIZE
-	;;
-	adds tgt2=8,in0
-	add srcf=512,in1
-	mov ar.lc=lcount
-	mov tgt1=in0
-	add tgtf=512,in0
-	add tgt_last = tgt_last, in0
-	;;
-1:
-(p[0])	ld8 t1[0]=[src1],16
-(EPI)	st8 [tgt1]=t1[PIPE_DEPTH-1],16
-(p[0])	ld8 t2[0]=[src2],16
-(EPI)	st8 [tgt2]=t2[PIPE_DEPTH-1],16
-	cmp.ltu p6,p0 = tgtf, tgt_last
-	;;
-(p[0])	ld8 t3[0]=[src1],16
-(EPI)	st8 [tgt1]=t3[PIPE_DEPTH-1],16
-(p[0])	ld8 t4[0]=[src2],16
-(EPI)	st8 [tgt2]=t4[PIPE_DEPTH-1],16
-	;;
-(p[0])	ld8 t5[0]=[src1],16
-(EPI)	st8 [tgt1]=t5[PIPE_DEPTH-1],16
-(p[0])	ld8 t6[0]=[src2],16
-(EPI)	st8 [tgt2]=t6[PIPE_DEPTH-1],16
-	;;
-(p[0])	ld8 t7[0]=[src1],16
-(EPI)	st8 [tgt1]=t7[PIPE_DEPTH-1],16
-(p[0])	ld8 t8[0]=[src2],16
-(EPI)	st8 [tgt2]=t8[PIPE_DEPTH-1],16
-
-(p6)	lfetch [srcf], 64
-(p6)	lfetch [tgtf], 64
-	br.ctop.sptk.few 1b
-	;;
-	mov pr=saved_pr,0xffffffffffff0000	// restore predicates
-	mov ar.pfs=saved_pfs
-	mov ar.lc=saved_lc
-	br.ret.sptk.many rp
-END(copy_page)
-EXPORT_SYMBOL(copy_page)
diff --git a/arch/ia64/lib/copy_page_mck.S b/arch/ia64/lib/copy_page_mck.S
deleted file mode 100644
index 5e8bb4b4b535..000000000000
--- a/arch/ia64/lib/copy_page_mck.S
+++ /dev/null
@@ -1,188 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-/*
- * McKinley-optimized version of copy_page().
- *
- * Copyright (C) 2002 Hewlett-Packard Co
- *	David Mosberger <davidm@hpl.hp.com>
- *
- * Inputs:
- *	in0:	address of target page
- *	in1:	address of source page
- * Output:
- *	no return value
- *
- * General idea:
- *	- use regular loads and stores to prefetch data to avoid consuming M-slot just for
- *	  lfetches => good for in-cache performance
- *	- avoid l2 bank-conflicts by not storing into the same 16-byte bank within a single
- *	  cycle
- *
- * Principle of operation:
- *	First, note that L1 has a line-size of 64 bytes and L2 a line-size of 128 bytes.
- *	To avoid secondary misses in L2, we prefetch both source and destination with a line-size
- *	of 128 bytes.  When both of these lines are in the L2 and the first half of the
- *	source line is in L1, we start copying the remaining words.  The second half of the
- *	source line is prefetched in an earlier iteration, so that by the time we start
- *	accessing it, it's also present in the L1.
- *
- *	We use a software-pipelined loop to control the overall operation.  The pipeline
- *	has 2*PREFETCH_DIST+K stages.  The first PREFETCH_DIST stages are used for prefetching
- *	source cache-lines.  The second PREFETCH_DIST stages are used for prefetching destination
- *	cache-lines, the last K stages are used to copy the cache-line words not copied by
- *	the prefetches.  The four relevant points in the pipelined are called A, B, C, D:
- *	p[A] is TRUE if a source-line should be prefetched, p[B] is TRUE if a destination-line
- *	should be prefetched, p[C] is TRUE if the second half of an L2 line should be brought
- *	into L1D and p[D] is TRUE if a cacheline needs to be copied.
- *
- *	This all sounds very complicated, but thanks to the modulo-scheduled loop support,
- *	the resulting code is very regular and quite easy to follow (once you get the idea).
- *
- *	As a secondary optimization, the first 2*PREFETCH_DIST iterations are implemented
- *	as the separate .prefetch_loop.  Logically, this loop performs exactly like the
- *	main-loop (.line_copy), but has all known-to-be-predicated-off instructions removed,
- *	so that each loop iteration is faster (again, good for cached case).
- *
- *	When reading the code, it helps to keep the following picture in mind:
- *
- *	       word 0 word 1
- *            +------+------+---
- *	      |	v[x] | 	t1  | ^
- *	      |	t2   |	t3  | |
- *	      |	t4   |	t5  | |
- *	      |	t6   |	t7  | | 128 bytes
- *     	      |	n[y] | 	t9  | |	(L2 cache line)
- *	      |	t10  | 	t11 | |
- *	      |	t12  | 	t13 | |
- *	      |	t14  | 	t15 | v
- *	      +------+------+---
- *
- *	Here, v[x] is copied by the (memory) prefetch.  n[y] is loaded at p[C]
- *	to fetch the second-half of the L2 cache line into L1, and the tX words are copied in
- *	an order that avoids bank conflicts.
- */
-#include <linux/export.h>
-#include <asm/asmmacro.h>
-#include <asm/page.h>
-
-#define PREFETCH_DIST	8		// McKinley sustains 16 outstanding L2 misses (8 ld, 8 st)
-
-#define src0		r2
-#define src1		r3
-#define dst0		r9
-#define dst1		r10
-#define src_pre_mem	r11
-#define dst_pre_mem	r14
-#define src_pre_l2	r15
-#define dst_pre_l2	r16
-#define t1		r17
-#define t2		r18
-#define t3		r19
-#define t4		r20
-#define t5		t1	// alias!
-#define t6		t2	// alias!
-#define t7		t3	// alias!
-#define t9		t5	// alias!
-#define t10		t4	// alias!
-#define t11		t7	// alias!
-#define t12		t6	// alias!
-#define t14		t10	// alias!
-#define t13		r21
-#define t15		r22
-
-#define saved_lc	r23
-#define saved_pr	r24
-
-#define	A	0
-#define B	(PREFETCH_DIST)
-#define C	(B + PREFETCH_DIST)
-#define D	(C + 3)
-#define N	(D + 1)
-#define Nrot	((N + 7) & ~7)
-
-GLOBAL_ENTRY(copy_page)
-	.prologue
-	alloc r8 = ar.pfs, 2, Nrot-2, 0, Nrot
-
-	.rotr v[2*PREFETCH_DIST], n[D-C+1]
-	.rotp p[N]
-
-	.save ar.lc, saved_lc
-	mov saved_lc = ar.lc
-	.save pr, saved_pr
-	mov saved_pr = pr
-	.body
-
-	mov src_pre_mem = in1
-	mov pr.rot = 0x10000
-	mov ar.ec = 1				// special unrolled loop
-
-	mov dst_pre_mem = in0
-	mov ar.lc = 2*PREFETCH_DIST - 1
-
-	add src_pre_l2 = 8*8, in1
-	add dst_pre_l2 = 8*8, in0
-	add src0 = 8, in1			// first t1 src
-	add src1 = 3*8, in1			// first t3 src
-	add dst0 = 8, in0			// first t1 dst
-	add dst1 = 3*8, in0			// first t3 dst
-	mov t1 = (PAGE_SIZE/128) - (2*PREFETCH_DIST) - 1
-	nop.m 0
-	nop.i 0
-	;;
-	// same as .line_copy loop, but with all predicated-off instructions removed:
-.prefetch_loop:
-(p[A])	ld8 v[A] = [src_pre_mem], 128		// M0
-(p[B])	st8 [dst_pre_mem] = v[B], 128		// M2
-	br.ctop.sptk .prefetch_loop
-	;;
-	cmp.eq p16, p0 = r0, r0			// reset p16 to 1 (br.ctop cleared it to zero)
-	mov ar.lc = t1				// with 64KB pages, t1 is too big to fit in 8 bits!
-	mov ar.ec = N				// # of stages in pipeline
-	;;
-.line_copy:
-(p[D])	ld8 t2 = [src0], 3*8			// M0
-(p[D])	ld8 t4 = [src1], 3*8			// M1
-(p[B])	st8 [dst_pre_mem] = v[B], 128		// M2 prefetch dst from memory
-(p[D])	st8 [dst_pre_l2] = n[D-C], 128		// M3 prefetch dst from L2
-	;;
-(p[A])	ld8 v[A] = [src_pre_mem], 128		// M0 prefetch src from memory
-(p[C])	ld8 n[0] = [src_pre_l2], 128		// M1 prefetch src from L2
-(p[D])	st8 [dst0] =  t1, 8			// M2
-(p[D])	st8 [dst1] =  t3, 8			// M3
-	;;
-(p[D])	ld8  t5 = [src0], 8
-(p[D])	ld8  t7 = [src1], 3*8
-(p[D])	st8 [dst0] =  t2, 3*8
-(p[D])	st8 [dst1] =  t4, 3*8
-	;;
-(p[D])	ld8  t6 = [src0], 3*8
-(p[D])	ld8 t10 = [src1], 8
-(p[D])	st8 [dst0] =  t5, 8
-(p[D])	st8 [dst1] =  t7, 3*8
-	;;
-(p[D])	ld8  t9 = [src0], 3*8
-(p[D])	ld8 t11 = [src1], 3*8
-(p[D])	st8 [dst0] =  t6, 3*8
-(p[D])	st8 [dst1] = t10, 8
-	;;
-(p[D])	ld8 t12 = [src0], 8
-(p[D])	ld8 t14 = [src1], 8
-(p[D])	st8 [dst0] =  t9, 3*8
-(p[D])	st8 [dst1] = t11, 3*8
-	;;
-(p[D])	ld8 t13 = [src0], 4*8
-(p[D])	ld8 t15 = [src1], 4*8
-(p[D])	st8 [dst0] = t12, 8
-(p[D])	st8 [dst1] = t14, 8
-	;;
-(p[D-1])ld8  t1 = [src0], 8
-(p[D-1])ld8  t3 = [src1], 8
-(p[D])	st8 [dst0] = t13, 4*8
-(p[D])	st8 [dst1] = t15, 4*8
-	br.ctop.sptk .line_copy
-	;;
-	mov ar.lc = saved_lc
-	mov pr = saved_pr, -1
-	br.ret.sptk.many rp
-END(copy_page)
-EXPORT_SYMBOL(copy_page)
diff --git a/arch/ia64/lib/copy_user.S b/arch/ia64/lib/copy_user.S
deleted file mode 100644
index 8daab72cfe77..000000000000
--- a/arch/ia64/lib/copy_user.S
+++ /dev/null
@@ -1,613 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-/*
- *
- * Optimized version of the copy_user() routine.
- * It is used to copy date across the kernel/user boundary.
- *
- * The source and destination are always on opposite side of
- * the boundary. When reading from user space we must catch
- * faults on loads. When writing to user space we must catch
- * errors on stores. Note that because of the nature of the copy
- * we don't need to worry about overlapping regions.
- *
- *
- * Inputs:
- *	in0	address of source buffer
- *	in1	address of destination buffer
- *	in2	number of bytes to copy
- *
- * Outputs:
- *	ret0	0 in case of success. The number of bytes NOT copied in
- *		case of error.
- *
- * Copyright (C) 2000-2001 Hewlett-Packard Co
- *	Stephane Eranian <eranian@hpl.hp.com>
- *
- * Fixme:
- *	- handle the case where we have more than 16 bytes and the alignment
- *	  are different.
- *	- more benchmarking
- *	- fix extraneous stop bit introduced by the EX() macro.
- */
-
-#include <linux/export.h>
-#include <asm/asmmacro.h>
-
-//
-// Tuneable parameters
-//
-#define COPY_BREAK	16	// we do byte copy below (must be >=16)
-#define PIPE_DEPTH	21	// pipe depth
-
-#define EPI		p[PIPE_DEPTH-1]
-
-//
-// arguments
-//
-#define dst		in0
-#define src		in1
-#define len		in2
-
-//
-// local registers
-//
-#define t1		r2	// rshift in bytes
-#define t2		r3	// lshift in bytes
-#define rshift		r14	// right shift in bits
-#define lshift		r15	// left shift in bits
-#define word1		r16
-#define word2		r17
-#define cnt		r18
-#define len2		r19
-#define saved_lc	r20
-#define saved_pr	r21
-#define tmp		r22
-#define val		r23
-#define src1		r24
-#define dst1		r25
-#define src2		r26
-#define dst2		r27
-#define len1		r28
-#define enddst		r29
-#define endsrc		r30
-#define saved_pfs	r31
-
-GLOBAL_ENTRY(__copy_user)
-	.prologue
-	.save ar.pfs, saved_pfs
-	alloc saved_pfs=ar.pfs,3,((2*PIPE_DEPTH+7)&~7),0,((2*PIPE_DEPTH+7)&~7)
-
-	.rotr val1[PIPE_DEPTH],val2[PIPE_DEPTH]
-	.rotp p[PIPE_DEPTH]
-
-	adds len2=-1,len	// br.ctop is repeat/until
-	mov ret0=r0
-
-	;;			// RAW of cfm when len=0
-	cmp.eq p8,p0=r0,len	// check for zero length
-	.save ar.lc, saved_lc
-	mov saved_lc=ar.lc	// preserve ar.lc (slow)
-(p8)	br.ret.spnt.many rp	// empty mempcy()
-	;;
-	add enddst=dst,len	// first byte after end of source
-	add endsrc=src,len	// first byte after end of destination
-	.save pr, saved_pr
-	mov saved_pr=pr		// preserve predicates
-
-	.body
-
-	mov dst1=dst		// copy because of rotation
-	mov ar.ec=PIPE_DEPTH
-	mov pr.rot=1<<16	// p16=true all others are false
-
-	mov src1=src		// copy because of rotation
-	mov ar.lc=len2		// initialize lc for small count
-	cmp.lt p10,p7=COPY_BREAK,len	// if len > COPY_BREAK then long copy
-
-	xor tmp=src,dst		// same alignment test prepare
-(p10)	br.cond.dptk .long_copy_user
-	;;			// RAW pr.rot/p16 ?
-	//
-	// Now we do the byte by byte loop with software pipeline
-	//
-	// p7 is necessarily false by now
-1:
-	EX(.failure_in_pipe1,(p16) ld1 val1[0]=[src1],1)
-	EX(.failure_out,(EPI) st1 [dst1]=val1[PIPE_DEPTH-1],1)
-	br.ctop.dptk.few 1b
-	;;
-	mov ar.lc=saved_lc
-	mov pr=saved_pr,0xffffffffffff0000
-	mov ar.pfs=saved_pfs		// restore ar.ec
-	br.ret.sptk.many rp		// end of short memcpy
-
-	//
-	// Not 8-byte aligned
-	//
-.diff_align_copy_user:
-	// At this point we know we have more than 16 bytes to copy
-	// and also that src and dest do _not_ have the same alignment.
-	and src2=0x7,src1				// src offset
-	and dst2=0x7,dst1				// dst offset
-	;;
-	// The basic idea is that we copy byte-by-byte at the head so
-	// that we can reach 8-byte alignment for both src1 and dst1.
-	// Then copy the body using software pipelined 8-byte copy,
-	// shifting the two back-to-back words right and left, then copy
-	// the tail by copying byte-by-byte.
-	//
-	// Fault handling. If the byte-by-byte at the head fails on the
-	// load, then restart and finish the pipleline by copying zeros
-	// to the dst1. Then copy zeros for the rest of dst1.
-	// If 8-byte software pipeline fails on the load, do the same as
-	// failure_in3 does. If the byte-by-byte at the tail fails, it is
-	// handled simply by failure_in_pipe1.
-	//
-	// The case p14 represents the source has more bytes in the
-	// the first word (by the shifted part), whereas the p15 needs to
-	// copy some bytes from the 2nd word of the source that has the
-	// tail of the 1st of the destination.
-	//
-
-	//
-	// Optimization. If dst1 is 8-byte aligned (quite common), we don't need
-	// to copy the head to dst1, to start 8-byte copy software pipeline.
-	// We know src1 is not 8-byte aligned in this case.
-	//
-	cmp.eq p14,p15=r0,dst2
-(p15)	br.cond.spnt 1f
-	;;
-	sub t1=8,src2
-	mov t2=src2
-	;;
-	shl rshift=t2,3
-	sub len1=len,t1					// set len1
-	;;
-	sub lshift=64,rshift
-	;;
-	br.cond.spnt .word_copy_user
-	;;
-1:
-	cmp.leu	p14,p15=src2,dst2
-	sub t1=dst2,src2
-	;;
-	.pred.rel "mutex", p14, p15
-(p14)	sub word1=8,src2				// (8 - src offset)
-(p15)	sub t1=r0,t1					// absolute value
-(p15)	sub word1=8,dst2				// (8 - dst offset)
-	;;
-	// For the case p14, we don't need to copy the shifted part to
-	// the 1st word of destination.
-	sub t2=8,t1
-(p14)	sub word1=word1,t1
-	;;
-	sub len1=len,word1				// resulting len
-(p15)	shl rshift=t1,3					// in bits
-(p14)	shl rshift=t2,3
-	;;
-(p14)	sub len1=len1,t1
-	adds cnt=-1,word1
-	;;
-	sub lshift=64,rshift
-	mov ar.ec=PIPE_DEPTH
-	mov pr.rot=1<<16	// p16=true all others are false
-	mov ar.lc=cnt
-	;;
-2:
-	EX(.failure_in_pipe2,(p16) ld1 val1[0]=[src1],1)
-	EX(.failure_out,(EPI) st1 [dst1]=val1[PIPE_DEPTH-1],1)
-	br.ctop.dptk.few 2b
-	;;
-	clrrrb
-	;;
-.word_copy_user:
-	cmp.gtu p9,p0=16,len1
-(p9)	br.cond.spnt 4f			// if (16 > len1) skip 8-byte copy
-	;;
-	shr.u cnt=len1,3		// number of 64-bit words
-	;;
-	adds cnt=-1,cnt
-	;;
-	.pred.rel "mutex", p14, p15
-(p14)	sub src1=src1,t2
-(p15)	sub src1=src1,t1
-	//
-	// Now both src1 and dst1 point to an 8-byte aligned address. And
-	// we have more than 8 bytes to copy.
-	//
-	mov ar.lc=cnt
-	mov ar.ec=PIPE_DEPTH
-	mov pr.rot=1<<16	// p16=true all others are false
-	;;
-3:
-	//
-	// The pipleline consists of 3 stages:
-	// 1 (p16):	Load a word from src1
-	// 2 (EPI_1):	Shift right pair, saving to tmp
-	// 3 (EPI):	Store tmp to dst1
-	//
-	// To make it simple, use at least 2 (p16) loops to set up val1[n]
-	// because we need 2 back-to-back val1[] to get tmp.
-	// Note that this implies EPI_2 must be p18 or greater.
-	//
-
-#define EPI_1		p[PIPE_DEPTH-2]
-#define SWITCH(pred, shift)	cmp.eq pred,p0=shift,rshift
-#define CASE(pred, shift)	\
-	(pred)	br.cond.spnt .copy_user_bit##shift
-#define BODY(rshift)						\
-.copy_user_bit##rshift:						\
-1:								\
-	EX(.failure_out,(EPI) st8 [dst1]=tmp,8);		\
-(EPI_1) shrp tmp=val1[PIPE_DEPTH-2],val1[PIPE_DEPTH-1],rshift;	\
-	EX(3f,(p16) ld8 val1[1]=[src1],8);			\
-(p16)	mov val1[0]=r0;						\
-	br.ctop.dptk 1b;					\
-	;;							\
-	br.cond.sptk.many .diff_align_do_tail;			\
-2:								\
-(EPI)	st8 [dst1]=tmp,8;					\
-(EPI_1)	shrp tmp=val1[PIPE_DEPTH-2],val1[PIPE_DEPTH-1],rshift;	\
-3:								\
-(p16)	mov val1[1]=r0;						\
-(p16)	mov val1[0]=r0;						\
-	br.ctop.dptk 2b;					\
-	;;							\
-	br.cond.sptk.many .failure_in2
-
-	//
-	// Since the instruction 'shrp' requires a fixed 128-bit value
-	// specifying the bits to shift, we need to provide 7 cases
-	// below.
-	//
-	SWITCH(p6, 8)
-	SWITCH(p7, 16)
-	SWITCH(p8, 24)
-	SWITCH(p9, 32)
-	SWITCH(p10, 40)
-	SWITCH(p11, 48)
-	SWITCH(p12, 56)
-	;;
-	CASE(p6, 8)
-	CASE(p7, 16)
-	CASE(p8, 24)
-	CASE(p9, 32)
-	CASE(p10, 40)
-	CASE(p11, 48)
-	CASE(p12, 56)
-	;;
-	BODY(8)
-	BODY(16)
-	BODY(24)
-	BODY(32)
-	BODY(40)
-	BODY(48)
-	BODY(56)
-	;;
-.diff_align_do_tail:
-	.pred.rel "mutex", p14, p15
-(p14)	sub src1=src1,t1
-(p14)	adds dst1=-8,dst1
-(p15)	sub dst1=dst1,t1
-	;;
-4:
-	// Tail correction.
-	//
-	// The problem with this piplelined loop is that the last word is not
-	// loaded and thus parf of the last word written is not correct.
-	// To fix that, we simply copy the tail byte by byte.
-
-	sub len1=endsrc,src1,1
-	clrrrb
-	;;
-	mov ar.ec=PIPE_DEPTH
-	mov pr.rot=1<<16	// p16=true all others are false
-	mov ar.lc=len1
-	;;
-5:
-	EX(.failure_in_pipe1,(p16) ld1 val1[0]=[src1],1)
-	EX(.failure_out,(EPI) st1 [dst1]=val1[PIPE_DEPTH-1],1)
-	br.ctop.dptk.few 5b
-	;;
-	mov ar.lc=saved_lc
-	mov pr=saved_pr,0xffffffffffff0000
-	mov ar.pfs=saved_pfs
-	br.ret.sptk.many rp
-
-	//
-	// Beginning of long mempcy (i.e. > 16 bytes)
-	//
-.long_copy_user:
-	tbit.nz p6,p7=src1,0	// odd alignment
-	and tmp=7,tmp
-	;;
-	cmp.eq p10,p8=r0,tmp
-	mov len1=len		// copy because of rotation
-(p8)	br.cond.dpnt .diff_align_copy_user
-	;;
-	// At this point we know we have more than 16 bytes to copy
-	// and also that both src and dest have the same alignment
-	// which may not be the one we want. So for now we must move
-	// forward slowly until we reach 16byte alignment: no need to
-	// worry about reaching the end of buffer.
-	//
-	EX(.failure_in1,(p6) ld1 val1[0]=[src1],1)	// 1-byte aligned
-(p6)	adds len1=-1,len1;;
-	tbit.nz p7,p0=src1,1
-	;;
-	EX(.failure_in1,(p7) ld2 val1[1]=[src1],2)	// 2-byte aligned
-(p7)	adds len1=-2,len1;;
-	tbit.nz p8,p0=src1,2
-	;;
-	//
-	// Stop bit not required after ld4 because if we fail on ld4
-	// we have never executed the ld1, therefore st1 is not executed.
-	//
-	EX(.failure_in1,(p8) ld4 val2[0]=[src1],4)	// 4-byte aligned
-	;;
-	EX(.failure_out,(p6) st1 [dst1]=val1[0],1)
-	tbit.nz p9,p0=src1,3
-	;;
-	//
-	// Stop bit not required after ld8 because if we fail on ld8
-	// we have never executed the ld2, therefore st2 is not executed.
-	//
-	EX(.failure_in1,(p9) ld8 val2[1]=[src1],8)	// 8-byte aligned
-	EX(.failure_out,(p7) st2 [dst1]=val1[1],2)
-(p8)	adds len1=-4,len1
-	;;
-	EX(.failure_out, (p8) st4 [dst1]=val2[0],4)
-(p9)	adds len1=-8,len1;;
-	shr.u cnt=len1,4		// number of 128-bit (2x64bit) words
-	;;
-	EX(.failure_out, (p9) st8 [dst1]=val2[1],8)
-	tbit.nz p6,p0=len1,3
-	cmp.eq p7,p0=r0,cnt
-	adds tmp=-1,cnt			// br.ctop is repeat/until
-(p7)	br.cond.dpnt .dotail		// we have less than 16 bytes left
-	;;
-	adds src2=8,src1
-	adds dst2=8,dst1
-	mov ar.lc=tmp
-	;;
-	//
-	// 16bytes/iteration
-	//
-2:
-	EX(.failure_in3,(p16) ld8 val1[0]=[src1],16)
-(p16)	ld8 val2[0]=[src2],16
-
-	EX(.failure_out, (EPI)	st8 [dst1]=val1[PIPE_DEPTH-1],16)
-(EPI)	st8 [dst2]=val2[PIPE_DEPTH-1],16
-	br.ctop.dptk 2b
-	;;			// RAW on src1 when fall through from loop
-	//
-	// Tail correction based on len only
-	//
-	// No matter where we come from (loop or test) the src1 pointer
-	// is 16 byte aligned AND we have less than 16 bytes to copy.
-	//
-.dotail:
-	EX(.failure_in1,(p6) ld8 val1[0]=[src1],8)	// at least 8 bytes
-	tbit.nz p7,p0=len1,2
-	;;
-	EX(.failure_in1,(p7) ld4 val1[1]=[src1],4)	// at least 4 bytes
-	tbit.nz p8,p0=len1,1
-	;;
-	EX(.failure_in1,(p8) ld2 val2[0]=[src1],2)	// at least 2 bytes
-	tbit.nz p9,p0=len1,0
-	;;
-	EX(.failure_out, (p6) st8 [dst1]=val1[0],8)
-	;;
-	EX(.failure_in1,(p9) ld1 val2[1]=[src1])	// only 1 byte left
-	mov ar.lc=saved_lc
-	;;
-	EX(.failure_out,(p7) st4 [dst1]=val1[1],4)
-	mov pr=saved_pr,0xffffffffffff0000
-	;;
-	EX(.failure_out, (p8)	st2 [dst1]=val2[0],2)
-	mov ar.pfs=saved_pfs
-	;;
-	EX(.failure_out, (p9)	st1 [dst1]=val2[1])
-	br.ret.sptk.many rp
-
-
-	//
-	// Here we handle the case where the byte by byte copy fails
-	// on the load.
-	// Several factors make the zeroing of the rest of the buffer kind of
-	// tricky:
-	//	- the pipeline: loads/stores are not in sync (pipeline)
-	//
-	//	  In the same loop iteration, the dst1 pointer does not directly
-	//	  reflect where the faulty load was.
-	//
-	//	- pipeline effect
-	//	  When you get a fault on load, you may have valid data from
-	//	  previous loads not yet store in transit. Such data must be
-	//	  store normally before moving onto zeroing the rest.
-	//
-	//	- single/multi dispersal independence.
-	//
-	// solution:
-	//	- we don't disrupt the pipeline, i.e. data in transit in
-	//	  the software pipeline will be eventually move to memory.
-	//	  We simply replace the load with a simple mov and keep the
-	//	  pipeline going. We can't really do this inline because
-	//	  p16 is always reset to 1 when lc > 0.
-	//
-.failure_in_pipe1:
-	sub ret0=endsrc,src1	// number of bytes to zero, i.e. not copied
-1:
-(p16)	mov val1[0]=r0
-(EPI)	st1 [dst1]=val1[PIPE_DEPTH-1],1
-	br.ctop.dptk 1b
-	;;
-	mov pr=saved_pr,0xffffffffffff0000
-	mov ar.lc=saved_lc
-	mov ar.pfs=saved_pfs
-	br.ret.sptk.many rp
-
-	//
-	// This is the case where the byte by byte copy fails on the load
-	// when we copy the head. We need to finish the pipeline and copy
-	// zeros for the rest of the destination. Since this happens
-	// at the top we still need to fill the body and tail.
-.failure_in_pipe2:
-	sub ret0=endsrc,src1	// number of bytes to zero, i.e. not copied
-2:
-(p16)	mov val1[0]=r0
-(EPI)	st1 [dst1]=val1[PIPE_DEPTH-1],1
-	br.ctop.dptk 2b
-	;;
-	sub len=enddst,dst1,1		// precompute len
-	br.cond.dptk.many .failure_in1bis
-	;;
-
-	//
-	// Here we handle the head & tail part when we check for alignment.
-	// The following code handles only the load failures. The
-	// main diffculty comes from the fact that loads/stores are
-	// scheduled. So when you fail on a load, the stores corresponding
-	// to previous successful loads must be executed.
-	//
-	// However some simplifications are possible given the way
-	// things work.
-	//
-	// 1) HEAD
-	// Theory of operation:
-	//
-	//  Page A   | Page B
-	//  ---------|-----
-	//          1|8 x
-	//	  1 2|8 x
-	//	    4|8 x
-	//	  1 4|8 x
-	//        2 4|8 x
-	//      1 2 4|8 x
-	//	     |1
-	//	     |2 x
-	//	     |4 x
-	//
-	// page_size >= 4k (2^12).  (x means 4, 2, 1)
-	// Here we suppose Page A exists and Page B does not.
-	//
-	// As we move towards eight byte alignment we may encounter faults.
-	// The numbers on each page show the size of the load (current alignment).
-	//
-	// Key point:
-	//	- if you fail on 1, 2, 4 then you have never executed any smaller
-	//	  size loads, e.g. failing ld4 means no ld1 nor ld2 executed
-	//	  before.
-	//
-	// This allows us to simplify the cleanup code, because basically you
-	// only have to worry about "pending" stores in the case of a failing
-	// ld8(). Given the way the code is written today, this means only
-	// worry about st2, st4. There we can use the information encapsulated
-	// into the predicates.
-	//
-	// Other key point:
-	//	- if you fail on the ld8 in the head, it means you went straight
-	//	  to it, i.e. 8byte alignment within an unexisting page.
-	// Again this comes from the fact that if you crossed just for the ld8 then
-	// you are 8byte aligned but also 16byte align, therefore you would
-	// either go for the 16byte copy loop OR the ld8 in the tail part.
-	// The combination ld1, ld2, ld4, ld8 where you fail on ld8 is impossible
-	// because it would mean you had 15bytes to copy in which case you
-	// would have defaulted to the byte by byte copy.
-	//
-	//
-	// 2) TAIL
-	// Here we now we have less than 16 bytes AND we are either 8 or 16 byte
-	// aligned.
-	//
-	// Key point:
-	// This means that we either:
-	//		- are right on a page boundary
-	//	OR
-	//		- are at more than 16 bytes from a page boundary with
-	//		  at most 15 bytes to copy: no chance of crossing.
-	//
-	// This allows us to assume that if we fail on a load we haven't possibly
-	// executed any of the previous (tail) ones, so we don't need to do
-	// any stores. For instance, if we fail on ld2, this means we had
-	// 2 or 3 bytes left to copy and we did not execute the ld8 nor ld4.
-	//
-	// This means that we are in a situation similar the a fault in the
-	// head part. That's nice!
-	//
-.failure_in1:
-	sub ret0=endsrc,src1	// number of bytes to zero, i.e. not copied
-	sub len=endsrc,src1,1
-	//
-	// we know that ret0 can never be zero at this point
-	// because we failed why trying to do a load, i.e. there is still
-	// some work to do.
-	// The failure_in1bis and length problem is taken care of at the
-	// calling side.
-	//
-	;;
-.failure_in1bis:		// from (.failure_in3)
-	mov ar.lc=len		// Continue with a stupid byte store.
-	;;
-5:
-	st1 [dst1]=r0,1
-	br.cloop.dptk 5b
-	;;
-	mov pr=saved_pr,0xffffffffffff0000
-	mov ar.lc=saved_lc
-	mov ar.pfs=saved_pfs
-	br.ret.sptk.many rp
-
-	//
-	// Here we simply restart the loop but instead
-	// of doing loads we fill the pipeline with zeroes
-	// We can't simply store r0 because we may have valid
-	// data in transit in the pipeline.
-	// ar.lc and ar.ec are setup correctly at this point
-	//
-	// we MUST use src1/endsrc here and not dst1/enddst because
-	// of the pipeline effect.
-	//
-.failure_in3:
-	sub ret0=endsrc,src1	// number of bytes to zero, i.e. not copied
-	;;
-2:
-(p16)	mov val1[0]=r0
-(p16)	mov val2[0]=r0
-(EPI)	st8 [dst1]=val1[PIPE_DEPTH-1],16
-(EPI)	st8 [dst2]=val2[PIPE_DEPTH-1],16
-	br.ctop.dptk 2b
-	;;
-	cmp.ne p6,p0=dst1,enddst	// Do we need to finish the tail ?
-	sub len=enddst,dst1,1		// precompute len
-(p6)	br.cond.dptk .failure_in1bis
-	;;
-	mov pr=saved_pr,0xffffffffffff0000
-	mov ar.lc=saved_lc
-	mov ar.pfs=saved_pfs
-	br.ret.sptk.many rp
-
-.failure_in2:
-	sub ret0=endsrc,src1
-	cmp.ne p6,p0=dst1,enddst	// Do we need to finish the tail ?
-	sub len=enddst,dst1,1		// precompute len
-(p6)	br.cond.dptk .failure_in1bis
-	;;
-	mov pr=saved_pr,0xffffffffffff0000
-	mov ar.lc=saved_lc
-	mov ar.pfs=saved_pfs
-	br.ret.sptk.many rp
-
-	//
-	// handling of failures on stores: that's the easy part
-	//
-.failure_out:
-	sub ret0=enddst,dst1
-	mov pr=saved_pr,0xffffffffffff0000
-	mov ar.lc=saved_lc
-
-	mov ar.pfs=saved_pfs
-	br.ret.sptk.many rp
-END(__copy_user)
-EXPORT_SYMBOL(__copy_user)
diff --git a/arch/ia64/lib/csum_partial_copy.c b/arch/ia64/lib/csum_partial_copy.c
deleted file mode 100644
index 917e3138b277..000000000000
--- a/arch/ia64/lib/csum_partial_copy.c
+++ /dev/null
@@ -1,98 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * Network Checksum & Copy routine
- *
- * Copyright (C) 1999, 2003-2004 Hewlett-Packard Co
- *	Stephane Eranian <eranian@hpl.hp.com>
- *
- * Most of the code has been imported from Linux/Alpha
- */
-
-#include <linux/module.h>
-#include <linux/types.h>
-#include <linux/string.h>
-
-#include <net/checksum.h>
-
-/*
- * XXX Fixme: those 2 inlines are meant for debugging and will go away
- */
-static inline unsigned
-short from64to16(unsigned long x)
-{
-	/* add up 32-bit words for 33 bits */
-	x = (x & 0xffffffff) + (x >> 32);
-	/* add up 16-bit and 17-bit words for 17+c bits */
-	x = (x & 0xffff) + (x >> 16);
-	/* add up 16-bit and 2-bit for 16+c bit */
-	x = (x & 0xffff) + (x >> 16);
-	/* add up carry.. */
-	x = (x & 0xffff) + (x >> 16);
-	return x;
-}
-
-static inline
-unsigned long do_csum_c(const unsigned char * buff, int len, unsigned int psum)
-{
-	int odd, count;
-	unsigned long result = (unsigned long)psum;
-
-	if (len <= 0)
-		goto out;
-	odd = 1 & (unsigned long) buff;
-	if (odd) {
-		result = *buff << 8;
-		len--;
-		buff++;
-	}
-	count = len >> 1;		/* nr of 16-bit words.. */
-	if (count) {
-		if (2 & (unsigned long) buff) {
-			result += *(unsigned short *) buff;
-			count--;
-			len -= 2;
-			buff += 2;
-		}
-		count >>= 1;		/* nr of 32-bit words.. */
-		if (count) {
-			if (4 & (unsigned long) buff) {
-				result += *(unsigned int *) buff;
-				count--;
-				len -= 4;
-				buff += 4;
-			}
-			count >>= 1;	/* nr of 64-bit words.. */
-			if (count) {
-				unsigned long carry = 0;
-				do {
-					unsigned long w = *(unsigned long *) buff;
-					count--;
-					buff += 8;
-					result += carry;
-					result += w;
-					carry = (w > result);
-				} while (count);
-				result += carry;
-				result = (result & 0xffffffff) + (result >> 32);
-			}
-			if (len & 4) {
-				result += *(unsigned int *) buff;
-				buff += 4;
-			}
-		}
-		if (len & 2) {
-			result += *(unsigned short *) buff;
-			buff += 2;
-		}
-	}
-	if (len & 1)
-		result += *buff;
-
-	result = from64to16(result);
-
-	if (odd)
-		result = ((result >> 8) & 0xff) | ((result & 0xff) << 8);
-
-out:
-	return result;
-}
diff --git a/arch/ia64/lib/do_csum.S b/arch/ia64/lib/do_csum.S
deleted file mode 100644
index 6004dad2597c..000000000000
--- a/arch/ia64/lib/do_csum.S
+++ /dev/null
@@ -1,324 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-/*
- *
- * Optmized version of the standard do_csum() function
- *
- * Return: a 64bit quantity containing the 16bit Internet checksum
- *
- * Inputs:
- *	in0: address of buffer to checksum (char *)
- *	in1: length of the buffer (int)
- *
- * Copyright (C) 1999, 2001-2002 Hewlett-Packard Co
- *	Stephane Eranian <eranian@hpl.hp.com>
- *
- * 02/04/22	Ken Chen <kenneth.w.chen@intel.com>
- *		Data locality study on the checksum buffer.
- *		More optimization cleanup - remove excessive stop bits.
- * 02/04/08	David Mosberger <davidm@hpl.hp.com>
- *		More cleanup and tuning.
- * 01/04/18	Jun Nakajima <jun.nakajima@intel.com>
- *		Clean up and optimize and the software pipeline, loading two
- *		back-to-back 8-byte words per loop. Clean up the initialization
- *		for the loop. Support the cases where load latency = 1 or 2.
- *		Set CONFIG_IA64_LOAD_LATENCY to 1 or 2 (default).
- */
-
-#include <asm/asmmacro.h>
-
-//
-// Theory of operations:
-//	The goal is to go as quickly as possible to the point where
-//	we can checksum 16 bytes/loop. Before reaching that point we must
-//	take care of incorrect alignment of first byte.
-//
-//	The code hereafter also takes care of the "tail" part of the buffer
-//	before entering the core loop, if any. The checksum is a sum so it
-//	allows us to commute operations. So we do the "head" and "tail"
-//	first to finish at full speed in the body. Once we get the head and
-//	tail values, we feed them into the pipeline, very handy initialization.
-//
-//	Of course we deal with the special case where the whole buffer fits
-//	into one 8 byte word. In this case we have only one entry in the pipeline.
-//
-//	We use a (LOAD_LATENCY+2)-stage pipeline in the loop to account for
-//	possible load latency and also to accommodate for head and tail.
-//
-//	The end of the function deals with folding the checksum from 64bits
-//	down to 16bits taking care of the carry.
-//
-//	This version avoids synchronization in the core loop by also using a
-//	pipeline for the accumulation of the checksum in resultx[] (x=1,2).
-//
-//	 wordx[] (x=1,2)
-//	|---|
-//      |   | 0			: new value loaded in pipeline
-//	|---|
-//      |   | -			: in transit data
-//	|---|
-//      |   | LOAD_LATENCY	: current value to add to checksum
-//	|---|
-//      |   | LOAD_LATENCY+1	: previous value added to checksum
-//      |---|			(previous iteration)
-//
-//	resultx[] (x=1,2)
-//	|---|
-//      |   | 0			: initial value
-//	|---|
-//      |   | LOAD_LATENCY-1	: new checksum
-//	|---|
-//      |   | LOAD_LATENCY	: previous value of checksum
-//	|---|
-//      |   | LOAD_LATENCY+1	: final checksum when out of the loop
-//      |---|
-//
-//
-//	See RFC1071 "Computing the Internet Checksum" for various techniques for
-//	calculating the Internet checksum.
-//
-// NOT YET DONE:
-//	- Maybe another algorithm which would take care of the folding at the
-//	  end in a different manner
-//	- Work with people more knowledgeable than me on the network stack
-//	  to figure out if we could not split the function depending on the
-//	  type of packet or alignment we get. Like the ip_fast_csum() routine
-//	  where we know we have at least 20bytes worth of data to checksum.
-//	- Do a better job of handling small packets.
-//	- Note on prefetching: it was found that under various load, i.e. ftp read/write,
-//	  nfs read/write, the L1 cache hit rate is at 60% and L2 cache hit rate is at 99.8%
-//	  on the data that buffer points to (partly because the checksum is often preceded by
-//	  a copy_from_user()).  This finding indiate that lfetch will not be beneficial since
-//	  the data is already in the cache.
-//
-
-#define saved_pfs	r11
-#define hmask		r16
-#define tmask		r17
-#define first1		r18
-#define firstval	r19
-#define firstoff	r20
-#define last		r21
-#define lastval		r22
-#define lastoff		r23
-#define saved_lc	r24
-#define saved_pr	r25
-#define tmp1		r26
-#define tmp2		r27
-#define tmp3		r28
-#define carry1		r29
-#define carry2		r30
-#define first2		r31
-
-#define buf		in0
-#define len		in1
-
-#define LOAD_LATENCY	2	// XXX fix me
-
-#if (LOAD_LATENCY != 1) && (LOAD_LATENCY != 2)
-# error "Only 1 or 2 is supported/tested for LOAD_LATENCY."
-#endif
-
-#define PIPE_DEPTH			(LOAD_LATENCY+2)
-#define ELD	p[LOAD_LATENCY]		// end of load
-#define ELD_1	p[LOAD_LATENCY+1]	// and next stage
-
-// unsigned long do_csum(unsigned char *buf,long len)
-
-GLOBAL_ENTRY(do_csum)
-	.prologue
-	.save ar.pfs, saved_pfs
-	alloc saved_pfs=ar.pfs,2,16,0,16
-	.rotr word1[4], word2[4],result1[LOAD_LATENCY+2],result2[LOAD_LATENCY+2]
-	.rotp p[PIPE_DEPTH], pC1[2], pC2[2]
-	mov ret0=r0		// in case we have zero length
-	cmp.lt p0,p6=r0,len	// check for zero length or negative (32bit len)
-	;;
-	add tmp1=buf,len	// last byte's address
-	.save pr, saved_pr
-	mov saved_pr=pr		// preserve predicates (rotation)
-(p6)	br.ret.spnt.many rp	// return if zero or negative length
-
-	mov hmask=-1		// initialize head mask
-	tbit.nz p15,p0=buf,0	// is buf an odd address?
-	and first1=-8,buf	// 8-byte align down address of first1 element
-
-	and firstoff=7,buf	// how many bytes off for first1 element
-	mov tmask=-1		// initialize tail mask
-
-	;;
-	adds tmp2=-1,tmp1	// last-1
-	and lastoff=7,tmp1	// how many bytes off for last element
-	;;
-	sub tmp1=8,lastoff	// complement to lastoff
-	and last=-8,tmp2	// address of word containing last byte
-	;;
-	sub tmp3=last,first1	// tmp3=distance from first1 to last
-	.save ar.lc, saved_lc
-	mov saved_lc=ar.lc	// save lc
-	cmp.eq p8,p9=last,first1	// everything fits in one word ?
-
-	ld8 firstval=[first1],8	// load, ahead of time, "first1" word
-	and tmp1=7, tmp1	// make sure that if tmp1==8 -> tmp1=0
-	shl tmp2=firstoff,3	// number of bits
-	;;
-(p9)	ld8 lastval=[last]	// load, ahead of time, "last" word, if needed
-	shl tmp1=tmp1,3		// number of bits
-(p9)	adds tmp3=-8,tmp3	// effectively loaded
-	;;
-(p8)	mov lastval=r0		// we don't need lastval if first1==last
-	shl hmask=hmask,tmp2	// build head mask, mask off [0,first1off[
-	shr.u tmask=tmask,tmp1	// build tail mask, mask off ]8,lastoff]
-	;;
-	.body
-#define count tmp3
-
-(p8)	and hmask=hmask,tmask	// apply tail mask to head mask if 1 word only
-(p9)	and word2[0]=lastval,tmask	// mask last it as appropriate
-	shr.u count=count,3	// how many 8-byte?
-	;;
-	// If count is odd, finish this 8-byte word so that we can
-	// load two back-to-back 8-byte words per loop thereafter.
-	and word1[0]=firstval,hmask	// and mask it as appropriate
-	tbit.nz p10,p11=count,0		// if (count is odd)
-	;;
-(p8)	mov result1[0]=word1[0]
-(p9)	add result1[0]=word1[0],word2[0]
-	;;
-	cmp.ltu p6,p0=result1[0],word1[0]	// check the carry
-	cmp.eq.or.andcm p8,p0=0,count		// exit if zero 8-byte
-	;;
-(p6)	adds result1[0]=1,result1[0]
-(p8)	br.cond.dptk .do_csum_exit	// if (within an 8-byte word)
-(p11)	br.cond.dptk .do_csum16		// if (count is even)
-
-	// Here count is odd.
-	ld8 word1[1]=[first1],8		// load an 8-byte word
-	cmp.eq p9,p10=1,count		// if (count == 1)
-	adds count=-1,count		// loaded an 8-byte word
-	;;
-	add result1[0]=result1[0],word1[1]
-	;;
-	cmp.ltu p6,p0=result1[0],word1[1]
-	;;
-(p6)	adds result1[0]=1,result1[0]
-(p9)	br.cond.sptk .do_csum_exit	// if (count == 1) exit
-	// Fall through to calculate the checksum, feeding result1[0] as
-	// the initial value in result1[0].
-	//
-	// Calculate the checksum loading two 8-byte words per loop.
-	//
-.do_csum16:
-	add first2=8,first1
-	shr.u count=count,1	// we do 16 bytes per loop
-	;;
-	adds count=-1,count
-	mov carry1=r0
-	mov carry2=r0
-	brp.loop.imp 1f,2f
-	;;
-	mov ar.ec=PIPE_DEPTH
-	mov ar.lc=count	// set lc
-	mov pr.rot=1<<16
-	// result1[0] must be initialized in advance.
-	mov result2[0]=r0
-	;;
-	.align 32
-1:
-(ELD_1)	cmp.ltu pC1[0],p0=result1[LOAD_LATENCY],word1[LOAD_LATENCY+1]
-(pC1[1])adds carry1=1,carry1
-(ELD_1)	cmp.ltu pC2[0],p0=result2[LOAD_LATENCY],word2[LOAD_LATENCY+1]
-(pC2[1])adds carry2=1,carry2
-(ELD)	add result1[LOAD_LATENCY-1]=result1[LOAD_LATENCY],word1[LOAD_LATENCY]
-(ELD)	add result2[LOAD_LATENCY-1]=result2[LOAD_LATENCY],word2[LOAD_LATENCY]
-2:
-(p[0])	ld8 word1[0]=[first1],16
-(p[0])	ld8 word2[0]=[first2],16
-	br.ctop.sptk 1b
-	;;
-	// Since len is a 32-bit value, carry cannot be larger than a 64-bit value.
-(pC1[1])adds carry1=1,carry1	// since we miss the last one
-(pC2[1])adds carry2=1,carry2
-	;;
-	add result1[LOAD_LATENCY+1]=result1[LOAD_LATENCY+1],carry1
-	add result2[LOAD_LATENCY+1]=result2[LOAD_LATENCY+1],carry2
-	;;
-	cmp.ltu p6,p0=result1[LOAD_LATENCY+1],carry1
-	cmp.ltu p7,p0=result2[LOAD_LATENCY+1],carry2
-	;;
-(p6)	adds result1[LOAD_LATENCY+1]=1,result1[LOAD_LATENCY+1]
-(p7)	adds result2[LOAD_LATENCY+1]=1,result2[LOAD_LATENCY+1]
-	;;
-	add result1[0]=result1[LOAD_LATENCY+1],result2[LOAD_LATENCY+1]
-	;;
-	cmp.ltu p6,p0=result1[0],result2[LOAD_LATENCY+1]
-	;;
-(p6)	adds result1[0]=1,result1[0]
-	;;
-.do_csum_exit:
-	//
-	// now fold 64 into 16 bits taking care of carry
-	// that's not very good because it has lots of sequentiality
-	//
-	mov tmp3=0xffff
-	zxt4 tmp1=result1[0]
-	shr.u tmp2=result1[0],32
-	;;
-	add result1[0]=tmp1,tmp2
-	;;
-	and tmp1=result1[0],tmp3
-	shr.u tmp2=result1[0],16
-	;;
-	add result1[0]=tmp1,tmp2
-	;;
-	and tmp1=result1[0],tmp3
-	shr.u tmp2=result1[0],16
-	;;
-	add result1[0]=tmp1,tmp2
-	;;
-	and tmp1=result1[0],tmp3
-	shr.u tmp2=result1[0],16
-	;;
-	add ret0=tmp1,tmp2
-	mov pr=saved_pr,0xffffffffffff0000
-	;;
-	// if buf was odd then swap bytes
-	mov ar.pfs=saved_pfs		// restore ar.ec
-(p15)	mux1 ret0=ret0,@rev		// reverse word
-	;;
-	mov ar.lc=saved_lc
-(p15)	shr.u ret0=ret0,64-16	// + shift back to position = swap bytes
-	br.ret.sptk.many rp
-
-//	I (Jun Nakajima) wrote an equivalent code (see below), but it was
-//	not much better than the original. So keep the original there so that
-//	someone else can challenge.
-//
-//	shr.u word1[0]=result1[0],32
-//	zxt4 result1[0]=result1[0]
-//	;;
-//	add result1[0]=result1[0],word1[0]
-//	;;
-//	zxt2 result2[0]=result1[0]
-//	extr.u word1[0]=result1[0],16,16
-//	shr.u carry1=result1[0],32
-//	;;
-//	add result2[0]=result2[0],word1[0]
-//	;;
-//	add result2[0]=result2[0],carry1
-//	;;
-//	extr.u ret0=result2[0],16,16
-//	;;
-//	add ret0=ret0,result2[0]
-//	;;
-//	zxt2 ret0=ret0
-//	mov ar.pfs=saved_pfs		 // restore ar.ec
-//	mov pr=saved_pr,0xffffffffffff0000
-//	;;
-//	// if buf was odd then swap bytes
-//	mov ar.lc=saved_lc
-//(p15)	mux1 ret0=ret0,@rev		// reverse word
-//	;;
-//(p15)	shr.u ret0=ret0,64-16	// + shift back to position = swap bytes
-//	br.ret.sptk.many rp
-
-END(do_csum)
diff --git a/arch/ia64/lib/flush.S b/arch/ia64/lib/flush.S
deleted file mode 100644
index f8e795fe45cb..000000000000
--- a/arch/ia64/lib/flush.S
+++ /dev/null
@@ -1,119 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-only */
-/*
- * Cache flushing routines.
- *
- * Copyright (C) 1999-2001, 2005 Hewlett-Packard Co
- *	David Mosberger-Tang <davidm@hpl.hp.com>
- *
- * 05/28/05 Zoltan Menyhart	Dynamic stride size
- */
-
-#include <linux/export.h>
-#include <asm/asmmacro.h>
-
-	/*
-	 * flush_icache_range(start,end)
-	 *
-	 *	Make i-cache(s) coherent with d-caches.
-	 *
-	 *	Must deal with range from start to end-1 but nothing else (need to
-	 *	be careful not to touch addresses that may be unmapped).
-	 *
-	 *	Note: "in0" and "in1" are preserved for debugging purposes.
-	 */
-	.section .kprobes.text,"ax"
-GLOBAL_ENTRY(flush_icache_range)
-
-	.prologue
-	alloc	r2=ar.pfs,2,0,0,0
-	movl	r3=ia64_i_cache_stride_shift
- 	mov	r21=1
-	;;
-	ld8	r20=[r3]		// r20: stride shift
-	sub	r22=in1,r0,1		// last byte address
-	;;
-	shr.u	r23=in0,r20		// start / (stride size)
-	shr.u	r22=r22,r20		// (last byte address) / (stride size)
-	shl	r21=r21,r20		// r21: stride size of the i-cache(s)
-	;;
-	sub	r8=r22,r23		// number of strides - 1
-	shl	r24=r23,r20		// r24: addresses for "fc.i" =
-					//	"start" rounded down to stride boundary
-	.save	ar.lc,r3
-	mov	r3=ar.lc		// save ar.lc
-	;;
-
-	.body
-	mov	ar.lc=r8
-	;;
-	/*
-	 * 32 byte aligned loop, even number of (actually 2) bundles
-	 */
-.Loop:	fc.i	r24			// issuable on M0 only
-	add	r24=r21,r24		// we flush "stride size" bytes per iteration
-	nop.i	0
-	br.cloop.sptk.few .Loop
-	;;
-	sync.i
-	;;
-	srlz.i
-	;;
-	mov	ar.lc=r3		// restore ar.lc
-	br.ret.sptk.many rp
-END(flush_icache_range)
-EXPORT_SYMBOL_GPL(flush_icache_range)
-
-	/*
-	 * clflush_cache_range(start,size)
-	 *
-	 *	Flush cache lines from start to start+size-1.
-	 *
-	 *	Must deal with range from start to start+size-1 but nothing else
-	 *	(need to be careful not to touch addresses that may be
-	 *	unmapped).
-	 *
-	 *	Note: "in0" and "in1" are preserved for debugging purposes.
-	 */
-	.section .kprobes.text,"ax"
-GLOBAL_ENTRY(clflush_cache_range)
-
-	.prologue
-	alloc	r2=ar.pfs,2,0,0,0
-	movl	r3=ia64_cache_stride_shift
-	mov	r21=1
-	add     r22=in1,in0
-	;;
-	ld8	r20=[r3]		// r20: stride shift
-	sub	r22=r22,r0,1		// last byte address
-	;;
-	shr.u	r23=in0,r20		// start / (stride size)
-	shr.u	r22=r22,r20		// (last byte address) / (stride size)
-	shl	r21=r21,r20		// r21: stride size of the i-cache(s)
-	;;
-	sub	r8=r22,r23		// number of strides - 1
-	shl	r24=r23,r20		// r24: addresses for "fc" =
-					//	"start" rounded down to stride
-					//	boundary
-	.save	ar.lc,r3
-	mov	r3=ar.lc		// save ar.lc
-	;;
-
-	.body
-	mov	ar.lc=r8
-	;;
-	/*
-	 * 32 byte aligned loop, even number of (actually 2) bundles
-	 */
-.Loop_fc:
-	fc	r24		// issuable on M0 only
-	add	r24=r21,r24	// we flush "stride size" bytes per iteration
-	nop.i	0
-	br.cloop.sptk.few .Loop_fc
-	;;
-	sync.i
-	;;
-	srlz.i
-	;;
-	mov	ar.lc=r3		// restore ar.lc
-	br.ret.sptk.many rp
-END(clflush_cache_range)
diff --git a/arch/ia64/lib/idiv32.S b/arch/ia64/lib/idiv32.S
deleted file mode 100644
index 83586fbc51ff..000000000000
--- a/arch/ia64/lib/idiv32.S
+++ /dev/null
@@ -1,86 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-/*
- * Copyright (C) 2000 Hewlett-Packard Co
- * Copyright (C) 2000 David Mosberger-Tang <davidm@hpl.hp.com>
- *
- * 32-bit integer division.
- *
- * This code is based on the application note entitled "Divide, Square Root
- * and Remainder Algorithms for the IA-64 Architecture".  This document
- * is available as Intel document number 248725-002 or via the web at
- * http://developer.intel.com/software/opensource/numerics/
- *
- * For more details on the theory behind these algorithms, see "IA-64
- * and Elementary Functions" by Peter Markstein; HP Professional Books
- * (http://www.goodreads.com/book/show/2019887.Ia_64_and_Elementary_Functions)
- */
-
-#include <linux/export.h>
-#include <asm/asmmacro.h>
-
-#ifdef MODULO
-# define OP	mod
-#else
-# define OP	div
-#endif
-
-#ifdef UNSIGNED
-# define SGN	u
-# define EXTEND	zxt4
-# define INT_TO_FP(a,b)	fcvt.xuf.s1 a=b
-# define FP_TO_INT(a,b)	fcvt.fxu.trunc.s1 a=b
-#else
-# define SGN
-# define EXTEND	sxt4
-# define INT_TO_FP(a,b)	fcvt.xf a=b
-# define FP_TO_INT(a,b)	fcvt.fx.trunc.s1 a=b
-#endif
-
-#define PASTE1(a,b)	a##b
-#define PASTE(a,b)	PASTE1(a,b)
-#define NAME		PASTE(PASTE(__,SGN),PASTE(OP,si3))
-
-GLOBAL_ENTRY(NAME)
-	.regstk 2,0,0,0
-	// Transfer inputs to FP registers.
-	mov r2 = 0xffdd			// r2 = -34 + 65535 (fp reg format bias)
-	EXTEND in0 = in0		// in0 = a
-	EXTEND in1 = in1		// in1 = b
-	;;
-	setf.sig f8 = in0
-	setf.sig f9 = in1
-#ifdef MODULO
-	sub in1 = r0, in1		// in1 = -b
-#endif
-	;;
-	// Convert the inputs to FP, to avoid FP software-assist faults.
-	INT_TO_FP(f8, f8)
-	INT_TO_FP(f9, f9)
-	;;
-	setf.exp f7 = r2		// f7 = 2^-34
-	frcpa.s1 f6, p6 = f8, f9	// y0 = frcpa(b)
-	;;
-(p6)	fmpy.s1 f8 = f8, f6		// q0 = a*y0
-(p6)	fnma.s1 f6 = f9, f6, f1		// e0 = -b*y0 + 1 
-	;;
-#ifdef MODULO
-	setf.sig f9 = in1		// f9 = -b
-#endif
-(p6)	fma.s1 f8 = f6, f8, f8		// q1 = e0*q0 + q0
-(p6)	fma.s1 f6 = f6, f6, f7		// e1 = e0*e0 + 2^-34
-	;;
-#ifdef MODULO
-	setf.sig f7 = in0
-#endif
-(p6)	fma.s1 f6 = f6, f8, f8		// q2 = e1*q1 + q1
-	;;
-	FP_TO_INT(f6, f6)		// q = trunc(q2)
-	;;
-#ifdef MODULO
-	xma.l f6 = f6, f9, f7		// r = q*(-b) + a
-	;;
-#endif
-	getf.sig r8 = f6		// transfer result to result register
-	br.ret.sptk.many rp
-END(NAME)
-EXPORT_SYMBOL(NAME)
diff --git a/arch/ia64/lib/idiv64.S b/arch/ia64/lib/idiv64.S
deleted file mode 100644
index 5c9113691f72..000000000000
--- a/arch/ia64/lib/idiv64.S
+++ /dev/null
@@ -1,83 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-/*
- * Copyright (C) 1999-2000 Hewlett-Packard Co
- * Copyright (C) 1999-2000 David Mosberger-Tang <davidm@hpl.hp.com>
- *
- * 64-bit integer division.
- *
- * This code is based on the application note entitled "Divide, Square Root
- * and Remainder Algorithms for the IA-64 Architecture".  This document
- * is available as Intel document number 248725-002 or via the web at
- * http://developer.intel.com/software/opensource/numerics/
- *
- * For more details on the theory behind these algorithms, see "IA-64
- * and Elementary Functions" by Peter Markstein; HP Professional Books
- * (http://www.goodreads.com/book/show/2019887.Ia_64_and_Elementary_Functions)
- */
-
-#include <linux/export.h>
-#include <asm/asmmacro.h>
-
-#ifdef MODULO
-# define OP	mod
-#else
-# define OP	div
-#endif
-
-#ifdef UNSIGNED
-# define SGN	u
-# define INT_TO_FP(a,b)	fcvt.xuf.s1 a=b
-# define FP_TO_INT(a,b)	fcvt.fxu.trunc.s1 a=b
-#else
-# define SGN
-# define INT_TO_FP(a,b)	fcvt.xf a=b
-# define FP_TO_INT(a,b)	fcvt.fx.trunc.s1 a=b
-#endif
-
-#define PASTE1(a,b)	a##b
-#define PASTE(a,b)	PASTE1(a,b)
-#define NAME		PASTE(PASTE(__,SGN),PASTE(OP,di3))
-
-GLOBAL_ENTRY(NAME)
-	.regstk 2,0,0,0
-	// Transfer inputs to FP registers.
-	setf.sig f8 = in0
-	setf.sig f9 = in1
-	;;
-	// Convert the inputs to FP, to avoid FP software-assist faults.
-	INT_TO_FP(f8, f8)
-	INT_TO_FP(f9, f9)
-	;;
-	frcpa.s1 f11, p6 = f8, f9	// y0 = frcpa(b)
-	;;
-(p6)	fmpy.s1 f7 = f8, f11		// q0 = a*y0
-(p6)	fnma.s1 f6 = f9, f11, f1	// e0 = -b*y0 + 1
-	;;
-(p6)	fma.s1 f10 = f7, f6, f7		// q1 = q0*e0 + q0
-(p6)	fmpy.s1 f7 = f6, f6		// e1 = e0*e0
-	;;
-#ifdef MODULO
-	sub in1 = r0, in1		// in1 = -b
-#endif
-(p6)	fma.s1 f10 = f10, f7, f10	// q2 = q1*e1 + q1
-(p6)	fma.s1 f6 = f11, f6, f11	// y1 = y0*e0 + y0
-	;;
-(p6)	fma.s1 f6 = f6, f7, f6		// y2 = y1*e1 + y1
-(p6)	fnma.s1 f7 = f9, f10, f8	// r = -b*q2 + a
-	;;
-#ifdef MODULO
-	setf.sig f8 = in0		// f8 = a
-	setf.sig f9 = in1		// f9 = -b
-#endif
-(p6)	fma.s1 f11 = f7, f6, f10	// q3 = r*y2 + q2
-	;;
-	FP_TO_INT(f11, f11)		// q = trunc(q3)
-	;;
-#ifdef MODULO
-	xma.l f11 = f11, f9, f8		// r = q*(-b) + a
-	;;
-#endif
-	getf.sig r8 = f11		// transfer result to result register
-	br.ret.sptk.many rp
-END(NAME)
-EXPORT_SYMBOL(NAME)
diff --git a/arch/ia64/lib/io.c b/arch/ia64/lib/io.c
deleted file mode 100644
index c3e02462ed16..000000000000
--- a/arch/ia64/lib/io.c
+++ /dev/null
@@ -1,51 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-#include <linux/module.h>
-#include <linux/types.h>
-
-#include <asm/io.h>
-
-/*
- * Copy data from IO memory space to "real" memory space.
- * This needs to be optimized.
- */
-void memcpy_fromio(void *to, const volatile void __iomem *from, long count)
-{
-	char *dst = to;
-
-	while (count) {
-		count--;
-		*dst++ = readb(from++);
-	}
-}
-EXPORT_SYMBOL(memcpy_fromio);
-
-/*
- * Copy data from "real" memory space to IO memory space.
- * This needs to be optimized.
- */
-void memcpy_toio(volatile void __iomem *to, const void *from, long count)
-{
-	const char *src = from;
-
-	while (count) {
-		count--;
-		writeb(*src++, to++);
-	}
-}
-EXPORT_SYMBOL(memcpy_toio);
-
-/*
- * "memset" on IO memory space.
- * This needs to be optimized.
- */
-void memset_io(volatile void __iomem *dst, int c, long count)
-{
-	unsigned char ch = (char)(c & 0xff);
-
-	while (count) {
-		count--;
-		writeb(ch, dst);
-		dst++;
-	}
-}
-EXPORT_SYMBOL(memset_io);
diff --git a/arch/ia64/lib/ip_fast_csum.S b/arch/ia64/lib/ip_fast_csum.S
deleted file mode 100644
index fcc0b812ce2e..000000000000
--- a/arch/ia64/lib/ip_fast_csum.S
+++ /dev/null
@@ -1,148 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-/*
- * Optmized version of the ip_fast_csum() function
- * Used for calculating IP header checksum
- *
- * Return: 16bit checksum, complemented
- *
- * Inputs:
- *      in0: address of buffer to checksum (char *)
- *      in1: length of the buffer (int)
- *
- * Copyright (C) 2002, 2006 Intel Corp.
- * Copyright (C) 2002, 2006 Ken Chen <kenneth.w.chen@intel.com>
- */
-
-#include <linux/export.h>
-#include <asm/asmmacro.h>
-
-/*
- * Since we know that most likely this function is called with buf aligned
- * on 4-byte boundary and 20 bytes in length, we can execution rather quickly
- * versus calling generic version of do_csum, which has lots of overhead in
- * handling various alignments and sizes.  However, due to lack of constrains
- * put on the function input argument, cases with alignment not on 4-byte or
- * size not equal to 20 bytes will be handled by the generic do_csum function.
- */
-
-#define in0	r32
-#define in1	r33
-#define in2	r34
-#define in3	r35
-#define in4	r36
-#define ret0	r8
-
-GLOBAL_ENTRY(ip_fast_csum)
-	.prologue
-	.body
-	cmp.ne	p6,p7=5,in1	// size other than 20 byte?
-	and	r14=3,in0	// is it aligned on 4-byte?
-	add	r15=4,in0	// second source pointer
-	;;
-	cmp.ne.or.andcm p6,p7=r14,r0
-	;;
-(p7)	ld4	r20=[in0],8
-(p7)	ld4	r21=[r15],8
-(p6)	br.spnt	.generic
-	;;
-	ld4	r22=[in0],8
-	ld4	r23=[r15],8
-	;;
-	ld4	r24=[in0]
-	add	r20=r20,r21
-	add	r22=r22,r23
-	;;
-	add	r20=r20,r22
-	;;
-	add	r20=r20,r24
-	;;
-	shr.u	ret0=r20,16	// now need to add the carry
-	zxt2	r20=r20
-	;;
-	add	r20=ret0,r20
-	;;
-	shr.u	ret0=r20,16	// add carry again
-	zxt2	r20=r20
-	;;
-	add	r20=ret0,r20
-	;;
-	shr.u	ret0=r20,16
-	zxt2	r20=r20
-	;;
-	add	r20=ret0,r20
-	mov	r9=0xffff
-	;;
-	andcm	ret0=r9,r20
-	.restore sp		// reset frame state
-	br.ret.sptk.many b0
-	;;
-
-.generic:
-	.prologue
-	.save ar.pfs, r35
-	alloc	r35=ar.pfs,2,2,2,0
-	.save rp, r34
-	mov	r34=b0
-	.body
-	dep.z	out1=in1,2,30
-	mov	out0=in0
-	;;
-	br.call.sptk.many b0=do_csum
-	;;
-	andcm	ret0=-1,ret0
-	mov	ar.pfs=r35
-	mov	b0=r34
-	br.ret.sptk.many b0
-END(ip_fast_csum)
-EXPORT_SYMBOL(ip_fast_csum)
-
-GLOBAL_ENTRY(csum_ipv6_magic)
-	ld4	r20=[in0],4
-	ld4	r21=[in1],4
-	zxt4	in2=in2
-	;;
-	ld4	r22=[in0],4
-	ld4	r23=[in1],4
-	dep	r15=in3,in2,32,16
-	;;
-	ld4	r24=[in0],4
-	ld4	r25=[in1],4
-	mux1	r15=r15,@rev
-	add	r16=r20,r21
-	add	r17=r22,r23
-	zxt4	in4=in4
-	;;
-	ld4	r26=[in0],4
-	ld4	r27=[in1],4
-	shr.u	r15=r15,16
-	add	r18=r24,r25
-	add	r8=r16,r17
-	;;
-	add	r19=r26,r27
-	add	r8=r8,r18
-	;;
-	add	r8=r8,r19
-	add	r15=r15,in4
-	;;
-	add	r8=r8,r15
-	;;
-	shr.u	r10=r8,32	// now fold sum into short
-	zxt4	r11=r8
-	;;
-	add	r8=r10,r11
-	;;
-	shr.u	r10=r8,16	// yeah, keep it rolling
-	zxt2	r11=r8
-	;;
-	add	r8=r10,r11
-	;;
-	shr.u	r10=r8,16	// three times lucky
-	zxt2	r11=r8
-	;;
-	add	r8=r10,r11
-	mov	r9=0xffff
-	;;
-	andcm	r8=r9,r8
-	br.ret.sptk.many b0
-END(csum_ipv6_magic)
-EXPORT_SYMBOL(csum_ipv6_magic)
diff --git a/arch/ia64/lib/memcpy.S b/arch/ia64/lib/memcpy.S
deleted file mode 100644
index 35c9069a8345..000000000000
--- a/arch/ia64/lib/memcpy.S
+++ /dev/null
@@ -1,304 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-/*
- *
- * Optimized version of the standard memcpy() function
- *
- * Inputs:
- * 	in0:	destination address
- *	in1:	source address
- *	in2:	number of bytes to copy
- * Output:
- * 	no return value
- *
- * Copyright (C) 2000-2001 Hewlett-Packard Co
- *	Stephane Eranian <eranian@hpl.hp.com>
- *	David Mosberger-Tang <davidm@hpl.hp.com>
- */
-#include <linux/export.h>
-#include <asm/asmmacro.h>
-
-GLOBAL_ENTRY(memcpy)
-
-#	define MEM_LAT	21		/* latency to memory */
-
-#	define dst	r2
-#	define src	r3
-#	define retval	r8
-#	define saved_pfs r9
-#	define saved_lc	r10
-#	define saved_pr	r11
-#	define cnt	r16
-#	define src2	r17
-#	define t0	r18
-#	define t1	r19
-#	define t2	r20
-#	define t3	r21
-#	define t4	r22
-#	define src_end	r23
-
-#	define N	(MEM_LAT + 4)
-#	define Nrot	((N + 7) & ~7)
-
-	/*
-	 * First, check if everything (src, dst, len) is a multiple of eight.  If
-	 * so, we handle everything with no taken branches (other than the loop
-	 * itself) and a small icache footprint.  Otherwise, we jump off to
-	 * the more general copy routine handling arbitrary
-	 * sizes/alignment etc.
-	 */
-	.prologue
-	.save ar.pfs, saved_pfs
-	alloc saved_pfs=ar.pfs,3,Nrot,0,Nrot
-	.save ar.lc, saved_lc
-	mov saved_lc=ar.lc
-	or t0=in0,in1
-	;;
-
-	or t0=t0,in2
-	.save pr, saved_pr
-	mov saved_pr=pr
-
-	.body
-
-	cmp.eq p6,p0=in2,r0	// zero length?
-	mov retval=in0		// return dst
-(p6)	br.ret.spnt.many rp	// zero length, return immediately
-	;;
-
-	mov dst=in0		// copy because of rotation
-	shr.u cnt=in2,3		// number of 8-byte words to copy
-	mov pr.rot=1<<16
-	;;
-
-	adds cnt=-1,cnt		// br.ctop is repeat/until
-	cmp.gtu p7,p0=16,in2	// copying less than 16 bytes?
-	mov ar.ec=N
-	;;
-
-	and t0=0x7,t0
-	mov ar.lc=cnt
-	;;
-	cmp.ne p6,p0=t0,r0
-
-	mov src=in1		// copy because of rotation
-(p7)	br.cond.spnt.few .memcpy_short
-(p6)	br.cond.spnt.few .memcpy_long
-	;;
-	nop.m	0
-	;;
-	nop.m	0
-	nop.i	0
-	;;
-	nop.m	0
-	;;
-	.rotr val[N]
-	.rotp p[N]
-	.align 32
-1: { .mib
-(p[0])	ld8 val[0]=[src],8
-	nop.i 0
-	brp.loop.imp 1b, 2f
-}
-2: { .mfb
-(p[N-1])st8 [dst]=val[N-1],8
-	nop.f 0
-	br.ctop.dptk.few 1b
-}
-	;;
-	mov ar.lc=saved_lc
-	mov pr=saved_pr,-1
-	mov ar.pfs=saved_pfs
-	br.ret.sptk.many rp
-
-	/*
-	 * Small (<16 bytes) unaligned copying is done via a simple byte-at-the-time
-	 * copy loop.  This performs relatively poorly on Itanium, but it doesn't
-	 * get used very often (gcc inlines small copies) and due to atomicity
-	 * issues, we want to avoid read-modify-write of entire words.
-	 */
-	.align 32
-.memcpy_short:
-	adds cnt=-1,in2		// br.ctop is repeat/until
-	mov ar.ec=MEM_LAT
-	brp.loop.imp 1f, 2f
-	;;
-	mov ar.lc=cnt
-	;;
-	nop.m	0
-	;;
-	nop.m	0
-	nop.i	0
-	;;
-	nop.m	0
-	;;
-	nop.m	0
-	;;
-	/*
-	 * It is faster to put a stop bit in the loop here because it makes
-	 * the pipeline shorter (and latency is what matters on short copies).
-	 */
-	.align 32
-1: { .mib
-(p[0])	ld1 val[0]=[src],1
-	nop.i 0
-	brp.loop.imp 1b, 2f
-} ;;
-2: { .mfb
-(p[MEM_LAT-1])st1 [dst]=val[MEM_LAT-1],1
-	nop.f 0
-	br.ctop.dptk.few 1b
-} ;;
-	mov ar.lc=saved_lc
-	mov pr=saved_pr,-1
-	mov ar.pfs=saved_pfs
-	br.ret.sptk.many rp
-
-	/*
-	 * Large (>= 16 bytes) copying is done in a fancy way.  Latency isn't
-	 * an overriding concern here, but throughput is.  We first do
-	 * sub-word copying until the destination is aligned, then we check
-	 * if the source is also aligned.  If so, we do a simple load/store-loop
-	 * until there are less than 8 bytes left over and then we do the tail,
-	 * by storing the last few bytes using sub-word copying.  If the source
-	 * is not aligned, we branch off to the non-congruent loop.
-	 *
-	 *   stage:   op:
-	 *         0  ld
-	 *	   :
-	 * MEM_LAT+3  shrp
-	 * MEM_LAT+4  st
-	 *
-	 * On Itanium, the pipeline itself runs without stalls.  However,  br.ctop
-	 * seems to introduce an unavoidable bubble in the pipeline so the overall
-	 * latency is 2 cycles/iteration.  This gives us a _copy_ throughput
-	 * of 4 byte/cycle.  Still not bad.
-	 */
-#	undef N
-#	undef Nrot
-#	define N	(MEM_LAT + 5)		/* number of stages */
-#	define Nrot	((N+1 + 2 + 7) & ~7)	/* number of rotating regs */
-
-#define LOG_LOOP_SIZE	6
-
-.memcpy_long:
-	alloc t3=ar.pfs,3,Nrot,0,Nrot	// resize register frame
-	and t0=-8,src		// t0 = src & ~7
-	and t2=7,src		// t2 = src & 7
-	;;
-	ld8 t0=[t0]		// t0 = 1st source word
-	adds src2=7,src		// src2 = (src + 7)
-	sub t4=r0,dst		// t4 = -dst
-	;;
-	and src2=-8,src2	// src2 = (src + 7) & ~7
-	shl t2=t2,3		// t2 = 8*(src & 7)
-	shl t4=t4,3		// t4 = 8*(dst & 7)
-	;;
-	ld8 t1=[src2]		// t1 = 1st source word if src is 8-byte aligned, 2nd otherwise
-	sub t3=64,t2		// t3 = 64-8*(src & 7)
-	shr.u t0=t0,t2
-	;;
-	add src_end=src,in2
-	shl t1=t1,t3
-	mov pr=t4,0x38		// (p5,p4,p3)=(dst & 7)
-	;;
-	or t0=t0,t1
-	mov cnt=r0
-	adds src_end=-1,src_end
-	;;
-(p3)	st1 [dst]=t0,1
-(p3)	shr.u t0=t0,8
-(p3)	adds cnt=1,cnt
-	;;
-(p4)	st2 [dst]=t0,2
-(p4)	shr.u t0=t0,16
-(p4)	adds cnt=2,cnt
-	;;
-(p5)	st4 [dst]=t0,4
-(p5)	adds cnt=4,cnt
-	and src_end=-8,src_end	// src_end = last word of source buffer
-	;;
-
-	// At this point, dst is aligned to 8 bytes and there at least 16-7=9 bytes left to copy:
-
-1:{	add src=cnt,src			// make src point to remainder of source buffer
-	sub cnt=in2,cnt			// cnt = number of bytes left to copy
-	mov t4=ip
-  }	;;
-	and src2=-8,src			// align source pointer
-	adds t4=.memcpy_loops-1b,t4
-	mov ar.ec=N
-
-	and t0=7,src			// t0 = src & 7
-	shr.u t2=cnt,3			// t2 = number of 8-byte words left to copy
-	shl cnt=cnt,3			// move bits 0-2 to 3-5
-	;;
-
-	.rotr val[N+1], w[2]
-	.rotp p[N]
-
-	cmp.ne p6,p0=t0,r0		// is src aligned, too?
-	shl t0=t0,LOG_LOOP_SIZE		// t0 = 8*(src & 7)
-	adds t2=-1,t2			// br.ctop is repeat/until
-	;;
-	add t4=t0,t4
-	mov pr=cnt,0x38			// set (p5,p4,p3) to # of bytes last-word bytes to copy
-	mov ar.lc=t2
-	;;
-	nop.m	0
-	;;
-	nop.m	0
-	nop.i	0
-	;;
-	nop.m	0
-	;;
-(p6)	ld8 val[1]=[src2],8		// prime the pump...
-	mov b6=t4
-	br.sptk.few b6
-	;;
-
-.memcpy_tail:
-	// At this point, (p5,p4,p3) are set to the number of bytes left to copy (which is
-	// less than 8) and t0 contains the last few bytes of the src buffer:
-(p5)	st4 [dst]=t0,4
-(p5)	shr.u t0=t0,32
-	mov ar.lc=saved_lc
-	;;
-(p4)	st2 [dst]=t0,2
-(p4)	shr.u t0=t0,16
-	mov ar.pfs=saved_pfs
-	;;
-(p3)	st1 [dst]=t0
-	mov pr=saved_pr,-1
-	br.ret.sptk.many rp
-
-///////////////////////////////////////////////////////
-	.align 64
-
-#define COPY(shift,index)									\
- 1: { .mib											\
-	(p[0])		ld8 val[0]=[src2],8;							\
-	(p[MEM_LAT+3])	shrp w[0]=val[MEM_LAT+3],val[MEM_LAT+4-index],shift;			\
-			brp.loop.imp 1b, 2f							\
-    };												\
- 2: { .mfb											\
-	(p[MEM_LAT+4])	st8 [dst]=w[1],8;							\
-			nop.f 0;								\
-			br.ctop.dptk.few 1b;							\
-    };												\
-			;;									\
-			ld8 val[N-1]=[src_end];	/* load last word (may be same as val[N]) */	\
-			;;									\
-			shrp t0=val[N-1],val[N-index],shift;					\
-			br .memcpy_tail
-.memcpy_loops:
-	COPY(0, 1) /* no point special casing this---it doesn't go any faster without shrp */
-	COPY(8, 0)
-	COPY(16, 0)
-	COPY(24, 0)
-	COPY(32, 0)
-	COPY(40, 0)
-	COPY(48, 0)
-	COPY(56, 0)
-
-END(memcpy)
-EXPORT_SYMBOL(memcpy)
diff --git a/arch/ia64/lib/memcpy_mck.S b/arch/ia64/lib/memcpy_mck.S
deleted file mode 100644
index c0d4362217ae..000000000000
--- a/arch/ia64/lib/memcpy_mck.S
+++ /dev/null
@@ -1,659 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-/*
- * Itanium 2-optimized version of memcpy and copy_user function
- *
- * Inputs:
- * 	in0:	destination address
- *	in1:	source address
- *	in2:	number of bytes to copy
- * Output:
- *	for memcpy:    return dest
- * 	for copy_user: return 0 if success,
- *		       or number of byte NOT copied if error occurred.
- *
- * Copyright (C) 2002 Intel Corp.
- * Copyright (C) 2002 Ken Chen <kenneth.w.chen@intel.com>
- */
-#include <linux/export.h>
-#include <asm/asmmacro.h>
-#include <asm/page.h>
-
-#define EK(y...) EX(y)
-
-/* McKinley specific optimization */
-
-#define retval		r8
-#define saved_pfs	r31
-#define saved_lc	r10
-#define saved_pr	r11
-#define saved_in0	r14
-#define saved_in1	r15
-#define saved_in2	r16
-
-#define src0		r2
-#define src1		r3
-#define dst0		r17
-#define dst1		r18
-#define cnt		r9
-
-/* r19-r30 are temp for each code section */
-#define PREFETCH_DIST	8
-#define src_pre_mem	r19
-#define dst_pre_mem	r20
-#define src_pre_l2	r21
-#define dst_pre_l2	r22
-#define t1		r23
-#define t2		r24
-#define t3		r25
-#define t4		r26
-#define t5		t1	// alias!
-#define t6		t2	// alias!
-#define t7		t3	// alias!
-#define n8		r27
-#define t9		t5	// alias!
-#define t10		t4	// alias!
-#define t11		t7	// alias!
-#define t12		t6	// alias!
-#define t14		t10	// alias!
-#define t13		r28
-#define t15		r29
-#define tmp		r30
-
-/* defines for long_copy block */
-#define	A	0
-#define B	(PREFETCH_DIST)
-#define C	(B + PREFETCH_DIST)
-#define D	(C + 1)
-#define N	(D + 1)
-#define Nrot	((N + 7) & ~7)
-
-/* alias */
-#define in0		r32
-#define in1		r33
-#define in2		r34
-
-GLOBAL_ENTRY(memcpy)
-	and	r28=0x7,in0
-	and	r29=0x7,in1
-	mov	f6=f0
-	mov	retval=in0
-	br.cond.sptk .common_code
-	;;
-END(memcpy)
-EXPORT_SYMBOL(memcpy)
-GLOBAL_ENTRY(__copy_user)
-	.prologue
-// check dest alignment
-	and	r28=0x7,in0
-	and	r29=0x7,in1
-	mov	f6=f1
-	mov	saved_in0=in0	// save dest pointer
-	mov	saved_in1=in1	// save src pointer
-	mov	retval=r0	// initialize return value
-	;;
-.common_code:
-	cmp.gt	p15,p0=8,in2	// check for small size
-	cmp.ne	p13,p0=0,r28	// check dest alignment
-	cmp.ne	p14,p0=0,r29	// check src alignment
-	add	src0=0,in1
-	sub	r30=8,r28	// for .align_dest
-	mov	saved_in2=in2	// save len
-	;;
-	add	dst0=0,in0
-	add	dst1=1,in0	// dest odd index
-	cmp.le	p6,p0 = 1,r30	// for .align_dest
-(p15)	br.cond.dpnt .memcpy_short
-(p13)	br.cond.dpnt .align_dest
-(p14)	br.cond.dpnt .unaligned_src
-	;;
-
-// both dest and src are aligned on 8-byte boundary
-.aligned_src:
-	.save ar.pfs, saved_pfs
-	alloc	saved_pfs=ar.pfs,3,Nrot-3,0,Nrot
-	.save pr, saved_pr
-	mov	saved_pr=pr
-
-	shr.u	cnt=in2,7	// this much cache line
-	;;
-	cmp.lt	p6,p0=2*PREFETCH_DIST,cnt
-	cmp.lt	p7,p8=1,cnt
-	.save ar.lc, saved_lc
-	mov	saved_lc=ar.lc
-	.body
-	add	cnt=-1,cnt
-	add	src_pre_mem=0,in1	// prefetch src pointer
-	add	dst_pre_mem=0,in0	// prefetch dest pointer
-	;;
-(p7)	mov	ar.lc=cnt	// prefetch count
-(p8)	mov	ar.lc=r0
-(p6)	br.cond.dpnt .long_copy
-	;;
-
-.prefetch:
-	lfetch.fault	  [src_pre_mem], 128
-	lfetch.fault.excl [dst_pre_mem], 128
-	br.cloop.dptk.few .prefetch
-	;;
-
-.medium_copy:
-	and	tmp=31,in2	// copy length after iteration
-	shr.u	r29=in2,5	// number of 32-byte iteration
-	add	dst1=8,dst0	// 2nd dest pointer
-	;;
-	add	cnt=-1,r29	// ctop iteration adjustment
-	cmp.eq	p10,p0=r29,r0	// do we really need to loop?
-	add	src1=8,src0	// 2nd src pointer
-	cmp.le	p6,p0=8,tmp
-	;;
-	cmp.le	p7,p0=16,tmp
-	mov	ar.lc=cnt	// loop setup
-	cmp.eq	p16,p17 = r0,r0
-	mov	ar.ec=2
-(p10)	br.dpnt.few .aligned_src_tail
-	;;
-	TEXT_ALIGN(32)
-1:
-EX(.ex_handler, (p16)	ld8	r34=[src0],16)
-EK(.ex_handler, (p16)	ld8	r38=[src1],16)
-EX(.ex_handler, (p17)	st8	[dst0]=r33,16)
-EK(.ex_handler, (p17)	st8	[dst1]=r37,16)
-	;;
-EX(.ex_handler, (p16)	ld8	r32=[src0],16)
-EK(.ex_handler, (p16)	ld8	r36=[src1],16)
-EX(.ex_handler, (p16)	st8	[dst0]=r34,16)
-EK(.ex_handler, (p16)	st8	[dst1]=r38,16)
-	br.ctop.dptk.few 1b
-	;;
-
-.aligned_src_tail:
-EX(.ex_handler, (p6)	ld8	t1=[src0])
-	mov	ar.lc=saved_lc
-	mov	ar.pfs=saved_pfs
-EX(.ex_hndlr_s, (p7)	ld8	t2=[src1],8)
-	cmp.le	p8,p0=24,tmp
-	and	r21=-8,tmp
-	;;
-EX(.ex_hndlr_s, (p8)	ld8	t3=[src1])
-EX(.ex_handler, (p6)	st8	[dst0]=t1)	// store byte 1
-	and	in2=7,tmp	// remaining length
-EX(.ex_hndlr_d, (p7)	st8	[dst1]=t2,8)	// store byte 2
-	add	src0=src0,r21	// setting up src pointer
-	add	dst0=dst0,r21	// setting up dest pointer
-	;;
-EX(.ex_handler, (p8)	st8	[dst1]=t3)	// store byte 3
-	mov	pr=saved_pr,-1
-	br.dptk.many .memcpy_short
-	;;
-
-/* code taken from copy_page_mck */
-.long_copy:
-	.rotr v[2*PREFETCH_DIST]
-	.rotp p[N]
-
-	mov src_pre_mem = src0
-	mov pr.rot = 0x10000
-	mov ar.ec = 1				// special unrolled loop
-
-	mov dst_pre_mem = dst0
-
-	add src_pre_l2 = 8*8, src0
-	add dst_pre_l2 = 8*8, dst0
-	;;
-	add src0 = 8, src_pre_mem		// first t1 src
-	mov ar.lc = 2*PREFETCH_DIST - 1
-	shr.u cnt=in2,7				// number of lines
-	add src1 = 3*8, src_pre_mem		// first t3 src
-	add dst0 = 8, dst_pre_mem		// first t1 dst
-	add dst1 = 3*8, dst_pre_mem		// first t3 dst
-	;;
-	and tmp=127,in2				// remaining bytes after this block
-	add cnt = -(2*PREFETCH_DIST) - 1, cnt
-	// same as .line_copy loop, but with all predicated-off instructions removed:
-.prefetch_loop:
-EX(.ex_hndlr_lcpy_1, (p[A])	ld8 v[A] = [src_pre_mem], 128)		// M0
-EK(.ex_hndlr_lcpy_1, (p[B])	st8 [dst_pre_mem] = v[B], 128)		// M2
-	br.ctop.sptk .prefetch_loop
-	;;
-	cmp.eq p16, p0 = r0, r0			// reset p16 to 1
-	mov ar.lc = cnt
-	mov ar.ec = N				// # of stages in pipeline
-	;;
-.line_copy:
-EX(.ex_handler,	(p[D])	ld8 t2 = [src0], 3*8)			// M0
-EK(.ex_handler,	(p[D])	ld8 t4 = [src1], 3*8)			// M1
-EX(.ex_handler_lcpy,	(p[B])	st8 [dst_pre_mem] = v[B], 128)		// M2 prefetch dst from memory
-EK(.ex_handler_lcpy,	(p[D])	st8 [dst_pre_l2] = n8, 128)		// M3 prefetch dst from L2
-	;;
-EX(.ex_handler_lcpy,	(p[A])	ld8 v[A] = [src_pre_mem], 128)		// M0 prefetch src from memory
-EK(.ex_handler_lcpy,	(p[C])	ld8 n8 = [src_pre_l2], 128)		// M1 prefetch src from L2
-EX(.ex_handler,	(p[D])	st8 [dst0] =  t1, 8)			// M2
-EK(.ex_handler,	(p[D])	st8 [dst1] =  t3, 8)			// M3
-	;;
-EX(.ex_handler,	(p[D])	ld8  t5 = [src0], 8)
-EK(.ex_handler,	(p[D])	ld8  t7 = [src1], 3*8)
-EX(.ex_handler,	(p[D])	st8 [dst0] =  t2, 3*8)
-EK(.ex_handler,	(p[D])	st8 [dst1] =  t4, 3*8)
-	;;
-EX(.ex_handler,	(p[D])	ld8  t6 = [src0], 3*8)
-EK(.ex_handler,	(p[D])	ld8 t10 = [src1], 8)
-EX(.ex_handler,	(p[D])	st8 [dst0] =  t5, 8)
-EK(.ex_handler,	(p[D])	st8 [dst1] =  t7, 3*8)
-	;;
-EX(.ex_handler,	(p[D])	ld8  t9 = [src0], 3*8)
-EK(.ex_handler,	(p[D])	ld8 t11 = [src1], 3*8)
-EX(.ex_handler,	(p[D])	st8 [dst0] =  t6, 3*8)
-EK(.ex_handler,	(p[D])	st8 [dst1] = t10, 8)
-	;;
-EX(.ex_handler,	(p[D])	ld8 t12 = [src0], 8)
-EK(.ex_handler,	(p[D])	ld8 t14 = [src1], 8)
-EX(.ex_handler,	(p[D])	st8 [dst0] =  t9, 3*8)
-EK(.ex_handler,	(p[D])	st8 [dst1] = t11, 3*8)
-	;;
-EX(.ex_handler,	(p[D])	ld8 t13 = [src0], 4*8)
-EK(.ex_handler,	(p[D])	ld8 t15 = [src1], 4*8)
-EX(.ex_handler,	(p[D])	st8 [dst0] = t12, 8)
-EK(.ex_handler,	(p[D])	st8 [dst1] = t14, 8)
-	;;
-EX(.ex_handler,	(p[C])	ld8  t1 = [src0], 8)
-EK(.ex_handler,	(p[C])	ld8  t3 = [src1], 8)
-EX(.ex_handler,	(p[D])	st8 [dst0] = t13, 4*8)
-EK(.ex_handler,	(p[D])	st8 [dst1] = t15, 4*8)
-	br.ctop.sptk .line_copy
-	;;
-
-	add dst0=-8,dst0
-	add src0=-8,src0
-	mov in2=tmp
-	.restore sp
-	br.sptk.many .medium_copy
-	;;
-
-#define BLOCK_SIZE	128*32
-#define blocksize	r23
-#define curlen		r24
-
-// dest is on 8-byte boundary, src is not. We need to do
-// ld8-ld8, shrp, then st8.  Max 8 byte copy per cycle.
-.unaligned_src:
-	.prologue
-	.save ar.pfs, saved_pfs
-	alloc	saved_pfs=ar.pfs,3,5,0,8
-	.save ar.lc, saved_lc
-	mov	saved_lc=ar.lc
-	.save pr, saved_pr
-	mov	saved_pr=pr
-	.body
-.4k_block:
-	mov	saved_in0=dst0	// need to save all input arguments
-	mov	saved_in2=in2
-	mov	blocksize=BLOCK_SIZE
-	;;
-	cmp.lt	p6,p7=blocksize,in2
-	mov	saved_in1=src0
-	;;
-(p6)	mov	in2=blocksize
-	;;
-	shr.u	r21=in2,7	// this much cache line
-	shr.u	r22=in2,4	// number of 16-byte iteration
-	and	curlen=15,in2	// copy length after iteration
-	and	r30=7,src0	// source alignment
-	;;
-	cmp.lt	p7,p8=1,r21
-	add	cnt=-1,r21
-	;;
-
-	add	src_pre_mem=0,src0	// prefetch src pointer
-	add	dst_pre_mem=0,dst0	// prefetch dest pointer
-	and	src0=-8,src0		// 1st src pointer
-(p7)	mov	ar.lc = cnt
-(p8)	mov	ar.lc = r0
-	;;
-	TEXT_ALIGN(32)
-1:	lfetch.fault	  [src_pre_mem], 128
-	lfetch.fault.excl [dst_pre_mem], 128
-	br.cloop.dptk.few 1b
-	;;
-
-	shladd	dst1=r22,3,dst0	// 2nd dest pointer
-	shladd	src1=r22,3,src0	// 2nd src pointer
-	cmp.eq	p8,p9=r22,r0	// do we really need to loop?
-	cmp.le	p6,p7=8,curlen;	// have at least 8 byte remaining?
-	add	cnt=-1,r22	// ctop iteration adjustment
-	;;
-EX(.ex_handler, (p9)	ld8	r33=[src0],8)	// loop primer
-EK(.ex_handler, (p9)	ld8	r37=[src1],8)
-(p8)	br.dpnt.few .noloop
-	;;
-
-// The jump address is calculated based on src alignment. The COPYU
-// macro below need to confine its size to power of two, so an entry
-// can be caulated using shl instead of an expensive multiply. The
-// size is then hard coded by the following #define to match the
-// actual size.  This make it somewhat tedious when COPYU macro gets
-// changed and this need to be adjusted to match.
-#define LOOP_SIZE 6
-1:
-	mov	r29=ip		// jmp_table thread
-	mov	ar.lc=cnt
-	;;
-	add	r29=.jump_table - 1b - (.jmp1-.jump_table), r29
-	shl	r28=r30, LOOP_SIZE	// jmp_table thread
-	mov	ar.ec=2		// loop setup
-	;;
-	add	r29=r29,r28		// jmp_table thread
-	cmp.eq	p16,p17=r0,r0
-	;;
-	mov	b6=r29			// jmp_table thread
-	;;
-	br.cond.sptk.few b6
-
-// for 8-15 byte case
-// We will skip the loop, but need to replicate the side effect
-// that the loop produces.
-.noloop:
-EX(.ex_handler, (p6)	ld8	r37=[src1],8)
-	add	src0=8,src0
-(p6)	shl	r25=r30,3
-	;;
-EX(.ex_handler, (p6)	ld8	r27=[src1])
-(p6)	shr.u	r28=r37,r25
-(p6)	sub	r26=64,r25
-	;;
-(p6)	shl	r27=r27,r26
-	;;
-(p6)	or	r21=r28,r27
-
-.unaligned_src_tail:
-/* check if we have more than blocksize to copy, if so go back */
-	cmp.gt	p8,p0=saved_in2,blocksize
-	;;
-(p8)	add	dst0=saved_in0,blocksize
-(p8)	add	src0=saved_in1,blocksize
-(p8)	sub	in2=saved_in2,blocksize
-(p8)	br.dpnt	.4k_block
-	;;
-
-/* we have up to 15 byte to copy in the tail.
- * part of work is already done in the jump table code
- * we are at the following state.
- * src side:
- * 
- *   xxxxxx xx                   <----- r21 has xxxxxxxx already
- * -------- -------- --------
- * 0        8        16
- *          ^
- *          |
- *          src1
- * 
- * dst
- * -------- -------- --------
- * ^
- * |
- * dst1
- */
-EX(.ex_handler, (p6)	st8	[dst1]=r21,8)	// more than 8 byte to copy
-(p6)	add	curlen=-8,curlen	// update length
-	mov	ar.pfs=saved_pfs
-	;;
-	mov	ar.lc=saved_lc
-	mov	pr=saved_pr,-1
-	mov	in2=curlen	// remaining length
-	mov	dst0=dst1	// dest pointer
-	add	src0=src1,r30	// forward by src alignment
-	;;
-
-// 7 byte or smaller.
-.memcpy_short:
-	cmp.le	p8,p9   = 1,in2
-	cmp.le	p10,p11 = 2,in2
-	cmp.le	p12,p13 = 3,in2
-	cmp.le	p14,p15 = 4,in2
-	add	src1=1,src0	// second src pointer
-	add	dst1=1,dst0	// second dest pointer
-	;;
-
-EX(.ex_handler_short, (p8)	ld1	t1=[src0],2)
-EK(.ex_handler_short, (p10)	ld1	t2=[src1],2)
-(p9)	br.ret.dpnt rp		// 0 byte copy
-	;;
-
-EX(.ex_handler_short, (p8)	st1	[dst0]=t1,2)
-EK(.ex_handler_short, (p10)	st1	[dst1]=t2,2)
-(p11)	br.ret.dpnt rp		// 1 byte copy
-
-EX(.ex_handler_short, (p12)	ld1	t3=[src0],2)
-EK(.ex_handler_short, (p14)	ld1	t4=[src1],2)
-(p13)	br.ret.dpnt rp		// 2 byte copy
-	;;
-
-	cmp.le	p6,p7   = 5,in2
-	cmp.le	p8,p9   = 6,in2
-	cmp.le	p10,p11 = 7,in2
-
-EX(.ex_handler_short, (p12)	st1	[dst0]=t3,2)
-EK(.ex_handler_short, (p14)	st1	[dst1]=t4,2)
-(p15)	br.ret.dpnt rp		// 3 byte copy
-	;;
-
-EX(.ex_handler_short, (p6)	ld1	t5=[src0],2)
-EK(.ex_handler_short, (p8)	ld1	t6=[src1],2)
-(p7)	br.ret.dpnt rp		// 4 byte copy
-	;;
-
-EX(.ex_handler_short, (p6)	st1	[dst0]=t5,2)
-EK(.ex_handler_short, (p8)	st1	[dst1]=t6,2)
-(p9)	br.ret.dptk rp		// 5 byte copy
-
-EX(.ex_handler_short, (p10)	ld1	t7=[src0],2)
-(p11)	br.ret.dptk rp		// 6 byte copy
-	;;
-
-EX(.ex_handler_short, (p10)	st1	[dst0]=t7,2)
-	br.ret.dptk rp		// done all cases
-
-
-/* Align dest to nearest 8-byte boundary. We know we have at
- * least 7 bytes to copy, enough to crawl to 8-byte boundary.
- * Actual number of byte to crawl depend on the dest alignment.
- * 7 byte or less is taken care at .memcpy_short
-
- * src0 - source even index
- * src1 - source  odd index
- * dst0 - dest even index
- * dst1 - dest  odd index
- * r30  - distance to 8-byte boundary
- */
-
-.align_dest:
-	add	src1=1,in1	// source odd index
-	cmp.le	p7,p0 = 2,r30	// for .align_dest
-	cmp.le	p8,p0 = 3,r30	// for .align_dest
-EX(.ex_handler_short, (p6)	ld1	t1=[src0],2)
-	cmp.le	p9,p0 = 4,r30	// for .align_dest
-	cmp.le	p10,p0 = 5,r30
-	;;
-EX(.ex_handler_short, (p7)	ld1	t2=[src1],2)
-EK(.ex_handler_short, (p8)	ld1	t3=[src0],2)
-	cmp.le	p11,p0 = 6,r30
-EX(.ex_handler_short, (p6)	st1	[dst0] = t1,2)
-	cmp.le	p12,p0 = 7,r30
-	;;
-EX(.ex_handler_short, (p9)	ld1	t4=[src1],2)
-EK(.ex_handler_short, (p10)	ld1	t5=[src0],2)
-EX(.ex_handler_short, (p7)	st1	[dst1] = t2,2)
-EK(.ex_handler_short, (p8)	st1	[dst0] = t3,2)
-	;;
-EX(.ex_handler_short, (p11)	ld1	t6=[src1],2)
-EK(.ex_handler_short, (p12)	ld1	t7=[src0],2)
-	cmp.eq	p6,p7=r28,r29
-EX(.ex_handler_short, (p9)	st1	[dst1] = t4,2)
-EK(.ex_handler_short, (p10)	st1	[dst0] = t5,2)
-	sub	in2=in2,r30
-	;;
-EX(.ex_handler_short, (p11)	st1	[dst1] = t6,2)
-EK(.ex_handler_short, (p12)	st1	[dst0] = t7)
-	add	dst0=in0,r30	// setup arguments
-	add	src0=in1,r30
-(p6)	br.cond.dptk .aligned_src
-(p7)	br.cond.dpnt .unaligned_src
-	;;
-
-/* main loop body in jump table format */
-#define COPYU(shift)									\
-1:											\
-EX(.ex_handler,  (p16)	ld8	r32=[src0],8);		/* 1 */				\
-EK(.ex_handler,  (p16)	ld8	r36=[src1],8);						\
-		 (p17)	shrp	r35=r33,r34,shift;;	/* 1 */				\
-EX(.ex_handler,  (p6)	ld8	r22=[src1]);	/* common, prime for tail section */	\
-		 nop.m	0;								\
-		 (p16)	shrp	r38=r36,r37,shift;					\
-EX(.ex_handler,  (p17)	st8	[dst0]=r35,8);		/* 1 */				\
-EK(.ex_handler,  (p17)	st8	[dst1]=r39,8);						\
-		 br.ctop.dptk.few 1b;;							\
-		 (p7)	add	src1=-8,src1;	/* back out for <8 byte case */		\
-		 shrp	r21=r22,r38,shift;	/* speculative work */			\
-		 br.sptk.few .unaligned_src_tail /* branch out of jump table */		\
-		 ;;
-	TEXT_ALIGN(32)
-.jump_table:
-	COPYU(8)	// unaligned cases
-.jmp1:
-	COPYU(16)
-	COPYU(24)
-	COPYU(32)
-	COPYU(40)
-	COPYU(48)
-	COPYU(56)
-
-#undef A
-#undef B
-#undef C
-#undef D
-
-/*
- * Due to lack of local tag support in gcc 2.x assembler, it is not clear which
- * instruction failed in the bundle.  The exception algorithm is that we
- * first figure out the faulting address, then detect if there is any
- * progress made on the copy, if so, redo the copy from last known copied
- * location up to the faulting address (exclusive). In the copy_from_user
- * case, remaining byte in kernel buffer will be zeroed.
- *
- * Take copy_from_user as an example, in the code there are multiple loads
- * in a bundle and those multiple loads could span over two pages, the
- * faulting address is calculated as page_round_down(max(src0, src1)).
- * This is based on knowledge that if we can access one byte in a page, we
- * can access any byte in that page.
- *
- * predicate used in the exception handler:
- * p6-p7: direction
- * p10-p11: src faulting addr calculation
- * p12-p13: dst faulting addr calculation
- */
-
-#define A	r19
-#define B	r20
-#define C	r21
-#define D	r22
-#define F	r28
-
-#define saved_retval	loc0
-#define saved_rtlink	loc1
-#define saved_pfs_stack	loc2
-
-.ex_hndlr_s:
-	add	src0=8,src0
-	br.sptk .ex_handler
-	;;
-.ex_hndlr_d:
-	add	dst0=8,dst0
-	br.sptk .ex_handler
-	;;
-.ex_hndlr_lcpy_1:
-	mov	src1=src_pre_mem
-	mov	dst1=dst_pre_mem
-	cmp.gtu	p10,p11=src_pre_mem,saved_in1
-	cmp.gtu	p12,p13=dst_pre_mem,saved_in0
-	;;
-(p10)	add	src0=8,saved_in1
-(p11)	mov	src0=saved_in1
-(p12)	add	dst0=8,saved_in0
-(p13)	mov	dst0=saved_in0
-	br.sptk	.ex_handler
-.ex_handler_lcpy:
-	// in line_copy block, the preload addresses should always ahead
-	// of the other two src/dst pointers.  Furthermore, src1/dst1 should
-	// always ahead of src0/dst0.
-	mov	src1=src_pre_mem
-	mov	dst1=dst_pre_mem
-.ex_handler:
-	mov	pr=saved_pr,-1		// first restore pr, lc, and pfs
-	mov	ar.lc=saved_lc
-	mov	ar.pfs=saved_pfs
-	;;
-.ex_handler_short: // fault occurred in these sections didn't change pr, lc, pfs
-	cmp.ltu	p6,p7=saved_in0, saved_in1	// get the copy direction
-	cmp.ltu	p10,p11=src0,src1
-	cmp.ltu	p12,p13=dst0,dst1
-	fcmp.eq	p8,p0=f6,f0		// is it memcpy?
-	mov	tmp = dst0
-	;;
-(p11)	mov	src1 = src0		// pick the larger of the two
-(p13)	mov	dst0 = dst1		// make dst0 the smaller one
-(p13)	mov	dst1 = tmp		// and dst1 the larger one
-	;;
-(p6)	dep	F = r0,dst1,0,PAGE_SHIFT // usr dst round down to page boundary
-(p7)	dep	F = r0,src1,0,PAGE_SHIFT // usr src round down to page boundary
-	;;
-(p6)	cmp.le	p14,p0=dst0,saved_in0	// no progress has been made on store
-(p7)	cmp.le	p14,p0=src0,saved_in1	// no progress has been made on load
-	mov	retval=saved_in2
-(p8)	ld1	tmp=[src1]		// force an oops for memcpy call
-(p8)	st1	[dst1]=r0		// force an oops for memcpy call
-(p14)	br.ret.sptk.many rp
-
-/*
- * The remaining byte to copy is calculated as:
- *
- * A =	(faulting_addr - orig_src)	-> len to faulting ld address
- *	or 
- * 	(faulting_addr - orig_dst)	-> len to faulting st address
- * B =	(cur_dst - orig_dst)		-> len copied so far
- * C =	A - B				-> len need to be copied
- * D =	orig_len - A			-> len need to be left along
- */
-(p6)	sub	A = F, saved_in0
-(p7)	sub	A = F, saved_in1
-	clrrrb
-	;;
-	alloc	saved_pfs_stack=ar.pfs,3,3,3,0
-	cmp.lt	p8,p0=A,r0
-	sub	B = dst0, saved_in0	// how many byte copied so far
-	;;
-(p8)	mov	A = 0;			// A shouldn't be negative, cap it
-	;;
-	sub	C = A, B
-	sub	D = saved_in2, A
-	;;
-	cmp.gt	p8,p0=C,r0		// more than 1 byte?
-	mov	r8=0
-	mov	saved_retval = D
-	mov	saved_rtlink = b0
-
-	add	out0=saved_in0, B
-	add	out1=saved_in1, B
-	mov	out2=C
-(p8)	br.call.sptk.few b0=__copy_user	// recursive call
-	;;
-
-	add	saved_retval=saved_retval,r8	// above might return non-zero value
-	;;
-
-	mov	retval=saved_retval
-	mov	ar.pfs=saved_pfs_stack
-	mov	b0=saved_rtlink
-	br.ret.sptk.many rp
-
-/* end of McKinley specific optimization */
-END(__copy_user)
-EXPORT_SYMBOL(__copy_user)
diff --git a/arch/ia64/lib/memset.S b/arch/ia64/lib/memset.S
deleted file mode 100644
index 552c5c7e4d06..000000000000
--- a/arch/ia64/lib/memset.S
+++ /dev/null
@@ -1,365 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-/* Optimized version of the standard memset() function.
-
-   Copyright (c) 2002 Hewlett-Packard Co/CERN
-	Sverre Jarp <Sverre.Jarp@cern.ch>
-
-   Return: dest
-
-   Inputs:
-        in0:    dest
-        in1:    value
-        in2:    count
-
-   The algorithm is fairly straightforward: set byte by byte until we
-   we get to a 16B-aligned address, then loop on 128 B chunks using an
-   early store as prefetching, then loop on 32B chucks, then clear remaining
-   words, finally clear remaining bytes.
-   Since a stf.spill f0 can store 16B in one go, we use this instruction
-   to get peak speed when value = 0.  */
-
-#include <linux/export.h>
-#include <asm/asmmacro.h>
-#undef ret
-
-#define dest		in0
-#define value		in1
-#define	cnt		in2
-
-#define tmp		r31
-#define save_lc		r30
-#define ptr0		r29
-#define ptr1		r28
-#define ptr2		r27
-#define ptr3		r26
-#define ptr9 		r24
-#define	loopcnt		r23
-#define linecnt		r22
-#define bytecnt		r21
-
-#define fvalue		f6
-
-// This routine uses only scratch predicate registers (p6 - p15)
-#define p_scr		p6			// default register for same-cycle branches
-#define p_nz		p7
-#define p_zr		p8
-#define p_unalgn	p9
-#define p_y		p11
-#define p_n		p12
-#define p_yy		p13
-#define p_nn		p14
-
-#define MIN1		15
-#define MIN1P1HALF	8
-#define LINE_SIZE	128
-#define LSIZE_SH        7			// shift amount
-#define PREF_AHEAD	8
-
-GLOBAL_ENTRY(memset)
-{ .mmi
-	.prologue
-	alloc	tmp = ar.pfs, 3, 0, 0, 0
-	lfetch.nt1 [dest]			//
-	.save   ar.lc, save_lc
-	mov.i	save_lc = ar.lc
-	.body
-} { .mmi
-	mov	ret0 = dest			// return value
-	cmp.ne	p_nz, p_zr = value, r0		// use stf.spill if value is zero
-	cmp.eq	p_scr, p0 = cnt, r0
-;; }
-{ .mmi
-	and	ptr2 = -(MIN1+1), dest		// aligned address
-	and	tmp = MIN1, dest		// prepare to check for correct alignment
-	tbit.nz p_y, p_n = dest, 0		// Do we have an odd address? (M_B_U)
-} { .mib
-	mov	ptr1 = dest
-	mux1	value = value, @brcst		// create 8 identical bytes in word
-(p_scr)	br.ret.dpnt.many rp			// return immediately if count = 0
-;; }
-{ .mib
-	cmp.ne	p_unalgn, p0 = tmp, r0		//
-} { .mib
-	sub	bytecnt = (MIN1+1), tmp		// NB: # of bytes to move is 1 higher than loopcnt
-	cmp.gt	p_scr, p0 = 16, cnt		// is it a minimalistic task?
-(p_scr)	br.cond.dptk.many .move_bytes_unaligned	// go move just a few (M_B_U)
-;; }
-{ .mmi
-(p_unalgn) add	ptr1 = (MIN1+1), ptr2		// after alignment
-(p_unalgn) add	ptr2 = MIN1P1HALF, ptr2		// after alignment
-(p_unalgn) tbit.nz.unc p_y, p_n = bytecnt, 3	// should we do a st8 ?
-;; }
-{ .mib
-(p_y)	add	cnt = -8, cnt			//
-(p_unalgn) tbit.nz.unc p_yy, p_nn = bytecnt, 2	// should we do a st4 ?
-} { .mib
-(p_y)	st8	[ptr2] = value,-4		//
-(p_n)	add	ptr2 = 4, ptr2			//
-;; }
-{ .mib
-(p_yy)	add	cnt = -4, cnt			//
-(p_unalgn) tbit.nz.unc p_y, p_n = bytecnt, 1	// should we do a st2 ?
-} { .mib
-(p_yy)	st4	[ptr2] = value,-2		//
-(p_nn)	add	ptr2 = 2, ptr2			//
-;; }
-{ .mmi
-	mov	tmp = LINE_SIZE+1		// for compare
-(p_y)	add	cnt = -2, cnt			//
-(p_unalgn) tbit.nz.unc p_yy, p_nn = bytecnt, 0	// should we do a st1 ?
-} { .mmi
-	setf.sig fvalue=value			// transfer value to FLP side
-(p_y)	st2	[ptr2] = value,-1		//
-(p_n)	add	ptr2 = 1, ptr2			//
-;; }
-
-{ .mmi
-(p_yy)	st1	[ptr2] = value 			//
-  	cmp.gt	p_scr, p0 = tmp, cnt		// is it a minimalistic task?
-} { .mbb
-(p_yy)	add	cnt = -1, cnt			//
-(p_scr)	br.cond.dpnt.many .fraction_of_line	// go move just a few
-;; }
-
-{ .mib
-	nop.m 0
-	shr.u	linecnt = cnt, LSIZE_SH
-(p_zr)	br.cond.dptk.many .l1b			// Jump to use stf.spill
-;; }
-
-	TEXT_ALIGN(32) // --------------------- //  L1A: store ahead into cache lines; fill later
-{ .mmi
-	and	tmp = -(LINE_SIZE), cnt		// compute end of range
-	mov	ptr9 = ptr1			// used for prefetching
-	and	cnt = (LINE_SIZE-1), cnt	// remainder
-} { .mmi
-	mov	loopcnt = PREF_AHEAD-1		// default prefetch loop
-	cmp.gt	p_scr, p0 = PREF_AHEAD, linecnt	// check against actual value
-;; }
-{ .mmi
-(p_scr)	add	loopcnt = -1, linecnt		//
-	add	ptr2 = 8, ptr1			// start of stores (beyond prefetch stores)
-	add	ptr1 = tmp, ptr1		// first address beyond total range
-;; }
-{ .mmi
-	add	tmp = -1, linecnt		// next loop count
-	mov.i	ar.lc = loopcnt			//
-;; }
-.pref_l1a:
-{ .mib
-	stf8 [ptr9] = fvalue, 128		// Do stores one cache line apart
-	nop.i	0
-	br.cloop.dptk.few .pref_l1a
-;; }
-{ .mmi
-	add	ptr0 = 16, ptr2			// Two stores in parallel
-	mov.i	ar.lc = tmp			//
-;; }
-.l1ax:
- { .mmi
-	stf8 [ptr2] = fvalue, 8
-	stf8 [ptr0] = fvalue, 8
- ;; }
- { .mmi
-	stf8 [ptr2] = fvalue, 24
-	stf8 [ptr0] = fvalue, 24
- ;; }
- { .mmi
-	stf8 [ptr2] = fvalue, 8
-	stf8 [ptr0] = fvalue, 8
- ;; }
- { .mmi
-	stf8 [ptr2] = fvalue, 24
-	stf8 [ptr0] = fvalue, 24
- ;; }
- { .mmi
-	stf8 [ptr2] = fvalue, 8
-	stf8 [ptr0] = fvalue, 8
- ;; }
- { .mmi
-	stf8 [ptr2] = fvalue, 24
-	stf8 [ptr0] = fvalue, 24
- ;; }
- { .mmi
-	stf8 [ptr2] = fvalue, 8
-	stf8 [ptr0] = fvalue, 32
- 	cmp.lt	p_scr, p0 = ptr9, ptr1		// do we need more prefetching?
- ;; }
-{ .mmb
-	stf8 [ptr2] = fvalue, 24
-(p_scr)	stf8 [ptr9] = fvalue, 128
-	br.cloop.dptk.few .l1ax
-;; }
-{ .mbb
-	cmp.le  p_scr, p0 = 8, cnt		// just a few bytes left ?
-(p_scr) br.cond.dpnt.many  .fraction_of_line	// Branch no. 2
-	br.cond.dpnt.many  .move_bytes_from_alignment	// Branch no. 3
-;; }
-
-	TEXT_ALIGN(32)
-.l1b:	// ------------------------------------ //  L1B: store ahead into cache lines; fill later
-{ .mmi
-	and	tmp = -(LINE_SIZE), cnt		// compute end of range
-	mov	ptr9 = ptr1			// used for prefetching
-	and	cnt = (LINE_SIZE-1), cnt	// remainder
-} { .mmi
-	mov	loopcnt = PREF_AHEAD-1		// default prefetch loop
-	cmp.gt	p_scr, p0 = PREF_AHEAD, linecnt	// check against actual value
-;; }
-{ .mmi
-(p_scr)	add	loopcnt = -1, linecnt
-	add	ptr2 = 16, ptr1			// start of stores (beyond prefetch stores)
-	add	ptr1 = tmp, ptr1		// first address beyond total range
-;; }
-{ .mmi
-	add	tmp = -1, linecnt		// next loop count
-	mov.i	ar.lc = loopcnt
-;; }
-.pref_l1b:
-{ .mib
-	stf.spill [ptr9] = f0, 128		// Do stores one cache line apart
-	nop.i   0
-	br.cloop.dptk.few .pref_l1b
-;; }
-{ .mmi
-	add	ptr0 = 16, ptr2			// Two stores in parallel
-	mov.i	ar.lc = tmp
-;; }
-.l1bx:
- { .mmi
-	stf.spill [ptr2] = f0, 32
-	stf.spill [ptr0] = f0, 32
- ;; }
- { .mmi
-	stf.spill [ptr2] = f0, 32
-	stf.spill [ptr0] = f0, 32
- ;; }
- { .mmi
-	stf.spill [ptr2] = f0, 32
-	stf.spill [ptr0] = f0, 64
- 	cmp.lt	p_scr, p0 = ptr9, ptr1		// do we need more prefetching?
- ;; }
-{ .mmb
-	stf.spill [ptr2] = f0, 32
-(p_scr)	stf.spill [ptr9] = f0, 128
-	br.cloop.dptk.few .l1bx
-;; }
-{ .mib
-	cmp.gt  p_scr, p0 = 8, cnt		// just a few bytes left ?
-(p_scr)	br.cond.dpnt.many  .move_bytes_from_alignment	//
-;; }
-
-.fraction_of_line:
-{ .mib
-	add	ptr2 = 16, ptr1
-	shr.u	loopcnt = cnt, 5   		// loopcnt = cnt / 32
-;; }
-{ .mib
-	cmp.eq	p_scr, p0 = loopcnt, r0
-	add	loopcnt = -1, loopcnt
-(p_scr)	br.cond.dpnt.many .store_words
-;; }
-{ .mib
-	and	cnt = 0x1f, cnt			// compute the remaining cnt
-	mov.i   ar.lc = loopcnt
-;; }
-	TEXT_ALIGN(32)
-.l2:	// ------------------------------------ //  L2A:  store 32B in 2 cycles
-{ .mmb
-	stf8	[ptr1] = fvalue, 8
-	stf8	[ptr2] = fvalue, 8
-;; } { .mmb
-	stf8	[ptr1] = fvalue, 24
-	stf8	[ptr2] = fvalue, 24
-	br.cloop.dptk.many .l2
-;; }
-.store_words:
-{ .mib
-	cmp.gt	p_scr, p0 = 8, cnt		// just a few bytes left ?
-(p_scr)	br.cond.dpnt.many .move_bytes_from_alignment	// Branch
-;; }
-
-{ .mmi
-	stf8	[ptr1] = fvalue, 8		// store
-	cmp.le	p_y, p_n = 16, cnt
-	add	cnt = -8, cnt			// subtract
-;; }
-{ .mmi
-(p_y)	stf8	[ptr1] = fvalue, 8		// store
-(p_y)	cmp.le.unc p_yy, p_nn = 16, cnt
-(p_y)	add	cnt = -8, cnt			// subtract
-;; }
-{ .mmi						// store
-(p_yy)	stf8	[ptr1] = fvalue, 8
-(p_yy)	add	cnt = -8, cnt			// subtract
-;; }
-
-.move_bytes_from_alignment:
-{ .mib
-	cmp.eq	p_scr, p0 = cnt, r0
-	tbit.nz.unc p_y, p0 = cnt, 2		// should we terminate with a st4 ?
-(p_scr)	br.cond.dpnt.few .restore_and_exit
-;; }
-{ .mib
-(p_y)	st4	[ptr1] = value,4
-	tbit.nz.unc p_yy, p0 = cnt, 1		// should we terminate with a st2 ?
-;; }
-{ .mib
-(p_yy)	st2	[ptr1] = value,2
-	tbit.nz.unc p_y, p0 = cnt, 0		// should we terminate with a st1 ?
-;; }
-
-{ .mib
-(p_y)	st1	[ptr1] = value
-;; }
-.restore_and_exit:
-{ .mib
-	nop.m	0
-	mov.i	ar.lc = save_lc
-	br.ret.sptk.many rp
-;; }
-
-.move_bytes_unaligned:
-{ .mmi
-       .pred.rel "mutex",p_y, p_n
-       .pred.rel "mutex",p_yy, p_nn
-(p_n)	cmp.le  p_yy, p_nn = 4, cnt
-(p_y)	cmp.le  p_yy, p_nn = 5, cnt
-(p_n)	add	ptr2 = 2, ptr1
-} { .mmi
-(p_y)	add	ptr2 = 3, ptr1
-(p_y)	st1	[ptr1] = value, 1		// fill 1 (odd-aligned) byte [15, 14 (or less) left]
-(p_y)	add	cnt = -1, cnt
-;; }
-{ .mmi
-(p_yy)	cmp.le.unc p_y, p0 = 8, cnt
-	add	ptr3 = ptr1, cnt		// prepare last store
-	mov.i	ar.lc = save_lc
-} { .mmi
-(p_yy)	st2	[ptr1] = value, 4		// fill 2 (aligned) bytes
-(p_yy)	st2	[ptr2] = value, 4		// fill 2 (aligned) bytes [11, 10 (o less) left]
-(p_yy)	add	cnt = -4, cnt
-;; }
-{ .mmi
-(p_y)	cmp.le.unc p_yy, p0 = 8, cnt
-	add	ptr3 = -1, ptr3			// last store
-	tbit.nz p_scr, p0 = cnt, 1		// will there be a st2 at the end ?
-} { .mmi
-(p_y)	st2	[ptr1] = value, 4		// fill 2 (aligned) bytes
-(p_y)	st2	[ptr2] = value, 4		// fill 2 (aligned) bytes [7, 6 (or less) left]
-(p_y)	add	cnt = -4, cnt
-;; }
-{ .mmi
-(p_yy)	st2	[ptr1] = value, 4		// fill 2 (aligned) bytes
-(p_yy)	st2	[ptr2] = value, 4		// fill 2 (aligned) bytes [3, 2 (or less) left]
-	tbit.nz p_y, p0 = cnt, 0		// will there be a st1 at the end ?
-} { .mmi
-(p_yy)	add	cnt = -4, cnt
-;; }
-{ .mmb
-(p_scr)	st2	[ptr1] = value			// fill 2 (aligned) bytes
-(p_y)	st1	[ptr3] = value			// fill last byte (using ptr3)
-	br.ret.sptk.many rp
-}
-END(memset)
-EXPORT_SYMBOL(memset)
diff --git a/arch/ia64/lib/strlen.S b/arch/ia64/lib/strlen.S
deleted file mode 100644
index 1f4a46c15127..000000000000
--- a/arch/ia64/lib/strlen.S
+++ /dev/null
@@ -1,195 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-/*
- *
- * Optimized version of the standard strlen() function
- *
- *
- * Inputs:
- *	in0	address of string
- *
- * Outputs:
- *	ret0	the number of characters in the string (0 if empty string)
- *	does not count the \0
- *
- * Copyright (C) 1999, 2001 Hewlett-Packard Co
- *	Stephane Eranian <eranian@hpl.hp.com>
- *
- * 09/24/99 S.Eranian add speculation recovery code
- */
-
-#include <linux/export.h>
-#include <asm/asmmacro.h>
-
-//
-//
-// This is an enhanced version of the basic strlen. it includes a combination
-// of compute zero index (czx), parallel comparisons, speculative loads and
-// loop unroll using rotating registers.
-//
-// General Ideas about the algorithm:
-//	  The goal is to look at the string in chunks of 8 bytes.
-//	  so we need to do a few extra checks at the beginning because the
-//	  string may not be 8-byte aligned. In this case we load the 8byte
-//	  quantity which includes the start of the string and mask the unused
-//	  bytes with 0xff to avoid confusing czx.
-//	  We use speculative loads and software pipelining to hide memory
-//	  latency and do read ahead safely. This way we defer any exception.
-//
-//	  Because we don't want the kernel to be relying on particular
-//	  settings of the DCR register, we provide recovery code in case
-//	  speculation fails. The recovery code is going to "redo" the work using
-//	  only normal loads. If we still get a fault then we generate a
-//	  kernel panic. Otherwise we return the strlen as usual.
-//
-//	  The fact that speculation may fail can be caused, for instance, by
-//	  the DCR.dm bit being set. In this case TLB misses are deferred, i.e.,
-//	  a NaT bit will be set if the translation is not present. The normal
-//	  load, on the other hand, will cause the translation to be inserted
-//	  if the mapping exists.
-//
-//	  It should be noted that we execute recovery code only when we need
-//	  to use the data that has been speculatively loaded: we don't execute
-//	  recovery code on pure read ahead data.
-//
-// Remarks:
-//	- the cmp r0,r0 is used as a fast way to initialize a predicate
-//	  register to 1. This is required to make sure that we get the parallel
-//	  compare correct.
-//
-//	- we don't use the epilogue counter to exit the loop but we need to set
-//	  it to zero beforehand.
-//
-//	- after the loop we must test for Nat values because neither the
-//	  czx nor cmp instruction raise a NaT consumption fault. We must be
-//	  careful not to look too far for a Nat for which we don't care.
-//	  For instance we don't need to look at a NaT in val2 if the zero byte
-//	  was in val1.
-//
-//	- Clearly performance tuning is required.
-//
-//
-//
-#define saved_pfs	r11
-#define	tmp		r10
-#define base		r16
-#define orig		r17
-#define saved_pr	r18
-#define src		r19
-#define mask		r20
-#define val		r21
-#define val1		r22
-#define val2		r23
-
-GLOBAL_ENTRY(strlen)
-	.prologue
-	.save ar.pfs, saved_pfs
-	alloc saved_pfs=ar.pfs,11,0,0,8 // rotating must be multiple of 8
-
-	.rotr v[2], w[2]	// declares our 4 aliases
-
-	extr.u tmp=in0,0,3	// tmp=least significant 3 bits
-	mov orig=in0		// keep trackof initial byte address
-	dep src=0,in0,0,3	// src=8byte-aligned in0 address
-	.save pr, saved_pr
-	mov saved_pr=pr		// preserve predicates (rotation)
-	;;
-
-	.body
-
-	ld8 v[1]=[src],8	// must not speculate: can fail here
-	shl tmp=tmp,3		// multiply by 8bits/byte
-	mov mask=-1		// our mask
-	;;
-	ld8.s w[1]=[src],8	// speculatively load next
-	cmp.eq p6,p0=r0,r0	// sets p6 to true for cmp.and
-	sub tmp=64,tmp		// how many bits to shift our mask on the right
-	;;
-	shr.u	mask=mask,tmp	// zero enough bits to hold v[1] valuable part
-	mov ar.ec=r0		// clear epilogue counter (saved in ar.pfs)
-	;;
-	add base=-16,src	// keep track of aligned base
-	or v[1]=v[1],mask	// now we have a safe initial byte pattern
-	;;
-1:
-	ld8.s v[0]=[src],8	// speculatively load next
-	czx1.r val1=v[1]	// search 0 byte from right
-	czx1.r val2=w[1]	// search 0 byte from right following 8bytes
-	;;
-	ld8.s w[0]=[src],8	// speculatively load next to next
-	cmp.eq.and p6,p0=8,val1	// p6 = p6 and val1==8
-	cmp.eq.and p6,p0=8,val2	// p6 = p6 and mask==8
-(p6)	br.wtop.dptk 1b		// loop until p6 == 0
-	;;
-	//
-	// We must return try the recovery code iff
-	// val1_is_nat || (val1==8 && val2_is_nat)
-	//
-	// XXX Fixme
-	//	- there must be a better way of doing the test
-	//
-	cmp.eq  p8,p9=8,val1	// p6 = val1 had zero (disambiguate)
-	tnat.nz p6,p7=val1	// test NaT on val1
-(p6)	br.cond.spnt .recover	// jump to recovery if val1 is NaT
-	;;
-	//
-	// if we come here p7 is true, i.e., initialized for // cmp
-	//
-	cmp.eq.and  p7,p0=8,val1// val1==8?
-	tnat.nz.and p7,p0=val2	// test NaT if val2
-(p7)	br.cond.spnt .recover	// jump to recovery if val2 is NaT
-	;;
-(p8)	mov val1=val2		// the other test got us out of the loop
-(p8)	adds src=-16,src	// correct position when 3 ahead
-(p9)	adds src=-24,src	// correct position when 4 ahead
-	;;
-	sub ret0=src,orig	// distance from base
-	sub tmp=8,val1		// which byte in word
-	mov pr=saved_pr,0xffffffffffff0000
-	;;
-	sub ret0=ret0,tmp	// adjust
-	mov ar.pfs=saved_pfs	// because of ar.ec, restore no matter what
-	br.ret.sptk.many rp	// end of normal execution
-
-	//
-	// Outlined recovery code when speculation failed
-	//
-	// This time we don't use speculation and rely on the normal exception
-	// mechanism. that's why the loop is not as good as the previous one
-	// because read ahead is not possible
-	//
-	// IMPORTANT:
-	// Please note that in the case of strlen() as opposed to strlen_user()
-	// we don't use the exception mechanism, as this function is not
-	// supposed to fail. If that happens it means we have a bug and the
-	// code will cause of kernel fault.
-	//
-	// XXX Fixme
-	//	- today we restart from the beginning of the string instead
-	//	  of trying to continue where we left off.
-	//
-.recover:
-	ld8 val=[base],8	// will fail if unrecoverable fault
-	;;
-	or val=val,mask		// remask first bytes
-	cmp.eq p0,p6=r0,r0	// nullify first ld8 in loop
-	;;
-	//
-	// ar.ec is still zero here
-	//
-2:
-(p6)	ld8 val=[base],8	// will fail if unrecoverable fault
-	;;
-	czx1.r val1=val		// search 0 byte from right
-	;;
-	cmp.eq p6,p0=8,val1	// val1==8 ?
-(p6)	br.wtop.dptk 2b		// loop until p6 == 0
-	;;			// (avoid WAW on p63)
-	sub ret0=base,orig	// distance from base
-	sub tmp=8,val1
-	mov pr=saved_pr,0xffffffffffff0000
-	;;
-	sub ret0=ret0,tmp	// length=now - back -1
-	mov ar.pfs=saved_pfs	// because of ar.ec, restore no matter what
-	br.ret.sptk.many rp	// end of successful recovery code
-END(strlen)
-EXPORT_SYMBOL(strlen)
diff --git a/arch/ia64/lib/strncpy_from_user.S b/arch/ia64/lib/strncpy_from_user.S
deleted file mode 100644
index a287169bd953..000000000000
--- a/arch/ia64/lib/strncpy_from_user.S
+++ /dev/null
@@ -1,47 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-/*
- * Just like strncpy() except that if a fault occurs during copying,
- * -EFAULT is returned.
- *
- * Inputs:
- *	in0:	address of destination buffer
- *	in1:	address of string to be copied
- *	in2:	length of buffer in bytes
- * Outputs:
- *	r8:	-EFAULT in case of fault or number of bytes copied if no fault
- *
- * Copyright (C) 1998-2001 Hewlett-Packard Co
- * Copyright (C) 1998-2001 David Mosberger-Tang <davidm@hpl.hp.com>
- *
- * 00/03/06 D. Mosberger Fixed to return proper return value (bug found by
- *			 by Andreas Schwab <schwab@suse.de>).
- */
-
-#include <linux/export.h>
-#include <asm/asmmacro.h>
-
-GLOBAL_ENTRY(__strncpy_from_user)
-	alloc r2=ar.pfs,3,0,0,0
-	mov r8=0
-	mov r9=in1
-	;;
-	add r10=in1,in2
-	cmp.eq p6,p0=r0,in2
-(p6)	br.ret.spnt.many rp
-
-	// XXX braindead copy loop---this needs to be optimized
-.Loop1:
-	EX(.Lexit, ld1 r8=[in1],1)
-	;;
-	EX(.Lexit, st1 [in0]=r8,1)
-	cmp.ne p6,p7=r8,r0
-	;;
-(p6)	cmp.ne.unc p8,p0=in1,r10
-(p8)	br.cond.dpnt.few .Loop1
-	;;
-(p6)	mov r8=in2		// buffer filled up---return buffer length
-(p7)	sub r8=in1,r9,1		// return string length (excluding NUL character)
-[.Lexit:]
-	br.ret.sptk.many rp
-END(__strncpy_from_user)
-EXPORT_SYMBOL(__strncpy_from_user)
diff --git a/arch/ia64/lib/strnlen_user.S b/arch/ia64/lib/strnlen_user.S
deleted file mode 100644
index a7eb56e840a9..000000000000
--- a/arch/ia64/lib/strnlen_user.S
+++ /dev/null
@@ -1,48 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-/*
- * Returns 0 if exception before NUL or reaching the supplied limit (N),
- * a value greater than N if the string is longer than the limit, else
- * strlen.
- *
- * Inputs:
- *	in0:	address of buffer
- *	in1:	string length limit N
- * Outputs:
- *	r8:	0 in case of fault, strlen(buffer)+1 otherwise
- *
- * Copyright (C) 1999, 2001 David Mosberger-Tang <davidm@hpl.hp.com>
- */
-
-#include <linux/export.h>
-#include <asm/asmmacro.h>
-
-GLOBAL_ENTRY(__strnlen_user)
-	.prologue
-	alloc r2=ar.pfs,2,0,0,0
-	.save ar.lc, r16
-	mov r16=ar.lc			// preserve ar.lc
-
-	.body
-
-	add r3=-1,in1
-	;;
-	mov ar.lc=r3
-	mov r9=0
-	;;
-	// XXX braindead strlen loop---this needs to be optimized
-.Loop1:
-	EXCLR(.Lexit, ld1 r8=[in0],1)
-	add r9=1,r9
-	;;
-	cmp.eq p6,p0=r8,r0
-(p6)	br.cond.dpnt .Lexit
-	br.cloop.dptk.few .Loop1
-
-	add r9=1,in1			// NUL not found---return N+1
-	;;
-.Lexit:
-	mov r8=r9
-	mov ar.lc=r16			// restore ar.lc
-	br.ret.sptk.many rp
-END(__strnlen_user)
-EXPORT_SYMBOL(__strnlen_user)
diff --git a/arch/ia64/lib/xor.S b/arch/ia64/lib/xor.S
deleted file mode 100644
index 6e2a69662c06..000000000000
--- a/arch/ia64/lib/xor.S
+++ /dev/null
@@ -1,181 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-or-later */
-/*
- * arch/ia64/lib/xor.S
- *
- * Optimized RAID-5 checksumming functions for IA-64.
- */
-
-#include <linux/export.h>
-#include <asm/asmmacro.h>
-
-GLOBAL_ENTRY(xor_ia64_2)
-	.prologue
-	.fframe 0
-	.save ar.pfs, r31
-	alloc r31 = ar.pfs, 3, 0, 13, 16
-	.save ar.lc, r30
-	mov r30 = ar.lc
-	.save pr, r29
-	mov r29 = pr
-	;;
-	.body
-	mov r8 = in1
-	mov ar.ec = 6 + 2
-	shr in0 = in0, 3
-	;;
-	adds in0 = -1, in0
-	mov r16 = in1
-	mov r17 = in2
-	;;
-	mov ar.lc = in0
-	mov pr.rot = 1 << 16
-	;;
-	.rotr s1[6+1], s2[6+1], d[2]
-	.rotp p[6+2]
-0:
-(p[0])	ld8.nta s1[0] = [r16], 8
-(p[0])	ld8.nta s2[0] = [r17], 8
-(p[6])	xor d[0] = s1[6], s2[6]
-(p[6+1])st8.nta [r8] = d[1], 8
-	nop.f 0
-	br.ctop.dptk.few 0b
-	;;
-	mov ar.lc = r30
-	mov pr = r29, -1
-	br.ret.sptk.few rp
-END(xor_ia64_2)
-EXPORT_SYMBOL(xor_ia64_2)
-
-GLOBAL_ENTRY(xor_ia64_3)
-	.prologue
-	.fframe 0
-	.save ar.pfs, r31
-	alloc r31 = ar.pfs, 4, 0, 20, 24
-	.save ar.lc, r30
-	mov r30 = ar.lc
-	.save pr, r29
-	mov r29 = pr
-	;;
-	.body
-	mov r8 = in1
-	mov ar.ec = 6 + 2
-	shr in0 = in0, 3
-	;;
-	adds in0 = -1, in0
-	mov r16 = in1
-	mov r17 = in2
-	;;
-	mov r18 = in3
-	mov ar.lc = in0
-	mov pr.rot = 1 << 16
-	;;
-	.rotr s1[6+1], s2[6+1], s3[6+1], d[2]
-	.rotp p[6+2]
-0:
-(p[0])	ld8.nta s1[0] = [r16], 8
-(p[0])	ld8.nta s2[0] = [r17], 8
-(p[6])	xor d[0] = s1[6], s2[6]
-	;;
-(p[0])	ld8.nta s3[0] = [r18], 8
-(p[6+1])st8.nta [r8] = d[1], 8
-(p[6])	xor d[0] = d[0], s3[6]
-	br.ctop.dptk.few 0b
-	;;
-	mov ar.lc = r30
-	mov pr = r29, -1
-	br.ret.sptk.few rp
-END(xor_ia64_3)
-EXPORT_SYMBOL(xor_ia64_3)
-
-GLOBAL_ENTRY(xor_ia64_4)
-	.prologue
-	.fframe 0
-	.save ar.pfs, r31
-	alloc r31 = ar.pfs, 5, 0, 27, 32
-	.save ar.lc, r30
-	mov r30 = ar.lc
-	.save pr, r29
-	mov r29 = pr
-	;;
-	.body
-	mov r8 = in1
-	mov ar.ec = 6 + 2
-	shr in0 = in0, 3
-	;;
-	adds in0 = -1, in0
-	mov r16 = in1
-	mov r17 = in2
-	;;
-	mov r18 = in3
-	mov ar.lc = in0
-	mov pr.rot = 1 << 16
-	mov r19 = in4
-	;;
-	.rotr s1[6+1], s2[6+1], s3[6+1], s4[6+1], d[2]
-	.rotp p[6+2]
-0:
-(p[0])	ld8.nta s1[0] = [r16], 8
-(p[0])	ld8.nta s2[0] = [r17], 8
-(p[6])	xor d[0] = s1[6], s2[6]
-(p[0])	ld8.nta s3[0] = [r18], 8
-(p[0])	ld8.nta s4[0] = [r19], 8
-(p[6])	xor r20 = s3[6], s4[6]
-	;;
-(p[6+1])st8.nta [r8] = d[1], 8
-(p[6])	xor d[0] = d[0], r20
-	br.ctop.dptk.few 0b
-	;;
-	mov ar.lc = r30
-	mov pr = r29, -1
-	br.ret.sptk.few rp
-END(xor_ia64_4)
-EXPORT_SYMBOL(xor_ia64_4)
-
-GLOBAL_ENTRY(xor_ia64_5)
-	.prologue
-	.fframe 0
-	.save ar.pfs, r31
-	alloc r31 = ar.pfs, 6, 0, 34, 40
-	.save ar.lc, r30
-	mov r30 = ar.lc
-	.save pr, r29
-	mov r29 = pr
-	;;
-	.body
-	mov r8 = in1
-	mov ar.ec = 6 + 2
-	shr in0 = in0, 3
-	;;
-	adds in0 = -1, in0
-	mov r16 = in1
-	mov r17 = in2
-	;;
-	mov r18 = in3
-	mov ar.lc = in0
-	mov pr.rot = 1 << 16
-	mov r19 = in4
-	mov r20 = in5
-	;;
-	.rotr s1[6+1], s2[6+1], s3[6+1], s4[6+1], s5[6+1], d[2]
-	.rotp p[6+2]
-0:
-(p[0])	ld8.nta s1[0] = [r16], 8
-(p[0])	ld8.nta s2[0] = [r17], 8
-(p[6])	xor d[0] = s1[6], s2[6]
-(p[0])	ld8.nta s3[0] = [r18], 8
-(p[0])	ld8.nta s4[0] = [r19], 8
-(p[6])	xor r21 = s3[6], s4[6]
-	;;
-(p[0])	ld8.nta s5[0] = [r20], 8
-(p[6+1])st8.nta [r8] = d[1], 8
-(p[6])	xor d[0] = d[0], r21
-	;;
-(p[6])	  xor d[0] = d[0], s5[6]
-	nop.f 0
-	br.ctop.dptk.few 0b
-	;;
-	mov ar.lc = r30
-	mov pr = r29, -1
-	br.ret.sptk.few rp
-END(xor_ia64_5)
-EXPORT_SYMBOL(xor_ia64_5)
diff --git a/arch/ia64/mm/Makefile b/arch/ia64/mm/Makefile
deleted file mode 100644
index c03f63c62ac4..000000000000
--- a/arch/ia64/mm/Makefile
+++ /dev/null
@@ -1,11 +0,0 @@
-# SPDX-License-Identifier: GPL-2.0
-#
-# Makefile for the ia64-specific parts of the memory manager.
-#
-
-obj-y := init.o fault.o tlb.o extable.o ioremap.o
-
-obj-$(CONFIG_HUGETLB_PAGE) += hugetlbpage.o
-obj-$(CONFIG_NUMA)	   += numa.o
-obj-$(CONFIG_SPARSEMEM)	   += discontig.o
-obj-$(CONFIG_FLATMEM)	   += contig.o
diff --git a/arch/ia64/mm/contig.c b/arch/ia64/mm/contig.c
deleted file mode 100644
index 1e9eaa107eb7..000000000000
--- a/arch/ia64/mm/contig.c
+++ /dev/null
@@ -1,208 +0,0 @@
-/*
- * This file is subject to the terms and conditions of the GNU General Public
- * License.  See the file "COPYING" in the main directory of this archive
- * for more details.
- *
- * Copyright (C) 1998-2003 Hewlett-Packard Co
- *	David Mosberger-Tang <davidm@hpl.hp.com>
- *	Stephane Eranian <eranian@hpl.hp.com>
- * Copyright (C) 2000, Rohit Seth <rohit.seth@intel.com>
- * Copyright (C) 1999 VA Linux Systems
- * Copyright (C) 1999 Walt Drummond <drummond@valinux.com>
- * Copyright (C) 2003 Silicon Graphics, Inc. All rights reserved.
- *
- * Routines used by ia64 machines with contiguous (or virtually contiguous)
- * memory.
- */
-#include <linux/efi.h>
-#include <linux/memblock.h>
-#include <linux/mm.h>
-#include <linux/nmi.h>
-#include <linux/swap.h>
-#include <linux/sizes.h>
-
-#include <asm/efi.h>
-#include <asm/meminit.h>
-#include <asm/sections.h>
-#include <asm/mca.h>
-
-/* physical address where the bootmem map is located */
-unsigned long bootmap_start;
-
-#ifdef CONFIG_SMP
-static void *cpu_data;
-/**
- * per_cpu_init - setup per-cpu variables
- *
- * Allocate and setup per-cpu data areas.
- */
-void *per_cpu_init(void)
-{
-	static bool first_time = true;
-	void *cpu0_data = __cpu0_per_cpu;
-	unsigned int cpu;
-
-	if (!first_time)
-		goto skip;
-	first_time = false;
-
-	/*
-	 * get_free_pages() cannot be used before cpu_init() done.
-	 * BSP allocates PERCPU_PAGE_SIZE bytes for all possible CPUs
-	 * to avoid that AP calls get_zeroed_page().
-	 */
-	for_each_possible_cpu(cpu) {
-		void *src = cpu == 0 ? cpu0_data : __phys_per_cpu_start;
-
-		memcpy(cpu_data, src, __per_cpu_end - __per_cpu_start);
-		__per_cpu_offset[cpu] = (char *)cpu_data - __per_cpu_start;
-		per_cpu(local_per_cpu_offset, cpu) = __per_cpu_offset[cpu];
-
-		/*
-		 * percpu area for cpu0 is moved from the __init area
-		 * which is setup by head.S and used till this point.
-		 * Update ar.k3.  This move is ensures that percpu
-		 * area for cpu0 is on the correct node and its
-		 * virtual address isn't insanely far from other
-		 * percpu areas which is important for congruent
-		 * percpu allocator.
-		 */
-		if (cpu == 0)
-			ia64_set_kr(IA64_KR_PER_CPU_DATA, __pa(cpu_data) -
-				    (unsigned long)__per_cpu_start);
-
-		cpu_data += PERCPU_PAGE_SIZE;
-	}
-skip:
-	return __per_cpu_start + __per_cpu_offset[smp_processor_id()];
-}
-
-static inline __init void
-alloc_per_cpu_data(void)
-{
-	size_t size = PERCPU_PAGE_SIZE * num_possible_cpus();
-
-	cpu_data = memblock_alloc_from(size, PERCPU_PAGE_SIZE,
-				       __pa(MAX_DMA_ADDRESS));
-	if (!cpu_data)
-		panic("%s: Failed to allocate %lu bytes align=%lx from=%lx\n",
-		      __func__, size, PERCPU_PAGE_SIZE, __pa(MAX_DMA_ADDRESS));
-}
-
-/**
- * setup_per_cpu_areas - setup percpu areas
- *
- * Arch code has already allocated and initialized percpu areas.  All
- * this function has to do is to teach the determined layout to the
- * dynamic percpu allocator, which happens to be more complex than
- * creating whole new ones using helpers.
- */
-void __init
-setup_per_cpu_areas(void)
-{
-	struct pcpu_alloc_info *ai;
-	struct pcpu_group_info *gi;
-	unsigned int cpu;
-	ssize_t static_size, reserved_size, dyn_size;
-
-	ai = pcpu_alloc_alloc_info(1, num_possible_cpus());
-	if (!ai)
-		panic("failed to allocate pcpu_alloc_info");
-	gi = &ai->groups[0];
-
-	/* units are assigned consecutively to possible cpus */
-	for_each_possible_cpu(cpu)
-		gi->cpu_map[gi->nr_units++] = cpu;
-
-	/* set parameters */
-	static_size = __per_cpu_end - __per_cpu_start;
-	reserved_size = PERCPU_MODULE_RESERVE;
-	dyn_size = PERCPU_PAGE_SIZE - static_size - reserved_size;
-	if (dyn_size < 0)
-		panic("percpu area overflow static=%zd reserved=%zd\n",
-		      static_size, reserved_size);
-
-	ai->static_size		= static_size;
-	ai->reserved_size	= reserved_size;
-	ai->dyn_size		= dyn_size;
-	ai->unit_size		= PERCPU_PAGE_SIZE;
-	ai->atom_size		= PAGE_SIZE;
-	ai->alloc_size		= PERCPU_PAGE_SIZE;
-
-	pcpu_setup_first_chunk(ai, __per_cpu_start + __per_cpu_offset[0]);
-	pcpu_free_alloc_info(ai);
-}
-#else
-#define alloc_per_cpu_data() do { } while (0)
-#endif /* CONFIG_SMP */
-
-/**
- * find_memory - setup memory map
- *
- * Walk the EFI memory map and find usable memory for the system, taking
- * into account reserved areas.
- */
-void __init
-find_memory (void)
-{
-	reserve_memory();
-
-	/* first find highest page frame number */
-	min_low_pfn = ~0UL;
-	max_low_pfn = 0;
-	efi_memmap_walk(find_max_min_low_pfn, NULL);
-	max_pfn = max_low_pfn;
-
-	memblock_add_node(0, PFN_PHYS(max_low_pfn), 0, MEMBLOCK_NONE);
-
-	find_initrd();
-
-	alloc_per_cpu_data();
-}
-
-static int __init find_largest_hole(u64 start, u64 end, void *arg)
-{
-	u64 *max_gap = arg;
-
-	static u64 last_end = PAGE_OFFSET;
-
-	/* NOTE: this algorithm assumes efi memmap table is ordered */
-
-	if (*max_gap < (start - last_end))
-		*max_gap = start - last_end;
-	last_end = end;
-	return 0;
-}
-
-static void __init verify_gap_absence(void)
-{
-	unsigned long max_gap;
-
-	/* Forbid FLATMEM if hole is > than 1G */
-	efi_memmap_walk(find_largest_hole, (u64 *)&max_gap);
-	if (max_gap >= SZ_1G)
-		panic("Cannot use FLATMEM with %ldMB hole\n"
-		      "Please switch over to SPARSEMEM\n",
-		      (max_gap >> 20));
-}
-
-/*
- * Set up the page tables.
- */
-
-void __init
-paging_init (void)
-{
-	unsigned long max_dma;
-	unsigned long max_zone_pfns[MAX_NR_ZONES];
-
-	memset(max_zone_pfns, 0, sizeof(max_zone_pfns));
-	max_dma = virt_to_phys((void *) MAX_DMA_ADDRESS) >> PAGE_SHIFT;
-	max_zone_pfns[ZONE_DMA32] = max_dma;
-	max_zone_pfns[ZONE_NORMAL] = max_low_pfn;
-
-	verify_gap_absence();
-
-	free_area_init(max_zone_pfns);
-	zero_page_memmap_ptr = virt_to_page(ia64_imva(empty_zero_page));
-}
diff --git a/arch/ia64/mm/discontig.c b/arch/ia64/mm/discontig.c
deleted file mode 100644
index 73d0db36edb6..000000000000
--- a/arch/ia64/mm/discontig.c
+++ /dev/null
@@ -1,635 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * Copyright (c) 2000, 2003 Silicon Graphics, Inc.  All rights reserved.
- * Copyright (c) 2001 Intel Corp.
- * Copyright (c) 2001 Tony Luck <tony.luck@intel.com>
- * Copyright (c) 2002 NEC Corp.
- * Copyright (c) 2002 Kimio Suganuma <k-suganuma@da.jp.nec.com>
- * Copyright (c) 2004 Silicon Graphics, Inc
- *	Russ Anderson <rja@sgi.com>
- *	Jesse Barnes <jbarnes@sgi.com>
- *	Jack Steiner <steiner@sgi.com>
- */
-
-/*
- * Platform initialization for Discontig Memory
- */
-
-#include <linux/kernel.h>
-#include <linux/mm.h>
-#include <linux/nmi.h>
-#include <linux/swap.h>
-#include <linux/memblock.h>
-#include <linux/acpi.h>
-#include <linux/efi.h>
-#include <linux/nodemask.h>
-#include <linux/slab.h>
-#include <asm/efi.h>
-#include <asm/tlb.h>
-#include <asm/meminit.h>
-#include <asm/numa.h>
-#include <asm/sections.h>
-
-/*
- * Track per-node information needed to setup the boot memory allocator, the
- * per-node areas, and the real VM.
- */
-struct early_node_data {
-	struct ia64_node_data *node_data;
-	unsigned long pernode_addr;
-	unsigned long pernode_size;
-	unsigned long min_pfn;
-	unsigned long max_pfn;
-};
-
-static struct early_node_data mem_data[MAX_NUMNODES] __initdata;
-static nodemask_t memory_less_mask __initdata;
-
-pg_data_t *pgdat_list[MAX_NUMNODES];
-
-/*
- * To prevent cache aliasing effects, align per-node structures so that they
- * start at addresses that are strided by node number.
- */
-#define MAX_NODE_ALIGN_OFFSET	(32 * 1024 * 1024)
-#define NODEDATA_ALIGN(addr, node)						\
-	((((addr) + 1024*1024-1) & ~(1024*1024-1)) + 				\
-	     (((node)*PERCPU_PAGE_SIZE) & (MAX_NODE_ALIGN_OFFSET - 1)))
-
-/**
- * build_node_maps - callback to setup mem_data structs for each node
- * @start: physical start of range
- * @len: length of range
- * @node: node where this range resides
- *
- * Detect extents of each piece of memory that we wish to
- * treat as a virtually contiguous block (i.e. each node). Each such block
- * must start on an %IA64_GRANULE_SIZE boundary, so we round the address down
- * if necessary.  Any non-existent pages will simply be part of the virtual
- * memmap.
- */
-static int __init build_node_maps(unsigned long start, unsigned long len,
-				  int node)
-{
-	unsigned long spfn, epfn, end = start + len;
-
-	epfn = GRANULEROUNDUP(end) >> PAGE_SHIFT;
-	spfn = GRANULEROUNDDOWN(start) >> PAGE_SHIFT;
-
-	if (!mem_data[node].min_pfn) {
-		mem_data[node].min_pfn = spfn;
-		mem_data[node].max_pfn = epfn;
-	} else {
-		mem_data[node].min_pfn = min(spfn, mem_data[node].min_pfn);
-		mem_data[node].max_pfn = max(epfn, mem_data[node].max_pfn);
-	}
-
-	return 0;
-}
-
-/**
- * early_nr_cpus_node - return number of cpus on a given node
- * @node: node to check
- *
- * Count the number of cpus on @node.  We can't use nr_cpus_node() yet because
- * acpi_boot_init() (which builds the node_to_cpu_mask array) hasn't been
- * called yet.  Note that node 0 will also count all non-existent cpus.
- */
-static int early_nr_cpus_node(int node)
-{
-	int cpu, n = 0;
-
-	for_each_possible_early_cpu(cpu)
-		if (node == node_cpuid[cpu].nid)
-			n++;
-
-	return n;
-}
-
-/**
- * compute_pernodesize - compute size of pernode data
- * @node: the node id.
- */
-static unsigned long compute_pernodesize(int node)
-{
-	unsigned long pernodesize = 0, cpus;
-
-	cpus = early_nr_cpus_node(node);
-	pernodesize += PERCPU_PAGE_SIZE * cpus;
-	pernodesize += node * L1_CACHE_BYTES;
-	pernodesize += L1_CACHE_ALIGN(sizeof(pg_data_t));
-	pernodesize += L1_CACHE_ALIGN(sizeof(struct ia64_node_data));
-	pernodesize += L1_CACHE_ALIGN(sizeof(pg_data_t));
-	pernodesize = PAGE_ALIGN(pernodesize);
-	return pernodesize;
-}
-
-/**
- * per_cpu_node_setup - setup per-cpu areas on each node
- * @cpu_data: per-cpu area on this node
- * @node: node to setup
- *
- * Copy the static per-cpu data into the region we just set aside and then
- * setup __per_cpu_offset for each CPU on this node.  Return a pointer to
- * the end of the area.
- */
-static void *per_cpu_node_setup(void *cpu_data, int node)
-{
-#ifdef CONFIG_SMP
-	int cpu;
-
-	for_each_possible_early_cpu(cpu) {
-		void *src = cpu == 0 ? __cpu0_per_cpu : __phys_per_cpu_start;
-
-		if (node != node_cpuid[cpu].nid)
-			continue;
-
-		memcpy(__va(cpu_data), src, __per_cpu_end - __per_cpu_start);
-		__per_cpu_offset[cpu] = (char *)__va(cpu_data) -
-			__per_cpu_start;
-
-		/*
-		 * percpu area for cpu0 is moved from the __init area
-		 * which is setup by head.S and used till this point.
-		 * Update ar.k3.  This move is ensures that percpu
-		 * area for cpu0 is on the correct node and its
-		 * virtual address isn't insanely far from other
-		 * percpu areas which is important for congruent
-		 * percpu allocator.
-		 */
-		if (cpu == 0)
-			ia64_set_kr(IA64_KR_PER_CPU_DATA,
-				    (unsigned long)cpu_data -
-				    (unsigned long)__per_cpu_start);
-
-		cpu_data += PERCPU_PAGE_SIZE;
-	}
-#endif
-	return cpu_data;
-}
-
-#ifdef CONFIG_SMP
-/**
- * setup_per_cpu_areas - setup percpu areas
- *
- * Arch code has already allocated and initialized percpu areas.  All
- * this function has to do is to teach the determined layout to the
- * dynamic percpu allocator, which happens to be more complex than
- * creating whole new ones using helpers.
- */
-void __init setup_per_cpu_areas(void)
-{
-	struct pcpu_alloc_info *ai;
-	struct pcpu_group_info *gi;
-	unsigned int *cpu_map;
-	void *base;
-	unsigned long base_offset;
-	unsigned int cpu;
-	ssize_t static_size, reserved_size, dyn_size;
-	int node, prev_node, unit, nr_units;
-
-	ai = pcpu_alloc_alloc_info(MAX_NUMNODES, nr_cpu_ids);
-	if (!ai)
-		panic("failed to allocate pcpu_alloc_info");
-	cpu_map = ai->groups[0].cpu_map;
-
-	/* determine base */
-	base = (void *)ULONG_MAX;
-	for_each_possible_cpu(cpu)
-		base = min(base,
-			   (void *)(__per_cpu_offset[cpu] + __per_cpu_start));
-	base_offset = (void *)__per_cpu_start - base;
-
-	/* build cpu_map, units are grouped by node */
-	unit = 0;
-	for_each_node(node)
-		for_each_possible_cpu(cpu)
-			if (node == node_cpuid[cpu].nid)
-				cpu_map[unit++] = cpu;
-	nr_units = unit;
-
-	/* set basic parameters */
-	static_size = __per_cpu_end - __per_cpu_start;
-	reserved_size = PERCPU_MODULE_RESERVE;
-	dyn_size = PERCPU_PAGE_SIZE - static_size - reserved_size;
-	if (dyn_size < 0)
-		panic("percpu area overflow static=%zd reserved=%zd\n",
-		      static_size, reserved_size);
-
-	ai->static_size		= static_size;
-	ai->reserved_size	= reserved_size;
-	ai->dyn_size		= dyn_size;
-	ai->unit_size		= PERCPU_PAGE_SIZE;
-	ai->atom_size		= PAGE_SIZE;
-	ai->alloc_size		= PERCPU_PAGE_SIZE;
-
-	/*
-	 * CPUs are put into groups according to node.  Walk cpu_map
-	 * and create new groups at node boundaries.
-	 */
-	prev_node = NUMA_NO_NODE;
-	ai->nr_groups = 0;
-	for (unit = 0; unit < nr_units; unit++) {
-		cpu = cpu_map[unit];
-		node = node_cpuid[cpu].nid;
-
-		if (node == prev_node) {
-			gi->nr_units++;
-			continue;
-		}
-		prev_node = node;
-
-		gi = &ai->groups[ai->nr_groups++];
-		gi->nr_units		= 1;
-		gi->base_offset		= __per_cpu_offset[cpu] + base_offset;
-		gi->cpu_map		= &cpu_map[unit];
-	}
-
-	pcpu_setup_first_chunk(ai, base);
-	pcpu_free_alloc_info(ai);
-}
-#endif
-
-/**
- * fill_pernode - initialize pernode data.
- * @node: the node id.
- * @pernode: physical address of pernode data
- * @pernodesize: size of the pernode data
- */
-static void __init fill_pernode(int node, unsigned long pernode,
-	unsigned long pernodesize)
-{
-	void *cpu_data;
-	int cpus = early_nr_cpus_node(node);
-
-	mem_data[node].pernode_addr = pernode;
-	mem_data[node].pernode_size = pernodesize;
-	memset(__va(pernode), 0, pernodesize);
-
-	cpu_data = (void *)pernode;
-	pernode += PERCPU_PAGE_SIZE * cpus;
-	pernode += node * L1_CACHE_BYTES;
-
-	pgdat_list[node] = __va(pernode);
-	pernode += L1_CACHE_ALIGN(sizeof(pg_data_t));
-
-	mem_data[node].node_data = __va(pernode);
-	pernode += L1_CACHE_ALIGN(sizeof(struct ia64_node_data));
-	pernode += L1_CACHE_ALIGN(sizeof(pg_data_t));
-
-	cpu_data = per_cpu_node_setup(cpu_data, node);
-
-	return;
-}
-
-/**
- * find_pernode_space - allocate memory for memory map and per-node structures
- * @start: physical start of range
- * @len: length of range
- * @node: node where this range resides
- *
- * This routine reserves space for the per-cpu data struct, the list of
- * pg_data_ts and the per-node data struct.  Each node will have something like
- * the following in the first chunk of addr. space large enough to hold it.
- *
- *    ________________________
- *   |                        |
- *   |~~~~~~~~~~~~~~~~~~~~~~~~| <-- NODEDATA_ALIGN(start, node) for the first
- *   |    PERCPU_PAGE_SIZE *  |     start and length big enough
- *   |    cpus_on_this_node   | Node 0 will also have entries for all non-existent cpus.
- *   |------------------------|
- *   |   local pg_data_t *    |
- *   |------------------------|
- *   |  local ia64_node_data  |
- *   |------------------------|
- *   |          ???           |
- *   |________________________|
- *
- * Once this space has been set aside, the bootmem maps are initialized.  We
- * could probably move the allocation of the per-cpu and ia64_node_data space
- * outside of this function and use alloc_bootmem_node(), but doing it here
- * is straightforward and we get the alignments we want so...
- */
-static int __init find_pernode_space(unsigned long start, unsigned long len,
-				     int node)
-{
-	unsigned long spfn, epfn;
-	unsigned long pernodesize = 0, pernode;
-
-	spfn = start >> PAGE_SHIFT;
-	epfn = (start + len) >> PAGE_SHIFT;
-
-	/*
-	 * Make sure this memory falls within this node's usable memory
-	 * since we may have thrown some away in build_maps().
-	 */
-	if (spfn < mem_data[node].min_pfn || epfn > mem_data[node].max_pfn)
-		return 0;
-
-	/* Don't setup this node's local space twice... */
-	if (mem_data[node].pernode_addr)
-		return 0;
-
-	/*
-	 * Calculate total size needed, incl. what's necessary
-	 * for good alignment and alias prevention.
-	 */
-	pernodesize = compute_pernodesize(node);
-	pernode = NODEDATA_ALIGN(start, node);
-
-	/* Is this range big enough for what we want to store here? */
-	if (start + len > (pernode + pernodesize))
-		fill_pernode(node, pernode, pernodesize);
-
-	return 0;
-}
-
-/**
- * reserve_pernode_space - reserve memory for per-node space
- *
- * Reserve the space used by the bootmem maps & per-node space in the boot
- * allocator so that when we actually create the real mem maps we don't
- * use their memory.
- */
-static void __init reserve_pernode_space(void)
-{
-	unsigned long base, size;
-	int node;
-
-	for_each_online_node(node) {
-		if (node_isset(node, memory_less_mask))
-			continue;
-
-		/* Now the per-node space */
-		size = mem_data[node].pernode_size;
-		base = __pa(mem_data[node].pernode_addr);
-		memblock_reserve(base, size);
-	}
-}
-
-static void scatter_node_data(void)
-{
-	pg_data_t **dst;
-	int node;
-
-	/*
-	 * for_each_online_node() can't be used at here.
-	 * node_online_map is not set for hot-added nodes at this time,
-	 * because we are halfway through initialization of the new node's
-	 * structures.  If for_each_online_node() is used, a new node's
-	 * pg_data_ptrs will be not initialized. Instead of using it,
-	 * pgdat_list[] is checked.
-	 */
-	for_each_node(node) {
-		if (pgdat_list[node]) {
-			dst = LOCAL_DATA_ADDR(pgdat_list[node])->pg_data_ptrs;
-			memcpy(dst, pgdat_list, sizeof(pgdat_list));
-		}
-	}
-}
-
-/**
- * initialize_pernode_data - fixup per-cpu & per-node pointers
- *
- * Each node's per-node area has a copy of the global pg_data_t list, so
- * we copy that to each node here, as well as setting the per-cpu pointer
- * to the local node data structure.
- */
-static void __init initialize_pernode_data(void)
-{
-	int cpu, node;
-
-	scatter_node_data();
-
-#ifdef CONFIG_SMP
-	/* Set the node_data pointer for each per-cpu struct */
-	for_each_possible_early_cpu(cpu) {
-		node = node_cpuid[cpu].nid;
-		per_cpu(ia64_cpu_info, cpu).node_data =
-			mem_data[node].node_data;
-	}
-#else
-	{
-		struct cpuinfo_ia64 *cpu0_cpu_info;
-		cpu = 0;
-		node = node_cpuid[cpu].nid;
-		cpu0_cpu_info = (struct cpuinfo_ia64 *)(__phys_per_cpu_start +
-			((char *)&ia64_cpu_info - __per_cpu_start));
-		cpu0_cpu_info->node_data = mem_data[node].node_data;
-	}
-#endif /* CONFIG_SMP */
-}
-
-/**
- * memory_less_node_alloc - * attempt to allocate memory on the best NUMA slit
- * 	node but fall back to any other node when __alloc_bootmem_node fails
- *	for best.
- * @nid: node id
- * @pernodesize: size of this node's pernode data
- */
-static void __init *memory_less_node_alloc(int nid, unsigned long pernodesize)
-{
-	void *ptr = NULL;
-	u8 best = 0xff;
-	int bestnode = NUMA_NO_NODE, node, anynode = 0;
-
-	for_each_online_node(node) {
-		if (node_isset(node, memory_less_mask))
-			continue;
-		else if (node_distance(nid, node) < best) {
-			best = node_distance(nid, node);
-			bestnode = node;
-		}
-		anynode = node;
-	}
-
-	if (bestnode == NUMA_NO_NODE)
-		bestnode = anynode;
-
-	ptr = memblock_alloc_try_nid(pernodesize, PERCPU_PAGE_SIZE,
-				     __pa(MAX_DMA_ADDRESS),
-				     MEMBLOCK_ALLOC_ACCESSIBLE,
-				     bestnode);
-	if (!ptr)
-		panic("%s: Failed to allocate %lu bytes align=0x%lx nid=%d from=%lx\n",
-		      __func__, pernodesize, PERCPU_PAGE_SIZE, bestnode,
-		      __pa(MAX_DMA_ADDRESS));
-
-	return ptr;
-}
-
-/**
- * memory_less_nodes - allocate and initialize CPU only nodes pernode
- *	information.
- */
-static void __init memory_less_nodes(void)
-{
-	unsigned long pernodesize;
-	void *pernode;
-	int node;
-
-	for_each_node_mask(node, memory_less_mask) {
-		pernodesize = compute_pernodesize(node);
-		pernode = memory_less_node_alloc(node, pernodesize);
-		fill_pernode(node, __pa(pernode), pernodesize);
-	}
-
-	return;
-}
-
-/**
- * find_memory - walk the EFI memory map and setup the bootmem allocator
- *
- * Called early in boot to setup the bootmem allocator, and to
- * allocate the per-cpu and per-node structures.
- */
-void __init find_memory(void)
-{
-	int node;
-
-	reserve_memory();
-	efi_memmap_walk(filter_memory, register_active_ranges);
-
-	if (num_online_nodes() == 0) {
-		printk(KERN_ERR "node info missing!\n");
-		node_set_online(0);
-	}
-
-	nodes_or(memory_less_mask, memory_less_mask, node_online_map);
-	min_low_pfn = -1;
-	max_low_pfn = 0;
-
-	/* These actually end up getting called by call_pernode_memory() */
-	efi_memmap_walk(filter_rsvd_memory, build_node_maps);
-	efi_memmap_walk(filter_rsvd_memory, find_pernode_space);
-	efi_memmap_walk(find_max_min_low_pfn, NULL);
-
-	for_each_online_node(node)
-		if (mem_data[node].min_pfn)
-			node_clear(node, memory_less_mask);
-
-	reserve_pernode_space();
-	memory_less_nodes();
-	initialize_pernode_data();
-
-	max_pfn = max_low_pfn;
-
-	find_initrd();
-}
-
-#ifdef CONFIG_SMP
-/**
- * per_cpu_init - setup per-cpu variables
- *
- * find_pernode_space() does most of this already, we just need to set
- * local_per_cpu_offset
- */
-void *per_cpu_init(void)
-{
-	int cpu;
-	static int first_time = 1;
-
-	if (first_time) {
-		first_time = 0;
-		for_each_possible_early_cpu(cpu)
-			per_cpu(local_per_cpu_offset, cpu) = __per_cpu_offset[cpu];
-	}
-
-	return __per_cpu_start + __per_cpu_offset[smp_processor_id()];
-}
-#endif /* CONFIG_SMP */
-
-/**
- * call_pernode_memory - use SRAT to call callback functions with node info
- * @start: physical start of range
- * @len: length of range
- * @arg: function to call for each range
- *
- * efi_memmap_walk() knows nothing about layout of memory across nodes. Find
- * out to which node a block of memory belongs.  Ignore memory that we cannot
- * identify, and split blocks that run across multiple nodes.
- *
- * Take this opportunity to round the start address up and the end address
- * down to page boundaries.
- */
-void call_pernode_memory(unsigned long start, unsigned long len, void *arg)
-{
-	unsigned long rs, re, end = start + len;
-	void (*func)(unsigned long, unsigned long, int);
-	int i;
-
-	start = PAGE_ALIGN(start);
-	end &= PAGE_MASK;
-	if (start >= end)
-		return;
-
-	func = arg;
-
-	if (!num_node_memblks) {
-		/* No SRAT table, so assume one node (node 0) */
-		if (start < end)
-			(*func)(start, end - start, 0);
-		return;
-	}
-
-	for (i = 0; i < num_node_memblks; i++) {
-		rs = max(start, node_memblk[i].start_paddr);
-		re = min(end, node_memblk[i].start_paddr +
-			 node_memblk[i].size);
-
-		if (rs < re)
-			(*func)(rs, re - rs, node_memblk[i].nid);
-
-		if (re == end)
-			break;
-	}
-}
-
-/**
- * paging_init - setup page tables
- *
- * paging_init() sets up the page tables for each node of the system and frees
- * the bootmem allocator memory for general use.
- */
-void __init paging_init(void)
-{
-	unsigned long max_dma;
-	unsigned long max_zone_pfns[MAX_NR_ZONES];
-
-	max_dma = virt_to_phys((void *) MAX_DMA_ADDRESS) >> PAGE_SHIFT;
-
-	sparse_init();
-
-	memset(max_zone_pfns, 0, sizeof(max_zone_pfns));
-	max_zone_pfns[ZONE_DMA32] = max_dma;
-	max_zone_pfns[ZONE_NORMAL] = max_low_pfn;
-	free_area_init(max_zone_pfns);
-
-	zero_page_memmap_ptr = virt_to_page(ia64_imva(empty_zero_page));
-}
-
-pg_data_t * __init arch_alloc_nodedata(int nid)
-{
-	unsigned long size = compute_pernodesize(nid);
-
-	return memblock_alloc(size, SMP_CACHE_BYTES);
-}
-
-void arch_refresh_nodedata(int update_node, pg_data_t *update_pgdat)
-{
-	pgdat_list[update_node] = update_pgdat;
-	scatter_node_data();
-}
-
-#ifdef CONFIG_SPARSEMEM_VMEMMAP
-int __meminit vmemmap_populate(unsigned long start, unsigned long end, int node,
-		struct vmem_altmap *altmap)
-{
-	return vmemmap_populate_basepages(start, end, node, NULL);
-}
-
-void vmemmap_free(unsigned long start, unsigned long end,
-		struct vmem_altmap *altmap)
-{
-}
-#endif
diff --git a/arch/ia64/mm/extable.c b/arch/ia64/mm/extable.c
deleted file mode 100644
index da477c11770b..000000000000
--- a/arch/ia64/mm/extable.c
+++ /dev/null
@@ -1,24 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * Kernel exception handling table support.  Derived from arch/alpha/mm/extable.c.
- *
- * Copyright (C) 1998, 1999, 2001-2002, 2004 Hewlett-Packard Co
- *	David Mosberger-Tang <davidm@hpl.hp.com>
- */
-
-#include <asm/ptrace.h>
-#include <asm/extable.h>
-#include <asm/errno.h>
-#include <asm/processor.h>
-
-void
-ia64_handle_exception (struct pt_regs *regs, const struct exception_table_entry *e)
-{
-	long fix = (u64) &e->fixup + e->fixup;
-
-	regs->r8 = -EFAULT;
-	if (fix & 4)
-		regs->r9 = 0;
-	regs->cr_iip = fix & ~0xf;
-	ia64_psr(regs)->ri = fix & 0x3;		/* set continuation slot number */
-}
diff --git a/arch/ia64/mm/fault.c b/arch/ia64/mm/fault.c
deleted file mode 100644
index 5458b52b4009..000000000000
--- a/arch/ia64/mm/fault.c
+++ /dev/null
@@ -1,251 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * MMU fault handling support.
- *
- * Copyright (C) 1998-2002 Hewlett-Packard Co
- *	David Mosberger-Tang <davidm@hpl.hp.com>
- */
-#include <linux/sched/signal.h>
-#include <linux/kernel.h>
-#include <linux/mm.h>
-#include <linux/extable.h>
-#include <linux/interrupt.h>
-#include <linux/kprobes.h>
-#include <linux/kdebug.h>
-#include <linux/prefetch.h>
-#include <linux/uaccess.h>
-#include <linux/perf_event.h>
-
-#include <asm/processor.h>
-#include <asm/exception.h>
-
-extern int die(char *, struct pt_regs *, long);
-
-/*
- * Return TRUE if ADDRESS points at a page in the kernel's mapped segment
- * (inside region 5, on ia64) and that page is present.
- */
-static int
-mapped_kernel_page_is_present (unsigned long address)
-{
-	pgd_t *pgd;
-	p4d_t *p4d;
-	pud_t *pud;
-	pmd_t *pmd;
-	pte_t *ptep, pte;
-
-	pgd = pgd_offset_k(address);
-	if (pgd_none(*pgd) || pgd_bad(*pgd))
-		return 0;
-
-	p4d = p4d_offset(pgd, address);
-	if (p4d_none(*p4d) || p4d_bad(*p4d))
-		return 0;
-
-	pud = pud_offset(p4d, address);
-	if (pud_none(*pud) || pud_bad(*pud))
-		return 0;
-
-	pmd = pmd_offset(pud, address);
-	if (pmd_none(*pmd) || pmd_bad(*pmd))
-		return 0;
-
-	ptep = pte_offset_kernel(pmd, address);
-	if (!ptep)
-		return 0;
-
-	pte = *ptep;
-	return pte_present(pte);
-}
-
-#	define VM_READ_BIT	0
-#	define VM_WRITE_BIT	1
-#	define VM_EXEC_BIT	2
-
-void __kprobes
-ia64_do_page_fault (unsigned long address, unsigned long isr, struct pt_regs *regs)
-{
-	int signal = SIGSEGV, code = SEGV_MAPERR;
-	struct vm_area_struct *vma, *prev_vma;
-	struct mm_struct *mm = current->mm;
-	unsigned long mask;
-	vm_fault_t fault;
-	unsigned int flags = FAULT_FLAG_DEFAULT;
-
-	mask = ((((isr >> IA64_ISR_X_BIT) & 1UL) << VM_EXEC_BIT)
-		| (((isr >> IA64_ISR_W_BIT) & 1UL) << VM_WRITE_BIT));
-
-	/* mmap_lock is performance critical.... */
-	prefetchw(&mm->mmap_lock);
-
-	/*
-	 * If we're in an interrupt or have no user context, we must not take the fault..
-	 */
-	if (faulthandler_disabled() || !mm)
-		goto no_context;
-
-	/*
-	 * This is to handle the kprobes on user space access instructions
-	 */
-	if (kprobe_page_fault(regs, TRAP_BRKPT))
-		return;
-
-	if (user_mode(regs))
-		flags |= FAULT_FLAG_USER;
-	if (mask & VM_WRITE)
-		flags |= FAULT_FLAG_WRITE;
-
-	perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, address);
-retry:
-	mmap_read_lock(mm);
-
-	vma = find_vma_prev(mm, address, &prev_vma);
-	if (!vma && !prev_vma )
-		goto bad_area;
-
-        /*
-         * find_vma_prev() returns vma such that address < vma->vm_end or NULL
-         *
-         * May find no vma, but could be that the last vm area is the
-         * register backing store that needs to expand upwards, in
-         * this case vma will be null, but prev_vma will ne non-null
-         */
-        if (( !vma && prev_vma ) || (address < vma->vm_start) ) {
-		vma = expand_stack(mm, address);
-		if (!vma)
-			goto bad_area_nosemaphore;
-	}
-
-	code = SEGV_ACCERR;
-
-	/* OK, we've got a good vm_area for this memory area.  Check the access permissions: */
-
-#	if (((1 << VM_READ_BIT) != VM_READ || (1 << VM_WRITE_BIT) != VM_WRITE) \
-	    || (1 << VM_EXEC_BIT) != VM_EXEC)
-#		error File is out of sync with <linux/mm.h>.  Please update.
-#	endif
-
-	if (((isr >> IA64_ISR_R_BIT) & 1UL) && (!(vma->vm_flags & (VM_READ | VM_WRITE))))
-		goto bad_area;
-
-	if ((vma->vm_flags & mask) != mask)
-		goto bad_area;
-
-	/*
-	 * If for any reason at all we couldn't handle the fault, make
-	 * sure we exit gracefully rather than endlessly redo the
-	 * fault.
-	 */
-	fault = handle_mm_fault(vma, address, flags, regs);
-
-	if (fault_signal_pending(fault, regs)) {
-		if (!user_mode(regs))
-			goto no_context;
-		return;
-	}
-
-	/* The fault is fully completed (including releasing mmap lock) */
-	if (fault & VM_FAULT_COMPLETED)
-		return;
-
-	if (unlikely(fault & VM_FAULT_ERROR)) {
-		/*
-		 * We ran out of memory, or some other thing happened
-		 * to us that made us unable to handle the page fault
-		 * gracefully.
-		 */
-		if (fault & VM_FAULT_OOM) {
-			goto out_of_memory;
-		} else if (fault & VM_FAULT_SIGSEGV) {
-			goto bad_area;
-		} else if (fault & VM_FAULT_SIGBUS) {
-			signal = SIGBUS;
-			goto bad_area;
-		}
-		BUG();
-	}
-
-	if (fault & VM_FAULT_RETRY) {
-		flags |= FAULT_FLAG_TRIED;
-
-		/* No need to mmap_read_unlock(mm) as we would
-		 * have already released it in __lock_page_or_retry
-		 * in mm/filemap.c.
-		 */
-
-		goto retry;
-	}
-
-	mmap_read_unlock(mm);
-	return;
-
-  bad_area:
-	mmap_read_unlock(mm);
-  bad_area_nosemaphore:
-	if ((isr & IA64_ISR_SP)
-	    || ((isr & IA64_ISR_NA) && (isr & IA64_ISR_CODE_MASK) == IA64_ISR_CODE_LFETCH))
-	{
-		/*
-		 * This fault was due to a speculative load or lfetch.fault, set the "ed"
-		 * bit in the psr to ensure forward progress.  (Target register will get a
-		 * NaT for ld.s, lfetch will be canceled.)
-		 */
-		ia64_psr(regs)->ed = 1;
-		return;
-	}
-	if (user_mode(regs)) {
-		force_sig_fault(signal, code, (void __user *) address,
-				0, __ISR_VALID, isr);
-		return;
-	}
-
-  no_context:
-	if ((isr & IA64_ISR_SP)
-	    || ((isr & IA64_ISR_NA) && (isr & IA64_ISR_CODE_MASK) == IA64_ISR_CODE_LFETCH))
-	{
-		/*
-		 * This fault was due to a speculative load or lfetch.fault, set the "ed"
-		 * bit in the psr to ensure forward progress.  (Target register will get a
-		 * NaT for ld.s, lfetch will be canceled.)
-		 */
-		ia64_psr(regs)->ed = 1;
-		return;
-	}
-
-	/*
-	 * Since we have no vma's for region 5, we might get here even if the address is
-	 * valid, due to the VHPT walker inserting a non present translation that becomes
-	 * stale. If that happens, the non present fault handler already purged the stale
-	 * translation, which fixed the problem. So, we check to see if the translation is
-	 * valid, and return if it is.
-	 */
-	if (REGION_NUMBER(address) == 5 && mapped_kernel_page_is_present(address))
-		return;
-
-	if (ia64_done_with_exception(regs))
-		return;
-
-	/*
-	 * Oops. The kernel tried to access some bad page. We'll have to terminate things
-	 * with extreme prejudice.
-	 */
-	bust_spinlocks(1);
-
-	if (address < PAGE_SIZE)
-		printk(KERN_ALERT "Unable to handle kernel NULL pointer dereference (address %016lx)\n", address);
-	else
-		printk(KERN_ALERT "Unable to handle kernel paging request at "
-		       "virtual address %016lx\n", address);
-	if (die("Oops", regs, isr))
-		regs = NULL;
-	bust_spinlocks(0);
-	if (regs)
-		make_task_dead(SIGKILL);
-	return;
-
-  out_of_memory:
-	mmap_read_unlock(mm);
-	if (!user_mode(regs))
-		goto no_context;
-	pagefault_out_of_memory();
-}
diff --git a/arch/ia64/mm/hugetlbpage.c b/arch/ia64/mm/hugetlbpage.c
deleted file mode 100644
index adc49f2d22e8..000000000000
--- a/arch/ia64/mm/hugetlbpage.c
+++ /dev/null
@@ -1,186 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * IA-64 Huge TLB Page Support for Kernel.
- *
- * Copyright (C) 2002-2004 Rohit Seth <rohit.seth@intel.com>
- * Copyright (C) 2003-2004 Ken Chen <kenneth.w.chen@intel.com>
- *
- * Sep, 2003: add numa support
- * Feb, 2004: dynamic hugetlb page size via boot parameter
- */
-
-#include <linux/init.h>
-#include <linux/fs.h>
-#include <linux/mm.h>
-#include <linux/hugetlb.h>
-#include <linux/pagemap.h>
-#include <linux/module.h>
-#include <linux/sysctl.h>
-#include <linux/log2.h>
-#include <asm/mman.h>
-#include <asm/tlb.h>
-#include <asm/tlbflush.h>
-
-unsigned int hpage_shift = HPAGE_SHIFT_DEFAULT;
-EXPORT_SYMBOL(hpage_shift);
-
-pte_t *
-huge_pte_alloc(struct mm_struct *mm, struct vm_area_struct *vma,
-	       unsigned long addr, unsigned long sz)
-{
-	unsigned long taddr = htlbpage_to_page(addr);
-	pgd_t *pgd;
-	p4d_t *p4d;
-	pud_t *pud;
-	pmd_t *pmd;
-	pte_t *pte = NULL;
-
-	pgd = pgd_offset(mm, taddr);
-	p4d = p4d_offset(pgd, taddr);
-	pud = pud_alloc(mm, p4d, taddr);
-	if (pud) {
-		pmd = pmd_alloc(mm, pud, taddr);
-		if (pmd)
-			pte = pte_alloc_huge(mm, pmd, taddr);
-	}
-	return pte;
-}
-
-pte_t *
-huge_pte_offset (struct mm_struct *mm, unsigned long addr, unsigned long sz)
-{
-	unsigned long taddr = htlbpage_to_page(addr);
-	pgd_t *pgd;
-	p4d_t *p4d;
-	pud_t *pud;
-	pmd_t *pmd;
-	pte_t *pte = NULL;
-
-	pgd = pgd_offset(mm, taddr);
-	if (pgd_present(*pgd)) {
-		p4d = p4d_offset(pgd, taddr);
-		if (p4d_present(*p4d)) {
-			pud = pud_offset(p4d, taddr);
-			if (pud_present(*pud)) {
-				pmd = pmd_offset(pud, taddr);
-				if (pmd_present(*pmd))
-					pte = pte_offset_huge(pmd, taddr);
-			}
-		}
-	}
-
-	return pte;
-}
-
-#define mk_pte_huge(entry) { pte_val(entry) |= _PAGE_P; }
-
-/*
- * Don't actually need to do any preparation, but need to make sure
- * the address is in the right region.
- */
-int prepare_hugepage_range(struct file *file,
-			unsigned long addr, unsigned long len)
-{
-	if (len & ~HPAGE_MASK)
-		return -EINVAL;
-	if (addr & ~HPAGE_MASK)
-		return -EINVAL;
-	if (REGION_NUMBER(addr) != RGN_HPAGE)
-		return -EINVAL;
-
-	return 0;
-}
-
-int pmd_huge(pmd_t pmd)
-{
-	return 0;
-}
-
-int pud_huge(pud_t pud)
-{
-	return 0;
-}
-
-void hugetlb_free_pgd_range(struct mmu_gather *tlb,
-			unsigned long addr, unsigned long end,
-			unsigned long floor, unsigned long ceiling)
-{
-	/*
-	 * This is called to free hugetlb page tables.
-	 *
-	 * The offset of these addresses from the base of the hugetlb
-	 * region must be scaled down by HPAGE_SIZE/PAGE_SIZE so that
-	 * the standard free_pgd_range will free the right page tables.
-	 *
-	 * If floor and ceiling are also in the hugetlb region, they
-	 * must likewise be scaled down; but if outside, left unchanged.
-	 */
-
-	addr = htlbpage_to_page(addr);
-	end  = htlbpage_to_page(end);
-	if (REGION_NUMBER(floor) == RGN_HPAGE)
-		floor = htlbpage_to_page(floor);
-	if (REGION_NUMBER(ceiling) == RGN_HPAGE)
-		ceiling = htlbpage_to_page(ceiling);
-
-	free_pgd_range(tlb, addr, end, floor, ceiling);
-}
-
-unsigned long hugetlb_get_unmapped_area(struct file *file, unsigned long addr, unsigned long len,
-		unsigned long pgoff, unsigned long flags)
-{
-	struct vm_unmapped_area_info info;
-
-	if (len > RGN_MAP_LIMIT)
-		return -ENOMEM;
-	if (len & ~HPAGE_MASK)
-		return -EINVAL;
-
-	/* Handle MAP_FIXED */
-	if (flags & MAP_FIXED) {
-		if (prepare_hugepage_range(file, addr, len))
-			return -EINVAL;
-		return addr;
-	}
-
-	/* This code assumes that RGN_HPAGE != 0. */
-	if ((REGION_NUMBER(addr) != RGN_HPAGE) || (addr & (HPAGE_SIZE - 1)))
-		addr = HPAGE_REGION_BASE;
-
-	info.flags = 0;
-	info.length = len;
-	info.low_limit = addr;
-	info.high_limit = HPAGE_REGION_BASE + RGN_MAP_LIMIT;
-	info.align_mask = PAGE_MASK & (HPAGE_SIZE - 1);
-	info.align_offset = 0;
-	return vm_unmapped_area(&info);
-}
-
-static int __init hugetlb_setup_sz(char *str)
-{
-	u64 tr_pages;
-	unsigned long long size;
-
-	if (ia64_pal_vm_page_size(&tr_pages, NULL) != 0)
-		/*
-		 * shouldn't happen, but just in case.
-		 */
-		tr_pages = 0x15557000UL;
-
-	size = memparse(str, &str);
-	if (*str || !is_power_of_2(size) || !(tr_pages & size) ||
-		size <= PAGE_SIZE ||
-		size > (1UL << PAGE_SHIFT << MAX_ORDER)) {
-		printk(KERN_WARNING "Invalid huge page size specified\n");
-		return 1;
-	}
-
-	hpage_shift = __ffs(size);
-	/*
-	 * boot cpu already executed ia64_mmu_init, and has HPAGE_SHIFT_DEFAULT
-	 * override here with new page shift.
-	 */
-	ia64_set_rr(HPAGE_REGION_BASE, hpage_shift << 2);
-	return 0;
-}
-early_param("hugepagesz", hugetlb_setup_sz);
diff --git a/arch/ia64/mm/init.c b/arch/ia64/mm/init.c
deleted file mode 100644
index 05b0f2f0c073..000000000000
--- a/arch/ia64/mm/init.c
+++ /dev/null
@@ -1,532 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * Initialize MMU support.
- *
- * Copyright (C) 1998-2003 Hewlett-Packard Co
- *	David Mosberger-Tang <davidm@hpl.hp.com>
- */
-#include <linux/kernel.h>
-#include <linux/init.h>
-
-#include <linux/dma-map-ops.h>
-#include <linux/dmar.h>
-#include <linux/efi.h>
-#include <linux/elf.h>
-#include <linux/memblock.h>
-#include <linux/mm.h>
-#include <linux/sched/signal.h>
-#include <linux/mmzone.h>
-#include <linux/module.h>
-#include <linux/personality.h>
-#include <linux/reboot.h>
-#include <linux/slab.h>
-#include <linux/swap.h>
-#include <linux/proc_fs.h>
-#include <linux/bitops.h>
-#include <linux/kexec.h>
-#include <linux/swiotlb.h>
-
-#include <asm/dma.h>
-#include <asm/efi.h>
-#include <asm/io.h>
-#include <asm/numa.h>
-#include <asm/patch.h>
-#include <asm/pgalloc.h>
-#include <asm/sal.h>
-#include <asm/sections.h>
-#include <asm/tlb.h>
-#include <linux/uaccess.h>
-#include <asm/unistd.h>
-#include <asm/mca.h>
-
-extern void ia64_tlb_init (void);
-
-unsigned long MAX_DMA_ADDRESS = PAGE_OFFSET + 0x100000000UL;
-
-struct page *zero_page_memmap_ptr;	/* map entry for zero page */
-EXPORT_SYMBOL(zero_page_memmap_ptr);
-
-void
-__ia64_sync_icache_dcache (pte_t pte)
-{
-	unsigned long addr;
-	struct folio *folio;
-
-	folio = page_folio(pte_page(pte));
-	addr = (unsigned long)folio_address(folio);
-
-	if (test_bit(PG_arch_1, &folio->flags))
-		return;				/* i-cache is already coherent with d-cache */
-
-	flush_icache_range(addr, addr + folio_size(folio));
-	set_bit(PG_arch_1, &folio->flags);	/* mark page as clean */
-}
-
-/*
- * Since DMA is i-cache coherent, any (complete) folios that were written via
- * DMA can be marked as "clean" so that lazy_mmu_prot_update() doesn't have to
- * flush them when they get mapped into an executable vm-area.
- */
-void arch_dma_mark_clean(phys_addr_t paddr, size_t size)
-{
-	unsigned long pfn = PHYS_PFN(paddr);
-	struct folio *folio = page_folio(pfn_to_page(pfn));
-	ssize_t left = size;
-	size_t offset = offset_in_folio(folio, paddr);
-
-	if (offset) {
-		left -= folio_size(folio) - offset;
-		if (left <= 0)
-			return;
-		folio = folio_next(folio);
-	}
-
-	while (left >= (ssize_t)folio_size(folio)) {
-		left -= folio_size(folio);
-		set_bit(PG_arch_1, &pfn_to_page(pfn)->flags);
-		if (!left)
-			break;
-		folio = folio_next(folio);
-	}
-}
-
-inline void
-ia64_set_rbs_bot (void)
-{
-	unsigned long stack_size = rlimit_max(RLIMIT_STACK) & -16;
-
-	if (stack_size > MAX_USER_STACK_SIZE)
-		stack_size = MAX_USER_STACK_SIZE;
-	current->thread.rbs_bot = PAGE_ALIGN(current->mm->start_stack - stack_size);
-}
-
-/*
- * This performs some platform-dependent address space initialization.
- * On IA-64, we want to setup the VM area for the register backing
- * store (which grows upwards) and install the gateway page which is
- * used for signal trampolines, etc.
- */
-void
-ia64_init_addr_space (void)
-{
-	struct vm_area_struct *vma;
-
-	ia64_set_rbs_bot();
-
-	/*
-	 * If we're out of memory and kmem_cache_alloc() returns NULL, we simply ignore
-	 * the problem.  When the process attempts to write to the register backing store
-	 * for the first time, it will get a SEGFAULT in this case.
-	 */
-	vma = vm_area_alloc(current->mm);
-	if (vma) {
-		vma_set_anonymous(vma);
-		vma->vm_start = current->thread.rbs_bot & PAGE_MASK;
-		vma->vm_end = vma->vm_start + PAGE_SIZE;
-		vm_flags_init(vma, VM_DATA_DEFAULT_FLAGS|VM_GROWSUP|VM_ACCOUNT);
-		vma->vm_page_prot = vm_get_page_prot(vma->vm_flags);
-		mmap_write_lock(current->mm);
-		if (insert_vm_struct(current->mm, vma)) {
-			mmap_write_unlock(current->mm);
-			vm_area_free(vma);
-			return;
-		}
-		mmap_write_unlock(current->mm);
-	}
-
-	/* map NaT-page at address zero to speed up speculative dereferencing of NULL: */
-	if (!(current->personality & MMAP_PAGE_ZERO)) {
-		vma = vm_area_alloc(current->mm);
-		if (vma) {
-			vma_set_anonymous(vma);
-			vma->vm_end = PAGE_SIZE;
-			vma->vm_page_prot = __pgprot(pgprot_val(PAGE_READONLY) | _PAGE_MA_NAT);
-			vm_flags_init(vma, VM_READ | VM_MAYREAD | VM_IO |
-				      VM_DONTEXPAND | VM_DONTDUMP);
-			mmap_write_lock(current->mm);
-			if (insert_vm_struct(current->mm, vma)) {
-				mmap_write_unlock(current->mm);
-				vm_area_free(vma);
-				return;
-			}
-			mmap_write_unlock(current->mm);
-		}
-	}
-}
-
-void
-free_initmem (void)
-{
-	free_reserved_area(ia64_imva(__init_begin), ia64_imva(__init_end),
-			   -1, "unused kernel");
-}
-
-void __init
-free_initrd_mem (unsigned long start, unsigned long end)
-{
-	/*
-	 * EFI uses 4KB pages while the kernel can use 4KB or bigger.
-	 * Thus EFI and the kernel may have different page sizes. It is
-	 * therefore possible to have the initrd share the same page as
-	 * the end of the kernel (given current setup).
-	 *
-	 * To avoid freeing/using the wrong page (kernel sized) we:
-	 *	- align up the beginning of initrd
-	 *	- align down the end of initrd
-	 *
-	 *  |             |
-	 *  |=============| a000
-	 *  |             |
-	 *  |             |
-	 *  |             | 9000
-	 *  |/////////////|
-	 *  |/////////////|
-	 *  |=============| 8000
-	 *  |///INITRD////|
-	 *  |/////////////|
-	 *  |/////////////| 7000
-	 *  |             |
-	 *  |KKKKKKKKKKKKK|
-	 *  |=============| 6000
-	 *  |KKKKKKKKKKKKK|
-	 *  |KKKKKKKKKKKKK|
-	 *  K=kernel using 8KB pages
-	 *
-	 * In this example, we must free page 8000 ONLY. So we must align up
-	 * initrd_start and keep initrd_end as is.
-	 */
-	start = PAGE_ALIGN(start);
-	end = end & PAGE_MASK;
-
-	if (start < end)
-		printk(KERN_INFO "Freeing initrd memory: %ldkB freed\n", (end - start) >> 10);
-
-	for (; start < end; start += PAGE_SIZE) {
-		if (!virt_addr_valid(start))
-			continue;
-		free_reserved_page(virt_to_page(start));
-	}
-}
-
-/*
- * This installs a clean page in the kernel's page table.
- */
-static struct page * __init
-put_kernel_page (struct page *page, unsigned long address, pgprot_t pgprot)
-{
-	pgd_t *pgd;
-	p4d_t *p4d;
-	pud_t *pud;
-	pmd_t *pmd;
-	pte_t *pte;
-
-	pgd = pgd_offset_k(address);		/* note: this is NOT pgd_offset()! */
-
-	{
-		p4d = p4d_alloc(&init_mm, pgd, address);
-		if (!p4d)
-			goto out;
-		pud = pud_alloc(&init_mm, p4d, address);
-		if (!pud)
-			goto out;
-		pmd = pmd_alloc(&init_mm, pud, address);
-		if (!pmd)
-			goto out;
-		pte = pte_alloc_kernel(pmd, address);
-		if (!pte)
-			goto out;
-		if (!pte_none(*pte))
-			goto out;
-		set_pte(pte, mk_pte(page, pgprot));
-	}
-  out:
-	/* no need for flush_tlb */
-	return page;
-}
-
-static void __init
-setup_gate (void)
-{
-	struct page *page;
-
-	/*
-	 * Map the gate page twice: once read-only to export the ELF
-	 * headers etc. and once execute-only page to enable
-	 * privilege-promotion via "epc":
-	 */
-	page = virt_to_page(ia64_imva(__start_gate_section));
-	put_kernel_page(page, GATE_ADDR, PAGE_READONLY);
-#ifdef HAVE_BUGGY_SEGREL
-	page = virt_to_page(ia64_imva(__start_gate_section + PAGE_SIZE));
-	put_kernel_page(page, GATE_ADDR + PAGE_SIZE, PAGE_GATE);
-#else
-	put_kernel_page(page, GATE_ADDR + PERCPU_PAGE_SIZE, PAGE_GATE);
-	/* Fill in the holes (if any) with read-only zero pages: */
-	{
-		unsigned long addr;
-
-		for (addr = GATE_ADDR + PAGE_SIZE;
-		     addr < GATE_ADDR + PERCPU_PAGE_SIZE;
-		     addr += PAGE_SIZE)
-		{
-			put_kernel_page(ZERO_PAGE(0), addr,
-					PAGE_READONLY);
-			put_kernel_page(ZERO_PAGE(0), addr + PERCPU_PAGE_SIZE,
-					PAGE_READONLY);
-		}
-	}
-#endif
-	ia64_patch_gate();
-}
-
-static struct vm_area_struct gate_vma;
-
-static int __init gate_vma_init(void)
-{
-	vma_init(&gate_vma, NULL);
-	gate_vma.vm_start = FIXADDR_USER_START;
-	gate_vma.vm_end = FIXADDR_USER_END;
-	vm_flags_init(&gate_vma, VM_READ | VM_MAYREAD | VM_EXEC | VM_MAYEXEC);
-	gate_vma.vm_page_prot = __pgprot(__ACCESS_BITS | _PAGE_PL_3 | _PAGE_AR_RX);
-
-	return 0;
-}
-__initcall(gate_vma_init);
-
-struct vm_area_struct *get_gate_vma(struct mm_struct *mm)
-{
-	return &gate_vma;
-}
-
-int in_gate_area_no_mm(unsigned long addr)
-{
-	if ((addr >= FIXADDR_USER_START) && (addr < FIXADDR_USER_END))
-		return 1;
-	return 0;
-}
-
-int in_gate_area(struct mm_struct *mm, unsigned long addr)
-{
-	return in_gate_area_no_mm(addr);
-}
-
-void ia64_mmu_init(void *my_cpu_data)
-{
-	unsigned long pta, impl_va_bits;
-	extern void tlb_init(void);
-
-#ifdef CONFIG_DISABLE_VHPT
-#	define VHPT_ENABLE_BIT	0
-#else
-#	define VHPT_ENABLE_BIT	1
-#endif
-
-	/*
-	 * Check if the virtually mapped linear page table (VMLPT) overlaps with a mapped
-	 * address space.  The IA-64 architecture guarantees that at least 50 bits of
-	 * virtual address space are implemented but if we pick a large enough page size
-	 * (e.g., 64KB), the mapped address space is big enough that it will overlap with
-	 * VMLPT.  I assume that once we run on machines big enough to warrant 64KB pages,
-	 * IMPL_VA_MSB will be significantly bigger, so this is unlikely to become a
-	 * problem in practice.  Alternatively, we could truncate the top of the mapped
-	 * address space to not permit mappings that would overlap with the VMLPT.
-	 * --davidm 00/12/06
-	 */
-#	define pte_bits			3
-#	define mapped_space_bits	(3*(PAGE_SHIFT - pte_bits) + PAGE_SHIFT)
-	/*
-	 * The virtual page table has to cover the entire implemented address space within
-	 * a region even though not all of this space may be mappable.  The reason for
-	 * this is that the Access bit and Dirty bit fault handlers perform
-	 * non-speculative accesses to the virtual page table, so the address range of the
-	 * virtual page table itself needs to be covered by virtual page table.
-	 */
-#	define vmlpt_bits		(impl_va_bits - PAGE_SHIFT + pte_bits)
-#	define POW2(n)			(1ULL << (n))
-
-	impl_va_bits = ffz(~(local_cpu_data->unimpl_va_mask | (7UL << 61)));
-
-	if (impl_va_bits < 51 || impl_va_bits > 61)
-		panic("CPU has bogus IMPL_VA_MSB value of %lu!\n", impl_va_bits - 1);
-	/*
-	 * mapped_space_bits - PAGE_SHIFT is the total number of ptes we need,
-	 * which must fit into "vmlpt_bits - pte_bits" slots. Second half of
-	 * the test makes sure that our mapped space doesn't overlap the
-	 * unimplemented hole in the middle of the region.
-	 */
-	if ((mapped_space_bits - PAGE_SHIFT > vmlpt_bits - pte_bits) ||
-	    (mapped_space_bits > impl_va_bits - 1))
-		panic("Cannot build a big enough virtual-linear page table"
-		      " to cover mapped address space.\n"
-		      " Try using a smaller page size.\n");
-
-
-	/* place the VMLPT at the end of each page-table mapped region: */
-	pta = POW2(61) - POW2(vmlpt_bits);
-
-	/*
-	 * Set the (virtually mapped linear) page table address.  Bit
-	 * 8 selects between the short and long format, bits 2-7 the
-	 * size of the table, and bit 0 whether the VHPT walker is
-	 * enabled.
-	 */
-	ia64_set_pta(pta | (0 << 8) | (vmlpt_bits << 2) | VHPT_ENABLE_BIT);
-
-	ia64_tlb_init();
-
-#ifdef	CONFIG_HUGETLB_PAGE
-	ia64_set_rr(HPAGE_REGION_BASE, HPAGE_SHIFT << 2);
-	ia64_srlz_d();
-#endif
-}
-
-int __init register_active_ranges(u64 start, u64 len, int nid)
-{
-	u64 end = start + len;
-
-#ifdef CONFIG_KEXEC
-	if (start > crashk_res.start && start < crashk_res.end)
-		start = crashk_res.end;
-	if (end > crashk_res.start && end < crashk_res.end)
-		end = crashk_res.start;
-#endif
-
-	if (start < end)
-		memblock_add_node(__pa(start), end - start, nid, MEMBLOCK_NONE);
-	return 0;
-}
-
-int
-find_max_min_low_pfn (u64 start, u64 end, void *arg)
-{
-	unsigned long pfn_start, pfn_end;
-#ifdef CONFIG_FLATMEM
-	pfn_start = (PAGE_ALIGN(__pa(start))) >> PAGE_SHIFT;
-	pfn_end = (PAGE_ALIGN(__pa(end - 1))) >> PAGE_SHIFT;
-#else
-	pfn_start = GRANULEROUNDDOWN(__pa(start)) >> PAGE_SHIFT;
-	pfn_end = GRANULEROUNDUP(__pa(end - 1)) >> PAGE_SHIFT;
-#endif
-	min_low_pfn = min(min_low_pfn, pfn_start);
-	max_low_pfn = max(max_low_pfn, pfn_end);
-	return 0;
-}
-
-/*
- * Boot command-line option "nolwsys" can be used to disable the use of any light-weight
- * system call handler.  When this option is in effect, all fsyscalls will end up bubbling
- * down into the kernel and calling the normal (heavy-weight) syscall handler.  This is
- * useful for performance testing, but conceivably could also come in handy for debugging
- * purposes.
- */
-
-static int nolwsys __initdata;
-
-static int __init
-nolwsys_setup (char *s)
-{
-	nolwsys = 1;
-	return 1;
-}
-
-__setup("nolwsys", nolwsys_setup);
-
-void __init
-mem_init (void)
-{
-	int i;
-
-	BUG_ON(PTRS_PER_PGD * sizeof(pgd_t) != PAGE_SIZE);
-	BUG_ON(PTRS_PER_PMD * sizeof(pmd_t) != PAGE_SIZE);
-	BUG_ON(PTRS_PER_PTE * sizeof(pte_t) != PAGE_SIZE);
-
-	/*
-	 * This needs to be called _after_ the command line has been parsed but
-	 * _before_ any drivers that may need the PCI DMA interface are
-	 * initialized or bootmem has been freed.
-	 */
-	do {
-#ifdef CONFIG_INTEL_IOMMU
-		detect_intel_iommu();
-		if (iommu_detected)
-			break;
-#endif
-		swiotlb_init(true, SWIOTLB_VERBOSE);
-	} while (0);
-
-#ifdef CONFIG_FLATMEM
-	BUG_ON(!mem_map);
-#endif
-
-	set_max_mapnr(max_low_pfn);
-	high_memory = __va(max_low_pfn * PAGE_SIZE);
-	memblock_free_all();
-
-	/*
-	 * For fsyscall entrypoints with no light-weight handler, use the ordinary
-	 * (heavy-weight) handler, but mark it by setting bit 0, so the fsyscall entry
-	 * code can tell them apart.
-	 */
-	for (i = 0; i < NR_syscalls; ++i) {
-		extern unsigned long fsyscall_table[NR_syscalls];
-		extern unsigned long sys_call_table[NR_syscalls];
-
-		if (!fsyscall_table[i] || nolwsys)
-			fsyscall_table[i] = sys_call_table[i] | 1;
-	}
-	setup_gate();
-}
-
-#ifdef CONFIG_MEMORY_HOTPLUG
-int arch_add_memory(int nid, u64 start, u64 size,
-		    struct mhp_params *params)
-{
-	unsigned long start_pfn = start >> PAGE_SHIFT;
-	unsigned long nr_pages = size >> PAGE_SHIFT;
-	int ret;
-
-	if (WARN_ON_ONCE(params->pgprot.pgprot != PAGE_KERNEL.pgprot))
-		return -EINVAL;
-
-	ret = __add_pages(nid, start_pfn, nr_pages, params);
-	if (ret)
-		printk("%s: Problem encountered in __add_pages() as ret=%d\n",
-		       __func__,  ret);
-
-	return ret;
-}
-
-void arch_remove_memory(u64 start, u64 size, struct vmem_altmap *altmap)
-{
-	unsigned long start_pfn = start >> PAGE_SHIFT;
-	unsigned long nr_pages = size >> PAGE_SHIFT;
-
-	__remove_pages(start_pfn, nr_pages, altmap);
-}
-#endif
-
-static const pgprot_t protection_map[16] = {
-	[VM_NONE]					= PAGE_NONE,
-	[VM_READ]					= PAGE_READONLY,
-	[VM_WRITE]					= PAGE_READONLY,
-	[VM_WRITE | VM_READ]				= PAGE_READONLY,
-	[VM_EXEC]					= __pgprot(__ACCESS_BITS | _PAGE_PL_3 |
-								   _PAGE_AR_X_RX),
-	[VM_EXEC | VM_READ]				= __pgprot(__ACCESS_BITS | _PAGE_PL_3 |
-								   _PAGE_AR_RX),
-	[VM_EXEC | VM_WRITE]				= PAGE_COPY_EXEC,
-	[VM_EXEC | VM_WRITE | VM_READ]			= PAGE_COPY_EXEC,
-	[VM_SHARED]					= PAGE_NONE,
-	[VM_SHARED | VM_READ]				= PAGE_READONLY,
-	[VM_SHARED | VM_WRITE]				= PAGE_SHARED,
-	[VM_SHARED | VM_WRITE | VM_READ]		= PAGE_SHARED,
-	[VM_SHARED | VM_EXEC]				= __pgprot(__ACCESS_BITS | _PAGE_PL_3 |
-								   _PAGE_AR_X_RX),
-	[VM_SHARED | VM_EXEC | VM_READ]			= __pgprot(__ACCESS_BITS | _PAGE_PL_3 |
-								   _PAGE_AR_RX),
-	[VM_SHARED | VM_EXEC | VM_WRITE]		= __pgprot(__ACCESS_BITS | _PAGE_PL_3 |
-								   _PAGE_AR_RWX),
-	[VM_SHARED | VM_EXEC | VM_WRITE | VM_READ]	= __pgprot(__ACCESS_BITS | _PAGE_PL_3 |
-								   _PAGE_AR_RWX)
-};
-DECLARE_VM_GET_PAGE_PROT
diff --git a/arch/ia64/mm/ioremap.c b/arch/ia64/mm/ioremap.c
deleted file mode 100644
index 711b6abc822e..000000000000
--- a/arch/ia64/mm/ioremap.c
+++ /dev/null
@@ -1,94 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * (c) Copyright 2006, 2007 Hewlett-Packard Development Company, L.P.
- *	Bjorn Helgaas <bjorn.helgaas@hp.com>
- */
-
-#include <linux/compiler.h>
-#include <linux/module.h>
-#include <linux/efi.h>
-#include <linux/io.h>
-#include <linux/mm.h>
-#include <linux/vmalloc.h>
-#include <asm/io.h>
-#include <asm/meminit.h>
-
-static inline void __iomem *
-__ioremap_uc(unsigned long phys_addr)
-{
-	return (void __iomem *) (__IA64_UNCACHED_OFFSET | phys_addr);
-}
-
-void __iomem *
-early_ioremap (unsigned long phys_addr, unsigned long size)
-{
-	u64 attr;
-	attr = kern_mem_attribute(phys_addr, size);
-	if (attr & EFI_MEMORY_WB)
-		return (void __iomem *) phys_to_virt(phys_addr);
-	return __ioremap_uc(phys_addr);
-}
-
-void __iomem *ioremap_prot(phys_addr_t phys_addr, size_t size,
-			   unsigned long flags)
-{
-	u64 attr;
-	unsigned long gran_base, gran_size;
-	unsigned long page_base;
-
-	/*
-	 * For things in kern_memmap, we must use the same attribute
-	 * as the rest of the kernel.  For more details, see
-	 * Documentation/arch/ia64/aliasing.rst.
-	 */
-	attr = kern_mem_attribute(phys_addr, size);
-	if (attr & EFI_MEMORY_WB)
-		return (void __iomem *) phys_to_virt(phys_addr);
-	else if (attr & EFI_MEMORY_UC)
-		return __ioremap_uc(phys_addr);
-
-	/*
-	 * Some chipsets don't support UC access to memory.  If
-	 * WB is supported for the whole granule, we prefer that.
-	 */
-	gran_base = GRANULEROUNDDOWN(phys_addr);
-	gran_size = GRANULEROUNDUP(phys_addr + size) - gran_base;
-	if (efi_mem_attribute(gran_base, gran_size) & EFI_MEMORY_WB)
-		return (void __iomem *) phys_to_virt(phys_addr);
-
-	/*
-	 * WB is not supported for the whole granule, so we can't use
-	 * the region 7 identity mapping.  If we can safely cover the
-	 * area with kernel page table mappings, we can use those
-	 * instead.
-	 */
-	page_base = phys_addr & PAGE_MASK;
-	size = PAGE_ALIGN(phys_addr + size) - page_base;
-	if (efi_mem_attribute(page_base, size) & EFI_MEMORY_WB)
-		return generic_ioremap_prot(phys_addr, size, __pgprot(flags));
-
-	return __ioremap_uc(phys_addr);
-}
-EXPORT_SYMBOL(ioremap_prot);
-
-void __iomem *
-ioremap_uc(unsigned long phys_addr, unsigned long size)
-{
-	if (kern_mem_attribute(phys_addr, size) & EFI_MEMORY_WB)
-		return NULL;
-
-	return __ioremap_uc(phys_addr);
-}
-EXPORT_SYMBOL(ioremap_uc);
-
-void
-early_iounmap (volatile void __iomem *addr, unsigned long size)
-{
-}
-
-void iounmap(volatile void __iomem *addr)
-{
-	if (REGION_NUMBER(addr) == RGN_GATE)
-		vunmap((void *) ((unsigned long) addr & PAGE_MASK));
-}
-EXPORT_SYMBOL(iounmap);
diff --git a/arch/ia64/mm/numa.c b/arch/ia64/mm/numa.c
deleted file mode 100644
index 4c7b1f50e3b7..000000000000
--- a/arch/ia64/mm/numa.c
+++ /dev/null
@@ -1,80 +0,0 @@
-/*
- * This file is subject to the terms and conditions of the GNU General Public
- * License.  See the file "COPYING" in the main directory of this archive
- * for more details.
- *
- * This file contains NUMA specific variables and functions which are used on
- * NUMA machines with contiguous memory.
- * 
- *                         2002/08/07 Erich Focht <efocht@ess.nec.de>
- */
-
-#include <linux/cpu.h>
-#include <linux/kernel.h>
-#include <linux/mm.h>
-#include <linux/node.h>
-#include <linux/init.h>
-#include <linux/memblock.h>
-#include <linux/module.h>
-#include <asm/mmzone.h>
-#include <asm/numa.h>
-
-
-/*
- * The following structures are usually initialized by ACPI or
- * similar mechanisms and describe the NUMA characteristics of the machine.
- */
-int num_node_memblks;
-struct node_memblk_s node_memblk[NR_NODE_MEMBLKS];
-struct node_cpuid_s node_cpuid[NR_CPUS] =
-	{ [0 ... NR_CPUS-1] = { .phys_id = 0, .nid = NUMA_NO_NODE } };
-
-/*
- * This is a matrix with "distances" between nodes, they should be
- * proportional to the memory access latency ratios.
- */
-u8 numa_slit[MAX_NUMNODES * MAX_NUMNODES];
-
-int __node_distance(int from, int to)
-{
-	return slit_distance(from, to);
-}
-EXPORT_SYMBOL(__node_distance);
-
-/* Identify which cnode a physical address resides on */
-int
-paddr_to_nid(unsigned long paddr)
-{
-	int	i;
-
-	for (i = 0; i < num_node_memblks; i++)
-		if (paddr >= node_memblk[i].start_paddr &&
-		    paddr < node_memblk[i].start_paddr + node_memblk[i].size)
-			break;
-
-	return (i < num_node_memblks) ? node_memblk[i].nid : (num_node_memblks ? -1 : 0);
-}
-EXPORT_SYMBOL(paddr_to_nid);
-
-#if defined(CONFIG_SPARSEMEM) && defined(CONFIG_NUMA)
-void numa_clear_node(int cpu)
-{
-	unmap_cpu_from_node(cpu, NUMA_NO_NODE);
-}
-
-#ifdef CONFIG_MEMORY_HOTPLUG
-/*
- *  SRAT information is stored in node_memblk[], then we can use SRAT
- *  information at memory-hot-add if necessary.
- */
-
-int memory_add_physaddr_to_nid(u64 addr)
-{
-	int nid = paddr_to_nid(addr);
-	if (nid < 0)
-		return 0;
-	return nid;
-}
-EXPORT_SYMBOL(memory_add_physaddr_to_nid);
-#endif
-#endif
diff --git a/arch/ia64/mm/tlb.c b/arch/ia64/mm/tlb.c
deleted file mode 100644
index ca060e7a2a46..000000000000
--- a/arch/ia64/mm/tlb.c
+++ /dev/null
@@ -1,591 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * TLB support routines.
- *
- * Copyright (C) 1998-2001, 2003 Hewlett-Packard Co
- *	David Mosberger-Tang <davidm@hpl.hp.com>
- *
- * 08/02/00 A. Mallick <asit.k.mallick@intel.com>
- *		Modified RID allocation for SMP
- *          Goutham Rao <goutham.rao@intel.com>
- *              IPI based ptc implementation and A-step IPI implementation.
- * Rohit Seth <rohit.seth@intel.com>
- * Ken Chen <kenneth.w.chen@intel.com>
- * Christophe de Dinechin <ddd@hp.com>: Avoid ptc.e on memory allocation
- * Copyright (C) 2007 Intel Corp
- *	Fenghua Yu <fenghua.yu@intel.com>
- *	Add multiple ptc.g/ptc.ga instruction support in global tlb purge.
- */
-#include <linux/module.h>
-#include <linux/init.h>
-#include <linux/kernel.h>
-#include <linux/sched.h>
-#include <linux/smp.h>
-#include <linux/mm.h>
-#include <linux/memblock.h>
-#include <linux/slab.h>
-
-#include <asm/delay.h>
-#include <asm/mmu_context.h>
-#include <asm/pal.h>
-#include <asm/tlbflush.h>
-#include <asm/dma.h>
-#include <asm/processor.h>
-#include <asm/sal.h>
-#include <asm/tlb.h>
-
-static struct {
-	u64 mask;		/* mask of supported purge page-sizes */
-	unsigned long max_bits;	/* log2 of largest supported purge page-size */
-} purge;
-
-struct ia64_ctx ia64_ctx = {
-	.lock =	__SPIN_LOCK_UNLOCKED(ia64_ctx.lock),
-	.next =	1,
-	.max_ctx = ~0U
-};
-
-DEFINE_PER_CPU(u8, ia64_need_tlb_flush);
-DEFINE_PER_CPU(u8, ia64_tr_num);  /*Number of TR slots in current processor*/
-DEFINE_PER_CPU(u8, ia64_tr_used); /*Max Slot number used by kernel*/
-
-struct ia64_tr_entry *ia64_idtrs[NR_CPUS];
-
-/*
- * Initializes the ia64_ctx.bitmap array based on max_ctx+1.
- * Called after cpu_init() has setup ia64_ctx.max_ctx based on
- * maximum RID that is supported by boot CPU.
- */
-void __init
-mmu_context_init (void)
-{
-	ia64_ctx.bitmap = memblock_alloc((ia64_ctx.max_ctx + 1) >> 3,
-					 SMP_CACHE_BYTES);
-	if (!ia64_ctx.bitmap)
-		panic("%s: Failed to allocate %u bytes\n", __func__,
-		      (ia64_ctx.max_ctx + 1) >> 3);
-	ia64_ctx.flushmap = memblock_alloc((ia64_ctx.max_ctx + 1) >> 3,
-					   SMP_CACHE_BYTES);
-	if (!ia64_ctx.flushmap)
-		panic("%s: Failed to allocate %u bytes\n", __func__,
-		      (ia64_ctx.max_ctx + 1) >> 3);
-}
-
-/*
- * Acquire the ia64_ctx.lock before calling this function!
- */
-void
-wrap_mmu_context (struct mm_struct *mm)
-{
-	int i, cpu;
-	unsigned long flush_bit;
-
-	for (i=0; i <= ia64_ctx.max_ctx / BITS_PER_LONG; i++) {
-		flush_bit = xchg(&ia64_ctx.flushmap[i], 0);
-		ia64_ctx.bitmap[i] ^= flush_bit;
-	}
- 
-	/* use offset at 300 to skip daemons */
-	ia64_ctx.next = find_next_zero_bit(ia64_ctx.bitmap,
-				ia64_ctx.max_ctx, 300);
-	ia64_ctx.limit = find_next_bit(ia64_ctx.bitmap,
-				ia64_ctx.max_ctx, ia64_ctx.next);
-
-	/*
-	 * can't call flush_tlb_all() here because of race condition
-	 * with O(1) scheduler [EF]
-	 */
-	cpu = get_cpu(); /* prevent preemption/migration */
-	for_each_online_cpu(i)
-		if (i != cpu)
-			per_cpu(ia64_need_tlb_flush, i) = 1;
-	put_cpu();
-	local_flush_tlb_all();
-}
-
-/*
- * Implement "spinaphores" ... like counting semaphores, but they
- * spin instead of sleeping.  If there are ever any other users for
- * this primitive it can be moved up to a spinaphore.h header.
- */
-struct spinaphore {
-	unsigned long	ticket;
-	unsigned long	serve;
-};
-
-static inline void spinaphore_init(struct spinaphore *ss, int val)
-{
-	ss->ticket = 0;
-	ss->serve = val;
-}
-
-static inline void down_spin(struct spinaphore *ss)
-{
-	unsigned long t = ia64_fetchadd(1, &ss->ticket, acq), serve;
-
-	if (time_before(t, ss->serve))
-		return;
-
-	ia64_invala();
-
-	for (;;) {
-		asm volatile ("ld8.c.nc %0=[%1]" : "=r"(serve) : "r"(&ss->serve) : "memory");
-		if (time_before(t, serve))
-			return;
-		cpu_relax();
-	}
-}
-
-static inline void up_spin(struct spinaphore *ss)
-{
-	ia64_fetchadd(1, &ss->serve, rel);
-}
-
-static struct spinaphore ptcg_sem;
-static u16 nptcg = 1;
-static int need_ptcg_sem = 1;
-static int toolatetochangeptcgsem = 0;
-
-/*
- * Kernel parameter "nptcg=" overrides max number of concurrent global TLB
- * purges which is reported from either PAL or SAL PALO.
- *
- * We don't have sanity checking for nptcg value. It's the user's responsibility
- * for valid nptcg value on the platform. Otherwise, kernel may hang in some
- * cases.
- */
-static int __init
-set_nptcg(char *str)
-{
-	int value = 0;
-
-	get_option(&str, &value);
-	setup_ptcg_sem(value, NPTCG_FROM_KERNEL_PARAMETER);
-
-	return 1;
-}
-
-__setup("nptcg=", set_nptcg);
-
-/*
- * Maximum number of simultaneous ptc.g purges in the system can
- * be defined by PAL_VM_SUMMARY (in which case we should take
- * the smallest value for any cpu in the system) or by the PAL
- * override table (in which case we should ignore the value from
- * PAL_VM_SUMMARY).
- *
- * Kernel parameter "nptcg=" overrides maximum number of simultaneous ptc.g
- * purges defined in either PAL_VM_SUMMARY or PAL override table. In this case,
- * we should ignore the value from either PAL_VM_SUMMARY or PAL override table.
- *
- * Complicating the logic here is the fact that num_possible_cpus()
- * isn't fully setup until we start bringing cpus online.
- */
-void
-setup_ptcg_sem(int max_purges, int nptcg_from)
-{
-	static int kp_override;
-	static int palo_override;
-	static int firstcpu = 1;
-
-	if (toolatetochangeptcgsem) {
-		if (nptcg_from == NPTCG_FROM_PAL && max_purges == 0)
-			BUG_ON(1 < nptcg);
-		else
-			BUG_ON(max_purges < nptcg);
-		return;
-	}
-
-	if (nptcg_from == NPTCG_FROM_KERNEL_PARAMETER) {
-		kp_override = 1;
-		nptcg = max_purges;
-		goto resetsema;
-	}
-	if (kp_override) {
-		need_ptcg_sem = num_possible_cpus() > nptcg;
-		return;
-	}
-
-	if (nptcg_from == NPTCG_FROM_PALO) {
-		palo_override = 1;
-
-		/* In PALO max_purges == 0 really means it! */
-		if (max_purges == 0)
-			panic("Whoa! Platform does not support global TLB purges.\n");
-		nptcg = max_purges;
-		if (nptcg == PALO_MAX_TLB_PURGES) {
-			need_ptcg_sem = 0;
-			return;
-		}
-		goto resetsema;
-	}
-	if (palo_override) {
-		if (nptcg != PALO_MAX_TLB_PURGES)
-			need_ptcg_sem = (num_possible_cpus() > nptcg);
-		return;
-	}
-
-	/* In PAL_VM_SUMMARY max_purges == 0 actually means 1 */
-	if (max_purges == 0) max_purges = 1;
-
-	if (firstcpu) {
-		nptcg = max_purges;
-		firstcpu = 0;
-	}
-	if (max_purges < nptcg)
-		nptcg = max_purges;
-	if (nptcg == PAL_MAX_PURGES) {
-		need_ptcg_sem = 0;
-		return;
-	} else
-		need_ptcg_sem = (num_possible_cpus() > nptcg);
-
-resetsema:
-	spinaphore_init(&ptcg_sem, max_purges);
-}
-
-#ifdef CONFIG_SMP
-static void
-ia64_global_tlb_purge (struct mm_struct *mm, unsigned long start,
-		       unsigned long end, unsigned long nbits)
-{
-	struct mm_struct *active_mm = current->active_mm;
-
-	toolatetochangeptcgsem = 1;
-
-	if (mm != active_mm) {
-		/* Restore region IDs for mm */
-		if (mm && active_mm) {
-			activate_context(mm);
-		} else {
-			flush_tlb_all();
-			return;
-		}
-	}
-
-	if (need_ptcg_sem)
-		down_spin(&ptcg_sem);
-
-	do {
-		/*
-		 * Flush ALAT entries also.
-		 */
-		ia64_ptcga(start, (nbits << 2));
-		ia64_srlz_i();
-		start += (1UL << nbits);
-	} while (start < end);
-
-	if (need_ptcg_sem)
-		up_spin(&ptcg_sem);
-
-        if (mm != active_mm) {
-                activate_context(active_mm);
-        }
-}
-#endif /* CONFIG_SMP */
-
-void
-local_flush_tlb_all (void)
-{
-	unsigned long i, j, flags, count0, count1, stride0, stride1, addr;
-
-	addr    = local_cpu_data->ptce_base;
-	count0  = local_cpu_data->ptce_count[0];
-	count1  = local_cpu_data->ptce_count[1];
-	stride0 = local_cpu_data->ptce_stride[0];
-	stride1 = local_cpu_data->ptce_stride[1];
-
-	local_irq_save(flags);
-	for (i = 0; i < count0; ++i) {
-		for (j = 0; j < count1; ++j) {
-			ia64_ptce(addr);
-			addr += stride1;
-		}
-		addr += stride0;
-	}
-	local_irq_restore(flags);
-	ia64_srlz_i();			/* srlz.i implies srlz.d */
-}
-
-static void
-__flush_tlb_range (struct vm_area_struct *vma, unsigned long start,
-		 unsigned long end)
-{
-	struct mm_struct *mm = vma->vm_mm;
-	unsigned long size = end - start;
-	unsigned long nbits;
-
-#ifndef CONFIG_SMP
-	if (mm != current->active_mm) {
-		mm->context = 0;
-		return;
-	}
-#endif
-
-	nbits = ia64_fls(size + 0xfff);
-	while (unlikely (((1UL << nbits) & purge.mask) == 0) &&
-			(nbits < purge.max_bits))
-		++nbits;
-	if (nbits > purge.max_bits)
-		nbits = purge.max_bits;
-	start &= ~((1UL << nbits) - 1);
-
-	preempt_disable();
-#ifdef CONFIG_SMP
-	if (mm != current->active_mm || cpumask_weight(mm_cpumask(mm)) != 1) {
-		ia64_global_tlb_purge(mm, start, end, nbits);
-		preempt_enable();
-		return;
-	}
-#endif
-	do {
-		ia64_ptcl(start, (nbits<<2));
-		start += (1UL << nbits);
-	} while (start < end);
-	preempt_enable();
-	ia64_srlz_i();			/* srlz.i implies srlz.d */
-}
-
-void flush_tlb_range(struct vm_area_struct *vma,
-		unsigned long start, unsigned long end)
-{
-	if (unlikely(end - start >= 1024*1024*1024*1024UL
-			|| REGION_NUMBER(start) != REGION_NUMBER(end - 1))) {
-		/*
-		 * If we flush more than a tera-byte or across regions, we're
-		 * probably better off just flushing the entire TLB(s).  This
-		 * should be very rare and is not worth optimizing for.
-		 */
-		flush_tlb_all();
-	} else {
-		/* flush the address range from the tlb */
-		__flush_tlb_range(vma, start, end);
-		/* flush the virt. page-table area mapping the addr range */
-		__flush_tlb_range(vma, ia64_thash(start), ia64_thash(end));
-	}
-}
-EXPORT_SYMBOL(flush_tlb_range);
-
-void ia64_tlb_init(void)
-{
-	ia64_ptce_info_t ptce_info;
-	u64 tr_pgbits;
-	long status;
-	pal_vm_info_1_u_t vm_info_1;
-	pal_vm_info_2_u_t vm_info_2;
-	int cpu = smp_processor_id();
-
-	if ((status = ia64_pal_vm_page_size(&tr_pgbits, &purge.mask)) != 0) {
-		printk(KERN_ERR "PAL_VM_PAGE_SIZE failed with status=%ld; "
-		       "defaulting to architected purge page-sizes.\n", status);
-		purge.mask = 0x115557000UL;
-	}
-	purge.max_bits = ia64_fls(purge.mask);
-
-	ia64_get_ptce(&ptce_info);
-	local_cpu_data->ptce_base = ptce_info.base;
-	local_cpu_data->ptce_count[0] = ptce_info.count[0];
-	local_cpu_data->ptce_count[1] = ptce_info.count[1];
-	local_cpu_data->ptce_stride[0] = ptce_info.stride[0];
-	local_cpu_data->ptce_stride[1] = ptce_info.stride[1];
-
-	local_flush_tlb_all();	/* nuke left overs from bootstrapping... */
-	status = ia64_pal_vm_summary(&vm_info_1, &vm_info_2);
-
-	if (status) {
-		printk(KERN_ERR "ia64_pal_vm_summary=%ld\n", status);
-		per_cpu(ia64_tr_num, cpu) = 8;
-		return;
-	}
-	per_cpu(ia64_tr_num, cpu) = vm_info_1.pal_vm_info_1_s.max_itr_entry+1;
-	if (per_cpu(ia64_tr_num, cpu) >
-				(vm_info_1.pal_vm_info_1_s.max_dtr_entry+1))
-		per_cpu(ia64_tr_num, cpu) =
-				vm_info_1.pal_vm_info_1_s.max_dtr_entry+1;
-	if (per_cpu(ia64_tr_num, cpu) > IA64_TR_ALLOC_MAX) {
-		static int justonce = 1;
-		per_cpu(ia64_tr_num, cpu) = IA64_TR_ALLOC_MAX;
-		if (justonce) {
-			justonce = 0;
-			printk(KERN_DEBUG "TR register number exceeds "
-			       "IA64_TR_ALLOC_MAX!\n");
-		}
-	}
-}
-
-/*
- * is_tr_overlap
- *
- * Check overlap with inserted TRs.
- */
-static int is_tr_overlap(struct ia64_tr_entry *p, u64 va, u64 log_size)
-{
-	u64 tr_log_size;
-	u64 tr_end;
-	u64 va_rr = ia64_get_rr(va);
-	u64 va_rid = RR_TO_RID(va_rr);
-	u64 va_end = va + (1<<log_size) - 1;
-
-	if (va_rid != RR_TO_RID(p->rr))
-		return 0;
-	tr_log_size = (p->itir & 0xff) >> 2;
-	tr_end = p->ifa + (1<<tr_log_size) - 1;
-
-	if (va > tr_end || p->ifa > va_end)
-		return 0;
-	return 1;
-
-}
-
-/*
- * ia64_insert_tr in virtual mode. Allocate a TR slot
- *
- * target_mask : 0x1 : itr, 0x2 : dtr, 0x3 : idtr
- *
- * va 	: virtual address.
- * pte 	: pte entries inserted.
- * log_size: range to be covered.
- *
- * Return value:  <0 :  error No.
- *
- *		  >=0 : slot number allocated for TR.
- * Must be called with preemption disabled.
- */
-int ia64_itr_entry(u64 target_mask, u64 va, u64 pte, u64 log_size)
-{
-	int i, r;
-	unsigned long psr;
-	struct ia64_tr_entry *p;
-	int cpu = smp_processor_id();
-
-	if (!ia64_idtrs[cpu]) {
-		ia64_idtrs[cpu] = kmalloc_array(2 * IA64_TR_ALLOC_MAX,
-						sizeof(struct ia64_tr_entry),
-						GFP_KERNEL);
-		if (!ia64_idtrs[cpu])
-			return -ENOMEM;
-	}
-	r = -EINVAL;
-	/*Check overlap with existing TR entries*/
-	if (target_mask & 0x1) {
-		p = ia64_idtrs[cpu];
-		for (i = IA64_TR_ALLOC_BASE; i <= per_cpu(ia64_tr_used, cpu);
-								i++, p++) {
-			if (p->pte & 0x1)
-				if (is_tr_overlap(p, va, log_size)) {
-					printk(KERN_DEBUG "Overlapped Entry"
-						"Inserted for TR Register!!\n");
-					goto out;
-			}
-		}
-	}
-	if (target_mask & 0x2) {
-		p = ia64_idtrs[cpu] + IA64_TR_ALLOC_MAX;
-		for (i = IA64_TR_ALLOC_BASE; i <= per_cpu(ia64_tr_used, cpu);
-								i++, p++) {
-			if (p->pte & 0x1)
-				if (is_tr_overlap(p, va, log_size)) {
-					printk(KERN_DEBUG "Overlapped Entry"
-						"Inserted for TR Register!!\n");
-					goto out;
-				}
-		}
-	}
-
-	for (i = IA64_TR_ALLOC_BASE; i < per_cpu(ia64_tr_num, cpu); i++) {
-		switch (target_mask & 0x3) {
-		case 1:
-			if (!((ia64_idtrs[cpu] + i)->pte & 0x1))
-				goto found;
-			continue;
-		case 2:
-			if (!((ia64_idtrs[cpu] + IA64_TR_ALLOC_MAX + i)->pte & 0x1))
-				goto found;
-			continue;
-		case 3:
-			if (!((ia64_idtrs[cpu] + i)->pte & 0x1) &&
-			    !((ia64_idtrs[cpu] + IA64_TR_ALLOC_MAX + i)->pte & 0x1))
-				goto found;
-			continue;
-		default:
-			r = -EINVAL;
-			goto out;
-		}
-	}
-found:
-	if (i >= per_cpu(ia64_tr_num, cpu))
-		return -EBUSY;
-
-	/*Record tr info for mca handler use!*/
-	if (i > per_cpu(ia64_tr_used, cpu))
-		per_cpu(ia64_tr_used, cpu) = i;
-
-	psr = ia64_clear_ic();
-	if (target_mask & 0x1) {
-		ia64_itr(0x1, i, va, pte, log_size);
-		ia64_srlz_i();
-		p = ia64_idtrs[cpu] + i;
-		p->ifa = va;
-		p->pte = pte;
-		p->itir = log_size << 2;
-		p->rr = ia64_get_rr(va);
-	}
-	if (target_mask & 0x2) {
-		ia64_itr(0x2, i, va, pte, log_size);
-		ia64_srlz_i();
-		p = ia64_idtrs[cpu] + IA64_TR_ALLOC_MAX + i;
-		p->ifa = va;
-		p->pte = pte;
-		p->itir = log_size << 2;
-		p->rr = ia64_get_rr(va);
-	}
-	ia64_set_psr(psr);
-	r = i;
-out:
-	return r;
-}
-EXPORT_SYMBOL_GPL(ia64_itr_entry);
-
-/*
- * ia64_purge_tr
- *
- * target_mask: 0x1: purge itr, 0x2 : purge dtr, 0x3 purge idtr.
- * slot: slot number to be freed.
- *
- * Must be called with preemption disabled.
- */
-void ia64_ptr_entry(u64 target_mask, int slot)
-{
-	int cpu = smp_processor_id();
-	int i;
-	struct ia64_tr_entry *p;
-
-	if (slot < IA64_TR_ALLOC_BASE || slot >= per_cpu(ia64_tr_num, cpu))
-		return;
-
-	if (target_mask & 0x1) {
-		p = ia64_idtrs[cpu] + slot;
-		if ((p->pte&0x1) && is_tr_overlap(p, p->ifa, p->itir>>2)) {
-			p->pte = 0;
-			ia64_ptr(0x1, p->ifa, p->itir>>2);
-			ia64_srlz_i();
-		}
-	}
-
-	if (target_mask & 0x2) {
-		p = ia64_idtrs[cpu] + IA64_TR_ALLOC_MAX + slot;
-		if ((p->pte & 0x1) && is_tr_overlap(p, p->ifa, p->itir>>2)) {
-			p->pte = 0;
-			ia64_ptr(0x2, p->ifa, p->itir>>2);
-			ia64_srlz_i();
-		}
-	}
-
-	for (i = per_cpu(ia64_tr_used, cpu); i >= IA64_TR_ALLOC_BASE; i--) {
-		if (((ia64_idtrs[cpu] + i)->pte & 0x1) ||
-		    ((ia64_idtrs[cpu] + IA64_TR_ALLOC_MAX + i)->pte & 0x1))
-			break;
-	}
-	per_cpu(ia64_tr_used, cpu) = i;
-}
-EXPORT_SYMBOL_GPL(ia64_ptr_entry);
diff --git a/arch/ia64/pci/Makefile b/arch/ia64/pci/Makefile
deleted file mode 100644
index 81ea50eeb527..000000000000
--- a/arch/ia64/pci/Makefile
+++ /dev/null
@@ -1,5 +0,0 @@
-# SPDX-License-Identifier: GPL-2.0-only
-#
-# Makefile for the ia64-specific parts of the pci bus
-#
-obj-y		:= pci.o fixup.o
diff --git a/arch/ia64/pci/fixup.c b/arch/ia64/pci/fixup.c
deleted file mode 100644
index 2bcdd7d3a1ad..000000000000
--- a/arch/ia64/pci/fixup.c
+++ /dev/null
@@ -1,80 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * Exceptions for specific devices. Usually work-arounds for fatal design flaws.
- * Derived from fixup.c of i386 tree.
- */
-
-#include <linux/pci.h>
-#include <linux/init.h>
-#include <linux/vgaarb.h>
-#include <linux/screen_info.h>
-#include <asm/uv/uv.h>
-
-/*
- * Fixup to mark boot BIOS video selected by BIOS before it changes
- *
- * From information provided by "Jon Smirl" <jonsmirl@gmail.com>
- *
- * The standard boot ROM sequence for an x86 machine uses the BIOS
- * to select an initial video card for boot display. This boot video
- * card will have its BIOS copied to 0xC0000 in system RAM.
- * IORESOURCE_ROM_SHADOW is used to associate the boot video
- * card with this copy. On laptops this copy has to be used since
- * the main ROM may be compressed or combined with another image.
- * See pci_map_rom() for use of this flag. Before marking the device
- * with IORESOURCE_ROM_SHADOW check if a vga_default_device is already set
- * by either arch code or vga-arbitration; if so only apply the fixup to this
- * already-determined primary video card.
- */
-
-static void pci_fixup_video(struct pci_dev *pdev)
-{
-	struct pci_dev *bridge;
-	struct pci_bus *bus;
-	u16 config;
-	struct resource *res;
-
-	if (is_uv_system())
-		return;
-	/* Maybe, this machine supports legacy memory map. */
-
-	/* Is VGA routed to us? */
-	bus = pdev->bus;
-	while (bus) {
-		bridge = bus->self;
-
-		/*
-		 * From information provided by
-		 * "David Miller" <davem@davemloft.net>
-		 * The bridge control register is valid for PCI header
-		 * type BRIDGE, or CARDBUS. Host to PCI controllers use
-		 * PCI header type NORMAL.
-		 */
-		if (bridge && (pci_is_bridge(bridge))) {
-			pci_read_config_word(bridge, PCI_BRIDGE_CONTROL,
-						&config);
-			if (!(config & PCI_BRIDGE_CTL_VGA))
-				return;
-		}
-		bus = bus->parent;
-	}
-	if (!vga_default_device() || pdev == vga_default_device()) {
-		pci_read_config_word(pdev, PCI_COMMAND, &config);
-		if (config & (PCI_COMMAND_IO | PCI_COMMAND_MEMORY)) {
-			res = &pdev->resource[PCI_ROM_RESOURCE];
-
-			pci_disable_rom(pdev);
-			if (res->parent)
-				release_resource(res);
-
-			res->start = 0xC0000;
-			res->end = res->start + 0x20000 - 1;
-			res->flags = IORESOURCE_MEM | IORESOURCE_ROM_SHADOW |
-				     IORESOURCE_PCI_FIXED;
-			dev_info(&pdev->dev, "Video device with shadowed ROM at %pR\n",
-				 res);
-		}
-	}
-}
-DECLARE_PCI_FIXUP_CLASS_HEADER(PCI_ANY_ID, PCI_ANY_ID,
-			       PCI_CLASS_DISPLAY_VGA, 8, pci_fixup_video);
diff --git a/arch/ia64/pci/pci.c b/arch/ia64/pci/pci.c
deleted file mode 100644
index 0a0328e61bef..000000000000
--- a/arch/ia64/pci/pci.c
+++ /dev/null
@@ -1,576 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * pci.c - Low-Level PCI Access in IA-64
- *
- * Derived from bios32.c of i386 tree.
- *
- * (c) Copyright 2002, 2005 Hewlett-Packard Development Company, L.P.
- *	David Mosberger-Tang <davidm@hpl.hp.com>
- *	Bjorn Helgaas <bjorn.helgaas@hp.com>
- * Copyright (C) 2004 Silicon Graphics, Inc.
- *
- * Note: Above list of copyright holders is incomplete...
- */
-
-#include <linux/acpi.h>
-#include <linux/types.h>
-#include <linux/kernel.h>
-#include <linux/pci.h>
-#include <linux/pci-acpi.h>
-#include <linux/init.h>
-#include <linux/ioport.h>
-#include <linux/slab.h>
-#include <linux/spinlock.h>
-#include <linux/memblock.h>
-#include <linux/export.h>
-
-#include <asm/page.h>
-#include <asm/io.h>
-#include <asm/sal.h>
-#include <asm/smp.h>
-#include <asm/irq.h>
-#include <asm/hw_irq.h>
-
-/*
- * Low-level SAL-based PCI configuration access functions. Note that SAL
- * calls are already serialized (via sal_lock), so we don't need another
- * synchronization mechanism here.
- */
-
-#define PCI_SAL_ADDRESS(seg, bus, devfn, reg)		\
-	(((u64) seg << 24) | (bus << 16) | (devfn << 8) | (reg))
-
-/* SAL 3.2 adds support for extended config space. */
-
-#define PCI_SAL_EXT_ADDRESS(seg, bus, devfn, reg)	\
-	(((u64) seg << 28) | (bus << 20) | (devfn << 12) | (reg))
-
-int raw_pci_read(unsigned int seg, unsigned int bus, unsigned int devfn,
-	      int reg, int len, u32 *value)
-{
-	u64 addr, data = 0;
-	int mode, result;
-
-	if (!value || (seg > 65535) || (bus > 255) || (devfn > 255) || (reg > 4095))
-		return -EINVAL;
-
-	if ((seg | reg) <= 255) {
-		addr = PCI_SAL_ADDRESS(seg, bus, devfn, reg);
-		mode = 0;
-	} else if (sal_revision >= SAL_VERSION_CODE(3,2)) {
-		addr = PCI_SAL_EXT_ADDRESS(seg, bus, devfn, reg);
-		mode = 1;
-	} else {
-		return -EINVAL;
-	}
-
-	result = ia64_sal_pci_config_read(addr, mode, len, &data);
-	if (result != 0)
-		return -EINVAL;
-
-	*value = (u32) data;
-	return 0;
-}
-
-int raw_pci_write(unsigned int seg, unsigned int bus, unsigned int devfn,
-	       int reg, int len, u32 value)
-{
-	u64 addr;
-	int mode, result;
-
-	if ((seg > 65535) || (bus > 255) || (devfn > 255) || (reg > 4095))
-		return -EINVAL;
-
-	if ((seg | reg) <= 255) {
-		addr = PCI_SAL_ADDRESS(seg, bus, devfn, reg);
-		mode = 0;
-	} else if (sal_revision >= SAL_VERSION_CODE(3,2)) {
-		addr = PCI_SAL_EXT_ADDRESS(seg, bus, devfn, reg);
-		mode = 1;
-	} else {
-		return -EINVAL;
-	}
-	result = ia64_sal_pci_config_write(addr, mode, len, value);
-	if (result != 0)
-		return -EINVAL;
-	return 0;
-}
-
-static int pci_read(struct pci_bus *bus, unsigned int devfn, int where,
-							int size, u32 *value)
-{
-	return raw_pci_read(pci_domain_nr(bus), bus->number,
-				 devfn, where, size, value);
-}
-
-static int pci_write(struct pci_bus *bus, unsigned int devfn, int where,
-							int size, u32 value)
-{
-	return raw_pci_write(pci_domain_nr(bus), bus->number,
-				  devfn, where, size, value);
-}
-
-struct pci_ops pci_root_ops = {
-	.read = pci_read,
-	.write = pci_write,
-};
-
-struct pci_root_info {
-	struct acpi_pci_root_info common;
-	struct pci_controller controller;
-	struct list_head io_resources;
-};
-
-static unsigned int new_space(u64 phys_base, int sparse)
-{
-	u64 mmio_base;
-	int i;
-
-	if (phys_base == 0)
-		return 0;	/* legacy I/O port space */
-
-	mmio_base = (u64) ioremap(phys_base, 0);
-	for (i = 0; i < num_io_spaces; i++)
-		if (io_space[i].mmio_base == mmio_base &&
-		    io_space[i].sparse == sparse)
-			return i;
-
-	if (num_io_spaces == MAX_IO_SPACES) {
-		pr_err("PCI: Too many IO port spaces "
-			"(MAX_IO_SPACES=%lu)\n", MAX_IO_SPACES);
-		return ~0;
-	}
-
-	i = num_io_spaces++;
-	io_space[i].mmio_base = mmio_base;
-	io_space[i].sparse = sparse;
-
-	return i;
-}
-
-static int add_io_space(struct device *dev, struct pci_root_info *info,
-			struct resource_entry *entry)
-{
-	struct resource_entry *iospace;
-	struct resource *resource, *res = entry->res;
-	char *name;
-	unsigned long base, min, max, base_port;
-	unsigned int sparse = 0, space_nr, len;
-
-	len = strlen(info->common.name) + 32;
-	iospace = resource_list_create_entry(NULL, len);
-	if (!iospace) {
-		dev_err(dev, "PCI: No memory for %s I/O port space\n",
-			info->common.name);
-		return -ENOMEM;
-	}
-
-	if (res->flags & IORESOURCE_IO_SPARSE)
-		sparse = 1;
-	space_nr = new_space(entry->offset, sparse);
-	if (space_nr == ~0)
-		goto free_resource;
-
-	name = (char *)(iospace + 1);
-	min = res->start - entry->offset;
-	max = res->end - entry->offset;
-	base = __pa(io_space[space_nr].mmio_base);
-	base_port = IO_SPACE_BASE(space_nr);
-	snprintf(name, len, "%s I/O Ports %08lx-%08lx", info->common.name,
-		 base_port + min, base_port + max);
-
-	/*
-	 * The SDM guarantees the legacy 0-64K space is sparse, but if the
-	 * mapping is done by the processor (not the bridge), ACPI may not
-	 * mark it as sparse.
-	 */
-	if (space_nr == 0)
-		sparse = 1;
-
-	resource = iospace->res;
-	resource->name  = name;
-	resource->flags = IORESOURCE_MEM;
-	resource->start = base + (sparse ? IO_SPACE_SPARSE_ENCODING(min) : min);
-	resource->end   = base + (sparse ? IO_SPACE_SPARSE_ENCODING(max) : max);
-	if (insert_resource(&iomem_resource, resource)) {
-		dev_err(dev,
-			"can't allocate host bridge io space resource  %pR\n",
-			resource);
-		goto free_resource;
-	}
-
-	entry->offset = base_port;
-	res->start = min + base_port;
-	res->end = max + base_port;
-	resource_list_add_tail(iospace, &info->io_resources);
-
-	return 0;
-
-free_resource:
-	resource_list_free_entry(iospace);
-	return -ENOSPC;
-}
-
-/*
- * An IO port or MMIO resource assigned to a PCI host bridge may be
- * consumed by the host bridge itself or available to its child
- * bus/devices. The ACPI specification defines a bit (Producer/Consumer)
- * to tell whether the resource is consumed by the host bridge itself,
- * but firmware hasn't used that bit consistently, so we can't rely on it.
- *
- * On x86 and IA64 platforms, all IO port and MMIO resources are assumed
- * to be available to child bus/devices except one special case:
- *     IO port [0xCF8-0xCFF] is consumed by the host bridge itself
- *     to access PCI configuration space.
- *
- * So explicitly filter out PCI CFG IO ports[0xCF8-0xCFF].
- */
-static bool resource_is_pcicfg_ioport(struct resource *res)
-{
-	return (res->flags & IORESOURCE_IO) &&
-		res->start == 0xCF8 && res->end == 0xCFF;
-}
-
-static int pci_acpi_root_prepare_resources(struct acpi_pci_root_info *ci)
-{
-	struct device *dev = &ci->bridge->dev;
-	struct pci_root_info *info;
-	struct resource *res;
-	struct resource_entry *entry, *tmp;
-	int status;
-
-	status = acpi_pci_probe_root_resources(ci);
-	if (status > 0) {
-		info = container_of(ci, struct pci_root_info, common);
-		resource_list_for_each_entry_safe(entry, tmp, &ci->resources) {
-			res = entry->res;
-			if (res->flags & IORESOURCE_MEM) {
-				/*
-				 * HP's firmware has a hack to work around a
-				 * Windows bug. Ignore these tiny memory ranges.
-				 */
-				if (resource_size(res) <= 16) {
-					resource_list_del(entry);
-					insert_resource(&iomem_resource,
-							entry->res);
-					resource_list_add_tail(entry,
-							&info->io_resources);
-				}
-			} else if (res->flags & IORESOURCE_IO) {
-				if (resource_is_pcicfg_ioport(entry->res))
-					resource_list_destroy_entry(entry);
-				else if (add_io_space(dev, info, entry))
-					resource_list_destroy_entry(entry);
-			}
-		}
-	}
-
-	return status;
-}
-
-static void pci_acpi_root_release_info(struct acpi_pci_root_info *ci)
-{
-	struct pci_root_info *info;
-	struct resource_entry *entry, *tmp;
-
-	info = container_of(ci, struct pci_root_info, common);
-	resource_list_for_each_entry_safe(entry, tmp, &info->io_resources) {
-		release_resource(entry->res);
-		resource_list_destroy_entry(entry);
-	}
-	kfree(info);
-}
-
-static struct acpi_pci_root_ops pci_acpi_root_ops = {
-	.pci_ops = &pci_root_ops,
-	.release_info = pci_acpi_root_release_info,
-	.prepare_resources = pci_acpi_root_prepare_resources,
-};
-
-struct pci_bus *pci_acpi_scan_root(struct acpi_pci_root *root)
-{
-	struct acpi_device *device = root->device;
-	struct pci_root_info *info;
-
-	info = kzalloc(sizeof(*info), GFP_KERNEL);
-	if (!info) {
-		dev_err(&device->dev,
-			"pci_bus %04x:%02x: ignored (out of memory)\n",
-			root->segment, (int)root->secondary.start);
-		return NULL;
-	}
-
-	info->controller.segment = root->segment;
-	info->controller.companion = device;
-	info->controller.node = acpi_get_node(device->handle);
-	INIT_LIST_HEAD(&info->io_resources);
-	return acpi_pci_root_create(root, &pci_acpi_root_ops,
-				    &info->common, &info->controller);
-}
-
-int pcibios_root_bridge_prepare(struct pci_host_bridge *bridge)
-{
-	/*
-	 * We pass NULL as parent to pci_create_root_bus(), so if it is not NULL
-	 * here, pci_create_root_bus() has been called by someone else and
-	 * sysdata is likely to be different from what we expect.  Let it go in
-	 * that case.
-	 */
-	if (!bridge->dev.parent) {
-		struct pci_controller *controller = bridge->bus->sysdata;
-		ACPI_COMPANION_SET(&bridge->dev, controller->companion);
-	}
-	return 0;
-}
-
-void pcibios_fixup_device_resources(struct pci_dev *dev)
-{
-	int idx;
-
-	if (!dev->bus)
-		return;
-
-	for (idx = 0; idx < PCI_BRIDGE_RESOURCES; idx++) {
-		struct resource *r = &dev->resource[idx];
-
-		if (!r->flags || r->parent || !r->start)
-			continue;
-
-		pci_claim_resource(dev, idx);
-	}
-}
-EXPORT_SYMBOL_GPL(pcibios_fixup_device_resources);
-
-static void pcibios_fixup_bridge_resources(struct pci_dev *dev)
-{
-	int idx;
-
-	if (!dev->bus)
-		return;
-
-	for (idx = PCI_BRIDGE_RESOURCES; idx < PCI_NUM_RESOURCES; idx++) {
-		struct resource *r = &dev->resource[idx];
-
-		if (!r->flags || r->parent || !r->start)
-			continue;
-
-		pci_claim_bridge_resource(dev, idx);
-	}
-}
-
-/*
- *  Called after each bus is probed, but before its children are examined.
- */
-void pcibios_fixup_bus(struct pci_bus *b)
-{
-	struct pci_dev *dev;
-
-	if (b->self) {
-		pci_read_bridge_bases(b);
-		pcibios_fixup_bridge_resources(b->self);
-	}
-	list_for_each_entry(dev, &b->devices, bus_list)
-		pcibios_fixup_device_resources(dev);
-}
-
-void pcibios_add_bus(struct pci_bus *bus)
-{
-	acpi_pci_add_bus(bus);
-}
-
-void pcibios_remove_bus(struct pci_bus *bus)
-{
-	acpi_pci_remove_bus(bus);
-}
-
-void pcibios_set_master (struct pci_dev *dev)
-{
-	/* No special bus mastering setup handling */
-}
-
-int
-pcibios_enable_device (struct pci_dev *dev, int mask)
-{
-	int ret;
-
-	ret = pci_enable_resources(dev, mask);
-	if (ret < 0)
-		return ret;
-
-	if (!pci_dev_msi_enabled(dev))
-		return acpi_pci_irq_enable(dev);
-	return 0;
-}
-
-void
-pcibios_disable_device (struct pci_dev *dev)
-{
-	BUG_ON(atomic_read(&dev->enable_cnt));
-	if (!pci_dev_msi_enabled(dev))
-		acpi_pci_irq_disable(dev);
-}
-
-/**
- * pci_get_legacy_mem - generic legacy mem routine
- * @bus: bus to get legacy memory base address for
- *
- * Find the base of legacy memory for @bus.  This is typically the first
- * megabyte of bus address space for @bus or is simply 0 on platforms whose
- * chipsets support legacy I/O and memory routing.  Returns the base address
- * or an error pointer if an error occurred.
- *
- * This is the ia64 generic version of this routine.  Other platforms
- * are free to override it with a machine vector.
- */
-char *pci_get_legacy_mem(struct pci_bus *bus)
-{
-	return (char *)__IA64_UNCACHED_OFFSET;
-}
-
-/**
- * pci_mmap_legacy_page_range - map legacy memory space to userland
- * @bus: bus whose legacy space we're mapping
- * @vma: vma passed in by mmap
- *
- * Map legacy memory space for this device back to userspace using a machine
- * vector to get the base address.
- */
-int
-pci_mmap_legacy_page_range(struct pci_bus *bus, struct vm_area_struct *vma,
-			   enum pci_mmap_state mmap_state)
-{
-	unsigned long size = vma->vm_end - vma->vm_start;
-	pgprot_t prot;
-	char *addr;
-
-	/* We only support mmap'ing of legacy memory space */
-	if (mmap_state != pci_mmap_mem)
-		return -ENOSYS;
-
-	/*
-	 * Avoid attribute aliasing.  See Documentation/arch/ia64/aliasing.rst
-	 * for more details.
-	 */
-	if (!valid_mmap_phys_addr_range(vma->vm_pgoff, size))
-		return -EINVAL;
-	prot = phys_mem_access_prot(NULL, vma->vm_pgoff, size,
-				    vma->vm_page_prot);
-
-	addr = pci_get_legacy_mem(bus);
-	if (IS_ERR(addr))
-		return PTR_ERR(addr);
-
-	vma->vm_pgoff += (unsigned long)addr >> PAGE_SHIFT;
-	vma->vm_page_prot = prot;
-
-	if (remap_pfn_range(vma, vma->vm_start, vma->vm_pgoff,
-			    size, vma->vm_page_prot))
-		return -EAGAIN;
-
-	return 0;
-}
-
-/**
- * pci_legacy_read - read from legacy I/O space
- * @bus: bus to read
- * @port: legacy port value
- * @val: caller allocated storage for returned value
- * @size: number of bytes to read
- *
- * Simply reads @size bytes from @port and puts the result in @val.
- *
- * Again, this (and the write routine) are generic versions that can be
- * overridden by the platform.  This is necessary on platforms that don't
- * support legacy I/O routing or that hard fail on legacy I/O timeouts.
- */
-int pci_legacy_read(struct pci_bus *bus, u16 port, u32 *val, u8 size)
-{
-	int ret = size;
-
-	switch (size) {
-	case 1:
-		*val = inb(port);
-		break;
-	case 2:
-		*val = inw(port);
-		break;
-	case 4:
-		*val = inl(port);
-		break;
-	default:
-		ret = -EINVAL;
-		break;
-	}
-
-	return ret;
-}
-
-/**
- * pci_legacy_write - perform a legacy I/O write
- * @bus: bus pointer
- * @port: port to write
- * @val: value to write
- * @size: number of bytes to write from @val
- *
- * Simply writes @size bytes of @val to @port.
- */
-int pci_legacy_write(struct pci_bus *bus, u16 port, u32 val, u8 size)
-{
-	int ret = size;
-
-	switch (size) {
-	case 1:
-		outb(val, port);
-		break;
-	case 2:
-		outw(val, port);
-		break;
-	case 4:
-		outl(val, port);
-		break;
-	default:
-		ret = -EINVAL;
-		break;
-	}
-
-	return ret;
-}
-
-/**
- * set_pci_cacheline_size - determine cacheline size for PCI devices
- *
- * We want to use the line-size of the outer-most cache.  We assume
- * that this line-size is the same for all CPUs.
- *
- * Code mostly taken from arch/ia64/kernel/palinfo.c:cache_info().
- */
-static void __init set_pci_dfl_cacheline_size(void)
-{
-	unsigned long levels, unique_caches;
-	long status;
-	pal_cache_config_info_t cci;
-
-	status = ia64_pal_cache_summary(&levels, &unique_caches);
-	if (status != 0) {
-		pr_err("%s: ia64_pal_cache_summary() failed "
-			"(status=%ld)\n", __func__, status);
-		return;
-	}
-
-	status = ia64_pal_cache_config_info(levels - 1,
-				/* cache_type (data_or_unified)= */ 2, &cci);
-	if (status != 0) {
-		pr_err("%s: ia64_pal_cache_config_info() failed "
-			"(status=%ld)\n", __func__, status);
-		return;
-	}
-	pci_dfl_cache_line_size = (1 << cci.pcci_line_size) / 4;
-}
-
-static int __init pcibios_init(void)
-{
-	set_pci_dfl_cacheline_size();
-	return 0;
-}
-
-subsys_initcall(pcibios_init);
diff --git a/arch/ia64/scripts/check-gas b/arch/ia64/scripts/check-gas
deleted file mode 100755
index 787cf9b6b04a..000000000000
--- a/arch/ia64/scripts/check-gas
+++ /dev/null
@@ -1,16 +0,0 @@
-#!/bin/sh
-# SPDX-License-Identifier: GPL-2.0
-dir=$(dirname $0)
-CC=$1
-OBJDUMP=$2
-tmp=${TMPDIR:-/tmp}
-out=$tmp/out$$.o
-$CC -c $dir/check-gas-asm.S -o $out
-res=$($OBJDUMP -r --section .data $out | fgrep 00004 | tr -s ' ' |cut -f3 -d' ')
-rm -f $out
-if [ $res != ".text" ]; then
-	echo buggy
-else
-	echo good
-fi
-exit 0
diff --git a/arch/ia64/scripts/check-gas-asm.S b/arch/ia64/scripts/check-gas-asm.S
deleted file mode 100644
index 010e1d227e5d..000000000000
--- a/arch/ia64/scripts/check-gas-asm.S
+++ /dev/null
@@ -1,2 +0,0 @@
-[1:]	nop 0
-	.xdata4 ".data", 0, 1b-.
diff --git a/arch/ia64/scripts/check-model.c b/arch/ia64/scripts/check-model.c
deleted file mode 100644
index e1d4e86e3d63..000000000000
--- a/arch/ia64/scripts/check-model.c
+++ /dev/null
@@ -1 +0,0 @@
-int __attribute__ ((__model__ (__small__))) x;
diff --git a/arch/ia64/scripts/check-segrel.S b/arch/ia64/scripts/check-segrel.S
deleted file mode 100644
index 65d6378adaaa..000000000000
--- a/arch/ia64/scripts/check-segrel.S
+++ /dev/null
@@ -1,5 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-	.rodata
-	data4 @segrel(start)
-	.data
-start:
diff --git a/arch/ia64/scripts/check-segrel.lds b/arch/ia64/scripts/check-segrel.lds
deleted file mode 100644
index c385d246e458..000000000000
--- a/arch/ia64/scripts/check-segrel.lds
+++ /dev/null
@@ -1,13 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-SECTIONS {
-	. = SIZEOF_HEADERS;
-	.rodata : { *(.rodata) } :ro
-	.note : { *(.note*) }
-	. = 0xa0000;
-	.data : { *(.data) } :dat
-	/DISCARD/ : { *(*) }
-}
-PHDRS {
-  ro PT_LOAD FILEHDR PHDRS;
-  dat PT_LOAD;
-}
diff --git a/arch/ia64/scripts/check-serialize.S b/arch/ia64/scripts/check-serialize.S
deleted file mode 100644
index 0400c106806c..000000000000
--- a/arch/ia64/scripts/check-serialize.S
+++ /dev/null
@@ -1,2 +0,0 @@
-	.serialize.data
-	.serialize.instruction
diff --git a/arch/ia64/scripts/check-text-align.S b/arch/ia64/scripts/check-text-align.S
deleted file mode 100644
index 107fa1c88c2e..000000000000
--- a/arch/ia64/scripts/check-text-align.S
+++ /dev/null
@@ -1,7 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-	.proc foo
-	.prologue
-foo:	.save rp, r2
-	nop 0
-	.align 64
-	.endp foo
diff --git a/arch/ia64/scripts/toolchain-flags b/arch/ia64/scripts/toolchain-flags
deleted file mode 100755
index 12dff5c981cf..000000000000
--- a/arch/ia64/scripts/toolchain-flags
+++ /dev/null
@@ -1,54 +0,0 @@
-#!/bin/sh
-# SPDX-License-Identifier: GPL-2.0
-#
-# Check whether linker can handle cross-segment @segrel():
-#
-CPPFLAGS=""
-CC=$1
-OBJDUMP=$2
-READELF=$3
-dir=$(dirname $0)
-tmp=${TMPDIR:-/tmp}
-out=$tmp/out$$
-
-# Check whether cross-segment segment-relative relocs work fine.  We need
-# that for building the gate DSO:
-
-$CC -nostdlib -static -Wl,-T$dir/check-segrel.lds $dir/check-segrel.S -o $out
-res=$($OBJDUMP --full --section .rodata $out | fgrep 000 | cut -f3 -d' ')
-rm -f $out
-if [ $res != 00000a00 ]; then
-    CPPFLAGS="$CPPFLAGS -DHAVE_BUGGY_SEGREL"
-    cat >&2 <<EOF
-warning: your linker cannot handle cross-segment segment-relative relocations.
-         please upgrade to a newer version (it is safe to use this linker, but
-         the kernel will be bigger than strictly necessary).
-EOF
-fi
-
-# Check whether .align inside a function works as expected.
-
-$CC -c $dir/check-text-align.S -o $out
-$READELF -u $out | fgrep -q 'prologue(rlen=12)'
-res=$?
-rm -f $out
-if [ $res -eq 0 ]; then
-    CPPFLAGS="$CPPFLAGS -DHAVE_WORKING_TEXT_ALIGN"
-fi
-
-if ! $CC -c $dir/check-model.c -o $out 2>&1 | grep  __model__ | grep -q attrib
-then
-    CPPFLAGS="$CPPFLAGS -DHAVE_MODEL_SMALL_ATTRIBUTE"
-fi
-rm -f $out
-
-# Check whether assembler supports .serialize.{data,instruction} directive.
-
-$CC -c $dir/check-serialize.S -o $out 2>/dev/null
-res=$?
-rm -f $out
-if [ $res -eq 0 ]; then
-    CPPFLAGS="$CPPFLAGS -DHAVE_SERIALIZE_DIRECTIVE"
-fi
-
-echo $CPPFLAGS
diff --git a/arch/ia64/scripts/unwcheck.py b/arch/ia64/scripts/unwcheck.py
deleted file mode 100644
index 9581742f0db2..000000000000
--- a/arch/ia64/scripts/unwcheck.py
+++ /dev/null
@@ -1,65 +0,0 @@
-#!/usr/bin/env python3
-# SPDX-License-Identifier: GPL-2.0
-#
-# Usage: unwcheck.py FILE
-#
-# This script checks the unwind info of each function in file FILE
-# and verifies that the sum of the region-lengths matches the total
-# length of the function.
-#
-# Based on a shell/awk script originally written by Harish Patil,
-# which was converted to Perl by Matthew Chapman, which was converted
-# to Python by David Mosberger.
-#
-import os
-import re
-import sys
-
-if len(sys.argv) != 2:
-    print("Usage: %s FILE" % sys.argv[0])
-    sys.exit(2)
-
-readelf = os.getenv("READELF", "readelf")
-
-start_pattern = re.compile("<([^>]*)>: \[0x([0-9a-f]+)-0x([0-9a-f]+)\]")
-rlen_pattern  = re.compile(".*rlen=([0-9]+)")
-
-def check_func (func, slots, rlen_sum):
-    if slots != rlen_sum:
-        global num_errors
-        num_errors += 1
-        if not func: func = "[%#x-%#x]" % (start, end)
-        print("ERROR: %s: %lu slots, total region length = %lu" % (func, slots, rlen_sum))
-    return
-
-num_funcs = 0
-num_errors = 0
-func = False
-slots = 0
-rlen_sum = 0
-for line in os.popen("%s -u %s" % (readelf, sys.argv[1])):
-    m = start_pattern.match(line)
-    if m:
-        check_func(func, slots, rlen_sum)
-
-        func  = m.group(1)
-        start = int(m.group(2), 16)
-        end   = int(m.group(3), 16)
-        slots = 3 * (end - start) / 16
-        rlen_sum = 0
-        num_funcs += 1
-    else:
-        m = rlen_pattern.match(line)
-        if m:
-            rlen_sum += int(m.group(1))
-check_func(func, slots, rlen_sum)
-
-if num_errors == 0:
-    print("No errors detected in %u functions." % num_funcs)
-else:
-    if num_errors > 1:
-        err="errors"
-    else:
-        err="error"
-    print("%u %s detected in %u functions." % (num_errors, err, num_funcs))
-    sys.exit(1)
diff --git a/arch/ia64/uv/Makefile b/arch/ia64/uv/Makefile
deleted file mode 100644
index aa9f91947c49..000000000000
--- a/arch/ia64/uv/Makefile
+++ /dev/null
@@ -1,12 +0,0 @@
-# arch/ia64/uv/Makefile
-#
-# This file is subject to the terms and conditions of the GNU General Public
-# License.  See the file "COPYING" in the main directory of this archive
-# for more details.
-#
-# Copyright (C) 2008 Silicon Graphics, Inc.  All Rights Reserved.
-#
-# Makefile for the sn uv subplatform
-#
-
-obj-y += kernel/
diff --git a/arch/ia64/uv/kernel/Makefile b/arch/ia64/uv/kernel/Makefile
deleted file mode 100644
index 297196578d19..000000000000
--- a/arch/ia64/uv/kernel/Makefile
+++ /dev/null
@@ -1,12 +0,0 @@
-# arch/ia64/uv/kernel/Makefile
-#
-# This file is subject to the terms and conditions of the GNU General Public
-# License.  See the file "COPYING" in the main directory of this archive
-# for more details.
-#
-# Copyright (C) 2008 Silicon Graphics, Inc.  All Rights Reserved.
-#
-
-ccflags-y := -Iarch/ia64/sn/include
-
-obj-y				+= setup.o
diff --git a/arch/ia64/uv/kernel/setup.c b/arch/ia64/uv/kernel/setup.c
deleted file mode 100644
index bb025486d791..000000000000
--- a/arch/ia64/uv/kernel/setup.c
+++ /dev/null
@@ -1,120 +0,0 @@
-/*
- * This file is subject to the terms and conditions of the GNU General Public
- * License.  See the file "COPYING" in the main directory of this archive
- * for more details.
- *
- * SGI UV Core Functions
- *
- * Copyright (C) 2008 Silicon Graphics, Inc. All rights reserved.
- */
-
-#include <linux/acpi.h>
-#include <linux/efi.h>
-#include <linux/module.h>
-#include <linux/percpu.h>
-#include <asm/uv/uv.h>
-#include <asm/uv/uv_mmrs.h>
-#include <asm/uv/uv_hub.h>
-
-bool ia64_is_uv;
-EXPORT_SYMBOL_GPL(ia64_is_uv);
-
-DEFINE_PER_CPU(struct uv_hub_info_s, __uv_hub_info);
-EXPORT_PER_CPU_SYMBOL_GPL(__uv_hub_info);
-
-struct redir_addr {
-	unsigned long redirect;
-	unsigned long alias;
-};
-
-#define DEST_SHIFT UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_0_MMR_DEST_BASE_SHFT
-
-static __initdata struct redir_addr redir_addrs[] = {
-	{UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_0_MMR, UVH_SI_ALIAS0_OVERLAY_CONFIG},
-	{UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_1_MMR, UVH_SI_ALIAS1_OVERLAY_CONFIG},
-	{UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_2_MMR, UVH_SI_ALIAS2_OVERLAY_CONFIG},
-};
-
-static __init void get_lowmem_redirect(unsigned long *base, unsigned long *size)
-{
-	union uvh_si_alias0_overlay_config_u alias;
-	union uvh_rh_gam_alias210_redirect_config_2_mmr_u redirect;
-	int i;
-
-	for (i = 0; i < ARRAY_SIZE(redir_addrs); i++) {
-		alias.v = uv_read_local_mmr(redir_addrs[i].alias);
-		if (alias.s.base == 0) {
-			*size = (1UL << alias.s.m_alias);
-			redirect.v = uv_read_local_mmr(redir_addrs[i].redirect);
-			*base = (unsigned long)redirect.s.dest_base << DEST_SHIFT;
-			return;
-		}
-	}
-	BUG();
-}
-
-void __init uv_probe_system_type(void)
-{
-	struct acpi_table_rsdp *rsdp;
-	struct acpi_table_xsdt *xsdt;
-
-	if (efi.acpi20 == EFI_INVALID_TABLE_ADDR) {
-		pr_err("ACPI 2.0 RSDP not found.\n");
-		return;
-	}
-
-	rsdp = (struct acpi_table_rsdp *)__va(efi.acpi20);
-	if (strncmp(rsdp->signature, ACPI_SIG_RSDP, sizeof(ACPI_SIG_RSDP) - 1)) {
-		pr_err("ACPI 2.0 RSDP signature incorrect.\n");
-		return;
-	}
-
-	xsdt = (struct acpi_table_xsdt *)__va(rsdp->xsdt_physical_address);
-	if (strncmp(xsdt->header.signature, ACPI_SIG_XSDT,
-			sizeof(ACPI_SIG_XSDT) - 1)) {
-		pr_err("ACPI 2.0 XSDT signature incorrect.\n");
-		return;
-	}
-
-	if (!strcmp(xsdt->header.oem_id, "SGI") &&
-	    !strcmp(xsdt->header.oem_table_id + 4, "UV"))
-		ia64_is_uv = true;
-}
-
-void __init uv_setup(char **cmdline_p)
-{
-	union uvh_si_addr_map_config_u m_n_config;
-	union uvh_node_id_u node_id;
-	unsigned long gnode_upper;
-	int nid, cpu, m_val, n_val;
-	unsigned long mmr_base, lowmem_redir_base, lowmem_redir_size;
-
-	get_lowmem_redirect(&lowmem_redir_base, &lowmem_redir_size);
-	node_id.v = uv_read_local_mmr(UVH_NODE_ID);
-	m_n_config.v = uv_read_local_mmr(UVH_SI_ADDR_MAP_CONFIG);
-	mmr_base = uv_read_local_mmr(UVH_RH_GAM_MMR_OVERLAY_CONFIG_MMR) &
-			~UV_MMR_ENABLE;
-
-	m_val = m_n_config.s.m_skt;
-	n_val = m_n_config.s.n_skt;
-	printk(KERN_DEBUG "UV: global MMR base 0x%lx\n", mmr_base);
-
-	gnode_upper = (((unsigned long)node_id.s.node_id) &
-		       ~((1 << n_val) - 1)) << m_val;
-
-	for_each_present_cpu(cpu) {
-		nid = cpu_to_node(cpu);
-		uv_cpu_hub_info(cpu)->lowmem_remap_base = lowmem_redir_base;
-		uv_cpu_hub_info(cpu)->lowmem_remap_top =
-			lowmem_redir_base + lowmem_redir_size;
-		uv_cpu_hub_info(cpu)->m_val = m_val;
-		uv_cpu_hub_info(cpu)->n_val = n_val;
-		uv_cpu_hub_info(cpu)->pnode_mask = (1 << n_val) -1;
-		uv_cpu_hub_info(cpu)->gpa_mask = (1 << (m_val + n_val)) - 1;
-		uv_cpu_hub_info(cpu)->gnode_upper = gnode_upper;
-		uv_cpu_hub_info(cpu)->global_mmr_base = mmr_base;
-		uv_cpu_hub_info(cpu)->coherency_domain_number = 0;/* ZZZ */
-		printk(KERN_DEBUG "UV cpu %d, nid %d\n", cpu, nid);
-	}
-}
-
diff --git a/drivers/acpi/Kconfig b/drivers/acpi/Kconfig
index cee82b473dc5..554e487cbfab 100644
--- a/drivers/acpi/Kconfig
+++ b/drivers/acpi/Kconfig
@@ -257,7 +257,7 @@ config ACPI_CPU_FREQ_PSS
 config ACPI_PROCESSOR_CSTATE
 	def_bool y
 	depends on ACPI_PROCESSOR
-	depends on IA64 || X86
+	depends on X86
 
 config ACPI_PROCESSOR_IDLE
 	bool
@@ -281,9 +281,9 @@ config ACPI_CPPC_LIB
 
 config ACPI_PROCESSOR
 	tristate "Processor"
-	depends on X86 || IA64 || ARM64 || LOONGARCH
+	depends on X86 || ARM64 || LOONGARCH
 	select ACPI_PROCESSOR_IDLE
-	select ACPI_CPU_FREQ_PSS if X86 || IA64 || LOONGARCH
+	select ACPI_CPU_FREQ_PSS if X86 || LOONGARCH
 	select THERMAL
 	default y
 	help
diff --git a/drivers/acpi/numa/Kconfig b/drivers/acpi/numa/Kconfig
index 39b1f34c21df..849c2bd820b9 100644
--- a/drivers/acpi/numa/Kconfig
+++ b/drivers/acpi/numa/Kconfig
@@ -2,8 +2,8 @@
 config ACPI_NUMA
 	bool "NUMA support"
 	depends on NUMA
-	depends on (X86 || IA64 || ARM64 || LOONGARCH)
-	default y if IA64 || ARM64
+	depends on (X86 || ARM64 || LOONGARCH)
+	default y if ARM64
 
 config ACPI_HMAT
 	bool "ACPI Heterogeneous Memory Attribute Table Support"
diff --git a/drivers/acpi/osl.c b/drivers/acpi/osl.c
index f725813d0cce..06289a93d3d2 100644
--- a/drivers/acpi/osl.c
+++ b/drivers/acpi/osl.c
@@ -276,7 +276,7 @@ acpi_map_lookup_virt(void __iomem *virt, acpi_size size)
 	return NULL;
 }
 
-#if defined(CONFIG_IA64) || defined(CONFIG_ARM64) || defined(CONFIG_RISCV)
+#if defined(CONFIG_ARM64) || defined(CONFIG_RISCV)
 /* ioremap will take care of cache attributes */
 #define should_use_kmap(pfn)   0
 #else
diff --git a/drivers/char/Kconfig b/drivers/char/Kconfig
index 625af75833fc..7c8dd0abcfdf 100644
--- a/drivers/char/Kconfig
+++ b/drivers/char/Kconfig
@@ -348,7 +348,7 @@ config DEVPORT
 	  device is similar to /dev/mem, but for I/O ports.
 
 config HPET
-	bool "HPET - High Precision Event Timer" if (X86 || IA64)
+	bool "HPET - High Precision Event Timer" if X86
 	default n
 	depends on ACPI
 	help
@@ -377,7 +377,7 @@ config HPET_MMAP_DEFAULT
 
 config HANGCHECK_TIMER
 	tristate "Hangcheck timer"
-	depends on X86 || IA64 || PPC64 || S390
+	depends on X86 || PPC64 || S390
 	help
 	  The hangcheck-timer module detects when the system has gone
 	  out to lunch past a certain margin.  It can reboot the system
diff --git a/drivers/char/Makefile b/drivers/char/Makefile
index c5f532e412f1..e9b360cdc99a 100644
--- a/drivers/char/Makefile
+++ b/drivers/char/Makefile
@@ -8,7 +8,6 @@ obj-$(CONFIG_TTY_PRINTK)	+= ttyprintk.o
 obj-y				+= misc.o
 obj-$(CONFIG_ATARI_DSP56K)	+= dsp56k.o
 obj-$(CONFIG_VIRTIO_CONSOLE)	+= virtio_console.o
-obj-$(CONFIG_MSPEC)		+= mspec.o
 obj-$(CONFIG_UV_MMTIMER)	+= uv_mmtimer.o
 obj-$(CONFIG_IBM_BSR)		+= bsr.o
 
diff --git a/drivers/char/agp/Kconfig b/drivers/char/agp/Kconfig
index 4f501e4842ab..c47eb7bf06d4 100644
--- a/drivers/char/agp/Kconfig
+++ b/drivers/char/agp/Kconfig
@@ -1,7 +1,7 @@
 # SPDX-License-Identifier: GPL-2.0
 menuconfig AGP
 	tristate "/dev/agpgart (AGP Support)"
-	depends on ALPHA || IA64 || PARISC || PPC || X86
+	depends on ALPHA || PARISC || PPC || X86
 	depends on PCI
 	help
 	  AGP (Accelerated Graphics Port) is a bus system mainly used to
@@ -109,20 +109,6 @@ config AGP_VIA
 	  This option gives you AGP support for the GLX component of
 	  X on VIA MVP3/Apollo Pro chipsets.
 
-config AGP_I460
-	tristate "Intel 460GX chipset support"
-	depends on AGP && IA64
-	help
-	  This option gives you AGP GART support for the Intel 460GX chipset
-	  for IA64 processors.
-
-config AGP_HP_ZX1
-	tristate "HP ZX1 chipset AGP support"
-	depends on AGP && IA64
-	help
-	  This option gives you AGP GART support for the HP ZX1 chipset
-	  for IA64 processors.
-
 config AGP_PARISC
 	tristate "HP Quicksilver AGP support"
 	depends on AGP && PARISC && 64BIT && IOMMU_SBA
diff --git a/drivers/char/agp/Makefile b/drivers/char/agp/Makefile
index 90ed8c789e48..25834557e486 100644
--- a/drivers/char/agp/Makefile
+++ b/drivers/char/agp/Makefile
@@ -14,9 +14,7 @@ obj-$(CONFIG_AGP_AMD)		+= amd-k7-agp.o
 obj-$(CONFIG_AGP_AMD64)		+= amd64-agp.o
 obj-$(CONFIG_AGP_ALPHA_CORE)	+= alpha-agp.o
 obj-$(CONFIG_AGP_EFFICEON)	+= efficeon-agp.o
-obj-$(CONFIG_AGP_HP_ZX1)	+= hp-agp.o
 obj-$(CONFIG_AGP_PARISC)	+= parisc-agp.o
-obj-$(CONFIG_AGP_I460)		+= i460-agp.o
 obj-$(CONFIG_AGP_INTEL)		+= intel-agp.o
 obj-$(CONFIG_INTEL_GTT)		+= intel-gtt.o
 obj-$(CONFIG_AGP_NVIDIA)	+= nvidia-agp.o
diff --git a/drivers/char/agp/hp-agp.c b/drivers/char/agp/hp-agp.c
deleted file mode 100644
index 84d9adbb62f6..000000000000
--- a/drivers/char/agp/hp-agp.c
+++ /dev/null
@@ -1,550 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * HP zx1 AGPGART routines.
- *
- * (c) Copyright 2002, 2003 Hewlett-Packard Development Company, L.P.
- *	Bjorn Helgaas <bjorn.helgaas@hp.com>
- */
-
-#include <linux/acpi.h>
-#include <linux/module.h>
-#include <linux/pci.h>
-#include <linux/init.h>
-#include <linux/agp_backend.h>
-#include <linux/log2.h>
-#include <linux/slab.h>
-
-#include <asm/acpi-ext.h>
-
-#include "agp.h"
-
-#define HP_ZX1_IOC_OFFSET	0x1000  /* ACPI reports SBA, we want IOC */
-
-/* HP ZX1 IOC registers */
-#define HP_ZX1_IBASE		0x300
-#define HP_ZX1_IMASK		0x308
-#define HP_ZX1_PCOM		0x310
-#define HP_ZX1_TCNFG		0x318
-#define HP_ZX1_PDIR_BASE	0x320
-
-#define HP_ZX1_IOVA_BASE	GB(1UL)
-#define HP_ZX1_IOVA_SIZE	GB(1UL)
-#define HP_ZX1_GART_SIZE	(HP_ZX1_IOVA_SIZE / 2)
-#define HP_ZX1_SBA_IOMMU_COOKIE	0x0000badbadc0ffeeUL
-
-#define HP_ZX1_PDIR_VALID_BIT	0x8000000000000000UL
-#define HP_ZX1_IOVA_TO_PDIR(va)	((va - hp_private.iova_base) >> hp_private.io_tlb_shift)
-
-#define AGP8X_MODE_BIT		3
-#define AGP8X_MODE		(1 << AGP8X_MODE_BIT)
-
-/* AGP bridge need not be PCI device, but DRM thinks it is. */
-static struct pci_dev fake_bridge_dev;
-
-static int hp_zx1_gart_found;
-
-static struct aper_size_info_fixed hp_zx1_sizes[] =
-{
-	{0, 0, 0},		/* filled in by hp_zx1_fetch_size() */
-};
-
-static struct gatt_mask hp_zx1_masks[] =
-{
-	{.mask = HP_ZX1_PDIR_VALID_BIT, .type = 0}
-};
-
-static struct _hp_private {
-	volatile u8 __iomem *ioc_regs;
-	volatile u8 __iomem *lba_regs;
-	int lba_cap_offset;
-	u64 *io_pdir;		// PDIR for entire IOVA
-	u64 *gatt;		// PDIR just for GART (subset of above)
-	u64 gatt_entries;
-	u64 iova_base;
-	u64 gart_base;
-	u64 gart_size;
-	u64 io_pdir_size;
-	int io_pdir_owner;	// do we own it, or share it with sba_iommu?
-	int io_page_size;
-	int io_tlb_shift;
-	int io_tlb_ps;		// IOC ps config
-	int io_pages_per_kpage;
-} hp_private;
-
-static int __init hp_zx1_ioc_shared(void)
-{
-	struct _hp_private *hp = &hp_private;
-
-	printk(KERN_INFO PFX "HP ZX1 IOC: IOPDIR shared with sba_iommu\n");
-
-	/*
-	 * IOC already configured by sba_iommu module; just use
-	 * its setup.  We assume:
-	 *	- IOVA space is 1Gb in size
-	 *	- first 512Mb is IOMMU, second 512Mb is GART
-	 */
-	hp->io_tlb_ps = readq(hp->ioc_regs+HP_ZX1_TCNFG);
-	switch (hp->io_tlb_ps) {
-		case 0: hp->io_tlb_shift = 12; break;
-		case 1: hp->io_tlb_shift = 13; break;
-		case 2: hp->io_tlb_shift = 14; break;
-		case 3: hp->io_tlb_shift = 16; break;
-		default:
-			printk(KERN_ERR PFX "Invalid IOTLB page size "
-			       "configuration 0x%x\n", hp->io_tlb_ps);
-			hp->gatt = NULL;
-			hp->gatt_entries = 0;
-			return -ENODEV;
-	}
-	hp->io_page_size = 1 << hp->io_tlb_shift;
-	hp->io_pages_per_kpage = PAGE_SIZE / hp->io_page_size;
-
-	hp->iova_base = readq(hp->ioc_regs+HP_ZX1_IBASE) & ~0x1;
-	hp->gart_base = hp->iova_base + HP_ZX1_IOVA_SIZE - HP_ZX1_GART_SIZE;
-
-	hp->gart_size = HP_ZX1_GART_SIZE;
-	hp->gatt_entries = hp->gart_size / hp->io_page_size;
-
-	hp->io_pdir = phys_to_virt(readq(hp->ioc_regs+HP_ZX1_PDIR_BASE));
-	hp->gatt = &hp->io_pdir[HP_ZX1_IOVA_TO_PDIR(hp->gart_base)];
-
-	if (hp->gatt[0] != HP_ZX1_SBA_IOMMU_COOKIE) {
-		/* Normal case when no AGP device in system */
-		hp->gatt = NULL;
-		hp->gatt_entries = 0;
-		printk(KERN_ERR PFX "No reserved IO PDIR entry found; "
-		       "GART disabled\n");
-		return -ENODEV;
-	}
-
-	return 0;
-}
-
-static int __init
-hp_zx1_ioc_owner (void)
-{
-	struct _hp_private *hp = &hp_private;
-
-	printk(KERN_INFO PFX "HP ZX1 IOC: IOPDIR dedicated to GART\n");
-
-	/*
-	 * Select an IOV page size no larger than system page size.
-	 */
-	if (PAGE_SIZE >= KB(64)) {
-		hp->io_tlb_shift = 16;
-		hp->io_tlb_ps = 3;
-	} else if (PAGE_SIZE >= KB(16)) {
-		hp->io_tlb_shift = 14;
-		hp->io_tlb_ps = 2;
-	} else if (PAGE_SIZE >= KB(8)) {
-		hp->io_tlb_shift = 13;
-		hp->io_tlb_ps = 1;
-	} else {
-		hp->io_tlb_shift = 12;
-		hp->io_tlb_ps = 0;
-	}
-	hp->io_page_size = 1 << hp->io_tlb_shift;
-	hp->io_pages_per_kpage = PAGE_SIZE / hp->io_page_size;
-
-	hp->iova_base = HP_ZX1_IOVA_BASE;
-	hp->gart_size = HP_ZX1_GART_SIZE;
-	hp->gart_base = hp->iova_base + HP_ZX1_IOVA_SIZE - hp->gart_size;
-
-	hp->gatt_entries = hp->gart_size / hp->io_page_size;
-	hp->io_pdir_size = (HP_ZX1_IOVA_SIZE / hp->io_page_size) * sizeof(u64);
-
-	return 0;
-}
-
-static int __init
-hp_zx1_ioc_init (u64 hpa)
-{
-	struct _hp_private *hp = &hp_private;
-
-	hp->ioc_regs = ioremap(hpa, 1024);
-	if (!hp->ioc_regs)
-		return -ENOMEM;
-
-	/*
-	 * If the IOTLB is currently disabled, we can take it over.
-	 * Otherwise, we have to share with sba_iommu.
-	 */
-	hp->io_pdir_owner = (readq(hp->ioc_regs+HP_ZX1_IBASE) & 0x1) == 0;
-
-	if (hp->io_pdir_owner)
-		return hp_zx1_ioc_owner();
-
-	return hp_zx1_ioc_shared();
-}
-
-static int
-hp_zx1_lba_find_capability (volatile u8 __iomem *hpa, int cap)
-{
-	u16 status;
-	u8 pos, id;
-	int ttl = 48;
-
-	status = readw(hpa+PCI_STATUS);
-	if (!(status & PCI_STATUS_CAP_LIST))
-		return 0;
-	pos = readb(hpa+PCI_CAPABILITY_LIST);
-	while (ttl-- && pos >= 0x40) {
-		pos &= ~3;
-		id = readb(hpa+pos+PCI_CAP_LIST_ID);
-		if (id == 0xff)
-			break;
-		if (id == cap)
-			return pos;
-		pos = readb(hpa+pos+PCI_CAP_LIST_NEXT);
-	}
-	return 0;
-}
-
-static int __init
-hp_zx1_lba_init (u64 hpa)
-{
-	struct _hp_private *hp = &hp_private;
-	int cap;
-
-	hp->lba_regs = ioremap(hpa, 256);
-	if (!hp->lba_regs)
-		return -ENOMEM;
-
-	hp->lba_cap_offset = hp_zx1_lba_find_capability(hp->lba_regs, PCI_CAP_ID_AGP);
-
-	cap = readl(hp->lba_regs+hp->lba_cap_offset) & 0xff;
-	if (cap != PCI_CAP_ID_AGP) {
-		printk(KERN_ERR PFX "Invalid capability ID 0x%02x at 0x%x\n",
-		       cap, hp->lba_cap_offset);
-		iounmap(hp->lba_regs);
-		return -ENODEV;
-	}
-
-	return 0;
-}
-
-static int
-hp_zx1_fetch_size(void)
-{
-	int size;
-
-	size = hp_private.gart_size / MB(1);
-	hp_zx1_sizes[0].size = size;
-	agp_bridge->current_size = (void *) &hp_zx1_sizes[0];
-	return size;
-}
-
-static int
-hp_zx1_configure (void)
-{
-	struct _hp_private *hp = &hp_private;
-
-	agp_bridge->gart_bus_addr = hp->gart_base;
-	agp_bridge->capndx = hp->lba_cap_offset;
-	agp_bridge->mode = readl(hp->lba_regs+hp->lba_cap_offset+PCI_AGP_STATUS);
-
-	if (hp->io_pdir_owner) {
-		writel(virt_to_phys(hp->io_pdir), hp->ioc_regs+HP_ZX1_PDIR_BASE);
-		readl(hp->ioc_regs+HP_ZX1_PDIR_BASE);
-		writel(hp->io_tlb_ps, hp->ioc_regs+HP_ZX1_TCNFG);
-		readl(hp->ioc_regs+HP_ZX1_TCNFG);
-		writel((unsigned int)(~(HP_ZX1_IOVA_SIZE-1)), hp->ioc_regs+HP_ZX1_IMASK);
-		readl(hp->ioc_regs+HP_ZX1_IMASK);
-		writel(hp->iova_base|1, hp->ioc_regs+HP_ZX1_IBASE);
-		readl(hp->ioc_regs+HP_ZX1_IBASE);
-		writel(hp->iova_base|ilog2(HP_ZX1_IOVA_SIZE), hp->ioc_regs+HP_ZX1_PCOM);
-		readl(hp->ioc_regs+HP_ZX1_PCOM);
-	}
-
-	return 0;
-}
-
-static void
-hp_zx1_cleanup (void)
-{
-	struct _hp_private *hp = &hp_private;
-
-	if (hp->ioc_regs) {
-		if (hp->io_pdir_owner) {
-			writeq(0, hp->ioc_regs+HP_ZX1_IBASE);
-			readq(hp->ioc_regs+HP_ZX1_IBASE);
-		}
-		iounmap(hp->ioc_regs);
-	}
-	if (hp->lba_regs)
-		iounmap(hp->lba_regs);
-}
-
-static void
-hp_zx1_tlbflush (struct agp_memory *mem)
-{
-	struct _hp_private *hp = &hp_private;
-
-	writeq(hp->gart_base | ilog2(hp->gart_size), hp->ioc_regs+HP_ZX1_PCOM);
-	readq(hp->ioc_regs+HP_ZX1_PCOM);
-}
-
-static int
-hp_zx1_create_gatt_table (struct agp_bridge_data *bridge)
-{
-	struct _hp_private *hp = &hp_private;
-	int i;
-
-	if (hp->io_pdir_owner) {
-		hp->io_pdir = (u64 *) __get_free_pages(GFP_KERNEL,
-						get_order(hp->io_pdir_size));
-		if (!hp->io_pdir) {
-			printk(KERN_ERR PFX "Couldn't allocate contiguous "
-				"memory for I/O PDIR\n");
-			hp->gatt = NULL;
-			hp->gatt_entries = 0;
-			return -ENOMEM;
-		}
-		memset(hp->io_pdir, 0, hp->io_pdir_size);
-
-		hp->gatt = &hp->io_pdir[HP_ZX1_IOVA_TO_PDIR(hp->gart_base)];
-	}
-
-	for (i = 0; i < hp->gatt_entries; i++) {
-		hp->gatt[i] = (unsigned long) agp_bridge->scratch_page;
-	}
-
-	return 0;
-}
-
-static int
-hp_zx1_free_gatt_table (struct agp_bridge_data *bridge)
-{
-	struct _hp_private *hp = &hp_private;
-
-	if (hp->io_pdir_owner)
-		free_pages((unsigned long) hp->io_pdir,
-			    get_order(hp->io_pdir_size));
-	else
-		hp->gatt[0] = HP_ZX1_SBA_IOMMU_COOKIE;
-	return 0;
-}
-
-static int
-hp_zx1_insert_memory (struct agp_memory *mem, off_t pg_start, int type)
-{
-	struct _hp_private *hp = &hp_private;
-	int i, k;
-	off_t j, io_pg_start;
-	int io_pg_count;
-
-	if (type != mem->type ||
-		agp_bridge->driver->agp_type_to_mask_type(agp_bridge, type)) {
-		return -EINVAL;
-	}
-
-	io_pg_start = hp->io_pages_per_kpage * pg_start;
-	io_pg_count = hp->io_pages_per_kpage * mem->page_count;
-	if ((io_pg_start + io_pg_count) > hp->gatt_entries) {
-		return -EINVAL;
-	}
-
-	j = io_pg_start;
-	while (j < (io_pg_start + io_pg_count)) {
-		if (hp->gatt[j]) {
-			return -EBUSY;
-		}
-		j++;
-	}
-
-	if (!mem->is_flushed) {
-		global_cache_flush();
-		mem->is_flushed = true;
-	}
-
-	for (i = 0, j = io_pg_start; i < mem->page_count; i++) {
-		unsigned long paddr;
-
-		paddr = page_to_phys(mem->pages[i]);
-		for (k = 0;
-		     k < hp->io_pages_per_kpage;
-		     k++, j++, paddr += hp->io_page_size) {
-			hp->gatt[j] = HP_ZX1_PDIR_VALID_BIT | paddr;
-		}
-	}
-
-	agp_bridge->driver->tlb_flush(mem);
-	return 0;
-}
-
-static int
-hp_zx1_remove_memory (struct agp_memory *mem, off_t pg_start, int type)
-{
-	struct _hp_private *hp = &hp_private;
-	int i, io_pg_start, io_pg_count;
-
-	if (type != mem->type ||
-		agp_bridge->driver->agp_type_to_mask_type(agp_bridge, type)) {
-		return -EINVAL;
-	}
-
-	io_pg_start = hp->io_pages_per_kpage * pg_start;
-	io_pg_count = hp->io_pages_per_kpage * mem->page_count;
-	for (i = io_pg_start; i < io_pg_count + io_pg_start; i++) {
-		hp->gatt[i] = agp_bridge->scratch_page;
-	}
-
-	agp_bridge->driver->tlb_flush(mem);
-	return 0;
-}
-
-static unsigned long
-hp_zx1_mask_memory (struct agp_bridge_data *bridge, dma_addr_t addr, int type)
-{
-	return HP_ZX1_PDIR_VALID_BIT | addr;
-}
-
-static void
-hp_zx1_enable (struct agp_bridge_data *bridge, u32 mode)
-{
-	struct _hp_private *hp = &hp_private;
-	u32 command;
-
-	command = readl(hp->lba_regs+hp->lba_cap_offset+PCI_AGP_STATUS);
-	command = agp_collect_device_status(bridge, mode, command);
-	command |= 0x00000100;
-
-	writel(command, hp->lba_regs+hp->lba_cap_offset+PCI_AGP_COMMAND);
-
-	agp_device_command(command, (mode & AGP8X_MODE) != 0);
-}
-
-const struct agp_bridge_driver hp_zx1_driver = {
-	.owner			= THIS_MODULE,
-	.size_type		= FIXED_APER_SIZE,
-	.configure		= hp_zx1_configure,
-	.fetch_size		= hp_zx1_fetch_size,
-	.cleanup		= hp_zx1_cleanup,
-	.tlb_flush		= hp_zx1_tlbflush,
-	.mask_memory		= hp_zx1_mask_memory,
-	.masks			= hp_zx1_masks,
-	.agp_enable		= hp_zx1_enable,
-	.cache_flush		= global_cache_flush,
-	.create_gatt_table	= hp_zx1_create_gatt_table,
-	.free_gatt_table	= hp_zx1_free_gatt_table,
-	.insert_memory		= hp_zx1_insert_memory,
-	.remove_memory		= hp_zx1_remove_memory,
-	.alloc_by_type		= agp_generic_alloc_by_type,
-	.free_by_type		= agp_generic_free_by_type,
-	.agp_alloc_page		= agp_generic_alloc_page,
-	.agp_alloc_pages	= agp_generic_alloc_pages,
-	.agp_destroy_page	= agp_generic_destroy_page,
-	.agp_destroy_pages	= agp_generic_destroy_pages,
-	.agp_type_to_mask_type  = agp_generic_type_to_mask_type,
-	.cant_use_aperture	= true,
-};
-
-static int __init
-hp_zx1_setup (u64 ioc_hpa, u64 lba_hpa)
-{
-	struct agp_bridge_data *bridge;
-	int error = 0;
-
-	error = hp_zx1_ioc_init(ioc_hpa);
-	if (error)
-		goto fail;
-
-	error = hp_zx1_lba_init(lba_hpa);
-	if (error)
-		goto fail;
-
-	bridge = agp_alloc_bridge();
-	if (!bridge) {
-		error = -ENOMEM;
-		goto fail;
-	}
-	bridge->driver = &hp_zx1_driver;
-
-	fake_bridge_dev.vendor = PCI_VENDOR_ID_HP;
-	fake_bridge_dev.device = PCI_DEVICE_ID_HP_PCIX_LBA;
-	bridge->dev = &fake_bridge_dev;
-
-	error = agp_add_bridge(bridge);
-  fail:
-	if (error)
-		hp_zx1_cleanup();
-	return error;
-}
-
-static acpi_status __init
-zx1_gart_probe (acpi_handle obj, u32 depth, void *context, void **ret)
-{
-	acpi_handle handle, parent;
-	acpi_status status;
-	struct acpi_device_info *info;
-	u64 lba_hpa, sba_hpa, length;
-	int match;
-
-	status = hp_acpi_csr_space(obj, &lba_hpa, &length);
-	if (ACPI_FAILURE(status))
-		return AE_OK; /* keep looking for another bridge */
-
-	/* Look for an enclosing IOC scope and find its CSR space */
-	handle = obj;
-	do {
-		status = acpi_get_object_info(handle, &info);
-		if (ACPI_SUCCESS(status) && (info->valid & ACPI_VALID_HID)) {
-			/* TBD check _CID also */
-			match = (strcmp(info->hardware_id.string, "HWP0001") == 0);
-			kfree(info);
-			if (match) {
-				status = hp_acpi_csr_space(handle, &sba_hpa, &length);
-				if (ACPI_SUCCESS(status))
-					break;
-				else {
-					printk(KERN_ERR PFX "Detected HP ZX1 "
-					       "AGP LBA but no IOC.\n");
-					return AE_OK;
-				}
-			}
-		}
-
-		status = acpi_get_parent(handle, &parent);
-		handle = parent;
-	} while (ACPI_SUCCESS(status));
-
-	if (ACPI_FAILURE(status))
-		return AE_OK;	/* found no enclosing IOC */
-
-	if (hp_zx1_setup(sba_hpa + HP_ZX1_IOC_OFFSET, lba_hpa))
-		return AE_OK;
-
-	printk(KERN_INFO PFX "Detected HP ZX1 %s AGP chipset "
-		"(ioc=%llx, lba=%llx)\n", (char *)context,
-		sba_hpa + HP_ZX1_IOC_OFFSET, lba_hpa);
-
-	hp_zx1_gart_found = 1;
-	return AE_CTRL_TERMINATE; /* we only support one bridge; quit looking */
-}
-
-static int __init
-agp_hp_init (void)
-{
-	if (agp_off)
-		return -EINVAL;
-
-	acpi_get_devices("HWP0003", zx1_gart_probe, "HWP0003", NULL);
-	if (hp_zx1_gart_found)
-		return 0;
-
-	acpi_get_devices("HWP0007", zx1_gart_probe, "HWP0007", NULL);
-	if (hp_zx1_gart_found)
-		return 0;
-
-	return -ENODEV;
-}
-
-static void __exit
-agp_hp_cleanup (void)
-{
-}
-
-module_init(agp_hp_init);
-module_exit(agp_hp_cleanup);
-
-MODULE_LICENSE("GPL and additional rights");
diff --git a/drivers/char/agp/i460-agp.c b/drivers/char/agp/i460-agp.c
deleted file mode 100644
index 15b240ea4848..000000000000
--- a/drivers/char/agp/i460-agp.c
+++ /dev/null
@@ -1,659 +0,0 @@
-/*
- * For documentation on the i460 AGP interface, see Chapter 7 (AGP Subsystem) of
- * the "Intel 460GTX Chipset Software Developer's Manual":
- * http://www.intel.com/design/archives/itanium/downloads/248704.htm 
- */
-/*
- * 460GX support by Chris Ahna <christopher.j.ahna@intel.com>
- * Clean up & simplification by David Mosberger-Tang <davidm@hpl.hp.com>
- */
-#include <linux/module.h>
-#include <linux/pci.h>
-#include <linux/init.h>
-#include <linux/string.h>
-#include <linux/slab.h>
-#include <linux/agp_backend.h>
-#include <linux/log2.h>
-
-#include "agp.h"
-
-#define INTEL_I460_BAPBASE		0x98
-#define INTEL_I460_GXBCTL		0xa0
-#define INTEL_I460_AGPSIZ		0xa2
-#define INTEL_I460_ATTBASE		0xfe200000
-#define INTEL_I460_GATT_VALID		(1UL << 24)
-#define INTEL_I460_GATT_COHERENT	(1UL << 25)
-
-/*
- * The i460 can operate with large (4MB) pages, but there is no sane way to support this
- * within the current kernel/DRM environment, so we disable the relevant code for now.
- * See also comments in ia64_alloc_page()...
- */
-#define I460_LARGE_IO_PAGES		0
-
-#if I460_LARGE_IO_PAGES
-# define I460_IO_PAGE_SHIFT		i460.io_page_shift
-#else
-# define I460_IO_PAGE_SHIFT		12
-#endif
-
-#define I460_IOPAGES_PER_KPAGE		(PAGE_SIZE >> I460_IO_PAGE_SHIFT)
-#define I460_KPAGES_PER_IOPAGE		(1 << (I460_IO_PAGE_SHIFT - PAGE_SHIFT))
-#define I460_SRAM_IO_DISABLE		(1 << 4)
-#define I460_BAPBASE_ENABLE		(1 << 3)
-#define I460_AGPSIZ_MASK		0x7
-#define I460_4M_PS			(1 << 1)
-
-/* Control bits for Out-Of-GART coherency and Burst Write Combining */
-#define I460_GXBCTL_OOG		(1UL << 0)
-#define I460_GXBCTL_BWC		(1UL << 2)
-
-/*
- * gatt_table entries are 32-bits wide on the i460; the generic code ought to declare the
- * gatt_table and gatt_table_real pointers a "void *"...
- */
-#define RD_GATT(index)		readl((u32 *) i460.gatt + (index))
-#define WR_GATT(index, val)	writel((val), (u32 *) i460.gatt + (index))
-/*
- * The 460 spec says we have to read the last location written to make sure that all
- * writes have taken effect
- */
-#define WR_FLUSH_GATT(index)	RD_GATT(index)
-
-static unsigned long i460_mask_memory (struct agp_bridge_data *bridge,
-				       dma_addr_t addr, int type);
-
-static struct {
-	void *gatt;				/* ioremap'd GATT area */
-
-	/* i460 supports multiple GART page sizes, so GART pageshift is dynamic: */
-	u8 io_page_shift;
-
-	/* BIOS configures chipset to one of 2 possible apbase values: */
-	u8 dynamic_apbase;
-
-	/* structure for tracking partial use of 4MB GART pages: */
-	struct lp_desc {
-		unsigned long *alloced_map;	/* bitmap of kernel-pages in use */
-		int refcount;			/* number of kernel pages using the large page */
-		u64 paddr;			/* physical address of large page */
-		struct page *page; 		/* page pointer */
-	} *lp_desc;
-} i460;
-
-static const struct aper_size_info_8 i460_sizes[3] =
-{
-	/*
-	 * The 32GB aperture is only available with a 4M GART page size.  Due to the
-	 * dynamic GART page size, we can't figure out page_order or num_entries until
-	 * runtime.
-	 */
-	{32768, 0, 0, 4},
-	{1024, 0, 0, 2},
-	{256, 0, 0, 1}
-};
-
-static struct gatt_mask i460_masks[] =
-{
-	{
-	  .mask = INTEL_I460_GATT_VALID | INTEL_I460_GATT_COHERENT,
-	  .type = 0
-	}
-};
-
-static int i460_fetch_size (void)
-{
-	int i;
-	u8 temp;
-	struct aper_size_info_8 *values;
-
-	/* Determine the GART page size */
-	pci_read_config_byte(agp_bridge->dev, INTEL_I460_GXBCTL, &temp);
-	i460.io_page_shift = (temp & I460_4M_PS) ? 22 : 12;
-	pr_debug("i460_fetch_size: io_page_shift=%d\n", i460.io_page_shift);
-
-	if (i460.io_page_shift != I460_IO_PAGE_SHIFT) {
-		printk(KERN_ERR PFX
-			"I/O (GART) page-size %luKB doesn't match expected "
-				"size %luKB\n",
-			1UL << (i460.io_page_shift - 10),
-			1UL << (I460_IO_PAGE_SHIFT));
-		return 0;
-	}
-
-	values = A_SIZE_8(agp_bridge->driver->aperture_sizes);
-
-	pci_read_config_byte(agp_bridge->dev, INTEL_I460_AGPSIZ, &temp);
-
-	/* Exit now if the IO drivers for the GART SRAMS are turned off */
-	if (temp & I460_SRAM_IO_DISABLE) {
-		printk(KERN_ERR PFX "GART SRAMS disabled on 460GX chipset\n");
-		printk(KERN_ERR PFX "AGPGART operation not possible\n");
-		return 0;
-	}
-
-	/* Make sure we don't try to create an 2 ^ 23 entry GATT */
-	if ((i460.io_page_shift == 0) && ((temp & I460_AGPSIZ_MASK) == 4)) {
-		printk(KERN_ERR PFX "We can't have a 32GB aperture with 4KB GART pages\n");
-		return 0;
-	}
-
-	/* Determine the proper APBASE register */
-	if (temp & I460_BAPBASE_ENABLE)
-		i460.dynamic_apbase = INTEL_I460_BAPBASE;
-	else
-		i460.dynamic_apbase = AGP_APBASE;
-
-	for (i = 0; i < agp_bridge->driver->num_aperture_sizes; i++) {
-		/*
-		 * Dynamically calculate the proper num_entries and page_order values for
-		 * the define aperture sizes. Take care not to shift off the end of
-		 * values[i].size.
-		 */
-		values[i].num_entries = (values[i].size << 8) >> (I460_IO_PAGE_SHIFT - 12);
-		values[i].page_order = ilog2((sizeof(u32)*values[i].num_entries) >> PAGE_SHIFT);
-	}
-
-	for (i = 0; i < agp_bridge->driver->num_aperture_sizes; i++) {
-		/* Neglect control bits when matching up size_value */
-		if ((temp & I460_AGPSIZ_MASK) == values[i].size_value) {
-			agp_bridge->previous_size = agp_bridge->current_size = (void *) (values + i);
-			agp_bridge->aperture_size_idx = i;
-			return values[i].size;
-		}
-	}
-
-	return 0;
-}
-
-/* There isn't anything to do here since 460 has no GART TLB. */
-static void i460_tlb_flush (struct agp_memory *mem)
-{
-	return;
-}
-
-/*
- * This utility function is needed to prevent corruption of the control bits
- * which are stored along with the aperture size in 460's AGPSIZ register
- */
-static void i460_write_agpsiz (u8 size_value)
-{
-	u8 temp;
-
-	pci_read_config_byte(agp_bridge->dev, INTEL_I460_AGPSIZ, &temp);
-	pci_write_config_byte(agp_bridge->dev, INTEL_I460_AGPSIZ,
-			      ((temp & ~I460_AGPSIZ_MASK) | size_value));
-}
-
-static void i460_cleanup (void)
-{
-	struct aper_size_info_8 *previous_size;
-
-	previous_size = A_SIZE_8(agp_bridge->previous_size);
-	i460_write_agpsiz(previous_size->size_value);
-
-	if (I460_IO_PAGE_SHIFT > PAGE_SHIFT)
-		kfree(i460.lp_desc);
-}
-
-static int i460_configure (void)
-{
-	union {
-		u32 small[2];
-		u64 large;
-	} temp;
-	size_t size;
-	u8 scratch;
-	struct aper_size_info_8 *current_size;
-
-	temp.large = 0;
-
-	current_size = A_SIZE_8(agp_bridge->current_size);
-	i460_write_agpsiz(current_size->size_value);
-
-	/*
-	 * Do the necessary rigmarole to read all eight bytes of APBASE.
-	 * This has to be done since the AGP aperture can be above 4GB on
-	 * 460 based systems.
-	 */
-	pci_read_config_dword(agp_bridge->dev, i460.dynamic_apbase, &(temp.small[0]));
-	pci_read_config_dword(agp_bridge->dev, i460.dynamic_apbase + 4, &(temp.small[1]));
-
-	/* Clear BAR control bits */
-	agp_bridge->gart_bus_addr = temp.large & ~((1UL << 3) - 1);
-
-	pci_read_config_byte(agp_bridge->dev, INTEL_I460_GXBCTL, &scratch);
-	pci_write_config_byte(agp_bridge->dev, INTEL_I460_GXBCTL,
-			      (scratch & 0x02) | I460_GXBCTL_OOG | I460_GXBCTL_BWC);
-
-	/*
-	 * Initialize partial allocation trackers if a GART page is bigger than a kernel
-	 * page.
-	 */
-	if (I460_IO_PAGE_SHIFT > PAGE_SHIFT) {
-		size = current_size->num_entries * sizeof(i460.lp_desc[0]);
-		i460.lp_desc = kzalloc(size, GFP_KERNEL);
-		if (!i460.lp_desc)
-			return -ENOMEM;
-	}
-	return 0;
-}
-
-static int i460_create_gatt_table (struct agp_bridge_data *bridge)
-{
-	int page_order, num_entries, i;
-	void *temp;
-
-	/*
-	 * Load up the fixed address of the GART SRAMS which hold our GATT table.
-	 */
-	temp = agp_bridge->current_size;
-	page_order = A_SIZE_8(temp)->page_order;
-	num_entries = A_SIZE_8(temp)->num_entries;
-
-	i460.gatt = ioremap(INTEL_I460_ATTBASE, PAGE_SIZE << page_order);
-	if (!i460.gatt) {
-		printk(KERN_ERR PFX "ioremap failed\n");
-		return -ENOMEM;
-	}
-
-	/* These are no good, the should be removed from the agp_bridge strucure... */
-	agp_bridge->gatt_table_real = NULL;
-	agp_bridge->gatt_table = NULL;
-	agp_bridge->gatt_bus_addr = 0;
-
-	for (i = 0; i < num_entries; ++i)
-		WR_GATT(i, 0);
-	WR_FLUSH_GATT(i - 1);
-	return 0;
-}
-
-static int i460_free_gatt_table (struct agp_bridge_data *bridge)
-{
-	int num_entries, i;
-	void *temp;
-
-	temp = agp_bridge->current_size;
-
-	num_entries = A_SIZE_8(temp)->num_entries;
-
-	for (i = 0; i < num_entries; ++i)
-		WR_GATT(i, 0);
-	WR_FLUSH_GATT(num_entries - 1);
-
-	iounmap(i460.gatt);
-	return 0;
-}
-
-/*
- * The following functions are called when the I/O (GART) page size is smaller than
- * PAGE_SIZE.
- */
-
-static int i460_insert_memory_small_io_page (struct agp_memory *mem,
-				off_t pg_start, int type)
-{
-	unsigned long paddr, io_pg_start, io_page_size;
-	int i, j, k, num_entries;
-	void *temp;
-
-	pr_debug("i460_insert_memory_small_io_page(mem=%p, pg_start=%ld, type=%d, paddr0=0x%lx)\n",
-		 mem, pg_start, type, page_to_phys(mem->pages[0]));
-
-	if (type >= AGP_USER_TYPES || mem->type >= AGP_USER_TYPES)
-		return -EINVAL;
-
-	io_pg_start = I460_IOPAGES_PER_KPAGE * pg_start;
-
-	temp = agp_bridge->current_size;
-	num_entries = A_SIZE_8(temp)->num_entries;
-
-	if ((io_pg_start + I460_IOPAGES_PER_KPAGE * mem->page_count) > num_entries) {
-		printk(KERN_ERR PFX "Looks like we're out of AGP memory\n");
-		return -EINVAL;
-	}
-
-	j = io_pg_start;
-	while (j < (io_pg_start + I460_IOPAGES_PER_KPAGE * mem->page_count)) {
-		if (!PGE_EMPTY(agp_bridge, RD_GATT(j))) {
-			pr_debug("i460_insert_memory_small_io_page: GATT[%d]=0x%x is busy\n",
-				 j, RD_GATT(j));
-			return -EBUSY;
-		}
-		j++;
-	}
-
-	io_page_size = 1UL << I460_IO_PAGE_SHIFT;
-	for (i = 0, j = io_pg_start; i < mem->page_count; i++) {
-		paddr = page_to_phys(mem->pages[i]);
-		for (k = 0; k < I460_IOPAGES_PER_KPAGE; k++, j++, paddr += io_page_size)
-			WR_GATT(j, i460_mask_memory(agp_bridge, paddr, mem->type));
-	}
-	WR_FLUSH_GATT(j - 1);
-	return 0;
-}
-
-static int i460_remove_memory_small_io_page(struct agp_memory *mem,
-				off_t pg_start, int type)
-{
-	int i;
-
-	pr_debug("i460_remove_memory_small_io_page(mem=%p, pg_start=%ld, type=%d)\n",
-		 mem, pg_start, type);
-
-	pg_start = I460_IOPAGES_PER_KPAGE * pg_start;
-
-	for (i = pg_start; i < (pg_start + I460_IOPAGES_PER_KPAGE * mem->page_count); i++)
-		WR_GATT(i, 0);
-	WR_FLUSH_GATT(i - 1);
-	return 0;
-}
-
-#if I460_LARGE_IO_PAGES
-
-/*
- * These functions are called when the I/O (GART) page size exceeds PAGE_SIZE.
- *
- * This situation is interesting since AGP memory allocations that are smaller than a
- * single GART page are possible.  The i460.lp_desc array tracks partial allocation of the
- * large GART pages to work around this issue.
- *
- * i460.lp_desc[pg_num].refcount tracks the number of kernel pages in use within GART page
- * pg_num.  i460.lp_desc[pg_num].paddr is the physical address of the large page and
- * i460.lp_desc[pg_num].alloced_map is a bitmap of kernel pages that are in use (allocated).
- */
-
-static int i460_alloc_large_page (struct lp_desc *lp)
-{
-	unsigned long order = I460_IO_PAGE_SHIFT - PAGE_SHIFT;
-	size_t map_size;
-
-	lp->page = alloc_pages(GFP_KERNEL, order);
-	if (!lp->page) {
-		printk(KERN_ERR PFX "Couldn't alloc 4M GART page...\n");
-		return -ENOMEM;
-	}
-
-	map_size = ((I460_KPAGES_PER_IOPAGE + BITS_PER_LONG - 1) & -BITS_PER_LONG)/8;
-	lp->alloced_map = kzalloc(map_size, GFP_KERNEL);
-	if (!lp->alloced_map) {
-		__free_pages(lp->page, order);
-		printk(KERN_ERR PFX "Out of memory, we're in trouble...\n");
-		return -ENOMEM;
-	}
-
-	lp->paddr = page_to_phys(lp->page);
-	lp->refcount = 0;
-	atomic_add(I460_KPAGES_PER_IOPAGE, &agp_bridge->current_memory_agp);
-	return 0;
-}
-
-static void i460_free_large_page (struct lp_desc *lp)
-{
-	kfree(lp->alloced_map);
-	lp->alloced_map = NULL;
-
-	__free_pages(lp->page, I460_IO_PAGE_SHIFT - PAGE_SHIFT);
-	atomic_sub(I460_KPAGES_PER_IOPAGE, &agp_bridge->current_memory_agp);
-}
-
-static int i460_insert_memory_large_io_page (struct agp_memory *mem,
-				off_t pg_start, int type)
-{
-	int i, start_offset, end_offset, idx, pg, num_entries;
-	struct lp_desc *start, *end, *lp;
-	void *temp;
-
-	if (type >= AGP_USER_TYPES || mem->type >= AGP_USER_TYPES)
-		return -EINVAL;
-
-	temp = agp_bridge->current_size;
-	num_entries = A_SIZE_8(temp)->num_entries;
-
-	/* Figure out what pg_start means in terms of our large GART pages */
-	start = &i460.lp_desc[pg_start / I460_KPAGES_PER_IOPAGE];
-	end = &i460.lp_desc[(pg_start + mem->page_count - 1) / I460_KPAGES_PER_IOPAGE];
-	start_offset = pg_start % I460_KPAGES_PER_IOPAGE;
-	end_offset = (pg_start + mem->page_count - 1) % I460_KPAGES_PER_IOPAGE;
-
-	if (end > i460.lp_desc + num_entries) {
-		printk(KERN_ERR PFX "Looks like we're out of AGP memory\n");
-		return -EINVAL;
-	}
-
-	/* Check if the requested region of the aperture is free */
-	for (lp = start; lp <= end; ++lp) {
-		if (!lp->alloced_map)
-			continue;	/* OK, the entire large page is available... */
-
-		for (idx = ((lp == start) ? start_offset : 0);
-		     idx < ((lp == end) ? (end_offset + 1) : I460_KPAGES_PER_IOPAGE);
-		     idx++)
-		{
-			if (test_bit(idx, lp->alloced_map))
-				return -EBUSY;
-		}
-	}
-
-	for (lp = start, i = 0; lp <= end; ++lp) {
-		if (!lp->alloced_map) {
-			/* Allocate new GART pages... */
-			if (i460_alloc_large_page(lp) < 0)
-				return -ENOMEM;
-			pg = lp - i460.lp_desc;
-			WR_GATT(pg, i460_mask_memory(agp_bridge,
-						     lp->paddr, 0));
-			WR_FLUSH_GATT(pg);
-		}
-
-		for (idx = ((lp == start) ? start_offset : 0);
-		     idx < ((lp == end) ? (end_offset + 1) : I460_KPAGES_PER_IOPAGE);
-		     idx++, i++)
-		{
-			mem->pages[i] = lp->page;
-			__set_bit(idx, lp->alloced_map);
-			++lp->refcount;
-		}
-	}
-	return 0;
-}
-
-static int i460_remove_memory_large_io_page (struct agp_memory *mem,
-				off_t pg_start, int type)
-{
-	int i, pg, start_offset, end_offset, idx, num_entries;
-	struct lp_desc *start, *end, *lp;
-	void *temp;
-
-	temp = agp_bridge->current_size;
-	num_entries = A_SIZE_8(temp)->num_entries;
-
-	/* Figure out what pg_start means in terms of our large GART pages */
-	start = &i460.lp_desc[pg_start / I460_KPAGES_PER_IOPAGE];
-	end = &i460.lp_desc[(pg_start + mem->page_count - 1) / I460_KPAGES_PER_IOPAGE];
-	start_offset = pg_start % I460_KPAGES_PER_IOPAGE;
-	end_offset = (pg_start + mem->page_count - 1) % I460_KPAGES_PER_IOPAGE;
-
-	for (i = 0, lp = start; lp <= end; ++lp) {
-		for (idx = ((lp == start) ? start_offset : 0);
-		     idx < ((lp == end) ? (end_offset + 1) : I460_KPAGES_PER_IOPAGE);
-		     idx++, i++)
-		{
-			mem->pages[i] = NULL;
-			__clear_bit(idx, lp->alloced_map);
-			--lp->refcount;
-		}
-
-		/* Free GART pages if they are unused */
-		if (lp->refcount == 0) {
-			pg = lp - i460.lp_desc;
-			WR_GATT(pg, 0);
-			WR_FLUSH_GATT(pg);
-			i460_free_large_page(lp);
-		}
-	}
-	return 0;
-}
-
-/* Wrapper routines to call the approriate {small_io_page,large_io_page} function */
-
-static int i460_insert_memory (struct agp_memory *mem,
-				off_t pg_start, int type)
-{
-	if (I460_IO_PAGE_SHIFT <= PAGE_SHIFT)
-		return i460_insert_memory_small_io_page(mem, pg_start, type);
-	else
-		return i460_insert_memory_large_io_page(mem, pg_start, type);
-}
-
-static int i460_remove_memory (struct agp_memory *mem,
-				off_t pg_start, int type)
-{
-	if (I460_IO_PAGE_SHIFT <= PAGE_SHIFT)
-		return i460_remove_memory_small_io_page(mem, pg_start, type);
-	else
-		return i460_remove_memory_large_io_page(mem, pg_start, type);
-}
-
-/*
- * If the I/O (GART) page size is bigger than the kernel page size, we don't want to
- * allocate memory until we know where it is to be bound in the aperture (a
- * multi-kernel-page alloc might fit inside of an already allocated GART page).
- *
- * Let's just hope nobody counts on the allocated AGP memory being there before bind time
- * (I don't think current drivers do)...
- */
-static struct page *i460_alloc_page (struct agp_bridge_data *bridge)
-{
-	void *page;
-
-	if (I460_IO_PAGE_SHIFT <= PAGE_SHIFT) {
-		page = agp_generic_alloc_page(agp_bridge);
-	} else
-		/* Returning NULL would cause problems */
-		/* AK: really dubious code. */
-		page = (void *)~0UL;
-	return page;
-}
-
-static void i460_destroy_page (struct page *page, int flags)
-{
-	if (I460_IO_PAGE_SHIFT <= PAGE_SHIFT) {
-		agp_generic_destroy_page(page, flags);
-	}
-}
-
-#endif /* I460_LARGE_IO_PAGES */
-
-static unsigned long i460_mask_memory (struct agp_bridge_data *bridge,
-				       dma_addr_t addr, int type)
-{
-	/* Make sure the returned address is a valid GATT entry */
-	return bridge->driver->masks[0].mask
-		| (((addr & ~((1 << I460_IO_PAGE_SHIFT) - 1)) & 0xfffff000) >> 12);
-}
-
-const struct agp_bridge_driver intel_i460_driver = {
-	.owner			= THIS_MODULE,
-	.aperture_sizes		= i460_sizes,
-	.size_type		= U8_APER_SIZE,
-	.num_aperture_sizes	= 3,
-	.configure		= i460_configure,
-	.fetch_size		= i460_fetch_size,
-	.cleanup		= i460_cleanup,
-	.tlb_flush		= i460_tlb_flush,
-	.mask_memory		= i460_mask_memory,
-	.masks			= i460_masks,
-	.agp_enable		= agp_generic_enable,
-	.cache_flush		= global_cache_flush,
-	.create_gatt_table	= i460_create_gatt_table,
-	.free_gatt_table	= i460_free_gatt_table,
-#if I460_LARGE_IO_PAGES
-	.insert_memory		= i460_insert_memory,
-	.remove_memory		= i460_remove_memory,
-	.agp_alloc_page		= i460_alloc_page,
-	.agp_destroy_page	= i460_destroy_page,
-#else
-	.insert_memory		= i460_insert_memory_small_io_page,
-	.remove_memory		= i460_remove_memory_small_io_page,
-	.agp_alloc_page		= agp_generic_alloc_page,
-	.agp_alloc_pages	= agp_generic_alloc_pages,
-	.agp_destroy_page	= agp_generic_destroy_page,
-	.agp_destroy_pages	= agp_generic_destroy_pages,
-#endif
-	.alloc_by_type		= agp_generic_alloc_by_type,
-	.free_by_type		= agp_generic_free_by_type,
-	.agp_type_to_mask_type  = agp_generic_type_to_mask_type,
-	.cant_use_aperture	= true,
-};
-
-static int agp_intel_i460_probe(struct pci_dev *pdev,
-				const struct pci_device_id *ent)
-{
-	struct agp_bridge_data *bridge;
-	u8 cap_ptr;
-
-	cap_ptr = pci_find_capability(pdev, PCI_CAP_ID_AGP);
-	if (!cap_ptr)
-		return -ENODEV;
-
-	bridge = agp_alloc_bridge();
-	if (!bridge)
-		return -ENOMEM;
-
-	bridge->driver = &intel_i460_driver;
-	bridge->dev = pdev;
-	bridge->capndx = cap_ptr;
-
-	printk(KERN_INFO PFX "Detected Intel 460GX chipset\n");
-
-	pci_set_drvdata(pdev, bridge);
-	return agp_add_bridge(bridge);
-}
-
-static void agp_intel_i460_remove(struct pci_dev *pdev)
-{
-	struct agp_bridge_data *bridge = pci_get_drvdata(pdev);
-
-	agp_remove_bridge(bridge);
-	agp_put_bridge(bridge);
-}
-
-static struct pci_device_id agp_intel_i460_pci_table[] = {
-	{
-	.class		= (PCI_CLASS_BRIDGE_HOST << 8),
-	.class_mask	= ~0,
-	.vendor		= PCI_VENDOR_ID_INTEL,
-	.device		= PCI_DEVICE_ID_INTEL_84460GX,
-	.subvendor	= PCI_ANY_ID,
-	.subdevice	= PCI_ANY_ID,
-	},
-	{ }
-};
-
-MODULE_DEVICE_TABLE(pci, agp_intel_i460_pci_table);
-
-static struct pci_driver agp_intel_i460_pci_driver = {
-	.name		= "agpgart-intel-i460",
-	.id_table	= agp_intel_i460_pci_table,
-	.probe		= agp_intel_i460_probe,
-	.remove		= agp_intel_i460_remove,
-};
-
-static int __init agp_intel_i460_init(void)
-{
-	if (agp_off)
-		return -EINVAL;
-	return pci_register_driver(&agp_intel_i460_pci_driver);
-}
-
-static void __exit agp_intel_i460_cleanup(void)
-{
-	pci_unregister_driver(&agp_intel_i460_pci_driver);
-}
-
-module_init(agp_intel_i460_init);
-module_exit(agp_intel_i460_cleanup);
-
-MODULE_AUTHOR("Chris Ahna <Christopher.J.Ahna@intel.com>");
-MODULE_LICENSE("GPL and additional rights");
diff --git a/drivers/char/hpet.c b/drivers/char/hpet.c
index ee71376f174b..3b2159416e62 100644
--- a/drivers/char/hpet.c
+++ b/drivers/char/hpet.c
@@ -64,25 +64,6 @@
 static DEFINE_MUTEX(hpet_mutex); /* replaces BKL */
 static u32 hpet_nhpet, hpet_max_freq = HPET_USER_FREQ;
 
-/* This clocksource driver currently only works on ia64 */
-#ifdef CONFIG_IA64
-static void __iomem *hpet_mctr;
-
-static u64 read_hpet(struct clocksource *cs)
-{
-	return (u64)read_counter((void __iomem *)hpet_mctr);
-}
-
-static struct clocksource clocksource_hpet = {
-	.name		= "hpet",
-	.rating		= 250,
-	.read		= read_hpet,
-	.mask		= CLOCKSOURCE_MASK(64),
-	.flags		= CLOCK_SOURCE_IS_CONTINUOUS,
-};
-static struct clocksource *hpet_clocksource;
-#endif
-
 /* A lock for concurrent access by app and isr hpet activity. */
 static DEFINE_SPINLOCK(hpet_lock);
 
@@ -907,17 +888,6 @@ int hpet_alloc(struct hpet_data *hdp)
 
 	hpetp->hp_delta = hpet_calibrate(hpetp);
 
-/* This clocksource driver currently only works on ia64 */
-#ifdef CONFIG_IA64
-	if (!hpet_clocksource) {
-		hpet_mctr = (void __iomem *)&hpetp->hp_hpet->hpet_mc;
-		clocksource_hpet.archdata.fsys_mmio = hpet_mctr;
-		clocksource_register_hz(&clocksource_hpet, hpetp->hp_tick_freq);
-		hpetp->hp_clocksource = &clocksource_hpet;
-		hpet_clocksource = &clocksource_hpet;
-	}
-#endif
-
 	return 0;
 }
 
diff --git a/drivers/char/hw_random/Kconfig b/drivers/char/hw_random/Kconfig
index 8de74dcfa18c..442c40efb200 100644
--- a/drivers/char/hw_random/Kconfig
+++ b/drivers/char/hw_random/Kconfig
@@ -37,7 +37,7 @@ config HW_RANDOM_TIMERIOMEM
 
 config HW_RANDOM_INTEL
 	tristate "Intel HW Random Number Generator support"
-	depends on (X86 || IA64 || COMPILE_TEST) && PCI
+	depends on (X86 || COMPILE_TEST) && PCI
 	default HW_RANDOM
 	help
 	  This driver provides kernel-side support for the Random Number
diff --git a/drivers/char/mem.c b/drivers/char/mem.c
index 1052b0f2d4cf..8d27aa6b5b50 100644
--- a/drivers/char/mem.c
+++ b/drivers/char/mem.c
@@ -31,10 +31,6 @@
 #include <linux/uaccess.h>
 #include <linux/security.h>
 
-#ifdef CONFIG_IA64
-# include <linux/efi.h>
-#endif
-
 #define DEVMEM_MINOR	1
 #define DEVPORT_MINOR	4
 
@@ -277,13 +273,6 @@ int __weak phys_mem_access_prot_allowed(struct file *file,
 #ifdef pgprot_noncached
 static int uncached_access(struct file *file, phys_addr_t addr)
 {
-#if defined(CONFIG_IA64)
-	/*
-	 * On ia64, we ignore O_DSYNC because we cannot tolerate memory
-	 * attribute aliases.
-	 */
-	return !(efi_mem_attributes(addr) & EFI_MEMORY_WB);
-#else
 	/*
 	 * Accessing memory above the top the kernel knows about or through a
 	 * file pointer
@@ -292,7 +281,6 @@ static int uncached_access(struct file *file, phys_addr_t addr)
 	if (file->f_flags & O_DSYNC)
 		return 1;
 	return addr >= __pa(high_memory);
-#endif
 }
 #endif
 
diff --git a/drivers/char/mspec.c b/drivers/char/mspec.c
deleted file mode 100644
index b35f651837c8..000000000000
--- a/drivers/char/mspec.c
+++ /dev/null
@@ -1,295 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * Copyright (C) 2001-2006 Silicon Graphics, Inc.  All rights
- * reserved.
- */
-
-/*
- * SN Platform Special Memory (mspec) Support
- *
- * This driver exports the SN special memory (mspec) facility to user
- * processes.
- * There are two types of memory made available thru this driver:
- * uncached and cached.
- *
- * Uncached are used for memory write combining feature of the ia64
- * cpu.
- *
- * Cached are used for areas of memory that are used as cached addresses
- * on our partition and used as uncached addresses from other partitions.
- * Due to a design constraint of the SN2 Shub, you can not have processors
- * on the same FSB perform both a cached and uncached reference to the
- * same cache line.  These special memory cached regions prevent the
- * kernel from ever dropping in a TLB entry and therefore prevent the
- * processor from ever speculating a cache line from this page.
- */
-
-#include <linux/types.h>
-#include <linux/kernel.h>
-#include <linux/module.h>
-#include <linux/init.h>
-#include <linux/errno.h>
-#include <linux/miscdevice.h>
-#include <linux/spinlock.h>
-#include <linux/mm.h>
-#include <linux/fs.h>
-#include <linux/vmalloc.h>
-#include <linux/string.h>
-#include <linux/slab.h>
-#include <linux/numa.h>
-#include <linux/refcount.h>
-#include <asm/page.h>
-#include <linux/atomic.h>
-#include <asm/tlbflush.h>
-#include <asm/uncached.h>
-
-
-#define CACHED_ID	"Cached,"
-#define UNCACHED_ID	"Uncached"
-#define REVISION	"4.0"
-#define MSPEC_BASENAME	"mspec"
-
-/*
- * Page types allocated by the device.
- */
-enum mspec_page_type {
-	MSPEC_CACHED = 2,
-	MSPEC_UNCACHED
-};
-
-/*
- * One of these structures is allocated when an mspec region is mmaped. The
- * structure is pointed to by the vma->vm_private_data field in the vma struct.
- * This structure is used to record the addresses of the mspec pages.
- * This structure is shared by all vma's that are split off from the
- * original vma when split_vma()'s are done.
- *
- * The refcnt is incremented atomically because mm->mmap_lock does not
- * protect in fork case where multiple tasks share the vma_data.
- */
-struct vma_data {
-	refcount_t refcnt;	/* Number of vmas sharing the data. */
-	spinlock_t lock;	/* Serialize access to this structure. */
-	int count;		/* Number of pages allocated. */
-	enum mspec_page_type type; /* Type of pages allocated. */
-	unsigned long vm_start;	/* Original (unsplit) base. */
-	unsigned long vm_end;	/* Original (unsplit) end. */
-	unsigned long maddr[];	/* Array of MSPEC addresses. */
-};
-
-/*
- * mspec_open
- *
- * Called when a device mapping is created by a means other than mmap
- * (via fork, munmap, etc.).  Increments the reference count on the
- * underlying mspec data so it is not freed prematurely.
- */
-static void
-mspec_open(struct vm_area_struct *vma)
-{
-	struct vma_data *vdata;
-
-	vdata = vma->vm_private_data;
-	refcount_inc(&vdata->refcnt);
-}
-
-/*
- * mspec_close
- *
- * Called when unmapping a device mapping. Frees all mspec pages
- * belonging to all the vma's sharing this vma_data structure.
- */
-static void
-mspec_close(struct vm_area_struct *vma)
-{
-	struct vma_data *vdata;
-	int index, last_index;
-	unsigned long my_page;
-
-	vdata = vma->vm_private_data;
-
-	if (!refcount_dec_and_test(&vdata->refcnt))
-		return;
-
-	last_index = (vdata->vm_end - vdata->vm_start) >> PAGE_SHIFT;
-	for (index = 0; index < last_index; index++) {
-		if (vdata->maddr[index] == 0)
-			continue;
-		/*
-		 * Clear the page before sticking it back
-		 * into the pool.
-		 */
-		my_page = vdata->maddr[index];
-		vdata->maddr[index] = 0;
-		memset((char *)my_page, 0, PAGE_SIZE);
-		uncached_free_page(my_page, 1);
-	}
-
-	kvfree(vdata);
-}
-
-/*
- * mspec_fault
- *
- * Creates a mspec page and maps it to user space.
- */
-static vm_fault_t
-mspec_fault(struct vm_fault *vmf)
-{
-	unsigned long paddr, maddr;
-	unsigned long pfn;
-	pgoff_t index = vmf->pgoff;
-	struct vma_data *vdata = vmf->vma->vm_private_data;
-
-	maddr = (volatile unsigned long) vdata->maddr[index];
-	if (maddr == 0) {
-		maddr = uncached_alloc_page(numa_node_id(), 1);
-		if (maddr == 0)
-			return VM_FAULT_OOM;
-
-		spin_lock(&vdata->lock);
-		if (vdata->maddr[index] == 0) {
-			vdata->count++;
-			vdata->maddr[index] = maddr;
-		} else {
-			uncached_free_page(maddr, 1);
-			maddr = vdata->maddr[index];
-		}
-		spin_unlock(&vdata->lock);
-	}
-
-	paddr = maddr & ~__IA64_UNCACHED_OFFSET;
-	pfn = paddr >> PAGE_SHIFT;
-
-	return vmf_insert_pfn(vmf->vma, vmf->address, pfn);
-}
-
-static const struct vm_operations_struct mspec_vm_ops = {
-	.open = mspec_open,
-	.close = mspec_close,
-	.fault = mspec_fault,
-};
-
-/*
- * mspec_mmap
- *
- * Called when mmapping the device.  Initializes the vma with a fault handler
- * and private data structure necessary to allocate, track, and free the
- * underlying pages.
- */
-static int
-mspec_mmap(struct file *file, struct vm_area_struct *vma,
-					enum mspec_page_type type)
-{
-	struct vma_data *vdata;
-	int pages, vdata_size;
-
-	if (vma->vm_pgoff != 0)
-		return -EINVAL;
-
-	if ((vma->vm_flags & VM_SHARED) == 0)
-		return -EINVAL;
-
-	if ((vma->vm_flags & VM_WRITE) == 0)
-		return -EPERM;
-
-	pages = vma_pages(vma);
-	vdata_size = sizeof(struct vma_data) + pages * sizeof(long);
-	vdata = kvzalloc(vdata_size, GFP_KERNEL);
-	if (!vdata)
-		return -ENOMEM;
-
-	vdata->vm_start = vma->vm_start;
-	vdata->vm_end = vma->vm_end;
-	vdata->type = type;
-	spin_lock_init(&vdata->lock);
-	refcount_set(&vdata->refcnt, 1);
-	vma->vm_private_data = vdata;
-
-	vm_flags_set(vma, VM_IO | VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP);
-	if (vdata->type == MSPEC_UNCACHED)
-		vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
-	vma->vm_ops = &mspec_vm_ops;
-
-	return 0;
-}
-
-static int
-cached_mmap(struct file *file, struct vm_area_struct *vma)
-{
-	return mspec_mmap(file, vma, MSPEC_CACHED);
-}
-
-static int
-uncached_mmap(struct file *file, struct vm_area_struct *vma)
-{
-	return mspec_mmap(file, vma, MSPEC_UNCACHED);
-}
-
-static const struct file_operations cached_fops = {
-	.owner = THIS_MODULE,
-	.mmap = cached_mmap,
-	.llseek = noop_llseek,
-};
-
-static struct miscdevice cached_miscdev = {
-	.minor = MISC_DYNAMIC_MINOR,
-	.name = "mspec_cached",
-	.fops = &cached_fops
-};
-
-static const struct file_operations uncached_fops = {
-	.owner = THIS_MODULE,
-	.mmap = uncached_mmap,
-	.llseek = noop_llseek,
-};
-
-static struct miscdevice uncached_miscdev = {
-	.minor = MISC_DYNAMIC_MINOR,
-	.name = "mspec_uncached",
-	.fops = &uncached_fops
-};
-
-/*
- * mspec_init
- *
- * Called at boot time to initialize the mspec facility.
- */
-static int __init
-mspec_init(void)
-{
-	int ret;
-
-	ret = misc_register(&cached_miscdev);
-	if (ret) {
-		printk(KERN_ERR "%s: failed to register device %i\n",
-		       CACHED_ID, ret);
-		return ret;
-	}
-	ret = misc_register(&uncached_miscdev);
-	if (ret) {
-		printk(KERN_ERR "%s: failed to register device %i\n",
-		       UNCACHED_ID, ret);
-		misc_deregister(&cached_miscdev);
-		return ret;
-	}
-
-	printk(KERN_INFO "%s %s initialized devices: %s %s\n",
-	       MSPEC_BASENAME, REVISION, CACHED_ID, UNCACHED_ID);
-
-	return 0;
-}
-
-static void __exit
-mspec_exit(void)
-{
-	misc_deregister(&uncached_miscdev);
-	misc_deregister(&cached_miscdev);
-}
-
-module_init(mspec_init);
-module_exit(mspec_exit);
-
-MODULE_AUTHOR("Silicon Graphics, Inc. <linux-altix@sgi.com>");
-MODULE_DESCRIPTION("Driver for SGI SN special memory operations");
-MODULE_LICENSE("GPL");
diff --git a/drivers/cpufreq/Kconfig b/drivers/cpufreq/Kconfig
index f429b9b37b76..35efb53d5492 100644
--- a/drivers/cpufreq/Kconfig
+++ b/drivers/cpufreq/Kconfig
@@ -239,17 +239,6 @@ if PPC32 || PPC64
 source "drivers/cpufreq/Kconfig.powerpc"
 endif
 
-if IA64
-config IA64_ACPI_CPUFREQ
-	tristate "ACPI Processor P-States driver"
-	depends on ACPI_PROCESSOR
-	help
-	This driver adds a CPUFreq driver which utilizes the ACPI
-	Processor Performance States.
-
-	If in doubt, say N.
-endif
-
 if MIPS
 config BMIPS_CPUFREQ
 	tristate "BMIPS CPUfreq Driver"
diff --git a/drivers/cpufreq/Makefile b/drivers/cpufreq/Makefile
index ef8510774913..8d141c71b016 100644
--- a/drivers/cpufreq/Makefile
+++ b/drivers/cpufreq/Makefile
@@ -102,7 +102,6 @@ obj-$(CONFIG_POWERNV_CPUFREQ)		+= powernv-cpufreq.o
 ##################################################################################
 # Other platform drivers
 obj-$(CONFIG_BMIPS_CPUFREQ)		+= bmips-cpufreq.o
-obj-$(CONFIG_IA64_ACPI_CPUFREQ)		+= ia64-acpi-cpufreq.o
 obj-$(CONFIG_LOONGSON2_CPUFREQ)		+= loongson2_cpufreq.o
 obj-$(CONFIG_SH_CPU_FREQ)		+= sh-cpufreq.o
 obj-$(CONFIG_SPARC_US2E_CPUFREQ)	+= sparc-us2e-cpufreq.o
diff --git a/drivers/cpufreq/ia64-acpi-cpufreq.c b/drivers/cpufreq/ia64-acpi-cpufreq.c
deleted file mode 100644
index c6bdc455517f..000000000000
--- a/drivers/cpufreq/ia64-acpi-cpufreq.c
+++ /dev/null
@@ -1,353 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * This file provides the ACPI based P-state support. This
- * module works with generic cpufreq infrastructure. Most of
- * the code is based on i386 version
- * (arch/i386/kernel/cpu/cpufreq/acpi-cpufreq.c)
- *
- * Copyright (C) 2005 Intel Corp
- *      Venkatesh Pallipadi <venkatesh.pallipadi@intel.com>
- */
-
-#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
-
-#include <linux/kernel.h>
-#include <linux/slab.h>
-#include <linux/module.h>
-#include <linux/init.h>
-#include <linux/cpufreq.h>
-#include <linux/proc_fs.h>
-#include <asm/io.h>
-#include <linux/uaccess.h>
-#include <asm/pal.h>
-
-#include <linux/acpi.h>
-#include <acpi/processor.h>
-
-MODULE_AUTHOR("Venkatesh Pallipadi");
-MODULE_DESCRIPTION("ACPI Processor P-States Driver");
-MODULE_LICENSE("GPL");
-
-struct cpufreq_acpi_io {
-	struct acpi_processor_performance	acpi_data;
-	unsigned int				resume;
-};
-
-struct cpufreq_acpi_req {
-	unsigned int		cpu;
-	unsigned int		state;
-};
-
-static struct cpufreq_acpi_io	*acpi_io_data[NR_CPUS];
-
-static struct cpufreq_driver acpi_cpufreq_driver;
-
-
-static int
-processor_set_pstate (
-	u32	value)
-{
-	s64 retval;
-
-	pr_debug("processor_set_pstate\n");
-
-	retval = ia64_pal_set_pstate((u64)value);
-
-	if (retval) {
-		pr_debug("Failed to set freq to 0x%x, with error 0x%llx\n",
-		        value, retval);
-		return -ENODEV;
-	}
-	return (int)retval;
-}
-
-
-static int
-processor_get_pstate (
-	u32	*value)
-{
-	u64	pstate_index = 0;
-	s64 	retval;
-
-	pr_debug("processor_get_pstate\n");
-
-	retval = ia64_pal_get_pstate(&pstate_index,
-	                             PAL_GET_PSTATE_TYPE_INSTANT);
-	*value = (u32) pstate_index;
-
-	if (retval)
-		pr_debug("Failed to get current freq with "
-			"error 0x%llx, idx 0x%x\n", retval, *value);
-
-	return (int)retval;
-}
-
-
-/* To be used only after data->acpi_data is initialized */
-static unsigned
-extract_clock (
-	struct cpufreq_acpi_io *data,
-	unsigned value)
-{
-	unsigned long i;
-
-	pr_debug("extract_clock\n");
-
-	for (i = 0; i < data->acpi_data.state_count; i++) {
-		if (value == data->acpi_data.states[i].status)
-			return data->acpi_data.states[i].core_frequency;
-	}
-	return data->acpi_data.states[i-1].core_frequency;
-}
-
-
-static long
-processor_get_freq (
-	void *arg)
-{
-	struct cpufreq_acpi_req *req = arg;
-	unsigned int		cpu = req->cpu;
-	struct cpufreq_acpi_io	*data = acpi_io_data[cpu];
-	u32			value;
-	int			ret;
-
-	pr_debug("processor_get_freq\n");
-	if (smp_processor_id() != cpu)
-		return -EAGAIN;
-
-	/* processor_get_pstate gets the instantaneous frequency */
-	ret = processor_get_pstate(&value);
-	if (ret) {
-		pr_warn("get performance failed with error %d\n", ret);
-		return ret;
-	}
-	return 1000 * extract_clock(data, value);
-}
-
-
-static long
-processor_set_freq (
-	void *arg)
-{
-	struct cpufreq_acpi_req *req = arg;
-	unsigned int		cpu = req->cpu;
-	struct cpufreq_acpi_io	*data = acpi_io_data[cpu];
-	int			ret, state = req->state;
-	u32			value;
-
-	pr_debug("processor_set_freq\n");
-	if (smp_processor_id() != cpu)
-		return -EAGAIN;
-
-	if (state == data->acpi_data.state) {
-		if (unlikely(data->resume)) {
-			pr_debug("Called after resume, resetting to P%d\n", state);
-			data->resume = 0;
-		} else {
-			pr_debug("Already at target state (P%d)\n", state);
-			return 0;
-		}
-	}
-
-	pr_debug("Transitioning from P%d to P%d\n",
-		data->acpi_data.state, state);
-
-	/*
-	 * First we write the target state's 'control' value to the
-	 * control_register.
-	 */
-	value = (u32) data->acpi_data.states[state].control;
-
-	pr_debug("Transitioning to state: 0x%08x\n", value);
-
-	ret = processor_set_pstate(value);
-	if (ret) {
-		pr_warn("Transition failed with error %d\n", ret);
-		return -ENODEV;
-	}
-
-	data->acpi_data.state = state;
-	return 0;
-}
-
-
-static unsigned int
-acpi_cpufreq_get (
-	unsigned int		cpu)
-{
-	struct cpufreq_acpi_req req;
-	long ret;
-
-	req.cpu = cpu;
-	ret = work_on_cpu(cpu, processor_get_freq, &req);
-
-	return ret > 0 ? (unsigned int) ret : 0;
-}
-
-
-static int
-acpi_cpufreq_target (
-	struct cpufreq_policy   *policy,
-	unsigned int index)
-{
-	struct cpufreq_acpi_req req;
-
-	req.cpu = policy->cpu;
-	req.state = index;
-
-	return work_on_cpu(req.cpu, processor_set_freq, &req);
-}
-
-static int
-acpi_cpufreq_cpu_init (
-	struct cpufreq_policy   *policy)
-{
-	unsigned int		i;
-	unsigned int		cpu = policy->cpu;
-	struct cpufreq_acpi_io	*data;
-	unsigned int		result = 0;
-	struct cpufreq_frequency_table *freq_table;
-
-	pr_debug("acpi_cpufreq_cpu_init\n");
-
-	data = kzalloc(sizeof(*data), GFP_KERNEL);
-	if (!data)
-		return (-ENOMEM);
-
-	acpi_io_data[cpu] = data;
-
-	result = acpi_processor_register_performance(&data->acpi_data, cpu);
-
-	if (result)
-		goto err_free;
-
-	/* capability check */
-	if (data->acpi_data.state_count <= 1) {
-		pr_debug("No P-States\n");
-		result = -ENODEV;
-		goto err_unreg;
-	}
-
-	if ((data->acpi_data.control_register.space_id !=
-					ACPI_ADR_SPACE_FIXED_HARDWARE) ||
-	    (data->acpi_data.status_register.space_id !=
-					ACPI_ADR_SPACE_FIXED_HARDWARE)) {
-		pr_debug("Unsupported address space [%d, %d]\n",
-			(u32) (data->acpi_data.control_register.space_id),
-			(u32) (data->acpi_data.status_register.space_id));
-		result = -ENODEV;
-		goto err_unreg;
-	}
-
-	/* alloc freq_table */
-	freq_table = kcalloc(data->acpi_data.state_count + 1,
-	                           sizeof(*freq_table),
-	                           GFP_KERNEL);
-	if (!freq_table) {
-		result = -ENOMEM;
-		goto err_unreg;
-	}
-
-	/* detect transition latency */
-	policy->cpuinfo.transition_latency = 0;
-	for (i=0; i<data->acpi_data.state_count; i++) {
-		if ((data->acpi_data.states[i].transition_latency * 1000) >
-		    policy->cpuinfo.transition_latency) {
-			policy->cpuinfo.transition_latency =
-			    data->acpi_data.states[i].transition_latency * 1000;
-		}
-	}
-
-	/* table init */
-	for (i = 0; i <= data->acpi_data.state_count; i++)
-	{
-		if (i < data->acpi_data.state_count) {
-			freq_table[i].frequency =
-			      data->acpi_data.states[i].core_frequency * 1000;
-		} else {
-			freq_table[i].frequency = CPUFREQ_TABLE_END;
-		}
-	}
-
-	policy->freq_table = freq_table;
-
-	/* notify BIOS that we exist */
-	acpi_processor_notify_smm(THIS_MODULE);
-
-	pr_info("CPU%u - ACPI performance management activated\n", cpu);
-
-	for (i = 0; i < data->acpi_data.state_count; i++)
-		pr_debug("     %cP%d: %d MHz, %d mW, %d uS, %d uS, 0x%x 0x%x\n",
-			(i == data->acpi_data.state?'*':' '), i,
-			(u32) data->acpi_data.states[i].core_frequency,
-			(u32) data->acpi_data.states[i].power,
-			(u32) data->acpi_data.states[i].transition_latency,
-			(u32) data->acpi_data.states[i].bus_master_latency,
-			(u32) data->acpi_data.states[i].status,
-			(u32) data->acpi_data.states[i].control);
-
-	/* the first call to ->target() should result in us actually
-	 * writing something to the appropriate registers. */
-	data->resume = 1;
-
-	return (result);
-
- err_unreg:
-	acpi_processor_unregister_performance(cpu);
- err_free:
-	kfree(data);
-	acpi_io_data[cpu] = NULL;
-
-	return (result);
-}
-
-
-static int
-acpi_cpufreq_cpu_exit (
-	struct cpufreq_policy   *policy)
-{
-	struct cpufreq_acpi_io *data = acpi_io_data[policy->cpu];
-
-	pr_debug("acpi_cpufreq_cpu_exit\n");
-
-	if (data) {
-		acpi_io_data[policy->cpu] = NULL;
-		acpi_processor_unregister_performance(policy->cpu);
-		kfree(policy->freq_table);
-		kfree(data);
-	}
-
-	return (0);
-}
-
-
-static struct cpufreq_driver acpi_cpufreq_driver = {
-	.verify 	= cpufreq_generic_frequency_table_verify,
-	.target_index	= acpi_cpufreq_target,
-	.get 		= acpi_cpufreq_get,
-	.init		= acpi_cpufreq_cpu_init,
-	.exit		= acpi_cpufreq_cpu_exit,
-	.name		= "acpi-cpufreq",
-	.attr		= cpufreq_generic_attr,
-};
-
-
-static int __init
-acpi_cpufreq_init (void)
-{
-	pr_debug("acpi_cpufreq_init\n");
-
- 	return cpufreq_register_driver(&acpi_cpufreq_driver);
-}
-
-
-static void __exit
-acpi_cpufreq_exit (void)
-{
-	pr_debug("acpi_cpufreq_exit\n");
-
-	cpufreq_unregister_driver(&acpi_cpufreq_driver);
-}
-
-late_initcall(acpi_cpufreq_init);
-module_exit(acpi_cpufreq_exit);
diff --git a/drivers/firmware/Kconfig b/drivers/firmware/Kconfig
index b59e3041fd62..a79579fea6f0 100644
--- a/drivers/firmware/Kconfig
+++ b/drivers/firmware/Kconfig
@@ -77,30 +77,6 @@ config FIRMWARE_MEMMAP
 
       See also Documentation/ABI/testing/sysfs-firmware-memmap.
 
-config EFI_PCDP
-	bool "Console device selection via EFI PCDP or HCDP table"
-	depends on ACPI && EFI && IA64
-	default y if IA64
-	help
-	  If your firmware supplies the PCDP table, and you want to
-	  automatically use the primary console device it describes
-	  as the Linux console, say Y here.
-
-	  If your firmware supplies the HCDP table, and you want to
-	  use the first serial port it describes as the Linux console,
-	  say Y here.  If your EFI ConOut path contains only a UART
-	  device, it will become the console automatically.  Otherwise,
-	  you must specify the "console=hcdp" kernel boot argument.
-
-	  Neither the PCDP nor the HCDP affects naming of serial devices,
-	  so a serial console may be /dev/ttyS0, /dev/ttyS1, etc, depending
-	  on how the driver discovers devices.
-
-	  You must also enable the appropriate drivers (serial, VGA, etc.)
-
-	  See DIG64_HCDPv20_042804.pdf available from
-	  <http://www.dig64.org/specifications/> 
-
 config DMIID
     bool "Export DMI identification via sysfs to userspace"
     depends on DMI
diff --git a/drivers/firmware/Makefile b/drivers/firmware/Makefile
index 28fcddcd688f..1d1eb671d805 100644
--- a/drivers/firmware/Makefile
+++ b/drivers/firmware/Makefile
@@ -8,7 +8,6 @@ obj-$(CONFIG_ARM_SDE_INTERFACE)	+= arm_sdei.o
 obj-$(CONFIG_DMI)		+= dmi_scan.o
 obj-$(CONFIG_DMI_SYSFS)		+= dmi-sysfs.o
 obj-$(CONFIG_EDD)		+= edd.o
-obj-$(CONFIG_EFI_PCDP)		+= pcdp.o
 obj-$(CONFIG_DMIID)		+= dmi-id.o
 obj-$(CONFIG_INTEL_STRATIX10_SERVICE) += stratix10-svc.o
 obj-$(CONFIG_INTEL_STRATIX10_RSU)     += stratix10-rsu.o
diff --git a/drivers/firmware/efi/Kconfig b/drivers/firmware/efi/Kconfig
index 231f1c70d1db..cb374b2da9b7 100644
--- a/drivers/firmware/efi/Kconfig
+++ b/drivers/firmware/efi/Kconfig
@@ -4,7 +4,7 @@ menu "EFI (Extensible Firmware Interface) Support"
 
 config EFI_ESRT
 	bool
-	depends on EFI && !IA64
+	depends on EFI
 	default y
 
 config EFI_VARS_PSTORE
@@ -123,7 +123,7 @@ config EFI_BOOTLOADER_CONTROL
 
 config EFI_CAPSULE_LOADER
 	tristate "EFI capsule loader"
-	depends on EFI && !IA64
+	depends on EFI
 	help
 	  This option exposes a loader interface "/dev/efi_capsule_loader" for
 	  users to load EFI capsules. This driver requires working runtime
@@ -224,7 +224,7 @@ config EFI_DISABLE_PCI_DMA
 
 config EFI_EARLYCON
 	def_bool y
-	depends on SERIAL_EARLYCON && !ARM && !IA64
+	depends on SERIAL_EARLYCON && !ARM
 	select FONT_SUPPORT
 	select ARCH_USE_MEMREMAP_PROT
 
diff --git a/drivers/firmware/efi/efi.c b/drivers/firmware/efi/efi.c
index 1599f1176842..47ca652f02f6 100644
--- a/drivers/firmware/efi/efi.c
+++ b/drivers/firmware/efi/efi.c
@@ -147,7 +147,7 @@ static ssize_t systab_show(struct kobject *kobj,
 	if (efi.smbios != EFI_INVALID_TABLE_ADDR)
 		str += sprintf(str, "SMBIOS=0x%lx\n", efi.smbios);
 
-	if (IS_ENABLED(CONFIG_IA64) || IS_ENABLED(CONFIG_X86))
+	if (IS_ENABLED(CONFIG_X86))
 		str = efi_systab_show_arch(str);
 
 	return str - buf;
@@ -777,7 +777,6 @@ int __init efi_systab_check_header(const efi_table_hdr_t *systab_hdr)
 	return 0;
 }
 
-#ifndef CONFIG_IA64
 static const efi_char16_t *__init map_fw_vendor(unsigned long fw_vendor,
 						size_t size)
 {
@@ -793,10 +792,6 @@ static void __init unmap_fw_vendor(const void *fw_vendor, size_t size)
 {
 	early_memunmap((void *)fw_vendor, size);
 }
-#else
-#define map_fw_vendor(p, s)	__va(p)
-#define unmap_fw_vendor(v, s)
-#endif
 
 void __init efi_systab_report_header(const efi_table_hdr_t *systab_hdr,
 				     unsigned long fw_vendor)
@@ -899,11 +894,6 @@ char * __init efi_md_typeattr_format(char *buf, size_t size,
 	return buf;
 }
 
-/*
- * IA64 has a funky EFI memory map that doesn't work the same way as
- * other architectures.
- */
-#ifndef CONFIG_IA64
 /*
  * efi_mem_attributes - lookup memmap attributes for physical address
  * @phys_addr: the physical address to lookup
@@ -951,7 +941,6 @@ int efi_mem_type(unsigned long phys_addr)
 	}
 	return -EINVAL;
 }
-#endif
 
 int efi_status_to_err(efi_status_t status)
 {
diff --git a/drivers/firmware/pcdp.c b/drivers/firmware/pcdp.c
deleted file mode 100644
index 715a45442d1c..000000000000
--- a/drivers/firmware/pcdp.c
+++ /dev/null
@@ -1,135 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * Parse the EFI PCDP table to locate the console device.
- *
- * (c) Copyright 2002, 2003, 2004 Hewlett-Packard Development Company, L.P.
- *	Khalid Aziz <khalid.aziz@hp.com>
- *	Alex Williamson <alex.williamson@hp.com>
- *	Bjorn Helgaas <bjorn.helgaas@hp.com>
- */
-
-#include <linux/acpi.h>
-#include <linux/console.h>
-#include <linux/efi.h>
-#include <linux/serial.h>
-#include <linux/serial_core.h>
-#include <asm/vga.h>
-#include "pcdp.h"
-
-static int __init
-setup_serial_console(struct pcdp_uart *uart)
-{
-#ifdef CONFIG_SERIAL_8250_CONSOLE
-	int mmio;
-	static char options[64], *p = options;
-	char parity;
-
-	mmio = (uart->addr.space_id == ACPI_ADR_SPACE_SYSTEM_MEMORY);
-	p += sprintf(p, "uart8250,%s,0x%llx",
-		mmio ? "mmio" : "io", uart->addr.address);
-	if (uart->baud) {
-		p += sprintf(p, ",%llu", uart->baud);
-		if (uart->bits) {
-			switch (uart->parity) {
-			    case 0x2: parity = 'e'; break;
-			    case 0x3: parity = 'o'; break;
-			    default:  parity = 'n';
-			}
-			p += sprintf(p, "%c%d", parity, uart->bits);
-		}
-	}
-
-	add_preferred_console("uart", 8250, &options[9]);
-	return setup_earlycon(options);
-#else
-	return -ENODEV;
-#endif
-}
-
-static int __init
-setup_vga_console(struct pcdp_device *dev)
-{
-#if defined(CONFIG_VT) && defined(CONFIG_VGA_CONSOLE)
-	u8 *if_ptr;
-
-	if_ptr = ((u8 *)dev + sizeof(struct pcdp_device));
-	if (if_ptr[0] == PCDP_IF_PCI) {
-		struct pcdp_if_pci if_pci;
-
-		/* struct copy since ifptr might not be correctly aligned */
-
-		memcpy(&if_pci, if_ptr, sizeof(if_pci));
-
-		if (if_pci.trans & PCDP_PCI_TRANS_IOPORT)
-			vga_console_iobase = if_pci.ioport_tra;
-
-		if (if_pci.trans & PCDP_PCI_TRANS_MMIO)
-			vga_console_membase = if_pci.mmio_tra;
-	}
-
-	if (efi_mem_type(vga_console_membase + 0xA0000) == EFI_CONVENTIONAL_MEMORY) {
-		printk(KERN_ERR "PCDP: VGA selected, but frame buffer is not MMIO!\n");
-		return -ENODEV;
-	}
-
-	conswitchp = &vga_con;
-	printk(KERN_INFO "PCDP: VGA console\n");
-	return 0;
-#else
-	return -ENODEV;
-#endif
-}
-
-extern unsigned long hcdp_phys;
-
-int __init
-efi_setup_pcdp_console(char *cmdline)
-{
-	struct pcdp *pcdp;
-	struct pcdp_uart *uart;
-	struct pcdp_device *dev, *end;
-	int i, serial = 0;
-	int rc = -ENODEV;
-
-	if (hcdp_phys == EFI_INVALID_TABLE_ADDR)
-		return -ENODEV;
-
-	pcdp = early_memremap(hcdp_phys, 4096);
-	printk(KERN_INFO "PCDP: v%d at 0x%lx\n", pcdp->rev, hcdp_phys);
-
-	if (strstr(cmdline, "console=hcdp")) {
-		if (pcdp->rev < 3)
-			serial = 1;
-	} else if (strstr(cmdline, "console=")) {
-		printk(KERN_INFO "Explicit \"console=\"; ignoring PCDP\n");
-		goto out;
-	}
-
-	if (pcdp->rev < 3 && efi_uart_console_only())
-		serial = 1;
-
-	for (i = 0, uart = pcdp->uart; i < pcdp->num_uarts; i++, uart++) {
-		if (uart->flags & PCDP_UART_PRIMARY_CONSOLE || serial) {
-			if (uart->type == PCDP_CONSOLE_UART) {
-				rc = setup_serial_console(uart);
-				goto out;
-			}
-		}
-	}
-
-	end = (struct pcdp_device *) ((u8 *) pcdp + pcdp->length);
-	for (dev = (struct pcdp_device *) (pcdp->uart + pcdp->num_uarts);
-	     dev < end;
-	     dev = (struct pcdp_device *) ((u8 *) dev + dev->length)) {
-		if (dev->flags & PCDP_PRIMARY_CONSOLE) {
-			if (dev->type == PCDP_CONSOLE_VGA) {
-				rc = setup_vga_console(dev);
-				goto out;
-			}
-		}
-	}
-
-out:
-	early_memunmap(pcdp, 4096);
-	return rc;
-}
diff --git a/drivers/firmware/pcdp.h b/drivers/firmware/pcdp.h
deleted file mode 100644
index e02540571c52..000000000000
--- a/drivers/firmware/pcdp.h
+++ /dev/null
@@ -1,108 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-only */
-/*
- * Definitions for PCDP-defined console devices
- *
- * For DIG64_HCDPv10a_01.pdf and DIG64_PCDPv20.pdf (v1.0a and v2.0 resp.),
- * please see <http://www.dig64.org/specifications/>
- *
- * (c) Copyright 2002, 2004 Hewlett-Packard Development Company, L.P.
- *	Khalid Aziz <khalid.aziz@hp.com>
- *	Bjorn Helgaas <bjorn.helgaas@hp.com>
- */
-
-#define PCDP_CONSOLE			0
-#define PCDP_DEBUG			1
-#define PCDP_CONSOLE_OUTPUT		2
-#define PCDP_CONSOLE_INPUT		3
-
-#define PCDP_UART			(0 << 3)
-#define PCDP_VGA			(1 << 3)
-#define PCDP_USB			(2 << 3)
-
-/* pcdp_uart.type and pcdp_device.type */
-#define PCDP_CONSOLE_UART		(PCDP_UART | PCDP_CONSOLE)
-#define PCDP_DEBUG_UART			(PCDP_UART | PCDP_DEBUG)
-#define PCDP_CONSOLE_VGA		(PCDP_VGA  | PCDP_CONSOLE_OUTPUT)
-#define PCDP_CONSOLE_USB		(PCDP_USB  | PCDP_CONSOLE_INPUT)
-
-/* pcdp_uart.flags */
-#define PCDP_UART_EDGE_SENSITIVE	(1 << 0)
-#define PCDP_UART_ACTIVE_LOW		(1 << 1)
-#define PCDP_UART_PRIMARY_CONSOLE	(1 << 2)
-#define PCDP_UART_IRQ			(1 << 6) /* in pci_func for rev < 3 */
-#define PCDP_UART_PCI			(1 << 7) /* in pci_func for rev < 3 */
-
-struct pcdp_uart {
-	u8				type;
-	u8				bits;
-	u8				parity;
-	u8				stop_bits;
-	u8				pci_seg;
-	u8				pci_bus;
-	u8				pci_dev;
-	u8				pci_func;
-	u64				baud;
-	struct acpi_generic_address	addr;
-	u16				pci_dev_id;
-	u16				pci_vendor_id;
-	u32				gsi;
-	u32				clock_rate;
-	u8				pci_prog_intfc;
-	u8				flags;
-	u16				conout_index;
-	u32				reserved;
-} __attribute__((packed));
-
-#define PCDP_IF_PCI	1
-
-/* pcdp_if_pci.trans */
-#define PCDP_PCI_TRANS_IOPORT	0x02
-#define PCDP_PCI_TRANS_MMIO	0x01
-
-struct pcdp_if_pci {
-	u8			interconnect;
-	u8			reserved;
-	u16			length;
-	u8			segment;
-	u8			bus;
-	u8			dev;
-	u8			fun;
-	u16			dev_id;
-	u16			vendor_id;
-	u32			acpi_interrupt;
-	u64			mmio_tra;
-	u64			ioport_tra;
-	u8			flags;
-	u8			trans;
-} __attribute__((packed));
-
-struct pcdp_vga {
-	u8			count;		/* address space descriptors */
-} __attribute__((packed));
-
-/* pcdp_device.flags */
-#define PCDP_PRIMARY_CONSOLE	1
-
-struct pcdp_device {
-	u8			type;
-	u8			flags;
-	u16			length;
-	u16			efi_index;
-	/* next data is pcdp_if_pci or pcdp_if_acpi (not yet supported) */
-	/* next data is device specific type (currently only pcdp_vga) */
-} __attribute__((packed));
-
-struct pcdp {
-	u8			signature[4];
-	u32			length;
-	u8			rev;		/* PCDP v2.0 is rev 3 */
-	u8			chksum;
-	u8			oemid[6];
-	u8			oem_tabid[8];
-	u32			oem_rev;
-	u8			creator_id[4];
-	u32			creator_rev;
-	u32			num_uarts;
-	struct pcdp_uart	uart[];	/* actual size is num_uarts */
-	/* remainder of table is pcdp_device structures */
-} __attribute__((packed));
diff --git a/drivers/gpu/drm/drm_ioc32.c b/drivers/gpu/drm/drm_ioc32.c
index 49a743f62b4a..025dc558c94e 100644
--- a/drivers/gpu/drm/drm_ioc32.c
+++ b/drivers/gpu/drm/drm_ioc32.c
@@ -945,11 +945,11 @@ static struct {
 	DRM_IOCTL32_DEF(DRM_IOCTL_SG_ALLOC, compat_drm_sg_alloc),
 	DRM_IOCTL32_DEF(DRM_IOCTL_SG_FREE, compat_drm_sg_free),
 #endif
-#if defined(CONFIG_X86) || defined(CONFIG_IA64)
+#if defined(CONFIG_X86)
 	DRM_IOCTL32_DEF(DRM_IOCTL_UPDATE_DRAW, compat_drm_update_draw),
 #endif
 	DRM_IOCTL32_DEF(DRM_IOCTL_WAIT_VBLANK, compat_drm_wait_vblank),
-#if defined(CONFIG_X86) || defined(CONFIG_IA64)
+#if defined(CONFIG_X86)
 	DRM_IOCTL32_DEF(DRM_IOCTL_MODE_ADDFB2, compat_drm_mode_addfb2),
 #endif
 };
diff --git a/drivers/input/serio/i8042.h b/drivers/input/serio/i8042.h
index adb5173372d3..5f61672d55b7 100644
--- a/drivers/input/serio/i8042.h
+++ b/drivers/input/serio/i8042.h
@@ -19,7 +19,7 @@
 #include "i8042-snirm.h"
 #elif defined(CONFIG_SPARC)
 #include "i8042-sparcio.h"
-#elif defined(CONFIG_X86) || defined(CONFIG_IA64) || defined(CONFIG_LOONGARCH)
+#elif defined(CONFIG_X86) || defined(CONFIG_LOONGARCH)
 #include "i8042-acpipnpio.h"
 #else
 #include "i8042-io.h"
diff --git a/drivers/iommu/Kconfig b/drivers/iommu/Kconfig
index 2b12b583ef4b..7f04491ca5f0 100644
--- a/drivers/iommu/Kconfig
+++ b/drivers/iommu/Kconfig
@@ -91,7 +91,7 @@ config IOMMU_DEBUGFS
 choice
 	prompt "IOMMU default domain type"
 	depends on IOMMU_API
-	default IOMMU_DEFAULT_DMA_LAZY if X86 || IA64
+	default IOMMU_DEFAULT_DMA_LAZY if X86
 	default IOMMU_DEFAULT_DMA_STRICT
 	help
 	  Choose the type of IOMMU domain used to manage DMA API usage by
@@ -146,7 +146,7 @@ config OF_IOMMU
 
 # IOMMU-agnostic DMA-mapping layer
 config IOMMU_DMA
-	def_bool ARM64 || IA64 || X86
+	def_bool ARM64 || X86
 	select DMA_OPS
 	select IOMMU_API
 	select IOMMU_IOVA
diff --git a/drivers/iommu/intel/Kconfig b/drivers/iommu/intel/Kconfig
index 2e56bd79f589..119d2c57a48e 100644
--- a/drivers/iommu/intel/Kconfig
+++ b/drivers/iommu/intel/Kconfig
@@ -11,7 +11,7 @@ config DMAR_DEBUG
 
 config INTEL_IOMMU
 	bool "Support for Intel IOMMU using DMA Remapping Devices"
-	depends on PCI_MSI && ACPI && (X86 || IA64)
+	depends on PCI_MSI && ACPI && X86
 	select DMA_OPS
 	select IOMMU_API
 	select IOMMU_IOVA
diff --git a/drivers/media/cec/platform/Kconfig b/drivers/media/cec/platform/Kconfig
index b672d3142eb7..ede81fe331b0 100644
--- a/drivers/media/cec/platform/Kconfig
+++ b/drivers/media/cec/platform/Kconfig
@@ -99,7 +99,7 @@ config CEC_TEGRA
 
 config CEC_SECO
 	tristate "SECO Boards HDMI CEC driver"
-	depends on (X86 || IA64) || COMPILE_TEST
+	depends on X86 || COMPILE_TEST
 	depends on PCI && DMI
 	select CEC_CORE
 	select CEC_NOTIFIER
diff --git a/drivers/misc/Kconfig b/drivers/misc/Kconfig
index cadd4a820c03..f37c4b8380ae 100644
--- a/drivers/misc/Kconfig
+++ b/drivers/misc/Kconfig
@@ -166,7 +166,7 @@ config ENCLOSURE_SERVICES
 config SGI_XP
 	tristate "Support communication between SGI SSIs"
 	depends on NET
-	depends on (IA64_SGI_UV || X86_UV) && SMP
+	depends on X86_UV && SMP
 	depends on X86_64 || BROKEN
 	select SGI_GRU if X86_64 && SMP
 	help
diff --git a/drivers/misc/sgi-gru/gru.h b/drivers/misc/sgi-gru/gru.h
index 3ad76cd18b4b..6ae045037219 100644
--- a/drivers/misc/sgi-gru/gru.h
+++ b/drivers/misc/sgi-gru/gru.h
@@ -30,9 +30,7 @@
 /*
  * Size used to map GRU GSeg
  */
-#if defined(CONFIG_IA64)
-#define GRU_GSEG_PAGESIZE	(256 * 1024UL)
-#elif defined(CONFIG_X86_64)
+#if defined(CONFIG_X86_64)
 #define GRU_GSEG_PAGESIZE	(256 * 1024UL)		/* ZZZ 2MB ??? */
 #else
 #error "Unsupported architecture"
diff --git a/drivers/misc/sgi-gru/gru_instructions.h b/drivers/misc/sgi-gru/gru_instructions.h
index 04d5170ac149..da5eb9edf9ec 100644
--- a/drivers/misc/sgi-gru/gru_instructions.h
+++ b/drivers/misc/sgi-gru/gru_instructions.h
@@ -29,17 +29,7 @@ extern void gru_wait_abort_proc(void *cb);
  * Architecture dependent functions
  */
 
-#if defined(CONFIG_IA64)
-#include <linux/compiler.h>
-#include <asm/intrinsics.h>
-#define __flush_cache(p)		ia64_fc((unsigned long)p)
-/* Use volatile on IA64 to ensure ordering via st4.rel */
-#define gru_ordered_store_ulong(p, v)					\
-		do {							\
-			barrier();					\
-			*((volatile unsigned long *)(p)) = v; /* force st.rel */	\
-		} while (0)
-#elif defined(CONFIG_X86_64)
+#if defined(CONFIG_X86_64)
 #include <asm/cacheflush.h>
 #define __flush_cache(p)		clflush(p)
 #define gru_ordered_store_ulong(p, v)					\
diff --git a/drivers/misc/sgi-gru/grufile.c b/drivers/misc/sgi-gru/grufile.c
index a3d659c11cc4..e755690c9805 100644
--- a/drivers/misc/sgi-gru/grufile.c
+++ b/drivers/misc/sgi-gru/grufile.c
@@ -337,72 +337,6 @@ static unsigned long gru_chiplet_cpu_to_mmr(int chiplet, int cpu, int *corep)
 	return mmr;
 }
 
-#ifdef CONFIG_IA64
-
-static int gru_irq_count[GRU_CHIPLETS_PER_BLADE];
-
-static void gru_noop(struct irq_data *d)
-{
-}
-
-static struct irq_chip gru_chip[GRU_CHIPLETS_PER_BLADE] = {
-	[0 ... GRU_CHIPLETS_PER_BLADE - 1] {
-		.irq_mask	= gru_noop,
-		.irq_unmask	= gru_noop,
-		.irq_ack	= gru_noop
-	}
-};
-
-static int gru_chiplet_setup_tlb_irq(int chiplet, char *irq_name,
-			irq_handler_t irq_handler, int cpu, int blade)
-{
-	unsigned long mmr;
-	int irq = IRQ_GRU + chiplet;
-	int ret, core;
-
-	mmr = gru_chiplet_cpu_to_mmr(chiplet, cpu, &core);
-	if (mmr == 0)
-		return 0;
-
-	if (gru_irq_count[chiplet] == 0) {
-		gru_chip[chiplet].name = irq_name;
-		ret = irq_set_chip(irq, &gru_chip[chiplet]);
-		if (ret) {
-			printk(KERN_ERR "%s: set_irq_chip failed, errno=%d\n",
-			       GRU_DRIVER_ID_STR, -ret);
-			return ret;
-		}
-
-		ret = request_irq(irq, irq_handler, 0, irq_name, NULL);
-		if (ret) {
-			printk(KERN_ERR "%s: request_irq failed, errno=%d\n",
-			       GRU_DRIVER_ID_STR, -ret);
-			return ret;
-		}
-	}
-	gru_irq_count[chiplet]++;
-
-	return 0;
-}
-
-static void gru_chiplet_teardown_tlb_irq(int chiplet, int cpu, int blade)
-{
-	unsigned long mmr;
-	int core, irq = IRQ_GRU + chiplet;
-
-	if (gru_irq_count[chiplet] == 0)
-		return;
-
-	mmr = gru_chiplet_cpu_to_mmr(chiplet, cpu, &core);
-	if (mmr == 0)
-		return;
-
-	if (--gru_irq_count[chiplet] == 0)
-		free_irq(irq, NULL);
-}
-
-#elif defined CONFIG_X86_64
-
 static int gru_chiplet_setup_tlb_irq(int chiplet, char *irq_name,
 			irq_handler_t irq_handler, int cpu, int blade)
 {
@@ -447,8 +381,6 @@ static void gru_chiplet_teardown_tlb_irq(int chiplet, int cpu, int blade)
 	}
 }
 
-#endif
-
 static void gru_teardown_tlb_irqs(void)
 {
 	int blade;
@@ -514,12 +446,8 @@ static int __init gru_init(void)
 	if (!gru_supported())
 		return 0;
 
-#if defined CONFIG_IA64
-	gru_start_paddr = 0xd000000000UL; /* ZZZZZZZZZZZZZZZZZZZ fixme */
-#else
 	gru_start_paddr = uv_read_local_mmr(UVH_RH_GAM_GRU_OVERLAY_CONFIG) &
 				0x7fffffffffffUL;
-#endif
 	gru_start_vaddr = __va(gru_start_paddr);
 	gru_end_paddr = gru_start_paddr + GRU_MAX_BLADES * GRU_SIZE;
 	printk(KERN_INFO "GRU space: 0x%lx - 0x%lx\n",
diff --git a/drivers/misc/sgi-gru/gruhandles.c b/drivers/misc/sgi-gru/gruhandles.c
index 1d75d5e540bc..695316a83b01 100644
--- a/drivers/misc/sgi-gru/gruhandles.c
+++ b/drivers/misc/sgi-gru/gruhandles.c
@@ -11,16 +11,10 @@
 #include "grutables.h"
 
 /* 10 sec */
-#ifdef CONFIG_IA64
-#include <asm/processor.h>
-#define GRU_OPERATION_TIMEOUT	(((cycles_t) local_cpu_data->itc_freq)*10)
-#define CLKS2NSEC(c)		((c) *1000000000 / local_cpu_data->itc_freq)
-#else
 #include <linux/sync_core.h>
 #include <asm/tsc.h>
 #define GRU_OPERATION_TIMEOUT	((cycles_t) tsc_khz*10*1000)
 #define CLKS2NSEC(c)		((c) * 1000000 / tsc_khz)
-#endif
 
 /* Extract the status field from a kernel handle */
 #define GET_MSEG_HANDLE_STATUS(h)	(((*(unsigned long *)(h)) >> 16) & 3)
diff --git a/drivers/misc/sgi-gru/grumain.c b/drivers/misc/sgi-gru/grumain.c
index 4eb4b9455139..0f5b09e290c8 100644
--- a/drivers/misc/sgi-gru/grumain.c
+++ b/drivers/misc/sgi-gru/grumain.c
@@ -41,16 +41,12 @@ struct device *grudev = &gru_device;
  */
 int gru_cpu_fault_map_id(void)
 {
-#ifdef CONFIG_IA64
-	return uv_blade_processor_id() % GRU_NUM_TFM;
-#else
 	int cpu = smp_processor_id();
 	int id, core;
 
 	core = uv_cpu_core_number(cpu);
 	id = core + UV_MAX_INT_CORES * uv_cpu_socket_number(cpu);
 	return id;
-#endif
 }
 
 /*--------- ASID Management -------------------------------------------
diff --git a/drivers/misc/sgi-xp/xp.h b/drivers/misc/sgi-xp/xp.h
index f1336f43d3bd..3185711beb07 100644
--- a/drivers/misc/sgi-xp/xp.h
+++ b/drivers/misc/sgi-xp/xp.h
@@ -16,7 +16,7 @@
 
 #include <linux/mutex.h>
 
-#if defined CONFIG_X86_UV || defined CONFIG_IA64_SGI_UV
+#if defined CONFIG_X86_UV
 #include <asm/uv/uv.h>
 #endif
 
diff --git a/drivers/misc/sgi-xp/xp_uv.c b/drivers/misc/sgi-xp/xp_uv.c
index 19fc7076af27..3faa7eadf679 100644
--- a/drivers/misc/sgi-xp/xp_uv.c
+++ b/drivers/misc/sgi-xp/xp_uv.c
@@ -18,8 +18,6 @@
 #include <asm/uv/uv_hub.h>
 #if defined CONFIG_X86_64
 #include <asm/uv/bios.h>
-#elif defined CONFIG_IA64_SGI_UV
-#include <asm/sn/sn_sal.h>
 #endif
 #include "../sgi-gru/grukservices.h"
 #include "xp.h"
@@ -99,17 +97,6 @@ xp_expand_memprotect_uv(unsigned long phys_addr, unsigned long size)
 			"UV_MEMPROT_ALLOW_RW) failed, ret=%d\n", ret);
 		return xpBiosError;
 	}
-
-#elif defined CONFIG_IA64_SGI_UV
-	u64 nasid_array;
-
-	ret = sn_change_memprotect(phys_addr, size, SN_MEMPROT_ACCESS_CLASS_1,
-				   &nasid_array);
-	if (ret != 0) {
-		dev_err(xp, "sn_change_memprotect(,, "
-			"SN_MEMPROT_ACCESS_CLASS_1,) failed ret=%d\n", ret);
-		return xpSalError;
-	}
 #else
 	#error not a supported configuration
 #endif
@@ -129,17 +116,6 @@ xp_restrict_memprotect_uv(unsigned long phys_addr, unsigned long size)
 			"UV_MEMPROT_RESTRICT_ACCESS) failed, ret=%d\n", ret);
 		return xpBiosError;
 	}
-
-#elif defined CONFIG_IA64_SGI_UV
-	u64 nasid_array;
-
-	ret = sn_change_memprotect(phys_addr, size, SN_MEMPROT_ACCESS_CLASS_0,
-				   &nasid_array);
-	if (ret != 0) {
-		dev_err(xp, "sn_change_memprotect(,, "
-			"SN_MEMPROT_ACCESS_CLASS_0,) failed ret=%d\n", ret);
-		return xpSalError;
-	}
 #else
 	#error not a supported configuration
 #endif
diff --git a/drivers/misc/sgi-xp/xpc_main.c b/drivers/misc/sgi-xp/xpc_main.c
index 6da509d692bb..cc71395782b6 100644
--- a/drivers/misc/sgi-xp/xpc_main.c
+++ b/drivers/misc/sgi-xp/xpc_main.c
@@ -1155,36 +1155,6 @@ xpc_die_deactivate(void)
 static int
 xpc_system_die(struct notifier_block *nb, unsigned long event, void *_die_args)
 {
-#ifdef CONFIG_IA64		/* !!! temporary kludge */
-	switch (event) {
-	case DIE_MACHINE_RESTART:
-	case DIE_MACHINE_HALT:
-		xpc_die_deactivate();
-		break;
-
-	case DIE_KDEBUG_ENTER:
-		/* Should lack of heartbeat be ignored by other partitions? */
-		if (!xpc_kdebug_ignore)
-			break;
-
-		fallthrough;
-	case DIE_MCA_MONARCH_ENTER:
-	case DIE_INIT_MONARCH_ENTER:
-		xpc_arch_ops.offline_heartbeat();
-		break;
-
-	case DIE_KDEBUG_LEAVE:
-		/* Is lack of heartbeat being ignored by other partitions? */
-		if (!xpc_kdebug_ignore)
-			break;
-
-		fallthrough;
-	case DIE_MCA_MONARCH_LEAVE:
-	case DIE_INIT_MONARCH_LEAVE:
-		xpc_arch_ops.online_heartbeat();
-		break;
-	}
-#else
 	struct die_args *die_args = _die_args;
 
 	switch (event) {
@@ -1206,7 +1176,6 @@ xpc_system_die(struct notifier_block *nb, unsigned long event, void *_die_args)
 	default:
 		xpc_die_deactivate();
 	}
-#endif
 
 	return NOTIFY_DONE;
 }
diff --git a/drivers/misc/sgi-xp/xpc_uv.c b/drivers/misc/sgi-xp/xpc_uv.c
index fff522d347e3..2f03a7080d96 100644
--- a/drivers/misc/sgi-xp/xpc_uv.c
+++ b/drivers/misc/sgi-xp/xpc_uv.c
@@ -24,34 +24,12 @@
 #include <linux/slab.h>
 #include <linux/numa.h>
 #include <asm/uv/uv_hub.h>
-#if defined CONFIG_X86_64
 #include <asm/uv/bios.h>
 #include <asm/uv/uv_irq.h>
-#elif defined CONFIG_IA64_SGI_UV
-#include <asm/sn/intr.h>
-#include <asm/sn/sn_sal.h>
-#endif
 #include "../sgi-gru/gru.h"
 #include "../sgi-gru/grukservices.h"
 #include "xpc.h"
 
-#if defined CONFIG_IA64_SGI_UV
-struct uv_IO_APIC_route_entry {
-	__u64	vector		:  8,
-		delivery_mode	:  3,
-		dest_mode	:  1,
-		delivery_status	:  1,
-		polarity	:  1,
-		__reserved_1	:  1,
-		trigger		:  1,
-		mask		:  1,
-		__reserved_2	: 15,
-		dest		: 32;
-};
-
-#define sn_partition_id 0
-#endif
-
 static struct xpc_heartbeat_uv *xpc_heartbeat_uv;
 
 #define XPC_ACTIVATE_MSG_SIZE_UV	(1 * GRU_CACHE_LINE_BYTES)
@@ -113,7 +91,6 @@ xpc_get_gru_mq_irq_uv(struct xpc_gru_mq_uv *mq, int cpu, char *irq_name)
 {
 	int mmr_pnode = uv_blade_to_pnode(mq->mmr_blade);
 
-#if defined CONFIG_X86_64
 	mq->irq = uv_setup_irq(irq_name, cpu, mq->mmr_blade, mq->mmr_offset,
 			UV_AFFINITY_CPU);
 	if (mq->irq < 0)
@@ -121,40 +98,13 @@ xpc_get_gru_mq_irq_uv(struct xpc_gru_mq_uv *mq, int cpu, char *irq_name)
 
 	mq->mmr_value = uv_read_global_mmr64(mmr_pnode, mq->mmr_offset);
 
-#elif defined CONFIG_IA64_SGI_UV
-	if (strcmp(irq_name, XPC_ACTIVATE_IRQ_NAME) == 0)
-		mq->irq = SGI_XPC_ACTIVATE;
-	else if (strcmp(irq_name, XPC_NOTIFY_IRQ_NAME) == 0)
-		mq->irq = SGI_XPC_NOTIFY;
-	else
-		return -EINVAL;
-
-	mq->mmr_value = (unsigned long)cpu_physical_id(cpu) << 32 | mq->irq;
-	uv_write_global_mmr64(mmr_pnode, mq->mmr_offset, mq->mmr_value);
-#else
-	#error not a supported configuration
-#endif
-
 	return 0;
 }
 
 static void
 xpc_release_gru_mq_irq_uv(struct xpc_gru_mq_uv *mq)
 {
-#if defined CONFIG_X86_64
 	uv_teardown_irq(mq->irq);
-
-#elif defined CONFIG_IA64_SGI_UV
-	int mmr_pnode;
-	unsigned long mmr_value;
-
-	mmr_pnode = uv_blade_to_pnode(mq->mmr_blade);
-	mmr_value = 1UL << 16;
-
-	uv_write_global_mmr64(mmr_pnode, mq->mmr_offset, mmr_value);
-#else
-	#error not a supported configuration
-#endif
 }
 
 static int
@@ -162,17 +112,6 @@ xpc_gru_mq_watchlist_alloc_uv(struct xpc_gru_mq_uv *mq)
 {
 	int ret;
 
-#if defined CONFIG_IA64_SGI_UV
-	int mmr_pnode = uv_blade_to_pnode(mq->mmr_blade);
-
-	ret = sn_mq_watchlist_alloc(mmr_pnode, (void *)uv_gpa(mq->address),
-				    mq->order, &mq->mmr_offset);
-	if (ret < 0) {
-		dev_err(xpc_part, "sn_mq_watchlist_alloc() failed, ret=%d\n",
-			ret);
-		return -EBUSY;
-	}
-#elif defined CONFIG_X86_64
 	ret = uv_bios_mq_watchlist_alloc(uv_gpa(mq->address),
 					 mq->order, &mq->mmr_offset);
 	if (ret < 0) {
@@ -180,9 +119,6 @@ xpc_gru_mq_watchlist_alloc_uv(struct xpc_gru_mq_uv *mq)
 			"ret=%d\n", ret);
 		return ret;
 	}
-#else
-	#error not a supported configuration
-#endif
 
 	mq->watchlist_num = ret;
 	return 0;
@@ -194,15 +130,8 @@ xpc_gru_mq_watchlist_free_uv(struct xpc_gru_mq_uv *mq)
 	int ret;
 	int mmr_pnode = uv_blade_to_pnode(mq->mmr_blade);
 
-#if defined CONFIG_X86_64
 	ret = uv_bios_mq_watchlist_free(mmr_pnode, mq->watchlist_num);
 	BUG_ON(ret != BIOS_STATUS_SUCCESS);
-#elif defined CONFIG_IA64_SGI_UV
-	ret = sn_mq_watchlist_free(mmr_pnode, mq->watchlist_num);
-	BUG_ON(ret != SALRET_OK);
-#else
-	#error not a supported configuration
-#endif
 }
 
 static struct xpc_gru_mq_uv *
@@ -786,7 +715,6 @@ xpc_get_partition_rsvd_page_pa_uv(void *buf, u64 *cookie, unsigned long *rp_pa,
 	s64 status;
 	enum xp_retval ret;
 
-#if defined CONFIG_X86_64
 	status = uv_bios_reserved_page_pa((u64)buf, cookie, (u64 *)rp_pa,
 					  (u64 *)len);
 	if (status == BIOS_STATUS_SUCCESS)
@@ -796,19 +724,6 @@ xpc_get_partition_rsvd_page_pa_uv(void *buf, u64 *cookie, unsigned long *rp_pa,
 	else
 		ret = xpBiosError;
 
-#elif defined CONFIG_IA64_SGI_UV
-	status = sn_partition_reserved_page_pa((u64)buf, cookie, rp_pa, len);
-	if (status == SALRET_OK)
-		ret = xpSuccess;
-	else if (status == SALRET_MORE_PASSES)
-		ret = xpNeedMoreInfo;
-	else
-		ret = xpSalError;
-
-#else
-	#error not a supported configuration
-#endif
-
 	return ret;
 }
 
diff --git a/drivers/net/ethernet/broadcom/tg3.c b/drivers/net/ethernet/broadcom/tg3.c
index 14b311196b8f..c7c1ff38ea33 100644
--- a/drivers/net/ethernet/broadcom/tg3.c
+++ b/drivers/net/ethernet/broadcom/tg3.c
@@ -17005,7 +17005,7 @@ static u32 tg3_calc_dma_bndry(struct tg3 *tp, u32 val)
 	    !tg3_flag(tp, PCI_EXPRESS))
 		goto out;
 
-#if defined(CONFIG_PPC64) || defined(CONFIG_IA64) || defined(CONFIG_PARISC)
+#if defined(CONFIG_PPC64) || defined(CONFIG_PARISC)
 	goal = BOUNDARY_MULTI_CACHELINE;
 #else
 #if defined(CONFIG_SPARC64) || defined(CONFIG_ALPHA)
diff --git a/drivers/net/ethernet/brocade/bna/bnad.h b/drivers/net/ethernet/brocade/bna/bnad.h
index 627a93ce38ab..10b1e534030e 100644
--- a/drivers/net/ethernet/brocade/bna/bnad.h
+++ b/drivers/net/ethernet/brocade/bna/bnad.h
@@ -19,7 +19,6 @@
 #include <linux/firmware.h>
 #include <linux/if_vlan.h>
 
-/* Fix for IA64 */
 #include <asm/checksum.h>
 #include <net/ip6_checksum.h>
 
diff --git a/drivers/net/ethernet/qlogic/netxen/netxen_nic_main.c b/drivers/net/ethernet/qlogic/netxen/netxen_nic_main.c
index 1d1e183d3a8b..ed24d6af7487 100644
--- a/drivers/net/ethernet/qlogic/netxen/netxen_nic_main.c
+++ b/drivers/net/ethernet/qlogic/netxen/netxen_nic_main.c
@@ -233,9 +233,7 @@ static int nx_set_dma_mask(struct netxen_adapter *adapter)
 	cmask = DMA_BIT_MASK(32);
 
 	if (NX_IS_REVISION_P2(adapter->ahw.revision_id)) {
-#ifndef CONFIG_IA64
 		mask = DMA_BIT_MASK(35);
-#endif
 	} else {
 		mask = DMA_BIT_MASK(39);
 		cmask = mask;
diff --git a/drivers/pci/vgaarb.c b/drivers/pci/vgaarb.c
index 5e6b1eb54c64..a771b2259f21 100644
--- a/drivers/pci/vgaarb.c
+++ b/drivers/pci/vgaarb.c
@@ -556,7 +556,7 @@ EXPORT_SYMBOL(vga_put);
 
 static bool vga_is_firmware_default(struct pci_dev *pdev)
 {
-#if defined(CONFIG_X86) || defined(CONFIG_IA64)
+#if defined(CONFIG_X86)
 	u64 base = screen_info.lfb_base;
 	u64 size = screen_info.lfb_size;
 	struct resource *r;
diff --git a/drivers/tty/serial/8250/Kconfig b/drivers/tty/serial/8250/Kconfig
index ee17cf5c44c6..c12978311a09 100644
--- a/drivers/tty/serial/8250/Kconfig
+++ b/drivers/tty/serial/8250/Kconfig
@@ -222,7 +222,7 @@ config SERIAL_8250_EXTENDED
 
 config SERIAL_8250_MANY_PORTS
 	bool "Support more than 4 legacy serial ports"
-	depends on SERIAL_8250_EXTENDED && !IA64
+	depends on SERIAL_8250_EXTENDED
 	help
 	  Say Y here if you have dumb serial boards other than the four
 	  standard COM 1/2/3/4 ports. This may happen if you have an AST
diff --git a/drivers/tty/vt/keyboard.c b/drivers/tty/vt/keyboard.c
index 358f216c6cd6..1fe6107b539b 100644
--- a/drivers/tty/vt/keyboard.c
+++ b/drivers/tty/vt/keyboard.c
@@ -1273,7 +1273,7 @@ static void kbd_bh(struct tasklet_struct *unused)
 	}
 }
 
-#if defined(CONFIG_X86) || defined(CONFIG_IA64) || defined(CONFIG_ALPHA) ||\
+#if defined(CONFIG_X86) || defined(CONFIG_ALPHA) ||\
     defined(CONFIG_MIPS) || defined(CONFIG_PPC) || defined(CONFIG_SPARC) ||\
     defined(CONFIG_PARISC) || defined(CONFIG_SUPERH) ||\
     (defined(CONFIG_ARM) && defined(CONFIG_KEYBOARD_ATKBD) && !defined(CONFIG_ARCH_RPC))
diff --git a/drivers/video/fbdev/Kconfig b/drivers/video/fbdev/Kconfig
index eac0ba39581e..747053f529a5 100644
--- a/drivers/video/fbdev/Kconfig
+++ b/drivers/video/fbdev/Kconfig
@@ -463,7 +463,7 @@ config FB_VESA
 
 config FB_EFI
 	bool "EFI-based Framebuffer Support"
-	depends on (FB = y) && !IA64 && EFI
+	depends on (FB = y) && EFI
 	select APERTURE_HELPERS
 	select DRM_PANEL_ORIENTATION_QUIRKS
 	select FB_IOMEM_HELPERS
diff --git a/drivers/watchdog/Kconfig b/drivers/watchdog/Kconfig
index 751458959411..8cb6fa45d599 100644
--- a/drivers/watchdog/Kconfig
+++ b/drivers/watchdog/Kconfig
@@ -1287,7 +1287,7 @@ config INTEL_MID_WATCHDOG
 
 config ITCO_WDT
 	tristate "Intel TCO Timer/Watchdog"
-	depends on (X86 || IA64) && PCI
+	depends on X86 && PCI
 	select WATCHDOG_CORE
 	depends on I2C || I2C=n
 	depends on MFD_INTEL_PMC_BXT || !MFD_INTEL_PMC_BXT
diff --git a/fs/Kconfig b/fs/Kconfig
index aa7e03cc1941..421f68fed1ba 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -255,7 +255,7 @@ config ARCH_SUPPORTS_HUGETLBFS
 
 config HUGETLBFS
 	bool "HugeTLB file system support"
-	depends on X86 || IA64 || SPARC64 || ARCH_SUPPORTS_HUGETLBFS || BROKEN
+	depends on X86 || SPARC64 || ARCH_SUPPORTS_HUGETLBFS || BROKEN
 	depends on (SYSFS || SYSCTL)
 	select MEMFD_CREATE
 	help
diff --git a/fs/afs/main.c b/fs/afs/main.c
index eae288c8d40a..6425c81d07de 100644
--- a/fs/afs/main.c
+++ b/fs/afs/main.c
@@ -41,8 +41,6 @@ const char afs_init_sysname[] = "arm_linux26";
 const char afs_init_sysname[] = "aarch64_linux26";
 #elif defined(CONFIG_X86_32)
 const char afs_init_sysname[] = "i386_linux26";
-#elif defined(CONFIG_IA64)
-const char afs_init_sysname[] = "ia64_linux26";
 #elif defined(CONFIG_PPC64)
 const char afs_init_sysname[] = "ppc64_linux26";
 #elif defined(CONFIG_PPC32)
diff --git a/fs/xfs/xfs_ioctl32.h b/fs/xfs/xfs_ioctl32.h
index c14852362fce..052d0e888c27 100644
--- a/fs/xfs/xfs_ioctl32.h
+++ b/fs/xfs/xfs_ioctl32.h
@@ -22,7 +22,7 @@
 /*
  * On intel, even if sizes match, alignment and/or padding may differ.
  */
-#if defined(CONFIG_IA64) || defined(CONFIG_X86_64)
+#if defined(CONFIG_X86_64)
 #define BROKEN_X86_ALIGNMENT
 #define __compat_packed __attribute__((packed))
 #else
diff --git a/include/linux/acpi.h b/include/linux/acpi.h
index a73246c3c35e..9bcf5641a7cf 100644
--- a/include/linux/acpi.h
+++ b/include/linux/acpi.h
@@ -259,7 +259,7 @@ void acpi_table_print_madt_entry (struct acpi_subtable_header *madt);
 /* the following numa functions are architecture-dependent */
 void acpi_numa_slit_init (struct acpi_table_slit *slit);
 
-#if defined(CONFIG_X86) || defined(CONFIG_IA64) || defined(CONFIG_LOONGARCH)
+#if defined(CONFIG_X86) || defined(CONFIG_LOONGARCH)
 void acpi_numa_processor_affinity_init (struct acpi_srat_cpu_affinity *pa);
 #else
 static inline void
@@ -1114,15 +1114,8 @@ static inline int acpi_get_lps0_constraint(struct device *dev)
 	return ACPI_STATE_UNKNOWN;
 }
 #endif /* CONFIG_SUSPEND && CONFIG_X86 */
-#ifndef CONFIG_IA64
 void arch_reserve_mem_area(acpi_physical_address addr, size_t size);
 #else
-static inline void arch_reserve_mem_area(acpi_physical_address addr,
-					  size_t size)
-{
-}
-#endif /* CONFIG_X86 */
-#else
 #define acpi_os_set_prepare_sleep(func, pm1a_ctrl, pm1b_ctrl) do { } while (0)
 #endif
 
diff --git a/include/linux/efi.h b/include/linux/efi.h
index 80b21d1c6eaf..9cc5bf32f6f2 100644
--- a/include/linux/efi.h
+++ b/include/linux/efi.h
@@ -358,13 +358,10 @@ void efi_native_runtime_setup(void);
  * where the UEFI SPEC breaks the line.
  */
 #define NULL_GUID				EFI_GUID(0x00000000, 0x0000, 0x0000,  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00)
-#define MPS_TABLE_GUID				EFI_GUID(0xeb9d2d2f, 0x2d88, 0x11d3,  0x9a, 0x16, 0x00, 0x90, 0x27, 0x3f, 0xc1, 0x4d)
 #define ACPI_TABLE_GUID				EFI_GUID(0xeb9d2d30, 0x2d88, 0x11d3,  0x9a, 0x16, 0x00, 0x90, 0x27, 0x3f, 0xc1, 0x4d)
 #define ACPI_20_TABLE_GUID			EFI_GUID(0x8868e871, 0xe4f1, 0x11d3,  0xbc, 0x22, 0x00, 0x80, 0xc7, 0x3c, 0x88, 0x81)
 #define SMBIOS_TABLE_GUID			EFI_GUID(0xeb9d2d31, 0x2d88, 0x11d3,  0x9a, 0x16, 0x00, 0x90, 0x27, 0x3f, 0xc1, 0x4d)
 #define SMBIOS3_TABLE_GUID			EFI_GUID(0xf2fd1544, 0x9794, 0x4a2c,  0x99, 0x2e, 0xe5, 0xbb, 0xcf, 0x20, 0xe3, 0x94)
-#define SAL_SYSTEM_TABLE_GUID			EFI_GUID(0xeb9d2d32, 0x2d88, 0x11d3,  0x9a, 0x16, 0x00, 0x90, 0x27, 0x3f, 0xc1, 0x4d)
-#define HCDP_TABLE_GUID				EFI_GUID(0xf951938d, 0x620b, 0x42ef,  0x82, 0x79, 0xa8, 0x4b, 0x79, 0x61, 0x78, 0x98)
 #define UGA_IO_PROTOCOL_GUID			EFI_GUID(0x61a4d49e, 0x6f68, 0x4f1b,  0xb9, 0x22, 0xa8, 0x6e, 0xed, 0x0b, 0x07, 0xa2)
 #define EFI_GLOBAL_VARIABLE_GUID		EFI_GUID(0x8be4df61, 0x93ca, 0x11d2,  0xaa, 0x0d, 0x00, 0xe0, 0x98, 0x03, 0x2b, 0x8c)
 #define UV_SYSTEM_TABLE_GUID			EFI_GUID(0x3b13a7d4, 0x633e, 0x11dd,  0x93, 0xec, 0xda, 0x25, 0x56, 0xd8, 0x95, 0x93)
@@ -851,10 +848,6 @@ static inline int efi_range_is_wc(unsigned long start, unsigned long len)
 	return 1;
 }
 
-#ifdef CONFIG_EFI_PCDP
-extern int __init efi_setup_pcdp_console(char *);
-#endif
-
 /*
  * We play games with efi_enabled so that the compiler will, if
  * possible, remove EFI-related code altogether.
diff --git a/include/linux/mm.h b/include/linux/mm.h
index bf5d0b1b16f4..17d530e12f70 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -362,8 +362,6 @@ extern unsigned int kobjsize(const void *objp);
 # define VM_SAO		VM_ARCH_1	/* Strong Access Ordering (powerpc) */
 #elif defined(CONFIG_PARISC)
 # define VM_GROWSUP	VM_ARCH_1
-#elif defined(CONFIG_IA64)
-# define VM_GROWSUP	VM_ARCH_1
 #elif defined(CONFIG_SPARC64)
 # define VM_SPARC_ADI	VM_ARCH_1	/* Uses ADI tag for access control */
 # define VM_ARCH_CLEAR	VM_SPARC_ADI
diff --git a/include/linux/moduleparam.h b/include/linux/moduleparam.h
index 962cd41a2cb5..99e4f1f718c7 100644
--- a/include/linux/moduleparam.h
+++ b/include/linux/moduleparam.h
@@ -276,7 +276,7 @@ struct kparam_array
    read-only sections (which is part of respective UNIX ABI on these
    platforms). So 'const' makes no sense and even causes compile failures
    with some compilers. */
-#if defined(CONFIG_ALPHA) || defined(CONFIG_IA64) || defined(CONFIG_PPC64)
+#if defined(CONFIG_ALPHA) || defined(CONFIG_PPC64)
 #define __moduleparam_const
 #else
 #define __moduleparam_const const
diff --git a/include/trace/events/mmflags.h b/include/trace/events/mmflags.h
index 1478b9dd05fa..d801409b33cf 100644
--- a/include/trace/events/mmflags.h
+++ b/include/trace/events/mmflags.h
@@ -144,7 +144,7 @@ IF_HAVE_PG_ARCH_X(arch_3)
 #define __VM_ARCH_SPECIFIC_1 {VM_PAT,     "pat"           }
 #elif defined(CONFIG_PPC)
 #define __VM_ARCH_SPECIFIC_1 {VM_SAO,     "sao"           }
-#elif defined(CONFIG_PARISC) || defined(CONFIG_IA64)
+#elif defined(CONFIG_PARISC)
 #define __VM_ARCH_SPECIFIC_1 {VM_GROWSUP,	"growsup"	}
 #elif !defined(CONFIG_MMU)
 #define __VM_ARCH_SPECIFIC_1 {VM_MAPPED_COPY,"mappedcopy"	}
diff --git a/init/Kconfig b/init/Kconfig
index 6d35728b94b2..9ffb103fc927 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -1723,7 +1723,7 @@ config KALLSYMS_ABSOLUTE_PERCPU
 config KALLSYMS_BASE_RELATIVE
 	bool
 	depends on KALLSYMS
-	default !IA64
+	default y
 	help
 	  Instead of emitting them as absolute values in the native word size,
 	  emit the symbol references in the kallsyms table as 32-bit entries,
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 6de7c6bb74ee..234361530007 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -1725,9 +1725,6 @@ static int cpu_up(unsigned int cpu, enum cpuhp_state target)
 	if (!cpu_possible(cpu)) {
 		pr_err("can't online cpu %d because it is not configured as may-hotadd at boot time\n",
 		       cpu);
-#if defined(CONFIG_IA64)
-		pr_err("please check additional_cpus= boot parameter\n");
-#endif
 		return -EINVAL;
 	}
 
diff --git a/kernel/fork.c b/kernel/fork.c
index 3b6d20dfb9a8..faa921ecbe67 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -3144,7 +3144,7 @@ static inline bool clone3_stack_valid(struct kernel_clone_args *kargs)
 		if (!access_ok((void __user *)kargs->stack, kargs->stack_size))
 			return false;
 
-#if !defined(CONFIG_STACK_GROWSUP) && !defined(CONFIG_IA64)
+#if !defined(CONFIG_STACK_GROWSUP)
 		kargs->stack += kargs->stack_size;
 #endif
 	}
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 2299a5cfbfb9..caab7cd26790 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -10289,9 +10289,9 @@ void normalize_rt_tasks(void)
 
 #endif /* CONFIG_MAGIC_SYSRQ */
 
-#if defined(CONFIG_IA64) || defined(CONFIG_KGDB_KDB)
+#if defined(CONFIG_KGDB_KDB)
 /*
- * These functions are only useful for the IA64 MCA handling, or kdb.
+ * These functions are only useful for kdb.
  *
  * They can only be called when the whole system has been
  * stopped - every CPU needs to be quiescent, and no scheduling
@@ -10313,30 +10313,7 @@ struct task_struct *curr_task(int cpu)
 	return cpu_curr(cpu);
 }
 
-#endif /* defined(CONFIG_IA64) || defined(CONFIG_KGDB_KDB) */
-
-#ifdef CONFIG_IA64
-/**
- * ia64_set_curr_task - set the current task for a given CPU.
- * @cpu: the processor in question.
- * @p: the task pointer to set.
- *
- * Description: This function must only be used when non-maskable interrupts
- * are serviced on a separate stack. It allows the architecture to switch the
- * notion of the current task on a CPU in a non-blocking manner. This function
- * must be called with all CPU's synchronized, and interrupts disabled, the
- * and caller must save the original value of the current task (see
- * curr_task() above) and restore that value before reenabling interrupts and
- * re-starting the system.
- *
- * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED!
- */
-void ia64_set_curr_task(int cpu, struct task_struct *p)
-{
-	cpu_curr(cpu) = p;
-}
-
-#endif
+#endif /* defined(CONFIG_KGDB_KDB) */
 
 #ifdef CONFIG_CGROUP_SCHED
 /* task_group_lock serializes the addition/removal of task groups */
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 354a2d294f52..dca7a8f735cd 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -1939,15 +1939,6 @@ static struct ctl_table kern_table[] = {
 		.proc_handler	= proc_dointvec,
 	},
 #endif
-#ifdef CONFIG_IA64
-	{
-		.procname	= "unaligned-dump-stack",
-		.data		= &unaligned_dump_stack,
-		.maxlen		= sizeof (int),
-		.mode		= 0644,
-		.proc_handler	= proc_dointvec,
-	},
-#endif
 #ifdef CONFIG_RT_MUTEXES
 	{
 		.procname	= "max_lock_depth",
diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
index fa307f93fa2e..2caf73c9f623 100644
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -759,7 +759,7 @@ config SHRINKER_DEBUG
 
 config DEBUG_STACK_USAGE
 	bool "Stack utilization instrumentation"
-	depends on DEBUG_KERNEL && !IA64
+	depends on DEBUG_KERNEL
 	help
 	  Enables the display of the minimum amount of free stack which each
 	  task has ever had available in the sysrq-T and sysrq-P debug output.
diff --git a/lib/decompress_unxz.c b/lib/decompress_unxz.c
index 353268b9f129..842894158944 100644
--- a/lib/decompress_unxz.c
+++ b/lib/decompress_unxz.c
@@ -133,9 +133,6 @@
 #ifdef CONFIG_ARM
 #	define XZ_DEC_ARM
 #endif
-#ifdef CONFIG_IA64
-#	define XZ_DEC_IA64
-#endif
 #ifdef CONFIG_SPARC
 #	define XZ_DEC_SPARC
 #endif
diff --git a/lib/xz/Kconfig b/lib/xz/Kconfig
index adce22ac18d6..aef086a6bf2f 100644
--- a/lib/xz/Kconfig
+++ b/lib/xz/Kconfig
@@ -19,11 +19,6 @@ config XZ_DEC_POWERPC
 	default y
 	select XZ_DEC_BCJ
 
-config XZ_DEC_IA64
-	bool "IA-64 BCJ filter decoder" if EXPERT
-	default y
-	select XZ_DEC_BCJ
-
 config XZ_DEC_ARM
 	bool "ARM BCJ filter decoder" if EXPERT
 	default y
diff --git a/mm/mmap.c b/mm/mmap.c
index b56a7f0c9f85..4b0eaf6427c4 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -1924,9 +1924,9 @@ static int acct_stack_growth(struct vm_area_struct *vma,
 	return 0;
 }
 
-#if defined(CONFIG_STACK_GROWSUP) || defined(CONFIG_IA64)
+#if defined(CONFIG_STACK_GROWSUP)
 /*
- * PA-RISC uses this for its stack; IA64 for its Register Backing Store.
+ * PA-RISC uses this for its stack.
  * vma is the last one with address > vma->vm_end.  Have to extend vma.
  */
 static int expand_upwards(struct vm_area_struct *vma, unsigned long address)
@@ -2023,7 +2023,7 @@ static int expand_upwards(struct vm_area_struct *vma, unsigned long address)
 	validate_mm(mm);
 	return error;
 }
-#endif /* CONFIG_STACK_GROWSUP || CONFIG_IA64 */
+#endif /* CONFIG_STACK_GROWSUP */
 
 /*
  * vma is the first one with address < vma->vm_start.  Have to extend vma.
diff --git a/tools/arch/ia64/include/asm/barrier.h b/tools/arch/ia64/include/asm/barrier.h
deleted file mode 100644
index 6fffe5682713..000000000000
--- a/tools/arch/ia64/include/asm/barrier.h
+++ /dev/null
@@ -1,59 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-/*
- * Copied from the kernel sources to tools/:
- *
- * Memory barrier definitions.  This is based on information published
- * in the Processor Abstraction Layer and the System Abstraction Layer
- * manual.
- *
- * Copyright (C) 1998-2003 Hewlett-Packard Co
- *	David Mosberger-Tang <davidm@hpl.hp.com>
- * Copyright (C) 1999 Asit Mallick <asit.k.mallick@intel.com>
- * Copyright (C) 1999 Don Dugger <don.dugger@intel.com>
- */
-#ifndef _TOOLS_LINUX_ASM_IA64_BARRIER_H
-#define _TOOLS_LINUX_ASM_IA64_BARRIER_H
-
-#include <linux/compiler.h>
-
-/*
- * Macros to force memory ordering.  In these descriptions, "previous"
- * and "subsequent" refer to program order; "visible" means that all
- * architecturally visible effects of a memory access have occurred
- * (at a minimum, this means the memory has been read or written).
- *
- *   wmb():	Guarantees that all preceding stores to memory-
- *		like regions are visible before any subsequent
- *		stores and that all following stores will be
- *		visible only after all previous stores.
- *   rmb():	Like wmb(), but for reads.
- *   mb():	wmb()/rmb() combo, i.e., all previous memory
- *		accesses are visible before all subsequent
- *		accesses and vice versa.  This is also known as
- *		a "fence."
- *
- * Note: "mb()" and its variants cannot be used as a fence to order
- * accesses to memory mapped I/O registers.  For that, mf.a needs to
- * be used.  However, we don't want to always use mf.a because (a)
- * it's (presumably) much slower than mf and (b) mf.a is supported for
- * sequential memory pages only.
- */
-
-#define mb()		ia64_mf()
-#define rmb()		mb()
-#define wmb()		mb()
-
-#define smp_store_release(p, v)			\
-do {						\
-	barrier();				\
-	WRITE_ONCE(*p, v);			\
-} while (0)
-
-#define smp_load_acquire(p)			\
-({						\
-	typeof(*p) ___p1 = READ_ONCE(*p);	\
-	barrier();				\
-	___p1;					\
-})
-
-#endif /* _TOOLS_LINUX_ASM_IA64_BARRIER_H */
diff --git a/tools/arch/ia64/include/uapi/asm/bitsperlong.h b/tools/arch/ia64/include/uapi/asm/bitsperlong.h
deleted file mode 100644
index 1146d55563db..000000000000
--- a/tools/arch/ia64/include/uapi/asm/bitsperlong.h
+++ /dev/null
@@ -1,9 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
-#ifndef __ASM_IA64_BITSPERLONG_H
-#define __ASM_IA64_BITSPERLONG_H
-
-#define __BITS_PER_LONG 64
-
-#include <asm-generic/bitsperlong.h>
-
-#endif /* __ASM_IA64_BITSPERLONG_H */
diff --git a/tools/arch/ia64/include/uapi/asm/mman.h b/tools/arch/ia64/include/uapi/asm/mman.h
deleted file mode 100644
index 2a19bb1db4ab..000000000000
--- a/tools/arch/ia64/include/uapi/asm/mman.h
+++ /dev/null
@@ -1,7 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
-#ifndef TOOLS_ARCH_IA64_UAPI_ASM_MMAN_FIX_H
-#define TOOLS_ARCH_IA64_UAPI_ASM_MMAN_FIX_H
-#include <uapi/asm-generic/mman.h>
-/* MAP_32BIT is undefined on ia64, fix it for perf */
-#define MAP_32BIT	0
-#endif
diff --git a/usr/include/Makefile b/usr/include/Makefile
index 07796df0a295..338c81f1fcf3 100644
--- a/usr/include/Makefile
+++ b/usr/include/Makefile
@@ -59,12 +59,6 @@ ifeq ($(SRCARCH),arc)
 no-header-test += linux/bpf_perf_event.h
 endif
 
-ifeq ($(SRCARCH),ia64)
-no-header-test += asm/setup.h
-no-header-test += asm/sigcontext.h
-no-header-test += linux/if_bonding.h
-endif
-
 ifeq ($(SRCARCH),powerpc)
 no-header-test += linux/bpf_perf_event.h
 endif
-- 
cgit v1.2.3


From f5e836884d8e55b416dfad55c29481ec1b65c1f0 Mon Sep 17 00:00:00 2001
From: Ard Biesheuvel <ardb@kernel.org>
Date: Fri, 13 Jan 2023 17:57:47 +0100
Subject: kernel: Drop IA64 support from sig_fault handlers

Signed-off-by: Ard Biesheuvel <ardb@kernel.org>
---
 include/linux/sched/signal.h       | 17 ++++-------------
 include/uapi/asm-generic/siginfo.h |  5 -----
 kernel/signal.c                    | 25 +++++--------------------
 3 files changed, 9 insertions(+), 38 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sched/signal.h b/include/linux/sched/signal.h
index 0014d3adaf84..155332977239 100644
--- a/include/linux/sched/signal.h
+++ b/include/linux/sched/signal.h
@@ -303,20 +303,11 @@ static inline void kernel_signal_stop(void)
 
 	schedule();
 }
-#ifdef __ia64__
-# define ___ARCH_SI_IA64(_a1, _a2, _a3) , _a1, _a2, _a3
-#else
-# define ___ARCH_SI_IA64(_a1, _a2, _a3)
-#endif
 
-int force_sig_fault_to_task(int sig, int code, void __user *addr
-	___ARCH_SI_IA64(int imm, unsigned int flags, unsigned long isr)
-	, struct task_struct *t);
-int force_sig_fault(int sig, int code, void __user *addr
-	___ARCH_SI_IA64(int imm, unsigned int flags, unsigned long isr));
-int send_sig_fault(int sig, int code, void __user *addr
-	___ARCH_SI_IA64(int imm, unsigned int flags, unsigned long isr)
-	, struct task_struct *t);
+int force_sig_fault_to_task(int sig, int code, void __user *addr,
+			    struct task_struct *t);
+int force_sig_fault(int sig, int code, void __user *addr);
+int send_sig_fault(int sig, int code, void __user *addr, struct task_struct *t);
 
 int force_sig_mceerr(int code, void __user *, short);
 int send_sig_mceerr(int code, void __user *, short, struct task_struct *);
diff --git a/include/uapi/asm-generic/siginfo.h b/include/uapi/asm-generic/siginfo.h
index 0f52d0ac47c5..b7bc545ec3b2 100644
--- a/include/uapi/asm-generic/siginfo.h
+++ b/include/uapi/asm-generic/siginfo.h
@@ -68,11 +68,6 @@ union __sifields {
 	/* SIGILL, SIGFPE, SIGSEGV, SIGBUS, SIGTRAP, SIGEMT */
 	struct {
 		void __user *_addr; /* faulting insn/memory ref. */
-#ifdef __ia64__
-		int _imm;		/* immediate value for "break" */
-		unsigned int _flags;	/* see ia64 si_flags */
-		unsigned long _isr;	/* isr */
-#endif
 
 #define __ADDR_BND_PKEY_PAD  (__alignof__(void *) < sizeof(short) ? \
 			      sizeof(short) : __alignof__(void *))
diff --git a/kernel/signal.c b/kernel/signal.c
index 09019017d669..26d9f66e5364 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -1718,9 +1718,8 @@ void force_sigsegv(int sig)
 		force_sig(SIGSEGV);
 }
 
-int force_sig_fault_to_task(int sig, int code, void __user *addr
-	___ARCH_SI_IA64(int imm, unsigned int flags, unsigned long isr)
-	, struct task_struct *t)
+int force_sig_fault_to_task(int sig, int code, void __user *addr,
+			    struct task_struct *t)
 {
 	struct kernel_siginfo info;
 
@@ -1729,24 +1728,15 @@ int force_sig_fault_to_task(int sig, int code, void __user *addr
 	info.si_errno = 0;
 	info.si_code  = code;
 	info.si_addr  = addr;
-#ifdef __ia64__
-	info.si_imm = imm;
-	info.si_flags = flags;
-	info.si_isr = isr;
-#endif
 	return force_sig_info_to_task(&info, t, HANDLER_CURRENT);
 }
 
-int force_sig_fault(int sig, int code, void __user *addr
-	___ARCH_SI_IA64(int imm, unsigned int flags, unsigned long isr))
+int force_sig_fault(int sig, int code, void __user *addr)
 {
-	return force_sig_fault_to_task(sig, code, addr
-				       ___ARCH_SI_IA64(imm, flags, isr), current);
+	return force_sig_fault_to_task(sig, code, addr, current);
 }
 
-int send_sig_fault(int sig, int code, void __user *addr
-	___ARCH_SI_IA64(int imm, unsigned int flags, unsigned long isr)
-	, struct task_struct *t)
+int send_sig_fault(int sig, int code, void __user *addr, struct task_struct *t)
 {
 	struct kernel_siginfo info;
 
@@ -1755,11 +1745,6 @@ int send_sig_fault(int sig, int code, void __user *addr
 	info.si_errno = 0;
 	info.si_code  = code;
 	info.si_addr  = addr;
-#ifdef __ia64__
-	info.si_imm = imm;
-	info.si_flags = flags;
-	info.si_isr = isr;
-#endif
 	return send_sig_info(info.si_signo, &info, t);
 }
 
-- 
cgit v1.2.3


From b089ea3cc30de85ea7e20aa66500feb4082dfbf7 Mon Sep 17 00:00:00 2001
From: Ard Biesheuvel <ardb@kernel.org>
Date: Fri, 13 Jan 2023 18:08:32 +0100
Subject: lib/raid6: Drop IA64 support

Drop Itanium support from the RAID6 code, and along with it, the 16x and
32x unrolled versions, which were only used by IA64.

Signed-off-by: Ard Biesheuvel <ardb@kernel.org>
---
 include/linux/raid/pq.h | 2 --
 lib/raid6/Makefile      | 4 ++--
 lib/raid6/algos.c       | 4 ----
 lib/raid6/int.uc        | 9 ---------
 4 files changed, 2 insertions(+), 17 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/raid/pq.h b/include/linux/raid/pq.h
index 006e18decfad..98030accf641 100644
--- a/include/linux/raid/pq.h
+++ b/include/linux/raid/pq.h
@@ -84,8 +84,6 @@ extern const struct raid6_calls raid6_intx1;
 extern const struct raid6_calls raid6_intx2;
 extern const struct raid6_calls raid6_intx4;
 extern const struct raid6_calls raid6_intx8;
-extern const struct raid6_calls raid6_intx16;
-extern const struct raid6_calls raid6_intx32;
 extern const struct raid6_calls raid6_mmxx1;
 extern const struct raid6_calls raid6_mmxx2;
 extern const struct raid6_calls raid6_sse1x1;
diff --git a/lib/raid6/Makefile b/lib/raid6/Makefile
index 035b0a4db476..1c5420ff254e 100644
--- a/lib/raid6/Makefile
+++ b/lib/raid6/Makefile
@@ -2,7 +2,7 @@
 obj-$(CONFIG_RAID6_PQ)	+= raid6_pq.o
 
 raid6_pq-y	+= algos.o recov.o tables.o int1.o int2.o int4.o \
-		   int8.o int16.o int32.o
+		   int8.o
 
 raid6_pq-$(CONFIG_X86) += recov_ssse3.o recov_avx2.o mmx.o sse1.o sse2.o avx2.o avx512.o recov_avx512.o
 raid6_pq-$(CONFIG_ALTIVEC) += altivec1.o altivec2.o altivec4.o altivec8.o \
@@ -55,7 +55,7 @@ endif
 quiet_cmd_unroll = UNROLL  $@
       cmd_unroll = $(AWK) -v N=$* -f $(srctree)/$(src)/unroll.awk < $< > $@
 
-targets += int1.c int2.c int4.c int8.c int16.c int32.c
+targets += int1.c int2.c int4.c int8.c
 $(obj)/int%.c: $(src)/int.uc $(src)/unroll.awk FORCE
 	$(call if_changed,unroll)
 
diff --git a/lib/raid6/algos.c b/lib/raid6/algos.c
index 0ec534faf019..cd2e88ee1f14 100644
--- a/lib/raid6/algos.c
+++ b/lib/raid6/algos.c
@@ -80,10 +80,6 @@ const struct raid6_calls * const raid6_algos[] = {
 #ifdef CONFIG_CPU_HAS_LSX
 	&raid6_lsx,
 #endif
-#endif
-#if defined(__ia64__)
-	&raid6_intx32,
-	&raid6_intx16,
 #endif
 	&raid6_intx8,
 	&raid6_intx4,
diff --git a/lib/raid6/int.uc b/lib/raid6/int.uc
index 558aeac9342a..1ba56c3fa482 100644
--- a/lib/raid6/int.uc
+++ b/lib/raid6/int.uc
@@ -41,13 +41,6 @@ typedef u32 unative_t;
 
 
-/*
- * IA-64 wants insane amounts of unrolling.  On other architectures that
- * is just a waste of space.
- */
-#if ($# <= 8) || defined(__ia64__)
-
-
 /*
  * These sub-operations are separate inlines since they can sometimes be
  * specially optimized using architecture-specific hacks.
@@ -152,5 +145,3 @@ const struct raid6_calls raid6_intx$# = {
 	"int" NSTRING "x$#",
 	0
 };
-
-#endif
-- 
cgit v1.2.3


From f42dafe3da0cd887c9d2aaa59576f2a92ee4d876 Mon Sep 17 00:00:00 2001
From: Bartosz Golaszewski <bartosz.golaszewski@linaro.org>
Date: Sun, 3 Sep 2023 21:06:57 +0200
Subject: gpiolib: unexport gpiod_set_transitory()

There are no and never have been any users of gpiod_set_transitory()
outside the core GPIOLIB code. Make it private.

Signed-off-by: Bartosz Golaszewski <bartosz.golaszewski@linaro.org>
Reviewed-by: Linus Walleij <linus.walleij@linaro.org>
---
 drivers/gpio/gpiolib.c        | 1 -
 drivers/gpio/gpiolib.h        | 2 ++
 include/linux/gpio/consumer.h | 8 --------
 3 files changed, 2 insertions(+), 9 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/gpio/gpiolib.c b/drivers/gpio/gpiolib.c
index 40a0022ea719..edffa0d2acaa 100644
--- a/drivers/gpio/gpiolib.c
+++ b/drivers/gpio/gpiolib.c
@@ -2700,7 +2700,6 @@ int gpiod_set_transitory(struct gpio_desc *desc, bool transitory)
 						      PIN_CONFIG_PERSIST_STATE,
 						      !transitory);
 }
-EXPORT_SYMBOL_GPL(gpiod_set_transitory);
 
 /**
  * gpiod_is_active_low - test whether a GPIO is active-low or not
diff --git a/drivers/gpio/gpiolib.h b/drivers/gpio/gpiolib.h
index a0a67569300b..d1c94bd571c6 100644
--- a/drivers/gpio/gpiolib.h
+++ b/drivers/gpio/gpiolib.h
@@ -144,6 +144,8 @@ int gpiod_set_array_value_complex(bool raw, bool can_sleep,
 				  struct gpio_array *array_info,
 				  unsigned long *value_bitmap);
 
+int gpiod_set_transitory(struct gpio_desc *desc, bool transitory);
+
 extern spinlock_t gpio_lock;
 extern struct list_head gpio_devices;
 
diff --git a/include/linux/gpio/consumer.h b/include/linux/gpio/consumer.h
index 1c4385a00f88..6cc345440a5b 100644
--- a/include/linux/gpio/consumer.h
+++ b/include/linux/gpio/consumer.h
@@ -159,7 +159,6 @@ int gpiod_set_raw_array_value_cansleep(unsigned int array_size,
 
 int gpiod_set_config(struct gpio_desc *desc, unsigned long config);
 int gpiod_set_debounce(struct gpio_desc *desc, unsigned int debounce);
-int gpiod_set_transitory(struct gpio_desc *desc, bool transitory);
 void gpiod_toggle_active_low(struct gpio_desc *desc);
 
 int gpiod_is_active_low(const struct gpio_desc *desc);
@@ -494,13 +493,6 @@ static inline int gpiod_set_debounce(struct gpio_desc *desc, unsigned int deboun
 	return -ENOSYS;
 }
 
-static inline int gpiod_set_transitory(struct gpio_desc *desc, bool transitory)
-{
-	/* GPIO can never have been requested */
-	WARN_ON(desc);
-	return -ENOSYS;
-}
-
 static inline void gpiod_toggle_active_low(struct gpio_desc *desc)
 {
 	/* GPIO can never have been requested */
-- 
cgit v1.2.3


From 8de54392b849a612f337044d81d9859ee95ab871 Mon Sep 17 00:00:00 2001
From: Bartosz Golaszewski <bartosz.golaszewski@linaro.org>
Date: Fri, 1 Sep 2023 13:35:20 +0200
Subject: gpiolib: remove stray newline in gpio/driver.h

Fix a double newline in the GPIO provider header.

Signed-off-by: Bartosz Golaszewski <bartosz.golaszewski@linaro.org>
---
 include/linux/gpio/driver.h | 1 -
 1 file changed, 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/gpio/driver.h b/include/linux/gpio/driver.h
index 4f0c5d62c8f3..1571cfca65e7 100644
--- a/include/linux/gpio/driver.h
+++ b/include/linux/gpio/driver.h
@@ -764,7 +764,6 @@ void gpiochip_free_own_desc(struct gpio_desc *desc);
 int gpiochip_lock_as_irq(struct gpio_chip *gc, unsigned int offset);
 void gpiochip_unlock_as_irq(struct gpio_chip *gc, unsigned int offset);
 
-
 struct gpio_chip *gpiod_to_chip(const struct gpio_desc *desc);
 
 #else /* CONFIG_GPIOLIB */
-- 
cgit v1.2.3


From 37d42ab3924919652858f836a80ab49ec7d11f1e Mon Sep 17 00:00:00 2001
From: Bartosz Golaszewski <bartosz.golaszewski@linaro.org>
Date: Fri, 1 Sep 2023 13:34:58 +0200
Subject: gpiolib: remove unnecessary extern specifiers from the driver header

'extern' doesn't do anything for function declarations. Remove it.

Signed-off-by: Bartosz Golaszewski <bartosz.golaszewski@linaro.org>
Reviewed-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Reviewed-by: Linus Walleij <linus.walleij@linaro.org>
---
 include/linux/gpio/driver.h | 21 ++++++++++-----------
 1 file changed, 10 insertions(+), 11 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/gpio/driver.h b/include/linux/gpio/driver.h
index 1571cfca65e7..b721422f4bfa 100644
--- a/include/linux/gpio/driver.h
+++ b/include/linux/gpio/driver.h
@@ -529,8 +529,7 @@ struct gpio_chip {
 #endif /* CONFIG_OF_GPIO */
 };
 
-extern const char *gpiochip_is_requested(struct gpio_chip *gc,
-			unsigned int offset);
+const char *gpiochip_is_requested(struct gpio_chip *gc, unsigned int offset);
 
 /**
  * for_each_requested_gpio_in_range - iterates over requested GPIOs in a given range
@@ -549,9 +548,9 @@ extern const char *gpiochip_is_requested(struct gpio_chip *gc,
 	for_each_requested_gpio_in_range(chip, i, 0, chip->ngpio, label)
 
 /* add/remove chips */
-extern int gpiochip_add_data_with_key(struct gpio_chip *gc, void *data,
-				      struct lock_class_key *lock_key,
-				      struct lock_class_key *request_key);
+int gpiochip_add_data_with_key(struct gpio_chip *gc, void *data,
+			       struct lock_class_key *lock_key,
+			       struct lock_class_key *request_key);
 
 /**
  * gpiochip_add_data() - register a gpio_chip
@@ -599,13 +598,13 @@ static inline int gpiochip_add(struct gpio_chip *gc)
 {
 	return gpiochip_add_data(gc, NULL);
 }
-extern void gpiochip_remove(struct gpio_chip *gc);
-extern int devm_gpiochip_add_data_with_key(struct device *dev, struct gpio_chip *gc, void *data,
-					   struct lock_class_key *lock_key,
-					   struct lock_class_key *request_key);
+void gpiochip_remove(struct gpio_chip *gc);
+int devm_gpiochip_add_data_with_key(struct device *dev, struct gpio_chip *gc,
+				    void *data, struct lock_class_key *lock_key,
+				    struct lock_class_key *request_key);
 
-extern struct gpio_chip *gpiochip_find(void *data,
-			      int (*match)(struct gpio_chip *gc, void *data));
+struct gpio_chip *gpiochip_find(void *data,
+				int (*match)(struct gpio_chip *gc, void *data));
 
 bool gpiochip_line_is_irq(struct gpio_chip *gc, unsigned int offset);
 int gpiochip_reqres_irq(struct gpio_chip *gc, unsigned int offset);
-- 
cgit v1.2.3


From 2ae5c9248e06dac2c2360be26b4e25f673238337 Mon Sep 17 00:00:00 2001
From: Jeff Johnson <quic_jjohnson@quicinc.com>
Date: Thu, 31 Aug 2023 11:22:58 -0700
Subject: wifi: mac80211: Use flexible array in struct ieee80211_tim_ie

Currently struct ieee80211_tim_ie defines:
	u8 virtual_map[1];

Per the guidance in [1] change this to be a flexible array.

Per the discussion in [2] wrap the virtual_map in a union with a u8
item in order to preserve the existing expectation that the
virtual_map must contain at least one octet (at least when used in a
non-S1G PPDU). This means that no driver changes are required.

[1] https://docs.kernel.org/process/deprecated.html#zero-length-and-one-element-arrays
[2] https://lore.kernel.org/linux-wireless/202308301529.AC90A9EF98@keescook/

Suggested-by: Kees Cook <keescook@chromium.org>
Signed-off-by: Jeff Johnson <quic_jjohnson@quicinc.com>
Reviewed-by: Kees Cook <keescook@chromium.org>
Link: https://lore.kernel.org/r/20230831-ieee80211_tim_ie-v3-2-e10ff584ab5d@quicinc.com
[add wifi prefix]
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/linux/ieee80211.h | 13 ++++++++++---
 1 file changed, 10 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h
index bd2f6e19c357..340d7e0f6bf7 100644
--- a/include/linux/ieee80211.h
+++ b/include/linux/ieee80211.h
@@ -951,17 +951,24 @@ struct ieee80211_wide_bw_chansw_ie {
  * @dtim_count: DTIM Count
  * @dtim_period: DTIM Period
  * @bitmap_ctrl: Bitmap Control
+ * @required_octet: "Syntatic sugar" to force the struct size to the
+ *                  minimum valid size when carried in a non-S1G PPDU
  * @virtual_map: Partial Virtual Bitmap
  *
  * This structure represents the payload of the "TIM element" as
- * described in IEEE Std 802.11-2020 section 9.4.2.5.
+ * described in IEEE Std 802.11-2020 section 9.4.2.5. Note that this
+ * definition is only applicable when the element is carried in a
+ * non-S1G PPDU. When the TIM is carried in an S1G PPDU, the Bitmap
+ * Control and Partial Virtual Bitmap may not be present.
  */
 struct ieee80211_tim_ie {
 	u8 dtim_count;
 	u8 dtim_period;
 	u8 bitmap_ctrl;
-	/* variable size: 1 - 251 bytes */
-	u8 virtual_map[1];
+	union {
+		u8 required_octet;
+		DECLARE_FLEX_ARRAY(u8, virtual_map);
+	};
 } __packed;
 
 /**
-- 
cgit v1.2.3


From 0d423c4a78984dd02f6596d6fd9dd40446eec517 Mon Sep 17 00:00:00 2001
From: Alexey Romanov <avromanov@salutedevices.com>
Date: Wed, 30 Aug 2023 17:08:50 +0300
Subject: drivers: meson: sm: correct meson_sm_* API retval handling

1. Following the ARM SMC32 calling convention, the return value
from secure monitor is a 32-bit signed integer. This patch changes
the type of the return value of the function meson_sm_call().

2. Now, when meson_sm_call() returns a 32-bit signed integer, we need
to ensure that this value is not negative. It is important to check
that the return value is not negative in both the meson_sm_call_read()
and meson_sm_call_write() functions.

3. Add a comment explaining why it is necessary to check if the SMC
return value is equal to 0 in the function meson_sm_call_read().
It is not obvious when reading this code.

Signed-off-by: Alexey Romanov <avromanov@salutedevices.com>
Reviewed-by: Neil Armstrong <neil.armstrong@linaro.org>
Link: https://lore.kernel.org/r/20230830140850.17130-1-avromanov@salutedevices.com
Signed-off-by: Neil Armstrong <neil.armstrong@linaro.org>
---
 drivers/firmware/meson/meson_sm.c       | 20 +++++++++++++-------
 include/linux/firmware/meson/meson_sm.h |  2 +-
 2 files changed, 14 insertions(+), 8 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/firmware/meson/meson_sm.c b/drivers/firmware/meson/meson_sm.c
index 9a2656d73600..53bf56e18743 100644
--- a/drivers/firmware/meson/meson_sm.c
+++ b/drivers/firmware/meson/meson_sm.c
@@ -67,7 +67,7 @@ static u32 meson_sm_get_cmd(const struct meson_sm_chip *chip,
 	return cmd->smc_id;
 }
 
-static u32 __meson_sm_call(u32 cmd, u32 arg0, u32 arg1, u32 arg2,
+static s32 __meson_sm_call(u32 cmd, u32 arg0, u32 arg1, u32 arg2,
 			   u32 arg3, u32 arg4)
 {
 	struct arm_smccc_res res;
@@ -102,9 +102,10 @@ static void __iomem *meson_sm_map_shmem(u32 cmd_shmem, unsigned int size)
  * Return:	0 on success, a negative value on error
  */
 int meson_sm_call(struct meson_sm_firmware *fw, unsigned int cmd_index,
-		  u32 *ret, u32 arg0, u32 arg1, u32 arg2, u32 arg3, u32 arg4)
+		  s32 *ret, u32 arg0, u32 arg1, u32 arg2, u32 arg3, u32 arg4)
 {
-	u32 cmd, lret;
+	u32 cmd;
+	s32 lret;
 
 	if (!fw->chip)
 		return -ENOENT;
@@ -143,7 +144,7 @@ int meson_sm_call_read(struct meson_sm_firmware *fw, void *buffer,
 		       unsigned int bsize, unsigned int cmd_index, u32 arg0,
 		       u32 arg1, u32 arg2, u32 arg3, u32 arg4)
 {
-	u32 size;
+	s32 size;
 	int ret;
 
 	if (!fw->chip)
@@ -158,11 +159,16 @@ int meson_sm_call_read(struct meson_sm_firmware *fw, void *buffer,
 	if (meson_sm_call(fw, cmd_index, &size, arg0, arg1, arg2, arg3, arg4) < 0)
 		return -EINVAL;
 
-	if (size > bsize)
+	if (size < 0 || size > bsize)
 		return -EINVAL;
 
 	ret = size;
 
+	/* In some cases (for example GET_CHIP_ID command),
+	 * SMC doesn't return the number of bytes read, even
+	 * though the bytes were actually read into sm_shmem_out.
+	 * So this check is needed.
+	 */
 	if (!size)
 		size = bsize;
 
@@ -192,7 +198,7 @@ int meson_sm_call_write(struct meson_sm_firmware *fw, void *buffer,
 			unsigned int size, unsigned int cmd_index, u32 arg0,
 			u32 arg1, u32 arg2, u32 arg3, u32 arg4)
 {
-	u32 written;
+	s32 written;
 
 	if (!fw->chip)
 		return -ENOENT;
@@ -208,7 +214,7 @@ int meson_sm_call_write(struct meson_sm_firmware *fw, void *buffer,
 	if (meson_sm_call(fw, cmd_index, &written, arg0, arg1, arg2, arg3, arg4) < 0)
 		return -EINVAL;
 
-	if (!written)
+	if (written <= 0 || written > size)
 		return -EINVAL;
 
 	return written;
diff --git a/include/linux/firmware/meson/meson_sm.h b/include/linux/firmware/meson/meson_sm.h
index 95b0da2326a9..8eaf8922ab02 100644
--- a/include/linux/firmware/meson/meson_sm.h
+++ b/include/linux/firmware/meson/meson_sm.h
@@ -19,7 +19,7 @@ enum {
 struct meson_sm_firmware;
 
 int meson_sm_call(struct meson_sm_firmware *fw, unsigned int cmd_index,
-		  u32 *ret, u32 arg0, u32 arg1, u32 arg2, u32 arg3, u32 arg4);
+		  s32 *ret, u32 arg0, u32 arg1, u32 arg2, u32 arg3, u32 arg4);
 int meson_sm_call_write(struct meson_sm_firmware *fw, void *buffer,
 			unsigned int b_size, unsigned int cmd_index, u32 arg0,
 			u32 arg1, u32 arg2, u32 arg3, u32 arg4);
-- 
cgit v1.2.3


From 3e15dcf77b23b8e9b9b7f3c0d4def8fe9c12c534 Mon Sep 17 00:00:00 2001
From: Amir Goldstein <amir73il@gmail.com>
Date: Fri, 8 Sep 2023 16:28:59 +0300
Subject: fs: rename __mnt_{want,drop}_write*() helpers

Before exporting these helpers to modules, make their names more
meaningful.

The names mnt_{get,put)_write_access*() were chosen, because they rhyme
with the inode {get,put)_write_access() helpers, which have a very close
meaning for the inode object.

Suggested-by: Christian Brauner <brauner@kernel.org>
Link: https://lore.kernel.org/r/20230817-anfechtbar-ruhelosigkeit-8c6cca8443fc@brauner/
Signed-off-by: Amir Goldstein <amir73il@gmail.com>
Message-Id: <20230908132900.2983519-2-amir73il@gmail.com>
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 fs/inode.c            |  8 ++++----
 fs/internal.h         | 12 ++++++------
 fs/namespace.c        | 34 +++++++++++++++++-----------------
 fs/open.c             |  2 +-
 include/linux/mount.h |  4 ++--
 kernel/acct.c         |  4 ++--
 6 files changed, 32 insertions(+), 32 deletions(-)

(limited to 'include/linux')

diff --git a/fs/inode.c b/fs/inode.c
index 35fd688168c5..7febdc9fd1a9 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -2006,7 +2006,7 @@ void touch_atime(const struct path *path)
 	if (!sb_start_write_trylock(inode->i_sb))
 		return;
 
-	if (__mnt_want_write(mnt) != 0)
+	if (mnt_get_write_access(mnt) != 0)
 		goto skip_update;
 	/*
 	 * File systems can error out when updating inodes if they need to
@@ -2018,7 +2018,7 @@ void touch_atime(const struct path *path)
 	 * of the fs read only, e.g. subvolumes in Btrfs.
 	 */
 	inode_update_time(inode, S_ATIME);
-	__mnt_drop_write(mnt);
+	mnt_put_write_access(mnt);
 skip_update:
 	sb_end_write(inode->i_sb);
 }
@@ -2173,9 +2173,9 @@ static int __file_update_time(struct file *file, int sync_mode)
 	struct inode *inode = file_inode(file);
 
 	/* try to update time settings */
-	if (!__mnt_want_write_file(file)) {
+	if (!mnt_get_write_access_file(file)) {
 		ret = inode_update_time(inode, sync_mode);
-		__mnt_drop_write_file(file);
+		mnt_put_write_access_file(file);
 	}
 
 	return ret;
diff --git a/fs/internal.h b/fs/internal.h
index d64ae03998cc..8260c738980c 100644
--- a/fs/internal.h
+++ b/fs/internal.h
@@ -73,8 +73,8 @@ extern int sb_prepare_remount_readonly(struct super_block *);
 
 extern void __init mnt_init(void);
 
-extern int __mnt_want_write_file(struct file *);
-extern void __mnt_drop_write_file(struct file *);
+int mnt_get_write_access_file(struct file *file);
+void mnt_put_write_access_file(struct file *file);
 
 extern void dissolve_on_fput(struct vfsmount *);
 extern bool may_mount(void);
@@ -101,7 +101,7 @@ static inline void put_file_access(struct file *file)
 		i_readcount_dec(file->f_inode);
 	} else if (file->f_mode & FMODE_WRITER) {
 		put_write_access(file->f_inode);
-		__mnt_drop_write(file->f_path.mnt);
+		mnt_put_write_access(file->f_path.mnt);
 	}
 }
 
@@ -130,9 +130,9 @@ static inline void sb_start_ro_state_change(struct super_block *sb)
 	 * mnt_is_readonly() making sure if mnt_is_readonly() sees SB_RDONLY
 	 * cleared, it will see s_readonly_remount set.
 	 * For RW->RO transition, the barrier pairs with the barrier in
-	 * __mnt_want_write() before the mnt_is_readonly() check. The barrier
-	 * makes sure if __mnt_want_write() sees MNT_WRITE_HOLD already
-	 * cleared, it will see s_readonly_remount set.
+	 * mnt_get_write_access() before the mnt_is_readonly() check.
+	 * The barrier makes sure if mnt_get_write_access() sees MNT_WRITE_HOLD
+	 * already cleared, it will see s_readonly_remount set.
 	 */
 	smp_wmb();
 }
diff --git a/fs/namespace.c b/fs/namespace.c
index e157efc54023..3fe7c0484e6a 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -330,16 +330,16 @@ static int mnt_is_readonly(struct vfsmount *mnt)
  * can determine when writes are able to occur to a filesystem.
  */
 /**
- * __mnt_want_write - get write access to a mount without freeze protection
+ * mnt_get_write_access - get write access to a mount without freeze protection
  * @m: the mount on which to take a write
  *
  * This tells the low-level filesystem that a write is about to be performed to
  * it, and makes sure that writes are allowed (mnt it read-write) before
  * returning success. This operation does not protect against filesystem being
- * frozen. When the write operation is finished, __mnt_drop_write() must be
+ * frozen. When the write operation is finished, mnt_put_write_access() must be
  * called. This is effectively a refcount.
  */
-int __mnt_want_write(struct vfsmount *m)
+int mnt_get_write_access(struct vfsmount *m)
 {
 	struct mount *mnt = real_mount(m);
 	int ret = 0;
@@ -401,7 +401,7 @@ int mnt_want_write(struct vfsmount *m)
 	int ret;
 
 	sb_start_write(m->mnt_sb);
-	ret = __mnt_want_write(m);
+	ret = mnt_get_write_access(m);
 	if (ret)
 		sb_end_write(m->mnt_sb);
 	return ret;
@@ -409,15 +409,15 @@ int mnt_want_write(struct vfsmount *m)
 EXPORT_SYMBOL_GPL(mnt_want_write);
 
 /**
- * __mnt_want_write_file - get write access to a file's mount
+ * mnt_get_write_access_file - get write access to a file's mount
  * @file: the file who's mount on which to take a write
  *
- * This is like __mnt_want_write, but if the file is already open for writing it
+ * This is like mnt_get_write_access, but if @file is already open for write it
  * skips incrementing mnt_writers (since the open file already has a reference)
  * and instead only does the check for emergency r/o remounts.  This must be
- * paired with __mnt_drop_write_file.
+ * paired with mnt_put_write_access_file.
  */
-int __mnt_want_write_file(struct file *file)
+int mnt_get_write_access_file(struct file *file)
 {
 	if (file->f_mode & FMODE_WRITER) {
 		/*
@@ -428,7 +428,7 @@ int __mnt_want_write_file(struct file *file)
 			return -EROFS;
 		return 0;
 	}
-	return __mnt_want_write(file->f_path.mnt);
+	return mnt_get_write_access(file->f_path.mnt);
 }
 
 /**
@@ -445,7 +445,7 @@ int mnt_want_write_file(struct file *file)
 	int ret;
 
 	sb_start_write(file_inode(file)->i_sb);
-	ret = __mnt_want_write_file(file);
+	ret = mnt_get_write_access_file(file);
 	if (ret)
 		sb_end_write(file_inode(file)->i_sb);
 	return ret;
@@ -453,14 +453,14 @@ int mnt_want_write_file(struct file *file)
 EXPORT_SYMBOL_GPL(mnt_want_write_file);
 
 /**
- * __mnt_drop_write - give up write access to a mount
+ * mnt_put_write_access - give up write access to a mount
  * @mnt: the mount on which to give up write access
  *
  * Tells the low-level filesystem that we are done
  * performing writes to it.  Must be matched with
- * __mnt_want_write() call above.
+ * mnt_get_write_access() call above.
  */
-void __mnt_drop_write(struct vfsmount *mnt)
+void mnt_put_write_access(struct vfsmount *mnt)
 {
 	preempt_disable();
 	mnt_dec_writers(real_mount(mnt));
@@ -477,20 +477,20 @@ void __mnt_drop_write(struct vfsmount *mnt)
  */
 void mnt_drop_write(struct vfsmount *mnt)
 {
-	__mnt_drop_write(mnt);
+	mnt_put_write_access(mnt);
 	sb_end_write(mnt->mnt_sb);
 }
 EXPORT_SYMBOL_GPL(mnt_drop_write);
 
-void __mnt_drop_write_file(struct file *file)
+void mnt_put_write_access_file(struct file *file)
 {
 	if (!(file->f_mode & FMODE_WRITER))
-		__mnt_drop_write(file->f_path.mnt);
+		mnt_put_write_access(file->f_path.mnt);
 }
 
 void mnt_drop_write_file(struct file *file)
 {
-	__mnt_drop_write_file(file);
+	mnt_put_write_access_file(file);
 	sb_end_write(file_inode(file)->i_sb);
 }
 EXPORT_SYMBOL(mnt_drop_write_file);
diff --git a/fs/open.c b/fs/open.c
index 98f6601fbac6..a65ce47810cf 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -895,7 +895,7 @@ static int do_dentry_open(struct file *f,
 		error = get_write_access(inode);
 		if (unlikely(error))
 			goto cleanup_file;
-		error = __mnt_want_write(f->f_path.mnt);
+		error = mnt_get_write_access(f->f_path.mnt);
 		if (unlikely(error)) {
 			put_write_access(inode);
 			goto cleanup_file;
diff --git a/include/linux/mount.h b/include/linux/mount.h
index 4f40b40306d0..ac3dd2876197 100644
--- a/include/linux/mount.h
+++ b/include/linux/mount.h
@@ -92,8 +92,8 @@ extern bool __mnt_is_readonly(struct vfsmount *mnt);
 extern bool mnt_may_suid(struct vfsmount *mnt);
 
 extern struct vfsmount *clone_private_mount(const struct path *path);
-extern int __mnt_want_write(struct vfsmount *);
-extern void __mnt_drop_write(struct vfsmount *);
+int mnt_get_write_access(struct vfsmount *mnt);
+void mnt_put_write_access(struct vfsmount *mnt);
 
 extern struct vfsmount *fc_mount(struct fs_context *fc);
 extern struct vfsmount *vfs_create_mount(struct fs_context *fc);
diff --git a/kernel/acct.c b/kernel/acct.c
index 1a9f929fe629..986c8214dabf 100644
--- a/kernel/acct.c
+++ b/kernel/acct.c
@@ -246,7 +246,7 @@ static int acct_on(struct filename *pathname)
 		filp_close(file, NULL);
 		return PTR_ERR(internal);
 	}
-	err = __mnt_want_write(internal);
+	err = mnt_get_write_access(internal);
 	if (err) {
 		mntput(internal);
 		kfree(acct);
@@ -271,7 +271,7 @@ static int acct_on(struct filename *pathname)
 	old = xchg(&ns->bacct, &acct->pin);
 	mutex_unlock(&acct->lock);
 	pin_kill(old);
-	__mnt_drop_write(mnt);
+	mnt_put_write_access(mnt);
 	mntput(mnt);
 	return 0;
 }
-- 
cgit v1.2.3


From fa671e4f1556e2c18e5443f777a75ae041290068 Mon Sep 17 00:00:00 2001
From: Thomas Zimmermann <tzimmermann@suse.de>
Date: Thu, 7 Sep 2023 10:52:03 +0200
Subject: fbdev/core: Unexport logo helpers

The interfaces for the fbdev logo are not used outside of the fbdev
module. Hence declare the fbdev logo functions in the internal header
file and remove their symbol exports. Only build the functions if
CONFIG_LOGO has been selected.

Signed-off-by: Thomas Zimmermann <tzimmermann@suse.de>
Acked-by: Javier Martinez Canillas <javierm@redhat.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20230907085408.9354-5-tzimmermann@suse.de
---
 drivers/video/fbdev/core/fb_internal.h | 16 ++++++++++++++++
 drivers/video/fbdev/core/fbmem.c       |  5 -----
 include/linux/fb.h                     |  5 -----
 3 files changed, 16 insertions(+), 10 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/video/fbdev/core/fb_internal.h b/drivers/video/fbdev/core/fb_internal.h
index 4c8d509a0026..1116faefa034 100644
--- a/drivers/video/fbdev/core/fb_internal.h
+++ b/drivers/video/fbdev/core/fb_internal.h
@@ -21,6 +21,22 @@ static inline void fb_unregister_chrdev(void)
 #endif
 
 /* fbmem.c */
+#if defined(CONFIG_LOGO)
+extern bool fb_center_logo;
+extern int fb_logo_count;
+int fb_prepare_logo(struct fb_info *fb_info, int rotate);
+int fb_show_logo(struct fb_info *fb_info, int rotate);
+#else
+static inline int fb_prepare_logo(struct fb_info *info, int rotate)
+{
+	return 0;
+}
+static inline int fb_show_logo(struct fb_info *info, int rotate)
+{
+	return 0;
+}
+#endif /* CONFIG_LOGO */
+
 extern struct class *fb_class;
 extern struct mutex registration_lock;
 extern struct fb_info *registered_fb[FB_MAX];
diff --git a/drivers/video/fbdev/core/fbmem.c b/drivers/video/fbdev/core/fbmem.c
index 98e1847e4287..ee25ac38737d 100644
--- a/drivers/video/fbdev/core/fbmem.c
+++ b/drivers/video/fbdev/core/fbmem.c
@@ -696,12 +696,7 @@ int fb_show_logo(struct fb_info *info, int rotate)
 
 	return y;
 }
-#else
-int fb_prepare_logo(struct fb_info *info, int rotate) { return 0; }
-int fb_show_logo(struct fb_info *info, int rotate) { return 0; }
 #endif /* CONFIG_LOGO */
-EXPORT_SYMBOL(fb_prepare_logo);
-EXPORT_SYMBOL(fb_show_logo);
 
 int
 fb_pan_display(struct fb_info *info, struct fb_var_screeninfo *var)
diff --git a/include/linux/fb.h b/include/linux/fb.h
index 16c3e6d6c55d..d110676c9c83 100644
--- a/include/linux/fb.h
+++ b/include/linux/fb.h
@@ -591,8 +591,6 @@ extern ssize_t fb_sys_write(struct fb_info *info, const char __user *buf,
 /* drivers/video/fbmem.c */
 extern int register_framebuffer(struct fb_info *fb_info);
 extern void unregister_framebuffer(struct fb_info *fb_info);
-extern int fb_prepare_logo(struct fb_info *fb_info, int rotate);
-extern int fb_show_logo(struct fb_info *fb_info, int rotate);
 extern char* fb_get_buffer_offset(struct fb_info *info, struct fb_pixmap *buf, u32 size);
 extern void fb_pad_unaligned_buffer(u8 *dst, u32 d_pitch, u8 *src, u32 idx,
 				u32 height, u32 shift_high, u32 shift_low, u32 mod);
@@ -603,9 +601,6 @@ extern int fb_get_color_depth(struct fb_var_screeninfo *var,
 extern int fb_get_options(const char *name, char **option);
 extern int fb_new_modelist(struct fb_info *info);
 
-extern bool fb_center_logo;
-extern int fb_logo_count;
-
 static inline void lock_fb_info(struct fb_info *info)
 {
 	mutex_lock(&info->lock);
-- 
cgit v1.2.3


From ebc7abb35b258152d4a424f89d7c03db1d7ce61c Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rafael.j.wysocki@intel.com>
Date: Thu, 7 Sep 2023 20:18:56 +0200
Subject: thermal: Constify the trip argument of the .get_trend() zone callback

Add 'const' to the definition of the 'trip' argument of the
.get_trend() thermal zone callback to indicate that the trip point
passed to it should not be modified by it and adjust the
callback functions implementing it, thermal_get_trend() in the
ACPI thermal driver and __ti_thermal_get_trend(), accordingly.

No intentional functional impact.

Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Reviewed-by: Michal Wilczynski <michal.wilczynski@intel.com>
---
 drivers/acpi/thermal.c                             | 2 +-
 drivers/thermal/ti-soc-thermal/ti-thermal-common.c | 3 ++-
 include/linux/thermal.h                            | 4 ++--
 3 files changed, 5 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/acpi/thermal.c b/drivers/acpi/thermal.c
index f14e68266ccd..312730f8272e 100644
--- a/drivers/acpi/thermal.c
+++ b/drivers/acpi/thermal.c
@@ -492,7 +492,7 @@ static int thermal_get_temp(struct thermal_zone_device *thermal, int *temp)
 }
 
 static int thermal_get_trend(struct thermal_zone_device *thermal,
-			     struct thermal_trip *trip,
+			     const struct thermal_trip *trip,
 			     enum thermal_trend *trend)
 {
 	struct acpi_thermal *tz = thermal_zone_device_priv(thermal);
diff --git a/drivers/thermal/ti-soc-thermal/ti-thermal-common.c b/drivers/thermal/ti-soc-thermal/ti-thermal-common.c
index 6ba2613627e1..0cf0826b805a 100644
--- a/drivers/thermal/ti-soc-thermal/ti-thermal-common.c
+++ b/drivers/thermal/ti-soc-thermal/ti-thermal-common.c
@@ -110,7 +110,8 @@ static inline int __ti_thermal_get_temp(struct thermal_zone_device *tz, int *tem
 }
 
 static int __ti_thermal_get_trend(struct thermal_zone_device *tz,
-				  struct thermal_trip *trip, enum thermal_trend *trend)
+				  const struct thermal_trip *trip,
+				  enum thermal_trend *trend)
 {
 	struct ti_thermal_data *data = thermal_zone_device_priv(tz);
 	struct ti_bandgap *bgp;
diff --git a/include/linux/thermal.h b/include/linux/thermal.h
index c99440aac1a1..a5ae4af955ff 100644
--- a/include/linux/thermal.h
+++ b/include/linux/thermal.h
@@ -80,8 +80,8 @@ struct thermal_zone_device_ops {
 	int (*set_trip_hyst) (struct thermal_zone_device *, int, int);
 	int (*get_crit_temp) (struct thermal_zone_device *, int *);
 	int (*set_emul_temp) (struct thermal_zone_device *, int);
-	int (*get_trend) (struct thermal_zone_device *, struct thermal_trip *,
-			  enum thermal_trend *);
+	int (*get_trend) (struct thermal_zone_device *,
+			  const struct thermal_trip *, enum thermal_trend *);
 	void (*hot)(struct thermal_zone_device *);
 	void (*critical)(struct thermal_zone_device *);
 };
-- 
cgit v1.2.3


From ef7d9593390a050c50eba5fc02d2cb65a1104434 Mon Sep 17 00:00:00 2001
From: "Darrick J. Wong" <djwong@kernel.org>
Date: Mon, 11 Sep 2023 08:39:04 -0700
Subject: xfs: remove CPU hotplug infrastructure

There are no users of the cpu hotplug hooks in xfs now, so remove it.
This reverts f1653c2e2831e ("xfs: introduce CPU hotplug
infrastructure").

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
Reviewed-by: Dave Chinner <dchinner@redhat.com>
---
 fs/xfs/xfs_super.c         | 42 +-----------------------------------------
 include/linux/cpuhotplug.h |  1 -
 2 files changed, 1 insertion(+), 42 deletions(-)

(limited to 'include/linux')

diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index 5cced7713cd2..c8a2dae1dd65 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -2294,39 +2294,6 @@ xfs_destroy_workqueues(void)
 	destroy_workqueue(xfs_alloc_wq);
 }
 
-#ifdef CONFIG_HOTPLUG_CPU
-static int
-xfs_cpu_dead(
-	unsigned int		cpu)
-{
-	return 0;
-}
-
-static int __init
-xfs_cpu_hotplug_init(void)
-{
-	int	error;
-
-	error = cpuhp_setup_state_nocalls(CPUHP_XFS_DEAD, "xfs:dead", NULL,
-			xfs_cpu_dead);
-	if (error < 0)
-		xfs_alert(NULL,
-"Failed to initialise CPU hotplug, error %d. XFS is non-functional.",
-			error);
-	return error;
-}
-
-static void
-xfs_cpu_hotplug_destroy(void)
-{
-	cpuhp_remove_state_nocalls(CPUHP_XFS_DEAD);
-}
-
-#else /* !CONFIG_HOTPLUG_CPU */
-static inline int xfs_cpu_hotplug_init(void) { return 0; }
-static inline void xfs_cpu_hotplug_destroy(void) {}
-#endif
-
 STATIC int __init
 init_xfs_fs(void)
 {
@@ -2343,13 +2310,9 @@ init_xfs_fs(void)
 
 	xfs_dir_startup();
 
-	error = xfs_cpu_hotplug_init();
-	if (error)
-		goto out;
-
 	error = xfs_init_caches();
 	if (error)
-		goto out_destroy_hp;
+		goto out;
 
 	error = xfs_init_workqueues();
 	if (error)
@@ -2433,8 +2396,6 @@ init_xfs_fs(void)
 	xfs_destroy_workqueues();
  out_destroy_caches:
 	xfs_destroy_caches();
- out_destroy_hp:
-	xfs_cpu_hotplug_destroy();
  out:
 	return error;
 }
@@ -2458,7 +2419,6 @@ exit_xfs_fs(void)
 	xfs_destroy_workqueues();
 	xfs_destroy_caches();
 	xfs_uuid_table_free();
-	xfs_cpu_hotplug_destroy();
 }
 
 module_init(init_xfs_fs);
diff --git a/include/linux/cpuhotplug.h b/include/linux/cpuhotplug.h
index 06dda85f0424..068f7738be22 100644
--- a/include/linux/cpuhotplug.h
+++ b/include/linux/cpuhotplug.h
@@ -90,7 +90,6 @@ enum cpuhp_state {
 	CPUHP_FS_BUFF_DEAD,
 	CPUHP_PRINTK_DEAD,
 	CPUHP_MM_MEMCQ_DEAD,
-	CPUHP_XFS_DEAD,
 	CPUHP_PERCPU_CNT_DEAD,
 	CPUHP_RADIX_DEAD,
 	CPUHP_PAGE_ALLOC,
-- 
cgit v1.2.3


From 5b404fdabacf4bee92d8c66013402a85f18a6a10 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@kernel.org>
Date: Tue, 15 Aug 2023 15:46:32 -0700
Subject: rcu: Add RCU CPU stall notifier

It is sometimes helpful to have a way for the subsystem causing
the stall to dump its state when an RCU CPU stall occurs.  This
commit therefore bases rcu_stall_chain_notifier_register() and
rcu_stall_chain_notifier_unregister() on atomic notifiers in order to
provide this functionality.

Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
Cc: Steven Rostedt <rostedt@goodmis.org>
Signed-off-by: Frederic Weisbecker <frederic@kernel.org>
---
 include/linux/rcu_notifier.h | 32 ++++++++++++++++++++++++
 kernel/rcu/rcu.h             |  6 +++++
 kernel/rcu/tree_exp.h        |  6 ++++-
 kernel/rcu/tree_stall.h      | 59 +++++++++++++++++++++++++++++++++++++++++++-
 4 files changed, 101 insertions(+), 2 deletions(-)
 create mode 100644 include/linux/rcu_notifier.h

(limited to 'include/linux')

diff --git a/include/linux/rcu_notifier.h b/include/linux/rcu_notifier.h
new file mode 100644
index 000000000000..ebf371364581
--- /dev/null
+++ b/include/linux/rcu_notifier.h
@@ -0,0 +1,32 @@
+/* SPDX-License-Identifier: GPL-2.0+ */
+/*
+ * Read-Copy Update notifiers, initially RCU CPU stall notifier.
+ * Separate from rcupdate.h to avoid #include loops.
+ *
+ * Copyright (C) 2023 Paul E. McKenney.
+ */
+
+#ifndef __LINUX_RCU_NOTIFIER_H
+#define __LINUX_RCU_NOTIFIER_H
+
+// Actions for RCU CPU stall notifier calls.
+#define RCU_STALL_NOTIFY_NORM	1
+#define RCU_STALL_NOTIFY_EXP	2
+
+#ifdef CONFIG_RCU_STALL_COMMON
+
+#include <linux/notifier.h>
+#include <linux/types.h>
+
+int rcu_stall_chain_notifier_register(struct notifier_block *n);
+int rcu_stall_chain_notifier_unregister(struct notifier_block *n);
+
+#else // #ifdef CONFIG_RCU_STALL_COMMON
+
+// No RCU CPU stall warnings in Tiny RCU.
+static inline int rcu_stall_chain_notifier_register(struct notifier_block *n) { return -EEXIST; }
+static inline int rcu_stall_chain_notifier_unregister(struct notifier_block *n) { return -ENOENT; }
+
+#endif // #else // #ifdef CONFIG_RCU_STALL_COMMON
+
+#endif /* __LINUX_RCU_NOTIFIER_H */
diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h
index 98e13be411af..ef3bab977407 100644
--- a/kernel/rcu/rcu.h
+++ b/kernel/rcu/rcu.h
@@ -654,4 +654,10 @@ static inline bool rcu_cpu_beenfullyonline(int cpu) { return true; }
 bool rcu_cpu_beenfullyonline(int cpu);
 #endif
 
+#ifdef CONFIG_RCU_STALL_COMMON
+int rcu_stall_notifier_call_chain(unsigned long val, void *v);
+#else // #ifdef CONFIG_RCU_STALL_COMMON
+static inline int rcu_stall_notifier_call_chain(unsigned long val, void *v) { return NOTIFY_DONE; }
+#endif // #else // #ifdef CONFIG_RCU_STALL_COMMON
+
 #endif /* __LINUX_RCU_H */
diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h
index 8239b39d945b..6d7cea5d591f 100644
--- a/kernel/rcu/tree_exp.h
+++ b/kernel/rcu/tree_exp.h
@@ -621,10 +621,14 @@ static void synchronize_rcu_expedited_wait(void)
 	}
 
 	for (;;) {
+		unsigned long j;
+
 		if (synchronize_rcu_expedited_wait_once(jiffies_stall))
 			return;
 		if (rcu_stall_is_suppressed())
 			continue;
+		j = jiffies;
+		rcu_stall_notifier_call_chain(RCU_STALL_NOTIFY_EXP, (void *)(j - jiffies_start));
 		trace_rcu_stall_warning(rcu_state.name, TPS("ExpeditedStall"));
 		pr_err("INFO: %s detected expedited stalls on CPUs/tasks: {",
 		       rcu_state.name);
@@ -647,7 +651,7 @@ static void synchronize_rcu_expedited_wait(void)
 			}
 		}
 		pr_cont(" } %lu jiffies s: %lu root: %#lx/%c\n",
-			jiffies - jiffies_start, rcu_state.expedited_sequence,
+			j - jiffies_start, rcu_state.expedited_sequence,
 			data_race(rnp_root->expmask),
 			".T"[!!data_race(rnp_root->exp_tasks)]);
 		if (ndetected) {
diff --git a/kernel/rcu/tree_stall.h b/kernel/rcu/tree_stall.h
index 2443d1d4a6dc..49544f932279 100644
--- a/kernel/rcu/tree_stall.h
+++ b/kernel/rcu/tree_stall.h
@@ -8,6 +8,7 @@
  */
 
 #include <linux/kvm_para.h>
+#include <linux/rcu_notifier.h>
 
 //////////////////////////////////////////////////////////////////////////////
 //
@@ -770,6 +771,7 @@ static void check_cpu_stall(struct rcu_data *rdp)
 		if (kvm_check_and_clear_guest_paused())
 			return;
 
+		rcu_stall_notifier_call_chain(RCU_STALL_NOTIFY_NORM, (void *)j - gps);
 		if (self_detected) {
 			/* We haven't checked in, so go dump stack. */
 			print_cpu_stall(gps);
@@ -790,7 +792,7 @@ static void check_cpu_stall(struct rcu_data *rdp)
 
 //////////////////////////////////////////////////////////////////////////////
 //
-// RCU forward-progress mechanisms, including of callback invocation.
+// RCU forward-progress mechanisms, including for callback invocation.
 
 
 /*
@@ -1042,3 +1044,58 @@ static int __init rcu_sysrq_init(void)
 	return 0;
 }
 early_initcall(rcu_sysrq_init);
+
+
+//////////////////////////////////////////////////////////////////////////////
+//
+// RCU CPU stall-warning notifiers
+
+static ATOMIC_NOTIFIER_HEAD(rcu_cpu_stall_notifier_list);
+
+/**
+ * rcu_stall_chain_notifier_register - Add an RCU CPU stall notifier
+ * @n: Entry to add.
+ *
+ * Adds an RCU CPU stall notifier to an atomic notifier chain.
+ * The @action passed to a notifier will be @RCU_STALL_NOTIFY_NORM or
+ * friends.  The @data will be the duration of the stalled grace period,
+ * in jiffies, coerced to a void* pointer.
+ *
+ * Returns 0 on success, %-EEXIST on error.
+ */
+int rcu_stall_chain_notifier_register(struct notifier_block *n)
+{
+	return atomic_notifier_chain_register(&rcu_cpu_stall_notifier_list, n);
+}
+EXPORT_SYMBOL_GPL(rcu_stall_chain_notifier_register);
+
+/**
+ * rcu_stall_chain_notifier_unregister - Remove an RCU CPU stall notifier
+ * @n: Entry to add.
+ *
+ * Removes an RCU CPU stall notifier from an atomic notifier chain.
+ *
+ * Returns zero on success, %-ENOENT on failure.
+ */
+int rcu_stall_chain_notifier_unregister(struct notifier_block *n)
+{
+	return atomic_notifier_chain_unregister(&rcu_cpu_stall_notifier_list, n);
+}
+EXPORT_SYMBOL_GPL(rcu_stall_chain_notifier_unregister);
+
+/*
+ * rcu_stall_notifier_call_chain - Call functions in an RCU CPU stall notifier chain
+ * @val: Value passed unmodified to notifier function
+ * @v: Pointer passed unmodified to notifier function
+ *
+ * Calls each function in the RCU CPU stall notifier chain in turn, which
+ * is an atomic call chain.  See atomic_notifier_call_chain() for more
+ * information.
+ *
+ * This is for use within RCU, hence the omission of the extra asterisk
+ * to indicate a non-kerneldoc format header comment.
+ */
+int rcu_stall_notifier_call_chain(unsigned long val, void *v)
+{
+	return atomic_notifier_call_chain(&rcu_cpu_stall_notifier_list, val, v);
+}
-- 
cgit v1.2.3


From fc52a64416b010c8324e2cb50070faae868521c1 Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (Google)" <rostedt@goodmis.org>
Date: Fri, 8 Sep 2023 16:39:29 -0400
Subject: tracing/synthetic: Fix order of struct trace_dynamic_info

To make handling BIG and LITTLE endian better the offset/len of dynamic
fields of the synthetic events was changed into a structure of:

 struct trace_dynamic_info {
 #ifdef CONFIG_CPU_BIG_ENDIAN
	u16	offset;
	u16	len;
 #else
	u16	len;
	u16	offset;
 #endif
 };

to replace the manual changes of:

 data_offset = offset & 0xffff;
 data_offest = len << 16;

But if you look closely, the above is:

  <len> << 16 | offset

Which in little endian would be in memory:

 offset_lo offset_hi len_lo len_hi

and in big endian:

 len_hi len_lo offset_hi offset_lo

Which if broken into a structure would be:

 struct trace_dynamic_info {
 #ifdef CONFIG_CPU_BIG_ENDIAN
	u16	len;
	u16	offset;
 #else
	u16	offset;
	u16	len;
 #endif
 };

Which is the opposite of what was defined.

Fix this and just to be safe also add "__packed".

Link: https://lore.kernel.org/all/20230908154417.5172e343@gandalf.local.home/
Link: https://lore.kernel.org/linux-trace-kernel/20230908163929.2c25f3dc@gandalf.local.home

Cc: stable@vger.kernel.org
Cc: Mark Rutland <mark.rutland@arm.com>
Tested-by: Sven Schnelle <svens@linux.ibm.com>
Acked-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>
Fixes: ddeea494a16f3 ("tracing/synthetic: Use union instead of casts")
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
 include/linux/trace_events.h | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h
index 12f875e9e69a..21ae37e49319 100644
--- a/include/linux/trace_events.h
+++ b/include/linux/trace_events.h
@@ -62,13 +62,13 @@ void trace_event_printf(struct trace_iterator *iter, const char *fmt, ...);
 /* Used to find the offset and length of dynamic fields in trace events */
 struct trace_dynamic_info {
 #ifdef CONFIG_CPU_BIG_ENDIAN
-	u16	offset;
 	u16	len;
+	u16	offset;
 #else
-	u16	len;
 	u16	offset;
+	u16	len;
 #endif
-};
+} __packed;
 
 /*
  * The trace entry - the most basic unit of tracing. This is what
-- 
cgit v1.2.3


From 49f776724e64c27dd861e7ac8da9d42f01d9d172 Mon Sep 17 00:00:00 2001
From: Smita Koralahalli <Smita.KoralahalliChannabasappa@amd.com>
Date: Wed, 23 Aug 2023 23:43:04 +0000
Subject: PCI/AER: Export pcie_aer_is_native()

Export and move the declaration of pcie_aer_is_native() to a common header
file to be reused by cxl/pci module.

Signed-off-by: Smita Koralahalli <Smita.KoralahalliChannabasappa@amd.com>
Acked-by: Bjorn Helgaas <bhelgaas@google.com>
Reviewed-by: Kuppuswamy Sathyanarayanan <sathyanarayanan.kuppuswamy@linux.intel.com>
Reviewed-by: Robert Richter <rrichter@amd.com>
Reviewed-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
Reviewed-by: Dave Jiang <dave.jiang@intel.com>
Link: https://lore.kernel.org/r/20230823234305.27333-3-Smita.KoralahalliChannabasappa@amd.com
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 drivers/pci/pcie/aer.c     | 1 +
 drivers/pci/pcie/portdrv.h | 2 --
 include/linux/aer.h        | 2 ++
 3 files changed, 3 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/pci/pcie/aer.c b/drivers/pci/pcie/aer.c
index e85ff946e8c8..9c8fd69ae5ad 100644
--- a/drivers/pci/pcie/aer.c
+++ b/drivers/pci/pcie/aer.c
@@ -229,6 +229,7 @@ int pcie_aer_is_native(struct pci_dev *dev)
 
 	return pcie_ports_native || host->native_aer;
 }
+EXPORT_SYMBOL_NS_GPL(pcie_aer_is_native, CXL);
 
 static int pci_enable_pcie_error_reporting(struct pci_dev *dev)
 {
diff --git a/drivers/pci/pcie/portdrv.h b/drivers/pci/pcie/portdrv.h
index 58a2b1a1cae4..1f3803bde7ee 100644
--- a/drivers/pci/pcie/portdrv.h
+++ b/drivers/pci/pcie/portdrv.h
@@ -29,10 +29,8 @@ extern bool pcie_ports_dpc_native;
 
 #ifdef CONFIG_PCIEAER
 int pcie_aer_init(void);
-int pcie_aer_is_native(struct pci_dev *dev);
 #else
 static inline int pcie_aer_init(void) { return 0; }
-static inline int pcie_aer_is_native(struct pci_dev *dev) { return 0; }
 #endif
 
 #ifdef CONFIG_HOTPLUG_PCI_PCIE
diff --git a/include/linux/aer.h b/include/linux/aer.h
index 2dd175f5debd..29cc10220952 100644
--- a/include/linux/aer.h
+++ b/include/linux/aer.h
@@ -42,11 +42,13 @@ struct aer_capability_regs {
 
 #if defined(CONFIG_PCIEAER)
 int pci_aer_clear_nonfatal_status(struct pci_dev *dev);
+int pcie_aer_is_native(struct pci_dev *dev);
 #else
 static inline int pci_aer_clear_nonfatal_status(struct pci_dev *dev)
 {
 	return -EINVAL;
 }
+static inline int pcie_aer_is_native(struct pci_dev *dev) { return 0; }
 #endif
 
 void cper_print_aer(struct pci_dev *dev, int aer_severity,
-- 
cgit v1.2.3


From 2b69987be575b92adb6c177679f3c559134f0d8f Mon Sep 17 00:00:00 2001
From: Kent Overstreet <kent.overstreet@gmail.com>
Date: Wed, 16 Oct 2019 15:03:50 -0400
Subject: sched: Add task_struct->faults_disabled_mapping
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

There has been a long standing page cache coherence bug with direct IO.
This provides part of a mechanism to fix it, currently just used by
bcachefs but potentially worth promoting to the VFS.

Direct IO evicts the range of the pagecache being read or written to.

For reads, we need dirty pages to be written to disk, so that the read
doesn't return stale data. For writes, we need to evict that range of
the pagecache so that it's not stale after the write completes.

However, without a locking mechanism to prevent those pages from being
re-added to the pagecache - by a buffered read or page fault - page
cache inconsistency is still possible.

This isn't necessarily just an issue for userspace when they're playing
games; filesystems may hang arbitrary state off the pagecache, and so
page cache inconsistency may cause real filesystem bugs, depending on
the filesystem. This is less of an issue for iomap based filesystems,
but e.g. buffer heads caches disk block mappings (!) and attaches them
to the pagecache, and bcachefs attaches disk reservations to pagecache
pages.

This issue has been hard to fix, because
 - we need to add a lock (henceforth called pagecache_add_lock), which
   would be held for the duration of the direct IO
 - page faults add pages to the page cache, thus need to take the same
   lock
 - dio -> gup -> page fault thus can deadlock

And we cannot enforce a lock ordering with this lock, since userspace
will be controlling the lock ordering (via the fd and buffer arguments
to direct IOs), so we need a different method of deadlock avoidance.

We need to tell the page fault handler that we're already holding a
pagecache_add_lock, and since plumbing it through the entire gup() path
would be highly impractical this adds a field to task_struct.

Then the full method is:
 - in the dio path, when we first take the pagecache_add_lock, note the
   mapping in the current task_struct
 - in the page fault handler, if faults_disabled_mapping is set, we
   check if it's the same mapping as the one we're taking a page fault
   for, and if so return an error.

   Then we check lock ordering: if there's a lock ordering violation and
   trylock fails, we'll have to cycle the locks and return an error that
   tells the DIO path to retry: faults_disabled_mapping is also used for
   signalling "locks were dropped, please retry".

Also relevant to this patch: mapping->invalidate_lock.
mapping->invalidate_lock provides most of the required semantics - it's
used by truncate/fallocate to block pages being added to the pagecache.
However, since it's a rwsem, direct IOs would need to take the write
side in order to block page cache adds, and would then be exclusive with
each other - we'll need a new type of lock to pair with this approach.

Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
Cc: Jan Kara <jack@suse.cz>
Cc: Darrick J. Wong <djwong@kernel.org>
Cc: linux-fsdevel@vger.kernel.org
Cc: Andreas Grünbacher <andreas.gruenbacher@gmail.com>
---
 include/linux/sched.h | 1 +
 init/init_task.c      | 1 +
 2 files changed, 2 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 77f01ac385f7..d5951e99706a 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -875,6 +875,7 @@ struct task_struct {
 
 	struct mm_struct		*mm;
 	struct mm_struct		*active_mm;
+	struct address_space		*faults_disabled_mapping;
 
 	int				exit_state;
 	int				exit_code;
diff --git a/init/init_task.c b/init/init_task.c
index ff6c4b9bfe6b..f703116e0523 100644
--- a/init/init_task.c
+++ b/init/init_task.c
@@ -85,6 +85,7 @@ struct task_struct init_task
 	.nr_cpus_allowed= NR_CPUS,
 	.mm		= NULL,
 	.active_mm	= &init_mm,
+	.faults_disabled_mapping = NULL,
 	.restart_block	= {
 		.fn = do_no_restart_syscall,
 	},
-- 
cgit v1.2.3


From 771eb4fe8b420bb8563863e242861e635c742bc2 Mon Sep 17 00:00:00 2001
From: Kent Overstreet <kent.overstreet@gmail.com>
Date: Mon, 9 Jul 2018 23:27:33 -0400
Subject: fs: factor out d_mark_tmpfile()

New helper for bcachefs - bcachefs doesn't want the
inode_dec_link_count() call that d_tmpfile does, it handles i_nlink on
its own atomically with other btree updates

Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
Cc: Alexander Viro <viro@zeniv.linux.org.uk>
Cc: Christian Brauner <brauner@kernel.org>
Cc: linux-fsdevel@vger.kernel.org
Reviewed-by: Darrick J. Wong <djwong@kernel.org>
Reviewed-by: Christian Brauner <brauner@kernel.org>
---
 fs/dcache.c            | 12 ++++++++++--
 include/linux/dcache.h |  1 +
 2 files changed, 11 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/fs/dcache.c b/fs/dcache.c
index 25ac74d30bff..796e23761ba0 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -3246,11 +3246,10 @@ void d_genocide(struct dentry *parent)
 	d_walk(parent, parent, d_genocide_kill);
 }
 
-void d_tmpfile(struct file *file, struct inode *inode)
+void d_mark_tmpfile(struct file *file, struct inode *inode)
 {
 	struct dentry *dentry = file->f_path.dentry;
 
-	inode_dec_link_count(inode);
 	BUG_ON(dentry->d_name.name != dentry->d_iname ||
 		!hlist_unhashed(&dentry->d_u.d_alias) ||
 		!d_unlinked(dentry));
@@ -3260,6 +3259,15 @@ void d_tmpfile(struct file *file, struct inode *inode)
 				(unsigned long long)inode->i_ino);
 	spin_unlock(&dentry->d_lock);
 	spin_unlock(&dentry->d_parent->d_lock);
+}
+EXPORT_SYMBOL(d_mark_tmpfile);
+
+void d_tmpfile(struct file *file, struct inode *inode)
+{
+	struct dentry *dentry = file->f_path.dentry;
+
+	inode_dec_link_count(inode);
+	d_mark_tmpfile(file, inode);
 	d_instantiate(dentry, inode);
 }
 EXPORT_SYMBOL(d_tmpfile);
diff --git a/include/linux/dcache.h b/include/linux/dcache.h
index 6b351e009f59..3da2f0545d5d 100644
--- a/include/linux/dcache.h
+++ b/include/linux/dcache.h
@@ -251,6 +251,7 @@ extern struct dentry * d_make_root(struct inode *);
 /* <clickety>-<click> the ramfs-type tree */
 extern void d_genocide(struct dentry *);
 
+extern void d_mark_tmpfile(struct file *, struct inode *);
 extern void d_tmpfile(struct file *, struct inode *);
 
 extern struct dentry *d_find_alias(struct inode *);
-- 
cgit v1.2.3


From 83feeb195592b533541ff6399c8084e5b7c59677 Mon Sep 17 00:00:00 2001
From: Kent Overstreet <kent.overstreet@gmail.com>
Date: Mon, 25 Apr 2022 15:26:28 -0400
Subject: lib/string_helpers: string_get_size() now returns characters wrote

printbuf now needs to know the number of characters that would have been
written if the buffer was too small, like snprintf(); this changes
string_get_size() to return the the return value of snprintf().

Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
---
 include/linux/string_helpers.h |  4 ++--
 lib/string_helpers.c           | 10 ++++++----
 2 files changed, 8 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/string_helpers.h b/include/linux/string_helpers.h
index 9d1f5bb74dd5..58fb1f90eda5 100644
--- a/include/linux/string_helpers.h
+++ b/include/linux/string_helpers.h
@@ -24,8 +24,8 @@ enum string_size_units {
 	STRING_UNITS_2,		/* use binary powers of 2^10 */
 };
 
-void string_get_size(u64 size, u64 blk_size, enum string_size_units units,
-		     char *buf, int len);
+int string_get_size(u64 size, u64 blk_size, enum string_size_units units,
+		    char *buf, int len);
 
 int parse_int_array_user(const char __user *from, size_t count, int **array);
 
diff --git a/lib/string_helpers.c b/lib/string_helpers.c
index 9982344cca34..7713f73e66b0 100644
--- a/lib/string_helpers.c
+++ b/lib/string_helpers.c
@@ -31,9 +31,11 @@
  * giving the size in the required units.  @buf should have room for
  * at least 9 bytes and will always be zero terminated.
  *
+ * Return value: number of characters of output that would have been written
+ * (which may be greater than len, if output was truncated).
  */
-void string_get_size(u64 size, u64 blk_size, const enum string_size_units units,
-		     char *buf, int len)
+int string_get_size(u64 size, u64 blk_size, const enum string_size_units units,
+		    char *buf, int len)
 {
 	static const char *const units_10[] = {
 		"B", "kB", "MB", "GB", "TB", "PB", "EB", "ZB", "YB"
@@ -126,8 +128,8 @@ void string_get_size(u64 size, u64 blk_size, const enum string_size_units units,
 	else
 		unit = units_str[units][i];
 
-	snprintf(buf, len, "%u%s %s", (u32)size,
-		 tmp, unit);
+	return snprintf(buf, len, "%u%s %s", (u32)size,
+			tmp, unit);
 }
 EXPORT_SYMBOL(string_get_size);
 
-- 
cgit v1.2.3


From fe4fa2e4f7d0722c179fffa25911ea35cafadce2 Mon Sep 17 00:00:00 2001
From: Bartosz Golaszewski <bartosz.golaszewski@linaro.org>
Date: Fri, 1 Sep 2023 13:29:25 +0200
Subject: gpiolib: make gpiochip_get_desc() public

It makes sense for a GPIO driver to want to get its own descriptor
without requesting it. After all, the driver knows that it'll still be
valid. Let's move this helper to linux/gpio/driver.h.

Signed-off-by: Bartosz Golaszewski <bartosz.golaszewski@linaro.org>
Reviewed-by: Andy Shevchenko <andy@kernel.org>
Acked-by: Linus Walleij <linus.walleij@linaro.org>
---
 drivers/gpio/gpiolib.h      | 2 --
 include/linux/gpio/driver.h | 2 ++
 2 files changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/gpio/gpiolib.h b/drivers/gpio/gpiolib.h
index d1c94bd571c6..9bff5c2cf720 100644
--- a/drivers/gpio/gpiolib.h
+++ b/drivers/gpio/gpiolib.h
@@ -122,8 +122,6 @@ struct gpio_array {
 	unsigned long		invert_mask[];
 };
 
-struct gpio_desc *gpiochip_get_desc(struct gpio_chip *gc, unsigned int hwnum);
-
 #define for_each_gpio_desc(gc, desc)					\
 	for (unsigned int __i = 0;					\
 	     __i < gc->ngpio && (desc = gpiochip_get_desc(gc, __i));	\
diff --git a/include/linux/gpio/driver.h b/include/linux/gpio/driver.h
index b721422f4bfa..8f0859ba7065 100644
--- a/include/linux/gpio/driver.h
+++ b/include/linux/gpio/driver.h
@@ -757,6 +757,8 @@ struct gpio_desc *gpiochip_request_own_desc(struct gpio_chip *gc,
 					    enum gpiod_flags dflags);
 void gpiochip_free_own_desc(struct gpio_desc *desc);
 
+struct gpio_desc *gpiochip_get_desc(struct gpio_chip *gc, unsigned int hwnum);
+
 #ifdef CONFIG_GPIOLIB
 
 /* lock/unlock as IRQ */
-- 
cgit v1.2.3


From 2c97d3e55b70edf33b6e7f211bab8a748a0a2bcc Mon Sep 17 00:00:00 2001
From: "Luke D. Jones" <luke@ljones.dev>
Date: Wed, 30 Aug 2023 15:22:37 +1200
Subject: platform/x86: asus-wmi: add support for ASUS screenpad

Add support for the WMI methods used to turn off and adjust the
brightness of the secondary "screenpad" device found on some high-end
ASUS laptops like the GX650P series and others.

There are some small quirks with this device when considering only the
raw WMI methods:
1. The Off method can only switch the device off
2. Changing the brightness turns the device back on
3. To turn the device back on the brightness must be > 1
4. When the device is off the brightness can't be changed (so it is
   stored by the driver if device is off).
5. Booting with a value of 0 brightness (retained by bios) means the bios
   will set a value of >0 <15
6. When the device is off it is "unplugged"

asus_wmi sets the minimum brightness as 20 in general use, and 60 for
booting with values <= min.

The ACPI methods are used in a new backlight device named asus_screenpad.

Signed-off-by: Luke D. Jones <luke@ljones.dev>
Link: https://lore.kernel.org/r/20230830032237.42987-2-luke@ljones.dev
Reviewed-by: Hans de Goede <hdegoede@redhat.com>
Signed-off-by: Hans de Goede <hdegoede@redhat.com>
---
 drivers/platform/x86/asus-wmi.c            | 133 +++++++++++++++++++++++++++++
 drivers/platform/x86/asus-wmi.h            |   1 +
 include/linux/platform_data/x86/asus-wmi.h |   4 +
 3 files changed, 138 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/platform/x86/asus-wmi.c b/drivers/platform/x86/asus-wmi.c
index 9f8cea5f9615..928fc74e79b4 100644
--- a/drivers/platform/x86/asus-wmi.c
+++ b/drivers/platform/x86/asus-wmi.c
@@ -25,6 +25,7 @@
 #include <linux/input/sparse-keymap.h>
 #include <linux/kernel.h>
 #include <linux/leds.h>
+#include <linux/minmax.h>
 #include <linux/module.h>
 #include <linux/pci.h>
 #include <linux/pci_hotplug.h>
@@ -127,6 +128,10 @@ module_param(fnlock_default, bool, 0444);
 #define NVIDIA_TEMP_MIN		75
 #define NVIDIA_TEMP_MAX		87
 
+#define ASUS_SCREENPAD_BRIGHT_MIN 20
+#define ASUS_SCREENPAD_BRIGHT_MAX 255
+#define ASUS_SCREENPAD_BRIGHT_DEFAULT 60
+
 static const char * const ashs_ids[] = { "ATK4001", "ATK4002", NULL };
 
 static int throttle_thermal_policy_write(struct asus_wmi *);
@@ -212,6 +217,7 @@ struct asus_wmi {
 
 	struct input_dev *inputdev;
 	struct backlight_device *backlight_device;
+	struct backlight_device *screenpad_backlight_device;
 	struct platform_device *platform_device;
 
 	struct led_classdev wlan_led;
@@ -3776,6 +3782,124 @@ static int is_display_toggle(int code)
 	return 0;
 }
 
+/* Screenpad backlight *******************************************************/
+
+static int read_screenpad_backlight_power(struct asus_wmi *asus)
+{
+	int ret;
+
+	ret = asus_wmi_get_devstate_simple(asus, ASUS_WMI_DEVID_SCREENPAD_POWER);
+	if (ret < 0)
+		return ret;
+	/* 1 == powered */
+	return ret ? FB_BLANK_UNBLANK : FB_BLANK_POWERDOWN;
+}
+
+static int read_screenpad_brightness(struct backlight_device *bd)
+{
+	struct asus_wmi *asus = bl_get_data(bd);
+	u32 retval;
+	int err;
+
+	err = read_screenpad_backlight_power(asus);
+	if (err < 0)
+		return err;
+	/* The device brightness can only be read if powered, so return stored */
+	if (err == FB_BLANK_POWERDOWN)
+		return asus->driver->screenpad_brightness - ASUS_SCREENPAD_BRIGHT_MIN;
+
+	err = asus_wmi_get_devstate(asus, ASUS_WMI_DEVID_SCREENPAD_LIGHT, &retval);
+	if (err < 0)
+		return err;
+
+	return (retval & ASUS_WMI_DSTS_BRIGHTNESS_MASK) - ASUS_SCREENPAD_BRIGHT_MIN;
+}
+
+static int update_screenpad_bl_status(struct backlight_device *bd)
+{
+	struct asus_wmi *asus = bl_get_data(bd);
+	int power, err = 0;
+	u32 ctrl_param;
+
+	power = read_screenpad_backlight_power(asus);
+	if (power < 0)
+		return power;
+
+	if (bd->props.power != power) {
+		if (power != FB_BLANK_UNBLANK) {
+			/* Only brightness > 0 can power it back on */
+			ctrl_param = asus->driver->screenpad_brightness - ASUS_SCREENPAD_BRIGHT_MIN;
+			err = asus_wmi_set_devstate(ASUS_WMI_DEVID_SCREENPAD_LIGHT,
+						    ctrl_param, NULL);
+		} else {
+			err = asus_wmi_set_devstate(ASUS_WMI_DEVID_SCREENPAD_POWER, 0, NULL);
+		}
+	} else if (power == FB_BLANK_UNBLANK) {
+		/* Only set brightness if powered on or we get invalid/unsync state */
+		ctrl_param = bd->props.brightness + ASUS_SCREENPAD_BRIGHT_MIN;
+		err = asus_wmi_set_devstate(ASUS_WMI_DEVID_SCREENPAD_LIGHT, ctrl_param, NULL);
+	}
+
+	/* Ensure brightness is stored to turn back on with */
+	if (err == 0)
+		asus->driver->screenpad_brightness = bd->props.brightness + ASUS_SCREENPAD_BRIGHT_MIN;
+
+	return err;
+}
+
+static const struct backlight_ops asus_screenpad_bl_ops = {
+	.get_brightness = read_screenpad_brightness,
+	.update_status = update_screenpad_bl_status,
+	.options = BL_CORE_SUSPENDRESUME,
+};
+
+static int asus_screenpad_init(struct asus_wmi *asus)
+{
+	struct backlight_device *bd;
+	struct backlight_properties props;
+	int err, power;
+	int brightness = 0;
+
+	power = read_screenpad_backlight_power(asus);
+	if (power < 0)
+		return power;
+
+	if (power != FB_BLANK_POWERDOWN) {
+		err = asus_wmi_get_devstate(asus, ASUS_WMI_DEVID_SCREENPAD_LIGHT, &brightness);
+		if (err < 0)
+			return err;
+	}
+	/* default to an acceptable min brightness on boot if too low */
+	if (brightness < ASUS_SCREENPAD_BRIGHT_MIN)
+		brightness = ASUS_SCREENPAD_BRIGHT_DEFAULT;
+
+	memset(&props, 0, sizeof(struct backlight_properties));
+	props.type = BACKLIGHT_RAW; /* ensure this bd is last to be picked */
+	props.max_brightness = ASUS_SCREENPAD_BRIGHT_MAX - ASUS_SCREENPAD_BRIGHT_MIN;
+	bd = backlight_device_register("asus_screenpad",
+				       &asus->platform_device->dev, asus,
+				       &asus_screenpad_bl_ops, &props);
+	if (IS_ERR(bd)) {
+		pr_err("Could not register backlight device\n");
+		return PTR_ERR(bd);
+	}
+
+	asus->screenpad_backlight_device = bd;
+	asus->driver->screenpad_brightness = brightness;
+	bd->props.brightness = brightness - ASUS_SCREENPAD_BRIGHT_MIN;
+	bd->props.power = power;
+	backlight_update_status(bd);
+
+	return 0;
+}
+
+static void asus_screenpad_exit(struct asus_wmi *asus)
+{
+	backlight_device_unregister(asus->screenpad_backlight_device);
+
+	asus->screenpad_backlight_device = NULL;
+}
+
 /* Fn-lock ********************************************************************/
 
 static bool asus_wmi_has_fnlock_key(struct asus_wmi *asus)
@@ -4431,6 +4555,12 @@ static int asus_wmi_add(struct platform_device *pdev)
 	} else if (asus->driver->quirks->wmi_backlight_set_devstate)
 		err = asus_wmi_set_devstate(ASUS_WMI_DEVID_BACKLIGHT, 2, NULL);
 
+	if (asus_wmi_dev_is_present(asus, ASUS_WMI_DEVID_SCREENPAD_LIGHT)) {
+		err = asus_screenpad_init(asus);
+		if (err && err != -ENODEV)
+			goto fail_screenpad;
+	}
+
 	if (asus_wmi_has_fnlock_key(asus)) {
 		asus->fnlock_locked = fnlock_default;
 		asus_wmi_fnlock_update(asus);
@@ -4454,6 +4584,8 @@ fail_wmi_handler:
 	asus_wmi_backlight_exit(asus);
 fail_backlight:
 	asus_wmi_rfkill_exit(asus);
+fail_screenpad:
+	asus_screenpad_exit(asus);
 fail_rfkill:
 	asus_wmi_led_exit(asus);
 fail_leds:
@@ -4480,6 +4612,7 @@ static int asus_wmi_remove(struct platform_device *device)
 	asus = platform_get_drvdata(device);
 	wmi_remove_notify_handler(asus->driver->event_guid);
 	asus_wmi_backlight_exit(asus);
+	asus_screenpad_exit(asus);
 	asus_wmi_input_exit(asus);
 	asus_wmi_led_exit(asus);
 	asus_wmi_rfkill_exit(asus);
diff --git a/drivers/platform/x86/asus-wmi.h b/drivers/platform/x86/asus-wmi.h
index a478ebfd34df..5fbdd0eafa02 100644
--- a/drivers/platform/x86/asus-wmi.h
+++ b/drivers/platform/x86/asus-wmi.h
@@ -57,6 +57,7 @@ struct quirk_entry {
 struct asus_wmi_driver {
 	int			brightness;
 	int			panel_power;
+	int			screenpad_brightness;
 	int			wlan_ctrl_by_user;
 
 	const char		*name;
diff --git a/include/linux/platform_data/x86/asus-wmi.h b/include/linux/platform_data/x86/asus-wmi.h
index 16e99a1c37fc..63e630276499 100644
--- a/include/linux/platform_data/x86/asus-wmi.h
+++ b/include/linux/platform_data/x86/asus-wmi.h
@@ -58,6 +58,10 @@
 #define ASUS_WMI_DEVID_KBD_BACKLIGHT	0x00050021
 #define ASUS_WMI_DEVID_LIGHT_SENSOR	0x00050022 /* ?? */
 #define ASUS_WMI_DEVID_LIGHTBAR		0x00050025
+/* This can only be used to disable the screen, not re-enable */
+#define ASUS_WMI_DEVID_SCREENPAD_POWER	0x00050031
+/* Writing a brightness re-enables the screen if disabled */
+#define ASUS_WMI_DEVID_SCREENPAD_LIGHT	0x00050032
 #define ASUS_WMI_DEVID_FAN_BOOST_MODE	0x00110018
 #define ASUS_WMI_DEVID_THROTTLE_THERMAL_POLICY 0x00120075
 
-- 
cgit v1.2.3


From 4eaf928622abc5dabc9b42a0de4dafbe29ddf491 Mon Sep 17 00:00:00 2001
From: Yue Haibing <yuehaibing@huawei.com>
Date: Fri, 11 Aug 2023 17:57:01 +0800
Subject: iio: Remove unused declarations

Commit 0f3a8c3f34f7 ("iio: Add support for creating IIO devices via configfs")
declared but never implemented iio_sw_device_type_configfs_{un}register().
Commit b662f809d410 ("iio: core: Introduce IIO software triggers") declared but
never implemented iio_sw_trigger_type_configfs_{un}register().
Commit a3e0b51884ee ("iio: accel: add support for FXLS8962AF/FXLS8964AF accelerometers")
declared but never implemented fxls8962af_core_remove().
Commit 8dedcc3eee3a ("iio: core: centralize ioctl() calls to the main chardev")
declared but never implemented iio_device_ioctl().

Commit d430f3c36ca6 ("iio: imu: inv_mpu6050: Use regmap instead of i2c specific functions")
removed inv_mpu6050_write_reg() but not its declaration.

Signed-off-by: Yue Haibing <yuehaibing@huawei.com>
Link: https://lore.kernel.org/r/20230811095701.35372-1-yuehaibing@huawei.com
Signed-off-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
---
 drivers/iio/accel/fxls8962af.h            | 1 -
 drivers/iio/iio_core.h                    | 3 ---
 drivers/iio/imu/inv_mpu6050/inv_mpu_iio.h | 1 -
 include/linux/iio/sw_device.h             | 3 ---
 include/linux/iio/sw_trigger.h            | 3 ---
 5 files changed, 11 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/iio/accel/fxls8962af.h b/drivers/iio/accel/fxls8962af.h
index 9cbe98c3ba9a..6eaa2803b26f 100644
--- a/drivers/iio/accel/fxls8962af.h
+++ b/drivers/iio/accel/fxls8962af.h
@@ -14,7 +14,6 @@ enum {
 };
 
 int fxls8962af_core_probe(struct device *dev, struct regmap *regmap, int irq);
-int fxls8962af_core_remove(struct device *dev);
 
 extern const struct dev_pm_ops fxls8962af_pm_ops;
 extern const struct regmap_config fxls8962af_i2c_regmap_conf;
diff --git a/drivers/iio/iio_core.h b/drivers/iio/iio_core.h
index 501e286702ef..1a38b1915e7a 100644
--- a/drivers/iio/iio_core.h
+++ b/drivers/iio/iio_core.h
@@ -30,9 +30,6 @@ struct iio_ioctl_handler {
 		      unsigned int cmd, unsigned long arg);
 };
 
-long iio_device_ioctl(struct iio_dev *indio_dev, struct file *filp,
-		      unsigned int cmd, unsigned long arg);
-
 void iio_device_ioctl_handler_register(struct iio_dev *indio_dev,
 				       struct iio_ioctl_handler *h);
 void iio_device_ioctl_handler_unregister(struct iio_ioctl_handler *h);
diff --git a/drivers/iio/imu/inv_mpu6050/inv_mpu_iio.h b/drivers/iio/imu/inv_mpu6050/inv_mpu_iio.h
index ed5a96e78df0..95f548235de7 100644
--- a/drivers/iio/imu/inv_mpu6050/inv_mpu_iio.h
+++ b/drivers/iio/imu/inv_mpu6050/inv_mpu_iio.h
@@ -464,7 +464,6 @@ int inv_mpu6050_probe_trigger(struct iio_dev *indio_dev, int irq_type);
 int inv_mpu6050_prepare_fifo(struct inv_mpu6050_state *st, bool enable);
 int inv_mpu6050_switch_engine(struct inv_mpu6050_state *st, bool en,
 			      unsigned int mask);
-int inv_mpu6050_write_reg(struct inv_mpu6050_state *st, int reg, u8 val);
 int inv_mpu_acpi_create_mux_client(struct i2c_client *client);
 void inv_mpu_acpi_delete_mux_client(struct i2c_client *client);
 int inv_mpu_core_probe(struct regmap *regmap, int irq, const char *name,
diff --git a/include/linux/iio/sw_device.h b/include/linux/iio/sw_device.h
index eff1e6b2595c..0f7fe7b522e3 100644
--- a/include/linux/iio/sw_device.h
+++ b/include/linux/iio/sw_device.h
@@ -51,9 +51,6 @@ void iio_unregister_sw_device_type(struct iio_sw_device_type *dt);
 struct iio_sw_device *iio_sw_device_create(const char *, const char *);
 void iio_sw_device_destroy(struct iio_sw_device *);
 
-int iio_sw_device_type_configfs_register(struct iio_sw_device_type *dt);
-void iio_sw_device_type_configfs_unregister(struct iio_sw_device_type *dt);
-
 static inline
 void iio_swd_group_init_type_name(struct iio_sw_device *d,
 				  const char *name,
diff --git a/include/linux/iio/sw_trigger.h b/include/linux/iio/sw_trigger.h
index 47de2443e984..bc77f88df303 100644
--- a/include/linux/iio/sw_trigger.h
+++ b/include/linux/iio/sw_trigger.h
@@ -51,9 +51,6 @@ void iio_unregister_sw_trigger_type(struct iio_sw_trigger_type *tt);
 struct iio_sw_trigger *iio_sw_trigger_create(const char *, const char *);
 void iio_sw_trigger_destroy(struct iio_sw_trigger *);
 
-int iio_sw_trigger_type_configfs_register(struct iio_sw_trigger_type *tt);
-void iio_sw_trigger_type_configfs_unregister(struct iio_sw_trigger_type *tt);
-
 static inline
 void iio_swt_group_init_type_name(struct iio_sw_trigger *t,
 				  const char *name,
-- 
cgit v1.2.3


From 373beef00f7d781a000b12c31fb17a5a9c25969c Mon Sep 17 00:00:00 2001
From: Jean-Philippe Brucker <jean-philippe@linaro.org>
Date: Mon, 11 Sep 2023 15:52:57 +0100
Subject: KVM: arm64: nvhe: Ignore SVE hint in SMCCC function ID

When SVE is enabled, the host may set bit 16 in SMCCC function IDs, a
hint that indicates an unused SVE state. At the moment NVHE doesn't
account for this bit when inspecting the function ID, and rejects most
calls. Clear the hint bit before comparing function IDs.

About version compatibility: the host's PSCI driver initially probes the
firmware for a SMCCC version number. If the firmware implements a
protocol recent enough (1.3), subsequent SMCCC calls have the hint bit
set. Since the hint bit was reserved in earlier versions of the
protocol, clearing it is fine regardless of the version in use.

When a new hint is added to the protocol in the future, it will be added
to ARM_SMCCC_CALL_HINTS and NVHE will handle it straight away. This
patch only clears known hints and leaves reserved bits as is, because
future SMCCC versions could use reserved bits as modifiers for the
function ID, rather than hints.

Fixes: cfa7ff959a78 ("arm64: smccc: Support SMCCC v1.3 SVE register saving hint")
Reported-by: Ben Horgan <ben.horgan@arm.com>
Signed-off-by: Jean-Philippe Brucker <jean-philippe@linaro.org>
Signed-off-by: Marc Zyngier <maz@kernel.org>
Link: https://lore.kernel.org/r/20230911145254.934414-4-jean-philippe@linaro.org
---
 arch/arm64/include/asm/kvm_hyp.h      | 2 +-
 arch/arm64/kvm/hyp/include/nvhe/ffa.h | 2 +-
 arch/arm64/kvm/hyp/nvhe/ffa.c         | 3 +--
 arch/arm64/kvm/hyp/nvhe/hyp-init.S    | 1 +
 arch/arm64/kvm/hyp/nvhe/hyp-main.c    | 8 ++++++--
 arch/arm64/kvm/hyp/nvhe/psci-relay.c  | 3 +--
 include/linux/arm-smccc.h             | 2 ++
 7 files changed, 13 insertions(+), 8 deletions(-)

(limited to 'include/linux')

diff --git a/arch/arm64/include/asm/kvm_hyp.h b/arch/arm64/include/asm/kvm_hyp.h
index b7238c72a04c..66efd67ea7e8 100644
--- a/arch/arm64/include/asm/kvm_hyp.h
+++ b/arch/arm64/include/asm/kvm_hyp.h
@@ -118,7 +118,7 @@ void deactivate_traps_vhe_put(struct kvm_vcpu *vcpu);
 
 u64 __guest_enter(struct kvm_vcpu *vcpu);
 
-bool kvm_host_psci_handler(struct kvm_cpu_context *host_ctxt);
+bool kvm_host_psci_handler(struct kvm_cpu_context *host_ctxt, u32 func_id);
 
 #ifdef __KVM_NVHE_HYPERVISOR__
 void __noreturn __hyp_do_panic(struct kvm_cpu_context *host_ctxt, u64 spsr,
diff --git a/arch/arm64/kvm/hyp/include/nvhe/ffa.h b/arch/arm64/kvm/hyp/include/nvhe/ffa.h
index 1becb10ecd80..d9fd5e6c7d3c 100644
--- a/arch/arm64/kvm/hyp/include/nvhe/ffa.h
+++ b/arch/arm64/kvm/hyp/include/nvhe/ffa.h
@@ -12,6 +12,6 @@
 #define FFA_MAX_FUNC_NUM 0x7F
 
 int hyp_ffa_init(void *pages);
-bool kvm_host_ffa_handler(struct kvm_cpu_context *host_ctxt);
+bool kvm_host_ffa_handler(struct kvm_cpu_context *host_ctxt, u32 func_id);
 
 #endif /* __KVM_HYP_FFA_H */
diff --git a/arch/arm64/kvm/hyp/nvhe/ffa.c b/arch/arm64/kvm/hyp/nvhe/ffa.c
index ab4f5d160c58..6e4dba9eadef 100644
--- a/arch/arm64/kvm/hyp/nvhe/ffa.c
+++ b/arch/arm64/kvm/hyp/nvhe/ffa.c
@@ -634,9 +634,8 @@ out_handled:
 	return true;
 }
 
-bool kvm_host_ffa_handler(struct kvm_cpu_context *host_ctxt)
+bool kvm_host_ffa_handler(struct kvm_cpu_context *host_ctxt, u32 func_id)
 {
-	DECLARE_REG(u64, func_id, host_ctxt, 0);
 	struct arm_smccc_res res;
 
 	/*
diff --git a/arch/arm64/kvm/hyp/nvhe/hyp-init.S b/arch/arm64/kvm/hyp/nvhe/hyp-init.S
index 90fade1b032e..1cc06e6797bd 100644
--- a/arch/arm64/kvm/hyp/nvhe/hyp-init.S
+++ b/arch/arm64/kvm/hyp/nvhe/hyp-init.S
@@ -57,6 +57,7 @@ __do_hyp_init:
 	cmp	x0, #HVC_STUB_HCALL_NR
 	b.lo	__kvm_handle_stub_hvc
 
+	bic	x0, x0, #ARM_SMCCC_CALL_HINTS
 	mov	x3, #KVM_HOST_SMCCC_FUNC(__kvm_hyp_init)
 	cmp	x0, x3
 	b.eq	1f
diff --git a/arch/arm64/kvm/hyp/nvhe/hyp-main.c b/arch/arm64/kvm/hyp/nvhe/hyp-main.c
index 857d9bc04fd4..2385fd03ed87 100644
--- a/arch/arm64/kvm/hyp/nvhe/hyp-main.c
+++ b/arch/arm64/kvm/hyp/nvhe/hyp-main.c
@@ -368,6 +368,7 @@ static void handle_host_hcall(struct kvm_cpu_context *host_ctxt)
 	if (static_branch_unlikely(&kvm_protected_mode_initialized))
 		hcall_min = __KVM_HOST_SMCCC_FUNC___pkvm_prot_finalize;
 
+	id &= ~ARM_SMCCC_CALL_HINTS;
 	id -= KVM_HOST_SMCCC_ID(0);
 
 	if (unlikely(id < hcall_min || id >= ARRAY_SIZE(host_hcall)))
@@ -392,11 +393,14 @@ static void default_host_smc_handler(struct kvm_cpu_context *host_ctxt)
 
 static void handle_host_smc(struct kvm_cpu_context *host_ctxt)
 {
+	DECLARE_REG(u64, func_id, host_ctxt, 0);
 	bool handled;
 
-	handled = kvm_host_psci_handler(host_ctxt);
+	func_id &= ~ARM_SMCCC_CALL_HINTS;
+
+	handled = kvm_host_psci_handler(host_ctxt, func_id);
 	if (!handled)
-		handled = kvm_host_ffa_handler(host_ctxt);
+		handled = kvm_host_ffa_handler(host_ctxt, func_id);
 	if (!handled)
 		default_host_smc_handler(host_ctxt);
 
diff --git a/arch/arm64/kvm/hyp/nvhe/psci-relay.c b/arch/arm64/kvm/hyp/nvhe/psci-relay.c
index 24543d2a3490..d57bcb6ab94d 100644
--- a/arch/arm64/kvm/hyp/nvhe/psci-relay.c
+++ b/arch/arm64/kvm/hyp/nvhe/psci-relay.c
@@ -273,9 +273,8 @@ static unsigned long psci_1_0_handler(u64 func_id, struct kvm_cpu_context *host_
 	}
 }
 
-bool kvm_host_psci_handler(struct kvm_cpu_context *host_ctxt)
+bool kvm_host_psci_handler(struct kvm_cpu_context *host_ctxt, u32 func_id)
 {
-	DECLARE_REG(u64, func_id, host_ctxt, 0);
 	unsigned long ret;
 
 	switch (kvm_host_psci_config.version) {
diff --git a/include/linux/arm-smccc.h b/include/linux/arm-smccc.h
index 7c67c17321d4..083f85653716 100644
--- a/include/linux/arm-smccc.h
+++ b/include/linux/arm-smccc.h
@@ -67,6 +67,8 @@
 #define ARM_SMCCC_VERSION_1_3		0x10003
 
 #define ARM_SMCCC_1_3_SVE_HINT		0x10000
+#define ARM_SMCCC_CALL_HINTS		ARM_SMCCC_1_3_SVE_HINT
+
 
 #define ARM_SMCCC_VERSION_FUNC_ID					\
 	ARM_SMCCC_CALL_VAL(ARM_SMCCC_FAST_CALL,				\
-- 
cgit v1.2.3


From 08700ec705043eb0cee01b35cf5b9d63f0230d12 Mon Sep 17 00:00:00 2001
From: Masahiro Yamada <masahiroy@kernel.org>
Date: Wed, 6 Sep 2023 03:46:57 +0900
Subject: linux/export: fix reference to exported functions for parisc64

John David Anglin reported parisc has been broken since commit
ddb5cdbafaaa ("kbuild: generate KSYMTAB entries by modpost").

Like ia64, parisc64 uses a function descriptor. The function
references must be prefixed with P%.

Also, symbols prefixed $$ from the library have the symbol type
STT_LOPROC instead of STT_FUNC. They should be handled as functions
too.

Fixes: ddb5cdbafaaa ("kbuild: generate KSYMTAB entries by modpost")
Reported-by: John David Anglin <dave.anglin@bell.net>
Tested-by: John David Anglin <dave.anglin@bell.net>
Tested-by: Helge Deller <deller@gmx.de>
Closes: https://lore.kernel.org/linux-parisc/1901598a-e11d-f7dd-a5d9-9a69d06e6b6e@bell.net/T/#u
Signed-off-by: Masahiro Yamada <masahiroy@kernel.org>
Signed-off-by: Helge Deller <deller@gmx.de>
---
 include/linux/export-internal.h | 2 ++
 scripts/mod/modpost.c           | 9 +++++++++
 2 files changed, 11 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/export-internal.h b/include/linux/export-internal.h
index 1c849db953a5..45fca09b2319 100644
--- a/include/linux/export-internal.h
+++ b/include/linux/export-internal.h
@@ -52,6 +52,8 @@
 
 #ifdef CONFIG_IA64
 #define KSYM_FUNC(name)		@fptr(name)
+#elif defined(CONFIG_PARISC) && defined(CONFIG_64BIT)
+#define KSYM_FUNC(name)		P%name
 #else
 #define KSYM_FUNC(name)		name
 #endif
diff --git a/scripts/mod/modpost.c b/scripts/mod/modpost.c
index b29b29707f10..ba981f22908a 100644
--- a/scripts/mod/modpost.c
+++ b/scripts/mod/modpost.c
@@ -1226,6 +1226,15 @@ static void check_export_symbol(struct module *mod, struct elf_info *elf,
 	 */
 	s->is_func = (ELF_ST_TYPE(sym->st_info) == STT_FUNC);
 
+	/*
+	 * For parisc64, symbols prefixed $$ from the library have the symbol type
+	 * STT_LOPROC. They should be handled as functions too.
+	 */
+	if (elf->hdr->e_ident[EI_CLASS] == ELFCLASS64 &&
+	    elf->hdr->e_machine == EM_PARISC &&
+	    ELF_ST_TYPE(sym->st_info) == STT_LOPROC)
+		s->is_func = true;
+
 	if (match(secname, PATTERNS(INIT_SECTIONS)))
 		warn("%s: %s: EXPORT_SYMBOL used for init symbol. Remove __init or EXPORT_SYMBOL.\n",
 		     mod->name, name);
-- 
cgit v1.2.3


From 25e73b7e3f72a25aa30cbb2eecb49036e0acf066 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Wed, 2 Aug 2023 12:55:46 +0200
Subject: x86/ibt: Suppress spurious ENDBR

It was reported that under certain circumstances GCC emits ENDBR
instructions for _THIS_IP_ usage. Specifically, when it appears at the
start of a basic block -- but not elsewhere.

Since _THIS_IP_ is never used for control flow, these ENDBR
instructions are completely superfluous. Override the _THIS_IP_
definition for x86_64 to avoid this.

Less ENDBR instructions is better.

Fixes: 156ff4a544ae ("x86/ibt: Base IBT bits")
Reported-by: David Kaplan <David.Kaplan@amd.com>
Reviewed-by: Andrew Cooper <andrew.cooper3@citrix.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Link: https://lore.kernel.org/r/20230802110323.016197440@infradead.org
---
 arch/x86/include/asm/linkage.h      | 8 ++++++++
 include/linux/instruction_pointer.h | 5 +++++
 2 files changed, 13 insertions(+)

(limited to 'include/linux')

diff --git a/arch/x86/include/asm/linkage.h b/arch/x86/include/asm/linkage.h
index 97a3de7892d3..5ff49fd67732 100644
--- a/arch/x86/include/asm/linkage.h
+++ b/arch/x86/include/asm/linkage.h
@@ -8,6 +8,14 @@
 #undef notrace
 #define notrace __attribute__((no_instrument_function))
 
+#ifdef CONFIG_64BIT
+/*
+ * The generic version tends to create spurious ENDBR instructions under
+ * certain conditions.
+ */
+#define _THIS_IP_ ({ unsigned long __here; asm ("lea 0(%%rip), %0" : "=r" (__here)); __here; })
+#endif
+
 #ifdef CONFIG_X86_32
 #define asmlinkage CPP_ASMLINKAGE __attribute__((regparm(0)))
 #endif /* CONFIG_X86_32 */
diff --git a/include/linux/instruction_pointer.h b/include/linux/instruction_pointer.h
index cda1f706eaeb..aa0b3ffea935 100644
--- a/include/linux/instruction_pointer.h
+++ b/include/linux/instruction_pointer.h
@@ -2,7 +2,12 @@
 #ifndef _LINUX_INSTRUCTION_POINTER_H
 #define _LINUX_INSTRUCTION_POINTER_H
 
+#include <asm/linkage.h>
+
 #define _RET_IP_		(unsigned long)__builtin_return_address(0)
+
+#ifndef _THIS_IP_
 #define _THIS_IP_  ({ __label__ __here; __here: (unsigned long)&&__here; })
+#endif
 
 #endif /* _LINUX_INSTRUCTION_POINTER_H */
-- 
cgit v1.2.3


From 133c4c0d37175f510a10fa9bed51e223936073fc Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Mon, 11 Sep 2023 17:05:31 +0000
Subject: tcp: defer regular ACK while processing socket backlog

This idea came after a particular workload requested
the quickack attribute set on routes, and a performance
drop was noticed for large bulk transfers.

For high throughput flows, it is best to use one cpu
running the user thread issuing socket system calls,
and a separate cpu to process incoming packets from BH context.
(With TSO/GRO, bottleneck is usually the 'user' cpu)

Problem is the user thread can spend a lot of time while holding
the socket lock, forcing BH handler to queue most of incoming
packets in the socket backlog.

Whenever the user thread releases the socket lock, it must first
process all accumulated packets in the backlog, potentially
adding latency spikes. Due to flood mitigation, having too many
packets in the backlog increases chance of unexpected drops.

Backlog processing unfortunately shifts a fair amount of cpu cycles
from the BH cpu to the 'user' cpu, thus reducing max throughput.

This patch takes advantage of the backlog processing,
and the fact that ACK are mostly cumulative.

The idea is to detect we are in the backlog processing
and defer all eligible ACK into a single one,
sent from tcp_release_cb().

This saves cpu cycles on both sides, and network resources.

Performance of a single TCP flow on a 200Gbit NIC:

- Throughput is increased by 20% (100Gbit -> 120Gbit).
- Number of generated ACK per second shrinks from 240,000 to 40,000.
- Number of backlog drops per second shrinks from 230 to 0.

Benchmark context:
 - Regular netperf TCP_STREAM (no zerocopy)
 - Intel(R) Xeon(R) Platinum 8481C (Saphire Rapids)
 - MAX_SKB_FRAGS = 17 (~60KB per GRO packet)

This feature is guarded by a new sysctl, and enabled by default:
 /proc/sys/net/ipv4/tcp_backlog_ack_defer

Signed-off-by: Eric Dumazet <edumazet@google.com>
Acked-by: Yuchung Cheng <ycheng@google.com>
Acked-by: Neal Cardwell <ncardwell@google.com>
Acked-by: Soheil Hassas Yeganeh <soheil@google.com>
Acked-by: Dave Taht <dave.taht@gmail.com>
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
 Documentation/networking/ip-sysctl.rst |  7 +++++++
 include/linux/tcp.h                    | 14 ++++++++------
 include/net/netns/ipv4.h               |  1 +
 net/ipv4/sysctl_net_ipv4.c             |  9 +++++++++
 net/ipv4/tcp_input.c                   |  8 ++++++++
 net/ipv4/tcp_ipv4.c                    |  1 +
 net/ipv4/tcp_output.c                  |  5 ++++-
 7 files changed, 38 insertions(+), 7 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/networking/ip-sysctl.rst b/Documentation/networking/ip-sysctl.rst
index a66054d0763a..5bfa1837968c 100644
--- a/Documentation/networking/ip-sysctl.rst
+++ b/Documentation/networking/ip-sysctl.rst
@@ -745,6 +745,13 @@ tcp_comp_sack_nr - INTEGER
 
 	Default : 44
 
+tcp_backlog_ack_defer - BOOLEAN
+	If set, user thread processing socket backlog tries sending
+	one ACK for the whole queue. This helps to avoid potential
+	long latencies at end of a TCP socket syscall.
+
+	Default : true
+
 tcp_slow_start_after_idle - BOOLEAN
 	If set, provide RFC2861 behavior and time out the congestion
 	window after an idle period.  An idle period is defined at
diff --git a/include/linux/tcp.h b/include/linux/tcp.h
index 3c5efeeb024f..44d946161d4a 100644
--- a/include/linux/tcp.h
+++ b/include/linux/tcp.h
@@ -463,15 +463,17 @@ enum tsq_enum {
 	TCP_MTU_REDUCED_DEFERRED,  /* tcp_v{4|6}_err() could not call
 				    * tcp_v{4|6}_mtu_reduced()
 				    */
+	TCP_ACK_DEFERRED,	   /* TX pure ack is deferred */
 };
 
 enum tsq_flags {
-	TSQF_THROTTLED			= (1UL << TSQ_THROTTLED),
-	TSQF_QUEUED			= (1UL << TSQ_QUEUED),
-	TCPF_TSQ_DEFERRED		= (1UL << TCP_TSQ_DEFERRED),
-	TCPF_WRITE_TIMER_DEFERRED	= (1UL << TCP_WRITE_TIMER_DEFERRED),
-	TCPF_DELACK_TIMER_DEFERRED	= (1UL << TCP_DELACK_TIMER_DEFERRED),
-	TCPF_MTU_REDUCED_DEFERRED	= (1UL << TCP_MTU_REDUCED_DEFERRED),
+	TSQF_THROTTLED			= BIT(TSQ_THROTTLED),
+	TSQF_QUEUED			= BIT(TSQ_QUEUED),
+	TCPF_TSQ_DEFERRED		= BIT(TCP_TSQ_DEFERRED),
+	TCPF_WRITE_TIMER_DEFERRED	= BIT(TCP_WRITE_TIMER_DEFERRED),
+	TCPF_DELACK_TIMER_DEFERRED	= BIT(TCP_DELACK_TIMER_DEFERRED),
+	TCPF_MTU_REDUCED_DEFERRED	= BIT(TCP_MTU_REDUCED_DEFERRED),
+	TCPF_ACK_DEFERRED		= BIT(TCP_ACK_DEFERRED),
 };
 
 #define tcp_sk(ptr) container_of_const(ptr, struct tcp_sock, inet_conn.icsk_inet.sk)
diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h
index 7a41c4791536..d96d05b08819 100644
--- a/include/net/netns/ipv4.h
+++ b/include/net/netns/ipv4.h
@@ -132,6 +132,7 @@ struct netns_ipv4 {
 	u8 sysctl_tcp_syncookies;
 	u8 sysctl_tcp_migrate_req;
 	u8 sysctl_tcp_comp_sack_nr;
+	u8 sysctl_tcp_backlog_ack_defer;
 	int sysctl_tcp_reordering;
 	u8 sysctl_tcp_retries1;
 	u8 sysctl_tcp_retries2;
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index 6ac890b4073f..e7f024d93572 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -1366,6 +1366,15 @@ static struct ctl_table ipv4_net_table[] = {
 		.proc_handler	= proc_dou8vec_minmax,
 		.extra1		= SYSCTL_ZERO,
 	},
+	{
+		.procname	= "tcp_backlog_ack_defer",
+		.data		= &init_net.ipv4.sysctl_tcp_backlog_ack_defer,
+		.maxlen		= sizeof(u8),
+		.mode		= 0644,
+		.proc_handler	= proc_dou8vec_minmax,
+		.extra1		= SYSCTL_ZERO,
+		.extra2		= SYSCTL_ONE,
+	},
 	{
 		.procname       = "tcp_reflect_tos",
 		.data           = &init_net.ipv4.sysctl_tcp_reflect_tos,
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 06fe1cf645d5..41b471748437 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -5553,6 +5553,14 @@ static void __tcp_ack_snd_check(struct sock *sk, int ofo_possible)
 	    tcp_in_quickack_mode(sk) ||
 	    /* Protocol state mandates a one-time immediate ACK */
 	    inet_csk(sk)->icsk_ack.pending & ICSK_ACK_NOW) {
+		/* If we are running from __release_sock() in user context,
+		 * Defer the ack until tcp_release_cb().
+		 */
+		if (sock_owned_by_user_nocheck(sk) &&
+		    READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_backlog_ack_defer)) {
+			set_bit(TCP_ACK_DEFERRED, &sk->sk_tsq_flags);
+			return;
+		}
 send_now:
 		tcp_send_ack(sk);
 		return;
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 27140e5cdc06..f13eb7e23d03 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -3263,6 +3263,7 @@ static int __net_init tcp_sk_init(struct net *net)
 	net->ipv4.sysctl_tcp_comp_sack_delay_ns = NSEC_PER_MSEC;
 	net->ipv4.sysctl_tcp_comp_sack_slack_ns = 100 * NSEC_PER_USEC;
 	net->ipv4.sysctl_tcp_comp_sack_nr = 44;
+	net->ipv4.sysctl_tcp_backlog_ack_defer = 1;
 	net->ipv4.sysctl_tcp_fastopen = TFO_CLIENT_ENABLE;
 	net->ipv4.sysctl_tcp_fastopen_blackhole_timeout = 0;
 	atomic_set(&net->ipv4.tfo_active_disable_times, 0);
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index b4cac12d0e63..1fc1f879cfd6 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -1077,7 +1077,8 @@ static void tcp_tasklet_func(struct tasklet_struct *t)
 #define TCP_DEFERRED_ALL (TCPF_TSQ_DEFERRED |		\
 			  TCPF_WRITE_TIMER_DEFERRED |	\
 			  TCPF_DELACK_TIMER_DEFERRED |	\
-			  TCPF_MTU_REDUCED_DEFERRED)
+			  TCPF_MTU_REDUCED_DEFERRED |	\
+			  TCPF_ACK_DEFERRED)
 /**
  * tcp_release_cb - tcp release_sock() callback
  * @sk: socket
@@ -1114,6 +1115,8 @@ void tcp_release_cb(struct sock *sk)
 		inet_csk(sk)->icsk_af_ops->mtu_reduced(sk);
 		__sock_put(sk);
 	}
+	if ((flags & TCPF_ACK_DEFERRED) && inet_csk_ack_scheduled(sk))
+		tcp_send_ack(sk);
 }
 EXPORT_SYMBOL(tcp_release_cb);
 
-- 
cgit v1.2.3


From 2b5dcb31a19a2e0acd869b12c9db9b2d696ef544 Mon Sep 17 00:00:00 2001
From: Leon Hwang <hffilwlqm@gmail.com>
Date: Tue, 12 Sep 2023 23:04:41 +0800
Subject: bpf, x64: Fix tailcall infinite loop

From commit ebf7d1f508a73871 ("bpf, x64: rework pro/epilogue and tailcall
handling in JIT"), the tailcall on x64 works better than before.

From commit e411901c0b775a3a ("bpf: allow for tailcalls in BPF subprograms
for x64 JIT"), tailcall is able to run in BPF subprograms on x64.

From commit 5b92a28aae4dd0f8 ("bpf: Support attaching tracing BPF program
to other BPF programs"), BPF program is able to trace other BPF programs.

How about combining them all together?

1. FENTRY/FEXIT on a BPF subprogram.
2. A tailcall runs in the BPF subprogram.
3. The tailcall calls the subprogram's caller.

As a result, a tailcall infinite loop comes up. And the loop would halt
the machine.

As we know, in tail call context, the tail_call_cnt propagates by stack
and rax register between BPF subprograms. So do in trampolines.

Fixes: ebf7d1f508a7 ("bpf, x64: rework pro/epilogue and tailcall handling in JIT")
Fixes: e411901c0b77 ("bpf: allow for tailcalls in BPF subprograms for x64 JIT")
Reviewed-by: Maciej Fijalkowski <maciej.fijalkowski@intel.com>
Signed-off-by: Leon Hwang <hffilwlqm@gmail.com>
Link: https://lore.kernel.org/r/20230912150442.2009-3-hffilwlqm@gmail.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 arch/x86/net/bpf_jit_comp.c | 28 ++++++++++++++++++++++------
 include/linux/bpf.h         |  5 +++++
 kernel/bpf/trampoline.c     |  4 ++--
 kernel/bpf/verifier.c       |  3 +++
 4 files changed, 32 insertions(+), 8 deletions(-)

(limited to 'include/linux')

diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c
index bcca1c9b9a02..2846c21d75bf 100644
--- a/arch/x86/net/bpf_jit_comp.c
+++ b/arch/x86/net/bpf_jit_comp.c
@@ -1022,6 +1022,10 @@ static void emit_shiftx(u8 **pprog, u32 dst_reg, u8 src_reg, bool is64, u8 op)
 
 #define INSN_SZ_DIFF (((addrs[i] - addrs[i - 1]) - (prog - temp)))
 
+/* mov rax, qword ptr [rbp - rounded_stack_depth - 8] */
+#define RESTORE_TAIL_CALL_CNT(stack)				\
+	EMIT3_off32(0x48, 0x8B, 0x85, -round_up(stack, 8) - 8)
+
 static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, u8 *rw_image,
 		  int oldproglen, struct jit_context *ctx, bool jmp_padding)
 {
@@ -1627,9 +1631,7 @@ st:			if (is_imm8(insn->off))
 
 			func = (u8 *) __bpf_call_base + imm32;
 			if (tail_call_reachable) {
-				/* mov rax, qword ptr [rbp - rounded_stack_depth - 8] */
-				EMIT3_off32(0x48, 0x8B, 0x85,
-					    -round_up(bpf_prog->aux->stack_depth, 8) - 8);
+				RESTORE_TAIL_CALL_CNT(bpf_prog->aux->stack_depth);
 				if (!imm32)
 					return -EINVAL;
 				offs = 7 + x86_call_depth_emit_accounting(&prog, func);
@@ -2404,6 +2406,7 @@ int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *image, void *i
 	 *                     [ ...        ]
 	 *                     [ stack_arg2 ]
 	 * RBP - arg_stack_off [ stack_arg1 ]
+	 * RSP                 [ tail_call_cnt ] BPF_TRAMP_F_TAIL_CALL_CTX
 	 */
 
 	/* room for return value of orig_call or fentry prog */
@@ -2468,6 +2471,8 @@ int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *image, void *i
 	else
 		/* sub rsp, stack_size */
 		EMIT4(0x48, 0x83, 0xEC, stack_size);
+	if (flags & BPF_TRAMP_F_TAIL_CALL_CTX)
+		EMIT1(0x50);		/* push rax */
 	/* mov QWORD PTR [rbp - rbx_off], rbx */
 	emit_stx(&prog, BPF_DW, BPF_REG_FP, BPF_REG_6, -rbx_off);
 
@@ -2520,9 +2525,15 @@ int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *image, void *i
 		restore_regs(m, &prog, regs_off);
 		save_args(m, &prog, arg_stack_off, true);
 
+		if (flags & BPF_TRAMP_F_TAIL_CALL_CTX)
+			/* Before calling the original function, restore the
+			 * tail_call_cnt from stack to rax.
+			 */
+			RESTORE_TAIL_CALL_CNT(stack_size);
+
 		if (flags & BPF_TRAMP_F_ORIG_STACK) {
-			emit_ldx(&prog, BPF_DW, BPF_REG_0, BPF_REG_FP, 8);
-			EMIT2(0xff, 0xd0); /* call *rax */
+			emit_ldx(&prog, BPF_DW, BPF_REG_6, BPF_REG_FP, 8);
+			EMIT2(0xff, 0xd3); /* call *rbx */
 		} else {
 			/* call original function */
 			if (emit_rsb_call(&prog, orig_call, prog)) {
@@ -2573,7 +2584,12 @@ int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *image, void *i
 			ret = -EINVAL;
 			goto cleanup;
 		}
-	}
+	} else if (flags & BPF_TRAMP_F_TAIL_CALL_CTX)
+		/* Before running the original function, restore the
+		 * tail_call_cnt from stack to rax.
+		 */
+		RESTORE_TAIL_CALL_CNT(stack_size);
+
 	/* restore return value of orig_call or fentry prog back into RAX */
 	if (save_ret)
 		emit_ldx(&prog, BPF_DW, BPF_REG_0, BPF_REG_FP, -8);
diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 87eeb3a46a1d..b9e573159432 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -1035,6 +1035,11 @@ struct btf_func_model {
  */
 #define BPF_TRAMP_F_SHARE_IPMODIFY	BIT(6)
 
+/* Indicate that current trampoline is in a tail call context. Then, it has to
+ * cache and restore tail_call_cnt to avoid infinite tail call loop.
+ */
+#define BPF_TRAMP_F_TAIL_CALL_CTX	BIT(7)
+
 /* Each call __bpf_prog_enter + call bpf_func + call __bpf_prog_exit is ~50
  * bytes on x86.
  */
diff --git a/kernel/bpf/trampoline.c b/kernel/bpf/trampoline.c
index 53ff50cac61e..e97aeda3a86b 100644
--- a/kernel/bpf/trampoline.c
+++ b/kernel/bpf/trampoline.c
@@ -415,8 +415,8 @@ static int bpf_trampoline_update(struct bpf_trampoline *tr, bool lock_direct_mut
 		goto out;
 	}
 
-	/* clear all bits except SHARE_IPMODIFY */
-	tr->flags &= BPF_TRAMP_F_SHARE_IPMODIFY;
+	/* clear all bits except SHARE_IPMODIFY and TAIL_CALL_CTX */
+	tr->flags &= (BPF_TRAMP_F_SHARE_IPMODIFY | BPF_TRAMP_F_TAIL_CALL_CTX);
 
 	if (tlinks[BPF_TRAMP_FEXIT].nr_links ||
 	    tlinks[BPF_TRAMP_MODIFY_RETURN].nr_links) {
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index dbba2b806017..18e673c0ac15 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -19774,6 +19774,9 @@ static int check_attach_btf_id(struct bpf_verifier_env *env)
 	if (!tr)
 		return -ENOMEM;
 
+	if (tgt_prog && tgt_prog->aux->tail_call_reachable)
+		tr->flags = BPF_TRAMP_F_TAIL_CALL_CTX;
+
 	prog->aux->dst_trampoline = tr;
 	return 0;
 }
-- 
cgit v1.2.3


From 8f012db27c9516be1a7aca93ea4a6ca9c75056c9 Mon Sep 17 00:00:00 2001
From: Alison Schofield <alison.schofield@intel.com>
Date: Mon, 10 Jul 2023 13:02:58 -0700
Subject: x86/numa: Introduce numa_fill_memblks()

numa_fill_memblks() fills in the gaps in numa_meminfo memblks
over an physical address range.

The ACPI driver will use numa_fill_memblks() to implement a new Linux
policy that prescribes extending proximity domains in a portion of a
CFMWS window to the entire window.

Dan Williams offered this explanation of the policy:
A CFWMS is an ACPI data structure that indicates *potential* locations
where CXL memory can be placed. It is the playground where the CXL
driver has free reign to establish regions. That space can be populated
by BIOS created regions, or driver created regions, after hotplug or
other reconfiguration.

When BIOS creates a region in a CXL Window it additionally describes
that subset of the Window range in the other typical ACPI tables SRAT,
SLIT, and HMAT. The rationale for BIOS not pre-describing the entire
CXL Window in SRAT, SLIT, and HMAT is that it can not predict the
future. I.e. there is nothing stopping higher or lower performance
devices being placed in the same Window. Compare that to ACPI memory
hotplug that just onlines additional capacity in the proximity domain
with little freedom for dynamic performance differentiation.

That leaves the OS with a choice, should unpopulated window capacity
match the proximity domain of an existing region, or should it allocate
a new one? This patch takes the simple position of minimizing proximity
domain proliferation by reusing any proximity domain intersection for
the entire Window. If the Window has no intersections then allocate a
new proximity domain. Note that SRAT, SLIT and HMAT information can be
enumerated dynamically in a standard way from device provided data.
Think of CXL as the end of ACPI needing to describe memory attributes,
CXL offers a standard discovery model for performance attributes, but
Linux still needs to interoperate with the old regime.

Reported-by: Derick Marks <derick.w.marks@intel.com>
Suggested-by: Dan Williams <dan.j.williams@intel.com>
Signed-off-by: Alison Schofield <alison.schofield@intel.com>
Signed-off-by: Dave Hansen <dave.hansen@linux.intel.com>
Reviewed-by: Dan Williams <dan.j.williams@intel.com>
Tested-by: Derick Marks <derick.w.marks@intel.com>
Link: https://lore.kernel.org/all/ef078a6f056ca974e5af85997013c0fda9e3326d.1689018477.git.alison.schofield%40intel.com
---
 arch/x86/include/asm/sparsemem.h |  2 +
 arch/x86/mm/numa.c               | 80 ++++++++++++++++++++++++++++++++++++++++
 include/linux/numa.h             |  7 ++++
 3 files changed, 89 insertions(+)

(limited to 'include/linux')

diff --git a/arch/x86/include/asm/sparsemem.h b/arch/x86/include/asm/sparsemem.h
index 64df897c0ee3..1be13b2dfe8b 100644
--- a/arch/x86/include/asm/sparsemem.h
+++ b/arch/x86/include/asm/sparsemem.h
@@ -37,6 +37,8 @@ extern int phys_to_target_node(phys_addr_t start);
 #define phys_to_target_node phys_to_target_node
 extern int memory_add_physaddr_to_nid(u64 start);
 #define memory_add_physaddr_to_nid memory_add_physaddr_to_nid
+extern int numa_fill_memblks(u64 start, u64 end);
+#define numa_fill_memblks numa_fill_memblks
 #endif
 #endif /* __ASSEMBLY__ */
 
diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c
index 2aadb2019b4f..c01c5506fd4a 100644
--- a/arch/x86/mm/numa.c
+++ b/arch/x86/mm/numa.c
@@ -11,6 +11,7 @@
 #include <linux/nodemask.h>
 #include <linux/sched.h>
 #include <linux/topology.h>
+#include <linux/sort.h>
 
 #include <asm/e820/api.h>
 #include <asm/proto.h>
@@ -961,4 +962,83 @@ int memory_add_physaddr_to_nid(u64 start)
 	return nid;
 }
 EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid);
+
+static int __init cmp_memblk(const void *a, const void *b)
+{
+	const struct numa_memblk *ma = *(const struct numa_memblk **)a;
+	const struct numa_memblk *mb = *(const struct numa_memblk **)b;
+
+	return ma->start - mb->start;
+}
+
+static struct numa_memblk *numa_memblk_list[NR_NODE_MEMBLKS] __initdata;
+
+/**
+ * numa_fill_memblks - Fill gaps in numa_meminfo memblks
+ * @start: address to begin fill
+ * @end: address to end fill
+ *
+ * Find and extend numa_meminfo memblks to cover the @start-@end
+ * physical address range, such that the first memblk includes
+ * @start, the last memblk includes @end, and any gaps in between
+ * are filled.
+ *
+ * RETURNS:
+ * 0		  : Success
+ * NUMA_NO_MEMBLK : No memblk exists in @start-@end range
+ */
+
+int __init numa_fill_memblks(u64 start, u64 end)
+{
+	struct numa_memblk **blk = &numa_memblk_list[0];
+	struct numa_meminfo *mi = &numa_meminfo;
+	int count = 0;
+	u64 prev_end;
+
+	/*
+	 * Create a list of pointers to numa_meminfo memblks that
+	 * overlap start, end. Exclude (start == bi->end) since
+	 * end addresses in both a CFMWS range and a memblk range
+	 * are exclusive.
+	 *
+	 * This list of pointers is used to make in-place changes
+	 * that fill out the numa_meminfo memblks.
+	 */
+	for (int i = 0; i < mi->nr_blks; i++) {
+		struct numa_memblk *bi = &mi->blk[i];
+
+		if (start < bi->end && end >= bi->start) {
+			blk[count] = &mi->blk[i];
+			count++;
+		}
+	}
+	if (!count)
+		return NUMA_NO_MEMBLK;
+
+	/* Sort the list of pointers in memblk->start order */
+	sort(&blk[0], count, sizeof(blk[0]), cmp_memblk, NULL);
+
+	/* Make sure the first/last memblks include start/end */
+	blk[0]->start = min(blk[0]->start, start);
+	blk[count - 1]->end = max(blk[count - 1]->end, end);
+
+	/*
+	 * Fill any gaps by tracking the previous memblks
+	 * end address and backfilling to it if needed.
+	 */
+	prev_end = blk[0]->end;
+	for (int i = 1; i < count; i++) {
+		struct numa_memblk *curr = blk[i];
+
+		if (prev_end >= curr->start) {
+			if (prev_end < curr->end)
+				prev_end = curr->end;
+		} else {
+			curr->start = prev_end;
+			prev_end = curr->end;
+		}
+	}
+	return 0;
+}
+
 #endif
diff --git a/include/linux/numa.h b/include/linux/numa.h
index 59df211d051f..0f512c0aba54 100644
--- a/include/linux/numa.h
+++ b/include/linux/numa.h
@@ -12,6 +12,7 @@
 #define MAX_NUMNODES    (1 << NODES_SHIFT)
 
 #define	NUMA_NO_NODE	(-1)
+#define	NUMA_NO_MEMBLK	(-1)
 
 /* optionally keep NUMA memory info available post init */
 #ifdef CONFIG_NUMA_KEEP_MEMINFO
@@ -43,6 +44,12 @@ static inline int phys_to_target_node(u64 start)
 	return 0;
 }
 #endif
+#ifndef numa_fill_memblks
+static inline int __init numa_fill_memblks(u64 start, u64 end)
+{
+	return NUMA_NO_MEMBLK;
+}
+#endif
 #else /* !CONFIG_NUMA */
 static inline int numa_map_to_online_node(int node)
 {
-- 
cgit v1.2.3


From 5eb1e6e459cfa025f79c43014f66ff62a55542f1 Mon Sep 17 00:00:00 2001
From: Uwe Kleine-König <u.kleine-koenig@pengutronix.de>
Date: Tue, 5 Sep 2023 21:42:53 +0200
Subject: i2c: Drop legacy callback .probe_new()
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Now that all drivers are converted to the (new) .probe() callback, the
temporary .probe_new() can go away. \o/

Link: https://lore.kernel.org/linux-i2c/20230626094548.559542-1-u.kleine-koenig@pengutronix.de
Reviewed-by: Javier Martinez Canillas <javierm@redhat.com>
Reviewed-by: Jean Delvare <jdelvare@suse.de>
Signed-off-by: Uwe Kleine-König <u.kleine-koenig@pengutronix.de>
Signed-off-by: Wolfram Sang <wsa@kernel.org>
---
 include/linux/i2c.h | 11 +----------
 1 file changed, 1 insertion(+), 10 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/i2c.h b/include/linux/i2c.h
index 3430cc2b05a6..0dae9db27538 100644
--- a/include/linux/i2c.h
+++ b/include/linux/i2c.h
@@ -237,7 +237,6 @@ enum i2c_driver_flags {
  * struct i2c_driver - represent an I2C device driver
  * @class: What kind of i2c device we instantiate (for detect)
  * @probe: Callback for device binding
- * @probe_new: Transitional callback for device binding - do not use
  * @remove: Callback for device unbinding
  * @shutdown: Callback for device shutdown
  * @alert: Alert callback, for example for the SMBus alert protocol
@@ -272,16 +271,8 @@ enum i2c_driver_flags {
 struct i2c_driver {
 	unsigned int class;
 
-	union {
 	/* Standard driver model interfaces */
-		int (*probe)(struct i2c_client *client);
-		/*
-		 * Legacy callback that was part of a conversion of .probe().
-		 * Today it has the same semantic as .probe(). Don't use for new
-		 * code.
-		 */
-		int (*probe_new)(struct i2c_client *client);
-	};
+	int (*probe)(struct i2c_client *client);
 	void (*remove)(struct i2c_client *client);
 
 
-- 
cgit v1.2.3


From bbaa6ffa5b6c9609d3b3c431c389b407eea5441f Mon Sep 17 00:00:00 2001
From: Kai-Heng Feng <kai.heng.feng@canonical.com>
Date: Wed, 13 Sep 2023 11:32:33 +0800
Subject: power: supply: core: Use blocking_notifier_call_chain to avoid RCU
 complaint

AMD PMF driver can cause the following warning:
[  196.159546] ------------[ cut here ]------------
[  196.159556] Voluntary context switch within RCU read-side critical section!
[  196.159571] WARNING: CPU: 0 PID: 9 at kernel/rcu/tree_plugin.h:320 rcu_note_context_switch+0x43d/0x560
[  196.159604] Modules linked in: nvme_fabrics ccm rfcomm snd_hda_scodec_cs35l41_spi cmac algif_hash algif_skcipher af_alg bnep joydev btusb btrtl uvcvideo btintel btbcm videobuf2_vmalloc intel_rapl_msr btmtk videobuf2_memops uvc videobuf2_v4l2 intel_rapl_common binfmt_misc hid_sensor_als snd_sof_amd_vangogh hid_sensor_trigger bluetooth industrialio_triggered_buffer videodev snd_sof_amd_rembrandt hid_sensor_iio_common amdgpu ecdh_generic kfifo_buf videobuf2_common hp_wmi kvm_amd sparse_keymap snd_sof_amd_renoir wmi_bmof industrialio ecc mc nls_iso8859_1 kvm snd_sof_amd_acp irqbypass snd_sof_xtensa_dsp crct10dif_pclmul crc32_pclmul mt7921e snd_sof_pci snd_ctl_led polyval_clmulni mt7921_common polyval_generic snd_sof ghash_clmulni_intel mt792x_lib mt76_connac_lib sha512_ssse3 snd_sof_utils aesni_intel snd_hda_codec_realtek crypto_simd mt76 snd_hda_codec_generic cryptd snd_soc_core snd_hda_codec_hdmi rapl ledtrig_audio input_leds snd_compress i2c_algo_bit drm_ttm_helper mac80211 snd_pci_ps hid_multitouch ttm drm_exec
[  196.159970]  drm_suballoc_helper snd_rpl_pci_acp6x amdxcp drm_buddy snd_hda_intel snd_acp_pci snd_hda_scodec_cs35l41_i2c serio_raw gpu_sched snd_hda_scodec_cs35l41 snd_acp_legacy_common snd_intel_dspcfg snd_hda_cs_dsp_ctls snd_hda_codec libarc4 drm_display_helper snd_pci_acp6x cs_dsp snd_hwdep snd_soc_cs35l41_lib video k10temp snd_pci_acp5x thunderbolt snd_hda_core drm_kms_helper cfg80211 snd_seq snd_rn_pci_acp3x snd_pcm snd_acp_config cec snd_soc_acpi snd_seq_device rc_core ccp snd_pci_acp3x snd_timer snd soundcore wmi amd_pmf platform_profile amd_pmc mac_hid serial_multi_instantiate wireless_hotkey hid_sensor_hub sch_fq_codel msr parport_pc ppdev lp parport efi_pstore ip_tables x_tables autofs4 btrfs blake2b_generic raid10 raid456 async_raid6_recov async_memcpy async_pq async_xor async_tx libcrc32c xor raid6_pq raid1 raid0 multipath linear dm_mirror dm_region_hash dm_log cdc_ether usbnet r8152 mii hid_generic nvme i2c_hid_acpi i2c_hid nvme_core i2c_piix4 xhci_pci amd_sfh drm xhci_pci_renesas nvme_common hid
[  196.160382] CPU: 0 PID: 9 Comm: kworker/0:1 Not tainted 6.6.0-rc1 #4
[  196.160397] Hardware name: HP HP EliteBook 845 14 inch G10 Notebook PC/8B6E, BIOS V82 Ver. 01.02.00 08/24/2023
[  196.160405] Workqueue: events power_supply_changed_work
[  196.160426] RIP: 0010:rcu_note_context_switch+0x43d/0x560
[  196.160440] Code: 00 48 89 be 40 08 00 00 48 89 86 48 08 00 00 48 89 10 e9 63 fe ff ff 48 c7 c7 10 e7 b0 9e c6 05 e8 d8 20 02 01 e8 13 0f f3 ff <0f> 0b e9 27 fc ff ff a9 ff ff ff 7f 0f 84 cf fc ff ff 65 48 8b 3c
[  196.160450] RSP: 0018:ffffc900001878f0 EFLAGS: 00010046
[  196.160462] RAX: 0000000000000000 RBX: ffff88885e834040 RCX: 0000000000000000
[  196.160470] RDX: 0000000000000000 RSI: 0000000000000000 RDI: 0000000000000000
[  196.160476] RBP: ffffc90000187910 R08: 0000000000000000 R09: 0000000000000000
[  196.160482] R10: 0000000000000000 R11: 0000000000000000 R12: 0000000000000000
[  196.160488] R13: 0000000000000000 R14: ffff888100990000 R15: ffff888100990000
[  196.160495] FS:  0000000000000000(0000) GS:ffff88885e800000(0000) knlGS:0000000000000000
[  196.160504] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[  196.160512] CR2: 000055cb053c8246 CR3: 000000013443a000 CR4: 0000000000750ef0
[  196.160520] PKRU: 55555554
[  196.160526] Call Trace:
[  196.160532]  <TASK>
[  196.160548]  ? show_regs+0x72/0x90
[  196.160570]  ? rcu_note_context_switch+0x43d/0x560
[  196.160580]  ? __warn+0x8d/0x160
[  196.160600]  ? rcu_note_context_switch+0x43d/0x560
[  196.160613]  ? report_bug+0x1bb/0x1d0
[  196.160637]  ? handle_bug+0x46/0x90
[  196.160658]  ? exc_invalid_op+0x19/0x80
[  196.160675]  ? asm_exc_invalid_op+0x1b/0x20
[  196.160709]  ? rcu_note_context_switch+0x43d/0x560
[  196.160727]  __schedule+0xb9/0x15f0
[  196.160746]  ? srso_alias_return_thunk+0x5/0x7f
[  196.160765]  ? srso_alias_return_thunk+0x5/0x7f
[  196.160778]  ? acpi_ns_search_one_scope+0xbe/0x270
[  196.160806]  schedule+0x68/0x110
[  196.160820]  schedule_timeout+0x151/0x160
[  196.160829]  ? srso_alias_return_thunk+0x5/0x7f
[  196.160842]  ? srso_alias_return_thunk+0x5/0x7f
[  196.160855]  ? acpi_ns_lookup+0x3c5/0xa90
[  196.160878]  __down_common+0xff/0x220
[  196.160905]  __down_timeout+0x16/0x30
[  196.160920]  down_timeout+0x64/0x70
[  196.160938]  acpi_os_wait_semaphore+0x85/0x200
[  196.160959]  acpi_ut_acquire_mutex+0x9e/0x280
[  196.160979]  acpi_ex_enter_interpreter+0x2d/0xb0
[  196.160992]  acpi_ns_evaluate+0x2f0/0x5f0
[  196.161005]  acpi_evaluate_object+0x172/0x490
[  196.161018]  ? acpi_os_signal_semaphore+0x8a/0xd0
[  196.161038]  acpi_evaluate_integer+0x52/0xe0
[  196.161055]  ? kfree+0x79/0x120
[  196.161071]  ? srso_alias_return_thunk+0x5/0x7f
[  196.161089]  acpi_ac_get_state.part.0+0x27/0x80
[  196.161110]  get_ac_property+0x5c/0x70
[  196.161127]  ? __pfx___power_supply_is_system_supplied+0x10/0x10
[  196.161146]  __power_supply_is_system_supplied+0x44/0xb0
[  196.161166]  class_for_each_device+0x124/0x160
[  196.161184]  ? acpi_ac_get_state.part.0+0x27/0x80
[  196.161203]  ? srso_alias_return_thunk+0x5/0x7f
[  196.161223]  power_supply_is_system_supplied+0x3c/0x70
[  196.161243]  amd_pmf_get_power_source+0xe/0x20 [amd_pmf]
[  196.161276]  amd_pmf_power_slider_update_event+0x49/0x90 [amd_pmf]
[  196.161310]  amd_pmf_pwr_src_notify_call+0xe7/0x100 [amd_pmf]
[  196.161340]  notifier_call_chain+0x5f/0xe0
[  196.161362]  atomic_notifier_call_chain+0x33/0x60
[  196.161378]  power_supply_changed_work+0x84/0x110
[  196.161394]  process_one_work+0x178/0x360
[  196.161412]  ? __pfx_worker_thread+0x10/0x10
[  196.161424]  worker_thread+0x307/0x430
[  196.161440]  ? __pfx_worker_thread+0x10/0x10
[  196.161451]  kthread+0xf4/0x130
[  196.161467]  ? __pfx_kthread+0x10/0x10
[  196.161486]  ret_from_fork+0x43/0x70
[  196.161502]  ? __pfx_kthread+0x10/0x10
[  196.161518]  ret_from_fork_asm+0x1b/0x30
[  196.161558]  </TASK>
[  196.161562] ---[ end trace 0000000000000000 ]---

Since there's no guarantee that all the callbacks can work in atomic
context, switch to use blocking_notifier_call_chain to relax the
constraint.

Signed-off-by: Kai-Heng Feng <kai.heng.feng@canonical.com>
Reported-by: Allen Zhong <allen@atr.me>
Fixes: 4c71ae414474 ("platform/x86/amd/pmf: Add support SPS PMF feature")
Closes: https://bugzilla.kernel.org/show_bug.cgi?id=217571
Reviewed-by: Mario Limonciello <mario.limonciello@amd.com>
Link: https://lore.kernel.org/r/20230913033233.602986-1-kai.heng.feng@canonical.com
Signed-off-by: Sebastian Reichel <sebastian.reichel@collabora.com>
---
 drivers/power/supply/power_supply_core.c | 8 ++++----
 include/linux/power_supply.h             | 2 +-
 2 files changed, 5 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/power/supply/power_supply_core.c b/drivers/power/supply/power_supply_core.c
index 0b69fb7bafd8..416409e2fd6d 100644
--- a/drivers/power/supply/power_supply_core.c
+++ b/drivers/power/supply/power_supply_core.c
@@ -29,7 +29,7 @@
 struct class *power_supply_class;
 EXPORT_SYMBOL_GPL(power_supply_class);
 
-ATOMIC_NOTIFIER_HEAD(power_supply_notifier);
+BLOCKING_NOTIFIER_HEAD(power_supply_notifier);
 EXPORT_SYMBOL_GPL(power_supply_notifier);
 
 static struct device_type power_supply_dev_type;
@@ -97,7 +97,7 @@ static void power_supply_changed_work(struct work_struct *work)
 		class_for_each_device(power_supply_class, NULL, psy,
 				      __power_supply_changed_work);
 		power_supply_update_leds(psy);
-		atomic_notifier_call_chain(&power_supply_notifier,
+		blocking_notifier_call_chain(&power_supply_notifier,
 				PSY_EVENT_PROP_CHANGED, psy);
 		kobject_uevent(&psy->dev.kobj, KOBJ_CHANGE);
 		spin_lock_irqsave(&psy->changed_lock, flags);
@@ -1262,13 +1262,13 @@ static void power_supply_dev_release(struct device *dev)
 
 int power_supply_reg_notifier(struct notifier_block *nb)
 {
-	return atomic_notifier_chain_register(&power_supply_notifier, nb);
+	return blocking_notifier_chain_register(&power_supply_notifier, nb);
 }
 EXPORT_SYMBOL_GPL(power_supply_reg_notifier);
 
 void power_supply_unreg_notifier(struct notifier_block *nb)
 {
-	atomic_notifier_chain_unregister(&power_supply_notifier, nb);
+	blocking_notifier_chain_unregister(&power_supply_notifier, nb);
 }
 EXPORT_SYMBOL_GPL(power_supply_unreg_notifier);
 
diff --git a/include/linux/power_supply.h b/include/linux/power_supply.h
index a427f13c757f..85b86768c0b9 100644
--- a/include/linux/power_supply.h
+++ b/include/linux/power_supply.h
@@ -767,7 +767,7 @@ struct power_supply_battery_info {
 	int bti_resistance_tolerance;
 };
 
-extern struct atomic_notifier_head power_supply_notifier;
+extern struct blocking_notifier_head power_supply_notifier;
 extern int power_supply_reg_notifier(struct notifier_block *nb);
 extern void power_supply_unreg_notifier(struct notifier_block *nb);
 #if IS_ENABLED(CONFIG_POWER_SUPPLY)
-- 
cgit v1.2.3


From d090ec0df81e56556af3a2bf04a7e89347ae5784 Mon Sep 17 00:00:00 2001
From: Leonardo Bras <leobras@redhat.com>
Date: Thu, 31 Aug 2023 03:31:28 -0300
Subject: smp: Change function signatures to use call_single_data_t

call_single_data_t is a size-aligned typedef of struct __call_single_data.

This alignment is desirable in order to have smp_call_function*() avoid
bouncing an extra cacheline in case of an unaligned csd, given this
would hurt performance.

Since the removal of struct request->csd in commit 660e802c76c8
("blk-mq: use percpu csd to remote complete instead of per-rq csd") there
are no current users of smp_call_function*() with unaligned csd.

Change every 'struct __call_single_data' function parameter to
'call_single_data_t', so we have warnings if any new code tries to
introduce an smp_call_function*() call with unaligned csd.

Signed-off-by: Leonardo Bras <leobras@redhat.com>
Reviewed-by: Guo Ren <guoren@kernel.org>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Link: https://lore.kernel.org/r/20230831063129.335425-1-leobras@redhat.com
---
 include/linux/smp.h        |  2 +-
 include/trace/events/csd.h |  8 ++++----
 kernel/smp.c               | 26 +++++++++++++-------------
 kernel/up.c                |  2 +-
 4 files changed, 19 insertions(+), 19 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/smp.h b/include/linux/smp.h
index 91ea4a67f8ca..e87520dc2959 100644
--- a/include/linux/smp.h
+++ b/include/linux/smp.h
@@ -53,7 +53,7 @@ int smp_call_function_single(int cpuid, smp_call_func_t func, void *info,
 void on_each_cpu_cond_mask(smp_cond_func_t cond_func, smp_call_func_t func,
 			   void *info, bool wait, const struct cpumask *mask);
 
-int smp_call_function_single_async(int cpu, struct __call_single_data *csd);
+int smp_call_function_single_async(int cpu, call_single_data_t *csd);
 
 /*
  * Cpus stopping functions in panic. All have default weak definitions.
diff --git a/include/trace/events/csd.h b/include/trace/events/csd.h
index 67e9d01f80c2..58cc83b99c34 100644
--- a/include/trace/events/csd.h
+++ b/include/trace/events/csd.h
@@ -12,7 +12,7 @@ TRACE_EVENT(csd_queue_cpu,
 	TP_PROTO(const unsigned int cpu,
 		unsigned long callsite,
 		smp_call_func_t func,
-		struct __call_single_data *csd),
+		call_single_data_t *csd),
 
 	TP_ARGS(cpu, callsite, func, csd),
 
@@ -39,7 +39,7 @@ TRACE_EVENT(csd_queue_cpu,
  */
 DECLARE_EVENT_CLASS(csd_function,
 
-	TP_PROTO(smp_call_func_t func, struct __call_single_data *csd),
+	TP_PROTO(smp_call_func_t func, call_single_data_t *csd),
 
 	TP_ARGS(func, csd),
 
@@ -57,12 +57,12 @@ DECLARE_EVENT_CLASS(csd_function,
 );
 
 DEFINE_EVENT(csd_function, csd_function_entry,
-	TP_PROTO(smp_call_func_t func, struct __call_single_data *csd),
+	TP_PROTO(smp_call_func_t func, call_single_data_t *csd),
 	TP_ARGS(func, csd)
 );
 
 DEFINE_EVENT(csd_function, csd_function_exit,
-	TP_PROTO(smp_call_func_t func, struct __call_single_data *csd),
+	TP_PROTO(smp_call_func_t func, call_single_data_t *csd),
 	TP_ARGS(func, csd)
 );
 
diff --git a/kernel/smp.c b/kernel/smp.c
index 385179dae360..822fabb7e3e1 100644
--- a/kernel/smp.c
+++ b/kernel/smp.c
@@ -125,7 +125,7 @@ send_call_function_ipi_mask(struct cpumask *mask)
 }
 
 static __always_inline void
-csd_do_func(smp_call_func_t func, void *info, struct __call_single_data *csd)
+csd_do_func(smp_call_func_t func, void *info, call_single_data_t *csd)
 {
 	trace_csd_function_entry(func, csd);
 	func(info);
@@ -172,7 +172,7 @@ module_param(csd_lock_timeout, ulong, 0444);
 static atomic_t csd_bug_count = ATOMIC_INIT(0);
 
 /* Record current CSD work for current CPU, NULL to erase. */
-static void __csd_lock_record(struct __call_single_data *csd)
+static void __csd_lock_record(call_single_data_t *csd)
 {
 	if (!csd) {
 		smp_mb(); /* NULL cur_csd after unlock. */
@@ -187,13 +187,13 @@ static void __csd_lock_record(struct __call_single_data *csd)
 		  /* Or before unlock, as the case may be. */
 }
 
-static __always_inline void csd_lock_record(struct __call_single_data *csd)
+static __always_inline void csd_lock_record(call_single_data_t *csd)
 {
 	if (static_branch_unlikely(&csdlock_debug_enabled))
 		__csd_lock_record(csd);
 }
 
-static int csd_lock_wait_getcpu(struct __call_single_data *csd)
+static int csd_lock_wait_getcpu(call_single_data_t *csd)
 {
 	unsigned int csd_type;
 
@@ -208,7 +208,7 @@ static int csd_lock_wait_getcpu(struct __call_single_data *csd)
  * the CSD_TYPE_SYNC/ASYNC types provide the destination CPU,
  * so waiting on other types gets much less information.
  */
-static bool csd_lock_wait_toolong(struct __call_single_data *csd, u64 ts0, u64 *ts1, int *bug_id)
+static bool csd_lock_wait_toolong(call_single_data_t *csd, u64 ts0, u64 *ts1, int *bug_id)
 {
 	int cpu = -1;
 	int cpux;
@@ -272,7 +272,7 @@ static bool csd_lock_wait_toolong(struct __call_single_data *csd, u64 ts0, u64 *
  * previous function call. For multi-cpu calls its even more interesting
  * as we'll have to ensure no other cpu is observing our csd.
  */
-static void __csd_lock_wait(struct __call_single_data *csd)
+static void __csd_lock_wait(call_single_data_t *csd)
 {
 	int bug_id = 0;
 	u64 ts0, ts1;
@@ -286,7 +286,7 @@ static void __csd_lock_wait(struct __call_single_data *csd)
 	smp_acquire__after_ctrl_dep();
 }
 
-static __always_inline void csd_lock_wait(struct __call_single_data *csd)
+static __always_inline void csd_lock_wait(call_single_data_t *csd)
 {
 	if (static_branch_unlikely(&csdlock_debug_enabled)) {
 		__csd_lock_wait(csd);
@@ -296,17 +296,17 @@ static __always_inline void csd_lock_wait(struct __call_single_data *csd)
 	smp_cond_load_acquire(&csd->node.u_flags, !(VAL & CSD_FLAG_LOCK));
 }
 #else
-static void csd_lock_record(struct __call_single_data *csd)
+static void csd_lock_record(call_single_data_t *csd)
 {
 }
 
-static __always_inline void csd_lock_wait(struct __call_single_data *csd)
+static __always_inline void csd_lock_wait(call_single_data_t *csd)
 {
 	smp_cond_load_acquire(&csd->node.u_flags, !(VAL & CSD_FLAG_LOCK));
 }
 #endif
 
-static __always_inline void csd_lock(struct __call_single_data *csd)
+static __always_inline void csd_lock(call_single_data_t *csd)
 {
 	csd_lock_wait(csd);
 	csd->node.u_flags |= CSD_FLAG_LOCK;
@@ -319,7 +319,7 @@ static __always_inline void csd_lock(struct __call_single_data *csd)
 	smp_wmb();
 }
 
-static __always_inline void csd_unlock(struct __call_single_data *csd)
+static __always_inline void csd_unlock(call_single_data_t *csd)
 {
 	WARN_ON(!(csd->node.u_flags & CSD_FLAG_LOCK));
 
@@ -372,7 +372,7 @@ void __smp_call_single_queue(int cpu, struct llist_node *node)
  * for execution on the given CPU. data must already have
  * ->func, ->info, and ->flags set.
  */
-static int generic_exec_single(int cpu, struct __call_single_data *csd)
+static int generic_exec_single(int cpu, call_single_data_t *csd)
 {
 	if (cpu == smp_processor_id()) {
 		smp_call_func_t func = csd->func;
@@ -658,7 +658,7 @@ EXPORT_SYMBOL(smp_call_function_single);
  *
  * Return: %0 on success or negative errno value on error
  */
-int smp_call_function_single_async(int cpu, struct __call_single_data *csd)
+int smp_call_function_single_async(int cpu, call_single_data_t *csd)
 {
 	int err = 0;
 
diff --git a/kernel/up.c b/kernel/up.c
index a38b8b095251..df50828cc2f0 100644
--- a/kernel/up.c
+++ b/kernel/up.c
@@ -25,7 +25,7 @@ int smp_call_function_single(int cpu, void (*func) (void *info), void *info,
 }
 EXPORT_SYMBOL(smp_call_function_single);
 
-int smp_call_function_single_async(int cpu, struct __call_single_data *csd)
+int smp_call_function_single_async(int cpu, call_single_data_t *csd)
 {
 	unsigned long flags;
 
-- 
cgit v1.2.3


From 85be6d842447067ce76047a14d4258c96fd33b7b Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Tue, 15 Aug 2023 12:52:04 +0200
Subject: cleanup: Make no_free_ptr() __must_check

recent discussion brought about the realization that it makes sense for
no_free_ptr() to have __must_check semantics in order to avoid leaking
the resource.

Additionally, add a few comments to clarify why/how things work.

All credit to Linus on how to combine __must_check and the
stmt-expression.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Link: https://lkml.kernel.org/r/20230816103102.GF980931@hirez.programming.kicks-ass.net
---
 include/linux/cleanup.h | 39 ++++++++++++++++++++++++++++++++++++---
 1 file changed, 36 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/cleanup.h b/include/linux/cleanup.h
index 53f1a7a932b0..9f1a9c455b68 100644
--- a/include/linux/cleanup.h
+++ b/include/linux/cleanup.h
@@ -7,8 +7,9 @@
 /*
  * DEFINE_FREE(name, type, free):
  *	simple helper macro that defines the required wrapper for a __free()
- *	based cleanup function. @free is an expression using '_T' to access
- *	the variable.
+ *	based cleanup function. @free is an expression using '_T' to access the
+ *	variable. @free should typically include a NULL test before calling a
+ *	function, see the example below.
  *
  * __free(name):
  *	variable attribute to add a scoped based cleanup to the variable.
@@ -17,6 +18,9 @@
  *	like a non-atomic xchg(var, NULL), such that the cleanup function will
  *	be inhibited -- provided it sanely deals with a NULL value.
  *
+ *	NOTE: this has __must_check semantics so that it is harder to accidentally
+ *	leak the resource.
+ *
  * return_ptr(p):
  *	returns p while inhibiting the __free().
  *
@@ -24,6 +28,8 @@
  *
  * DEFINE_FREE(kfree, void *, if (_T) kfree(_T))
  *
+ * void *alloc_obj(...)
+ * {
  *	struct obj *p __free(kfree) = kmalloc(...);
  *	if (!p)
  *		return NULL;
@@ -32,6 +38,24 @@
  *		return NULL;
  *
  *	return_ptr(p);
+ * }
+ *
+ * NOTE: the DEFINE_FREE()'s @free expression includes a NULL test even though
+ * kfree() is fine to be called with a NULL value. This is on purpose. This way
+ * the compiler sees the end of our alloc_obj() function as:
+ *
+ *	tmp = p;
+ *	p = NULL;
+ *	if (p)
+ *		kfree(p);
+ *	return tmp;
+ *
+ * And through the magic of value-propagation and dead-code-elimination, it
+ * eliminates the actual cleanup call and compiles into:
+ *
+ *	return p;
+ *
+ * Without the NULL test it turns into a mess and the compiler can't help us.
  */
 
 #define DEFINE_FREE(_name, _type, _free) \
@@ -39,8 +63,17 @@
 
 #define __free(_name)	__cleanup(__free_##_name)
 
+#define __get_and_null_ptr(p) \
+	({ __auto_type __ptr = &(p); \
+	   __auto_type __val = *__ptr; \
+	   *__ptr = NULL;  __val; })
+
+static inline __must_check
+const volatile void * __must_check_fn(const volatile void *val)
+{ return val; }
+
 #define no_free_ptr(p) \
-	({ __auto_type __ptr = (p); (p) = NULL; __ptr; })
+	((typeof(p)) __must_check_fn(__get_and_null_ptr(p)))
 
 #define return_ptr(p)	return no_free_ptr(p)
 
-- 
cgit v1.2.3


From 6fb45460615358157a6d3c990e74f9c1395247e2 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Fri, 9 Jun 2023 20:45:16 +0200
Subject: sched: Simplify tg_set_cfs_bandwidth()

Use guards to reduce gotos and simplify control flow.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/cpu.h |  2 ++
 kernel/sched/core.c | 38 +++++++++++++++++++-------------------
 2 files changed, 21 insertions(+), 19 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/cpu.h b/include/linux/cpu.h
index 0abd60a7987b..f19f56501809 100644
--- a/include/linux/cpu.h
+++ b/include/linux/cpu.h
@@ -153,6 +153,8 @@ static inline int remove_cpu(unsigned int cpu) { return -EPERM; }
 static inline void smp_shutdown_nonboot_cpus(unsigned int primary_cpu) { }
 #endif	/* !CONFIG_HOTPLUG_CPU */
 
+DEFINE_LOCK_GUARD_0(cpus_read_lock, cpus_read_lock(), cpus_read_unlock())
+
 #ifdef CONFIG_PM_SLEEP_SMP
 extern int freeze_secondary_cpus(int primary);
 extern void thaw_secondary_cpus(void);
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index a3f4fb8a6841..5d9f36359461 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -10802,11 +10802,12 @@ static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota,
 	 * Prevent race between setting of cfs_rq->runtime_enabled and
 	 * unthrottle_offline_cfs_rqs().
 	 */
-	cpus_read_lock();
-	mutex_lock(&cfs_constraints_mutex);
+	guard(cpus_read_lock)();
+	guard(mutex)(&cfs_constraints_mutex);
+
 	ret = __cfs_schedulable(tg, period, quota);
 	if (ret)
-		goto out_unlock;
+		return ret;
 
 	runtime_enabled = quota != RUNTIME_INF;
 	runtime_was_enabled = cfs_b->quota != RUNTIME_INF;
@@ -10816,39 +10817,38 @@ static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota,
 	 */
 	if (runtime_enabled && !runtime_was_enabled)
 		cfs_bandwidth_usage_inc();
-	raw_spin_lock_irq(&cfs_b->lock);
-	cfs_b->period = ns_to_ktime(period);
-	cfs_b->quota = quota;
-	cfs_b->burst = burst;
 
-	__refill_cfs_bandwidth_runtime(cfs_b);
+	scoped_guard (raw_spinlock_irq, &cfs_b->lock) {
+		cfs_b->period = ns_to_ktime(period);
+		cfs_b->quota = quota;
+		cfs_b->burst = burst;
 
-	/* Restart the period timer (if active) to handle new period expiry: */
-	if (runtime_enabled)
-		start_cfs_bandwidth(cfs_b);
+		__refill_cfs_bandwidth_runtime(cfs_b);
 
-	raw_spin_unlock_irq(&cfs_b->lock);
+		/*
+		 * Restart the period timer (if active) to handle new
+		 * period expiry:
+		 */
+		if (runtime_enabled)
+			start_cfs_bandwidth(cfs_b);
+	}
 
 	for_each_online_cpu(i) {
 		struct cfs_rq *cfs_rq = tg->cfs_rq[i];
 		struct rq *rq = cfs_rq->rq;
-		struct rq_flags rf;
 
-		rq_lock_irq(rq, &rf);
+		guard(rq_lock_irq)(rq);
 		cfs_rq->runtime_enabled = runtime_enabled;
 		cfs_rq->runtime_remaining = 0;
 
 		if (cfs_rq->throttled)
 			unthrottle_cfs_rq(cfs_rq);
-		rq_unlock_irq(rq, &rf);
 	}
+
 	if (runtime_was_enabled && !runtime_enabled)
 		cfs_bandwidth_usage_dec();
-out_unlock:
-	mutex_unlock(&cfs_constraints_mutex);
-	cpus_read_unlock();
 
-	return ret;
+	return 0;
 }
 
 static int tg_set_cfs_quota(struct task_group *tg, long cfs_quota_us)
-- 
cgit v1.2.3


From 89de9921dfa77e43b985bde99a6031ab66511020 Mon Sep 17 00:00:00 2001
From: Paul M Stillwell Jr <paul.m.stillwell.jr@intel.com>
Date: Wed, 6 Sep 2023 13:57:01 -0600
Subject: virtchnl: Add CRC stripping capability

Some VFs may want to disable CRC stripping on incoming packets so create
an offload for that. The VF already sends information about configuring
its RX queues so use that structure to indicate that the CRC stripping
should be enabled or not.

Signed-off-by: Paul M Stillwell Jr <paul.m.stillwell.jr@intel.com>
Reviewed-by: Jesse Brandeburg <jesse.brandeburg@intel.com>
Reviewed-by: Paul Menzel <pmenzel@molgen.mpg.de>
Signed-off-by: Ahmed Zaki <ahmed.zaki@intel.com>
Signed-off-by: Tony Nguyen <anthony.l.nguyen@intel.com>
---
 include/linux/avf/virtchnl.h | 11 +++++++++--
 1 file changed, 9 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/avf/virtchnl.h b/include/linux/avf/virtchnl.h
index d0807ad43f93..dd71d3009771 100644
--- a/include/linux/avf/virtchnl.h
+++ b/include/linux/avf/virtchnl.h
@@ -240,6 +240,7 @@ VIRTCHNL_CHECK_STRUCT_LEN(16, virtchnl_vsi_resource);
 #define VIRTCHNL_VF_OFFLOAD_REQ_QUEUES		BIT(6)
 /* used to negotiate communicating link speeds in Mbps */
 #define VIRTCHNL_VF_CAP_ADV_LINK_SPEED		BIT(7)
+#define  VIRTCHNL_VF_OFFLOAD_CRC		BIT(10)
 #define VIRTCHNL_VF_OFFLOAD_VLAN_V2		BIT(15)
 #define VIRTCHNL_VF_OFFLOAD_VLAN		BIT(16)
 #define VIRTCHNL_VF_OFFLOAD_RX_POLLING		BIT(17)
@@ -295,7 +296,13 @@ VIRTCHNL_CHECK_STRUCT_LEN(24, virtchnl_txq_info);
 /* VIRTCHNL_OP_CONFIG_RX_QUEUE
  * VF sends this message to set up parameters for one RX queue.
  * External data buffer contains one instance of virtchnl_rxq_info.
- * PF configures requested queue and returns a status code.
+ * PF configures requested queue and returns a status code. The
+ * crc_disable flag disables CRC stripping on the VF. Setting
+ * the crc_disable flag to 1 will disable CRC stripping for each
+ * queue in the VF where the flag is set. The VIRTCHNL_VF_OFFLOAD_CRC
+ * offload must have been set prior to sending this info or the PF
+ * will ignore the request. This flag should be set the same for
+ * all of the queues for a VF.
  */
 
 /* Rx queue config info */
@@ -307,7 +314,7 @@ struct virtchnl_rxq_info {
 	u16 splithdr_enabled; /* deprecated with AVF 1.0 */
 	u32 databuffer_size;
 	u32 max_pkt_size;
-	u8 pad0;
+	u8 crc_disable;
 	u8 rxdid;
 	u8 pad1[2];
 	u64 dma_ring_addr;
-- 
cgit v1.2.3


From b193a78ddb5ee7dba074d3f28dc050069ba083c0 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <trond.myklebust@hammerspace.com>
Date: Mon, 4 Sep 2023 12:34:40 -0400
Subject: NFS: Use the correct commit info in nfs_join_page_group()

Ensure that nfs_clear_request_commit() updates the correct counters when
it removes them from the commit list.

Fixes: ed5d588fe47f ("NFS: Try to join page groups before an O_DIRECT retransmission")
Signed-off-by: Trond Myklebust <trond.myklebust@hammerspace.com>
Signed-off-by: Anna Schumaker <Anna.Schumaker@Netapp.com>
---
 fs/nfs/direct.c          |  8 +++++---
 fs/nfs/write.c           | 23 ++++++++++++-----------
 include/linux/nfs_page.h |  4 +++-
 3 files changed, 20 insertions(+), 15 deletions(-)

(limited to 'include/linux')

diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c
index a53e50123499..3391c8b97da5 100644
--- a/fs/nfs/direct.c
+++ b/fs/nfs/direct.c
@@ -498,7 +498,9 @@ static void nfs_direct_add_page_head(struct list_head *list,
 	kref_get(&head->wb_kref);
 }
 
-static void nfs_direct_join_group(struct list_head *list, struct inode *inode)
+static void nfs_direct_join_group(struct list_head *list,
+				  struct nfs_commit_info *cinfo,
+				  struct inode *inode)
 {
 	struct nfs_page *req, *subreq;
 
@@ -520,7 +522,7 @@ static void nfs_direct_join_group(struct list_head *list, struct inode *inode)
 				nfs_release_request(subreq);
 			}
 		} while ((subreq = subreq->wb_this_page) != req);
-		nfs_join_page_group(req, inode);
+		nfs_join_page_group(req, cinfo, inode);
 	}
 }
 
@@ -545,7 +547,7 @@ static void nfs_direct_write_reschedule(struct nfs_direct_req *dreq)
 	nfs_init_cinfo_from_dreq(&cinfo, dreq);
 	nfs_direct_write_scan_commit_list(dreq->inode, &reqs, &cinfo);
 
-	nfs_direct_join_group(&reqs, dreq->inode);
+	nfs_direct_join_group(&reqs, &cinfo, dreq->inode);
 
 	nfs_clear_pnfs_ds_commit_verifiers(&dreq->ds_cinfo);
 	get_dreq(dreq);
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index f4cca8f00c0c..8c1ee1a1a28f 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -59,7 +59,8 @@ static const struct nfs_pgio_completion_ops nfs_async_write_completion_ops;
 static const struct nfs_commit_completion_ops nfs_commit_completion_ops;
 static const struct nfs_rw_ops nfs_rw_write_ops;
 static void nfs_inode_remove_request(struct nfs_page *req);
-static void nfs_clear_request_commit(struct nfs_page *req);
+static void nfs_clear_request_commit(struct nfs_commit_info *cinfo,
+				     struct nfs_page *req);
 static void nfs_init_cinfo_from_inode(struct nfs_commit_info *cinfo,
 				      struct inode *inode);
 static struct nfs_page *
@@ -502,8 +503,8 @@ nfs_destroy_unlinked_subrequests(struct nfs_page *destroy_list,
  * the (former) group.  All subrequests are removed from any write or commit
  * lists, unlinked from the group and destroyed.
  */
-void
-nfs_join_page_group(struct nfs_page *head, struct inode *inode)
+void nfs_join_page_group(struct nfs_page *head, struct nfs_commit_info *cinfo,
+			 struct inode *inode)
 {
 	struct nfs_page *subreq;
 	struct nfs_page *destroy_list = NULL;
@@ -533,7 +534,7 @@ nfs_join_page_group(struct nfs_page *head, struct inode *inode)
 	 * Commit list removal accounting is done after locks are dropped */
 	subreq = head;
 	do {
-		nfs_clear_request_commit(subreq);
+		nfs_clear_request_commit(cinfo, subreq);
 		subreq = subreq->wb_this_page;
 	} while (subreq != head);
 
@@ -566,8 +567,10 @@ static struct nfs_page *nfs_lock_and_join_requests(struct folio *folio)
 {
 	struct inode *inode = folio_file_mapping(folio)->host;
 	struct nfs_page *head;
+	struct nfs_commit_info cinfo;
 	int ret;
 
+	nfs_init_cinfo_from_inode(&cinfo, inode);
 	/*
 	 * A reference is taken only on the head request which acts as a
 	 * reference to the whole page group - the group will not be destroyed
@@ -584,7 +587,7 @@ static struct nfs_page *nfs_lock_and_join_requests(struct folio *folio)
 		return ERR_PTR(ret);
 	}
 
-	nfs_join_page_group(head, inode);
+	nfs_join_page_group(head, &cinfo, inode);
 
 	return head;
 }
@@ -955,18 +958,16 @@ static void nfs_folio_clear_commit(struct folio *folio)
 }
 
 /* Called holding the request lock on @req */
-static void
-nfs_clear_request_commit(struct nfs_page *req)
+static void nfs_clear_request_commit(struct nfs_commit_info *cinfo,
+				     struct nfs_page *req)
 {
 	if (test_bit(PG_CLEAN, &req->wb_flags)) {
 		struct nfs_open_context *ctx = nfs_req_openctx(req);
 		struct inode *inode = d_inode(ctx->dentry);
-		struct nfs_commit_info cinfo;
 
-		nfs_init_cinfo_from_inode(&cinfo, inode);
 		mutex_lock(&NFS_I(inode)->commit_mutex);
-		if (!pnfs_clear_request_commit(req, &cinfo)) {
-			nfs_request_remove_commit_list(req, &cinfo);
+		if (!pnfs_clear_request_commit(req, cinfo)) {
+			nfs_request_remove_commit_list(req, cinfo);
 		}
 		mutex_unlock(&NFS_I(inode)->commit_mutex);
 		nfs_folio_clear_commit(nfs_page_to_folio(req));
diff --git a/include/linux/nfs_page.h b/include/linux/nfs_page.h
index aa9f4c6ebe26..1c315f854ea8 100644
--- a/include/linux/nfs_page.h
+++ b/include/linux/nfs_page.h
@@ -157,7 +157,9 @@ extern	void nfs_unlock_request(struct nfs_page *req);
 extern	void nfs_unlock_and_release_request(struct nfs_page *);
 extern	struct nfs_page *nfs_page_group_lock_head(struct nfs_page *req);
 extern	int nfs_page_group_lock_subrequests(struct nfs_page *head);
-extern	void nfs_join_page_group(struct nfs_page *head, struct inode *inode);
+extern void nfs_join_page_group(struct nfs_page *head,
+				struct nfs_commit_info *cinfo,
+				struct inode *inode);
 extern int nfs_page_group_lock(struct nfs_page *);
 extern void nfs_page_group_unlock(struct nfs_page *);
 extern bool nfs_page_group_sync_on_bit(struct nfs_page *, unsigned int);
-- 
cgit v1.2.3


From 806a3bc421a115fbb287c1efce63a48c54ee804b Mon Sep 17 00:00:00 2001
From: Olga Kornievskaia <kolga@netapp.com>
Date: Wed, 30 Aug 2023 15:29:34 -0400
Subject: NFSv4.1: fix pnfs MDS=DS session trunking

Currently, when GETDEVICEINFO returns multiple locations where each
is a different IP but the server's identity is same as MDS, then
nfs4_set_ds_client() finds the existing nfs_client structure which
has the MDS's max_connect value (and if it's 1), then the 1st IP
on the DS's list will get dropped due to MDS trunking rules. Other
IPs would be added as they fall under the pnfs trunking rules.

For the list of IPs the 1st goes thru calling nfs4_set_ds_client()
which will eventually call nfs4_add_trunk() and call into
rpc_clnt_test_and_add_xprt() which has the check for MDS trunking.
The other IPs (after the 1st one), would call rpc_clnt_add_xprt()
which doesn't go thru that check.

nfs4_add_trunk() is called when MDS trunking is happening and it
needs to enforce the usage of max_connect mount option of the
1st mount. However, this shouldn't be applied to pnfs flow.

Instead, this patch proposed to treat MDS=DS as DS trunking and
make sure that MDS's max_connect limit does not apply to the
1st IP returned in the GETDEVICEINFO list. It does so by
marking the newly created client with a new flag NFS_CS_PNFS
which then used to pass max_connect value to use into the
rpc_clnt_test_and_add_xprt() instead of the existing rpc
client's max_connect value set by the MDS connection.

For example, mount was done without max_connect value set
so MDS's rpc client has cl_max_connect=1. Upon calling into
rpc_clnt_test_and_add_xprt() and using rpc client's value,
the caller passes in max_connect value which is previously
been set in the pnfs path (as a part of handling
GETDEVICEINFO list of IPs) in nfs4_set_ds_client().

However, when NFS_CS_PNFS flag is not set and we know we
are doing MDS trunking, comparing a new IP of the same
server, we then set the max_connect value to the
existing MDS's value and pass that into
rpc_clnt_test_and_add_xprt().

Fixes: dc48e0abee24 ("SUNRPC enforce creation of no more than max_connect xprts")
Signed-off-by: Olga Kornievskaia <kolga@netapp.com>
Signed-off-by: Anna Schumaker <Anna.Schumaker@Netapp.com>
---
 fs/nfs/nfs4client.c       |  6 +++++-
 include/linux/nfs_fs_sb.h |  1 +
 net/sunrpc/clnt.c         | 11 +++++++----
 3 files changed, 13 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/fs/nfs/nfs4client.c b/fs/nfs/nfs4client.c
index 27fb25567ce7..11e3a285594c 100644
--- a/fs/nfs/nfs4client.c
+++ b/fs/nfs/nfs4client.c
@@ -417,6 +417,8 @@ static void nfs4_add_trunk(struct nfs_client *clp, struct nfs_client *old)
 		.net = old->cl_net,
 		.servername = old->cl_hostname,
 	};
+	int max_connect = test_bit(NFS_CS_PNFS, &clp->cl_flags) ?
+		clp->cl_max_connect : old->cl_max_connect;
 
 	if (clp->cl_proto != old->cl_proto)
 		return;
@@ -430,7 +432,7 @@ static void nfs4_add_trunk(struct nfs_client *clp, struct nfs_client *old)
 	xprt_args.addrlen = clp_salen;
 
 	rpc_clnt_add_xprt(old->cl_rpcclient, &xprt_args,
-			  rpc_clnt_test_and_add_xprt, NULL);
+			  rpc_clnt_test_and_add_xprt, &max_connect);
 }
 
 /**
@@ -1010,6 +1012,8 @@ struct nfs_client *nfs4_set_ds_client(struct nfs_server *mds_srv,
 		__set_bit(NFS_CS_NORESVPORT, &cl_init.init_flags);
 
 	__set_bit(NFS_CS_DS, &cl_init.init_flags);
+	__set_bit(NFS_CS_PNFS, &cl_init.init_flags);
+	cl_init.max_connect = NFS_MAX_TRANSPORTS;
 	/*
 	 * Set an authflavor equual to the MDS value. Use the MDS nfs_client
 	 * cl_ipaddr so as to use the same EXCHANGE_ID co_ownerid as the MDS
diff --git a/include/linux/nfs_fs_sb.h b/include/linux/nfs_fs_sb.h
index 20eeba8b009d..cd628c4b011e 100644
--- a/include/linux/nfs_fs_sb.h
+++ b/include/linux/nfs_fs_sb.h
@@ -48,6 +48,7 @@ struct nfs_client {
 #define NFS_CS_NOPING		6		/* - don't ping on connect */
 #define NFS_CS_DS		7		/* - Server is a DS */
 #define NFS_CS_REUSEPORT	8		/* - reuse src port on reconnect */
+#define NFS_CS_PNFS		9		/* - Server used for pnfs */
 	struct sockaddr_storage	cl_addr;	/* server identifier */
 	size_t			cl_addrlen;
 	char *			cl_hostname;	/* hostname of server */
diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c
index edbcfdd84e1f..37b0b212b934 100644
--- a/net/sunrpc/clnt.c
+++ b/net/sunrpc/clnt.c
@@ -2908,19 +2908,22 @@ static const struct rpc_call_ops rpc_cb_add_xprt_call_ops = {
  * @clnt: pointer to struct rpc_clnt
  * @xps: pointer to struct rpc_xprt_switch,
  * @xprt: pointer struct rpc_xprt
- * @dummy: unused
+ * @in_max_connect: pointer to the max_connect value for the passed in xprt transport
  */
 int rpc_clnt_test_and_add_xprt(struct rpc_clnt *clnt,
 		struct rpc_xprt_switch *xps, struct rpc_xprt *xprt,
-		void *dummy)
+		void *in_max_connect)
 {
 	struct rpc_cb_add_xprt_calldata *data;
 	struct rpc_task *task;
+	int max_connect = clnt->cl_max_connect;
 
-	if (xps->xps_nunique_destaddr_xprts + 1 > clnt->cl_max_connect) {
+	if (in_max_connect)
+		max_connect = *(int *)in_max_connect;
+	if (xps->xps_nunique_destaddr_xprts + 1 > max_connect) {
 		rcu_read_lock();
 		pr_warn("SUNRPC: reached max allowed number (%d) did not add "
-			"transport to server: %s\n", clnt->cl_max_connect,
+			"transport to server: %s\n", max_connect,
 			rpc_peeraddr2str(clnt, RPC_DISPLAY_ADDR));
 		rcu_read_unlock();
 		return -EINVAL;
-- 
cgit v1.2.3


From e4c89f9380017b6b2e63836e2de1af8eb4535384 Mon Sep 17 00:00:00 2001
From: Maximilian Luz <luzmaximilian@gmail.com>
Date: Sun, 27 Aug 2023 23:14:04 +0200
Subject: lib/ucs2_string: Add UCS-2 strscpy function

Add a ucs2_strscpy() function for UCS-2 strings. The behavior is
equivalent to the standard strscpy() function, just for 16-bit character
UCS-2 strings.

Signed-off-by: Maximilian Luz <luzmaximilian@gmail.com>
Reviewed-by: Bjorn Andersson <andersson@kernel.org>
Link: https://lore.kernel.org/r/20230827211408.689076-2-luzmaximilian@gmail.com
Signed-off-by: Bjorn Andersson <andersson@kernel.org>
---
 include/linux/ucs2_string.h |  1 +
 lib/ucs2_string.c           | 52 +++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 53 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/ucs2_string.h b/include/linux/ucs2_string.h
index cf3ada3e820e..c499ae809c7d 100644
--- a/include/linux/ucs2_string.h
+++ b/include/linux/ucs2_string.h
@@ -10,6 +10,7 @@ typedef u16 ucs2_char_t;
 unsigned long ucs2_strnlen(const ucs2_char_t *s, size_t maxlength);
 unsigned long ucs2_strlen(const ucs2_char_t *s);
 unsigned long ucs2_strsize(const ucs2_char_t *data, unsigned long maxlength);
+ssize_t ucs2_strscpy(ucs2_char_t *dst, const ucs2_char_t *src, size_t count);
 int ucs2_strncmp(const ucs2_char_t *a, const ucs2_char_t *b, size_t len);
 
 unsigned long ucs2_utf8size(const ucs2_char_t *src);
diff --git a/lib/ucs2_string.c b/lib/ucs2_string.c
index 0a559a42359b..9308bcfb2ad5 100644
--- a/lib/ucs2_string.c
+++ b/lib/ucs2_string.c
@@ -32,6 +32,58 @@ ucs2_strsize(const ucs2_char_t *data, unsigned long maxlength)
 }
 EXPORT_SYMBOL(ucs2_strsize);
 
+/**
+ * ucs2_strscpy() - Copy a UCS2 string into a sized buffer.
+ *
+ * @dst: Pointer to the destination buffer where to copy the string to.
+ * @src: Pointer to the source buffer where to copy the string from.
+ * @count: Size of the destination buffer, in UCS2 (16-bit) characters.
+ *
+ * Like strscpy(), only for UCS2 strings.
+ *
+ * Copy the source string @src, or as much of it as fits, into the destination
+ * buffer @dst. The behavior is undefined if the string buffers overlap. The
+ * destination buffer @dst is always NUL-terminated, unless it's zero-sized.
+ *
+ * Return: The number of characters copied into @dst (excluding the trailing
+ * %NUL terminator) or -E2BIG if @count is 0 or @src was truncated due to the
+ * destination buffer being too small.
+ */
+ssize_t ucs2_strscpy(ucs2_char_t *dst, const ucs2_char_t *src, size_t count)
+{
+	long res;
+
+	/*
+	 * Ensure that we have a valid amount of space. We need to store at
+	 * least one NUL-character.
+	 */
+	if (count == 0 || WARN_ON_ONCE(count > INT_MAX / sizeof(*dst)))
+		return -E2BIG;
+
+	/*
+	 * Copy at most 'count' characters, return early if we find a
+	 * NUL-terminator.
+	 */
+	for (res = 0; res < count; res++) {
+		ucs2_char_t c;
+
+		c = src[res];
+		dst[res] = c;
+
+		if (!c)
+			return res;
+	}
+
+	/*
+	 * The loop above terminated without finding a NUL-terminator,
+	 * exceeding the 'count': Enforce proper NUL-termination and return
+	 * error.
+	 */
+	dst[count - 1] = 0;
+	return -E2BIG;
+}
+EXPORT_SYMBOL(ucs2_strscpy);
+
 int
 ucs2_strncmp(const ucs2_char_t *a, const ucs2_char_t *b, size_t len)
 {
-- 
cgit v1.2.3


From 00b1248606ba3979ccae30ed11df8cdc1a84245a Mon Sep 17 00:00:00 2001
From: Maximilian Luz <luzmaximilian@gmail.com>
Date: Sun, 27 Aug 2023 23:14:05 +0200
Subject: firmware: qcom_scm: Add support for Qualcomm Secure Execution
 Environment SCM interface

Add support for SCM calls to Secure OS and the Secure Execution
Environment (SEE) residing in the TrustZone (TZ) via the QSEECOM
interface. This allows communication with Secure/TZ applications, for
example 'uefisecapp' managing access to UEFI variables.

For better separation, make qcom_scm spin up a dedicated child
(platform) device in case QSEECOM support has been detected. The
corresponding driver for this device is then responsible for managing
any QSEECOM clients. Specifically, this driver attempts to automatically
detect known and supported applications, creating a client (auxiliary)
device for each one. The respective client/auxiliary driver is then
responsible for managing and communicating with the application.

While this patch introduces only a very basic interface without the more
advanced features (such as re-entrant and blocking SCM calls and
listeners/callbacks), this is enough to talk to the aforementioned
'uefisecapp'.

Signed-off-by: Maximilian Luz <luzmaximilian@gmail.com>
Reviewed-by: Johan Hovold <johan+linaro@kernel.org>
Link: https://lore.kernel.org/r/20230827211408.689076-3-luzmaximilian@gmail.com
Signed-off-by: Bjorn Andersson <andersson@kernel.org>
---
 MAINTAINERS                                |   6 +
 drivers/firmware/Kconfig                   |  16 ++
 drivers/firmware/Makefile                  |   1 +
 drivers/firmware/qcom_qseecom.c            | 118 +++++++++
 drivers/firmware/qcom_scm.c                | 394 +++++++++++++++++++++++++++++
 include/linux/firmware/qcom/qcom_qseecom.h |  46 ++++
 include/linux/firmware/qcom/qcom_scm.h     |  22 ++
 7 files changed, 603 insertions(+)
 create mode 100644 drivers/firmware/qcom_qseecom.c
 create mode 100644 include/linux/firmware/qcom/qcom_qseecom.h

(limited to 'include/linux')

diff --git a/MAINTAINERS b/MAINTAINERS
index 90f13281d297..8f373d8bdd6a 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -17800,6 +17800,12 @@ S:	Maintained
 F:	Documentation/devicetree/bindings/mtd/qcom,nandc.yaml
 F:	drivers/mtd/nand/raw/qcom_nandc.c
 
+QUALCOMM QSEECOM DRIVER
+M:	Maximilian Luz <luzmaximilian@gmail.com>
+L:	linux-arm-msm@vger.kernel.org
+S:	Maintained
+F:	drivers/firmware/qcom_qseecom.c
+
 QUALCOMM RMNET DRIVER
 M:	Subash Abhinov Kasiviswanathan <quic_subashab@quicinc.com>
 M:	Sean Tranchetti <quic_stranche@quicinc.com>
diff --git a/drivers/firmware/Kconfig b/drivers/firmware/Kconfig
index b59e3041fd62..3e41efe494d4 100644
--- a/drivers/firmware/Kconfig
+++ b/drivers/firmware/Kconfig
@@ -226,6 +226,22 @@ config QCOM_SCM_DOWNLOAD_MODE_DEFAULT
 
 	  Say Y here to enable "download mode" by default.
 
+config QCOM_QSEECOM
+	bool "Qualcomm QSEECOM interface driver"
+	depends on QCOM_SCM=y
+	help
+	  Various Qualcomm SoCs have a Secure Execution Environment (SEE) running
+	  in the Trust Zone. This module provides an interface to that via the
+	  QSEECOM mechanism, using SCM calls.
+
+	  The QSEECOM interface allows, among other things, access to applications
+	  running in the SEE. An example of such an application is 'uefisecapp',
+	  which is required to access UEFI variables on certain systems. If
+	  selected, the interface will also attempt to detect and register client
+	  devices for supported applications.
+
+	  Select Y here to enable the QSEECOM interface driver.
+
 config SYSFB
 	bool
 	select BOOT_VESA_SUPPORT
diff --git a/drivers/firmware/Makefile b/drivers/firmware/Makefile
index 28fcddcd688f..aa48e0821b7d 100644
--- a/drivers/firmware/Makefile
+++ b/drivers/firmware/Makefile
@@ -20,6 +20,7 @@ obj-$(CONFIG_RASPBERRYPI_FIRMWARE) += raspberrypi.o
 obj-$(CONFIG_FW_CFG_SYSFS)	+= qemu_fw_cfg.o
 obj-$(CONFIG_QCOM_SCM)		+= qcom-scm.o
 qcom-scm-objs += qcom_scm.o qcom_scm-smc.o qcom_scm-legacy.o
+obj-$(CONFIG_QCOM_QSEECOM)	+= qcom_qseecom.o
 obj-$(CONFIG_SYSFB)		+= sysfb.o
 obj-$(CONFIG_SYSFB_SIMPLEFB)	+= sysfb_simplefb.o
 obj-$(CONFIG_TI_SCI_PROTOCOL)	+= ti_sci.o
diff --git a/drivers/firmware/qcom_qseecom.c b/drivers/firmware/qcom_qseecom.c
new file mode 100644
index 000000000000..bba32096c956
--- /dev/null
+++ b/drivers/firmware/qcom_qseecom.c
@@ -0,0 +1,118 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Driver for Qualcomm Secure Execution Environment (SEE) interface (QSEECOM).
+ * Responsible for setting up and managing QSEECOM client devices.
+ *
+ * Copyright (C) 2023 Maximilian Luz <luzmaximilian@gmail.com>
+ */
+#include <linux/auxiliary_bus.h>
+#include <linux/module.h>
+#include <linux/platform_device.h>
+#include <linux/slab.h>
+#include <linux/types.h>
+
+#include <linux/firmware/qcom/qcom_qseecom.h>
+#include <linux/firmware/qcom/qcom_scm.h>
+
+struct qseecom_app_desc {
+	const char *app_name;
+	const char *dev_name;
+};
+
+static void qseecom_client_release(struct device *dev)
+{
+	struct qseecom_client *client;
+
+	client = container_of(dev, struct qseecom_client, aux_dev.dev);
+	kfree(client);
+}
+
+static void qseecom_client_remove(void *data)
+{
+	struct qseecom_client *client = data;
+
+	auxiliary_device_delete(&client->aux_dev);
+	auxiliary_device_uninit(&client->aux_dev);
+}
+
+static int qseecom_client_register(struct platform_device *qseecom_dev,
+				   const struct qseecom_app_desc *desc)
+{
+	struct qseecom_client *client;
+	u32 app_id;
+	int ret;
+
+	/* Try to find the app ID, skip device if not found */
+	ret = qcom_scm_qseecom_app_get_id(desc->app_name, &app_id);
+	if (ret)
+		return ret == -ENOENT ? 0 : ret;
+
+	dev_info(&qseecom_dev->dev, "setting up client for %s\n", desc->app_name);
+
+	/* Allocate and set-up the client device */
+	client = kzalloc(sizeof(*client), GFP_KERNEL);
+	if (!client)
+		return -ENOMEM;
+
+	client->aux_dev.name = desc->dev_name;
+	client->aux_dev.dev.parent = &qseecom_dev->dev;
+	client->aux_dev.dev.release = qseecom_client_release;
+	client->app_id = app_id;
+
+	ret = auxiliary_device_init(&client->aux_dev);
+	if (ret) {
+		kfree(client);
+		return ret;
+	}
+
+	ret = auxiliary_device_add(&client->aux_dev);
+	if (ret) {
+		auxiliary_device_uninit(&client->aux_dev);
+		return ret;
+	}
+
+	ret = devm_add_action_or_reset(&qseecom_dev->dev, qseecom_client_remove, client);
+	if (ret)
+		return ret;
+
+	return 0;
+}
+
+/*
+ * List of supported applications. One client device will be created per entry,
+ * assuming the app has already been loaded (usually by firmware bootloaders)
+ * and its ID can be queried successfully.
+ */
+static const struct qseecom_app_desc qcom_qseecom_apps[] = {};
+
+static int qcom_qseecom_probe(struct platform_device *qseecom_dev)
+{
+	int ret;
+	int i;
+
+	/* Set up client devices for each base application */
+	for (i = 0; i < ARRAY_SIZE(qcom_qseecom_apps); i++) {
+		ret = qseecom_client_register(qseecom_dev, &qcom_qseecom_apps[i]);
+		if (ret)
+			return ret;
+	}
+
+	return 0;
+}
+
+static struct platform_driver qcom_qseecom_driver = {
+	.driver = {
+		.name	= "qcom_qseecom",
+	},
+	.probe = qcom_qseecom_probe,
+};
+
+static int __init qcom_qseecom_init(void)
+{
+	return platform_driver_register(&qcom_qseecom_driver);
+}
+subsys_initcall(qcom_qseecom_init);
+
+MODULE_AUTHOR("Maximilian Luz <luzmaximilian@gmail.com>");
+MODULE_DESCRIPTION("Driver for the Qualcomm SEE (QSEECOM) interface");
+MODULE_LICENSE("GPL");
diff --git a/drivers/firmware/qcom_scm.c b/drivers/firmware/qcom_scm.c
index 06fe8aca870d..f9d5c31b8ec7 100644
--- a/drivers/firmware/qcom_scm.c
+++ b/drivers/firmware/qcom_scm.c
@@ -55,6 +55,53 @@ struct qcom_scm_mem_map_info {
 	__le64 mem_size;
 };
 
+/**
+ * struct qcom_scm_qseecom_resp - QSEECOM SCM call response.
+ * @result:    Result or status of the SCM call. See &enum qcom_scm_qseecom_result.
+ * @resp_type: Type of the response. See &enum qcom_scm_qseecom_resp_type.
+ * @data:      Response data. The type of this data is given in @resp_type.
+ */
+struct qcom_scm_qseecom_resp {
+	u64 result;
+	u64 resp_type;
+	u64 data;
+};
+
+enum qcom_scm_qseecom_result {
+	QSEECOM_RESULT_SUCCESS			= 0,
+	QSEECOM_RESULT_INCOMPLETE		= 1,
+	QSEECOM_RESULT_BLOCKED_ON_LISTENER	= 2,
+	QSEECOM_RESULT_FAILURE			= 0xFFFFFFFF,
+};
+
+enum qcom_scm_qseecom_resp_type {
+	QSEECOM_SCM_RES_APP_ID			= 0xEE01,
+	QSEECOM_SCM_RES_QSEOS_LISTENER_ID	= 0xEE02,
+};
+
+enum qcom_scm_qseecom_tz_owner {
+	QSEECOM_TZ_OWNER_SIP			= 2,
+	QSEECOM_TZ_OWNER_TZ_APPS		= 48,
+	QSEECOM_TZ_OWNER_QSEE_OS		= 50
+};
+
+enum qcom_scm_qseecom_tz_svc {
+	QSEECOM_TZ_SVC_APP_ID_PLACEHOLDER	= 0,
+	QSEECOM_TZ_SVC_APP_MGR			= 1,
+	QSEECOM_TZ_SVC_INFO			= 6,
+};
+
+enum qcom_scm_qseecom_tz_cmd_app {
+	QSEECOM_TZ_CMD_APP_SEND			= 1,
+	QSEECOM_TZ_CMD_APP_LOOKUP		= 3,
+};
+
+enum qcom_scm_qseecom_tz_cmd_info {
+	QSEECOM_TZ_CMD_INFO_VERSION		= 3,
+};
+
+#define QSEECOM_MAX_APP_NAME_SIZE		64
+
 /* Each bit configures cold/warm boot address for one of the 4 CPUs */
 static const u8 qcom_scm_cpu_cold_bits[QCOM_SCM_BOOT_MAX_CPUS] = {
 	0, BIT(0), BIT(3), BIT(5)
@@ -1321,6 +1368,340 @@ static int qcom_scm_find_dload_address(struct device *dev, u64 *addr)
 	return 0;
 }
 
+#ifdef CONFIG_QCOM_QSEECOM
+
+/* Lock for QSEECOM SCM call executions */
+static DEFINE_MUTEX(qcom_scm_qseecom_call_lock);
+
+static int __qcom_scm_qseecom_call(const struct qcom_scm_desc *desc,
+				   struct qcom_scm_qseecom_resp *res)
+{
+	struct qcom_scm_res scm_res = {};
+	int status;
+
+	/*
+	 * QSEECOM SCM calls should not be executed concurrently. Therefore, we
+	 * require the respective call lock to be held.
+	 */
+	lockdep_assert_held(&qcom_scm_qseecom_call_lock);
+
+	status = qcom_scm_call(__scm->dev, desc, &scm_res);
+
+	res->result = scm_res.result[0];
+	res->resp_type = scm_res.result[1];
+	res->data = scm_res.result[2];
+
+	if (status)
+		return status;
+
+	return 0;
+}
+
+/**
+ * qcom_scm_qseecom_call() - Perform a QSEECOM SCM call.
+ * @desc: SCM call descriptor.
+ * @res:  SCM call response (output).
+ *
+ * Performs the QSEECOM SCM call described by @desc, returning the response in
+ * @rsp.
+ *
+ * Return: Zero on success, nonzero on failure.
+ */
+static int qcom_scm_qseecom_call(const struct qcom_scm_desc *desc,
+				 struct qcom_scm_qseecom_resp *res)
+{
+	int status;
+
+	/*
+	 * Note: Multiple QSEECOM SCM calls should not be executed same time,
+	 * so lock things here. This needs to be extended to callback/listener
+	 * handling when support for that is implemented.
+	 */
+
+	mutex_lock(&qcom_scm_qseecom_call_lock);
+	status = __qcom_scm_qseecom_call(desc, res);
+	mutex_unlock(&qcom_scm_qseecom_call_lock);
+
+	dev_dbg(__scm->dev, "%s: owner=%x, svc=%x, cmd=%x, result=%lld, type=%llx, data=%llx\n",
+		__func__, desc->owner, desc->svc, desc->cmd, res->result,
+		res->resp_type, res->data);
+
+	if (status) {
+		dev_err(__scm->dev, "qseecom: scm call failed with error %d\n", status);
+		return status;
+	}
+
+	/*
+	 * TODO: Handle incomplete and blocked calls:
+	 *
+	 * Incomplete and blocked calls are not supported yet. Some devices
+	 * and/or commands require those, some don't. Let's warn about them
+	 * prominently in case someone attempts to try these commands with a
+	 * device/command combination that isn't supported yet.
+	 */
+	WARN_ON(res->result == QSEECOM_RESULT_INCOMPLETE);
+	WARN_ON(res->result == QSEECOM_RESULT_BLOCKED_ON_LISTENER);
+
+	return 0;
+}
+
+/**
+ * qcom_scm_qseecom_get_version() - Query the QSEECOM version.
+ * @version: Pointer where the QSEECOM version will be stored.
+ *
+ * Performs the QSEECOM SCM querying the QSEECOM version currently running in
+ * the TrustZone.
+ *
+ * Return: Zero on success, nonzero on failure.
+ */
+static int qcom_scm_qseecom_get_version(u32 *version)
+{
+	struct qcom_scm_desc desc = {};
+	struct qcom_scm_qseecom_resp res = {};
+	u32 feature = 10;
+	int ret;
+
+	desc.owner = QSEECOM_TZ_OWNER_SIP;
+	desc.svc = QSEECOM_TZ_SVC_INFO;
+	desc.cmd = QSEECOM_TZ_CMD_INFO_VERSION;
+	desc.arginfo = QCOM_SCM_ARGS(1, QCOM_SCM_VAL);
+	desc.args[0] = feature;
+
+	ret = qcom_scm_qseecom_call(&desc, &res);
+	if (ret)
+		return ret;
+
+	*version = res.result;
+	return 0;
+}
+
+/**
+ * qcom_scm_qseecom_app_get_id() - Query the app ID for a given QSEE app name.
+ * @app_name: The name of the app.
+ * @app_id:   The returned app ID.
+ *
+ * Query and return the application ID of the SEE app identified by the given
+ * name. This returned ID is the unique identifier of the app required for
+ * subsequent communication.
+ *
+ * Return: Zero on success, nonzero on failure, -ENOENT if the app has not been
+ * loaded or could not be found.
+ */
+int qcom_scm_qseecom_app_get_id(const char *app_name, u32 *app_id)
+{
+	unsigned long name_buf_size = QSEECOM_MAX_APP_NAME_SIZE;
+	unsigned long app_name_len = strlen(app_name);
+	struct qcom_scm_desc desc = {};
+	struct qcom_scm_qseecom_resp res = {};
+	dma_addr_t name_buf_phys;
+	char *name_buf;
+	int status;
+
+	if (app_name_len >= name_buf_size)
+		return -EINVAL;
+
+	name_buf = kzalloc(name_buf_size, GFP_KERNEL);
+	if (!name_buf)
+		return -ENOMEM;
+
+	memcpy(name_buf, app_name, app_name_len);
+
+	name_buf_phys = dma_map_single(__scm->dev, name_buf, name_buf_size, DMA_TO_DEVICE);
+	status = dma_mapping_error(__scm->dev, name_buf_phys);
+	if (status) {
+		kfree(name_buf);
+		dev_err(__scm->dev, "qseecom: failed to map dma address\n");
+		return status;
+	}
+
+	desc.owner = QSEECOM_TZ_OWNER_QSEE_OS;
+	desc.svc = QSEECOM_TZ_SVC_APP_MGR;
+	desc.cmd = QSEECOM_TZ_CMD_APP_LOOKUP;
+	desc.arginfo = QCOM_SCM_ARGS(2, QCOM_SCM_RW, QCOM_SCM_VAL);
+	desc.args[0] = name_buf_phys;
+	desc.args[1] = app_name_len;
+
+	status = qcom_scm_qseecom_call(&desc, &res);
+	dma_unmap_single(__scm->dev, name_buf_phys, name_buf_size, DMA_TO_DEVICE);
+	kfree(name_buf);
+
+	if (status)
+		return status;
+
+	if (res.result == QSEECOM_RESULT_FAILURE)
+		return -ENOENT;
+
+	if (res.result != QSEECOM_RESULT_SUCCESS)
+		return -EINVAL;
+
+	if (res.resp_type != QSEECOM_SCM_RES_APP_ID)
+		return -EINVAL;
+
+	*app_id = res.data;
+	return 0;
+}
+EXPORT_SYMBOL_GPL(qcom_scm_qseecom_app_get_id);
+
+/**
+ * qcom_scm_qseecom_app_send() - Send to and receive data from a given QSEE app.
+ * @app_id:   The ID of the target app.
+ * @req:      Request buffer sent to the app (must be DMA-mappable).
+ * @req_size: Size of the request buffer.
+ * @rsp:      Response buffer, written to by the app (must be DMA-mappable).
+ * @rsp_size: Size of the response buffer.
+ *
+ * Sends a request to the QSEE app associated with the given ID and read back
+ * its response. The caller must provide two DMA memory regions, one for the
+ * request and one for the response, and fill out the @req region with the
+ * respective (app-specific) request data. The QSEE app reads this and returns
+ * its response in the @rsp region.
+ *
+ * Return: Zero on success, nonzero on failure.
+ */
+int qcom_scm_qseecom_app_send(u32 app_id, void *req, size_t req_size, void *rsp,
+			      size_t rsp_size)
+{
+	struct qcom_scm_qseecom_resp res = {};
+	struct qcom_scm_desc desc = {};
+	dma_addr_t req_phys;
+	dma_addr_t rsp_phys;
+	int status;
+
+	/* Map request buffer */
+	req_phys = dma_map_single(__scm->dev, req, req_size, DMA_TO_DEVICE);
+	status = dma_mapping_error(__scm->dev, req_phys);
+	if (status) {
+		dev_err(__scm->dev, "qseecom: failed to map request buffer\n");
+		return status;
+	}
+
+	/* Map response buffer */
+	rsp_phys = dma_map_single(__scm->dev, rsp, rsp_size, DMA_FROM_DEVICE);
+	status = dma_mapping_error(__scm->dev, rsp_phys);
+	if (status) {
+		dma_unmap_single(__scm->dev, req_phys, req_size, DMA_TO_DEVICE);
+		dev_err(__scm->dev, "qseecom: failed to map response buffer\n");
+		return status;
+	}
+
+	/* Set up SCM call data */
+	desc.owner = QSEECOM_TZ_OWNER_TZ_APPS;
+	desc.svc = QSEECOM_TZ_SVC_APP_ID_PLACEHOLDER;
+	desc.cmd = QSEECOM_TZ_CMD_APP_SEND;
+	desc.arginfo = QCOM_SCM_ARGS(5, QCOM_SCM_VAL,
+				     QCOM_SCM_RW, QCOM_SCM_VAL,
+				     QCOM_SCM_RW, QCOM_SCM_VAL);
+	desc.args[0] = app_id;
+	desc.args[1] = req_phys;
+	desc.args[2] = req_size;
+	desc.args[3] = rsp_phys;
+	desc.args[4] = rsp_size;
+
+	/* Perform call */
+	status = qcom_scm_qseecom_call(&desc, &res);
+
+	/* Unmap buffers */
+	dma_unmap_single(__scm->dev, rsp_phys, rsp_size, DMA_FROM_DEVICE);
+	dma_unmap_single(__scm->dev, req_phys, req_size, DMA_TO_DEVICE);
+
+	if (status)
+		return status;
+
+	if (res.result != QSEECOM_RESULT_SUCCESS)
+		return -EIO;
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(qcom_scm_qseecom_app_send);
+
+/*
+ * We do not yet support re-entrant calls via the qseecom interface. To prevent
+ + any potential issues with this, only allow validated machines for now.
+ */
+static const struct of_device_id qcom_scm_qseecom_allowlist[] = {
+	{ .compatible = "lenovo,thinkpad-x13s", },
+	{ }
+};
+
+static bool qcom_scm_qseecom_machine_is_allowed(void)
+{
+	struct device_node *np;
+	bool match;
+
+	np = of_find_node_by_path("/");
+	if (!np)
+		return false;
+
+	match = of_match_node(qcom_scm_qseecom_allowlist, np);
+	of_node_put(np);
+
+	return match;
+}
+
+static void qcom_scm_qseecom_free(void *data)
+{
+	struct platform_device *qseecom_dev = data;
+
+	platform_device_del(qseecom_dev);
+	platform_device_put(qseecom_dev);
+}
+
+static int qcom_scm_qseecom_init(struct qcom_scm *scm)
+{
+	struct platform_device *qseecom_dev;
+	u32 version;
+	int ret;
+
+	/*
+	 * Note: We do two steps of validation here: First, we try to query the
+	 * QSEECOM version as a check to see if the interface exists on this
+	 * device. Second, we check against known good devices due to current
+	 * driver limitations (see comment in qcom_scm_qseecom_allowlist).
+	 *
+	 * Note that we deliberately do the machine check after the version
+	 * check so that we can log potentially supported devices. This should
+	 * be safe as downstream sources indicate that the version query is
+	 * neither blocking nor reentrant.
+	 */
+	ret = qcom_scm_qseecom_get_version(&version);
+	if (ret)
+		return 0;
+
+	dev_info(scm->dev, "qseecom: found qseecom with version 0x%x\n", version);
+
+	if (!qcom_scm_qseecom_machine_is_allowed()) {
+		dev_info(scm->dev, "qseecom: untested machine, skipping\n");
+		return 0;
+	}
+
+	/*
+	 * Set up QSEECOM interface device. All application clients will be
+	 * set up and managed by the corresponding driver for it.
+	 */
+	qseecom_dev = platform_device_alloc("qcom_qseecom", -1);
+	if (!qseecom_dev)
+		return -ENOMEM;
+
+	qseecom_dev->dev.parent = scm->dev;
+
+	ret = platform_device_add(qseecom_dev);
+	if (ret) {
+		platform_device_put(qseecom_dev);
+		return ret;
+	}
+
+	return devm_add_action_or_reset(scm->dev, qcom_scm_qseecom_free, qseecom_dev);
+}
+
+#else /* CONFIG_QCOM_QSEECOM */
+
+static int qcom_scm_qseecom_init(struct qcom_scm *scm)
+{
+	return 0;
+}
+
+#endif /* CONFIG_QCOM_QSEECOM */
+
 /**
  * qcom_scm_is_available() - Checks if SCM is available
  */
@@ -1468,6 +1849,19 @@ static int qcom_scm_probe(struct platform_device *pdev)
 	if (download_mode)
 		qcom_scm_set_download_mode(true);
 
+	/*
+	 * Initialize the QSEECOM interface.
+	 *
+	 * Note: QSEECOM is fairly self-contained and this only adds the
+	 * interface device (the driver of which does most of the heavy
+	 * lifting). So any errors returned here should be either -ENOMEM or
+	 * -EINVAL (with the latter only in case there's a bug in our code).
+	 * This means that there is no need to bring down the whole SCM driver.
+	 * Just log the error instead and let SCM live.
+	 */
+	ret = qcom_scm_qseecom_init(scm);
+	WARN(ret < 0, "failed to initialize qseecom: %d\n", ret);
+
 	return 0;
 }
 
diff --git a/include/linux/firmware/qcom/qcom_qseecom.h b/include/linux/firmware/qcom/qcom_qseecom.h
new file mode 100644
index 000000000000..b531547e1dc9
--- /dev/null
+++ b/include/linux/firmware/qcom/qcom_qseecom.h
@@ -0,0 +1,46 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * Driver for Qualcomm Secure Execution Environment (SEE) interface (QSEECOM).
+ * Responsible for setting up and managing QSEECOM client devices.
+ *
+ * Copyright (C) 2023 Maximilian Luz <luzmaximilian@gmail.com>
+ */
+#include <linux/auxiliary_bus.h>
+#include <linux/types.h>
+
+#include <linux/firmware/qcom/qcom_scm.h>
+
+/**
+ * struct qseecom_client - QSEECOM client device.
+ * @aux_dev: Underlying auxiliary device.
+ * @app_id: ID of the loaded application.
+ */
+struct qseecom_client {
+	struct auxiliary_device aux_dev;
+	u32 app_id;
+};
+
+/**
+ * qcom_qseecom_app_send() - Send to and receive data from a given QSEE app.
+ * @client:   The QSEECOM client associated with the target app.
+ * @req:      Request buffer sent to the app (must be DMA-mappable).
+ * @req_size: Size of the request buffer.
+ * @rsp:      Response buffer, written to by the app (must be DMA-mappable).
+ * @rsp_size: Size of the response buffer.
+ *
+ * Sends a request to the QSEE app associated with the given client and read
+ * back its response. The caller must provide two DMA memory regions, one for
+ * the request and one for the response, and fill out the @req region with the
+ * respective (app-specific) request data. The QSEE app reads this and returns
+ * its response in the @rsp region.
+ *
+ * Note: This is a convenience wrapper around qcom_scm_qseecom_app_send().
+ * Clients should prefer to use this wrapper.
+ *
+ * Return: Zero on success, nonzero on failure.
+ */
+static inline int qcom_qseecom_app_send(struct qseecom_client *client, void *req, size_t req_size,
+					void *rsp, size_t rsp_size)
+{
+	return qcom_scm_qseecom_app_send(client->app_id, req, req_size, rsp, rsp_size);
+}
diff --git a/include/linux/firmware/qcom/qcom_scm.h b/include/linux/firmware/qcom/qcom_scm.h
index 0c091a3f6d49..e1ea2eb56d04 100644
--- a/include/linux/firmware/qcom/qcom_scm.h
+++ b/include/linux/firmware/qcom/qcom_scm.h
@@ -122,4 +122,26 @@ extern int qcom_scm_lmh_dcvsh(u32 payload_fn, u32 payload_reg, u32 payload_val,
 extern int qcom_scm_lmh_profile_change(u32 profile_id);
 extern bool qcom_scm_lmh_dcvsh_available(void);
 
+#ifdef CONFIG_QCOM_QSEECOM
+
+int qcom_scm_qseecom_app_get_id(const char *app_name, u32 *app_id);
+int qcom_scm_qseecom_app_send(u32 app_id, void *req, size_t req_size, void *rsp,
+			      size_t rsp_size);
+
+#else /* CONFIG_QCOM_QSEECOM */
+
+static inline int qcom_scm_qseecom_app_get_id(const char *app_name, u32 *app_id)
+{
+	return -EINVAL;
+}
+
+static inline int qcom_scm_qseecom_app_send(u32 app_id, void *req,
+					    size_t req_size, void *rsp,
+					    size_t rsp_size)
+{
+	return -EINVAL;
+}
+
+#endif /* CONFIG_QCOM_QSEECOM */
+
 #endif
-- 
cgit v1.2.3


From b93c5fe16e4aa177cd072c1c4652cbe1b19a7812 Mon Sep 17 00:00:00 2001
From: Yue Haibing <yuehaibing@huawei.com>
Date: Thu, 3 Aug 2023 21:49:19 +0800
Subject: rcu: Remove unused function declaration rcu_eqs_special_set()

Commit a86baa69c2b7 ("rcu: Remove special bit at the bottom of the ->dynticks counter")
left behind this, remove it.

Signed-off-by: Yue Haibing <yuehaibing@huawei.com>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
Signed-off-by: Frederic Weisbecker <frederic@kernel.org>
---
 include/linux/rcutree.h | 1 -
 1 file changed, 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/rcutree.h b/include/linux/rcutree.h
index 126f6b418f6a..153cfc7bbffd 100644
--- a/include/linux/rcutree.h
+++ b/include/linux/rcutree.h
@@ -37,7 +37,6 @@ void synchronize_rcu_expedited(void);
 void kvfree_call_rcu(struct rcu_head *head, void *ptr);
 
 void rcu_barrier(void);
-bool rcu_eqs_special_set(int cpu);
 void rcu_momentary_dyntick_idle(void);
 void kfree_rcu_scheduler_running(void);
 bool rcu_gp_might_be_stalled(void);
-- 
cgit v1.2.3


From 6e284c55fc0bef7d25fd34d29db11f483da60ea4 Mon Sep 17 00:00:00 2001
From: Zhen Lei <thunder.leizhen@huawei.com>
Date: Sat, 5 Aug 2023 11:17:25 +0800
Subject: mm: Remove kmem_valid_obj()

Function kmem_dump_obj() will splat if passed a pointer to a non-slab
object. So nothing calls it directly, instead calling kmem_valid_obj()
first to determine whether the passed pointer to a valid slab object. This
means that merging kmem_valid_obj() into kmem_dump_obj() will make the
code more concise. Therefore, convert kmem_dump_obj() to work the same
way as vmalloc_dump_obj(), removing the need for the kmem_dump_obj()
caller to check kmem_valid_obj().  After this, there are no remaining
calls to kmem_valid_obj() anymore, and it can be safely removed.

Suggested-by: Matthew Wilcox <willy@infradead.org>
Signed-off-by: Zhen Lei <thunder.leizhen@huawei.com>
Reviewed-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Acked-by: Vlastimil Babka <vbabka@suse.cz>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
Signed-off-by: Frederic Weisbecker <frederic@kernel.org>
---
 include/linux/slab.h |  5 +++--
 mm/slab_common.c     | 41 +++++++++++------------------------------
 mm/util.c            |  4 +---
 3 files changed, 15 insertions(+), 35 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/slab.h b/include/linux/slab.h
index 8228d1276a2f..ff56ab804bf6 100644
--- a/include/linux/slab.h
+++ b/include/linux/slab.h
@@ -245,8 +245,9 @@ DEFINE_FREE(kfree, void *, if (_T) kfree(_T))
 size_t ksize(const void *objp);
 
 #ifdef CONFIG_PRINTK
-bool kmem_valid_obj(void *object);
-void kmem_dump_obj(void *object);
+bool kmem_dump_obj(void *object);
+#else
+static inline bool kmem_dump_obj(void *object) { return false; }
 #endif
 
 /*
diff --git a/mm/slab_common.c b/mm/slab_common.c
index cd71f9581e67..a425bedf2103 100644
--- a/mm/slab_common.c
+++ b/mm/slab_common.c
@@ -528,26 +528,6 @@ bool slab_is_available(void)
 }
 
 #ifdef CONFIG_PRINTK
-/**
- * kmem_valid_obj - does the pointer reference a valid slab object?
- * @object: pointer to query.
- *
- * Return: %true if the pointer is to a not-yet-freed object from
- * kmalloc() or kmem_cache_alloc(), either %true or %false if the pointer
- * is to an already-freed object, and %false otherwise.
- */
-bool kmem_valid_obj(void *object)
-{
-	struct folio *folio;
-
-	/* Some arches consider ZERO_SIZE_PTR to be a valid address. */
-	if (object < (void *)PAGE_SIZE || !virt_addr_valid(object))
-		return false;
-	folio = virt_to_folio(object);
-	return folio_test_slab(folio);
-}
-EXPORT_SYMBOL_GPL(kmem_valid_obj);
-
 static void kmem_obj_info(struct kmem_obj_info *kpp, void *object, struct slab *slab)
 {
 	if (__kfence_obj_info(kpp, object, slab))
@@ -566,11 +546,11 @@ static void kmem_obj_info(struct kmem_obj_info *kpp, void *object, struct slab *
  * and, if available, the slab name, return address, and stack trace from
  * the allocation and last free path of that object.
  *
- * This function will splat if passed a pointer to a non-slab object.
- * If you are not sure what type of object you have, you should instead
- * use mem_dump_obj().
+ * Return: %true if the pointer is to a not-yet-freed object from
+ * kmalloc() or kmem_cache_alloc(), either %true or %false if the pointer
+ * is to an already-freed object, and %false otherwise.
  */
-void kmem_dump_obj(void *object)
+bool kmem_dump_obj(void *object)
 {
 	char *cp = IS_ENABLED(CONFIG_MMU) ? "" : "/vmalloc";
 	int i;
@@ -578,13 +558,13 @@ void kmem_dump_obj(void *object)
 	unsigned long ptroffset;
 	struct kmem_obj_info kp = { };
 
-	if (WARN_ON_ONCE(!virt_addr_valid(object)))
-		return;
+	/* Some arches consider ZERO_SIZE_PTR to be a valid address. */
+	if (object < (void *)PAGE_SIZE || !virt_addr_valid(object))
+		return false;
 	slab = virt_to_slab(object);
-	if (WARN_ON_ONCE(!slab)) {
-		pr_cont(" non-slab memory.\n");
-		return;
-	}
+	if (!slab)
+		return false;
+
 	kmem_obj_info(&kp, object, slab);
 	if (kp.kp_slab_cache)
 		pr_cont(" slab%s %s", cp, kp.kp_slab_cache->name);
@@ -621,6 +601,7 @@ void kmem_dump_obj(void *object)
 		pr_info("    %pS\n", kp.kp_free_stack[i]);
 	}
 
+	return true;
 }
 EXPORT_SYMBOL_GPL(kmem_dump_obj);
 #endif
diff --git a/mm/util.c b/mm/util.c
index 8cbbfd3a3d59..6eddd891198e 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -1060,10 +1060,8 @@ void mem_dump_obj(void *object)
 {
 	const char *type;
 
-	if (kmem_valid_obj(object)) {
-		kmem_dump_obj(object);
+	if (kmem_dump_obj(object))
 		return;
-	}
 
 	if (vmalloc_dump_obj(object))
 		return;
-- 
cgit v1.2.3


From 25cc71d1527b55c880d333e7cc1dc37aeef9843f Mon Sep 17 00:00:00 2001
From: Khadija Kamran <kamrankhadijadj@gmail.com>
Date: Wed, 23 Aug 2023 11:44:41 +0500
Subject: lsm: constify 'sb' parameter in security_quotactl()

SELinux registers the implementation for the "quotactl" hook. Looking at
the function implementation we observe that the parameter "sb" is not
changing.

Mark the "sb" parameter of LSM hook security_quotactl() as "const" since
it will not be changing in the LSM hook.

Signed-off-by: Khadija Kamran <kamrankhadijadj@gmail.com>
Signed-off-by: Paul Moore <paul@paul-moore.com>
---
 include/linux/lsm_hook_defs.h | 2 +-
 include/linux/security.h      | 4 ++--
 security/security.c           | 2 +-
 security/selinux/hooks.c      | 4 ++--
 4 files changed, 6 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/lsm_hook_defs.h b/include/linux/lsm_hook_defs.h
index ac962c4cb44b..b464f9c1894f 100644
--- a/include/linux/lsm_hook_defs.h
+++ b/include/linux/lsm_hook_defs.h
@@ -43,7 +43,7 @@ LSM_HOOK(int, 0, capset, struct cred *new, const struct cred *old,
 	 const kernel_cap_t *permitted)
 LSM_HOOK(int, 0, capable, const struct cred *cred, struct user_namespace *ns,
 	 int cap, unsigned int opts)
-LSM_HOOK(int, 0, quotactl, int cmds, int type, int id, struct super_block *sb)
+LSM_HOOK(int, 0, quotactl, int cmds, int type, int id, const struct super_block *sb)
 LSM_HOOK(int, 0, quota_on, struct dentry *dentry)
 LSM_HOOK(int, 0, syslog, int type)
 LSM_HOOK(int, 0, settime, const struct timespec64 *ts,
diff --git a/include/linux/security.h b/include/linux/security.h
index 5f16eecde00b..1a02e67e682f 100644
--- a/include/linux/security.h
+++ b/include/linux/security.h
@@ -284,7 +284,7 @@ int security_capable(const struct cred *cred,
 		       struct user_namespace *ns,
 		       int cap,
 		       unsigned int opts);
-int security_quotactl(int cmds, int type, int id, struct super_block *sb);
+int security_quotactl(int cmds, int type, int id, const struct super_block *sb);
 int security_quota_on(struct dentry *dentry);
 int security_syslog(int type);
 int security_settime64(const struct timespec64 *ts, const struct timezone *tz);
@@ -581,7 +581,7 @@ static inline int security_capable(const struct cred *cred,
 }
 
 static inline int security_quotactl(int cmds, int type, int id,
-				     struct super_block *sb)
+				     const struct super_block *sb)
 {
 	return 0;
 }
diff --git a/security/security.c b/security/security.c
index 23b129d482a7..b944b19e4512 100644
--- a/security/security.c
+++ b/security/security.c
@@ -957,7 +957,7 @@ int security_capable(const struct cred *cred,
  *
  * Return: Returns 0 if permission is granted.
  */
-int security_quotactl(int cmds, int type, int id, struct super_block *sb)
+int security_quotactl(int cmds, int type, int id, const struct super_block *sb)
 {
 	return call_int_hook(quotactl, 0, cmds, type, id, sb);
 }
diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c
index 10350534de6d..e1c7640a5df1 100644
--- a/security/selinux/hooks.c
+++ b/security/selinux/hooks.c
@@ -1937,7 +1937,7 @@ static inline int may_rename(struct inode *old_dir,
 
 /* Check whether a task can perform a filesystem operation. */
 static int superblock_has_perm(const struct cred *cred,
-			       struct super_block *sb,
+			       const struct super_block *sb,
 			       u32 perms,
 			       struct common_audit_data *ad)
 {
@@ -2139,7 +2139,7 @@ static int selinux_capable(const struct cred *cred, struct user_namespace *ns,
 	return cred_has_capability(cred, cap, opts, ns == &init_user_ns);
 }
 
-static int selinux_quotactl(int cmds, int type, int id, struct super_block *sb)
+static int selinux_quotactl(int cmds, int type, int id, const struct super_block *sb)
 {
 	const struct cred *cred = current_cred();
 	int rc = 0;
-- 
cgit v1.2.3


From 4a00c673068e72c12d243f5c31000246d6984e44 Mon Sep 17 00:00:00 2001
From: Khadija Kamran <kamrankhadijadj@gmail.com>
Date: Wed, 23 Aug 2023 12:17:29 +0500
Subject: lsm: constify 'file' parameter in security_bprm_creds_from_file()

The 'bprm_creds_from_file' hook has implementation registered in
commoncap. Looking at the function implementation we observe that the
'file' parameter is not changing.

Mark the 'file' parameter of LSM hook security_bprm_creds_from_file() as
'const' since it will not be changing in the LSM hook.

Signed-off-by: Khadija Kamran <kamrankhadijadj@gmail.com>
Signed-off-by: Paul Moore <paul@paul-moore.com>
---
 include/linux/fs.h            | 2 +-
 include/linux/lsm_hook_defs.h | 2 +-
 include/linux/security.h      | 6 +++---
 security/commoncap.c          | 4 ++--
 security/security.c           | 2 +-
 5 files changed, 8 insertions(+), 8 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/fs.h b/include/linux/fs.h
index 4aeb3fa11927..f69d085e531f 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -2450,7 +2450,7 @@ struct filename {
 };
 static_assert(offsetof(struct filename, iname) % sizeof(long) == 0);
 
-static inline struct mnt_idmap *file_mnt_idmap(struct file *file)
+static inline struct mnt_idmap *file_mnt_idmap(const struct file *file)
 {
 	return mnt_idmap(file->f_path.mnt);
 }
diff --git a/include/linux/lsm_hook_defs.h b/include/linux/lsm_hook_defs.h
index b464f9c1894f..5dfe67d69aba 100644
--- a/include/linux/lsm_hook_defs.h
+++ b/include/linux/lsm_hook_defs.h
@@ -50,7 +50,7 @@ LSM_HOOK(int, 0, settime, const struct timespec64 *ts,
 	 const struct timezone *tz)
 LSM_HOOK(int, 0, vm_enough_memory, struct mm_struct *mm, long pages)
 LSM_HOOK(int, 0, bprm_creds_for_exec, struct linux_binprm *bprm)
-LSM_HOOK(int, 0, bprm_creds_from_file, struct linux_binprm *bprm, struct file *file)
+LSM_HOOK(int, 0, bprm_creds_from_file, struct linux_binprm *bprm, const struct file *file)
 LSM_HOOK(int, 0, bprm_check_security, struct linux_binprm *bprm)
 LSM_HOOK(void, LSM_RET_VOID, bprm_committing_creds, struct linux_binprm *bprm)
 LSM_HOOK(void, LSM_RET_VOID, bprm_committed_creds, struct linux_binprm *bprm)
diff --git a/include/linux/security.h b/include/linux/security.h
index 1a02e67e682f..edbea3e0a13f 100644
--- a/include/linux/security.h
+++ b/include/linux/security.h
@@ -151,7 +151,7 @@ extern int cap_capset(struct cred *new, const struct cred *old,
 		      const kernel_cap_t *effective,
 		      const kernel_cap_t *inheritable,
 		      const kernel_cap_t *permitted);
-extern int cap_bprm_creds_from_file(struct linux_binprm *bprm, struct file *file);
+extern int cap_bprm_creds_from_file(struct linux_binprm *bprm, const struct file *file);
 int cap_inode_setxattr(struct dentry *dentry, const char *name,
 		       const void *value, size_t size, int flags);
 int cap_inode_removexattr(struct mnt_idmap *idmap,
@@ -290,7 +290,7 @@ int security_syslog(int type);
 int security_settime64(const struct timespec64 *ts, const struct timezone *tz);
 int security_vm_enough_memory_mm(struct mm_struct *mm, long pages);
 int security_bprm_creds_for_exec(struct linux_binprm *bprm);
-int security_bprm_creds_from_file(struct linux_binprm *bprm, struct file *file);
+int security_bprm_creds_from_file(struct linux_binprm *bprm, const struct file *file);
 int security_bprm_check(struct linux_binprm *bprm);
 void security_bprm_committing_creds(struct linux_binprm *bprm);
 void security_bprm_committed_creds(struct linux_binprm *bprm);
@@ -613,7 +613,7 @@ static inline int security_bprm_creds_for_exec(struct linux_binprm *bprm)
 }
 
 static inline int security_bprm_creds_from_file(struct linux_binprm *bprm,
-						struct file *file)
+						const struct file *file)
 {
 	return cap_bprm_creds_from_file(bprm, file);
 }
diff --git a/security/commoncap.c b/security/commoncap.c
index bc0521104197..8e8c630ce204 100644
--- a/security/commoncap.c
+++ b/security/commoncap.c
@@ -720,7 +720,7 @@ int get_vfs_caps_from_disk(struct mnt_idmap *idmap,
  * its xattrs and, if present, apply them to the proposed credentials being
  * constructed by execve().
  */
-static int get_file_caps(struct linux_binprm *bprm, struct file *file,
+static int get_file_caps(struct linux_binprm *bprm, const struct file *file,
 			 bool *effective, bool *has_fcap)
 {
 	int rc = 0;
@@ -882,7 +882,7 @@ static inline bool nonroot_raised_pE(struct cred *new, const struct cred *old,
  *
  * Return: 0 if successful, -ve on error.
  */
-int cap_bprm_creds_from_file(struct linux_binprm *bprm, struct file *file)
+int cap_bprm_creds_from_file(struct linux_binprm *bprm, const struct file *file)
 {
 	/* Process setpcap binaries and capabilities for uid 0 */
 	const struct cred *old = current_cred();
diff --git a/security/security.c b/security/security.c
index b944b19e4512..bde8813e89ff 100644
--- a/security/security.c
+++ b/security/security.c
@@ -1079,7 +1079,7 @@ int security_bprm_creds_for_exec(struct linux_binprm *bprm)
  *
  * Return: Returns 0 if the hook is successful and permission is granted.
  */
-int security_bprm_creds_from_file(struct linux_binprm *bprm, struct file *file)
+int security_bprm_creds_from_file(struct linux_binprm *bprm, const struct file *file)
 {
 	return call_int_hook(bprm_creds_from_file, 0, bprm, file);
 }
-- 
cgit v1.2.3


From 64fc9526147c7fc14535134d8ea79b9c8dc549a7 Mon Sep 17 00:00:00 2001
From: Khadija Kamran <kamrankhadijadj@gmail.com>
Date: Wed, 23 Aug 2023 12:47:56 +0500
Subject: lsm: constify 'bprm' parameter in security_bprm_committing_creds()

The 'bprm_committing_creds' hook has implementations registered in
SELinux and Apparmor. Looking at the function implementations we observe
that the 'bprm' parameter is not changing.

Mark the 'bprm' parameter of LSM hook security_bprm_committing_creds()
as 'const' since it will not be changing in the LSM hook.

Signed-off-by: Khadija Kamran <kamrankhadijadj@gmail.com>
Signed-off-by: Paul Moore <paul@paul-moore.com>
---
 include/linux/lsm_hook_defs.h | 2 +-
 include/linux/security.h      | 4 ++--
 security/apparmor/lsm.c       | 2 +-
 security/security.c           | 2 +-
 security/selinux/hooks.c      | 2 +-
 5 files changed, 6 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/lsm_hook_defs.h b/include/linux/lsm_hook_defs.h
index 5dfe67d69aba..f6acc3ed66a3 100644
--- a/include/linux/lsm_hook_defs.h
+++ b/include/linux/lsm_hook_defs.h
@@ -52,7 +52,7 @@ LSM_HOOK(int, 0, vm_enough_memory, struct mm_struct *mm, long pages)
 LSM_HOOK(int, 0, bprm_creds_for_exec, struct linux_binprm *bprm)
 LSM_HOOK(int, 0, bprm_creds_from_file, struct linux_binprm *bprm, const struct file *file)
 LSM_HOOK(int, 0, bprm_check_security, struct linux_binprm *bprm)
-LSM_HOOK(void, LSM_RET_VOID, bprm_committing_creds, struct linux_binprm *bprm)
+LSM_HOOK(void, LSM_RET_VOID, bprm_committing_creds, const struct linux_binprm *bprm)
 LSM_HOOK(void, LSM_RET_VOID, bprm_committed_creds, struct linux_binprm *bprm)
 LSM_HOOK(int, 0, fs_context_submount, struct fs_context *fc, struct super_block *reference)
 LSM_HOOK(int, 0, fs_context_dup, struct fs_context *fc,
diff --git a/include/linux/security.h b/include/linux/security.h
index edbea3e0a13f..885053f81019 100644
--- a/include/linux/security.h
+++ b/include/linux/security.h
@@ -292,7 +292,7 @@ int security_vm_enough_memory_mm(struct mm_struct *mm, long pages);
 int security_bprm_creds_for_exec(struct linux_binprm *bprm);
 int security_bprm_creds_from_file(struct linux_binprm *bprm, const struct file *file);
 int security_bprm_check(struct linux_binprm *bprm);
-void security_bprm_committing_creds(struct linux_binprm *bprm);
+void security_bprm_committing_creds(const struct linux_binprm *bprm);
 void security_bprm_committed_creds(struct linux_binprm *bprm);
 int security_fs_context_submount(struct fs_context *fc, struct super_block *reference);
 int security_fs_context_dup(struct fs_context *fc, struct fs_context *src_fc);
@@ -623,7 +623,7 @@ static inline int security_bprm_check(struct linux_binprm *bprm)
 	return 0;
 }
 
-static inline void security_bprm_committing_creds(struct linux_binprm *bprm)
+static inline void security_bprm_committing_creds(const struct linux_binprm *bprm)
 {
 }
 
diff --git a/security/apparmor/lsm.c b/security/apparmor/lsm.c
index 108eccc5ada5..b03f46e0f6c5 100644
--- a/security/apparmor/lsm.c
+++ b/security/apparmor/lsm.c
@@ -734,7 +734,7 @@ fail:
  * apparmor_bprm_committing_creds - do task cleanup on committing new creds
  * @bprm: binprm for the exec  (NOT NULL)
  */
-static void apparmor_bprm_committing_creds(struct linux_binprm *bprm)
+static void apparmor_bprm_committing_creds(const struct linux_binprm *bprm)
 {
 	struct aa_label *label = aa_current_raw_label();
 	struct aa_label *new_label = cred_label(bprm->cred);
diff --git a/security/security.c b/security/security.c
index bde8813e89ff..77a1601ead36 100644
--- a/security/security.c
+++ b/security/security.c
@@ -1118,7 +1118,7 @@ int security_bprm_check(struct linux_binprm *bprm)
  * open file descriptors to which access will no longer be granted when the
  * attributes are changed.  This is called immediately before commit_creds().
  */
-void security_bprm_committing_creds(struct linux_binprm *bprm)
+void security_bprm_committing_creds(const struct linux_binprm *bprm)
 {
 	call_void_hook(bprm_committing_creds, bprm);
 }
diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c
index e1c7640a5df1..f42a1b78bc43 100644
--- a/security/selinux/hooks.c
+++ b/security/selinux/hooks.c
@@ -2455,7 +2455,7 @@ static inline void flush_unauthorized_files(const struct cred *cred,
 /*
  * Prepare a process for imminent new credential changes due to exec
  */
-static void selinux_bprm_committing_creds(struct linux_binprm *bprm)
+static void selinux_bprm_committing_creds(const struct linux_binprm *bprm)
 {
 	struct task_security_struct *new_tsec;
 	struct rlimit *rlim, *initrlim;
-- 
cgit v1.2.3


From 81b36803ac139827538ac5ce4028e750a3c53f53 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Tue, 12 Sep 2023 09:17:21 +0000
Subject: udp: introduce udp->udp_flags

According to syzbot, it is time to use proper atomic flags
for various UDP flags.

Add udp_flags field, and convert udp->corkflag to first
bit in it.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Reviewed-by: Willem de Bruijn <willemb@google.com>
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
 include/linux/udp.h | 28 +++++++++++++++++++++-------
 net/ipv4/udp.c      | 12 ++++++------
 net/ipv6/udp.c      |  6 +++---
 3 files changed, 30 insertions(+), 16 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/udp.h b/include/linux/udp.h
index 43c1fb2d2c21..23f0693e0d9c 100644
--- a/include/linux/udp.h
+++ b/include/linux/udp.h
@@ -32,14 +32,20 @@ static inline u32 udp_hashfn(const struct net *net, u32 num, u32 mask)
 	return (num + net_hash_mix(net)) & mask;
 }
 
+enum {
+	UDP_FLAGS_CORK,		/* Cork is required */
+};
+
 struct udp_sock {
 	/* inet_sock has to be the first member */
 	struct inet_sock inet;
 #define udp_port_hash		inet.sk.__sk_common.skc_u16hashes[0]
 #define udp_portaddr_hash	inet.sk.__sk_common.skc_u16hashes[1]
 #define udp_portaddr_node	inet.sk.__sk_common.skc_portaddr_node
+
+	unsigned long	 udp_flags;
+
 	int		 pending;	/* Any pending frames ? */
-	unsigned int	 corkflag;	/* Cork is required */
 	__u8		 encap_type;	/* Is this an Encapsulation socket? */
 	unsigned char	 no_check6_tx:1,/* Send zero UDP6 checksums on TX? */
 			 no_check6_rx:1,/* Allow zero UDP6 checksums on RX? */
@@ -51,6 +57,11 @@ struct udp_sock {
 			 gro_enabled:1,	/* Request GRO aggregation */
 			 accept_udp_l4:1,
 			 accept_udp_fraglist:1;
+/* indicator bits used by pcflag: */
+#define UDPLITE_BIT      0x1  		/* set by udplite proto init function */
+#define UDPLITE_SEND_CC  0x2  		/* set via udplite setsockopt         */
+#define UDPLITE_RECV_CC  0x4		/* set via udplite setsocktopt        */
+	__u8		 pcflag;        /* marks socket as UDP-Lite if > 0    */
 	/*
 	 * Following member retains the information to create a UDP header
 	 * when the socket is uncorked.
@@ -62,12 +73,6 @@ struct udp_sock {
 	 */
 	__u16		 pcslen;
 	__u16		 pcrlen;
-/* indicator bits used by pcflag: */
-#define UDPLITE_BIT      0x1  		/* set by udplite proto init function */
-#define UDPLITE_SEND_CC  0x2  		/* set via udplite setsockopt         */
-#define UDPLITE_RECV_CC  0x4		/* set via udplite setsocktopt        */
-	__u8		 pcflag;        /* marks socket as UDP-Lite if > 0    */
-	__u8		 unused[3];
 	/*
 	 * For encapsulation sockets.
 	 */
@@ -95,6 +100,15 @@ struct udp_sock {
 	int		forward_threshold;
 };
 
+#define udp_test_bit(nr, sk)			\
+	test_bit(UDP_FLAGS_##nr, &udp_sk(sk)->udp_flags)
+#define udp_set_bit(nr, sk)			\
+	set_bit(UDP_FLAGS_##nr, &udp_sk(sk)->udp_flags)
+#define udp_clear_bit(nr, sk)			\
+	clear_bit(UDP_FLAGS_##nr, &udp_sk(sk)->udp_flags)
+#define udp_assign_bit(nr, sk, val)		\
+	assign_bit(UDP_FLAGS_##nr, &udp_sk(sk)->udp_flags, val)
+
 #define UDP_MAX_SEGMENTS	(1 << 6UL)
 
 #define udp_sk(ptr) container_of_const(ptr, struct udp_sock, inet.sk)
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index f39b9c844580..9709f8a532dc 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -1051,7 +1051,7 @@ int udp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
 	u8 tos, scope;
 	__be16 dport;
 	int err, is_udplite = IS_UDPLITE(sk);
-	int corkreq = READ_ONCE(up->corkflag) || msg->msg_flags&MSG_MORE;
+	int corkreq = udp_test_bit(CORK, sk) || msg->msg_flags & MSG_MORE;
 	int (*getfrag)(void *, char *, int, int, int, struct sk_buff *);
 	struct sk_buff *skb;
 	struct ip_options_data opt_copy;
@@ -1315,11 +1315,11 @@ void udp_splice_eof(struct socket *sock)
 	struct sock *sk = sock->sk;
 	struct udp_sock *up = udp_sk(sk);
 
-	if (!up->pending || READ_ONCE(up->corkflag))
+	if (!up->pending || udp_test_bit(CORK, sk))
 		return;
 
 	lock_sock(sk);
-	if (up->pending && !READ_ONCE(up->corkflag))
+	if (up->pending && !udp_test_bit(CORK, sk))
 		udp_push_pending_frames(sk);
 	release_sock(sk);
 }
@@ -2658,9 +2658,9 @@ int udp_lib_setsockopt(struct sock *sk, int level, int optname,
 	switch (optname) {
 	case UDP_CORK:
 		if (val != 0) {
-			WRITE_ONCE(up->corkflag, 1);
+			udp_set_bit(CORK, sk);
 		} else {
-			WRITE_ONCE(up->corkflag, 0);
+			udp_clear_bit(CORK, sk);
 			lock_sock(sk);
 			push_pending_frames(sk);
 			release_sock(sk);
@@ -2783,7 +2783,7 @@ int udp_lib_getsockopt(struct sock *sk, int level, int optname,
 
 	switch (optname) {
 	case UDP_CORK:
-		val = READ_ONCE(up->corkflag);
+		val = udp_test_bit(CORK, sk);
 		break;
 
 	case UDP_ENCAP:
diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c
index 86b5d509a468..0c6973cd22ce 100644
--- a/net/ipv6/udp.c
+++ b/net/ipv6/udp.c
@@ -1332,7 +1332,7 @@ int udpv6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
 	int addr_len = msg->msg_namelen;
 	bool connected = false;
 	int ulen = len;
-	int corkreq = READ_ONCE(up->corkflag) || msg->msg_flags&MSG_MORE;
+	int corkreq = udp_test_bit(CORK, sk) || msg->msg_flags & MSG_MORE;
 	int err;
 	int is_udplite = IS_UDPLITE(sk);
 	int (*getfrag)(void *, char *, int, int, int, struct sk_buff *);
@@ -1644,11 +1644,11 @@ static void udpv6_splice_eof(struct socket *sock)
 	struct sock *sk = sock->sk;
 	struct udp_sock *up = udp_sk(sk);
 
-	if (!up->pending || READ_ONCE(up->corkflag))
+	if (!up->pending || udp_test_bit(CORK, sk))
 		return;
 
 	lock_sock(sk);
-	if (up->pending && !READ_ONCE(up->corkflag))
+	if (up->pending && !udp_test_bit(CORK, sk))
 		udp_v6_push_pending_frames(sk);
 	release_sock(sk);
 }
-- 
cgit v1.2.3


From a0002127cd746fcaa182ad3386ef6931c37f3bda Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Tue, 12 Sep 2023 09:17:22 +0000
Subject: udp: move udp->no_check6_tx to udp->udp_flags

syzbot reported that udp->no_check6_tx can be read locklessly.
Use one atomic bit from udp->udp_flags

Fixes: 1c19448c9ba6 ("net: Make enabling of zero UDP6 csums more restrictive")
Reported-by: syzbot <syzkaller@googlegroups.com>
Signed-off-by: Eric Dumazet <edumazet@google.com>
Reviewed-by: Willem de Bruijn <willemb@google.com>
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
 include/linux/udp.h | 10 +++++-----
 net/ipv4/udp.c      |  4 ++--
 net/ipv6/udp.c      |  4 ++--
 3 files changed, 9 insertions(+), 9 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/udp.h b/include/linux/udp.h
index 23f0693e0d9c..e3f2a6c7ac1d 100644
--- a/include/linux/udp.h
+++ b/include/linux/udp.h
@@ -34,6 +34,7 @@ static inline u32 udp_hashfn(const struct net *net, u32 num, u32 mask)
 
 enum {
 	UDP_FLAGS_CORK,		/* Cork is required */
+	UDP_FLAGS_NO_CHECK6_TX, /* Send zero UDP6 checksums on TX? */
 };
 
 struct udp_sock {
@@ -47,8 +48,7 @@ struct udp_sock {
 
 	int		 pending;	/* Any pending frames ? */
 	__u8		 encap_type;	/* Is this an Encapsulation socket? */
-	unsigned char	 no_check6_tx:1,/* Send zero UDP6 checksums on TX? */
-			 no_check6_rx:1,/* Allow zero UDP6 checksums on RX? */
+	unsigned char	 no_check6_rx:1,/* Allow zero UDP6 checksums on RX? */
 			 encap_enabled:1, /* This socket enabled encap
 					   * processing; UDP tunnels and
 					   * different encapsulation layer set
@@ -115,7 +115,7 @@ struct udp_sock {
 
 static inline void udp_set_no_check6_tx(struct sock *sk, bool val)
 {
-	udp_sk(sk)->no_check6_tx = val;
+	udp_assign_bit(NO_CHECK6_TX, sk, val);
 }
 
 static inline void udp_set_no_check6_rx(struct sock *sk, bool val)
@@ -123,9 +123,9 @@ static inline void udp_set_no_check6_rx(struct sock *sk, bool val)
 	udp_sk(sk)->no_check6_rx = val;
 }
 
-static inline bool udp_get_no_check6_tx(struct sock *sk)
+static inline bool udp_get_no_check6_tx(const struct sock *sk)
 {
-	return udp_sk(sk)->no_check6_tx;
+	return udp_test_bit(NO_CHECK6_TX, sk);
 }
 
 static inline bool udp_get_no_check6_rx(struct sock *sk)
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index 9709f8a532dc..0c6998291c99 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -2694,7 +2694,7 @@ int udp_lib_setsockopt(struct sock *sk, int level, int optname,
 		break;
 
 	case UDP_NO_CHECK6_TX:
-		up->no_check6_tx = valbool;
+		udp_set_no_check6_tx(sk, valbool);
 		break;
 
 	case UDP_NO_CHECK6_RX:
@@ -2791,7 +2791,7 @@ int udp_lib_getsockopt(struct sock *sk, int level, int optname,
 		break;
 
 	case UDP_NO_CHECK6_TX:
-		val = up->no_check6_tx;
+		val = udp_get_no_check6_tx(sk);
 		break;
 
 	case UDP_NO_CHECK6_RX:
diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c
index 0c6973cd22ce..469df0ca561f 100644
--- a/net/ipv6/udp.c
+++ b/net/ipv6/udp.c
@@ -1241,7 +1241,7 @@ static int udp_v6_send_skb(struct sk_buff *skb, struct flowi6 *fl6,
 			kfree_skb(skb);
 			return -EINVAL;
 		}
-		if (udp_sk(sk)->no_check6_tx) {
+		if (udp_get_no_check6_tx(sk)) {
 			kfree_skb(skb);
 			return -EINVAL;
 		}
@@ -1262,7 +1262,7 @@ static int udp_v6_send_skb(struct sk_buff *skb, struct flowi6 *fl6,
 
 	if (is_udplite)
 		csum = udplite_csum(skb);
-	else if (udp_sk(sk)->no_check6_tx) {   /* UDP csum disabled */
+	else if (udp_get_no_check6_tx(sk)) {   /* UDP csum disabled */
 		skb->ip_summed = CHECKSUM_NONE;
 		goto send;
 	} else if (skb->ip_summed == CHECKSUM_PARTIAL) { /* UDP hardware csum */
-- 
cgit v1.2.3


From bcbc1b1de884647aa0318bf74eb7f293d72a1e40 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Tue, 12 Sep 2023 09:17:23 +0000
Subject: udp: move udp->no_check6_rx to udp->udp_flags

syzbot reported that udp->no_check6_rx can be read locklessly.
Use one atomic bit from udp->udp_flags.

Fixes: 1c19448c9ba6 ("net: Make enabling of zero UDP6 csums more restrictive")
Reported-by: syzbot <syzkaller@googlegroups.com>
Signed-off-by: Eric Dumazet <edumazet@google.com>
Reviewed-by: Willem de Bruijn <willemb@google.com>
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
 include/linux/udp.h | 10 +++++-----
 net/ipv4/udp.c      |  4 ++--
 net/ipv6/udp.c      |  6 +++---
 3 files changed, 10 insertions(+), 10 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/udp.h b/include/linux/udp.h
index e3f2a6c7ac1d..8d4c3835b1b2 100644
--- a/include/linux/udp.h
+++ b/include/linux/udp.h
@@ -35,6 +35,7 @@ static inline u32 udp_hashfn(const struct net *net, u32 num, u32 mask)
 enum {
 	UDP_FLAGS_CORK,		/* Cork is required */
 	UDP_FLAGS_NO_CHECK6_TX, /* Send zero UDP6 checksums on TX? */
+	UDP_FLAGS_NO_CHECK6_RX, /* Allow zero UDP6 checksums on RX? */
 };
 
 struct udp_sock {
@@ -48,8 +49,7 @@ struct udp_sock {
 
 	int		 pending;	/* Any pending frames ? */
 	__u8		 encap_type;	/* Is this an Encapsulation socket? */
-	unsigned char	 no_check6_rx:1,/* Allow zero UDP6 checksums on RX? */
-			 encap_enabled:1, /* This socket enabled encap
+	unsigned char	 encap_enabled:1, /* This socket enabled encap
 					   * processing; UDP tunnels and
 					   * different encapsulation layer set
 					   * this
@@ -120,7 +120,7 @@ static inline void udp_set_no_check6_tx(struct sock *sk, bool val)
 
 static inline void udp_set_no_check6_rx(struct sock *sk, bool val)
 {
-	udp_sk(sk)->no_check6_rx = val;
+	udp_assign_bit(NO_CHECK6_RX, sk, val);
 }
 
 static inline bool udp_get_no_check6_tx(const struct sock *sk)
@@ -128,9 +128,9 @@ static inline bool udp_get_no_check6_tx(const struct sock *sk)
 	return udp_test_bit(NO_CHECK6_TX, sk);
 }
 
-static inline bool udp_get_no_check6_rx(struct sock *sk)
+static inline bool udp_get_no_check6_rx(const struct sock *sk)
 {
-	return udp_sk(sk)->no_check6_rx;
+	return udp_test_bit(NO_CHECK6_RX, sk);
 }
 
 static inline void udp_cmsg_recv(struct msghdr *msg, struct sock *sk,
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index 0c6998291c99..cb32826a1db2 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -2698,7 +2698,7 @@ int udp_lib_setsockopt(struct sock *sk, int level, int optname,
 		break;
 
 	case UDP_NO_CHECK6_RX:
-		up->no_check6_rx = valbool;
+		udp_set_no_check6_rx(sk, valbool);
 		break;
 
 	case UDP_SEGMENT:
@@ -2795,7 +2795,7 @@ int udp_lib_getsockopt(struct sock *sk, int level, int optname,
 		break;
 
 	case UDP_NO_CHECK6_RX:
-		val = up->no_check6_rx;
+		val = udp_get_no_check6_rx(sk);
 		break;
 
 	case UDP_SEGMENT:
diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c
index 469df0ca561f..6e1ea3029260 100644
--- a/net/ipv6/udp.c
+++ b/net/ipv6/udp.c
@@ -858,7 +858,7 @@ start_lookup:
 		/* If zero checksum and no_check is not on for
 		 * the socket then skip it.
 		 */
-		if (!uh->check && !udp_sk(sk)->no_check6_rx)
+		if (!uh->check && !udp_get_no_check6_rx(sk))
 			continue;
 		if (!first) {
 			first = sk;
@@ -980,7 +980,7 @@ int __udp6_lib_rcv(struct sk_buff *skb, struct udp_table *udptable,
 		if (unlikely(rcu_dereference(sk->sk_rx_dst) != dst))
 			udp6_sk_rx_dst_set(sk, dst);
 
-		if (!uh->check && !udp_sk(sk)->no_check6_rx) {
+		if (!uh->check && !udp_get_no_check6_rx(sk)) {
 			if (refcounted)
 				sock_put(sk);
 			goto report_csum_error;
@@ -1002,7 +1002,7 @@ int __udp6_lib_rcv(struct sk_buff *skb, struct udp_table *udptable,
 	/* Unicast */
 	sk = __udp6_lib_lookup_skb(skb, uh->source, uh->dest, udptable);
 	if (sk) {
-		if (!uh->check && !udp_sk(sk)->no_check6_rx)
+		if (!uh->check && !udp_get_no_check6_rx(sk))
 			goto report_csum_error;
 		return udp6_unicast_rcv_skb(sk, skb, uh);
 	}
-- 
cgit v1.2.3


From e1dc0615c6b08ef36414f08c011965b8fb56198b Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Tue, 12 Sep 2023 09:17:24 +0000
Subject: udp: move udp->gro_enabled to udp->udp_flags

syzbot reported that udp->gro_enabled can be read locklessly.
Use one atomic bit from udp->udp_flags.

Fixes: e20cf8d3f1f7 ("udp: implement GRO for plain UDP sockets.")
Reported-by: syzbot <syzkaller@googlegroups.com>
Signed-off-by: Eric Dumazet <edumazet@google.com>
Reviewed-by: Willem de Bruijn <willemb@google.com>
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
 include/linux/udp.h    | 2 +-
 net/ipv4/udp.c         | 6 +++---
 net/ipv4/udp_offload.c | 4 ++--
 net/ipv6/udp.c         | 2 +-
 4 files changed, 7 insertions(+), 7 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/udp.h b/include/linux/udp.h
index 8d4c3835b1b2..b344bd2e41fc 100644
--- a/include/linux/udp.h
+++ b/include/linux/udp.h
@@ -36,6 +36,7 @@ enum {
 	UDP_FLAGS_CORK,		/* Cork is required */
 	UDP_FLAGS_NO_CHECK6_TX, /* Send zero UDP6 checksums on TX? */
 	UDP_FLAGS_NO_CHECK6_RX, /* Allow zero UDP6 checksums on RX? */
+	UDP_FLAGS_GRO_ENABLED,	/* Request GRO aggregation */
 };
 
 struct udp_sock {
@@ -54,7 +55,6 @@ struct udp_sock {
 					   * different encapsulation layer set
 					   * this
 					   */
-			 gro_enabled:1,	/* Request GRO aggregation */
 			 accept_udp_l4:1,
 			 accept_udp_fraglist:1;
 /* indicator bits used by pcflag: */
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index cb32826a1db2..1debc10a0f02 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -1868,7 +1868,7 @@ try_again:
 						      (struct sockaddr *)sin);
 	}
 
-	if (udp_sk(sk)->gro_enabled)
+	if (udp_test_bit(GRO_ENABLED, sk))
 		udp_cmsg_recv(msg, sk, skb);
 
 	if (inet_cmsg_flags(inet))
@@ -2713,7 +2713,7 @@ int udp_lib_setsockopt(struct sock *sk, int level, int optname,
 		/* when enabling GRO, accept the related GSO packet type */
 		if (valbool)
 			udp_tunnel_encap_enable(sk->sk_socket);
-		up->gro_enabled = valbool;
+		udp_assign_bit(GRO_ENABLED, sk, valbool);
 		up->accept_udp_l4 = valbool;
 		release_sock(sk);
 		break;
@@ -2803,7 +2803,7 @@ int udp_lib_getsockopt(struct sock *sk, int level, int optname,
 		break;
 
 	case UDP_GRO:
-		val = up->gro_enabled;
+		val = udp_test_bit(GRO_ENABLED, sk);
 		break;
 
 	/* The following two cannot be changed on UDP sockets, the return is
diff --git a/net/ipv4/udp_offload.c b/net/ipv4/udp_offload.c
index 0f46b3c2e4ac..6c95d28d0c4a 100644
--- a/net/ipv4/udp_offload.c
+++ b/net/ipv4/udp_offload.c
@@ -557,10 +557,10 @@ struct sk_buff *udp_gro_receive(struct list_head *head, struct sk_buff *skb,
 	NAPI_GRO_CB(skb)->is_flist = 0;
 	if (!sk || !udp_sk(sk)->gro_receive) {
 		if (skb->dev->features & NETIF_F_GRO_FRAGLIST)
-			NAPI_GRO_CB(skb)->is_flist = sk ? !udp_sk(sk)->gro_enabled : 1;
+			NAPI_GRO_CB(skb)->is_flist = sk ? !udp_test_bit(GRO_ENABLED, sk) : 1;
 
 		if ((!sk && (skb->dev->features & NETIF_F_GRO_UDP_FWD)) ||
-		    (sk && udp_sk(sk)->gro_enabled) || NAPI_GRO_CB(skb)->is_flist)
+		    (sk && udp_test_bit(GRO_ENABLED, sk)) || NAPI_GRO_CB(skb)->is_flist)
 			return call_gro_receive(udp_gro_receive_segment, head, skb);
 
 		/* no GRO, be sure flush the current packet */
diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c
index 6e1ea3029260..2c3281879b6d 100644
--- a/net/ipv6/udp.c
+++ b/net/ipv6/udp.c
@@ -413,7 +413,7 @@ try_again:
 						      (struct sockaddr *)sin6);
 	}
 
-	if (udp_sk(sk)->gro_enabled)
+	if (udp_test_bit(GRO_ENABLED, sk))
 		udp_cmsg_recv(msg, sk, skb);
 
 	if (np->rxopt.all)
-- 
cgit v1.2.3


From f5f52f0884a595ff99ab1a608643fe4025fca2d5 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Tue, 12 Sep 2023 09:17:26 +0000
Subject: udp: move udp->accept_udp_{l4|fraglist} to udp->udp_flags

These are read locklessly, move them to udp_flags to fix data-races.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Reviewed-by: Willem de Bruijn <willemb@google.com>
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
 include/linux/udp.h | 16 +++++++++-------
 net/ipv4/udp.c      |  2 +-
 2 files changed, 10 insertions(+), 8 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/udp.h b/include/linux/udp.h
index b344bd2e41fc..bb2b87adfbea 100644
--- a/include/linux/udp.h
+++ b/include/linux/udp.h
@@ -37,6 +37,8 @@ enum {
 	UDP_FLAGS_NO_CHECK6_TX, /* Send zero UDP6 checksums on TX? */
 	UDP_FLAGS_NO_CHECK6_RX, /* Allow zero UDP6 checksums on RX? */
 	UDP_FLAGS_GRO_ENABLED,	/* Request GRO aggregation */
+	UDP_FLAGS_ACCEPT_FRAGLIST,
+	UDP_FLAGS_ACCEPT_L4,
 };
 
 struct udp_sock {
@@ -50,13 +52,11 @@ struct udp_sock {
 
 	int		 pending;	/* Any pending frames ? */
 	__u8		 encap_type;	/* Is this an Encapsulation socket? */
-	unsigned char	 encap_enabled:1, /* This socket enabled encap
+	unsigned char	 encap_enabled:1; /* This socket enabled encap
 					   * processing; UDP tunnels and
 					   * different encapsulation layer set
 					   * this
 					   */
-			 accept_udp_l4:1,
-			 accept_udp_fraglist:1;
 /* indicator bits used by pcflag: */
 #define UDPLITE_BIT      0x1  		/* set by udplite proto init function */
 #define UDPLITE_SEND_CC  0x2  		/* set via udplite setsockopt         */
@@ -149,10 +149,12 @@ static inline bool udp_unexpected_gso(struct sock *sk, struct sk_buff *skb)
 	if (!skb_is_gso(skb))
 		return false;
 
-	if (skb_shinfo(skb)->gso_type & SKB_GSO_UDP_L4 && !udp_sk(sk)->accept_udp_l4)
+	if (skb_shinfo(skb)->gso_type & SKB_GSO_UDP_L4 &&
+	    !udp_test_bit(ACCEPT_L4, sk))
 		return true;
 
-	if (skb_shinfo(skb)->gso_type & SKB_GSO_FRAGLIST && !udp_sk(sk)->accept_udp_fraglist)
+	if (skb_shinfo(skb)->gso_type & SKB_GSO_FRAGLIST &&
+	    !udp_test_bit(ACCEPT_FRAGLIST, sk))
 		return true;
 
 	return false;
@@ -160,8 +162,8 @@ static inline bool udp_unexpected_gso(struct sock *sk, struct sk_buff *skb)
 
 static inline void udp_allow_gso(struct sock *sk)
 {
-	udp_sk(sk)->accept_udp_l4 = 1;
-	udp_sk(sk)->accept_udp_fraglist = 1;
+	udp_set_bit(ACCEPT_L4, sk);
+	udp_set_bit(ACCEPT_FRAGLIST, sk);
 }
 
 #define udp_portaddr_for_each_entry(__sk, list) \
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index db43907b9a3e..75ba86a87bb6 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -2716,7 +2716,7 @@ int udp_lib_setsockopt(struct sock *sk, int level, int optname,
 		if (valbool)
 			udp_tunnel_encap_enable(sk->sk_socket);
 		udp_assign_bit(GRO_ENABLED, sk, valbool);
-		up->accept_udp_l4 = valbool;
+		udp_assign_bit(ACCEPT_L4, sk, valbool);
 		release_sock(sk);
 		break;
 
-- 
cgit v1.2.3


From ac9a7f4ce5dda1472e8f44096f33066c6ec1a3b4 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Tue, 12 Sep 2023 09:17:27 +0000
Subject: udp: lockless UDP_ENCAP_L2TPINUDP / UDP_GRO

Move udp->encap_enabled to udp->udp_flags.

Add udp_test_and_set_bit() helper to allow lockless
udp_tunnel_encap_enable() implementation.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Reviewed-by: Willem de Bruijn <willemb@google.com>
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
 include/linux/udp.h        |  9 ++++-----
 include/net/udp_tunnel.h   |  9 +++------
 net/ipv4/udp.c             | 10 +++-------
 net/ipv4/udp_tunnel_core.c |  2 +-
 net/ipv6/udp.c             |  2 +-
 5 files changed, 12 insertions(+), 20 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/udp.h b/include/linux/udp.h
index bb2b87adfbea..0cf83270a4a2 100644
--- a/include/linux/udp.h
+++ b/include/linux/udp.h
@@ -39,6 +39,7 @@ enum {
 	UDP_FLAGS_GRO_ENABLED,	/* Request GRO aggregation */
 	UDP_FLAGS_ACCEPT_FRAGLIST,
 	UDP_FLAGS_ACCEPT_L4,
+	UDP_FLAGS_ENCAP_ENABLED, /* This socket enabled encap */
 };
 
 struct udp_sock {
@@ -52,11 +53,7 @@ struct udp_sock {
 
 	int		 pending;	/* Any pending frames ? */
 	__u8		 encap_type;	/* Is this an Encapsulation socket? */
-	unsigned char	 encap_enabled:1; /* This socket enabled encap
-					   * processing; UDP tunnels and
-					   * different encapsulation layer set
-					   * this
-					   */
+
 /* indicator bits used by pcflag: */
 #define UDPLITE_BIT      0x1  		/* set by udplite proto init function */
 #define UDPLITE_SEND_CC  0x2  		/* set via udplite setsockopt         */
@@ -104,6 +101,8 @@ struct udp_sock {
 	test_bit(UDP_FLAGS_##nr, &udp_sk(sk)->udp_flags)
 #define udp_set_bit(nr, sk)			\
 	set_bit(UDP_FLAGS_##nr, &udp_sk(sk)->udp_flags)
+#define udp_test_and_set_bit(nr, sk)		\
+	test_and_set_bit(UDP_FLAGS_##nr, &udp_sk(sk)->udp_flags)
 #define udp_clear_bit(nr, sk)			\
 	clear_bit(UDP_FLAGS_##nr, &udp_sk(sk)->udp_flags)
 #define udp_assign_bit(nr, sk, val)		\
diff --git a/include/net/udp_tunnel.h b/include/net/udp_tunnel.h
index 0ca9b7a11baf..29251c3519cf 100644
--- a/include/net/udp_tunnel.h
+++ b/include/net/udp_tunnel.h
@@ -174,16 +174,13 @@ static inline int udp_tunnel_handle_offloads(struct sk_buff *skb, bool udp_csum)
 }
 #endif
 
-static inline void udp_tunnel_encap_enable(struct socket *sock)
+static inline void udp_tunnel_encap_enable(struct sock *sk)
 {
-	struct udp_sock *up = udp_sk(sock->sk);
-
-	if (up->encap_enabled)
+	if (udp_test_and_set_bit(ENCAP_ENABLED, sk))
 		return;
 
-	up->encap_enabled = 1;
 #if IS_ENABLED(CONFIG_IPV6)
-	if (sock->sk->sk_family == PF_INET6)
+	if (READ_ONCE(sk->sk_family) == PF_INET6)
 		ipv6_stub->udpv6_encap_enable();
 #endif
 	udp_encap_enable();
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index 75ba86a87bb6..637a4faf9aff 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -2618,7 +2618,7 @@ void udp_destroy_sock(struct sock *sk)
 			if (encap_destroy)
 				encap_destroy(sk);
 		}
-		if (up->encap_enabled)
+		if (udp_test_bit(ENCAP_ENABLED, sk))
 			static_branch_dec(&udp_encap_needed_key);
 	}
 }
@@ -2685,9 +2685,7 @@ int udp_lib_setsockopt(struct sock *sk, int level, int optname,
 			fallthrough;
 		case UDP_ENCAP_L2TPINUDP:
 			up->encap_type = val;
-			lock_sock(sk);
-			udp_tunnel_encap_enable(sk->sk_socket);
-			release_sock(sk);
+			udp_tunnel_encap_enable(sk);
 			break;
 		default:
 			err = -ENOPROTOOPT;
@@ -2710,14 +2708,12 @@ int udp_lib_setsockopt(struct sock *sk, int level, int optname,
 		break;
 
 	case UDP_GRO:
-		lock_sock(sk);
 
 		/* when enabling GRO, accept the related GSO packet type */
 		if (valbool)
-			udp_tunnel_encap_enable(sk->sk_socket);
+			udp_tunnel_encap_enable(sk);
 		udp_assign_bit(GRO_ENABLED, sk, valbool);
 		udp_assign_bit(ACCEPT_L4, sk, valbool);
-		release_sock(sk);
 		break;
 
 	/*
diff --git a/net/ipv4/udp_tunnel_core.c b/net/ipv4/udp_tunnel_core.c
index 9b18f371af0d..1e7e4aecdc48 100644
--- a/net/ipv4/udp_tunnel_core.c
+++ b/net/ipv4/udp_tunnel_core.c
@@ -78,7 +78,7 @@ void setup_udp_tunnel_sock(struct net *net, struct socket *sock,
 	udp_sk(sk)->gro_receive = cfg->gro_receive;
 	udp_sk(sk)->gro_complete = cfg->gro_complete;
 
-	udp_tunnel_encap_enable(sock);
+	udp_tunnel_encap_enable(sk);
 }
 EXPORT_SYMBOL_GPL(setup_udp_tunnel_sock);
 
diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c
index 2c3281879b6d..90688877e900 100644
--- a/net/ipv6/udp.c
+++ b/net/ipv6/udp.c
@@ -1670,7 +1670,7 @@ void udpv6_destroy_sock(struct sock *sk)
 			if (encap_destroy)
 				encap_destroy(sk);
 		}
-		if (up->encap_enabled) {
+		if (udp_test_bit(ENCAP_ENABLED, sk)) {
 			static_branch_dec(&udpv6_encap_needed_key);
 			udp_encap_disable();
 		}
-- 
cgit v1.2.3


From 729549aa350c56a777bb342941ed4d69b6585769 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Tue, 12 Sep 2023 09:17:29 +0000
Subject: udplite: remove UDPLITE_BIT

This flag is set but never read, we can remove it.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Reviewed-by: Willem de Bruijn <willemb@google.com>
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
 include/linux/udp.h | 5 ++---
 net/ipv4/udplite.c  | 1 -
 net/ipv6/udplite.c  | 1 -
 3 files changed, 2 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/udp.h b/include/linux/udp.h
index 0cf83270a4a2..58156edec009 100644
--- a/include/linux/udp.h
+++ b/include/linux/udp.h
@@ -55,9 +55,8 @@ struct udp_sock {
 	__u8		 encap_type;	/* Is this an Encapsulation socket? */
 
 /* indicator bits used by pcflag: */
-#define UDPLITE_BIT      0x1  		/* set by udplite proto init function */
-#define UDPLITE_SEND_CC  0x2  		/* set via udplite setsockopt         */
-#define UDPLITE_RECV_CC  0x4		/* set via udplite setsocktopt        */
+#define UDPLITE_SEND_CC  0x1  		/* set via udplite setsockopt         */
+#define UDPLITE_RECV_CC  0x2		/* set via udplite setsocktopt        */
 	__u8		 pcflag;        /* marks socket as UDP-Lite if > 0    */
 	/*
 	 * Following member retains the information to create a UDP header
diff --git a/net/ipv4/udplite.c b/net/ipv4/udplite.c
index 39ecdad1b50c..af37af3ab727 100644
--- a/net/ipv4/udplite.c
+++ b/net/ipv4/udplite.c
@@ -21,7 +21,6 @@ EXPORT_SYMBOL(udplite_table);
 static int udplite_sk_init(struct sock *sk)
 {
 	udp_init_sock(sk);
-	udp_sk(sk)->pcflag = UDPLITE_BIT;
 	pr_warn_once("UDP-Lite is deprecated and scheduled to be removed in 2025, "
 		     "please contact the netdev mailing list\n");
 	return 0;
diff --git a/net/ipv6/udplite.c b/net/ipv6/udplite.c
index 267d491e9707..a60bec9b14f1 100644
--- a/net/ipv6/udplite.c
+++ b/net/ipv6/udplite.c
@@ -17,7 +17,6 @@
 static int udplitev6_sk_init(struct sock *sk)
 {
 	udpv6_init_sock(sk);
-	udp_sk(sk)->pcflag = UDPLITE_BIT;
 	pr_warn_once("UDP-Lite is deprecated and scheduled to be removed in 2025, "
 		     "please contact the netdev mailing list\n");
 	return 0;
-- 
cgit v1.2.3


From 882af43a0fc37e26d85fb0df0c9edd3bed928de4 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Tue, 12 Sep 2023 09:17:30 +0000
Subject: udplite: fix various data-races

udp->pcflag, udp->pcslen and udp->pcrlen reads/writes are racy.

Move udp->pcflag to udp->udp_flags for atomicity,
and add READ_ONCE()/WRITE_ONCE() annotations for pcslen and pcrlen.

Fixes: ba4e58eca8aa ("[NET]: Supporting UDP-Lite (RFC 3828) in Linux")
Signed-off-by: Eric Dumazet <edumazet@google.com>
Reviewed-by: Willem de Bruijn <willemb@google.com>
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
 include/linux/udp.h   |  6 ++----
 include/net/udplite.h | 14 +++++++++-----
 net/ipv4/udp.c        | 21 +++++++++++----------
 net/ipv6/udp.c        |  9 +++++----
 4 files changed, 27 insertions(+), 23 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/udp.h b/include/linux/udp.h
index 58156edec009..d04188714dca 100644
--- a/include/linux/udp.h
+++ b/include/linux/udp.h
@@ -40,6 +40,8 @@ enum {
 	UDP_FLAGS_ACCEPT_FRAGLIST,
 	UDP_FLAGS_ACCEPT_L4,
 	UDP_FLAGS_ENCAP_ENABLED, /* This socket enabled encap */
+	UDP_FLAGS_UDPLITE_SEND_CC, /* set via udplite setsockopt */
+	UDP_FLAGS_UDPLITE_RECV_CC, /* set via udplite setsockopt */
 };
 
 struct udp_sock {
@@ -54,10 +56,6 @@ struct udp_sock {
 	int		 pending;	/* Any pending frames ? */
 	__u8		 encap_type;	/* Is this an Encapsulation socket? */
 
-/* indicator bits used by pcflag: */
-#define UDPLITE_SEND_CC  0x1  		/* set via udplite setsockopt         */
-#define UDPLITE_RECV_CC  0x2		/* set via udplite setsocktopt        */
-	__u8		 pcflag;        /* marks socket as UDP-Lite if > 0    */
 	/*
 	 * Following member retains the information to create a UDP header
 	 * when the socket is uncorked.
diff --git a/include/net/udplite.h b/include/net/udplite.h
index bd33ff2b8f42..786919d29f8d 100644
--- a/include/net/udplite.h
+++ b/include/net/udplite.h
@@ -66,14 +66,18 @@ static inline int udplite_checksum_init(struct sk_buff *skb, struct udphdr *uh)
 /* Fast-path computation of checksum. Socket may not be locked. */
 static inline __wsum udplite_csum(struct sk_buff *skb)
 {
-	const struct udp_sock *up = udp_sk(skb->sk);
 	const int off = skb_transport_offset(skb);
+	const struct sock *sk = skb->sk;
 	int len = skb->len - off;
 
-	if ((up->pcflag & UDPLITE_SEND_CC) && up->pcslen < len) {
-		if (0 < up->pcslen)
-			len = up->pcslen;
-		udp_hdr(skb)->len = htons(up->pcslen);
+	if (udp_test_bit(UDPLITE_SEND_CC, sk)) {
+		u16 pcslen = READ_ONCE(udp_sk(sk)->pcslen);
+
+		if (pcslen < len) {
+			if (pcslen > 0)
+				len = pcslen;
+			udp_hdr(skb)->len = htons(pcslen);
+		}
 	}
 	skb->ip_summed = CHECKSUM_NONE;     /* no HW support for checksumming */
 
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index 2eeab4af17a1..c3ff984b6354 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -2120,7 +2120,8 @@ static int udp_queue_rcv_one_skb(struct sock *sk, struct sk_buff *skb)
 	/*
 	 * 	UDP-Lite specific tests, ignored on UDP sockets
 	 */
-	if ((up->pcflag & UDPLITE_RECV_CC)  &&  UDP_SKB_CB(skb)->partial_cov) {
+	if (udp_test_bit(UDPLITE_RECV_CC, sk) && UDP_SKB_CB(skb)->partial_cov) {
+		u16 pcrlen = READ_ONCE(up->pcrlen);
 
 		/*
 		 * MIB statistics other than incrementing the error count are
@@ -2133,7 +2134,7 @@ static int udp_queue_rcv_one_skb(struct sock *sk, struct sk_buff *skb)
 		 * delivery of packets with coverage values less than a value
 		 * provided by the application."
 		 */
-		if (up->pcrlen == 0) {          /* full coverage was set  */
+		if (pcrlen == 0) {          /* full coverage was set  */
 			net_dbg_ratelimited("UDPLite: partial coverage %d while full coverage %d requested\n",
 					    UDP_SKB_CB(skb)->cscov, skb->len);
 			goto drop;
@@ -2144,9 +2145,9 @@ static int udp_queue_rcv_one_skb(struct sock *sk, struct sk_buff *skb)
 		 * that it wants x while sender emits packets of smaller size y.
 		 * Therefore the above ...()->partial_cov statement is essential.
 		 */
-		if (UDP_SKB_CB(skb)->cscov  <  up->pcrlen) {
+		if (UDP_SKB_CB(skb)->cscov < pcrlen) {
 			net_dbg_ratelimited("UDPLite: coverage %d too small, need min %d\n",
-					    UDP_SKB_CB(skb)->cscov, up->pcrlen);
+					    UDP_SKB_CB(skb)->cscov, pcrlen);
 			goto drop;
 		}
 	}
@@ -2729,8 +2730,8 @@ int udp_lib_setsockopt(struct sock *sk, int level, int optname,
 			val = 8;
 		else if (val > USHRT_MAX)
 			val = USHRT_MAX;
-		up->pcslen = val;
-		up->pcflag |= UDPLITE_SEND_CC;
+		WRITE_ONCE(up->pcslen, val);
+		udp_set_bit(UDPLITE_SEND_CC, sk);
 		break;
 
 	/* The receiver specifies a minimum checksum coverage value. To make
@@ -2743,8 +2744,8 @@ int udp_lib_setsockopt(struct sock *sk, int level, int optname,
 			val = 8;
 		else if (val > USHRT_MAX)
 			val = USHRT_MAX;
-		up->pcrlen = val;
-		up->pcflag |= UDPLITE_RECV_CC;
+		WRITE_ONCE(up->pcrlen, val);
+		udp_set_bit(UDPLITE_RECV_CC, sk);
 		break;
 
 	default:
@@ -2808,11 +2809,11 @@ int udp_lib_getsockopt(struct sock *sk, int level, int optname,
 	/* The following two cannot be changed on UDP sockets, the return is
 	 * always 0 (which corresponds to the full checksum coverage of UDP). */
 	case UDPLITE_SEND_CSCOV:
-		val = up->pcslen;
+		val = READ_ONCE(up->pcslen);
 		break;
 
 	case UDPLITE_RECV_CSCOV:
-		val = up->pcrlen;
+		val = READ_ONCE(up->pcrlen);
 		break;
 
 	default:
diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c
index 0e79d189613b..f60ba4295435 100644
--- a/net/ipv6/udp.c
+++ b/net/ipv6/udp.c
@@ -727,16 +727,17 @@ static int udpv6_queue_rcv_one_skb(struct sock *sk, struct sk_buff *skb)
 	/*
 	 * UDP-Lite specific tests, ignored on UDP sockets (see net/ipv4/udp.c).
 	 */
-	if ((up->pcflag & UDPLITE_RECV_CC)  &&  UDP_SKB_CB(skb)->partial_cov) {
+	if (udp_test_bit(UDPLITE_RECV_CC, sk) && UDP_SKB_CB(skb)->partial_cov) {
+		u16 pcrlen = READ_ONCE(up->pcrlen);
 
-		if (up->pcrlen == 0) {          /* full coverage was set  */
+		if (pcrlen == 0) {          /* full coverage was set  */
 			net_dbg_ratelimited("UDPLITE6: partial coverage %d while full coverage %d requested\n",
 					    UDP_SKB_CB(skb)->cscov, skb->len);
 			goto drop;
 		}
-		if (UDP_SKB_CB(skb)->cscov  <  up->pcrlen) {
+		if (UDP_SKB_CB(skb)->cscov < pcrlen) {
 			net_dbg_ratelimited("UDPLITE6: coverage %d too small, need min %d\n",
-					    UDP_SKB_CB(skb)->cscov, up->pcrlen);
+					    UDP_SKB_CB(skb)->cscov, pcrlen);
 			goto drop;
 		}
 	}
-- 
cgit v1.2.3


From 2758ac3a11d78af56e6969af04dec611806a62de Mon Sep 17 00:00:00 2001
From: Bartosz Golaszewski <bartosz.golaszewski@linaro.org>
Date: Wed, 13 Sep 2023 21:28:25 +0200
Subject: firmware: qcom-scm: drop unneeded 'extern' specifiers

The 'extern' specifier in front of a function declaration has no effect.
Remove all of them from the qcom-scm header.

Signed-off-by: Bartosz Golaszewski <bartosz.golaszewski@linaro.org>
Reviewed-by: Krzysztof Kozlowski <krzysztof.kozlowski@linaro.org>
Reviewed-by: Bjorn Andersson <andersson@kernel.org>
Link: https://lore.kernel.org/r/20230913192826.36187-1-bartosz.golaszewski@linaro.org
Signed-off-by: Bjorn Andersson <andersson@kernel.org>
---
 include/linux/firmware/qcom/qcom_scm.h | 101 +++++++++++++++------------------
 1 file changed, 47 insertions(+), 54 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/firmware/qcom/qcom_scm.h b/include/linux/firmware/qcom/qcom_scm.h
index e1ea2eb56d04..ccaf28846054 100644
--- a/include/linux/firmware/qcom/qcom_scm.h
+++ b/include/linux/firmware/qcom/qcom_scm.h
@@ -59,12 +59,12 @@ enum qcom_scm_ice_cipher {
 #define QCOM_SCM_PERM_RW (QCOM_SCM_PERM_READ | QCOM_SCM_PERM_WRITE)
 #define QCOM_SCM_PERM_RWX (QCOM_SCM_PERM_RW | QCOM_SCM_PERM_EXEC)
 
-extern bool qcom_scm_is_available(void);
+bool qcom_scm_is_available(void);
 
-extern int qcom_scm_set_cold_boot_addr(void *entry);
-extern int qcom_scm_set_warm_boot_addr(void *entry);
-extern void qcom_scm_cpu_power_down(u32 flags);
-extern int qcom_scm_set_remote_state(u32 state, u32 id);
+int qcom_scm_set_cold_boot_addr(void *entry);
+int qcom_scm_set_warm_boot_addr(void *entry);
+void qcom_scm_cpu_power_down(u32 flags);
+int qcom_scm_set_remote_state(u32 state, u32 id);
 
 struct qcom_scm_pas_metadata {
 	void *ptr;
@@ -72,55 +72,48 @@ struct qcom_scm_pas_metadata {
 	ssize_t size;
 };
 
-extern int qcom_scm_pas_init_image(u32 peripheral, const void *metadata,
-				   size_t size,
-				   struct qcom_scm_pas_metadata *ctx);
-extern void qcom_scm_pas_metadata_release(struct qcom_scm_pas_metadata *ctx);
-extern int qcom_scm_pas_mem_setup(u32 peripheral, phys_addr_t addr,
-				  phys_addr_t size);
-extern int qcom_scm_pas_auth_and_reset(u32 peripheral);
-extern int qcom_scm_pas_shutdown(u32 peripheral);
-extern bool qcom_scm_pas_supported(u32 peripheral);
-
-extern int qcom_scm_io_readl(phys_addr_t addr, unsigned int *val);
-extern int qcom_scm_io_writel(phys_addr_t addr, unsigned int val);
-
-extern bool qcom_scm_restore_sec_cfg_available(void);
-extern int qcom_scm_restore_sec_cfg(u32 device_id, u32 spare);
-extern int qcom_scm_iommu_secure_ptbl_size(u32 spare, size_t *size);
-extern int qcom_scm_iommu_secure_ptbl_init(u64 addr, u32 size, u32 spare);
-extern int qcom_scm_iommu_set_cp_pool_size(u32 spare, u32 size);
-extern int qcom_scm_mem_protect_video_var(u32 cp_start, u32 cp_size,
-					  u32 cp_nonpixel_start,
-					  u32 cp_nonpixel_size);
-extern int qcom_scm_assign_mem(phys_addr_t mem_addr, size_t mem_sz,
-			       u64 *src,
-			       const struct qcom_scm_vmperm *newvm,
-			       unsigned int dest_cnt);
-
-extern bool qcom_scm_ocmem_lock_available(void);
-extern int qcom_scm_ocmem_lock(enum qcom_scm_ocmem_client id, u32 offset,
-			       u32 size, u32 mode);
-extern int qcom_scm_ocmem_unlock(enum qcom_scm_ocmem_client id, u32 offset,
-				 u32 size);
-
-extern bool qcom_scm_ice_available(void);
-extern int qcom_scm_ice_invalidate_key(u32 index);
-extern int qcom_scm_ice_set_key(u32 index, const u8 *key, u32 key_size,
-				enum qcom_scm_ice_cipher cipher,
-				u32 data_unit_size);
-
-extern bool qcom_scm_hdcp_available(void);
-extern int qcom_scm_hdcp_req(struct qcom_scm_hdcp_req *req, u32 req_cnt,
-			     u32 *resp);
-
-extern int qcom_scm_iommu_set_pt_format(u32 sec_id, u32 ctx_num, u32 pt_fmt);
-extern int qcom_scm_qsmmu500_wait_safe_toggle(bool en);
-
-extern int qcom_scm_lmh_dcvsh(u32 payload_fn, u32 payload_reg, u32 payload_val,
-			      u64 limit_node, u32 node_id, u64 version);
-extern int qcom_scm_lmh_profile_change(u32 profile_id);
-extern bool qcom_scm_lmh_dcvsh_available(void);
+int qcom_scm_pas_init_image(u32 peripheral, const void *metadata, size_t size,
+			    struct qcom_scm_pas_metadata *ctx);
+void qcom_scm_pas_metadata_release(struct qcom_scm_pas_metadata *ctx);
+int qcom_scm_pas_mem_setup(u32 peripheral, phys_addr_t addr, phys_addr_t size);
+int qcom_scm_pas_auth_and_reset(u32 peripheral);
+int qcom_scm_pas_shutdown(u32 peripheral);
+bool qcom_scm_pas_supported(u32 peripheral);
+
+int qcom_scm_io_readl(phys_addr_t addr, unsigned int *val);
+int qcom_scm_io_writel(phys_addr_t addr, unsigned int val);
+
+bool qcom_scm_restore_sec_cfg_available(void);
+int qcom_scm_restore_sec_cfg(u32 device_id, u32 spare);
+int qcom_scm_iommu_secure_ptbl_size(u32 spare, size_t *size);
+int qcom_scm_iommu_secure_ptbl_init(u64 addr, u32 size, u32 spare);
+int qcom_scm_iommu_set_cp_pool_size(u32 spare, u32 size);
+int qcom_scm_mem_protect_video_var(u32 cp_start, u32 cp_size,
+				   u32 cp_nonpixel_start, u32 cp_nonpixel_size);
+int qcom_scm_assign_mem(phys_addr_t mem_addr, size_t mem_sz, u64 *src,
+			const struct qcom_scm_vmperm *newvm,
+			unsigned int dest_cnt);
+
+bool qcom_scm_ocmem_lock_available(void);
+int qcom_scm_ocmem_lock(enum qcom_scm_ocmem_client id, u32 offset, u32 size,
+			u32 mode);
+int qcom_scm_ocmem_unlock(enum qcom_scm_ocmem_client id, u32 offset, u32 size);
+
+bool qcom_scm_ice_available(void);
+int qcom_scm_ice_invalidate_key(u32 index);
+int qcom_scm_ice_set_key(u32 index, const u8 *key, u32 key_size,
+			 enum qcom_scm_ice_cipher cipher, u32 data_unit_size);
+
+bool qcom_scm_hdcp_available(void);
+int qcom_scm_hdcp_req(struct qcom_scm_hdcp_req *req, u32 req_cnt, u32 *resp);
+
+int qcom_scm_iommu_set_pt_format(u32 sec_id, u32 ctx_num, u32 pt_fmt);
+int qcom_scm_qsmmu500_wait_safe_toggle(bool en);
+
+int qcom_scm_lmh_dcvsh(u32 payload_fn, u32 payload_reg, u32 payload_val,
+		       u64 limit_node, u32 node_id, u64 version);
+int qcom_scm_lmh_profile_change(u32 profile_id);
+bool qcom_scm_lmh_dcvsh_available(void);
 
 #ifdef CONFIG_QCOM_QSEECOM
 
-- 
cgit v1.2.3


From a721f7b8c3548e943e514a957f2a37f4763b9888 Mon Sep 17 00:00:00 2001
From: Khadija Kamran <kamrankhadijadj@gmail.com>
Date: Wed, 23 Aug 2023 13:16:40 +0500
Subject: lsm: constify 'bprm' parameter in security_bprm_committed_creds()

Three LSMs register the implementations for the 'bprm_committed_creds()'
hook: AppArmor, SELinux and tomoyo. Looking at the function
implementations we may observe that the 'bprm' parameter is not changing.

Mark the 'bprm' parameter of LSM hook security_bprm_committed_creds() as
'const' since it will not be changing in the LSM hook.

Signed-off-by: Khadija Kamran <kamrankhadijadj@gmail.com>
[PM: minor merge fuzzing due to other constification patches]
Signed-off-by: Paul Moore <paul@paul-moore.com>
---
 include/linux/lsm_hook_defs.h | 2 +-
 include/linux/security.h      | 4 ++--
 security/apparmor/lsm.c       | 2 +-
 security/security.c           | 2 +-
 security/selinux/hooks.c      | 2 +-
 security/tomoyo/tomoyo.c      | 2 +-
 6 files changed, 7 insertions(+), 7 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/lsm_hook_defs.h b/include/linux/lsm_hook_defs.h
index f6acc3ed66a3..3b0f5cfca464 100644
--- a/include/linux/lsm_hook_defs.h
+++ b/include/linux/lsm_hook_defs.h
@@ -53,7 +53,7 @@ LSM_HOOK(int, 0, bprm_creds_for_exec, struct linux_binprm *bprm)
 LSM_HOOK(int, 0, bprm_creds_from_file, struct linux_binprm *bprm, const struct file *file)
 LSM_HOOK(int, 0, bprm_check_security, struct linux_binprm *bprm)
 LSM_HOOK(void, LSM_RET_VOID, bprm_committing_creds, const struct linux_binprm *bprm)
-LSM_HOOK(void, LSM_RET_VOID, bprm_committed_creds, struct linux_binprm *bprm)
+LSM_HOOK(void, LSM_RET_VOID, bprm_committed_creds, const struct linux_binprm *bprm)
 LSM_HOOK(int, 0, fs_context_submount, struct fs_context *fc, struct super_block *reference)
 LSM_HOOK(int, 0, fs_context_dup, struct fs_context *fc,
 	 struct fs_context *src_sc)
diff --git a/include/linux/security.h b/include/linux/security.h
index 885053f81019..3148103123fb 100644
--- a/include/linux/security.h
+++ b/include/linux/security.h
@@ -293,7 +293,7 @@ int security_bprm_creds_for_exec(struct linux_binprm *bprm);
 int security_bprm_creds_from_file(struct linux_binprm *bprm, const struct file *file);
 int security_bprm_check(struct linux_binprm *bprm);
 void security_bprm_committing_creds(const struct linux_binprm *bprm);
-void security_bprm_committed_creds(struct linux_binprm *bprm);
+void security_bprm_committed_creds(const struct linux_binprm *bprm);
 int security_fs_context_submount(struct fs_context *fc, struct super_block *reference);
 int security_fs_context_dup(struct fs_context *fc, struct fs_context *src_fc);
 int security_fs_context_parse_param(struct fs_context *fc, struct fs_parameter *param);
@@ -627,7 +627,7 @@ static inline void security_bprm_committing_creds(const struct linux_binprm *bpr
 {
 }
 
-static inline void security_bprm_committed_creds(struct linux_binprm *bprm)
+static inline void security_bprm_committed_creds(const struct linux_binprm *bprm)
 {
 }
 
diff --git a/security/apparmor/lsm.c b/security/apparmor/lsm.c
index b03f46e0f6c5..3fa325d5efac 100644
--- a/security/apparmor/lsm.c
+++ b/security/apparmor/lsm.c
@@ -756,7 +756,7 @@ static void apparmor_bprm_committing_creds(const struct linux_binprm *bprm)
  * apparmor_bprm_committed_creds() - do cleanup after new creds committed
  * @bprm: binprm for the exec  (NOT NULL)
  */
-static void apparmor_bprm_committed_creds(struct linux_binprm *bprm)
+static void apparmor_bprm_committed_creds(const struct linux_binprm *bprm)
 {
 	/* clear out temporary/transitional state from the context */
 	aa_clear_task_ctx_trans(task_ctx(current));
diff --git a/security/security.c b/security/security.c
index 77a1601ead36..e4aec0f65f75 100644
--- a/security/security.c
+++ b/security/security.c
@@ -1134,7 +1134,7 @@ void security_bprm_committing_creds(const struct linux_binprm *bprm)
  * process such as clearing out non-inheritable signal state.  This is called
  * immediately after commit_creds().
  */
-void security_bprm_committed_creds(struct linux_binprm *bprm)
+void security_bprm_committed_creds(const struct linux_binprm *bprm)
 {
 	call_void_hook(bprm_committed_creds, bprm);
 }
diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c
index f42a1b78bc43..e9ee008a9537 100644
--- a/security/selinux/hooks.c
+++ b/security/selinux/hooks.c
@@ -2501,7 +2501,7 @@ static void selinux_bprm_committing_creds(const struct linux_binprm *bprm)
  * Clean up the process immediately after the installation of new credentials
  * due to exec
  */
-static void selinux_bprm_committed_creds(struct linux_binprm *bprm)
+static void selinux_bprm_committed_creds(const struct linux_binprm *bprm)
 {
 	const struct task_security_struct *tsec = selinux_cred(current_cred());
 	u32 osid, sid;
diff --git a/security/tomoyo/tomoyo.c b/security/tomoyo/tomoyo.c
index 25006fddc964..255f1b470295 100644
--- a/security/tomoyo/tomoyo.c
+++ b/security/tomoyo/tomoyo.c
@@ -52,7 +52,7 @@ static int tomoyo_cred_prepare(struct cred *new, const struct cred *old,
  *
  * @bprm: Pointer to "struct linux_binprm".
  */
-static void tomoyo_bprm_committed_creds(struct linux_binprm *bprm)
+static void tomoyo_bprm_committed_creds(const struct linux_binprm *bprm)
 {
 	/* Clear old_domain_info saved by execve() request. */
 	struct tomoyo_task *s = tomoyo_task(current);
-- 
cgit v1.2.3


From 20a2aa47097aae7016209c4dbe392b3b25e0d883 Mon Sep 17 00:00:00 2001
From: Khadija Kamran <kamrankhadijadj@gmail.com>
Date: Wed, 23 Aug 2023 14:01:28 +0500
Subject: lsm: constify 'sb' parameter in security_sb_kern_mount()

The "sb_kern_mount" hook has implementation registered in SELinux.
Looking at the function implementation we observe that the "sb"
parameter is not changing.

Mark the "sb" parameter of LSM hook security_sb_kern_mount() as "const"
since it will not be changing in the LSM hook.

Signed-off-by: Khadija Kamran <kamrankhadijadj@gmail.com>
[PM: minor merge fuzzing due to other constification patches]
Signed-off-by: Paul Moore <paul@paul-moore.com>
---
 include/linux/lsm_hook_defs.h | 2 +-
 include/linux/security.h      | 2 +-
 security/security.c           | 2 +-
 security/selinux/hooks.c      | 2 +-
 4 files changed, 4 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/lsm_hook_defs.h b/include/linux/lsm_hook_defs.h
index 3b0f5cfca464..99b8176c3738 100644
--- a/include/linux/lsm_hook_defs.h
+++ b/include/linux/lsm_hook_defs.h
@@ -66,7 +66,7 @@ LSM_HOOK(void, LSM_RET_VOID, sb_free_mnt_opts, void *mnt_opts)
 LSM_HOOK(int, 0, sb_eat_lsm_opts, char *orig, void **mnt_opts)
 LSM_HOOK(int, 0, sb_mnt_opts_compat, struct super_block *sb, void *mnt_opts)
 LSM_HOOK(int, 0, sb_remount, struct super_block *sb, void *mnt_opts)
-LSM_HOOK(int, 0, sb_kern_mount, struct super_block *sb)
+LSM_HOOK(int, 0, sb_kern_mount, const struct super_block *sb)
 LSM_HOOK(int, 0, sb_show_options, struct seq_file *m, struct super_block *sb)
 LSM_HOOK(int, 0, sb_statfs, struct dentry *dentry)
 LSM_HOOK(int, 0, sb_mount, const char *dev_name, const struct path *path,
diff --git a/include/linux/security.h b/include/linux/security.h
index 3148103123fb..1d1df326c881 100644
--- a/include/linux/security.h
+++ b/include/linux/security.h
@@ -304,7 +304,7 @@ void security_free_mnt_opts(void **mnt_opts);
 int security_sb_eat_lsm_opts(char *options, void **mnt_opts);
 int security_sb_mnt_opts_compat(struct super_block *sb, void *mnt_opts);
 int security_sb_remount(struct super_block *sb, void *mnt_opts);
-int security_sb_kern_mount(struct super_block *sb);
+int security_sb_kern_mount(const struct super_block *sb);
 int security_sb_show_options(struct seq_file *m, struct super_block *sb);
 int security_sb_statfs(struct dentry *dentry);
 int security_sb_mount(const char *dev_name, const struct path *path,
diff --git a/security/security.c b/security/security.c
index e4aec0f65f75..7b0052e96806 100644
--- a/security/security.c
+++ b/security/security.c
@@ -1319,7 +1319,7 @@ EXPORT_SYMBOL(security_sb_remount);
  *
  * Return: Returns 0 if permission is granted.
  */
-int security_sb_kern_mount(struct super_block *sb)
+int security_sb_kern_mount(const struct super_block *sb)
 {
 	return call_int_hook(sb_kern_mount, 0, sb);
 }
diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c
index e9ee008a9537..195db92ac99c 100644
--- a/security/selinux/hooks.c
+++ b/security/selinux/hooks.c
@@ -2721,7 +2721,7 @@ out_bad_option:
 	return -EINVAL;
 }
 
-static int selinux_sb_kern_mount(struct super_block *sb)
+static int selinux_sb_kern_mount(const struct super_block *sb)
 {
 	const struct cred *cred = current_cred();
 	struct common_audit_data ad;
-- 
cgit v1.2.3


From 4de7b17fd05d03fa919e8c47fc66122bd24d7b6c Mon Sep 17 00:00:00 2001
From: "Matthew Wilcox (Oracle)" <willy@infradead.org>
Date: Mon, 21 Aug 2023 14:44:28 +0100
Subject: sched: Assert for_each_thread() is properly locked

list_for_each_entry_rcu() takes an optional fourth argument which
allows RCU to assert that the correct lock is held.  Several callers
of for_each_thread() rely on their caller to be holding the appropriate
lock, so this is a useful assertion to include.

Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Reviewed-by: Joel Fernandes (Google) <joel@joelfernandes.org>
Link: https://lore.kernel.org/r/20230821134428.2504912-1-willy@infradead.org
---
 include/linux/sched/signal.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/sched/signal.h b/include/linux/sched/signal.h
index 0014d3adaf84..9610bad018a3 100644
--- a/include/linux/sched/signal.h
+++ b/include/linux/sched/signal.h
@@ -656,7 +656,8 @@ extern bool current_is_single_threaded(void);
 	while ((t = next_thread(t)) != g)
 
 #define __for_each_thread(signal, t)	\
-	list_for_each_entry_rcu(t, &(signal)->thread_head, thread_node)
+	list_for_each_entry_rcu(t, &(signal)->thread_head, thread_node, \
+		lockdep_is_held(&tasklist_lock))
 
 #define for_each_thread(p, t)		\
 	__for_each_thread((p)->signal, t)
-- 
cgit v1.2.3


From b0adfba7ee770fef20b1b6d86706c28f7fccfb07 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Tue, 12 Sep 2023 16:01:59 +0000
Subject: ipv6: lockless IPV6_UNICAST_HOPS implementation

Some np->hop_limit accesses are racy, when socket lock is not held.

Add missing annotations and switch to full lockless implementation.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Reviewed-by: David Ahern <dsahern@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/ipv6.h     | 12 +-----------
 include/net/ipv6.h       |  2 +-
 net/ipv6/ip6_output.c    |  2 +-
 net/ipv6/ipv6_sockglue.c | 20 +++++++++++---------
 net/ipv6/mcast.c         |  2 +-
 net/ipv6/ndisc.c         |  2 +-
 6 files changed, 16 insertions(+), 24 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/ipv6.h b/include/linux/ipv6.h
index af8a771a053c..c2e087071384 100644
--- a/include/linux/ipv6.h
+++ b/include/linux/ipv6.h
@@ -213,17 +213,7 @@ struct ipv6_pinfo {
 	__be32			flow_label;
 	__u32			frag_size;
 
-	/*
-	 * Packed in 16bits.
-	 * Omit one shift by putting the signed field at MSB.
-	 */
-#if defined(__BIG_ENDIAN_BITFIELD)
-	__s16			hop_limit:9;
-	__u16			__unused_1:7;
-#else
-	__u16			__unused_1:7;
-	__s16			hop_limit:9;
-#endif
+	s16			hop_limit;
 
 #if defined(__BIG_ENDIAN_BITFIELD)
 	/* Packed in 16bits. */
diff --git a/include/net/ipv6.h b/include/net/ipv6.h
index c6932d1a3fa8..2e8e7e31e02e 100644
--- a/include/net/ipv6.h
+++ b/include/net/ipv6.h
@@ -916,7 +916,7 @@ static inline int ip6_sk_dst_hoplimit(struct ipv6_pinfo *np, struct flowi6 *fl6,
 	if (ipv6_addr_is_multicast(&fl6->daddr))
 		hlimit = np->mcast_hops;
 	else
-		hlimit = np->hop_limit;
+		hlimit = READ_ONCE(np->hop_limit);
 	if (hlimit < 0)
 		hlimit = ip6_dst_hoplimit(dst);
 	return hlimit;
diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c
index 54fc4c711f2c..1e16d56d8c38 100644
--- a/net/ipv6/ip6_output.c
+++ b/net/ipv6/ip6_output.c
@@ -309,7 +309,7 @@ int ip6_xmit(const struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6,
 	 *	Fill in the IPv6 header
 	 */
 	if (np)
-		hlimit = np->hop_limit;
+		hlimit = READ_ONCE(np->hop_limit);
 	if (hlimit < 0)
 		hlimit = ip6_dst_hoplimit(dst);
 
diff --git a/net/ipv6/ipv6_sockglue.c b/net/ipv6/ipv6_sockglue.c
index 0e2a0847b387..f27993a1470d 100644
--- a/net/ipv6/ipv6_sockglue.c
+++ b/net/ipv6/ipv6_sockglue.c
@@ -415,6 +415,16 @@ int do_ipv6_setsockopt(struct sock *sk, int level, int optname,
 	if (ip6_mroute_opt(optname))
 		return ip6_mroute_setsockopt(sk, optname, optval, optlen);
 
+	/* Handle options that can be set without locking the socket. */
+	switch (optname) {
+	case IPV6_UNICAST_HOPS:
+		if (optlen < sizeof(int))
+			return -EINVAL;
+		if (val > 255 || val < -1)
+			return -EINVAL;
+		WRITE_ONCE(np->hop_limit, val);
+		return 0;
+	}
 	if (needs_rtnl)
 		rtnl_lock();
 	sockopt_lock_sock(sk);
@@ -733,14 +743,6 @@ done:
 		}
 		break;
 	}
-	case IPV6_UNICAST_HOPS:
-		if (optlen < sizeof(int))
-			goto e_inval;
-		if (val > 255 || val < -1)
-			goto e_inval;
-		np->hop_limit = val;
-		retv = 0;
-		break;
 
 	case IPV6_MULTICAST_HOPS:
 		if (sk->sk_type == SOCK_STREAM)
@@ -1347,7 +1349,7 @@ int do_ipv6_getsockopt(struct sock *sk, int level, int optname,
 		struct dst_entry *dst;
 
 		if (optname == IPV6_UNICAST_HOPS)
-			val = np->hop_limit;
+			val = READ_ONCE(np->hop_limit);
 		else
 			val = np->mcast_hops;
 
diff --git a/net/ipv6/mcast.c b/net/ipv6/mcast.c
index 421264a69e97..4a7967623909 100644
--- a/net/ipv6/mcast.c
+++ b/net/ipv6/mcast.c
@@ -1716,7 +1716,7 @@ static void ip6_mc_hdr(const struct sock *sk, struct sk_buff *skb,
 
 	hdr->payload_len = htons(len);
 	hdr->nexthdr = proto;
-	hdr->hop_limit = inet6_sk(sk)->hop_limit;
+	hdr->hop_limit = READ_ONCE(inet6_sk(sk)->hop_limit);
 
 	hdr->saddr = *saddr;
 	hdr->daddr = *daddr;
diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c
index 553c8664e0a7..b554fd40bdc3 100644
--- a/net/ipv6/ndisc.c
+++ b/net/ipv6/ndisc.c
@@ -500,7 +500,7 @@ void ndisc_send_skb(struct sk_buff *skb, const struct in6_addr *daddr,
 					      csum_partial(icmp6h,
 							   skb->len, 0));
 
-	ip6_nd_hdr(skb, saddr, daddr, inet6_sk(sk)->hop_limit, skb->len);
+	ip6_nd_hdr(skb, saddr, daddr, READ_ONCE(inet6_sk(sk)->hop_limit), skb->len);
 
 	rcu_read_lock();
 	idev = __in6_dev_get(dst->dev);
-- 
cgit v1.2.3


From d986f52124e062753e33b6fe303be5904a997eac Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Tue, 12 Sep 2023 16:02:00 +0000
Subject: ipv6: lockless IPV6_MULTICAST_LOOP implementation

Add inet6_{test|set|clear|assign}_bit() helpers.

Note that I am using bits from inet->inet_flags,
this might change in the future if we need more flags.

While solving data-races accessing np->mc_loop,
this patch also allows to implement lockless accesses
to np->mcast_hops in the following patch.

Also constify sk_mc_loop() argument.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Reviewed-by: David Ahern <dsahern@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/ipv6.h            | 18 ++++++++++++++----
 include/net/inet_sock.h         |  1 +
 include/net/sock.h              |  2 +-
 net/core/sock.c                 |  4 ++--
 net/ipv6/af_inet6.c             |  2 +-
 net/ipv6/ipv6_sockglue.c        | 18 ++++++++----------
 net/ipv6/ndisc.c                |  2 +-
 net/netfilter/ipvs/ip_vs_sync.c |  8 ++------
 8 files changed, 30 insertions(+), 25 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/ipv6.h b/include/linux/ipv6.h
index c2e087071384..68cf1ca94914 100644
--- a/include/linux/ipv6.h
+++ b/include/linux/ipv6.h
@@ -218,11 +218,9 @@ struct ipv6_pinfo {
 #if defined(__BIG_ENDIAN_BITFIELD)
 	/* Packed in 16bits. */
 	__s16			mcast_hops:9;
-	__u16			__unused_2:6,
-				mc_loop:1;
+	__u16			__unused_2:7,
 #else
-	__u16			mc_loop:1,
-				__unused_2:6;
+	__u16			__unused_2:7;
 	__s16			mcast_hops:9;
 #endif
 	int			ucast_oif;
@@ -283,6 +281,18 @@ struct ipv6_pinfo {
 	struct inet6_cork	cork;
 };
 
+/* We currently use available bits from inet_sk(sk)->inet_flags,
+ * this could change in the future.
+ */
+#define inet6_test_bit(nr, sk)			\
+	test_bit(INET_FLAGS_##nr, &inet_sk(sk)->inet_flags)
+#define inet6_set_bit(nr, sk)			\
+	set_bit(INET_FLAGS_##nr, &inet_sk(sk)->inet_flags)
+#define inet6_clear_bit(nr, sk)			\
+	clear_bit(INET_FLAGS_##nr, &inet_sk(sk)->inet_flags)
+#define inet6_assign_bit(nr, sk, val)		\
+	assign_bit(INET_FLAGS_##nr, &inet_sk(sk)->inet_flags, val)
+
 /* WARNING: don't change the layout of the members in {raw,udp,tcp}6_sock! */
 struct raw6_sock {
 	/* inet_sock has to be the first member of raw6_sock */
diff --git a/include/net/inet_sock.h b/include/net/inet_sock.h
index 2de0e4d4a027..b5a9dca92fb4 100644
--- a/include/net/inet_sock.h
+++ b/include/net/inet_sock.h
@@ -268,6 +268,7 @@ enum {
 	INET_FLAGS_NODEFRAG	= 17,
 	INET_FLAGS_BIND_ADDRESS_NO_PORT = 18,
 	INET_FLAGS_DEFER_CONNECT = 19,
+	INET_FLAGS_MC6_LOOP	= 20,
 };
 
 /* cmsg flags for inet */
diff --git a/include/net/sock.h b/include/net/sock.h
index 676146e9d181..56ac1abadea5 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -2238,7 +2238,7 @@ static inline void sock_confirm_neigh(struct sk_buff *skb, struct neighbour *n)
 	}
 }
 
-bool sk_mc_loop(struct sock *sk);
+bool sk_mc_loop(const struct sock *sk);
 
 static inline bool sk_can_gso(const struct sock *sk)
 {
diff --git a/net/core/sock.c b/net/core/sock.c
index bb89b88bc1e8..213a62ac13f2 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -759,7 +759,7 @@ out:
 	return ret;
 }
 
-bool sk_mc_loop(struct sock *sk)
+bool sk_mc_loop(const struct sock *sk)
 {
 	if (dev_recursion_level())
 		return false;
@@ -771,7 +771,7 @@ bool sk_mc_loop(struct sock *sk)
 		return inet_test_bit(MC_LOOP, sk);
 #if IS_ENABLED(CONFIG_IPV6)
 	case AF_INET6:
-		return inet6_sk(sk)->mc_loop;
+		return inet6_test_bit(MC6_LOOP, sk);
 #endif
 	}
 	WARN_ON_ONCE(1);
diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c
index 368824fe9719..bbd4aa1b96d0 100644
--- a/net/ipv6/af_inet6.c
+++ b/net/ipv6/af_inet6.c
@@ -217,7 +217,7 @@ lookup_protocol:
 	inet_sk(sk)->pinet6 = np = inet6_sk_generic(sk);
 	np->hop_limit	= -1;
 	np->mcast_hops	= IPV6_DEFAULT_MCASTHOPS;
-	np->mc_loop	= 1;
+	inet6_set_bit(MC6_LOOP, sk);
 	np->mc_all	= 1;
 	np->pmtudisc	= IPV6_PMTUDISC_WANT;
 	np->repflow	= net->ipv6.sysctl.flowlabel_reflect & FLOWLABEL_REFLECT_ESTABLISHED;
diff --git a/net/ipv6/ipv6_sockglue.c b/net/ipv6/ipv6_sockglue.c
index f27993a1470d..755fac85a120 100644
--- a/net/ipv6/ipv6_sockglue.c
+++ b/net/ipv6/ipv6_sockglue.c
@@ -424,6 +424,13 @@ int do_ipv6_setsockopt(struct sock *sk, int level, int optname,
 			return -EINVAL;
 		WRITE_ONCE(np->hop_limit, val);
 		return 0;
+	case IPV6_MULTICAST_LOOP:
+		if (optlen < sizeof(int))
+			return -EINVAL;
+		if (val != valbool)
+			return -EINVAL;
+		inet6_assign_bit(MC6_LOOP, sk, valbool);
+		return 0;
 	}
 	if (needs_rtnl)
 		rtnl_lock();
@@ -755,15 +762,6 @@ done:
 		retv = 0;
 		break;
 
-	case IPV6_MULTICAST_LOOP:
-		if (optlen < sizeof(int))
-			goto e_inval;
-		if (val != valbool)
-			goto e_inval;
-		np->mc_loop = valbool;
-		retv = 0;
-		break;
-
 	case IPV6_UNICAST_IF:
 	{
 		struct net_device *dev = NULL;
@@ -1367,7 +1365,7 @@ int do_ipv6_getsockopt(struct sock *sk, int level, int optname,
 	}
 
 	case IPV6_MULTICAST_LOOP:
-		val = np->mc_loop;
+		val = inet6_test_bit(MC6_LOOP, sk);
 		break;
 
 	case IPV6_MULTICAST_IF:
diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c
index b554fd40bdc3..679443d7ecb5 100644
--- a/net/ipv6/ndisc.c
+++ b/net/ipv6/ndisc.c
@@ -1996,7 +1996,7 @@ static int __net_init ndisc_net_init(struct net *net)
 	np = inet6_sk(sk);
 	np->hop_limit = 255;
 	/* Do not loopback ndisc messages */
-	np->mc_loop = 0;
+	inet6_clear_bit(MC6_LOOP, sk);
 
 	return 0;
 }
diff --git a/net/netfilter/ipvs/ip_vs_sync.c b/net/netfilter/ipvs/ip_vs_sync.c
index da5af28ff57b..3c2251cabd04 100644
--- a/net/netfilter/ipvs/ip_vs_sync.c
+++ b/net/netfilter/ipvs/ip_vs_sync.c
@@ -1298,17 +1298,13 @@ static void set_sock_size(struct sock *sk, int mode, int val)
 static void set_mcast_loop(struct sock *sk, u_char loop)
 {
 	/* setsockopt(sock, SOL_IP, IP_MULTICAST_LOOP, &loop, sizeof(loop)); */
-	lock_sock(sk);
 	inet_assign_bit(MC_LOOP, sk, loop);
 #ifdef CONFIG_IP_VS_IPV6
-	if (sk->sk_family == AF_INET6) {
-		struct ipv6_pinfo *np = inet6_sk(sk);
-
+	if (READ_ONCE(sk->sk_family) == AF_INET6) {
 		/* IPV6_MULTICAST_LOOP */
-		np->mc_loop = loop ? 1 : 0;
+		inet6_assign_bit(MC6_LOOP, sk, loop);
 	}
 #endif
-	release_sock(sk);
 }
 
 /*
-- 
cgit v1.2.3


From 2da23eb07c91241d962f3ff05565065484cd8929 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Tue, 12 Sep 2023 16:02:01 +0000
Subject: ipv6: lockless IPV6_MULTICAST_HOPS implementation

This fixes data-races around np->mcast_hops,
and make IPV6_MULTICAST_HOPS lockless.

Note that np->mcast_hops is never negative,
thus can fit an u8 field instead of s16.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Reviewed-by: David Ahern <dsahern@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/ipv6.h            |  9 +--------
 include/net/ipv6.h              |  2 +-
 net/dccp/ipv6.c                 |  2 +-
 net/ipv6/ipv6_sockglue.c        | 28 +++++++++++++++-------------
 net/ipv6/tcp_ipv6.c             |  3 ++-
 net/netfilter/ipvs/ip_vs_sync.c |  2 +-
 6 files changed, 21 insertions(+), 25 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/ipv6.h b/include/linux/ipv6.h
index 68cf1ca94914..9cc278b5e4f4 100644
--- a/include/linux/ipv6.h
+++ b/include/linux/ipv6.h
@@ -214,15 +214,8 @@ struct ipv6_pinfo {
 	__u32			frag_size;
 
 	s16			hop_limit;
+	u8			mcast_hops;
 
-#if defined(__BIG_ENDIAN_BITFIELD)
-	/* Packed in 16bits. */
-	__s16			mcast_hops:9;
-	__u16			__unused_2:7,
-#else
-	__u16			__unused_2:7;
-	__s16			mcast_hops:9;
-#endif
 	int			ucast_oif;
 	int			mcast_oif;
 
diff --git a/include/net/ipv6.h b/include/net/ipv6.h
index 2e8e7e31e02e..8a04a8985336 100644
--- a/include/net/ipv6.h
+++ b/include/net/ipv6.h
@@ -914,7 +914,7 @@ static inline int ip6_sk_dst_hoplimit(struct ipv6_pinfo *np, struct flowi6 *fl6,
 	int hlimit;
 
 	if (ipv6_addr_is_multicast(&fl6->daddr))
-		hlimit = np->mcast_hops;
+		hlimit = READ_ONCE(np->mcast_hops);
 	else
 		hlimit = READ_ONCE(np->hop_limit);
 	if (hlimit < 0)
diff --git a/net/dccp/ipv6.c b/net/dccp/ipv6.c
index 33f6ccf6ba77..83617a16b98e 100644
--- a/net/dccp/ipv6.c
+++ b/net/dccp/ipv6.c
@@ -676,7 +676,7 @@ ipv6_pktoptions:
 		if (np->rxopt.bits.rxinfo || np->rxopt.bits.rxoinfo)
 			np->mcast_oif = inet6_iif(opt_skb);
 		if (np->rxopt.bits.rxhlim || np->rxopt.bits.rxohlim)
-			np->mcast_hops = ipv6_hdr(opt_skb)->hop_limit;
+			WRITE_ONCE(np->mcast_hops, ipv6_hdr(opt_skb)->hop_limit);
 		if (np->rxopt.bits.rxflow || np->rxopt.bits.rxtclass)
 			np->rcv_flowinfo = ip6_flowinfo(ipv6_hdr(opt_skb));
 		if (np->repflow)
diff --git a/net/ipv6/ipv6_sockglue.c b/net/ipv6/ipv6_sockglue.c
index 755fac85a120..5fff19a87c75 100644
--- a/net/ipv6/ipv6_sockglue.c
+++ b/net/ipv6/ipv6_sockglue.c
@@ -431,6 +431,16 @@ int do_ipv6_setsockopt(struct sock *sk, int level, int optname,
 			return -EINVAL;
 		inet6_assign_bit(MC6_LOOP, sk, valbool);
 		return 0;
+	case IPV6_MULTICAST_HOPS:
+		if (sk->sk_type == SOCK_STREAM)
+			return retv;
+		if (optlen < sizeof(int))
+			return -EINVAL;
+		if (val > 255 || val < -1)
+			return -EINVAL;
+		WRITE_ONCE(np->mcast_hops,
+			   val == -1 ? IPV6_DEFAULT_MCASTHOPS : val);
+		return 0;
 	}
 	if (needs_rtnl)
 		rtnl_lock();
@@ -751,16 +761,6 @@ done:
 		break;
 	}
 
-	case IPV6_MULTICAST_HOPS:
-		if (sk->sk_type == SOCK_STREAM)
-			break;
-		if (optlen < sizeof(int))
-			goto e_inval;
-		if (val > 255 || val < -1)
-			goto e_inval;
-		np->mcast_hops = (val == -1 ? IPV6_DEFAULT_MCASTHOPS : val);
-		retv = 0;
-		break;
 
 	case IPV6_UNICAST_IF:
 	{
@@ -1180,7 +1180,8 @@ int do_ipv6_getsockopt(struct sock *sk, int level, int optname,
 				put_cmsg(&msg, SOL_IPV6, IPV6_PKTINFO, sizeof(src_info), &src_info);
 			}
 			if (np->rxopt.bits.rxhlim) {
-				int hlim = np->mcast_hops;
+				int hlim = READ_ONCE(np->mcast_hops);
+
 				put_cmsg(&msg, SOL_IPV6, IPV6_HOPLIMIT, sizeof(hlim), &hlim);
 			}
 			if (np->rxopt.bits.rxtclass) {
@@ -1197,7 +1198,8 @@ int do_ipv6_getsockopt(struct sock *sk, int level, int optname,
 				put_cmsg(&msg, SOL_IPV6, IPV6_2292PKTINFO, sizeof(src_info), &src_info);
 			}
 			if (np->rxopt.bits.rxohlim) {
-				int hlim = np->mcast_hops;
+				int hlim = READ_ONCE(np->mcast_hops);
+
 				put_cmsg(&msg, SOL_IPV6, IPV6_2292HOPLIMIT, sizeof(hlim), &hlim);
 			}
 			if (np->rxopt.bits.rxflow) {
@@ -1349,7 +1351,7 @@ int do_ipv6_getsockopt(struct sock *sk, int level, int optname,
 		if (optname == IPV6_UNICAST_HOPS)
 			val = READ_ONCE(np->hop_limit);
 		else
-			val = np->mcast_hops;
+			val = READ_ONCE(np->mcast_hops);
 
 		if (val < 0) {
 			rcu_read_lock();
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index 3a88545a265d..54db5fab318b 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -1542,7 +1542,8 @@ ipv6_pktoptions:
 		if (np->rxopt.bits.rxinfo || np->rxopt.bits.rxoinfo)
 			np->mcast_oif = tcp_v6_iif(opt_skb);
 		if (np->rxopt.bits.rxhlim || np->rxopt.bits.rxohlim)
-			np->mcast_hops = ipv6_hdr(opt_skb)->hop_limit;
+			WRITE_ONCE(np->mcast_hops,
+				   ipv6_hdr(opt_skb)->hop_limit);
 		if (np->rxopt.bits.rxflow || np->rxopt.bits.rxtclass)
 			np->rcv_flowinfo = ip6_flowinfo(ipv6_hdr(opt_skb));
 		if (np->repflow)
diff --git a/net/netfilter/ipvs/ip_vs_sync.c b/net/netfilter/ipvs/ip_vs_sync.c
index 3c2251cabd04..df1b33b61059 100644
--- a/net/netfilter/ipvs/ip_vs_sync.c
+++ b/net/netfilter/ipvs/ip_vs_sync.c
@@ -1322,7 +1322,7 @@ static void set_mcast_ttl(struct sock *sk, u_char ttl)
 		struct ipv6_pinfo *np = inet6_sk(sk);
 
 		/* IPV6_MULTICAST_HOPS */
-		np->mcast_hops = ttl;
+		WRITE_ONCE(np->mcast_hops, ttl);
 	}
 #endif
 	release_sock(sk);
-- 
cgit v1.2.3


From dcae74622c051b219ee628669a31716473efda2c Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Tue, 12 Sep 2023 16:02:04 +0000
Subject: ipv6: lockless IPV6_RECVERR_RFC4884 implementation

Move np->recverr_rfc4884 to an atomic flag to fix data-races.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Reviewed-by: David Ahern <dsahern@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/ipv6.h     |  1 -
 include/net/inet_sock.h  |  1 +
 net/ipv6/datagram.c      |  2 +-
 net/ipv6/ipv6_sockglue.c | 17 ++++++++---------
 4 files changed, 10 insertions(+), 11 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/ipv6.h b/include/linux/ipv6.h
index 9cc278b5e4f4..0d2b0a1b2dae 100644
--- a/include/linux/ipv6.h
+++ b/include/linux/ipv6.h
@@ -256,7 +256,6 @@ struct ipv6_pinfo {
 				autoflowlabel:1,
 				autoflowlabel_set:1,
 				mc_all:1,
-				recverr_rfc4884:1,
 				rtalert_isolate:1;
 	__u8			min_hopcount;
 	__u8			tclass;
diff --git a/include/net/inet_sock.h b/include/net/inet_sock.h
index b5a9dca92fb4..8cf1f7b44234 100644
--- a/include/net/inet_sock.h
+++ b/include/net/inet_sock.h
@@ -269,6 +269,7 @@ enum {
 	INET_FLAGS_BIND_ADDRESS_NO_PORT = 18,
 	INET_FLAGS_DEFER_CONNECT = 19,
 	INET_FLAGS_MC6_LOOP	= 20,
+	INET_FLAGS_RECVERR6_RFC4884 = 21,
 };
 
 /* cmsg flags for inet */
diff --git a/net/ipv6/datagram.c b/net/ipv6/datagram.c
index 41ebc4e57473..e81892814935 100644
--- a/net/ipv6/datagram.c
+++ b/net/ipv6/datagram.c
@@ -332,7 +332,7 @@ void ipv6_icmp_error(struct sock *sk, struct sk_buff *skb, int err,
 
 	__skb_pull(skb, payload - skb->data);
 
-	if (inet6_sk(sk)->recverr_rfc4884)
+	if (inet6_test_bit(RECVERR6_RFC4884, sk))
 		ipv6_icmp_error_rfc4884(skb, &serr->ee.ee_rfc4884);
 
 	skb_reset_transport_header(skb);
diff --git a/net/ipv6/ipv6_sockglue.c b/net/ipv6/ipv6_sockglue.c
index bbc8a009e05d..b65e73ac2ccd 100644
--- a/net/ipv6/ipv6_sockglue.c
+++ b/net/ipv6/ipv6_sockglue.c
@@ -462,6 +462,13 @@ int do_ipv6_setsockopt(struct sock *sk, int level, int optname,
 		 */
 		WRITE_ONCE(np->min_hopcount, val);
 		return 0;
+	case IPV6_RECVERR_RFC4884:
+		if (optlen < sizeof(int))
+			return -EINVAL;
+		if (val < 0 || val > 1)
+			return -EINVAL;
+		inet6_assign_bit(RECVERR6_RFC4884, sk, valbool);
+		return 0;
 	}
 	if (needs_rtnl)
 		rtnl_lock();
@@ -974,14 +981,6 @@ done:
 		np->rxopt.bits.recvfragsize = valbool;
 		retv = 0;
 		break;
-	case IPV6_RECVERR_RFC4884:
-		if (optlen < sizeof(int))
-			goto e_inval;
-		if (val < 0 || val > 1)
-			goto e_inval;
-		np->recverr_rfc4884 = valbool;
-		retv = 0;
-		break;
 	}
 
 unlock:
@@ -1462,7 +1461,7 @@ int do_ipv6_getsockopt(struct sock *sk, int level, int optname,
 		break;
 
 	case IPV6_RECVERR_RFC4884:
-		val = np->recverr_rfc4884;
+		val = inet6_test_bit(RECVERR6_RFC4884, sk);
 		break;
 
 	default:
-- 
cgit v1.2.3


From 6559c0ff3bc27d7e4d447d31c1d7e8eae0e959f5 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Tue, 12 Sep 2023 16:02:05 +0000
Subject: ipv6: lockless IPV6_MULTICAST_ALL implementation

Move np->mc_all to an atomic flags to fix data-races.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Reviewed-by: David Ahern <dsahern@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/ipv6.h     |  1 -
 include/net/inet_sock.h  |  1 +
 net/ipv6/af_inet6.c      |  2 +-
 net/ipv6/ipv6_sockglue.c | 14 ++++++--------
 net/ipv6/mcast.c         |  2 +-
 5 files changed, 9 insertions(+), 11 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/ipv6.h b/include/linux/ipv6.h
index 0d2b0a1b2dae..d88e91b7f0a3 100644
--- a/include/linux/ipv6.h
+++ b/include/linux/ipv6.h
@@ -255,7 +255,6 @@ struct ipv6_pinfo {
 				dontfrag:1,
 				autoflowlabel:1,
 				autoflowlabel_set:1,
-				mc_all:1,
 				rtalert_isolate:1;
 	__u8			min_hopcount;
 	__u8			tclass;
diff --git a/include/net/inet_sock.h b/include/net/inet_sock.h
index 8cf1f7b44234..97e70a97dae8 100644
--- a/include/net/inet_sock.h
+++ b/include/net/inet_sock.h
@@ -270,6 +270,7 @@ enum {
 	INET_FLAGS_DEFER_CONNECT = 19,
 	INET_FLAGS_MC6_LOOP	= 20,
 	INET_FLAGS_RECVERR6_RFC4884 = 21,
+	INET_FLAGS_MC6_ALL	= 22,
 };
 
 /* cmsg flags for inet */
diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c
index bbd4aa1b96d0..372fb7b9112c 100644
--- a/net/ipv6/af_inet6.c
+++ b/net/ipv6/af_inet6.c
@@ -218,7 +218,7 @@ lookup_protocol:
 	np->hop_limit	= -1;
 	np->mcast_hops	= IPV6_DEFAULT_MCASTHOPS;
 	inet6_set_bit(MC6_LOOP, sk);
-	np->mc_all	= 1;
+	inet6_set_bit(MC6_ALL, sk);
 	np->pmtudisc	= IPV6_PMTUDISC_WANT;
 	np->repflow	= net->ipv6.sysctl.flowlabel_reflect & FLOWLABEL_REFLECT_ESTABLISHED;
 	sk->sk_ipv6only	= net->ipv6.sysctl.bindv6only;
diff --git a/net/ipv6/ipv6_sockglue.c b/net/ipv6/ipv6_sockglue.c
index b65e73ac2ccd..7a181831f226 100644
--- a/net/ipv6/ipv6_sockglue.c
+++ b/net/ipv6/ipv6_sockglue.c
@@ -469,6 +469,11 @@ int do_ipv6_setsockopt(struct sock *sk, int level, int optname,
 			return -EINVAL;
 		inet6_assign_bit(RECVERR6_RFC4884, sk, valbool);
 		return 0;
+	case IPV6_MULTICAST_ALL:
+		if (optlen < sizeof(int))
+			return -EINVAL;
+		inet6_assign_bit(MC6_ALL, sk, valbool);
+		return 0;
 	}
 	if (needs_rtnl)
 		rtnl_lock();
@@ -890,13 +895,6 @@ done:
 			retv = ipv6_sock_ac_drop(sk, mreq.ipv6mr_ifindex, &mreq.ipv6mr_acaddr);
 		break;
 	}
-	case IPV6_MULTICAST_ALL:
-		if (optlen < sizeof(int))
-			goto e_inval;
-		np->mc_all = valbool;
-		retv = 0;
-		break;
-
 	case MCAST_JOIN_GROUP:
 	case MCAST_LEAVE_GROUP:
 		if (in_compat_syscall())
@@ -1372,7 +1370,7 @@ int do_ipv6_getsockopt(struct sock *sk, int level, int optname,
 		break;
 
 	case IPV6_MULTICAST_ALL:
-		val = np->mc_all;
+		val = inet6_test_bit(MC6_ALL, sk);
 		break;
 
 	case IPV6_UNICAST_IF:
diff --git a/net/ipv6/mcast.c b/net/ipv6/mcast.c
index 4a7967623909..99e28b444a4c 100644
--- a/net/ipv6/mcast.c
+++ b/net/ipv6/mcast.c
@@ -642,7 +642,7 @@ bool inet6_mc_check(const struct sock *sk, const struct in6_addr *mc_addr,
 	}
 	if (!mc) {
 		rcu_read_unlock();
-		return np->mc_all;
+		return inet6_test_bit(MC6_ALL, sk);
 	}
 	psl = rcu_dereference(mc->sflist);
 	if (!psl) {
-- 
cgit v1.2.3


From 5121516b0c4736b7977d977b239e36d23ec64401 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Tue, 12 Sep 2023 16:02:06 +0000
Subject: ipv6: lockless IPV6_AUTOFLOWLABEL implementation

Move np->autoflowlabel and np->autoflowlabel_set in inet->inet_flags,
to fix data-races.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Reviewed-by: David Ahern <dsahern@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/ipv6.h     |  2 --
 include/net/inet_sock.h  |  2 ++
 include/net/ipv6.h       |  2 +-
 net/ipv6/ip6_output.c    | 12 +++++-------
 net/ipv6/ipv6_sockglue.c | 11 +++++------
 5 files changed, 13 insertions(+), 16 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/ipv6.h b/include/linux/ipv6.h
index d88e91b7f0a3..e3be5dc21b7d 100644
--- a/include/linux/ipv6.h
+++ b/include/linux/ipv6.h
@@ -253,8 +253,6 @@ struct ipv6_pinfo {
 						 * 100: prefer care-of address
 						 */
 				dontfrag:1,
-				autoflowlabel:1,
-				autoflowlabel_set:1,
 				rtalert_isolate:1;
 	__u8			min_hopcount;
 	__u8			tclass;
diff --git a/include/net/inet_sock.h b/include/net/inet_sock.h
index 97e70a97dae8..f1af64a40673 100644
--- a/include/net/inet_sock.h
+++ b/include/net/inet_sock.h
@@ -271,6 +271,8 @@ enum {
 	INET_FLAGS_MC6_LOOP	= 20,
 	INET_FLAGS_RECVERR6_RFC4884 = 21,
 	INET_FLAGS_MC6_ALL	= 22,
+	INET_FLAGS_AUTOFLOWLABEL_SET = 23,
+	INET_FLAGS_AUTOFLOWLABEL = 24,
 };
 
 /* cmsg flags for inet */
diff --git a/include/net/ipv6.h b/include/net/ipv6.h
index 8a04a8985336..4b6cbec059e2 100644
--- a/include/net/ipv6.h
+++ b/include/net/ipv6.h
@@ -428,7 +428,7 @@ int ipv6_flowlabel_opt_get(struct sock *sk, struct in6_flowlabel_req *freq,
 			   int flags);
 int ip6_flowlabel_init(void);
 void ip6_flowlabel_cleanup(void);
-bool ip6_autoflowlabel(struct net *net, const struct ipv6_pinfo *np);
+bool ip6_autoflowlabel(struct net *net, const struct sock *sk);
 
 static inline void fl6_sock_release(struct ip6_flowlabel *fl)
 {
diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c
index ab7ede4a731a..47aa42f93ccd 100644
--- a/net/ipv6/ip6_output.c
+++ b/net/ipv6/ip6_output.c
@@ -232,12 +232,11 @@ int ip6_output(struct net *net, struct sock *sk, struct sk_buff *skb)
 }
 EXPORT_SYMBOL(ip6_output);
 
-bool ip6_autoflowlabel(struct net *net, const struct ipv6_pinfo *np)
+bool ip6_autoflowlabel(struct net *net, const struct sock *sk)
 {
-	if (!np->autoflowlabel_set)
+	if (!inet6_test_bit(AUTOFLOWLABEL_SET, sk))
 		return ip6_default_np_autolabel(net);
-	else
-		return np->autoflowlabel;
+	return inet6_test_bit(AUTOFLOWLABEL, sk);
 }
 
 /*
@@ -314,7 +313,7 @@ int ip6_xmit(const struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6,
 		hlimit = ip6_dst_hoplimit(dst);
 
 	ip6_flow_hdr(hdr, tclass, ip6_make_flowlabel(net, skb, fl6->flowlabel,
-				ip6_autoflowlabel(net, np), fl6));
+				ip6_autoflowlabel(net, sk), fl6));
 
 	hdr->payload_len = htons(seg_len);
 	hdr->nexthdr = proto;
@@ -1938,7 +1937,6 @@ struct sk_buff *__ip6_make_skb(struct sock *sk,
 	struct sk_buff *skb, *tmp_skb;
 	struct sk_buff **tail_skb;
 	struct in6_addr *final_dst;
-	struct ipv6_pinfo *np = inet6_sk(sk);
 	struct net *net = sock_net(sk);
 	struct ipv6hdr *hdr;
 	struct ipv6_txoptions *opt = v6_cork->opt;
@@ -1981,7 +1979,7 @@ struct sk_buff *__ip6_make_skb(struct sock *sk,
 
 	ip6_flow_hdr(hdr, v6_cork->tclass,
 		     ip6_make_flowlabel(net, skb, fl6->flowlabel,
-					ip6_autoflowlabel(net, np), fl6));
+					ip6_autoflowlabel(net, sk), fl6));
 	hdr->hop_limit = v6_cork->hop_limit;
 	hdr->nexthdr = proto;
 	hdr->saddr = fl6->saddr;
diff --git a/net/ipv6/ipv6_sockglue.c b/net/ipv6/ipv6_sockglue.c
index 7a181831f226..d5d428a695f7 100644
--- a/net/ipv6/ipv6_sockglue.c
+++ b/net/ipv6/ipv6_sockglue.c
@@ -474,6 +474,10 @@ int do_ipv6_setsockopt(struct sock *sk, int level, int optname,
 			return -EINVAL;
 		inet6_assign_bit(MC6_ALL, sk, valbool);
 		return 0;
+	case IPV6_AUTOFLOWLABEL:
+		inet6_assign_bit(AUTOFLOWLABEL, sk, valbool);
+		inet6_set_bit(AUTOFLOWLABEL_SET, sk);
+		return 0;
 	}
 	if (needs_rtnl)
 		rtnl_lock();
@@ -970,11 +974,6 @@ done:
 		np->dontfrag = valbool;
 		retv = 0;
 		break;
-	case IPV6_AUTOFLOWLABEL:
-		np->autoflowlabel = valbool;
-		np->autoflowlabel_set = 1;
-		retv = 0;
-		break;
 	case IPV6_RECVFRAGSIZE:
 		np->rxopt.bits.recvfragsize = valbool;
 		retv = 0;
@@ -1447,7 +1446,7 @@ int do_ipv6_getsockopt(struct sock *sk, int level, int optname,
 		break;
 
 	case IPV6_AUTOFLOWLABEL:
-		val = ip6_autoflowlabel(sock_net(sk), np);
+		val = ip6_autoflowlabel(sock_net(sk), sk);
 		break;
 
 	case IPV6_RECVFRAGSIZE:
-- 
cgit v1.2.3


From 1086ca7cce292bb498d7f8f85f4593c9ef4902b7 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Tue, 12 Sep 2023 16:02:07 +0000
Subject: ipv6: lockless IPV6_DONTFRAG implementation

Move np->dontfrag flag to inet->inet_flags to fix data-races.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Reviewed-by: David Ahern <dsahern@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/ipv6.h     | 1 -
 include/net/inet_sock.h  | 1 +
 include/net/ipv6.h       | 6 +++---
 include/net/xfrm.h       | 2 +-
 net/ipv6/icmp.c          | 4 ++--
 net/ipv6/ip6_output.c    | 2 +-
 net/ipv6/ipv6_sockglue.c | 9 ++++-----
 net/ipv6/ping.c          | 2 +-
 net/ipv6/raw.c           | 2 +-
 net/ipv6/udp.c           | 2 +-
 net/l2tp/l2tp_ip6.c      | 2 +-
 11 files changed, 16 insertions(+), 17 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/ipv6.h b/include/linux/ipv6.h
index e3be5dc21b7d..57d563f1d4b1 100644
--- a/include/linux/ipv6.h
+++ b/include/linux/ipv6.h
@@ -252,7 +252,6 @@ struct ipv6_pinfo {
 						 * 010: prefer public address
 						 * 100: prefer care-of address
 						 */
-				dontfrag:1,
 				rtalert_isolate:1;
 	__u8			min_hopcount;
 	__u8			tclass;
diff --git a/include/net/inet_sock.h b/include/net/inet_sock.h
index f1af64a40673..ac75324e9e1e 100644
--- a/include/net/inet_sock.h
+++ b/include/net/inet_sock.h
@@ -273,6 +273,7 @@ enum {
 	INET_FLAGS_MC6_ALL	= 22,
 	INET_FLAGS_AUTOFLOWLABEL_SET = 23,
 	INET_FLAGS_AUTOFLOWLABEL = 24,
+	INET_FLAGS_DONTFRAG	= 25,
 };
 
 /* cmsg flags for inet */
diff --git a/include/net/ipv6.h b/include/net/ipv6.h
index 4b6cbec059e2..5a1f2993680d 100644
--- a/include/net/ipv6.h
+++ b/include/net/ipv6.h
@@ -373,12 +373,12 @@ static inline void ipcm6_init(struct ipcm6_cookie *ipc6)
 }
 
 static inline void ipcm6_init_sk(struct ipcm6_cookie *ipc6,
-				 const struct ipv6_pinfo *np)
+				 const struct sock *sk)
 {
 	*ipc6 = (struct ipcm6_cookie) {
 		.hlimit = -1,
-		.tclass = np->tclass,
-		.dontfrag = np->dontfrag,
+		.tclass = inet6_sk(sk)->tclass,
+		.dontfrag = inet6_test_bit(DONTFRAG, sk),
 	};
 }
 
diff --git a/include/net/xfrm.h b/include/net/xfrm.h
index 363c7d510554..98d7aa78adda 100644
--- a/include/net/xfrm.h
+++ b/include/net/xfrm.h
@@ -2166,7 +2166,7 @@ static inline bool xfrm6_local_dontfrag(const struct sock *sk)
 
 	proto = sk->sk_protocol;
 	if (proto == IPPROTO_UDP || proto == IPPROTO_RAW)
-		return inet6_sk(sk)->dontfrag;
+		return inet6_test_bit(DONTFRAG, sk);
 
 	return false;
 }
diff --git a/net/ipv6/icmp.c b/net/ipv6/icmp.c
index 93a594a901d1..8fb4a791881a 100644
--- a/net/ipv6/icmp.c
+++ b/net/ipv6/icmp.c
@@ -588,7 +588,7 @@ void icmp6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info,
 	else if (!fl6.flowi6_oif)
 		fl6.flowi6_oif = np->ucast_oif;
 
-	ipcm6_init_sk(&ipc6, np);
+	ipcm6_init_sk(&ipc6, sk);
 	ipc6.sockc.mark = mark;
 	fl6.flowlabel = ip6_make_flowinfo(ipc6.tclass, fl6.flowlabel);
 
@@ -791,7 +791,7 @@ static enum skb_drop_reason icmpv6_echo_reply(struct sk_buff *skb)
 	msg.offset = 0;
 	msg.type = type;
 
-	ipcm6_init_sk(&ipc6, np);
+	ipcm6_init_sk(&ipc6, sk);
 	ipc6.hlimit = ip6_sk_dst_hoplimit(np, &fl6, dst);
 	ipc6.tclass = ipv6_get_dsfield(ipv6_hdr(skb));
 	ipc6.sockc.mark = mark;
diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c
index 47aa42f93ccd..8851fe5d45a0 100644
--- a/net/ipv6/ip6_output.c
+++ b/net/ipv6/ip6_output.c
@@ -2092,7 +2092,7 @@ struct sk_buff *ip6_make_skb(struct sock *sk,
 		return ERR_PTR(err);
 	}
 	if (ipc6->dontfrag < 0)
-		ipc6->dontfrag = inet6_sk(sk)->dontfrag;
+		ipc6->dontfrag = inet6_test_bit(DONTFRAG, sk);
 
 	err = __ip6_append_data(sk, &queue, cork, &v6_cork,
 				&current->task_frag, getfrag, from,
diff --git a/net/ipv6/ipv6_sockglue.c b/net/ipv6/ipv6_sockglue.c
index d5d428a695f7..33dd4dd872e6 100644
--- a/net/ipv6/ipv6_sockglue.c
+++ b/net/ipv6/ipv6_sockglue.c
@@ -478,6 +478,9 @@ int do_ipv6_setsockopt(struct sock *sk, int level, int optname,
 		inet6_assign_bit(AUTOFLOWLABEL, sk, valbool);
 		inet6_set_bit(AUTOFLOWLABEL_SET, sk);
 		return 0;
+	case IPV6_DONTFRAG:
+		inet6_assign_bit(DONTFRAG, sk, valbool);
+		return 0;
 	}
 	if (needs_rtnl)
 		rtnl_lock();
@@ -970,10 +973,6 @@ done:
 			goto e_inval;
 		retv = __ip6_sock_set_addr_preferences(sk, val);
 		break;
-	case IPV6_DONTFRAG:
-		np->dontfrag = valbool;
-		retv = 0;
-		break;
 	case IPV6_RECVFRAGSIZE:
 		np->rxopt.bits.recvfragsize = valbool;
 		retv = 0;
@@ -1442,7 +1441,7 @@ int do_ipv6_getsockopt(struct sock *sk, int level, int optname,
 		break;
 
 	case IPV6_DONTFRAG:
-		val = np->dontfrag;
+		val = inet6_test_bit(DONTFRAG, sk);
 		break;
 
 	case IPV6_AUTOFLOWLABEL:
diff --git a/net/ipv6/ping.c b/net/ipv6/ping.c
index 5831aaa53d75..4444b61eb23b 100644
--- a/net/ipv6/ping.c
+++ b/net/ipv6/ping.c
@@ -118,7 +118,7 @@ static int ping_v6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
 	     l3mdev_master_ifindex_by_index(sock_net(sk), oif) != sk->sk_bound_dev_if))
 		return -EINVAL;
 
-	ipcm6_init_sk(&ipc6, np);
+	ipcm6_init_sk(&ipc6, sk);
 	ipc6.sockc.tsflags = READ_ONCE(sk->sk_tsflags);
 	ipc6.sockc.mark = READ_ONCE(sk->sk_mark);
 
diff --git a/net/ipv6/raw.c b/net/ipv6/raw.c
index 42fcec3ecf5e..cc9673c1809f 100644
--- a/net/ipv6/raw.c
+++ b/net/ipv6/raw.c
@@ -898,7 +898,7 @@ static int rawv6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
 		ipc6.hlimit = ip6_sk_dst_hoplimit(np, &fl6, dst);
 
 	if (ipc6.dontfrag < 0)
-		ipc6.dontfrag = np->dontfrag;
+		ipc6.dontfrag = inet6_test_bit(DONTFRAG, sk);
 
 	if (msg->msg_flags&MSG_CONFIRM)
 		goto do_confirm;
diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c
index f60ba4295435..e4301500741a 100644
--- a/net/ipv6/udp.c
+++ b/net/ipv6/udp.c
@@ -1595,7 +1595,7 @@ back_from_confirm:
 
 do_append_data:
 	if (ipc6.dontfrag < 0)
-		ipc6.dontfrag = np->dontfrag;
+		ipc6.dontfrag = inet6_test_bit(DONTFRAG, sk);
 	up->len += ulen;
 	err = ip6_append_data(sk, getfrag, msg, ulen, sizeof(struct udphdr),
 			      &ipc6, fl6, (struct rt6_info *)dst,
diff --git a/net/l2tp/l2tp_ip6.c b/net/l2tp/l2tp_ip6.c
index ed8ebb6f5909..40af2431e73a 100644
--- a/net/l2tp/l2tp_ip6.c
+++ b/net/l2tp/l2tp_ip6.c
@@ -621,7 +621,7 @@ static int l2tp_ip6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
 		ipc6.hlimit = ip6_sk_dst_hoplimit(np, &fl6, dst);
 
 	if (ipc6.dontfrag < 0)
-		ipc6.dontfrag = np->dontfrag;
+		ipc6.dontfrag = inet6_test_bit(DONTFRAG, sk);
 
 	if (msg->msg_flags & MSG_CONFIRM)
 		goto do_confirm;
-- 
cgit v1.2.3


From 3fa29971c69519629370b119b0b618ee88ade6b9 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Tue, 12 Sep 2023 16:02:08 +0000
Subject: ipv6: lockless IPV6_RECVERR implemetation

np->recverr is moved to inet->inet_flags to fix data-races.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Reviewed-by: David Ahern <dsahern@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/ipv6.h     |  3 +--
 include/net/inet_sock.h  |  1 +
 include/net/ipv6.h       |  4 +---
 net/dccp/ipv6.c          |  2 +-
 net/ipv4/ping.c          |  2 +-
 net/ipv6/datagram.c      |  6 ++----
 net/ipv6/ipv6_sockglue.c | 17 ++++++++---------
 net/ipv6/raw.c           | 10 +++++-----
 net/ipv6/tcp_ipv6.c      |  2 +-
 net/ipv6/udp.c           |  6 +++---
 net/sctp/ipv6.c          |  4 +---
 11 files changed, 25 insertions(+), 32 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/ipv6.h b/include/linux/ipv6.h
index 57d563f1d4b1..53f4f1b97a78 100644
--- a/include/linux/ipv6.h
+++ b/include/linux/ipv6.h
@@ -243,8 +243,7 @@ struct ipv6_pinfo {
 	} rxopt;
 
 	/* sockopt flags */
-	__u16			recverr:1,
-	                        sndflow:1,
+	__u16			sndflow:1,
 				repflow:1,
 				pmtudisc:3,
 				padding:1,	/* 1 bit hole */
diff --git a/include/net/inet_sock.h b/include/net/inet_sock.h
index ac75324e9e1e..3b79bc759ff4 100644
--- a/include/net/inet_sock.h
+++ b/include/net/inet_sock.h
@@ -274,6 +274,7 @@ enum {
 	INET_FLAGS_AUTOFLOWLABEL_SET = 23,
 	INET_FLAGS_AUTOFLOWLABEL = 24,
 	INET_FLAGS_DONTFRAG	= 25,
+	INET_FLAGS_RECVERR6	= 26,
 };
 
 /* cmsg flags for inet */
diff --git a/include/net/ipv6.h b/include/net/ipv6.h
index 5a1f2993680d..bd115980809f 100644
--- a/include/net/ipv6.h
+++ b/include/net/ipv6.h
@@ -1303,9 +1303,7 @@ static inline int ip6_sock_set_v6only(struct sock *sk)
 
 static inline void ip6_sock_set_recverr(struct sock *sk)
 {
-	lock_sock(sk);
-	inet6_sk(sk)->recverr = true;
-	release_sock(sk);
+	inet6_set_bit(RECVERR6, sk);
 }
 
 static inline int __ip6_sock_set_addr_preferences(struct sock *sk, int val)
diff --git a/net/dccp/ipv6.c b/net/dccp/ipv6.c
index 83617a16b98e..e6c3d84c2b9e 100644
--- a/net/dccp/ipv6.c
+++ b/net/dccp/ipv6.c
@@ -185,7 +185,7 @@ static int dccp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
 		goto out;
 	}
 
-	if (!sock_owned_by_user(sk) && np->recverr) {
+	if (!sock_owned_by_user(sk) && inet6_test_bit(RECVERR6, sk)) {
 		sk->sk_err = err;
 		sk_error_report(sk);
 	} else {
diff --git a/net/ipv4/ping.c b/net/ipv4/ping.c
index 75e0aee35eb7..bc01ad5fc01a 100644
--- a/net/ipv4/ping.c
+++ b/net/ipv4/ping.c
@@ -581,7 +581,7 @@ void ping_err(struct sk_buff *skb, int offset, u32 info)
 	 *	4.1.3.3.
 	 */
 	if ((family == AF_INET && !inet_test_bit(RECVERR, sk)) ||
-	    (family == AF_INET6 && !inet6_sk(sk)->recverr)) {
+	    (family == AF_INET6 && !inet6_test_bit(RECVERR6, sk))) {
 		if (!harderr || sk->sk_state != TCP_ESTABLISHED)
 			goto out;
 	} else {
diff --git a/net/ipv6/datagram.c b/net/ipv6/datagram.c
index e81892814935..74673a5eff31 100644
--- a/net/ipv6/datagram.c
+++ b/net/ipv6/datagram.c
@@ -305,11 +305,10 @@ static void ipv6_icmp_error_rfc4884(const struct sk_buff *skb,
 void ipv6_icmp_error(struct sock *sk, struct sk_buff *skb, int err,
 		     __be16 port, u32 info, u8 *payload)
 {
-	struct ipv6_pinfo *np  = inet6_sk(sk);
 	struct icmp6hdr *icmph = icmp6_hdr(skb);
 	struct sock_exterr_skb *serr;
 
-	if (!np->recverr)
+	if (!inet6_test_bit(RECVERR6, sk))
 		return;
 
 	skb = skb_clone(skb, GFP_ATOMIC);
@@ -344,12 +343,11 @@ EXPORT_SYMBOL_GPL(ipv6_icmp_error);
 
 void ipv6_local_error(struct sock *sk, int err, struct flowi6 *fl6, u32 info)
 {
-	const struct ipv6_pinfo *np = inet6_sk(sk);
 	struct sock_exterr_skb *serr;
 	struct ipv6hdr *iph;
 	struct sk_buff *skb;
 
-	if (!np->recverr)
+	if (!inet6_test_bit(RECVERR6, sk))
 		return;
 
 	skb = alloc_skb(sizeof(struct ipv6hdr), GFP_ATOMIC);
diff --git a/net/ipv6/ipv6_sockglue.c b/net/ipv6/ipv6_sockglue.c
index 33dd4dd872e6..ec10b45c49c1 100644
--- a/net/ipv6/ipv6_sockglue.c
+++ b/net/ipv6/ipv6_sockglue.c
@@ -481,6 +481,13 @@ int do_ipv6_setsockopt(struct sock *sk, int level, int optname,
 	case IPV6_DONTFRAG:
 		inet6_assign_bit(DONTFRAG, sk, valbool);
 		return 0;
+	case IPV6_RECVERR:
+		if (optlen < sizeof(int))
+			return -EINVAL;
+		inet6_assign_bit(RECVERR6, sk, valbool);
+		if (!val)
+			skb_errqueue_purge(&sk->sk_error_queue);
+		return 0;
 	}
 	if (needs_rtnl)
 		rtnl_lock();
@@ -943,14 +950,6 @@ done:
 		np->pmtudisc = val;
 		retv = 0;
 		break;
-	case IPV6_RECVERR:
-		if (optlen < sizeof(int))
-			goto e_inval;
-		np->recverr = valbool;
-		if (!val)
-			skb_errqueue_purge(&sk->sk_error_queue);
-		retv = 0;
-		break;
 	case IPV6_FLOWINFO_SEND:
 		if (optlen < sizeof(int))
 			goto e_inval;
@@ -1380,7 +1379,7 @@ int do_ipv6_getsockopt(struct sock *sk, int level, int optname,
 		break;
 
 	case IPV6_RECVERR:
-		val = np->recverr;
+		val = inet6_test_bit(RECVERR6, sk);
 		break;
 
 	case IPV6_FLOWINFO_SEND:
diff --git a/net/ipv6/raw.c b/net/ipv6/raw.c
index cc9673c1809f..71f6bdccfa1f 100644
--- a/net/ipv6/raw.c
+++ b/net/ipv6/raw.c
@@ -291,6 +291,7 @@ static void rawv6_err(struct sock *sk, struct sk_buff *skb,
 	       struct inet6_skb_parm *opt,
 	       u8 type, u8 code, int offset, __be32 info)
 {
+	bool recverr = inet6_test_bit(RECVERR6, sk);
 	struct ipv6_pinfo *np = inet6_sk(sk);
 	int err;
 	int harderr;
@@ -300,7 +301,7 @@ static void rawv6_err(struct sock *sk, struct sk_buff *skb,
 	   2. Socket is connected (otherwise the error indication
 	      is useless without recverr and error is hard.
 	 */
-	if (!np->recverr && sk->sk_state != TCP_ESTABLISHED)
+	if (!recverr && sk->sk_state != TCP_ESTABLISHED)
 		return;
 
 	harderr = icmpv6_err_convert(type, code, &err);
@@ -312,14 +313,14 @@ static void rawv6_err(struct sock *sk, struct sk_buff *skb,
 		ip6_sk_redirect(skb, sk);
 		return;
 	}
-	if (np->recverr) {
+	if (recverr) {
 		u8 *payload = skb->data;
 		if (!inet_test_bit(HDRINCL, sk))
 			payload += offset;
 		ipv6_icmp_error(sk, skb, err, 0, ntohl(info), payload);
 	}
 
-	if (np->recverr || harderr) {
+	if (recverr || harderr) {
 		sk->sk_err = err;
 		sk_error_report(sk);
 	}
@@ -587,7 +588,6 @@ static int rawv6_send_hdrinc(struct sock *sk, struct msghdr *msg, int length,
 			struct flowi6 *fl6, struct dst_entry **dstp,
 			unsigned int flags, const struct sockcm_cookie *sockc)
 {
-	struct ipv6_pinfo *np = inet6_sk(sk);
 	struct net *net = sock_net(sk);
 	struct ipv6hdr *iph;
 	struct sk_buff *skb;
@@ -668,7 +668,7 @@ out:
 error:
 	IP6_INC_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
 error_check:
-	if (err == -ENOBUFS && !np->recverr)
+	if (err == -ENOBUFS && !inet6_test_bit(RECVERR6, sk))
 		err = 0;
 	return err;
 }
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index 54db5fab318b..b5954b136b57 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -508,7 +508,7 @@ static int tcp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
 			tcp_ld_RTO_revert(sk, seq);
 	}
 
-	if (!sock_owned_by_user(sk) && np->recverr) {
+	if (!sock_owned_by_user(sk) && inet6_test_bit(RECVERR6, sk)) {
 		WRITE_ONCE(sk->sk_err, err);
 		sk_error_report(sk);
 	} else {
diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c
index e4301500741a..90e873689b88 100644
--- a/net/ipv6/udp.c
+++ b/net/ipv6/udp.c
@@ -619,7 +619,7 @@ int __udp6_lib_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
 		goto out;
 	}
 
-	if (!np->recverr) {
+	if (!inet6_test_bit(RECVERR6, sk)) {
 		if (!harderr || sk->sk_state != TCP_ESTABLISHED)
 			goto out;
 	} else {
@@ -1283,7 +1283,7 @@ csum_partial:
 send:
 	err = ip6_send_skb(skb);
 	if (err) {
-		if (err == -ENOBUFS && !inet6_sk(sk)->recverr) {
+		if (err == -ENOBUFS && !inet6_test_bit(RECVERR6, sk)) {
 			UDP6_INC_STATS(sock_net(sk),
 				       UDP_MIB_SNDBUFERRORS, is_udplite);
 			err = 0;
@@ -1608,7 +1608,7 @@ do_append_data:
 		up->pending = 0;
 
 	if (err > 0)
-		err = np->recverr ? net_xmit_errno(err) : 0;
+		err = inet6_test_bit(RECVERR6, sk) ? net_xmit_errno(err) : 0;
 	release_sock(sk);
 
 out:
diff --git a/net/sctp/ipv6.c b/net/sctp/ipv6.c
index 43f2731bf590..42b5b853ea01 100644
--- a/net/sctp/ipv6.c
+++ b/net/sctp/ipv6.c
@@ -128,7 +128,6 @@ static void sctp_v6_err_handle(struct sctp_transport *t, struct sk_buff *skb,
 {
 	struct sctp_association *asoc = t->asoc;
 	struct sock *sk = asoc->base.sk;
-	struct ipv6_pinfo *np;
 	int err = 0;
 
 	switch (type) {
@@ -149,9 +148,8 @@ static void sctp_v6_err_handle(struct sctp_transport *t, struct sk_buff *skb,
 		break;
 	}
 
-	np = inet6_sk(sk);
 	icmpv6_err_convert(type, code, &err);
-	if (!sock_owned_by_user(sk) && np->recverr) {
+	if (!sock_owned_by_user(sk) && inet6_test_bit(RECVERR6, sk)) {
 		sk->sk_err = err;
 		sk_error_report(sk);
 	} else {
-- 
cgit v1.2.3


From 3cccda8db2cf2f2a224d55d5b6e2251d478c58ca Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Tue, 12 Sep 2023 16:02:09 +0000
Subject: ipv6: move np->repflow to atomic flags

Move np->repflow to inet->inet_flags to fix data-races.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Reviewed-by: David Ahern <dsahern@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/ipv6.h     |  1 -
 include/net/inet_sock.h  |  1 +
 net/dccp/ipv6.c          |  2 +-
 net/ipv6/af_inet6.c      |  3 ++-
 net/ipv6/ip6_flowlabel.c |  8 ++++----
 net/ipv6/tcp_ipv6.c      | 14 ++++++--------
 6 files changed, 14 insertions(+), 15 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/ipv6.h b/include/linux/ipv6.h
index 53f4f1b97a78..e62413371ea4 100644
--- a/include/linux/ipv6.h
+++ b/include/linux/ipv6.h
@@ -244,7 +244,6 @@ struct ipv6_pinfo {
 
 	/* sockopt flags */
 	__u16			sndflow:1,
-				repflow:1,
 				pmtudisc:3,
 				padding:1,	/* 1 bit hole */
 				srcprefs:3,	/* 001: prefer temporary address
diff --git a/include/net/inet_sock.h b/include/net/inet_sock.h
index 3b79bc759ff4..5d61c7dc6577 100644
--- a/include/net/inet_sock.h
+++ b/include/net/inet_sock.h
@@ -275,6 +275,7 @@ enum {
 	INET_FLAGS_AUTOFLOWLABEL = 24,
 	INET_FLAGS_DONTFRAG	= 25,
 	INET_FLAGS_RECVERR6	= 26,
+	INET_FLAGS_REPFLOW	= 27,
 };
 
 /* cmsg flags for inet */
diff --git a/net/dccp/ipv6.c b/net/dccp/ipv6.c
index e6c3d84c2b9e..d7e63eea705d 100644
--- a/net/dccp/ipv6.c
+++ b/net/dccp/ipv6.c
@@ -679,7 +679,7 @@ ipv6_pktoptions:
 			WRITE_ONCE(np->mcast_hops, ipv6_hdr(opt_skb)->hop_limit);
 		if (np->rxopt.bits.rxflow || np->rxopt.bits.rxtclass)
 			np->rcv_flowinfo = ip6_flowinfo(ipv6_hdr(opt_skb));
-		if (np->repflow)
+		if (inet6_test_bit(REPFLOW, sk))
 			np->flow_label = ip6_flowlabel(ipv6_hdr(opt_skb));
 		if (ipv6_opt_accepted(sk, opt_skb,
 				      &DCCP_SKB_CB(opt_skb)->header.h6)) {
diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c
index 372fb7b9112c..48737363377f 100644
--- a/net/ipv6/af_inet6.c
+++ b/net/ipv6/af_inet6.c
@@ -220,7 +220,8 @@ lookup_protocol:
 	inet6_set_bit(MC6_LOOP, sk);
 	inet6_set_bit(MC6_ALL, sk);
 	np->pmtudisc	= IPV6_PMTUDISC_WANT;
-	np->repflow	= net->ipv6.sysctl.flowlabel_reflect & FLOWLABEL_REFLECT_ESTABLISHED;
+	inet6_assign_bit(REPFLOW, sk, net->ipv6.sysctl.flowlabel_reflect &
+				     FLOWLABEL_REFLECT_ESTABLISHED);
 	sk->sk_ipv6only	= net->ipv6.sysctl.bindv6only;
 	sk->sk_txrehash = READ_ONCE(net->core.sysctl_txrehash);
 
diff --git a/net/ipv6/ip6_flowlabel.c b/net/ipv6/ip6_flowlabel.c
index b3ca4beb4405..eca07e10e21f 100644
--- a/net/ipv6/ip6_flowlabel.c
+++ b/net/ipv6/ip6_flowlabel.c
@@ -513,7 +513,7 @@ int ipv6_flowlabel_opt_get(struct sock *sk, struct in6_flowlabel_req *freq,
 		return 0;
 	}
 
-	if (np->repflow) {
+	if (inet6_test_bit(REPFLOW, sk)) {
 		freq->flr_label = np->flow_label;
 		return 0;
 	}
@@ -551,10 +551,10 @@ static int ipv6_flowlabel_put(struct sock *sk, struct in6_flowlabel_req *freq)
 	if (freq->flr_flags & IPV6_FL_F_REFLECT) {
 		if (sk->sk_protocol != IPPROTO_TCP)
 			return -ENOPROTOOPT;
-		if (!np->repflow)
+		if (!inet6_test_bit(REPFLOW, sk))
 			return -ESRCH;
 		np->flow_label = 0;
-		np->repflow = 0;
+		inet6_clear_bit(REPFLOW, sk);
 		return 0;
 	}
 
@@ -626,7 +626,7 @@ static int ipv6_flowlabel_get(struct sock *sk, struct in6_flowlabel_req *freq,
 
 		if (sk->sk_protocol != IPPROTO_TCP)
 			return -ENOPROTOOPT;
-		np->repflow = 1;
+		inet6_set_bit(REPFLOW, sk);
 		return 0;
 	}
 
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index b5954b136b57..201caf88bb99 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -548,7 +548,7 @@ static int tcp_v6_send_synack(const struct sock *sk, struct dst_entry *dst,
 				    &ireq->ir_v6_rmt_addr);
 
 		fl6->daddr = ireq->ir_v6_rmt_addr;
-		if (np->repflow && ireq->pktopts)
+		if (inet6_test_bit(REPFLOW, sk) && ireq->pktopts)
 			fl6->flowlabel = ip6_flowlabel(ipv6_hdr(ireq->pktopts));
 
 		tclass = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_reflect_tos) ?
@@ -797,7 +797,7 @@ static void tcp_v6_init_req(struct request_sock *req,
 	    (ipv6_opt_accepted(sk_listener, skb, &TCP_SKB_CB(skb)->header.h6) ||
 	     np->rxopt.bits.rxinfo ||
 	     np->rxopt.bits.rxoinfo || np->rxopt.bits.rxhlim ||
-	     np->rxopt.bits.rxohlim || np->repflow)) {
+	     np->rxopt.bits.rxohlim || inet6_test_bit(REPFLOW, sk_listener))) {
 		refcount_inc(&skb->users);
 		ireq->pktopts = skb;
 	}
@@ -1055,10 +1055,8 @@ static void tcp_v6_send_reset(const struct sock *sk, struct sk_buff *skb)
 	if (sk) {
 		oif = sk->sk_bound_dev_if;
 		if (sk_fullsock(sk)) {
-			const struct ipv6_pinfo *np = tcp_inet6_sk(sk);
-
 			trace_tcp_send_reset(sk, skb);
-			if (np->repflow)
+			if (inet6_test_bit(REPFLOW, sk))
 				label = ip6_flowlabel(ipv6h);
 			priority = sk->sk_priority;
 			txhash = sk->sk_txhash;
@@ -1247,7 +1245,7 @@ static struct sock *tcp_v6_syn_recv_sock(const struct sock *sk, struct sk_buff *
 		newnp->mcast_oif   = inet_iif(skb);
 		newnp->mcast_hops  = ip_hdr(skb)->ttl;
 		newnp->rcv_flowinfo = 0;
-		if (np->repflow)
+		if (inet6_test_bit(REPFLOW, sk))
 			newnp->flow_label = 0;
 
 		/*
@@ -1320,7 +1318,7 @@ static struct sock *tcp_v6_syn_recv_sock(const struct sock *sk, struct sk_buff *
 	newnp->mcast_oif  = tcp_v6_iif(skb);
 	newnp->mcast_hops = ipv6_hdr(skb)->hop_limit;
 	newnp->rcv_flowinfo = ip6_flowinfo(ipv6_hdr(skb));
-	if (np->repflow)
+	if (inet6_test_bit(REPFLOW, sk))
 		newnp->flow_label = ip6_flowlabel(ipv6_hdr(skb));
 
 	/* Set ToS of the new socket based upon the value of incoming SYN.
@@ -1546,7 +1544,7 @@ ipv6_pktoptions:
 				   ipv6_hdr(opt_skb)->hop_limit);
 		if (np->rxopt.bits.rxflow || np->rxopt.bits.rxtclass)
 			np->rcv_flowinfo = ip6_flowinfo(ipv6_hdr(opt_skb));
-		if (np->repflow)
+		if (inet6_test_bit(REPFLOW, sk))
 			np->flow_label = ip6_flowlabel(ipv6_hdr(opt_skb));
 		if (ipv6_opt_accepted(sk, opt_skb, &TCP_SKB_CB(opt_skb)->header.h6)) {
 			tcp_v6_restore_cb(opt_skb);
-- 
cgit v1.2.3


From 83cd5eb654b320c1972254f243531f3f3cebcccf Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Tue, 12 Sep 2023 16:02:10 +0000
Subject: ipv6: lockless IPV6_ROUTER_ALERT_ISOLATE implementation

Reads from np->rtalert_isolate are racy.

Move this flag to inet->inet_flags to fix data-races.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Reviewed-by: David Ahern <dsahern@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/ipv6.h     |  3 +--
 include/net/inet_sock.h  |  1 +
 net/ipv6/ip6_output.c    |  3 +--
 net/ipv6/ipv6_sockglue.c | 13 ++++++-------
 4 files changed, 9 insertions(+), 11 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/ipv6.h b/include/linux/ipv6.h
index e62413371ea4..f288a35f157f 100644
--- a/include/linux/ipv6.h
+++ b/include/linux/ipv6.h
@@ -246,11 +246,10 @@ struct ipv6_pinfo {
 	__u16			sndflow:1,
 				pmtudisc:3,
 				padding:1,	/* 1 bit hole */
-				srcprefs:3,	/* 001: prefer temporary address
+				srcprefs:3;	/* 001: prefer temporary address
 						 * 010: prefer public address
 						 * 100: prefer care-of address
 						 */
-				rtalert_isolate:1;
 	__u8			min_hopcount;
 	__u8			tclass;
 	__be32			rcv_flowinfo;
diff --git a/include/net/inet_sock.h b/include/net/inet_sock.h
index 5d61c7dc6577..befee0f66c05 100644
--- a/include/net/inet_sock.h
+++ b/include/net/inet_sock.h
@@ -276,6 +276,7 @@ enum {
 	INET_FLAGS_DONTFRAG	= 25,
 	INET_FLAGS_RECVERR6	= 26,
 	INET_FLAGS_REPFLOW	= 27,
+	INET_FLAGS_RTALERT_ISOLATE = 28,
 };
 
 /* cmsg flags for inet */
diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c
index 8851fe5d45a0..f87d8491d7e2 100644
--- a/net/ipv6/ip6_output.c
+++ b/net/ipv6/ip6_output.c
@@ -368,9 +368,8 @@ static int ip6_call_ra_chain(struct sk_buff *skb, int sel)
 		if (sk && ra->sel == sel &&
 		    (!sk->sk_bound_dev_if ||
 		     sk->sk_bound_dev_if == skb->dev->ifindex)) {
-			struct ipv6_pinfo *np = inet6_sk(sk);
 
-			if (np && np->rtalert_isolate &&
+			if (inet6_test_bit(RTALERT_ISOLATE, sk) &&
 			    !net_eq(sock_net(sk), dev_net(skb->dev))) {
 				continue;
 			}
diff --git a/net/ipv6/ipv6_sockglue.c b/net/ipv6/ipv6_sockglue.c
index ec10b45c49c1..c22a492e0536 100644
--- a/net/ipv6/ipv6_sockglue.c
+++ b/net/ipv6/ipv6_sockglue.c
@@ -488,6 +488,11 @@ int do_ipv6_setsockopt(struct sock *sk, int level, int optname,
 		if (!val)
 			skb_errqueue_purge(&sk->sk_error_queue);
 		return 0;
+	case IPV6_ROUTER_ALERT_ISOLATE:
+		if (optlen < sizeof(int))
+			return -EINVAL;
+		inet6_assign_bit(RTALERT_ISOLATE, sk, valbool);
+		return 0;
 	}
 	if (needs_rtnl)
 		rtnl_lock();
@@ -936,12 +941,6 @@ done:
 			goto e_inval;
 		retv = ip6_ra_control(sk, val);
 		break;
-	case IPV6_ROUTER_ALERT_ISOLATE:
-		if (optlen < sizeof(int))
-			goto e_inval;
-		np->rtalert_isolate = valbool;
-		retv = 0;
-		break;
 	case IPV6_MTU_DISCOVER:
 		if (optlen < sizeof(int))
 			goto e_inval;
@@ -1452,7 +1451,7 @@ int do_ipv6_getsockopt(struct sock *sk, int level, int optname,
 		break;
 
 	case IPV6_ROUTER_ALERT_ISOLATE:
-		val = np->rtalert_isolate;
+		val = inet6_test_bit(RTALERT_ISOLATE, sk);
 		break;
 
 	case IPV6_RECVERR_RFC4884:
-- 
cgit v1.2.3


From 6b724bc4300b431443f3b99520994a5aece347cd Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Tue, 12 Sep 2023 16:02:11 +0000
Subject: ipv6: lockless IPV6_MTU_DISCOVER implementation

Most np->pmtudisc reads are racy.

Move this 3bit field on a full byte, add annotations
and make IPV6_MTU_DISCOVER setsockopt() lockless.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Reviewed-by: David Ahern <dsahern@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/ipv6.h            |  5 ++---
 include/net/ip6_route.h         | 14 +++++++++-----
 net/ipv6/ip6_output.c           |  4 ++--
 net/ipv6/ipv6_sockglue.c        | 17 ++++++++---------
 net/ipv6/raw.c                  |  2 +-
 net/ipv6/udp.c                  |  2 +-
 net/netfilter/ipvs/ip_vs_sync.c |  2 +-
 7 files changed, 24 insertions(+), 22 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/ipv6.h b/include/linux/ipv6.h
index f288a35f157f..10f521a6a9c8 100644
--- a/include/linux/ipv6.h
+++ b/include/linux/ipv6.h
@@ -243,13 +243,12 @@ struct ipv6_pinfo {
 	} rxopt;
 
 	/* sockopt flags */
-	__u16			sndflow:1,
-				pmtudisc:3,
-				padding:1,	/* 1 bit hole */
+	__u8			sndflow:1,
 				srcprefs:3;	/* 001: prefer temporary address
 						 * 010: prefer public address
 						 * 100: prefer care-of address
 						 */
+	__u8			pmtudisc;
 	__u8			min_hopcount;
 	__u8			tclass;
 	__be32			rcv_flowinfo;
diff --git a/include/net/ip6_route.h b/include/net/ip6_route.h
index b32539bb0fb0..b1ea49900b4a 100644
--- a/include/net/ip6_route.h
+++ b/include/net/ip6_route.h
@@ -266,7 +266,7 @@ static inline unsigned int ip6_skb_dst_mtu(const struct sk_buff *skb)
 	const struct dst_entry *dst = skb_dst(skb);
 	unsigned int mtu;
 
-	if (np && np->pmtudisc >= IPV6_PMTUDISC_PROBE) {
+	if (np && READ_ONCE(np->pmtudisc) >= IPV6_PMTUDISC_PROBE) {
 		mtu = READ_ONCE(dst->dev->mtu);
 		mtu -= lwtunnel_headroom(dst->lwtstate, mtu);
 	} else {
@@ -277,14 +277,18 @@ static inline unsigned int ip6_skb_dst_mtu(const struct sk_buff *skb)
 
 static inline bool ip6_sk_accept_pmtu(const struct sock *sk)
 {
-	return inet6_sk(sk)->pmtudisc != IPV6_PMTUDISC_INTERFACE &&
-	       inet6_sk(sk)->pmtudisc != IPV6_PMTUDISC_OMIT;
+	u8 pmtudisc = READ_ONCE(inet6_sk(sk)->pmtudisc);
+
+	return pmtudisc != IPV6_PMTUDISC_INTERFACE &&
+	       pmtudisc != IPV6_PMTUDISC_OMIT;
 }
 
 static inline bool ip6_sk_ignore_df(const struct sock *sk)
 {
-	return inet6_sk(sk)->pmtudisc < IPV6_PMTUDISC_DO ||
-	       inet6_sk(sk)->pmtudisc == IPV6_PMTUDISC_OMIT;
+	u8 pmtudisc = READ_ONCE(inet6_sk(sk)->pmtudisc);
+
+	return pmtudisc < IPV6_PMTUDISC_DO ||
+	       pmtudisc == IPV6_PMTUDISC_OMIT;
 }
 
 static inline const struct in6_addr *rt6_nexthop(const struct rt6_info *rt,
diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c
index f87d8491d7e2..7e5d9eeb990f 100644
--- a/net/ipv6/ip6_output.c
+++ b/net/ipv6/ip6_output.c
@@ -1436,10 +1436,10 @@ static int ip6_setup_cork(struct sock *sk, struct inet_cork_full *cork,
 	v6_cork->hop_limit = ipc6->hlimit;
 	v6_cork->tclass = ipc6->tclass;
 	if (rt->dst.flags & DST_XFRM_TUNNEL)
-		mtu = np->pmtudisc >= IPV6_PMTUDISC_PROBE ?
+		mtu = READ_ONCE(np->pmtudisc) >= IPV6_PMTUDISC_PROBE ?
 		      READ_ONCE(rt->dst.dev->mtu) : dst_mtu(&rt->dst);
 	else
-		mtu = np->pmtudisc >= IPV6_PMTUDISC_PROBE ?
+		mtu = READ_ONCE(np->pmtudisc) >= IPV6_PMTUDISC_PROBE ?
 			READ_ONCE(rt->dst.dev->mtu) : dst_mtu(xfrm_dst_path(&rt->dst));
 
 	frag_size = READ_ONCE(np->frag_size);
diff --git a/net/ipv6/ipv6_sockglue.c b/net/ipv6/ipv6_sockglue.c
index c22a492e0536..85ea42644dcb 100644
--- a/net/ipv6/ipv6_sockglue.c
+++ b/net/ipv6/ipv6_sockglue.c
@@ -493,6 +493,13 @@ int do_ipv6_setsockopt(struct sock *sk, int level, int optname,
 			return -EINVAL;
 		inet6_assign_bit(RTALERT_ISOLATE, sk, valbool);
 		return 0;
+	case IPV6_MTU_DISCOVER:
+		if (optlen < sizeof(int))
+			return -EINVAL;
+		if (val < IPV6_PMTUDISC_DONT || val > IPV6_PMTUDISC_OMIT)
+			return -EINVAL;
+		WRITE_ONCE(np->pmtudisc, val);
+		return 0;
 	}
 	if (needs_rtnl)
 		rtnl_lock();
@@ -941,14 +948,6 @@ done:
 			goto e_inval;
 		retv = ip6_ra_control(sk, val);
 		break;
-	case IPV6_MTU_DISCOVER:
-		if (optlen < sizeof(int))
-			goto e_inval;
-		if (val < IPV6_PMTUDISC_DONT || val > IPV6_PMTUDISC_OMIT)
-			goto e_inval;
-		np->pmtudisc = val;
-		retv = 0;
-		break;
 	case IPV6_FLOWINFO_SEND:
 		if (optlen < sizeof(int))
 			goto e_inval;
@@ -1374,7 +1373,7 @@ int do_ipv6_getsockopt(struct sock *sk, int level, int optname,
 		break;
 
 	case IPV6_MTU_DISCOVER:
-		val = np->pmtudisc;
+		val = READ_ONCE(np->pmtudisc);
 		break;
 
 	case IPV6_RECVERR:
diff --git a/net/ipv6/raw.c b/net/ipv6/raw.c
index 71f6bdccfa1f..47372cceb98f 100644
--- a/net/ipv6/raw.c
+++ b/net/ipv6/raw.c
@@ -307,7 +307,7 @@ static void rawv6_err(struct sock *sk, struct sk_buff *skb,
 	harderr = icmpv6_err_convert(type, code, &err);
 	if (type == ICMPV6_PKT_TOOBIG) {
 		ip6_sk_update_pmtu(skb, sk, info);
-		harderr = (np->pmtudisc == IPV6_PMTUDISC_DO);
+		harderr = (READ_ONCE(np->pmtudisc) == IPV6_PMTUDISC_DO);
 	}
 	if (type == NDISC_REDIRECT) {
 		ip6_sk_redirect(skb, sk);
diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c
index 90e873689b88..c17e19fece1b 100644
--- a/net/ipv6/udp.c
+++ b/net/ipv6/udp.c
@@ -598,7 +598,7 @@ int __udp6_lib_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
 		if (!ip6_sk_accept_pmtu(sk))
 			goto out;
 		ip6_sk_update_pmtu(skb, sk, info);
-		if (np->pmtudisc != IPV6_PMTUDISC_DONT)
+		if (READ_ONCE(np->pmtudisc) != IPV6_PMTUDISC_DONT)
 			harderr = 1;
 	}
 	if (type == NDISC_REDIRECT) {
diff --git a/net/netfilter/ipvs/ip_vs_sync.c b/net/netfilter/ipvs/ip_vs_sync.c
index df1b33b61059..5820a8156c47 100644
--- a/net/netfilter/ipvs/ip_vs_sync.c
+++ b/net/netfilter/ipvs/ip_vs_sync.c
@@ -1341,7 +1341,7 @@ static void set_mcast_pmtudisc(struct sock *sk, int val)
 		struct ipv6_pinfo *np = inet6_sk(sk);
 
 		/* IPV6_MTU_DISCOVER */
-		np->pmtudisc = val;
+		WRITE_ONCE(np->pmtudisc, val);
 	}
 #endif
 	release_sock(sk);
-- 
cgit v1.2.3


From 859f8b265fc2a11af0fb0c52b4087e0409250592 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Tue, 12 Sep 2023 16:02:12 +0000
Subject: ipv6: lockless IPV6_FLOWINFO_SEND implementation

np->sndflow reads are racy.

Use one bit ftom atomic inet->inet_flags instead,
IPV6_FLOWINFO_SEND setsockopt() can be lockless.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Reviewed-by: David Ahern <dsahern@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/ipv6.h     |  3 +--
 include/net/inet_sock.h  |  1 +
 net/dccp/ipv6.c          |  2 +-
 net/ipv4/ping.c          |  3 +--
 net/ipv6/af_inet6.c      |  2 +-
 net/ipv6/datagram.c      |  7 ++++---
 net/ipv6/ipv6_sockglue.c | 13 ++++++-------
 net/ipv6/ping.c          |  2 +-
 net/ipv6/raw.c           |  2 +-
 net/ipv6/tcp_ipv6.c      |  2 +-
 net/ipv6/udp.c           |  2 +-
 net/l2tp/l2tp_ip6.c      |  4 ++--
 net/sctp/ipv6.c          |  3 ++-
 13 files changed, 23 insertions(+), 23 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/ipv6.h b/include/linux/ipv6.h
index 10f521a6a9c8..09253825c99c 100644
--- a/include/linux/ipv6.h
+++ b/include/linux/ipv6.h
@@ -243,8 +243,7 @@ struct ipv6_pinfo {
 	} rxopt;
 
 	/* sockopt flags */
-	__u8			sndflow:1,
-				srcprefs:3;	/* 001: prefer temporary address
+	__u8			srcprefs:3;	/* 001: prefer temporary address
 						 * 010: prefer public address
 						 * 100: prefer care-of address
 						 */
diff --git a/include/net/inet_sock.h b/include/net/inet_sock.h
index befee0f66c05..98e11958cdff 100644
--- a/include/net/inet_sock.h
+++ b/include/net/inet_sock.h
@@ -277,6 +277,7 @@ enum {
 	INET_FLAGS_RECVERR6	= 26,
 	INET_FLAGS_REPFLOW	= 27,
 	INET_FLAGS_RTALERT_ISOLATE = 28,
+	INET_FLAGS_SNDFLOW	= 29,
 };
 
 /* cmsg flags for inet */
diff --git a/net/dccp/ipv6.c b/net/dccp/ipv6.c
index d7e63eea705d..4803f0614848 100644
--- a/net/dccp/ipv6.c
+++ b/net/dccp/ipv6.c
@@ -844,7 +844,7 @@ static int dccp_v6_connect(struct sock *sk, struct sockaddr *uaddr,
 
 	memset(&fl6, 0, sizeof(fl6));
 
-	if (np->sndflow) {
+	if (inet6_test_bit(SNDFLOW, sk)) {
 		fl6.flowlabel = usin->sin6_flowinfo & IPV6_FLOWINFO_MASK;
 		IP6_ECN_flow_init(fl6.flowlabel);
 		if (fl6.flowlabel & IPV6_FLOWLABEL_MASK) {
diff --git a/net/ipv4/ping.c b/net/ipv4/ping.c
index bc01ad5fc01a..4dd809b7b188 100644
--- a/net/ipv4/ping.c
+++ b/net/ipv4/ping.c
@@ -899,7 +899,6 @@ int ping_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int flags,
 
 #if IS_ENABLED(CONFIG_IPV6)
 	} else if (family == AF_INET6) {
-		struct ipv6_pinfo *np = inet6_sk(sk);
 		struct ipv6hdr *ip6 = ipv6_hdr(skb);
 		DECLARE_SOCKADDR(struct sockaddr_in6 *, sin6, msg->msg_name);
 
@@ -908,7 +907,7 @@ int ping_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int flags,
 			sin6->sin6_port = 0;
 			sin6->sin6_addr = ip6->saddr;
 			sin6->sin6_flowinfo = 0;
-			if (np->sndflow)
+			if (inet6_test_bit(SNDFLOW, sk))
 				sin6->sin6_flowinfo = ip6_flowinfo(ip6);
 			sin6->sin6_scope_id =
 				ipv6_iface_scope_id(&sin6->sin6_addr,
diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c
index 48737363377f..c6ad0d6e99b5 100644
--- a/net/ipv6/af_inet6.c
+++ b/net/ipv6/af_inet6.c
@@ -537,7 +537,7 @@ int inet6_getname(struct socket *sock, struct sockaddr *uaddr,
 		}
 		sin->sin6_port = inet->inet_dport;
 		sin->sin6_addr = sk->sk_v6_daddr;
-		if (np->sndflow)
+		if (inet6_test_bit(SNDFLOW, sk))
 			sin->sin6_flowinfo = np->flow_label;
 		BPF_CGROUP_RUN_SA_PROG(sk, (struct sockaddr *)sin,
 				       CGROUP_INET6_GETPEERNAME);
diff --git a/net/ipv6/datagram.c b/net/ipv6/datagram.c
index 74673a5eff31..cc6a502db39d 100644
--- a/net/ipv6/datagram.c
+++ b/net/ipv6/datagram.c
@@ -80,7 +80,8 @@ int ip6_datagram_dst_update(struct sock *sk, bool fix_sk_saddr)
 	struct flowi6 fl6;
 	int err = 0;
 
-	if (np->sndflow && (np->flow_label & IPV6_FLOWLABEL_MASK)) {
+	if (inet6_test_bit(SNDFLOW, sk) &&
+	    (np->flow_label & IPV6_FLOWLABEL_MASK)) {
 		flowlabel = fl6_sock_lookup(sk, np->flow_label);
 		if (IS_ERR(flowlabel))
 			return -EINVAL;
@@ -163,7 +164,7 @@ int __ip6_datagram_connect(struct sock *sk, struct sockaddr *uaddr,
 	if (usin->sin6_family != AF_INET6)
 		return -EAFNOSUPPORT;
 
-	if (np->sndflow)
+	if (inet6_test_bit(SNDFLOW, sk))
 		fl6_flowlabel = usin->sin6_flowinfo & IPV6_FLOWINFO_MASK;
 
 	if (ipv6_addr_any(&usin->sin6_addr)) {
@@ -491,7 +492,7 @@ int ipv6_recv_error(struct sock *sk, struct msghdr *msg, int len, int *addr_len)
 			const struct ipv6hdr *ip6h = container_of((struct in6_addr *)(nh + serr->addr_offset),
 								  struct ipv6hdr, daddr);
 			sin->sin6_addr = ip6h->daddr;
-			if (np->sndflow)
+			if (inet6_test_bit(SNDFLOW, sk))
 				sin->sin6_flowinfo = ip6_flowinfo(ip6h);
 			sin->sin6_scope_id =
 				ipv6_iface_scope_id(&sin->sin6_addr,
diff --git a/net/ipv6/ipv6_sockglue.c b/net/ipv6/ipv6_sockglue.c
index 85ea42644dcb..e9dc6f881bb9 100644
--- a/net/ipv6/ipv6_sockglue.c
+++ b/net/ipv6/ipv6_sockglue.c
@@ -500,6 +500,11 @@ int do_ipv6_setsockopt(struct sock *sk, int level, int optname,
 			return -EINVAL;
 		WRITE_ONCE(np->pmtudisc, val);
 		return 0;
+	case IPV6_FLOWINFO_SEND:
+		if (optlen < sizeof(int))
+			return -EINVAL;
+		inet6_assign_bit(SNDFLOW, sk, valbool);
+		return 0;
 	}
 	if (needs_rtnl)
 		rtnl_lock();
@@ -948,12 +953,6 @@ done:
 			goto e_inval;
 		retv = ip6_ra_control(sk, val);
 		break;
-	case IPV6_FLOWINFO_SEND:
-		if (optlen < sizeof(int))
-			goto e_inval;
-		np->sndflow = valbool;
-		retv = 0;
-		break;
 	case IPV6_FLOWLABEL_MGR:
 		retv = ipv6_flowlabel_opt(sk, optval, optlen);
 		break;
@@ -1381,7 +1380,7 @@ int do_ipv6_getsockopt(struct sock *sk, int level, int optname,
 		break;
 
 	case IPV6_FLOWINFO_SEND:
-		val = np->sndflow;
+		val = inet6_test_bit(SNDFLOW, sk);
 		break;
 
 	case IPV6_FLOWLABEL_MGR:
diff --git a/net/ipv6/ping.c b/net/ipv6/ping.c
index 4444b61eb23b..e8fb0d275cc2 100644
--- a/net/ipv6/ping.c
+++ b/net/ipv6/ping.c
@@ -89,7 +89,7 @@ static int ping_v6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
 			return -EAFNOSUPPORT;
 		}
 		daddr = &(u->sin6_addr);
-		if (np->sndflow)
+		if (inet6_test_bit(SNDFLOW, sk))
 			fl6.flowlabel = u->sin6_flowinfo & IPV6_FLOWINFO_MASK;
 		if (__ipv6_addr_needs_scope_id(ipv6_addr_type(daddr)))
 			oif = u->sin6_scope_id;
diff --git a/net/ipv6/raw.c b/net/ipv6/raw.c
index 47372cceb98f..a2aa54a2baae 100644
--- a/net/ipv6/raw.c
+++ b/net/ipv6/raw.c
@@ -795,7 +795,7 @@ static int rawv6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
 			return -EINVAL;
 
 		daddr = &sin6->sin6_addr;
-		if (np->sndflow) {
+		if (inet6_test_bit(SNDFLOW, sk)) {
 			fl6.flowlabel = sin6->sin6_flowinfo&IPV6_FLOWINFO_MASK;
 			if (fl6.flowlabel&IPV6_FLOWLABEL_MASK) {
 				flowlabel = fl6_sock_lookup(sk, fl6.flowlabel);
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index 201caf88bb99..94afb8d0f2d0 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -163,7 +163,7 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr,
 
 	memset(&fl6, 0, sizeof(fl6));
 
-	if (np->sndflow) {
+	if (inet6_test_bit(SNDFLOW, sk)) {
 		fl6.flowlabel = usin->sin6_flowinfo&IPV6_FLOWINFO_MASK;
 		IP6_ECN_flow_init(fl6.flowlabel);
 		if (fl6.flowlabel&IPV6_FLOWLABEL_MASK) {
diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c
index c17e19fece1b..5e9312eefed0 100644
--- a/net/ipv6/udp.c
+++ b/net/ipv6/udp.c
@@ -1429,7 +1429,7 @@ do_udp_sendmsg:
 		fl6->fl6_dport = sin6->sin6_port;
 		daddr = &sin6->sin6_addr;
 
-		if (np->sndflow) {
+		if (inet6_test_bit(SNDFLOW, sk)) {
 			fl6->flowlabel = sin6->sin6_flowinfo&IPV6_FLOWINFO_MASK;
 			if (fl6->flowlabel & IPV6_FLOWLABEL_MASK) {
 				flowlabel = fl6_sock_lookup(sk, fl6->flowlabel);
diff --git a/net/l2tp/l2tp_ip6.c b/net/l2tp/l2tp_ip6.c
index 40af2431e73a..44cfb72bbd18 100644
--- a/net/l2tp/l2tp_ip6.c
+++ b/net/l2tp/l2tp_ip6.c
@@ -431,7 +431,7 @@ static int l2tp_ip6_getname(struct socket *sock, struct sockaddr *uaddr,
 			return -ENOTCONN;
 		lsa->l2tp_conn_id = lsk->peer_conn_id;
 		lsa->l2tp_addr = sk->sk_v6_daddr;
-		if (np->sndflow)
+		if (inet6_test_bit(SNDFLOW, sk))
 			lsa->l2tp_flowinfo = np->flow_label;
 	} else {
 		if (ipv6_addr_any(&sk->sk_v6_rcv_saddr))
@@ -529,7 +529,7 @@ static int l2tp_ip6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
 			return -EAFNOSUPPORT;
 
 		daddr = &lsa->l2tp_addr;
-		if (np->sndflow) {
+		if (inet6_test_bit(SNDFLOW, sk)) {
 			fl6.flowlabel = lsa->l2tp_flowinfo & IPV6_FLOWINFO_MASK;
 			if (fl6.flowlabel & IPV6_FLOWLABEL_MASK) {
 				flowlabel = fl6_sock_lookup(sk, fl6.flowlabel);
diff --git a/net/sctp/ipv6.c b/net/sctp/ipv6.c
index 42b5b853ea01..5c0ed5909d85 100644
--- a/net/sctp/ipv6.c
+++ b/net/sctp/ipv6.c
@@ -296,7 +296,8 @@ static void sctp_v6_get_dst(struct sctp_transport *t, union sctp_addr *saddr,
 	if (t->flowlabel & SCTP_FLOWLABEL_SET_MASK)
 		fl6->flowlabel = htonl(t->flowlabel & SCTP_FLOWLABEL_VAL_MASK);
 
-	if (np->sndflow && (fl6->flowlabel & IPV6_FLOWLABEL_MASK)) {
+	if (inet6_test_bit(SNDFLOW, sk) &&
+	    (fl6->flowlabel & IPV6_FLOWLABEL_MASK)) {
 		struct ip6_flowlabel *flowlabel;
 
 		flowlabel = fl6_sock_lookup(sk, fl6->flowlabel);
-- 
cgit v1.2.3


From 0a596b0682a7ce37e26c36629816f105c6459d06 Mon Sep 17 00:00:00 2001
From: Herbert Xu <herbert@gondor.apana.org.au>
Date: Sat, 26 Aug 2023 16:36:41 +0800
Subject: KEYS: Include linux/errno.h in linux/verification.h

Add inclusion of linux/errno.h as otherwise the reference to EINVAL
may be invalid.

Fixes: f3cf4134c5c6 ("bpf: Add bpf_lookup_*_key() and bpf_key_put() kfuncs")
Reported-by: kernel test robot <lkp@intel.com>
Closes: https://lore.kernel.org/oe-kbuild-all/202308261414.HKw1Mrip-lkp@intel.com/
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 include/linux/verification.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/verification.h b/include/linux/verification.h
index f34e50ebcf60..cb2d47f28091 100644
--- a/include/linux/verification.h
+++ b/include/linux/verification.h
@@ -8,6 +8,7 @@
 #ifndef _LINUX_VERIFICATION_H
 #define _LINUX_VERIFICATION_H
 
+#include <linux/errno.h>
 #include <linux/types.h>
 
 /*
-- 
cgit v1.2.3


From b58a36008bfa1aadf55f516bcbfae40c779eb54b Mon Sep 17 00:00:00 2001
From: Stefan Wahren <wahrenst@gmx.net>
Date: Wed, 6 Sep 2023 01:27:57 +0200
Subject: hwrng: bcm2835 - Fix hwrng throughput regression

The last RCU stall fix caused a massive throughput regression of the
hwrng on Raspberry Pi 0 - 3. hwrng_msleep doesn't sleep precisely enough
and usleep_range doesn't allow scheduling. So try to restore the
best possible throughput by introducing hwrng_yield which interruptable
sleeps for one jiffy.

Some performance measurements on Raspberry Pi 3B+ (arm64/defconfig):

sudo dd if=/dev/hwrng of=/dev/null count=1 bs=10000

cpu_relax              ~138025 Bytes / sec
hwrng_msleep(1000)         ~13 Bytes / sec
hwrng_yield              ~2510 Bytes / sec

Fixes: 96cb9d055445 ("hwrng: bcm2835 - use hwrng_msleep() instead of cpu_relax()")
Link: https://lore.kernel.org/linux-arm-kernel/bc97ece5-44a3-4c4e-77da-2db3eb66b128@gmx.net/
Signed-off-by: Stefan Wahren <wahrenst@gmx.net>
Reviewed-by: Jason A. Donenfeld <Jason@zx2c4.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 drivers/char/hw_random/bcm2835-rng.c | 2 +-
 drivers/char/hw_random/core.c        | 6 ++++++
 include/linux/hw_random.h            | 1 +
 3 files changed, 8 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/drivers/char/hw_random/bcm2835-rng.c b/drivers/char/hw_random/bcm2835-rng.c
index eb04b12f9f01..b03e80300627 100644
--- a/drivers/char/hw_random/bcm2835-rng.c
+++ b/drivers/char/hw_random/bcm2835-rng.c
@@ -70,7 +70,7 @@ static int bcm2835_rng_read(struct hwrng *rng, void *buf, size_t max,
 	while ((rng_readl(priv, RNG_STATUS) >> 24) == 0) {
 		if (!wait)
 			return 0;
-		hwrng_msleep(rng, 1000);
+		hwrng_yield(rng);
 	}
 
 	num_words = rng_readl(priv, RNG_STATUS) >> 24;
diff --git a/drivers/char/hw_random/core.c b/drivers/char/hw_random/core.c
index e3598ec9cfca..420f155d251f 100644
--- a/drivers/char/hw_random/core.c
+++ b/drivers/char/hw_random/core.c
@@ -678,6 +678,12 @@ long hwrng_msleep(struct hwrng *rng, unsigned int msecs)
 }
 EXPORT_SYMBOL_GPL(hwrng_msleep);
 
+long hwrng_yield(struct hwrng *rng)
+{
+	return wait_for_completion_interruptible_timeout(&rng->dying, 1);
+}
+EXPORT_SYMBOL_GPL(hwrng_yield);
+
 static int __init hwrng_modinit(void)
 {
 	int ret;
diff --git a/include/linux/hw_random.h b/include/linux/hw_random.h
index 8a3115516a1b..136e9842120e 100644
--- a/include/linux/hw_random.h
+++ b/include/linux/hw_random.h
@@ -63,5 +63,6 @@ extern void hwrng_unregister(struct hwrng *rng);
 extern void devm_hwrng_unregister(struct device *dve, struct hwrng *rng);
 
 extern long hwrng_msleep(struct hwrng *rng, unsigned int msecs);
+extern long hwrng_yield(struct hwrng *rng);
 
 #endif /* LINUX_HWRANDOM_H_ */
-- 
cgit v1.2.3


From b1f099b1cf51d553c510c6c8141c27d9ba7ea1fe Mon Sep 17 00:00:00 2001
From: Yury Norov <yury.norov@gmail.com>
Date: Sat, 19 Aug 2023 07:12:33 -0700
Subject: numa: Generalize numa_map_to_online_node()

The function in fact searches the nearest node for a given one,
based on a N_ONLINE state. This is a common pattern to search
for a nearest node.

This patch converts numa_map_to_online_node() to numa_nearest_node()
so that others won't need to opencode the logic.

Signed-off-by: Yury Norov <yury.norov@gmail.com>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Cc: Mel Gorman <mgorman@suse.de>
Link: https://lore.kernel.org/r/20230819141239.287290-2-yury.norov@gmail.com
---
 include/linux/numa.h |  7 +++++--
 mm/mempolicy.c       | 18 +++++++++++-------
 2 files changed, 16 insertions(+), 9 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/numa.h b/include/linux/numa.h
index 59df211d051f..fb30a42f0700 100644
--- a/include/linux/numa.h
+++ b/include/linux/numa.h
@@ -25,7 +25,7 @@
 #include <asm/sparsemem.h>
 
 /* Generic implementation available */
-int numa_map_to_online_node(int node);
+int numa_nearest_node(int node, unsigned int state);
 
 #ifndef memory_add_physaddr_to_nid
 static inline int memory_add_physaddr_to_nid(u64 start)
@@ -44,10 +44,11 @@ static inline int phys_to_target_node(u64 start)
 }
 #endif
 #else /* !CONFIG_NUMA */
-static inline int numa_map_to_online_node(int node)
+static inline int numa_nearest_node(int node, unsigned int state)
 {
 	return NUMA_NO_NODE;
 }
+
 static inline int memory_add_physaddr_to_nid(u64 start)
 {
 	return 0;
@@ -58,6 +59,8 @@ static inline int phys_to_target_node(u64 start)
 }
 #endif
 
+#define numa_map_to_online_node(node) numa_nearest_node(node, N_ONLINE)
+
 #ifdef CONFIG_HAVE_ARCH_NODE_DEV_GROUP
 extern const struct attribute_group arch_node_dev_group;
 #endif
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 42b5567e3773..d4c0fff79758 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -131,22 +131,26 @@ static struct mempolicy default_policy = {
 static struct mempolicy preferred_node_policy[MAX_NUMNODES];
 
 /**
- * numa_map_to_online_node - Find closest online node
+ * numa_nearest_node - Find nearest node by state
  * @node: Node id to start the search
+ * @state: State to filter the search
  *
- * Lookup the next closest node by distance if @nid is not online.
+ * Lookup the closest node by distance if @nid is not in state.
  *
- * Return: this @node if it is online, otherwise the closest node by distance
+ * Return: this @node if it is in state, otherwise the closest node by distance
  */
-int numa_map_to_online_node(int node)
+int numa_nearest_node(int node, unsigned int state)
 {
 	int min_dist = INT_MAX, dist, n, min_node;
 
-	if (node == NUMA_NO_NODE || node_online(node))
+	if (state >= NR_NODE_STATES)
+		return -EINVAL;
+
+	if (node == NUMA_NO_NODE || node_state(node, state))
 		return node;
 
 	min_node = node;
-	for_each_online_node(n) {
+	for_each_node_state(n, state) {
 		dist = node_distance(node, n);
 		if (dist < min_dist) {
 			min_dist = dist;
@@ -156,7 +160,7 @@ int numa_map_to_online_node(int node)
 
 	return min_node;
 }
-EXPORT_SYMBOL_GPL(numa_map_to_online_node);
+EXPORT_SYMBOL_GPL(numa_nearest_node);
 
 struct mempolicy *get_task_policy(struct task_struct *p)
 {
-- 
cgit v1.2.3


From 8ab63d418d4339d996f80d02a00dbce0aa3ff972 Mon Sep 17 00:00:00 2001
From: Yury Norov <yury.norov@gmail.com>
Date: Sat, 19 Aug 2023 07:12:36 -0700
Subject: sched/topology: Fix sched_numa_find_nth_cpu() in non-NUMA case

When CONFIG_NUMA is enabled, sched_numa_find_nth_cpu() searches for a
CPU in sched_domains_numa_masks. The masks includes only online CPUs,
so effectively offline CPUs are skipped.

When CONFIG_NUMA is disabled, the fallback function should be consistent.

Fixes: cd7f55359c90 ("sched: add sched_numa_find_nth_cpu()")
Signed-off-by: Yury Norov <yury.norov@gmail.com>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Cc: Mel Gorman <mgorman@suse.de>
Link: https://lore.kernel.org/r/20230819141239.287290-5-yury.norov@gmail.com
---
 include/linux/topology.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/topology.h b/include/linux/topology.h
index fea32377f7c7..52f5850730b3 100644
--- a/include/linux/topology.h
+++ b/include/linux/topology.h
@@ -251,7 +251,7 @@ extern const struct cpumask *sched_numa_hop_mask(unsigned int node, unsigned int
 #else
 static __always_inline int sched_numa_find_nth_cpu(const struct cpumask *cpus, int cpu, int node)
 {
-	return cpumask_nth(cpu, cpus);
+	return cpumask_nth_and(cpu, cpus, cpu_online_mask);
 }
 
 static inline const struct cpumask *
-- 
cgit v1.2.3


From 8f908db77782630c45ba29dac35c434b5ce0b730 Mon Sep 17 00:00:00 2001
From: Jiri Olsa <jolsa@kernel.org>
Date: Fri, 15 Sep 2023 10:34:27 -0700
Subject: bpf: Fix BTF_ID symbol generation collision

Marcus and Satya reported an issue where BTF_ID macro generates same
symbol in separate objects and that breaks final vmlinux link.

ld.lld: error: ld-temp.o <inline asm>:14577:1: symbol
'__BTF_ID__struct__cgroup__624' is already defined

This can be triggered under specific configs when __COUNTER__ happens to
be the same for the same symbol in two different translation units,
which is already quite unlikely to happen.

Add __LINE__ number suffix to make BTF_ID symbol more unique, which is
not a complete fix, but it would help for now and meanwhile we can work
on better solution as suggested by Andrii.

Cc: stable@vger.kernel.org
Reported-by: Satya Durga Srinivasu Prabhala <quic_satyap@quicinc.com>
Reported-by: Marcus Seyfarth <m.seyfarth@gmail.com>
Closes: https://github.com/ClangBuiltLinux/linux/issues/1913
Debugged-by: Nathan Chancellor <nathan@kernel.org>
Link: https://lore.kernel.org/bpf/CAEf4Bzb5KQ2_LmhN769ifMeSJaWfebccUasQOfQKaOd0nQ51tw@mail.gmail.com/
Signed-off-by: Jiri Olsa <jolsa@kernel.org>
Signed-off-by: Nick Desaulniers <ndesaulniers@google.com>
Reviewed-by: Nathan Chancellor <nathan@kernel.org>
Link: https://lore.kernel.org/r/20230915-bpf_collision-v3-1-263fc519c21f@google.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/btf_ids.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/btf_ids.h b/include/linux/btf_ids.h
index a3462a9b8e18..a9cb10b0e2e9 100644
--- a/include/linux/btf_ids.h
+++ b/include/linux/btf_ids.h
@@ -49,7 +49,7 @@ word							\
 	____BTF_ID(symbol, word)
 
 #define __ID(prefix) \
-	__PASTE(prefix, __COUNTER__)
+	__PASTE(__PASTE(prefix, __COUNTER__), __LINE__)
 
 /*
  * The BTF_ID defines unique symbol for each ID pointing
-- 
cgit v1.2.3


From 32e4fa37fa667fdf53499b9de92737dc75199d8e Mon Sep 17 00:00:00 2001
From: Olaf Hering <olaf@aepfle.de>
Date: Mon, 4 Sep 2023 14:13:50 +0200
Subject: cpu/hotplug: Remove unused cpuhp_state CPUHP_AP_X86_VDSO_VMA_ONLINE

Commit b2e2ba578e01 ("x86/vdso: Initialize the CPU/node NR segment
descriptor earlier") removed the single user of this constant.

Remove it to reduce the size of cpuhp_hp_states[].

Signed-off-by: Olaf Hering <olaf@aepfle.de>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Link: https://lore.kernel.org/r/20230904121350.18055-1-olaf@aepfle.de
---
 include/linux/cpuhotplug.h | 1 -
 1 file changed, 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/cpuhotplug.h b/include/linux/cpuhotplug.h
index 06dda85f0424..cd8bd6ed04f9 100644
--- a/include/linux/cpuhotplug.h
+++ b/include/linux/cpuhotplug.h
@@ -205,7 +205,6 @@ enum cpuhp_state {
 	CPUHP_AP_KVM_ONLINE,
 	CPUHP_AP_SCHED_WAIT_EMPTY,
 	CPUHP_AP_SMPBOOT_THREADS,
-	CPUHP_AP_X86_VDSO_VMA_ONLINE,
 	CPUHP_AP_IRQ_AFFINITY_ONLINE,
 	CPUHP_AP_BLK_MQ_ONLINE,
 	CPUHP_AP_ARM_MVEBU_SYNC_CLOCKS,
-- 
cgit v1.2.3


From c656a4d5484ad99e97de549a9affc12a91d94963 Mon Sep 17 00:00:00 2001
From: Anna Schumaker <Anna.Schumaker@Netapp.com>
Date: Fri, 15 Sep 2023 15:46:08 -0400
Subject: Revert "SUNRPC: clean up integer overflow check"

This reverts commit e87cf8a28e7592bd19064e8181324ae26bc02932.

This commit was added to silence a tautological comparison warning, but
removing the 'len' value check before calling xdr_inline_decode() is
really not what we want.

Signed-off-by: Anna Schumaker <Anna.Schumaker@Netapp.com>
---
 include/linux/sunrpc/xdr.h | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/sunrpc/xdr.h b/include/linux/sunrpc/xdr.h
index 5b4fb3c791bc..896a6d2a9cf0 100644
--- a/include/linux/sunrpc/xdr.h
+++ b/include/linux/sunrpc/xdr.h
@@ -779,7 +779,9 @@ xdr_stream_decode_uint32_array(struct xdr_stream *xdr,
 
 	if (unlikely(xdr_stream_decode_u32(xdr, &len) < 0))
 		return -EBADMSG;
-	p = xdr_inline_decode(xdr, size_mul(len, sizeof(*p)));
+	if (len > SIZE_MAX / sizeof(*p))
+		return -EBADMSG;
+	p = xdr_inline_decode(xdr, len * sizeof(*p));
 	if (unlikely(!p))
 		return -EBADMSG;
 	if (array == NULL)
-- 
cgit v1.2.3


From 993b5662f302628db4eb358d69b2720c88cbfaf0 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <trond.myklebust@hammerspace.com>
Date: Wed, 13 Sep 2023 16:12:33 -0400
Subject: SUNRPC: Silence compiler complaints about tautological comparisons

On 64-bit systems, the compiler will complain that the comparison
between SIZE_MAX and the 32-bit unsigned int 'len' is unnecessary.

Signed-off-by: Trond Myklebust <trond.myklebust@hammerspace.com>
Signed-off-by: Anna Schumaker <Anna.Schumaker@Netapp.com>
---
 include/linux/sunrpc/xdr.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/sunrpc/xdr.h b/include/linux/sunrpc/xdr.h
index 896a6d2a9cf0..2f8dc47f1eb0 100644
--- a/include/linux/sunrpc/xdr.h
+++ b/include/linux/sunrpc/xdr.h
@@ -779,7 +779,7 @@ xdr_stream_decode_uint32_array(struct xdr_stream *xdr,
 
 	if (unlikely(xdr_stream_decode_u32(xdr, &len) < 0))
 		return -EBADMSG;
-	if (len > SIZE_MAX / sizeof(*p))
+	if (U32_MAX >= SIZE_MAX / sizeof(*p) && len > SIZE_MAX / sizeof(*p))
 		return -EBADMSG;
 	p = xdr_inline_decode(xdr, len * sizeof(*p));
 	if (unlikely(!p))
-- 
cgit v1.2.3


From daabb2b098e04753fa3d1b1feed13e5a61bef61c Mon Sep 17 00:00:00 2001
From: Puranjay Mohan <puranjay12@gmail.com>
Date: Thu, 7 Sep 2023 23:05:49 +0000
Subject: bpf/tests: add tests for cpuv4 instructions

The BPF JITs now support cpuv4 instructions. Add tests for these new
instructions to the test suite:

1. Sign extended Load
2. Sign extended Mov
3. Unconditional byte swap
4. Unconditional jump with 32-bit offset
5. Signed division and modulo

Signed-off-by: Puranjay Mohan <puranjay12@gmail.com>
Link: https://lore.kernel.org/r/20230907230550.1417590-9-puranjay12@gmail.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/filter.h |  50 ++++++-
 lib/test_bpf.c         | 371 +++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 417 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/filter.h b/include/linux/filter.h
index 761af6b3cf2b..0138832ad571 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -117,21 +117,25 @@ struct ctl_table_header;
 
 /* ALU ops on immediates, bpf_add|sub|...: dst_reg += imm32 */
 
-#define BPF_ALU64_IMM(OP, DST, IMM)				\
+#define BPF_ALU64_IMM_OFF(OP, DST, IMM, OFF)			\
 	((struct bpf_insn) {					\
 		.code  = BPF_ALU64 | BPF_OP(OP) | BPF_K,	\
 		.dst_reg = DST,					\
 		.src_reg = 0,					\
-		.off   = 0,					\
+		.off   = OFF,					\
 		.imm   = IMM })
+#define BPF_ALU64_IMM(OP, DST, IMM)				\
+	BPF_ALU64_IMM_OFF(OP, DST, IMM, 0)
 
-#define BPF_ALU32_IMM(OP, DST, IMM)				\
+#define BPF_ALU32_IMM_OFF(OP, DST, IMM, OFF)			\
 	((struct bpf_insn) {					\
 		.code  = BPF_ALU | BPF_OP(OP) | BPF_K,		\
 		.dst_reg = DST,					\
 		.src_reg = 0,					\
-		.off   = 0,					\
+		.off   = OFF,					\
 		.imm   = IMM })
+#define BPF_ALU32_IMM(OP, DST, IMM)				\
+	BPF_ALU32_IMM_OFF(OP, DST, IMM, 0)
 
 /* Endianess conversion, cpu_to_{l,b}e(), {l,b}e_to_cpu() */
 
@@ -143,6 +147,16 @@ struct ctl_table_header;
 		.off   = 0,					\
 		.imm   = LEN })
 
+/* Byte Swap, bswap16/32/64 */
+
+#define BPF_BSWAP(DST, LEN)					\
+	((struct bpf_insn) {					\
+		.code  = BPF_ALU64 | BPF_END | BPF_SRC(BPF_TO_LE),	\
+		.dst_reg = DST,					\
+		.src_reg = 0,					\
+		.off   = 0,					\
+		.imm   = LEN })
+
 /* Short form of mov, dst_reg = src_reg */
 
 #define BPF_MOV64_REG(DST, SRC)					\
@@ -179,6 +193,24 @@ struct ctl_table_header;
 		.off   = 0,					\
 		.imm   = IMM })
 
+/* Short form of movsx, dst_reg = (s8,s16,s32)src_reg */
+
+#define BPF_MOVSX64_REG(DST, SRC, OFF)				\
+	((struct bpf_insn) {					\
+		.code  = BPF_ALU64 | BPF_MOV | BPF_X,		\
+		.dst_reg = DST,					\
+		.src_reg = SRC,					\
+		.off   = OFF,					\
+		.imm   = 0 })
+
+#define BPF_MOVSX32_REG(DST, SRC, OFF)				\
+	((struct bpf_insn) {					\
+		.code  = BPF_ALU | BPF_MOV | BPF_X,		\
+		.dst_reg = DST,					\
+		.src_reg = SRC,					\
+		.off   = OFF,					\
+		.imm   = 0 })
+
 /* Special form of mov32, used for doing explicit zero extension on dst. */
 #define BPF_ZEXT_REG(DST)					\
 	((struct bpf_insn) {					\
@@ -263,6 +295,16 @@ static inline bool insn_is_zext(const struct bpf_insn *insn)
 		.off   = OFF,					\
 		.imm   = 0 })
 
+/* Memory load, dst_reg = *(signed size *) (src_reg + off16) */
+
+#define BPF_LDX_MEMSX(SIZE, DST, SRC, OFF)			\
+	((struct bpf_insn) {					\
+		.code  = BPF_LDX | BPF_SIZE(SIZE) | BPF_MEMSX,	\
+		.dst_reg = DST,					\
+		.src_reg = SRC,					\
+		.off   = OFF,					\
+		.imm   = 0 })
+
 /* Memory store, *(uint *) (dst_reg + off16) = src_reg */
 
 #define BPF_STX_MEM(SIZE, DST, SRC, OFF)			\
diff --git a/lib/test_bpf.c b/lib/test_bpf.c
index ecde4216201e..7916503e6a6a 100644
--- a/lib/test_bpf.c
+++ b/lib/test_bpf.c
@@ -5111,6 +5111,104 @@ static struct bpf_test tests[] = {
 		{ },
 		{ { 0, 0xffffffff } }
 	},
+	/* MOVSX32 */
+	{
+		"ALU_MOVSX | BPF_B",
+		.u.insns_int = {
+			BPF_LD_IMM64(R2, 0x00000000ffffffefLL),
+			BPF_LD_IMM64(R3, 0xdeadbeefdeadbeefLL),
+			BPF_MOVSX32_REG(R1, R3, 8),
+			BPF_JMP_REG(BPF_JEQ, R2, R1, 2),
+			BPF_MOV32_IMM(R0, 2),
+			BPF_EXIT_INSN(),
+			BPF_MOV32_IMM(R0, 1),
+			BPF_EXIT_INSN(),
+		},
+		INTERNAL,
+		{ },
+		{ { 0, 0x1 } },
+	},
+	{
+		"ALU_MOVSX | BPF_H",
+		.u.insns_int = {
+			BPF_LD_IMM64(R2, 0x00000000ffffbeefLL),
+			BPF_LD_IMM64(R3, 0xdeadbeefdeadbeefLL),
+			BPF_MOVSX32_REG(R1, R3, 16),
+			BPF_JMP_REG(BPF_JEQ, R2, R1, 2),
+			BPF_MOV32_IMM(R0, 2),
+			BPF_EXIT_INSN(),
+			BPF_MOV32_IMM(R0, 1),
+			BPF_EXIT_INSN(),
+		},
+		INTERNAL,
+		{ },
+		{ { 0, 0x1 } },
+	},
+	{
+		"ALU_MOVSX | BPF_W",
+		.u.insns_int = {
+			BPF_LD_IMM64(R2, 0x00000000deadbeefLL),
+			BPF_LD_IMM64(R3, 0xdeadbeefdeadbeefLL),
+			BPF_MOVSX32_REG(R1, R3, 32),
+			BPF_JMP_REG(BPF_JEQ, R2, R1, 2),
+			BPF_MOV32_IMM(R0, 2),
+			BPF_EXIT_INSN(),
+			BPF_MOV32_IMM(R0, 1),
+			BPF_EXIT_INSN(),
+		},
+		INTERNAL,
+		{ },
+		{ { 0, 0x1 } },
+	},
+	/* MOVSX64 REG */
+	{
+		"ALU64_MOVSX | BPF_B",
+		.u.insns_int = {
+			BPF_LD_IMM64(R2, 0xffffffffffffffefLL),
+			BPF_LD_IMM64(R3, 0xdeadbeefdeadbeefLL),
+			BPF_MOVSX64_REG(R1, R3, 8),
+			BPF_JMP_REG(BPF_JEQ, R2, R1, 2),
+			BPF_MOV32_IMM(R0, 2),
+			BPF_EXIT_INSN(),
+			BPF_MOV32_IMM(R0, 1),
+			BPF_EXIT_INSN(),
+		},
+		INTERNAL,
+		{ },
+		{ { 0, 0x1 } },
+	},
+	{
+		"ALU64_MOVSX | BPF_H",
+		.u.insns_int = {
+			BPF_LD_IMM64(R2, 0xffffffffffffbeefLL),
+			BPF_LD_IMM64(R3, 0xdeadbeefdeadbeefLL),
+			BPF_MOVSX64_REG(R1, R3, 16),
+			BPF_JMP_REG(BPF_JEQ, R2, R1, 2),
+			BPF_MOV32_IMM(R0, 2),
+			BPF_EXIT_INSN(),
+			BPF_MOV32_IMM(R0, 1),
+			BPF_EXIT_INSN(),
+		},
+		INTERNAL,
+		{ },
+		{ { 0, 0x1 } },
+	},
+	{
+		"ALU64_MOVSX | BPF_W",
+		.u.insns_int = {
+			BPF_LD_IMM64(R2, 0xffffffffdeadbeefLL),
+			BPF_LD_IMM64(R3, 0xdeadbeefdeadbeefLL),
+			BPF_MOVSX64_REG(R1, R3, 32),
+			BPF_JMP_REG(BPF_JEQ, R2, R1, 2),
+			BPF_MOV32_IMM(R0, 2),
+			BPF_EXIT_INSN(),
+			BPF_MOV32_IMM(R0, 1),
+			BPF_EXIT_INSN(),
+		},
+		INTERNAL,
+		{ },
+		{ { 0, 0x1 } },
+	},
 	/* BPF_ALU | BPF_ADD | BPF_X */
 	{
 		"ALU_ADD_X: 1 + 2 = 3",
@@ -6105,6 +6203,106 @@ static struct bpf_test tests[] = {
 		{ },
 		{ { 0, 2 } },
 	},
+	/* BPF_ALU | BPF_DIV | BPF_X off=1 (SDIV) */
+	{
+		"ALU_SDIV_X: -6 / 2 = -3",
+		.u.insns_int = {
+			BPF_LD_IMM64(R0, -6),
+			BPF_ALU32_IMM(BPF_MOV, R1, 2),
+			BPF_ALU32_REG_OFF(BPF_DIV, R0, R1, 1),
+			BPF_EXIT_INSN(),
+		},
+		INTERNAL,
+		{ },
+		{ { 0, -3 } },
+	},
+	/* BPF_ALU | BPF_DIV | BPF_K off=1 (SDIV) */
+	{
+		"ALU_SDIV_K: -6 / 2 = -3",
+		.u.insns_int = {
+			BPF_LD_IMM64(R0, -6),
+			BPF_ALU32_IMM_OFF(BPF_DIV, R0, 2, 1),
+			BPF_EXIT_INSN(),
+		},
+		INTERNAL,
+		{ },
+		{ { 0, -3 } },
+	},
+	/* BPF_ALU64 | BPF_DIV | BPF_X off=1 (SDIV64) */
+	{
+		"ALU64_SDIV_X: -6 / 2 = -3",
+		.u.insns_int = {
+			BPF_LD_IMM64(R0, -6),
+			BPF_ALU32_IMM(BPF_MOV, R1, 2),
+			BPF_ALU64_REG_OFF(BPF_DIV, R0, R1, 1),
+			BPF_EXIT_INSN(),
+		},
+		INTERNAL,
+		{ },
+		{ { 0, -3 } },
+	},
+	/* BPF_ALU64 | BPF_DIV | BPF_K off=1 (SDIV64) */
+	{
+		"ALU64_SDIV_K: -6 / 2 = -3",
+		.u.insns_int = {
+			BPF_LD_IMM64(R0, -6),
+			BPF_ALU64_IMM_OFF(BPF_DIV, R0, 2, 1),
+			BPF_EXIT_INSN(),
+		},
+		INTERNAL,
+		{ },
+		{ { 0, -3 } },
+	},
+	/* BPF_ALU | BPF_MOD | BPF_X off=1 (SMOD) */
+	{
+		"ALU_SMOD_X: -7 % 2 = -1",
+		.u.insns_int = {
+			BPF_LD_IMM64(R0, -7),
+			BPF_ALU32_IMM(BPF_MOV, R1, 2),
+			BPF_ALU32_REG_OFF(BPF_MOD, R0, R1, 1),
+			BPF_EXIT_INSN(),
+		},
+		INTERNAL,
+		{ },
+		{ { 0, -1 } },
+	},
+	/* BPF_ALU | BPF_MOD | BPF_K off=1 (SMOD) */
+	{
+		"ALU_SMOD_K: -7 % 2 = -1",
+		.u.insns_int = {
+			BPF_LD_IMM64(R0, -7),
+			BPF_ALU32_IMM_OFF(BPF_MOD, R0, 2, 1),
+			BPF_EXIT_INSN(),
+		},
+		INTERNAL,
+		{ },
+		{ { 0, -1 } },
+	},
+	/* BPF_ALU64 | BPF_MOD | BPF_X off=1 (SMOD64) */
+	{
+		"ALU64_SMOD_X: -7 % 2 = -1",
+		.u.insns_int = {
+			BPF_LD_IMM64(R0, -7),
+			BPF_ALU32_IMM(BPF_MOV, R1, 2),
+			BPF_ALU64_REG_OFF(BPF_MOD, R0, R1, 1),
+			BPF_EXIT_INSN(),
+		},
+		INTERNAL,
+		{ },
+		{ { 0, -1 } },
+	},
+	/* BPF_ALU64 | BPF_MOD | BPF_K off=1 (SMOD64) */
+	{
+		"ALU64_SMOD_X: -7 % 2 = -1",
+		.u.insns_int = {
+			BPF_LD_IMM64(R0, -7),
+			BPF_ALU64_IMM_OFF(BPF_MOD, R0, 2, 1),
+			BPF_EXIT_INSN(),
+		},
+		INTERNAL,
+		{ },
+		{ { 0, -1 } },
+	},
 	/* BPF_ALU | BPF_AND | BPF_X */
 	{
 		"ALU_AND_X: 3 & 2 = 2",
@@ -7837,6 +8035,104 @@ static struct bpf_test tests[] = {
 		{ },
 		{ { 0, (u32) (cpu_to_le64(0xfedcba9876543210ULL) >> 32) } },
 	},
+	/* BSWAP */
+	{
+		"BSWAP 16: 0x0123456789abcdef -> 0xefcd",
+		.u.insns_int = {
+			BPF_LD_IMM64(R0, 0x0123456789abcdefLL),
+			BPF_BSWAP(R0, 16),
+			BPF_EXIT_INSN(),
+		},
+		INTERNAL,
+		{ },
+		{ { 0, 0xefcd } },
+	},
+	{
+		"BSWAP 32: 0x0123456789abcdef -> 0xefcdab89",
+		.u.insns_int = {
+			BPF_LD_IMM64(R0, 0x0123456789abcdefLL),
+			BPF_BSWAP(R0, 32),
+			BPF_ALU64_REG(BPF_MOV, R1, R0),
+			BPF_ALU64_IMM(BPF_RSH, R1, 32),
+			BPF_ALU32_REG(BPF_ADD, R0, R1), /* R1 = 0 */
+			BPF_EXIT_INSN(),
+		},
+		INTERNAL,
+		{ },
+		{ { 0, 0xefcdab89 } },
+	},
+	{
+		"BSWAP 64: 0x0123456789abcdef -> 0x67452301",
+		.u.insns_int = {
+			BPF_LD_IMM64(R0, 0x0123456789abcdefLL),
+			BPF_BSWAP(R0, 64),
+			BPF_EXIT_INSN(),
+		},
+		INTERNAL,
+		{ },
+		{ { 0, 0x67452301 } },
+	},
+	{
+		"BSWAP 64: 0x0123456789abcdef >> 32 -> 0xefcdab89",
+		.u.insns_int = {
+			BPF_LD_IMM64(R0, 0x0123456789abcdefLL),
+			BPF_BSWAP(R0, 64),
+			BPF_ALU64_IMM(BPF_RSH, R0, 32),
+			BPF_EXIT_INSN(),
+		},
+		INTERNAL,
+		{ },
+		{ { 0, 0xefcdab89 } },
+	},
+	/* BSWAP, reversed */
+	{
+		"BSWAP 16: 0xfedcba9876543210 -> 0x1032",
+		.u.insns_int = {
+			BPF_LD_IMM64(R0, 0xfedcba9876543210ULL),
+			BPF_BSWAP(R0, 16),
+			BPF_EXIT_INSN(),
+		},
+		INTERNAL,
+		{ },
+		{ { 0, 0x1032 } },
+	},
+	{
+		"BSWAP 32: 0xfedcba9876543210 -> 0x10325476",
+		.u.insns_int = {
+			BPF_LD_IMM64(R0, 0xfedcba9876543210ULL),
+			BPF_BSWAP(R0, 32),
+			BPF_ALU64_REG(BPF_MOV, R1, R0),
+			BPF_ALU64_IMM(BPF_RSH, R1, 32),
+			BPF_ALU32_REG(BPF_ADD, R0, R1), /* R1 = 0 */
+			BPF_EXIT_INSN(),
+		},
+		INTERNAL,
+		{ },
+		{ { 0, 0x10325476 } },
+	},
+	{
+		"BSWAP 64: 0xfedcba9876543210 -> 0x98badcfe",
+		.u.insns_int = {
+			BPF_LD_IMM64(R0, 0xfedcba9876543210ULL),
+			BPF_BSWAP(R0, 64),
+			BPF_EXIT_INSN(),
+		},
+		INTERNAL,
+		{ },
+		{ { 0, 0x98badcfe } },
+	},
+	{
+		"BSWAP 64: 0xfedcba9876543210 >> 32 -> 0x10325476",
+		.u.insns_int = {
+			BPF_LD_IMM64(R0, 0xfedcba9876543210ULL),
+			BPF_BSWAP(R0, 64),
+			BPF_ALU64_IMM(BPF_RSH, R0, 32),
+			BPF_EXIT_INSN(),
+		},
+		INTERNAL,
+		{ },
+		{ { 0, 0x10325476 } },
+	},
 	/* BPF_LDX_MEM B/H/W/DW */
 	{
 		"BPF_LDX_MEM | BPF_B, base",
@@ -8228,6 +8524,67 @@ static struct bpf_test tests[] = {
 		{ { 32, 0 } },
 		.stack_depth = 0,
 	},
+	/* BPF_LDX_MEMSX B/H/W */
+	{
+		"BPF_LDX_MEMSX | BPF_B",
+		.u.insns_int = {
+			BPF_LD_IMM64(R1, 0xdead0000000000f0ULL),
+			BPF_LD_IMM64(R2, 0xfffffffffffffff0ULL),
+			BPF_STX_MEM(BPF_DW, R10, R1, -8),
+#ifdef __BIG_ENDIAN
+			BPF_LDX_MEMSX(BPF_B, R0, R10, -1),
+#else
+			BPF_LDX_MEMSX(BPF_B, R0, R10, -8),
+#endif
+			BPF_JMP_REG(BPF_JNE, R0, R2, 1),
+			BPF_ALU64_IMM(BPF_MOV, R0, 0),
+			BPF_EXIT_INSN(),
+		},
+		INTERNAL,
+		{ },
+		{ { 0, 0 } },
+		.stack_depth = 8,
+	},
+	{
+		"BPF_LDX_MEMSX | BPF_H",
+		.u.insns_int = {
+			BPF_LD_IMM64(R1, 0xdead00000000f123ULL),
+			BPF_LD_IMM64(R2, 0xfffffffffffff123ULL),
+			BPF_STX_MEM(BPF_DW, R10, R1, -8),
+#ifdef __BIG_ENDIAN
+			BPF_LDX_MEMSX(BPF_H, R0, R10, -2),
+#else
+			BPF_LDX_MEMSX(BPF_H, R0, R10, -8),
+#endif
+			BPF_JMP_REG(BPF_JNE, R0, R2, 1),
+			BPF_ALU64_IMM(BPF_MOV, R0, 0),
+			BPF_EXIT_INSN(),
+		},
+		INTERNAL,
+		{ },
+		{ { 0, 0 } },
+		.stack_depth = 8,
+	},
+	{
+		"BPF_LDX_MEMSX | BPF_W",
+		.u.insns_int = {
+			BPF_LD_IMM64(R1, 0x00000000deadbeefULL),
+			BPF_LD_IMM64(R2, 0xffffffffdeadbeefULL),
+			BPF_STX_MEM(BPF_DW, R10, R1, -8),
+#ifdef __BIG_ENDIAN
+			BPF_LDX_MEMSX(BPF_W, R0, R10, -4),
+#else
+			BPF_LDX_MEMSX(BPF_W, R0, R10, -8),
+#endif
+			BPF_JMP_REG(BPF_JNE, R0, R2, 1),
+			BPF_ALU64_IMM(BPF_MOV, R0, 0),
+			BPF_EXIT_INSN(),
+		},
+		INTERNAL,
+		{ },
+		{ { 0, 0 } },
+		.stack_depth = 8,
+	},
 	/* BPF_STX_MEM B/H/W/DW */
 	{
 		"BPF_STX_MEM | BPF_B",
@@ -9474,6 +9831,20 @@ static struct bpf_test tests[] = {
 		{ },
 		{ { 0, 1 } },
 	},
+	/* BPF_JMP32 | BPF_JA */
+	{
+		"JMP32_JA: Unconditional jump: if (true) return 1",
+		.u.insns_int = {
+			BPF_ALU32_IMM(BPF_MOV, R0, 0),
+			BPF_JMP32_IMM(BPF_JA, 0, 1, 0),
+			BPF_EXIT_INSN(),
+			BPF_ALU32_IMM(BPF_MOV, R0, 1),
+			BPF_EXIT_INSN(),
+		},
+		INTERNAL,
+		{ },
+		{ { 0, 1 } },
+	},
 	/* BPF_JMP | BPF_JSLT | BPF_K */
 	{
 		"JMP_JSLT_K: Signed jump: if (-2 < -1) return 1",
-- 
cgit v1.2.3


From 3868ab0f192581eff978501a05f3dc2e01541d77 Mon Sep 17 00:00:00 2001
From: Aananth V <aananthv@google.com>
Date: Thu, 14 Sep 2023 14:36:21 +0000
Subject: tcp: new TCP_INFO stats for RTO events

The 2023 SIGCOMM paper "Improving Network Availability with Protective
ReRoute" has indicated Linux TCP's RTO-triggered txhash rehashing can
effectively reduce application disruption during outages. To better
measure the efficacy of this feature, this patch adds three more
detailed stats during RTO recovery and exports via TCP_INFO.
Applications and monitoring systems can leverage this data to measure
the network path diversity and end-to-end repair latency during network
outages to improve their network infrastructure.

The following counters are added to tcp_sock in order to track RTO
events over the lifetime of a TCP socket.

1. u16 total_rto - Counts the total number of RTO timeouts.
2. u16 total_rto_recoveries - Counts the total number of RTO recoveries.
3. u32 total_rto_time - Counts the total time spent (ms) in RTO
                        recoveries. (time spent in CA_Loss and
                        CA_Recovery states)

To compute total_rto_time, we add a new u32 rto_stamp field to
tcp_sock. rto_stamp records the start timestamp (ms) of the last RTO
recovery (CA_Loss).

Corresponding fields are also added to the tcp_info struct.

Signed-off-by: Aananth V <aananthv@google.com>
Signed-off-by: Neal Cardwell <ncardwell@google.com>
Signed-off-by: Yuchung Cheng <ycheng@google.com>
Reviewed-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/tcp.h      |  8 ++++++++
 include/uapi/linux/tcp.h | 12 ++++++++++++
 net/ipv4/tcp.c           |  9 +++++++++
 net/ipv4/tcp_input.c     | 15 +++++++++++++++
 net/ipv4/tcp_minisocks.c |  4 ++++
 net/ipv4/tcp_timer.c     | 17 +++++++++++++++--
 6 files changed, 63 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/tcp.h b/include/linux/tcp.h
index 44d946161d4a..e15452df9804 100644
--- a/include/linux/tcp.h
+++ b/include/linux/tcp.h
@@ -377,6 +377,14 @@ struct tcp_sock {
 				 * Total data bytes retransmitted
 				 */
 	u32	total_retrans;	/* Total retransmits for entire connection */
+	u32	rto_stamp;	/* Start time (ms) of last CA_Loss recovery */
+	u16	total_rto;	/* Total number of RTO timeouts, including
+				 * SYN/SYN-ACK and recurring timeouts.
+				 */
+	u16	total_rto_recoveries;	/* Total number of RTO recoveries,
+					 * including any unfinished recovery.
+					 */
+	u32	total_rto_time;	/* ms spent in (completed) RTO recoveries. */
 
 	u32	urg_seq;	/* Seq of received urgent pointer */
 	unsigned int		keepalive_time;	  /* time before keep alive takes place */
diff --git a/include/uapi/linux/tcp.h b/include/uapi/linux/tcp.h
index 879eeb0a084b..d1d08da6331a 100644
--- a/include/uapi/linux/tcp.h
+++ b/include/uapi/linux/tcp.h
@@ -289,6 +289,18 @@ struct tcp_info {
 				      */
 
 	__u32   tcpi_rehash;         /* PLB or timeout triggered rehash attempts */
+
+	__u16	tcpi_total_rto;	/* Total number of RTO timeouts, including
+				 * SYN/SYN-ACK and recurring timeouts.
+				 */
+	__u16	tcpi_total_rto_recoveries;	/* Total number of RTO
+						 * recoveries, including any
+						 * unfinished recovery.
+						 */
+	__u32	tcpi_total_rto_time;	/* Total time spent in RTO recoveries
+					 * in milliseconds, including any
+					 * unfinished recovery.
+					 */
 };
 
 /* netlink attributes types for SCM_TIMESTAMPING_OPT_STATS */
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 0c3040a63ebd..69b8d7073708 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -3818,6 +3818,15 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info)
 	info->tcpi_rcv_wnd = tp->rcv_wnd;
 	info->tcpi_rehash = tp->plb_rehash + tp->timeout_rehash;
 	info->tcpi_fastopen_client_fail = tp->fastopen_client_fail;
+
+	info->tcpi_total_rto = tp->total_rto;
+	info->tcpi_total_rto_recoveries = tp->total_rto_recoveries;
+	info->tcpi_total_rto_time = tp->total_rto_time;
+	if (tp->rto_stamp) {
+		info->tcpi_total_rto_time += tcp_time_stamp_raw() -
+						tp->rto_stamp;
+	}
+
 	unlock_sock_fast(sk, slow);
 }
 EXPORT_SYMBOL_GPL(tcp_get_info);
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 8d2c91703158..584825ddd0a0 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -2088,6 +2088,10 @@ void tcp_clear_retrans(struct tcp_sock *tp)
 	tp->undo_marker = 0;
 	tp->undo_retrans = -1;
 	tp->sacked_out = 0;
+	tp->rto_stamp = 0;
+	tp->total_rto = 0;
+	tp->total_rto_recoveries = 0;
+	tp->total_rto_time = 0;
 }
 
 static inline void tcp_init_undo(struct tcp_sock *tp)
@@ -2825,6 +2829,14 @@ void tcp_enter_recovery(struct sock *sk, bool ece_ack)
 	tcp_set_ca_state(sk, TCP_CA_Recovery);
 }
 
+static void tcp_update_rto_time(struct tcp_sock *tp)
+{
+	if (tp->rto_stamp) {
+		tp->total_rto_time += tcp_time_stamp(tp) - tp->rto_stamp;
+		tp->rto_stamp = 0;
+	}
+}
+
 /* Process an ACK in CA_Loss state. Move to CA_Open if lost data are
  * recovered or spurious. Otherwise retransmits more on partial ACKs.
  */
@@ -3029,6 +3041,8 @@ static void tcp_fastretrans_alert(struct sock *sk, const u32 prior_snd_una,
 		break;
 	case TCP_CA_Loss:
 		tcp_process_loss(sk, flag, num_dupack, rexmit);
+		if (icsk->icsk_ca_state != TCP_CA_Loss)
+			tcp_update_rto_time(tp);
 		tcp_identify_packet_loss(sk, ack_flag);
 		if (!(icsk->icsk_ca_state == TCP_CA_Open ||
 		      (*ack_flag & FLAG_LOST_RETRANS)))
@@ -6454,6 +6468,7 @@ static void tcp_rcv_synrecv_state_fastopen(struct sock *sk)
 		tcp_try_undo_recovery(sk);
 
 	/* Reset rtx states to prevent spurious retransmits_timed_out() */
+	tcp_update_rto_time(tp);
 	tp->retrans_stamp = 0;
 	inet_csk(sk)->icsk_retransmits = 0;
 
diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
index b98d476f1594..eee8ab1bfa0e 100644
--- a/net/ipv4/tcp_minisocks.c
+++ b/net/ipv4/tcp_minisocks.c
@@ -565,6 +565,10 @@ struct sock *tcp_create_openreq_child(const struct sock *sk,
 		newtp->undo_marker = treq->snt_isn;
 		newtp->retrans_stamp = div_u64(treq->snt_synack,
 					       USEC_PER_SEC / TCP_TS_HZ);
+		newtp->total_rto = req->num_timeout;
+		newtp->total_rto_recoveries = 1;
+		newtp->total_rto_time = tcp_time_stamp_raw() -
+						newtp->retrans_stamp;
 	}
 	newtp->tsoffset = treq->ts_off;
 #ifdef CONFIG_TCP_MD5SIG
diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c
index d7d64682b068..3f61c6a70a1f 100644
--- a/net/ipv4/tcp_timer.c
+++ b/net/ipv4/tcp_timer.c
@@ -415,6 +415,19 @@ abort:		tcp_write_err(sk);
 	}
 }
 
+static void tcp_update_rto_stats(struct sock *sk)
+{
+	struct inet_connection_sock *icsk = inet_csk(sk);
+	struct tcp_sock *tp = tcp_sk(sk);
+
+	if (!icsk->icsk_retransmits) {
+		tp->total_rto_recoveries++;
+		tp->rto_stamp = tcp_time_stamp(tp);
+	}
+	icsk->icsk_retransmits++;
+	tp->total_rto++;
+}
+
 /*
  *	Timer for Fast Open socket to retransmit SYNACK. Note that the
  *	sk here is the child socket, not the parent (listener) socket.
@@ -447,7 +460,7 @@ static void tcp_fastopen_synack_timer(struct sock *sk, struct request_sock *req)
 	 */
 	inet_rtx_syn_ack(sk, req);
 	req->num_timeout++;
-	icsk->icsk_retransmits++;
+	tcp_update_rto_stats(sk);
 	if (!tp->retrans_stamp)
 		tp->retrans_stamp = tcp_time_stamp(tp);
 	inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
@@ -575,7 +588,7 @@ void tcp_retransmit_timer(struct sock *sk)
 
 	tcp_enter_loss(sk);
 
-	icsk->icsk_retransmits++;
+	tcp_update_rto_stats(sk);
 	if (tcp_retransmit_skb(sk, tcp_rtx_queue_head(sk), 1) > 0) {
 		/* Retransmission failed because of local congestion,
 		 * Let senders fight for local resources conservatively.
-- 
cgit v1.2.3


From 9af27da6313c8f8c6a26c7ea3fe23d6b9664a3a8 Mon Sep 17 00:00:00 2001
From: Kumar Kartikeya Dwivedi <memxor@gmail.com>
Date: Wed, 13 Sep 2023 01:31:58 +0200
Subject: bpf: Use bpf_is_subprog to check for subprogs

We would like to know whether a bpf_prog corresponds to the main prog or
one of the subprogs. The current JIT implementations simply check this
using the func_idx in bpf_prog->aux->func_idx. When the index is 0, it
belongs to the main program, otherwise it corresponds to some
subprogram.

This will also be necessary to halt exception propagation while walking
the stack when an exception is thrown, so we add a simple helper
function to check this, named bpf_is_subprog, and convert existing JIT
implementations to also make use of it.

Signed-off-by: Kumar Kartikeya Dwivedi <memxor@gmail.com>
Link: https://lore.kernel.org/r/20230912233214.1518551-2-memxor@gmail.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 arch/arm64/net/bpf_jit_comp.c | 2 +-
 arch/s390/net/bpf_jit_comp.c  | 2 +-
 arch/x86/net/bpf_jit_comp.c   | 2 +-
 include/linux/bpf.h           | 5 +++++
 4 files changed, 8 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/arch/arm64/net/bpf_jit_comp.c b/arch/arm64/net/bpf_jit_comp.c
index 150d1c6543f7..7d4af64e3982 100644
--- a/arch/arm64/net/bpf_jit_comp.c
+++ b/arch/arm64/net/bpf_jit_comp.c
@@ -288,7 +288,7 @@ static bool is_lsi_offset(int offset, int scale)
 static int build_prologue(struct jit_ctx *ctx, bool ebpf_from_cbpf)
 {
 	const struct bpf_prog *prog = ctx->prog;
-	const bool is_main_prog = prog->aux->func_idx == 0;
+	const bool is_main_prog = !bpf_is_subprog(prog);
 	const u8 r6 = bpf2a64[BPF_REG_6];
 	const u8 r7 = bpf2a64[BPF_REG_7];
 	const u8 r8 = bpf2a64[BPF_REG_8];
diff --git a/arch/s390/net/bpf_jit_comp.c b/arch/s390/net/bpf_jit_comp.c
index de2fb12120d2..eeb42e5cd7d6 100644
--- a/arch/s390/net/bpf_jit_comp.c
+++ b/arch/s390/net/bpf_jit_comp.c
@@ -556,7 +556,7 @@ static void bpf_jit_prologue(struct bpf_jit *jit, struct bpf_prog *fp,
 	EMIT6_PCREL_RILC(0xc0040000, 0, jit->prologue_plt);
 	jit->prologue_plt_ret = jit->prg;
 
-	if (fp->aux->func_idx == 0) {
+	if (!bpf_is_subprog(fp)) {
 		/* Initialize the tail call counter in the main program. */
 		/* xc STK_OFF_TCCNT(4,%r15),STK_OFF_TCCNT(%r15) */
 		_EMIT6(0xd703f000 | STK_OFF_TCCNT, 0xf000 | STK_OFF_TCCNT);
diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c
index 2846c21d75bf..a0d03503b3cb 100644
--- a/arch/x86/net/bpf_jit_comp.c
+++ b/arch/x86/net/bpf_jit_comp.c
@@ -1049,7 +1049,7 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, u8 *rw_image
 
 	emit_prologue(&prog, bpf_prog->aux->stack_depth,
 		      bpf_prog_was_classic(bpf_prog), tail_call_reachable,
-		      bpf_prog->aux->func_idx != 0);
+		      bpf_is_subprog(bpf_prog));
 	push_callee_regs(&prog, callee_regs_used);
 
 	ilen = prog - temp;
diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index b9e573159432..9171b0b6a590 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -3194,4 +3194,9 @@ static inline gfp_t bpf_memcg_flags(gfp_t flags)
 	return flags;
 }
 
+static inline bool bpf_is_subprog(const struct bpf_prog *prog)
+{
+	return prog->aux->func_idx != 0;
+}
+
 #endif /* _LINUX_BPF_H */
-- 
cgit v1.2.3


From fd5d27b70188379bb441d404c29a0afb111e1753 Mon Sep 17 00:00:00 2001
From: Kumar Kartikeya Dwivedi <memxor@gmail.com>
Date: Wed, 13 Sep 2023 01:31:59 +0200
Subject: arch/x86: Implement arch_bpf_stack_walk

The plumbing for offline unwinding when we throw an exception in
programs would require walking the stack, hence introduce a new
arch_bpf_stack_walk function. This is provided when the JIT supports
exceptions, i.e. bpf_jit_supports_exceptions is true. The arch-specific
code is really minimal, hence it should be straightforward to extend
this support to other architectures as well, as it reuses the logic of
arch_stack_walk, but allowing access to unwind_state data.

Once the stack pointer and frame pointer are known for the main subprog
during the unwinding, we know the stack layout and location of any
callee-saved registers which must be restored before we return back to
the kernel. This handling will be added in the subsequent patches.

Note that while we primarily unwind through BPF frames, which are
effectively CONFIG_UNWINDER_FRAME_POINTER, we still need one of this or
CONFIG_UNWINDER_ORC to be able to unwind through the bpf_throw frame
from which we begin walking the stack. We also require both sp and bp
(stack and frame pointers) from the unwind_state structure, which are
only available when one of these two options are enabled.

Signed-off-by: Kumar Kartikeya Dwivedi <memxor@gmail.com>
Link: https://lore.kernel.org/r/20230912233214.1518551-3-memxor@gmail.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 arch/x86/net/bpf_jit_comp.c | 28 ++++++++++++++++++++++++++++
 include/linux/filter.h      |  2 ++
 kernel/bpf/core.c           |  9 +++++++++
 3 files changed, 39 insertions(+)

(limited to 'include/linux')

diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c
index a0d03503b3cb..d0c24b5a6abb 100644
--- a/arch/x86/net/bpf_jit_comp.c
+++ b/arch/x86/net/bpf_jit_comp.c
@@ -16,6 +16,7 @@
 #include <asm/set_memory.h>
 #include <asm/nospec-branch.h>
 #include <asm/text-patching.h>
+#include <asm/unwind.h>
 
 static u8 *emit_code(u8 *ptr, u32 bytes, unsigned int len)
 {
@@ -2933,3 +2934,30 @@ void bpf_jit_free(struct bpf_prog *prog)
 
 	bpf_prog_unlock_free(prog);
 }
+
+bool bpf_jit_supports_exceptions(void)
+{
+	/* We unwind through both kernel frames (starting from within bpf_throw
+	 * call) and BPF frames. Therefore we require one of ORC or FP unwinder
+	 * to be enabled to walk kernel frames and reach BPF frames in the stack
+	 * trace.
+	 */
+	return IS_ENABLED(CONFIG_UNWINDER_ORC) || IS_ENABLED(CONFIG_UNWINDER_FRAME_POINTER);
+}
+
+void arch_bpf_stack_walk(bool (*consume_fn)(void *cookie, u64 ip, u64 sp, u64 bp), void *cookie)
+{
+#if defined(CONFIG_UNWINDER_ORC) || defined(CONFIG_UNWINDER_FRAME_POINTER)
+	struct unwind_state state;
+	unsigned long addr;
+
+	for (unwind_start(&state, current, NULL, NULL); !unwind_done(&state);
+	     unwind_next_frame(&state)) {
+		addr = unwind_get_return_address(&state);
+		if (!addr || !consume_fn(cookie, (u64)addr, (u64)state.sp, (u64)state.bp))
+			break;
+	}
+	return;
+#endif
+	WARN(1, "verification of programs using bpf_throw should have failed\n");
+}
diff --git a/include/linux/filter.h b/include/linux/filter.h
index 0138832ad571..88874de974cb 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -954,6 +954,8 @@ bool bpf_jit_needs_zext(void);
 bool bpf_jit_supports_subprog_tailcalls(void);
 bool bpf_jit_supports_kfunc_call(void);
 bool bpf_jit_supports_far_kfunc_call(void);
+bool bpf_jit_supports_exceptions(void);
+void arch_bpf_stack_walk(bool (*consume_fn)(void *cookie, u64 ip, u64 sp, u64 bp), void *cookie);
 bool bpf_helper_changes_pkt_data(void *func);
 
 static inline bool bpf_dump_raw_ok(const struct cred *cred)
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index 95599df82ee4..c4ac084f2767 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -2914,6 +2914,15 @@ int __weak bpf_arch_text_invalidate(void *dst, size_t len)
 	return -ENOTSUPP;
 }
 
+bool __weak bpf_jit_supports_exceptions(void)
+{
+	return false;
+}
+
+void __weak arch_bpf_stack_walk(bool (*consume_fn)(void *cookie, u64 ip, u64 sp, u64 bp), void *cookie)
+{
+}
+
 #ifdef CONFIG_BPF_SYSCALL
 static int __init bpf_global_ma_init(void)
 {
-- 
cgit v1.2.3


From 335d1c5b545284d75ef96ee42e461eacefe865bb Mon Sep 17 00:00:00 2001
From: Kumar Kartikeya Dwivedi <memxor@gmail.com>
Date: Wed, 13 Sep 2023 01:32:00 +0200
Subject: bpf: Implement support for adding hidden subprogs

Introduce support in the verifier for generating a subprogram and
include it as part of a BPF program dynamically after the do_check phase
is complete. The first user will be the next patch which generates
default exception callbacks if none are set for the program. The phase
of invocation will be do_misc_fixups. Note that this is an internal
verifier function, and should be used with instruction blocks which
uphold the invariants stated in check_subprogs.

Since these subprogs are always appended to the end of the instruction
sequence of the program, it becomes relatively inexpensive to do the
related adjustments to the subprog_info of the program. Only the fake
exit subprogram is shifted forward, making room for our new subprog.

This is useful to insert a new subprogram, get it JITed, and obtain its
function pointer. The next patch will use this functionality to insert a
default exception callback which will be invoked after unwinding the
stack.

Note that these added subprograms are invisible to userspace, and never
reported in BPF_OBJ_GET_INFO_BY_ID etc. For now, only a single
subprogram is supported, but more can be easily supported in the future.

To this end, two function counts are introduced now, the existing
func_cnt, and real_func_cnt, the latter including hidden programs. This
allows us to conver the JIT code to use the real_func_cnt for management
of resources while syscall path continues working with existing
func_cnt.

Signed-off-by: Kumar Kartikeya Dwivedi <memxor@gmail.com>
Link: https://lore.kernel.org/r/20230912233214.1518551-4-memxor@gmail.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/bpf.h          |  1 +
 include/linux/bpf_verifier.h |  3 ++-
 kernel/bpf/core.c            | 12 ++++++------
 kernel/bpf/syscall.c         |  2 +-
 kernel/bpf/verifier.c        | 36 +++++++++++++++++++++++++++++++++---
 5 files changed, 43 insertions(+), 11 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 9171b0b6a590..c3667e95af59 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -1389,6 +1389,7 @@ struct bpf_prog_aux {
 	u32 stack_depth;
 	u32 id;
 	u32 func_cnt; /* used by non-func prog as the number of func progs */
+	u32 real_func_cnt; /* includes hidden progs, only used for JIT and freeing progs */
 	u32 func_idx; /* 0 for non-func prog, the index in func array for func prog */
 	u32 attach_btf_id; /* in-kernel BTF type id to attach to */
 	u32 ctx_arg_info_size;
diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h
index a3236651ec64..3c2a8636ab29 100644
--- a/include/linux/bpf_verifier.h
+++ b/include/linux/bpf_verifier.h
@@ -588,6 +588,7 @@ struct bpf_verifier_env {
 	u32 used_map_cnt;		/* number of used maps */
 	u32 used_btf_cnt;		/* number of used BTF objects */
 	u32 id_gen;			/* used to generate unique reg IDs */
+	u32 hidden_subprog_cnt;		/* number of hidden subprogs */
 	bool explore_alu_limits;
 	bool allow_ptr_leaks;
 	bool allow_uninit_stack;
@@ -598,7 +599,7 @@ struct bpf_verifier_env {
 	struct bpf_insn_aux_data *insn_aux_data; /* array of per-insn state */
 	const struct bpf_line_info *prev_linfo;
 	struct bpf_verifier_log log;
-	struct bpf_subprog_info subprog_info[BPF_MAX_SUBPROGS + 1];
+	struct bpf_subprog_info subprog_info[BPF_MAX_SUBPROGS + 2]; /* max + 2 for the fake and exception subprogs */
 	union {
 		struct bpf_idmap idmap_scratch;
 		struct bpf_idset idset_scratch;
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index c4ac084f2767..840ba952702d 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -212,7 +212,7 @@ void bpf_prog_fill_jited_linfo(struct bpf_prog *prog,
 	const struct bpf_line_info *linfo;
 	void **jited_linfo;
 
-	if (!prog->aux->jited_linfo)
+	if (!prog->aux->jited_linfo || prog->aux->func_idx > prog->aux->func_cnt)
 		/* Userspace did not provide linfo */
 		return;
 
@@ -539,7 +539,7 @@ static void bpf_prog_kallsyms_del_subprogs(struct bpf_prog *fp)
 {
 	int i;
 
-	for (i = 0; i < fp->aux->func_cnt; i++)
+	for (i = 0; i < fp->aux->real_func_cnt; i++)
 		bpf_prog_kallsyms_del(fp->aux->func[i]);
 }
 
@@ -589,7 +589,7 @@ bpf_prog_ksym_set_name(struct bpf_prog *prog)
 	sym  = bin2hex(sym, prog->tag, sizeof(prog->tag));
 
 	/* prog->aux->name will be ignored if full btf name is available */
-	if (prog->aux->func_info_cnt) {
+	if (prog->aux->func_info_cnt && prog->aux->func_idx < prog->aux->func_info_cnt) {
 		type = btf_type_by_id(prog->aux->btf,
 				      prog->aux->func_info[prog->aux->func_idx].type_id);
 		func_name = btf_name_by_offset(prog->aux->btf, type->name_off);
@@ -1208,7 +1208,7 @@ int bpf_jit_get_func_addr(const struct bpf_prog *prog,
 		if (!extra_pass)
 			addr = NULL;
 		else if (prog->aux->func &&
-			 off >= 0 && off < prog->aux->func_cnt)
+			 off >= 0 && off < prog->aux->real_func_cnt)
 			addr = (u8 *)prog->aux->func[off]->bpf_func;
 		else
 			return -EINVAL;
@@ -2721,7 +2721,7 @@ static void bpf_prog_free_deferred(struct work_struct *work)
 #endif
 	if (aux->dst_trampoline)
 		bpf_trampoline_put(aux->dst_trampoline);
-	for (i = 0; i < aux->func_cnt; i++) {
+	for (i = 0; i < aux->real_func_cnt; i++) {
 		/* We can just unlink the subprog poke descriptor table as
 		 * it was originally linked to the main program and is also
 		 * released along with it.
@@ -2729,7 +2729,7 @@ static void bpf_prog_free_deferred(struct work_struct *work)
 		aux->func[i]->aux->poke_tab = NULL;
 		bpf_jit_free(aux->func[i]);
 	}
-	if (aux->func_cnt) {
+	if (aux->real_func_cnt) {
 		kfree(aux->func);
 		bpf_prog_unlock_free(aux->prog);
 	} else {
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 6a692f3bea15..85c1d908f70f 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -2749,7 +2749,7 @@ free_used_maps:
 	 * period before we can tear down JIT memory since symbols
 	 * are already exposed under kallsyms.
 	 */
-	__bpf_prog_put_noref(prog, prog->aux->func_cnt);
+	__bpf_prog_put_noref(prog, prog->aux->real_func_cnt);
 	return err;
 free_prog_sec:
 	free_uid(prog->aux->user);
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 18e673c0ac15..39548e326d53 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -15210,7 +15210,8 @@ static void adjust_btf_func(struct bpf_verifier_env *env)
 	if (!aux->func_info)
 		return;
 
-	for (i = 0; i < env->subprog_cnt; i++)
+	/* func_info is not available for hidden subprogs */
+	for (i = 0; i < env->subprog_cnt - env->hidden_subprog_cnt; i++)
 		aux->func_info[i].insn_off = env->subprog_info[i].start;
 }
 
@@ -18151,7 +18152,8 @@ static int jit_subprogs(struct bpf_verifier_env *env)
 		 * the call instruction, as an index for this list
 		 */
 		func[i]->aux->func = func;
-		func[i]->aux->func_cnt = env->subprog_cnt;
+		func[i]->aux->func_cnt = env->subprog_cnt - env->hidden_subprog_cnt;
+		func[i]->aux->real_func_cnt = env->subprog_cnt;
 	}
 	for (i = 0; i < env->subprog_cnt; i++) {
 		old_bpf_func = func[i]->bpf_func;
@@ -18197,7 +18199,8 @@ static int jit_subprogs(struct bpf_verifier_env *env)
 	prog->aux->extable = func[0]->aux->extable;
 	prog->aux->num_exentries = func[0]->aux->num_exentries;
 	prog->aux->func = func;
-	prog->aux->func_cnt = env->subprog_cnt;
+	prog->aux->func_cnt = env->subprog_cnt - env->hidden_subprog_cnt;
+	prog->aux->real_func_cnt = env->subprog_cnt;
 	bpf_prog_jit_attempt_done(prog);
 	return 0;
 out_free:
@@ -18433,6 +18436,33 @@ static int fixup_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
 	return 0;
 }
 
+/* The function requires that first instruction in 'patch' is insnsi[prog->len - 1] */
+static __maybe_unused int add_hidden_subprog(struct bpf_verifier_env *env, struct bpf_insn *patch, int len)
+{
+	struct bpf_subprog_info *info = env->subprog_info;
+	int cnt = env->subprog_cnt;
+	struct bpf_prog *prog;
+
+	/* We only reserve one slot for hidden subprogs in subprog_info. */
+	if (env->hidden_subprog_cnt) {
+		verbose(env, "verifier internal error: only one hidden subprog supported\n");
+		return -EFAULT;
+	}
+	/* We're not patching any existing instruction, just appending the new
+	 * ones for the hidden subprog. Hence all of the adjustment operations
+	 * in bpf_patch_insn_data are no-ops.
+	 */
+	prog = bpf_patch_insn_data(env, env->prog->len - 1, patch, len);
+	if (!prog)
+		return -ENOMEM;
+	env->prog = prog;
+	info[cnt + 1].start = info[cnt].start;
+	info[cnt].start = prog->len - len + 1;
+	env->subprog_cnt++;
+	env->hidden_subprog_cnt++;
+	return 0;
+}
+
 /* Do various post-verification rewrites in a single program pass.
  * These rewrites simplify JIT and interpreter implementations.
  */
-- 
cgit v1.2.3


From f18b03fabaa9b7c80e80b72a621f481f0d706ae0 Mon Sep 17 00:00:00 2001
From: Kumar Kartikeya Dwivedi <memxor@gmail.com>
Date: Wed, 13 Sep 2023 01:32:01 +0200
Subject: bpf: Implement BPF exceptions

This patch implements BPF exceptions, and introduces a bpf_throw kfunc
to allow programs to throw exceptions during their execution at runtime.
A bpf_throw invocation is treated as an immediate termination of the
program, returning back to its caller within the kernel, unwinding all
stack frames.

This allows the program to simplify its implementation, by testing for
runtime conditions which the verifier has no visibility into, and assert
that they are true. In case they are not, the program can simply throw
an exception from the other branch.

BPF exceptions are explicitly *NOT* an unlikely slowpath error handling
primitive, and this objective has guided design choices of the
implementation of the them within the kernel (with the bulk of the cost
for unwinding the stack offloaded to the bpf_throw kfunc).

The implementation of this mechanism requires use of add_hidden_subprog
mechanism introduced in the previous patch, which generates a couple of
instructions to move R1 to R0 and exit. The JIT then rewrites the
prologue of this subprog to take the stack pointer and frame pointer as
inputs and reset the stack frame, popping all callee-saved registers
saved by the main subprog. The bpf_throw function then walks the stack
at runtime, and invokes this exception subprog with the stack and frame
pointers as parameters.

Reviewers must take note that currently the main program is made to save
all callee-saved registers on x86_64 during entry into the program. This
is because we must do an equivalent of a lightweight context switch when
unwinding the stack, therefore we need the callee-saved registers of the
caller of the BPF program to be able to return with a sane state.

Note that we have to additionally handle r12, even though it is not used
by the program, because when throwing the exception the program makes an
entry into the kernel which could clobber r12 after saving it on the
stack. To be able to preserve the value we received on program entry, we
push r12 and restore it from the generated subprogram when unwinding the
stack.

For now, bpf_throw invocation fails when lingering resources or locks
exist in that path of the program. In a future followup, bpf_throw will
be extended to perform frame-by-frame unwinding to release lingering
resources for each stack frame, removing this limitation.

Signed-off-by: Kumar Kartikeya Dwivedi <memxor@gmail.com>
Link: https://lore.kernel.org/r/20230912233214.1518551-5-memxor@gmail.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 arch/x86/net/bpf_jit_comp.c                    |  89 ++++++++++++++++---
 include/linux/bpf.h                            |   3 +
 include/linux/bpf_verifier.h                   |   4 +
 include/linux/filter.h                         |   6 ++
 kernel/bpf/core.c                              |   2 +-
 kernel/bpf/helpers.c                           |  38 ++++++++
 kernel/bpf/verifier.c                          | 116 ++++++++++++++++++++++---
 tools/testing/selftests/bpf/bpf_experimental.h |  16 ++++
 8 files changed, 247 insertions(+), 27 deletions(-)

(limited to 'include/linux')

diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c
index d0c24b5a6abb..84005f2114e0 100644
--- a/arch/x86/net/bpf_jit_comp.c
+++ b/arch/x86/net/bpf_jit_comp.c
@@ -18,6 +18,8 @@
 #include <asm/text-patching.h>
 #include <asm/unwind.h>
 
+static bool all_callee_regs_used[4] = {true, true, true, true};
+
 static u8 *emit_code(u8 *ptr, u32 bytes, unsigned int len)
 {
 	if (len == 1)
@@ -256,6 +258,14 @@ struct jit_context {
 /* Number of bytes that will be skipped on tailcall */
 #define X86_TAIL_CALL_OFFSET	(11 + ENDBR_INSN_SIZE)
 
+static void push_r12(u8 **pprog)
+{
+	u8 *prog = *pprog;
+
+	EMIT2(0x41, 0x54);   /* push r12 */
+	*pprog = prog;
+}
+
 static void push_callee_regs(u8 **pprog, bool *callee_regs_used)
 {
 	u8 *prog = *pprog;
@@ -271,6 +281,14 @@ static void push_callee_regs(u8 **pprog, bool *callee_regs_used)
 	*pprog = prog;
 }
 
+static void pop_r12(u8 **pprog)
+{
+	u8 *prog = *pprog;
+
+	EMIT2(0x41, 0x5C);   /* pop r12 */
+	*pprog = prog;
+}
+
 static void pop_callee_regs(u8 **pprog, bool *callee_regs_used)
 {
 	u8 *prog = *pprog;
@@ -292,7 +310,8 @@ static void pop_callee_regs(u8 **pprog, bool *callee_regs_used)
  * while jumping to another program
  */
 static void emit_prologue(u8 **pprog, u32 stack_depth, bool ebpf_from_cbpf,
-			  bool tail_call_reachable, bool is_subprog)
+			  bool tail_call_reachable, bool is_subprog,
+			  bool is_exception_cb)
 {
 	u8 *prog = *pprog;
 
@@ -312,8 +331,22 @@ static void emit_prologue(u8 **pprog, u32 stack_depth, bool ebpf_from_cbpf,
 			/* Keep the same instruction layout. */
 			EMIT2(0x66, 0x90); /* nop2 */
 	}
-	EMIT1(0x55);             /* push rbp */
-	EMIT3(0x48, 0x89, 0xE5); /* mov rbp, rsp */
+	/* Exception callback receives FP as third parameter */
+	if (is_exception_cb) {
+		EMIT3(0x48, 0x89, 0xF4); /* mov rsp, rsi */
+		EMIT3(0x48, 0x89, 0xD5); /* mov rbp, rdx */
+		/* The main frame must have exception_boundary as true, so we
+		 * first restore those callee-saved regs from stack, before
+		 * reusing the stack frame.
+		 */
+		pop_callee_regs(&prog, all_callee_regs_used);
+		pop_r12(&prog);
+		/* Reset the stack frame. */
+		EMIT3(0x48, 0x89, 0xEC); /* mov rsp, rbp */
+	} else {
+		EMIT1(0x55);             /* push rbp */
+		EMIT3(0x48, 0x89, 0xE5); /* mov rbp, rsp */
+	}
 
 	/* X86_TAIL_CALL_OFFSET is here */
 	EMIT_ENDBR();
@@ -472,7 +505,8 @@ static void emit_return(u8 **pprog, u8 *ip)
  *   goto *(prog->bpf_func + prologue_size);
  * out:
  */
-static void emit_bpf_tail_call_indirect(u8 **pprog, bool *callee_regs_used,
+static void emit_bpf_tail_call_indirect(struct bpf_prog *bpf_prog,
+					u8 **pprog, bool *callee_regs_used,
 					u32 stack_depth, u8 *ip,
 					struct jit_context *ctx)
 {
@@ -522,7 +556,12 @@ static void emit_bpf_tail_call_indirect(u8 **pprog, bool *callee_regs_used,
 	offset = ctx->tail_call_indirect_label - (prog + 2 - start);
 	EMIT2(X86_JE, offset);                    /* je out */
 
-	pop_callee_regs(&prog, callee_regs_used);
+	if (bpf_prog->aux->exception_boundary) {
+		pop_callee_regs(&prog, all_callee_regs_used);
+		pop_r12(&prog);
+	} else {
+		pop_callee_regs(&prog, callee_regs_used);
+	}
 
 	EMIT1(0x58);                              /* pop rax */
 	if (stack_depth)
@@ -546,7 +585,8 @@ static void emit_bpf_tail_call_indirect(u8 **pprog, bool *callee_regs_used,
 	*pprog = prog;
 }
 
-static void emit_bpf_tail_call_direct(struct bpf_jit_poke_descriptor *poke,
+static void emit_bpf_tail_call_direct(struct bpf_prog *bpf_prog,
+				      struct bpf_jit_poke_descriptor *poke,
 				      u8 **pprog, u8 *ip,
 				      bool *callee_regs_used, u32 stack_depth,
 				      struct jit_context *ctx)
@@ -575,7 +615,13 @@ static void emit_bpf_tail_call_direct(struct bpf_jit_poke_descriptor *poke,
 	emit_jump(&prog, (u8 *)poke->tailcall_target + X86_PATCH_SIZE,
 		  poke->tailcall_bypass);
 
-	pop_callee_regs(&prog, callee_regs_used);
+	if (bpf_prog->aux->exception_boundary) {
+		pop_callee_regs(&prog, all_callee_regs_used);
+		pop_r12(&prog);
+	} else {
+		pop_callee_regs(&prog, callee_regs_used);
+	}
+
 	EMIT1(0x58);                                  /* pop rax */
 	if (stack_depth)
 		EMIT3_off32(0x48, 0x81, 0xC4, round_up(stack_depth, 8));
@@ -1050,8 +1096,20 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, u8 *rw_image
 
 	emit_prologue(&prog, bpf_prog->aux->stack_depth,
 		      bpf_prog_was_classic(bpf_prog), tail_call_reachable,
-		      bpf_is_subprog(bpf_prog));
-	push_callee_regs(&prog, callee_regs_used);
+		      bpf_is_subprog(bpf_prog), bpf_prog->aux->exception_cb);
+	/* Exception callback will clobber callee regs for its own use, and
+	 * restore the original callee regs from main prog's stack frame.
+	 */
+	if (bpf_prog->aux->exception_boundary) {
+		/* We also need to save r12, which is not mapped to any BPF
+		 * register, as we throw after entry into the kernel, which may
+		 * overwrite r12.
+		 */
+		push_r12(&prog);
+		push_callee_regs(&prog, all_callee_regs_used);
+	} else {
+		push_callee_regs(&prog, callee_regs_used);
+	}
 
 	ilen = prog - temp;
 	if (rw_image)
@@ -1648,13 +1706,15 @@ st:			if (is_imm8(insn->off))
 
 		case BPF_JMP | BPF_TAIL_CALL:
 			if (imm32)
-				emit_bpf_tail_call_direct(&bpf_prog->aux->poke_tab[imm32 - 1],
+				emit_bpf_tail_call_direct(bpf_prog,
+							  &bpf_prog->aux->poke_tab[imm32 - 1],
 							  &prog, image + addrs[i - 1],
 							  callee_regs_used,
 							  bpf_prog->aux->stack_depth,
 							  ctx);
 			else
-				emit_bpf_tail_call_indirect(&prog,
+				emit_bpf_tail_call_indirect(bpf_prog,
+							    &prog,
 							    callee_regs_used,
 							    bpf_prog->aux->stack_depth,
 							    image + addrs[i - 1],
@@ -1907,7 +1967,12 @@ emit_jmp:
 			seen_exit = true;
 			/* Update cleanup_addr */
 			ctx->cleanup_addr = proglen;
-			pop_callee_regs(&prog, callee_regs_used);
+			if (bpf_prog->aux->exception_boundary) {
+				pop_callee_regs(&prog, all_callee_regs_used);
+				pop_r12(&prog);
+			} else {
+				pop_callee_regs(&prog, callee_regs_used);
+			}
 			EMIT1(0xC9);         /* leave */
 			emit_return(&prog, image + addrs[i - 1] + (prog - temp));
 			break;
diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index c3667e95af59..16740ee82082 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -1410,6 +1410,8 @@ struct bpf_prog_aux {
 	bool sleepable;
 	bool tail_call_reachable;
 	bool xdp_has_frags;
+	bool exception_cb;
+	bool exception_boundary;
 	/* BTF_KIND_FUNC_PROTO for valid attach_btf_id */
 	const struct btf_type *attach_func_proto;
 	/* function name for valid attach_btf_id */
@@ -1432,6 +1434,7 @@ struct bpf_prog_aux {
 	int cgroup_atype; /* enum cgroup_bpf_attach_type */
 	struct bpf_map *cgroup_storage[MAX_BPF_CGROUP_STORAGE_TYPE];
 	char name[BPF_OBJ_NAME_LEN];
+	unsigned int (*bpf_exception_cb)(u64 cookie, u64 sp, u64 bp);
 #ifdef CONFIG_SECURITY
 	void *security;
 #endif
diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h
index 3c2a8636ab29..da21a3ec5027 100644
--- a/include/linux/bpf_verifier.h
+++ b/include/linux/bpf_verifier.h
@@ -541,7 +541,9 @@ struct bpf_subprog_info {
 	bool has_tail_call;
 	bool tail_call_reachable;
 	bool has_ld_abs;
+	bool is_cb;
 	bool is_async_cb;
+	bool is_exception_cb;
 };
 
 struct bpf_verifier_env;
@@ -589,6 +591,7 @@ struct bpf_verifier_env {
 	u32 used_btf_cnt;		/* number of used BTF objects */
 	u32 id_gen;			/* used to generate unique reg IDs */
 	u32 hidden_subprog_cnt;		/* number of hidden subprogs */
+	int exception_callback_subprog;
 	bool explore_alu_limits;
 	bool allow_ptr_leaks;
 	bool allow_uninit_stack;
@@ -596,6 +599,7 @@ struct bpf_verifier_env {
 	bool bypass_spec_v1;
 	bool bypass_spec_v4;
 	bool seen_direct_write;
+	bool seen_exception;
 	struct bpf_insn_aux_data *insn_aux_data; /* array of per-insn state */
 	const struct bpf_line_info *prev_linfo;
 	struct bpf_verifier_log log;
diff --git a/include/linux/filter.h b/include/linux/filter.h
index 88874de974cb..27406aee2d40 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -1171,6 +1171,7 @@ const char *__bpf_address_lookup(unsigned long addr, unsigned long *size,
 bool is_bpf_text_address(unsigned long addr);
 int bpf_get_kallsym(unsigned int symnum, unsigned long *value, char *type,
 		    char *sym);
+struct bpf_prog *bpf_prog_ksym_find(unsigned long addr);
 
 static inline const char *
 bpf_address_lookup(unsigned long addr, unsigned long *size,
@@ -1238,6 +1239,11 @@ static inline int bpf_get_kallsym(unsigned int symnum, unsigned long *value,
 	return -ERANGE;
 }
 
+static inline struct bpf_prog *bpf_prog_ksym_find(unsigned long addr)
+{
+	return NULL;
+}
+
 static inline const char *
 bpf_address_lookup(unsigned long addr, unsigned long *size,
 		   unsigned long *off, char **modname, char *sym)
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index 840ba952702d..7849b9cca749 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -733,7 +733,7 @@ bool is_bpf_text_address(unsigned long addr)
 	return ret;
 }
 
-static struct bpf_prog *bpf_prog_ksym_find(unsigned long addr)
+struct bpf_prog *bpf_prog_ksym_find(unsigned long addr)
 {
 	struct bpf_ksym *ksym = bpf_ksym_find(addr);
 
diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
index b0a9834f1051..78e8f4de6750 100644
--- a/kernel/bpf/helpers.c
+++ b/kernel/bpf/helpers.c
@@ -2449,6 +2449,43 @@ __bpf_kfunc void bpf_rcu_read_unlock(void)
 	rcu_read_unlock();
 }
 
+struct bpf_throw_ctx {
+	struct bpf_prog_aux *aux;
+	u64 sp;
+	u64 bp;
+	int cnt;
+};
+
+static bool bpf_stack_walker(void *cookie, u64 ip, u64 sp, u64 bp)
+{
+	struct bpf_throw_ctx *ctx = cookie;
+	struct bpf_prog *prog;
+
+	if (!is_bpf_text_address(ip))
+		return !ctx->cnt;
+	prog = bpf_prog_ksym_find(ip);
+	ctx->cnt++;
+	if (bpf_is_subprog(prog))
+		return true;
+	ctx->aux = prog->aux;
+	ctx->sp = sp;
+	ctx->bp = bp;
+	return false;
+}
+
+__bpf_kfunc void bpf_throw(u64 cookie)
+{
+	struct bpf_throw_ctx ctx = {};
+
+	arch_bpf_stack_walk(bpf_stack_walker, &ctx);
+	WARN_ON_ONCE(!ctx.aux);
+	if (ctx.aux)
+		WARN_ON_ONCE(!ctx.aux->exception_boundary);
+	WARN_ON_ONCE(!ctx.bp);
+	WARN_ON_ONCE(!ctx.cnt);
+	ctx.aux->bpf_exception_cb(cookie, ctx.sp, ctx.bp);
+}
+
 __diag_pop();
 
 BTF_SET8_START(generic_btf_ids)
@@ -2478,6 +2515,7 @@ BTF_ID_FLAGS(func, bpf_cgroup_from_id, KF_ACQUIRE | KF_RET_NULL)
 BTF_ID_FLAGS(func, bpf_task_under_cgroup, KF_RCU)
 #endif
 BTF_ID_FLAGS(func, bpf_task_from_pid, KF_ACQUIRE | KF_RET_NULL)
+BTF_ID_FLAGS(func, bpf_throw)
 BTF_SET8_END(generic_btf_ids)
 
 static const struct btf_kfunc_id_set generic_kfunc_set = {
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 39548e326d53..9baa6f187b38 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -543,6 +543,7 @@ static bool is_dynptr_ref_function(enum bpf_func_id func_id)
 }
 
 static bool is_callback_calling_kfunc(u32 btf_id);
+static bool is_bpf_throw_kfunc(struct bpf_insn *insn);
 
 static bool is_callback_calling_function(enum bpf_func_id func_id)
 {
@@ -1748,7 +1749,9 @@ static int copy_verifier_state(struct bpf_verifier_state *dst_state,
 		return -ENOMEM;
 	dst_state->jmp_history_cnt = src->jmp_history_cnt;
 
-	/* if dst has more stack frames then src frame, free them */
+	/* if dst has more stack frames then src frame, free them, this is also
+	 * necessary in case of exceptional exits using bpf_throw.
+	 */
 	for (i = src->curframe + 1; i <= dst_state->curframe; i++) {
 		free_func_state(dst_state->frame[i]);
 		dst_state->frame[i] = NULL;
@@ -2868,7 +2871,7 @@ next:
 		if (i == subprog_end - 1) {
 			/* to avoid fall-through from one subprog into another
 			 * the last insn of the subprog should be either exit
-			 * or unconditional jump back
+			 * or unconditional jump back or bpf_throw call
 			 */
 			if (code != (BPF_JMP | BPF_EXIT) &&
 			    code != (BPF_JMP32 | BPF_JA) &&
@@ -5661,6 +5664,27 @@ continue_func:
 	for (; i < subprog_end; i++) {
 		int next_insn, sidx;
 
+		if (bpf_pseudo_kfunc_call(insn + i) && !insn[i].off) {
+			bool err = false;
+
+			if (!is_bpf_throw_kfunc(insn + i))
+				continue;
+			if (subprog[idx].is_cb)
+				err = true;
+			for (int c = 0; c < frame && !err; c++) {
+				if (subprog[ret_prog[c]].is_cb) {
+					err = true;
+					break;
+				}
+			}
+			if (!err)
+				continue;
+			verbose(env,
+				"bpf_throw kfunc (insn %d) cannot be called from callback subprog %d\n",
+				i, idx);
+			return -EINVAL;
+		}
+
 		if (!bpf_pseudo_call(insn + i) && !bpf_pseudo_func(insn + i))
 			continue;
 		/* remember insn and function to return to */
@@ -8919,6 +8943,7 @@ static int __check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn
 	 * callbacks
 	 */
 	if (set_callee_state_cb != set_callee_state) {
+		env->subprog_info[subprog].is_cb = true;
 		if (bpf_pseudo_kfunc_call(insn) &&
 		    !is_callback_calling_kfunc(insn->imm)) {
 			verbose(env, "verifier bug: kfunc %s#%d not marked as callback-calling\n",
@@ -9308,7 +9333,8 @@ static int prepare_func_exit(struct bpf_verifier_env *env, int *insn_idx)
 		verbose(env, "to caller at %d:\n", *insn_idx);
 		print_verifier_state(env, caller, true);
 	}
-	/* clear everything in the callee */
+	/* clear everything in the callee. In case of exceptional exits using
+	 * bpf_throw, this will be done by copy_verifier_state for extra frames. */
 	free_func_state(callee);
 	state->frame[state->curframe--] = NULL;
 	return 0;
@@ -9432,17 +9458,17 @@ record_func_key(struct bpf_verifier_env *env, struct bpf_call_arg_meta *meta,
 	return 0;
 }
 
-static int check_reference_leak(struct bpf_verifier_env *env)
+static int check_reference_leak(struct bpf_verifier_env *env, bool exception_exit)
 {
 	struct bpf_func_state *state = cur_func(env);
 	bool refs_lingering = false;
 	int i;
 
-	if (state->frameno && !state->in_callback_fn)
+	if (!exception_exit && state->frameno && !state->in_callback_fn)
 		return 0;
 
 	for (i = 0; i < state->acquired_refs; i++) {
-		if (state->in_callback_fn && state->refs[i].callback_ref != state->frameno)
+		if (!exception_exit && state->in_callback_fn && state->refs[i].callback_ref != state->frameno)
 			continue;
 		verbose(env, "Unreleased reference id=%d alloc_insn=%d\n",
 			state->refs[i].id, state->refs[i].insn_idx);
@@ -9697,7 +9723,7 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn
 
 	switch (func_id) {
 	case BPF_FUNC_tail_call:
-		err = check_reference_leak(env);
+		err = check_reference_leak(env, false);
 		if (err) {
 			verbose(env, "tail_call would lead to reference leak\n");
 			return err;
@@ -10332,6 +10358,7 @@ enum special_kfunc_type {
 	KF_bpf_dynptr_clone,
 	KF_bpf_percpu_obj_new_impl,
 	KF_bpf_percpu_obj_drop_impl,
+	KF_bpf_throw,
 };
 
 BTF_SET_START(special_kfunc_set)
@@ -10354,6 +10381,7 @@ BTF_ID(func, bpf_dynptr_slice_rdwr)
 BTF_ID(func, bpf_dynptr_clone)
 BTF_ID(func, bpf_percpu_obj_new_impl)
 BTF_ID(func, bpf_percpu_obj_drop_impl)
+BTF_ID(func, bpf_throw)
 BTF_SET_END(special_kfunc_set)
 
 BTF_ID_LIST(special_kfunc_list)
@@ -10378,6 +10406,7 @@ BTF_ID(func, bpf_dynptr_slice_rdwr)
 BTF_ID(func, bpf_dynptr_clone)
 BTF_ID(func, bpf_percpu_obj_new_impl)
 BTF_ID(func, bpf_percpu_obj_drop_impl)
+BTF_ID(func, bpf_throw)
 
 static bool is_kfunc_ret_null(struct bpf_kfunc_call_arg_meta *meta)
 {
@@ -10695,6 +10724,12 @@ static bool is_callback_calling_kfunc(u32 btf_id)
 	return btf_id == special_kfunc_list[KF_bpf_rbtree_add_impl];
 }
 
+static bool is_bpf_throw_kfunc(struct bpf_insn *insn)
+{
+	return bpf_pseudo_kfunc_call(insn) && insn->off == 0 &&
+	       insn->imm == special_kfunc_list[KF_bpf_throw];
+}
+
 static bool is_rbtree_lock_required_kfunc(u32 btf_id)
 {
 	return is_bpf_rbtree_api_kfunc(btf_id);
@@ -11480,6 +11515,15 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
 		}
 	}
 
+	if (meta.func_id == special_kfunc_list[KF_bpf_throw]) {
+		if (!bpf_jit_supports_exceptions()) {
+			verbose(env, "JIT does not support calling kfunc %s#%d\n",
+				func_name, meta.func_id);
+			return -ENOTSUPP;
+		}
+		env->seen_exception = true;
+	}
+
 	for (i = 0; i < CALLER_SAVED_REGS; i++)
 		mark_reg_not_init(env, regs, caller_saved[i]);
 
@@ -14525,7 +14569,7 @@ static int check_ld_abs(struct bpf_verifier_env *env, struct bpf_insn *insn)
 	 * gen_ld_abs() may terminate the program at runtime, leading to
 	 * reference leak.
 	 */
-	err = check_reference_leak(env);
+	err = check_reference_leak(env, false);
 	if (err) {
 		verbose(env, "BPF_LD_[ABS|IND] cannot be mixed with socket references\n");
 		return err;
@@ -16539,6 +16583,7 @@ static int do_check(struct bpf_verifier_env *env)
 	int prev_insn_idx = -1;
 
 	for (;;) {
+		bool exception_exit = false;
 		struct bpf_insn *insn;
 		u8 class;
 		int err;
@@ -16753,12 +16798,17 @@ static int do_check(struct bpf_verifier_env *env)
 						return -EINVAL;
 					}
 				}
-				if (insn->src_reg == BPF_PSEUDO_CALL)
+				if (insn->src_reg == BPF_PSEUDO_CALL) {
 					err = check_func_call(env, insn, &env->insn_idx);
-				else if (insn->src_reg == BPF_PSEUDO_KFUNC_CALL)
+				} else if (insn->src_reg == BPF_PSEUDO_KFUNC_CALL) {
 					err = check_kfunc_call(env, insn, &env->insn_idx);
-				else
+					if (!err && is_bpf_throw_kfunc(insn)) {
+						exception_exit = true;
+						goto process_bpf_exit_full;
+					}
+				} else {
 					err = check_helper_call(env, insn, &env->insn_idx);
+				}
 				if (err)
 					return err;
 
@@ -16788,7 +16838,7 @@ static int do_check(struct bpf_verifier_env *env)
 					verbose(env, "BPF_EXIT uses reserved fields\n");
 					return -EINVAL;
 				}
-
+process_bpf_exit_full:
 				if (env->cur_state->active_lock.ptr &&
 				    !in_rbtree_lock_required_cb(env)) {
 					verbose(env, "bpf_spin_unlock is missing\n");
@@ -16807,10 +16857,23 @@ static int do_check(struct bpf_verifier_env *env)
 				 * function, for which reference_state must
 				 * match caller reference state when it exits.
 				 */
-				err = check_reference_leak(env);
+				err = check_reference_leak(env, exception_exit);
 				if (err)
 					return err;
 
+				/* The side effect of the prepare_func_exit
+				 * which is being skipped is that it frees
+				 * bpf_func_state. Typically, process_bpf_exit
+				 * will only be hit with outermost exit.
+				 * copy_verifier_state in pop_stack will handle
+				 * freeing of any extra bpf_func_state left over
+				 * from not processing all nested function
+				 * exits. We also skip return code checks as
+				 * they are not needed for exceptional exits.
+				 */
+				if (exception_exit)
+					goto process_bpf_exit;
+
 				if (state->curframe) {
 					/* exit from nested function */
 					err = prepare_func_exit(env, &env->insn_idx);
@@ -18113,6 +18176,9 @@ static int jit_subprogs(struct bpf_verifier_env *env)
 		}
 		func[i]->aux->num_exentries = num_exentries;
 		func[i]->aux->tail_call_reachable = env->subprog_info[i].tail_call_reachable;
+		func[i]->aux->exception_cb = env->subprog_info[i].is_exception_cb;
+		if (!i)
+			func[i]->aux->exception_boundary = env->seen_exception;
 		func[i] = bpf_int_jit_compile(func[i]);
 		if (!func[i]->jited) {
 			err = -ENOTSUPP;
@@ -18201,6 +18267,8 @@ static int jit_subprogs(struct bpf_verifier_env *env)
 	prog->aux->func = func;
 	prog->aux->func_cnt = env->subprog_cnt - env->hidden_subprog_cnt;
 	prog->aux->real_func_cnt = env->subprog_cnt;
+	prog->aux->bpf_exception_cb = (void *)func[env->exception_callback_subprog]->bpf_func;
+	prog->aux->exception_boundary = func[0]->aux->exception_boundary;
 	bpf_prog_jit_attempt_done(prog);
 	return 0;
 out_free:
@@ -18437,7 +18505,7 @@ static int fixup_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
 }
 
 /* The function requires that first instruction in 'patch' is insnsi[prog->len - 1] */
-static __maybe_unused int add_hidden_subprog(struct bpf_verifier_env *env, struct bpf_insn *patch, int len)
+static int add_hidden_subprog(struct bpf_verifier_env *env, struct bpf_insn *patch, int len)
 {
 	struct bpf_subprog_info *info = env->subprog_info;
 	int cnt = env->subprog_cnt;
@@ -18481,6 +18549,26 @@ static int do_misc_fixups(struct bpf_verifier_env *env)
 	struct bpf_map *map_ptr;
 	int i, ret, cnt, delta = 0;
 
+	if (env->seen_exception && !env->exception_callback_subprog) {
+		struct bpf_insn patch[] = {
+			env->prog->insnsi[insn_cnt - 1],
+			BPF_MOV64_IMM(BPF_REG_0, 0),
+			BPF_EXIT_INSN(),
+		};
+
+		ret = add_hidden_subprog(env, patch, ARRAY_SIZE(patch));
+		if (ret < 0)
+			return ret;
+		prog = env->prog;
+		insn = prog->insnsi;
+
+		env->exception_callback_subprog = env->subprog_cnt - 1;
+		/* Don't update insn_cnt, as add_hidden_subprog always appends insns */
+		env->subprog_info[env->exception_callback_subprog].is_cb = true;
+		env->subprog_info[env->exception_callback_subprog].is_async_cb = true;
+		env->subprog_info[env->exception_callback_subprog].is_exception_cb = true;
+	}
+
 	for (i = 0; i < insn_cnt; i++, insn++) {
 		/* Make divide-by-zero exceptions impossible. */
 		if (insn->code == (BPF_ALU64 | BPF_MOD | BPF_X) ||
diff --git a/tools/testing/selftests/bpf/bpf_experimental.h b/tools/testing/selftests/bpf/bpf_experimental.h
index 4494eaa9937e..333b54a86e3a 100644
--- a/tools/testing/selftests/bpf/bpf_experimental.h
+++ b/tools/testing/selftests/bpf/bpf_experimental.h
@@ -162,4 +162,20 @@ extern void bpf_percpu_obj_drop_impl(void *kptr, void *meta) __ksym;
 /* Convenience macro to wrap over bpf_obj_drop_impl */
 #define bpf_percpu_obj_drop(kptr) bpf_percpu_obj_drop_impl(kptr, NULL)
 
+/* Description
+ *	Throw a BPF exception from the program, immediately terminating its
+ *	execution and unwinding the stack. The supplied 'cookie' parameter
+ *	will be the return value of the program when an exception is thrown.
+ *
+ *	Note that throwing an exception with lingering resources (locks,
+ *	references, etc.) will lead to a verification error.
+ *
+ *	Note that callbacks *cannot* call this helper.
+ * Returns
+ *	Never.
+ * Throws
+ *	An exception with the specified 'cookie' value.
+ */
+extern void bpf_throw(u64 cookie) __ksym;
+
 #endif
-- 
cgit v1.2.3


From b9ae0c9dd0aca79bffc17be51c2dc148d1f72708 Mon Sep 17 00:00:00 2001
From: Kumar Kartikeya Dwivedi <memxor@gmail.com>
Date: Wed, 13 Sep 2023 01:32:03 +0200
Subject: bpf: Add support for custom exception callbacks

By default, the subprog generated by the verifier to handle a thrown
exception hardcodes a return value of 0. To allow user-defined logic
and modification of the return value when an exception is thrown,
introduce the 'exception_callback:' declaration tag, which marks a
callback as the default exception handler for the program.

The format of the declaration tag is 'exception_callback:<value>', where
<value> is the name of the exception callback. Each main program can be
tagged using this BTF declaratiion tag to associate it with an exception
callback. In case the tag is absent, the default callback is used.

As such, the exception callback cannot be modified at runtime, only set
during verification.

Allowing modification of the callback for the current program execution
at runtime leads to issues when the programs begin to nest, as any
per-CPU state maintaing this information will have to be saved and
restored. We don't want it to stay in bpf_prog_aux as this takes a
global effect for all programs. An alternative solution is spilling
the callback pointer at a known location on the program stack on entry,
and then passing this location to bpf_throw as a parameter.

However, since exceptions are geared more towards a use case where they
are ideally never invoked, optimizing for this use case and adding to
the complexity has diminishing returns.

Signed-off-by: Kumar Kartikeya Dwivedi <memxor@gmail.com>
Link: https://lore.kernel.org/r/20230912233214.1518551-7-memxor@gmail.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/bpf.h                            |   4 +-
 include/linux/bpf_verifier.h                   |   1 +
 kernel/bpf/btf.c                               |  29 +++++--
 kernel/bpf/verifier.c                          | 113 +++++++++++++++++++++++--
 tools/testing/selftests/bpf/bpf_experimental.h |  31 ++++++-
 5 files changed, 160 insertions(+), 18 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 16740ee82082..30063a760b5a 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -2422,9 +2422,11 @@ int btf_check_subprog_arg_match(struct bpf_verifier_env *env, int subprog,
 int btf_check_subprog_call(struct bpf_verifier_env *env, int subprog,
 			   struct bpf_reg_state *regs);
 int btf_prepare_func_args(struct bpf_verifier_env *env, int subprog,
-			  struct bpf_reg_state *reg);
+			  struct bpf_reg_state *reg, bool is_ex_cb);
 int btf_check_type_match(struct bpf_verifier_log *log, const struct bpf_prog *prog,
 			 struct btf *btf, const struct btf_type *t);
+const char *btf_find_decl_tag_value(const struct btf *btf, const struct btf_type *pt,
+				    int comp_idx, const char *tag_key);
 
 struct bpf_prog *bpf_prog_by_id(u32 id);
 struct bpf_link *bpf_link_by_id(u32 id);
diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h
index da21a3ec5027..94ec766432f5 100644
--- a/include/linux/bpf_verifier.h
+++ b/include/linux/bpf_verifier.h
@@ -300,6 +300,7 @@ struct bpf_func_state {
 	bool in_callback_fn;
 	struct tnum callback_ret_range;
 	bool in_async_callback_fn;
+	bool in_exception_callback_fn;
 
 	/* The following fields should be last. See copy_func_state() */
 	int acquired_refs;
diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
index 187b57276fec..f93e835d90af 100644
--- a/kernel/bpf/btf.c
+++ b/kernel/bpf/btf.c
@@ -3310,10 +3310,10 @@ static int btf_find_kptr(const struct btf *btf, const struct btf_type *t,
 	return BTF_FIELD_FOUND;
 }
 
-static const char *btf_find_decl_tag_value(const struct btf *btf,
-					   const struct btf_type *pt,
-					   int comp_idx, const char *tag_key)
+const char *btf_find_decl_tag_value(const struct btf *btf, const struct btf_type *pt,
+				    int comp_idx, const char *tag_key)
 {
+	const char *value = NULL;
 	int i;
 
 	for (i = 1; i < btf_nr_types(btf); i++) {
@@ -3327,9 +3327,14 @@ static const char *btf_find_decl_tag_value(const struct btf *btf,
 			continue;
 		if (strncmp(__btf_name_by_offset(btf, t->name_off), tag_key, len))
 			continue;
-		return __btf_name_by_offset(btf, t->name_off) + len;
+		/* Prevent duplicate entries for same type */
+		if (value)
+			return ERR_PTR(-EEXIST);
+		value = __btf_name_by_offset(btf, t->name_off) + len;
 	}
-	return NULL;
+	if (!value)
+		return ERR_PTR(-ENOENT);
+	return value;
 }
 
 static int
@@ -3347,7 +3352,7 @@ btf_find_graph_root(const struct btf *btf, const struct btf_type *pt,
 	if (t->size != sz)
 		return BTF_FIELD_IGNORE;
 	value_type = btf_find_decl_tag_value(btf, pt, comp_idx, "contains:");
-	if (!value_type)
+	if (IS_ERR(value_type))
 		return -EINVAL;
 	node_field_name = strstr(value_type, ":");
 	if (!node_field_name)
@@ -6954,7 +6959,7 @@ int btf_check_subprog_call(struct bpf_verifier_env *env, int subprog,
  * (either PTR_TO_CTX or SCALAR_VALUE).
  */
 int btf_prepare_func_args(struct bpf_verifier_env *env, int subprog,
-			  struct bpf_reg_state *regs)
+			  struct bpf_reg_state *regs, bool is_ex_cb)
 {
 	struct bpf_verifier_log *log = &env->log;
 	struct bpf_prog *prog = env->prog;
@@ -7011,7 +7016,7 @@ int btf_prepare_func_args(struct bpf_verifier_env *env, int subprog,
 			tname, nargs, MAX_BPF_FUNC_REG_ARGS);
 		return -EINVAL;
 	}
-	/* check that function returns int */
+	/* check that function returns int, exception cb also requires this */
 	t = btf_type_by_id(btf, t->type);
 	while (btf_type_is_modifier(t))
 		t = btf_type_by_id(btf, t->type);
@@ -7060,6 +7065,14 @@ int btf_prepare_func_args(struct bpf_verifier_env *env, int subprog,
 			i, btf_type_str(t), tname);
 		return -EINVAL;
 	}
+	/* We have already ensured that the callback returns an integer, just
+	 * like all global subprogs. We need to determine it only has a single
+	 * scalar argument.
+	 */
+	if (is_ex_cb && (nargs != 1 || regs[BPF_REG_1].type != SCALAR_VALUE)) {
+		bpf_log(log, "exception cb only supports single integer argument\n");
+		return -EINVAL;
+	}
 	return 0;
 }
 
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index ec767ae08c2b..ec3f22312516 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -2457,6 +2457,68 @@ static int add_subprog(struct bpf_verifier_env *env, int off)
 	return env->subprog_cnt - 1;
 }
 
+static int bpf_find_exception_callback_insn_off(struct bpf_verifier_env *env)
+{
+	struct bpf_prog_aux *aux = env->prog->aux;
+	struct btf *btf = aux->btf;
+	const struct btf_type *t;
+	u32 main_btf_id, id;
+	const char *name;
+	int ret, i;
+
+	/* Non-zero func_info_cnt implies valid btf */
+	if (!aux->func_info_cnt)
+		return 0;
+	main_btf_id = aux->func_info[0].type_id;
+
+	t = btf_type_by_id(btf, main_btf_id);
+	if (!t) {
+		verbose(env, "invalid btf id for main subprog in func_info\n");
+		return -EINVAL;
+	}
+
+	name = btf_find_decl_tag_value(btf, t, -1, "exception_callback:");
+	if (IS_ERR(name)) {
+		ret = PTR_ERR(name);
+		/* If there is no tag present, there is no exception callback */
+		if (ret == -ENOENT)
+			ret = 0;
+		else if (ret == -EEXIST)
+			verbose(env, "multiple exception callback tags for main subprog\n");
+		return ret;
+	}
+
+	ret = btf_find_by_name_kind(btf, name, BTF_KIND_FUNC);
+	if (ret < 0) {
+		verbose(env, "exception callback '%s' could not be found in BTF\n", name);
+		return ret;
+	}
+	id = ret;
+	t = btf_type_by_id(btf, id);
+	if (btf_func_linkage(t) != BTF_FUNC_GLOBAL) {
+		verbose(env, "exception callback '%s' must have global linkage\n", name);
+		return -EINVAL;
+	}
+	ret = 0;
+	for (i = 0; i < aux->func_info_cnt; i++) {
+		if (aux->func_info[i].type_id != id)
+			continue;
+		ret = aux->func_info[i].insn_off;
+		/* Further func_info and subprog checks will also happen
+		 * later, so assume this is the right insn_off for now.
+		 */
+		if (!ret) {
+			verbose(env, "invalid exception callback insn_off in func_info: 0\n");
+			ret = -EINVAL;
+		}
+	}
+	if (!ret) {
+		verbose(env, "exception callback type id not found in func_info\n");
+		ret = -EINVAL;
+	}
+	return ret;
+}
+
 #define MAX_KFUNC_DESCS 256
 #define MAX_KFUNC_BTFS	256
 
@@ -2796,8 +2858,8 @@ bpf_jit_find_kfunc_model(const struct bpf_prog *prog,
 static int add_subprog_and_kfunc(struct bpf_verifier_env *env)
 {
 	struct bpf_subprog_info *subprog = env->subprog_info;
+	int i, ret, insn_cnt = env->prog->len, ex_cb_insn;
 	struct bpf_insn *insn = env->prog->insnsi;
-	int i, ret, insn_cnt = env->prog->len;
 
 	/* Add entry function. */
 	ret = add_subprog(env, 0);
@@ -2823,6 +2885,26 @@ static int add_subprog_and_kfunc(struct bpf_verifier_env *env)
 			return ret;
 	}
 
+	ret = bpf_find_exception_callback_insn_off(env);
+	if (ret < 0)
+		return ret;
+	ex_cb_insn = ret;
+
+	/* If ex_cb_insn > 0, this means that the main program has a subprog
+	 * marked using BTF decl tag to serve as the exception callback.
+	 */
+	if (ex_cb_insn) {
+		ret = add_subprog(env, ex_cb_insn);
+		if (ret < 0)
+			return ret;
+		for (i = 1; i < env->subprog_cnt; i++) {
+			if (env->subprog_info[i].start != ex_cb_insn)
+				continue;
+			env->exception_callback_subprog = i;
+			break;
+		}
+	}
+
 	/* Add a fake 'exit' subprog which could simplify subprog iteration
 	 * logic. 'subprog_cnt' should not be increased.
 	 */
@@ -5707,6 +5789,10 @@ continue_func:
 			/* async callbacks don't increase bpf prog stack size unless called directly */
 			if (!bpf_pseudo_call(insn + i))
 				continue;
+			if (subprog[sidx].is_exception_cb) {
+				verbose(env, "insn %d cannot call exception cb directly\n", i);
+				return -EINVAL;
+			}
 		}
 		i = next_insn;
 		idx = sidx;
@@ -5728,8 +5814,13 @@ continue_func:
 	 * tail call counter throughout bpf2bpf calls combined with tailcalls
 	 */
 	if (tail_call_reachable)
-		for (j = 0; j < frame; j++)
+		for (j = 0; j < frame; j++) {
+			if (subprog[ret_prog[j]].is_exception_cb) {
+				verbose(env, "cannot tail call within exception cb\n");
+				return -EINVAL;
+			}
 			subprog[ret_prog[j]].tail_call_reachable = true;
+		}
 	if (subprog[0].tail_call_reachable)
 		env->prog->aux->tail_call_reachable = true;
 
@@ -14630,7 +14721,7 @@ static int check_return_code(struct bpf_verifier_env *env)
 	const bool is_subprog = frame->subprogno;
 
 	/* LSM and struct_ops func-ptr's return type could be "void" */
-	if (!is_subprog) {
+	if (!is_subprog || frame->in_exception_callback_fn) {
 		switch (prog_type) {
 		case BPF_PROG_TYPE_LSM:
 			if (prog->expected_attach_type == BPF_LSM_CGROUP)
@@ -14678,7 +14769,7 @@ static int check_return_code(struct bpf_verifier_env *env)
 		return 0;
 	}
 
-	if (is_subprog) {
+	if (is_subprog && !frame->in_exception_callback_fn) {
 		if (reg->type != SCALAR_VALUE) {
 			verbose(env, "At subprogram exit the register R0 is not a scalar value (%s)\n",
 				reg_type_str(env, reg->type));
@@ -19334,7 +19425,7 @@ static void free_states(struct bpf_verifier_env *env)
 	}
 }
 
-static int do_check_common(struct bpf_verifier_env *env, int subprog)
+static int do_check_common(struct bpf_verifier_env *env, int subprog, bool is_ex_cb)
 {
 	bool pop_log = !(env->log.level & BPF_LOG_LEVEL2);
 	struct bpf_verifier_state *state;
@@ -19365,7 +19456,7 @@ static int do_check_common(struct bpf_verifier_env *env, int subprog)
 
 	regs = state->frame[state->curframe]->regs;
 	if (subprog || env->prog->type == BPF_PROG_TYPE_EXT) {
-		ret = btf_prepare_func_args(env, subprog, regs);
+		ret = btf_prepare_func_args(env, subprog, regs, is_ex_cb);
 		if (ret)
 			goto out;
 		for (i = BPF_REG_1; i <= BPF_REG_5; i++) {
@@ -19381,6 +19472,12 @@ static int do_check_common(struct bpf_verifier_env *env, int subprog)
 				regs[i].id = ++env->id_gen;
 			}
 		}
+		if (is_ex_cb) {
+			state->frame[0]->in_exception_callback_fn = true;
+			env->subprog_info[subprog].is_cb = true;
+			env->subprog_info[subprog].is_async_cb = true;
+			env->subprog_info[subprog].is_exception_cb = true;
+		}
 	} else {
 		/* 1st arg to a function */
 		regs[BPF_REG_1].type = PTR_TO_CTX;
@@ -19445,7 +19542,7 @@ static int do_check_subprogs(struct bpf_verifier_env *env)
 			continue;
 		env->insn_idx = env->subprog_info[i].start;
 		WARN_ON_ONCE(env->insn_idx == 0);
-		ret = do_check_common(env, i);
+		ret = do_check_common(env, i, env->exception_callback_subprog == i);
 		if (ret) {
 			return ret;
 		} else if (env->log.level & BPF_LOG_LEVEL) {
@@ -19462,7 +19559,7 @@ static int do_check_main(struct bpf_verifier_env *env)
 	int ret;
 
 	env->insn_idx = 0;
-	ret = do_check_common(env, 0);
+	ret = do_check_common(env, 0, false);
 	if (!ret)
 		env->prog->aux->stack_depth = env->subprog_info[0].stack_depth;
 	return ret;
diff --git a/tools/testing/selftests/bpf/bpf_experimental.h b/tools/testing/selftests/bpf/bpf_experimental.h
index 333b54a86e3a..9a87170524ce 100644
--- a/tools/testing/selftests/bpf/bpf_experimental.h
+++ b/tools/testing/selftests/bpf/bpf_experimental.h
@@ -165,7 +165,16 @@ extern void bpf_percpu_obj_drop_impl(void *kptr, void *meta) __ksym;
 /* Description
  *	Throw a BPF exception from the program, immediately terminating its
  *	execution and unwinding the stack. The supplied 'cookie' parameter
- *	will be the return value of the program when an exception is thrown.
+ *	will be the return value of the program when an exception is thrown,
+ *	and the default exception callback is used. Otherwise, if an exception
+ *	callback is set using the '__exception_cb(callback)' declaration tag
+ *	on the main program, the 'cookie' parameter will be the callback's only
+ *	input argument.
+ *
+ *	Thus, in case of default exception callback, 'cookie' is subjected to
+ *	constraints on the program's return value (as with R0 on exit).
+ *	Otherwise, the return value of the marked exception callback will be
+ *	subjected to the same checks.
  *
  *	Note that throwing an exception with lingering resources (locks,
  *	references, etc.) will lead to a verification error.
@@ -178,4 +187,24 @@ extern void bpf_percpu_obj_drop_impl(void *kptr, void *meta) __ksym;
  */
 extern void bpf_throw(u64 cookie) __ksym;
 
+/* This macro must be used to mark the exception callback corresponding to the
+ * main program. For example:
+ *
+ * int exception_cb(u64 cookie) {
+ *	return cookie;
+ * }
+ *
+ * SEC("tc")
+ * __exception_cb(exception_cb)
+ * int main_prog(struct __sk_buff *ctx) {
+ *	...
+ *	return TC_ACT_OK;
+ * }
+ *
+ * Here, exception callback for the main program will be 'exception_cb'. Note
+ * that this attribute can only be used once, and multiple exception callbacks
+ * specified for the main program will lead to verification error.
+ */
+#define __exception_cb(name) __attribute__((btf_decl_tag("exception_callback:" #name)))
+
 #endif
-- 
cgit v1.2.3


From 7ccb84f04cda1dd6f64f352e9795db308e9cdc0c Mon Sep 17 00:00:00 2001
From: Kumar Kartikeya Dwivedi <memxor@gmail.com>
Date: Wed, 13 Sep 2023 01:32:06 +0200
Subject: mm: kasan: Declare kasan_unpoison_task_stack_below in kasan.h

We require access to this kasan helper in BPF code in the next patch
where we have to unpoison the task stack when we unwind and reset the
stack frame from bpf_throw, and it never really unpoisons the poisoned
stack slots on entry when compiler instrumentation is generated by
CONFIG_KASAN_STACK and inline instrumentation is supported.

Also, remove the declaration from mm/kasan/kasan.h as we put it in the
header file kasan.h.

Cc: Andrey Ryabinin <ryabinin.a.a@gmail.com>
Cc: Alexander Potapenko <glider@google.com>
Cc: Andrey Konovalov <andreyknvl@gmail.com>
Cc: Dmitry Vyukov <dvyukov@google.com>
Cc: Vincenzo Frascino <vincenzo.frascino@arm.com>
Suggested-by: Andrey Konovalov <andreyknvl@gmail.com>
Signed-off-by: Kumar Kartikeya Dwivedi <memxor@gmail.com>
Reviewed-by: Andrey Konovalov <andreyknvl@gmail.com>
Link: https://lore.kernel.org/r/20230912233214.1518551-10-memxor@gmail.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/kasan.h | 2 ++
 mm/kasan/kasan.h      | 1 -
 2 files changed, 2 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/kasan.h b/include/linux/kasan.h
index 819b6bc8ac08..7a463f814db2 100644
--- a/include/linux/kasan.h
+++ b/include/linux/kasan.h
@@ -283,8 +283,10 @@ static inline bool kasan_check_byte(const void *address)
 
 #if defined(CONFIG_KASAN) && defined(CONFIG_KASAN_STACK)
 void kasan_unpoison_task_stack(struct task_struct *task);
+asmlinkage void kasan_unpoison_task_stack_below(const void *watermark);
 #else
 static inline void kasan_unpoison_task_stack(struct task_struct *task) {}
+static inline void kasan_unpoison_task_stack_below(const void *watermark) {}
 #endif
 
 #ifdef CONFIG_KASAN_GENERIC
diff --git a/mm/kasan/kasan.h b/mm/kasan/kasan.h
index 2e973b36fe07..5eefe202bb8f 100644
--- a/mm/kasan/kasan.h
+++ b/mm/kasan/kasan.h
@@ -558,7 +558,6 @@ void kasan_restore_multi_shot(bool enabled);
  * code. Declared here to avoid warnings about missing declarations.
  */
 
-asmlinkage void kasan_unpoison_task_stack_below(const void *watermark);
 void __asan_register_globals(void *globals, ssize_t size);
 void __asan_unregister_globals(void *globals, ssize_t size);
 void __asan_handle_no_return(void);
-- 
cgit v1.2.3


From 12e94aee074ce1c5ffdb8f2246a8c4a095b6aa8a Mon Sep 17 00:00:00 2001
From: Sebastian Reichel <sebastian.reichel@collabora.com>
Date: Wed, 13 Sep 2023 15:39:00 +0200
Subject: power: supply: core: Don't export power_supply_notifier

power_supply_notifier can be internal, since all users are going
through power_supply_reg_notifier()/power_supply_unreg_notifier().

Link: https://lore.kernel.org/r/20230913133900.591637-1-sebastian.reichel@collabora.com
Signed-off-by: Sebastian Reichel <sebastian.reichel@collabora.com>
---
 drivers/power/supply/power_supply_core.c | 3 +--
 include/linux/power_supply.h             | 1 -
 2 files changed, 1 insertion(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/power/supply/power_supply_core.c b/drivers/power/supply/power_supply_core.c
index 6e30674be366..73265001dd4b 100644
--- a/drivers/power/supply/power_supply_core.c
+++ b/drivers/power/supply/power_supply_core.c
@@ -29,8 +29,7 @@
 struct class *power_supply_class;
 EXPORT_SYMBOL_GPL(power_supply_class);
 
-BLOCKING_NOTIFIER_HEAD(power_supply_notifier);
-EXPORT_SYMBOL_GPL(power_supply_notifier);
+static BLOCKING_NOTIFIER_HEAD(power_supply_notifier);
 
 static struct device_type power_supply_dev_type;
 
diff --git a/include/linux/power_supply.h b/include/linux/power_supply.h
index 85b86768c0b9..c0992a77feea 100644
--- a/include/linux/power_supply.h
+++ b/include/linux/power_supply.h
@@ -767,7 +767,6 @@ struct power_supply_battery_info {
 	int bti_resistance_tolerance;
 };
 
-extern struct blocking_notifier_head power_supply_notifier;
 extern int power_supply_reg_notifier(struct notifier_block *nb);
 extern void power_supply_unreg_notifier(struct notifier_block *nb);
 #if IS_ENABLED(CONFIG_POWER_SUPPLY)
-- 
cgit v1.2.3


From 9431063ad323ac864750aeba4d304389bc42ca4e Mon Sep 17 00:00:00 2001
From: Vadim Fedorenko <vadim.fedorenko@linux.dev>
Date: Wed, 13 Sep 2023 21:49:37 +0100
Subject: dpll: core: Add DPLL framework base functions

DPLL framework is used to represent and configure DPLL devices
in systems. Each device that has DPLL and can configure inputs
and outputs can use this framework.

Implement core framework functions for further interactions
with device drivers implementing dpll subsystem, as well as for
interactions of DPLL netlink framework part with the subsystem
itself.

Co-developed-by: Milena Olech <milena.olech@intel.com>
Signed-off-by: Milena Olech <milena.olech@intel.com>
Co-developed-by: Michal Michalik <michal.michalik@intel.com>
Signed-off-by: Michal Michalik <michal.michalik@intel.com>
Signed-off-by: Vadim Fedorenko <vadim.fedorenko@linux.dev>
Co-developed-by: Arkadiusz Kubalewski <arkadiusz.kubalewski@intel.com>
Signed-off-by: Arkadiusz Kubalewski <arkadiusz.kubalewski@intel.com>
Signed-off-by: Jiri Pirko <jiri@nvidia.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 MAINTAINERS              |  11 +
 drivers/Kconfig          |   2 +
 drivers/Makefile         |   1 +
 drivers/dpll/Kconfig     |   7 +
 drivers/dpll/Makefile    |   9 +
 drivers/dpll/dpll_core.c | 789 +++++++++++++++++++++++++++++++++++++++++++++++
 drivers/dpll/dpll_core.h |  89 ++++++
 include/linux/dpll.h     | 133 ++++++++
 8 files changed, 1041 insertions(+)
 create mode 100644 drivers/dpll/Kconfig
 create mode 100644 drivers/dpll/Makefile
 create mode 100644 drivers/dpll/dpll_core.c
 create mode 100644 drivers/dpll/dpll_core.h
 create mode 100644 include/linux/dpll.h

(limited to 'include/linux')

diff --git a/MAINTAINERS b/MAINTAINERS
index 9d0027f389e1..c94489dff2fd 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -6333,6 +6333,17 @@ F:	Documentation/networking/device_drivers/ethernet/freescale/dpaa2/switch-drive
 F:	drivers/net/ethernet/freescale/dpaa2/dpaa2-switch*
 F:	drivers/net/ethernet/freescale/dpaa2/dpsw*
 
+DPLL SUBSYSTEM
+M:	Vadim Fedorenko <vadim.fedorenko@linux.dev>
+M:	Arkadiusz Kubalewski <arkadiusz.kubalewski@intel.com>
+M:	Jiri Pirko <jiri@resnulli.us>
+L:	netdev@vger.kernel.org
+S:	Supported
+F:	Documentation/driver-api/dpll.rst
+F:	drivers/dpll/*
+F:	include/net/dpll.h
+F:	include/uapi/linux/dpll.h
+
 DRBD DRIVER
 M:	Philipp Reisner <philipp.reisner@linbit.com>
 M:	Lars Ellenberg <lars.ellenberg@linbit.com>
diff --git a/drivers/Kconfig b/drivers/Kconfig
index efb66e25fa2d..8ba3e8b9ad72 100644
--- a/drivers/Kconfig
+++ b/drivers/Kconfig
@@ -243,4 +243,6 @@ source "drivers/hte/Kconfig"
 
 source "drivers/cdx/Kconfig"
 
+source "drivers/dpll/Kconfig"
+
 endmenu
diff --git a/drivers/Makefile b/drivers/Makefile
index 1bec7819a837..722d15be0eb7 100644
--- a/drivers/Makefile
+++ b/drivers/Makefile
@@ -197,5 +197,6 @@ obj-$(CONFIG_PECI)		+= peci/
 obj-$(CONFIG_HTE)		+= hte/
 obj-$(CONFIG_DRM_ACCEL)		+= accel/
 obj-$(CONFIG_CDX_BUS)		+= cdx/
+obj-$(CONFIG_DPLL)		+= dpll/
 
 obj-$(CONFIG_S390)		+= s390/
diff --git a/drivers/dpll/Kconfig b/drivers/dpll/Kconfig
new file mode 100644
index 000000000000..a4cae73f20d3
--- /dev/null
+++ b/drivers/dpll/Kconfig
@@ -0,0 +1,7 @@
+# SPDX-License-Identifier: GPL-2.0-only
+#
+# Generic DPLL drivers configuration
+#
+
+config DPLL
+  bool
diff --git a/drivers/dpll/Makefile b/drivers/dpll/Makefile
new file mode 100644
index 000000000000..2e5b27850110
--- /dev/null
+++ b/drivers/dpll/Makefile
@@ -0,0 +1,9 @@
+# SPDX-License-Identifier: GPL-2.0
+#
+# Makefile for DPLL drivers.
+#
+
+obj-$(CONFIG_DPLL)      += dpll.o
+dpll-y                  += dpll_core.o
+dpll-y                  += dpll_netlink.o
+dpll-y                  += dpll_nl.o
diff --git a/drivers/dpll/dpll_core.c b/drivers/dpll/dpll_core.c
new file mode 100644
index 000000000000..6449ba6a383b
--- /dev/null
+++ b/drivers/dpll/dpll_core.c
@@ -0,0 +1,789 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ *  dpll_core.c - DPLL subsystem kernel-space interface implementation.
+ *
+ *  Copyright (c) 2023 Meta Platforms, Inc. and affiliates
+ *  Copyright (c) 2023 Intel Corporation.
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/device.h>
+#include <linux/err.h>
+#include <linux/slab.h>
+#include <linux/string.h>
+
+#include "dpll_core.h"
+
+/* Mutex lock to protect DPLL subsystem devices and pins */
+DEFINE_MUTEX(dpll_lock);
+
+DEFINE_XARRAY_FLAGS(dpll_device_xa, XA_FLAGS_ALLOC);
+DEFINE_XARRAY_FLAGS(dpll_pin_xa, XA_FLAGS_ALLOC);
+
+static u32 dpll_xa_id;
+
+#define ASSERT_DPLL_REGISTERED(d)	\
+	WARN_ON_ONCE(!xa_get_mark(&dpll_device_xa, (d)->id, DPLL_REGISTERED))
+#define ASSERT_DPLL_NOT_REGISTERED(d)	\
+	WARN_ON_ONCE(xa_get_mark(&dpll_device_xa, (d)->id, DPLL_REGISTERED))
+#define ASSERT_PIN_REGISTERED(p)	\
+	WARN_ON_ONCE(!xa_get_mark(&dpll_pin_xa, (p)->id, DPLL_REGISTERED))
+
+struct dpll_device_registration {
+	struct list_head list;
+	const struct dpll_device_ops *ops;
+	void *priv;
+};
+
+struct dpll_pin_registration {
+	struct list_head list;
+	const struct dpll_pin_ops *ops;
+	void *priv;
+};
+
+struct dpll_device *dpll_device_get_by_id(int id)
+{
+	if (xa_get_mark(&dpll_device_xa, id, DPLL_REGISTERED))
+		return xa_load(&dpll_device_xa, id);
+
+	return NULL;
+}
+
+static struct dpll_pin_registration *
+dpll_pin_registration_find(struct dpll_pin_ref *ref,
+			   const struct dpll_pin_ops *ops, void *priv)
+{
+	struct dpll_pin_registration *reg;
+
+	list_for_each_entry(reg, &ref->registration_list, list) {
+		if (reg->ops == ops && reg->priv == priv)
+			return reg;
+	}
+	return NULL;
+}
+
+static int
+dpll_xa_ref_pin_add(struct xarray *xa_pins, struct dpll_pin *pin,
+		    const struct dpll_pin_ops *ops, void *priv)
+{
+	struct dpll_pin_registration *reg;
+	struct dpll_pin_ref *ref;
+	bool ref_exists = false;
+	unsigned long i;
+	int ret;
+
+	xa_for_each(xa_pins, i, ref) {
+		if (ref->pin != pin)
+			continue;
+		reg = dpll_pin_registration_find(ref, ops, priv);
+		if (reg) {
+			refcount_inc(&ref->refcount);
+			return 0;
+		}
+		ref_exists = true;
+		break;
+	}
+
+	if (!ref_exists) {
+		ref = kzalloc(sizeof(*ref), GFP_KERNEL);
+		if (!ref)
+			return -ENOMEM;
+		ref->pin = pin;
+		INIT_LIST_HEAD(&ref->registration_list);
+		ret = xa_insert(xa_pins, pin->pin_idx, ref, GFP_KERNEL);
+		if (ret) {
+			kfree(ref);
+			return ret;
+		}
+		refcount_set(&ref->refcount, 1);
+	}
+
+	reg = kzalloc(sizeof(*reg), GFP_KERNEL);
+	if (!reg) {
+		if (!ref_exists) {
+			xa_erase(xa_pins, pin->pin_idx);
+			kfree(ref);
+		}
+		return -ENOMEM;
+	}
+	reg->ops = ops;
+	reg->priv = priv;
+	if (ref_exists)
+		refcount_inc(&ref->refcount);
+	list_add_tail(&reg->list, &ref->registration_list);
+
+	return 0;
+}
+
+static int dpll_xa_ref_pin_del(struct xarray *xa_pins, struct dpll_pin *pin,
+			       const struct dpll_pin_ops *ops, void *priv)
+{
+	struct dpll_pin_registration *reg;
+	struct dpll_pin_ref *ref;
+	unsigned long i;
+
+	xa_for_each(xa_pins, i, ref) {
+		if (ref->pin != pin)
+			continue;
+		reg = dpll_pin_registration_find(ref, ops, priv);
+		if (WARN_ON(!reg))
+			return -EINVAL;
+		if (refcount_dec_and_test(&ref->refcount)) {
+			list_del(&reg->list);
+			kfree(reg);
+			xa_erase(xa_pins, i);
+			WARN_ON(!list_empty(&ref->registration_list));
+			kfree(ref);
+		}
+		return 0;
+	}
+
+	return -EINVAL;
+}
+
+static int
+dpll_xa_ref_dpll_add(struct xarray *xa_dplls, struct dpll_device *dpll,
+		     const struct dpll_pin_ops *ops, void *priv)
+{
+	struct dpll_pin_registration *reg;
+	struct dpll_pin_ref *ref;
+	bool ref_exists = false;
+	unsigned long i;
+	int ret;
+
+	xa_for_each(xa_dplls, i, ref) {
+		if (ref->dpll != dpll)
+			continue;
+		reg = dpll_pin_registration_find(ref, ops, priv);
+		if (reg) {
+			refcount_inc(&ref->refcount);
+			return 0;
+		}
+		ref_exists = true;
+		break;
+	}
+
+	if (!ref_exists) {
+		ref = kzalloc(sizeof(*ref), GFP_KERNEL);
+		if (!ref)
+			return -ENOMEM;
+		ref->dpll = dpll;
+		INIT_LIST_HEAD(&ref->registration_list);
+		ret = xa_insert(xa_dplls, dpll->id, ref, GFP_KERNEL);
+		if (ret) {
+			kfree(ref);
+			return ret;
+		}
+		refcount_set(&ref->refcount, 1);
+	}
+
+	reg = kzalloc(sizeof(*reg), GFP_KERNEL);
+	if (!reg) {
+		if (!ref_exists) {
+			xa_erase(xa_dplls, dpll->id);
+			kfree(ref);
+		}
+		return -ENOMEM;
+	}
+	reg->ops = ops;
+	reg->priv = priv;
+	if (ref_exists)
+		refcount_inc(&ref->refcount);
+	list_add_tail(&reg->list, &ref->registration_list);
+
+	return 0;
+}
+
+static void
+dpll_xa_ref_dpll_del(struct xarray *xa_dplls, struct dpll_device *dpll,
+		     const struct dpll_pin_ops *ops, void *priv)
+{
+	struct dpll_pin_registration *reg;
+	struct dpll_pin_ref *ref;
+	unsigned long i;
+
+	xa_for_each(xa_dplls, i, ref) {
+		if (ref->dpll != dpll)
+			continue;
+		reg = dpll_pin_registration_find(ref, ops, priv);
+		if (WARN_ON(!reg))
+			return;
+		if (refcount_dec_and_test(&ref->refcount)) {
+			list_del(&reg->list);
+			kfree(reg);
+			xa_erase(xa_dplls, i);
+			WARN_ON(!list_empty(&ref->registration_list));
+			kfree(ref);
+		}
+		return;
+	}
+}
+
+struct dpll_pin_ref *dpll_xa_ref_dpll_first(struct xarray *xa_refs)
+{
+	struct dpll_pin_ref *ref;
+	unsigned long i = 0;
+
+	ref = xa_find(xa_refs, &i, ULONG_MAX, XA_PRESENT);
+	WARN_ON(!ref);
+	return ref;
+}
+
+static struct dpll_device *
+dpll_device_alloc(const u64 clock_id, u32 device_idx, struct module *module)
+{
+	struct dpll_device *dpll;
+	int ret;
+
+	dpll = kzalloc(sizeof(*dpll), GFP_KERNEL);
+	if (!dpll)
+		return ERR_PTR(-ENOMEM);
+	refcount_set(&dpll->refcount, 1);
+	INIT_LIST_HEAD(&dpll->registration_list);
+	dpll->device_idx = device_idx;
+	dpll->clock_id = clock_id;
+	dpll->module = module;
+	ret = xa_alloc_cyclic(&dpll_device_xa, &dpll->id, dpll, xa_limit_32b,
+			      &dpll_xa_id, GFP_KERNEL);
+	if (ret < 0) {
+		kfree(dpll);
+		return ERR_PTR(ret);
+	}
+	xa_init_flags(&dpll->pin_refs, XA_FLAGS_ALLOC);
+
+	return dpll;
+}
+
+/**
+ * dpll_device_get - find existing or create new dpll device
+ * @clock_id: clock_id of creator
+ * @device_idx: idx given by device driver
+ * @module: reference to registering module
+ *
+ * Get existing object of a dpll device, unique for given arguments.
+ * Create new if doesn't exist yet.
+ *
+ * Context: Acquires a lock (dpll_lock)
+ * Return:
+ * * valid dpll_device struct pointer if succeeded
+ * * ERR_PTR(X) - error
+ */
+struct dpll_device *
+dpll_device_get(u64 clock_id, u32 device_idx, struct module *module)
+{
+	struct dpll_device *dpll, *ret = NULL;
+	unsigned long index;
+
+	mutex_lock(&dpll_lock);
+	xa_for_each(&dpll_device_xa, index, dpll) {
+		if (dpll->clock_id == clock_id &&
+		    dpll->device_idx == device_idx &&
+		    dpll->module == module) {
+			ret = dpll;
+			refcount_inc(&ret->refcount);
+			break;
+		}
+	}
+	if (!ret)
+		ret = dpll_device_alloc(clock_id, device_idx, module);
+	mutex_unlock(&dpll_lock);
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(dpll_device_get);
+
+/**
+ * dpll_device_put - decrease the refcount and free memory if possible
+ * @dpll: dpll_device struct pointer
+ *
+ * Context: Acquires a lock (dpll_lock)
+ * Drop reference for a dpll device, if all references are gone, delete
+ * dpll device object.
+ */
+void dpll_device_put(struct dpll_device *dpll)
+{
+	mutex_lock(&dpll_lock);
+	if (refcount_dec_and_test(&dpll->refcount)) {
+		ASSERT_DPLL_NOT_REGISTERED(dpll);
+		WARN_ON_ONCE(!xa_empty(&dpll->pin_refs));
+		xa_destroy(&dpll->pin_refs);
+		xa_erase(&dpll_device_xa, dpll->id);
+		WARN_ON(!list_empty(&dpll->registration_list));
+		kfree(dpll);
+	}
+	mutex_unlock(&dpll_lock);
+}
+EXPORT_SYMBOL_GPL(dpll_device_put);
+
+static struct dpll_device_registration *
+dpll_device_registration_find(struct dpll_device *dpll,
+			      const struct dpll_device_ops *ops, void *priv)
+{
+	struct dpll_device_registration *reg;
+
+	list_for_each_entry(reg, &dpll->registration_list, list) {
+		if (reg->ops == ops && reg->priv == priv)
+			return reg;
+	}
+	return NULL;
+}
+
+/**
+ * dpll_device_register - register the dpll device in the subsystem
+ * @dpll: pointer to a dpll
+ * @type: type of a dpll
+ * @ops: ops for a dpll device
+ * @priv: pointer to private information of owner
+ *
+ * Make dpll device available for user space.
+ *
+ * Context: Acquires a lock (dpll_lock)
+ * Return:
+ * * 0 on success
+ * * negative - error value
+ */
+int dpll_device_register(struct dpll_device *dpll, enum dpll_type type,
+			 const struct dpll_device_ops *ops, void *priv)
+{
+	struct dpll_device_registration *reg;
+	bool first_registration = false;
+
+	if (WARN_ON(!ops))
+		return -EINVAL;
+	if (WARN_ON(!ops->mode_get))
+		return -EINVAL;
+	if (WARN_ON(!ops->lock_status_get))
+		return -EINVAL;
+	if (WARN_ON(type < DPLL_TYPE_PPS || type > DPLL_TYPE_MAX))
+		return -EINVAL;
+
+	mutex_lock(&dpll_lock);
+	reg = dpll_device_registration_find(dpll, ops, priv);
+	if (reg) {
+		mutex_unlock(&dpll_lock);
+		return -EEXIST;
+	}
+
+	reg = kzalloc(sizeof(*reg), GFP_KERNEL);
+	if (!reg) {
+		mutex_unlock(&dpll_lock);
+		return -ENOMEM;
+	}
+	reg->ops = ops;
+	reg->priv = priv;
+	dpll->type = type;
+	first_registration = list_empty(&dpll->registration_list);
+	list_add_tail(&reg->list, &dpll->registration_list);
+	if (!first_registration) {
+		mutex_unlock(&dpll_lock);
+		return 0;
+	}
+
+	xa_set_mark(&dpll_device_xa, dpll->id, DPLL_REGISTERED);
+	mutex_unlock(&dpll_lock);
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(dpll_device_register);
+
+/**
+ * dpll_device_unregister - unregister dpll device
+ * @dpll: registered dpll pointer
+ * @ops: ops for a dpll device
+ * @priv: pointer to private information of owner
+ *
+ * Unregister device, make it unavailable for userspace.
+ * Note: It does not free the memory
+ * Context: Acquires a lock (dpll_lock)
+ */
+void dpll_device_unregister(struct dpll_device *dpll,
+			    const struct dpll_device_ops *ops, void *priv)
+{
+	struct dpll_device_registration *reg;
+
+	mutex_lock(&dpll_lock);
+	ASSERT_DPLL_REGISTERED(dpll);
+	reg = dpll_device_registration_find(dpll, ops, priv);
+	if (WARN_ON(!reg)) {
+		mutex_unlock(&dpll_lock);
+		return;
+	}
+	list_del(&reg->list);
+	kfree(reg);
+
+	if (!list_empty(&dpll->registration_list)) {
+		mutex_unlock(&dpll_lock);
+		return;
+	}
+	xa_clear_mark(&dpll_device_xa, dpll->id, DPLL_REGISTERED);
+	mutex_unlock(&dpll_lock);
+}
+EXPORT_SYMBOL_GPL(dpll_device_unregister);
+
+static struct dpll_pin *
+dpll_pin_alloc(u64 clock_id, u32 pin_idx, struct module *module,
+	       const struct dpll_pin_properties *prop)
+{
+	struct dpll_pin *pin;
+	int ret;
+
+	pin = kzalloc(sizeof(*pin), GFP_KERNEL);
+	if (!pin)
+		return ERR_PTR(-ENOMEM);
+	pin->pin_idx = pin_idx;
+	pin->clock_id = clock_id;
+	pin->module = module;
+	if (WARN_ON(prop->type < DPLL_PIN_TYPE_MUX ||
+		    prop->type > DPLL_PIN_TYPE_MAX)) {
+		ret = -EINVAL;
+		goto err;
+	}
+	pin->prop = prop;
+	refcount_set(&pin->refcount, 1);
+	xa_init_flags(&pin->dpll_refs, XA_FLAGS_ALLOC);
+	xa_init_flags(&pin->parent_refs, XA_FLAGS_ALLOC);
+	ret = xa_alloc(&dpll_pin_xa, &pin->id, pin, xa_limit_16b, GFP_KERNEL);
+	if (ret)
+		goto err;
+	return pin;
+err:
+	xa_destroy(&pin->dpll_refs);
+	xa_destroy(&pin->parent_refs);
+	kfree(pin);
+	return ERR_PTR(ret);
+}
+
+/**
+ * dpll_pin_get - find existing or create new dpll pin
+ * @clock_id: clock_id of creator
+ * @pin_idx: idx given by dev driver
+ * @module: reference to registering module
+ * @prop: dpll pin properties
+ *
+ * Get existing object of a pin (unique for given arguments) or create new
+ * if doesn't exist yet.
+ *
+ * Context: Acquires a lock (dpll_lock)
+ * Return:
+ * * valid allocated dpll_pin struct pointer if succeeded
+ * * ERR_PTR(X) - error
+ */
+struct dpll_pin *
+dpll_pin_get(u64 clock_id, u32 pin_idx, struct module *module,
+	     const struct dpll_pin_properties *prop)
+{
+	struct dpll_pin *pos, *ret = NULL;
+	unsigned long i;
+
+	mutex_lock(&dpll_lock);
+	xa_for_each(&dpll_pin_xa, i, pos) {
+		if (pos->clock_id == clock_id &&
+		    pos->pin_idx == pin_idx &&
+		    pos->module == module) {
+			ret = pos;
+			refcount_inc(&ret->refcount);
+			break;
+		}
+	}
+	if (!ret)
+		ret = dpll_pin_alloc(clock_id, pin_idx, module, prop);
+	mutex_unlock(&dpll_lock);
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(dpll_pin_get);
+
+/**
+ * dpll_pin_put - decrease the refcount and free memory if possible
+ * @pin: pointer to a pin to be put
+ *
+ * Drop reference for a pin, if all references are gone, delete pin object.
+ *
+ * Context: Acquires a lock (dpll_lock)
+ */
+void dpll_pin_put(struct dpll_pin *pin)
+{
+	mutex_lock(&dpll_lock);
+	if (refcount_dec_and_test(&pin->refcount)) {
+		xa_destroy(&pin->dpll_refs);
+		xa_destroy(&pin->parent_refs);
+		xa_erase(&dpll_pin_xa, pin->id);
+		kfree(pin);
+	}
+	mutex_unlock(&dpll_lock);
+}
+EXPORT_SYMBOL_GPL(dpll_pin_put);
+
+static int
+__dpll_pin_register(struct dpll_device *dpll, struct dpll_pin *pin,
+		    const struct dpll_pin_ops *ops, void *priv)
+{
+	int ret;
+
+	ret = dpll_xa_ref_pin_add(&dpll->pin_refs, pin, ops, priv);
+	if (ret)
+		return ret;
+	ret = dpll_xa_ref_dpll_add(&pin->dpll_refs, dpll, ops, priv);
+	if (ret)
+		goto ref_pin_del;
+	xa_set_mark(&dpll_pin_xa, pin->id, DPLL_REGISTERED);
+
+	return ret;
+
+ref_pin_del:
+	dpll_xa_ref_pin_del(&dpll->pin_refs, pin, ops, priv);
+	return ret;
+}
+
+/**
+ * dpll_pin_register - register the dpll pin in the subsystem
+ * @dpll: pointer to a dpll
+ * @pin: pointer to a dpll pin
+ * @ops: ops for a dpll pin ops
+ * @priv: pointer to private information of owner
+ *
+ * Context: Acquires a lock (dpll_lock)
+ * Return:
+ * * 0 on success
+ * * negative - error value
+ */
+int
+dpll_pin_register(struct dpll_device *dpll, struct dpll_pin *pin,
+		  const struct dpll_pin_ops *ops, void *priv)
+{
+	int ret;
+
+	if (WARN_ON(!ops) ||
+	    WARN_ON(!ops->state_on_dpll_get) ||
+	    WARN_ON(!ops->direction_get))
+		return -EINVAL;
+	if (ASSERT_DPLL_REGISTERED(dpll))
+		return -EINVAL;
+
+	mutex_lock(&dpll_lock);
+	if (WARN_ON(!(dpll->module == pin->module &&
+		      dpll->clock_id == pin->clock_id)))
+		ret = -EINVAL;
+	else
+		ret = __dpll_pin_register(dpll, pin, ops, priv);
+	mutex_unlock(&dpll_lock);
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(dpll_pin_register);
+
+static void
+__dpll_pin_unregister(struct dpll_device *dpll, struct dpll_pin *pin,
+		      const struct dpll_pin_ops *ops, void *priv)
+{
+	dpll_xa_ref_pin_del(&dpll->pin_refs, pin, ops, priv);
+	dpll_xa_ref_dpll_del(&pin->dpll_refs, dpll, ops, priv);
+	if (xa_empty(&pin->dpll_refs))
+		xa_clear_mark(&dpll_pin_xa, pin->id, DPLL_REGISTERED);
+}
+
+/**
+ * dpll_pin_unregister - unregister dpll pin from dpll device
+ * @dpll: registered dpll pointer
+ * @pin: pointer to a pin
+ * @ops: ops for a dpll pin
+ * @priv: pointer to private information of owner
+ *
+ * Note: It does not free the memory
+ * Context: Acquires a lock (dpll_lock)
+ */
+void dpll_pin_unregister(struct dpll_device *dpll, struct dpll_pin *pin,
+			 const struct dpll_pin_ops *ops, void *priv)
+{
+	if (WARN_ON(xa_empty(&dpll->pin_refs)))
+		return;
+	if (WARN_ON(!xa_empty(&pin->parent_refs)))
+		return;
+
+	mutex_lock(&dpll_lock);
+	__dpll_pin_unregister(dpll, pin, ops, priv);
+	mutex_unlock(&dpll_lock);
+}
+EXPORT_SYMBOL_GPL(dpll_pin_unregister);
+
+/**
+ * dpll_pin_on_pin_register - register a pin with a parent pin
+ * @parent: pointer to a parent pin
+ * @pin: pointer to a pin
+ * @ops: ops for a dpll pin
+ * @priv: pointer to private information of owner
+ *
+ * Register a pin with a parent pin, create references between them and
+ * between newly registered pin and dplls connected with a parent pin.
+ *
+ * Context: Acquires a lock (dpll_lock)
+ * Return:
+ * * 0 on success
+ * * negative - error value
+ */
+int dpll_pin_on_pin_register(struct dpll_pin *parent, struct dpll_pin *pin,
+			     const struct dpll_pin_ops *ops, void *priv)
+{
+	struct dpll_pin_ref *ref;
+	unsigned long i, stop;
+	int ret;
+
+	if (WARN_ON(parent->prop->type != DPLL_PIN_TYPE_MUX))
+		return -EINVAL;
+
+	if (WARN_ON(!ops) ||
+	    WARN_ON(!ops->state_on_pin_get) ||
+	    WARN_ON(!ops->direction_get))
+		return -EINVAL;
+	if (ASSERT_PIN_REGISTERED(parent))
+		return -EINVAL;
+
+	mutex_lock(&dpll_lock);
+	ret = dpll_xa_ref_pin_add(&pin->parent_refs, parent, ops, priv);
+	if (ret)
+		goto unlock;
+	refcount_inc(&pin->refcount);
+	xa_for_each(&parent->dpll_refs, i, ref) {
+		ret = __dpll_pin_register(ref->dpll, pin, ops, priv);
+		if (ret) {
+			stop = i;
+			goto dpll_unregister;
+		}
+	}
+	mutex_unlock(&dpll_lock);
+
+	return ret;
+
+dpll_unregister:
+	xa_for_each(&parent->dpll_refs, i, ref)
+		if (i < stop)
+			__dpll_pin_unregister(ref->dpll, pin, ops, priv);
+	refcount_dec(&pin->refcount);
+	dpll_xa_ref_pin_del(&pin->parent_refs, parent, ops, priv);
+unlock:
+	mutex_unlock(&dpll_lock);
+	return ret;
+}
+EXPORT_SYMBOL_GPL(dpll_pin_on_pin_register);
+
+/**
+ * dpll_pin_on_pin_unregister - unregister dpll pin from a parent pin
+ * @parent: pointer to a parent pin
+ * @pin: pointer to a pin
+ * @ops: ops for a dpll pin
+ * @priv: pointer to private information of owner
+ *
+ * Context: Acquires a lock (dpll_lock)
+ * Note: It does not free the memory
+ */
+void dpll_pin_on_pin_unregister(struct dpll_pin *parent, struct dpll_pin *pin,
+				const struct dpll_pin_ops *ops, void *priv)
+{
+	struct dpll_pin_ref *ref;
+	unsigned long i;
+
+	mutex_lock(&dpll_lock);
+	dpll_xa_ref_pin_del(&pin->parent_refs, parent, ops, priv);
+	refcount_dec(&pin->refcount);
+	xa_for_each(&pin->dpll_refs, i, ref)
+		__dpll_pin_unregister(ref->dpll, pin, ops, priv);
+	mutex_unlock(&dpll_lock);
+}
+EXPORT_SYMBOL_GPL(dpll_pin_on_pin_unregister);
+
+static struct dpll_device_registration *
+dpll_device_registration_first(struct dpll_device *dpll)
+{
+	struct dpll_device_registration *reg;
+
+	reg = list_first_entry_or_null((struct list_head *)&dpll->registration_list,
+				       struct dpll_device_registration, list);
+	WARN_ON(!reg);
+	return reg;
+}
+
+void *dpll_priv(struct dpll_device *dpll)
+{
+	struct dpll_device_registration *reg;
+
+	reg = dpll_device_registration_first(dpll);
+	return reg->priv;
+}
+
+const struct dpll_device_ops *dpll_device_ops(struct dpll_device *dpll)
+{
+	struct dpll_device_registration *reg;
+
+	reg = dpll_device_registration_first(dpll);
+	return reg->ops;
+}
+
+static struct dpll_pin_registration *
+dpll_pin_registration_first(struct dpll_pin_ref *ref)
+{
+	struct dpll_pin_registration *reg;
+
+	reg = list_first_entry_or_null(&ref->registration_list,
+				       struct dpll_pin_registration, list);
+	WARN_ON(!reg);
+	return reg;
+}
+
+void *dpll_pin_on_dpll_priv(struct dpll_device *dpll,
+			    struct dpll_pin *pin)
+{
+	struct dpll_pin_registration *reg;
+	struct dpll_pin_ref *ref;
+
+	ref = xa_load(&dpll->pin_refs, pin->pin_idx);
+	if (!ref)
+		return NULL;
+	reg = dpll_pin_registration_first(ref);
+	return reg->priv;
+}
+
+void *dpll_pin_on_pin_priv(struct dpll_pin *parent,
+			   struct dpll_pin *pin)
+{
+	struct dpll_pin_registration *reg;
+	struct dpll_pin_ref *ref;
+
+	ref = xa_load(&pin->parent_refs, parent->pin_idx);
+	if (!ref)
+		return NULL;
+	reg = dpll_pin_registration_first(ref);
+	return reg->priv;
+}
+
+const struct dpll_pin_ops *dpll_pin_ops(struct dpll_pin_ref *ref)
+{
+	struct dpll_pin_registration *reg;
+
+	reg = dpll_pin_registration_first(ref);
+	return reg->ops;
+}
+
+static int __init dpll_init(void)
+{
+	int ret;
+
+	ret = genl_register_family(&dpll_nl_family);
+	if (ret)
+		goto error;
+
+	return 0;
+
+error:
+	mutex_destroy(&dpll_lock);
+	return ret;
+}
+
+static void __exit dpll_exit(void)
+{
+	genl_unregister_family(&dpll_nl_family);
+	mutex_destroy(&dpll_lock);
+}
+
+subsys_initcall(dpll_init);
+module_exit(dpll_exit);
diff --git a/drivers/dpll/dpll_core.h b/drivers/dpll/dpll_core.h
new file mode 100644
index 000000000000..5585873c5c1b
--- /dev/null
+++ b/drivers/dpll/dpll_core.h
@@ -0,0 +1,89 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ *  Copyright (c) 2023 Meta Platforms, Inc. and affiliates
+ *  Copyright (c) 2023 Intel and affiliates
+ */
+
+#ifndef __DPLL_CORE_H__
+#define __DPLL_CORE_H__
+
+#include <linux/dpll.h>
+#include <linux/list.h>
+#include <linux/refcount.h>
+#include "dpll_nl.h"
+
+#define DPLL_REGISTERED		XA_MARK_1
+
+/**
+ * struct dpll_device - stores DPLL device internal data
+ * @id:			unique id number for device given by dpll subsystem
+ * @device_idx:		id given by dev driver
+ * @clock_id:		unique identifier (clock_id) of a dpll
+ * @module:		module of creator
+ * @type:		type of a dpll
+ * @pin_refs:		stores pins registered within a dpll
+ * @refcount:		refcount
+ * @registration_list:	list of registered ops and priv data of dpll owners
+ **/
+struct dpll_device {
+	u32 id;
+	u32 device_idx;
+	u64 clock_id;
+	struct module *module;
+	enum dpll_type type;
+	struct xarray pin_refs;
+	refcount_t refcount;
+	struct list_head registration_list;
+};
+
+/**
+ * struct dpll_pin - structure for a dpll pin
+ * @id:			unique id number for pin given by dpll subsystem
+ * @pin_idx:		index of a pin given by dev driver
+ * @clock_id:		clock_id of creator
+ * @module:		module of creator
+ * @dpll_refs:		hold referencees to dplls pin was registered with
+ * @parent_refs:	hold references to parent pins pin was registered with
+ * @prop:		pointer to pin properties given by registerer
+ * @rclk_dev_name:	holds name of device when pin can recover clock from it
+ * @refcount:		refcount
+ **/
+struct dpll_pin {
+	u32 id;
+	u32 pin_idx;
+	u64 clock_id;
+	struct module *module;
+	struct xarray dpll_refs;
+	struct xarray parent_refs;
+	const struct dpll_pin_properties *prop;
+	refcount_t refcount;
+};
+
+/**
+ * struct dpll_pin_ref - structure for referencing either dpll or pins
+ * @dpll:		pointer to a dpll
+ * @pin:		pointer to a pin
+ * @registration_list:	list of ops and priv data registered with the ref
+ * @refcount:		refcount
+ **/
+struct dpll_pin_ref {
+	union {
+		struct dpll_device *dpll;
+		struct dpll_pin *pin;
+	};
+	struct list_head registration_list;
+	refcount_t refcount;
+};
+
+void *dpll_priv(struct dpll_device *dpll);
+void *dpll_pin_on_dpll_priv(struct dpll_device *dpll, struct dpll_pin *pin);
+void *dpll_pin_on_pin_priv(struct dpll_pin *parent, struct dpll_pin *pin);
+
+const struct dpll_device_ops *dpll_device_ops(struct dpll_device *dpll);
+struct dpll_device *dpll_device_get_by_id(int id);
+const struct dpll_pin_ops *dpll_pin_ops(struct dpll_pin_ref *ref);
+struct dpll_pin_ref *dpll_xa_ref_dpll_first(struct xarray *xa_refs);
+extern struct xarray dpll_device_xa;
+extern struct xarray dpll_pin_xa;
+extern struct mutex dpll_lock;
+#endif
diff --git a/include/linux/dpll.h b/include/linux/dpll.h
new file mode 100644
index 000000000000..b47c3560b937
--- /dev/null
+++ b/include/linux/dpll.h
@@ -0,0 +1,133 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ *  Copyright (c) 2023 Meta Platforms, Inc. and affiliates
+ *  Copyright (c) 2023 Intel and affiliates
+ */
+
+#ifndef __DPLL_H__
+#define __DPLL_H__
+
+#include <uapi/linux/dpll.h>
+#include <linux/device.h>
+#include <linux/netlink.h>
+
+struct dpll_device;
+struct dpll_pin;
+
+struct dpll_device_ops {
+	int (*mode_get)(const struct dpll_device *dpll, void *dpll_priv,
+			enum dpll_mode *mode, struct netlink_ext_ack *extack);
+	bool (*mode_supported)(const struct dpll_device *dpll, void *dpll_priv,
+			       const enum dpll_mode mode,
+			       struct netlink_ext_ack *extack);
+	int (*lock_status_get)(const struct dpll_device *dpll, void *dpll_priv,
+			       enum dpll_lock_status *status,
+			       struct netlink_ext_ack *extack);
+	int (*temp_get)(const struct dpll_device *dpll, void *dpll_priv,
+			s32 *temp, struct netlink_ext_ack *extack);
+};
+
+struct dpll_pin_ops {
+	int (*frequency_set)(const struct dpll_pin *pin, void *pin_priv,
+			     const struct dpll_device *dpll, void *dpll_priv,
+			     const u64 frequency,
+			     struct netlink_ext_ack *extack);
+	int (*frequency_get)(const struct dpll_pin *pin, void *pin_priv,
+			     const struct dpll_device *dpll, void *dpll_priv,
+			     u64 *frequency, struct netlink_ext_ack *extack);
+	int (*direction_set)(const struct dpll_pin *pin, void *pin_priv,
+			     const struct dpll_device *dpll, void *dpll_priv,
+			     const enum dpll_pin_direction direction,
+			     struct netlink_ext_ack *extack);
+	int (*direction_get)(const struct dpll_pin *pin, void *pin_priv,
+			     const struct dpll_device *dpll, void *dpll_priv,
+			     enum dpll_pin_direction *direction,
+			     struct netlink_ext_ack *extack);
+	int (*state_on_pin_get)(const struct dpll_pin *pin, void *pin_priv,
+				const struct dpll_pin *parent_pin,
+				void *parent_pin_priv,
+				enum dpll_pin_state *state,
+				struct netlink_ext_ack *extack);
+	int (*state_on_dpll_get)(const struct dpll_pin *pin, void *pin_priv,
+				 const struct dpll_device *dpll,
+				 void *dpll_priv, enum dpll_pin_state *state,
+				 struct netlink_ext_ack *extack);
+	int (*state_on_pin_set)(const struct dpll_pin *pin, void *pin_priv,
+				const struct dpll_pin *parent_pin,
+				void *parent_pin_priv,
+				const enum dpll_pin_state state,
+				struct netlink_ext_ack *extack);
+	int (*state_on_dpll_set)(const struct dpll_pin *pin, void *pin_priv,
+				 const struct dpll_device *dpll,
+				 void *dpll_priv,
+				 const enum dpll_pin_state state,
+				 struct netlink_ext_ack *extack);
+	int (*prio_get)(const struct dpll_pin *pin,  void *pin_priv,
+			const struct dpll_device *dpll,  void *dpll_priv,
+			u32 *prio, struct netlink_ext_ack *extack);
+	int (*prio_set)(const struct dpll_pin *pin, void *pin_priv,
+			const struct dpll_device *dpll, void *dpll_priv,
+			const u32 prio, struct netlink_ext_ack *extack);
+};
+
+struct dpll_pin_frequency {
+	u64 min;
+	u64 max;
+};
+
+#define DPLL_PIN_FREQUENCY_RANGE(_min, _max)	\
+	{					\
+		.min = _min,			\
+		.max = _max,			\
+	}
+
+#define DPLL_PIN_FREQUENCY(_val) DPLL_PIN_FREQUENCY_RANGE(_val, _val)
+#define DPLL_PIN_FREQUENCY_1PPS \
+	DPLL_PIN_FREQUENCY(DPLL_PIN_FREQUENCY_1_HZ)
+#define DPLL_PIN_FREQUENCY_10MHZ \
+	DPLL_PIN_FREQUENCY(DPLL_PIN_FREQUENCY_10_MHZ)
+#define DPLL_PIN_FREQUENCY_IRIG_B \
+	DPLL_PIN_FREQUENCY(DPLL_PIN_FREQUENCY_10_KHZ)
+#define DPLL_PIN_FREQUENCY_DCF77 \
+	DPLL_PIN_FREQUENCY(DPLL_PIN_FREQUENCY_77_5_KHZ)
+
+struct dpll_pin_properties {
+	const char *board_label;
+	const char *panel_label;
+	const char *package_label;
+	enum dpll_pin_type type;
+	unsigned long capabilities;
+	u32 freq_supported_num;
+	struct dpll_pin_frequency *freq_supported;
+};
+
+struct dpll_device *
+dpll_device_get(u64 clock_id, u32 dev_driver_id, struct module *module);
+
+void dpll_device_put(struct dpll_device *dpll);
+
+int dpll_device_register(struct dpll_device *dpll, enum dpll_type type,
+			 const struct dpll_device_ops *ops, void *priv);
+
+void dpll_device_unregister(struct dpll_device *dpll,
+			    const struct dpll_device_ops *ops, void *priv);
+
+struct dpll_pin *
+dpll_pin_get(u64 clock_id, u32 dev_driver_id, struct module *module,
+	     const struct dpll_pin_properties *prop);
+
+int dpll_pin_register(struct dpll_device *dpll, struct dpll_pin *pin,
+		      const struct dpll_pin_ops *ops, void *priv);
+
+void dpll_pin_unregister(struct dpll_device *dpll, struct dpll_pin *pin,
+			 const struct dpll_pin_ops *ops, void *priv);
+
+void dpll_pin_put(struct dpll_pin *pin);
+
+int dpll_pin_on_pin_register(struct dpll_pin *parent, struct dpll_pin *pin,
+			     const struct dpll_pin_ops *ops, void *priv);
+
+void dpll_pin_on_pin_unregister(struct dpll_pin *parent, struct dpll_pin *pin,
+				const struct dpll_pin_ops *ops, void *priv);
+
+#endif
-- 
cgit v1.2.3


From 9d71b54b65b1fb6c0d3a6c5c88ba9b915c783fbc Mon Sep 17 00:00:00 2001
From: Vadim Fedorenko <vadim.fedorenko@linux.dev>
Date: Wed, 13 Sep 2023 21:49:38 +0100
Subject: dpll: netlink: Add DPLL framework base functions

DPLL framework is used to represent and configure DPLL devices
in systems. Each device that has DPLL and can configure inputs
and outputs can use this framework.

Implement dpll netlink framework functions for enablement of dpll
subsystem netlink family.

Co-developed-by: Milena Olech <milena.olech@intel.com>
Signed-off-by: Milena Olech <milena.olech@intel.com>
Co-developed-by: Michal Michalik <michal.michalik@intel.com>
Signed-off-by: Michal Michalik <michal.michalik@intel.com>
Signed-off-by: Vadim Fedorenko <vadim.fedorenko@linux.dev>
Co-developed-by: Arkadiusz Kubalewski <arkadiusz.kubalewski@intel.com>
Signed-off-by: Arkadiusz Kubalewski <arkadiusz.kubalewski@intel.com>
Signed-off-by: Jiri Pirko <jiri@nvidia.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/dpll/dpll_core.c    |   11 +-
 drivers/dpll/dpll_netlink.c | 1241 +++++++++++++++++++++++++++++++++++++++++++
 drivers/dpll/dpll_netlink.h |   13 +
 include/linux/dpll.h        |    4 +
 4 files changed, 1268 insertions(+), 1 deletion(-)
 create mode 100644 drivers/dpll/dpll_netlink.c
 create mode 100644 drivers/dpll/dpll_netlink.h

(limited to 'include/linux')

diff --git a/drivers/dpll/dpll_core.c b/drivers/dpll/dpll_core.c
index 6449ba6a383b..3568149b9562 100644
--- a/drivers/dpll/dpll_core.c
+++ b/drivers/dpll/dpll_core.c
@@ -14,6 +14,7 @@
 #include <linux/string.h>
 
 #include "dpll_core.h"
+#include "dpll_netlink.h"
 
 /* Mutex lock to protect DPLL subsystem devices and pins */
 DEFINE_MUTEX(dpll_lock);
@@ -381,6 +382,7 @@ int dpll_device_register(struct dpll_device *dpll, enum dpll_type type,
 	}
 
 	xa_set_mark(&dpll_device_xa, dpll->id, DPLL_REGISTERED);
+	dpll_device_create_ntf(dpll);
 	mutex_unlock(&dpll_lock);
 
 	return 0;
@@ -404,6 +406,7 @@ void dpll_device_unregister(struct dpll_device *dpll,
 
 	mutex_lock(&dpll_lock);
 	ASSERT_DPLL_REGISTERED(dpll);
+	dpll_device_delete_ntf(dpll);
 	reg = dpll_device_registration_find(dpll, ops, priv);
 	if (WARN_ON(!reg)) {
 		mutex_unlock(&dpll_lock);
@@ -528,6 +531,7 @@ __dpll_pin_register(struct dpll_device *dpll, struct dpll_pin *pin,
 	if (ret)
 		goto ref_pin_del;
 	xa_set_mark(&dpll_pin_xa, pin->id, DPLL_REGISTERED);
+	dpll_pin_create_ntf(pin);
 
 	return ret;
 
@@ -602,6 +606,7 @@ void dpll_pin_unregister(struct dpll_device *dpll, struct dpll_pin *pin,
 		return;
 
 	mutex_lock(&dpll_lock);
+	dpll_pin_delete_ntf(pin);
 	__dpll_pin_unregister(dpll, pin, ops, priv);
 	mutex_unlock(&dpll_lock);
 }
@@ -650,6 +655,7 @@ int dpll_pin_on_pin_register(struct dpll_pin *parent, struct dpll_pin *pin,
 			stop = i;
 			goto dpll_unregister;
 		}
+		dpll_pin_create_ntf(pin);
 	}
 	mutex_unlock(&dpll_lock);
 
@@ -657,8 +663,10 @@ int dpll_pin_on_pin_register(struct dpll_pin *parent, struct dpll_pin *pin,
 
 dpll_unregister:
 	xa_for_each(&parent->dpll_refs, i, ref)
-		if (i < stop)
+		if (i < stop) {
 			__dpll_pin_unregister(ref->dpll, pin, ops, priv);
+			dpll_pin_delete_ntf(pin);
+		}
 	refcount_dec(&pin->refcount);
 	dpll_xa_ref_pin_del(&pin->parent_refs, parent, ops, priv);
 unlock:
@@ -684,6 +692,7 @@ void dpll_pin_on_pin_unregister(struct dpll_pin *parent, struct dpll_pin *pin,
 	unsigned long i;
 
 	mutex_lock(&dpll_lock);
+	dpll_pin_delete_ntf(pin);
 	dpll_xa_ref_pin_del(&pin->parent_refs, parent, ops, priv);
 	refcount_dec(&pin->refcount);
 	xa_for_each(&pin->dpll_refs, i, ref)
diff --git a/drivers/dpll/dpll_netlink.c b/drivers/dpll/dpll_netlink.c
new file mode 100644
index 000000000000..9464a6865977
--- /dev/null
+++ b/drivers/dpll/dpll_netlink.c
@@ -0,0 +1,1241 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Generic netlink for DPLL management framework
+ *
+ *  Copyright (c) 2023 Meta Platforms, Inc. and affiliates
+ *  Copyright (c) 2023 Intel and affiliates
+ *
+ */
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <net/genetlink.h>
+#include "dpll_core.h"
+#include "dpll_netlink.h"
+#include "dpll_nl.h"
+#include <uapi/linux/dpll.h>
+
+#define ASSERT_NOT_NULL(ptr)	(WARN_ON(!ptr))
+
+#define xa_for_each_marked_start(xa, index, entry, filter, start) \
+	for (index = start, entry = xa_find(xa, &index, ULONG_MAX, filter); \
+	     entry; entry = xa_find_after(xa, &index, ULONG_MAX, filter))
+
+struct dpll_dump_ctx {
+	unsigned long idx;
+};
+
+static struct dpll_dump_ctx *dpll_dump_context(struct netlink_callback *cb)
+{
+	return (struct dpll_dump_ctx *)cb->ctx;
+}
+
+static int
+dpll_msg_add_dev_handle(struct sk_buff *msg, struct dpll_device *dpll)
+{
+	if (nla_put_u32(msg, DPLL_A_ID, dpll->id))
+		return -EMSGSIZE;
+
+	return 0;
+}
+
+static int
+dpll_msg_add_dev_parent_handle(struct sk_buff *msg, u32 id)
+{
+	if (nla_put_u32(msg, DPLL_A_PIN_PARENT_ID, id))
+		return -EMSGSIZE;
+
+	return 0;
+}
+
+/**
+ * dpll_msg_add_pin_handle - attach pin handle attribute to a given message
+ * @msg: pointer to sk_buff message to attach a pin handle
+ * @pin: pin pointer
+ *
+ * Return:
+ * * 0 - success
+ * * -EMSGSIZE - no space in message to attach pin handle
+ */
+static int
+dpll_msg_add_pin_handle(struct sk_buff *msg, struct dpll_pin *pin)
+{
+	if (!pin)
+		return 0;
+	if (nla_put_u32(msg, DPLL_A_PIN_ID, pin->id))
+		return -EMSGSIZE;
+	return 0;
+}
+
+static int
+dpll_msg_add_mode(struct sk_buff *msg, struct dpll_device *dpll,
+		  struct netlink_ext_ack *extack)
+{
+	const struct dpll_device_ops *ops = dpll_device_ops(dpll);
+	enum dpll_mode mode;
+	int ret;
+
+	ret = ops->mode_get(dpll, dpll_priv(dpll), &mode, extack);
+	if (ret)
+		return ret;
+	if (nla_put_u32(msg, DPLL_A_MODE, mode))
+		return -EMSGSIZE;
+
+	return 0;
+}
+
+static int
+dpll_msg_add_mode_supported(struct sk_buff *msg, struct dpll_device *dpll,
+			    struct netlink_ext_ack *extack)
+{
+	const struct dpll_device_ops *ops = dpll_device_ops(dpll);
+	enum dpll_mode mode;
+
+	if (!ops->mode_supported)
+		return 0;
+	for (mode = DPLL_MODE_MANUAL; mode <= DPLL_MODE_MAX; mode++)
+		if (ops->mode_supported(dpll, dpll_priv(dpll), mode, extack))
+			if (nla_put_u32(msg, DPLL_A_MODE_SUPPORTED, mode))
+				return -EMSGSIZE;
+
+	return 0;
+}
+
+static int
+dpll_msg_add_lock_status(struct sk_buff *msg, struct dpll_device *dpll,
+			 struct netlink_ext_ack *extack)
+{
+	const struct dpll_device_ops *ops = dpll_device_ops(dpll);
+	enum dpll_lock_status status;
+	int ret;
+
+	ret = ops->lock_status_get(dpll, dpll_priv(dpll), &status, extack);
+	if (ret)
+		return ret;
+	if (nla_put_u32(msg, DPLL_A_LOCK_STATUS, status))
+		return -EMSGSIZE;
+
+	return 0;
+}
+
+static int
+dpll_msg_add_temp(struct sk_buff *msg, struct dpll_device *dpll,
+		  struct netlink_ext_ack *extack)
+{
+	const struct dpll_device_ops *ops = dpll_device_ops(dpll);
+	s32 temp;
+	int ret;
+
+	if (!ops->temp_get)
+		return 0;
+	ret = ops->temp_get(dpll, dpll_priv(dpll), &temp, extack);
+	if (ret)
+		return ret;
+	if (nla_put_s32(msg, DPLL_A_TEMP, temp))
+		return -EMSGSIZE;
+
+	return 0;
+}
+
+static int
+dpll_msg_add_pin_prio(struct sk_buff *msg, struct dpll_pin *pin,
+		      struct dpll_pin_ref *ref,
+		      struct netlink_ext_ack *extack)
+{
+	const struct dpll_pin_ops *ops = dpll_pin_ops(ref);
+	struct dpll_device *dpll = ref->dpll;
+	u32 prio;
+	int ret;
+
+	if (!ops->prio_get)
+		return 0;
+	ret = ops->prio_get(pin, dpll_pin_on_dpll_priv(dpll, pin), dpll,
+			    dpll_priv(dpll), &prio, extack);
+	if (ret)
+		return ret;
+	if (nla_put_u32(msg, DPLL_A_PIN_PRIO, prio))
+		return -EMSGSIZE;
+
+	return 0;
+}
+
+static int
+dpll_msg_add_pin_on_dpll_state(struct sk_buff *msg, struct dpll_pin *pin,
+			       struct dpll_pin_ref *ref,
+			       struct netlink_ext_ack *extack)
+{
+	const struct dpll_pin_ops *ops = dpll_pin_ops(ref);
+	struct dpll_device *dpll = ref->dpll;
+	enum dpll_pin_state state;
+	int ret;
+
+	if (!ops->state_on_dpll_get)
+		return 0;
+	ret = ops->state_on_dpll_get(pin, dpll_pin_on_dpll_priv(dpll, pin),
+				     dpll, dpll_priv(dpll), &state, extack);
+	if (ret)
+		return ret;
+	if (nla_put_u32(msg, DPLL_A_PIN_STATE, state))
+		return -EMSGSIZE;
+
+	return 0;
+}
+
+static int
+dpll_msg_add_pin_direction(struct sk_buff *msg, struct dpll_pin *pin,
+			   struct dpll_pin_ref *ref,
+			   struct netlink_ext_ack *extack)
+{
+	const struct dpll_pin_ops *ops = dpll_pin_ops(ref);
+	struct dpll_device *dpll = ref->dpll;
+	enum dpll_pin_direction direction;
+	int ret;
+
+	ret = ops->direction_get(pin, dpll_pin_on_dpll_priv(dpll, pin), dpll,
+				 dpll_priv(dpll), &direction, extack);
+	if (ret)
+		return ret;
+	if (nla_put_u32(msg, DPLL_A_PIN_DIRECTION, direction))
+		return -EMSGSIZE;
+
+	return 0;
+}
+
+static int
+dpll_msg_add_pin_freq(struct sk_buff *msg, struct dpll_pin *pin,
+		      struct dpll_pin_ref *ref, struct netlink_ext_ack *extack)
+{
+	const struct dpll_pin_ops *ops = dpll_pin_ops(ref);
+	struct dpll_device *dpll = ref->dpll;
+	struct nlattr *nest;
+	int fs, ret;
+	u64 freq;
+
+	if (!ops->frequency_get)
+		return 0;
+	ret = ops->frequency_get(pin, dpll_pin_on_dpll_priv(dpll, pin), dpll,
+				 dpll_priv(dpll), &freq, extack);
+	if (ret)
+		return ret;
+	if (nla_put_64bit(msg, DPLL_A_PIN_FREQUENCY, sizeof(freq), &freq,
+			  DPLL_A_PIN_PAD))
+		return -EMSGSIZE;
+	for (fs = 0; fs < pin->prop->freq_supported_num; fs++) {
+		nest = nla_nest_start(msg, DPLL_A_PIN_FREQUENCY_SUPPORTED);
+		if (!nest)
+			return -EMSGSIZE;
+		freq = pin->prop->freq_supported[fs].min;
+		if (nla_put_64bit(msg, DPLL_A_PIN_FREQUENCY_MIN, sizeof(freq),
+				  &freq, DPLL_A_PIN_PAD)) {
+			nla_nest_cancel(msg, nest);
+			return -EMSGSIZE;
+		}
+		freq = pin->prop->freq_supported[fs].max;
+		if (nla_put_64bit(msg, DPLL_A_PIN_FREQUENCY_MAX, sizeof(freq),
+				  &freq, DPLL_A_PIN_PAD)) {
+			nla_nest_cancel(msg, nest);
+			return -EMSGSIZE;
+		}
+		nla_nest_end(msg, nest);
+	}
+
+	return 0;
+}
+
+static bool dpll_pin_is_freq_supported(struct dpll_pin *pin, u32 freq)
+{
+	int fs;
+
+	for (fs = 0; fs < pin->prop->freq_supported_num; fs++)
+		if (freq >= pin->prop->freq_supported[fs].min &&
+		    freq <= pin->prop->freq_supported[fs].max)
+			return true;
+	return false;
+}
+
+static int
+dpll_msg_add_pin_parents(struct sk_buff *msg, struct dpll_pin *pin,
+			 struct dpll_pin_ref *dpll_ref,
+			 struct netlink_ext_ack *extack)
+{
+	enum dpll_pin_state state;
+	struct dpll_pin_ref *ref;
+	struct dpll_pin *ppin;
+	struct nlattr *nest;
+	unsigned long index;
+	int ret;
+
+	xa_for_each(&pin->parent_refs, index, ref) {
+		const struct dpll_pin_ops *ops = dpll_pin_ops(ref);
+		void *parent_priv;
+
+		ppin = ref->pin;
+		parent_priv = dpll_pin_on_dpll_priv(dpll_ref->dpll, ppin);
+		ret = ops->state_on_pin_get(pin,
+					    dpll_pin_on_pin_priv(ppin, pin),
+					    ppin, parent_priv, &state, extack);
+		if (ret)
+			return ret;
+		nest = nla_nest_start(msg, DPLL_A_PIN_PARENT_PIN);
+		if (!nest)
+			return -EMSGSIZE;
+		ret = dpll_msg_add_dev_parent_handle(msg, ppin->id);
+		if (ret)
+			goto nest_cancel;
+		if (nla_put_u32(msg, DPLL_A_PIN_STATE, state)) {
+			ret = -EMSGSIZE;
+			goto nest_cancel;
+		}
+		nla_nest_end(msg, nest);
+	}
+
+	return 0;
+
+nest_cancel:
+	nla_nest_cancel(msg, nest);
+	return ret;
+}
+
+static int
+dpll_msg_add_pin_dplls(struct sk_buff *msg, struct dpll_pin *pin,
+		       struct netlink_ext_ack *extack)
+{
+	struct dpll_pin_ref *ref;
+	struct nlattr *attr;
+	unsigned long index;
+	int ret;
+
+	xa_for_each(&pin->dpll_refs, index, ref) {
+		attr = nla_nest_start(msg, DPLL_A_PIN_PARENT_DEVICE);
+		if (!attr)
+			return -EMSGSIZE;
+		ret = dpll_msg_add_dev_parent_handle(msg, ref->dpll->id);
+		if (ret)
+			goto nest_cancel;
+		ret = dpll_msg_add_pin_on_dpll_state(msg, pin, ref, extack);
+		if (ret)
+			goto nest_cancel;
+		ret = dpll_msg_add_pin_prio(msg, pin, ref, extack);
+		if (ret)
+			goto nest_cancel;
+		ret = dpll_msg_add_pin_direction(msg, pin, ref, extack);
+		if (ret)
+			goto nest_cancel;
+		nla_nest_end(msg, attr);
+	}
+
+	return 0;
+
+nest_cancel:
+	nla_nest_end(msg, attr);
+	return ret;
+}
+
+static int
+dpll_cmd_pin_get_one(struct sk_buff *msg, struct dpll_pin *pin,
+		     struct netlink_ext_ack *extack)
+{
+	const struct dpll_pin_properties *prop = pin->prop;
+	struct dpll_pin_ref *ref;
+	int ret;
+
+	ref = dpll_xa_ref_dpll_first(&pin->dpll_refs);
+	ASSERT_NOT_NULL(ref);
+
+	ret = dpll_msg_add_pin_handle(msg, pin);
+	if (ret)
+		return ret;
+	if (nla_put_string(msg, DPLL_A_PIN_MODULE_NAME,
+			   module_name(pin->module)))
+		return -EMSGSIZE;
+	if (nla_put_64bit(msg, DPLL_A_PIN_CLOCK_ID, sizeof(pin->clock_id),
+			  &pin->clock_id, DPLL_A_PIN_PAD))
+		return -EMSGSIZE;
+	if (prop->board_label &&
+	    nla_put_string(msg, DPLL_A_PIN_BOARD_LABEL, prop->board_label))
+		return -EMSGSIZE;
+	if (prop->panel_label &&
+	    nla_put_string(msg, DPLL_A_PIN_PANEL_LABEL, prop->panel_label))
+		return -EMSGSIZE;
+	if (prop->package_label &&
+	    nla_put_string(msg, DPLL_A_PIN_PACKAGE_LABEL,
+			   prop->package_label))
+		return -EMSGSIZE;
+	if (nla_put_u32(msg, DPLL_A_PIN_TYPE, prop->type))
+		return -EMSGSIZE;
+	if (nla_put_u32(msg, DPLL_A_PIN_CAPABILITIES, prop->capabilities))
+		return -EMSGSIZE;
+	ret = dpll_msg_add_pin_freq(msg, pin, ref, extack);
+	if (ret)
+		return ret;
+	if (xa_empty(&pin->parent_refs))
+		ret = dpll_msg_add_pin_dplls(msg, pin, extack);
+	else
+		ret = dpll_msg_add_pin_parents(msg, pin, ref, extack);
+
+	return ret;
+}
+
+static int
+dpll_device_get_one(struct dpll_device *dpll, struct sk_buff *msg,
+		    struct netlink_ext_ack *extack)
+{
+	int ret;
+
+	ret = dpll_msg_add_dev_handle(msg, dpll);
+	if (ret)
+		return ret;
+	if (nla_put_string(msg, DPLL_A_MODULE_NAME, module_name(dpll->module)))
+		return -EMSGSIZE;
+	if (nla_put_64bit(msg, DPLL_A_CLOCK_ID, sizeof(dpll->clock_id),
+			  &dpll->clock_id, DPLL_A_PAD))
+		return -EMSGSIZE;
+	ret = dpll_msg_add_temp(msg, dpll, extack);
+	if (ret)
+		return ret;
+	ret = dpll_msg_add_lock_status(msg, dpll, extack);
+	if (ret)
+		return ret;
+	ret = dpll_msg_add_mode(msg, dpll, extack);
+	if (ret)
+		return ret;
+	ret = dpll_msg_add_mode_supported(msg, dpll, extack);
+	if (ret)
+		return ret;
+	if (nla_put_u32(msg, DPLL_A_TYPE, dpll->type))
+		return -EMSGSIZE;
+
+	return ret;
+}
+
+static int
+dpll_device_event_send(enum dpll_cmd event, struct dpll_device *dpll)
+{
+	struct sk_buff *msg;
+	int ret = -ENOMEM;
+	void *hdr;
+
+	if (WARN_ON(!xa_get_mark(&dpll_device_xa, dpll->id, DPLL_REGISTERED)))
+		return -ENODEV;
+	msg = genlmsg_new(NLMSG_GOODSIZE, GFP_KERNEL);
+	if (!msg)
+		return -ENOMEM;
+	hdr = genlmsg_put(msg, 0, 0, &dpll_nl_family, 0, event);
+	if (!hdr)
+		goto err_free_msg;
+	ret = dpll_device_get_one(dpll, msg, NULL);
+	if (ret)
+		goto err_cancel_msg;
+	genlmsg_end(msg, hdr);
+	genlmsg_multicast(&dpll_nl_family, msg, 0, 0, GFP_KERNEL);
+
+	return 0;
+
+err_cancel_msg:
+	genlmsg_cancel(msg, hdr);
+err_free_msg:
+	nlmsg_free(msg);
+
+	return ret;
+}
+
+int dpll_device_create_ntf(struct dpll_device *dpll)
+{
+	return dpll_device_event_send(DPLL_CMD_DEVICE_CREATE_NTF, dpll);
+}
+
+int dpll_device_delete_ntf(struct dpll_device *dpll)
+{
+	return dpll_device_event_send(DPLL_CMD_DEVICE_DELETE_NTF, dpll);
+}
+
+static int
+__dpll_device_change_ntf(struct dpll_device *dpll)
+{
+	return dpll_device_event_send(DPLL_CMD_DEVICE_CHANGE_NTF, dpll);
+}
+
+/**
+ * dpll_device_change_ntf - notify that the dpll device has been changed
+ * @dpll: registered dpll pointer
+ *
+ * Context: acquires and holds a dpll_lock.
+ * Return: 0 if succeeds, error code otherwise.
+ */
+int dpll_device_change_ntf(struct dpll_device *dpll)
+{
+	int ret;
+
+	mutex_lock(&dpll_lock);
+	ret = __dpll_device_change_ntf(dpll);
+	mutex_unlock(&dpll_lock);
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(dpll_device_change_ntf);
+
+static int
+dpll_pin_event_send(enum dpll_cmd event, struct dpll_pin *pin)
+{
+	struct sk_buff *msg;
+	int ret = -ENOMEM;
+	void *hdr;
+
+	if (WARN_ON(!xa_get_mark(&dpll_pin_xa, pin->id, DPLL_REGISTERED)))
+		return -ENODEV;
+
+	msg = genlmsg_new(NLMSG_GOODSIZE, GFP_KERNEL);
+	if (!msg)
+		return -ENOMEM;
+
+	hdr = genlmsg_put(msg, 0, 0, &dpll_nl_family, 0, event);
+	if (!hdr)
+		goto err_free_msg;
+	ret = dpll_cmd_pin_get_one(msg, pin, NULL);
+	if (ret)
+		goto err_cancel_msg;
+	genlmsg_end(msg, hdr);
+	genlmsg_multicast(&dpll_nl_family, msg, 0, 0, GFP_KERNEL);
+
+	return 0;
+
+err_cancel_msg:
+	genlmsg_cancel(msg, hdr);
+err_free_msg:
+	nlmsg_free(msg);
+
+	return ret;
+}
+
+int dpll_pin_create_ntf(struct dpll_pin *pin)
+{
+	return dpll_pin_event_send(DPLL_CMD_PIN_CREATE_NTF, pin);
+}
+
+int dpll_pin_delete_ntf(struct dpll_pin *pin)
+{
+	return dpll_pin_event_send(DPLL_CMD_PIN_DELETE_NTF, pin);
+}
+
+static int __dpll_pin_change_ntf(struct dpll_pin *pin)
+{
+	return dpll_pin_event_send(DPLL_CMD_PIN_CHANGE_NTF, pin);
+}
+
+/**
+ * dpll_pin_change_ntf - notify that the pin has been changed
+ * @pin: registered pin pointer
+ *
+ * Context: acquires and holds a dpll_lock.
+ * Return: 0 if succeeds, error code otherwise.
+ */
+int dpll_pin_change_ntf(struct dpll_pin *pin)
+{
+	int ret;
+
+	mutex_lock(&dpll_lock);
+	ret = __dpll_pin_change_ntf(pin);
+	mutex_unlock(&dpll_lock);
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(dpll_pin_change_ntf);
+
+static int
+dpll_pin_freq_set(struct dpll_pin *pin, struct nlattr *a,
+		  struct netlink_ext_ack *extack)
+{
+	u64 freq = nla_get_u64(a);
+	struct dpll_pin_ref *ref;
+	unsigned long i;
+	int ret;
+
+	if (!dpll_pin_is_freq_supported(pin, freq)) {
+		NL_SET_ERR_MSG_ATTR(extack, a, "frequency is not supported by the device");
+		return -EINVAL;
+	}
+
+	xa_for_each(&pin->dpll_refs, i, ref) {
+		const struct dpll_pin_ops *ops = dpll_pin_ops(ref);
+		struct dpll_device *dpll = ref->dpll;
+
+		if (!ops->frequency_set)
+			return -EOPNOTSUPP;
+		ret = ops->frequency_set(pin, dpll_pin_on_dpll_priv(dpll, pin),
+					 dpll, dpll_priv(dpll), freq, extack);
+		if (ret)
+			return ret;
+	}
+	__dpll_pin_change_ntf(pin);
+
+	return 0;
+}
+
+static int
+dpll_pin_on_pin_state_set(struct dpll_pin *pin, u32 parent_idx,
+			  enum dpll_pin_state state,
+			  struct netlink_ext_ack *extack)
+{
+	struct dpll_pin_ref *parent_ref;
+	const struct dpll_pin_ops *ops;
+	struct dpll_pin_ref *dpll_ref;
+	void *pin_priv, *parent_priv;
+	struct dpll_pin *parent;
+	unsigned long i;
+	int ret;
+
+	if (!(DPLL_PIN_CAPABILITIES_STATE_CAN_CHANGE &
+	      pin->prop->capabilities)) {
+		NL_SET_ERR_MSG(extack, "state changing is not allowed");
+		return -EOPNOTSUPP;
+	}
+	parent = xa_load(&dpll_pin_xa, parent_idx);
+	if (!parent)
+		return -EINVAL;
+	parent_ref = xa_load(&pin->parent_refs, parent->pin_idx);
+	if (!parent_ref)
+		return -EINVAL;
+	xa_for_each(&parent->dpll_refs, i, dpll_ref) {
+		ops = dpll_pin_ops(parent_ref);
+		if (!ops->state_on_pin_set)
+			return -EOPNOTSUPP;
+		pin_priv = dpll_pin_on_pin_priv(parent, pin);
+		parent_priv = dpll_pin_on_dpll_priv(dpll_ref->dpll, parent);
+		ret = ops->state_on_pin_set(pin, pin_priv, parent, parent_priv,
+					    state, extack);
+		if (ret)
+			return ret;
+	}
+	__dpll_pin_change_ntf(pin);
+
+	return 0;
+}
+
+static int
+dpll_pin_state_set(struct dpll_device *dpll, struct dpll_pin *pin,
+		   enum dpll_pin_state state,
+		   struct netlink_ext_ack *extack)
+{
+	const struct dpll_pin_ops *ops;
+	struct dpll_pin_ref *ref;
+	int ret;
+
+	if (!(DPLL_PIN_CAPABILITIES_STATE_CAN_CHANGE &
+	      pin->prop->capabilities)) {
+		NL_SET_ERR_MSG(extack, "state changing is not allowed");
+		return -EOPNOTSUPP;
+	}
+	ref = xa_load(&pin->dpll_refs, dpll->id);
+	ASSERT_NOT_NULL(ref);
+	ops = dpll_pin_ops(ref);
+	if (!ops->state_on_dpll_set)
+		return -EOPNOTSUPP;
+	ret = ops->state_on_dpll_set(pin, dpll_pin_on_dpll_priv(dpll, pin),
+				     dpll, dpll_priv(dpll), state, extack);
+	if (ret)
+		return ret;
+	__dpll_pin_change_ntf(pin);
+
+	return 0;
+}
+
+static int
+dpll_pin_prio_set(struct dpll_device *dpll, struct dpll_pin *pin,
+		  u32 prio, struct netlink_ext_ack *extack)
+{
+	const struct dpll_pin_ops *ops;
+	struct dpll_pin_ref *ref;
+	int ret;
+
+	if (!(DPLL_PIN_CAPABILITIES_PRIORITY_CAN_CHANGE &
+	      pin->prop->capabilities)) {
+		NL_SET_ERR_MSG(extack, "prio changing is not allowed");
+		return -EOPNOTSUPP;
+	}
+	ref = xa_load(&pin->dpll_refs, dpll->id);
+	ASSERT_NOT_NULL(ref);
+	ops = dpll_pin_ops(ref);
+	if (!ops->prio_set)
+		return -EOPNOTSUPP;
+	ret = ops->prio_set(pin, dpll_pin_on_dpll_priv(dpll, pin), dpll,
+			    dpll_priv(dpll), prio, extack);
+	if (ret)
+		return ret;
+	__dpll_pin_change_ntf(pin);
+
+	return 0;
+}
+
+static int
+dpll_pin_direction_set(struct dpll_pin *pin, struct dpll_device *dpll,
+		       enum dpll_pin_direction direction,
+		       struct netlink_ext_ack *extack)
+{
+	const struct dpll_pin_ops *ops;
+	struct dpll_pin_ref *ref;
+	int ret;
+
+	if (!(DPLL_PIN_CAPABILITIES_DIRECTION_CAN_CHANGE &
+	      pin->prop->capabilities)) {
+		NL_SET_ERR_MSG(extack, "direction changing is not allowed");
+		return -EOPNOTSUPP;
+	}
+	ref = xa_load(&pin->dpll_refs, dpll->id);
+	ASSERT_NOT_NULL(ref);
+	ops = dpll_pin_ops(ref);
+	if (!ops->direction_set)
+		return -EOPNOTSUPP;
+	ret = ops->direction_set(pin, dpll_pin_on_dpll_priv(dpll, pin),
+				 dpll, dpll_priv(dpll), direction, extack);
+	if (ret)
+		return ret;
+	__dpll_pin_change_ntf(pin);
+
+	return 0;
+}
+
+static int
+dpll_pin_parent_device_set(struct dpll_pin *pin, struct nlattr *parent_nest,
+			   struct netlink_ext_ack *extack)
+{
+	struct nlattr *tb[DPLL_A_PIN_MAX + 1];
+	enum dpll_pin_direction direction;
+	enum dpll_pin_state state;
+	struct dpll_pin_ref *ref;
+	struct dpll_device *dpll;
+	u32 pdpll_idx, prio;
+	int ret;
+
+	nla_parse_nested(tb, DPLL_A_PIN_MAX, parent_nest,
+			 dpll_pin_parent_device_nl_policy, extack);
+	if (!tb[DPLL_A_PIN_PARENT_ID]) {
+		NL_SET_ERR_MSG(extack, "device parent id expected");
+		return -EINVAL;
+	}
+	pdpll_idx = nla_get_u32(tb[DPLL_A_PIN_PARENT_ID]);
+	dpll = xa_load(&dpll_device_xa, pdpll_idx);
+	if (!dpll) {
+		NL_SET_ERR_MSG(extack, "parent device not found");
+		return -EINVAL;
+	}
+	ref = xa_load(&pin->dpll_refs, dpll->id);
+	if (!ref) {
+		NL_SET_ERR_MSG(extack, "pin not connected to given parent device");
+		return -EINVAL;
+	}
+	if (tb[DPLL_A_PIN_STATE]) {
+		state = nla_get_u32(tb[DPLL_A_PIN_STATE]);
+		ret = dpll_pin_state_set(dpll, pin, state, extack);
+		if (ret)
+			return ret;
+	}
+	if (tb[DPLL_A_PIN_PRIO]) {
+		prio = nla_get_u32(tb[DPLL_A_PIN_PRIO]);
+		ret = dpll_pin_prio_set(dpll, pin, prio, extack);
+		if (ret)
+			return ret;
+	}
+	if (tb[DPLL_A_PIN_DIRECTION]) {
+		direction = nla_get_u32(tb[DPLL_A_PIN_DIRECTION]);
+		ret = dpll_pin_direction_set(pin, dpll, direction, extack);
+		if (ret)
+			return ret;
+	}
+	return 0;
+}
+
+static int
+dpll_pin_parent_pin_set(struct dpll_pin *pin, struct nlattr *parent_nest,
+			struct netlink_ext_ack *extack)
+{
+	struct nlattr *tb[DPLL_A_PIN_MAX + 1];
+	enum dpll_pin_state state;
+	u32 ppin_idx;
+	int ret;
+
+	nla_parse_nested(tb, DPLL_A_PIN_MAX, parent_nest,
+			 dpll_pin_parent_pin_nl_policy, extack);
+	if (!tb[DPLL_A_PIN_PARENT_ID]) {
+		NL_SET_ERR_MSG(extack, "device parent id expected");
+		return -EINVAL;
+	}
+	ppin_idx = nla_get_u32(tb[DPLL_A_PIN_PARENT_ID]);
+	state = nla_get_u32(tb[DPLL_A_PIN_STATE]);
+	ret = dpll_pin_on_pin_state_set(pin, ppin_idx, state, extack);
+	if (ret)
+		return ret;
+
+	return 0;
+}
+
+static int
+dpll_pin_set_from_nlattr(struct dpll_pin *pin, struct genl_info *info)
+{
+	struct nlattr *a;
+	int rem, ret;
+
+	nla_for_each_attr(a, genlmsg_data(info->genlhdr),
+			  genlmsg_len(info->genlhdr), rem) {
+		switch (nla_type(a)) {
+		case DPLL_A_PIN_FREQUENCY:
+			ret = dpll_pin_freq_set(pin, a, info->extack);
+			if (ret)
+				return ret;
+			break;
+		case DPLL_A_PIN_PARENT_DEVICE:
+			ret = dpll_pin_parent_device_set(pin, a, info->extack);
+			if (ret)
+				return ret;
+			break;
+		case DPLL_A_PIN_PARENT_PIN:
+			ret = dpll_pin_parent_pin_set(pin, a, info->extack);
+			if (ret)
+				return ret;
+			break;
+		}
+	}
+
+	return 0;
+}
+
+static struct dpll_pin *
+dpll_pin_find(u64 clock_id, struct nlattr *mod_name_attr,
+	      enum dpll_pin_type type, struct nlattr *board_label,
+	      struct nlattr *panel_label, struct nlattr *package_label,
+	      struct netlink_ext_ack *extack)
+{
+	bool board_match, panel_match, package_match;
+	struct dpll_pin *pin_match = NULL, *pin;
+	const struct dpll_pin_properties *prop;
+	bool cid_match, mod_match, type_match;
+	unsigned long i;
+
+	xa_for_each_marked(&dpll_pin_xa, i, pin, DPLL_REGISTERED) {
+		prop = pin->prop;
+		cid_match = clock_id ? pin->clock_id == clock_id : true;
+		mod_match = mod_name_attr && module_name(pin->module) ?
+			!nla_strcmp(mod_name_attr,
+				    module_name(pin->module)) : true;
+		type_match = type ? prop->type == type : true;
+		board_match = board_label ? (prop->board_label ?
+			!nla_strcmp(board_label, prop->board_label) : false) :
+			true;
+		panel_match = panel_label ? (prop->panel_label ?
+			!nla_strcmp(panel_label, prop->panel_label) : false) :
+			true;
+		package_match = package_label ? (prop->package_label ?
+			!nla_strcmp(package_label, prop->package_label) :
+			false) : true;
+		if (cid_match && mod_match && type_match && board_match &&
+		    panel_match && package_match) {
+			if (pin_match) {
+				NL_SET_ERR_MSG(extack, "multiple matches");
+				return ERR_PTR(-EINVAL);
+			}
+			pin_match = pin;
+		};
+	}
+	if (!pin_match) {
+		NL_SET_ERR_MSG(extack, "not found");
+		return ERR_PTR(-ENODEV);
+	}
+	return pin_match;
+}
+
+static struct dpll_pin *dpll_pin_find_from_nlattr(struct genl_info *info)
+{
+	struct nlattr *attr, *mod_name_attr = NULL, *board_label_attr = NULL,
+		*panel_label_attr = NULL, *package_label_attr = NULL;
+	enum dpll_pin_type type = 0;
+	u64 clock_id = 0;
+	int rem = 0;
+
+	nla_for_each_attr(attr, genlmsg_data(info->genlhdr),
+			  genlmsg_len(info->genlhdr), rem) {
+		switch (nla_type(attr)) {
+		case DPLL_A_PIN_CLOCK_ID:
+			if (clock_id)
+				goto duplicated_attr;
+			clock_id = nla_get_u64(attr);
+			break;
+		case DPLL_A_PIN_MODULE_NAME:
+			if (mod_name_attr)
+				goto duplicated_attr;
+			mod_name_attr = attr;
+			break;
+		case DPLL_A_PIN_TYPE:
+			if (type)
+				goto duplicated_attr;
+			type = nla_get_u32(attr);
+		break;
+		case DPLL_A_PIN_BOARD_LABEL:
+			if (board_label_attr)
+				goto duplicated_attr;
+			board_label_attr = attr;
+		break;
+		case DPLL_A_PIN_PANEL_LABEL:
+			if (panel_label_attr)
+				goto duplicated_attr;
+			panel_label_attr = attr;
+		break;
+		case DPLL_A_PIN_PACKAGE_LABEL:
+			if (package_label_attr)
+				goto duplicated_attr;
+			package_label_attr = attr;
+		break;
+		default:
+			break;
+		}
+	}
+	if (!(clock_id  || mod_name_attr || board_label_attr ||
+	      panel_label_attr || package_label_attr)) {
+		NL_SET_ERR_MSG(info->extack, "missing attributes");
+		return ERR_PTR(-EINVAL);
+	}
+	return dpll_pin_find(clock_id, mod_name_attr, type, board_label_attr,
+			     panel_label_attr, package_label_attr,
+			     info->extack);
+duplicated_attr:
+	NL_SET_ERR_MSG(info->extack, "duplicated attribute");
+	return ERR_PTR(-EINVAL);
+}
+
+int dpll_nl_pin_id_get_doit(struct sk_buff *skb, struct genl_info *info)
+{
+	struct dpll_pin *pin;
+	struct sk_buff *msg;
+	struct nlattr *hdr;
+	int ret;
+
+	msg = genlmsg_new(NLMSG_GOODSIZE, GFP_KERNEL);
+	if (!msg)
+		return -ENOMEM;
+	hdr = genlmsg_put_reply(msg, info, &dpll_nl_family, 0,
+				DPLL_CMD_PIN_ID_GET);
+	if (!hdr)
+		return -EMSGSIZE;
+
+	pin = dpll_pin_find_from_nlattr(info);
+	if (!IS_ERR(pin)) {
+		ret = dpll_msg_add_pin_handle(msg, pin);
+		if (ret) {
+			nlmsg_free(msg);
+			return ret;
+		}
+	}
+	genlmsg_end(msg, hdr);
+
+	return genlmsg_reply(msg, info);
+}
+
+int dpll_nl_pin_get_doit(struct sk_buff *skb, struct genl_info *info)
+{
+	struct dpll_pin *pin = info->user_ptr[0];
+	struct sk_buff *msg;
+	struct nlattr *hdr;
+	int ret;
+
+	if (!pin)
+		return -ENODEV;
+	msg = genlmsg_new(NLMSG_GOODSIZE, GFP_KERNEL);
+	if (!msg)
+		return -ENOMEM;
+	hdr = genlmsg_put_reply(msg, info, &dpll_nl_family, 0,
+				DPLL_CMD_PIN_GET);
+	if (!hdr)
+		return -EMSGSIZE;
+	ret = dpll_cmd_pin_get_one(msg, pin, info->extack);
+	if (ret) {
+		nlmsg_free(msg);
+		return ret;
+	}
+	genlmsg_end(msg, hdr);
+
+	return genlmsg_reply(msg, info);
+}
+
+int dpll_nl_pin_get_dumpit(struct sk_buff *skb, struct netlink_callback *cb)
+{
+	struct dpll_dump_ctx *ctx = dpll_dump_context(cb);
+	struct dpll_pin *pin;
+	struct nlattr *hdr;
+	unsigned long i;
+	int ret = 0;
+
+	xa_for_each_marked_start(&dpll_pin_xa, i, pin, DPLL_REGISTERED,
+				 ctx->idx) {
+		hdr = genlmsg_put(skb, NETLINK_CB(cb->skb).portid,
+				  cb->nlh->nlmsg_seq,
+				  &dpll_nl_family, NLM_F_MULTI,
+				  DPLL_CMD_PIN_GET);
+		if (!hdr) {
+			ret = -EMSGSIZE;
+			break;
+		}
+		ret = dpll_cmd_pin_get_one(skb, pin, cb->extack);
+		if (ret) {
+			genlmsg_cancel(skb, hdr);
+			break;
+		}
+		genlmsg_end(skb, hdr);
+	}
+	if (ret == -EMSGSIZE) {
+		ctx->idx = i;
+		return skb->len;
+	}
+	return ret;
+}
+
+int dpll_nl_pin_set_doit(struct sk_buff *skb, struct genl_info *info)
+{
+	struct dpll_pin *pin = info->user_ptr[0];
+
+	return dpll_pin_set_from_nlattr(pin, info);
+}
+
+static struct dpll_device *
+dpll_device_find(u64 clock_id, struct nlattr *mod_name_attr,
+		 enum dpll_type type, struct netlink_ext_ack *extack)
+{
+	struct dpll_device *dpll_match = NULL, *dpll;
+	bool cid_match, mod_match, type_match;
+	unsigned long i;
+
+	xa_for_each_marked(&dpll_device_xa, i, dpll, DPLL_REGISTERED) {
+		cid_match = clock_id ? dpll->clock_id == clock_id : true;
+		mod_match = mod_name_attr ? (module_name(dpll->module) ?
+			!nla_strcmp(mod_name_attr,
+				    module_name(dpll->module)) : false) : true;
+		type_match = type ? dpll->type == type : true;
+		if (cid_match && mod_match && type_match) {
+			if (dpll_match) {
+				NL_SET_ERR_MSG(extack, "multiple matches");
+				return ERR_PTR(-EINVAL);
+			}
+			dpll_match = dpll;
+		}
+	}
+	if (!dpll_match) {
+		NL_SET_ERR_MSG(extack, "not found");
+		return ERR_PTR(-ENODEV);
+	}
+
+	return dpll_match;
+}
+
+static struct dpll_device *
+dpll_device_find_from_nlattr(struct genl_info *info)
+{
+	struct nlattr *attr, *mod_name_attr = NULL;
+	enum dpll_type type = 0;
+	u64 clock_id = 0;
+	int rem = 0;
+
+	nla_for_each_attr(attr, genlmsg_data(info->genlhdr),
+			  genlmsg_len(info->genlhdr), rem) {
+		switch (nla_type(attr)) {
+		case DPLL_A_CLOCK_ID:
+			if (clock_id)
+				goto duplicated_attr;
+			clock_id = nla_get_u64(attr);
+			break;
+		case DPLL_A_MODULE_NAME:
+			if (mod_name_attr)
+				goto duplicated_attr;
+			mod_name_attr = attr;
+			break;
+		case DPLL_A_TYPE:
+			if (type)
+				goto duplicated_attr;
+			type = nla_get_u32(attr);
+			break;
+		default:
+			break;
+		}
+	}
+	if (!clock_id && !mod_name_attr && !type) {
+		NL_SET_ERR_MSG(info->extack, "missing attributes");
+		return ERR_PTR(-EINVAL);
+	}
+	return dpll_device_find(clock_id, mod_name_attr, type, info->extack);
+duplicated_attr:
+	NL_SET_ERR_MSG(info->extack, "duplicated attribute");
+	return ERR_PTR(-EINVAL);
+}
+
+int dpll_nl_device_id_get_doit(struct sk_buff *skb, struct genl_info *info)
+{
+	struct dpll_device *dpll;
+	struct sk_buff *msg;
+	struct nlattr *hdr;
+	int ret;
+
+	msg = genlmsg_new(NLMSG_GOODSIZE, GFP_KERNEL);
+	if (!msg)
+		return -ENOMEM;
+	hdr = genlmsg_put_reply(msg, info, &dpll_nl_family, 0,
+				DPLL_CMD_DEVICE_ID_GET);
+	if (!hdr)
+		return -EMSGSIZE;
+
+	dpll = dpll_device_find_from_nlattr(info);
+	if (!IS_ERR(dpll)) {
+		ret = dpll_msg_add_dev_handle(msg, dpll);
+		if (ret) {
+			nlmsg_free(msg);
+			return ret;
+		}
+	}
+	genlmsg_end(msg, hdr);
+
+	return genlmsg_reply(msg, info);
+}
+
+int dpll_nl_device_get_doit(struct sk_buff *skb, struct genl_info *info)
+{
+	struct dpll_device *dpll = info->user_ptr[0];
+	struct sk_buff *msg;
+	struct nlattr *hdr;
+	int ret;
+
+	msg = genlmsg_new(NLMSG_GOODSIZE, GFP_KERNEL);
+	if (!msg)
+		return -ENOMEM;
+	hdr = genlmsg_put_reply(msg, info, &dpll_nl_family, 0,
+				DPLL_CMD_DEVICE_GET);
+	if (!hdr)
+		return -EMSGSIZE;
+
+	ret = dpll_device_get_one(dpll, msg, info->extack);
+	if (ret) {
+		nlmsg_free(msg);
+		return ret;
+	}
+	genlmsg_end(msg, hdr);
+
+	return genlmsg_reply(msg, info);
+}
+
+int dpll_nl_device_set_doit(struct sk_buff *skb, struct genl_info *info)
+{
+	/* placeholder for set command */
+	return 0;
+}
+
+int dpll_nl_device_get_dumpit(struct sk_buff *skb, struct netlink_callback *cb)
+{
+	struct dpll_dump_ctx *ctx = dpll_dump_context(cb);
+	struct dpll_device *dpll;
+	struct nlattr *hdr;
+	unsigned long i;
+	int ret = 0;
+
+	xa_for_each_marked_start(&dpll_device_xa, i, dpll, DPLL_REGISTERED,
+				 ctx->idx) {
+		hdr = genlmsg_put(skb, NETLINK_CB(cb->skb).portid,
+				  cb->nlh->nlmsg_seq, &dpll_nl_family,
+				  NLM_F_MULTI, DPLL_CMD_DEVICE_GET);
+		if (!hdr) {
+			ret = -EMSGSIZE;
+			break;
+		}
+		ret = dpll_device_get_one(dpll, skb, cb->extack);
+		if (ret) {
+			genlmsg_cancel(skb, hdr);
+			break;
+		}
+		genlmsg_end(skb, hdr);
+	}
+	if (ret == -EMSGSIZE) {
+		ctx->idx = i;
+		return skb->len;
+	}
+	return ret;
+}
+
+int dpll_pre_doit(const struct genl_split_ops *ops, struct sk_buff *skb,
+		  struct genl_info *info)
+{
+	u32 id;
+
+	if (GENL_REQ_ATTR_CHECK(info, DPLL_A_ID))
+		return -EINVAL;
+
+	mutex_lock(&dpll_lock);
+	id = nla_get_u32(info->attrs[DPLL_A_ID]);
+	info->user_ptr[0] = dpll_device_get_by_id(id);
+	if (!info->user_ptr[0]) {
+		NL_SET_ERR_MSG(info->extack, "device not found");
+		goto unlock;
+	}
+	return 0;
+unlock:
+	mutex_unlock(&dpll_lock);
+	return -ENODEV;
+}
+
+void dpll_post_doit(const struct genl_split_ops *ops, struct sk_buff *skb,
+		    struct genl_info *info)
+{
+	mutex_unlock(&dpll_lock);
+}
+
+int
+dpll_lock_doit(const struct genl_split_ops *ops, struct sk_buff *skb,
+	       struct genl_info *info)
+{
+	mutex_lock(&dpll_lock);
+
+	return 0;
+}
+
+void
+dpll_unlock_doit(const struct genl_split_ops *ops, struct sk_buff *skb,
+		 struct genl_info *info)
+{
+	mutex_unlock(&dpll_lock);
+}
+
+int dpll_lock_dumpit(struct netlink_callback *cb)
+{
+	mutex_lock(&dpll_lock);
+
+	return 0;
+}
+
+int dpll_unlock_dumpit(struct netlink_callback *cb)
+{
+	mutex_unlock(&dpll_lock);
+
+	return 0;
+}
+
+int dpll_pin_pre_doit(const struct genl_split_ops *ops, struct sk_buff *skb,
+		      struct genl_info *info)
+{
+	int ret;
+
+	mutex_lock(&dpll_lock);
+	if (GENL_REQ_ATTR_CHECK(info, DPLL_A_PIN_ID)) {
+		ret = -EINVAL;
+		goto unlock_dev;
+	}
+	info->user_ptr[0] = xa_load(&dpll_pin_xa,
+				    nla_get_u32(info->attrs[DPLL_A_PIN_ID]));
+	if (!info->user_ptr[0]) {
+		NL_SET_ERR_MSG(info->extack, "pin not found");
+		ret = -ENODEV;
+		goto unlock_dev;
+	}
+
+	return 0;
+
+unlock_dev:
+	mutex_unlock(&dpll_lock);
+	return ret;
+}
+
+void dpll_pin_post_doit(const struct genl_split_ops *ops, struct sk_buff *skb,
+			struct genl_info *info)
+{
+	mutex_unlock(&dpll_lock);
+}
diff --git a/drivers/dpll/dpll_netlink.h b/drivers/dpll/dpll_netlink.h
new file mode 100644
index 000000000000..a9cfd55f57fc
--- /dev/null
+++ b/drivers/dpll/dpll_netlink.h
@@ -0,0 +1,13 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ *  Copyright (c) 2023 Meta Platforms, Inc. and affiliates
+ *  Copyright (c) 2023 Intel and affiliates
+ */
+
+int dpll_device_create_ntf(struct dpll_device *dpll);
+
+int dpll_device_delete_ntf(struct dpll_device *dpll);
+
+int dpll_pin_create_ntf(struct dpll_pin *pin);
+
+int dpll_pin_delete_ntf(struct dpll_pin *pin);
diff --git a/include/linux/dpll.h b/include/linux/dpll.h
index b47c3560b937..2202310c10cd 100644
--- a/include/linux/dpll.h
+++ b/include/linux/dpll.h
@@ -130,4 +130,8 @@ int dpll_pin_on_pin_register(struct dpll_pin *parent, struct dpll_pin *pin,
 void dpll_pin_on_pin_unregister(struct dpll_pin *parent, struct dpll_pin *pin,
 				const struct dpll_pin_ops *ops, void *priv);
 
+int dpll_device_change_ntf(struct dpll_device *dpll);
+
+int dpll_pin_change_ntf(struct dpll_pin *pin);
+
 #endif
-- 
cgit v1.2.3


From 5f18426928800c59fb0f9bc8fb0c182bb6f5ee24 Mon Sep 17 00:00:00 2001
From: Jiri Pirko <jiri@nvidia.com>
Date: Wed, 13 Sep 2023 21:49:39 +0100
Subject: netdev: expose DPLL pin handle for netdevice

In case netdevice represents a SyncE port, the user needs to understand
the connection between netdevice and associated DPLL pin. There might me
multiple netdevices pointing to the same pin, in case of VF/SF
implementation.

Add a IFLA Netlink attribute to nest the DPLL pin handle, similar to
how it is implemented for devlink port. Add a struct dpll_pin pointer
to netdev and protect access to it by RTNL. Expose netdev_dpll_pin_set()
and netdev_dpll_pin_clear() helpers to the drivers so they can set/clear
the DPLL pin relationship to netdev.

Note that during the lifetime of struct dpll_pin the pin handle does not
change. Therefore it is save to access it lockless. It is drivers
responsibility to call netdev_dpll_pin_clear() before dpll_pin_put().

Signed-off-by: Jiri Pirko <jiri@nvidia.com>
Signed-off-by: Arkadiusz Kubalewski <arkadiusz.kubalewski@intel.com>
Signed-off-by: Vadim Fedorenko <vadim.fedorenko@linux.dev>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/dpll/dpll_netlink.c  | 16 ++++++++++++++--
 include/linux/dpll.h         | 15 +++++++++++++++
 include/linux/netdevice.h    | 21 +++++++++++++++++++++
 include/uapi/linux/if_link.h |  2 +-
 net/core/dev.c               | 22 ++++++++++++++++++++++
 net/core/rtnetlink.c         | 36 ++++++++++++++++++++++++++++++++++++
 6 files changed, 109 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/dpll/dpll_netlink.c b/drivers/dpll/dpll_netlink.c
index 9464a6865977..764437a0661b 100644
--- a/drivers/dpll/dpll_netlink.c
+++ b/drivers/dpll/dpll_netlink.c
@@ -47,6 +47,18 @@ dpll_msg_add_dev_parent_handle(struct sk_buff *msg, u32 id)
 	return 0;
 }
 
+/**
+ * dpll_msg_pin_handle_size - get size of pin handle attribute for given pin
+ * @pin: pin pointer
+ *
+ * Return: byte size of pin handle attribute for given pin.
+ */
+size_t dpll_msg_pin_handle_size(struct dpll_pin *pin)
+{
+	return pin ? nla_total_size(4) : 0; /* DPLL_A_PIN_ID */
+}
+EXPORT_SYMBOL_GPL(dpll_msg_pin_handle_size);
+
 /**
  * dpll_msg_add_pin_handle - attach pin handle attribute to a given message
  * @msg: pointer to sk_buff message to attach a pin handle
@@ -56,8 +68,7 @@ dpll_msg_add_dev_parent_handle(struct sk_buff *msg, u32 id)
  * * 0 - success
  * * -EMSGSIZE - no space in message to attach pin handle
  */
-static int
-dpll_msg_add_pin_handle(struct sk_buff *msg, struct dpll_pin *pin)
+int dpll_msg_add_pin_handle(struct sk_buff *msg, struct dpll_pin *pin)
 {
 	if (!pin)
 		return 0;
@@ -65,6 +76,7 @@ dpll_msg_add_pin_handle(struct sk_buff *msg, struct dpll_pin *pin)
 		return -EMSGSIZE;
 	return 0;
 }
+EXPORT_SYMBOL_GPL(dpll_msg_add_pin_handle);
 
 static int
 dpll_msg_add_mode(struct sk_buff *msg, struct dpll_device *dpll,
diff --git a/include/linux/dpll.h b/include/linux/dpll.h
index 2202310c10cd..bbc480cd2932 100644
--- a/include/linux/dpll.h
+++ b/include/linux/dpll.h
@@ -101,6 +101,21 @@ struct dpll_pin_properties {
 	struct dpll_pin_frequency *freq_supported;
 };
 
+#if IS_ENABLED(CONFIG_DPLL)
+size_t dpll_msg_pin_handle_size(struct dpll_pin *pin);
+int dpll_msg_add_pin_handle(struct sk_buff *msg, struct dpll_pin *pin);
+#else
+static inline size_t dpll_msg_pin_handle_size(struct dpll_pin *pin)
+{
+	return 0;
+}
+
+static inline int dpll_msg_add_pin_handle(struct sk_buff *msg, struct dpll_pin *pin)
+{
+	return 0;
+}
+#endif
+
 struct dpll_device *
 dpll_device_get(u64 clock_id, u32 dev_driver_id, struct module *module);
 
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 0896aaa91dd7..db3d8429d50d 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -79,6 +79,8 @@ struct xdp_buff;
 struct xdp_frame;
 struct xdp_metadata_ops;
 struct xdp_md;
+/* DPLL specific */
+struct dpll_pin;
 
 typedef u32 xdp_features_t;
 
@@ -2049,6 +2051,9 @@ enum netdev_ml_priv_type {
  *			SET_NETDEV_DEVLINK_PORT macro. This pointer is static
  *			during the time netdevice is registered.
  *
+ *	@dpll_pin: Pointer to the SyncE source pin of a DPLL subsystem,
+ *		   where the clock is recovered.
+ *
  *	FIXME: cleanup struct net_device such that network protocol info
  *	moves out.
  */
@@ -2405,6 +2410,10 @@ struct net_device {
 	struct rtnl_hw_stats64	*offload_xstats_l3;
 
 	struct devlink_port	*devlink_port;
+
+#if IS_ENABLED(CONFIG_DPLL)
+	struct dpll_pin		*dpll_pin;
+#endif
 };
 #define to_net_dev(d) container_of(d, struct net_device, dev)
 
@@ -3940,6 +3949,18 @@ int dev_get_mac_address(struct sockaddr *sa, struct net *net, char *dev_name);
 int dev_get_port_parent_id(struct net_device *dev,
 			   struct netdev_phys_item_id *ppid, bool recurse);
 bool netdev_port_same_parent_id(struct net_device *a, struct net_device *b);
+void netdev_dpll_pin_set(struct net_device *dev, struct dpll_pin *dpll_pin);
+void netdev_dpll_pin_clear(struct net_device *dev);
+
+static inline struct dpll_pin *netdev_dpll_pin(const struct net_device *dev)
+{
+#if IS_ENABLED(CONFIG_DPLL)
+	return dev->dpll_pin;
+#else
+	return NULL;
+#endif
+}
+
 struct sk_buff *validate_xmit_skb_list(struct sk_buff *skb, struct net_device *dev, bool *again);
 struct sk_buff *dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev,
 				    struct netdev_queue *txq, int *ret);
diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h
index ce3117df9cec..fac351a93aed 100644
--- a/include/uapi/linux/if_link.h
+++ b/include/uapi/linux/if_link.h
@@ -376,7 +376,7 @@ enum {
 
 	IFLA_GSO_IPV4_MAX_SIZE,
 	IFLA_GRO_IPV4_MAX_SIZE,
-
+	IFLA_DPLL_PIN,
 	__IFLA_MAX
 };
 
diff --git a/net/core/dev.c b/net/core/dev.c
index ccff2b6ef958..cc03a5758d2d 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -9023,6 +9023,28 @@ bool netdev_port_same_parent_id(struct net_device *a, struct net_device *b)
 }
 EXPORT_SYMBOL(netdev_port_same_parent_id);
 
+static void netdev_dpll_pin_assign(struct net_device *dev, struct dpll_pin *dpll_pin)
+{
+#if IS_ENABLED(CONFIG_DPLL)
+	rtnl_lock();
+	dev->dpll_pin = dpll_pin;
+	rtnl_unlock();
+#endif
+}
+
+void netdev_dpll_pin_set(struct net_device *dev, struct dpll_pin *dpll_pin)
+{
+	WARN_ON(!dpll_pin);
+	netdev_dpll_pin_assign(dev, dpll_pin);
+}
+EXPORT_SYMBOL(netdev_dpll_pin_set);
+
+void netdev_dpll_pin_clear(struct net_device *dev)
+{
+	netdev_dpll_pin_assign(dev, NULL);
+}
+EXPORT_SYMBOL(netdev_dpll_pin_clear);
+
 /**
  *	dev_change_proto_down - set carrier according to proto_down.
  *
diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index 4a2ec33bfb51..7452a6d190c5 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -57,6 +57,7 @@
 #if IS_ENABLED(CONFIG_IPV6)
 #include <net/addrconf.h>
 #endif
+#include <linux/dpll.h>
 
 #include "dev.h"
 
@@ -1055,6 +1056,15 @@ static size_t rtnl_devlink_port_size(const struct net_device *dev)
 	return size;
 }
 
+static size_t rtnl_dpll_pin_size(const struct net_device *dev)
+{
+	size_t size = nla_total_size(0); /* nest IFLA_DPLL_PIN */
+
+	size += dpll_msg_pin_handle_size(netdev_dpll_pin(dev));
+
+	return size;
+}
+
 static noinline size_t if_nlmsg_size(const struct net_device *dev,
 				     u32 ext_filter_mask)
 {
@@ -1111,6 +1121,7 @@ static noinline size_t if_nlmsg_size(const struct net_device *dev,
 	       + rtnl_prop_list_size(dev)
 	       + nla_total_size(MAX_ADDR_LEN) /* IFLA_PERM_ADDRESS */
 	       + rtnl_devlink_port_size(dev)
+	       + rtnl_dpll_pin_size(dev)
 	       + 0;
 }
 
@@ -1774,6 +1785,28 @@ nest_cancel:
 	return ret;
 }
 
+static int rtnl_fill_dpll_pin(struct sk_buff *skb,
+			      const struct net_device *dev)
+{
+	struct nlattr *dpll_pin_nest;
+	int ret;
+
+	dpll_pin_nest = nla_nest_start(skb, IFLA_DPLL_PIN);
+	if (!dpll_pin_nest)
+		return -EMSGSIZE;
+
+	ret = dpll_msg_add_pin_handle(skb, netdev_dpll_pin(dev));
+	if (ret < 0)
+		goto nest_cancel;
+
+	nla_nest_end(skb, dpll_pin_nest);
+	return 0;
+
+nest_cancel:
+	nla_nest_cancel(skb, dpll_pin_nest);
+	return ret;
+}
+
 static int rtnl_fill_ifinfo(struct sk_buff *skb,
 			    struct net_device *dev, struct net *src_net,
 			    int type, u32 pid, u32 seq, u32 change,
@@ -1916,6 +1949,9 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb,
 	if (rtnl_fill_devlink_port(skb, dev))
 		goto nla_put_failure;
 
+	if (rtnl_fill_dpll_pin(skb, dev))
+		goto nla_put_failure;
+
 	nlmsg_end(skb, nlh);
 	return 0;
 
-- 
cgit v1.2.3


From 496fd0a26bbf73b6b12407ee4fbe5ff49d659a6d Mon Sep 17 00:00:00 2001
From: Jiri Pirko <jiri@nvidia.com>
Date: Wed, 13 Sep 2023 21:49:43 +0100
Subject: mlx5: Implement SyncE support using DPLL infrastructure

Implement SyncE support using newly introduced DPLL support.
Make sure that each PFs/VFs/SFs probed with appropriate capability
will spawn a dpll auxiliary device and register appropriate dpll device
and pin instances.

Signed-off-by: Jiri Pirko <jiri@nvidia.com>
Signed-off-by: Arkadiusz Kubalewski <arkadiusz.kubalewski@intel.com>
Signed-off-by: Vadim Fedorenko <vadim.fedorenko@linux.dev>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/mellanox/mlx5/core/Kconfig  |   8 +
 drivers/net/ethernet/mellanox/mlx5/core/Makefile |   3 +
 drivers/net/ethernet/mellanox/mlx5/core/dev.c    |  17 +
 drivers/net/ethernet/mellanox/mlx5/core/dpll.c   | 432 +++++++++++++++++++++++
 include/linux/mlx5/driver.h                      |   2 +
 include/linux/mlx5/mlx5_ifc.h                    |  59 +++-
 6 files changed, 520 insertions(+), 1 deletion(-)
 create mode 100644 drivers/net/ethernet/mellanox/mlx5/core/dpll.c

(limited to 'include/linux')

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/Kconfig b/drivers/net/ethernet/mellanox/mlx5/core/Kconfig
index c4f4de82e29e..685335832a93 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/Kconfig
+++ b/drivers/net/ethernet/mellanox/mlx5/core/Kconfig
@@ -189,3 +189,11 @@ config MLX5_SF_MANAGER
 	port is managed through devlink.  A subfunction supports RDMA, netdevice
 	and vdpa device. It is similar to a SRIOV VF but it doesn't require
 	SRIOV support.
+
+config MLX5_DPLL
+	tristate "Mellanox 5th generation network adapters (ConnectX series) DPLL support"
+	depends on NETDEVICES && ETHERNET && PCI && MLX5_CORE
+	select DPLL
+	help
+	  DPLL support in Mellanox Technologies ConnectX NICs.
+
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/Makefile b/drivers/net/ethernet/mellanox/mlx5/core/Makefile
index 7e94caca4888..c44870b175f9 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/Makefile
+++ b/drivers/net/ethernet/mellanox/mlx5/core/Makefile
@@ -128,3 +128,6 @@ mlx5_core-$(CONFIG_MLX5_SF) += sf/vhca_event.o sf/dev/dev.o sf/dev/driver.o irq_
 # SF manager
 #
 mlx5_core-$(CONFIG_MLX5_SF_MANAGER) += sf/cmd.o sf/hw_table.o sf/devlink.o
+
+obj-$(CONFIG_MLX5_DPLL) += mlx5_dpll.o
+mlx5_dpll-y :=	dpll.o
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/dev.c b/drivers/net/ethernet/mellanox/mlx5/core/dev.c
index 7909f378dc93..1fc03480c2ff 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/dev.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/dev.c
@@ -206,6 +206,19 @@ static bool is_ib_enabled(struct mlx5_core_dev *dev)
 	return err ? false : val.vbool;
 }
 
+static bool is_dpll_supported(struct mlx5_core_dev *dev)
+{
+	if (!IS_ENABLED(CONFIG_MLX5_DPLL))
+		return false;
+
+	if (!MLX5_CAP_MCAM_REG2(dev, synce_registers)) {
+		mlx5_core_warn(dev, "Missing SyncE capability\n");
+		return false;
+	}
+
+	return true;
+}
+
 enum {
 	MLX5_INTERFACE_PROTOCOL_ETH,
 	MLX5_INTERFACE_PROTOCOL_ETH_REP,
@@ -215,6 +228,8 @@ enum {
 	MLX5_INTERFACE_PROTOCOL_MPIB,
 
 	MLX5_INTERFACE_PROTOCOL_VNET,
+
+	MLX5_INTERFACE_PROTOCOL_DPLL,
 };
 
 static const struct mlx5_adev_device {
@@ -237,6 +252,8 @@ static const struct mlx5_adev_device {
 					   .is_supported = &is_ib_rep_supported },
 	[MLX5_INTERFACE_PROTOCOL_MPIB] = { .suffix = "multiport",
 					   .is_supported = &is_mp_supported },
+	[MLX5_INTERFACE_PROTOCOL_DPLL] = { .suffix = "dpll",
+					   .is_supported = &is_dpll_supported },
 };
 
 int mlx5_adev_idx_alloc(void)
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/dpll.c b/drivers/net/ethernet/mellanox/mlx5/core/dpll.c
new file mode 100644
index 000000000000..74f0c7867120
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlx5/core/dpll.c
@@ -0,0 +1,432 @@
+// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB
+/* Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. */
+
+#include <linux/dpll.h>
+#include <linux/mlx5/driver.h>
+
+/* This structure represents a reference to DPLL, one is created
+ * per mdev instance.
+ */
+struct mlx5_dpll {
+	struct dpll_device *dpll;
+	struct dpll_pin *dpll_pin;
+	struct mlx5_core_dev *mdev;
+	struct workqueue_struct *wq;
+	struct delayed_work work;
+	struct {
+		bool valid;
+		enum dpll_lock_status lock_status;
+		enum dpll_pin_state pin_state;
+	} last;
+	struct notifier_block mdev_nb;
+	struct net_device *tracking_netdev;
+};
+
+static int mlx5_dpll_clock_id_get(struct mlx5_core_dev *mdev, u64 *clock_id)
+{
+	u32 out[MLX5_ST_SZ_DW(msecq_reg)] = {};
+	u32 in[MLX5_ST_SZ_DW(msecq_reg)] = {};
+	int err;
+
+	err = mlx5_core_access_reg(mdev, in, sizeof(in), out, sizeof(out),
+				   MLX5_REG_MSECQ, 0, 0);
+	if (err)
+		return err;
+	*clock_id = MLX5_GET64(msecq_reg, out, local_clock_identity);
+	return 0;
+}
+
+static int
+mlx5_dpll_synce_status_get(struct mlx5_core_dev *mdev,
+			   enum mlx5_msees_admin_status *admin_status,
+			   enum mlx5_msees_oper_status *oper_status,
+			   bool *ho_acq)
+{
+	u32 out[MLX5_ST_SZ_DW(msees_reg)] = {};
+	u32 in[MLX5_ST_SZ_DW(msees_reg)] = {};
+	int err;
+
+	err = mlx5_core_access_reg(mdev, in, sizeof(in), out, sizeof(out),
+				   MLX5_REG_MSEES, 0, 0);
+	if (err)
+		return err;
+	if (admin_status)
+		*admin_status = MLX5_GET(msees_reg, out, admin_status);
+	*oper_status = MLX5_GET(msees_reg, out, oper_status);
+	if (ho_acq)
+		*ho_acq = MLX5_GET(msees_reg, out, ho_acq);
+	return 0;
+}
+
+static int
+mlx5_dpll_synce_status_set(struct mlx5_core_dev *mdev,
+			   enum mlx5_msees_admin_status admin_status)
+{
+	u32 out[MLX5_ST_SZ_DW(msees_reg)] = {};
+	u32 in[MLX5_ST_SZ_DW(msees_reg)] = {};
+
+	MLX5_SET(msees_reg, in, field_select,
+		 MLX5_MSEES_FIELD_SELECT_ENABLE |
+		 MLX5_MSEES_FIELD_SELECT_ADMIN_STATUS);
+	MLX5_SET(msees_reg, in, admin_status, admin_status);
+	return mlx5_core_access_reg(mdev, in, sizeof(in), out, sizeof(out),
+				    MLX5_REG_MSEES, 0, 1);
+}
+
+static enum dpll_lock_status
+mlx5_dpll_lock_status_get(enum mlx5_msees_oper_status oper_status, bool ho_acq)
+{
+	switch (oper_status) {
+	case MLX5_MSEES_OPER_STATUS_SELF_TRACK:
+		fallthrough;
+	case MLX5_MSEES_OPER_STATUS_OTHER_TRACK:
+		return ho_acq ? DPLL_LOCK_STATUS_LOCKED_HO_ACQ :
+				DPLL_LOCK_STATUS_LOCKED;
+	case MLX5_MSEES_OPER_STATUS_HOLDOVER:
+		fallthrough;
+	case MLX5_MSEES_OPER_STATUS_FAIL_HOLDOVER:
+		return DPLL_LOCK_STATUS_HOLDOVER;
+	default:
+		return DPLL_LOCK_STATUS_UNLOCKED;
+	}
+}
+
+static enum dpll_pin_state
+mlx5_dpll_pin_state_get(enum mlx5_msees_admin_status admin_status,
+			enum mlx5_msees_oper_status oper_status)
+{
+	return (admin_status == MLX5_MSEES_ADMIN_STATUS_TRACK &&
+		(oper_status == MLX5_MSEES_OPER_STATUS_SELF_TRACK ||
+		 oper_status == MLX5_MSEES_OPER_STATUS_OTHER_TRACK)) ?
+	       DPLL_PIN_STATE_CONNECTED : DPLL_PIN_STATE_DISCONNECTED;
+}
+
+static int mlx5_dpll_device_lock_status_get(const struct dpll_device *dpll,
+					    void *priv,
+					    enum dpll_lock_status *status,
+					    struct netlink_ext_ack *extack)
+{
+	enum mlx5_msees_oper_status oper_status;
+	struct mlx5_dpll *mdpll = priv;
+	bool ho_acq;
+	int err;
+
+	err = mlx5_dpll_synce_status_get(mdpll->mdev, NULL,
+					 &oper_status, &ho_acq);
+	if (err)
+		return err;
+
+	*status = mlx5_dpll_lock_status_get(oper_status, ho_acq);
+	return 0;
+}
+
+static int mlx5_dpll_device_mode_get(const struct dpll_device *dpll,
+				     void *priv,
+				     u32 *mode, struct netlink_ext_ack *extack)
+{
+	*mode = DPLL_MODE_MANUAL;
+	return 0;
+}
+
+static bool mlx5_dpll_device_mode_supported(const struct dpll_device *dpll,
+					    void *priv,
+					    enum dpll_mode mode,
+					    struct netlink_ext_ack *extack)
+{
+	return mode == DPLL_MODE_MANUAL;
+}
+
+static const struct dpll_device_ops mlx5_dpll_device_ops = {
+	.lock_status_get = mlx5_dpll_device_lock_status_get,
+	.mode_get = mlx5_dpll_device_mode_get,
+	.mode_supported = mlx5_dpll_device_mode_supported,
+};
+
+static int mlx5_dpll_pin_direction_get(const struct dpll_pin *pin,
+				       void *pin_priv,
+				       const struct dpll_device *dpll,
+				       void *dpll_priv,
+				       enum dpll_pin_direction *direction,
+				       struct netlink_ext_ack *extack)
+{
+	*direction = DPLL_PIN_DIRECTION_INPUT;
+	return 0;
+}
+
+static int mlx5_dpll_state_on_dpll_get(const struct dpll_pin *pin,
+				       void *pin_priv,
+				       const struct dpll_device *dpll,
+				       void *dpll_priv,
+				       enum dpll_pin_state *state,
+				       struct netlink_ext_ack *extack)
+{
+	enum mlx5_msees_admin_status admin_status;
+	enum mlx5_msees_oper_status oper_status;
+	struct mlx5_dpll *mdpll = pin_priv;
+	int err;
+
+	err = mlx5_dpll_synce_status_get(mdpll->mdev, &admin_status,
+					 &oper_status, NULL);
+	if (err)
+		return err;
+	*state = mlx5_dpll_pin_state_get(admin_status, oper_status);
+	return 0;
+}
+
+static int mlx5_dpll_state_on_dpll_set(const struct dpll_pin *pin,
+				       void *pin_priv,
+				       const struct dpll_device *dpll,
+				       void *dpll_priv,
+				       enum dpll_pin_state state,
+				       struct netlink_ext_ack *extack)
+{
+	struct mlx5_dpll *mdpll = pin_priv;
+
+	return mlx5_dpll_synce_status_set(mdpll->mdev,
+					  state == DPLL_PIN_STATE_CONNECTED ?
+					  MLX5_MSEES_ADMIN_STATUS_TRACK :
+					  MLX5_MSEES_ADMIN_STATUS_FREE_RUNNING);
+}
+
+static const struct dpll_pin_ops mlx5_dpll_pins_ops = {
+	.direction_get = mlx5_dpll_pin_direction_get,
+	.state_on_dpll_get = mlx5_dpll_state_on_dpll_get,
+	.state_on_dpll_set = mlx5_dpll_state_on_dpll_set,
+};
+
+static const struct dpll_pin_properties mlx5_dpll_pin_properties = {
+	.type = DPLL_PIN_TYPE_SYNCE_ETH_PORT,
+	.capabilities = DPLL_PIN_CAPABILITIES_STATE_CAN_CHANGE,
+};
+
+#define MLX5_DPLL_PERIODIC_WORK_INTERVAL 500 /* ms */
+
+static void mlx5_dpll_periodic_work_queue(struct mlx5_dpll *mdpll)
+{
+	queue_delayed_work(mdpll->wq, &mdpll->work,
+			   msecs_to_jiffies(MLX5_DPLL_PERIODIC_WORK_INTERVAL));
+}
+
+static void mlx5_dpll_periodic_work(struct work_struct *work)
+{
+	struct mlx5_dpll *mdpll = container_of(work, struct mlx5_dpll,
+					       work.work);
+	enum mlx5_msees_admin_status admin_status;
+	enum mlx5_msees_oper_status oper_status;
+	enum dpll_lock_status lock_status;
+	enum dpll_pin_state pin_state;
+	bool ho_acq;
+	int err;
+
+	err = mlx5_dpll_synce_status_get(mdpll->mdev, &admin_status,
+					 &oper_status, &ho_acq);
+	if (err)
+		goto err_out;
+	lock_status = mlx5_dpll_lock_status_get(oper_status, ho_acq);
+	pin_state = mlx5_dpll_pin_state_get(admin_status, oper_status);
+
+	if (!mdpll->last.valid)
+		goto invalid_out;
+
+	if (mdpll->last.lock_status != lock_status)
+		dpll_device_change_ntf(mdpll->dpll);
+	if (mdpll->last.pin_state != pin_state)
+		dpll_pin_change_ntf(mdpll->dpll_pin);
+
+invalid_out:
+	mdpll->last.lock_status = lock_status;
+	mdpll->last.pin_state = pin_state;
+	mdpll->last.valid = true;
+err_out:
+	mlx5_dpll_periodic_work_queue(mdpll);
+}
+
+static void mlx5_dpll_netdev_dpll_pin_set(struct mlx5_dpll *mdpll,
+					  struct net_device *netdev)
+{
+	if (mdpll->tracking_netdev)
+		return;
+	netdev_dpll_pin_set(netdev, mdpll->dpll_pin);
+	mdpll->tracking_netdev = netdev;
+}
+
+static void mlx5_dpll_netdev_dpll_pin_clear(struct mlx5_dpll *mdpll)
+{
+	if (!mdpll->tracking_netdev)
+		return;
+	netdev_dpll_pin_clear(mdpll->tracking_netdev);
+	mdpll->tracking_netdev = NULL;
+}
+
+static int mlx5_dpll_mdev_notifier_event(struct notifier_block *nb,
+					 unsigned long event, void *data)
+{
+	struct mlx5_dpll *mdpll = container_of(nb, struct mlx5_dpll, mdev_nb);
+	struct net_device *netdev = data;
+
+	switch (event) {
+	case MLX5_DRIVER_EVENT_UPLINK_NETDEV:
+		if (netdev)
+			mlx5_dpll_netdev_dpll_pin_set(mdpll, netdev);
+		else
+			mlx5_dpll_netdev_dpll_pin_clear(mdpll);
+		break;
+	default:
+		return NOTIFY_DONE;
+	}
+
+	return NOTIFY_OK;
+}
+
+static void mlx5_dpll_mdev_netdev_track(struct mlx5_dpll *mdpll,
+					struct mlx5_core_dev *mdev)
+{
+	mdpll->mdev_nb.notifier_call = mlx5_dpll_mdev_notifier_event;
+	mlx5_blocking_notifier_register(mdev, &mdpll->mdev_nb);
+	mlx5_core_uplink_netdev_event_replay(mdev);
+}
+
+static void mlx5_dpll_mdev_netdev_untrack(struct mlx5_dpll *mdpll,
+					  struct mlx5_core_dev *mdev)
+{
+	mlx5_blocking_notifier_unregister(mdev, &mdpll->mdev_nb);
+	mlx5_dpll_netdev_dpll_pin_clear(mdpll);
+}
+
+static int mlx5_dpll_probe(struct auxiliary_device *adev,
+			   const struct auxiliary_device_id *id)
+{
+	struct mlx5_adev *edev = container_of(adev, struct mlx5_adev, adev);
+	struct mlx5_core_dev *mdev = edev->mdev;
+	struct mlx5_dpll *mdpll;
+	u64 clock_id;
+	int err;
+
+	err = mlx5_dpll_synce_status_set(mdev,
+					 MLX5_MSEES_ADMIN_STATUS_FREE_RUNNING);
+	if (err)
+		return err;
+
+	err = mlx5_dpll_clock_id_get(mdev, &clock_id);
+	if (err)
+		return err;
+
+	mdpll = kzalloc(sizeof(*mdpll), GFP_KERNEL);
+	if (!mdpll)
+		return -ENOMEM;
+	mdpll->mdev = mdev;
+	auxiliary_set_drvdata(adev, mdpll);
+
+	/* Multiple mdev instances might share one DPLL device. */
+	mdpll->dpll = dpll_device_get(clock_id, 0, THIS_MODULE);
+	if (IS_ERR(mdpll->dpll)) {
+		err = PTR_ERR(mdpll->dpll);
+		goto err_free_mdpll;
+	}
+
+	err = dpll_device_register(mdpll->dpll, DPLL_TYPE_EEC,
+				   &mlx5_dpll_device_ops, mdpll);
+	if (err)
+		goto err_put_dpll_device;
+
+	/* Multiple mdev instances might share one DPLL pin. */
+	mdpll->dpll_pin = dpll_pin_get(clock_id, mlx5_get_dev_index(mdev),
+				       THIS_MODULE, &mlx5_dpll_pin_properties);
+	if (IS_ERR(mdpll->dpll_pin)) {
+		err = PTR_ERR(mdpll->dpll_pin);
+		goto err_unregister_dpll_device;
+	}
+
+	err = dpll_pin_register(mdpll->dpll, mdpll->dpll_pin,
+				&mlx5_dpll_pins_ops, mdpll);
+	if (err)
+		goto err_put_dpll_pin;
+
+	mdpll->wq = create_singlethread_workqueue("mlx5_dpll");
+	if (!mdpll->wq) {
+		err = -ENOMEM;
+		goto err_unregister_dpll_pin;
+	}
+
+	mlx5_dpll_mdev_netdev_track(mdpll, mdev);
+
+	INIT_DELAYED_WORK(&mdpll->work, &mlx5_dpll_periodic_work);
+	mlx5_dpll_periodic_work_queue(mdpll);
+
+	return 0;
+
+err_unregister_dpll_pin:
+	dpll_pin_unregister(mdpll->dpll, mdpll->dpll_pin,
+			    &mlx5_dpll_pins_ops, mdpll);
+err_put_dpll_pin:
+	dpll_pin_put(mdpll->dpll_pin);
+err_unregister_dpll_device:
+	dpll_device_unregister(mdpll->dpll, &mlx5_dpll_device_ops, mdpll);
+err_put_dpll_device:
+	dpll_device_put(mdpll->dpll);
+err_free_mdpll:
+	kfree(mdpll);
+	return err;
+}
+
+static void mlx5_dpll_remove(struct auxiliary_device *adev)
+{
+	struct mlx5_dpll *mdpll = auxiliary_get_drvdata(adev);
+	struct mlx5_core_dev *mdev = mdpll->mdev;
+
+	cancel_delayed_work(&mdpll->work);
+	mlx5_dpll_mdev_netdev_untrack(mdpll, mdev);
+	destroy_workqueue(mdpll->wq);
+	dpll_pin_unregister(mdpll->dpll, mdpll->dpll_pin,
+			    &mlx5_dpll_pins_ops, mdpll);
+	dpll_pin_put(mdpll->dpll_pin);
+	dpll_device_unregister(mdpll->dpll, &mlx5_dpll_device_ops, mdpll);
+	dpll_device_put(mdpll->dpll);
+	kfree(mdpll);
+
+	mlx5_dpll_synce_status_set(mdev,
+				   MLX5_MSEES_ADMIN_STATUS_FREE_RUNNING);
+}
+
+static int mlx5_dpll_suspend(struct auxiliary_device *adev, pm_message_t state)
+{
+	return 0;
+}
+
+static int mlx5_dpll_resume(struct auxiliary_device *adev)
+{
+	return 0;
+}
+
+static const struct auxiliary_device_id mlx5_dpll_id_table[] = {
+	{ .name = MLX5_ADEV_NAME ".dpll", },
+	{},
+};
+
+MODULE_DEVICE_TABLE(auxiliary, mlx5_dpll_id_table);
+
+static struct auxiliary_driver mlx5_dpll_driver = {
+	.name = "dpll",
+	.probe = mlx5_dpll_probe,
+	.remove = mlx5_dpll_remove,
+	.suspend = mlx5_dpll_suspend,
+	.resume = mlx5_dpll_resume,
+	.id_table = mlx5_dpll_id_table,
+};
+
+static int __init mlx5_dpll_init(void)
+{
+	return auxiliary_driver_register(&mlx5_dpll_driver);
+}
+
+static void __exit mlx5_dpll_exit(void)
+{
+	auxiliary_driver_unregister(&mlx5_dpll_driver);
+}
+
+module_init(mlx5_dpll_init);
+module_exit(mlx5_dpll_exit);
+
+MODULE_AUTHOR("Jiri Pirko <jiri@nvidia.com>");
+MODULE_DESCRIPTION("Mellanox 5th generation network adapters (ConnectX series) DPLL driver");
+MODULE_LICENSE("Dual BSD/GPL");
diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h
index 3033bbaeac81..92434814c855 100644
--- a/include/linux/mlx5/driver.h
+++ b/include/linux/mlx5/driver.h
@@ -155,6 +155,8 @@ enum {
 	MLX5_REG_MCC		 = 0x9062,
 	MLX5_REG_MCDA		 = 0x9063,
 	MLX5_REG_MCAM		 = 0x907f,
+	MLX5_REG_MSECQ		 = 0x9155,
+	MLX5_REG_MSEES		 = 0x9156,
 	MLX5_REG_MIRC		 = 0x9162,
 	MLX5_REG_SBCAM		 = 0xB01F,
 	MLX5_REG_RESOURCE_DUMP   = 0xC000,
diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h
index fc3db401f8a2..dd8421d021cf 100644
--- a/include/linux/mlx5/mlx5_ifc.h
+++ b/include/linux/mlx5/mlx5_ifc.h
@@ -10176,7 +10176,9 @@ struct mlx5_ifc_mcam_access_reg_bits2 {
 	u8         mirc[0x1];
 	u8         regs_97_to_96[0x2];
 
-	u8         regs_95_to_64[0x20];
+	u8         regs_95_to_87[0x09];
+	u8         synce_registers[0x2];
+	u8         regs_84_to_64[0x15];
 
 	u8         regs_63_to_32[0x20];
 
@@ -12549,4 +12551,59 @@ struct mlx5_ifc_modify_page_track_obj_in_bits {
 	struct mlx5_ifc_page_track_bits obj_context;
 };
 
+struct mlx5_ifc_msecq_reg_bits {
+	u8         reserved_at_0[0x20];
+
+	u8         reserved_at_20[0x12];
+	u8         network_option[0x2];
+	u8         local_ssm_code[0x4];
+	u8         local_enhanced_ssm_code[0x8];
+
+	u8         local_clock_identity[0x40];
+
+	u8         reserved_at_80[0x180];
+};
+
+enum {
+	MLX5_MSEES_FIELD_SELECT_ENABLE			= BIT(0),
+	MLX5_MSEES_FIELD_SELECT_ADMIN_STATUS		= BIT(1),
+	MLX5_MSEES_FIELD_SELECT_ADMIN_FREQ_MEASURE	= BIT(2),
+};
+
+enum mlx5_msees_admin_status {
+	MLX5_MSEES_ADMIN_STATUS_FREE_RUNNING		= 0x0,
+	MLX5_MSEES_ADMIN_STATUS_TRACK			= 0x1,
+};
+
+enum mlx5_msees_oper_status {
+	MLX5_MSEES_OPER_STATUS_FREE_RUNNING		= 0x0,
+	MLX5_MSEES_OPER_STATUS_SELF_TRACK		= 0x1,
+	MLX5_MSEES_OPER_STATUS_OTHER_TRACK		= 0x2,
+	MLX5_MSEES_OPER_STATUS_HOLDOVER			= 0x3,
+	MLX5_MSEES_OPER_STATUS_FAIL_HOLDOVER		= 0x4,
+	MLX5_MSEES_OPER_STATUS_FAIL_FREE_RUNNING	= 0x5,
+};
+
+struct mlx5_ifc_msees_reg_bits {
+	u8         reserved_at_0[0x8];
+	u8         local_port[0x8];
+	u8         pnat[0x2];
+	u8         lp_msb[0x2];
+	u8         reserved_at_14[0xc];
+
+	u8         field_select[0x20];
+
+	u8         admin_status[0x4];
+	u8         oper_status[0x4];
+	u8         ho_acq[0x1];
+	u8         reserved_at_49[0xc];
+	u8         admin_freq_measure[0x1];
+	u8         oper_freq_measure[0x1];
+	u8         failure_reason[0x9];
+
+	u8         frequency_diff[0x20];
+
+	u8         reserved_at_80[0x180];
+};
+
 #endif /* MLX5_IFC_H */
-- 
cgit v1.2.3


From ac5f395685bd16ca30c1c834dcbf8b555605ccae Mon Sep 17 00:00:00 2001
From: Jiri Pirko <jiri@nvidia.com>
Date: Wed, 13 Sep 2023 09:12:40 +0200
Subject: net/mlx5: SF, Implement peer devlink set for SF representor devlink
 port

Benefit from the existence of internal mlx5 notifier and extend it by
event MLX5_DRIVER_EVENT_SF_PEER_DEVLINK. Use this event from SF
auxiliary device probe/remove functions to pass the registered SF
devlink instance to the SF representor.

Process the new event in SF representor code and call
devl_port_fn_devlink_set() to do the assignments. Implement this in work
to avoid possible deadlock when probe/remove function of SF may be
called with devlink instance lock held during devlink reload.

Signed-off-by: Jiri Pirko <jiri@nvidia.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 .../net/ethernet/mellanox/mlx5/core/sf/dev/dev.h   |  6 ++++
 .../ethernet/mellanox/mlx5/core/sf/dev/driver.c    | 26 +++++++++++++++++
 .../net/ethernet/mellanox/mlx5/core/sf/devlink.c   | 34 ++++++++++++++++++++++
 include/linux/mlx5/device.h                        |  1 +
 4 files changed, 67 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/sf/dev/dev.h b/drivers/net/ethernet/mellanox/mlx5/core/sf/dev/dev.h
index 2a66a427ef15..b99131e95e37 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/sf/dev/dev.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/sf/dev/dev.h
@@ -19,6 +19,12 @@ struct mlx5_sf_dev {
 	u16 fn_id;
 };
 
+struct mlx5_sf_peer_devlink_event_ctx {
+	u16 fn_id;
+	struct devlink *devlink;
+	int err;
+};
+
 void mlx5_sf_dev_table_create(struct mlx5_core_dev *dev);
 void mlx5_sf_dev_table_destroy(struct mlx5_core_dev *dev);
 
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/sf/dev/driver.c b/drivers/net/ethernet/mellanox/mlx5/core/sf/dev/driver.c
index 8fe82f1191bb..169c2c68ed5c 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/sf/dev/driver.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/sf/dev/driver.c
@@ -8,6 +8,20 @@
 #include "dev.h"
 #include "devlink.h"
 
+static int mlx5_core_peer_devlink_set(struct mlx5_sf_dev *sf_dev, struct devlink *devlink)
+{
+	struct mlx5_sf_peer_devlink_event_ctx event_ctx = {
+		.fn_id = sf_dev->fn_id,
+		.devlink = devlink,
+	};
+	int ret;
+
+	ret = mlx5_blocking_notifier_call_chain(sf_dev->parent_mdev,
+						MLX5_DRIVER_EVENT_SF_PEER_DEVLINK,
+						&event_ctx);
+	return ret == NOTIFY_OK ? event_ctx.err : 0;
+}
+
 static int mlx5_sf_dev_probe(struct auxiliary_device *adev, const struct auxiliary_device_id *id)
 {
 	struct mlx5_sf_dev *sf_dev = container_of(adev, struct mlx5_sf_dev, adev);
@@ -54,9 +68,21 @@ static int mlx5_sf_dev_probe(struct auxiliary_device *adev, const struct auxilia
 		mlx5_core_warn(mdev, "mlx5_init_one err=%d\n", err);
 		goto init_one_err;
 	}
+
+	err = mlx5_core_peer_devlink_set(sf_dev, devlink);
+	if (err) {
+		mlx5_core_warn(mdev, "mlx5_core_peer_devlink_set err=%d\n", err);
+		goto peer_devlink_set_err;
+	}
+
 	devlink_register(devlink);
 	return 0;
 
+peer_devlink_set_err:
+	if (mlx5_dev_is_lightweight(sf_dev->mdev))
+		mlx5_uninit_one_light(sf_dev->mdev);
+	else
+		mlx5_uninit_one(sf_dev->mdev);
 init_one_err:
 	iounmap(mdev->iseg);
 remap_err:
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/sf/devlink.c b/drivers/net/ethernet/mellanox/mlx5/core/sf/devlink.c
index e34a8f88c518..964a5b1876f3 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/sf/devlink.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/sf/devlink.c
@@ -28,6 +28,7 @@ struct mlx5_sf_table {
 	struct mutex sf_state_lock; /* Serializes sf state among user cmds & vhca event handler. */
 	struct notifier_block esw_nb;
 	struct notifier_block vhca_nb;
+	struct notifier_block mdev_nb;
 };
 
 static struct mlx5_sf *
@@ -511,6 +512,35 @@ static int mlx5_sf_esw_event(struct notifier_block *nb, unsigned long event, voi
 	return 0;
 }
 
+static int mlx5_sf_mdev_event(struct notifier_block *nb, unsigned long event, void *data)
+{
+	struct mlx5_sf_table *table = container_of(nb, struct mlx5_sf_table, mdev_nb);
+	struct mlx5_sf_peer_devlink_event_ctx *event_ctx = data;
+	int ret = NOTIFY_DONE;
+	struct mlx5_sf *sf;
+
+	if (event != MLX5_DRIVER_EVENT_SF_PEER_DEVLINK)
+		return NOTIFY_DONE;
+
+	table = mlx5_sf_table_try_get(table->dev);
+	if (!table)
+		return NOTIFY_DONE;
+
+	mutex_lock(&table->sf_state_lock);
+	sf = mlx5_sf_lookup_by_function_id(table, event_ctx->fn_id);
+	if (!sf)
+		goto out;
+
+	event_ctx->err = devl_port_fn_devlink_set(&sf->dl_port.dl_port,
+						  event_ctx->devlink);
+
+	ret = NOTIFY_OK;
+out:
+	mutex_unlock(&table->sf_state_lock);
+	mlx5_sf_table_put(table);
+	return ret;
+}
+
 static bool mlx5_sf_table_supported(const struct mlx5_core_dev *dev)
 {
 	return dev->priv.eswitch && MLX5_ESWITCH_MANAGER(dev) &&
@@ -544,6 +574,9 @@ int mlx5_sf_table_init(struct mlx5_core_dev *dev)
 	if (err)
 		goto vhca_err;
 
+	table->mdev_nb.notifier_call = mlx5_sf_mdev_event;
+	mlx5_blocking_notifier_register(dev, &table->mdev_nb);
+
 	return 0;
 
 vhca_err:
@@ -562,6 +595,7 @@ void mlx5_sf_table_cleanup(struct mlx5_core_dev *dev)
 	if (!table)
 		return;
 
+	mlx5_blocking_notifier_unregister(dev, &table->mdev_nb);
 	mlx5_vhca_event_notifier_unregister(table->dev, &table->vhca_nb);
 	mlx5_esw_event_notifier_unregister(dev->priv.eswitch, &table->esw_nb);
 	WARN_ON(refcount_read(&table->refcount));
diff --git a/include/linux/mlx5/device.h b/include/linux/mlx5/device.h
index 4d5be378fa8c..8fbe22de16ef 100644
--- a/include/linux/mlx5/device.h
+++ b/include/linux/mlx5/device.h
@@ -366,6 +366,7 @@ enum mlx5_driver_event {
 	MLX5_DRIVER_EVENT_UPLINK_NETDEV,
 	MLX5_DRIVER_EVENT_MACSEC_SA_ADDED,
 	MLX5_DRIVER_EVENT_MACSEC_SA_DELETED,
+	MLX5_DRIVER_EVENT_SF_PEER_DEVLINK,
 };
 
 enum {
-- 
cgit v1.2.3


From 86328b338c3996b814417dd68e3f899a1a649059 Mon Sep 17 00:00:00 2001
From: Hari Bathini <hbathini@linux.ibm.com>
Date: Tue, 12 Sep 2023 13:59:49 +0530
Subject: vmcore: remove dependency with is_kdump_kernel() for exporting vmcore

Currently, is_kdump_kernel() returns true when elfcorehdr_addr is set.
While elfcorehdr_addr is set for kexec based kernel dump mechanism,
alternate dump capturing methods like fadump [1] also set it to export
the vmcore. Since, is_kdump_kernel() is used to restrict resources in
crash dump capture kernel and such restrictions may not be desirable
for fadump, allow is_kdump_kernel() to be defined differently for such
scenarios. With this, is_kdump_kernel() could be false while vmcore is
usable. So, remove unnecessary dependency with is_kdump_kernel(), for
exporting vmcore.

[1] https://docs.kernel.org/powerpc/firmware-assisted-dump.html

Suggested-by: Michael Ellerman <mpe@ellerman.id.au>
Signed-off-by: Hari Bathini <hbathini@linux.ibm.com>
Acked-by: Baoquan He <bhe@redhat.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://msgid.link/20230912082950.856977-1-hbathini@linux.ibm.com
---
 include/linux/crash_dump.h | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/crash_dump.h b/include/linux/crash_dump.h
index 0f3a656293b0..acc55626afdc 100644
--- a/include/linux/crash_dump.h
+++ b/include/linux/crash_dump.h
@@ -50,6 +50,7 @@ void vmcore_cleanup(void);
 #define vmcore_elf64_check_arch(x) (elf_check_arch(x) || vmcore_elf_check_arch_cross(x))
 #endif
 
+#ifndef is_kdump_kernel
 /*
  * is_kdump_kernel() checks whether this kernel is booting after a panic of
  * previous kernel or not. This is determined by checking if previous kernel
@@ -64,6 +65,7 @@ static inline bool is_kdump_kernel(void)
 {
 	return elfcorehdr_addr != ELFCORE_ADDR_MAX;
 }
+#endif
 
 /* is_vmcore_usable() checks if the kernel is booting after a panic and
  * the vmcore region is usable.
@@ -75,7 +77,8 @@ static inline bool is_kdump_kernel(void)
 
 static inline int is_vmcore_usable(void)
 {
-	return is_kdump_kernel() && elfcorehdr_addr != ELFCORE_ADDR_ERR ? 1 : 0;
+	return elfcorehdr_addr != ELFCORE_ADDR_ERR &&
+		elfcorehdr_addr != ELFCORE_ADDR_MAX ? 1 : 0;
 }
 
 /* vmcore_unusable() marks the vmcore as unusable,
@@ -84,8 +87,7 @@ static inline int is_vmcore_usable(void)
 
 static inline void vmcore_unusable(void)
 {
-	if (is_kdump_kernel())
-		elfcorehdr_addr = ELFCORE_ADDR_ERR;
+	elfcorehdr_addr = ELFCORE_ADDR_ERR;
 }
 
 /**
-- 
cgit v1.2.3


From fbaa6a181a4b1886cbf4214abdf9a2df68471510 Mon Sep 17 00:00:00 2001
From: Elliot Berman <quic_eberman@quicinc.com>
Date: Fri, 8 Sep 2023 15:49:15 -0700
Subject: sched/core: Remove ifdeffery for saved_state

In preparation for freezer to also use saved_state, remove the
CONFIG_PREEMPT_RT compilation guard around saved_state.

On the arm64 platform I tested which did not have CONFIG_PREEMPT_RT,
there was no statistically significant deviation by applying this patch.

Test methodology:

perf bench sched message -g 40 -l 40

Signed-off-by: Elliot Berman <quic_eberman@quicinc.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/sched.h | 2 --
 kernel/sched/core.c   | 8 ++------
 2 files changed, 2 insertions(+), 8 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 77f01ac385f7..dc37ae787e33 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -750,10 +750,8 @@ struct task_struct {
 #endif
 	unsigned int			__state;
 
-#ifdef CONFIG_PREEMPT_RT
 	/* saved state for "spinlock sleepers" */
 	unsigned int			saved_state;
-#endif
 
 	/*
 	 * This begins the randomizable portion of task_struct. Only
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index f39482d6a6e6..49541e3c1295 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -2232,23 +2232,20 @@ int __task_state_match(struct task_struct *p, unsigned int state)
 	if (READ_ONCE(p->__state) & state)
 		return 1;
 
-#ifdef CONFIG_PREEMPT_RT
 	if (READ_ONCE(p->saved_state) & state)
 		return -1;
-#endif
+
 	return 0;
 }
 
 static __always_inline
 int task_state_match(struct task_struct *p, unsigned int state)
 {
-#ifdef CONFIG_PREEMPT_RT
 	/*
 	 * Serialize against current_save_and_set_rtlock_wait_state() and
 	 * current_restore_rtlock_saved_state().
 	 */
 	guard(raw_spinlock_irq)(&p->pi_lock);
-#endif
 	return __task_state_match(p, state);
 }
 
@@ -4038,7 +4035,6 @@ bool ttwu_state_match(struct task_struct *p, unsigned int state, int *success)
 
 	*success = !!(match = __task_state_match(p, state));
 
-#ifdef CONFIG_PREEMPT_RT
 	/*
 	 * Saved state preserves the task state across blocking on
 	 * an RT lock.  If the state matches, set p::saved_state to
@@ -4054,7 +4050,7 @@ bool ttwu_state_match(struct task_struct *p, unsigned int state, int *success)
 	 */
 	if (match < 0)
 		p->saved_state = TASK_RUNNING;
-#endif
+
 	return match > 0;
 }
 
-- 
cgit v1.2.3


From f7b5bd725b737de3f2c4a836e07c82ba156d75df Mon Sep 17 00:00:00 2001
From: Shannon Nelson <shannon.nelson@amd.com>
Date: Thu, 14 Sep 2023 15:31:57 -0700
Subject: pds_core: check health in devcmd wait

Similar to what we do in the AdminQ, check for devcmd health
while waiting for an answer.

Signed-off-by: Shannon Nelson <shannon.nelson@amd.com>
Reviewed-by: Brett Creeley <brett.creeley@amd.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/amd/pds_core/dev.c | 11 +++++++++--
 include/linux/pds/pds_core_if.h         |  1 +
 2 files changed, 10 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/ethernet/amd/pds_core/dev.c b/drivers/net/ethernet/amd/pds_core/dev.c
index f77cd9f5a2fd..7c1b965d61a9 100644
--- a/drivers/net/ethernet/amd/pds_core/dev.c
+++ b/drivers/net/ethernet/amd/pds_core/dev.c
@@ -42,6 +42,8 @@ int pdsc_err_to_errno(enum pds_core_status_code code)
 		return -ERANGE;
 	case PDS_RC_BAD_ADDR:
 		return -EFAULT;
+	case PDS_RC_BAD_PCI:
+		return -ENXIO;
 	case PDS_RC_EOPCODE:
 	case PDS_RC_EINTR:
 	case PDS_RC_DEV_CMD:
@@ -62,7 +64,7 @@ bool pdsc_is_fw_running(struct pdsc *pdsc)
 	/* Firmware is useful only if the running bit is set and
 	 * fw_status != 0xff (bad PCI read)
 	 */
-	return (pdsc->fw_status != 0xff) &&
+	return (pdsc->fw_status != PDS_RC_BAD_PCI) &&
 		(pdsc->fw_status & PDS_CORE_FW_STS_F_RUNNING);
 }
 
@@ -128,6 +130,7 @@ static int pdsc_devcmd_wait(struct pdsc *pdsc, u8 opcode, int max_seconds)
 	unsigned long max_wait;
 	unsigned long duration;
 	int timeout = 0;
+	bool running;
 	int done = 0;
 	int err = 0;
 	int status;
@@ -136,6 +139,10 @@ static int pdsc_devcmd_wait(struct pdsc *pdsc, u8 opcode, int max_seconds)
 	max_wait = start_time + (max_seconds * HZ);
 
 	while (!done && !timeout) {
+		running = pdsc_is_fw_running(pdsc);
+		if (!running)
+			break;
+
 		done = pdsc_devcmd_done(pdsc);
 		if (done)
 			break;
@@ -152,7 +159,7 @@ static int pdsc_devcmd_wait(struct pdsc *pdsc, u8 opcode, int max_seconds)
 		dev_dbg(dev, "DEVCMD %d %s after %ld secs\n",
 			opcode, pdsc_devcmd_str(opcode), duration / HZ);
 
-	if (!done || timeout) {
+	if ((!done || timeout) && running) {
 		dev_err(dev, "DEVCMD %d %s timeout, done %d timeout %d max_seconds=%d\n",
 			opcode, pdsc_devcmd_str(opcode), done, timeout,
 			max_seconds);
diff --git a/include/linux/pds/pds_core_if.h b/include/linux/pds/pds_core_if.h
index e838a2b90440..17a87c1a55d7 100644
--- a/include/linux/pds/pds_core_if.h
+++ b/include/linux/pds/pds_core_if.h
@@ -79,6 +79,7 @@ enum pds_core_status_code {
 	PDS_RC_EVFID	= 31,	/* VF ID does not exist */
 	PDS_RC_BAD_FW	= 32,	/* FW file is invalid or corrupted */
 	PDS_RC_ECLIENT	= 33,   /* No such client id */
+	PDS_RC_BAD_PCI	= 255,  /* Broken PCI when reading status */
 };
 
 /**
-- 
cgit v1.2.3


From b0af4bcb49464c221ad5f95d40f2b1b252ceedcc Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Thu, 14 Sep 2023 20:43:18 +0206
Subject: serial: core: Provide port lock wrappers
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

When a serial port is used for kernel console output, then all
modifications to the UART registers which are done from other contexts,
e.g. getty, termios, are interference points for the kernel console.

So far this has been ignored and the printk output is based on the
principle of hope. The rework of the console infrastructure which aims to
support threaded and atomic consoles, requires to mark sections which
modify the UART registers as unsafe. This allows the atomic write function
to make informed decisions and eventually to restore operational state. It
also allows to prevent the regular UART code from modifying UART registers
while printk output is in progress.

All modifications of UART registers are guarded by the UART port lock,
which provides an obvious synchronization point with the console
infrastructure.

Provide wrapper functions for spin_[un]lock*(port->lock) invocations so
that the console mechanics can be applied later on at a single place and
does not require to copy the same logic all over the drivers.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Ilpo Järvinen <ilpo.jarvinen@linux.intel.com>
Signed-off-by: John Ogness <john.ogness@linutronix.de>
Link: https://lore.kernel.org/r/20230914183831.587273-2-john.ogness@linutronix.de
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 include/linux/serial_core.h | 79 +++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 79 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/serial_core.h b/include/linux/serial_core.h
index bb6f073bc159..f1d5c0d1568c 100644
--- a/include/linux/serial_core.h
+++ b/include/linux/serial_core.h
@@ -588,6 +588,85 @@ struct uart_port {
 	void			*private_data;		/* generic platform data pointer */
 };
 
+/**
+ * uart_port_lock - Lock the UART port
+ * @up:		Pointer to UART port structure
+ */
+static inline void uart_port_lock(struct uart_port *up)
+{
+	spin_lock(&up->lock);
+}
+
+/**
+ * uart_port_lock_irq - Lock the UART port and disable interrupts
+ * @up:		Pointer to UART port structure
+ */
+static inline void uart_port_lock_irq(struct uart_port *up)
+{
+	spin_lock_irq(&up->lock);
+}
+
+/**
+ * uart_port_lock_irqsave - Lock the UART port, save and disable interrupts
+ * @up:		Pointer to UART port structure
+ * @flags:	Pointer to interrupt flags storage
+ */
+static inline void uart_port_lock_irqsave(struct uart_port *up, unsigned long *flags)
+{
+	spin_lock_irqsave(&up->lock, *flags);
+}
+
+/**
+ * uart_port_trylock - Try to lock the UART port
+ * @up:		Pointer to UART port structure
+ *
+ * Returns: True if lock was acquired, false otherwise
+ */
+static inline bool uart_port_trylock(struct uart_port *up)
+{
+	return spin_trylock(&up->lock);
+}
+
+/**
+ * uart_port_trylock_irqsave - Try to lock the UART port, save and disable interrupts
+ * @up:		Pointer to UART port structure
+ * @flags:	Pointer to interrupt flags storage
+ *
+ * Returns: True if lock was acquired, false otherwise
+ */
+static inline bool uart_port_trylock_irqsave(struct uart_port *up, unsigned long *flags)
+{
+	return spin_trylock_irqsave(&up->lock, *flags);
+}
+
+/**
+ * uart_port_unlock - Unlock the UART port
+ * @up:		Pointer to UART port structure
+ */
+static inline void uart_port_unlock(struct uart_port *up)
+{
+	spin_unlock(&up->lock);
+}
+
+/**
+ * uart_port_unlock_irq - Unlock the UART port and re-enable interrupts
+ * @up:		Pointer to UART port structure
+ */
+static inline void uart_port_unlock_irq(struct uart_port *up)
+{
+	spin_unlock_irq(&up->lock);
+}
+
+/**
+ * uart_port_lock_irqrestore - Unlock the UART port, restore interrupts
+ * @up:		Pointer to UART port structure
+ * @flags:	The saved interrupt flags for restore
+ */
+static inline void uart_port_unlock_irqrestore(struct uart_port *up, unsigned long flags)
+{
+	spin_unlock_irqrestore(&up->lock, flags);
+}
+
 static inline int serial_port_in(struct uart_port *up, int offset)
 {
 	return up->serial_in(up, offset);
-- 
cgit v1.2.3


From c5cbdb76e8e33ce90fec2946e8eee7d71d68e57a Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Thu, 14 Sep 2023 20:43:19 +0206
Subject: serial: core: Use lock wrappers
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

When a serial port is used for kernel console output, then all
modifications to the UART registers which are done from other contexts,
e.g. getty, termios, are interference points for the kernel console.

So far this has been ignored and the printk output is based on the
principle of hope. The rework of the console infrastructure which aims to
support threaded and atomic consoles, requires to mark sections which
modify the UART registers as unsafe. This allows the atomic write function
to make informed decisions and eventually to restore operational state. It
also allows to prevent the regular UART code from modifying UART registers
while printk output is in progress.

All modifications of UART registers are guarded by the UART port lock,
which provides an obvious synchronization point with the console
infrastructure.

To avoid adding this functionality to all UART drivers, wrap the
spin_[un]lock*() invocations for uart_port::lock into helper functions
which just contain the spin_[un]lock*() invocations for now. In a
subsequent step these helpers will gain the console synchronization
mechanisms.

Converted with coccinelle. No functional change.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Ilpo Järvinen <ilpo.jarvinen@linux.intel.com>
Signed-off-by: John Ogness <john.ogness@linutronix.de>
Link: https://lore.kernel.org/r/20230914183831.587273-3-john.ogness@linutronix.de
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 include/linux/serial_core.h | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/serial_core.h b/include/linux/serial_core.h
index f1d5c0d1568c..3091c62ec37b 100644
--- a/include/linux/serial_core.h
+++ b/include/linux/serial_core.h
@@ -1035,14 +1035,14 @@ static inline void uart_unlock_and_check_sysrq(struct uart_port *port)
 	u8 sysrq_ch;
 
 	if (!port->has_sysrq) {
-		spin_unlock(&port->lock);
+		uart_port_unlock(port);
 		return;
 	}
 
 	sysrq_ch = port->sysrq_ch;
 	port->sysrq_ch = 0;
 
-	spin_unlock(&port->lock);
+	uart_port_unlock(port);
 
 	if (sysrq_ch)
 		handle_sysrq(sysrq_ch);
@@ -1054,14 +1054,14 @@ static inline void uart_unlock_and_check_sysrq_irqrestore(struct uart_port *port
 	u8 sysrq_ch;
 
 	if (!port->has_sysrq) {
-		spin_unlock_irqrestore(&port->lock, flags);
+		uart_port_unlock_irqrestore(port, flags);
 		return;
 	}
 
 	sysrq_ch = port->sysrq_ch;
 	port->sysrq_ch = 0;
 
-	spin_unlock_irqrestore(&port->lock, flags);
+	uart_port_unlock_irqrestore(port, flags);
 
 	if (sysrq_ch)
 		handle_sysrq(sysrq_ch);
@@ -1077,12 +1077,12 @@ static inline int uart_prepare_sysrq_char(struct uart_port *port, u8 ch)
 }
 static inline void uart_unlock_and_check_sysrq(struct uart_port *port)
 {
-	spin_unlock(&port->lock);
+	uart_port_unlock(port);
 }
 static inline void uart_unlock_and_check_sysrq_irqrestore(struct uart_port *port,
 		unsigned long flags)
 {
-	spin_unlock_irqrestore(&port->lock, flags);
+	uart_port_unlock_irqrestore(port, flags);
 }
 #endif	/* CONFIG_MAGIC_SYSRQ_SERIAL */
 
-- 
cgit v1.2.3


From 1cb6422ecac8804ebe0b71f4b3440674955fec73 Mon Sep 17 00:00:00 2001
From: Kees Cook <keescook@chromium.org>
Date: Fri, 15 Sep 2023 13:15:10 -0700
Subject: ceph: Annotate struct ceph_monmap with __counted_by

Prepare for the coming implementation by GCC and Clang of the __counted_by
attribute. Flexible array members annotated with __counted_by can have
their accesses bounds-checked at run-time checking via CONFIG_UBSAN_BOUNDS
(for array indexing) and CONFIG_FORTIFY_SOURCE (for strcpy/memcpy-family
functions).

As found with Coccinelle[1], add __counted_by for struct ceph_monmap.
Additionally, since the element count member must be set before accessing
the annotated flexible array member, move its initialization earlier.

[1] https://github.com/kees/kernel-tools/blob/trunk/coccinelle/examples/counted_by.cocci

Cc: Ilya Dryomov <idryomov@gmail.com>
Cc: Xiubo Li <xiubli@redhat.com>
Cc: Jeff Layton <jlayton@kernel.org>
Cc: "David S. Miller" <davem@davemloft.net>
Cc: Eric Dumazet <edumazet@google.com>
Cc: Jakub Kicinski <kuba@kernel.org>
Cc: Paolo Abeni <pabeni@redhat.com>
Cc: ceph-devel@vger.kernel.org
Cc: netdev@vger.kernel.org
Signed-off-by: Kees Cook <keescook@chromium.org>
Reviewed-by: Gustavo A. R. Silva <gustavoars@kernel.org>
Reviewed-by: Xiubo Li <xiubli@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/ceph/mon_client.h | 2 +-
 net/ceph/mon_client.c           | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/ceph/mon_client.h b/include/linux/ceph/mon_client.h
index b658961156a0..7a9a40163c0f 100644
--- a/include/linux/ceph/mon_client.h
+++ b/include/linux/ceph/mon_client.h
@@ -19,7 +19,7 @@ struct ceph_monmap {
 	struct ceph_fsid fsid;
 	u32 epoch;
 	u32 num_mon;
-	struct ceph_entity_inst mon_inst[];
+	struct ceph_entity_inst mon_inst[] __counted_by(num_mon);
 };
 
 struct ceph_mon_client;
diff --git a/net/ceph/mon_client.c b/net/ceph/mon_client.c
index faabad6603db..f263f7e91a21 100644
--- a/net/ceph/mon_client.c
+++ b/net/ceph/mon_client.c
@@ -1136,6 +1136,7 @@ static int build_initial_monmap(struct ceph_mon_client *monc)
 			       GFP_KERNEL);
 	if (!monc->monmap)
 		return -ENOMEM;
+	monc->monmap->num_mon = num_mon;
 
 	for (i = 0; i < num_mon; i++) {
 		struct ceph_entity_inst *inst = &monc->monmap->mon_inst[i];
@@ -1147,7 +1148,6 @@ static int build_initial_monmap(struct ceph_mon_client *monc)
 		inst->name.type = CEPH_ENTITY_TYPE_MON;
 		inst->name.num = cpu_to_le64(i);
 	}
-	monc->monmap->num_mon = num_mon;
 	return 0;
 }
 
-- 
cgit v1.2.3


From d57125b55a292a8e74a1fb17182576a3b2b2e795 Mon Sep 17 00:00:00 2001
From: Ilya Dryomov <idryomov@gmail.com>
Date: Mon, 18 Sep 2023 10:44:08 +0200
Subject: Revert "ceph: make members in struct ceph_mds_request_args_ext a
 union"

This reverts commit 3af5ae22030cb59fab4fba35f5a2b62f47e14df9.

ceph_mds_request_args_ext was already (and remains to be) a union.  An
additional anonymous union inside is bogus:

    union ceph_mds_request_args_ext {
        union {
            union ceph_mds_request_args old;
            struct { ... } __attribute__ ((packed)) setattr_ext;
        };
    }

Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
Reviewed-by: Xiubo Li <xiubli@redhat.com>
---
 include/linux/ceph/ceph_fs.h | 24 +++++++++++-------------
 1 file changed, 11 insertions(+), 13 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/ceph/ceph_fs.h b/include/linux/ceph/ceph_fs.h
index 5f2301ee88bc..f3b3593254b9 100644
--- a/include/linux/ceph/ceph_fs.h
+++ b/include/linux/ceph/ceph_fs.h
@@ -467,19 +467,17 @@ union ceph_mds_request_args {
 } __attribute__ ((packed));
 
 union ceph_mds_request_args_ext {
-	union {
-		union ceph_mds_request_args old;
-		struct {
-			__le32 mode;
-			__le32 uid;
-			__le32 gid;
-			struct ceph_timespec mtime;
-			struct ceph_timespec atime;
-			__le64 size, old_size;       /* old_size needed by truncate */
-			__le32 mask;                 /* CEPH_SETATTR_* */
-			struct ceph_timespec btime;
-		} __attribute__ ((packed)) setattr_ext;
-	};
+	union ceph_mds_request_args old;
+	struct {
+		__le32 mode;
+		__le32 uid;
+		__le32 gid;
+		struct ceph_timespec mtime;
+		struct ceph_timespec atime;
+		__le64 size, old_size;       /* old_size needed by truncate */
+		__le32 mask;                 /* CEPH_SETATTR_* */
+		struct ceph_timespec btime;
+	} __attribute__ ((packed)) setattr_ext;
 };
 
 #define CEPH_MDS_FLAG_REPLAY		1 /* this is a replayed op */
-- 
cgit v1.2.3


From 8452a05b2c633b708dbe3e742f71b24bf21fe42d Mon Sep 17 00:00:00 2001
From: Rohan G Thomas <rohan.g.thomas@intel.com>
Date: Sat, 16 Sep 2023 14:33:12 +0800
Subject: net: stmmac: Tx coe sw fallback

Add sw fallback of tx checksum calculation for those tx queues that
don't support tx checksum offloading. DW xGMAC IP can be synthesized
such that it can support tx checksum offloading only for a few
initial tx queues. Also as Serge pointed out, for the DW QoS IP, tx
coe can be individually configured for each tx queue.

So when tx coe is enabled, for any tx queue that doesn't support
tx coe with 'coe-unsupported' flag set will have a sw fallback
happen in the driver for tx checksum calculation when any packets to
be transmitted on these tx queues.

Signed-off-by: Rohan G Thomas <rohan.g.thomas@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/stmicro/stmmac/stmmac_main.c     | 10 ++++++++++
 drivers/net/ethernet/stmicro/stmmac/stmmac_platform.c |  3 +++
 include/linux/stmmac.h                                |  1 +
 3 files changed, 14 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
index 2206789802bf..9201ed778ebc 100644
--- a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
+++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
@@ -4401,6 +4401,16 @@ static netdev_tx_t stmmac_xmit(struct sk_buff *skb, struct net_device *dev)
 	WARN_ON(tx_q->tx_skbuff[first_entry]);
 
 	csum_insertion = (skb->ip_summed == CHECKSUM_PARTIAL);
+	/* DWMAC IPs can be synthesized to support tx coe only for a few tx
+	 * queues. In that case, checksum offloading for those queues that don't
+	 * support tx coe needs to fallback to software checksum calculation.
+	 */
+	if (csum_insertion &&
+	    priv->plat->tx_queues_cfg[queue].coe_unsupported) {
+		if (unlikely(skb_checksum_help(skb)))
+			goto dma_map_err;
+		csum_insertion = !csum_insertion;
+	}
 
 	if (likely(priv->extend_desc))
 		desc = (struct dma_desc *)(tx_q->dma_etx + entry);
diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_platform.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_platform.c
index 0f28795e581c..a09014c9e7d0 100644
--- a/drivers/net/ethernet/stmicro/stmmac/stmmac_platform.c
+++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_platform.c
@@ -276,6 +276,9 @@ static int stmmac_mtl_setup(struct platform_device *pdev,
 			plat->tx_queues_cfg[queue].use_prio = true;
 		}
 
+		plat->tx_queues_cfg[queue].coe_unsupported =
+			of_property_read_bool(q_node, "snps,coe-unsupported");
+
 		queue++;
 	}
 	if (queue != plat->tx_queues_to_use) {
diff --git a/include/linux/stmmac.h b/include/linux/stmmac.h
index ce89cc3e4913..c0079a7574ae 100644
--- a/include/linux/stmmac.h
+++ b/include/linux/stmmac.h
@@ -139,6 +139,7 @@ struct stmmac_rxq_cfg {
 
 struct stmmac_txq_cfg {
 	u32 weight;
+	bool coe_unsupported;
 	u8 mode_to_use;
 	/* Credit Base Shaper parameters */
 	u32 send_slope;
-- 
cgit v1.2.3


From 6b93bb41f6eaa1cc5c5f30ec3b687b380f116cd0 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Sat, 16 Sep 2023 21:26:00 +0206
Subject: printk: Add non-BKL (nbcon) console basic infrastructure

The current console/printk subsystem is protected by a Big Kernel Lock,
(aka console_lock) which has ill defined semantics and is more or less
stateless. This puts severe limitations on the console subsystem and
makes forced takeover and output in emergency and panic situations a
fragile endeavour that is based on try and pray.

The goal of non-BKL (nbcon) consoles is to break out of the console lock
jail and to provide a new infrastructure that avoids the pitfalls and
also allows console drivers to be gradually converted over.

The proposed infrastructure aims for the following properties:

  - Per console locking instead of global locking
  - Per console state that allows to make informed decisions
  - Stateful handover and takeover

As a first step, state is added to struct console. The per console state
is an atomic_t using a 32bit bit field.

Reserve state bits, which will be populated later in the series. Wire
it up into the console register/unregister functionality.

It was decided to use a bitfield because using a plain u32 with
mask/shift operations resulted in uncomprehensible code.

Co-developed-by: John Ogness <john.ogness@linutronix.de>
Signed-off-by: John Ogness <john.ogness@linutronix.de>
Signed-off-by: Thomas Gleixner (Intel) <tglx@linutronix.de>
Reviewed-by: Petr Mladek <pmladek@suse.com>
Signed-off-by: Petr Mladek <pmladek@suse.com>
Link: https://lore.kernel.org/r/20230916192007.608398-2-john.ogness@linutronix.de
---
 include/linux/console.h  | 31 +++++++++++++++++++++
 kernel/printk/Makefile   |  2 +-
 kernel/printk/internal.h |  8 ++++++
 kernel/printk/nbcon.c    | 70 ++++++++++++++++++++++++++++++++++++++++++++++++
 kernel/printk/printk.c   | 13 ++++++---
 5 files changed, 120 insertions(+), 4 deletions(-)
 create mode 100644 kernel/printk/nbcon.c

(limited to 'include/linux')

diff --git a/include/linux/console.h b/include/linux/console.h
index 7de11c763eb3..a2d37a7a98a8 100644
--- a/include/linux/console.h
+++ b/include/linux/console.h
@@ -156,6 +156,8 @@ static inline int con_debug_leave(void)
  *			/dev/kmesg which requires a larger output buffer.
  * @CON_SUSPENDED:	Indicates if a console is suspended. If true, the
  *			printing callbacks must not be called.
+ * @CON_NBCON:		Console can operate outside of the legacy style console_lock
+ *			constraints.
  */
 enum cons_flags {
 	CON_PRINTBUFFER		= BIT(0),
@@ -166,8 +168,32 @@ enum cons_flags {
 	CON_BRL			= BIT(5),
 	CON_EXTENDED		= BIT(6),
 	CON_SUSPENDED		= BIT(7),
+	CON_NBCON		= BIT(8),
 };
 
+/**
+ * struct nbcon_state - console state for nbcon consoles
+ * @atom:	Compound of the state fields for atomic operations
+ *
+ * To be used for reading and preparing of the value stored in the nbcon
+ * state variable @console::nbcon_state.
+ */
+struct nbcon_state {
+	union {
+		unsigned int	atom;
+		struct {
+		};
+	};
+};
+
+/*
+ * The nbcon_state struct is used to easily create and interpret values that
+ * are stored in the @console::nbcon_state variable. Ensure this struct stays
+ * within the size boundaries of the atomic variable's underlying type in
+ * order to avoid any accidental truncation.
+ */
+static_assert(sizeof(struct nbcon_state) <= sizeof(int));
+
 /**
  * struct console - The console descriptor structure
  * @name:		The name of the console driver
@@ -187,6 +213,8 @@ enum cons_flags {
  * @dropped:		Number of unreported dropped ringbuffer records
  * @data:		Driver private data
  * @node:		hlist node for the console list
+ *
+ * @nbcon_state:	State for nbcon consoles
  */
 struct console {
 	char			name[16];
@@ -206,6 +234,9 @@ struct console {
 	unsigned long		dropped;
 	void			*data;
 	struct hlist_node	node;
+
+	/* nbcon console specific members */
+	atomic_t		__private nbcon_state;
 };
 
 #ifdef CONFIG_LOCKDEP
diff --git a/kernel/printk/Makefile b/kernel/printk/Makefile
index f5b388e810b9..39a2b61c7232 100644
--- a/kernel/printk/Makefile
+++ b/kernel/printk/Makefile
@@ -1,6 +1,6 @@
 # SPDX-License-Identifier: GPL-2.0-only
 obj-y	= printk.o
-obj-$(CONFIG_PRINTK)	+= printk_safe.o
+obj-$(CONFIG_PRINTK)	+= printk_safe.o nbcon.o
 obj-$(CONFIG_A11Y_BRAILLE_CONSOLE)	+= braille.o
 obj-$(CONFIG_PRINTK_INDEX)	+= index.o
 
diff --git a/kernel/printk/internal.h b/kernel/printk/internal.h
index 7d4979d5c3ce..2ca0ab78802c 100644
--- a/kernel/printk/internal.h
+++ b/kernel/printk/internal.h
@@ -3,6 +3,7 @@
  * internal.h - printk internal definitions
  */
 #include <linux/percpu.h>
+#include <linux/console.h>
 
 #if defined(CONFIG_PRINTK) && defined(CONFIG_SYSCTL)
 void __init printk_sysctl_init(void);
@@ -61,6 +62,10 @@ void defer_console_output(void);
 
 u16 printk_parse_prefix(const char *text, int *level,
 			enum printk_info_flags *flags);
+
+void nbcon_init(struct console *con);
+void nbcon_cleanup(struct console *con);
+
 #else
 
 #define PRINTK_PREFIX_MAX	0
@@ -76,6 +81,9 @@ u16 printk_parse_prefix(const char *text, int *level,
 #define printk_safe_exit_irqrestore(flags) local_irq_restore(flags)
 
 static inline bool printk_percpu_data_ready(void) { return false; }
+static inline void nbcon_init(struct console *con) { }
+static inline void nbcon_cleanup(struct console *con) { }
+
 #endif /* CONFIG_PRINTK */
 
 /**
diff --git a/kernel/printk/nbcon.c b/kernel/printk/nbcon.c
new file mode 100644
index 000000000000..63d24ca62ac5
--- /dev/null
+++ b/kernel/printk/nbcon.c
@@ -0,0 +1,70 @@
+// SPDX-License-Identifier: GPL-2.0-only
+// Copyright (C) 2022 Linutronix GmbH, John Ogness
+// Copyright (C) 2022 Intel, Thomas Gleixner
+
+#include <linux/kernel.h>
+#include <linux/console.h>
+#include "internal.h"
+/*
+ * Printk console printing implementation for consoles which does not depend
+ * on the legacy style console_lock mechanism.
+ */
+
+/**
+ * nbcon_state_set - Helper function to set the console state
+ * @con:	Console to update
+ * @new:	The new state to write
+ *
+ * Only to be used when the console is not yet or no longer visible in the
+ * system. Otherwise use nbcon_state_try_cmpxchg().
+ */
+static inline void nbcon_state_set(struct console *con, struct nbcon_state *new)
+{
+	atomic_set(&ACCESS_PRIVATE(con, nbcon_state), new->atom);
+}
+
+/**
+ * nbcon_state_read - Helper function to read the console state
+ * @con:	Console to read
+ * @state:	The state to store the result
+ */
+static inline void nbcon_state_read(struct console *con, struct nbcon_state *state)
+{
+	state->atom = atomic_read(&ACCESS_PRIVATE(con, nbcon_state));
+}
+
+/**
+ * nbcon_state_try_cmpxchg() - Helper function for atomic_try_cmpxchg() on console state
+ * @con:	Console to update
+ * @cur:	Old/expected state
+ * @new:	New state
+ *
+ * Return: True on success. False on fail and @cur is updated.
+ */
+static inline bool nbcon_state_try_cmpxchg(struct console *con, struct nbcon_state *cur,
+					   struct nbcon_state *new)
+{
+	return atomic_try_cmpxchg(&ACCESS_PRIVATE(con, nbcon_state), &cur->atom, new->atom);
+}
+
+/**
+ * nbcon_init - Initialize the nbcon console specific data
+ * @con:	Console to initialize
+ */
+void nbcon_init(struct console *con)
+{
+	struct nbcon_state state = { };
+
+	nbcon_state_set(con, &state);
+}
+
+/**
+ * nbcon_cleanup - Cleanup the nbcon console specific data
+ * @con:	Console to cleanup
+ */
+void nbcon_cleanup(struct console *con)
+{
+	struct nbcon_state state = { };
+
+	nbcon_state_set(con, &state);
+}
diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
index 7e0b4dd02398..5f372eaceb29 100644
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -3326,9 +3326,10 @@ static void try_enable_default_console(struct console *newcon)
 		newcon->flags |= CON_CONSDEV;
 }
 
-#define con_printk(lvl, con, fmt, ...)			\
-	printk(lvl pr_fmt("%sconsole [%s%d] " fmt),	\
-	       (con->flags & CON_BOOT) ? "boot" : "",	\
+#define con_printk(lvl, con, fmt, ...)				\
+	printk(lvl pr_fmt("%s%sconsole [%s%d] " fmt),		\
+	       (con->flags & CON_NBCON) ? "" : "legacy ",	\
+	       (con->flags & CON_BOOT) ? "boot" : "",		\
 	       con->name, con->index, ##__VA_ARGS__)
 
 static void console_init_seq(struct console *newcon, bool bootcon_registered)
@@ -3488,6 +3489,9 @@ void register_console(struct console *newcon)
 	newcon->dropped = 0;
 	console_init_seq(newcon, bootcon_registered);
 
+	if (newcon->flags & CON_NBCON)
+		nbcon_init(newcon);
+
 	/*
 	 * Put this console in the list - keep the
 	 * preferred driver at the head of the list.
@@ -3579,6 +3583,9 @@ static int unregister_console_locked(struct console *console)
 	 */
 	synchronize_srcu(&console_srcu);
 
+	if (console->flags & CON_NBCON)
+		nbcon_cleanup(console);
+
 	console_sysfs_notify();
 
 	if (console->exit)
-- 
cgit v1.2.3


From 3a5bb25162b880da749f7cdf281c78dbade4164b Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Sat, 16 Sep 2023 21:26:01 +0206
Subject: printk: nbcon: Add acquire/release logic

Add per console acquire/release functionality.

The state of the console is maintained in the "nbcon_state" atomic
variable.

The console is locked when:

  - The 'prio' field contains the priority of the context that owns the
    console. Only higher priority contexts are allowed to take over the
    lock. A value of 0 (NBCON_PRIO_NONE) means the console is not locked.

  - The 'cpu' field denotes on which CPU the console is locked. It is used
    to prevent busy waiting on the same CPU. Also it informs the lock owner
    that it has lost the lock in a more complex scenario when the lock was
    taken over by a higher priority context, released, and taken on another
    CPU with the same priority as the interrupted owner.

The acquire mechanism uses a few more fields:

  - The 'req_prio' field is used by the handover approach to make the
    current owner aware that there is a context with a higher priority
    waiting for the friendly handover.

  - The 'unsafe' field allows to take over the console in a safe way in the
    middle of emitting a message. The field is set only when accessing some
    shared resources or when the console device is manipulated. It can be
    cleared, for example, after emitting one character when the console
    device is in a consistent state.

  - The 'unsafe_takeover' field is set when a hostile takeover took the
    console in an unsafe state. The console will stay in the unsafe state
    until re-initialized.

The acquire mechanism uses three approaches:

  1) Direct acquire when the console is not owned or is owned by a lower
     priority context and is in a safe state.

  2) Friendly handover mechanism uses a request/grant handshake. It is used
     when the current owner has lower priority and the console is in an
     unsafe state.

     The requesting context:

       a) Sets its priority into the 'req_prio' field.

       b) Waits (with a timeout) for the owning context to unlock the
          console.

       c) Takes the lock and clears the 'req_prio' field.

     The owning context:

       a) Observes the 'req_prio' field set on exit from the unsafe
          console state.

       b) Gives up console ownership by clearing the 'prio' field.

  3) Unsafe hostile takeover allows to take over the lock even when the
     console is an unsafe state. It is used only in panic() by the final
     attempt to flush consoles in a try and hope mode.

     Note that separate record buffers are used in panic(). As a result,
     the messages can be read and formatted without any risk even after
     using the hostile takeover in unsafe state.

The release function simply clears the 'prio' field.

All operations on @console::nbcon_state are atomic cmpxchg based to
handle concurrency.

The acquire/release functions implement only minimal policies:

  - Preference for higher priority contexts.
  - Protection of the panic CPU.

All other policy decisions must be made at the call sites:

  - What is marked as an unsafe section.
  - Whether to spin-wait if there is already an owner and the console is
    in an unsafe state.
  - Whether to attempt an unsafe hostile takeover.

The design allows to implement the well known:

    acquire()
    output_one_printk_record()
    release()

The output of one printk record might be interrupted with a higher priority
context. The new owner is supposed to reprint the entire interrupted record
from scratch.

Co-developed-by: John Ogness <john.ogness@linutronix.de>
Signed-off-by: John Ogness <john.ogness@linutronix.de>
Signed-off-by: Thomas Gleixner (Intel) <tglx@linutronix.de>
Signed-off-by: Petr Mladek <pmladek@suse.com>
Link: https://lore.kernel.org/r/20230916192007.608398-3-john.ogness@linutronix.de
---
 include/linux/console.h |  56 ++++++
 kernel/printk/nbcon.c   | 497 ++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 553 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/console.h b/include/linux/console.h
index a2d37a7a98a8..98210fd01f18 100644
--- a/include/linux/console.h
+++ b/include/linux/console.h
@@ -175,13 +175,29 @@ enum cons_flags {
  * struct nbcon_state - console state for nbcon consoles
  * @atom:	Compound of the state fields for atomic operations
  *
+ * @req_prio:		The priority of a handover request
+ * @prio:		The priority of the current owner
+ * @unsafe:		Console is busy in a non takeover region
+ * @unsafe_takeover:	A hostile takeover in an unsafe state happened in the
+ *			past. The console cannot be safe until re-initialized.
+ * @cpu:		The CPU on which the owner runs
+ *
  * To be used for reading and preparing of the value stored in the nbcon
  * state variable @console::nbcon_state.
+ *
+ * The @prio and @req_prio fields are particularly important to allow
+ * spin-waiting to timeout and give up without the risk of a waiter being
+ * assigned the lock after giving up.
  */
 struct nbcon_state {
 	union {
 		unsigned int	atom;
 		struct {
+			unsigned int prio		:  2;
+			unsigned int req_prio		:  2;
+			unsigned int unsafe		:  1;
+			unsigned int unsafe_takeover	:  1;
+			unsigned int cpu		: 24;
 		};
 	};
 };
@@ -194,6 +210,46 @@ struct nbcon_state {
  */
 static_assert(sizeof(struct nbcon_state) <= sizeof(int));
 
+/**
+ * nbcon_prio - console owner priority for nbcon consoles
+ * @NBCON_PRIO_NONE:		Unused
+ * @NBCON_PRIO_NORMAL:		Normal (non-emergency) usage
+ * @NBCON_PRIO_EMERGENCY:	Emergency output (WARN/OOPS...)
+ * @NBCON_PRIO_PANIC:		Panic output
+ * @NBCON_PRIO_MAX:		The number of priority levels
+ *
+ * A higher priority context can takeover the console when it is
+ * in the safe state. The final attempt to flush consoles in panic()
+ * can be allowed to do so even in an unsafe state (Hope and pray).
+ */
+enum nbcon_prio {
+	NBCON_PRIO_NONE = 0,
+	NBCON_PRIO_NORMAL,
+	NBCON_PRIO_EMERGENCY,
+	NBCON_PRIO_PANIC,
+	NBCON_PRIO_MAX,
+};
+
+struct console;
+
+/**
+ * struct nbcon_context - Context for console acquire/release
+ * @console:			The associated console
+ * @spinwait_max_us:		Limit for spin-wait acquire
+ * @prio:			Priority of the context
+ * @allow_unsafe_takeover:	Allow performing takeover even if unsafe. Can
+ *				be used only with NBCON_PRIO_PANIC @prio. It
+ *				might cause a system freeze when the console
+ *				is used later.
+ */
+struct nbcon_context {
+	/* members set by caller */
+	struct console		*console;
+	unsigned int		spinwait_max_us;
+	enum nbcon_prio		prio;
+	unsigned int		allow_unsafe_takeover	: 1;
+};
+
 /**
  * struct console - The console descriptor structure
  * @name:		The name of the console driver
diff --git a/kernel/printk/nbcon.c b/kernel/printk/nbcon.c
index 63d24ca62ac5..a2a354f859f9 100644
--- a/kernel/printk/nbcon.c
+++ b/kernel/printk/nbcon.c
@@ -4,10 +4,98 @@
 
 #include <linux/kernel.h>
 #include <linux/console.h>
+#include <linux/delay.h>
 #include "internal.h"
 /*
  * Printk console printing implementation for consoles which does not depend
  * on the legacy style console_lock mechanism.
+ *
+ * The state of the console is maintained in the "nbcon_state" atomic
+ * variable.
+ *
+ * The console is locked when:
+ *
+ *   - The 'prio' field contains the priority of the context that owns the
+ *     console. Only higher priority contexts are allowed to take over the
+ *     lock. A value of 0 (NBCON_PRIO_NONE) means the console is not locked.
+ *
+ *   - The 'cpu' field denotes on which CPU the console is locked. It is used
+ *     to prevent busy waiting on the same CPU. Also it informs the lock owner
+ *     that it has lost the lock in a more complex scenario when the lock was
+ *     taken over by a higher priority context, released, and taken on another
+ *     CPU with the same priority as the interrupted owner.
+ *
+ * The acquire mechanism uses a few more fields:
+ *
+ *   - The 'req_prio' field is used by the handover approach to make the
+ *     current owner aware that there is a context with a higher priority
+ *     waiting for the friendly handover.
+ *
+ *   - The 'unsafe' field allows to take over the console in a safe way in the
+ *     middle of emitting a message. The field is set only when accessing some
+ *     shared resources or when the console device is manipulated. It can be
+ *     cleared, for example, after emitting one character when the console
+ *     device is in a consistent state.
+ *
+ *   - The 'unsafe_takeover' field is set when a hostile takeover took the
+ *     console in an unsafe state. The console will stay in the unsafe state
+ *     until re-initialized.
+ *
+ * The acquire mechanism uses three approaches:
+ *
+ *   1) Direct acquire when the console is not owned or is owned by a lower
+ *      priority context and is in a safe state.
+ *
+ *   2) Friendly handover mechanism uses a request/grant handshake. It is used
+ *      when the current owner has lower priority and the console is in an
+ *      unsafe state.
+ *
+ *      The requesting context:
+ *
+ *        a) Sets its priority into the 'req_prio' field.
+ *
+ *        b) Waits (with a timeout) for the owning context to unlock the
+ *           console.
+ *
+ *        c) Takes the lock and clears the 'req_prio' field.
+ *
+ *      The owning context:
+ *
+ *        a) Observes the 'req_prio' field set on exit from the unsafe
+ *           console state.
+ *
+ *        b) Gives up console ownership by clearing the 'prio' field.
+ *
+ *   3) Unsafe hostile takeover allows to take over the lock even when the
+ *      console is an unsafe state. It is used only in panic() by the final
+ *      attempt to flush consoles in a try and hope mode.
+ *
+ * The release function simply clears the 'prio' field.
+ *
+ * All operations on @console::nbcon_state are atomic cmpxchg based to
+ * handle concurrency.
+ *
+ * The acquire/release functions implement only minimal policies:
+ *
+ *   - Preference for higher priority contexts.
+ *   - Protection of the panic CPU.
+ *
+ * All other policy decisions must be made at the call sites:
+ *
+ *   - What is marked as an unsafe section.
+ *   - Whether to spin-wait if there is already an owner and the console is
+ *     in an unsafe state.
+ *   - Whether to attempt an unsafe hostile takeover.
+ *
+ * The design allows to implement the well known:
+ *
+ *     acquire()
+ *     output_one_printk_record()
+ *     release()
+ *
+ * The output of one printk record might be interrupted with a higher priority
+ * context. The new owner is supposed to reprint the entire interrupted record
+ * from scratch.
  */
 
 /**
@@ -47,6 +135,415 @@ static inline bool nbcon_state_try_cmpxchg(struct console *con, struct nbcon_sta
 	return atomic_try_cmpxchg(&ACCESS_PRIVATE(con, nbcon_state), &cur->atom, new->atom);
 }
 
+/**
+ * nbcon_context_try_acquire_direct - Try to acquire directly
+ * @ctxt:	The context of the caller
+ * @cur:	The current console state
+ *
+ * Acquire the console when it is released. Also acquire the console when
+ * the current owner has a lower priority and the console is in a safe state.
+ *
+ * Return:	0 on success. Otherwise, an error code on failure. Also @cur
+ *		is updated to the latest state when failed to modify it.
+ *
+ * Errors:
+ *
+ *	-EPERM:		A panic is in progress and this is not the panic CPU.
+ *			Or the current owner or waiter has the same or higher
+ *			priority. No acquire method can be successful in
+ *			this case.
+ *
+ *	-EBUSY:		The current owner has a lower priority but the console
+ *			in an unsafe state. The caller should try using
+ *			the handover acquire method.
+ */
+static int nbcon_context_try_acquire_direct(struct nbcon_context *ctxt,
+					    struct nbcon_state *cur)
+{
+	unsigned int cpu = smp_processor_id();
+	struct console *con = ctxt->console;
+	struct nbcon_state new;
+
+	do {
+		if (other_cpu_in_panic())
+			return -EPERM;
+
+		if (ctxt->prio <= cur->prio || ctxt->prio <= cur->req_prio)
+			return -EPERM;
+
+		if (cur->unsafe)
+			return -EBUSY;
+
+		/*
+		 * The console should never be safe for a direct acquire
+		 * if an unsafe hostile takeover has ever happened.
+		 */
+		WARN_ON_ONCE(cur->unsafe_takeover);
+
+		new.atom = cur->atom;
+		new.prio	= ctxt->prio;
+		new.req_prio	= NBCON_PRIO_NONE;
+		new.unsafe	= cur->unsafe_takeover;
+		new.cpu		= cpu;
+
+	} while (!nbcon_state_try_cmpxchg(con, cur, &new));
+
+	return 0;
+}
+
+static bool nbcon_waiter_matches(struct nbcon_state *cur, int expected_prio)
+{
+	/*
+	 * The request context is well defined by the @req_prio because:
+	 *
+	 * - Only a context with a higher priority can take over the request.
+	 * - There are only three priorities.
+	 * - Only one CPU is allowed to request PANIC priority.
+	 * - Lower priorities are ignored during panic() until reboot.
+	 *
+	 * As a result, the following scenario is *not* possible:
+	 *
+	 * 1. Another context with a higher priority directly takes ownership.
+	 * 2. The higher priority context releases the ownership.
+	 * 3. A lower priority context takes the ownership.
+	 * 4. Another context with the same priority as this context
+	 *    creates a request and starts waiting.
+	 */
+
+	return (cur->req_prio == expected_prio);
+}
+
+/**
+ * nbcon_context_try_acquire_requested - Try to acquire after having
+ *					 requested a handover
+ * @ctxt:	The context of the caller
+ * @cur:	The current console state
+ *
+ * This is a helper function for nbcon_context_try_acquire_handover().
+ * It is called when the console is in an unsafe state. The current
+ * owner will release the console on exit from the unsafe region.
+ *
+ * Return:	0 on success and @cur is updated to the new console state.
+ *		Otherwise an error code on failure.
+ *
+ * Errors:
+ *
+ *	-EPERM:		A panic is in progress and this is not the panic CPU
+ *			or this context is no longer the waiter.
+ *
+ *	-EBUSY:		The console is still locked. The caller should
+ *			continue waiting.
+ *
+ * Note: The caller must still remove the request when an error has occurred
+ *       except when this context is no longer the waiter.
+ */
+static int nbcon_context_try_acquire_requested(struct nbcon_context *ctxt,
+					       struct nbcon_state *cur)
+{
+	unsigned int cpu = smp_processor_id();
+	struct console *con = ctxt->console;
+	struct nbcon_state new;
+
+	/* Note that the caller must still remove the request! */
+	if (other_cpu_in_panic())
+		return -EPERM;
+
+	/*
+	 * Note that the waiter will also change if there was an unsafe
+	 * hostile takeover.
+	 */
+	if (!nbcon_waiter_matches(cur, ctxt->prio))
+		return -EPERM;
+
+	/* If still locked, caller should continue waiting. */
+	if (cur->prio != NBCON_PRIO_NONE)
+		return -EBUSY;
+
+	/*
+	 * The previous owner should have never released ownership
+	 * in an unsafe region.
+	 */
+	WARN_ON_ONCE(cur->unsafe);
+
+	new.atom = cur->atom;
+	new.prio	= ctxt->prio;
+	new.req_prio	= NBCON_PRIO_NONE;
+	new.unsafe	= cur->unsafe_takeover;
+	new.cpu		= cpu;
+
+	if (!nbcon_state_try_cmpxchg(con, cur, &new)) {
+		/*
+		 * The acquire could fail only when it has been taken
+		 * over by a higher priority context.
+		 */
+		WARN_ON_ONCE(nbcon_waiter_matches(cur, ctxt->prio));
+		return -EPERM;
+	}
+
+	/* Handover success. This context now owns the console. */
+	return 0;
+}
+
+/**
+ * nbcon_context_try_acquire_handover - Try to acquire via handover
+ * @ctxt:	The context of the caller
+ * @cur:	The current console state
+ *
+ * The function must be called only when the context has higher priority
+ * than the current owner and the console is in an unsafe state.
+ * It is the case when nbcon_context_try_acquire_direct() returns -EBUSY.
+ *
+ * The function sets "req_prio" field to make the current owner aware of
+ * the request. Then it waits until the current owner releases the console,
+ * or an even higher context takes over the request, or timeout expires.
+ *
+ * The current owner checks the "req_prio" field on exit from the unsafe
+ * region and releases the console. It does not touch the "req_prio" field
+ * so that the console stays reserved for the waiter.
+ *
+ * Return:	0 on success. Otherwise, an error code on failure. Also @cur
+ *		is updated to the latest state when failed to modify it.
+ *
+ * Errors:
+ *
+ *	-EPERM:		A panic is in progress and this is not the panic CPU.
+ *			Or a higher priority context has taken over the
+ *			console or the handover request.
+ *
+ *	-EBUSY:		The current owner is on the same CPU so that the hand
+ *			shake could not work. Or the current owner is not
+ *			willing to wait (zero timeout). Or the console does
+ *			not enter the safe state before timeout passed. The
+ *			caller might still use the unsafe hostile takeover
+ *			when allowed.
+ *
+ *	-EAGAIN:	@cur has changed when creating the handover request.
+ *			The caller should retry with direct acquire.
+ */
+static int nbcon_context_try_acquire_handover(struct nbcon_context *ctxt,
+					      struct nbcon_state *cur)
+{
+	unsigned int cpu = smp_processor_id();
+	struct console *con = ctxt->console;
+	struct nbcon_state new;
+	int timeout;
+	int request_err = -EBUSY;
+
+	/*
+	 * Check that the handover is called when the direct acquire failed
+	 * with -EBUSY.
+	 */
+	WARN_ON_ONCE(ctxt->prio <= cur->prio || ctxt->prio <= cur->req_prio);
+	WARN_ON_ONCE(!cur->unsafe);
+
+	/* Handover is not possible on the same CPU. */
+	if (cur->cpu == cpu)
+		return -EBUSY;
+
+	/*
+	 * Console stays unsafe after an unsafe takeover until re-initialized.
+	 * Waiting is not going to help in this case.
+	 */
+	if (cur->unsafe_takeover)
+		return -EBUSY;
+
+	/* Is the caller willing to wait? */
+	if (ctxt->spinwait_max_us == 0)
+		return -EBUSY;
+
+	/*
+	 * Setup a request for the handover. The caller should try to acquire
+	 * the console directly when the current state has been modified.
+	 */
+	new.atom = cur->atom;
+	new.req_prio = ctxt->prio;
+	if (!nbcon_state_try_cmpxchg(con, cur, &new))
+		return -EAGAIN;
+
+	cur->atom = new.atom;
+
+	/* Wait until there is no owner and then acquire the console. */
+	for (timeout = ctxt->spinwait_max_us; timeout >= 0; timeout--) {
+		/* On successful acquire, this request is cleared. */
+		request_err = nbcon_context_try_acquire_requested(ctxt, cur);
+		if (!request_err)
+			return 0;
+
+		/*
+		 * If the acquire should be aborted, it must be ensured
+		 * that the request is removed before returning to caller.
+		 */
+		if (request_err == -EPERM)
+			break;
+
+		udelay(1);
+
+		/* Re-read the state because some time has passed. */
+		nbcon_state_read(con, cur);
+	}
+
+	/* Timed out or aborted. Carefully remove handover request. */
+	do {
+		/*
+		 * No need to remove request if there is a new waiter. This
+		 * can only happen if a higher priority context has taken over
+		 * the console or the handover request.
+		 */
+		if (!nbcon_waiter_matches(cur, ctxt->prio))
+			return -EPERM;
+
+		/* Unset request for handover. */
+		new.atom = cur->atom;
+		new.req_prio = NBCON_PRIO_NONE;
+		if (nbcon_state_try_cmpxchg(con, cur, &new)) {
+			/*
+			 * Request successfully unset. Report failure of
+			 * acquiring via handover.
+			 */
+			cur->atom = new.atom;
+			return request_err;
+		}
+
+		/*
+		 * Unable to remove request. Try to acquire in case
+		 * the owner has released the lock.
+		 */
+	} while (nbcon_context_try_acquire_requested(ctxt, cur));
+
+	/* Lucky timing. The acquire succeeded while removing the request. */
+	return 0;
+}
+
+/**
+ * nbcon_context_try_acquire_hostile - Acquire via unsafe hostile takeover
+ * @ctxt:	The context of the caller
+ * @cur:	The current console state
+ *
+ * Acquire the console even in the unsafe state.
+ *
+ * It can be permitted by setting the 'allow_unsafe_takeover' field only
+ * by the final attempt to flush messages in panic().
+ *
+ * Return:	0 on success. -EPERM when not allowed by the context.
+ */
+static int nbcon_context_try_acquire_hostile(struct nbcon_context *ctxt,
+					     struct nbcon_state *cur)
+{
+	unsigned int cpu = smp_processor_id();
+	struct console *con = ctxt->console;
+	struct nbcon_state new;
+
+	if (!ctxt->allow_unsafe_takeover)
+		return -EPERM;
+
+	/* Ensure caller is allowed to perform unsafe hostile takeovers. */
+	if (WARN_ON_ONCE(ctxt->prio != NBCON_PRIO_PANIC))
+		return -EPERM;
+
+	/*
+	 * Check that try_acquire_direct() and try_acquire_handover() returned
+	 * -EBUSY in the right situation.
+	 */
+	WARN_ON_ONCE(ctxt->prio <= cur->prio || ctxt->prio <= cur->req_prio);
+	WARN_ON_ONCE(cur->unsafe != true);
+
+	do {
+		new.atom = cur->atom;
+		new.cpu			= cpu;
+		new.prio		= ctxt->prio;
+		new.unsafe		|= cur->unsafe_takeover;
+		new.unsafe_takeover	|= cur->unsafe;
+
+	} while (!nbcon_state_try_cmpxchg(con, cur, &new));
+
+	return 0;
+}
+
+/**
+ * nbcon_context_try_acquire - Try to acquire nbcon console
+ * @ctxt:	The context of the caller
+ *
+ * Return:	True if the console was acquired. False otherwise.
+ *
+ * If the caller allowed an unsafe hostile takeover, on success the
+ * caller should check the current console state to see if it is
+ * in an unsafe state. Otherwise, on success the caller may assume
+ * the console is not in an unsafe state.
+ */
+__maybe_unused
+static bool nbcon_context_try_acquire(struct nbcon_context *ctxt)
+{
+	struct console *con = ctxt->console;
+	struct nbcon_state cur;
+	int err;
+
+	nbcon_state_read(con, &cur);
+try_again:
+	err = nbcon_context_try_acquire_direct(ctxt, &cur);
+	if (err != -EBUSY)
+		goto out;
+
+	err = nbcon_context_try_acquire_handover(ctxt, &cur);
+	if (err == -EAGAIN)
+		goto try_again;
+	if (err != -EBUSY)
+		goto out;
+
+	err = nbcon_context_try_acquire_hostile(ctxt, &cur);
+out:
+	return !err;
+}
+
+static bool nbcon_owner_matches(struct nbcon_state *cur, int expected_cpu,
+				int expected_prio)
+{
+	/*
+	 * Since consoles can only be acquired by higher priorities,
+	 * owning contexts are uniquely identified by @prio. However,
+	 * since contexts can unexpectedly lose ownership, it is
+	 * possible that later another owner appears with the same
+	 * priority. For this reason @cpu is also needed.
+	 */
+
+	if (cur->prio != expected_prio)
+		return false;
+
+	if (cur->cpu != expected_cpu)
+		return false;
+
+	return true;
+}
+
+/**
+ * nbcon_context_release - Release the console
+ * @ctxt:	The nbcon context from nbcon_context_try_acquire()
+ */
+__maybe_unused
+static void nbcon_context_release(struct nbcon_context *ctxt)
+{
+	unsigned int cpu = smp_processor_id();
+	struct console *con = ctxt->console;
+	struct nbcon_state cur;
+	struct nbcon_state new;
+
+	nbcon_state_read(con, &cur);
+
+	do {
+		if (!nbcon_owner_matches(&cur, cpu, ctxt->prio))
+			return;
+
+		new.atom = cur.atom;
+		new.prio = NBCON_PRIO_NONE;
+
+		/*
+		 * If @unsafe_takeover is set, it is kept set so that
+		 * the state remains permanently unsafe.
+		 */
+		new.unsafe |= cur.unsafe_takeover;
+
+	} while (!nbcon_state_try_cmpxchg(con, &cur, &new));
+}
+
 /**
  * nbcon_init - Initialize the nbcon console specific data
  * @con:	Console to initialize
-- 
cgit v1.2.3


From 5634c90fd8553de7cbafecd048d0273690a2e84e Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Sat, 16 Sep 2023 21:26:03 +0206
Subject: printk: nbcon: Add buffer management

In case of hostile takeovers it must be ensured that the previous
owner cannot scribble over the output buffer of the emergency/panic
context. This is achieved by:

 - Adding a global output buffer instance for the panic context.
   This is the only situation where hostile takeovers can occur and
   there is always at most 1 panic context.

 - Allocating an output buffer per non-boot console upon console
   registration. This buffer is used by the console owner when not
   in panic context. (For boot consoles, the existing shared global
   legacy output buffer is used instead. Boot console printing will
   be synchronized with legacy console printing.)

 - Choosing the appropriate buffer is handled in the acquire/release
   functions.

Co-developed-by: John Ogness <john.ogness@linutronix.de>
Signed-off-by: John Ogness <john.ogness@linutronix.de>
Signed-off-by: Thomas Gleixner (Intel) <tglx@linutronix.de>
Reviewed-by: Petr Mladek <pmladek@suse.com>
Signed-off-by: Petr Mladek <pmladek@suse.com>
Link: https://lore.kernel.org/r/20230916192007.608398-5-john.ogness@linutronix.de
---
 include/linux/console.h  |  7 +++++
 kernel/printk/internal.h | 12 ++++++--
 kernel/printk/nbcon.c    | 73 ++++++++++++++++++++++++++++++++++++++++++++----
 kernel/printk/printk.c   | 22 +++++++++------
 4 files changed, 99 insertions(+), 15 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/console.h b/include/linux/console.h
index 98210fd01f18..ca1ef8700e55 100644
--- a/include/linux/console.h
+++ b/include/linux/console.h
@@ -231,6 +231,7 @@ enum nbcon_prio {
 };
 
 struct console;
+struct printk_buffers;
 
 /**
  * struct nbcon_context - Context for console acquire/release
@@ -241,6 +242,7 @@ struct console;
  *				be used only with NBCON_PRIO_PANIC @prio. It
  *				might cause a system freeze when the console
  *				is used later.
+ * @pbufs:			Pointer to the text buffer for this context
  */
 struct nbcon_context {
 	/* members set by caller */
@@ -248,6 +250,9 @@ struct nbcon_context {
 	unsigned int		spinwait_max_us;
 	enum nbcon_prio		prio;
 	unsigned int		allow_unsafe_takeover	: 1;
+
+	/* members set by acquire */
+	struct printk_buffers	*pbufs;
 };
 
 /**
@@ -271,6 +276,7 @@ struct nbcon_context {
  * @node:		hlist node for the console list
  *
  * @nbcon_state:	State for nbcon consoles
+ * @pbufs:		Pointer to nbcon private buffer
  */
 struct console {
 	char			name[16];
@@ -293,6 +299,7 @@ struct console {
 
 	/* nbcon console specific members */
 	atomic_t		__private nbcon_state;
+	struct printk_buffers	*pbufs;
 };
 
 #ifdef CONFIG_LOCKDEP
diff --git a/kernel/printk/internal.h b/kernel/printk/internal.h
index 7199d60bfc25..f6161cd75d7d 100644
--- a/kernel/printk/internal.h
+++ b/kernel/printk/internal.h
@@ -13,6 +13,12 @@ int devkmsg_sysctl_set_loglvl(struct ctl_table *table, int write,
 #define printk_sysctl_init() do { } while (0)
 #endif
 
+#define con_printk(lvl, con, fmt, ...)				\
+	printk(lvl pr_fmt("%s%sconsole [%s%d] " fmt),		\
+		(con->flags & CON_NBCON) ? "" : "legacy ",	\
+		(con->flags & CON_BOOT) ? "boot" : "",		\
+		con->name, con->index, ##__VA_ARGS__)
+
 #ifdef CONFIG_PRINTK
 
 #ifdef CONFIG_PRINTK_CALLER
@@ -63,8 +69,9 @@ void defer_console_output(void);
 u16 printk_parse_prefix(const char *text, int *level,
 			enum printk_info_flags *flags);
 
+bool nbcon_alloc(struct console *con);
 void nbcon_init(struct console *con);
-void nbcon_cleanup(struct console *con);
+void nbcon_free(struct console *con);
 
 #else
 
@@ -81,8 +88,9 @@ void nbcon_cleanup(struct console *con);
 #define printk_safe_exit_irqrestore(flags) local_irq_restore(flags)
 
 static inline bool printk_percpu_data_ready(void) { return false; }
+static inline bool nbcon_alloc(struct console *con) { return false; }
 static inline void nbcon_init(struct console *con) { }
-static inline void nbcon_cleanup(struct console *con) { }
+static inline void nbcon_free(struct console *con) { }
 
 #endif /* CONFIG_PRINTK */
 
diff --git a/kernel/printk/nbcon.c b/kernel/printk/nbcon.c
index a2a354f859f9..ba1febf15db6 100644
--- a/kernel/printk/nbcon.c
+++ b/kernel/printk/nbcon.c
@@ -5,6 +5,7 @@
 #include <linux/kernel.h>
 #include <linux/console.h>
 #include <linux/delay.h>
+#include <linux/slab.h>
 #include "internal.h"
 /*
  * Printk console printing implementation for consoles which does not depend
@@ -70,6 +71,10 @@
  *      console is an unsafe state. It is used only in panic() by the final
  *      attempt to flush consoles in a try and hope mode.
  *
+ *      Note that separate record buffers are used in panic(). As a result,
+ *      the messages can be read and formatted without any risk even after
+ *      using the hostile takeover in unsafe state.
+ *
  * The release function simply clears the 'prio' field.
  *
  * All operations on @console::nbcon_state are atomic cmpxchg based to
@@ -459,6 +464,8 @@ static int nbcon_context_try_acquire_hostile(struct nbcon_context *ctxt,
 	return 0;
 }
 
+static struct printk_buffers panic_nbcon_pbufs;
+
 /**
  * nbcon_context_try_acquire - Try to acquire nbcon console
  * @ctxt:	The context of the caller
@@ -473,6 +480,7 @@ static int nbcon_context_try_acquire_hostile(struct nbcon_context *ctxt,
 __maybe_unused
 static bool nbcon_context_try_acquire(struct nbcon_context *ctxt)
 {
+	unsigned int cpu = smp_processor_id();
 	struct console *con = ctxt->console;
 	struct nbcon_state cur;
 	int err;
@@ -491,7 +499,18 @@ try_again:
 
 	err = nbcon_context_try_acquire_hostile(ctxt, &cur);
 out:
-	return !err;
+	if (err)
+		return false;
+
+	/* Acquire succeeded. */
+
+	/* Assign the appropriate buffer for this context. */
+	if (atomic_read(&panic_cpu) == cpu)
+		ctxt->pbufs = &panic_nbcon_pbufs;
+	else
+		ctxt->pbufs = con->pbufs;
+
+	return true;
 }
 
 static bool nbcon_owner_matches(struct nbcon_state *cur, int expected_cpu,
@@ -530,7 +549,7 @@ static void nbcon_context_release(struct nbcon_context *ctxt)
 
 	do {
 		if (!nbcon_owner_matches(&cur, cpu, ctxt->prio))
-			return;
+			break;
 
 		new.atom = cur.atom;
 		new.prio = NBCON_PRIO_NONE;
@@ -542,26 +561,70 @@ static void nbcon_context_release(struct nbcon_context *ctxt)
 		new.unsafe |= cur.unsafe_takeover;
 
 	} while (!nbcon_state_try_cmpxchg(con, &cur, &new));
+
+	ctxt->pbufs = NULL;
+}
+
+/**
+ * nbcon_alloc - Allocate buffers needed by the nbcon console
+ * @con:	Console to allocate buffers for
+ *
+ * Return:	True on success. False otherwise and the console cannot
+ *		be used.
+ *
+ * This is not part of nbcon_init() because buffer allocation must
+ * be performed earlier in the console registration process.
+ */
+bool nbcon_alloc(struct console *con)
+{
+	if (con->flags & CON_BOOT) {
+		/*
+		 * Boot console printing is synchronized with legacy console
+		 * printing, so boot consoles can share the same global printk
+		 * buffers.
+		 */
+		con->pbufs = &printk_shared_pbufs;
+	} else {
+		con->pbufs = kmalloc(sizeof(*con->pbufs), GFP_KERNEL);
+		if (!con->pbufs) {
+			con_printk(KERN_ERR, con, "failed to allocate printing buffer\n");
+			return false;
+		}
+	}
+
+	return true;
 }
 
 /**
  * nbcon_init - Initialize the nbcon console specific data
  * @con:	Console to initialize
+ *
+ * nbcon_alloc() *must* be called and succeed before this function
+ * is called.
  */
 void nbcon_init(struct console *con)
 {
 	struct nbcon_state state = { };
 
+	/* nbcon_alloc() must have been called and successful! */
+	BUG_ON(!con->pbufs);
+
 	nbcon_state_set(con, &state);
 }
 
 /**
- * nbcon_cleanup - Cleanup the nbcon console specific data
- * @con:	Console to cleanup
+ * nbcon_free - Free and cleanup the nbcon console specific data
+ * @con:	Console to free/cleanup nbcon data
  */
-void nbcon_cleanup(struct console *con)
+void nbcon_free(struct console *con)
 {
 	struct nbcon_state state = { };
 
 	nbcon_state_set(con, &state);
+
+	/* Boot consoles share global printk buffers. */
+	if (!(con->flags & CON_BOOT))
+		kfree(con->pbufs);
+
+	con->pbufs = NULL;
 }
diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
index 17def3791bc0..1c9720acd960 100644
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -3331,12 +3331,6 @@ static void try_enable_default_console(struct console *newcon)
 		newcon->flags |= CON_CONSDEV;
 }
 
-#define con_printk(lvl, con, fmt, ...)				\
-	printk(lvl pr_fmt("%s%sconsole [%s%d] " fmt),		\
-	       (con->flags & CON_NBCON) ? "" : "legacy ",	\
-	       (con->flags & CON_BOOT) ? "boot" : "",		\
-	       con->name, con->index, ##__VA_ARGS__)
-
 static void console_init_seq(struct console *newcon, bool bootcon_registered)
 {
 	struct console *con;
@@ -3450,6 +3444,15 @@ void register_console(struct console *newcon)
 		goto unlock;
 	}
 
+	if (newcon->flags & CON_NBCON) {
+		/*
+		 * Ensure the nbcon console buffers can be allocated
+		 * before modifying any global data.
+		 */
+		if (!nbcon_alloc(newcon))
+			goto unlock;
+	}
+
 	/*
 	 * See if we want to enable this console driver by default.
 	 *
@@ -3477,8 +3480,11 @@ void register_console(struct console *newcon)
 		err = try_enable_preferred_console(newcon, false);
 
 	/* printk() messages are not printed to the Braille console. */
-	if (err || newcon->flags & CON_BRL)
+	if (err || newcon->flags & CON_BRL) {
+		if (newcon->flags & CON_NBCON)
+			nbcon_free(newcon);
 		goto unlock;
+	}
 
 	/*
 	 * If we have a bootconsole, and are switching to a real console,
@@ -3589,7 +3595,7 @@ static int unregister_console_locked(struct console *console)
 	synchronize_srcu(&console_srcu);
 
 	if (console->flags & CON_NBCON)
-		nbcon_cleanup(console);
+		nbcon_free(console);
 
 	console_sysfs_notify();
 
-- 
cgit v1.2.3


From ad56ebd1d79b216dc147474fac89a11daf6b10df Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Sat, 16 Sep 2023 21:26:05 +0206
Subject: printk: nbcon: Add sequence handling

Add an atomic_long_t field @nbcon_seq to the console struct to
store the sequence number for nbcon consoles. For nbcon consoles
this will be used instead of the non-atomic @seq field. The new
field allows for safe atomic sequence number updates without
requiring any locking.

On 64bit systems the new field stores the full sequence number.
On 32bit systems the new field stores the lower 32 bits of the
sequence number, which are expanded to 64bit as needed by
folding the values based on the sequence numbers available in
the ringbuffer.

For 32bit systems, having a 32bit representation in the console
is sufficient. If a console ever gets more than 2^31 records
behind the ringbuffer then this is the least of the problems.

Co-developed-by: John Ogness <john.ogness@linutronix.de>
Signed-off-by: John Ogness <john.ogness@linutronix.de>
Signed-off-by: Thomas Gleixner (Intel) <tglx@linutronix.de>
Signed-off-by: Petr Mladek <pmladek@suse.com>
Link: https://lore.kernel.org/r/20230916192007.608398-7-john.ogness@linutronix.de
---
 include/linux/console.h  |   4 ++
 kernel/printk/internal.h |   7 ++++
 kernel/printk/nbcon.c    | 101 +++++++++++++++++++++++++++++++++++++++++++++++
 kernel/printk/printk.c   |  31 +++++++++++----
 4 files changed, 136 insertions(+), 7 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/console.h b/include/linux/console.h
index ca1ef8700e55..20cd486b76ad 100644
--- a/include/linux/console.h
+++ b/include/linux/console.h
@@ -243,6 +243,7 @@ struct printk_buffers;
  *				might cause a system freeze when the console
  *				is used later.
  * @pbufs:			Pointer to the text buffer for this context
+ * @seq:			The sequence number to print for this context
  */
 struct nbcon_context {
 	/* members set by caller */
@@ -253,6 +254,7 @@ struct nbcon_context {
 
 	/* members set by acquire */
 	struct printk_buffers	*pbufs;
+	u64			seq;
 };
 
 /**
@@ -276,6 +278,7 @@ struct nbcon_context {
  * @node:		hlist node for the console list
  *
  * @nbcon_state:	State for nbcon consoles
+ * @nbcon_seq:		Sequence number of the next record for nbcon to print
  * @pbufs:		Pointer to nbcon private buffer
  */
 struct console {
@@ -299,6 +302,7 @@ struct console {
 
 	/* nbcon console specific members */
 	atomic_t		__private nbcon_state;
+	atomic_long_t		__private nbcon_seq;
 	struct printk_buffers	*pbufs;
 };
 
diff --git a/kernel/printk/internal.h b/kernel/printk/internal.h
index f6161cd75d7d..6473f5ae4a18 100644
--- a/kernel/printk/internal.h
+++ b/kernel/printk/internal.h
@@ -4,6 +4,7 @@
  */
 #include <linux/percpu.h>
 #include <linux/console.h>
+#include "printk_ringbuffer.h"
 
 #if defined(CONFIG_PRINTK) && defined(CONFIG_SYSCTL)
 void __init printk_sysctl_init(void);
@@ -42,6 +43,8 @@ enum printk_info_flags {
 	LOG_CONT	= 8,	/* text is a fragment of a continuation line */
 };
 
+extern struct printk_ringbuffer *prb;
+
 __printf(4, 0)
 int vprintk_store(int facility, int level,
 		  const struct dev_printk_info *dev_info,
@@ -69,6 +72,8 @@ void defer_console_output(void);
 u16 printk_parse_prefix(const char *text, int *level,
 			enum printk_info_flags *flags);
 
+u64 nbcon_seq_read(struct console *con);
+void nbcon_seq_force(struct console *con, u64 seq);
 bool nbcon_alloc(struct console *con);
 void nbcon_init(struct console *con);
 void nbcon_free(struct console *con);
@@ -88,6 +93,8 @@ void nbcon_free(struct console *con);
 #define printk_safe_exit_irqrestore(flags) local_irq_restore(flags)
 
 static inline bool printk_percpu_data_ready(void) { return false; }
+static inline u64 nbcon_seq_read(struct console *con) { return 0; }
+static inline void nbcon_seq_force(struct console *con, u64 seq) { }
 static inline bool nbcon_alloc(struct console *con) { return false; }
 static inline void nbcon_init(struct console *con) { }
 static inline void nbcon_free(struct console *con) { }
diff --git a/kernel/printk/nbcon.c b/kernel/printk/nbcon.c
index 98e4be5429f0..e076096b31c0 100644
--- a/kernel/printk/nbcon.c
+++ b/kernel/printk/nbcon.c
@@ -140,6 +140,101 @@ static inline bool nbcon_state_try_cmpxchg(struct console *con, struct nbcon_sta
 	return atomic_try_cmpxchg(&ACCESS_PRIVATE(con, nbcon_state), &cur->atom, new->atom);
 }
 
+#ifdef CONFIG_64BIT
+
+#define __seq_to_nbcon_seq(seq) (seq)
+#define __nbcon_seq_to_seq(seq) (seq)
+
+#else /* CONFIG_64BIT */
+
+#define __seq_to_nbcon_seq(seq) ((u32)seq)
+
+static inline u64 __nbcon_seq_to_seq(u32 nbcon_seq)
+{
+	u64 seq;
+	u64 rb_next_seq;
+
+	/*
+	 * The provided sequence is only the lower 32 bits of the ringbuffer
+	 * sequence. It needs to be expanded to 64bit. Get the next sequence
+	 * number from the ringbuffer and fold it.
+	 *
+	 * Having a 32bit representation in the console is sufficient.
+	 * If a console ever gets more than 2^31 records behind
+	 * the ringbuffer then this is the least of the problems.
+	 *
+	 * Also the access to the ring buffer is always safe.
+	 */
+	rb_next_seq = prb_next_seq(prb);
+	seq = rb_next_seq - ((u32)rb_next_seq - nbcon_seq);
+
+	return seq;
+}
+
+#endif /* CONFIG_64BIT */
+
+/**
+ * nbcon_seq_read - Read the current console sequence
+ * @con:	Console to read the sequence of
+ *
+ * Return:	Sequence number of the next record to print on @con.
+ */
+u64 nbcon_seq_read(struct console *con)
+{
+	unsigned long nbcon_seq = atomic_long_read(&ACCESS_PRIVATE(con, nbcon_seq));
+
+	return __nbcon_seq_to_seq(nbcon_seq);
+}
+
+/**
+ * nbcon_seq_force - Force console sequence to a specific value
+ * @con:	Console to work on
+ * @seq:	Sequence number value to set
+ *
+ * Only to be used during init (before registration) or in extreme situations
+ * (such as panic with CONSOLE_REPLAY_ALL).
+ */
+void nbcon_seq_force(struct console *con, u64 seq)
+{
+	/*
+	 * If the specified record no longer exists, the oldest available record
+	 * is chosen. This is especially important on 32bit systems because only
+	 * the lower 32 bits of the sequence number are stored. The upper 32 bits
+	 * are derived from the sequence numbers available in the ringbuffer.
+	 */
+	u64 valid_seq = max_t(u64, seq, prb_first_valid_seq(prb));
+
+	atomic_long_set(&ACCESS_PRIVATE(con, nbcon_seq), __seq_to_nbcon_seq(valid_seq));
+
+	/* Clear con->seq since nbcon consoles use con->nbcon_seq instead. */
+	con->seq = 0;
+}
+
+/**
+ * nbcon_seq_try_update - Try to update the console sequence number
+ * @ctxt:	Pointer to an acquire context that contains
+ *		all information about the acquire mode
+ * @new_seq:	The new sequence number to set
+ *
+ * @ctxt->seq is updated to the new value of @con::nbcon_seq (expanded to
+ * the 64bit value). This could be a different value than @new_seq if
+ * nbcon_seq_force() was used or the current context no longer owns the
+ * console. In the later case, it will stop printing anyway.
+ */
+__maybe_unused
+static void nbcon_seq_try_update(struct nbcon_context *ctxt, u64 new_seq)
+{
+	unsigned long nbcon_seq = __seq_to_nbcon_seq(ctxt->seq);
+	struct console *con = ctxt->console;
+
+	if (atomic_long_try_cmpxchg(&ACCESS_PRIVATE(con, nbcon_seq), &nbcon_seq,
+				    __seq_to_nbcon_seq(new_seq))) {
+		ctxt->seq = new_seq;
+	} else {
+		ctxt->seq = nbcon_seq_read(con);
+	}
+}
+
 /**
  * nbcon_context_try_acquire_direct - Try to acquire directly
  * @ctxt:	The context of the caller
@@ -510,6 +605,9 @@ out:
 	else
 		ctxt->pbufs = con->pbufs;
 
+	/* Set the record sequence for this context to print. */
+	ctxt->seq = nbcon_seq_read(ctxt->console);
+
 	return true;
 }
 
@@ -722,6 +820,8 @@ bool nbcon_alloc(struct console *con)
  *
  * nbcon_alloc() *must* be called and succeed before this function
  * is called.
+ *
+ * This function expects that the legacy @con->seq has been set.
  */
 void nbcon_init(struct console *con)
 {
@@ -730,6 +830,7 @@ void nbcon_init(struct console *con)
 	/* nbcon_alloc() must have been called and successful! */
 	BUG_ON(!con->pbufs);
 
+	nbcon_seq_force(con, con->seq);
 	nbcon_state_set(con, &state);
 }
 
diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
index 1c9720acd960..77857d2118ca 100644
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -494,7 +494,7 @@ _DEFINE_PRINTKRB(printk_rb_static, CONFIG_LOG_BUF_SHIFT - PRB_AVGBITS,
 
 static struct printk_ringbuffer printk_rb_dynamic;
 
-static struct printk_ringbuffer *prb = &printk_rb_static;
+struct printk_ringbuffer *prb = &printk_rb_static;
 
 /*
  * We cannot access per-CPU data (e.g. per-CPU flush irq_work) before
@@ -3168,6 +3168,7 @@ void console_flush_on_panic(enum con_flush_mode mode)
 
 	if (mode == CONSOLE_REPLAY_ALL) {
 		struct console *c;
+		short flags;
 		int cookie;
 		u64 seq;
 
@@ -3175,11 +3176,17 @@ void console_flush_on_panic(enum con_flush_mode mode)
 
 		cookie = console_srcu_read_lock();
 		for_each_console_srcu(c) {
-			/*
-			 * This is an unsynchronized assignment, but the
-			 * kernel is in "hope and pray" mode anyway.
-			 */
-			c->seq = seq;
+			flags = console_srcu_read_flags(c);
+
+			if (flags & CON_NBCON) {
+				nbcon_seq_force(c, seq);
+			} else {
+				/*
+				 * This is an unsynchronized assignment. On
+				 * panic legacy consoles are only best effort.
+				 */
+				c->seq = seq;
+			}
 		}
 		console_srcu_read_unlock(cookie);
 	}
@@ -3750,6 +3757,7 @@ static bool __pr_flush(struct console *con, int timeout_ms, bool reset_on_progre
 	struct console *c;
 	u64 last_diff = 0;
 	u64 printk_seq;
+	short flags;
 	int cookie;
 	u64 diff;
 	u64 seq;
@@ -3771,6 +3779,9 @@ static bool __pr_flush(struct console *con, int timeout_ms, bool reset_on_progre
 		for_each_console_srcu(c) {
 			if (con && con != c)
 				continue;
+
+			flags = console_srcu_read_flags(c);
+
 			/*
 			 * If consoles are not usable, it cannot be expected
 			 * that they make forward progress, so only increment
@@ -3778,7 +3789,13 @@ static bool __pr_flush(struct console *con, int timeout_ms, bool reset_on_progre
 			 */
 			if (!console_is_usable(c))
 				continue;
-			printk_seq = c->seq;
+
+			if (flags & CON_NBCON) {
+				printk_seq = nbcon_seq_read(c);
+			} else {
+				printk_seq = c->seq;
+			}
+
 			if (printk_seq < seq)
 				diff += seq - printk_seq;
 		}
-- 
cgit v1.2.3


From 06653d57ff283be627a2c769139d73ecc487810f Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Sat, 16 Sep 2023 21:26:06 +0206
Subject: printk: nbcon: Add emit function and callback function for atomic
 printing

Implement an emit function for nbcon consoles to output printk
messages. It utilizes the lockless printk_get_next_message() and
console_prepend_dropped() functions to retrieve/build the output
message. The emit function includes the required safety points to
check for handover/takeover and calls a new write_atomic callback
of the console driver to output the message. It also includes
proper handling for updating the nbcon console sequence number.

A new nbcon_write_context struct is introduced. This is provided
to the write_atomic callback and includes only the information
necessary for performing atomic writes.

Co-developed-by: John Ogness <john.ogness@linutronix.de>
Signed-off-by: John Ogness <john.ogness@linutronix.de>
Signed-off-by: Thomas Gleixner (Intel) <tglx@linutronix.de>
Reviewed-by: Petr Mladek <pmladek@suse.com>
Signed-off-by: Petr Mladek <pmladek@suse.com>
Link: https://lore.kernel.org/r/20230916192007.608398-8-john.ogness@linutronix.de
---
 include/linux/console.h  |  21 ++++++++++
 kernel/printk/internal.h |   6 +++
 kernel/printk/nbcon.c    | 106 ++++++++++++++++++++++++++++++++++++++++++++++-
 kernel/printk/printk.c   |   9 ++--
 4 files changed, 134 insertions(+), 8 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/console.h b/include/linux/console.h
index 20cd486b76ad..14563dcb34b1 100644
--- a/include/linux/console.h
+++ b/include/linux/console.h
@@ -242,6 +242,7 @@ struct printk_buffers;
  *				be used only with NBCON_PRIO_PANIC @prio. It
  *				might cause a system freeze when the console
  *				is used later.
+ * @backlog:			Ringbuffer has pending records
  * @pbufs:			Pointer to the text buffer for this context
  * @seq:			The sequence number to print for this context
  */
@@ -252,11 +253,28 @@ struct nbcon_context {
 	enum nbcon_prio		prio;
 	unsigned int		allow_unsafe_takeover	: 1;
 
+	/* members set by emit */
+	unsigned int		backlog			: 1;
+
 	/* members set by acquire */
 	struct printk_buffers	*pbufs;
 	u64			seq;
 };
 
+/**
+ * struct nbcon_write_context - Context handed to the nbcon write callbacks
+ * @ctxt:		The core console context
+ * @outbuf:		Pointer to the text buffer for output
+ * @len:		Length to write
+ * @unsafe_takeover:	If a hostile takeover in an unsafe state has occurred
+ */
+struct nbcon_write_context {
+	struct nbcon_context	__private ctxt;
+	char			*outbuf;
+	unsigned int		len;
+	bool			unsafe_takeover;
+};
+
 /**
  * struct console - The console descriptor structure
  * @name:		The name of the console driver
@@ -277,6 +295,7 @@ struct nbcon_context {
  * @data:		Driver private data
  * @node:		hlist node for the console list
  *
+ * @write_atomic:	Write callback for atomic context
  * @nbcon_state:	State for nbcon consoles
  * @nbcon_seq:		Sequence number of the next record for nbcon to print
  * @pbufs:		Pointer to nbcon private buffer
@@ -301,6 +320,8 @@ struct console {
 	struct hlist_node	node;
 
 	/* nbcon console specific members */
+	bool			(*write_atomic)(struct console *con,
+						struct nbcon_write_context *wctxt);
 	atomic_t		__private nbcon_state;
 	atomic_long_t		__private nbcon_seq;
 	struct printk_buffers	*pbufs;
diff --git a/kernel/printk/internal.h b/kernel/printk/internal.h
index 6473f5ae4a18..6c2afee5ef62 100644
--- a/kernel/printk/internal.h
+++ b/kernel/printk/internal.h
@@ -130,3 +130,9 @@ struct printk_message {
 };
 
 bool other_cpu_in_panic(void);
+bool printk_get_next_message(struct printk_message *pmsg, u64 seq,
+			     bool is_extended, bool may_supress);
+
+#ifdef CONFIG_PRINTK
+void console_prepend_dropped(struct printk_message *pmsg, unsigned long dropped);
+#endif
diff --git a/kernel/printk/nbcon.c b/kernel/printk/nbcon.c
index e076096b31c0..6e05d263fd22 100644
--- a/kernel/printk/nbcon.c
+++ b/kernel/printk/nbcon.c
@@ -221,7 +221,6 @@ void nbcon_seq_force(struct console *con, u64 seq)
  * nbcon_seq_force() was used or the current context no longer owns the
  * console. In the later case, it will stop printing anyway.
  */
-__maybe_unused
 static void nbcon_seq_try_update(struct nbcon_context *ctxt, u64 new_seq)
 {
 	unsigned long nbcon_seq = __seq_to_nbcon_seq(ctxt->seq);
@@ -755,7 +754,6 @@ static bool nbcon_context_can_proceed(struct nbcon_context *ctxt, struct nbcon_s
  *
  * Internal helper to avoid duplicated code.
  */
-__maybe_unused
 static bool __nbcon_context_update_unsafe(struct nbcon_context *ctxt, bool unsafe)
 {
 	struct console *con = ctxt->console;
@@ -784,6 +782,110 @@ out:
 	return nbcon_context_can_proceed(ctxt, &cur);
 }
 
+/**
+ * nbcon_emit_next_record - Emit a record in the acquired context
+ * @wctxt:	The write context that will be handed to the write function
+ *
+ * Return:	True if this context still owns the console. False if
+ *		ownership was handed over or taken.
+ *
+ * When this function returns false then the calling context no longer owns
+ * the console and is no longer allowed to go forward. In this case it must
+ * back out immediately and carefully. The buffer content is also no longer
+ * trusted since it no longer belongs to the calling context. If the caller
+ * wants to do more it must reacquire the console first.
+ *
+ * When true is returned, @wctxt->ctxt.backlog indicates whether there are
+ * still records pending in the ringbuffer,
+ */
+__maybe_unused
+static bool nbcon_emit_next_record(struct nbcon_write_context *wctxt)
+{
+	struct nbcon_context *ctxt = &ACCESS_PRIVATE(wctxt, ctxt);
+	struct console *con = ctxt->console;
+	bool is_extended = console_srcu_read_flags(con) & CON_EXTENDED;
+	struct printk_message pmsg = {
+		.pbufs = ctxt->pbufs,
+	};
+	unsigned long con_dropped;
+	struct nbcon_state cur;
+	unsigned long dropped;
+	bool done;
+
+	/*
+	 * The printk buffers are filled within an unsafe section. This
+	 * prevents NBCON_PRIO_NORMAL and NBCON_PRIO_EMERGENCY from
+	 * clobbering each other.
+	 */
+
+	if (!nbcon_context_enter_unsafe(ctxt))
+		return false;
+
+	ctxt->backlog = printk_get_next_message(&pmsg, ctxt->seq, is_extended, true);
+	if (!ctxt->backlog)
+		return nbcon_context_exit_unsafe(ctxt);
+
+	/*
+	 * @con->dropped is not protected in case of an unsafe hostile
+	 * takeover. In that situation the update can be racy so
+	 * annotate it accordingly.
+	 */
+	con_dropped = data_race(READ_ONCE(con->dropped));
+
+	dropped = con_dropped + pmsg.dropped;
+	if (dropped && !is_extended)
+		console_prepend_dropped(&pmsg, dropped);
+
+	if (!nbcon_context_exit_unsafe(ctxt))
+		return false;
+
+	/* For skipped records just update seq/dropped in @con. */
+	if (pmsg.outbuf_len == 0)
+		goto update_con;
+
+	/* Initialize the write context for driver callbacks. */
+	wctxt->outbuf = &pmsg.pbufs->outbuf[0];
+	wctxt->len = pmsg.outbuf_len;
+	nbcon_state_read(con, &cur);
+	wctxt->unsafe_takeover = cur.unsafe_takeover;
+
+	if (con->write_atomic) {
+		done = con->write_atomic(con, wctxt);
+	} else {
+		nbcon_context_release(ctxt);
+		WARN_ON_ONCE(1);
+		done = false;
+	}
+
+	/* If not done, the emit was aborted. */
+	if (!done)
+		return false;
+
+	/*
+	 * Since any dropped message was successfully output, reset the
+	 * dropped count for the console.
+	 */
+	dropped = 0;
+update_con:
+	/*
+	 * The dropped count and the sequence number are updated within an
+	 * unsafe section. This limits update races to the panic context and
+	 * allows the panic context to win.
+	 */
+
+	if (!nbcon_context_enter_unsafe(ctxt))
+		return false;
+
+	if (dropped != con_dropped) {
+		/* Counterpart to the READ_ONCE() above. */
+		WRITE_ONCE(con->dropped, dropped);
+	}
+
+	nbcon_seq_try_update(ctxt, pmsg.seq + 1);
+
+	return nbcon_context_exit_unsafe(ctxt);
+}
+
 /**
  * nbcon_alloc - Allocate buffers needed by the nbcon console
  * @con:	Console to allocate buffers for
diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
index 77857d2118ca..778359b21761 100644
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -698,9 +698,6 @@ out:
 	return len;
 }
 
-static bool printk_get_next_message(struct printk_message *pmsg, u64 seq,
-				    bool is_extended, bool may_supress);
-
 /* /dev/kmsg - userspace message inject/listen interface */
 struct devkmsg_user {
 	atomic64_t seq;
@@ -2733,7 +2730,7 @@ static void __console_unlock(void)
  * If @pmsg->pbufs->outbuf is modified, @pmsg->outbuf_len is updated.
  */
 #ifdef CONFIG_PRINTK
-static void console_prepend_dropped(struct printk_message *pmsg, unsigned long dropped)
+void console_prepend_dropped(struct printk_message *pmsg, unsigned long dropped)
 {
 	struct printk_buffers *pbufs = pmsg->pbufs;
 	const size_t scratchbuf_sz = sizeof(pbufs->scratchbuf);
@@ -2787,8 +2784,8 @@ static void console_prepend_dropped(struct printk_message *pmsg, unsigned long d
  * of @pmsg are valid. (See the documentation of struct printk_message
  * for information about the @pmsg fields.)
  */
-static bool printk_get_next_message(struct printk_message *pmsg, u64 seq,
-				    bool is_extended, bool may_suppress)
+bool printk_get_next_message(struct printk_message *pmsg, u64 seq,
+			     bool is_extended, bool may_suppress)
 {
 	static int panic_console_dropped;
 
-- 
cgit v1.2.3


From 9757acd0a700ba4a0d16dde4ba820eb052aba1a7 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Sat, 16 Sep 2023 21:26:07 +0206
Subject: printk: nbcon: Allow drivers to mark unsafe regions and check state

For the write_atomic callback, the console driver may have unsafe
regions that need to be appropriately marked. Provide functions
that accept the nbcon_write_context struct to allow for the driver
to enter and exit unsafe regions.

Also provide a function for drivers to check if they are still the
owner of the console.

Co-developed-by: John Ogness <john.ogness@linutronix.de>
Signed-off-by: John Ogness <john.ogness@linutronix.de>
Signed-off-by: Thomas Gleixner (Intel) <tglx@linutronix.de>
Reviewed-by: Petr Mladek <pmladek@suse.com>
Signed-off-by: Petr Mladek <pmladek@suse.com>
Link: https://lore.kernel.org/r/20230916192007.608398-9-john.ogness@linutronix.de
---
 include/linux/console.h | 10 +++++++
 kernel/printk/nbcon.c   | 75 +++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 85 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/console.h b/include/linux/console.h
index 14563dcb34b1..e4fc6f7c1496 100644
--- a/include/linux/console.h
+++ b/include/linux/console.h
@@ -451,6 +451,16 @@ static inline bool console_is_registered(const struct console *con)
 	lockdep_assert_console_list_lock_held();			\
 	hlist_for_each_entry(con, &console_list, node)
 
+#ifdef CONFIG_PRINTK
+extern bool nbcon_can_proceed(struct nbcon_write_context *wctxt);
+extern bool nbcon_enter_unsafe(struct nbcon_write_context *wctxt);
+extern bool nbcon_exit_unsafe(struct nbcon_write_context *wctxt);
+#else
+static inline bool nbcon_can_proceed(struct nbcon_write_context *wctxt) { return false; }
+static inline bool nbcon_enter_unsafe(struct nbcon_write_context *wctxt) { return false; }
+static inline bool nbcon_exit_unsafe(struct nbcon_write_context *wctxt) { return false; }
+#endif
+
 extern int console_set_on_cmdline;
 extern struct console *early_console;
 
diff --git a/kernel/printk/nbcon.c b/kernel/printk/nbcon.c
index 6e05d263fd22..b96077152f49 100644
--- a/kernel/printk/nbcon.c
+++ b/kernel/printk/nbcon.c
@@ -732,6 +732,41 @@ static bool nbcon_context_can_proceed(struct nbcon_context *ctxt, struct nbcon_s
 	return false;
 }
 
+/**
+ * nbcon_can_proceed - Check whether ownership can proceed
+ * @wctxt:	The write context that was handed to the write function
+ *
+ * Return:	True if this context still owns the console. False if
+ *		ownership was handed over or taken.
+ *
+ * It is used in nbcon_enter_unsafe() to make sure that it still owns the
+ * lock. Also it is used in nbcon_exit_unsafe() to eventually free the lock
+ * for a higher priority context which asked for the friendly handover.
+ *
+ * It can be called inside an unsafe section when the console is just
+ * temporary in safe state instead of exiting and entering the unsafe state.
+ *
+ * Also it can be called in the safe context before doing an expensive safe
+ * operation. It does not make sense to do the operation when a higher
+ * priority context took the lock.
+ *
+ * When this function returns false then the calling context no longer owns
+ * the console and is no longer allowed to go forward. In this case it must
+ * back out immediately and carefully. The buffer content is also no longer
+ * trusted since it no longer belongs to the calling context.
+ */
+bool nbcon_can_proceed(struct nbcon_write_context *wctxt)
+{
+	struct nbcon_context *ctxt = &ACCESS_PRIVATE(wctxt, ctxt);
+	struct console *con = ctxt->console;
+	struct nbcon_state cur;
+
+	nbcon_state_read(con, &cur);
+
+	return nbcon_context_can_proceed(ctxt, &cur);
+}
+EXPORT_SYMBOL_GPL(nbcon_can_proceed);
+
 #define nbcon_context_enter_unsafe(c)	__nbcon_context_update_unsafe(c, true)
 #define nbcon_context_exit_unsafe(c)	__nbcon_context_update_unsafe(c, false)
 
@@ -782,6 +817,46 @@ out:
 	return nbcon_context_can_proceed(ctxt, &cur);
 }
 
+/**
+ * nbcon_enter_unsafe - Enter an unsafe region in the driver
+ * @wctxt:	The write context that was handed to the write function
+ *
+ * Return:	True if this context still owns the console. False if
+ *		ownership was handed over or taken.
+ *
+ * When this function returns false then the calling context no longer owns
+ * the console and is no longer allowed to go forward. In this case it must
+ * back out immediately and carefully. The buffer content is also no longer
+ * trusted since it no longer belongs to the calling context.
+ */
+bool nbcon_enter_unsafe(struct nbcon_write_context *wctxt)
+{
+	struct nbcon_context *ctxt = &ACCESS_PRIVATE(wctxt, ctxt);
+
+	return nbcon_context_enter_unsafe(ctxt);
+}
+EXPORT_SYMBOL_GPL(nbcon_enter_unsafe);
+
+/**
+ * nbcon_exit_unsafe - Exit an unsafe region in the driver
+ * @wctxt:	The write context that was handed to the write function
+ *
+ * Return:	True if this context still owns the console. False if
+ *		ownership was handed over or taken.
+ *
+ * When this function returns false then the calling context no longer owns
+ * the console and is no longer allowed to go forward. In this case it must
+ * back out immediately and carefully. The buffer content is also no longer
+ * trusted since it no longer belongs to the calling context.
+ */
+bool nbcon_exit_unsafe(struct nbcon_write_context *wctxt)
+{
+	struct nbcon_context *ctxt = &ACCESS_PRIVATE(wctxt, ctxt);
+
+	return nbcon_context_exit_unsafe(ctxt);
+}
+EXPORT_SYMBOL_GPL(nbcon_exit_unsafe);
+
 /**
  * nbcon_emit_next_record - Emit a record in the acquired context
  * @wctxt:	The write context that will be handed to the write function
-- 
cgit v1.2.3


From 492032760127251e5540a5716a70996bacf2a3fd Mon Sep 17 00:00:00 2001
From: Ziyang Xuan <william.xuanziyang@huawei.com>
Date: Mon, 18 Sep 2023 20:30:11 +0800
Subject: team: fix null-ptr-deref when team device type is changed

Get a null-ptr-deref bug as follows with reproducer [1].

BUG: kernel NULL pointer dereference, address: 0000000000000228
...
RIP: 0010:vlan_dev_hard_header+0x35/0x140 [8021q]
...
Call Trace:
 <TASK>
 ? __die+0x24/0x70
 ? page_fault_oops+0x82/0x150
 ? exc_page_fault+0x69/0x150
 ? asm_exc_page_fault+0x26/0x30
 ? vlan_dev_hard_header+0x35/0x140 [8021q]
 ? vlan_dev_hard_header+0x8e/0x140 [8021q]
 neigh_connected_output+0xb2/0x100
 ip6_finish_output2+0x1cb/0x520
 ? nf_hook_slow+0x43/0xc0
 ? ip6_mtu+0x46/0x80
 ip6_finish_output+0x2a/0xb0
 mld_sendpack+0x18f/0x250
 mld_ifc_work+0x39/0x160
 process_one_work+0x1e6/0x3f0
 worker_thread+0x4d/0x2f0
 ? __pfx_worker_thread+0x10/0x10
 kthread+0xe5/0x120
 ? __pfx_kthread+0x10/0x10
 ret_from_fork+0x34/0x50
 ? __pfx_kthread+0x10/0x10
 ret_from_fork_asm+0x1b/0x30

[1]
$ teamd -t team0 -d -c '{"runner": {"name": "loadbalance"}}'
$ ip link add name t-dummy type dummy
$ ip link add link t-dummy name t-dummy.100 type vlan id 100
$ ip link add name t-nlmon type nlmon
$ ip link set t-nlmon master team0
$ ip link set t-nlmon nomaster
$ ip link set t-dummy up
$ ip link set team0 up
$ ip link set t-dummy.100 down
$ ip link set t-dummy.100 master team0

When enslave a vlan device to team device and team device type is changed
from non-ether to ether, header_ops of team device is changed to
vlan_header_ops. That is incorrect and will trigger null-ptr-deref
for vlan->real_dev in vlan_dev_hard_header() because team device is not
a vlan device.

Cache eth_header_ops in team_setup(), then assign cached header_ops to
header_ops of team net device when its type is changed from non-ether
to ether to fix the bug.

Fixes: 1d76efe1577b ("team: add support for non-ethernet devices")
Suggested-by: Hangbin Liu <liuhangbin@gmail.com>
Reviewed-by: Hangbin Liu <liuhangbin@gmail.com>
Signed-off-by: Ziyang Xuan <william.xuanziyang@huawei.com>
Reviewed-by: Jiri Pirko <jiri@nvidia.com>
Reviewed-by: Eric Dumazet <edumazet@google.com>
Link: https://lore.kernel.org/r/20230918123011.1884401-1-william.xuanziyang@huawei.com
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
 drivers/net/team/team.c | 10 +++++++++-
 include/linux/if_team.h |  2 ++
 2 files changed, 11 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/drivers/net/team/team.c b/drivers/net/team/team.c
index e8b94580194e..508d9a392ab1 100644
--- a/drivers/net/team/team.c
+++ b/drivers/net/team/team.c
@@ -2115,7 +2115,12 @@ static const struct ethtool_ops team_ethtool_ops = {
 static void team_setup_by_port(struct net_device *dev,
 			       struct net_device *port_dev)
 {
-	dev->header_ops	= port_dev->header_ops;
+	struct team *team = netdev_priv(dev);
+
+	if (port_dev->type == ARPHRD_ETHER)
+		dev->header_ops	= team->header_ops_cache;
+	else
+		dev->header_ops	= port_dev->header_ops;
 	dev->type = port_dev->type;
 	dev->hard_header_len = port_dev->hard_header_len;
 	dev->needed_headroom = port_dev->needed_headroom;
@@ -2162,8 +2167,11 @@ static int team_dev_type_check_change(struct net_device *dev,
 
 static void team_setup(struct net_device *dev)
 {
+	struct team *team = netdev_priv(dev);
+
 	ether_setup(dev);
 	dev->max_mtu = ETH_MAX_MTU;
+	team->header_ops_cache = dev->header_ops;
 
 	dev->netdev_ops = &team_netdev_ops;
 	dev->ethtool_ops = &team_ethtool_ops;
diff --git a/include/linux/if_team.h b/include/linux/if_team.h
index 1b9b15a492fa..cdc684e04a2f 100644
--- a/include/linux/if_team.h
+++ b/include/linux/if_team.h
@@ -189,6 +189,8 @@ struct team {
 	struct net_device *dev; /* associated netdevice */
 	struct team_pcpu_stats __percpu *pcpu_stats;
 
+	const struct header_ops *header_ops_cache;
+
 	struct mutex lock; /* used for overall locking, e.g. port lists write */
 
 	/*
-- 
cgit v1.2.3


From 6a23c555f7eb436d6799533675ffa179db3d5834 Mon Sep 17 00:00:00 2001
From: "Russell King (Oracle)" <rmk+kernel@armlinux.org.uk>
Date: Mon, 18 Sep 2023 14:25:36 +0100
Subject: net: phy: fix regression with AX88772A PHY driver

Marek reports that a deadlock occurs with the AX88772A PHY used on the
ASIX USB network driver:

asix 1-1.4:1.0 (unnamed net_device) (uninitialized): PHY [usb-001:003:10] driver [Asix Electronics AX88772A] (irq=POLL)
Asix Electronics AX88772A usb-001:003:10: attached PHY driver(mii_bus:phy_addr=usb-001:003:10, irq=POLL)
asix 1-1.4:1.0 eth0: register 'asix' at usb-12110000.usb-1.4, ASIX AX88772 USB 2.0 Ethernet, a2:99:b6:cd:11:eb
asix 1-1.4:1.0 eth0: configuring for phy/internal link mode

============================================
WARNING: possible recursive locking detected
6.6.0-rc1-00239-g8da77df649c4-dirty #13949 Not tainted
--------------------------------------------
kworker/3:3/71 is trying to acquire lock:
c6c704cc (&dev->lock){+.+.}-{3:3}, at: phy_start_aneg+0x1c/0x38

but task is already holding lock:
c6c704cc (&dev->lock){+.+.}-{3:3}, at: phy_state_machine+0x100/0x2b8

This is because we now consistently call phy_process_state_change()
while holding phydev->lock, but the AX88772A PHY driver then goes on
to call phy_start_aneg() which tries to grab the same lock - causing
deadlock.

Fix this by exporting the unlocked version, and use this in the PHY
driver instead.

Reported-by: Marek Szyprowski <m.szyprowski@samsung.com>
Tested-by: Marek Szyprowski <m.szyprowski@samsung.com>
Fixes: ef113a60d0a9 ("net: phy: call phy_error_precise() while holding the lock")
Signed-off-by: Russell King (Oracle) <rmk+kernel@armlinux.org.uk>
Reviewed-by: Andrew Lunn <andrew@lunn.ch>
Link: https://lore.kernel.org/r/E1qiEFs-007g7b-Lq@rmk-PC.armlinux.org.uk
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
 drivers/net/phy/ax88796b.c | 2 +-
 drivers/net/phy/phy.c      | 3 ++-
 include/linux/phy.h        | 1 +
 3 files changed, 4 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/phy/ax88796b.c b/drivers/net/phy/ax88796b.c
index 0f1e617a26c9..eb74a8cf8df1 100644
--- a/drivers/net/phy/ax88796b.c
+++ b/drivers/net/phy/ax88796b.c
@@ -90,7 +90,7 @@ static void asix_ax88772a_link_change_notify(struct phy_device *phydev)
 	 */
 	if (phydev->state == PHY_NOLINK) {
 		phy_init_hw(phydev);
-		phy_start_aneg(phydev);
+		_phy_start_aneg(phydev);
 	}
 }
 
diff --git a/drivers/net/phy/phy.c b/drivers/net/phy/phy.c
index 93a8676dd8d8..a5fa077650e8 100644
--- a/drivers/net/phy/phy.c
+++ b/drivers/net/phy/phy.c
@@ -981,7 +981,7 @@ static int phy_check_link_status(struct phy_device *phydev)
  *   If the PHYCONTROL Layer is operating, we change the state to
  *   reflect the beginning of Auto-negotiation or forcing.
  */
-static int _phy_start_aneg(struct phy_device *phydev)
+int _phy_start_aneg(struct phy_device *phydev)
 {
 	int err;
 
@@ -1002,6 +1002,7 @@ static int _phy_start_aneg(struct phy_device *phydev)
 
 	return err;
 }
+EXPORT_SYMBOL(_phy_start_aneg);
 
 /**
  * phy_start_aneg - start auto-negotiation for this PHY device
diff --git a/include/linux/phy.h b/include/linux/phy.h
index 1351b802ffcf..3cc52826f18e 100644
--- a/include/linux/phy.h
+++ b/include/linux/phy.h
@@ -1736,6 +1736,7 @@ void phy_detach(struct phy_device *phydev);
 void phy_start(struct phy_device *phydev);
 void phy_stop(struct phy_device *phydev);
 int phy_config_aneg(struct phy_device *phydev);
+int _phy_start_aneg(struct phy_device *phydev);
 int phy_start_aneg(struct phy_device *phydev);
 int phy_aneg_done(struct phy_device *phydev);
 int phy_speed_down(struct phy_device *phydev, bool sync);
-- 
cgit v1.2.3


From fa17a6d8a5bd0cd7565b613cb804242cd0f6b7ab Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Mon, 18 Sep 2023 14:23:21 +0000
Subject: ipv6: lockless IPV6_ADDR_PREFERENCES implementation

We have data-races while reading np->srcprefs

Switch the field to a plain byte, add READ_ONCE()
and WRITE_ONCE() annotations where needed,
and IPV6_ADDR_PREFERENCES setsockopt() can now be lockless.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Reviewed-by: David Ahern <dsahern@kernel.org>
Link: https://lore.kernel.org/r/20230918142321.1794107-1-edumazet@google.com
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
 include/linux/ipv6.h     |  2 +-
 include/net/ip6_route.h  |  5 ++---
 include/net/ipv6.h       | 20 +++++++-------------
 net/ipv6/ip6_output.c    |  2 +-
 net/ipv6/ipv6_sockglue.c | 19 ++++++++++---------
 net/ipv6/route.c         |  2 +-
 6 files changed, 22 insertions(+), 28 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/ipv6.h b/include/linux/ipv6.h
index 09253825c99c..e400ff757f13 100644
--- a/include/linux/ipv6.h
+++ b/include/linux/ipv6.h
@@ -243,7 +243,7 @@ struct ipv6_pinfo {
 	} rxopt;
 
 	/* sockopt flags */
-	__u8			srcprefs:3;	/* 001: prefer temporary address
+	__u8			srcprefs;	/* 001: prefer temporary address
 						 * 010: prefer public address
 						 * 100: prefer care-of address
 						 */
diff --git a/include/net/ip6_route.h b/include/net/ip6_route.h
index b1ea49900b4a..28b065790261 100644
--- a/include/net/ip6_route.h
+++ b/include/net/ip6_route.h
@@ -53,13 +53,12 @@ struct route_info {
  */
 static inline int rt6_srcprefs2flags(unsigned int srcprefs)
 {
-	/* No need to bitmask because srcprefs have only 3 bits. */
-	return srcprefs << 3;
+	return (srcprefs & IPV6_PREFER_SRC_MASK) << 3;
 }
 
 static inline unsigned int rt6_flags2srcprefs(int flags)
 {
-	return (flags >> 3) & 7;
+	return (flags >> 3) & IPV6_PREFER_SRC_MASK;
 }
 
 static inline bool rt6_need_strict(const struct in6_addr *daddr)
diff --git a/include/net/ipv6.h b/include/net/ipv6.h
index bd115980809f..b3444c8a6f74 100644
--- a/include/net/ipv6.h
+++ b/include/net/ipv6.h
@@ -1306,10 +1306,13 @@ static inline void ip6_sock_set_recverr(struct sock *sk)
 	inet6_set_bit(RECVERR6, sk);
 }
 
-static inline int __ip6_sock_set_addr_preferences(struct sock *sk, int val)
+#define IPV6_PREFER_SRC_MASK (IPV6_PREFER_SRC_TMP | IPV6_PREFER_SRC_PUBLIC | \
+			      IPV6_PREFER_SRC_COA)
+
+static inline int ip6_sock_set_addr_preferences(struct sock *sk, int val)
 {
+	unsigned int prefmask = ~IPV6_PREFER_SRC_MASK;
 	unsigned int pref = 0;
-	unsigned int prefmask = ~0;
 
 	/* check PUBLIC/TMP/PUBTMP_DEFAULT conflicts */
 	switch (val & (IPV6_PREFER_SRC_PUBLIC |
@@ -1359,20 +1362,11 @@ static inline int __ip6_sock_set_addr_preferences(struct sock *sk, int val)
 		return -EINVAL;
 	}
 
-	inet6_sk(sk)->srcprefs = (inet6_sk(sk)->srcprefs & prefmask) | pref;
+	WRITE_ONCE(inet6_sk(sk)->srcprefs,
+		   (READ_ONCE(inet6_sk(sk)->srcprefs) & prefmask) | pref);
 	return 0;
 }
 
-static inline int ip6_sock_set_addr_preferences(struct sock *sk, int val)
-{
-	int ret;
-
-	lock_sock(sk);
-	ret = __ip6_sock_set_addr_preferences(sk, val);
-	release_sock(sk);
-	return ret;
-}
-
 static inline void ip6_sock_set_recvpktinfo(struct sock *sk)
 {
 	lock_sock(sk);
diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c
index 7e5d9eeb990f..951ba8089b5b 100644
--- a/net/ipv6/ip6_output.c
+++ b/net/ipv6/ip6_output.c
@@ -1113,7 +1113,7 @@ static int ip6_dst_lookup_tail(struct net *net, const struct sock *sk,
 		rcu_read_lock();
 		from = rt ? rcu_dereference(rt->from) : NULL;
 		err = ip6_route_get_saddr(net, from, &fl6->daddr,
-					  sk ? inet6_sk(sk)->srcprefs : 0,
+					  sk ? READ_ONCE(inet6_sk(sk)->srcprefs) : 0,
 					  &fl6->saddr);
 		rcu_read_unlock();
 
diff --git a/net/ipv6/ipv6_sockglue.c b/net/ipv6/ipv6_sockglue.c
index e9dc6f881bb9..7d661735cb9d 100644
--- a/net/ipv6/ipv6_sockglue.c
+++ b/net/ipv6/ipv6_sockglue.c
@@ -505,6 +505,10 @@ int do_ipv6_setsockopt(struct sock *sk, int level, int optname,
 			return -EINVAL;
 		inet6_assign_bit(SNDFLOW, sk, valbool);
 		return 0;
+	case IPV6_ADDR_PREFERENCES:
+		if (optlen < sizeof(int))
+			return -EINVAL;
+		return ip6_sock_set_addr_preferences(sk, val);
 	}
 	if (needs_rtnl)
 		rtnl_lock();
@@ -964,11 +968,6 @@ done:
 		retv = xfrm_user_policy(sk, optname, optval, optlen);
 		break;
 
-	case IPV6_ADDR_PREFERENCES:
-		if (optlen < sizeof(int))
-			goto e_inval;
-		retv = __ip6_sock_set_addr_preferences(sk, val);
-		break;
 	case IPV6_RECVFRAGSIZE:
 		np->rxopt.bits.recvfragsize = valbool;
 		retv = 0;
@@ -1415,23 +1414,25 @@ int do_ipv6_getsockopt(struct sock *sk, int level, int optname,
 	}
 
 	case IPV6_ADDR_PREFERENCES:
+		{
+		u8 srcprefs = READ_ONCE(np->srcprefs);
 		val = 0;
 
-		if (np->srcprefs & IPV6_PREFER_SRC_TMP)
+		if (srcprefs & IPV6_PREFER_SRC_TMP)
 			val |= IPV6_PREFER_SRC_TMP;
-		else if (np->srcprefs & IPV6_PREFER_SRC_PUBLIC)
+		else if (srcprefs & IPV6_PREFER_SRC_PUBLIC)
 			val |= IPV6_PREFER_SRC_PUBLIC;
 		else {
 			/* XXX: should we return system default? */
 			val |= IPV6_PREFER_SRC_PUBTMP_DEFAULT;
 		}
 
-		if (np->srcprefs & IPV6_PREFER_SRC_COA)
+		if (srcprefs & IPV6_PREFER_SRC_COA)
 			val |= IPV6_PREFER_SRC_COA;
 		else
 			val |= IPV6_PREFER_SRC_HOME;
 		break;
-
+		}
 	case IPV6_MINHOPCOUNT:
 		val = READ_ONCE(np->min_hopcount);
 		break;
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index 9d8dfc7423e4..b132feae3393 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -2622,7 +2622,7 @@ static struct dst_entry *ip6_route_output_flags_noref(struct net *net,
 	if (!any_src)
 		flags |= RT6_LOOKUP_F_HAS_SADDR;
 	else if (sk)
-		flags |= rt6_srcprefs2flags(inet6_sk(sk)->srcprefs);
+		flags |= rt6_srcprefs2flags(READ_ONCE(inet6_sk(sk)->srcprefs));
 
 	return fib6_rule_lookup(net, fl6, NULL, flags, ip6_pol_route_output);
 }
-- 
cgit v1.2.3


From bafd764a8baa87e19e6beeaa58eb85fcbbdd6b20 Mon Sep 17 00:00:00 2001
From: Lorenzo Bianconi <lorenzo@kernel.org>
Date: Mon, 18 Sep 2023 12:29:07 +0200
Subject: net: ethernet: mtk_wed: rename mtk_rxbm_desc in mtk_wed_bm_desc

Rename mtk_rxbm_desc structure in mtk_wed_bm_desc since it will be used
even on tx side by MT7988 SoC.

Signed-off-by: Lorenzo Bianconi <lorenzo@kernel.org>
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
 drivers/net/ethernet/mediatek/mtk_wed.c          | 4 ++--
 drivers/net/wireless/mediatek/mt76/mt7915/mmio.c | 2 +-
 include/linux/soc/mediatek/mtk_wed.h             | 4 ++--
 3 files changed, 5 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/ethernet/mediatek/mtk_wed.c b/drivers/net/ethernet/mediatek/mtk_wed.c
index 750326b298dc..f166d4f0b793 100644
--- a/drivers/net/ethernet/mediatek/mtk_wed.c
+++ b/drivers/net/ethernet/mediatek/mtk_wed.c
@@ -422,7 +422,7 @@ free_pagelist:
 static int
 mtk_wed_rx_buffer_alloc(struct mtk_wed_device *dev)
 {
-	struct mtk_rxbm_desc *desc;
+	struct mtk_wed_bm_desc *desc;
 	dma_addr_t desc_phys;
 
 	dev->rx_buf_ring.size = dev->wlan.rx_nbuf;
@@ -442,7 +442,7 @@ mtk_wed_rx_buffer_alloc(struct mtk_wed_device *dev)
 static void
 mtk_wed_free_rx_buffer(struct mtk_wed_device *dev)
 {
-	struct mtk_rxbm_desc *desc = dev->rx_buf_ring.desc;
+	struct mtk_wed_bm_desc *desc = dev->rx_buf_ring.desc;
 
 	if (!desc)
 		return;
diff --git a/drivers/net/wireless/mediatek/mt76/mt7915/mmio.c b/drivers/net/wireless/mediatek/mt76/mt7915/mmio.c
index fc7ace638ce8..e7d8e03f826f 100644
--- a/drivers/net/wireless/mediatek/mt76/mt7915/mmio.c
+++ b/drivers/net/wireless/mediatek/mt76/mt7915/mmio.c
@@ -591,7 +591,7 @@ static void mt7915_mmio_wed_release_rx_buf(struct mtk_wed_device *wed)
 
 static u32 mt7915_mmio_wed_init_rx_buf(struct mtk_wed_device *wed, int size)
 {
-	struct mtk_rxbm_desc *desc = wed->rx_buf_ring.desc;
+	struct mtk_wed_bm_desc *desc = wed->rx_buf_ring.desc;
 	struct mt76_txwi_cache *t = NULL;
 	struct mt7915_dev *dev;
 	struct mt76_queue *q;
diff --git a/include/linux/soc/mediatek/mtk_wed.h b/include/linux/soc/mediatek/mtk_wed.h
index b2b28180dff7..c6512c216b27 100644
--- a/include/linux/soc/mediatek/mtk_wed.h
+++ b/include/linux/soc/mediatek/mtk_wed.h
@@ -45,7 +45,7 @@ enum mtk_wed_wo_cmd {
 	MTK_WED_WO_CMD_WED_END
 };
 
-struct mtk_rxbm_desc {
+struct mtk_wed_bm_desc {
 	__le32 buf0;
 	__le32 token;
 } __packed __aligned(4);
@@ -104,7 +104,7 @@ struct mtk_wed_device {
 
 	struct {
 		int size;
-		struct mtk_rxbm_desc *desc;
+		struct mtk_wed_bm_desc *desc;
 		dma_addr_t desc_phys;
 	} rx_buf_ring;
 
-- 
cgit v1.2.3


From ff0ea57fa30e860d3373acd1383e9d9599144b58 Mon Sep 17 00:00:00 2001
From: Lorenzo Bianconi <lorenzo@kernel.org>
Date: Mon, 18 Sep 2023 12:29:08 +0200
Subject: net: ethernet: mtk_wed: introduce mtk_wed_buf structure

Introduce mtk_wed_buf structure to store both virtual and physical
addresses allocated in mtk_wed_tx_buffer_alloc() routine. This is a
preliminary patch to add WED support for MT7988 SoC since it relies on a
different dma descriptor layout not storing page dma addresses.

Co-developed-by: Sujuan Chen <sujuan.chen@mediatek.com>
Signed-off-by: Sujuan Chen <sujuan.chen@mediatek.com>
Signed-off-by: Lorenzo Bianconi <lorenzo@kernel.org>
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
 drivers/net/ethernet/mediatek/mtk_wed.c | 12 ++++++------
 include/linux/soc/mediatek/mtk_wed.h    |  7 ++++++-
 2 files changed, 12 insertions(+), 7 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/ethernet/mediatek/mtk_wed.c b/drivers/net/ethernet/mediatek/mtk_wed.c
index f166d4f0b793..592e497984e3 100644
--- a/drivers/net/ethernet/mediatek/mtk_wed.c
+++ b/drivers/net/ethernet/mediatek/mtk_wed.c
@@ -300,9 +300,9 @@ out:
 static int
 mtk_wed_tx_buffer_alloc(struct mtk_wed_device *dev)
 {
+	struct mtk_wed_buf *page_list;
 	struct mtk_wdma_desc *desc;
 	dma_addr_t desc_phys;
-	void **page_list;
 	int token = dev->wlan.token_start;
 	int ring_size;
 	int n_pages;
@@ -343,7 +343,8 @@ mtk_wed_tx_buffer_alloc(struct mtk_wed_device *dev)
 			return -ENOMEM;
 		}
 
-		page_list[page_idx++] = page;
+		page_list[page_idx].p = page;
+		page_list[page_idx++].phy_addr = page_phys;
 		dma_sync_single_for_cpu(dev->hw->dev, page_phys, PAGE_SIZE,
 					DMA_BIDIRECTIONAL);
 
@@ -387,8 +388,8 @@ mtk_wed_tx_buffer_alloc(struct mtk_wed_device *dev)
 static void
 mtk_wed_free_tx_buffer(struct mtk_wed_device *dev)
 {
+	struct mtk_wed_buf *page_list = dev->tx_buf_ring.pages;
 	struct mtk_wdma_desc *desc = dev->tx_buf_ring.desc;
-	void **page_list = dev->tx_buf_ring.pages;
 	int page_idx;
 	int i;
 
@@ -400,13 +401,12 @@ mtk_wed_free_tx_buffer(struct mtk_wed_device *dev)
 
 	for (i = 0, page_idx = 0; i < dev->tx_buf_ring.size;
 	     i += MTK_WED_BUF_PER_PAGE) {
-		void *page = page_list[page_idx++];
-		dma_addr_t buf_addr;
+		dma_addr_t buf_addr = page_list[page_idx].phy_addr;
+		void *page = page_list[page_idx++].p;
 
 		if (!page)
 			break;
 
-		buf_addr = le32_to_cpu(desc[i].buf0);
 		dma_unmap_page(dev->hw->dev, buf_addr, PAGE_SIZE,
 			       DMA_BIDIRECTIONAL);
 		__free_page(page);
diff --git a/include/linux/soc/mediatek/mtk_wed.h b/include/linux/soc/mediatek/mtk_wed.h
index c6512c216b27..5f00dc26582b 100644
--- a/include/linux/soc/mediatek/mtk_wed.h
+++ b/include/linux/soc/mediatek/mtk_wed.h
@@ -76,6 +76,11 @@ struct mtk_wed_wo_rx_stats {
 	__le32 rx_drop_cnt;
 };
 
+struct mtk_wed_buf {
+	void *p;
+	dma_addr_t phy_addr;
+};
+
 struct mtk_wed_device {
 #ifdef CONFIG_NET_MEDIATEK_SOC_WED
 	const struct mtk_wed_ops *ops;
@@ -97,7 +102,7 @@ struct mtk_wed_device {
 
 	struct {
 		int size;
-		void **pages;
+		struct mtk_wed_buf *pages;
 		struct mtk_wdma_desc *desc;
 		dma_addr_t desc_phys;
 	} tx_buf_ring;
-- 
cgit v1.2.3


From e2f64db13aa1d08e32621067e4fe16bbc114b375 Mon Sep 17 00:00:00 2001
From: Sujuan Chen <sujuan.chen@mediatek.com>
Date: Mon, 18 Sep 2023 12:29:13 +0200
Subject: net: ethernet: mtk_wed: introduce WED support for MT7988

Similar to MT7986 and MT7622, enable Wireless Ethernet Ditpatcher for
MT7988 in order to offload traffic forwarded from LAN/WLAN to WLAN/LAN

Co-developed-by: Lorenzo Bianconi <lorenzo@kernel.org>
Signed-off-by: Lorenzo Bianconi <lorenzo@kernel.org>
Signed-off-by: Sujuan Chen <sujuan.chen@mediatek.com>
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
 drivers/net/ethernet/mediatek/mtk_eth_soc.c     |   1 +
 drivers/net/ethernet/mediatek/mtk_eth_soc.h     |   2 +-
 drivers/net/ethernet/mediatek/mtk_ppe_offload.c |   3 +
 drivers/net/ethernet/mediatek/mtk_wed.c         | 442 +++++++++++++++++-------
 drivers/net/ethernet/mediatek/mtk_wed.h         |  29 ++
 drivers/net/ethernet/mediatek/mtk_wed_mcu.c     |  33 +-
 drivers/net/ethernet/mediatek/mtk_wed_regs.h    | 223 +++++++++++-
 drivers/net/ethernet/mediatek/mtk_wed_wo.h      |   2 +
 include/linux/soc/mediatek/mtk_wed.h            |   8 +-
 9 files changed, 603 insertions(+), 140 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/ethernet/mediatek/mtk_eth_soc.c b/drivers/net/ethernet/mediatek/mtk_eth_soc.c
index 3cffd1bd3067..697620c6354b 100644
--- a/drivers/net/ethernet/mediatek/mtk_eth_soc.c
+++ b/drivers/net/ethernet/mediatek/mtk_eth_soc.c
@@ -197,6 +197,7 @@ static const struct mtk_reg_map mt7988_reg_map = {
 	.wdma_base = {
 		[0]		= 0x4800,
 		[1]		= 0x4c00,
+		[2]		= 0x5000,
 	},
 	.pse_iq_sta		= 0x0180,
 	.pse_oq_sta		= 0x01a0,
diff --git a/drivers/net/ethernet/mediatek/mtk_eth_soc.h b/drivers/net/ethernet/mediatek/mtk_eth_soc.h
index 403219d987ef..9ae3b8a71d0e 100644
--- a/drivers/net/ethernet/mediatek/mtk_eth_soc.h
+++ b/drivers/net/ethernet/mediatek/mtk_eth_soc.h
@@ -1132,7 +1132,7 @@ struct mtk_reg_map {
 	u32	gdm1_cnt;
 	u32	gdma_to_ppe;
 	u32	ppe_base;
-	u32	wdma_base[2];
+	u32	wdma_base[3];
 	u32	pse_iq_sta;
 	u32	pse_oq_sta;
 };
diff --git a/drivers/net/ethernet/mediatek/mtk_ppe_offload.c b/drivers/net/ethernet/mediatek/mtk_ppe_offload.c
index ef3980840695..95f76975f258 100644
--- a/drivers/net/ethernet/mediatek/mtk_ppe_offload.c
+++ b/drivers/net/ethernet/mediatek/mtk_ppe_offload.c
@@ -201,6 +201,9 @@ mtk_flow_set_output_device(struct mtk_eth *eth, struct mtk_foe_entry *foe,
 			case 1:
 				pse_port = PSE_WDMA1_PORT;
 				break;
+			case 2:
+				pse_port = PSE_WDMA2_PORT;
+				break;
 			default:
 				return -EINVAL;
 			}
diff --git a/drivers/net/ethernet/mediatek/mtk_wed.c b/drivers/net/ethernet/mediatek/mtk_wed.c
index c2ff2d6405f6..b6ca12686beb 100644
--- a/drivers/net/ethernet/mediatek/mtk_wed.c
+++ b/drivers/net/ethernet/mediatek/mtk_wed.c
@@ -17,17 +17,19 @@
 #include <net/flow_offload.h>
 #include <net/pkt_cls.h>
 #include "mtk_eth_soc.h"
-#include "mtk_wed_regs.h"
 #include "mtk_wed.h"
 #include "mtk_ppe.h"
 #include "mtk_wed_wo.h"
 
 #define MTK_PCIE_BASE(n)		(0x1a143000 + (n) * 0x2000)
 
-#define MTK_WED_PKT_SIZE		1900
+#define MTK_WED_PKT_SIZE		1920
 #define MTK_WED_BUF_SIZE		2048
+#define MTK_WED_PAGE_BUF_SIZE		128
 #define MTK_WED_BUF_PER_PAGE		(PAGE_SIZE / 2048)
+#define MTK_WED_RX_PAGE_BUF_PER_PAGE	(PAGE_SIZE / 128)
 #define MTK_WED_RX_RING_SIZE		1536
+#define MTK_WED_RX_PG_BM_CNT		8192
 
 #define MTK_WED_TX_RING_SIZE		2048
 #define MTK_WED_WDMA_RING_SIZE		1024
@@ -41,7 +43,10 @@
 #define MTK_WED_RRO_QUE_CNT		8192
 #define MTK_WED_MIOD_ENTRY_CNT		128
 
-static struct mtk_wed_hw *hw_list[2];
+#define MTK_WED_TX_BM_DMA_SIZE		65536
+#define MTK_WED_TX_BM_PKT_CNT		32768
+
+static struct mtk_wed_hw *hw_list[3];
 static DEFINE_MUTEX(hw_lock);
 
 struct mtk_wed_flow_block_priv {
@@ -56,6 +61,7 @@ static const struct mtk_wed_soc_data mt7622_data = {
 		.reset_idx_tx_mask	= GENMASK(3, 0),
 		.reset_idx_rx_mask	= GENMASK(17, 16),
 	},
+	.tx_ring_desc_size = sizeof(struct mtk_wdma_desc),
 	.wdma_desc_size = sizeof(struct mtk_wdma_desc),
 };
 
@@ -66,6 +72,18 @@ static const struct mtk_wed_soc_data mt7986_data = {
 		.reset_idx_tx_mask	= GENMASK(1, 0),
 		.reset_idx_rx_mask	= GENMASK(7, 6),
 	},
+	.tx_ring_desc_size = sizeof(struct mtk_wdma_desc),
+	.wdma_desc_size = 2 * sizeof(struct mtk_wdma_desc),
+};
+
+static const struct mtk_wed_soc_data mt7988_data = {
+	.regmap = {
+		.tx_bm_tkid		= 0x0c8,
+		.wpdma_rx_ring0		= 0x7d0,
+		.reset_idx_tx_mask	= GENMASK(1, 0),
+		.reset_idx_rx_mask	= GENMASK(7, 6),
+	},
+	.tx_ring_desc_size = sizeof(struct mtk_wed_bm_desc),
 	.wdma_desc_size = 2 * sizeof(struct mtk_wdma_desc),
 };
 
@@ -320,33 +338,38 @@ out:
 static int
 mtk_wed_tx_buffer_alloc(struct mtk_wed_device *dev)
 {
+	u32 desc_size = dev->hw->soc->tx_ring_desc_size;
+	int i, page_idx = 0, n_pages, ring_size;
+	int token = dev->wlan.token_start;
 	struct mtk_wed_buf *page_list;
-	struct mtk_wdma_desc *desc;
 	dma_addr_t desc_phys;
-	int token = dev->wlan.token_start;
-	int ring_size;
-	int n_pages;
-	int i, page_idx;
+	void *desc_ptr;
 
-	ring_size = dev->wlan.nbuf & ~(MTK_WED_BUF_PER_PAGE - 1);
-	n_pages = ring_size / MTK_WED_BUF_PER_PAGE;
+	if (!mtk_wed_is_v3_or_greater(dev->hw)) {
+		ring_size = dev->wlan.nbuf & ~(MTK_WED_BUF_PER_PAGE - 1);
+		dev->tx_buf_ring.size = ring_size;
+	} else {
+		dev->tx_buf_ring.size = MTK_WED_TX_BM_DMA_SIZE;
+		ring_size = MTK_WED_TX_BM_PKT_CNT;
+	}
+	n_pages = dev->tx_buf_ring.size / MTK_WED_BUF_PER_PAGE;
 
 	page_list = kcalloc(n_pages, sizeof(*page_list), GFP_KERNEL);
 	if (!page_list)
 		return -ENOMEM;
 
-	dev->tx_buf_ring.size = ring_size;
 	dev->tx_buf_ring.pages = page_list;
 
-	desc = dma_alloc_coherent(dev->hw->dev, ring_size * sizeof(*desc),
-				  &desc_phys, GFP_KERNEL);
-	if (!desc)
+	desc_ptr = dma_alloc_coherent(dev->hw->dev,
+				      dev->tx_buf_ring.size * desc_size,
+				      &desc_phys, GFP_KERNEL);
+	if (!desc_ptr)
 		return -ENOMEM;
 
-	dev->tx_buf_ring.desc = desc;
+	dev->tx_buf_ring.desc = desc_ptr;
 	dev->tx_buf_ring.desc_phys = desc_phys;
 
-	for (i = 0, page_idx = 0; i < ring_size; i += MTK_WED_BUF_PER_PAGE) {
+	for (i = 0; i < ring_size; i += MTK_WED_BUF_PER_PAGE) {
 		dma_addr_t page_phys, buf_phys;
 		struct page *page;
 		void *buf;
@@ -372,28 +395,31 @@ mtk_wed_tx_buffer_alloc(struct mtk_wed_device *dev)
 		buf_phys = page_phys;
 
 		for (s = 0; s < MTK_WED_BUF_PER_PAGE; s++) {
-			u32 txd_size;
-			u32 ctrl;
-
-			txd_size = dev->wlan.init_buf(buf, buf_phys, token++);
+			struct mtk_wdma_desc *desc = desc_ptr;
 
 			desc->buf0 = cpu_to_le32(buf_phys);
-			desc->buf1 = cpu_to_le32(buf_phys + txd_size);
-
-			if (mtk_wed_is_v1(dev->hw))
-				ctrl = FIELD_PREP(MTK_WDMA_DESC_CTRL_LEN0, txd_size) |
-				       FIELD_PREP(MTK_WDMA_DESC_CTRL_LEN1,
-						  MTK_WED_BUF_SIZE - txd_size) |
-				       MTK_WDMA_DESC_CTRL_LAST_SEG1;
-			else
-				ctrl = FIELD_PREP(MTK_WDMA_DESC_CTRL_LEN0, txd_size) |
-				       FIELD_PREP(MTK_WDMA_DESC_CTRL_LEN1_V2,
-						  MTK_WED_BUF_SIZE - txd_size) |
-				       MTK_WDMA_DESC_CTRL_LAST_SEG0;
-			desc->ctrl = cpu_to_le32(ctrl);
-			desc->info = 0;
-			desc++;
-
+			if (!mtk_wed_is_v3_or_greater(dev->hw)) {
+				u32 txd_size, ctrl;
+
+				txd_size = dev->wlan.init_buf(buf, buf_phys,
+							      token++);
+				desc->buf1 = cpu_to_le32(buf_phys + txd_size);
+				ctrl = FIELD_PREP(MTK_WDMA_DESC_CTRL_LEN0, txd_size);
+				if (mtk_wed_is_v1(dev->hw))
+					ctrl |= MTK_WDMA_DESC_CTRL_LAST_SEG1 |
+						FIELD_PREP(MTK_WDMA_DESC_CTRL_LEN1,
+							   MTK_WED_BUF_SIZE - txd_size);
+				else
+					ctrl |= MTK_WDMA_DESC_CTRL_LAST_SEG0 |
+						FIELD_PREP(MTK_WDMA_DESC_CTRL_LEN1_V2,
+							   MTK_WED_BUF_SIZE - txd_size);
+				desc->ctrl = cpu_to_le32(ctrl);
+				desc->info = 0;
+			} else {
+				desc->ctrl = cpu_to_le32(token << 16);
+			}
+
+			desc_ptr += desc_size;
 			buf += MTK_WED_BUF_SIZE;
 			buf_phys += MTK_WED_BUF_SIZE;
 		}
@@ -409,31 +435,31 @@ static void
 mtk_wed_free_tx_buffer(struct mtk_wed_device *dev)
 {
 	struct mtk_wed_buf *page_list = dev->tx_buf_ring.pages;
-	struct mtk_wdma_desc *desc = dev->tx_buf_ring.desc;
-	int page_idx;
-	int i;
+	struct mtk_wed_hw *hw = dev->hw;
+	int i, page_idx = 0;
 
 	if (!page_list)
 		return;
 
-	if (!desc)
+	if (!dev->tx_buf_ring.desc)
 		goto free_pagelist;
 
-	for (i = 0, page_idx = 0; i < dev->tx_buf_ring.size;
-	     i += MTK_WED_BUF_PER_PAGE) {
-		dma_addr_t buf_addr = page_list[page_idx].phy_addr;
+	for (i = 0; i < dev->tx_buf_ring.size; i += MTK_WED_BUF_PER_PAGE) {
+		dma_addr_t page_phy = page_list[page_idx].phy_addr;
 		void *page = page_list[page_idx++].p;
 
 		if (!page)
 			break;
 
-		dma_unmap_page(dev->hw->dev, buf_addr, PAGE_SIZE,
+		dma_unmap_page(dev->hw->dev, page_phy, PAGE_SIZE,
 			       DMA_BIDIRECTIONAL);
 		__free_page(page);
 	}
 
-	dma_free_coherent(dev->hw->dev, dev->tx_buf_ring.size * sizeof(*desc),
-			  desc, dev->tx_buf_ring.desc_phys);
+	dma_free_coherent(dev->hw->dev,
+			  dev->tx_buf_ring.size * hw->soc->tx_ring_desc_size,
+			  dev->tx_buf_ring.desc,
+			  dev->tx_buf_ring.desc_phys);
 
 free_pagelist:
 	kfree(page_list);
@@ -518,13 +544,23 @@ mtk_wed_set_ext_int(struct mtk_wed_device *dev, bool en)
 {
 	u32 mask = MTK_WED_EXT_INT_STATUS_ERROR_MASK;
 
-	if (mtk_wed_is_v1(dev->hw))
+	switch (dev->hw->version) {
+	case 1:
 		mask |= MTK_WED_EXT_INT_STATUS_TX_DRV_R_RESP_ERR;
-	else
+		break;
+	case 2:
 		mask |= MTK_WED_EXT_INT_STATUS_RX_FBUF_LO_TH |
 			MTK_WED_EXT_INT_STATUS_RX_FBUF_HI_TH |
 			MTK_WED_EXT_INT_STATUS_RX_DRV_COHERENT |
 			MTK_WED_EXT_INT_STATUS_TX_DMA_W_RESP_ERR;
+		break;
+	case 3:
+		mask = MTK_WED_EXT_INT_STATUS_RX_DRV_COHERENT |
+		       MTK_WED_EXT_INT_STATUS_TKID_WO_PYLD;
+		break;
+	default:
+		break;
+	}
 
 	if (!dev->hw->num_flows)
 		mask &= ~MTK_WED_EXT_INT_STATUS_TKID_WO_PYLD;
@@ -536,6 +572,9 @@ mtk_wed_set_ext_int(struct mtk_wed_device *dev, bool en)
 static void
 mtk_wed_set_512_support(struct mtk_wed_device *dev, bool enable)
 {
+	if (!mtk_wed_is_v2(dev->hw))
+		return;
+
 	if (enable) {
 		wed_w32(dev, MTK_WED_TXDP_CTRL, MTK_WED_TXDP_DW9_OVERWR);
 		wed_w32(dev, MTK_WED_TXP_DW1,
@@ -610,6 +649,14 @@ mtk_wed_dma_disable(struct mtk_wed_device *dev)
 			MTK_WED_WPDMA_RX_D_RX_DRV_EN);
 		wed_clr(dev, MTK_WED_WDMA_GLO_CFG,
 			MTK_WED_WDMA_GLO_CFG_TX_DDONE_CHK);
+
+		if (mtk_wed_is_v3_or_greater(dev->hw) &&
+		    mtk_wed_get_rx_capa(dev)) {
+			wdma_clr(dev, MTK_WDMA_PREF_TX_CFG,
+				 MTK_WDMA_PREF_TX_CFG_PREF_EN);
+			wdma_clr(dev, MTK_WDMA_PREF_RX_CFG,
+				 MTK_WDMA_PREF_RX_CFG_PREF_EN);
+		}
 	}
 
 	mtk_wed_set_512_support(dev, false);
@@ -652,6 +699,14 @@ mtk_wed_deinit(struct mtk_wed_device *dev)
 		MTK_WED_CTRL_RX_ROUTE_QM_EN |
 		MTK_WED_CTRL_WED_RX_BM_EN |
 		MTK_WED_CTRL_RX_RRO_QM_EN);
+
+	if (mtk_wed_is_v3_or_greater(dev->hw)) {
+		wed_clr(dev, MTK_WED_CTRL, MTK_WED_CTRL_TX_AMSDU_EN);
+		wed_clr(dev, MTK_WED_RESET, MTK_WED_RESET_TX_AMSDU);
+		wed_clr(dev, MTK_WED_PCIE_INT_CTRL,
+			MTK_WED_PCIE_INT_CTRL_MSK_EN_POLA |
+			MTK_WED_PCIE_INT_CTRL_MSK_IRQ_FILTER);
+	}
 }
 
 static void
@@ -701,21 +756,37 @@ mtk_wed_detach(struct mtk_wed_device *dev)
 	mutex_unlock(&hw_lock);
 }
 
-#define PCIE_BASE_ADDR0		0x11280000
 static void
 mtk_wed_bus_init(struct mtk_wed_device *dev)
 {
 	switch (dev->wlan.bus_type) {
 	case MTK_WED_BUS_PCIE: {
 		struct device_node *np = dev->hw->eth->dev->of_node;
-		struct regmap *regs;
 
-		regs = syscon_regmap_lookup_by_phandle(np,
-						       "mediatek,wed-pcie");
-		if (IS_ERR(regs))
-			break;
+		if (mtk_wed_is_v2(dev->hw)) {
+			struct regmap *regs;
+
+			regs = syscon_regmap_lookup_by_phandle(np,
+							       "mediatek,wed-pcie");
+			if (IS_ERR(regs))
+				break;
 
-		regmap_update_bits(regs, 0, BIT(0), BIT(0));
+			regmap_update_bits(regs, 0, BIT(0), BIT(0));
+		}
+
+		if (dev->wlan.msi) {
+			wed_w32(dev, MTK_WED_PCIE_CFG_INTM,
+				dev->hw->pcie_base | 0xc08);
+			wed_w32(dev, MTK_WED_PCIE_CFG_BASE,
+				dev->hw->pcie_base | 0xc04);
+			wed_w32(dev, MTK_WED_PCIE_INT_TRIGGER, BIT(8));
+		} else {
+			wed_w32(dev, MTK_WED_PCIE_CFG_INTM,
+				dev->hw->pcie_base | 0x180);
+			wed_w32(dev, MTK_WED_PCIE_CFG_BASE,
+				dev->hw->pcie_base | 0x184);
+			wed_w32(dev, MTK_WED_PCIE_INT_TRIGGER, BIT(24));
+		}
 
 		wed_w32(dev, MTK_WED_PCIE_INT_CTRL,
 			FIELD_PREP(MTK_WED_PCIE_INT_CTRL_POLL_EN, 2));
@@ -723,19 +794,9 @@ mtk_wed_bus_init(struct mtk_wed_device *dev)
 		/* pcie interrupt control: pola/source selection */
 		wed_set(dev, MTK_WED_PCIE_INT_CTRL,
 			MTK_WED_PCIE_INT_CTRL_MSK_EN_POLA |
-			FIELD_PREP(MTK_WED_PCIE_INT_CTRL_SRC_SEL, 1));
-		wed_r32(dev, MTK_WED_PCIE_INT_CTRL);
-
-		wed_w32(dev, MTK_WED_PCIE_CFG_INTM, PCIE_BASE_ADDR0 | 0x180);
-		wed_w32(dev, MTK_WED_PCIE_CFG_BASE, PCIE_BASE_ADDR0 | 0x184);
-
-		/* pcie interrupt status trigger register */
-		wed_w32(dev, MTK_WED_PCIE_INT_TRIGGER, BIT(24));
-		wed_r32(dev, MTK_WED_PCIE_INT_TRIGGER);
-
-		/* pola setting */
-		wed_set(dev, MTK_WED_PCIE_INT_CTRL,
-			MTK_WED_PCIE_INT_CTRL_MSK_EN_POLA);
+			MTK_WED_PCIE_INT_CTRL_MSK_IRQ_FILTER  |
+			FIELD_PREP(MTK_WED_PCIE_INT_CTRL_SRC_SEL,
+				   dev->hw->index));
 		break;
 	}
 	case MTK_WED_BUS_AXI:
@@ -773,18 +834,19 @@ mtk_wed_set_wpdma(struct mtk_wed_device *dev)
 static void
 mtk_wed_hw_init_early(struct mtk_wed_device *dev)
 {
-	u32 mask, set;
+	u32 set = FIELD_PREP(MTK_WED_WDMA_GLO_CFG_BT_SIZE, 2);
+	u32 mask = MTK_WED_WDMA_GLO_CFG_BT_SIZE;
 
 	mtk_wed_deinit(dev);
 	mtk_wed_reset(dev, MTK_WED_RESET_WED);
 	mtk_wed_set_wpdma(dev);
 
-	mask = MTK_WED_WDMA_GLO_CFG_BT_SIZE |
-	       MTK_WED_WDMA_GLO_CFG_DYNAMIC_DMAD_RECYCLE |
-	       MTK_WED_WDMA_GLO_CFG_RX_DIS_FSM_AUTO_IDLE;
-	set = FIELD_PREP(MTK_WED_WDMA_GLO_CFG_BT_SIZE, 2) |
-	      MTK_WED_WDMA_GLO_CFG_DYNAMIC_SKIP_DMAD_PREP |
-	      MTK_WED_WDMA_GLO_CFG_IDLE_DMAD_SUPPLY;
+	if (!mtk_wed_is_v3_or_greater(dev->hw)) {
+		mask |= MTK_WED_WDMA_GLO_CFG_DYNAMIC_DMAD_RECYCLE |
+			MTK_WED_WDMA_GLO_CFG_RX_DIS_FSM_AUTO_IDLE;
+		set |= MTK_WED_WDMA_GLO_CFG_DYNAMIC_SKIP_DMAD_PREP |
+		       MTK_WED_WDMA_GLO_CFG_IDLE_DMAD_SUPPLY;
+	}
 	wed_m32(dev, MTK_WED_WDMA_GLO_CFG, mask, set);
 
 	if (mtk_wed_is_v1(dev->hw)) {
@@ -932,11 +994,18 @@ mtk_wed_route_qm_hw_init(struct mtk_wed_device *dev)
 	}
 
 	/* configure RX_ROUTE_QM */
-	wed_clr(dev, MTK_WED_RTQM_GLO_CFG, MTK_WED_RTQM_Q_RST);
-	wed_clr(dev, MTK_WED_RTQM_GLO_CFG, MTK_WED_RTQM_TXDMAD_FPORT);
-	wed_set(dev, MTK_WED_RTQM_GLO_CFG,
-		FIELD_PREP(MTK_WED_RTQM_TXDMAD_FPORT, 0x3 + dev->hw->index));
-	wed_clr(dev, MTK_WED_RTQM_GLO_CFG, MTK_WED_RTQM_Q_RST);
+	if (mtk_wed_is_v2(dev->hw)) {
+		wed_clr(dev, MTK_WED_RTQM_GLO_CFG, MTK_WED_RTQM_Q_RST);
+		wed_clr(dev, MTK_WED_RTQM_GLO_CFG, MTK_WED_RTQM_TXDMAD_FPORT);
+		wed_set(dev, MTK_WED_RTQM_GLO_CFG,
+			FIELD_PREP(MTK_WED_RTQM_TXDMAD_FPORT,
+				   0x3 + dev->hw->index));
+		wed_clr(dev, MTK_WED_RTQM_GLO_CFG, MTK_WED_RTQM_Q_RST);
+	} else {
+		wed_set(dev, MTK_WED_RTQM_ENQ_CFG0,
+			FIELD_PREP(MTK_WED_RTQM_ENQ_CFG_TXDMAD_FPORT,
+				   0x3 + dev->hw->index));
+	}
 	/* enable RX_ROUTE_QM */
 	wed_set(dev, MTK_WED_CTRL, MTK_WED_CTRL_RX_ROUTE_QM_EN);
 }
@@ -949,22 +1018,30 @@ mtk_wed_hw_init(struct mtk_wed_device *dev)
 
 	dev->init_done = true;
 	mtk_wed_set_ext_int(dev, false);
-	wed_w32(dev, MTK_WED_TX_BM_CTRL,
-		MTK_WED_TX_BM_CTRL_PAUSE |
-		FIELD_PREP(MTK_WED_TX_BM_CTRL_VLD_GRP_NUM,
-			   dev->tx_buf_ring.size / 128) |
-		FIELD_PREP(MTK_WED_TX_BM_CTRL_RSV_GRP_NUM,
-			   MTK_WED_TX_RING_SIZE / 256));
 
 	wed_w32(dev, MTK_WED_TX_BM_BASE, dev->tx_buf_ring.desc_phys);
-
 	wed_w32(dev, MTK_WED_TX_BM_BUF_LEN, MTK_WED_PKT_SIZE);
 
 	if (mtk_wed_is_v1(dev->hw)) {
+		wed_w32(dev, MTK_WED_TX_BM_CTRL,
+			MTK_WED_TX_BM_CTRL_PAUSE |
+			FIELD_PREP(MTK_WED_TX_BM_CTRL_VLD_GRP_NUM,
+				   dev->tx_buf_ring.size / 128) |
+			FIELD_PREP(MTK_WED_TX_BM_CTRL_RSV_GRP_NUM,
+				   MTK_WED_TX_RING_SIZE / 256));
 		wed_w32(dev, MTK_WED_TX_BM_DYN_THR,
 			FIELD_PREP(MTK_WED_TX_BM_DYN_THR_LO, 1) |
 			MTK_WED_TX_BM_DYN_THR_HI);
-	} else {
+	} else if (mtk_wed_is_v2(dev->hw)) {
+		wed_w32(dev, MTK_WED_TX_BM_CTRL,
+			MTK_WED_TX_BM_CTRL_PAUSE |
+			FIELD_PREP(MTK_WED_TX_BM_CTRL_VLD_GRP_NUM,
+				   dev->tx_buf_ring.size / 128) |
+			FIELD_PREP(MTK_WED_TX_BM_CTRL_RSV_GRP_NUM,
+				   MTK_WED_TX_RING_SIZE / 256));
+		wed_w32(dev, MTK_WED_TX_TKID_DYN_THR,
+			FIELD_PREP(MTK_WED_TX_TKID_DYN_THR_LO, 0) |
+			MTK_WED_TX_TKID_DYN_THR_HI);
 		wed_w32(dev, MTK_WED_TX_BM_DYN_THR,
 			FIELD_PREP(MTK_WED_TX_BM_DYN_THR_LO_V2, 0) |
 			MTK_WED_TX_BM_DYN_THR_HI_V2);
@@ -974,9 +1051,6 @@ mtk_wed_hw_init(struct mtk_wed_device *dev)
 				   dev->tx_buf_ring.size / 128) |
 			FIELD_PREP(MTK_WED_TX_TKID_CTRL_RSV_GRP_NUM,
 				   dev->tx_buf_ring.size / 128));
-		wed_w32(dev, MTK_WED_TX_TKID_DYN_THR,
-			FIELD_PREP(MTK_WED_TX_TKID_DYN_THR_LO, 0) |
-			MTK_WED_TX_TKID_DYN_THR_HI);
 	}
 
 	wed_w32(dev, dev->hw->soc->regmap.tx_bm_tkid,
@@ -986,26 +1060,62 @@ mtk_wed_hw_init(struct mtk_wed_device *dev)
 
 	mtk_wed_reset(dev, MTK_WED_RESET_TX_BM);
 
+	if (mtk_wed_is_v3_or_greater(dev->hw)) {
+		/* switch to new bm architecture */
+		wed_clr(dev, MTK_WED_TX_BM_CTRL,
+			MTK_WED_TX_BM_CTRL_LEGACY_EN);
+
+		wed_w32(dev, MTK_WED_TX_TKID_CTRL,
+			MTK_WED_TX_TKID_CTRL_PAUSE |
+			FIELD_PREP(MTK_WED_TX_TKID_CTRL_VLD_GRP_NUM_V3,
+				   dev->wlan.nbuf / 128) |
+			FIELD_PREP(MTK_WED_TX_TKID_CTRL_RSV_GRP_NUM_V3,
+				   dev->wlan.nbuf / 128));
+		/* return SKBID + SDP back to bm */
+		wed_set(dev, MTK_WED_TX_TKID_CTRL,
+			MTK_WED_TX_TKID_CTRL_FREE_FORMAT);
+
+		wed_w32(dev, MTK_WED_TX_BM_INIT_PTR,
+			MTK_WED_TX_BM_PKT_CNT |
+			MTK_WED_TX_BM_INIT_SW_TAIL_IDX);
+	}
+
 	if (mtk_wed_is_v1(dev->hw)) {
 		wed_set(dev, MTK_WED_CTRL,
 			MTK_WED_CTRL_WED_TX_BM_EN |
 			MTK_WED_CTRL_WED_TX_FREE_AGENT_EN);
-	} else {
-		wed_clr(dev, MTK_WED_TX_TKID_CTRL, MTK_WED_TX_TKID_CTRL_PAUSE);
-		if (mtk_wed_get_rx_capa(dev)) {
-			/* rx hw init */
-			wed_w32(dev, MTK_WED_WPDMA_RX_D_RST_IDX,
-				MTK_WED_WPDMA_RX_D_RST_CRX_IDX |
-				MTK_WED_WPDMA_RX_D_RST_DRV_IDX);
-			wed_w32(dev, MTK_WED_WPDMA_RX_D_RST_IDX, 0);
-
-			mtk_wed_rx_buffer_hw_init(dev);
-			mtk_wed_rro_hw_init(dev);
-			mtk_wed_route_qm_hw_init(dev);
-		}
+	} else if (mtk_wed_get_rx_capa(dev)) {
+		/* rx hw init */
+		wed_w32(dev, MTK_WED_WPDMA_RX_D_RST_IDX,
+			MTK_WED_WPDMA_RX_D_RST_CRX_IDX |
+			MTK_WED_WPDMA_RX_D_RST_DRV_IDX);
+		wed_w32(dev, MTK_WED_WPDMA_RX_D_RST_IDX, 0);
+
+		/* reset prefetch index of ring */
+		wed_set(dev, MTK_WED_WPDMA_RX_D_PREF_RX0_SIDX,
+			MTK_WED_WPDMA_RX_D_PREF_SIDX_IDX_CLR);
+		wed_clr(dev, MTK_WED_WPDMA_RX_D_PREF_RX0_SIDX,
+			MTK_WED_WPDMA_RX_D_PREF_SIDX_IDX_CLR);
+
+		wed_set(dev, MTK_WED_WPDMA_RX_D_PREF_RX1_SIDX,
+			MTK_WED_WPDMA_RX_D_PREF_SIDX_IDX_CLR);
+		wed_clr(dev, MTK_WED_WPDMA_RX_D_PREF_RX1_SIDX,
+			MTK_WED_WPDMA_RX_D_PREF_SIDX_IDX_CLR);
+
+		/* reset prefetch FIFO of ring */
+		wed_set(dev, MTK_WED_WPDMA_RX_D_PREF_FIFO_CFG,
+			MTK_WED_WPDMA_RX_D_PREF_FIFO_CFG_R0_CLR |
+			MTK_WED_WPDMA_RX_D_PREF_FIFO_CFG_R1_CLR);
+		wed_w32(dev, MTK_WED_WPDMA_RX_D_PREF_FIFO_CFG, 0);
+
+		mtk_wed_rx_buffer_hw_init(dev);
+		mtk_wed_rro_hw_init(dev);
+		mtk_wed_route_qm_hw_init(dev);
 	}
 
 	wed_clr(dev, MTK_WED_TX_BM_CTRL, MTK_WED_TX_BM_CTRL_PAUSE);
+	if (!mtk_wed_is_v1(dev->hw))
+		wed_clr(dev, MTK_WED_TX_TKID_CTRL, MTK_WED_TX_TKID_CTRL_PAUSE);
 }
 
 static void
@@ -1303,6 +1413,24 @@ mtk_wed_wdma_tx_ring_setup(struct mtk_wed_device *dev, int idx, int size,
 					 dev->hw->soc->wdma_desc_size, true))
 		return -ENOMEM;
 
+	if (mtk_wed_is_v3_or_greater(dev->hw)) {
+		struct mtk_wdma_desc *desc = wdma->desc;
+		int i;
+
+		for (i = 0; i < MTK_WED_WDMA_RING_SIZE; i++) {
+			desc->buf0 = 0;
+			desc->ctrl = cpu_to_le32(MTK_WDMA_DESC_CTRL_DMA_DONE);
+			desc->buf1 = 0;
+			desc->info = cpu_to_le32(MTK_WDMA_TXD0_DESC_INFO_DMA_DONE);
+			desc++;
+			desc->buf0 = 0;
+			desc->ctrl = cpu_to_le32(MTK_WDMA_DESC_CTRL_DMA_DONE);
+			desc->buf1 = 0;
+			desc->info = cpu_to_le32(MTK_WDMA_TXD1_DESC_INFO_DMA_DONE);
+			desc++;
+		}
+	}
+
 	wdma_w32(dev, MTK_WDMA_RING_TX(idx) + MTK_WED_RING_OFS_BASE,
 		 wdma->desc_phys);
 	wdma_w32(dev, MTK_WDMA_RING_TX(idx) + MTK_WED_RING_OFS_COUNT,
@@ -1368,6 +1496,9 @@ mtk_wed_configure_irq(struct mtk_wed_device *dev, u32 irq_mask)
 
 		wed_clr(dev, MTK_WED_WDMA_INT_CTRL, wdma_mask);
 	} else {
+		if (mtk_wed_is_v3_or_greater(dev->hw))
+			wed_set(dev, MTK_WED_CTRL, MTK_WED_CTRL_TX_TKID_ALI_EN);
+
 		/* initail tx interrupt trigger */
 		wed_w32(dev, MTK_WED_WPDMA_INT_CTRL_TX,
 			MTK_WED_WPDMA_INT_CTRL_TX0_DONE_EN |
@@ -1420,33 +1551,60 @@ mtk_wed_dma_enable(struct mtk_wed_device *dev)
 {
 	int i;
 
-	wed_set(dev, MTK_WED_WPDMA_INT_CTRL, MTK_WED_WPDMA_INT_CTRL_SUBRT_ADV);
+	if (!mtk_wed_is_v3_or_greater(dev->hw)) {
+		wed_set(dev, MTK_WED_WPDMA_INT_CTRL,
+			MTK_WED_WPDMA_INT_CTRL_SUBRT_ADV);
+		wed_set(dev, MTK_WED_WPDMA_GLO_CFG,
+			MTK_WED_WPDMA_GLO_CFG_TX_DRV_EN |
+			MTK_WED_WPDMA_GLO_CFG_RX_DRV_EN);
+		wdma_set(dev, MTK_WDMA_GLO_CFG,
+			 MTK_WDMA_GLO_CFG_TX_DMA_EN |
+			 MTK_WDMA_GLO_CFG_RX_INFO1_PRERES |
+			 MTK_WDMA_GLO_CFG_RX_INFO2_PRERES);
+		wed_set(dev, MTK_WED_WPDMA_CTRL, MTK_WED_WPDMA_CTRL_SDL1_FIXED);
+	} else {
+		wed_set(dev, MTK_WED_WPDMA_GLO_CFG,
+			MTK_WED_WPDMA_GLO_CFG_TX_DRV_EN |
+			MTK_WED_WPDMA_GLO_CFG_RX_DRV_EN |
+			MTK_WED_WPDMA_GLO_CFG_RX_DDONE2_WR);
+		wdma_set(dev, MTK_WDMA_GLO_CFG, MTK_WDMA_GLO_CFG_TX_DMA_EN);
+	}
 
 	wed_set(dev, MTK_WED_GLO_CFG,
 		MTK_WED_GLO_CFG_TX_DMA_EN |
 		MTK_WED_GLO_CFG_RX_DMA_EN);
-	wed_set(dev, MTK_WED_WPDMA_GLO_CFG,
-		MTK_WED_WPDMA_GLO_CFG_TX_DRV_EN |
-		MTK_WED_WPDMA_GLO_CFG_RX_DRV_EN);
+
 	wed_set(dev, MTK_WED_WDMA_GLO_CFG,
 		MTK_WED_WDMA_GLO_CFG_RX_DRV_EN);
 
-	wdma_set(dev, MTK_WDMA_GLO_CFG,
-		 MTK_WDMA_GLO_CFG_TX_DMA_EN |
-		 MTK_WDMA_GLO_CFG_RX_INFO1_PRERES |
-		 MTK_WDMA_GLO_CFG_RX_INFO2_PRERES);
-
 	if (mtk_wed_is_v1(dev->hw)) {
 		wdma_set(dev, MTK_WDMA_GLO_CFG,
 			 MTK_WDMA_GLO_CFG_RX_INFO3_PRERES);
 		return;
 	}
 
-	wed_set(dev, MTK_WED_WPDMA_CTRL,
-		MTK_WED_WPDMA_CTRL_SDL1_FIXED);
 	wed_set(dev, MTK_WED_WPDMA_GLO_CFG,
 		MTK_WED_WPDMA_GLO_CFG_RX_DRV_R0_PKT_PROC |
 		MTK_WED_WPDMA_GLO_CFG_RX_DRV_R0_CRX_SYNC);
+
+	if (mtk_wed_is_v3_or_greater(dev->hw)) {
+		wed_set(dev, MTK_WED_WDMA_RX_PREF_CFG,
+			FIELD_PREP(MTK_WED_WDMA_RX_PREF_BURST_SIZE, 0x10) |
+			FIELD_PREP(MTK_WED_WDMA_RX_PREF_LOW_THRES, 0x8));
+		wed_clr(dev, MTK_WED_WDMA_RX_PREF_CFG,
+			MTK_WED_WDMA_RX_PREF_DDONE2_EN);
+		wed_set(dev, MTK_WED_WDMA_RX_PREF_CFG, MTK_WED_WDMA_RX_PREF_EN);
+
+		wed_clr(dev, MTK_WED_WPDMA_GLO_CFG,
+			MTK_WED_WPDMA_GLO_CFG_TX_DDONE_CHK_LAST);
+		wed_set(dev, MTK_WED_WPDMA_GLO_CFG,
+			MTK_WED_WPDMA_GLO_CFG_TX_DDONE_CHK |
+			MTK_WED_WPDMA_GLO_CFG_RX_DRV_EVENT_PKT_FMT_CHK |
+			MTK_WED_WPDMA_GLO_CFG_RX_DRV_UNS_VER_FORCE_4);
+
+		wdma_set(dev, MTK_WDMA_PREF_RX_CFG, MTK_WDMA_PREF_RX_CFG_PREF_EN);
+	}
+
 	wed_clr(dev, MTK_WED_WPDMA_GLO_CFG,
 		MTK_WED_WPDMA_GLO_CFG_TX_TKID_KEEP |
 		MTK_WED_WPDMA_GLO_CFG_TX_DMAD_DW3_PREV);
@@ -1458,11 +1616,22 @@ mtk_wed_dma_enable(struct mtk_wed_device *dev)
 		MTK_WED_WDMA_GLO_CFG_TX_DRV_EN |
 		MTK_WED_WDMA_GLO_CFG_TX_DDONE_CHK);
 
+	wed_clr(dev, MTK_WED_WPDMA_RX_D_GLO_CFG, MTK_WED_WPDMA_RX_D_RXD_READ_LEN);
 	wed_set(dev, MTK_WED_WPDMA_RX_D_GLO_CFG,
 		MTK_WED_WPDMA_RX_D_RX_DRV_EN |
 		FIELD_PREP(MTK_WED_WPDMA_RX_D_RXD_READ_LEN, 0x18) |
-		FIELD_PREP(MTK_WED_WPDMA_RX_D_INIT_PHASE_RXEN_SEL,
-			   0x2));
+		FIELD_PREP(MTK_WED_WPDMA_RX_D_INIT_PHASE_RXEN_SEL, 0x2));
+
+	if (mtk_wed_is_v3_or_greater(dev->hw)) {
+		wed_set(dev, MTK_WED_WPDMA_RX_D_PREF_CFG,
+			MTK_WED_WPDMA_RX_D_PREF_EN |
+			FIELD_PREP(MTK_WED_WPDMA_RX_D_PREF_BURST_SIZE, 0x10) |
+			FIELD_PREP(MTK_WED_WPDMA_RX_D_PREF_LOW_THRES, 0x8));
+
+		wed_set(dev, MTK_WED_RRO_RX_D_CFG(2), MTK_WED_RRO_RX_D_DRV_EN);
+		wdma_set(dev, MTK_WDMA_PREF_TX_CFG, MTK_WDMA_PREF_TX_CFG_PREF_EN);
+		wdma_set(dev, MTK_WDMA_WRBK_TX_CFG, MTK_WDMA_WRBK_TX_CFG_WRBK_EN);
+	}
 
 	for (i = 0; i < MTK_WED_RX_QUEUES; i++)
 		mtk_wed_check_wfdma_rx_fill(dev, i);
@@ -1502,6 +1671,12 @@ mtk_wed_start(struct mtk_wed_device *dev, u32 irq_mask)
 		wed_r32(dev, MTK_WED_EXT_INT_MASK1);
 		wed_r32(dev, MTK_WED_EXT_INT_MASK2);
 
+		if (mtk_wed_is_v3_or_greater(dev->hw)) {
+			wed_w32(dev, MTK_WED_EXT_INT_MASK3,
+				MTK_WED_EXT_INT_STATUS_WPDMA_MID_RDY);
+			wed_r32(dev, MTK_WED_EXT_INT_MASK3);
+		}
+
 		if (mtk_wed_rro_cfg(dev))
 			return;
 	}
@@ -1553,6 +1728,7 @@ mtk_wed_attach(struct mtk_wed_device *dev)
 	dev->irq = hw->irq;
 	dev->wdma_idx = hw->index;
 	dev->version = hw->version;
+	dev->hw->pcie_base = mtk_wed_get_pcie_base(dev);
 
 	if (hw->eth->dma_dev == hw->eth->dev &&
 	    of_dma_is_coherent(hw->eth->dev->of_node))
@@ -1620,6 +1796,23 @@ mtk_wed_tx_ring_setup(struct mtk_wed_device *dev, int idx, void __iomem *regs,
 	ring->reg_base = MTK_WED_RING_TX(idx);
 	ring->wpdma = regs;
 
+	if (mtk_wed_is_v3_or_greater(dev->hw) && idx == 1) {
+		/* reset prefetch index */
+		wed_set(dev, MTK_WED_WDMA_RX_PREF_CFG,
+			MTK_WED_WDMA_RX_PREF_RX0_SIDX_CLR |
+			MTK_WED_WDMA_RX_PREF_RX1_SIDX_CLR);
+
+		wed_clr(dev, MTK_WED_WDMA_RX_PREF_CFG,
+			MTK_WED_WDMA_RX_PREF_RX0_SIDX_CLR |
+			MTK_WED_WDMA_RX_PREF_RX1_SIDX_CLR);
+
+		/* reset prefetch FIFO */
+		wed_w32(dev, MTK_WED_WDMA_RX_PREF_FIFO_CFG,
+			MTK_WED_WDMA_RX_PREF_FIFO_RX0_CLR |
+			MTK_WED_WDMA_RX_PREF_FIFO_RX1_CLR);
+		wed_w32(dev, MTK_WED_WDMA_RX_PREF_FIFO_CFG, 0);
+	}
+
 	/* WED -> WPDMA */
 	wpdma_tx_w32(dev, idx, MTK_WED_RING_OFS_BASE, ring->desc_phys);
 	wpdma_tx_w32(dev, idx, MTK_WED_RING_OFS_COUNT, MTK_WED_TX_RING_SIZE);
@@ -1694,15 +1887,13 @@ mtk_wed_rx_ring_setup(struct mtk_wed_device *dev, int idx, void __iomem *regs,
 static u32
 mtk_wed_irq_get(struct mtk_wed_device *dev, u32 mask)
 {
-	u32 val, ext_mask = MTK_WED_EXT_INT_STATUS_ERROR_MASK;
+	u32 val, ext_mask;
 
-	if (mtk_wed_is_v1(dev->hw))
-		ext_mask |= MTK_WED_EXT_INT_STATUS_TX_DRV_R_RESP_ERR;
+	if (mtk_wed_is_v3_or_greater(dev->hw))
+		ext_mask = MTK_WED_EXT_INT_STATUS_RX_DRV_COHERENT |
+			   MTK_WED_EXT_INT_STATUS_TKID_WO_PYLD;
 	else
-		ext_mask |= MTK_WED_EXT_INT_STATUS_RX_FBUF_LO_TH |
-			    MTK_WED_EXT_INT_STATUS_RX_FBUF_HI_TH |
-			    MTK_WED_EXT_INT_STATUS_RX_DRV_COHERENT |
-			    MTK_WED_EXT_INT_STATUS_TX_DMA_W_RESP_ERR;
+		ext_mask = MTK_WED_EXT_INT_STATUS_ERROR_MASK;
 
 	val = wed_r32(dev, MTK_WED_EXT_INT_STATUS);
 	wed_w32(dev, MTK_WED_EXT_INT_STATUS, val);
@@ -1943,6 +2134,9 @@ void mtk_wed_add_hw(struct device_node *np, struct mtk_eth *eth,
 	case 2:
 		hw->soc = &mt7986_data;
 		break;
+	case 3:
+		hw->soc = &mt7988_data;
+		break;
 	default:
 	case 1:
 		hw->mirror = syscon_regmap_lookup_by_phandle(eth_np,
diff --git a/drivers/net/ethernet/mediatek/mtk_wed.h b/drivers/net/ethernet/mediatek/mtk_wed.h
index afaf5a46fbb3..27d336db4d4d 100644
--- a/drivers/net/ethernet/mediatek/mtk_wed.h
+++ b/drivers/net/ethernet/mediatek/mtk_wed.h
@@ -9,6 +9,8 @@
 #include <linux/regmap.h>
 #include <linux/netdevice.h>
 
+#include "mtk_wed_regs.h"
+
 struct mtk_eth;
 struct mtk_wed_wo;
 
@@ -19,6 +21,7 @@ struct mtk_wed_soc_data {
 		u32 reset_idx_tx_mask;
 		u32 reset_idx_rx_mask;
 	} regmap;
+	u32 tx_ring_desc_size;
 	u32 wdma_desc_size;
 };
 
@@ -35,6 +38,7 @@ struct mtk_wed_hw {
 	struct dentry *debugfs_dir;
 	struct mtk_wed_device *wed_dev;
 	struct mtk_wed_wo *wed_wo;
+	u32 pcie_base;
 	u32 debugfs_reg;
 	u32 num_flows;
 	u8 version;
@@ -61,6 +65,16 @@ static inline bool mtk_wed_is_v2(struct mtk_wed_hw *hw)
 	return hw->version == 2;
 }
 
+static inline bool mtk_wed_is_v3(struct mtk_wed_hw *hw)
+{
+	return hw->version == 3;
+}
+
+static inline bool mtk_wed_is_v3_or_greater(struct mtk_wed_hw *hw)
+{
+	return hw->version > 2;
+}
+
 static inline void
 wed_w32(struct mtk_wed_device *dev, u32 reg, u32 val)
 {
@@ -143,6 +157,21 @@ wpdma_txfree_w32(struct mtk_wed_device *dev, u32 reg, u32 val)
 	writel(val, dev->txfree_ring.wpdma + reg);
 }
 
+static inline u32 mtk_wed_get_pcie_base(struct mtk_wed_device *dev)
+{
+	if (!mtk_wed_is_v3_or_greater(dev->hw))
+		return MTK_WED_PCIE_BASE;
+
+	switch (dev->hw->index) {
+	case 1:
+		return MTK_WED_PCIE_BASE1;
+	case 2:
+		return MTK_WED_PCIE_BASE2;
+	default:
+		return MTK_WED_PCIE_BASE0;
+	}
+}
+
 void mtk_wed_add_hw(struct device_node *np, struct mtk_eth *eth,
 		    void __iomem *wdma, phys_addr_t wdma_phy,
 		    int index);
diff --git a/drivers/net/ethernet/mediatek/mtk_wed_mcu.c b/drivers/net/ethernet/mediatek/mtk_wed_mcu.c
index e53531252bd9..65a78e274009 100644
--- a/drivers/net/ethernet/mediatek/mtk_wed_mcu.c
+++ b/drivers/net/ethernet/mediatek/mtk_wed_mcu.c
@@ -331,10 +331,22 @@ mtk_wed_mcu_load_firmware(struct mtk_wed_wo *wo)
 		wo->hw->index + 1);
 
 	/* load firmware */
-	if (of_device_is_compatible(wo->hw->node, "mediatek,mt7981-wed"))
-		fw_name = MT7981_FIRMWARE_WO;
-	else
-		fw_name = wo->hw->index ? MT7986_FIRMWARE_WO1 : MT7986_FIRMWARE_WO0;
+	switch (wo->hw->version) {
+	case 2:
+		if (of_device_is_compatible(wo->hw->node,
+					    "mediatek,mt7981-wed"))
+			fw_name = MT7981_FIRMWARE_WO;
+		else
+			fw_name = wo->hw->index ? MT7986_FIRMWARE_WO1
+						: MT7986_FIRMWARE_WO0;
+		break;
+	case 3:
+		fw_name = wo->hw->index ? MT7988_FIRMWARE_WO1
+					: MT7988_FIRMWARE_WO0;
+		break;
+	default:
+		return -EINVAL;
+	}
 
 	ret = request_firmware(&fw, fw_name, wo->hw->dev);
 	if (ret)
@@ -355,15 +367,16 @@ mtk_wed_mcu_load_firmware(struct mtk_wed_wo *wo)
 	}
 
 	/* set the start address */
-	boot_cr = wo->hw->index ? MTK_WO_MCU_CFG_LS_WA_BOOT_ADDR_ADDR
-				: MTK_WO_MCU_CFG_LS_WM_BOOT_ADDR_ADDR;
+	if (!mtk_wed_is_v3_or_greater(wo->hw) && wo->hw->index)
+		boot_cr = MTK_WO_MCU_CFG_LS_WA_BOOT_ADDR_ADDR;
+	else
+		boot_cr = MTK_WO_MCU_CFG_LS_WM_BOOT_ADDR_ADDR;
 	wo_w32(wo, boot_cr, mem_region[MTK_WED_WO_REGION_EMI].phy_addr >> 16);
 	/* wo firmware reset */
 	wo_w32(wo, MTK_WO_MCU_CFG_LS_WF_MCCR_CLR_ADDR, 0xc00);
 
-	val = wo_r32(wo, MTK_WO_MCU_CFG_LS_WF_MCU_CFG_WM_WA_ADDR);
-	val |= wo->hw->index ? MTK_WO_MCU_CFG_LS_WF_WM_WA_WA_CPU_RSTB_MASK
-			     : MTK_WO_MCU_CFG_LS_WF_WM_WA_WM_CPU_RSTB_MASK;
+	val = wo_r32(wo, MTK_WO_MCU_CFG_LS_WF_MCU_CFG_WM_WA_ADDR) |
+	      MTK_WO_MCU_CFG_LS_WF_WM_WA_WM_CPU_RSTB_MASK;
 	wo_w32(wo, MTK_WO_MCU_CFG_LS_WF_MCU_CFG_WM_WA_ADDR, val);
 out:
 	release_firmware(fw);
@@ -398,3 +411,5 @@ int mtk_wed_mcu_init(struct mtk_wed_wo *wo)
 MODULE_FIRMWARE(MT7981_FIRMWARE_WO);
 MODULE_FIRMWARE(MT7986_FIRMWARE_WO0);
 MODULE_FIRMWARE(MT7986_FIRMWARE_WO1);
+MODULE_FIRMWARE(MT7988_FIRMWARE_WO0);
+MODULE_FIRMWARE(MT7988_FIRMWARE_WO1);
diff --git a/drivers/net/ethernet/mediatek/mtk_wed_regs.h b/drivers/net/ethernet/mediatek/mtk_wed_regs.h
index 2253f4eb5bc1..a4d3cf64d090 100644
--- a/drivers/net/ethernet/mediatek/mtk_wed_regs.h
+++ b/drivers/net/ethernet/mediatek/mtk_wed_regs.h
@@ -13,6 +13,9 @@
 #define MTK_WDMA_DESC_CTRL_LAST_SEG0		BIT(30)
 #define MTK_WDMA_DESC_CTRL_DMA_DONE		BIT(31)
 
+#define MTK_WDMA_TXD0_DESC_INFO_DMA_DONE	BIT(29)
+#define MTK_WDMA_TXD1_DESC_INFO_DMA_DONE	BIT(31)
+
 struct mtk_wdma_desc {
 	__le32 buf0;
 	__le32 ctrl;
@@ -37,6 +40,7 @@ struct mtk_wdma_desc {
 #define MTK_WED_RESET_WDMA_INT_AGENT			BIT(19)
 #define MTK_WED_RESET_RX_RRO_QM				BIT(20)
 #define MTK_WED_RESET_RX_ROUTE_QM			BIT(21)
+#define MTK_WED_RESET_TX_AMSDU				BIT(22)
 #define MTK_WED_RESET_WED				BIT(31)
 
 #define MTK_WED_CTRL					0x00c
@@ -44,6 +48,9 @@ struct mtk_wdma_desc {
 #define MTK_WED_CTRL_WPDMA_INT_AGENT_BUSY		BIT(1)
 #define MTK_WED_CTRL_WDMA_INT_AGENT_EN			BIT(2)
 #define MTK_WED_CTRL_WDMA_INT_AGENT_BUSY		BIT(3)
+#define MTK_WED_CTRL_WED_RX_IND_CMD_EN			BIT(5)
+#define MTK_WED_CTRL_WED_RX_PG_BM_EN			BIT(6)
+#define MTK_WED_CTRL_WED_RX_PG_BM_BUSY			BIT(7)
 #define MTK_WED_CTRL_WED_TX_BM_EN			BIT(8)
 #define MTK_WED_CTRL_WED_TX_BM_BUSY			BIT(9)
 #define MTK_WED_CTRL_WED_TX_FREE_AGENT_EN		BIT(10)
@@ -54,9 +61,14 @@ struct mtk_wdma_desc {
 #define MTK_WED_CTRL_RX_RRO_QM_BUSY			BIT(15)
 #define MTK_WED_CTRL_RX_ROUTE_QM_EN			BIT(16)
 #define MTK_WED_CTRL_RX_ROUTE_QM_BUSY			BIT(17)
+#define MTK_WED_CTRL_TX_TKID_ALI_EN			BIT(20)
+#define MTK_WED_CTRL_TX_TKID_ALI_BUSY			BIT(21)
+#define MTK_WED_CTRL_TX_AMSDU_EN			BIT(22)
+#define MTK_WED_CTRL_TX_AMSDU_BUSY			BIT(23)
 #define MTK_WED_CTRL_FINAL_DIDX_READ			BIT(24)
 #define MTK_WED_CTRL_ETH_DMAD_FMT			BIT(25)
 #define MTK_WED_CTRL_MIB_READ_CLEAR			BIT(28)
+#define MTK_WED_CTRL_FLD_MIB_RD_CLR			BIT(28)
 
 #define MTK_WED_EXT_INT_STATUS				0x020
 #define MTK_WED_EXT_INT_STATUS_TF_LEN_ERR		BIT(0)
@@ -89,6 +101,7 @@ struct mtk_wdma_desc {
 #define MTK_WED_EXT_INT_MASK				0x028
 #define MTK_WED_EXT_INT_MASK1				0x02c
 #define MTK_WED_EXT_INT_MASK2				0x030
+#define MTK_WED_EXT_INT_MASK3				0x034
 
 #define MTK_WED_STATUS					0x060
 #define MTK_WED_STATUS_TX				GENMASK(15, 8)
@@ -96,9 +109,14 @@ struct mtk_wdma_desc {
 #define MTK_WED_TX_BM_CTRL				0x080
 #define MTK_WED_TX_BM_CTRL_VLD_GRP_NUM			GENMASK(6, 0)
 #define MTK_WED_TX_BM_CTRL_RSV_GRP_NUM			GENMASK(22, 16)
+#define MTK_WED_TX_BM_CTRL_LEGACY_EN			BIT(26)
+#define MTK_WED_TX_TKID_CTRL_FREE_FORMAT		BIT(27)
 #define MTK_WED_TX_BM_CTRL_PAUSE			BIT(28)
 
 #define MTK_WED_TX_BM_BASE				0x084
+#define MTK_WED_TX_BM_INIT_PTR				0x088
+#define MTK_WED_TX_BM_SW_TAIL_IDX			GENMASK(16, 0)
+#define MTK_WED_TX_BM_INIT_SW_TAIL_IDX			BIT(16)
 
 #define MTK_WED_TX_BM_TKID_START			GENMASK(15, 0)
 #define MTK_WED_TX_BM_TKID_END				GENMASK(31, 16)
@@ -122,6 +140,9 @@ struct mtk_wdma_desc {
 #define MTK_WED_TX_TKID_CTRL_RSV_GRP_NUM		GENMASK(22, 16)
 #define MTK_WED_TX_TKID_CTRL_PAUSE			BIT(28)
 
+#define MTK_WED_TX_TKID_CTRL_VLD_GRP_NUM_V3		GENMASK(7, 0)
+#define MTK_WED_TX_TKID_CTRL_RSV_GRP_NUM_V3		GENMASK(23, 16)
+
 #define MTK_WED_TX_TKID_DYN_THR				0x0e0
 #define MTK_WED_TX_TKID_DYN_THR_LO			GENMASK(6, 0)
 #define MTK_WED_TX_TKID_DYN_THR_HI			GENMASK(22, 16)
@@ -199,12 +220,15 @@ struct mtk_wdma_desc {
 #define MTK_WED_WPDMA_GLO_CFG_RX_DRV_R1_PKT_PROC	BIT(5)
 #define MTK_WED_WPDMA_GLO_CFG_RX_DRV_R0_CRX_SYNC	BIT(6)
 #define MTK_WED_WPDMA_GLO_CFG_RX_DRV_R1_CRX_SYNC	BIT(7)
-#define MTK_WED_WPDMA_GLO_CFG_RX_DRV_EVENT_PKT_FMT_VER	GENMASK(18, 16)
+#define MTK_WED_WPDMA_GLO_CFG_RX_DRV_EVENT_PKT_FMT_VER	GENMASK(15, 12)
+#define MTK_WED_WPDMA_GLO_CFG_RX_DRV_UNS_VER_FORCE_4	BIT(18)
 #define MTK_WED_WPDMA_GLO_CFG_RX_DRV_UNSUPPORT_FMT	BIT(19)
-#define MTK_WED_WPDMA_GLO_CFG_RX_DRV_UEVENT_PKT_FMT_CHK BIT(20)
+#define MTK_WED_WPDMA_GLO_CFG_RX_DRV_EVENT_PKT_FMT_CHK	BIT(20)
 #define MTK_WED_WPDMA_GLO_CFG_RX_DDONE2_WR		BIT(21)
 #define MTK_WED_WPDMA_GLO_CFG_TX_TKID_KEEP		BIT(24)
+#define MTK_WED_WPDMA_GLO_CFG_TX_DDONE_CHK_LAST		BIT(25)
 #define MTK_WED_WPDMA_GLO_CFG_TX_DMAD_DW3_PREV		BIT(28)
+#define MTK_WED_WPDMA_GLO_CFG_TX_DDONE_CHK		BIT(30)
 
 #define MTK_WED_WPDMA_RESET_IDX				0x50c
 #define MTK_WED_WPDMA_RESET_IDX_TX			GENMASK(3, 0)
@@ -250,9 +274,10 @@ struct mtk_wdma_desc {
 #define MTK_WED_PCIE_INT_TRIGGER_STATUS			BIT(16)
 
 #define MTK_WED_PCIE_INT_CTRL				0x57c
-#define MTK_WED_PCIE_INT_CTRL_MSK_EN_POLA		BIT(20)
-#define MTK_WED_PCIE_INT_CTRL_SRC_SEL			GENMASK(17, 16)
 #define MTK_WED_PCIE_INT_CTRL_POLL_EN			GENMASK(13, 12)
+#define MTK_WED_PCIE_INT_CTRL_SRC_SEL			GENMASK(17, 16)
+#define MTK_WED_PCIE_INT_CTRL_MSK_EN_POLA		BIT(20)
+#define MTK_WED_PCIE_INT_CTRL_MSK_IRQ_FILTER		BIT(21)
 
 #define MTK_WED_WPDMA_CFG_BASE				0x580
 #define MTK_WED_WPDMA_CFG_INT_MASK			0x584
@@ -286,6 +311,20 @@ struct mtk_wdma_desc {
 #define MTK_WED_WPDMA_RX_D_PROCESSED_MIB(_n)		(0x784 + (_n) * 4)
 #define MTK_WED_WPDMA_RX_D_COHERENT_MIB			0x78c
 
+#define MTK_WED_WPDMA_RX_D_PREF_CFG			0x7b4
+#define MTK_WED_WPDMA_RX_D_PREF_EN			BIT(0)
+#define MTK_WED_WPDMA_RX_D_PREF_BURST_SIZE		GENMASK(12, 8)
+#define MTK_WED_WPDMA_RX_D_PREF_LOW_THRES		GENMASK(21, 16)
+
+#define MTK_WED_WPDMA_RX_D_PREF_RX0_SIDX		0x7b8
+#define MTK_WED_WPDMA_RX_D_PREF_SIDX_IDX_CLR		BIT(15)
+
+#define MTK_WED_WPDMA_RX_D_PREF_RX1_SIDX		0x7bc
+
+#define MTK_WED_WPDMA_RX_D_PREF_FIFO_CFG		0x7c0
+#define MTK_WED_WPDMA_RX_D_PREF_FIFO_CFG_R0_CLR		BIT(0)
+#define MTK_WED_WPDMA_RX_D_PREF_FIFO_CFG_R1_CLR		BIT(16)
+
 #define MTK_WED_WDMA_RING_TX				0x800
 
 #define MTK_WED_WDMA_TX_MIB				0x810
@@ -293,6 +332,18 @@ struct mtk_wdma_desc {
 #define MTK_WED_WDMA_RING_RX(_n)			(0x900 + (_n) * 0x10)
 #define MTK_WED_WDMA_RX_THRES(_n)			(0x940 + (_n) * 0x4)
 
+#define MTK_WED_WDMA_RX_PREF_CFG			0x950
+#define MTK_WED_WDMA_RX_PREF_EN				BIT(0)
+#define MTK_WED_WDMA_RX_PREF_BURST_SIZE			GENMASK(12, 8)
+#define MTK_WED_WDMA_RX_PREF_LOW_THRES			GENMASK(21, 16)
+#define MTK_WED_WDMA_RX_PREF_RX0_SIDX_CLR		BIT(24)
+#define MTK_WED_WDMA_RX_PREF_RX1_SIDX_CLR		BIT(25)
+#define MTK_WED_WDMA_RX_PREF_DDONE2_EN			BIT(26)
+
+#define MTK_WED_WDMA_RX_PREF_FIFO_CFG			0x95C
+#define MTK_WED_WDMA_RX_PREF_FIFO_RX0_CLR		BIT(0)
+#define MTK_WED_WDMA_RX_PREF_FIFO_RX1_CLR		BIT(16)
+
 #define MTK_WED_WDMA_GLO_CFG				0xa04
 #define MTK_WED_WDMA_GLO_CFG_TX_DRV_EN			BIT(0)
 #define MTK_WED_WDMA_GLO_CFG_TX_DDONE_CHK		BIT(1)
@@ -325,6 +376,7 @@ struct mtk_wdma_desc {
 #define MTK_WED_WDMA_INT_TRIGGER_RX_DONE		GENMASK(17, 16)
 
 #define MTK_WED_WDMA_INT_CTRL				0xa2c
+#define MTK_WED_WDMA_INT_POLL_PRD			GENMASK(7, 0)
 #define MTK_WED_WDMA_INT_CTRL_POLL_SRC_SEL		GENMASK(17, 16)
 
 #define MTK_WED_WDMA_CFG_BASE				0xaa0
@@ -388,6 +440,18 @@ struct mtk_wdma_desc {
 #define MTK_WDMA_INT_GRP1				0x250
 #define MTK_WDMA_INT_GRP2				0x254
 
+#define MTK_WDMA_PREF_TX_CFG				0x2d0
+#define MTK_WDMA_PREF_TX_CFG_PREF_EN			BIT(0)
+
+#define MTK_WDMA_PREF_RX_CFG				0x2dc
+#define MTK_WDMA_PREF_RX_CFG_PREF_EN			BIT(0)
+
+#define MTK_WDMA_WRBK_TX_CFG				0x300
+#define MTK_WDMA_WRBK_TX_CFG_WRBK_EN			BIT(30)
+
+#define MTK_WDMA_WRBK_RX_CFG				0x344
+#define MTK_WDMA_WRBK_RX_CFG_WRBK_EN			BIT(30)
+
 #define MTK_PCIE_MIRROR_MAP(n)				((n) ? 0x4 : 0x0)
 #define MTK_PCIE_MIRROR_MAP_EN				BIT(0)
 #define MTK_PCIE_MIRROR_MAP_WED_ID			BIT(1)
@@ -401,6 +465,30 @@ struct mtk_wdma_desc {
 #define MTK_WED_RTQM_Q_DBG_BYPASS			BIT(5)
 #define MTK_WED_RTQM_TXDMAD_FPORT			GENMASK(23, 20)
 
+#define MTK_WED_RTQM_IGRS0_I2HW_DMAD_CNT		0xb1c
+#define MTK_WED_RTQM_IGRS0_I2H_DMAD_CNT(_n)		(0xb20 + (_n) * 0x4)
+#define	MTK_WED_RTQM_IGRS0_I2HW_PKT_CNT			0xb28
+#define MTK_WED_RTQM_IGRS0_I2H_PKT_CNT(_n)		(0xb2c + (_n) * 0x4)
+#define MTK_WED_RTQM_IGRS0_FDROP_CNT			0xb34
+
+#define MTK_WED_RTQM_IGRS1_I2HW_DMAD_CNT		0xb44
+#define MTK_WED_RTQM_IGRS1_I2H_DMAD_CNT(_n)		(0xb48 + (_n) * 0x4)
+#define MTK_WED_RTQM_IGRS1_I2HW_PKT_CNT			0xb50
+#define MTK_WED_RTQM_IGRS1_I2H_PKT_CNT(_n)		(0xb54 + (_n) * 0x4)
+#define MTK_WED_RTQM_IGRS1_FDROP_CNT			0xb5c
+
+#define MTK_WED_RTQM_IGRS2_I2HW_DMAD_CNT		0xb6c
+#define MTK_WED_RTQM_IGRS2_I2H_DMAD_CNT(_n)		(0xb70 + (_n) * 0x4)
+#define MTK_WED_RTQM_IGRS2_I2HW_PKT_CNT			0xb78
+#define MTK_WED_RTQM_IGRS2_I2H_PKT_CNT(_n)		(0xb7c + (_n) * 0x4)
+#define MTK_WED_RTQM_IGRS2_FDROP_CNT			0xb84
+
+#define MTK_WED_RTQM_IGRS3_I2HW_DMAD_CNT		0xb94
+#define MTK_WED_RTQM_IGRS3_I2H_DMAD_CNT(_n)		(0xb98 + (_n) * 0x4)
+#define MTK_WED_RTQM_IGRS3_I2HW_PKT_CNT			0xba0
+#define MTK_WED_RTQM_IGRS3_I2H_PKT_CNT(_n)		(0xba4 + (_n) * 0x4)
+#define MTK_WED_RTQM_IGRS3_FDROP_CNT			0xbac
+
 #define MTK_WED_RTQM_R2H_MIB(_n)			(0xb70 + (_n) * 0x4)
 #define MTK_WED_RTQM_R2Q_MIB(_n)			(0xb78 + (_n) * 0x4)
 #define MTK_WED_RTQM_Q2N_MIB				0xb80
@@ -409,6 +497,24 @@ struct mtk_wdma_desc {
 #define MTK_WED_RTQM_Q2B_MIB				0xb8c
 #define MTK_WED_RTQM_PFDBK_MIB				0xb90
 
+#define MTK_WED_RTQM_ENQ_CFG0				0xbb8
+#define MTK_WED_RTQM_ENQ_CFG_TXDMAD_FPORT		GENMASK(15, 12)
+
+#define MTK_WED_RTQM_FDROP_MIB				0xb84
+#define MTK_WED_RTQM_ENQ_I2Q_DMAD_CNT			0xbbc
+#define MTK_WED_RTQM_ENQ_I2N_DMAD_CNT			0xbc0
+#define MTK_WED_RTQM_ENQ_I2Q_PKT_CNT			0xbc4
+#define MTK_WED_RTQM_ENQ_I2N_PKT_CNT			0xbc8
+#define MTK_WED_RTQM_ENQ_USED_ENTRY_CNT			0xbcc
+#define MTK_WED_RTQM_ENQ_ERR_CNT			0xbd0
+
+#define MTK_WED_RTQM_DEQ_DMAD_CNT			0xbd8
+#define MTK_WED_RTQM_DEQ_Q2I_DMAD_CNT			0xbdc
+#define MTK_WED_RTQM_DEQ_PKT_CNT			0xbe0
+#define MTK_WED_RTQM_DEQ_Q2I_PKT_CNT			0xbe4
+#define MTK_WED_RTQM_DEQ_USED_PFDBK_CNT			0xbe8
+#define MTK_WED_RTQM_DEQ_ERR_CNT			0xbec
+
 #define MTK_WED_RROQM_GLO_CFG				0xc04
 #define MTK_WED_RROQM_RST_IDX				0xc08
 #define MTK_WED_RROQM_RST_IDX_MIOD			BIT(0)
@@ -458,7 +564,116 @@ struct mtk_wdma_desc {
 #define MTK_WED_RX_BM_INTF				0xd9c
 #define MTK_WED_RX_BM_ERR_STS				0xda8
 
+#define MTK_RRO_IND_CMD_SIGNATURE			0xe00
+#define MTK_RRO_IND_CMD_DMA_IDX				GENMASK(11, 0)
+#define MTK_RRO_IND_CMD_MAGIC_CNT			GENMASK(30, 28)
+
+#define MTK_WED_IND_CMD_RX_CTRL0			0xe04
+#define MTK_WED_IND_CMD_PROC_IDX			GENMASK(11, 0)
+#define MTK_WED_IND_CMD_PREFETCH_FREE_CNT		GENMASK(19, 16)
+#define MTK_WED_IND_CMD_MAGIC_CNT			GENMASK(30, 28)
+
+#define MTK_WED_IND_CMD_RX_CTRL1			0xe08
+#define MTK_WED_IND_CMD_RX_CTRL2			0xe0c
+#define MTK_WED_IND_CMD_MAX_CNT				GENMASK(11, 0)
+#define MTK_WED_IND_CMD_BASE_M				GENMASK(19, 16)
+
+#define MTK_WED_RRO_CFG0				0xe10
+#define MTK_WED_RRO_CFG1				0xe14
+#define MTK_WED_RRO_CFG1_MAX_WIN_SZ			GENMASK(31, 29)
+#define MTK_WED_RRO_CFG1_ACK_SN_BASE_M			GENMASK(19, 16)
+#define MTK_WED_RRO_CFG1_PARTICL_SE_ID			GENMASK(11, 0)
+
+#define MTK_WED_ADDR_ELEM_CFG0				0xe18
+#define MTK_WED_ADDR_ELEM_CFG1				0xe1c
+#define MTK_WED_ADDR_ELEM_PREFETCH_FREE_CNT		GENMASK(19, 16)
+
+#define MTK_WED_ADDR_ELEM_TBL_CFG			0xe20
+#define MTK_WED_ADDR_ELEM_TBL_OFFSET			GENMASK(6, 0)
+#define MTK_WED_ADDR_ELEM_TBL_RD_RDY			BIT(28)
+#define MTK_WED_ADDR_ELEM_TBL_WR_RDY			BIT(29)
+#define MTK_WED_ADDR_ELEM_TBL_RD			BIT(30)
+#define MTK_WED_ADDR_ELEM_TBL_WR			BIT(31)
+
+#define MTK_WED_RADDR_ELEM_TBL_WDATA			0xe24
+#define MTK_WED_RADDR_ELEM_TBL_RDATA			0xe28
+
+#define MTK_WED_PN_CHECK_CFG				0xe30
+#define MTK_WED_PN_CHECK_SE_ID				GENMASK(11, 0)
+#define MTK_WED_PN_CHECK_RD_RDY				BIT(28)
+#define MTK_WED_PN_CHECK_WR_RDY				BIT(29)
+#define MTK_WED_PN_CHECK_RD				BIT(30)
+#define MTK_WED_PN_CHECK_WR				BIT(31)
+
+#define MTK_WED_PN_CHECK_WDATA_M			0xe38
+#define MTK_WED_PN_CHECK_IS_FIRST			BIT(17)
+
+#define MTK_WED_RRO_MSDU_PG_RING_CFG(_n)		(0xe44 + (_n) * 0x8)
+
+#define MTK_WED_RRO_MSDU_PG_RING2_CFG			0xe58
+#define MTK_WED_RRO_MSDU_PG_DRV_CLR			BIT(26)
+#define MTK_WED_RRO_MSDU_PG_DRV_EN			BIT(31)
+
+#define MTK_WED_RRO_MSDU_PG_CTRL0(_n)			(0xe5c + (_n) * 0xc)
+#define MTK_WED_RRO_MSDU_PG_CTRL1(_n)			(0xe60 + (_n) * 0xc)
+#define MTK_WED_RRO_MSDU_PG_CTRL2(_n)			(0xe64 + (_n) * 0xc)
+
+#define MTK_WED_RRO_RX_D_RX(_n)				(0xe80 + (_n) * 0x10)
+
+#define MTK_WED_RRO_RX_MAGIC_CNT			BIT(13)
+
+#define MTK_WED_RRO_RX_D_CFG(_n)			(0xea0 + (_n) * 0x4)
+#define MTK_WED_RRO_RX_D_DRV_CLR			BIT(26)
+#define MTK_WED_RRO_RX_D_DRV_EN				BIT(31)
+
+#define MTK_WED_RRO_PG_BM_RX_DMAM			0xeb0
+#define MTK_WED_RRO_PG_BM_RX_SDL0			GENMASK(13, 0)
+
+#define MTK_WED_RRO_PG_BM_BASE				0xeb4
+#define MTK_WED_RRO_PG_BM_INIT_PTR			0xeb8
+#define MTK_WED_RRO_PG_BM_SW_TAIL_IDX			GENMASK(15, 0)
+#define MTK_WED_RRO_PG_BM_INIT_SW_TAIL_IDX		BIT(16)
+
+#define MTK_WED_WPDMA_INT_CTRL_RRO_RX			0xeec
+#define MTK_WED_WPDMA_INT_CTRL_RRO_RX0_EN		BIT(0)
+#define MTK_WED_WPDMA_INT_CTRL_RRO_RX0_CLR		BIT(1)
+#define MTK_WED_WPDMA_INT_CTRL_RRO_RX0_DONE_TRIG	GENMASK(6, 2)
+#define MTK_WED_WPDMA_INT_CTRL_RRO_RX1_EN		BIT(8)
+#define MTK_WED_WPDMA_INT_CTRL_RRO_RX1_CLR		BIT(9)
+#define MTK_WED_WPDMA_INT_CTRL_RRO_RX1_DONE_TRIG	GENMASK(14, 10)
+
+#define MTK_WED_WPDMA_INT_CTRL_RRO_MSDU_PG		0xef4
+#define MTK_WED_WPDMA_INT_CTRL_RRO_PG0_EN		BIT(0)
+#define MTK_WED_WPDMA_INT_CTRL_RRO_PG0_CLR		BIT(1)
+#define MTK_WED_WPDMA_INT_CTRL_RRO_PG0_DONE_TRIG	GENMASK(6, 2)
+#define MTK_WED_WPDMA_INT_CTRL_RRO_PG1_EN		BIT(8)
+#define MTK_WED_WPDMA_INT_CTRL_RRO_PG1_CLR		BIT(9)
+#define MTK_WED_WPDMA_INT_CTRL_RRO_PG1_DONE_TRIG	GENMASK(14, 10)
+#define MTK_WED_WPDMA_INT_CTRL_RRO_PG2_EN		BIT(16)
+#define MTK_WED_WPDMA_INT_CTRL_RRO_PG2_CLR		BIT(17)
+#define MTK_WED_WPDMA_INT_CTRL_RRO_PG2_DONE_TRIG	GENMASK(22, 18)
+
+#define MTK_WED_RX_IND_CMD_CNT0				0xf20
+#define MTK_WED_RX_IND_CMD_DBG_CNT_EN			BIT(31)
+
+#define MTK_WED_RX_IND_CMD_CNT(_n)			(0xf20 + (_n) * 0x4)
+#define MTK_WED_IND_CMD_MAGIC_CNT_FAIL_CNT		GENMASK(15, 0)
+
+#define MTK_WED_RX_ADDR_ELEM_CNT(_n)			(0xf48 + (_n) * 0x4)
+#define MTK_WED_ADDR_ELEM_SIG_FAIL_CNT			GENMASK(15, 0)
+#define MTK_WED_ADDR_ELEM_FIRST_SIG_FAIL_CNT		GENMASK(31, 16)
+#define MTK_WED_ADDR_ELEM_ACKSN_CNT			GENMASK(27, 0)
+
+#define MTK_WED_RX_MSDU_PG_CNT(_n)			(0xf5c + (_n) * 0x4)
+
+#define MTK_WED_RX_PN_CHK_CNT				0xf70
+#define MTK_WED_PN_CHK_FAIL_CNT				GENMASK(15, 0)
+
 #define MTK_WED_WOCPU_VIEW_MIOD_BASE			0x8000
 #define MTK_WED_PCIE_INT_MASK				0x0
 
+#define MTK_WED_PCIE_BASE			0x11280000
+#define MTK_WED_PCIE_BASE0			0x11300000
+#define MTK_WED_PCIE_BASE1			0x11310000
+#define MTK_WED_PCIE_BASE2			0x11290000
 #endif
diff --git a/drivers/net/ethernet/mediatek/mtk_wed_wo.h b/drivers/net/ethernet/mediatek/mtk_wed_wo.h
index 8ed81761bf10..87a67fa3868d 100644
--- a/drivers/net/ethernet/mediatek/mtk_wed_wo.h
+++ b/drivers/net/ethernet/mediatek/mtk_wed_wo.h
@@ -91,6 +91,8 @@ enum mtk_wed_dummy_cr_idx {
 #define MT7981_FIRMWARE_WO	"mediatek/mt7981_wo.bin"
 #define MT7986_FIRMWARE_WO0	"mediatek/mt7986_wo_0.bin"
 #define MT7986_FIRMWARE_WO1	"mediatek/mt7986_wo_1.bin"
+#define MT7988_FIRMWARE_WO0	"mediatek/mt7988_wo_0.bin"
+#define MT7988_FIRMWARE_WO1	"mediatek/mt7988_wo_1.bin"
 
 #define MTK_WO_MCU_CFG_LS_BASE				0
 #define MTK_WO_MCU_CFG_LS_HW_VER_ADDR			(MTK_WO_MCU_CFG_LS_BASE + 0x000)
diff --git a/include/linux/soc/mediatek/mtk_wed.h b/include/linux/soc/mediatek/mtk_wed.h
index 5f00dc26582b..5b096f9f1975 100644
--- a/include/linux/soc/mediatek/mtk_wed.h
+++ b/include/linux/soc/mediatek/mtk_wed.h
@@ -138,6 +138,8 @@ struct mtk_wed_device {
 		u32 wpdma_rx;
 
 		bool wcid_512;
+		bool hw_rro;
+		bool msi;
 
 		u16 token_start;
 		unsigned int nbuf;
@@ -211,10 +213,12 @@ mtk_wed_device_attach(struct mtk_wed_device *dev)
 	return ret;
 }
 
-static inline bool
-mtk_wed_get_rx_capa(struct mtk_wed_device *dev)
+static inline bool mtk_wed_get_rx_capa(struct mtk_wed_device *dev)
 {
 #ifdef CONFIG_NET_MEDIATEK_SOC_WED
+	if (dev->version == 3)
+		return dev->wlan.hw_rro;
+
 	return dev->version != 1;
 #else
 	return false;
-- 
cgit v1.2.3


From b230812b9dda125e69ab0a5a11cda88d9c0d18a9 Mon Sep 17 00:00:00 2001
From: Sujuan Chen <sujuan.chen@mediatek.com>
Date: Mon, 18 Sep 2023 12:29:15 +0200
Subject: net: ethernet: mtk_wed: introduce partial AMSDU offload support for
 MT7988

Introduce partial AMSDU offload support for MT7988 SoC in order to merge
in hw packets belonging to the same AMSDU before passing them to the
WLAN nic.

Co-developed-by: Lorenzo Bianconi <lorenzo@kernel.org>
Signed-off-by: Lorenzo Bianconi <lorenzo@kernel.org>
Signed-off-by: Sujuan Chen <sujuan.chen@mediatek.com>
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
 drivers/net/ethernet/mediatek/mtk_ppe.c         |   4 +-
 drivers/net/ethernet/mediatek/mtk_ppe.h         |  19 +--
 drivers/net/ethernet/mediatek/mtk_ppe_offload.c |   3 +-
 drivers/net/ethernet/mediatek/mtk_wed.c         | 154 +++++++++++++++++++++---
 drivers/net/ethernet/mediatek/mtk_wed.h         |   7 ++
 drivers/net/ethernet/mediatek/mtk_wed_regs.h    |  76 ++++++++++++
 include/linux/netdevice.h                       |   1 +
 include/linux/soc/mediatek/mtk_wed.h            |  12 ++
 8 files changed, 248 insertions(+), 28 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/ethernet/mediatek/mtk_ppe.c b/drivers/net/ethernet/mediatek/mtk_ppe.c
index 86f32f486043..b2a5d9c3733d 100644
--- a/drivers/net/ethernet/mediatek/mtk_ppe.c
+++ b/drivers/net/ethernet/mediatek/mtk_ppe.c
@@ -425,7 +425,8 @@ int mtk_foe_entry_set_pppoe(struct mtk_eth *eth, struct mtk_foe_entry *entry,
 }
 
 int mtk_foe_entry_set_wdma(struct mtk_eth *eth, struct mtk_foe_entry *entry,
-			   int wdma_idx, int txq, int bss, int wcid)
+			   int wdma_idx, int txq, int bss, int wcid,
+			   bool amsdu_en)
 {
 	struct mtk_foe_mac_info *l2 = mtk_foe_entry_l2(eth, entry);
 	u32 *ib2 = mtk_foe_entry_ib2(eth, entry);
@@ -437,6 +438,7 @@ int mtk_foe_entry_set_wdma(struct mtk_eth *eth, struct mtk_foe_entry *entry,
 			 MTK_FOE_IB2_WDMA_WINFO_V2;
 		l2->w3info = FIELD_PREP(MTK_FOE_WINFO_WCID_V3, wcid) |
 			     FIELD_PREP(MTK_FOE_WINFO_BSS_V3, bss);
+		l2->amsdu = FIELD_PREP(MTK_FOE_WINFO_AMSDU_EN, amsdu_en);
 		break;
 	case 2:
 		*ib2 &= ~MTK_FOE_IB2_PORT_MG_V2;
diff --git a/drivers/net/ethernet/mediatek/mtk_ppe.h b/drivers/net/ethernet/mediatek/mtk_ppe.h
index e3d0ec72bc69..691806bca372 100644
--- a/drivers/net/ethernet/mediatek/mtk_ppe.h
+++ b/drivers/net/ethernet/mediatek/mtk_ppe.h
@@ -88,13 +88,13 @@ enum {
 #define MTK_FOE_WINFO_BSS_V3		GENMASK(23, 16)
 #define MTK_FOE_WINFO_WCID_V3		GENMASK(15, 0)
 
-#define MTK_FOE_WINFO_PAO_USR_INFO	GENMASK(15, 0)
-#define MTK_FOE_WINFO_PAO_TID		GENMASK(19, 16)
-#define MTK_FOE_WINFO_PAO_IS_FIXEDRATE	BIT(20)
-#define MTK_FOE_WINFO_PAO_IS_PRIOR	BIT(21)
-#define MTK_FOE_WINFO_PAO_IS_SP		BIT(22)
-#define MTK_FOE_WINFO_PAO_HF		BIT(23)
-#define MTK_FOE_WINFO_PAO_AMSDU_EN	BIT(24)
+#define MTK_FOE_WINFO_AMSDU_USR_INFO	GENMASK(15, 0)
+#define MTK_FOE_WINFO_AMSDU_TID		GENMASK(19, 16)
+#define MTK_FOE_WINFO_AMSDU_IS_FIXEDRATE	BIT(20)
+#define MTK_FOE_WINFO_AMSDU_IS_PRIOR	BIT(21)
+#define MTK_FOE_WINFO_AMSDU_IS_SP	BIT(22)
+#define MTK_FOE_WINFO_AMSDU_HF		BIT(23)
+#define MTK_FOE_WINFO_AMSDU_EN		BIT(24)
 
 enum {
 	MTK_FOE_STATE_INVALID,
@@ -123,7 +123,7 @@ struct mtk_foe_mac_info {
 
 	/* netsys_v3 */
 	u32 w3info;
-	u32 wpao;
+	u32 amsdu;
 };
 
 /* software-only entry type */
@@ -392,7 +392,8 @@ int mtk_foe_entry_set_vlan(struct mtk_eth *eth, struct mtk_foe_entry *entry,
 int mtk_foe_entry_set_pppoe(struct mtk_eth *eth, struct mtk_foe_entry *entry,
 			    int sid);
 int mtk_foe_entry_set_wdma(struct mtk_eth *eth, struct mtk_foe_entry *entry,
-			   int wdma_idx, int txq, int bss, int wcid);
+			   int wdma_idx, int txq, int bss, int wcid,
+			   bool amsdu_en);
 int mtk_foe_entry_set_queue(struct mtk_eth *eth, struct mtk_foe_entry *entry,
 			    unsigned int queue);
 int mtk_foe_entry_commit(struct mtk_ppe *ppe, struct mtk_flow_entry *entry);
diff --git a/drivers/net/ethernet/mediatek/mtk_ppe_offload.c b/drivers/net/ethernet/mediatek/mtk_ppe_offload.c
index 95f76975f258..e073d2b5542c 100644
--- a/drivers/net/ethernet/mediatek/mtk_ppe_offload.c
+++ b/drivers/net/ethernet/mediatek/mtk_ppe_offload.c
@@ -111,6 +111,7 @@ mtk_flow_get_wdma_info(struct net_device *dev, const u8 *addr, struct mtk_wdma_i
 	info->queue = path->mtk_wdma.queue;
 	info->bss = path->mtk_wdma.bss;
 	info->wcid = path->mtk_wdma.wcid;
+	info->amsdu = path->mtk_wdma.amsdu;
 
 	return 0;
 }
@@ -192,7 +193,7 @@ mtk_flow_set_output_device(struct mtk_eth *eth, struct mtk_foe_entry *foe,
 
 	if (mtk_flow_get_wdma_info(dev, dest_mac, &info) == 0) {
 		mtk_foe_entry_set_wdma(eth, foe, info.wdma_idx, info.queue,
-				       info.bss, info.wcid);
+				       info.bss, info.wcid, info.amsdu);
 		if (mtk_is_netsys_v2_or_greater(eth)) {
 			switch (info.wdma_idx) {
 			case 0:
diff --git a/drivers/net/ethernet/mediatek/mtk_wed.c b/drivers/net/ethernet/mediatek/mtk_wed.c
index 18cbf028f6ed..d4b41ccfbad5 100644
--- a/drivers/net/ethernet/mediatek/mtk_wed.c
+++ b/drivers/net/ethernet/mediatek/mtk_wed.c
@@ -30,6 +30,8 @@
 #define MTK_WED_RX_PAGE_BUF_PER_PAGE	(PAGE_SIZE / 128)
 #define MTK_WED_RX_RING_SIZE		1536
 #define MTK_WED_RX_PG_BM_CNT		8192
+#define MTK_WED_AMSDU_BUF_SIZE		(PAGE_SIZE << 4)
+#define MTK_WED_AMSDU_NPAGES		32
 
 #define MTK_WED_TX_RING_SIZE		2048
 #define MTK_WED_WDMA_RING_SIZE		1024
@@ -173,6 +175,23 @@ mtk_wdma_rx_reset(struct mtk_wed_device *dev)
 	return ret;
 }
 
+static u32
+mtk_wed_check_busy(struct mtk_wed_device *dev, u32 reg, u32 mask)
+{
+	return !!(wed_r32(dev, reg) & mask);
+}
+
+static int
+mtk_wed_poll_busy(struct mtk_wed_device *dev, u32 reg, u32 mask)
+{
+	int sleep = 15000;
+	int timeout = 100 * sleep;
+	u32 val;
+
+	return read_poll_timeout(mtk_wed_check_busy, val, !val, sleep,
+				 timeout, false, dev, reg, mask);
+}
+
 static void
 mtk_wdma_tx_reset(struct mtk_wed_device *dev)
 {
@@ -335,6 +354,118 @@ out:
 	return hw;
 }
 
+static int
+mtk_wed_amsdu_buffer_alloc(struct mtk_wed_device *dev)
+{
+	struct mtk_wed_hw *hw = dev->hw;
+	struct mtk_wed_amsdu *wed_amsdu;
+	int i;
+
+	if (!mtk_wed_is_v3_or_greater(hw))
+		return 0;
+
+	wed_amsdu = devm_kcalloc(hw->dev, MTK_WED_AMSDU_NPAGES,
+				 sizeof(*wed_amsdu), GFP_KERNEL);
+	if (!wed_amsdu)
+		return -ENOMEM;
+
+	for (i = 0; i < MTK_WED_AMSDU_NPAGES; i++) {
+		void *ptr;
+
+		/* each segment is 64K */
+		ptr = (void *)__get_free_pages(GFP_KERNEL | __GFP_NOWARN |
+					       __GFP_ZERO | __GFP_COMP |
+					       GFP_DMA32,
+					       get_order(MTK_WED_AMSDU_BUF_SIZE));
+		if (!ptr)
+			goto error;
+
+		wed_amsdu[i].txd = ptr;
+		wed_amsdu[i].txd_phy = dma_map_single(hw->dev, ptr,
+						      MTK_WED_AMSDU_BUF_SIZE,
+						      DMA_TO_DEVICE);
+		if (dma_mapping_error(hw->dev, wed_amsdu[i].txd_phy))
+			goto error;
+	}
+	dev->hw->wed_amsdu = wed_amsdu;
+
+	return 0;
+
+error:
+	for (i--; i >= 0; i--)
+		dma_unmap_single(hw->dev, wed_amsdu[i].txd_phy,
+				 MTK_WED_AMSDU_BUF_SIZE, DMA_TO_DEVICE);
+	return -ENOMEM;
+}
+
+static void
+mtk_wed_amsdu_free_buffer(struct mtk_wed_device *dev)
+{
+	struct mtk_wed_amsdu *wed_amsdu = dev->hw->wed_amsdu;
+	int i;
+
+	if (!wed_amsdu)
+		return;
+
+	for (i = 0; i < MTK_WED_AMSDU_NPAGES; i++) {
+		dma_unmap_single(dev->hw->dev, wed_amsdu[i].txd_phy,
+				 MTK_WED_AMSDU_BUF_SIZE, DMA_TO_DEVICE);
+		free_pages((unsigned long)wed_amsdu[i].txd,
+			   get_order(MTK_WED_AMSDU_BUF_SIZE));
+	}
+}
+
+static int
+mtk_wed_amsdu_init(struct mtk_wed_device *dev)
+{
+	struct mtk_wed_amsdu *wed_amsdu = dev->hw->wed_amsdu;
+	int i, ret;
+
+	if (!wed_amsdu)
+		return 0;
+
+	for (i = 0; i < MTK_WED_AMSDU_NPAGES; i++)
+		wed_w32(dev, MTK_WED_AMSDU_HIFTXD_BASE_L(i),
+			wed_amsdu[i].txd_phy);
+
+	/* init all sta parameter */
+	wed_w32(dev, MTK_WED_AMSDU_STA_INFO_INIT, MTK_WED_AMSDU_STA_RMVL |
+		MTK_WED_AMSDU_STA_WTBL_HDRT_MODE |
+		FIELD_PREP(MTK_WED_AMSDU_STA_MAX_AMSDU_LEN,
+			   dev->wlan.amsdu_max_len >> 8) |
+		FIELD_PREP(MTK_WED_AMSDU_STA_MAX_AMSDU_NUM,
+			   dev->wlan.amsdu_max_subframes));
+
+	wed_w32(dev, MTK_WED_AMSDU_STA_INFO, MTK_WED_AMSDU_STA_INFO_DO_INIT);
+
+	ret = mtk_wed_poll_busy(dev, MTK_WED_AMSDU_STA_INFO,
+				MTK_WED_AMSDU_STA_INFO_DO_INIT);
+	if (ret) {
+		dev_err(dev->hw->dev, "amsdu initialization failed\n");
+		return ret;
+	}
+
+	/* init partial amsdu offload txd src */
+	wed_set(dev, MTK_WED_AMSDU_HIFTXD_CFG,
+		FIELD_PREP(MTK_WED_AMSDU_HIFTXD_SRC, dev->hw->index));
+
+	/* init qmem */
+	wed_set(dev, MTK_WED_AMSDU_PSE, MTK_WED_AMSDU_PSE_RESET);
+	ret = mtk_wed_poll_busy(dev, MTK_WED_MON_AMSDU_QMEM_STS1, BIT(29));
+	if (ret) {
+		pr_info("%s: amsdu qmem initialization failed\n", __func__);
+		return ret;
+	}
+
+	/* eagle E1 PCIE1 tx ring 22 flow control issue */
+	if (dev->wlan.id == 0x7991)
+		wed_clr(dev, MTK_WED_AMSDU_FIFO, MTK_WED_AMSDU_IS_PRIOR0_RING);
+
+	wed_set(dev, MTK_WED_CTRL, MTK_WED_CTRL_TX_AMSDU_EN);
+
+	return 0;
+}
+
 static int
 mtk_wed_tx_buffer_alloc(struct mtk_wed_device *dev)
 {
@@ -709,6 +840,7 @@ __mtk_wed_detach(struct mtk_wed_device *dev)
 
 	mtk_wdma_rx_reset(dev);
 	mtk_wed_reset(dev, MTK_WED_RESET_WED);
+	mtk_wed_amsdu_free_buffer(dev);
 	mtk_wed_free_tx_buffer(dev);
 	mtk_wed_free_tx_rings(dev);
 
@@ -1129,23 +1261,6 @@ mtk_wed_ring_reset(struct mtk_wed_ring *ring, int size, bool tx)
 	}
 }
 
-static u32
-mtk_wed_check_busy(struct mtk_wed_device *dev, u32 reg, u32 mask)
-{
-	return !!(wed_r32(dev, reg) & mask);
-}
-
-static int
-mtk_wed_poll_busy(struct mtk_wed_device *dev, u32 reg, u32 mask)
-{
-	int sleep = 15000;
-	int timeout = 100 * sleep;
-	u32 val;
-
-	return read_poll_timeout(mtk_wed_check_busy, val, !val, sleep,
-				 timeout, false, dev, reg, mask);
-}
-
 static int
 mtk_wed_rx_reset(struct mtk_wed_device *dev)
 {
@@ -1692,6 +1807,7 @@ mtk_wed_start(struct mtk_wed_device *dev, u32 irq_mask)
 	}
 
 	mtk_wed_set_512_support(dev, dev->wlan.wcid_512);
+	mtk_wed_amsdu_init(dev);
 
 	mtk_wed_dma_enable(dev);
 	dev->running = true;
@@ -1748,6 +1864,10 @@ mtk_wed_attach(struct mtk_wed_device *dev)
 	if (ret)
 		goto out;
 
+	ret = mtk_wed_amsdu_buffer_alloc(dev);
+	if (ret)
+		goto out;
+
 	if (mtk_wed_get_rx_capa(dev)) {
 		ret = mtk_wed_rro_alloc(dev);
 		if (ret)
diff --git a/drivers/net/ethernet/mediatek/mtk_wed.h b/drivers/net/ethernet/mediatek/mtk_wed.h
index 27d336db4d4d..c1f0479d7a71 100644
--- a/drivers/net/ethernet/mediatek/mtk_wed.h
+++ b/drivers/net/ethernet/mediatek/mtk_wed.h
@@ -25,6 +25,11 @@ struct mtk_wed_soc_data {
 	u32 wdma_desc_size;
 };
 
+struct mtk_wed_amsdu {
+	void *txd;
+	dma_addr_t txd_phy;
+};
+
 struct mtk_wed_hw {
 	const struct mtk_wed_soc_data *soc;
 	struct device_node *node;
@@ -38,6 +43,7 @@ struct mtk_wed_hw {
 	struct dentry *debugfs_dir;
 	struct mtk_wed_device *wed_dev;
 	struct mtk_wed_wo *wed_wo;
+	struct mtk_wed_amsdu *wed_amsdu;
 	u32 pcie_base;
 	u32 debugfs_reg;
 	u32 num_flows;
@@ -52,6 +58,7 @@ struct mtk_wdma_info {
 	u8 queue;
 	u16 wcid;
 	u8 bss;
+	u8 amsdu;
 };
 
 #ifdef CONFIG_NET_MEDIATEK_SOC_WED
diff --git a/drivers/net/ethernet/mediatek/mtk_wed_regs.h b/drivers/net/ethernet/mediatek/mtk_wed_regs.h
index a4d3cf64d090..5a7e4a11a54e 100644
--- a/drivers/net/ethernet/mediatek/mtk_wed_regs.h
+++ b/drivers/net/ethernet/mediatek/mtk_wed_regs.h
@@ -672,6 +672,82 @@ struct mtk_wdma_desc {
 #define MTK_WED_WOCPU_VIEW_MIOD_BASE			0x8000
 #define MTK_WED_PCIE_INT_MASK				0x0
 
+#define MTK_WED_AMSDU_FIFO				0x1800
+#define MTK_WED_AMSDU_IS_PRIOR0_RING			BIT(10)
+
+#define MTK_WED_AMSDU_STA_INFO				0x01810
+#define MTK_WED_AMSDU_STA_INFO_DO_INIT			BIT(0)
+#define MTK_WED_AMSDU_STA_INFO_SET_INIT			BIT(1)
+
+#define MTK_WED_AMSDU_STA_INFO_INIT			0x01814
+#define MTK_WED_AMSDU_STA_WTBL_HDRT_MODE		BIT(0)
+#define MTK_WED_AMSDU_STA_RMVL				BIT(1)
+#define MTK_WED_AMSDU_STA_MAX_AMSDU_LEN			GENMASK(7, 2)
+#define MTK_WED_AMSDU_STA_MAX_AMSDU_NUM			GENMASK(11, 8)
+
+#define MTK_WED_AMSDU_HIFTXD_BASE_L(_n)			(0x1980 + (_n) * 0x4)
+
+#define MTK_WED_AMSDU_PSE				0x1910
+#define MTK_WED_AMSDU_PSE_RESET				BIT(16)
+
+#define MTK_WED_AMSDU_HIFTXD_CFG			0x1968
+#define MTK_WED_AMSDU_HIFTXD_SRC			GENMASK(16, 15)
+
+#define MTK_WED_MON_AMSDU_FIFO_DMAD			0x1a34
+
+#define MTK_WED_MON_AMSDU_ENG_DMAD(_n)			(0x1a80 + (_n) * 0x50)
+#define MTK_WED_MON_AMSDU_ENG_QFPL(_n)			(0x1a84 + (_n) * 0x50)
+#define MTK_WED_MON_AMSDU_ENG_QENI(_n)			(0x1a88 + (_n) * 0x50)
+#define MTK_WED_MON_AMSDU_ENG_QENO(_n)			(0x1a8c + (_n) * 0x50)
+#define MTK_WED_MON_AMSDU_ENG_MERG(_n)			(0x1a90 + (_n) * 0x50)
+
+#define MTK_WED_MON_AMSDU_ENG_CNT8(_n)			(0x1a94 + (_n) * 0x50)
+#define MTK_WED_AMSDU_ENG_MAX_QGPP_CNT			GENMASK(10, 0)
+#define MTK_WED_AMSDU_ENG_MAX_PL_CNT			GENMASK(27, 16)
+
+#define MTK_WED_MON_AMSDU_ENG_CNT9(_n)			(0x1a98 + (_n) * 0x50)
+#define MTK_WED_AMSDU_ENG_CUR_ENTRY			GENMASK(10, 0)
+#define MTK_WED_AMSDU_ENG_MAX_BUF_MERGED		GENMASK(20, 16)
+#define MTK_WED_AMSDU_ENG_MAX_MSDU_MERGED		GENMASK(28, 24)
+
+#define MTK_WED_MON_AMSDU_QMEM_STS1			0x1e04
+
+#define MTK_WED_MON_AMSDU_QMEM_CNT(_n)			(0x1e0c + (_n) * 0x4)
+#define MTK_WED_AMSDU_QMEM_FQ_CNT			GENMASK(27, 16)
+#define MTK_WED_AMSDU_QMEM_SP_QCNT			GENMASK(11, 0)
+#define MTK_WED_AMSDU_QMEM_TID0_QCNT			GENMASK(27, 16)
+#define MTK_WED_AMSDU_QMEM_TID1_QCNT			GENMASK(11, 0)
+#define MTK_WED_AMSDU_QMEM_TID2_QCNT			GENMASK(27, 16)
+#define MTK_WED_AMSDU_QMEM_TID3_QCNT			GENMASK(11, 0)
+#define MTK_WED_AMSDU_QMEM_TID4_QCNT			GENMASK(27, 16)
+#define MTK_WED_AMSDU_QMEM_TID5_QCNT			GENMASK(11, 0)
+#define MTK_WED_AMSDU_QMEM_TID6_QCNT			GENMASK(27, 16)
+#define MTK_WED_AMSDU_QMEM_TID7_QCNT			GENMASK(11, 0)
+
+#define MTK_WED_MON_AMSDU_QMEM_PTR(_n)			(0x1e20 + (_n) * 0x4)
+#define MTK_WED_AMSDU_QMEM_FQ_HEAD			GENMASK(27, 16)
+#define MTK_WED_AMSDU_QMEM_SP_QHEAD			GENMASK(11, 0)
+#define MTK_WED_AMSDU_QMEM_TID0_QHEAD			GENMASK(27, 16)
+#define MTK_WED_AMSDU_QMEM_TID1_QHEAD			GENMASK(11, 0)
+#define MTK_WED_AMSDU_QMEM_TID2_QHEAD			GENMASK(27, 16)
+#define MTK_WED_AMSDU_QMEM_TID3_QHEAD			GENMASK(11, 0)
+#define MTK_WED_AMSDU_QMEM_TID4_QHEAD			GENMASK(27, 16)
+#define MTK_WED_AMSDU_QMEM_TID5_QHEAD			GENMASK(11, 0)
+#define MTK_WED_AMSDU_QMEM_TID6_QHEAD			GENMASK(27, 16)
+#define MTK_WED_AMSDU_QMEM_TID7_QHEAD			GENMASK(11, 0)
+#define MTK_WED_AMSDU_QMEM_FQ_TAIL			GENMASK(27, 16)
+#define MTK_WED_AMSDU_QMEM_SP_QTAIL			GENMASK(11, 0)
+#define MTK_WED_AMSDU_QMEM_TID0_QTAIL			GENMASK(27, 16)
+#define MTK_WED_AMSDU_QMEM_TID1_QTAIL			GENMASK(11, 0)
+#define MTK_WED_AMSDU_QMEM_TID2_QTAIL			GENMASK(27, 16)
+#define MTK_WED_AMSDU_QMEM_TID3_QTAIL			GENMASK(11, 0)
+#define MTK_WED_AMSDU_QMEM_TID4_QTAIL			GENMASK(27, 16)
+#define MTK_WED_AMSDU_QMEM_TID5_QTAIL			GENMASK(11, 0)
+#define MTK_WED_AMSDU_QMEM_TID6_QTAIL			GENMASK(27, 16)
+#define MTK_WED_AMSDU_QMEM_TID7_QTAIL			GENMASK(11, 0)
+
+#define MTK_WED_MON_AMSDU_HIFTXD_FETCH_MSDU(_n)		(0x1ec4 + (_n) * 0x4)
+
 #define MTK_WED_PCIE_BASE			0x11280000
 #define MTK_WED_PCIE_BASE0			0x11300000
 #define MTK_WED_PCIE_BASE1			0x11310000
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index db3d8429d50d..7e520c14eb8c 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -919,6 +919,7 @@ struct net_device_path {
 			u8 queue;
 			u16 wcid;
 			u8 bss;
+			u8 amsdu;
 		} mtk_wdma;
 	};
 };
diff --git a/include/linux/soc/mediatek/mtk_wed.h b/include/linux/soc/mediatek/mtk_wed.h
index 5b096f9f1975..90d9c9ead3bc 100644
--- a/include/linux/soc/mediatek/mtk_wed.h
+++ b/include/linux/soc/mediatek/mtk_wed.h
@@ -128,6 +128,7 @@ struct mtk_wed_device {
 		enum mtk_wed_bus_tye bus_type;
 		void __iomem *base;
 		u32 phy_base;
+		u32 id;
 
 		u32 wpdma_phys;
 		u32 wpdma_int;
@@ -146,10 +147,12 @@ struct mtk_wed_device {
 		unsigned int rx_nbuf;
 		unsigned int rx_npkt;
 		unsigned int rx_size;
+		unsigned int amsdu_max_len;
 
 		u8 tx_tbit[MTK_WED_TX_QUEUES];
 		u8 rx_tbit[MTK_WED_RX_QUEUES];
 		u8 txfree_tbit;
+		u8 amsdu_max_subframes;
 
 		u32 (*init_buf)(void *ptr, dma_addr_t phys, int token_id);
 		int (*offload_enable)(struct mtk_wed_device *wed);
@@ -225,6 +228,15 @@ static inline bool mtk_wed_get_rx_capa(struct mtk_wed_device *dev)
 #endif
 }
 
+static inline bool mtk_wed_is_amsdu_supported(struct mtk_wed_device *dev)
+{
+#ifdef CONFIG_NET_MEDIATEK_SOC_WED
+	return dev->version == 3;
+#else
+	return false;
+#endif
+}
+
 #ifdef CONFIG_NET_MEDIATEK_SOC_WED
 #define mtk_wed_device_active(_dev) !!(_dev)->ops
 #define mtk_wed_device_detach(_dev) (_dev)->ops->detach(_dev)
-- 
cgit v1.2.3


From 6757d345dd7dba795f5af44d4442d55a83c4b1b4 Mon Sep 17 00:00:00 2001
From: Sujuan Chen <sujuan.chen@mediatek.com>
Date: Mon, 18 Sep 2023 12:29:16 +0200
Subject: net: ethernet: mtk_wed: introduce hw_rro support for MT7988

MT7988 SoC support 802.11 receive reordering offload in hw while
MT7986 SoC implements it through the firmware running on the mcu.

Co-developed-by: Lorenzo Bianconi <lorenzo@kernel.org>
Signed-off-by: Lorenzo Bianconi <lorenzo@kernel.org>
Signed-off-by: Sujuan Chen <sujuan.chen@mediatek.com>
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
 drivers/net/ethernet/mediatek/mtk_wed.c | 299 +++++++++++++++++++++++++++++++-
 include/linux/soc/mediatek/mtk_wed.h    |  45 +++++
 2 files changed, 342 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/ethernet/mediatek/mtk_wed.c b/drivers/net/ethernet/mediatek/mtk_wed.c
index d4b41ccfbad5..2a0be1f2d43e 100644
--- a/drivers/net/ethernet/mediatek/mtk_wed.c
+++ b/drivers/net/ethernet/mediatek/mtk_wed.c
@@ -27,7 +27,7 @@
 #define MTK_WED_BUF_SIZE		2048
 #define MTK_WED_PAGE_BUF_SIZE		128
 #define MTK_WED_BUF_PER_PAGE		(PAGE_SIZE / 2048)
-#define MTK_WED_RX_PAGE_BUF_PER_PAGE	(PAGE_SIZE / 128)
+#define MTK_WED_RX_BUF_PER_PAGE		(PAGE_SIZE / MTK_WED_PAGE_BUF_SIZE)
 #define MTK_WED_RX_RING_SIZE		1536
 #define MTK_WED_RX_PG_BM_CNT		8192
 #define MTK_WED_AMSDU_BUF_SIZE		(PAGE_SIZE << 4)
@@ -596,6 +596,68 @@ free_pagelist:
 	kfree(page_list);
 }
 
+static int
+mtk_wed_hwrro_buffer_alloc(struct mtk_wed_device *dev)
+{
+	int n_pages = MTK_WED_RX_PG_BM_CNT / MTK_WED_RX_BUF_PER_PAGE;
+	struct mtk_wed_buf *page_list;
+	struct mtk_wed_bm_desc *desc;
+	dma_addr_t desc_phys;
+	int i, page_idx = 0;
+
+	if (!dev->wlan.hw_rro)
+		return 0;
+
+	page_list = kcalloc(n_pages, sizeof(*page_list), GFP_KERNEL);
+	if (!page_list)
+		return -ENOMEM;
+
+	dev->hw_rro.size = dev->wlan.rx_nbuf & ~(MTK_WED_BUF_PER_PAGE - 1);
+	dev->hw_rro.pages = page_list;
+	desc = dma_alloc_coherent(dev->hw->dev,
+				  dev->wlan.rx_nbuf * sizeof(*desc),
+				  &desc_phys, GFP_KERNEL);
+	if (!desc)
+		return -ENOMEM;
+
+	dev->hw_rro.desc = desc;
+	dev->hw_rro.desc_phys = desc_phys;
+
+	for (i = 0; i < MTK_WED_RX_PG_BM_CNT; i += MTK_WED_RX_BUF_PER_PAGE) {
+		dma_addr_t page_phys, buf_phys;
+		struct page *page;
+		int s;
+
+		page = __dev_alloc_page(GFP_KERNEL);
+		if (!page)
+			return -ENOMEM;
+
+		page_phys = dma_map_page(dev->hw->dev, page, 0, PAGE_SIZE,
+					 DMA_BIDIRECTIONAL);
+		if (dma_mapping_error(dev->hw->dev, page_phys)) {
+			__free_page(page);
+			return -ENOMEM;
+		}
+
+		page_list[page_idx].p = page;
+		page_list[page_idx++].phy_addr = page_phys;
+		dma_sync_single_for_cpu(dev->hw->dev, page_phys, PAGE_SIZE,
+					DMA_BIDIRECTIONAL);
+
+		buf_phys = page_phys;
+		for (s = 0; s < MTK_WED_RX_BUF_PER_PAGE; s++) {
+			desc->buf0 = cpu_to_le32(buf_phys);
+			buf_phys += MTK_WED_PAGE_BUF_SIZE;
+			desc++;
+		}
+
+		dma_sync_single_for_device(dev->hw->dev, page_phys, PAGE_SIZE,
+					   DMA_BIDIRECTIONAL);
+	}
+
+	return 0;
+}
+
 static int
 mtk_wed_rx_buffer_alloc(struct mtk_wed_device *dev)
 {
@@ -613,7 +675,42 @@ mtk_wed_rx_buffer_alloc(struct mtk_wed_device *dev)
 	dev->rx_buf_ring.desc_phys = desc_phys;
 	dev->wlan.init_rx_buf(dev, dev->wlan.rx_npkt);
 
-	return 0;
+	return mtk_wed_hwrro_buffer_alloc(dev);
+}
+
+static void
+mtk_wed_hwrro_free_buffer(struct mtk_wed_device *dev)
+{
+	struct mtk_wed_buf *page_list = dev->hw_rro.pages;
+	struct mtk_wed_bm_desc *desc = dev->hw_rro.desc;
+	int i, page_idx = 0;
+
+	if (!dev->wlan.hw_rro)
+		return;
+
+	if (!page_list)
+		return;
+
+	if (!desc)
+		goto free_pagelist;
+
+	for (i = 0; i < MTK_WED_RX_PG_BM_CNT; i += MTK_WED_RX_BUF_PER_PAGE) {
+		dma_addr_t buf_addr = page_list[page_idx].phy_addr;
+		void *page = page_list[page_idx++].p;
+
+		if (!page)
+			break;
+
+		dma_unmap_page(dev->hw->dev, buf_addr, PAGE_SIZE,
+			       DMA_BIDIRECTIONAL);
+		__free_page(page);
+	}
+
+	dma_free_coherent(dev->hw->dev, dev->hw_rro.size * sizeof(*desc),
+			  desc, dev->hw_rro.desc_phys);
+
+free_pagelist:
+	kfree(page_list);
 }
 
 static void
@@ -627,6 +724,28 @@ mtk_wed_free_rx_buffer(struct mtk_wed_device *dev)
 	dev->wlan.release_rx_buf(dev);
 	dma_free_coherent(dev->hw->dev, dev->rx_buf_ring.size * sizeof(*desc),
 			  desc, dev->rx_buf_ring.desc_phys);
+
+	mtk_wed_hwrro_free_buffer(dev);
+}
+
+static void
+mtk_wed_hwrro_init(struct mtk_wed_device *dev)
+{
+	if (!mtk_wed_get_rx_capa(dev) || !dev->wlan.hw_rro)
+		return;
+
+	wed_set(dev, MTK_WED_RRO_PG_BM_RX_DMAM,
+		FIELD_PREP(MTK_WED_RRO_PG_BM_RX_SDL0, 128));
+
+	wed_w32(dev, MTK_WED_RRO_PG_BM_BASE, dev->hw_rro.desc_phys);
+
+	wed_w32(dev, MTK_WED_RRO_PG_BM_INIT_PTR,
+		MTK_WED_RRO_PG_BM_INIT_SW_TAIL_IDX |
+		FIELD_PREP(MTK_WED_RRO_PG_BM_SW_TAIL_IDX,
+			   MTK_WED_RX_PG_BM_CNT));
+
+	/* enable rx_page_bm to fetch dmad */
+	wed_set(dev, MTK_WED_CTRL, MTK_WED_CTRL_WED_RX_PG_BM_EN);
 }
 
 static void
@@ -640,6 +759,8 @@ mtk_wed_rx_buffer_hw_init(struct mtk_wed_device *dev)
 	wed_w32(dev, MTK_WED_RX_BM_DYN_ALLOC_TH,
 		FIELD_PREP(MTK_WED_RX_BM_DYN_ALLOC_TH_H, 0xffff));
 	wed_set(dev, MTK_WED_CTRL, MTK_WED_CTRL_WED_RX_BM_EN);
+
+	mtk_wed_hwrro_init(dev);
 }
 
 static void
@@ -935,6 +1056,8 @@ mtk_wed_bus_init(struct mtk_wed_device *dev)
 static void
 mtk_wed_set_wpdma(struct mtk_wed_device *dev)
 {
+	int i;
+
 	if (mtk_wed_is_v1(dev->hw)) {
 		wed_w32(dev, MTK_WED_WPDMA_CFG_BASE,  dev->wlan.wpdma_phys);
 		return;
@@ -952,6 +1075,15 @@ mtk_wed_set_wpdma(struct mtk_wed_device *dev)
 
 	wed_w32(dev, MTK_WED_WPDMA_RX_GLO_CFG, dev->wlan.wpdma_rx_glo);
 	wed_w32(dev, dev->hw->soc->regmap.wpdma_rx_ring0, dev->wlan.wpdma_rx);
+
+	if (!dev->wlan.hw_rro)
+		return;
+
+	wed_w32(dev, MTK_WED_RRO_RX_D_CFG(0), dev->wlan.wpdma_rx_rro[0]);
+	wed_w32(dev, MTK_WED_RRO_RX_D_CFG(1), dev->wlan.wpdma_rx_rro[1]);
+	for (i = 0; i < MTK_WED_RX_PAGE_QUEUES; i++)
+		wed_w32(dev, MTK_WED_RRO_MSDU_PG_RING_CFG(i),
+			dev->wlan.wpdma_rx_pg + i * 0x10);
 }
 
 static void
@@ -1762,6 +1894,165 @@ mtk_wed_dma_enable(struct mtk_wed_device *dev)
 	}
 }
 
+static void
+mtk_wed_start_hw_rro(struct mtk_wed_device *dev, u32 irq_mask, bool reset)
+{
+	int i;
+
+	wed_w32(dev, MTK_WED_WPDMA_INT_MASK, irq_mask);
+	wed_w32(dev, MTK_WED_INT_MASK, irq_mask);
+
+	if (!mtk_wed_get_rx_capa(dev) || !dev->wlan.hw_rro)
+		return;
+
+	wed_set(dev, MTK_WED_RRO_RX_D_CFG(2), MTK_WED_RRO_MSDU_PG_DRV_CLR);
+	wed_w32(dev, MTK_WED_RRO_MSDU_PG_RING2_CFG,
+		MTK_WED_RRO_MSDU_PG_DRV_CLR);
+
+	wed_w32(dev, MTK_WED_WPDMA_INT_CTRL_RRO_RX,
+		MTK_WED_WPDMA_INT_CTRL_RRO_RX0_EN |
+		MTK_WED_WPDMA_INT_CTRL_RRO_RX0_CLR |
+		MTK_WED_WPDMA_INT_CTRL_RRO_RX1_EN |
+		MTK_WED_WPDMA_INT_CTRL_RRO_RX1_CLR |
+		FIELD_PREP(MTK_WED_WPDMA_INT_CTRL_RRO_RX0_DONE_TRIG,
+			   dev->wlan.rro_rx_tbit[0]) |
+		FIELD_PREP(MTK_WED_WPDMA_INT_CTRL_RRO_RX1_DONE_TRIG,
+			   dev->wlan.rro_rx_tbit[1]));
+
+	wed_w32(dev, MTK_WED_WPDMA_INT_CTRL_RRO_MSDU_PG,
+		MTK_WED_WPDMA_INT_CTRL_RRO_PG0_EN |
+		MTK_WED_WPDMA_INT_CTRL_RRO_PG0_CLR |
+		MTK_WED_WPDMA_INT_CTRL_RRO_PG1_EN |
+		MTK_WED_WPDMA_INT_CTRL_RRO_PG1_CLR |
+		MTK_WED_WPDMA_INT_CTRL_RRO_PG2_EN |
+		MTK_WED_WPDMA_INT_CTRL_RRO_PG2_CLR |
+		FIELD_PREP(MTK_WED_WPDMA_INT_CTRL_RRO_PG0_DONE_TRIG,
+			   dev->wlan.rx_pg_tbit[0]) |
+		FIELD_PREP(MTK_WED_WPDMA_INT_CTRL_RRO_PG1_DONE_TRIG,
+			   dev->wlan.rx_pg_tbit[1]) |
+		FIELD_PREP(MTK_WED_WPDMA_INT_CTRL_RRO_PG2_DONE_TRIG,
+			   dev->wlan.rx_pg_tbit[2]));
+
+	/* RRO_MSDU_PG_RING2_CFG1_FLD_DRV_EN should be enabled after
+	 * WM FWDL completed, otherwise RRO_MSDU_PG ring may broken
+	 */
+	wed_set(dev, MTK_WED_RRO_MSDU_PG_RING2_CFG,
+		MTK_WED_RRO_MSDU_PG_DRV_EN);
+
+	for (i = 0; i < MTK_WED_RX_QUEUES; i++) {
+		struct mtk_wed_ring *ring = &dev->rx_rro_ring[i];
+
+		if (!(ring->flags & MTK_WED_RING_CONFIGURED))
+			continue;
+
+		if (mtk_wed_check_wfdma_rx_fill(dev, ring))
+			dev_err(dev->hw->dev,
+				"rx_rro_ring(%d) initialization failed\n", i);
+	}
+
+	for (i = 0; i < MTK_WED_RX_PAGE_QUEUES; i++) {
+		struct mtk_wed_ring *ring = &dev->rx_page_ring[i];
+
+		if (!(ring->flags & MTK_WED_RING_CONFIGURED))
+			continue;
+
+		if (mtk_wed_check_wfdma_rx_fill(dev, ring))
+			dev_err(dev->hw->dev,
+				"rx_page_ring(%d) initialization failed\n", i);
+	}
+}
+
+static void
+mtk_wed_rro_rx_ring_setup(struct mtk_wed_device *dev, int idx,
+			  void __iomem *regs)
+{
+	struct mtk_wed_ring *ring = &dev->rx_rro_ring[idx];
+
+	ring->wpdma = regs;
+	wed_w32(dev, MTK_WED_RRO_RX_D_RX(idx) + MTK_WED_RING_OFS_BASE,
+		readl(regs));
+	wed_w32(dev, MTK_WED_RRO_RX_D_RX(idx) + MTK_WED_RING_OFS_COUNT,
+		readl(regs + MTK_WED_RING_OFS_COUNT));
+	ring->flags |= MTK_WED_RING_CONFIGURED;
+}
+
+static void
+mtk_wed_msdu_pg_rx_ring_setup(struct mtk_wed_device *dev, int idx, void __iomem *regs)
+{
+	struct mtk_wed_ring *ring = &dev->rx_page_ring[idx];
+
+	ring->wpdma = regs;
+	wed_w32(dev, MTK_WED_RRO_MSDU_PG_CTRL0(idx) + MTK_WED_RING_OFS_BASE,
+		readl(regs));
+	wed_w32(dev, MTK_WED_RRO_MSDU_PG_CTRL0(idx) + MTK_WED_RING_OFS_COUNT,
+		readl(regs + MTK_WED_RING_OFS_COUNT));
+	ring->flags |= MTK_WED_RING_CONFIGURED;
+}
+
+static int
+mtk_wed_ind_rx_ring_setup(struct mtk_wed_device *dev, void __iomem *regs)
+{
+	struct mtk_wed_ring *ring = &dev->ind_cmd_ring;
+	u32 val = readl(regs + MTK_WED_RING_OFS_COUNT);
+	int i, count = 0;
+
+	ring->wpdma = regs;
+	wed_w32(dev, MTK_WED_IND_CMD_RX_CTRL1 + MTK_WED_RING_OFS_BASE,
+		readl(regs) & 0xfffffff0);
+
+	wed_w32(dev, MTK_WED_IND_CMD_RX_CTRL1 + MTK_WED_RING_OFS_COUNT,
+		readl(regs + MTK_WED_RING_OFS_COUNT));
+
+	/* ack sn cr */
+	wed_w32(dev, MTK_WED_RRO_CFG0, dev->wlan.phy_base +
+		dev->wlan.ind_cmd.ack_sn_addr);
+	wed_w32(dev, MTK_WED_RRO_CFG1,
+		FIELD_PREP(MTK_WED_RRO_CFG1_MAX_WIN_SZ,
+			   dev->wlan.ind_cmd.win_size) |
+		FIELD_PREP(MTK_WED_RRO_CFG1_PARTICL_SE_ID,
+			   dev->wlan.ind_cmd.particular_sid));
+
+	/* particular session addr element */
+	wed_w32(dev, MTK_WED_ADDR_ELEM_CFG0,
+		dev->wlan.ind_cmd.particular_se_phys);
+
+	for (i = 0; i < dev->wlan.ind_cmd.se_group_nums; i++) {
+		wed_w32(dev, MTK_WED_RADDR_ELEM_TBL_WDATA,
+			dev->wlan.ind_cmd.addr_elem_phys[i] >> 4);
+		wed_w32(dev, MTK_WED_ADDR_ELEM_TBL_CFG,
+			MTK_WED_ADDR_ELEM_TBL_WR | (i & 0x7f));
+
+		val = wed_r32(dev, MTK_WED_ADDR_ELEM_TBL_CFG);
+		while (!(val & MTK_WED_ADDR_ELEM_TBL_WR_RDY) && count++ < 100)
+			val = wed_r32(dev, MTK_WED_ADDR_ELEM_TBL_CFG);
+		if (count >= 100)
+			dev_err(dev->hw->dev,
+				"write ba session base failed\n");
+	}
+
+	/* pn check init */
+	for (i = 0; i < dev->wlan.ind_cmd.particular_sid; i++) {
+		wed_w32(dev, MTK_WED_PN_CHECK_WDATA_M,
+			MTK_WED_PN_CHECK_IS_FIRST);
+
+		wed_w32(dev, MTK_WED_PN_CHECK_CFG, MTK_WED_PN_CHECK_WR |
+			FIELD_PREP(MTK_WED_PN_CHECK_SE_ID, i));
+
+		count = 0;
+		val = wed_r32(dev, MTK_WED_PN_CHECK_CFG);
+		while (!(val & MTK_WED_PN_CHECK_WR_RDY) && count++ < 100)
+			val = wed_r32(dev, MTK_WED_PN_CHECK_CFG);
+		if (count >= 100)
+			dev_err(dev->hw->dev,
+				"session(%d) initialization failed\n", i);
+	}
+
+	wed_w32(dev, MTK_WED_RX_IND_CMD_CNT0, MTK_WED_RX_IND_CMD_DBG_CNT_EN);
+	wed_set(dev, MTK_WED_CTRL, MTK_WED_CTRL_WED_RX_IND_CMD_EN);
+
+	return 0;
+}
+
 static void
 mtk_wed_start(struct mtk_wed_device *dev, u32 irq_mask)
 {
@@ -2216,6 +2507,10 @@ void mtk_wed_add_hw(struct device_node *np, struct mtk_eth *eth,
 		.detach = mtk_wed_detach,
 		.ppe_check = mtk_wed_ppe_check,
 		.setup_tc = mtk_wed_setup_tc,
+		.start_hw_rro = mtk_wed_start_hw_rro,
+		.rro_rx_ring_setup = mtk_wed_rro_rx_ring_setup,
+		.msdu_pg_rx_ring_setup = mtk_wed_msdu_pg_rx_ring_setup,
+		.ind_rx_ring_setup = mtk_wed_ind_rx_ring_setup,
 	};
 	struct device_node *eth_np = eth->dev->of_node;
 	struct platform_device *pdev;
diff --git a/include/linux/soc/mediatek/mtk_wed.h b/include/linux/soc/mediatek/mtk_wed.h
index 90d9c9ead3bc..a476648858a6 100644
--- a/include/linux/soc/mediatek/mtk_wed.h
+++ b/include/linux/soc/mediatek/mtk_wed.h
@@ -10,6 +10,7 @@
 
 #define MTK_WED_TX_QUEUES		2
 #define MTK_WED_RX_QUEUES		2
+#define MTK_WED_RX_PAGE_QUEUES		3
 
 #define WED_WO_STA_REC			0x6
 
@@ -99,6 +100,9 @@ struct mtk_wed_device {
 	struct mtk_wed_ring txfree_ring;
 	struct mtk_wed_ring tx_wdma[MTK_WED_TX_QUEUES];
 	struct mtk_wed_ring rx_wdma[MTK_WED_RX_QUEUES];
+	struct mtk_wed_ring rx_rro_ring[MTK_WED_RX_QUEUES];
+	struct mtk_wed_ring rx_page_ring[MTK_WED_RX_PAGE_QUEUES];
+	struct mtk_wed_ring ind_cmd_ring;
 
 	struct {
 		int size;
@@ -119,6 +123,13 @@ struct mtk_wed_device {
 		dma_addr_t fdbk_phys;
 	} rro;
 
+	struct {
+		int size;
+		struct mtk_wed_buf *pages;
+		struct mtk_wed_bm_desc *desc;
+		dma_addr_t desc_phys;
+	} hw_rro;
+
 	/* filled by driver: */
 	struct {
 		union {
@@ -137,6 +148,8 @@ struct mtk_wed_device {
 		u32 wpdma_txfree;
 		u32 wpdma_rx_glo;
 		u32 wpdma_rx;
+		u32 wpdma_rx_rro[MTK_WED_RX_QUEUES];
+		u32 wpdma_rx_pg;
 
 		bool wcid_512;
 		bool hw_rro;
@@ -151,9 +164,20 @@ struct mtk_wed_device {
 
 		u8 tx_tbit[MTK_WED_TX_QUEUES];
 		u8 rx_tbit[MTK_WED_RX_QUEUES];
+		u8 rro_rx_tbit[MTK_WED_RX_QUEUES];
+		u8 rx_pg_tbit[MTK_WED_RX_PAGE_QUEUES];
 		u8 txfree_tbit;
 		u8 amsdu_max_subframes;
 
+		struct {
+			u8 se_group_nums;
+			u16 win_size;
+			u16 particular_sid;
+			u32 ack_sn_addr;
+			dma_addr_t particular_se_phys;
+			dma_addr_t addr_elem_phys[1024];
+		} ind_cmd;
+
 		u32 (*init_buf)(void *ptr, dma_addr_t phys, int token_id);
 		int (*offload_enable)(struct mtk_wed_device *wed);
 		void (*offload_disable)(struct mtk_wed_device *wed);
@@ -192,6 +216,14 @@ struct mtk_wed_ops {
 	void (*irq_set_mask)(struct mtk_wed_device *dev, u32 mask);
 	int (*setup_tc)(struct mtk_wed_device *wed, struct net_device *dev,
 			enum tc_setup_type type, void *type_data);
+	void (*start_hw_rro)(struct mtk_wed_device *dev, u32 irq_mask,
+			     bool reset);
+	void (*rro_rx_ring_setup)(struct mtk_wed_device *dev, int ring,
+				  void __iomem *regs);
+	void (*msdu_pg_rx_ring_setup)(struct mtk_wed_device *dev, int ring,
+				      void __iomem *regs);
+	int (*ind_rx_ring_setup)(struct mtk_wed_device *dev,
+				 void __iomem *regs);
 };
 
 extern const struct mtk_wed_ops __rcu *mtk_soc_wed_ops;
@@ -263,6 +295,15 @@ static inline bool mtk_wed_is_amsdu_supported(struct mtk_wed_device *dev)
 #define mtk_wed_device_dma_reset(_dev) (_dev)->ops->reset_dma(_dev)
 #define mtk_wed_device_setup_tc(_dev, _netdev, _type, _type_data) \
 	(_dev)->ops->setup_tc(_dev, _netdev, _type, _type_data)
+#define mtk_wed_device_start_hw_rro(_dev, _mask, _reset) \
+	(_dev)->ops->start_hw_rro(_dev, _mask, _reset)
+#define mtk_wed_device_rro_rx_ring_setup(_dev, _ring, _regs) \
+	(_dev)->ops->rro_rx_ring_setup(_dev, _ring, _regs)
+#define mtk_wed_device_msdu_pg_rx_ring_setup(_dev, _ring, _regs) \
+	(_dev)->ops->msdu_pg_rx_ring_setup(_dev, _ring, _regs)
+#define mtk_wed_device_ind_rx_ring_setup(_dev, _regs) \
+	(_dev)->ops->ind_rx_ring_setup(_dev, _regs)
+
 #else
 static inline bool mtk_wed_device_active(struct mtk_wed_device *dev)
 {
@@ -282,6 +323,10 @@ static inline bool mtk_wed_device_active(struct mtk_wed_device *dev)
 #define mtk_wed_device_stop(_dev) do {} while (0)
 #define mtk_wed_device_dma_reset(_dev) do {} while (0)
 #define mtk_wed_device_setup_tc(_dev, _netdev, _type, _type_data) -EOPNOTSUPP
+#define mtk_wed_device_start_hw_rro(_dev, _mask, _reset) do {} while (0)
+#define mtk_wed_device_rro_rx_ring_setup(_dev, _ring, _regs) -ENODEV
+#define mtk_wed_device_msdu_pg_rx_ring_setup(_dev, _ring, _regs) -ENODEV
+#define mtk_wed_device_ind_rx_ring_setup(_dev, _regs) -ENODEV
 #endif
 
 #endif
-- 
cgit v1.2.3


From 9ea9cb00a82b53ec39630eac718776d37e41b35a Mon Sep 17 00:00:00 2001
From: Johannes Weiner <hannes@cmpxchg.org>
Date: Thu, 14 Sep 2023 11:21:39 -0400
Subject: mm: memcontrol: fix GFP_NOFS recursion in memory.high enforcement

Breno and Josef report a deadlock scenario from cgroup reclaim
re-entering the filesystem:

[  361.546690] ======================================================
[  361.559210] WARNING: possible circular locking dependency detected
[  361.571703] 6.5.0-0_fbk700_debug_rc0_kbuilder_13159_gbf787a128001 #1 Tainted: G S          E
[  361.589704] ------------------------------------------------------
[  361.602277] find/9315 is trying to acquire lock:
[  361.611625] ffff88837ba140c0 (&delayed_node->mutex){+.+.}-{4:4}, at: __btrfs_release_delayed_node+0x68/0x4f0
[  361.631437]
[  361.631437] but task is already holding lock:
[  361.643243] ffff8881765b8678 (btrfs-tree-01){++++}-{4:4}, at: btrfs_tree_read_lock+0x1e/0x40

[  362.904457]  mutex_lock_nested+0x1c/0x30
[  362.912414]  __btrfs_release_delayed_node+0x68/0x4f0
[  362.922460]  btrfs_evict_inode+0x301/0x770
[  362.982726]  evict+0x17c/0x380
[  362.988944]  prune_icache_sb+0x100/0x1d0
[  363.005559]  super_cache_scan+0x1f8/0x260
[  363.013695]  do_shrink_slab+0x2a2/0x540
[  363.021489]  shrink_slab_memcg+0x237/0x3d0
[  363.050606]  shrink_slab+0xa7/0x240
[  363.083382]  shrink_node_memcgs+0x262/0x3b0
[  363.091870]  shrink_node+0x1a4/0x720
[  363.099150]  shrink_zones+0x1f6/0x5d0
[  363.148798]  do_try_to_free_pages+0x19b/0x5e0
[  363.157633]  try_to_free_mem_cgroup_pages+0x266/0x370
[  363.190575]  reclaim_high+0x16f/0x1f0
[  363.208409]  mem_cgroup_handle_over_high+0x10b/0x270
[  363.246678]  try_charge_memcg+0xaf2/0xc70
[  363.304151]  charge_memcg+0xf0/0x350
[  363.320070]  __mem_cgroup_charge+0x28/0x40
[  363.328371]  __filemap_add_folio+0x870/0xd50
[  363.371303]  filemap_add_folio+0xdd/0x310
[  363.399696]  __filemap_get_folio+0x2fc/0x7d0
[  363.419086]  pagecache_get_page+0xe/0x30
[  363.427048]  alloc_extent_buffer+0x1cd/0x6a0
[  363.435704]  read_tree_block+0x43/0xc0
[  363.443316]  read_block_for_search+0x361/0x510
[  363.466690]  btrfs_search_slot+0xc8c/0x1520

This is caused by the mem_cgroup_handle_over_high() not respecting the
gfp_mask of the allocation context.  We used to only call this function on
resume to userspace, where no locks were held.  But c9afe31ec443 ("memcg:
synchronously enforce memory.high for large overcharges") added a call
from the allocation context without considering the gfp.

Link: https://lkml.kernel.org/r/20230914152139.100822-1-hannes@cmpxchg.org
Fixes: c9afe31ec443 ("memcg: synchronously enforce memory.high for large overcharges")
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Reported-by: Breno Leitao <leitao@debian.org>
Reported-by: Josef Bacik <josef@toxicpanda.com>
Acked-by: Shakeel Butt <shakeelb@google.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Cc: Roman Gushchin <roman.gushchin@linux.dev>
Cc: Muchun Song <songmuchun@bytedance.com>
Cc: <stable@vger.kernel.org>	[5.17+]
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/memcontrol.h       | 4 ++--
 include/linux/resume_user_mode.h | 2 +-
 mm/memcontrol.c                  | 6 +++---
 3 files changed, 6 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index ab94ad4597d0..e4e24da16d2c 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -920,7 +920,7 @@ unsigned long mem_cgroup_get_zone_lru_size(struct lruvec *lruvec,
 	return READ_ONCE(mz->lru_zone_size[zone_idx][lru]);
 }
 
-void mem_cgroup_handle_over_high(void);
+void mem_cgroup_handle_over_high(gfp_t gfp_mask);
 
 unsigned long mem_cgroup_get_max(struct mem_cgroup *memcg);
 
@@ -1458,7 +1458,7 @@ static inline void mem_cgroup_unlock_pages(void)
 	rcu_read_unlock();
 }
 
-static inline void mem_cgroup_handle_over_high(void)
+static inline void mem_cgroup_handle_over_high(gfp_t gfp_mask)
 {
 }
 
diff --git a/include/linux/resume_user_mode.h b/include/linux/resume_user_mode.h
index 285189454449..f8f3e958e9cf 100644
--- a/include/linux/resume_user_mode.h
+++ b/include/linux/resume_user_mode.h
@@ -55,7 +55,7 @@ static inline void resume_user_mode_work(struct pt_regs *regs)
 	}
 #endif
 
-	mem_cgroup_handle_over_high();
+	mem_cgroup_handle_over_high(GFP_KERNEL);
 	blkcg_maybe_throttle_current();
 
 	rseq_handle_notify_resume(NULL, regs);
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index a4d3282493b6..d13dde2f8b56 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -2555,7 +2555,7 @@ static unsigned long calculate_high_delay(struct mem_cgroup *memcg,
  * Scheduled by try_charge() to be executed from the userland return path
  * and reclaims memory over the high limit.
  */
-void mem_cgroup_handle_over_high(void)
+void mem_cgroup_handle_over_high(gfp_t gfp_mask)
 {
 	unsigned long penalty_jiffies;
 	unsigned long pflags;
@@ -2583,7 +2583,7 @@ retry_reclaim:
 	 */
 	nr_reclaimed = reclaim_high(memcg,
 				    in_retry ? SWAP_CLUSTER_MAX : nr_pages,
-				    GFP_KERNEL);
+				    gfp_mask);
 
 	/*
 	 * memory.high is breached and reclaim is unable to keep up. Throttle
@@ -2819,7 +2819,7 @@ done_restock:
 	if (current->memcg_nr_pages_over_high > MEMCG_CHARGE_BATCH &&
 	    !(current->flags & PF_MEMALLOC) &&
 	    gfpflags_allow_blocking(gfp_mask)) {
-		mem_cgroup_handle_over_high();
+		mem_cgroup_handle_over_high(gfp_mask);
 	}
 	return 0;
 }
-- 
cgit v1.2.3


From b724a6418f1f853bcb39c8923bf14a50c7bdbd07 Mon Sep 17 00:00:00 2001
From: Leon Hwang <hffilwlqm@gmail.com>
Date: Sun, 17 Sep 2023 23:38:46 +0800
Subject: bpf: Fix tr dereferencing

Fix 'tr' dereferencing bug when CONFIG_BPF_JIT is turned off.

When CONFIG_BPF_JIT is turned off, 'bpf_trampoline_get()' returns NULL,
which is same as the cases when CONFIG_BPF_JIT is turned on.

Closes: https://lore.kernel.org/r/202309131936.5Nc8eUD0-lkp@intel.com/
Fixes: f7b12b6fea00 ("bpf: verifier: refactor check_attach_btf_id()")
Reported-by: kernel test robot <lkp@intel.com>
Reported-by: Dan Carpenter <dan.carpenter@linaro.org>
Signed-off-by: Leon Hwang <hffilwlqm@gmail.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/bpf/20230917153846.88732-1-hffilwlqm@gmail.com
---
 include/linux/bpf.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 024e8b28c34b..49f8b691496c 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -1307,7 +1307,7 @@ static inline int bpf_trampoline_unlink_prog(struct bpf_tramp_link *link,
 static inline struct bpf_trampoline *bpf_trampoline_get(u64 key,
 							struct bpf_attach_target_info *tgt_info)
 {
-	return ERR_PTR(-EOPNOTSUPP);
+	return NULL;
 }
 static inline void bpf_trampoline_put(struct bpf_trampoline *tr) {}
 #define DEFINE_BPF_DISPATCHER(name)
-- 
cgit v1.2.3


From 54e1f99d91405417b3ddb6050cfba82733c3aa41 Mon Sep 17 00:00:00 2001
From: Komal Bajaj <quic_kbajaj@quicinc.com>
Date: Wed, 30 Aug 2023 16:26:51 +0530
Subject: nvmem: core: Add stub for nvmem_cell_read_u8

Add the stub nvmem_cell_read_u8() function for drivers running with
CONFIG_NVMEM disabled.

Signed-off-by: Komal Bajaj <quic_kbajaj@quicinc.com>
Reviewed-by: Mukesh Ojha <quic_mojha@quicinc.com>
Link: https://lore.kernel.org/r/20230830105654.28057-4-quic_kbajaj@quicinc.com
Signed-off-by: Bjorn Andersson <andersson@kernel.org>
---
 include/linux/nvmem-consumer.h | 6 ++++++
 1 file changed, 6 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/nvmem-consumer.h b/include/linux/nvmem-consumer.h
index 4523e4e83319..6ec4b9743e25 100644
--- a/include/linux/nvmem-consumer.h
+++ b/include/linux/nvmem-consumer.h
@@ -127,6 +127,12 @@ static inline int nvmem_cell_write(struct nvmem_cell *cell,
 	return -EOPNOTSUPP;
 }
 
+static inline int nvmem_cell_read_u8(struct device *dev,
+				     const char *cell_id, u8 *val)
+{
+	return -EOPNOTSUPP;
+}
+
 static inline int nvmem_cell_read_u16(struct device *dev,
 				      const char *cell_id, u16 *val)
 {
-- 
cgit v1.2.3


From 0bc76be64e80b15b975345b6957a87a1893c34f2 Mon Sep 17 00:00:00 2001
From: Komal Bajaj <quic_kbajaj@quicinc.com>
Date: Wed, 30 Aug 2023 16:26:53 +0530
Subject: soc: qcom: llcc: Updating the macro name

Update macro name for LLCC_DRE to LLCC_ECC as per the latest specification.

Signed-off-by: Komal Bajaj <quic_kbajaj@quicinc.com>
Reviewed-by: Mukesh Ojha <quic_mojha@quicinc.com>
Acked-by: Konrad Dybcio <konrad.dybcio@linaro.org>
Link: https://lore.kernel.org/r/20230830105654.28057-6-quic_kbajaj@quicinc.com
Signed-off-by: Bjorn Andersson <andersson@kernel.org>
---
 drivers/soc/qcom/llcc-qcom.c       | 2 +-
 include/linux/soc/qcom/llcc-qcom.h | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/soc/qcom/llcc-qcom.c b/drivers/soc/qcom/llcc-qcom.c
index c31d9e39e864..3bd841e67eba 100644
--- a/drivers/soc/qcom/llcc-qcom.c
+++ b/drivers/soc/qcom/llcc-qcom.c
@@ -191,7 +191,7 @@ static const struct llcc_slice_config sc8280xp_data[] = {
 	{ LLCC_MMUHWT,   13, 1024, 1, 1, 0xfff, 0x0, 0, 0, 0, 0, 1, 0 },
 	{ LLCC_DISP,     16, 6144, 1, 1, 0xfff, 0x0, 0, 0, 0, 1, 0, 0 },
 	{ LLCC_AUDHW,    22, 2048, 1, 1, 0xfff, 0x0, 0, 0, 0, 1, 0, 0 },
-	{ LLCC_DRE,      26, 1024, 1, 1, 0xfff, 0x0, 0, 0, 0, 1, 0, 0 },
+	{ LLCC_ECC,      26, 1024, 1, 1, 0xfff, 0x0, 0, 0, 0, 1, 0, 0 },
 	{ LLCC_CVP,      28, 512,  3, 1, 0xfff, 0x0, 0, 0, 0, 1, 0, 0 },
 	{ LLCC_APTCM,    30, 1024, 3, 1, 0x0,   0x1, 1, 0, 0, 1, 0, 0 },
 	{ LLCC_WRCACHE,  31, 1024, 1, 1, 0xfff, 0x0, 0, 0, 0, 0, 1, 0 },
diff --git a/include/linux/soc/qcom/llcc-qcom.h b/include/linux/soc/qcom/llcc-qcom.h
index 93417ba1ead4..1a886666bbb6 100644
--- a/include/linux/soc/qcom/llcc-qcom.h
+++ b/include/linux/soc/qcom/llcc-qcom.h
@@ -30,7 +30,7 @@
 #define LLCC_NPU         23
 #define LLCC_WLHW        24
 #define LLCC_PIMEM       25
-#define LLCC_DRE         26
+#define LLCC_ECC         26
 #define LLCC_CVP         28
 #define LLCC_MODPE       29
 #define LLCC_APTCM       30
-- 
cgit v1.2.3


From b64d143b752932ef483d0ed8d00958f1832dd6bc Mon Sep 17 00:00:00 2001
From: Herbert Xu <herbert@gondor.apana.org.au>
Date: Thu, 14 Sep 2023 16:28:23 +0800
Subject: crypto: hash - Hide CRYPTO_ALG_TYPE_AHASH_MASK

Move the macro CRYPTO_ALG_TYPE_AHASH_MASK out of linux/crypto.h
and into crypto/ahash.c so that it's not visible to users of the
Crypto API.

Also remove the unused CRYPTO_ALG_TYPE_HASH_MASK macro.

Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 crypto/ahash.c         | 2 ++
 include/linux/crypto.h | 2 --
 2 files changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/crypto/ahash.c b/crypto/ahash.c
index 709ef0940799..213bb3e9f245 100644
--- a/crypto/ahash.c
+++ b/crypto/ahash.c
@@ -21,6 +21,8 @@
 
 #include "hash.h"
 
+#define CRYPTO_ALG_TYPE_AHASH_MASK	0x0000000e
+
 static const struct crypto_type crypto_ahash_type;
 
 struct ahash_request_priv {
diff --git a/include/linux/crypto.h b/include/linux/crypto.h
index 31f6fee0c36c..a0780deb017a 100644
--- a/include/linux/crypto.h
+++ b/include/linux/crypto.h
@@ -35,8 +35,6 @@
 #define CRYPTO_ALG_TYPE_SHASH		0x0000000e
 #define CRYPTO_ALG_TYPE_AHASH		0x0000000f
 
-#define CRYPTO_ALG_TYPE_HASH_MASK	0x0000000e
-#define CRYPTO_ALG_TYPE_AHASH_MASK	0x0000000e
 #define CRYPTO_ALG_TYPE_ACOMPRESS_MASK	0x0000000e
 
 #define CRYPTO_ALG_LARVAL		0x00000010
-- 
cgit v1.2.3


From 31865c4c4db2b742fec6ccbff80483fa3e7ab9b9 Mon Sep 17 00:00:00 2001
From: Herbert Xu <herbert@gondor.apana.org.au>
Date: Thu, 14 Sep 2023 16:28:24 +0800
Subject: crypto: skcipher - Add lskcipher

Add a new API type lskcipher designed for taking straight kernel
pointers instead of SG lists.  Its relationship to skcipher will
be analogous to that between shash and ahash.

Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 crypto/Makefile                    |   6 +-
 crypto/cryptd.c                    |   2 +-
 crypto/lskcipher.c                 | 594 +++++++++++++++++++++++++++++++++++++
 crypto/skcipher.c                  |  75 +++--
 crypto/skcipher.h                  |  30 ++
 include/crypto/internal/skcipher.h | 114 ++++++-
 include/crypto/skcipher.h          | 309 ++++++++++++++++++-
 include/linux/crypto.h             |   1 +
 8 files changed, 1086 insertions(+), 45 deletions(-)
 create mode 100644 crypto/lskcipher.c
 create mode 100644 crypto/skcipher.h

(limited to 'include/linux')

diff --git a/crypto/Makefile b/crypto/Makefile
index 953a7e105e58..5ac6876f935a 100644
--- a/crypto/Makefile
+++ b/crypto/Makefile
@@ -16,7 +16,11 @@ obj-$(CONFIG_CRYPTO_ALGAPI2) += crypto_algapi.o
 obj-$(CONFIG_CRYPTO_AEAD2) += aead.o
 obj-$(CONFIG_CRYPTO_GENIV) += geniv.o
 
-obj-$(CONFIG_CRYPTO_SKCIPHER2) += skcipher.o
+crypto_skcipher-y += lskcipher.o
+crypto_skcipher-y += skcipher.o
+
+obj-$(CONFIG_CRYPTO_SKCIPHER2) += crypto_skcipher.o
+
 obj-$(CONFIG_CRYPTO_SEQIV) += seqiv.o
 obj-$(CONFIG_CRYPTO_ECHAINIV) += echainiv.o
 
diff --git a/crypto/cryptd.c b/crypto/cryptd.c
index bbcc368b6a55..194a92d677b9 100644
--- a/crypto/cryptd.c
+++ b/crypto/cryptd.c
@@ -929,7 +929,7 @@ static int cryptd_create(struct crypto_template *tmpl, struct rtattr **tb)
 		return PTR_ERR(algt);
 
 	switch (algt->type & algt->mask & CRYPTO_ALG_TYPE_MASK) {
-	case CRYPTO_ALG_TYPE_SKCIPHER:
+	case CRYPTO_ALG_TYPE_LSKCIPHER:
 		return cryptd_create_skcipher(tmpl, tb, algt, &queue);
 	case CRYPTO_ALG_TYPE_HASH:
 		return cryptd_create_hash(tmpl, tb, algt, &queue);
diff --git a/crypto/lskcipher.c b/crypto/lskcipher.c
new file mode 100644
index 000000000000..3343c6d955da
--- /dev/null
+++ b/crypto/lskcipher.c
@@ -0,0 +1,594 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Linear symmetric key cipher operations.
+ *
+ * Generic encrypt/decrypt wrapper for ciphers.
+ *
+ * Copyright (c) 2023 Herbert Xu <herbert@gondor.apana.org.au>
+ */
+
+#include <linux/cryptouser.h>
+#include <linux/err.h>
+#include <linux/export.h>
+#include <linux/kernel.h>
+#include <linux/seq_file.h>
+#include <linux/slab.h>
+#include <linux/string.h>
+#include <net/netlink.h>
+#include "skcipher.h"
+
+static inline struct crypto_lskcipher *__crypto_lskcipher_cast(
+	struct crypto_tfm *tfm)
+{
+	return container_of(tfm, struct crypto_lskcipher, base);
+}
+
+static inline struct lskcipher_alg *__crypto_lskcipher_alg(
+	struct crypto_alg *alg)
+{
+	return container_of(alg, struct lskcipher_alg, co.base);
+}
+
+static inline struct crypto_istat_cipher *lskcipher_get_stat(
+	struct lskcipher_alg *alg)
+{
+	return skcipher_get_stat_common(&alg->co);
+}
+
+static inline int crypto_lskcipher_errstat(struct lskcipher_alg *alg, int err)
+{
+	struct crypto_istat_cipher *istat = lskcipher_get_stat(alg);
+
+	if (!IS_ENABLED(CONFIG_CRYPTO_STATS))
+		return err;
+
+	if (err)
+		atomic64_inc(&istat->err_cnt);
+
+	return err;
+}
+
+static int lskcipher_setkey_unaligned(struct crypto_lskcipher *tfm,
+				      const u8 *key, unsigned int keylen)
+{
+	unsigned long alignmask = crypto_lskcipher_alignmask(tfm);
+	struct lskcipher_alg *cipher = crypto_lskcipher_alg(tfm);
+	u8 *buffer, *alignbuffer;
+	unsigned long absize;
+	int ret;
+
+	absize = keylen + alignmask;
+	buffer = kmalloc(absize, GFP_ATOMIC);
+	if (!buffer)
+		return -ENOMEM;
+
+	alignbuffer = (u8 *)ALIGN((unsigned long)buffer, alignmask + 1);
+	memcpy(alignbuffer, key, keylen);
+	ret = cipher->setkey(tfm, alignbuffer, keylen);
+	kfree_sensitive(buffer);
+	return ret;
+}
+
+int crypto_lskcipher_setkey(struct crypto_lskcipher *tfm, const u8 *key,
+			    unsigned int keylen)
+{
+	unsigned long alignmask = crypto_lskcipher_alignmask(tfm);
+	struct lskcipher_alg *cipher = crypto_lskcipher_alg(tfm);
+
+	if (keylen < cipher->co.min_keysize || keylen > cipher->co.max_keysize)
+		return -EINVAL;
+
+	if ((unsigned long)key & alignmask)
+		return lskcipher_setkey_unaligned(tfm, key, keylen);
+	else
+		return cipher->setkey(tfm, key, keylen);
+}
+EXPORT_SYMBOL_GPL(crypto_lskcipher_setkey);
+
+static int crypto_lskcipher_crypt_unaligned(
+	struct crypto_lskcipher *tfm, const u8 *src, u8 *dst, unsigned len,
+	u8 *iv, int (*crypt)(struct crypto_lskcipher *tfm, const u8 *src,
+			     u8 *dst, unsigned len, u8 *iv, bool final))
+{
+	unsigned ivsize = crypto_lskcipher_ivsize(tfm);
+	unsigned bs = crypto_lskcipher_blocksize(tfm);
+	unsigned cs = crypto_lskcipher_chunksize(tfm);
+	int err;
+	u8 *tiv;
+	u8 *p;
+
+	BUILD_BUG_ON(MAX_CIPHER_BLOCKSIZE > PAGE_SIZE ||
+		     MAX_CIPHER_ALIGNMASK >= PAGE_SIZE);
+
+	tiv = kmalloc(PAGE_SIZE, GFP_ATOMIC);
+	if (!tiv)
+		return -ENOMEM;
+
+	memcpy(tiv, iv, ivsize);
+
+	p = kmalloc(PAGE_SIZE, GFP_ATOMIC);
+	err = -ENOMEM;
+	if (!p)
+		goto out;
+
+	while (len >= bs) {
+		unsigned chunk = min((unsigned)PAGE_SIZE, len);
+		int err;
+
+		if (chunk > cs)
+			chunk &= ~(cs - 1);
+
+		memcpy(p, src, chunk);
+		err = crypt(tfm, p, p, chunk, tiv, true);
+		if (err)
+			goto out;
+
+		memcpy(dst, p, chunk);
+		src += chunk;
+		dst += chunk;
+		len -= chunk;
+	}
+
+	err = len ? -EINVAL : 0;
+
+out:
+	memcpy(iv, tiv, ivsize);
+	kfree_sensitive(p);
+	kfree_sensitive(tiv);
+	return err;
+}
+
+static int crypto_lskcipher_crypt(struct crypto_lskcipher *tfm, const u8 *src,
+				  u8 *dst, unsigned len, u8 *iv,
+				  int (*crypt)(struct crypto_lskcipher *tfm,
+					       const u8 *src, u8 *dst,
+					       unsigned len, u8 *iv,
+					       bool final))
+{
+	unsigned long alignmask = crypto_lskcipher_alignmask(tfm);
+	struct lskcipher_alg *alg = crypto_lskcipher_alg(tfm);
+	int ret;
+
+	if (((unsigned long)src | (unsigned long)dst | (unsigned long)iv) &
+	    alignmask) {
+		ret = crypto_lskcipher_crypt_unaligned(tfm, src, dst, len, iv,
+						       crypt);
+		goto out;
+	}
+
+	ret = crypt(tfm, src, dst, len, iv, true);
+
+out:
+	return crypto_lskcipher_errstat(alg, ret);
+}
+
+int crypto_lskcipher_encrypt(struct crypto_lskcipher *tfm, const u8 *src,
+			     u8 *dst, unsigned len, u8 *iv)
+{
+	struct lskcipher_alg *alg = crypto_lskcipher_alg(tfm);
+
+	if (IS_ENABLED(CONFIG_CRYPTO_STATS)) {
+		struct crypto_istat_cipher *istat = lskcipher_get_stat(alg);
+
+		atomic64_inc(&istat->encrypt_cnt);
+		atomic64_add(len, &istat->encrypt_tlen);
+	}
+
+	return crypto_lskcipher_crypt(tfm, src, dst, len, iv, alg->encrypt);
+}
+EXPORT_SYMBOL_GPL(crypto_lskcipher_encrypt);
+
+int crypto_lskcipher_decrypt(struct crypto_lskcipher *tfm, const u8 *src,
+			     u8 *dst, unsigned len, u8 *iv)
+{
+	struct lskcipher_alg *alg = crypto_lskcipher_alg(tfm);
+
+	if (IS_ENABLED(CONFIG_CRYPTO_STATS)) {
+		struct crypto_istat_cipher *istat = lskcipher_get_stat(alg);
+
+		atomic64_inc(&istat->decrypt_cnt);
+		atomic64_add(len, &istat->decrypt_tlen);
+	}
+
+	return crypto_lskcipher_crypt(tfm, src, dst, len, iv, alg->decrypt);
+}
+EXPORT_SYMBOL_GPL(crypto_lskcipher_decrypt);
+
+int crypto_lskcipher_setkey_sg(struct crypto_skcipher *tfm, const u8 *key,
+			       unsigned int keylen)
+{
+	struct crypto_lskcipher **ctx = crypto_skcipher_ctx(tfm);
+
+	return crypto_lskcipher_setkey(*ctx, key, keylen);
+}
+
+static int crypto_lskcipher_crypt_sg(struct skcipher_request *req,
+				     int (*crypt)(struct crypto_lskcipher *tfm,
+						  const u8 *src, u8 *dst,
+						  unsigned len, u8 *iv,
+						  bool final))
+{
+	struct crypto_skcipher *skcipher = crypto_skcipher_reqtfm(req);
+	struct crypto_lskcipher **ctx = crypto_skcipher_ctx(skcipher);
+	struct crypto_lskcipher *tfm = *ctx;
+	struct skcipher_walk walk;
+	int err;
+
+	err = skcipher_walk_virt(&walk, req, false);
+
+	while (walk.nbytes) {
+		err = crypt(tfm, walk.src.virt.addr, walk.dst.virt.addr,
+			    walk.nbytes, walk.iv, walk.nbytes == walk.total);
+		err = skcipher_walk_done(&walk, err);
+	}
+
+	return err;
+}
+
+int crypto_lskcipher_encrypt_sg(struct skcipher_request *req)
+{
+	struct crypto_skcipher *skcipher = crypto_skcipher_reqtfm(req);
+	struct crypto_lskcipher **ctx = crypto_skcipher_ctx(skcipher);
+	struct lskcipher_alg *alg = crypto_lskcipher_alg(*ctx);
+
+	return crypto_lskcipher_crypt_sg(req, alg->encrypt);
+}
+
+int crypto_lskcipher_decrypt_sg(struct skcipher_request *req)
+{
+	struct crypto_skcipher *skcipher = crypto_skcipher_reqtfm(req);
+	struct crypto_lskcipher **ctx = crypto_skcipher_ctx(skcipher);
+	struct lskcipher_alg *alg = crypto_lskcipher_alg(*ctx);
+
+	return crypto_lskcipher_crypt_sg(req, alg->decrypt);
+}
+
+static void crypto_lskcipher_exit_tfm(struct crypto_tfm *tfm)
+{
+	struct crypto_lskcipher *skcipher = __crypto_lskcipher_cast(tfm);
+	struct lskcipher_alg *alg = crypto_lskcipher_alg(skcipher);
+
+	alg->exit(skcipher);
+}
+
+static int crypto_lskcipher_init_tfm(struct crypto_tfm *tfm)
+{
+	struct crypto_lskcipher *skcipher = __crypto_lskcipher_cast(tfm);
+	struct lskcipher_alg *alg = crypto_lskcipher_alg(skcipher);
+
+	if (alg->exit)
+		skcipher->base.exit = crypto_lskcipher_exit_tfm;
+
+	if (alg->init)
+		return alg->init(skcipher);
+
+	return 0;
+}
+
+static void crypto_lskcipher_free_instance(struct crypto_instance *inst)
+{
+	struct lskcipher_instance *skcipher =
+		container_of(inst, struct lskcipher_instance, s.base);
+
+	skcipher->free(skcipher);
+}
+
+static void __maybe_unused crypto_lskcipher_show(
+	struct seq_file *m, struct crypto_alg *alg)
+{
+	struct lskcipher_alg *skcipher = __crypto_lskcipher_alg(alg);
+
+	seq_printf(m, "type         : lskcipher\n");
+	seq_printf(m, "blocksize    : %u\n", alg->cra_blocksize);
+	seq_printf(m, "min keysize  : %u\n", skcipher->co.min_keysize);
+	seq_printf(m, "max keysize  : %u\n", skcipher->co.max_keysize);
+	seq_printf(m, "ivsize       : %u\n", skcipher->co.ivsize);
+	seq_printf(m, "chunksize    : %u\n", skcipher->co.chunksize);
+}
+
+static int __maybe_unused crypto_lskcipher_report(
+	struct sk_buff *skb, struct crypto_alg *alg)
+{
+	struct lskcipher_alg *skcipher = __crypto_lskcipher_alg(alg);
+	struct crypto_report_blkcipher rblkcipher;
+
+	memset(&rblkcipher, 0, sizeof(rblkcipher));
+
+	strscpy(rblkcipher.type, "lskcipher", sizeof(rblkcipher.type));
+	strscpy(rblkcipher.geniv, "<none>", sizeof(rblkcipher.geniv));
+
+	rblkcipher.blocksize = alg->cra_blocksize;
+	rblkcipher.min_keysize = skcipher->co.min_keysize;
+	rblkcipher.max_keysize = skcipher->co.max_keysize;
+	rblkcipher.ivsize = skcipher->co.ivsize;
+
+	return nla_put(skb, CRYPTOCFGA_REPORT_BLKCIPHER,
+		       sizeof(rblkcipher), &rblkcipher);
+}
+
+static int __maybe_unused crypto_lskcipher_report_stat(
+	struct sk_buff *skb, struct crypto_alg *alg)
+{
+	struct lskcipher_alg *skcipher = __crypto_lskcipher_alg(alg);
+	struct crypto_istat_cipher *istat;
+	struct crypto_stat_cipher rcipher;
+
+	istat = lskcipher_get_stat(skcipher);
+
+	memset(&rcipher, 0, sizeof(rcipher));
+
+	strscpy(rcipher.type, "cipher", sizeof(rcipher.type));
+
+	rcipher.stat_encrypt_cnt = atomic64_read(&istat->encrypt_cnt);
+	rcipher.stat_encrypt_tlen = atomic64_read(&istat->encrypt_tlen);
+	rcipher.stat_decrypt_cnt =  atomic64_read(&istat->decrypt_cnt);
+	rcipher.stat_decrypt_tlen = atomic64_read(&istat->decrypt_tlen);
+	rcipher.stat_err_cnt =  atomic64_read(&istat->err_cnt);
+
+	return nla_put(skb, CRYPTOCFGA_STAT_CIPHER, sizeof(rcipher), &rcipher);
+}
+
+static const struct crypto_type crypto_lskcipher_type = {
+	.extsize = crypto_alg_extsize,
+	.init_tfm = crypto_lskcipher_init_tfm,
+	.free = crypto_lskcipher_free_instance,
+#ifdef CONFIG_PROC_FS
+	.show = crypto_lskcipher_show,
+#endif
+#if IS_ENABLED(CONFIG_CRYPTO_USER)
+	.report = crypto_lskcipher_report,
+#endif
+#ifdef CONFIG_CRYPTO_STATS
+	.report_stat = crypto_lskcipher_report_stat,
+#endif
+	.maskclear = ~CRYPTO_ALG_TYPE_MASK,
+	.maskset = CRYPTO_ALG_TYPE_MASK,
+	.type = CRYPTO_ALG_TYPE_LSKCIPHER,
+	.tfmsize = offsetof(struct crypto_lskcipher, base),
+};
+
+static void crypto_lskcipher_exit_tfm_sg(struct crypto_tfm *tfm)
+{
+	struct crypto_lskcipher **ctx = crypto_tfm_ctx(tfm);
+
+	crypto_free_lskcipher(*ctx);
+}
+
+int crypto_init_lskcipher_ops_sg(struct crypto_tfm *tfm)
+{
+	struct crypto_lskcipher **ctx = crypto_tfm_ctx(tfm);
+	struct crypto_alg *calg = tfm->__crt_alg;
+	struct crypto_lskcipher *skcipher;
+
+	if (!crypto_mod_get(calg))
+		return -EAGAIN;
+
+	skcipher = crypto_create_tfm(calg, &crypto_lskcipher_type);
+	if (IS_ERR(skcipher)) {
+		crypto_mod_put(calg);
+		return PTR_ERR(skcipher);
+	}
+
+	*ctx = skcipher;
+	tfm->exit = crypto_lskcipher_exit_tfm_sg;
+
+	return 0;
+}
+
+int crypto_grab_lskcipher(struct crypto_lskcipher_spawn *spawn,
+			  struct crypto_instance *inst,
+			  const char *name, u32 type, u32 mask)
+{
+	spawn->base.frontend = &crypto_lskcipher_type;
+	return crypto_grab_spawn(&spawn->base, inst, name, type, mask);
+}
+EXPORT_SYMBOL_GPL(crypto_grab_lskcipher);
+
+struct crypto_lskcipher *crypto_alloc_lskcipher(const char *alg_name,
+						u32 type, u32 mask)
+{
+	return crypto_alloc_tfm(alg_name, &crypto_lskcipher_type, type, mask);
+}
+EXPORT_SYMBOL_GPL(crypto_alloc_lskcipher);
+
+static int lskcipher_prepare_alg(struct lskcipher_alg *alg)
+{
+	struct crypto_alg *base = &alg->co.base;
+	int err;
+
+	err = skcipher_prepare_alg_common(&alg->co);
+	if (err)
+		return err;
+
+	if (alg->co.chunksize & (alg->co.chunksize - 1))
+		return -EINVAL;
+
+	base->cra_type = &crypto_lskcipher_type;
+	base->cra_flags |= CRYPTO_ALG_TYPE_LSKCIPHER;
+
+	return 0;
+}
+
+int crypto_register_lskcipher(struct lskcipher_alg *alg)
+{
+	struct crypto_alg *base = &alg->co.base;
+	int err;
+
+	err = lskcipher_prepare_alg(alg);
+	if (err)
+		return err;
+
+	return crypto_register_alg(base);
+}
+EXPORT_SYMBOL_GPL(crypto_register_lskcipher);
+
+void crypto_unregister_lskcipher(struct lskcipher_alg *alg)
+{
+	crypto_unregister_alg(&alg->co.base);
+}
+EXPORT_SYMBOL_GPL(crypto_unregister_lskcipher);
+
+int crypto_register_lskciphers(struct lskcipher_alg *algs, int count)
+{
+	int i, ret;
+
+	for (i = 0; i < count; i++) {
+		ret = crypto_register_lskcipher(&algs[i]);
+		if (ret)
+			goto err;
+	}
+
+	return 0;
+
+err:
+	for (--i; i >= 0; --i)
+		crypto_unregister_lskcipher(&algs[i]);
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(crypto_register_lskciphers);
+
+void crypto_unregister_lskciphers(struct lskcipher_alg *algs, int count)
+{
+	int i;
+
+	for (i = count - 1; i >= 0; --i)
+		crypto_unregister_lskcipher(&algs[i]);
+}
+EXPORT_SYMBOL_GPL(crypto_unregister_lskciphers);
+
+int lskcipher_register_instance(struct crypto_template *tmpl,
+				struct lskcipher_instance *inst)
+{
+	int err;
+
+	if (WARN_ON(!inst->free))
+		return -EINVAL;
+
+	err = lskcipher_prepare_alg(&inst->alg);
+	if (err)
+		return err;
+
+	return crypto_register_instance(tmpl, lskcipher_crypto_instance(inst));
+}
+EXPORT_SYMBOL_GPL(lskcipher_register_instance);
+
+static int lskcipher_setkey_simple(struct crypto_lskcipher *tfm, const u8 *key,
+				   unsigned int keylen)
+{
+	struct crypto_lskcipher *cipher = lskcipher_cipher_simple(tfm);
+
+	crypto_lskcipher_clear_flags(cipher, CRYPTO_TFM_REQ_MASK);
+	crypto_lskcipher_set_flags(cipher, crypto_lskcipher_get_flags(tfm) &
+				   CRYPTO_TFM_REQ_MASK);
+	return crypto_lskcipher_setkey(cipher, key, keylen);
+}
+
+static int lskcipher_init_tfm_simple(struct crypto_lskcipher *tfm)
+{
+	struct lskcipher_instance *inst = lskcipher_alg_instance(tfm);
+	struct crypto_lskcipher **ctx = crypto_lskcipher_ctx(tfm);
+	struct crypto_lskcipher_spawn *spawn;
+	struct crypto_lskcipher *cipher;
+
+	spawn = lskcipher_instance_ctx(inst);
+	cipher = crypto_spawn_lskcipher(spawn);
+	if (IS_ERR(cipher))
+		return PTR_ERR(cipher);
+
+	*ctx = cipher;
+	return 0;
+}
+
+static void lskcipher_exit_tfm_simple(struct crypto_lskcipher *tfm)
+{
+	struct crypto_lskcipher **ctx = crypto_lskcipher_ctx(tfm);
+
+	crypto_free_lskcipher(*ctx);
+}
+
+static void lskcipher_free_instance_simple(struct lskcipher_instance *inst)
+{
+	crypto_drop_lskcipher(lskcipher_instance_ctx(inst));
+	kfree(inst);
+}
+
+/**
+ * lskcipher_alloc_instance_simple - allocate instance of simple block cipher
+ *
+ * Allocate an lskcipher_instance for a simple block cipher mode of operation,
+ * e.g. cbc or ecb.  The instance context will have just a single crypto_spawn,
+ * that for the underlying cipher.  The {min,max}_keysize, ivsize, blocksize,
+ * alignmask, and priority are set from the underlying cipher but can be
+ * overridden if needed.  The tfm context defaults to
+ * struct crypto_lskcipher *, and default ->setkey(), ->init(), and
+ * ->exit() methods are installed.
+ *
+ * @tmpl: the template being instantiated
+ * @tb: the template parameters
+ *
+ * Return: a pointer to the new instance, or an ERR_PTR().  The caller still
+ *	   needs to register the instance.
+ */
+struct lskcipher_instance *lskcipher_alloc_instance_simple(
+	struct crypto_template *tmpl, struct rtattr **tb)
+{
+	u32 mask;
+	struct lskcipher_instance *inst;
+	struct crypto_lskcipher_spawn *spawn;
+	struct lskcipher_alg *cipher_alg;
+	int err;
+
+	err = crypto_check_attr_type(tb, CRYPTO_ALG_TYPE_LSKCIPHER, &mask);
+	if (err)
+		return ERR_PTR(err);
+
+	inst = kzalloc(sizeof(*inst) + sizeof(*spawn), GFP_KERNEL);
+	if (!inst)
+		return ERR_PTR(-ENOMEM);
+
+	spawn = lskcipher_instance_ctx(inst);
+	err = crypto_grab_lskcipher(spawn,
+				    lskcipher_crypto_instance(inst),
+				    crypto_attr_alg_name(tb[1]), 0, mask);
+	if (err)
+		goto err_free_inst;
+	cipher_alg = crypto_lskcipher_spawn_alg(spawn);
+
+	err = crypto_inst_setname(lskcipher_crypto_instance(inst), tmpl->name,
+				  &cipher_alg->co.base);
+	if (err)
+		goto err_free_inst;
+
+	/* Don't allow nesting. */
+	err = -ELOOP;
+	if ((cipher_alg->co.base.cra_flags & CRYPTO_ALG_INSTANCE))
+		goto err_free_inst;
+
+	err = -EINVAL;
+	if (cipher_alg->co.ivsize)
+		goto err_free_inst;
+
+	inst->free = lskcipher_free_instance_simple;
+
+	/* Default algorithm properties, can be overridden */
+	inst->alg.co.base.cra_blocksize = cipher_alg->co.base.cra_blocksize;
+	inst->alg.co.base.cra_alignmask = cipher_alg->co.base.cra_alignmask;
+	inst->alg.co.base.cra_priority = cipher_alg->co.base.cra_priority;
+	inst->alg.co.min_keysize = cipher_alg->co.min_keysize;
+	inst->alg.co.max_keysize = cipher_alg->co.max_keysize;
+	inst->alg.co.ivsize = cipher_alg->co.base.cra_blocksize;
+
+	/* Use struct crypto_lskcipher * by default, can be overridden */
+	inst->alg.co.base.cra_ctxsize = sizeof(struct crypto_lskcipher *);
+	inst->alg.setkey = lskcipher_setkey_simple;
+	inst->alg.init = lskcipher_init_tfm_simple;
+	inst->alg.exit = lskcipher_exit_tfm_simple;
+
+	return inst;
+
+err_free_inst:
+	lskcipher_free_instance_simple(inst);
+	return ERR_PTR(err);
+}
+EXPORT_SYMBOL_GPL(lskcipher_alloc_instance_simple);
diff --git a/crypto/skcipher.c b/crypto/skcipher.c
index 7b275716cf4e..b9496dc8a609 100644
--- a/crypto/skcipher.c
+++ b/crypto/skcipher.c
@@ -24,8 +24,9 @@
 #include <linux/slab.h>
 #include <linux/string.h>
 #include <net/netlink.h>
+#include "skcipher.h"
 
-#include "internal.h"
+#define CRYPTO_ALG_TYPE_SKCIPHER_MASK	0x0000000e
 
 enum {
 	SKCIPHER_WALK_PHYS = 1 << 0,
@@ -43,6 +44,8 @@ struct skcipher_walk_buffer {
 	u8 buffer[];
 };
 
+static const struct crypto_type crypto_skcipher_type;
+
 static int skcipher_walk_next(struct skcipher_walk *walk);
 
 static inline void skcipher_map_src(struct skcipher_walk *walk)
@@ -89,11 +92,7 @@ static inline struct skcipher_alg *__crypto_skcipher_alg(
 static inline struct crypto_istat_cipher *skcipher_get_stat(
 	struct skcipher_alg *alg)
 {
-#ifdef CONFIG_CRYPTO_STATS
-	return &alg->stat;
-#else
-	return NULL;
-#endif
+	return skcipher_get_stat_common(&alg->co);
 }
 
 static inline int crypto_skcipher_errstat(struct skcipher_alg *alg, int err)
@@ -468,6 +467,7 @@ static int skcipher_walk_skcipher(struct skcipher_walk *walk,
 				  struct skcipher_request *req)
 {
 	struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
+	struct skcipher_alg *alg = crypto_skcipher_alg(tfm);
 
 	walk->total = req->cryptlen;
 	walk->nbytes = 0;
@@ -485,10 +485,14 @@ static int skcipher_walk_skcipher(struct skcipher_walk *walk,
 		       SKCIPHER_WALK_SLEEP : 0;
 
 	walk->blocksize = crypto_skcipher_blocksize(tfm);
-	walk->stride = crypto_skcipher_walksize(tfm);
 	walk->ivsize = crypto_skcipher_ivsize(tfm);
 	walk->alignmask = crypto_skcipher_alignmask(tfm);
 
+	if (alg->co.base.cra_type != &crypto_skcipher_type)
+		walk->stride = alg->co.chunksize;
+	else
+		walk->stride = alg->walksize;
+
 	return skcipher_walk_first(walk);
 }
 
@@ -616,6 +620,11 @@ int crypto_skcipher_setkey(struct crypto_skcipher *tfm, const u8 *key,
 	unsigned long alignmask = crypto_skcipher_alignmask(tfm);
 	int err;
 
+	if (cipher->co.base.cra_type != &crypto_skcipher_type) {
+		err = crypto_lskcipher_setkey_sg(tfm, key, keylen);
+		goto out;
+	}
+
 	if (keylen < cipher->min_keysize || keylen > cipher->max_keysize)
 		return -EINVAL;
 
@@ -624,6 +633,7 @@ int crypto_skcipher_setkey(struct crypto_skcipher *tfm, const u8 *key,
 	else
 		err = cipher->setkey(tfm, key, keylen);
 
+out:
 	if (unlikely(err)) {
 		skcipher_set_needkey(tfm);
 		return err;
@@ -649,6 +659,8 @@ int crypto_skcipher_encrypt(struct skcipher_request *req)
 
 	if (crypto_skcipher_get_flags(tfm) & CRYPTO_TFM_NEED_KEY)
 		ret = -ENOKEY;
+	else if (alg->co.base.cra_type != &crypto_skcipher_type)
+		ret = crypto_lskcipher_encrypt_sg(req);
 	else
 		ret = alg->encrypt(req);
 
@@ -671,6 +683,8 @@ int crypto_skcipher_decrypt(struct skcipher_request *req)
 
 	if (crypto_skcipher_get_flags(tfm) & CRYPTO_TFM_NEED_KEY)
 		ret = -ENOKEY;
+	else if (alg->co.base.cra_type != &crypto_skcipher_type)
+		ret = crypto_lskcipher_decrypt_sg(req);
 	else
 		ret = alg->decrypt(req);
 
@@ -693,6 +707,9 @@ static int crypto_skcipher_init_tfm(struct crypto_tfm *tfm)
 
 	skcipher_set_needkey(skcipher);
 
+	if (tfm->__crt_alg->cra_type != &crypto_skcipher_type)
+		return crypto_init_lskcipher_ops_sg(tfm);
+
 	if (alg->exit)
 		skcipher->base.exit = crypto_skcipher_exit_tfm;
 
@@ -702,6 +719,14 @@ static int crypto_skcipher_init_tfm(struct crypto_tfm *tfm)
 	return 0;
 }
 
+static unsigned int crypto_skcipher_extsize(struct crypto_alg *alg)
+{
+	if (alg->cra_type != &crypto_skcipher_type)
+		return sizeof(struct crypto_lskcipher *);
+
+	return crypto_alg_extsize(alg);
+}
+
 static void crypto_skcipher_free_instance(struct crypto_instance *inst)
 {
 	struct skcipher_instance *skcipher =
@@ -770,7 +795,7 @@ static int __maybe_unused crypto_skcipher_report_stat(
 }
 
 static const struct crypto_type crypto_skcipher_type = {
-	.extsize = crypto_alg_extsize,
+	.extsize = crypto_skcipher_extsize,
 	.init_tfm = crypto_skcipher_init_tfm,
 	.free = crypto_skcipher_free_instance,
 #ifdef CONFIG_PROC_FS
@@ -783,7 +808,7 @@ static const struct crypto_type crypto_skcipher_type = {
 	.report_stat = crypto_skcipher_report_stat,
 #endif
 	.maskclear = ~CRYPTO_ALG_TYPE_MASK,
-	.maskset = CRYPTO_ALG_TYPE_MASK,
+	.maskset = CRYPTO_ALG_TYPE_SKCIPHER_MASK,
 	.type = CRYPTO_ALG_TYPE_SKCIPHER,
 	.tfmsize = offsetof(struct crypto_skcipher, base),
 };
@@ -834,23 +859,18 @@ int crypto_has_skcipher(const char *alg_name, u32 type, u32 mask)
 }
 EXPORT_SYMBOL_GPL(crypto_has_skcipher);
 
-static int skcipher_prepare_alg(struct skcipher_alg *alg)
+int skcipher_prepare_alg_common(struct skcipher_alg_common *alg)
 {
-	struct crypto_istat_cipher *istat = skcipher_get_stat(alg);
+	struct crypto_istat_cipher *istat = skcipher_get_stat_common(alg);
 	struct crypto_alg *base = &alg->base;
 
-	if (alg->ivsize > PAGE_SIZE / 8 || alg->chunksize > PAGE_SIZE / 8 ||
-	    alg->walksize > PAGE_SIZE / 8)
+	if (alg->ivsize > PAGE_SIZE / 8 || alg->chunksize > PAGE_SIZE / 8)
 		return -EINVAL;
 
 	if (!alg->chunksize)
 		alg->chunksize = base->cra_blocksize;
-	if (!alg->walksize)
-		alg->walksize = alg->chunksize;
 
-	base->cra_type = &crypto_skcipher_type;
 	base->cra_flags &= ~CRYPTO_ALG_TYPE_MASK;
-	base->cra_flags |= CRYPTO_ALG_TYPE_SKCIPHER;
 
 	if (IS_ENABLED(CONFIG_CRYPTO_STATS))
 		memset(istat, 0, sizeof(*istat));
@@ -858,6 +878,27 @@ static int skcipher_prepare_alg(struct skcipher_alg *alg)
 	return 0;
 }
 
+static int skcipher_prepare_alg(struct skcipher_alg *alg)
+{
+	struct crypto_alg *base = &alg->base;
+	int err;
+
+	err = skcipher_prepare_alg_common(&alg->co);
+	if (err)
+		return err;
+
+	if (alg->walksize > PAGE_SIZE / 8)
+		return -EINVAL;
+
+	if (!alg->walksize)
+		alg->walksize = alg->chunksize;
+
+	base->cra_type = &crypto_skcipher_type;
+	base->cra_flags |= CRYPTO_ALG_TYPE_SKCIPHER;
+
+	return 0;
+}
+
 int crypto_register_skcipher(struct skcipher_alg *alg)
 {
 	struct crypto_alg *base = &alg->base;
diff --git a/crypto/skcipher.h b/crypto/skcipher.h
new file mode 100644
index 000000000000..6f1295f0fef2
--- /dev/null
+++ b/crypto/skcipher.h
@@ -0,0 +1,30 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * Cryptographic API.
+ *
+ * Copyright (c) 2023 Herbert Xu <herbert@gondor.apana.org.au>
+ */
+#ifndef _LOCAL_CRYPTO_SKCIPHER_H
+#define _LOCAL_CRYPTO_SKCIPHER_H
+
+#include <crypto/internal/skcipher.h>
+#include "internal.h"
+
+static inline struct crypto_istat_cipher *skcipher_get_stat_common(
+	struct skcipher_alg_common *alg)
+{
+#ifdef CONFIG_CRYPTO_STATS
+	return &alg->stat;
+#else
+	return NULL;
+#endif
+}
+
+int crypto_lskcipher_setkey_sg(struct crypto_skcipher *tfm, const u8 *key,
+			       unsigned int keylen);
+int crypto_lskcipher_encrypt_sg(struct skcipher_request *req);
+int crypto_lskcipher_decrypt_sg(struct skcipher_request *req);
+int crypto_init_lskcipher_ops_sg(struct crypto_tfm *tfm);
+int skcipher_prepare_alg_common(struct skcipher_alg_common *alg);
+
+#endif	/* _LOCAL_CRYPTO_SKCIPHER_H */
diff --git a/include/crypto/internal/skcipher.h b/include/crypto/internal/skcipher.h
index fb3d9e899f52..4382fd707b8a 100644
--- a/include/crypto/internal/skcipher.h
+++ b/include/crypto/internal/skcipher.h
@@ -36,10 +36,25 @@ struct skcipher_instance {
 	};
 };
 
+struct lskcipher_instance {
+	void (*free)(struct lskcipher_instance *inst);
+	union {
+		struct {
+			char head[offsetof(struct lskcipher_alg, co.base)];
+			struct crypto_instance base;
+		} s;
+		struct lskcipher_alg alg;
+	};
+};
+
 struct crypto_skcipher_spawn {
 	struct crypto_spawn base;
 };
 
+struct crypto_lskcipher_spawn {
+	struct crypto_spawn base;
+};
+
 struct skcipher_walk {
 	union {
 		struct {
@@ -80,6 +95,12 @@ static inline struct crypto_instance *skcipher_crypto_instance(
 	return &inst->s.base;
 }
 
+static inline struct crypto_instance *lskcipher_crypto_instance(
+	struct lskcipher_instance *inst)
+{
+	return &inst->s.base;
+}
+
 static inline struct skcipher_instance *skcipher_alg_instance(
 	struct crypto_skcipher *skcipher)
 {
@@ -87,11 +108,23 @@ static inline struct skcipher_instance *skcipher_alg_instance(
 			    struct skcipher_instance, alg);
 }
 
+static inline struct lskcipher_instance *lskcipher_alg_instance(
+	struct crypto_lskcipher *lskcipher)
+{
+	return container_of(crypto_lskcipher_alg(lskcipher),
+			    struct lskcipher_instance, alg);
+}
+
 static inline void *skcipher_instance_ctx(struct skcipher_instance *inst)
 {
 	return crypto_instance_ctx(skcipher_crypto_instance(inst));
 }
 
+static inline void *lskcipher_instance_ctx(struct lskcipher_instance *inst)
+{
+	return crypto_instance_ctx(lskcipher_crypto_instance(inst));
+}
+
 static inline void skcipher_request_complete(struct skcipher_request *req, int err)
 {
 	crypto_request_complete(&req->base, err);
@@ -101,29 +134,56 @@ int crypto_grab_skcipher(struct crypto_skcipher_spawn *spawn,
 			 struct crypto_instance *inst,
 			 const char *name, u32 type, u32 mask);
 
+int crypto_grab_lskcipher(struct crypto_lskcipher_spawn *spawn,
+			  struct crypto_instance *inst,
+			  const char *name, u32 type, u32 mask);
+
 static inline void crypto_drop_skcipher(struct crypto_skcipher_spawn *spawn)
 {
 	crypto_drop_spawn(&spawn->base);
 }
 
+static inline void crypto_drop_lskcipher(struct crypto_lskcipher_spawn *spawn)
+{
+	crypto_drop_spawn(&spawn->base);
+}
+
 static inline struct skcipher_alg *crypto_skcipher_spawn_alg(
 	struct crypto_skcipher_spawn *spawn)
 {
 	return container_of(spawn->base.alg, struct skcipher_alg, base);
 }
 
+static inline struct lskcipher_alg *crypto_lskcipher_spawn_alg(
+	struct crypto_lskcipher_spawn *spawn)
+{
+	return container_of(spawn->base.alg, struct lskcipher_alg, co.base);
+}
+
 static inline struct skcipher_alg *crypto_spawn_skcipher_alg(
 	struct crypto_skcipher_spawn *spawn)
 {
 	return crypto_skcipher_spawn_alg(spawn);
 }
 
+static inline struct lskcipher_alg *crypto_spawn_lskcipher_alg(
+	struct crypto_lskcipher_spawn *spawn)
+{
+	return crypto_lskcipher_spawn_alg(spawn);
+}
+
 static inline struct crypto_skcipher *crypto_spawn_skcipher(
 	struct crypto_skcipher_spawn *spawn)
 {
 	return crypto_spawn_tfm2(&spawn->base);
 }
 
+static inline struct crypto_lskcipher *crypto_spawn_lskcipher(
+	struct crypto_lskcipher_spawn *spawn)
+{
+	return crypto_spawn_tfm2(&spawn->base);
+}
+
 static inline void crypto_skcipher_set_reqsize(
 	struct crypto_skcipher *skcipher, unsigned int reqsize)
 {
@@ -144,6 +204,13 @@ void crypto_unregister_skciphers(struct skcipher_alg *algs, int count);
 int skcipher_register_instance(struct crypto_template *tmpl,
 			       struct skcipher_instance *inst);
 
+int crypto_register_lskcipher(struct lskcipher_alg *alg);
+void crypto_unregister_lskcipher(struct lskcipher_alg *alg);
+int crypto_register_lskciphers(struct lskcipher_alg *algs, int count);
+void crypto_unregister_lskciphers(struct lskcipher_alg *algs, int count);
+int lskcipher_register_instance(struct crypto_template *tmpl,
+				struct lskcipher_instance *inst);
+
 int skcipher_walk_done(struct skcipher_walk *walk, int err);
 int skcipher_walk_virt(struct skcipher_walk *walk,
 		       struct skcipher_request *req,
@@ -166,6 +233,11 @@ static inline void *crypto_skcipher_ctx(struct crypto_skcipher *tfm)
 	return crypto_tfm_ctx(&tfm->base);
 }
 
+static inline void *crypto_lskcipher_ctx(struct crypto_lskcipher *tfm)
+{
+	return crypto_tfm_ctx(&tfm->base);
+}
+
 static inline void *crypto_skcipher_ctx_dma(struct crypto_skcipher *tfm)
 {
 	return crypto_tfm_ctx_dma(&tfm->base);
@@ -209,21 +281,16 @@ static inline unsigned int crypto_skcipher_alg_walksize(
 	return alg->walksize;
 }
 
-/**
- * crypto_skcipher_walksize() - obtain walk size
- * @tfm: cipher handle
- *
- * In some cases, algorithms can only perform optimally when operating on
- * multiple blocks in parallel. This is reflected by the walksize, which
- * must be a multiple of the chunksize (or equal if the concern does not
- * apply)
- *
- * Return: walk size in bytes
- */
-static inline unsigned int crypto_skcipher_walksize(
-	struct crypto_skcipher *tfm)
+static inline unsigned int crypto_lskcipher_alg_min_keysize(
+	struct lskcipher_alg *alg)
+{
+	return alg->co.min_keysize;
+}
+
+static inline unsigned int crypto_lskcipher_alg_max_keysize(
+	struct lskcipher_alg *alg)
 {
-	return crypto_skcipher_alg_walksize(crypto_skcipher_alg(tfm));
+	return alg->co.max_keysize;
 }
 
 /* Helpers for simple block cipher modes of operation */
@@ -249,5 +316,24 @@ static inline struct crypto_alg *skcipher_ialg_simple(
 	return crypto_spawn_cipher_alg(spawn);
 }
 
+static inline struct crypto_lskcipher *lskcipher_cipher_simple(
+	struct crypto_lskcipher *tfm)
+{
+	struct crypto_lskcipher **ctx = crypto_lskcipher_ctx(tfm);
+
+	return *ctx;
+}
+
+struct lskcipher_instance *lskcipher_alloc_instance_simple(
+	struct crypto_template *tmpl, struct rtattr **tb);
+
+static inline struct lskcipher_alg *lskcipher_ialg_simple(
+	struct lskcipher_instance *inst)
+{
+	struct crypto_lskcipher_spawn *spawn = lskcipher_instance_ctx(inst);
+
+	return crypto_lskcipher_spawn_alg(spawn);
+}
+
 #endif	/* _CRYPTO_INTERNAL_SKCIPHER_H */
 
diff --git a/include/crypto/skcipher.h b/include/crypto/skcipher.h
index 080d1ba3611d..a648ef5ce897 100644
--- a/include/crypto/skcipher.h
+++ b/include/crypto/skcipher.h
@@ -49,6 +49,10 @@ struct crypto_sync_skcipher {
 	struct crypto_skcipher base;
 };
 
+struct crypto_lskcipher {
+	struct crypto_tfm base;
+};
+
 /*
  * struct crypto_istat_cipher - statistics for cipher algorithm
  * @encrypt_cnt:	number of encrypt requests
@@ -65,6 +69,43 @@ struct crypto_istat_cipher {
 	atomic64_t err_cnt;
 };
 
+#ifdef CONFIG_CRYPTO_STATS
+#define SKCIPHER_ALG_COMMON_STAT struct crypto_istat_cipher stat;
+#else
+#define SKCIPHER_ALG_COMMON_STAT
+#endif
+
+/*
+ * struct skcipher_alg_common - common properties of skcipher_alg
+ * @min_keysize: Minimum key size supported by the transformation. This is the
+ *		 smallest key length supported by this transformation algorithm.
+ *		 This must be set to one of the pre-defined values as this is
+ *		 not hardware specific. Possible values for this field can be
+ *		 found via git grep "_MIN_KEY_SIZE" include/crypto/
+ * @max_keysize: Maximum key size supported by the transformation. This is the
+ *		 largest key length supported by this transformation algorithm.
+ *		 This must be set to one of the pre-defined values as this is
+ *		 not hardware specific. Possible values for this field can be
+ *		 found via git grep "_MAX_KEY_SIZE" include/crypto/
+ * @ivsize: IV size applicable for transformation. The consumer must provide an
+ *	    IV of exactly that size to perform the encrypt or decrypt operation.
+ * @chunksize: Equal to the block size except for stream ciphers such as
+ *	       CTR where it is set to the underlying block size.
+ * @stat: Statistics for cipher algorithm
+ * @base: Definition of a generic crypto algorithm.
+ */
+#define SKCIPHER_ALG_COMMON {		\
+	unsigned int min_keysize;	\
+	unsigned int max_keysize;	\
+	unsigned int ivsize;		\
+	unsigned int chunksize;		\
+					\
+	SKCIPHER_ALG_COMMON_STAT	\
+					\
+	struct crypto_alg base;		\
+}
+struct skcipher_alg_common SKCIPHER_ALG_COMMON;
+
 /**
  * struct skcipher_alg - symmetric key cipher definition
  * @min_keysize: Minimum key size supported by the transformation. This is the
@@ -120,6 +161,7 @@ struct crypto_istat_cipher {
  * 	      in parallel. Should be a multiple of chunksize.
  * @stat: Statistics for cipher algorithm
  * @base: Definition of a generic crypto algorithm.
+ * @co: see struct skcipher_alg_common
  *
  * All fields except @ivsize are mandatory and must be filled.
  */
@@ -131,17 +173,55 @@ struct skcipher_alg {
 	int (*init)(struct crypto_skcipher *tfm);
 	void (*exit)(struct crypto_skcipher *tfm);
 
-	unsigned int min_keysize;
-	unsigned int max_keysize;
-	unsigned int ivsize;
-	unsigned int chunksize;
 	unsigned int walksize;
 
-#ifdef CONFIG_CRYPTO_STATS
-	struct crypto_istat_cipher stat;
-#endif
+	union {
+		struct SKCIPHER_ALG_COMMON;
+		struct skcipher_alg_common co;
+	};
+};
 
-	struct crypto_alg base;
+/**
+ * struct lskcipher_alg - linear symmetric key cipher definition
+ * @setkey: Set key for the transformation. This function is used to either
+ *	    program a supplied key into the hardware or store the key in the
+ *	    transformation context for programming it later. Note that this
+ *	    function does modify the transformation context. This function can
+ *	    be called multiple times during the existence of the transformation
+ *	    object, so one must make sure the key is properly reprogrammed into
+ *	    the hardware. This function is also responsible for checking the key
+ *	    length for validity. In case a software fallback was put in place in
+ *	    the @cra_init call, this function might need to use the fallback if
+ *	    the algorithm doesn't support all of the key sizes.
+ * @encrypt: Encrypt a number of bytes. This function is used to encrypt
+ *	     the supplied data.  This function shall not modify
+ *	     the transformation context, as this function may be called
+ *	     in parallel with the same transformation object.  Data
+ *	     may be left over if length is not a multiple of blocks
+ *	     and there is more to come (final == false).  The number of
+ *	     left-over bytes should be returned in case of success.
+ * @decrypt: Decrypt a number of bytes. This is a reverse counterpart to
+ *	     @encrypt and the conditions are exactly the same.
+ * @init: Initialize the cryptographic transformation object. This function
+ *	  is used to initialize the cryptographic transformation object.
+ *	  This function is called only once at the instantiation time, right
+ *	  after the transformation context was allocated.
+ * @exit: Deinitialize the cryptographic transformation object. This is a
+ *	  counterpart to @init, used to remove various changes set in
+ *	  @init.
+ * @co: see struct skcipher_alg_common
+ */
+struct lskcipher_alg {
+	int (*setkey)(struct crypto_lskcipher *tfm, const u8 *key,
+	              unsigned int keylen);
+	int (*encrypt)(struct crypto_lskcipher *tfm, const u8 *src,
+		       u8 *dst, unsigned len, u8 *iv, bool final);
+	int (*decrypt)(struct crypto_lskcipher *tfm, const u8 *src,
+		       u8 *dst, unsigned len, u8 *iv, bool final);
+	int (*init)(struct crypto_lskcipher *tfm);
+	void (*exit)(struct crypto_lskcipher *tfm);
+
+	struct skcipher_alg_common co;
 };
 
 #define MAX_SYNC_SKCIPHER_REQSIZE      384
@@ -213,12 +293,36 @@ struct crypto_skcipher *crypto_alloc_skcipher(const char *alg_name,
 struct crypto_sync_skcipher *crypto_alloc_sync_skcipher(const char *alg_name,
 					      u32 type, u32 mask);
 
+
+/**
+ * crypto_alloc_lskcipher() - allocate linear symmetric key cipher handle
+ * @alg_name: is the cra_name / name or cra_driver_name / driver name of the
+ *	      lskcipher
+ * @type: specifies the type of the cipher
+ * @mask: specifies the mask for the cipher
+ *
+ * Allocate a cipher handle for an lskcipher. The returned struct
+ * crypto_lskcipher is the cipher handle that is required for any subsequent
+ * API invocation for that lskcipher.
+ *
+ * Return: allocated cipher handle in case of success; IS_ERR() is true in case
+ *	   of an error, PTR_ERR() returns the error code.
+ */
+struct crypto_lskcipher *crypto_alloc_lskcipher(const char *alg_name,
+						u32 type, u32 mask);
+
 static inline struct crypto_tfm *crypto_skcipher_tfm(
 	struct crypto_skcipher *tfm)
 {
 	return &tfm->base;
 }
 
+static inline struct crypto_tfm *crypto_lskcipher_tfm(
+	struct crypto_lskcipher *tfm)
+{
+	return &tfm->base;
+}
+
 /**
  * crypto_free_skcipher() - zeroize and free cipher handle
  * @tfm: cipher handle to be freed
@@ -235,6 +339,17 @@ static inline void crypto_free_sync_skcipher(struct crypto_sync_skcipher *tfm)
 	crypto_free_skcipher(&tfm->base);
 }
 
+/**
+ * crypto_free_lskcipher() - zeroize and free cipher handle
+ * @tfm: cipher handle to be freed
+ *
+ * If @tfm is a NULL or error pointer, this function does nothing.
+ */
+static inline void crypto_free_lskcipher(struct crypto_lskcipher *tfm)
+{
+	crypto_destroy_tfm(tfm, crypto_lskcipher_tfm(tfm));
+}
+
 /**
  * crypto_has_skcipher() - Search for the availability of an skcipher.
  * @alg_name: is the cra_name / name or cra_driver_name / driver name of the
@@ -253,6 +368,19 @@ static inline const char *crypto_skcipher_driver_name(
 	return crypto_tfm_alg_driver_name(crypto_skcipher_tfm(tfm));
 }
 
+static inline const char *crypto_lskcipher_driver_name(
+	struct crypto_lskcipher *tfm)
+{
+	return crypto_tfm_alg_driver_name(crypto_lskcipher_tfm(tfm));
+}
+
+static inline struct skcipher_alg_common *crypto_skcipher_alg_common(
+	struct crypto_skcipher *tfm)
+{
+	return container_of(crypto_skcipher_tfm(tfm)->__crt_alg,
+			    struct skcipher_alg_common, base);
+}
+
 static inline struct skcipher_alg *crypto_skcipher_alg(
 	struct crypto_skcipher *tfm)
 {
@@ -260,11 +388,24 @@ static inline struct skcipher_alg *crypto_skcipher_alg(
 			    struct skcipher_alg, base);
 }
 
+static inline struct lskcipher_alg *crypto_lskcipher_alg(
+	struct crypto_lskcipher *tfm)
+{
+	return container_of(crypto_lskcipher_tfm(tfm)->__crt_alg,
+			    struct lskcipher_alg, co.base);
+}
+
 static inline unsigned int crypto_skcipher_alg_ivsize(struct skcipher_alg *alg)
 {
 	return alg->ivsize;
 }
 
+static inline unsigned int crypto_lskcipher_alg_ivsize(
+	struct lskcipher_alg *alg)
+{
+	return alg->co.ivsize;
+}
+
 /**
  * crypto_skcipher_ivsize() - obtain IV size
  * @tfm: cipher handle
@@ -276,7 +417,7 @@ static inline unsigned int crypto_skcipher_alg_ivsize(struct skcipher_alg *alg)
  */
 static inline unsigned int crypto_skcipher_ivsize(struct crypto_skcipher *tfm)
 {
-	return crypto_skcipher_alg(tfm)->ivsize;
+	return crypto_skcipher_alg_common(tfm)->ivsize;
 }
 
 static inline unsigned int crypto_sync_skcipher_ivsize(
@@ -285,6 +426,21 @@ static inline unsigned int crypto_sync_skcipher_ivsize(
 	return crypto_skcipher_ivsize(&tfm->base);
 }
 
+/**
+ * crypto_lskcipher_ivsize() - obtain IV size
+ * @tfm: cipher handle
+ *
+ * The size of the IV for the lskcipher referenced by the cipher handle is
+ * returned. This IV size may be zero if the cipher does not need an IV.
+ *
+ * Return: IV size in bytes
+ */
+static inline unsigned int crypto_lskcipher_ivsize(
+	struct crypto_lskcipher *tfm)
+{
+	return crypto_lskcipher_alg(tfm)->co.ivsize;
+}
+
 /**
  * crypto_skcipher_blocksize() - obtain block size of cipher
  * @tfm: cipher handle
@@ -301,12 +457,34 @@ static inline unsigned int crypto_skcipher_blocksize(
 	return crypto_tfm_alg_blocksize(crypto_skcipher_tfm(tfm));
 }
 
+/**
+ * crypto_lskcipher_blocksize() - obtain block size of cipher
+ * @tfm: cipher handle
+ *
+ * The block size for the lskcipher referenced with the cipher handle is
+ * returned. The caller may use that information to allocate appropriate
+ * memory for the data returned by the encryption or decryption operation
+ *
+ * Return: block size of cipher
+ */
+static inline unsigned int crypto_lskcipher_blocksize(
+	struct crypto_lskcipher *tfm)
+{
+	return crypto_tfm_alg_blocksize(crypto_lskcipher_tfm(tfm));
+}
+
 static inline unsigned int crypto_skcipher_alg_chunksize(
 	struct skcipher_alg *alg)
 {
 	return alg->chunksize;
 }
 
+static inline unsigned int crypto_lskcipher_alg_chunksize(
+	struct lskcipher_alg *alg)
+{
+	return alg->co.chunksize;
+}
+
 /**
  * crypto_skcipher_chunksize() - obtain chunk size
  * @tfm: cipher handle
@@ -321,7 +499,24 @@ static inline unsigned int crypto_skcipher_alg_chunksize(
 static inline unsigned int crypto_skcipher_chunksize(
 	struct crypto_skcipher *tfm)
 {
-	return crypto_skcipher_alg_chunksize(crypto_skcipher_alg(tfm));
+	return crypto_skcipher_alg_common(tfm)->chunksize;
+}
+
+/**
+ * crypto_lskcipher_chunksize() - obtain chunk size
+ * @tfm: cipher handle
+ *
+ * The block size is set to one for ciphers such as CTR.  However,
+ * you still need to provide incremental updates in multiples of
+ * the underlying block size as the IV does not have sub-block
+ * granularity.  This is known in this API as the chunk size.
+ *
+ * Return: chunk size in bytes
+ */
+static inline unsigned int crypto_lskcipher_chunksize(
+	struct crypto_lskcipher *tfm)
+{
+	return crypto_lskcipher_alg_chunksize(crypto_lskcipher_alg(tfm));
 }
 
 static inline unsigned int crypto_sync_skcipher_blocksize(
@@ -336,6 +531,12 @@ static inline unsigned int crypto_skcipher_alignmask(
 	return crypto_tfm_alg_alignmask(crypto_skcipher_tfm(tfm));
 }
 
+static inline unsigned int crypto_lskcipher_alignmask(
+	struct crypto_lskcipher *tfm)
+{
+	return crypto_tfm_alg_alignmask(crypto_lskcipher_tfm(tfm));
+}
+
 static inline u32 crypto_skcipher_get_flags(struct crypto_skcipher *tfm)
 {
 	return crypto_tfm_get_flags(crypto_skcipher_tfm(tfm));
@@ -371,6 +572,23 @@ static inline void crypto_sync_skcipher_clear_flags(
 	crypto_skcipher_clear_flags(&tfm->base, flags);
 }
 
+static inline u32 crypto_lskcipher_get_flags(struct crypto_lskcipher *tfm)
+{
+	return crypto_tfm_get_flags(crypto_lskcipher_tfm(tfm));
+}
+
+static inline void crypto_lskcipher_set_flags(struct crypto_lskcipher *tfm,
+					       u32 flags)
+{
+	crypto_tfm_set_flags(crypto_lskcipher_tfm(tfm), flags);
+}
+
+static inline void crypto_lskcipher_clear_flags(struct crypto_lskcipher *tfm,
+						 u32 flags)
+{
+	crypto_tfm_clear_flags(crypto_lskcipher_tfm(tfm), flags);
+}
+
 /**
  * crypto_skcipher_setkey() - set key for cipher
  * @tfm: cipher handle
@@ -396,16 +614,47 @@ static inline int crypto_sync_skcipher_setkey(struct crypto_sync_skcipher *tfm,
 	return crypto_skcipher_setkey(&tfm->base, key, keylen);
 }
 
+/**
+ * crypto_lskcipher_setkey() - set key for cipher
+ * @tfm: cipher handle
+ * @key: buffer holding the key
+ * @keylen: length of the key in bytes
+ *
+ * The caller provided key is set for the lskcipher referenced by the cipher
+ * handle.
+ *
+ * Note, the key length determines the cipher type. Many block ciphers implement
+ * different cipher modes depending on the key size, such as AES-128 vs AES-192
+ * vs. AES-256. When providing a 16 byte key for an AES cipher handle, AES-128
+ * is performed.
+ *
+ * Return: 0 if the setting of the key was successful; < 0 if an error occurred
+ */
+int crypto_lskcipher_setkey(struct crypto_lskcipher *tfm,
+			    const u8 *key, unsigned int keylen);
+
 static inline unsigned int crypto_skcipher_min_keysize(
 	struct crypto_skcipher *tfm)
 {
-	return crypto_skcipher_alg(tfm)->min_keysize;
+	return crypto_skcipher_alg_common(tfm)->min_keysize;
 }
 
 static inline unsigned int crypto_skcipher_max_keysize(
 	struct crypto_skcipher *tfm)
 {
-	return crypto_skcipher_alg(tfm)->max_keysize;
+	return crypto_skcipher_alg_common(tfm)->max_keysize;
+}
+
+static inline unsigned int crypto_lskcipher_min_keysize(
+	struct crypto_lskcipher *tfm)
+{
+	return crypto_lskcipher_alg(tfm)->co.min_keysize;
+}
+
+static inline unsigned int crypto_lskcipher_max_keysize(
+	struct crypto_lskcipher *tfm)
+{
+	return crypto_lskcipher_alg(tfm)->co.max_keysize;
 }
 
 /**
@@ -457,6 +706,42 @@ int crypto_skcipher_encrypt(struct skcipher_request *req);
  */
 int crypto_skcipher_decrypt(struct skcipher_request *req);
 
+/**
+ * crypto_lskcipher_encrypt() - encrypt plaintext
+ * @tfm: lskcipher handle
+ * @src: source buffer
+ * @dst: destination buffer
+ * @len: number of bytes to process
+ * @iv: IV for the cipher operation which must comply with the IV size defined
+ *      by crypto_lskcipher_ivsize
+ *
+ * Encrypt plaintext data using the lskcipher handle.
+ *
+ * Return: >=0 if the cipher operation was successful, if positive
+ *	   then this many bytes have been left unprocessed;
+ *	   < 0 if an error occurred
+ */
+int crypto_lskcipher_encrypt(struct crypto_lskcipher *tfm, const u8 *src,
+			     u8 *dst, unsigned len, u8 *iv);
+
+/**
+ * crypto_lskcipher_decrypt() - decrypt ciphertext
+ * @tfm: lskcipher handle
+ * @src: source buffer
+ * @dst: destination buffer
+ * @len: number of bytes to process
+ * @iv: IV for the cipher operation which must comply with the IV size defined
+ *      by crypto_lskcipher_ivsize
+ *
+ * Decrypt ciphertext data using the lskcipher handle.
+ *
+ * Return: >=0 if the cipher operation was successful, if positive
+ *	   then this many bytes have been left unprocessed;
+ *	   < 0 if an error occurred
+ */
+int crypto_lskcipher_decrypt(struct crypto_lskcipher *tfm, const u8 *src,
+			     u8 *dst, unsigned len, u8 *iv);
+
 /**
  * DOC: Symmetric Key Cipher Request Handle
  *
diff --git a/include/linux/crypto.h b/include/linux/crypto.h
index a0780deb017a..f3c3a3b27fac 100644
--- a/include/linux/crypto.h
+++ b/include/linux/crypto.h
@@ -24,6 +24,7 @@
 #define CRYPTO_ALG_TYPE_CIPHER		0x00000001
 #define CRYPTO_ALG_TYPE_COMPRESS	0x00000002
 #define CRYPTO_ALG_TYPE_AEAD		0x00000003
+#define CRYPTO_ALG_TYPE_LSKCIPHER	0x00000004
 #define CRYPTO_ALG_TYPE_SKCIPHER	0x00000005
 #define CRYPTO_ALG_TYPE_AKCIPHER	0x00000006
 #define CRYPTO_ALG_TYPE_SIG		0x00000007
-- 
cgit v1.2.3


From 2a86f1b56a30e242caf7ee1268af68f4f49ce847 Mon Sep 17 00:00:00 2001
From: Huacai Chen <chenhuacai@loongson.cn>
Date: Wed, 20 Sep 2023 14:26:29 +0800
Subject: kasan: Cleanup the __HAVE_ARCH_SHADOW_MAP usage

As Linus suggested, __HAVE_ARCH_XYZ is "stupid" and "having historical
uses of it doesn't make it good". So migrate __HAVE_ARCH_SHADOW_MAP to
separate macros named after the respective functions.

Suggested-by: Linus Torvalds <torvalds@linux-foundation.org>
Reviewed-by: WANG Xuerui <git@xen0n.name>
Reviewed-by: Andrey Konovalov <andreyknvl@gmail.com>
Signed-off-by: Huacai Chen <chenhuacai@loongson.cn>
---
 arch/loongarch/include/asm/kasan.h | 10 ++++++++--
 include/linux/kasan.h              |  2 +-
 mm/kasan/kasan.h                   |  8 +++-----
 3 files changed, 12 insertions(+), 8 deletions(-)

(limited to 'include/linux')

diff --git a/arch/loongarch/include/asm/kasan.h b/arch/loongarch/include/asm/kasan.h
index deeff8158f45..a12ecab37da7 100644
--- a/arch/loongarch/include/asm/kasan.h
+++ b/arch/loongarch/include/asm/kasan.h
@@ -10,8 +10,6 @@
 #include <asm/io.h>
 #include <asm/pgtable.h>
 
-#define __HAVE_ARCH_SHADOW_MAP
-
 #define KASAN_SHADOW_SCALE_SHIFT 3
 #define KASAN_SHADOW_OFFSET	_AC(CONFIG_KASAN_SHADOW_OFFSET, UL)
 
@@ -68,6 +66,7 @@ static __always_inline bool kasan_arch_is_ready(void)
 	return !kasan_early_stage;
 }
 
+#define kasan_mem_to_shadow kasan_mem_to_shadow
 static inline void *kasan_mem_to_shadow(const void *addr)
 {
 	if (!kasan_arch_is_ready()) {
@@ -97,6 +96,7 @@ static inline void *kasan_mem_to_shadow(const void *addr)
 	}
 }
 
+#define kasan_shadow_to_mem kasan_shadow_to_mem
 static inline const void *kasan_shadow_to_mem(const void *shadow_addr)
 {
 	unsigned long addr = (unsigned long)shadow_addr;
@@ -119,6 +119,12 @@ static inline const void *kasan_shadow_to_mem(const void *shadow_addr)
 	}
 }
 
+#define addr_has_metadata addr_has_metadata
+static __always_inline bool addr_has_metadata(const void *addr)
+{
+	return (kasan_mem_to_shadow((void *)addr) != NULL);
+}
+
 void kasan_init(void);
 asmlinkage void kasan_early_init(void);
 
diff --git a/include/linux/kasan.h b/include/linux/kasan.h
index 3df5499f7936..842623d708c2 100644
--- a/include/linux/kasan.h
+++ b/include/linux/kasan.h
@@ -54,7 +54,7 @@ extern p4d_t kasan_early_shadow_p4d[MAX_PTRS_PER_P4D];
 int kasan_populate_early_shadow(const void *shadow_start,
 				const void *shadow_end);
 
-#ifndef __HAVE_ARCH_SHADOW_MAP
+#ifndef kasan_mem_to_shadow
 static inline void *kasan_mem_to_shadow(const void *addr)
 {
 	return (void *)((unsigned long)addr >> KASAN_SHADOW_SCALE_SHIFT)
diff --git a/mm/kasan/kasan.h b/mm/kasan/kasan.h
index f70e3d7a602e..d37831b8511c 100644
--- a/mm/kasan/kasan.h
+++ b/mm/kasan/kasan.h
@@ -291,7 +291,7 @@ struct kasan_stack_ring {
 
 #if defined(CONFIG_KASAN_GENERIC) || defined(CONFIG_KASAN_SW_TAGS)
 
-#ifndef __HAVE_ARCH_SHADOW_MAP
+#ifndef kasan_shadow_to_mem
 static inline const void *kasan_shadow_to_mem(const void *shadow_addr)
 {
 	return (void *)(((unsigned long)shadow_addr - KASAN_SHADOW_OFFSET)
@@ -299,15 +299,13 @@ static inline const void *kasan_shadow_to_mem(const void *shadow_addr)
 }
 #endif
 
+#ifndef addr_has_metadata
 static __always_inline bool addr_has_metadata(const void *addr)
 {
-#ifdef __HAVE_ARCH_SHADOW_MAP
-	return (kasan_mem_to_shadow((void *)addr) != NULL);
-#else
 	return (kasan_reset_tag(addr) >=
 		kasan_shadow_to_mem((void *)KASAN_SHADOW_START));
-#endif
 }
+#endif
 
 /**
  * kasan_check_range - Check memory region, and report if invalid access.
-- 
cgit v1.2.3


From 653b7eb9d74426397c95061fd57da3063625af65 Mon Sep 17 00:00:00 2001
From: Erez Shitrit <erezsh@nvidia.com>
Date: Mon, 28 Aug 2023 14:20:00 +0300
Subject: net/mlx5: Bridge, Enable mcast in smfs steering mode

In order to have mcast offloads the driver needs the following:
It should know if that mcast comes from wire port, in addition the flow
should not be marked as any specific source, that way it will give the
flexibility for the driver not to be depended on the way iterator
implemented in the FW.

Signed-off-by: Erez Shitrit <erezsh@nvidia.com>
Reviewed-by: Moshe Shemesh <moshe@nvidia.com>
Reviewed-by: Vlad Buslov <vladbu@nvidia.com>
Signed-off-by: Saeed Mahameed <saeedm@nvidia.com>
---
 drivers/net/ethernet/mellanox/mlx5/core/esw/bridge_mcast.c | 11 ++---------
 include/linux/mlx5/fs.h                                    |  1 +
 2 files changed, 3 insertions(+), 9 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/esw/bridge_mcast.c b/drivers/net/ethernet/mellanox/mlx5/core/esw/bridge_mcast.c
index 7a01714b3780..a7ed87e9d842 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/esw/bridge_mcast.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/esw/bridge_mcast.c
@@ -78,6 +78,8 @@ mlx5_esw_bridge_mdb_flow_create(u16 esw_owner_vhca_id, struct mlx5_esw_bridge_md
 	xa_for_each(&entry->ports, idx, port) {
 		dests[i].type = MLX5_FLOW_DESTINATION_TYPE_FLOW_TABLE;
 		dests[i].ft = port->mcast.ft;
+		if (port->vport_num == MLX5_VPORT_UPLINK)
+			dests[i].ft->flags |= MLX5_FLOW_TABLE_UPLINK_VPORT;
 		i++;
 	}
 
@@ -585,10 +587,6 @@ mlx5_esw_bridge_mcast_vlan_flow_create(u16 vlan_proto, struct mlx5_esw_bridge_po
 	if (!rule_spec)
 		return ERR_PTR(-ENOMEM);
 
-	if (MLX5_CAP_ESW_FLOWTABLE(bridge->br_offloads->esw->dev, flow_source) &&
-	    port->vport_num == MLX5_VPORT_UPLINK)
-		rule_spec->flow_context.flow_source =
-			MLX5_FLOW_CONTEXT_FLOW_SOURCE_LOCAL_VPORT;
 	rule_spec->match_criteria_enable = MLX5_MATCH_OUTER_HEADERS;
 
 	flow_act.action |= MLX5_FLOW_CONTEXT_ACTION_PACKET_REFORMAT;
@@ -660,11 +658,6 @@ mlx5_esw_bridge_mcast_fwd_flow_create(struct mlx5_esw_bridge_port *port)
 	if (!rule_spec)
 		return ERR_PTR(-ENOMEM);
 
-	if (MLX5_CAP_ESW_FLOWTABLE(bridge->br_offloads->esw->dev, flow_source) &&
-	    port->vport_num == MLX5_VPORT_UPLINK)
-		rule_spec->flow_context.flow_source =
-			MLX5_FLOW_CONTEXT_FLOW_SOURCE_LOCAL_VPORT;
-
 	if (MLX5_CAP_ESW(bridge->br_offloads->esw->dev, merged_eswitch)) {
 		dest.vport.flags = MLX5_FLOW_DEST_VPORT_VHCA_ID;
 		dest.vport.vhca_id = port->esw_owner_vhca_id;
diff --git a/include/linux/mlx5/fs.h b/include/linux/mlx5/fs.h
index 1e00c2436377..6f7725238abc 100644
--- a/include/linux/mlx5/fs.h
+++ b/include/linux/mlx5/fs.h
@@ -67,6 +67,7 @@ enum {
 	MLX5_FLOW_TABLE_TERMINATION = BIT(2),
 	MLX5_FLOW_TABLE_UNMANAGED = BIT(3),
 	MLX5_FLOW_TABLE_OTHER_VPORT = BIT(4),
+	MLX5_FLOW_TABLE_UPLINK_VPORT = BIT(5),
 };
 
 #define LEFTOVERS_RULE_NUM	 2
-- 
cgit v1.2.3


From e0cc92fd945a9c8e43d4268cf1ea985d0e99a90f Mon Sep 17 00:00:00 2001
From: Moshe Shemesh <moshe@nvidia.com>
Date: Thu, 3 Aug 2023 16:39:40 +0300
Subject: net/mlx5: Add a health error syndrome for pci data poisoned

Add new health error syndrome to indicate that pci data poisoned error
has been received while fetching device ICM data.

Signed-off-by: Moshe Shemesh <moshe@nvidia.com>
Reviewed-by: Shay Drory <shayd@nvidia.com>
Signed-off-by: Saeed Mahameed <saeedm@nvidia.com>
---
 drivers/net/ethernet/mellanox/mlx5/core/health.c | 2 ++
 include/linux/mlx5/mlx5_ifc.h                    | 1 +
 2 files changed, 3 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/health.c b/drivers/net/ethernet/mellanox/mlx5/core/health.c
index 2fb2598b775e..1c220048ae9a 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/health.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/health.c
@@ -365,6 +365,8 @@ static const char *hsynd_str(u8 synd)
 		return "FFSER error";
 	case MLX5_INITIAL_SEG_HEALTH_SYNDROME_HIGH_TEMP_ERR:
 		return "High temperature";
+	case MLX5_INITIAL_SEG_HEALTH_SYNDROME_ICM_PCI_POISONED_ERR:
+		return "ICM fetch PCI data poisoned error";
 	default:
 		return "unrecognized error";
 	}
diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h
index dd8421d021cf..b23d8ff286a1 100644
--- a/include/linux/mlx5/mlx5_ifc.h
+++ b/include/linux/mlx5/mlx5_ifc.h
@@ -10574,6 +10574,7 @@ enum {
 	MLX5_INITIAL_SEG_HEALTH_SYNDROME_EQ_INV                       = 0xe,
 	MLX5_INITIAL_SEG_HEALTH_SYNDROME_FFSER_ERR                    = 0xf,
 	MLX5_INITIAL_SEG_HEALTH_SYNDROME_HIGH_TEMP_ERR                = 0x10,
+	MLX5_INITIAL_SEG_HEALTH_SYNDROME_ICM_PCI_POISONED_ERR         = 0x12,
 };
 
 struct mlx5_ifc_initial_seg_bits {
-- 
cgit v1.2.3


From 6b596e62ed9f90c4a97e68ae1f7b1af5beeb3c05 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Fri, 8 Sep 2023 18:22:51 +0200
Subject: sched: Provide rt_mutex specific scheduler helpers

With PREEMPT_RT there is a rt_mutex recursion problem where
sched_submit_work() can use an rtlock (aka spinlock_t). More
specifically what happens is:

  mutex_lock() /* really rt_mutex */
    ...
      __rt_mutex_slowlock_locked()
	task_blocks_on_rt_mutex()
          // enqueue current task as waiter
          // do PI chain walk
        rt_mutex_slowlock_block()
          schedule()
            sched_submit_work()
              ...
              spin_lock() /* really rtlock */
                ...
                  __rt_mutex_slowlock_locked()
                    task_blocks_on_rt_mutex()
                      // enqueue current task as waiter *AGAIN*
                      // *CONFUSION*

Fix this by making rt_mutex do the sched_submit_work() early, before
it enqueues itself as a waiter -- before it even knows *if* it will
wait.

[[ basically Thomas' patch but with different naming and a few asserts
   added ]]

Originally-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lkml.kernel.org/r/20230908162254.999499-5-bigeasy@linutronix.de
---
 include/linux/sched.h    |  3 +++
 include/linux/sched/rt.h |  4 ++++
 kernel/sched/core.c      | 36 ++++++++++++++++++++++++++++++++----
 3 files changed, 39 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 77f01ac385f7..67623ffd4a8e 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -911,6 +911,9 @@ struct task_struct {
 	 * ->sched_remote_wakeup gets used, so it can be in this word.
 	 */
 	unsigned			sched_remote_wakeup:1;
+#ifdef CONFIG_RT_MUTEXES
+	unsigned			sched_rt_mutex:1;
+#endif
 
 	/* Bit to tell LSMs we're in execve(): */
 	unsigned			in_execve:1;
diff --git a/include/linux/sched/rt.h b/include/linux/sched/rt.h
index 994c25640e15..b2b9e6eb9683 100644
--- a/include/linux/sched/rt.h
+++ b/include/linux/sched/rt.h
@@ -30,6 +30,10 @@ static inline bool task_is_realtime(struct task_struct *tsk)
 }
 
 #ifdef CONFIG_RT_MUTEXES
+extern void rt_mutex_pre_schedule(void);
+extern void rt_mutex_schedule(void);
+extern void rt_mutex_post_schedule(void);
+
 /*
  * Must hold either p->pi_lock or task_rq(p)->lock.
  */
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 1ea7ba53aad2..58d0346d1bb3 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -6723,9 +6723,6 @@ static inline void sched_submit_work(struct task_struct *tsk)
 	static DEFINE_WAIT_OVERRIDE_MAP(sched_map, LD_WAIT_CONFIG);
 	unsigned int task_flags;
 
-	if (task_is_running(tsk))
-		return;
-
 	/*
 	 * Establish LD_WAIT_CONFIG context to ensure none of the code called
 	 * will use a blocking primitive -- which would lead to recursion.
@@ -6783,7 +6780,12 @@ asmlinkage __visible void __sched schedule(void)
 {
 	struct task_struct *tsk = current;
 
-	sched_submit_work(tsk);
+#ifdef CONFIG_RT_MUTEXES
+	lockdep_assert(!tsk->sched_rt_mutex);
+#endif
+
+	if (!task_is_running(tsk))
+		sched_submit_work(tsk);
 	__schedule_loop(SM_NONE);
 	sched_update_worker(tsk);
 }
@@ -7044,6 +7046,32 @@ static void __setscheduler_prio(struct task_struct *p, int prio)
 
 #ifdef CONFIG_RT_MUTEXES
 
+/*
+ * Would be more useful with typeof()/auto_type but they don't mix with
+ * bit-fields. Since it's a local thing, use int. Keep the generic sounding
+ * name such that if someone were to implement this function we get to compare
+ * notes.
+ */
+#define fetch_and_set(x, v) ({ int _x = (x); (x) = (v); _x; })
+
+void rt_mutex_pre_schedule(void)
+{
+	lockdep_assert(!fetch_and_set(current->sched_rt_mutex, 1));
+	sched_submit_work(current);
+}
+
+void rt_mutex_schedule(void)
+{
+	lockdep_assert(current->sched_rt_mutex);
+	__schedule_loop(SM_NONE);
+}
+
+void rt_mutex_post_schedule(void)
+{
+	sched_update_worker(current);
+	lockdep_assert(fetch_and_set(current->sched_rt_mutex, 0));
+}
+
 static inline int __rt_effective_prio(struct task_struct *pi_task, int prio)
 {
 	if (pi_task)
-- 
cgit v1.2.3


From 6d2779ecaeb56f92d7105c56772346c71c88c278 Mon Sep 17 00:00:00 2001
From: Mark Rutland <mark.rutland@arm.com>
Date: Tue, 19 Sep 2023 18:14:29 +0100
Subject: locking/atomic: scripts: fix fallback ifdeffery

Since commit:

  9257959a6e5b4fca ("locking/atomic: scripts: restructure fallback ifdeffery")

The ordering fallbacks for atomic*_read_acquire() and
atomic*_set_release() erroneously fall back to the implictly relaxed
atomic*_read() and atomic*_set() variants respectively, without any
additional barriers. This loses the ACQUIRE and RELEASE ordering
semantics, which can result in a wide variety of problems, even on
strongly-ordered architectures where the implementation of
atomic*_read() and/or atomic*_set() allows the compiler to reorder those
relative to other accesses.

In practice this has been observed to break bit spinlocks on arm64,
resulting in dentry cache corruption.

The fallback logic was intended to allow ACQUIRE/RELEASE/RELAXED ops to
be defined in terms of FULL ops, but where an op had RELAXED ordering by
default, this unintentionally permitted the ACQUIRE/RELEASE ops to be
defined in terms of the implicitly RELAXED default.

This patch corrects the logic to avoid falling back to implicitly
RELAXED ops, resulting in the same behaviour as prior to commit
9257959a6e5b4fca.

I've verified the resulting assembly on arm64 by generating outlined
wrappers of the atomics. Prior to this patch the compiler generates
sequences using relaxed load (LDR) and store (STR) instructions, e.g.

| <outlined_atomic64_read_acquire>:
|         ldr     x0, [x0]
|         ret
|
| <outlined_atomic64_set_release>:
|         str     x1, [x0]
|         ret

With this patch applied the compiler generates sequences using the
intended load-acquire (LDAR) and store-release (STLR) instructions, e.g.

| <outlined_atomic64_read_acquire>:
|         ldar    x0, [x0]
|         ret
|
| <outlined_atomic64_set_release>:
|         stlr    x1, [x0]
|         ret

To make sure that there were no other victims of the ifdeffery rewrite,
I generated outlined copies of all of the {atomic,atomic64,atomic_long}
atomic operations before and after commit 9257959a6e5b4fca. A diff of
the generated assembly on arm64 shows that only the read_acquire() and
set_release() operations were changed, and only lost their intended
ordering:

| [mark@lakrids:~/src/linux]% diff -u \
| 	<(aarch64-linux-gnu-objdump -d before-9257959a6e5b4fca.o)
| 	<(aarch64-linux-gnu-objdump -d after-9257959a6e5b4fca.o)
| --- /proc/self/fd/11    2023-09-19 16:51:51.114779415 +0100
| +++ /proc/self/fd/16    2023-09-19 16:51:51.114779415 +0100
| @@ -1,5 +1,5 @@
|
| -before-9257959a6e5b4fca.o:     file format elf64-littleaarch64
| +after-9257959a6e5b4fca.o:     file format elf64-littleaarch64
|
|
|  Disassembly of section .text:
| @@ -9,7 +9,7 @@
|         4:      d65f03c0        ret
|
|  0000000000000008 <outlined_atomic_read_acquire>:
| -       8:      88dffc00        ldar    w0, [x0]
| +       8:      b9400000        ldr     w0, [x0]
|         c:      d65f03c0        ret
|
|  0000000000000010 <outlined_atomic_set>:
| @@ -17,7 +17,7 @@
|        14:      d65f03c0        ret
|
|  0000000000000018 <outlined_atomic_set_release>:
| -      18:      889ffc01        stlr    w1, [x0]
| +      18:      b9000001        str     w1, [x0]
|        1c:      d65f03c0        ret
|
|  0000000000000020 <outlined_atomic_add>:
| @@ -1230,7 +1230,7 @@
|      1070:      d65f03c0        ret
|
|  0000000000001074 <outlined_atomic64_read_acquire>:
| -    1074:      c8dffc00        ldar    x0, [x0]
| +    1074:      f9400000        ldr     x0, [x0]
|      1078:      d65f03c0        ret
|
|  000000000000107c <outlined_atomic64_set>:
| @@ -1238,7 +1238,7 @@
|      1080:      d65f03c0        ret
|
|  0000000000001084 <outlined_atomic64_set_release>:
| -    1084:      c89ffc01        stlr    x1, [x0]
| +    1084:      f9000001        str     x1, [x0]
|      1088:      d65f03c0        ret
|
|  000000000000108c <outlined_atomic64_add>:
| @@ -2427,7 +2427,7 @@
|      207c:      d65f03c0        ret
|
|  0000000000002080 <outlined_atomic_long_read_acquire>:
| -    2080:      c8dffc00        ldar    x0, [x0]
| +    2080:      f9400000        ldr     x0, [x0]
|      2084:      d65f03c0        ret
|
|  0000000000002088 <outlined_atomic_long_set>:
| @@ -2435,7 +2435,7 @@
|      208c:      d65f03c0        ret
|
|  0000000000002090 <outlined_atomic_long_set_release>:
| -    2090:      c89ffc01        stlr    x1, [x0]
| +    2090:      f9000001        str     x1, [x0]
|      2094:      d65f03c0        ret
|
|  0000000000002098 <outlined_atomic_long_add>:

I've build tested this with a variety of configs for alpha, arm, arm64,
csky, i386, m68k, microblaze, mips, nios2, openrisc, powerpc, riscv,
s390, sh, sparc, x86_64, and xtensa, for which I've seen no issues. I
was unable to build test for ia64 and parisc due to existing build
breakage in v6.6-rc2.

Fixes: 9257959a6e5b4fca ("locking/atomic: scripts: restructure fallback ifdeffery")
Reported-by: Ming Lei <ming.lei@redhat.com>
Reported-by: Darrick J. Wong <djwong@kernel.org>
Signed-off-by: Mark Rutland <mark.rutland@arm.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Tested-by: Baokun Li <libaokun1@huawei.com>
Link: https://lkml.kernel.org/r/20230919171430.2697727-1-mark.rutland@arm.com
---
 include/linux/atomic/atomic-arch-fallback.h | 10 +---------
 scripts/atomic/gen-atomic-fallback.sh       |  2 +-
 2 files changed, 2 insertions(+), 10 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/atomic/atomic-arch-fallback.h b/include/linux/atomic/atomic-arch-fallback.h
index 18f5744dfb5d..b83ef19da13d 100644
--- a/include/linux/atomic/atomic-arch-fallback.h
+++ b/include/linux/atomic/atomic-arch-fallback.h
@@ -459,8 +459,6 @@ raw_atomic_read_acquire(const atomic_t *v)
 {
 #if defined(arch_atomic_read_acquire)
 	return arch_atomic_read_acquire(v);
-#elif defined(arch_atomic_read)
-	return arch_atomic_read(v);
 #else
 	int ret;
 
@@ -508,8 +506,6 @@ raw_atomic_set_release(atomic_t *v, int i)
 {
 #if defined(arch_atomic_set_release)
 	arch_atomic_set_release(v, i);
-#elif defined(arch_atomic_set)
-	arch_atomic_set(v, i);
 #else
 	if (__native_word(atomic_t)) {
 		smp_store_release(&(v)->counter, i);
@@ -2575,8 +2571,6 @@ raw_atomic64_read_acquire(const atomic64_t *v)
 {
 #if defined(arch_atomic64_read_acquire)
 	return arch_atomic64_read_acquire(v);
-#elif defined(arch_atomic64_read)
-	return arch_atomic64_read(v);
 #else
 	s64 ret;
 
@@ -2624,8 +2618,6 @@ raw_atomic64_set_release(atomic64_t *v, s64 i)
 {
 #if defined(arch_atomic64_set_release)
 	arch_atomic64_set_release(v, i);
-#elif defined(arch_atomic64_set)
-	arch_atomic64_set(v, i);
 #else
 	if (__native_word(atomic64_t)) {
 		smp_store_release(&(v)->counter, i);
@@ -4657,4 +4649,4 @@ raw_atomic64_dec_if_positive(atomic64_t *v)
 }
 
 #endif /* _LINUX_ATOMIC_FALLBACK_H */
-// 202b45c7db600ce36198eb1f1fc2c2d5268ace2d
+// 2fdd6702823fa842f9cea57a002e6e4476ae780c
diff --git a/scripts/atomic/gen-atomic-fallback.sh b/scripts/atomic/gen-atomic-fallback.sh
index c0c8a85d7c81..a45154cefa48 100755
--- a/scripts/atomic/gen-atomic-fallback.sh
+++ b/scripts/atomic/gen-atomic-fallback.sh
@@ -102,7 +102,7 @@ gen_proto_order_variant()
 	fi
 
 	# Allow ACQUIRE/RELEASE/RELAXED ops to be defined in terms of FULL ops
-	if [ ! -z "${order}" ]; then
+	if [ ! -z "${order}" ] && ! meta_is_implicitly_relaxed "${meta}"; then
 		printf "#elif defined(arch_${basename})\n"
 		printf "\t${retstmt}arch_${basename}(${args});\n"
 	fi
-- 
cgit v1.2.3


From 03a95cf233b5bdd65ddd4eca63cd4874b6b84d50 Mon Sep 17 00:00:00 2001
From: Cristian Marussi <cristian.marussi@arm.com>
Date: Sat, 26 Aug 2023 13:53:03 +0100
Subject: firmware: arm_scmi: Simplify enable/disable clock operations

SCMI clock enable/disable operations come in 2 different flavours which
simply just differ in how the underlying SCMI transactions is carried on:
atomic or not.

Currently we expose such SCMI operations through 2 distinctly named
wrappers, that, in turn, are wrapped into another couple of similarly and
distinctly named callbacks inside SCMI clock driver user.

Reduce the churn of duplicated wrappers by adding a param to SCMI clock
enable/disable operations to ask for atomic operation while removing the
_atomic version of such operations.

No functional change.

CC: Michael Turquette <mturquette@baylibre.com>
CC: Stephen Boyd <sboyd@kernel.org>
CC: linux-clk@vger.kernel.org
Signed-off-by: Cristian Marussi <cristian.marussi@arm.com>
Acked-by: Stephen Boyd <sboyd@kernel.org>
Link: https://lore.kernel.org/r/20230826125308.462328-2-cristian.marussi@arm.com
Signed-off-by: Sudeep Holla <sudeep.holla@arm.com>
---
 drivers/clk/clk-scmi.c            | 11 +++++++----
 drivers/firmware/arm_scmi/clock.c | 24 ++++++------------------
 include/linux/scmi_protocol.h     |  9 ++++-----
 3 files changed, 17 insertions(+), 27 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/clk/clk-scmi.c b/drivers/clk/clk-scmi.c
index 2c7a830ce308..b7a180b3443e 100644
--- a/drivers/clk/clk-scmi.c
+++ b/drivers/clk/clk-scmi.c
@@ -13,6 +13,9 @@
 #include <linux/scmi_protocol.h>
 #include <asm/div64.h>
 
+#define NOT_ATOMIC	false
+#define ATOMIC		true
+
 static const struct scmi_clk_proto_ops *scmi_proto_clk_ops;
 
 struct scmi_clk {
@@ -78,28 +81,28 @@ static int scmi_clk_enable(struct clk_hw *hw)
 {
 	struct scmi_clk *clk = to_scmi_clk(hw);
 
-	return scmi_proto_clk_ops->enable(clk->ph, clk->id);
+	return scmi_proto_clk_ops->enable(clk->ph, clk->id, NOT_ATOMIC);
 }
 
 static void scmi_clk_disable(struct clk_hw *hw)
 {
 	struct scmi_clk *clk = to_scmi_clk(hw);
 
-	scmi_proto_clk_ops->disable(clk->ph, clk->id);
+	scmi_proto_clk_ops->disable(clk->ph, clk->id, NOT_ATOMIC);
 }
 
 static int scmi_clk_atomic_enable(struct clk_hw *hw)
 {
 	struct scmi_clk *clk = to_scmi_clk(hw);
 
-	return scmi_proto_clk_ops->enable_atomic(clk->ph, clk->id);
+	return scmi_proto_clk_ops->enable(clk->ph, clk->id, ATOMIC);
 }
 
 static void scmi_clk_atomic_disable(struct clk_hw *hw)
 {
 	struct scmi_clk *clk = to_scmi_clk(hw);
 
-	scmi_proto_clk_ops->disable_atomic(clk->ph, clk->id);
+	scmi_proto_clk_ops->disable(clk->ph, clk->id, ATOMIC);
 }
 
 /*
diff --git a/drivers/firmware/arm_scmi/clock.c b/drivers/firmware/arm_scmi/clock.c
index 96060bf90a24..447d29b5fc72 100644
--- a/drivers/firmware/arm_scmi/clock.c
+++ b/drivers/firmware/arm_scmi/clock.c
@@ -418,26 +418,16 @@ scmi_clock_config_set(const struct scmi_protocol_handle *ph, u32 clk_id,
 	return ret;
 }
 
-static int scmi_clock_enable(const struct scmi_protocol_handle *ph, u32 clk_id)
+static int scmi_clock_enable(const struct scmi_protocol_handle *ph, u32 clk_id,
+			     bool atomic)
 {
-	return scmi_clock_config_set(ph, clk_id, CLOCK_ENABLE, false);
+	return scmi_clock_config_set(ph, clk_id, CLOCK_ENABLE, atomic);
 }
 
-static int scmi_clock_disable(const struct scmi_protocol_handle *ph, u32 clk_id)
+static int scmi_clock_disable(const struct scmi_protocol_handle *ph, u32 clk_id,
+			      bool atomic)
 {
-	return scmi_clock_config_set(ph, clk_id, 0, false);
-}
-
-static int scmi_clock_enable_atomic(const struct scmi_protocol_handle *ph,
-				    u32 clk_id)
-{
-	return scmi_clock_config_set(ph, clk_id, CLOCK_ENABLE, true);
-}
-
-static int scmi_clock_disable_atomic(const struct scmi_protocol_handle *ph,
-				     u32 clk_id)
-{
-	return scmi_clock_config_set(ph, clk_id, 0, true);
+	return scmi_clock_config_set(ph, clk_id, 0, atomic);
 }
 
 static int scmi_clock_count_get(const struct scmi_protocol_handle *ph)
@@ -470,8 +460,6 @@ static const struct scmi_clk_proto_ops clk_proto_ops = {
 	.rate_set = scmi_clock_rate_set,
 	.enable = scmi_clock_enable,
 	.disable = scmi_clock_disable,
-	.enable_atomic = scmi_clock_enable_atomic,
-	.disable_atomic = scmi_clock_disable_atomic,
 };
 
 static int scmi_clk_rate_notify(const struct scmi_protocol_handle *ph,
diff --git a/include/linux/scmi_protocol.h b/include/linux/scmi_protocol.h
index e6fe4f73ffe6..b4c631a8d0ac 100644
--- a/include/linux/scmi_protocol.h
+++ b/include/linux/scmi_protocol.h
@@ -90,11 +90,10 @@ struct scmi_clk_proto_ops {
 			u64 *rate);
 	int (*rate_set)(const struct scmi_protocol_handle *ph, u32 clk_id,
 			u64 rate);
-	int (*enable)(const struct scmi_protocol_handle *ph, u32 clk_id);
-	int (*disable)(const struct scmi_protocol_handle *ph, u32 clk_id);
-	int (*enable_atomic)(const struct scmi_protocol_handle *ph, u32 clk_id);
-	int (*disable_atomic)(const struct scmi_protocol_handle *ph,
-			      u32 clk_id);
+	int (*enable)(const struct scmi_protocol_handle *ph, u32 clk_id,
+		      bool atomic);
+	int (*disable)(const struct scmi_protocol_handle *ph, u32 clk_id,
+		       bool atomic);
 };
 
 /**
-- 
cgit v1.2.3


From 34592bf0a5cb0681ce3d64de5598951768f43328 Mon Sep 17 00:00:00 2001
From: Cristian Marussi <cristian.marussi@arm.com>
Date: Sat, 26 Aug 2023 13:53:05 +0100
Subject: firmware: arm_scmi: Add v3.2 clock CONFIG_GET support

Add support for v3.2 clock CONFIG_GET command and related new clock
protocol operation state_get() to retrieve the status of a clock.

Signed-off-by: Cristian Marussi <cristian.marussi@arm.com>
Link: https://lore.kernel.org/r/20230826125308.462328-4-cristian.marussi@arm.com
Signed-off-by: Sudeep Holla <sudeep.holla@arm.com>
---
 drivers/firmware/arm_scmi/clock.c | 64 +++++++++++++++++++++++++++++++++++++++
 include/linux/scmi_protocol.h     |  3 ++
 2 files changed, 67 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/firmware/arm_scmi/clock.c b/drivers/firmware/arm_scmi/clock.c
index 63bd043365cd..aaa95624493d 100644
--- a/drivers/firmware/arm_scmi/clock.c
+++ b/drivers/firmware/arm_scmi/clock.c
@@ -21,6 +21,7 @@ enum scmi_clock_protocol_cmd {
 	CLOCK_NAME_GET = 0x8,
 	CLOCK_RATE_NOTIFY = 0x9,
 	CLOCK_RATE_CHANGE_REQUESTED_NOTIFY = 0xA,
+	CLOCK_CONFIG_GET = 0xB,
 };
 
 enum clk_state {
@@ -59,6 +60,19 @@ struct scmi_msg_clock_config_set_v21 {
 	__le32 oem_config_val;
 };
 
+struct scmi_msg_clock_config_get {
+	__le32 id;
+	__le32 flags;
+#define REGMASK_OEM_TYPE_GET		GENMASK(7, 0)
+};
+
+struct scmi_msg_resp_clock_config_get {
+	__le32 attributes;
+	__le32 config;
+#define IS_CLK_ENABLED(x)		le32_get_bits((x), BIT(0))
+	__le32 oem_config_val;
+};
+
 struct scmi_msg_clock_describe_rates {
 	__le32 id;
 	__le32 rate_index;
@@ -496,6 +510,55 @@ static int scmi_clock_disable(const struct scmi_protocol_handle *ph, u32 clk_id,
 				    NULL_OEM_TYPE, 0, atomic);
 }
 
+static int
+scmi_clock_config_get(const struct scmi_protocol_handle *ph, u32 clk_id,
+		      u8 oem_type, u32 *attributes, bool *enabled,
+		      u32 *oem_val, bool atomic)
+{
+	int ret;
+	u32 flags;
+	struct scmi_xfer *t;
+	struct scmi_msg_clock_config_get *cfg;
+
+	ret = ph->xops->xfer_get_init(ph, CLOCK_CONFIG_GET,
+				      sizeof(*cfg), 0, &t);
+	if (ret)
+		return ret;
+
+	t->hdr.poll_completion = atomic;
+
+	flags = FIELD_PREP(REGMASK_OEM_TYPE_GET, oem_type);
+
+	cfg = t->tx.buf;
+	cfg->id = cpu_to_le32(clk_id);
+	cfg->flags = cpu_to_le32(flags);
+
+	ret = ph->xops->do_xfer(ph, t);
+	if (!ret) {
+		struct scmi_msg_resp_clock_config_get *resp = t->rx.buf;
+
+		if (attributes)
+			*attributes = le32_to_cpu(resp->attributes);
+
+		if (enabled)
+			*enabled = IS_CLK_ENABLED(resp->config);
+
+		if (oem_val && oem_type)
+			*oem_val = le32_to_cpu(resp->oem_config_val);
+	}
+
+	ph->xops->xfer_put(ph, t);
+
+	return ret;
+}
+
+static int scmi_clock_state_get(const struct scmi_protocol_handle *ph,
+				u32 clk_id, bool *enabled, bool atomic)
+{
+	return scmi_clock_config_get(ph, clk_id, NULL_OEM_TYPE, NULL,
+				     enabled, NULL, atomic);
+}
+
 static int scmi_clock_count_get(const struct scmi_protocol_handle *ph)
 {
 	struct clock_info *ci = ph->get_priv(ph);
@@ -526,6 +589,7 @@ static const struct scmi_clk_proto_ops clk_proto_ops = {
 	.rate_set = scmi_clock_rate_set,
 	.enable = scmi_clock_enable,
 	.disable = scmi_clock_disable,
+	.state_get = scmi_clock_state_get,
 };
 
 static int scmi_clk_rate_notify(const struct scmi_protocol_handle *ph,
diff --git a/include/linux/scmi_protocol.h b/include/linux/scmi_protocol.h
index b4c631a8d0ac..d11ca4286d57 100644
--- a/include/linux/scmi_protocol.h
+++ b/include/linux/scmi_protocol.h
@@ -80,6 +80,7 @@ struct scmi_protocol_handle;
  * @rate_set: set the clock rate of a clock
  * @enable: enables the specified clock
  * @disable: disables the specified clock
+ * @state_get: get the status of the specified clock
  */
 struct scmi_clk_proto_ops {
 	int (*count_get)(const struct scmi_protocol_handle *ph);
@@ -94,6 +95,8 @@ struct scmi_clk_proto_ops {
 		      bool atomic);
 	int (*disable)(const struct scmi_protocol_handle *ph, u32 clk_id,
 		       bool atomic);
+	int (*state_get)(const struct scmi_protocol_handle *ph, u32 clk_id,
+			 bool *enabled, bool atomic);
 };
 
 /**
-- 
cgit v1.2.3


From 141b4fa0362569138653cf0165d92d48576db3fa Mon Sep 17 00:00:00 2001
From: Cristian Marussi <cristian.marussi@arm.com>
Date: Sat, 26 Aug 2023 13:53:08 +0100
Subject: firmware: arm_scmi: Add clock OEM config clock operations

Expose a couple of new SCMI clock operations to get and set OEM specific
clock configurations when talking to an SCMI v3.2 compliant.

Issuing such requests against an SCMI platform server not supporting v3.2
extension for OEM specific clock configurations will fail.

Signed-off-by: Cristian Marussi <cristian.marussi@arm.com>
Link: https://lore.kernel.org/r/20230826125308.462328-7-cristian.marussi@arm.com
Signed-off-by: Sudeep Holla <sudeep.holla@arm.com>
---
 drivers/firmware/arm_scmi/clock.c | 22 ++++++++++++++++++++++
 include/linux/scmi_protocol.h     |  7 +++++++
 2 files changed, 29 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/firmware/arm_scmi/clock.c b/drivers/firmware/arm_scmi/clock.c
index 333d08822f77..d18bf789fc24 100644
--- a/drivers/firmware/arm_scmi/clock.c
+++ b/drivers/firmware/arm_scmi/clock.c
@@ -594,6 +594,26 @@ static int scmi_clock_state_get(const struct scmi_protocol_handle *ph,
 				    enabled, NULL, atomic);
 }
 
+static int scmi_clock_config_oem_set(const struct scmi_protocol_handle *ph,
+				     u32 clk_id, u8 oem_type, u32 oem_val,
+				     bool atomic)
+{
+	struct clock_info *ci = ph->get_priv(ph);
+
+	return ci->clock_config_set(ph, clk_id, CLK_STATE_UNCHANGED,
+				    oem_type, oem_val, atomic);
+}
+
+static int scmi_clock_config_oem_get(const struct scmi_protocol_handle *ph,
+				     u32 clk_id, u8 oem_type, u32 *oem_val,
+				     u32 *attributes, bool atomic)
+{
+	struct clock_info *ci = ph->get_priv(ph);
+
+	return ci->clock_config_get(ph, clk_id, oem_type, attributes,
+				    NULL, oem_val, atomic);
+}
+
 static int scmi_clock_count_get(const struct scmi_protocol_handle *ph)
 {
 	struct clock_info *ci = ph->get_priv(ph);
@@ -625,6 +645,8 @@ static const struct scmi_clk_proto_ops clk_proto_ops = {
 	.enable = scmi_clock_enable,
 	.disable = scmi_clock_disable,
 	.state_get = scmi_clock_state_get,
+	.config_oem_get = scmi_clock_config_oem_get,
+	.config_oem_set = scmi_clock_config_oem_set,
 };
 
 static int scmi_clk_rate_notify(const struct scmi_protocol_handle *ph,
diff --git a/include/linux/scmi_protocol.h b/include/linux/scmi_protocol.h
index d11ca4286d57..e09ac428fa1b 100644
--- a/include/linux/scmi_protocol.h
+++ b/include/linux/scmi_protocol.h
@@ -81,6 +81,8 @@ struct scmi_protocol_handle;
  * @enable: enables the specified clock
  * @disable: disables the specified clock
  * @state_get: get the status of the specified clock
+ * @config_oem_get: get the value of an OEM specific clock config
+ * @config_oem_set: set the value of an OEM specific clock config
  */
 struct scmi_clk_proto_ops {
 	int (*count_get)(const struct scmi_protocol_handle *ph);
@@ -97,6 +99,11 @@ struct scmi_clk_proto_ops {
 		       bool atomic);
 	int (*state_get)(const struct scmi_protocol_handle *ph, u32 clk_id,
 			 bool *enabled, bool atomic);
+	int (*config_oem_get)(const struct scmi_protocol_handle *ph, u32 clk_id,
+			      u8 oem_type, u32 *oem_val, u32 *attributes,
+			      bool atomic);
+	int (*config_oem_set)(const struct scmi_protocol_handle *ph, u32 clk_id,
+			      u8 oem_type, u32 oem_val, bool atomic);
 };
 
 /**
-- 
cgit v1.2.3


From 647aa768281f38cb1002edb3a1f673c3d66a8d81 Mon Sep 17 00:00:00 2001
From: Christian Brauner <brauner@kernel.org>
Date: Wed, 20 Sep 2023 16:40:13 +0200
Subject: Revert "fs: add infrastructure for multigrain timestamps"

This reverts commit ffb6cf19e06334062744b7e3493f71e500964f8e.

Users reported regressions due to enabling multi-grained timestamps
unconditionally. As no clear consensus on a solution has come up and the
discussion has gone back to the drawing board revert the infrastructure
changes for. If it isn't code that's here to stay, make it go away.

Message-ID: <20230920-keine-eile-c9755b5825db@brauner>
Acked-by: Jan Kara <jack@suse.cz>
Acked-by: Jeff Layton <jlayton@kernel.org>
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 fs/inode.c         | 82 ++----------------------------------------------------
 fs/stat.c          | 41 ++-------------------------
 include/linux/fs.h | 46 ++----------------------------
 3 files changed, 7 insertions(+), 162 deletions(-)

(limited to 'include/linux')

diff --git a/fs/inode.c b/fs/inode.c
index 35fd688168c5..84bc3c76e5cc 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -2102,52 +2102,10 @@ int file_remove_privs(struct file *file)
 }
 EXPORT_SYMBOL(file_remove_privs);
 
-/**
- * current_mgtime - Return FS time (possibly fine-grained)
- * @inode: inode.
- *
- * Return the current time truncated to the time granularity supported by
- * the fs, as suitable for a ctime/mtime change. If the ctime is flagged
- * as having been QUERIED, get a fine-grained timestamp.
- */
-struct timespec64 current_mgtime(struct inode *inode)
-{
-	struct timespec64 now, ctime;
-	atomic_long_t *pnsec = (atomic_long_t *)&inode->__i_ctime.tv_nsec;
-	long nsec = atomic_long_read(pnsec);
-
-	if (nsec & I_CTIME_QUERIED) {
-		ktime_get_real_ts64(&now);
-		return timestamp_truncate(now, inode);
-	}
-
-	ktime_get_coarse_real_ts64(&now);
-	now = timestamp_truncate(now, inode);
-
-	/*
-	 * If we've recently fetched a fine-grained timestamp
-	 * then the coarse-grained one may still be earlier than the
-	 * existing ctime. Just keep the existing value if so.
-	 */
-	ctime = inode_get_ctime(inode);
-	if (timespec64_compare(&ctime, &now) > 0)
-		now = ctime;
-
-	return now;
-}
-EXPORT_SYMBOL(current_mgtime);
-
-static struct timespec64 current_ctime(struct inode *inode)
-{
-	if (is_mgtime(inode))
-		return current_mgtime(inode);
-	return current_time(inode);
-}
-
 static int inode_needs_update_time(struct inode *inode)
 {
 	int sync_it = 0;
-	struct timespec64 now = current_ctime(inode);
+	struct timespec64 now = current_time(inode);
 	struct timespec64 ctime;
 
 	/* First try to exhaust all avenues to not sync */
@@ -2578,43 +2536,9 @@ EXPORT_SYMBOL(current_time);
  */
 struct timespec64 inode_set_ctime_current(struct inode *inode)
 {
-	struct timespec64 now;
-	struct timespec64 ctime;
-
-	ctime.tv_nsec = READ_ONCE(inode->__i_ctime.tv_nsec);
-	if (!(ctime.tv_nsec & I_CTIME_QUERIED)) {
-		now = current_time(inode);
+	struct timespec64 now = current_time(inode);
 
-		/* Just copy it into place if it's not multigrain */
-		if (!is_mgtime(inode)) {
-			inode_set_ctime_to_ts(inode, now);
-			return now;
-		}
-
-		/*
-		 * If we've recently updated with a fine-grained timestamp,
-		 * then the coarse-grained one may still be earlier than the
-		 * existing ctime. Just keep the existing value if so.
-		 */
-		ctime.tv_sec = inode->__i_ctime.tv_sec;
-		if (timespec64_compare(&ctime, &now) > 0)
-			return ctime;
-
-		/*
-		 * Ctime updates are usually protected by the inode_lock, but
-		 * we can still race with someone setting the QUERIED flag.
-		 * Try to swap the new nsec value into place. If it's changed
-		 * in the interim, then just go with a fine-grained timestamp.
-		 */
-		if (cmpxchg(&inode->__i_ctime.tv_nsec, ctime.tv_nsec,
-			    now.tv_nsec) != ctime.tv_nsec)
-			goto fine_grained;
-		inode->__i_ctime.tv_sec = now.tv_sec;
-		return now;
-	}
-fine_grained:
-	ktime_get_real_ts64(&now);
-	inode_set_ctime_to_ts(inode, timestamp_truncate(now, inode));
+	inode_set_ctime(inode, now.tv_sec, now.tv_nsec);
 	return now;
 }
 EXPORT_SYMBOL(inode_set_ctime_current);
diff --git a/fs/stat.c b/fs/stat.c
index 6e60389d6a15..d43a5cc1bfa4 100644
--- a/fs/stat.c
+++ b/fs/stat.c
@@ -26,37 +26,6 @@
 #include "internal.h"
 #include "mount.h"
 
-/**
- * fill_mg_cmtime - Fill in the mtime and ctime and flag ctime as QUERIED
- * @stat: where to store the resulting values
- * @request_mask: STATX_* values requested
- * @inode: inode from which to grab the c/mtime
- *
- * Given @inode, grab the ctime and mtime out if it and store the result
- * in @stat. When fetching the value, flag it as queried so the next write
- * will use a fine-grained timestamp.
- */
-void fill_mg_cmtime(struct kstat *stat, u32 request_mask, struct inode *inode)
-{
-	atomic_long_t *pnsec = (atomic_long_t *)&inode->__i_ctime.tv_nsec;
-
-	/* If neither time was requested, then don't report them */
-	if (!(request_mask & (STATX_CTIME|STATX_MTIME))) {
-		stat->result_mask &= ~(STATX_CTIME|STATX_MTIME);
-		return;
-	}
-
-	stat->mtime = inode->i_mtime;
-	stat->ctime.tv_sec = inode->__i_ctime.tv_sec;
-	/*
-	 * Atomically set the QUERIED flag and fetch the new value with
-	 * the flag masked off.
-	 */
-	stat->ctime.tv_nsec = atomic_long_fetch_or(I_CTIME_QUERIED, pnsec) &
-					~I_CTIME_QUERIED;
-}
-EXPORT_SYMBOL(fill_mg_cmtime);
-
 /**
  * generic_fillattr - Fill in the basic attributes from the inode struct
  * @idmap:		idmap of the mount the inode was found from
@@ -89,14 +58,8 @@ void generic_fillattr(struct mnt_idmap *idmap, u32 request_mask,
 	stat->rdev = inode->i_rdev;
 	stat->size = i_size_read(inode);
 	stat->atime = inode->i_atime;
-
-	if (is_mgtime(inode)) {
-		fill_mg_cmtime(stat, request_mask, inode);
-	} else {
-		stat->mtime = inode->i_mtime;
-		stat->ctime = inode_get_ctime(inode);
-	}
-
+	stat->mtime = inode->i_mtime;
+	stat->ctime = inode_get_ctime(inode);
 	stat->blksize = i_blocksize(inode);
 	stat->blocks = inode->i_blocks;
 
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 4aeb3fa11927..b528f063e8ff 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1508,47 +1508,18 @@ static inline bool fsuidgid_has_mapping(struct super_block *sb,
 	       kgid_has_mapping(fs_userns, kgid);
 }
 
-struct timespec64 current_mgtime(struct inode *inode);
 struct timespec64 current_time(struct inode *inode);
 struct timespec64 inode_set_ctime_current(struct inode *inode);
 
-/*
- * Multigrain timestamps
- *
- * Conditionally use fine-grained ctime and mtime timestamps when there
- * are users actively observing them via getattr. The primary use-case
- * for this is NFS clients that use the ctime to distinguish between
- * different states of the file, and that are often fooled by multiple
- * operations that occur in the same coarse-grained timer tick.
- *
- * The kernel always keeps normalized struct timespec64 values in the ctime,
- * which means that only the first 30 bits of the value are used. Use the
- * 31st bit of the ctime's tv_nsec field as a flag to indicate that the value
- * has been queried since it was last updated.
- */
-#define I_CTIME_QUERIED		(1L<<30)
-
 /**
  * inode_get_ctime - fetch the current ctime from the inode
  * @inode: inode from which to fetch ctime
  *
- * Grab the current ctime tv_nsec field from the inode, mask off the
- * I_CTIME_QUERIED flag and return it. This is mostly intended for use by
- * internal consumers of the ctime that aren't concerned with ensuring a
- * fine-grained update on the next change (e.g. when preparing to store
- * the value in the backing store for later retrieval).
- *
- * This is safe to call regardless of whether the underlying filesystem
- * is using multigrain timestamps.
+ * Grab the current ctime from the inode and return it.
  */
 static inline struct timespec64 inode_get_ctime(const struct inode *inode)
 {
-	struct timespec64 ctime;
-
-	ctime.tv_sec = inode->__i_ctime.tv_sec;
-	ctime.tv_nsec = inode->__i_ctime.tv_nsec & ~I_CTIME_QUERIED;
-
-	return ctime;
+	return inode->__i_ctime;
 }
 
 /**
@@ -2334,7 +2305,6 @@ struct file_system_type {
 #define FS_USERNS_MOUNT		8	/* Can be mounted by userns root */
 #define FS_DISALLOW_NOTIFY_PERM	16	/* Disable fanotify permission events */
 #define FS_ALLOW_IDMAP         32      /* FS has been updated to handle vfs idmappings. */
-#define FS_MGTIME		64	/* FS uses multigrain timestamps */
 #define FS_RENAME_DOES_D_MOVE	32768	/* FS will handle d_move() during rename() internally. */
 	int (*init_fs_context)(struct fs_context *);
 	const struct fs_parameter_spec *parameters;
@@ -2358,17 +2328,6 @@ struct file_system_type {
 
 #define MODULE_ALIAS_FS(NAME) MODULE_ALIAS("fs-" NAME)
 
-/**
- * is_mgtime: is this inode using multigrain timestamps
- * @inode: inode to test for multigrain timestamps
- *
- * Return true if the inode uses multigrain timestamps, false otherwise.
- */
-static inline bool is_mgtime(const struct inode *inode)
-{
-	return inode->i_sb->s_type->fs_flags & FS_MGTIME;
-}
-
 extern struct dentry *mount_bdev(struct file_system_type *fs_type,
 	int flags, const char *dev_name, void *data,
 	int (*fill_super)(struct super_block *, void *, int));
@@ -3054,7 +3013,6 @@ extern void page_put_link(void *);
 extern int page_symlink(struct inode *inode, const char *symname, int len);
 extern const struct inode_operations page_symlink_inode_operations;
 extern void kfree_link(void *);
-void fill_mg_cmtime(struct kstat *stat, u32 request_mask, struct inode *inode);
 void generic_fillattr(struct mnt_idmap *, u32, struct inode *, struct kstat *);
 void generic_fill_statx_attr(struct inode *inode, struct kstat *stat);
 extern int vfs_getattr_nosec(const struct path *, struct kstat *, u32, unsigned int);
-- 
cgit v1.2.3


From 41b43b6c6e30a832c790b010a06772e793bca193 Mon Sep 17 00:00:00 2001
From: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Date: Wed, 20 Sep 2023 12:46:27 +0200
Subject: locking/seqlock: Do the lockdep annotation before locking in
 do_write_seqcount_begin_nested()

It was brought up by Tetsuo that the following sequence:

   write_seqlock_irqsave()
   printk_deferred_enter()

could lead to a deadlock if the lockdep annotation within
write_seqlock_irqsave() triggers.

The problem is that the sequence counter is incremented before the lockdep
annotation is performed. The lockdep splat would then attempt to invoke
printk() but the reader side, of the same seqcount, could have a
tty_port::lock acquired waiting for the sequence number to become even again.

The other lockdep annotations come before the actual locking because "we
want to see the locking error before it happens". There is no reason why
seqcount should be different here.

Do the lockdep annotation first then perform the locking operation (the
sequence increment).

Fixes: 1ca7d67cf5d5a ("seqcount: Add lockdep functionality to seqcount/seqlock structures")
Reported-by: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Link: https://lore.kernel.org/r/20230920104627._DTHgPyA@linutronix.de

Closes: https://lore.kernel.org/20230621130641.-5iueY1I@linutronix.de
---
 include/linux/seqlock.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/seqlock.h b/include/linux/seqlock.h
index 987a59d977c5..e9bd2f65d7f4 100644
--- a/include/linux/seqlock.h
+++ b/include/linux/seqlock.h
@@ -512,8 +512,8 @@ do {									\
 
 static inline void do_write_seqcount_begin_nested(seqcount_t *s, int subclass)
 {
-	do_raw_write_seqcount_begin(s);
 	seqcount_acquire(&s->dep_map, subclass, 0, _RET_IP_);
+	do_raw_write_seqcount_begin(s);
 }
 
 /**
-- 
cgit v1.2.3


From 87c3a5893e865739ce78aa7192d36011022e0af7 Mon Sep 17 00:00:00 2001
From: Finn Thain <fthain@linux-m68k.org>
Date: Fri, 15 Sep 2023 15:47:11 +1000
Subject: sched/core: Optimize in_task() and in_interrupt() a bit

Except on x86, preempt_count is always accessed with READ_ONCE().
Repeated invocations in macros like irq_count() produce repeated loads.
These redundant instructions appear in various fast paths. In the one
shown below, for example, irq_count() is evaluated during kernel entry
if !tick_nohz_full_cpu(smp_processor_id()).

0001ed0a <irq_enter_rcu>:
   1ed0a:       4e56 0000       linkw %fp,#0
   1ed0e:       200f            movel %sp,%d0
   1ed10:       0280 ffff e000  andil #-8192,%d0
   1ed16:       2040            moveal %d0,%a0
   1ed18:       2028 0008       movel %a0@(8),%d0
   1ed1c:       0680 0001 0000  addil #65536,%d0
   1ed22:       2140 0008       movel %d0,%a0@(8)
   1ed26:       082a 0001 000f  btst #1,%a2@(15)
   1ed2c:       670c            beqs 1ed3a <irq_enter_rcu+0x30>
   1ed2e:       2028 0008       movel %a0@(8),%d0
   1ed32:       2028 0008       movel %a0@(8),%d0
   1ed36:       2028 0008       movel %a0@(8),%d0
   1ed3a:       4e5e            unlk %fp
   1ed3c:       4e75            rts

This patch doesn't prevent the pointless btst and beqs instructions
above, but it does eliminate 2 of the 3 pointless move instructions
here and elsewhere.

On x86, preempt_count is per-cpu data and the problem does not arise
presumably because the compiler is free to optimize more effectively.

This patch was tested on m68k and x86. I was expecting no changes
to object code for x86 and mostly that's what I saw. However, there
were a few places where code generation was perturbed for some reason.

The performance issue addressed here is minor on uniprocessor m68k. I
got a 0.01% improvement from this patch for a simple "find /sys -false"
benchmark. For architectures and workloads susceptible to cache line bounce
the improvement is expected to be larger. The only SMP architecture I have
is x86, and as x86 unaffected I have not done any further measurements.

Fixes: 15115830c887 ("preempt: Cleanup the macro maze a bit")
Signed-off-by: Finn Thain <fthain@linux-m68k.org>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Link: https://lore.kernel.org/r/0a403120a682a525e6db2d81d1a3ffcc137c3742.1694756831.git.fthain@linux-m68k.org
---
 include/linux/preempt.h | 15 +++++++++++++--
 1 file changed, 13 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/preempt.h b/include/linux/preempt.h
index 1424670df161..9aa6358a1a16 100644
--- a/include/linux/preempt.h
+++ b/include/linux/preempt.h
@@ -99,14 +99,21 @@ static __always_inline unsigned char interrupt_context_level(void)
 	return level;
 }
 
+/*
+ * These macro definitions avoid redundant invocations of preempt_count()
+ * because such invocations would result in redundant loads given that
+ * preempt_count() is commonly implemented with READ_ONCE().
+ */
+
 #define nmi_count()	(preempt_count() & NMI_MASK)
 #define hardirq_count()	(preempt_count() & HARDIRQ_MASK)
 #ifdef CONFIG_PREEMPT_RT
 # define softirq_count()	(current->softirq_disable_cnt & SOFTIRQ_MASK)
+# define irq_count()		((preempt_count() & (NMI_MASK | HARDIRQ_MASK)) | softirq_count())
 #else
 # define softirq_count()	(preempt_count() & SOFTIRQ_MASK)
+# define irq_count()		(preempt_count() & (NMI_MASK | HARDIRQ_MASK | SOFTIRQ_MASK))
 #endif
-#define irq_count()	(nmi_count() | hardirq_count() | softirq_count())
 
 /*
  * Macros to retrieve the current execution context:
@@ -119,7 +126,11 @@ static __always_inline unsigned char interrupt_context_level(void)
 #define in_nmi()		(nmi_count())
 #define in_hardirq()		(hardirq_count())
 #define in_serving_softirq()	(softirq_count() & SOFTIRQ_OFFSET)
-#define in_task()		(!(in_nmi() | in_hardirq() | in_serving_softirq()))
+#ifdef CONFIG_PREEMPT_RT
+# define in_task()		(!((preempt_count() & (NMI_MASK | HARDIRQ_MASK)) | in_serving_softirq()))
+#else
+# define in_task()		(!(preempt_count() & (NMI_MASK | HARDIRQ_MASK | SOFTIRQ_OFFSET)))
+#endif
 
 /*
  * The following macros are deprecated and should not be used in new code:
-- 
cgit v1.2.3


From 3ba78da711940ce07c39c4cdd1f4ad284067a42d Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@kernel.org>
Date: Sun, 6 Jun 2021 13:27:15 +0200
Subject: sched/headers: Add header guard to <linux/sched/deadline.h>

It's the only non-trivial header in include/linux/sched/ missing a header guard.

Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/sched/deadline.h | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/sched/deadline.h b/include/linux/sched/deadline.h
index 7c83d4d5a971..df3aca89d4f5 100644
--- a/include/linux/sched/deadline.h
+++ b/include/linux/sched/deadline.h
@@ -1,4 +1,6 @@
 /* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _LINUX_SCHED_DEADLINE_H
+#define _LINUX_SCHED_DEADLINE_H
 
 /*
  * SCHED_DEADLINE tasks has negative priorities, reflecting
@@ -34,3 +36,5 @@ extern void dl_add_task_root_domain(struct task_struct *p);
 extern void dl_clear_root_domain(struct root_domain *rd);
 
 #endif /* CONFIG_SMP */
+
+#endif /* _LINUX_SCHED_DEADLINE_H */
-- 
cgit v1.2.3


From 6eddb116dd830436afbd922568292867de6c8b9e Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@kernel.org>
Date: Thu, 21 Sep 2023 11:24:17 +0200
Subject: sched/headers: Standardize the <linux/sched/vhost_task.h> header
 guard name

Use the same _LINUX_SCHED_ prefix nomenclature as the other 29 header guards
in include/linux/sched/ do.

Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/sched/vhost_task.h | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sched/vhost_task.h b/include/linux/sched/vhost_task.h
index 837a23624a66..bc60243d43b3 100644
--- a/include/linux/sched/vhost_task.h
+++ b/include/linux/sched/vhost_task.h
@@ -1,7 +1,6 @@
 /* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _LINUX_VHOST_TASK_H
-#define _LINUX_VHOST_TASK_H
-
+#ifndef _LINUX_SCHED_VHOST_TASK_H
+#define _LINUX_SCHED_VHOST_TASK_H
 
 struct vhost_task;
 
@@ -11,4 +10,4 @@ void vhost_task_start(struct vhost_task *vtsk);
 void vhost_task_stop(struct vhost_task *vtsk);
 void vhost_task_wake(struct vhost_task *vtsk);
 
-#endif
+#endif /* _LINUX_SCHED_VHOST_TASK_H */
-- 
cgit v1.2.3


From 0f9a1a4d234c064d8dff69cf3f3755554dd479ec Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@kernel.org>
Date: Thu, 21 Sep 2023 11:27:37 +0200
Subject: sched/headers: Standardize the <linux/sched/type.h> header guard
 #endif

Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/sched/types.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/sched/types.h b/include/linux/sched/types.h
index 3c3e049224ae..969aaf5ef9d6 100644
--- a/include/linux/sched/types.h
+++ b/include/linux/sched/types.h
@@ -20,4 +20,4 @@ struct task_cputime {
 	unsigned long long		sum_exec_runtime;
 };
 
-#endif
+#endif /* _LINUX_SCHED_TYPES_H */
-- 
cgit v1.2.3


From 1632d47fae2f2d229dd432854c4443ebb0bb27a4 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@kernel.org>
Date: Thu, 21 Sep 2023 11:28:48 +0200
Subject: sched/headers: Standardize the <linux/sched/smt.h> header guard
 #endif

Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/sched/smt.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/sched/smt.h b/include/linux/sched/smt.h
index 59d3736c454c..fb1e295e7e63 100644
--- a/include/linux/sched/smt.h
+++ b/include/linux/sched/smt.h
@@ -17,4 +17,4 @@ static inline bool sched_smt_active(void) { return false; }
 
 void arch_smt_update(void);
 
-#endif
+#endif /* _LINUX_SCHED_SMT_H */
-- 
cgit v1.2.3


From 0df7cd3c13e44d01f9f28e29cbce74e2931b00fe Mon Sep 17 00:00:00 2001
From: Arseniy Krasnov <avkrasnov@salutedevices.com>
Date: Sat, 16 Sep 2023 16:09:15 +0300
Subject: vsock/virtio/vhost: read data from non-linear skb

This is preparation patch for MSG_ZEROCOPY support. It adds handling of
non-linear skbs by replacing direct calls of 'memcpy_to_msg()' with
'skb_copy_datagram_iter()'. Main advantage of the second one is that it
can handle paged part of the skb by using 'kmap()' on each page, but if
there are no pages in the skb, it behaves like simple copying to iov
iterator. This patch also adds new field to the control block of skb -
this value shows current offset in the skb to read next portion of data
(it doesn't matter linear it or not). Idea behind this field is that
'skb_copy_datagram_iter()' handles both types of skb internally - it
just needs an offset from which to copy data from the given skb. This
offset is incremented on each read from skb. This approach allows to
simplify handling of both linear and non-linear skbs, because for
linear skb we need to call 'skb_pull()' after reading data from it,
while in non-linear case we need to update 'data_len'.

Signed-off-by: Arseniy Krasnov <avkrasnov@salutedevices.com>
Reviewed-by: Stefano Garzarella <sgarzare@redhat.com>
Acked-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
 drivers/vhost/vsock.c                   | 14 +++++++++-----
 include/linux/virtio_vsock.h            |  1 +
 net/vmw_vsock/virtio_transport_common.c | 32 +++++++++++++++++++-------------
 3 files changed, 29 insertions(+), 18 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/vhost/vsock.c b/drivers/vhost/vsock.c
index 817d377a3f36..83711aad855c 100644
--- a/drivers/vhost/vsock.c
+++ b/drivers/vhost/vsock.c
@@ -114,6 +114,7 @@ vhost_transport_do_send_pkt(struct vhost_vsock *vsock,
 		struct sk_buff *skb;
 		unsigned out, in;
 		size_t nbytes;
+		u32 offset;
 		int head;
 
 		skb = virtio_vsock_skb_dequeue(&vsock->send_pkt_queue);
@@ -156,7 +157,8 @@ vhost_transport_do_send_pkt(struct vhost_vsock *vsock,
 		}
 
 		iov_iter_init(&iov_iter, ITER_DEST, &vq->iov[out], in, iov_len);
-		payload_len = skb->len;
+		offset = VIRTIO_VSOCK_SKB_CB(skb)->offset;
+		payload_len = skb->len - offset;
 		hdr = virtio_vsock_hdr(skb);
 
 		/* If the packet is greater than the space available in the
@@ -197,8 +199,10 @@ vhost_transport_do_send_pkt(struct vhost_vsock *vsock,
 			break;
 		}
 
-		nbytes = copy_to_iter(skb->data, payload_len, &iov_iter);
-		if (nbytes != payload_len) {
+		if (skb_copy_datagram_iter(skb,
+					   offset,
+					   &iov_iter,
+					   payload_len)) {
 			kfree_skb(skb);
 			vq_err(vq, "Faulted on copying pkt buf\n");
 			break;
@@ -212,13 +216,13 @@ vhost_transport_do_send_pkt(struct vhost_vsock *vsock,
 		vhost_add_used(vq, head, sizeof(*hdr) + payload_len);
 		added = true;
 
-		skb_pull(skb, payload_len);
+		VIRTIO_VSOCK_SKB_CB(skb)->offset += payload_len;
 		total_len += payload_len;
 
 		/* If we didn't send all the payload we can requeue the packet
 		 * to send it with the next available buffer.
 		 */
-		if (skb->len > 0) {
+		if (VIRTIO_VSOCK_SKB_CB(skb)->offset < skb->len) {
 			hdr->flags |= cpu_to_le32(flags_to_restore);
 
 			/* We are queueing the same skb to handle
diff --git a/include/linux/virtio_vsock.h b/include/linux/virtio_vsock.h
index c58453699ee9..a91fbdf233e4 100644
--- a/include/linux/virtio_vsock.h
+++ b/include/linux/virtio_vsock.h
@@ -12,6 +12,7 @@
 struct virtio_vsock_skb_cb {
 	bool reply;
 	bool tap_delivered;
+	u32 offset;
 };
 
 #define VIRTIO_VSOCK_SKB_CB(skb) ((struct virtio_vsock_skb_cb *)((skb)->cb))
diff --git a/net/vmw_vsock/virtio_transport_common.c b/net/vmw_vsock/virtio_transport_common.c
index 352d042b130b..3e08d52a9355 100644
--- a/net/vmw_vsock/virtio_transport_common.c
+++ b/net/vmw_vsock/virtio_transport_common.c
@@ -364,9 +364,10 @@ virtio_transport_stream_do_peek(struct vsock_sock *vsk,
 		spin_unlock_bh(&vvs->rx_lock);
 
 		/* sk_lock is held by caller so no one else can dequeue.
-		 * Unlock rx_lock since memcpy_to_msg() may sleep.
+		 * Unlock rx_lock since skb_copy_datagram_iter() may sleep.
 		 */
-		err = memcpy_to_msg(msg, skb->data, bytes);
+		err = skb_copy_datagram_iter(skb, VIRTIO_VSOCK_SKB_CB(skb)->offset,
+					     &msg->msg_iter, bytes);
 		if (err)
 			goto out;
 
@@ -410,25 +411,27 @@ virtio_transport_stream_do_dequeue(struct vsock_sock *vsk,
 	while (total < len && !skb_queue_empty(&vvs->rx_queue)) {
 		skb = skb_peek(&vvs->rx_queue);
 
-		bytes = len - total;
-		if (bytes > skb->len)
-			bytes = skb->len;
+		bytes = min_t(size_t, len - total,
+			      skb->len - VIRTIO_VSOCK_SKB_CB(skb)->offset);
 
 		/* sk_lock is held by caller so no one else can dequeue.
-		 * Unlock rx_lock since memcpy_to_msg() may sleep.
+		 * Unlock rx_lock since skb_copy_datagram_iter() may sleep.
 		 */
 		spin_unlock_bh(&vvs->rx_lock);
 
-		err = memcpy_to_msg(msg, skb->data, bytes);
+		err = skb_copy_datagram_iter(skb,
+					     VIRTIO_VSOCK_SKB_CB(skb)->offset,
+					     &msg->msg_iter, bytes);
 		if (err)
 			goto out;
 
 		spin_lock_bh(&vvs->rx_lock);
 
 		total += bytes;
-		skb_pull(skb, bytes);
 
-		if (skb->len == 0) {
+		VIRTIO_VSOCK_SKB_CB(skb)->offset += bytes;
+
+		if (skb->len == VIRTIO_VSOCK_SKB_CB(skb)->offset) {
 			u32 pkt_len = le32_to_cpu(virtio_vsock_hdr(skb)->len);
 
 			virtio_transport_dec_rx_pkt(vvs, pkt_len);
@@ -492,9 +495,10 @@ virtio_transport_seqpacket_do_peek(struct vsock_sock *vsk,
 			spin_unlock_bh(&vvs->rx_lock);
 
 			/* sk_lock is held by caller so no one else can dequeue.
-			 * Unlock rx_lock since memcpy_to_msg() may sleep.
+			 * Unlock rx_lock since skb_copy_datagram_iter() may sleep.
 			 */
-			err = memcpy_to_msg(msg, skb->data, bytes);
+			err = skb_copy_datagram_iter(skb, VIRTIO_VSOCK_SKB_CB(skb)->offset,
+						     &msg->msg_iter, bytes);
 			if (err)
 				return err;
 
@@ -553,11 +557,13 @@ static int virtio_transport_seqpacket_do_dequeue(struct vsock_sock *vsk,
 				int err;
 
 				/* sk_lock is held by caller so no one else can dequeue.
-				 * Unlock rx_lock since memcpy_to_msg() may sleep.
+				 * Unlock rx_lock since skb_copy_datagram_iter() may sleep.
 				 */
 				spin_unlock_bh(&vvs->rx_lock);
 
-				err = memcpy_to_msg(msg, skb->data, bytes_to_copy);
+				err = skb_copy_datagram_iter(skb, 0,
+							     &msg->msg_iter,
+							     bytes_to_copy);
 				if (err) {
 					/* Copy of message failed. Rest of
 					 * fragments will be freed without copy.
-- 
cgit v1.2.3


From 581512a6dc939ef122e49336626ae159f3b8a345 Mon Sep 17 00:00:00 2001
From: Arseniy Krasnov <avkrasnov@salutedevices.com>
Date: Sat, 16 Sep 2023 16:09:18 +0300
Subject: vsock/virtio: MSG_ZEROCOPY flag support

This adds handling of MSG_ZEROCOPY flag on transmission path:

1) If this flag is set and zerocopy transmission is possible (enabled
   in socket options and transport allows zerocopy), then non-linear
   skb will be created and filled with the pages of user's buffer.
   Pages of user's buffer are locked in memory by 'get_user_pages()'.
2) Replaces way of skb owning: instead of 'skb_set_owner_sk_safe()' it
   calls 'skb_set_owner_w()'. Reason of this change is that
   '__zerocopy_sg_from_iter()' increments 'sk_wmem_alloc' of socket, so
   to decrease this field correctly, proper skb destructor is needed:
   'sock_wfree()'. This destructor is set by 'skb_set_owner_w()'.
3) Adds new callback to 'struct virtio_transport': 'can_msgzerocopy'.
   If this callback is set, then transport needs extra check to be able
   to send provided number of buffers in zerocopy mode. Currently, the
   only transport that needs this callback set is virtio, because this
   transport adds new buffers to the virtio queue and we need to check,
   that number of these buffers is less than size of the queue (it is
   required by virtio spec). vhost and loopback transports don't need
   this check.

Signed-off-by: Arseniy Krasnov <avkrasnov@salutedevices.com>
Reviewed-by: Stefano Garzarella <sgarzare@redhat.com>
Acked-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
 include/linux/virtio_vsock.h                       |   9 +
 .../trace/events/vsock_virtio_transport_common.h   |  12 +-
 net/vmw_vsock/virtio_transport.c                   |  32 +++
 net/vmw_vsock/virtio_transport_common.c            | 250 ++++++++++++++++-----
 4 files changed, 241 insertions(+), 62 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/virtio_vsock.h b/include/linux/virtio_vsock.h
index a91fbdf233e4..ebb3ce63d64d 100644
--- a/include/linux/virtio_vsock.h
+++ b/include/linux/virtio_vsock.h
@@ -160,6 +160,15 @@ struct virtio_transport {
 
 	/* Takes ownership of the packet */
 	int (*send_pkt)(struct sk_buff *skb);
+
+	/* Used in MSG_ZEROCOPY mode. Checks, that provided data
+	 * (number of buffers) could be transmitted with zerocopy
+	 * mode. If this callback is not implemented for the current
+	 * transport - this means that this transport doesn't need
+	 * extra checks and can perform zerocopy transmission by
+	 * default.
+	 */
+	bool (*can_msgzerocopy)(int bufs_num);
 };
 
 ssize_t
diff --git a/include/trace/events/vsock_virtio_transport_common.h b/include/trace/events/vsock_virtio_transport_common.h
index d0b3f0ea9ba1..f1ebe36787c3 100644
--- a/include/trace/events/vsock_virtio_transport_common.h
+++ b/include/trace/events/vsock_virtio_transport_common.h
@@ -43,7 +43,8 @@ TRACE_EVENT(virtio_transport_alloc_pkt,
 		 __u32 len,
 		 __u16 type,
 		 __u16 op,
-		 __u32 flags
+		 __u32 flags,
+		 bool zcopy
 	),
 	TP_ARGS(
 		src_cid, src_port,
@@ -51,7 +52,8 @@ TRACE_EVENT(virtio_transport_alloc_pkt,
 		len,
 		type,
 		op,
-		flags
+		flags,
+		zcopy
 	),
 	TP_STRUCT__entry(
 		__field(__u32, src_cid)
@@ -62,6 +64,7 @@ TRACE_EVENT(virtio_transport_alloc_pkt,
 		__field(__u16, type)
 		__field(__u16, op)
 		__field(__u32, flags)
+		__field(bool, zcopy)
 	),
 	TP_fast_assign(
 		__entry->src_cid = src_cid;
@@ -72,14 +75,15 @@ TRACE_EVENT(virtio_transport_alloc_pkt,
 		__entry->type = type;
 		__entry->op = op;
 		__entry->flags = flags;
+		__entry->zcopy = zcopy;
 	),
-	TP_printk("%u:%u -> %u:%u len=%u type=%s op=%s flags=%#x",
+	TP_printk("%u:%u -> %u:%u len=%u type=%s op=%s flags=%#x zcopy=%s",
 		  __entry->src_cid, __entry->src_port,
 		  __entry->dst_cid, __entry->dst_port,
 		  __entry->len,
 		  show_type(__entry->type),
 		  show_op(__entry->op),
-		  __entry->flags)
+		  __entry->flags, __entry->zcopy ? "true" : "false")
 );
 
 TRACE_EVENT(virtio_transport_recv_pkt,
diff --git a/net/vmw_vsock/virtio_transport.c b/net/vmw_vsock/virtio_transport.c
index 73d730156349..09ba3128e759 100644
--- a/net/vmw_vsock/virtio_transport.c
+++ b/net/vmw_vsock/virtio_transport.c
@@ -455,6 +455,37 @@ static void virtio_vsock_rx_done(struct virtqueue *vq)
 	queue_work(virtio_vsock_workqueue, &vsock->rx_work);
 }
 
+static bool virtio_transport_can_msgzerocopy(int bufs_num)
+{
+	struct virtio_vsock *vsock;
+	bool res = false;
+
+	rcu_read_lock();
+
+	vsock = rcu_dereference(the_virtio_vsock);
+	if (vsock) {
+		struct virtqueue *vq = vsock->vqs[VSOCK_VQ_TX];
+
+		/* Check that tx queue is large enough to keep whole
+		 * data to send. This is needed, because when there is
+		 * not enough free space in the queue, current skb to
+		 * send will be reinserted to the head of tx list of
+		 * the socket to retry transmission later, so if skb
+		 * is bigger than whole queue, it will be reinserted
+		 * again and again, thus blocking other skbs to be sent.
+		 * Each page of the user provided buffer will be added
+		 * as a single buffer to the tx virtqueue, so compare
+		 * number of pages against maximum capacity of the queue.
+		 */
+		if (bufs_num <= vq->num_max)
+			res = true;
+	}
+
+	rcu_read_unlock();
+
+	return res;
+}
+
 static bool virtio_transport_seqpacket_allow(u32 remote_cid);
 
 static struct virtio_transport virtio_transport = {
@@ -504,6 +535,7 @@ static struct virtio_transport virtio_transport = {
 	},
 
 	.send_pkt = virtio_transport_send_pkt,
+	.can_msgzerocopy = virtio_transport_can_msgzerocopy,
 };
 
 static bool virtio_transport_seqpacket_allow(u32 remote_cid)
diff --git a/net/vmw_vsock/virtio_transport_common.c b/net/vmw_vsock/virtio_transport_common.c
index 3a48e48a99ac..e22c81435ef7 100644
--- a/net/vmw_vsock/virtio_transport_common.c
+++ b/net/vmw_vsock/virtio_transport_common.c
@@ -37,73 +37,99 @@ virtio_transport_get_ops(struct vsock_sock *vsk)
 	return container_of(t, struct virtio_transport, transport);
 }
 
-/* Returns a new packet on success, otherwise returns NULL.
- *
- * If NULL is returned, errp is set to a negative errno.
- */
-static struct sk_buff *
-virtio_transport_alloc_skb(struct virtio_vsock_pkt_info *info,
-			   size_t len,
-			   u32 src_cid,
-			   u32 src_port,
-			   u32 dst_cid,
-			   u32 dst_port)
-{
-	const size_t skb_len = VIRTIO_VSOCK_SKB_HEADROOM + len;
-	struct virtio_vsock_hdr *hdr;
-	struct sk_buff *skb;
-	void *payload;
-	int err;
+static bool virtio_transport_can_zcopy(const struct virtio_transport *t_ops,
+				       struct virtio_vsock_pkt_info *info,
+				       size_t pkt_len)
+{
+	struct iov_iter *iov_iter;
 
-	skb = virtio_vsock_alloc_skb(skb_len, GFP_KERNEL);
-	if (!skb)
-		return NULL;
+	if (!info->msg)
+		return false;
 
-	hdr = virtio_vsock_hdr(skb);
-	hdr->type	= cpu_to_le16(info->type);
-	hdr->op		= cpu_to_le16(info->op);
-	hdr->src_cid	= cpu_to_le64(src_cid);
-	hdr->dst_cid	= cpu_to_le64(dst_cid);
-	hdr->src_port	= cpu_to_le32(src_port);
-	hdr->dst_port	= cpu_to_le32(dst_port);
-	hdr->flags	= cpu_to_le32(info->flags);
-	hdr->len	= cpu_to_le32(len);
+	iov_iter = &info->msg->msg_iter;
 
-	if (info->msg && len > 0) {
-		payload = skb_put(skb, len);
-		err = memcpy_from_msg(payload, info->msg, len);
-		if (err)
-			goto out;
+	if (iov_iter->iov_offset)
+		return false;
 
-		if (msg_data_left(info->msg) == 0 &&
-		    info->type == VIRTIO_VSOCK_TYPE_SEQPACKET) {
-			hdr->flags |= cpu_to_le32(VIRTIO_VSOCK_SEQ_EOM);
+	/* We can't send whole iov. */
+	if (iov_iter->count > pkt_len)
+		return false;
 
-			if (info->msg->msg_flags & MSG_EOR)
-				hdr->flags |= cpu_to_le32(VIRTIO_VSOCK_SEQ_EOR);
-		}
+	/* Check that transport can send data in zerocopy mode. */
+	t_ops = virtio_transport_get_ops(info->vsk);
+
+	if (t_ops->can_msgzerocopy) {
+		int pages_in_iov = iov_iter_npages(iov_iter, MAX_SKB_FRAGS);
+		int pages_to_send = min(pages_in_iov, MAX_SKB_FRAGS);
+
+		/* +1 is for packet header. */
+		return t_ops->can_msgzerocopy(pages_to_send + 1);
 	}
 
-	if (info->reply)
-		virtio_vsock_skb_set_reply(skb);
+	return true;
+}
 
-	trace_virtio_transport_alloc_pkt(src_cid, src_port,
-					 dst_cid, dst_port,
-					 len,
-					 info->type,
-					 info->op,
-					 info->flags);
+static int virtio_transport_init_zcopy_skb(struct vsock_sock *vsk,
+					   struct sk_buff *skb,
+					   struct msghdr *msg,
+					   bool zerocopy)
+{
+	struct ubuf_info *uarg;
 
-	if (info->vsk && !skb_set_owner_sk_safe(skb, sk_vsock(info->vsk))) {
-		WARN_ONCE(1, "failed to allocate skb on vsock socket with sk_refcnt == 0\n");
-		goto out;
+	if (msg->msg_ubuf) {
+		uarg = msg->msg_ubuf;
+		net_zcopy_get(uarg);
+	} else {
+		struct iov_iter *iter = &msg->msg_iter;
+		struct ubuf_info_msgzc *uarg_zc;
+
+		uarg = msg_zerocopy_realloc(sk_vsock(vsk),
+					    iter->count,
+					    NULL);
+		if (!uarg)
+			return -1;
+
+		uarg_zc = uarg_to_msgzc(uarg);
+		uarg_zc->zerocopy = zerocopy ? 1 : 0;
 	}
 
-	return skb;
+	skb_zcopy_init(skb, uarg);
 
-out:
-	kfree_skb(skb);
-	return NULL;
+	return 0;
+}
+
+static int virtio_transport_fill_skb(struct sk_buff *skb,
+				     struct virtio_vsock_pkt_info *info,
+				     size_t len,
+				     bool zcopy)
+{
+	if (zcopy)
+		return __zerocopy_sg_from_iter(info->msg, NULL, skb,
+					       &info->msg->msg_iter,
+					       len);
+
+	return memcpy_from_msg(skb_put(skb, len), info->msg, len);
+}
+
+static void virtio_transport_init_hdr(struct sk_buff *skb,
+				      struct virtio_vsock_pkt_info *info,
+				      size_t payload_len,
+				      u32 src_cid,
+				      u32 src_port,
+				      u32 dst_cid,
+				      u32 dst_port)
+{
+	struct virtio_vsock_hdr *hdr;
+
+	hdr = virtio_vsock_hdr(skb);
+	hdr->type	= cpu_to_le16(info->type);
+	hdr->op		= cpu_to_le16(info->op);
+	hdr->src_cid	= cpu_to_le64(src_cid);
+	hdr->dst_cid	= cpu_to_le64(dst_cid);
+	hdr->src_port	= cpu_to_le32(src_port);
+	hdr->dst_port	= cpu_to_le32(dst_port);
+	hdr->flags	= cpu_to_le32(info->flags);
+	hdr->len	= cpu_to_le32(payload_len);
 }
 
 static void virtio_transport_copy_nonlinear_skb(const struct sk_buff *skb,
@@ -214,6 +240,82 @@ static u16 virtio_transport_get_type(struct sock *sk)
 		return VIRTIO_VSOCK_TYPE_SEQPACKET;
 }
 
+/* Returns new sk_buff on success, otherwise returns NULL. */
+static struct sk_buff *virtio_transport_alloc_skb(struct virtio_vsock_pkt_info *info,
+						  size_t payload_len,
+						  bool zcopy,
+						  u32 src_cid,
+						  u32 src_port,
+						  u32 dst_cid,
+						  u32 dst_port)
+{
+	struct vsock_sock *vsk;
+	struct sk_buff *skb;
+	size_t skb_len;
+
+	skb_len = VIRTIO_VSOCK_SKB_HEADROOM;
+
+	if (!zcopy)
+		skb_len += payload_len;
+
+	skb = virtio_vsock_alloc_skb(skb_len, GFP_KERNEL);
+	if (!skb)
+		return NULL;
+
+	virtio_transport_init_hdr(skb, info, payload_len, src_cid, src_port,
+				  dst_cid, dst_port);
+
+	vsk = info->vsk;
+
+	/* If 'vsk' != NULL then payload is always present, so we
+	 * will never call '__zerocopy_sg_from_iter()' below without
+	 * setting skb owner in 'skb_set_owner_w()'. The only case
+	 * when 'vsk' == NULL is VIRTIO_VSOCK_OP_RST control message
+	 * without payload.
+	 */
+	WARN_ON_ONCE(!(vsk && (info->msg && payload_len)) && zcopy);
+
+	/* Set owner here, because '__zerocopy_sg_from_iter()' uses
+	 * owner of skb without check to update 'sk_wmem_alloc'.
+	 */
+	if (vsk)
+		skb_set_owner_w(skb, sk_vsock(vsk));
+
+	if (info->msg && payload_len > 0) {
+		int err;
+
+		err = virtio_transport_fill_skb(skb, info, payload_len, zcopy);
+		if (err)
+			goto out;
+
+		if (msg_data_left(info->msg) == 0 &&
+		    info->type == VIRTIO_VSOCK_TYPE_SEQPACKET) {
+			struct virtio_vsock_hdr *hdr = virtio_vsock_hdr(skb);
+
+			hdr->flags |= cpu_to_le32(VIRTIO_VSOCK_SEQ_EOM);
+
+			if (info->msg->msg_flags & MSG_EOR)
+				hdr->flags |= cpu_to_le32(VIRTIO_VSOCK_SEQ_EOR);
+		}
+	}
+
+	if (info->reply)
+		virtio_vsock_skb_set_reply(skb);
+
+	trace_virtio_transport_alloc_pkt(src_cid, src_port,
+					 dst_cid, dst_port,
+					 payload_len,
+					 info->type,
+					 info->op,
+					 info->flags,
+					 zcopy);
+
+	return skb;
+out:
+	kfree_skb(skb);
+	return NULL;
+}
+
 /* This function can only be used on connecting/connected sockets,
  * since a socket assigned to a transport is required.
  *
@@ -222,10 +324,12 @@ static u16 virtio_transport_get_type(struct sock *sk)
 static int virtio_transport_send_pkt_info(struct vsock_sock *vsk,
 					  struct virtio_vsock_pkt_info *info)
 {
+	u32 max_skb_len = VIRTIO_VSOCK_MAX_PKT_BUF_SIZE;
 	u32 src_cid, src_port, dst_cid, dst_port;
 	const struct virtio_transport *t_ops;
 	struct virtio_vsock_sock *vvs;
 	u32 pkt_len = info->pkt_len;
+	bool can_zcopy = false;
 	u32 rest_len;
 	int ret;
 
@@ -254,15 +358,30 @@ static int virtio_transport_send_pkt_info(struct vsock_sock *vsk,
 	if (pkt_len == 0 && info->op == VIRTIO_VSOCK_OP_RW)
 		return pkt_len;
 
+	if (info->msg) {
+		/* If zerocopy is not enabled by 'setsockopt()', we behave as
+		 * there is no MSG_ZEROCOPY flag set.
+		 */
+		if (!sock_flag(sk_vsock(vsk), SOCK_ZEROCOPY))
+			info->msg->msg_flags &= ~MSG_ZEROCOPY;
+
+		if (info->msg->msg_flags & MSG_ZEROCOPY)
+			can_zcopy = virtio_transport_can_zcopy(t_ops, info, pkt_len);
+
+		if (can_zcopy)
+			max_skb_len = min_t(u32, VIRTIO_VSOCK_MAX_PKT_BUF_SIZE,
+					    (MAX_SKB_FRAGS * PAGE_SIZE));
+	}
+
 	rest_len = pkt_len;
 
 	do {
 		struct sk_buff *skb;
 		size_t skb_len;
 
-		skb_len = min_t(u32, VIRTIO_VSOCK_MAX_PKT_BUF_SIZE, rest_len);
+		skb_len = min(max_skb_len, rest_len);
 
-		skb = virtio_transport_alloc_skb(info, skb_len,
+		skb = virtio_transport_alloc_skb(info, skb_len, can_zcopy,
 						 src_cid, src_port,
 						 dst_cid, dst_port);
 		if (!skb) {
@@ -270,6 +389,21 @@ static int virtio_transport_send_pkt_info(struct vsock_sock *vsk,
 			break;
 		}
 
+		/* We process buffer part by part, allocating skb on
+		 * each iteration. If this is last skb for this buffer
+		 * and MSG_ZEROCOPY mode is in use - we must allocate
+		 * completion for the current syscall.
+		 */
+		if (info->msg && info->msg->msg_flags & MSG_ZEROCOPY &&
+		    skb_len == rest_len && info->op == VIRTIO_VSOCK_OP_RW) {
+			if (virtio_transport_init_zcopy_skb(vsk, skb,
+							    info->msg,
+							    can_zcopy)) {
+				ret = -ENOMEM;
+				break;
+			}
+		}
+
 		virtio_transport_inc_tx_pkt(vvs, skb);
 
 		ret = t_ops->send_pkt(skb);
@@ -985,7 +1119,7 @@ static int virtio_transport_reset_no_sock(const struct virtio_transport *t,
 	if (!t)
 		return -ENOTCONN;
 
-	reply = virtio_transport_alloc_skb(&info, 0,
+	reply = virtio_transport_alloc_skb(&info, 0, false,
 					   le64_to_cpu(hdr->dst_cid),
 					   le32_to_cpu(hdr->dst_port),
 					   le64_to_cpu(hdr->src_cid),
-- 
cgit v1.2.3


From e9090e70e618cd62ab7bf2914511e5eea31a2535 Mon Sep 17 00:00:00 2001
From: Ulf Hansson <ulf.hansson@linaro.org>
Date: Fri, 25 Aug 2023 13:26:21 +0200
Subject: firmware: arm_scmi: Extend perf protocol ops to get number of domains

Similar to other protocol ops, it's useful for an scmi module driver to get
the number of supported performance domains, hence let's make this
available by adding a new perf protocol callback. Note that, a user is
being added from subsequent changes.

Signed-off-by: Ulf Hansson <ulf.hansson@linaro.org>
Link: https://lore.kernel.org/r/20230825112633.236607-2-ulf.hansson@linaro.org
Signed-off-by: Sudeep Holla <sudeep.holla@arm.com>
---
 drivers/firmware/arm_scmi/perf.c | 8 ++++++++
 include/linux/scmi_protocol.h    | 2 ++
 2 files changed, 10 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/firmware/arm_scmi/perf.c b/drivers/firmware/arm_scmi/perf.c
index c0cd556fbaae..9fb63dd44c20 100644
--- a/drivers/firmware/arm_scmi/perf.c
+++ b/drivers/firmware/arm_scmi/perf.c
@@ -423,6 +423,13 @@ scmi_perf_describe_levels_get(const struct scmi_protocol_handle *ph,
 	return ret;
 }
 
+static int scmi_perf_num_domains_get(const struct scmi_protocol_handle *ph)
+{
+	struct scmi_perf_info *pi = ph->get_priv(ph);
+
+	return pi->num_domains;
+}
+
 static int scmi_perf_msg_limits_set(const struct scmi_protocol_handle *ph,
 				    u32 domain, u32 max_perf, u32 min_perf)
 {
@@ -948,6 +955,7 @@ scmi_power_scale_get(const struct scmi_protocol_handle *ph)
 }
 
 static const struct scmi_perf_proto_ops perf_proto_ops = {
+	.num_domains_get = scmi_perf_num_domains_get,
 	.limits_set = scmi_perf_limits_set,
 	.limits_get = scmi_perf_limits_get,
 	.level_set = scmi_perf_level_set,
diff --git a/include/linux/scmi_protocol.h b/include/linux/scmi_protocol.h
index e09ac428fa1b..14791dd54706 100644
--- a/include/linux/scmi_protocol.h
+++ b/include/linux/scmi_protocol.h
@@ -110,6 +110,7 @@ struct scmi_clk_proto_ops {
  * struct scmi_perf_proto_ops - represents the various operations provided
  *	by SCMI Performance Protocol
  *
+ * @num_domains_get: gets the number of supported performance domains
  * @limits_set: sets limits on the performance level of a domain
  * @limits_get: gets limits on the performance level of a domain
  * @level_set: sets the performance level of a domain
@@ -129,6 +130,7 @@ struct scmi_clk_proto_ops {
  *	or in some other (abstract) scale
  */
 struct scmi_perf_proto_ops {
+	int (*num_domains_get)(const struct scmi_protocol_handle *ph);
 	int (*limits_set)(const struct scmi_protocol_handle *ph, u32 domain,
 			  u32 max_perf, u32 min_perf);
 	int (*limits_get)(const struct scmi_protocol_handle *ph, u32 domain,
-- 
cgit v1.2.3


From 3d99ed60721bf2e108c8fc660775766057689a92 Mon Sep 17 00:00:00 2001
From: Ulf Hansson <ulf.hansson@linaro.org>
Date: Fri, 25 Aug 2023 13:26:22 +0200
Subject: firmware: arm_scmi: Extend perf protocol ops to get information of a
 domain

Similar to other protocol ops, it's useful for an scmi module driver to get
some generic information of a performance domain. Therefore, let's add a
new callback to provide this information. The information is currently
limited to the name of the performance domain and whether the set-level
operation is supported, although this can easily be extended if we find the
need for it.

Signed-off-by: Ulf Hansson <ulf.hansson@linaro.org>
Link: https://lore.kernel.org/r/20230825112633.236607-3-ulf.hansson@linaro.org
Signed-off-by: Sudeep Holla <sudeep.holla@arm.com>
---
 drivers/firmware/arm_scmi/perf.c | 47 +++++++++++++++++++++++++---------------
 include/linux/scmi_protocol.h    |  8 +++++++
 2 files changed, 38 insertions(+), 17 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/firmware/arm_scmi/perf.c b/drivers/firmware/arm_scmi/perf.c
index 9fb63dd44c20..d20bb6b8abfa 100644
--- a/drivers/firmware/arm_scmi/perf.c
+++ b/drivers/firmware/arm_scmi/perf.c
@@ -145,7 +145,6 @@ struct scmi_msg_resp_perf_describe_levels_v4 {
 struct perf_dom_info {
 	u32 id;
 	bool set_limits;
-	bool set_perf;
 	bool perf_limit_notify;
 	bool perf_level_notify;
 	bool perf_fastchannels;
@@ -154,7 +153,7 @@ struct perf_dom_info {
 	u32 sustained_freq_khz;
 	u32 sustained_perf_level;
 	u32 mult_factor;
-	char name[SCMI_MAX_STR_SIZE];
+	struct scmi_perf_domain_info info;
 	struct scmi_opp opp[MAX_OPPS];
 	struct scmi_fc_info *fc_info;
 	struct xarray opps_by_idx;
@@ -257,7 +256,7 @@ scmi_perf_domain_attributes_get(const struct scmi_protocol_handle *ph,
 		flags = le32_to_cpu(attr->flags);
 
 		dom_info->set_limits = SUPPORTS_SET_LIMITS(flags);
-		dom_info->set_perf = SUPPORTS_SET_PERF_LVL(flags);
+		dom_info->info.set_perf = SUPPORTS_SET_PERF_LVL(flags);
 		dom_info->perf_limit_notify = SUPPORTS_PERF_LIMIT_NOTIFY(flags);
 		dom_info->perf_level_notify = SUPPORTS_PERF_LEVEL_NOTIFY(flags);
 		dom_info->perf_fastchannels = SUPPORTS_PERF_FASTCHANNELS(flags);
@@ -276,7 +275,8 @@ scmi_perf_domain_attributes_get(const struct scmi_protocol_handle *ph,
 			dom_info->mult_factor =
 					(dom_info->sustained_freq_khz * 1000) /
 					dom_info->sustained_perf_level;
-		strscpy(dom_info->name, attr->name, SCMI_SHORT_NAME_MAX_SIZE);
+		strscpy(dom_info->info.name, attr->name,
+			SCMI_SHORT_NAME_MAX_SIZE);
 	}
 
 	ph->xops->xfer_put(ph, t);
@@ -288,7 +288,7 @@ scmi_perf_domain_attributes_get(const struct scmi_protocol_handle *ph,
 	if (!ret && PROTOCOL_REV_MAJOR(version) >= 0x3 &&
 	    SUPPORTS_EXTENDED_NAMES(flags))
 		ph->hops->extended_name_get(ph, PERF_DOMAIN_NAME_GET,
-					    dom_info->id, dom_info->name,
+					    dom_info->id, dom_info->info.name,
 					    SCMI_MAX_STR_SIZE);
 
 	if (dom_info->level_indexing_mode) {
@@ -430,6 +430,29 @@ static int scmi_perf_num_domains_get(const struct scmi_protocol_handle *ph)
 	return pi->num_domains;
 }
 
+static inline struct perf_dom_info *
+scmi_perf_domain_lookup(const struct scmi_protocol_handle *ph, u32 domain)
+{
+	struct scmi_perf_info *pi = ph->get_priv(ph);
+
+	if (domain >= pi->num_domains)
+		return ERR_PTR(-EINVAL);
+
+	return pi->dom_info + domain;
+}
+
+static const struct scmi_perf_domain_info *
+scmi_perf_info_get(const struct scmi_protocol_handle *ph, u32 domain)
+{
+	struct perf_dom_info *dom;
+
+	dom = scmi_perf_domain_lookup(ph, domain);
+	if (IS_ERR(dom))
+		return ERR_PTR(-EINVAL);
+
+	return &dom->info;
+}
+
 static int scmi_perf_msg_limits_set(const struct scmi_protocol_handle *ph,
 				    u32 domain, u32 max_perf, u32 min_perf)
 {
@@ -453,17 +476,6 @@ static int scmi_perf_msg_limits_set(const struct scmi_protocol_handle *ph,
 	return ret;
 }
 
-static inline struct perf_dom_info *
-scmi_perf_domain_lookup(const struct scmi_protocol_handle *ph, u32 domain)
-{
-	struct scmi_perf_info *pi = ph->get_priv(ph);
-
-	if (domain >= pi->num_domains)
-		return ERR_PTR(-EINVAL);
-
-	return pi->dom_info + domain;
-}
-
 static int __scmi_perf_limits_set(const struct scmi_protocol_handle *ph,
 				  struct perf_dom_info *dom, u32 max_perf,
 				  u32 min_perf)
@@ -819,7 +831,7 @@ static int scmi_dvfs_device_opps_add(const struct scmi_protocol_handle *ph,
 		}
 
 		dev_dbg(dev, "[%d][%s]:: Registered OPP[%d] %lu\n",
-			domain, dom->name, idx, freq);
+			domain, dom->info.name, idx, freq);
 	}
 	return 0;
 }
@@ -956,6 +968,7 @@ scmi_power_scale_get(const struct scmi_protocol_handle *ph)
 
 static const struct scmi_perf_proto_ops perf_proto_ops = {
 	.num_domains_get = scmi_perf_num_domains_get,
+	.info_get = scmi_perf_info_get,
 	.limits_set = scmi_perf_limits_set,
 	.limits_get = scmi_perf_limits_get,
 	.level_set = scmi_perf_level_set,
diff --git a/include/linux/scmi_protocol.h b/include/linux/scmi_protocol.h
index 14791dd54706..9108fe8ed3e3 100644
--- a/include/linux/scmi_protocol.h
+++ b/include/linux/scmi_protocol.h
@@ -106,11 +106,17 @@ struct scmi_clk_proto_ops {
 			      u8 oem_type, u32 oem_val, bool atomic);
 };
 
+struct scmi_perf_domain_info {
+	char name[SCMI_MAX_STR_SIZE];
+	bool set_perf;
+};
+
 /**
  * struct scmi_perf_proto_ops - represents the various operations provided
  *	by SCMI Performance Protocol
  *
  * @num_domains_get: gets the number of supported performance domains
+ * @info_get: get the information of a performance domain
  * @limits_set: sets limits on the performance level of a domain
  * @limits_get: gets limits on the performance level of a domain
  * @level_set: sets the performance level of a domain
@@ -131,6 +137,8 @@ struct scmi_clk_proto_ops {
  */
 struct scmi_perf_proto_ops {
 	int (*num_domains_get)(const struct scmi_protocol_handle *ph);
+	const struct scmi_perf_domain_info __must_check *(*info_get)
+		(const struct scmi_protocol_handle *ph, u32 domain);
 	int (*limits_set)(const struct scmi_protocol_handle *ph, u32 domain,
 			  u32 max_perf, u32 min_perf);
 	int (*limits_get)(const struct scmi_protocol_handle *ph, u32 domain,
-- 
cgit v1.2.3


From 39dfa5b9e1f0fa63b811a0a87f1c2fb9c76a0456 Mon Sep 17 00:00:00 2001
From: Ulf Hansson <ulf.hansson@linaro.org>
Date: Fri, 25 Aug 2023 13:26:24 +0200
Subject: firmware: arm_scmi: Align perf ops to use domain-id as in-parameter

Most scmi_perf_proto_ops are already using an "u32 domain" as an
in-parameter to indicate what performance domain we shall operate upon.
However, some of the ops are using a "struct device *dev", which means that
an additional OF parsing is needed each time the perf ops gets called, to
find the corresponding domain-id.

To avoid the above, but also to make the code more consistent, let's
replace the in-parameter "struct device *dev" with an "u32 domain". Note
that, this requires us to make some corresponding changes to the scmi
cpufreq driver, so let's do that too.

Signed-off-by: Ulf Hansson <ulf.hansson@linaro.org>
Acked-by: Viresh Kumar <viresh.kumar@linaro.org>
Link: https://lore.kernel.org/r/20230825112633.236607-5-ulf.hansson@linaro.org
Signed-off-by: Sudeep Holla <sudeep.holla@arm.com>
---
 drivers/cpufreq/scmi-cpufreq.c   | 14 +++++++++-----
 drivers/firmware/arm_scmi/perf.c | 22 ++++------------------
 include/linux/scmi_protocol.h    |  6 +++---
 3 files changed, 16 insertions(+), 26 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/cpufreq/scmi-cpufreq.c b/drivers/cpufreq/scmi-cpufreq.c
index 7d05d48c0337..125e8a8421fb 100644
--- a/drivers/cpufreq/scmi-cpufreq.c
+++ b/drivers/cpufreq/scmi-cpufreq.c
@@ -137,7 +137,7 @@ scmi_get_cpu_power(struct device *cpu_dev, unsigned long *power,
 
 static int scmi_cpufreq_init(struct cpufreq_policy *policy)
 {
-	int ret, nr_opp;
+	int ret, nr_opp, domain;
 	unsigned int latency;
 	struct device *cpu_dev;
 	struct scmi_data *priv;
@@ -149,6 +149,10 @@ static int scmi_cpufreq_init(struct cpufreq_policy *policy)
 		return -ENODEV;
 	}
 
+	domain = scmi_cpu_domain_id(cpu_dev);
+	if (domain < 0)
+		return domain;
+
 	priv = kzalloc(sizeof(*priv), GFP_KERNEL);
 	if (!priv)
 		return -ENOMEM;
@@ -187,7 +191,7 @@ static int scmi_cpufreq_init(struct cpufreq_policy *policy)
 	  */
 	nr_opp = dev_pm_opp_get_opp_count(cpu_dev);
 	if (nr_opp <= 0) {
-		ret = perf_ops->device_opps_add(ph, cpu_dev);
+		ret = perf_ops->device_opps_add(ph, cpu_dev, domain);
 		if (ret) {
 			dev_warn(cpu_dev, "failed to add opps to the device\n");
 			goto out_free_cpumask;
@@ -220,7 +224,7 @@ static int scmi_cpufreq_init(struct cpufreq_policy *policy)
 	}
 
 	priv->cpu_dev = cpu_dev;
-	priv->domain_id = scmi_cpu_domain_id(cpu_dev);
+	priv->domain_id = domain;
 
 	policy->driver_data = priv;
 	policy->freq_table = freq_table;
@@ -228,14 +232,14 @@ static int scmi_cpufreq_init(struct cpufreq_policy *policy)
 	/* SCMI allows DVFS request for any domain from any CPU */
 	policy->dvfs_possible_from_any_cpu = true;
 
-	latency = perf_ops->transition_latency_get(ph, cpu_dev);
+	latency = perf_ops->transition_latency_get(ph, domain);
 	if (!latency)
 		latency = CPUFREQ_ETERNAL;
 
 	policy->cpuinfo.transition_latency = latency;
 
 	policy->fast_switch_possible =
-		perf_ops->fast_switch_possible(ph, cpu_dev);
+		perf_ops->fast_switch_possible(ph, domain);
 
 	return 0;
 
diff --git a/drivers/firmware/arm_scmi/perf.c b/drivers/firmware/arm_scmi/perf.c
index d20bb6b8abfa..092b51cf9596 100644
--- a/drivers/firmware/arm_scmi/perf.c
+++ b/drivers/firmware/arm_scmi/perf.c
@@ -795,17 +795,13 @@ static int scmi_dev_domain_id(struct device *dev)
 }
 
 static int scmi_dvfs_device_opps_add(const struct scmi_protocol_handle *ph,
-				     struct device *dev)
+				     struct device *dev, u32 domain)
 {
-	int idx, ret, domain;
+	int idx, ret;
 	unsigned long freq;
 	struct scmi_opp *opp;
 	struct perf_dom_info *dom;
 
-	domain = scmi_dev_domain_id(dev);
-	if (domain < 0)
-		return -EINVAL;
-
 	dom = scmi_perf_domain_lookup(ph, domain);
 	if (IS_ERR(dom))
 		return PTR_ERR(dom);
@@ -838,15 +834,10 @@ static int scmi_dvfs_device_opps_add(const struct scmi_protocol_handle *ph,
 
 static int
 scmi_dvfs_transition_latency_get(const struct scmi_protocol_handle *ph,
-				 struct device *dev)
+				 u32 domain)
 {
-	int domain;
 	struct perf_dom_info *dom;
 
-	domain = scmi_dev_domain_id(dev);
-	if (domain < 0)
-		return -EINVAL;
-
 	dom = scmi_perf_domain_lookup(ph, domain);
 	if (IS_ERR(dom))
 		return PTR_ERR(dom);
@@ -942,15 +933,10 @@ static int scmi_dvfs_est_power_get(const struct scmi_protocol_handle *ph,
 }
 
 static bool scmi_fast_switch_possible(const struct scmi_protocol_handle *ph,
-				      struct device *dev)
+				      u32 domain)
 {
-	int domain;
 	struct perf_dom_info *dom;
 
-	domain = scmi_dev_domain_id(dev);
-	if (domain < 0)
-		return false;
-
 	dom = scmi_perf_domain_lookup(ph, domain);
 	if (IS_ERR(dom))
 		return false;
diff --git a/include/linux/scmi_protocol.h b/include/linux/scmi_protocol.h
index 9108fe8ed3e3..4e63766adc6f 100644
--- a/include/linux/scmi_protocol.h
+++ b/include/linux/scmi_protocol.h
@@ -149,9 +149,9 @@ struct scmi_perf_proto_ops {
 			 u32 *level, bool poll);
 	int (*device_domain_id)(struct device *dev);
 	int (*transition_latency_get)(const struct scmi_protocol_handle *ph,
-				      struct device *dev);
+				      u32 domain);
 	int (*device_opps_add)(const struct scmi_protocol_handle *ph,
-			       struct device *dev);
+			       struct device *dev, u32 domain);
 	int (*freq_set)(const struct scmi_protocol_handle *ph, u32 domain,
 			unsigned long rate, bool poll);
 	int (*freq_get)(const struct scmi_protocol_handle *ph, u32 domain,
@@ -159,7 +159,7 @@ struct scmi_perf_proto_ops {
 	int (*est_power_get)(const struct scmi_protocol_handle *ph, u32 domain,
 			     unsigned long *rate, unsigned long *power);
 	bool (*fast_switch_possible)(const struct scmi_protocol_handle *ph,
-				     struct device *dev);
+				     u32 domain);
 	enum scmi_power_scale (*power_scale_get)(const struct scmi_protocol_handle *ph);
 };
 
-- 
cgit v1.2.3


From 9b578d83629e13f81a53d1695a4f700cdb10f772 Mon Sep 17 00:00:00 2001
From: Ulf Hansson <ulf.hansson@linaro.org>
Date: Fri, 25 Aug 2023 13:26:25 +0200
Subject: firmware: arm_scmi: Drop redundant ->device_domain_id() from perf ops

There are no longer any users of the ->device_domain_id() ops in the
scmi_perf_proto_ops, therefore let's remove it.

Signed-off-by: Ulf Hansson <ulf.hansson@linaro.org>
Link: https://lore.kernel.org/r/20230825112633.236607-6-ulf.hansson@linaro.org
Signed-off-by: Sudeep Holla <sudeep.holla@arm.com>
---
 drivers/firmware/arm_scmi/perf.c | 13 -------------
 include/linux/scmi_protocol.h    |  2 --
 2 files changed, 15 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/firmware/arm_scmi/perf.c b/drivers/firmware/arm_scmi/perf.c
index 092b51cf9596..9eb58df9124d 100644
--- a/drivers/firmware/arm_scmi/perf.c
+++ b/drivers/firmware/arm_scmi/perf.c
@@ -782,18 +782,6 @@ static void scmi_perf_domain_init_fc(const struct scmi_protocol_handle *ph,
 	*p_fc = fc;
 }
 
-/* Device specific ops */
-static int scmi_dev_domain_id(struct device *dev)
-{
-	struct of_phandle_args clkspec;
-
-	if (of_parse_phandle_with_args(dev->of_node, "clocks", "#clock-cells",
-				       0, &clkspec))
-		return -EINVAL;
-
-	return clkspec.args[0];
-}
-
 static int scmi_dvfs_device_opps_add(const struct scmi_protocol_handle *ph,
 				     struct device *dev, u32 domain)
 {
@@ -959,7 +947,6 @@ static const struct scmi_perf_proto_ops perf_proto_ops = {
 	.limits_get = scmi_perf_limits_get,
 	.level_set = scmi_perf_level_set,
 	.level_get = scmi_perf_level_get,
-	.device_domain_id = scmi_dev_domain_id,
 	.transition_latency_get = scmi_dvfs_transition_latency_get,
 	.device_opps_add = scmi_dvfs_device_opps_add,
 	.freq_set = scmi_dvfs_freq_set,
diff --git a/include/linux/scmi_protocol.h b/include/linux/scmi_protocol.h
index 4e63766adc6f..27bfa5a65b45 100644
--- a/include/linux/scmi_protocol.h
+++ b/include/linux/scmi_protocol.h
@@ -121,7 +121,6 @@ struct scmi_perf_domain_info {
  * @limits_get: gets limits on the performance level of a domain
  * @level_set: sets the performance level of a domain
  * @level_get: gets the performance level of a domain
- * @device_domain_id: gets the scmi domain id for a given device
  * @transition_latency_get: gets the DVFS transition latency for a given device
  * @device_opps_add: adds all the OPPs for a given device
  * @freq_set: sets the frequency for a given device using sustained frequency
@@ -147,7 +146,6 @@ struct scmi_perf_proto_ops {
 			 u32 level, bool poll);
 	int (*level_get)(const struct scmi_protocol_handle *ph, u32 domain,
 			 u32 *level, bool poll);
-	int (*device_domain_id)(struct device *dev);
 	int (*transition_latency_get)(const struct scmi_protocol_handle *ph,
 				      u32 domain);
 	int (*device_opps_add)(const struct scmi_protocol_handle *ph,
-- 
cgit v1.2.3


From 3dd91515ef43dd43e32e2a84e4bd881b64fb33ae Mon Sep 17 00:00:00 2001
From: Ulf Hansson <ulf.hansson@linaro.org>
Date: Fri, 25 Aug 2023 13:26:32 +0200
Subject: PM: domains: Allow genpd providers to manage OPP tables directly by
 its FW

In some cases the OPP tables aren't specified in device tree, but rather
encoded in the FW. To allow a genpd provider to specify them dynamically
instead, let's add a new genpd flag, GENPD_FLAG_OPP_TABLE_FW.

Signed-off-by: Ulf Hansson <ulf.hansson@linaro.org>
Link: https://lore.kernel.org/r/20230825112633.236607-13-ulf.hansson@linaro.org
Signed-off-by: Sudeep Holla <sudeep.holla@arm.com>
---
 drivers/base/power/domain.c | 11 ++++++-----
 include/linux/pm_domain.h   |  5 +++++
 2 files changed, 11 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/base/power/domain.c b/drivers/base/power/domain.c
index 5cb2023581d4..c74edf80417f 100644
--- a/drivers/base/power/domain.c
+++ b/drivers/base/power/domain.c
@@ -130,6 +130,7 @@ static const struct genpd_lock_ops genpd_spin_ops = {
 #define genpd_is_active_wakeup(genpd)	(genpd->flags & GENPD_FLAG_ACTIVE_WAKEUP)
 #define genpd_is_cpu_domain(genpd)	(genpd->flags & GENPD_FLAG_CPU_DOMAIN)
 #define genpd_is_rpm_always_on(genpd)	(genpd->flags & GENPD_FLAG_RPM_ALWAYS_ON)
+#define genpd_is_opp_table_fw(genpd)	(genpd->flags & GENPD_FLAG_OPP_TABLE_FW)
 
 static inline bool irq_safe_dev_in_sleep_domain(struct device *dev,
 		const struct generic_pm_domain *genpd)
@@ -2328,7 +2329,7 @@ int of_genpd_add_provider_simple(struct device_node *np,
 	genpd->dev.of_node = np;
 
 	/* Parse genpd OPP table */
-	if (genpd->set_performance_state) {
+	if (!genpd_is_opp_table_fw(genpd) && genpd->set_performance_state) {
 		ret = dev_pm_opp_of_add_table(&genpd->dev);
 		if (ret)
 			return dev_err_probe(&genpd->dev, ret, "Failed to add OPP table\n");
@@ -2343,7 +2344,7 @@ int of_genpd_add_provider_simple(struct device_node *np,
 
 	ret = genpd_add_provider(np, genpd_xlate_simple, genpd);
 	if (ret) {
-		if (genpd->set_performance_state) {
+		if (!genpd_is_opp_table_fw(genpd) && genpd->set_performance_state) {
 			dev_pm_opp_put_opp_table(genpd->opp_table);
 			dev_pm_opp_of_remove_table(&genpd->dev);
 		}
@@ -2387,7 +2388,7 @@ int of_genpd_add_provider_onecell(struct device_node *np,
 		genpd->dev.of_node = np;
 
 		/* Parse genpd OPP table */
-		if (genpd->set_performance_state) {
+		if (!genpd_is_opp_table_fw(genpd) && genpd->set_performance_state) {
 			ret = dev_pm_opp_of_add_table_indexed(&genpd->dev, i);
 			if (ret) {
 				dev_err_probe(&genpd->dev, ret,
@@ -2423,7 +2424,7 @@ error:
 		genpd->provider = NULL;
 		genpd->has_provider = false;
 
-		if (genpd->set_performance_state) {
+		if (!genpd_is_opp_table_fw(genpd) && genpd->set_performance_state) {
 			dev_pm_opp_put_opp_table(genpd->opp_table);
 			dev_pm_opp_of_remove_table(&genpd->dev);
 		}
@@ -2455,7 +2456,7 @@ void of_genpd_del_provider(struct device_node *np)
 				if (gpd->provider == &np->fwnode) {
 					gpd->has_provider = false;
 
-					if (!gpd->set_performance_state)
+					if (genpd_is_opp_table_fw(gpd) || !gpd->set_performance_state)
 						continue;
 
 					dev_pm_opp_put_opp_table(gpd->opp_table);
diff --git a/include/linux/pm_domain.h b/include/linux/pm_domain.h
index f776fb93eaa0..05ad8cefdff1 100644
--- a/include/linux/pm_domain.h
+++ b/include/linux/pm_domain.h
@@ -61,6 +61,10 @@
  * GENPD_FLAG_MIN_RESIDENCY:	Enable the genpd governor to consider its
  *				components' next wakeup when determining the
  *				optimal idle state.
+ *
+ * GENPD_FLAG_OPP_TABLE_FW:	The genpd provider supports performance states,
+ *				but its corresponding OPP tables are not
+ *				described in DT, but are given directly by FW.
  */
 #define GENPD_FLAG_PM_CLK	 (1U << 0)
 #define GENPD_FLAG_IRQ_SAFE	 (1U << 1)
@@ -69,6 +73,7 @@
 #define GENPD_FLAG_CPU_DOMAIN	 (1U << 4)
 #define GENPD_FLAG_RPM_ALWAYS_ON (1U << 5)
 #define GENPD_FLAG_MIN_RESIDENCY (1U << 6)
+#define GENPD_FLAG_OPP_TABLE_FW	 (1U << 7)
 
 enum gpd_status {
 	GENPD_STATE_ON = 0,	/* PM domain is on */
-- 
cgit v1.2.3


From 9f6c532f59b20580acf8ede9409c9b8dce6e74e1 Mon Sep 17 00:00:00 2001
From: "peterz@infradead.org" <peterz@infradead.org>
Date: Thu, 21 Sep 2023 12:45:10 +0200
Subject: futex: Add sys_futex_wake()

To complement sys_futex_waitv() add sys_futex_wake(). This syscall
implements what was previously known as FUTEX_WAKE_BITSET except it
uses 'unsigned long' for the bitmask and takes FUTEX2 flags.

The 'unsigned long' allows FUTEX2_SIZE_U64 on 64bit platforms.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Thomas Gleixner <tglx@linutronix.de>
Acked-by: Geert Uytterhoeven <geert@linux-m68k.org>
Link: https://lore.kernel.org/r/20230921105247.936205525@noisy.programming.kicks-ass.net
---
 arch/alpha/kernel/syscalls/syscall.tbl      |  1 +
 arch/arm/tools/syscall.tbl                  |  1 +
 arch/arm64/include/asm/unistd.h             |  2 +-
 arch/arm64/include/asm/unistd32.h           |  2 ++
 arch/ia64/kernel/syscalls/syscall.tbl       |  1 +
 arch/m68k/kernel/syscalls/syscall.tbl       |  1 +
 arch/microblaze/kernel/syscalls/syscall.tbl |  1 +
 arch/mips/kernel/syscalls/syscall_n32.tbl   |  1 +
 arch/mips/kernel/syscalls/syscall_n64.tbl   |  1 +
 arch/mips/kernel/syscalls/syscall_o32.tbl   |  1 +
 arch/parisc/kernel/syscalls/syscall.tbl     |  1 +
 arch/powerpc/kernel/syscalls/syscall.tbl    |  1 +
 arch/s390/kernel/syscalls/syscall.tbl       |  1 +
 arch/sh/kernel/syscalls/syscall.tbl         |  1 +
 arch/sparc/kernel/syscalls/syscall.tbl      |  1 +
 arch/x86/entry/syscalls/syscall_32.tbl      |  1 +
 arch/x86/entry/syscalls/syscall_64.tbl      |  1 +
 arch/xtensa/kernel/syscalls/syscall.tbl     |  1 +
 include/linux/syscalls.h                    |  3 +++
 include/uapi/asm-generic/unistd.h           |  4 +++-
 kernel/futex/syscalls.c                     | 30 +++++++++++++++++++++++++++++
 kernel/sys_ni.c                             |  1 +
 22 files changed, 56 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/arch/alpha/kernel/syscalls/syscall.tbl b/arch/alpha/kernel/syscalls/syscall.tbl
index ad37569d0507..3b86519d68e4 100644
--- a/arch/alpha/kernel/syscalls/syscall.tbl
+++ b/arch/alpha/kernel/syscalls/syscall.tbl
@@ -492,3 +492,4 @@
 560	common	set_mempolicy_home_node		sys_ni_syscall
 561	common	cachestat			sys_cachestat
 562	common	fchmodat2			sys_fchmodat2
+563	common	futex_wake			sys_futex_wake
diff --git a/arch/arm/tools/syscall.tbl b/arch/arm/tools/syscall.tbl
index c572d6c3dee0..714abeb1e6fa 100644
--- a/arch/arm/tools/syscall.tbl
+++ b/arch/arm/tools/syscall.tbl
@@ -466,3 +466,4 @@
 450	common	set_mempolicy_home_node		sys_set_mempolicy_home_node
 451	common	cachestat			sys_cachestat
 452	common	fchmodat2			sys_fchmodat2
+454	common	futex_wake			sys_futex_wake
diff --git a/arch/arm64/include/asm/unistd.h b/arch/arm64/include/asm/unistd.h
index bd77253b62e0..63a8a9c4abc1 100644
--- a/arch/arm64/include/asm/unistd.h
+++ b/arch/arm64/include/asm/unistd.h
@@ -39,7 +39,7 @@
 #define __ARM_NR_compat_set_tls		(__ARM_NR_COMPAT_BASE + 5)
 #define __ARM_NR_COMPAT_END		(__ARM_NR_COMPAT_BASE + 0x800)
 
-#define __NR_compat_syscalls		453
+#define __NR_compat_syscalls		455
 #endif
 
 #define __ARCH_WANT_SYS_CLONE
diff --git a/arch/arm64/include/asm/unistd32.h b/arch/arm64/include/asm/unistd32.h
index 78b68311ec81..68974683737b 100644
--- a/arch/arm64/include/asm/unistd32.h
+++ b/arch/arm64/include/asm/unistd32.h
@@ -911,6 +911,8 @@ __SYSCALL(__NR_set_mempolicy_home_node, sys_set_mempolicy_home_node)
 __SYSCALL(__NR_cachestat, sys_cachestat)
 #define __NR_fchmodat2 452
 __SYSCALL(__NR_fchmodat2, sys_fchmodat2)
+#define __NR_futex_wake 454
+__SYSCALL(__NR_futex_wake, sys_futex_wake)
 
 /*
  * Please add new compat syscalls above this comment and update
diff --git a/arch/ia64/kernel/syscalls/syscall.tbl b/arch/ia64/kernel/syscalls/syscall.tbl
index 83d8609aec03..cd50247508e6 100644
--- a/arch/ia64/kernel/syscalls/syscall.tbl
+++ b/arch/ia64/kernel/syscalls/syscall.tbl
@@ -373,3 +373,4 @@
 450	common	set_mempolicy_home_node		sys_set_mempolicy_home_node
 451	common	cachestat			sys_cachestat
 452	common	fchmodat2			sys_fchmodat2
+454	common	futex_wake			sys_futex_wake
diff --git a/arch/m68k/kernel/syscalls/syscall.tbl b/arch/m68k/kernel/syscalls/syscall.tbl
index 259ceb125367..21eb35c693e1 100644
--- a/arch/m68k/kernel/syscalls/syscall.tbl
+++ b/arch/m68k/kernel/syscalls/syscall.tbl
@@ -452,3 +452,4 @@
 450	common	set_mempolicy_home_node		sys_set_mempolicy_home_node
 451	common	cachestat			sys_cachestat
 452	common	fchmodat2			sys_fchmodat2
+454	common	futex_wake			sys_futex_wake
diff --git a/arch/microblaze/kernel/syscalls/syscall.tbl b/arch/microblaze/kernel/syscalls/syscall.tbl
index a3798c2637fd..3a4e8513a8e1 100644
--- a/arch/microblaze/kernel/syscalls/syscall.tbl
+++ b/arch/microblaze/kernel/syscalls/syscall.tbl
@@ -458,3 +458,4 @@
 450	common	set_mempolicy_home_node		sys_set_mempolicy_home_node
 451	common	cachestat			sys_cachestat
 452	common	fchmodat2			sys_fchmodat2
+454	common	futex_wake			sys_futex_wake
diff --git a/arch/mips/kernel/syscalls/syscall_n32.tbl b/arch/mips/kernel/syscalls/syscall_n32.tbl
index 152034b8e0a0..6883ea3b830d 100644
--- a/arch/mips/kernel/syscalls/syscall_n32.tbl
+++ b/arch/mips/kernel/syscalls/syscall_n32.tbl
@@ -391,3 +391,4 @@
 450	n32	set_mempolicy_home_node		sys_set_mempolicy_home_node
 451	n32	cachestat			sys_cachestat
 452	n32	fchmodat2			sys_fchmodat2
+454	n32	futex_wake			sys_futex_wake
diff --git a/arch/mips/kernel/syscalls/syscall_n64.tbl b/arch/mips/kernel/syscalls/syscall_n64.tbl
index cb5e757f6621..48bc0fb4e3dc 100644
--- a/arch/mips/kernel/syscalls/syscall_n64.tbl
+++ b/arch/mips/kernel/syscalls/syscall_n64.tbl
@@ -367,3 +367,4 @@
 450	common	set_mempolicy_home_node		sys_set_mempolicy_home_node
 451	n64	cachestat			sys_cachestat
 452	n64	fchmodat2			sys_fchmodat2
+454	n64	futex_wake			sys_futex_wake
diff --git a/arch/mips/kernel/syscalls/syscall_o32.tbl b/arch/mips/kernel/syscalls/syscall_o32.tbl
index 1a646813afdc..a92625f5bad8 100644
--- a/arch/mips/kernel/syscalls/syscall_o32.tbl
+++ b/arch/mips/kernel/syscalls/syscall_o32.tbl
@@ -440,3 +440,4 @@
 450	o32	set_mempolicy_home_node		sys_set_mempolicy_home_node
 451	o32	cachestat			sys_cachestat
 452	o32	fchmodat2			sys_fchmodat2
+454	o32	futex_wake			sys_futex_wake
diff --git a/arch/parisc/kernel/syscalls/syscall.tbl b/arch/parisc/kernel/syscalls/syscall.tbl
index e97c175b56f9..57faa9786ffe 100644
--- a/arch/parisc/kernel/syscalls/syscall.tbl
+++ b/arch/parisc/kernel/syscalls/syscall.tbl
@@ -451,3 +451,4 @@
 450	common	set_mempolicy_home_node		sys_set_mempolicy_home_node
 451	common	cachestat			sys_cachestat
 452	common	fchmodat2			sys_fchmodat2
+454	common	futex_wake			sys_futex_wake
diff --git a/arch/powerpc/kernel/syscalls/syscall.tbl b/arch/powerpc/kernel/syscalls/syscall.tbl
index 20e50586e8a2..e6c6ed6b30ee 100644
--- a/arch/powerpc/kernel/syscalls/syscall.tbl
+++ b/arch/powerpc/kernel/syscalls/syscall.tbl
@@ -539,3 +539,4 @@
 450 	nospu	set_mempolicy_home_node		sys_set_mempolicy_home_node
 451	common	cachestat			sys_cachestat
 452	common	fchmodat2			sys_fchmodat2
+454	common	futex_wake			sys_futex_wake
diff --git a/arch/s390/kernel/syscalls/syscall.tbl b/arch/s390/kernel/syscalls/syscall.tbl
index 0122cc156952..754720154dc1 100644
--- a/arch/s390/kernel/syscalls/syscall.tbl
+++ b/arch/s390/kernel/syscalls/syscall.tbl
@@ -455,3 +455,4 @@
 450  common	set_mempolicy_home_node	sys_set_mempolicy_home_node	sys_set_mempolicy_home_node
 451  common	cachestat		sys_cachestat			sys_cachestat
 452  common	fchmodat2		sys_fchmodat2			sys_fchmodat2
+454  common	futex_wake		sys_futex_wake			sys_futex_wake
diff --git a/arch/sh/kernel/syscalls/syscall.tbl b/arch/sh/kernel/syscalls/syscall.tbl
index e90d585c4d3e..902a997e7ec6 100644
--- a/arch/sh/kernel/syscalls/syscall.tbl
+++ b/arch/sh/kernel/syscalls/syscall.tbl
@@ -455,3 +455,4 @@
 450	common	set_mempolicy_home_node		sys_set_mempolicy_home_node
 451	common	cachestat			sys_cachestat
 452	common	fchmodat2			sys_fchmodat2
+454	common	futex_wake			sys_futex_wake
diff --git a/arch/sparc/kernel/syscalls/syscall.tbl b/arch/sparc/kernel/syscalls/syscall.tbl
index 4ed06c71c43f..8a1f887c8be6 100644
--- a/arch/sparc/kernel/syscalls/syscall.tbl
+++ b/arch/sparc/kernel/syscalls/syscall.tbl
@@ -498,3 +498,4 @@
 450	common	set_mempolicy_home_node		sys_set_mempolicy_home_node
 451	common	cachestat			sys_cachestat
 452	common	fchmodat2			sys_fchmodat2
+454	common	futex_wake			sys_futex_wake
diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl
index 2d0b1bd866ea..9e81323979b0 100644
--- a/arch/x86/entry/syscalls/syscall_32.tbl
+++ b/arch/x86/entry/syscalls/syscall_32.tbl
@@ -457,3 +457,4 @@
 450	i386	set_mempolicy_home_node		sys_set_mempolicy_home_node
 451	i386	cachestat		sys_cachestat
 452	i386	fchmodat2		sys_fchmodat2
+454	i386	futex_wake		sys_futex_wake
diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl
index 1d6eee30eceb..d10a6003a7c9 100644
--- a/arch/x86/entry/syscalls/syscall_64.tbl
+++ b/arch/x86/entry/syscalls/syscall_64.tbl
@@ -375,6 +375,7 @@
 451	common	cachestat		sys_cachestat
 452	common	fchmodat2		sys_fchmodat2
 453	64	map_shadow_stack	sys_map_shadow_stack
+454	common	futex_wake		sys_futex_wake
 
 #
 # Due to a historical design error, certain syscalls are numbered differently
diff --git a/arch/xtensa/kernel/syscalls/syscall.tbl b/arch/xtensa/kernel/syscalls/syscall.tbl
index fc1a4f3c81d9..4e511bfd4b8f 100644
--- a/arch/xtensa/kernel/syscalls/syscall.tbl
+++ b/arch/xtensa/kernel/syscalls/syscall.tbl
@@ -423,3 +423,4 @@
 450	common	set_mempolicy_home_node		sys_set_mempolicy_home_node
 451	common	cachestat			sys_cachestat
 452	common	fchmodat2			sys_fchmodat2
+454	common	futex_wake			sys_futex_wake
diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index 22bc6bc147f8..e174ed86da1d 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -549,6 +549,9 @@ asmlinkage long sys_set_robust_list(struct robust_list_head __user *head,
 asmlinkage long sys_futex_waitv(struct futex_waitv *waiters,
 				unsigned int nr_futexes, unsigned int flags,
 				struct __kernel_timespec __user *timeout, clockid_t clockid);
+
+asmlinkage long sys_futex_wake(void __user *uaddr, unsigned long mask, int nr, unsigned int flags);
+
 asmlinkage long sys_nanosleep(struct __kernel_timespec __user *rqtp,
 			      struct __kernel_timespec __user *rmtp);
 asmlinkage long sys_nanosleep_time32(struct old_timespec32 __user *rqtp,
diff --git a/include/uapi/asm-generic/unistd.h b/include/uapi/asm-generic/unistd.h
index abe087c53b4b..f5454e6f4c6f 100644
--- a/include/uapi/asm-generic/unistd.h
+++ b/include/uapi/asm-generic/unistd.h
@@ -822,9 +822,11 @@ __SYSCALL(__NR_cachestat, sys_cachestat)
 
 #define __NR_fchmodat2 452
 __SYSCALL(__NR_fchmodat2, sys_fchmodat2)
+#define __NR_futex_wake 454
+__SYSCALL(__NR_futex_wake, sys_futex_wake)
 
 #undef __NR_syscalls
-#define __NR_syscalls 453
+#define __NR_syscalls 455
 
 /*
  * 32 bit systems traditionally used different
diff --git a/kernel/futex/syscalls.c b/kernel/futex/syscalls.c
index 2339f9ccee7f..7049a52ef68e 100644
--- a/kernel/futex/syscalls.c
+++ b/kernel/futex/syscalls.c
@@ -306,6 +306,36 @@ destroy_timer:
 	return ret;
 }
 
+/*
+ * sys_futex_wake - Wake a number of futexes
+ * @uaddr:	Address of the futex(es) to wake
+ * @mask:	bitmask
+ * @nr:		Number of the futexes to wake
+ * @flags:	FUTEX2 flags
+ *
+ * Identical to the traditional FUTEX_WAKE_BITSET op, except it is part of the
+ * futex2 family of calls.
+ */
+
+SYSCALL_DEFINE4(futex_wake,
+		void __user *, uaddr,
+		unsigned long, mask,
+		int, nr,
+		unsigned int, flags)
+{
+	if (flags & ~FUTEX2_VALID_MASK)
+		return -EINVAL;
+
+	flags = futex2_to_flags(flags);
+	if (!futex_flags_valid(flags))
+		return -EINVAL;
+
+	if (!futex_validate_input(flags, mask))
+		return -EINVAL;
+
+	return futex_wake(uaddr, flags, nr, mask);
+}
+
 #ifdef CONFIG_COMPAT
 COMPAT_SYSCALL_DEFINE2(set_robust_list,
 		struct compat_robust_list_head __user *, head,
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index e137c1385c56..983c0583c627 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -87,6 +87,7 @@ COND_SYSCALL_COMPAT(set_robust_list);
 COND_SYSCALL(get_robust_list);
 COND_SYSCALL_COMPAT(get_robust_list);
 COND_SYSCALL(futex_waitv);
+COND_SYSCALL(futex_wake);
 COND_SYSCALL(kexec_load);
 COND_SYSCALL_COMPAT(kexec_load);
 COND_SYSCALL(init_module);
-- 
cgit v1.2.3


From cb8c4312afca1b2dc64107e7e7cea81911055612 Mon Sep 17 00:00:00 2001
From: "peterz@infradead.org" <peterz@infradead.org>
Date: Thu, 21 Sep 2023 12:45:12 +0200
Subject: futex: Add sys_futex_wait()

To complement sys_futex_waitv()/wake(), add sys_futex_wait(). This
syscall implements what was previously known as FUTEX_WAIT_BITSET
except it uses 'unsigned long' for the value and bitmask arguments,
takes timespec and clockid_t arguments for the absolute timeout and
uses FUTEX2 flags.

The 'unsigned long' allows FUTEX2_SIZE_U64 on 64bit platforms.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Thomas Gleixner <tglx@linutronix.de>
Acked-by: Geert Uytterhoeven <geert@linux-m68k.org>
Link: https://lore.kernel.org/r/20230921105248.164324363@noisy.programming.kicks-ass.net
---
 arch/alpha/kernel/syscalls/syscall.tbl      |   1 +
 arch/arm/tools/syscall.tbl                  |   1 +
 arch/arm64/include/asm/unistd.h             |   2 +-
 arch/arm64/include/asm/unistd32.h           |   2 +
 arch/ia64/kernel/syscalls/syscall.tbl       |   1 +
 arch/m68k/kernel/syscalls/syscall.tbl       |   1 +
 arch/microblaze/kernel/syscalls/syscall.tbl |   1 +
 arch/mips/kernel/syscalls/syscall_n32.tbl   |   1 +
 arch/mips/kernel/syscalls/syscall_n64.tbl   |   1 +
 arch/mips/kernel/syscalls/syscall_o32.tbl   |   1 +
 arch/parisc/kernel/syscalls/syscall.tbl     |   1 +
 arch/powerpc/kernel/syscalls/syscall.tbl    |   1 +
 arch/s390/kernel/syscalls/syscall.tbl       |   1 +
 arch/sh/kernel/syscalls/syscall.tbl         |   1 +
 arch/sparc/kernel/syscalls/syscall.tbl      |   1 +
 arch/x86/entry/syscalls/syscall_32.tbl      |   1 +
 arch/x86/entry/syscalls/syscall_64.tbl      |   1 +
 arch/xtensa/kernel/syscalls/syscall.tbl     |   1 +
 include/linux/syscalls.h                    |   4 +
 include/uapi/asm-generic/unistd.h           |   4 +-
 kernel/futex/futex.h                        |   3 +
 kernel/futex/syscalls.c                     | 120 +++++++++++++++++++++-------
 kernel/futex/waitwake.c                     |  61 ++++++++------
 kernel/sys_ni.c                             |   1 +
 24 files changed, 156 insertions(+), 57 deletions(-)

(limited to 'include/linux')

diff --git a/arch/alpha/kernel/syscalls/syscall.tbl b/arch/alpha/kernel/syscalls/syscall.tbl
index 3b86519d68e4..c49f12fd264e 100644
--- a/arch/alpha/kernel/syscalls/syscall.tbl
+++ b/arch/alpha/kernel/syscalls/syscall.tbl
@@ -493,3 +493,4 @@
 561	common	cachestat			sys_cachestat
 562	common	fchmodat2			sys_fchmodat2
 563	common	futex_wake			sys_futex_wake
+564	common	futex_wait			sys_futex_wait
diff --git a/arch/arm/tools/syscall.tbl b/arch/arm/tools/syscall.tbl
index 714abeb1e6fa..a6cf56277327 100644
--- a/arch/arm/tools/syscall.tbl
+++ b/arch/arm/tools/syscall.tbl
@@ -467,3 +467,4 @@
 451	common	cachestat			sys_cachestat
 452	common	fchmodat2			sys_fchmodat2
 454	common	futex_wake			sys_futex_wake
+455	common	futex_wait			sys_futex_wait
diff --git a/arch/arm64/include/asm/unistd.h b/arch/arm64/include/asm/unistd.h
index 63a8a9c4abc1..f33190f17ebb 100644
--- a/arch/arm64/include/asm/unistd.h
+++ b/arch/arm64/include/asm/unistd.h
@@ -39,7 +39,7 @@
 #define __ARM_NR_compat_set_tls		(__ARM_NR_COMPAT_BASE + 5)
 #define __ARM_NR_COMPAT_END		(__ARM_NR_COMPAT_BASE + 0x800)
 
-#define __NR_compat_syscalls		455
+#define __NR_compat_syscalls		456
 #endif
 
 #define __ARCH_WANT_SYS_CLONE
diff --git a/arch/arm64/include/asm/unistd32.h b/arch/arm64/include/asm/unistd32.h
index 68974683737b..6e7d37282ba1 100644
--- a/arch/arm64/include/asm/unistd32.h
+++ b/arch/arm64/include/asm/unistd32.h
@@ -913,6 +913,8 @@ __SYSCALL(__NR_cachestat, sys_cachestat)
 __SYSCALL(__NR_fchmodat2, sys_fchmodat2)
 #define __NR_futex_wake 454
 __SYSCALL(__NR_futex_wake, sys_futex_wake)
+#define __NR_futex_wait 455
+__SYSCALL(__NR_futex_wait, sys_futex_wait)
 
 /*
  * Please add new compat syscalls above this comment and update
diff --git a/arch/ia64/kernel/syscalls/syscall.tbl b/arch/ia64/kernel/syscalls/syscall.tbl
index cd50247508e6..4043f0c55170 100644
--- a/arch/ia64/kernel/syscalls/syscall.tbl
+++ b/arch/ia64/kernel/syscalls/syscall.tbl
@@ -374,3 +374,4 @@
 451	common	cachestat			sys_cachestat
 452	common	fchmodat2			sys_fchmodat2
 454	common	futex_wake			sys_futex_wake
+455	common	futex_wait			sys_futex_wait
diff --git a/arch/m68k/kernel/syscalls/syscall.tbl b/arch/m68k/kernel/syscalls/syscall.tbl
index 21eb35c693e1..24841674acc5 100644
--- a/arch/m68k/kernel/syscalls/syscall.tbl
+++ b/arch/m68k/kernel/syscalls/syscall.tbl
@@ -453,3 +453,4 @@
 451	common	cachestat			sys_cachestat
 452	common	fchmodat2			sys_fchmodat2
 454	common	futex_wake			sys_futex_wake
+455	common	futex_wait			sys_futex_wait
diff --git a/arch/microblaze/kernel/syscalls/syscall.tbl b/arch/microblaze/kernel/syscalls/syscall.tbl
index 3a4e8513a8e1..f03927ab0220 100644
--- a/arch/microblaze/kernel/syscalls/syscall.tbl
+++ b/arch/microblaze/kernel/syscalls/syscall.tbl
@@ -459,3 +459,4 @@
 451	common	cachestat			sys_cachestat
 452	common	fchmodat2			sys_fchmodat2
 454	common	futex_wake			sys_futex_wake
+455	common	futex_wait			sys_futex_wait
diff --git a/arch/mips/kernel/syscalls/syscall_n32.tbl b/arch/mips/kernel/syscalls/syscall_n32.tbl
index 6883ea3b830d..dbb5edfb667b 100644
--- a/arch/mips/kernel/syscalls/syscall_n32.tbl
+++ b/arch/mips/kernel/syscalls/syscall_n32.tbl
@@ -392,3 +392,4 @@
 451	n32	cachestat			sys_cachestat
 452	n32	fchmodat2			sys_fchmodat2
 454	n32	futex_wake			sys_futex_wake
+455	n32	futex_wait			sys_futex_wait
diff --git a/arch/mips/kernel/syscalls/syscall_n64.tbl b/arch/mips/kernel/syscalls/syscall_n64.tbl
index 48bc0fb4e3dc..faff8dfd2983 100644
--- a/arch/mips/kernel/syscalls/syscall_n64.tbl
+++ b/arch/mips/kernel/syscalls/syscall_n64.tbl
@@ -368,3 +368,4 @@
 451	n64	cachestat			sys_cachestat
 452	n64	fchmodat2			sys_fchmodat2
 454	n64	futex_wake			sys_futex_wake
+455	n64	futex_wait			sys_futex_wait
diff --git a/arch/mips/kernel/syscalls/syscall_o32.tbl b/arch/mips/kernel/syscalls/syscall_o32.tbl
index a92625f5bad8..542f75605b3e 100644
--- a/arch/mips/kernel/syscalls/syscall_o32.tbl
+++ b/arch/mips/kernel/syscalls/syscall_o32.tbl
@@ -441,3 +441,4 @@
 451	o32	cachestat			sys_cachestat
 452	o32	fchmodat2			sys_fchmodat2
 454	o32	futex_wake			sys_futex_wake
+455	o32	futex_wait			sys_futex_wait
diff --git a/arch/parisc/kernel/syscalls/syscall.tbl b/arch/parisc/kernel/syscalls/syscall.tbl
index 57faa9786ffe..8e50e89551f7 100644
--- a/arch/parisc/kernel/syscalls/syscall.tbl
+++ b/arch/parisc/kernel/syscalls/syscall.tbl
@@ -452,3 +452,4 @@
 451	common	cachestat			sys_cachestat
 452	common	fchmodat2			sys_fchmodat2
 454	common	futex_wake			sys_futex_wake
+455	common	futex_wait			sys_futex_wait
diff --git a/arch/powerpc/kernel/syscalls/syscall.tbl b/arch/powerpc/kernel/syscalls/syscall.tbl
index e6c6ed6b30ee..ad33a9993a6a 100644
--- a/arch/powerpc/kernel/syscalls/syscall.tbl
+++ b/arch/powerpc/kernel/syscalls/syscall.tbl
@@ -540,3 +540,4 @@
 451	common	cachestat			sys_cachestat
 452	common	fchmodat2			sys_fchmodat2
 454	common	futex_wake			sys_futex_wake
+455	common	futex_wait			sys_futex_wait
diff --git a/arch/s390/kernel/syscalls/syscall.tbl b/arch/s390/kernel/syscalls/syscall.tbl
index 754720154dc1..418853fd2a6b 100644
--- a/arch/s390/kernel/syscalls/syscall.tbl
+++ b/arch/s390/kernel/syscalls/syscall.tbl
@@ -456,3 +456,4 @@
 451  common	cachestat		sys_cachestat			sys_cachestat
 452  common	fchmodat2		sys_fchmodat2			sys_fchmodat2
 454  common	futex_wake		sys_futex_wake			sys_futex_wake
+455  common	futex_wait		sys_futex_wait			sys_futex_wait
diff --git a/arch/sh/kernel/syscalls/syscall.tbl b/arch/sh/kernel/syscalls/syscall.tbl
index 902a997e7ec6..8ef9557d2779 100644
--- a/arch/sh/kernel/syscalls/syscall.tbl
+++ b/arch/sh/kernel/syscalls/syscall.tbl
@@ -456,3 +456,4 @@
 451	common	cachestat			sys_cachestat
 452	common	fchmodat2			sys_fchmodat2
 454	common	futex_wake			sys_futex_wake
+455	common	futex_wait			sys_futex_wait
diff --git a/arch/sparc/kernel/syscalls/syscall.tbl b/arch/sparc/kernel/syscalls/syscall.tbl
index 8a1f887c8be6..df59a9d5f109 100644
--- a/arch/sparc/kernel/syscalls/syscall.tbl
+++ b/arch/sparc/kernel/syscalls/syscall.tbl
@@ -499,3 +499,4 @@
 451	common	cachestat			sys_cachestat
 452	common	fchmodat2			sys_fchmodat2
 454	common	futex_wake			sys_futex_wake
+455	common	futex_wait			sys_futex_wait
diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl
index 9e81323979b0..0f6616822bd5 100644
--- a/arch/x86/entry/syscalls/syscall_32.tbl
+++ b/arch/x86/entry/syscalls/syscall_32.tbl
@@ -458,3 +458,4 @@
 451	i386	cachestat		sys_cachestat
 452	i386	fchmodat2		sys_fchmodat2
 454	i386	futex_wake		sys_futex_wake
+455	i386	futex_wait		sys_futex_wait
diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl
index d10a6003a7c9..ddf6288823ad 100644
--- a/arch/x86/entry/syscalls/syscall_64.tbl
+++ b/arch/x86/entry/syscalls/syscall_64.tbl
@@ -376,6 +376,7 @@
 452	common	fchmodat2		sys_fchmodat2
 453	64	map_shadow_stack	sys_map_shadow_stack
 454	common	futex_wake		sys_futex_wake
+455	common	futex_wait		sys_futex_wait
 
 #
 # Due to a historical design error, certain syscalls are numbered differently
diff --git a/arch/xtensa/kernel/syscalls/syscall.tbl b/arch/xtensa/kernel/syscalls/syscall.tbl
index 4e511bfd4b8f..ac278dbce2ee 100644
--- a/arch/xtensa/kernel/syscalls/syscall.tbl
+++ b/arch/xtensa/kernel/syscalls/syscall.tbl
@@ -424,3 +424,4 @@
 451	common	cachestat			sys_cachestat
 452	common	fchmodat2			sys_fchmodat2
 454	common	futex_wake			sys_futex_wake
+455	common	futex_wait			sys_futex_wait
diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index e174ed86da1d..11f3fdd1ee03 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -552,6 +552,10 @@ asmlinkage long sys_futex_waitv(struct futex_waitv *waiters,
 
 asmlinkage long sys_futex_wake(void __user *uaddr, unsigned long mask, int nr, unsigned int flags);
 
+asmlinkage long sys_futex_wait(void __user *uaddr, unsigned long val, unsigned long mask,
+			       unsigned int flags, struct __kernel_timespec __user *timespec,
+			       clockid_t clockid);
+
 asmlinkage long sys_nanosleep(struct __kernel_timespec __user *rqtp,
 			      struct __kernel_timespec __user *rmtp);
 asmlinkage long sys_nanosleep_time32(struct old_timespec32 __user *rqtp,
diff --git a/include/uapi/asm-generic/unistd.h b/include/uapi/asm-generic/unistd.h
index f5454e6f4c6f..f6553bd5d213 100644
--- a/include/uapi/asm-generic/unistd.h
+++ b/include/uapi/asm-generic/unistd.h
@@ -824,9 +824,11 @@ __SYSCALL(__NR_cachestat, sys_cachestat)
 __SYSCALL(__NR_fchmodat2, sys_fchmodat2)
 #define __NR_futex_wake 454
 __SYSCALL(__NR_futex_wake, sys_futex_wake)
+#define __NR_futex_wait 455
+__SYSCALL(__NR_futex_wait, sys_futex_wait)
 
 #undef __NR_syscalls
-#define __NR_syscalls 455
+#define __NR_syscalls 456
 
 /*
  * 32 bit systems traditionally used different
diff --git a/kernel/futex/futex.h b/kernel/futex/futex.h
index 0e7821a944a2..e74888a7d71d 100644
--- a/kernel/futex/futex.h
+++ b/kernel/futex/futex.h
@@ -332,6 +332,9 @@ extern int futex_requeue(u32 __user *uaddr1, unsigned int flags,
 			 u32 __user *uaddr2, int nr_wake, int nr_requeue,
 			 u32 *cmpval, int requeue_pi);
 
+extern int __futex_wait(u32 __user *uaddr, unsigned int flags, u32 val,
+			struct hrtimer_sleeper *to, u32 bitset);
+
 extern int futex_wait(u32 __user *uaddr, unsigned int flags, u32 val,
 		      ktime_t *abs_time, u32 bitset);
 
diff --git a/kernel/futex/syscalls.c b/kernel/futex/syscalls.c
index 47398926765e..e4c8ec713787 100644
--- a/kernel/futex/syscalls.c
+++ b/kernel/futex/syscalls.c
@@ -221,6 +221,46 @@ static int futex_parse_waitv(struct futex_vector *futexv,
 	return 0;
 }
 
+static int futex2_setup_timeout(struct __kernel_timespec __user *timeout,
+				clockid_t clockid, struct hrtimer_sleeper *to)
+{
+	int flag_clkid = 0, flag_init = 0;
+	struct timespec64 ts;
+	ktime_t time;
+	int ret;
+
+	if (!timeout)
+		return 0;
+
+	if (clockid == CLOCK_REALTIME) {
+		flag_clkid = FLAGS_CLOCKRT;
+		flag_init = FUTEX_CLOCK_REALTIME;
+	}
+
+	if (clockid != CLOCK_REALTIME && clockid != CLOCK_MONOTONIC)
+		return -EINVAL;
+
+	if (get_timespec64(&ts, timeout))
+		return -EFAULT;
+
+	/*
+	 * Since there's no opcode for futex_waitv, use
+	 * FUTEX_WAIT_BITSET that uses absolute timeout as well
+	 */
+	ret = futex_init_timeout(FUTEX_WAIT_BITSET, flag_init, &ts, &time);
+	if (ret)
+		return ret;
+
+	futex_setup_timer(&time, to, flag_clkid, 0);
+	return 0;
+}
+
+static inline void futex2_destroy_timeout(struct hrtimer_sleeper *to)
+{
+	hrtimer_cancel(&to->timer);
+	destroy_hrtimer_on_stack(&to->timer);
+}
+
 /**
  * sys_futex_waitv - Wait on a list of futexes
  * @waiters:    List of futexes to wait on
@@ -250,8 +290,6 @@ SYSCALL_DEFINE5(futex_waitv, struct futex_waitv __user *, waiters,
 {
 	struct hrtimer_sleeper to;
 	struct futex_vector *futexv;
-	struct timespec64 ts;
-	ktime_t time;
 	int ret;
 
 	/* This syscall supports no flags for now */
@@ -261,30 +299,8 @@ SYSCALL_DEFINE5(futex_waitv, struct futex_waitv __user *, waiters,
 	if (!nr_futexes || nr_futexes > FUTEX_WAITV_MAX || !waiters)
 		return -EINVAL;
 
-	if (timeout) {
-		int flag_clkid = 0, flag_init = 0;
-
-		if (clockid == CLOCK_REALTIME) {
-			flag_clkid = FLAGS_CLOCKRT;
-			flag_init = FUTEX_CLOCK_REALTIME;
-		}
-
-		if (clockid != CLOCK_REALTIME && clockid != CLOCK_MONOTONIC)
-			return -EINVAL;
-
-		if (get_timespec64(&ts, timeout))
-			return -EFAULT;
-
-		/*
-		 * Since there's no opcode for futex_waitv, use
-		 * FUTEX_WAIT_BITSET that uses absolute timeout as well
-		 */
-		ret = futex_init_timeout(FUTEX_WAIT_BITSET, flag_init, &ts, &time);
-		if (ret)
-			return ret;
-
-		futex_setup_timer(&time, &to, flag_clkid, 0);
-	}
+	if (timeout && (ret = futex2_setup_timeout(timeout, clockid, &to)))
+		return ret;
 
 	futexv = kcalloc(nr_futexes, sizeof(*futexv), GFP_KERNEL);
 	if (!futexv) {
@@ -299,10 +315,8 @@ SYSCALL_DEFINE5(futex_waitv, struct futex_waitv __user *, waiters,
 	kfree(futexv);
 
 destroy_timer:
-	if (timeout) {
-		hrtimer_cancel(&to.timer);
-		destroy_hrtimer_on_stack(&to.timer);
-	}
+	if (timeout)
+		futex2_destroy_timeout(&to);
 	return ret;
 }
 
@@ -336,6 +350,52 @@ SYSCALL_DEFINE4(futex_wake,
 	return futex_wake(uaddr, FLAGS_STRICT | flags, nr, mask);
 }
 
+/*
+ * sys_futex_wait - Wait on a futex
+ * @uaddr:	Address of the futex to wait on
+ * @val:	Value of @uaddr
+ * @mask:	bitmask
+ * @flags:	FUTEX2 flags
+ * @timeout:	Optional absolute timeout
+ * @clockid:	Clock to be used for the timeout, realtime or monotonic
+ *
+ * Identical to the traditional FUTEX_WAIT_BITSET op, except it is part of the
+ * futex2 familiy of calls.
+ */
+
+SYSCALL_DEFINE6(futex_wait,
+		void __user *, uaddr,
+		unsigned long, val,
+		unsigned long, mask,
+		unsigned int, flags,
+		struct __kernel_timespec __user *, timeout,
+		clockid_t, clockid)
+{
+	struct hrtimer_sleeper to;
+	int ret;
+
+	if (flags & ~FUTEX2_VALID_MASK)
+		return -EINVAL;
+
+	flags = futex2_to_flags(flags);
+	if (!futex_flags_valid(flags))
+		return -EINVAL;
+
+	if (!futex_validate_input(flags, val) ||
+	    !futex_validate_input(flags, mask))
+		return -EINVAL;
+
+	if (timeout && (ret = futex2_setup_timeout(timeout, clockid, &to)))
+		return ret;
+
+	ret = __futex_wait(uaddr, flags, val, timeout ? &to : NULL, mask);
+
+	if (timeout)
+		futex2_destroy_timeout(&to);
+
+	return ret;
+}
+
 #ifdef CONFIG_COMPAT
 COMPAT_SYSCALL_DEFINE2(set_robust_list,
 		struct compat_robust_list_head __user *, head,
diff --git a/kernel/futex/waitwake.c b/kernel/futex/waitwake.c
index ceb05b876597..b109a0810a2c 100644
--- a/kernel/futex/waitwake.c
+++ b/kernel/futex/waitwake.c
@@ -632,20 +632,18 @@ retry_private:
 	return ret;
 }
 
-int futex_wait(u32 __user *uaddr, unsigned int flags, u32 val, ktime_t *abs_time, u32 bitset)
+int __futex_wait(u32 __user *uaddr, unsigned int flags, u32 val,
+		 struct hrtimer_sleeper *to, u32 bitset)
 {
-	struct hrtimer_sleeper timeout, *to;
-	struct restart_block *restart;
-	struct futex_hash_bucket *hb;
 	struct futex_q q = futex_q_init;
+	struct futex_hash_bucket *hb;
 	int ret;
 
 	if (!bitset)
 		return -EINVAL;
+
 	q.bitset = bitset;
 
-	to = futex_setup_timer(abs_time, &timeout, flags,
-			       current->timer_slack_ns);
 retry:
 	/*
 	 * Prepare to wait on uaddr. On success, it holds hb->lock and q
@@ -653,18 +651,17 @@ retry:
 	 */
 	ret = futex_wait_setup(uaddr, val, flags, &q, &hb);
 	if (ret)
-		goto out;
+		return ret;
 
 	/* futex_queue and wait for wakeup, timeout, or a signal. */
 	futex_wait_queue(hb, &q, to);
 
 	/* If we were woken (and unqueued), we succeeded, whatever. */
-	ret = 0;
 	if (!futex_unqueue(&q))
-		goto out;
-	ret = -ETIMEDOUT;
+		return 0;
+
 	if (to && !to->task)
-		goto out;
+		return -ETIMEDOUT;
 
 	/*
 	 * We expect signal_pending(current), but we might be the
@@ -673,24 +670,38 @@ retry:
 	if (!signal_pending(current))
 		goto retry;
 
-	ret = -ERESTARTSYS;
-	if (!abs_time)
-		goto out;
+	return -ERESTARTSYS;
+}
 
-	restart = &current->restart_block;
-	restart->futex.uaddr = uaddr;
-	restart->futex.val = val;
-	restart->futex.time = *abs_time;
-	restart->futex.bitset = bitset;
-	restart->futex.flags = flags | FLAGS_HAS_TIMEOUT;
+int futex_wait(u32 __user *uaddr, unsigned int flags, u32 val, ktime_t *abs_time, u32 bitset)
+{
+	struct hrtimer_sleeper timeout, *to;
+	struct restart_block *restart;
+	int ret;
+
+	to = futex_setup_timer(abs_time, &timeout, flags,
+			       current->timer_slack_ns);
+
+	ret = __futex_wait(uaddr, flags, val, to, bitset);
+
+	/* No timeout, nothing to clean up. */
+	if (!to)
+		return ret;
+
+	hrtimer_cancel(&to->timer);
+	destroy_hrtimer_on_stack(&to->timer);
 
-	ret = set_restart_fn(restart, futex_wait_restart);
+	if (ret == -ERESTARTSYS) {
+		restart = &current->restart_block;
+		restart->futex.uaddr = uaddr;
+		restart->futex.val = val;
+		restart->futex.time = *abs_time;
+		restart->futex.bitset = bitset;
+		restart->futex.flags = flags | FLAGS_HAS_TIMEOUT;
 
-out:
-	if (to) {
-		hrtimer_cancel(&to->timer);
-		destroy_hrtimer_on_stack(&to->timer);
+		return set_restart_fn(restart, futex_wait_restart);
 	}
+
 	return ret;
 }
 
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index 983c0583c627..13df391194e2 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -88,6 +88,7 @@ COND_SYSCALL(get_robust_list);
 COND_SYSCALL_COMPAT(get_robust_list);
 COND_SYSCALL(futex_waitv);
 COND_SYSCALL(futex_wake);
+COND_SYSCALL(futex_wait);
 COND_SYSCALL(kexec_load);
 COND_SYSCALL_COMPAT(kexec_load);
 COND_SYSCALL(init_module);
-- 
cgit v1.2.3


From 0f4b5f972216782a4acb1ae00dcb55173847c2ff Mon Sep 17 00:00:00 2001
From: "peterz@infradead.org" <peterz@infradead.org>
Date: Thu, 21 Sep 2023 12:45:15 +0200
Subject: futex: Add sys_futex_requeue()

Finish off the 'simple' futex2 syscall group by adding
sys_futex_requeue(). Unlike sys_futex_{wait,wake}() its arguments are
too numerous to fit into a regular syscall. As such, use struct
futex_waitv to pass the 'source' and 'destination' futexes to the
syscall.

This syscall implements what was previously known as FUTEX_CMP_REQUEUE
and uses {val, uaddr, flags} for source and {uaddr, flags} for
destination.

This design explicitly allows requeueing between different types of
futex by having a different flags word per uaddr.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Thomas Gleixner <tglx@linutronix.de>
Acked-by: Geert Uytterhoeven <geert@linux-m68k.org>
Link: https://lore.kernel.org/r/20230921105248.511860556@noisy.programming.kicks-ass.net
---
 arch/alpha/kernel/syscalls/syscall.tbl      |  1 +
 arch/arm/tools/syscall.tbl                  |  1 +
 arch/arm64/include/asm/unistd.h             |  2 +-
 arch/arm64/include/asm/unistd32.h           |  2 ++
 arch/ia64/kernel/syscalls/syscall.tbl       |  1 +
 arch/m68k/kernel/syscalls/syscall.tbl       |  1 +
 arch/microblaze/kernel/syscalls/syscall.tbl |  1 +
 arch/mips/kernel/syscalls/syscall_n32.tbl   |  1 +
 arch/mips/kernel/syscalls/syscall_n64.tbl   |  1 +
 arch/mips/kernel/syscalls/syscall_o32.tbl   |  1 +
 arch/parisc/kernel/syscalls/syscall.tbl     |  1 +
 arch/powerpc/kernel/syscalls/syscall.tbl    |  1 +
 arch/s390/kernel/syscalls/syscall.tbl       |  1 +
 arch/sh/kernel/syscalls/syscall.tbl         |  1 +
 arch/sparc/kernel/syscalls/syscall.tbl      |  1 +
 arch/x86/entry/syscalls/syscall_32.tbl      |  1 +
 arch/x86/entry/syscalls/syscall_64.tbl      |  1 +
 arch/xtensa/kernel/syscalls/syscall.tbl     |  1 +
 include/linux/syscalls.h                    |  3 +++
 include/uapi/asm-generic/unistd.h           |  4 ++-
 kernel/futex/syscalls.c                     | 38 +++++++++++++++++++++++++++++
 kernel/sys_ni.c                             |  1 +
 22 files changed, 64 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/arch/alpha/kernel/syscalls/syscall.tbl b/arch/alpha/kernel/syscalls/syscall.tbl
index c49f12fd264e..b1865f9bb31e 100644
--- a/arch/alpha/kernel/syscalls/syscall.tbl
+++ b/arch/alpha/kernel/syscalls/syscall.tbl
@@ -494,3 +494,4 @@
 562	common	fchmodat2			sys_fchmodat2
 563	common	futex_wake			sys_futex_wake
 564	common	futex_wait			sys_futex_wait
+565	common	futex_requeue			sys_futex_requeue
diff --git a/arch/arm/tools/syscall.tbl b/arch/arm/tools/syscall.tbl
index a6cf56277327..93d0d46cbb15 100644
--- a/arch/arm/tools/syscall.tbl
+++ b/arch/arm/tools/syscall.tbl
@@ -468,3 +468,4 @@
 452	common	fchmodat2			sys_fchmodat2
 454	common	futex_wake			sys_futex_wake
 455	common	futex_wait			sys_futex_wait
+456	common	futex_requeue			sys_futex_requeue
diff --git a/arch/arm64/include/asm/unistd.h b/arch/arm64/include/asm/unistd.h
index f33190f17ebb..531effca5f1f 100644
--- a/arch/arm64/include/asm/unistd.h
+++ b/arch/arm64/include/asm/unistd.h
@@ -39,7 +39,7 @@
 #define __ARM_NR_compat_set_tls		(__ARM_NR_COMPAT_BASE + 5)
 #define __ARM_NR_COMPAT_END		(__ARM_NR_COMPAT_BASE + 0x800)
 
-#define __NR_compat_syscalls		456
+#define __NR_compat_syscalls		457
 #endif
 
 #define __ARCH_WANT_SYS_CLONE
diff --git a/arch/arm64/include/asm/unistd32.h b/arch/arm64/include/asm/unistd32.h
index 6e7d37282ba1..c453291154fd 100644
--- a/arch/arm64/include/asm/unistd32.h
+++ b/arch/arm64/include/asm/unistd32.h
@@ -915,6 +915,8 @@ __SYSCALL(__NR_fchmodat2, sys_fchmodat2)
 __SYSCALL(__NR_futex_wake, sys_futex_wake)
 #define __NR_futex_wait 455
 __SYSCALL(__NR_futex_wait, sys_futex_wait)
+#define __NR_futex_requeue 456
+__SYSCALL(__NR_futex_requeue, sys_futex_requeue)
 
 /*
  * Please add new compat syscalls above this comment and update
diff --git a/arch/ia64/kernel/syscalls/syscall.tbl b/arch/ia64/kernel/syscalls/syscall.tbl
index 4043f0c55170..81375ea78288 100644
--- a/arch/ia64/kernel/syscalls/syscall.tbl
+++ b/arch/ia64/kernel/syscalls/syscall.tbl
@@ -375,3 +375,4 @@
 452	common	fchmodat2			sys_fchmodat2
 454	common	futex_wake			sys_futex_wake
 455	common	futex_wait			sys_futex_wait
+456	common	futex_requeue			sys_futex_requeue
diff --git a/arch/m68k/kernel/syscalls/syscall.tbl b/arch/m68k/kernel/syscalls/syscall.tbl
index 24841674acc5..f7f997a88bab 100644
--- a/arch/m68k/kernel/syscalls/syscall.tbl
+++ b/arch/m68k/kernel/syscalls/syscall.tbl
@@ -454,3 +454,4 @@
 452	common	fchmodat2			sys_fchmodat2
 454	common	futex_wake			sys_futex_wake
 455	common	futex_wait			sys_futex_wait
+456	common	futex_requeue			sys_futex_requeue
diff --git a/arch/microblaze/kernel/syscalls/syscall.tbl b/arch/microblaze/kernel/syscalls/syscall.tbl
index f03927ab0220..2967ec26b978 100644
--- a/arch/microblaze/kernel/syscalls/syscall.tbl
+++ b/arch/microblaze/kernel/syscalls/syscall.tbl
@@ -460,3 +460,4 @@
 452	common	fchmodat2			sys_fchmodat2
 454	common	futex_wake			sys_futex_wake
 455	common	futex_wait			sys_futex_wait
+456	common	futex_requeue			sys_futex_requeue
diff --git a/arch/mips/kernel/syscalls/syscall_n32.tbl b/arch/mips/kernel/syscalls/syscall_n32.tbl
index dbb5edfb667b..383abb1713f4 100644
--- a/arch/mips/kernel/syscalls/syscall_n32.tbl
+++ b/arch/mips/kernel/syscalls/syscall_n32.tbl
@@ -393,3 +393,4 @@
 452	n32	fchmodat2			sys_fchmodat2
 454	n32	futex_wake			sys_futex_wake
 455	n32	futex_wait			sys_futex_wait
+456	n32	futex_requeue			sys_futex_requeue
diff --git a/arch/mips/kernel/syscalls/syscall_n64.tbl b/arch/mips/kernel/syscalls/syscall_n64.tbl
index faff8dfd2983..c9bd09ba905f 100644
--- a/arch/mips/kernel/syscalls/syscall_n64.tbl
+++ b/arch/mips/kernel/syscalls/syscall_n64.tbl
@@ -369,3 +369,4 @@
 452	n64	fchmodat2			sys_fchmodat2
 454	n64	futex_wake			sys_futex_wake
 455	n64	futex_wait			sys_futex_wait
+456	n64	futex_requeue			sys_futex_requeue
diff --git a/arch/mips/kernel/syscalls/syscall_o32.tbl b/arch/mips/kernel/syscalls/syscall_o32.tbl
index 542f75605b3e..ba5ef6cea97a 100644
--- a/arch/mips/kernel/syscalls/syscall_o32.tbl
+++ b/arch/mips/kernel/syscalls/syscall_o32.tbl
@@ -442,3 +442,4 @@
 452	o32	fchmodat2			sys_fchmodat2
 454	o32	futex_wake			sys_futex_wake
 455	o32	futex_wait			sys_futex_wait
+456	o32	futex_requeue			sys_futex_requeue
diff --git a/arch/parisc/kernel/syscalls/syscall.tbl b/arch/parisc/kernel/syscalls/syscall.tbl
index 8e50e89551f7..9f0f6df55361 100644
--- a/arch/parisc/kernel/syscalls/syscall.tbl
+++ b/arch/parisc/kernel/syscalls/syscall.tbl
@@ -453,3 +453,4 @@
 452	common	fchmodat2			sys_fchmodat2
 454	common	futex_wake			sys_futex_wake
 455	common	futex_wait			sys_futex_wait
+456	common	futex_requeue			sys_futex_requeue
diff --git a/arch/powerpc/kernel/syscalls/syscall.tbl b/arch/powerpc/kernel/syscalls/syscall.tbl
index ad33a9993a6a..26fc41904266 100644
--- a/arch/powerpc/kernel/syscalls/syscall.tbl
+++ b/arch/powerpc/kernel/syscalls/syscall.tbl
@@ -541,3 +541,4 @@
 452	common	fchmodat2			sys_fchmodat2
 454	common	futex_wake			sys_futex_wake
 455	common	futex_wait			sys_futex_wait
+456	common	futex_requeue			sys_futex_requeue
diff --git a/arch/s390/kernel/syscalls/syscall.tbl b/arch/s390/kernel/syscalls/syscall.tbl
index 418853fd2a6b..31be90b241f7 100644
--- a/arch/s390/kernel/syscalls/syscall.tbl
+++ b/arch/s390/kernel/syscalls/syscall.tbl
@@ -457,3 +457,4 @@
 452  common	fchmodat2		sys_fchmodat2			sys_fchmodat2
 454  common	futex_wake		sys_futex_wake			sys_futex_wake
 455  common	futex_wait		sys_futex_wait			sys_futex_wait
+456  common	futex_requeue		sys_futex_requeue			sys_futex_requeue
diff --git a/arch/sh/kernel/syscalls/syscall.tbl b/arch/sh/kernel/syscalls/syscall.tbl
index 8ef9557d2779..4bc5d488ab17 100644
--- a/arch/sh/kernel/syscalls/syscall.tbl
+++ b/arch/sh/kernel/syscalls/syscall.tbl
@@ -457,3 +457,4 @@
 452	common	fchmodat2			sys_fchmodat2
 454	common	futex_wake			sys_futex_wake
 455	common	futex_wait			sys_futex_wait
+456	common	futex_requeue			sys_futex_requeue
diff --git a/arch/sparc/kernel/syscalls/syscall.tbl b/arch/sparc/kernel/syscalls/syscall.tbl
index df59a9d5f109..8404c8e50394 100644
--- a/arch/sparc/kernel/syscalls/syscall.tbl
+++ b/arch/sparc/kernel/syscalls/syscall.tbl
@@ -500,3 +500,4 @@
 452	common	fchmodat2			sys_fchmodat2
 454	common	futex_wake			sys_futex_wake
 455	common	futex_wait			sys_futex_wait
+456	common	futex_requeue			sys_futex_requeue
diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl
index 0f6616822bd5..31c48bc2c3d8 100644
--- a/arch/x86/entry/syscalls/syscall_32.tbl
+++ b/arch/x86/entry/syscalls/syscall_32.tbl
@@ -459,3 +459,4 @@
 452	i386	fchmodat2		sys_fchmodat2
 454	i386	futex_wake		sys_futex_wake
 455	i386	futex_wait		sys_futex_wait
+456	i386	futex_requeue		sys_futex_requeue
diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl
index ddf6288823ad..a577bb27c16d 100644
--- a/arch/x86/entry/syscalls/syscall_64.tbl
+++ b/arch/x86/entry/syscalls/syscall_64.tbl
@@ -377,6 +377,7 @@
 453	64	map_shadow_stack	sys_map_shadow_stack
 454	common	futex_wake		sys_futex_wake
 455	common	futex_wait		sys_futex_wait
+456	common	futex_requeue		sys_futex_requeue
 
 #
 # Due to a historical design error, certain syscalls are numbered differently
diff --git a/arch/xtensa/kernel/syscalls/syscall.tbl b/arch/xtensa/kernel/syscalls/syscall.tbl
index ac278dbce2ee..dd71ecce8b86 100644
--- a/arch/xtensa/kernel/syscalls/syscall.tbl
+++ b/arch/xtensa/kernel/syscalls/syscall.tbl
@@ -425,3 +425,4 @@
 452	common	fchmodat2			sys_fchmodat2
 454	common	futex_wake			sys_futex_wake
 455	common	futex_wait			sys_futex_wait
+456	common	futex_requeue			sys_futex_requeue
diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index 11f3fdd1ee03..0901af60d971 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -556,6 +556,9 @@ asmlinkage long sys_futex_wait(void __user *uaddr, unsigned long val, unsigned l
 			       unsigned int flags, struct __kernel_timespec __user *timespec,
 			       clockid_t clockid);
 
+asmlinkage long sys_futex_requeue(struct futex_waitv __user *waiters,
+				  unsigned int flags, int nr_wake, int nr_requeue);
+
 asmlinkage long sys_nanosleep(struct __kernel_timespec __user *rqtp,
 			      struct __kernel_timespec __user *rmtp);
 asmlinkage long sys_nanosleep_time32(struct old_timespec32 __user *rqtp,
diff --git a/include/uapi/asm-generic/unistd.h b/include/uapi/asm-generic/unistd.h
index f6553bd5d213..d9e9cd13e577 100644
--- a/include/uapi/asm-generic/unistd.h
+++ b/include/uapi/asm-generic/unistd.h
@@ -826,9 +826,11 @@ __SYSCALL(__NR_fchmodat2, sys_fchmodat2)
 __SYSCALL(__NR_futex_wake, sys_futex_wake)
 #define __NR_futex_wait 455
 __SYSCALL(__NR_futex_wait, sys_futex_wait)
+#define __NR_futex_requeue 456
+__SYSCALL(__NR_futex_requeue, sys_futex_requeue)
 
 #undef __NR_syscalls
-#define __NR_syscalls 456
+#define __NR_syscalls 457
 
 /*
  * 32 bit systems traditionally used different
diff --git a/kernel/futex/syscalls.c b/kernel/futex/syscalls.c
index dde9b74db9af..8200d86d30e1 100644
--- a/kernel/futex/syscalls.c
+++ b/kernel/futex/syscalls.c
@@ -396,6 +396,44 @@ SYSCALL_DEFINE6(futex_wait,
 	return ret;
 }
 
+/*
+ * sys_futex_requeue - Requeue a waiter from one futex to another
+ * @waiters:	array describing the source and destination futex
+ * @flags:	unused
+ * @nr_wake:	number of futexes to wake
+ * @nr_requeue:	number of futexes to requeue
+ *
+ * Identical to the traditional FUTEX_CMP_REQUEUE op, except it is part of the
+ * futex2 family of calls.
+ */
+
+SYSCALL_DEFINE4(futex_requeue,
+		struct futex_waitv __user *, waiters,
+		unsigned int, flags,
+		int, nr_wake,
+		int, nr_requeue)
+{
+	struct futex_vector futexes[2];
+	u32 cmpval;
+	int ret;
+
+	if (flags)
+		return -EINVAL;
+
+	if (!waiters)
+		return -EINVAL;
+
+	ret = futex_parse_waitv(futexes, waiters, 2);
+	if (ret)
+		return ret;
+
+	cmpval = futexes[0].w.val;
+
+	return futex_requeue(u64_to_user_ptr(futexes[0].w.uaddr), futexes[0].w.flags,
+			     u64_to_user_ptr(futexes[1].w.uaddr), futexes[1].w.flags,
+			     nr_wake, nr_requeue, &cmpval, 0);
+}
+
 #ifdef CONFIG_COMPAT
 COMPAT_SYSCALL_DEFINE2(set_robust_list,
 		struct compat_robust_list_head __user *, head,
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index 13df391194e2..9db51ea373b0 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -89,6 +89,7 @@ COND_SYSCALL_COMPAT(get_robust_list);
 COND_SYSCALL(futex_waitv);
 COND_SYSCALL(futex_wake);
 COND_SYSCALL(futex_wait);
+COND_SYSCALL(futex_requeue);
 COND_SYSCALL(kexec_load);
 COND_SYSCALL_COMPAT(kexec_load);
 COND_SYSCALL(init_module);
-- 
cgit v1.2.3


From f31ecf671ddc498f20219453395794ff2383e06b Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Mon, 10 Jul 2023 16:14:37 -0600
Subject: io_uring: add IORING_OP_WAITID support

This adds support for an async version of waitid(2), in a fully async
version. If an event isn't immediately available, wait for a callback
to trigger a retry.

The format of the sqe is as follows:

sqe->len		The 'which', the idtype being queried/waited for.
sqe->fd			The 'pid' (or id) being waited for.
sqe->file_index		The 'options' being set.
sqe->addr2		A pointer to siginfo_t, if any, being filled in.

buf_index, add3, and waitid_flags are reserved/unused for now.
waitid_flags will be used for options for this request type. One
interesting use case may be to add multi-shot support, so that the
request stays armed and posts a notification every time a monitored
process state change occurs.

Note that this does not support rusage, on Arnd's recommendation.

See the waitid(2) man page for details on the arguments.

Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/io_uring_types.h |   2 +
 include/uapi/linux/io_uring.h  |   2 +
 io_uring/Makefile              |   3 +-
 io_uring/cancel.c              |   5 +
 io_uring/io_uring.c            |   3 +
 io_uring/opdef.c               |   9 +
 io_uring/waitid.c              | 372 +++++++++++++++++++++++++++++++++++++++++
 io_uring/waitid.h              |  15 ++
 8 files changed, 410 insertions(+), 1 deletion(-)
 create mode 100644 io_uring/waitid.c
 create mode 100644 io_uring/waitid.h

(limited to 'include/linux')

diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h
index 13d19b9be9f4..fe1c5d4ec56c 100644
--- a/include/linux/io_uring_types.h
+++ b/include/linux/io_uring_types.h
@@ -313,6 +313,8 @@ struct io_ring_ctx {
 	struct list_head	cq_overflow_list;
 	struct io_hash_table	cancel_table;
 
+	struct hlist_head	waitid_list;
+
 	const struct cred	*sq_creds;	/* cred used for __io_sq_thread() */
 	struct io_sq_data	*sq_data;	/* if using sq thread polling */
 
diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
index d127948b0d8a..683ac2b74721 100644
--- a/include/uapi/linux/io_uring.h
+++ b/include/uapi/linux/io_uring.h
@@ -65,6 +65,7 @@ struct io_uring_sqe {
 		__u32		xattr_flags;
 		__u32		msg_ring_flags;
 		__u32		uring_cmd_flags;
+		__u32		waitid_flags;
 	};
 	__u64	user_data;	/* data to be passed back at completion time */
 	/* pack this to avoid bogus arm OABI complaints */
@@ -241,6 +242,7 @@ enum io_uring_op {
 	IORING_OP_SEND_ZC,
 	IORING_OP_SENDMSG_ZC,
 	IORING_OP_READ_MULTISHOT,
+	IORING_OP_WAITID,
 
 	/* this goes last, obviously */
 	IORING_OP_LAST,
diff --git a/io_uring/Makefile b/io_uring/Makefile
index 8cc8e5387a75..7bd64e442567 100644
--- a/io_uring/Makefile
+++ b/io_uring/Makefile
@@ -7,5 +7,6 @@ obj-$(CONFIG_IO_URING)		+= io_uring.o xattr.o nop.o fs.o splice.o \
 					openclose.o uring_cmd.o epoll.o \
 					statx.o net.o msg_ring.o timeout.o \
 					sqpoll.o fdinfo.o tctx.o poll.o \
-					cancel.o kbuf.o rsrc.o rw.o opdef.o notif.o
+					cancel.o kbuf.o rsrc.o rw.o opdef.o \
+					notif.o waitid.o
 obj-$(CONFIG_IO_WQ)		+= io-wq.o
diff --git a/io_uring/cancel.c b/io_uring/cancel.c
index 7b23607cf4af..eb77a51c5a79 100644
--- a/io_uring/cancel.c
+++ b/io_uring/cancel.c
@@ -15,6 +15,7 @@
 #include "tctx.h"
 #include "poll.h"
 #include "timeout.h"
+#include "waitid.h"
 #include "cancel.h"
 
 struct io_cancel {
@@ -119,6 +120,10 @@ int io_try_cancel(struct io_uring_task *tctx, struct io_cancel_data *cd,
 	if (ret != -ENOENT)
 		return ret;
 
+	ret = io_waitid_cancel(ctx, cd, issue_flags);
+	if (ret != -ENOENT)
+		return ret;
+
 	spin_lock(&ctx->completion_lock);
 	if (!(cd->flags & IORING_ASYNC_CANCEL_FD))
 		ret = io_timeout_cancel(ctx, cd);
diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index 783ed0fff71b..2dff4772bf14 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -92,6 +92,7 @@
 #include "cancel.h"
 #include "net.h"
 #include "notif.h"
+#include "waitid.h"
 
 #include "timeout.h"
 #include "poll.h"
@@ -348,6 +349,7 @@ static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
 	INIT_LIST_HEAD(&ctx->tctx_list);
 	ctx->submit_state.free_list.next = NULL;
 	INIT_WQ_LIST(&ctx->locked_free_list);
+	INIT_HLIST_HEAD(&ctx->waitid_list);
 	INIT_DELAYED_WORK(&ctx->fallback_work, io_fallback_req_func);
 	INIT_WQ_LIST(&ctx->submit_state.compl_reqs);
 	return ctx;
@@ -3303,6 +3305,7 @@ static __cold bool io_uring_try_cancel_requests(struct io_ring_ctx *ctx,
 	ret |= io_cancel_defer_files(ctx, task, cancel_all);
 	mutex_lock(&ctx->uring_lock);
 	ret |= io_poll_remove_all(ctx, task, cancel_all);
+	ret |= io_waitid_remove_all(ctx, task, cancel_all);
 	mutex_unlock(&ctx->uring_lock);
 	ret |= io_kill_timeouts(ctx, task, cancel_all);
 	if (task)
diff --git a/io_uring/opdef.c b/io_uring/opdef.c
index a3fb1f9b3998..aadcbf7136b0 100644
--- a/io_uring/opdef.c
+++ b/io_uring/opdef.c
@@ -33,6 +33,7 @@
 #include "poll.h"
 #include "cancel.h"
 #include "rw.h"
+#include "waitid.h"
 
 static int io_no_issue(struct io_kiocb *req, unsigned int issue_flags)
 {
@@ -439,6 +440,10 @@ const struct io_issue_def io_issue_defs[] = {
 		.prep			= io_read_mshot_prep,
 		.issue			= io_read_mshot,
 	},
+	[IORING_OP_WAITID] = {
+		.prep			= io_waitid_prep,
+		.issue			= io_waitid,
+	},
 };
 
 const struct io_cold_def io_cold_defs[] = {
@@ -661,6 +666,10 @@ const struct io_cold_def io_cold_defs[] = {
 	[IORING_OP_READ_MULTISHOT] = {
 		.name			= "READ_MULTISHOT",
 	},
+	[IORING_OP_WAITID] = {
+		.name			= "WAITID",
+		.async_size		= sizeof(struct io_waitid_async),
+	},
 };
 
 const char *io_uring_get_opcode(u8 opcode)
diff --git a/io_uring/waitid.c b/io_uring/waitid.c
new file mode 100644
index 000000000000..6f851978606d
--- /dev/null
+++ b/io_uring/waitid.c
@@ -0,0 +1,372 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Support for async notification of waitid
+ */
+#include <linux/kernel.h>
+#include <linux/errno.h>
+#include <linux/fs.h>
+#include <linux/file.h>
+#include <linux/compat.h>
+#include <linux/io_uring.h>
+
+#include <uapi/linux/io_uring.h>
+
+#include "io_uring.h"
+#include "cancel.h"
+#include "waitid.h"
+#include "../kernel/exit.h"
+
+static void io_waitid_cb(struct io_kiocb *req, struct io_tw_state *ts);
+
+#define IO_WAITID_CANCEL_FLAG	BIT(31)
+#define IO_WAITID_REF_MASK	GENMASK(30, 0)
+
+struct io_waitid {
+	struct file *file;
+	int which;
+	pid_t upid;
+	int options;
+	atomic_t refs;
+	struct wait_queue_head *head;
+	struct siginfo __user *infop;
+	struct waitid_info info;
+};
+
+static void io_waitid_free(struct io_kiocb *req)
+{
+	struct io_waitid_async *iwa = req->async_data;
+
+	put_pid(iwa->wo.wo_pid);
+	kfree(req->async_data);
+	req->async_data = NULL;
+	req->flags &= ~REQ_F_ASYNC_DATA;
+}
+
+#ifdef CONFIG_COMPAT
+static bool io_waitid_compat_copy_si(struct io_waitid *iw, int signo)
+{
+	struct compat_siginfo __user *infop;
+	bool ret;
+
+	infop = (struct compat_siginfo __user *) iw->infop;
+
+	if (!user_write_access_begin(infop, sizeof(*infop)))
+		return false;
+
+	unsafe_put_user(signo, &infop->si_signo, Efault);
+	unsafe_put_user(0, &infop->si_errno, Efault);
+	unsafe_put_user(iw->info.cause, &infop->si_code, Efault);
+	unsafe_put_user(iw->info.pid, &infop->si_pid, Efault);
+	unsafe_put_user(iw->info.uid, &infop->si_uid, Efault);
+	unsafe_put_user(iw->info.status, &infop->si_status, Efault);
+	ret = true;
+done:
+	user_write_access_end();
+	return ret;
+Efault:
+	ret = false;
+	goto done;
+}
+#endif
+
+static bool io_waitid_copy_si(struct io_kiocb *req, int signo)
+{
+	struct io_waitid *iw = io_kiocb_to_cmd(req, struct io_waitid);
+	bool ret;
+
+	if (!iw->infop)
+		return true;
+
+#ifdef CONFIG_COMPAT
+	if (req->ctx->compat)
+		return io_waitid_compat_copy_si(iw, signo);
+#endif
+
+	if (!user_write_access_begin(iw->infop, sizeof(*iw->infop)))
+		return false;
+
+	unsafe_put_user(signo, &iw->infop->si_signo, Efault);
+	unsafe_put_user(0, &iw->infop->si_errno, Efault);
+	unsafe_put_user(iw->info.cause, &iw->infop->si_code, Efault);
+	unsafe_put_user(iw->info.pid, &iw->infop->si_pid, Efault);
+	unsafe_put_user(iw->info.uid, &iw->infop->si_uid, Efault);
+	unsafe_put_user(iw->info.status, &iw->infop->si_status, Efault);
+	ret = true;
+done:
+	user_write_access_end();
+	return ret;
+Efault:
+	ret = false;
+	goto done;
+}
+
+static int io_waitid_finish(struct io_kiocb *req, int ret)
+{
+	int signo = 0;
+
+	if (ret > 0) {
+		signo = SIGCHLD;
+		ret = 0;
+	}
+
+	if (!io_waitid_copy_si(req, signo))
+		ret = -EFAULT;
+	io_waitid_free(req);
+	return ret;
+}
+
+static void io_waitid_complete(struct io_kiocb *req, int ret)
+{
+	struct io_waitid *iw = io_kiocb_to_cmd(req, struct io_waitid);
+	struct io_tw_state ts = { .locked = true };
+
+	/* anyone completing better be holding a reference */
+	WARN_ON_ONCE(!(atomic_read(&iw->refs) & IO_WAITID_REF_MASK));
+
+	lockdep_assert_held(&req->ctx->uring_lock);
+
+	/*
+	 * Did cancel find it meanwhile?
+	 */
+	if (hlist_unhashed(&req->hash_node))
+		return;
+
+	hlist_del_init(&req->hash_node);
+
+	ret = io_waitid_finish(req, ret);
+	if (ret < 0)
+		req_set_fail(req);
+	io_req_set_res(req, ret, 0);
+	io_req_task_complete(req, &ts);
+}
+
+static bool __io_waitid_cancel(struct io_ring_ctx *ctx, struct io_kiocb *req)
+{
+	struct io_waitid *iw = io_kiocb_to_cmd(req, struct io_waitid);
+	struct io_waitid_async *iwa = req->async_data;
+
+	/*
+	 * Mark us canceled regardless of ownership. This will prevent a
+	 * potential retry from a spurious wakeup.
+	 */
+	atomic_or(IO_WAITID_CANCEL_FLAG, &iw->refs);
+
+	/* claim ownership */
+	if (atomic_fetch_inc(&iw->refs) & IO_WAITID_REF_MASK)
+		return false;
+
+	spin_lock_irq(&iw->head->lock);
+	list_del_init(&iwa->wo.child_wait.entry);
+	spin_unlock_irq(&iw->head->lock);
+	io_waitid_complete(req, -ECANCELED);
+	return true;
+}
+
+int io_waitid_cancel(struct io_ring_ctx *ctx, struct io_cancel_data *cd,
+		     unsigned int issue_flags)
+{
+	struct hlist_node *tmp;
+	struct io_kiocb *req;
+	int nr = 0;
+
+	if (cd->flags & (IORING_ASYNC_CANCEL_FD|IORING_ASYNC_CANCEL_FD_FIXED))
+		return -ENOENT;
+
+	io_ring_submit_lock(ctx, issue_flags);
+	hlist_for_each_entry_safe(req, tmp, &ctx->waitid_list, hash_node) {
+		if (req->cqe.user_data != cd->data &&
+		    !(cd->flags & IORING_ASYNC_CANCEL_ANY))
+			continue;
+		if (__io_waitid_cancel(ctx, req))
+			nr++;
+		if (!(cd->flags & IORING_ASYNC_CANCEL_ALL))
+			break;
+	}
+	io_ring_submit_unlock(ctx, issue_flags);
+
+	if (nr)
+		return nr;
+
+	return -ENOENT;
+}
+
+bool io_waitid_remove_all(struct io_ring_ctx *ctx, struct task_struct *task,
+			  bool cancel_all)
+{
+	struct hlist_node *tmp;
+	struct io_kiocb *req;
+	bool found = false;
+
+	lockdep_assert_held(&ctx->uring_lock);
+
+	hlist_for_each_entry_safe(req, tmp, &ctx->waitid_list, hash_node) {
+		if (!io_match_task_safe(req, task, cancel_all))
+			continue;
+		__io_waitid_cancel(ctx, req);
+		found = true;
+	}
+
+	return found;
+}
+
+static inline bool io_waitid_drop_issue_ref(struct io_kiocb *req)
+{
+	struct io_waitid *iw = io_kiocb_to_cmd(req, struct io_waitid);
+	struct io_waitid_async *iwa = req->async_data;
+
+	if (!atomic_sub_return(1, &iw->refs))
+		return false;
+
+	/*
+	 * Wakeup triggered, racing with us. It was prevented from
+	 * completing because of that, queue up the tw to do that.
+	 */
+	req->io_task_work.func = io_waitid_cb;
+	io_req_task_work_add(req);
+	remove_wait_queue(iw->head, &iwa->wo.child_wait);
+	return true;
+}
+
+static void io_waitid_cb(struct io_kiocb *req, struct io_tw_state *ts)
+{
+	struct io_waitid_async *iwa = req->async_data;
+	struct io_ring_ctx *ctx = req->ctx;
+	int ret;
+
+	io_tw_lock(ctx, ts);
+
+	ret = __do_wait(&iwa->wo);
+
+	/*
+	 * If we get -ERESTARTSYS here, we need to re-arm and check again
+	 * to ensure we get another callback. If the retry works, then we can
+	 * just remove ourselves from the waitqueue again and finish the
+	 * request.
+	 */
+	if (unlikely(ret == -ERESTARTSYS)) {
+		struct io_waitid *iw = io_kiocb_to_cmd(req, struct io_waitid);
+
+		/* Don't retry if cancel found it meanwhile */
+		ret = -ECANCELED;
+		if (!(atomic_read(&iw->refs) & IO_WAITID_CANCEL_FLAG)) {
+			iw->head = &current->signal->wait_chldexit;
+			add_wait_queue(iw->head, &iwa->wo.child_wait);
+			ret = __do_wait(&iwa->wo);
+			if (ret == -ERESTARTSYS) {
+				/* retry armed, drop our ref */
+				io_waitid_drop_issue_ref(req);
+				return;
+			}
+
+			remove_wait_queue(iw->head, &iwa->wo.child_wait);
+		}
+	}
+
+	io_waitid_complete(req, ret);
+}
+
+static int io_waitid_wait(struct wait_queue_entry *wait, unsigned mode,
+			  int sync, void *key)
+{
+	struct wait_opts *wo = container_of(wait, struct wait_opts, child_wait);
+	struct io_waitid_async *iwa = container_of(wo, struct io_waitid_async, wo);
+	struct io_kiocb *req = iwa->req;
+	struct io_waitid *iw = io_kiocb_to_cmd(req, struct io_waitid);
+	struct task_struct *p = key;
+
+	if (!pid_child_should_wake(wo, p))
+		return 0;
+
+	/* cancel is in progress */
+	if (atomic_fetch_inc(&iw->refs) & IO_WAITID_REF_MASK)
+		return 1;
+
+	req->io_task_work.func = io_waitid_cb;
+	io_req_task_work_add(req);
+	list_del_init(&wait->entry);
+	return 1;
+}
+
+int io_waitid_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
+{
+	struct io_waitid *iw = io_kiocb_to_cmd(req, struct io_waitid);
+
+	if (sqe->addr || sqe->buf_index || sqe->addr3 || sqe->waitid_flags)
+		return -EINVAL;
+
+	iw->which = READ_ONCE(sqe->len);
+	iw->upid = READ_ONCE(sqe->fd);
+	iw->options = READ_ONCE(sqe->file_index);
+	iw->infop = u64_to_user_ptr(READ_ONCE(sqe->addr2));
+	return 0;
+}
+
+int io_waitid(struct io_kiocb *req, unsigned int issue_flags)
+{
+	struct io_waitid *iw = io_kiocb_to_cmd(req, struct io_waitid);
+	struct io_ring_ctx *ctx = req->ctx;
+	struct io_waitid_async *iwa;
+	int ret;
+
+	if (io_alloc_async_data(req))
+		return -ENOMEM;
+
+	iwa = req->async_data;
+	iwa->req = req;
+
+	ret = kernel_waitid_prepare(&iwa->wo, iw->which, iw->upid, &iw->info,
+					iw->options, NULL);
+	if (ret)
+		goto done;
+
+	/*
+	 * Mark the request as busy upfront, in case we're racing with the
+	 * wakeup. If we are, then we'll notice when we drop this initial
+	 * reference again after arming.
+	 */
+	atomic_set(&iw->refs, 1);
+
+	/*
+	 * Cancel must hold the ctx lock, so there's no risk of cancelation
+	 * finding us until a) we remain on the list, and b) the lock is
+	 * dropped. We only need to worry about racing with the wakeup
+	 * callback.
+	 */
+	io_ring_submit_lock(ctx, issue_flags);
+	hlist_add_head(&req->hash_node, &ctx->waitid_list);
+
+	init_waitqueue_func_entry(&iwa->wo.child_wait, io_waitid_wait);
+	iwa->wo.child_wait.private = req->task;
+	iw->head = &current->signal->wait_chldexit;
+	add_wait_queue(iw->head, &iwa->wo.child_wait);
+
+	ret = __do_wait(&iwa->wo);
+	if (ret == -ERESTARTSYS) {
+		/*
+		 * Nobody else grabbed a reference, it'll complete when we get
+		 * a waitqueue callback, or if someone cancels it.
+		 */
+		if (!io_waitid_drop_issue_ref(req)) {
+			io_ring_submit_unlock(ctx, issue_flags);
+			return IOU_ISSUE_SKIP_COMPLETE;
+		}
+
+		/*
+		 * Wakeup triggered, racing with us. It was prevented from
+		 * completing because of that, queue up the tw to do that.
+		 */
+		io_ring_submit_unlock(ctx, issue_flags);
+		return IOU_ISSUE_SKIP_COMPLETE;
+	}
+
+	hlist_del_init(&req->hash_node);
+	remove_wait_queue(iw->head, &iwa->wo.child_wait);
+	ret = io_waitid_finish(req, ret);
+
+	io_ring_submit_unlock(ctx, issue_flags);
+done:
+	if (ret < 0)
+		req_set_fail(req);
+	io_req_set_res(req, ret, 0);
+	return IOU_OK;
+}
diff --git a/io_uring/waitid.h b/io_uring/waitid.h
new file mode 100644
index 000000000000..956a8adafe8c
--- /dev/null
+++ b/io_uring/waitid.h
@@ -0,0 +1,15 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include "../kernel/exit.h"
+
+struct io_waitid_async {
+	struct io_kiocb *req;
+	struct wait_opts wo;
+};
+
+int io_waitid_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe);
+int io_waitid(struct io_kiocb *req, unsigned int issue_flags);
+int io_waitid_cancel(struct io_ring_ctx *ctx, struct io_cancel_data *cd,
+		     unsigned int issue_flags);
+bool io_waitid_remove_all(struct io_ring_ctx *ctx, struct task_struct *task,
+			  bool cancel_all);
-- 
cgit v1.2.3


From 41e845628511878d6e89e2a9249c095e72aab7eb Mon Sep 17 00:00:00 2001
From: Mateusz Guzik <mjguzik@gmail.com>
Date: Sat, 9 Sep 2023 21:19:32 +0200
Subject: cred: add get_cred_many and put_cred_many

Some of the frequent consumers of get_cred and put_cred operate on 2
references on the same creds back-to-back.

Switch them to doing the work in one go instead.

Signed-off-by: Mateusz Guzik <mjguzik@gmail.com>
[PM: removed changelog from commit description]
Signed-off-by: Paul Moore <paul@paul-moore.com>
---
 include/linux/cred.h | 59 ++++++++++++++++++++++++++++++++++++++++++++--------
 kernel/cred.c        | 26 +++++++++++++----------
 2 files changed, 65 insertions(+), 20 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/cred.h b/include/linux/cred.h
index f923528d5cc4..56bc432fe49b 100644
--- a/include/linux/cred.h
+++ b/include/linux/cred.h
@@ -218,6 +218,20 @@ static inline bool cap_ambient_invariant_ok(const struct cred *cred)
 					  cred->cap_inheritable));
 }
 
+/**
+ * get_new_cred_many - Get references on a new set of credentials
+ * @cred: The new credentials to reference
+ * @nr: Number of references to acquire
+ *
+ * Get references on the specified set of new credentials.  The caller must
+ * release all acquired references.
+ */
+static inline struct cred *get_new_cred_many(struct cred *cred, int nr)
+{
+	atomic_add(nr, &cred->usage);
+	return cred;
+}
+
 /**
  * get_new_cred - Get a reference on a new set of credentials
  * @cred: The new credentials to reference
@@ -227,16 +241,16 @@ static inline bool cap_ambient_invariant_ok(const struct cred *cred)
  */
 static inline struct cred *get_new_cred(struct cred *cred)
 {
-	atomic_inc(&cred->usage);
-	return cred;
+	return get_new_cred_many(cred, 1);
 }
 
 /**
- * get_cred - Get a reference on a set of credentials
+ * get_cred_many - Get references on a set of credentials
  * @cred: The credentials to reference
+ * @nr: Number of references to acquire
  *
- * Get a reference on the specified set of credentials.  The caller must
- * release the reference.  If %NULL is passed, it is returned with no action.
+ * Get references on the specified set of credentials.  The caller must release
+ * all acquired reference.  If %NULL is passed, it is returned with no action.
  *
  * This is used to deal with a committed set of credentials.  Although the
  * pointer is const, this will temporarily discard the const and increment the
@@ -244,14 +258,28 @@ static inline struct cred *get_new_cred(struct cred *cred)
  * accidental alteration of a set of credentials that should be considered
  * immutable.
  */
-static inline const struct cred *get_cred(const struct cred *cred)
+static inline const struct cred *get_cred_many(const struct cred *cred, int nr)
 {
 	struct cred *nonconst_cred = (struct cred *) cred;
 	if (!cred)
 		return cred;
 	validate_creds(cred);
 	nonconst_cred->non_rcu = 0;
-	return get_new_cred(nonconst_cred);
+	return get_new_cred_many(nonconst_cred, nr);
+}
+
+/*
+ * get_cred - Get a reference on a set of credentials
+ * @cred: The credentials to reference
+ *
+ * Get a reference on the specified set of credentials.  The caller must
+ * release the reference.  If %NULL is passed, it is returned with no action.
+ *
+ * This is used to deal with a committed set of credentials.
+ */
+static inline const struct cred *get_cred(const struct cred *cred)
+{
+	return get_cred_many(cred, 1);
 }
 
 static inline const struct cred *get_cred_rcu(const struct cred *cred)
@@ -269,6 +297,7 @@ static inline const struct cred *get_cred_rcu(const struct cred *cred)
 /**
  * put_cred - Release a reference to a set of credentials
  * @cred: The credentials to release
+ * @nr: Number of references to release
  *
  * Release a reference to a set of credentials, deleting them when the last ref
  * is released.  If %NULL is passed, nothing is done.
@@ -277,17 +306,29 @@ static inline const struct cred *get_cred_rcu(const struct cred *cred)
  * on task_struct are attached by const pointers to prevent accidental
  * alteration of otherwise immutable credential sets.
  */
-static inline void put_cred(const struct cred *_cred)
+static inline void put_cred_many(const struct cred *_cred, int nr)
 {
 	struct cred *cred = (struct cred *) _cred;
 
 	if (cred) {
 		validate_creds(cred);
-		if (atomic_dec_and_test(&(cred)->usage))
+		if (atomic_sub_and_test(nr, &cred->usage))
 			__put_cred(cred);
 	}
 }
 
+/*
+ * put_cred - Release a reference to a set of credentials
+ * @cred: The credentials to release
+ *
+ * Release a reference to a set of credentials, deleting them when the last ref
+ * is released.  If %NULL is passed, nothing is done.
+ */
+static inline void put_cred(const struct cred *cred)
+{
+	put_cred_many(cred, 1);
+}
+
 /**
  * current_cred - Access the current task's subjective credentials
  *
diff --git a/kernel/cred.c b/kernel/cred.c
index 98cb4eca23fb..9398e534b997 100644
--- a/kernel/cred.c
+++ b/kernel/cred.c
@@ -162,23 +162,29 @@ EXPORT_SYMBOL(__put_cred);
  */
 void exit_creds(struct task_struct *tsk)
 {
-	struct cred *cred;
+	struct cred *real_cred, *cred;
 
 	kdebug("exit_creds(%u,%p,%p,{%d,%d})", tsk->pid, tsk->real_cred, tsk->cred,
 	       atomic_read(&tsk->cred->usage),
 	       read_cred_subscribers(tsk->cred));
 
-	cred = (struct cred *) tsk->real_cred;
+	real_cred = (struct cred *) tsk->real_cred;
 	tsk->real_cred = NULL;
-	validate_creds(cred);
-	alter_cred_subscribers(cred, -1);
-	put_cred(cred);
 
 	cred = (struct cred *) tsk->cred;
 	tsk->cred = NULL;
+
 	validate_creds(cred);
-	alter_cred_subscribers(cred, -1);
-	put_cred(cred);
+	if (real_cred == cred) {
+		alter_cred_subscribers(cred, -2);
+		put_cred_many(cred, 2);
+	} else {
+		validate_creds(real_cred);
+		alter_cred_subscribers(real_cred, -1);
+		put_cred(real_cred);
+		alter_cred_subscribers(cred, -1);
+		put_cred(cred);
+	}
 
 #ifdef CONFIG_KEYS_REQUEST_CACHE
 	key_put(tsk->cached_requested_key);
@@ -355,8 +361,7 @@ int copy_creds(struct task_struct *p, unsigned long clone_flags)
 #endif
 		clone_flags & CLONE_THREAD
 	    ) {
-		p->real_cred = get_cred(p->cred);
-		get_cred(p->cred);
+		p->real_cred = get_cred_many(p->cred, 2);
 		alter_cred_subscribers(p->cred, 2);
 		kdebug("share_creds(%p{%d,%d})",
 		       p->cred, atomic_read(&p->cred->usage),
@@ -520,8 +525,7 @@ int commit_creds(struct cred *new)
 		proc_id_connector(task, PROC_EVENT_GID);
 
 	/* release the old obj and subj refs both */
-	put_cred(old);
-	put_cred(old);
+	put_cred_many(old, 2);
 	return 0;
 }
 EXPORT_SYMBOL(commit_creds);
-- 
cgit v1.2.3


From 8bf0cdfac7f8aa3fa6151b5c5f5eebdb44a64e89 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@kernel.org>
Date: Thu, 21 Sep 2023 11:32:58 +0200
Subject: <linux/list.h>: Introduce the list_for_each_reverse() method

The list_head counterpart of list_for_each_entry_reverse() was missing,
add it to complete the list handling APIs in <linux/list.h>.

[ This new API is also relied on by a WIP scheduler patch, so this
  variant is not a theoretical possibility only. ]

Signed-off-by: Ingo Molnar <mingo@kernel.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: linux-kernel@vger.kernel.org
---
 include/linux/list.h | 8 ++++++++
 1 file changed, 8 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/list.h b/include/linux/list.h
index 164b4d0e9d2a..1837caedf723 100644
--- a/include/linux/list.h
+++ b/include/linux/list.h
@@ -686,6 +686,14 @@ static inline void list_splice_tail_init(struct list_head *list,
 #define list_for_each(pos, head) \
 	for (pos = (head)->next; !list_is_head(pos, (head)); pos = pos->next)
 
+/**
+ * list_for_each_reverse - iterate backwards over a list
+ * @pos:	the &struct list_head to use as a loop cursor.
+ * @head:	the head for your list.
+ */
+#define list_for_each_reverse(pos, head) \
+	for (pos = (head)->prev; pos != (head); pos = pos->prev)
+
 /**
  * list_for_each_rcu - Iterate over a list in an RCU-safe fashion
  * @pos:	the &struct list_head to use as a loop cursor.
-- 
cgit v1.2.3


From b8ec60e1186cdcfce41e7db4c827cb107e459002 Mon Sep 17 00:00:00 2001
From: Fangrui Song <maskray@google.com>
Date: Tue, 19 Sep 2023 17:17:28 -0700
Subject: x86/speculation, objtool: Use absolute relocations for annotations

.discard.retpoline_safe sections do not have the SHF_ALLOC flag.  These
sections referencing text sections' STT_SECTION symbols with PC-relative
relocations like R_386_PC32 [0] is conceptually not suitable.  Newer
LLD will report warnings for REL relocations even for relocatable links [1]:

    ld.lld: warning: vmlinux.a(drivers/i2c/busses/i2c-i801.o):(.discard.retpoline_safe+0x120): has non-ABS relocation R_386_PC32 against symbol ''

Switch to absolute relocations instead, which indicate link-time
addresses.  In a relocatable link, these addresses are also output
section offsets, used by checks in tools/objtool/check.c.  When linking
vmlinux, these .discard.* sections will be discarded, therefore it is
not a problem that R_X86_64_32 cannot represent a kernel address.

Alternatively, we could set the SHF_ALLOC flag for .discard.* sections,
but I think non-SHF_ALLOC for sections to be discarded makes more sense.

Note: if we decide to never support REL architectures (e.g. arm, i386),
we can utilize R_*_NONE relocations (.reloc ., BFD_RELOC_NONE, sym),
making .discard.* sections zero-sized.  That said, the section content
waste is 4 bytes per entry, much smaller than sizeof(Elf{32,64}_Rel).

  [0] commit 1c0c1faf5692 ("objtool: Use relative pointers for annotations")
  [1] https://github.com/ClangBuiltLinux/linux/issues/1937

Signed-off-by: Fangrui Song <maskray@google.com>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Josh Poimboeuf <jpoimboe@redhat.com>
Link: https://lore.kernel.org/r/20230920001728.1439947-1-maskray@google.com
---
 arch/x86/include/asm/alternative.h   |  4 ++--
 arch/x86/include/asm/nospec-branch.h |  4 ++--
 include/linux/objtool.h              | 10 +++++-----
 3 files changed, 9 insertions(+), 9 deletions(-)

(limited to 'include/linux')

diff --git a/arch/x86/include/asm/alternative.h b/arch/x86/include/asm/alternative.h
index 9c4da699e11a..65f79092c9d9 100644
--- a/arch/x86/include/asm/alternative.h
+++ b/arch/x86/include/asm/alternative.h
@@ -58,7 +58,7 @@
 #define ANNOTATE_IGNORE_ALTERNATIVE				\
 	"999:\n\t"						\
 	".pushsection .discard.ignore_alts\n\t"			\
-	".long 999b - .\n\t"					\
+	".long 999b\n\t"					\
 	".popsection\n\t"
 
 /*
@@ -352,7 +352,7 @@ static inline int alternatives_text_reserved(void *start, void *end)
 .macro ANNOTATE_IGNORE_ALTERNATIVE
 	.Lannotate_\@:
 	.pushsection .discard.ignore_alts
-	.long .Lannotate_\@ - .
+	.long .Lannotate_\@
 	.popsection
 .endm
 
diff --git a/arch/x86/include/asm/nospec-branch.h b/arch/x86/include/asm/nospec-branch.h
index c55cc243592e..4952b73d944e 100644
--- a/arch/x86/include/asm/nospec-branch.h
+++ b/arch/x86/include/asm/nospec-branch.h
@@ -196,7 +196,7 @@
 .macro ANNOTATE_RETPOLINE_SAFE
 .Lhere_\@:
 	.pushsection .discard.retpoline_safe
-	.long .Lhere_\@ - .
+	.long .Lhere_\@
 	.popsection
 .endm
 
@@ -334,7 +334,7 @@
 #define ANNOTATE_RETPOLINE_SAFE					\
 	"999:\n\t"						\
 	".pushsection .discard.retpoline_safe\n\t"		\
-	".long 999b - .\n\t"					\
+	".long 999b\n\t"					\
 	".popsection\n\t"
 
 typedef u8 retpoline_thunk_t[RETPOLINE_THUNK_SIZE];
diff --git a/include/linux/objtool.h b/include/linux/objtool.h
index 03f82c2c2ebf..6f6da95fe7f9 100644
--- a/include/linux/objtool.h
+++ b/include/linux/objtool.h
@@ -48,13 +48,13 @@
 #define ANNOTATE_NOENDBR					\
 	"986: \n\t"						\
 	".pushsection .discard.noendbr\n\t"			\
-	".long 986b - .\n\t"					\
+	".long 986b\n\t"					\
 	".popsection\n\t"
 
 #define ASM_REACHABLE							\
 	"998:\n\t"							\
 	".pushsection .discard.reachable\n\t"				\
-	".long 998b - .\n\t"						\
+	".long 998b\n\t"						\
 	".popsection\n\t"
 
 #else /* __ASSEMBLY__ */
@@ -66,7 +66,7 @@
 #define ANNOTATE_INTRA_FUNCTION_CALL				\
 	999:							\
 	.pushsection .discard.intra_function_calls;		\
-	.long 999b - .;						\
+	.long 999b;						\
 	.popsection;
 
 /*
@@ -118,7 +118,7 @@
 .macro ANNOTATE_NOENDBR
 .Lhere_\@:
 	.pushsection .discard.noendbr
-	.long	.Lhere_\@ - .
+	.long	.Lhere_\@
 	.popsection
 .endm
 
@@ -141,7 +141,7 @@
 .macro REACHABLE
 .Lhere_\@:
 	.pushsection .discard.reachable
-	.long	.Lhere_\@ - .
+	.long	.Lhere_\@
 	.popsection
 .endm
 
-- 
cgit v1.2.3


From f1a9be986cedbef839062428e8e9b2bcc9c58af5 Mon Sep 17 00:00:00 2001
From: Kees Cook <keescook@chromium.org>
Date: Fri, 15 Sep 2023 13:12:00 -0700
Subject: mtd: Annotate struct lpddr_private with __counted_by

Prepare for the coming implementation by GCC and Clang of the __counted_by
attribute. Flexible array members annotated with __counted_by can have
their accesses bounds-checked at run-time checking via CONFIG_UBSAN_BOUNDS
(for array indexing) and CONFIG_FORTIFY_SOURCE (for strcpy/memcpy-family
functions).

As found with Coccinelle[1], add __counted_by for struct lpddr_private.

[1] https://github.com/kees/kernel-tools/blob/trunk/coccinelle/examples/counted_by.cocci

Cc: Miquel Raynal <miquel.raynal@bootlin.com>
Cc: Richard Weinberger <richard@nod.at>
Cc: Vignesh Raghavendra <vigneshr@ti.com>
Cc: linux-mtd@lists.infradead.org
Signed-off-by: Kees Cook <keescook@chromium.org>
Reviewed-by: Gustavo A. R. Silva <gustavoars@kernel.org>
Signed-off-by: Miquel Raynal <miquel.raynal@bootlin.com>
Link: https://lore.kernel.org/linux-mtd/20230915201159.never.112-kees@kernel.org
---
 include/linux/mtd/qinfo.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/mtd/qinfo.h b/include/linux/mtd/qinfo.h
index 2e3f43788d48..0421f12156b5 100644
--- a/include/linux/mtd/qinfo.h
+++ b/include/linux/mtd/qinfo.h
@@ -24,7 +24,7 @@ struct lpddr_private {
 	struct qinfo_chip *qinfo;
 	int numchips;
 	unsigned long chipshift;
-	struct flchip chips[];
+	struct flchip chips[] __counted_by(numchips);
 };
 
 /* qinfo_query_info structure contains request information for
-- 
cgit v1.2.3


From 1442d628d05c7aad86e5754fa554f32edc3e77a8 Mon Sep 17 00:00:00 2001
From: Kees Cook <keescook@chromium.org>
Date: Fri, 15 Sep 2023 13:12:06 -0700
Subject: mtd: cfi: Annotate struct cfi_private with __counted_by

Prepare for the coming implementation by GCC and Clang of the __counted_by
attribute. Flexible array members annotated with __counted_by can have
their accesses bounds-checked at run-time checking via CONFIG_UBSAN_BOUNDS
(for array indexing) and CONFIG_FORTIFY_SOURCE (for strcpy/memcpy-family
functions).

As found with Coccinelle[1], add __counted_by for struct cfi_private.

[1] https://github.com/kees/kernel-tools/blob/trunk/coccinelle/examples/counted_by.cocci

Cc: Miquel Raynal <miquel.raynal@bootlin.com>
Cc: Richard Weinberger <richard@nod.at>
Cc: Vignesh Raghavendra <vigneshr@ti.com>
Cc: linux-mtd@lists.infradead.org
Signed-off-by: Kees Cook <keescook@chromium.org>
Reviewed-by: Gustavo A. R. Silva <gustavoars@kernel.org>
Signed-off-by: Miquel Raynal <miquel.raynal@bootlin.com>
Link: https://lore.kernel.org/linux-mtd/20230915201206.never.107-kees@kernel.org
---
 include/linux/mtd/cfi.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/mtd/cfi.h b/include/linux/mtd/cfi.h
index d88bb56c18e2..947410faf9e2 100644
--- a/include/linux/mtd/cfi.h
+++ b/include/linux/mtd/cfi.h
@@ -287,7 +287,7 @@ struct cfi_private {
 	unsigned long chipshift; /* Because they're of the same type */
 	const char *im_name;	 /* inter_module name for cmdset_setup */
 	unsigned long quirks;
-	struct flchip chips[];  /* per-chip data structure for each chip */
+	struct flchip chips[] __counted_by(numchips);  /* per-chip data structure for each chip */
 };
 
 uint32_t cfi_build_cmd_addr(uint32_t cmd_ofs,
-- 
cgit v1.2.3


From 48554df6bf2b1e83f70749bf4b4d7914f8b3c01d Mon Sep 17 00:00:00 2001
From: Chengming Zhou <zhouchengming@bytedance.com>
Date: Wed, 13 Sep 2023 15:16:13 +0000
Subject: blk-mq: remove RQF_MQ_INFLIGHT

Since the previous patch change to only account active requests when
we really allocate the driver tag, the RQF_MQ_INFLIGHT can be removed
and no double account problem.

1. none elevator: flush request will use the first pending request's
   driver tag, won't double account.

2. other elevator: flush request will be accounted when allocate driver
   tag when issue, and will be unaccounted when it put the driver tag.

Signed-off-by: Chengming Zhou <zhouchengming@bytedance.com>
Reviewed-by: Ming Lei <ming.lei@redhat.com>
Link: https://lore.kernel.org/r/20230913151616.3164338-3-chengming.zhou@linux.dev
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/blk-flush.c      | 11 ++---------
 block/blk-mq-debugfs.c |  1 -
 block/blk-mq.c         |  4 ----
 include/linux/blk-mq.h |  2 --
 4 files changed, 2 insertions(+), 16 deletions(-)

(limited to 'include/linux')

diff --git a/block/blk-flush.c b/block/blk-flush.c
index e73dc22d05c1..3f4d41952ef2 100644
--- a/block/blk-flush.c
+++ b/block/blk-flush.c
@@ -323,16 +323,9 @@ static void blk_kick_flush(struct request_queue *q, struct blk_flush_queue *fq,
 	flush_rq->mq_ctx = first_rq->mq_ctx;
 	flush_rq->mq_hctx = first_rq->mq_hctx;
 
-	if (!q->elevator) {
+	if (!q->elevator)
 		flush_rq->tag = first_rq->tag;
-
-		/*
-		 * We borrow data request's driver tag, so have to mark
-		 * this flush request as INFLIGHT for avoiding double
-		 * account of this driver tag
-		 */
-		flush_rq->rq_flags |= RQF_MQ_INFLIGHT;
-	} else
+	else
 		flush_rq->internal_tag = first_rq->internal_tag;
 
 	flush_rq->cmd_flags = REQ_OP_FLUSH | REQ_PREFLUSH;
diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c
index c3b5930106b2..5cbeb9344f2f 100644
--- a/block/blk-mq-debugfs.c
+++ b/block/blk-mq-debugfs.c
@@ -246,7 +246,6 @@ static const char *const rqf_name[] = {
 	RQF_NAME(STARTED),
 	RQF_NAME(FLUSH_SEQ),
 	RQF_NAME(MIXED_MERGE),
-	RQF_NAME(MQ_INFLIGHT),
 	RQF_NAME(DONTPREP),
 	RQF_NAME(SCHED_TAGS),
 	RQF_NAME(USE_SCHED),
diff --git a/block/blk-mq.c b/block/blk-mq.c
index e776388decc3..c209a7dddee3 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -1066,10 +1066,6 @@ static inline void blk_mq_flush_tag_batch(struct blk_mq_hw_ctx *hctx,
 {
 	struct request_queue *q = hctx->queue;
 
-	/*
-	 * All requests should have been marked as RQF_MQ_INFLIGHT, so
-	 * update hctx->nr_active in batch
-	 */
 	blk_mq_sub_active_requests(hctx, nr_tags);
 
 	blk_mq_put_tags(hctx->tags, tag_array, nr_tags);
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index 958ed7e89b30..1ab3081c82ed 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -32,8 +32,6 @@ typedef __u32 __bitwise req_flags_t;
 #define RQF_FLUSH_SEQ		((__force req_flags_t)(1 << 4))
 /* merge of different types, fail separately */
 #define RQF_MIXED_MERGE		((__force req_flags_t)(1 << 5))
-/* track inflight for MQ */
-#define RQF_MQ_INFLIGHT		((__force req_flags_t)(1 << 6))
 /* don't call prep for this one */
 #define RQF_DONTPREP		((__force req_flags_t)(1 << 7))
 /* use hctx->sched_tags */
-- 
cgit v1.2.3


From 5f05285df691b1e82108eead7165feae238c95ef Mon Sep 17 00:00:00 2001
From: Basavaraj Natikar <Basavaraj.Natikar@amd.com>
Date: Tue, 19 Sep 2023 13:40:48 +0530
Subject: iio: hid-sensor-als: Add light color temperature support

In most cases, ambient color sensors also support light color temperature.
As a result, add support of light color temperature.

Signed-off-by: Basavaraj Natikar <Basavaraj.Natikar@amd.com>
Acked-by: Srinivas Pandruvada<srinivas.pandruvada@linux.intel.com>
Link: https://lore.kernel.org/r/20230919081054.2050714-4-Basavaraj.Natikar@amd.com
Signed-off-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
---
 drivers/iio/light/hid-sensor-als.c | 37 +++++++++++++++++++++++++++++++++++--
 include/linux/hid-sensor-ids.h     |  1 +
 2 files changed, 36 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/iio/light/hid-sensor-als.c b/drivers/iio/light/hid-sensor-als.c
index efb1f8862b28..16a3f1941c27 100644
--- a/drivers/iio/light/hid-sensor-als.c
+++ b/drivers/iio/light/hid-sensor-als.c
@@ -14,8 +14,9 @@
 #include "../common/hid-sensors/hid-sensor-trigger.h"
 
 enum {
-	CHANNEL_SCAN_INDEX_INTENSITY = 0,
-	CHANNEL_SCAN_INDEX_ILLUM = 1,
+	CHANNEL_SCAN_INDEX_INTENSITY,
+	CHANNEL_SCAN_INDEX_ILLUM,
+	CHANNEL_SCAN_INDEX_COLOR_TEMP,
 	CHANNEL_SCAN_INDEX_MAX
 };
 
@@ -65,6 +66,16 @@ static const struct iio_chan_spec als_channels[] = {
 		BIT(IIO_CHAN_INFO_HYSTERESIS_RELATIVE),
 		.scan_index = CHANNEL_SCAN_INDEX_ILLUM,
 	},
+	{
+		.type = IIO_COLORTEMP,
+		.info_mask_separate = BIT(IIO_CHAN_INFO_RAW),
+		.info_mask_shared_by_type = BIT(IIO_CHAN_INFO_OFFSET) |
+		BIT(IIO_CHAN_INFO_SCALE) |
+		BIT(IIO_CHAN_INFO_SAMP_FREQ) |
+		BIT(IIO_CHAN_INFO_HYSTERESIS) |
+		BIT(IIO_CHAN_INFO_HYSTERESIS_RELATIVE),
+		.scan_index = CHANNEL_SCAN_INDEX_COLOR_TEMP,
+	},
 	IIO_CHAN_SOFT_TIMESTAMP(CHANNEL_SCAN_INDEX_TIMESTAMP)
 };
 
@@ -103,6 +114,11 @@ static int als_read_raw(struct iio_dev *indio_dev,
 			min = als_state->als[chan->scan_index].logical_minimum;
 			address = HID_USAGE_SENSOR_LIGHT_ILLUM;
 			break;
+		case  CHANNEL_SCAN_INDEX_COLOR_TEMP:
+			report_id = als_state->als[chan->scan_index].report_id;
+			min = als_state->als[chan->scan_index].logical_minimum;
+			address = HID_USAGE_SENSOR_LIGHT_COLOR_TEMPERATURE;
+			break;
 		default:
 			report_id = -1;
 			break;
@@ -223,6 +239,10 @@ static int als_capture_sample(struct hid_sensor_hub_device *hsdev,
 		als_state->scan.illum[CHANNEL_SCAN_INDEX_ILLUM] = sample_data;
 		ret = 0;
 		break;
+	case HID_USAGE_SENSOR_LIGHT_COLOR_TEMPERATURE:
+		als_state->scan.illum[CHANNEL_SCAN_INDEX_COLOR_TEMP] = sample_data;
+		ret = 0;
+		break;
 	case HID_USAGE_SENSOR_TIME_TIMESTAMP:
 		als_state->timestamp = hid_sensor_convert_timestamp(&als_state->common_attributes,
 								    *(s64 *)raw_data);
@@ -258,6 +278,19 @@ static int als_parse_report(struct platform_device *pdev,
 			st->als[i].report_id);
 	}
 
+	ret = sensor_hub_input_get_attribute_info(hsdev, HID_INPUT_REPORT,
+				usage_id,
+				HID_USAGE_SENSOR_LIGHT_COLOR_TEMPERATURE,
+				&st->als[CHANNEL_SCAN_INDEX_COLOR_TEMP]);
+	if (ret < 0)
+		return ret;
+	als_adjust_channel_bit_mask(channels, CHANNEL_SCAN_INDEX_COLOR_TEMP,
+				st->als[CHANNEL_SCAN_INDEX_COLOR_TEMP].size);
+
+	dev_dbg(&pdev->dev, "als %x:%x\n",
+		st->als[CHANNEL_SCAN_INDEX_COLOR_TEMP].index,
+		st->als[CHANNEL_SCAN_INDEX_COLOR_TEMP].report_id);
+
 	st->scale_precision = hid_sensor_format_scale(usage_id,
 				&st->als[CHANNEL_SCAN_INDEX_INTENSITY],
 				&st->scale_pre_decml, &st->scale_post_decml);
diff --git a/include/linux/hid-sensor-ids.h b/include/linux/hid-sensor-ids.h
index 13b1e65fbdcc..8af4fb3e0254 100644
--- a/include/linux/hid-sensor-ids.h
+++ b/include/linux/hid-sensor-ids.h
@@ -21,6 +21,7 @@
 #define HID_USAGE_SENSOR_ALS					0x200041
 #define HID_USAGE_SENSOR_DATA_LIGHT				0x2004d0
 #define HID_USAGE_SENSOR_LIGHT_ILLUM				0x2004d1
+#define HID_USAGE_SENSOR_LIGHT_COLOR_TEMPERATURE		0x2004d2
 
 /* PROX (200011) */
 #define HID_USAGE_SENSOR_PROX                                   0x200011
-- 
cgit v1.2.3


From ee3710f39f9d0ae5137a866138d005fe1ad18132 Mon Sep 17 00:00:00 2001
From: Basavaraj Natikar <Basavaraj.Natikar@amd.com>
Date: Tue, 19 Sep 2023 13:40:52 +0530
Subject: iio: hid-sensor-als: Add light chromaticity support

In most cases, ambient color sensors also support the x and y light
colors, which represent the coordinates on the CIE 1931 chromaticity
diagram. Thus, add light chromaticity x and y.

Signed-off-by: Basavaraj Natikar <Basavaraj.Natikar@amd.com>
Acked-by: Srinivas Pandruvada<srinivas.pandruvada@linux.intel.com>
Link: https://lore.kernel.org/r/20230919081054.2050714-8-Basavaraj.Natikar@amd.com
Signed-off-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
---
 drivers/iio/light/hid-sensor-als.c | 63 ++++++++++++++++++++++++++++++++++++++
 include/linux/hid-sensor-ids.h     |  3 ++
 2 files changed, 66 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/iio/light/hid-sensor-als.c b/drivers/iio/light/hid-sensor-als.c
index 16a3f1941c27..c9d114ff080a 100644
--- a/drivers/iio/light/hid-sensor-als.c
+++ b/drivers/iio/light/hid-sensor-als.c
@@ -17,6 +17,8 @@ enum {
 	CHANNEL_SCAN_INDEX_INTENSITY,
 	CHANNEL_SCAN_INDEX_ILLUM,
 	CHANNEL_SCAN_INDEX_COLOR_TEMP,
+	CHANNEL_SCAN_INDEX_CHROMATICITY_X,
+	CHANNEL_SCAN_INDEX_CHROMATICITY_Y,
 	CHANNEL_SCAN_INDEX_MAX
 };
 
@@ -76,6 +78,30 @@ static const struct iio_chan_spec als_channels[] = {
 		BIT(IIO_CHAN_INFO_HYSTERESIS_RELATIVE),
 		.scan_index = CHANNEL_SCAN_INDEX_COLOR_TEMP,
 	},
+	{
+		.type = IIO_CHROMATICITY,
+		.modified = 1,
+		.channel2 = IIO_MOD_X,
+		.info_mask_separate = BIT(IIO_CHAN_INFO_RAW),
+		.info_mask_shared_by_type = BIT(IIO_CHAN_INFO_OFFSET) |
+		BIT(IIO_CHAN_INFO_SCALE) |
+		BIT(IIO_CHAN_INFO_SAMP_FREQ) |
+		BIT(IIO_CHAN_INFO_HYSTERESIS) |
+		BIT(IIO_CHAN_INFO_HYSTERESIS_RELATIVE),
+		.scan_index = CHANNEL_SCAN_INDEX_CHROMATICITY_X,
+	},
+	{
+		.type = IIO_CHROMATICITY,
+		.modified = 1,
+		.channel2 = IIO_MOD_Y,
+		.info_mask_separate = BIT(IIO_CHAN_INFO_RAW),
+		.info_mask_shared_by_type = BIT(IIO_CHAN_INFO_OFFSET) |
+		BIT(IIO_CHAN_INFO_SCALE) |
+		BIT(IIO_CHAN_INFO_SAMP_FREQ) |
+		BIT(IIO_CHAN_INFO_HYSTERESIS) |
+		BIT(IIO_CHAN_INFO_HYSTERESIS_RELATIVE),
+		.scan_index = CHANNEL_SCAN_INDEX_CHROMATICITY_Y,
+	},
 	IIO_CHAN_SOFT_TIMESTAMP(CHANNEL_SCAN_INDEX_TIMESTAMP)
 };
 
@@ -119,6 +145,16 @@ static int als_read_raw(struct iio_dev *indio_dev,
 			min = als_state->als[chan->scan_index].logical_minimum;
 			address = HID_USAGE_SENSOR_LIGHT_COLOR_TEMPERATURE;
 			break;
+		case  CHANNEL_SCAN_INDEX_CHROMATICITY_X:
+			report_id = als_state->als[chan->scan_index].report_id;
+			min = als_state->als[chan->scan_index].logical_minimum;
+			address = HID_USAGE_SENSOR_LIGHT_CHROMATICITY_X;
+			break;
+		case  CHANNEL_SCAN_INDEX_CHROMATICITY_Y:
+			report_id = als_state->als[chan->scan_index].report_id;
+			min = als_state->als[chan->scan_index].logical_minimum;
+			address = HID_USAGE_SENSOR_LIGHT_CHROMATICITY_Y;
+			break;
 		default:
 			report_id = -1;
 			break;
@@ -243,6 +279,14 @@ static int als_capture_sample(struct hid_sensor_hub_device *hsdev,
 		als_state->scan.illum[CHANNEL_SCAN_INDEX_COLOR_TEMP] = sample_data;
 		ret = 0;
 		break;
+	case HID_USAGE_SENSOR_LIGHT_CHROMATICITY_X:
+		als_state->scan.illum[CHANNEL_SCAN_INDEX_CHROMATICITY_X] = sample_data;
+		ret = 0;
+		break;
+	case HID_USAGE_SENSOR_LIGHT_CHROMATICITY_Y:
+		als_state->scan.illum[CHANNEL_SCAN_INDEX_CHROMATICITY_Y] = sample_data;
+		ret = 0;
+		break;
 	case HID_USAGE_SENSOR_TIME_TIMESTAMP:
 		als_state->timestamp = hid_sensor_convert_timestamp(&als_state->common_attributes,
 								    *(s64 *)raw_data);
@@ -291,6 +335,25 @@ static int als_parse_report(struct platform_device *pdev,
 		st->als[CHANNEL_SCAN_INDEX_COLOR_TEMP].index,
 		st->als[CHANNEL_SCAN_INDEX_COLOR_TEMP].report_id);
 
+	for (i = 0; i < 2; i++) {
+		int next_scan_index = CHANNEL_SCAN_INDEX_CHROMATICITY_X + i;
+
+		ret = sensor_hub_input_get_attribute_info(hsdev,
+				HID_INPUT_REPORT, usage_id,
+				HID_USAGE_SENSOR_LIGHT_CHROMATICITY_X + i,
+				&st->als[next_scan_index]);
+		if (ret < 0)
+			return ret;
+
+		als_adjust_channel_bit_mask(channels,
+					CHANNEL_SCAN_INDEX_CHROMATICITY_X + i,
+					st->als[next_scan_index].size);
+
+		dev_dbg(&pdev->dev, "als %x:%x\n",
+			st->als[next_scan_index].index,
+			st->als[next_scan_index].report_id);
+	}
+
 	st->scale_precision = hid_sensor_format_scale(usage_id,
 				&st->als[CHANNEL_SCAN_INDEX_INTENSITY],
 				&st->scale_pre_decml, &st->scale_post_decml);
diff --git a/include/linux/hid-sensor-ids.h b/include/linux/hid-sensor-ids.h
index 8af4fb3e0254..6730ee900ee1 100644
--- a/include/linux/hid-sensor-ids.h
+++ b/include/linux/hid-sensor-ids.h
@@ -22,6 +22,9 @@
 #define HID_USAGE_SENSOR_DATA_LIGHT				0x2004d0
 #define HID_USAGE_SENSOR_LIGHT_ILLUM				0x2004d1
 #define HID_USAGE_SENSOR_LIGHT_COLOR_TEMPERATURE		0x2004d2
+#define HID_USAGE_SENSOR_LIGHT_CHROMATICITY			0x2004d3
+#define HID_USAGE_SENSOR_LIGHT_CHROMATICITY_X			0x2004d4
+#define HID_USAGE_SENSOR_LIGHT_CHROMATICITY_Y			0x2004d5
 
 /* PROX (200011) */
 #define HID_USAGE_SENSOR_PROX                                   0x200011
-- 
cgit v1.2.3


From a741deac787f0d2d7068638c067db20af9e63752 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@kernel.org>
Date: Wed, 26 Jul 2023 13:57:03 -0700
Subject: torture: Make torture_hrtimeout_ns() take an hrtimer mode parameter

The current torture-test sleeps are waiting for a duration, but there
are situations where it is better to wait for an absolute time, for
example, when ending a stutter interval.  This commit therefore adds
an hrtimer mode parameter to torture_hrtimeout_ns().  Why not also the
other torture_hrtimeout_*() functions?  The theory is that most absolute
times will be in nanoseconds, especially not (say) jiffies.

Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
Signed-off-by: Frederic Weisbecker <frederic@kernel.org>
---
 include/linux/torture.h |  3 ++-
 kernel/torture.c        | 13 +++++++------
 2 files changed, 9 insertions(+), 7 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/torture.h b/include/linux/torture.h
index bb466eec01e4..017f0f710815 100644
--- a/include/linux/torture.h
+++ b/include/linux/torture.h
@@ -81,7 +81,8 @@ static inline void torture_random_init(struct torture_random_state *trsp)
 }
 
 /* Definitions for high-resolution-timer sleeps. */
-int torture_hrtimeout_ns(ktime_t baset_ns, u32 fuzzt_ns, struct torture_random_state *trsp);
+int torture_hrtimeout_ns(ktime_t baset_ns, u32 fuzzt_ns, const enum hrtimer_mode mode,
+			 struct torture_random_state *trsp);
 int torture_hrtimeout_us(u32 baset_us, u32 fuzzt_ns, struct torture_random_state *trsp);
 int torture_hrtimeout_ms(u32 baset_ms, u32 fuzzt_us, struct torture_random_state *trsp);
 int torture_hrtimeout_jiffies(u32 baset_j, struct torture_random_state *trsp);
diff --git a/kernel/torture.c b/kernel/torture.c
index 68dba4ecab5c..6ba62e5993e7 100644
--- a/kernel/torture.c
+++ b/kernel/torture.c
@@ -87,14 +87,15 @@ EXPORT_SYMBOL_GPL(verbose_torout_sleep);
  * nanosecond random fuzz.  This function and its friends desynchronize
  * testing from the timer wheel.
  */
-int torture_hrtimeout_ns(ktime_t baset_ns, u32 fuzzt_ns, struct torture_random_state *trsp)
+int torture_hrtimeout_ns(ktime_t baset_ns, u32 fuzzt_ns, const enum hrtimer_mode mode,
+			 struct torture_random_state *trsp)
 {
 	ktime_t hto = baset_ns;
 
 	if (trsp)
 		hto += torture_random(trsp) % fuzzt_ns;
 	set_current_state(TASK_IDLE);
-	return schedule_hrtimeout(&hto, HRTIMER_MODE_REL);
+	return schedule_hrtimeout(&hto, mode);
 }
 EXPORT_SYMBOL_GPL(torture_hrtimeout_ns);
 
@@ -106,7 +107,7 @@ int torture_hrtimeout_us(u32 baset_us, u32 fuzzt_ns, struct torture_random_state
 {
 	ktime_t baset_ns = baset_us * NSEC_PER_USEC;
 
-	return torture_hrtimeout_ns(baset_ns, fuzzt_ns, trsp);
+	return torture_hrtimeout_ns(baset_ns, fuzzt_ns, HRTIMER_MODE_REL, trsp);
 }
 EXPORT_SYMBOL_GPL(torture_hrtimeout_us);
 
@@ -123,7 +124,7 @@ int torture_hrtimeout_ms(u32 baset_ms, u32 fuzzt_us, struct torture_random_state
 		fuzzt_ns = (u32)~0U;
 	else
 		fuzzt_ns = fuzzt_us * NSEC_PER_USEC;
-	return torture_hrtimeout_ns(baset_ns, fuzzt_ns, trsp);
+	return torture_hrtimeout_ns(baset_ns, fuzzt_ns, HRTIMER_MODE_REL, trsp);
 }
 EXPORT_SYMBOL_GPL(torture_hrtimeout_ms);
 
@@ -136,7 +137,7 @@ int torture_hrtimeout_jiffies(u32 baset_j, struct torture_random_state *trsp)
 {
 	ktime_t baset_ns = jiffies_to_nsecs(baset_j);
 
-	return torture_hrtimeout_ns(baset_ns, jiffies_to_nsecs(1), trsp);
+	return torture_hrtimeout_ns(baset_ns, jiffies_to_nsecs(1), HRTIMER_MODE_REL, trsp);
 }
 EXPORT_SYMBOL_GPL(torture_hrtimeout_jiffies);
 
@@ -153,7 +154,7 @@ int torture_hrtimeout_s(u32 baset_s, u32 fuzzt_ms, struct torture_random_state *
 		fuzzt_ns = (u32)~0U;
 	else
 		fuzzt_ns = fuzzt_ms * NSEC_PER_MSEC;
-	return torture_hrtimeout_ns(baset_ns, fuzzt_ns, trsp);
+	return torture_hrtimeout_ns(baset_ns, fuzzt_ns, HRTIMER_MODE_REL, trsp);
 }
 EXPORT_SYMBOL_GPL(torture_hrtimeout_s);
 
-- 
cgit v1.2.3


From 0cfecd7d754f2ef5d7e6b56ee656a8544ade920a Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@kernel.org>
Date: Mon, 31 Jul 2023 13:10:34 -0700
Subject: torture: Move rcutorture_sched_setaffinity() out of rcutorture

The rcutorture_sched_setaffinity() function is needed by locktorture,
so move its declaration from rcu.h to torture.h and rename it to the
more generic torture_sched_setaffinity() name.

Please note that use of this function is still restricted to torture
tests, and of those, currently only rcutorture and locktorture.

Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
Signed-off-by: Frederic Weisbecker <frederic@kernel.org>
---
 include/linux/torture.h | 5 +++++
 kernel/rcu/rcu.h        | 4 ----
 kernel/rcu/rcutorture.c | 2 +-
 kernel/rcu/update.c     | 8 ++++----
 4 files changed, 10 insertions(+), 9 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/torture.h b/include/linux/torture.h
index 017f0f710815..c98d0c83d117 100644
--- a/include/linux/torture.h
+++ b/include/linux/torture.h
@@ -121,10 +121,15 @@ void _torture_stop_kthread(char *m, struct task_struct **tp);
 #define torture_stop_kthread(n, tp) \
 	_torture_stop_kthread("Stopping " #n " task", &(tp))
 
+/* Scheduler-related definitions. */
 #ifdef CONFIG_PREEMPTION
 #define torture_preempt_schedule() __preempt_schedule()
 #else
 #define torture_preempt_schedule()	do { } while (0)
 #endif
 
+#if IS_ENABLED(CONFIG_RCU_TORTURE_TEST) || IS_MODULE(CONFIG_RCU_TORTURE_TEST) || IS_ENABLED(CONFIG_LOCK_TORTURE_TEST) || IS_MODULE(CONFIG_LOCK_TORTURE_TEST)
+long torture_sched_setaffinity(pid_t pid, const struct cpumask *in_mask);
+#endif
+
 #endif /* __LINUX_TORTURE_H */
diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h
index 98e13be411af..567bd3d72e39 100644
--- a/kernel/rcu/rcu.h
+++ b/kernel/rcu/rcu.h
@@ -568,10 +568,6 @@ void do_trace_rcu_torture_read(const char *rcutorturename,
 static inline void rcu_gp_set_torture_wait(int duration) { }
 #endif
 
-#if IS_ENABLED(CONFIG_RCU_TORTURE_TEST) || IS_MODULE(CONFIG_RCU_TORTURE_TEST)
-long rcutorture_sched_setaffinity(pid_t pid, const struct cpumask *in_mask);
-#endif
-
 #ifdef CONFIG_TINY_SRCU
 
 static inline void srcutorture_get_gp_data(enum rcutorture_type test_type,
diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c
index ade42d6a9d9b..7e82fb887d09 100644
--- a/kernel/rcu/rcutorture.c
+++ b/kernel/rcu/rcutorture.c
@@ -810,7 +810,7 @@ static void synchronize_rcu_trivial(void)
 	int cpu;
 
 	for_each_online_cpu(cpu) {
-		rcutorture_sched_setaffinity(current->pid, cpumask_of(cpu));
+		torture_sched_setaffinity(current->pid, cpumask_of(cpu));
 		WARN_ON_ONCE(raw_smp_processor_id() != cpu);
 	}
 }
diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c
index 9d3c2e6ba667..c534d6806d3d 100644
--- a/kernel/rcu/update.c
+++ b/kernel/rcu/update.c
@@ -525,17 +525,17 @@ EXPORT_SYMBOL_GPL(do_trace_rcu_torture_read);
 	do { } while (0)
 #endif
 
-#if IS_ENABLED(CONFIG_RCU_TORTURE_TEST) || IS_MODULE(CONFIG_RCU_TORTURE_TEST)
+#if IS_ENABLED(CONFIG_RCU_TORTURE_TEST) || IS_MODULE(CONFIG_RCU_TORTURE_TEST) || IS_ENABLED(CONFIG_LOCK_TORTURE_TEST) || IS_MODULE(CONFIG_LOCK_TORTURE_TEST)
 /* Get rcutorture access to sched_setaffinity(). */
-long rcutorture_sched_setaffinity(pid_t pid, const struct cpumask *in_mask)
+long torture_sched_setaffinity(pid_t pid, const struct cpumask *in_mask)
 {
 	int ret;
 
 	ret = sched_setaffinity(pid, in_mask);
-	WARN_ONCE(ret, "%s: sched_setaffinity() returned %d\n", __func__, ret);
+	WARN_ONCE(ret, "%s: sched_setaffinity(%d) returned %d\n", __func__, pid, ret);
 	return ret;
 }
-EXPORT_SYMBOL_GPL(rcutorture_sched_setaffinity);
+EXPORT_SYMBOL_GPL(torture_sched_setaffinity);
 #endif
 
 #ifdef CONFIG_RCU_STALL_COMMON
-- 
cgit v1.2.3


From 5970fbad1036d1015abe45651628b39b5bcb8a22 Mon Sep 17 00:00:00 2001
From: Eric Biggers <ebiggers@google.com>
Date: Sun, 24 Sep 2023 22:54:47 -0700
Subject: fscrypt: make it clearer that key_prefix is deprecated

fscrypt_operations::key_prefix should not be set by any filesystems that
aren't setting it already.  This is already documented, but apparently
it's not sufficiently clear, as both ceph and btrfs have tried to set
it.  Rename the field to legacy_key_prefix and improve the documentation
to hopefully make it clearer.

Link: https://lore.kernel.org/r/20230925055451.59499-2-ebiggers@kernel.org
Signed-off-by: Eric Biggers <ebiggers@google.com>
---
 fs/crypto/keysetup_v1.c |  5 +++--
 fs/ext4/crypto.c        |  2 +-
 fs/f2fs/super.c         |  2 +-
 fs/ubifs/crypto.c       |  2 +-
 include/linux/fscrypt.h | 14 +++++++++-----
 5 files changed, 15 insertions(+), 10 deletions(-)

(limited to 'include/linux')

diff --git a/fs/crypto/keysetup_v1.c b/fs/crypto/keysetup_v1.c
index 75dabd9b27f9..86b48a2b47d1 100644
--- a/fs/crypto/keysetup_v1.c
+++ b/fs/crypto/keysetup_v1.c
@@ -299,6 +299,7 @@ int fscrypt_setup_v1_file_key(struct fscrypt_info *ci, const u8 *raw_master_key)
 
 int fscrypt_setup_v1_file_key_via_subscribed_keyrings(struct fscrypt_info *ci)
 {
+	const struct super_block *sb = ci->ci_inode->i_sb;
 	struct key *key;
 	const struct fscrypt_key *payload;
 	int err;
@@ -306,8 +307,8 @@ int fscrypt_setup_v1_file_key_via_subscribed_keyrings(struct fscrypt_info *ci)
 	key = find_and_lock_process_key(FSCRYPT_KEY_DESC_PREFIX,
 					ci->ci_policy.v1.master_key_descriptor,
 					ci->ci_mode->keysize, &payload);
-	if (key == ERR_PTR(-ENOKEY) && ci->ci_inode->i_sb->s_cop->key_prefix) {
-		key = find_and_lock_process_key(ci->ci_inode->i_sb->s_cop->key_prefix,
+	if (key == ERR_PTR(-ENOKEY) && sb->s_cop->legacy_key_prefix) {
+		key = find_and_lock_process_key(sb->s_cop->legacy_key_prefix,
 						ci->ci_policy.v1.master_key_descriptor,
 						ci->ci_mode->keysize, &payload);
 	}
diff --git a/fs/ext4/crypto.c b/fs/ext4/crypto.c
index 453d4da5de52..99a4769a53f6 100644
--- a/fs/ext4/crypto.c
+++ b/fs/ext4/crypto.c
@@ -240,7 +240,7 @@ static void ext4_get_ino_and_lblk_bits(struct super_block *sb,
 }
 
 const struct fscrypt_operations ext4_cryptops = {
-	.key_prefix		= "ext4:",
+	.legacy_key_prefix	= "ext4:",
 	.get_context		= ext4_get_context,
 	.set_context		= ext4_set_context,
 	.get_dummy_policy	= ext4_get_dummy_policy,
diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c
index a8c8232852bb..f60062b558fd 100644
--- a/fs/f2fs/super.c
+++ b/fs/f2fs/super.c
@@ -3231,7 +3231,7 @@ static struct block_device **f2fs_get_devices(struct super_block *sb,
 }
 
 static const struct fscrypt_operations f2fs_cryptops = {
-	.key_prefix		= "f2fs:",
+	.legacy_key_prefix	= "f2fs:",
 	.get_context		= f2fs_get_context,
 	.set_context		= f2fs_set_context,
 	.get_dummy_policy	= f2fs_get_dummy_policy,
diff --git a/fs/ubifs/crypto.c b/fs/ubifs/crypto.c
index 3125e76376ee..1be3e11da3b3 100644
--- a/fs/ubifs/crypto.c
+++ b/fs/ubifs/crypto.c
@@ -89,7 +89,7 @@ int ubifs_decrypt(const struct inode *inode, struct ubifs_data_node *dn,
 
 const struct fscrypt_operations ubifs_crypt_operations = {
 	.flags			= FS_CFLG_OWN_PAGES,
-	.key_prefix		= "ubifs:",
+	.legacy_key_prefix	= "ubifs:",
 	.get_context		= ubifs_crypt_get_context,
 	.set_context		= ubifs_crypt_set_context,
 	.empty_dir		= ubifs_crypt_empty_dir,
diff --git a/include/linux/fscrypt.h b/include/linux/fscrypt.h
index c895b12737a1..b0037566ce30 100644
--- a/include/linux/fscrypt.h
+++ b/include/linux/fscrypt.h
@@ -73,12 +73,16 @@ struct fscrypt_operations {
 	unsigned int flags;
 
 	/*
-	 * If set, this is a filesystem-specific key description prefix that
-	 * will be accepted for "logon" keys for v1 fscrypt policies, in
-	 * addition to the generic prefix "fscrypt:".  This functionality is
-	 * deprecated, so new filesystems shouldn't set this field.
+	 * This field exists only for backwards compatibility reasons and should
+	 * only be set by the filesystems that are setting it already.  It
+	 * contains the filesystem-specific key description prefix that is
+	 * accepted for "logon" keys for v1 fscrypt policies.  This
+	 * functionality is deprecated in favor of the generic prefix
+	 * "fscrypt:", which itself is deprecated in favor of the filesystem
+	 * keyring ioctls such as FS_IOC_ADD_ENCRYPTION_KEY.  Filesystems that
+	 * are newly adding fscrypt support should not set this field.
 	 */
-	const char *key_prefix;
+	const char *legacy_key_prefix;
 
 	/*
 	 * Get the fscrypt context of the given inode.
-- 
cgit v1.2.3


From 40e13e18168fd0f1a6ad10166f5042a21c47ab99 Mon Sep 17 00:00:00 2001
From: Eric Biggers <ebiggers@google.com>
Date: Sun, 24 Sep 2023 22:54:48 -0700
Subject: fscrypt: make the bounce page pool opt-in instead of opt-out

Replace FS_CFLG_OWN_PAGES with a bit flag 'needs_bounce_pages' which has
the opposite meaning.  I.e., filesystems now opt into the bounce page
pool instead of opt out.  Make fscrypt_alloc_bounce_page() check that
the bounce page pool has been initialized.

I believe the opt-in makes more sense, since nothing else in
fscrypt_operations is opt-out, and these days filesystems can choose to
use blk-crypto which doesn't need the fscrypt bounce page pool.  Also, I
happen to be planning to add two more flags, and I wanted to fix the
"FS_CFLG_" name anyway as it wasn't prefixed with "FSCRYPT_".

Link: https://lore.kernel.org/r/20230925055451.59499-3-ebiggers@kernel.org
Signed-off-by: Eric Biggers <ebiggers@google.com>
---
 fs/ceph/crypto.c        |  1 +
 fs/crypto/crypto.c      |  9 ++++++++-
 fs/ext4/crypto.c        |  1 +
 fs/f2fs/super.c         |  1 +
 fs/ubifs/crypto.c       |  1 -
 include/linux/fscrypt.h | 20 +++++++++++---------
 6 files changed, 22 insertions(+), 11 deletions(-)

(limited to 'include/linux')

diff --git a/fs/ceph/crypto.c b/fs/ceph/crypto.c
index e4d5cd56a80b..cc63f1e6fdef 100644
--- a/fs/ceph/crypto.c
+++ b/fs/ceph/crypto.c
@@ -133,6 +133,7 @@ static const union fscrypt_policy *ceph_get_dummy_policy(struct super_block *sb)
 }
 
 static struct fscrypt_operations ceph_fscrypt_ops = {
+	.needs_bounce_pages	= 1,
 	.get_context		= ceph_crypt_get_context,
 	.set_context		= ceph_crypt_set_context,
 	.get_dummy_policy	= ceph_get_dummy_policy,
diff --git a/fs/crypto/crypto.c b/fs/crypto/crypto.c
index 6a837e4b80dc..aed0c5ea7578 100644
--- a/fs/crypto/crypto.c
+++ b/fs/crypto/crypto.c
@@ -49,6 +49,13 @@ EXPORT_SYMBOL(fscrypt_enqueue_decrypt_work);
 
 struct page *fscrypt_alloc_bounce_page(gfp_t gfp_flags)
 {
+	if (WARN_ON_ONCE(!fscrypt_bounce_page_pool)) {
+		/*
+		 * Oops, the filesystem called a function that uses the bounce
+		 * page pool, but it didn't set needs_bounce_pages.
+		 */
+		return NULL;
+	}
 	return mempool_alloc(fscrypt_bounce_page_pool, gfp_flags);
 }
 
@@ -325,7 +332,7 @@ int fscrypt_initialize(struct super_block *sb)
 		return 0;
 
 	/* No need to allocate a bounce page pool if this FS won't use it. */
-	if (sb->s_cop->flags & FS_CFLG_OWN_PAGES)
+	if (!sb->s_cop->needs_bounce_pages)
 		return 0;
 
 	mutex_lock(&fscrypt_init_mutex);
diff --git a/fs/ext4/crypto.c b/fs/ext4/crypto.c
index 99a4769a53f6..5cd7bcfae46b 100644
--- a/fs/ext4/crypto.c
+++ b/fs/ext4/crypto.c
@@ -240,6 +240,7 @@ static void ext4_get_ino_and_lblk_bits(struct super_block *sb,
 }
 
 const struct fscrypt_operations ext4_cryptops = {
+	.needs_bounce_pages	= 1,
 	.legacy_key_prefix	= "ext4:",
 	.get_context		= ext4_get_context,
 	.set_context		= ext4_set_context,
diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c
index f60062b558fd..55aa0ed531f2 100644
--- a/fs/f2fs/super.c
+++ b/fs/f2fs/super.c
@@ -3231,6 +3231,7 @@ static struct block_device **f2fs_get_devices(struct super_block *sb,
 }
 
 static const struct fscrypt_operations f2fs_cryptops = {
+	.needs_bounce_pages	= 1,
 	.legacy_key_prefix	= "f2fs:",
 	.get_context		= f2fs_get_context,
 	.set_context		= f2fs_set_context,
diff --git a/fs/ubifs/crypto.c b/fs/ubifs/crypto.c
index 1be3e11da3b3..921f9033d0d2 100644
--- a/fs/ubifs/crypto.c
+++ b/fs/ubifs/crypto.c
@@ -88,7 +88,6 @@ int ubifs_decrypt(const struct inode *inode, struct ubifs_data_node *dn,
 }
 
 const struct fscrypt_operations ubifs_crypt_operations = {
-	.flags			= FS_CFLG_OWN_PAGES,
 	.legacy_key_prefix	= "ubifs:",
 	.get_context		= ubifs_crypt_get_context,
 	.set_context		= ubifs_crypt_set_context,
diff --git a/include/linux/fscrypt.h b/include/linux/fscrypt.h
index b0037566ce30..4505078e89b7 100644
--- a/include/linux/fscrypt.h
+++ b/include/linux/fscrypt.h
@@ -59,18 +59,20 @@ struct fscrypt_name {
 
 #ifdef CONFIG_FS_ENCRYPTION
 
-/*
- * If set, the fscrypt bounce page pool won't be allocated (unless another
- * filesystem needs it).  Set this if the filesystem always uses its own bounce
- * pages for writes and therefore won't need the fscrypt bounce page pool.
- */
-#define FS_CFLG_OWN_PAGES (1U << 1)
-
 /* Crypto operations for filesystems */
 struct fscrypt_operations {
 
-	/* Set of optional flags; see above for allowed flags */
-	unsigned int flags;
+	/*
+	 * If set, then fs/crypto/ will allocate a global bounce page pool the
+	 * first time an encryption key is set up for a file.  The bounce page
+	 * pool is required by the following functions:
+	 *
+	 * - fscrypt_encrypt_pagecache_blocks()
+	 * - fscrypt_zeroout_range() for files not using inline crypto
+	 *
+	 * If the filesystem doesn't use those, it doesn't need to set this.
+	 */
+	unsigned int needs_bounce_pages : 1;
 
 	/*
 	 * This field exists only for backwards compatibility reasons and should
-- 
cgit v1.2.3


From 753a4d531bc518633ea88ac0ed02b25a16823d51 Mon Sep 17 00:00:00 2001
From: Matthias Schiffer <mschiffer@universe-factory.net>
Date: Fri, 22 Sep 2023 22:55:16 +0200
Subject: ata: libata-sata: increase PMP SRST timeout to 10s

On certain SATA controllers, softreset fails after wakeup from S2RAM with
the message "softreset failed (1st FIS failed)", sometimes resulting in
drives not being detected again. With the increased timeout, this issue
is avoided. Instead, "softreset failed (device not ready)" is now
logged 1-2 times; this later failure seems to cause fewer problems
however, and the drives are detected reliably once they've spun up and
the probe is retried.

The issue was observed with the primary SATA controller of the QNAP
TS-453B, which is an "Intel Corporation Celeron/Pentium Silver Processor
SATA Controller [8086:31e3] (rev 06)" integrated in the Celeron J4125 CPU,
and the following drives:

- Seagate IronWolf ST12000VN0008
- Seagate IronWolf ST8000NE0004

The SATA controller seems to be more relevant to this issue than the
drives, as the same drives are always detected reliably on the secondary
SATA controller on the same board (an ASMedia 106x) without any "softreset
failed" errors even without the increased timeout.

Fixes: e7d3ef13d52a ("libata: change drive ready wait after hard reset to 5s")
Cc: stable@vger.kernel.org
Signed-off-by: Matthias Schiffer <mschiffer@universe-factory.net>
Signed-off-by: Damien Le Moal <dlemoal@kernel.org>
---
 include/linux/libata.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/libata.h b/include/linux/libata.h
index bf4913f4d7ac..84aca8c44fa3 100644
--- a/include/linux/libata.h
+++ b/include/linux/libata.h
@@ -259,7 +259,7 @@ enum {
 	 * advised to wait only for the following duration before
 	 * doing SRST.
 	 */
-	ATA_TMOUT_PMP_SRST_WAIT	= 5000,
+	ATA_TMOUT_PMP_SRST_WAIT	= 10000,
 
 	/* When the LPM policy is set to ATA_LPM_MAX_POWER, there might
 	 * be a spurious PHY event, so ignore the first PHY event that
-- 
cgit v1.2.3


From b3239498353484fd6ddeb513df89c4628cd623d0 Mon Sep 17 00:00:00 2001
From: Johannes Berg <johannes.berg@intel.com>
Date: Wed, 20 Sep 2023 21:25:12 +0300
Subject: wifi: mac80211: use bandwidth indication element for CSA

In CSA, parse the (EHT) bandwidth indication element and
use it (in fact prefer it if present).

Signed-off-by: Johannes Berg <johannes.berg@intel.com>
Signed-off-by: Gregory Greenman <gregory.greenman@intel.com>
Link: https://lore.kernel.org/r/20230920211508.43ef01920556.If4f24a61cd634ab1e50eba43899b9e992bf25602@changeid
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/linux/ieee80211.h  | 23 +++++++++++++++++++++++
 net/mac80211/ieee80211_i.h |  3 ++-
 net/mac80211/mlme.c        |  5 +++--
 net/mac80211/spectmgmt.c   | 13 +++++++++++--
 net/mac80211/util.c        | 42 +++++++++++++++++++++++++++++-------------
 5 files changed, 68 insertions(+), 18 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h
index 340d7e0f6bf7..f11b7022d9eb 100644
--- a/include/linux/ieee80211.h
+++ b/include/linux/ieee80211.h
@@ -3139,6 +3139,28 @@ ieee80211_eht_oper_size_ok(const u8 *data, u8 len)
 	return len >= needed;
 }
 
+#define IEEE80211_BW_IND_DIS_SUBCH_PRESENT	BIT(1)
+
+struct ieee80211_bandwidth_indication {
+	u8 params;
+	struct ieee80211_eht_operation_info info;
+} __packed;
+
+static inline bool
+ieee80211_bandwidth_indication_size_ok(const u8 *data, u8 len)
+{
+	const struct ieee80211_bandwidth_indication *bwi = (const void *)data;
+
+	if (len < sizeof(*bwi))
+		return false;
+
+	if (bwi->params & IEEE80211_BW_IND_DIS_SUBCH_PRESENT &&
+	    len < sizeof(*bwi) + 2)
+		return false;
+
+	return true;
+}
+
 #define LISTEN_INT_USF	GENMASK(15, 14)
 #define LISTEN_INT_UI	GENMASK(13, 0)
 
@@ -3596,6 +3618,7 @@ enum ieee80211_eid_ext {
 	WLAN_EID_EXT_EHT_OPERATION = 106,
 	WLAN_EID_EXT_EHT_MULTI_LINK = 107,
 	WLAN_EID_EXT_EHT_CAPABILITY = 108,
+	WLAN_EID_EXT_BANDWIDTH_INDICATION = 135,
 };
 
 /* Action category code */
diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h
index d5c5f865323c..e7856336b5c6 100644
--- a/net/mac80211/ieee80211_i.h
+++ b/net/mac80211/ieee80211_i.h
@@ -1677,6 +1677,7 @@ struct ieee802_11_elems {
 	const struct ieee80211_eht_operation *eht_operation;
 	const struct ieee80211_multi_link_elem *ml_basic;
 	const struct ieee80211_multi_link_elem *ml_reconf;
+	const struct ieee80211_bandwidth_indication *bandwidth_indication;
 
 	/* length of them, respectively */
 	u8 ext_capab_len;
@@ -2463,7 +2464,7 @@ bool ieee80211_chandef_vht_oper(struct ieee80211_hw *hw, u32 vht_cap_info,
 				const struct ieee80211_vht_operation *oper,
 				const struct ieee80211_ht_operation *htop,
 				struct cfg80211_chan_def *chandef);
-void ieee80211_chandef_eht_oper(const struct ieee80211_eht_operation *eht_oper,
+void ieee80211_chandef_eht_oper(const struct ieee80211_eht_operation_info *info,
 				bool support_160, bool support_320,
 				struct cfg80211_chan_def *chandef);
 bool ieee80211_chandef_he_6ghz_oper(struct ieee80211_sub_if_data *sdata,
diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c
index e8f16ed235c3..a211f594f25a 100644
--- a/net/mac80211/mlme.c
+++ b/net/mac80211/mlme.c
@@ -109,7 +109,8 @@ ieee80211_extract_dis_subch_bmap(const struct ieee80211_eht_operation *eht_oper,
 		return 0;
 
 	/* set 160/320 supported to get the full AP definition */
-	ieee80211_chandef_eht_oper(eht_oper, true, true, &ap_chandef);
+	ieee80211_chandef_eht_oper((const void *)eht_oper->optional,
+				   true, true, &ap_chandef);
 	ap_center_freq = ap_chandef.center_freq1;
 	ap_bw = 20 * BIT(u8_get_bits(info->control,
 				     IEEE80211_EHT_OPER_CHAN_WIDTH));
@@ -387,7 +388,7 @@ ieee80211_determine_chantype(struct ieee80211_sub_if_data *sdata,
 	if (eht_oper && (eht_oper->params & IEEE80211_EHT_OPER_INFO_PRESENT)) {
 		struct cfg80211_chan_def eht_chandef = *chandef;
 
-		ieee80211_chandef_eht_oper(eht_oper,
+		ieee80211_chandef_eht_oper((const void *)eht_oper->optional,
 					   eht_chandef.width ==
 					   NL80211_CHAN_WIDTH_160,
 					   false, &eht_chandef);
diff --git a/net/mac80211/spectmgmt.c b/net/mac80211/spectmgmt.c
index 871cdac2d0f4..55959b0b24c5 100644
--- a/net/mac80211/spectmgmt.c
+++ b/net/mac80211/spectmgmt.c
@@ -9,7 +9,7 @@
  * Copyright 2007, Michael Wu <flamingice@sourmilk.net>
  * Copyright 2007-2008, Intel Corporation
  * Copyright 2008, Johannes Berg <johannes@sipsolutions.net>
- * Copyright (C) 2018, 2020, 2022 Intel Corporation
+ * Copyright (C) 2018, 2020, 2022-2023 Intel Corporation
  */
 
 #include <linux/ieee80211.h>
@@ -33,12 +33,14 @@ int ieee80211_parse_ch_switch_ie(struct ieee80211_sub_if_data *sdata,
 	struct cfg80211_chan_def new_vht_chandef = {};
 	const struct ieee80211_sec_chan_offs_ie *sec_chan_offs;
 	const struct ieee80211_wide_bw_chansw_ie *wide_bw_chansw_ie;
+	const struct ieee80211_bandwidth_indication *bwi;
 	int secondary_channel_offset = -1;
 
 	memset(csa_ie, 0, sizeof(*csa_ie));
 
 	sec_chan_offs = elems->sec_chan_offs;
 	wide_bw_chansw_ie = elems->wide_bw_chansw_ie;
+	bwi = elems->bandwidth_indication;
 
 	if (conn_flags & (IEEE80211_CONN_DISABLE_HT |
 			  IEEE80211_CONN_DISABLE_40MHZ)) {
@@ -132,7 +134,14 @@ int ieee80211_parse_ch_switch_ie(struct ieee80211_sub_if_data *sdata,
 		break;
 	}
 
-	if (wide_bw_chansw_ie) {
+	if (bwi) {
+		/* start with the CSA one */
+		new_vht_chandef = csa_ie->chandef;
+		/* and update the width accordingly */
+		/* FIXME: support 160/320 */
+		ieee80211_chandef_eht_oper(&bwi->info, true, true,
+					   &new_vht_chandef);
+	} else if (wide_bw_chansw_ie) {
 		u8 new_seg1 = wide_bw_chansw_ie->new_center_freq_seg1;
 		struct ieee80211_vht_operation vht_oper = {
 			.chan_width =
diff --git a/net/mac80211/util.c b/net/mac80211/util.c
index 88f714a75862..a1e18938ce52 100644
--- a/net/mac80211/util.c
+++ b/net/mac80211/util.c
@@ -990,6 +990,11 @@ ieee80211_parse_extension_element(u32 *crc,
 			}
 		}
 		break;
+	case WLAN_EID_EXT_BANDWIDTH_INDICATION:
+		if (ieee80211_bandwidth_indication_size_ok(data, len))
+			elems->bandwidth_indication = data;
+		calc_crc = true;
+		break;
 	}
 
 	if (crc && calc_crc)
@@ -1005,11 +1010,11 @@ _ieee802_11_parse_elems_full(struct ieee80211_elems_parse_params *params,
 	bool calc_crc = params->filter != 0;
 	DECLARE_BITMAP(seen_elems, 256);
 	u32 crc = params->crc;
-	const u8 *ie;
 
 	bitmap_zero(seen_elems, 256);
 
 	for_each_element(elem, params->start, params->len) {
+		const struct element *subelem;
 		bool elem_parse_failed;
 		u8 id = elem->id;
 		u8 elen = elem->datalen;
@@ -1267,15 +1272,27 @@ _ieee802_11_parse_elems_full(struct ieee80211_elems_parse_params *params,
 			}
 			/*
 			 * This is a bit tricky, but as we only care about
-			 * the wide bandwidth channel switch element, so
-			 * just parse it out manually.
+			 * a few elements, parse them out manually.
 			 */
-			ie = cfg80211_find_ie(WLAN_EID_WIDE_BW_CHANNEL_SWITCH,
-					      pos, elen);
-			if (ie) {
-				if (ie[1] >= sizeof(*elems->wide_bw_chansw_ie))
+			subelem = cfg80211_find_elem(WLAN_EID_WIDE_BW_CHANNEL_SWITCH,
+						     pos, elen);
+			if (subelem) {
+				if (subelem->datalen >= sizeof(*elems->wide_bw_chansw_ie))
 					elems->wide_bw_chansw_ie =
-						(void *)(ie + 2);
+						(void *)subelem->data;
+				else
+					elem_parse_failed = true;
+			}
+
+			subelem = cfg80211_find_ext_elem(WLAN_EID_EXT_BANDWIDTH_INDICATION,
+							 pos, elen);
+			if (subelem) {
+				const void *edata = subelem->data + 1;
+				u8 edatalen = subelem->datalen - 1;
+
+				if (ieee80211_bandwidth_indication_size_ok(edata,
+									   edatalen))
+					elems->bandwidth_indication = edata;
 				else
 					elem_parse_failed = true;
 			}
@@ -3746,12 +3763,10 @@ bool ieee80211_chandef_vht_oper(struct ieee80211_hw *hw, u32 vht_cap_info,
 	return true;
 }
 
-void ieee80211_chandef_eht_oper(const struct ieee80211_eht_operation *eht_oper,
+void ieee80211_chandef_eht_oper(const struct ieee80211_eht_operation_info *info,
 				bool support_160, bool support_320,
 				struct cfg80211_chan_def *chandef)
 {
-	struct ieee80211_eht_operation_info *info = (void *)eht_oper->optional;
-
 	chandef->center_freq1 =
 		ieee80211_channel_to_frequency(info->ccfs0,
 					       chandef->chan->band);
@@ -3920,8 +3935,9 @@ bool ieee80211_chandef_he_6ghz_oper(struct ieee80211_sub_if_data *sdata,
 		support_320 =
 			eht_phy_cap & IEEE80211_EHT_PHY_CAP0_320MHZ_IN_6GHZ;
 
-		ieee80211_chandef_eht_oper(eht_oper, support_160,
-					   support_320, &he_chandef);
+		ieee80211_chandef_eht_oper((const void *)eht_oper->optional,
+					   support_160, support_320,
+					   &he_chandef);
 	}
 
 	if (!cfg80211_chandef_valid(&he_chandef)) {
-- 
cgit v1.2.3


From 62e9c64eedfeb697ba28081ccaac59a45f9a96e1 Mon Sep 17 00:00:00 2001
From: Ayala Beker <ayala.beker@intel.com>
Date: Wed, 20 Sep 2023 21:25:24 +0300
Subject: wifi: mac80211: add support for parsing TID to Link mapping element

Add the relevant definitions for TID to Link mapping element
according to the P802.11be_D4.0.

Signed-off-by: Ayala Beker <ayala.beker@intel.com>
Signed-off-by: Gregory Greenman <gregory.greenman@intel.com>
Link: https://lore.kernel.org/r/20230920211508.9ea9b0b4412a.I2281ab2c70e8b43a39032dc115db6a80f1f0b3f4@changeid
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/linux/ieee80211.h  | 58 ++++++++++++++++++++++++++++++++++++++++++++++
 net/mac80211/ieee80211_i.h |  3 +++
 net/mac80211/util.c        |  8 +++++++
 3 files changed, 69 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h
index f11b7022d9eb..f2965ff3d7c1 100644
--- a/include/linux/ieee80211.h
+++ b/include/linux/ieee80211.h
@@ -1246,6 +1246,30 @@ struct ieee80211_twt_setup {
 	u8 params[];
 } __packed;
 
+#define IEEE80211_TTLM_MAX_CNT				2
+#define IEEE80211_TTLM_CONTROL_DIRECTION		0x03
+#define IEEE80211_TTLM_CONTROL_DEF_LINK_MAP		0x04
+#define IEEE80211_TTLM_CONTROL_SWITCH_TIME_PRESENT	0x08
+#define IEEE80211_TTLM_CONTROL_EXPECTED_DUR_PRESENT	0x10
+#define IEEE80211_TTLM_CONTROL_LINK_MAP_SIZE		0x20
+
+#define IEEE80211_TTLM_DIRECTION_DOWN		0
+#define IEEE80211_TTLM_DIRECTION_UP		1
+#define IEEE80211_TTLM_DIRECTION_BOTH		2
+
+/**
+ * struct ieee80211_ttlm_elem - TID-To-Link Mapping element
+ *
+ * Defined in section 9.4.2.314 in P802.11be_D4
+ *
+ * @control: the first part of control field
+ * @optional: the second part of control field
+ */
+struct ieee80211_ttlm_elem {
+	u8 control;
+	u8 optional[];
+} __packed;
+
 struct ieee80211_mgmt {
 	__le16 frame_control;
 	__le16 duration;
@@ -3618,6 +3642,7 @@ enum ieee80211_eid_ext {
 	WLAN_EID_EXT_EHT_OPERATION = 106,
 	WLAN_EID_EXT_EHT_MULTI_LINK = 107,
 	WLAN_EID_EXT_EHT_CAPABILITY = 108,
+	WLAN_EID_EXT_TID_TO_LINK_MAPPING = 109,
 	WLAN_EID_EXT_BANDWIDTH_INDICATION = 135,
 };
 
@@ -5155,6 +5180,39 @@ static inline bool ieee80211_mle_reconf_sta_prof_size_ok(const u8 *data,
 	       fixed + prof->sta_info_len - 1 <= len;
 }
 
+static inline bool ieee80211_tid_to_link_map_size_ok(const u8 *data, size_t len)
+{
+	const struct ieee80211_ttlm_elem *t2l = (const void *)data;
+	u8 control, fixed = sizeof(*t2l), elem_len = 0;
+
+	if (len < fixed)
+		return false;
+
+	control = t2l->control;
+
+	if (control & IEEE80211_TTLM_CONTROL_SWITCH_TIME_PRESENT)
+		elem_len += 2;
+	if (control & IEEE80211_TTLM_CONTROL_EXPECTED_DUR_PRESENT)
+		elem_len += 3;
+
+	if (!(control & IEEE80211_TTLM_CONTROL_DEF_LINK_MAP)) {
+		u8 bm_size;
+
+		elem_len += 1;
+		if (len < fixed + elem_len)
+			return false;
+
+		if (control & IEEE80211_TTLM_CONTROL_LINK_MAP_SIZE)
+			bm_size = 1;
+		else
+			bm_size = 2;
+
+		elem_len += hweight8(t2l->optional[0]) * bm_size;
+	}
+
+	return len >= fixed + elem_len;
+}
+
 #define for_each_mle_subelement(_elem, _data, _len)			\
 	if (ieee80211_mle_size_ok(_data, _len))				\
 		for_each_element(_elem,					\
diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h
index e7856336b5c6..d1a73095c914 100644
--- a/net/mac80211/ieee80211_i.h
+++ b/net/mac80211/ieee80211_i.h
@@ -1678,6 +1678,7 @@ struct ieee802_11_elems {
 	const struct ieee80211_multi_link_elem *ml_basic;
 	const struct ieee80211_multi_link_elem *ml_reconf;
 	const struct ieee80211_bandwidth_indication *bandwidth_indication;
+	const struct ieee80211_ttlm_elem *ttlm[IEEE80211_TTLM_MAX_CNT];
 
 	/* length of them, respectively */
 	u8 ext_capab_len;
@@ -1711,6 +1712,8 @@ struct ieee802_11_elems {
 	/* The reconfiguration Multi-Link element in the original IEs */
 	const struct element *ml_reconf_elem;
 
+	u8 ttlm_num;
+
 	/*
 	 * store the per station profile pointer and length in case that the
 	 * parsing also handled Multi-Link element parsing for a specific link
diff --git a/net/mac80211/util.c b/net/mac80211/util.c
index 97c5823da0eb..98a3bffc6991 100644
--- a/net/mac80211/util.c
+++ b/net/mac80211/util.c
@@ -995,6 +995,14 @@ ieee80211_parse_extension_element(u32 *crc,
 			elems->bandwidth_indication = data;
 		calc_crc = true;
 		break;
+	case WLAN_EID_EXT_TID_TO_LINK_MAPPING:
+		calc_crc = true;
+		if (ieee80211_tid_to_link_map_size_ok(data, len) &&
+		    elems->ttlm_num < ARRAY_SIZE(elems->ttlm)) {
+			elems->ttlm[elems->ttlm_num] = (void *)data;
+			elems->ttlm_num++;
+		}
+		break;
 	}
 
 	if (crc && calc_crc)
-- 
cgit v1.2.3


From ef246a1480cc484cd2aeda75737cb0848616ddf3 Mon Sep 17 00:00:00 2001
From: Johannes Berg <johannes.berg@intel.com>
Date: Wed, 20 Sep 2023 21:25:26 +0300
Subject: wifi: mac80211: support antenna control in injection

Support antenna control for injection by parsing the antenna
radiotap field (which may be presented multiple times) and
telling the driver about the resulting antenna bitmap. Of
course there's no guarantee the driver will actually honour
this, just like any other injection control.

If misconfigured, i.e. the injected HT/VHT MCS needs more
chains than antennas are configured, the bitmap is reset to
zero, indicating no selection.

For now this is only set up for two anntenas so we keep more
free bits, but that can be trivially extended if any driver
implements support for it that can deal with hardware with
more antennas.

Signed-off-by: Johannes Berg <johannes.berg@intel.com>
Signed-off-by: Gregory Greenman <gregory.greenman@intel.com>
Link: https://lore.kernel.org/r/20230920211508.f71001aa4da9.I00ccb762a806ea62bc3d728fa3a0d29f4f285eeb@changeid
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/linux/ieee80211.h |  2 ++
 include/net/mac80211.h    |  6 +++++-
 net/mac80211/tx.c         | 14 ++++++++++++++
 3 files changed, 21 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h
index f2965ff3d7c1..3b02f038d509 100644
--- a/include/linux/ieee80211.h
+++ b/include/linux/ieee80211.h
@@ -1705,6 +1705,8 @@ struct ieee80211_mcs_info {
 #define		IEEE80211_HT_MCS_TX_MAX_STREAMS	4
 #define IEEE80211_HT_MCS_TX_UNEQUAL_MODULATION	0x10
 
+#define IEEE80211_HT_MCS_CHAINS(mcs) ((mcs) == 32 ? 1 : (1 + ((mcs) >> 3)))
+
 /*
  * 802.11n D5.0 20.3.5 / 20.6 says:
  * - indices 0 to 7 and 32 are single spatial stream
diff --git a/include/net/mac80211.h b/include/net/mac80211.h
index d4ef2a605cb4..72375eceb786 100644
--- a/include/net/mac80211.h
+++ b/include/net/mac80211.h
@@ -1178,7 +1178,11 @@ struct ieee80211_tx_info {
 					u8 use_cts_prot:1;
 					u8 short_preamble:1;
 					u8 skip_table:1;
-					/* 2 bytes free */
+
+					/* for injection only (bitmap) */
+					u8 antennas:2;
+
+					/* 14 bits free */
 				};
 				/* only needed before rate control */
 				unsigned long jiffies;
diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c
index 932516f8cc13..a984fc54644e 100644
--- a/net/mac80211/tx.c
+++ b/net/mac80211/tx.c
@@ -2162,6 +2162,11 @@ bool ieee80211_parse_tx_radiotap(struct sk_buff *skb,
 			rate_found = true;
 			break;
 
+		case IEEE80211_RADIOTAP_ANTENNA:
+			/* this can appear multiple times, keep a bitmap */
+			info->control.antennas |= BIT(*iterator.this_arg);
+			break;
+
 		case IEEE80211_RADIOTAP_DATA_RETRIES:
 			rate_retries = *iterator.this_arg;
 			break;
@@ -2256,8 +2261,17 @@ bool ieee80211_parse_tx_radiotap(struct sk_buff *skb,
 		}
 
 		if (rate_flags & IEEE80211_TX_RC_MCS) {
+			/* reset antennas if not enough */
+			if (IEEE80211_HT_MCS_CHAINS(rate) >
+					hweight8(info->control.antennas))
+				info->control.antennas = 0;
+
 			info->control.rates[0].idx = rate;
 		} else if (rate_flags & IEEE80211_TX_RC_VHT_MCS) {
+			/* reset antennas if not enough */
+			if (vht_nss > hweight8(info->control.antennas))
+				info->control.antennas = 0;
+
 			ieee80211_rate_set_vht(info->control.rates, vht_mcs,
 					       vht_nss);
 		} else if (sband) {
-- 
cgit v1.2.3


From 823a0258912b1a4f627e455bee7e78d349c29c5b Mon Sep 17 00:00:00 2001
From: Po-Hao Huang <phhuang@realtek.com>
Date: Mon, 25 Sep 2023 16:08:59 +0800
Subject: wifi: ieee80211: add UL-bandwidth definition of trigger frame

Define UL-bandwidth values of trigger frame according to 802.11 std.

Signed-off-by: Po-Hao Huang <phhuang@realtek.com>
Signed-off-by: Ping-Ke Shih <pkshih@realtek.com>
Link: https://lore.kernel.org/r/20230925080902.51449-2-pkshih@realtek.com
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/linux/ieee80211.h | 7 +++++++
 1 file changed, 7 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h
index 3b02f038d509..62b4469c6866 100644
--- a/include/linux/ieee80211.h
+++ b/include/linux/ieee80211.h
@@ -307,6 +307,13 @@ static inline u16 ieee80211_sn_sub(u16 sn1, u16 sn2)
 #define IEEE80211_TRIGGER_TYPE_BQRP		0x6
 #define IEEE80211_TRIGGER_TYPE_NFRP		0x7
 
+/* UL-bandwidth within common_info of trigger frame */
+#define IEEE80211_TRIGGER_ULBW_MASK		0xc0000
+#define IEEE80211_TRIGGER_ULBW_20MHZ		0x0
+#define IEEE80211_TRIGGER_ULBW_40MHZ		0x1
+#define IEEE80211_TRIGGER_ULBW_80MHZ		0x2
+#define IEEE80211_TRIGGER_ULBW_160_80P80MHZ	0x3
+
 struct ieee80211_hdr {
 	__le16 frame_control;
 	__le16 duration_id;
-- 
cgit v1.2.3


From df31b298477e65a01deff0af352be3a61524d930 Mon Sep 17 00:00:00 2001
From: Jason Gunthorpe <jgg@nvidia.com>
Date: Wed, 13 Sep 2023 10:43:34 -0300
Subject: iommu: Add iommu_ops->identity_domain

This allows a driver to set a global static to an IDENTITY domain and
the core code will automatically use it whenever an IDENTITY domain
is requested.

By making it always available it means the IDENTITY can be used in error
handling paths to force the iommu driver into a known state. Devices
implementing global static identity domains should avoid failing their
attach_dev ops.

To make global static domains simpler allow drivers to omit their free
function and update the iommufd selftest.

Convert rockchip to use the new mechanism.

Tested-by: Steven Price <steven.price@arm.com>
Tested-by: Marek Szyprowski <m.szyprowski@samsung.com>
Tested-by: Nicolin Chen <nicolinc@nvidia.com>
Reviewed-by: Lu Baolu <baolu.lu@linux.intel.com>
Reviewed-by: Jerry Snitselaar <jsnitsel@redhat.com>
Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
Link: https://lore.kernel.org/r/1-v8-81230027b2fa+9d-iommu_all_defdom_jgg@nvidia.com
Signed-off-by: Joerg Roedel <jroedel@suse.de>
---
 drivers/iommu/iommu.c            | 6 +++++-
 drivers/iommu/iommufd/selftest.c | 5 -----
 drivers/iommu/rockchip-iommu.c   | 9 +--------
 include/linux/iommu.h            | 3 +++
 4 files changed, 9 insertions(+), 14 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c
index 3bfc56df4f78..33bd11070907 100644
--- a/drivers/iommu/iommu.c
+++ b/drivers/iommu/iommu.c
@@ -1978,6 +1978,9 @@ static struct iommu_domain *__iommu_domain_alloc(const struct bus_type *bus,
 	if (bus == NULL || bus->iommu_ops == NULL)
 		return NULL;
 
+	if (alloc_type == IOMMU_DOMAIN_IDENTITY && bus->iommu_ops->identity_domain)
+		return bus->iommu_ops->identity_domain;
+
 	domain = bus->iommu_ops->domain_alloc(alloc_type);
 	if (!domain)
 		return NULL;
@@ -2011,7 +2014,8 @@ void iommu_domain_free(struct iommu_domain *domain)
 	if (domain->type == IOMMU_DOMAIN_SVA)
 		mmdrop(domain->mm);
 	iommu_put_dma_cookie(domain);
-	domain->ops->free(domain);
+	if (domain->ops->free)
+		domain->ops->free(domain);
 }
 EXPORT_SYMBOL_GPL(iommu_domain_free);
 
diff --git a/drivers/iommu/iommufd/selftest.c b/drivers/iommu/iommufd/selftest.c
index 56506d5753f1..d48a202a7c3b 100644
--- a/drivers/iommu/iommufd/selftest.c
+++ b/drivers/iommu/iommufd/selftest.c
@@ -111,10 +111,6 @@ struct selftest_obj {
 	};
 };
 
-static void mock_domain_blocking_free(struct iommu_domain *domain)
-{
-}
-
 static int mock_domain_nop_attach(struct iommu_domain *domain,
 				  struct device *dev)
 {
@@ -122,7 +118,6 @@ static int mock_domain_nop_attach(struct iommu_domain *domain,
 }
 
 static const struct iommu_domain_ops mock_blocking_ops = {
-	.free = mock_domain_blocking_free,
 	.attach_dev = mock_domain_nop_attach,
 };
 
diff --git a/drivers/iommu/rockchip-iommu.c b/drivers/iommu/rockchip-iommu.c
index 8ff69fbf9f65..033678f2f8b3 100644
--- a/drivers/iommu/rockchip-iommu.c
+++ b/drivers/iommu/rockchip-iommu.c
@@ -989,13 +989,8 @@ static int rk_iommu_identity_attach(struct iommu_domain *identity_domain,
 	return 0;
 }
 
-static void rk_iommu_identity_free(struct iommu_domain *domain)
-{
-}
-
 static struct iommu_domain_ops rk_identity_ops = {
 	.attach_dev = rk_iommu_identity_attach,
-	.free = rk_iommu_identity_free,
 };
 
 static struct iommu_domain rk_identity_domain = {
@@ -1059,9 +1054,6 @@ static struct iommu_domain *rk_iommu_domain_alloc(unsigned type)
 {
 	struct rk_iommu_domain *rk_domain;
 
-	if (type == IOMMU_DOMAIN_IDENTITY)
-		return &rk_identity_domain;
-
 	if (type != IOMMU_DOMAIN_UNMANAGED && type != IOMMU_DOMAIN_DMA)
 		return NULL;
 
@@ -1186,6 +1178,7 @@ static int rk_iommu_of_xlate(struct device *dev,
 }
 
 static const struct iommu_ops rk_iommu_ops = {
+	.identity_domain = &rk_identity_domain,
 	.domain_alloc = rk_iommu_domain_alloc,
 	.probe_device = rk_iommu_probe_device,
 	.release_device = rk_iommu_release_device,
diff --git a/include/linux/iommu.h b/include/linux/iommu.h
index c50a769d569a..d0920b2a9f1c 100644
--- a/include/linux/iommu.h
+++ b/include/linux/iommu.h
@@ -260,6 +260,8 @@ struct iommu_iotlb_gather {
  *                    will be blocked by the hardware.
  * @pgsize_bitmap: bitmap of all possible supported page sizes
  * @owner: Driver module providing these ops
+ * @identity_domain: An always available, always attachable identity
+ *                   translation.
  */
 struct iommu_ops {
 	bool (*capable)(struct device *dev, enum iommu_cap);
@@ -294,6 +296,7 @@ struct iommu_ops {
 	const struct iommu_domain_ops *default_domain_ops;
 	unsigned long pgsize_bitmap;
 	struct module *owner;
+	struct iommu_domain *identity_domain;
 };
 
 /**
-- 
cgit v1.2.3


From 1c68cbc64fe6ac01dc242ba562344303031a76fb Mon Sep 17 00:00:00 2001
From: Jason Gunthorpe <jgg@nvidia.com>
Date: Wed, 13 Sep 2023 10:43:35 -0300
Subject: iommu: Add IOMMU_DOMAIN_PLATFORM

This is used when the iommu driver is taking control of the dma_ops,
currently only on S390 and power spapr. It is designed to preserve the
original ops->detach_dev() semantic that these S390 was built around.

Provide an opaque domain type and a 'default_domain' ops value that allows
the driver to trivially force any single domain as the default domain.

Update iommufd selftest to use this instead of set_platform_dma_ops

Reviewed-by: Lu Baolu <baolu.lu@linux.intel.com>
Reviewed-by: Jerry Snitselaar <jsnitsel@redhat.com>
Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
Link: https://lore.kernel.org/r/2-v8-81230027b2fa+9d-iommu_all_defdom_jgg@nvidia.com
Signed-off-by: Joerg Roedel <jroedel@suse.de>
---
 drivers/iommu/iommu.c            | 13 +++++++++++++
 drivers/iommu/iommufd/selftest.c | 14 +++++---------
 include/linux/iommu.h            |  8 ++++++++
 3 files changed, 26 insertions(+), 9 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c
index 33bd11070907..0e13e566581c 100644
--- a/drivers/iommu/iommu.c
+++ b/drivers/iommu/iommu.c
@@ -184,6 +184,8 @@ static const char *iommu_domain_type_str(unsigned int t)
 	case IOMMU_DOMAIN_DMA:
 	case IOMMU_DOMAIN_DMA_FQ:
 		return "Translated";
+	case IOMMU_DOMAIN_PLATFORM:
+		return "Platform";
 	default:
 		return "Unknown";
 	}
@@ -1752,6 +1754,17 @@ iommu_group_alloc_default_domain(struct iommu_group *group, int req_type)
 
 	lockdep_assert_held(&group->mutex);
 
+	/*
+	 * Allow legacy drivers to specify the domain that will be the default
+	 * domain. This should always be either an IDENTITY/BLOCKED/PLATFORM
+	 * domain. Do not use in new drivers.
+	 */
+	if (bus->iommu_ops->default_domain) {
+		if (req_type)
+			return ERR_PTR(-EINVAL);
+		return bus->iommu_ops->default_domain;
+	}
+
 	if (req_type)
 		return __iommu_group_alloc_default_domain(bus, group, req_type);
 
diff --git a/drivers/iommu/iommufd/selftest.c b/drivers/iommu/iommufd/selftest.c
index d48a202a7c3b..fb981ba97c4e 100644
--- a/drivers/iommu/iommufd/selftest.c
+++ b/drivers/iommu/iommufd/selftest.c
@@ -281,14 +281,6 @@ static bool mock_domain_capable(struct device *dev, enum iommu_cap cap)
 	return cap == IOMMU_CAP_CACHE_COHERENCY;
 }
 
-static void mock_domain_set_plaform_dma_ops(struct device *dev)
-{
-	/*
-	 * mock doesn't setup default domains because we can't hook into the
-	 * normal probe path
-	 */
-}
-
 static struct iommu_device mock_iommu_device = {
 };
 
@@ -298,12 +290,16 @@ static struct iommu_device *mock_probe_device(struct device *dev)
 }
 
 static const struct iommu_ops mock_ops = {
+	/*
+	 * IOMMU_DOMAIN_BLOCKED cannot be returned from def_domain_type()
+	 * because it is zero.
+	 */
+	.default_domain = &mock_blocking_domain,
 	.owner = THIS_MODULE,
 	.pgsize_bitmap = MOCK_IO_PAGE_SIZE,
 	.hw_info = mock_domain_hw_info,
 	.domain_alloc = mock_domain_alloc,
 	.capable = mock_domain_capable,
-	.set_platform_dma_ops = mock_domain_set_plaform_dma_ops,
 	.device_group = generic_device_group,
 	.probe_device = mock_probe_device,
 	.default_domain_ops =
diff --git a/include/linux/iommu.h b/include/linux/iommu.h
index d0920b2a9f1c..a05480be05fd 100644
--- a/include/linux/iommu.h
+++ b/include/linux/iommu.h
@@ -64,6 +64,7 @@ struct iommu_domain_geometry {
 #define __IOMMU_DOMAIN_DMA_FQ	(1U << 3)  /* DMA-API uses flush queue    */
 
 #define __IOMMU_DOMAIN_SVA	(1U << 4)  /* Shared process address space */
+#define __IOMMU_DOMAIN_PLATFORM	(1U << 5)
 
 #define IOMMU_DOMAIN_ALLOC_FLAGS ~__IOMMU_DOMAIN_DMA_FQ
 /*
@@ -81,6 +82,8 @@ struct iommu_domain_geometry {
  *				  invalidation.
  *	IOMMU_DOMAIN_SVA	- DMA addresses are shared process addresses
  *				  represented by mm_struct's.
+ *	IOMMU_DOMAIN_PLATFORM	- Legacy domain for drivers that do their own
+ *				  dma_api stuff. Do not use in new drivers.
  */
 #define IOMMU_DOMAIN_BLOCKED	(0U)
 #define IOMMU_DOMAIN_IDENTITY	(__IOMMU_DOMAIN_PT)
@@ -91,6 +94,7 @@ struct iommu_domain_geometry {
 				 __IOMMU_DOMAIN_DMA_API |	\
 				 __IOMMU_DOMAIN_DMA_FQ)
 #define IOMMU_DOMAIN_SVA	(__IOMMU_DOMAIN_SVA)
+#define IOMMU_DOMAIN_PLATFORM	(__IOMMU_DOMAIN_PLATFORM)
 
 struct iommu_domain {
 	unsigned type;
@@ -262,6 +266,9 @@ struct iommu_iotlb_gather {
  * @owner: Driver module providing these ops
  * @identity_domain: An always available, always attachable identity
  *                   translation.
+ * @default_domain: If not NULL this will always be set as the default domain.
+ *                  This should be an IDENTITY/BLOCKED/PLATFORM domain.
+ *                  Do not use in new drivers.
  */
 struct iommu_ops {
 	bool (*capable)(struct device *dev, enum iommu_cap);
@@ -297,6 +304,7 @@ struct iommu_ops {
 	unsigned long pgsize_bitmap;
 	struct module *owner;
 	struct iommu_domain *identity_domain;
+	struct iommu_domain *default_domain;
 };
 
 /**
-- 
cgit v1.2.3


From 24b1d476167df3e30c7a53b67765bf3c787c5160 Mon Sep 17 00:00:00 2001
From: Jason Gunthorpe <jgg@nvidia.com>
Date: Wed, 13 Sep 2023 10:43:48 -0300
Subject: iommu: Remove ops->set_platform_dma_ops()

All drivers are now using IDENTITY or PLATFORM domains for what this did,
we can remove it now. It is no longer possible to attach to a NULL domain.

Tested-by: Heiko Stuebner <heiko@sntech.de>
Tested-by: Niklas Schnelle <schnelle@linux.ibm.com>
Tested-by: Steven Price <steven.price@arm.com>
Tested-by: Marek Szyprowski <m.szyprowski@samsung.com>
Tested-by: Nicolin Chen <nicolinc@nvidia.com>
Reviewed-by: Lu Baolu <baolu.lu@linux.intel.com>
Reviewed-by: Jerry Snitselaar <jsnitsel@redhat.com>
Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
Link: https://lore.kernel.org/r/15-v8-81230027b2fa+9d-iommu_all_defdom_jgg@nvidia.com
Signed-off-by: Joerg Roedel <jroedel@suse.de>
---
 drivers/iommu/iommu.c | 30 +++++-------------------------
 include/linux/iommu.h |  4 ----
 2 files changed, 5 insertions(+), 29 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c
index 1efd6351bbc2..42a4585dd76d 100644
--- a/drivers/iommu/iommu.c
+++ b/drivers/iommu/iommu.c
@@ -2351,21 +2351,8 @@ static int __iommu_group_set_domain_internal(struct iommu_group *group,
 	if (group->domain == new_domain)
 		return 0;
 
-	/*
-	 * New drivers should support default domains, so set_platform_dma()
-	 * op will never be called. Otherwise the NULL domain represents some
-	 * platform specific behavior.
-	 */
-	if (!new_domain) {
-		for_each_group_device(group, gdev) {
-			const struct iommu_ops *ops = dev_iommu_ops(gdev->dev);
-
-			if (!WARN_ON(!ops->set_platform_dma_ops))
-				ops->set_platform_dma_ops(gdev->dev);
-		}
-		group->domain = NULL;
-		return 0;
-	}
+	if (WARN_ON(!new_domain))
+		return -EINVAL;
 
 	/*
 	 * Changing the domain is done by calling attach_dev() on the new
@@ -2401,19 +2388,15 @@ err_revert:
 	 */
 	last_gdev = gdev;
 	for_each_group_device(group, gdev) {
-		const struct iommu_ops *ops = dev_iommu_ops(gdev->dev);
-
 		/*
-		 * If set_platform_dma_ops is not present a NULL domain can
-		 * happen only for first probe, in which case we leave
-		 * group->domain as NULL and let release clean everything up.
+		 * A NULL domain can happen only for first probe, in which case
+		 * we leave group->domain as NULL and let release clean
+		 * everything up.
 		 */
 		if (group->domain)
 			WARN_ON(__iommu_device_set_domain(
 				group, gdev->dev, group->domain,
 				IOMMU_SET_DOMAIN_MUST_SUCCEED));
-		else if (ops->set_platform_dma_ops)
-			ops->set_platform_dma_ops(gdev->dev);
 		if (gdev == last_gdev)
 			break;
 	}
@@ -3036,9 +3019,6 @@ static int iommu_setup_default_domain(struct iommu_group *group,
 	/*
 	 * There are still some drivers which don't support default domains, so
 	 * we ignore the failure and leave group->default_domain NULL.
-	 *
-	 * We assume that the iommu driver starts up the device in
-	 * 'set_platform_dma_ops' mode if it does not support default domains.
 	 */
 	dom = iommu_group_alloc_default_domain(group, req_type);
 	if (!dom) {
diff --git a/include/linux/iommu.h b/include/linux/iommu.h
index a05480be05fd..511dfeea5272 100644
--- a/include/linux/iommu.h
+++ b/include/linux/iommu.h
@@ -243,9 +243,6 @@ struct iommu_iotlb_gather {
  * @release_device: Remove device from iommu driver handling
  * @probe_finalize: Do final setup work after the device is added to an IOMMU
  *                  group and attached to the groups domain
- * @set_platform_dma_ops: Returning control back to the platform DMA ops. This op
- *                        is to support old IOMMU drivers, new drivers should use
- *                        default domains, and the common IOMMU DMA ops.
  * @device_group: find iommu group for a particular device
  * @get_resv_regions: Request list of reserved regions for a device
  * @of_xlate: add OF master IDs to iommu grouping
@@ -280,7 +277,6 @@ struct iommu_ops {
 	struct iommu_device *(*probe_device)(struct device *dev);
 	void (*release_device)(struct device *dev);
 	void (*probe_finalize)(struct device *dev);
-	void (*set_platform_dma_ops)(struct device *dev);
 	struct iommu_group *(*device_group)(struct device *dev);
 
 	/* Request/Free a list of reserved regions for a device */
-- 
cgit v1.2.3


From 4601cd2d7c4c82c4bafc822e1ff630a709eff206 Mon Sep 17 00:00:00 2001
From: Jason Gunthorpe <jgg@nvidia.com>
Date: Wed, 13 Sep 2023 10:43:55 -0300
Subject: iommu: Add ops->domain_alloc_paging()

This callback requests the driver to create only a __IOMMU_DOMAIN_PAGING
domain, so it saves a few lines in a lot of drivers needlessly checking
the type.

More critically, this allows us to sweep out all the
IOMMU_DOMAIN_UNMANAGED and IOMMU_DOMAIN_DMA checks from a lot of the
drivers, simplifying what is going on in the code and ultimately removing
the now-unused special cases in drivers where they did not support
IOMMU_DOMAIN_DMA.

domain_alloc_paging() should return a struct iommu_domain that is
functionally compatible with ARM_DMA_USE_IOMMU, dma-iommu.c and iommufd.

Be forwards looking and pass in a 'struct device *' argument. We can
provide this when allocating the default_domain. No drivers will look at
this.

Tested-by: Steven Price <steven.price@arm.com>
Tested-by: Marek Szyprowski <m.szyprowski@samsung.com>
Tested-by: Nicolin Chen <nicolinc@nvidia.com>
Reviewed-by: Lu Baolu <baolu.lu@linux.intel.com>
Reviewed-by: Jerry Snitselaar <jsnitsel@redhat.com>
Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
Link: https://lore.kernel.org/r/22-v8-81230027b2fa+9d-iommu_all_defdom_jgg@nvidia.com
Signed-off-by: Joerg Roedel <jroedel@suse.de>
---
 drivers/iommu/iommu.c | 17 ++++++++++++++---
 include/linux/iommu.h |  3 +++
 2 files changed, 17 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c
index 38856d542afc..fe033043be46 100644
--- a/drivers/iommu/iommu.c
+++ b/drivers/iommu/iommu.c
@@ -2041,6 +2041,7 @@ void iommu_set_fault_handler(struct iommu_domain *domain,
 EXPORT_SYMBOL_GPL(iommu_set_fault_handler);
 
 static struct iommu_domain *__iommu_domain_alloc(const struct iommu_ops *ops,
+						 struct device *dev,
 						 unsigned int type)
 {
 	struct iommu_domain *domain;
@@ -2048,8 +2049,13 @@ static struct iommu_domain *__iommu_domain_alloc(const struct iommu_ops *ops,
 
 	if (alloc_type == IOMMU_DOMAIN_IDENTITY && ops->identity_domain)
 		return ops->identity_domain;
+	else if (type & __IOMMU_DOMAIN_PAGING && ops->domain_alloc_paging)
+		domain = ops->domain_alloc_paging(dev);
+	else if (ops->domain_alloc)
+		domain = ops->domain_alloc(alloc_type);
+	else
+		return NULL;
 
-	domain = ops->domain_alloc(alloc_type);
 	if (!domain)
 		return NULL;
 
@@ -2074,14 +2080,19 @@ static struct iommu_domain *__iommu_domain_alloc(const struct iommu_ops *ops,
 static struct iommu_domain *
 __iommu_group_domain_alloc(struct iommu_group *group, unsigned int type)
 {
-	return __iommu_domain_alloc(group_iommu_ops(group), type);
+	struct device *dev =
+		list_first_entry(&group->devices, struct group_device, list)
+			->dev;
+
+	return __iommu_domain_alloc(group_iommu_ops(group), dev, type);
 }
 
 struct iommu_domain *iommu_domain_alloc(const struct bus_type *bus)
 {
 	if (bus == NULL || bus->iommu_ops == NULL)
 		return NULL;
-	return __iommu_domain_alloc(bus->iommu_ops, IOMMU_DOMAIN_UNMANAGED);
+	return __iommu_domain_alloc(bus->iommu_ops, NULL,
+				    IOMMU_DOMAIN_UNMANAGED);
 }
 EXPORT_SYMBOL_GPL(iommu_domain_alloc);
 
diff --git a/include/linux/iommu.h b/include/linux/iommu.h
index 511dfeea5272..3f173307434d 100644
--- a/include/linux/iommu.h
+++ b/include/linux/iommu.h
@@ -239,6 +239,8 @@ struct iommu_iotlb_gather {
  *           use. The information type is one of enum iommu_hw_info_type defined
  *           in include/uapi/linux/iommufd.h.
  * @domain_alloc: allocate iommu domain
+ * @domain_alloc_paging: Allocate an iommu_domain that can be used for
+ *                       UNMANAGED, DMA, and DMA_FQ domain types.
  * @probe_device: Add device to iommu driver handling
  * @release_device: Remove device from iommu driver handling
  * @probe_finalize: Do final setup work after the device is added to an IOMMU
@@ -273,6 +275,7 @@ struct iommu_ops {
 
 	/* Domain allocation and freeing by the iommu driver */
 	struct iommu_domain *(*domain_alloc)(unsigned iommu_domain_type);
+	struct iommu_domain *(*domain_alloc_paging)(struct device *dev);
 
 	struct iommu_device *(*probe_device)(struct device *dev);
 	void (*release_device)(struct device *dev);
-- 
cgit v1.2.3


From e8f52d84cf0b6d1862ab62f7ed705f78690d11b2 Mon Sep 17 00:00:00 2001
From: Jason Gunthorpe <jgg@nvidia.com>
Date: Tue, 22 Aug 2023 13:15:57 -0300
Subject: iommu: Add generic_single_device_group()

This implements the common pattern seen in drivers of a single iommu_group
for the entire iommu driver instance. Implement this in core code so the
drivers that want this can select it from their ops.

Reviewed-by: Lu Baolu <baolu.lu@linux.intel.com>
Reviewed-by: Kevin Tian <kevin.tian@intel.com>
Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
Link: https://lore.kernel.org/r/2-v1-c869a95191f2+5e8-iommu_single_grp_jgg@nvidia.com
Signed-off-by: Joerg Roedel <jroedel@suse.de>
---
 drivers/iommu/iommu.c | 28 +++++++++++++++++++++++++++-
 include/linux/iommu.h |  3 +++
 2 files changed, 30 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c
index 6483d6e19a5e..03202314322b 100644
--- a/drivers/iommu/iommu.c
+++ b/drivers/iommu/iommu.c
@@ -292,6 +292,10 @@ void iommu_device_unregister(struct iommu_device *iommu)
 	spin_lock(&iommu_device_lock);
 	list_del(&iommu->list);
 	spin_unlock(&iommu_device_lock);
+
+	/* Pairs with the alloc in generic_single_device_group() */
+	iommu_group_put(iommu->singleton_group);
+	iommu->singleton_group = NULL;
 }
 EXPORT_SYMBOL_GPL(iommu_device_unregister);
 
@@ -406,6 +410,7 @@ static int iommu_init_device(struct device *dev, const struct iommu_ops *ops)
 		ret = PTR_ERR(iommu_dev);
 		goto err_module_put;
 	}
+	dev->iommu->iommu_dev = iommu_dev;
 
 	ret = iommu_device_link(iommu_dev, dev);
 	if (ret)
@@ -420,7 +425,6 @@ static int iommu_init_device(struct device *dev, const struct iommu_ops *ops)
 	}
 	dev->iommu_group = group;
 
-	dev->iommu->iommu_dev = iommu_dev;
 	dev->iommu->max_pasids = dev_iommu_get_max_pasids(dev);
 	if (ops->is_attach_deferred)
 		dev->iommu->attach_deferred = ops->is_attach_deferred(dev);
@@ -434,6 +438,7 @@ err_release:
 err_module_put:
 	module_put(ops->owner);
 err_free:
+	dev->iommu->iommu_dev = NULL;
 	dev_iommu_free(dev);
 	return ret;
 }
@@ -1637,6 +1642,27 @@ struct iommu_group *generic_device_group(struct device *dev)
 }
 EXPORT_SYMBOL_GPL(generic_device_group);
 
+/*
+ * Generic device_group call-back function. It just allocates one
+ * iommu-group per iommu driver instance shared by every device
+ * probed by that iommu driver.
+ */
+struct iommu_group *generic_single_device_group(struct device *dev)
+{
+	struct iommu_device *iommu = dev->iommu->iommu_dev;
+
+	if (!iommu->singleton_group) {
+		struct iommu_group *group;
+
+		group = iommu_group_alloc();
+		if (IS_ERR(group))
+			return group;
+		iommu->singleton_group = group;
+	}
+	return iommu_group_ref_get(iommu->singleton_group);
+}
+EXPORT_SYMBOL_GPL(generic_single_device_group);
+
 /*
  * Use standard PCI bus topology, isolation features, and DMA alias quirks
  * to find or create an IOMMU group for a device.
diff --git a/include/linux/iommu.h b/include/linux/iommu.h
index 3f173307434d..5b693be3d35f 100644
--- a/include/linux/iommu.h
+++ b/include/linux/iommu.h
@@ -378,6 +378,7 @@ struct iommu_domain_ops {
  * @list: Used by the iommu-core to keep a list of registered iommus
  * @ops: iommu-ops for talking to this iommu
  * @dev: struct device for sysfs handling
+ * @singleton_group: Used internally for drivers that have only one group
  * @max_pasids: number of supported PASIDs
  */
 struct iommu_device {
@@ -385,6 +386,7 @@ struct iommu_device {
 	const struct iommu_ops *ops;
 	struct fwnode_handle *fwnode;
 	struct device *dev;
+	struct iommu_group *singleton_group;
 	u32 max_pasids;
 };
 
@@ -648,6 +650,7 @@ extern struct iommu_group *pci_device_group(struct device *dev);
 extern struct iommu_group *generic_device_group(struct device *dev);
 /* FSL-MC device grouping function */
 struct iommu_group *fsl_mc_device_group(struct device *dev);
+extern struct iommu_group *generic_single_device_group(struct device *dev);
 
 /**
  * struct iommu_fwspec - per-device IOMMU instance data
-- 
cgit v1.2.3


From 950210887670cbb7d2eb9af6fb743b70f1a1ebdc Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rafael.j.wysocki@intel.com>
Date: Tue, 19 Sep 2023 20:54:37 +0200
Subject: thermal: core: Drop trips_disabled bitmask

After recent changes, thermal_zone_get_trip() cannot fail, as invoked
from thermal_zone_device_register_with_trips(), so the only role of
the trips_disabled bitmask is struct thermal_zone_device is to make
handle_thermal_trip() skip trip points whose temperature was initially
zero.  However, since the unit of temperature in the thermal core is
millicelsius, zero may very well be a valid temperature value at least
in some usage scenarios and the trip temperature may as well change
later.  Thus there is no reason to permanently disable trip points
with initial temperature equal to zero.

Accordingly, drop the trips_disabled bitmask along with the code
related to it.

Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Tested-by: Ido Schimmel <idosch@nvidia.com>
Acked-by: Daniel Lezcano <daniel.lezcano@linaro.org>
---
 drivers/thermal/thermal_core.c | 13 -------------
 include/linux/thermal.h        |  2 --
 2 files changed, 15 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/thermal/thermal_core.c b/drivers/thermal/thermal_core.c
index 58533ea75cd9..38d393f139d8 100644
--- a/drivers/thermal/thermal_core.c
+++ b/drivers/thermal/thermal_core.c
@@ -347,10 +347,6 @@ static void handle_thermal_trip(struct thermal_zone_device *tz, int trip_id)
 {
 	struct thermal_trip trip;
 
-	/* Ignore disabled trip points */
-	if (test_bit(trip_id, &tz->trips_disabled))
-		return;
-
 	__thermal_zone_get_trip(tz, trip_id, &trip);
 
 	if (trip.temperature == THERMAL_TEMP_INVALID)
@@ -1231,7 +1227,6 @@ thermal_zone_device_register_with_trips(const char *type, struct thermal_trip *t
 	struct thermal_zone_device *tz;
 	int id;
 	int result;
-	int count;
 	struct thermal_governor *governor;
 
 	if (!type || strlen(type) == 0) {
@@ -1328,14 +1323,6 @@ thermal_zone_device_register_with_trips(const char *type, struct thermal_trip *t
 	if (result)
 		goto release_device;
 
-	for (count = 0; count < num_trips; count++) {
-		struct thermal_trip trip;
-
-		result = thermal_zone_get_trip(tz, count, &trip);
-		if (result || !trip.temperature)
-			set_bit(count, &tz->trips_disabled);
-	}
-
 	/* Update 'this' zone's governor information */
 	mutex_lock(&thermal_governor_lock);
 
diff --git a/include/linux/thermal.h b/include/linux/thermal.h
index a5ae4af955ff..6cfcae22ba12 100644
--- a/include/linux/thermal.h
+++ b/include/linux/thermal.h
@@ -122,7 +122,6 @@ struct thermal_cooling_device {
  * @devdata:	private pointer for device private data
  * @trips:	an array of struct thermal_trip
  * @num_trips:	number of trip points the thermal zone supports
- * @trips_disabled;	bitmap for disabled trips
  * @passive_delay_jiffies: number of jiffies to wait between polls when
  *			performing passive cooling.
  * @polling_delay_jiffies: number of jiffies to wait between polls when
@@ -163,7 +162,6 @@ struct thermal_zone_device {
 	void *devdata;
 	struct thermal_trip *trips;
 	int num_trips;
-	unsigned long trips_disabled;	/* bitmap for disabled trips */
 	unsigned long passive_delay_jiffies;
 	unsigned long polling_delay_jiffies;
 	int temperature;
-- 
cgit v1.2.3


From bd111e987e762d82dc738232c6ed4b3c9bcc5c91 Mon Sep 17 00:00:00 2001
From: Robin Murphy <robin.murphy@arm.com>
Date: Tue, 12 Sep 2023 17:18:43 +0100
Subject: iommu: Retire map/unmap ops

With everyone now implementing the new interfaces, clean up the last
remnants of the old map/unmap ops and simplify the calling logic again.

Signed-off-by: Robin Murphy <robin.murphy@arm.com>
Reviewed-by: Jason Gunthorpe <jgg@nvidia.com>
Link: https://lore.kernel.org/r/d2afdf13b2fbf537713c3ec642dfd49d16dd9e6a.1694525662.git.robin.murphy@arm.com
Signed-off-by: Joerg Roedel <jroedel@suse.de>
---
 drivers/iommu/iommu.c | 60 +++++++++++----------------------------------------
 include/linux/iommu.h |  6 ------
 2 files changed, 13 insertions(+), 53 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c
index 03202314322b..865fa4f179a8 100644
--- a/drivers/iommu/iommu.c
+++ b/drivers/iommu/iommu.c
@@ -2509,30 +2509,6 @@ out_set_count:
 	return pgsize;
 }
 
-static int __iommu_map_pages(struct iommu_domain *domain, unsigned long iova,
-			     phys_addr_t paddr, size_t size, int prot,
-			     gfp_t gfp, size_t *mapped)
-{
-	const struct iommu_domain_ops *ops = domain->ops;
-	size_t pgsize, count;
-	int ret;
-
-	pgsize = iommu_pgsize(domain, iova, paddr, size, &count);
-
-	pr_debug("mapping: iova 0x%lx pa %pa pgsize 0x%zx count %zu\n",
-		 iova, &paddr, pgsize, count);
-
-	if (ops->map_pages) {
-		ret = ops->map_pages(domain, iova, paddr, pgsize, count, prot,
-				     gfp, mapped);
-	} else {
-		ret = ops->map(domain, iova, paddr, pgsize, prot, gfp);
-		*mapped = ret ? 0 : pgsize;
-	}
-
-	return ret;
-}
-
 static int __iommu_map(struct iommu_domain *domain, unsigned long iova,
 		       phys_addr_t paddr, size_t size, int prot, gfp_t gfp)
 {
@@ -2543,8 +2519,7 @@ static int __iommu_map(struct iommu_domain *domain, unsigned long iova,
 	phys_addr_t orig_paddr = paddr;
 	int ret = 0;
 
-	if (unlikely(!(ops->map || ops->map_pages) ||
-		     domain->pgsize_bitmap == 0UL))
+	if (unlikely(!ops->map_pages || domain->pgsize_bitmap == 0UL))
 		return -ENODEV;
 
 	if (unlikely(!(domain->type & __IOMMU_DOMAIN_PAGING)))
@@ -2567,10 +2542,14 @@ static int __iommu_map(struct iommu_domain *domain, unsigned long iova,
 	pr_debug("map: iova 0x%lx pa %pa size 0x%zx\n", iova, &paddr, size);
 
 	while (size) {
-		size_t mapped = 0;
+		size_t pgsize, count, mapped = 0;
+
+		pgsize = iommu_pgsize(domain, iova, paddr, size, &count);
 
-		ret = __iommu_map_pages(domain, iova, paddr, size, prot, gfp,
-					&mapped);
+		pr_debug("mapping: iova 0x%lx pa %pa pgsize 0x%zx count %zu\n",
+			 iova, &paddr, pgsize, count);
+		ret = ops->map_pages(domain, iova, paddr, pgsize, count, prot,
+				     gfp, &mapped);
 		/*
 		 * Some pages may have been mapped, even if an error occurred,
 		 * so we should account for those so they can be unmapped.
@@ -2614,19 +2593,6 @@ int iommu_map(struct iommu_domain *domain, unsigned long iova,
 }
 EXPORT_SYMBOL_GPL(iommu_map);
 
-static size_t __iommu_unmap_pages(struct iommu_domain *domain,
-				  unsigned long iova, size_t size,
-				  struct iommu_iotlb_gather *iotlb_gather)
-{
-	const struct iommu_domain_ops *ops = domain->ops;
-	size_t pgsize, count;
-
-	pgsize = iommu_pgsize(domain, iova, iova, size, &count);
-	return ops->unmap_pages ?
-	       ops->unmap_pages(domain, iova, pgsize, count, iotlb_gather) :
-	       ops->unmap(domain, iova, pgsize, iotlb_gather);
-}
-
 static size_t __iommu_unmap(struct iommu_domain *domain,
 			    unsigned long iova, size_t size,
 			    struct iommu_iotlb_gather *iotlb_gather)
@@ -2636,8 +2602,7 @@ static size_t __iommu_unmap(struct iommu_domain *domain,
 	unsigned long orig_iova = iova;
 	unsigned int min_pagesz;
 
-	if (unlikely(!(ops->unmap || ops->unmap_pages) ||
-		     domain->pgsize_bitmap == 0UL))
+	if (unlikely(!ops->unmap_pages || domain->pgsize_bitmap == 0UL))
 		return 0;
 
 	if (unlikely(!(domain->type & __IOMMU_DOMAIN_PAGING)))
@@ -2664,9 +2629,10 @@ static size_t __iommu_unmap(struct iommu_domain *domain,
 	 * or we hit an area that isn't mapped.
 	 */
 	while (unmapped < size) {
-		unmapped_page = __iommu_unmap_pages(domain, iova,
-						    size - unmapped,
-						    iotlb_gather);
+		size_t pgsize, count;
+
+		pgsize = iommu_pgsize(domain, iova, iova, size - unmapped, &count);
+		unmapped_page = ops->unmap_pages(domain, iova, pgsize, count, iotlb_gather);
 		if (!unmapped_page)
 			break;
 
diff --git a/include/linux/iommu.h b/include/linux/iommu.h
index 5b693be3d35f..64bd20142cbe 100644
--- a/include/linux/iommu.h
+++ b/include/linux/iommu.h
@@ -322,10 +322,8 @@ struct iommu_ops {
  * * ENODEV	- device specific errors, not able to be attached
  * * <others>	- treated as ENODEV by the caller. Use is discouraged
  * @set_dev_pasid: set an iommu domain to a pasid of device
- * @map: map a physically contiguous memory region to an iommu domain
  * @map_pages: map a physically contiguous set of pages of the same size to
  *             an iommu domain.
- * @unmap: unmap a physically contiguous memory region from an iommu domain
  * @unmap_pages: unmap a number of pages of the same size from an iommu domain
  * @flush_iotlb_all: Synchronously flush all hardware TLBs for this domain
  * @iotlb_sync_map: Sync mappings created recently using @map to the hardware
@@ -344,13 +342,9 @@ struct iommu_domain_ops {
 	int (*set_dev_pasid)(struct iommu_domain *domain, struct device *dev,
 			     ioasid_t pasid);
 
-	int (*map)(struct iommu_domain *domain, unsigned long iova,
-		   phys_addr_t paddr, size_t size, int prot, gfp_t gfp);
 	int (*map_pages)(struct iommu_domain *domain, unsigned long iova,
 			 phys_addr_t paddr, size_t pgsize, size_t pgcount,
 			 int prot, gfp_t gfp, size_t *mapped);
-	size_t (*unmap)(struct iommu_domain *domain, unsigned long iova,
-			size_t size, struct iommu_iotlb_gather *iotlb_gather);
 	size_t (*unmap_pages)(struct iommu_domain *domain, unsigned long iova,
 			      size_t pgsize, size_t pgcount,
 			      struct iommu_iotlb_gather *iotlb_gather);
-- 
cgit v1.2.3


From cf08fa74c716cf20e5038d1e7dbbd7dba1b76062 Mon Sep 17 00:00:00 2001
From: Chen-Yu Tsai <wenst@chromium.org>
Date: Wed, 13 Sep 2023 16:29:17 +0800
Subject: regulator: mt6358: Add output voltage fine tuning to fixed regulators

The "fixed" LDO regulators found on the MT6358 and MT6366 PMICs have
either no voltage selection register, or only one valid setting.
However these do have a fine voltage calibration setting that can
slightly boost the output voltage from 0 mV to 100 mV, in 10 mV
increments.

Add support for this by changing these into linear range regulators.
Some register definitions that are missing are also added.

Signed-off-by: Chen-Yu Tsai <wenst@chromium.org>
Reviewed-by: AngeloGioacchino Del Regno <angelogioacchino.delregno@collabora.com>
Acked-by: Lee Jones <lee@kernel.org>
Link: https://lore.kernel.org/r/20230913082919.1631287-3-wenst@chromium.org
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 drivers/regulator/mt6358-regulator.c | 15 +++++++++++++--
 include/linux/mfd/mt6358/registers.h |  6 ++++++
 2 files changed, 19 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/regulator/mt6358-regulator.c b/drivers/regulator/mt6358-regulator.c
index a1eae45f5fee..b34ad85db771 100644
--- a/drivers/regulator/mt6358-regulator.c
+++ b/drivers/regulator/mt6358-regulator.c
@@ -123,10 +123,13 @@ struct mt6358_regulator_info {
 		.type = REGULATOR_VOLTAGE,	\
 		.id = MT6358_ID_##vreg,	\
 		.owner = THIS_MODULE,	\
-		.n_voltages = 1,	\
+		.n_voltages = 11,	\
+		.vsel_reg = MT6358_##vreg##_ANA_CON0,	\
+		.vsel_mask = GENMASK(3, 0),	\
 		.enable_reg = enreg,	\
 		.enable_mask = BIT(enbit),	\
 		.min_uV = volt,	\
+		.uV_step = 10000, \
 	},	\
 	.status_reg = MT6358_LDO_##vreg##_CON1,	\
 	.qi = BIT(15),							\
@@ -219,10 +222,13 @@ struct mt6358_regulator_info {
 		.type = REGULATOR_VOLTAGE,	\
 		.id = MT6366_ID_##vreg,	\
 		.owner = THIS_MODULE,	\
-		.n_voltages = 1,	\
+		.n_voltages = 11,	\
+		.vsel_reg = MT6358_##vreg##_ANA_CON0,	\
+		.vsel_mask = GENMASK(3, 0),	\
 		.enable_reg = enreg,	\
 		.enable_mask = BIT(enbit),	\
 		.min_uV = volt,	\
+		.uV_step = 10000, \
 	},	\
 	.status_reg = MT6358_LDO_##vreg##_CON1,	\
 	.qi = BIT(15),							\
@@ -476,8 +482,13 @@ static const struct regulator_ops mt6358_volt_table_ops = {
 	.get_status = mt6358_get_status,
 };
 
+/* "Fixed" LDOs with output voltage calibration +0 ~ +10 mV */
 static const struct regulator_ops mt6358_volt_fixed_ops = {
 	.list_voltage = regulator_list_voltage_linear,
+	.map_voltage = regulator_map_voltage_linear,
+	.set_voltage_sel = regulator_set_voltage_sel_regmap,
+	.get_voltage_sel = mt6358_get_buck_voltage_sel,
+	.set_voltage_time_sel = regulator_set_voltage_time_sel,
 	.enable = regulator_enable_regmap,
 	.disable = regulator_disable_regmap,
 	.is_enabled = regulator_is_enabled_regmap,
diff --git a/include/linux/mfd/mt6358/registers.h b/include/linux/mfd/mt6358/registers.h
index 3d33517f178c..5ea2590be710 100644
--- a/include/linux/mfd/mt6358/registers.h
+++ b/include/linux/mfd/mt6358/registers.h
@@ -262,6 +262,12 @@
 #define MT6358_LDO_VBIF28_CON3                0x1db0
 #define MT6358_VCAMA1_ANA_CON0                0x1e08
 #define MT6358_VCAMA2_ANA_CON0                0x1e0c
+#define MT6358_VFE28_ANA_CON0                 0x1e10
+#define MT6358_VCN28_ANA_CON0                 0x1e14
+#define MT6358_VBIF28_ANA_CON0                0x1e18
+#define MT6358_VAUD28_ANA_CON0                0x1e1c
+#define MT6358_VAUX18_ANA_CON0                0x1e20
+#define MT6358_VXO22_ANA_CON0                 0x1e24
 #define MT6358_VCN33_ANA_CON0                 0x1e28
 #define MT6358_VSIM1_ANA_CON0                 0x1e2c
 #define MT6358_VSIM2_ANA_CON0                 0x1e30
-- 
cgit v1.2.3


From 581beb4fe37d17571c87058d13d298d5458e25e9 Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Mon, 25 Sep 2023 13:02:58 +0100
Subject: iov_iter: Remove last_offset from iov_iter as it was for ITER_PIPE

Now that ITER_PIPE has been removed, iov_iter::last_offset is no longer
used, so remove it.

Signed-off-by: David Howells <dhowells@redhat.com>
Link: https://lore.kernel.org/r/20230925120309.1731676-2-dhowells@redhat.com
cc: Alexander Viro <viro@zeniv.linux.org.uk>
cc: Jens Axboe <axboe@kernel.dk>
cc: Christoph Hellwig <hch@lst.de>
cc: Christian Brauner <christian@brauner.io>
cc: Matthew Wilcox <willy@infradead.org>
cc: Linus Torvalds <torvalds@linux-foundation.org>
cc: David Laight <David.Laight@ACULAB.COM>
cc: linux-block@vger.kernel.org
cc: linux-fsdevel@vger.kernel.org
cc: linux-mm@kvack.org
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 include/linux/uio.h | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/uio.h b/include/linux/uio.h
index 42bce38a8e87..2000e42a6586 100644
--- a/include/linux/uio.h
+++ b/include/linux/uio.h
@@ -44,10 +44,7 @@ struct iov_iter {
 	bool nofault;
 	bool data_source;
 	bool user_backed;
-	union {
-		size_t iov_offset;
-		int last_offset;
-	};
+	size_t iov_offset;
 	/*
 	 * Hack alert: overlay ubuf_iovec with iovec + count, so
 	 * that the members resolve correctly regardless of the type
-- 
cgit v1.2.3


From 7d9e44a6ad8a22135e6308d330da29271f41a98a Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Mon, 25 Sep 2023 13:03:02 +0100
Subject: iov_iter: Renumber ITER_* constants

Renumber the ITER_* iterator-type constants to put things in the same order
as in the iteration functions and to group user-backed iterators at the
bottom.

Signed-off-by: David Howells <dhowells@redhat.com>
Link: https://lore.kernel.org/r/20230925120309.1731676-6-dhowells@redhat.com
cc: Alexander Viro <viro@zeniv.linux.org.uk>
cc: Jens Axboe <axboe@kernel.dk>
cc: Christoph Hellwig <hch@lst.de>
cc: Christian Brauner <christian@brauner.io>
cc: Matthew Wilcox <willy@infradead.org>
cc: Linus Torvalds <torvalds@linux-foundation.org>
cc: David Laight <David.Laight@ACULAB.COM>
cc: linux-block@vger.kernel.org
cc: linux-fsdevel@vger.kernel.org
cc: linux-mm@kvack.org
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 include/linux/uio.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/uio.h b/include/linux/uio.h
index 2000e42a6586..bef8e56aa45c 100644
--- a/include/linux/uio.h
+++ b/include/linux/uio.h
@@ -21,12 +21,12 @@ struct kvec {
 
 enum iter_type {
 	/* iter types */
+	ITER_UBUF,
 	ITER_IOVEC,
-	ITER_KVEC,
 	ITER_BVEC,
+	ITER_KVEC,
 	ITER_XARRAY,
 	ITER_DISCARD,
-	ITER_UBUF,
 };
 
 #define ITER_SOURCE	1	// == WRITE
-- 
cgit v1.2.3


From f1b4cb650b9a0eeba206d8f069fcdc532bfbcd74 Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Mon, 25 Sep 2023 13:03:03 +0100
Subject: iov_iter: Derive user-backedness from the iterator type

Use the iterator type to determine whether an iterator is user-backed or
not rather than using a special flag for it.  Now that ITER_UBUF and
ITER_IOVEC are 0 and 1, they can be checked with a single comparison.

Signed-off-by: David Howells <dhowells@redhat.com>
Link: https://lore.kernel.org/r/20230925120309.1731676-7-dhowells@redhat.com
cc: Alexander Viro <viro@zeniv.linux.org.uk>
cc: Jens Axboe <axboe@kernel.dk>
cc: Christoph Hellwig <hch@lst.de>
cc: Christian Brauner <christian@brauner.io>
cc: Matthew Wilcox <willy@infradead.org>
cc: Linus Torvalds <torvalds@linux-foundation.org>
cc: David Laight <David.Laight@ACULAB.COM>
cc: linux-block@vger.kernel.org
cc: linux-fsdevel@vger.kernel.org
cc: linux-mm@kvack.org
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 include/linux/uio.h | 4 +---
 lib/iov_iter.c      | 1 -
 2 files changed, 1 insertion(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/uio.h b/include/linux/uio.h
index bef8e56aa45c..65d9143f83c8 100644
--- a/include/linux/uio.h
+++ b/include/linux/uio.h
@@ -43,7 +43,6 @@ struct iov_iter {
 	bool copy_mc;
 	bool nofault;
 	bool data_source;
-	bool user_backed;
 	size_t iov_offset;
 	/*
 	 * Hack alert: overlay ubuf_iovec with iovec + count, so
@@ -140,7 +139,7 @@ static inline unsigned char iov_iter_rw(const struct iov_iter *i)
 
 static inline bool user_backed_iter(const struct iov_iter *i)
 {
-	return i->user_backed;
+	return iter_is_ubuf(i) || iter_is_iovec(i);
 }
 
 /*
@@ -380,7 +379,6 @@ static inline void iov_iter_ubuf(struct iov_iter *i, unsigned int direction,
 	*i = (struct iov_iter) {
 		.iter_type = ITER_UBUF,
 		.copy_mc = false,
-		.user_backed = true,
 		.data_source = direction,
 		.ubuf = buf,
 		.count = count,
diff --git a/lib/iov_iter.c b/lib/iov_iter.c
index 27234a820eeb..227c9f536b94 100644
--- a/lib/iov_iter.c
+++ b/lib/iov_iter.c
@@ -290,7 +290,6 @@ void iov_iter_init(struct iov_iter *i, unsigned int direction,
 		.iter_type = ITER_IOVEC,
 		.copy_mc = false,
 		.nofault = false,
-		.user_backed = true,
 		.data_source = direction,
 		.__iov = iov,
 		.nr_segs = nr_segs,
-- 
cgit v1.2.3


From f1982740f5e77090bde41a9b84e257d69ec46598 Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Mon, 25 Sep 2023 13:03:04 +0100
Subject: iov_iter: Convert iterate*() to inline funcs

Convert the iov_iter iteration macros to inline functions to make the code
easier to follow.

The functions are marked __always_inline as we don't want to end up with
indirect calls in the code.  This, however, leaves dealing with ->copy_mc
in an awkard situation since the step function (memcpy_from_iter_mc())
needs to test the flag in the iterator, but isn't passed the iterator.
This will be dealt with in a follow-up patch.

The variable names in the per-type iterator functions have been harmonised
as much as possible and made clearer as to the variable purpose.

The iterator functions are also moved to a header file so that other
operations that need to scan over an iterator can be added.  For instance,
the rbd driver could use this to scan a buffer to see if it is all zeros
and libceph could use this to generate a crc.

Signed-off-by: David Howells <dhowells@redhat.com>
Link: https://lore.kernel.org/r/3710261.1691764329@warthog.procyon.org.uk/ # v1
Link: https://lore.kernel.org/r/855.1692047347@warthog.procyon.org.uk/ # v2
Link: https://lore.kernel.org/r/20230816120741.534415-1-dhowells@redhat.com/ # v3
Link: https://lore.kernel.org/r/20230925120309.1731676-8-dhowells@redhat.com
cc: Alexander Viro <viro@zeniv.linux.org.uk>
cc: Jens Axboe <axboe@kernel.dk>
cc: Christoph Hellwig <hch@lst.de>
cc: Christian Brauner <christian@brauner.io>
cc: Matthew Wilcox <willy@infradead.org>
cc: Linus Torvalds <torvalds@linux-foundation.org>
cc: David Laight <David.Laight@ACULAB.COM>
cc: linux-block@vger.kernel.org
cc: linux-fsdevel@vger.kernel.org
cc: linux-mm@kvack.org
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 include/linux/iov_iter.h | 274 +++++++++++++++++++++++++++++++
 lib/iov_iter.c           | 416 ++++++++++++++++++++---------------------------
 2 files changed, 449 insertions(+), 241 deletions(-)
 create mode 100644 include/linux/iov_iter.h

(limited to 'include/linux')

diff --git a/include/linux/iov_iter.h b/include/linux/iov_iter.h
new file mode 100644
index 000000000000..270454a6703d
--- /dev/null
+++ b/include/linux/iov_iter.h
@@ -0,0 +1,274 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/* I/O iterator iteration building functions.
+ *
+ * Copyright (C) 2023 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ */
+
+#ifndef _LINUX_IOV_ITER_H
+#define _LINUX_IOV_ITER_H
+
+#include <linux/uio.h>
+#include <linux/bvec.h>
+
+typedef size_t (*iov_step_f)(void *iter_base, size_t progress, size_t len,
+			     void *priv, void *priv2);
+typedef size_t (*iov_ustep_f)(void __user *iter_base, size_t progress, size_t len,
+			      void *priv, void *priv2);
+
+/*
+ * Handle ITER_UBUF.
+ */
+static __always_inline
+size_t iterate_ubuf(struct iov_iter *iter, size_t len, void *priv, void *priv2,
+		    iov_ustep_f step)
+{
+	void __user *base = iter->ubuf;
+	size_t progress = 0, remain;
+
+	remain = step(base + iter->iov_offset, 0, len, priv, priv2);
+	progress = len - remain;
+	iter->iov_offset += progress;
+	iter->count -= progress;
+	return progress;
+}
+
+/*
+ * Handle ITER_IOVEC.
+ */
+static __always_inline
+size_t iterate_iovec(struct iov_iter *iter, size_t len, void *priv, void *priv2,
+		     iov_ustep_f step)
+{
+	const struct iovec *p = iter->__iov;
+	size_t progress = 0, skip = iter->iov_offset;
+
+	do {
+		size_t remain, consumed;
+		size_t part = min(len, p->iov_len - skip);
+
+		if (likely(part)) {
+			remain = step(p->iov_base + skip, progress, part, priv, priv2);
+			consumed = part - remain;
+			progress += consumed;
+			skip += consumed;
+			len -= consumed;
+			if (skip < p->iov_len)
+				break;
+		}
+		p++;
+		skip = 0;
+	} while (len);
+
+	iter->nr_segs -= p - iter->__iov;
+	iter->__iov = p;
+	iter->iov_offset = skip;
+	iter->count -= progress;
+	return progress;
+}
+
+/*
+ * Handle ITER_KVEC.
+ */
+static __always_inline
+size_t iterate_kvec(struct iov_iter *iter, size_t len, void *priv, void *priv2,
+		    iov_step_f step)
+{
+	const struct kvec *p = iter->kvec;
+	size_t progress = 0, skip = iter->iov_offset;
+
+	do {
+		size_t remain, consumed;
+		size_t part = min(len, p->iov_len - skip);
+
+		if (likely(part)) {
+			remain = step(p->iov_base + skip, progress, part, priv, priv2);
+			consumed = part - remain;
+			progress += consumed;
+			skip += consumed;
+			len -= consumed;
+			if (skip < p->iov_len)
+				break;
+		}
+		p++;
+		skip = 0;
+	} while (len);
+
+	iter->nr_segs -= p - iter->kvec;
+	iter->kvec = p;
+	iter->iov_offset = skip;
+	iter->count -= progress;
+	return progress;
+}
+
+/*
+ * Handle ITER_BVEC.
+ */
+static __always_inline
+size_t iterate_bvec(struct iov_iter *iter, size_t len, void *priv, void *priv2,
+		    iov_step_f step)
+{
+	const struct bio_vec *p = iter->bvec;
+	size_t progress = 0, skip = iter->iov_offset;
+
+	do {
+		size_t remain, consumed;
+		size_t offset = p->bv_offset + skip, part;
+		void *kaddr = kmap_local_page(p->bv_page + offset / PAGE_SIZE);
+
+		part = min3(len,
+			   (size_t)(p->bv_len - skip),
+			   (size_t)(PAGE_SIZE - offset % PAGE_SIZE));
+		remain = step(kaddr + offset % PAGE_SIZE, progress, part, priv, priv2);
+		kunmap_local(kaddr);
+		consumed = part - remain;
+		len -= consumed;
+		progress += consumed;
+		skip += consumed;
+		if (skip >= p->bv_len) {
+			skip = 0;
+			p++;
+		}
+		if (remain)
+			break;
+	} while (len);
+
+	iter->nr_segs -= p - iter->bvec;
+	iter->bvec = p;
+	iter->iov_offset = skip;
+	iter->count -= progress;
+	return progress;
+}
+
+/*
+ * Handle ITER_XARRAY.
+ */
+static __always_inline
+size_t iterate_xarray(struct iov_iter *iter, size_t len, void *priv, void *priv2,
+		      iov_step_f step)
+{
+	struct folio *folio;
+	size_t progress = 0;
+	loff_t start = iter->xarray_start + iter->iov_offset;
+	pgoff_t index = start / PAGE_SIZE;
+	XA_STATE(xas, iter->xarray, index);
+
+	rcu_read_lock();
+	xas_for_each(&xas, folio, ULONG_MAX) {
+		size_t remain, consumed, offset, part, flen;
+
+		if (xas_retry(&xas, folio))
+			continue;
+		if (WARN_ON(xa_is_value(folio)))
+			break;
+		if (WARN_ON(folio_test_hugetlb(folio)))
+			break;
+
+		offset = offset_in_folio(folio, start + progress);
+		flen = min(folio_size(folio) - offset, len);
+
+		while (flen) {
+			void *base = kmap_local_folio(folio, offset);
+
+			part = min_t(size_t, flen,
+				     PAGE_SIZE - offset_in_page(offset));
+			remain = step(base, progress, part, priv, priv2);
+			kunmap_local(base);
+
+			consumed = part - remain;
+			progress += consumed;
+			len -= consumed;
+
+			if (remain || len == 0)
+				goto out;
+			flen -= consumed;
+			offset += consumed;
+		}
+	}
+
+out:
+	rcu_read_unlock();
+	iter->iov_offset += progress;
+	iter->count -= progress;
+	return progress;
+}
+
+/*
+ * Handle ITER_DISCARD.
+ */
+static __always_inline
+size_t iterate_discard(struct iov_iter *iter, size_t len, void *priv, void *priv2,
+		      iov_step_f step)
+{
+	size_t progress = len;
+
+	iter->count -= progress;
+	return progress;
+}
+
+/**
+ * iterate_and_advance2 - Iterate over an iterator
+ * @iter: The iterator to iterate over.
+ * @len: The amount to iterate over.
+ * @priv: Data for the step functions.
+ * @priv2: More data for the step functions.
+ * @ustep: Function for UBUF/IOVEC iterators; given __user addresses.
+ * @step: Function for other iterators; given kernel addresses.
+ *
+ * Iterate over the next part of an iterator, up to the specified length.  The
+ * buffer is presented in segments, which for kernel iteration are broken up by
+ * physical pages and mapped, with the mapped address being presented.
+ *
+ * Two step functions, @step and @ustep, must be provided, one for handling
+ * mapped kernel addresses and the other is given user addresses which have the
+ * potential to fault since no pinning is performed.
+ *
+ * The step functions are passed the address and length of the segment, @priv,
+ * @priv2 and the amount of data so far iterated over (which can, for example,
+ * be added to @priv to point to the right part of a second buffer).  The step
+ * functions should return the amount of the segment they didn't process (ie. 0
+ * indicates complete processsing).
+ *
+ * This function returns the amount of data processed (ie. 0 means nothing was
+ * processed and the value of @len means processes to completion).
+ */
+static __always_inline
+size_t iterate_and_advance2(struct iov_iter *iter, size_t len, void *priv,
+			    void *priv2, iov_ustep_f ustep, iov_step_f step)
+{
+	if (unlikely(iter->count < len))
+		len = iter->count;
+	if (unlikely(!len))
+		return 0;
+
+	if (likely(iter_is_ubuf(iter)))
+		return iterate_ubuf(iter, len, priv, priv2, ustep);
+	if (likely(iter_is_iovec(iter)))
+		return iterate_iovec(iter, len, priv, priv2, ustep);
+	if (iov_iter_is_bvec(iter))
+		return iterate_bvec(iter, len, priv, priv2, step);
+	if (iov_iter_is_kvec(iter))
+		return iterate_kvec(iter, len, priv, priv2, step);
+	if (iov_iter_is_xarray(iter))
+		return iterate_xarray(iter, len, priv, priv2, step);
+	return iterate_discard(iter, len, priv, priv2, step);
+}
+
+/**
+ * iterate_and_advance - Iterate over an iterator
+ * @iter: The iterator to iterate over.
+ * @len: The amount to iterate over.
+ * @priv: Data for the step functions.
+ * @ustep: Function for UBUF/IOVEC iterators; given __user addresses.
+ * @step: Function for other iterators; given kernel addresses.
+ *
+ * As iterate_and_advance2(), but priv2 is always NULL.
+ */
+static __always_inline
+size_t iterate_and_advance(struct iov_iter *iter, size_t len, void *priv,
+			   iov_ustep_f ustep, iov_step_f step)
+{
+	return iterate_and_advance2(iter, len, priv, NULL, ustep, step);
+}
+
+#endif /* _LINUX_IOV_ITER_H */
diff --git a/lib/iov_iter.c b/lib/iov_iter.c
index 227c9f536b94..65374ee91ecd 100644
--- a/lib/iov_iter.c
+++ b/lib/iov_iter.c
@@ -13,189 +13,69 @@
 #include <net/checksum.h>
 #include <linux/scatterlist.h>
 #include <linux/instrumented.h>
+#include <linux/iov_iter.h>
 
-/* covers ubuf and kbuf alike */
-#define iterate_buf(i, n, base, len, off, __p, STEP) {		\
-	size_t __maybe_unused off = 0;				\
-	len = n;						\
-	base = __p + i->iov_offset;				\
-	len -= (STEP);						\
-	i->iov_offset += len;					\
-	n = len;						\
-}
-
-/* covers iovec and kvec alike */
-#define iterate_iovec(i, n, base, len, off, __p, STEP) {	\
-	size_t off = 0;						\
-	size_t skip = i->iov_offset;				\
-	do {							\
-		len = min(n, __p->iov_len - skip);		\
-		if (likely(len)) {				\
-			base = __p->iov_base + skip;		\
-			len -= (STEP);				\
-			off += len;				\
-			skip += len;				\
-			n -= len;				\
-			if (skip < __p->iov_len)		\
-				break;				\
-		}						\
-		__p++;						\
-		skip = 0;					\
-	} while (n);						\
-	i->iov_offset = skip;					\
-	n = off;						\
-}
-
-#define iterate_bvec(i, n, base, len, off, p, STEP) {		\
-	size_t off = 0;						\
-	unsigned skip = i->iov_offset;				\
-	while (n) {						\
-		unsigned offset = p->bv_offset + skip;		\
-		unsigned left;					\
-		void *kaddr = kmap_local_page(p->bv_page +	\
-					offset / PAGE_SIZE);	\
-		base = kaddr + offset % PAGE_SIZE;		\
-		len = min(min(n, (size_t)(p->bv_len - skip)),	\
-		     (size_t)(PAGE_SIZE - offset % PAGE_SIZE));	\
-		left = (STEP);					\
-		kunmap_local(kaddr);				\
-		len -= left;					\
-		off += len;					\
-		skip += len;					\
-		if (skip == p->bv_len) {			\
-			skip = 0;				\
-			p++;					\
-		}						\
-		n -= len;					\
-		if (left)					\
-			break;					\
-	}							\
-	i->iov_offset = skip;					\
-	n = off;						\
-}
-
-#define iterate_xarray(i, n, base, len, __off, STEP) {		\
-	__label__ __out;					\
-	size_t __off = 0;					\
-	struct folio *folio;					\
-	loff_t start = i->xarray_start + i->iov_offset;		\
-	pgoff_t index = start / PAGE_SIZE;			\
-	XA_STATE(xas, i->xarray, index);			\
-								\
-	len = PAGE_SIZE - offset_in_page(start);		\
-	rcu_read_lock();					\
-	xas_for_each(&xas, folio, ULONG_MAX) {			\
-		unsigned left;					\
-		size_t offset;					\
-		if (xas_retry(&xas, folio))			\
-			continue;				\
-		if (WARN_ON(xa_is_value(folio)))		\
-			break;					\
-		if (WARN_ON(folio_test_hugetlb(folio)))		\
-			break;					\
-		offset = offset_in_folio(folio, start + __off);	\
-		while (offset < folio_size(folio)) {		\
-			base = kmap_local_folio(folio, offset);	\
-			len = min(n, len);			\
-			left = (STEP);				\
-			kunmap_local(base);			\
-			len -= left;				\
-			__off += len;				\
-			n -= len;				\
-			if (left || n == 0)			\
-				goto __out;			\
-			offset += len;				\
-			len = PAGE_SIZE;			\
-		}						\
-	}							\
-__out:								\
-	rcu_read_unlock();					\
-	i->iov_offset += __off;					\
-	n = __off;						\
-}
-
-#define __iterate_and_advance(i, n, base, len, off, I, K) {	\
-	if (unlikely(i->count < n))				\
-		n = i->count;					\
-	if (likely(n)) {					\
-		if (likely(iter_is_ubuf(i))) {			\
-			void __user *base;			\
-			size_t len;				\
-			iterate_buf(i, n, base, len, off,	\
-						i->ubuf, (I)) 	\
-		} else if (likely(iter_is_iovec(i))) {		\
-			const struct iovec *iov = iter_iov(i);	\
-			void __user *base;			\
-			size_t len;				\
-			iterate_iovec(i, n, base, len, off,	\
-						iov, (I))	\
-			i->nr_segs -= iov - iter_iov(i);	\
-			i->__iov = iov;				\
-		} else if (iov_iter_is_bvec(i)) {		\
-			const struct bio_vec *bvec = i->bvec;	\
-			void *base;				\
-			size_t len;				\
-			iterate_bvec(i, n, base, len, off,	\
-						bvec, (K))	\
-			i->nr_segs -= bvec - i->bvec;		\
-			i->bvec = bvec;				\
-		} else if (iov_iter_is_kvec(i)) {		\
-			const struct kvec *kvec = i->kvec;	\
-			void *base;				\
-			size_t len;				\
-			iterate_iovec(i, n, base, len, off,	\
-						kvec, (K))	\
-			i->nr_segs -= kvec - i->kvec;		\
-			i->kvec = kvec;				\
-		} else if (iov_iter_is_xarray(i)) {		\
-			void *base;				\
-			size_t len;				\
-			iterate_xarray(i, n, base, len, off,	\
-							(K))	\
-		}						\
-		i->count -= n;					\
-	}							\
-}
-#define iterate_and_advance(i, n, base, len, off, I, K) \
-	__iterate_and_advance(i, n, base, len, off, I, ((void)(K),0))
-
-static int copyout(void __user *to, const void *from, size_t n)
+static __always_inline
+size_t copy_to_user_iter(void __user *iter_to, size_t progress,
+			 size_t len, void *from, void *priv2)
 {
 	if (should_fail_usercopy())
-		return n;
-	if (access_ok(to, n)) {
-		instrument_copy_to_user(to, from, n);
-		n = raw_copy_to_user(to, from, n);
+		return len;
+	if (access_ok(iter_to, len)) {
+		from += progress;
+		instrument_copy_to_user(iter_to, from, len);
+		len = raw_copy_to_user(iter_to, from, len);
 	}
-	return n;
+	return len;
 }
 
-static int copyout_nofault(void __user *to, const void *from, size_t n)
+static __always_inline
+size_t copy_to_user_iter_nofault(void __user *iter_to, size_t progress,
+				 size_t len, void *from, void *priv2)
 {
-	long res;
+	ssize_t res;
 
 	if (should_fail_usercopy())
-		return n;
-
-	res = copy_to_user_nofault(to, from, n);
+		return len;
 
-	return res < 0 ? n : res;
+	from += progress;
+	res = copy_to_user_nofault(iter_to, from, len);
+	return res < 0 ? len : res;
 }
 
-static int copyin(void *to, const void __user *from, size_t n)
+static __always_inline
+size_t copy_from_user_iter(void __user *iter_from, size_t progress,
+			   size_t len, void *to, void *priv2)
 {
-	size_t res = n;
+	size_t res = len;
 
 	if (should_fail_usercopy())
-		return n;
-	if (access_ok(from, n)) {
-		instrument_copy_from_user_before(to, from, n);
-		res = raw_copy_from_user(to, from, n);
-		instrument_copy_from_user_after(to, from, n, res);
+		return len;
+	if (access_ok(iter_from, len)) {
+		to += progress;
+		instrument_copy_from_user_before(to, iter_from, len);
+		res = raw_copy_from_user(to, iter_from, len);
+		instrument_copy_from_user_after(to, iter_from, len, res);
 	}
 	return res;
 }
 
+static __always_inline
+size_t memcpy_to_iter(void *iter_to, size_t progress,
+		      size_t len, void *from, void *priv2)
+{
+	memcpy(iter_to, from + progress, len);
+	return 0;
+}
+
+static __always_inline
+size_t memcpy_from_iter(void *iter_from, size_t progress,
+			size_t len, void *to, void *priv2)
+{
+	memcpy(to + progress, iter_from, len);
+	return 0;
+}
+
 /*
  * fault_in_iov_iter_readable - fault in iov iterator for reading
  * @i: iterator
@@ -312,23 +192,29 @@ size_t _copy_to_iter(const void *addr, size_t bytes, struct iov_iter *i)
 		return 0;
 	if (user_backed_iter(i))
 		might_fault();
-	iterate_and_advance(i, bytes, base, len, off,
-		copyout(base, addr + off, len),
-		memcpy(base, addr + off, len)
-	)
-
-	return bytes;
+	return iterate_and_advance(i, bytes, (void *)addr,
+				   copy_to_user_iter, memcpy_to_iter);
 }
 EXPORT_SYMBOL(_copy_to_iter);
 
 #ifdef CONFIG_ARCH_HAS_COPY_MC
-static int copyout_mc(void __user *to, const void *from, size_t n)
-{
-	if (access_ok(to, n)) {
-		instrument_copy_to_user(to, from, n);
-		n = copy_mc_to_user((__force void *) to, from, n);
+static __always_inline
+size_t copy_to_user_iter_mc(void __user *iter_to, size_t progress,
+			    size_t len, void *from, void *priv2)
+{
+	if (access_ok(iter_to, len)) {
+		from += progress;
+		instrument_copy_to_user(iter_to, from, len);
+		len = copy_mc_to_user(iter_to, from, len);
 	}
-	return n;
+	return len;
+}
+
+static __always_inline
+size_t memcpy_to_iter_mc(void *iter_to, size_t progress,
+			 size_t len, void *from, void *priv2)
+{
+	return copy_mc_to_kernel(iter_to, from + progress, len);
 }
 
 /**
@@ -361,22 +247,20 @@ size_t _copy_mc_to_iter(const void *addr, size_t bytes, struct iov_iter *i)
 		return 0;
 	if (user_backed_iter(i))
 		might_fault();
-	__iterate_and_advance(i, bytes, base, len, off,
-		copyout_mc(base, addr + off, len),
-		copy_mc_to_kernel(base, addr + off, len)
-	)
-
-	return bytes;
+	return iterate_and_advance(i, bytes, (void *)addr,
+				   copy_to_user_iter_mc, memcpy_to_iter_mc);
 }
 EXPORT_SYMBOL_GPL(_copy_mc_to_iter);
 #endif /* CONFIG_ARCH_HAS_COPY_MC */
 
-static void *memcpy_from_iter(struct iov_iter *i, void *to, const void *from,
-				 size_t size)
+static size_t memcpy_from_iter_mc(void *iter_from, size_t progress,
+				  size_t len, void *to, void *priv2)
 {
-	if (iov_iter_is_copy_mc(i))
-		return (void *)copy_mc_to_kernel(to, from, size);
-	return memcpy(to, from, size);
+	struct iov_iter *iter = priv2;
+
+	if (iov_iter_is_copy_mc(iter))
+		return copy_mc_to_kernel(to + progress, iter_from, len);
+	return memcpy_from_iter(iter_from, progress, len, to, priv2);
 }
 
 size_t _copy_from_iter(void *addr, size_t bytes, struct iov_iter *i)
@@ -386,30 +270,46 @@ size_t _copy_from_iter(void *addr, size_t bytes, struct iov_iter *i)
 
 	if (user_backed_iter(i))
 		might_fault();
-	iterate_and_advance(i, bytes, base, len, off,
-		copyin(addr + off, base, len),
-		memcpy_from_iter(i, addr + off, base, len)
-	)
-
-	return bytes;
+	return iterate_and_advance2(i, bytes, addr, i,
+				    copy_from_user_iter,
+				    memcpy_from_iter_mc);
 }
 EXPORT_SYMBOL(_copy_from_iter);
 
+static __always_inline
+size_t copy_from_user_iter_nocache(void __user *iter_from, size_t progress,
+				   size_t len, void *to, void *priv2)
+{
+	return __copy_from_user_inatomic_nocache(to + progress, iter_from, len);
+}
+
 size_t _copy_from_iter_nocache(void *addr, size_t bytes, struct iov_iter *i)
 {
 	if (WARN_ON_ONCE(!i->data_source))
 		return 0;
 
-	iterate_and_advance(i, bytes, base, len, off,
-		__copy_from_user_inatomic_nocache(addr + off, base, len),
-		memcpy(addr + off, base, len)
-	)
-
-	return bytes;
+	return iterate_and_advance(i, bytes, addr,
+				   copy_from_user_iter_nocache,
+				   memcpy_from_iter);
 }
 EXPORT_SYMBOL(_copy_from_iter_nocache);
 
 #ifdef CONFIG_ARCH_HAS_UACCESS_FLUSHCACHE
+static __always_inline
+size_t copy_from_user_iter_flushcache(void __user *iter_from, size_t progress,
+				      size_t len, void *to, void *priv2)
+{
+	return __copy_from_user_flushcache(to + progress, iter_from, len);
+}
+
+static __always_inline
+size_t memcpy_from_iter_flushcache(void *iter_from, size_t progress,
+				   size_t len, void *to, void *priv2)
+{
+	memcpy_flushcache(to + progress, iter_from, len);
+	return 0;
+}
+
 /**
  * _copy_from_iter_flushcache - write destination through cpu cache
  * @addr: destination kernel address
@@ -431,12 +331,9 @@ size_t _copy_from_iter_flushcache(void *addr, size_t bytes, struct iov_iter *i)
 	if (WARN_ON_ONCE(!i->data_source))
 		return 0;
 
-	iterate_and_advance(i, bytes, base, len, off,
-		__copy_from_user_flushcache(addr + off, base, len),
-		memcpy_flushcache(addr + off, base, len)
-	)
-
-	return bytes;
+	return iterate_and_advance(i, bytes, addr,
+				   copy_from_user_iter_flushcache,
+				   memcpy_from_iter_flushcache);
 }
 EXPORT_SYMBOL_GPL(_copy_from_iter_flushcache);
 #endif
@@ -508,10 +405,9 @@ size_t copy_page_to_iter_nofault(struct page *page, unsigned offset, size_t byte
 		void *kaddr = kmap_local_page(page);
 		size_t n = min(bytes, (size_t)PAGE_SIZE - offset);
 
-		iterate_and_advance(i, n, base, len, off,
-			copyout_nofault(base, kaddr + offset + off, len),
-			memcpy(base, kaddr + offset + off, len)
-		)
+		n = iterate_and_advance(i, bytes, kaddr,
+					copy_to_user_iter_nofault,
+					memcpy_to_iter);
 		kunmap_local(kaddr);
 		res += n;
 		bytes -= n;
@@ -554,14 +450,25 @@ size_t copy_page_from_iter(struct page *page, size_t offset, size_t bytes,
 }
 EXPORT_SYMBOL(copy_page_from_iter);
 
-size_t iov_iter_zero(size_t bytes, struct iov_iter *i)
+static __always_inline
+size_t zero_to_user_iter(void __user *iter_to, size_t progress,
+			 size_t len, void *priv, void *priv2)
 {
-	iterate_and_advance(i, bytes, base, len, count,
-		clear_user(base, len),
-		memset(base, 0, len)
-	)
+	return clear_user(iter_to, len);
+}
 
-	return bytes;
+static __always_inline
+size_t zero_to_iter(void *iter_to, size_t progress,
+		    size_t len, void *priv, void *priv2)
+{
+	memset(iter_to, 0, len);
+	return 0;
+}
+
+size_t iov_iter_zero(size_t bytes, struct iov_iter *i)
+{
+	return iterate_and_advance(i, bytes, NULL,
+				   zero_to_user_iter, zero_to_iter);
 }
 EXPORT_SYMBOL(iov_iter_zero);
 
@@ -586,10 +493,9 @@ size_t copy_page_from_iter_atomic(struct page *page, size_t offset,
 		}
 
 		p = kmap_atomic(page) + offset;
-		iterate_and_advance(i, n, base, len, off,
-			copyin(p + off, base, len),
-			memcpy_from_iter(i, p + off, base, len)
-		)
+		n = iterate_and_advance2(i, n, p, i,
+					 copy_from_user_iter,
+					 memcpy_from_iter_mc);
 		kunmap_atomic(p);
 		copied += n;
 		offset += n;
@@ -1180,32 +1086,64 @@ ssize_t iov_iter_get_pages_alloc2(struct iov_iter *i,
 }
 EXPORT_SYMBOL(iov_iter_get_pages_alloc2);
 
+static __always_inline
+size_t copy_from_user_iter_csum(void __user *iter_from, size_t progress,
+				size_t len, void *to, void *priv2)
+{
+	__wsum next, *csum = priv2;
+
+	next = csum_and_copy_from_user(iter_from, to + progress, len);
+	*csum = csum_block_add(*csum, next, progress);
+	return next ? 0 : len;
+}
+
+static __always_inline
+size_t memcpy_from_iter_csum(void *iter_from, size_t progress,
+			     size_t len, void *to, void *priv2)
+{
+	__wsum *csum = priv2;
+
+	*csum = csum_and_memcpy(to + progress, iter_from, len, *csum, progress);
+	return 0;
+}
+
 size_t csum_and_copy_from_iter(void *addr, size_t bytes, __wsum *csum,
 			       struct iov_iter *i)
 {
-	__wsum sum, next;
-	sum = *csum;
 	if (WARN_ON_ONCE(!i->data_source))
 		return 0;
-
-	iterate_and_advance(i, bytes, base, len, off, ({
-		next = csum_and_copy_from_user(base, addr + off, len);
-		sum = csum_block_add(sum, next, off);
-		next ? 0 : len;
-	}), ({
-		sum = csum_and_memcpy(addr + off, base, len, sum, off);
-	})
-	)
-	*csum = sum;
-	return bytes;
+	return iterate_and_advance2(i, bytes, addr, csum,
+				    copy_from_user_iter_csum,
+				    memcpy_from_iter_csum);
 }
 EXPORT_SYMBOL(csum_and_copy_from_iter);
 
+static __always_inline
+size_t copy_to_user_iter_csum(void __user *iter_to, size_t progress,
+			      size_t len, void *from, void *priv2)
+{
+	__wsum next, *csum = priv2;
+
+	next = csum_and_copy_to_user(from + progress, iter_to, len);
+	*csum = csum_block_add(*csum, next, progress);
+	return next ? 0 : len;
+}
+
+static __always_inline
+size_t memcpy_to_iter_csum(void *iter_to, size_t progress,
+			   size_t len, void *from, void *priv2)
+{
+	__wsum *csum = priv2;
+
+	*csum = csum_and_memcpy(iter_to, from + progress, len, *csum, progress);
+	return 0;
+}
+
 size_t csum_and_copy_to_iter(const void *addr, size_t bytes, void *_csstate,
 			     struct iov_iter *i)
 {
 	struct csum_state *csstate = _csstate;
-	__wsum sum, next;
+	__wsum sum;
 
 	if (WARN_ON_ONCE(i->data_source))
 		return 0;
@@ -1219,14 +1157,10 @@ size_t csum_and_copy_to_iter(const void *addr, size_t bytes, void *_csstate,
 	}
 
 	sum = csum_shift(csstate->csum, csstate->off);
-	iterate_and_advance(i, bytes, base, len, off, ({
-		next = csum_and_copy_to_user(addr + off, base, len);
-		sum = csum_block_add(sum, next, off);
-		next ? 0 : len;
-	}), ({
-		sum = csum_and_memcpy(base, addr + off, len, sum, off);
-	})
-	)
+	
+	bytes = iterate_and_advance2(i, bytes, (void *)addr, &sum,
+				     copy_to_user_iter_csum,
+				     memcpy_to_iter_csum);
 	csstate->csum = csum_shift(sum, csstate->off);
 	csstate->off += bytes;
 	return bytes;
-- 
cgit v1.2.3


From 3acf8ace68230e9558cf916847f1cc9f208abdf1 Mon Sep 17 00:00:00 2001
From: Jiri Olsa <jolsa@kernel.org>
Date: Wed, 20 Sep 2023 23:31:39 +0200
Subject: bpf: Add missed value to kprobe perf link info

Add missed value to kprobe attached through perf link info to
hold the stats of missed kprobe handler execution.

The kprobe's missed counter gets incremented when kprobe handler
is not executed due to another kprobe running on the same cpu.

Signed-off-by: Jiri Olsa <jolsa@kernel.org>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/bpf/20230920213145.1941596-4-jolsa@kernel.org
---
 include/linux/trace_events.h   |  6 ++++--
 include/uapi/linux/bpf.h       |  1 +
 kernel/bpf/syscall.c           | 14 ++++++++------
 kernel/trace/bpf_trace.c       |  5 +++--
 kernel/trace/trace_kprobe.c    | 14 +++++++++++---
 tools/include/uapi/linux/bpf.h |  1 +
 6 files changed, 28 insertions(+), 13 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h
index 21ae37e49319..5eb88a66eb68 100644
--- a/include/linux/trace_events.h
+++ b/include/linux/trace_events.h
@@ -761,7 +761,8 @@ struct bpf_raw_event_map *bpf_get_raw_tracepoint(const char *name);
 void bpf_put_raw_tracepoint(struct bpf_raw_event_map *btp);
 int bpf_get_perf_event_info(const struct perf_event *event, u32 *prog_id,
 			    u32 *fd_type, const char **buf,
-			    u64 *probe_offset, u64 *probe_addr);
+			    u64 *probe_offset, u64 *probe_addr,
+			    unsigned long *missed);
 int bpf_kprobe_multi_link_attach(const union bpf_attr *attr, struct bpf_prog *prog);
 int bpf_uprobe_multi_link_attach(const union bpf_attr *attr, struct bpf_prog *prog);
 #else
@@ -801,7 +802,7 @@ static inline void bpf_put_raw_tracepoint(struct bpf_raw_event_map *btp)
 static inline int bpf_get_perf_event_info(const struct perf_event *event,
 					  u32 *prog_id, u32 *fd_type,
 					  const char **buf, u64 *probe_offset,
-					  u64 *probe_addr)
+					  u64 *probe_addr, unsigned long *missed)
 {
 	return -EOPNOTSUPP;
 }
@@ -877,6 +878,7 @@ extern void perf_kprobe_destroy(struct perf_event *event);
 extern int bpf_get_kprobe_info(const struct perf_event *event,
 			       u32 *fd_type, const char **symbol,
 			       u64 *probe_offset, u64 *probe_addr,
+			       unsigned long *missed,
 			       bool perf_type_tracepoint);
 #endif
 #ifdef CONFIG_UPROBE_EVENTS
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 1da5b1bcce71..70bfa997e896 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -6548,6 +6548,7 @@ struct bpf_link_info {
 					__u32 name_len;
 					__u32 offset; /* offset from func_name */
 					__u64 addr;
+					__u64 missed;
 				} kprobe; /* BPF_PERF_EVENT_KPROBE, BPF_PERF_EVENT_KRETPROBE */
 				struct {
 					__aligned_u64 tp_name;   /* in/out */
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 85c1d908f70f..6b5280f14a53 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -3374,7 +3374,7 @@ static void bpf_perf_link_dealloc(struct bpf_link *link)
 static int bpf_perf_link_fill_common(const struct perf_event *event,
 				     char __user *uname, u32 ulen,
 				     u64 *probe_offset, u64 *probe_addr,
-				     u32 *fd_type)
+				     u32 *fd_type, unsigned long *missed)
 {
 	const char *buf;
 	u32 prog_id;
@@ -3385,7 +3385,7 @@ static int bpf_perf_link_fill_common(const struct perf_event *event,
 		return -EINVAL;
 
 	err = bpf_get_perf_event_info(event, &prog_id, fd_type, &buf,
-				      probe_offset, probe_addr);
+				      probe_offset, probe_addr, missed);
 	if (err)
 		return err;
 	if (!uname)
@@ -3408,6 +3408,7 @@ static int bpf_perf_link_fill_common(const struct perf_event *event,
 static int bpf_perf_link_fill_kprobe(const struct perf_event *event,
 				     struct bpf_link_info *info)
 {
+	unsigned long missed;
 	char __user *uname;
 	u64 addr, offset;
 	u32 ulen, type;
@@ -3416,7 +3417,7 @@ static int bpf_perf_link_fill_kprobe(const struct perf_event *event,
 	uname = u64_to_user_ptr(info->perf_event.kprobe.func_name);
 	ulen = info->perf_event.kprobe.name_len;
 	err = bpf_perf_link_fill_common(event, uname, ulen, &offset, &addr,
-					&type);
+					&type, &missed);
 	if (err)
 		return err;
 	if (type == BPF_FD_TYPE_KRETPROBE)
@@ -3425,6 +3426,7 @@ static int bpf_perf_link_fill_kprobe(const struct perf_event *event,
 		info->perf_event.type = BPF_PERF_EVENT_KPROBE;
 
 	info->perf_event.kprobe.offset = offset;
+	info->perf_event.kprobe.missed = missed;
 	if (!kallsyms_show_value(current_cred()))
 		addr = 0;
 	info->perf_event.kprobe.addr = addr;
@@ -3444,7 +3446,7 @@ static int bpf_perf_link_fill_uprobe(const struct perf_event *event,
 	uname = u64_to_user_ptr(info->perf_event.uprobe.file_name);
 	ulen = info->perf_event.uprobe.name_len;
 	err = bpf_perf_link_fill_common(event, uname, ulen, &offset, &addr,
-					&type);
+					&type, NULL);
 	if (err)
 		return err;
 
@@ -3480,7 +3482,7 @@ static int bpf_perf_link_fill_tracepoint(const struct perf_event *event,
 	uname = u64_to_user_ptr(info->perf_event.tracepoint.tp_name);
 	ulen = info->perf_event.tracepoint.name_len;
 	info->perf_event.type = BPF_PERF_EVENT_TRACEPOINT;
-	return bpf_perf_link_fill_common(event, uname, ulen, NULL, NULL, NULL);
+	return bpf_perf_link_fill_common(event, uname, ulen, NULL, NULL, NULL, NULL);
 }
 
 static int bpf_perf_link_fill_perf_event(const struct perf_event *event,
@@ -4813,7 +4815,7 @@ static int bpf_task_fd_query(const union bpf_attr *attr,
 
 		err = bpf_get_perf_event_info(event, &prog_id, &fd_type,
 					      &buf, &probe_offset,
-					      &probe_addr);
+					      &probe_addr, NULL);
 		if (!err)
 			err = bpf_task_fd_query_copy(attr, uattr, prog_id,
 						     fd_type, buf,
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index 6aec6e7d612a..f6a7d2524949 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -2384,7 +2384,8 @@ int bpf_probe_unregister(struct bpf_raw_event_map *btp, struct bpf_prog *prog)
 
 int bpf_get_perf_event_info(const struct perf_event *event, u32 *prog_id,
 			    u32 *fd_type, const char **buf,
-			    u64 *probe_offset, u64 *probe_addr)
+			    u64 *probe_offset, u64 *probe_addr,
+			    unsigned long *missed)
 {
 	bool is_tracepoint, is_syscall_tp;
 	struct bpf_prog *prog;
@@ -2419,7 +2420,7 @@ int bpf_get_perf_event_info(const struct perf_event *event, u32 *prog_id,
 #ifdef CONFIG_KPROBE_EVENTS
 		if (flags & TRACE_EVENT_FL_KPROBE)
 			err = bpf_get_kprobe_info(event, fd_type, buf,
-						  probe_offset, probe_addr,
+						  probe_offset, probe_addr, missed,
 						  event->attr.type == PERF_TYPE_TRACEPOINT);
 #endif
 #ifdef CONFIG_UPROBE_EVENTS
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index 3d7a180a8427..961a78ffd6d2 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -1189,6 +1189,12 @@ static const struct file_operations kprobe_events_ops = {
 	.write		= probes_write,
 };
 
+static unsigned long trace_kprobe_missed(struct trace_kprobe *tk)
+{
+	return trace_kprobe_is_return(tk) ?
+		tk->rp.kp.nmissed + tk->rp.nmissed : tk->rp.kp.nmissed;
+}
+
 /* Probes profiling interfaces */
 static int probes_profile_seq_show(struct seq_file *m, void *v)
 {
@@ -1200,8 +1206,7 @@ static int probes_profile_seq_show(struct seq_file *m, void *v)
 		return 0;
 
 	tk = to_trace_kprobe(ev);
-	nmissed = trace_kprobe_is_return(tk) ?
-		tk->rp.kp.nmissed + tk->rp.nmissed : tk->rp.kp.nmissed;
+	nmissed = trace_kprobe_missed(tk);
 	seq_printf(m, "  %-44s %15lu %15lu\n",
 		   trace_probe_name(&tk->tp),
 		   trace_kprobe_nhit(tk),
@@ -1547,7 +1552,8 @@ NOKPROBE_SYMBOL(kretprobe_perf_func);
 
 int bpf_get_kprobe_info(const struct perf_event *event, u32 *fd_type,
 			const char **symbol, u64 *probe_offset,
-			u64 *probe_addr, bool perf_type_tracepoint)
+			u64 *probe_addr, unsigned long *missed,
+			bool perf_type_tracepoint)
 {
 	const char *pevent = trace_event_name(event->tp_event);
 	const char *group = event->tp_event->class->system;
@@ -1566,6 +1572,8 @@ int bpf_get_kprobe_info(const struct perf_event *event, u32 *fd_type,
 	*probe_addr = kallsyms_show_value(current_cred()) ?
 		      (unsigned long)tk->rp.kp.addr : 0;
 	*symbol = tk->symbol;
+	if (missed)
+		*missed = trace_kprobe_missed(tk);
 	return 0;
 }
 #endif	/* CONFIG_PERF_EVENTS */
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index 1da5b1bcce71..70bfa997e896 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -6548,6 +6548,7 @@ struct bpf_link_info {
 					__u32 name_len;
 					__u32 offset; /* offset from func_name */
 					__u64 addr;
+					__u64 missed;
 				} kprobe; /* BPF_PERF_EVENT_KPROBE, BPF_PERF_EVENT_KRETPROBE */
 				struct {
 					__aligned_u64 tp_name;   /* in/out */
-- 
cgit v1.2.3


From dd8657894c11b03c6eb0fd53fe9d7fec2072d18b Mon Sep 17 00:00:00 2001
From: Jiri Olsa <jolsa@kernel.org>
Date: Wed, 20 Sep 2023 23:31:40 +0200
Subject: bpf: Count missed stats in trace_call_bpf

Increase misses stats in case bpf array execution is skipped
because of recursion check in trace_call_bpf.

Adding bpf_prog_inc_misses_counters that increase misses
counts for all bpf programs in bpf_prog_array.

Signed-off-by: Jiri Olsa <jolsa@kernel.org>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Tested-by: Song Liu <song@kernel.org>
Reviewed-by: Song Liu <song@kernel.org>
Link: https://lore.kernel.org/bpf/20230920213145.1941596-5-jolsa@kernel.org
---
 include/linux/bpf.h      | 16 ++++++++++++++++
 kernel/trace/bpf_trace.c |  3 +++
 2 files changed, 19 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 30063a760b5a..a82efd34b741 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -2922,6 +2922,22 @@ static inline int sock_map_bpf_prog_query(const union bpf_attr *attr,
 #endif /* CONFIG_BPF_SYSCALL */
 #endif /* CONFIG_NET && CONFIG_BPF_SYSCALL */
 
+static __always_inline void
+bpf_prog_inc_misses_counters(const struct bpf_prog_array *array)
+{
+	const struct bpf_prog_array_item *item;
+	struct bpf_prog *prog;
+
+	if (unlikely(!array))
+		return;
+
+	item = &array->items[0];
+	while ((prog = READ_ONCE(item->prog))) {
+		bpf_prog_inc_misses_counter(prog);
+		item++;
+	}
+}
+
 #if defined(CONFIG_INET) && defined(CONFIG_BPF_SYSCALL)
 void bpf_sk_reuseport_detach(struct sock *sk);
 int bpf_fd_reuseport_array_lookup_elem(struct bpf_map *map, void *key,
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index f6a7d2524949..df697c74d519 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -117,6 +117,9 @@ unsigned int trace_call_bpf(struct trace_event_call *call, void *ctx)
 		 * and don't send kprobe event into ring-buffer,
 		 * so return zero here
 		 */
+		rcu_read_lock();
+		bpf_prog_inc_misses_counters(rcu_dereference(call->prog_array));
+		rcu_read_unlock();
 		ret = 0;
 		goto out;
 	}
-- 
cgit v1.2.3


From 7a0263dc904f3467f474e4088ae092eda9a5a99b Mon Sep 17 00:00:00 2001
From: Eric Biggers <ebiggers@google.com>
Date: Sun, 24 Sep 2023 22:54:50 -0700
Subject: fscrypt: replace get_ino_and_lblk_bits with just has_32bit_inodes

Now that fs/crypto/ computes the filesystem's lblk_bits from its maximum
file size, it is no longer necessary for filesystems to provide
lblk_bits via fscrypt_operations::get_ino_and_lblk_bits.

It is still necessary for fs/crypto/ to retrieve ino_bits from the
filesystem.  However, this is used only to decide whether inode numbers
fit in 32 bits.  Also, ino_bits is static for all relevant filesystems,
i.e. it doesn't depend on the filesystem instance.

Therefore, in the interest of keeping things as simple as possible,
replace 'get_ino_and_lblk_bits' with a flag 'has_32bit_inodes'.  This
can always be changed back to a function if a filesystem needs it to be
dynamic, but for now a static flag is all that's needed.

Link: https://lore.kernel.org/r/20230925055451.59499-5-ebiggers@kernel.org
Signed-off-by: Eric Biggers <ebiggers@google.com>
---
 fs/crypto/policy.c      | 33 +++++++++++++++------------------
 fs/ext4/crypto.c        |  9 +--------
 fs/f2fs/super.c         |  9 +--------
 include/linux/fscrypt.h | 26 +++++++++++---------------
 4 files changed, 28 insertions(+), 49 deletions(-)

(limited to 'include/linux')

diff --git a/fs/crypto/policy.c b/fs/crypto/policy.c
index 7b34949e49de..32709dad9762 100644
--- a/fs/crypto/policy.c
+++ b/fs/crypto/policy.c
@@ -118,11 +118,11 @@ static bool supported_direct_key_modes(const struct inode *inode,
 }
 
 static bool supported_iv_ino_lblk_policy(const struct fscrypt_policy_v2 *policy,
-					 const struct inode *inode,
-					 const char *type, int max_ino_bits)
+					 const struct inode *inode)
 {
+	const char *type = (policy->flags & FSCRYPT_POLICY_FLAG_IV_INO_LBLK_64)
+				? "IV_INO_LBLK_64" : "IV_INO_LBLK_32";
 	struct super_block *sb = inode->i_sb;
-	int ino_bits = 64, lblk_bits = 64;
 
 	/*
 	 * IV_INO_LBLK_* exist only because of hardware limitations, and
@@ -149,9 +149,15 @@ static bool supported_iv_ino_lblk_policy(const struct fscrypt_policy_v2 *policy,
 			     type, sb->s_id);
 		return false;
 	}
-	if (sb->s_cop->get_ino_and_lblk_bits)
-		sb->s_cop->get_ino_and_lblk_bits(sb, &ino_bits, &lblk_bits);
-	if (ino_bits > max_ino_bits) {
+
+	/*
+	 * IV_INO_LBLK_64 and IV_INO_LBLK_32 both require that inode numbers fit
+	 * in 32 bits.  In principle, IV_INO_LBLK_32 could support longer inode
+	 * numbers because it hashes the inode number; however, currently the
+	 * inode number is gotten from inode::i_ino which is 'unsigned long'.
+	 * So for now the implementation limit is 32 bits.
+	 */
+	if (!sb->s_cop->has_32bit_inodes) {
 		fscrypt_warn(inode,
 			     "Can't use %s policy on filesystem '%s' because its inode numbers are too long",
 			     type, sb->s_id);
@@ -242,18 +248,9 @@ static bool fscrypt_supported_v2_policy(const struct fscrypt_policy_v2 *policy,
 					policy->filenames_encryption_mode))
 		return false;
 
-	if ((policy->flags & FSCRYPT_POLICY_FLAG_IV_INO_LBLK_64) &&
-	    !supported_iv_ino_lblk_policy(policy, inode, "IV_INO_LBLK_64", 32))
-		return false;
-
-	/*
-	 * IV_INO_LBLK_32 hashes the inode number, so in principle it can
-	 * support any ino_bits.  However, currently the inode number is gotten
-	 * from inode::i_ino which is 'unsigned long'.  So for now the
-	 * implementation limit is 32 bits.
-	 */
-	if ((policy->flags & FSCRYPT_POLICY_FLAG_IV_INO_LBLK_32) &&
-	    !supported_iv_ino_lblk_policy(policy, inode, "IV_INO_LBLK_32", 32))
+	if ((policy->flags & (FSCRYPT_POLICY_FLAG_IV_INO_LBLK_64 |
+			      FSCRYPT_POLICY_FLAG_IV_INO_LBLK_32)) &&
+	    !supported_iv_ino_lblk_policy(policy, inode))
 		return false;
 
 	if (memchr_inv(policy->__reserved, 0, sizeof(policy->__reserved))) {
diff --git a/fs/ext4/crypto.c b/fs/ext4/crypto.c
index 5cd7bcfae46b..9e36731701ba 100644
--- a/fs/ext4/crypto.c
+++ b/fs/ext4/crypto.c
@@ -232,20 +232,13 @@ static bool ext4_has_stable_inodes(struct super_block *sb)
 	return ext4_has_feature_stable_inodes(sb);
 }
 
-static void ext4_get_ino_and_lblk_bits(struct super_block *sb,
-				       int *ino_bits_ret, int *lblk_bits_ret)
-{
-	*ino_bits_ret = 8 * sizeof(EXT4_SB(sb)->s_es->s_inodes_count);
-	*lblk_bits_ret = 8 * sizeof(ext4_lblk_t);
-}
-
 const struct fscrypt_operations ext4_cryptops = {
 	.needs_bounce_pages	= 1,
+	.has_32bit_inodes	= 1,
 	.legacy_key_prefix	= "ext4:",
 	.get_context		= ext4_get_context,
 	.set_context		= ext4_set_context,
 	.get_dummy_policy	= ext4_get_dummy_policy,
 	.empty_dir		= ext4_empty_dir,
 	.has_stable_inodes	= ext4_has_stable_inodes,
-	.get_ino_and_lblk_bits	= ext4_get_ino_and_lblk_bits,
 };
diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c
index 55aa0ed531f2..c44915713264 100644
--- a/fs/f2fs/super.c
+++ b/fs/f2fs/super.c
@@ -3203,13 +3203,6 @@ static bool f2fs_has_stable_inodes(struct super_block *sb)
 	return true;
 }
 
-static void f2fs_get_ino_and_lblk_bits(struct super_block *sb,
-				       int *ino_bits_ret, int *lblk_bits_ret)
-{
-	*ino_bits_ret = 8 * sizeof(nid_t);
-	*lblk_bits_ret = 8 * sizeof(block_t);
-}
-
 static struct block_device **f2fs_get_devices(struct super_block *sb,
 					      unsigned int *num_devs)
 {
@@ -3232,13 +3225,13 @@ static struct block_device **f2fs_get_devices(struct super_block *sb,
 
 static const struct fscrypt_operations f2fs_cryptops = {
 	.needs_bounce_pages	= 1,
+	.has_32bit_inodes	= 1,
 	.legacy_key_prefix	= "f2fs:",
 	.get_context		= f2fs_get_context,
 	.set_context		= f2fs_set_context,
 	.get_dummy_policy	= f2fs_get_dummy_policy,
 	.empty_dir		= f2fs_empty_dir,
 	.has_stable_inodes	= f2fs_has_stable_inodes,
-	.get_ino_and_lblk_bits	= f2fs_get_ino_and_lblk_bits,
 	.get_devices		= f2fs_get_devices,
 };
 #endif
diff --git a/include/linux/fscrypt.h b/include/linux/fscrypt.h
index 4505078e89b7..09a3cacbf62a 100644
--- a/include/linux/fscrypt.h
+++ b/include/linux/fscrypt.h
@@ -74,6 +74,17 @@ struct fscrypt_operations {
 	 */
 	unsigned int needs_bounce_pages : 1;
 
+	/*
+	 * If set, then fs/crypto/ will allow the use of encryption settings
+	 * that assume inode numbers fit in 32 bits (i.e.
+	 * FSCRYPT_POLICY_FLAG_IV_INO_LBLK_{32,64}), provided that the other
+	 * prerequisites for these settings are also met.  This is only useful
+	 * if the filesystem wants to support inline encryption hardware that is
+	 * limited to 32-bit or 64-bit data unit numbers and where programming
+	 * keyslots is very slow.
+	 */
+	unsigned int has_32bit_inodes : 1;
+
 	/*
 	 * This field exists only for backwards compatibility reasons and should
 	 * only be set by the filesystems that are setting it already.  It
@@ -151,21 +162,6 @@ struct fscrypt_operations {
 	 */
 	bool (*has_stable_inodes)(struct super_block *sb);
 
-	/*
-	 * Get the number of bits that the filesystem uses to represent inode
-	 * numbers and file logical block numbers.
-	 *
-	 * By default, both of these are assumed to be 64-bit.  This function
-	 * can be implemented to declare that either or both of these numbers is
-	 * shorter, which may allow the use of the
-	 * FSCRYPT_POLICY_FLAG_IV_INO_LBLK_{32,64} flags and/or the use of
-	 * inline crypto hardware whose maximum DUN length is less than 64 bits
-	 * (e.g., eMMC v5.2 spec compliant hardware).  This function only needs
-	 * to be implemented if support for one of these features is needed.
-	 */
-	void (*get_ino_and_lblk_bits)(struct super_block *sb,
-				      int *ino_bits_ret, int *lblk_bits_ret);
-
 	/*
 	 * Return an array of pointers to the block devices to which the
 	 * filesystem may write encrypted file contents, NULL if the filesystem
-- 
cgit v1.2.3


From 5b11888471806edf699316d4dcb9b426caebbef2 Mon Sep 17 00:00:00 2001
From: Eric Biggers <ebiggers@google.com>
Date: Sun, 24 Sep 2023 22:54:51 -0700
Subject: fscrypt: support crypto data unit size less than filesystem block
 size

Until now, fscrypt has always used the filesystem block size as the
granularity of file contents encryption.  Two scenarios have come up
where a sub-block granularity of contents encryption would be useful:

1. Inline crypto hardware that only supports a crypto data unit size
   that is less than the filesystem block size.

2. Support for direct I/O at a granularity less than the filesystem
   block size, for example at the block device's logical block size in
   order to match the traditional direct I/O alignment requirement.

(1) first came up with older eMMC inline crypto hardware that only
supports a crypto data unit size of 512 bytes.  That specific case
ultimately went away because all systems with that hardware continued
using out of tree code and never actually upgraded to the upstream
inline crypto framework.  But, now it's coming back in a new way: some
current UFS controllers only support a data unit size of 4096 bytes, and
there is a proposal to increase the filesystem block size to 16K.

(2) was discussed as a "nice to have" feature, though not essential,
when support for direct I/O on encrypted files was being upstreamed.

Still, the fact that this feature has come up several times does suggest
it would be wise to have available.  Therefore, this patch implements it
by using one of the reserved bytes in fscrypt_policy_v2 to allow users
to select a sub-block data unit size.  Supported data unit sizes are
powers of 2 between 512 and the filesystem block size, inclusively.
Support is implemented for both the FS-layer and inline crypto cases.

This patch focuses on the basic support for sub-block data units.  Some
things are out of scope for this patch but may be addressed later:

- Supporting sub-block data units in combination with
  FSCRYPT_POLICY_FLAG_IV_INO_LBLK_64, in most cases.  Unfortunately this
  combination usually causes data unit indices to exceed 32 bits, and
  thus fscrypt_supported_policy() correctly disallows it.  The users who
  potentially need this combination are using f2fs.  To support it, f2fs
  would need to provide an option to slightly reduce its max file size.

- Supporting sub-block data units in combination with
  FSCRYPT_POLICY_FLAG_IV_INO_LBLK_32.  This has the same problem
  described above, but also it will need special code to make DUN
  wraparound still happen on a FS block boundary.

- Supporting use case (2) mentioned above.  The encrypted direct I/O
  code will need to stop requiring and assuming FS block alignment.
  This won't be hard, but it belongs in a separate patch.

- Supporting this feature on filesystems other than ext4 and f2fs.
  (Filesystems declare support for it via their fscrypt_operations.)
  On UBIFS, sub-block data units don't make sense because UBIFS encrypts
  variable-length blocks as a result of compression.  CephFS could
  support it, but a bit more work would be needed to make the
  fscrypt_*_block_inplace functions play nicely with sub-block data
  units.  I don't think there's a use case for this on CephFS anyway.

Link: https://lore.kernel.org/r/20230925055451.59499-6-ebiggers@kernel.org
Signed-off-by: Eric Biggers <ebiggers@google.com>
---
 Documentation/filesystems/fscrypt.rst | 117 ++++++++++++++++++++--------
 fs/crypto/bio.c                       |  39 +++++-----
 fs/crypto/crypto.c                    | 139 ++++++++++++++++++----------------
 fs/crypto/fscrypt_private.h           |  56 +++++++++++---
 fs/crypto/inline_crypt.c              |  14 ++--
 fs/crypto/keysetup.c                  |   5 ++
 fs/crypto/policy.c                    |  34 ++++++++-
 fs/ext4/crypto.c                      |   1 +
 fs/f2fs/super.c                       |   1 +
 include/linux/fscrypt.h               |  12 +++
 include/uapi/linux/fscrypt.h          |   3 +-
 11 files changed, 288 insertions(+), 133 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/filesystems/fscrypt.rst b/Documentation/filesystems/fscrypt.rst
index a624e92f2687..28700fb41a00 100644
--- a/Documentation/filesystems/fscrypt.rst
+++ b/Documentation/filesystems/fscrypt.rst
@@ -261,9 +261,9 @@ DIRECT_KEY policies
 
 The Adiantum encryption mode (see `Encryption modes and usage`_) is
 suitable for both contents and filenames encryption, and it accepts
-long IVs --- long enough to hold both an 8-byte logical block number
-and a 16-byte per-file nonce.  Also, the overhead of each Adiantum key
-is greater than that of an AES-256-XTS key.
+long IVs --- long enough to hold both an 8-byte data unit index and a
+16-byte per-file nonce.  Also, the overhead of each Adiantum key is
+greater than that of an AES-256-XTS key.
 
 Therefore, to improve performance and save memory, for Adiantum a
 "direct key" configuration is supported.  When the user has enabled
@@ -300,8 +300,8 @@ IV_INO_LBLK_32 policies
 
 IV_INO_LBLK_32 policies work like IV_INO_LBLK_64, except that for
 IV_INO_LBLK_32, the inode number is hashed with SipHash-2-4 (where the
-SipHash key is derived from the master key) and added to the file
-logical block number mod 2^32 to produce a 32-bit IV.
+SipHash key is derived from the master key) and added to the file data
+unit index mod 2^32 to produce a 32-bit IV.
 
 This format is optimized for use with inline encryption hardware
 compliant with the eMMC v5.2 standard, which supports only 32 IV bits
@@ -451,31 +451,62 @@ acceleration is recommended:
 Contents encryption
 -------------------
 
-For file contents, each filesystem block is encrypted independently.
-Starting from Linux kernel 5.5, encryption of filesystems with block
-size less than system's page size is supported.
-
-Each block's IV is set to the logical block number within the file as
-a little endian number, except that:
-
-- With CBC mode encryption, ESSIV is also used.  Specifically, each IV
-  is encrypted with AES-256 where the AES-256 key is the SHA-256 hash
-  of the file's data encryption key.
-
-- With `DIRECT_KEY policies`_, the file's nonce is appended to the IV.
-  Currently this is only allowed with the Adiantum encryption mode.
-
-- With `IV_INO_LBLK_64 policies`_, the logical block number is limited
-  to 32 bits and is placed in bits 0-31 of the IV.  The inode number
-  (which is also limited to 32 bits) is placed in bits 32-63.
-
-- With `IV_INO_LBLK_32 policies`_, the logical block number is limited
-  to 32 bits and is placed in bits 0-31 of the IV.  The inode number
-  is then hashed and added mod 2^32.
-
-Note that because file logical block numbers are included in the IVs,
-filesystems must enforce that blocks are never shifted around within
-encrypted files, e.g. via "collapse range" or "insert range".
+For contents encryption, each file's contents is divided into "data
+units".  Each data unit is encrypted independently.  The IV for each
+data unit incorporates the zero-based index of the data unit within
+the file.  This ensures that each data unit within a file is encrypted
+differently, which is essential to prevent leaking information.
+
+Note: the encryption depending on the offset into the file means that
+operations like "collapse range" and "insert range" that rearrange the
+extent mapping of files are not supported on encrypted files.
+
+There are two cases for the sizes of the data units:
+
+* Fixed-size data units.  This is how all filesystems other than UBIFS
+  work.  A file's data units are all the same size; the last data unit
+  is zero-padded if needed.  By default, the data unit size is equal
+  to the filesystem block size.  On some filesystems, users can select
+  a sub-block data unit size via the ``log2_data_unit_size`` field of
+  the encryption policy; see `FS_IOC_SET_ENCRYPTION_POLICY`_.
+
+* Variable-size data units.  This is what UBIFS does.  Each "UBIFS
+  data node" is treated as a crypto data unit.  Each contains variable
+  length, possibly compressed data, zero-padded to the next 16-byte
+  boundary.  Users cannot select a sub-block data unit size on UBIFS.
+
+In the case of compression + encryption, the compressed data is
+encrypted.  UBIFS compression works as described above.  f2fs
+compression works a bit differently; it compresses a number of
+filesystem blocks into a smaller number of filesystem blocks.
+Therefore a f2fs-compressed file still uses fixed-size data units, and
+it is encrypted in a similar way to a file containing holes.
+
+As mentioned in `Key hierarchy`_, the default encryption setting uses
+per-file keys.  In this case, the IV for each data unit is simply the
+index of the data unit in the file.  However, users can select an
+encryption setting that does not use per-file keys.  For these, some
+kind of file identifier is incorporated into the IVs as follows:
+
+- With `DIRECT_KEY policies`_, the data unit index is placed in bits
+  0-63 of the IV, and the file's nonce is placed in bits 64-191.
+
+- With `IV_INO_LBLK_64 policies`_, the data unit index is placed in
+  bits 0-31 of the IV, and the file's inode number is placed in bits
+  32-63.  This setting is only allowed when data unit indices and
+  inode numbers fit in 32 bits.
+
+- With `IV_INO_LBLK_32 policies`_, the file's inode number is hashed
+  and added to the data unit index.  The resulting value is truncated
+  to 32 bits and placed in bits 0-31 of the IV.  This setting is only
+  allowed when data unit indices and inode numbers fit in 32 bits.
+
+The byte order of the IV is always little endian.
+
+If the user selects FSCRYPT_MODE_AES_128_CBC for the contents mode, an
+ESSIV layer is automatically included.  In this case, before the IV is
+passed to AES-128-CBC, it is encrypted with AES-256 where the AES-256
+key is the SHA-256 hash of the file's contents encryption key.
 
 Filenames encryption
 --------------------
@@ -544,7 +575,8 @@ follows::
             __u8 contents_encryption_mode;
             __u8 filenames_encryption_mode;
             __u8 flags;
-            __u8 __reserved[4];
+            __u8 log2_data_unit_size;
+            __u8 __reserved[3];
             __u8 master_key_identifier[FSCRYPT_KEY_IDENTIFIER_SIZE];
     };
 
@@ -586,6 +618,29 @@ This structure must be initialized as follows:
   The DIRECT_KEY, IV_INO_LBLK_64, and IV_INO_LBLK_32 flags are
   mutually exclusive.
 
+- ``log2_data_unit_size`` is the log2 of the data unit size in bytes,
+  or 0 to select the default data unit size.  The data unit size is
+  the granularity of file contents encryption.  For example, setting
+  ``log2_data_unit_size`` to 12 causes file contents be passed to the
+  underlying encryption algorithm (such as AES-256-XTS) in 4096-byte
+  data units, each with its own IV.
+
+  Not all filesystems support setting ``log2_data_unit_size``.  ext4
+  and f2fs support it since Linux v6.7.  On filesystems that support
+  it, the supported nonzero values are 9 through the log2 of the
+  filesystem block size, inclusively.  The default value of 0 selects
+  the filesystem block size.
+
+  The main use case for ``log2_data_unit_size`` is for selecting a
+  data unit size smaller than the filesystem block size for
+  compatibility with inline encryption hardware that only supports
+  smaller data unit sizes.  ``/sys/block/$disk/queue/crypto/`` may be
+  useful for checking which data unit sizes are supported by a
+  particular system's inline encryption hardware.
+
+  Leave this field zeroed unless you are certain you need it.  Using
+  an unnecessarily small data unit size reduces performance.
+
 - For v2 encryption policies, ``__reserved`` must be zeroed.
 
 - For v1 encryption policies, ``master_key_descriptor`` specifies how
diff --git a/fs/crypto/bio.c b/fs/crypto/bio.c
index 62e1a3dd8357..c8cf77065272 100644
--- a/fs/crypto/bio.c
+++ b/fs/crypto/bio.c
@@ -111,10 +111,14 @@ out:
 int fscrypt_zeroout_range(const struct inode *inode, pgoff_t lblk,
 			  sector_t pblk, unsigned int len)
 {
-	const unsigned int blockbits = inode->i_blkbits;
-	const unsigned int blocksize = 1 << blockbits;
-	const unsigned int blocks_per_page_bits = PAGE_SHIFT - blockbits;
-	const unsigned int blocks_per_page = 1 << blocks_per_page_bits;
+	const struct fscrypt_info *ci = inode->i_crypt_info;
+	const unsigned int du_bits = ci->ci_data_unit_bits;
+	const unsigned int du_size = 1U << du_bits;
+	const unsigned int du_per_page_bits = PAGE_SHIFT - du_bits;
+	const unsigned int du_per_page = 1U << du_per_page_bits;
+	u64 du_index = (u64)lblk << (inode->i_blkbits - du_bits);
+	u64 du_remaining = (u64)len << (inode->i_blkbits - du_bits);
+	sector_t sector = pblk << (inode->i_blkbits - SECTOR_SHIFT);
 	struct page *pages[16]; /* write up to 16 pages at a time */
 	unsigned int nr_pages;
 	unsigned int i;
@@ -130,8 +134,8 @@ int fscrypt_zeroout_range(const struct inode *inode, pgoff_t lblk,
 							  len);
 
 	BUILD_BUG_ON(ARRAY_SIZE(pages) > BIO_MAX_VECS);
-	nr_pages = min_t(unsigned int, ARRAY_SIZE(pages),
-			 (len + blocks_per_page - 1) >> blocks_per_page_bits);
+	nr_pages = min_t(u64, ARRAY_SIZE(pages),
+			 (du_remaining + du_per_page - 1) >> du_per_page_bits);
 
 	/*
 	 * We need at least one page for ciphertext.  Allocate the first one
@@ -154,21 +158,22 @@ int fscrypt_zeroout_range(const struct inode *inode, pgoff_t lblk,
 	bio = bio_alloc(inode->i_sb->s_bdev, nr_pages, REQ_OP_WRITE, GFP_NOFS);
 
 	do {
-		bio->bi_iter.bi_sector = pblk << (blockbits - 9);
+		bio->bi_iter.bi_sector = sector;
 
 		i = 0;
 		offset = 0;
 		do {
-			err = fscrypt_crypt_block(inode, FS_ENCRYPT, lblk,
-						  ZERO_PAGE(0), pages[i],
-						  blocksize, offset, GFP_NOFS);
+			err = fscrypt_crypt_data_unit(ci, FS_ENCRYPT, du_index,
+						      ZERO_PAGE(0), pages[i],
+						      du_size, offset,
+						      GFP_NOFS);
 			if (err)
 				goto out;
-			lblk++;
-			pblk++;
-			len--;
-			offset += blocksize;
-			if (offset == PAGE_SIZE || len == 0) {
+			du_index++;
+			sector += 1U << (du_bits - SECTOR_SHIFT);
+			du_remaining--;
+			offset += du_size;
+			if (offset == PAGE_SIZE || du_remaining == 0) {
 				ret = bio_add_page(bio, pages[i++], offset, 0);
 				if (WARN_ON_ONCE(ret != offset)) {
 					err = -EIO;
@@ -176,13 +181,13 @@ int fscrypt_zeroout_range(const struct inode *inode, pgoff_t lblk,
 				}
 				offset = 0;
 			}
-		} while (i != nr_pages && len != 0);
+		} while (i != nr_pages && du_remaining != 0);
 
 		err = submit_bio_wait(bio);
 		if (err)
 			goto out;
 		bio_reset(bio, inode->i_sb->s_bdev, REQ_OP_WRITE);
-	} while (len != 0);
+	} while (du_remaining != 0);
 	err = 0;
 out:
 	bio_put(bio);
diff --git a/fs/crypto/crypto.c b/fs/crypto/crypto.c
index aed0c5ea7578..85e2f66dd663 100644
--- a/fs/crypto/crypto.c
+++ b/fs/crypto/crypto.c
@@ -77,14 +77,14 @@ void fscrypt_free_bounce_page(struct page *bounce_page)
 EXPORT_SYMBOL(fscrypt_free_bounce_page);
 
 /*
- * Generate the IV for the given logical block number within the given file.
- * For filenames encryption, lblk_num == 0.
+ * Generate the IV for the given data unit index within the given file.
+ * For filenames encryption, index == 0.
  *
  * Keep this in sync with fscrypt_limit_io_blocks().  fscrypt_limit_io_blocks()
  * needs to know about any IV generation methods where the low bits of IV don't
- * simply contain the lblk_num (e.g., IV_INO_LBLK_32).
+ * simply contain the data unit index (e.g., IV_INO_LBLK_32).
  */
-void fscrypt_generate_iv(union fscrypt_iv *iv, u64 lblk_num,
+void fscrypt_generate_iv(union fscrypt_iv *iv, u64 index,
 			 const struct fscrypt_info *ci)
 {
 	u8 flags = fscrypt_policy_flags(&ci->ci_policy);
@@ -92,29 +92,29 @@ void fscrypt_generate_iv(union fscrypt_iv *iv, u64 lblk_num,
 	memset(iv, 0, ci->ci_mode->ivsize);
 
 	if (flags & FSCRYPT_POLICY_FLAG_IV_INO_LBLK_64) {
-		WARN_ON_ONCE(lblk_num > U32_MAX);
+		WARN_ON_ONCE(index > U32_MAX);
 		WARN_ON_ONCE(ci->ci_inode->i_ino > U32_MAX);
-		lblk_num |= (u64)ci->ci_inode->i_ino << 32;
+		index |= (u64)ci->ci_inode->i_ino << 32;
 	} else if (flags & FSCRYPT_POLICY_FLAG_IV_INO_LBLK_32) {
-		WARN_ON_ONCE(lblk_num > U32_MAX);
-		lblk_num = (u32)(ci->ci_hashed_ino + lblk_num);
+		WARN_ON_ONCE(index > U32_MAX);
+		index = (u32)(ci->ci_hashed_ino + index);
 	} else if (flags & FSCRYPT_POLICY_FLAG_DIRECT_KEY) {
 		memcpy(iv->nonce, ci->ci_nonce, FSCRYPT_FILE_NONCE_SIZE);
 	}
-	iv->lblk_num = cpu_to_le64(lblk_num);
+	iv->index = cpu_to_le64(index);
 }
 
-/* Encrypt or decrypt a single filesystem block of file contents */
-int fscrypt_crypt_block(const struct inode *inode, fscrypt_direction_t rw,
-			u64 lblk_num, struct page *src_page,
-			struct page *dest_page, unsigned int len,
-			unsigned int offs, gfp_t gfp_flags)
+/* Encrypt or decrypt a single "data unit" of file contents. */
+int fscrypt_crypt_data_unit(const struct fscrypt_info *ci,
+			    fscrypt_direction_t rw, u64 index,
+			    struct page *src_page, struct page *dest_page,
+			    unsigned int len, unsigned int offs,
+			    gfp_t gfp_flags)
 {
 	union fscrypt_iv iv;
 	struct skcipher_request *req = NULL;
 	DECLARE_CRYPTO_WAIT(wait);
 	struct scatterlist dst, src;
-	struct fscrypt_info *ci = inode->i_crypt_info;
 	struct crypto_skcipher *tfm = ci->ci_enc_key.tfm;
 	int res = 0;
 
@@ -123,7 +123,7 @@ int fscrypt_crypt_block(const struct inode *inode, fscrypt_direction_t rw,
 	if (WARN_ON_ONCE(len % FSCRYPT_CONTENTS_ALIGNMENT != 0))
 		return -EINVAL;
 
-	fscrypt_generate_iv(&iv, lblk_num, ci);
+	fscrypt_generate_iv(&iv, index, ci);
 
 	req = skcipher_request_alloc(tfm, gfp_flags);
 	if (!req)
@@ -144,28 +144,29 @@ int fscrypt_crypt_block(const struct inode *inode, fscrypt_direction_t rw,
 		res = crypto_wait_req(crypto_skcipher_encrypt(req), &wait);
 	skcipher_request_free(req);
 	if (res) {
-		fscrypt_err(inode, "%scryption failed for block %llu: %d",
-			    (rw == FS_DECRYPT ? "De" : "En"), lblk_num, res);
+		fscrypt_err(ci->ci_inode,
+			    "%scryption failed for data unit %llu: %d",
+			    (rw == FS_DECRYPT ? "De" : "En"), index, res);
 		return res;
 	}
 	return 0;
 }
 
 /**
- * fscrypt_encrypt_pagecache_blocks() - Encrypt filesystem blocks from a
- *					pagecache page
- * @page:      The locked pagecache page containing the block(s) to encrypt
- * @len:       Total size of the block(s) to encrypt.  Must be a nonzero
- *		multiple of the filesystem's block size.
- * @offs:      Byte offset within @page of the first block to encrypt.  Must be
- *		a multiple of the filesystem's block size.
- * @gfp_flags: Memory allocation flags.  See details below.
+ * fscrypt_encrypt_pagecache_blocks() - Encrypt data from a pagecache page
+ * @page: the locked pagecache page containing the data to encrypt
+ * @len: size of the data to encrypt, in bytes
+ * @offs: offset within @page of the data to encrypt, in bytes
+ * @gfp_flags: memory allocation flags; see details below
  *
- * A new bounce page is allocated, and the specified block(s) are encrypted into
- * it.  In the bounce page, the ciphertext block(s) will be located at the same
- * offsets at which the plaintext block(s) were located in the source page; any
- * other parts of the bounce page will be left uninitialized.  However, normally
- * blocksize == PAGE_SIZE and the whole page is encrypted at once.
+ * This allocates a new bounce page and encrypts the given data into it.  The
+ * length and offset of the data must be aligned to the file's crypto data unit
+ * size.  Alignment to the filesystem block size fulfills this requirement, as
+ * the filesystem block size is always a multiple of the data unit size.
+ *
+ * In the bounce page, the ciphertext data will be located at the same offset at
+ * which the plaintext data was located in the source page.  Any other parts of
+ * the bounce page will be left uninitialized.
  *
  * This is for use by the filesystem's ->writepages() method.
  *
@@ -183,28 +184,29 @@ struct page *fscrypt_encrypt_pagecache_blocks(struct page *page,
 
 {
 	const struct inode *inode = page->mapping->host;
-	const unsigned int blockbits = inode->i_blkbits;
-	const unsigned int blocksize = 1 << blockbits;
+	const struct fscrypt_info *ci = inode->i_crypt_info;
+	const unsigned int du_bits = ci->ci_data_unit_bits;
+	const unsigned int du_size = 1U << du_bits;
 	struct page *ciphertext_page;
-	u64 lblk_num = ((u64)page->index << (PAGE_SHIFT - blockbits)) +
-		       (offs >> blockbits);
+	u64 index = ((u64)page->index << (PAGE_SHIFT - du_bits)) +
+		    (offs >> du_bits);
 	unsigned int i;
 	int err;
 
 	if (WARN_ON_ONCE(!PageLocked(page)))
 		return ERR_PTR(-EINVAL);
 
-	if (WARN_ON_ONCE(len <= 0 || !IS_ALIGNED(len | offs, blocksize)))
+	if (WARN_ON_ONCE(len <= 0 || !IS_ALIGNED(len | offs, du_size)))
 		return ERR_PTR(-EINVAL);
 
 	ciphertext_page = fscrypt_alloc_bounce_page(gfp_flags);
 	if (!ciphertext_page)
 		return ERR_PTR(-ENOMEM);
 
-	for (i = offs; i < offs + len; i += blocksize, lblk_num++) {
-		err = fscrypt_crypt_block(inode, FS_ENCRYPT, lblk_num,
-					  page, ciphertext_page,
-					  blocksize, i, gfp_flags);
+	for (i = offs; i < offs + len; i += du_size, index++) {
+		err = fscrypt_crypt_data_unit(ci, FS_ENCRYPT, index,
+					      page, ciphertext_page,
+					      du_size, i, gfp_flags);
 		if (err) {
 			fscrypt_free_bounce_page(ciphertext_page);
 			return ERR_PTR(err);
@@ -231,30 +233,33 @@ EXPORT_SYMBOL(fscrypt_encrypt_pagecache_blocks);
  * arbitrary page, not necessarily in the original pagecache page.  The @inode
  * and @lblk_num must be specified, as they can't be determined from @page.
  *
+ * This is not compatible with fscrypt_operations::supports_subblock_data_units.
+ *
  * Return: 0 on success; -errno on failure
  */
 int fscrypt_encrypt_block_inplace(const struct inode *inode, struct page *page,
 				  unsigned int len, unsigned int offs,
 				  u64 lblk_num, gfp_t gfp_flags)
 {
-	return fscrypt_crypt_block(inode, FS_ENCRYPT, lblk_num, page, page,
-				   len, offs, gfp_flags);
+	if (WARN_ON_ONCE(inode->i_sb->s_cop->supports_subblock_data_units))
+		return -EOPNOTSUPP;
+	return fscrypt_crypt_data_unit(inode->i_crypt_info, FS_ENCRYPT,
+				       lblk_num, page, page, len, offs,
+				       gfp_flags);
 }
 EXPORT_SYMBOL(fscrypt_encrypt_block_inplace);
 
 /**
- * fscrypt_decrypt_pagecache_blocks() - Decrypt filesystem blocks in a
- *					pagecache folio
- * @folio:     The locked pagecache folio containing the block(s) to decrypt
- * @len:       Total size of the block(s) to decrypt.  Must be a nonzero
- *		multiple of the filesystem's block size.
- * @offs:      Byte offset within @folio of the first block to decrypt.  Must be
- *		a multiple of the filesystem's block size.
+ * fscrypt_decrypt_pagecache_blocks() - Decrypt data from a pagecache folio
+ * @folio: the pagecache folio containing the data to decrypt
+ * @len: size of the data to decrypt, in bytes
+ * @offs: offset within @folio of the data to decrypt, in bytes
  *
- * The specified block(s) are decrypted in-place within the pagecache folio,
- * which must still be locked and not uptodate.
- *
- * This is for use by the filesystem's ->readahead() method.
+ * Decrypt data that has just been read from an encrypted file.  The data must
+ * be located in a pagecache folio that is still locked and not yet uptodate.
+ * The length and offset of the data must be aligned to the file's crypto data
+ * unit size.  Alignment to the filesystem block size fulfills this requirement,
+ * as the filesystem block size is always a multiple of the data unit size.
  *
  * Return: 0 on success; -errno on failure
  */
@@ -262,25 +267,26 @@ int fscrypt_decrypt_pagecache_blocks(struct folio *folio, size_t len,
 				     size_t offs)
 {
 	const struct inode *inode = folio->mapping->host;
-	const unsigned int blockbits = inode->i_blkbits;
-	const unsigned int blocksize = 1 << blockbits;
-	u64 lblk_num = ((u64)folio->index << (PAGE_SHIFT - blockbits)) +
-		       (offs >> blockbits);
+	const struct fscrypt_info *ci = inode->i_crypt_info;
+	const unsigned int du_bits = ci->ci_data_unit_bits;
+	const unsigned int du_size = 1U << du_bits;
+	u64 index = ((u64)folio->index << (PAGE_SHIFT - du_bits)) +
+		    (offs >> du_bits);
 	size_t i;
 	int err;
 
 	if (WARN_ON_ONCE(!folio_test_locked(folio)))
 		return -EINVAL;
 
-	if (WARN_ON_ONCE(len <= 0 || !IS_ALIGNED(len | offs, blocksize)))
+	if (WARN_ON_ONCE(len <= 0 || !IS_ALIGNED(len | offs, du_size)))
 		return -EINVAL;
 
-	for (i = offs; i < offs + len; i += blocksize, lblk_num++) {
+	for (i = offs; i < offs + len; i += du_size, index++) {
 		struct page *page = folio_page(folio, i >> PAGE_SHIFT);
 
-		err = fscrypt_crypt_block(inode, FS_DECRYPT, lblk_num, page,
-					  page, blocksize, i & ~PAGE_MASK,
-					  GFP_NOFS);
+		err = fscrypt_crypt_data_unit(ci, FS_DECRYPT, index, page,
+					      page, du_size, i & ~PAGE_MASK,
+					      GFP_NOFS);
 		if (err)
 			return err;
 	}
@@ -302,14 +308,19 @@ EXPORT_SYMBOL(fscrypt_decrypt_pagecache_blocks);
  * arbitrary page, not necessarily in the original pagecache page.  The @inode
  * and @lblk_num must be specified, as they can't be determined from @page.
  *
+ * This is not compatible with fscrypt_operations::supports_subblock_data_units.
+ *
  * Return: 0 on success; -errno on failure
  */
 int fscrypt_decrypt_block_inplace(const struct inode *inode, struct page *page,
 				  unsigned int len, unsigned int offs,
 				  u64 lblk_num)
 {
-	return fscrypt_crypt_block(inode, FS_DECRYPT, lblk_num, page, page,
-				   len, offs, GFP_NOFS);
+	if (WARN_ON_ONCE(inode->i_sb->s_cop->supports_subblock_data_units))
+		return -EOPNOTSUPP;
+	return fscrypt_crypt_data_unit(inode->i_crypt_info, FS_DECRYPT,
+				       lblk_num, page, page, len, offs,
+				       GFP_NOFS);
 }
 EXPORT_SYMBOL(fscrypt_decrypt_block_inplace);
 
diff --git a/fs/crypto/fscrypt_private.h b/fs/crypto/fscrypt_private.h
index 4b113214b53a..9c5e83baa3f1 100644
--- a/fs/crypto/fscrypt_private.h
+++ b/fs/crypto/fscrypt_private.h
@@ -47,7 +47,8 @@ struct fscrypt_context_v2 {
 	u8 contents_encryption_mode;
 	u8 filenames_encryption_mode;
 	u8 flags;
-	u8 __reserved[4];
+	u8 log2_data_unit_size;
+	u8 __reserved[3];
 	u8 master_key_identifier[FSCRYPT_KEY_IDENTIFIER_SIZE];
 	u8 nonce[FSCRYPT_FILE_NONCE_SIZE];
 };
@@ -165,6 +166,26 @@ fscrypt_policy_flags(const union fscrypt_policy *policy)
 	BUG();
 }
 
+static inline int
+fscrypt_policy_v2_du_bits(const struct fscrypt_policy_v2 *policy,
+			  const struct inode *inode)
+{
+	return policy->log2_data_unit_size ?: inode->i_blkbits;
+}
+
+static inline int
+fscrypt_policy_du_bits(const union fscrypt_policy *policy,
+		       const struct inode *inode)
+{
+	switch (policy->version) {
+	case FSCRYPT_POLICY_V1:
+		return inode->i_blkbits;
+	case FSCRYPT_POLICY_V2:
+		return fscrypt_policy_v2_du_bits(&policy->v2, inode);
+	}
+	BUG();
+}
+
 /*
  * For encrypted symlinks, the ciphertext length is stored at the beginning
  * of the string in little-endian format.
@@ -211,6 +232,16 @@ struct fscrypt_info {
 	bool ci_inlinecrypt;
 #endif
 
+	/*
+	 * log2 of the data unit size (granularity of contents encryption) of
+	 * this file.  This is computable from ci_policy and ci_inode but is
+	 * cached here for efficiency.  Only used for regular files.
+	 */
+	u8 ci_data_unit_bits;
+
+	/* Cached value: log2 of number of data units per FS block */
+	u8 ci_data_units_per_block_bits;
+
 	/*
 	 * Encryption mode used for this inode.  It corresponds to either the
 	 * contents or filenames encryption mode, depending on the inode type.
@@ -265,10 +296,11 @@ typedef enum {
 /* crypto.c */
 extern struct kmem_cache *fscrypt_info_cachep;
 int fscrypt_initialize(struct super_block *sb);
-int fscrypt_crypt_block(const struct inode *inode, fscrypt_direction_t rw,
-			u64 lblk_num, struct page *src_page,
-			struct page *dest_page, unsigned int len,
-			unsigned int offs, gfp_t gfp_flags);
+int fscrypt_crypt_data_unit(const struct fscrypt_info *ci,
+			    fscrypt_direction_t rw, u64 index,
+			    struct page *src_page, struct page *dest_page,
+			    unsigned int len, unsigned int offs,
+			    gfp_t gfp_flags);
 struct page *fscrypt_alloc_bounce_page(gfp_t gfp_flags);
 
 void __printf(3, 4) __cold
@@ -283,8 +315,8 @@ fscrypt_msg(const struct inode *inode, const char *level, const char *fmt, ...);
 
 union fscrypt_iv {
 	struct {
-		/* logical block number within the file */
-		__le64 lblk_num;
+		/* zero-based index of data unit within the file */
+		__le64 index;
 
 		/* per-file nonce; only set in DIRECT_KEY mode */
 		u8 nonce[FSCRYPT_FILE_NONCE_SIZE];
@@ -293,17 +325,17 @@ union fscrypt_iv {
 	__le64 dun[FSCRYPT_MAX_IV_SIZE / sizeof(__le64)];
 };
 
-void fscrypt_generate_iv(union fscrypt_iv *iv, u64 lblk_num,
+void fscrypt_generate_iv(union fscrypt_iv *iv, u64 index,
 			 const struct fscrypt_info *ci);
 
 /*
- * Return the number of bits used by the maximum file logical block number that
- * is possible on the given filesystem.
+ * Return the number of bits used by the maximum file data unit index that is
+ * possible on the given filesystem, using the given log2 data unit size.
  */
 static inline int
-fscrypt_max_file_lblk_bits(const struct super_block *sb)
+fscrypt_max_file_dun_bits(const struct super_block *sb, int du_bits)
 {
-	return fls64(sb->s_maxbytes - 1) - sb->s_blocksize_bits;
+	return fls64(sb->s_maxbytes - 1) - du_bits;
 }
 
 /* fname.c */
diff --git a/fs/crypto/inline_crypt.c b/fs/crypto/inline_crypt.c
index 7d9f6c167de5..8c6d37d6225a 100644
--- a/fs/crypto/inline_crypt.c
+++ b/fs/crypto/inline_crypt.c
@@ -43,6 +43,7 @@ static unsigned int fscrypt_get_dun_bytes(const struct fscrypt_info *ci)
 {
 	const struct super_block *sb = ci->ci_inode->i_sb;
 	unsigned int flags = fscrypt_policy_flags(&ci->ci_policy);
+	int dun_bits;
 
 	if (flags & FSCRYPT_POLICY_FLAG_DIRECT_KEY)
 		return offsetofend(union fscrypt_iv, nonce);
@@ -53,8 +54,9 @@ static unsigned int fscrypt_get_dun_bytes(const struct fscrypt_info *ci)
 	if (flags & FSCRYPT_POLICY_FLAG_IV_INO_LBLK_32)
 		return sizeof(__le32);
 
-	/* Default case: IVs are just the file logical block number */
-	return DIV_ROUND_UP(fscrypt_max_file_lblk_bits(sb), 8);
+	/* Default case: IVs are just the file data unit index */
+	dun_bits = fscrypt_max_file_dun_bits(sb, ci->ci_data_unit_bits);
+	return DIV_ROUND_UP(dun_bits, 8);
 }
 
 /*
@@ -126,7 +128,7 @@ int fscrypt_select_encryption_impl(struct fscrypt_info *ci)
 	 * crypto configuration that the file would use.
 	 */
 	crypto_cfg.crypto_mode = ci->ci_mode->blk_crypto_mode;
-	crypto_cfg.data_unit_size = sb->s_blocksize;
+	crypto_cfg.data_unit_size = 1U << ci->ci_data_unit_bits;
 	crypto_cfg.dun_bytes = fscrypt_get_dun_bytes(ci);
 
 	devs = fscrypt_get_devices(sb, &num_devs);
@@ -165,7 +167,8 @@ int fscrypt_prepare_inline_crypt_key(struct fscrypt_prepared_key *prep_key,
 		return -ENOMEM;
 
 	err = blk_crypto_init_key(blk_key, raw_key, crypto_mode,
-				  fscrypt_get_dun_bytes(ci), sb->s_blocksize);
+				  fscrypt_get_dun_bytes(ci),
+				  1U << ci->ci_data_unit_bits);
 	if (err) {
 		fscrypt_err(inode, "error %d initializing blk-crypto key", err);
 		goto fail;
@@ -232,10 +235,11 @@ EXPORT_SYMBOL_GPL(__fscrypt_inode_uses_inline_crypto);
 static void fscrypt_generate_dun(const struct fscrypt_info *ci, u64 lblk_num,
 				 u64 dun[BLK_CRYPTO_DUN_ARRAY_SIZE])
 {
+	u64 index = lblk_num << ci->ci_data_units_per_block_bits;
 	union fscrypt_iv iv;
 	int i;
 
-	fscrypt_generate_iv(&iv, lblk_num, ci);
+	fscrypt_generate_iv(&iv, index, ci);
 
 	BUILD_BUG_ON(FSCRYPT_MAX_IV_SIZE > BLK_CRYPTO_MAX_IV_SIZE);
 	memset(dun, 0, BLK_CRYPTO_MAX_IV_SIZE);
diff --git a/fs/crypto/keysetup.c b/fs/crypto/keysetup.c
index 361f41ef46c7..608599f8aa57 100644
--- a/fs/crypto/keysetup.c
+++ b/fs/crypto/keysetup.c
@@ -580,6 +580,11 @@ fscrypt_setup_encryption_info(struct inode *inode,
 	WARN_ON_ONCE(mode->ivsize > FSCRYPT_MAX_IV_SIZE);
 	crypt_info->ci_mode = mode;
 
+	crypt_info->ci_data_unit_bits =
+		fscrypt_policy_du_bits(&crypt_info->ci_policy, inode);
+	crypt_info->ci_data_units_per_block_bits =
+		inode->i_blkbits - crypt_info->ci_data_unit_bits;
+
 	res = setup_file_encryption_key(crypt_info, need_dirhash_key, &mk);
 	if (res)
 		goto out;
diff --git a/fs/crypto/policy.c b/fs/crypto/policy.c
index 32709dad9762..2fb3f6a1258e 100644
--- a/fs/crypto/policy.c
+++ b/fs/crypto/policy.c
@@ -165,10 +165,11 @@ static bool supported_iv_ino_lblk_policy(const struct fscrypt_policy_v2 *policy,
 	}
 
 	/*
-	 * IV_INO_LBLK_64 and IV_INO_LBLK_32 both require that file logical
-	 * block numbers fit in 32 bits.
+	 * IV_INO_LBLK_64 and IV_INO_LBLK_32 both require that file data unit
+	 * indices fit in 32 bits.
 	 */
-	if (fscrypt_max_file_lblk_bits(sb) > 32) {
+	if (fscrypt_max_file_dun_bits(sb,
+			fscrypt_policy_v2_du_bits(policy, inode)) > 32) {
 		fscrypt_warn(inode,
 			     "Can't use %s policy on filesystem '%s' because its maximum file size is too large",
 			     type, sb->s_id);
@@ -243,6 +244,31 @@ static bool fscrypt_supported_v2_policy(const struct fscrypt_policy_v2 *policy,
 		return false;
 	}
 
+	if (policy->log2_data_unit_size) {
+		if (!inode->i_sb->s_cop->supports_subblock_data_units) {
+			fscrypt_warn(inode,
+				     "Filesystem does not support configuring crypto data unit size");
+			return false;
+		}
+		if (policy->log2_data_unit_size > inode->i_blkbits ||
+		    policy->log2_data_unit_size < SECTOR_SHIFT /* 9 */) {
+			fscrypt_warn(inode,
+				     "Unsupported log2_data_unit_size in encryption policy: %d",
+				     policy->log2_data_unit_size);
+			return false;
+		}
+		if (policy->log2_data_unit_size != inode->i_blkbits &&
+		    (policy->flags & FSCRYPT_POLICY_FLAG_IV_INO_LBLK_32)) {
+			/*
+			 * Not safe to enable yet, as we need to ensure that DUN
+			 * wraparound can only occur on a FS block boundary.
+			 */
+			fscrypt_warn(inode,
+				     "Sub-block data units not yet supported with IV_INO_LBLK_32");
+			return false;
+		}
+	}
+
 	if ((policy->flags & FSCRYPT_POLICY_FLAG_DIRECT_KEY) &&
 	    !supported_direct_key_modes(inode, policy->contents_encryption_mode,
 					policy->filenames_encryption_mode))
@@ -329,6 +355,7 @@ static int fscrypt_new_context(union fscrypt_context *ctx_u,
 		ctx->filenames_encryption_mode =
 			policy->filenames_encryption_mode;
 		ctx->flags = policy->flags;
+		ctx->log2_data_unit_size = policy->log2_data_unit_size;
 		memcpy(ctx->master_key_identifier,
 		       policy->master_key_identifier,
 		       sizeof(ctx->master_key_identifier));
@@ -389,6 +416,7 @@ int fscrypt_policy_from_context(union fscrypt_policy *policy_u,
 		policy->filenames_encryption_mode =
 			ctx->filenames_encryption_mode;
 		policy->flags = ctx->flags;
+		policy->log2_data_unit_size = ctx->log2_data_unit_size;
 		memcpy(policy->__reserved, ctx->__reserved,
 		       sizeof(policy->__reserved));
 		memcpy(policy->master_key_identifier,
diff --git a/fs/ext4/crypto.c b/fs/ext4/crypto.c
index 9e36731701ba..7ae0b61258a7 100644
--- a/fs/ext4/crypto.c
+++ b/fs/ext4/crypto.c
@@ -235,6 +235,7 @@ static bool ext4_has_stable_inodes(struct super_block *sb)
 const struct fscrypt_operations ext4_cryptops = {
 	.needs_bounce_pages	= 1,
 	.has_32bit_inodes	= 1,
+	.supports_subblock_data_units = 1,
 	.legacy_key_prefix	= "ext4:",
 	.get_context		= ext4_get_context,
 	.set_context		= ext4_set_context,
diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c
index c44915713264..66a5bf4216b7 100644
--- a/fs/f2fs/super.c
+++ b/fs/f2fs/super.c
@@ -3226,6 +3226,7 @@ static struct block_device **f2fs_get_devices(struct super_block *sb,
 static const struct fscrypt_operations f2fs_cryptops = {
 	.needs_bounce_pages	= 1,
 	.has_32bit_inodes	= 1,
+	.supports_subblock_data_units = 1,
 	.legacy_key_prefix	= "f2fs:",
 	.get_context		= f2fs_get_context,
 	.set_context		= f2fs_set_context,
diff --git a/include/linux/fscrypt.h b/include/linux/fscrypt.h
index 09a3cacbf62a..b559e6f77707 100644
--- a/include/linux/fscrypt.h
+++ b/include/linux/fscrypt.h
@@ -85,6 +85,18 @@ struct fscrypt_operations {
 	 */
 	unsigned int has_32bit_inodes : 1;
 
+	/*
+	 * If set, then fs/crypto/ will allow users to select a crypto data unit
+	 * size that is less than the filesystem block size.  This is done via
+	 * the log2_data_unit_size field of the fscrypt policy.  This flag is
+	 * not compatible with filesystems that encrypt variable-length blocks
+	 * (i.e. blocks that aren't all equal to filesystem's block size), for
+	 * example as a result of compression.  It's also not compatible with
+	 * the fscrypt_encrypt_block_inplace() and
+	 * fscrypt_decrypt_block_inplace() functions.
+	 */
+	unsigned int supports_subblock_data_units : 1;
+
 	/*
 	 * This field exists only for backwards compatibility reasons and should
 	 * only be set by the filesystems that are setting it already.  It
diff --git a/include/uapi/linux/fscrypt.h b/include/uapi/linux/fscrypt.h
index fd1fb0d5389d..7a8f4c290187 100644
--- a/include/uapi/linux/fscrypt.h
+++ b/include/uapi/linux/fscrypt.h
@@ -71,7 +71,8 @@ struct fscrypt_policy_v2 {
 	__u8 contents_encryption_mode;
 	__u8 filenames_encryption_mode;
 	__u8 flags;
-	__u8 __reserved[4];
+	__u8 log2_data_unit_size;
+	__u8 __reserved[3];
 	__u8 master_key_identifier[FSCRYPT_KEY_IDENTIFIER_SIZE];
 };
 
-- 
cgit v1.2.3


From e850d9a52f4cd31521c80a7ea9718b69129af4d5 Mon Sep 17 00:00:00 2001
From: Coly Li <colyli@suse.de>
Date: Sat, 12 Aug 2023 01:05:07 +0800
Subject: badblocks: add more helper structure and routines in badblocks.h

This patch adds the following helper structure and routines into
badblocks.h,
- struct badblocks_context
  This structure is used in improved badblocks code for bad table
  iteration.
- BB_END()
  The macro to calculate end LBA of a bad range record from bad
  table.
- badblocks_full() and badblocks_empty()
  The inline routines to check whether bad table is full or empty.
- set_changed() and clear_changed()
  The inline routines to set and clear 'changed' tag from struct
  badblocks.

These new helper structure and routines can help to make the code more
clear, they will be used in the improved badblocks code in following
patches.

Signed-off-by: Coly Li <colyli@suse.de>
Reviewed-by: Xiao Ni <xni@redhat.com>
Cc: Dan Williams <dan.j.williams@intel.com>
Cc: Geliang Tang <geliang.tang@suse.com>
Cc: Hannes Reinecke <hare@suse.de>
Cc: Jens Axboe <axboe@kernel.dk>
Cc: NeilBrown <neilb@suse.de>
Cc: Vishal L Verma <vishal.l.verma@intel.com>
Acked-by: Geliang Tang <geliang.tang@suse.com>
Link: https://lore.kernel.org/r/20230811170513.2300-2-colyli@suse.de
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/badblocks.h | 30 ++++++++++++++++++++++++++++++
 1 file changed, 30 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/badblocks.h b/include/linux/badblocks.h
index 2426276b9bd3..670f2dae692f 100644
--- a/include/linux/badblocks.h
+++ b/include/linux/badblocks.h
@@ -15,6 +15,7 @@
 #define BB_OFFSET(x)	(((x) & BB_OFFSET_MASK) >> 9)
 #define BB_LEN(x)	(((x) & BB_LEN_MASK) + 1)
 #define BB_ACK(x)	(!!((x) & BB_ACK_MASK))
+#define BB_END(x)	(BB_OFFSET(x) + BB_LEN(x))
 #define BB_MAKE(a, l, ack) (((a)<<9) | ((l)-1) | ((u64)(!!(ack)) << 63))
 
 /* Bad block numbers are stored sorted in a single page.
@@ -41,6 +42,12 @@ struct badblocks {
 	sector_t size;		/* in sectors */
 };
 
+struct badblocks_context {
+	sector_t	start;
+	sector_t	len;
+	int		ack;
+};
+
 int badblocks_check(struct badblocks *bb, sector_t s, int sectors,
 		   sector_t *first_bad, int *bad_sectors);
 int badblocks_set(struct badblocks *bb, sector_t s, int sectors,
@@ -63,4 +70,27 @@ static inline void devm_exit_badblocks(struct device *dev, struct badblocks *bb)
 	}
 	badblocks_exit(bb);
 }
+
+static inline int badblocks_full(struct badblocks *bb)
+{
+	return (bb->count >= MAX_BADBLOCKS);
+}
+
+static inline int badblocks_empty(struct badblocks *bb)
+{
+	return (bb->count == 0);
+}
+
+static inline void set_changed(struct badblocks *bb)
+{
+	if (bb->changed != 1)
+		bb->changed = 1;
+}
+
+static inline void clear_changed(struct badblocks *bb)
+{
+	if (bb->changed != 0)
+		bb->changed = 0;
+}
+
 #endif
-- 
cgit v1.2.3


From 948f0bf5ad6ac1e5f19f6aa8e7da4a950d66b661 Mon Sep 17 00:00:00 2001
From: Patrisious Haddad <phaddad@nvidia.com>
Date: Wed, 20 Sep 2023 13:07:42 +0300
Subject: IB/mlx5: Add support for 800G_8X lane speed

Add a check for 800G_8X speed when querying PTYS and report it back
correctly when needed.

Signed-off-by: Patrisious Haddad <phaddad@nvidia.com>
Reviewed-by: Mark Zhang <markzhang@nvidia.com>
Link: https://lore.kernel.org/r/26fd0b6e1fac071c3eb779657bb3d8ba47f47c4f.1695204156.git.leon@kernel.org
Reviewed-by: Jacob Keller <jacob.e.keller@intel.com>
Signed-off-by: Leon Romanovsky <leon@kernel.org>
---
 drivers/infiniband/hw/mlx5/main.c              | 4 ++++
 drivers/net/ethernet/mellanox/mlx5/core/port.c | 1 +
 include/linux/mlx5/port.h                      | 1 +
 3 files changed, 6 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c
index aed5cdea50e6..457d5ac7ad04 100644
--- a/drivers/infiniband/hw/mlx5/main.c
+++ b/drivers/infiniband/hw/mlx5/main.c
@@ -451,6 +451,10 @@ static int translate_eth_ext_proto_oper(u32 eth_proto_oper, u16 *active_speed,
 		*active_width = IB_WIDTH_4X;
 		*active_speed = IB_SPEED_NDR;
 		break;
+	case MLX5E_PROT_MASK(MLX5E_800GAUI_8_800GBASE_CR8_KR8):
+		*active_width = IB_WIDTH_8X;
+		*active_speed = IB_SPEED_NDR;
+		break;
 	default:
 		return -EINVAL;
 	}
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/port.c b/drivers/net/ethernet/mellanox/mlx5/core/port.c
index be70d1f23a5d..43423543f34c 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/port.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/port.c
@@ -1102,6 +1102,7 @@ static const u32 mlx5e_ext_link_speed[MLX5E_EXT_LINK_MODES_NUMBER] = {
 	[MLX5E_100GAUI_1_100GBASE_CR_KR] = 100000,
 	[MLX5E_200GAUI_2_200GBASE_CR2_KR2] = 200000,
 	[MLX5E_400GAUI_4_400GBASE_CR4_KR4] = 400000,
+	[MLX5E_800GAUI_8_800GBASE_CR8_KR8] = 800000,
 };
 
 int mlx5_port_query_eth_proto(struct mlx5_core_dev *dev, u8 port, bool ext,
diff --git a/include/linux/mlx5/port.h b/include/linux/mlx5/port.h
index 98b2e1e149f9..794001ebd003 100644
--- a/include/linux/mlx5/port.h
+++ b/include/linux/mlx5/port.h
@@ -117,6 +117,7 @@ enum mlx5e_ext_link_mode {
 	MLX5E_200GAUI_2_200GBASE_CR2_KR2	= 13,
 	MLX5E_400GAUI_8				= 15,
 	MLX5E_400GAUI_4_400GBASE_CR4_KR4	= 16,
+	MLX5E_800GAUI_8_800GBASE_CR8_KR8	= 19,
 	MLX5E_EXT_LINK_MODES_NUMBER,
 };
 
-- 
cgit v1.2.3


From b28ad32442bec2f0d9cb660d7d698a1a53c13d08 Mon Sep 17 00:00:00 2001
From: Patrisious Haddad <phaddad@nvidia.com>
Date: Wed, 20 Sep 2023 13:07:43 +0300
Subject: IB/mlx5: Rename 400G_8X speed to comply to naming convention

Rename 400G_8X speed to comply to naming convention.

Signed-off-by: Patrisious Haddad <phaddad@nvidia.com>
Reviewed-by: Mark Zhang <markzhang@nvidia.com>
Link: https://lore.kernel.org/r/ac98447cac8379a43fbdb36d56e5fb2b741a97ff.1695204156.git.leon@kernel.org
Reviewed-by: Jacob Keller <jacob.e.keller@intel.com>
Signed-off-by: Leon Romanovsky <leon@kernel.org>
---
 drivers/infiniband/hw/mlx5/main.c              | 2 +-
 drivers/net/ethernet/mellanox/mlx5/core/port.c | 2 +-
 include/linux/mlx5/port.h                      | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c
index 457d5ac7ad04..428abc916509 100644
--- a/drivers/infiniband/hw/mlx5/main.c
+++ b/drivers/infiniband/hw/mlx5/main.c
@@ -443,7 +443,7 @@ static int translate_eth_ext_proto_oper(u32 eth_proto_oper, u16 *active_speed,
 		*active_width = IB_WIDTH_2X;
 		*active_speed = IB_SPEED_NDR;
 		break;
-	case MLX5E_PROT_MASK(MLX5E_400GAUI_8):
+	case MLX5E_PROT_MASK(MLX5E_400GAUI_8_400GBASE_CR8):
 		*active_width = IB_WIDTH_8X;
 		*active_speed = IB_SPEED_HDR;
 		break;
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/port.c b/drivers/net/ethernet/mellanox/mlx5/core/port.c
index 43423543f34c..7d8c732818f2 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/port.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/port.c
@@ -1098,7 +1098,7 @@ static const u32 mlx5e_ext_link_speed[MLX5E_EXT_LINK_MODES_NUMBER] = {
 	[MLX5E_CAUI_4_100GBASE_CR4_KR4] = 100000,
 	[MLX5E_100GAUI_2_100GBASE_CR2_KR2] = 100000,
 	[MLX5E_200GAUI_4_200GBASE_CR4_KR4] = 200000,
-	[MLX5E_400GAUI_8] = 400000,
+	[MLX5E_400GAUI_8_400GBASE_CR8] = 400000,
 	[MLX5E_100GAUI_1_100GBASE_CR_KR] = 100000,
 	[MLX5E_200GAUI_2_200GBASE_CR2_KR2] = 200000,
 	[MLX5E_400GAUI_4_400GBASE_CR4_KR4] = 400000,
diff --git a/include/linux/mlx5/port.h b/include/linux/mlx5/port.h
index 794001ebd003..26092c78a985 100644
--- a/include/linux/mlx5/port.h
+++ b/include/linux/mlx5/port.h
@@ -115,7 +115,7 @@ enum mlx5e_ext_link_mode {
 	MLX5E_100GAUI_1_100GBASE_CR_KR		= 11,
 	MLX5E_200GAUI_4_200GBASE_CR4_KR4	= 12,
 	MLX5E_200GAUI_2_200GBASE_CR2_KR2	= 13,
-	MLX5E_400GAUI_8				= 15,
+	MLX5E_400GAUI_8_400GBASE_CR8		= 15,
 	MLX5E_400GAUI_4_400GBASE_CR4_KR4	= 16,
 	MLX5E_800GAUI_8_800GBASE_CR8_KR8	= 19,
 	MLX5E_EXT_LINK_MODES_NUMBER,
-- 
cgit v1.2.3


From 9cf63f3a33e929f7eca36409914b8c12102b9984 Mon Sep 17 00:00:00 2001
From: Kees Cook <keescook@chromium.org>
Date: Fri, 22 Sep 2023 10:54:37 -0700
Subject: platform/surface: aggregator: Annotate struct ssam_event with
 __counted_by
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Prepare for the coming implementation by GCC and Clang of the __counted_by
attribute. Flexible array members annotated with __counted_by can have
their accesses bounds-checked at run-time checking via CONFIG_UBSAN_BOUNDS
(for array indexing) and CONFIG_FORTIFY_SOURCE (for strcpy/memcpy-family
functions).

As found with Coccinelle[1], add __counted_by for struct ssam_event.

[1] https://github.com/kees/kernel-tools/blob/trunk/coccinelle/examples/counted_by.cocci

Cc: Maximilian Luz <luzmaximilian@gmail.com>
Cc: platform-driver-x86@vger.kernel.org
Signed-off-by: Kees Cook <keescook@chromium.org>
Reviewed-by: "Gustavo A. R. Silva" <gustavoars@kernel.org>
Reviewed-by: Maximilian Luz <luzmaximilian@gmail.com>
Link: https://lore.kernel.org/r/20230922175436.work.031-kees@kernel.org
Reviewed-by: Ilpo Järvinen <ilpo.jarvinen@linux.intel.com>
Signed-off-by: Ilpo Järvinen <ilpo.jarvinen@linux.intel.com>
---
 include/linux/surface_aggregator/controller.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/surface_aggregator/controller.h b/include/linux/surface_aggregator/controller.h
index cb7980805920..5b67f0f47d80 100644
--- a/include/linux/surface_aggregator/controller.h
+++ b/include/linux/surface_aggregator/controller.h
@@ -44,7 +44,7 @@ struct ssam_event {
 	u8 command_id;
 	u8 instance_id;
 	u16 length;
-	u8 data[];
+	u8 data[] __counted_by(length);
 };
 
 /**
-- 
cgit v1.2.3


From 8d74f1da776da9b0306630b13a3025214fa44618 Mon Sep 17 00:00:00 2001
From: Raag Jadav <raag.jadav@intel.com>
Date: Fri, 22 Sep 2023 11:15:52 +0530
Subject: PM: sleep: Fix symbol export for _SIMPLE_ variants of _PM_OPS()

Currently EXPORT_*_SIMPLE_DEV_PM_OPS() use EXPORT_*_DEV_PM_OPS() set
of macros to export dev_pm_ops symbol, which export the symbol in case
CONFIG_PM=y but don't take CONFIG_PM_SLEEP into consideration.

Since _SIMPLE_ variants of _PM_OPS() do not include runtime PM handles
and are only used in case CONFIG_PM_SLEEP=y, we should not be exporting
dev_pm_ops symbol for them in case CONFIG_PM_SLEEP=n.

This can be fixed by having two distinct set of export macros for both
_RUNTIME_ and _SIMPLE_ variants of _PM_OPS(), such that the export of
dev_pm_ops symbol used in each variant depends on CONFIG_PM and
CONFIG_PM_SLEEP respectively.

Introduce _DEV_SLEEP_PM_OPS() set of export macros for _SIMPLE_ variants
of _PM_OPS(), which export dev_pm_ops symbol only in case CONFIG_PM_SLEEP=y
and discard it otherwise.

Fixes: 34e1ed189fab ("PM: Improve EXPORT_*_DEV_PM_OPS macros")
Signed-off-by: Raag Jadav <raag.jadav@intel.com>
Reviewed-by: Paul Cercueil <paul@crapouillou.net>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 include/linux/pm.h | 43 +++++++++++++++++++++++++++++--------------
 1 file changed, 29 insertions(+), 14 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/pm.h b/include/linux/pm.h
index 1400c37b29c7..629c1633bbd0 100644
--- a/include/linux/pm.h
+++ b/include/linux/pm.h
@@ -374,24 +374,39 @@ const struct dev_pm_ops name = { \
 	RUNTIME_PM_OPS(runtime_suspend_fn, runtime_resume_fn, idle_fn) \
 }
 
-#ifdef CONFIG_PM
-#define _EXPORT_DEV_PM_OPS(name, license, ns)				\
+#define _EXPORT_PM_OPS(name, license, ns)				\
 	const struct dev_pm_ops name;					\
 	__EXPORT_SYMBOL(name, license, ns);				\
 	const struct dev_pm_ops name
-#define EXPORT_PM_FN_GPL(name)		EXPORT_SYMBOL_GPL(name)
-#define EXPORT_PM_FN_NS_GPL(name, ns)	EXPORT_SYMBOL_NS_GPL(name, ns)
-#else
-#define _EXPORT_DEV_PM_OPS(name, license, ns)				\
+
+#define _DISCARD_PM_OPS(name, license, ns)				\
 	static __maybe_unused const struct dev_pm_ops __static_##name
+
+#ifdef CONFIG_PM
+#define _EXPORT_DEV_PM_OPS(name, license, ns)		_EXPORT_PM_OPS(name, license, ns)
+#define EXPORT_PM_FN_GPL(name)				EXPORT_SYMBOL_GPL(name)
+#define EXPORT_PM_FN_NS_GPL(name, ns)			EXPORT_SYMBOL_NS_GPL(name, ns)
+#else
+#define _EXPORT_DEV_PM_OPS(name, license, ns)		_DISCARD_PM_OPS(name, license, ns)
 #define EXPORT_PM_FN_GPL(name)
 #define EXPORT_PM_FN_NS_GPL(name, ns)
 #endif
 
-#define EXPORT_DEV_PM_OPS(name) _EXPORT_DEV_PM_OPS(name, "", "")
-#define EXPORT_GPL_DEV_PM_OPS(name) _EXPORT_DEV_PM_OPS(name, "GPL", "")
-#define EXPORT_NS_DEV_PM_OPS(name, ns) _EXPORT_DEV_PM_OPS(name, "", #ns)
-#define EXPORT_NS_GPL_DEV_PM_OPS(name, ns) _EXPORT_DEV_PM_OPS(name, "GPL", #ns)
+#ifdef CONFIG_PM_SLEEP
+#define _EXPORT_DEV_SLEEP_PM_OPS(name, license, ns)	_EXPORT_PM_OPS(name, license, ns)
+#else
+#define _EXPORT_DEV_SLEEP_PM_OPS(name, license, ns)	_DISCARD_PM_OPS(name, license, ns)
+#endif
+
+#define EXPORT_DEV_PM_OPS(name)				_EXPORT_DEV_PM_OPS(name, "", "")
+#define EXPORT_GPL_DEV_PM_OPS(name)			_EXPORT_DEV_PM_OPS(name, "GPL", "")
+#define EXPORT_NS_DEV_PM_OPS(name, ns)			_EXPORT_DEV_PM_OPS(name, "", #ns)
+#define EXPORT_NS_GPL_DEV_PM_OPS(name, ns)		_EXPORT_DEV_PM_OPS(name, "GPL", #ns)
+
+#define EXPORT_DEV_SLEEP_PM_OPS(name)			_EXPORT_DEV_SLEEP_PM_OPS(name, "", "")
+#define EXPORT_GPL_DEV_SLEEP_PM_OPS(name)		_EXPORT_DEV_SLEEP_PM_OPS(name, "GPL", "")
+#define EXPORT_NS_DEV_SLEEP_PM_OPS(name, ns)		_EXPORT_DEV_SLEEP_PM_OPS(name, "", #ns)
+#define EXPORT_NS_GPL_DEV_SLEEP_PM_OPS(name, ns)	_EXPORT_DEV_SLEEP_PM_OPS(name, "GPL", #ns)
 
 /*
  * Use this if you want to use the same suspend and resume callbacks for suspend
@@ -404,19 +419,19 @@ const struct dev_pm_ops name = { \
 	_DEFINE_DEV_PM_OPS(name, suspend_fn, resume_fn, NULL, NULL, NULL)
 
 #define EXPORT_SIMPLE_DEV_PM_OPS(name, suspend_fn, resume_fn) \
-	EXPORT_DEV_PM_OPS(name) = { \
+	EXPORT_DEV_SLEEP_PM_OPS(name) = { \
 		SYSTEM_SLEEP_PM_OPS(suspend_fn, resume_fn) \
 	}
 #define EXPORT_GPL_SIMPLE_DEV_PM_OPS(name, suspend_fn, resume_fn) \
-	EXPORT_GPL_DEV_PM_OPS(name) = { \
+	EXPORT_GPL_DEV_SLEEP_PM_OPS(name) = { \
 		SYSTEM_SLEEP_PM_OPS(suspend_fn, resume_fn) \
 	}
 #define EXPORT_NS_SIMPLE_DEV_PM_OPS(name, suspend_fn, resume_fn, ns)	\
-	EXPORT_NS_DEV_PM_OPS(name, ns) = { \
+	EXPORT_NS_DEV_SLEEP_PM_OPS(name, ns) = { \
 		SYSTEM_SLEEP_PM_OPS(suspend_fn, resume_fn) \
 	}
 #define EXPORT_NS_GPL_SIMPLE_DEV_PM_OPS(name, suspend_fn, resume_fn, ns)	\
-	EXPORT_NS_GPL_DEV_PM_OPS(name, ns) = { \
+	EXPORT_NS_GPL_DEV_SLEEP_PM_OPS(name, ns) = { \
 		SYSTEM_SLEEP_PM_OPS(suspend_fn, resume_fn) \
 	}
 
-- 
cgit v1.2.3


From 4d0e179a42879f7d76a5b95a2e7e7a5afa33954a Mon Sep 17 00:00:00 2001
From: Reka Norman <rekanorman@chromium.org>
Date: Fri, 25 Aug 2023 12:43:55 +1000
Subject: media: cros-ec-cec: Manage an array of ports

To support multiple CEC ports, change cros_ec_cec to contain an array of
ports, each with their own CEC adapter, etc.

For now, only create a single port and use that port everywhere, so
there is no functional change. Support for multiple ports will be added
in the following patches.

Signed-off-by: Reka Norman <rekanorman@chromium.org>
Signed-off-by: Hans Verkuil <hverkuil-cisco@xs4all.nl>
---
 drivers/media/cec/platform/cros-ec/cros-ec-cec.c | 147 +++++++++++++++++------
 include/linux/platform_data/cros_ec_commands.h   |   2 +
 2 files changed, 110 insertions(+), 39 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/media/cec/platform/cros-ec/cros-ec-cec.c b/drivers/media/cec/platform/cros-ec/cros-ec-cec.c
index 8dd95fb38546..d76a25ae0cf1 100644
--- a/drivers/media/cec/platform/cros-ec/cros-ec-cec.c
+++ b/drivers/media/cec/platform/cros-ec/cros-ec-cec.c
@@ -21,21 +21,40 @@
 
 #define DRV_NAME	"cros-ec-cec"
 
+/* Only one port is supported for now */
+#define CEC_NUM_PORTS	1
+#define CEC_PORT	0
+
 /**
- * struct cros_ec_cec - Driver data for EC CEC
+ * struct cros_ec_cec_port - Driver data for a single EC CEC port
  *
- * @cros_ec: Pointer to EC device
- * @notifier: Notifier info for responding to EC events
+ * @port_num: port number
  * @adap: CEC adapter
  * @notify: CEC notifier pointer
  * @rx_msg: storage for a received message
+ * @cros_ec_cec: pointer to the parent struct
  */
-struct cros_ec_cec {
-	struct cros_ec_device *cros_ec;
-	struct notifier_block notifier;
+struct cros_ec_cec_port {
+	int port_num;
 	struct cec_adapter *adap;
 	struct cec_notifier *notify;
 	struct cec_msg rx_msg;
+	struct cros_ec_cec *cros_ec_cec;
+};
+
+/**
+ * struct cros_ec_cec - Driver data for EC CEC
+ *
+ * @cros_ec: Pointer to EC device
+ * @notifier: Notifier info for responding to EC events
+ * @num_ports: Number of CEC ports
+ * @ports: Array of ports
+ */
+struct cros_ec_cec {
+	struct cros_ec_device *cros_ec;
+	struct notifier_block notifier;
+	int num_ports;
+	struct cros_ec_cec_port *ports[EC_CEC_MAX_PORTS];
 };
 
 static void handle_cec_message(struct cros_ec_cec *cros_ec_cec)
@@ -43,27 +62,28 @@ static void handle_cec_message(struct cros_ec_cec *cros_ec_cec)
 	struct cros_ec_device *cros_ec = cros_ec_cec->cros_ec;
 	uint8_t *cec_message = cros_ec->event_data.data.cec_message;
 	unsigned int len = cros_ec->event_size;
+	struct cros_ec_cec_port *port = cros_ec_cec->ports[CEC_PORT];
 
 	if (len > CEC_MAX_MSG_SIZE)
 		len = CEC_MAX_MSG_SIZE;
-	cros_ec_cec->rx_msg.len = len;
-	memcpy(cros_ec_cec->rx_msg.msg, cec_message, len);
+	port->rx_msg.len = len;
+	memcpy(port->rx_msg.msg, cec_message, len);
 
-	cec_received_msg(cros_ec_cec->adap, &cros_ec_cec->rx_msg);
+	cec_received_msg(port->adap, &port->rx_msg);
 }
 
 static void handle_cec_event(struct cros_ec_cec *cros_ec_cec)
 {
 	struct cros_ec_device *cros_ec = cros_ec_cec->cros_ec;
 	uint32_t events = cros_ec->event_data.data.cec_events;
+	struct cros_ec_cec_port *port = cros_ec_cec->ports[CEC_PORT];
 
 	if (events & EC_MKBP_CEC_SEND_OK)
-		cec_transmit_attempt_done(cros_ec_cec->adap,
-					  CEC_TX_STATUS_OK);
+		cec_transmit_attempt_done(port->adap, CEC_TX_STATUS_OK);
 
 	/* FW takes care of all retries, tell core to avoid more retries */
 	if (events & EC_MKBP_CEC_SEND_FAILED)
-		cec_transmit_attempt_done(cros_ec_cec->adap,
+		cec_transmit_attempt_done(port->adap,
 					  CEC_TX_STATUS_MAX_RETRIES |
 					  CEC_TX_STATUS_NACK);
 }
@@ -93,7 +113,8 @@ static int cros_ec_cec_event(struct notifier_block *nb,
 
 static int cros_ec_cec_set_log_addr(struct cec_adapter *adap, u8 logical_addr)
 {
-	struct cros_ec_cec *cros_ec_cec = adap->priv;
+	struct cros_ec_cec_port *port = adap->priv;
+	struct cros_ec_cec *cros_ec_cec = port->cros_ec_cec;
 	struct cros_ec_device *cros_ec = cros_ec_cec->cros_ec;
 	struct ec_params_cec_set params = {
 		.cmd = CEC_CMD_LOGICAL_ADDRESS,
@@ -115,7 +136,8 @@ static int cros_ec_cec_set_log_addr(struct cec_adapter *adap, u8 logical_addr)
 static int cros_ec_cec_transmit(struct cec_adapter *adap, u8 attempts,
 				u32 signal_free_time, struct cec_msg *cec_msg)
 {
-	struct cros_ec_cec *cros_ec_cec = adap->priv;
+	struct cros_ec_cec_port *port = adap->priv;
+	struct cros_ec_cec *cros_ec_cec = port->cros_ec_cec;
 	struct cros_ec_device *cros_ec = cros_ec_cec->cros_ec;
 	struct ec_params_cec_write params;
 	int ret;
@@ -135,7 +157,8 @@ static int cros_ec_cec_transmit(struct cec_adapter *adap, u8 attempts,
 
 static int cros_ec_cec_adap_enable(struct cec_adapter *adap, bool enable)
 {
-	struct cros_ec_cec *cros_ec_cec = adap->priv;
+	struct cros_ec_cec_port *port = adap->priv;
+	struct cros_ec_cec *cros_ec_cec = port->cros_ec_cec;
 	struct cros_ec_device *cros_ec = cros_ec_cec->cros_ec;
 	struct ec_params_cec_set params = {
 		.cmd = CEC_CMD_ENABLE,
@@ -260,11 +283,55 @@ static struct device *cros_ec_cec_find_hdmi_dev(struct device *dev,
 
 #endif
 
+static int cros_ec_cec_init_port(struct device *dev,
+				 struct cros_ec_cec *cros_ec_cec,
+				 int port_num, struct device *hdmi_dev,
+				 const char *conn)
+{
+	struct cros_ec_cec_port *port;
+	int ret;
+
+	port = devm_kzalloc(dev, sizeof(*port), GFP_KERNEL);
+	if (!port)
+		return -ENOMEM;
+
+	port->cros_ec_cec = cros_ec_cec;
+	port->port_num = port_num;
+
+	port->adap = cec_allocate_adapter(&cros_ec_cec_ops, port, DRV_NAME,
+					  CEC_CAP_DEFAULTS |
+					  CEC_CAP_CONNECTOR_INFO, 1);
+	if (IS_ERR(port->adap))
+		return PTR_ERR(port->adap);
+
+	port->notify = cec_notifier_cec_adap_register(hdmi_dev, conn,
+						      port->adap);
+	if (!port->notify) {
+		ret = -ENOMEM;
+		goto out_probe_adapter;
+	}
+
+	ret = cec_register_adapter(port->adap, dev);
+	if (ret < 0)
+		goto out_probe_notify;
+
+	cros_ec_cec->ports[port_num] = port;
+
+	return 0;
+
+out_probe_notify:
+	cec_notifier_cec_adap_unregister(port->notify, port->adap);
+out_probe_adapter:
+	cec_delete_adapter(port->adap);
+	return ret;
+}
+
 static int cros_ec_cec_probe(struct platform_device *pdev)
 {
 	struct cros_ec_dev *ec_dev = dev_get_drvdata(pdev->dev.parent);
 	struct cros_ec_device *cros_ec = ec_dev->ec_dev;
 	struct cros_ec_cec *cros_ec_cec;
+	struct cros_ec_cec_port *port;
 	struct device *hdmi_dev;
 	const char *conn = NULL;
 	int ret;
@@ -283,18 +350,13 @@ static int cros_ec_cec_probe(struct platform_device *pdev)
 
 	device_init_wakeup(&pdev->dev, 1);
 
-	cros_ec_cec->adap = cec_allocate_adapter(&cros_ec_cec_ops, cros_ec_cec,
-						 DRV_NAME,
-						 CEC_CAP_DEFAULTS |
-						 CEC_CAP_CONNECTOR_INFO, 1);
-	if (IS_ERR(cros_ec_cec->adap))
-		return PTR_ERR(cros_ec_cec->adap);
+	cros_ec_cec->num_ports = CEC_NUM_PORTS;
 
-	cros_ec_cec->notify = cec_notifier_cec_adap_register(hdmi_dev, conn,
-							     cros_ec_cec->adap);
-	if (!cros_ec_cec->notify) {
-		ret = -ENOMEM;
-		goto out_probe_adapter;
+	for (int i = 0; i < cros_ec_cec->num_ports; i++) {
+		ret = cros_ec_cec_init_port(&pdev->dev, cros_ec_cec, i,
+					    hdmi_dev, conn);
+		if (ret)
+			goto unregister_ports;
 	}
 
 	/* Get CEC events from the EC. */
@@ -303,20 +365,24 @@ static int cros_ec_cec_probe(struct platform_device *pdev)
 					       &cros_ec_cec->notifier);
 	if (ret) {
 		dev_err(&pdev->dev, "failed to register notifier\n");
-		goto out_probe_notify;
+		goto unregister_ports;
 	}
 
-	ret = cec_register_adapter(cros_ec_cec->adap, &pdev->dev);
-	if (ret < 0)
-		goto out_probe_notify;
-
 	return 0;
 
-out_probe_notify:
-	cec_notifier_cec_adap_unregister(cros_ec_cec->notify,
-					 cros_ec_cec->adap);
-out_probe_adapter:
-	cec_delete_adapter(cros_ec_cec->adap);
+unregister_ports:
+	/*
+	 * Unregister any adapters which have been registered. We don't add the
+	 * port to the array until the adapter has been registered successfully,
+	 * so any non-NULL ports must have been registered.
+	 */
+	for (int i = 0; i < cros_ec_cec->num_ports; i++) {
+		port = cros_ec_cec->ports[i];
+		if (!port)
+			break;
+		cec_notifier_cec_adap_unregister(port->notify, port->adap);
+		cec_unregister_adapter(port->adap);
+	}
 	return ret;
 }
 
@@ -324,6 +390,7 @@ static void cros_ec_cec_remove(struct platform_device *pdev)
 {
 	struct cros_ec_cec *cros_ec_cec = platform_get_drvdata(pdev);
 	struct device *dev = &pdev->dev;
+	struct cros_ec_cec_port *port;
 	int ret;
 
 	/*
@@ -337,9 +404,11 @@ static void cros_ec_cec_remove(struct platform_device *pdev)
 	if (ret)
 		dev_err(dev, "failed to unregister notifier\n");
 
-	cec_notifier_cec_adap_unregister(cros_ec_cec->notify,
-					 cros_ec_cec->adap);
-	cec_unregister_adapter(cros_ec_cec->adap);
+	for (int i = 0; i < cros_ec_cec->num_ports; i++) {
+		port = cros_ec_cec->ports[i];
+		cec_notifier_cec_adap_unregister(port->notify, port->adap);
+		cec_unregister_adapter(port->adap);
+	}
 }
 
 static struct platform_driver cros_ec_cec_driver = {
diff --git a/include/linux/platform_data/cros_ec_commands.h b/include/linux/platform_data/cros_ec_commands.h
index ab721cf13a98..cb2ddd10a613 100644
--- a/include/linux/platform_data/cros_ec_commands.h
+++ b/include/linux/platform_data/cros_ec_commands.h
@@ -4436,6 +4436,8 @@ struct ec_response_i2c_passthru_protect {
  * These commands are for sending and receiving message via HDMI CEC
  */
 
+#define EC_CEC_MAX_PORTS 16
+
 #define MAX_CEC_MSG_LEN 16
 
 /* CEC message from the AP to be written on the CEC bus */
-- 
cgit v1.2.3


From e90bd1fe7cda1aa267fe683e392b4433ec2dc0d3 Mon Sep 17 00:00:00 2001
From: Reka Norman <rekanorman@chromium.org>
Date: Fri, 25 Aug 2023 12:43:56 +1000
Subject: media: cros-ec-cec: Support multiple ports in set/get host commands

Reuse the top four bits of the cmd field to specify the port number.
The reason for doing this as opposed to adding a separate uint8_t field
is it avoids the need to add new versions of these commands. The change
is backwards compatible since these bits were previously always zero, so
the default behaviour is to always operate on port 0.

Signed-off-by: Reka Norman <rekanorman@chromium.org>
Signed-off-by: Hans Verkuil <hverkuil-cisco@xs4all.nl>
---
 drivers/media/cec/platform/cros-ec/cros-ec-cec.c | 2 ++
 include/linux/platform_data/cros_ec_commands.h   | 8 ++++++--
 2 files changed, 8 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/media/cec/platform/cros-ec/cros-ec-cec.c b/drivers/media/cec/platform/cros-ec/cros-ec-cec.c
index d76a25ae0cf1..e969031e1e0e 100644
--- a/drivers/media/cec/platform/cros-ec/cros-ec-cec.c
+++ b/drivers/media/cec/platform/cros-ec/cros-ec-cec.c
@@ -118,6 +118,7 @@ static int cros_ec_cec_set_log_addr(struct cec_adapter *adap, u8 logical_addr)
 	struct cros_ec_device *cros_ec = cros_ec_cec->cros_ec;
 	struct ec_params_cec_set params = {
 		.cmd = CEC_CMD_LOGICAL_ADDRESS,
+		.port = port->port_num,
 		.val = logical_addr,
 	};
 	int ret;
@@ -162,6 +163,7 @@ static int cros_ec_cec_adap_enable(struct cec_adapter *adap, bool enable)
 	struct cros_ec_device *cros_ec = cros_ec_cec->cros_ec;
 	struct ec_params_cec_set params = {
 		.cmd = CEC_CMD_ENABLE,
+		.port = port->port_num,
 		.val = enable,
 	};
 	int ret;
diff --git a/include/linux/platform_data/cros_ec_commands.h b/include/linux/platform_data/cros_ec_commands.h
index cb2ddd10a613..e8bb05db360f 100644
--- a/include/linux/platform_data/cros_ec_commands.h
+++ b/include/linux/platform_data/cros_ec_commands.h
@@ -4457,13 +4457,15 @@ struct ec_params_cec_write {
 /**
  * struct ec_params_cec_set - CEC parameters set
  * @cmd: parameter type, can be CEC_CMD_ENABLE or CEC_CMD_LOGICAL_ADDRESS
+ * @port: CEC port to set the parameter on
  * @val: in case cmd is CEC_CMD_ENABLE, this field can be 0 to disable CEC
  *	or 1 to enable CEC functionality, in case cmd is
  *	CEC_CMD_LOGICAL_ADDRESS, this field encodes the requested logical
  *	address between 0 and 15 or 0xff to unregister
  */
 struct ec_params_cec_set {
-	uint8_t cmd; /* enum cec_command */
+	uint8_t cmd : 4; /* enum cec_command */
+	uint8_t port : 4;
 	uint8_t val;
 } __ec_align1;
 
@@ -4473,9 +4475,11 @@ struct ec_params_cec_set {
 /**
  * struct ec_params_cec_get - CEC parameters get
  * @cmd: parameter type, can be CEC_CMD_ENABLE or CEC_CMD_LOGICAL_ADDRESS
+ * @port: CEC port to get the parameter on
  */
 struct ec_params_cec_get {
-	uint8_t cmd; /* enum cec_command */
+	uint8_t cmd : 4; /* enum cec_command */
+	uint8_t port : 4;
 } __ec_align1;
 
 /**
-- 
cgit v1.2.3


From adbfc747ddfb48c06d238640e16939916b7a4494 Mon Sep 17 00:00:00 2001
From: Reka Norman <rekanorman@chromium.org>
Date: Fri, 25 Aug 2023 12:43:57 +1000
Subject: media: cros-ec-cec: Support multiple ports in write command

Add a v1 of the CEC write command which contains a port parameter. Check
which versions of the write command the EC supports and use the highest
supported version. If it only supports v0, check that there is only one
port. With v0, the EC will assume all write commands are for port 0.

Signed-off-by: Reka Norman <rekanorman@chromium.org>
Signed-off-by: Hans Verkuil <hverkuil-cisco@xs4all.nl>
---
 drivers/media/cec/platform/cros-ec/cros-ec-cec.c | 54 ++++++++++++++++++++++--
 include/linux/platform_data/cros_ec_commands.h   | 12 ++++++
 2 files changed, 63 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/media/cec/platform/cros-ec/cros-ec-cec.c b/drivers/media/cec/platform/cros-ec/cros-ec-cec.c
index e969031e1e0e..d674a432dfdd 100644
--- a/drivers/media/cec/platform/cros-ec/cros-ec-cec.c
+++ b/drivers/media/cec/platform/cros-ec/cros-ec-cec.c
@@ -47,12 +47,14 @@ struct cros_ec_cec_port {
  *
  * @cros_ec: Pointer to EC device
  * @notifier: Notifier info for responding to EC events
+ * @write_cmd_version: Highest supported version of EC_CMD_CEC_WRITE_MSG.
  * @num_ports: Number of CEC ports
  * @ports: Array of ports
  */
 struct cros_ec_cec {
 	struct cros_ec_device *cros_ec;
 	struct notifier_block notifier;
+	int write_cmd_version;
 	int num_ports;
 	struct cros_ec_cec_port *ports[EC_CEC_MAX_PORTS];
 };
@@ -141,12 +143,22 @@ static int cros_ec_cec_transmit(struct cec_adapter *adap, u8 attempts,
 	struct cros_ec_cec *cros_ec_cec = port->cros_ec_cec;
 	struct cros_ec_device *cros_ec = cros_ec_cec->cros_ec;
 	struct ec_params_cec_write params;
+	struct ec_params_cec_write_v1 params_v1;
 	int ret;
 
-	memcpy(params.msg, cec_msg->msg, cec_msg->len);
+	if (cros_ec_cec->write_cmd_version == 0) {
+		memcpy(params.msg, cec_msg->msg, cec_msg->len);
+		ret = cros_ec_cmd(cros_ec, 0, EC_CMD_CEC_WRITE_MSG, &params,
+				  cec_msg->len, NULL, 0);
+	} else {
+		params_v1.port = port->port_num;
+		params_v1.msg_len = cec_msg->len;
+		memcpy(params_v1.msg, cec_msg->msg, cec_msg->len);
+		ret = cros_ec_cmd(cros_ec, cros_ec_cec->write_cmd_version,
+				  EC_CMD_CEC_WRITE_MSG, &params_v1,
+				  sizeof(params_v1), NULL, 0);
+	}
 
-	ret = cros_ec_cmd(cros_ec, 0, EC_CMD_CEC_WRITE_MSG, &params,
-			  cec_msg->len, NULL, 0);
 	if (ret < 0) {
 		dev_err(cros_ec->dev,
 			"error writing CEC msg on EC: %d\n", ret);
@@ -285,6 +297,38 @@ static struct device *cros_ec_cec_find_hdmi_dev(struct device *dev,
 
 #endif
 
+static int cros_ec_cec_get_write_cmd_version(struct cros_ec_cec *cros_ec_cec)
+{
+	struct cros_ec_device *cros_ec = cros_ec_cec->cros_ec;
+	struct ec_params_get_cmd_versions_v1 params = {
+		.cmd = EC_CMD_CEC_WRITE_MSG,
+	};
+	struct ec_response_get_cmd_versions response;
+	int ret;
+
+	ret = cros_ec_cmd(cros_ec, 1, EC_CMD_GET_CMD_VERSIONS, &params,
+			  sizeof(params), &response, sizeof(response));
+	if (ret < 0) {
+		dev_err(cros_ec->dev,
+			"error getting CEC write command version: %d\n", ret);
+		return ret;
+	}
+
+	if (response.version_mask & EC_VER_MASK(1)) {
+		cros_ec_cec->write_cmd_version = 1;
+	} else {
+		if (cros_ec_cec->num_ports != 1) {
+			dev_err(cros_ec->dev,
+				"v0 write command only supports 1 port, %d reported\n",
+				cros_ec_cec->num_ports);
+			return -EINVAL;
+		}
+		cros_ec_cec->write_cmd_version = 0;
+	}
+
+	return 0;
+}
+
 static int cros_ec_cec_init_port(struct device *dev,
 				 struct cros_ec_cec *cros_ec_cec,
 				 int port_num, struct device *hdmi_dev,
@@ -354,6 +398,10 @@ static int cros_ec_cec_probe(struct platform_device *pdev)
 
 	cros_ec_cec->num_ports = CEC_NUM_PORTS;
 
+	ret = cros_ec_cec_get_write_cmd_version(cros_ec_cec);
+	if (ret)
+		return ret;
+
 	for (int i = 0; i < cros_ec_cec->num_ports; i++) {
 		ret = cros_ec_cec_init_port(&pdev->dev, cros_ec_cec, i,
 					    hdmi_dev, conn);
diff --git a/include/linux/platform_data/cros_ec_commands.h b/include/linux/platform_data/cros_ec_commands.h
index e8bb05db360f..9a0c6e28f370 100644
--- a/include/linux/platform_data/cros_ec_commands.h
+++ b/include/linux/platform_data/cros_ec_commands.h
@@ -4451,6 +4451,18 @@ struct ec_params_cec_write {
 	uint8_t msg[MAX_CEC_MSG_LEN];
 } __ec_align1;
 
+/**
+ * struct ec_params_cec_write_v1 - Message to write to the CEC bus
+ * @port: CEC port to write the message on
+ * @msg_len: length of msg in bytes
+ * @msg: message content to write to the CEC bus
+ */
+struct ec_params_cec_write_v1 {
+	uint8_t port;
+	uint8_t msg_len;
+	uint8_t msg[MAX_CEC_MSG_LEN];
+} __ec_align1;
+
 /* Set various CEC parameters */
 #define EC_CMD_CEC_SET 0x00BA
 
-- 
cgit v1.2.3


From 1cabf52639d16428bc0d61028dcaf38e29c5f3b5 Mon Sep 17 00:00:00 2001
From: Reka Norman <rekanorman@chromium.org>
Date: Fri, 25 Aug 2023 12:43:58 +1000
Subject: media: cros-ec-cec: Support multiple ports in MKBP cec_events

Use the top four bits of the cec_events MKBP event to store the port
number.

Signed-off-by: Reka Norman <rekanorman@chromium.org>
Signed-off-by: Hans Verkuil <hverkuil-cisco@xs4all.nl>
---
 drivers/media/cec/platform/cros-ec/cros-ec-cec.c | 13 +++++++++++--
 include/linux/platform_data/cros_ec_commands.h   | 10 ++++++++++
 2 files changed, 21 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/media/cec/platform/cros-ec/cros-ec-cec.c b/drivers/media/cec/platform/cros-ec/cros-ec-cec.c
index d674a432dfdd..18f78b7e034a 100644
--- a/drivers/media/cec/platform/cros-ec/cros-ec-cec.c
+++ b/drivers/media/cec/platform/cros-ec/cros-ec-cec.c
@@ -77,8 +77,17 @@ static void handle_cec_message(struct cros_ec_cec *cros_ec_cec)
 static void handle_cec_event(struct cros_ec_cec *cros_ec_cec)
 {
 	struct cros_ec_device *cros_ec = cros_ec_cec->cros_ec;
-	uint32_t events = cros_ec->event_data.data.cec_events;
-	struct cros_ec_cec_port *port = cros_ec_cec->ports[CEC_PORT];
+	uint32_t cec_events = cros_ec->event_data.data.cec_events;
+	uint32_t port_num = EC_MKBP_EVENT_CEC_GET_PORT(cec_events);
+	uint32_t events = EC_MKBP_EVENT_CEC_GET_EVENTS(cec_events);
+	struct cros_ec_cec_port *port;
+
+	if (port_num >= cros_ec_cec->num_ports) {
+		dev_err(cros_ec->dev,
+			"received CEC event for invalid port %d\n", port_num);
+		return;
+	}
+	port = cros_ec_cec->ports[port_num];
 
 	if (events & EC_MKBP_CEC_SEND_OK)
 		cec_transmit_attempt_done(port->adap, CEC_TX_STATUS_OK);
diff --git a/include/linux/platform_data/cros_ec_commands.h b/include/linux/platform_data/cros_ec_commands.h
index 9a0c6e28f370..b7e8573a8a49 100644
--- a/include/linux/platform_data/cros_ec_commands.h
+++ b/include/linux/platform_data/cros_ec_commands.h
@@ -4440,6 +4440,16 @@ struct ec_response_i2c_passthru_protect {
 
 #define MAX_CEC_MSG_LEN 16
 
+/*
+ * Helper macros for packing/unpacking cec_events.
+ * bits[27:0] : bitmask of events from enum mkbp_cec_event
+ * bits[31:28]: port number
+ */
+#define EC_MKBP_EVENT_CEC_PACK(events, port) \
+		(((events) & GENMASK(27, 0)) | (((port) & 0xf) << 28))
+#define EC_MKBP_EVENT_CEC_GET_EVENTS(event) ((event) & GENMASK(27, 0))
+#define EC_MKBP_EVENT_CEC_GET_PORT(event) (((event) >> 28) & 0xf)
+
 /* CEC message from the AP to be written on the CEC bus */
 #define EC_CMD_CEC_WRITE_MSG 0x00B8
 
-- 
cgit v1.2.3


From 425d20518c54bc6d66d733fb117a9a4046932d50 Mon Sep 17 00:00:00 2001
From: Reka Norman <rekanorman@chromium.org>
Date: Fri, 25 Aug 2023 12:43:59 +1000
Subject: media: cros-ec-cec: Support receiving messages from multiple ports

Currently, received messages are sent from the EC in the cec_message
MKBP event. Since the size of ec_response_get_next_data_v1 is 16 bytes,
which is also the maximum size of a CEC message, there is no space to
add a port parameter. Increasing the size of
ec_response_get_next_data_v1 is an option, but this would increase
EC-kernel traffic for all MKBP event types.

Instead, use an event to notify that data is ready, and add a new read
command to read the data.

For backwards compatibility with old EC firmware, continue to handle
cec_message events as well.

Signed-off-by: Reka Norman <rekanorman@chromium.org>
Signed-off-by: Hans Verkuil <hverkuil-cisco@xs4all.nl>
---
 drivers/media/cec/platform/cros-ec/cros-ec-cec.c | 59 +++++++++++++++++++++---
 include/linux/platform_data/cros_ec_commands.h   | 23 +++++++++
 2 files changed, 76 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/media/cec/platform/cros-ec/cros-ec-cec.c b/drivers/media/cec/platform/cros-ec/cros-ec-cec.c
index 18f78b7e034a..6989e63c05be 100644
--- a/drivers/media/cec/platform/cros-ec/cros-ec-cec.c
+++ b/drivers/media/cec/platform/cros-ec/cros-ec-cec.c
@@ -59,19 +59,63 @@ struct cros_ec_cec {
 	struct cros_ec_cec_port *ports[EC_CEC_MAX_PORTS];
 };
 
+static void cros_ec_cec_received_message(struct cros_ec_cec_port *port,
+					 uint8_t *msg, uint8_t len)
+{
+	if (len > CEC_MAX_MSG_SIZE)
+		len = CEC_MAX_MSG_SIZE;
+
+	port->rx_msg.len = len;
+	memcpy(port->rx_msg.msg, msg, len);
+
+	cec_received_msg(port->adap, &port->rx_msg);
+}
+
 static void handle_cec_message(struct cros_ec_cec *cros_ec_cec)
 {
 	struct cros_ec_device *cros_ec = cros_ec_cec->cros_ec;
 	uint8_t *cec_message = cros_ec->event_data.data.cec_message;
 	unsigned int len = cros_ec->event_size;
-	struct cros_ec_cec_port *port = cros_ec_cec->ports[CEC_PORT];
+	struct cros_ec_cec_port *port;
+	/*
+	 * There are two ways of receiving CEC messages:
+	 * 1. Old EC firmware which only supports one port sends the data in a
+	 *    cec_message MKBP event.
+	 * 2. New EC firmware which supports multiple ports uses
+	 *    EC_MKBP_CEC_HAVE_DATA to notify that data is ready and
+	 *    EC_CMD_CEC_READ_MSG to read it.
+	 * Check that the EC only has one CEC port, and then we can assume the
+	 * message is from port 0.
+	 */
+	if (cros_ec_cec->num_ports != 1) {
+		dev_err(cros_ec->dev,
+			"received cec_message on device with %d ports\n",
+			cros_ec_cec->num_ports);
+		return;
+	}
+	port = cros_ec_cec->ports[0];
 
-	if (len > CEC_MAX_MSG_SIZE)
-		len = CEC_MAX_MSG_SIZE;
-	port->rx_msg.len = len;
-	memcpy(port->rx_msg.msg, cec_message, len);
+	cros_ec_cec_received_message(port, cec_message, len);
+}
 
-	cec_received_msg(port->adap, &port->rx_msg);
+static void cros_ec_cec_read_message(struct cros_ec_cec_port *port)
+{
+	struct cros_ec_device *cros_ec = port->cros_ec_cec->cros_ec;
+	struct ec_params_cec_read params = {
+		.port = port->port_num,
+	};
+	struct ec_response_cec_read response;
+	int ret;
+
+	ret = cros_ec_cmd(cros_ec, 0, EC_CMD_CEC_READ_MSG, &params,
+			  sizeof(params), &response, sizeof(response));
+	if (ret < 0) {
+		dev_err(cros_ec->dev,
+			"error reading CEC message on EC: %d\n", ret);
+		return;
+	}
+
+	cros_ec_cec_received_message(port, response.msg, response.msg_len);
 }
 
 static void handle_cec_event(struct cros_ec_cec *cros_ec_cec)
@@ -97,6 +141,9 @@ static void handle_cec_event(struct cros_ec_cec *cros_ec_cec)
 		cec_transmit_attempt_done(port->adap,
 					  CEC_TX_STATUS_MAX_RETRIES |
 					  CEC_TX_STATUS_NACK);
+
+	if (events & EC_MKBP_CEC_HAVE_DATA)
+		cros_ec_cec_read_message(port);
 }
 
 static int cros_ec_cec_event(struct notifier_block *nb,
diff --git a/include/linux/platform_data/cros_ec_commands.h b/include/linux/platform_data/cros_ec_commands.h
index b7e8573a8a49..ad61c7ff0b28 100644
--- a/include/linux/platform_data/cros_ec_commands.h
+++ b/include/linux/platform_data/cros_ec_commands.h
@@ -4473,6 +4473,27 @@ struct ec_params_cec_write_v1 {
 	uint8_t msg[MAX_CEC_MSG_LEN];
 } __ec_align1;
 
+/* CEC message read from a CEC bus reported back to the AP */
+#define EC_CMD_CEC_READ_MSG 0x00B9
+
+/**
+ * struct ec_params_cec_read - Read a message from the CEC bus
+ * @port: CEC port to read a message on
+ */
+struct ec_params_cec_read {
+	uint8_t port;
+} __ec_align1;
+
+/**
+ * struct ec_response_cec_read - Message read from the CEC bus
+ * @msg_len: length of msg in bytes
+ * @msg: message content read from the CEC bus
+ */
+struct ec_response_cec_read {
+	uint8_t msg_len;
+	uint8_t msg[MAX_CEC_MSG_LEN];
+} __ec_align1;
+
 /* Set various CEC parameters */
 #define EC_CMD_CEC_SET 0x00BA
 
@@ -4529,6 +4550,8 @@ enum mkbp_cec_event {
 	EC_MKBP_CEC_SEND_OK			= BIT(0),
 	/* Outgoing message was not acknowledged */
 	EC_MKBP_CEC_SEND_FAILED			= BIT(1),
+	/* Incoming message can be read out by AP */
+	EC_MKBP_CEC_HAVE_DATA			= BIT(2),
 };
 
 /*****************************************************************************/
-- 
cgit v1.2.3


From 5d227f02ceb9cc120cf04efbd77e12da182a5f62 Mon Sep 17 00:00:00 2001
From: Reka Norman <rekanorman@chromium.org>
Date: Fri, 25 Aug 2023 12:44:01 +1000
Subject: media: cros-ec-cec: Get number of CEC ports from EC

Add a new CEC port count host command and use it to query the number of
CEC ports from the EC. If the host command is not supported then it must
be old EC firmware which only supports one port, so fall back to
assuming one port.

This patch completes support for multiple ports in cros-ec-cec.

Signed-off-by: Reka Norman <rekanorman@chromium.org>
Signed-off-by: Hans Verkuil <hverkuil-cisco@xs4all.nl>
---
 drivers/media/cec/platform/cros-ec/cros-ec-cec.c | 40 +++++++++++++++++++++---
 include/linux/platform_data/cros_ec_commands.h   | 11 +++++++
 2 files changed, 46 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/media/cec/platform/cros-ec/cros-ec-cec.c b/drivers/media/cec/platform/cros-ec/cros-ec-cec.c
index 371699d599de..993deb85d3e2 100644
--- a/drivers/media/cec/platform/cros-ec/cros-ec-cec.c
+++ b/drivers/media/cec/platform/cros-ec/cros-ec-cec.c
@@ -21,10 +21,6 @@
 
 #define DRV_NAME	"cros-ec-cec"
 
-/* Only one port is supported for now */
-#define CEC_NUM_PORTS	1
-#define CEC_PORT	0
-
 /**
  * struct cros_ec_cec_port - Driver data for a single EC CEC port
  *
@@ -358,6 +354,38 @@ static struct device *cros_ec_cec_find_hdmi_dev(struct device *dev,
 
 #endif
 
+static int cros_ec_cec_get_num_ports(struct cros_ec_cec *cros_ec_cec)
+{
+	struct ec_response_cec_port_count response;
+	int ret;
+
+	ret = cros_ec_cmd(cros_ec_cec->cros_ec, 0, EC_CMD_CEC_PORT_COUNT, NULL,
+			  0, &response, sizeof(response));
+	if (ret < 0) {
+		/*
+		 * Old EC firmware only supports one port and does not support
+		 * the port count command, so fall back to assuming one port.
+		 */
+		cros_ec_cec->num_ports = 1;
+		return 0;
+	}
+
+	if (response.port_count == 0) {
+		dev_err(cros_ec_cec->cros_ec->dev,
+			"EC reports 0 CEC ports\n");
+		return -ENODEV;
+	}
+
+	if (response.port_count > EC_CEC_MAX_PORTS) {
+		dev_err(cros_ec_cec->cros_ec->dev,
+			"EC reports too many ports: %d\n", response.port_count);
+		return -EINVAL;
+	}
+
+	cros_ec_cec->num_ports = response.port_count;
+	return 0;
+}
+
 static int cros_ec_cec_get_write_cmd_version(struct cros_ec_cec *cros_ec_cec)
 {
 	struct cros_ec_device *cros_ec = cros_ec_cec->cros_ec;
@@ -463,7 +491,9 @@ static int cros_ec_cec_probe(struct platform_device *pdev)
 
 	device_init_wakeup(&pdev->dev, 1);
 
-	cros_ec_cec->num_ports = CEC_NUM_PORTS;
+	ret = cros_ec_cec_get_num_ports(cros_ec_cec);
+	if (ret)
+		return ret;
 
 	ret = cros_ec_cec_get_write_cmd_version(cros_ec_cec);
 	if (ret)
diff --git a/include/linux/platform_data/cros_ec_commands.h b/include/linux/platform_data/cros_ec_commands.h
index ad61c7ff0b28..7dae17b62a4d 100644
--- a/include/linux/platform_data/cros_ec_commands.h
+++ b/include/linux/platform_data/cros_ec_commands.h
@@ -4536,6 +4536,17 @@ struct ec_response_cec_get {
 	uint8_t val;
 } __ec_align1;
 
+/* Get the number of CEC ports */
+#define EC_CMD_CEC_PORT_COUNT 0x00C1
+
+/**
+ * struct ec_response_cec_port_count - CEC port count response
+ * @port_count: number of CEC ports
+ */
+struct ec_response_cec_port_count {
+	uint8_t port_count;
+} __ec_align1;
+
 /* CEC parameters command */
 enum cec_command {
 	/* CEC reading, writing and events enable */
-- 
cgit v1.2.3


From 24775700eaa93ff83b2a0f1e005879cdf186cdd9 Mon Sep 17 00:00:00 2001
From: Muralidhara M K <muralidhara.mk@amd.com>
Date: Tue, 26 Sep 2023 05:19:32 +0000
Subject: x86/amd_nb: Add AMD Family MI300 PCI IDs

Add new Root, Device 18h Function 3, and Function 4 PCI IDS
for AMD F19h Model 90h-9fh (MI300A).

Signed-off-by: Muralidhara M K <muralidhara.mk@amd.com>
Signed-off-by: Suma Hegde <suma.hegde@amd.com>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Acked-by: Bjorn Helgaas <bhelgaas@google.com>
Link: https://lore.kernel.org/r/20230926051932.193239-1-suma.hegde@amd.com
---
 arch/x86/kernel/amd_nb.c | 5 +++++
 include/linux/pci_ids.h  | 1 +
 2 files changed, 6 insertions(+)

(limited to 'include/linux')

diff --git a/arch/x86/kernel/amd_nb.c b/arch/x86/kernel/amd_nb.c
index 356de955e78d..10c2a3c9114e 100644
--- a/arch/x86/kernel/amd_nb.c
+++ b/arch/x86/kernel/amd_nb.c
@@ -27,6 +27,7 @@
 #define PCI_DEVICE_ID_AMD_1AH_M00H_ROOT		0x153a
 #define PCI_DEVICE_ID_AMD_1AH_M20H_ROOT		0x1507
 #define PCI_DEVICE_ID_AMD_MI200_ROOT		0x14bb
+#define PCI_DEVICE_ID_AMD_MI300_ROOT		0x14f8
 
 #define PCI_DEVICE_ID_AMD_17H_DF_F4		0x1464
 #define PCI_DEVICE_ID_AMD_17H_M10H_DF_F4	0x15ec
@@ -43,6 +44,7 @@
 #define PCI_DEVICE_ID_AMD_19H_M78H_DF_F4	0x12fc
 #define PCI_DEVICE_ID_AMD_1AH_M00H_DF_F4	0x12c4
 #define PCI_DEVICE_ID_AMD_MI200_DF_F4		0x14d4
+#define PCI_DEVICE_ID_AMD_MI300_DF_F4		0x152c
 
 /* Protect the PCI config register pairs used for SMN. */
 static DEFINE_MUTEX(smn_mutex);
@@ -62,6 +64,7 @@ static const struct pci_device_id amd_root_ids[] = {
 	{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_1AH_M00H_ROOT) },
 	{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_1AH_M20H_ROOT) },
 	{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_MI200_ROOT) },
+	{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_MI300_ROOT) },
 	{}
 };
 
@@ -93,6 +96,7 @@ static const struct pci_device_id amd_nb_misc_ids[] = {
 	{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_1AH_M00H_DF_F3) },
 	{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_1AH_M20H_DF_F3) },
 	{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_MI200_DF_F3) },
+	{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_MI300_DF_F3) },
 	{}
 };
 
@@ -115,6 +119,7 @@ static const struct pci_device_id amd_nb_link_ids[] = {
 	{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_CNB17H_F4) },
 	{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_1AH_M00H_DF_F4) },
 	{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_MI200_DF_F4) },
+	{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_MI300_DF_F4) },
 	{}
 };
 
diff --git a/include/linux/pci_ids.h b/include/linux/pci_ids.h
index 5fb3d4c393a9..91b457de262e 100644
--- a/include/linux/pci_ids.h
+++ b/include/linux/pci_ids.h
@@ -579,6 +579,7 @@
 #define PCI_DEVICE_ID_AMD_1AH_M00H_DF_F3 0x12c3
 #define PCI_DEVICE_ID_AMD_1AH_M20H_DF_F3 0x16fb
 #define PCI_DEVICE_ID_AMD_MI200_DF_F3	0x14d3
+#define PCI_DEVICE_ID_AMD_MI300_DF_F3	0x152b
 #define PCI_DEVICE_ID_AMD_CNB17H_F3	0x1703
 #define PCI_DEVICE_ID_AMD_LANCE		0x2000
 #define PCI_DEVICE_ID_AMD_LANCE_HOME	0x2001
-- 
cgit v1.2.3


From 2d5780bbef8dbe6375d481cbea212606a80e4453 Mon Sep 17 00:00:00 2001
From: Petr Tesarik <petr@tesarici.cz>
Date: Tue, 26 Sep 2023 20:55:56 +0200
Subject: swiotlb: fix the check whether a device has used software IO TLB

When CONFIG_SWIOTLB_DYNAMIC=y, devices which do not use the software IO TLB
can avoid swiotlb lookup. A flag is added by commit 1395706a1490 ("swiotlb:
search the software IO TLB only if the device makes use of it"), the flag
is correctly set, but it is then never checked. Add the actual check here.

Note that this code is an alternative to the default pool check, not an
additional check, because:

1. swiotlb_find_pool() also searches the default pool;
2. if dma_uses_io_tlb is false, the default swiotlb pool is not used.

Tested in a KVM guest against a QEMU RAM-backed SATA disk over virtio and
*not* using software IO TLB, this patch increases IOPS by approx 2% for
4-way parallel I/O.

The write memory barrier in swiotlb_dyn_alloc() is not needed, because a
newly allocated pool must always be observed by swiotlb_find_slots() before
an address from that pool is passed to is_swiotlb_buffer().

Correctness was verified using the following litmus test:

C swiotlb-new-pool

(*
 * Result: Never
 *
 * Check that a newly allocated pool is always visible when the
 *  corresponding swiotlb buffer is visible.
 *)

{
	mem_pools = default;
}

P0(int **mem_pools, int *pool)
{
	/* add_mem_pool() */
	WRITE_ONCE(*pool, 999);
	rcu_assign_pointer(*mem_pools, pool);
}

P1(int **mem_pools, int *flag, int *buf)
{
	/* swiotlb_find_slots() */
	int *r0;
	int r1;

	rcu_read_lock();
	r0 = READ_ONCE(*mem_pools);
	r1 = READ_ONCE(*r0);
	rcu_read_unlock();

	if (r1) {
		WRITE_ONCE(*flag, 1);
		smp_mb();
	}

	/* device driver (presumed) */
	WRITE_ONCE(*buf, r1);
}

P2(int **mem_pools, int *flag, int *buf)
{
	/* device driver (presumed) */
	int r0 = READ_ONCE(*buf);

	/* is_swiotlb_buffer() */
	int r1;
	int *r2;
	int r3;

	smp_rmb();
	r1 = READ_ONCE(*flag);
	if (r1) {
		/* swiotlb_find_pool() */
		rcu_read_lock();
		r2 = READ_ONCE(*mem_pools);
		r3 = READ_ONCE(*r2);
		rcu_read_unlock();
	}
}

exists (2:r0<>0 /\ 2:r3=0) (* Not found. *)

Fixes: 1395706a1490 ("swiotlb: search the software IO TLB only if the device makes use of it")
Reported-by: Jonathan Corbet <corbet@lwn.net>
Closes: https://lore.kernel.org/linux-iommu/87a5uz3ob8.fsf@meer.lwn.net/
Signed-off-by: Petr Tesarik <petr@tesarici.cz>
Reviewed-by: Catalin Marinas <catalin.marinas@arm.com>
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 include/linux/swiotlb.h | 23 ++++++++++++++++-------
 kernel/dma/swiotlb.c    | 26 ++++++++++++++++++++------
 2 files changed, 36 insertions(+), 13 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/swiotlb.h b/include/linux/swiotlb.h
index b4536626f8ff..ecde0312dd52 100644
--- a/include/linux/swiotlb.h
+++ b/include/linux/swiotlb.h
@@ -172,14 +172,23 @@ static inline bool is_swiotlb_buffer(struct device *dev, phys_addr_t paddr)
 	if (!mem)
 		return false;
 
-	if (IS_ENABLED(CONFIG_SWIOTLB_DYNAMIC)) {
-		/* Pairs with smp_wmb() in swiotlb_find_slots() and
-		 * swiotlb_dyn_alloc(), which modify the RCU lists.
-		 */
-		smp_rmb();
-		return swiotlb_find_pool(dev, paddr);
-	}
+#ifdef CONFIG_SWIOTLB_DYNAMIC
+	/*
+	 * All SWIOTLB buffer addresses must have been returned by
+	 * swiotlb_tbl_map_single() and passed to a device driver.
+	 * If a SWIOTLB address is checked on another CPU, then it was
+	 * presumably loaded by the device driver from an unspecified private
+	 * data structure. Make sure that this load is ordered before reading
+	 * dev->dma_uses_io_tlb here and mem->pools in swiotlb_find_pool().
+	 *
+	 * This barrier pairs with smp_mb() in swiotlb_find_slots().
+	 */
+	smp_rmb();
+	return READ_ONCE(dev->dma_uses_io_tlb) &&
+		swiotlb_find_pool(dev, paddr);
+#else
 	return paddr >= mem->defpool.start && paddr < mem->defpool.end;
+#endif
 }
 
 static inline bool is_swiotlb_force_bounce(struct device *dev)
diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c
index 85dd94323b98..01637677736f 100644
--- a/kernel/dma/swiotlb.c
+++ b/kernel/dma/swiotlb.c
@@ -728,9 +728,6 @@ static void swiotlb_dyn_alloc(struct work_struct *work)
 	}
 
 	add_mem_pool(mem, pool);
-
-	/* Pairs with smp_rmb() in is_swiotlb_buffer(). */
-	smp_wmb();
 }
 
 /**
@@ -1151,9 +1148,26 @@ static int swiotlb_find_slots(struct device *dev, phys_addr_t orig_addr,
 	spin_unlock_irqrestore(&dev->dma_io_tlb_lock, flags);
 
 found:
-	dev->dma_uses_io_tlb = true;
-	/* Pairs with smp_rmb() in is_swiotlb_buffer() */
-	smp_wmb();
+	WRITE_ONCE(dev->dma_uses_io_tlb, true);
+
+	/*
+	 * The general barrier orders reads and writes against a presumed store
+	 * of the SWIOTLB buffer address by a device driver (to a driver private
+	 * data structure). It serves two purposes.
+	 *
+	 * First, the store to dev->dma_uses_io_tlb must be ordered before the
+	 * presumed store. This guarantees that the returned buffer address
+	 * cannot be passed to another CPU before updating dev->dma_uses_io_tlb.
+	 *
+	 * Second, the load from mem->pools must be ordered before the same
+	 * presumed store. This guarantees that the returned buffer address
+	 * cannot be observed by another CPU before an update of the RCU list
+	 * that was made by swiotlb_dyn_alloc() on a third CPU (cf. multicopy
+	 * atomicity).
+	 *
+	 * See also the comment in is_swiotlb_buffer().
+	 */
+	smp_mb();
 
 	*retpool = pool;
 	return index;
-- 
cgit v1.2.3


From 2e2b547950bc09e75afe912f9683be39c2195d9d Mon Sep 17 00:00:00 2001
From: Wenchao Chen <wenchao.chen@unisoc.com>
Date: Tue, 19 Sep 2023 15:47:06 +0800
Subject: mmc: core: Allow dynamical updates of the number of requests for hsq

To allow dynamical updates of the current number of used in-flight
requests, let's move away from using a hard-coded value to a use a
corresponding variable in the struct mmc_host.

This can be valuable when optimizing for certain I/O request sequences, as
shown by subsequent changes.

Signed-off-by: Wenchao Chen <wenchao.chen@unisoc.com>
Link: https://lore.kernel.org/r/20230919074707.25517-2-wenchao.chen@unisoc.com
[Ulf: Re-wrote the commitmsg to clarify the change]
Signed-off-by: Ulf Hansson <ulf.hansson@linaro.org>
---
 drivers/mmc/core/queue.c   | 6 +-----
 drivers/mmc/host/mmc_hsq.c | 1 +
 drivers/mmc/host/mmc_hsq.h | 6 ++++++
 include/linux/mmc/host.h   | 1 +
 4 files changed, 9 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/mmc/core/queue.c b/drivers/mmc/core/queue.c
index b396e3900717..a0a2412f62a7 100644
--- a/drivers/mmc/core/queue.c
+++ b/drivers/mmc/core/queue.c
@@ -260,11 +260,7 @@ static blk_status_t mmc_mq_queue_rq(struct blk_mq_hw_ctx *hctx,
 		}
 		break;
 	case MMC_ISSUE_ASYNC:
-		/*
-		 * For MMC host software queue, we only allow 2 requests in
-		 * flight to avoid a long latency.
-		 */
-		if (host->hsq_enabled && mq->in_flight[issue_type] > 2) {
+		if (host->hsq_enabled && mq->in_flight[issue_type] > host->hsq_depth) {
 			spin_unlock_irq(&mq->lock);
 			return BLK_STS_RESOURCE;
 		}
diff --git a/drivers/mmc/host/mmc_hsq.c b/drivers/mmc/host/mmc_hsq.c
index 424dc7b07858..8556cacb21a1 100644
--- a/drivers/mmc/host/mmc_hsq.c
+++ b/drivers/mmc/host/mmc_hsq.c
@@ -337,6 +337,7 @@ int mmc_hsq_init(struct mmc_hsq *hsq, struct mmc_host *mmc)
 	hsq->mmc = mmc;
 	hsq->mmc->cqe_private = hsq;
 	mmc->cqe_ops = &mmc_hsq_ops;
+	mmc->hsq_depth = HSQ_NORMAL_DEPTH;
 
 	for (i = 0; i < HSQ_NUM_SLOTS; i++)
 		hsq->tag_slot[i] = HSQ_INVALID_TAG;
diff --git a/drivers/mmc/host/mmc_hsq.h b/drivers/mmc/host/mmc_hsq.h
index 1808024fc6c5..aa5c4543b55f 100644
--- a/drivers/mmc/host/mmc_hsq.h
+++ b/drivers/mmc/host/mmc_hsq.h
@@ -5,6 +5,12 @@
 #define HSQ_NUM_SLOTS	64
 #define HSQ_INVALID_TAG	HSQ_NUM_SLOTS
 
+/*
+ * For MMC host software queue, we only allow 2 requests in
+ * flight to avoid a long latency.
+ */
+#define HSQ_NORMAL_DEPTH	2
+
 struct hsq_slot {
 	struct mmc_request *mrq;
 };
diff --git a/include/linux/mmc/host.h b/include/linux/mmc/host.h
index 62a6847a3b6f..2f445c651742 100644
--- a/include/linux/mmc/host.h
+++ b/include/linux/mmc/host.h
@@ -526,6 +526,7 @@ struct mmc_host {
 
 	/* Host Software Queue support */
 	bool			hsq_enabled;
+	int			hsq_depth;
 
 	u32			err_stats[MMC_ERR_MAX];
 	unsigned long		private[] ____cacheline_aligned;
-- 
cgit v1.2.3


From 1a6a464774947920dcedcf7409be62495c7cedd0 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <frederic@kernel.org>
Date: Tue, 12 Sep 2023 12:44:06 +0200
Subject: timers: Tag (hr)timer softirq as hotplug safe

Specific stress involving frequent CPU-hotplug operations, such as
running rcutorture for example, may trigger the following message:

  NOHZ tick-stop error: local softirq work is pending, handler #02!!!"

This happens in the CPU-down hotplug process, after
CPUHP_AP_SMPBOOT_THREADS whose teardown callback parks ksoftirqd, and
before the target CPU shuts down through CPUHP_AP_IDLE_DEAD. In this
fragile intermediate state, softirqs waiting for threaded handling may be
forever ignored and eventually reported by the idle task as in the above
example.

However some vectors are known to be safe as long as the corresponding
subsystems have teardown callbacks handling the migration of their
events. The above error message reports pending timers softirq although
this vector can be considered as hotplug safe because the
CPUHP_TIMERS_PREPARE teardown callback performs the necessary migration
of timers after the death of the CPU. Hrtimers also have a similar
hotplug handling.

Therefore this error message, as far as (hr-)timers are concerned, can
be considered spurious and the relevant softirq vectors can be marked as
hotplug safe.

Fixes: 0345691b24c0 ("tick/rcu: Stop allowing RCU_SOFTIRQ in idle")
Signed-off-by: Frederic Weisbecker <frederic@kernel.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Joel Fernandes (Google) <joel@joelfernandes.org>
Cc: stable@vger.kernel.org
Link: https://lore.kernel.org/r/20230912104406.312185-6-frederic@kernel.org
---
 include/linux/interrupt.h | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h
index a92bce40b04b..4a1dc88ddbff 100644
--- a/include/linux/interrupt.h
+++ b/include/linux/interrupt.h
@@ -569,8 +569,12 @@ enum
  * 	2) rcu_report_dead() reports the final quiescent states.
  *
  * _ IRQ_POLL: irq_poll_cpu_dead() migrates the queue
+ *
+ * _ (HR)TIMER_SOFTIRQ: (hr)timers_dead_cpu() migrates the queue
  */
-#define SOFTIRQ_HOTPLUG_SAFE_MASK (BIT(RCU_SOFTIRQ) | BIT(IRQ_POLL_SOFTIRQ))
+#define SOFTIRQ_HOTPLUG_SAFE_MASK (BIT(TIMER_SOFTIRQ) | BIT(IRQ_POLL_SOFTIRQ) |\
+				   BIT(HRTIMER_SOFTIRQ) | BIT(RCU_SOFTIRQ))
+
 
 /* map softirq index to softirq name. update 'softirq_to_name' in
  * kernel/softirq.c when adding a new softirq.
-- 
cgit v1.2.3


From c02a427f7b64ed5b840a0720a6cee5a17a1e7e07 Mon Sep 17 00:00:00 2001
From: Xueshi Hu <xueshi.hu@smartx.com>
Date: Tue, 12 Sep 2023 12:44:05 +0200
Subject: tick/nohz: Remove unused tick_nohz_idle_stop_tick_protected()

All the caller has been removed since commit 336f560a8917 ("x86/xen: don't
let xen_pv_play_dead() return")

Signed-off-by: Xueshi Hu <xueshi.hu@smartx.com>
Signed-off-by: Frederic Weisbecker <frederic@kernel.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Link: https://lore.kernel.org/r/20230912104406.312185-5-frederic@kernel.org
---
 include/linux/tick.h | 10 ----------
 1 file changed, 10 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/tick.h b/include/linux/tick.h
index 9459fef5b857..716d17f31c45 100644
--- a/include/linux/tick.h
+++ b/include/linux/tick.h
@@ -140,14 +140,6 @@ extern unsigned long tick_nohz_get_idle_calls(void);
 extern unsigned long tick_nohz_get_idle_calls_cpu(int cpu);
 extern u64 get_cpu_idle_time_us(int cpu, u64 *last_update_time);
 extern u64 get_cpu_iowait_time_us(int cpu, u64 *last_update_time);
-
-static inline void tick_nohz_idle_stop_tick_protected(void)
-{
-	local_irq_disable();
-	tick_nohz_idle_stop_tick();
-	local_irq_enable();
-}
-
 #else /* !CONFIG_NO_HZ_COMMON */
 #define tick_nohz_enabled (0)
 static inline int tick_nohz_tick_stopped(void) { return 0; }
@@ -170,8 +162,6 @@ static inline ktime_t tick_nohz_get_sleep_length(ktime_t *delta_next)
 }
 static inline u64 get_cpu_idle_time_us(int cpu, u64 *unused) { return -1; }
 static inline u64 get_cpu_iowait_time_us(int cpu, u64 *unused) { return -1; }
-
-static inline void tick_nohz_idle_stop_tick_protected(void) { }
 #endif /* !CONFIG_NO_HZ_COMMON */
 
 #ifdef CONFIG_NO_HZ_FULL
-- 
cgit v1.2.3


From 6260ecd04594360ae2af104fb2641317728a66e4 Mon Sep 17 00:00:00 2001
From: Kees Cook <keescook@chromium.org>
Date: Fri, 22 Sep 2023 10:51:27 -0700
Subject: irqdomain: Annotate struct irq_domain with __counted_by

Prepare for the coming implementation by GCC and Clang of the __counted_by
attribute. Flexible array members annotated with __counted_by can have
their accesses bounds-checked at run-time checking via CONFIG_UBSAN_BOUNDS
(for array indexing) and CONFIG_FORTIFY_SOURCE (for strcpy/memcpy-family
functions).

As found with Coccinelle[1], add __counted_by for struct irq_domain.

[1] https://github.com/kees/kernel-tools/blob/trunk/coccinelle/examples/counted_by.cocci

Signed-off-by: Kees Cook <keescook@chromium.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Gustavo A. R. Silva <gustavoars@kernel.org>
Link: https://lore.kernel.org/r/20230922175127.work.214-kees@kernel.org
---
 include/linux/irqdomain.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/irqdomain.h b/include/linux/irqdomain.h
index 51c254b7fec2..ee0a82c60508 100644
--- a/include/linux/irqdomain.h
+++ b/include/linux/irqdomain.h
@@ -174,7 +174,7 @@ struct irq_domain {
 	irq_hw_number_t			hwirq_max;
 	unsigned int			revmap_size;
 	struct radix_tree_root		revmap_tree;
-	struct irq_data __rcu		*revmap[];
+	struct irq_data __rcu		*revmap[] __counted_by(revmap_size);
 };
 
 /* Irq domain flags */
-- 
cgit v1.2.3


From d069ed6b752f91cea6341a9c60be42837678a7f5 Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rafael.j.wysocki@intel.com>
Date: Thu, 21 Sep 2023 20:01:43 +0200
Subject: thermal: core: Allow trip pointers to be used for cooling device
 binding

Add new helper functions, thermal_bind_cdev_to_trip() and
thermal_unbind_cdev_from_trip(), to allow a trip pointer to be used for
binding a cooling device to a trip point and unbinding it, respectively,
and redefine the existing helpers, thermal_zone_bind_cooling_device()
and thermal_zone_unbind_cooling_device(), as wrappers around the new
ones, respectively.

No intentional functional impact.

Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Reviewed-by: Daniel Lezcano <daniel.lezcano@linaro.org>
---
 drivers/thermal/thermal_core.c | 54 ++++++++++++++++++++++++++----------------
 include/linux/thermal.h        |  8 +++++++
 2 files changed, 42 insertions(+), 20 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/thermal/thermal_core.c b/drivers/thermal/thermal_core.c
index 31fe14a96d13..45d0aa0b69b7 100644
--- a/drivers/thermal/thermal_core.c
+++ b/drivers/thermal/thermal_core.c
@@ -600,10 +600,9 @@ struct thermal_zone_device *thermal_zone_get_by_id(int id)
  */
 
 /**
- * thermal_zone_bind_cooling_device() - bind a cooling device to a thermal zone
+ * thermal_bind_cdev_to_trip - bind a cooling device to a thermal zone
  * @tz:		pointer to struct thermal_zone_device
- * @trip_index:	indicates which trip point the cooling devices is
- *		associated with in this thermal zone.
+ * @trip:	trip point the cooling devices is associated with in this zone.
  * @cdev:	pointer to struct thermal_cooling_device
  * @upper:	the Maximum cooling state for this trip point.
  *		THERMAL_NO_LIMIT means no upper limit,
@@ -621,8 +620,8 @@ struct thermal_zone_device *thermal_zone_get_by_id(int id)
  *
  * Return: 0 on success, the proper error value otherwise.
  */
-int thermal_zone_bind_cooling_device(struct thermal_zone_device *tz,
-				     int trip_index,
+int thermal_bind_cdev_to_trip(struct thermal_zone_device *tz,
+				     const struct thermal_trip *trip,
 				     struct thermal_cooling_device *cdev,
 				     unsigned long upper, unsigned long lower,
 				     unsigned int weight)
@@ -631,15 +630,9 @@ int thermal_zone_bind_cooling_device(struct thermal_zone_device *tz,
 	struct thermal_instance *pos;
 	struct thermal_zone_device *pos1;
 	struct thermal_cooling_device *pos2;
-	const struct thermal_trip *trip;
 	bool upper_no_limit;
 	int result;
 
-	if (trip_index >= tz->num_trips || trip_index < 0)
-		return -EINVAL;
-
-	trip = &tz->trips[trip_index];
-
 	list_for_each_entry(pos1, &thermal_tz_list, node) {
 		if (pos1 == tz)
 			break;
@@ -736,14 +729,26 @@ free_mem:
 	kfree(dev);
 	return result;
 }
+EXPORT_SYMBOL_GPL(thermal_bind_cdev_to_trip);
+
+int thermal_zone_bind_cooling_device(struct thermal_zone_device *tz,
+				     int trip_index,
+				     struct thermal_cooling_device *cdev,
+				     unsigned long upper, unsigned long lower,
+				     unsigned int weight)
+{
+	if (trip_index < 0 || trip_index >= tz->num_trips)
+		return -EINVAL;
+
+	return thermal_bind_cdev_to_trip(tz, &tz->trips[trip_index], cdev,
+					 upper, lower, weight);
+}
 EXPORT_SYMBOL_GPL(thermal_zone_bind_cooling_device);
 
 /**
- * thermal_zone_unbind_cooling_device() - unbind a cooling device from a
- *					  thermal zone.
+ * thermal_unbind_cdev_from_trip - unbind a cooling device from a thermal zone.
  * @tz:		pointer to a struct thermal_zone_device.
- * @trip_index:	indicates which trip point the cooling devices is
- *		associated with in this thermal zone.
+ * @trip:	trip point the cooling devices is associated with in this zone.
  * @cdev:	pointer to a struct thermal_cooling_device.
  *
  * This interface function unbind a thermal cooling device from the certain
@@ -752,16 +757,14 @@ EXPORT_SYMBOL_GPL(thermal_zone_bind_cooling_device);
  *
  * Return: 0 on success, the proper error value otherwise.
  */
-int thermal_zone_unbind_cooling_device(struct thermal_zone_device *tz,
-				       int trip_index,
-				       struct thermal_cooling_device *cdev)
+int thermal_unbind_cdev_from_trip(struct thermal_zone_device *tz,
+				  const struct thermal_trip *trip,
+				  struct thermal_cooling_device *cdev)
 {
 	struct thermal_instance *pos, *next;
-	const struct thermal_trip *trip;
 
 	mutex_lock(&tz->lock);
 	mutex_lock(&cdev->lock);
-	trip = &tz->trips[trip_index];
 	list_for_each_entry_safe(pos, next, &tz->thermal_instances, tz_node) {
 		if (pos->tz == tz && pos->trip == trip && pos->cdev == cdev) {
 			list_del(&pos->tz_node);
@@ -784,6 +787,17 @@ unbind:
 	kfree(pos);
 	return 0;
 }
+EXPORT_SYMBOL_GPL(thermal_unbind_cdev_from_trip);
+
+int thermal_zone_unbind_cooling_device(struct thermal_zone_device *tz,
+				       int trip_index,
+				       struct thermal_cooling_device *cdev)
+{
+	if (trip_index < 0 || trip_index >= tz->num_trips)
+		return -EINVAL;
+
+	return thermal_unbind_cdev_from_trip(tz, &tz->trips[trip_index], cdev);
+}
 EXPORT_SYMBOL_GPL(thermal_zone_unbind_cooling_device);
 
 static void thermal_release(struct device *dev)
diff --git a/include/linux/thermal.h b/include/linux/thermal.h
index 6cfcae22ba12..6710a4ace992 100644
--- a/include/linux/thermal.h
+++ b/include/linux/thermal.h
@@ -320,10 +320,18 @@ const char *thermal_zone_device_type(struct thermal_zone_device *tzd);
 int thermal_zone_device_id(struct thermal_zone_device *tzd);
 struct device *thermal_zone_device(struct thermal_zone_device *tzd);
 
+int thermal_bind_cdev_to_trip(struct thermal_zone_device *tz,
+			      const struct thermal_trip *trip,
+			      struct thermal_cooling_device *cdev,
+			      unsigned long upper, unsigned long lower,
+			      unsigned int weight);
 int thermal_zone_bind_cooling_device(struct thermal_zone_device *, int,
 				     struct thermal_cooling_device *,
 				     unsigned long, unsigned long,
 				     unsigned int);
+int thermal_unbind_cdev_from_trip(struct thermal_zone_device *tz,
+				  const struct thermal_trip *trip,
+				  struct thermal_cooling_device *cdev);
 int thermal_zone_unbind_cooling_device(struct thermal_zone_device *, int,
 				       struct thermal_cooling_device *);
 void thermal_zone_device_update(struct thermal_zone_device *,
-- 
cgit v1.2.3


From 5aa4c9608d2d5fea29e211a80c29696f7d94e9f7 Mon Sep 17 00:00:00 2001
From: Yishai Hadas <yishaih@nvidia.com>
Date: Mon, 11 Sep 2023 12:38:48 +0300
Subject: net/mlx5: Introduce ifc bits for migration in a chunk mode

Introduce ifc related stuff to enable migration in a chunk mode.

Signed-off-by: Yishai Hadas <yishaih@nvidia.com>
Link: https://lore.kernel.org/r/20230911093856.81910-2-yishaih@nvidia.com
Reviewed-by: Jason Gunthorpe <jgg@nvidia.com>
Signed-off-by: Leon Romanovsky <leon@kernel.org>
---
 include/linux/mlx5/mlx5_ifc.h | 15 +++++++++++----
 1 file changed, 11 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h
index fc3db401f8a2..3265bfcb3156 100644
--- a/include/linux/mlx5/mlx5_ifc.h
+++ b/include/linux/mlx5/mlx5_ifc.h
@@ -1948,7 +1948,9 @@ struct mlx5_ifc_cmd_hca_cap_2_bits {
 	u8	   reserved_at_c0[0x8];
 	u8	   migration_multi_load[0x1];
 	u8	   migration_tracking_state[0x1];
-	u8	   reserved_at_ca[0x16];
+	u8	   reserved_at_ca[0x6];
+	u8	   migration_in_chunks[0x1];
+	u8	   reserved_at_d1[0xf];
 
 	u8	   reserved_at_e0[0xc0];
 
@@ -12392,7 +12394,8 @@ struct mlx5_ifc_query_vhca_migration_state_in_bits {
 	u8         op_mod[0x10];
 
 	u8         incremental[0x1];
-	u8         reserved_at_41[0xf];
+	u8         chunk[0x1];
+	u8         reserved_at_42[0xe];
 	u8         vhca_id[0x10];
 
 	u8         reserved_at_60[0x20];
@@ -12408,7 +12411,11 @@ struct mlx5_ifc_query_vhca_migration_state_out_bits {
 
 	u8         required_umem_size[0x20];
 
-	u8         reserved_at_a0[0x160];
+	u8         reserved_at_a0[0x20];
+
+	u8         remaining_total_size[0x40];
+
+	u8         reserved_at_100[0x100];
 };
 
 struct mlx5_ifc_save_vhca_state_in_bits {
@@ -12440,7 +12447,7 @@ struct mlx5_ifc_save_vhca_state_out_bits {
 
 	u8         actual_image_size[0x20];
 
-	u8         reserved_at_60[0x20];
+	u8         next_required_umem_size[0x20];
 };
 
 struct mlx5_ifc_load_vhca_state_in_bits {
-- 
cgit v1.2.3


From fb99ef17865035a6657786d4b2af11a27ba23f9b Mon Sep 17 00:00:00 2001
From: Damien Le Moal <dlemoal@kernel.org>
Date: Fri, 25 Aug 2023 15:41:14 +0900
Subject: ata: libata-scsi: link ata port and scsi device

There is no direct device ancestry defined between an ata_device and
its scsi device which prevents the power management code from correctly
ordering suspend and resume operations. Create such ancestry with the
ata device as the parent to ensure that the scsi device (child) is
suspended before the ata device and that resume handles the ata device
before the scsi device.

The parent-child (supplier-consumer) relationship is established between
the ata_port (parent) and the scsi device (child) with the function
device_add_link(). The parent used is not the ata_device as the PM
operations are defined per port and the status of all devices connected
through that port is controlled from the port operations.

The device link is established with the new function
ata_scsi_slave_alloc(), and this function is used to define the
->slave_alloc callback of the scsi host template of all ata drivers.

Fixes: a19a93e4c6a9 ("scsi: core: pm: Rely on the device driver core for async power management")
Cc: stable@vger.kernel.org
Signed-off-by: Damien Le Moal <dlemoal@kernel.org>
Reviewed-by: Hannes Reinecke <hare@suse.de>
Reviewed-by: Niklas Cassel <niklas.cassel@wdc.com>
Tested-by: Geert Uytterhoeven <geert+renesas@glider.be>
Reviewed-by: Martin K. Petersen <martin.petersen@oracle.com>
Reviewed-by: John Garry <john.g.garry@oracle.com>
---
 drivers/ata/libata-scsi.c | 45 ++++++++++++++++++++++++++++++++++++++++-----
 include/linux/libata.h    |  2 ++
 2 files changed, 42 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/ata/libata-scsi.c b/drivers/ata/libata-scsi.c
index fb73c145b49a..8b43290ca2cd 100644
--- a/drivers/ata/libata-scsi.c
+++ b/drivers/ata/libata-scsi.c
@@ -1089,6 +1089,42 @@ int ata_scsi_dev_config(struct scsi_device *sdev, struct ata_device *dev)
 	return 0;
 }
 
+/**
+ *	ata_scsi_slave_alloc - Early setup of SCSI device
+ *	@sdev: SCSI device to examine
+ *
+ *	This is called from scsi_alloc_sdev() when the scsi device
+ *	associated with an ATA device is scanned on a port.
+ *
+ *	LOCKING:
+ *	Defined by SCSI layer.  We don't really care.
+ */
+
+int ata_scsi_slave_alloc(struct scsi_device *sdev)
+{
+	struct ata_port *ap = ata_shost_to_port(sdev->host);
+	struct device_link *link;
+
+	ata_scsi_sdev_config(sdev);
+
+	/*
+	 * Create a link from the ata_port device to the scsi device to ensure
+	 * that PM does suspend/resume in the correct order: the scsi device is
+	 * consumer (child) and the ata port the supplier (parent).
+	 */
+	link = device_link_add(&sdev->sdev_gendev, &ap->tdev,
+			       DL_FLAG_STATELESS |
+			       DL_FLAG_PM_RUNTIME | DL_FLAG_RPM_ACTIVE);
+	if (!link) {
+		ata_port_err(ap, "Failed to create link to scsi device %s\n",
+			     dev_name(&sdev->sdev_gendev));
+		return -ENODEV;
+	}
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(ata_scsi_slave_alloc);
+
 /**
  *	ata_scsi_slave_config - Set SCSI device attributes
  *	@sdev: SCSI device to examine
@@ -1105,14 +1141,11 @@ int ata_scsi_slave_config(struct scsi_device *sdev)
 {
 	struct ata_port *ap = ata_shost_to_port(sdev->host);
 	struct ata_device *dev = __ata_scsi_find_dev(ap, sdev);
-	int rc = 0;
-
-	ata_scsi_sdev_config(sdev);
 
 	if (dev)
-		rc = ata_scsi_dev_config(sdev, dev);
+		return ata_scsi_dev_config(sdev, dev);
 
-	return rc;
+	return 0;
 }
 EXPORT_SYMBOL_GPL(ata_scsi_slave_config);
 
@@ -1136,6 +1169,8 @@ void ata_scsi_slave_destroy(struct scsi_device *sdev)
 	unsigned long flags;
 	struct ata_device *dev;
 
+	device_link_remove(&sdev->sdev_gendev, &ap->tdev);
+
 	spin_lock_irqsave(ap->lock, flags);
 	dev = __ata_scsi_find_dev(ap, sdev);
 	if (dev && dev->sdev) {
diff --git a/include/linux/libata.h b/include/linux/libata.h
index 84aca8c44fa3..3ce1ab408114 100644
--- a/include/linux/libata.h
+++ b/include/linux/libata.h
@@ -1148,6 +1148,7 @@ extern int ata_std_bios_param(struct scsi_device *sdev,
 			      struct block_device *bdev,
 			      sector_t capacity, int geom[]);
 extern void ata_scsi_unlock_native_capacity(struct scsi_device *sdev);
+extern int ata_scsi_slave_alloc(struct scsi_device *sdev);
 extern int ata_scsi_slave_config(struct scsi_device *sdev);
 extern void ata_scsi_slave_destroy(struct scsi_device *sdev);
 extern int ata_scsi_change_queue_depth(struct scsi_device *sdev,
@@ -1396,6 +1397,7 @@ extern const struct attribute_group *ata_common_sdev_groups[];
 	.this_id		= ATA_SHT_THIS_ID,		\
 	.emulated		= ATA_SHT_EMULATED,		\
 	.proc_name		= drv_name,			\
+	.slave_alloc		= ata_scsi_slave_alloc,		\
 	.slave_destroy		= ata_scsi_slave_destroy,	\
 	.bios_param		= ata_std_bios_param,		\
 	.unlock_native_capacity	= ata_scsi_unlock_native_capacity,\
-- 
cgit v1.2.3


From aa3998dbeb3abce63653b7f6d4542e7dcd022590 Mon Sep 17 00:00:00 2001
From: Damien Le Moal <dlemoal@kernel.org>
Date: Sat, 26 Aug 2023 09:43:39 +0900
Subject: ata: libata-scsi: Disable scsi device manage_system_start_stop

The introduction of a device link to create a consumer/supplier
relationship between the scsi device of an ATA device and the ATA port
of that ATA device fixes the ordering of system suspend and resume
operations. For suspend, the scsi device is suspended first and the ata
port after it. This is fine as this allows the synchronize cache and
START STOP UNIT commands issued by the scsi disk driver to be executed
before the ata port is disabled.

For resume operations, the ata port is resumed first, followed
by the scsi device. This allows having the request queue of the scsi
device to be unfrozen after the ata port resume is scheduled in EH,
thus avoiding to see new requests prematurely issued to the ATA device.
Since libata sets manage_system_start_stop to 1, the scsi disk resume
operation also results in issuing a START STOP UNIT command to the
device being resumed so that the device exits standby power mode.

However, restoring the ATA device to the active power mode must be
synchronized with libata EH processing of the port resume operation to
avoid either 1) seeing the start stop unit command being received too
early when the port is not yet resumed and ready to accept commands, or
after the port resume process issues commands such as IDENTIFY to
revalidate the device. In this last case, the risk is that the device
revalidation fails with timeout errors as the drive is still spun down.

Commit 0a8589055936 ("ata,scsi: do not issue START STOP UNIT on resume")
disabled issuing the START STOP UNIT command to avoid issues with it.
But this is incorrect as transitioning a device to the active power
mode from the standby power mode set on suspend requires a media access
command. The IDENTIFY, READ LOG and SET FEATURES commands executed in
libata EH context triggered by the ata port resume operation may thus
fail.

Fix these synchronization issues is by handling a device power mode
transitions for system suspend and resume directly in libata EH context,
without relying on the scsi disk driver management triggered with the
manage_system_start_stop flag.

To do this, the following libata helper functions are introduced:

1) ata_dev_power_set_standby():

This function issues a STANDBY IMMEDIATE command to transitiom a device
to the standby power mode. For HDDs, this spins down the disks. This
function applies only to ATA and ZAC devices and does nothing otherwise.
This function also does nothing for devices that have the
ATA_FLAG_NO_POWEROFF_SPINDOWN or ATA_FLAG_NO_HIBERNATE_SPINDOWN flag
set.

For suspend, call ata_dev_power_set_standby() in
ata_eh_handle_port_suspend() before the port is disabled and frozen.
ata_eh_unload() is also modified to transition all enabled devices to
the standby power mode when the system is shutdown or devices removed.

2) ata_dev_power_set_active() and

This function applies to ATA or ZAC devices and issues a VERIFY command
for 1 sector at LBA 0 to transition the device to the active power mode.
For HDDs, since this function will complete only once the disk spin up.
Its execution uses the same timeouts as for reset, to give the drive
enough time to complete spinup without triggering a command timeout.

For resume, call ata_dev_power_set_active() in
ata_eh_revalidate_and_attach() after the port has been enabled and
before any other command is issued to the device.

With these changes, the manage_system_start_stop and no_start_on_resume
scsi device flags do not need to be set in ata_scsi_dev_config(). The
flag manage_runtime_start_stop is still set to allow the sd driver to
spinup/spindown a disk through the sd runtime operations.

Fixes: 0a8589055936 ("ata,scsi: do not issue START STOP UNIT on resume")
Cc: stable@vger.kernel.org
Signed-off-by: Damien Le Moal <dlemoal@kernel.org>
Reviewed-by: Hannes Reinecke <hare@suse.de>
Tested-by: Geert Uytterhoeven <geert+renesas@glider.be>
Reviewed-by: Martin K. Petersen <martin.petersen@oracle.com>
---
 drivers/ata/libata-core.c | 90 +++++++++++++++++++++++++++++++++++++++++++++++
 drivers/ata/libata-eh.c   | 46 +++++++++++++++++++++++-
 drivers/ata/libata-scsi.c | 16 ++++-----
 drivers/ata/libata.h      |  2 ++
 include/linux/libata.h    |  6 ++--
 5 files changed, 148 insertions(+), 12 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/ata/libata-core.c b/drivers/ata/libata-core.c
index 8e35afe5e560..a0bc01606b30 100644
--- a/drivers/ata/libata-core.c
+++ b/drivers/ata/libata-core.c
@@ -1972,6 +1972,96 @@ retry:
 	return rc;
 }
 
+/**
+ *	ata_dev_power_set_standby - Set a device power mode to standby
+ *	@dev: target device
+ *
+ *	Issue a STANDBY IMMEDIATE command to set a device power mode to standby.
+ *	For an HDD device, this spins down the disks.
+ *
+ *	LOCKING:
+ *	Kernel thread context (may sleep).
+ */
+void ata_dev_power_set_standby(struct ata_device *dev)
+{
+	unsigned long ap_flags = dev->link->ap->flags;
+	struct ata_taskfile tf;
+	unsigned int err_mask;
+
+	/* Issue STANDBY IMMEDIATE command only if supported by the device */
+	if (dev->class != ATA_DEV_ATA && dev->class != ATA_DEV_ZAC)
+		return;
+
+	/*
+	 * Some odd clown BIOSes issue spindown on power off (ACPI S4 or S5)
+	 * causing some drives to spin up and down again. For these, do nothing
+	 * if we are being called on shutdown.
+	 */
+	if ((ap_flags & ATA_FLAG_NO_POWEROFF_SPINDOWN) &&
+	    system_state == SYSTEM_POWER_OFF)
+		return;
+
+	if ((ap_flags & ATA_FLAG_NO_HIBERNATE_SPINDOWN) &&
+	    system_entering_hibernation())
+		return;
+
+	ata_tf_init(dev, &tf);
+	tf.flags |= ATA_TFLAG_DEVICE | ATA_TFLAG_ISADDR;
+	tf.protocol = ATA_PROT_NODATA;
+	tf.command = ATA_CMD_STANDBYNOW1;
+
+	ata_dev_notice(dev, "Entering standby power mode\n");
+
+	err_mask = ata_exec_internal(dev, &tf, NULL, DMA_NONE, NULL, 0, 0);
+	if (err_mask)
+		ata_dev_err(dev, "STANDBY IMMEDIATE failed (err_mask=0x%x)\n",
+			    err_mask);
+}
+
+/**
+ *	ata_dev_power_set_active -  Set a device power mode to active
+ *	@dev: target device
+ *
+ *	Issue a VERIFY command to enter to ensure that the device is in the
+ *	active power mode. For a spun-down HDD (standby or idle power mode),
+ *	the VERIFY command will complete after the disk spins up.
+ *
+ *	LOCKING:
+ *	Kernel thread context (may sleep).
+ */
+void ata_dev_power_set_active(struct ata_device *dev)
+{
+	struct ata_taskfile tf;
+	unsigned int err_mask;
+
+	/*
+	 * Issue READ VERIFY SECTORS command for 1 sector at lba=0 only
+	 * if supported by the device.
+	 */
+	if (dev->class != ATA_DEV_ATA && dev->class != ATA_DEV_ZAC)
+		return;
+
+	ata_tf_init(dev, &tf);
+	tf.flags |= ATA_TFLAG_DEVICE | ATA_TFLAG_ISADDR;
+	tf.protocol = ATA_PROT_NODATA;
+	tf.command = ATA_CMD_VERIFY;
+	tf.nsect = 1;
+	if (dev->flags & ATA_DFLAG_LBA) {
+		tf.flags |= ATA_TFLAG_LBA;
+		tf.device |= ATA_LBA;
+	} else {
+		/* CHS */
+		tf.lbal = 0x1; /* sect */
+	}
+
+	ata_dev_notice(dev, "Entering active power mode\n");
+
+	err_mask = ata_exec_internal(dev, &tf, NULL, DMA_NONE, NULL, 0, 0);
+	if (err_mask)
+		ata_dev_err(dev, "VERIFY failed (err_mask=0x%x)\n",
+			    err_mask);
+}
+
 /**
  *	ata_read_log_page - read a specific log page
  *	@dev: target device
diff --git a/drivers/ata/libata-eh.c b/drivers/ata/libata-eh.c
index 4cf4f57e57b8..b1b2c276371e 100644
--- a/drivers/ata/libata-eh.c
+++ b/drivers/ata/libata-eh.c
@@ -147,6 +147,8 @@ ata_eh_cmd_timeout_table[ATA_EH_CMD_TIMEOUT_TABLE_SIZE] = {
 	  .timeouts = ata_eh_other_timeouts, },
 	{ .commands = CMDS(ATA_CMD_FLUSH, ATA_CMD_FLUSH_EXT),
 	  .timeouts = ata_eh_flush_timeouts },
+	{ .commands = CMDS(ATA_CMD_VERIFY),
+	  .timeouts = ata_eh_reset_timeouts },
 };
 #undef CMDS
 
@@ -498,7 +500,19 @@ static void ata_eh_unload(struct ata_port *ap)
 	struct ata_device *dev;
 	unsigned long flags;
 
-	/* Restore SControl IPM and SPD for the next driver and
+	/*
+	 * Unless we are restarting, transition all enabled devices to
+	 * standby power mode.
+	 */
+	if (system_state != SYSTEM_RESTART) {
+		ata_for_each_link(link, ap, PMP_FIRST) {
+			ata_for_each_dev(dev, link, ENABLED)
+				ata_dev_power_set_standby(dev);
+		}
+	}
+
+	/*
+	 * Restore SControl IPM and SPD for the next driver and
 	 * disable attached devices.
 	 */
 	ata_for_each_link(link, ap, PMP_FIRST) {
@@ -684,6 +698,10 @@ void ata_scsi_port_error_handler(struct Scsi_Host *host, struct ata_port *ap)
 			ehc->saved_xfer_mode[devno] = dev->xfer_mode;
 			if (ata_ncq_enabled(dev))
 				ehc->saved_ncq_enabled |= 1 << devno;
+
+			/* If we are resuming, wake up the device */
+			if (ap->pflags & ATA_PFLAG_RESUMING)
+				ehc->i.dev_action[devno] |= ATA_EH_SET_ACTIVE;
 		}
 	}
 
@@ -743,6 +761,8 @@ void ata_scsi_port_error_handler(struct Scsi_Host *host, struct ata_port *ap)
 	/* clean up */
 	spin_lock_irqsave(ap->lock, flags);
 
+	ap->pflags &= ~ATA_PFLAG_RESUMING;
+
 	if (ap->pflags & ATA_PFLAG_LOADING)
 		ap->pflags &= ~ATA_PFLAG_LOADING;
 	else if ((ap->pflags & ATA_PFLAG_SCSI_HOTPLUG) &&
@@ -1218,6 +1238,13 @@ void ata_eh_detach_dev(struct ata_device *dev)
 	struct ata_eh_context *ehc = &link->eh_context;
 	unsigned long flags;
 
+	/*
+	 * If the device is still enabled, transition it to standby power mode
+	 * (i.e. spin down HDDs).
+	 */
+	if (ata_dev_enabled(dev))
+		ata_dev_power_set_standby(dev);
+
 	ata_dev_disable(dev);
 
 	spin_lock_irqsave(ap->lock, flags);
@@ -3016,6 +3043,15 @@ static int ata_eh_revalidate_and_attach(struct ata_link *link,
 		if (ehc->i.flags & ATA_EHI_DID_RESET)
 			readid_flags |= ATA_READID_POSTRESET;
 
+		/*
+		 * When resuming, before executing any command, make sure to
+		 * transition the device to the active power mode.
+		 */
+		if ((action & ATA_EH_SET_ACTIVE) && ata_dev_enabled(dev)) {
+			ata_dev_power_set_active(dev);
+			ata_eh_done(link, dev, ATA_EH_SET_ACTIVE);
+		}
+
 		if ((action & ATA_EH_REVALIDATE) && ata_dev_enabled(dev)) {
 			WARN_ON(dev->class == ATA_DEV_PMP);
 
@@ -3989,6 +4025,7 @@ static void ata_eh_handle_port_suspend(struct ata_port *ap)
 	unsigned long flags;
 	int rc = 0;
 	struct ata_device *dev;
+	struct ata_link *link;
 
 	/* are we suspending? */
 	spin_lock_irqsave(ap->lock, flags);
@@ -4001,6 +4038,12 @@ static void ata_eh_handle_port_suspend(struct ata_port *ap)
 
 	WARN_ON(ap->pflags & ATA_PFLAG_SUSPENDED);
 
+	/* Set all devices attached to the port in standby mode */
+	ata_for_each_link(link, ap, HOST_FIRST) {
+		ata_for_each_dev(dev, link, ENABLED)
+			ata_dev_power_set_standby(dev);
+	}
+
 	/*
 	 * If we have a ZPODD attached, check its zero
 	 * power ready status before the port is frozen.
@@ -4083,6 +4126,7 @@ static void ata_eh_handle_port_resume(struct ata_port *ap)
 	/* update the flags */
 	spin_lock_irqsave(ap->lock, flags);
 	ap->pflags &= ~(ATA_PFLAG_PM_PENDING | ATA_PFLAG_SUSPENDED);
+	ap->pflags |= ATA_PFLAG_RESUMING;
 	spin_unlock_irqrestore(ap->lock, flags);
 }
 #endif /* CONFIG_PM */
diff --git a/drivers/ata/libata-scsi.c b/drivers/ata/libata-scsi.c
index 73428ad0c8d2..a0e58d22d222 100644
--- a/drivers/ata/libata-scsi.c
+++ b/drivers/ata/libata-scsi.c
@@ -1050,15 +1050,13 @@ int ata_scsi_dev_config(struct scsi_device *sdev, struct ata_device *dev)
 		}
 	} else {
 		sdev->sector_size = ata_id_logical_sector_size(dev->id);
+
 		/*
-		 * Stop the drive on suspend but do not issue START STOP UNIT
-		 * on resume as this is not necessary and may fail: the device
-		 * will be woken up by ata_port_pm_resume() with a port reset
-		 * and device revalidation.
+		 * Ask the sd driver to issue START STOP UNIT on runtime suspend
+		 * and resume only. For system level suspend/resume, devices
+		 * power state is handled directly by libata EH.
 		 */
-		sdev->manage_system_start_stop = true;
 		sdev->manage_runtime_start_stop = true;
-		sdev->no_start_on_resume = 1;
 	}
 
 	/*
@@ -1231,7 +1229,7 @@ static unsigned int ata_scsi_start_stop_xlat(struct ata_queued_cmd *qc)
 	}
 
 	if (cdb[4] & 0x1) {
-		tf->nsect = 1;	/* 1 sector, lba=0 */
+		tf->nsect = 1;  /* 1 sector, lba=0 */
 
 		if (qc->dev->flags & ATA_DFLAG_LBA) {
 			tf->flags |= ATA_TFLAG_LBA;
@@ -1247,7 +1245,7 @@ static unsigned int ata_scsi_start_stop_xlat(struct ata_queued_cmd *qc)
 			tf->lbah = 0x0; /* cyl high */
 		}
 
-		tf->command = ATA_CMD_VERIFY;	/* READ VERIFY */
+		tf->command = ATA_CMD_VERIFY;   /* READ VERIFY */
 	} else {
 		/* Some odd clown BIOSen issue spindown on power off (ACPI S4
 		 * or S5) causing some drives to spin up and down again.
@@ -1257,7 +1255,7 @@ static unsigned int ata_scsi_start_stop_xlat(struct ata_queued_cmd *qc)
 			goto skip;
 
 		if ((qc->ap->flags & ATA_FLAG_NO_HIBERNATE_SPINDOWN) &&
-		     system_entering_hibernation())
+		    system_entering_hibernation())
 			goto skip;
 
 		/* Issue ATA STANDBY IMMEDIATE command */
diff --git a/drivers/ata/libata.h b/drivers/ata/libata.h
index 6e7d352803bd..820299bd9d06 100644
--- a/drivers/ata/libata.h
+++ b/drivers/ata/libata.h
@@ -60,6 +60,8 @@ extern int ata_dev_reread_id(struct ata_device *dev, unsigned int readid_flags);
 extern int ata_dev_revalidate(struct ata_device *dev, unsigned int new_class,
 			      unsigned int readid_flags);
 extern int ata_dev_configure(struct ata_device *dev);
+extern void ata_dev_power_set_standby(struct ata_device *dev);
+extern void ata_dev_power_set_active(struct ata_device *dev);
 extern int sata_down_spd_limit(struct ata_link *link, u32 spd_limit);
 extern int ata_down_xfermask_limit(struct ata_device *dev, unsigned int sel);
 extern unsigned int ata_dev_set_feature(struct ata_device *dev,
diff --git a/include/linux/libata.h b/include/linux/libata.h
index 3ce1ab408114..2a7d2af0ed80 100644
--- a/include/linux/libata.h
+++ b/include/linux/libata.h
@@ -192,6 +192,7 @@ enum {
 	ATA_PFLAG_UNLOADING	= (1 << 9), /* driver is being unloaded */
 	ATA_PFLAG_UNLOADED	= (1 << 10), /* driver is unloaded */
 
+	ATA_PFLAG_RESUMING	= (1 << 16),  /* port is being resumed */
 	ATA_PFLAG_SUSPENDED	= (1 << 17), /* port is suspended (power) */
 	ATA_PFLAG_PM_PENDING	= (1 << 18), /* PM operation pending */
 	ATA_PFLAG_INIT_GTM_VALID = (1 << 19), /* initial gtm data valid */
@@ -318,9 +319,10 @@ enum {
 	ATA_EH_ENABLE_LINK	= (1 << 3),
 	ATA_EH_PARK		= (1 << 5), /* unload heads and stop I/O */
 	ATA_EH_GET_SUCCESS_SENSE = (1 << 6), /* Get sense data for successful cmd */
+	ATA_EH_SET_ACTIVE	= (1 << 7), /* Set a device to active power mode */
 
 	ATA_EH_PERDEV_MASK	= ATA_EH_REVALIDATE | ATA_EH_PARK |
-				  ATA_EH_GET_SUCCESS_SENSE,
+				  ATA_EH_GET_SUCCESS_SENSE | ATA_EH_SET_ACTIVE,
 	ATA_EH_ALL_ACTIONS	= ATA_EH_REVALIDATE | ATA_EH_RESET |
 				  ATA_EH_ENABLE_LINK,
 
@@ -357,7 +359,7 @@ enum {
 	/* This should match the actual table size of
 	 * ata_eh_cmd_timeout_table in libata-eh.c.
 	 */
-	ATA_EH_CMD_TIMEOUT_TABLE_SIZE = 7,
+	ATA_EH_CMD_TIMEOUT_TABLE_SIZE = 8,
 
 	/* Horkage types. May be set by libata or controller on drives
 	   (some horkage may be drive/controller pair dependent */
-- 
cgit v1.2.3


From 528ce6781726e022bc5dc84034360e6e8f1b89bd Mon Sep 17 00:00:00 2001
From: Ming Lei <ming.lei@redhat.com>
Date: Thu, 28 Sep 2023 20:43:24 +0800
Subject: io_uring: retain top 8bits of uring_cmd flags for kernel internal use

Retain top 8bits of uring_cmd flags for kernel internal use, so that we
can move IORING_URING_CMD_POLLED out of uapi header.

Reviewed-by: Gabriel Krisman Bertazi <krisman@suse.de>
Reviewed-by: Anuj Gupta <anuj20.g@samsung.com>
Signed-off-by: Ming Lei <ming.lei@redhat.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/io_uring.h      | 3 +++
 include/uapi/linux/io_uring.h | 5 ++---
 io_uring/io_uring.c           | 3 +++
 io_uring/uring_cmd.c          | 2 +-
 4 files changed, 9 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/io_uring.h b/include/linux/io_uring.h
index 106cdc55ff3b..ae08d6f66e62 100644
--- a/include/linux/io_uring.h
+++ b/include/linux/io_uring.h
@@ -22,6 +22,9 @@ enum io_uring_cmd_flags {
 	IO_URING_F_IOPOLL		= (1 << 10),
 };
 
+/* only top 8 bits of sqe->uring_cmd_flags for kernel internal use */
+#define IORING_URING_CMD_POLLED		(1U << 31)
+
 struct io_uring_cmd {
 	struct file	*file;
 	const struct io_uring_sqe *sqe;
diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
index 683ac2b74721..425f64eee44e 100644
--- a/include/uapi/linux/io_uring.h
+++ b/include/uapi/linux/io_uring.h
@@ -249,13 +249,12 @@ enum io_uring_op {
 };
 
 /*
- * sqe->uring_cmd_flags
+ * sqe->uring_cmd_flags		top 8bits aren't available for userspace
  * IORING_URING_CMD_FIXED	use registered buffer; pass this flag
  *				along with setting sqe->buf_index.
- * IORING_URING_CMD_POLLED	driver use only
  */
 #define IORING_URING_CMD_FIXED	(1U << 0)
-#define IORING_URING_CMD_POLLED	(1U << 31)
+#define IORING_URING_CMD_MASK	IORING_URING_CMD_FIXED
 
 
 /*
diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index 2dff4772bf14..cb6bd9907045 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -4669,6 +4669,9 @@ static int __init io_uring_init(void)
 
 	BUILD_BUG_ON(sizeof(atomic_t) != sizeof(u32));
 
+	/* top 8bits are for internal use */
+	BUILD_BUG_ON((IORING_URING_CMD_MASK & 0xff000000) != 0);
+
 	io_uring_optable_init();
 
 	/*
diff --git a/io_uring/uring_cmd.c b/io_uring/uring_cmd.c
index 537795fddc87..a0b0ec5473bf 100644
--- a/io_uring/uring_cmd.c
+++ b/io_uring/uring_cmd.c
@@ -91,7 +91,7 @@ int io_uring_cmd_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 		return -EINVAL;
 
 	ioucmd->flags = READ_ONCE(sqe->uring_cmd_flags);
-	if (ioucmd->flags & ~IORING_URING_CMD_FIXED)
+	if (ioucmd->flags & ~IORING_URING_CMD_MASK)
 		return -EINVAL;
 
 	if (ioucmd->flags & IORING_URING_CMD_FIXED) {
-- 
cgit v1.2.3


From 93b8cc60c37b9d17732b7a297e5dca29b50a990d Mon Sep 17 00:00:00 2001
From: Ming Lei <ming.lei@redhat.com>
Date: Thu, 28 Sep 2023 20:43:25 +0800
Subject: io_uring: cancelable uring_cmd

uring_cmd may never complete, such as ublk, in which uring cmd isn't
completed until one new block request is coming from ublk block device.

Add cancelable uring_cmd to provide mechanism to driver for cancelling
pending commands in its own way.

Add API of io_uring_cmd_mark_cancelable() for driver to mark one command as
cancelable, then io_uring will cancel this command in
io_uring_cancel_generic(). ->uring_cmd() callback is reused for canceling
command in driver's way, then driver gets notified with the cancelling
from io_uring.

Add API of io_uring_cmd_get_task() to help driver cancel handler
deal with the canceling.

Reviewed-by: Gabriel Krisman Bertazi <krisman@suse.de>
Suggested-by: Jens Axboe <axboe@kernel.dk>
Signed-off-by: Ming Lei <ming.lei@redhat.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/io_uring.h       | 15 ++++++++++++++
 include/linux/io_uring_types.h |  6 ++++++
 io_uring/io_uring.c            | 33 +++++++++++++++++++++++++++++
 io_uring/uring_cmd.c           | 47 ++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 101 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/io_uring.h b/include/linux/io_uring.h
index ae08d6f66e62..b4391e0a9bc8 100644
--- a/include/linux/io_uring.h
+++ b/include/linux/io_uring.h
@@ -20,9 +20,13 @@ enum io_uring_cmd_flags {
 	IO_URING_F_SQE128		= (1 << 8),
 	IO_URING_F_CQE32		= (1 << 9),
 	IO_URING_F_IOPOLL		= (1 << 10),
+
+	/* set when uring wants to cancel a previously issued command */
+	IO_URING_F_CANCEL		= (1 << 11),
 };
 
 /* only top 8 bits of sqe->uring_cmd_flags for kernel internal use */
+#define IORING_URING_CMD_CANCELABLE	(1U << 30)
 #define IORING_URING_CMD_POLLED		(1U << 31)
 
 struct io_uring_cmd {
@@ -85,6 +89,9 @@ static inline void io_uring_free(struct task_struct *tsk)
 		__io_uring_free(tsk);
 }
 int io_uring_cmd_sock(struct io_uring_cmd *cmd, unsigned int issue_flags);
+void io_uring_cmd_mark_cancelable(struct io_uring_cmd *cmd,
+		unsigned int issue_flags);
+struct task_struct *io_uring_cmd_get_task(struct io_uring_cmd *cmd);
 #else
 static inline int io_uring_cmd_import_fixed(u64 ubuf, unsigned long len, int rw,
 			      struct iov_iter *iter, void *ioucmd)
@@ -125,6 +132,14 @@ static inline int io_uring_cmd_sock(struct io_uring_cmd *cmd,
 {
 	return -EOPNOTSUPP;
 }
+static inline void io_uring_cmd_mark_cancelable(struct io_uring_cmd *cmd,
+		unsigned int issue_flags)
+{
+}
+static inline struct task_struct *io_uring_cmd_get_task(struct io_uring_cmd *cmd)
+{
+	return NULL;
+}
 #endif
 
 #endif
diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h
index fe1c5d4ec56c..e178461fa513 100644
--- a/include/linux/io_uring_types.h
+++ b/include/linux/io_uring_types.h
@@ -265,6 +265,12 @@ struct io_ring_ctx {
 		 */
 		struct io_wq_work_list	iopoll_list;
 		bool			poll_multi_queue;
+
+		/*
+		 * Any cancelable uring_cmd is added to this list in
+		 * ->uring_cmd() by io_uring_cmd_insert_cancelable()
+		 */
+		struct hlist_head	cancelable_uring_cmd;
 	} ____cacheline_aligned_in_smp;
 
 	struct {
diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index cb6bd9907045..08c9ea46bb95 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -352,6 +352,7 @@ static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
 	INIT_HLIST_HEAD(&ctx->waitid_list);
 	INIT_DELAYED_WORK(&ctx->fallback_work, io_fallback_req_func);
 	INIT_WQ_LIST(&ctx->submit_state.compl_reqs);
+	INIT_HLIST_HEAD(&ctx->cancelable_uring_cmd);
 	return ctx;
 err:
 	kfree(ctx->cancel_table.hbs);
@@ -3258,6 +3259,37 @@ static __cold bool io_uring_try_cancel_iowq(struct io_ring_ctx *ctx)
 	return ret;
 }
 
+static bool io_uring_try_cancel_uring_cmd(struct io_ring_ctx *ctx,
+		struct task_struct *task, bool cancel_all)
+{
+	struct hlist_node *tmp;
+	struct io_kiocb *req;
+	bool ret = false;
+
+	lockdep_assert_held(&ctx->uring_lock);
+
+	hlist_for_each_entry_safe(req, tmp, &ctx->cancelable_uring_cmd,
+			hash_node) {
+		struct io_uring_cmd *cmd = io_kiocb_to_cmd(req,
+				struct io_uring_cmd);
+		struct file *file = req->file;
+
+		if (!cancel_all && req->task != task)
+			continue;
+
+		if (cmd->flags & IORING_URING_CMD_CANCELABLE) {
+			/* ->sqe isn't available if no async data */
+			if (!req_has_async_data(req))
+				cmd->sqe = NULL;
+			file->f_op->uring_cmd(cmd, IO_URING_F_CANCEL);
+			ret = true;
+		}
+	}
+	io_submit_flush_completions(ctx);
+
+	return ret;
+}
+
 static __cold bool io_uring_try_cancel_requests(struct io_ring_ctx *ctx,
 						struct task_struct *task,
 						bool cancel_all)
@@ -3306,6 +3338,7 @@ static __cold bool io_uring_try_cancel_requests(struct io_ring_ctx *ctx,
 	mutex_lock(&ctx->uring_lock);
 	ret |= io_poll_remove_all(ctx, task, cancel_all);
 	ret |= io_waitid_remove_all(ctx, task, cancel_all);
+	ret |= io_uring_try_cancel_uring_cmd(ctx, task, cancel_all);
 	mutex_unlock(&ctx->uring_lock);
 	ret |= io_kill_timeouts(ctx, task, cancel_all);
 	if (task)
diff --git a/io_uring/uring_cmd.c b/io_uring/uring_cmd.c
index a0b0ec5473bf..00a5e5621a28 100644
--- a/io_uring/uring_cmd.c
+++ b/io_uring/uring_cmd.c
@@ -13,6 +13,51 @@
 #include "rsrc.h"
 #include "uring_cmd.h"
 
+static void io_uring_cmd_del_cancelable(struct io_uring_cmd *cmd,
+		unsigned int issue_flags)
+{
+	struct io_kiocb *req = cmd_to_io_kiocb(cmd);
+	struct io_ring_ctx *ctx = req->ctx;
+
+	if (!(cmd->flags & IORING_URING_CMD_CANCELABLE))
+		return;
+
+	cmd->flags &= ~IORING_URING_CMD_CANCELABLE;
+	io_ring_submit_lock(ctx, issue_flags);
+	hlist_del(&req->hash_node);
+	io_ring_submit_unlock(ctx, issue_flags);
+}
+
+/*
+ * Mark this command as concelable, then io_uring_try_cancel_uring_cmd()
+ * will try to cancel this issued command by sending ->uring_cmd() with
+ * issue_flags of IO_URING_F_CANCEL.
+ *
+ * The command is guaranteed to not be done when calling ->uring_cmd()
+ * with IO_URING_F_CANCEL, but it is driver's responsibility to deal
+ * with race between io_uring canceling and normal completion.
+ */
+void io_uring_cmd_mark_cancelable(struct io_uring_cmd *cmd,
+		unsigned int issue_flags)
+{
+	struct io_kiocb *req = cmd_to_io_kiocb(cmd);
+	struct io_ring_ctx *ctx = req->ctx;
+
+	if (!(cmd->flags & IORING_URING_CMD_CANCELABLE)) {
+		cmd->flags |= IORING_URING_CMD_CANCELABLE;
+		io_ring_submit_lock(ctx, issue_flags);
+		hlist_add_head(&req->hash_node, &ctx->cancelable_uring_cmd);
+		io_ring_submit_unlock(ctx, issue_flags);
+	}
+}
+EXPORT_SYMBOL_GPL(io_uring_cmd_mark_cancelable);
+
+struct task_struct *io_uring_cmd_get_task(struct io_uring_cmd *cmd)
+{
+	return cmd_to_io_kiocb(cmd)->task;
+}
+EXPORT_SYMBOL_GPL(io_uring_cmd_get_task);
+
 static void io_uring_cmd_work(struct io_kiocb *req, struct io_tw_state *ts)
 {
 	struct io_uring_cmd *ioucmd = io_kiocb_to_cmd(req, struct io_uring_cmd);
@@ -56,6 +101,8 @@ void io_uring_cmd_done(struct io_uring_cmd *ioucmd, ssize_t ret, ssize_t res2,
 {
 	struct io_kiocb *req = cmd_to_io_kiocb(ioucmd);
 
+	io_uring_cmd_del_cancelable(ioucmd, issue_flags);
+
 	if (ret < 0)
 		req_set_fail(req);
 
-- 
cgit v1.2.3


From a941b784b15ff65e1a3b6a259c6d6cf7fa0bb3c3 Mon Sep 17 00:00:00 2001
From: Nipun Gupta <nipun.gupta@amd.com>
Date: Fri, 15 Sep 2023 10:24:21 +0530
Subject: cdx: add support for bus mastering

Introduce cdx_set_master() and cdx_clear_master() APIs to support
enable and disable of bus mastering. Drivers need to use these APIs to
enable/disable DMAs from the CDX devices.

Signed-off-by: Nipun Gupta <nipun.gupta@amd.com>
Reviewed-by: Pieter Jansen van Vuuren <pieter.jansen-van-vuuren@amd.com>
Link: https://lore.kernel.org/r/20230915045423.31630-1-nipun.gupta@amd.com
Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
---
 drivers/cdx/cdx.c                       | 32 ++++++++++++++++++
 drivers/cdx/controller/cdx_controller.c |  4 +++
 drivers/cdx/controller/mcdi_functions.c | 58 +++++++++++++++++++++++++++++++++
 drivers/cdx/controller/mcdi_functions.h | 13 ++++++++
 include/linux/cdx/cdx_bus.h             | 18 ++++++++++
 5 files changed, 125 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/cdx/cdx.c b/drivers/cdx/cdx.c
index d2cad4c670a0..9efb7584f952 100644
--- a/drivers/cdx/cdx.c
+++ b/drivers/cdx/cdx.c
@@ -182,6 +182,38 @@ cdx_match_id(const struct cdx_device_id *ids, struct cdx_device *dev)
 	return NULL;
 }
 
+int cdx_set_master(struct cdx_device *cdx_dev)
+{
+	struct cdx_controller *cdx = cdx_dev->cdx;
+	struct cdx_device_config dev_config;
+	int ret = -EOPNOTSUPP;
+
+	dev_config.type = CDX_DEV_BUS_MASTER_CONF;
+	dev_config.bus_master_enable = true;
+	if (cdx->ops->dev_configure)
+		ret = cdx->ops->dev_configure(cdx, cdx_dev->bus_num,
+					      cdx_dev->dev_num, &dev_config);
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(cdx_set_master);
+
+int cdx_clear_master(struct cdx_device *cdx_dev)
+{
+	struct cdx_controller *cdx = cdx_dev->cdx;
+	struct cdx_device_config dev_config;
+	int ret = -EOPNOTSUPP;
+
+	dev_config.type = CDX_DEV_BUS_MASTER_CONF;
+	dev_config.bus_master_enable = false;
+	if (cdx->ops->dev_configure)
+		ret = cdx->ops->dev_configure(cdx, cdx_dev->bus_num,
+					      cdx_dev->dev_num, &dev_config);
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(cdx_clear_master);
+
 /**
  * cdx_bus_match - device to driver matching callback
  * @dev: the cdx device to match against
diff --git a/drivers/cdx/controller/cdx_controller.c b/drivers/cdx/controller/cdx_controller.c
index bb4ae7970e21..7828dac8edb1 100644
--- a/drivers/cdx/controller/cdx_controller.c
+++ b/drivers/cdx/controller/cdx_controller.c
@@ -56,6 +56,10 @@ static int cdx_configure_device(struct cdx_controller *cdx,
 	case CDX_DEV_RESET_CONF:
 		ret = cdx_mcdi_reset_device(cdx->priv, bus_num, dev_num);
 		break;
+	case CDX_DEV_BUS_MASTER_CONF:
+		ret = cdx_mcdi_bus_master_enable(cdx->priv, bus_num, dev_num,
+						 dev_config->bus_master_enable);
+		break;
 	default:
 		ret = -EINVAL;
 	}
diff --git a/drivers/cdx/controller/mcdi_functions.c b/drivers/cdx/controller/mcdi_functions.c
index 0158f26533dd..fc82435d5dea 100644
--- a/drivers/cdx/controller/mcdi_functions.c
+++ b/drivers/cdx/controller/mcdi_functions.c
@@ -137,3 +137,61 @@ int cdx_mcdi_reset_device(struct cdx_mcdi *cdx, u8 bus_num, u8 dev_num)
 
 	return ret;
 }
+
+static int cdx_mcdi_ctrl_flag_get(struct cdx_mcdi *cdx, u8 bus_num,
+				  u8 dev_num, u32 *flags)
+{
+	MCDI_DECLARE_BUF(inbuf, MC_CMD_CDX_DEVICE_CONTROL_GET_IN_LEN);
+	MCDI_DECLARE_BUF(outbuf, MC_CMD_CDX_DEVICE_CONTROL_GET_OUT_LEN);
+	size_t outlen;
+	int ret;
+
+	MCDI_SET_DWORD(inbuf, CDX_DEVICE_CONTROL_GET_IN_BUS, bus_num);
+	MCDI_SET_DWORD(inbuf, CDX_DEVICE_CONTROL_GET_IN_DEVICE, dev_num);
+	ret = cdx_mcdi_rpc(cdx, MC_CMD_CDX_DEVICE_CONTROL_GET, inbuf,
+			   sizeof(inbuf), outbuf, sizeof(outbuf), &outlen);
+	if (ret)
+		return ret;
+
+	if (outlen != MC_CMD_CDX_DEVICE_CONTROL_GET_OUT_LEN)
+		return -EIO;
+
+	*flags = MCDI_DWORD(outbuf, CDX_DEVICE_CONTROL_GET_OUT_FLAGS);
+
+	return 0;
+}
+
+static int cdx_mcdi_ctrl_flag_set(struct cdx_mcdi *cdx, u8 bus_num,
+				  u8 dev_num, bool enable, int bit_pos)
+{
+	MCDI_DECLARE_BUF(inbuf, MC_CMD_CDX_DEVICE_CONTROL_SET_IN_LEN);
+	u32 flags;
+	int ret;
+
+	/*
+	 * Get flags and then set/reset bit at bit_pos according to
+	 * the input params.
+	 */
+	ret = cdx_mcdi_ctrl_flag_get(cdx, bus_num, dev_num, &flags);
+	if (ret)
+		return ret;
+
+	flags = flags & (u32)(~(BIT(bit_pos)));
+	if (enable)
+		flags |= (1 << bit_pos);
+
+	MCDI_SET_DWORD(inbuf, CDX_DEVICE_CONTROL_SET_IN_BUS, bus_num);
+	MCDI_SET_DWORD(inbuf, CDX_DEVICE_CONTROL_SET_IN_DEVICE, dev_num);
+	MCDI_SET_DWORD(inbuf, CDX_DEVICE_CONTROL_SET_IN_FLAGS, flags);
+	ret = cdx_mcdi_rpc(cdx, MC_CMD_CDX_DEVICE_CONTROL_SET, inbuf,
+			   sizeof(inbuf), NULL, 0, NULL);
+
+	return ret;
+}
+
+int cdx_mcdi_bus_master_enable(struct cdx_mcdi *cdx, u8 bus_num,
+			       u8 dev_num, bool enable)
+{
+	return cdx_mcdi_ctrl_flag_set(cdx, bus_num, dev_num, enable,
+			MC_CMD_CDX_DEVICE_CONTROL_SET_IN_BUS_MASTER_ENABLE_LBN);
+}
diff --git a/drivers/cdx/controller/mcdi_functions.h b/drivers/cdx/controller/mcdi_functions.h
index 7440ace5539a..a448d6581eb4 100644
--- a/drivers/cdx/controller/mcdi_functions.h
+++ b/drivers/cdx/controller/mcdi_functions.h
@@ -58,4 +58,17 @@ int cdx_mcdi_get_dev_config(struct cdx_mcdi *cdx,
 int cdx_mcdi_reset_device(struct cdx_mcdi *cdx,
 			  u8 bus_num, u8 dev_num);
 
+/**
+ * cdx_mcdi_bus_master_enable - Set/Reset bus mastering for cdx device
+ *				represented by bus_num:dev_num
+ * @cdx: pointer to MCDI interface.
+ * @bus_num: Bus number.
+ * @dev_num: Device number.
+ * @enable: Enable bus mastering if set, disable otherwise.
+ *
+ * Return: 0 on success, <0 on failure
+ */
+int cdx_mcdi_bus_master_enable(struct cdx_mcdi *cdx, u8 bus_num,
+			       u8 dev_num, bool enable);
+
 #endif /* CDX_MCDI_FUNCTIONS_H */
diff --git a/include/linux/cdx/cdx_bus.h b/include/linux/cdx/cdx_bus.h
index bead71b7bc73..8320ec3b9e37 100644
--- a/include/linux/cdx/cdx_bus.h
+++ b/include/linux/cdx/cdx_bus.h
@@ -21,11 +21,13 @@
 struct cdx_controller;
 
 enum {
+	CDX_DEV_BUS_MASTER_CONF,
 	CDX_DEV_RESET_CONF,
 };
 
 struct cdx_device_config {
 	u8 type;
+	bool bus_master_enable;
 };
 
 typedef int (*cdx_scan_cb)(struct cdx_controller *cdx);
@@ -170,4 +172,20 @@ extern struct bus_type cdx_bus_type;
  */
 int cdx_dev_reset(struct device *dev);
 
+/**
+ * cdx_set_master - enables bus-mastering for CDX device
+ * @cdx_dev: the CDX device to enable
+ *
+ * Return: 0 for success, -errno on failure
+ */
+int cdx_set_master(struct cdx_device *cdx_dev);
+
+/**
+ * cdx_clear_master - disables bus-mastering for CDX device
+ * @cdx_dev: the CDX device to disable
+ *
+ * Return: 0 for success, -errno on failure
+ */
+int cdx_clear_master(struct cdx_device *cdx_dev);
+
 #endif /* _CDX_BUS_H_ */
-- 
cgit v1.2.3


From d427da2323b093a65d8317783e76ab8fad2e2ef0 Mon Sep 17 00:00:00 2001
From: Sui Jingfeng <suijingfeng@loongson.cn>
Date: Fri, 25 Aug 2023 14:27:10 +0800
Subject: PCI: Add pci_get_base_class() helper

There is no function to get all PCI devices in a system by matching
against the base class code only, ignoring the sub-class code and
the programming interface.  Add pci_get_base_class() to suit the
need.

For example, if a driver wants to process all PCI display devices in
a system, it can do so like this:

  pdev = NULL;
  while ((pdev = pci_get_base_class(PCI_BASE_CLASS_DISPLAY, pdev))) {
    do_something_for_pci_display_device(pdev);
  }

Link: https://lore.kernel.org/r/20230825062714.6325-2-sui.jingfeng@linux.dev
Signed-off-by: Sui Jingfeng <suijingfeng@loongson.cn>
[bhelgaas: reword commit log]
Signed-off-by: Bjorn Helgaas <bhelgaas@google.com>
Reviewed-by: Alex Deucher <alexander.deucher@amd.com>
---
 drivers/pci/search.c | 31 +++++++++++++++++++++++++++++++
 include/linux/pci.h  |  5 +++++
 2 files changed, 36 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/pci/search.c b/drivers/pci/search.c
index b4c138a6ec02..53840634fbfc 100644
--- a/drivers/pci/search.c
+++ b/drivers/pci/search.c
@@ -363,6 +363,37 @@ struct pci_dev *pci_get_class(unsigned int class, struct pci_dev *from)
 }
 EXPORT_SYMBOL(pci_get_class);
 
+/**
+ * pci_get_base_class - searching for a PCI device by matching against the base class code only
+ * @class: search for a PCI device with this base class code
+ * @from: Previous PCI device found in search, or %NULL for new search.
+ *
+ * Iterates through the list of known PCI devices. If a PCI device is found
+ * with a matching base class code, the reference count to the device is
+ * incremented. See pci_match_one_device() to figure out how does this works.
+ * A new search is initiated by passing %NULL as the @from argument.
+ * Otherwise if @from is not %NULL, searches continue from next device on the
+ * global list. The reference count for @from is always decremented if it is
+ * not %NULL.
+ *
+ * Returns:
+ * A pointer to a matched PCI device, %NULL Otherwise.
+ */
+struct pci_dev *pci_get_base_class(unsigned int class, struct pci_dev *from)
+{
+	struct pci_device_id id = {
+		.vendor = PCI_ANY_ID,
+		.device = PCI_ANY_ID,
+		.subvendor = PCI_ANY_ID,
+		.subdevice = PCI_ANY_ID,
+		.class_mask = 0xFF0000,
+		.class = class << 16,
+	};
+
+	return pci_get_dev_by_id(&id, from);
+}
+EXPORT_SYMBOL(pci_get_base_class);
+
 /**
  * pci_dev_present - Returns 1 if device matching the device list is present, 0 if not.
  * @ids: A pointer to a null terminated list of struct pci_device_id structures
diff --git a/include/linux/pci.h b/include/linux/pci.h
index 8c7c2c3c6c65..40ac1288a2cc 100644
--- a/include/linux/pci.h
+++ b/include/linux/pci.h
@@ -1181,6 +1181,8 @@ struct pci_dev *pci_get_slot(struct pci_bus *bus, unsigned int devfn);
 struct pci_dev *pci_get_domain_bus_and_slot(int domain, unsigned int bus,
 					    unsigned int devfn);
 struct pci_dev *pci_get_class(unsigned int class, struct pci_dev *from);
+struct pci_dev *pci_get_base_class(unsigned int class, struct pci_dev *from);
+
 int pci_dev_present(const struct pci_device_id *ids);
 
 int pci_bus_read_config_byte(struct pci_bus *bus, unsigned int devfn,
@@ -1924,6 +1926,9 @@ static inline struct pci_dev *pci_get_class(unsigned int class,
 					    struct pci_dev *from)
 { return NULL; }
 
+static inline struct pci_dev *pci_get_base_class(unsigned int class,
+						 struct pci_dev *from)
+{ return NULL; }
 
 static inline int pci_dev_present(const struct pci_device_id *ids)
 { return 0; }
-- 
cgit v1.2.3


From 194bb58c6090e39bd7d9b9c888a079213628e1f6 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Thu, 8 Jun 2023 11:57:40 -0600
Subject: io_uring: add support for futex wake and wait

Add support for FUTEX_WAKE/WAIT primitives.

IORING_OP_FUTEX_WAKE is mix of FUTEX_WAKE and FUTEX_WAKE_BITSET, as
it does support passing in a bitset.

Similary, IORING_OP_FUTEX_WAIT is a mix of FUTEX_WAIT and
FUTEX_WAIT_BITSET.

For both of them, they are using the futex2 interface.

FUTEX_WAKE is straight forward, as those can always be done directly from
the io_uring submission without needing async handling. For FUTEX_WAIT,
things are a bit more complicated. If the futex isn't ready, then we
rely on a callback via futex_queue->wake() when someone wakes up the
futex. From that calback, we queue up task_work with the original task,
which will post a CQE and wake it, if necessary.

Cancelations are supported, both from the application point-of-view,
but also to be able to cancel pending waits if the ring exits before
all events have occurred. The return value of futex_unqueue() is used
to gate who wins the potential race between cancelation and futex
wakeups. Whomever gets a 'ret == 1' return from that claims ownership
of the io_uring futex request.

This is just the barebones wait/wake support. PI or REQUEUE support is
not added at this point, unclear if we might look into that later.

Likewise, explicit timeouts are not supported either. It is expected
that users that need timeouts would do so via the usual io_uring
mechanism to do that using linked timeouts.

The SQE format is as follows:

`addr`		Address of futex
`fd`		futex2(2) FUTEX2_* flags
`futex_flags`	io_uring specific command flags. None valid now.
`addr2`		Value of futex
`addr3`		Mask to wake/wait

Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/io_uring_types.h |   5 +
 include/uapi/linux/io_uring.h  |   3 +
 io_uring/Makefile              |   1 +
 io_uring/cancel.c              |   5 +
 io_uring/cancel.h              |   4 +
 io_uring/futex.c               | 235 +++++++++++++++++++++++++++++++++++++++++
 io_uring/futex.h               |  34 ++++++
 io_uring/io_uring.c            |   7 ++
 io_uring/opdef.c               |  23 ++++
 9 files changed, 317 insertions(+)
 create mode 100644 io_uring/futex.c
 create mode 100644 io_uring/futex.h

(limited to 'include/linux')

diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h
index e178461fa513..990984614fca 100644
--- a/include/linux/io_uring_types.h
+++ b/include/linux/io_uring_types.h
@@ -321,6 +321,11 @@ struct io_ring_ctx {
 
 	struct hlist_head	waitid_list;
 
+#ifdef CONFIG_FUTEX
+	struct hlist_head	futex_list;
+	struct io_alloc_cache	futex_cache;
+#endif
+
 	const struct cred	*sq_creds;	/* cred used for __io_sq_thread() */
 	struct io_sq_data	*sq_data;	/* if using sq thread polling */
 
diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
index 425f64eee44e..04f9fba38d4b 100644
--- a/include/uapi/linux/io_uring.h
+++ b/include/uapi/linux/io_uring.h
@@ -66,6 +66,7 @@ struct io_uring_sqe {
 		__u32		msg_ring_flags;
 		__u32		uring_cmd_flags;
 		__u32		waitid_flags;
+		__u32		futex_flags;
 	};
 	__u64	user_data;	/* data to be passed back at completion time */
 	/* pack this to avoid bogus arm OABI complaints */
@@ -243,6 +244,8 @@ enum io_uring_op {
 	IORING_OP_SENDMSG_ZC,
 	IORING_OP_READ_MULTISHOT,
 	IORING_OP_WAITID,
+	IORING_OP_FUTEX_WAIT,
+	IORING_OP_FUTEX_WAKE,
 
 	/* this goes last, obviously */
 	IORING_OP_LAST,
diff --git a/io_uring/Makefile b/io_uring/Makefile
index 7bd64e442567..e5be47e4fc3b 100644
--- a/io_uring/Makefile
+++ b/io_uring/Makefile
@@ -10,3 +10,4 @@ obj-$(CONFIG_IO_URING)		+= io_uring.o xattr.o nop.o fs.o splice.o \
 					cancel.o kbuf.o rsrc.o rw.o opdef.o \
 					notif.o waitid.o
 obj-$(CONFIG_IO_WQ)		+= io-wq.o
+obj-$(CONFIG_FUTEX)		+= futex.o
diff --git a/io_uring/cancel.c b/io_uring/cancel.c
index eb77a51c5a79..3c19cccb1aec 100644
--- a/io_uring/cancel.c
+++ b/io_uring/cancel.c
@@ -16,6 +16,7 @@
 #include "poll.h"
 #include "timeout.h"
 #include "waitid.h"
+#include "futex.h"
 #include "cancel.h"
 
 struct io_cancel {
@@ -124,6 +125,10 @@ int io_try_cancel(struct io_uring_task *tctx, struct io_cancel_data *cd,
 	if (ret != -ENOENT)
 		return ret;
 
+	ret = io_futex_cancel(ctx, cd, issue_flags);
+	if (ret != -ENOENT)
+		return ret;
+
 	spin_lock(&ctx->completion_lock);
 	if (!(cd->flags & IORING_ASYNC_CANCEL_FD))
 		ret = io_timeout_cancel(ctx, cd);
diff --git a/io_uring/cancel.h b/io_uring/cancel.h
index fc98622e6166..c0a8e7c520b6 100644
--- a/io_uring/cancel.h
+++ b/io_uring/cancel.h
@@ -1,4 +1,6 @@
 // SPDX-License-Identifier: GPL-2.0
+#ifndef IORING_CANCEL_H
+#define IORING_CANCEL_H
 
 #include <linux/io_uring_types.h>
 
@@ -22,3 +24,5 @@ void init_hash_table(struct io_hash_table *table, unsigned size);
 
 int io_sync_cancel(struct io_ring_ctx *ctx, void __user *arg);
 bool io_cancel_req_match(struct io_kiocb *req, struct io_cancel_data *cd);
+
+#endif
diff --git a/io_uring/futex.c b/io_uring/futex.c
new file mode 100644
index 000000000000..eb4406ac46fb
--- /dev/null
+++ b/io_uring/futex.c
@@ -0,0 +1,235 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/kernel.h>
+#include <linux/errno.h>
+#include <linux/fs.h>
+#include <linux/file.h>
+#include <linux/io_uring.h>
+
+#include <uapi/linux/io_uring.h>
+
+#include "../kernel/futex/futex.h"
+#include "io_uring.h"
+#include "rsrc.h"
+#include "futex.h"
+
+struct io_futex {
+	struct file	*file;
+	u32 __user	*uaddr;
+	unsigned long	futex_val;
+	unsigned long	futex_mask;
+	u32		futex_flags;
+};
+
+struct io_futex_data {
+	union {
+		struct futex_q		q;
+		struct io_cache_entry	cache;
+	};
+	struct io_kiocb	*req;
+};
+
+void io_futex_cache_init(struct io_ring_ctx *ctx)
+{
+	io_alloc_cache_init(&ctx->futex_cache, IO_NODE_ALLOC_CACHE_MAX,
+				sizeof(struct io_futex_data));
+}
+
+static void io_futex_cache_entry_free(struct io_cache_entry *entry)
+{
+	kfree(container_of(entry, struct io_futex_data, cache));
+}
+
+void io_futex_cache_free(struct io_ring_ctx *ctx)
+{
+	io_alloc_cache_free(&ctx->futex_cache, io_futex_cache_entry_free);
+}
+
+static void io_futex_complete(struct io_kiocb *req, struct io_tw_state *ts)
+{
+	struct io_futex_data *ifd = req->async_data;
+	struct io_ring_ctx *ctx = req->ctx;
+
+	io_tw_lock(ctx, ts);
+	if (!io_alloc_cache_put(&ctx->futex_cache, &ifd->cache))
+		kfree(ifd);
+	req->async_data = NULL;
+	hlist_del_init(&req->hash_node);
+	io_req_task_complete(req, ts);
+}
+
+static bool __io_futex_cancel(struct io_ring_ctx *ctx, struct io_kiocb *req)
+{
+	struct io_futex_data *ifd = req->async_data;
+
+	/* futex wake already done or in progress */
+	if (!futex_unqueue(&ifd->q))
+		return false;
+
+	hlist_del_init(&req->hash_node);
+	io_req_set_res(req, -ECANCELED, 0);
+	req->io_task_work.func = io_futex_complete;
+	io_req_task_work_add(req);
+	return true;
+}
+
+int io_futex_cancel(struct io_ring_ctx *ctx, struct io_cancel_data *cd,
+		    unsigned int issue_flags)
+{
+	struct hlist_node *tmp;
+	struct io_kiocb *req;
+	int nr = 0;
+
+	if (cd->flags & (IORING_ASYNC_CANCEL_FD|IORING_ASYNC_CANCEL_FD_FIXED))
+		return -ENOENT;
+
+	io_ring_submit_lock(ctx, issue_flags);
+	hlist_for_each_entry_safe(req, tmp, &ctx->futex_list, hash_node) {
+		if (req->cqe.user_data != cd->data &&
+		    !(cd->flags & IORING_ASYNC_CANCEL_ANY))
+			continue;
+		if (__io_futex_cancel(ctx, req))
+			nr++;
+		if (!(cd->flags & IORING_ASYNC_CANCEL_ALL))
+			break;
+	}
+	io_ring_submit_unlock(ctx, issue_flags);
+
+	if (nr)
+		return nr;
+
+	return -ENOENT;
+}
+
+bool io_futex_remove_all(struct io_ring_ctx *ctx, struct task_struct *task,
+			 bool cancel_all)
+{
+	struct hlist_node *tmp;
+	struct io_kiocb *req;
+	bool found = false;
+
+	lockdep_assert_held(&ctx->uring_lock);
+
+	hlist_for_each_entry_safe(req, tmp, &ctx->futex_list, hash_node) {
+		if (!io_match_task_safe(req, task, cancel_all))
+			continue;
+		__io_futex_cancel(ctx, req);
+		found = true;
+	}
+
+	return found;
+}
+
+int io_futex_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
+{
+	struct io_futex *iof = io_kiocb_to_cmd(req, struct io_futex);
+	u32 flags;
+
+	if (unlikely(sqe->len || sqe->futex_flags || sqe->buf_index ||
+		     sqe->file_index))
+		return -EINVAL;
+
+	iof->uaddr = u64_to_user_ptr(READ_ONCE(sqe->addr));
+	iof->futex_val = READ_ONCE(sqe->addr2);
+	iof->futex_mask = READ_ONCE(sqe->addr3);
+	flags = READ_ONCE(sqe->fd);
+
+	if (flags & ~FUTEX2_VALID_MASK)
+		return -EINVAL;
+
+	iof->futex_flags = futex2_to_flags(flags);
+	if (!futex_flags_valid(iof->futex_flags))
+		return -EINVAL;
+
+	if (!futex_validate_input(iof->futex_flags, iof->futex_val) ||
+	    !futex_validate_input(iof->futex_flags, iof->futex_mask))
+		return -EINVAL;
+
+	return 0;
+}
+
+static void io_futex_wake_fn(struct wake_q_head *wake_q, struct futex_q *q)
+{
+	struct io_futex_data *ifd = container_of(q, struct io_futex_data, q);
+	struct io_kiocb *req = ifd->req;
+
+	if (unlikely(!__futex_wake_mark(q)))
+		return;
+
+	io_req_set_res(req, 0, 0);
+	req->io_task_work.func = io_futex_complete;
+	io_req_task_work_add(req);
+}
+
+static struct io_futex_data *io_alloc_ifd(struct io_ring_ctx *ctx)
+{
+	struct io_cache_entry *entry;
+
+	entry = io_alloc_cache_get(&ctx->futex_cache);
+	if (entry)
+		return container_of(entry, struct io_futex_data, cache);
+
+	return kmalloc(sizeof(struct io_futex_data), GFP_NOWAIT);
+}
+
+int io_futex_wait(struct io_kiocb *req, unsigned int issue_flags)
+{
+	struct io_futex *iof = io_kiocb_to_cmd(req, struct io_futex);
+	struct io_ring_ctx *ctx = req->ctx;
+	struct io_futex_data *ifd = NULL;
+	struct futex_hash_bucket *hb;
+	int ret;
+
+	if (!iof->futex_mask) {
+		ret = -EINVAL;
+		goto done;
+	}
+
+	io_ring_submit_lock(ctx, issue_flags);
+	ifd = io_alloc_ifd(ctx);
+	if (!ifd) {
+		ret = -ENOMEM;
+		goto done_unlock;
+	}
+
+	req->async_data = ifd;
+	ifd->q = futex_q_init;
+	ifd->q.bitset = iof->futex_mask;
+	ifd->q.wake = io_futex_wake_fn;
+	ifd->req = req;
+
+	ret = futex_wait_setup(iof->uaddr, iof->futex_val, iof->futex_flags,
+			       &ifd->q, &hb);
+	if (!ret) {
+		hlist_add_head(&req->hash_node, &ctx->futex_list);
+		io_ring_submit_unlock(ctx, issue_flags);
+
+		futex_queue(&ifd->q, hb);
+		return IOU_ISSUE_SKIP_COMPLETE;
+	}
+
+done_unlock:
+	io_ring_submit_unlock(ctx, issue_flags);
+done:
+	if (ret < 0)
+		req_set_fail(req);
+	io_req_set_res(req, ret, 0);
+	kfree(ifd);
+	return IOU_OK;
+}
+
+int io_futex_wake(struct io_kiocb *req, unsigned int issue_flags)
+{
+	struct io_futex *iof = io_kiocb_to_cmd(req, struct io_futex);
+	int ret;
+
+	/*
+	 * Strict flags - ensure that waking 0 futexes yields a 0 result.
+	 * See commit 43adf8449510 ("futex: FLAGS_STRICT") for details.
+	 */
+	ret = futex_wake(iof->uaddr, FLAGS_STRICT | iof->futex_flags,
+			 iof->futex_val, iof->futex_mask);
+	if (ret < 0)
+		req_set_fail(req);
+	io_req_set_res(req, ret, 0);
+	return IOU_OK;
+}
diff --git a/io_uring/futex.h b/io_uring/futex.h
new file mode 100644
index 000000000000..ddc9e0d73c52
--- /dev/null
+++ b/io_uring/futex.h
@@ -0,0 +1,34 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include "cancel.h"
+
+int io_futex_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe);
+int io_futex_wait(struct io_kiocb *req, unsigned int issue_flags);
+int io_futex_wake(struct io_kiocb *req, unsigned int issue_flags);
+
+#if defined(CONFIG_FUTEX)
+int io_futex_cancel(struct io_ring_ctx *ctx, struct io_cancel_data *cd,
+		    unsigned int issue_flags);
+bool io_futex_remove_all(struct io_ring_ctx *ctx, struct task_struct *task,
+			 bool cancel_all);
+void io_futex_cache_init(struct io_ring_ctx *ctx);
+void io_futex_cache_free(struct io_ring_ctx *ctx);
+#else
+static inline int io_futex_cancel(struct io_ring_ctx *ctx,
+				  struct io_cancel_data *cd,
+				  unsigned int issue_flags)
+{
+	return 0;
+}
+static inline bool io_futex_remove_all(struct io_ring_ctx *ctx,
+				       struct task_struct *task, bool cancel_all)
+{
+	return false;
+}
+static inline void io_futex_cache_init(struct io_ring_ctx *ctx)
+{
+}
+static inline void io_futex_cache_free(struct io_ring_ctx *ctx)
+{
+}
+#endif
diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index 08c9ea46bb95..3c1c111d02cb 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -93,6 +93,7 @@
 #include "net.h"
 #include "notif.h"
 #include "waitid.h"
+#include "futex.h"
 
 #include "timeout.h"
 #include "poll.h"
@@ -330,6 +331,7 @@ static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
 			    sizeof(struct async_poll));
 	io_alloc_cache_init(&ctx->netmsg_cache, IO_ALLOC_CACHE_MAX,
 			    sizeof(struct io_async_msghdr));
+	io_futex_cache_init(ctx);
 	init_completion(&ctx->ref_comp);
 	xa_init_flags(&ctx->personalities, XA_FLAGS_ALLOC1);
 	mutex_init(&ctx->uring_lock);
@@ -350,6 +352,9 @@ static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
 	ctx->submit_state.free_list.next = NULL;
 	INIT_WQ_LIST(&ctx->locked_free_list);
 	INIT_HLIST_HEAD(&ctx->waitid_list);
+#ifdef CONFIG_FUTEX
+	INIT_HLIST_HEAD(&ctx->futex_list);
+#endif
 	INIT_DELAYED_WORK(&ctx->fallback_work, io_fallback_req_func);
 	INIT_WQ_LIST(&ctx->submit_state.compl_reqs);
 	INIT_HLIST_HEAD(&ctx->cancelable_uring_cmd);
@@ -2895,6 +2900,7 @@ static __cold void io_ring_ctx_free(struct io_ring_ctx *ctx)
 	io_eventfd_unregister(ctx);
 	io_alloc_cache_free(&ctx->apoll_cache, io_apoll_cache_free);
 	io_alloc_cache_free(&ctx->netmsg_cache, io_netmsg_cache_free);
+	io_futex_cache_free(ctx);
 	io_destroy_buffers(ctx);
 	mutex_unlock(&ctx->uring_lock);
 	if (ctx->sq_creds)
@@ -3338,6 +3344,7 @@ static __cold bool io_uring_try_cancel_requests(struct io_ring_ctx *ctx,
 	mutex_lock(&ctx->uring_lock);
 	ret |= io_poll_remove_all(ctx, task, cancel_all);
 	ret |= io_waitid_remove_all(ctx, task, cancel_all);
+	ret |= io_futex_remove_all(ctx, task, cancel_all);
 	ret |= io_uring_try_cancel_uring_cmd(ctx, task, cancel_all);
 	mutex_unlock(&ctx->uring_lock);
 	ret |= io_kill_timeouts(ctx, task, cancel_all);
diff --git a/io_uring/opdef.c b/io_uring/opdef.c
index aadcbf7136b0..31a3a421e94d 100644
--- a/io_uring/opdef.c
+++ b/io_uring/opdef.c
@@ -34,6 +34,7 @@
 #include "cancel.h"
 #include "rw.h"
 #include "waitid.h"
+#include "futex.h"
 
 static int io_no_issue(struct io_kiocb *req, unsigned int issue_flags)
 {
@@ -444,6 +445,22 @@ const struct io_issue_def io_issue_defs[] = {
 		.prep			= io_waitid_prep,
 		.issue			= io_waitid,
 	},
+	[IORING_OP_FUTEX_WAIT] = {
+#if defined(CONFIG_FUTEX)
+		.prep			= io_futex_prep,
+		.issue			= io_futex_wait,
+#else
+		.prep			= io_eopnotsupp_prep,
+#endif
+	},
+	[IORING_OP_FUTEX_WAKE] = {
+#if defined(CONFIG_FUTEX)
+		.prep			= io_futex_prep,
+		.issue			= io_futex_wake,
+#else
+		.prep			= io_eopnotsupp_prep,
+#endif
+	},
 };
 
 const struct io_cold_def io_cold_defs[] = {
@@ -670,6 +687,12 @@ const struct io_cold_def io_cold_defs[] = {
 		.name			= "WAITID",
 		.async_size		= sizeof(struct io_waitid_async),
 	},
+	[IORING_OP_FUTEX_WAIT] = {
+		.name			= "FUTEX_WAIT",
+	},
+	[IORING_OP_FUTEX_WAKE] = {
+		.name			= "FUTEX_WAKE",
+	},
 };
 
 const char *io_uring_get_opcode(u8 opcode)
-- 
cgit v1.2.3


From d77008421afda6208b1256c9b218457acd174ca6 Mon Sep 17 00:00:00 2001
From: Elena Reshetova <elena.reshetova@intel.com>
Date: Thu, 17 Aug 2023 21:14:57 -0700
Subject: groups: Convert group_info.usage to refcount_t

atomic_t variables are currently used to implement reference counters
with the following properties:
 - counter is initialized to 1 using atomic_set()
 - a resource is freed upon counter reaching zero
 - once counter reaches zero, its further
   increments aren't allowed
 - counter schema uses basic atomic operations
   (set, inc, inc_not_zero, dec_and_test, etc.)

Such atomic variables should be converted to a newly provided
refcount_t type and API that prevents accidental counter overflows and
underflows. This is important since overflows and underflows can lead
to use-after-free situation and be exploitable.

The variable group_info.usage is used as pure reference counter.
Convert it to refcount_t and fix up the operations.

**Important note for maintainers:

Some functions from refcount_t API defined in refcount.h have different
memory ordering guarantees than their atomic counterparts. Please check
Documentation/core-api/refcount-vs-atomic.rst for more information.

Normally the differences should not matter since refcount_t provides
enough guarantees to satisfy the refcounting use cases, but in some
rare cases it might matter. Please double check that you don't have
some undocumented memory guarantees for this variable usage.

For the group_info.usage it might make a difference in following places:
 - put_group_info(): decrement in refcount_dec_and_test() only
   provides RELEASE ordering and ACQUIRE ordering on success vs. fully
   ordered atomic counterpart

Suggested-by: Kees Cook <keescook@chromium.org>
Signed-off-by: Elena Reshetova <elena.reshetova@intel.com>
Reviewed-by: David Windsor <dwindsor@gmail.com>
Reviewed-by: Hans Liljestrand <ishkamiel@gmail.com>
Link: https://lore.kernel.org/r/20230818041456.gonna.009-kees@kernel.org
Signed-off-by: Kees Cook <keescook@chromium.org>
---
 include/linux/cred.h | 7 ++++---
 kernel/cred.c        | 2 +-
 kernel/groups.c      | 2 +-
 3 files changed, 6 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/cred.h b/include/linux/cred.h
index f923528d5cc4..92f8d772da6f 100644
--- a/include/linux/cred.h
+++ b/include/linux/cred.h
@@ -12,6 +12,7 @@
 #include <linux/init.h>
 #include <linux/key.h>
 #include <linux/atomic.h>
+#include <linux/refcount.h>
 #include <linux/uidgid.h>
 #include <linux/sched.h>
 #include <linux/sched/user.h>
@@ -23,7 +24,7 @@ struct inode;
  * COW Supplementary groups list
  */
 struct group_info {
-	atomic_t	usage;
+	refcount_t	usage;
 	int		ngroups;
 	kgid_t		gid[];
 } __randomize_layout;
@@ -39,7 +40,7 @@ struct group_info {
  */
 static inline struct group_info *get_group_info(struct group_info *gi)
 {
-	atomic_inc(&gi->usage);
+	refcount_inc(&gi->usage);
 	return gi;
 }
 
@@ -49,7 +50,7 @@ static inline struct group_info *get_group_info(struct group_info *gi)
  */
 #define put_group_info(group_info)			\
 do {							\
-	if (atomic_dec_and_test(&(group_info)->usage))	\
+	if (refcount_dec_and_test(&(group_info)->usage))	\
 		groups_free(group_info);		\
 } while (0)
 
diff --git a/kernel/cred.c b/kernel/cred.c
index 98cb4eca23fb..4dc0b27b5462 100644
--- a/kernel/cred.c
+++ b/kernel/cred.c
@@ -36,7 +36,7 @@ do {									\
 static struct kmem_cache *cred_jar;
 
 /* init to 2 - one for init_task, one to ensure it is never freed */
-static struct group_info init_groups = { .usage = ATOMIC_INIT(2) };
+static struct group_info init_groups = { .usage = REFCOUNT_INIT(2) };
 
 /*
  * The initial credentials for the initial task
diff --git a/kernel/groups.c b/kernel/groups.c
index 9aaed2a31073..9b43da22647d 100644
--- a/kernel/groups.c
+++ b/kernel/groups.c
@@ -19,7 +19,7 @@ struct group_info *groups_alloc(int gidsetsize)
 	if (!gi)
 		return NULL;
 
-	atomic_set(&gi->usage, 1);
+	refcount_set(&gi->usage, 1);
 	gi->ngroups = gidsetsize;
 	return gi;
 }
-- 
cgit v1.2.3


From ce60f27bb62dfeb1bf827350520f34abc84e0933 Mon Sep 17 00:00:00 2001
From: "Matthew Wilcox (Oracle)" <willy@infradead.org>
Date: Wed, 20 Sep 2023 05:09:58 +0100
Subject: mm: abstract moving to the next PFN

In order to fix the L1TF vulnerability, x86 can invert the PTE bits for
PROT_NONE VMAs, which means we cannot move from one PTE to the next by
adding 1 to the PFN field of the PTE.  This results in the BUG reported at
[1].

Abstract advancing the PTE to the next PFN through a pte_next_pfn()
function/macro.

Link: https://lkml.kernel.org/r/20230920040958.866520-1-willy@infradead.org
Fixes: bcc6cc832573 ("mm: add default definition of set_ptes()")
Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Reported-by: syzbot+55cc72f8cc3a549119df@syzkaller.appspotmail.com
Closes: https://lkml.kernel.org/r/000000000000d099fa0604f03351@google.com [1]
Reviewed-by: Yin Fengwei <fengwei.yin@intel.com>
Cc: Dave Hansen <dave.hansen@intel.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 arch/x86/include/asm/pgtable.h |  8 ++++++++
 include/linux/pgtable.h        | 10 +++++++++-
 2 files changed, 17 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h
index d6ad98ca1288..e02b179ec659 100644
--- a/arch/x86/include/asm/pgtable.h
+++ b/arch/x86/include/asm/pgtable.h
@@ -955,6 +955,14 @@ static inline int pte_same(pte_t a, pte_t b)
 	return a.pte == b.pte;
 }
 
+static inline pte_t pte_next_pfn(pte_t pte)
+{
+	if (__pte_needs_invert(pte_val(pte)))
+		return __pte(pte_val(pte) - (1UL << PFN_PTE_SHIFT));
+	return __pte(pte_val(pte) + (1UL << PFN_PTE_SHIFT));
+}
+#define pte_next_pfn	pte_next_pfn
+
 static inline int pte_present(pte_t a)
 {
 	return pte_flags(a) & (_PAGE_PRESENT | _PAGE_PROTNONE);
diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h
index 1fba072b3dac..af7639c3b0a3 100644
--- a/include/linux/pgtable.h
+++ b/include/linux/pgtable.h
@@ -206,6 +206,14 @@ static inline int pmd_young(pmd_t pmd)
 #endif
 
 #ifndef set_ptes
+
+#ifndef pte_next_pfn
+static inline pte_t pte_next_pfn(pte_t pte)
+{
+	return __pte(pte_val(pte) + (1UL << PFN_PTE_SHIFT));
+}
+#endif
+
 /**
  * set_ptes - Map consecutive pages to a contiguous range of addresses.
  * @mm: Address space to map the pages into.
@@ -231,7 +239,7 @@ static inline void set_ptes(struct mm_struct *mm, unsigned long addr,
 		if (--nr == 0)
 			break;
 		ptep++;
-		pte = __pte(pte_val(pte) + (1UL << PFN_PTE_SHIFT));
+		pte = pte_next_pfn(pte);
 	}
 	arch_leave_lazy_mmu_mode();
 }
-- 
cgit v1.2.3


From 5c590804b6b0ff933ed4e5cee5d76de3a5048d9f Mon Sep 17 00:00:00 2001
From: "Liam R. Howlett" <Liam.Howlett@oracle.com>
Date: Thu, 21 Sep 2023 14:12:35 -0400
Subject: maple_tree: add mas_is_active() to detect in-tree walks

Patch series "maple_tree: Fix mas_prev() state regression".

Pedro Falcato retported an mprotect regression [1] which was bisected back
to the iterator changes for maple tree.  Root cause analysis showed the
mas_prev() running off the end of the VMA space (previous from 0) followed
by mas_find(), would skip the first value.

This patchset introduces maple state underflow/overflow so the sequence of
calls on the maple state will return what the user expects.

Users who encounter this bug may see mprotect(), userfaultfd_register(),
and mlock() fail on VMAs mapped with address 0.


This patch (of 2):

Instead of constantly checking each possibility of the maple state,
create a fast path that will skip over checking unlikely states.

Link: https://lkml.kernel.org/r/20230921181236.509072-1-Liam.Howlett@oracle.com
Link: https://lkml.kernel.org/r/20230921181236.509072-2-Liam.Howlett@oracle.com
Signed-off-by: Liam R. Howlett <Liam.Howlett@oracle.com>
Cc: Pedro Falcato <pedro.falcato@gmail.com>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/maple_tree.h | 9 +++++++++
 1 file changed, 9 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/maple_tree.h b/include/linux/maple_tree.h
index e41c70ac7744..f66f5f78f8cf 100644
--- a/include/linux/maple_tree.h
+++ b/include/linux/maple_tree.h
@@ -511,6 +511,15 @@ static inline bool mas_is_paused(const struct ma_state *mas)
 	return mas->node == MAS_PAUSE;
 }
 
+/* Check if the mas is pointing to a node or not */
+static inline bool mas_is_active(struct ma_state *mas)
+{
+	if ((unsigned long)mas->node >= MAPLE_RESERVED_RANGE)
+		return true;
+
+	return false;
+}
+
 /**
  * mas_reset() - Reset a Maple Tree operation state.
  * @mas: Maple Tree operation state.
-- 
cgit v1.2.3


From a8091f039c1ebf5cb0d5261e3613f18eb2a5d8b7 Mon Sep 17 00:00:00 2001
From: "Liam R. Howlett" <Liam.Howlett@oracle.com>
Date: Thu, 21 Sep 2023 14:12:36 -0400
Subject: maple_tree: add MAS_UNDERFLOW and MAS_OVERFLOW states

When updating the maple tree iterator to avoid rewalks, an issue was
introduced when shifting beyond the limits.  This can be seen by trying to
go to the previous address of 0, which would set the maple node to
MAS_NONE and keep the range as the last entry.

Subsequent calls to mas_find() would then search upwards from mas->last
and skip the value at mas->index/mas->last.  This showed up as a bug in
mprotect which skips the actual VMA at the current range after attempting
to go to the previous VMA from 0.

Since MAS_NONE may already be set when searching for a value that isn't
contained within a node, changing the handling of MAS_NONE in mas_find()
would make the code more complicated and error prone.  Furthermore, there
was no way to tell which limit was hit, and thus which action to take
(next or the entry at the current range).

This solution is to add two states to track what happened with the
previous iterator action.  This allows for the expected behaviour of the
next command to return the correct item (either the item at the range
requested, or the next/previous).

Tests are also added and updated accordingly.

Link: https://lkml.kernel.org/r/20230921181236.509072-3-Liam.Howlett@oracle.com
Link: https://gist.github.com/heatd/85d2971fae1501b55b6ea401fbbe485b
Link: https://lore.kernel.org/linux-mm/20230921181236.509072-1-Liam.Howlett@oracle.com/
Fixes: 39193685d585 ("maple_tree: try harder to keep active node with mas_prev()")
Signed-off-by: Liam R. Howlett <Liam.Howlett@oracle.com>
Reported-by: Pedro Falcato <pedro.falcato@gmail.com>
Closes: https://gist.github.com/heatd/85d2971fae1501b55b6ea401fbbe485b
Closes: https://bugs.archlinux.org/task/79656
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/maple_tree.h |   2 +
 lib/maple_tree.c           | 221 +++++++++++++++++++++++++++++++++------------
 lib/test_maple_tree.c      |  87 +++++++++++++++---
 3 files changed, 237 insertions(+), 73 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/maple_tree.h b/include/linux/maple_tree.h
index f66f5f78f8cf..d01e850b570f 100644
--- a/include/linux/maple_tree.h
+++ b/include/linux/maple_tree.h
@@ -428,6 +428,8 @@ struct ma_wr_state {
 #define MAS_ROOT	((struct maple_enode *)5UL)
 #define MAS_NONE	((struct maple_enode *)9UL)
 #define MAS_PAUSE	((struct maple_enode *)17UL)
+#define MAS_OVERFLOW	((struct maple_enode *)33UL)
+#define MAS_UNDERFLOW	((struct maple_enode *)65UL)
 #define MA_ERROR(err) \
 		((struct maple_enode *)(((unsigned long)err << 2) | 2UL))
 
diff --git a/lib/maple_tree.c b/lib/maple_tree.c
index ee1ff0c59fd7..0e00a84e8e8f 100644
--- a/lib/maple_tree.c
+++ b/lib/maple_tree.c
@@ -256,6 +256,22 @@ bool mas_is_err(struct ma_state *mas)
 	return xa_is_err(mas->node);
 }
 
+static __always_inline bool mas_is_overflow(struct ma_state *mas)
+{
+	if (unlikely(mas->node == MAS_OVERFLOW))
+		return true;
+
+	return false;
+}
+
+static __always_inline bool mas_is_underflow(struct ma_state *mas)
+{
+	if (unlikely(mas->node == MAS_UNDERFLOW))
+		return true;
+
+	return false;
+}
+
 static inline bool mas_searchable(struct ma_state *mas)
 {
 	if (mas_is_none(mas))
@@ -4415,10 +4431,13 @@ no_entry:
  *
  * @mas: The maple state
  * @max: The minimum starting range
+ * @empty: Can be empty
+ * @set_underflow: Set the @mas->node to underflow state on limit.
  *
  * Return: The entry in the previous slot which is possibly NULL
  */
-static void *mas_prev_slot(struct ma_state *mas, unsigned long min, bool empty)
+static void *mas_prev_slot(struct ma_state *mas, unsigned long min, bool empty,
+			   bool set_underflow)
 {
 	void *entry;
 	void __rcu **slots;
@@ -4435,7 +4454,6 @@ retry:
 	if (unlikely(mas_rewalk_if_dead(mas, node, save_point)))
 		goto retry;
 
-again:
 	if (mas->min <= min) {
 		pivot = mas_safe_min(mas, pivots, mas->offset);
 
@@ -4443,9 +4461,10 @@ again:
 			goto retry;
 
 		if (pivot <= min)
-			return NULL;
+			goto underflow;
 	}
 
+again:
 	if (likely(mas->offset)) {
 		mas->offset--;
 		mas->last = mas->index - 1;
@@ -4457,7 +4476,7 @@ again:
 		}
 
 		if (mas_is_none(mas))
-			return NULL;
+			goto underflow;
 
 		mas->last = mas->max;
 		node = mas_mn(mas);
@@ -4474,10 +4493,19 @@ again:
 	if (likely(entry))
 		return entry;
 
-	if (!empty)
+	if (!empty) {
+		if (mas->index <= min)
+			goto underflow;
+
 		goto again;
+	}
 
 	return entry;
+
+underflow:
+	if (set_underflow)
+		mas->node = MAS_UNDERFLOW;
+	return NULL;
 }
 
 /*
@@ -4567,10 +4595,13 @@ no_entry:
  * @mas: The maple state
  * @max: The maximum starting range
  * @empty: Can be empty
+ * @set_overflow: Should @mas->node be set to overflow when the limit is
+ * reached.
  *
  * Return: The entry in the next slot which is possibly NULL
  */
-static void *mas_next_slot(struct ma_state *mas, unsigned long max, bool empty)
+static void *mas_next_slot(struct ma_state *mas, unsigned long max, bool empty,
+			   bool set_overflow)
 {
 	void __rcu **slots;
 	unsigned long *pivots;
@@ -4589,22 +4620,22 @@ retry:
 	if (unlikely(mas_rewalk_if_dead(mas, node, save_point)))
 		goto retry;
 
-again:
 	if (mas->max >= max) {
 		if (likely(mas->offset < data_end))
 			pivot = pivots[mas->offset];
 		else
-			return NULL; /* must be mas->max */
+			goto overflow;
 
 		if (unlikely(mas_rewalk_if_dead(mas, node, save_point)))
 			goto retry;
 
 		if (pivot >= max)
-			return NULL;
+			goto overflow;
 	}
 
 	if (likely(mas->offset < data_end)) {
 		mas->index = pivots[mas->offset] + 1;
+again:
 		mas->offset++;
 		if (likely(mas->offset < data_end))
 			mas->last = pivots[mas->offset];
@@ -4616,8 +4647,11 @@ again:
 			goto retry;
 		}
 
-		if (mas_is_none(mas))
+		if (WARN_ON_ONCE(mas_is_none(mas))) {
+			mas->node = MAS_OVERFLOW;
 			return NULL;
+			goto overflow;
+		}
 
 		mas->offset = 0;
 		mas->index = mas->min;
@@ -4636,12 +4670,20 @@ again:
 		return entry;
 
 	if (!empty) {
-		if (!mas->offset)
-			data_end = 2;
+		if (mas->last >= max)
+			goto overflow;
+
+		mas->index = mas->last + 1;
+		/* Node cannot end on NULL, so it's safe to short-cut here */
 		goto again;
 	}
 
 	return entry;
+
+overflow:
+	if (set_overflow)
+		mas->node = MAS_OVERFLOW;
+	return NULL;
 }
 
 /*
@@ -4651,17 +4693,20 @@ again:
  *
  * Set the @mas->node to the next entry and the range_start to
  * the beginning value for the entry.  Does not check beyond @limit.
- * Sets @mas->index and @mas->last to the limit if it is hit.
+ * Sets @mas->index and @mas->last to the range, Does not update @mas->index and
+ * @mas->last on overflow.
  * Restarts on dead nodes.
  *
  * Return: the next entry or %NULL.
  */
 static inline void *mas_next_entry(struct ma_state *mas, unsigned long limit)
 {
-	if (mas->last >= limit)
+	if (mas->last >= limit) {
+		mas->node = MAS_OVERFLOW;
 		return NULL;
+	}
 
-	return mas_next_slot(mas, limit, false);
+	return mas_next_slot(mas, limit, false, true);
 }
 
 /*
@@ -4837,7 +4882,7 @@ void *mas_walk(struct ma_state *mas)
 {
 	void *entry;
 
-	if (mas_is_none(mas) || mas_is_paused(mas) || mas_is_ptr(mas))
+	if (!mas_is_active(mas) || !mas_is_start(mas))
 		mas->node = MAS_START;
 retry:
 	entry = mas_state_walk(mas);
@@ -5294,14 +5339,22 @@ static inline void mte_destroy_walk(struct maple_enode *enode,
 
 static void mas_wr_store_setup(struct ma_wr_state *wr_mas)
 {
-	if (mas_is_start(wr_mas->mas))
-		return;
+	if (!mas_is_active(wr_mas->mas)) {
+		if (mas_is_start(wr_mas->mas))
+			return;
 
-	if (unlikely(mas_is_paused(wr_mas->mas)))
-		goto reset;
+		if (unlikely(mas_is_paused(wr_mas->mas)))
+			goto reset;
 
-	if (unlikely(mas_is_none(wr_mas->mas)))
-		goto reset;
+		if (unlikely(mas_is_none(wr_mas->mas)))
+			goto reset;
+
+		if (unlikely(mas_is_overflow(wr_mas->mas)))
+			goto reset;
+
+		if (unlikely(mas_is_underflow(wr_mas->mas)))
+			goto reset;
+	}
 
 	/*
 	 * A less strict version of mas_is_span_wr() where we allow spanning
@@ -5595,8 +5648,25 @@ static inline bool mas_next_setup(struct ma_state *mas, unsigned long max,
 {
 	bool was_none = mas_is_none(mas);
 
-	if (mas_is_none(mas) || mas_is_paused(mas))
+	if (unlikely(mas->last >= max)) {
+		mas->node = MAS_OVERFLOW;
+		return true;
+	}
+
+	if (mas_is_active(mas))
+		return false;
+
+	if (mas_is_none(mas) || mas_is_paused(mas)) {
+		mas->node = MAS_START;
+	} else if (mas_is_overflow(mas)) {
+		/* Overflowed before, but the max changed */
 		mas->node = MAS_START;
+	} else if (mas_is_underflow(mas)) {
+		mas->node = MAS_START;
+		*entry = mas_walk(mas);
+		if (*entry)
+			return true;
+	}
 
 	if (mas_is_start(mas))
 		*entry = mas_walk(mas); /* Retries on dead nodes handled by mas_walk */
@@ -5615,6 +5685,7 @@ static inline bool mas_next_setup(struct ma_state *mas, unsigned long max,
 
 	if (mas_is_none(mas))
 		return true;
+
 	return false;
 }
 
@@ -5637,7 +5708,7 @@ void *mas_next(struct ma_state *mas, unsigned long max)
 		return entry;
 
 	/* Retries on dead nodes handled by mas_next_slot */
-	return mas_next_slot(mas, max, false);
+	return mas_next_slot(mas, max, false, true);
 }
 EXPORT_SYMBOL_GPL(mas_next);
 
@@ -5660,7 +5731,7 @@ void *mas_next_range(struct ma_state *mas, unsigned long max)
 		return entry;
 
 	/* Retries on dead nodes handled by mas_next_slot */
-	return mas_next_slot(mas, max, true);
+	return mas_next_slot(mas, max, true, true);
 }
 EXPORT_SYMBOL_GPL(mas_next_range);
 
@@ -5691,18 +5762,31 @@ EXPORT_SYMBOL_GPL(mt_next);
 static inline bool mas_prev_setup(struct ma_state *mas, unsigned long min,
 		void **entry)
 {
-	if (mas->index <= min)
-		goto none;
+	if (unlikely(mas->index <= min)) {
+		mas->node = MAS_UNDERFLOW;
+		return true;
+	}
 
-	if (mas_is_none(mas) || mas_is_paused(mas))
+	if (mas_is_active(mas))
+		return false;
+
+	if (mas_is_overflow(mas)) {
 		mas->node = MAS_START;
+		*entry = mas_walk(mas);
+		if (*entry)
+			return true;
+	}
 
-	if (mas_is_start(mas)) {
-		mas_walk(mas);
-		if (!mas->index)
-			goto none;
+	if (mas_is_none(mas) || mas_is_paused(mas)) {
+		mas->node = MAS_START;
+	} else if (mas_is_underflow(mas)) {
+		/* underflowed before but the min changed */
+		mas->node = MAS_START;
 	}
 
+	if (mas_is_start(mas))
+		mas_walk(mas);
+
 	if (unlikely(mas_is_ptr(mas))) {
 		if (!mas->index)
 			goto none;
@@ -5747,7 +5831,7 @@ void *mas_prev(struct ma_state *mas, unsigned long min)
 	if (mas_prev_setup(mas, min, &entry))
 		return entry;
 
-	return mas_prev_slot(mas, min, false);
+	return mas_prev_slot(mas, min, false, true);
 }
 EXPORT_SYMBOL_GPL(mas_prev);
 
@@ -5770,7 +5854,7 @@ void *mas_prev_range(struct ma_state *mas, unsigned long min)
 	if (mas_prev_setup(mas, min, &entry))
 		return entry;
 
-	return mas_prev_slot(mas, min, true);
+	return mas_prev_slot(mas, min, true, true);
 }
 EXPORT_SYMBOL_GPL(mas_prev_range);
 
@@ -5828,24 +5912,35 @@ EXPORT_SYMBOL_GPL(mas_pause);
 static inline bool mas_find_setup(struct ma_state *mas, unsigned long max,
 		void **entry)
 {
-	*entry = NULL;
+	if (mas_is_active(mas)) {
+		if (mas->last < max)
+			return false;
 
-	if (unlikely(mas_is_none(mas))) {
+		return true;
+	}
+
+	if (mas_is_paused(mas)) {
 		if (unlikely(mas->last >= max))
 			return true;
 
-		mas->index = mas->last;
+		mas->index = ++mas->last;
 		mas->node = MAS_START;
-	} else if (unlikely(mas_is_paused(mas))) {
+	} else if (mas_is_none(mas)) {
 		if (unlikely(mas->last >= max))
 			return true;
 
+		mas->index = mas->last;
 		mas->node = MAS_START;
-		mas->index = ++mas->last;
-	} else if (unlikely(mas_is_ptr(mas)))
-		goto ptr_out_of_range;
+	} else if (mas_is_overflow(mas) || mas_is_underflow(mas)) {
+		if (mas->index > max) {
+			mas->node = MAS_OVERFLOW;
+			return true;
+		}
+
+		mas->node = MAS_START;
+	}
 
-	if (unlikely(mas_is_start(mas))) {
+	if (mas_is_start(mas)) {
 		/* First run or continue */
 		if (mas->index > max)
 			return true;
@@ -5895,7 +5990,7 @@ void *mas_find(struct ma_state *mas, unsigned long max)
 		return entry;
 
 	/* Retries on dead nodes handled by mas_next_slot */
-	return mas_next_slot(mas, max, false);
+	return mas_next_slot(mas, max, false, false);
 }
 EXPORT_SYMBOL_GPL(mas_find);
 
@@ -5913,13 +6008,13 @@ EXPORT_SYMBOL_GPL(mas_find);
  */
 void *mas_find_range(struct ma_state *mas, unsigned long max)
 {
-	void *entry;
+	void *entry = NULL;
 
 	if (mas_find_setup(mas, max, &entry))
 		return entry;
 
 	/* Retries on dead nodes handled by mas_next_slot */
-	return mas_next_slot(mas, max, true);
+	return mas_next_slot(mas, max, true, false);
 }
 EXPORT_SYMBOL_GPL(mas_find_range);
 
@@ -5934,26 +6029,36 @@ EXPORT_SYMBOL_GPL(mas_find_range);
 static inline bool mas_find_rev_setup(struct ma_state *mas, unsigned long min,
 		void **entry)
 {
-	*entry = NULL;
-
-	if (unlikely(mas_is_none(mas))) {
-		if (mas->index <= min)
-			goto none;
+	if (mas_is_active(mas)) {
+		if (mas->index > min)
+			return false;
 
-		mas->last = mas->index;
-		mas->node = MAS_START;
+		return true;
 	}
 
-	if (unlikely(mas_is_paused(mas))) {
+	if (mas_is_paused(mas)) {
 		if (unlikely(mas->index <= min)) {
 			mas->node = MAS_NONE;
 			return true;
 		}
 		mas->node = MAS_START;
 		mas->last = --mas->index;
+	} else if (mas_is_none(mas)) {
+		if (mas->index <= min)
+			goto none;
+
+		mas->last = mas->index;
+		mas->node = MAS_START;
+	} else if (mas_is_underflow(mas) || mas_is_overflow(mas)) {
+		if (mas->last <= min) {
+			mas->node = MAS_UNDERFLOW;
+			return true;
+		}
+
+		mas->node = MAS_START;
 	}
 
-	if (unlikely(mas_is_start(mas))) {
+	if (mas_is_start(mas)) {
 		/* First run or continue */
 		if (mas->index < min)
 			return true;
@@ -6004,13 +6109,13 @@ none:
  */
 void *mas_find_rev(struct ma_state *mas, unsigned long min)
 {
-	void *entry;
+	void *entry = NULL;
 
 	if (mas_find_rev_setup(mas, min, &entry))
 		return entry;
 
 	/* Retries on dead nodes handled by mas_prev_slot */
-	return mas_prev_slot(mas, min, false);
+	return mas_prev_slot(mas, min, false, false);
 
 }
 EXPORT_SYMBOL_GPL(mas_find_rev);
@@ -6030,13 +6135,13 @@ EXPORT_SYMBOL_GPL(mas_find_rev);
  */
 void *mas_find_range_rev(struct ma_state *mas, unsigned long min)
 {
-	void *entry;
+	void *entry = NULL;
 
 	if (mas_find_rev_setup(mas, min, &entry))
 		return entry;
 
 	/* Retries on dead nodes handled by mas_prev_slot */
-	return mas_prev_slot(mas, min, true);
+	return mas_prev_slot(mas, min, true, false);
 }
 EXPORT_SYMBOL_GPL(mas_find_range_rev);
 
diff --git a/lib/test_maple_tree.c b/lib/test_maple_tree.c
index 0674aebd4423..06959165e2f9 100644
--- a/lib/test_maple_tree.c
+++ b/lib/test_maple_tree.c
@@ -2166,7 +2166,7 @@ static noinline void __init next_prev_test(struct maple_tree *mt)
 	MT_BUG_ON(mt, val != NULL);
 	MT_BUG_ON(mt, mas.index != 0);
 	MT_BUG_ON(mt, mas.last != 5);
-	MT_BUG_ON(mt, mas.node != MAS_NONE);
+	MT_BUG_ON(mt, mas.node != MAS_UNDERFLOW);
 
 	mas.index = 0;
 	mas.last = 5;
@@ -2917,6 +2917,7 @@ static noinline void __init check_empty_area_fill(struct maple_tree *mt)
  *		exists	MAS_NONE	active		range
  *		exists	active		active		range
  *		DNE	active		active		set to last range
+ *		ERANGE	active		MAS_OVERFLOW	last range
  *
  * Function	ENTRY	Start		Result		index & last
  * mas_prev()
@@ -2945,6 +2946,7 @@ static noinline void __init check_empty_area_fill(struct maple_tree *mt)
  *		any	MAS_ROOT	MAS_NONE	0
  *		exists	active		active		range
  *		DNE	active		active		last range
+ *		ERANGE	active		MAS_UNDERFLOW	last range
  *
  * Function	ENTRY	Start		Result		index & last
  * mas_find()
@@ -2955,7 +2957,7 @@ static noinline void __init check_empty_area_fill(struct maple_tree *mt)
  *		DNE	MAS_START	MAS_NONE	0
  *		DNE	MAS_PAUSE	MAS_NONE	0
  *		DNE	MAS_ROOT	MAS_NONE	0
- *		DNE	MAS_NONE	MAS_NONE	0
+ *		DNE	MAS_NONE	MAS_NONE	1
  *				if index ==  0
  *		exists	MAS_START	MAS_ROOT	0
  *		exists	MAS_PAUSE	MAS_ROOT	0
@@ -2967,7 +2969,7 @@ static noinline void __init check_empty_area_fill(struct maple_tree *mt)
  *		DNE	MAS_START	active		set to max
  *		exists	MAS_PAUSE	active		range
  *		DNE	MAS_PAUSE	active		set to max
- *		exists	MAS_NONE	active		range
+ *		exists	MAS_NONE	active		range (start at last)
  *		exists	active		active		range
  *		DNE	active		active		last range (max < last)
  *
@@ -2992,7 +2994,7 @@ static noinline void __init check_empty_area_fill(struct maple_tree *mt)
  *		DNE	MAS_START	active		set to min
  *		exists	MAS_PAUSE	active		range
  *		DNE	MAS_PAUSE	active		set to min
- *		exists	MAS_NONE	active		range
+ *		exists	MAS_NONE	active		range (start at index)
  *		exists	active		active		range
  *		DNE	active		active		last range (min > index)
  *
@@ -3039,10 +3041,10 @@ static noinline void __init check_state_handling(struct maple_tree *mt)
 	mtree_store_range(mt, 0, 0, ptr, GFP_KERNEL);
 
 	mas_lock(&mas);
-	/* prev: Start -> none */
+	/* prev: Start -> underflow*/
 	entry = mas_prev(&mas, 0);
 	MT_BUG_ON(mt, entry != NULL);
-	MT_BUG_ON(mt, mas.node != MAS_NONE);
+	MT_BUG_ON(mt, mas.node != MAS_UNDERFLOW);
 
 	/* prev: Start -> root */
 	mas_set(&mas, 10);
@@ -3069,7 +3071,7 @@ static noinline void __init check_state_handling(struct maple_tree *mt)
 	MT_BUG_ON(mt, entry != NULL);
 	MT_BUG_ON(mt, mas.node != MAS_NONE);
 
-	/* next: start -> none */
+	/* next: start -> none*/
 	mas_set(&mas, 10);
 	entry = mas_next(&mas, ULONG_MAX);
 	MT_BUG_ON(mt, mas.index != 1);
@@ -3268,25 +3270,46 @@ static noinline void __init check_state_handling(struct maple_tree *mt)
 	MT_BUG_ON(mt, mas.last != 0x2500);
 	MT_BUG_ON(mt, !mas_active(mas));
 
-	/* next:active -> active out of range*/
+	/* next:active -> active beyond data */
 	entry = mas_next(&mas, 0x2999);
 	MT_BUG_ON(mt, entry != NULL);
 	MT_BUG_ON(mt, mas.index != 0x2501);
 	MT_BUG_ON(mt, mas.last != 0x2fff);
 	MT_BUG_ON(mt, !mas_active(mas));
 
-	/* Continue after out of range*/
+	/* Continue after last range ends after max */
 	entry = mas_next(&mas, ULONG_MAX);
 	MT_BUG_ON(mt, entry != ptr3);
 	MT_BUG_ON(mt, mas.index != 0x3000);
 	MT_BUG_ON(mt, mas.last != 0x3500);
 	MT_BUG_ON(mt, !mas_active(mas));
 
-	/* next:active -> active out of range*/
+	/* next:active -> active continued */
+	entry = mas_next(&mas, ULONG_MAX);
+	MT_BUG_ON(mt, entry != NULL);
+	MT_BUG_ON(mt, mas.index != 0x3501);
+	MT_BUG_ON(mt, mas.last != ULONG_MAX);
+	MT_BUG_ON(mt, !mas_active(mas));
+
+	/* next:active -> overflow  */
 	entry = mas_next(&mas, ULONG_MAX);
 	MT_BUG_ON(mt, entry != NULL);
 	MT_BUG_ON(mt, mas.index != 0x3501);
 	MT_BUG_ON(mt, mas.last != ULONG_MAX);
+	MT_BUG_ON(mt, mas.node != MAS_OVERFLOW);
+
+	/* next:overflow -> overflow  */
+	entry = mas_next(&mas, ULONG_MAX);
+	MT_BUG_ON(mt, entry != NULL);
+	MT_BUG_ON(mt, mas.index != 0x3501);
+	MT_BUG_ON(mt, mas.last != ULONG_MAX);
+	MT_BUG_ON(mt, mas.node != MAS_OVERFLOW);
+
+	/* prev:overflow -> active  */
+	entry = mas_prev(&mas, 0);
+	MT_BUG_ON(mt, entry != ptr3);
+	MT_BUG_ON(mt, mas.index != 0x3000);
+	MT_BUG_ON(mt, mas.last != 0x3500);
 	MT_BUG_ON(mt, !mas_active(mas));
 
 	/* next: none -> active, skip value at location */
@@ -3307,11 +3330,46 @@ static noinline void __init check_state_handling(struct maple_tree *mt)
 	MT_BUG_ON(mt, mas.last != 0x1500);
 	MT_BUG_ON(mt, !mas_active(mas));
 
-	/* prev:active -> active out of range*/
+	/* prev:active -> active spanning end range */
+	entry = mas_prev(&mas, 0x0100);
+	MT_BUG_ON(mt, entry != NULL);
+	MT_BUG_ON(mt, mas.index != 0);
+	MT_BUG_ON(mt, mas.last != 0x0FFF);
+	MT_BUG_ON(mt, !mas_active(mas));
+
+	/* prev:active -> underflow */
+	entry = mas_prev(&mas, 0);
+	MT_BUG_ON(mt, entry != NULL);
+	MT_BUG_ON(mt, mas.index != 0);
+	MT_BUG_ON(mt, mas.last != 0x0FFF);
+	MT_BUG_ON(mt, mas.node != MAS_UNDERFLOW);
+
+	/* prev:underflow -> underflow */
 	entry = mas_prev(&mas, 0);
 	MT_BUG_ON(mt, entry != NULL);
 	MT_BUG_ON(mt, mas.index != 0);
 	MT_BUG_ON(mt, mas.last != 0x0FFF);
+	MT_BUG_ON(mt, mas.node != MAS_UNDERFLOW);
+
+	/* next:underflow -> active */
+	entry = mas_next(&mas, ULONG_MAX);
+	MT_BUG_ON(mt, entry != ptr);
+	MT_BUG_ON(mt, mas.index != 0x1000);
+	MT_BUG_ON(mt, mas.last != 0x1500);
+	MT_BUG_ON(mt, !mas_active(mas));
+
+	/* prev:first value -> underflow */
+	entry = mas_prev(&mas, 0x1000);
+	MT_BUG_ON(mt, entry != NULL);
+	MT_BUG_ON(mt, mas.index != 0x1000);
+	MT_BUG_ON(mt, mas.last != 0x1500);
+	MT_BUG_ON(mt, mas.node != MAS_UNDERFLOW);
+
+	/* find:underflow -> first value */
+	entry = mas_find(&mas, ULONG_MAX);
+	MT_BUG_ON(mt, entry != ptr);
+	MT_BUG_ON(mt, mas.index != 0x1000);
+	MT_BUG_ON(mt, mas.last != 0x1500);
 	MT_BUG_ON(mt, !mas_active(mas));
 
 	/* prev: pause ->active */
@@ -3325,14 +3383,14 @@ static noinline void __init check_state_handling(struct maple_tree *mt)
 	MT_BUG_ON(mt, mas.last != 0x2500);
 	MT_BUG_ON(mt, !mas_active(mas));
 
-	/* prev:active -> active out of range*/
+	/* prev:active -> active spanning min */
 	entry = mas_prev(&mas, 0x1600);
 	MT_BUG_ON(mt, entry != NULL);
 	MT_BUG_ON(mt, mas.index != 0x1501);
 	MT_BUG_ON(mt, mas.last != 0x1FFF);
 	MT_BUG_ON(mt, !mas_active(mas));
 
-	/* prev: active ->active, continue*/
+	/* prev: active ->active, continue */
 	entry = mas_prev(&mas, 0);
 	MT_BUG_ON(mt, entry != ptr);
 	MT_BUG_ON(mt, mas.index != 0x1000);
@@ -3379,7 +3437,7 @@ static noinline void __init check_state_handling(struct maple_tree *mt)
 	MT_BUG_ON(mt, mas.last != 0x2FFF);
 	MT_BUG_ON(mt, !mas_active(mas));
 
-	/* find: none ->active */
+	/* find: overflow ->active */
 	entry = mas_find(&mas, 0x5000);
 	MT_BUG_ON(mt, entry != ptr3);
 	MT_BUG_ON(mt, mas.index != 0x3000);
@@ -3778,7 +3836,6 @@ static int __init maple_tree_seed(void)
 	check_empty_area_fill(&tree);
 	mtree_destroy(&tree);
 
-
 	mt_init_flags(&tree, MT_FLAGS_ALLOC_RANGE);
 	check_state_handling(&tree);
 	mtree_destroy(&tree);
-- 
cgit v1.2.3


From 935d4f0c6dc8b3533e6e39346de7389a84490178 Mon Sep 17 00:00:00 2001
From: Ryan Roberts <ryan.roberts@arm.com>
Date: Fri, 22 Sep 2023 12:58:03 +0100
Subject: mm: hugetlb: add huge page size param to set_huge_pte_at()

Patch series "Fix set_huge_pte_at() panic on arm64", v2.

This series fixes a bug in arm64's implementation of set_huge_pte_at(),
which can result in an unprivileged user causing a kernel panic.  The
problem was triggered when running the new uffd poison mm selftest for
HUGETLB memory.  This test (and the uffd poison feature) was merged for
v6.5-rc7.

Ideally, I'd like to get this fix in for v6.6 and I've cc'ed stable
(correctly this time) to get it backported to v6.5, where the issue first
showed up.


Description of Bug
==================

arm64's huge pte implementation supports multiple huge page sizes, some of
which are implemented in the page table with multiple contiguous entries.
So set_huge_pte_at() needs to work out how big the logical pte is, so that
it can also work out how many physical ptes (or pmds) need to be written.
It previously did this by grabbing the folio out of the pte and querying
its size.

However, there are cases when the pte being set is actually a swap entry.
But this also used to work fine, because for huge ptes, we only ever saw
migration entries and hwpoison entries.  And both of these types of swap
entries have a PFN embedded, so the code would grab that and everything
still worked out.

But over time, more calls to set_huge_pte_at() have been added that set
swap entry types that do not embed a PFN.  And this causes the code to go
bang.  The triggering case is for the uffd poison test, commit
99aa77215ad0 ("selftests/mm: add uffd unit test for UFFDIO_POISON"), which
causes a PTE_MARKER_POISONED swap entry to be set, coutesey of commit
8a13897fb0da ("mm: userfaultfd: support UFFDIO_POISON for hugetlbfs") -
added in v6.5-rc7.  Although review shows that there are other call sites
that set PTE_MARKER_UFFD_WP (which also has no PFN), these don't trigger
on arm64 because arm64 doesn't support UFFD WP.

If CONFIG_DEBUG_VM is enabled, we do at least get a BUG(), but otherwise,
it will dereference a bad pointer in page_folio():

    static inline struct folio *hugetlb_swap_entry_to_folio(swp_entry_t entry)
    {
        VM_BUG_ON(!is_migration_entry(entry) && !is_hwpoison_entry(entry));

        return page_folio(pfn_to_page(swp_offset_pfn(entry)));
    }


Fix
===

The simplest fix would have been to revert the dodgy cleanup commit
18f3962953e4 ("mm: hugetlb: kill set_huge_swap_pte_at()"), but since
things have moved on, this would have required an audit of all the new
set_huge_pte_at() call sites to see if they should be converted to
set_huge_swap_pte_at().  As per the original intent of the change, it
would also leave us open to future bugs when people invariably get it
wrong and call the wrong helper.

So instead, I've added a huge page size parameter to set_huge_pte_at().
This means that the arm64 code has the size in all cases.  It's a bigger
change, due to needing to touch the arches that implement the function,
but it is entirely mechanical, so in my view, low risk.

I've compile-tested all touched arches; arm64, parisc, powerpc, riscv,
s390, sparc (and additionally x86_64).  I've additionally booted and run
mm selftests against arm64, where I observe the uffd poison test is fixed,
and there are no other regressions.


This patch (of 2):

In order to fix a bug, arm64 needs to be told the size of the huge page
for which the pte is being set in set_huge_pte_at().  Provide for this by
adding an `unsigned long sz` parameter to the function.  This follows the
same pattern as huge_pte_clear().

This commit makes the required interface modifications to the core mm as
well as all arches that implement this function (arm64, parisc, powerpc,
riscv, s390, sparc).  The actual arm64 bug will be fixed in a separate
commit.

No behavioral changes intended.

Link: https://lkml.kernel.org/r/20230922115804.2043771-1-ryan.roberts@arm.com
Link: https://lkml.kernel.org/r/20230922115804.2043771-2-ryan.roberts@arm.com
Fixes: 8a13897fb0da ("mm: userfaultfd: support UFFDIO_POISON for hugetlbfs")
Signed-off-by: Ryan Roberts <ryan.roberts@arm.com>
Reviewed-by: Christophe Leroy <christophe.leroy@csgroup.eu>	[powerpc 8xx]
Reviewed-by: Lorenzo Stoakes <lstoakes@gmail.com>	[vmalloc change]
Cc: Alexandre Ghiti <alex@ghiti.fr>
Cc: Albert Ou <aou@eecs.berkeley.edu>
Cc: Alexander Gordeev <agordeev@linux.ibm.com>
Cc: Anshuman Khandual <anshuman.khandual@arm.com>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Axel Rasmussen <axelrasmussen@google.com>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Christian Borntraeger <borntraeger@linux.ibm.com>
Cc: Christoph Hellwig <hch@infradead.org>
Cc: David S. Miller <davem@davemloft.net>
Cc: Gerald Schaefer <gerald.schaefer@linux.ibm.com>
Cc: Heiko Carstens <hca@linux.ibm.com>
Cc: Helge Deller <deller@gmx.de>
Cc: "James E.J. Bottomley" <James.Bottomley@HansenPartnership.com>
Cc: Mike Kravetz <mike.kravetz@oracle.com>
Cc: Muchun Song <muchun.song@linux.dev>
Cc: Nicholas Piggin <npiggin@gmail.com>
Cc: Palmer Dabbelt <palmer@dabbelt.com>
Cc: Paul Walmsley <paul.walmsley@sifive.com>
Cc: Peter Xu <peterx@redhat.com>
Cc: Qi Zheng <zhengqi.arch@bytedance.com>
Cc: Ryan Roberts <ryan.roberts@arm.com>
Cc: SeongJae Park <sj@kernel.org>
Cc: Sven Schnelle <svens@linux.ibm.com>
Cc: Uladzislau Rezki (Sony) <urezki@gmail.com>
Cc: Vasily Gorbik <gor@linux.ibm.com>
Cc: Will Deacon <will@kernel.org>
Cc: <stable@vger.kernel.org>	[6.5+]
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 arch/arm64/include/asm/hugetlb.h                 |  2 +-
 arch/arm64/mm/hugetlbpage.c                      |  6 ++--
 arch/parisc/include/asm/hugetlb.h                |  2 +-
 arch/parisc/mm/hugetlbpage.c                     |  2 +-
 arch/powerpc/include/asm/nohash/32/hugetlb-8xx.h |  3 +-
 arch/powerpc/mm/book3s64/hugetlbpage.c           |  5 ++-
 arch/powerpc/mm/book3s64/radix_hugetlbpage.c     |  3 +-
 arch/powerpc/mm/nohash/8xx.c                     |  3 +-
 arch/powerpc/mm/pgtable.c                        |  3 +-
 arch/riscv/include/asm/hugetlb.h                 |  3 +-
 arch/riscv/mm/hugetlbpage.c                      |  3 +-
 arch/s390/include/asm/hugetlb.h                  |  6 ++--
 arch/s390/mm/hugetlbpage.c                       |  8 ++++-
 arch/sparc/include/asm/hugetlb.h                 |  6 ++--
 arch/sparc/mm/hugetlbpage.c                      |  8 ++++-
 include/asm-generic/hugetlb.h                    |  2 +-
 include/linux/hugetlb.h                          |  6 ++--
 mm/damon/vaddr.c                                 |  3 +-
 mm/hugetlb.c                                     | 43 +++++++++++++-----------
 mm/migrate.c                                     |  7 ++--
 mm/rmap.c                                        | 23 ++++++++++---
 mm/vmalloc.c                                     |  2 +-
 22 files changed, 100 insertions(+), 49 deletions(-)

(limited to 'include/linux')

diff --git a/arch/arm64/include/asm/hugetlb.h b/arch/arm64/include/asm/hugetlb.h
index f43a38ac1779..2ddc33d93b13 100644
--- a/arch/arm64/include/asm/hugetlb.h
+++ b/arch/arm64/include/asm/hugetlb.h
@@ -28,7 +28,7 @@ pte_t arch_make_huge_pte(pte_t entry, unsigned int shift, vm_flags_t flags);
 #define arch_make_huge_pte arch_make_huge_pte
 #define __HAVE_ARCH_HUGE_SET_HUGE_PTE_AT
 extern void set_huge_pte_at(struct mm_struct *mm, unsigned long addr,
-			    pte_t *ptep, pte_t pte);
+			    pte_t *ptep, pte_t pte, unsigned long sz);
 #define __HAVE_ARCH_HUGE_PTEP_SET_ACCESS_FLAGS
 extern int huge_ptep_set_access_flags(struct vm_area_struct *vma,
 				      unsigned long addr, pte_t *ptep,
diff --git a/arch/arm64/mm/hugetlbpage.c b/arch/arm64/mm/hugetlbpage.c
index 9c52718ea750..a7f8c8db3425 100644
--- a/arch/arm64/mm/hugetlbpage.c
+++ b/arch/arm64/mm/hugetlbpage.c
@@ -249,7 +249,7 @@ static inline struct folio *hugetlb_swap_entry_to_folio(swp_entry_t entry)
 }
 
 void set_huge_pte_at(struct mm_struct *mm, unsigned long addr,
-			    pte_t *ptep, pte_t pte)
+			    pte_t *ptep, pte_t pte, unsigned long sz)
 {
 	size_t pgsize;
 	int i;
@@ -571,5 +571,7 @@ pte_t huge_ptep_modify_prot_start(struct vm_area_struct *vma, unsigned long addr
 void huge_ptep_modify_prot_commit(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep,
 				  pte_t old_pte, pte_t pte)
 {
-	set_huge_pte_at(vma->vm_mm, addr, ptep, pte);
+	unsigned long psize = huge_page_size(hstate_vma(vma));
+
+	set_huge_pte_at(vma->vm_mm, addr, ptep, pte, psize);
 }
diff --git a/arch/parisc/include/asm/hugetlb.h b/arch/parisc/include/asm/hugetlb.h
index f7f078c2872c..72daacc472a0 100644
--- a/arch/parisc/include/asm/hugetlb.h
+++ b/arch/parisc/include/asm/hugetlb.h
@@ -6,7 +6,7 @@
 
 #define __HAVE_ARCH_HUGE_SET_HUGE_PTE_AT
 void set_huge_pte_at(struct mm_struct *mm, unsigned long addr,
-		     pte_t *ptep, pte_t pte);
+		     pte_t *ptep, pte_t pte, unsigned long sz);
 
 #define __HAVE_ARCH_HUGE_PTEP_GET_AND_CLEAR
 pte_t huge_ptep_get_and_clear(struct mm_struct *mm, unsigned long addr,
diff --git a/arch/parisc/mm/hugetlbpage.c b/arch/parisc/mm/hugetlbpage.c
index a8a1a7c1e16e..a9f7e21f6656 100644
--- a/arch/parisc/mm/hugetlbpage.c
+++ b/arch/parisc/mm/hugetlbpage.c
@@ -140,7 +140,7 @@ static void __set_huge_pte_at(struct mm_struct *mm, unsigned long addr,
 }
 
 void set_huge_pte_at(struct mm_struct *mm, unsigned long addr,
-		     pte_t *ptep, pte_t entry)
+		     pte_t *ptep, pte_t entry, unsigned long sz)
 {
 	__set_huge_pte_at(mm, addr, ptep, entry);
 }
diff --git a/arch/powerpc/include/asm/nohash/32/hugetlb-8xx.h b/arch/powerpc/include/asm/nohash/32/hugetlb-8xx.h
index de092b04ee1a..92df40c6cc6b 100644
--- a/arch/powerpc/include/asm/nohash/32/hugetlb-8xx.h
+++ b/arch/powerpc/include/asm/nohash/32/hugetlb-8xx.h
@@ -46,7 +46,8 @@ static inline int check_and_get_huge_psize(int shift)
 }
 
 #define __HAVE_ARCH_HUGE_SET_HUGE_PTE_AT
-void set_huge_pte_at(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pte);
+void set_huge_pte_at(struct mm_struct *mm, unsigned long addr, pte_t *ptep,
+		     pte_t pte, unsigned long sz);
 
 #define __HAVE_ARCH_HUGE_PTE_CLEAR
 static inline void huge_pte_clear(struct mm_struct *mm, unsigned long addr,
diff --git a/arch/powerpc/mm/book3s64/hugetlbpage.c b/arch/powerpc/mm/book3s64/hugetlbpage.c
index 3bc0eb21b2a0..5a2e512e96db 100644
--- a/arch/powerpc/mm/book3s64/hugetlbpage.c
+++ b/arch/powerpc/mm/book3s64/hugetlbpage.c
@@ -143,11 +143,14 @@ pte_t huge_ptep_modify_prot_start(struct vm_area_struct *vma,
 void huge_ptep_modify_prot_commit(struct vm_area_struct *vma, unsigned long addr,
 				  pte_t *ptep, pte_t old_pte, pte_t pte)
 {
+	unsigned long psize;
 
 	if (radix_enabled())
 		return radix__huge_ptep_modify_prot_commit(vma, addr, ptep,
 							   old_pte, pte);
-	set_huge_pte_at(vma->vm_mm, addr, ptep, pte);
+
+	psize = huge_page_size(hstate_vma(vma));
+	set_huge_pte_at(vma->vm_mm, addr, ptep, pte, psize);
 }
 
 void __init hugetlbpage_init_defaultsize(void)
diff --git a/arch/powerpc/mm/book3s64/radix_hugetlbpage.c b/arch/powerpc/mm/book3s64/radix_hugetlbpage.c
index 17075c78d4bc..35fd2a95be24 100644
--- a/arch/powerpc/mm/book3s64/radix_hugetlbpage.c
+++ b/arch/powerpc/mm/book3s64/radix_hugetlbpage.c
@@ -47,6 +47,7 @@ void radix__huge_ptep_modify_prot_commit(struct vm_area_struct *vma,
 					 pte_t old_pte, pte_t pte)
 {
 	struct mm_struct *mm = vma->vm_mm;
+	unsigned long psize = huge_page_size(hstate_vma(vma));
 
 	/*
 	 * POWER9 NMMU must flush the TLB after clearing the PTE before
@@ -58,5 +59,5 @@ void radix__huge_ptep_modify_prot_commit(struct vm_area_struct *vma,
 	    atomic_read(&mm->context.copros) > 0)
 		radix__flush_hugetlb_page(vma, addr);
 
-	set_huge_pte_at(vma->vm_mm, addr, ptep, pte);
+	set_huge_pte_at(vma->vm_mm, addr, ptep, pte, psize);
 }
diff --git a/arch/powerpc/mm/nohash/8xx.c b/arch/powerpc/mm/nohash/8xx.c
index dbbfe897455d..a642a7929892 100644
--- a/arch/powerpc/mm/nohash/8xx.c
+++ b/arch/powerpc/mm/nohash/8xx.c
@@ -91,7 +91,8 @@ static int __ref __early_map_kernel_hugepage(unsigned long va, phys_addr_t pa,
 	if (new && WARN_ON(pte_present(*ptep) && pgprot_val(prot)))
 		return -EINVAL;
 
-	set_huge_pte_at(&init_mm, va, ptep, pte_mkhuge(pfn_pte(pa >> PAGE_SHIFT, prot)));
+	set_huge_pte_at(&init_mm, va, ptep,
+			pte_mkhuge(pfn_pte(pa >> PAGE_SHIFT, prot)), psize);
 
 	return 0;
 }
diff --git a/arch/powerpc/mm/pgtable.c b/arch/powerpc/mm/pgtable.c
index 3f86fd217690..3ba9fe411604 100644
--- a/arch/powerpc/mm/pgtable.c
+++ b/arch/powerpc/mm/pgtable.c
@@ -288,7 +288,8 @@ int huge_ptep_set_access_flags(struct vm_area_struct *vma,
 }
 
 #if defined(CONFIG_PPC_8xx)
-void set_huge_pte_at(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pte)
+void set_huge_pte_at(struct mm_struct *mm, unsigned long addr, pte_t *ptep,
+		     pte_t pte, unsigned long sz)
 {
 	pmd_t *pmd = pmd_off(mm, addr);
 	pte_basic_t val;
diff --git a/arch/riscv/include/asm/hugetlb.h b/arch/riscv/include/asm/hugetlb.h
index 34e24f078cc1..4c5b0e929890 100644
--- a/arch/riscv/include/asm/hugetlb.h
+++ b/arch/riscv/include/asm/hugetlb.h
@@ -18,7 +18,8 @@ void huge_pte_clear(struct mm_struct *mm, unsigned long addr,
 
 #define __HAVE_ARCH_HUGE_SET_HUGE_PTE_AT
 void set_huge_pte_at(struct mm_struct *mm,
-		     unsigned long addr, pte_t *ptep, pte_t pte);
+		     unsigned long addr, pte_t *ptep, pte_t pte,
+		     unsigned long sz);
 
 #define __HAVE_ARCH_HUGE_PTEP_GET_AND_CLEAR
 pte_t huge_ptep_get_and_clear(struct mm_struct *mm,
diff --git a/arch/riscv/mm/hugetlbpage.c b/arch/riscv/mm/hugetlbpage.c
index 96225a8533ad..e4a2ace92dbe 100644
--- a/arch/riscv/mm/hugetlbpage.c
+++ b/arch/riscv/mm/hugetlbpage.c
@@ -180,7 +180,8 @@ pte_t arch_make_huge_pte(pte_t entry, unsigned int shift, vm_flags_t flags)
 void set_huge_pte_at(struct mm_struct *mm,
 		     unsigned long addr,
 		     pte_t *ptep,
-		     pte_t pte)
+		     pte_t pte,
+		     unsigned long sz)
 {
 	int i, pte_num;
 
diff --git a/arch/s390/include/asm/hugetlb.h b/arch/s390/include/asm/hugetlb.h
index f07267875a19..deb198a61039 100644
--- a/arch/s390/include/asm/hugetlb.h
+++ b/arch/s390/include/asm/hugetlb.h
@@ -16,6 +16,8 @@
 #define hugepages_supported()			(MACHINE_HAS_EDAT1)
 
 void set_huge_pte_at(struct mm_struct *mm, unsigned long addr,
+		     pte_t *ptep, pte_t pte, unsigned long sz);
+void __set_huge_pte_at(struct mm_struct *mm, unsigned long addr,
 		     pte_t *ptep, pte_t pte);
 pte_t huge_ptep_get(pte_t *ptep);
 pte_t huge_ptep_get_and_clear(struct mm_struct *mm,
@@ -65,7 +67,7 @@ static inline int huge_ptep_set_access_flags(struct vm_area_struct *vma,
 	int changed = !pte_same(huge_ptep_get(ptep), pte);
 	if (changed) {
 		huge_ptep_get_and_clear(vma->vm_mm, addr, ptep);
-		set_huge_pte_at(vma->vm_mm, addr, ptep, pte);
+		__set_huge_pte_at(vma->vm_mm, addr, ptep, pte);
 	}
 	return changed;
 }
@@ -74,7 +76,7 @@ static inline void huge_ptep_set_wrprotect(struct mm_struct *mm,
 					   unsigned long addr, pte_t *ptep)
 {
 	pte_t pte = huge_ptep_get_and_clear(mm, addr, ptep);
-	set_huge_pte_at(mm, addr, ptep, pte_wrprotect(pte));
+	__set_huge_pte_at(mm, addr, ptep, pte_wrprotect(pte));
 }
 
 static inline pte_t mk_huge_pte(struct page *page, pgprot_t pgprot)
diff --git a/arch/s390/mm/hugetlbpage.c b/arch/s390/mm/hugetlbpage.c
index c718f2a0de94..297a6d897d5a 100644
--- a/arch/s390/mm/hugetlbpage.c
+++ b/arch/s390/mm/hugetlbpage.c
@@ -142,7 +142,7 @@ static void clear_huge_pte_skeys(struct mm_struct *mm, unsigned long rste)
 		__storage_key_init_range(paddr, paddr + size - 1);
 }
 
-void set_huge_pte_at(struct mm_struct *mm, unsigned long addr,
+void __set_huge_pte_at(struct mm_struct *mm, unsigned long addr,
 		     pte_t *ptep, pte_t pte)
 {
 	unsigned long rste;
@@ -163,6 +163,12 @@ void set_huge_pte_at(struct mm_struct *mm, unsigned long addr,
 	set_pte(ptep, __pte(rste));
 }
 
+void set_huge_pte_at(struct mm_struct *mm, unsigned long addr,
+		     pte_t *ptep, pte_t pte, unsigned long sz)
+{
+	__set_huge_pte_at(mm, addr, ptep, pte);
+}
+
 pte_t huge_ptep_get(pte_t *ptep)
 {
 	return __rste_to_pte(pte_val(*ptep));
diff --git a/arch/sparc/include/asm/hugetlb.h b/arch/sparc/include/asm/hugetlb.h
index 0a26cca24232..c714ca6a05aa 100644
--- a/arch/sparc/include/asm/hugetlb.h
+++ b/arch/sparc/include/asm/hugetlb.h
@@ -14,6 +14,8 @@ extern struct pud_huge_patch_entry __pud_huge_patch, __pud_huge_patch_end;
 
 #define __HAVE_ARCH_HUGE_SET_HUGE_PTE_AT
 void set_huge_pte_at(struct mm_struct *mm, unsigned long addr,
+		     pte_t *ptep, pte_t pte, unsigned long sz);
+void __set_huge_pte_at(struct mm_struct *mm, unsigned long addr,
 		     pte_t *ptep, pte_t pte);
 
 #define __HAVE_ARCH_HUGE_PTEP_GET_AND_CLEAR
@@ -32,7 +34,7 @@ static inline void huge_ptep_set_wrprotect(struct mm_struct *mm,
 					   unsigned long addr, pte_t *ptep)
 {
 	pte_t old_pte = *ptep;
-	set_huge_pte_at(mm, addr, ptep, pte_wrprotect(old_pte));
+	__set_huge_pte_at(mm, addr, ptep, pte_wrprotect(old_pte));
 }
 
 #define __HAVE_ARCH_HUGE_PTEP_SET_ACCESS_FLAGS
@@ -42,7 +44,7 @@ static inline int huge_ptep_set_access_flags(struct vm_area_struct *vma,
 {
 	int changed = !pte_same(*ptep, pte);
 	if (changed) {
-		set_huge_pte_at(vma->vm_mm, addr, ptep, pte);
+		__set_huge_pte_at(vma->vm_mm, addr, ptep, pte);
 		flush_tlb_page(vma, addr);
 	}
 	return changed;
diff --git a/arch/sparc/mm/hugetlbpage.c b/arch/sparc/mm/hugetlbpage.c
index d7018823206c..b432500c13a5 100644
--- a/arch/sparc/mm/hugetlbpage.c
+++ b/arch/sparc/mm/hugetlbpage.c
@@ -328,7 +328,7 @@ pte_t *huge_pte_offset(struct mm_struct *mm,
 	return pte_offset_huge(pmd, addr);
 }
 
-void set_huge_pte_at(struct mm_struct *mm, unsigned long addr,
+void __set_huge_pte_at(struct mm_struct *mm, unsigned long addr,
 		     pte_t *ptep, pte_t entry)
 {
 	unsigned int nptes, orig_shift, shift;
@@ -364,6 +364,12 @@ void set_huge_pte_at(struct mm_struct *mm, unsigned long addr,
 				    orig_shift);
 }
 
+void set_huge_pte_at(struct mm_struct *mm, unsigned long addr,
+		     pte_t *ptep, pte_t entry, unsigned long sz)
+{
+	__set_huge_pte_at(mm, addr, ptep, entry);
+}
+
 pte_t huge_ptep_get_and_clear(struct mm_struct *mm, unsigned long addr,
 			      pte_t *ptep)
 {
diff --git a/include/asm-generic/hugetlb.h b/include/asm-generic/hugetlb.h
index 4da02798a00b..6dcf4d576970 100644
--- a/include/asm-generic/hugetlb.h
+++ b/include/asm-generic/hugetlb.h
@@ -76,7 +76,7 @@ static inline void hugetlb_free_pgd_range(struct mmu_gather *tlb,
 
 #ifndef __HAVE_ARCH_HUGE_SET_HUGE_PTE_AT
 static inline void set_huge_pte_at(struct mm_struct *mm, unsigned long addr,
-		pte_t *ptep, pte_t pte)
+		pte_t *ptep, pte_t pte, unsigned long sz)
 {
 	set_pte_at(mm, addr, ptep, pte);
 }
diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index 5b2626063f4f..a30686e649f7 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -984,7 +984,9 @@ static inline void huge_ptep_modify_prot_commit(struct vm_area_struct *vma,
 						unsigned long addr, pte_t *ptep,
 						pte_t old_pte, pte_t pte)
 {
-	set_huge_pte_at(vma->vm_mm, addr, ptep, pte);
+	unsigned long psize = huge_page_size(hstate_vma(vma));
+
+	set_huge_pte_at(vma->vm_mm, addr, ptep, pte, psize);
 }
 #endif
 
@@ -1173,7 +1175,7 @@ static inline pte_t huge_ptep_clear_flush(struct vm_area_struct *vma,
 }
 
 static inline void set_huge_pte_at(struct mm_struct *mm, unsigned long addr,
-				   pte_t *ptep, pte_t pte)
+				   pte_t *ptep, pte_t pte, unsigned long sz)
 {
 }
 
diff --git a/mm/damon/vaddr.c b/mm/damon/vaddr.c
index 4c81a9dbd044..cf8a9fc5c9d1 100644
--- a/mm/damon/vaddr.c
+++ b/mm/damon/vaddr.c
@@ -341,13 +341,14 @@ static void damon_hugetlb_mkold(pte_t *pte, struct mm_struct *mm,
 	bool referenced = false;
 	pte_t entry = huge_ptep_get(pte);
 	struct folio *folio = pfn_folio(pte_pfn(entry));
+	unsigned long psize = huge_page_size(hstate_vma(vma));
 
 	folio_get(folio);
 
 	if (pte_young(entry)) {
 		referenced = true;
 		entry = pte_mkold(entry);
-		set_huge_pte_at(mm, addr, pte, entry);
+		set_huge_pte_at(mm, addr, pte, entry, psize);
 	}
 
 #ifdef CONFIG_MMU_NOTIFIER
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index ba6d39b71cb1..52d26072dfda 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -4980,7 +4980,7 @@ static bool is_hugetlb_entry_hwpoisoned(pte_t pte)
 
 static void
 hugetlb_install_folio(struct vm_area_struct *vma, pte_t *ptep, unsigned long addr,
-		      struct folio *new_folio, pte_t old)
+		      struct folio *new_folio, pte_t old, unsigned long sz)
 {
 	pte_t newpte = make_huge_pte(vma, &new_folio->page, 1);
 
@@ -4988,7 +4988,7 @@ hugetlb_install_folio(struct vm_area_struct *vma, pte_t *ptep, unsigned long add
 	hugepage_add_new_anon_rmap(new_folio, vma, addr);
 	if (userfaultfd_wp(vma) && huge_pte_uffd_wp(old))
 		newpte = huge_pte_mkuffd_wp(newpte);
-	set_huge_pte_at(vma->vm_mm, addr, ptep, newpte);
+	set_huge_pte_at(vma->vm_mm, addr, ptep, newpte, sz);
 	hugetlb_count_add(pages_per_huge_page(hstate_vma(vma)), vma->vm_mm);
 	folio_set_hugetlb_migratable(new_folio);
 }
@@ -5065,7 +5065,7 @@ again:
 		} else if (unlikely(is_hugetlb_entry_hwpoisoned(entry))) {
 			if (!userfaultfd_wp(dst_vma))
 				entry = huge_pte_clear_uffd_wp(entry);
-			set_huge_pte_at(dst, addr, dst_pte, entry);
+			set_huge_pte_at(dst, addr, dst_pte, entry, sz);
 		} else if (unlikely(is_hugetlb_entry_migration(entry))) {
 			swp_entry_t swp_entry = pte_to_swp_entry(entry);
 			bool uffd_wp = pte_swp_uffd_wp(entry);
@@ -5080,18 +5080,18 @@ again:
 				entry = swp_entry_to_pte(swp_entry);
 				if (userfaultfd_wp(src_vma) && uffd_wp)
 					entry = pte_swp_mkuffd_wp(entry);
-				set_huge_pte_at(src, addr, src_pte, entry);
+				set_huge_pte_at(src, addr, src_pte, entry, sz);
 			}
 			if (!userfaultfd_wp(dst_vma))
 				entry = huge_pte_clear_uffd_wp(entry);
-			set_huge_pte_at(dst, addr, dst_pte, entry);
+			set_huge_pte_at(dst, addr, dst_pte, entry, sz);
 		} else if (unlikely(is_pte_marker(entry))) {
 			pte_marker marker = copy_pte_marker(
 				pte_to_swp_entry(entry), dst_vma);
 
 			if (marker)
 				set_huge_pte_at(dst, addr, dst_pte,
-						make_pte_marker(marker));
+						make_pte_marker(marker), sz);
 		} else {
 			entry = huge_ptep_get(src_pte);
 			pte_folio = page_folio(pte_page(entry));
@@ -5145,7 +5145,7 @@ again:
 					goto again;
 				}
 				hugetlb_install_folio(dst_vma, dst_pte, addr,
-						      new_folio, src_pte_old);
+						      new_folio, src_pte_old, sz);
 				spin_unlock(src_ptl);
 				spin_unlock(dst_ptl);
 				continue;
@@ -5166,7 +5166,7 @@ again:
 			if (!userfaultfd_wp(dst_vma))
 				entry = huge_pte_clear_uffd_wp(entry);
 
-			set_huge_pte_at(dst, addr, dst_pte, entry);
+			set_huge_pte_at(dst, addr, dst_pte, entry, sz);
 			hugetlb_count_add(npages, dst);
 		}
 		spin_unlock(src_ptl);
@@ -5184,7 +5184,8 @@ again:
 }
 
 static void move_huge_pte(struct vm_area_struct *vma, unsigned long old_addr,
-			  unsigned long new_addr, pte_t *src_pte, pte_t *dst_pte)
+			  unsigned long new_addr, pte_t *src_pte, pte_t *dst_pte,
+			  unsigned long sz)
 {
 	struct hstate *h = hstate_vma(vma);
 	struct mm_struct *mm = vma->vm_mm;
@@ -5202,7 +5203,7 @@ static void move_huge_pte(struct vm_area_struct *vma, unsigned long old_addr,
 		spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING);
 
 	pte = huge_ptep_get_and_clear(mm, old_addr, src_pte);
-	set_huge_pte_at(mm, new_addr, dst_pte, pte);
+	set_huge_pte_at(mm, new_addr, dst_pte, pte, sz);
 
 	if (src_ptl != dst_ptl)
 		spin_unlock(src_ptl);
@@ -5259,7 +5260,7 @@ int move_hugetlb_page_tables(struct vm_area_struct *vma,
 		if (!dst_pte)
 			break;
 
-		move_huge_pte(vma, old_addr, new_addr, src_pte, dst_pte);
+		move_huge_pte(vma, old_addr, new_addr, src_pte, dst_pte, sz);
 	}
 
 	if (shared_pmd)
@@ -5337,7 +5338,8 @@ static void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct
 			if (pte_swp_uffd_wp_any(pte) &&
 			    !(zap_flags & ZAP_FLAG_DROP_MARKER))
 				set_huge_pte_at(mm, address, ptep,
-						make_pte_marker(PTE_MARKER_UFFD_WP));
+						make_pte_marker(PTE_MARKER_UFFD_WP),
+						sz);
 			else
 				huge_pte_clear(mm, address, ptep, sz);
 			spin_unlock(ptl);
@@ -5371,7 +5373,8 @@ static void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct
 		if (huge_pte_uffd_wp(pte) &&
 		    !(zap_flags & ZAP_FLAG_DROP_MARKER))
 			set_huge_pte_at(mm, address, ptep,
-					make_pte_marker(PTE_MARKER_UFFD_WP));
+					make_pte_marker(PTE_MARKER_UFFD_WP),
+					sz);
 		hugetlb_count_sub(pages_per_huge_page(h), mm);
 		page_remove_rmap(page, vma, true);
 
@@ -5676,7 +5679,7 @@ retry_avoidcopy:
 		hugepage_add_new_anon_rmap(new_folio, vma, haddr);
 		if (huge_pte_uffd_wp(pte))
 			newpte = huge_pte_mkuffd_wp(newpte);
-		set_huge_pte_at(mm, haddr, ptep, newpte);
+		set_huge_pte_at(mm, haddr, ptep, newpte, huge_page_size(h));
 		folio_set_hugetlb_migratable(new_folio);
 		/* Make the old page be freed below */
 		new_folio = old_folio;
@@ -5972,7 +5975,7 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm,
 	 */
 	if (unlikely(pte_marker_uffd_wp(old_pte)))
 		new_pte = huge_pte_mkuffd_wp(new_pte);
-	set_huge_pte_at(mm, haddr, ptep, new_pte);
+	set_huge_pte_at(mm, haddr, ptep, new_pte, huge_page_size(h));
 
 	hugetlb_count_add(pages_per_huge_page(h), mm);
 	if ((flags & FAULT_FLAG_WRITE) && !(vma->vm_flags & VM_SHARED)) {
@@ -6261,7 +6264,8 @@ int hugetlb_mfill_atomic_pte(pte_t *dst_pte,
 		}
 
 		_dst_pte = make_pte_marker(PTE_MARKER_POISONED);
-		set_huge_pte_at(dst_mm, dst_addr, dst_pte, _dst_pte);
+		set_huge_pte_at(dst_mm, dst_addr, dst_pte, _dst_pte,
+				huge_page_size(h));
 
 		/* No need to invalidate - it was non-present before */
 		update_mmu_cache(dst_vma, dst_addr, dst_pte);
@@ -6412,7 +6416,7 @@ int hugetlb_mfill_atomic_pte(pte_t *dst_pte,
 	if (wp_enabled)
 		_dst_pte = huge_pte_mkuffd_wp(_dst_pte);
 
-	set_huge_pte_at(dst_mm, dst_addr, dst_pte, _dst_pte);
+	set_huge_pte_at(dst_mm, dst_addr, dst_pte, _dst_pte, huge_page_size(h));
 
 	hugetlb_count_add(pages_per_huge_page(h), dst_mm);
 
@@ -6598,7 +6602,7 @@ long hugetlb_change_protection(struct vm_area_struct *vma,
 			else if (uffd_wp_resolve)
 				newpte = pte_swp_clear_uffd_wp(newpte);
 			if (!pte_same(pte, newpte))
-				set_huge_pte_at(mm, address, ptep, newpte);
+				set_huge_pte_at(mm, address, ptep, newpte, psize);
 		} else if (unlikely(is_pte_marker(pte))) {
 			/* No other markers apply for now. */
 			WARN_ON_ONCE(!pte_marker_uffd_wp(pte));
@@ -6623,7 +6627,8 @@ long hugetlb_change_protection(struct vm_area_struct *vma,
 			if (unlikely(uffd_wp))
 				/* Safe to modify directly (none->non-present). */
 				set_huge_pte_at(mm, address, ptep,
-						make_pte_marker(PTE_MARKER_UFFD_WP));
+						make_pte_marker(PTE_MARKER_UFFD_WP),
+						psize);
 		}
 		spin_unlock(ptl);
 	}
diff --git a/mm/migrate.c b/mm/migrate.c
index b7fa020003f3..2053b54556ca 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -243,7 +243,9 @@ static bool remove_migration_pte(struct folio *folio,
 
 #ifdef CONFIG_HUGETLB_PAGE
 		if (folio_test_hugetlb(folio)) {
-			unsigned int shift = huge_page_shift(hstate_vma(vma));
+			struct hstate *h = hstate_vma(vma);
+			unsigned int shift = huge_page_shift(h);
+			unsigned long psize = huge_page_size(h);
 
 			pte = arch_make_huge_pte(pte, shift, vma->vm_flags);
 			if (folio_test_anon(folio))
@@ -251,7 +253,8 @@ static bool remove_migration_pte(struct folio *folio,
 						       rmap_flags);
 			else
 				page_dup_file_rmap(new, true);
-			set_huge_pte_at(vma->vm_mm, pvmw.address, pvmw.pte, pte);
+			set_huge_pte_at(vma->vm_mm, pvmw.address, pvmw.pte, pte,
+					psize);
 		} else
 #endif
 		{
diff --git a/mm/rmap.c b/mm/rmap.c
index ec7f8e6c9e48..9f795b93cf40 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -1480,6 +1480,7 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma,
 	struct mmu_notifier_range range;
 	enum ttu_flags flags = (enum ttu_flags)(long)arg;
 	unsigned long pfn;
+	unsigned long hsz = 0;
 
 	/*
 	 * When racing against e.g. zap_pte_range() on another cpu,
@@ -1511,6 +1512,9 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma,
 		 */
 		adjust_range_if_pmd_sharing_possible(vma, &range.start,
 						     &range.end);
+
+		/* We need the huge page size for set_huge_pte_at() */
+		hsz = huge_page_size(hstate_vma(vma));
 	}
 	mmu_notifier_invalidate_range_start(&range);
 
@@ -1628,7 +1632,8 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma,
 			pteval = swp_entry_to_pte(make_hwpoison_entry(subpage));
 			if (folio_test_hugetlb(folio)) {
 				hugetlb_count_sub(folio_nr_pages(folio), mm);
-				set_huge_pte_at(mm, address, pvmw.pte, pteval);
+				set_huge_pte_at(mm, address, pvmw.pte, pteval,
+						hsz);
 			} else {
 				dec_mm_counter(mm, mm_counter(&folio->page));
 				set_pte_at(mm, address, pvmw.pte, pteval);
@@ -1820,6 +1825,7 @@ static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma,
 	struct mmu_notifier_range range;
 	enum ttu_flags flags = (enum ttu_flags)(long)arg;
 	unsigned long pfn;
+	unsigned long hsz = 0;
 
 	/*
 	 * When racing against e.g. zap_pte_range() on another cpu,
@@ -1855,6 +1861,9 @@ static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma,
 		 */
 		adjust_range_if_pmd_sharing_possible(vma, &range.start,
 						     &range.end);
+
+		/* We need the huge page size for set_huge_pte_at() */
+		hsz = huge_page_size(hstate_vma(vma));
 	}
 	mmu_notifier_invalidate_range_start(&range);
 
@@ -2020,7 +2029,8 @@ static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma,
 			pteval = swp_entry_to_pte(make_hwpoison_entry(subpage));
 			if (folio_test_hugetlb(folio)) {
 				hugetlb_count_sub(folio_nr_pages(folio), mm);
-				set_huge_pte_at(mm, address, pvmw.pte, pteval);
+				set_huge_pte_at(mm, address, pvmw.pte, pteval,
+						hsz);
 			} else {
 				dec_mm_counter(mm, mm_counter(&folio->page));
 				set_pte_at(mm, address, pvmw.pte, pteval);
@@ -2044,7 +2054,8 @@ static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma,
 
 			if (arch_unmap_one(mm, vma, address, pteval) < 0) {
 				if (folio_test_hugetlb(folio))
-					set_huge_pte_at(mm, address, pvmw.pte, pteval);
+					set_huge_pte_at(mm, address, pvmw.pte,
+							pteval, hsz);
 				else
 					set_pte_at(mm, address, pvmw.pte, pteval);
 				ret = false;
@@ -2058,7 +2069,8 @@ static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma,
 			if (anon_exclusive &&
 			    page_try_share_anon_rmap(subpage)) {
 				if (folio_test_hugetlb(folio))
-					set_huge_pte_at(mm, address, pvmw.pte, pteval);
+					set_huge_pte_at(mm, address, pvmw.pte,
+							pteval, hsz);
 				else
 					set_pte_at(mm, address, pvmw.pte, pteval);
 				ret = false;
@@ -2090,7 +2102,8 @@ static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma,
 			if (pte_uffd_wp(pteval))
 				swp_pte = pte_swp_mkuffd_wp(swp_pte);
 			if (folio_test_hugetlb(folio))
-				set_huge_pte_at(mm, address, pvmw.pte, swp_pte);
+				set_huge_pte_at(mm, address, pvmw.pte, swp_pte,
+						hsz);
 			else
 				set_pte_at(mm, address, pvmw.pte, swp_pte);
 			trace_set_migration_pte(address, pte_val(swp_pte),
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index ef8599d394fd..a3fedb3ee0db 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -111,7 +111,7 @@ static int vmap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
 			pte_t entry = pfn_pte(pfn, prot);
 
 			entry = arch_make_huge_pte(entry, ilog2(size), 0);
-			set_huge_pte_at(&init_mm, addr, pte, entry);
+			set_huge_pte_at(&init_mm, addr, pte, entry, size);
 			pfn += PFN_DOWN(size);
 			continue;
 		}
-- 
cgit v1.2.3


From 348cbf987ed328682af7d74ce98d9f0e6857f42f Mon Sep 17 00:00:00 2001
From: Kees Cook <keescook@chromium.org>
Date: Sat, 30 Sep 2023 09:20:20 -0700
Subject: Input: mt - annotate struct input_mt with __counted_by

Prepare for the coming implementation by GCC and Clang of the __counted_by
attribute. Flexible array members annotated with __counted_by can have
their accesses bounds-checked at run-time checking via CONFIG_UBSAN_BOUNDS
(for array indexing) and CONFIG_FORTIFY_SOURCE (for strcpy/memcpy-family
functions).

As found with Coccinelle[1], add __counted_by for struct input_mt.

[1] https://github.com/kees/kernel-tools/blob/trunk/coccinelle/examples/counted_by.cocci

Signed-off-by: Kees Cook <keescook@chromium.org>
Reviewed-by: Gustavo A. R. Silva <gustavoars@kernel.org>
Link: https://lore.kernel.org/r/20230922175036.work.762-kees@kernel.org
Signed-off-by: Dmitry Torokhov <dmitry.torokhov@gmail.com>
---
 include/linux/input/mt.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/input/mt.h b/include/linux/input/mt.h
index 3b8580bd33c1..2cf89a538b18 100644
--- a/include/linux/input/mt.h
+++ b/include/linux/input/mt.h
@@ -47,7 +47,7 @@ struct input_mt {
 	unsigned int flags;
 	unsigned int frame;
 	int *red;
-	struct input_mt_slot slots[];
+	struct input_mt_slot slots[] __counted_by(num_slots);
 };
 
 static inline void input_mt_set_value(struct input_mt_slot *slot,
-- 
cgit v1.2.3


From fce9c967820a72f600abbf061d7077861685a14d Mon Sep 17 00:00:00 2001
From: Ingo Rohloff <lundril@gmx.de>
Date: Sat, 26 Aug 2023 22:02:41 +0200
Subject: wifi: mt76: mt7921e: Support MT7992 IP in Xiaomi Redmibook 15 Pro
 (2023)

In the Xiaomi Redmibook 15 Pro (2023) laptop I have got, a wifi chip is
used, which according to its PCI Vendor ID is from "ITTIM Technology".

This chip works flawlessly with the mt7921e module.  The driver doesn't
bind to this PCI device, because the Vendor ID from "ITTIM Technology" is
not recognized.

This patch adds the PCI Vendor ID from "ITTIM Technology" to the list of
PCI Vendor IDs and lets the mt7921e driver bind to the mentioned wifi
chip.

Signed-off-by: Ingo Rohloff <lundril@gmx.de>
Signed-off-by: Felix Fietkau <nbd@nbd.name>
---
 drivers/net/wireless/mediatek/mt76/mt7921/pci.c | 2 ++
 include/linux/pci_ids.h                         | 2 ++
 2 files changed, 4 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/net/wireless/mediatek/mt76/mt7921/pci.c b/drivers/net/wireless/mediatek/mt76/mt7921/pci.c
index 3dda84a93717..f04e7095e181 100644
--- a/drivers/net/wireless/mediatek/mt76/mt7921/pci.c
+++ b/drivers/net/wireless/mediatek/mt76/mt7921/pci.c
@@ -17,6 +17,8 @@ static const struct pci_device_id mt7921_pci_device_table[] = {
 		.driver_data = (kernel_ulong_t)MT7921_FIRMWARE_WM },
 	{ PCI_DEVICE(PCI_VENDOR_ID_MEDIATEK, 0x7922),
 		.driver_data = (kernel_ulong_t)MT7922_FIRMWARE_WM },
+	{ PCI_DEVICE(PCI_VENDOR_ID_ITTIM, 0x7922),
+		.driver_data = (kernel_ulong_t)MT7922_FIRMWARE_WM },
 	{ PCI_DEVICE(PCI_VENDOR_ID_MEDIATEK, 0x0608),
 		.driver_data = (kernel_ulong_t)MT7921_FIRMWARE_WM },
 	{ PCI_DEVICE(PCI_VENDOR_ID_MEDIATEK, 0x0616),
diff --git a/include/linux/pci_ids.h b/include/linux/pci_ids.h
index 2dc75df1437f..6ae1803bcd2f 100644
--- a/include/linux/pci_ids.h
+++ b/include/linux/pci_ids.h
@@ -180,6 +180,8 @@
 #define PCI_DEVICE_ID_BERKOM_A4T		0xffa4
 #define PCI_DEVICE_ID_BERKOM_SCITEL_QUADRO	0xffa8
 
+#define PCI_VENDOR_ID_ITTIM		0x0b48
+
 #define PCI_VENDOR_ID_COMPAQ		0x0e11
 #define PCI_DEVICE_ID_COMPAQ_TOKENRING	0x0508
 #define PCI_DEVICE_ID_COMPAQ_TACHYON	0xa0fc
-- 
cgit v1.2.3


From 0b068c714ca9479d2783cc333fff5bc2d4a6d45c Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Thu, 21 Sep 2023 08:52:16 +0000
Subject: net: add DEV_STATS_READ() helper

Companion of DEV_STATS_INC() & DEV_STATS_ADD().

This is going to be used in the series.

Use it in macsec_get_stats64().

Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/macsec.c      | 6 +++---
 include/linux/netdevice.h | 1 +
 2 files changed, 4 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/macsec.c b/drivers/net/macsec.c
index b7e151439c48..7a44e1cbe305 100644
--- a/drivers/net/macsec.c
+++ b/drivers/net/macsec.c
@@ -3655,9 +3655,9 @@ static void macsec_get_stats64(struct net_device *dev,
 
 	dev_fetch_sw_netstats(s, dev->tstats);
 
-	s->rx_dropped = atomic_long_read(&dev->stats.__rx_dropped);
-	s->tx_dropped = atomic_long_read(&dev->stats.__tx_dropped);
-	s->rx_errors = atomic_long_read(&dev->stats.__rx_errors);
+	s->rx_dropped = DEV_STATS_READ(dev, rx_dropped);
+	s->tx_dropped = DEV_STATS_READ(dev, tx_dropped);
+	s->rx_errors = DEV_STATS_READ(dev, rx_errors);
 }
 
 static int macsec_get_iflink(const struct net_device *dev)
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 7e520c14eb8c..e070a4540fba 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -5236,5 +5236,6 @@ extern struct net_device *blackhole_netdev;
 #define DEV_STATS_INC(DEV, FIELD) atomic_long_inc(&(DEV)->stats.__##FIELD)
 #define DEV_STATS_ADD(DEV, FIELD, VAL) 	\
 		atomic_long_add((VAL), &(DEV)->stats.__##FIELD)
+#define DEV_STATS_READ(DEV, FIELD) atomic_long_read(&(DEV)->stats.__##FIELD)
 
 #endif	/* _LINUX_NETDEVICE_H */
-- 
cgit v1.2.3


From fa4c4507099f781ca89a748c480af9cf97629726 Mon Sep 17 00:00:00 2001
From: Niklas Schnelle <schnelle@linux.ibm.com>
Date: Thu, 28 Sep 2023 16:31:35 +0200
Subject: iommu: Allow .iotlb_sync_map to fail and handle s390's -ENOMEM return

On s390 when using a paging hypervisor, .iotlb_sync_map is used to sync
mappings by letting the hypervisor inspect the synced IOVA range and
updating a shadow table. This however means that .iotlb_sync_map can
fail as the hypervisor may run out of resources while doing the sync.
This can be due to the hypervisor being unable to pin guest pages, due
to a limit on mapped addresses such as vfio_iommu_type1.dma_entry_limit
or lack of other resources. Either way such a failure to sync a mapping
should result in a DMA_MAPPING_ERROR.

Now especially when running with batched IOTLB flushes for unmap it may
be that some IOVAs have already been invalidated but not yet synced via
.iotlb_sync_map. Thus if the hypervisor indicates running out of
resources, first do a global flush allowing the hypervisor to free
resources associated with these mappings as well a retry creating the
new mappings and only if that also fails report this error to callers.

Reviewed-by: Lu Baolu <baolu.lu@linux.intel.com>
Reviewed-by: Matthew Rosato <mjrosato@linux.ibm.com>
Acked-by: Jernej Skrabec <jernej.skrabec@gmail.com> # sun50i
Signed-off-by: Niklas Schnelle <schnelle@linux.ibm.com>
Link: https://lore.kernel.org/r/20230928-dma_iommu-v13-1-9e5fc4dacc36@linux.ibm.com
Signed-off-by: Joerg Roedel <jroedel@suse.de>
---
 drivers/iommu/amd/iommu.c    |  5 +++--
 drivers/iommu/apple-dart.c   |  5 +++--
 drivers/iommu/intel/iommu.c  |  5 +++--
 drivers/iommu/iommu.c        | 20 ++++++++++++++++----
 drivers/iommu/msm_iommu.c    |  5 +++--
 drivers/iommu/mtk_iommu.c    |  5 +++--
 drivers/iommu/s390-iommu.c   | 29 +++++++++++++++++++++++------
 drivers/iommu/sprd-iommu.c   |  5 +++--
 drivers/iommu/sun50i-iommu.c |  6 ++++--
 include/linux/iommu.h        |  4 ++--
 10 files changed, 63 insertions(+), 26 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/iommu/amd/iommu.c b/drivers/iommu/amd/iommu.c
index 95bd7c25ba6f..74229b1607a1 100644
--- a/drivers/iommu/amd/iommu.c
+++ b/drivers/iommu/amd/iommu.c
@@ -2233,14 +2233,15 @@ static int amd_iommu_attach_device(struct iommu_domain *dom,
 	return ret;
 }
 
-static void amd_iommu_iotlb_sync_map(struct iommu_domain *dom,
-				     unsigned long iova, size_t size)
+static int amd_iommu_iotlb_sync_map(struct iommu_domain *dom,
+				    unsigned long iova, size_t size)
 {
 	struct protection_domain *domain = to_pdomain(dom);
 	struct io_pgtable_ops *ops = &domain->iop.iop.ops;
 
 	if (ops->map_pages)
 		domain_flush_np_cache(domain, iova, size);
+	return 0;
 }
 
 static int amd_iommu_map_pages(struct iommu_domain *dom, unsigned long iova,
diff --git a/drivers/iommu/apple-dart.c b/drivers/iommu/apple-dart.c
index 2082081402d3..22880a42ccaa 100644
--- a/drivers/iommu/apple-dart.c
+++ b/drivers/iommu/apple-dart.c
@@ -506,10 +506,11 @@ static void apple_dart_iotlb_sync(struct iommu_domain *domain,
 	apple_dart_domain_flush_tlb(to_dart_domain(domain));
 }
 
-static void apple_dart_iotlb_sync_map(struct iommu_domain *domain,
-				      unsigned long iova, size_t size)
+static int apple_dart_iotlb_sync_map(struct iommu_domain *domain,
+				     unsigned long iova, size_t size)
 {
 	apple_dart_domain_flush_tlb(to_dart_domain(domain));
+	return 0;
 }
 
 static phys_addr_t apple_dart_iova_to_phys(struct iommu_domain *domain,
diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c
index 5db283c17e0d..477937513502 100644
--- a/drivers/iommu/intel/iommu.c
+++ b/drivers/iommu/intel/iommu.c
@@ -4678,8 +4678,8 @@ static bool risky_device(struct pci_dev *pdev)
 	return false;
 }
 
-static void intel_iommu_iotlb_sync_map(struct iommu_domain *domain,
-				       unsigned long iova, size_t size)
+static int intel_iommu_iotlb_sync_map(struct iommu_domain *domain,
+				      unsigned long iova, size_t size)
 {
 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
 	unsigned long pages = aligned_nrpages(iova, size);
@@ -4689,6 +4689,7 @@ static void intel_iommu_iotlb_sync_map(struct iommu_domain *domain,
 
 	xa_for_each(&dmar_domain->iommu_array, i, info)
 		__mapping_notify_one(info->iommu, dmar_domain, pfn, pages);
+	return 0;
 }
 
 static void intel_iommu_remove_dev_pasid(struct device *dev, ioasid_t pasid)
diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c
index 1ecac2b5c54f..1cc9d139c860 100644
--- a/drivers/iommu/iommu.c
+++ b/drivers/iommu/iommu.c
@@ -2585,8 +2585,17 @@ int iommu_map(struct iommu_domain *domain, unsigned long iova,
 		return -EINVAL;
 
 	ret = __iommu_map(domain, iova, paddr, size, prot, gfp);
-	if (ret == 0 && ops->iotlb_sync_map)
-		ops->iotlb_sync_map(domain, iova, size);
+	if (ret == 0 && ops->iotlb_sync_map) {
+		ret = ops->iotlb_sync_map(domain, iova, size);
+		if (ret)
+			goto out_err;
+	}
+
+	return ret;
+
+out_err:
+	/* undo mappings already done */
+	iommu_unmap(domain, iova, size);
 
 	return ret;
 }
@@ -2714,8 +2723,11 @@ next:
 			sg = sg_next(sg);
 	}
 
-	if (ops->iotlb_sync_map)
-		ops->iotlb_sync_map(domain, iova, mapped);
+	if (ops->iotlb_sync_map) {
+		ret = ops->iotlb_sync_map(domain, iova, mapped);
+		if (ret)
+			goto out_err;
+	}
 	return mapped;
 
 out_err:
diff --git a/drivers/iommu/msm_iommu.c b/drivers/iommu/msm_iommu.c
index a163cee0b724..f86af9815d6f 100644
--- a/drivers/iommu/msm_iommu.c
+++ b/drivers/iommu/msm_iommu.c
@@ -498,12 +498,13 @@ static int msm_iommu_map(struct iommu_domain *domain, unsigned long iova,
 	return ret;
 }
 
-static void msm_iommu_sync_map(struct iommu_domain *domain, unsigned long iova,
-			       size_t size)
+static int msm_iommu_sync_map(struct iommu_domain *domain, unsigned long iova,
+			      size_t size)
 {
 	struct msm_priv *priv = to_msm_priv(domain);
 
 	__flush_iotlb_range(iova, size, SZ_4K, false, priv);
+	return 0;
 }
 
 static size_t msm_iommu_unmap(struct iommu_domain *domain, unsigned long iova,
diff --git a/drivers/iommu/mtk_iommu.c b/drivers/iommu/mtk_iommu.c
index 19ef50221c93..2d8e95246781 100644
--- a/drivers/iommu/mtk_iommu.c
+++ b/drivers/iommu/mtk_iommu.c
@@ -837,12 +837,13 @@ static void mtk_iommu_iotlb_sync(struct iommu_domain *domain,
 	mtk_iommu_tlb_flush_range_sync(gather->start, length, dom->bank);
 }
 
-static void mtk_iommu_sync_map(struct iommu_domain *domain, unsigned long iova,
-			       size_t size)
+static int mtk_iommu_sync_map(struct iommu_domain *domain, unsigned long iova,
+			      size_t size)
 {
 	struct mtk_iommu_domain *dom = to_mtk_domain(domain);
 
 	mtk_iommu_tlb_flush_range_sync(iova, size, dom->bank);
+	return 0;
 }
 
 static phys_addr_t mtk_iommu_iova_to_phys(struct iommu_domain *domain,
diff --git a/drivers/iommu/s390-iommu.c b/drivers/iommu/s390-iommu.c
index 5695ad71d60e..560d0957f9be 100644
--- a/drivers/iommu/s390-iommu.c
+++ b/drivers/iommu/s390-iommu.c
@@ -219,6 +219,12 @@ static void s390_iommu_release_device(struct device *dev)
 		__s390_iommu_detach_device(zdev);
 }
 
+static int zpci_refresh_all(struct zpci_dev *zdev)
+{
+	return zpci_refresh_trans((u64)zdev->fh << 32, zdev->start_dma,
+				  zdev->end_dma - zdev->start_dma + 1);
+}
+
 static void s390_iommu_flush_iotlb_all(struct iommu_domain *domain)
 {
 	struct s390_domain *s390_domain = to_s390_domain(domain);
@@ -226,8 +232,7 @@ static void s390_iommu_flush_iotlb_all(struct iommu_domain *domain)
 
 	rcu_read_lock();
 	list_for_each_entry_rcu(zdev, &s390_domain->devices, iommu_list) {
-		zpci_refresh_trans((u64)zdev->fh << 32, zdev->start_dma,
-				   zdev->end_dma - zdev->start_dma + 1);
+		zpci_refresh_all(zdev);
 	}
 	rcu_read_unlock();
 }
@@ -251,20 +256,32 @@ static void s390_iommu_iotlb_sync(struct iommu_domain *domain,
 	rcu_read_unlock();
 }
 
-static void s390_iommu_iotlb_sync_map(struct iommu_domain *domain,
-				      unsigned long iova, size_t size)
+static int s390_iommu_iotlb_sync_map(struct iommu_domain *domain,
+				     unsigned long iova, size_t size)
 {
 	struct s390_domain *s390_domain = to_s390_domain(domain);
 	struct zpci_dev *zdev;
+	int ret = 0;
 
 	rcu_read_lock();
 	list_for_each_entry_rcu(zdev, &s390_domain->devices, iommu_list) {
 		if (!zdev->tlb_refresh)
 			continue;
-		zpci_refresh_trans((u64)zdev->fh << 32,
-				   iova, size);
+		ret = zpci_refresh_trans((u64)zdev->fh << 32,
+					 iova, size);
+		/*
+		 * let the hypervisor discover invalidated entries
+		 * allowing it to free IOVAs and unpin pages
+		 */
+		if (ret == -ENOMEM) {
+			ret = zpci_refresh_all(zdev);
+			if (ret)
+				break;
+		}
 	}
 	rcu_read_unlock();
+
+	return ret;
 }
 
 static int s390_iommu_validate_trans(struct s390_domain *s390_domain,
diff --git a/drivers/iommu/sprd-iommu.c b/drivers/iommu/sprd-iommu.c
index 9c33ea6903f6..2eb9fb46703b 100644
--- a/drivers/iommu/sprd-iommu.c
+++ b/drivers/iommu/sprd-iommu.c
@@ -341,8 +341,8 @@ static size_t sprd_iommu_unmap(struct iommu_domain *domain, unsigned long iova,
 	return size;
 }
 
-static void sprd_iommu_sync_map(struct iommu_domain *domain,
-				unsigned long iova, size_t size)
+static int sprd_iommu_sync_map(struct iommu_domain *domain,
+			       unsigned long iova, size_t size)
 {
 	struct sprd_iommu_domain *dom = to_sprd_domain(domain);
 	unsigned int reg;
@@ -354,6 +354,7 @@ static void sprd_iommu_sync_map(struct iommu_domain *domain,
 
 	/* clear IOMMU TLB buffer after page table updated */
 	sprd_iommu_write(dom->sdev, reg, 0xffffffff);
+	return 0;
 }
 
 static void sprd_iommu_sync(struct iommu_domain *domain,
diff --git a/drivers/iommu/sun50i-iommu.c b/drivers/iommu/sun50i-iommu.c
index 3c834854eda1..41484a5a399b 100644
--- a/drivers/iommu/sun50i-iommu.c
+++ b/drivers/iommu/sun50i-iommu.c
@@ -401,8 +401,8 @@ static void sun50i_iommu_flush_iotlb_all(struct iommu_domain *domain)
 	spin_unlock_irqrestore(&iommu->iommu_lock, flags);
 }
 
-static void sun50i_iommu_iotlb_sync_map(struct iommu_domain *domain,
-					unsigned long iova, size_t size)
+static int sun50i_iommu_iotlb_sync_map(struct iommu_domain *domain,
+				       unsigned long iova, size_t size)
 {
 	struct sun50i_iommu_domain *sun50i_domain = to_sun50i_domain(domain);
 	struct sun50i_iommu *iommu = sun50i_domain->iommu;
@@ -411,6 +411,8 @@ static void sun50i_iommu_iotlb_sync_map(struct iommu_domain *domain,
 	spin_lock_irqsave(&iommu->iommu_lock, flags);
 	sun50i_iommu_zap_range(iommu, iova, size);
 	spin_unlock_irqrestore(&iommu->iommu_lock, flags);
+
+	return 0;
 }
 
 static void sun50i_iommu_iotlb_sync(struct iommu_domain *domain,
diff --git a/include/linux/iommu.h b/include/linux/iommu.h
index 64bd20142cbe..1eb638752781 100644
--- a/include/linux/iommu.h
+++ b/include/linux/iommu.h
@@ -350,8 +350,8 @@ struct iommu_domain_ops {
 			      struct iommu_iotlb_gather *iotlb_gather);
 
 	void (*flush_iotlb_all)(struct iommu_domain *domain);
-	void (*iotlb_sync_map)(struct iommu_domain *domain, unsigned long iova,
-			       size_t size);
+	int (*iotlb_sync_map)(struct iommu_domain *domain, unsigned long iova,
+			      size_t size);
 	void (*iotlb_sync)(struct iommu_domain *domain,
 			   struct iommu_iotlb_gather *iotlb_gather);
 
-- 
cgit v1.2.3


From 32d5bc8b09c7cc48c511809e7c3b1755c7ecc5fa Mon Sep 17 00:00:00 2001
From: Niklas Schnelle <schnelle@linux.ibm.com>
Date: Thu, 28 Sep 2023 16:31:39 +0200
Subject: iommu/dma: Allow a single FQ in addition to per-CPU FQs

In some virtualized environments, including s390 paged memory guests,
IOTLB flushes are used to update IOMMU shadow tables. Due to this, they
are much more expensive than in typical bare metal environments or
non-paged s390 guests. In addition they may parallelize poorly in
virtualized environments. This changes the trade off for flushing IOVAs
such that minimizing the number of IOTLB flushes trumps any benefit of
cheaper queuing operations or increased paralellism.

In this scenario per-CPU flush queues pose several problems. Firstly
per-CPU memory is often quite limited prohibiting larger queues.
Secondly collecting IOVAs per-CPU but flushing via a global timeout
reduces the number of IOVAs flushed for each timeout especially on s390
where PCI interrupts may not be bound to a specific CPU.

Let's introduce a single flush queue mode that reuses the same queue
logic but only allocates a single global queue. This mode is selected by
dma-iommu if a newly introduced .shadow_on_flush flag is set in struct
dev_iommu. As a first user the s390 IOMMU driver sets this flag during
probe_device. With the unchanged small FQ size and timeouts this setting
is worse than per-CPU queues but a follow up patch will make the FQ size
and timeout variable. Together this allows the common IOVA flushing code
to more closely resemble the global flush behavior used on s390's
previous internal DMA API implementation.

Link: https://lore.kernel.org/all/9a466109-01c5-96b0-bf03-304123f435ee@arm.com/
Acked-by: Robin Murphy <robin.murphy@arm.com>
Reviewed-by: Matthew Rosato <mjrosato@linux.ibm.com> #s390
Signed-off-by: Niklas Schnelle <schnelle@linux.ibm.com>
Link: https://lore.kernel.org/r/20230928-dma_iommu-v13-5-9e5fc4dacc36@linux.ibm.com
Signed-off-by: Joerg Roedel <jroedel@suse.de>
---
 drivers/iommu/dma-iommu.c  | 168 ++++++++++++++++++++++++++++++++++-----------
 drivers/iommu/s390-iommu.c |   3 +
 include/linux/iommu.h      |   2 +
 3 files changed, 134 insertions(+), 39 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/iommu/dma-iommu.c b/drivers/iommu/dma-iommu.c
index 4b1a88f514c9..a85ff75ad531 100644
--- a/drivers/iommu/dma-iommu.c
+++ b/drivers/iommu/dma-iommu.c
@@ -43,14 +43,26 @@ enum iommu_dma_cookie_type {
 	IOMMU_DMA_MSI_COOKIE,
 };
 
+enum iommu_dma_queue_type {
+	IOMMU_DMA_OPTS_PER_CPU_QUEUE,
+	IOMMU_DMA_OPTS_SINGLE_QUEUE,
+};
+
+struct iommu_dma_options {
+	enum iommu_dma_queue_type qt;
+};
+
 struct iommu_dma_cookie {
 	enum iommu_dma_cookie_type	type;
 	union {
 		/* Full allocator for IOMMU_DMA_IOVA_COOKIE */
 		struct {
 			struct iova_domain	iovad;
-
-			struct iova_fq __percpu *fq;	/* Flush queue */
+			/* Flush queue */
+			union {
+				struct iova_fq	*single_fq;
+				struct iova_fq	__percpu *percpu_fq;
+			};
 			/* Number of TLB flushes that have been started */
 			atomic64_t		fq_flush_start_cnt;
 			/* Number of TLB flushes that have been finished */
@@ -67,6 +79,8 @@ struct iommu_dma_cookie {
 
 	/* Domain for flush queue callback; NULL if flush queue not in use */
 	struct iommu_domain		*fq_domain;
+	/* Options for dma-iommu use */
+	struct iommu_dma_options	options;
 	struct mutex			mutex;
 };
 
@@ -124,7 +138,7 @@ static inline unsigned int fq_ring_add(struct iova_fq *fq)
 	return idx;
 }
 
-static void fq_ring_free(struct iommu_dma_cookie *cookie, struct iova_fq *fq)
+static void fq_ring_free_locked(struct iommu_dma_cookie *cookie, struct iova_fq *fq)
 {
 	u64 counter = atomic64_read(&cookie->fq_flush_finish_cnt);
 	unsigned int idx;
@@ -145,6 +159,15 @@ static void fq_ring_free(struct iommu_dma_cookie *cookie, struct iova_fq *fq)
 	}
 }
 
+static void fq_ring_free(struct iommu_dma_cookie *cookie, struct iova_fq *fq)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&fq->lock, flags);
+	fq_ring_free_locked(cookie, fq);
+	spin_unlock_irqrestore(&fq->lock, flags);
+}
+
 static void fq_flush_iotlb(struct iommu_dma_cookie *cookie)
 {
 	atomic64_inc(&cookie->fq_flush_start_cnt);
@@ -160,14 +183,11 @@ static void fq_flush_timeout(struct timer_list *t)
 	atomic_set(&cookie->fq_timer_on, 0);
 	fq_flush_iotlb(cookie);
 
-	for_each_possible_cpu(cpu) {
-		unsigned long flags;
-		struct iova_fq *fq;
-
-		fq = per_cpu_ptr(cookie->fq, cpu);
-		spin_lock_irqsave(&fq->lock, flags);
-		fq_ring_free(cookie, fq);
-		spin_unlock_irqrestore(&fq->lock, flags);
+	if (cookie->options.qt == IOMMU_DMA_OPTS_SINGLE_QUEUE) {
+		fq_ring_free(cookie, cookie->single_fq);
+	} else {
+		for_each_possible_cpu(cpu)
+			fq_ring_free(cookie, per_cpu_ptr(cookie->percpu_fq, cpu));
 	}
 }
 
@@ -188,7 +208,11 @@ static void queue_iova(struct iommu_dma_cookie *cookie,
 	 */
 	smp_mb();
 
-	fq = raw_cpu_ptr(cookie->fq);
+	if (cookie->options.qt == IOMMU_DMA_OPTS_SINGLE_QUEUE)
+		fq = cookie->single_fq;
+	else
+		fq = raw_cpu_ptr(cookie->percpu_fq);
+
 	spin_lock_irqsave(&fq->lock, flags);
 
 	/*
@@ -196,11 +220,11 @@ static void queue_iova(struct iommu_dma_cookie *cookie,
 	 * flushed out on another CPU. This makes the fq_full() check below less
 	 * likely to be true.
 	 */
-	fq_ring_free(cookie, fq);
+	fq_ring_free_locked(cookie, fq);
 
 	if (fq_full(fq)) {
 		fq_flush_iotlb(cookie);
-		fq_ring_free(cookie, fq);
+		fq_ring_free_locked(cookie, fq);
 	}
 
 	idx = fq_ring_add(fq);
@@ -219,31 +243,88 @@ static void queue_iova(struct iommu_dma_cookie *cookie,
 			  jiffies + msecs_to_jiffies(IOVA_FQ_TIMEOUT));
 }
 
-static void iommu_dma_free_fq(struct iommu_dma_cookie *cookie)
+static void iommu_dma_free_fq_single(struct iova_fq *fq)
 {
-	int cpu, idx;
+	int idx;
 
-	if (!cookie->fq)
-		return;
+	fq_ring_for_each(idx, fq)
+		put_pages_list(&fq->entries[idx].freelist);
+	vfree(fq);
+}
+
+static void iommu_dma_free_fq_percpu(struct iova_fq __percpu *percpu_fq)
+{
+	int cpu, idx;
 
-	del_timer_sync(&cookie->fq_timer);
 	/* The IOVAs will be torn down separately, so just free our queued pages */
 	for_each_possible_cpu(cpu) {
-		struct iova_fq *fq = per_cpu_ptr(cookie->fq, cpu);
+		struct iova_fq *fq = per_cpu_ptr(percpu_fq, cpu);
 
 		fq_ring_for_each(idx, fq)
 			put_pages_list(&fq->entries[idx].freelist);
 	}
 
-	free_percpu(cookie->fq);
+	free_percpu(percpu_fq);
+}
+
+static void iommu_dma_free_fq(struct iommu_dma_cookie *cookie)
+{
+	if (!cookie->fq_domain)
+		return;
+
+	del_timer_sync(&cookie->fq_timer);
+	if (cookie->options.qt == IOMMU_DMA_OPTS_SINGLE_QUEUE)
+		iommu_dma_free_fq_single(cookie->single_fq);
+	else
+		iommu_dma_free_fq_percpu(cookie->percpu_fq);
+}
+
+static void iommu_dma_init_one_fq(struct iova_fq *fq)
+{
+	int i;
+
+	fq->head = 0;
+	fq->tail = 0;
+
+	spin_lock_init(&fq->lock);
+
+	for (i = 0; i < IOVA_FQ_SIZE; i++)
+		INIT_LIST_HEAD(&fq->entries[i].freelist);
+}
+
+static int iommu_dma_init_fq_single(struct iommu_dma_cookie *cookie)
+{
+	struct iova_fq *queue;
+
+	queue = vmalloc(sizeof(*queue));
+	if (!queue)
+		return -ENOMEM;
+	iommu_dma_init_one_fq(queue);
+	cookie->single_fq = queue;
+
+	return 0;
+}
+
+static int iommu_dma_init_fq_percpu(struct iommu_dma_cookie *cookie)
+{
+	struct iova_fq __percpu *queue;
+	int cpu;
+
+	queue = alloc_percpu(struct iova_fq);
+	if (!queue)
+		return -ENOMEM;
+
+	for_each_possible_cpu(cpu)
+		iommu_dma_init_one_fq(per_cpu_ptr(queue, cpu));
+	cookie->percpu_fq = queue;
+	return 0;
 }
 
 /* sysfs updates are serialised by the mutex of the group owning @domain */
 int iommu_dma_init_fq(struct iommu_domain *domain)
 {
 	struct iommu_dma_cookie *cookie = domain->iova_cookie;
-	struct iova_fq __percpu *queue;
-	int i, cpu;
+	int rc;
 
 	if (cookie->fq_domain)
 		return 0;
@@ -251,26 +332,16 @@ int iommu_dma_init_fq(struct iommu_domain *domain)
 	atomic64_set(&cookie->fq_flush_start_cnt,  0);
 	atomic64_set(&cookie->fq_flush_finish_cnt, 0);
 
-	queue = alloc_percpu(struct iova_fq);
-	if (!queue) {
+	if (cookie->options.qt == IOMMU_DMA_OPTS_SINGLE_QUEUE)
+		rc = iommu_dma_init_fq_single(cookie);
+	else
+		rc = iommu_dma_init_fq_percpu(cookie);
+
+	if (rc) {
 		pr_warn("iova flush queue initialization failed\n");
 		return -ENOMEM;
 	}
 
-	for_each_possible_cpu(cpu) {
-		struct iova_fq *fq = per_cpu_ptr(queue, cpu);
-
-		fq->head = 0;
-		fq->tail = 0;
-
-		spin_lock_init(&fq->lock);
-
-		for (i = 0; i < IOVA_FQ_SIZE; i++)
-			INIT_LIST_HEAD(&fq->entries[i].freelist);
-	}
-
-	cookie->fq = queue;
-
 	timer_setup(&cookie->fq_timer, fq_flush_timeout, 0);
 	atomic_set(&cookie->fq_timer_on, 0);
 	/*
@@ -554,6 +625,23 @@ static bool dev_use_sg_swiotlb(struct device *dev, struct scatterlist *sg,
 	return false;
 }
 
+/**
+ * iommu_dma_init_options - Initialize dma-iommu options
+ * @options: The options to be initialized
+ * @dev: Device the options are set for
+ *
+ * This allows tuning dma-iommu specific to device properties
+ */
+static void iommu_dma_init_options(struct iommu_dma_options *options,
+				   struct device *dev)
+{
+	/* Shadowing IOTLB flushes do better with a single queue */
+	if (dev->iommu->shadow_on_flush)
+		options->qt = IOMMU_DMA_OPTS_SINGLE_QUEUE;
+	else
+		options->qt = IOMMU_DMA_OPTS_PER_CPU_QUEUE;
+}
+
 /**
  * iommu_dma_init_domain - Initialise a DMA mapping domain
  * @domain: IOMMU domain previously prepared by iommu_get_dma_cookie()
@@ -614,6 +702,8 @@ static int iommu_dma_init_domain(struct iommu_domain *domain, dma_addr_t base,
 	if (ret)
 		goto done_unlock;
 
+	iommu_dma_init_options(&cookie->options, dev);
+
 	/* If the FQ fails we can simply fall back to strict mode */
 	if (domain->type == IOMMU_DOMAIN_DMA_FQ &&
 	    (!device_iommu_capable(dev, IOMMU_CAP_DEFERRED_FLUSH) || iommu_dma_init_fq(domain)))
diff --git a/drivers/iommu/s390-iommu.c b/drivers/iommu/s390-iommu.c
index 5b5ba1cd5f00..9a5196f523de 100644
--- a/drivers/iommu/s390-iommu.c
+++ b/drivers/iommu/s390-iommu.c
@@ -463,6 +463,9 @@ static struct iommu_device *s390_iommu_probe_device(struct device *dev)
 	if (zdev->end_dma > ZPCI_TABLE_SIZE_RT - 1)
 		zdev->end_dma = ZPCI_TABLE_SIZE_RT - 1;
 
+	if (zdev->tlb_refresh)
+		dev->iommu->shadow_on_flush = 1;
+
 	return &zdev->iommu_dev;
 }
 
diff --git a/include/linux/iommu.h b/include/linux/iommu.h
index 1eb638752781..0c4d8ae985ac 100644
--- a/include/linux/iommu.h
+++ b/include/linux/iommu.h
@@ -424,6 +424,7 @@ struct iommu_fault_param {
  * @attach_deferred: the dma domain attachment is deferred
  * @pci_32bit_workaround: Limit DMA allocations to 32-bit IOVAs
  * @require_direct: device requires IOMMU_RESV_DIRECT regions
+ * @shadow_on_flush: IOTLB flushes are used to sync shadow tables
  *
  * TODO: migrate other per device data pointers under iommu_dev_data, e.g.
  *	struct iommu_group	*iommu_group;
@@ -439,6 +440,7 @@ struct dev_iommu {
 	u32				attach_deferred:1;
 	u32				pci_32bit_workaround:1;
 	u32				require_direct:1;
+	u32				shadow_on_flush:1;
 };
 
 int iommu_device_register(struct iommu_device *iommu,
-- 
cgit v1.2.3


From 0d293714ac32650bfb669ceadf7cc2fad8161401 Mon Sep 17 00:00:00 2001
From: Patrisious Haddad <phaddad@nvidia.com>
Date: Thu, 21 Sep 2023 15:10:27 +0300
Subject: RDMA/mlx5: Send events from IB driver about device affiliation state

Send blocking events from IB driver whenever the device is done being
affiliated or if it is removed from an affiliation.

This is useful since now the EN driver can register to those event and
know when a device is affiliated or not.

Signed-off-by: Patrisious Haddad <phaddad@nvidia.com>
Reviewed-by: Mark Bloch <mbloch@nvidia.com>
Link: https://lore.kernel.org/r/a7491c3e483cfd8d962f5f75b9a25f253043384a.1695296682.git.leon@kernel.org
Signed-off-by: Leon Romanovsky <leon@kernel.org>
---
 drivers/infiniband/hw/mlx5/main.c              | 17 +++++++++++++++++
 drivers/net/ethernet/mellanox/mlx5/core/main.c |  6 ++++++
 include/linux/mlx5/device.h                    |  2 ++
 include/linux/mlx5/driver.h                    |  2 ++
 4 files changed, 27 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c
index aed5cdea50e6..530d88784e41 100644
--- a/drivers/infiniband/hw/mlx5/main.c
+++ b/drivers/infiniband/hw/mlx5/main.c
@@ -24,6 +24,7 @@
 #include <linux/mlx5/vport.h>
 #include <linux/mlx5/fs.h>
 #include <linux/mlx5/eswitch.h>
+#include <linux/mlx5/driver.h>
 #include <linux/list.h>
 #include <rdma/ib_smi.h>
 #include <rdma/ib_umem_odp.h>
@@ -3175,6 +3176,13 @@ static void mlx5_ib_unbind_slave_port(struct mlx5_ib_dev *ibdev,
 
 	lockdep_assert_held(&mlx5_ib_multiport_mutex);
 
+	mlx5_core_mp_event_replay(ibdev->mdev,
+				  MLX5_DRIVER_EVENT_AFFILIATION_REMOVED,
+				  NULL);
+	mlx5_core_mp_event_replay(mpi->mdev,
+				  MLX5_DRIVER_EVENT_AFFILIATION_REMOVED,
+				  NULL);
+
 	mlx5_ib_cleanup_cong_debugfs(ibdev, port_num);
 
 	spin_lock(&port->mp.mpi_lock);
@@ -3226,6 +3234,7 @@ static bool mlx5_ib_bind_slave_port(struct mlx5_ib_dev *ibdev,
 				    struct mlx5_ib_multiport_info *mpi)
 {
 	u32 port_num = mlx5_core_native_port_num(mpi->mdev) - 1;
+	u64 key;
 	int err;
 
 	lockdep_assert_held(&mlx5_ib_multiport_mutex);
@@ -3254,6 +3263,14 @@ static bool mlx5_ib_bind_slave_port(struct mlx5_ib_dev *ibdev,
 
 	mlx5_ib_init_cong_debugfs(ibdev, port_num);
 
+	key = ibdev->ib_dev.index;
+	mlx5_core_mp_event_replay(mpi->mdev,
+				  MLX5_DRIVER_EVENT_AFFILIATION_DONE,
+				  &key);
+	mlx5_core_mp_event_replay(ibdev->mdev,
+				  MLX5_DRIVER_EVENT_AFFILIATION_DONE,
+				  &key);
+
 	return true;
 
 unbind:
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/main.c b/drivers/net/ethernet/mellanox/mlx5/core/main.c
index 15561965d2af..6ca91c0e8a6a 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/main.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/main.c
@@ -361,6 +361,12 @@ void mlx5_core_uplink_netdev_event_replay(struct mlx5_core_dev *dev)
 }
 EXPORT_SYMBOL(mlx5_core_uplink_netdev_event_replay);
 
+void mlx5_core_mp_event_replay(struct mlx5_core_dev *dev, u32 event, void *data)
+{
+	mlx5_blocking_notifier_call_chain(dev, event, data);
+}
+EXPORT_SYMBOL(mlx5_core_mp_event_replay);
+
 int mlx5_core_get_caps_mode(struct mlx5_core_dev *dev, enum mlx5_cap_type cap_type,
 			    enum mlx5_cap_mode cap_mode)
 {
diff --git a/include/linux/mlx5/device.h b/include/linux/mlx5/device.h
index 4d5be378fa8c..26333d602a50 100644
--- a/include/linux/mlx5/device.h
+++ b/include/linux/mlx5/device.h
@@ -366,6 +366,8 @@ enum mlx5_driver_event {
 	MLX5_DRIVER_EVENT_UPLINK_NETDEV,
 	MLX5_DRIVER_EVENT_MACSEC_SA_ADDED,
 	MLX5_DRIVER_EVENT_MACSEC_SA_DELETED,
+	MLX5_DRIVER_EVENT_AFFILIATION_DONE,
+	MLX5_DRIVER_EVENT_AFFILIATION_REMOVED,
 };
 
 enum {
diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h
index 3033bbaeac81..5ca4e085d813 100644
--- a/include/linux/mlx5/driver.h
+++ b/include/linux/mlx5/driver.h
@@ -1027,6 +1027,8 @@ bool mlx5_cmd_is_down(struct mlx5_core_dev *dev);
 void mlx5_core_uplink_netdev_set(struct mlx5_core_dev *mdev, struct net_device *netdev);
 void mlx5_core_uplink_netdev_event_replay(struct mlx5_core_dev *mdev);
 
+void mlx5_core_mp_event_replay(struct mlx5_core_dev *dev, u32 event, void *data);
+
 void mlx5_health_cleanup(struct mlx5_core_dev *dev);
 int mlx5_health_init(struct mlx5_core_dev *dev);
 void mlx5_start_health_poll(struct mlx5_core_dev *dev);
-- 
cgit v1.2.3


From ef36ffcb381096ed32c309c342a02bf62939c503 Mon Sep 17 00:00:00 2001
From: Patrisious Haddad <phaddad@nvidia.com>
Date: Thu, 21 Sep 2023 15:10:30 +0300
Subject: net/mlx5: Add alias flow table bits

Add all the capabilities needed to check for alias object support.
As well as all the fields or commands needed for its creation and
the creation of flow table that is able to jump to an alias object.

Signed-off-by: Patrisious Haddad <phaddad@nvidia.com>
Reviewed-by: Mark Bloch <mbloch@nvidia.com>
Link: https://lore.kernel.org/r/544c030f2a78c4adf3fe6b64f97a39cc1bbdabb9.1695296682.git.leon@kernel.org
Signed-off-by: Leon Romanovsky <leon@kernel.org>
---
 include/linux/mlx5/mlx5_ifc.h | 56 ++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 55 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h
index 3265bfcb3156..23f9780adb83 100644
--- a/include/linux/mlx5/mlx5_ifc.h
+++ b/include/linux/mlx5/mlx5_ifc.h
@@ -312,6 +312,7 @@ enum {
 	MLX5_CMD_OP_QUERY_VHCA_STATE              = 0xb0d,
 	MLX5_CMD_OP_MODIFY_VHCA_STATE             = 0xb0e,
 	MLX5_CMD_OP_SYNC_CRYPTO                   = 0xb12,
+	MLX5_CMD_OP_ALLOW_OTHER_VHCA_ACCESS       = 0xb16,
 	MLX5_CMD_OP_MAX
 };
 
@@ -1934,6 +1935,14 @@ struct mlx5_ifc_cmd_hca_cap_bits {
 	u8         match_definer_format_supported[0x40];
 };
 
+enum {
+	MLX5_CROSS_VHCA_OBJ_TO_OBJ_SUPPORTED_LOCAL_FLOW_TABLE_TO_REMOTE_FLOW_TABLE_MISS  = 0x80000,
+};
+
+enum {
+	MLX5_ALLOWED_OBJ_FOR_OTHER_VHCA_ACCESS_FLOW_TABLE       = 0x200,
+};
+
 struct mlx5_ifc_cmd_hca_cap_2_bits {
 	u8	   reserved_at_0[0x80];
 
@@ -1952,7 +1961,11 @@ struct mlx5_ifc_cmd_hca_cap_2_bits {
 	u8	   migration_in_chunks[0x1];
 	u8	   reserved_at_d1[0xf];
 
-	u8	   reserved_at_e0[0xc0];
+	u8	   cross_vhca_object_to_object_supported[0x20];
+
+	u8	   allowed_object_for_other_vhca_access[0x40];
+
+	u8	   reserved_at_140[0x60];
 
 	u8	   flow_table_type_2_type[0x8];
 	u8	   reserved_at_1a8[0x3];
@@ -6371,6 +6384,28 @@ struct mlx5_ifc_general_obj_out_cmd_hdr_bits {
 	u8         reserved_at_60[0x20];
 };
 
+struct mlx5_ifc_allow_other_vhca_access_in_bits {
+	u8 opcode[0x10];
+	u8 uid[0x10];
+	u8 reserved_at_20[0x10];
+	u8 op_mod[0x10];
+	u8 reserved_at_40[0x50];
+	u8 object_type_to_be_accessed[0x10];
+	u8 object_id_to_be_accessed[0x20];
+	u8 reserved_at_c0[0x40];
+	union {
+		u8 access_key_raw[0x100];
+		u8 access_key[8][0x20];
+	};
+};
+
+struct mlx5_ifc_allow_other_vhca_access_out_bits {
+	u8 status[0x8];
+	u8 reserved_at_8[0x18];
+	u8 syndrome[0x20];
+	u8 reserved_at_40[0x40];
+};
+
 struct mlx5_ifc_modify_header_arg_bits {
 	u8         reserved_at_0[0x80];
 
@@ -6393,6 +6428,24 @@ struct mlx5_ifc_create_match_definer_out_bits {
 	struct mlx5_ifc_general_obj_out_cmd_hdr_bits general_obj_out_cmd_hdr;
 };
 
+struct mlx5_ifc_alias_context_bits {
+	u8 vhca_id_to_be_accessed[0x10];
+	u8 reserved_at_10[0xd];
+	u8 status[0x3];
+	u8 object_id_to_be_accessed[0x20];
+	u8 reserved_at_40[0x40];
+	union {
+		u8 access_key_raw[0x100];
+		u8 access_key[8][0x20];
+	};
+	u8 metadata[0x80];
+};
+
+struct mlx5_ifc_create_alias_obj_in_bits {
+	struct mlx5_ifc_general_obj_in_cmd_hdr_bits hdr;
+	struct mlx5_ifc_alias_context_bits alias_ctx;
+};
+
 enum {
 	MLX5_QUERY_FLOW_GROUP_OUT_MATCH_CRITERIA_ENABLE_OUTER_HEADERS    = 0x0,
 	MLX5_QUERY_FLOW_GROUP_OUT_MATCH_CRITERIA_ENABLE_MISC_PARAMETERS  = 0x1,
@@ -11919,6 +11972,7 @@ enum {
 	MLX5_GENERAL_OBJECT_TYPES_FLOW_METER_ASO = 0x24,
 	MLX5_GENERAL_OBJECT_TYPES_MACSEC = 0x27,
 	MLX5_GENERAL_OBJECT_TYPES_INT_KEK = 0x47,
+	MLX5_GENERAL_OBJECT_TYPES_FLOW_TABLE_ALIAS = 0xff15,
 };
 
 enum {
-- 
cgit v1.2.3


From d424348b060d87f92cc59d8e6ea9c612c5b708f5 Mon Sep 17 00:00:00 2001
From: Dragos Tatulea <dtatulea@nvidia.com>
Date: Thu, 28 Sep 2023 19:45:12 +0300
Subject: vdpa/mlx5: Expose descriptor group mkey hw capability

Necessary for improved live migration flow. Actual support will be added
in a downstream patch.

Reviewed-by: Gal Pressman <gal@nvidia.com>
Signed-off-by: Dragos Tatulea <dtatulea@nvidia.com>
Link: https://lore.kernel.org/r/20230928164550.980832-3-dtatulea@nvidia.com
Signed-off-by: Leon Romanovsky <leon@kernel.org>
---
 include/linux/mlx5/mlx5_ifc.h | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h
index fc3db401f8a2..3388007c645f 100644
--- a/include/linux/mlx5/mlx5_ifc.h
+++ b/include/linux/mlx5/mlx5_ifc.h
@@ -1231,7 +1231,13 @@ struct mlx5_ifc_virtio_emulation_cap_bits {
 	u8         max_emulated_devices[0x8];
 	u8         max_num_virtio_queues[0x18];
 
-	u8         reserved_at_a0[0x60];
+	u8         reserved_at_a0[0x20];
+
+	u8	   reserved_at_c0[0x13];
+	u8         desc_group_mkey_supported[0x1];
+	u8         reserved_at_d4[0xc];
+
+	u8         reserved_at_e0[0x20];
 
 	u8         umem_1_buffer_param_a[0x20];
 
-- 
cgit v1.2.3


From a60359ea32251399c00d934981f5a6fa25ec917f Mon Sep 17 00:00:00 2001
From: Wolfram Sang <wsa+renesas@sang-engineering.com>
Date: Fri, 22 Sep 2023 08:45:51 +0200
Subject: usb: renesas_usbhs: remove boilerplate from header file

There is a SPDX entry, so we can remove the boilerplate.

Signed-off-by: Wolfram Sang <wsa+renesas@sang-engineering.com>
Reviewed-by: Yoshihiro Shimoda <yoshihiro.shimoda.uh@renesas.com>
Reviewed-by: Geert Uytterhoeven <geert+renesas@glider.be>
Link: https://lore.kernel.org/r/20230922064551.4663-1-wsa+renesas@sang-engineering.com
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 include/linux/usb/renesas_usbhs.h | 10 ----------
 1 file changed, 10 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/usb/renesas_usbhs.h b/include/linux/usb/renesas_usbhs.h
index d418c55523a7..372898d9eeb0 100644
--- a/include/linux/usb/renesas_usbhs.h
+++ b/include/linux/usb/renesas_usbhs.h
@@ -5,16 +5,6 @@
  * Copyright (C) 2011 Renesas Solutions Corp.
  * Copyright (C) 2019 Renesas Electronics Corporation
  * Kuninori Morimoto <kuninori.morimoto.gx@renesas.com>
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
- *
  */
 #ifndef RENESAS_USB_H
 #define RENESAS_USB_H
-- 
cgit v1.2.3


From a17fae8fc38e91026f116a85c5068668fbf9848a Mon Sep 17 00:00:00 2001
From: Utkarsh Patel <utkarsh.h.patel@intel.com>
Date: Tue, 19 Sep 2023 19:32:39 -0700
Subject: usb: typec: Add Displayport Alternate Mode 2.1 Support

Displayport Alternate mode 2.1 requires configuration for additional
cable details such as signalling for cable, UHBR13.5 Support, Cable type
and DPAM version.
These details can be used with mux drivers to configure SOP DP
configuration for Displayport Alternate mode 2.1.
This change also includes pertinent cable signalling support in displayport
alternate mode.

Reviewed-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Reviewed-by: Heikki Krogerus <heikki.krogerus@linux.intel.com>
Signed-off-by: Utkarsh Patel <utkarsh.h.patel@intel.com>
Link: https://lore.kernel.org/r/20230920023243.2494410-2-utkarsh.h.patel@intel.com
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/usb/typec/altmodes/displayport.c |  5 ++++-
 drivers/usb/typec/ucsi/displayport.c     |  2 +-
 drivers/usb/typec/ucsi/ucsi_ccg.c        |  4 ++--
 include/linux/usb/typec_dp.h             | 28 ++++++++++++++++++++++++----
 4 files changed, 31 insertions(+), 8 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/usb/typec/altmodes/displayport.c b/drivers/usb/typec/altmodes/displayport.c
index 426c88a516e5..f503cb4cd721 100644
--- a/drivers/usb/typec/altmodes/displayport.c
+++ b/drivers/usb/typec/altmodes/displayport.c
@@ -86,8 +86,11 @@ static int dp_altmode_notify(struct dp_altmode *dp)
 
 static int dp_altmode_configure(struct dp_altmode *dp, u8 con)
 {
-	u32 conf = DP_CONF_SIGNALING_DP; /* Only DP signaling supported */
 	u8 pin_assign = 0;
+	u32 conf;
+
+	/* DP Signalling */
+	conf = (dp->data.conf & DP_CONF_SIGNALLING_MASK) >> DP_CONF_SIGNALLING_SHIFT;
 
 	switch (con) {
 	case DP_STATUS_CON_DISABLED:
diff --git a/drivers/usb/typec/ucsi/displayport.c b/drivers/usb/typec/ucsi/displayport.c
index 73cd5bf35047..d9d3c91125ca 100644
--- a/drivers/usb/typec/ucsi/displayport.c
+++ b/drivers/usb/typec/ucsi/displayport.c
@@ -315,7 +315,7 @@ struct typec_altmode *ucsi_register_displayport(struct ucsi_connector *con,
 	struct ucsi_dp *dp;
 
 	/* We can't rely on the firmware with the capabilities. */
-	desc->vdo |= DP_CAP_DP_SIGNALING | DP_CAP_RECEPTACLE;
+	desc->vdo |= DP_CAP_DP_SIGNALLING(0) | DP_CAP_RECEPTACLE;
 
 	/* Claiming that we support all pin assignments */
 	desc->vdo |= all_assignments << 8;
diff --git a/drivers/usb/typec/ucsi/ucsi_ccg.c b/drivers/usb/typec/ucsi/ucsi_ccg.c
index 607061a37eca..449c125f6f87 100644
--- a/drivers/usb/typec/ucsi/ucsi_ccg.c
+++ b/drivers/usb/typec/ucsi/ucsi_ccg.c
@@ -501,8 +501,8 @@ static void ucsi_ccg_nvidia_altmode(struct ucsi_ccg *uc,
 	case NVIDIA_FTB_DP_OFFSET:
 		if (alt[0].mid == USB_TYPEC_NVIDIA_VLINK_DBG_VDO)
 			alt[0].mid = USB_TYPEC_NVIDIA_VLINK_DP_VDO |
-				DP_CAP_DP_SIGNALING | DP_CAP_USB |
-				DP_CONF_SET_PIN_ASSIGN(BIT(DP_PIN_ASSIGN_E));
+				     DP_CAP_DP_SIGNALLING(0) | DP_CAP_USB |
+				     DP_CONF_SET_PIN_ASSIGN(BIT(DP_PIN_ASSIGN_E));
 		break;
 	case NVIDIA_FTB_DBG_OFFSET:
 		if (alt[0].mid == USB_TYPEC_NVIDIA_VLINK_DP_VDO)
diff --git a/include/linux/usb/typec_dp.h b/include/linux/usb/typec_dp.h
index 8d09c2f0a9b8..1f358098522d 100644
--- a/include/linux/usb/typec_dp.h
+++ b/include/linux/usb/typec_dp.h
@@ -67,8 +67,10 @@ enum {
 #define   DP_CAP_UFP_D			1
 #define   DP_CAP_DFP_D			2
 #define   DP_CAP_DFP_D_AND_UFP_D	3
-#define DP_CAP_DP_SIGNALING		BIT(2) /* Always set */
-#define DP_CAP_GEN2			BIT(3) /* Reserved after v1.0b */
+#define DP_CAP_DP_SIGNALLING(_cap_)	(((_cap_) & GENMASK(5, 2)) >> 2)
+#define   DP_CAP_SIGNALLING_HBR3	1
+#define   DP_CAP_SIGNALLING_UHBR10	2
+#define   DP_CAP_SIGNALLING_UHBR20	3
 #define DP_CAP_RECEPTACLE		BIT(6)
 #define DP_CAP_USB			BIT(7)
 #define DP_CAP_DFP_D_PIN_ASSIGN(_cap_)	(((_cap_) & GENMASK(15, 8)) >> 8)
@@ -78,6 +80,13 @@ enum {
 			DP_CAP_UFP_D_PIN_ASSIGN(_cap_) : DP_CAP_DFP_D_PIN_ASSIGN(_cap_))
 #define DP_CAP_PIN_ASSIGN_DFP_D(_cap_) ((_cap_ & DP_CAP_RECEPTACLE) ? \
 			DP_CAP_DFP_D_PIN_ASSIGN(_cap_) : DP_CAP_UFP_D_PIN_ASSIGN(_cap_))
+#define DP_CAP_UHBR_13_5_SUPPORT	BIT(26)
+#define DP_CAP_CABLE_TYPE(_cap_)	(((_cap_) & GENMASK(29, 28)) >> 28)
+#define   DP_CAP_CABLE_TYPE_PASSIVE	0
+#define   DP_CAP_CABLE_TYPE_RE_TIMER	1
+#define   DP_CAP_CABLE_TYPE_RE_DRIVER	2
+#define   DP_CAP_CABLE_TYPE_OPTICAL	3
+#define DP_CAP_DPAM_VERSION		BIT(30)
 
 /* DisplayPort Status Update VDO bits */
 #define DP_STATUS_CONNECTION(_status_)	((_status_) & 3)
@@ -97,13 +106,24 @@ enum {
 #define DP_CONF_CURRENTLY(_conf_)	((_conf_) & 3)
 #define DP_CONF_UFP_U_AS_DFP_D		BIT(0)
 #define DP_CONF_UFP_U_AS_UFP_D		BIT(1)
-#define DP_CONF_SIGNALING_DP		BIT(2)
-#define DP_CONF_SIGNALING_GEN_2		BIT(3) /* Reserved after v1.0b */
+#define DP_CONF_SIGNALLING_MASK		GENMASK(5, 2)
+#define DP_CONF_SIGNALLING_SHIFT	2
+#define   DP_CONF_SIGNALLING_HBR3	1
+#define   DP_CONF_SIGNALLING_UHBR10	2
+#define   DP_CONF_SIGNALLING_UHBR20	3
 #define DP_CONF_PIN_ASSIGNEMENT_SHIFT	8
 #define DP_CONF_PIN_ASSIGNEMENT_MASK	GENMASK(15, 8)
 
 /* Helper for setting/getting the pin assignment value to the configuration */
 #define DP_CONF_SET_PIN_ASSIGN(_a_)	((_a_) << 8)
 #define DP_CONF_GET_PIN_ASSIGN(_conf_)	(((_conf_) & GENMASK(15, 8)) >> 8)
+#define DP_CONF_UHBR13_5_SUPPORT	BIT(26)
+#define DP_CONF_CABLE_TYPE_MASK		GENMASK(29, 28)
+#define DP_CONF_CABLE_TYPE_SHIFT	28
+#define   DP_CONF_CABLE_TYPE_PASSIVE	0
+#define   DP_CONF_CABLE_TYPE_RE_TIMER	1
+#define   DP_CONF_CABLE_TYPE_RE_DRIVER	2
+#define   DP_CONF_CABLE_TYPE_OPTICAL	3
+#define DP_CONF_DPAM_VERSION		BIT(30)
 
 #endif /* __USB_TYPEC_DP_H */
-- 
cgit v1.2.3


From c365b1e1f40499472433cc8fca3d0ea280ead52a Mon Sep 17 00:00:00 2001
From: Utkarsh Patel <utkarsh.h.patel@intel.com>
Date: Tue, 19 Sep 2023 19:32:40 -0700
Subject: usb: typec: Add Active or Passive cable defination to cable discover
 mode VDO

As per USB Type-C Connector specification v2.2 section F.2.6, BIT25
represents Active or Passive cable. Added BIT25 defination to the
Thunderbolt 3 cable discover mode VDO.

Reviewed-by: Heikki Krogerus <heikki.krogerus@linux.intel.com>
Signed-off-by: Utkarsh Patel <utkarsh.h.patel@intel.com>
Link: https://lore.kernel.org/r/20230920023243.2494410-3-utkarsh.h.patel@intel.com
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 include/linux/usb/typec_tbt.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/usb/typec_tbt.h b/include/linux/usb/typec_tbt.h
index 63dd44b72e0c..c7a2153bd6f5 100644
--- a/include/linux/usb/typec_tbt.h
+++ b/include/linux/usb/typec_tbt.h
@@ -46,6 +46,7 @@ struct typec_thunderbolt_data {
 #define TBT_CABLE_OPTICAL		BIT(21)
 #define TBT_CABLE_RETIMER		BIT(22)
 #define TBT_CABLE_LINK_TRAINING		BIT(23)
+#define TBT_CABLE_ACTIVE_PASSIVE	BIT(25)
 
 #define TBT_SET_CABLE_SPEED(_s_)	(((_s_) & GENMASK(2, 0)) << 16)
 #define TBT_SET_CABLE_ROUNDED(_g_)	(((_g_) & GENMASK(1, 0)) << 19)
-- 
cgit v1.2.3


From f9ee6043283a78c84adf6ef3cef7a085eee4130f Mon Sep 17 00:00:00 2001
From: Utkarsh Patel <utkarsh.h.patel@intel.com>
Date: Tue, 19 Sep 2023 19:32:41 -0700
Subject: usb: pd: Add helper macro to get Type C cable speed

Added a helper macro to get the Type C cable speed when provided the
cable VDO.

Reviewed-by: Heikki Krogerus <heikki.krogerus@linux.intel.com>
Signed-off-by: Utkarsh Patel <utkarsh.h.patel@intel.com>
Link: https://lore.kernel.org/r/20230920023243.2494410-4-utkarsh.h.patel@intel.com
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 include/linux/usb/pd_vdo.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/usb/pd_vdo.h b/include/linux/usb/pd_vdo.h
index b057250704e8..3a747938cdab 100644
--- a/include/linux/usb/pd_vdo.h
+++ b/include/linux/usb/pd_vdo.h
@@ -376,6 +376,7 @@
 	 | ((vbm) & 0x3) << 9 | (sbu) << 8 | (sbut) << 7 | ((cur) & 0x3) << 5	\
 	 | (vbt) << 4 | (sopp) << 3 | ((spd) & 0x7))
 
+#define VDO_TYPEC_CABLE_SPEED(vdo)	((vdo) & 0x7)
 #define VDO_TYPEC_CABLE_TYPE(vdo)	(((vdo) >> 18) & 0x3)
 
 /*
-- 
cgit v1.2.3


From 1b8a62937e0b23c41956feec778ca7776a01df48 Mon Sep 17 00:00:00 2001
From: Linus Walleij <linus.walleij@linaro.org>
Date: Tue, 26 Sep 2023 15:25:31 +0200
Subject: ASoC: ti: Convert TWL4030 to use GPIO descriptors

The TWL4030 is actually only ever populated from the device tree,
so we can just pass the right device and headphone jack GPIO name
to snd_soc_jack_add_gpios() and it will pick the right GPIO right
from the device tree.

The platform data patch is unused (no in-tree users of the pdata
method) but these can use GPIO descriptor tables rather than global
GPIO numbers if they need this.

Signed-off-by: Linus Walleij <linus.walleij@linaro.org>
Acked-by: Jarkko Nikula <jarkko.nikula@bitmer.com>
Link: https://lore.kernel.org/r/20230926-descriptors-asoc-ti-v1-3-60cf4f8adbc5@linaro.org
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 include/linux/platform_data/omap-twl4030.h |  3 ---
 sound/soc/ti/omap-twl4030.c                | 20 ++++++++------------
 2 files changed, 8 insertions(+), 15 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/platform_data/omap-twl4030.h b/include/linux/platform_data/omap-twl4030.h
index 0dd851ea1c72..7fcb55fe21c9 100644
--- a/include/linux/platform_data/omap-twl4030.h
+++ b/include/linux/platform_data/omap-twl4030.h
@@ -37,9 +37,6 @@ struct omap_tw4030_pdata {
 	bool	has_digimic0;
 	bool	has_digimic1;
 	u8	has_linein;
-
-	/* Jack detect GPIO or  <= 0 if it is not implemented */
-	int jack_detect;
 };
 
 #endif /* _OMAP_TWL4030_H_ */
diff --git a/sound/soc/ti/omap-twl4030.c b/sound/soc/ti/omap-twl4030.c
index 950eec44503b..c7055bb424e6 100644
--- a/sound/soc/ti/omap-twl4030.c
+++ b/sound/soc/ti/omap-twl4030.c
@@ -20,8 +20,6 @@
 #include <linux/platform_data/omap-twl4030.h>
 #include <linux/module.h>
 #include <linux/of.h>
-#include <linux/gpio.h>
-#include <linux/of_gpio.h>
 
 #include <sound/core.h>
 #include <sound/pcm.h>
@@ -31,7 +29,6 @@
 #include "omap-mcbsp.h"
 
 struct omap_twl4030 {
-	int jack_detect;	/* board can detect jack events */
 	struct snd_soc_jack hs_jack;
 };
 
@@ -130,7 +127,7 @@ static struct snd_soc_jack_pin hs_jack_pins[] = {
 /* Headset jack detection gpios */
 static struct snd_soc_jack_gpio hs_jack_gpios[] = {
 	{
-		.name = "hsdet-gpio",
+		.name = "ti,jack-det",
 		.report = SND_JACK_HEADSET,
 		.debounce_time = 200,
 	},
@@ -151,9 +148,13 @@ static int omap_twl4030_init(struct snd_soc_pcm_runtime *rtd)
 	struct omap_twl4030 *priv = snd_soc_card_get_drvdata(card);
 	int ret = 0;
 
-	/* Headset jack detection only if it is supported */
-	if (priv->jack_detect > 0) {
-		hs_jack_gpios[0].gpio = priv->jack_detect;
+	/*
+	 * This is a bit of a hack, but the GPIO is optional so we
+	 * only want to add the jack detection if the GPIO is there.
+	 */
+	if (of_property_present(card->dev->of_node, "ti,jack-det-gpio")) {
+		hs_jack_gpios[0].gpiod_dev = card->dev;
+		hs_jack_gpios[0].idx = 0;
 
 		ret = snd_soc_card_jack_new_pins(rtd->card, "Headset Jack",
 						 SND_JACK_HEADSET,
@@ -279,9 +280,6 @@ static int omap_twl4030_probe(struct platform_device *pdev)
 			omap_twl4030_dai_links[1].platforms->of_node = dai_node;
 		}
 
-		priv->jack_detect = of_get_named_gpio(node,
-						      "ti,jack-det-gpio", 0);
-
 		/* Optional: audio routing can be provided */
 		prop = of_find_property(node, "ti,audio-routing", NULL);
 		if (prop) {
@@ -302,8 +300,6 @@ static int omap_twl4030_probe(struct platform_device *pdev)
 
 		if (!pdata->voice_connected)
 			card->num_links = 1;
-
-		priv->jack_detect = pdata->jack_detect;
 	} else {
 		dev_err(&pdev->dev, "Missing pdata\n");
 		return -ENODEV;
-- 
cgit v1.2.3


From 52e24f8c0a102ac76649c6b71224fadcc82bd5da Mon Sep 17 00:00:00 2001
From: Niklas Schnelle <schnelle@linux.ibm.com>
Date: Mon, 11 Sep 2023 14:56:52 +0200
Subject: usb: pci-quirks: handle HAS_IOPORT dependency for AMD quirk

In a future patch HAS_IOPORT=n will result in inb()/outb() and friends
not being declared. In the pci-quirks case the I/O port acceses are
used in the quirks for several AMD south bridges, Add a config option
for the AMD quirks to depend on HAS_IOPORT and #ifdef the quirk code.

Co-developed-by: Arnd Bergmann <arnd@kernel.org>
Signed-off-by: Arnd Bergmann <arnd@kernel.org>
Signed-off-by: Niklas Schnelle <schnelle@linux.ibm.com>
Link: https://lore.kernel.org/r/20230911125653.1393895-3-schnelle@linux.ibm.com
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/usb/Kconfig           | 10 ++++++++++
 drivers/usb/core/hcd-pci.c    |  3 +--
 drivers/usb/host/pci-quirks.c |  2 ++
 drivers/usb/host/pci-quirks.h | 30 ++++++++++++++++++++++--------
 include/linux/usb/hcd.h       | 17 +++++++++++++++++
 5 files changed, 52 insertions(+), 10 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/usb/Kconfig b/drivers/usb/Kconfig
index 7f33bcc315f2..abf8c6cdea9e 100644
--- a/drivers/usb/Kconfig
+++ b/drivers/usb/Kconfig
@@ -91,6 +91,16 @@ config USB_PCI
 	  If you have such a device you may say N here and PCI related code
 	  will not be built in the USB driver.
 
+config USB_PCI_AMD
+	bool "AMD PCI USB host support"
+	depends on USB_PCI && HAS_IOPORT
+	default X86 || MACH_LOONGSON64 || PPC_PASEMI
+	help
+	  Enable workarounds for USB implementation quirks in SB600/SB700/SB800
+	  and later south bridge implementations. These are common on x86 PCs
+	  with AMD CPUs but rarely used elsewhere, with the exception of a few
+	  powerpc and mips desktop machines.
+
 if USB
 
 source "drivers/usb/core/Kconfig"
diff --git a/drivers/usb/core/hcd-pci.c b/drivers/usb/core/hcd-pci.c
index 990280688b25..ee3156f49533 100644
--- a/drivers/usb/core/hcd-pci.c
+++ b/drivers/usb/core/hcd-pci.c
@@ -206,8 +206,7 @@ int usb_hcd_pci_probe(struct pci_dev *dev, const struct hc_driver *driver)
 		goto free_irq_vectors;
 	}
 
-	hcd->amd_resume_bug = (usb_hcd_amd_remote_wakeup_quirk(dev) &&
-			driver->flags & (HCD_USB11 | HCD_USB3)) ? 1 : 0;
+	hcd->amd_resume_bug = usb_hcd_amd_resume_bug(dev, driver);
 
 	if (driver->flags & HCD_MEMORY) {
 		/* EHCI, OHCI */
diff --git a/drivers/usb/host/pci-quirks.c b/drivers/usb/host/pci-quirks.c
index 5e06fad82a22..10813096d00c 100644
--- a/drivers/usb/host/pci-quirks.c
+++ b/drivers/usb/host/pci-quirks.c
@@ -76,6 +76,7 @@
 #define USB_INTEL_USB3_PSSEN   0xD8
 #define USB_INTEL_USB3PRM      0xDC
 
+#ifdef CONFIG_USB_PCI_AMD
 /* AMD quirk use */
 #define	AB_REG_BAR_LOW		0xe0
 #define	AB_REG_BAR_HIGH		0xe1
@@ -587,6 +588,7 @@ bool usb_amd_pt_check_port(struct device *device, int port)
 	return !(value & BIT(port_shift));
 }
 EXPORT_SYMBOL_GPL(usb_amd_pt_check_port);
+#endif /* CONFIG_USB_PCI_AMD */
 
 static int usb_asmedia_wait_write(struct pci_dev *pdev)
 {
diff --git a/drivers/usb/host/pci-quirks.h b/drivers/usb/host/pci-quirks.h
index cde2263a9d2e..a5230b0b9e91 100644
--- a/drivers/usb/host/pci-quirks.h
+++ b/drivers/usb/host/pci-quirks.h
@@ -2,7 +2,7 @@
 #ifndef __LINUX_USB_PCI_QUIRKS_H
 #define __LINUX_USB_PCI_QUIRKS_H
 
-#ifdef CONFIG_USB_PCI
+#ifdef CONFIG_USB_PCI_AMD
 int usb_hcd_amd_remote_wakeup_quirk(struct pci_dev *pdev);
 bool usb_amd_hang_symptom_quirk(void);
 bool usb_amd_prefetch_quirk(void);
@@ -12,23 +12,37 @@ void usb_amd_quirk_pll_disable(void);
 void usb_amd_quirk_pll_enable(void);
 void sb800_prefetch(struct device *dev, int on);
 bool usb_amd_pt_check_port(struct device *device, int port);
-
-void uhci_reset_hc(struct pci_dev *pdev, unsigned long base);
-int uhci_check_and_reset_hc(struct pci_dev *pdev, unsigned long base);
-void usb_asmedia_modifyflowcontrol(struct pci_dev *pdev);
-void usb_enable_intel_xhci_ports(struct pci_dev *xhci_pdev);
-void usb_disable_xhci_ports(struct pci_dev *xhci_pdev);
 #else
-struct pci_dev;
+static inline bool usb_amd_hang_symptom_quirk(void)
+{
+	return false;
+};
+static inline bool usb_amd_prefetch_quirk(void)
+{
+	return false;
+}
 static inline void usb_amd_quirk_pll_disable(void) {}
 static inline void usb_amd_quirk_pll_enable(void) {}
 static inline void usb_amd_dev_put(void) {}
+static inline bool usb_amd_quirk_pll_check(void)
+{
+	return false;
+}
 static inline void sb800_prefetch(struct device *dev, int on) {}
 static inline bool usb_amd_pt_check_port(struct device *device, int port)
 {
 	return false;
 }
+#endif /* CONFIG_USB_PCI_AMD */
 
+#ifdef CONFIG_USB_PCI
+void uhci_reset_hc(struct pci_dev *pdev, unsigned long base);
+int uhci_check_and_reset_hc(struct pci_dev *pdev, unsigned long base);
+void usb_asmedia_modifyflowcontrol(struct pci_dev *pdev);
+void usb_enable_intel_xhci_ports(struct pci_dev *xhci_pdev);
+void usb_disable_xhci_ports(struct pci_dev *xhci_pdev);
+#else
+struct pci_dev;
 static inline void usb_asmedia_modifyflowcontrol(struct pci_dev *pdev) {}
 static inline void usb_disable_xhci_ports(struct pci_dev *xhci_pdev) {}
 #endif  /* CONFIG_USB_PCI */
diff --git a/include/linux/usb/hcd.h b/include/linux/usb/hcd.h
index 61d4f0b793dc..00724b4f6e12 100644
--- a/include/linux/usb/hcd.h
+++ b/include/linux/usb/hcd.h
@@ -484,8 +484,25 @@ extern int usb_hcd_pci_probe(struct pci_dev *dev,
 extern void usb_hcd_pci_remove(struct pci_dev *dev);
 extern void usb_hcd_pci_shutdown(struct pci_dev *dev);
 
+#ifdef CONFIG_USB_PCI_AMD
 extern int usb_hcd_amd_remote_wakeup_quirk(struct pci_dev *dev);
 
+static inline bool usb_hcd_amd_resume_bug(struct pci_dev *dev,
+					  const struct hc_driver *driver)
+{
+	if (!usb_hcd_amd_remote_wakeup_quirk(dev))
+		return false;
+	if (driver->flags & (HCD_USB11 | HCD_USB3))
+		return true;
+	return false;
+}
+#else /* CONFIG_USB_PCI_AMD */
+static inline bool usb_hcd_amd_resume_bug(struct pci_dev *dev,
+					  const struct hc_driver *driver)
+{
+	return false;
+}
+#endif
 extern const struct dev_pm_ops usb_hcd_pci_pm_ops;
 #endif /* CONFIG_USB_PCI */
 
-- 
cgit v1.2.3


From 568441b7d45fc7198a89b522d721428f2005c356 Mon Sep 17 00:00:00 2001
From: Heikki Krogerus <heikki.krogerus@linux.intel.com>
Date: Mon, 2 Oct 2023 17:22:40 +0300
Subject: usb: pd: Exposing the Peak Current value of Fixed Supplies to user
 space

Exposing the value of the field as is.

The Peak Current value has to be interpreted as described
in Table 6-10 (Fixed Power Source Peak Current Capability)
of the USB Power Delivery Specification, but that
interpretation will be done in user space, not in kernel.

Suggested-by: Douglas Gilbert <dgilbert@interlog.com>
Reviewed-by: Guenter Roeck <linux@roeck-us.net>
Signed-off-by: Heikki Krogerus <heikki.krogerus@linux.intel.com>
Link: https://lore.kernel.org/r/20231002142240.2641962-1-heikki.krogerus@linux.intel.com
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 Documentation/ABI/testing/sysfs-class-usb_power_delivery |  7 +++++++
 drivers/usb/typec/pd.c                                   | 10 ++++------
 include/linux/usb/pd.h                                   |  1 +
 3 files changed, 12 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/ABI/testing/sysfs-class-usb_power_delivery b/Documentation/ABI/testing/sysfs-class-usb_power_delivery
index 1bf9d1d7902c..61d233c320ea 100644
--- a/Documentation/ABI/testing/sysfs-class-usb_power_delivery
+++ b/Documentation/ABI/testing/sysfs-class-usb_power_delivery
@@ -124,6 +124,13 @@ Contact:	Heikki Krogerus <heikki.krogerus@linux.intel.com>
 Description:
 		The voltage the supply supports in millivolts.
 
+What:		/sys/class/usb_power_delivery/.../source-capabilities/<position>:fixed_supply/peak_current
+Date:		October 2023
+Contact:	Heikki Krogerus <heikki.krogerus@linux.intel.com>
+Description:
+		This file shows the value of the Fixed Power Source Peak Current
+		Capability field.
+
 What:		/sys/class/usb_power_delivery/.../source-capabilities/<position>:fixed_supply/maximum_current
 Date:		May 2022
 Contact:	Heikki Krogerus <heikki.krogerus@linux.intel.com>
diff --git a/drivers/usb/typec/pd.c b/drivers/usb/typec/pd.c
index 8cc66e4467c4..85d015cdbe1f 100644
--- a/drivers/usb/typec/pd.c
+++ b/drivers/usb/typec/pd.c
@@ -83,14 +83,12 @@ unchunked_extended_messages_supported_show(struct device *dev,
 }
 static DEVICE_ATTR_RO(unchunked_extended_messages_supported);
 
-/*
- * REVISIT: Peak Current requires access also to the RDO.
 static ssize_t
 peak_current_show(struct device *dev, struct device_attribute *attr, char *buf)
 {
-	...
+	return sysfs_emit(buf, "%u\n", (to_pdo(dev)->pdo >> PDO_FIXED_PEAK_CURR_SHIFT) & 3);
 }
-*/
+static DEVICE_ATTR_RO(peak_current);
 
 static ssize_t
 fast_role_swap_current_show(struct device *dev, struct device_attribute *attr, char *buf)
@@ -135,7 +133,7 @@ static struct attribute *source_fixed_supply_attrs[] = {
 	&dev_attr_usb_communication_capable.attr,
 	&dev_attr_dual_role_data.attr,
 	&dev_attr_unchunked_extended_messages_supported.attr,
-	/*&dev_attr_peak_current.attr,*/
+	&dev_attr_peak_current.attr,
 	&dev_attr_voltage.attr,
 	&maximum_current_attr.attr,
 	NULL
@@ -144,7 +142,7 @@ static struct attribute *source_fixed_supply_attrs[] = {
 static umode_t fixed_attr_is_visible(struct kobject *kobj, struct attribute *attr, int n)
 {
 	if (to_pdo(kobj_to_dev(kobj))->object_position &&
-	    /*attr != &dev_attr_peak_current.attr &&*/
+	    attr != &dev_attr_peak_current.attr &&
 	    attr != &dev_attr_voltage.attr &&
 	    attr != &maximum_current_attr.attr &&
 	    attr != &operational_current_attr.attr)
diff --git a/include/linux/usb/pd.h b/include/linux/usb/pd.h
index c59fb79a42e8..eb626af0e4e7 100644
--- a/include/linux/usb/pd.h
+++ b/include/linux/usb/pd.h
@@ -228,6 +228,7 @@ enum pd_pdo_type {
 #define PDO_FIXED_UNCHUNK_EXT		BIT(24) /* Unchunked Extended Message supported (Source) */
 #define PDO_FIXED_FRS_CURR_MASK		(BIT(24) | BIT(23)) /* FR_Swap Current (Sink) */
 #define PDO_FIXED_FRS_CURR_SHIFT	23
+#define PDO_FIXED_PEAK_CURR_SHIFT	20
 #define PDO_FIXED_VOLT_SHIFT		10	/* 50mV units */
 #define PDO_FIXED_CURR_SHIFT		0	/* 10mA units */
 
-- 
cgit v1.2.3


From 3551ff7c5cfff4dc27fdcd14fa286edc08d78088 Mon Sep 17 00:00:00 2001
From: Andrey Konovalov <andreyknvl@gmail.com>
Date: Mon, 28 Aug 2023 17:43:02 +0200
Subject: usb: gadget: clarify usage of USB_GADGET_DELAYED_STATUS

USB_GADGET_DELAYED_STATUS was introduced in commit 1b9ba000177e ("usb:
gadget: composite: Allow function drivers to pause control transfers").
It was initially intended for the composite framework to allow delaying
completing the status stage of a SET_CONFIGURATION request until all
functions are ready.

Unfortunately, that commit had an unintended side-effect of returning
USB_GADGET_DELAYED_STATUS from the ->setup() call of the composite
framework gadget driver.

As a result of this and the incomplete documentation, some UDC drivers
started relying on USB_GADGET_DELAYED_STATUS to decide when to avoid
autocompleting the status stage for 0-length control transfers. dwc3 was
the first in commit 5bdb1dcc6330 ("usb: dwc3: ep0: handle delayed_status
again"). And a number of other UDC drivers followed later, probably
relying on the dwc3 behavior as a reference.

Unfortunately, this violated the interface between the UDC and the
gadget driver for 0-length control transfers: the UDC driver must only
proceed with the status stage for a 0-length control transfer once the
gadget driver queued a response to EP0.

As a result, a few gadget drivers are partially broken when used with
a UDC that only delays the status stage for 0-length transfers when
USB_GADGET_DELAYED_STATUS is returned from the setup() callback.

This includes Raw Gadget and GadgetFS. For FunctionFS, a workaround was
added in commit 946ef68ad4e4 ("usb: gadget: ffs: Let setup() return
USB_GADGET_DELAYED_STATUS") and commit 4d644abf2569 ("usb: gadget: f_fs:
Only return delayed status when len is 0").

The proper solution to this issue would be to contain
USB_GADGET_DELAYED_STATUS within the composite framework and make all
UDC drivers to not complete the status stage for 0-length requests on
their own.

Unfortunately, there is quite a few UDC drivers that need to get fixed
and the required changes for some of them are not trivial.

For now, update the comments to clarify that USB_GADGET_DELAYED_STATUS
must not be used by the UDC drivers.

The following two commits also add workarounds to Raw Gadget and GadgetFS
to make them compatible with the broken UDC drivers until they are fixed.

Acked-by: Alan Stern <stern@rowland.harvard.edu>
Signed-off-by: Andrey Konovalov <andreyknvl@gmail.com>
Link: https://lore.kernel.org/r/7f0ee06c68c7241c844cd50f8565fdd5ead79b1b.1693237258.git.andreyknvl@gmail.com
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 include/linux/usb/composite.h | 8 ++++++++
 include/linux/usb/gadget.h    | 9 +++++++++
 2 files changed, 17 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/usb/composite.h b/include/linux/usb/composite.h
index 6014340ba980..af3cd2aae4bc 100644
--- a/include/linux/usb/composite.h
+++ b/include/linux/usb/composite.h
@@ -35,6 +35,14 @@
  * are ready. The control transfer will then be kept from completing till
  * all the function drivers that requested for USB_GADGET_DELAYED_STAUS
  * invoke usb_composite_setup_continue().
+ *
+ * NOTE: USB_GADGET_DELAYED_STATUS must not be used in UDC drivers: they
+ * must delay completing the status stage for 0-length control transfers
+ * regardless of the whether USB_GADGET_DELAYED_STATUS is returned from
+ * the gadget driver's setup() callback.
+ * Currently, a number of UDC drivers rely on USB_GADGET_DELAYED_STATUS,
+ * which is a bug. These drivers must be fixed and USB_GADGET_DELAYED_STATUS
+ * must be contained within the composite framework.
  */
 #define USB_GADGET_DELAYED_STATUS       0x7fff	/* Impossibly large value */
 
diff --git a/include/linux/usb/gadget.h b/include/linux/usb/gadget.h
index 75bda0783395..6532beb587b1 100644
--- a/include/linux/usb/gadget.h
+++ b/include/linux/usb/gadget.h
@@ -711,6 +711,15 @@ static inline int usb_gadget_check_config(struct usb_gadget *gadget)
  * get_interface.  Setting a configuration (or interface) is where
  * endpoints should be activated or (config 0) shut down.
  *
+ * The gadget driver's setup() callback does not have to queue a response to
+ * ep0 within the setup() call, the driver can do it after setup() returns.
+ * The UDC driver must wait until such a response is queued before proceeding
+ * with the data/status stages of the control transfer.
+ *
+ * NOTE: Currently, a number of UDC drivers rely on USB_GADGET_DELAYED_STATUS
+ * being returned from the setup() callback, which is a bug. See the comment
+ * next to USB_GADGET_DELAYED_STATUS for details.
+ *
  * (Note that only the default control endpoint is supported.  Neither
  * hosts nor devices generally support control traffic except to ep0.)
  *
-- 
cgit v1.2.3


From 5234193ee2b997e59326b047610a8f3f64a0ce02 Mon Sep 17 00:00:00 2001
From: Kees Cook <keescook@chromium.org>
Date: Fri, 15 Sep 2023 13:15:17 -0700
Subject: ceph: Annotate struct ceph_osd_request with __counted_by

Prepare for the coming implementation by GCC and Clang of the __counted_by
attribute. Flexible array members annotated with __counted_by can have
their accesses bounds-checked at run-time checking via CONFIG_UBSAN_BOUNDS
(for array indexing) and CONFIG_FORTIFY_SOURCE (for strcpy/memcpy-family
functions).

As found with Coccinelle[1], add __counted_by for struct ceph_osd_request.

[1] https://github.com/kees/kernel-tools/blob/trunk/coccinelle/examples/counted_by.cocci

Cc: Ilya Dryomov <idryomov@gmail.com>
Cc: Xiubo Li <xiubli@redhat.com>
Cc: Jeff Layton <jlayton@kernel.org>
Cc: ceph-devel@vger.kernel.org
Reviewed-by: "Gustavo A. R. Silva" <gustavoars@kernel.org>
Reviewed-by: Xiubo Li <xiubli@redhat.com>
Link: https://lore.kernel.org/r/20230915201517.never.373-kees@kernel.org
Signed-off-by: Kees Cook <keescook@chromium.org>
---
 include/linux/ceph/osd_client.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h
index bf9823956758..b8610e9d2471 100644
--- a/include/linux/ceph/osd_client.h
+++ b/include/linux/ceph/osd_client.h
@@ -278,7 +278,7 @@ struct ceph_osd_request {
 	int r_attempts;
 	u32 r_map_dne_bound;
 
-	struct ceph_osd_req_op r_ops[];
+	struct ceph_osd_req_op r_ops[] __counted_by(r_num_ops);
 };
 
 struct ceph_request_redirect {
-- 
cgit v1.2.3


From 210d4e9c732fa87b584af72faae96c037d9d7957 Mon Sep 17 00:00:00 2001
From: Kees Cook <keescook@chromium.org>
Date: Fri, 22 Sep 2023 10:28:44 -0700
Subject: ipv4/igmp: Annotate struct ip_sf_socklist with __counted_by

Prepare for the coming implementation by GCC and Clang of the __counted_by
attribute. Flexible array members annotated with __counted_by can have
their accesses bounds-checked at run-time checking via CONFIG_UBSAN_BOUNDS
(for array indexing) and CONFIG_FORTIFY_SOURCE (for strcpy/memcpy-family
functions).

As found with Coccinelle[1], add __counted_by for struct ip_sf_socklist.

[1] https://github.com/kees/kernel-tools/blob/trunk/coccinelle/examples/counted_by.cocci

Cc: Martin KaFai Lau <martin.lau@kernel.org>
Cc: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: Kees Cook <keescook@chromium.org>
Reviewed-by: Gustavo A. R. Silva <gustavoars@kernel.org>
Link: https://lore.kernel.org/r/20230922172858.3822653-2-keescook@chromium.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/linux/igmp.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/igmp.h b/include/linux/igmp.h
index ebf4349a53af..5171231f70a8 100644
--- a/include/linux/igmp.h
+++ b/include/linux/igmp.h
@@ -39,7 +39,7 @@ struct ip_sf_socklist {
 	unsigned int		sl_max;
 	unsigned int		sl_count;
 	struct rcu_head		rcu;
-	__be32			sl_addr[];
+	__be32			sl_addr[] __counted_by(sl_max);
 };
 
 #define IP_SFBLOCK	10	/* allocate this many at once */
-- 
cgit v1.2.3


From b7768e67af9a5b6d6101cbfc146969fedf8df4be Mon Sep 17 00:00:00 2001
From: Chen-Yu Tsai <wenst@chromium.org>
Date: Thu, 28 Sep 2023 16:55:33 +0800
Subject: regulator: mt6358: Add missing regulators for MT6366

When support for the MT6366 PMIC regulators was added, it was assumed
that it had the same functionality as MT6358. In reality there are
differences. A few regulators have different ranges, or were renamed
and repurposed, or removed altogether.

Add the 3 regulators that were missing from the original submission.
These are added for completeness. VSRAM_CORE is not used in existing
projects. VM18 and VMDDR feed DRAM related consumers, and are not used
in-kernel.

Signed-off-by: Chen-Yu Tsai <wenst@chromium.org>
Reviewed-by: AngeloGioacchino Del Regno <angelogioacchino.delregno@collabora.com>
Acked-by: Lee Jones <lee@kernel.org>
Link: https://lore.kernel.org/r/20230928085537.3246669-11-wenst@chromium.org
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 drivers/regulator/mt6358-regulator.c       | 20 ++++++++++++++++++++
 include/linux/mfd/mt6358/registers.h       | 17 +++++++++++++++++
 include/linux/regulator/mt6358-regulator.h |  3 +++
 3 files changed, 40 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/regulator/mt6358-regulator.c b/drivers/regulator/mt6358-regulator.c
index 9301491e5e4a..54a4028ded8e 100644
--- a/drivers/regulator/mt6358-regulator.c
+++ b/drivers/regulator/mt6358-regulator.c
@@ -325,6 +325,20 @@ static const struct linear_range vldo28_ranges[] = {
 	REGULATOR_LINEAR_RANGE(3000000, 0, 10, 10000),
 };
 
+static const unsigned int mt6366_vmddr_selectors[] = { 0, 1, 2, 3, 4, 5, 6, 7, 9, 12 };
+static const struct linear_range mt6366_vmddr_ranges[] = {
+	REGULATOR_LINEAR_RANGE(600000, 0, 10, 10000),
+	REGULATOR_LINEAR_RANGE(700000, 0, 10, 10000),
+	REGULATOR_LINEAR_RANGE(800000, 0, 10, 10000),
+	REGULATOR_LINEAR_RANGE(900000, 0, 10, 10000),
+	REGULATOR_LINEAR_RANGE(1000000, 0, 10, 10000),
+	REGULATOR_LINEAR_RANGE(1100000, 0, 10, 10000),
+	REGULATOR_LINEAR_RANGE(1200000, 0, 10, 10000),
+	REGULATOR_LINEAR_RANGE(1300000, 0, 10, 10000),
+	REGULATOR_LINEAR_RANGE(1500000, 0, 10, 10000),
+	REGULATOR_LINEAR_RANGE(1800000, 0, 10, 10000),
+};
+
 static const unsigned int mt6366_vcn18_vm18_selectors[] = {
 	0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 };
 static const struct linear_range mt6366_vcn18_vm18_ranges[] = {
@@ -602,6 +616,10 @@ static const struct mt6358_regulator_info mt6366_regulators[] = {
 		   MT6358_LDO_VSIM2_CON0, 0, MT6358_VSIM2_ANA_CON0, 0xf00),
 	MT6366_LDO("vcn18", VCN18, mt6366_vcn18_vm18,
 		   MT6358_LDO_VCN18_CON0, 0, MT6358_VCN18_ANA_CON0, 0xf00),
+	MT6366_LDO("vm18", VM18, mt6366_vcn18_vm18,
+		   MT6358_LDO_VM18_CON0, 0, MT6358_VM18_ANA_CON0, 0xf00),
+	MT6366_LDO("vmddr", VMDDR, mt6366_vmddr,
+		   MT6358_LDO_VMDDR_CON0, 0, MT6358_VMDDR_ANA_CON0, 0xf00),
 	MT6366_LDO1("vsram-proc11", VSRAM_PROC11, 500000, 1293750, 6250,
 		    MT6358_LDO_VSRAM_PROC11_DBG0, 0x7f00, MT6358_LDO_VSRAM_CON0, 0x7f),
 	MT6366_LDO1("vsram-others", VSRAM_OTHERS, 500000, 1293750, 6250,
@@ -610,6 +628,8 @@ static const struct mt6358_regulator_info mt6366_regulators[] = {
 		    MT6358_LDO_VSRAM_GPU_DBG0, 0x7f00, MT6358_LDO_VSRAM_CON3, 0x7f),
 	MT6366_LDO1("vsram-proc12", VSRAM_PROC12, 500000, 1293750, 6250,
 		    MT6358_LDO_VSRAM_PROC12_DBG0, 0x7f00, MT6358_LDO_VSRAM_CON1, 0x7f),
+	MT6366_LDO1("vsram-core", VSRAM_CORE, 500000, 1293750, 6250,
+		    MT6358_LDO_VSRAM_CORE_DBG0, 0x7f00, MT6358_LDO_VSRAM_CON5, 0x7f),
 };
 
 static int mt6358_sync_vcn33_setting(struct device *dev)
diff --git a/include/linux/mfd/mt6358/registers.h b/include/linux/mfd/mt6358/registers.h
index 5ea2590be710..d83e87298ac4 100644
--- a/include/linux/mfd/mt6358/registers.h
+++ b/include/linux/mfd/mt6358/registers.h
@@ -294,4 +294,21 @@
 #define MT6358_AUD_TOP_INT_CON0               0x2228
 #define MT6358_AUD_TOP_INT_STATUS0            0x2234
 
+/*
+ * MT6366 has no VCAM*, but has other regulators in its place. The names
+ * keep the MT6358 prefix for ease of use in the regulator driver.
+ */
+#define MT6358_LDO_VSRAM_CON5                 0x1bf8
+#define MT6358_LDO_VM18_CON0                  MT6358_LDO_VCAMA1_CON0
+#define MT6358_LDO_VM18_CON1                  MT6358_LDO_VCAMA1_CON1
+#define MT6358_LDO_VM18_CON2                  MT6358_LDO_VCAMA1_CON2
+#define MT6358_LDO_VMDDR_CON0                 MT6358_LDO_VCAMA2_CON0
+#define MT6358_LDO_VMDDR_CON1                 MT6358_LDO_VCAMA2_CON1
+#define MT6358_LDO_VMDDR_CON2                 MT6358_LDO_VCAMA2_CON2
+#define MT6358_LDO_VSRAM_CORE_CON0            MT6358_LDO_VCAMD_CON0
+#define MT6358_LDO_VSRAM_CORE_DBG0            0x1cb6
+#define MT6358_LDO_VSRAM_CORE_DBG1            0x1cb8
+#define MT6358_VM18_ANA_CON0                  MT6358_VCAMA1_ANA_CON0
+#define MT6358_VMDDR_ANA_CON0                 MT6358_VCAMD_ANA_CON0
+
 #endif /* __MFD_MT6358_REGISTERS_H__ */
diff --git a/include/linux/regulator/mt6358-regulator.h b/include/linux/regulator/mt6358-regulator.h
index c71a6a9fce7a..562386f9b80e 100644
--- a/include/linux/regulator/mt6358-regulator.h
+++ b/include/linux/regulator/mt6358-regulator.h
@@ -86,6 +86,9 @@ enum {
 	MT6366_ID_VMC,
 	MT6366_ID_VAUD28,
 	MT6366_ID_VSIM2,
+	MT6366_ID_VM18,
+	MT6366_ID_VMDDR,
+	MT6366_ID_VSRAM_CORE,
 	MT6366_ID_RG_MAX,
 };
 
-- 
cgit v1.2.3


From d844fe65f0957024c3e1b0bf2a0615246184d9bc Mon Sep 17 00:00:00 2001
From: Kir Kolyshkin <kolyshkin@gmail.com>
Date: Mon, 7 Aug 2023 20:03:57 -0700
Subject: sched/headers: Move 'struct sched_param' out of uapi, to work around
 glibc/musl breakage
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Both glibc and musl define 'struct sched_param' in sched.h, while kernel
has it in uapi/linux/sched/types.h, making it cumbersome to use
sched_getattr(2) or sched_setattr(2) from userspace.

For example, something like this:

	#include <sched.h>
	#include <linux/sched/types.h>

	struct sched_attr sa;

will result in "error: redefinition of ‘struct sched_param’" (note the
code doesn't need sched_param at all -- it needs struct sched_attr
plus some stuff from sched.h).

The situation is, glibc is not going to provide a wrapper for
sched_{get,set}attr, thus the need to include linux/sched_types.h
directly, which leads to the above problem.

Thus, the userspace is left with a few sub-par choices when it wants to
use e.g. sched_setattr(2), such as maintaining a copy of struct
sched_attr definition, or using some other ugly tricks.

OTOH, 'struct sched_param' is well known, defined in POSIX, and it won't
be ever changed (as that would break backward compatibility).

So, while 'struct sched_param' is indeed part of the kernel uapi,
exposing it the way it's done now creates an issue, and hiding it
(like this patch does) fixes that issue, hopefully without creating
another one: common userspace software rely on libc headers, and as
for "special" software (like libc), it looks like glibc and musl
do not rely on kernel headers for 'struct sched_param' definition
(but let's Cc their mailing lists in case it's otherwise).

The alternative to this patch would be to move struct sched_attr to,
say, linux/sched.h, or linux/sched/attr.h (the new file).

Oh, and here is the previous attempt to fix the issue:

  https://lore.kernel.org/all/20200528135552.GA87103@google.com/

While I support Linus arguments, the issue is still here
and needs to be fixed.

[ mingo: Linus is right, this shouldn't be needed - but on the other
         hand I agree that this header is not really helpful to
	 user-space as-is. So let's pretend that
	 <uapi/linux/sched/types.h> is only about sched_attr, and
	 call this commit a workaround for user-space breakage
	 that it in reality is ... Also, remove the Fixes tag. ]

Signed-off-by: Kir Kolyshkin <kolyshkin@gmail.com>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Link: https://lore.kernel.org/r/20230808030357.1213829-1-kolyshkin@gmail.com
---
 include/linux/sched.h            | 5 ++++-
 include/uapi/linux/sched/types.h | 4 ----
 2 files changed, 4 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index dc37ae787e33..e4235bbfad77 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -63,7 +63,6 @@ struct robust_list_head;
 struct root_domain;
 struct rq;
 struct sched_attr;
-struct sched_param;
 struct seq_file;
 struct sighand_struct;
 struct signal_struct;
@@ -370,6 +369,10 @@ extern struct root_domain def_root_domain;
 extern struct mutex sched_domains_mutex;
 #endif
 
+struct sched_param {
+	int sched_priority;
+};
+
 struct sched_info {
 #ifdef CONFIG_SCHED_INFO
 	/* Cumulative counters: */
diff --git a/include/uapi/linux/sched/types.h b/include/uapi/linux/sched/types.h
index f2c4589d4dbf..90662385689b 100644
--- a/include/uapi/linux/sched/types.h
+++ b/include/uapi/linux/sched/types.h
@@ -4,10 +4,6 @@
 
 #include <linux/types.h>
 
-struct sched_param {
-	int sched_priority;
-};
-
 #define SCHED_ATTR_SIZE_VER0	48	/* sizeof first published struct */
 #define SCHED_ATTR_SIZE_VER1	56	/* add: util_{min,max} */
 
-- 
cgit v1.2.3


From affccb16c117d188eb09495cbdea149cecbf00b9 Mon Sep 17 00:00:00 2001
From: Niklas Cassel <niklas.cassel@wdc.com>
Date: Wed, 6 Sep 2023 11:22:32 +0200
Subject: ata: ahci: print the lpm policy on boot

The target LPM policy can be set using either a Kconfig or a kernel module
parameter.

However, if the board type is set to anything but board_ahci_low_power,
then the LPM policy will overridden and set to ATA_LPM_UNKNOWN.

Additionally, if the default suspend is suspend to idle, depending on the
hardware capabilities of the HBA, ahci_update_initial_lpm_policy() might
override the LPM policy to either ATA_LPM_MIN_POWER_WITH_PARTIAL or
ATA_LPM_MIN_POWER.

All this means that it is very hard to know which LPM policy a user will
actually be using on a given system.

In order to make it easier to debug LPM related issues, print the LPM
policy on boot.

One common LPM related issue is that the device fails to link up.
Because of that, we cannot add this print to ata_dev_configure(), as that
function is only called after a successful link up. Instead, add the info
using ata_port_desc(), with the help of a new ata_port_desc_misc() helper.
The port description is printed once per port during boot.

Before changes:
ata1: SATA max UDMA/133 abar m524288@0xa5780000 port 0xa5780100 irq 170
ata2: SATA max UDMA/133 abar m524288@0xa5780000 port 0xa5780180 irq 170

After changes:
ata1: SATA max UDMA/133 abar m524288@0xa5780000 port 0xa5780100 irq 170 lpm-pol 4
ata2: SATA max UDMA/133 abar m524288@0xa5780000 port 0xa5780180 irq 170 lpm-pol 4

Signed-off-by: Niklas Cassel <niklas.cassel@wdc.com>
Signed-off-by: Damien Le Moal <dlemoal@kernel.org>
---
 drivers/ata/libahci.c     |  2 +-
 drivers/ata/libata-core.c |  2 +-
 drivers/ata/libata-sff.c  | 10 +++++-----
 drivers/ata/pata_cs5520.c |  2 +-
 include/linux/libata.h    |  5 +++++
 5 files changed, 13 insertions(+), 8 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/ata/libahci.c b/drivers/ata/libahci.c
index f1263364fa97..1a63200ea437 100644
--- a/drivers/ata/libahci.c
+++ b/drivers/ata/libahci.c
@@ -2730,7 +2730,7 @@ static int ahci_host_activate_multi_irqs(struct ata_host *host,
 
 		if (rc)
 			return rc;
-		ata_port_desc(host->ports[i], "irq %d", irq);
+		ata_port_desc_misc(host->ports[i], irq);
 	}
 
 	return ata_host_register(host, sht);
diff --git a/drivers/ata/libata-core.c b/drivers/ata/libata-core.c
index d8cc1e27a125..0f63f805c78c 100644
--- a/drivers/ata/libata-core.c
+++ b/drivers/ata/libata-core.c
@@ -6026,7 +6026,7 @@ int ata_host_activate(struct ata_host *host, int irq,
 		return rc;
 
 	for (i = 0; i < host->n_ports; i++)
-		ata_port_desc(host->ports[i], "irq %d", irq);
+		ata_port_desc_misc(host->ports[i], irq);
 
 	rc = ata_host_register(host, sht);
 	/* if failed, just free the IRQ and leave ports alone */
diff --git a/drivers/ata/libata-sff.c b/drivers/ata/libata-sff.c
index 8fcc622fcb3d..95a19c4ef2a1 100644
--- a/drivers/ata/libata-sff.c
+++ b/drivers/ata/libata-sff.c
@@ -2316,7 +2316,7 @@ int ata_pci_sff_activate_host(struct ata_host *host,
 		for (i = 0; i < 2; i++) {
 			if (ata_port_is_dummy(host->ports[i]))
 				continue;
-			ata_port_desc(host->ports[i], "irq %d", pdev->irq);
+			ata_port_desc_misc(host->ports[i], pdev->irq);
 		}
 	} else if (legacy_mode) {
 		if (!ata_port_is_dummy(host->ports[0])) {
@@ -2326,8 +2326,8 @@ int ata_pci_sff_activate_host(struct ata_host *host,
 			if (rc)
 				goto out;
 
-			ata_port_desc(host->ports[0], "irq %d",
-				      ATA_PRIMARY_IRQ(pdev));
+			ata_port_desc_misc(host->ports[0],
+					   ATA_PRIMARY_IRQ(pdev));
 		}
 
 		if (!ata_port_is_dummy(host->ports[1])) {
@@ -2337,8 +2337,8 @@ int ata_pci_sff_activate_host(struct ata_host *host,
 			if (rc)
 				goto out;
 
-			ata_port_desc(host->ports[1], "irq %d",
-				      ATA_SECONDARY_IRQ(pdev));
+			ata_port_desc_misc(host->ports[1],
+					   ATA_SECONDARY_IRQ(pdev));
 		}
 	}
 
diff --git a/drivers/ata/pata_cs5520.c b/drivers/ata/pata_cs5520.c
index 422d42761a1d..38795508c2e9 100644
--- a/drivers/ata/pata_cs5520.c
+++ b/drivers/ata/pata_cs5520.c
@@ -212,7 +212,7 @@ static int cs5520_init_one(struct pci_dev *pdev, const struct pci_device_id *id)
 		if (rc)
 			return rc;
 
-		ata_port_desc(ap, "irq %d", irq[i]);
+		ata_port_desc_misc(ap, irq[i]);
 	}
 
 	return ata_host_register(host, &cs5520_sht);
diff --git a/include/linux/libata.h b/include/linux/libata.h
index 2a7d2af0ed80..54a217868ad0 100644
--- a/include/linux/libata.h
+++ b/include/linux/libata.h
@@ -1561,6 +1561,11 @@ void ata_port_desc(struct ata_port *ap, const char *fmt, ...);
 extern void ata_port_pbar_desc(struct ata_port *ap, int bar, ssize_t offset,
 			       const char *name);
 #endif
+static inline void ata_port_desc_misc(struct ata_port *ap, int irq)
+{
+	ata_port_desc(ap, "irq %d", irq);
+	ata_port_desc(ap, "lpm-pol %d", ap->target_lpm_policy);
+}
 
 static inline bool ata_tag_internal(unsigned int tag)
 {
-- 
cgit v1.2.3


From 0e19548145d863c48c6562add768a05231d768c9 Mon Sep 17 00:00:00 2001
From: Kees Cook <keescook@chromium.org>
Date: Fri, 22 Sep 2023 10:52:11 -0700
Subject: ata: libata: Annotate struct ata_cpr_log with __counted_by

Prepare for the coming implementation by GCC and Clang of the __counted_by
attribute. Flexible array members annotated with __counted_by can have
their accesses bounds-checked at run-time checking via CONFIG_UBSAN_BOUNDS
(for array indexing) and CONFIG_FORTIFY_SOURCE (for strcpy/memcpy-family
functions).

As found with Coccinelle[1], add __counted_by for struct ata_cpr_log.

[1] https://github.com/kees/kernel-tools/blob/trunk/coccinelle/examples/counted_by.cocci

Cc: Damien Le Moal <dlemoal@kernel.org>
Cc: linux-ide@vger.kernel.org
Signed-off-by: Kees Cook <keescook@chromium.org>
Reviewed-by: Gustavo A. R. Silva <gustavoars@kernel.org>
Signed-off-by: Damien Le Moal <dlemoal@kernel.org>
---
 include/linux/libata.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/libata.h b/include/linux/libata.h
index 54a217868ad0..b39891320271 100644
--- a/include/linux/libata.h
+++ b/include/linux/libata.h
@@ -656,7 +656,7 @@ struct ata_cpr {
 
 struct ata_cpr_log {
 	u8			nr_cpr;
-	struct ata_cpr		cpr[];
+	struct ata_cpr		cpr[] __counted_by(nr_cpr);
 };
 
 struct ata_device {
-- 
cgit v1.2.3


From 1b947279798fd51bef46e8241102f5b896add021 Mon Sep 17 00:00:00 2001
From: Damien Le Moal <dlemoal@kernel.org>
Date: Mon, 28 Aug 2023 13:10:28 +0900
Subject: ata: libata: Cleanup inline DMA helper functions

Simplify the inline DMA helper functions ata_using_mwdma(),
ata_using_udma() and ata_dma_enabled() to directly return as a boolean
the result of their test condition.

Signed-off-by: Damien Le Moal <dlemoal@kernel.org>
Reviewed-by: Hannes Reinecke <hare@suse.de>
Tested-by: Chia-Lin Kao (AceLan) <acelan.kao@canonical.com>
Tested-by: Geert Uytterhoeven <geert+renesas@glider.be>
Reviewed-by: Martin K. Petersen <martin.petersen@oracle.com>
---
 include/linux/libata.h | 18 ++++++++----------
 1 file changed, 8 insertions(+), 10 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/libata.h b/include/linux/libata.h
index b39891320271..1dbb14daccfa 100644
--- a/include/linux/libata.h
+++ b/include/linux/libata.h
@@ -1886,23 +1886,21 @@ static inline unsigned long ata_deadline(unsigned long from_jiffies,
    change in future hardware and specs, secondly 0xFF means 'no DMA' but is
    > UDMA_0. Dyma ddreigiau */
 
-static inline int ata_using_mwdma(struct ata_device *adev)
+static inline bool ata_using_mwdma(struct ata_device *adev)
 {
-	if (adev->dma_mode >= XFER_MW_DMA_0 && adev->dma_mode <= XFER_MW_DMA_4)
-		return 1;
-	return 0;
+	return adev->dma_mode >= XFER_MW_DMA_0 &&
+		adev->dma_mode <= XFER_MW_DMA_4;
 }
 
-static inline int ata_using_udma(struct ata_device *adev)
+static inline bool ata_using_udma(struct ata_device *adev)
 {
-	if (adev->dma_mode >= XFER_UDMA_0 && adev->dma_mode <= XFER_UDMA_7)
-		return 1;
-	return 0;
+	return adev->dma_mode >= XFER_UDMA_0 &&
+		adev->dma_mode <= XFER_UDMA_7;
 }
 
-static inline int ata_dma_enabled(struct ata_device *adev)
+static inline bool ata_dma_enabled(struct ata_device *adev)
 {
-	return (adev->dma_mode == 0xFF ? 0 : 1);
+	return adev->dma_mode != 0xFF;
 }
 
 /**************************************************************************
-- 
cgit v1.2.3


From c2a36609dab3b7c937ab95bfb8b98e72391f772e Mon Sep 17 00:00:00 2001
From: "Jiri Slaby (SUSE)" <jirislaby@kernel.org>
Date: Tue, 19 Sep 2023 10:51:47 +0200
Subject: tty: switch tty_{,un}throttle_safe() to return a bool

They return 0 or 1 -- a boolean value, so make it clear than noone
should expect negative or other values.

Signed-off-by: "Jiri Slaby (SUSE)" <jirislaby@kernel.org>
Link: https://lore.kernel.org/r/20230919085156.1578-7-jirislaby@kernel.org
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/tty/tty_ioctl.c | 18 ++++++++----------
 include/linux/tty.h     |  4 ++--
 2 files changed, 10 insertions(+), 12 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/tty/tty_ioctl.c b/drivers/tty/tty_ioctl.c
index 7958bf6d27c4..ba60fcf518e0 100644
--- a/drivers/tty/tty_ioctl.c
+++ b/drivers/tty/tty_ioctl.c
@@ -124,17 +124,16 @@ EXPORT_SYMBOL(tty_unthrottle);
  *	conditions when throttling is conditional on factors evaluated prior to
  *	throttling.
  *
- *	Returns 0 if tty is throttled (or was already throttled)
+ *	Returns false if tty is throttled (or was already throttled)
  */
-
-int tty_throttle_safe(struct tty_struct *tty)
+bool tty_throttle_safe(struct tty_struct *tty)
 {
-	int ret = 0;
+	bool ret = false;
 
 	mutex_lock(&tty->throttle_mutex);
 	if (!tty_throttled(tty)) {
 		if (tty->flow_change != TTY_THROTTLE_SAFE)
-			ret = 1;
+			ret = true;
 		else {
 			set_bit(TTY_THROTTLED, &tty->flags);
 			if (tty->ops->throttle)
@@ -155,17 +154,16 @@ int tty_throttle_safe(struct tty_struct *tty)
  *	unthrottle due to race conditions when unthrottling is conditional
  *	on factors evaluated prior to unthrottling.
  *
- *	Returns 0 if tty is unthrottled (or was already unthrottled)
+ *	Returns false if tty is unthrottled (or was already unthrottled)
  */
-
-int tty_unthrottle_safe(struct tty_struct *tty)
+bool tty_unthrottle_safe(struct tty_struct *tty)
 {
-	int ret = 0;
+	bool ret = false;
 
 	mutex_lock(&tty->throttle_mutex);
 	if (tty_throttled(tty)) {
 		if (tty->flow_change != TTY_UNTHROTTLE_SAFE)
-			ret = 1;
+			ret = true;
 		else {
 			clear_bit(TTY_THROTTLED, &tty->flags);
 			if (tty->ops->unthrottle)
diff --git a/include/linux/tty.h b/include/linux/tty.h
index f002d0f25db7..59d675f345e9 100644
--- a/include/linux/tty.h
+++ b/include/linux/tty.h
@@ -416,8 +416,8 @@ unsigned int tty_chars_in_buffer(struct tty_struct *tty);
 unsigned int tty_write_room(struct tty_struct *tty);
 void tty_driver_flush_buffer(struct tty_struct *tty);
 void tty_unthrottle(struct tty_struct *tty);
-int tty_throttle_safe(struct tty_struct *tty);
-int tty_unthrottle_safe(struct tty_struct *tty);
+bool tty_throttle_safe(struct tty_struct *tty);
+bool tty_unthrottle_safe(struct tty_struct *tty);
 int tty_do_resize(struct tty_struct *tty, struct winsize *ws);
 int tty_get_icount(struct tty_struct *tty,
 		struct serial_icounter_struct *icount);
-- 
cgit v1.2.3


From 71067fb797e074c468221793fe93ba1b58350f1e Mon Sep 17 00:00:00 2001
From: "Jiri Slaby (SUSE)" <jirislaby@kernel.org>
Date: Tue, 19 Sep 2023 10:51:50 +0200
Subject: tty: fix kernel-doc for functions in tty.h

tty_kref_get() is already included in Documentation, but is not properly
formatted. Fix this.

tty_get_baud_rate() is neither properly formatted, nor is included. Fix
both.

Signed-off-by: "Jiri Slaby (SUSE)" <jirislaby@kernel.org>
Link: https://lore.kernel.org/r/20230919085156.1578-10-jirislaby@kernel.org
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 Documentation/driver-api/tty/tty_ioctl.rst |  3 +++
 include/linux/tty.h                        | 21 +++++++++------------
 2 files changed, 12 insertions(+), 12 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/driver-api/tty/tty_ioctl.rst b/Documentation/driver-api/tty/tty_ioctl.rst
index 9b0be79fc15e..3ff1ac5e07f1 100644
--- a/Documentation/driver-api/tty/tty_ioctl.rst
+++ b/Documentation/driver-api/tty/tty_ioctl.rst
@@ -5,3 +5,6 @@ TTY IOCTL Helpers
 =================
 
 .. kernel-doc:: drivers/tty/tty_ioctl.c
+
+.. kernel-doc:: include/linux/tty.h
+   :identifiers: tty_get_baud_rate
diff --git a/include/linux/tty.h b/include/linux/tty.h
index 59d675f345e9..4b6340ac2af2 100644
--- a/include/linux/tty.h
+++ b/include/linux/tty.h
@@ -390,14 +390,12 @@ int vcs_init(void);
 extern const struct class tty_class;
 
 /**
- *	tty_kref_get		-	get a tty reference
- *	@tty: tty device
+ * tty_kref_get - get a tty reference
+ * @tty: tty device
  *
- *	Return a new reference to a tty object. The caller must hold
- *	sufficient locks/counts to ensure that their existing reference cannot
- *	go away
+ * Returns: a new reference to a tty object. The caller must hold sufficient
+ * locks/counts to ensure that their existing reference cannot go away
  */
-
 static inline struct tty_struct *tty_kref_get(struct tty_struct *tty)
 {
 	if (tty)
@@ -435,14 +433,13 @@ void tty_encode_baud_rate(struct tty_struct *tty, speed_t ibaud,
 		speed_t obaud);
 
 /**
- *	tty_get_baud_rate	-	get tty bit rates
- *	@tty: tty to query
+ * tty_get_baud_rate - get tty bit rates
+ * @tty: tty to query
  *
- *	Returns the baud rate as an integer for this terminal. The
- *	termios lock must be held by the caller and the terminal bit
- *	flags may be updated.
+ * Returns: the baud rate as an integer for this terminal. The termios lock
+ * must be held by the caller and the terminal bit flags may be updated.
  *
- *	Locking: none
+ * Locking: none
  */
 static inline speed_t tty_get_baud_rate(struct tty_struct *tty)
 {
-- 
cgit v1.2.3


From 29bff582b74ed0bdb7e6986482ad9e6799ea4d2f Mon Sep 17 00:00:00 2001
From: Randy Dunlap <rdunlap@infradead.org>
Date: Tue, 26 Sep 2023 21:41:28 -0700
Subject: serial: core: fix kernel-doc for uart_port_unlock_irqrestore()

Fix the function name to avoid a kernel-doc warning:

include/linux/serial_core.h:666: warning: expecting prototype for uart_port_lock_irqrestore(). Prototype was for uart_port_unlock_irqrestore() instead

Fixes: b0af4bcb4946 ("serial: core: Provide port lock wrappers")
Signed-off-by: Randy Dunlap <rdunlap@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: John Ogness <john.ogness@linutronix.de>
Cc: linux-serial@vger.kernel.org
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: Jiri Slaby <jirislaby@kernel.org>
Reviewed-by: John Ogness <john.ogness@linutronix.de>
Link: https://lore.kernel.org/r/20230927044128.4748-1-rdunlap@infradead.org
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 include/linux/serial_core.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/serial_core.h b/include/linux/serial_core.h
index 3091c62ec37b..89f7b6c63598 100644
--- a/include/linux/serial_core.h
+++ b/include/linux/serial_core.h
@@ -658,7 +658,7 @@ static inline void uart_port_unlock_irq(struct uart_port *up)
 }
 
 /**
- * uart_port_lock_irqrestore - Unlock the UART port, restore interrupts
+ * uart_port_unlock_irqrestore - Unlock the UART port, restore interrupts
  * @up:		Pointer to UART port structure
  * @flags:	The saved interrupt flags for restore
  */
-- 
cgit v1.2.3


From 3cd39bc3b11b8d34b7d7c961a35fdfd18b0ebf75 Mon Sep 17 00:00:00 2001
From: Alejandro Colomar <alx@kernel.org>
Date: Tue, 3 Oct 2023 14:59:53 +0300
Subject: kernel.h: Move ARRAY_SIZE() to a separate header

Touching files so used for the kernel,
forces 'make' to recompile most of the kernel.

Having those definitions in more granular files
helps avoid recompiling so much of the kernel.

Signed-off-by: Alejandro Colomar <alx@kernel.org>
Reviewed-by: Giovanni Cabiddu <giovanni.cabiddu@intel.com>
Reviewed-by: Andy Shevchenko <andy.shevchenko@gmail.com>
Link: https://lore.kernel.org/r/20230817143352.132583-2-lucas.segarra.fernandez@intel.com
[andy: reduced to cover only string.h for now]
Signed-off-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
---
 include/linux/array_size.h | 13 +++++++++++++
 include/linux/kernel.h     |  7 +------
 include/linux/string.h     |  1 +
 3 files changed, 15 insertions(+), 6 deletions(-)
 create mode 100644 include/linux/array_size.h

(limited to 'include/linux')

diff --git a/include/linux/array_size.h b/include/linux/array_size.h
new file mode 100644
index 000000000000..06d7d83196ca
--- /dev/null
+++ b/include/linux/array_size.h
@@ -0,0 +1,13 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _LINUX_ARRAY_SIZE_H
+#define _LINUX_ARRAY_SIZE_H
+
+#include <linux/compiler.h>
+
+/**
+ * ARRAY_SIZE - get the number of elements in array @arr
+ * @arr: array to be sized
+ */
+#define ARRAY_SIZE(arr) (sizeof(arr) / sizeof((arr)[0]) + __must_be_array(arr))
+
+#endif  /* _LINUX_ARRAY_SIZE_H */
diff --git a/include/linux/kernel.h b/include/linux/kernel.h
index cee8fe87e9f4..d9ad21058eed 100644
--- a/include/linux/kernel.h
+++ b/include/linux/kernel.h
@@ -13,6 +13,7 @@
 
 #include <linux/stdarg.h>
 #include <linux/align.h>
+#include <linux/array_size.h>
 #include <linux/limits.h>
 #include <linux/linkage.h>
 #include <linux/stddef.h>
@@ -50,12 +51,6 @@
 #define READ			0
 #define WRITE			1
 
-/**
- * ARRAY_SIZE - get the number of elements in array @arr
- * @arr: array to be sized
- */
-#define ARRAY_SIZE(arr) (sizeof(arr) / sizeof((arr)[0]) + __must_be_array(arr))
-
 #define PTR_IF(cond, ptr)	((cond) ? (ptr) : NULL)
 
 #define u64_to_user_ptr(x) (		\
diff --git a/include/linux/string.h b/include/linux/string.h
index dbfc66400050..3c920b6d609b 100644
--- a/include/linux/string.h
+++ b/include/linux/string.h
@@ -2,6 +2,7 @@
 #ifndef _LINUX_STRING_H_
 #define _LINUX_STRING_H_
 
+#include <linux/array_size.h>
 #include <linux/compiler.h>	/* for inline */
 #include <linux/types.h>	/* for size_t */
 #include <linux/stddef.h>	/* for NULL */
-- 
cgit v1.2.3


From 82cc14c9930c7613da2fcb41a8d4f90c8b4cb048 Mon Sep 17 00:00:00 2001
From: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Date: Tue, 3 Oct 2023 15:10:11 +0300
Subject: pinctrl: Replace kernel.h by what is actually being used

The kernel.h is a mess of unrelated things and we only used it
as a proxy to array_size.h, hence switch from former to the latter.

While at it, group and sort the headers where it makes sense.

Signed-off-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
---
 drivers/pinctrl/core.c            |  2 +-
 drivers/pinctrl/pinconf-generic.c | 16 +++++++++-------
 drivers/pinctrl/pinconf.c         | 14 ++++++++------
 drivers/pinctrl/pinctrl-utils.c   |  6 ++++--
 drivers/pinctrl/pinmux.c          |  2 +-
 include/linux/pinctrl/machine.h   |  2 +-
 6 files changed, 24 insertions(+), 18 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/pinctrl/core.c b/drivers/pinctrl/core.c
index e9dc9638120a..afd0a1040329 100644
--- a/drivers/pinctrl/core.c
+++ b/drivers/pinctrl/core.c
@@ -12,12 +12,12 @@
  */
 #define pr_fmt(fmt) "pinctrl core: " fmt
 
+#include <linux/array_size.h>
 #include <linux/debugfs.h>
 #include <linux/device.h>
 #include <linux/err.h>
 #include <linux/export.h>
 #include <linux/init.h>
-#include <linux/kernel.h>
 #include <linux/kref.h>
 #include <linux/list.h>
 #include <linux/seq_file.h>
diff --git a/drivers/pinctrl/pinconf-generic.c b/drivers/pinctrl/pinconf-generic.c
index 365c4b0ca465..8313cb5f3b3c 100644
--- a/drivers/pinctrl/pinconf-generic.c
+++ b/drivers/pinctrl/pinconf-generic.c
@@ -10,17 +10,19 @@
 
 #define pr_fmt(fmt) "generic pinconfig core: " fmt
 
-#include <linux/kernel.h>
-#include <linux/module.h>
-#include <linux/init.h>
+#include <linux/array_size.h>
+#include <linux/debugfs.h>
 #include <linux/device.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/of.h>
 #include <linux/slab.h>
-#include <linux/debugfs.h>
 #include <linux/seq_file.h>
-#include <linux/pinctrl/pinctrl.h>
-#include <linux/pinctrl/pinconf.h>
+
 #include <linux/pinctrl/pinconf-generic.h>
-#include <linux/of.h>
+#include <linux/pinctrl/pinconf.h>
+#include <linux/pinctrl/pinctrl.h>
+
 #include "core.h"
 #include "pinconf.h"
 #include "pinctrl-utils.h"
diff --git a/drivers/pinctrl/pinconf.c b/drivers/pinctrl/pinconf.c
index d9d54065472e..96d853a8f339 100644
--- a/drivers/pinctrl/pinconf.c
+++ b/drivers/pinctrl/pinconf.c
@@ -9,16 +9,18 @@
  */
 #define pr_fmt(fmt) "pinconfig core: " fmt
 
-#include <linux/kernel.h>
-#include <linux/module.h>
-#include <linux/init.h>
-#include <linux/device.h>
-#include <linux/slab.h>
+#include <linux/array_size.h>
 #include <linux/debugfs.h>
+#include <linux/device.h>
+#include <linux/init.h>
+#include <linux/module.h>
 #include <linux/seq_file.h>
+#include <linux/slab.h>
+
 #include <linux/pinctrl/machine.h>
-#include <linux/pinctrl/pinctrl.h>
 #include <linux/pinctrl/pinconf.h>
+#include <linux/pinctrl/pinctrl.h>
+
 #include "core.h"
 #include "pinconf.h"
 
diff --git a/drivers/pinctrl/pinctrl-utils.c b/drivers/pinctrl/pinctrl-utils.c
index 3580e0fd94ed..40862f7bd6ca 100644
--- a/drivers/pinctrl/pinctrl-utils.c
+++ b/drivers/pinctrl/pinctrl-utils.c
@@ -6,12 +6,14 @@
  *
  * Author: Laxman Dewangan <ldewangan@nvidia.com>
  */
+#include <linux/array_size.h>
 #include <linux/device.h>
 #include <linux/export.h>
-#include <linux/kernel.h>
-#include <linux/pinctrl/pinctrl.h>
 #include <linux/of.h>
 #include <linux/slab.h>
+
+#include <linux/pinctrl/pinctrl.h>
+
 #include "core.h"
 #include "pinctrl-utils.h"
 
diff --git a/drivers/pinctrl/pinmux.c b/drivers/pinctrl/pinmux.c
index 2a180a5d64a4..3de81d310aee 100644
--- a/drivers/pinctrl/pinmux.c
+++ b/drivers/pinctrl/pinmux.c
@@ -12,12 +12,12 @@
  */
 #define pr_fmt(fmt) "pinmux core: " fmt
 
+#include <linux/array_size.h>
 #include <linux/ctype.h>
 #include <linux/debugfs.h>
 #include <linux/device.h>
 #include <linux/err.h>
 #include <linux/init.h>
-#include <linux/kernel.h>
 #include <linux/list.h>
 #include <linux/module.h>
 #include <linux/radix-tree.h>
diff --git a/include/linux/pinctrl/machine.h b/include/linux/pinctrl/machine.h
index 0639b36f43c5..ee8803f6ad07 100644
--- a/include/linux/pinctrl/machine.h
+++ b/include/linux/pinctrl/machine.h
@@ -11,7 +11,7 @@
 #ifndef __LINUX_PINCTRL_MACHINE_H
 #define __LINUX_PINCTRL_MACHINE_H
 
-#include <linux/kernel.h>	/* ARRAY_SIZE() */
+#include <linux/array_size.h>
 
 #include <linux/pinctrl/pinctrl-state.h>
 
-- 
cgit v1.2.3


From 75cec20345fa8e05a2b5f861fada95ad8e165257 Mon Sep 17 00:00:00 2001
From: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Date: Fri, 8 Sep 2023 16:32:15 +0200
Subject: bpf: Remove xdp_do_flush_map().
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

xdp_do_flush_map() can be removed because there is no more user in tree.

Remove xdp_do_flush_map().

Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Acked-by: Toke Høiland-Jørgensen <toke@redhat.com>
Acked-by: Jesper Dangaard Brouer <hawk@kernel.org>
Link: https://lore.kernel.org/r/20230908143215.869913-3-bigeasy@linutronix.de
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/linux/filter.h | 6 ------
 1 file changed, 6 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/filter.h b/include/linux/filter.h
index 27406aee2d40..e8822bd595f9 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -1025,12 +1025,6 @@ int xdp_do_redirect_frame(struct net_device *dev,
 			  struct bpf_prog *prog);
 void xdp_do_flush(void);
 
-/* The xdp_do_flush_map() helper has been renamed to drop the _map suffix, as
- * it is no longer only flushing maps. Keep this define for compatibility
- * until all drivers are updated - do not use xdp_do_flush_map() in new code!
- */
-#define xdp_do_flush_map xdp_do_flush
-
 void bpf_warn_invalid_xdp_action(struct net_device *dev, struct bpf_prog *prog, u32 act);
 
 #ifdef CONFIG_INET
-- 
cgit v1.2.3


From ccab211af3c2b90ed792eb5f33707d2f0d59fe50 Mon Sep 17 00:00:00 2001
From: Sohil Mehta <sohil.mehta@intel.com>
Date: Mon, 10 Jul 2023 18:51:24 +0000
Subject: syscalls: Cleanup references to sys_lookup_dcookie()

commit 'be65de6b03aa ("fs: Remove dcookies support")' removed the
syscall definition for lookup_dcookie.  However, syscall tables still
point to the old sys_lookup_dcookie() definition. Update syscall tables
of all architectures to directly point to sys_ni_syscall() instead.

Signed-off-by: Sohil Mehta <sohil.mehta@intel.com>
Reviewed-by: Randy Dunlap <rdunlap@infradead.org>
Acked-by: Namhyung Kim <namhyung@kernel.org> # for perf
Acked-by: Russell King (Oracle) <rmk+kernel@armlinux.org.uk>
Acked-by: Geert Uytterhoeven <geert@linux-m68k.org>
Signed-off-by: Arnd Bergmann <arnd@arndb.de>
---
 arch/alpha/kernel/syscalls/syscall.tbl              | 2 +-
 arch/arm/tools/syscall.tbl                          | 2 +-
 arch/arm64/include/asm/unistd32.h                   | 4 ++--
 arch/m68k/kernel/syscalls/syscall.tbl               | 2 +-
 arch/microblaze/kernel/syscalls/syscall.tbl         | 2 +-
 arch/mips/kernel/syscalls/syscall_n32.tbl           | 2 +-
 arch/mips/kernel/syscalls/syscall_n64.tbl           | 2 +-
 arch/mips/kernel/syscalls/syscall_o32.tbl           | 2 +-
 arch/parisc/kernel/syscalls/syscall.tbl             | 2 +-
 arch/powerpc/kernel/syscalls/syscall.tbl            | 2 +-
 arch/s390/kernel/syscalls/syscall.tbl               | 2 +-
 arch/sh/kernel/syscalls/syscall.tbl                 | 2 +-
 arch/sparc/kernel/syscalls/syscall.tbl              | 2 +-
 arch/x86/entry/syscalls/syscall_32.tbl              | 2 +-
 arch/x86/entry/syscalls/syscall_64.tbl              | 2 +-
 arch/xtensa/kernel/syscalls/syscall.tbl             | 2 +-
 include/linux/compat.h                              | 1 -
 include/linux/syscalls.h                            | 1 -
 include/uapi/asm-generic/unistd.h                   | 2 +-
 kernel/sys_ni.c                                     | 2 --
 tools/include/uapi/asm-generic/unistd.h             | 2 +-
 tools/perf/arch/mips/entry/syscalls/syscall_n64.tbl | 2 +-
 tools/perf/arch/powerpc/entry/syscalls/syscall.tbl  | 2 +-
 tools/perf/arch/s390/entry/syscalls/syscall.tbl     | 2 +-
 tools/perf/arch/x86/entry/syscalls/syscall_64.tbl   | 2 +-
 25 files changed, 23 insertions(+), 27 deletions(-)

(limited to 'include/linux')

diff --git a/arch/alpha/kernel/syscalls/syscall.tbl b/arch/alpha/kernel/syscalls/syscall.tbl
index ad37569d0507..26a32f8e47d5 100644
--- a/arch/alpha/kernel/syscalls/syscall.tbl
+++ b/arch/alpha/kernel/syscalls/syscall.tbl
@@ -334,7 +334,7 @@
 401	common	io_submit			sys_io_submit
 402	common	io_cancel			sys_io_cancel
 405	common	exit_group			sys_exit_group
-406	common	lookup_dcookie			sys_lookup_dcookie
+406	common	lookup_dcookie			sys_ni_syscall
 407	common	epoll_create			sys_epoll_create
 408	common	epoll_ctl			sys_epoll_ctl
 409	common	epoll_wait			sys_epoll_wait
diff --git a/arch/arm/tools/syscall.tbl b/arch/arm/tools/syscall.tbl
index c572d6c3dee0..04d8e491d8b4 100644
--- a/arch/arm/tools/syscall.tbl
+++ b/arch/arm/tools/syscall.tbl
@@ -263,7 +263,7 @@
 246	common	io_submit		sys_io_submit
 247	common	io_cancel		sys_io_cancel
 248	common	exit_group		sys_exit_group
-249	common	lookup_dcookie		sys_lookup_dcookie
+249	common	lookup_dcookie		sys_ni_syscall
 250	common	epoll_create		sys_epoll_create
 251	common	epoll_ctl		sys_epoll_ctl		sys_oabi_epoll_ctl
 252	common	epoll_wait		sys_epoll_wait
diff --git a/arch/arm64/include/asm/unistd32.h b/arch/arm64/include/asm/unistd32.h
index 78b68311ec81..9110be82dfaa 100644
--- a/arch/arm64/include/asm/unistd32.h
+++ b/arch/arm64/include/asm/unistd32.h
@@ -508,8 +508,8 @@ __SYSCALL(__NR_io_submit, compat_sys_io_submit)
 __SYSCALL(__NR_io_cancel, sys_io_cancel)
 #define __NR_exit_group 248
 __SYSCALL(__NR_exit_group, sys_exit_group)
-#define __NR_lookup_dcookie 249
-__SYSCALL(__NR_lookup_dcookie, compat_sys_lookup_dcookie)
+			/* 249 was lookup_dcookie */
+__SYSCALL(249, sys_ni_syscall)
 #define __NR_epoll_create 250
 __SYSCALL(__NR_epoll_create, sys_epoll_create)
 #define __NR_epoll_ctl 251
diff --git a/arch/m68k/kernel/syscalls/syscall.tbl b/arch/m68k/kernel/syscalls/syscall.tbl
index 259ceb125367..f9d1f2d3f067 100644
--- a/arch/m68k/kernel/syscalls/syscall.tbl
+++ b/arch/m68k/kernel/syscalls/syscall.tbl
@@ -255,7 +255,7 @@
 245	common	io_cancel			sys_io_cancel
 246	common	fadvise64			sys_fadvise64
 247	common	exit_group			sys_exit_group
-248	common	lookup_dcookie			sys_lookup_dcookie
+248	common	lookup_dcookie			sys_ni_syscall
 249	common	epoll_create			sys_epoll_create
 250	common	epoll_ctl			sys_epoll_ctl
 251	common	epoll_wait			sys_epoll_wait
diff --git a/arch/microblaze/kernel/syscalls/syscall.tbl b/arch/microblaze/kernel/syscalls/syscall.tbl
index a3798c2637fd..185fe73d9bbf 100644
--- a/arch/microblaze/kernel/syscalls/syscall.tbl
+++ b/arch/microblaze/kernel/syscalls/syscall.tbl
@@ -260,7 +260,7 @@
 250	common	fadvise64			sys_fadvise64
 # 251 is available for reuse (was briefly sys_set_zone_reclaim)
 252	common	exit_group			sys_exit_group
-253	common	lookup_dcookie			sys_lookup_dcookie
+253	common	lookup_dcookie			sys_ni_syscall
 254	common	epoll_create			sys_epoll_create
 255	common	epoll_ctl			sys_epoll_ctl
 256	common	epoll_wait			sys_epoll_wait
diff --git a/arch/mips/kernel/syscalls/syscall_n32.tbl b/arch/mips/kernel/syscalls/syscall_n32.tbl
index 152034b8e0a0..08f33e7c2896 100644
--- a/arch/mips/kernel/syscalls/syscall_n32.tbl
+++ b/arch/mips/kernel/syscalls/syscall_n32.tbl
@@ -214,7 +214,7 @@
 203	n32	io_submit			compat_sys_io_submit
 204	n32	io_cancel			sys_io_cancel
 205	n32	exit_group			sys_exit_group
-206	n32	lookup_dcookie			sys_lookup_dcookie
+206	n32	lookup_dcookie			sys_ni_syscall
 207	n32	epoll_create			sys_epoll_create
 208	n32	epoll_ctl			sys_epoll_ctl
 209	n32	epoll_wait			sys_epoll_wait
diff --git a/arch/mips/kernel/syscalls/syscall_n64.tbl b/arch/mips/kernel/syscalls/syscall_n64.tbl
index cb5e757f6621..80be0e98ea0c 100644
--- a/arch/mips/kernel/syscalls/syscall_n64.tbl
+++ b/arch/mips/kernel/syscalls/syscall_n64.tbl
@@ -214,7 +214,7 @@
 203	n64	io_submit			sys_io_submit
 204	n64	io_cancel			sys_io_cancel
 205	n64	exit_group			sys_exit_group
-206	n64	lookup_dcookie			sys_lookup_dcookie
+206	n64	lookup_dcookie			sys_ni_syscall
 207	n64	epoll_create			sys_epoll_create
 208	n64	epoll_ctl			sys_epoll_ctl
 209	n64	epoll_wait			sys_epoll_wait
diff --git a/arch/mips/kernel/syscalls/syscall_o32.tbl b/arch/mips/kernel/syscalls/syscall_o32.tbl
index 1a646813afdc..310c7e839b69 100644
--- a/arch/mips/kernel/syscalls/syscall_o32.tbl
+++ b/arch/mips/kernel/syscalls/syscall_o32.tbl
@@ -258,7 +258,7 @@
 244	o32	io_submit			sys_io_submit			compat_sys_io_submit
 245	o32	io_cancel			sys_io_cancel
 246	o32	exit_group			sys_exit_group
-247	o32	lookup_dcookie			sys_lookup_dcookie		compat_sys_lookup_dcookie
+247	o32	lookup_dcookie			sys_ni_syscall
 248	o32	epoll_create			sys_epoll_create
 249	o32	epoll_ctl			sys_epoll_ctl
 250	o32	epoll_wait			sys_epoll_wait
diff --git a/arch/parisc/kernel/syscalls/syscall.tbl b/arch/parisc/kernel/syscalls/syscall.tbl
index e97c175b56f9..5410ff9456ae 100644
--- a/arch/parisc/kernel/syscalls/syscall.tbl
+++ b/arch/parisc/kernel/syscalls/syscall.tbl
@@ -245,7 +245,7 @@
 # 220 was alloc_hugepages
 # 221 was free_hugepages
 222	common	exit_group		sys_exit_group
-223	common	lookup_dcookie		sys_lookup_dcookie		compat_sys_lookup_dcookie
+223	common	lookup_dcookie		sys_ni_syscall
 224	common	epoll_create		sys_epoll_create
 225	common	epoll_ctl		sys_epoll_ctl
 226	common	epoll_wait		sys_epoll_wait
diff --git a/arch/powerpc/kernel/syscalls/syscall.tbl b/arch/powerpc/kernel/syscalls/syscall.tbl
index 20e50586e8a2..e1412519b4ad 100644
--- a/arch/powerpc/kernel/syscalls/syscall.tbl
+++ b/arch/powerpc/kernel/syscalls/syscall.tbl
@@ -294,7 +294,7 @@
 233	32	fadvise64			sys_ppc32_fadvise64		compat_sys_ppc32_fadvise64
 233	64	fadvise64			sys_fadvise64
 234	nospu	exit_group			sys_exit_group
-235	nospu	lookup_dcookie			sys_lookup_dcookie		compat_sys_lookup_dcookie
+235	nospu	lookup_dcookie			sys_ni_syscall
 236	common	epoll_create			sys_epoll_create
 237	common	epoll_ctl			sys_epoll_ctl
 238	common	epoll_wait			sys_epoll_wait
diff --git a/arch/s390/kernel/syscalls/syscall.tbl b/arch/s390/kernel/syscalls/syscall.tbl
index 0122cc156952..cc0bc144b661 100644
--- a/arch/s390/kernel/syscalls/syscall.tbl
+++ b/arch/s390/kernel/syscalls/syscall.tbl
@@ -100,7 +100,7 @@
 106  common	stat			sys_newstat			compat_sys_newstat
 107  common	lstat			sys_newlstat			compat_sys_newlstat
 108  common	fstat			sys_newfstat			compat_sys_newfstat
-110  common	lookup_dcookie		sys_lookup_dcookie		compat_sys_lookup_dcookie
+110  common	lookup_dcookie		-				-
 111  common	vhangup			sys_vhangup			sys_vhangup
 112  common	idle			-				-
 114  common	wait4			sys_wait4			compat_sys_wait4
diff --git a/arch/sh/kernel/syscalls/syscall.tbl b/arch/sh/kernel/syscalls/syscall.tbl
index e90d585c4d3e..17ca58976849 100644
--- a/arch/sh/kernel/syscalls/syscall.tbl
+++ b/arch/sh/kernel/syscalls/syscall.tbl
@@ -260,7 +260,7 @@
 250	common	fadvise64			sys_fadvise64
 # 251 is unused
 252	common	exit_group			sys_exit_group
-253	common	lookup_dcookie			sys_lookup_dcookie
+253	common	lookup_dcookie			sys_ni_syscall
 254	common	epoll_create			sys_epoll_create
 255	common	epoll_ctl			sys_epoll_ctl
 256	common	epoll_wait			sys_epoll_wait
diff --git a/arch/sparc/kernel/syscalls/syscall.tbl b/arch/sparc/kernel/syscalls/syscall.tbl
index 4ed06c71c43f..3f72970cf983 100644
--- a/arch/sparc/kernel/syscalls/syscall.tbl
+++ b/arch/sparc/kernel/syscalls/syscall.tbl
@@ -249,7 +249,7 @@
 205	common	readahead		sys_readahead			compat_sys_readahead
 206	common	socketcall		sys_socketcall			sys32_socketcall
 207	common	syslog			sys_syslog
-208	common	lookup_dcookie		sys_lookup_dcookie		compat_sys_lookup_dcookie
+208	common	lookup_dcookie		sys_ni_syscall
 209	common	fadvise64		sys_fadvise64			compat_sys_fadvise64
 210	common	fadvise64_64		sys_fadvise64_64		compat_sys_fadvise64_64
 211	common	tgkill			sys_tgkill
diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl
index 2d0b1bd866ea..6d0286bbbe27 100644
--- a/arch/x86/entry/syscalls/syscall_32.tbl
+++ b/arch/x86/entry/syscalls/syscall_32.tbl
@@ -264,7 +264,7 @@
 250	i386	fadvise64		sys_ia32_fadvise64
 # 251 is available for reuse (was briefly sys_set_zone_reclaim)
 252	i386	exit_group		sys_exit_group
-253	i386	lookup_dcookie		sys_lookup_dcookie		compat_sys_lookup_dcookie
+253	i386	lookup_dcookie
 254	i386	epoll_create		sys_epoll_create
 255	i386	epoll_ctl		sys_epoll_ctl
 256	i386	epoll_wait		sys_epoll_wait
diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl
index 1d6eee30eceb..2a62eaf30d69 100644
--- a/arch/x86/entry/syscalls/syscall_64.tbl
+++ b/arch/x86/entry/syscalls/syscall_64.tbl
@@ -220,7 +220,7 @@
 209	64	io_submit		sys_io_submit
 210	common	io_cancel		sys_io_cancel
 211	64	get_thread_area
-212	common	lookup_dcookie		sys_lookup_dcookie
+212	common	lookup_dcookie
 213	common	epoll_create		sys_epoll_create
 214	64	epoll_ctl_old
 215	64	epoll_wait_old
diff --git a/arch/xtensa/kernel/syscalls/syscall.tbl b/arch/xtensa/kernel/syscalls/syscall.tbl
index fc1a4f3c81d9..351521b2e841 100644
--- a/arch/xtensa/kernel/syscalls/syscall.tbl
+++ b/arch/xtensa/kernel/syscalls/syscall.tbl
@@ -273,7 +273,7 @@
 252	common	timer_getoverrun		sys_timer_getoverrun
 # System
 253	common	reserved253			sys_ni_syscall
-254	common	lookup_dcookie			sys_lookup_dcookie
+254	common	lookup_dcookie			sys_ni_syscall
 255	common	available255			sys_ni_syscall
 256	common	add_key				sys_add_key
 257	common	request_key			sys_request_key
diff --git a/include/linux/compat.h b/include/linux/compat.h
index 1cfa4f0f490a..233f61ec8afc 100644
--- a/include/linux/compat.h
+++ b/include/linux/compat.h
@@ -581,7 +581,6 @@ asmlinkage long compat_sys_io_pgetevents_time64(compat_aio_context_t ctx_id,
 					struct io_event __user *events,
 					struct __kernel_timespec __user *timeout,
 					const struct __compat_aio_sigset __user *usig);
-asmlinkage long compat_sys_lookup_dcookie(u32, u32, char __user *, compat_size_t);
 asmlinkage long compat_sys_epoll_pwait(int epfd,
 			struct epoll_event __user *events,
 			int maxevents, int timeout,
diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index 22bc6bc147f8..a031613bf966 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -355,7 +355,6 @@ asmlinkage long sys_lremovexattr(const char __user *path,
 				 const char __user *name);
 asmlinkage long sys_fremovexattr(int fd, const char __user *name);
 asmlinkage long sys_getcwd(char __user *buf, unsigned long size);
-asmlinkage long sys_lookup_dcookie(u64 cookie64, char __user *buf, size_t len);
 asmlinkage long sys_eventfd2(unsigned int count, int flags);
 asmlinkage long sys_epoll_create1(int flags);
 asmlinkage long sys_epoll_ctl(int epfd, int op, int fd,
diff --git a/include/uapi/asm-generic/unistd.h b/include/uapi/asm-generic/unistd.h
index abe087c53b4b..76d946445391 100644
--- a/include/uapi/asm-generic/unistd.h
+++ b/include/uapi/asm-generic/unistd.h
@@ -71,7 +71,7 @@ __SYSCALL(__NR_fremovexattr, sys_fremovexattr)
 #define __NR_getcwd 17
 __SYSCALL(__NR_getcwd, sys_getcwd)
 #define __NR_lookup_dcookie 18
-__SC_COMP(__NR_lookup_dcookie, sys_lookup_dcookie, compat_sys_lookup_dcookie)
+__SYSCALL(__NR_lookup_dcookie, sys_ni_syscall)
 #define __NR_eventfd2 19
 __SYSCALL(__NR_eventfd2, sys_eventfd2)
 #define __NR_epoll_create1 20
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index e137c1385c56..d6eaaaf9cf77 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -51,8 +51,6 @@ COND_SYSCALL_COMPAT(io_pgetevents);
 COND_SYSCALL(io_uring_setup);
 COND_SYSCALL(io_uring_enter);
 COND_SYSCALL(io_uring_register);
-COND_SYSCALL(lookup_dcookie);
-COND_SYSCALL_COMPAT(lookup_dcookie);
 COND_SYSCALL(eventfd2);
 COND_SYSCALL(epoll_create1);
 COND_SYSCALL(epoll_ctl);
diff --git a/tools/include/uapi/asm-generic/unistd.h b/tools/include/uapi/asm-generic/unistd.h
index fd6c1cb585db..7ea3875137e9 100644
--- a/tools/include/uapi/asm-generic/unistd.h
+++ b/tools/include/uapi/asm-generic/unistd.h
@@ -71,7 +71,7 @@ __SYSCALL(__NR_fremovexattr, sys_fremovexattr)
 #define __NR_getcwd 17
 __SYSCALL(__NR_getcwd, sys_getcwd)
 #define __NR_lookup_dcookie 18
-__SC_COMP(__NR_lookup_dcookie, sys_lookup_dcookie, compat_sys_lookup_dcookie)
+__SYSCALL(__NR_lookup_dcookie, sys_ni_syscall)
 #define __NR_eventfd2 19
 __SYSCALL(__NR_eventfd2, sys_eventfd2)
 #define __NR_epoll_create1 20
diff --git a/tools/perf/arch/mips/entry/syscalls/syscall_n64.tbl b/tools/perf/arch/mips/entry/syscalls/syscall_n64.tbl
index cfda2511badf..478fe63601fc 100644
--- a/tools/perf/arch/mips/entry/syscalls/syscall_n64.tbl
+++ b/tools/perf/arch/mips/entry/syscalls/syscall_n64.tbl
@@ -214,7 +214,7 @@
 203	n64	io_submit			sys_io_submit
 204	n64	io_cancel			sys_io_cancel
 205	n64	exit_group			sys_exit_group
-206	n64	lookup_dcookie			sys_lookup_dcookie
+206	n64	lookup_dcookie			sys_ni_syscall
 207	n64	epoll_create			sys_epoll_create
 208	n64	epoll_ctl			sys_epoll_ctl
 209	n64	epoll_wait			sys_epoll_wait
diff --git a/tools/perf/arch/powerpc/entry/syscalls/syscall.tbl b/tools/perf/arch/powerpc/entry/syscalls/syscall.tbl
index 8c0b08b7a80e..1b7777e5f9ff 100644
--- a/tools/perf/arch/powerpc/entry/syscalls/syscall.tbl
+++ b/tools/perf/arch/powerpc/entry/syscalls/syscall.tbl
@@ -294,7 +294,7 @@
 233	32	fadvise64			sys_ppc32_fadvise64		compat_sys_ppc32_fadvise64
 233	64	fadvise64			sys_fadvise64
 234	nospu	exit_group			sys_exit_group
-235	nospu	lookup_dcookie			sys_lookup_dcookie		compat_sys_lookup_dcookie
+235	nospu	lookup_dcookie			sys_ni_syscall
 236	common	epoll_create			sys_epoll_create
 237	common	epoll_ctl			sys_epoll_ctl
 238	common	epoll_wait			sys_epoll_wait
diff --git a/tools/perf/arch/s390/entry/syscalls/syscall.tbl b/tools/perf/arch/s390/entry/syscalls/syscall.tbl
index a6935af2235c..11782be77f57 100644
--- a/tools/perf/arch/s390/entry/syscalls/syscall.tbl
+++ b/tools/perf/arch/s390/entry/syscalls/syscall.tbl
@@ -100,7 +100,7 @@
 106  common	stat			sys_newstat			compat_sys_newstat
 107  common	lstat			sys_newlstat			compat_sys_newlstat
 108  common	fstat			sys_newfstat			compat_sys_newfstat
-110  common	lookup_dcookie		sys_lookup_dcookie		compat_sys_lookup_dcookie
+110  common	lookup_dcookie		-				-
 111  common	vhangup			sys_vhangup			sys_vhangup
 112  common	idle			-				-
 114  common	wait4			sys_wait4			compat_sys_wait4
diff --git a/tools/perf/arch/x86/entry/syscalls/syscall_64.tbl b/tools/perf/arch/x86/entry/syscalls/syscall_64.tbl
index 227538b0ce80..27f78821453b 100644
--- a/tools/perf/arch/x86/entry/syscalls/syscall_64.tbl
+++ b/tools/perf/arch/x86/entry/syscalls/syscall_64.tbl
@@ -220,7 +220,7 @@
 209	64	io_submit		sys_io_submit
 210	common	io_cancel		sys_io_cancel
 211	64	get_thread_area
-212	common	lookup_dcookie		sys_lookup_dcookie
+212	common	lookup_dcookie
 213	common	epoll_create		sys_epoll_create
 214	64	epoll_ctl_old
 215	64	epoll_wait_old
-- 
cgit v1.2.3


From 26dd68d293fd1c5ac966fb5dd5f6d89de322a541 Mon Sep 17 00:00:00 2001
From: Przemek Kitszel <przemyslaw.kitszel@intel.com>
Date: Tue, 12 Sep 2023 07:59:31 -0400
Subject: overflow: add DEFINE_FLEX() for on-stack allocs

Add DEFINE_FLEX() macro for on-stack allocations of structs with
flexible array member.

Expose __struct_size() macro outside of fortify-string.h, as it could be
used to read size of structs allocated by DEFINE_FLEX().
Move __member_size() alongside it.
-Kees

Using underlying array for on-stack storage lets us to declare
known-at-compile-time structures without kzalloc().

Actual usage for ice driver is in following patches of the series.

Missing __has_builtin() workaround is moved up to serve also assembly
compilation with m68k-linux-gcc, see [1].
Error was (note the .S file extension):
In file included from ../include/linux/linkage.h:5,
                 from ../arch/m68k/fpsp040/skeleton.S:40:
../include/linux/compiler_types.h:331:5: warning: "__has_builtin" is not defined, evaluates to 0 [-Wundef]
  331 | #if __has_builtin(__builtin_dynamic_object_size)
      |     ^~~~~~~~~~~~~
../include/linux/compiler_types.h:331:18: error: missing binary operator before token "("
  331 | #if __has_builtin(__builtin_dynamic_object_size)
      |                  ^

[1] https://lore.kernel.org/netdev/202308112122.OuF0YZqL-lkp@intel.com/
Co-developed-by: Kees Cook <keescook@chromium.org>
Signed-off-by: Kees Cook <keescook@chromium.org>
Signed-off-by: Przemek Kitszel <przemyslaw.kitszel@intel.com>
Link: https://lore.kernel.org/r/20230912115937.1645707-2-przemyslaw.kitszel@intel.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/linux/compiler_types.h | 32 +++++++++++++++++++++-----------
 include/linux/fortify-string.h |  4 ----
 include/linux/overflow.h       | 35 +++++++++++++++++++++++++++++++++++
 3 files changed, 56 insertions(+), 15 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/compiler_types.h b/include/linux/compiler_types.h
index c523c6683789..6f1ca49306d2 100644
--- a/include/linux/compiler_types.h
+++ b/include/linux/compiler_types.h
@@ -2,6 +2,15 @@
 #ifndef __LINUX_COMPILER_TYPES_H
 #define __LINUX_COMPILER_TYPES_H
 
+/*
+ * __has_builtin is supported on gcc >= 10, clang >= 3 and icc >= 21.
+ * In the meantime, to support gcc < 10, we implement __has_builtin
+ * by hand.
+ */
+#ifndef __has_builtin
+#define __has_builtin(x) (0)
+#endif
+
 #ifndef __ASSEMBLY__
 
 /*
@@ -134,17 +143,6 @@ static inline void __chk_io_ptr(const volatile void __iomem *ptr) { }
 # define __preserve_most
 #endif
 
-/* Builtins */
-
-/*
- * __has_builtin is supported on gcc >= 10, clang >= 3 and icc >= 21.
- * In the meantime, to support gcc < 10, we implement __has_builtin
- * by hand.
- */
-#ifndef __has_builtin
-#define __has_builtin(x) (0)
-#endif
-
 /* Compiler specific macros. */
 #ifdef __clang__
 #include <linux/compiler-clang.h>
@@ -352,6 +350,18 @@ struct ftrace_likely_data {
 # define __realloc_size(x, ...)
 #endif
 
+/*
+ * When the size of an allocated object is needed, use the best available
+ * mechanism to find it. (For cases where sizeof() cannot be used.)
+ */
+#if __has_builtin(__builtin_dynamic_object_size)
+#define __struct_size(p)	__builtin_dynamic_object_size(p, 0)
+#define __member_size(p)	__builtin_dynamic_object_size(p, 1)
+#else
+#define __struct_size(p)	__builtin_object_size(p, 0)
+#define __member_size(p)	__builtin_object_size(p, 1)
+#endif
+
 #ifndef asm_volatile_goto
 #define asm_volatile_goto(x...) asm goto(x)
 #endif
diff --git a/include/linux/fortify-string.h b/include/linux/fortify-string.h
index da51a83b2829..1e7711185ec6 100644
--- a/include/linux/fortify-string.h
+++ b/include/linux/fortify-string.h
@@ -93,13 +93,9 @@ extern char *__underlying_strncpy(char *p, const char *q, __kernel_size_t size)
 #if __has_builtin(__builtin_dynamic_object_size)
 #define POS			__pass_dynamic_object_size(1)
 #define POS0			__pass_dynamic_object_size(0)
-#define __struct_size(p)	__builtin_dynamic_object_size(p, 0)
-#define __member_size(p)	__builtin_dynamic_object_size(p, 1)
 #else
 #define POS			__pass_object_size(1)
 #define POS0			__pass_object_size(0)
-#define __struct_size(p)	__builtin_object_size(p, 0)
-#define __member_size(p)	__builtin_object_size(p, 1)
 #endif
 
 #define __compiletime_lessthan(bounds, length)	(	\
diff --git a/include/linux/overflow.h b/include/linux/overflow.h
index f9b60313eaea..7b5cf4a5cd19 100644
--- a/include/linux/overflow.h
+++ b/include/linux/overflow.h
@@ -309,4 +309,39 @@ static inline size_t __must_check size_sub(size_t minuend, size_t subtrahend)
 #define struct_size_t(type, member, count)					\
 	struct_size((type *)NULL, member, count)
 
+/**
+ * _DEFINE_FLEX() - helper macro for DEFINE_FLEX() family.
+ * Enables caller macro to pass (different) initializer.
+ *
+ * @type: structure type name, including "struct" keyword.
+ * @name: Name for a variable to define.
+ * @member: Name of the array member.
+ * @count: Number of elements in the array; must be compile-time const.
+ * @initializer: initializer expression (could be empty for no init).
+ */
+#define _DEFINE_FLEX(type, name, member, count, initializer)			\
+	_Static_assert(__builtin_constant_p(count),				\
+		       "onstack flex array members require compile-time const count"); \
+	union {									\
+		u8 bytes[struct_size_t(type, member, count)];			\
+		type obj;							\
+	} name##_u initializer;							\
+	type *name = (type *)&name##_u
+
+/**
+ * DEFINE_FLEX() - Define an on-stack instance of structure with a trailing
+ * flexible array member.
+ *
+ * @type: structure type name, including "struct" keyword.
+ * @name: Name for a variable to define.
+ * @member: Name of the array member.
+ * @count: Number of elements in the array; must be compile-time const.
+ *
+ * Define a zeroed, on-stack, instance of @type structure with a trailing
+ * flexible array member.
+ * Use __struct_size(@name) to get compile-time size of it afterwards.
+ */
+#define DEFINE_FLEX(type, name, member, count)			\
+	_DEFINE_FLEX(type, name, member, count, = {})
+
 #endif /* __LINUX_OVERFLOW_H */
-- 
cgit v1.2.3


From eeb6d1d6f4ec0e304608f72c3ead584bbb155714 Mon Sep 17 00:00:00 2001
From: GuoHua Cheng <chenguohua@jari.cn>
Date: Wed, 27 Sep 2023 13:56:08 +0800
Subject: PNP: Clean up coding style in pnp.h

Address the following checkpatch complaints:

ERROR: "foo * bar" should be "foo *bar"
ERROR: space required after that ';' (ctx:VxV)

Signed-off-by: GuoHua Cheng <chenguohua@jari.cn>
[ rjw: Subject and changelog edits ]
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 include/linux/pnp.h | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/pnp.h b/include/linux/pnp.h
index c2a7cfbca713..267fb8a4fb6e 100644
--- a/include/linux/pnp.h
+++ b/include/linux/pnp.h
@@ -291,7 +291,7 @@ static inline void pnp_set_drvdata(struct pnp_dev *pdev, void *data)
 
 struct pnp_fixup {
 	char id[7];
-	void (*quirk_function) (struct pnp_dev * dev);	/* fixup function */
+	void (*quirk_function) (struct pnp_dev *dev);	/* fixup function */
 };
 
 /* config parameters */
@@ -419,8 +419,8 @@ struct pnp_protocol {
 
 	/* protocol specific suspend/resume */
 	bool (*can_wakeup) (struct pnp_dev *dev);
-	int (*suspend) (struct pnp_dev * dev, pm_message_t state);
-	int (*resume) (struct pnp_dev * dev);
+	int (*suspend) (struct pnp_dev *dev, pm_message_t state);
+	int (*resume) (struct pnp_dev *dev);
 
 	/* used by pnp layer only (look but don't touch) */
 	unsigned char number;	/* protocol number */
@@ -492,7 +492,7 @@ static inline int pnp_start_dev(struct pnp_dev *dev) { return -ENODEV; }
 static inline int pnp_stop_dev(struct pnp_dev *dev) { return -ENODEV; }
 static inline int pnp_activate_dev(struct pnp_dev *dev) { return -ENODEV; }
 static inline int pnp_disable_dev(struct pnp_dev *dev) { return -ENODEV; }
-static inline int pnp_range_reserved(resource_size_t start, resource_size_t end) { return 0;}
+static inline int pnp_range_reserved(resource_size_t start, resource_size_t end) { return 0; }
 
 /* protocol helpers */
 static inline int pnp_is_active(struct pnp_dev *dev) { return 0; }
-- 
cgit v1.2.3


From 473267a4911f2469722c74ca58087d951072f72a Mon Sep 17 00:00:00 2001
From: Patrick Rohr <prohr@google.com>
Date: Mon, 25 Sep 2023 14:47:11 -0700
Subject: net: add sysctl to disable rfc4862 5.5.3e lifetime handling
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This change adds a sysctl to opt-out of RFC4862 section 5.5.3e's valid
lifetime derivation mechanism.

RFC4862 section 5.5.3e prescribes that the valid lifetime in a Router
Advertisement PIO shall be ignored if it less than 2 hours and to reset
the lifetime of the corresponding address to 2 hours. An in-progress
6man draft (see draft-ietf-6man-slaac-renum-07 section 4.2) is currently
looking to remove this mechanism. While this draft has not been moving
particularly quickly for other reasons, there is widespread consensus on
section 4.2 which updates RFC4862 section 5.5.3e.

Cc: Maciej Żenczykowski <maze@google.com>
Cc: Lorenzo Colitti <lorenzo@google.com>
Cc: Jen Linkova <furry@google.com>
Signed-off-by: Patrick Rohr <prohr@google.com>
Reviewed-by: Jiri Pirko <jiri@nvidia.com>
Reviewed-by: David Ahern <dsahern@kernel.org>
Link: https://lore.kernel.org/r/20230925214711.959704-1-prohr@google.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 Documentation/networking/ip-sysctl.rst | 11 ++++++++++
 include/linux/ipv6.h                   |  1 +
 net/ipv6/addrconf.c                    | 38 ++++++++++++++++++++++------------
 3 files changed, 37 insertions(+), 13 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/networking/ip-sysctl.rst b/Documentation/networking/ip-sysctl.rst
index 5bfa1837968c..f7dfde3b09a9 100644
--- a/Documentation/networking/ip-sysctl.rst
+++ b/Documentation/networking/ip-sysctl.rst
@@ -2311,6 +2311,17 @@ accept_ra_pinfo - BOOLEAN
 		- enabled if accept_ra is enabled.
 		- disabled if accept_ra is disabled.
 
+ra_honor_pio_life - BOOLEAN
+	Whether to use RFC4862 Section 5.5.3e to determine the valid
+	lifetime of an address matching a prefix sent in a Router
+	Advertisement Prefix Information Option.
+
+	- If enabled, the PIO valid lifetime will always be honored.
+	- If disabled, RFC4862 section 5.5.3e is used to determine
+	  the valid lifetime of the address.
+
+	Default: 0 (disabled)
+
 accept_ra_rt_info_min_plen - INTEGER
 	Minimum prefix length of Route Information in RA.
 
diff --git a/include/linux/ipv6.h b/include/linux/ipv6.h
index e400ff757f13..5e605e384aac 100644
--- a/include/linux/ipv6.h
+++ b/include/linux/ipv6.h
@@ -82,6 +82,7 @@ struct ipv6_devconf {
 	__u32		ioam6_id_wide;
 	__u8		ioam6_enabled;
 	__u8		ndisc_evict_nocarrier;
+	__u8		ra_honor_pio_life;
 
 	struct ctl_table_header *sysctl_header;
 };
diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
index 0b6ee962c84e..c2d471ad7922 100644
--- a/net/ipv6/addrconf.c
+++ b/net/ipv6/addrconf.c
@@ -236,6 +236,7 @@ static struct ipv6_devconf ipv6_devconf __read_mostly = {
 	.ioam6_id               = IOAM6_DEFAULT_IF_ID,
 	.ioam6_id_wide		= IOAM6_DEFAULT_IF_ID_WIDE,
 	.ndisc_evict_nocarrier	= 1,
+	.ra_honor_pio_life	= 0,
 };
 
 static struct ipv6_devconf ipv6_devconf_dflt __read_mostly = {
@@ -297,6 +298,7 @@ static struct ipv6_devconf ipv6_devconf_dflt __read_mostly = {
 	.ioam6_id               = IOAM6_DEFAULT_IF_ID,
 	.ioam6_id_wide		= IOAM6_DEFAULT_IF_ID_WIDE,
 	.ndisc_evict_nocarrier	= 1,
+	.ra_honor_pio_life	= 0,
 };
 
 /* Check if link is ready: is it up and is a valid qdisc available */
@@ -2657,22 +2659,23 @@ int addrconf_prefix_rcv_add_addr(struct net *net, struct net_device *dev,
 			stored_lft = ifp->valid_lft - (now - ifp->tstamp) / HZ;
 		else
 			stored_lft = 0;
-		if (!create && stored_lft) {
+
+		/* RFC4862 Section 5.5.3e:
+		 * "Note that the preferred lifetime of the
+		 *  corresponding address is always reset to
+		 *  the Preferred Lifetime in the received
+		 *  Prefix Information option, regardless of
+		 *  whether the valid lifetime is also reset or
+		 *  ignored."
+		 *
+		 * So we should always update prefered_lft here.
+		 */
+		update_lft = !create && stored_lft;
+
+		if (update_lft && !in6_dev->cnf.ra_honor_pio_life) {
 			const u32 minimum_lft = min_t(u32,
 				stored_lft, MIN_VALID_LIFETIME);
 			valid_lft = max(valid_lft, minimum_lft);
-
-			/* RFC4862 Section 5.5.3e:
-			 * "Note that the preferred lifetime of the
-			 *  corresponding address is always reset to
-			 *  the Preferred Lifetime in the received
-			 *  Prefix Information option, regardless of
-			 *  whether the valid lifetime is also reset or
-			 *  ignored."
-			 *
-			 * So we should always update prefered_lft here.
-			 */
-			update_lft = 1;
 		}
 
 		if (update_lft) {
@@ -6846,6 +6849,15 @@ static const struct ctl_table addrconf_sysctl[] = {
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec,
 	},
+	{
+		.procname	= "ra_honor_pio_life",
+		.data		= &ipv6_devconf.ra_honor_pio_life,
+		.maxlen		= sizeof(u8),
+		.mode		= 0644,
+		.proc_handler	= proc_dou8vec_minmax,
+		.extra1		= SYSCTL_ZERO,
+		.extra2		= SYSCTL_ONE,
+	},
 #ifdef CONFIG_IPV6_ROUTER_PREF
 	{
 		.procname	= "accept_ra_rtr_pref",
-- 
cgit v1.2.3


From 2632bb84d1d53cfd6cf65261064273ded4f759d5 Mon Sep 17 00:00:00 2001
From: Kees Cook <keescook@chromium.org>
Date: Thu, 28 Sep 2023 20:24:34 -0700
Subject: mm: Remove unused vm_brk()

With fs/binfmt_elf.c fully refactored to use the new elf_load() helper,
there are no more users of vm_brk(), so remove it.

Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: linux-mm@kvack.org
Suggested-by: Eric Biederman <ebiederm@xmission.com>
Tested-by: Pedro Falcato <pedro.falcato@gmail.com>
Signed-off-by: Sebastian Ott <sebott@redhat.com>
Link: https://lore.kernel.org/r/20230929032435.2391507-6-keescook@chromium.org
Signed-off-by: Kees Cook <keescook@chromium.org>
---
 include/linux/mm.h | 3 +--
 mm/mmap.c          | 6 ------
 mm/nommu.c         | 5 -----
 3 files changed, 1 insertion(+), 13 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mm.h b/include/linux/mm.h
index bf5d0b1b16f4..216dd0c6dcf8 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -3308,8 +3308,7 @@ static inline void mm_populate(unsigned long addr, unsigned long len)
 static inline void mm_populate(unsigned long addr, unsigned long len) {}
 #endif
 
-/* These take the mm semaphore themselves */
-extern int __must_check vm_brk(unsigned long, unsigned long);
+/* This takes the mm semaphore itself */
 extern int __must_check vm_brk_flags(unsigned long, unsigned long, unsigned long);
 extern int vm_munmap(unsigned long, size_t);
 extern unsigned long __must_check vm_mmap(struct file *, unsigned long,
diff --git a/mm/mmap.c b/mm/mmap.c
index b56a7f0c9f85..34d2337ace59 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -3174,12 +3174,6 @@ limits_failed:
 }
 EXPORT_SYMBOL(vm_brk_flags);
 
-int vm_brk(unsigned long addr, unsigned long len)
-{
-	return vm_brk_flags(addr, len, 0);
-}
-EXPORT_SYMBOL(vm_brk);
-
 /* Release all mmaps. */
 void exit_mmap(struct mm_struct *mm)
 {
diff --git a/mm/nommu.c b/mm/nommu.c
index 7f9e9e5a0e12..23c43c208f2b 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -1531,11 +1531,6 @@ void exit_mmap(struct mm_struct *mm)
 	mmap_write_unlock(mm);
 }
 
-int vm_brk(unsigned long addr, unsigned long len)
-{
-	return -ENOMEM;
-}
-
 /*
  * expand (or shrink) an existing mapping, potentially moving it at the same
  * time (controlled by the MREMAP_MAYMOVE flag and available VM space)
-- 
cgit v1.2.3


From 2f3dd39e2b492bec366487a2c9bcbdbd7792f77c Mon Sep 17 00:00:00 2001
From: Stephen Boyd <swboyd@chromium.org>
Date: Mon, 2 Oct 2023 17:34:28 -0700
Subject: platform/chrome: cros_ec_proto: Mark outdata as const

The 'outdata' is copied to the data buffer in cros_ec_cmd() before being
sent over to the EC. Mark the argument as const so that callers can pass
const pointers to this function and so that callers know the data won't
be modified.

Cc: Prashant Malani <pmalani@chromium.org>
Signed-off-by: Stephen Boyd <swboyd@chromium.org>
Acked-by: Prashant Malani <pmalani@chromium.org>
Link: https://lore.kernel.org/r/20231003003429.1378109-5-swboyd@chromium.org
Signed-off-by: Tzung-Bi Shih <tzungbi@kernel.org>
---
 drivers/platform/chrome/cros_ec_proto.c     | 2 +-
 include/linux/platform_data/cros_ec_proto.h | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/platform/chrome/cros_ec_proto.c b/drivers/platform/chrome/cros_ec_proto.c
index 475a6dd72db6..945b1b15a04c 100644
--- a/drivers/platform/chrome/cros_ec_proto.c
+++ b/drivers/platform/chrome/cros_ec_proto.c
@@ -1004,7 +1004,7 @@ EXPORT_SYMBOL_GPL(cros_ec_get_sensor_count);
 int cros_ec_cmd(struct cros_ec_device *ec_dev,
 		unsigned int version,
 		int command,
-		void *outdata,
+		const void *outdata,
 		size_t outsize,
 		void *indata,
 		size_t insize)
diff --git a/include/linux/platform_data/cros_ec_proto.h b/include/linux/platform_data/cros_ec_proto.h
index 4f9f756bc17c..8865e350c12a 100644
--- a/include/linux/platform_data/cros_ec_proto.h
+++ b/include/linux/platform_data/cros_ec_proto.h
@@ -258,7 +258,7 @@ bool cros_ec_check_features(struct cros_ec_dev *ec, int feature);
 
 int cros_ec_get_sensor_count(struct cros_ec_dev *ec);
 
-int cros_ec_cmd(struct cros_ec_device *ec_dev, unsigned int version, int command, void *outdata,
+int cros_ec_cmd(struct cros_ec_device *ec_dev, unsigned int version, int command, const void *outdata,
 		    size_t outsize, void *indata, size_t insize);
 
 /**
-- 
cgit v1.2.3


From 8874e414fe78718d0f2861fe511cecbd1cd73f4d Mon Sep 17 00:00:00 2001
From: Srinivas Pandruvada <srinivas.pandruvada@linux.intel.com>
Date: Tue, 3 Oct 2023 11:49:14 -0700
Subject: platform/x86/intel/tpmi: Add defines to get version information
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Add defines to get major and minor version from a TPMI version field
value. This will avoid code duplication to convert in every feature
driver. Also add define for invalid version field.

Signed-off-by: Srinivas Pandruvada <srinivas.pandruvada@linux.intel.com>
Link: https://lore.kernel.org/r/20231003184916.1860084-2-srinivas.pandruvada@linux.intel.com
Reviewed-by: Ilpo Järvinen <ilpo.jarvinen@linux.intel.com>
Signed-off-by: Ilpo Järvinen <ilpo.jarvinen@linux.intel.com>
---
 include/linux/intel_tpmi.h | 6 ++++++
 1 file changed, 6 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/intel_tpmi.h b/include/linux/intel_tpmi.h
index 04d937ad4dc4..ee07393445f9 100644
--- a/include/linux/intel_tpmi.h
+++ b/include/linux/intel_tpmi.h
@@ -6,6 +6,12 @@
 #ifndef _INTEL_TPMI_H_
 #define _INTEL_TPMI_H_
 
+#include <linux/bitfield.h>
+
+#define TPMI_VERSION_INVALID	0xff
+#define TPMI_MINOR_VERSION(val)	FIELD_GET(GENMASK(4, 0), val)
+#define TPMI_MAJOR_VERSION(val)	FIELD_GET(GENMASK(7, 5), val)
+
 /**
  * struct intel_tpmi_plat_info - Platform information for a TPMI device instance
  * @package_id:	CPU Package id
-- 
cgit v1.2.3


From 968118fcf0546ef74cf306bf8f8c1e06efff10e3 Mon Sep 17 00:00:00 2001
From: Linus Walleij <linus.walleij@linaro.org>
Date: Tue, 12 Sep 2023 10:44:52 +0200
Subject: OMAP/gpio: drop MPUIO static base

The OMAP GPIO driver hardcodes the MPIO chip base, but there
is no point: we have already moved all consumers over to using
descriptor look-ups.

Drop the MPUIO GPIO base and use dynamic assignment.

Root out the unused instances of the OMAP_MPUIO() macro and
delete the unused OMAP_GPIO_IS_MPUIO() macro.

Signed-off-by: Linus Walleij <linus.walleij@linaro.org>
Reviewed-by: Tony Lindgren <tony@atomide.com>
Tested-by: Janusz Krzysztofik <jmkrzyszt@gmail.com>
Signed-off-by: Bartosz Golaszewski <bartosz.golaszewski@linaro.org>
---
 arch/arm/mach-omap1/board-palmte.c      | 5 -----
 drivers/gpio/gpio-omap.c                | 3 +--
 include/linux/platform_data/gpio-omap.h | 3 ---
 3 files changed, 1 insertion(+), 10 deletions(-)

(limited to 'include/linux')

diff --git a/arch/arm/mach-omap1/board-palmte.c b/arch/arm/mach-omap1/board-palmte.c
index 7e061d671fde..c917cb2c6e17 100644
--- a/arch/arm/mach-omap1/board-palmte.c
+++ b/arch/arm/mach-omap1/board-palmte.c
@@ -51,11 +51,6 @@
 #define PALMTE_HDQ_GPIO		11
 #define PALMTE_HEADPHONES_GPIO	14
 #define PALMTE_SPEAKER_GPIO	15
-#define PALMTE_DC_GPIO		OMAP_MPUIO(2)
-#define PALMTE_MMC_SWITCH_GPIO	OMAP_MPUIO(4)
-#define PALMTE_MMC1_GPIO	OMAP_MPUIO(6)
-#define PALMTE_MMC2_GPIO	OMAP_MPUIO(7)
-#define PALMTE_MMC3_GPIO	OMAP_MPUIO(11)
 
 static const unsigned int palmte_keymap[] = {
 	KEY(0, 0, KEY_F1),		/* Calendar */
diff --git a/drivers/gpio/gpio-omap.c b/drivers/gpio/gpio-omap.c
index 8889755e2d03..76d5d87e9681 100644
--- a/drivers/gpio/gpio-omap.c
+++ b/drivers/gpio/gpio-omap.c
@@ -1048,15 +1048,14 @@ static int omap_gpio_chip_init(struct gpio_bank *bank, struct device *pm_dev)
 		bank->chip.label = "mpuio";
 		if (bank->regs->wkup_en)
 			bank->chip.parent = &omap_mpuio_device.dev;
-		bank->chip.base = OMAP_MPUIO(0);
 	} else {
 		label = devm_kasprintf(bank->chip.parent, GFP_KERNEL, "gpio-%d-%d",
 				       gpio, gpio + bank->width - 1);
 		if (!label)
 			return -ENOMEM;
 		bank->chip.label = label;
-		bank->chip.base = -1;
 	}
+	bank->chip.base = -1;
 	bank->chip.ngpio = bank->width;
 
 	irq = &bank->chip.irq;
diff --git a/include/linux/platform_data/gpio-omap.h b/include/linux/platform_data/gpio-omap.h
index f377817ce75c..cdd8cfb424f5 100644
--- a/include/linux/platform_data/gpio-omap.h
+++ b/include/linux/platform_data/gpio-omap.h
@@ -144,9 +144,6 @@
 
 #define OMAP_MAX_GPIO_LINES		192
 
-#define OMAP_MPUIO(nr)		(OMAP_MAX_GPIO_LINES + (nr))
-#define OMAP_GPIO_IS_MPUIO(nr)	((nr) >= OMAP_MAX_GPIO_LINES)
-
 #ifndef __ASSEMBLER__
 struct omap_gpio_reg_offs {
 	u16 revision;
-- 
cgit v1.2.3


From 36aa129f221c9070afd8dff03154ab49702a5b1b Mon Sep 17 00:00:00 2001
From: Bartosz Golaszewski <bartosz.golaszewski@linaro.org>
Date: Wed, 27 Sep 2023 16:29:21 +0200
Subject: gpiolib: make gpio_device_get() and gpio_device_put() public

In order to start migrating away from accessing struct gpio_chip by
users other than their owners, let's first make the reference management
functions for the opaque struct gpio_device public in the driver.h
header.

Signed-off-by: Bartosz Golaszewski <bartosz.golaszewski@linaro.org>
Reviewed-by: Linus Walleij <linus.walleij@linaro.org>
Reviewed-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
---
 drivers/gpio/gpiolib.c      | 24 ++++++++++++++++++++++++
 drivers/gpio/gpiolib.h      | 10 ----------
 include/linux/gpio/driver.h |  3 +++
 3 files changed, 27 insertions(+), 10 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/gpio/gpiolib.c b/drivers/gpio/gpiolib.c
index 7c27a1efc1b0..c9d8f6b57771 100644
--- a/drivers/gpio/gpiolib.c
+++ b/drivers/gpio/gpiolib.c
@@ -1058,6 +1058,30 @@ static struct gpio_chip *find_chip_by_name(const char *name)
 	return gpiochip_find((void *)name, gpiochip_match_name);
 }
 
+/**
+ * gpio_device_get() - Increase the reference count of this GPIO device
+ * @gdev: GPIO device to increase the refcount for
+ *
+ * Returns:
+ * Pointer to @gdev.
+ */
+struct gpio_device *gpio_device_get(struct gpio_device *gdev)
+{
+	return to_gpio_device(get_device(&gdev->dev));
+}
+EXPORT_SYMBOL_GPL(gpio_device_get);
+
+/**
+ * gpio_device_put() - Decrease the reference count of this GPIO device and
+ *                     possibly free all resources associated with it.
+ * @gdev: GPIO device to decrease the reference count for
+ */
+void gpio_device_put(struct gpio_device *gdev)
+{
+	put_device(&gdev->dev);
+}
+EXPORT_SYMBOL_GPL(gpio_device_put);
+
 #ifdef CONFIG_GPIOLIB_IRQCHIP
 
 /*
diff --git a/drivers/gpio/gpiolib.h b/drivers/gpio/gpiolib.h
index 9bff5c2cf720..3ccacf3c1288 100644
--- a/drivers/gpio/gpiolib.h
+++ b/drivers/gpio/gpiolib.h
@@ -86,16 +86,6 @@ static inline struct gpio_device *to_gpio_device(struct device *dev)
 	return container_of(dev, struct gpio_device, dev);
 }
 
-static inline struct gpio_device *gpio_device_get(struct gpio_device *gdev)
-{
-	return to_gpio_device(get_device(&gdev->dev));
-}
-
-static inline void gpio_device_put(struct gpio_device *gdev)
-{
-	put_device(&gdev->dev);
-}
-
 /* gpio suffixes used for ACPI and device tree lookup */
 static __maybe_unused const char * const gpio_suffixes[] = { "gpios", "gpio" };
 
diff --git a/include/linux/gpio/driver.h b/include/linux/gpio/driver.h
index 8f0859ba7065..a2060dc3344b 100644
--- a/include/linux/gpio/driver.h
+++ b/include/linux/gpio/driver.h
@@ -606,6 +606,9 @@ int devm_gpiochip_add_data_with_key(struct device *dev, struct gpio_chip *gc,
 struct gpio_chip *gpiochip_find(void *data,
 				int (*match)(struct gpio_chip *gc, void *data));
 
+struct gpio_device *gpio_device_get(struct gpio_device *gdev);
+void gpio_device_put(struct gpio_device *gdev);
+
 bool gpiochip_line_is_irq(struct gpio_chip *gc, unsigned int offset);
 int gpiochip_reqres_irq(struct gpio_chip *gc, unsigned int offset);
 void gpiochip_relres_irq(struct gpio_chip *gc, unsigned int offset);
-- 
cgit v1.2.3


From 9e4555d1e54a18946d7ca363b9fc8ed1fe7dfde4 Mon Sep 17 00:00:00 2001
From: Bartosz Golaszewski <bartosz.golaszewski@linaro.org>
Date: Wed, 27 Sep 2023 16:29:22 +0200
Subject: gpiolib: add support for scope-based management to gpio_device

As the few users that need to get the reference to the GPIO device often
release it right after inspecting its properties, let's add support for
the automatic reference release to struct gpio_device.

Signed-off-by: Bartosz Golaszewski <bartosz.golaszewski@linaro.org>
Reviewed-by: Linus Walleij <linus.walleij@linaro.org>
Reviewed-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
---
 include/linux/gpio/driver.h | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/gpio/driver.h b/include/linux/gpio/driver.h
index a2060dc3344b..1cedbc3d3200 100644
--- a/include/linux/gpio/driver.h
+++ b/include/linux/gpio/driver.h
@@ -3,6 +3,8 @@
 #define __LINUX_GPIO_DRIVER_H
 
 #include <linux/bits.h>
+#include <linux/cleanup.h>
+#include <linux/err.h>
 #include <linux/irqchip/chained_irq.h>
 #include <linux/irqdomain.h>
 #include <linux/irqhandler.h>
@@ -609,6 +611,9 @@ struct gpio_chip *gpiochip_find(void *data,
 struct gpio_device *gpio_device_get(struct gpio_device *gdev);
 void gpio_device_put(struct gpio_device *gdev);
 
+DEFINE_FREE(gpio_device_put, struct gpio_device *,
+	    if (IS_ERR_OR_NULL(_T)) gpio_device_put(_T));
+
 bool gpiochip_line_is_irq(struct gpio_chip *gc, unsigned int offset);
 int gpiochip_reqres_irq(struct gpio_chip *gc, unsigned int offset);
 void gpiochip_relres_irq(struct gpio_chip *gc, unsigned int offset);
-- 
cgit v1.2.3


From cfe102f63308c8c8e01199a682868a64b83f653e Mon Sep 17 00:00:00 2001
From: Bartosz Golaszewski <bartosz.golaszewski@linaro.org>
Date: Wed, 27 Sep 2023 16:29:23 +0200
Subject: gpiolib: provide gpio_device_find()

gpiochip_find() is wrong and its kernel doc is misleading as the
function doesn't return a reference to the gpio_chip but just a raw
pointer. The chip itself is not guaranteed to stay alive, in fact it can
be deleted at any point. Also: other than GPIO drivers themselves,
nobody else has any business accessing gpio_chip structs.

Provide a new gpio_device_find() function that returns a real reference
to the opaque gpio_device structure that is guaranteed to stay alive for
as long as there are active users of it.

Signed-off-by: Bartosz Golaszewski <bartosz.golaszewski@linaro.org>
Reviewed-by: Linus Walleij <linus.walleij@linaro.org>
---
 drivers/gpio/gpiolib.c      | 71 +++++++++++++++++++++++++++++++++------------
 include/linux/gpio/driver.h |  3 ++
 2 files changed, 56 insertions(+), 18 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/gpio/gpiolib.c b/drivers/gpio/gpiolib.c
index c9d8f6b57771..d5bdf9cebb29 100644
--- a/drivers/gpio/gpiolib.c
+++ b/drivers/gpio/gpiolib.c
@@ -1014,16 +1014,10 @@ void gpiochip_remove(struct gpio_chip *gc)
 }
 EXPORT_SYMBOL_GPL(gpiochip_remove);
 
-/**
- * gpiochip_find() - iterator for locating a specific gpio_chip
- * @data: data to pass to match function
- * @match: Callback function to check gpio_chip
+/*
+ * FIXME: This will be removed soon.
  *
- * Similar to bus_find_device.  It returns a reference to a gpio_chip as
- * determined by a user supplied @match callback.  The callback should return
- * 0 if the device doesn't match and non-zero if it does.  If the callback is
- * non-zero, this function will return to the caller and not iterate over any
- * more gpio_chips.
+ * This function is depracated, don't use.
  */
 struct gpio_chip *gpiochip_find(void *data,
 				int (*match)(struct gpio_chip *gc,
@@ -1031,21 +1025,62 @@ struct gpio_chip *gpiochip_find(void *data,
 {
 	struct gpio_device *gdev;
 	struct gpio_chip *gc = NULL;
-	unsigned long flags;
-
-	spin_lock_irqsave(&gpio_lock, flags);
-	list_for_each_entry(gdev, &gpio_devices, list)
-		if (gdev->chip && match(gdev->chip, data)) {
-			gc = gdev->chip;
-			break;
-		}
 
-	spin_unlock_irqrestore(&gpio_lock, flags);
+	gdev = gpio_device_find(data, match);
+	if (gdev) {
+		gc = gdev->chip;
+		gpio_device_put(gdev);
+	}
 
 	return gc;
 }
 EXPORT_SYMBOL_GPL(gpiochip_find);
 
+/**
+ * gpio_device_find() - find a specific GPIO device
+ * @data: data to pass to match function
+ * @match: Callback function to check gpio_chip
+ *
+ * Returns:
+ * New reference to struct gpio_device.
+ *
+ * Similar to bus_find_device(). It returns a reference to a gpio_device as
+ * determined by a user supplied @match callback. The callback should return
+ * 0 if the device doesn't match and non-zero if it does. If the callback
+ * returns non-zero, this function will return to the caller and not iterate
+ * over any more gpio_devices.
+ *
+ * The callback takes the GPIO chip structure as argument. During the execution
+ * of the callback function the chip is protected from being freed. TODO: This
+ * actually has yet to be implemented.
+ *
+ * If the function returns non-NULL, the returned reference must be freed by
+ * the caller using gpio_device_put().
+ */
+struct gpio_device *gpio_device_find(void *data,
+				     int (*match)(struct gpio_chip *gc,
+						  void *data))
+{
+	struct gpio_device *gdev;
+
+	/*
+	 * Not yet but in the future the spinlock below will become a mutex.
+	 * Annotate this function before anyone tries to use it in interrupt
+	 * context like it happened with gpiochip_find().
+	 */
+	might_sleep();
+
+	guard(spinlock_irqsave)(&gpio_lock);
+
+	list_for_each_entry(gdev, &gpio_devices, list) {
+		if (gdev->chip && match(gdev->chip, data))
+			return gpio_device_get(gdev);
+	}
+
+	return NULL;
+}
+EXPORT_SYMBOL_GPL(gpio_device_find);
+
 static int gpiochip_match_name(struct gpio_chip *gc, void *data)
 {
 	const char *name = data;
diff --git a/include/linux/gpio/driver.h b/include/linux/gpio/driver.h
index 1cedbc3d3200..6ad1f1a8ef2e 100644
--- a/include/linux/gpio/driver.h
+++ b/include/linux/gpio/driver.h
@@ -608,6 +608,9 @@ int devm_gpiochip_add_data_with_key(struct device *dev, struct gpio_chip *gc,
 struct gpio_chip *gpiochip_find(void *data,
 				int (*match)(struct gpio_chip *gc, void *data));
 
+struct gpio_device *gpio_device_find(void *data,
+				int (*match)(struct gpio_chip *gc, void *data));
+
 struct gpio_device *gpio_device_get(struct gpio_device *gdev);
 void gpio_device_put(struct gpio_device *gdev);
 
-- 
cgit v1.2.3


From d62fcd9f1897d587f8f9db3f77ccae8797a44b5d Mon Sep 17 00:00:00 2001
From: Bartosz Golaszewski <bartosz.golaszewski@linaro.org>
Date: Wed, 27 Sep 2023 16:29:24 +0200
Subject: gpiolib: provide gpio_device_find_by_label()

By far the most common way of looking up GPIO devices is using their
label. Provide a helpers for that to avoid every user implementing their
own matching function.

Signed-off-by: Bartosz Golaszewski <bartosz.golaszewski@linaro.org>
Reviewed-by: Linus Walleij <linus.walleij@linaro.org>
---
 drivers/gpio/gpiolib.c      | 21 +++++++++++++++++++++
 include/linux/gpio/driver.h |  1 +
 2 files changed, 22 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/gpio/gpiolib.c b/drivers/gpio/gpiolib.c
index d5bdf9cebb29..0a41f6ee8d33 100644
--- a/drivers/gpio/gpiolib.c
+++ b/drivers/gpio/gpiolib.c
@@ -20,6 +20,7 @@
 #include <linux/seq_file.h>
 #include <linux/slab.h>
 #include <linux/spinlock.h>
+#include <linux/string.h>
 
 #include <linux/gpio.h>
 #include <linux/gpio/driver.h>
@@ -1081,6 +1082,26 @@ struct gpio_device *gpio_device_find(void *data,
 }
 EXPORT_SYMBOL_GPL(gpio_device_find);
 
+static int gpio_chip_match_by_label(struct gpio_chip *gc, void *label)
+{
+	return gc->label && !strcmp(gc->label, label);
+}
+
+/**
+ * gpio_device_find_by_label() - wrapper around gpio_device_find() finding the
+ *                               GPIO device by its backing chip's label
+ * @label: Label to lookup
+ *
+ * Returns:
+ * Reference to the GPIO device or NULL. Reference must be released with
+ * gpio_device_put().
+ */
+struct gpio_device *gpio_device_find_by_label(const char *label)
+{
+	return gpio_device_find((void *)label, gpio_chip_match_by_label);
+}
+EXPORT_SYMBOL_GPL(gpio_device_find_by_label);
+
 static int gpiochip_match_name(struct gpio_chip *gc, void *data)
 {
 	const char *name = data;
diff --git a/include/linux/gpio/driver.h b/include/linux/gpio/driver.h
index 6ad1f1a8ef2e..24996cba6465 100644
--- a/include/linux/gpio/driver.h
+++ b/include/linux/gpio/driver.h
@@ -610,6 +610,7 @@ struct gpio_chip *gpiochip_find(void *data,
 
 struct gpio_device *gpio_device_find(void *data,
 				int (*match)(struct gpio_chip *gc, void *data));
+struct gpio_device *gpio_device_find_by_label(const char *label);
 
 struct gpio_device *gpio_device_get(struct gpio_device *gdev);
 void gpio_device_put(struct gpio_device *gdev);
-- 
cgit v1.2.3


From 93548f8bbbbf5b62dbd37c8f61a037a06666787b Mon Sep 17 00:00:00 2001
From: Bartosz Golaszewski <bartosz.golaszewski@linaro.org>
Date: Wed, 27 Sep 2023 16:29:25 +0200
Subject: gpiolib: provide gpio_device_get_desc()

Getting the GPIO descriptor directly from the gpio_chip struct is
dangerous as we don't take the reference to the underlying GPIO device.
In order to start working towards removing gpiochip_get_desc(), let's
provide a safer variant that works with an existing reference to struct
gpio_device.

Signed-off-by: Bartosz Golaszewski <bartosz.golaszewski@linaro.org>
Reviewed-by: Linus Walleij <linus.walleij@linaro.org>
---
 drivers/gpio/gpiolib.c      | 40 +++++++++++++++++++++++++++++++---------
 include/linux/gpio/driver.h |  2 ++
 2 files changed, 33 insertions(+), 9 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/gpio/gpiolib.c b/drivers/gpio/gpiolib.c
index 0a41f6ee8d33..f4fdf620ca74 100644
--- a/drivers/gpio/gpiolib.c
+++ b/drivers/gpio/gpiolib.c
@@ -147,27 +147,49 @@ struct gpio_desc *gpio_to_desc(unsigned gpio)
 }
 EXPORT_SYMBOL_GPL(gpio_to_desc);
 
+/* This function is deprecated and will be removed soon, don't use. */
+struct gpio_desc *gpiochip_get_desc(struct gpio_chip *gc,
+				    unsigned int hwnum)
+{
+	return gpio_device_get_desc(gc->gpiodev, hwnum);
+}
+EXPORT_SYMBOL_GPL(gpiochip_get_desc);
+
 /**
- * gpiochip_get_desc - get the GPIO descriptor corresponding to the given
- *                     hardware number for this chip
- * @gc: GPIO chip
+ * gpio_device_get_desc() - get the GPIO descriptor corresponding to the given
+ *                          hardware number for this GPIO device
+ * @gdev: GPIO device to get the descriptor from
  * @hwnum: hardware number of the GPIO for this chip
  *
  * Returns:
- * A pointer to the GPIO descriptor or ``ERR_PTR(-EINVAL)`` if no GPIO exists
- * in the given chip for the specified hardware number.
+ * A pointer to the GPIO descriptor or %EINVAL if no GPIO exists in the given
+ * chip for the specified hardware number or %ENODEV if the underlying chip
+ * already vanished.
+ *
+ * The reference count of struct gpio_device is *NOT* increased like when the
+ * GPIO is being requested for exclusive usage. It's up to the caller to make
+ * sure the GPIO device will stay alive together with the descriptor returned
+ * by this function.
  */
-struct gpio_desc *gpiochip_get_desc(struct gpio_chip *gc,
-				    unsigned int hwnum)
+struct gpio_desc *
+gpio_device_get_desc(struct gpio_device *gdev, unsigned int hwnum)
 {
-	struct gpio_device *gdev = gc->gpiodev;
+	struct gpio_chip *gc;
+
+	/*
+	 * FIXME: This will be locked once we protect gdev->chip everywhere
+	 * with SRCU.
+	 */
+	gc = gdev->chip;
+	if (!gc)
+		return ERR_PTR(-ENODEV);
 
 	if (hwnum >= gdev->ngpio)
 		return ERR_PTR(-EINVAL);
 
 	return &gdev->descs[hwnum];
 }
-EXPORT_SYMBOL_GPL(gpiochip_get_desc);
+EXPORT_SYMBOL_GPL(gpio_device_get_desc);
 
 /**
  * desc_to_gpio - convert a GPIO descriptor to the integer namespace
diff --git a/include/linux/gpio/driver.h b/include/linux/gpio/driver.h
index 24996cba6465..3fdf3f14bb13 100644
--- a/include/linux/gpio/driver.h
+++ b/include/linux/gpio/driver.h
@@ -770,6 +770,8 @@ struct gpio_desc *gpiochip_request_own_desc(struct gpio_chip *gc,
 void gpiochip_free_own_desc(struct gpio_desc *desc);
 
 struct gpio_desc *gpiochip_get_desc(struct gpio_chip *gc, unsigned int hwnum);
+struct gpio_desc *
+gpio_device_get_desc(struct gpio_device *gdev, unsigned int hwnum);
 
 #ifdef CONFIG_GPIOLIB
 
-- 
cgit v1.2.3


From 9b418780844c677669ab474f6f29940ef545c954 Mon Sep 17 00:00:00 2001
From: Bartosz Golaszewski <bartosz.golaszewski@linaro.org>
Date: Wed, 27 Sep 2023 16:29:26 +0200
Subject: gpiolib: reluctantly provide gpio_device_get_chip()

The process of converting all unauthorized users of struct gpio_chip to
using dedicated struct gpio_device function will be long so in the
meantime we must provide a way of retrieving the pointer to struct
gpio_chip from a GPIO device.

Signed-off-by: Bartosz Golaszewski <bartosz.golaszewski@linaro.org>
Reviewed-by: Linus Walleij <linus.walleij@linaro.org>
---
 drivers/gpio/gpiolib.c      | 21 +++++++++++++++++++++
 include/linux/gpio/driver.h |  2 ++
 2 files changed, 23 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/gpio/gpiolib.c b/drivers/gpio/gpiolib.c
index f4fdf620ca74..6bb8d4a0b1d4 100644
--- a/drivers/gpio/gpiolib.c
+++ b/drivers/gpio/gpiolib.c
@@ -220,6 +220,27 @@ struct gpio_chip *gpiod_to_chip(const struct gpio_desc *desc)
 }
 EXPORT_SYMBOL_GPL(gpiod_to_chip);
 
+/**
+ * gpio_device_get_chip() - Get the gpio_chip implementation of this GPIO device
+ * @gdev: GPIO device
+ *
+ * Returns:
+ * Address of the GPIO chip backing this device.
+ *
+ * Until we can get rid of all non-driver users of struct gpio_chip, we must
+ * provide a way of retrieving the pointer to it from struct gpio_device. This
+ * is *NOT* safe as the GPIO API is considered to be hot-unpluggable and the
+ * chip can dissapear at any moment (unlike reference-counted struct
+ * gpio_device).
+ *
+ * Use at your own risk.
+ */
+struct gpio_chip *gpio_device_get_chip(struct gpio_device *gdev)
+{
+	return gdev->chip;
+}
+EXPORT_SYMBOL_GPL(gpio_device_get_chip);
+
 /* dynamic allocation of GPIOs, e.g. on a hotplugged device */
 static int gpiochip_find_base(int ngpio)
 {
diff --git a/include/linux/gpio/driver.h b/include/linux/gpio/driver.h
index 3fdf3f14bb13..f8ad7f40100c 100644
--- a/include/linux/gpio/driver.h
+++ b/include/linux/gpio/driver.h
@@ -773,6 +773,8 @@ struct gpio_desc *gpiochip_get_desc(struct gpio_chip *gc, unsigned int hwnum);
 struct gpio_desc *
 gpio_device_get_desc(struct gpio_device *gdev, unsigned int hwnum);
 
+struct gpio_chip *gpio_device_get_chip(struct gpio_device *gdev);
+
 #ifdef CONFIG_GPIOLIB
 
 /* lock/unlock as IRQ */
-- 
cgit v1.2.3


From 8e56b063c86569e51eed1c5681ce6361fa97fc7a Mon Sep 17 00:00:00 2001
From: Xin Long <lucien.xin@gmail.com>
Date: Tue, 3 Oct 2023 13:17:53 -0400
Subject: netfilter: handle the connecting collision properly in
 nf_conntrack_proto_sctp

In Scenario A and B below, as the delayed INIT_ACK always changes the peer
vtag, SCTP ct with the incorrect vtag may cause packet loss.

Scenario A: INIT_ACK is delayed until the peer receives its own INIT_ACK

  192.168.1.2 > 192.168.1.1: [INIT] [init tag: 1328086772]
    192.168.1.1 > 192.168.1.2: [INIT] [init tag: 1414468151]
    192.168.1.2 > 192.168.1.1: [INIT ACK] [init tag: 1328086772]
  192.168.1.1 > 192.168.1.2: [INIT ACK] [init tag: 1650211246] *
  192.168.1.2 > 192.168.1.1: [COOKIE ECHO]
    192.168.1.1 > 192.168.1.2: [COOKIE ECHO]
    192.168.1.2 > 192.168.1.1: [COOKIE ACK]

Scenario B: INIT_ACK is delayed until the peer completes its own handshake

  192.168.1.2 > 192.168.1.1: sctp (1) [INIT] [init tag: 3922216408]
    192.168.1.1 > 192.168.1.2: sctp (1) [INIT] [init tag: 144230885]
    192.168.1.2 > 192.168.1.1: sctp (1) [INIT ACK] [init tag: 3922216408]
    192.168.1.1 > 192.168.1.2: sctp (1) [COOKIE ECHO]
    192.168.1.2 > 192.168.1.1: sctp (1) [COOKIE ACK]
  192.168.1.1 > 192.168.1.2: sctp (1) [INIT ACK] [init tag: 3914796021] *

This patch fixes it as below:

In SCTP_CID_INIT processing:
- clear ct->proto.sctp.init[!dir] if ct->proto.sctp.init[dir] &&
  ct->proto.sctp.init[!dir]. (Scenario E)
- set ct->proto.sctp.init[dir].

In SCTP_CID_INIT_ACK processing:
- drop it if !ct->proto.sctp.init[!dir] && ct->proto.sctp.vtag[!dir] &&
  ct->proto.sctp.vtag[!dir] != ih->init_tag. (Scenario B, Scenario C)
- drop it if ct->proto.sctp.init[dir] && ct->proto.sctp.init[!dir] &&
  ct->proto.sctp.vtag[!dir] != ih->init_tag. (Scenario A)

In SCTP_CID_COOKIE_ACK processing:
- clear ct->proto.sctp.init[dir] and ct->proto.sctp.init[!dir].
  (Scenario D)

Also, it's important to allow the ct state to move forward with cookie_echo
and cookie_ack from the opposite dir for the collision scenarios.

There are also other Scenarios where it should allow the packet through,
addressed by the processing above:

Scenario C: new CT is created by INIT_ACK.

Scenario D: start INIT on the existing ESTABLISHED ct.

Scenario E: start INIT after the old collision on the existing ESTABLISHED
ct.

  192.168.1.2 > 192.168.1.1: sctp (1) [INIT] [init tag: 3922216408]
  192.168.1.1 > 192.168.1.2: sctp (1) [INIT] [init tag: 144230885]
  (both side are stopped, then start new connection again in hours)
  192.168.1.2 > 192.168.1.1: sctp (1) [INIT] [init tag: 242308742]

Fixes: 9fb9cbb1082d ("[NETFILTER]: Add nf_conntrack subsystem.")
Signed-off-by: Xin Long <lucien.xin@gmail.com>
Signed-off-by: Florian Westphal <fw@strlen.de>
---
 include/linux/netfilter/nf_conntrack_sctp.h |  1 +
 net/netfilter/nf_conntrack_proto_sctp.c     | 43 ++++++++++++++++++++++-------
 2 files changed, 34 insertions(+), 10 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/netfilter/nf_conntrack_sctp.h b/include/linux/netfilter/nf_conntrack_sctp.h
index 625f491b95de..fb31312825ae 100644
--- a/include/linux/netfilter/nf_conntrack_sctp.h
+++ b/include/linux/netfilter/nf_conntrack_sctp.h
@@ -9,6 +9,7 @@ struct ip_ct_sctp {
 	enum sctp_conntrack state;
 
 	__be32 vtag[IP_CT_DIR_MAX];
+	u8 init[IP_CT_DIR_MAX];
 	u8 last_dir;
 	u8 flags;
 };
diff --git a/net/netfilter/nf_conntrack_proto_sctp.c b/net/netfilter/nf_conntrack_proto_sctp.c
index b6bcc8f2f46b..c6bd533983c1 100644
--- a/net/netfilter/nf_conntrack_proto_sctp.c
+++ b/net/netfilter/nf_conntrack_proto_sctp.c
@@ -112,7 +112,7 @@ static const u8 sctp_conntracks[2][11][SCTP_CONNTRACK_MAX] = {
 /* shutdown_ack */ {sSA, sCL, sCW, sCE, sES, sSA, sSA, sSA, sSA},
 /* error        */ {sCL, sCL, sCW, sCE, sES, sSS, sSR, sSA, sCL},/* Can't have Stale cookie*/
 /* cookie_echo  */ {sCL, sCL, sCE, sCE, sES, sSS, sSR, sSA, sCL},/* 5.2.4 - Big TODO */
-/* cookie_ack   */ {sCL, sCL, sCW, sCE, sES, sSS, sSR, sSA, sCL},/* Can't come in orig dir */
+/* cookie_ack   */ {sCL, sCL, sCW, sES, sES, sSS, sSR, sSA, sCL},/* Can't come in orig dir */
 /* shutdown_comp*/ {sCL, sCL, sCW, sCE, sES, sSS, sSR, sCL, sCL},
 /* heartbeat    */ {sHS, sCL, sCW, sCE, sES, sSS, sSR, sSA, sHS},
 /* heartbeat_ack*/ {sCL, sCL, sCW, sCE, sES, sSS, sSR, sSA, sHS},
@@ -126,7 +126,7 @@ static const u8 sctp_conntracks[2][11][SCTP_CONNTRACK_MAX] = {
 /* shutdown     */ {sIV, sCL, sCW, sCE, sSR, sSS, sSR, sSA, sIV},
 /* shutdown_ack */ {sIV, sCL, sCW, sCE, sES, sSA, sSA, sSA, sIV},
 /* error        */ {sIV, sCL, sCW, sCL, sES, sSS, sSR, sSA, sIV},
-/* cookie_echo  */ {sIV, sCL, sCW, sCE, sES, sSS, sSR, sSA, sIV},/* Can't come in reply dir */
+/* cookie_echo  */ {sIV, sCL, sCE, sCE, sES, sSS, sSR, sSA, sIV},/* Can't come in reply dir */
 /* cookie_ack   */ {sIV, sCL, sCW, sES, sES, sSS, sSR, sSA, sIV},
 /* shutdown_comp*/ {sIV, sCL, sCW, sCE, sES, sSS, sSR, sCL, sIV},
 /* heartbeat    */ {sIV, sCL, sCW, sCE, sES, sSS, sSR, sSA, sHS},
@@ -412,6 +412,9 @@ int nf_conntrack_sctp_packet(struct nf_conn *ct,
 			/* (D) vtag must be same as init_vtag as found in INIT_ACK */
 			if (sh->vtag != ct->proto.sctp.vtag[dir])
 				goto out_unlock;
+		} else if (sch->type == SCTP_CID_COOKIE_ACK) {
+			ct->proto.sctp.init[dir] = 0;
+			ct->proto.sctp.init[!dir] = 0;
 		} else if (sch->type == SCTP_CID_HEARTBEAT) {
 			if (ct->proto.sctp.vtag[dir] == 0) {
 				pr_debug("Setting %d vtag %x for dir %d\n", sch->type, sh->vtag, dir);
@@ -461,16 +464,18 @@ int nf_conntrack_sctp_packet(struct nf_conn *ct,
 		}
 
 		/* If it is an INIT or an INIT ACK note down the vtag */
-		if (sch->type == SCTP_CID_INIT ||
-		    sch->type == SCTP_CID_INIT_ACK) {
-			struct sctp_inithdr _inithdr, *ih;
+		if (sch->type == SCTP_CID_INIT) {
+			struct sctp_inithdr _ih, *ih;
 
-			ih = skb_header_pointer(skb, offset + sizeof(_sch),
-						sizeof(_inithdr), &_inithdr);
-			if (ih == NULL)
+			ih = skb_header_pointer(skb, offset + sizeof(_sch), sizeof(*ih), &_ih);
+			if (!ih)
 				goto out_unlock;
-			pr_debug("Setting vtag %x for dir %d\n",
-				 ih->init_tag, !dir);
+
+			if (ct->proto.sctp.init[dir] && ct->proto.sctp.init[!dir])
+				ct->proto.sctp.init[!dir] = 0;
+			ct->proto.sctp.init[dir] = 1;
+
+			pr_debug("Setting vtag %x for dir %d\n", ih->init_tag, !dir);
 			ct->proto.sctp.vtag[!dir] = ih->init_tag;
 
 			/* don't renew timeout on init retransmit so
@@ -481,6 +486,24 @@ int nf_conntrack_sctp_packet(struct nf_conn *ct,
 			    old_state == SCTP_CONNTRACK_CLOSED &&
 			    nf_ct_is_confirmed(ct))
 				ignore = true;
+		} else if (sch->type == SCTP_CID_INIT_ACK) {
+			struct sctp_inithdr _ih, *ih;
+			__be32 vtag;
+
+			ih = skb_header_pointer(skb, offset + sizeof(_sch), sizeof(*ih), &_ih);
+			if (!ih)
+				goto out_unlock;
+
+			vtag = ct->proto.sctp.vtag[!dir];
+			if (!ct->proto.sctp.init[!dir] && vtag && vtag != ih->init_tag)
+				goto out_unlock;
+			/* collision */
+			if (ct->proto.sctp.init[dir] && ct->proto.sctp.init[!dir] &&
+			    vtag != ih->init_tag)
+				goto out_unlock;
+
+			pr_debug("Setting vtag %x for dir %d\n", ih->init_tag, !dir);
+			ct->proto.sctp.vtag[!dir] = ih->init_tag;
 		}
 
 		ct->proto.sctp.state = new_state;
-- 
cgit v1.2.3


From 09361abc346102c505657db4f3ae19ed70e1b703 Mon Sep 17 00:00:00 2001
From: Yue Haibing <yuehaibing@huawei.com>
Date: Wed, 9 Aug 2023 16:00:12 +0800
Subject: dmaengine: Remove unused declaration dma_chan_cleanup()

Commit f27c580c3628 ("dmaengine: remove 'bigref' infrastructure")
removed the implementation but left declaration in place. Remove it.

Signed-off-by: Yue Haibing <yuehaibing@huawei.com>
Reviewed-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Link: https://lore.kernel.org/r/20230809080012.22000-1-yuehaibing@huawei.com
Signed-off-by: Vinod Koul <vkoul@kernel.org>
---
 include/linux/dmaengine.h | 2 --
 1 file changed, 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/dmaengine.h b/include/linux/dmaengine.h
index c3656e590213..3df70d6131c8 100644
--- a/include/linux/dmaengine.h
+++ b/include/linux/dmaengine.h
@@ -517,8 +517,6 @@ static inline const char *dma_chan_name(struct dma_chan *chan)
 	return dev_name(&chan->dev->device);
 }
 
-void dma_chan_cleanup(struct kref *kref);
-
 /**
  * typedef dma_filter_fn - callback filter for dma_request_channel
  * @chan: channel to be reviewed
-- 
cgit v1.2.3


From 4785aa8028536c2be656d22c74ec1995b97056f3 Mon Sep 17 00:00:00 2001
From: Oza Pawandeep <quic_poza@quicinc.com>
Date: Tue, 3 Oct 2023 10:33:33 -0700
Subject: cpuidle, ACPI: Evaluate LPI arch_flags for broadcast timer
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Arm® Functional Fixed Hardware Specification defines LPI states,
which provide an architectural context loss flags field that can
be used to describe the context that might be lost when an LPI
state is entered.

- Core context Lost
        - General purpose registers.
        - Floating point and SIMD registers.
        - System registers, include the System register based
        - generic timer for the core.
        - Debug register in the core power domain.
        - PMU registers in the core power domain.
        - Trace register in the core power domain.
- Trace context loss
- GICR
- GICD

Qualcomm's custom CPUs preserves the architectural state,
including keeping the power domain for local timers active.
when core is power gated, the local timers are sufficient to
wake the core up without needing broadcast timer.

The patch fixes the evaluation of cpuidle arch_flags, and moves only to
broadcast timer if core context lost is defined in ACPI LPI.

Fixes: a36a7fecfe60 ("ACPI / processor_idle: Add support for Low Power Idle(LPI) states")
Reviewed-by: Sudeep Holla <sudeep.holla@arm.com>
Acked-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Signed-off-by: Oza Pawandeep <quic_poza@quicinc.com>
Link: https://lore.kernel.org/r/20231003173333.2865323-1-quic_poza@quicinc.com
Signed-off-by: Will Deacon <will@kernel.org>
---
 arch/arm64/include/asm/acpi.h | 19 +++++++++++++++++++
 drivers/acpi/processor_idle.c |  3 +--
 include/linux/acpi.h          |  9 +++++++++
 3 files changed, 29 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/arch/arm64/include/asm/acpi.h b/arch/arm64/include/asm/acpi.h
index 4d537d56eb84..6792a1f83f2a 100644
--- a/arch/arm64/include/asm/acpi.h
+++ b/arch/arm64/include/asm/acpi.h
@@ -9,6 +9,7 @@
 #ifndef _ASM_ACPI_H
 #define _ASM_ACPI_H
 
+#include <linux/cpuidle.h>
 #include <linux/efi.h>
 #include <linux/memblock.h>
 #include <linux/psci.h>
@@ -44,6 +45,24 @@
 
 #define ACPI_MADT_GICC_TRBE  (offsetof(struct acpi_madt_generic_interrupt, \
 	trbe_interrupt) + sizeof(u16))
+/*
+ * Arm® Functional Fixed Hardware Specification Version 1.2.
+ * Table 2: Arm Architecture context loss flags
+ */
+#define CPUIDLE_CORE_CTXT		BIT(0) /* Core context Lost */
+
+static inline unsigned int arch_get_idle_state_flags(u32 arch_flags)
+{
+	if (arch_flags & CPUIDLE_CORE_CTXT)
+		return CPUIDLE_FLAG_TIMER_STOP;
+
+	return 0;
+}
+#define arch_get_idle_state_flags arch_get_idle_state_flags
+
+#define CPUIDLE_TRACE_CTXT		BIT(1) /* Trace context loss */
+#define CPUIDLE_GICR_CTXT		BIT(2) /* GICR */
+#define CPUIDLE_GICD_CTXT		BIT(3) /* GICD */
 
 /* Basic configuration for ACPI */
 #ifdef	CONFIG_ACPI
diff --git a/drivers/acpi/processor_idle.c b/drivers/acpi/processor_idle.c
index dc615ef6550a..3a34a8c425fe 100644
--- a/drivers/acpi/processor_idle.c
+++ b/drivers/acpi/processor_idle.c
@@ -1217,8 +1217,7 @@ static int acpi_processor_setup_lpi_states(struct acpi_processor *pr)
 		strscpy(state->desc, lpi->desc, CPUIDLE_DESC_LEN);
 		state->exit_latency = lpi->wake_latency;
 		state->target_residency = lpi->min_residency;
-		if (lpi->arch_flags)
-			state->flags |= CPUIDLE_FLAG_TIMER_STOP;
+		state->flags |= arch_get_idle_state_flags(lpi->arch_flags);
 		if (i != 0 && lpi->entry_method == ACPI_CSTATE_FFH)
 			state->flags |= CPUIDLE_FLAG_RCU_IDLE;
 		state->enter = acpi_idle_lpi_enter;
diff --git a/include/linux/acpi.h b/include/linux/acpi.h
index a73246c3c35e..afd94c9b8b8a 100644
--- a/include/linux/acpi.h
+++ b/include/linux/acpi.h
@@ -1480,6 +1480,15 @@ static inline int lpit_read_residency_count_address(u64 *address)
 }
 #endif
 
+#ifdef CONFIG_ACPI_PROCESSOR_IDLE
+#ifndef arch_get_idle_state_flags
+static inline unsigned int arch_get_idle_state_flags(u32 arch_flags)
+{
+	return 0;
+}
+#endif
+#endif /* CONFIG_ACPI_PROCESSOR_IDLE */
+
 #ifdef CONFIG_ACPI_PPTT
 int acpi_pptt_cpu_is_thread(unsigned int cpu);
 int find_acpi_cpu_topology(unsigned int cpu, int level);
-- 
cgit v1.2.3


From c964c1f5ee96e1460606d44f80a47bdacd8fe568 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <frederic@kernel.org>
Date: Fri, 8 Sep 2023 22:35:59 +0200
Subject: rcu: Assume rcu_report_dead() is always called locally

rcu_report_dead() has to be called locally by the CPU that is going to
exit the RCU state machine. Passing a cpu argument here is error-prone
and leaves the possibility for a racy remote call.

Use local access instead.

Reviewed-by: Paul E. McKenney <paulmck@kernel.org>
Signed-off-by: Frederic Weisbecker <frederic@kernel.org>
---
 arch/arm64/kernel/smp.c  | 2 +-
 include/linux/rcupdate.h | 2 +-
 kernel/cpu.c             | 2 +-
 kernel/rcu/tree.c        | 4 ++--
 4 files changed, 5 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/arch/arm64/kernel/smp.c b/arch/arm64/kernel/smp.c
index 960b98b43506..8fa646c90c67 100644
--- a/arch/arm64/kernel/smp.c
+++ b/arch/arm64/kernel/smp.c
@@ -401,7 +401,7 @@ void __noreturn cpu_die_early(void)
 
 	/* Mark this CPU absent */
 	set_cpu_present(cpu, 0);
-	rcu_report_dead(cpu);
+	rcu_report_dead();
 
 	if (IS_ENABLED(CONFIG_HOTPLUG_CPU)) {
 		update_cpu_boot_status(CPU_KILL_ME);
diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h
index 5e5f920ade90..aa351ddcbe8d 100644
--- a/include/linux/rcupdate.h
+++ b/include/linux/rcupdate.h
@@ -122,7 +122,7 @@ static inline void call_rcu_hurry(struct rcu_head *head, rcu_callback_t func)
 void rcu_init(void);
 extern int rcu_scheduler_active;
 void rcu_sched_clock_irq(int user);
-void rcu_report_dead(unsigned int cpu);
+void rcu_report_dead(void);
 void rcutree_migrate_callbacks(int cpu);
 
 #ifdef CONFIG_TASKS_RCU_GENERIC
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 6de7c6bb74ee..076e75fed8bb 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -1388,7 +1388,7 @@ void cpuhp_report_idle_dead(void)
 	struct cpuhp_cpu_state *st = this_cpu_ptr(&cpuhp_state);
 
 	BUG_ON(st->state != CPUHP_AP_OFFLINE);
-	rcu_report_dead(smp_processor_id());
+	rcu_report_dead();
 	st->state = CPUHP_AP_IDLE_DEAD;
 	/*
 	 * We cannot call complete after rcu_report_dead() so we delegate it
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 8c2954502e55..2e1e7eadf2cc 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -4560,11 +4560,11 @@ void rcu_cpu_starting(unsigned int cpu)
  * from the outgoing CPU rather than from the cpuhp_step mechanism.
  * This is because this function must be invoked at a precise location.
  */
-void rcu_report_dead(unsigned int cpu)
+void rcu_report_dead(void)
 {
 	unsigned long flags;
 	unsigned long mask;
-	struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu);
+	struct rcu_data *rdp = this_cpu_ptr(&rcu_data);
 	struct rcu_node *rnp = rdp->mynode;  /* Outgoing CPU's rdp & rnp. */
 
 	/*
-- 
cgit v1.2.3


From bc0c3357601e3ff1b006600530079bd246ef0d82 Mon Sep 17 00:00:00 2001
From: Mateusz Guzik <mjguzik@gmail.com>
Date: Wed, 23 Aug 2023 19:05:56 +0200
Subject: mm: remove remnants of SPLIT_RSS_COUNTING

The feature got retired in f1a7941243c1 ("mm: convert mm's rss stats into
percpu_counter"), but the patch failed to fully clean it up.

Link: https://lkml.kernel.org/r/20230823170556.2281747-1-mjguzik@gmail.com
Signed-off-by: Mateusz Guzik <mjguzik@gmail.com>
Acked-by: Shakeel Butt <shakeelb@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 fs/exec.c          | 2 --
 include/linux/mm.h | 8 --------
 kernel/exit.c      | 4 ----
 kernel/fork.c      | 4 ----
 kernel/kthread.c   | 1 -
 mm/madvise.c       | 5 +----
 mm/memory.c        | 2 --
 7 files changed, 1 insertion(+), 25 deletions(-)

(limited to 'include/linux')

diff --git a/fs/exec.c b/fs/exec.c
index 6518e33ea813..eb039041ad6a 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -986,8 +986,6 @@ static int exec_mmap(struct mm_struct *mm)
 	tsk = current;
 	old_mm = current->mm;
 	exec_mm_release(tsk, old_mm);
-	if (old_mm)
-		sync_mm_rss(old_mm);
 
 	ret = down_write_killable(&tsk->signal->exec_update_lock);
 	if (ret)
diff --git a/include/linux/mm.h b/include/linux/mm.h
index bf5d0b1b16f4..7613150acab9 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -2628,14 +2628,6 @@ static inline void setmax_mm_hiwater_rss(unsigned long *maxrss,
 		*maxrss = hiwater_rss;
 }
 
-#if defined(SPLIT_RSS_COUNTING)
-void sync_mm_rss(struct mm_struct *mm);
-#else
-static inline void sync_mm_rss(struct mm_struct *mm)
-{
-}
-#endif
-
 #ifndef CONFIG_ARCH_HAS_PTE_SPECIAL
 static inline int pte_special(pte_t pte)
 {
diff --git a/kernel/exit.c b/kernel/exit.c
index edb50b4c9972..3cdbe797008f 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -539,7 +539,6 @@ static void exit_mm(void)
 	exit_mm_release(current, mm);
 	if (!mm)
 		return;
-	sync_mm_rss(mm);
 	mmap_read_lock(mm);
 	mmgrab_lazy_tlb(mm);
 	BUG_ON(mm != current->active_mm);
@@ -829,9 +828,6 @@ void __noreturn do_exit(long code)
 	io_uring_files_cancel();
 	exit_signals(tsk);  /* sets PF_EXITING */
 
-	/* sync mm's RSS info before statistics gathering */
-	if (tsk->mm)
-		sync_mm_rss(tsk->mm);
 	acct_update_integrals(tsk);
 	group_dead = atomic_dec_and_test(&tsk->signal->live);
 	if (group_dead) {
diff --git a/kernel/fork.c b/kernel/fork.c
index 3b6d20dfb9a8..1779183a7cb3 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -2406,10 +2406,6 @@ __latent_entropy struct task_struct *copy_process(
 	p->io_uring = NULL;
 #endif
 
-#if defined(SPLIT_RSS_COUNTING)
-	memset(&p->rss_stat, 0, sizeof(p->rss_stat));
-#endif
-
 	p->default_timer_slack_ns = current->timer_slack_ns;
 
 #ifdef CONFIG_PSI
diff --git a/kernel/kthread.c b/kernel/kthread.c
index 1eea53050bab..c46128ec0c0a 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -1469,7 +1469,6 @@ void kthread_unuse_mm(struct mm_struct *mm)
 	 * clearing tsk->mm.
 	 */
 	smp_mb__after_spinlock();
-	sync_mm_rss(mm);
 	local_irq_disable();
 	tsk->mm = NULL;
 	membarrier_update_current_mm(NULL);
diff --git a/mm/madvise.c b/mm/madvise.c
index 4dded5d27e7e..59e8860c86af 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -746,11 +746,8 @@ static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr,
 		folio_mark_lazyfree(folio);
 	}
 
-	if (nr_swap) {
-		if (current->mm == mm)
-			sync_mm_rss(mm);
+	if (nr_swap)
 		add_mm_counter(mm, MM_SWAPENTS, nr_swap);
-	}
 	if (start_pte) {
 		arch_leave_lazy_mmu_mode();
 		pte_unmap_unlock(start_pte, ptl);
diff --git a/mm/memory.c b/mm/memory.c
index 6c264d2f969c..0739ccb00e61 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -471,8 +471,6 @@ static inline void add_mm_rss_vec(struct mm_struct *mm, int *rss)
 {
 	int i;
 
-	if (current->mm == mm)
-		sync_mm_rss(mm);
 	for (i = 0; i < NR_MM_COUNTERS; i++)
 		if (rss[i])
 			add_mm_counter(mm, i, rss[i]);
-- 
cgit v1.2.3


From 91e79d22be75fec88ae58d274a7c9e49d6215099 Mon Sep 17 00:00:00 2001
From: "Matthew Wilcox (Oracle)" <willy@infradead.org>
Date: Wed, 23 Aug 2023 00:13:14 +0100
Subject: mm: convert DAX lock/unlock page to lock/unlock folio

The one caller of DAX lock/unlock page already calls compound_head(), so
use page_folio() instead, then use a folio throughout the DAX code to
remove uses of page->mapping and page->index.

[jane.chu@oracle.com: add comment to mf_generic_kill_procss(), simplify mf_generic_kill_procs:folio initialization]
  Link: https://lkml.kernel.org/r/20230908222336.186313-1-jane.chu@oracle.com
Link: https://lkml.kernel.org/r/20230822231314.349200-1-willy@infradead.org
Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Signed-off-by: Jane Chu <jane.chu@oracle.com>
Acked-by: Naoya Horiguchi <naoya.horiguchi@nec.com>
Cc: Dan Williams <dan.j.williams@intel.com>
Cc: Jane Chu <jane.chu@oracle.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 fs/dax.c            | 24 ++++++++++++------------
 include/linux/dax.h | 10 +++++-----
 mm/memory-failure.c | 29 ++++++++++++++++-------------
 3 files changed, 33 insertions(+), 30 deletions(-)

(limited to 'include/linux')

diff --git a/fs/dax.c b/fs/dax.c
index 8fafecbe42b1..3380b43cb6bb 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -412,23 +412,23 @@ static struct page *dax_busy_page(void *entry)
 	return NULL;
 }
 
-/*
- * dax_lock_page - Lock the DAX entry corresponding to a page
- * @page: The page whose entry we want to lock
+/**
+ * dax_lock_folio - Lock the DAX entry corresponding to a folio
+ * @folio: The folio whose entry we want to lock
  *
  * Context: Process context.
- * Return: A cookie to pass to dax_unlock_page() or 0 if the entry could
+ * Return: A cookie to pass to dax_unlock_folio() or 0 if the entry could
  * not be locked.
  */
-dax_entry_t dax_lock_page(struct page *page)
+dax_entry_t dax_lock_folio(struct folio *folio)
 {
 	XA_STATE(xas, NULL, 0);
 	void *entry;
 
-	/* Ensure page->mapping isn't freed while we look at it */
+	/* Ensure folio->mapping isn't freed while we look at it */
 	rcu_read_lock();
 	for (;;) {
-		struct address_space *mapping = READ_ONCE(page->mapping);
+		struct address_space *mapping = READ_ONCE(folio->mapping);
 
 		entry = NULL;
 		if (!mapping || !dax_mapping(mapping))
@@ -447,11 +447,11 @@ dax_entry_t dax_lock_page(struct page *page)
 
 		xas.xa = &mapping->i_pages;
 		xas_lock_irq(&xas);
-		if (mapping != page->mapping) {
+		if (mapping != folio->mapping) {
 			xas_unlock_irq(&xas);
 			continue;
 		}
-		xas_set(&xas, page->index);
+		xas_set(&xas, folio->index);
 		entry = xas_load(&xas);
 		if (dax_is_locked(entry)) {
 			rcu_read_unlock();
@@ -467,10 +467,10 @@ dax_entry_t dax_lock_page(struct page *page)
 	return (dax_entry_t)entry;
 }
 
-void dax_unlock_page(struct page *page, dax_entry_t cookie)
+void dax_unlock_folio(struct folio *folio, dax_entry_t cookie)
 {
-	struct address_space *mapping = page->mapping;
-	XA_STATE(xas, &mapping->i_pages, page->index);
+	struct address_space *mapping = folio->mapping;
+	XA_STATE(xas, &mapping->i_pages, folio->index);
 
 	if (S_ISCHR(mapping->host->i_mode))
 		return;
diff --git a/include/linux/dax.h b/include/linux/dax.h
index 22cd9902345d..b463502b16e1 100644
--- a/include/linux/dax.h
+++ b/include/linux/dax.h
@@ -159,8 +159,8 @@ int dax_writeback_mapping_range(struct address_space *mapping,
 
 struct page *dax_layout_busy_page(struct address_space *mapping);
 struct page *dax_layout_busy_page_range(struct address_space *mapping, loff_t start, loff_t end);
-dax_entry_t dax_lock_page(struct page *page);
-void dax_unlock_page(struct page *page, dax_entry_t cookie);
+dax_entry_t dax_lock_folio(struct folio *folio);
+void dax_unlock_folio(struct folio *folio, dax_entry_t cookie);
 dax_entry_t dax_lock_mapping_entry(struct address_space *mapping,
 		unsigned long index, struct page **page);
 void dax_unlock_mapping_entry(struct address_space *mapping,
@@ -182,14 +182,14 @@ static inline int dax_writeback_mapping_range(struct address_space *mapping,
 	return -EOPNOTSUPP;
 }
 
-static inline dax_entry_t dax_lock_page(struct page *page)
+static inline dax_entry_t dax_lock_folio(struct folio *folio)
 {
-	if (IS_DAX(page->mapping->host))
+	if (IS_DAX(folio->mapping->host))
 		return ~0UL;
 	return 0;
 }
 
-static inline void dax_unlock_page(struct page *page, dax_entry_t cookie)
+static inline void dax_unlock_folio(struct folio *folio, dax_entry_t cookie)
 {
 }
 
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index 4d6e43c88489..660c21859118 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -1713,20 +1713,23 @@ static void unmap_and_kill(struct list_head *to_kill, unsigned long pfn,
 	kill_procs(to_kill, flags & MF_MUST_KILL, false, pfn, flags);
 }
 
+/*
+ * Only dev_pagemap pages get here, such as fsdax when the filesystem
+ * either do not claim or fails to claim a hwpoison event, or devdax.
+ * The fsdax pages are initialized per base page, and the devdax pages
+ * could be initialized either as base pages, or as compound pages with
+ * vmemmap optimization enabled. Devdax is simplistic in its dealing with
+ * hwpoison, such that, if a subpage of a compound page is poisoned,
+ * simply mark the compound head page is by far sufficient.
+ */
 static int mf_generic_kill_procs(unsigned long long pfn, int flags,
 		struct dev_pagemap *pgmap)
 {
-	struct page *page = pfn_to_page(pfn);
+	struct folio *folio = pfn_folio(pfn);
 	LIST_HEAD(to_kill);
 	dax_entry_t cookie;
 	int rc = 0;
 
-	/*
-	 * Pages instantiated by device-dax (not filesystem-dax)
-	 * may be compound pages.
-	 */
-	page = compound_head(page);
-
 	/*
 	 * Prevent the inode from being freed while we are interrogating
 	 * the address_space, typically this would be handled by
@@ -1734,11 +1737,11 @@ static int mf_generic_kill_procs(unsigned long long pfn, int flags,
 	 * also prevents changes to the mapping of this pfn until
 	 * poison signaling is complete.
 	 */
-	cookie = dax_lock_page(page);
+	cookie = dax_lock_folio(folio);
 	if (!cookie)
 		return -EBUSY;
 
-	if (hwpoison_filter(page)) {
+	if (hwpoison_filter(&folio->page)) {
 		rc = -EOPNOTSUPP;
 		goto unlock;
 	}
@@ -1760,7 +1763,7 @@ static int mf_generic_kill_procs(unsigned long long pfn, int flags,
 	 * Use this flag as an indication that the dax page has been
 	 * remapped UC to prevent speculative consumption of poison.
 	 */
-	SetPageHWPoison(page);
+	SetPageHWPoison(&folio->page);
 
 	/*
 	 * Unlike System-RAM there is no possibility to swap in a
@@ -1769,11 +1772,11 @@ static int mf_generic_kill_procs(unsigned long long pfn, int flags,
 	 * SIGBUS (i.e. MF_MUST_KILL)
 	 */
 	flags |= MF_ACTION_REQUIRED | MF_MUST_KILL;
-	collect_procs(page, &to_kill, true);
+	collect_procs(&folio->page, &to_kill, true);
 
-	unmap_and_kill(&to_kill, pfn, page->mapping, page->index, flags);
+	unmap_and_kill(&to_kill, pfn, folio->mapping, folio->index, flags);
 unlock:
-	dax_unlock_page(page, cookie);
+	dax_unlock_folio(folio, cookie);
 	return rc;
 }
 
-- 
cgit v1.2.3


From b1e5a3dee255a11cbdd5a0e814829276bd33a793 Mon Sep 17 00:00:00 2001
From: "Joel Fernandes (Google)" <joel@joelfernandes.org>
Date: Sun, 3 Sep 2023 15:13:23 +0000
Subject: mm/mremap: allow moves within the same VMA for stack moves

For the stack move happening in shift_arg_pages(), the move is happening
within the same VMA which spans the old and new ranges.

In case the aligned address happens to fall within that VMA, allow such
moves and don't abort the mremap alignment optimization.

In the regular non-stack mremap case, we cannot allow any such moves as
will end up destroying some part of the mapping (either the source of the
move, or part of the existing mapping).  So just avoid it for stack moves.

Link: https://lkml.kernel.org/r/20230903151328.2981432-3-joel@joelfernandes.org
Signed-off-by: Joel Fernandes (Google) <joel@joelfernandes.org>
Reviewed-by: Lorenzo Stoakes <lstoakes@gmail.com>
Cc: Kalesh Singh <kaleshsingh@google.com>
Cc: "Kirill A. Shutemov" <kirill@shutemov.name>
Cc: Liam R. Howlett <Liam.Howlett@oracle.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Lokesh Gidra <lokeshgidra@google.com>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Paul E. McKenney <paulmck@kernel.org>
Cc: Shuah Khan <shuah@kernel.org>
Cc: Suren Baghdasaryan <surenb@google.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 fs/exec.c          |  2 +-
 include/linux/mm.h |  2 +-
 mm/mremap.c        | 33 +++++++++++++++++++--------------
 3 files changed, 21 insertions(+), 16 deletions(-)

(limited to 'include/linux')

diff --git a/fs/exec.c b/fs/exec.c
index eb039041ad6a..4aa19b24f281 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -713,7 +713,7 @@ static int shift_arg_pages(struct vm_area_struct *vma, unsigned long shift)
 	 * process cleanup to remove whatever mess we made.
 	 */
 	if (length != move_page_tables(vma, old_start,
-				       vma, new_start, length, false))
+				       vma, new_start, length, false, true))
 		return -ENOMEM;
 
 	lru_add_drain();
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 7613150acab9..21c3d4e8a282 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -2480,7 +2480,7 @@ int get_cmdline(struct task_struct *task, char *buffer, int buflen);
 extern unsigned long move_page_tables(struct vm_area_struct *vma,
 		unsigned long old_addr, struct vm_area_struct *new_vma,
 		unsigned long new_addr, unsigned long len,
-		bool need_rmap_locks);
+		bool need_rmap_locks, bool for_stack);
 
 /*
  * Flags used by change_protection().  For now we make it a bitmap so
diff --git a/mm/mremap.c b/mm/mremap.c
index e2b65a17148e..ce8a23ef325a 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -490,12 +490,13 @@ static bool move_pgt_entry(enum pgt_entry entry, struct vm_area_struct *vma,
 }
 
 /*
- * A helper to check if a previous mapping exists. Required for
- * move_page_tables() and realign_addr() to determine if a previous mapping
- * exists before we can do realignment optimizations.
+ * A helper to check if aligning down is OK. The aligned address should fall
+ * on *no mapping*. For the stack moving down, that's a special move within
+ * the VMA that is created to span the source and destination of the move,
+ * so we make an exception for it.
  */
 static bool can_align_down(struct vm_area_struct *vma, unsigned long addr_to_align,
-			       unsigned long mask)
+			    unsigned long mask, bool for_stack)
 {
 	unsigned long addr_masked = addr_to_align & mask;
 
@@ -504,9 +505,13 @@ static bool can_align_down(struct vm_area_struct *vma, unsigned long addr_to_ali
 	 * of the corresponding VMA, we can't align down or we will destroy part
 	 * of the current mapping.
 	 */
-	if (vma->vm_start != addr_to_align)
+	if (!for_stack && vma->vm_start != addr_to_align)
 		return false;
 
+	/* In the stack case we explicitly permit in-VMA alignment. */
+	if (for_stack && addr_masked >= vma->vm_start)
+		return true;
+
 	/*
 	 * Make sure the realignment doesn't cause the address to fall on an
 	 * existing mapping.
@@ -517,7 +522,7 @@ static bool can_align_down(struct vm_area_struct *vma, unsigned long addr_to_ali
 /* Opportunistically realign to specified boundary for faster copy. */
 static void try_realign_addr(unsigned long *old_addr, struct vm_area_struct *old_vma,
 			     unsigned long *new_addr, struct vm_area_struct *new_vma,
-			     unsigned long mask)
+			     unsigned long mask, bool for_stack)
 {
 	/* Skip if the addresses are already aligned. */
 	if ((*old_addr & ~mask) == 0)
@@ -528,8 +533,8 @@ static void try_realign_addr(unsigned long *old_addr, struct vm_area_struct *old
 		return;
 
 	/* Ensure realignment doesn't cause overlap with existing mappings. */
-	if (!can_align_down(old_vma, *old_addr, mask) ||
-	    !can_align_down(new_vma, *new_addr, mask))
+	if (!can_align_down(old_vma, *old_addr, mask, for_stack) ||
+	    !can_align_down(new_vma, *new_addr, mask, for_stack))
 		return;
 
 	*old_addr = *old_addr & mask;
@@ -539,7 +544,7 @@ static void try_realign_addr(unsigned long *old_addr, struct vm_area_struct *old
 unsigned long move_page_tables(struct vm_area_struct *vma,
 		unsigned long old_addr, struct vm_area_struct *new_vma,
 		unsigned long new_addr, unsigned long len,
-		bool need_rmap_locks)
+		bool need_rmap_locks, bool for_stack)
 {
 	unsigned long extent, old_end;
 	struct mmu_notifier_range range;
@@ -559,9 +564,9 @@ unsigned long move_page_tables(struct vm_area_struct *vma,
 	 * If possible, realign addresses to PMD boundary for faster copy.
 	 * Only realign if the mremap copying hits a PMD boundary.
 	 */
-	if ((vma != new_vma)
-		&& (len >= PMD_SIZE - (old_addr & ~PMD_MASK)))
-		try_realign_addr(&old_addr, vma, &new_addr, new_vma, PMD_MASK);
+	if (len >= PMD_SIZE - (old_addr & ~PMD_MASK))
+		try_realign_addr(&old_addr, vma, &new_addr, new_vma, PMD_MASK,
+				 for_stack);
 
 	flush_cache_range(vma, old_addr, old_end);
 	mmu_notifier_range_init(&range, MMU_NOTIFY_UNMAP, 0, vma->vm_mm,
@@ -708,7 +713,7 @@ static unsigned long move_vma(struct vm_area_struct *vma,
 	}
 
 	moved_len = move_page_tables(vma, old_addr, new_vma, new_addr, old_len,
-				     need_rmap_locks);
+				     need_rmap_locks, false);
 	if (moved_len < old_len) {
 		err = -ENOMEM;
 	} else if (vma->vm_ops && vma->vm_ops->mremap) {
@@ -722,7 +727,7 @@ static unsigned long move_vma(struct vm_area_struct *vma,
 		 * and then proceed to unmap new area instead of old.
 		 */
 		move_page_tables(new_vma, new_addr, vma, old_addr, moved_len,
-				 true);
+				 true, false);
 		vma = new_vma;
 		old_len = new_len;
 		old_addr = new_addr;
-- 
cgit v1.2.3


From d896073fc767ebb40c11a6a9de71c390757ac64b Mon Sep 17 00:00:00 2001
From: SeongJae Park <sj@kernel.org>
Date: Thu, 7 Sep 2023 02:29:27 +0000
Subject: mm/damon/core: add more comments for nr_accesses

The comment on struct damon_region about nr_accesses field looks not
sufficient.  Many people actually used to ask what nr_accesses mean.
There is more detailed explanation of the mechanism on the comment for
struct damon_attrs, but it is also ambiguous, as it doesn't specify the
name of the counter for aggregating the access check results.  Make
those more detailed.

Link: https://lkml.kernel.org/r/20230907022929.91361-10-sj@kernel.org
Signed-off-by: SeongJae Park <sj@kernel.org>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Steven Rostedt (Google) <rostedt@goodmis.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/damon.h | 19 ++++++++++++-------
 1 file changed, 12 insertions(+), 7 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/damon.h b/include/linux/damon.h
index ae2664d1d5f1..266f92b34dd2 100644
--- a/include/linux/damon.h
+++ b/include/linux/damon.h
@@ -43,6 +43,10 @@ struct damon_addr_range {
  * @list:		List head for siblings.
  * @age:		Age of this region.
  *
+ * @nr_accesses is reset to zero for every &damon_attrs->aggr_interval and be
+ * increased for every &damon_attrs->sample_interval if an access to the region
+ * during the last sampling interval is found.
+ *
  * @age is initially zero, increased for each aggregation interval, and reset
  * to zero again if the access frequency is significantly changed.  If two
  * regions are merged into a new region, both @nr_accesses and @age of the new
@@ -472,13 +476,14 @@ struct damon_callback {
  *				regions.
  *
  * For each @sample_interval, DAMON checks whether each region is accessed or
- * not.  It aggregates and keeps the access information (number of accesses to
- * each region) for @aggr_interval time.  DAMON also checks whether the target
- * memory regions need update (e.g., by ``mmap()`` calls from the application,
- * in case of virtual memory monitoring) and applies the changes for each
- * @ops_update_interval.  All time intervals are in micro-seconds.
- * Please refer to &struct damon_operations and &struct damon_callback for more
- * detail.
+ * not during the last @sample_interval.  If such access is found, DAMON
+ * aggregates the information by increasing &damon_region->nr_accesses for
+ * @aggr_interval time.  For each @aggr_interval, the count is reset.  DAMON
+ * also checks whether the target memory regions need update (e.g., by
+ * ``mmap()`` calls from the application, in case of virtual memory monitoring)
+ * and applies the changes for each @ops_update_interval.  All time intervals
+ * are in micro-seconds.  Please refer to &struct damon_operations and &struct
+ * damon_callback for more detail.
  */
 struct damon_attrs {
 	unsigned long sample_interval;
-- 
cgit v1.2.3


From cf0a96bd3ab4d9d8a1c92baf1a822f2ddbca3a34 Mon Sep 17 00:00:00 2001
From: SeongJae Park <sj@kernel.org>
Date: Thu, 7 Sep 2023 02:29:28 +0000
Subject: mm/damon/core: remove duplicated comment for watermarks-based
 deactivation

The comment for explaining about watermarks-based monitoring part
deactivation is duplicated in two paragraphs.  Remove one.

Link: https://lkml.kernel.org/r/20230907022929.91361-11-sj@kernel.org
Signed-off-by: SeongJae Park <sj@kernel.org>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Steven Rostedt (Google) <rostedt@goodmis.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/damon.h | 3 ---
 1 file changed, 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/damon.h b/include/linux/damon.h
index 266f92b34dd2..ab3089de1478 100644
--- a/include/linux/damon.h
+++ b/include/linux/damon.h
@@ -317,9 +317,6 @@ struct damos_access_pattern {
  * monitoring context are inactive, DAMON stops monitoring either, and just
  * repeatedly checks the watermarks.
  *
- * If all schemes that registered to a &struct damon_ctx are inactive, DAMON
- * stops monitoring and just repeatedly checks the watermarks.
- *
  * Before applying the &action to a memory region, &struct damon_operations
  * implementation could check pages of the region and skip &action to respect
  * &filters
-- 
cgit v1.2.3


From 7cd34dd3c9bf1d67ce7d1ab3fe8886c583ae0d9a Mon Sep 17 00:00:00 2001
From: Adrian Hunter <adrian.hunter@intel.com>
Date: Mon, 11 Sep 2023 14:21:13 +0300
Subject: efi/unaccepted: do not let /proc/vmcore try to access unaccepted
 memory

Patch series "Do not try to access unaccepted memory", v2.

Support for unaccepted memory was added recently, refer commit
dcdfdd40fa82 ("mm: Add support for unaccepted memory"), whereby
a virtual machine may need to accept memory before it can be used.

Plug a few gaps where RAM is exposed without checking if it is
unaccepted memory.


This patch (of 2):

Support for unaccepted memory was added recently, refer commit
dcdfdd40fa82 ("mm: Add support for unaccepted memory"), whereby a virtual
machine may need to accept memory before it can be used.

Do not let /proc/vmcore try to access unaccepted memory because it can
cause the guest to fail.

For /proc/vmcore, which is read-only, this means a read or mmap of
unaccepted memory will return zeros.

Link: https://lkml.kernel.org/r/20230911112114.91323-1-adrian.hunter@intel.com
Link: https://lkml.kernel.org/r/20230911112114.91323-2-adrian.hunter@intel.com
Signed-off-by: Adrian Hunter <adrian.hunter@intel.com>
Cc: Ard Biesheuvel <ardb@kernel.org>
Cc: Baoquan He <bhe@redhat.com>
Cc: Borislav Petkov (AMD) <bp@alien8.de>
Cc: Dave Hansen <dave.hansen@intel.com>
Cc: Dave Young <dyoung@redhat.com>
Cc: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Cc: Lorenzo Stoakes <lstoakes@gmail.com>
Cc: Mike Rapoport <rppt@linux.ibm.com>
Cc: Tom Lendacky <thomas.lendacky@amd.com>
Cc: Vivek Goyal <vgoyal@redhat.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 drivers/firmware/efi/unaccepted_memory.c | 20 ++++++++++++++++++++
 include/linux/mm.h                       |  7 +++++++
 2 files changed, 27 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/firmware/efi/unaccepted_memory.c b/drivers/firmware/efi/unaccepted_memory.c
index 853f7dc3c21d..79ba576b22e3 100644
--- a/drivers/firmware/efi/unaccepted_memory.c
+++ b/drivers/firmware/efi/unaccepted_memory.c
@@ -3,6 +3,7 @@
 #include <linux/efi.h>
 #include <linux/memblock.h>
 #include <linux/spinlock.h>
+#include <linux/crash_dump.h>
 #include <asm/unaccepted_memory.h>
 
 /* Protects unaccepted memory bitmap */
@@ -145,3 +146,22 @@ bool range_contains_unaccepted_memory(phys_addr_t start, phys_addr_t end)
 
 	return ret;
 }
+
+#ifdef CONFIG_PROC_VMCORE
+static bool unaccepted_memory_vmcore_pfn_is_ram(struct vmcore_cb *cb,
+						unsigned long pfn)
+{
+	return !pfn_is_unaccepted_memory(pfn);
+}
+
+static struct vmcore_cb vmcore_cb = {
+	.pfn_is_ram = unaccepted_memory_vmcore_pfn_is_ram,
+};
+
+static int __init unaccepted_memory_init_kdump(void)
+{
+	register_vmcore_cb(&vmcore_cb);
+	return 0;
+}
+core_initcall(unaccepted_memory_init_kdump);
+#endif /* CONFIG_PROC_VMCORE */
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 21c3d4e8a282..31dc25d3f6b5 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -4054,4 +4054,11 @@ static inline void accept_memory(phys_addr_t start, phys_addr_t end)
 
 #endif
 
+static inline bool pfn_is_unaccepted_memory(unsigned long pfn)
+{
+	phys_addr_t paddr = pfn << PAGE_SHIFT;
+
+	return range_contains_unaccepted_memory(paddr, paddr + PAGE_SIZE);
+}
+
 #endif /* _LINUX_MM_H */
-- 
cgit v1.2.3


From 3ee0aa9f06756e959633ffda37856c6741d948ed Mon Sep 17 00:00:00 2001
From: Qi Zheng <zhengqi.arch@bytedance.com>
Date: Mon, 11 Sep 2023 17:25:14 +0800
Subject: mm: move some shrinker-related function declarations to mm/internal.h
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Patch series "cleanups for lockless slab shrink", v4.

This series is some cleanups for lockless slab shrink.


This patch (of 4):

The following functions are only used inside the mm subsystem, so it's
better to move their declarations to the mm/internal.h file.

1. shrinker_debugfs_add()
2. shrinker_debugfs_detach()
3. shrinker_debugfs_remove()

Link: https://lkml.kernel.org/r/20230911092517.64141-1-zhengqi.arch@bytedance.com
Link: https://lkml.kernel.org/r/20230911092517.64141-2-zhengqi.arch@bytedance.com
Signed-off-by: Qi Zheng <zhengqi.arch@bytedance.com>
Reviewed-by: Muchun Song <songmuchun@bytedance.com>
Cc: Christian Brauner <brauner@kernel.org>
Cc: Christian König <christian.koenig@amd.com>
Cc: Chuck Lever <cel@kernel.org>
Cc: Daniel Vetter <daniel@ffwll.ch>
Cc: Darrick J. Wong <djwong@kernel.org>
Cc: Dave Chinner <david@fromorbit.com>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: Joel Fernandes <joel@joelfernandes.org>
Cc: Kirill Tkhai <tkhai@ya.ru>
Cc: Paul E. McKenney <paulmck@kernel.org>
Cc: Roman Gushchin <roman.gushchin@linux.dev>
Cc: Sergey Senozhatsky <senozhatsky@chromium.org>
Cc: Steven Price <steven.price@arm.com>
Cc: Theodore Ts'o <tytso@mit.edu>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Daniel Vetter <daniel.vetter@ffwll.ch>
Cc: Abhinav Kumar <quic_abhinavk@quicinc.com>
Cc: Alasdair Kergon <agk@redhat.com>
Cc: Alexander Viro <viro@zeniv.linux.org.uk>
Cc: Alyssa Rosenzweig <alyssa.rosenzweig@collabora.com>
Cc: Andreas Dilger <adilger.kernel@dilger.ca>
Cc: Andreas Gruenbacher <agruenba@redhat.com>
Cc: Anna Schumaker <anna@kernel.org>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Bob Peterson <rpeterso@redhat.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Carlos Llamas <cmllamas@google.com>
Cc: Chandan Babu R <chandan.babu@oracle.com>
Cc: Chao Yu <chao@kernel.org>
Cc: Chris Mason <clm@fb.com>
Cc: Coly Li <colyli@suse.de>
Cc: Dai Ngo <Dai.Ngo@oracle.com>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: David Airlie <airlied@gmail.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: David Sterba <dsterba@suse.com>
Cc: Dmitry Baryshkov <dmitry.baryshkov@linaro.org>
Cc: Gao Xiang <hsiangkao@linux.alibaba.com>
Cc: Huang Rui <ray.huang@amd.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Jaegeuk Kim <jaegeuk@kernel.org>
Cc: Jani Nikula <jani.nikula@linux.intel.com>
Cc: Jan Kara <jack@suse.cz>
Cc: Jason Wang <jasowang@redhat.com>
Cc: Jeff Layton <jlayton@kernel.org>
Cc: Jeffle Xu <jefflexu@linux.alibaba.com>
Cc: Joonas Lahtinen <joonas.lahtinen@linux.intel.com>
Cc: Josef Bacik <josef@toxicpanda.com>
Cc: Juergen Gross <jgross@suse.com>
Cc: Kent Overstreet <kent.overstreet@gmail.com>
Cc: Marijn Suijten <marijn.suijten@somainline.org>
Cc: "Michael S. Tsirkin" <mst@redhat.com>
Cc: Mike Snitzer <snitzer@kernel.org>
Cc: Minchan Kim <minchan@kernel.org>
Cc: Muchun Song <muchun.song@linux.dev>
Cc: Nadav Amit <namit@vmware.com>
Cc: Neil Brown <neilb@suse.de>
Cc: Oleksandr Tyshchenko <oleksandr_tyshchenko@epam.com>
Cc: Olga Kornievskaia <kolga@netapp.com>
Cc: Richard Weinberger <richard@nod.at>
Cc: Rob Clark <robdclark@gmail.com>
Cc: Rob Herring <robh@kernel.org>
Cc: Rodrigo Vivi <rodrigo.vivi@intel.com>
Cc: Sean Paul <sean@poorly.run>
Cc: Song Liu <song@kernel.org>
Cc: Stefano Stabellini <sstabellini@kernel.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Tomeu Vizoso <tomeu.vizoso@collabora.com>
Cc: Tom Talpey <tom@talpey.com>
Cc: Trond Myklebust <trond.myklebust@hammerspace.com>
Cc: Tvrtko Ursulin <tvrtko.ursulin@linux.intel.com>
Cc: Xuan Zhuo <xuanzhuo@linux.alibaba.com>
Cc: Yue Hu <huyue2@coolpad.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/shrinker.h | 19 -------------------
 mm/internal.h            | 26 ++++++++++++++++++++++++++
 mm/shrinker_debug.c      |  2 ++
 3 files changed, 28 insertions(+), 19 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/shrinker.h b/include/linux/shrinker.h
index 224293b2dd06..8dc15aa37410 100644
--- a/include/linux/shrinker.h
+++ b/include/linux/shrinker.h
@@ -106,28 +106,9 @@ extern void free_prealloced_shrinker(struct shrinker *shrinker);
 extern void synchronize_shrinkers(void);
 
 #ifdef CONFIG_SHRINKER_DEBUG
-extern int shrinker_debugfs_add(struct shrinker *shrinker);
-extern struct dentry *shrinker_debugfs_detach(struct shrinker *shrinker,
-					      int *debugfs_id);
-extern void shrinker_debugfs_remove(struct dentry *debugfs_entry,
-				    int debugfs_id);
 extern int __printf(2, 3) shrinker_debugfs_rename(struct shrinker *shrinker,
 						  const char *fmt, ...);
 #else /* CONFIG_SHRINKER_DEBUG */
-static inline int shrinker_debugfs_add(struct shrinker *shrinker)
-{
-	return 0;
-}
-static inline struct dentry *shrinker_debugfs_detach(struct shrinker *shrinker,
-						     int *debugfs_id)
-{
-	*debugfs_id = -1;
-	return NULL;
-}
-static inline void shrinker_debugfs_remove(struct dentry *debugfs_entry,
-					   int debugfs_id)
-{
-}
 static inline __printf(2, 3)
 int shrinker_debugfs_rename(struct shrinker *shrinker, const char *fmt, ...)
 {
diff --git a/mm/internal.h b/mm/internal.h
index 30cf724ddbce..939d1227a527 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -1154,4 +1154,30 @@ struct vma_prepare {
 	struct vm_area_struct *remove;
 	struct vm_area_struct *remove2;
 };
+
+/* shrinker related functions */
+
+#ifdef CONFIG_SHRINKER_DEBUG
+extern int shrinker_debugfs_add(struct shrinker *shrinker);
+extern struct dentry *shrinker_debugfs_detach(struct shrinker *shrinker,
+					      int *debugfs_id);
+extern void shrinker_debugfs_remove(struct dentry *debugfs_entry,
+				    int debugfs_id);
+#else /* CONFIG_SHRINKER_DEBUG */
+static inline int shrinker_debugfs_add(struct shrinker *shrinker)
+{
+	return 0;
+}
+static inline struct dentry *shrinker_debugfs_detach(struct shrinker *shrinker,
+						     int *debugfs_id)
+{
+	*debugfs_id = -1;
+	return NULL;
+}
+static inline void shrinker_debugfs_remove(struct dentry *debugfs_entry,
+					   int debugfs_id)
+{
+}
+#endif /* CONFIG_SHRINKER_DEBUG */
+
 #endif	/* __MM_INTERNAL_H */
diff --git a/mm/shrinker_debug.c b/mm/shrinker_debug.c
index 3ab53fad8876..ee0cddb4530f 100644
--- a/mm/shrinker_debug.c
+++ b/mm/shrinker_debug.c
@@ -6,6 +6,8 @@
 #include <linux/shrinker.h>
 #include <linux/memcontrol.h>
 
+#include "internal.h"
+
 /* defined in vmscan.c */
 extern struct rw_semaphore shrinker_rwsem;
 extern struct list_head shrinker_list;
-- 
cgit v1.2.3


From 0b2f5ea1aa39c0ed34bdadb53faf519e3d84ac4a Mon Sep 17 00:00:00 2001
From: Qi Zheng <zhengqi.arch@bytedance.com>
Date: Mon, 11 Sep 2023 17:25:17 +0800
Subject: drm/ttm: introduce pool_shrink_rwsem
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Currently, synchronize_shrinkers() is only used by TTM pool.  It only
requires that no shrinkers run in parallel.

After we use RCU+refcount method to implement the lockless slab shrink, we
can not use shrinker_rwsem or synchronize_rcu() to guarantee that all
shrinker invocations have seen an update before freeing memory.

So we introduce a new pool_shrink_rwsem to implement a private
ttm_pool_synchronize_shrinkers(), so as to achieve the same purpose.

Link: https://lkml.kernel.org/r/20230911092517.64141-5-zhengqi.arch@bytedance.com
Signed-off-by: Qi Zheng <zhengqi.arch@bytedance.com>
Reviewed-by: Muchun Song <songmuchun@bytedance.com>
Reviewed-by: Christian König <christian.koenig@amd.com>
Acked-by: Daniel Vetter <daniel.vetter@ffwll.ch>
Cc: Christian Brauner <brauner@kernel.org>
Cc: Chuck Lever <cel@kernel.org>
Cc: Daniel Vetter <daniel@ffwll.ch>
Cc: Darrick J. Wong <djwong@kernel.org>
Cc: Dave Chinner <david@fromorbit.com>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: Joel Fernandes <joel@joelfernandes.org>
Cc: Kirill Tkhai <tkhai@ya.ru>
Cc: Paul E. McKenney <paulmck@kernel.org>
Cc: Roman Gushchin <roman.gushchin@linux.dev>
Cc: Sergey Senozhatsky <senozhatsky@chromium.org>
Cc: Steven Price <steven.price@arm.com>
Cc: Theodore Ts'o <tytso@mit.edu>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Abhinav Kumar <quic_abhinavk@quicinc.com>
Cc: Alasdair Kergon <agk@redhat.com>
Cc: Alexander Viro <viro@zeniv.linux.org.uk>
Cc: Alyssa Rosenzweig <alyssa.rosenzweig@collabora.com>
Cc: Andreas Dilger <adilger.kernel@dilger.ca>
Cc: Andreas Gruenbacher <agruenba@redhat.com>
Cc: Anna Schumaker <anna@kernel.org>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Bob Peterson <rpeterso@redhat.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Carlos Llamas <cmllamas@google.com>
Cc: Chandan Babu R <chandan.babu@oracle.com>
Cc: Chao Yu <chao@kernel.org>
Cc: Chris Mason <clm@fb.com>
Cc: Coly Li <colyli@suse.de>
Cc: Dai Ngo <Dai.Ngo@oracle.com>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: David Airlie <airlied@gmail.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: David Sterba <dsterba@suse.com>
Cc: Dmitry Baryshkov <dmitry.baryshkov@linaro.org>
Cc: Gao Xiang <hsiangkao@linux.alibaba.com>
Cc: Huang Rui <ray.huang@amd.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Jaegeuk Kim <jaegeuk@kernel.org>
Cc: Jani Nikula <jani.nikula@linux.intel.com>
Cc: Jan Kara <jack@suse.cz>
Cc: Jason Wang <jasowang@redhat.com>
Cc: Jeff Layton <jlayton@kernel.org>
Cc: Jeffle Xu <jefflexu@linux.alibaba.com>
Cc: Joonas Lahtinen <joonas.lahtinen@linux.intel.com>
Cc: Josef Bacik <josef@toxicpanda.com>
Cc: Juergen Gross <jgross@suse.com>
Cc: Kent Overstreet <kent.overstreet@gmail.com>
Cc: Marijn Suijten <marijn.suijten@somainline.org>
Cc: "Michael S. Tsirkin" <mst@redhat.com>
Cc: Mike Snitzer <snitzer@kernel.org>
Cc: Minchan Kim <minchan@kernel.org>
Cc: Muchun Song <muchun.song@linux.dev>
Cc: Nadav Amit <namit@vmware.com>
Cc: Neil Brown <neilb@suse.de>
Cc: Oleksandr Tyshchenko <oleksandr_tyshchenko@epam.com>
Cc: Olga Kornievskaia <kolga@netapp.com>
Cc: Richard Weinberger <richard@nod.at>
Cc: Rob Clark <robdclark@gmail.com>
Cc: Rob Herring <robh@kernel.org>
Cc: Rodrigo Vivi <rodrigo.vivi@intel.com>
Cc: Sean Paul <sean@poorly.run>
Cc: Song Liu <song@kernel.org>
Cc: Stefano Stabellini <sstabellini@kernel.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Tomeu Vizoso <tomeu.vizoso@collabora.com>
Cc: Tom Talpey <tom@talpey.com>
Cc: Trond Myklebust <trond.myklebust@hammerspace.com>
Cc: Tvrtko Ursulin <tvrtko.ursulin@linux.intel.com>
Cc: Xuan Zhuo <xuanzhuo@linux.alibaba.com>
Cc: Yue Hu <huyue2@coolpad.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 drivers/gpu/drm/ttm/ttm_pool.c | 17 ++++++++++++++++-
 include/linux/shrinker.h       |  1 -
 mm/shrinker.c                  | 15 ---------------
 3 files changed, 16 insertions(+), 17 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/gpu/drm/ttm/ttm_pool.c b/drivers/gpu/drm/ttm/ttm_pool.c
index cddb9151d20f..648ca70403a7 100644
--- a/drivers/gpu/drm/ttm/ttm_pool.c
+++ b/drivers/gpu/drm/ttm/ttm_pool.c
@@ -74,6 +74,7 @@ static struct ttm_pool_type global_dma32_uncached[MAX_ORDER + 1];
 static spinlock_t shrinker_lock;
 static struct list_head shrinker_list;
 static struct shrinker mm_shrinker;
+static DECLARE_RWSEM(pool_shrink_rwsem);
 
 /* Allocate pages of size 1 << order with the given gfp_flags */
 static struct page *ttm_pool_alloc_page(struct ttm_pool *pool, gfp_t gfp_flags,
@@ -317,6 +318,7 @@ static unsigned int ttm_pool_shrink(void)
 	unsigned int num_pages;
 	struct page *p;
 
+	down_read(&pool_shrink_rwsem);
 	spin_lock(&shrinker_lock);
 	pt = list_first_entry(&shrinker_list, typeof(*pt), shrinker_list);
 	list_move_tail(&pt->shrinker_list, &shrinker_list);
@@ -329,6 +331,7 @@ static unsigned int ttm_pool_shrink(void)
 	} else {
 		num_pages = 0;
 	}
+	up_read(&pool_shrink_rwsem);
 
 	return num_pages;
 }
@@ -572,6 +575,18 @@ void ttm_pool_init(struct ttm_pool *pool, struct device *dev,
 }
 EXPORT_SYMBOL(ttm_pool_init);
 
+/**
+ * ttm_pool_synchronize_shrinkers - Wait for all running shrinkers to complete.
+ *
+ * This is useful to guarantee that all shrinker invocations have seen an
+ * update, before freeing memory, similar to rcu.
+ */
+static void ttm_pool_synchronize_shrinkers(void)
+{
+	down_write(&pool_shrink_rwsem);
+	up_write(&pool_shrink_rwsem);
+}
+
 /**
  * ttm_pool_fini - Cleanup a pool
  *
@@ -593,7 +608,7 @@ void ttm_pool_fini(struct ttm_pool *pool)
 	/* We removed the pool types from the LRU, but we need to also make sure
 	 * that no shrinker is concurrently freeing pages from the pool.
 	 */
-	synchronize_shrinkers();
+	ttm_pool_synchronize_shrinkers();
 }
 EXPORT_SYMBOL(ttm_pool_fini);
 
diff --git a/include/linux/shrinker.h b/include/linux/shrinker.h
index 8dc15aa37410..6b5843c3b827 100644
--- a/include/linux/shrinker.h
+++ b/include/linux/shrinker.h
@@ -103,7 +103,6 @@ extern int __printf(2, 3) register_shrinker(struct shrinker *shrinker,
 					    const char *fmt, ...);
 extern void unregister_shrinker(struct shrinker *shrinker);
 extern void free_prealloced_shrinker(struct shrinker *shrinker);
-extern void synchronize_shrinkers(void);
 
 #ifdef CONFIG_SHRINKER_DEBUG
 extern int __printf(2, 3) shrinker_debugfs_rename(struct shrinker *shrinker,
diff --git a/mm/shrinker.c b/mm/shrinker.c
index 043c87ccfab4..a16cd448b924 100644
--- a/mm/shrinker.c
+++ b/mm/shrinker.c
@@ -692,18 +692,3 @@ void unregister_shrinker(struct shrinker *shrinker)
 	shrinker->nr_deferred = NULL;
 }
 EXPORT_SYMBOL(unregister_shrinker);
-
-/**
- * synchronize_shrinkers - Wait for all running shrinkers to complete.
- *
- * This is equivalent to calling unregister_shrink() and register_shrinker(),
- * but atomically and with less overhead. This is useful to guarantee that all
- * shrinker invocations have seen an update, before freeing memory, similar to
- * rcu.
- */
-void synchronize_shrinkers(void)
-{
-	down_write(&shrinker_rwsem);
-	up_write(&shrinker_rwsem);
-}
-EXPORT_SYMBOL(synchronize_shrinkers);
-- 
cgit v1.2.3


From c42d50aefd17a6bad3ed617769edbbb579137545 Mon Sep 17 00:00:00 2001
From: Qi Zheng <zhengqi.arch@bytedance.com>
Date: Mon, 11 Sep 2023 17:44:00 +0800
Subject: mm: shrinker: add infrastructure for dynamically allocating shrinker

Patch series "use refcount+RCU method to implement lockless slab shrink",
v6.

1. Background
=============

We used to implement the lockless slab shrink with SRCU [1], but then kernel
test robot reported -88.8% regression in stress-ng.ramfs.ops_per_sec test
case [2], so we reverted it [3].

This patch series aims to re-implement the lockless slab shrink using the
refcount+RCU method proposed by Dave Chinner [4].

[1]. https://lore.kernel.org/lkml/20230313112819.38938-1-zhengqi.arch@bytedance.com/
[2]. https://lore.kernel.org/lkml/202305230837.db2c233f-yujie.liu@intel.com/
[3]. https://lore.kernel.org/all/20230609081518.3039120-1-qi.zheng@linux.dev/
[4]. https://lore.kernel.org/lkml/ZIJhou1d55d4H1s0@dread.disaster.area/

2. Implementation
=================

Currently, the shrinker instances can be divided into the following three types:

a) global shrinker instance statically defined in the kernel, such as
   workingset_shadow_shrinker.

b) global shrinker instance statically defined in the kernel modules, such as
   mmu_shrinker in x86.

c) shrinker instance embedded in other structures.

For case a, the memory of shrinker instance is never freed. For case b, the
memory of shrinker instance will be freed after synchronize_rcu() when the
module is unloaded. For case c, the memory of shrinker instance will be freed
along with the structure it is embedded in.

In preparation for implementing lockless slab shrink, we need to dynamically
allocate those shrinker instances in case c, then the memory can be dynamically
freed alone by calling kfree_rcu().

This patchset adds the following new APIs for dynamically allocating shrinker,
and add a private_data field to struct shrinker to record and get the original
embedded structure.

1. shrinker_alloc()
2. shrinker_register()
3. shrinker_free()

In order to simplify shrinker-related APIs and make shrinker more independent of
other kernel mechanisms, this patchset uses the above APIs to convert all
shrinkers (including case a and b) to dynamically allocated, and then remove all
existing APIs. This will also have another advantage mentioned by Dave Chinner:

```
The other advantage of this is that it will break all the existing out of tree
code and third party modules using the old API and will no longer work with a
kernel using lockless slab shrinkers. They need to break (both at the source and
binary levels) to stop bad things from happening due to using uncoverted
shrinkers in the new setup.
```

Then we free the shrinker by calling call_rcu(), and use rcu_read_{lock,unlock}()
to ensure that the shrinker instance is valid. And the shrinker::refcount
mechanism ensures that the shrinker instance will not be run again after
unregistration. So the structure that records the pointer of shrinker instance
can be safely freed without waiting for the RCU read-side critical section.

In this way, while we implement the lockless slab shrink, we don't need to be
blocked in unregister_shrinker() to wait RCU read-side critical section.

PATCH 1: introduce new APIs
PATCH 2~38: convert all shrinnkers to use new APIs
PATCH 39: remove old APIs
PATCH 40~41: some cleanups and preparations
PATCH 42-43: implement the lockless slab shrink
PATCH 44~45: convert shrinker_rwsem to mutex

3. Testing
==========

3.1 slab shrink stress test
---------------------------

We can reproduce the down_read_trylock() hotspot through the following script:

```

DIR="/root/shrinker/memcg/mnt"

do_create()
{
    mkdir -p /sys/fs/cgroup/memory/test
    echo 4G > /sys/fs/cgroup/memory/test/memory.limit_in_bytes
    for i in `seq 0 $1`;
    do
        mkdir -p /sys/fs/cgroup/memory/test/$i;
        echo $$ > /sys/fs/cgroup/memory/test/$i/cgroup.procs;
        mkdir -p $DIR/$i;
    done
}

do_mount()
{
    for i in `seq $1 $2`;
    do
        mount -t tmpfs $i $DIR/$i;
    done
}

do_touch()
{
    for i in `seq $1 $2`;
    do
        echo $$ > /sys/fs/cgroup/memory/test/$i/cgroup.procs;
        dd if=/dev/zero of=$DIR/$i/file$i bs=1M count=1 &
    done
}

case "$1" in
  touch)
    do_touch $2 $3
    ;;
  test)
    do_create 4000
    do_mount 0 4000
    do_touch 0 3000
    ;;
  *)
    exit 1
    ;;
esac
```

Save the above script, then run test and touch commands. Then we can use the
following perf command to view hotspots:

perf top -U -F 999

1) Before applying this patchset:

  33.15%  [kernel]          [k] down_read_trylock
  25.38%  [kernel]          [k] shrink_slab
  21.75%  [kernel]          [k] up_read
   4.45%  [kernel]          [k] _find_next_bit
   2.27%  [kernel]          [k] do_shrink_slab
   1.80%  [kernel]          [k] intel_idle_irq
   1.79%  [kernel]          [k] shrink_lruvec
   0.67%  [kernel]          [k] xas_descend
   0.41%  [kernel]          [k] mem_cgroup_iter
   0.40%  [kernel]          [k] shrink_node
   0.38%  [kernel]          [k] list_lru_count_one

2) After applying this patchset:

  64.56%  [kernel]          [k] shrink_slab
  12.18%  [kernel]          [k] do_shrink_slab
   3.30%  [kernel]          [k] __rcu_read_unlock
   2.61%  [kernel]          [k] shrink_lruvec
   2.49%  [kernel]          [k] __rcu_read_lock
   1.93%  [kernel]          [k] intel_idle_irq
   0.89%  [kernel]          [k] shrink_node
   0.81%  [kernel]          [k] mem_cgroup_iter
   0.77%  [kernel]          [k] mem_cgroup_calculate_protection
   0.66%  [kernel]          [k] list_lru_count_one

We can see that the first perf hotspot becomes shrink_slab, which is what we
expect.

3.2 registration and unregistration stress test
-----------------------------------------------

Run the command below to test:

stress-ng --timeout 60 --times --verify --metrics-brief --ramfs 9 &

1) Before applying this patchset:

setting to a 60 second run per stressor
dispatching hogs: 9 ramfs
stressor       bogo ops real time  usr time  sys time   bogo ops/s     bogo ops/s
                          (secs)    (secs)    (secs)   (real time) (usr+sys time)
ramfs            473062     60.00      8.00    279.13      7884.12        1647.59
for a 60.01s run time:
   1440.34s available CPU time
      7.99s user time   (  0.55%)
    279.13s system time ( 19.38%)
    287.12s total time  ( 19.93%)
load average: 7.12 2.99 1.15
successful run completed in 60.01s (1 min, 0.01 secs)

2) After applying this patchset:

setting to a 60 second run per stressor
dispatching hogs: 9 ramfs
stressor       bogo ops real time  usr time  sys time   bogo ops/s     bogo ops/s
                          (secs)    (secs)    (secs)   (real time) (usr+sys time)
ramfs            477165     60.00      8.13    281.34      7952.55        1648.40
for a 60.01s run time:
   1440.33s available CPU time
      8.12s user time   (  0.56%)
    281.34s system time ( 19.53%)
    289.46s total time  ( 20.10%)
load average: 6.98 3.03 1.19
successful run completed in 60.01s (1 min, 0.01 secs)

We can see that the ops/s has hardly changed.


This patch (of 45):

Currently, the shrinker instances can be divided into the following three
types:

a) global shrinker instance statically defined in the kernel, such as
   workingset_shadow_shrinker.

b) global shrinker instance statically defined in the kernel modules, such
   as mmu_shrinker in x86.

c) shrinker instance embedded in other structures.

For case a, the memory of shrinker instance is never freed. For case b,
the memory of shrinker instance will be freed after synchronize_rcu() when
the module is unloaded. For case c, the memory of shrinker instance will
be freed along with the structure it is embedded in.

In preparation for implementing lockless slab shrink, we need to
dynamically allocate those shrinker instances in case c, then the memory
can be dynamically freed alone by calling kfree_rcu().

So this commit adds the following new APIs for dynamically allocating
shrinker, and add a private_data field to struct shrinker to record and
get the original embedded structure.

1. shrinker_alloc()

Used to allocate shrinker instance itself and related memory, it will
return a pointer to the shrinker instance on success and NULL on failure.

2. shrinker_register()

Used to register the shrinker instance, which is same as the current
register_shrinker_prepared().

3. shrinker_free()

Used to unregister (if needed) and free the shrinker instance.

In order to simplify shrinker-related APIs and make shrinker more
independent of other kernel mechanisms, subsequent submissions will use
the above API to convert all shrinkers (including case a and b) to
dynamically allocated, and then remove all existing APIs.

This will also have another advantage mentioned by Dave Chinner:

```
The other advantage of this is that it will break all the existing
out of tree code and third party modules using the old API and will
no longer work with a kernel using lockless slab shrinkers. They
need to break (both at the source and binary levels) to stop bad
things from happening due to using unconverted shrinkers in the new
setup.
```

[zhengqi.arch@bytedance.com: mm: shrinker: some cleanup]
  Link: https://lkml.kernel.org/r/20230919024607.65463-1-zhengqi.arch@bytedance.com
Link: https://lkml.kernel.org/r/20230911094444.68966-1-zhengqi.arch@bytedance.com
Link: https://lkml.kernel.org/r/20230911094444.68966-2-zhengqi.arch@bytedance.com
Signed-off-by: Qi Zheng <zhengqi.arch@bytedance.com>
Reviewed-by: Muchun Song <songmuchun@bytedance.com>
Cc: Christian Brauner <brauner@kernel.org>
Cc: Chuck Lever <cel@kernel.org>
Cc: Darrick J. Wong <djwong@kernel.org>
Cc: Dave Chinner <david@fromorbit.com>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: Kirill Tkhai <tkhai@ya.ru>
Cc: Paul E. McKenney <paulmck@kernel.org>
Cc: Roman Gushchin <roman.gushchin@linux.dev>
Cc: Sergey Senozhatsky <senozhatsky@chromium.org>
Cc: Steven Price <steven.price@arm.com>
Cc: Theodore Ts'o <tytso@mit.edu>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Abhinav Kumar <quic_abhinavk@quicinc.com>
Cc: Alasdair Kergon <agk@redhat.com>
Cc: Alexander Viro <viro@zeniv.linux.org.uk>
Cc: Alyssa Rosenzweig <alyssa.rosenzweig@collabora.com>
Cc: Andreas Dilger <adilger.kernel@dilger.ca>
Cc: Andreas Gruenbacher <agruenba@redhat.com>
Cc: Anna Schumaker <anna@kernel.org>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Bob Peterson <rpeterso@redhat.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Carlos Llamas <cmllamas@google.com>
Cc: Chandan Babu R <chandan.babu@oracle.com>
Cc: Chao Yu <chao@kernel.org>
Cc: Chris Mason <clm@fb.com>
Cc: Christian Koenig <christian.koenig@amd.com>
Cc: Coly Li <colyli@suse.de>
Cc: Dai Ngo <Dai.Ngo@oracle.com>
Cc: Daniel Vetter <daniel@ffwll.ch>
Cc: Daniel Vetter <daniel.vetter@ffwll.ch>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: David Airlie <airlied@gmail.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: David Sterba <dsterba@suse.com>
Cc: Dmitry Baryshkov <dmitry.baryshkov@linaro.org>
Cc: Gao Xiang <hsiangkao@linux.alibaba.com>
Cc: Huang Rui <ray.huang@amd.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Jaegeuk Kim <jaegeuk@kernel.org>
Cc: Jani Nikula <jani.nikula@linux.intel.com>
Cc: Jan Kara <jack@suse.cz>
Cc: Jason Wang <jasowang@redhat.com>
Cc: Jeff Layton <jlayton@kernel.org>
Cc: Jeffle Xu <jefflexu@linux.alibaba.com>
Cc: Joel Fernandes (Google) <joel@joelfernandes.org>
Cc: Joonas Lahtinen <joonas.lahtinen@linux.intel.com>
Cc: Josef Bacik <josef@toxicpanda.com>
Cc: Juergen Gross <jgross@suse.com>
Cc: Kent Overstreet <kent.overstreet@gmail.com>
Cc: Marijn Suijten <marijn.suijten@somainline.org>
Cc: "Michael S. Tsirkin" <mst@redhat.com>
Cc: Mike Snitzer <snitzer@kernel.org>
Cc: Minchan Kim <minchan@kernel.org>
Cc: Nadav Amit <namit@vmware.com>
Cc: Neil Brown <neilb@suse.de>
Cc: Oleksandr Tyshchenko <oleksandr_tyshchenko@epam.com>
Cc: Olga Kornievskaia <kolga@netapp.com>
Cc: Richard Weinberger <richard@nod.at>
Cc: Rob Clark <robdclark@gmail.com>
Cc: Rob Herring <robh@kernel.org>
Cc: Rodrigo Vivi <rodrigo.vivi@intel.com>
Cc: Sean Paul <sean@poorly.run>
Cc: Song Liu <song@kernel.org>
Cc: Stefano Stabellini <sstabellini@kernel.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Tomeu Vizoso <tomeu.vizoso@collabora.com>
Cc: Tom Talpey <tom@talpey.com>
Cc: Trond Myklebust <trond.myklebust@hammerspace.com>
Cc: Tvrtko Ursulin <tvrtko.ursulin@linux.intel.com>
Cc: Xuan Zhuo <xuanzhuo@linux.alibaba.com>
Cc: Yue Hu <huyue2@coolpad.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/shrinker.h |  19 ++++++---
 mm/internal.h            |  22 ++++++++++
 mm/shrinker.c            | 106 +++++++++++++++++++++++++++++++++++++++++++++++
 mm/shrinker_debug.c      |   3 --
 4 files changed, 142 insertions(+), 8 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/shrinker.h b/include/linux/shrinker.h
index 6b5843c3b827..f4a5249f00b2 100644
--- a/include/linux/shrinker.h
+++ b/include/linux/shrinker.h
@@ -70,6 +70,8 @@ struct shrinker {
 	int seeks;	/* seeks to recreate an obj */
 	unsigned flags;
 
+	void *private_data;
+
 	/* These are for internal use */
 	struct list_head list;
 #ifdef CONFIG_MEMCG
@@ -86,15 +88,22 @@ struct shrinker {
 };
 #define DEFAULT_SEEKS 2 /* A good number if you don't know better. */
 
-/* Flags */
-#define SHRINKER_REGISTERED	(1 << 0)
-#define SHRINKER_NUMA_AWARE	(1 << 1)
-#define SHRINKER_MEMCG_AWARE	(1 << 2)
+/* Internal flags */
+#define SHRINKER_REGISTERED	BIT(0)
+#define SHRINKER_ALLOCATED	BIT(1)
+
+/* Flags for users to use */
+#define SHRINKER_NUMA_AWARE	BIT(2)
+#define SHRINKER_MEMCG_AWARE	BIT(3)
 /*
  * It just makes sense when the shrinker is also MEMCG_AWARE for now,
  * non-MEMCG_AWARE shrinker should not have this flag set.
  */
-#define SHRINKER_NONSLAB	(1 << 3)
+#define SHRINKER_NONSLAB	BIT(4)
+
+struct shrinker *shrinker_alloc(unsigned int flags, const char *fmt, ...);
+void shrinker_register(struct shrinker *shrinker);
+void shrinker_free(struct shrinker *shrinker);
 
 extern int __printf(2, 3) prealloc_shrinker(struct shrinker *shrinker,
 					    const char *fmt, ...);
diff --git a/mm/internal.h b/mm/internal.h
index 0471d6326d01..a273f4d948d8 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -1160,6 +1160,20 @@ unsigned long shrink_slab(gfp_t gfp_mask, int nid, struct mem_cgroup *memcg,
 			  int priority);
 
 #ifdef CONFIG_SHRINKER_DEBUG
+static inline int shrinker_debugfs_name_alloc(struct shrinker *shrinker,
+					      const char *fmt, va_list ap)
+{
+	shrinker->name = kvasprintf_const(GFP_KERNEL, fmt, ap);
+
+	return shrinker->name ? 0 : -ENOMEM;
+}
+
+static inline void shrinker_debugfs_name_free(struct shrinker *shrinker)
+{
+	kfree_const(shrinker->name);
+	shrinker->name = NULL;
+}
+
 extern int shrinker_debugfs_add(struct shrinker *shrinker);
 extern struct dentry *shrinker_debugfs_detach(struct shrinker *shrinker,
 					      int *debugfs_id);
@@ -1170,6 +1184,14 @@ static inline int shrinker_debugfs_add(struct shrinker *shrinker)
 {
 	return 0;
 }
+static inline int shrinker_debugfs_name_alloc(struct shrinker *shrinker,
+					      const char *fmt, va_list ap)
+{
+	return 0;
+}
+static inline void shrinker_debugfs_name_free(struct shrinker *shrinker)
+{
+}
 static inline struct dentry *shrinker_debugfs_detach(struct shrinker *shrinker,
 						     int *debugfs_id)
 {
diff --git a/mm/shrinker.c b/mm/shrinker.c
index a16cd448b924..d1032a4d5684 100644
--- a/mm/shrinker.c
+++ b/mm/shrinker.c
@@ -550,6 +550,112 @@ out:
 	return freed;
 }
 
+struct shrinker *shrinker_alloc(unsigned int flags, const char *fmt, ...)
+{
+	struct shrinker *shrinker;
+	unsigned int size;
+	va_list ap;
+	int err;
+
+	shrinker = kzalloc(sizeof(struct shrinker), GFP_KERNEL);
+	if (!shrinker)
+		return NULL;
+
+	va_start(ap, fmt);
+	err = shrinker_debugfs_name_alloc(shrinker, fmt, ap);
+	va_end(ap);
+	if (err)
+		goto err_name;
+
+	shrinker->flags = flags | SHRINKER_ALLOCATED;
+	shrinker->seeks = DEFAULT_SEEKS;
+
+	if (flags & SHRINKER_MEMCG_AWARE) {
+		err = prealloc_memcg_shrinker(shrinker);
+		if (err == -ENOSYS) {
+			/* Memcg is not supported, fallback to non-memcg-aware shrinker. */
+			shrinker->flags &= ~SHRINKER_MEMCG_AWARE;
+			goto non_memcg;
+		}
+
+		if (err)
+			goto err_flags;
+
+		return shrinker;
+	}
+
+non_memcg:
+	/*
+	 * The nr_deferred is available on per memcg level for memcg aware
+	 * shrinkers, so only allocate nr_deferred in the following cases:
+	 *  - non-memcg-aware shrinkers
+	 *  - !CONFIG_MEMCG
+	 *  - memcg is disabled by kernel command line
+	 */
+	size = sizeof(*shrinker->nr_deferred);
+	if (flags & SHRINKER_NUMA_AWARE)
+		size *= nr_node_ids;
+
+	shrinker->nr_deferred = kzalloc(size, GFP_KERNEL);
+	if (!shrinker->nr_deferred)
+		goto err_flags;
+
+	return shrinker;
+
+err_flags:
+	shrinker_debugfs_name_free(shrinker);
+err_name:
+	kfree(shrinker);
+	return NULL;
+}
+EXPORT_SYMBOL_GPL(shrinker_alloc);
+
+void shrinker_register(struct shrinker *shrinker)
+{
+	if (unlikely(!(shrinker->flags & SHRINKER_ALLOCATED))) {
+		pr_warn("Must use shrinker_alloc() to dynamically allocate the shrinker");
+		return;
+	}
+
+	down_write(&shrinker_rwsem);
+	list_add_tail(&shrinker->list, &shrinker_list);
+	shrinker->flags |= SHRINKER_REGISTERED;
+	shrinker_debugfs_add(shrinker);
+	up_write(&shrinker_rwsem);
+}
+EXPORT_SYMBOL_GPL(shrinker_register);
+
+void shrinker_free(struct shrinker *shrinker)
+{
+	struct dentry *debugfs_entry = NULL;
+	int debugfs_id;
+
+	if (!shrinker)
+		return;
+
+	down_write(&shrinker_rwsem);
+	if (shrinker->flags & SHRINKER_REGISTERED) {
+		list_del(&shrinker->list);
+		debugfs_entry = shrinker_debugfs_detach(shrinker, &debugfs_id);
+		shrinker->flags &= ~SHRINKER_REGISTERED;
+	}
+
+	shrinker_debugfs_name_free(shrinker);
+
+	if (shrinker->flags & SHRINKER_MEMCG_AWARE)
+		unregister_memcg_shrinker(shrinker);
+	up_write(&shrinker_rwsem);
+
+	if (debugfs_entry)
+		shrinker_debugfs_remove(debugfs_entry, debugfs_id);
+
+	kfree(shrinker->nr_deferred);
+	shrinker->nr_deferred = NULL;
+
+	kfree(shrinker);
+}
+EXPORT_SYMBOL_GPL(shrinker_free);
+
 /*
  * Add a shrinker callback to be called from the vm.
  */
diff --git a/mm/shrinker_debug.c b/mm/shrinker_debug.c
index e4ce509f619e..24aebe7c24cc 100644
--- a/mm/shrinker_debug.c
+++ b/mm/shrinker_debug.c
@@ -241,9 +241,6 @@ struct dentry *shrinker_debugfs_detach(struct shrinker *shrinker,
 
 	lockdep_assert_held(&shrinker_rwsem);
 
-	kfree_const(shrinker->name);
-	shrinker->name = NULL;
-
 	*debugfs_id = entry ? shrinker->debugfs_id : -1;
 	shrinker->debugfs_entry = NULL;
 
-- 
cgit v1.2.3


From 4b403dfa8ea8dfbc9d317b25a0433c1ac6765f7f Mon Sep 17 00:00:00 2001
From: Qi Zheng <zhengqi.arch@bytedance.com>
Date: Mon, 11 Sep 2023 17:44:30 +0800
Subject: jbd2,ext4: dynamically allocate the jbd2-journal shrinker

In preparation for implementing lockless slab shrink, use new APIs to
dynamically allocate the jbd2-journal shrinker, so that it can be freed
asynchronously via RCU. Then it doesn't need to wait for RCU read-side
critical section when releasing the struct journal_s.

Link: https://lkml.kernel.org/r/20230911094444.68966-32-zhengqi.arch@bytedance.com
Signed-off-by: Qi Zheng <zhengqi.arch@bytedance.com>
Reviewed-by: Muchun Song <songmuchun@bytedance.com>
Acked-by: Jan Kara <jack@suse.cz>
Cc: "Theodore Ts'o" <tytso@mit.edu>
Cc: Abhinav Kumar <quic_abhinavk@quicinc.com>
Cc: Alasdair Kergon <agk@redhat.com>
Cc: Alexander Viro <viro@zeniv.linux.org.uk>
Cc: Alyssa Rosenzweig <alyssa.rosenzweig@collabora.com>
Cc: Andreas Dilger <adilger.kernel@dilger.ca>
Cc: Andreas Gruenbacher <agruenba@redhat.com>
Cc: Anna Schumaker <anna@kernel.org>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Bob Peterson <rpeterso@redhat.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Carlos Llamas <cmllamas@google.com>
Cc: Chandan Babu R <chandan.babu@oracle.com>
Cc: Chao Yu <chao@kernel.org>
Cc: Chris Mason <clm@fb.com>
Cc: Christian Brauner <brauner@kernel.org>
Cc: Christian Koenig <christian.koenig@amd.com>
Cc: Chuck Lever <cel@kernel.org>
Cc: Coly Li <colyli@suse.de>
Cc: Dai Ngo <Dai.Ngo@oracle.com>
Cc: Daniel Vetter <daniel@ffwll.ch>
Cc: Daniel Vetter <daniel.vetter@ffwll.ch>
Cc: "Darrick J. Wong" <djwong@kernel.org>
Cc: Dave Chinner <david@fromorbit.com>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: David Airlie <airlied@gmail.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: David Sterba <dsterba@suse.com>
Cc: Dmitry Baryshkov <dmitry.baryshkov@linaro.org>
Cc: Gao Xiang <hsiangkao@linux.alibaba.com>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: Huang Rui <ray.huang@amd.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Jaegeuk Kim <jaegeuk@kernel.org>
Cc: Jani Nikula <jani.nikula@linux.intel.com>
Cc: Jason Wang <jasowang@redhat.com>
Cc: Jeff Layton <jlayton@kernel.org>
Cc: Jeffle Xu <jefflexu@linux.alibaba.com>
Cc: Joel Fernandes (Google) <joel@joelfernandes.org>
Cc: Joonas Lahtinen <joonas.lahtinen@linux.intel.com>
Cc: Josef Bacik <josef@toxicpanda.com>
Cc: Juergen Gross <jgross@suse.com>
Cc: Kent Overstreet <kent.overstreet@gmail.com>
Cc: Kirill Tkhai <tkhai@ya.ru>
Cc: Marijn Suijten <marijn.suijten@somainline.org>
Cc: "Michael S. Tsirkin" <mst@redhat.com>
Cc: Mike Snitzer <snitzer@kernel.org>
Cc: Minchan Kim <minchan@kernel.org>
Cc: Muchun Song <muchun.song@linux.dev>
Cc: Nadav Amit <namit@vmware.com>
Cc: Neil Brown <neilb@suse.de>
Cc: Oleksandr Tyshchenko <oleksandr_tyshchenko@epam.com>
Cc: Olga Kornievskaia <kolga@netapp.com>
Cc: Paul E. McKenney <paulmck@kernel.org>
Cc: Richard Weinberger <richard@nod.at>
Cc: Rob Clark <robdclark@gmail.com>
Cc: Rob Herring <robh@kernel.org>
Cc: Rodrigo Vivi <rodrigo.vivi@intel.com>
Cc: Roman Gushchin <roman.gushchin@linux.dev>
Cc: Sean Paul <sean@poorly.run>
Cc: Sergey Senozhatsky <senozhatsky@chromium.org>
Cc: Song Liu <song@kernel.org>
Cc: Stefano Stabellini <sstabellini@kernel.org>
Cc: Steven Price <steven.price@arm.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Tomeu Vizoso <tomeu.vizoso@collabora.com>
Cc: Tom Talpey <tom@talpey.com>
Cc: Trond Myklebust <trond.myklebust@hammerspace.com>
Cc: Tvrtko Ursulin <tvrtko.ursulin@linux.intel.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Xuan Zhuo <xuanzhuo@linux.alibaba.com>
Cc: Yue Hu <huyue2@coolpad.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 fs/jbd2/journal.c    | 29 ++++++++++++++++++-----------
 include/linux/jbd2.h |  2 +-
 2 files changed, 19 insertions(+), 12 deletions(-)

(limited to 'include/linux')

diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
index 30dec2bd2ecc..ed53188472f9 100644
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -1290,7 +1290,7 @@ static int jbd2_min_tag_size(void)
 static unsigned long jbd2_journal_shrink_scan(struct shrinker *shrink,
 					      struct shrink_control *sc)
 {
-	journal_t *journal = container_of(shrink, journal_t, j_shrinker);
+	journal_t *journal = shrink->private_data;
 	unsigned long nr_to_scan = sc->nr_to_scan;
 	unsigned long nr_shrunk;
 	unsigned long count;
@@ -1316,7 +1316,7 @@ static unsigned long jbd2_journal_shrink_scan(struct shrinker *shrink,
 static unsigned long jbd2_journal_shrink_count(struct shrinker *shrink,
 					       struct shrink_control *sc)
 {
-	journal_t *journal = container_of(shrink, journal_t, j_shrinker);
+	journal_t *journal = shrink->private_data;
 	unsigned long count;
 
 	count = percpu_counter_read_positive(&journal->j_checkpoint_jh_count);
@@ -1588,14 +1588,21 @@ static journal_t *journal_init_common(struct block_device *bdev,
 		goto err_cleanup;
 
 	journal->j_shrink_transaction = NULL;
-	journal->j_shrinker.scan_objects = jbd2_journal_shrink_scan;
-	journal->j_shrinker.count_objects = jbd2_journal_shrink_count;
-	journal->j_shrinker.seeks = DEFAULT_SEEKS;
-	journal->j_shrinker.batch = journal->j_max_transaction_buffers;
-	err = register_shrinker(&journal->j_shrinker, "jbd2-journal:(%u:%u)",
-				MAJOR(bdev->bd_dev), MINOR(bdev->bd_dev));
-	if (err)
+
+	journal->j_shrinker = shrinker_alloc(0, "jbd2-journal:(%u:%u)",
+					     MAJOR(bdev->bd_dev),
+					     MINOR(bdev->bd_dev));
+	if (!journal->j_shrinker) {
+		err = -ENOMEM;
 		goto err_cleanup;
+	}
+
+	journal->j_shrinker->scan_objects = jbd2_journal_shrink_scan;
+	journal->j_shrinker->count_objects = jbd2_journal_shrink_count;
+	journal->j_shrinker->batch = journal->j_max_transaction_buffers;
+	journal->j_shrinker->private_data = journal;
+
+	shrinker_register(journal->j_shrinker);
 
 	return journal;
 
@@ -2172,9 +2179,9 @@ int jbd2_journal_destroy(journal_t *journal)
 		brelse(journal->j_sb_buffer);
 	}
 
-	if (journal->j_shrinker.flags & SHRINKER_REGISTERED) {
+	if (journal->j_shrinker) {
 		percpu_counter_destroy(&journal->j_checkpoint_jh_count);
-		unregister_shrinker(&journal->j_shrinker);
+		shrinker_free(journal->j_shrinker);
 	}
 	if (journal->j_proc_entry)
 		jbd2_stats_proc_exit(journal);
diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h
index 52772c826c86..6dcbb4eb80fb 100644
--- a/include/linux/jbd2.h
+++ b/include/linux/jbd2.h
@@ -886,7 +886,7 @@ struct journal_s
 	 * Journal head shrinker, reclaim buffer's journal head which
 	 * has been written back.
 	 */
-	struct shrinker		j_shrinker;
+	struct shrinker		*j_shrinker;
 
 	/**
 	 * @j_checkpoint_jh_count:
-- 
cgit v1.2.3


From 1720f5dd8d3af34d6023fb9e8c35e5e60e8b6643 Mon Sep 17 00:00:00 2001
From: Qi Zheng <zhengqi.arch@bytedance.com>
Date: Mon, 11 Sep 2023 17:44:37 +0800
Subject: fs: super: dynamically allocate the s_shrink

In preparation for implementing lockless slab shrink, use new APIs to
dynamically allocate the s_shrink, so that it can be freed asynchronously
via RCU. Then it doesn't need to wait for RCU read-side critical section
when releasing the struct super_block.

Link: https://lkml.kernel.org/r/20230911094444.68966-39-zhengqi.arch@bytedance.com
Signed-off-by: Qi Zheng <zhengqi.arch@bytedance.com>
Reviewed-by: Muchun Song <songmuchun@bytedance.com>
Acked-by: David Sterba <dsterba@suse.com>
Cc: Chris Mason <clm@fb.com>
Cc: Josef Bacik <josef@toxicpanda.com>
Cc: Alexander Viro <viro@zeniv.linux.org.uk>
Cc: Christian Brauner <brauner@kernel.org>
Cc: Abhinav Kumar <quic_abhinavk@quicinc.com>
Cc: Alasdair Kergon <agk@redhat.com>
Cc: Alyssa Rosenzweig <alyssa.rosenzweig@collabora.com>
Cc: Andreas Dilger <adilger.kernel@dilger.ca>
Cc: Andreas Gruenbacher <agruenba@redhat.com>
Cc: Anna Schumaker <anna@kernel.org>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Bob Peterson <rpeterso@redhat.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Carlos Llamas <cmllamas@google.com>
Cc: Chandan Babu R <chandan.babu@oracle.com>
Cc: Chao Yu <chao@kernel.org>
Cc: Christian Koenig <christian.koenig@amd.com>
Cc: Chuck Lever <cel@kernel.org>
Cc: Coly Li <colyli@suse.de>
Cc: Dai Ngo <Dai.Ngo@oracle.com>
Cc: Daniel Vetter <daniel@ffwll.ch>
Cc: Daniel Vetter <daniel.vetter@ffwll.ch>
Cc: "Darrick J. Wong" <djwong@kernel.org>
Cc: Dave Chinner <david@fromorbit.com>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: David Airlie <airlied@gmail.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: Dmitry Baryshkov <dmitry.baryshkov@linaro.org>
Cc: Gao Xiang <hsiangkao@linux.alibaba.com>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: Huang Rui <ray.huang@amd.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Jaegeuk Kim <jaegeuk@kernel.org>
Cc: Jani Nikula <jani.nikula@linux.intel.com>
Cc: Jan Kara <jack@suse.cz>
Cc: Jason Wang <jasowang@redhat.com>
Cc: Jeff Layton <jlayton@kernel.org>
Cc: Jeffle Xu <jefflexu@linux.alibaba.com>
Cc: Joel Fernandes (Google) <joel@joelfernandes.org>
Cc: Joonas Lahtinen <joonas.lahtinen@linux.intel.com>
Cc: Juergen Gross <jgross@suse.com>
Cc: Kent Overstreet <kent.overstreet@gmail.com>
Cc: Kirill Tkhai <tkhai@ya.ru>
Cc: Marijn Suijten <marijn.suijten@somainline.org>
Cc: "Michael S. Tsirkin" <mst@redhat.com>
Cc: Mike Snitzer <snitzer@kernel.org>
Cc: Minchan Kim <minchan@kernel.org>
Cc: Muchun Song <muchun.song@linux.dev>
Cc: Nadav Amit <namit@vmware.com>
Cc: Neil Brown <neilb@suse.de>
Cc: Oleksandr Tyshchenko <oleksandr_tyshchenko@epam.com>
Cc: Olga Kornievskaia <kolga@netapp.com>
Cc: Paul E. McKenney <paulmck@kernel.org>
Cc: Richard Weinberger <richard@nod.at>
Cc: Rob Clark <robdclark@gmail.com>
Cc: Rob Herring <robh@kernel.org>
Cc: Rodrigo Vivi <rodrigo.vivi@intel.com>
Cc: Roman Gushchin <roman.gushchin@linux.dev>
Cc: Sean Paul <sean@poorly.run>
Cc: Sergey Senozhatsky <senozhatsky@chromium.org>
Cc: Song Liu <song@kernel.org>
Cc: Stefano Stabellini <sstabellini@kernel.org>
Cc: Steven Price <steven.price@arm.com>
Cc: "Theodore Ts'o" <tytso@mit.edu>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Tomeu Vizoso <tomeu.vizoso@collabora.com>
Cc: Tom Talpey <tom@talpey.com>
Cc: Trond Myklebust <trond.myklebust@hammerspace.com>
Cc: Tvrtko Ursulin <tvrtko.ursulin@linux.intel.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Xuan Zhuo <xuanzhuo@linux.alibaba.com>
Cc: Yue Hu <huyue2@coolpad.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 fs/btrfs/super.c   |  2 +-
 fs/kernfs/mount.c  |  2 +-
 fs/proc/root.c     |  2 +-
 fs/super.c         | 33 ++++++++++++++++++---------------
 include/linux/fs.h |  2 +-
 5 files changed, 22 insertions(+), 19 deletions(-)

(limited to 'include/linux')

diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 1a093ec0f7e3..b1798bed68f2 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -1519,7 +1519,7 @@ static struct dentry *btrfs_mount_root(struct file_system_type *fs_type,
 			error = -EBUSY;
 	} else {
 		snprintf(s->s_id, sizeof(s->s_id), "%pg", bdev);
-		shrinker_debugfs_rename(&s->s_shrink, "sb-%s:%s", fs_type->name,
+		shrinker_debugfs_rename(s->s_shrink, "sb-%s:%s", fs_type->name,
 					s->s_id);
 		btrfs_sb(s)->bdev_holder = fs_type;
 		error = btrfs_fill_super(s, fs_devices, data);
diff --git a/fs/kernfs/mount.c b/fs/kernfs/mount.c
index c4bf26142eec..79b96e74a8a0 100644
--- a/fs/kernfs/mount.c
+++ b/fs/kernfs/mount.c
@@ -265,7 +265,7 @@ static int kernfs_fill_super(struct super_block *sb, struct kernfs_fs_context *k
 	sb->s_time_gran = 1;
 
 	/* sysfs dentries and inodes don't require IO to create */
-	sb->s_shrink.seeks = 0;
+	sb->s_shrink->seeks = 0;
 
 	/* get root inode, initialize and unlock it */
 	down_read(&kf_root->kernfs_rwsem);
diff --git a/fs/proc/root.c b/fs/proc/root.c
index 9191248f2dac..b55dbc70287b 100644
--- a/fs/proc/root.c
+++ b/fs/proc/root.c
@@ -188,7 +188,7 @@ static int proc_fill_super(struct super_block *s, struct fs_context *fc)
 	s->s_stack_depth = FILESYSTEM_MAX_STACK_DEPTH;
 
 	/* procfs dentries and inodes don't require IO to create */
-	s->s_shrink.seeks = 0;
+	s->s_shrink->seeks = 0;
 
 	pde_get(&proc_root);
 	root_inode = proc_get_inode(s, &proc_root);
diff --git a/fs/super.c b/fs/super.c
index 2d762ce67f6e..adadf6689611 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -191,7 +191,7 @@ static unsigned long super_cache_scan(struct shrinker *shrink,
 	long	dentries;
 	long	inodes;
 
-	sb = container_of(shrink, struct super_block, s_shrink);
+	sb = shrink->private_data;
 
 	/*
 	 * Deadlock avoidance.  We may hold various FS locks, and we don't want
@@ -244,7 +244,7 @@ static unsigned long super_cache_count(struct shrinker *shrink,
 	struct super_block *sb;
 	long	total_objects = 0;
 
-	sb = container_of(shrink, struct super_block, s_shrink);
+	sb = shrink->private_data;
 
 	/*
 	 * We don't call super_trylock_shared() here as it is a scalability
@@ -306,7 +306,7 @@ static void destroy_unused_super(struct super_block *s)
 	security_sb_free(s);
 	put_user_ns(s->s_user_ns);
 	kfree(s->s_subtype);
-	free_prealloced_shrinker(&s->s_shrink);
+	shrinker_free(s->s_shrink);
 	/* no delays needed */
 	destroy_super_work(&s->destroy_work);
 }
@@ -383,16 +383,19 @@ static struct super_block *alloc_super(struct file_system_type *type, int flags,
 	s->s_time_min = TIME64_MIN;
 	s->s_time_max = TIME64_MAX;
 
-	s->s_shrink.seeks = DEFAULT_SEEKS;
-	s->s_shrink.scan_objects = super_cache_scan;
-	s->s_shrink.count_objects = super_cache_count;
-	s->s_shrink.batch = 1024;
-	s->s_shrink.flags = SHRINKER_NUMA_AWARE | SHRINKER_MEMCG_AWARE;
-	if (prealloc_shrinker(&s->s_shrink, "sb-%s", type->name))
+	s->s_shrink = shrinker_alloc(SHRINKER_NUMA_AWARE | SHRINKER_MEMCG_AWARE,
+				     "sb-%s", type->name);
+	if (!s->s_shrink)
 		goto fail;
-	if (list_lru_init_memcg(&s->s_dentry_lru, &s->s_shrink))
+
+	s->s_shrink->scan_objects = super_cache_scan;
+	s->s_shrink->count_objects = super_cache_count;
+	s->s_shrink->batch = 1024;
+	s->s_shrink->private_data = s;
+
+	if (list_lru_init_memcg(&s->s_dentry_lru, s->s_shrink))
 		goto fail;
-	if (list_lru_init_memcg(&s->s_inode_lru, &s->s_shrink))
+	if (list_lru_init_memcg(&s->s_inode_lru, s->s_shrink))
 		goto fail;
 	return s;
 
@@ -477,7 +480,7 @@ void deactivate_locked_super(struct super_block *s)
 {
 	struct file_system_type *fs = s->s_type;
 	if (atomic_dec_and_test(&s->s_active)) {
-		unregister_shrinker(&s->s_shrink);
+		shrinker_free(s->s_shrink);
 		fs->kill_sb(s);
 
 		kill_super_notify(s);
@@ -818,7 +821,7 @@ retry:
 	hlist_add_head(&s->s_instances, &s->s_type->fs_supers);
 	spin_unlock(&sb_lock);
 	get_filesystem(s->s_type);
-	register_shrinker_prepared(&s->s_shrink);
+	shrinker_register(s->s_shrink);
 	return s;
 
 share_extant_sb:
@@ -901,7 +904,7 @@ retry:
 	hlist_add_head(&s->s_instances, &type->fs_supers);
 	spin_unlock(&sb_lock);
 	get_filesystem(type);
-	register_shrinker_prepared(&s->s_shrink);
+	shrinker_register(s->s_shrink);
 	return s;
 }
 EXPORT_SYMBOL(sget);
@@ -1522,7 +1525,7 @@ int setup_bdev_super(struct super_block *sb, int sb_flags,
 	mutex_unlock(&bdev->bd_fsfreeze_mutex);
 
 	snprintf(sb->s_id, sizeof(sb->s_id), "%pg", bdev);
-	shrinker_debugfs_rename(&sb->s_shrink, "sb-%s:%s", sb->s_type->name,
+	shrinker_debugfs_rename(sb->s_shrink, "sb-%s:%s", sb->s_type->name,
 				sb->s_id);
 	sb_set_blocksize(sb, block_size(bdev));
 	return 0;
diff --git a/include/linux/fs.h b/include/linux/fs.h
index b528f063e8ff..fd539c9fef8e 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1265,7 +1265,7 @@ struct super_block {
 
 	const struct dentry_operations *s_d_op; /* default d_op for dentries */
 
-	struct shrinker s_shrink;	/* per-sb shrinker handle */
+	struct shrinker *s_shrink;	/* per-sb shrinker handle */
 
 	/* Number of inodes with nlink == 0 but still referenced */
 	atomic_long_t s_remove_count;
-- 
cgit v1.2.3


From f2383e01507eeee8a1c1283d61a117a97d6c4ebe Mon Sep 17 00:00:00 2001
From: Qi Zheng <zhengqi.arch@bytedance.com>
Date: Mon, 11 Sep 2023 17:44:38 +0800
Subject: mm: shrinker: remove old APIs

Now no users are using the old APIs, just remove them.

Link: https://lkml.kernel.org/r/20230911094444.68966-40-zhengqi.arch@bytedance.com
Signed-off-by: Qi Zheng <zhengqi.arch@bytedance.com>
Reviewed-by: Muchun Song <songmuchun@bytedance.com>
Cc: Abhinav Kumar <quic_abhinavk@quicinc.com>
Cc: Alasdair Kergon <agk@redhat.com>
Cc: Alexander Viro <viro@zeniv.linux.org.uk>
Cc: Alyssa Rosenzweig <alyssa.rosenzweig@collabora.com>
Cc: Andreas Dilger <adilger.kernel@dilger.ca>
Cc: Andreas Gruenbacher <agruenba@redhat.com>
Cc: Anna Schumaker <anna@kernel.org>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Bob Peterson <rpeterso@redhat.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Carlos Llamas <cmllamas@google.com>
Cc: Chandan Babu R <chandan.babu@oracle.com>
Cc: Chao Yu <chao@kernel.org>
Cc: Chris Mason <clm@fb.com>
Cc: Christian Brauner <brauner@kernel.org>
Cc: Christian Koenig <christian.koenig@amd.com>
Cc: Chuck Lever <cel@kernel.org>
Cc: Coly Li <colyli@suse.de>
Cc: Dai Ngo <Dai.Ngo@oracle.com>
Cc: Daniel Vetter <daniel@ffwll.ch>
Cc: Daniel Vetter <daniel.vetter@ffwll.ch>
Cc: "Darrick J. Wong" <djwong@kernel.org>
Cc: Dave Chinner <david@fromorbit.com>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: David Airlie <airlied@gmail.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: David Sterba <dsterba@suse.com>
Cc: Dmitry Baryshkov <dmitry.baryshkov@linaro.org>
Cc: Gao Xiang <hsiangkao@linux.alibaba.com>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: Huang Rui <ray.huang@amd.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Jaegeuk Kim <jaegeuk@kernel.org>
Cc: Jani Nikula <jani.nikula@linux.intel.com>
Cc: Jan Kara <jack@suse.cz>
Cc: Jason Wang <jasowang@redhat.com>
Cc: Jeff Layton <jlayton@kernel.org>
Cc: Jeffle Xu <jefflexu@linux.alibaba.com>
Cc: Joel Fernandes (Google) <joel@joelfernandes.org>
Cc: Joonas Lahtinen <joonas.lahtinen@linux.intel.com>
Cc: Josef Bacik <josef@toxicpanda.com>
Cc: Juergen Gross <jgross@suse.com>
Cc: Kent Overstreet <kent.overstreet@gmail.com>
Cc: Kirill Tkhai <tkhai@ya.ru>
Cc: Marijn Suijten <marijn.suijten@somainline.org>
Cc: "Michael S. Tsirkin" <mst@redhat.com>
Cc: Mike Snitzer <snitzer@kernel.org>
Cc: Minchan Kim <minchan@kernel.org>
Cc: Muchun Song <muchun.song@linux.dev>
Cc: Nadav Amit <namit@vmware.com>
Cc: Neil Brown <neilb@suse.de>
Cc: Oleksandr Tyshchenko <oleksandr_tyshchenko@epam.com>
Cc: Olga Kornievskaia <kolga@netapp.com>
Cc: Paul E. McKenney <paulmck@kernel.org>
Cc: Richard Weinberger <richard@nod.at>
Cc: Rob Clark <robdclark@gmail.com>
Cc: Rob Herring <robh@kernel.org>
Cc: Rodrigo Vivi <rodrigo.vivi@intel.com>
Cc: Roman Gushchin <roman.gushchin@linux.dev>
Cc: Sean Paul <sean@poorly.run>
Cc: Sergey Senozhatsky <senozhatsky@chromium.org>
Cc: Song Liu <song@kernel.org>
Cc: Stefano Stabellini <sstabellini@kernel.org>
Cc: Steven Price <steven.price@arm.com>
Cc: "Theodore Ts'o" <tytso@mit.edu>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Tomeu Vizoso <tomeu.vizoso@collabora.com>
Cc: Tom Talpey <tom@talpey.com>
Cc: Trond Myklebust <trond.myklebust@hammerspace.com>
Cc: Tvrtko Ursulin <tvrtko.ursulin@linux.intel.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Xuan Zhuo <xuanzhuo@linux.alibaba.com>
Cc: Yue Hu <huyue2@coolpad.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/shrinker.h |   8 ---
 mm/shrinker.c            | 143 -----------------------------------------------
 2 files changed, 151 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/shrinker.h b/include/linux/shrinker.h
index f4a5249f00b2..1d3899f37229 100644
--- a/include/linux/shrinker.h
+++ b/include/linux/shrinker.h
@@ -105,14 +105,6 @@ struct shrinker *shrinker_alloc(unsigned int flags, const char *fmt, ...);
 void shrinker_register(struct shrinker *shrinker);
 void shrinker_free(struct shrinker *shrinker);
 
-extern int __printf(2, 3) prealloc_shrinker(struct shrinker *shrinker,
-					    const char *fmt, ...);
-extern void register_shrinker_prepared(struct shrinker *shrinker);
-extern int __printf(2, 3) register_shrinker(struct shrinker *shrinker,
-					    const char *fmt, ...);
-extern void unregister_shrinker(struct shrinker *shrinker);
-extern void free_prealloced_shrinker(struct shrinker *shrinker);
-
 #ifdef CONFIG_SHRINKER_DEBUG
 extern int __printf(2, 3) shrinker_debugfs_rename(struct shrinker *shrinker,
 						  const char *fmt, ...);
diff --git a/mm/shrinker.c b/mm/shrinker.c
index d1032a4d5684..736fa67e8454 100644
--- a/mm/shrinker.c
+++ b/mm/shrinker.c
@@ -655,146 +655,3 @@ void shrinker_free(struct shrinker *shrinker)
 	kfree(shrinker);
 }
 EXPORT_SYMBOL_GPL(shrinker_free);
-
-/*
- * Add a shrinker callback to be called from the vm.
- */
-static int __prealloc_shrinker(struct shrinker *shrinker)
-{
-	unsigned int size;
-	int err;
-
-	if (shrinker->flags & SHRINKER_MEMCG_AWARE) {
-		err = prealloc_memcg_shrinker(shrinker);
-		if (err != -ENOSYS)
-			return err;
-
-		shrinker->flags &= ~SHRINKER_MEMCG_AWARE;
-	}
-
-	size = sizeof(*shrinker->nr_deferred);
-	if (shrinker->flags & SHRINKER_NUMA_AWARE)
-		size *= nr_node_ids;
-
-	shrinker->nr_deferred = kzalloc(size, GFP_KERNEL);
-	if (!shrinker->nr_deferred)
-		return -ENOMEM;
-
-	return 0;
-}
-
-#ifdef CONFIG_SHRINKER_DEBUG
-int prealloc_shrinker(struct shrinker *shrinker, const char *fmt, ...)
-{
-	va_list ap;
-	int err;
-
-	va_start(ap, fmt);
-	shrinker->name = kvasprintf_const(GFP_KERNEL, fmt, ap);
-	va_end(ap);
-	if (!shrinker->name)
-		return -ENOMEM;
-
-	err = __prealloc_shrinker(shrinker);
-	if (err) {
-		kfree_const(shrinker->name);
-		shrinker->name = NULL;
-	}
-
-	return err;
-}
-#else
-int prealloc_shrinker(struct shrinker *shrinker, const char *fmt, ...)
-{
-	return __prealloc_shrinker(shrinker);
-}
-#endif
-
-void free_prealloced_shrinker(struct shrinker *shrinker)
-{
-#ifdef CONFIG_SHRINKER_DEBUG
-	kfree_const(shrinker->name);
-	shrinker->name = NULL;
-#endif
-	if (shrinker->flags & SHRINKER_MEMCG_AWARE) {
-		down_write(&shrinker_rwsem);
-		unregister_memcg_shrinker(shrinker);
-		up_write(&shrinker_rwsem);
-		return;
-	}
-
-	kfree(shrinker->nr_deferred);
-	shrinker->nr_deferred = NULL;
-}
-
-void register_shrinker_prepared(struct shrinker *shrinker)
-{
-	down_write(&shrinker_rwsem);
-	list_add_tail(&shrinker->list, &shrinker_list);
-	shrinker->flags |= SHRINKER_REGISTERED;
-	shrinker_debugfs_add(shrinker);
-	up_write(&shrinker_rwsem);
-}
-
-static int __register_shrinker(struct shrinker *shrinker)
-{
-	int err = __prealloc_shrinker(shrinker);
-
-	if (err)
-		return err;
-	register_shrinker_prepared(shrinker);
-	return 0;
-}
-
-#ifdef CONFIG_SHRINKER_DEBUG
-int register_shrinker(struct shrinker *shrinker, const char *fmt, ...)
-{
-	va_list ap;
-	int err;
-
-	va_start(ap, fmt);
-	shrinker->name = kvasprintf_const(GFP_KERNEL, fmt, ap);
-	va_end(ap);
-	if (!shrinker->name)
-		return -ENOMEM;
-
-	err = __register_shrinker(shrinker);
-	if (err) {
-		kfree_const(shrinker->name);
-		shrinker->name = NULL;
-	}
-	return err;
-}
-#else
-int register_shrinker(struct shrinker *shrinker, const char *fmt, ...)
-{
-	return __register_shrinker(shrinker);
-}
-#endif
-EXPORT_SYMBOL(register_shrinker);
-
-/*
- * Remove one
- */
-void unregister_shrinker(struct shrinker *shrinker)
-{
-	struct dentry *debugfs_entry;
-	int debugfs_id;
-
-	if (!(shrinker->flags & SHRINKER_REGISTERED))
-		return;
-
-	down_write(&shrinker_rwsem);
-	list_del(&shrinker->list);
-	shrinker->flags &= ~SHRINKER_REGISTERED;
-	if (shrinker->flags & SHRINKER_MEMCG_AWARE)
-		unregister_memcg_shrinker(shrinker);
-	debugfs_entry = shrinker_debugfs_detach(shrinker, &debugfs_id);
-	up_write(&shrinker_rwsem);
-
-	shrinker_debugfs_remove(debugfs_entry, debugfs_id);
-
-	kfree(shrinker->nr_deferred);
-	shrinker->nr_deferred = NULL;
-}
-EXPORT_SYMBOL(unregister_shrinker);
-- 
cgit v1.2.3


From 307bececcd1205bcb67a3c0d53a69db237ccc9d4 Mon Sep 17 00:00:00 2001
From: Qi Zheng <zhengqi.arch@bytedance.com>
Date: Mon, 11 Sep 2023 17:44:39 +0800
Subject: mm: shrinker: add a secondary array for shrinker_info::{map,
 nr_deferred}

Currently, we maintain two linear arrays per node per memcg, which are
shrinker_info::map and shrinker_info::nr_deferred. And we need to resize
them when the shrinker_nr_max is exceeded, that is, allocate a new array,
and then copy the old array to the new array, and finally free the old
array by RCU.

For shrinker_info::map, we do set_bit() under the RCU lock, so we may set
the value into the old map which is about to be freed. This may cause the
value set to be lost. The current solution is not to copy the old map when
resizing, but to set all the corresponding bits in the new map to 1. This
solves the data loss problem, but bring the overhead of more pointless
loops while doing memcg slab shrink.

For shrinker_info::nr_deferred, we will only modify it under the read lock
of shrinker_rwsem, so it will not run concurrently with the resizing. But
after we make memcg slab shrink lockless, there will be the same data loss
problem as shrinker_info::map, and we can't work around it like the map.

For such resizable arrays, the most straightforward idea is to change it
to xarray, like we did for list_lru [1]. We need to do xa_store() in the
list_lru_add()-->set_shrinker_bit(), but this will cause memory
allocation, and the list_lru_add() doesn't accept failure. A possible
solution is to pre-allocate, but the location of pre-allocation is not
well determined (such as deferred_split_shrinker case).

Therefore, this commit chooses to introduce the following secondary array
for shrinker_info::{map, nr_deferred}:

+---------------+--------+--------+-----+
| shrinker_info | unit 0 | unit 1 | ... | (secondary array)
+---------------+--------+--------+-----+
                     |
                     v
                +---------------+-----+
                | nr_deferred[] | map | (leaf array)
                +---------------+-----+
                (shrinker_info_unit)

The leaf array is never freed unless the memcg is destroyed. The secondary
array will be resized every time the shrinker id exceeds shrinker_nr_max.
So the shrinker_info_unit can be indexed from both the old and the new
shrinker_info->unit[x]. Then even if we get the old secondary array under
the RCU lock, the found map and nr_deferred are also true, so the updated
nr_deferred and map will not be lost.

[1]. https://lore.kernel.org/all/20220228122126.37293-13-songmuchun@bytedance.com/

[zhengqi.arch@bytedance.com: unlock the &shrinker_rwsem before the call to free_shrinker_info()]
  Link: https://lkml.kernel.org/r/20230928141517.12164-1-zhengqi.arch@bytedance.com
Link: https://lkml.kernel.org/r/20230911094444.68966-41-zhengqi.arch@bytedance.com
Signed-off-by: Qi Zheng <zhengqi.arch@bytedance.com>
Reviewed-by: Muchun Song <songmuchun@bytedance.com>
Cc: Abhinav Kumar <quic_abhinavk@quicinc.com>
Cc: Alasdair Kergon <agk@redhat.com>
Cc: Alexander Viro <viro@zeniv.linux.org.uk>
Cc: Alyssa Rosenzweig <alyssa.rosenzweig@collabora.com>
Cc: Andreas Dilger <adilger.kernel@dilger.ca>
Cc: Andreas Gruenbacher <agruenba@redhat.com>
Cc: Anna Schumaker <anna@kernel.org>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Bob Peterson <rpeterso@redhat.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Carlos Llamas <cmllamas@google.com>
Cc: Chandan Babu R <chandan.babu@oracle.com>
Cc: Chao Yu <chao@kernel.org>
Cc: Chris Mason <clm@fb.com>
Cc: Christian Brauner <brauner@kernel.org>
Cc: Christian Koenig <christian.koenig@amd.com>
Cc: Chuck Lever <cel@kernel.org>
Cc: Coly Li <colyli@suse.de>
Cc: Dai Ngo <Dai.Ngo@oracle.com>
Cc: Daniel Vetter <daniel@ffwll.ch>
Cc: Daniel Vetter <daniel.vetter@ffwll.ch>
Cc: "Darrick J. Wong" <djwong@kernel.org>
Cc: Dave Chinner <david@fromorbit.com>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: David Airlie <airlied@gmail.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: David Sterba <dsterba@suse.com>
Cc: Dmitry Baryshkov <dmitry.baryshkov@linaro.org>
Cc: Gao Xiang <hsiangkao@linux.alibaba.com>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: Huang Rui <ray.huang@amd.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Jaegeuk Kim <jaegeuk@kernel.org>
Cc: Jani Nikula <jani.nikula@linux.intel.com>
Cc: Jan Kara <jack@suse.cz>
Cc: Jason Wang <jasowang@redhat.com>
Cc: Jeff Layton <jlayton@kernel.org>
Cc: Jeffle Xu <jefflexu@linux.alibaba.com>
Cc: Joel Fernandes (Google) <joel@joelfernandes.org>
Cc: Joonas Lahtinen <joonas.lahtinen@linux.intel.com>
Cc: Josef Bacik <josef@toxicpanda.com>
Cc: Juergen Gross <jgross@suse.com>
Cc: Kent Overstreet <kent.overstreet@gmail.com>
Cc: Kirill Tkhai <tkhai@ya.ru>
Cc: Marijn Suijten <marijn.suijten@somainline.org>
Cc: "Michael S. Tsirkin" <mst@redhat.com>
Cc: Mike Snitzer <snitzer@kernel.org>
Cc: Minchan Kim <minchan@kernel.org>
Cc: Muchun Song <muchun.song@linux.dev>
Cc: Nadav Amit <namit@vmware.com>
Cc: Neil Brown <neilb@suse.de>
Cc: Oleksandr Tyshchenko <oleksandr_tyshchenko@epam.com>
Cc: Olga Kornievskaia <kolga@netapp.com>
Cc: Paul E. McKenney <paulmck@kernel.org>
Cc: Richard Weinberger <richard@nod.at>
Cc: Rob Clark <robdclark@gmail.com>
Cc: Rob Herring <robh@kernel.org>
Cc: Rodrigo Vivi <rodrigo.vivi@intel.com>
Cc: Roman Gushchin <roman.gushchin@linux.dev>
Cc: Sean Paul <sean@poorly.run>
Cc: Sergey Senozhatsky <senozhatsky@chromium.org>
Cc: Song Liu <song@kernel.org>
Cc: Stefano Stabellini <sstabellini@kernel.org>
Cc: Steven Price <steven.price@arm.com>
Cc: "Theodore Ts'o" <tytso@mit.edu>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Tomeu Vizoso <tomeu.vizoso@collabora.com>
Cc: Tom Talpey <tom@talpey.com>
Cc: Trond Myklebust <trond.myklebust@hammerspace.com>
Cc: Tvrtko Ursulin <tvrtko.ursulin@linux.intel.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Xuan Zhuo <xuanzhuo@linux.alibaba.com>
Cc: Yue Hu <huyue2@coolpad.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/memcontrol.h |  12 +--
 include/linux/shrinker.h   |  17 +++
 mm/shrinker.c              | 250 ++++++++++++++++++++++++++++-----------------
 3 files changed, 172 insertions(+), 107 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index e4e24da16d2c..031102ac9311 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -21,6 +21,7 @@
 #include <linux/vmstat.h>
 #include <linux/writeback.h>
 #include <linux/page-flags.h>
+#include <linux/shrinker.h>
 
 struct mem_cgroup;
 struct obj_cgroup;
@@ -88,17 +89,6 @@ struct mem_cgroup_reclaim_iter {
 	unsigned int generation;
 };
 
-/*
- * Bitmap and deferred work of shrinker::id corresponding to memcg-aware
- * shrinkers, which have elements charged to this memcg.
- */
-struct shrinker_info {
-	struct rcu_head rcu;
-	atomic_long_t *nr_deferred;
-	unsigned long *map;
-	int map_nr_max;
-};
-
 struct lruvec_stats_percpu {
 	/* Local (CPU and cgroup) state */
 	long state[NR_VM_NODE_STAT_ITEMS];
diff --git a/include/linux/shrinker.h b/include/linux/shrinker.h
index 1d3899f37229..ba5ac82b5dbd 100644
--- a/include/linux/shrinker.h
+++ b/include/linux/shrinker.h
@@ -5,6 +5,23 @@
 #include <linux/atomic.h>
 #include <linux/types.h>
 
+#define SHRINKER_UNIT_BITS	BITS_PER_LONG
+
+/*
+ * Bitmap and deferred work of shrinker::id corresponding to memcg-aware
+ * shrinkers, which have elements charged to the memcg.
+ */
+struct shrinker_info_unit {
+	atomic_long_t nr_deferred[SHRINKER_UNIT_BITS];
+	DECLARE_BITMAP(map, SHRINKER_UNIT_BITS);
+};
+
+struct shrinker_info {
+	struct rcu_head rcu;
+	int map_nr_max;
+	struct shrinker_info_unit *unit[];
+};
+
 /*
  * This struct is used to pass information from page reclaim to the shrinkers.
  * We consolidate the values for easier extension later.
diff --git a/mm/shrinker.c b/mm/shrinker.c
index 736fa67e8454..e9644cda80b5 100644
--- a/mm/shrinker.c
+++ b/mm/shrinker.c
@@ -12,15 +12,50 @@ DECLARE_RWSEM(shrinker_rwsem);
 #ifdef CONFIG_MEMCG
 static int shrinker_nr_max;
 
-/* The shrinker_info is expanded in a batch of BITS_PER_LONG */
-static inline int shrinker_map_size(int nr_items)
+static inline int shrinker_unit_size(int nr_items)
 {
-	return (DIV_ROUND_UP(nr_items, BITS_PER_LONG) * sizeof(unsigned long));
+	return (DIV_ROUND_UP(nr_items, SHRINKER_UNIT_BITS) * sizeof(struct shrinker_info_unit *));
 }
 
-static inline int shrinker_defer_size(int nr_items)
+static inline void shrinker_unit_free(struct shrinker_info *info, int start)
 {
-	return (round_up(nr_items, BITS_PER_LONG) * sizeof(atomic_long_t));
+	struct shrinker_info_unit **unit;
+	int nr, i;
+
+	if (!info)
+		return;
+
+	unit = info->unit;
+	nr = DIV_ROUND_UP(info->map_nr_max, SHRINKER_UNIT_BITS);
+
+	for (i = start; i < nr; i++) {
+		if (!unit[i])
+			break;
+
+		kfree(unit[i]);
+		unit[i] = NULL;
+	}
+}
+
+static inline int shrinker_unit_alloc(struct shrinker_info *new,
+				       struct shrinker_info *old, int nid)
+{
+	struct shrinker_info_unit *unit;
+	int nr = DIV_ROUND_UP(new->map_nr_max, SHRINKER_UNIT_BITS);
+	int start = old ? DIV_ROUND_UP(old->map_nr_max, SHRINKER_UNIT_BITS) : 0;
+	int i;
+
+	for (i = start; i < nr; i++) {
+		unit = kzalloc_node(sizeof(*unit), GFP_KERNEL, nid);
+		if (!unit) {
+			shrinker_unit_free(new, start);
+			return -ENOMEM;
+		}
+
+		new->unit[i] = unit;
+	}
+
+	return 0;
 }
 
 void free_shrinker_info(struct mem_cgroup *memcg)
@@ -32,6 +67,7 @@ void free_shrinker_info(struct mem_cgroup *memcg)
 	for_each_node(nid) {
 		pn = memcg->nodeinfo[nid];
 		info = rcu_dereference_protected(pn->shrinker_info, true);
+		shrinker_unit_free(info, 0);
 		kvfree(info);
 		rcu_assign_pointer(pn->shrinker_info, NULL);
 	}
@@ -40,28 +76,28 @@ void free_shrinker_info(struct mem_cgroup *memcg)
 int alloc_shrinker_info(struct mem_cgroup *memcg)
 {
 	struct shrinker_info *info;
-	int nid, size, ret = 0;
-	int map_size, defer_size = 0;
+	int nid, ret = 0;
+	int array_size = 0;
 
 	down_write(&shrinker_rwsem);
-	map_size = shrinker_map_size(shrinker_nr_max);
-	defer_size = shrinker_defer_size(shrinker_nr_max);
-	size = map_size + defer_size;
+	array_size = shrinker_unit_size(shrinker_nr_max);
 	for_each_node(nid) {
-		info = kvzalloc_node(sizeof(*info) + size, GFP_KERNEL, nid);
-		if (!info) {
-			free_shrinker_info(memcg);
-			ret = -ENOMEM;
-			break;
-		}
-		info->nr_deferred = (atomic_long_t *)(info + 1);
-		info->map = (void *)info->nr_deferred + defer_size;
+		info = kvzalloc_node(sizeof(*info) + array_size, GFP_KERNEL, nid);
+		if (!info)
+			goto err;
 		info->map_nr_max = shrinker_nr_max;
+		if (shrinker_unit_alloc(info, NULL, nid))
+			goto err;
 		rcu_assign_pointer(memcg->nodeinfo[nid]->shrinker_info, info);
 	}
 	up_write(&shrinker_rwsem);
 
 	return ret;
+
+err:
+	up_write(&shrinker_rwsem);
+	free_shrinker_info(memcg);
+	return -ENOMEM;
 }
 
 static struct shrinker_info *shrinker_info_protected(struct mem_cgroup *memcg,
@@ -71,15 +107,12 @@ static struct shrinker_info *shrinker_info_protected(struct mem_cgroup *memcg,
 					 lockdep_is_held(&shrinker_rwsem));
 }
 
-static int expand_one_shrinker_info(struct mem_cgroup *memcg,
-				    int map_size, int defer_size,
-				    int old_map_size, int old_defer_size,
-				    int new_nr_max)
+static int expand_one_shrinker_info(struct mem_cgroup *memcg, int new_size,
+				    int old_size, int new_nr_max)
 {
 	struct shrinker_info *new, *old;
 	struct mem_cgroup_per_node *pn;
 	int nid;
-	int size = map_size + defer_size;
 
 	for_each_node(nid) {
 		pn = memcg->nodeinfo[nid];
@@ -92,21 +125,17 @@ static int expand_one_shrinker_info(struct mem_cgroup *memcg,
 		if (new_nr_max <= old->map_nr_max)
 			continue;
 
-		new = kvmalloc_node(sizeof(*new) + size, GFP_KERNEL, nid);
+		new = kvmalloc_node(sizeof(*new) + new_size, GFP_KERNEL, nid);
 		if (!new)
 			return -ENOMEM;
 
-		new->nr_deferred = (atomic_long_t *)(new + 1);
-		new->map = (void *)new->nr_deferred + defer_size;
 		new->map_nr_max = new_nr_max;
 
-		/* map: set all old bits, clear all new bits */
-		memset(new->map, (int)0xff, old_map_size);
-		memset((void *)new->map + old_map_size, 0, map_size - old_map_size);
-		/* nr_deferred: copy old values, clear all new values */
-		memcpy(new->nr_deferred, old->nr_deferred, old_defer_size);
-		memset((void *)new->nr_deferred + old_defer_size, 0,
-		       defer_size - old_defer_size);
+		memcpy(new->unit, old->unit, old_size);
+		if (shrinker_unit_alloc(new, old, nid)) {
+			kvfree(new);
+			return -ENOMEM;
+		}
 
 		rcu_assign_pointer(pn->shrinker_info, new);
 		kvfree_rcu(old, rcu);
@@ -118,9 +147,8 @@ static int expand_one_shrinker_info(struct mem_cgroup *memcg,
 static int expand_shrinker_info(int new_id)
 {
 	int ret = 0;
-	int new_nr_max = round_up(new_id + 1, BITS_PER_LONG);
-	int map_size, defer_size = 0;
-	int old_map_size, old_defer_size = 0;
+	int new_nr_max = round_up(new_id + 1, SHRINKER_UNIT_BITS);
+	int new_size, old_size = 0;
 	struct mem_cgroup *memcg;
 
 	if (!root_mem_cgroup)
@@ -128,15 +156,12 @@ static int expand_shrinker_info(int new_id)
 
 	lockdep_assert_held(&shrinker_rwsem);
 
-	map_size = shrinker_map_size(new_nr_max);
-	defer_size = shrinker_defer_size(new_nr_max);
-	old_map_size = shrinker_map_size(shrinker_nr_max);
-	old_defer_size = shrinker_defer_size(shrinker_nr_max);
+	new_size = shrinker_unit_size(new_nr_max);
+	old_size = shrinker_unit_size(shrinker_nr_max);
 
 	memcg = mem_cgroup_iter(NULL, NULL, NULL);
 	do {
-		ret = expand_one_shrinker_info(memcg, map_size, defer_size,
-					       old_map_size, old_defer_size,
+		ret = expand_one_shrinker_info(memcg, new_size, old_size,
 					       new_nr_max);
 		if (ret) {
 			mem_cgroup_iter_break(NULL, memcg);
@@ -150,17 +175,34 @@ out:
 	return ret;
 }
 
+static inline int shrinker_id_to_index(int shrinker_id)
+{
+	return shrinker_id / SHRINKER_UNIT_BITS;
+}
+
+static inline int shrinker_id_to_offset(int shrinker_id)
+{
+	return shrinker_id % SHRINKER_UNIT_BITS;
+}
+
+static inline int calc_shrinker_id(int index, int offset)
+{
+	return index * SHRINKER_UNIT_BITS + offset;
+}
+
 void set_shrinker_bit(struct mem_cgroup *memcg, int nid, int shrinker_id)
 {
 	if (shrinker_id >= 0 && memcg && !mem_cgroup_is_root(memcg)) {
 		struct shrinker_info *info;
+		struct shrinker_info_unit *unit;
 
 		rcu_read_lock();
 		info = rcu_dereference(memcg->nodeinfo[nid]->shrinker_info);
+		unit = info->unit[shrinker_id_to_index(shrinker_id)];
 		if (!WARN_ON_ONCE(shrinker_id >= info->map_nr_max)) {
 			/* Pairs with smp mb in shrink_slab() */
 			smp_mb__before_atomic();
-			set_bit(shrinker_id, info->map);
+			set_bit(shrinker_id_to_offset(shrinker_id), unit->map);
 		}
 		rcu_read_unlock();
 	}
@@ -209,26 +251,31 @@ static long xchg_nr_deferred_memcg(int nid, struct shrinker *shrinker,
 				   struct mem_cgroup *memcg)
 {
 	struct shrinker_info *info;
+	struct shrinker_info_unit *unit;
 
 	info = shrinker_info_protected(memcg, nid);
-	return atomic_long_xchg(&info->nr_deferred[shrinker->id], 0);
+	unit = info->unit[shrinker_id_to_index(shrinker->id)];
+	return atomic_long_xchg(&unit->nr_deferred[shrinker_id_to_offset(shrinker->id)], 0);
 }
 
 static long add_nr_deferred_memcg(long nr, int nid, struct shrinker *shrinker,
 				  struct mem_cgroup *memcg)
 {
 	struct shrinker_info *info;
+	struct shrinker_info_unit *unit;
 
 	info = shrinker_info_protected(memcg, nid);
-	return atomic_long_add_return(nr, &info->nr_deferred[shrinker->id]);
+	unit = info->unit[shrinker_id_to_index(shrinker->id)];
+	return atomic_long_add_return(nr, &unit->nr_deferred[shrinker_id_to_offset(shrinker->id)]);
 }
 
 void reparent_shrinker_deferred(struct mem_cgroup *memcg)
 {
-	int i, nid;
+	int nid, index, offset;
 	long nr;
 	struct mem_cgroup *parent;
 	struct shrinker_info *child_info, *parent_info;
+	struct shrinker_info_unit *child_unit, *parent_unit;
 
 	parent = parent_mem_cgroup(memcg);
 	if (!parent)
@@ -239,9 +286,13 @@ void reparent_shrinker_deferred(struct mem_cgroup *memcg)
 	for_each_node(nid) {
 		child_info = shrinker_info_protected(memcg, nid);
 		parent_info = shrinker_info_protected(parent, nid);
-		for (i = 0; i < child_info->map_nr_max; i++) {
-			nr = atomic_long_read(&child_info->nr_deferred[i]);
-			atomic_long_add(nr, &parent_info->nr_deferred[i]);
+		for (index = 0; index < shrinker_id_to_index(child_info->map_nr_max); index++) {
+			child_unit = child_info->unit[index];
+			parent_unit = parent_info->unit[index];
+			for (offset = 0; offset < SHRINKER_UNIT_BITS; offset++) {
+				nr = atomic_long_read(&child_unit->nr_deferred[offset]);
+				atomic_long_add(nr, &parent_unit->nr_deferred[offset]);
+			}
 		}
 	}
 	up_read(&shrinker_rwsem);
@@ -407,7 +458,7 @@ static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid,
 {
 	struct shrinker_info *info;
 	unsigned long ret, freed = 0;
-	int i;
+	int offset, index = 0;
 
 	if (!mem_cgroup_online(memcg))
 		return 0;
@@ -419,56 +470,63 @@ static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid,
 	if (unlikely(!info))
 		goto unlock;
 
-	for_each_set_bit(i, info->map, info->map_nr_max) {
-		struct shrink_control sc = {
-			.gfp_mask = gfp_mask,
-			.nid = nid,
-			.memcg = memcg,
-		};
-		struct shrinker *shrinker;
+	for (; index < shrinker_id_to_index(info->map_nr_max); index++) {
+		struct shrinker_info_unit *unit;
 
-		shrinker = idr_find(&shrinker_idr, i);
-		if (unlikely(!shrinker || !(shrinker->flags & SHRINKER_REGISTERED))) {
-			if (!shrinker)
-				clear_bit(i, info->map);
-			continue;
-		}
+		unit = info->unit[index];
 
-		/* Call non-slab shrinkers even though kmem is disabled */
-		if (!memcg_kmem_online() &&
-		    !(shrinker->flags & SHRINKER_NONSLAB))
-			continue;
+		for_each_set_bit(offset, unit->map, SHRINKER_UNIT_BITS) {
+			struct shrink_control sc = {
+				.gfp_mask = gfp_mask,
+				.nid = nid,
+				.memcg = memcg,
+			};
+			struct shrinker *shrinker;
+			int shrinker_id = calc_shrinker_id(index, offset);
 
-		ret = do_shrink_slab(&sc, shrinker, priority);
-		if (ret == SHRINK_EMPTY) {
-			clear_bit(i, info->map);
-			/*
-			 * After the shrinker reported that it had no objects to
-			 * free, but before we cleared the corresponding bit in
-			 * the memcg shrinker map, a new object might have been
-			 * added. To make sure, we have the bit set in this
-			 * case, we invoke the shrinker one more time and reset
-			 * the bit if it reports that it is not empty anymore.
-			 * The memory barrier here pairs with the barrier in
-			 * set_shrinker_bit():
-			 *
-			 * list_lru_add()     shrink_slab_memcg()
-			 *   list_add_tail()    clear_bit()
-			 *   <MB>               <MB>
-			 *   set_bit()          do_shrink_slab()
-			 */
-			smp_mb__after_atomic();
-			ret = do_shrink_slab(&sc, shrinker, priority);
-			if (ret == SHRINK_EMPTY)
-				ret = 0;
-			else
-				set_shrinker_bit(memcg, nid, i);
-		}
-		freed += ret;
+			shrinker = idr_find(&shrinker_idr, shrinker_id);
+			if (unlikely(!shrinker || !(shrinker->flags & SHRINKER_REGISTERED))) {
+				if (!shrinker)
+					clear_bit(offset, unit->map);
+				continue;
+			}
 
-		if (rwsem_is_contended(&shrinker_rwsem)) {
-			freed = freed ? : 1;
-			break;
+			/* Call non-slab shrinkers even though kmem is disabled */
+			if (!memcg_kmem_online() &&
+			    !(shrinker->flags & SHRINKER_NONSLAB))
+				continue;
+
+			ret = do_shrink_slab(&sc, shrinker, priority);
+			if (ret == SHRINK_EMPTY) {
+				clear_bit(offset, unit->map);
+				/*
+				 * After the shrinker reported that it had no objects to
+				 * free, but before we cleared the corresponding bit in
+				 * the memcg shrinker map, a new object might have been
+				 * added. To make sure, we have the bit set in this
+				 * case, we invoke the shrinker one more time and reset
+				 * the bit if it reports that it is not empty anymore.
+				 * The memory barrier here pairs with the barrier in
+				 * set_shrinker_bit():
+				 *
+				 * list_lru_add()     shrink_slab_memcg()
+				 *   list_add_tail()    clear_bit()
+				 *   <MB>               <MB>
+				 *   set_bit()          do_shrink_slab()
+				 */
+				smp_mb__after_atomic();
+				ret = do_shrink_slab(&sc, shrinker, priority);
+				if (ret == SHRINK_EMPTY)
+					ret = 0;
+				else
+					set_shrinker_bit(memcg, nid, shrinker_id);
+			}
+			freed += ret;
+
+			if (rwsem_is_contended(&shrinker_rwsem)) {
+				freed = freed ? : 1;
+				goto unlock;
+			}
 		}
 	}
 unlock:
-- 
cgit v1.2.3


From ca1d36b823944f24b5755311e95883fb5fdb807b Mon Sep 17 00:00:00 2001
From: Qi Zheng <zhengqi.arch@bytedance.com>
Date: Mon, 11 Sep 2023 17:44:41 +0800
Subject: mm: shrinker: make global slab shrink lockless

The shrinker_rwsem is a global read-write lock in shrinkers subsystem,
which protects most operations such as slab shrink, registration and
unregistration of shrinkers, etc. This can easily cause problems in the
following cases.

1) When the memory pressure is high and there are many filesystems
   mounted or unmounted at the same time, slab shrink will be affected
   (down_read_trylock() failed).

   Such as the real workload mentioned by Kirill Tkhai:

   ```
   One of the real workloads from my experience is start
   of an overcommitted node containing many starting
   containers after node crash (or many resuming containers
   after reboot for kernel update). In these cases memory
   pressure is huge, and the node goes round in long reclaim.
   ```

2) If a shrinker is blocked (such as the case mentioned
   in [1]) and a writer comes in (such as mount a fs),
   then this writer will be blocked and cause all
   subsequent shrinker-related operations to be blocked.

Even if there is no competitor when shrinking slab, there may still be a
problem. The down_read_trylock() may become a perf hotspot with frequent
calls to shrink_slab(). Because of the poor multicore scalability of
atomic operations, this can lead to a significant drop in IPC
(instructions per cycle).

We used to implement the lockless slab shrink with SRCU [2], but then
kernel test robot reported -88.8% regression in
stress-ng.ramfs.ops_per_sec test case [3], so we reverted it [4].

This commit uses the refcount+RCU method [5] proposed by Dave Chinner
to re-implement the lockless global slab shrink. The memcg slab shrink is
handled in the subsequent patch.

For now, all shrinker instances are converted to dynamically allocated and
will be freed by call_rcu(). So we can use rcu_read_{lock,unlock}() to
ensure that the shrinker instance is valid.

And the shrinker instance will not be run again after unregistration. So
the structure that records the pointer of shrinker instance can be safely
freed without waiting for the RCU read-side critical section.

In this way, while we implement the lockless slab shrink, we don't need to
be blocked in unregister_shrinker().

The following are the test results:

stress-ng --timeout 60 --times --verify --metrics-brief --ramfs 9 &

1) Before applying this patchset:

setting to a 60 second run per stressor
dispatching hogs: 9 ramfs
stressor       bogo ops real time  usr time  sys time   bogo ops/s     bogo ops/s
                          (secs)    (secs)    (secs)   (real time) (usr+sys time)
ramfs            473062     60.00      8.00    279.13      7884.12        1647.59
for a 60.01s run time:
   1440.34s available CPU time
      7.99s user time   (  0.55%)
    279.13s system time ( 19.38%)
    287.12s total time  ( 19.93%)
load average: 7.12 2.99 1.15
successful run completed in 60.01s (1 min, 0.01 secs)

2) After applying this patchset:

setting to a 60 second run per stressor
dispatching hogs: 9 ramfs
stressor       bogo ops real time  usr time  sys time   bogo ops/s     bogo ops/s
                          (secs)    (secs)    (secs)   (real time) (usr+sys time)
ramfs            477165     60.00      8.13    281.34      7952.55        1648.40
for a 60.01s run time:
   1440.33s available CPU time
      8.12s user time   (  0.56%)
    281.34s system time ( 19.53%)
    289.46s total time  ( 20.10%)
load average: 6.98 3.03 1.19
successful run completed in 60.01s (1 min, 0.01 secs)

We can see that the ops/s has hardly changed.

[1]. https://lore.kernel.org/lkml/20191129214541.3110-1-ptikhomirov@virtuozzo.com/
[2]. https://lore.kernel.org/lkml/20230313112819.38938-1-zhengqi.arch@bytedance.com/
[3]. https://lore.kernel.org/lkml/202305230837.db2c233f-yujie.liu@intel.com/
[4]. https://lore.kernel.org/all/20230609081518.3039120-1-qi.zheng@linux.dev/
[5]. https://lore.kernel.org/lkml/ZIJhou1d55d4H1s0@dread.disaster.area/

Link: https://lkml.kernel.org/r/20230911094444.68966-43-zhengqi.arch@bytedance.com
Signed-off-by: Qi Zheng <zhengqi.arch@bytedance.com>
Cc: Abhinav Kumar <quic_abhinavk@quicinc.com>
Cc: Alasdair Kergon <agk@redhat.com>
Cc: Alexander Viro <viro@zeniv.linux.org.uk>
Cc: Alyssa Rosenzweig <alyssa.rosenzweig@collabora.com>
Cc: Andreas Dilger <adilger.kernel@dilger.ca>
Cc: Andreas Gruenbacher <agruenba@redhat.com>
Cc: Anna Schumaker <anna@kernel.org>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Bob Peterson <rpeterso@redhat.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Carlos Llamas <cmllamas@google.com>
Cc: Chandan Babu R <chandan.babu@oracle.com>
Cc: Chao Yu <chao@kernel.org>
Cc: Chris Mason <clm@fb.com>
Cc: Christian Brauner <brauner@kernel.org>
Cc: Christian Koenig <christian.koenig@amd.com>
Cc: Chuck Lever <cel@kernel.org>
Cc: Coly Li <colyli@suse.de>
Cc: Dai Ngo <Dai.Ngo@oracle.com>
Cc: Daniel Vetter <daniel@ffwll.ch>
Cc: Daniel Vetter <daniel.vetter@ffwll.ch>
Cc: "Darrick J. Wong" <djwong@kernel.org>
Cc: Dave Chinner <david@fromorbit.com>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: David Airlie <airlied@gmail.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: David Sterba <dsterba@suse.com>
Cc: Dmitry Baryshkov <dmitry.baryshkov@linaro.org>
Cc: Gao Xiang <hsiangkao@linux.alibaba.com>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: Huang Rui <ray.huang@amd.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Jaegeuk Kim <jaegeuk@kernel.org>
Cc: Jani Nikula <jani.nikula@linux.intel.com>
Cc: Jan Kara <jack@suse.cz>
Cc: Jason Wang <jasowang@redhat.com>
Cc: Jeff Layton <jlayton@kernel.org>
Cc: Jeffle Xu <jefflexu@linux.alibaba.com>
Cc: Joel Fernandes (Google) <joel@joelfernandes.org>
Cc: Joonas Lahtinen <joonas.lahtinen@linux.intel.com>
Cc: Josef Bacik <josef@toxicpanda.com>
Cc: Juergen Gross <jgross@suse.com>
Cc: Kent Overstreet <kent.overstreet@gmail.com>
Cc: Kirill Tkhai <tkhai@ya.ru>
Cc: Marijn Suijten <marijn.suijten@somainline.org>
Cc: "Michael S. Tsirkin" <mst@redhat.com>
Cc: Mike Snitzer <snitzer@kernel.org>
Cc: Minchan Kim <minchan@kernel.org>
Cc: Muchun Song <muchun.song@linux.dev>
Cc: Muchun Song <songmuchun@bytedance.com>
Cc: Nadav Amit <namit@vmware.com>
Cc: Neil Brown <neilb@suse.de>
Cc: Oleksandr Tyshchenko <oleksandr_tyshchenko@epam.com>
Cc: Olga Kornievskaia <kolga@netapp.com>
Cc: Paul E. McKenney <paulmck@kernel.org>
Cc: Richard Weinberger <richard@nod.at>
Cc: Rob Clark <robdclark@gmail.com>
Cc: Rob Herring <robh@kernel.org>
Cc: Rodrigo Vivi <rodrigo.vivi@intel.com>
Cc: Roman Gushchin <roman.gushchin@linux.dev>
Cc: Sean Paul <sean@poorly.run>
Cc: Sergey Senozhatsky <senozhatsky@chromium.org>
Cc: Song Liu <song@kernel.org>
Cc: Stefano Stabellini <sstabellini@kernel.org>
Cc: Steven Price <steven.price@arm.com>
Cc: "Theodore Ts'o" <tytso@mit.edu>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Tomeu Vizoso <tomeu.vizoso@collabora.com>
Cc: Tom Talpey <tom@talpey.com>
Cc: Trond Myklebust <trond.myklebust@hammerspace.com>
Cc: Tvrtko Ursulin <tvrtko.ursulin@linux.intel.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Xuan Zhuo <xuanzhuo@linux.alibaba.com>
Cc: Yue Hu <huyue2@coolpad.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/shrinker.h | 24 +++++++++++++
 mm/shrinker.c            | 89 ++++++++++++++++++++++++++++++++++++------------
 2 files changed, 92 insertions(+), 21 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/shrinker.h b/include/linux/shrinker.h
index ba5ac82b5dbd..e4f93120e0ab 100644
--- a/include/linux/shrinker.h
+++ b/include/linux/shrinker.h
@@ -4,6 +4,8 @@
 
 #include <linux/atomic.h>
 #include <linux/types.h>
+#include <linux/refcount.h>
+#include <linux/completion.h>
 
 #define SHRINKER_UNIT_BITS	BITS_PER_LONG
 
@@ -87,6 +89,17 @@ struct shrinker {
 	int seeks;	/* seeks to recreate an obj */
 	unsigned flags;
 
+	/*
+	 * The reference count of this shrinker. Registered shrinker have an
+	 * initial refcount of 1, then the lookup operations are now allowed
+	 * to use it via shrinker_try_get(). Later in the unregistration step,
+	 * the initial refcount will be discarded, and will free the shrinker
+	 * asynchronously via RCU after its refcount reaches 0.
+	 */
+	refcount_t refcount;
+	struct completion done;	/* use to wait for refcount to reach 0 */
+	struct rcu_head rcu;
+
 	void *private_data;
 
 	/* These are for internal use */
@@ -122,6 +135,17 @@ struct shrinker *shrinker_alloc(unsigned int flags, const char *fmt, ...);
 void shrinker_register(struct shrinker *shrinker);
 void shrinker_free(struct shrinker *shrinker);
 
+static inline bool shrinker_try_get(struct shrinker *shrinker)
+{
+	return refcount_inc_not_zero(&shrinker->refcount);
+}
+
+static inline void shrinker_put(struct shrinker *shrinker)
+{
+	if (refcount_dec_and_test(&shrinker->refcount))
+		complete(&shrinker->done);
+}
+
 #ifdef CONFIG_SHRINKER_DEBUG
 extern int __printf(2, 3) shrinker_debugfs_rename(struct shrinker *shrinker,
 						  const char *fmt, ...);
diff --git a/mm/shrinker.c b/mm/shrinker.c
index e2e832b15803..9bb2e61092b3 100644
--- a/mm/shrinker.c
+++ b/mm/shrinker.c
@@ -2,6 +2,7 @@
 #include <linux/memcontrol.h>
 #include <linux/rwsem.h>
 #include <linux/shrinker.h>
+#include <linux/rculist.h>
 #include <trace/events/vmscan.h>
 
 #include "internal.h"
@@ -577,33 +578,50 @@ unsigned long shrink_slab(gfp_t gfp_mask, int nid, struct mem_cgroup *memcg,
 	if (!mem_cgroup_disabled() && !mem_cgroup_is_root(memcg))
 		return shrink_slab_memcg(gfp_mask, nid, memcg, priority);
 
-	if (!down_read_trylock(&shrinker_rwsem))
-		goto out;
-
-	list_for_each_entry(shrinker, &shrinker_list, list) {
+	/*
+	 * lockless algorithm of global shrink.
+	 *
+	 * In the unregistration setp, the shrinker will be freed asynchronously
+	 * via RCU after its refcount reaches 0. So both rcu_read_lock() and
+	 * shrinker_try_get() can be used to ensure the existence of the shrinker.
+	 *
+	 * So in the global shrink:
+	 *  step 1: use rcu_read_lock() to guarantee existence of the shrinker
+	 *          and the validity of the shrinker_list walk.
+	 *  step 2: use shrinker_try_get() to try get the refcount, if successful,
+	 *          then the existence of the shrinker can also be guaranteed,
+	 *          so we can release the RCU lock to do do_shrink_slab() that
+	 *          may sleep.
+	 *  step 3: *MUST* to reacquire the RCU lock before calling shrinker_put(),
+	 *          which ensures that neither this shrinker nor the next shrinker
+	 *          will be freed in the next traversal operation.
+	 *  step 4: do shrinker_put() paired with step 2 to put the refcount,
+	 *          if the refcount reaches 0, then wake up the waiter in
+	 *          shrinker_free() by calling complete().
+	 */
+	rcu_read_lock();
+	list_for_each_entry_rcu(shrinker, &shrinker_list, list) {
 		struct shrink_control sc = {
 			.gfp_mask = gfp_mask,
 			.nid = nid,
 			.memcg = memcg,
 		};
 
+		if (!shrinker_try_get(shrinker))
+			continue;
+
+		rcu_read_unlock();
+
 		ret = do_shrink_slab(&sc, shrinker, priority);
 		if (ret == SHRINK_EMPTY)
 			ret = 0;
 		freed += ret;
-		/*
-		 * Bail out if someone want to register a new shrinker to
-		 * prevent the registration from being stalled for long periods
-		 * by parallel ongoing shrinking.
-		 */
-		if (rwsem_is_contended(&shrinker_rwsem)) {
-			freed = freed ? : 1;
-			break;
-		}
+
+		rcu_read_lock();
+		shrinker_put(shrinker);
 	}
 
-	up_read(&shrinker_rwsem);
-out:
+	rcu_read_unlock();
 	cond_resched();
 	return freed;
 }
@@ -676,13 +694,29 @@ void shrinker_register(struct shrinker *shrinker)
 	}
 
 	down_write(&shrinker_rwsem);
-	list_add_tail(&shrinker->list, &shrinker_list);
+	list_add_tail_rcu(&shrinker->list, &shrinker_list);
 	shrinker->flags |= SHRINKER_REGISTERED;
 	shrinker_debugfs_add(shrinker);
 	up_write(&shrinker_rwsem);
+
+	init_completion(&shrinker->done);
+	/*
+	 * Now the shrinker is fully set up, take the first reference to it to
+	 * indicate that lookup operations are now allowed to use it via
+	 * shrinker_try_get().
+	 */
+	refcount_set(&shrinker->refcount, 1);
 }
 EXPORT_SYMBOL_GPL(shrinker_register);
 
+static void shrinker_free_rcu_cb(struct rcu_head *head)
+{
+	struct shrinker *shrinker = container_of(head, struct shrinker, rcu);
+
+	kfree(shrinker->nr_deferred);
+	kfree(shrinker);
+}
+
 void shrinker_free(struct shrinker *shrinker)
 {
 	struct dentry *debugfs_entry = NULL;
@@ -691,9 +725,25 @@ void shrinker_free(struct shrinker *shrinker)
 	if (!shrinker)
 		return;
 
+	if (shrinker->flags & SHRINKER_REGISTERED) {
+		/* drop the initial refcount */
+		shrinker_put(shrinker);
+		/*
+		 * Wait for all lookups of the shrinker to complete, after that,
+		 * no shrinker is running or will run again, then we can safely
+		 * free it asynchronously via RCU and safely free the structure
+		 * where the shrinker is located, such as super_block etc.
+		 */
+		wait_for_completion(&shrinker->done);
+	}
+
 	down_write(&shrinker_rwsem);
 	if (shrinker->flags & SHRINKER_REGISTERED) {
-		list_del(&shrinker->list);
+		/*
+		 * Now we can safely remove it from the shrinker_list and then
+		 * free it.
+		 */
+		list_del_rcu(&shrinker->list);
 		debugfs_entry = shrinker_debugfs_detach(shrinker, &debugfs_id);
 		shrinker->flags &= ~SHRINKER_REGISTERED;
 	}
@@ -707,9 +757,6 @@ void shrinker_free(struct shrinker *shrinker)
 	if (debugfs_entry)
 		shrinker_debugfs_remove(debugfs_entry, debugfs_id);
 
-	kfree(shrinker->nr_deferred);
-	shrinker->nr_deferred = NULL;
-
-	kfree(shrinker);
+	call_rcu(&shrinker->rcu, shrinker_free_rcu_cb);
 }
 EXPORT_SYMBOL_GPL(shrinker_free);
-- 
cgit v1.2.3


From 09c550508a4b8f7844b197cc16877dd0f7c42d8f Mon Sep 17 00:00:00 2001
From: David Hildenbrand <david@redhat.com>
Date: Wed, 13 Sep 2023 14:51:13 +0200
Subject: mm/rmap: pass folio to hugepage_add_anon_rmap()

Let's pass a folio; we are always mapping the entire thing.

Link: https://lkml.kernel.org/r/20230913125113.313322-7-david@redhat.com
Signed-off-by: David Hildenbrand <david@redhat.com>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: Mike Kravetz <mike.kravetz@oracle.com>
Cc: Muchun Song <muchun.song@linux.dev>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/rmap.h | 2 +-
 mm/migrate.c         | 2 +-
 mm/rmap.c            | 8 +++-----
 3 files changed, 5 insertions(+), 7 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/rmap.h b/include/linux/rmap.h
index 51cc21ebb568..d22f4d21a11c 100644
--- a/include/linux/rmap.h
+++ b/include/linux/rmap.h
@@ -203,7 +203,7 @@ void folio_add_file_rmap_range(struct folio *, struct page *, unsigned int nr,
 void page_remove_rmap(struct page *, struct vm_area_struct *,
 		bool compound);
 
-void hugepage_add_anon_rmap(struct page *, struct vm_area_struct *,
+void hugepage_add_anon_rmap(struct folio *, struct vm_area_struct *,
 		unsigned long address, rmap_t flags);
 void hugepage_add_new_anon_rmap(struct folio *, struct vm_area_struct *,
 		unsigned long address);
diff --git a/mm/migrate.c b/mm/migrate.c
index 2053b54556ca..eb6bc4053bc4 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -249,7 +249,7 @@ static bool remove_migration_pte(struct folio *folio,
 
 			pte = arch_make_huge_pte(pte, shift, vma->vm_flags);
 			if (folio_test_anon(folio))
-				hugepage_add_anon_rmap(new, vma, pvmw.address,
+				hugepage_add_anon_rmap(folio, vma, pvmw.address,
 						       rmap_flags);
 			else
 				page_dup_file_rmap(new, true);
diff --git a/mm/rmap.c b/mm/rmap.c
index ed4b602bcbd5..d24e2c36372e 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -2542,18 +2542,16 @@ void rmap_walk_locked(struct folio *folio, struct rmap_walk_control *rwc)
  *
  * RMAP_COMPOUND is ignored.
  */
-void hugepage_add_anon_rmap(struct page *page, struct vm_area_struct *vma,
+void hugepage_add_anon_rmap(struct folio *folio, struct vm_area_struct *vma,
 			    unsigned long address, rmap_t flags)
 {
-	struct folio *folio = page_folio(page);
-
 	VM_WARN_ON_FOLIO(!folio_test_anon(folio), folio);
 
 	atomic_inc(&folio->_entire_mapcount);
 	if (flags & RMAP_EXCLUSIVE)
-		SetPageAnonExclusive(page);
+		SetPageAnonExclusive(&folio->page);
 	VM_WARN_ON_FOLIO(folio_entire_mapcount(folio) > 1 &&
-			 PageAnonExclusive(page), folio);
+			 PageAnonExclusive(&folio->page), folio);
 }
 
 void hugepage_add_new_anon_rmap(struct folio *folio,
-- 
cgit v1.2.3


From 73eab3ca481e5be0f1fd8140365d604482f84ee1 Mon Sep 17 00:00:00 2001
From: Kefeng Wang <wangkefeng.wang@huawei.com>
Date: Wed, 13 Sep 2023 17:51:27 +0800
Subject: mm: migrate: convert migrate_misplaced_page() to
 migrate_misplaced_folio()

At present, numa balance only support base page and PMD-mapped THP, but we
will expand to support to migrate large folio/pte-mapped THP in the
future, it is better to make migrate_misplaced_page() to take a folio
instead of a page, and rename it to migrate_misplaced_folio(), it is a
preparation, also this remove several compound_head() calls.

Link: https://lkml.kernel.org/r/20230913095131.2426871-5-wangkefeng.wang@huawei.com
Signed-off-by: Kefeng Wang <wangkefeng.wang@huawei.com>
Reviewed-by: Zi Yan <ziy@nvidia.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: "Huang, Ying" <ying.huang@intel.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Mike Kravetz <mike.kravetz@oracle.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/migrate.h |  4 ++--
 mm/huge_memory.c        |  2 +-
 mm/memory.c             |  2 +-
 mm/migrate.c            | 39 +++++++++++++++++++++------------------
 4 files changed, 25 insertions(+), 22 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/migrate.h b/include/linux/migrate.h
index 711dd9412561..2ce13e8a309b 100644
--- a/include/linux/migrate.h
+++ b/include/linux/migrate.h
@@ -142,10 +142,10 @@ const struct movable_operations *page_movable_ops(struct page *page)
 }
 
 #ifdef CONFIG_NUMA_BALANCING
-int migrate_misplaced_page(struct page *page, struct vm_area_struct *vma,
+int migrate_misplaced_folio(struct folio *folio, struct vm_area_struct *vma,
 			   int node);
 #else
-static inline int migrate_misplaced_page(struct page *page,
+static inline int migrate_misplaced_folio(struct folio *folio,
 					 struct vm_area_struct *vma, int node)
 {
 	return -EAGAIN; /* can't migrate now */
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 1cfd83e91748..5c20e43782e4 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -1567,7 +1567,7 @@ vm_fault_t do_huge_pmd_numa_page(struct vm_fault *vmf)
 	spin_unlock(vmf->ptl);
 	writable = false;
 
-	migrated = migrate_misplaced_page(page, vma, target_nid);
+	migrated = migrate_misplaced_folio(page_folio(page), vma, target_nid);
 	if (migrated) {
 		flags |= TNF_MIGRATED;
 		page_nid = target_nid;
diff --git a/mm/memory.c b/mm/memory.c
index 0739ccb00e61..d956b231e835 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -4812,7 +4812,7 @@ static vm_fault_t do_numa_page(struct vm_fault *vmf)
 	writable = false;
 
 	/* Migrate to the requested node */
-	if (migrate_misplaced_page(page, vma, target_nid)) {
+	if (migrate_misplaced_folio(page_folio(page), vma, target_nid)) {
 		page_nid = target_nid;
 		flags |= TNF_MIGRATED;
 	} else {
diff --git a/mm/migrate.c b/mm/migrate.c
index 1f1aebe8da18..1b848f6b5fbc 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -2516,55 +2516,58 @@ static int numamigrate_isolate_folio(pg_data_t *pgdat, struct folio *folio)
 }
 
 /*
- * Attempt to migrate a misplaced page to the specified destination
+ * Attempt to migrate a misplaced folio to the specified destination
  * node. Caller is expected to have an elevated reference count on
- * the page that will be dropped by this function before returning.
+ * the folio that will be dropped by this function before returning.
  */
-int migrate_misplaced_page(struct page *page, struct vm_area_struct *vma,
-			   int node)
+int migrate_misplaced_folio(struct folio *folio, struct vm_area_struct *vma,
+			    int node)
 {
 	pg_data_t *pgdat = NODE_DATA(node);
 	int isolated;
 	int nr_remaining;
 	unsigned int nr_succeeded;
 	LIST_HEAD(migratepages);
-	int nr_pages = thp_nr_pages(page);
+	int nr_pages = folio_nr_pages(folio);
 
 	/*
-	 * Don't migrate file pages that are mapped in multiple processes
+	 * Don't migrate file folios that are mapped in multiple processes
 	 * with execute permissions as they are probably shared libraries.
+	 * To check if the folio is shared, ideally we want to make sure
+	 * every page is mapped to the same process. Doing that is very
+	 * expensive, so check the estimated mapcount of the folio instead.
 	 */
-	if (page_mapcount(page) != 1 && page_is_file_lru(page) &&
+	if (folio_estimated_sharers(folio) != 1 && folio_is_file_lru(folio) &&
 	    (vma->vm_flags & VM_EXEC))
 		goto out;
 
 	/*
-	 * Also do not migrate dirty pages as not all filesystems can move
-	 * dirty pages in MIGRATE_ASYNC mode which is a waste of cycles.
+	 * Also do not migrate dirty folios as not all filesystems can move
+	 * dirty folios in MIGRATE_ASYNC mode which is a waste of cycles.
 	 */
-	if (page_is_file_lru(page) && PageDirty(page))
+	if (folio_is_file_lru(folio) && folio_test_dirty(folio))
 		goto out;
 
-	isolated = numamigrate_isolate_folio(pgdat, page_folio(page));
+	isolated = numamigrate_isolate_folio(pgdat, folio);
 	if (!isolated)
 		goto out;
 
-	list_add(&page->lru, &migratepages);
+	list_add(&folio->lru, &migratepages);
 	nr_remaining = migrate_pages(&migratepages, alloc_misplaced_dst_folio,
 				     NULL, node, MIGRATE_ASYNC,
 				     MR_NUMA_MISPLACED, &nr_succeeded);
 	if (nr_remaining) {
 		if (!list_empty(&migratepages)) {
-			list_del(&page->lru);
-			mod_node_page_state(page_pgdat(page), NR_ISOLATED_ANON +
-					page_is_file_lru(page), -nr_pages);
-			putback_lru_page(page);
+			list_del(&folio->lru);
+			node_stat_mod_folio(folio, NR_ISOLATED_ANON +
+					folio_is_file_lru(folio), -nr_pages);
+			folio_putback_lru(folio);
 		}
 		isolated = 0;
 	}
 	if (nr_succeeded) {
 		count_vm_numa_events(NUMA_PAGE_MIGRATE, nr_succeeded);
-		if (!node_is_toptier(page_to_nid(page)) && node_is_toptier(node))
+		if (!node_is_toptier(folio_nid(folio)) && node_is_toptier(node))
 			mod_node_page_state(pgdat, PGPROMOTE_SUCCESS,
 					    nr_succeeded);
 	}
@@ -2572,7 +2575,7 @@ int migrate_misplaced_page(struct page *page, struct vm_area_struct *vma,
 	return isolated;
 
 out:
-	put_page(page);
+	folio_put(folio);
 	return 0;
 }
 #endif /* CONFIG_NUMA_BALANCING */
-- 
cgit v1.2.3


From 2a41815784e029cbef571511384f00fa40f2a82e Mon Sep 17 00:00:00 2001
From: "Matthew Wilcox (Oracle)" <willy@infradead.org>
Date: Thu, 14 Sep 2023 16:00:04 +0100
Subject: buffer: pass GFP flags to folio_alloc_buffers()

Patch series "Add and use bdev_getblk()", v2.

This patch series fixes a bug reported by Hui Zhu; see proposed
patches v1 and v2:
https://lore.kernel.org/linux-fsdevel/20230811035705.3296-1-teawaterz@linux.alibaba.com/
https://lore.kernel.org/linux-fsdevel/20230811071519.1094-1-teawaterz@linux.alibaba.com/

I decided to go in a rather different direction for this fix, and fix a
related problem at the same time.  I don't think there's any urgency to
rush this into Linus' tree, nor have I marked it for stable.  Reasonable
people may disagree.


This patch (of 8):

Instead of creating entirely new flags, inherit them from grow_dev_page().
The other callers create the same flags that this function used to
create.

Link: https://lkml.kernel.org/r/20230914150011.843330-1-willy@infradead.org
Link: https://lkml.kernel.org/r/20230914150011.843330-2-willy@infradead.org
Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Hui Zhu <teawater@antgroup.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 fs/buffer.c                 | 17 +++++++++--------
 include/linux/buffer_head.h |  2 +-
 2 files changed, 10 insertions(+), 9 deletions(-)

(limited to 'include/linux')

diff --git a/fs/buffer.c b/fs/buffer.c
index 12e9a71c693d..32338b7cfeb9 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -915,16 +915,12 @@ int remove_inode_buffers(struct inode *inode)
  * which may not fail from ordinary buffer allocations.
  */
 struct buffer_head *folio_alloc_buffers(struct folio *folio, unsigned long size,
-					bool retry)
+					gfp_t gfp)
 {
 	struct buffer_head *bh, *head;
-	gfp_t gfp = GFP_NOFS | __GFP_ACCOUNT;
 	long offset;
 	struct mem_cgroup *memcg, *old_memcg;
 
-	if (retry)
-		gfp |= __GFP_NOFAIL;
-
 	/* The folio lock pins the memcg */
 	memcg = folio_memcg(folio);
 	old_memcg = set_active_memcg(memcg);
@@ -967,7 +963,11 @@ EXPORT_SYMBOL_GPL(folio_alloc_buffers);
 struct buffer_head *alloc_page_buffers(struct page *page, unsigned long size,
 				       bool retry)
 {
-	return folio_alloc_buffers(page_folio(page), size, retry);
+	gfp_t gfp = GFP_NOFS | __GFP_ACCOUNT;
+	if (retry)
+		gfp |= __GFP_NOFAIL;
+
+	return folio_alloc_buffers(page_folio(page), size, gfp);
 }
 EXPORT_SYMBOL_GPL(alloc_page_buffers);
 
@@ -1069,7 +1069,7 @@ grow_dev_page(struct block_device *bdev, sector_t block,
 			goto failed;
 	}
 
-	bh = folio_alloc_buffers(folio, size, true);
+	bh = folio_alloc_buffers(folio, size, gfp_mask | __GFP_ACCOUNT);
 
 	/*
 	 * Link the folio to the buffers and initialise them.  Take the
@@ -1644,8 +1644,9 @@ void folio_create_empty_buffers(struct folio *folio, unsigned long blocksize,
 				unsigned long b_state)
 {
 	struct buffer_head *bh, *head, *tail;
+	gfp_t gfp = GFP_NOFS | __GFP_ACCOUNT | __GFP_NOFAIL;
 
-	head = folio_alloc_buffers(folio, blocksize, true);
+	head = folio_alloc_buffers(folio, blocksize, gfp);
 	bh = head;
 	do {
 		bh->b_state |= b_state;
diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h
index 44e9de51eedf..67d94d2be475 100644
--- a/include/linux/buffer_head.h
+++ b/include/linux/buffer_head.h
@@ -198,7 +198,7 @@ void touch_buffer(struct buffer_head *bh);
 void folio_set_bh(struct buffer_head *bh, struct folio *folio,
 		  unsigned long offset);
 struct buffer_head *folio_alloc_buffers(struct folio *folio, unsigned long size,
-					bool retry);
+					gfp_t gfp);
 struct buffer_head *alloc_page_buffers(struct page *page, unsigned long size,
 		bool retry);
 void create_empty_buffers(struct page *, unsigned long,
-- 
cgit v1.2.3


From 3ed65f04aac4d1cd025f30ee3fac174bcbf2b018 Mon Sep 17 00:00:00 2001
From: "Matthew Wilcox (Oracle)" <willy@infradead.org>
Date: Thu, 14 Sep 2023 16:00:05 +0100
Subject: buffer: hoist GFP flags from grow_dev_page() to __getblk_gfp()

grow_dev_page() is only called by grow_buffers().  grow_buffers() is only
called by __getblk_slow() and __getblk_slow() is only called from
__getblk_gfp(), so it is safe to move the GFP flags setting all the way
up.  With that done, add a new bdev_getblk() entry point that leaves the
GFP flags the way the caller specified them.

[willy@infradead.org: fix grow_dev_page() error handling]
  Link: https://lkml.kernel.org/r/ZRREEIwqiy5DijKB@casper.infradead.org
Link: https://lkml.kernel.org/r/20230914150011.843330-3-willy@infradead.org
Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Hui Zhu <teawater@antgroup.com>
Cc: Dan Carpenter <dan.carpenter@linaro.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 fs/buffer.c                 | 61 +++++++++++++++++++++++++++++----------------
 include/linux/buffer_head.h |  2 ++
 2 files changed, 42 insertions(+), 21 deletions(-)

(limited to 'include/linux')

diff --git a/fs/buffer.c b/fs/buffer.c
index 32338b7cfeb9..80e96c1fcd33 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -1043,20 +1043,11 @@ grow_dev_page(struct block_device *bdev, sector_t block,
 	struct buffer_head *bh;
 	sector_t end_block;
 	int ret = 0;
-	gfp_t gfp_mask;
-
-	gfp_mask = mapping_gfp_constraint(inode->i_mapping, ~__GFP_FS) | gfp;
-
-	/*
-	 * XXX: __getblk_slow() can not really deal with failure and
-	 * will endlessly loop on improvised global reclaim.  Prefer
-	 * looping in the allocator rather than here, at least that
-	 * code knows what it's doing.
-	 */
-	gfp_mask |= __GFP_NOFAIL;
 
 	folio = __filemap_get_folio(inode->i_mapping, index,
-			FGP_LOCK | FGP_ACCESSED | FGP_CREAT, gfp_mask);
+			FGP_LOCK | FGP_ACCESSED | FGP_CREAT, gfp);
+	if (IS_ERR(folio))
+		return PTR_ERR(folio);
 
 	bh = folio_buffers(folio);
 	if (bh) {
@@ -1069,7 +1060,10 @@ grow_dev_page(struct block_device *bdev, sector_t block,
 			goto failed;
 	}
 
-	bh = folio_alloc_buffers(folio, size, gfp_mask | __GFP_ACCOUNT);
+	ret = -ENOMEM;
+	bh = folio_alloc_buffers(folio, size, gfp | __GFP_ACCOUNT);
+	if (!bh)
+		goto failed;
 
 	/*
 	 * Link the folio to the buffers and initialise them.  Take the
@@ -1420,24 +1414,49 @@ __find_get_block(struct block_device *bdev, sector_t block, unsigned size)
 }
 EXPORT_SYMBOL(__find_get_block);
 
+/**
+ * bdev_getblk - Get a buffer_head in a block device's buffer cache.
+ * @bdev: The block device.
+ * @block: The block number.
+ * @size: The size of buffer_heads for this @bdev.
+ * @gfp: The memory allocation flags to use.
+ *
+ * In contrast to __getblk_gfp(), the @gfp flags must be all of the flags;
+ * they are not augmented with the mapping's GFP flags.
+ *
+ * Return: The buffer head, or NULL if memory could not be allocated.
+ */
+struct buffer_head *bdev_getblk(struct block_device *bdev, sector_t block,
+		unsigned size, gfp_t gfp)
+{
+	struct buffer_head *bh = __find_get_block(bdev, block, size);
+
+	might_alloc(gfp);
+	if (bh)
+		return bh;
+
+	return __getblk_slow(bdev, block, size, gfp);
+}
+EXPORT_SYMBOL(bdev_getblk);
+
 /*
  * __getblk_gfp() will locate (and, if necessary, create) the buffer_head
  * which corresponds to the passed block_device, block and size. The
  * returned buffer has its reference count incremented.
- *
- * __getblk_gfp() will lock up the machine if grow_dev_page's
- * try_to_free_buffers() attempt is failing.  FIXME, perhaps?
  */
 struct buffer_head *
 __getblk_gfp(struct block_device *bdev, sector_t block,
 	     unsigned size, gfp_t gfp)
 {
-	struct buffer_head *bh = __find_get_block(bdev, block, size);
+	gfp |= mapping_gfp_constraint(bdev->bd_inode->i_mapping, ~__GFP_FS);
 
-	might_sleep();
-	if (bh == NULL)
-		bh = __getblk_slow(bdev, block, size, gfp);
-	return bh;
+	/*
+	 * Prefer looping in the allocator rather than here, at least that
+	 * code knows what it's doing.
+	 */
+	gfp |= __GFP_NOFAIL;
+
+	return bdev_getblk(bdev, block, size, gfp);
 }
 EXPORT_SYMBOL(__getblk_gfp);
 
diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h
index 67d94d2be475..7825bb3d63a7 100644
--- a/include/linux/buffer_head.h
+++ b/include/linux/buffer_head.h
@@ -227,6 +227,8 @@ void __wait_on_buffer(struct buffer_head *);
 wait_queue_head_t *bh_waitq_head(struct buffer_head *bh);
 struct buffer_head *__find_get_block(struct block_device *bdev, sector_t block,
 			unsigned size);
+struct buffer_head *bdev_getblk(struct block_device *bdev, sector_t block,
+		unsigned size, gfp_t gfp);
 struct buffer_head *__getblk_gfp(struct block_device *bdev, sector_t block,
 				  unsigned size, gfp_t gfp);
 void __brelse(struct buffer_head *);
-- 
cgit v1.2.3


From c645e65c0675dd4df7ee68b995154dc1c1e7ce3b Mon Sep 17 00:00:00 2001
From: "Matthew Wilcox (Oracle)" <willy@infradead.org>
Date: Thu, 14 Sep 2023 16:00:08 +0100
Subject: buffer: convert getblk_unmovable() and __getblk() to use
 bdev_getblk()

Move these two functions up in the file for the benefit of the next patch,
and pass in all of the GFP flags to use instead of the partial GFP flags
used by __getblk_gfp().

Link: https://lkml.kernel.org/r/20230914150011.843330-6-willy@infradead.org
Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Hui Zhu <teawater@antgroup.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/buffer_head.h | 36 ++++++++++++++++++++++--------------
 1 file changed, 22 insertions(+), 14 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h
index 7825bb3d63a7..9a3ca5f6d63d 100644
--- a/include/linux/buffer_head.h
+++ b/include/linux/buffer_head.h
@@ -340,6 +340,28 @@ sb_breadahead(struct super_block *sb, sector_t block)
 	__breadahead(sb->s_bdev, block, sb->s_blocksize);
 }
 
+static inline struct buffer_head *getblk_unmovable(struct block_device *bdev,
+		sector_t block, unsigned size)
+{
+	gfp_t gfp;
+
+	gfp = mapping_gfp_constraint(bdev->bd_inode->i_mapping, ~__GFP_FS);
+	gfp |= __GFP_NOFAIL;
+
+	return bdev_getblk(bdev, block, size, gfp);
+}
+
+static inline struct buffer_head *__getblk(struct block_device *bdev,
+		sector_t block, unsigned size)
+{
+	gfp_t gfp;
+
+	gfp = mapping_gfp_constraint(bdev->bd_inode->i_mapping, ~__GFP_FS);
+	gfp |= __GFP_MOVABLE | __GFP_NOFAIL;
+
+	return bdev_getblk(bdev, block, size, gfp);
+}
+
 static inline struct buffer_head *
 sb_getblk(struct super_block *sb, sector_t block)
 {
@@ -387,20 +409,6 @@ static inline void lock_buffer(struct buffer_head *bh)
 		__lock_buffer(bh);
 }
 
-static inline struct buffer_head *getblk_unmovable(struct block_device *bdev,
-						   sector_t block,
-						   unsigned size)
-{
-	return __getblk_gfp(bdev, block, size, 0);
-}
-
-static inline struct buffer_head *__getblk(struct block_device *bdev,
-					   sector_t block,
-					   unsigned size)
-{
-	return __getblk_gfp(bdev, block, size, __GFP_MOVABLE);
-}
-
 static inline void bh_readahead(struct buffer_head *bh, blk_opf_t op_flags)
 {
 	if (!buffer_uptodate(bh) && trylock_buffer(bh)) {
-- 
cgit v1.2.3


From 4b9c8b1919323f7f359376ca31a4c721cb2b3acf Mon Sep 17 00:00:00 2001
From: "Matthew Wilcox (Oracle)" <willy@infradead.org>
Date: Thu, 14 Sep 2023 16:00:09 +0100
Subject: buffer: convert sb_getblk() to call __getblk()

Now that __getblk() is in the right place in the file, it is trivial to
call it from sb_getblk().

Link: https://lkml.kernel.org/r/20230914150011.843330-7-willy@infradead.org
Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Hui Zhu <teawater@antgroup.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/buffer_head.h | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h
index 9a3ca5f6d63d..b294e2cccbae 100644
--- a/include/linux/buffer_head.h
+++ b/include/linux/buffer_head.h
@@ -362,13 +362,12 @@ static inline struct buffer_head *__getblk(struct block_device *bdev,
 	return bdev_getblk(bdev, block, size, gfp);
 }
 
-static inline struct buffer_head *
-sb_getblk(struct super_block *sb, sector_t block)
+static inline struct buffer_head *sb_getblk(struct super_block *sb,
+		sector_t block)
 {
-	return __getblk_gfp(sb->s_bdev, block, sb->s_blocksize, __GFP_MOVABLE);
+	return __getblk(sb->s_bdev, block, sb->s_blocksize);
 }
 
-
 static inline struct buffer_head *
 sb_getblk_gfp(struct super_block *sb, sector_t block, gfp_t gfp)
 {
-- 
cgit v1.2.3


From 8a83ac54940d27b8f56d766e1cb270d150fedd50 Mon Sep 17 00:00:00 2001
From: "Matthew Wilcox (Oracle)" <willy@infradead.org>
Date: Thu, 14 Sep 2023 16:00:10 +0100
Subject: ext4: call bdev_getblk() from sb_getblk_gfp()

Most of the callers of sb_getblk_gfp() already assumed that they were
passing the entire GFP flags to use.  Fix up the two callers that didn't,
and remove the __GFP_NOFAIL from them since they both appear to correctly
handle failure.

Link: https://lkml.kernel.org/r/20230914150011.843330-8-willy@infradead.org
Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Hui Zhu <teawater@antgroup.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 fs/ext4/super.c             | 10 ++++++++--
 include/linux/buffer_head.h |  6 +++---
 2 files changed, 11 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 268d812b0add..c00ec159dea5 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -244,13 +244,19 @@ static struct buffer_head *__ext4_sb_bread_gfp(struct super_block *sb,
 struct buffer_head *ext4_sb_bread(struct super_block *sb, sector_t block,
 				   blk_opf_t op_flags)
 {
-	return __ext4_sb_bread_gfp(sb, block, op_flags, __GFP_MOVABLE);
+	gfp_t gfp = mapping_gfp_constraint(sb->s_bdev->bd_inode->i_mapping,
+			~__GFP_FS) | __GFP_MOVABLE;
+
+	return __ext4_sb_bread_gfp(sb, block, op_flags, gfp);
 }
 
 struct buffer_head *ext4_sb_bread_unmovable(struct super_block *sb,
 					    sector_t block)
 {
-	return __ext4_sb_bread_gfp(sb, block, 0, 0);
+	gfp_t gfp = mapping_gfp_constraint(sb->s_bdev->bd_inode->i_mapping,
+			~__GFP_FS);
+
+	return __ext4_sb_bread_gfp(sb, block, 0, gfp);
 }
 
 void ext4_sb_breadahead_unmovable(struct super_block *sb, sector_t block)
diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h
index b294e2cccbae..22f13eece719 100644
--- a/include/linux/buffer_head.h
+++ b/include/linux/buffer_head.h
@@ -368,10 +368,10 @@ static inline struct buffer_head *sb_getblk(struct super_block *sb,
 	return __getblk(sb->s_bdev, block, sb->s_blocksize);
 }
 
-static inline struct buffer_head *
-sb_getblk_gfp(struct super_block *sb, sector_t block, gfp_t gfp)
+static inline struct buffer_head *sb_getblk_gfp(struct super_block *sb,
+		sector_t block, gfp_t gfp)
 {
-	return __getblk_gfp(sb->s_bdev, block, sb->s_blocksize, gfp);
+	return bdev_getblk(sb->s_bdev, block, sb->s_blocksize, gfp);
 }
 
 static inline struct buffer_head *
-- 
cgit v1.2.3


From 93b13ecaa713e1fcbf23b7483eec065d300c5ad8 Mon Sep 17 00:00:00 2001
From: "Matthew Wilcox (Oracle)" <willy@infradead.org>
Date: Thu, 14 Sep 2023 16:00:11 +0100
Subject: buffer: remove __getblk_gfp()

Inline it into __bread_gfp().

Link: https://lkml.kernel.org/r/20230914150011.843330-9-willy@infradead.org
Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Hui Zhu <teawater@antgroup.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 fs/buffer.c                 | 36 +++++++++++-------------------------
 include/linux/buffer_head.h |  2 --
 2 files changed, 11 insertions(+), 27 deletions(-)

(limited to 'include/linux')

diff --git a/fs/buffer.c b/fs/buffer.c
index edd118594565..edec8652788c 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -1421,9 +1421,6 @@ EXPORT_SYMBOL(__find_get_block);
  * @size: The size of buffer_heads for this @bdev.
  * @gfp: The memory allocation flags to use.
  *
- * In contrast to __getblk_gfp(), the @gfp flags must be all of the flags;
- * they are not augmented with the mapping's GFP flags.
- *
  * Return: The buffer head, or NULL if memory could not be allocated.
  */
 struct buffer_head *bdev_getblk(struct block_device *bdev, sector_t block,
@@ -1439,27 +1436,6 @@ struct buffer_head *bdev_getblk(struct block_device *bdev, sector_t block,
 }
 EXPORT_SYMBOL(bdev_getblk);
 
-/*
- * __getblk_gfp() will locate (and, if necessary, create) the buffer_head
- * which corresponds to the passed block_device, block and size. The
- * returned buffer has its reference count incremented.
- */
-struct buffer_head *
-__getblk_gfp(struct block_device *bdev, sector_t block,
-	     unsigned size, gfp_t gfp)
-{
-	gfp |= mapping_gfp_constraint(bdev->bd_inode->i_mapping, ~__GFP_FS);
-
-	/*
-	 * Prefer looping in the allocator rather than here, at least that
-	 * code knows what it's doing.
-	 */
-	gfp |= __GFP_NOFAIL;
-
-	return bdev_getblk(bdev, block, size, gfp);
-}
-EXPORT_SYMBOL(__getblk_gfp);
-
 /*
  * Do async read-ahead on a buffer..
  */
@@ -1491,7 +1467,17 @@ struct buffer_head *
 __bread_gfp(struct block_device *bdev, sector_t block,
 		   unsigned size, gfp_t gfp)
 {
-	struct buffer_head *bh = __getblk_gfp(bdev, block, size, gfp);
+	struct buffer_head *bh;
+
+	gfp |= mapping_gfp_constraint(bdev->bd_inode->i_mapping, ~__GFP_FS);
+
+	/*
+	 * Prefer looping in the allocator rather than here, at least that
+	 * code knows what it's doing.
+	 */
+	gfp |= __GFP_NOFAIL;
+
+	bh = bdev_getblk(bdev, block, size, gfp);
 
 	if (likely(bh) && !buffer_uptodate(bh))
 		bh = __bread_slow(bh);
diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h
index 22f13eece719..3dc4720e4773 100644
--- a/include/linux/buffer_head.h
+++ b/include/linux/buffer_head.h
@@ -229,8 +229,6 @@ struct buffer_head *__find_get_block(struct block_device *bdev, sector_t block,
 			unsigned size);
 struct buffer_head *bdev_getblk(struct block_device *bdev, sector_t block,
 		unsigned size, gfp_t gfp);
-struct buffer_head *__getblk_gfp(struct block_device *bdev, sector_t block,
-				  unsigned size, gfp_t gfp);
 void __brelse(struct buffer_head *);
 void __bforget(struct buffer_head *);
 void __breadahead(struct block_device *, sector_t block, unsigned int size);
-- 
cgit v1.2.3


From 3dfbb555c98ac55b9d911f9af0e35014b445fb41 Mon Sep 17 00:00:00 2001
From: Vlastimil Babka <vbabka@suse.cz>
Date: Thu, 14 Sep 2023 15:16:39 +0200
Subject: mm, vmscan: remove ISOLATE_UNMAPPED

This isolate_mode_t flag is effectively unused since 89f6c88a6ab4 ("mm:
__isolate_lru_page_prepare() in isolate_migratepages_block()") as
sc->may_unmap is now checked directly (and only node_reclaim has a mode
that sets it to 0).  The last remaining place is mm_vmscan_lru_isolate
tracepoint for the isolate_mode parameter.  That one was mainly used to
indicate the active/inactive mode, which the trace-vmscan-postprocess.pl
script consumed, but that got silently broken.  After fixing the script by
the previous patch, it does not need the isolate_mode anymore.  So just
remove the parameter and with that the whole ISOLATE_UNMAPPED flag.

Link: https://lkml.kernel.org/r/20230914131637.12204-4-vbabka@suse.cz
Signed-off-by: Vlastimil Babka <vbabka@suse.cz>
Cc: Hugh Dickins <hughd@google.com>
Cc: Mel Gorman <mgorman@techsingularity.net>
Cc: Vlastimil Babka <vbabka@suse.cz>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 Documentation/trace/postprocess/trace-vmscan-postprocess.pl | 8 ++++----
 include/linux/mmzone.h                                      | 2 --
 include/trace/events/vmscan.h                               | 8 ++------
 mm/vmscan.c                                                 | 3 +--
 4 files changed, 7 insertions(+), 14 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/trace/postprocess/trace-vmscan-postprocess.pl b/Documentation/trace/postprocess/trace-vmscan-postprocess.pl
index 725d41a8d4ef..048dc0dbce64 100644
--- a/Documentation/trace/postprocess/trace-vmscan-postprocess.pl
+++ b/Documentation/trace/postprocess/trace-vmscan-postprocess.pl
@@ -112,7 +112,7 @@ my $regex_direct_end_default = 'nr_reclaimed=([0-9]*)';
 my $regex_kswapd_wake_default = 'nid=([0-9]*) order=([0-9]*)';
 my $regex_kswapd_sleep_default = 'nid=([0-9]*)';
 my $regex_wakeup_kswapd_default = 'nid=([0-9]*) order=([0-9]*) gfp_flags=([A-Z_|]*)';
-my $regex_lru_isolate_default = 'isolate_mode=([0-9]*) classzone=([0-9]*) order=([0-9]*) nr_requested=([0-9]*) nr_scanned=([0-9]*) nr_skipped=([0-9]*) nr_taken=([0-9]*) lru=([a-z_]*)';
+my $regex_lru_isolate_default = 'classzone=([0-9]*) order=([0-9]*) nr_requested=([0-9]*) nr_scanned=([0-9]*) nr_skipped=([0-9]*) nr_taken=([0-9]*) lru=([a-z_]*)';
 my $regex_lru_shrink_inactive_default = 'nid=([0-9]*) nr_scanned=([0-9]*) nr_reclaimed=([0-9]*) nr_dirty=([0-9]*) nr_writeback=([0-9]*) nr_congested=([0-9]*) nr_immediate=([0-9]*) nr_activate_anon=([0-9]*) nr_activate_file=([0-9]*) nr_ref_keep=([0-9]*) nr_unmap_fail=([0-9]*) priority=([0-9]*) flags=([A-Z_|]*)';
 my $regex_lru_shrink_active_default = 'lru=([A-Z_]*) nr_taken=([0-9]*) nr_active=([0-9]*) nr_deactivated=([0-9]*) nr_referenced=([0-9]*) priority=([0-9]*) flags=([A-Z_|]*)' ;
 my $regex_writepage_default = 'page=([0-9a-f]*) pfn=([0-9]*) flags=([A-Z_|]*)';
@@ -204,7 +204,7 @@ $regex_wakeup_kswapd = generate_traceevent_regex(
 $regex_lru_isolate = generate_traceevent_regex(
 			"vmscan/mm_vmscan_lru_isolate",
 			$regex_lru_isolate_default,
-			"isolate_mode", classzone", "order",
+			"classzone", "order",
 			"nr_requested", "nr_scanned", "nr_skipped", "nr_taken",
 			"lru");
 $regex_lru_shrink_inactive = generate_traceevent_regex(
@@ -379,8 +379,8 @@ EVENT_PROCESS:
 				print "         $regex_lru_isolate/o\n";
 				next;
 			}
-			my $nr_scanned = $5;
-			my $lru = $8;
+			my $nr_scanned = $4;
+			my $lru = $7;
 
 			# To closer match vmstat scanning statistics, only count
 			# inactive lru as scanning
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 4106fbc5b4b3..486587fcd27f 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -639,8 +639,6 @@ struct lruvec {
 #endif
 };
 
-/* Isolate unmapped pages */
-#define ISOLATE_UNMAPPED	((__force isolate_mode_t)0x2)
 /* Isolate for asynchronous migration */
 #define ISOLATE_ASYNC_MIGRATE	((__force isolate_mode_t)0x4)
 /* Isolate unevictable pages */
diff --git a/include/trace/events/vmscan.h b/include/trace/events/vmscan.h
index d2123dd960d5..1a488c30afa5 100644
--- a/include/trace/events/vmscan.h
+++ b/include/trace/events/vmscan.h
@@ -285,10 +285,9 @@ TRACE_EVENT(mm_vmscan_lru_isolate,
 		unsigned long nr_scanned,
 		unsigned long nr_skipped,
 		unsigned long nr_taken,
-		isolate_mode_t isolate_mode,
 		int lru),
 
-	TP_ARGS(highest_zoneidx, order, nr_requested, nr_scanned, nr_skipped, nr_taken, isolate_mode, lru),
+	TP_ARGS(highest_zoneidx, order, nr_requested, nr_scanned, nr_skipped, nr_taken, lru),
 
 	TP_STRUCT__entry(
 		__field(int, highest_zoneidx)
@@ -297,7 +296,6 @@ TRACE_EVENT(mm_vmscan_lru_isolate,
 		__field(unsigned long, nr_scanned)
 		__field(unsigned long, nr_skipped)
 		__field(unsigned long, nr_taken)
-		__field(unsigned int, isolate_mode)
 		__field(int, lru)
 	),
 
@@ -308,7 +306,6 @@ TRACE_EVENT(mm_vmscan_lru_isolate,
 		__entry->nr_scanned = nr_scanned;
 		__entry->nr_skipped = nr_skipped;
 		__entry->nr_taken = nr_taken;
-		__entry->isolate_mode = (__force unsigned int)isolate_mode;
 		__entry->lru = lru;
 	),
 
@@ -316,8 +313,7 @@ TRACE_EVENT(mm_vmscan_lru_isolate,
 	 * classzone is previous name of the highest_zoneidx.
 	 * Reason not to change it is the ABI requirement of the tracepoint.
 	 */
-	TP_printk("isolate_mode=%d classzone=%d order=%d nr_requested=%lu nr_scanned=%lu nr_skipped=%lu nr_taken=%lu lru=%s",
-		__entry->isolate_mode,
+	TP_printk("classzone=%d order=%d nr_requested=%lu nr_scanned=%lu nr_skipped=%lu nr_taken=%lu lru=%s",
 		__entry->highest_zoneidx,
 		__entry->order,
 		__entry->nr_requested,
diff --git a/mm/vmscan.c b/mm/vmscan.c
index acf115468bf8..3df0e2a59052 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -1689,8 +1689,7 @@ move:
 	}
 	*nr_scanned = total_scan;
 	trace_mm_vmscan_lru_isolate(sc->reclaim_idx, sc->order, nr_to_scan,
-				    total_scan, skipped, nr_taken,
-				    sc->may_unmap ? 0 : ISOLATE_UNMAPPED, lru);
+				    total_scan, skipped, nr_taken, lru);
 	update_lru_sizes(lruvec, lru, nr_zone_taken);
 	return nr_taken;
 }
-- 
cgit v1.2.3


From 4472edf63d6630e6cf65e205b4fc8c3c94d0afe5 Mon Sep 17 00:00:00 2001
From: SeongJae Park <sj@kernel.org>
Date: Thu, 14 Sep 2023 02:15:23 +0000
Subject: mm/damon/core: use number of passed access sampling as a timer

DAMON sleeps for sampling interval after each sampling, and check if the
aggregation interval and the ops update interval have passed using
ktime_get_coarse_ts64() and baseline timestamps for the intervals.  That
design is for making the operations occur at deterministic timing
regardless of the time that spend for each work.  However, it turned out
it is not that useful, and incur not-that-intuitive results.

After all, timer functions, and especially sleep functions that DAMON uses
to wait for specific timing, are not necessarily strictly accurate.  It is
legal design, so no problem.  However, depending on such inaccuracies, the
nr_accesses can be larger than aggregation interval divided by sampling
interval.  For example, with the default setting (5 ms sampling interval
and 100 ms aggregation interval) we frequently show regions having
nr_accesses larger than 20.  Also, if the execution of a DAMOS scheme
takes a long time, next aggregation could happen before enough number of
samples are collected.  This is not what usual users would intuitively
expect.

Since access check sampling is the smallest unit work of DAMON, using the
number of passed sampling intervals as the DAMON-internal timer can easily
avoid these problems.  That is, convert aggregation and ops update
intervals to numbers of sampling intervals that need to be passed before
those operations be executed, count the number of passed sampling
intervals, and invoke the operations as soon as the specific amount of
sampling intervals passed.  Make the change.

Note that this could make a behavioral change to settings that using
intervals that not aligned by the sampling interval.  For example, if the
sampling interval is 5 ms and the aggregation interval is 12 ms, DAMON
effectively uses 15 ms as its aggregation interval, because it checks
whether the aggregation interval after sleeping the sampling interval.
This change will make DAMON to effectively use 10 ms as aggregation
interval, since it uses 'aggregation interval / sampling interval *
sampling interval' as the effective aggregation interval, and we don't use
floating point types.  Usual users would have used aligned intervals, so
this behavioral change is not expected to make any meaningful impact, so
just make this change.

Link: https://lkml.kernel.org/r/20230914021523.60649-1-sj@kernel.org
Signed-off-by: SeongJae Park <sj@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/damon.h | 14 ++++++--
 mm/damon/core.c       | 96 +++++++++++++++++++++++++--------------------------
 2 files changed, 59 insertions(+), 51 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/damon.h b/include/linux/damon.h
index ab3089de1478..9a32b8fd0bd3 100644
--- a/include/linux/damon.h
+++ b/include/linux/damon.h
@@ -524,8 +524,18 @@ struct damon_ctx {
 	struct damon_attrs attrs;
 
 /* private: internal use only */
-	struct timespec64 last_aggregation;
-	struct timespec64 last_ops_update;
+	/* number of sample intervals that passed since this context started */
+	unsigned long passed_sample_intervals;
+	/*
+	 * number of sample intervals that should be passed before next
+	 * aggregation
+	 */
+	unsigned long next_aggregation_sis;
+	/*
+	 * number of sample intervals that should be passed before next ops
+	 * update
+	 */
+	unsigned long next_ops_update_sis;
 
 /* public: */
 	struct task_struct *kdamond;
diff --git a/mm/damon/core.c b/mm/damon/core.c
index 3ca34a252a3c..c5b7296c69a0 100644
--- a/mm/damon/core.c
+++ b/mm/damon/core.c
@@ -427,8 +427,10 @@ struct damon_ctx *damon_new_ctx(void)
 	ctx->attrs.aggr_interval = 100 * 1000;
 	ctx->attrs.ops_update_interval = 60 * 1000 * 1000;
 
-	ktime_get_coarse_ts64(&ctx->last_aggregation);
-	ctx->last_ops_update = ctx->last_aggregation;
+	ctx->passed_sample_intervals = 0;
+	/* These will be set from kdamond_init_intervals_sis() */
+	ctx->next_aggregation_sis = 0;
+	ctx->next_ops_update_sis = 0;
 
 	mutex_init(&ctx->kdamond_lock);
 
@@ -552,6 +554,9 @@ static void damon_update_monitoring_results(struct damon_ctx *ctx,
  */
 int damon_set_attrs(struct damon_ctx *ctx, struct damon_attrs *attrs)
 {
+	unsigned long sample_interval = attrs->sample_interval ?
+		attrs->sample_interval : 1;
+
 	if (attrs->min_nr_regions < 3)
 		return -EINVAL;
 	if (attrs->min_nr_regions > attrs->max_nr_regions)
@@ -559,6 +564,11 @@ int damon_set_attrs(struct damon_ctx *ctx, struct damon_attrs *attrs)
 	if (attrs->sample_interval > attrs->aggr_interval)
 		return -EINVAL;
 
+	ctx->next_aggregation_sis = ctx->passed_sample_intervals +
+		attrs->aggr_interval / sample_interval;
+	ctx->next_ops_update_sis = ctx->passed_sample_intervals +
+		attrs->ops_update_interval / sample_interval;
+
 	damon_update_monitoring_results(ctx, attrs);
 	ctx->attrs = *attrs;
 	return 0;
@@ -732,38 +742,6 @@ int damon_stop(struct damon_ctx **ctxs, int nr_ctxs)
 	return err;
 }
 
-/*
- * damon_check_reset_time_interval() - Check if a time interval is elapsed.
- * @baseline:	the time to check whether the interval has elapsed since
- * @interval:	the time interval (microseconds)
- *
- * See whether the given time interval has passed since the given baseline
- * time.  If so, it also updates the baseline to current time for next check.
- *
- * Return:	true if the time interval has passed, or false otherwise.
- */
-static bool damon_check_reset_time_interval(struct timespec64 *baseline,
-		unsigned long interval)
-{
-	struct timespec64 now;
-
-	ktime_get_coarse_ts64(&now);
-	if ((timespec64_to_ns(&now) - timespec64_to_ns(baseline)) <
-			interval * 1000)
-		return false;
-	*baseline = now;
-	return true;
-}
-
-/*
- * Check whether it is time to flush the aggregated information
- */
-static bool kdamond_aggregate_interval_passed(struct damon_ctx *ctx)
-{
-	return damon_check_reset_time_interval(&ctx->last_aggregation,
-			ctx->attrs.aggr_interval);
-}
-
 /*
  * Reset the aggregated monitoring results ('nr_accesses' of each region).
  */
@@ -1274,18 +1252,6 @@ static void kdamond_split_regions(struct damon_ctx *ctx)
 	last_nr_regions = nr_regions;
 }
 
-/*
- * Check whether it is time to check and apply the operations-related data
- * structures.
- *
- * Returns true if it is.
- */
-static bool kdamond_need_update_operations(struct damon_ctx *ctx)
-{
-	return damon_check_reset_time_interval(&ctx->last_ops_update,
-			ctx->attrs.ops_update_interval);
-}
-
 /*
  * Check whether current monitoring should be stopped
  *
@@ -1397,6 +1363,17 @@ static int kdamond_wait_activation(struct damon_ctx *ctx)
 	return -EBUSY;
 }
 
+static void kdamond_init_intervals_sis(struct damon_ctx *ctx)
+{
+	unsigned long sample_interval = ctx->attrs.sample_interval ?
+		ctx->attrs.sample_interval : 1;
+
+	ctx->passed_sample_intervals = 0;
+	ctx->next_aggregation_sis = ctx->attrs.aggr_interval / sample_interval;
+	ctx->next_ops_update_sis = ctx->attrs.ops_update_interval /
+		sample_interval;
+}
+
 /*
  * The monitoring daemon that runs as a kernel thread
  */
@@ -1410,6 +1387,8 @@ static int kdamond_fn(void *data)
 
 	pr_debug("kdamond (%d) starts\n", current->pid);
 
+	kdamond_init_intervals_sis(ctx);
+
 	if (ctx->ops.init)
 		ctx->ops.init(ctx);
 	if (ctx->callback.before_start && ctx->callback.before_start(ctx))
@@ -1418,6 +1397,17 @@ static int kdamond_fn(void *data)
 	sz_limit = damon_region_sz_limit(ctx);
 
 	while (!kdamond_need_stop(ctx)) {
+		/*
+		 * ctx->attrs and ctx->next_{aggregation,ops_update}_sis could
+		 * be changed from after_wmarks_check() or after_aggregation()
+		 * callbacks.  Read the values here, and use those for this
+		 * iteration.  That is, damon_set_attrs() updated new values
+		 * are respected from next iteration.
+		 */
+		unsigned long next_aggregation_sis = ctx->next_aggregation_sis;
+		unsigned long next_ops_update_sis = ctx->next_ops_update_sis;
+		unsigned long sample_interval = ctx->attrs.sample_interval;
+
 		if (kdamond_wait_activation(ctx))
 			break;
 
@@ -1427,12 +1417,17 @@ static int kdamond_fn(void *data)
 				ctx->callback.after_sampling(ctx))
 			break;
 
-		kdamond_usleep(ctx->attrs.sample_interval);
+		kdamond_usleep(sample_interval);
+		ctx->passed_sample_intervals++;
 
 		if (ctx->ops.check_accesses)
 			max_nr_accesses = ctx->ops.check_accesses(ctx);
 
-		if (kdamond_aggregate_interval_passed(ctx)) {
+		sample_interval = ctx->attrs.sample_interval ?
+			ctx->attrs.sample_interval : 1;
+		if (ctx->passed_sample_intervals == next_aggregation_sis) {
+			ctx->next_aggregation_sis = next_aggregation_sis +
+				ctx->attrs.aggr_interval / sample_interval;
 			kdamond_merge_regions(ctx,
 					max_nr_accesses / 10,
 					sz_limit);
@@ -1447,7 +1442,10 @@ static int kdamond_fn(void *data)
 				ctx->ops.reset_aggregated(ctx);
 		}
 
-		if (kdamond_need_update_operations(ctx)) {
+		if (ctx->passed_sample_intervals == next_ops_update_sis) {
+			ctx->next_ops_update_sis = next_ops_update_sis +
+				ctx->attrs.ops_update_interval /
+				sample_interval;
 			if (ctx->ops.update)
 				ctx->ops.update(ctx);
 			sz_limit = damon_region_sz_limit(ctx);
-- 
cgit v1.2.3


From 78fbfb155d204428119310d1b9df665ab88da6e8 Mon Sep 17 00:00:00 2001
From: SeongJae Park <sj@kernel.org>
Date: Fri, 15 Sep 2023 02:52:44 +0000
Subject: mm/damon/core: define and use a dedicated function for region access
 rate update

Patch series "mm/damon: provide pseudo-moving sum based access rate".

DAMON checks the access to each region for every sampling interval,
increase the access rate counter of the region, namely nr_accesses, if the
access was made.  For every aggregation interval, the counter is reset.
The counter is exposed to users to be used as a metric showing the
relative access rate (frequency) of each region.  In other words, DAMON
provides access rate of each region in every aggregation interval.  The
aggregation avoids temporal access pattern changes making things
confusing.  However, this also makes a few DAMON-related operations to
unnecessarily need to be aligned to the aggregation interval.  This can
restrict the flexibility of DAMON applications, especially when the
aggregation interval is huge.

To provide the monitoring results in finer-grained timing while keeping
handling of temporal access pattern change, this patchset implements a
pseudo-moving sum based access rate metric.  It is pseudo-moving sum
because strict moving sum implementation would need to keep all values for
last time window, and that could incur high overhead of there could be
arbitrary number of values in a time window.  Especially in case of the
nr_accesses, since the sampling interval and aggregation interval can
arbitrarily set and the past values should be maintained for every region,
it could be risky.  The pseudo-moving sum assumes there were no temporal
access pattern change in last discrete time window to remove the needs for
keeping the list of the last time window values.  As a result, it beocmes
not strict moving sum implementation, but provides a reasonable accuracy.

Also, it keeps an important property of the moving sum.  That is, the
moving sum becomes same to discrete-window based sum at the time that
aligns to the time window.  This means using the pseudo moving sum based
nr_accesses makes no change to users who shows the value for every
aggregation interval.

Patches Sequence
----------------

The sequence of the patches is as follows.  The first four patches are for
preparation of the change.  The first two (patches 1 and 2) implements a
helper function for nr_accesses update and eliminate corner case that
skips use of the function, respectively.  Following two (patches 3 and 4)
respectively implement the pseudo-moving sum function and its simple unit
test case.

Two patches for making DAMON to use the pseudo-moving sum follow.  The
fifthe one (patch 5) introduces a new field for representing the
pseudo-moving sum-based access rate of each region, and the sixth one
makes the new representation to actually updated with the pseudo-moving
sum function.

Last two patches (patches 7 and 8) makes followup fixes for skipping
unnecessary updates and marking the moving sum function as static,
respectively.


This patch (of 8):

Each DAMON operarions set is updating nr_accesses field of each
damon_region for each of their access check results, from the
check_accesses() callback.  Directly accessing the field could make things
complex to manage and change in future.  Define and use a dedicated
function for the purpose.

Link: https://lkml.kernel.org/r/20230915025251.72816-1-sj@kernel.org
Link: https://lkml.kernel.org/r/20230915025251.72816-2-sj@kernel.org
Signed-off-by: SeongJae Park <sj@kernel.org>
Cc: Brendan Higgins <brendanhiggins@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/damon.h |  5 ++++-
 mm/damon/core.c       | 16 ++++++++++++++++
 mm/damon/paddr.c      |  6 ++----
 mm/damon/vaddr.c      |  6 ++----
 4 files changed, 24 insertions(+), 9 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/damon.h b/include/linux/damon.h
index 9a32b8fd0bd3..17c504d236b9 100644
--- a/include/linux/damon.h
+++ b/include/linux/damon.h
@@ -45,7 +45,9 @@ struct damon_addr_range {
  *
  * @nr_accesses is reset to zero for every &damon_attrs->aggr_interval and be
  * increased for every &damon_attrs->sample_interval if an access to the region
- * during the last sampling interval is found.
+ * during the last sampling interval is found.  The update of this field should
+ * not be done with direct access but with the helper function,
+ * damon_update_region_access_rate().
  *
  * @age is initially zero, increased for each aggregation interval, and reset
  * to zero again if the access frequency is significantly changed.  If two
@@ -620,6 +622,7 @@ void damon_add_region(struct damon_region *r, struct damon_target *t);
 void damon_destroy_region(struct damon_region *r, struct damon_target *t);
 int damon_set_regions(struct damon_target *t, struct damon_addr_range *ranges,
 		unsigned int nr_ranges);
+void damon_update_region_access_rate(struct damon_region *r, bool accessed);
 
 struct damos_filter *damos_new_filter(enum damos_filter_type type,
 		bool matching);
diff --git a/mm/damon/core.c b/mm/damon/core.c
index c5b7296c69a0..10532159323a 100644
--- a/mm/damon/core.c
+++ b/mm/damon/core.c
@@ -1549,6 +1549,22 @@ int damon_set_region_biggest_system_ram_default(struct damon_target *t,
 	return damon_set_regions(t, &addr_range, 1);
 }
 
+/**
+ * damon_update_region_access_rate() - Update the access rate of a region.
+ * @r:		The DAMON region to update for its access check result.
+ * @accessed:	Whether the region has accessed during last sampling interval.
+ *
+ * Update the access rate of a region with the region's last sampling interval
+ * access check result.
+ *
+ * Usually this will be called by &damon_operations->check_accesses callback.
+ */
+void damon_update_region_access_rate(struct damon_region *r, bool accessed)
+{
+	if (accessed)
+		r->nr_accesses++;
+}
+
 static int __init damon_init(void)
 {
 	damon_region_cache = KMEM_CACHE(damon_region, 0);
diff --git a/mm/damon/paddr.c b/mm/damon/paddr.c
index 909db25efb35..44f21860b555 100644
--- a/mm/damon/paddr.c
+++ b/mm/damon/paddr.c
@@ -157,14 +157,12 @@ static void __damon_pa_check_access(struct damon_region *r)
 	/* If the region is in the last checked page, reuse the result */
 	if (ALIGN_DOWN(last_addr, last_folio_sz) ==
 				ALIGN_DOWN(r->sampling_addr, last_folio_sz)) {
-		if (last_accessed)
-			r->nr_accesses++;
+		damon_update_region_access_rate(r, last_accessed);
 		return;
 	}
 
 	last_accessed = damon_pa_young(r->sampling_addr, &last_folio_sz);
-	if (last_accessed)
-		r->nr_accesses++;
+	damon_update_region_access_rate(r, last_accessed);
 
 	last_addr = r->sampling_addr;
 }
diff --git a/mm/damon/vaddr.c b/mm/damon/vaddr.c
index cf8a9fc5c9d1..53371bbec605 100644
--- a/mm/damon/vaddr.c
+++ b/mm/damon/vaddr.c
@@ -567,14 +567,12 @@ static void __damon_va_check_access(struct mm_struct *mm,
 	/* If the region is in the last checked page, reuse the result */
 	if (same_target && (ALIGN_DOWN(last_addr, last_folio_sz) ==
 				ALIGN_DOWN(r->sampling_addr, last_folio_sz))) {
-		if (last_accessed)
-			r->nr_accesses++;
+		damon_update_region_access_rate(r, last_accessed);
 		return;
 	}
 
 	last_accessed = damon_va_young(mm, r->sampling_addr, &last_folio_sz);
-	if (last_accessed)
-		r->nr_accesses++;
+	damon_update_region_access_rate(r, last_accessed);
 
 	last_addr = r->sampling_addr;
 }
-- 
cgit v1.2.3


From d2c062ade07ffd206dd16bf085f02abc59651309 Mon Sep 17 00:00:00 2001
From: SeongJae Park <sj@kernel.org>
Date: Fri, 15 Sep 2023 02:52:46 +0000
Subject: mm/damon/core: implement a pseudo-moving sum function

For values that continuously change, moving average or sum are good ways
to provide fast updates while handling temporal and errorneous variability
of the value.  For example, the access rate counter (nr_accesses) is
calculated as a sum of the number of positive sampled access check results
that collected during a discrete time window (aggregation interval), and
hence it handles temporal and errorneous access check results, but
provides the update only for every aggregation interval.  Using a moving
sum method for that could allow providing the value for every sampling
interval.  That could be useful for getting monitoring results snapshot or
running DAMOS in fine-grained timing.

However, supporting the moving sum for cases that number of samples in the
time window is arbirary could impose high overhead, since the number of
past values that it needs to keep could be too high.  The nr_accesses
would also be one of the cases.  To mitigate the overhead, implement a
pseudo-moving sum function that only provides an estimated pseudo-moving
sum.  It assumes there was no error in last discrete time window and
subtract constant portion of last discrete time window sum.

Note that the function is not strictly implementing the moving sum, but it
keeps a property of moving sum, which makes the value same to the
dsicrete-window based sum for each time window-aligned timing.  Hence,
people collecting the value in the old timings would show no difference.

Link: https://lkml.kernel.org/r/20230915025251.72816-4-sj@kernel.org
Signed-off-by: SeongJae Park <sj@kernel.org>
Cc: Brendan Higgins <brendanhiggins@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/damon.h |  2 ++
 mm/damon/core.c       | 40 ++++++++++++++++++++++++++++++++++++++++
 2 files changed, 42 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/damon.h b/include/linux/damon.h
index 17c504d236b9..487a545a11b4 100644
--- a/include/linux/damon.h
+++ b/include/linux/damon.h
@@ -622,6 +622,8 @@ void damon_add_region(struct damon_region *r, struct damon_target *t);
 void damon_destroy_region(struct damon_region *r, struct damon_target *t);
 int damon_set_regions(struct damon_target *t, struct damon_addr_range *ranges,
 		unsigned int nr_ranges);
+unsigned int damon_moving_sum(unsigned int mvsum, unsigned int nomvsum,
+		unsigned int len_window, unsigned int new_value);
 void damon_update_region_access_rate(struct damon_region *r, bool accessed);
 
 struct damos_filter *damos_new_filter(enum damos_filter_type type,
diff --git a/mm/damon/core.c b/mm/damon/core.c
index 10532159323a..b005dc15009f 100644
--- a/mm/damon/core.c
+++ b/mm/damon/core.c
@@ -1549,6 +1549,46 @@ int damon_set_region_biggest_system_ram_default(struct damon_target *t,
 	return damon_set_regions(t, &addr_range, 1);
 }
 
+/*
+ * damon_moving_sum() - Calculate an inferred moving sum value.
+ * @mvsum:	Inferred sum of the last @len_window values.
+ * @nomvsum:	Non-moving sum of the last discrete @len_window window values.
+ * @len_window:	The number of last values to take care of.
+ * @new_value:	New value that will be added to the pseudo moving sum.
+ *
+ * Moving sum (moving average * window size) is good for handling noise, but
+ * the cost of keeping past values can be high for arbitrary window size.  This
+ * function implements a lightweight pseudo moving sum function that doesn't
+ * keep the past window values.
+ *
+ * It simply assumes there was no noise in the past, and get the no-noise
+ * assumed past value to drop from @nomvsum and @len_window.  @nomvsum is a
+ * non-moving sum of the last window.  For example, if @len_window is 10 and we
+ * have 25 values, @nomvsum is the sum of the 11th to 20th values of the 25
+ * values.  Hence, this function simply drops @nomvsum / @len_window from
+ * given @mvsum and add @new_value.
+ *
+ * For example, if @len_window is 10 and @nomvsum is 50, the last 10 values for
+ * the last window could be vary, e.g., 0, 10, 0, 10, 0, 10, 0, 0, 0, 20.  For
+ * calculating next moving sum with a new value, we should drop 0 from 50 and
+ * add the new value.  However, this function assumes it got value 5 for each
+ * of the last ten times.  Based on the assumption, when the next value is
+ * measured, it drops the assumed past value, 5 from the current sum, and add
+ * the new value to get the updated pseduo-moving average.
+ *
+ * This means the value could have errors, but the errors will be disappeared
+ * for every @len_window aligned calls.  For example, if @len_window is 10, the
+ * pseudo moving sum with 11th value to 19th value would have an error.  But
+ * the sum with 20th value will not have the error.
+ *
+ * Return: Pseudo-moving average after getting the @new_value.
+ */
+unsigned int damon_moving_sum(unsigned int mvsum, unsigned int nomvsum,
+		unsigned int len_window, unsigned int new_value)
+{
+	return mvsum - nomvsum / len_window + new_value;
+}
+
 /**
  * damon_update_region_access_rate() - Update the access rate of a region.
  * @r:		The DAMON region to update for its access check result.
-- 
cgit v1.2.3


From 80333828ea7728ebe85d079bb5c1467eb9fc6c8c Mon Sep 17 00:00:00 2001
From: SeongJae Park <sj@kernel.org>
Date: Fri, 15 Sep 2023 02:52:48 +0000
Subject: mm/damon/core: introduce nr_accesses_bp

Add yet another representation of the access rate of each region, namely
nr_accesses_bp.  It is just same to the nr_accesses but represents the
value in basis point (1 in 10,000), and updated at once in every
aggregation interval.  That is, moving_accesses_bp is just nr_accesses *
10000.  This may seems useless at the moment.  However, it will be useful
for representing less than one nr_accesses value that will be needed to
make moving sum-based nr_accesses.

Link: https://lkml.kernel.org/r/20230915025251.72816-6-sj@kernel.org
Signed-off-by: SeongJae Park <sj@kernel.org>
Cc: Brendan Higgins <brendanhiggins@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/damon.h | 5 +++++
 mm/damon/core-test.h  | 5 +++++
 mm/damon/core.c       | 6 ++++++
 3 files changed, 16 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/damon.h b/include/linux/damon.h
index 487a545a11b4..15f24b23c9a0 100644
--- a/include/linux/damon.h
+++ b/include/linux/damon.h
@@ -40,6 +40,7 @@ struct damon_addr_range {
  * @ar:			The address range of the region.
  * @sampling_addr:	Address of the sample for the next access check.
  * @nr_accesses:	Access frequency of this region.
+ * @nr_accesses_bp:	@nr_accesses in basis point (0.01%).
  * @list:		List head for siblings.
  * @age:		Age of this region.
  *
@@ -49,6 +50,9 @@ struct damon_addr_range {
  * not be done with direct access but with the helper function,
  * damon_update_region_access_rate().
  *
+ * @nr_accesses_bp is another representation of @nr_accesses in basis point
+ * (1 in 10,000) that updated every aggregation interval.
+ *
  * @age is initially zero, increased for each aggregation interval, and reset
  * to zero again if the access frequency is significantly changed.  If two
  * regions are merged into a new region, both @nr_accesses and @age of the new
@@ -58,6 +62,7 @@ struct damon_region {
 	struct damon_addr_range ar;
 	unsigned long sampling_addr;
 	unsigned int nr_accesses;
+	unsigned int nr_accesses_bp;
 	struct list_head list;
 
 	unsigned int age;
diff --git a/mm/damon/core-test.h b/mm/damon/core-test.h
index c539f0e8377e..79f1f12e0dd5 100644
--- a/mm/damon/core-test.h
+++ b/mm/damon/core-test.h
@@ -94,6 +94,7 @@ static void damon_test_aggregate(struct kunit *test)
 		for (ir = 0; ir < 3; ir++) {
 			r = damon_new_region(saddr[it][ir], eaddr[it][ir]);
 			r->nr_accesses = accesses[it][ir];
+			r->nr_accesses_bp = accesses[it][ir] * 10000;
 			damon_add_region(r, t);
 		}
 		it++;
@@ -147,9 +148,11 @@ static void damon_test_merge_two(struct kunit *test)
 	t = damon_new_target();
 	r = damon_new_region(0, 100);
 	r->nr_accesses = 10;
+	r->nr_accesses_bp = 100000;
 	damon_add_region(r, t);
 	r2 = damon_new_region(100, 300);
 	r2->nr_accesses = 20;
+	r2->nr_accesses_bp = 200000;
 	damon_add_region(r2, t);
 
 	damon_merge_two_regions(t, r, r2);
@@ -196,6 +199,7 @@ static void damon_test_merge_regions_of(struct kunit *test)
 	for (i = 0; i < ARRAY_SIZE(sa); i++) {
 		r = damon_new_region(sa[i], ea[i]);
 		r->nr_accesses = nrs[i];
+		r->nr_accesses_bp = nrs[i] * 10000;
 		damon_add_region(r, t);
 	}
 
@@ -297,6 +301,7 @@ static void damon_test_update_monitoring_result(struct kunit *test)
 	struct damon_region *r = damon_new_region(3, 7);
 
 	r->nr_accesses = 15;
+	r->nr_accesses_bp = 150000;
 	r->age = 20;
 
 	new_attrs = (struct damon_attrs){
diff --git a/mm/damon/core.c b/mm/damon/core.c
index b005dc15009f..ce85c00b0a4c 100644
--- a/mm/damon/core.c
+++ b/mm/damon/core.c
@@ -128,6 +128,7 @@ struct damon_region *damon_new_region(unsigned long start, unsigned long end)
 	region->ar.start = start;
 	region->ar.end = end;
 	region->nr_accesses = 0;
+	region->nr_accesses_bp = 0;
 	INIT_LIST_HEAD(&region->list);
 
 	region->age = 0;
@@ -508,6 +509,7 @@ static void damon_update_monitoring_result(struct damon_region *r,
 {
 	r->nr_accesses = damon_nr_accesses_for_new_attrs(r->nr_accesses,
 			old_attrs, new_attrs);
+	r->nr_accesses_bp = r->nr_accesses * 10000;
 	r->age = damon_age_for_new_attrs(r->age, old_attrs, new_attrs);
 }
 
@@ -1115,6 +1117,7 @@ static void damon_merge_two_regions(struct damon_target *t,
 
 	l->nr_accesses = (l->nr_accesses * sz_l + r->nr_accesses * sz_r) /
 			(sz_l + sz_r);
+	l->nr_accesses_bp = l->nr_accesses * 10000;
 	l->age = (l->age * sz_l + r->age * sz_r) / (sz_l + sz_r);
 	l->ar.end = r->ar.end;
 	damon_destroy_region(r, t);
@@ -1138,6 +1141,8 @@ static void damon_merge_regions_of(struct damon_target *t, unsigned int thres,
 		else
 			r->age++;
 
+		r->nr_accesses_bp = r->nr_accesses * 10000;
+
 		if (prev && prev->ar.end == r->ar.start &&
 		    abs(prev->nr_accesses - r->nr_accesses) <= thres &&
 		    damon_sz_region(prev) + damon_sz_region(r) <= sz_limit)
@@ -1186,6 +1191,7 @@ static void damon_split_region_at(struct damon_target *t,
 
 	new->age = r->age;
 	new->last_nr_accesses = r->last_nr_accesses;
+	new->nr_accesses_bp = r->nr_accesses_bp;
 
 	damon_insert_region(new, r, damon_next_region(r), t);
 }
-- 
cgit v1.2.3


From ace30fb21af5f1be1605db72c16040b95b1557ef Mon Sep 17 00:00:00 2001
From: SeongJae Park <sj@kernel.org>
Date: Fri, 15 Sep 2023 02:52:49 +0000
Subject: mm/damon/core: use pseudo-moving sum for nr_accesses_bp

Let nr_accesses_bp be calculated as a pseudo-moving sum that updated for
every sampling interval, using damon_moving_sum().  This is assumed to be
useful for cases that the aggregation interval is set quite huge, but the
monivoting results need to be collected earlier than next aggregation
interval is passed.

Link: https://lkml.kernel.org/r/20230915025251.72816-7-sj@kernel.org
Signed-off-by: SeongJae Park <sj@kernel.org>
Cc: Brendan Higgins <brendanhiggins@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/damon.h | 12 +++++++++---
 mm/damon/core.c       | 16 +++++++++++++++-
 mm/damon/paddr.c      |  9 +++++----
 mm/damon/vaddr.c      | 12 +++++++-----
 4 files changed, 36 insertions(+), 13 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/damon.h b/include/linux/damon.h
index 15f24b23c9a0..0fe13482df63 100644
--- a/include/linux/damon.h
+++ b/include/linux/damon.h
@@ -40,7 +40,8 @@ struct damon_addr_range {
  * @ar:			The address range of the region.
  * @sampling_addr:	Address of the sample for the next access check.
  * @nr_accesses:	Access frequency of this region.
- * @nr_accesses_bp:	@nr_accesses in basis point (0.01%).
+ * @nr_accesses_bp:	@nr_accesses in basis point (0.01%) that updated for
+ *			each sampling interval.
  * @list:		List head for siblings.
  * @age:		Age of this region.
  *
@@ -51,7 +52,11 @@ struct damon_addr_range {
  * damon_update_region_access_rate().
  *
  * @nr_accesses_bp is another representation of @nr_accesses in basis point
- * (1 in 10,000) that updated every aggregation interval.
+ * (1 in 10,000) that updated for every &damon_attrs->sample_interval in a
+ * manner similar to moving sum.  By the algorithm, this value becomes
+ * @nr_accesses * 10000 for every &struct damon_attrs->aggr_interval.  This can
+ * be used when the aggregation interval is too huge and therefore cannot wait
+ * for it before getting the access monitoring results.
  *
  * @age is initially zero, increased for each aggregation interval, and reset
  * to zero again if the access frequency is significantly changed.  If two
@@ -629,7 +634,8 @@ int damon_set_regions(struct damon_target *t, struct damon_addr_range *ranges,
 		unsigned int nr_ranges);
 unsigned int damon_moving_sum(unsigned int mvsum, unsigned int nomvsum,
 		unsigned int len_window, unsigned int new_value);
-void damon_update_region_access_rate(struct damon_region *r, bool accessed);
+void damon_update_region_access_rate(struct damon_region *r, bool accessed,
+		struct damon_attrs *attrs);
 
 struct damos_filter *damos_new_filter(enum damos_filter_type type,
 		bool matching);
diff --git a/mm/damon/core.c b/mm/damon/core.c
index ce85c00b0a4c..29ee1fc18393 100644
--- a/mm/damon/core.c
+++ b/mm/damon/core.c
@@ -1599,14 +1599,28 @@ unsigned int damon_moving_sum(unsigned int mvsum, unsigned int nomvsum,
  * damon_update_region_access_rate() - Update the access rate of a region.
  * @r:		The DAMON region to update for its access check result.
  * @accessed:	Whether the region has accessed during last sampling interval.
+ * @attrs:	The damon_attrs of the DAMON context.
  *
  * Update the access rate of a region with the region's last sampling interval
  * access check result.
  *
  * Usually this will be called by &damon_operations->check_accesses callback.
  */
-void damon_update_region_access_rate(struct damon_region *r, bool accessed)
+void damon_update_region_access_rate(struct damon_region *r, bool accessed,
+		struct damon_attrs *attrs)
 {
+	unsigned int len_window = 1;
+
+	/*
+	 * sample_interval can be zero, but cannot be larger than
+	 * aggr_interval, owing to validation of damon_set_attrs().
+	 */
+	if (attrs->sample_interval)
+		len_window = attrs->aggr_interval / attrs->sample_interval;
+	r->nr_accesses_bp = damon_moving_sum(r->nr_accesses_bp,
+			r->last_nr_accesses * 10000, len_window,
+			accessed ? 10000 : 0);
+
 	if (accessed)
 		r->nr_accesses++;
 }
diff --git a/mm/damon/paddr.c b/mm/damon/paddr.c
index 44f21860b555..081e2a325778 100644
--- a/mm/damon/paddr.c
+++ b/mm/damon/paddr.c
@@ -148,7 +148,8 @@ out:
 	return accessed;
 }
 
-static void __damon_pa_check_access(struct damon_region *r)
+static void __damon_pa_check_access(struct damon_region *r,
+		struct damon_attrs *attrs)
 {
 	static unsigned long last_addr;
 	static unsigned long last_folio_sz = PAGE_SIZE;
@@ -157,12 +158,12 @@ static void __damon_pa_check_access(struct damon_region *r)
 	/* If the region is in the last checked page, reuse the result */
 	if (ALIGN_DOWN(last_addr, last_folio_sz) ==
 				ALIGN_DOWN(r->sampling_addr, last_folio_sz)) {
-		damon_update_region_access_rate(r, last_accessed);
+		damon_update_region_access_rate(r, last_accessed, attrs);
 		return;
 	}
 
 	last_accessed = damon_pa_young(r->sampling_addr, &last_folio_sz);
-	damon_update_region_access_rate(r, last_accessed);
+	damon_update_region_access_rate(r, last_accessed, attrs);
 
 	last_addr = r->sampling_addr;
 }
@@ -175,7 +176,7 @@ static unsigned int damon_pa_check_accesses(struct damon_ctx *ctx)
 
 	damon_for_each_target(t, ctx) {
 		damon_for_each_region(r, t) {
-			__damon_pa_check_access(r);
+			__damon_pa_check_access(r, &ctx->attrs);
 			max_nr_accesses = max(r->nr_accesses, max_nr_accesses);
 		}
 	}
diff --git a/mm/damon/vaddr.c b/mm/damon/vaddr.c
index 02ab448d9b1e..a4d1f63c5b23 100644
--- a/mm/damon/vaddr.c
+++ b/mm/damon/vaddr.c
@@ -558,26 +558,27 @@ static bool damon_va_young(struct mm_struct *mm, unsigned long addr,
  * r	the region to be checked
  */
 static void __damon_va_check_access(struct mm_struct *mm,
-				struct damon_region *r, bool same_target)
+				struct damon_region *r, bool same_target,
+				struct damon_attrs *attrs)
 {
 	static unsigned long last_addr;
 	static unsigned long last_folio_sz = PAGE_SIZE;
 	static bool last_accessed;
 
 	if (!mm) {
-		damon_update_region_access_rate(r, false);
+		damon_update_region_access_rate(r, false, attrs);
 		return;
 	}
 
 	/* If the region is in the last checked page, reuse the result */
 	if (same_target && (ALIGN_DOWN(last_addr, last_folio_sz) ==
 				ALIGN_DOWN(r->sampling_addr, last_folio_sz))) {
-		damon_update_region_access_rate(r, last_accessed);
+		damon_update_region_access_rate(r, last_accessed, attrs);
 		return;
 	}
 
 	last_accessed = damon_va_young(mm, r->sampling_addr, &last_folio_sz);
-	damon_update_region_access_rate(r, last_accessed);
+	damon_update_region_access_rate(r, last_accessed, attrs);
 
 	last_addr = r->sampling_addr;
 }
@@ -594,7 +595,8 @@ static unsigned int damon_va_check_accesses(struct damon_ctx *ctx)
 		mm = damon_get_mm(t);
 		same_target = false;
 		damon_for_each_region(r, t) {
-			__damon_va_check_access(mm, r, same_target);
+			__damon_va_check_access(mm, r, same_target,
+					&ctx->attrs);
 			max_nr_accesses = max(r->nr_accesses, max_nr_accesses);
 			same_target = true;
 		}
-- 
cgit v1.2.3


From 863803a7948c8e33e6a7b002017747ca83ecfd63 Mon Sep 17 00:00:00 2001
From: SeongJae Park <sj@kernel.org>
Date: Fri, 15 Sep 2023 02:52:51 +0000
Subject: mm/damon/core: mark damon_moving_sum() as a static function

The function is used by only mm/damon/core.c.  Mark it as a static
function.

Link: https://lkml.kernel.org/r/20230915025251.72816-9-sj@kernel.org
Signed-off-by: SeongJae Park <sj@kernel.org>
Cc: Brendan Higgins <brendanhiggins@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/damon.h | 2 --
 mm/damon/core.c       | 2 +-
 2 files changed, 1 insertion(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/damon.h b/include/linux/damon.h
index 0fe13482df63..491fdd3e4c76 100644
--- a/include/linux/damon.h
+++ b/include/linux/damon.h
@@ -632,8 +632,6 @@ void damon_add_region(struct damon_region *r, struct damon_target *t);
 void damon_destroy_region(struct damon_region *r, struct damon_target *t);
 int damon_set_regions(struct damon_target *t, struct damon_addr_range *ranges,
 		unsigned int nr_ranges);
-unsigned int damon_moving_sum(unsigned int mvsum, unsigned int nomvsum,
-		unsigned int len_window, unsigned int new_value);
 void damon_update_region_access_rate(struct damon_region *r, bool accessed,
 		struct damon_attrs *attrs);
 
diff --git a/mm/damon/core.c b/mm/damon/core.c
index 45cc108c0fe1..b15cf47d2d29 100644
--- a/mm/damon/core.c
+++ b/mm/damon/core.c
@@ -1587,7 +1587,7 @@ int damon_set_region_biggest_system_ram_default(struct damon_target *t,
  *
  * Return: Pseudo-moving average after getting the @new_value.
  */
-unsigned int damon_moving_sum(unsigned int mvsum, unsigned int nomvsum,
+static unsigned int damon_moving_sum(unsigned int mvsum, unsigned int nomvsum,
 		unsigned int len_window, unsigned int new_value)
 {
 	return mvsum - nomvsum / len_window + new_value;
-- 
cgit v1.2.3


From 77e6c43e137c130138c3fbadc847351a83c4befe Mon Sep 17 00:00:00 2001
From: Usama Arif <usama.arif@bytedance.com>
Date: Wed, 13 Sep 2023 11:54:00 +0100
Subject: memblock: introduce MEMBLOCK_RSRV_NOINIT flag

For reserved memory regions marked with this flag, reserve_bootmem_region
is not called during memmap_init_reserved_pages.  This can be used to
avoid struct page initialization for regions which won't need them, for
e.g.  hugepages with Hugepage Vmemmap Optimization enabled.

Link: https://lkml.kernel.org/r/20230913105401.519709-4-usama.arif@bytedance.com
Signed-off-by: Usama Arif <usama.arif@bytedance.com>
Acked-by: Muchun Song <songmuchun@bytedance.com>
Reviewed-by: Mike Rapoport (IBM) <rppt@kernel.org>
Cc: Fam Zheng <fam.zheng@bytedance.com>
Cc: Mike Kravetz <mike.kravetz@oracle.com>
Cc: Punit Agrawal <punit.agrawal@bytedance.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/memblock.h |  9 +++++++++
 mm/memblock.c            | 33 ++++++++++++++++++++++++++++-----
 2 files changed, 37 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/memblock.h b/include/linux/memblock.h
index 1c1072e3ca06..ae3bde302f70 100644
--- a/include/linux/memblock.h
+++ b/include/linux/memblock.h
@@ -40,6 +40,8 @@ extern unsigned long long max_possible_pfn;
  * via a driver, and never indicated in the firmware-provided memory map as
  * system RAM. This corresponds to IORESOURCE_SYSRAM_DRIVER_MANAGED in the
  * kernel resource tree.
+ * @MEMBLOCK_RSRV_NOINIT: memory region for which struct pages are
+ * not initialized (only for reserved regions).
  */
 enum memblock_flags {
 	MEMBLOCK_NONE		= 0x0,	/* No special request */
@@ -47,6 +49,7 @@ enum memblock_flags {
 	MEMBLOCK_MIRROR		= 0x2,	/* mirrored region */
 	MEMBLOCK_NOMAP		= 0x4,	/* don't add to kernel direct mapping */
 	MEMBLOCK_DRIVER_MANAGED = 0x8,	/* always detected via a driver */
+	MEMBLOCK_RSRV_NOINIT	= 0x10,	/* don't initialize struct pages */
 };
 
 /**
@@ -125,6 +128,7 @@ int memblock_clear_hotplug(phys_addr_t base, phys_addr_t size);
 int memblock_mark_mirror(phys_addr_t base, phys_addr_t size);
 int memblock_mark_nomap(phys_addr_t base, phys_addr_t size);
 int memblock_clear_nomap(phys_addr_t base, phys_addr_t size);
+int memblock_reserved_mark_noinit(phys_addr_t base, phys_addr_t size);
 
 void memblock_free_all(void);
 void memblock_free(void *ptr, size_t size);
@@ -259,6 +263,11 @@ static inline bool memblock_is_nomap(struct memblock_region *m)
 	return m->flags & MEMBLOCK_NOMAP;
 }
 
+static inline bool memblock_is_reserved_noinit(struct memblock_region *m)
+{
+	return m->flags & MEMBLOCK_RSRV_NOINIT;
+}
+
 static inline bool memblock_is_driver_managed(struct memblock_region *m)
 {
 	return m->flags & MEMBLOCK_DRIVER_MANAGED;
diff --git a/mm/memblock.c b/mm/memblock.c
index b978cda96cf0..fd492e5bbdbc 100644
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -997,6 +997,24 @@ int __init_memblock memblock_clear_nomap(phys_addr_t base, phys_addr_t size)
 	return memblock_setclr_flag(&memblock.memory, base, size, 0, MEMBLOCK_NOMAP);
 }
 
+/**
+ * memblock_reserved_mark_noinit - Mark a reserved memory region with flag
+ * MEMBLOCK_RSRV_NOINIT which results in the struct pages not being initialized
+ * for this region.
+ * @base: the base phys addr of the region
+ * @size: the size of the region
+ *
+ * struct pages will not be initialized for reserved memory regions marked with
+ * %MEMBLOCK_RSRV_NOINIT.
+ *
+ * Return: 0 on success, -errno on failure.
+ */
+int __init_memblock memblock_reserved_mark_noinit(phys_addr_t base, phys_addr_t size)
+{
+	return memblock_setclr_flag(&memblock.reserved, base, size, 1,
+				    MEMBLOCK_RSRV_NOINIT);
+}
+
 static bool should_skip_region(struct memblock_type *type,
 			       struct memblock_region *m,
 			       int nid, int flags)
@@ -2113,13 +2131,18 @@ static void __init memmap_init_reserved_pages(void)
 		memblock_set_node(start, end, &memblock.reserved, nid);
 	}
 
-	/* initialize struct pages for the reserved regions */
+	/*
+	 * initialize struct pages for reserved regions that don't have
+	 * the MEMBLOCK_RSRV_NOINIT flag set
+	 */
 	for_each_reserved_mem_region(region) {
-		nid = memblock_get_region_node(region);
-		start = region->base;
-		end = start + region->size;
+		if (!memblock_is_reserved_noinit(region)) {
+			nid = memblock_get_region_node(region);
+			start = region->base;
+			end = start + region->size;
 
-		reserve_bootmem_region(start, end, nid);
+			reserve_bootmem_region(start, end, nid);
+		}
 	}
 }
 
-- 
cgit v1.2.3


From 42f994b71404b17abcd6b170de7a6aa95ffe5d4a Mon Sep 17 00:00:00 2001
From: SeongJae Park <sj@kernel.org>
Date: Sat, 16 Sep 2023 02:09:40 +0000
Subject: mm/damon/core: implement scheme-specific apply interval

DAMON-based operation schemes are applied for every aggregation interval.
That was mainly because schemes were using nr_accesses, which be complete
to be used for every aggregation interval.  However, the schemes are now
using nr_accesses_bp, which is updated for each sampling interval in a way
that reasonable to be used.  Therefore, there is no reason to apply
schemes for each aggregation interval.

The unnecessary alignment with aggregation interval was also making some
use cases of DAMOS tricky.  Quotas setting under long aggregation interval
is one such example.  Suppose the aggregation interval is ten seconds, and
there is a scheme having CPU quota 100ms per 1s.  The scheme will actually
uses 100ms per ten seconds, since it cannobe be applied before next
aggregation interval.  The feature is working as intended, but the results
might not that intuitive for some users.  This could be fixed by updating
the quota to 1s per 10s.  But, in the case, the CPU usage of DAMOS could
look like spikes, and would actually make a bad effect to other
CPU-sensitive workloads.

Implement a dedicated timing interval for each DAMON-based operation
scheme, namely apply_interval.  The interval will be sampling interval
aligned, and each scheme will be applied for its apply_interval.  The
interval is set to 0 by default, and it means the scheme should use the
aggregation interval instead.  This avoids old users getting any
behavioral difference.

Link: https://lkml.kernel.org/r/20230916020945.47296-5-sj@kernel.org
Signed-off-by: SeongJae Park <sj@kernel.org>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Shuah Khan <shuah@kernel.org>
Cc: Steven Rostedt (Google) <rostedt@goodmis.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/damon.h    | 17 ++++++++++--
 mm/damon/core.c          | 72 +++++++++++++++++++++++++++++++++++++++++++-----
 mm/damon/dbgfs.c         |  3 +-
 mm/damon/lru_sort.c      |  2 ++
 mm/damon/reclaim.c       |  2 ++
 mm/damon/sysfs-schemes.c |  2 +-
 6 files changed, 87 insertions(+), 11 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/damon.h b/include/linux/damon.h
index 491fdd3e4c76..27b995c22497 100644
--- a/include/linux/damon.h
+++ b/include/linux/damon.h
@@ -314,16 +314,19 @@ struct damos_access_pattern {
  * struct damos - Represents a Data Access Monitoring-based Operation Scheme.
  * @pattern:		Access pattern of target regions.
  * @action:		&damo_action to be applied to the target regions.
+ * @apply_interval_us:	The time between applying the @action.
  * @quota:		Control the aggressiveness of this scheme.
  * @wmarks:		Watermarks for automated (in)activation of this scheme.
  * @filters:		Additional set of &struct damos_filter for &action.
  * @stat:		Statistics of this scheme.
  * @list:		List head for siblings.
  *
- * For each aggregation interval, DAMON finds regions which fit in the
+ * For each @apply_interval_us, DAMON finds regions which fit in the
  * &pattern and applies &action to those. To avoid consuming too much
  * CPU time or IO resources for the &action, &quota is used.
  *
+ * If @apply_interval_us is zero, &damon_attrs->aggr_interval is used instead.
+ *
  * To do the work only when needed, schemes can be activated for specific
  * system situations using &wmarks.  If all schemes that registered to the
  * monitoring context are inactive, DAMON stops monitoring either, and just
@@ -340,6 +343,14 @@ struct damos_access_pattern {
 struct damos {
 	struct damos_access_pattern pattern;
 	enum damos_action action;
+	unsigned long apply_interval_us;
+/* private: internal use only */
+	/*
+	 * number of sample intervals that should be passed before applying
+	 * @action
+	 */
+	unsigned long next_apply_sis;
+/* public: */
 	struct damos_quota quota;
 	struct damos_watermarks wmarks;
 	struct list_head filters;
@@ -641,7 +652,9 @@ void damos_add_filter(struct damos *s, struct damos_filter *f);
 void damos_destroy_filter(struct damos_filter *f);
 
 struct damos *damon_new_scheme(struct damos_access_pattern *pattern,
-			enum damos_action action, struct damos_quota *quota,
+			enum damos_action action,
+			unsigned long apply_interval_us,
+			struct damos_quota *quota,
 			struct damos_watermarks *wmarks);
 void damon_add_scheme(struct damon_ctx *ctx, struct damos *s);
 void damon_destroy_scheme(struct damos *s);
diff --git a/mm/damon/core.c b/mm/damon/core.c
index 79fef5145a4b..5eb649bd002f 100644
--- a/mm/damon/core.c
+++ b/mm/damon/core.c
@@ -313,7 +313,9 @@ static struct damos_quota *damos_quota_init_priv(struct damos_quota *quota)
 }
 
 struct damos *damon_new_scheme(struct damos_access_pattern *pattern,
-			enum damos_action action, struct damos_quota *quota,
+			enum damos_action action,
+			unsigned long apply_interval_us,
+			struct damos_quota *quota,
 			struct damos_watermarks *wmarks)
 {
 	struct damos *scheme;
@@ -323,6 +325,13 @@ struct damos *damon_new_scheme(struct damos_access_pattern *pattern,
 		return NULL;
 	scheme->pattern = *pattern;
 	scheme->action = action;
+	scheme->apply_interval_us = apply_interval_us;
+	/*
+	 * next_apply_sis will be set when kdamond starts.  While kdamond is
+	 * running, it will also updated when it is added to the DAMON context,
+	 * or damon_attrs are updated.
+	 */
+	scheme->next_apply_sis = 0;
 	INIT_LIST_HEAD(&scheme->filters);
 	scheme->stat = (struct damos_stat){};
 	INIT_LIST_HEAD(&scheme->list);
@@ -335,9 +344,21 @@ struct damos *damon_new_scheme(struct damos_access_pattern *pattern,
 	return scheme;
 }
 
+static void damos_set_next_apply_sis(struct damos *s, struct damon_ctx *ctx)
+{
+	unsigned long sample_interval = ctx->attrs.sample_interval ?
+		ctx->attrs.sample_interval : 1;
+	unsigned long apply_interval = s->apply_interval_us ?
+		s->apply_interval_us : ctx->attrs.aggr_interval;
+
+	s->next_apply_sis = ctx->passed_sample_intervals +
+		apply_interval / sample_interval;
+}
+
 void damon_add_scheme(struct damon_ctx *ctx, struct damos *s)
 {
 	list_add_tail(&s->list, &ctx->schemes);
+	damos_set_next_apply_sis(s, ctx);
 }
 
 static void damon_del_scheme(struct damos *s)
@@ -558,6 +579,7 @@ int damon_set_attrs(struct damon_ctx *ctx, struct damon_attrs *attrs)
 {
 	unsigned long sample_interval = attrs->sample_interval ?
 		attrs->sample_interval : 1;
+	struct damos *s;
 
 	if (attrs->min_nr_regions < 3)
 		return -EINVAL;
@@ -573,6 +595,10 @@ int damon_set_attrs(struct damon_ctx *ctx, struct damon_attrs *attrs)
 
 	damon_update_monitoring_results(ctx, attrs);
 	ctx->attrs = *attrs;
+
+	damon_for_each_scheme(s, ctx)
+		damos_set_next_apply_sis(s, ctx);
+
 	return 0;
 }
 
@@ -1094,14 +1120,29 @@ static void kdamond_apply_schemes(struct damon_ctx *c)
 	struct damon_target *t;
 	struct damon_region *r, *next_r;
 	struct damos *s;
+	unsigned long sample_interval = c->attrs.sample_interval ?
+		c->attrs.sample_interval : 1;
+	bool has_schemes_to_apply = false;
 
 	damon_for_each_scheme(s, c) {
+		if (c->passed_sample_intervals != s->next_apply_sis)
+			continue;
+
+		s->next_apply_sis +=
+			(s->apply_interval_us ? s->apply_interval_us :
+			 c->attrs.aggr_interval) / sample_interval;
+
 		if (!s->wmarks.activated)
 			continue;
 
+		has_schemes_to_apply = true;
+
 		damos_adjust_quota(c, s);
 	}
 
+	if (!has_schemes_to_apply)
+		return;
+
 	damon_for_each_target(t, c) {
 		damon_for_each_region_safe(r, next_r, t)
 			damon_do_apply_schemes(c, t, r);
@@ -1372,11 +1413,19 @@ static void kdamond_init_intervals_sis(struct damon_ctx *ctx)
 {
 	unsigned long sample_interval = ctx->attrs.sample_interval ?
 		ctx->attrs.sample_interval : 1;
+	unsigned long apply_interval;
+	struct damos *scheme;
 
 	ctx->passed_sample_intervals = 0;
 	ctx->next_aggregation_sis = ctx->attrs.aggr_interval / sample_interval;
 	ctx->next_ops_update_sis = ctx->attrs.ops_update_interval /
 		sample_interval;
+
+	damon_for_each_scheme(scheme, ctx) {
+		apply_interval = scheme->apply_interval_us ?
+			scheme->apply_interval_us : ctx->attrs.aggr_interval;
+		scheme->next_apply_sis = apply_interval / sample_interval;
+	}
 }
 
 /*
@@ -1428,19 +1477,28 @@ static int kdamond_fn(void *data)
 		if (ctx->ops.check_accesses)
 			max_nr_accesses = ctx->ops.check_accesses(ctx);
 
-		sample_interval = ctx->attrs.sample_interval ?
-			ctx->attrs.sample_interval : 1;
 		if (ctx->passed_sample_intervals == next_aggregation_sis) {
-			ctx->next_aggregation_sis = next_aggregation_sis +
-				ctx->attrs.aggr_interval / sample_interval;
 			kdamond_merge_regions(ctx,
 					max_nr_accesses / 10,
 					sz_limit);
 			if (ctx->callback.after_aggregation &&
 					ctx->callback.after_aggregation(ctx))
 				break;
-			if (!list_empty(&ctx->schemes))
-				kdamond_apply_schemes(ctx);
+		}
+
+		/*
+		 * do kdamond_apply_schemes() after kdamond_merge_regions() if
+		 * possible, to reduce overhead
+		 */
+		if (!list_empty(&ctx->schemes))
+			kdamond_apply_schemes(ctx);
+
+		sample_interval = ctx->attrs.sample_interval ?
+			ctx->attrs.sample_interval : 1;
+		if (ctx->passed_sample_intervals == next_aggregation_sis) {
+			ctx->next_aggregation_sis = next_aggregation_sis +
+				ctx->attrs.aggr_interval / sample_interval;
+
 			kdamond_reset_aggregated(ctx);
 			kdamond_split_regions(ctx);
 			if (ctx->ops.reset_aggregated)
diff --git a/mm/damon/dbgfs.c b/mm/damon/dbgfs.c
index 124f0f8c97b7..dc0ea1fc30ca 100644
--- a/mm/damon/dbgfs.c
+++ b/mm/damon/dbgfs.c
@@ -278,7 +278,8 @@ static struct damos **str_to_schemes(const char *str, ssize_t len,
 			goto fail;
 
 		pos += parsed;
-		scheme = damon_new_scheme(&pattern, action, &quota, &wmarks);
+		scheme = damon_new_scheme(&pattern, action, 0, &quota,
+				&wmarks);
 		if (!scheme)
 			goto fail;
 
diff --git a/mm/damon/lru_sort.c b/mm/damon/lru_sort.c
index 7b8fce2f67a8..3ecdcc029443 100644
--- a/mm/damon/lru_sort.c
+++ b/mm/damon/lru_sort.c
@@ -158,6 +158,8 @@ static struct damos *damon_lru_sort_new_scheme(
 			pattern,
 			/* (de)prioritize on LRU-lists */
 			action,
+			/* for each aggregation interval */
+			0,
 			/* under the quota. */
 			&quota,
 			/* (De)activate this according to the watermarks. */
diff --git a/mm/damon/reclaim.c b/mm/damon/reclaim.c
index 648d2a85523a..ab974e477d2f 100644
--- a/mm/damon/reclaim.c
+++ b/mm/damon/reclaim.c
@@ -142,6 +142,8 @@ static struct damos *damon_reclaim_new_scheme(void)
 			&pattern,
 			/* page out those, as soon as found */
 			DAMOS_PAGEOUT,
+			/* for each aggregation interval */
+			0,
 			/* under the quota. */
 			&damon_reclaim_quota,
 			/* (De)activate this according to the watermarks. */
diff --git a/mm/damon/sysfs-schemes.c b/mm/damon/sysfs-schemes.c
index 093700f50b18..3d30e85596b0 100644
--- a/mm/damon/sysfs-schemes.c
+++ b/mm/damon/sysfs-schemes.c
@@ -1610,7 +1610,7 @@ static struct damos *damon_sysfs_mk_scheme(
 		.low = sysfs_wmarks->low,
 	};
 
-	scheme = damon_new_scheme(&pattern, sysfs_scheme->action, &quota,
+	scheme = damon_new_scheme(&pattern, sysfs_scheme->action, 0, &quota,
 			&wmarks);
 	if (!scheme)
 		return NULL;
-- 
cgit v1.2.3


From a8306f2d4dcea03538c70c26d2948483f70254ff Mon Sep 17 00:00:00 2001
From: Nick Desaulniers <ndesaulniers@google.com>
Date: Thu, 31 Aug 2023 09:33:40 -0700
Subject: compiler.h: unify __UNIQUE_ID

commit 6f33d58794ef ("__UNIQUE_ID()")
added a fallback definition of __UNIQUE_ID because gcc 4.2 and older did
not support __COUNTER__.

Also, this commit is effectively a revert of
commit b41c29b0527c ("Kbuild: provide a __UNIQUE_ID for clang")
which mentions clang 2.6+ supporting __COUNTER__.

Documentation/process/changes.rst currently lists the minimum supported
version of these compilers as:
- gcc: 5.1
- clang: 11.0.0
It should be safe to say that __COUNTER__ is well supported by this
point.

Link: https://lkml.kernel.org/r/20230831-unique_id-v1-1-28bacd18eb1d@google.com
Signed-off-by: Nick Desaulniers <ndesaulniers@google.com>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Jan Beulich <jbeulich@suse.com>
Cc: Luc Van Oostenryck <luc.vanoostenryck@gmail.com>
Cc: Michal rarek <mmarek@suse.com>
Cc: Nathan Chancellor <nathan@kernel.org>
Cc: Paul Russel <rusty@rustcorp.com.au>
Cc: Tom Rix <trix@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/compiler-clang.h | 5 -----
 include/linux/compiler-gcc.h   | 2 --
 include/linux/compiler.h       | 5 +----
 3 files changed, 1 insertion(+), 11 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/compiler-clang.h b/include/linux/compiler-clang.h
index 9b673fefcef8..ddab1ef22bee 100644
--- a/include/linux/compiler-clang.h
+++ b/include/linux/compiler-clang.h
@@ -14,11 +14,6 @@
 #undef __cleanup
 #define __cleanup(func) __maybe_unused __attribute__((__cleanup__(func)))
 
-/* same as gcc, this was present in clang-2.6 so we can assume it works
- * with any version that can compile the kernel
- */
-#define __UNIQUE_ID(prefix) __PASTE(__PASTE(__UNIQUE_ID_, prefix), __COUNTER__)
-
 /* all clang versions usable with the kernel support KASAN ABI version 5 */
 #define KASAN_ABI_VERSION 5
 
diff --git a/include/linux/compiler-gcc.h b/include/linux/compiler-gcc.h
index 7af9e34ec261..2ceba3fe4ec1 100644
--- a/include/linux/compiler-gcc.h
+++ b/include/linux/compiler-gcc.h
@@ -39,8 +39,6 @@
 #define __noretpoline __attribute__((__indirect_branch__("keep")))
 #endif
 
-#define __UNIQUE_ID(prefix) __PASTE(__PASTE(__UNIQUE_ID_, prefix), __COUNTER__)
-
 #if defined(LATENT_ENTROPY_PLUGIN) && !defined(__CHECKER__)
 #define __latent_entropy __attribute__((latent_entropy))
 #endif
diff --git a/include/linux/compiler.h b/include/linux/compiler.h
index d7779a18b24f..174099fdc485 100644
--- a/include/linux/compiler.h
+++ b/include/linux/compiler.h
@@ -177,10 +177,7 @@ void ftrace_likely_update(struct ftrace_likely_data *f, int val,
 	__asm__ ("" : "=r" (var) : "0" (var))
 #endif
 
-/* Not-quite-unique ID. */
-#ifndef __UNIQUE_ID
-# define __UNIQUE_ID(prefix) __PASTE(__PASTE(__UNIQUE_ID_, prefix), __LINE__)
-#endif
+#define __UNIQUE_ID(prefix) __PASTE(__PASTE(__UNIQUE_ID_, prefix), __COUNTER__)
 
 /**
  * data_race - mark an expression as containing intentional data races
-- 
cgit v1.2.3


From 33a9813825710fdc2b980d566ee391fd093a36c6 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@redhat.com>
Date: Thu, 24 Aug 2023 16:31:42 +0200
Subject: introduce __next_thread(), fix next_tid() vs exec() race

Patch series "introduce __next_thread(), change next_thread()".

After commit dce8f8ed1de1 ("document while_each_thread(), change
first_tid() to use for_each_thread()") + this series

1. We have only one lockless user of next_thread(), task_group_seq_get_next().
   I think it should be changed too.

2. We have only one user of task_struct->thread_group, thread_group_empty().
   The next patches will change thread_group_empty() and kill ->thread_group.


This patch (of 2):

next_tid(start) does:

	rcu_read_lock();
	if (pid_alive(start)) {
		pos = next_thread(start);
		if (thread_group_leader(pos))
			pos = NULL;
		else
			get_task_struct(pos);

it should return pos = NULL when next_thread() wraps to the 1st thread
in the thread group, group leader, and the thread_group_leader() check
tries to detect this case.

But this can race with exec. To simplify, suppose we have a main thread
M and a single sub-thread T, next_tid(T) should return NULL.

Now suppose that T execs. If next_tid(T) is called after T changes the
leadership and before it does release_task() which removes the old leader
from list, then next_thread() returns M and thread_group_leader(M) = F.

Lockless use of next_thread() should be avoided. After this change only
task_group_seq_get_next() does this, and I believe it should be changed
as well.

Link: https://lkml.kernel.org/r/20230824143112.GA31208@redhat.com
Link: https://lkml.kernel.org/r/20230824143142.GA31222@redhat.com
Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Cc: Eric W. Biederman <ebiederm@xmission.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 fs/proc/base.c               |  6 ++----
 include/linux/sched/signal.h | 11 +++++++++++
 2 files changed, 13 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/fs/proc/base.c b/fs/proc/base.c
index ffd54617c354..c0e971cc6d41 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -3840,10 +3840,8 @@ static struct task_struct *next_tid(struct task_struct *start)
 	struct task_struct *pos = NULL;
 	rcu_read_lock();
 	if (pid_alive(start)) {
-		pos = next_thread(start);
-		if (thread_group_leader(pos))
-			pos = NULL;
-		else
+		pos = __next_thread(start);
+		if (pos)
 			get_task_struct(pos);
 	}
 	rcu_read_unlock();
diff --git a/include/linux/sched/signal.h b/include/linux/sched/signal.h
index 0014d3adaf84..7fb34b8cda54 100644
--- a/include/linux/sched/signal.h
+++ b/include/linux/sched/signal.h
@@ -715,6 +715,17 @@ bool same_thread_group(struct task_struct *p1, struct task_struct *p2)
 	return p1->signal == p2->signal;
 }
 
+/*
+ * returns NULL if p is the last thread in the thread group
+ */
+static inline struct task_struct *__next_thread(struct task_struct *p)
+{
+	return list_next_or_null_rcu(&p->signal->thread_head,
+					&p->thread_node,
+					struct task_struct,
+					thread_node);
+}
+
 static inline struct task_struct *next_thread(const struct task_struct *p)
 {
 	return list_entry_rcu(p->thread_group.next,
-- 
cgit v1.2.3


From d639cf4abb4d171ab2456904da5668c42b5c1937 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@redhat.com>
Date: Thu, 24 Aug 2023 16:32:01 +0200
Subject: change next_thread() to use __next_thread() ?: group_leader

This relies on fact that group leader is always the 1st entry in the
signal->thread_head list.

With or without this change, if the lockless next_thread(last_thread)
races with exec it can return the old or the new leader.

We are almost ready to kill task->thread_group, after this change its
only user is thread_group_empty().

Link: https://lkml.kernel.org/r/20230824143201.GB31222@redhat.com
Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Cc: Eric W. Biederman <ebiederm@xmission.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/sched/signal.h | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sched/signal.h b/include/linux/sched/signal.h
index 7fb34b8cda54..cffc882d367f 100644
--- a/include/linux/sched/signal.h
+++ b/include/linux/sched/signal.h
@@ -726,10 +726,9 @@ static inline struct task_struct *__next_thread(struct task_struct *p)
 					thread_node);
 }
 
-static inline struct task_struct *next_thread(const struct task_struct *p)
+static inline struct task_struct *next_thread(struct task_struct *p)
 {
-	return list_entry_rcu(p->thread_group.next,
-			      struct task_struct, thread_group);
+	return __next_thread(p) ?: p->group_leader;
 }
 
 static inline int thread_group_empty(struct task_struct *p)
-- 
cgit v1.2.3


From e34a35ee1f52312af130b5ebd42fa28313fc6149 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@redhat.com>
Date: Sat, 26 Aug 2023 13:14:06 +0200
Subject: change thread_group_empty() to use task_struct->thread_node

Patch series "kill task_struct->thread_group".


This patch (of 2):

It could use list_is_singular() but this way it is cheaper.  Plus the
thread_group_leader() check makes it clear that thread_group_empty() can
only return true if p is a group leader.  This was not immediately obvious
before this patch.

task_struct->thread_group no longer has users, it can die.

Link: https://lkml.kernel.org/r/20230826111200.GA22982@redhat.com
Link: https://lkml.kernel.org/r/20230826111406.GA23238@redhat.com
Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Cc: Eric W. Biederman <ebiederm@xmission.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/sched/signal.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/sched/signal.h b/include/linux/sched/signal.h
index cffc882d367f..d7fa3ca2fa53 100644
--- a/include/linux/sched/signal.h
+++ b/include/linux/sched/signal.h
@@ -733,7 +733,8 @@ static inline struct task_struct *next_thread(struct task_struct *p)
 
 static inline int thread_group_empty(struct task_struct *p)
 {
-	return list_empty(&p->thread_group);
+	return thread_group_leader(p) &&
+	       list_is_last(&p->thread_node, &p->signal->thread_head);
 }
 
 #define delay_group_leader(p) \
-- 
cgit v1.2.3


From 8e1f385104ac044f1552686ad6e1cbc71cc05a30 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@redhat.com>
Date: Sat, 26 Aug 2023 13:14:09 +0200
Subject: kill task_struct->thread_group

The last user was removed by the previous patch.

Link: https://lkml.kernel.org/r/20230826111409.GA23243@redhat.com
Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Cc: Eric W. Biederman <ebiederm@xmission.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/sched.h | 1 -
 init/init_task.c      | 1 -
 kernel/exit.c         | 1 -
 kernel/fork.c         | 3 ---
 4 files changed, 6 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 77f01ac385f7..6d1341b1673f 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1002,7 +1002,6 @@ struct task_struct {
 	/* PID/PID hash table linkage. */
 	struct pid			*thread_pid;
 	struct hlist_node		pid_links[PIDTYPE_MAX];
-	struct list_head		thread_group;
 	struct list_head		thread_node;
 
 	struct completion		*vfork_done;
diff --git a/init/init_task.c b/init/init_task.c
index ff6c4b9bfe6b..c0de0200fd56 100644
--- a/init/init_task.c
+++ b/init/init_task.c
@@ -132,7 +132,6 @@ struct task_struct init_task
 	.pi_lock	= __RAW_SPIN_LOCK_UNLOCKED(init_task.pi_lock),
 	.timer_slack_ns = 50000, /* 50 usec default slack */
 	.thread_pid	= &init_struct_pid,
-	.thread_group	= LIST_HEAD_INIT(init_task.thread_group),
 	.thread_node	= LIST_HEAD_INIT(init_signals.thread_head),
 #ifdef CONFIG_AUDIT
 	.loginuid	= INVALID_UID,
diff --git a/kernel/exit.c b/kernel/exit.c
index edb50b4c9972..f3ba4b97a7d9 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -133,7 +133,6 @@ static void __unhash_process(struct task_struct *p, bool group_dead)
 		list_del_init(&p->sibling);
 		__this_cpu_dec(process_counts);
 	}
-	list_del_rcu(&p->thread_group);
 	list_del_rcu(&p->thread_node);
 }
 
diff --git a/kernel/fork.c b/kernel/fork.c
index 3b6d20dfb9a8..b9d3aa493bbd 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -2576,7 +2576,6 @@ __latent_entropy struct task_struct *copy_process(
 	p->dirty_paused_when = 0;
 
 	p->pdeath_signal = 0;
-	INIT_LIST_HEAD(&p->thread_group);
 	p->task_works = NULL;
 	clear_posix_cputimers_work(p);
 
@@ -2704,8 +2703,6 @@ __latent_entropy struct task_struct *copy_process(
 			atomic_inc(&current->signal->live);
 			refcount_inc(&current->signal->sigcnt);
 			task_join_group_stop(p);
-			list_add_tail_rcu(&p->thread_group,
-					  &p->group_leader->thread_group);
 			list_add_tail_rcu(&p->thread_node,
 					  &p->signal->thread_head);
 		}
-- 
cgit v1.2.3


From 9cba82bba500e3ce875381350f289cfb3aa633ba Mon Sep 17 00:00:00 2001
From: Xingui Yang <yangxingui@huawei.com>
Date: Tue, 5 Sep 2023 02:48:33 +0000
Subject: seq_file: add helper macro to define attribute for rw file

Patch series "Add helper macro DEFINE_SHOW_STORE_ATTRIBUTE() at
seq_file.c", v6.

We already own DEFINE_SHOW_ATTRIBUTE() helper macro for defining attribute
for read-only file, but we found many of drivers also want a helper macro
for read-write file too.

So we add this helper macro to reduce duplicated code.


This patch (of 3):

We already own DEFINE_SHOW_ATTRIBUTE() helper macro for defining attribute
for read-only file, but many of drivers want a helper macro for read-write
file too.

So we add DEFINE_SHOW_STORE_ATTRIBUTE() helper to reduce duplicated code.

Link: https://lkml.kernel.org/r/20230905024835.43219-1-yangxingui@huawei.com
Link: https://lkml.kernel.org/r/20230905024835.43219-2-yangxingui@huawei.com
Signed-off-by: Luo Jiaxing <luojiaxing@huawei.com>
Co-developed-by: Xingui Yang <yangxingui@huawei.com>
Signed-off-by: Xingui Yang <yangxingui@huawei.com>
Reviewed-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Animesh Manna <animesh.manna@intel.com>
Cc: Anshuman Gupta <anshuman.gupta@intel.com>
Cc: Damien Le Moal <damien.lemoal@opensource.wdc.com>
Cc: Felipe Balbi <felipe.balbi@linux.intel.com>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: Himanshu Madhani <himanshu.madhani@cavium.com>
Cc: James Bottomley <jejb@linux.ibm.com>
Cc: John Garry <john.g.garry@oracle.com>
Cc: Martin K. Petersen <martin.petersen@oracle.com>
Cc: Uma Shankar <uma.shankar@intel.com>
Cc: Xiang Chen <chenxiang66@hisilicon.com>
Cc: Zeng Tao <prime.zeng@hisilicon.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/seq_file.h | 15 +++++++++++++++
 1 file changed, 15 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/seq_file.h b/include/linux/seq_file.h
index 386ab580b839..234bcdb1fba4 100644
--- a/include/linux/seq_file.h
+++ b/include/linux/seq_file.h
@@ -207,6 +207,21 @@ static const struct file_operations __name ## _fops = {			\
 	.release	= single_release,				\
 }
 
+#define DEFINE_SHOW_STORE_ATTRIBUTE(__name)				\
+static int __name ## _open(struct inode *inode, struct file *file)	\
+{									\
+	return single_open(file, __name ## _show, inode->i_private);	\
+}									\
+									\
+static const struct file_operations __name ## _fops = {			\
+	.owner		= THIS_MODULE,					\
+	.open		= __name ## _open,				\
+	.read		= seq_read,					\
+	.write		= __name ## _write,				\
+	.llseek		= seq_lseek,					\
+	.release	= single_release,				\
+}
+
 #define DEFINE_PROC_SHOW_ATTRIBUTE(__name)				\
 static int __name ## _open(struct inode *inode, struct file *file)	\
 {									\
-- 
cgit v1.2.3


From 6309727ef27162deabd5c095c11af24970fba5a2 Mon Sep 17 00:00:00 2001
From: Andreas Gruenbacher <agruenba@redhat.com>
Date: Fri, 8 Sep 2023 01:40:48 +0200
Subject: kthread: add kthread_stop_put

Add a kthread_stop_put() helper that stops a thread and puts its task
struct.  Use it to replace the various instances of kthread_stop()
followed by put_task_struct().

Remove the kthread_stop_put() macro in usbip that is similar but doesn't
return the result of kthread_stop().

[agruenba@redhat.com: fix kerneldoc comment]
  Link: https://lkml.kernel.org/r/20230911111730.2565537-1-agruenba@redhat.com
[akpm@linux-foundation.org: document kthread_stop_put()'s argument]
Link: https://lkml.kernel.org/r/20230907234048.2499820-1-agruenba@redhat.com
Signed-off-by: Andreas Gruenbacher <agruenba@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 drivers/accel/ivpu/ivpu_job.c              |  3 +--
 drivers/dma-buf/st-dma-fence-chain.c       | 12 ++++--------
 drivers/dma-buf/st-dma-fence.c             |  4 +---
 drivers/gpu/drm/i915/gt/selftest_migrate.c |  4 +---
 drivers/net/xen-netback/interface.c        |  3 +--
 drivers/usb/usbip/usbip_common.h           |  6 ------
 fs/gfs2/ops_fstype.c                       |  9 +++------
 include/linux/kthread.h                    |  1 +
 kernel/irq/manage.c                        | 15 +++++----------
 kernel/kthread.c                           | 18 ++++++++++++++++++
 kernel/smpboot.c                           |  3 +--
 mm/damon/core.c                            |  3 +--
 net/core/pktgen.c                          |  3 +--
 13 files changed, 38 insertions(+), 46 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/accel/ivpu/ivpu_job.c b/drivers/accel/ivpu/ivpu_job.c
index de9e69f70af7..76f468c9f761 100644
--- a/drivers/accel/ivpu/ivpu_job.c
+++ b/drivers/accel/ivpu/ivpu_job.c
@@ -618,6 +618,5 @@ int ivpu_job_done_thread_init(struct ivpu_device *vdev)
 
 void ivpu_job_done_thread_fini(struct ivpu_device *vdev)
 {
-	kthread_stop(vdev->job_done_thread);
-	put_task_struct(vdev->job_done_thread);
+	kthread_stop_put(vdev->job_done_thread);
 }
diff --git a/drivers/dma-buf/st-dma-fence-chain.c b/drivers/dma-buf/st-dma-fence-chain.c
index c0979c8049b5..9c2a0c082a76 100644
--- a/drivers/dma-buf/st-dma-fence-chain.c
+++ b/drivers/dma-buf/st-dma-fence-chain.c
@@ -476,10 +476,9 @@ static int find_race(void *arg)
 	for (i = 0; i < ncpus; i++) {
 		int ret;
 
-		ret = kthread_stop(threads[i]);
+		ret = kthread_stop_put(threads[i]);
 		if (ret && !err)
 			err = ret;
-		put_task_struct(threads[i]);
 	}
 	kfree(threads);
 
@@ -591,8 +590,7 @@ static int wait_forward(void *arg)
 	for (i = 0; i < fc.chain_length; i++)
 		dma_fence_signal(fc.fences[i]);
 
-	err = kthread_stop(tsk);
-	put_task_struct(tsk);
+	err = kthread_stop_put(tsk);
 
 err:
 	fence_chains_fini(&fc);
@@ -621,8 +619,7 @@ static int wait_backward(void *arg)
 	for (i = fc.chain_length; i--; )
 		dma_fence_signal(fc.fences[i]);
 
-	err = kthread_stop(tsk);
-	put_task_struct(tsk);
+	err = kthread_stop_put(tsk);
 
 err:
 	fence_chains_fini(&fc);
@@ -669,8 +666,7 @@ static int wait_random(void *arg)
 	for (i = 0; i < fc.chain_length; i++)
 		dma_fence_signal(fc.fences[i]);
 
-	err = kthread_stop(tsk);
-	put_task_struct(tsk);
+	err = kthread_stop_put(tsk);
 
 err:
 	fence_chains_fini(&fc);
diff --git a/drivers/dma-buf/st-dma-fence.c b/drivers/dma-buf/st-dma-fence.c
index fb6e0a6ae2c9..b7c6f7ea9e0c 100644
--- a/drivers/dma-buf/st-dma-fence.c
+++ b/drivers/dma-buf/st-dma-fence.c
@@ -548,11 +548,9 @@ static int race_signal_callback(void *arg)
 		for (i = 0; i < ARRAY_SIZE(t); i++) {
 			int err;
 
-			err = kthread_stop(t[i].task);
+			err = kthread_stop_put(t[i].task);
 			if (err && !ret)
 				ret = err;
-
-			put_task_struct(t[i].task);
 		}
 	}
 
diff --git a/drivers/gpu/drm/i915/gt/selftest_migrate.c b/drivers/gpu/drm/i915/gt/selftest_migrate.c
index 3def5ca72dec..0fb07f073baa 100644
--- a/drivers/gpu/drm/i915/gt/selftest_migrate.c
+++ b/drivers/gpu/drm/i915/gt/selftest_migrate.c
@@ -719,11 +719,9 @@ static int threaded_migrate(struct intel_migrate *migrate,
 		if (IS_ERR_OR_NULL(tsk))
 			continue;
 
-		status = kthread_stop(tsk);
+		status = kthread_stop_put(tsk);
 		if (status && !err)
 			err = status;
-
-		put_task_struct(tsk);
 	}
 
 	kfree(thread);
diff --git a/drivers/net/xen-netback/interface.c b/drivers/net/xen-netback/interface.c
index f3f2c07423a6..33c8143619f0 100644
--- a/drivers/net/xen-netback/interface.c
+++ b/drivers/net/xen-netback/interface.c
@@ -672,8 +672,7 @@ err:
 static void xenvif_disconnect_queue(struct xenvif_queue *queue)
 {
 	if (queue->task) {
-		kthread_stop(queue->task);
-		put_task_struct(queue->task);
+		kthread_stop_put(queue->task);
 		queue->task = NULL;
 	}
 
diff --git a/drivers/usb/usbip/usbip_common.h b/drivers/usb/usbip/usbip_common.h
index d8cbd2dfc2c2..282efca64a01 100644
--- a/drivers/usb/usbip/usbip_common.h
+++ b/drivers/usb/usbip/usbip_common.h
@@ -298,12 +298,6 @@ struct usbip_device {
 	__k;								   \
 })
 
-#define kthread_stop_put(k)		\
-	do {				\
-		kthread_stop(k);	\
-		put_task_struct(k);	\
-	} while (0)
-
 /* usbip_common.c */
 void usbip_dump_urb(struct urb *purb);
 void usbip_dump_header(struct usbip_header *pdu);
diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c
index 33ca04733e93..ecf789b7168c 100644
--- a/fs/gfs2/ops_fstype.c
+++ b/fs/gfs2/ops_fstype.c
@@ -1126,8 +1126,7 @@ static int init_threads(struct gfs2_sbd *sdp)
 	return 0;
 
 fail:
-	kthread_stop(sdp->sd_logd_process);
-	put_task_struct(sdp->sd_logd_process);
+	kthread_stop_put(sdp->sd_logd_process);
 	sdp->sd_logd_process = NULL;
 	return error;
 }
@@ -1135,13 +1134,11 @@ fail:
 void gfs2_destroy_threads(struct gfs2_sbd *sdp)
 {
 	if (sdp->sd_logd_process) {
-		kthread_stop(sdp->sd_logd_process);
-		put_task_struct(sdp->sd_logd_process);
+		kthread_stop_put(sdp->sd_logd_process);
 		sdp->sd_logd_process = NULL;
 	}
 	if (sdp->sd_quotad_process) {
-		kthread_stop(sdp->sd_quotad_process);
-		put_task_struct(sdp->sd_quotad_process);
+		kthread_stop_put(sdp->sd_quotad_process);
 		sdp->sd_quotad_process = NULL;
 	}
 }
diff --git a/include/linux/kthread.h b/include/linux/kthread.h
index 2c30ade43bc8..b11f53c1ba2e 100644
--- a/include/linux/kthread.h
+++ b/include/linux/kthread.h
@@ -86,6 +86,7 @@ void free_kthread_struct(struct task_struct *k);
 void kthread_bind(struct task_struct *k, unsigned int cpu);
 void kthread_bind_mask(struct task_struct *k, const struct cpumask *mask);
 int kthread_stop(struct task_struct *k);
+int kthread_stop_put(struct task_struct *k);
 bool kthread_should_stop(void);
 bool kthread_should_park(void);
 bool kthread_should_stop_or_park(void);
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index d309ba84e08a..1782f90cd8c6 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -1852,15 +1852,13 @@ out_thread:
 		struct task_struct *t = new->thread;
 
 		new->thread = NULL;
-		kthread_stop(t);
-		put_task_struct(t);
+		kthread_stop_put(t);
 	}
 	if (new->secondary && new->secondary->thread) {
 		struct task_struct *t = new->secondary->thread;
 
 		new->secondary->thread = NULL;
-		kthread_stop(t);
-		put_task_struct(t);
+		kthread_stop_put(t);
 	}
 out_mput:
 	module_put(desc->owner);
@@ -1971,12 +1969,9 @@ static struct irqaction *__free_irq(struct irq_desc *desc, void *dev_id)
 	 * the same bit to a newly requested action.
 	 */
 	if (action->thread) {
-		kthread_stop(action->thread);
-		put_task_struct(action->thread);
-		if (action->secondary && action->secondary->thread) {
-			kthread_stop(action->secondary->thread);
-			put_task_struct(action->secondary->thread);
-		}
+		kthread_stop_put(action->thread);
+		if (action->secondary && action->secondary->thread)
+			kthread_stop_put(action->secondary->thread);
 	}
 
 	/* Last action releases resources */
diff --git a/kernel/kthread.c b/kernel/kthread.c
index 1eea53050bab..290cbc845225 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -715,6 +715,24 @@ int kthread_stop(struct task_struct *k)
 }
 EXPORT_SYMBOL(kthread_stop);
 
+/**
+ * kthread_stop_put - stop a thread and put its task struct
+ * @k: thread created by kthread_create().
+ *
+ * Stops a thread created by kthread_create() and put its task_struct.
+ * Only use when holding an extra task struct reference obtained by
+ * calling get_task_struct().
+ */
+int kthread_stop_put(struct task_struct *k)
+{
+	int ret;
+
+	ret = kthread_stop(k);
+	put_task_struct(k);
+	return ret;
+}
+EXPORT_SYMBOL(kthread_stop_put);
+
 int kthreadd(void *unused)
 {
 	struct task_struct *tsk = current;
diff --git a/kernel/smpboot.c b/kernel/smpboot.c
index f47d8f375946..1992b62e980b 100644
--- a/kernel/smpboot.c
+++ b/kernel/smpboot.c
@@ -272,8 +272,7 @@ static void smpboot_destroy_threads(struct smp_hotplug_thread *ht)
 		struct task_struct *tsk = *per_cpu_ptr(ht->store, cpu);
 
 		if (tsk) {
-			kthread_stop(tsk);
-			put_task_struct(tsk);
+			kthread_stop_put(tsk);
 			*per_cpu_ptr(ht->store, cpu) = NULL;
 		}
 	}
diff --git a/mm/damon/core.c b/mm/damon/core.c
index bcd2bd9d6c10..2f54f153d7f5 100644
--- a/mm/damon/core.c
+++ b/mm/damon/core.c
@@ -699,8 +699,7 @@ static int __damon_stop(struct damon_ctx *ctx)
 	if (tsk) {
 		get_task_struct(tsk);
 		mutex_unlock(&ctx->kdamond_lock);
-		kthread_stop(tsk);
-		put_task_struct(tsk);
+		kthread_stop_put(tsk);
 		return 0;
 	}
 	mutex_unlock(&ctx->kdamond_lock);
diff --git a/net/core/pktgen.c b/net/core/pktgen.c
index f56b8d697014..826250a0f5b1 100644
--- a/net/core/pktgen.c
+++ b/net/core/pktgen.c
@@ -3982,8 +3982,7 @@ static void __net_exit pg_net_exit(struct net *net)
 	list_for_each_safe(q, n, &list) {
 		t = list_entry(q, struct pktgen_thread, th_list);
 		list_del(&t->th_list);
-		kthread_stop(t->tsk);
-		put_task_struct(t->tsk);
+		kthread_stop_put(t->tsk);
 		kfree(t);
 	}
 
-- 
cgit v1.2.3


From 5e57418a2031cd5e1863efdf3d7447a16a368172 Mon Sep 17 00:00:00 2001
From: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Date: Mon, 11 Sep 2023 18:49:13 +0300
Subject: minmax: deduplicate __unconst_integer_typeof()

It appears that compiler_types.h already have an implementation of the
__unconst_integer_typeof() called __unqual_scalar_typeof().  Use it
instead of the copy.

Link: https://lkml.kernel.org/r/20230911154913.4176033-1-andriy.shevchenko@linux.intel.com
Signed-off-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Acked-by: Herve Codina <herve.codina@bootlin.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/minmax.h | 26 +++-----------------------
 1 file changed, 3 insertions(+), 23 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/minmax.h b/include/linux/minmax.h
index 83aebc244cba..69bbe987fa87 100644
--- a/include/linux/minmax.h
+++ b/include/linux/minmax.h
@@ -2,6 +2,7 @@
 #ifndef _LINUX_MINMAX_H
 #define _LINUX_MINMAX_H
 
+#include <linux/compiler_types.h>
 #include <linux/const.h>
 #include <linux/types.h>
 
@@ -134,27 +135,6 @@
  */
 #define max_t(type, x, y)	__careful_cmp((type)(x), (type)(y), >)
 
-/*
- * Remove a const qualifier from integer types
- * _Generic(foo, type-name: association, ..., default: association) performs a
- * comparison against the foo type (not the qualified type).
- * Do not use the const keyword in the type-name as it will not match the
- * unqualified type of foo.
- */
-#define __unconst_integer_type_cases(type)	\
-	unsigned type:  (unsigned type)0,	\
-	signed type:    (signed type)0
-
-#define __unconst_integer_typeof(x) typeof(			\
-	_Generic((x),						\
-		char: (char)0,					\
-		__unconst_integer_type_cases(char),		\
-		__unconst_integer_type_cases(short),		\
-		__unconst_integer_type_cases(int),		\
-		__unconst_integer_type_cases(long),		\
-		__unconst_integer_type_cases(long long),	\
-		default: (x)))
-
 /*
  * Do not check the array parameter using __must_be_array().
  * In the following legit use-case where the "array" passed is a simple pointer,
@@ -169,13 +149,13 @@
  * 'int *buff' and 'int buff[N]' types.
  *
  * The array can be an array of const items.
- * typeof() keeps the const qualifier. Use __unconst_integer_typeof() in order
+ * typeof() keeps the const qualifier. Use __unqual_scalar_typeof() in order
  * to discard the const qualifier for the __element variable.
  */
 #define __minmax_array(op, array, len) ({				\
 	typeof(&(array)[0]) __array = (array);				\
 	typeof(len) __len = (len);					\
-	__unconst_integer_typeof(__array[0]) __element = __array[--__len]; \
+	__unqual_scalar_typeof(__array[0]) __element = __array[--__len];\
 	while (__len--)							\
 		__element = op(__element, __array[__len]);		\
 	__element; })
-- 
cgit v1.2.3


From f6e9d38f8eb00ac8b52e6d15f6aa9bcecacb081b Mon Sep 17 00:00:00 2001
From: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Date: Tue, 12 Sep 2023 12:23:55 +0300
Subject: minmax: fix header inclusions

BUILD_BUG_ON*() macros are defined in build_bug.h.  Include it.  Replace
compiler_types.h by compiler.h, which provides the former, to have a
definition of the __UNIQUE_ID().

Link: https://lkml.kernel.org/r/20230912092355.79280-1-andriy.shevchenko@linux.intel.com
Signed-off-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Reviewed-by: Herve Codina <herve.codina@bootlin.com>
Cc: Rasmus Villemoes <linux@rasmusvillemoes.dk>

Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/minmax.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/minmax.h b/include/linux/minmax.h
index 69bbe987fa87..ca69abd6151e 100644
--- a/include/linux/minmax.h
+++ b/include/linux/minmax.h
@@ -2,7 +2,8 @@
 #ifndef _LINUX_MINMAX_H
 #define _LINUX_MINMAX_H
 
-#include <linux/compiler_types.h>
+#include <linux/build_bug.h>
+#include <linux/compiler.h>
 #include <linux/const.h>
 #include <linux/types.h>
 
-- 
cgit v1.2.3


From a9e1a3d84e4a0ea560ed4d84c28d06dbfdffed22 Mon Sep 17 00:00:00 2001
From: Baoquan He <bhe@redhat.com>
Date: Thu, 14 Sep 2023 11:31:35 +0800
Subject: crash_core: change the prototype of function parse_crashkernel()

Add two parameters 'low_size' and 'high' to function parse_crashkernel(),
later crashkernel=,high|low parsing will be added.  Make adjustments in
all call sites of parse_crashkernel() in arch.

Link: https://lkml.kernel.org/r/20230914033142.676708-3-bhe@redhat.com
Signed-off-by: Baoquan He <bhe@redhat.com>
Reviewed-by: Zhen Lei <thunder.leizhen@huawei.com>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Chen Jiahao <chenjiahao16@huawei.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 arch/arm/kernel/setup.c              |  3 ++-
 arch/arm64/mm/init.c                 |  2 +-
 arch/ia64/kernel/setup.c             |  2 +-
 arch/loongarch/kernel/setup.c        |  4 +++-
 arch/mips/kernel/setup.c             |  3 ++-
 arch/powerpc/kernel/fadump.c         |  2 +-
 arch/powerpc/kexec/core.c            |  2 +-
 arch/powerpc/mm/nohash/kaslr_booke.c |  2 +-
 arch/riscv/mm/init.c                 |  2 +-
 arch/s390/kernel/setup.c             |  4 ++--
 arch/sh/kernel/machine_kexec.c       |  2 +-
 arch/x86/kernel/setup.c              |  3 ++-
 include/linux/crash_core.h           |  3 ++-
 kernel/crash_core.c                  | 15 ++++++++++++---
 14 files changed, 32 insertions(+), 17 deletions(-)

(limited to 'include/linux')

diff --git a/arch/arm/kernel/setup.c b/arch/arm/kernel/setup.c
index c66b560562b3..e2bb7afd0683 100644
--- a/arch/arm/kernel/setup.c
+++ b/arch/arm/kernel/setup.c
@@ -1010,7 +1010,8 @@ static void __init reserve_crashkernel(void)
 
 	total_mem = get_total_mem();
 	ret = parse_crashkernel(boot_command_line, total_mem,
-				&crash_size, &crash_base);
+				&crash_size, &crash_base,
+				NULL, NULL);
 	/* invalid value specified or crashkernel=0 */
 	if (ret || !crash_size)
 		return;
diff --git a/arch/arm64/mm/init.c b/arch/arm64/mm/init.c
index 8a0f8604348b..801c59c39a8f 100644
--- a/arch/arm64/mm/init.c
+++ b/arch/arm64/mm/init.c
@@ -142,7 +142,7 @@ static void __init reserve_crashkernel(void)
 
 	/* crashkernel=X[@offset] */
 	ret = parse_crashkernel(cmdline, memblock_phys_mem_size(),
-				&crash_size, &crash_base);
+				&crash_size, &crash_base, NULL, NULL);
 	if (ret == -ENOENT) {
 		ret = parse_crashkernel_high(cmdline, 0, &crash_size, &crash_base);
 		if (ret || !crash_size)
diff --git a/arch/ia64/kernel/setup.c b/arch/ia64/kernel/setup.c
index 5a55ac82c13a..4faea2d2cf07 100644
--- a/arch/ia64/kernel/setup.c
+++ b/arch/ia64/kernel/setup.c
@@ -277,7 +277,7 @@ static void __init setup_crashkernel(unsigned long total, int *n)
 	int ret;
 
 	ret = parse_crashkernel(boot_command_line, total,
-			&size, &base);
+			&size, &base, NULL, NULL);
 	if (ret == 0 && size > 0) {
 		if (!base) {
 			sort_regions(rsvd_region, *n);
diff --git a/arch/loongarch/kernel/setup.c b/arch/loongarch/kernel/setup.c
index 7783f0a3d742..4de32b07c0dc 100644
--- a/arch/loongarch/kernel/setup.c
+++ b/arch/loongarch/kernel/setup.c
@@ -267,7 +267,9 @@ static void __init arch_parse_crashkernel(void)
 	unsigned long long crash_base, crash_size;
 
 	total_mem = memblock_phys_mem_size();
-	ret = parse_crashkernel(boot_command_line, total_mem, &crash_size, &crash_base);
+	ret = parse_crashkernel(boot_command_line, total_mem,
+				&crash_size, &crash_base,
+				NULL, NULL);
 	if (ret < 0 || crash_size <= 0)
 		return;
 
diff --git a/arch/mips/kernel/setup.c b/arch/mips/kernel/setup.c
index cb871eb784a7..08321c945ac4 100644
--- a/arch/mips/kernel/setup.c
+++ b/arch/mips/kernel/setup.c
@@ -460,7 +460,8 @@ static void __init mips_parse_crashkernel(void)
 
 	total_mem = memblock_phys_mem_size();
 	ret = parse_crashkernel(boot_command_line, total_mem,
-				&crash_size, &crash_base);
+				&crash_size, &crash_base,
+				NULL, NULL);
 	if (ret != 0 || crash_size <= 0)
 		return;
 
diff --git a/arch/powerpc/kernel/fadump.c b/arch/powerpc/kernel/fadump.c
index 3ff2da7b120b..d14eda1e8589 100644
--- a/arch/powerpc/kernel/fadump.c
+++ b/arch/powerpc/kernel/fadump.c
@@ -313,7 +313,7 @@ static __init u64 fadump_calculate_reserve_size(void)
 	 * memory at a predefined offset.
 	 */
 	ret = parse_crashkernel(boot_command_line, memblock_phys_mem_size(),
-				&size, &base);
+				&size, &base, NULL, NULL);
 	if (ret == 0 && size > 0) {
 		unsigned long max_size;
 
diff --git a/arch/powerpc/kexec/core.c b/arch/powerpc/kexec/core.c
index de64c7962991..9346c960b296 100644
--- a/arch/powerpc/kexec/core.c
+++ b/arch/powerpc/kexec/core.c
@@ -109,7 +109,7 @@ void __init reserve_crashkernel(void)
 	total_mem_sz = memory_limit ? memory_limit : memblock_phys_mem_size();
 	/* use common parsing */
 	ret = parse_crashkernel(boot_command_line, total_mem_sz,
-			&crash_size, &crash_base);
+			&crash_size, &crash_base, NULL, NULL);
 	if (ret == 0 && crash_size > 0) {
 		crashk_res.start = crash_base;
 		crashk_res.end = crash_base + crash_size - 1;
diff --git a/arch/powerpc/mm/nohash/kaslr_booke.c b/arch/powerpc/mm/nohash/kaslr_booke.c
index 2fb3edafe9ab..b4f2786a7d2b 100644
--- a/arch/powerpc/mm/nohash/kaslr_booke.c
+++ b/arch/powerpc/mm/nohash/kaslr_booke.c
@@ -178,7 +178,7 @@ static void __init get_crash_kernel(void *fdt, unsigned long size)
 	int ret;
 
 	ret = parse_crashkernel(boot_command_line, size, &crash_size,
-				&crash_base);
+				&crash_base, NULL, NULL);
 	if (ret != 0 || crash_size == 0)
 		return;
 	if (crash_base == 0)
diff --git a/arch/riscv/mm/init.c b/arch/riscv/mm/init.c
index 0798bd861dcb..9fe448900059 100644
--- a/arch/riscv/mm/init.c
+++ b/arch/riscv/mm/init.c
@@ -1388,7 +1388,7 @@ static void __init reserve_crashkernel(void)
 	}
 
 	ret = parse_crashkernel(cmdline, memblock_phys_mem_size(),
-				&crash_size, &crash_base);
+				&crash_size, &crash_base, NULL, NULL);
 	if (ret == -ENOENT) {
 		/* Fallback to crashkernel=X,[high,low] */
 		ret = parse_crashkernel_high(cmdline, 0, &crash_size, &crash_base);
diff --git a/arch/s390/kernel/setup.c b/arch/s390/kernel/setup.c
index de6ad0fb2328..e555b576d3c8 100644
--- a/arch/s390/kernel/setup.c
+++ b/arch/s390/kernel/setup.c
@@ -625,8 +625,8 @@ static void __init reserve_crashkernel(void)
 	phys_addr_t low, high;
 	int rc;
 
-	rc = parse_crashkernel(boot_command_line, ident_map_size, &crash_size,
-			       &crash_base);
+	rc = parse_crashkernel(boot_command_line, ident_map_size,
+			       &crash_size, &crash_base, NULL, NULL);
 
 	crash_base = ALIGN(crash_base, KEXEC_CRASH_MEM_ALIGN);
 	crash_size = ALIGN(crash_size, KEXEC_CRASH_MEM_ALIGN);
diff --git a/arch/sh/kernel/machine_kexec.c b/arch/sh/kernel/machine_kexec.c
index 223c14f44af7..fa3a7b36190a 100644
--- a/arch/sh/kernel/machine_kexec.c
+++ b/arch/sh/kernel/machine_kexec.c
@@ -154,7 +154,7 @@ void __init reserve_crashkernel(void)
 	int ret;
 
 	ret = parse_crashkernel(boot_command_line, memblock_phys_mem_size(),
-			&crash_size, &crash_base);
+			&crash_size, &crash_base, NULL, NULL);
 	if (ret == 0 && crash_size > 0) {
 		crashk_res.start = crash_base;
 		crashk_res.end = crash_base + crash_size - 1;
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index b098b1fa2470..655c04812905 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -553,7 +553,8 @@ static void __init reserve_crashkernel(void)
 	total_mem = memblock_phys_mem_size();
 
 	/* crashkernel=XM */
-	ret = parse_crashkernel(boot_command_line, total_mem, &crash_size, &crash_base);
+	ret = parse_crashkernel(boot_command_line, total_mem,
+				&crash_size, &crash_base, NULL, NULL);
 	if (ret != 0 || crash_size <= 0) {
 		/* crashkernel=X,high */
 		ret = parse_crashkernel_high(boot_command_line, total_mem,
diff --git a/include/linux/crash_core.h b/include/linux/crash_core.h
index 0c06561bf5ff..6156355ef831 100644
--- a/include/linux/crash_core.h
+++ b/include/linux/crash_core.h
@@ -80,7 +80,8 @@ Elf_Word *append_elf_note(Elf_Word *buf, char *name, unsigned int type,
 void final_note(Elf_Word *buf);
 
 int __init parse_crashkernel(char *cmdline, unsigned long long system_ram,
-		unsigned long long *crash_size, unsigned long long *crash_base);
+		unsigned long long *crash_size, unsigned long long *crash_base,
+		unsigned long long *low_size, bool *high);
 int parse_crashkernel_high(char *cmdline, unsigned long long system_ram,
 		unsigned long long *crash_size, unsigned long long *crash_base);
 int parse_crashkernel_low(char *cmdline, unsigned long long system_ram,
diff --git a/kernel/crash_core.c b/kernel/crash_core.c
index 507113932aa9..33ced5b5ed4e 100644
--- a/kernel/crash_core.c
+++ b/kernel/crash_core.c
@@ -287,10 +287,19 @@ static int __init __parse_crashkernel(char *cmdline,
 int __init parse_crashkernel(char *cmdline,
 			     unsigned long long system_ram,
 			     unsigned long long *crash_size,
-			     unsigned long long *crash_base)
+			     unsigned long long *crash_base,
+			     unsigned long long *low_size,
+			     bool *high)
 {
-	return __parse_crashkernel(cmdline, system_ram, crash_size, crash_base,
-				NULL);
+	int ret;
+
+	/* crashkernel=X[@offset] */
+	ret = __parse_crashkernel(cmdline, system_ram, crash_size,
+				crash_base, NULL);
+	if (!high)
+		return ret;
+
+	return 0;
 }
 
 int __init parse_crashkernel_high(char *cmdline,
-- 
cgit v1.2.3


From 70916e9c8d9f1a286c99727072b22e395097909f Mon Sep 17 00:00:00 2001
From: Baoquan He <bhe@redhat.com>
Date: Thu, 14 Sep 2023 11:31:36 +0800
Subject: crash_core: change parse_crashkernel() to support
 crashkernel=,high|low parsing

Now parse_crashkernel() is a real entry point for all kinds of crahskernel
parsing on any architecture.

And wrap the crahskernel=,high|low handling inside
CONFIG_ARCH_HAS_GENERIC_CRASHKERNEL_RESERVATION ifdeffery scope.

Link: https://lkml.kernel.org/r/20230914033142.676708-4-bhe@redhat.com
Signed-off-by: Baoquan He <bhe@redhat.com>
Reviewed-by: Zhen Lei <thunder.leizhen@huawei.com>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Chen Jiahao <chenjiahao16@huawei.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/crash_core.h |  6 ++++++
 kernel/crash_core.c        | 36 +++++++++++++++++++++++++++++++++---
 2 files changed, 39 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/crash_core.h b/include/linux/crash_core.h
index 6156355ef831..d8050a7eab01 100644
--- a/include/linux/crash_core.h
+++ b/include/linux/crash_core.h
@@ -79,6 +79,12 @@ Elf_Word *append_elf_note(Elf_Word *buf, char *name, unsigned int type,
 			  void *data, size_t data_len);
 void final_note(Elf_Word *buf);
 
+#ifdef CONFIG_ARCH_HAS_GENERIC_CRASHKERNEL_RESERVATION
+#ifndef DEFAULT_CRASH_KERNEL_LOW_SIZE
+#define DEFAULT_CRASH_KERNEL_LOW_SIZE  (128UL << 20)
+#endif
+#endif
+
 int __init parse_crashkernel(char *cmdline, unsigned long long system_ram,
 		unsigned long long *crash_size, unsigned long long *crash_base,
 		unsigned long long *low_size, bool *high);
diff --git a/kernel/crash_core.c b/kernel/crash_core.c
index 33ced5b5ed4e..99a243540a35 100644
--- a/kernel/crash_core.c
+++ b/kernel/crash_core.c
@@ -283,6 +283,9 @@ static int __init __parse_crashkernel(char *cmdline,
 /*
  * That function is the entry point for command line parsing and should be
  * called from the arch-specific code.
+ *
+ * If crashkernel=,high|low is supported on architecture, non-NULL values
+ * should be passed to parameters 'low_size' and 'high'.
  */
 int __init parse_crashkernel(char *cmdline,
 			     unsigned long long system_ram,
@@ -296,10 +299,37 @@ int __init parse_crashkernel(char *cmdline,
 	/* crashkernel=X[@offset] */
 	ret = __parse_crashkernel(cmdline, system_ram, crash_size,
 				crash_base, NULL);
-	if (!high)
-		return ret;
+#ifdef CONFIG_ARCH_HAS_GENERIC_CRASHKERNEL_RESERVATION
+	/*
+	 * If non-NULL 'high' passed in and no normal crashkernel
+	 * setting detected, try parsing crashkernel=,high|low.
+	 */
+	if (high && ret == -ENOENT) {
+		ret = __parse_crashkernel(cmdline, 0, crash_size,
+				crash_base, suffix_tbl[SUFFIX_HIGH]);
+		if (ret || !*crash_size)
+			return -EINVAL;
 
-	return 0;
+		/*
+		 * crashkernel=Y,low can be specified or not, but invalid value
+		 * is not allowed.
+		 */
+		ret = __parse_crashkernel(cmdline, 0, low_size,
+				crash_base, suffix_tbl[SUFFIX_LOW]);
+		if (ret == -ENOENT) {
+			*low_size = DEFAULT_CRASH_KERNEL_LOW_SIZE;
+			ret = 0;
+		} else if (ret) {
+			return ret;
+		}
+
+		*high = true;
+	}
+#endif
+	if (!*crash_size)
+		ret = -EINVAL;
+
+	return ret;
 }
 
 int __init parse_crashkernel_high(char *cmdline,
-- 
cgit v1.2.3


From 0ab97169aa0517079b22c2e64192906caa5dc6d5 Mon Sep 17 00:00:00 2001
From: Baoquan He <bhe@redhat.com>
Date: Thu, 14 Sep 2023 11:31:37 +0800
Subject: crash_core: add generic function to do reservation

In architecture like x86_64, arm64 and riscv, they have vast virtual
address space and usually have huge physical memory RAM.  Their
crashkernel reservation doesn't have to be limited under 4G RAM, but can
be extended to the whole physical memory via crashkernel=,high support.

Now add function reserve_crashkernel_generic() to reserve crashkernel
memory if users specify any case of kernel pamameters, like
crashkernel=xM[@offset] or crashkernel=,high|low.

This is preparation to simplify code of crashkernel=,high support in
architecutures.

Link: https://lkml.kernel.org/r/20230914033142.676708-5-bhe@redhat.com
Signed-off-by: Baoquan He <bhe@redhat.com>
Reviewed-by: Zhen Lei <thunder.leizhen@huawei.com>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Chen Jiahao <chenjiahao16@huawei.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/crash_core.h |  28 ++++++++++++
 kernel/crash_core.c        | 107 ++++++++++++++++++++++++++++++++++++++++++++-
 2 files changed, 134 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/crash_core.h b/include/linux/crash_core.h
index d8050a7eab01..4dbd6565e0ff 100644
--- a/include/linux/crash_core.h
+++ b/include/linux/crash_core.h
@@ -93,6 +93,34 @@ int parse_crashkernel_high(char *cmdline, unsigned long long system_ram,
 int parse_crashkernel_low(char *cmdline, unsigned long long system_ram,
 		unsigned long long *crash_size, unsigned long long *crash_base);
 
+#ifdef CONFIG_ARCH_HAS_GENERIC_CRASHKERNEL_RESERVATION
+#ifndef DEFAULT_CRASH_KERNEL_LOW_SIZE
+#define DEFAULT_CRASH_KERNEL_LOW_SIZE	(128UL << 20)
+#endif
+#ifndef CRASH_ALIGN
+#define CRASH_ALIGN			SZ_2M
+#endif
+#ifndef CRASH_ADDR_LOW_MAX
+#define CRASH_ADDR_LOW_MAX		SZ_4G
+#endif
+#ifndef CRASH_ADDR_HIGH_MAX
+#define CRASH_ADDR_HIGH_MAX		memblock_end_of_DRAM()
+#endif
+
+void __init reserve_crashkernel_generic(char *cmdline,
+		unsigned long long crash_size,
+		unsigned long long crash_base,
+		unsigned long long crash_low_size,
+		bool high);
+#else
+static inline void __init reserve_crashkernel_generic(char *cmdline,
+		unsigned long long crash_size,
+		unsigned long long crash_base,
+		unsigned long long crash_low_size,
+		bool high)
+{}
+#endif
+
 /* Alignment required for elf header segment */
 #define ELF_CORE_HEADER_ALIGN   4096
 
diff --git a/kernel/crash_core.c b/kernel/crash_core.c
index 99a243540a35..72e358197b52 100644
--- a/kernel/crash_core.c
+++ b/kernel/crash_core.c
@@ -5,7 +5,6 @@
  */
 
 #include <linux/buildid.h>
-#include <linux/crash_core.h>
 #include <linux/init.h>
 #include <linux/utsname.h>
 #include <linux/vmalloc.h>
@@ -13,6 +12,9 @@
 #include <linux/kexec.h>
 #include <linux/memory.h>
 #include <linux/cpuhotplug.h>
+#include <linux/memblock.h>
+#include <linux/kexec.h>
+#include <linux/kmemleak.h>
 
 #include <asm/page.h>
 #include <asm/sections.h>
@@ -360,6 +362,109 @@ static int __init parse_crashkernel_dummy(char *arg)
 }
 early_param("crashkernel", parse_crashkernel_dummy);
 
+#ifdef CONFIG_ARCH_HAS_GENERIC_CRASHKERNEL_RESERVATION
+static int __init reserve_crashkernel_low(unsigned long long low_size)
+{
+#ifdef CONFIG_64BIT
+	unsigned long long low_base;
+
+	low_base = memblock_phys_alloc_range(low_size, CRASH_ALIGN, 0, CRASH_ADDR_LOW_MAX);
+	if (!low_base) {
+		pr_err("cannot allocate crashkernel low memory (size:0x%llx).\n", low_size);
+		return -ENOMEM;
+	}
+
+	pr_info("crashkernel low memory reserved: 0x%08llx - 0x%08llx (%lld MB)\n",
+		low_base, low_base + low_size, low_size >> 20);
+
+	crashk_low_res.start = low_base;
+	crashk_low_res.end   = low_base + low_size - 1;
+	insert_resource(&iomem_resource, &crashk_low_res);
+#endif
+	return 0;
+}
+
+void __init reserve_crashkernel_generic(char *cmdline,
+			     unsigned long long crash_size,
+			     unsigned long long crash_base,
+			     unsigned long long crash_low_size,
+			     bool high)
+{
+	unsigned long long search_end = CRASH_ADDR_LOW_MAX, search_base = 0;
+	bool fixed_base = false;
+
+	/* User specifies base address explicitly. */
+	if (crash_base) {
+		fixed_base = true;
+		search_base = crash_base;
+		search_end = crash_base + crash_size;
+	} else if (high) {
+		search_base = CRASH_ADDR_LOW_MAX;
+		search_end = CRASH_ADDR_HIGH_MAX;
+	}
+
+retry:
+	crash_base = memblock_phys_alloc_range(crash_size, CRASH_ALIGN,
+					       search_base, search_end);
+	if (!crash_base) {
+		/*
+		 * For crashkernel=size[KMG]@offset[KMG], print out failure
+		 * message if can't reserve the specified region.
+		 */
+		if (fixed_base) {
+			pr_warn("crashkernel reservation failed - memory is in use.\n");
+			return;
+		}
+
+		/*
+		 * For crashkernel=size[KMG], if the first attempt was for
+		 * low memory, fall back to high memory, the minimum required
+		 * low memory will be reserved later.
+		 */
+		if (!high && search_end == CRASH_ADDR_LOW_MAX) {
+			search_end = CRASH_ADDR_HIGH_MAX;
+			search_base = CRASH_ADDR_LOW_MAX;
+			crash_low_size = DEFAULT_CRASH_KERNEL_LOW_SIZE;
+			goto retry;
+		}
+
+		/*
+		 * For crashkernel=size[KMG],high, if the first attempt was
+		 * for high memory, fall back to low memory.
+		 */
+		if (high && search_end == CRASH_ADDR_HIGH_MAX) {
+			search_end = CRASH_ADDR_LOW_MAX;
+			search_base = 0;
+			goto retry;
+		}
+		pr_warn("cannot allocate crashkernel (size:0x%llx)\n",
+			crash_size);
+		return;
+	}
+
+	if ((crash_base > CRASH_ADDR_LOW_MAX) &&
+	     crash_low_size && reserve_crashkernel_low(crash_low_size)) {
+		memblock_phys_free(crash_base, crash_size);
+		return;
+	}
+
+	pr_info("crashkernel reserved: 0x%016llx - 0x%016llx (%lld MB)\n",
+		crash_base, crash_base + crash_size, crash_size >> 20);
+
+	/*
+	 * The crashkernel memory will be removed from the kernel linear
+	 * map. Inform kmemleak so that it won't try to access it.
+	 */
+	kmemleak_ignore_phys(crash_base);
+	if (crashk_low_res.end)
+		kmemleak_ignore_phys(crashk_low_res.start);
+
+	crashk_res.start = crash_base;
+	crashk_res.end = crash_base + crash_size - 1;
+	insert_resource(&iomem_resource, &crashk_res);
+}
+#endif
+
 int crash_prepare_elf64_headers(struct crash_mem *mem, int need_kernel_map,
 			  void **addr, unsigned long *sz)
 {
-- 
cgit v1.2.3


From b631b95dded5e7f007a3a79cbaf82ef50c1e2cf7 Mon Sep 17 00:00:00 2001
From: Baoquan He <bhe@redhat.com>
Date: Thu, 14 Sep 2023 11:31:38 +0800
Subject: crash_core: move crashk_*res definition into crash_core.c

Both crashk_res and crashk_low_res are used to mark the reserved
crashkernel regions in iomem_resource tree.  And later the generic
crashkernel resrvation will be added into crash_core.c.  So move
crashk_res and crashk_low_res definition into crash_core.c to avoid
compiling error if CONFIG_CRASH_CORE=on while CONFIG_KEXEC_CORE is unset.

Meanwhile include <asm/crash_core.h> in <linux/crash_core.h> if generic
reservation is needed.  In that case, <asm/crash_core.h> need be added by
ARCH.  In asm/crash_core.h, ARCH can provide its own macro definitions to
override macros in <linux/crash_core.h> if needed.  Wrap the including
into CONFIG_ARCH_HAS_GENERIC_CRASHKERNEL_RESERVATION ifdeffery scope to
avoid compiling error in other ARCH-es which don't take the generic
reservation way yet.

Link: https://lkml.kernel.org/r/20230914033142.676708-6-bhe@redhat.com
Signed-off-by: Baoquan He <bhe@redhat.com>
Reviewed-by: Zhen Lei <thunder.leizhen@huawei.com>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Chen Jiahao <chenjiahao16@huawei.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/crash_core.h |  8 ++++++++
 include/linux/kexec.h      |  4 ----
 kernel/crash_core.c        | 16 ++++++++++++++++
 kernel/kexec_core.c        | 17 -----------------
 4 files changed, 24 insertions(+), 21 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/crash_core.h b/include/linux/crash_core.h
index 4dbd6565e0ff..3c735a7e33fb 100644
--- a/include/linux/crash_core.h
+++ b/include/linux/crash_core.h
@@ -5,6 +5,14 @@
 #include <linux/linkage.h>
 #include <linux/elfcore.h>
 #include <linux/elf.h>
+#ifdef CONFIG_ARCH_HAS_GENERIC_CRASHKERNEL_RESERVATION
+#include <asm/crash_core.h>
+#endif
+
+/* Location of a reserved region to hold the crash kernel.
+ */
+extern struct resource crashk_res;
+extern struct resource crashk_low_res;
 
 #define CRASH_CORE_NOTE_NAME	   "CORE"
 #define CRASH_CORE_NOTE_HEAD_BYTES ALIGN(sizeof(struct elf_note), 4)
diff --git a/include/linux/kexec.h b/include/linux/kexec.h
index 32c78078552c..8227455192b7 100644
--- a/include/linux/kexec.h
+++ b/include/linux/kexec.h
@@ -22,10 +22,6 @@
 #include <uapi/linux/kexec.h>
 #include <linux/verification.h>
 
-/* Location of a reserved region to hold the crash kernel.
- */
-extern struct resource crashk_res;
-extern struct resource crashk_low_res;
 extern note_buf_t __percpu *crash_notes;
 
 #ifdef CONFIG_KEXEC_CORE
diff --git a/kernel/crash_core.c b/kernel/crash_core.c
index 72e358197b52..fa8808c4f00e 100644
--- a/kernel/crash_core.c
+++ b/kernel/crash_core.c
@@ -35,6 +35,22 @@ u32 *vmcoreinfo_note;
 /* trusted vmcoreinfo, e.g. we can make a copy in the crash memory */
 static unsigned char *vmcoreinfo_data_safecopy;
 
+/* Location of the reserved area for the crash kernel */
+struct resource crashk_res = {
+	.name  = "Crash kernel",
+	.start = 0,
+	.end   = 0,
+	.flags = IORESOURCE_BUSY | IORESOURCE_SYSTEM_RAM,
+	.desc  = IORES_DESC_CRASH_KERNEL
+};
+struct resource crashk_low_res = {
+	.name  = "Crash kernel",
+	.start = 0,
+	.end   = 0,
+	.flags = IORESOURCE_BUSY | IORESOURCE_SYSTEM_RAM,
+	.desc  = IORES_DESC_CRASH_KERNEL
+};
+
 /*
  * parsing the "crashkernel" commandline
  *
diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c
index 9dc728982d79..be5642a4ec49 100644
--- a/kernel/kexec_core.c
+++ b/kernel/kexec_core.c
@@ -52,23 +52,6 @@ atomic_t __kexec_lock = ATOMIC_INIT(0);
 /* Flag to indicate we are going to kexec a new kernel */
 bool kexec_in_progress = false;
 
-
-/* Location of the reserved area for the crash kernel */
-struct resource crashk_res = {
-	.name  = "Crash kernel",
-	.start = 0,
-	.end   = 0,
-	.flags = IORESOURCE_BUSY | IORESOURCE_SYSTEM_RAM,
-	.desc  = IORES_DESC_CRASH_KERNEL
-};
-struct resource crashk_low_res = {
-	.name  = "Crash kernel",
-	.start = 0,
-	.end   = 0,
-	.flags = IORESOURCE_BUSY | IORESOURCE_SYSTEM_RAM,
-	.desc  = IORES_DESC_CRASH_KERNEL
-};
-
 int kexec_should_crash(struct task_struct *p)
 {
 	/*
-- 
cgit v1.2.3


From c37e56cac3d62c69f093904afbc58fc428484d14 Mon Sep 17 00:00:00 2001
From: Baoquan He <bhe@redhat.com>
Date: Thu, 14 Sep 2023 11:31:42 +0800
Subject: crash_core.c: remove unneeded functions

So far, nobody calls functions parse_crashkernel_high() and
parse_crashkernel_low(), remove both of them.

Link: https://lkml.kernel.org/r/20230914033142.676708-10-bhe@redhat.com
Signed-off-by: Baoquan He <bhe@redhat.com>
Reviewed-by: Zhen Lei <thunder.leizhen@huawei.com>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Chen Jiahao <chenjiahao16@huawei.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/crash_core.h |  4 ----
 kernel/crash_core.c        | 18 ------------------
 2 files changed, 22 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/crash_core.h b/include/linux/crash_core.h
index 3c735a7e33fb..3426f6eef60b 100644
--- a/include/linux/crash_core.h
+++ b/include/linux/crash_core.h
@@ -96,10 +96,6 @@ void final_note(Elf_Word *buf);
 int __init parse_crashkernel(char *cmdline, unsigned long long system_ram,
 		unsigned long long *crash_size, unsigned long long *crash_base,
 		unsigned long long *low_size, bool *high);
-int parse_crashkernel_high(char *cmdline, unsigned long long system_ram,
-		unsigned long long *crash_size, unsigned long long *crash_base);
-int parse_crashkernel_low(char *cmdline, unsigned long long system_ram,
-		unsigned long long *crash_size, unsigned long long *crash_base);
 
 #ifdef CONFIG_ARCH_HAS_GENERIC_CRASHKERNEL_RESERVATION
 #ifndef DEFAULT_CRASH_KERNEL_LOW_SIZE
diff --git a/kernel/crash_core.c b/kernel/crash_core.c
index fa8808c4f00e..efe87d501c8c 100644
--- a/kernel/crash_core.c
+++ b/kernel/crash_core.c
@@ -350,24 +350,6 @@ int __init parse_crashkernel(char *cmdline,
 	return ret;
 }
 
-int __init parse_crashkernel_high(char *cmdline,
-			     unsigned long long system_ram,
-			     unsigned long long *crash_size,
-			     unsigned long long *crash_base)
-{
-	return __parse_crashkernel(cmdline, system_ram, crash_size, crash_base,
-				suffix_tbl[SUFFIX_HIGH]);
-}
-
-int __init parse_crashkernel_low(char *cmdline,
-			     unsigned long long system_ram,
-			     unsigned long long *crash_size,
-			     unsigned long long *crash_base)
-{
-	return __parse_crashkernel(cmdline, system_ram, crash_size, crash_base,
-				suffix_tbl[SUFFIX_LOW]);
-}
-
 /*
  * Add a dummy early_param handler to mark crashkernel= as a known command line
  * parameter and suppress incorrect warnings in init/main.c.
-- 
cgit v1.2.3


From 2cb1f6e9a743af58a23cf14563b5eada1e0d3fde Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <frederic@kernel.org>
Date: Fri, 8 Sep 2023 22:36:00 +0200
Subject: rcu: Conditionally build CPU-hotplug teardown callbacks

Among the three CPU-hotplug teardown RCU callbacks, two of them early
exit if CONFIG_HOTPLUG_CPU=n, and one is left unchanged. In any case
all of them have an implementation when CONFIG_HOTPLUG_CPU=n.

Align instead with the common way to deal with CPU-hotplug teardown
callbacks and provide a proper stub when they are not supported.

Reviewed-by: Paul E. McKenney <paulmck@kernel.org>
Signed-off-by: Frederic Weisbecker <frederic@kernel.org>
---
 include/linux/rcutree.h |  11 ++++-
 kernel/rcu/tree.c       | 114 +++++++++++++++++++++++-------------------------
 2 files changed, 63 insertions(+), 62 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/rcutree.h b/include/linux/rcutree.h
index 153cfc7bbffd..46875c4e9f56 100644
--- a/include/linux/rcutree.h
+++ b/include/linux/rcutree.h
@@ -110,9 +110,16 @@ void rcu_all_qs(void);
 /* RCUtree hotplug events */
 int rcutree_prepare_cpu(unsigned int cpu);
 int rcutree_online_cpu(unsigned int cpu);
-int rcutree_offline_cpu(unsigned int cpu);
+void rcu_cpu_starting(unsigned int cpu);
+
+#ifdef CONFIG_HOTPLUG_CPU
 int rcutree_dead_cpu(unsigned int cpu);
 int rcutree_dying_cpu(unsigned int cpu);
-void rcu_cpu_starting(unsigned int cpu);
+int rcutree_offline_cpu(unsigned int cpu);
+#else
+#define rcutree_dead_cpu NULL
+#define rcutree_dying_cpu NULL
+#define rcutree_offline_cpu NULL
+#endif
 
 #endif /* __LINUX_RCUTREE_H */
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 2e1e7eadf2cc..f9c6b2680cbb 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -4237,25 +4237,6 @@ static bool rcu_init_invoked(void)
 	return !!rcu_state.n_online_cpus;
 }
 
-/*
- * Near the end of the offline process.  Trace the fact that this CPU
- * is going offline.
- */
-int rcutree_dying_cpu(unsigned int cpu)
-{
-	bool blkd;
-	struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu);
-	struct rcu_node *rnp = rdp->mynode;
-
-	if (!IS_ENABLED(CONFIG_HOTPLUG_CPU))
-		return 0;
-
-	blkd = !!(READ_ONCE(rnp->qsmask) & rdp->grpmask);
-	trace_rcu_grace_period(rcu_state.name, READ_ONCE(rnp->gp_seq),
-			       blkd ? TPS("cpuofl-bgp") : TPS("cpuofl"));
-	return 0;
-}
-
 /*
  * All CPUs for the specified rcu_node structure have gone offline,
  * and all tasks that were preempted within an RCU read-side critical
@@ -4301,23 +4282,6 @@ static void rcu_cleanup_dead_rnp(struct rcu_node *rnp_leaf)
 	}
 }
 
-/*
- * The CPU has been completely removed, and some other CPU is reporting
- * this fact from process context.  Do the remainder of the cleanup.
- * There can only be one CPU hotplug operation at a time, so no need for
- * explicit locking.
- */
-int rcutree_dead_cpu(unsigned int cpu)
-{
-	if (!IS_ENABLED(CONFIG_HOTPLUG_CPU))
-		return 0;
-
-	WRITE_ONCE(rcu_state.n_online_cpus, rcu_state.n_online_cpus - 1);
-	// Stop-machine done, so allow nohz_full to disable tick.
-	tick_dep_clear(TICK_DEP_BIT_RCU);
-	return 0;
-}
-
 /*
  * Propagate ->qsinitmask bits up the rcu_node tree to account for the
  * first CPU in a given leaf rcu_node structure coming online.  The caller
@@ -4470,29 +4434,6 @@ int rcutree_online_cpu(unsigned int cpu)
 	return 0;
 }
 
-/*
- * Near the beginning of the process.  The CPU is still very much alive
- * with pretty much all services enabled.
- */
-int rcutree_offline_cpu(unsigned int cpu)
-{
-	unsigned long flags;
-	struct rcu_data *rdp;
-	struct rcu_node *rnp;
-
-	rdp = per_cpu_ptr(&rcu_data, cpu);
-	rnp = rdp->mynode;
-	raw_spin_lock_irqsave_rcu_node(rnp, flags);
-	rnp->ffmask &= ~rdp->grpmask;
-	raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
-
-	rcutree_affinity_setting(cpu, cpu);
-
-	// nohz_full CPUs need the tick for stop-machine to work quickly
-	tick_dep_set(TICK_DEP_BIT_RCU);
-	return 0;
-}
-
 /*
  * Mark the specified CPU as being online so that subsequent grace periods
  * (both expedited and normal) will wait on it.  Note that this means that
@@ -4646,7 +4587,60 @@ void rcutree_migrate_callbacks(int cpu)
 		  cpu, rcu_segcblist_n_cbs(&rdp->cblist),
 		  rcu_segcblist_first_cb(&rdp->cblist));
 }
-#endif
+
+/*
+ * The CPU has been completely removed, and some other CPU is reporting
+ * this fact from process context.  Do the remainder of the cleanup.
+ * There can only be one CPU hotplug operation at a time, so no need for
+ * explicit locking.
+ */
+int rcutree_dead_cpu(unsigned int cpu)
+{
+	WRITE_ONCE(rcu_state.n_online_cpus, rcu_state.n_online_cpus - 1);
+	// Stop-machine done, so allow nohz_full to disable tick.
+	tick_dep_clear(TICK_DEP_BIT_RCU);
+	return 0;
+}
+
+/*
+ * Near the end of the offline process.  Trace the fact that this CPU
+ * is going offline.
+ */
+int rcutree_dying_cpu(unsigned int cpu)
+{
+	bool blkd;
+	struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu);
+	struct rcu_node *rnp = rdp->mynode;
+
+	blkd = !!(READ_ONCE(rnp->qsmask) & rdp->grpmask);
+	trace_rcu_grace_period(rcu_state.name, READ_ONCE(rnp->gp_seq),
+			       blkd ? TPS("cpuofl-bgp") : TPS("cpuofl"));
+	return 0;
+}
+
+/*
+ * Near the beginning of the process.  The CPU is still very much alive
+ * with pretty much all services enabled.
+ */
+int rcutree_offline_cpu(unsigned int cpu)
+{
+	unsigned long flags;
+	struct rcu_data *rdp;
+	struct rcu_node *rnp;
+
+	rdp = per_cpu_ptr(&rcu_data, cpu);
+	rnp = rdp->mynode;
+	raw_spin_lock_irqsave_rcu_node(rnp, flags);
+	rnp->ffmask &= ~rdp->grpmask;
+	raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
+
+	rcutree_affinity_setting(cpu, cpu);
+
+	// nohz_full CPUs need the tick for stop-machine to work quickly
+	tick_dep_set(TICK_DEP_BIT_RCU);
+	return 0;
+}
+#endif /* #ifdef CONFIG_HOTPLUG_CPU */
 
 /*
  * On non-huge systems, use expedited RCU grace periods to make suspend
-- 
cgit v1.2.3


From 448e9f34d91d1a4799fdb06a93c2c24b34b6fd9d Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <frederic@kernel.org>
Date: Fri, 8 Sep 2023 22:36:01 +0200
Subject: rcu: Standardize explicit CPU-hotplug calls

rcu_report_dead() and rcutree_migrate_callbacks() have their headers in
rcupdate.h while those are pure rcutree calls, like the other CPU-hotplug
functions.

Also rcu_cpu_starting() and rcu_report_dead() have different naming
conventions while they mirror each other's effects.

Fix the headers and propose a naming that relates both functions and
aligns with the prefix of other rcutree CPU-hotplug functions.

Reviewed-by: Paul E. McKenney <paulmck@kernel.org>
Signed-off-by: Frederic Weisbecker <frederic@kernel.org>
---
 .../Expedited-Grace-Periods/Expedited-Grace-Periods.rst      |  2 +-
 Documentation/RCU/Design/Memory-Ordering/TreeRCU-gp-fqs.svg  |  4 ++--
 Documentation/RCU/Design/Memory-Ordering/TreeRCU-gp.svg      |  4 ++--
 Documentation/RCU/Design/Memory-Ordering/TreeRCU-hotplug.svg |  4 ++--
 Documentation/RCU/Design/Requirements/Requirements.rst       |  4 ++--
 arch/arm64/kernel/smp.c                                      |  4 ++--
 arch/powerpc/kernel/smp.c                                    |  2 +-
 arch/s390/kernel/smp.c                                       |  2 +-
 arch/x86/kernel/smpboot.c                                    |  2 +-
 include/linux/interrupt.h                                    |  2 +-
 include/linux/rcupdate.h                                     |  2 --
 include/linux/rcutiny.h                                      |  2 +-
 include/linux/rcutree.h                                      |  7 ++++++-
 kernel/cpu.c                                                 |  6 +++---
 kernel/rcu/tree.c                                            | 12 ++++++++----
 15 files changed, 33 insertions(+), 26 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/RCU/Design/Expedited-Grace-Periods/Expedited-Grace-Periods.rst b/Documentation/RCU/Design/Expedited-Grace-Periods/Expedited-Grace-Periods.rst
index 93d899d53258..414f8a2012d6 100644
--- a/Documentation/RCU/Design/Expedited-Grace-Periods/Expedited-Grace-Periods.rst
+++ b/Documentation/RCU/Design/Expedited-Grace-Periods/Expedited-Grace-Periods.rst
@@ -181,7 +181,7 @@ operations is carried out at several levels:
    of this wait (or series of waits, as the case may be) is to permit a
    concurrent CPU-hotplug operation to complete.
 #. In the case of RCU-sched, one of the last acts of an outgoing CPU is
-   to invoke ``rcu_report_dead()``, which reports a quiescent state for
+   to invoke ``rcutree_report_cpu_dead()``, which reports a quiescent state for
    that CPU. However, this is likely paranoia-induced redundancy.
 
 +-----------------------------------------------------------------------+
diff --git a/Documentation/RCU/Design/Memory-Ordering/TreeRCU-gp-fqs.svg b/Documentation/RCU/Design/Memory-Ordering/TreeRCU-gp-fqs.svg
index 7ddc094d7f28..d82a77d03d8c 100644
--- a/Documentation/RCU/Design/Memory-Ordering/TreeRCU-gp-fqs.svg
+++ b/Documentation/RCU/Design/Memory-Ordering/TreeRCU-gp-fqs.svg
@@ -1135,7 +1135,7 @@
        font-weight="bold"
        font-size="192"
        id="text202-7-5-3-27-6-5"
-       style="font-size:192px;font-style:normal;font-weight:bold;text-anchor:start;fill:#000000;stroke-width:0.025in;font-family:Courier">rcu_report_dead()</text>
+       style="font-size:192px;font-style:normal;font-weight:bold;text-anchor:start;fill:#000000;stroke-width:0.025in;font-family:Courier">rcutree_report_cpu_dead()</text>
     <text
        xml:space="preserve"
        x="3745.7725"
@@ -1256,7 +1256,7 @@
        font-style="normal"
        y="3679.27"
        x="-3804.9949"
-       xml:space="preserve">rcu_cpu_starting()</text>
+       xml:space="preserve">rcutree_report_cpu_starting()</text>
     <g
        style="fill:none;stroke-width:0.025in"
        id="g3107-7-5-0"
diff --git a/Documentation/RCU/Design/Memory-Ordering/TreeRCU-gp.svg b/Documentation/RCU/Design/Memory-Ordering/TreeRCU-gp.svg
index 069f6f8371c2..6e690a3be161 100644
--- a/Documentation/RCU/Design/Memory-Ordering/TreeRCU-gp.svg
+++ b/Documentation/RCU/Design/Memory-Ordering/TreeRCU-gp.svg
@@ -3274,7 +3274,7 @@
          font-weight="bold"
          font-size="192"
          id="text202-7-5-3-27-6-5"
-         style="font-size:192px;font-style:normal;font-weight:bold;text-anchor:start;fill:#000000;stroke-width:0.025in;font-family:Courier">rcu_report_dead()</text>
+         style="font-size:192px;font-style:normal;font-weight:bold;text-anchor:start;fill:#000000;stroke-width:0.025in;font-family:Courier">rcutree_report_cpu_dead()</text>
       <text
          xml:space="preserve"
          x="3745.7725"
@@ -3395,7 +3395,7 @@
          font-style="normal"
          y="3679.27"
          x="-3804.9949"
-         xml:space="preserve">rcu_cpu_starting()</text>
+         xml:space="preserve">rcutree_report_cpu_starting()</text>
       <g
          style="fill:none;stroke-width:0.025in"
          id="g3107-7-5-0"
diff --git a/Documentation/RCU/Design/Memory-Ordering/TreeRCU-hotplug.svg b/Documentation/RCU/Design/Memory-Ordering/TreeRCU-hotplug.svg
index 2c9310ba29ba..4fa7506082bf 100644
--- a/Documentation/RCU/Design/Memory-Ordering/TreeRCU-hotplug.svg
+++ b/Documentation/RCU/Design/Memory-Ordering/TreeRCU-hotplug.svg
@@ -607,7 +607,7 @@
        font-weight="bold"
        font-size="192"
        id="text202-7-5-3-27-6"
-       style="font-size:192px;font-style:normal;font-weight:bold;text-anchor:start;fill:#000000;stroke-width:0.025in;font-family:Courier">rcu_report_dead()</text>
+       style="font-size:192px;font-style:normal;font-weight:bold;text-anchor:start;fill:#000000;stroke-width:0.025in;font-family:Courier">rcutree_report_cpu_dead()</text>
     <text
        xml:space="preserve"
        x="3745.7725"
@@ -728,7 +728,7 @@
        font-style="normal"
        y="3679.27"
        x="-3804.9949"
-       xml:space="preserve">rcu_cpu_starting()</text>
+       xml:space="preserve">rcutree_report_cpu_starting()</text>
     <g
        style="fill:none;stroke-width:0.025in"
        id="g3107-7-5-0"
diff --git a/Documentation/RCU/Design/Requirements/Requirements.rst b/Documentation/RCU/Design/Requirements/Requirements.rst
index f3b605285a87..cccafdaa1f84 100644
--- a/Documentation/RCU/Design/Requirements/Requirements.rst
+++ b/Documentation/RCU/Design/Requirements/Requirements.rst
@@ -1955,12 +1955,12 @@ if offline CPUs block an RCU grace period for too long.
 
 An offline CPU's quiescent state will be reported either:
 
-1.  As the CPU goes offline using RCU's hotplug notifier (rcu_report_dead()).
+1.  As the CPU goes offline using RCU's hotplug notifier (rcutree_report_cpu_dead()).
 2.  When grace period initialization (rcu_gp_init()) detects a
     race either with CPU offlining or with a task unblocking on a leaf
     ``rcu_node`` structure whose CPUs are all offline.
 
-The CPU-online path (rcu_cpu_starting()) should never need to report
+The CPU-online path (rcutree_report_cpu_starting()) should never need to report
 a quiescent state for an offline CPU.  However, as a debugging measure,
 it does emit a warning if a quiescent state was not already reported
 for that CPU.
diff --git a/arch/arm64/kernel/smp.c b/arch/arm64/kernel/smp.c
index 8fa646c90c67..196533c362e1 100644
--- a/arch/arm64/kernel/smp.c
+++ b/arch/arm64/kernel/smp.c
@@ -215,7 +215,7 @@ asmlinkage notrace void secondary_start_kernel(void)
 	if (system_uses_irq_prio_masking())
 		init_gic_priority_masking();
 
-	rcu_cpu_starting(cpu);
+	rcutree_report_cpu_starting(cpu);
 	trace_hardirqs_off();
 
 	/*
@@ -401,7 +401,7 @@ void __noreturn cpu_die_early(void)
 
 	/* Mark this CPU absent */
 	set_cpu_present(cpu, 0);
-	rcu_report_dead();
+	rcutree_report_cpu_dead();
 
 	if (IS_ENABLED(CONFIG_HOTPLUG_CPU)) {
 		update_cpu_boot_status(CPU_KILL_ME);
diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c
index 5826f5108a12..a30d4d93ff0b 100644
--- a/arch/powerpc/kernel/smp.c
+++ b/arch/powerpc/kernel/smp.c
@@ -1629,7 +1629,7 @@ void start_secondary(void *unused)
 
 	smp_store_cpu_info(cpu);
 	set_dec(tb_ticks_per_jiffy);
-	rcu_cpu_starting(cpu);
+	rcutree_report_cpu_starting(cpu);
 	cpu_callin_map[cpu] = 1;
 
 	if (smp_ops->setup_cpu)
diff --git a/arch/s390/kernel/smp.c b/arch/s390/kernel/smp.c
index a4edb7ea66ea..214a1b67f80a 100644
--- a/arch/s390/kernel/smp.c
+++ b/arch/s390/kernel/smp.c
@@ -898,7 +898,7 @@ static void smp_start_secondary(void *cpuvoid)
 	S390_lowcore.restart_flags = 0;
 	restore_access_regs(S390_lowcore.access_regs_save_area);
 	cpu_init();
-	rcu_cpu_starting(cpu);
+	rcutree_report_cpu_starting(cpu);
 	init_cpu_timer();
 	vtime_init();
 	vdso_getcpu_init();
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index 4e45ff44aa07..4ccb76f89af8 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -288,7 +288,7 @@ static void notrace start_secondary(void *unused)
 
 	cpu_init();
 	fpu__init_cpu();
-	rcu_cpu_starting(raw_smp_processor_id());
+	rcutree_report_cpu_starting(raw_smp_processor_id());
 	x86_cpuinit.early_percpu_clock_init();
 
 	ap_starting();
diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h
index a92bce40b04b..d05e1e9a553c 100644
--- a/include/linux/interrupt.h
+++ b/include/linux/interrupt.h
@@ -566,7 +566,7 @@ enum
  *
  * _ RCU:
  * 	1) rcutree_migrate_callbacks() migrates the queue.
- * 	2) rcu_report_dead() reports the final quiescent states.
+ * 	2) rcutree_report_cpu_dead() reports the final quiescent states.
  *
  * _ IRQ_POLL: irq_poll_cpu_dead() migrates the queue
  */
diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h
index aa351ddcbe8d..f7206b2623c9 100644
--- a/include/linux/rcupdate.h
+++ b/include/linux/rcupdate.h
@@ -122,8 +122,6 @@ static inline void call_rcu_hurry(struct rcu_head *head, rcu_callback_t func)
 void rcu_init(void);
 extern int rcu_scheduler_active;
 void rcu_sched_clock_irq(int user);
-void rcu_report_dead(void);
-void rcutree_migrate_callbacks(int cpu);
 
 #ifdef CONFIG_TASKS_RCU_GENERIC
 void rcu_init_tasks_generic(void);
diff --git a/include/linux/rcutiny.h b/include/linux/rcutiny.h
index 7b949292908a..d9ac7b136aea 100644
--- a/include/linux/rcutiny.h
+++ b/include/linux/rcutiny.h
@@ -171,6 +171,6 @@ static inline void rcu_all_qs(void) { barrier(); }
 #define rcutree_offline_cpu      NULL
 #define rcutree_dead_cpu         NULL
 #define rcutree_dying_cpu        NULL
-static inline void rcu_cpu_starting(unsigned int cpu) { }
+static inline void rcutree_report_cpu_starting(unsigned int cpu) { }
 
 #endif /* __LINUX_RCUTINY_H */
diff --git a/include/linux/rcutree.h b/include/linux/rcutree.h
index 46875c4e9f56..254244202ea9 100644
--- a/include/linux/rcutree.h
+++ b/include/linux/rcutree.h
@@ -110,7 +110,7 @@ void rcu_all_qs(void);
 /* RCUtree hotplug events */
 int rcutree_prepare_cpu(unsigned int cpu);
 int rcutree_online_cpu(unsigned int cpu);
-void rcu_cpu_starting(unsigned int cpu);
+void rcutree_report_cpu_starting(unsigned int cpu);
 
 #ifdef CONFIG_HOTPLUG_CPU
 int rcutree_dead_cpu(unsigned int cpu);
@@ -122,4 +122,9 @@ int rcutree_offline_cpu(unsigned int cpu);
 #define rcutree_offline_cpu NULL
 #endif
 
+void rcutree_migrate_callbacks(int cpu);
+
+/* Called from hotplug and also arm64 early secondary boot failure */
+void rcutree_report_cpu_dead(void);
+
 #endif /* __LINUX_RCUTREE_H */
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 076e75fed8bb..2491766e1fd5 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -1388,10 +1388,10 @@ void cpuhp_report_idle_dead(void)
 	struct cpuhp_cpu_state *st = this_cpu_ptr(&cpuhp_state);
 
 	BUG_ON(st->state != CPUHP_AP_OFFLINE);
-	rcu_report_dead();
+	rcutree_report_cpu_dead();
 	st->state = CPUHP_AP_IDLE_DEAD;
 	/*
-	 * We cannot call complete after rcu_report_dead() so we delegate it
+	 * We cannot call complete after rcutree_report_cpu_dead() so we delegate it
 	 * to an online cpu.
 	 */
 	smp_call_function_single(cpumask_first(cpu_online_mask),
@@ -1617,7 +1617,7 @@ void notify_cpu_starting(unsigned int cpu)
 	struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu);
 	enum cpuhp_state target = min((int)st->target, CPUHP_AP_ONLINE);
 
-	rcu_cpu_starting(cpu);	/* Enables RCU usage on this CPU. */
+	rcutree_report_cpu_starting(cpu);	/* Enables RCU usage on this CPU. */
 	cpumask_set_cpu(cpu, &cpus_booted_once_mask);
 
 	/*
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index f9c6b2680cbb..36d8818eaec1 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -4216,7 +4216,7 @@ bool rcu_lockdep_current_cpu_online(void)
 	rdp = this_cpu_ptr(&rcu_data);
 	/*
 	 * Strictly, we care here about the case where the current CPU is
-	 * in rcu_cpu_starting() and thus has an excuse for rdp->grpmask
+	 * in rcutree_report_cpu_starting() and thus has an excuse for rdp->grpmask
 	 * not being up to date. So arch_spin_is_locked() might have a
 	 * false positive if it's held by some *other* CPU, but that's
 	 * OK because that just means a false *negative* on the warning.
@@ -4445,8 +4445,10 @@ int rcutree_online_cpu(unsigned int cpu)
  * from the incoming CPU rather than from the cpuhp_step mechanism.
  * This is because this function must be invoked at a precise location.
  * This incoming CPU must not have enabled interrupts yet.
+ *
+ * This mirrors the effects of rcutree_report_cpu_dead().
  */
-void rcu_cpu_starting(unsigned int cpu)
+void rcutree_report_cpu_starting(unsigned int cpu)
 {
 	unsigned long mask;
 	struct rcu_data *rdp;
@@ -4500,8 +4502,10 @@ void rcu_cpu_starting(unsigned int cpu)
  * Note that this function is special in that it is invoked directly
  * from the outgoing CPU rather than from the cpuhp_step mechanism.
  * This is because this function must be invoked at a precise location.
+ *
+ * This mirrors the effect of rcutree_report_cpu_starting().
  */
-void rcu_report_dead(void)
+void rcutree_report_cpu_dead(void)
 {
 	unsigned long flags;
 	unsigned long mask;
@@ -5072,7 +5076,7 @@ void __init rcu_init(void)
 	pm_notifier(rcu_pm_notify, 0);
 	WARN_ON(num_online_cpus() > 1); // Only one CPU this early in boot.
 	rcutree_prepare_cpu(cpu);
-	rcu_cpu_starting(cpu);
+	rcutree_report_cpu_starting(cpu);
 	rcutree_online_cpu(cpu);
 
 	/* Create workqueue for Tree SRCU and for expedited GPs. */
-- 
cgit v1.2.3


From 5790b1fb3d672d9a1fe3881a7181dfdbe741568f Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (Google)" <rostedt@goodmis.org>
Date: Wed, 4 Oct 2023 16:50:07 -0400
Subject: eventfs: Remove eventfs_file and just use eventfs_inode

Instead of having a descriptor for every file represented in the eventfs
directory, only have the directory itself represented. Change the API to
send in a list of entries that represent all the files in the directory
(but not other directories). The entry list contains a name and a callback
function that will be used to create the files when they are accessed.

struct eventfs_inode *eventfs_create_events_dir(const char *name, struct dentry *parent,
						const struct eventfs_entry *entries,
						int size, void *data);

is used for the top level eventfs directory, and returns an eventfs_inode
that will be used by:

struct eventfs_inode *eventfs_create_dir(const char *name, struct eventfs_inode *parent,
					 const struct eventfs_entry *entries,
					 int size, void *data);

where both of the above take an array of struct eventfs_entry entries for
every file that is in the directory.

The entries are defined by:

typedef int (*eventfs_callback)(const char *name, umode_t *mode, void **data,
				const struct file_operations **fops);

struct eventfs_entry {
	const char			*name;
	eventfs_callback		callback;
};

Where the name is the name of the file and the callback gets called when
the file is being created. The callback passes in the name (in case the
same callback is used for multiple files), a pointer to the mode, data and
fops. The data will be pointing to the data that was passed in
eventfs_create_dir() or eventfs_create_events_dir() but may be overridden
to point to something else, as it will be used to point to the
inode->i_private that is created. The information passed back from the
callback is used to create the dentry/inode.

If the callback fills the data and the file should be created, it must
return a positive number. On zero or negative, the file is ignored.

This logic may also be used as a prototype to convert entire pseudo file
systems into just-in-time allocation.

The "show_events_dentry" file has been updated to show the directories,
and any files they have.

With just the eventfs_file allocations:

 Before after deltas for meminfo (in kB):

   MemFree:		-14360
   MemAvailable:	-14260
   Buffers:		40
   Cached:		24
   Active:		44
   Inactive:		48
   Inactive(anon):	28
   Active(file):	44
   Inactive(file):	20
   Dirty:		-4
   AnonPages:		28
   Mapped:		4
   KReclaimable:	132
   Slab:		1604
   SReclaimable:	132
   SUnreclaim:		1472
   Committed_AS:	12

 Before after deltas for slabinfo:

   <slab>:		<objects>	[ * <size> = <total>]

   ext4_inode_cache	27		[* 1184 = 31968 ]
   extent_status	102		[*   40 = 4080 ]
   tracefs_inode_cache	144		[*  656 = 94464 ]
   buffer_head		39		[*  104 = 4056 ]
   shmem_inode_cache	49		[*  800 = 39200 ]
   filp			-53		[*  256 = -13568 ]
   dentry		251		[*  192 = 48192 ]
   lsm_file_cache	277		[*   32 = 8864 ]
   vm_area_struct	-14		[*  184 = -2576 ]
   trace_event_file	1748		[*   88 = 153824 ]
   kmalloc-1k		35		[* 1024 = 35840 ]
   kmalloc-256		49		[*  256 = 12544 ]
   kmalloc-192		-28		[*  192 = -5376 ]
   kmalloc-128		-30		[*  128 = -3840 ]
   kmalloc-96		10581		[*   96 = 1015776 ]
   kmalloc-64		3056		[*   64 = 195584 ]
   kmalloc-32		1291		[*   32 = 41312 ]
   kmalloc-16		2310		[*   16 = 36960 ]
   kmalloc-8		9216		[*    8 = 73728 ]

 Free memory dropped by 14,360 kB
 Available memory dropped by 14,260 kB
 Total slab additions in size: 1,771,032 bytes

With this change:

 Before after deltas for meminfo (in kB):

   MemFree:		-12084
   MemAvailable:	-11976
   Buffers:		32
   Cached:		32
   Active:		72
   Inactive:		168
   Inactive(anon):	176
   Active(file):	72
   Inactive(file):	-8
   Dirty:		24
   AnonPages:		196
   Mapped:		8
   KReclaimable:	148
   Slab:		836
   SReclaimable:	148
   SUnreclaim:		688
   Committed_AS:	324

 Before after deltas for slabinfo:

   <slab>:		<objects>	[ * <size> = <total>]

   tracefs_inode_cache	144		[* 656 = 94464 ]
   shmem_inode_cache	-23		[* 800 = -18400 ]
   filp			-92		[* 256 = -23552 ]
   dentry		179		[* 192 = 34368 ]
   lsm_file_cache	-3		[* 32 = -96 ]
   vm_area_struct	-13		[* 184 = -2392 ]
   trace_event_file	1748		[* 88 = 153824 ]
   kmalloc-1k		-49		[* 1024 = -50176 ]
   kmalloc-256		-27		[* 256 = -6912 ]
   kmalloc-128		1864		[* 128 = 238592 ]
   kmalloc-64		4685		[* 64 = 299840 ]
   kmalloc-32		-72		[* 32 = -2304 ]
   kmalloc-16		256		[* 16 = 4096 ]
   total = 721352

 Free memory dropped by 12,084 kB
 Available memory dropped by 11,976 kB
 Total slab additions in size:  721,352 bytes

That's over 2 MB in savings per instance for free and available memory,
and over 1 MB in savings per instance of slab memory.

Link: https://lore.kernel.org/linux-trace-kernel/20231003184059.4924468e@gandalf.local.home
Link: https://lore.kernel.org/linux-trace-kernel/20231004165007.43d79161@gandalf.local.home

Cc: Masami Hiramatsu <mhiramat@kernel.org>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Ajay Kaher <akaher@vmware.com>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
 fs/tracefs/event_inode.c     | 847 ++++++++++++++++++++++---------------------
 fs/tracefs/inode.c           |   2 +-
 fs/tracefs/internal.h        |  37 +-
 include/linux/trace_events.h |   2 +-
 include/linux/tracefs.h      |  29 +-
 kernel/trace/trace.c         |   7 +-
 kernel/trace/trace.h         |   4 +-
 kernel/trace/trace_events.c  | 313 +++++++++++-----
 8 files changed, 705 insertions(+), 536 deletions(-)

(limited to 'include/linux')

diff --git a/fs/tracefs/event_inode.c b/fs/tracefs/event_inode.c
index 8c8d64e76103..eab18b157ef5 100644
--- a/fs/tracefs/event_inode.c
+++ b/fs/tracefs/event_inode.c
@@ -2,8 +2,9 @@
 /*
  *  event_inode.c - part of tracefs, a pseudo file system for activating tracing
  *
- *  Copyright (C) 2020-23 VMware Inc, author: Steven Rostedt (VMware) <rostedt@goodmis.org>
+ *  Copyright (C) 2020-23 VMware Inc, author: Steven Rostedt <rostedt@goodmis.org>
  *  Copyright (C) 2020-23 VMware Inc, author: Ajay Kaher <akaher@vmware.com>
+ *  Copyright (C) 2023 Google, author: Steven Rostedt <rostedt@goodmis.org>
  *
  *  eventfs is used to dynamically create inodes and dentries based on the
  *  meta data provided by the tracing system.
@@ -23,46 +24,6 @@
 #include <linux/delay.h>
 #include "internal.h"
 
-struct eventfs_inode {
-	struct list_head	e_top_files;
-};
-
-/*
- * struct eventfs_file - hold the properties of the eventfs files and
- *                       directories.
- * @name:	the name of the file or directory to create
- * @d_parent:   holds parent's dentry
- * @dentry:     once accessed holds dentry
- * @list:	file or directory to be added to parent directory
- * @ei:		list of files and directories within directory
- * @fop:	file_operations for file or directory
- * @iop:	inode_operations for file or directory
- * @data:	something that the caller will want to get to later on
- * @mode:	the permission that the file or directory should have
- */
-struct eventfs_file {
-	const char			*name;
-	struct dentry			*d_parent;
-	struct dentry			*dentry;
-	struct list_head		list;
-	struct eventfs_inode		*ei;
-	const struct file_operations	*fop;
-	const struct inode_operations	*iop;
-	/*
-	 * Union - used for deletion
-	 * @del_list:	list of eventfs_file to delete
-	 * @rcu:	eventfs_file to delete in RCU
-	 * @is_freed:	node is freed if one of the above is set
-	 */
-	union {
-		struct list_head	del_list;
-		struct rcu_head		rcu;
-		unsigned long		is_freed;
-	};
-	void				*data;
-	umode_t				mode;
-};
-
 static DEFINE_MUTEX(eventfs_mutex);
 DEFINE_STATIC_SRCU(eventfs_srcu);
 
@@ -93,16 +54,9 @@ static const struct file_operations eventfs_file_operations = {
  * @data: something that the caller will want to get to later on.
  * @fop: struct file_operations that should be used for this file.
  *
- * This is the basic "create a file" function for tracefs.  It allows for a
- * wide range of flexibility in creating a file.
- *
- * This function will return a pointer to a dentry if it succeeds.  This
- * pointer must be passed to the tracefs_remove() function when the file is
- * to be removed (no automatic cleanup happens if your module is unloaded,
- * you are responsible here.)  If an error occurs, %NULL will be returned.
- *
- * If tracefs is not enabled in the kernel, the value -%ENODEV will be
- * returned.
+ * This function creates a dentry that represents a file in the eventsfs_inode
+ * directory. The inode.i_private pointer will point to @data in the open()
+ * call.
  */
 static struct dentry *create_file(const char *name, umode_t mode,
 				  struct dentry *parent, void *data,
@@ -118,6 +72,7 @@ static struct dentry *create_file(const char *name, umode_t mode,
 	if (WARN_ON_ONCE(!S_ISREG(mode)))
 		return NULL;
 
+	WARN_ON_ONCE(!parent);
 	dentry = eventfs_start_creating(name, parent);
 
 	if (IS_ERR(dentry))
@@ -142,20 +97,11 @@ static struct dentry *create_file(const char *name, umode_t mode,
  * create_dir - create a dir in the tracefs filesystem
  * @name: the name of the file to create.
  * @parent: parent dentry for this file.
- * @data: something that the caller will want to get to later on.
- *
- * This is the basic "create a dir" function for eventfs.  It allows for a
- * wide range of flexibility in creating a dir.
- *
- * This function will return a pointer to a dentry if it succeeds.  This
- * pointer must be passed to the tracefs_remove() function when the file is
- * to be removed (no automatic cleanup happens if your module is unloaded,
- * you are responsible here.)  If an error occurs, %NULL will be returned.
  *
- * If tracefs is not enabled in the kernel, the value -%ENODEV will be
- * returned.
+ * This function will create a dentry for a directory represented by
+ * a eventfs_inode.
  */
-static struct dentry *create_dir(const char *name, struct dentry *parent, void *data)
+static struct dentry *create_dir(const char *name, struct dentry *parent)
 {
 	struct tracefs_inode *ti;
 	struct dentry *dentry;
@@ -172,7 +118,6 @@ static struct dentry *create_dir(const char *name, struct dentry *parent, void *
 	inode->i_mode = S_IFDIR | S_IRWXU | S_IRUGO | S_IXUGO;
 	inode->i_op = &eventfs_root_dir_inode_operations;
 	inode->i_fop = &eventfs_file_operations;
-	inode->i_private = data;
 
 	ti = get_tracefs(inode);
 	ti->flags |= TRACEFS_EVENT_INODE;
@@ -185,18 +130,18 @@ static struct dentry *create_dir(const char *name, struct dentry *parent, void *
 }
 
 /**
- * eventfs_set_ef_status_free - set the ef->status to free
+ * eventfs_set_ei_status_free - remove the dentry reference from an eventfs_inode
  * @ti: the tracefs_inode of the dentry
- * @dentry: dentry who's status to be freed
+ * @dentry: dentry which has the reference to remove.
  *
- * eventfs_set_ef_status_free will be called if no more
- * references remain
+ * Remove the association between a dentry from an eventfs_inode.
  */
-void eventfs_set_ef_status_free(struct tracefs_inode *ti, struct dentry *dentry)
+void eventfs_set_ei_status_free(struct tracefs_inode *ti, struct dentry *dentry)
 {
 	struct tracefs_inode *ti_parent;
+	struct eventfs_inode *ei_child, *tmp;
 	struct eventfs_inode *ei;
-	struct eventfs_file *ef, *tmp;
+	int i;
 
 	/* The top level events directory may be freed by this */
 	if (unlikely(ti->flags & TRACEFS_EVENT_TOP_INODE)) {
@@ -207,9 +152,9 @@ void eventfs_set_ef_status_free(struct tracefs_inode *ti, struct dentry *dentry)
 		ei = ti->private;
 
 		/* Record all the top level files */
-		list_for_each_entry_srcu(ef, &ei->e_top_files, list,
+		list_for_each_entry_srcu(ei_child, &ei->children, list,
 					 lockdep_is_held(&eventfs_mutex)) {
-			list_add_tail(&ef->del_list, &ef_del_list);
+			list_add_tail(&ei_child->del_list, &ef_del_list);
 		}
 
 		/* Nothing should access this, but just in case! */
@@ -218,11 +163,13 @@ void eventfs_set_ef_status_free(struct tracefs_inode *ti, struct dentry *dentry)
 		mutex_unlock(&eventfs_mutex);
 
 		/* Now safely free the top level files and their children */
-		list_for_each_entry_safe(ef, tmp, &ef_del_list, del_list) {
-			list_del(&ef->del_list);
-			eventfs_remove(ef);
+		list_for_each_entry_safe(ei_child, tmp, &ef_del_list, del_list) {
+			list_del(&ei_child->del_list);
+			eventfs_remove_dir(ei_child);
 		}
 
+		kfree_const(ei->name);
+		kfree(ei->d_children);
 		kfree(ei);
 		return;
 	}
@@ -233,68 +180,162 @@ void eventfs_set_ef_status_free(struct tracefs_inode *ti, struct dentry *dentry)
 	if (!ti_parent || !(ti_parent->flags & TRACEFS_EVENT_INODE))
 		goto out;
 
-	ef = dentry->d_fsdata;
-	if (!ef)
+	ei = dentry->d_fsdata;
+	if (!ei)
 		goto out;
 
 	/*
-	 * If ef was freed, then the LSB bit is set for d_fsdata.
+	 * If ei was freed, then the LSB bit is set for d_fsdata.
 	 * But this should not happen, as it should still have a
 	 * ref count that prevents it. Warn in case it does.
 	 */
-	if (WARN_ON_ONCE((unsigned long)ef & 1))
+	if (WARN_ON_ONCE((unsigned long)ei & 1))
 		goto out;
 
+	/* This could belong to one of the files of the ei */
+	if (ei->dentry != dentry) {
+		for (i = 0; i < ei->nr_entries; i++) {
+			if (ei->d_children[i] == dentry)
+				break;
+		}
+		if (WARN_ON_ONCE(i == ei->nr_entries))
+			goto out;
+		ei->d_children[i] = NULL;
+	} else {
+		ei->dentry = NULL;
+	}
+
 	dentry->d_fsdata = NULL;
-	ef->dentry = NULL;
-out:
+ out:
 	mutex_unlock(&eventfs_mutex);
 }
 
+/**
+ * create_file_dentry - create a dentry for a file of an eventfs_inode
+ * @ei: the eventfs_inode that the file will be created under
+ * @e_dentry: a pointer to the d_children[] of the @ei
+ * @parent: The parent dentry of the created file.
+ * @name: The name of the file to create
+ * @mode: The mode of the file.
+ * @data: The data to use to set the inode of the file with on open()
+ * @fops: The fops of the file to be created.
+ * @lookup: If called by the lookup routine, in which case, dput() the created dentry.
+ *
+ * Create a dentry for a file of an eventfs_inode @ei and place it into the
+ * address located at @e_dentry. If the @e_dentry already has a dentry, then
+ * just do a dget() on it and return. Otherwise create the dentry and attach it.
+ */
+static struct dentry *
+create_file_dentry(struct eventfs_inode *ei, struct dentry **e_dentry,
+		   struct dentry *parent, const char *name, umode_t mode, void *data,
+		   const struct file_operations *fops, bool lookup)
+{
+	struct dentry *dentry;
+	bool invalidate = false;
+
+	mutex_lock(&eventfs_mutex);
+	/* If the e_dentry already has a dentry, use it */
+	if (*e_dentry) {
+		/* lookup does not need to up the ref count */
+		if (!lookup)
+			dget(*e_dentry);
+		mutex_unlock(&eventfs_mutex);
+		return *e_dentry;
+	}
+	mutex_unlock(&eventfs_mutex);
+
+	/* The lookup already has the parent->d_inode locked */
+	if (!lookup)
+		inode_lock(parent->d_inode);
+
+	dentry = create_file(name, mode, parent, data, fops);
+
+	if (!lookup)
+		inode_unlock(parent->d_inode);
+
+	mutex_lock(&eventfs_mutex);
+
+	if (IS_ERR_OR_NULL(dentry)) {
+		/*
+		 * When the mutex was released, something else could have
+		 * created the dentry for this e_dentry. In which case
+		 * use that one.
+		 *
+		 * Note, with the mutex held, the e_dentry cannot have content
+		 * and the ei->is_freed be true at the same time.
+		 */
+		WARN_ON_ONCE(ei->is_freed);
+		dentry = *e_dentry;
+		/* The lookup does not need to up the dentry refcount */
+		if (dentry && !lookup)
+			dget(dentry);
+		mutex_unlock(&eventfs_mutex);
+		return dentry;
+	}
+
+	if (!*e_dentry && !ei->is_freed) {
+		*e_dentry = dentry;
+		dentry->d_fsdata = ei;
+	} else {
+		/*
+		 * Should never happen unless we get here due to being freed.
+		 * Otherwise it means two dentries exist with the same name.
+		 */
+		WARN_ON_ONCE(!ei->is_freed);
+		invalidate = true;
+	}
+	mutex_unlock(&eventfs_mutex);
+
+	if (invalidate)
+		d_invalidate(dentry);
+
+	if (lookup || invalidate)
+		dput(dentry);
+
+	return invalidate ? NULL : dentry;
+}
+
 /**
  * eventfs_post_create_dir - post create dir routine
- * @ef: eventfs_file of recently created dir
+ * @ei: eventfs_inode of recently created dir
  *
  * Map the meta-data of files within an eventfs dir to their parent dentry
  */
-static void eventfs_post_create_dir(struct eventfs_file *ef)
+static void eventfs_post_create_dir(struct eventfs_inode *ei)
 {
-	struct eventfs_file *ef_child;
+	struct eventfs_inode *ei_child;
 	struct tracefs_inode *ti;
 
 	/* srcu lock already held */
 	/* fill parent-child relation */
-	list_for_each_entry_srcu(ef_child, &ef->ei->e_top_files, list,
+	list_for_each_entry_srcu(ei_child, &ei->children, list,
 				 srcu_read_lock_held(&eventfs_srcu)) {
-		ef_child->d_parent = ef->dentry;
+		ei_child->d_parent = ei->dentry;
 	}
 
-	ti = get_tracefs(ef->dentry->d_inode);
-	ti->private = ef->ei;
+	ti = get_tracefs(ei->dentry->d_inode);
+	ti->private = ei;
 }
 
 /**
- * create_dentry - helper function to create dentry
- * @ef: eventfs_file of file or directory to create
- * @parent: parent dentry
- * @lookup: true if called from lookup routine
+ * create_dir_dentry - Create a directory dentry for the eventfs_inode
+ * @ei: The eventfs_inode to create the directory for
+ * @parent: The dentry of the parent of this directory
+ * @lookup: True if this is called by the lookup code
  *
- * Used to create a dentry for file/dir, executes post dentry creation routine
+ * This creates and attaches a directory dentry to the eventfs_inode @ei.
  */
 static struct dentry *
-create_dentry(struct eventfs_file *ef, struct dentry *parent, bool lookup)
+create_dir_dentry(struct eventfs_inode *ei, struct dentry *parent, bool lookup)
 {
 	bool invalidate = false;
-	struct dentry *dentry;
+	struct dentry *dentry = NULL;
 
 	mutex_lock(&eventfs_mutex);
-	if (ef->is_freed) {
-		mutex_unlock(&eventfs_mutex);
-		return NULL;
-	}
-	if (ef->dentry) {
-		dentry = ef->dentry;
-		/* On dir open, up the ref count */
+	if (ei->dentry) {
+		/* If the dentry already has a dentry, use it */
+		dentry = ei->dentry;
+		/* lookup does not need to up the ref count */
 		if (!lookup)
 			dget(dentry);
 		mutex_unlock(&eventfs_mutex);
@@ -302,42 +343,44 @@ create_dentry(struct eventfs_file *ef, struct dentry *parent, bool lookup)
 	}
 	mutex_unlock(&eventfs_mutex);
 
+	/* The lookup already has the parent->d_inode locked */
 	if (!lookup)
 		inode_lock(parent->d_inode);
 
-	if (ef->ei)
-		dentry = create_dir(ef->name, parent, ef->data);
-	else
-		dentry = create_file(ef->name, ef->mode, parent,
-				     ef->data, ef->fop);
+	dentry = create_dir(ei->name, parent);
 
 	if (!lookup)
 		inode_unlock(parent->d_inode);
 
 	mutex_lock(&eventfs_mutex);
-	if (IS_ERR_OR_NULL(dentry)) {
-		/* If the ef was already updated get it */
-		dentry = ef->dentry;
+
+	if (IS_ERR_OR_NULL(dentry) && !ei->is_freed) {
+		/*
+		 * When the mutex was released, something else could have
+		 * created the dentry for this e_dentry. In which case
+		 * use that one.
+		 *
+		 * Note, with the mutex held, the e_dentry cannot have content
+		 * and the ei->is_freed be true at the same time.
+		 */
+		dentry = ei->dentry;
 		if (dentry && !lookup)
 			dget(dentry);
 		mutex_unlock(&eventfs_mutex);
 		return dentry;
 	}
 
-	if (!ef->dentry && !ef->is_freed) {
-		ef->dentry = dentry;
-		if (ef->ei)
-			eventfs_post_create_dir(ef);
-		dentry->d_fsdata = ef;
+	if (!ei->dentry && !ei->is_freed) {
+		ei->dentry = dentry;
+		eventfs_post_create_dir(ei);
+		dentry->d_fsdata = ei;
 	} else {
-		/* A race here, should try again (unless freed) */
-		invalidate = true;
-
 		/*
 		 * Should never happen unless we get here due to being freed.
 		 * Otherwise it means two dentries exist with the same name.
 		 */
-		WARN_ON_ONCE(!ef->is_freed);
+		WARN_ON_ONCE(!ei->is_freed);
+		invalidate = true;
 	}
 	mutex_unlock(&eventfs_mutex);
 	if (invalidate)
@@ -349,50 +392,85 @@ create_dentry(struct eventfs_file *ef, struct dentry *parent, bool lookup)
 	return invalidate ? NULL : dentry;
 }
 
-static bool match_event_file(struct eventfs_file *ef, const char *name)
-{
-	bool ret;
-
-	mutex_lock(&eventfs_mutex);
-	ret = !ef->is_freed && strcmp(ef->name, name) == 0;
-	mutex_unlock(&eventfs_mutex);
-
-	return ret;
-}
-
 /**
  * eventfs_root_lookup - lookup routine to create file/dir
  * @dir: in which a lookup is being done
  * @dentry: file/dir dentry
- * @flags: to pass as flags parameter to simple lookup
+ * @flags: Just passed to simple_lookup()
  *
- * Used to create a dynamic file/dir within @dir. Use the eventfs_inode
- * list of meta data to find the information needed to create the file/dir.
+ * Used to create dynamic file/dir with-in @dir, search with-in @ei
+ * list, if @dentry found go ahead and create the file/dir
  */
+
 static struct dentry *eventfs_root_lookup(struct inode *dir,
 					  struct dentry *dentry,
 					  unsigned int flags)
 {
+	const struct file_operations *fops;
+	const struct eventfs_entry *entry;
+	struct eventfs_inode *ei_child;
 	struct tracefs_inode *ti;
 	struct eventfs_inode *ei;
-	struct eventfs_file *ef;
+	struct dentry *ei_dentry = NULL;
 	struct dentry *ret = NULL;
+	const char *name = dentry->d_name.name;
+	bool created = false;
+	umode_t mode;
+	void *data;
 	int idx;
+	int i;
+	int r;
 
 	ti = get_tracefs(dir);
 	if (!(ti->flags & TRACEFS_EVENT_INODE))
 		return NULL;
 
-	ei = ti->private;
+	/* Grab srcu to prevent the ei from going away */
 	idx = srcu_read_lock(&eventfs_srcu);
-	list_for_each_entry_srcu(ef, &ei->e_top_files, list,
+
+	/*
+	 * Grab the eventfs_mutex to consistent value from ti->private.
+	 * This s
+	 */
+	mutex_lock(&eventfs_mutex);
+	ei = READ_ONCE(ti->private);
+	if (ei)
+		ei_dentry = READ_ONCE(ei->dentry);
+	mutex_unlock(&eventfs_mutex);
+
+	if (!ei || !ei_dentry)
+		goto out;
+
+	data = ei->data;
+
+	list_for_each_entry_srcu(ei_child, &ei->children, list,
 				 srcu_read_lock_held(&eventfs_srcu)) {
-		if (!match_event_file(ef, dentry->d_name.name))
+		if (strcmp(ei_child->name, name) != 0)
 			continue;
 		ret = simple_lookup(dir, dentry, flags);
-		create_dentry(ef, ef->d_parent, true);
+		create_dir_dentry(ei_child, ei_dentry, true);
+		created = true;
 		break;
 	}
+
+	if (created)
+		goto out;
+
+	for (i = 0; i < ei->nr_entries; i++) {
+		entry = &ei->entries[i];
+		if (strcmp(name, entry->name) == 0) {
+			void *cdata = data;
+			r = entry->callback(name, &mode, &cdata, &fops);
+			if (r <= 0)
+				continue;
+			ret = simple_lookup(dir, dentry, flags);
+			create_file_dentry(ei, &ei->d_children[i],
+					   ei_dentry, name, mode, cdata,
+					   fops, true);
+			break;
+		}
+	}
+ out:
 	srcu_read_unlock(&eventfs_srcu, idx);
 	return ret;
 }
@@ -432,29 +510,48 @@ static int eventfs_release(struct inode *inode, struct file *file)
 	return dcache_dir_close(inode, file);
 }
 
+static int add_dentries(struct dentry ***dentries, struct dentry *d, int cnt)
+{
+	struct dentry **tmp;
+
+	tmp = krealloc(*dentries, sizeof(d) * (cnt + 2), GFP_KERNEL);
+	if (!tmp)
+		return -1;
+	tmp[cnt] = d;
+	tmp[cnt + 1] = NULL;
+	*dentries = tmp;
+	return 0;
+}
+
 /**
  * dcache_dir_open_wrapper - eventfs open wrapper
  * @inode: not used
- * @file: dir to be opened (to create its child)
+ * @file: dir to be opened (to create it's children)
  *
- * Used to dynamically create the file/dir within @file. @file is really a
- * directory and all the files/dirs of the children within @file will be
- * created. If any of the files/dirs have already been created, their
- * reference count will be incremented.
+ * Used to dynamic create file/dir with-in @file, all the
+ * file/dir will be created. If already created then references
+ * will be increased
  */
 static int dcache_dir_open_wrapper(struct inode *inode, struct file *file)
 {
+	const struct file_operations *fops;
+	const struct eventfs_entry *entry;
+	struct eventfs_inode *ei_child;
 	struct tracefs_inode *ti;
 	struct eventfs_inode *ei;
-	struct eventfs_file *ef;
 	struct dentry_list *dlist;
 	struct dentry **dentries = NULL;
-	struct dentry *dentry = file_dentry(file);
+	struct dentry *parent = file_dentry(file);
 	struct dentry *d;
 	struct inode *f_inode = file_inode(file);
+	const char *name = parent->d_name.name;
+	umode_t mode;
+	void *data;
 	int cnt = 0;
 	int idx;
 	int ret;
+	int i;
+	int r;
 
 	ti = get_tracefs(f_inode);
 	if (!(ti->flags & TRACEFS_EVENT_INODE))
@@ -463,25 +560,51 @@ static int dcache_dir_open_wrapper(struct inode *inode, struct file *file)
 	if (WARN_ON_ONCE(file->private_data))
 		return -EINVAL;
 
+	idx = srcu_read_lock(&eventfs_srcu);
+
+	mutex_lock(&eventfs_mutex);
+	ei = READ_ONCE(ti->private);
+	mutex_unlock(&eventfs_mutex);
+
+	if (!ei) {
+		srcu_read_unlock(&eventfs_srcu, idx);
+		return -EINVAL;
+	}
+
+
+	data = ei->data;
+
 	dlist = kmalloc(sizeof(*dlist), GFP_KERNEL);
-	if (!dlist)
+	if (!dlist) {
+		srcu_read_unlock(&eventfs_srcu, idx);
 		return -ENOMEM;
+	}
 
-	ei = ti->private;
-	idx = srcu_read_lock(&eventfs_srcu);
-	list_for_each_entry_srcu(ef, &ei->e_top_files, list,
+	list_for_each_entry_srcu(ei_child, &ei->children, list,
 				 srcu_read_lock_held(&eventfs_srcu)) {
-		d = create_dentry(ef, dentry, false);
+		d = create_dir_dentry(ei_child, parent, false);
 		if (d) {
-			struct dentry **tmp;
+			ret = add_dentries(&dentries, d, cnt);
+			if (ret < 0)
+				break;
+			cnt++;
+		}
+	}
 
-			tmp = krealloc(dentries, sizeof(d) * (cnt + 2), GFP_KERNEL);
-			if (!tmp)
+	for (i = 0; i < ei->nr_entries; i++) {
+		void *cdata = data;
+		entry = &ei->entries[i];
+		name = entry->name;
+		r = entry->callback(name, &mode, &cdata, &fops);
+		if (r <= 0)
+			continue;
+		d = create_file_dentry(ei, &ei->d_children[i],
+				       parent, name, mode, cdata, fops, false);
+		if (d) {
+			ret = add_dentries(&dentries, d, cnt);
+			if (ret < 0)
 				break;
-			tmp[cnt] = d;
-			tmp[cnt + 1] = NULL;
 			cnt++;
-			dentries = tmp;
 		}
 	}
 	srcu_read_unlock(&eventfs_srcu, idx);
@@ -514,63 +637,90 @@ static int dcache_readdir_wrapper(struct file *file, struct dir_context *ctx)
 }
 
 /**
- * eventfs_prepare_ef - helper function to prepare eventfs_file
- * @name: the name of the file/directory to create.
- * @mode: the permission that the file should have.
- * @fop: struct file_operations that should be used for this file/directory.
- * @iop: struct inode_operations that should be used for this file/directory.
- * @data: something that the caller will want to get to later on. The
- *        inode.i_private pointer will point to this value on the open() call.
+ * eventfs_create_dir - Create the eventfs_inode for this directory
+ * @name: The name of the directory to create.
+ * @parent: The eventfs_inode of the parent directory.
+ * @entries: A list of entries that represent the files under this directory
+ * @size: The number of @entries
+ * @data: The default data to pass to the files (an entry may override it).
+ *
+ * This function creates the descriptor to represent a directory in the
+ * eventfs. This descriptor is an eventfs_inode, and it is returned to be
+ * used to create other children underneath.
+ *
+ * The @entries is an array of eventfs_entry structures which has:
+ *	const char		 *name
+ *	eventfs_callback	callback;
+ *
+ * The name is the name of the file, and the callback is a pointer to a function
+ * that will be called when the file is reference (either by lookup or by
+ * reading a directory). The callback is of the prototype:
  *
- * This function allocates and fills the eventfs_file structure.
+ *    int callback(const char *name, umode_t *mode, void **data,
+ *		   const struct file_operations **fops);
+ *
+ * When a file needs to be created, this callback will be called with
+ *   name = the name of the file being created (so that the same callback
+ *          may be used for multiple files).
+ *   mode = a place to set the file's mode
+ *   data = A pointer to @data, and the callback may replace it, which will
+ *         cause the file created to pass the new data to the open() call.
+ *   fops = the fops to use for the created file.
  */
-static struct eventfs_file *eventfs_prepare_ef(const char *name, umode_t mode,
-					const struct file_operations *fop,
-					const struct inode_operations *iop,
-					void *data)
+struct eventfs_inode *eventfs_create_dir(const char *name, struct eventfs_inode *parent,
+					 const struct eventfs_entry *entries,
+					 int size, void *data)
 {
-	struct eventfs_file *ef;
+	struct eventfs_inode *ei;
 
-	ef = kzalloc(sizeof(*ef), GFP_KERNEL);
-	if (!ef)
+	if (!parent)
+		return ERR_PTR(-EINVAL);
+
+	ei = kzalloc(sizeof(*ei), GFP_KERNEL);
+	if (!ei)
 		return ERR_PTR(-ENOMEM);
 
-	ef->name = kstrdup(name, GFP_KERNEL);
-	if (!ef->name) {
-		kfree(ef);
+	ei->name = kstrdup_const(name, GFP_KERNEL);
+	if (!ei->name) {
+		kfree(ei);
 		return ERR_PTR(-ENOMEM);
 	}
 
-	if (S_ISDIR(mode)) {
-		ef->ei = kzalloc(sizeof(*ef->ei), GFP_KERNEL);
-		if (!ef->ei) {
-			kfree(ef->name);
-			kfree(ef);
+	if (size) {
+		ei->d_children = kzalloc(sizeof(*ei->d_children) * size, GFP_KERNEL);
+		if (!ei->d_children) {
+			kfree_const(ei->name);
+			kfree(ei);
 			return ERR_PTR(-ENOMEM);
 		}
-		INIT_LIST_HEAD(&ef->ei->e_top_files);
-	} else {
-		ef->ei = NULL;
 	}
 
-	ef->iop = iop;
-	ef->fop = fop;
-	ef->mode = mode;
-	ef->data = data;
-	return ef;
+	ei->entries = entries;
+	ei->nr_entries = size;
+	ei->data = data;
+	INIT_LIST_HEAD(&ei->children);
+
+	mutex_lock(&eventfs_mutex);
+	list_add_tail(&ei->list, &parent->children);
+	ei->d_parent = parent->dentry;
+	mutex_unlock(&eventfs_mutex);
+
+	return ei;
 }
 
 /**
- * eventfs_create_events_dir - create the trace event structure
- * @name: the name of the directory to create.
- * @parent: parent dentry for this file.  This should be a directory dentry
- *          if set.  If this parameter is NULL, then the directory will be
- *          created in the root of the tracefs filesystem.
+ * eventfs_create_events_dir - create the top level events directory
+ * @name: The name of the top level directory to create.
+ * @parent: Parent dentry for this file in the tracefs directory.
+ * @entries: A list of entries that represent the files under this directory
+ * @size: The number of @entries
+ * @data: The default data to pass to the files (an entry may override it).
  *
  * This function creates the top of the trace event directory.
  */
-struct dentry *eventfs_create_events_dir(const char *name,
-					 struct dentry *parent)
+struct eventfs_inode *eventfs_create_events_dir(const char *name, struct dentry *parent,
+						const struct eventfs_entry *entries,
+						int size, void *data)
 {
 	struct dentry *dentry = tracefs_start_creating(name, parent);
 	struct eventfs_inode *ei;
@@ -581,19 +731,32 @@ struct dentry *eventfs_create_events_dir(const char *name,
 		return NULL;
 
 	if (IS_ERR(dentry))
-		return dentry;
+		return (struct eventfs_inode *)dentry;
 
 	ei = kzalloc(sizeof(*ei), GFP_KERNEL);
 	if (!ei)
-		return ERR_PTR(-ENOMEM);
+		goto fail;
+
 	inode = tracefs_get_inode(dentry->d_sb);
-	if (unlikely(!inode)) {
-		kfree(ei);
-		tracefs_failed_creating(dentry);
-		return ERR_PTR(-ENOMEM);
+	if (unlikely(!inode))
+		goto fail;
+
+	if (size) {
+		ei->d_children = kzalloc(sizeof(*ei->d_children) * size, GFP_KERNEL);
+		if (!ei->d_children)
+			goto fail;
 	}
 
-	INIT_LIST_HEAD(&ei->e_top_files);
+	ei->dentry = dentry;
+	ei->entries = entries;
+	ei->nr_entries = size;
+	ei->data = data;
+	ei->name = kstrdup_const(name, GFP_KERNEL);
+	if (!ei->name)
+		goto fail;
+
+	INIT_LIST_HEAD(&ei->children);
+	INIT_LIST_HEAD(&ei->list);
 
 	ti = get_tracefs(inode);
 	ti->flags |= TRACEFS_EVENT_INODE | TRACEFS_EVENT_TOP_INODE;
@@ -608,193 +771,41 @@ struct dentry *eventfs_create_events_dir(const char *name,
 	d_instantiate(dentry, inode);
 	inc_nlink(dentry->d_parent->d_inode);
 	fsnotify_mkdir(dentry->d_parent->d_inode, dentry);
-	return tracefs_end_creating(dentry);
-}
+	tracefs_end_creating(dentry);
 
-/**
- * eventfs_add_subsystem_dir - add eventfs subsystem_dir to list to create later
- * @name: the name of the file to create.
- * @parent: parent dentry for this dir.
- *
- * This function adds eventfs subsystem dir to list.
- * And all these dirs are created on the fly when they are looked up,
- * and the dentry and inodes will be removed when they are done.
- */
-struct eventfs_file *eventfs_add_subsystem_dir(const char *name,
-					       struct dentry *parent)
-{
-	struct tracefs_inode *ti_parent;
-	struct eventfs_inode *ei_parent;
-	struct eventfs_file *ef;
+	/* Will call dput when the directory is removed */
+	dget(dentry);
 
-	if (security_locked_down(LOCKDOWN_TRACEFS))
-		return NULL;
-
-	if (!parent)
-		return ERR_PTR(-EINVAL);
-
-	ti_parent = get_tracefs(parent->d_inode);
-	ei_parent = ti_parent->private;
+	return ei;
 
-	ef = eventfs_prepare_ef(name, S_IFDIR, NULL, NULL, NULL);
-	if (IS_ERR(ef))
-		return ef;
-
-	mutex_lock(&eventfs_mutex);
-	list_add_tail(&ef->list, &ei_parent->e_top_files);
-	ef->d_parent = parent;
-	mutex_unlock(&eventfs_mutex);
-	return ef;
+ fail:
+	kfree(ei->d_children);
+	kfree(ei);
+	tracefs_failed_creating(dentry);
+	return ERR_PTR(-ENOMEM);
 }
 
-/**
- * eventfs_add_dir - add eventfs dir to list to create later
- * @name: the name of the file to create.
- * @ef_parent: parent eventfs_file for this dir.
- *
- * This function adds eventfs dir to list.
- * And all these dirs are created on the fly when they are looked up,
- * and the dentry and inodes will be removed when they are done.
- */
-struct eventfs_file *eventfs_add_dir(const char *name,
-				     struct eventfs_file *ef_parent)
+static void free_ei(struct rcu_head *head)
 {
-	struct eventfs_file *ef;
-
-	if (security_locked_down(LOCKDOWN_TRACEFS))
-		return NULL;
+	struct eventfs_inode *ei = container_of(head, struct eventfs_inode, rcu);
 
-	if (!ef_parent)
-		return ERR_PTR(-EINVAL);
-
-	ef = eventfs_prepare_ef(name, S_IFDIR, NULL, NULL, NULL);
-	if (IS_ERR(ef))
-		return ef;
-
-	mutex_lock(&eventfs_mutex);
-	list_add_tail(&ef->list, &ef_parent->ei->e_top_files);
-	ef->d_parent = ef_parent->dentry;
-	mutex_unlock(&eventfs_mutex);
-	return ef;
-}
-
-/**
- * eventfs_add_events_file - add the data needed to create a file for later reference
- * @name: the name of the file to create.
- * @mode: the permission that the file should have.
- * @parent: parent dentry for this file.
- * @data: something that the caller will want to get to later on.
- * @fop: struct file_operations that should be used for this file.
- *
- * This function is used to add the information needed to create a
- * dentry/inode within the top level events directory. The file created
- * will have the @mode permissions. The @data will be used to fill the
- * inode.i_private when the open() call is done. The dentry and inodes are
- * all created when they are referenced, and removed when they are no
- * longer referenced.
- */
-int eventfs_add_events_file(const char *name, umode_t mode,
-			 struct dentry *parent, void *data,
-			 const struct file_operations *fop)
-{
-	struct tracefs_inode *ti;
-	struct eventfs_inode *ei;
-	struct eventfs_file *ef;
-
-	if (security_locked_down(LOCKDOWN_TRACEFS))
-		return -ENODEV;
-
-	if (!parent)
-		return -EINVAL;
-
-	if (!(mode & S_IFMT))
-		mode |= S_IFREG;
-
-	if (!parent->d_inode)
-		return -EINVAL;
-
-	ti = get_tracefs(parent->d_inode);
-	if (!(ti->flags & TRACEFS_EVENT_INODE))
-		return -EINVAL;
-
-	ei = ti->private;
-	ef = eventfs_prepare_ef(name, mode, fop, NULL, data);
-
-	if (IS_ERR(ef))
-		return -ENOMEM;
-
-	mutex_lock(&eventfs_mutex);
-	list_add_tail(&ef->list, &ei->e_top_files);
-	ef->d_parent = parent;
-	mutex_unlock(&eventfs_mutex);
-	return 0;
-}
-
-/**
- * eventfs_add_file - add eventfs file to list to create later
- * @name: the name of the file to create.
- * @mode: the permission that the file should have.
- * @ef_parent: parent eventfs_file for this file.
- * @data: something that the caller will want to get to later on.
- * @fop: struct file_operations that should be used for this file.
- *
- * This function is used to add the information needed to create a
- * file within a subdirectory of the events directory. The file created
- * will have the @mode permissions. The @data will be used to fill the
- * inode.i_private when the open() call is done. The dentry and inodes are
- * all created when they are referenced, and removed when they are no
- * longer referenced.
- */
-int eventfs_add_file(const char *name, umode_t mode,
-		     struct eventfs_file *ef_parent,
-		     void *data,
-		     const struct file_operations *fop)
-{
-	struct eventfs_file *ef;
-
-	if (security_locked_down(LOCKDOWN_TRACEFS))
-		return -ENODEV;
-
-	if (!ef_parent)
-		return -EINVAL;
-
-	if (!(mode & S_IFMT))
-		mode |= S_IFREG;
-
-	ef = eventfs_prepare_ef(name, mode, fop, NULL, data);
-	if (IS_ERR(ef))
-		return -ENOMEM;
-
-	mutex_lock(&eventfs_mutex);
-	list_add_tail(&ef->list, &ef_parent->ei->e_top_files);
-	ef->d_parent = ef_parent->dentry;
-	mutex_unlock(&eventfs_mutex);
-	return 0;
-}
-
-static void free_ef(struct rcu_head *head)
-{
-	struct eventfs_file *ef = container_of(head, struct eventfs_file, rcu);
-
-	kfree(ef->name);
-	kfree(ef->ei);
-	kfree(ef);
+	kfree_const(ei->name);
+	kfree(ei->d_children);
+	kfree(ei);
 }
 
 /**
  * eventfs_remove_rec - remove eventfs dir or file from list
- * @ef: eventfs_file to be removed.
- * @head: to create list of eventfs_file to be deleted
- * @level: to check recursion depth
+ * @ei: eventfs_inode to be removed.
  *
- * The helper function eventfs_remove_rec() is used to clean up and free the
- * associated data from eventfs for both of the added functions.
+ * This function recursively remove eventfs_inode which
+ * contains info of file or dir.
  */
-static void eventfs_remove_rec(struct eventfs_file *ef, struct list_head *head, int level)
+static void eventfs_remove_rec(struct eventfs_inode *ei, struct list_head *head, int level)
 {
-	struct eventfs_file *ef_child;
+	struct eventfs_inode *ei_child;
 
-	if (!ef)
+	if (!ei)
 		return;
 	/*
 	 * Check recursion depth. It should never be greater than 3:
@@ -806,62 +817,68 @@ static void eventfs_remove_rec(struct eventfs_file *ef, struct list_head *head,
 	if (WARN_ON_ONCE(level > 3))
 		return;
 
-	if (ef->ei) {
-		/* search for nested folders or files */
-		list_for_each_entry_srcu(ef_child, &ef->ei->e_top_files, list,
-					 lockdep_is_held(&eventfs_mutex)) {
-			eventfs_remove_rec(ef_child, head, level + 1);
-		}
+	/* search for nested folders or files */
+	list_for_each_entry_srcu(ei_child, &ei->children, list,
+				 lockdep_is_held(&eventfs_mutex)) {
+		eventfs_remove_rec(ei_child, head, level + 1);
 	}
 
-	list_del_rcu(&ef->list);
-	list_add_tail(&ef->del_list, head);
+	list_del_rcu(&ei->list);
+	list_add_tail(&ei->del_list, head);
 }
 
+static void unhook_dentry(struct dentry **dentry, struct dentry **list)
+{
+	if (*dentry) {
+		unsigned long ptr = (unsigned long)*list;
+
+		/* Keep the dentry from being freed yet */
+		dget(*dentry);
+
+		/*
+		 * Paranoid: The dget() above should prevent the dentry
+		 * from being freed and calling eventfs_set_ei_status_free().
+		 * But just in case, set the link list LSB pointer to 1
+		 * and have eventfs_set_ei_status_free() check that to
+		 * make sure that if it does happen, it will not think
+		 * the d_fsdata is an eventfs_inode.
+		 *
+		 * For this to work, no eventfs_inode should be allocated
+		 * on a odd space, as the ef should always be allocated
+		 * to be at least word aligned. Check for that too.
+		 */
+		WARN_ON_ONCE(ptr & 1);
+
+		(*dentry)->d_fsdata = (void *)(ptr | 1);
+		*list = *dentry;
+		*dentry = NULL;
+	}
+}
 /**
  * eventfs_remove - remove eventfs dir or file from list
- * @ef: eventfs_file to be removed.
+ * @ei: eventfs_inode to be removed.
  *
  * This function acquire the eventfs_mutex lock and call eventfs_remove_rec()
  */
-void eventfs_remove(struct eventfs_file *ef)
+void eventfs_remove_dir(struct eventfs_inode *ei)
 {
-	struct eventfs_file *tmp;
-	LIST_HEAD(ef_del_list);
+	struct eventfs_inode *tmp;
+	LIST_HEAD(ei_del_list);
 	struct dentry *dentry_list = NULL;
 	struct dentry *dentry;
+	int i;
 
-	if (!ef)
+	if (!ei)
 		return;
 
 	mutex_lock(&eventfs_mutex);
-	eventfs_remove_rec(ef, &ef_del_list, 0);
-	list_for_each_entry_safe(ef, tmp, &ef_del_list, del_list) {
-		if (ef->dentry) {
-			unsigned long ptr = (unsigned long)dentry_list;
-
-			/* Keep the dentry from being freed yet */
-			dget(ef->dentry);
-
-			/*
-			 * Paranoid: The dget() above should prevent the dentry
-			 * from being freed and calling eventfs_set_ef_status_free().
-			 * But just in case, set the link list LSB pointer to 1
-			 * and have eventfs_set_ef_status_free() check that to
-			 * make sure that if it does happen, it will not think
-			 * the d_fsdata is an event_file.
-			 *
-			 * For this to work, no event_file should be allocated
-			 * on a odd space, as the ef should always be allocated
-			 * to be at least word aligned. Check for that too.
-			 */
-			WARN_ON_ONCE(ptr & 1);
-
-			ef->dentry->d_fsdata = (void *)(ptr | 1);
-			dentry_list = ef->dentry;
-			ef->dentry = NULL;
-		}
-		call_srcu(&eventfs_srcu, &ef->rcu, free_ef);
+	eventfs_remove_rec(ei, &ei_del_list, 0);
+
+	list_for_each_entry_safe(ei, tmp, &ei_del_list, del_list) {
+		for (i = 0; i < ei->nr_entries; i++)
+			unhook_dentry(&ei->d_children[i], &dentry_list);
+		unhook_dentry(&ei->dentry, &dentry_list);
+		call_srcu(&eventfs_srcu, &ei->rcu, free_ei);
 	}
 	mutex_unlock(&eventfs_mutex);
 
@@ -876,8 +893,8 @@ void eventfs_remove(struct eventfs_file *ef)
 		mutex_lock(&eventfs_mutex);
 		/* dentry should now have at least a single reference */
 		WARN_ONCE((int)d_count(dentry) < 1,
-			  "dentry %p less than one reference (%d) after invalidate\n",
-			  dentry, d_count(dentry));
+			  "dentry %px (%s) less than one reference (%d) after invalidate\n",
+			  dentry, dentry->d_name.name, d_count(dentry));
 		mutex_unlock(&eventfs_mutex);
 		dput(dentry);
 	}
diff --git a/fs/tracefs/inode.c b/fs/tracefs/inode.c
index 891653ba9cf3..34ffb2f8114e 100644
--- a/fs/tracefs/inode.c
+++ b/fs/tracefs/inode.c
@@ -385,7 +385,7 @@ static void tracefs_dentry_iput(struct dentry *dentry, struct inode *inode)
 
 	ti = get_tracefs(inode);
 	if (ti && ti->flags & TRACEFS_EVENT_INODE)
-		eventfs_set_ef_status_free(ti, dentry);
+		eventfs_set_ei_status_free(ti, dentry);
 	iput(inode);
 }
 
diff --git a/fs/tracefs/internal.h b/fs/tracefs/internal.h
index 4f2e49e2197b..298d3ecaf621 100644
--- a/fs/tracefs/internal.h
+++ b/fs/tracefs/internal.h
@@ -13,6 +13,41 @@ struct tracefs_inode {
 	struct inode            vfs_inode;
 };
 
+/*
+ * struct eventfs_inode - hold the properties of the eventfs directories.
+ * @list:	link list into the parent directory
+ * @entries:	the array of entries representing the files in the directory
+ * @name:	the name of the directory to create
+ * @children:	link list into the child eventfs_inode
+ * @dentry:     the dentry of the directory
+ * @d_parent:   pointer to the parent's dentry
+ * @d_children: The array of dentries to represent the files when created
+ * @data:	The private data to pass to the callbacks
+ * @nr_entries: The number of items in @entries
+ */
+struct eventfs_inode {
+	struct list_head		list;
+	const struct eventfs_entry	*entries;
+	const char			*name;
+	struct list_head		children;
+	struct dentry			*dentry;
+	struct dentry			*d_parent;
+	struct dentry			**d_children;
+	void				*data;
+	/*
+	 * Union - used for deletion
+	 * @del_list:	list of eventfs_inode to delete
+	 * @rcu:	eventfs_indoe to delete in RCU
+	 * @is_freed:	node is freed if one of the above is set
+	 */
+	union {
+		struct list_head	del_list;
+		struct rcu_head		rcu;
+		unsigned long		is_freed;
+	};
+	int				nr_entries;
+};
+
 static inline struct tracefs_inode *get_tracefs(const struct inode *inode)
 {
 	return container_of(inode, struct tracefs_inode, vfs_inode);
@@ -25,6 +60,6 @@ struct inode *tracefs_get_inode(struct super_block *sb);
 struct dentry *eventfs_start_creating(const char *name, struct dentry *parent);
 struct dentry *eventfs_failed_creating(struct dentry *dentry);
 struct dentry *eventfs_end_creating(struct dentry *dentry);
-void eventfs_set_ef_status_free(struct tracefs_inode *ti, struct dentry *dentry);
+void eventfs_set_ei_status_free(struct tracefs_inode *ti, struct dentry *dentry);
 
 #endif /* _TRACEFS_INTERNAL_H */
diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h
index 21ae37e49319..12207dc6722d 100644
--- a/include/linux/trace_events.h
+++ b/include/linux/trace_events.h
@@ -649,7 +649,7 @@ struct trace_event_file {
 	struct list_head		list;
 	struct trace_event_call		*event_call;
 	struct event_filter __rcu	*filter;
-	struct eventfs_file             *ef;
+	struct eventfs_inode		*ei;
 	struct trace_array		*tr;
 	struct trace_subsystem_dir	*system;
 	struct list_head		triggers;
diff --git a/include/linux/tracefs.h b/include/linux/tracefs.h
index 009072792fa3..0c39704455d9 100644
--- a/include/linux/tracefs.h
+++ b/include/linux/tracefs.h
@@ -23,26 +23,25 @@ struct file_operations;
 
 struct eventfs_file;
 
-struct dentry *eventfs_create_events_dir(const char *name,
-					 struct dentry *parent);
+typedef int (*eventfs_callback)(const char *name, umode_t *mode, void **data,
+				const struct file_operations **fops);
 
-struct eventfs_file *eventfs_add_subsystem_dir(const char *name,
-					       struct dentry *parent);
+struct eventfs_entry {
+	const char			*name;
+	eventfs_callback		callback;
+};
 
-struct eventfs_file *eventfs_add_dir(const char *name,
-				     struct eventfs_file *ef_parent);
+struct eventfs_inode;
 
-int eventfs_add_file(const char *name, umode_t mode,
-		     struct eventfs_file *ef_parent, void *data,
-		     const struct file_operations *fops);
+struct eventfs_inode *eventfs_create_events_dir(const char *name, struct dentry *parent,
+						const struct eventfs_entry *entries,
+						int size, void *data);
 
-int eventfs_add_events_file(const char *name, umode_t mode,
-			 struct dentry *parent, void *data,
-			 const struct file_operations *fops);
+struct eventfs_inode *eventfs_create_dir(const char *name, struct eventfs_inode *parent,
+					 const struct eventfs_entry *entries,
+					 int size, void *data);
 
-void eventfs_remove(struct eventfs_file *ef);
-
-void eventfs_remove_events_dir(struct dentry *dentry);
+void eventfs_remove_dir(struct eventfs_inode *ei);
 
 struct dentry *tracefs_create_file(const char *name, umode_t mode,
 				   struct dentry *parent, void *data,
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index dd6395692ff9..4383be8fa1b0 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -9764,7 +9764,6 @@ static __init void create_trace_instances(struct dentry *d_tracer)
 static void
 init_tracer_tracefs(struct trace_array *tr, struct dentry *d_tracer)
 {
-	struct trace_event_file *file;
 	int cpu;
 
 	trace_create_file("available_tracers", TRACE_MODE_READ, d_tracer,
@@ -9797,11 +9796,7 @@ init_tracer_tracefs(struct trace_array *tr, struct dentry *d_tracer)
 	trace_create_file("trace_marker", 0220, d_tracer,
 			  tr, &tracing_mark_fops);
 
-	file = __find_event_file(tr, "ftrace", "print");
-	if (file && file->ef)
-		eventfs_add_file("trigger", TRACE_MODE_WRITE, file->ef,
-				  file, &event_trigger_fops);
-	tr->trace_marker_file = file;
+	tr->trace_marker_file = __find_event_file(tr, "ftrace", "print");
 
 	trace_create_file("trace_marker_raw", 0220, d_tracer,
 			  tr, &tracing_mark_raw_fops);
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index e92cb9c1292f..0e1405abf4f7 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -381,7 +381,7 @@ struct trace_array {
 	struct dentry		*dir;
 	struct dentry		*options;
 	struct dentry		*percpu_dir;
-	struct dentry		*event_dir;
+	struct eventfs_inode	*event_dir;
 	struct trace_options	*topts;
 	struct list_head	systems;
 	struct list_head	events;
@@ -1349,7 +1349,7 @@ struct trace_subsystem_dir {
 	struct list_head		list;
 	struct event_subsystem		*subsystem;
 	struct trace_array		*tr;
-	struct eventfs_file             *ef;
+	struct eventfs_inode		*ei;
 	int				ref_count;
 	int				nr_events;
 };
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 099b9b6bbdc4..a3b9d9423824 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -984,7 +984,7 @@ static void remove_subsystem(struct trace_subsystem_dir *dir)
 		return;
 
 	if (!--dir->nr_events) {
-		eventfs_remove(dir->ef);
+		eventfs_remove_dir(dir->ei);
 		list_del(&dir->list);
 		__put_system_dir(dir);
 	}
@@ -992,7 +992,7 @@ static void remove_subsystem(struct trace_subsystem_dir *dir)
 
 static void remove_event_file_dir(struct trace_event_file *file)
 {
-	eventfs_remove(file->ef);
+	eventfs_remove_dir(file->ei);
 	list_del(&file->list);
 	remove_subsystem(file->system);
 	free_event_filter(file->filter);
@@ -2282,14 +2282,40 @@ create_new_subsystem(const char *name)
 	return NULL;
 }
 
-static struct eventfs_file *
+int system_callback(const char *name, umode_t *mode, void **data,
+		    const struct file_operations **fops)
+{
+	if (strcmp(name, "filter") == 0)
+		*fops = &ftrace_subsystem_filter_fops;
+
+	else if (strcmp(name, "enable") == 0)
+		*fops = &ftrace_system_enable_fops;
+
+	else
+		return 0;
+
+	*mode = TRACE_MODE_WRITE;
+	return 1;
+}
+
+static struct eventfs_inode *
 event_subsystem_dir(struct trace_array *tr, const char *name,
-		    struct trace_event_file *file, struct dentry *parent)
+		    struct trace_event_file *file, struct eventfs_inode *parent)
 {
 	struct event_subsystem *system, *iter;
 	struct trace_subsystem_dir *dir;
-	struct eventfs_file *ef;
-	int res;
+	struct eventfs_inode *ei;
+	int nr_entries;
+	static struct eventfs_entry system_entries[] = {
+		{
+			.name		= "filter",
+			.callback	= system_callback,
+		},
+		{
+			.name		= "enable",
+			.callback	= system_callback,
+		}
+	};
 
 	/* First see if we did not already create this dir */
 	list_for_each_entry(dir, &tr->systems, list) {
@@ -2297,7 +2323,7 @@ event_subsystem_dir(struct trace_array *tr, const char *name,
 		if (strcmp(system->name, name) == 0) {
 			dir->nr_events++;
 			file->system = dir;
-			return dir->ef;
+			return dir->ei;
 		}
 	}
 
@@ -2321,39 +2347,29 @@ event_subsystem_dir(struct trace_array *tr, const char *name,
 	} else
 		__get_system(system);
 
-	ef = eventfs_add_subsystem_dir(name, parent);
-	if (IS_ERR(ef)) {
+	/* ftrace only has directories no files */
+	if (strcmp(name, "ftrace") == 0)
+		nr_entries = 0;
+	else
+		nr_entries = ARRAY_SIZE(system_entries);
+
+	ei = eventfs_create_dir(name, parent, system_entries, nr_entries, dir);
+	if (!ei) {
 		pr_warn("Failed to create system directory %s\n", name);
 		__put_system(system);
 		goto out_free;
 	}
 
-	dir->ef = ef;
+	dir->ei = ei;
 	dir->tr = tr;
 	dir->ref_count = 1;
 	dir->nr_events = 1;
 	dir->subsystem = system;
 	file->system = dir;
 
-	/* the ftrace system is special, do not create enable or filter files */
-	if (strcmp(name, "ftrace") != 0) {
-
-		res = eventfs_add_file("filter", TRACE_MODE_WRITE,
-					    dir->ef, dir,
-					    &ftrace_subsystem_filter_fops);
-		if (res) {
-			kfree(system->filter);
-			system->filter = NULL;
-			pr_warn("Could not create tracefs '%s/filter' entry\n", name);
-		}
-
-		eventfs_add_file("enable", TRACE_MODE_WRITE, dir->ef, dir,
-				  &ftrace_system_enable_fops);
-	}
-
 	list_add(&dir->list, &tr->systems);
 
-	return dir->ef;
+	return dir->ei;
 
  out_free:
 	kfree(dir);
@@ -2402,15 +2418,134 @@ event_define_fields(struct trace_event_call *call)
 	return ret;
 }
 
+static int event_callback(const char *name, umode_t *mode, void **data,
+			  const struct file_operations **fops)
+{
+	struct trace_event_file *file = *data;
+	struct trace_event_call *call = file->event_call;
+
+	if (strcmp(name, "format") == 0) {
+		*mode = TRACE_MODE_READ;
+		*fops = &ftrace_event_format_fops;
+		*data = call;
+		return 1;
+	}
+
+	/*
+	 * Only event directories that can be enabled should have
+	 * triggers or filters, with the exception of the "print"
+	 * event that can have a "trigger" file.
+	 */
+	if (!(call->flags & TRACE_EVENT_FL_IGNORE_ENABLE)) {
+		if (call->class->reg && strcmp(name, "enable") == 0) {
+			*mode = TRACE_MODE_WRITE;
+			*fops = &ftrace_enable_fops;
+			return 1;
+		}
+
+		if (strcmp(name, "filter") == 0) {
+			*mode = TRACE_MODE_WRITE;
+			*fops = &ftrace_event_filter_fops;
+			return 1;
+		}
+	}
+
+	if (!(call->flags & TRACE_EVENT_FL_IGNORE_ENABLE) ||
+	    strcmp(trace_event_name(call), "print") == 0) {
+		if (strcmp(name, "trigger") == 0) {
+			*mode = TRACE_MODE_WRITE;
+			*fops = &event_trigger_fops;
+			return 1;
+		}
+	}
+
+#ifdef CONFIG_PERF_EVENTS
+	if (call->event.type && call->class->reg &&
+	    strcmp(name, "id") == 0) {
+		*mode = TRACE_MODE_READ;
+		*data = (void *)(long)call->event.type;
+		*fops = &ftrace_event_id_fops;
+		return 1;
+	}
+#endif
+
+#ifdef CONFIG_HIST_TRIGGERS
+	if (strcmp(name, "hist") == 0) {
+		*mode = TRACE_MODE_READ;
+		*fops = &event_hist_fops;
+		return 1;
+	}
+#endif
+#ifdef CONFIG_HIST_TRIGGERS_DEBUG
+	if (strcmp(name, "hist_debug") == 0) {
+		*mode = TRACE_MODE_READ;
+		*fops = &event_hist_debug_fops;
+		return 1;
+	}
+#endif
+#ifdef CONFIG_TRACE_EVENT_INJECT
+	if (call->event.type && call->class->reg &&
+	    strcmp(name, "inject") == 0) {
+		*mode = 0200;
+		*fops = &event_inject_fops;
+		return 1;
+	}
+#endif
+	return 0;
+}
+
 static int
-event_create_dir(struct dentry *parent, struct trace_event_file *file)
+event_create_dir(struct eventfs_inode *parent, struct trace_event_file *file)
 {
 	struct trace_event_call *call = file->event_call;
-	struct eventfs_file *ef_subsystem = NULL;
 	struct trace_array *tr = file->tr;
-	struct eventfs_file *ef;
+	struct eventfs_inode *e_events;
+	struct eventfs_inode *ei;
 	const char *name;
+	int nr_entries;
 	int ret;
+	static struct eventfs_entry event_entries[] = {
+		{
+			.name		= "enable",
+			.callback	= event_callback,
+		},
+		{
+			.name		= "filter",
+			.callback	= event_callback,
+		},
+		{
+			.name		= "trigger",
+			.callback	= event_callback,
+		},
+		{
+			.name		= "format",
+			.callback	= event_callback,
+		},
+#ifdef CONFIG_PERF_EVENTS
+		{
+			.name		= "id",
+			.callback	= event_callback,
+		},
+#endif
+#ifdef CONFIG_HIST_TRIGGERS
+		{
+			.name		= "hist",
+			.callback	= event_callback,
+		},
+#endif
+#ifdef CONFIG_HIST_TRIGGERS_DEBUG
+		{
+			.name		= "hist_debug",
+			.callback	= event_callback,
+		},
+#endif
+#ifdef CONFIG_TRACE_EVENT_INJECT
+		{
+			.name		= "inject",
+			.callback	= event_callback,
+		},
+#endif
+	};
 
 	/*
 	 * If the trace point header did not define TRACE_SYSTEM
@@ -2420,29 +2555,20 @@ event_create_dir(struct dentry *parent, struct trace_event_file *file)
 	if (WARN_ON_ONCE(strcmp(call->class->system, TRACE_SYSTEM) == 0))
 		return -ENODEV;
 
-	ef_subsystem = event_subsystem_dir(tr, call->class->system, file, parent);
-	if (!ef_subsystem)
+	e_events = event_subsystem_dir(tr, call->class->system, file, parent);
+	if (!e_events)
 		return -ENOMEM;
 
+	nr_entries = ARRAY_SIZE(event_entries);
+
 	name = trace_event_name(call);
-	ef = eventfs_add_dir(name, ef_subsystem);
-	if (IS_ERR(ef)) {
+	ei = eventfs_create_dir(name, e_events, event_entries, nr_entries, file);
+	if (IS_ERR(ei)) {
 		pr_warn("Could not create tracefs '%s' directory\n", name);
 		return -1;
 	}
 
-	file->ef = ef;
-
-	if (call->class->reg && !(call->flags & TRACE_EVENT_FL_IGNORE_ENABLE))
-		eventfs_add_file("enable", TRACE_MODE_WRITE, file->ef, file,
-				  &ftrace_enable_fops);
-
-#ifdef CONFIG_PERF_EVENTS
-	if (call->event.type && call->class->reg)
-		eventfs_add_file("id", TRACE_MODE_READ, file->ef,
-				  (void *)(long)call->event.type,
-				  &ftrace_event_id_fops);
-#endif
+	file->ei = ei;
 
 	ret = event_define_fields(call);
 	if (ret < 0) {
@@ -2450,35 +2576,6 @@ event_create_dir(struct dentry *parent, struct trace_event_file *file)
 		return ret;
 	}
 
-	/*
-	 * Only event directories that can be enabled should have
-	 * triggers or filters.
-	 */
-	if (!(call->flags & TRACE_EVENT_FL_IGNORE_ENABLE)) {
-		eventfs_add_file("filter", TRACE_MODE_WRITE, file->ef,
-				  file, &ftrace_event_filter_fops);
-
-		eventfs_add_file("trigger", TRACE_MODE_WRITE, file->ef,
-				  file, &event_trigger_fops);
-	}
-
-#ifdef CONFIG_HIST_TRIGGERS
-	eventfs_add_file("hist", TRACE_MODE_READ, file->ef, file,
-			  &event_hist_fops);
-#endif
-#ifdef CONFIG_HIST_TRIGGERS_DEBUG
-	eventfs_add_file("hist_debug", TRACE_MODE_READ, file->ef, file,
-			  &event_hist_debug_fops);
-#endif
-	eventfs_add_file("format", TRACE_MODE_READ, file->ef, call,
-			  &ftrace_event_format_fops);
-
-#ifdef CONFIG_TRACE_EVENT_INJECT
-	if (call->event.type && call->class->reg)
-		eventfs_add_file("inject", 0200, file->ef, file,
-				  &event_inject_fops);
-#endif
-
 	return 0;
 }
 
@@ -3623,30 +3720,65 @@ static __init int setup_trace_event(char *str)
 }
 __setup("trace_event=", setup_trace_event);
 
+static int events_callback(const char *name, umode_t *mode, void **data,
+			   const struct file_operations **fops)
+{
+	if (strcmp(name, "enable") == 0) {
+		*mode = TRACE_MODE_WRITE;
+		*fops = &ftrace_tr_enable_fops;
+		return 1;
+	}
+
+	if (strcmp(name, "header_page") == 0)
+		*data = ring_buffer_print_page_header;
+
+	else if (strcmp(name, "header_event") == 0)
+		*data = ring_buffer_print_entry_header;
+
+	else
+		return 0;
+
+	*mode = TRACE_MODE_READ;
+	*fops = &ftrace_show_header_fops;
+	return 1;
+}
+
 /* Expects to have event_mutex held when called */
 static int
 create_event_toplevel_files(struct dentry *parent, struct trace_array *tr)
 {
-	struct dentry *d_events;
+	struct eventfs_inode *e_events;
 	struct dentry *entry;
-	int error = 0;
+	int nr_entries;
+	static struct eventfs_entry events_entries[] = {
+		{
+			.name		= "enable",
+			.callback	= events_callback,
+		},
+		{
+			.name		= "header_page",
+			.callback	= events_callback,
+		},
+		{
+			.name		= "header_event",
+			.callback	= events_callback,
+		},
+	};
 
 	entry = trace_create_file("set_event", TRACE_MODE_WRITE, parent,
 				  tr, &ftrace_set_event_fops);
 	if (!entry)
 		return -ENOMEM;
 
-	d_events = eventfs_create_events_dir("events", parent);
-	if (IS_ERR(d_events)) {
+	nr_entries = ARRAY_SIZE(events_entries);
+
+	e_events = eventfs_create_events_dir("events", parent, events_entries,
+					     nr_entries, tr);
+	if (IS_ERR(e_events)) {
 		pr_warn("Could not create tracefs 'events' directory\n");
 		return -ENOMEM;
 	}
 
-	error = eventfs_add_events_file("enable", TRACE_MODE_WRITE, d_events,
-				  tr, &ftrace_tr_enable_fops);
-	if (error)
-		return -ENOMEM;
-
 	/* There are not as crucial, just warn if they are not created */
 
 	trace_create_file("set_event_pid", TRACE_MODE_WRITE, parent,
@@ -3656,16 +3788,7 @@ create_event_toplevel_files(struct dentry *parent, struct trace_array *tr)
 			  TRACE_MODE_WRITE, parent, tr,
 			  &ftrace_set_event_notrace_pid_fops);
 
-	/* ring buffer internal formats */
-	eventfs_add_events_file("header_page", TRACE_MODE_READ, d_events,
-				  ring_buffer_print_page_header,
-				  &ftrace_show_header_fops);
-
-	eventfs_add_events_file("header_event", TRACE_MODE_READ, d_events,
-				  ring_buffer_print_entry_header,
-				  &ftrace_show_header_fops);
-
-	tr->event_dir = d_events;
+	tr->event_dir = e_events;
 
 	return 0;
 }
@@ -3749,7 +3872,7 @@ int event_trace_del_tracer(struct trace_array *tr)
 
 	down_write(&trace_event_sem);
 	__trace_remove_event_dirs(tr);
-	eventfs_remove_events_dir(tr->event_dir);
+	eventfs_remove_dir(tr->event_dir);
 	up_write(&trace_event_sem);
 
 	tr->event_dir = NULL;
-- 
cgit v1.2.3


From d7e9a9037de27b642d5a3edef7c69e2a2b460287 Mon Sep 17 00:00:00 2001
From: Daniel Rosenberg <drosen@google.com>
Date: Mon, 2 Oct 2023 16:09:35 -0700
Subject: f2fs: Support Block Size == Page Size

This allows f2fs to support cases where the block size = page size for
both 4K and 16K block sizes. Other sizes should work as well, should the
need arise. This does not currently support 4K Block size filesystems if
the page size is 16K.

Signed-off-by: Daniel Rosenberg <drosen@google.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
---
 fs/f2fs/data.c          |  2 +-
 fs/f2fs/inode.c         |  2 +-
 fs/f2fs/node.c          |  2 +-
 fs/f2fs/super.c         |  4 +--
 include/linux/f2fs_fs.h | 69 +++++++++++++++++++++++++++++--------------------
 5 files changed, 46 insertions(+), 33 deletions(-)

(limited to 'include/linux')

diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c
index 96342aba8022..4e42b5f24deb 100644
--- a/fs/f2fs/data.c
+++ b/fs/f2fs/data.c
@@ -4076,7 +4076,7 @@ next:
 	sis->highest_bit = cur_lblock - 1;
 out:
 	if (not_aligned)
-		f2fs_warn(sbi, "Swapfile (%u) is not align to section: 1) creat(), 2) ioctl(F2FS_IOC_SET_PIN_FILE), 3) fallocate(%u * N)",
+		f2fs_warn(sbi, "Swapfile (%u) is not align to section: 1) creat(), 2) ioctl(F2FS_IOC_SET_PIN_FILE), 3) fallocate(%lu * N)",
 			  not_aligned, blks_per_sec * F2FS_BLKSIZE);
 	return ret;
 }
diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c
index cde243840abd..cafb81588359 100644
--- a/fs/f2fs/inode.c
+++ b/fs/f2fs/inode.c
@@ -315,7 +315,7 @@ static bool sanity_check_inode(struct inode *inode, struct page *node_page)
 			f2fs_has_inline_xattr(inode) &&
 			(!fi->i_inline_xattr_size ||
 			fi->i_inline_xattr_size > MAX_INLINE_XATTR_SIZE)) {
-			f2fs_warn(sbi, "%s: inode (ino=%lx) has corrupted i_inline_xattr_size: %d, max: %zu",
+			f2fs_warn(sbi, "%s: inode (ino=%lx) has corrupted i_inline_xattr_size: %d, max: %lu",
 				  __func__, inode->i_ino, fi->i_inline_xattr_size,
 				  MAX_INLINE_XATTR_SIZE);
 			return false;
diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c
index ee2e1dd64f25..a2b2c6c7f66d 100644
--- a/fs/f2fs/node.c
+++ b/fs/f2fs/node.c
@@ -633,7 +633,7 @@ static void f2fs_ra_node_pages(struct page *parent, int start, int n)
 
 	/* Then, try readahead for siblings of the desired node */
 	end = start + n;
-	end = min(end, NIDS_PER_BLOCK);
+	end = min(end, (int)NIDS_PER_BLOCK);
 	for (i = start; i < end; i++) {
 		nid = get_nid(parent, i, false);
 		f2fs_ra_node_page(sbi, nid);
diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c
index bc303a052215..d2eafb56af81 100644
--- a/fs/f2fs/super.c
+++ b/fs/f2fs/super.c
@@ -3502,7 +3502,7 @@ static int sanity_check_raw_super(struct f2fs_sb_info *sbi,
 		return -EFSCORRUPTED;
 	}
 
-	/* Currently, support 512/1024/2048/4096 bytes sector size */
+	/* Currently, support 512/1024/2048/4096/16K bytes sector size */
 	if (le32_to_cpu(raw_super->log_sectorsize) >
 				F2FS_MAX_LOG_SECTOR_SIZE ||
 		le32_to_cpu(raw_super->log_sectorsize) <
@@ -4948,7 +4948,7 @@ static int __init init_f2fs_fs(void)
 	int err;
 
 	if (PAGE_SIZE != F2FS_BLKSIZE) {
-		printk("F2FS not supported on PAGE_SIZE(%lu) != %d\n",
+		printk("F2FS not supported on PAGE_SIZE(%lu) != BLOCK_SIZE(%lu)\n",
 				PAGE_SIZE, F2FS_BLKSIZE);
 		return -EINVAL;
 	}
diff --git a/include/linux/f2fs_fs.h b/include/linux/f2fs_fs.h
index a82a4bb6ce68..07ed69c2840d 100644
--- a/include/linux/f2fs_fs.h
+++ b/include/linux/f2fs_fs.h
@@ -13,10 +13,10 @@
 
 #define F2FS_SUPER_OFFSET		1024	/* byte-size offset */
 #define F2FS_MIN_LOG_SECTOR_SIZE	9	/* 9 bits for 512 bytes */
-#define F2FS_MAX_LOG_SECTOR_SIZE	12	/* 12 bits for 4096 bytes */
-#define F2FS_LOG_SECTORS_PER_BLOCK	3	/* log number for sector/blk */
-#define F2FS_BLKSIZE			4096	/* support only 4KB block */
-#define F2FS_BLKSIZE_BITS		12	/* bits for F2FS_BLKSIZE */
+#define F2FS_MAX_LOG_SECTOR_SIZE	PAGE_SHIFT	/* Max is Block Size */
+#define F2FS_LOG_SECTORS_PER_BLOCK	(PAGE_SHIFT - 9) /* log number for sector/blk */
+#define F2FS_BLKSIZE			PAGE_SIZE /* support only block == page */
+#define F2FS_BLKSIZE_BITS		PAGE_SHIFT /* bits for F2FS_BLKSIZE */
 #define F2FS_MAX_EXTENSION		64	/* # of extension entries */
 #define F2FS_EXTENSION_LEN		8	/* max size of extension */
 #define F2FS_BLK_ALIGN(x)	(((x) + F2FS_BLKSIZE - 1) >> F2FS_BLKSIZE_BITS)
@@ -210,14 +210,14 @@ struct f2fs_checkpoint {
 	unsigned char sit_nat_version_bitmap[];
 } __packed;
 
-#define CP_CHKSUM_OFFSET	4092	/* default chksum offset in checkpoint */
+#define CP_CHKSUM_OFFSET	(F2FS_BLKSIZE - sizeof(__le32))	/* default chksum offset in checkpoint */
 #define CP_MIN_CHKSUM_OFFSET						\
 	(offsetof(struct f2fs_checkpoint, sit_nat_version_bitmap))
 
 /*
  * For orphan inode management
  */
-#define F2FS_ORPHANS_PER_BLOCK	1020
+#define F2FS_ORPHANS_PER_BLOCK	((F2FS_BLKSIZE - 4 * sizeof(__le32)) / sizeof(__le32))
 
 #define GET_ORPHAN_BLOCKS(n)	(((n) + F2FS_ORPHANS_PER_BLOCK - 1) / \
 					F2FS_ORPHANS_PER_BLOCK)
@@ -243,14 +243,31 @@ struct f2fs_extent {
 #define F2FS_NAME_LEN		255
 /* 200 bytes for inline xattrs by default */
 #define DEFAULT_INLINE_XATTR_ADDRS	50
-#define DEF_ADDRS_PER_INODE	923	/* Address Pointers in an Inode */
+
+#define OFFSET_OF_END_OF_I_EXT		360
+#define SIZE_OF_I_NID			20
+
+struct node_footer {
+	__le32 nid;		/* node id */
+	__le32 ino;		/* inode number */
+	__le32 flag;		/* include cold/fsync/dentry marks and offset */
+	__le64 cp_ver;		/* checkpoint version */
+	__le32 next_blkaddr;	/* next node page block address */
+} __packed;
+
+/* Address Pointers in an Inode */
+#define DEF_ADDRS_PER_INODE	((F2FS_BLKSIZE - OFFSET_OF_END_OF_I_EXT	\
+					- SIZE_OF_I_NID	\
+					- sizeof(struct node_footer)) / sizeof(__le32))
 #define CUR_ADDRS_PER_INODE(inode)	(DEF_ADDRS_PER_INODE - \
 					get_extra_isize(inode))
 #define DEF_NIDS_PER_INODE	5	/* Node IDs in an Inode */
 #define ADDRS_PER_INODE(inode)	addrs_per_inode(inode)
-#define DEF_ADDRS_PER_BLOCK	1018	/* Address Pointers in a Direct Block */
+/* Address Pointers in a Direct Block */
+#define DEF_ADDRS_PER_BLOCK	((F2FS_BLKSIZE - sizeof(struct node_footer)) / sizeof(__le32))
 #define ADDRS_PER_BLOCK(inode)	addrs_per_block(inode)
-#define NIDS_PER_BLOCK		1018	/* Node IDs in an Indirect Block */
+/* Node IDs in an Indirect Block */
+#define NIDS_PER_BLOCK		((F2FS_BLKSIZE - sizeof(struct node_footer)) / sizeof(__le32))
 
 #define ADDRS_PER_PAGE(page, inode)	\
 	(IS_INODE(page) ? ADDRS_PER_INODE(inode) : ADDRS_PER_BLOCK(inode))
@@ -342,14 +359,6 @@ enum {
 
 #define OFFSET_BIT_MASK		GENMASK(OFFSET_BIT_SHIFT - 1, 0)
 
-struct node_footer {
-	__le32 nid;		/* node id */
-	__le32 ino;		/* inode number */
-	__le32 flag;		/* include cold/fsync/dentry marks and offset */
-	__le64 cp_ver;		/* checkpoint version */
-	__le32 next_blkaddr;	/* next node page block address */
-} __packed;
-
 struct f2fs_node {
 	/* can be one of three types: inode, direct, and indirect types */
 	union {
@@ -363,7 +372,7 @@ struct f2fs_node {
 /*
  * For NAT entries
  */
-#define NAT_ENTRY_PER_BLOCK (PAGE_SIZE / sizeof(struct f2fs_nat_entry))
+#define NAT_ENTRY_PER_BLOCK (F2FS_BLKSIZE / sizeof(struct f2fs_nat_entry))
 
 struct f2fs_nat_entry {
 	__u8 version;		/* latest version of cached nat entry */
@@ -378,12 +387,13 @@ struct f2fs_nat_block {
 /*
  * For SIT entries
  *
- * Each segment is 2MB in size by default so that a bitmap for validity of
- * there-in blocks should occupy 64 bytes, 512 bits.
+ * A validity bitmap of 64 bytes covers 512 blocks of area. For a 4K page size,
+ * this results in a segment size of 2MB. For 16k pages, the default segment size
+ * is 8MB.
  * Not allow to change this.
  */
 #define SIT_VBLOCK_MAP_SIZE 64
-#define SIT_ENTRY_PER_BLOCK (PAGE_SIZE / sizeof(struct f2fs_sit_entry))
+#define SIT_ENTRY_PER_BLOCK (F2FS_BLKSIZE / sizeof(struct f2fs_sit_entry))
 
 /*
  * F2FS uses 4 bytes to represent block address. As a result, supported size of
@@ -418,7 +428,7 @@ struct f2fs_sit_block {
  * For segment summary
  *
  * One summary block contains exactly 512 summary entries, which represents
- * exactly 2MB segment by default. Not allow to change the basic units.
+ * exactly one segment by default. Not allow to change the basic units.
  *
  * NOTE: For initializing fields, you must use set_summary
  *
@@ -429,12 +439,12 @@ struct f2fs_sit_block {
  * from node's page's beginning to get a data block address.
  * ex) data_blkaddr = (block_t)(nodepage_start_address + ofs_in_node)
  */
-#define ENTRIES_IN_SUM		512
+#define ENTRIES_IN_SUM		(F2FS_BLKSIZE / 8)
 #define	SUMMARY_SIZE		(7)	/* sizeof(struct summary) */
 #define	SUM_FOOTER_SIZE		(5)	/* sizeof(struct summary_footer) */
 #define SUM_ENTRY_SIZE		(SUMMARY_SIZE * ENTRIES_IN_SUM)
 
-/* a summary entry for a 4KB-sized block in a segment */
+/* a summary entry for a block in a segment */
 struct f2fs_summary {
 	__le32 nid;		/* parent node id */
 	union {
@@ -518,7 +528,7 @@ struct f2fs_journal {
 	};
 } __packed;
 
-/* 4KB-sized summary block structure */
+/* Block-sized summary block structure */
 struct f2fs_summary_block {
 	struct f2fs_summary entries[ENTRIES_IN_SUM];
 	struct f2fs_journal journal;
@@ -559,11 +569,14 @@ typedef __le32	f2fs_hash_t;
  * Note: there are more reserved space in inline dentry than in regular
  * dentry, when converting inline dentry we should handle this carefully.
  */
-#define NR_DENTRY_IN_BLOCK	214	/* the number of dentry in a block */
+
+/* the number of dentry in a block */
+#define NR_DENTRY_IN_BLOCK	((BITS_PER_BYTE * F2FS_BLKSIZE) / \
+					((SIZE_OF_DIR_ENTRY + F2FS_SLOT_LEN) * BITS_PER_BYTE + 1))
 #define SIZE_OF_DIR_ENTRY	11	/* by byte */
 #define SIZE_OF_DENTRY_BITMAP	((NR_DENTRY_IN_BLOCK + BITS_PER_BYTE - 1) / \
 					BITS_PER_BYTE)
-#define SIZE_OF_RESERVED	(PAGE_SIZE - ((SIZE_OF_DIR_ENTRY + \
+#define SIZE_OF_RESERVED	(F2FS_BLKSIZE - ((SIZE_OF_DIR_ENTRY + \
 				F2FS_SLOT_LEN) * \
 				NR_DENTRY_IN_BLOCK + SIZE_OF_DENTRY_BITMAP))
 #define MIN_INLINE_DENTRY_SIZE		40	/* just include '.' and '..' entries */
@@ -576,7 +589,7 @@ struct f2fs_dir_entry {
 	__u8 file_type;		/* file type */
 } __packed;
 
-/* 4KB-sized directory entry block */
+/* Block-sized directory entry block */
 struct f2fs_dentry_block {
 	/* validity bitmap for directory entries in each block */
 	__u8 dentry_bitmap[SIZE_OF_DENTRY_BITMAP];
-- 
cgit v1.2.3


From 1cf56299f9bc7d4b8e1e39af08f01d6380e28173 Mon Sep 17 00:00:00 2001
From: Randy Li <ayaka@soulik.info>
Date: Fri, 15 Sep 2023 01:23:23 +0800
Subject: USB: dma: remove unused function prototype

usb_buffer_map_sg() and usb_buffer_unmap_sg() have no definition
since the beginning of v5.4. The rest are gone from 2.6.12.

Signed-off-by: Randy Li <ayaka@soulik.info>
Link: https://lore.kernel.org/r/20230914172336.18761-2-ayaka@soulik.info
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 include/linux/usb.h | 16 ----------------
 1 file changed, 16 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/usb.h b/include/linux/usb.h
index a21074861f91..8c61643acd49 100644
--- a/include/linux/usb.h
+++ b/include/linux/usb.h
@@ -1823,22 +1823,6 @@ void *usb_alloc_coherent(struct usb_device *dev, size_t size,
 void usb_free_coherent(struct usb_device *dev, size_t size,
 	void *addr, dma_addr_t dma);
 
-#if 0
-struct urb *usb_buffer_map(struct urb *urb);
-void usb_buffer_dmasync(struct urb *urb);
-void usb_buffer_unmap(struct urb *urb);
-#endif
-
-struct scatterlist;
-int usb_buffer_map_sg(const struct usb_device *dev, int is_in,
-		      struct scatterlist *sg, int nents);
-#if 0
-void usb_buffer_dmasync_sg(const struct usb_device *dev, int is_in,
-			   struct scatterlist *sg, int n_hw_ents);
-#endif
-void usb_buffer_unmap_sg(const struct usb_device *dev, int is_in,
-			 struct scatterlist *sg, int n_hw_ents);
-
 /*-------------------------------------------------------------------*
  *                         SYNCHRONOUS CALL SUPPORT                  *
  *-------------------------------------------------------------------*/
-- 
cgit v1.2.3


From 0f28ada1fbf0054557cddcdb93ad17f767105208 Mon Sep 17 00:00:00 2001
From: Jorge Sanjuan Garcia <jorge.sanjuangarcia@duagon.com>
Date: Wed, 6 Sep 2023 11:49:26 +0000
Subject: mcb: remove is_added flag from mcb_device struct

When calling mcb_bus_add_devices(), both mcb devices and the mcb
bus will attempt to attach a device to a driver because they share
the same bus_type. This causes an issue when trying to cast the
container of the device to mcb_device struct using to_mcb_device(),
leading to a wrong cast when the mcb_bus is added. A crash occurs
when freing the ida resources as the bus numbering of mcb_bus gets
confused with the is_added flag on the mcb_device struct.

The only reason for this cast was to keep an is_added flag on the
mcb_device struct that does not seem necessary. The function
device_attach() handles already bound devices and the mcb subsystem
does nothing special with this is_added flag so remove it completely.

Fixes: 18d288198099 ("mcb: Correctly initialize the bus's device")
Cc: stable <stable@kernel.org>
Signed-off-by: Jorge Sanjuan Garcia <jorge.sanjuangarcia@duagon.com>
Co-developed-by: Jose Javier Rodriguez Barbarin <JoseJavier.Rodriguez@duagon.com>
Signed-off-by: Jose Javier Rodriguez Barbarin <JoseJavier.Rodriguez@duagon.com>
Link: https://lore.kernel.org/r/20230906114901.63174-2-JoseJavier.Rodriguez@duagon.com
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/mcb/mcb-core.c  | 10 +++-------
 drivers/mcb/mcb-parse.c |  2 --
 include/linux/mcb.h     |  1 -
 3 files changed, 3 insertions(+), 10 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/mcb/mcb-core.c b/drivers/mcb/mcb-core.c
index 978fdfc19a06..0cac5bead84f 100644
--- a/drivers/mcb/mcb-core.c
+++ b/drivers/mcb/mcb-core.c
@@ -387,17 +387,13 @@ EXPORT_SYMBOL_NS_GPL(mcb_free_dev, MCB);
 
 static int __mcb_bus_add_devices(struct device *dev, void *data)
 {
-	struct mcb_device *mdev = to_mcb_device(dev);
 	int retval;
 
-	if (mdev->is_added)
-		return 0;
-
 	retval = device_attach(dev);
-	if (retval < 0)
+	if (retval < 0) {
 		dev_err(dev, "Error adding device (%d)\n", retval);
-
-	mdev->is_added = true;
+		return retval;
+	}
 
 	return 0;
 }
diff --git a/drivers/mcb/mcb-parse.c b/drivers/mcb/mcb-parse.c
index 2aef990f379f..656b6b71c768 100644
--- a/drivers/mcb/mcb-parse.c
+++ b/drivers/mcb/mcb-parse.c
@@ -99,8 +99,6 @@ static int chameleon_parse_gdd(struct mcb_bus *bus,
 	mdev->mem.end = mdev->mem.start + size - 1;
 	mdev->mem.flags = IORESOURCE_MEM;
 
-	mdev->is_added = false;
-
 	ret = mcb_device_register(bus, mdev);
 	if (ret < 0)
 		goto err;
diff --git a/include/linux/mcb.h b/include/linux/mcb.h
index 1e5893138afe..0b971b24a804 100644
--- a/include/linux/mcb.h
+++ b/include/linux/mcb.h
@@ -63,7 +63,6 @@ static inline struct mcb_bus *to_mcb_bus(struct device *dev)
 struct mcb_device {
 	struct device dev;
 	struct mcb_bus *bus;
-	bool is_added;
 	struct mcb_driver *driver;
 	u16 id;
 	int inst;
-- 
cgit v1.2.3


From e6814ec3ba1994561db9b1c05a80227d30cc18fa Mon Sep 17 00:00:00 2001
From: Xiu Jianfeng <xiujianfeng@huawei.com>
Date: Fri, 21 Jul 2023 09:06:07 +0000
Subject: perf/core: Rename perf_proc_update_handler() ->
 perf_event_max_sample_rate_handler(), for readability

Follow the naming pattern of the other sysctl handlers in perf.

Signed-off-by: Xiu Jianfeng <xiujianfeng@huawei.com>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Link: https://lore.kernel.org/r/20230721090607.172002-1-xiujianfeng@huawei.com
---
 include/linux/perf_event.h | 2 +-
 kernel/events/core.c       | 4 ++--
 kernel/sysctl.c            | 2 +-
 3 files changed, 4 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index e85cd1c0eaf3..f31f962a6445 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -1573,7 +1573,7 @@ extern int sysctl_perf_cpu_time_max_percent;
 
 extern void perf_sample_event_took(u64 sample_len_ns);
 
-int perf_proc_update_handler(struct ctl_table *table, int write,
+int perf_event_max_sample_rate_handler(struct ctl_table *table, int write,
 		void *buffer, size_t *lenp, loff_t *ppos);
 int perf_cpu_time_max_percent_handler(struct ctl_table *table, int write,
 		void *buffer, size_t *lenp, loff_t *ppos);
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 4c72a41f11af..af569196d760 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -449,8 +449,8 @@ static void update_perf_cpu_limits(void)
 
 static bool perf_rotate_context(struct perf_cpu_pmu_context *cpc);
 
-int perf_proc_update_handler(struct ctl_table *table, int write,
-		void *buffer, size_t *lenp, loff_t *ppos)
+int perf_event_max_sample_rate_handler(struct ctl_table *table, int write,
+				       void *buffer, size_t *lenp, loff_t *ppos)
 {
 	int ret;
 	int perf_cpu = sysctl_perf_cpu_time_max_percent;
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 354a2d294f52..2b6585751891 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -1983,7 +1983,7 @@ static struct ctl_table kern_table[] = {
 		.data		= &sysctl_perf_event_sample_rate,
 		.maxlen		= sizeof(sysctl_perf_event_sample_rate),
 		.mode		= 0644,
-		.proc_handler	= perf_proc_update_handler,
+		.proc_handler	= perf_event_max_sample_rate_handler,
 		.extra1		= SYSCTL_ONE,
 	},
 	{
-- 
cgit v1.2.3


From 0cff993e08a7578e2c1df93a95fc5059f447e7ae Mon Sep 17 00:00:00 2001
From: "pangzizhen001@208suo.com" <pangzizhen001@208suo.com>
Date: Thu, 20 Jul 2023 23:45:39 +0800
Subject: locking/seqlock: Fix typo in comment

s/the the
 /the

[ mingo: Cleaned up the changelog. ]

Signed-off-by: Zizhen Pang <pangzizhen001@208suo.com>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Link: https://lore.kernel.org/r/70293ecd5bb7a1cd370fd4d95c35f936@208suo.com
---
 include/linux/seqlock.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/seqlock.h b/include/linux/seqlock.h
index 987a59d977c5..ea7a58258af6 100644
--- a/include/linux/seqlock.h
+++ b/include/linux/seqlock.h
@@ -864,7 +864,7 @@ static inline unsigned read_seqretry(const seqlock_t *sl, unsigned start)
 }
 
 /*
- * For all seqlock_t write side functions, use the the internal
+ * For all seqlock_t write side functions, use the internal
  * do_write_seqcount_begin() instead of generic write_seqcount_begin().
  * This way, no redundant lockdep_assert_held() checks are added.
  */
-- 
cgit v1.2.3


From b83ce9cb4a465b8f9a3fa45561b721a9551f60e3 Mon Sep 17 00:00:00 2001
From: Christian König <christian.koenig@amd.com>
Date: Fri, 8 Sep 2023 10:27:23 +0200
Subject: dma-buf: add dma_fence_timestamp helper
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

When a fence signals there is a very small race window where the timestamp
isn't updated yet. sync_file solves this by busy waiting for the
timestamp to appear, but on other ocassions didn't handled this
correctly.

Provide a dma_fence_timestamp() helper function for this and use it in
all appropriate cases.

Another alternative would be to grab the spinlock when that happens.

v2 by teddy: add a wait parameter to wait for the timestamp to show up, in case
   the accurate timestamp is needed and/or the timestamp is not based on
   ktime (e.g. hw timestamp)
v3 chk: drop the parameter again for unified handling

Signed-off-by: Yunxiang Li <Yunxiang.Li@amd.com>
Signed-off-by: Christian König <christian.koenig@amd.com>
Fixes: 1774baa64f93 ("drm/scheduler: Change scheduled fence track v2")
Reviewed-by: Alex Deucher <alexander.deucher@amd.com>
CC: stable@vger.kernel.org
Link: https://patchwork.freedesktop.org/patch/msgid/20230929104725.2358-1-christian.koenig@amd.com
---
 drivers/dma-buf/dma-fence-unwrap.c     | 13 ++++---------
 drivers/dma-buf/sync_file.c            |  9 +++------
 drivers/gpu/drm/scheduler/sched_main.c |  2 +-
 include/linux/dma-fence.h              | 19 +++++++++++++++++++
 4 files changed, 27 insertions(+), 16 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/dma-buf/dma-fence-unwrap.c b/drivers/dma-buf/dma-fence-unwrap.c
index c625bb2b5d56..628af51c81af 100644
--- a/drivers/dma-buf/dma-fence-unwrap.c
+++ b/drivers/dma-buf/dma-fence-unwrap.c
@@ -76,16 +76,11 @@ struct dma_fence *__dma_fence_unwrap_merge(unsigned int num_fences,
 		dma_fence_unwrap_for_each(tmp, &iter[i], fences[i]) {
 			if (!dma_fence_is_signaled(tmp)) {
 				++count;
-			} else if (test_bit(DMA_FENCE_FLAG_TIMESTAMP_BIT,
-					    &tmp->flags)) {
-				if (ktime_after(tmp->timestamp, timestamp))
-					timestamp = tmp->timestamp;
 			} else {
-				/*
-				 * Use the current time if the fence is
-				 * currently signaling.
-				 */
-				timestamp = ktime_get();
+				ktime_t t = dma_fence_timestamp(tmp);
+
+				if (ktime_after(t, timestamp))
+					timestamp = t;
 			}
 		}
 	}
diff --git a/drivers/dma-buf/sync_file.c b/drivers/dma-buf/sync_file.c
index af57799c86ce..2e9a316c596a 100644
--- a/drivers/dma-buf/sync_file.c
+++ b/drivers/dma-buf/sync_file.c
@@ -268,13 +268,10 @@ static int sync_fill_fence_info(struct dma_fence *fence,
 		sizeof(info->driver_name));
 
 	info->status = dma_fence_get_status(fence);
-	while (test_bit(DMA_FENCE_FLAG_SIGNALED_BIT, &fence->flags) &&
-	       !test_bit(DMA_FENCE_FLAG_TIMESTAMP_BIT, &fence->flags))
-		cpu_relax();
 	info->timestamp_ns =
-		test_bit(DMA_FENCE_FLAG_TIMESTAMP_BIT, &fence->flags) ?
-		ktime_to_ns(fence->timestamp) :
-		ktime_set(0, 0);
+		dma_fence_is_signaled(fence) ?
+			ktime_to_ns(dma_fence_timestamp(fence)) :
+			ktime_set(0, 0);
 
 	return info->status;
 }
diff --git a/drivers/gpu/drm/scheduler/sched_main.c b/drivers/gpu/drm/scheduler/sched_main.c
index 506371c42745..5a3a622fc672 100644
--- a/drivers/gpu/drm/scheduler/sched_main.c
+++ b/drivers/gpu/drm/scheduler/sched_main.c
@@ -929,7 +929,7 @@ drm_sched_get_cleanup_job(struct drm_gpu_scheduler *sched)
 
 		if (next) {
 			next->s_fence->scheduled.timestamp =
-				job->s_fence->finished.timestamp;
+				dma_fence_timestamp(&job->s_fence->finished);
 			/* start TO timer for next job */
 			drm_sched_start_timeout(sched);
 		}
diff --git a/include/linux/dma-fence.h b/include/linux/dma-fence.h
index 0d678e9a7b24..ebe78bd3d121 100644
--- a/include/linux/dma-fence.h
+++ b/include/linux/dma-fence.h
@@ -568,6 +568,25 @@ static inline void dma_fence_set_error(struct dma_fence *fence,
 	fence->error = error;
 }
 
+/**
+ * dma_fence_timestamp - helper to get the completion timestamp of a fence
+ * @fence: fence to get the timestamp from.
+ *
+ * After a fence is signaled the timestamp is updated with the signaling time,
+ * but setting the timestamp can race with tasks waiting for the signaling. This
+ * helper busy waits for the correct timestamp to appear.
+ */
+static inline ktime_t dma_fence_timestamp(struct dma_fence *fence)
+{
+	if (WARN_ON(!test_bit(DMA_FENCE_FLAG_SIGNALED_BIT, &fence->flags)))
+		return ktime_get();
+
+	while (!test_bit(DMA_FENCE_FLAG_TIMESTAMP_BIT, &fence->flags))
+		cpu_relax();
+
+	return fence->timestamp;
+}
+
 signed long dma_fence_wait_timeout(struct dma_fence *,
 				   bool intr, signed long timeout);
 signed long dma_fence_wait_any_timeout(struct dma_fence **fences,
-- 
cgit v1.2.3


From a083c755e136844a934bc9b4416cd23b5c19c617 Mon Sep 17 00:00:00 2001
From: Masahiro Yamada <masahiroy@kernel.org>
Date: Fri, 8 Sep 2023 22:58:40 +0900
Subject: devres: rename the first parameter of devm_add_action(_or_reset)

The first parameter of devm_add_action(_or_reset) is a device.
The name 'release' is confusing because it is often used for
dr_release_t in the devres context.

Rename it to 'dev'. No functional change intended.

Signed-off-by: Masahiro Yamada <masahiroy@kernel.org>
Reviewed-by: Andy Shevchenko <andy.shevchenko@gmail.com>
Link: https://lore.kernel.org/r/20230908135840.2362708-1-masahiroy@kernel.org
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 include/linux/device.h | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/device.h b/include/linux/device.h
index 56d93a1ffb7b..d7a72a8749ea 100644
--- a/include/linux/device.h
+++ b/include/linux/device.h
@@ -389,8 +389,8 @@ void devm_remove_action(struct device *dev, void (*action)(void *), void *data);
 void devm_release_action(struct device *dev, void (*action)(void *), void *data);
 
 int __devm_add_action(struct device *dev, void (*action)(void *), void *data, const char *name);
-#define devm_add_action(release, action, data) \
-	__devm_add_action(release, action, data, #action)
+#define devm_add_action(dev, action, data) \
+	__devm_add_action(dev, action, data, #action)
 
 static inline int __devm_add_action_or_reset(struct device *dev, void (*action)(void *),
 					     void *data, const char *name)
@@ -403,8 +403,8 @@ static inline int __devm_add_action_or_reset(struct device *dev, void (*action)(
 
 	return ret;
 }
-#define devm_add_action_or_reset(release, action, data) \
-	__devm_add_action_or_reset(release, action, data, #action)
+#define devm_add_action_or_reset(dev, action, data) \
+	__devm_add_action_or_reset(dev, action, data, #action)
 
 /**
  * devm_alloc_percpu - Resource-managed alloc_percpu
-- 
cgit v1.2.3


From 5831fc1fd4a578232fea708b82de0c666ed17153 Mon Sep 17 00:00:00 2001
From: Longfang Liu <liulongfang@huawei.com>
Date: Thu, 28 Sep 2023 16:57:22 +0800
Subject: crypto: hisilicon/qm - fix PF queue parameter issue

If the queue isolation feature is enabled, the number of queues
supported by the device changes. When PF is enabled using the
current default number of queues, the default number of queues may
be greater than the number supported by the device. As a result,
the PF fails to be bound to the driver.

After modification, if queue isolation feature is enabled, when
the default queue parameter is greater than the number supported
by the device, the number of enabled queues will be changed to
the number supported by the device, so that the PF and driver
can be properly bound.

Fixes: 8bbecfb402f7 ("crypto: hisilicon/qm - add queue isolation support for Kunpeng930")
Signed-off-by: Longfang Liu <liulongfang@huawei.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 drivers/crypto/hisilicon/hpre/hpre_main.c |  5 +++++
 drivers/crypto/hisilicon/qm.c             | 18 ++++++++++++------
 drivers/crypto/hisilicon/qm_common.h      |  1 -
 drivers/crypto/hisilicon/sec2/sec_main.c  |  5 +++++
 drivers/crypto/hisilicon/zip/zip_main.c   |  5 +++++
 include/linux/hisi_acc_qm.h               |  7 +++++++
 6 files changed, 34 insertions(+), 7 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/crypto/hisilicon/hpre/hpre_main.c b/drivers/crypto/hisilicon/hpre/hpre_main.c
index db44d889438a..3dce35debf63 100644
--- a/drivers/crypto/hisilicon/hpre/hpre_main.c
+++ b/drivers/crypto/hisilicon/hpre/hpre_main.c
@@ -433,8 +433,11 @@ static u32 uacce_mode = UACCE_MODE_NOUACCE;
 module_param_cb(uacce_mode, &hpre_uacce_mode_ops, &uacce_mode, 0444);
 MODULE_PARM_DESC(uacce_mode, UACCE_MODE_DESC);
 
+static bool pf_q_num_flag;
 static int pf_q_num_set(const char *val, const struct kernel_param *kp)
 {
+	pf_q_num_flag = true;
+
 	return q_num_set(val, kp, PCI_DEVICE_ID_HUAWEI_HPRE_PF);
 }
 
@@ -1157,6 +1160,8 @@ static int hpre_qm_init(struct hisi_qm *qm, struct pci_dev *pdev)
 		qm->qp_num = pf_q_num;
 		qm->debug.curr_qm_qp_num = pf_q_num;
 		qm->qm_list = &hpre_devices;
+		if (pf_q_num_flag)
+			set_bit(QM_MODULE_PARAM, &qm->misc_ctl);
 	}
 
 	ret = hisi_qm_init(qm);
diff --git a/drivers/crypto/hisilicon/qm.c b/drivers/crypto/hisilicon/qm.c
index a99fd589445c..1638c0a7df31 100644
--- a/drivers/crypto/hisilicon/qm.c
+++ b/drivers/crypto/hisilicon/qm.c
@@ -206,8 +206,6 @@
 #define WAIT_PERIOD			20
 #define REMOVE_WAIT_DELAY		10
 
-#define QM_DRIVER_REMOVING		0
-#define QM_RST_SCHED			1
 #define QM_QOS_PARAM_NUM		2
 #define QM_QOS_MAX_VAL			1000
 #define QM_QOS_RATE			100
@@ -2824,7 +2822,6 @@ static void hisi_qm_pre_init(struct hisi_qm *qm)
 	mutex_init(&qm->mailbox_lock);
 	init_rwsem(&qm->qps_lock);
 	qm->qp_in_used = 0;
-	qm->misc_ctl = false;
 	if (test_bit(QM_SUPPORT_RPM, &qm->caps)) {
 		if (!acpi_device_power_manageable(ACPI_COMPANION(&pdev->dev)))
 			dev_info(&pdev->dev, "_PS0 and _PR0 are not defined");
@@ -5093,6 +5090,7 @@ free_eq_irq:
 
 static int qm_get_qp_num(struct hisi_qm *qm)
 {
+	struct device *dev = &qm->pdev->dev;
 	bool is_db_isolation;
 
 	/* VF's qp_num assigned by PF in v2, and VF can get qp_num by vft. */
@@ -5109,13 +5107,21 @@ static int qm_get_qp_num(struct hisi_qm *qm)
 	qm->max_qp_num = hisi_qm_get_hw_info(qm, qm_basic_info,
 					     QM_FUNC_MAX_QP_CAP, is_db_isolation);
 
-	/* check if qp number is valid */
-	if (qm->qp_num > qm->max_qp_num) {
-		dev_err(&qm->pdev->dev, "qp num(%u) is more than max qp num(%u)!\n",
+	if (qm->qp_num <= qm->max_qp_num)
+		return 0;
+
+	if (test_bit(QM_MODULE_PARAM, &qm->misc_ctl)) {
+		/* Check whether the set qp number is valid */
+		dev_err(dev, "qp num(%u) is more than max qp num(%u)!\n",
 			qm->qp_num, qm->max_qp_num);
 		return -EINVAL;
 	}
 
+	dev_info(dev, "Default qp num(%u) is too big, reset it to Function's max qp num(%u)!\n",
+		 qm->qp_num, qm->max_qp_num);
+	qm->qp_num = qm->max_qp_num;
+	qm->debug.curr_qm_qp_num = qm->qp_num;
+
 	return 0;
 }
 
diff --git a/drivers/crypto/hisilicon/qm_common.h b/drivers/crypto/hisilicon/qm_common.h
index 1406a422d455..8e36aa9c681b 100644
--- a/drivers/crypto/hisilicon/qm_common.h
+++ b/drivers/crypto/hisilicon/qm_common.h
@@ -4,7 +4,6 @@
 #define QM_COMMON_H
 
 #define QM_DBG_READ_LEN		256
-#define QM_RESETTING		2
 
 struct qm_cqe {
 	__le32 rsvd0;
diff --git a/drivers/crypto/hisilicon/sec2/sec_main.c b/drivers/crypto/hisilicon/sec2/sec_main.c
index 77f9f131b850..62bd8936a915 100644
--- a/drivers/crypto/hisilicon/sec2/sec_main.c
+++ b/drivers/crypto/hisilicon/sec2/sec_main.c
@@ -311,8 +311,11 @@ static int sec_diff_regs_show(struct seq_file *s, void *unused)
 }
 DEFINE_SHOW_ATTRIBUTE(sec_diff_regs);
 
+static bool pf_q_num_flag;
 static int sec_pf_q_num_set(const char *val, const struct kernel_param *kp)
 {
+	pf_q_num_flag = true;
+
 	return q_num_set(val, kp, PCI_DEVICE_ID_HUAWEI_SEC_PF);
 }
 
@@ -1120,6 +1123,8 @@ static int sec_qm_init(struct hisi_qm *qm, struct pci_dev *pdev)
 		qm->qp_num = pf_q_num;
 		qm->debug.curr_qm_qp_num = pf_q_num;
 		qm->qm_list = &sec_devices;
+		if (pf_q_num_flag)
+			set_bit(QM_MODULE_PARAM, &qm->misc_ctl);
 	} else if (qm->fun_type == QM_HW_VF && qm->ver == QM_HW_V1) {
 		/*
 		 * have no way to get qm configure in VM in v1 hardware,
diff --git a/drivers/crypto/hisilicon/zip/zip_main.c b/drivers/crypto/hisilicon/zip/zip_main.c
index 0d5d1ee363e4..945ab3648a87 100644
--- a/drivers/crypto/hisilicon/zip/zip_main.c
+++ b/drivers/crypto/hisilicon/zip/zip_main.c
@@ -364,8 +364,11 @@ static u32 uacce_mode = UACCE_MODE_NOUACCE;
 module_param_cb(uacce_mode, &zip_uacce_mode_ops, &uacce_mode, 0444);
 MODULE_PARM_DESC(uacce_mode, UACCE_MODE_DESC);
 
+static bool pf_q_num_flag;
 static int pf_q_num_set(const char *val, const struct kernel_param *kp)
 {
+	pf_q_num_flag = true;
+
 	return q_num_set(val, kp, PCI_DEVICE_ID_HUAWEI_ZIP_PF);
 }
 
@@ -1139,6 +1142,8 @@ static int hisi_zip_qm_init(struct hisi_qm *qm, struct pci_dev *pdev)
 		qm->qp_num = pf_q_num;
 		qm->debug.curr_qm_qp_num = pf_q_num;
 		qm->qm_list = &zip_devices;
+		if (pf_q_num_flag)
+			set_bit(QM_MODULE_PARAM, &qm->misc_ctl);
 	} else if (qm->fun_type == QM_HW_VF && qm->ver == QM_HW_V1) {
 		/*
 		 * have no way to get qm configure in VM in v1 hardware,
diff --git a/include/linux/hisi_acc_qm.h b/include/linux/hisi_acc_qm.h
index 39fbfb4be944..9da4f3f1e6d6 100644
--- a/include/linux/hisi_acc_qm.h
+++ b/include/linux/hisi_acc_qm.h
@@ -144,6 +144,13 @@ enum qm_vf_state {
 	QM_NOT_READY,
 };
 
+enum qm_misc_ctl_bits {
+	QM_DRIVER_REMOVING = 0x0,
+	QM_RST_SCHED,
+	QM_RESETTING,
+	QM_MODULE_PARAM,
+};
+
 enum qm_cap_bits {
 	QM_SUPPORT_DB_ISOLATION = 0x0,
 	QM_SUPPORT_FUNC_QOS,
-- 
cgit v1.2.3


From b42ab1c61a77832040ad42ebf9adf237360e49f7 Mon Sep 17 00:00:00 2001
From: Weili Qian <qianweili@huawei.com>
Date: Thu, 28 Sep 2023 17:21:47 +0800
Subject: crypto: hisilicon/qm - check function qp num before alg register

When the Kunpeng accelerator executes tasks such as encryption
and decryption have minimum requirements on the number of device
queues. If the number of queues does not meet the requirement,
the process initialization will fail. Therefore, the driver checks
the number of queues on the device before registering the algorithm.
If the number does not meet the requirements, the driver does not register
the algorithm to crypto subsystem, the device is still added to the
qm_list.

Signed-off-by: Weili Qian <qianweili@huawei.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 drivers/crypto/hisilicon/hpre/hpre_crypto.c | 25 +++++++++++++--
 drivers/crypto/hisilicon/hpre/hpre_main.c   | 14 ++++++---
 drivers/crypto/hisilicon/qm.c               | 47 ++++++++++-------------------
 drivers/crypto/hisilicon/sec2/sec_crypto.c  | 31 +++++++++++++++++--
 drivers/crypto/hisilicon/sec2/sec_main.c    | 24 ++++++---------
 drivers/crypto/hisilicon/zip/zip_crypto.c   | 24 ++++++++++++++-
 drivers/crypto/hisilicon/zip/zip_main.c     | 14 ++++++---
 include/linux/hisi_acc_qm.h                 | 18 +++++++++--
 8 files changed, 134 insertions(+), 63 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/crypto/hisilicon/hpre/hpre_crypto.c b/drivers/crypto/hisilicon/hpre/hpre_crypto.c
index 9a1c61be32cc..764532a6ca82 100644
--- a/drivers/crypto/hisilicon/hpre/hpre_crypto.c
+++ b/drivers/crypto/hisilicon/hpre/hpre_crypto.c
@@ -57,6 +57,9 @@ struct hpre_ctx;
 #define HPRE_DRV_ECDH_MASK_CAP		BIT(2)
 #define HPRE_DRV_X25519_MASK_CAP	BIT(5)
 
+static DEFINE_MUTEX(hpre_algs_lock);
+static unsigned int hpre_available_devs;
+
 typedef void (*hpre_cb)(struct hpre_ctx *ctx, void *sqe);
 
 struct hpre_rsa_ctx {
@@ -2202,11 +2205,17 @@ static void hpre_unregister_x25519(struct hisi_qm *qm)
 
 int hpre_algs_register(struct hisi_qm *qm)
 {
-	int ret;
+	int ret = 0;
+
+	mutex_lock(&hpre_algs_lock);
+	if (hpre_available_devs) {
+		hpre_available_devs++;
+		goto unlock;
+	}
 
 	ret = hpre_register_rsa(qm);
 	if (ret)
-		return ret;
+		goto unlock;
 
 	ret = hpre_register_dh(qm);
 	if (ret)
@@ -2220,6 +2229,9 @@ int hpre_algs_register(struct hisi_qm *qm)
 	if (ret)
 		goto unreg_ecdh;
 
+	hpre_available_devs++;
+	mutex_unlock(&hpre_algs_lock);
+
 	return ret;
 
 unreg_ecdh:
@@ -2228,13 +2240,22 @@ unreg_dh:
 	hpre_unregister_dh(qm);
 unreg_rsa:
 	hpre_unregister_rsa(qm);
+unlock:
+	mutex_unlock(&hpre_algs_lock);
 	return ret;
 }
 
 void hpre_algs_unregister(struct hisi_qm *qm)
 {
+	mutex_lock(&hpre_algs_lock);
+	if (--hpre_available_devs)
+		goto unlock;
+
 	hpre_unregister_x25519(qm);
 	hpre_unregister_ecdh(qm);
 	hpre_unregister_dh(qm);
 	hpre_unregister_rsa(qm);
+
+unlock:
+	mutex_unlock(&hpre_algs_lock);
 }
diff --git a/drivers/crypto/hisilicon/hpre/hpre_main.c b/drivers/crypto/hisilicon/hpre/hpre_main.c
index 3dce35debf63..56777099ef69 100644
--- a/drivers/crypto/hisilicon/hpre/hpre_main.c
+++ b/drivers/crypto/hisilicon/hpre/hpre_main.c
@@ -107,6 +107,7 @@
 #define HPRE_VIA_MSI_DSM		1
 #define HPRE_SQE_MASK_OFFSET		8
 #define HPRE_SQE_MASK_LEN		24
+#define HPRE_CTX_Q_NUM_DEF		1
 
 #define HPRE_DFX_BASE		0x301000
 #define HPRE_DFX_COMMON1		0x301400
@@ -1399,10 +1400,11 @@ static int hpre_probe(struct pci_dev *pdev, const struct pci_device_id *id)
 	if (ret)
 		dev_warn(&pdev->dev, "init debugfs fail!\n");
 
-	ret = hisi_qm_alg_register(qm, &hpre_devices);
+	hisi_qm_add_list(qm, &hpre_devices);
+	ret = hisi_qm_alg_register(qm, &hpre_devices, HPRE_CTX_Q_NUM_DEF);
 	if (ret < 0) {
 		pci_err(pdev, "fail to register algs to crypto!\n");
-		goto err_with_qm_start;
+		goto err_qm_del_list;
 	}
 
 	if (qm->uacce) {
@@ -1424,9 +1426,10 @@ static int hpre_probe(struct pci_dev *pdev, const struct pci_device_id *id)
 	return 0;
 
 err_with_alg_register:
-	hisi_qm_alg_unregister(qm, &hpre_devices);
+	hisi_qm_alg_unregister(qm, &hpre_devices, HPRE_CTX_Q_NUM_DEF);
 
-err_with_qm_start:
+err_qm_del_list:
+	hisi_qm_del_list(qm, &hpre_devices);
 	hpre_debugfs_exit(qm);
 	hisi_qm_stop(qm, QM_NORMAL);
 
@@ -1446,7 +1449,8 @@ static void hpre_remove(struct pci_dev *pdev)
 
 	hisi_qm_pm_uninit(qm);
 	hisi_qm_wait_task_finish(qm, &hpre_devices);
-	hisi_qm_alg_unregister(qm, &hpre_devices);
+	hisi_qm_alg_unregister(qm, &hpre_devices, HPRE_CTX_Q_NUM_DEF);
+	hisi_qm_del_list(qm, &hpre_devices);
 	if (qm->fun_type == QM_HW_PF && qm->vfs_num)
 		hisi_qm_sriov_disable(pdev, true);
 
diff --git a/drivers/crypto/hisilicon/qm.c b/drivers/crypto/hisilicon/qm.c
index 458d1fe42a24..f3b55c044dd3 100644
--- a/drivers/crypto/hisilicon/qm.c
+++ b/drivers/crypto/hisilicon/qm.c
@@ -4860,63 +4860,48 @@ static void qm_cmd_process(struct work_struct *cmd_process)
 }
 
 /**
- * hisi_qm_alg_register() - Register alg to crypto and add qm to qm_list.
+ * hisi_qm_alg_register() - Register alg to crypto.
  * @qm: The qm needs add.
  * @qm_list: The qm list.
+ * @guard: Guard of qp_num.
  *
- * This function adds qm to qm list, and will register algorithm to
- * crypto when the qm list is empty.
+ * Register algorithm to crypto when the function is satisfy guard.
  */
-int hisi_qm_alg_register(struct hisi_qm *qm, struct hisi_qm_list *qm_list)
+int hisi_qm_alg_register(struct hisi_qm *qm, struct hisi_qm_list *qm_list, int guard)
 {
 	struct device *dev = &qm->pdev->dev;
-	int flag = 0;
-	int ret = 0;
-
-	mutex_lock(&qm_list->lock);
-	if (list_empty(&qm_list->list))
-		flag = 1;
-	list_add_tail(&qm->list, &qm_list->list);
-	mutex_unlock(&qm_list->lock);
 
 	if (qm->ver <= QM_HW_V2 && qm->use_sva) {
 		dev_info(dev, "HW V2 not both use uacce sva mode and hardware crypto algs.\n");
 		return 0;
 	}
 
-	if (flag) {
-		ret = qm_list->register_to_crypto(qm);
-		if (ret) {
-			mutex_lock(&qm_list->lock);
-			list_del(&qm->list);
-			mutex_unlock(&qm_list->lock);
-		}
+	if (qm->qp_num < guard) {
+		dev_info(dev, "qp_num is less than task need.\n");
+		return 0;
 	}
 
-	return ret;
+	return qm_list->register_to_crypto(qm);
 }
 EXPORT_SYMBOL_GPL(hisi_qm_alg_register);
 
 /**
- * hisi_qm_alg_unregister() - Unregister alg from crypto and delete qm from
- * qm list.
+ * hisi_qm_alg_unregister() - Unregister alg from crypto.
  * @qm: The qm needs delete.
  * @qm_list: The qm list.
+ * @guard: Guard of qp_num.
  *
- * This function deletes qm from qm list, and will unregister algorithm
- * from crypto when the qm list is empty.
+ * Unregister algorithm from crypto when the last function is satisfy guard.
  */
-void hisi_qm_alg_unregister(struct hisi_qm *qm, struct hisi_qm_list *qm_list)
+void hisi_qm_alg_unregister(struct hisi_qm *qm, struct hisi_qm_list *qm_list, int guard)
 {
-	mutex_lock(&qm_list->lock);
-	list_del(&qm->list);
-	mutex_unlock(&qm_list->lock);
-
 	if (qm->ver <= QM_HW_V2 && qm->use_sva)
 		return;
 
-	if (list_empty(&qm_list->list))
-		qm_list->unregister_from_crypto(qm);
+	if (qm->qp_num < guard)
+		return;
+
+	qm_list->unregister_from_crypto(qm);
 }
 EXPORT_SYMBOL_GPL(hisi_qm_alg_unregister);
 
diff --git a/drivers/crypto/hisilicon/sec2/sec_crypto.c b/drivers/crypto/hisilicon/sec2/sec_crypto.c
index ed77711e809e..6fcabbc87860 100644
--- a/drivers/crypto/hisilicon/sec2/sec_crypto.c
+++ b/drivers/crypto/hisilicon/sec2/sec_crypto.c
@@ -104,6 +104,9 @@
 #define IV_CTR_INIT		0x1
 #define IV_BYTE_OFFSET		0x8
 
+static DEFINE_MUTEX(sec_algs_lock);
+static unsigned int sec_available_devs;
+
 struct sec_skcipher {
 	u64 alg_msk;
 	struct skcipher_alg alg;
@@ -2545,16 +2548,31 @@ err:
 int sec_register_to_crypto(struct hisi_qm *qm)
 {
 	u64 alg_mask = sec_get_alg_bitmap(qm, SEC_DRV_ALG_BITMAP_HIGH, SEC_DRV_ALG_BITMAP_LOW);
-	int ret;
+	int ret = 0;
+
+	mutex_lock(&sec_algs_lock);
+	if (sec_available_devs) {
+		sec_available_devs++;
+		goto unlock;
+	}
 
 	ret = sec_register_skcipher(alg_mask);
 	if (ret)
-		return ret;
+		goto unlock;
 
 	ret = sec_register_aead(alg_mask);
 	if (ret)
-		sec_unregister_skcipher(alg_mask, ARRAY_SIZE(sec_skciphers));
+		goto unreg_skcipher;
 
+	sec_available_devs++;
+	mutex_unlock(&sec_algs_lock);
+
+	return 0;
+
+unreg_skcipher:
+	sec_unregister_skcipher(alg_mask, ARRAY_SIZE(sec_skciphers));
+unlock:
+	mutex_unlock(&sec_algs_lock);
 	return ret;
 }
 
@@ -2562,6 +2580,13 @@ void sec_unregister_from_crypto(struct hisi_qm *qm)
 {
 	u64 alg_mask = sec_get_alg_bitmap(qm, SEC_DRV_ALG_BITMAP_HIGH, SEC_DRV_ALG_BITMAP_LOW);
 
+	mutex_lock(&sec_algs_lock);
+	if (--sec_available_devs)
+		goto unlock;
+
 	sec_unregister_aead(alg_mask, ARRAY_SIZE(sec_aeads));
 	sec_unregister_skcipher(alg_mask, ARRAY_SIZE(sec_skciphers));
+
+unlock:
+	mutex_unlock(&sec_algs_lock);
 }
diff --git a/drivers/crypto/hisilicon/sec2/sec_main.c b/drivers/crypto/hisilicon/sec2/sec_main.c
index 62bd8936a915..0e56a47eb862 100644
--- a/drivers/crypto/hisilicon/sec2/sec_main.c
+++ b/drivers/crypto/hisilicon/sec2/sec_main.c
@@ -1234,15 +1234,11 @@ static int sec_probe(struct pci_dev *pdev, const struct pci_device_id *id)
 	if (ret)
 		pci_warn(pdev, "Failed to init debugfs!\n");
 
-	if (qm->qp_num >= ctx_q_num) {
-		ret = hisi_qm_alg_register(qm, &sec_devices);
-		if (ret < 0) {
-			pr_err("Failed to register driver to crypto.\n");
-			goto err_qm_stop;
-		}
-	} else {
-		pci_warn(qm->pdev,
-			"Failed to use kernel mode, qp not enough!\n");
+	hisi_qm_add_list(qm, &sec_devices);
+	ret = hisi_qm_alg_register(qm, &sec_devices, ctx_q_num);
+	if (ret < 0) {
+		pr_err("Failed to register driver to crypto.\n");
+		goto err_qm_del_list;
 	}
 
 	if (qm->uacce) {
@@ -1264,9 +1260,9 @@ static int sec_probe(struct pci_dev *pdev, const struct pci_device_id *id)
 	return 0;
 
 err_alg_unregister:
-	if (qm->qp_num >= ctx_q_num)
-		hisi_qm_alg_unregister(qm, &sec_devices);
-err_qm_stop:
+	hisi_qm_alg_unregister(qm, &sec_devices, ctx_q_num);
+err_qm_del_list:
+	hisi_qm_del_list(qm, &sec_devices);
 	sec_debugfs_exit(qm);
 	hisi_qm_stop(qm, QM_NORMAL);
 err_probe_uninit:
@@ -1283,8 +1279,8 @@ static void sec_remove(struct pci_dev *pdev)
 
 	hisi_qm_pm_uninit(qm);
 	hisi_qm_wait_task_finish(qm, &sec_devices);
-	if (qm->qp_num >= ctx_q_num)
-		hisi_qm_alg_unregister(qm, &sec_devices);
+	hisi_qm_alg_unregister(qm, &sec_devices, ctx_q_num);
+	hisi_qm_del_list(qm, &sec_devices);
 
 	if (qm->fun_type == QM_HW_PF && qm->vfs_num)
 		hisi_qm_sriov_disable(pdev, true);
diff --git a/drivers/crypto/hisilicon/zip/zip_crypto.c b/drivers/crypto/hisilicon/zip/zip_crypto.c
index 636ac794ebb7..c650c741a18d 100644
--- a/drivers/crypto/hisilicon/zip/zip_crypto.c
+++ b/drivers/crypto/hisilicon/zip/zip_crypto.c
@@ -25,6 +25,9 @@
 
 #define HZIP_ALG_DEFLATE			GENMASK(5, 4)
 
+static DEFINE_MUTEX(zip_algs_lock);
+static unsigned int zip_available_devs;
+
 enum hisi_zip_alg_type {
 	HZIP_ALG_TYPE_COMP = 0,
 	HZIP_ALG_TYPE_DECOMP = 1,
@@ -618,10 +621,29 @@ static void hisi_zip_unregister_deflate(struct hisi_qm *qm)
 
 int hisi_zip_register_to_crypto(struct hisi_qm *qm)
 {
-	return hisi_zip_register_deflate(qm);
+	int ret = 0;
+
+	mutex_lock(&zip_algs_lock);
+	if (zip_available_devs++)
+		goto unlock;
+
+	ret = hisi_zip_register_deflate(qm);
+	if (ret)
+		zip_available_devs--;
+
+unlock:
+	mutex_unlock(&zip_algs_lock);
+	return ret;
 }
 
 void hisi_zip_unregister_from_crypto(struct hisi_qm *qm)
 {
+	mutex_lock(&zip_algs_lock);
+	if (--zip_available_devs)
+		goto unlock;
+
 	hisi_zip_unregister_deflate(qm);
+
+unlock:
+	mutex_unlock(&zip_algs_lock);
 }
diff --git a/drivers/crypto/hisilicon/zip/zip_main.c b/drivers/crypto/hisilicon/zip/zip_main.c
index 945ab3648a87..db4c964cd649 100644
--- a/drivers/crypto/hisilicon/zip/zip_main.c
+++ b/drivers/crypto/hisilicon/zip/zip_main.c
@@ -66,6 +66,7 @@
 #define HZIP_SQE_SIZE			128
 #define HZIP_PF_DEF_Q_NUM		64
 #define HZIP_PF_DEF_Q_BASE		0
+#define HZIP_CTX_Q_NUM_DEF		2
 
 #define HZIP_SOFT_CTRL_CNT_CLR_CE	0x301000
 #define HZIP_SOFT_CTRL_CNT_CLR_CE_BIT	BIT(0)
@@ -1231,10 +1232,11 @@ static int hisi_zip_probe(struct pci_dev *pdev, const struct pci_device_id *id)
 	if (ret)
 		pci_err(pdev, "failed to init debugfs (%d)!\n", ret);
 
-	ret = hisi_qm_alg_register(qm, &zip_devices);
+	hisi_qm_add_list(qm, &zip_devices);
+	ret = hisi_qm_alg_register(qm, &zip_devices, HZIP_CTX_Q_NUM_DEF);
 	if (ret < 0) {
 		pci_err(pdev, "failed to register driver to crypto!\n");
-		goto err_qm_stop;
+		goto err_qm_del_list;
 	}
 
 	if (qm->uacce) {
@@ -1256,9 +1258,10 @@ static int hisi_zip_probe(struct pci_dev *pdev, const struct pci_device_id *id)
 	return 0;
 
 err_qm_alg_unregister:
-	hisi_qm_alg_unregister(qm, &zip_devices);
+	hisi_qm_alg_unregister(qm, &zip_devices, HZIP_CTX_Q_NUM_DEF);
 
-err_qm_stop:
+err_qm_del_list:
+	hisi_qm_del_list(qm, &zip_devices);
 	hisi_zip_debugfs_exit(qm);
 	hisi_qm_stop(qm, QM_NORMAL);
 
@@ -1278,7 +1281,8 @@ static void hisi_zip_remove(struct pci_dev *pdev)
 
 	hisi_qm_pm_uninit(qm);
 	hisi_qm_wait_task_finish(qm, &zip_devices);
-	hisi_qm_alg_unregister(qm, &zip_devices);
+	hisi_qm_alg_unregister(qm, &zip_devices, HZIP_CTX_Q_NUM_DEF);
+	hisi_qm_del_list(qm, &zip_devices);
 
 	if (qm->fun_type == QM_HW_PF && qm->vfs_num)
 		hisi_qm_sriov_disable(pdev, true);
diff --git a/include/linux/hisi_acc_qm.h b/include/linux/hisi_acc_qm.h
index 9da4f3f1e6d6..34c64a02712c 100644
--- a/include/linux/hisi_acc_qm.h
+++ b/include/linux/hisi_acc_qm.h
@@ -478,6 +478,20 @@ static inline void hisi_qm_init_list(struct hisi_qm_list *qm_list)
 	mutex_init(&qm_list->lock);
 }
 
+static inline void hisi_qm_add_list(struct hisi_qm *qm, struct hisi_qm_list *qm_list)
+{
+	mutex_lock(&qm_list->lock);
+	list_add_tail(&qm->list, &qm_list->list);
+	mutex_unlock(&qm_list->lock);
+}
+
+static inline void hisi_qm_del_list(struct hisi_qm *qm, struct hisi_qm_list *qm_list)
+{
+	mutex_lock(&qm_list->lock);
+	list_del(&qm->list);
+	mutex_unlock(&qm_list->lock);
+}
+
 int hisi_qm_init(struct hisi_qm *qm);
 void hisi_qm_uninit(struct hisi_qm *qm);
 int hisi_qm_start(struct hisi_qm *qm);
@@ -523,8 +537,8 @@ int hisi_qm_alloc_qps_node(struct hisi_qm_list *qm_list, int qp_num,
 void hisi_qm_free_qps(struct hisi_qp **qps, int qp_num);
 void hisi_qm_dev_shutdown(struct pci_dev *pdev);
 void hisi_qm_wait_task_finish(struct hisi_qm *qm, struct hisi_qm_list *qm_list);
-int hisi_qm_alg_register(struct hisi_qm *qm, struct hisi_qm_list *qm_list);
-void hisi_qm_alg_unregister(struct hisi_qm *qm, struct hisi_qm_list *qm_list);
+int hisi_qm_alg_register(struct hisi_qm *qm, struct hisi_qm_list *qm_list, int guard);
+void hisi_qm_alg_unregister(struct hisi_qm *qm, struct hisi_qm_list *qm_list, int guard);
 int hisi_qm_resume(struct device *dev);
 int hisi_qm_suspend(struct device *dev);
 void hisi_qm_pm_uninit(struct hisi_qm *qm);
-- 
cgit v1.2.3


From 8468516f9f93a41dc65158b6428a1a1039c68f20 Mon Sep 17 00:00:00 2001
From: Dimitri John Ledkov <dimitri.ledkov@canonical.com>
Date: Mon, 2 Oct 2023 00:57:15 +0100
Subject: crypto: pkcs7 - remove md4 md5 x.509 support

Remove support for md4 md5 hash and signatures in x.509 certificate
parsers, pkcs7 signature parser, authenticode parser.

All of these are insecure or broken, and everyone has long time ago
migrated to alternative hash implementations.

Also remove md2 & md3 oids which have already didn't have support.

This is also likely the last user of md4 in the kernel, and thus
crypto/md4.c and related tests in tcrypt & testmgr can likely be
removed. Other users such as cifs smbfs ext modpost sumversions have
their own internal implementation as needed.

Signed-off-by: Dimitri John Ledkov <dimitri.ledkov@canonical.com>
Reviewed-by: Jarkko Sakkinen <jarkko@kernel.org>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 crypto/asymmetric_keys/mscode_parser.c    | 6 ------
 crypto/asymmetric_keys/pkcs7_parser.c     | 6 ------
 crypto/asymmetric_keys/x509_cert_parser.c | 6 ------
 include/linux/oid_registry.h              | 8 --------
 4 files changed, 26 deletions(-)

(limited to 'include/linux')

diff --git a/crypto/asymmetric_keys/mscode_parser.c b/crypto/asymmetric_keys/mscode_parser.c
index 839591ad21ac..690405ebe77b 100644
--- a/crypto/asymmetric_keys/mscode_parser.c
+++ b/crypto/asymmetric_keys/mscode_parser.c
@@ -75,12 +75,6 @@ int mscode_note_digest_algo(void *context, size_t hdrlen,
 
 	oid = look_up_OID(value, vlen);
 	switch (oid) {
-	case OID_md4:
-		ctx->digest_algo = "md4";
-		break;
-	case OID_md5:
-		ctx->digest_algo = "md5";
-		break;
 	case OID_sha1:
 		ctx->digest_algo = "sha1";
 		break;
diff --git a/crypto/asymmetric_keys/pkcs7_parser.c b/crypto/asymmetric_keys/pkcs7_parser.c
index 277482bb1777..cf4caab9620f 100644
--- a/crypto/asymmetric_keys/pkcs7_parser.c
+++ b/crypto/asymmetric_keys/pkcs7_parser.c
@@ -227,12 +227,6 @@ int pkcs7_sig_note_digest_algo(void *context, size_t hdrlen,
 	struct pkcs7_parse_context *ctx = context;
 
 	switch (ctx->last_oid) {
-	case OID_md4:
-		ctx->sinfo->sig->hash_algo = "md4";
-		break;
-	case OID_md5:
-		ctx->sinfo->sig->hash_algo = "md5";
-		break;
 	case OID_sha1:
 		ctx->sinfo->sig->hash_algo = "sha1";
 		break;
diff --git a/crypto/asymmetric_keys/x509_cert_parser.c b/crypto/asymmetric_keys/x509_cert_parser.c
index 0a7049b470c1..2c30928621b7 100644
--- a/crypto/asymmetric_keys/x509_cert_parser.c
+++ b/crypto/asymmetric_keys/x509_cert_parser.c
@@ -195,15 +195,9 @@ int x509_note_sig_algo(void *context, size_t hdrlen, unsigned char tag,
 	pr_debug("PubKey Algo: %u\n", ctx->last_oid);
 
 	switch (ctx->last_oid) {
-	case OID_md2WithRSAEncryption:
-	case OID_md3WithRSAEncryption:
 	default:
 		return -ENOPKG; /* Unsupported combination */
 
-	case OID_md4WithRSAEncryption:
-		ctx->cert->sig->hash_algo = "md4";
-		goto rsa_pkcs1;
-
 	case OID_sha1WithRSAEncryption:
 		ctx->cert->sig->hash_algo = "sha1";
 		goto rsa_pkcs1;
diff --git a/include/linux/oid_registry.h b/include/linux/oid_registry.h
index f86a08ba0207..4d04fa5d1eec 100644
--- a/include/linux/oid_registry.h
+++ b/include/linux/oid_registry.h
@@ -30,9 +30,6 @@ enum OID {
 
 	/* PKCS#1 {iso(1) member-body(2) us(840) rsadsi(113549) pkcs(1) pkcs-1(1)} */
 	OID_rsaEncryption,		/* 1.2.840.113549.1.1.1 */
-	OID_md2WithRSAEncryption,	/* 1.2.840.113549.1.1.2 */
-	OID_md3WithRSAEncryption,	/* 1.2.840.113549.1.1.3 */
-	OID_md4WithRSAEncryption,	/* 1.2.840.113549.1.1.4 */
 	OID_sha1WithRSAEncryption,	/* 1.2.840.113549.1.1.5 */
 	OID_sha256WithRSAEncryption,	/* 1.2.840.113549.1.1.11 */
 	OID_sha384WithRSAEncryption,	/* 1.2.840.113549.1.1.12 */
@@ -49,11 +46,6 @@ enum OID {
 	OID_smimeCapabilites,		/* 1.2.840.113549.1.9.15 */
 	OID_smimeAuthenticatedAttrs,	/* 1.2.840.113549.1.9.16.2.11 */
 
-	/* {iso(1) member-body(2) us(840) rsadsi(113549) digestAlgorithm(2)} */
-	OID_md2,			/* 1.2.840.113549.2.2 */
-	OID_md4,			/* 1.2.840.113549.2.4 */
-	OID_md5,			/* 1.2.840.113549.2.5 */
-
 	OID_mskrb5,			/* 1.2.840.48018.1.2.2 */
 	OID_krb5,			/* 1.2.840.113554.1.2.2 */
 	OID_krb5u2u,			/* 1.2.840.113554.1.2.2.3 */
-- 
cgit v1.2.3


From 7523d330aac7190f738998a52df8d5aa14293280 Mon Sep 17 00:00:00 2001
From: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Date: Mon, 4 Sep 2023 13:40:46 +0300
Subject: device property: Clarify usage scope of some struct fwnode_handle
 members

Most of the struct fwnode_handle members are for exclusive use with
device links framework. Clarify this by adding a respective comment.

Signed-off-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Reviewed-by: Heikki Krogerus <heikki.krogerus@linux.intel.com>
Reviewed-by: Sakari Ailus <sakari.ailus@linux.intel.com>
Link: https://lore.kernel.org/r/20230904104046.1682875-1-andriy.shevchenko@linux.intel.com
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 include/linux/fwnode.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/fwnode.h b/include/linux/fwnode.h
index 5700451b300f..2a72f55d26eb 100644
--- a/include/linux/fwnode.h
+++ b/include/linux/fwnode.h
@@ -41,6 +41,8 @@ struct device;
 struct fwnode_handle {
 	struct fwnode_handle *secondary;
 	const struct fwnode_operations *ops;
+
+	/* The below is used solely by device links, don't use otherwise */
 	struct device *dev;
 	struct list_head suppliers;
 	struct list_head consumers;
-- 
cgit v1.2.3


From 1dc05a274a7b13fd61b6c43f0136153752e6f731 Mon Sep 17 00:00:00 2001
From: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Date: Wed, 20 Sep 2023 18:38:19 +0300
Subject: device property: Replace custom implementation of COUNT_ARGS()

Replace custom and non-portable implementation of COUNT_ARGS().

Fixes: e64b674bc9d7 ("software node: implement reference properties")
Reported-by: Nick Desaulniers <ndesaulniers@google.com>
Closes: https://lore.kernel.org/r/ZQoILN6QCjzosCOs@google.com
Signed-off-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Reviewed-by: Takashi Iwai <tiwai@suse.de>
Closes: https://github.com/ClangBuiltLinux/linux/issues/1935
Reviewed-by: Nick Desaulniers <ndesaulniers@google.com>
Link: https://lore.kernel.org/r/20230920153819.2069869-1-andriy.shevchenko@linux.intel.com
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 include/linux/property.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/property.h b/include/linux/property.h
index 8c3c6685a2ae..9f2585d705a8 100644
--- a/include/linux/property.h
+++ b/include/linux/property.h
@@ -10,6 +10,7 @@
 #ifndef _LINUX_PROPERTY_H_
 #define _LINUX_PROPERTY_H_
 
+#include <linux/args.h>
 #include <linux/bits.h>
 #include <linux/fwnode.h>
 #include <linux/stddef.h>
@@ -288,7 +289,7 @@ struct software_node_ref_args {
 #define SOFTWARE_NODE_REFERENCE(_ref_, ...)			\
 (const struct software_node_ref_args) {				\
 	.node = _ref_,						\
-	.nargs = ARRAY_SIZE(((u64[]){ 0, ##__VA_ARGS__ })) - 1,	\
+	.nargs = COUNT_ARGS(__VA_ARGS__),			\
 	.args = { __VA_ARGS__ },				\
 }
 
-- 
cgit v1.2.3


From a56cc0a8338523f709892696cc229527617c1316 Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rafael.j.wysocki@intel.com>
Date: Tue, 3 Oct 2023 15:17:24 +0200
Subject: thermal: core: Add function to walk trips under zone lock

Add a wrapper around for_each_thermal_trip(), called
thermal_zone_for_each_trip(), that will invoke the former under the
thermal zone lock and pass its return value to the caller.

Two drivers will be modified subsequently to use this new function.

No functional impact.

Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Reviewed-by: Daniel Lezcano <daniel.lezcano@linaro.org>
---
 drivers/thermal/thermal_trip.c | 14 ++++++++++++++
 include/linux/thermal.h        |  3 +++
 2 files changed, 17 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/thermal/thermal_trip.c b/drivers/thermal/thermal_trip.c
index a8e92a89b2b8..8c649a899537 100644
--- a/drivers/thermal/thermal_trip.c
+++ b/drivers/thermal/thermal_trip.c
@@ -27,6 +27,20 @@ int for_each_thermal_trip(struct thermal_zone_device *tz,
 }
 EXPORT_SYMBOL_GPL(for_each_thermal_trip);
 
+int thermal_zone_for_each_trip(struct thermal_zone_device *tz,
+			       int (*cb)(struct thermal_trip *, void *),
+			       void *data)
+{
+	int ret;
+
+	mutex_lock(&tz->lock);
+	ret = for_each_thermal_trip(tz, cb, data);
+	mutex_unlock(&tz->lock);
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(thermal_zone_for_each_trip);
+
 int thermal_zone_get_num_trips(struct thermal_zone_device *tz)
 {
 	return tz->num_trips;
diff --git a/include/linux/thermal.h b/include/linux/thermal.h
index 6710a4ace992..2bab72149bbf 100644
--- a/include/linux/thermal.h
+++ b/include/linux/thermal.h
@@ -286,6 +286,9 @@ int thermal_zone_set_trip(struct thermal_zone_device *tz, int trip_id,
 int for_each_thermal_trip(struct thermal_zone_device *tz,
 			  int (*cb)(struct thermal_trip *, void *),
 			  void *data);
+int thermal_zone_for_each_trip(struct thermal_zone_device *tz,
+			       int (*cb)(struct thermal_trip *, void *),
+			       void *data);
 int thermal_zone_get_num_trips(struct thermal_zone_device *tz);
 
 int thermal_zone_get_crit_temp(struct thermal_zone_device *tz, int *temp);
-- 
cgit v1.2.3


From 4963e34ce7b95237021575d208fa576f88697839 Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rafael.j.wysocki@intel.com>
Date: Tue, 3 Oct 2023 15:25:33 +0200
Subject: thermal: core: Drop thermal_zone_device_exec()

Because thermal_zone_device_exec() has no users any more and there are
no plans to use it anywhere, revert commit 9a99a996d1ec ("thermal: core:
Introduce thermal_zone_device_exec()") that introduced it.

No functional impact.

Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Acked-by: Daniel Lezcano <daniel.lezcano@linaro.org>
---
 drivers/thermal/thermal_core.c | 19 -------------------
 include/linux/thermal.h        |  4 ----
 2 files changed, 23 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/thermal/thermal_core.c b/drivers/thermal/thermal_core.c
index 45d0aa0b69b7..8ee22eb804d3 100644
--- a/drivers/thermal/thermal_core.c
+++ b/drivers/thermal/thermal_core.c
@@ -495,25 +495,6 @@ void thermal_zone_device_update(struct thermal_zone_device *tz,
 }
 EXPORT_SYMBOL_GPL(thermal_zone_device_update);
 
-/**
- * thermal_zone_device_exec - Run a callback under the zone lock.
- * @tz: Thermal zone.
- * @cb: Callback to run.
- * @data: Data to pass to the callback.
- */
-void thermal_zone_device_exec(struct thermal_zone_device *tz,
-			      void (*cb)(struct thermal_zone_device *,
-					 unsigned long),
-			      unsigned long data)
-{
-	mutex_lock(&tz->lock);
-
-	cb(tz, data);
-
-	mutex_unlock(&tz->lock);
-}
-EXPORT_SYMBOL_GPL(thermal_zone_device_exec);
-
 static void thermal_zone_device_check(struct work_struct *work)
 {
 	struct thermal_zone_device *tz = container_of(work, struct
diff --git a/include/linux/thermal.h b/include/linux/thermal.h
index 2bab72149bbf..c8600e313909 100644
--- a/include/linux/thermal.h
+++ b/include/linux/thermal.h
@@ -339,10 +339,6 @@ int thermal_zone_unbind_cooling_device(struct thermal_zone_device *, int,
 				       struct thermal_cooling_device *);
 void thermal_zone_device_update(struct thermal_zone_device *,
 				enum thermal_notify_event);
-void thermal_zone_device_exec(struct thermal_zone_device *tz,
-			      void (*cb)(struct thermal_zone_device *,
-					 unsigned long),
-			      unsigned long data);
 
 struct thermal_cooling_device *thermal_cooling_device_register(const char *,
 		void *, const struct thermal_cooling_device_ops *);
-- 
cgit v1.2.3


From c62f5032f72a745542aa6a7e777c7819c96adfbe Mon Sep 17 00:00:00 2001
From: Ian Abbott <abbotti@mev.co.uk>
Date: Wed, 13 Sep 2023 18:07:01 +0100
Subject: comedi: comedi_8254: Use a call-back function for register access

Rework the comedi_8254 module to use a call-back function for register
access.  This will make it easier to isolate the parts that will depend
on the `CONFIG_HAS_IOPORT` macro being defined and also allows the
possibility of supplying an external callback function during
initialization by a variant of the `comedi_8254_init()` and
`comedi_8254_mm_init()` functions, although that has not been
implemented yet.

The `struct comedi_8254` members have been changed to use a pointer to a
callback function and a context of type `unsigned long`.  The
`comedi_8254_init()` and `comedi_8254_mm_init()` functions use an
internal callback function and set the context to the base address of
the registers (for `comedi_8254_mm_init()` that involves converting a
`void __iomem *` to `unsigned long`).

A minor change to `dio200_subdev_8254_offset()` in the
amplc_dio200_common module has been made due to the changes in `struct
comedi_8254`.

Cc: Arnd Bergmann <arnd@kernel.org>
Cc: Niklas Schnelle <schnelle@linux.ibm.com>
Signed-off-by: Ian Abbott <abbotti@mev.co.uk>
Link: https://lore.kernel.org/r/20230913170712.111719-3-abbotti@mev.co.uk
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/comedi/drivers/amplc_dio200_common.c |   4 +-
 drivers/comedi/drivers/comedi_8254.c         | 177 +++++++++++++++++++--------
 include/linux/comedi/comedi_8254.h           |  22 +++-
 3 files changed, 144 insertions(+), 59 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/comedi/drivers/amplc_dio200_common.c b/drivers/comedi/drivers/amplc_dio200_common.c
index ff651f2eb86c..2c1507a23f8a 100644
--- a/drivers/comedi/drivers/amplc_dio200_common.c
+++ b/drivers/comedi/drivers/amplc_dio200_common.c
@@ -149,9 +149,9 @@ static unsigned int dio200_subdev_8254_offset(struct comedi_device *dev,
 
 	/* get the offset that was passed to comedi_8254_*_init() */
 	if (dev->mmio)
-		offset = i8254->mmio - dev->mmio;
+		offset = (void __iomem *)i8254->context - dev->mmio;
 	else
-		offset = i8254->iobase - dev->iobase;
+		offset = i8254->context - dev->iobase;
 
 	/* remove the shift that was added for PCIe boards */
 	if (board->is_pcie)
diff --git a/drivers/comedi/drivers/comedi_8254.c b/drivers/comedi/drivers/comedi_8254.c
index b4185c1b2695..3f8657fc7ee5 100644
--- a/drivers/comedi/drivers/comedi_8254.c
+++ b/drivers/comedi/drivers/comedi_8254.c
@@ -119,63 +119,101 @@
 #include <linux/comedi/comedidev.h>
 #include <linux/comedi/comedi_8254.h>
 
-static unsigned int __i8254_read(struct comedi_8254 *i8254, unsigned int reg)
+static unsigned int i8254_io8_cb(struct comedi_8254 *i8254, int dir,
+				unsigned int reg, unsigned int val)
 {
-	unsigned int reg_offset = (reg * i8254->iosize) << i8254->regshift;
-	unsigned int val;
+	unsigned long iobase = i8254->context;
+	unsigned int reg_offset = (reg * I8254_IO8) << i8254->regshift;
 
-	switch (i8254->iosize) {
-	default:
-	case I8254_IO8:
-		if (i8254->mmio)
-			val = readb(i8254->mmio + reg_offset);
-		else
-			val = inb(i8254->iobase + reg_offset);
-		break;
-	case I8254_IO16:
-		if (i8254->mmio)
-			val = readw(i8254->mmio + reg_offset);
-		else
-			val = inw(i8254->iobase + reg_offset);
-		break;
-	case I8254_IO32:
-		if (i8254->mmio)
-			val = readl(i8254->mmio + reg_offset);
-		else
-			val = inl(i8254->iobase + reg_offset);
-		break;
+	if (dir) {
+		outb(val, iobase + reg_offset);
+		return 0;
+	} else {
+		return inb(iobase + reg_offset);
 	}
-	return val & 0xff;
 }
 
-static void __i8254_write(struct comedi_8254 *i8254,
-			  unsigned int val, unsigned int reg)
+static unsigned int i8254_io16_cb(struct comedi_8254 *i8254, int dir,
+				  unsigned int reg, unsigned int val)
 {
-	unsigned int reg_offset = (reg * i8254->iosize) << i8254->regshift;
+	unsigned long iobase = i8254->context;
+	unsigned int reg_offset = (reg * I8254_IO16) << i8254->regshift;
 
-	switch (i8254->iosize) {
-	default:
-	case I8254_IO8:
-		if (i8254->mmio)
-			writeb(val, i8254->mmio + reg_offset);
-		else
-			outb(val, i8254->iobase + reg_offset);
-		break;
-	case I8254_IO16:
-		if (i8254->mmio)
-			writew(val, i8254->mmio + reg_offset);
-		else
-			outw(val, i8254->iobase + reg_offset);
-		break;
-	case I8254_IO32:
-		if (i8254->mmio)
-			writel(val, i8254->mmio + reg_offset);
-		else
-			outl(val, i8254->iobase + reg_offset);
-		break;
+	if (dir) {
+		outw(val, iobase + reg_offset);
+		return 0;
+	} else {
+		return inw(iobase + reg_offset);
+	}
+}
+
+static unsigned int i8254_io32_cb(struct comedi_8254 *i8254, int dir,
+				  unsigned int reg, unsigned int val)
+{
+	unsigned long iobase = i8254->context;
+	unsigned int reg_offset = (reg * I8254_IO32) << i8254->regshift;
+
+	if (dir) {
+		outl(val, iobase + reg_offset);
+		return 0;
+	} else {
+		return inl(iobase + reg_offset);
+	}
+}
+
+static unsigned int i8254_mmio8_cb(struct comedi_8254 *i8254, int dir,
+				   unsigned int reg, unsigned int val)
+{
+	void __iomem *mmiobase = (void __iomem *)i8254->context;
+	unsigned int reg_offset = (reg * I8254_IO8) << i8254->regshift;
+
+	if (dir) {
+		writeb(val, mmiobase + reg_offset);
+		return 0;
+	} else {
+		return readb(mmiobase + reg_offset);
+	}
+}
+
+static unsigned int i8254_mmio16_cb(struct comedi_8254 *i8254, int dir,
+				    unsigned int reg, unsigned int val)
+{
+	void __iomem *mmiobase = (void __iomem *)i8254->context;
+	unsigned int reg_offset = (reg * I8254_IO16) << i8254->regshift;
+
+	if (dir) {
+		writew(val, mmiobase + reg_offset);
+		return 0;
+	} else {
+		return readw(mmiobase + reg_offset);
 	}
 }
 
+static unsigned int i8254_mmio32_cb(struct comedi_8254 *i8254, int dir,
+				    unsigned int reg, unsigned int val)
+{
+	void __iomem *mmiobase = (void __iomem *)i8254->context;
+	unsigned int reg_offset = (reg * I8254_IO32) << i8254->regshift;
+
+	if (dir) {
+		writel(val, mmiobase + reg_offset);
+		return 0;
+	} else {
+		return readl(mmiobase + reg_offset);
+	}
+}
+
+static unsigned int __i8254_read(struct comedi_8254 *i8254, unsigned int reg)
+{
+	return 0xff & i8254->iocb(i8254, 0, reg, 0);
+}
+
+static void __i8254_write(struct comedi_8254 *i8254,
+			  unsigned int val, unsigned int reg)
+{
+	i8254->iocb(i8254, 1, reg, val);
+}
+
 /**
  * comedi_8254_status - return the status of a counter
  * @i8254:	comedi_8254 struct for the timer
@@ -571,8 +609,8 @@ void comedi_8254_subdevice_init(struct comedi_subdevice *s,
 }
 EXPORT_SYMBOL_GPL(comedi_8254_subdevice_init);
 
-static struct comedi_8254 *__i8254_init(unsigned long iobase,
-					void __iomem *mmio,
+static struct comedi_8254 *__i8254_init(comedi_8254_iocb_fn *iocb,
+					unsigned long context,
 					unsigned int osc_base,
 					unsigned int iosize,
 					unsigned int regshift)
@@ -585,12 +623,15 @@ static struct comedi_8254 *__i8254_init(unsigned long iobase,
 	      iosize == I8254_IO32))
 		return NULL;
 
+	if (!iocb)
+		return NULL;
+
 	i8254 = kzalloc(sizeof(*i8254), GFP_KERNEL);
 	if (!i8254)
 		return NULL;
 
-	i8254->iobase	= iobase;
-	i8254->mmio	= mmio;
+	i8254->iocb	= iocb;
+	i8254->context	= context;
 	i8254->iosize	= iosize;
 	i8254->regshift	= regshift;
 
@@ -617,7 +658,22 @@ struct comedi_8254 *comedi_8254_init(unsigned long iobase,
 				     unsigned int iosize,
 				     unsigned int regshift)
 {
-	return __i8254_init(iobase, NULL, osc_base, iosize, regshift);
+	comedi_8254_iocb_fn *iocb;
+
+	switch (iosize) {
+	case I8254_IO8:
+		iocb = i8254_io8_cb;
+		break;
+	case I8254_IO16:
+		iocb = i8254_io16_cb;
+		break;
+	case I8254_IO32:
+		iocb = i8254_io32_cb;
+		break;
+	default:
+		return NULL;
+	}
+	return __i8254_init(iocb, iobase, osc_base, iosize, regshift);
 }
 EXPORT_SYMBOL_GPL(comedi_8254_init);
 
@@ -634,7 +690,22 @@ struct comedi_8254 *comedi_8254_mm_init(void __iomem *mmio,
 					unsigned int iosize,
 					unsigned int regshift)
 {
-	return __i8254_init(0, mmio, osc_base, iosize, regshift);
+	comedi_8254_iocb_fn *iocb;
+
+	switch (iosize) {
+	case I8254_IO8:
+		iocb = i8254_mmio8_cb;
+		break;
+	case I8254_IO16:
+		iocb = i8254_mmio16_cb;
+		break;
+	case I8254_IO32:
+		iocb = i8254_mmio32_cb;
+		break;
+	default:
+		return NULL;
+	}
+	return __i8254_init(iocb, (unsigned long)mmio, osc_base, iosize, regshift);
 }
 EXPORT_SYMBOL_GPL(comedi_8254_mm_init);
 
diff --git a/include/linux/comedi/comedi_8254.h b/include/linux/comedi/comedi_8254.h
index d8264417e53c..18d12321c87d 100644
--- a/include/linux/comedi/comedi_8254.h
+++ b/include/linux/comedi/comedi_8254.h
@@ -57,10 +57,24 @@ struct comedi_subdevice;
 /* counter maps zero to 0x10000 */
 #define I8254_MAX_COUNT			0x10000
 
+struct comedi_8254;
+
+/**
+ * typedef comedi_8254_iocb_fn - call-back function type for 8254 register access
+ * @i8254:		pointer to struct comedi_8254
+ * @dir:		direction (0 = read, 1 = write)
+ * @reg:		register number
+ * @val:		value to write
+ *
+ * Return: Register value when reading, 0 when writing.
+ */
+typedef unsigned int comedi_8254_iocb_fn(struct comedi_8254 *i8254, int dir,
+					 unsigned int reg, unsigned int val);
+
 /**
  * struct comedi_8254 - private data used by this module
- * @iobase:		PIO base address of the registers (in/out)
- * @mmio:		MMIO base address of the registers (read/write)
+ * @iocb:		I/O call-back function for register access
+ * @context:		context for register access (e.g. a base address)
  * @iosize:		I/O size used to access the registers (b/w/l)
  * @regshift:		register gap shift
  * @osc_base:		cascaded oscillator speed in ns
@@ -76,8 +90,8 @@ struct comedi_subdevice;
  * @insn_config:	driver specific (*insn_config) callback
  */
 struct comedi_8254 {
-	unsigned long iobase;
-	void __iomem *mmio;
+	comedi_8254_iocb_fn *iocb;
+	unsigned long context;
 	unsigned int iosize;
 	unsigned int regshift;
 	unsigned int osc_base;
-- 
cgit v1.2.3


From fade5e5b0b2a2cc3855f64be6407b0bdcd837714 Mon Sep 17 00:00:00 2001
From: Ian Abbott <abbotti@mev.co.uk>
Date: Wed, 13 Sep 2023 18:07:02 +0100
Subject: comedi: comedi_8254: Replace comedi_8254_init() and
 comedi_8254_mm_init()

`comedi_8254_init()` and `comedi_8254_mm_init()` return `NULL` on
failure, but the failure is not necessarily due to lack of memory.
Change them to return an `ERR_PTR` value on failure and rename the
functions to make it obvious the API has changed.  `comedi_8254_init()`
has been replaced with `comedi_8254_io_alloc()`, and
`comedi_8254_mm_init()` has been replaced with `comedi_8254_mm_alloc()`.

Cc: Arnd Bergmann <arnd@kernel.org>
Cc: Niklas Schnelle <schnelle@linux.ibm.com>
Signed-off-by: Ian Abbott <abbotti@mev.co.uk>
Link: https://lore.kernel.org/r/20230913170712.111719-4-abbotti@mev.co.uk
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/comedi/drivers.c                     |  3 +-
 drivers/comedi/drivers/adl_pci9111.c         |  8 ++--
 drivers/comedi/drivers/adl_pci9118.c         |  8 ++--
 drivers/comedi/drivers/adv_pci1710.c         |  8 ++--
 drivers/comedi/drivers/adv_pci_dio.c         | 10 ++---
 drivers/comedi/drivers/aio_aio12_8.c         |  8 ++--
 drivers/comedi/drivers/amplc_dio200_common.c | 12 +++---
 drivers/comedi/drivers/amplc_pci224.c        |  8 ++--
 drivers/comedi/drivers/amplc_pci230.c        |  8 ++--
 drivers/comedi/drivers/cb_das16_cs.c         |  8 ++--
 drivers/comedi/drivers/cb_pcidas.c           | 21 ++++++-----
 drivers/comedi/drivers/cb_pcimdas.c          | 10 ++---
 drivers/comedi/drivers/comedi_8254.c         | 55 ++++++++++++++++------------
 drivers/comedi/drivers/das08.c               |  9 +++--
 drivers/comedi/drivers/das16.c               |  8 ++--
 drivers/comedi/drivers/das16m1.c             | 20 +++++-----
 drivers/comedi/drivers/das1800.c             |  8 ++--
 drivers/comedi/drivers/das6402.c             |  8 ++--
 drivers/comedi/drivers/das800.c              |  8 ++--
 drivers/comedi/drivers/me4000.c              |  6 +--
 drivers/comedi/drivers/ni_at_a2150.c         |  8 ++--
 drivers/comedi/drivers/ni_at_ao.c            |  8 ++--
 drivers/comedi/drivers/ni_labpc_common.c     | 38 ++++++++++---------
 drivers/comedi/drivers/pcl711.c              |  8 ++--
 drivers/comedi/drivers/pcl812.c              | 10 ++---
 drivers/comedi/drivers/pcl816.c              |  8 ++--
 drivers/comedi/drivers/pcl818.c              |  8 ++--
 drivers/comedi/drivers/rtd520.c              |  6 +--
 include/linux/comedi/comedi_8254.h           | 16 ++++----
 29 files changed, 179 insertions(+), 165 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/comedi/drivers.c b/drivers/comedi/drivers.c
index d4e2ed709bfc..376130bfba8a 100644
--- a/drivers/comedi/drivers.c
+++ b/drivers/comedi/drivers.c
@@ -177,7 +177,8 @@ static void comedi_device_detach_cleanup(struct comedi_device *dev)
 		dev->n_subdevices = 0;
 	}
 	kfree(dev->private);
-	kfree(dev->pacer);
+	if (!IS_ERR(dev->pacer))
+		kfree(dev->pacer);
 	dev->private = NULL;
 	dev->pacer = NULL;
 	dev->driver = NULL;
diff --git a/drivers/comedi/drivers/adl_pci9111.c b/drivers/comedi/drivers/adl_pci9111.c
index c50f94272a74..086d93f40cb9 100644
--- a/drivers/comedi/drivers/adl_pci9111.c
+++ b/drivers/comedi/drivers/adl_pci9111.c
@@ -647,10 +647,10 @@ static int pci9111_auto_attach(struct comedi_device *dev,
 			dev->irq = pcidev->irq;
 	}
 
-	dev->pacer = comedi_8254_init(dev->iobase + PCI9111_8254_BASE_REG,
-				      I8254_OSC_BASE_2MHZ, I8254_IO16, 0);
-	if (!dev->pacer)
-		return -ENOMEM;
+	dev->pacer = comedi_8254_io_alloc(dev->iobase + PCI9111_8254_BASE_REG,
+					  I8254_OSC_BASE_2MHZ, I8254_IO16, 0);
+	if (IS_ERR(dev->pacer))
+		return PTR_ERR(dev->pacer);
 
 	ret = comedi_alloc_subdevices(dev, 4);
 	if (ret)
diff --git a/drivers/comedi/drivers/adl_pci9118.c b/drivers/comedi/drivers/adl_pci9118.c
index 9a816c718303..a76e2666d583 100644
--- a/drivers/comedi/drivers/adl_pci9118.c
+++ b/drivers/comedi/drivers/adl_pci9118.c
@@ -1524,10 +1524,10 @@ static int pci9118_common_attach(struct comedi_device *dev,
 	devpriv->iobase_a = pci_resource_start(pcidev, 0);
 	dev->iobase = pci_resource_start(pcidev, 2);
 
-	dev->pacer = comedi_8254_init(dev->iobase + PCI9118_TIMER_BASE,
-				      I8254_OSC_BASE_4MHZ, I8254_IO32, 0);
-	if (!dev->pacer)
-		return -ENOMEM;
+	dev->pacer = comedi_8254_io_alloc(dev->iobase + PCI9118_TIMER_BASE,
+					  I8254_OSC_BASE_4MHZ, I8254_IO32, 0);
+	if (IS_ERR(dev->pacer))
+		return PTR_ERR(dev->pacer);
 
 	pci9118_reset(dev);
 
diff --git a/drivers/comedi/drivers/adv_pci1710.c b/drivers/comedi/drivers/adv_pci1710.c
index 4f2639968260..c49b0f1f5228 100644
--- a/drivers/comedi/drivers/adv_pci1710.c
+++ b/drivers/comedi/drivers/adv_pci1710.c
@@ -767,10 +767,10 @@ static int pci1710_auto_attach(struct comedi_device *dev,
 		return ret;
 	dev->iobase = pci_resource_start(pcidev, 2);
 
-	dev->pacer = comedi_8254_init(dev->iobase + PCI171X_TIMER_BASE,
-				      I8254_OSC_BASE_10MHZ, I8254_IO16, 0);
-	if (!dev->pacer)
-		return -ENOMEM;
+	dev->pacer = comedi_8254_io_alloc(dev->iobase + PCI171X_TIMER_BASE,
+					  I8254_OSC_BASE_10MHZ, I8254_IO16, 0);
+	if (IS_ERR(dev->pacer))
+		return PTR_ERR(dev->pacer);
 
 	n_subdevices = 1;	/* all boards have analog inputs */
 	if (board->has_ao)
diff --git a/drivers/comedi/drivers/adv_pci_dio.c b/drivers/comedi/drivers/adv_pci_dio.c
index efa3e46b554b..0319d8c7ee47 100644
--- a/drivers/comedi/drivers/adv_pci_dio.c
+++ b/drivers/comedi/drivers/adv_pci_dio.c
@@ -664,11 +664,11 @@ static int pci_dio_auto_attach(struct comedi_device *dev,
 	if (board->timer_regbase) {
 		s = &dev->subdevices[subdev++];
 
-		dev->pacer = comedi_8254_init(dev->iobase +
-					      board->timer_regbase,
-					      0, I8254_IO8, 0);
-		if (!dev->pacer)
-			return -ENOMEM;
+		dev->pacer =
+		    comedi_8254_io_alloc(dev->iobase + board->timer_regbase,
+					 0, I8254_IO8, 0);
+		if (IS_ERR(dev->pacer))
+			return PTR_ERR(dev->pacer);
 
 		comedi_8254_subdevice_init(s, dev->pacer);
 	}
diff --git a/drivers/comedi/drivers/aio_aio12_8.c b/drivers/comedi/drivers/aio_aio12_8.c
index 30b8a32204d8..f9d40fa3d3a9 100644
--- a/drivers/comedi/drivers/aio_aio12_8.c
+++ b/drivers/comedi/drivers/aio_aio12_8.c
@@ -206,10 +206,10 @@ static int aio_aio12_8_attach(struct comedi_device *dev,
 	if (ret)
 		return ret;
 
-	dev->pacer = comedi_8254_init(dev->iobase + AIO12_8_8254_BASE_REG,
-				      0, I8254_IO8, 0);
-	if (!dev->pacer)
-		return -ENOMEM;
+	dev->pacer = comedi_8254_io_alloc(dev->iobase + AIO12_8_8254_BASE_REG,
+					  0, I8254_IO8, 0);
+	if (IS_ERR(dev->pacer))
+		return PTR_ERR(dev->pacer);
 
 	ret = comedi_alloc_subdevices(dev, 4);
 	if (ret)
diff --git a/drivers/comedi/drivers/amplc_dio200_common.c b/drivers/comedi/drivers/amplc_dio200_common.c
index 2c1507a23f8a..19166cb26f5e 100644
--- a/drivers/comedi/drivers/amplc_dio200_common.c
+++ b/drivers/comedi/drivers/amplc_dio200_common.c
@@ -556,14 +556,14 @@ static int dio200_subdev_8254_init(struct comedi_device *dev,
 	}
 
 	if (dev->mmio) {
-		i8254 = comedi_8254_mm_init(dev->mmio + offset,
-					    0, I8254_IO8, regshift);
+		i8254 = comedi_8254_mm_alloc(dev->mmio + offset,
+					     0, I8254_IO8, regshift);
 	} else {
-		i8254 = comedi_8254_init(dev->iobase + offset,
-					 0, I8254_IO8, regshift);
+		i8254 = comedi_8254_io_alloc(dev->iobase + offset,
+					     0, I8254_IO8, regshift);
 	}
-	if (!i8254)
-		return -ENOMEM;
+	if (IS_ERR(i8254))
+		return PTR_ERR(i8254);
 
 	comedi_8254_subdevice_init(s, i8254);
 
diff --git a/drivers/comedi/drivers/amplc_pci224.c b/drivers/comedi/drivers/amplc_pci224.c
index 5a04e55daeea..1373637c2ca2 100644
--- a/drivers/comedi/drivers/amplc_pci224.c
+++ b/drivers/comedi/drivers/amplc_pci224.c
@@ -1051,10 +1051,10 @@ pci224_auto_attach(struct comedi_device *dev, unsigned long context_model)
 	outw(devpriv->daccon | PCI224_DACCON_FIFORESET,
 	     dev->iobase + PCI224_DACCON);
 
-	dev->pacer = comedi_8254_init(devpriv->iobase1 + PCI224_Z2_BASE,
-				      I8254_OSC_BASE_10MHZ, I8254_IO8, 0);
-	if (!dev->pacer)
-		return -ENOMEM;
+	dev->pacer = comedi_8254_io_alloc(devpriv->iobase1 + PCI224_Z2_BASE,
+					  I8254_OSC_BASE_10MHZ, I8254_IO8, 0);
+	if (IS_ERR(dev->pacer))
+		return PTR_ERR(dev->pacer);
 
 	ret = comedi_alloc_subdevices(dev, 1);
 	if (ret)
diff --git a/drivers/comedi/drivers/amplc_pci230.c b/drivers/comedi/drivers/amplc_pci230.c
index 92ba8b8c0172..783da73877b9 100644
--- a/drivers/comedi/drivers/amplc_pci230.c
+++ b/drivers/comedi/drivers/amplc_pci230.c
@@ -2475,10 +2475,10 @@ static int pci230_auto_attach(struct comedi_device *dev,
 			dev->irq = pci_dev->irq;
 	}
 
-	dev->pacer = comedi_8254_init(dev->iobase + PCI230_Z2_CT_BASE,
-				      0, I8254_IO8, 0);
-	if (!dev->pacer)
-		return -ENOMEM;
+	dev->pacer = comedi_8254_io_alloc(dev->iobase + PCI230_Z2_CT_BASE,
+					  0, I8254_IO8, 0);
+	if (IS_ERR(dev->pacer))
+		return PTR_ERR(dev->pacer);
 
 	rc = comedi_alloc_subdevices(dev, 3);
 	if (rc)
diff --git a/drivers/comedi/drivers/cb_das16_cs.c b/drivers/comedi/drivers/cb_das16_cs.c
index 8e0d2fa5f95d..306208a0695b 100644
--- a/drivers/comedi/drivers/cb_das16_cs.c
+++ b/drivers/comedi/drivers/cb_das16_cs.c
@@ -363,10 +363,10 @@ static int das16cs_auto_attach(struct comedi_device *dev,
 	if (!devpriv)
 		return -ENOMEM;
 
-	dev->pacer = comedi_8254_init(dev->iobase + DAS16CS_TIMER_BASE,
-				      I8254_OSC_BASE_10MHZ, I8254_IO16, 0);
-	if (!dev->pacer)
-		return -ENOMEM;
+	dev->pacer = comedi_8254_io_alloc(dev->iobase + DAS16CS_TIMER_BASE,
+					  I8254_OSC_BASE_10MHZ, I8254_IO16, 0);
+	if (IS_ERR(dev->pacer))
+		return PTR_ERR(dev->pacer);
 
 	ret = comedi_alloc_subdevices(dev, 4);
 	if (ret)
diff --git a/drivers/comedi/drivers/cb_pcidas.c b/drivers/comedi/drivers/cb_pcidas.c
index 0c7576b967fc..7a6cd681e932 100644
--- a/drivers/comedi/drivers/cb_pcidas.c
+++ b/drivers/comedi/drivers/cb_pcidas.c
@@ -1288,16 +1288,16 @@ static int cb_pcidas_auto_attach(struct comedi_device *dev,
 	}
 	dev->irq = pcidev->irq;
 
-	dev->pacer = comedi_8254_init(dev->iobase + PCIDAS_AI_8254_BASE,
-				      I8254_OSC_BASE_10MHZ, I8254_IO8, 0);
-	if (!dev->pacer)
-		return -ENOMEM;
+	dev->pacer = comedi_8254_io_alloc(dev->iobase + PCIDAS_AI_8254_BASE,
+					  I8254_OSC_BASE_10MHZ, I8254_IO8, 0);
+	if (IS_ERR(dev->pacer))
+		return PTR_ERR(dev->pacer);
 
-	devpriv->ao_pacer = comedi_8254_init(dev->iobase + PCIDAS_AO_8254_BASE,
-					     I8254_OSC_BASE_10MHZ,
-					     I8254_IO8, 0);
-	if (!devpriv->ao_pacer)
-		return -ENOMEM;
+	devpriv->ao_pacer =
+	    comedi_8254_io_alloc(dev->iobase + PCIDAS_AO_8254_BASE,
+				 I8254_OSC_BASE_10MHZ, I8254_IO8, 0);
+	if (IS_ERR(devpriv->ao_pacer))
+		return PTR_ERR(devpriv->ao_pacer);
 
 	ret = comedi_alloc_subdevices(dev, 7);
 	if (ret)
@@ -1453,7 +1453,8 @@ static void cb_pcidas_detach(struct comedi_device *dev)
 		if (devpriv->amcc)
 			outl(INTCSR_INBOX_INTR_STATUS,
 			     devpriv->amcc + AMCC_OP_REG_INTCSR);
-		kfree(devpriv->ao_pacer);
+		if (!IS_ERR(devpriv->ao_pacer))
+			kfree(devpriv->ao_pacer);
 	}
 	comedi_pci_detach(dev);
 }
diff --git a/drivers/comedi/drivers/cb_pcimdas.c b/drivers/comedi/drivers/cb_pcimdas.c
index 8bdb00774f11..5816ef65ed5f 100644
--- a/drivers/comedi/drivers/cb_pcimdas.c
+++ b/drivers/comedi/drivers/cb_pcimdas.c
@@ -364,11 +364,11 @@ static int cb_pcimdas_auto_attach(struct comedi_device *dev,
 	devpriv->BADR3 = pci_resource_start(pcidev, 3);
 	dev->iobase = pci_resource_start(pcidev, 4);
 
-	dev->pacer = comedi_8254_init(devpriv->BADR3 + PCIMDAS_8254_BASE,
-				      cb_pcimdas_pacer_clk(dev),
-				      I8254_IO8, 0);
-	if (!dev->pacer)
-		return -ENOMEM;
+	dev->pacer = comedi_8254_io_alloc(devpriv->BADR3 + PCIMDAS_8254_BASE,
+					  cb_pcimdas_pacer_clk(dev),
+					  I8254_IO8, 0);
+	if (IS_ERR(dev->pacer))
+		return PTR_ERR(dev->pacer);
 
 	ret = comedi_alloc_subdevices(dev, 6);
 	if (ret)
diff --git a/drivers/comedi/drivers/comedi_8254.c b/drivers/comedi/drivers/comedi_8254.c
index 3f8657fc7ee5..696596944506 100644
--- a/drivers/comedi/drivers/comedi_8254.c
+++ b/drivers/comedi/drivers/comedi_8254.c
@@ -24,14 +24,17 @@
  *
  * This module provides the following basic functions:
  *
- * comedi_8254_init() / comedi_8254_mm_init()
+ * comedi_8254_io_alloc() / comedi_8254_mm_alloc()
  *	Initializes this module to access the 8254 registers. The _mm version
- *	sets up the module for MMIO register access the other for PIO access.
- *	The pointer returned from these functions is normally stored in the
- *	comedi_device dev->pacer and will be freed by the comedi core during
- *	the driver (*detach). If a driver has multiple 8254 devices, they need
- *	to be stored in the drivers private data and freed when the driver is
- *	detached.
+ *	sets up the module for MMIO register access; the _io version sets it
+ *	up for PIO access.  These functions return a pointer to a struct
+ *	comedi_8254 on success, or an ERR_PTR value on failure.  The pointer
+ *	returned from these functions is normally stored in the comedi_device
+ *	dev->pacer and will be freed by the comedi core during the driver
+ *	(*detach). If a driver has multiple 8254 devices, they need to be
+ *	stored in the drivers private data and freed when the driver is
+ *	detached.  If the ERR_PTR value is stored, code should check the
+ *	pointer value with !IS_ERR(pointer) before freeing.
  *
  *	NOTE: The counters are reset by setting them to I8254_MODE0 as part of
  *	this initialization.
@@ -621,14 +624,14 @@ static struct comedi_8254 *__i8254_init(comedi_8254_iocb_fn *iocb,
 	/* sanity check that the iosize is valid */
 	if (!(iosize == I8254_IO8 || iosize == I8254_IO16 ||
 	      iosize == I8254_IO32))
-		return NULL;
+		return ERR_PTR(-EINVAL);
 
 	if (!iocb)
-		return NULL;
+		return ERR_PTR(-EINVAL);
 
 	i8254 = kzalloc(sizeof(*i8254), GFP_KERNEL);
 	if (!i8254)
-		return NULL;
+		return ERR_PTR(-ENOMEM);
 
 	i8254->iocb	= iocb;
 	i8254->context	= context;
@@ -646,17 +649,19 @@ static struct comedi_8254 *__i8254_init(comedi_8254_iocb_fn *iocb,
 }
 
 /**
- * comedi_8254_init - allocate and initialize the 8254 device for pio access
+ * comedi_8254_io_alloc - allocate and initialize the 8254 device for pio access
  * @iobase:	port I/O base address
  * @osc_base:	base time of the counter in ns
  *		OPTIONAL - only used by comedi_8254_cascade_ns_to_timer()
  * @iosize:	I/O register size
  * @regshift:	register gap shift
+ *
+ * Return: A pointer to a struct comedi_8254 or an ERR_PTR value.
  */
-struct comedi_8254 *comedi_8254_init(unsigned long iobase,
-				     unsigned int osc_base,
-				     unsigned int iosize,
-				     unsigned int regshift)
+struct comedi_8254 *comedi_8254_io_alloc(unsigned long iobase,
+					 unsigned int osc_base,
+					 unsigned int iosize,
+					 unsigned int regshift)
 {
 	comedi_8254_iocb_fn *iocb;
 
@@ -671,24 +676,26 @@ struct comedi_8254 *comedi_8254_init(unsigned long iobase,
 		iocb = i8254_io32_cb;
 		break;
 	default:
-		return NULL;
+		return ERR_PTR(-EINVAL);
 	}
 	return __i8254_init(iocb, iobase, osc_base, iosize, regshift);
 }
-EXPORT_SYMBOL_GPL(comedi_8254_init);
+EXPORT_SYMBOL_GPL(comedi_8254_io_alloc);
 
 /**
- * comedi_8254_mm_init - allocate and initialize the 8254 device for mmio access
+ * comedi_8254_mm_alloc - allocate and initialize the 8254 device for mmio access
  * @mmio:	memory mapped I/O base address
  * @osc_base:	base time of the counter in ns
  *		OPTIONAL - only used by comedi_8254_cascade_ns_to_timer()
  * @iosize:	I/O register size
  * @regshift:	register gap shift
+ *
+ * Return: A pointer to a struct comedi_8254 or an ERR_PTR value.
  */
-struct comedi_8254 *comedi_8254_mm_init(void __iomem *mmio,
-					unsigned int osc_base,
-					unsigned int iosize,
-					unsigned int regshift)
+struct comedi_8254 *comedi_8254_mm_alloc(void __iomem *mmio,
+					 unsigned int osc_base,
+					 unsigned int iosize,
+					 unsigned int regshift)
 {
 	comedi_8254_iocb_fn *iocb;
 
@@ -703,11 +710,11 @@ struct comedi_8254 *comedi_8254_mm_init(void __iomem *mmio,
 		iocb = i8254_mmio32_cb;
 		break;
 	default:
-		return NULL;
+		return ERR_PTR(-EINVAL);
 	}
 	return __i8254_init(iocb, (unsigned long)mmio, osc_base, iosize, regshift);
 }
-EXPORT_SYMBOL_GPL(comedi_8254_mm_init);
+EXPORT_SYMBOL_GPL(comedi_8254_mm_alloc);
 
 static int __init comedi_8254_module_init(void)
 {
diff --git a/drivers/comedi/drivers/das08.c b/drivers/comedi/drivers/das08.c
index f8ab3af2e391..6a3b5411aa90 100644
--- a/drivers/comedi/drivers/das08.c
+++ b/drivers/comedi/drivers/das08.c
@@ -439,10 +439,11 @@ int das08_common_attach(struct comedi_device *dev, unsigned long iobase)
 	/* Counter subdevice (8254) */
 	s = &dev->subdevices[5];
 	if (board->i8254_offset) {
-		dev->pacer = comedi_8254_init(dev->iobase + board->i8254_offset,
-					      0, I8254_IO8, 0);
-		if (!dev->pacer)
-			return -ENOMEM;
+		dev->pacer =
+		    comedi_8254_io_alloc(dev->iobase + board->i8254_offset,
+					 0, I8254_IO8, 0);
+		if (IS_ERR(dev->pacer))
+			return PTR_ERR(dev->pacer);
 
 		comedi_8254_subdevice_init(s, dev->pacer);
 	} else {
diff --git a/drivers/comedi/drivers/das16.c b/drivers/comedi/drivers/das16.c
index 728dc02156c8..bfe8811be1b5 100644
--- a/drivers/comedi/drivers/das16.c
+++ b/drivers/comedi/drivers/das16.c
@@ -1067,10 +1067,10 @@ static int das16_attach(struct comedi_device *dev, struct comedi_devconfig *it)
 			osc_base = I8254_OSC_BASE_1MHZ / it->options[3];
 	}
 
-	dev->pacer = comedi_8254_init(dev->iobase + DAS16_TIMER_BASE_REG,
-				      osc_base, I8254_IO8, 0);
-	if (!dev->pacer)
-		return -ENOMEM;
+	dev->pacer = comedi_8254_io_alloc(dev->iobase + DAS16_TIMER_BASE_REG,
+					  osc_base, I8254_IO8, 0);
+	if (IS_ERR(dev->pacer))
+		return PTR_ERR(dev->pacer);
 
 	das16_alloc_dma(dev, it->options[2]);
 
diff --git a/drivers/comedi/drivers/das16m1.c b/drivers/comedi/drivers/das16m1.c
index 275effb77746..ff9c5a8897bd 100644
--- a/drivers/comedi/drivers/das16m1.c
+++ b/drivers/comedi/drivers/das16m1.c
@@ -529,15 +529,16 @@ static int das16m1_attach(struct comedi_device *dev,
 			dev->irq = it->options[1];
 	}
 
-	dev->pacer = comedi_8254_init(dev->iobase + DAS16M1_8254_IOBASE2,
-				      I8254_OSC_BASE_10MHZ, I8254_IO8, 0);
-	if (!dev->pacer)
-		return -ENOMEM;
+	dev->pacer = comedi_8254_io_alloc(dev->iobase + DAS16M1_8254_IOBASE2,
+					  I8254_OSC_BASE_10MHZ, I8254_IO8, 0);
+	if (IS_ERR(dev->pacer))
+		return PTR_ERR(dev->pacer);
 
-	devpriv->counter = comedi_8254_init(dev->iobase + DAS16M1_8254_IOBASE1,
-					    0, I8254_IO8, 0);
-	if (!devpriv->counter)
-		return -ENOMEM;
+	devpriv->counter =
+	    comedi_8254_io_alloc(dev->iobase + DAS16M1_8254_IOBASE1,
+				 0, I8254_IO8, 0);
+	if (IS_ERR(devpriv->counter))
+		return PTR_ERR(devpriv->counter);
 
 	ret = comedi_alloc_subdevices(dev, 4);
 	if (ret)
@@ -603,7 +604,8 @@ static void das16m1_detach(struct comedi_device *dev)
 	if (devpriv) {
 		if (devpriv->extra_iobase)
 			release_region(devpriv->extra_iobase, DAS16M1_SIZE2);
-		kfree(devpriv->counter);
+		if (!IS_ERR(devpriv->counter))
+			kfree(devpriv->counter);
 	}
 	comedi_legacy_detach(dev);
 }
diff --git a/drivers/comedi/drivers/das1800.c b/drivers/comedi/drivers/das1800.c
index f09608c0f4ff..7117c67aee7e 100644
--- a/drivers/comedi/drivers/das1800.c
+++ b/drivers/comedi/drivers/das1800.c
@@ -1233,10 +1233,10 @@ static int das1800_attach(struct comedi_device *dev,
 	if (!devpriv->fifo_buf)
 		return -ENOMEM;
 
-	dev->pacer = comedi_8254_init(dev->iobase + DAS1800_COUNTER,
-				      I8254_OSC_BASE_5MHZ, I8254_IO8, 0);
-	if (!dev->pacer)
-		return -ENOMEM;
+	dev->pacer = comedi_8254_io_alloc(dev->iobase + DAS1800_COUNTER,
+					  I8254_OSC_BASE_5MHZ, I8254_IO8, 0);
+	if (IS_ERR(dev->pacer))
+		return PTR_ERR(dev->pacer);
 
 	ret = comedi_alloc_subdevices(dev, 4);
 	if (ret)
diff --git a/drivers/comedi/drivers/das6402.c b/drivers/comedi/drivers/das6402.c
index 1af394591e74..68f95330de45 100644
--- a/drivers/comedi/drivers/das6402.c
+++ b/drivers/comedi/drivers/das6402.c
@@ -590,10 +590,10 @@ static int das6402_attach(struct comedi_device *dev,
 		}
 	}
 
-	dev->pacer = comedi_8254_init(dev->iobase + DAS6402_TIMER_BASE,
-				      I8254_OSC_BASE_10MHZ, I8254_IO8, 0);
-	if (!dev->pacer)
-		return -ENOMEM;
+	dev->pacer = comedi_8254_io_alloc(dev->iobase + DAS6402_TIMER_BASE,
+					  I8254_OSC_BASE_10MHZ, I8254_IO8, 0);
+	if (IS_ERR(dev->pacer))
+		return PTR_ERR(dev->pacer);
 
 	ret = comedi_alloc_subdevices(dev, 4);
 	if (ret)
diff --git a/drivers/comedi/drivers/das800.c b/drivers/comedi/drivers/das800.c
index 4ca33f46eaa7..300775523031 100644
--- a/drivers/comedi/drivers/das800.c
+++ b/drivers/comedi/drivers/das800.c
@@ -672,10 +672,10 @@ static int das800_attach(struct comedi_device *dev, struct comedi_devconfig *it)
 			dev->irq = irq;
 	}
 
-	dev->pacer = comedi_8254_init(dev->iobase + DAS800_8254,
-				      I8254_OSC_BASE_1MHZ, I8254_IO8, 0);
-	if (!dev->pacer)
-		return -ENOMEM;
+	dev->pacer = comedi_8254_io_alloc(dev->iobase + DAS800_8254,
+					  I8254_OSC_BASE_1MHZ, I8254_IO8, 0);
+	if (IS_ERR(dev->pacer))
+		return PTR_ERR(dev->pacer);
 
 	ret = comedi_alloc_subdevices(dev, 3);
 	if (ret)
diff --git a/drivers/comedi/drivers/me4000.c b/drivers/comedi/drivers/me4000.c
index 9aea02b86ed9..7dd3a0071863 100644
--- a/drivers/comedi/drivers/me4000.c
+++ b/drivers/comedi/drivers/me4000.c
@@ -1209,9 +1209,9 @@ static int me4000_auto_attach(struct comedi_device *dev,
 		if (!timer_base)
 			return -ENODEV;
 
-		dev->pacer = comedi_8254_init(timer_base, 0, I8254_IO8, 0);
-		if (!dev->pacer)
-			return -ENOMEM;
+		dev->pacer = comedi_8254_io_alloc(timer_base, 0, I8254_IO8, 0);
+		if (IS_ERR(dev->pacer))
+			return PTR_ERR(dev->pacer);
 
 		comedi_8254_subdevice_init(s, dev->pacer);
 	} else {
diff --git a/drivers/comedi/drivers/ni_at_a2150.c b/drivers/comedi/drivers/ni_at_a2150.c
index df8d219e6723..e4e5a0ebd195 100644
--- a/drivers/comedi/drivers/ni_at_a2150.c
+++ b/drivers/comedi/drivers/ni_at_a2150.c
@@ -707,10 +707,10 @@ static int a2150_attach(struct comedi_device *dev, struct comedi_devconfig *it)
 	/* an IRQ and DMA are required to support async commands */
 	a2150_alloc_irq_and_dma(dev, it);
 
-	dev->pacer = comedi_8254_init(dev->iobase + I8253_BASE_REG,
-				      0, I8254_IO8, 0);
-	if (!dev->pacer)
-		return -ENOMEM;
+	dev->pacer = comedi_8254_io_alloc(dev->iobase + I8253_BASE_REG,
+					  0, I8254_IO8, 0);
+	if (IS_ERR(dev->pacer))
+		return PTR_ERR(dev->pacer);
 
 	ret = comedi_alloc_subdevices(dev, 1);
 	if (ret)
diff --git a/drivers/comedi/drivers/ni_at_ao.c b/drivers/comedi/drivers/ni_at_ao.c
index 9f3147b72aa8..9cf6b4ff6b65 100644
--- a/drivers/comedi/drivers/ni_at_ao.c
+++ b/drivers/comedi/drivers/ni_at_ao.c
@@ -303,10 +303,10 @@ static int atao_attach(struct comedi_device *dev, struct comedi_devconfig *it)
 	if (!devpriv)
 		return -ENOMEM;
 
-	dev->pacer = comedi_8254_init(dev->iobase + ATAO_82C53_BASE,
-				      0, I8254_IO8, 0);
-	if (!dev->pacer)
-		return -ENOMEM;
+	dev->pacer = comedi_8254_io_alloc(dev->iobase + ATAO_82C53_BASE,
+					  0, I8254_IO8, 0);
+	if (IS_ERR(dev->pacer))
+		return PTR_ERR(dev->pacer);
 
 	ret = comedi_alloc_subdevices(dev, 4);
 	if (ret)
diff --git a/drivers/comedi/drivers/ni_labpc_common.c b/drivers/comedi/drivers/ni_labpc_common.c
index 763249653228..eb8f6431276a 100644
--- a/drivers/comedi/drivers/ni_labpc_common.c
+++ b/drivers/comedi/drivers/ni_labpc_common.c
@@ -1222,24 +1222,24 @@ int labpc_common_attach(struct comedi_device *dev,
 	}
 
 	if (dev->mmio) {
-		dev->pacer = comedi_8254_mm_init(dev->mmio + COUNTER_B_BASE_REG,
-						 I8254_OSC_BASE_2MHZ,
-						 I8254_IO8, 0);
-		devpriv->counter = comedi_8254_mm_init(dev->mmio +
-						       COUNTER_A_BASE_REG,
-						       I8254_OSC_BASE_2MHZ,
-						       I8254_IO8, 0);
+		dev->pacer =
+		    comedi_8254_mm_alloc(dev->mmio + COUNTER_B_BASE_REG,
+					 I8254_OSC_BASE_2MHZ, I8254_IO8, 0);
+		devpriv->counter =
+		    comedi_8254_mm_alloc(dev->mmio + COUNTER_A_BASE_REG,
+					 I8254_OSC_BASE_2MHZ, I8254_IO8, 0);
 	} else {
-		dev->pacer = comedi_8254_init(dev->iobase + COUNTER_B_BASE_REG,
-					      I8254_OSC_BASE_2MHZ,
-					      I8254_IO8, 0);
-		devpriv->counter = comedi_8254_init(dev->iobase +
-						    COUNTER_A_BASE_REG,
-						    I8254_OSC_BASE_2MHZ,
-						    I8254_IO8, 0);
+		dev->pacer =
+		    comedi_8254_io_alloc(dev->iobase + COUNTER_B_BASE_REG,
+					 I8254_OSC_BASE_2MHZ, I8254_IO8, 0);
+		devpriv->counter =
+		    comedi_8254_io_alloc(dev->iobase + COUNTER_A_BASE_REG,
+					 I8254_OSC_BASE_2MHZ, I8254_IO8, 0);
 	}
-	if (!dev->pacer || !devpriv->counter)
-		return -ENOMEM;
+	if (IS_ERR(dev->pacer))
+		return PTR_ERR(dev->pacer);
+	if (IS_ERR(devpriv->counter))
+		return PTR_ERR(devpriv->counter);
 
 	ret = comedi_alloc_subdevices(dev, 5);
 	if (ret)
@@ -1341,8 +1341,10 @@ void labpc_common_detach(struct comedi_device *dev)
 {
 	struct labpc_private *devpriv = dev->private;
 
-	if (devpriv)
-		kfree(devpriv->counter);
+	if (devpriv) {
+		if (!IS_ERR(devpriv->counter))
+			kfree(devpriv->counter);
+	}
 }
 EXPORT_SYMBOL_GPL(labpc_common_detach);
 
diff --git a/drivers/comedi/drivers/pcl711.c b/drivers/comedi/drivers/pcl711.c
index 05172c553c8a..0cf3917defe7 100644
--- a/drivers/comedi/drivers/pcl711.c
+++ b/drivers/comedi/drivers/pcl711.c
@@ -429,10 +429,10 @@ static int pcl711_attach(struct comedi_device *dev, struct comedi_devconfig *it)
 			dev->irq = it->options[1];
 	}
 
-	dev->pacer = comedi_8254_init(dev->iobase + PCL711_TIMER_BASE,
-				      I8254_OSC_BASE_2MHZ, I8254_IO8, 0);
-	if (!dev->pacer)
-		return -ENOMEM;
+	dev->pacer = comedi_8254_io_alloc(dev->iobase + PCL711_TIMER_BASE,
+					  I8254_OSC_BASE_2MHZ, I8254_IO8, 0);
+	if (IS_ERR(dev->pacer))
+		return PTR_ERR(dev->pacer);
 
 	ret = comedi_alloc_subdevices(dev, 4);
 	if (ret)
diff --git a/drivers/comedi/drivers/pcl812.c b/drivers/comedi/drivers/pcl812.c
index 70dbc129fcf5..0df639c6a595 100644
--- a/drivers/comedi/drivers/pcl812.c
+++ b/drivers/comedi/drivers/pcl812.c
@@ -1143,11 +1143,11 @@ static int pcl812_attach(struct comedi_device *dev, struct comedi_devconfig *it)
 		return ret;
 
 	if (board->irq_bits) {
-		dev->pacer = comedi_8254_init(dev->iobase + PCL812_TIMER_BASE,
-					      I8254_OSC_BASE_2MHZ,
-					      I8254_IO8, 0);
-		if (!dev->pacer)
-			return -ENOMEM;
+		dev->pacer =
+		    comedi_8254_io_alloc(dev->iobase + PCL812_TIMER_BASE,
+					 I8254_OSC_BASE_2MHZ, I8254_IO8, 0);
+		if (IS_ERR(dev->pacer))
+			return PTR_ERR(dev->pacer);
 
 		if ((1 << it->options[1]) & board->irq_bits) {
 			ret = request_irq(it->options[1], pcl812_interrupt, 0,
diff --git a/drivers/comedi/drivers/pcl816.c b/drivers/comedi/drivers/pcl816.c
index a5e5320be648..28d1a88c50f6 100644
--- a/drivers/comedi/drivers/pcl816.c
+++ b/drivers/comedi/drivers/pcl816.c
@@ -615,10 +615,10 @@ static int pcl816_attach(struct comedi_device *dev, struct comedi_devconfig *it)
 	/* an IRQ and DMA are required to support async commands */
 	pcl816_alloc_irq_and_dma(dev, it);
 
-	dev->pacer = comedi_8254_init(dev->iobase + PCL816_TIMER_BASE,
-				      I8254_OSC_BASE_10MHZ, I8254_IO8, 0);
-	if (!dev->pacer)
-		return -ENOMEM;
+	dev->pacer = comedi_8254_io_alloc(dev->iobase + PCL816_TIMER_BASE,
+					  I8254_OSC_BASE_10MHZ, I8254_IO8, 0);
+	if (IS_ERR(dev->pacer))
+		return PTR_ERR(dev->pacer);
 
 	ret = comedi_alloc_subdevices(dev, 4);
 	if (ret)
diff --git a/drivers/comedi/drivers/pcl818.c b/drivers/comedi/drivers/pcl818.c
index 29e503de8267..4127adcfb229 100644
--- a/drivers/comedi/drivers/pcl818.c
+++ b/drivers/comedi/drivers/pcl818.c
@@ -1015,10 +1015,10 @@ static int pcl818_attach(struct comedi_device *dev, struct comedi_devconfig *it)
 	else
 		osc_base = I8254_OSC_BASE_1MHZ;
 
-	dev->pacer = comedi_8254_init(dev->iobase + PCL818_TIMER_BASE,
-				      osc_base, I8254_IO8, 0);
-	if (!dev->pacer)
-		return -ENOMEM;
+	dev->pacer = comedi_8254_io_alloc(dev->iobase + PCL818_TIMER_BASE,
+					  osc_base, I8254_IO8, 0);
+	if (IS_ERR(dev->pacer))
+		return PTR_ERR(dev->pacer);
 
 	/* max sampling speed */
 	devpriv->ns_min = board->ns_min;
diff --git a/drivers/comedi/drivers/rtd520.c b/drivers/comedi/drivers/rtd520.c
index 7e0ec1a2a2ca..44bb0decd7a4 100644
--- a/drivers/comedi/drivers/rtd520.c
+++ b/drivers/comedi/drivers/rtd520.c
@@ -1289,9 +1289,9 @@ static int rtd_auto_attach(struct comedi_device *dev,
 
 	/* 8254 Timer/Counter subdevice */
 	s = &dev->subdevices[3];
-	dev->pacer = comedi_8254_mm_init(dev->mmio + LAS0_8254_TIMER_BASE,
-					 RTD_CLOCK_BASE, I8254_IO8, 2);
-	if (!dev->pacer)
+	dev->pacer = comedi_8254_mm_alloc(dev->mmio + LAS0_8254_TIMER_BASE,
+					  RTD_CLOCK_BASE, I8254_IO8, 2);
+	if (IS_ERR(dev->pacer))
 		return -ENOMEM;
 
 	comedi_8254_subdevice_init(s, dev->pacer);
diff --git a/include/linux/comedi/comedi_8254.h b/include/linux/comedi/comedi_8254.h
index 18d12321c87d..393ccb301028 100644
--- a/include/linux/comedi/comedi_8254.h
+++ b/include/linux/comedi/comedi_8254.h
@@ -136,13 +136,13 @@ void comedi_8254_set_busy(struct comedi_8254 *i8254,
 void comedi_8254_subdevice_init(struct comedi_subdevice *s,
 				struct comedi_8254 *i8254);
 
-struct comedi_8254 *comedi_8254_init(unsigned long iobase,
-				     unsigned int osc_base,
-				     unsigned int iosize,
-				     unsigned int regshift);
-struct comedi_8254 *comedi_8254_mm_init(void __iomem *mmio,
-					unsigned int osc_base,
-					unsigned int iosize,
-					unsigned int regshift);
+struct comedi_8254 *comedi_8254_io_alloc(unsigned long iobase,
+					 unsigned int osc_base,
+					 unsigned int iosize,
+					 unsigned int regshift);
+struct comedi_8254 *comedi_8254_mm_alloc(void __iomem *mmio,
+					 unsigned int osc_base,
+					 unsigned int iosize,
+					 unsigned int regshift);
 
 #endif	/* _COMEDI_8254_H */
-- 
cgit v1.2.3


From 90d256757e0bffd7a9beafd7c2bdc40a0236f9ec Mon Sep 17 00:00:00 2001
From: Ian Abbott <abbotti@mev.co.uk>
Date: Wed, 13 Sep 2023 18:07:03 +0100
Subject: comedi: comedi_8254: Conditionally remove I/O port support

The comedi_8254 module supports both port I/O and memory-mapped I/O.
In a future patch, the port I/O functions (`inb()`, `outb()`, and
friends) will only be declared if the `HAS_IOPORT` configuration option
is enabled.

Conditionally compile the parts of the module that use port I/O so they
are compiled if and only if the `CONFIG_HAS_IOPORT` macro is defined, so
that it can still be built if the port I/O functions have not been
declared.  If `CONFIG_HAS_IOPORT` is undefined, replace the GPL-exported
`comedi_8254_io_alloc()` function with a dummy static inline version
that just returns `ERR_PTR(-ENXIO)`.

Cc: Arnd Bergmann <arnd@kernel.org>
Cc: Niklas Schnelle <schnelle@linux.ibm.com>
Signed-off-by: Ian Abbott <abbotti@mev.co.uk>
Link: https://lore.kernel.org/r/20230913170712.111719-5-abbotti@mev.co.uk
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/comedi/drivers/comedi_8254.c |  8 ++++++++
 include/linux/comedi/comedi_8254.h   | 13 +++++++++++++
 2 files changed, 21 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/comedi/drivers/comedi_8254.c b/drivers/comedi/drivers/comedi_8254.c
index 696596944506..6beca2a6d66e 100644
--- a/drivers/comedi/drivers/comedi_8254.c
+++ b/drivers/comedi/drivers/comedi_8254.c
@@ -122,6 +122,8 @@
 #include <linux/comedi/comedidev.h>
 #include <linux/comedi/comedi_8254.h>
 
+#ifdef CONFIG_HAS_IOPORT
+
 static unsigned int i8254_io8_cb(struct comedi_8254 *i8254, int dir,
 				unsigned int reg, unsigned int val)
 {
@@ -164,6 +166,8 @@ static unsigned int i8254_io32_cb(struct comedi_8254 *i8254, int dir,
 	}
 }
 
+#endif	/* CONFIG_HAS_IOPORT */
+
 static unsigned int i8254_mmio8_cb(struct comedi_8254 *i8254, int dir,
 				   unsigned int reg, unsigned int val)
 {
@@ -648,6 +652,8 @@ static struct comedi_8254 *__i8254_init(comedi_8254_iocb_fn *iocb,
 	return i8254;
 }
 
+#ifdef CONFIG_HAS_IOPORT
+
 /**
  * comedi_8254_io_alloc - allocate and initialize the 8254 device for pio access
  * @iobase:	port I/O base address
@@ -682,6 +688,8 @@ struct comedi_8254 *comedi_8254_io_alloc(unsigned long iobase,
 }
 EXPORT_SYMBOL_GPL(comedi_8254_io_alloc);
 
+#endif	/* CONFIG_HAS_IOPORT */
+
 /**
  * comedi_8254_mm_alloc - allocate and initialize the 8254 device for mmio access
  * @mmio:	memory mapped I/O base address
diff --git a/include/linux/comedi/comedi_8254.h b/include/linux/comedi/comedi_8254.h
index 393ccb301028..d527f04400df 100644
--- a/include/linux/comedi/comedi_8254.h
+++ b/include/linux/comedi/comedi_8254.h
@@ -12,6 +12,8 @@
 #define _COMEDI_8254_H
 
 #include <linux/types.h>
+#include <linux/errno.h>
+#include <linux/err.h>
 
 struct comedi_device;
 struct comedi_insn;
@@ -136,10 +138,21 @@ void comedi_8254_set_busy(struct comedi_8254 *i8254,
 void comedi_8254_subdevice_init(struct comedi_subdevice *s,
 				struct comedi_8254 *i8254);
 
+#ifdef CONFIG_HAS_IOPORT
 struct comedi_8254 *comedi_8254_io_alloc(unsigned long iobase,
 					 unsigned int osc_base,
 					 unsigned int iosize,
 					 unsigned int regshift);
+#else
+static inline struct comedi_8254 *comedi_8254_io_alloc(unsigned long iobase,
+						       unsigned int osc_base,
+						       unsigned int iosize,
+						       unsigned int regshift)
+{
+	return ERR_PTR(-ENXIO);
+}
+#endif
+
 struct comedi_8254 *comedi_8254_mm_alloc(void __iomem *mmio,
 					 unsigned int osc_base,
 					 unsigned int iosize,
-- 
cgit v1.2.3


From 5c57b1ccecc72738d4b9be2dfcdfb9001be76bd7 Mon Sep 17 00:00:00 2001
From: Ian Abbott <abbotti@mev.co.uk>
Date: Wed, 13 Sep 2023 18:07:05 +0100
Subject: comedi: comedi_8255: Rework subdevice initialization functions

Comedi drivers can initialize an 8255 subdevice in I/O space by calling
`subdev_8255_init()`, or in memory-mapped I/O space by calling
`subdev_8255_mm_init()`, or by supplying a call-back function pointer
and context to either of those functions.  Change it so that a new
function `subdev_8255_cb_init()` shall be called instead when supplying
a callback function and context, and remove the call-back function
parameter from `subdev_8255_init()` and `subdev_8255_mm_init()`.

Also rename `subdev_8255_init()` to `subdev_8255_io_init()`.  The
parameters are changing, so might as well rename it at the same time.

Also rename the `regbase` member of `struct subdev_8255_private` to
`context` since this holds the context for the call-back function call.

Cc: Arnd Bergmann <arnd@kernel.org>
Cc: Niklas Schnelle <schnelle@linux.ibm.com>
Signed-off-by: Ian Abbott <abbotti@mev.co.uk>
Link: https://lore.kernel.org/r/20230913170712.111719-7-abbotti@mev.co.uk
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/comedi/drivers/8255.c               |   2 +-
 drivers/comedi/drivers/8255_pci.c           |   4 +-
 drivers/comedi/drivers/adv_pci_dio.c        |   4 +-
 drivers/comedi/drivers/aio_aio12_8.c        |   2 +-
 drivers/comedi/drivers/amplc_pc236_common.c |   2 +-
 drivers/comedi/drivers/amplc_pci230.c       |   2 +-
 drivers/comedi/drivers/cb_pcidas.c          |   2 +-
 drivers/comedi/drivers/cb_pcidas64.c        |   7 +-
 drivers/comedi/drivers/cb_pcidda.c          |   2 +-
 drivers/comedi/drivers/cb_pcimdas.c         |   2 +-
 drivers/comedi/drivers/cb_pcimdda.c         |   2 +-
 drivers/comedi/drivers/comedi_8255.c        | 115 +++++++++++++---------------
 drivers/comedi/drivers/daqboard2000.c       |   4 +-
 drivers/comedi/drivers/das08.c              |   2 +-
 drivers/comedi/drivers/das16.c              |   2 +-
 drivers/comedi/drivers/das16m1.c            |   2 +-
 drivers/comedi/drivers/dmm32at.c            |   3 +-
 drivers/comedi/drivers/ni_atmio16d.c        |   2 +-
 drivers/comedi/drivers/ni_daq_dio24.c       |   2 +-
 drivers/comedi/drivers/ni_labpc_common.c    |   4 +-
 drivers/comedi/drivers/ni_mio_common.c      |   4 +-
 drivers/comedi/drivers/pcl724.c             |   6 +-
 drivers/comedi/drivers/pcm3724.c            |   2 +-
 include/linux/comedi/comedi_8255.h          |  13 ++--
 24 files changed, 92 insertions(+), 100 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/comedi/drivers/8255.c b/drivers/comedi/drivers/8255.c
index ced8ea09d4fa..f45f7bd1c61a 100644
--- a/drivers/comedi/drivers/8255.c
+++ b/drivers/comedi/drivers/8255.c
@@ -80,7 +80,7 @@ static int dev_8255_attach(struct comedi_device *dev,
 		if (ret) {
 			s->type = COMEDI_SUBD_UNUSED;
 		} else {
-			ret = subdev_8255_init(dev, s, NULL, iobase);
+			ret = subdev_8255_io_init(dev, s, iobase);
 			if (ret) {
 				/*
 				 * Release the I/O port region here, as the
diff --git a/drivers/comedi/drivers/8255_pci.c b/drivers/comedi/drivers/8255_pci.c
index 9ad52e9f5427..8498cabe4d91 100644
--- a/drivers/comedi/drivers/8255_pci.c
+++ b/drivers/comedi/drivers/8255_pci.c
@@ -242,9 +242,9 @@ static int pci_8255_auto_attach(struct comedi_device *dev,
 	for (i = 0; i < board->n_8255; i++) {
 		s = &dev->subdevices[i];
 		if (dev->mmio)
-			ret = subdev_8255_mm_init(dev, s, NULL, i * I8255_SIZE);
+			ret = subdev_8255_mm_init(dev, s, i * I8255_SIZE);
 		else
-			ret = subdev_8255_init(dev, s, NULL, i * I8255_SIZE);
+			ret = subdev_8255_io_init(dev, s, i * I8255_SIZE);
 		if (ret)
 			return ret;
 	}
diff --git a/drivers/comedi/drivers/adv_pci_dio.c b/drivers/comedi/drivers/adv_pci_dio.c
index 0319d8c7ee47..ca8054504760 100644
--- a/drivers/comedi/drivers/adv_pci_dio.c
+++ b/drivers/comedi/drivers/adv_pci_dio.c
@@ -642,8 +642,8 @@ static int pci_dio_auto_attach(struct comedi_device *dev,
 
 		for (j = 0; j < d->chans; j++) {
 			s = &dev->subdevices[subdev++];
-			ret = subdev_8255_init(dev, s, NULL,
-					       d->addr + j * I8255_SIZE);
+			ret = subdev_8255_io_init(dev, s,
+						  d->addr + j * I8255_SIZE);
 			if (ret)
 				return ret;
 		}
diff --git a/drivers/comedi/drivers/aio_aio12_8.c b/drivers/comedi/drivers/aio_aio12_8.c
index f9d40fa3d3a9..227a86a3a760 100644
--- a/drivers/comedi/drivers/aio_aio12_8.c
+++ b/drivers/comedi/drivers/aio_aio12_8.c
@@ -247,7 +247,7 @@ static int aio_aio12_8_attach(struct comedi_device *dev,
 
 	/* Digital I/O subdevice (8255) */
 	s = &dev->subdevices[2];
-	ret = subdev_8255_init(dev, s, NULL, AIO12_8_8255_BASE_REG);
+	ret = subdev_8255_io_init(dev, s, AIO12_8_8255_BASE_REG);
 	if (ret)
 		return ret;
 
diff --git a/drivers/comedi/drivers/amplc_pc236_common.c b/drivers/comedi/drivers/amplc_pc236_common.c
index 9f4f89b1ef23..326ca72c24ec 100644
--- a/drivers/comedi/drivers/amplc_pc236_common.c
+++ b/drivers/comedi/drivers/amplc_pc236_common.c
@@ -147,7 +147,7 @@ int amplc_pc236_common_attach(struct comedi_device *dev, unsigned long iobase,
 
 	s = &dev->subdevices[0];
 	/* digital i/o subdevice (8255) */
-	ret = subdev_8255_init(dev, s, NULL, 0x00);
+	ret = subdev_8255_io_init(dev, s, 0x00);
 	if (ret)
 		return ret;
 
diff --git a/drivers/comedi/drivers/amplc_pci230.c b/drivers/comedi/drivers/amplc_pci230.c
index 783da73877b9..c74209c2e83a 100644
--- a/drivers/comedi/drivers/amplc_pci230.c
+++ b/drivers/comedi/drivers/amplc_pci230.c
@@ -2529,7 +2529,7 @@ static int pci230_auto_attach(struct comedi_device *dev,
 	s = &dev->subdevices[2];
 	/* digital i/o subdevice */
 	if (board->have_dio) {
-		rc = subdev_8255_init(dev, s, NULL, PCI230_PPI_X_BASE);
+		rc = subdev_8255_io_init(dev, s, PCI230_PPI_X_BASE);
 		if (rc)
 			return rc;
 	} else {
diff --git a/drivers/comedi/drivers/cb_pcidas.c b/drivers/comedi/drivers/cb_pcidas.c
index 7a6cd681e932..8bb9b0623869 100644
--- a/drivers/comedi/drivers/cb_pcidas.c
+++ b/drivers/comedi/drivers/cb_pcidas.c
@@ -1352,7 +1352,7 @@ static int cb_pcidas_auto_attach(struct comedi_device *dev,
 
 	/* 8255 */
 	s = &dev->subdevices[2];
-	ret = subdev_8255_init(dev, s, NULL, PCIDAS_8255_BASE);
+	ret = subdev_8255_io_init(dev, s, PCIDAS_8255_BASE);
 	if (ret)
 		return ret;
 
diff --git a/drivers/comedi/drivers/cb_pcidas64.c b/drivers/comedi/drivers/cb_pcidas64.c
index ca6038a25f26..ff19fc3859e4 100644
--- a/drivers/comedi/drivers/cb_pcidas64.c
+++ b/drivers/comedi/drivers/cb_pcidas64.c
@@ -3877,11 +3877,10 @@ static int setup_subdevices(struct comedi_device *dev)
 	s = &dev->subdevices[4];
 	if (board->has_8255) {
 		if (board->layout == LAYOUT_4020) {
-			ret = subdev_8255_init(dev, s, dio_callback_4020,
-					       I8255_4020_REG);
+			ret = subdev_8255_cb_init(dev, s, dio_callback_4020,
+						  I8255_4020_REG);
 		} else {
-			ret = subdev_8255_mm_init(dev, s, NULL,
-						  DIO_8255_OFFSET);
+			ret = subdev_8255_mm_init(dev, s, DIO_8255_OFFSET);
 		}
 		if (ret)
 			return ret;
diff --git a/drivers/comedi/drivers/cb_pcidda.c b/drivers/comedi/drivers/cb_pcidda.c
index c52204a6bda4..c353d0f87da9 100644
--- a/drivers/comedi/drivers/cb_pcidda.c
+++ b/drivers/comedi/drivers/cb_pcidda.c
@@ -365,7 +365,7 @@ static int cb_pcidda_auto_attach(struct comedi_device *dev,
 	/* two 8255 digital io subdevices */
 	for (i = 0; i < 2; i++) {
 		s = &dev->subdevices[1 + i];
-		ret = subdev_8255_init(dev, s, NULL, i * I8255_SIZE);
+		ret = subdev_8255_io_init(dev, s, i * I8255_SIZE);
 		if (ret)
 			return ret;
 	}
diff --git a/drivers/comedi/drivers/cb_pcimdas.c b/drivers/comedi/drivers/cb_pcimdas.c
index 5816ef65ed5f..641c30df392e 100644
--- a/drivers/comedi/drivers/cb_pcimdas.c
+++ b/drivers/comedi/drivers/cb_pcimdas.c
@@ -405,7 +405,7 @@ static int cb_pcimdas_auto_attach(struct comedi_device *dev,
 
 	/* Digital I/O subdevice */
 	s = &dev->subdevices[2];
-	ret = subdev_8255_init(dev, s, NULL, PCIMDAS_8255_BASE);
+	ret = subdev_8255_io_init(dev, s, PCIMDAS_8255_BASE);
 	if (ret)
 		return ret;
 
diff --git a/drivers/comedi/drivers/cb_pcimdda.c b/drivers/comedi/drivers/cb_pcimdda.c
index bf8093a10315..541b5742bb1b 100644
--- a/drivers/comedi/drivers/cb_pcimdda.c
+++ b/drivers/comedi/drivers/cb_pcimdda.c
@@ -154,7 +154,7 @@ static int cb_pcimdda_auto_attach(struct comedi_device *dev,
 
 	s = &dev->subdevices[1];
 	/* digital i/o subdevice */
-	return subdev_8255_init(dev, s, NULL, PCIMDDA_8255_BASE_REG);
+	return subdev_8255_io_init(dev, s, PCIMDDA_8255_BASE_REG);
 }
 
 static struct comedi_driver cb_pcimdda_driver = {
diff --git a/drivers/comedi/drivers/comedi_8255.c b/drivers/comedi/drivers/comedi_8255.c
index 5562b9cd0a17..28fd9d8c95cc 100644
--- a/drivers/comedi/drivers/comedi_8255.c
+++ b/drivers/comedi/drivers/comedi_8255.c
@@ -33,9 +33,9 @@
 #include <linux/comedi/comedi_8255.h>
 
 struct subdev_8255_private {
-	unsigned long regbase;
+	unsigned long context;
 	int (*io)(struct comedi_device *dev, int dir, int port, int data,
-		  unsigned long regbase);
+		  unsigned long context);
 };
 
 static int subdev_8255_io(struct comedi_device *dev,
@@ -64,7 +64,7 @@ static int subdev_8255_insn(struct comedi_device *dev,
 			    unsigned int *data)
 {
 	struct subdev_8255_private *spriv = s->private;
-	unsigned long regbase = spriv->regbase;
+	unsigned long context = spriv->context;
 	unsigned int mask;
 	unsigned int v;
 
@@ -72,18 +72,18 @@ static int subdev_8255_insn(struct comedi_device *dev,
 	if (mask) {
 		if (mask & 0xff)
 			spriv->io(dev, 1, I8255_DATA_A_REG,
-				  s->state & 0xff, regbase);
+				  s->state & 0xff, context);
 		if (mask & 0xff00)
 			spriv->io(dev, 1, I8255_DATA_B_REG,
-				  (s->state >> 8) & 0xff, regbase);
+				  (s->state >> 8) & 0xff, context);
 		if (mask & 0xff0000)
 			spriv->io(dev, 1, I8255_DATA_C_REG,
-				  (s->state >> 16) & 0xff, regbase);
+				  (s->state >> 16) & 0xff, context);
 	}
 
-	v = spriv->io(dev, 0, I8255_DATA_A_REG, 0, regbase);
-	v |= (spriv->io(dev, 0, I8255_DATA_B_REG, 0, regbase) << 8);
-	v |= (spriv->io(dev, 0, I8255_DATA_C_REG, 0, regbase) << 16);
+	v = spriv->io(dev, 0, I8255_DATA_A_REG, 0, context);
+	v |= (spriv->io(dev, 0, I8255_DATA_B_REG, 0, context) << 8);
+	v |= (spriv->io(dev, 0, I8255_DATA_C_REG, 0, context) << 16);
 
 	data[1] = v;
 
@@ -94,7 +94,7 @@ static void subdev_8255_do_config(struct comedi_device *dev,
 				  struct comedi_subdevice *s)
 {
 	struct subdev_8255_private *spriv = s->private;
-	unsigned long regbase = spriv->regbase;
+	unsigned long context = spriv->context;
 	int config;
 
 	config = I8255_CTRL_CW;
@@ -108,7 +108,7 @@ static void subdev_8255_do_config(struct comedi_device *dev,
 	if (!(s->io_bits & 0xf00000))
 		config |= I8255_CTRL_C_HI_IO;
 
-	spriv->io(dev, 1, I8255_CTRL_REG, config, regbase);
+	spriv->io(dev, 1, I8255_CTRL_REG, config, context);
 }
 
 static int subdev_8255_insn_config(struct comedi_device *dev,
@@ -142,23 +142,19 @@ static int __subdev_8255_init(struct comedi_device *dev,
 			      struct comedi_subdevice *s,
 			      int (*io)(struct comedi_device *dev,
 					int dir, int port, int data,
-					unsigned long regbase),
-			      unsigned long regbase,
-			      bool is_mmio)
+					unsigned long context),
+			      unsigned long context)
 {
 	struct subdev_8255_private *spriv;
 
+	if (!io)
+		return -EINVAL;
+
 	spriv = comedi_alloc_spriv(s, sizeof(*spriv));
 	if (!spriv)
 		return -ENOMEM;
 
-	if (io)
-		spriv->io = io;
-	else if (is_mmio)
-		spriv->io = subdev_8255_mmio;
-	else
-		spriv->io = subdev_8255_io;
-	spriv->regbase	= regbase;
+	spriv->context = context;
 
 	s->type		= COMEDI_SUBD_DIO;
 	s->subdev_flags	= SDF_READABLE | SDF_WRITABLE;
@@ -174,88 +170,83 @@ static int __subdev_8255_init(struct comedi_device *dev,
 }
 
 /**
- * subdev_8255_init - initialize DIO subdevice for driving I/O mapped 8255
+ * subdev_8255_io_init - initialize DIO subdevice for driving I/O mapped 8255
  * @dev: comedi device owning subdevice
  * @s: comedi subdevice to initialize
- * @io: (optional) register I/O call-back function
- * @regbase: offset of 8255 registers from dev->iobase, or call-back context
+ * @regbase: offset of 8255 registers from dev->iobase
  *
  * Initializes a comedi subdevice as a DIO subdevice driving an 8255 chip.
  *
- * If the optional I/O call-back function is provided, its prototype is of
- * the following form:
- *
- *   int my_8255_callback(struct comedi_device *dev, int dir, int port,
- *                        int data, unsigned long regbase);
- *
- * where 'dev', and 'regbase' match the values passed to this function,
- * 'port' is the 8255 port number 0 to 3 (including the control port), 'dir'
- * is the direction (0 for read, 1 for write) and 'data' is the value to be
- * written.  It should return 0 if writing or the value read if reading.
- *
- * If the optional I/O call-back function is not provided, an internal
- * call-back function is used which uses consecutive I/O port addresses
- * starting at dev->iobase + regbase.
- *
  * Return: -ENOMEM if failed to allocate memory, zero on success.
  */
-int subdev_8255_init(struct comedi_device *dev, struct comedi_subdevice *s,
-		     int (*io)(struct comedi_device *dev, int dir, int port,
-			       int data, unsigned long regbase),
+int subdev_8255_io_init(struct comedi_device *dev, struct comedi_subdevice *s,
 		     unsigned long regbase)
 {
-	return __subdev_8255_init(dev, s, io, regbase, false);
+	return __subdev_8255_init(dev, s, subdev_8255_io, regbase);
 }
-EXPORT_SYMBOL_GPL(subdev_8255_init);
+EXPORT_SYMBOL_GPL(subdev_8255_io_init);
 
 /**
  * subdev_8255_mm_init - initialize DIO subdevice for driving mmio-mapped 8255
  * @dev: comedi device owning subdevice
  * @s: comedi subdevice to initialize
- * @io: (optional) register I/O call-back function
- * @regbase: offset of 8255 registers from dev->mmio, or call-back context
+ * @regbase: offset of 8255 registers from dev->mmio
  *
  * Initializes a comedi subdevice as a DIO subdevice driving an 8255 chip.
  *
- * If the optional I/O call-back function is provided, its prototype is of
- * the following form:
+ * Return: -ENOMEM if failed to allocate memory, zero on success.
+ */
+int subdev_8255_mm_init(struct comedi_device *dev, struct comedi_subdevice *s,
+			unsigned long regbase)
+{
+	return __subdev_8255_init(dev, s, subdev_8255_mmio, regbase);
+}
+EXPORT_SYMBOL_GPL(subdev_8255_mm_init);
+
+/**
+ * subdev_8255_cb_init - initialize DIO subdevice for driving callback-mapped 8255
+ * @dev: comedi device owning subdevice
+ * @s: comedi subdevice to initialize
+ * @io: register I/O call-back function
+ * @context: call-back context
+ *
+ * Initializes a comedi subdevice as a DIO subdevice driving an 8255 chip.
+ *
+ * The prototype of the I/O call-back function is of the following form:
  *
  *   int my_8255_callback(struct comedi_device *dev, int dir, int port,
- *                        int data, unsigned long regbase);
+ *                        int data, unsigned long context);
  *
- * where 'dev', and 'regbase' match the values passed to this function,
+ * where 'dev', and 'context' match the values passed to this function,
  * 'port' is the 8255 port number 0 to 3 (including the control port), 'dir'
  * is the direction (0 for read, 1 for write) and 'data' is the value to be
  * written.  It should return 0 if writing or the value read if reading.
  *
- * If the optional I/O call-back function is not provided, an internal
- * call-back function is used which uses consecutive MMIO virtual addresses
- * starting at dev->mmio + regbase.
  *
  * Return: -ENOMEM if failed to allocate memory, zero on success.
  */
-int subdev_8255_mm_init(struct comedi_device *dev, struct comedi_subdevice *s,
+int subdev_8255_cb_init(struct comedi_device *dev, struct comedi_subdevice *s,
 			int (*io)(struct comedi_device *dev, int dir, int port,
-				  int data, unsigned long regbase),
-			unsigned long regbase)
+				  int data, unsigned long context),
+			unsigned long context)
 {
-	return __subdev_8255_init(dev, s, io, regbase, true);
+	return __subdev_8255_init(dev, s, io, context);
 }
-EXPORT_SYMBOL_GPL(subdev_8255_mm_init);
+EXPORT_SYMBOL_GPL(subdev_8255_cb_init);
 
 /**
  * subdev_8255_regbase - get offset of 8255 registers or call-back context
  * @s: comedi subdevice
  *
- * Returns the 'regbase' parameter that was previously passed to
- * subdev_8255_init() or subdev_8255_mm_init() to set up the subdevice.
- * Only valid if the subdevice was set up successfully.
+ * Returns the 'regbase' or 'context' parameter that was previously passed to
+ * subdev_8255_io_init(), subdev_8255_mm_init(), or subdev_8255_cb_init() to
+ * set up the subdevice.  Only valid if the subdevice was set up successfully.
  */
 unsigned long subdev_8255_regbase(struct comedi_subdevice *s)
 {
 	struct subdev_8255_private *spriv = s->private;
 
-	return spriv->regbase;
+	return spriv->context;
 }
 EXPORT_SYMBOL_GPL(subdev_8255_regbase);
 
diff --git a/drivers/comedi/drivers/daqboard2000.c b/drivers/comedi/drivers/daqboard2000.c
index c0a4e1b06fb3..897bf46b95ee 100644
--- a/drivers/comedi/drivers/daqboard2000.c
+++ b/drivers/comedi/drivers/daqboard2000.c
@@ -738,8 +738,8 @@ static int db2k_auto_attach(struct comedi_device *dev, unsigned long context)
 		return result;
 
 	s = &dev->subdevices[2];
-	return subdev_8255_init(dev, s, db2k_8255_cb,
-				DB2K_REG_DIO_P2_EXP_IO_8_BIT);
+	return subdev_8255_cb_init(dev, s, db2k_8255_cb,
+				   DB2K_REG_DIO_P2_EXP_IO_8_BIT);
 }
 
 static void db2k_detach(struct comedi_device *dev)
diff --git a/drivers/comedi/drivers/das08.c b/drivers/comedi/drivers/das08.c
index 6a3b5411aa90..5d5b9174f88a 100644
--- a/drivers/comedi/drivers/das08.c
+++ b/drivers/comedi/drivers/das08.c
@@ -429,7 +429,7 @@ int das08_common_attach(struct comedi_device *dev, unsigned long iobase)
 	s = &dev->subdevices[4];
 	/* 8255 */
 	if (board->i8255_offset != 0) {
-		ret = subdev_8255_init(dev, s, NULL, board->i8255_offset);
+		ret = subdev_8255_io_init(dev, s, board->i8255_offset);
 		if (ret)
 			return ret;
 	} else {
diff --git a/drivers/comedi/drivers/das16.c b/drivers/comedi/drivers/das16.c
index bfe8811be1b5..4ed56a02150e 100644
--- a/drivers/comedi/drivers/das16.c
+++ b/drivers/comedi/drivers/das16.c
@@ -1145,7 +1145,7 @@ static int das16_attach(struct comedi_device *dev, struct comedi_devconfig *it)
 	/* 8255 Digital I/O subdevice */
 	if (board->has_8255) {
 		s = &dev->subdevices[4];
-		ret = subdev_8255_init(dev, s, NULL, board->i8255_offset);
+		ret = subdev_8255_io_init(dev, s, board->i8255_offset);
 		if (ret)
 			return ret;
 	}
diff --git a/drivers/comedi/drivers/das16m1.c b/drivers/comedi/drivers/das16m1.c
index ff9c5a8897bd..b8ea737ad3d1 100644
--- a/drivers/comedi/drivers/das16m1.c
+++ b/drivers/comedi/drivers/das16m1.c
@@ -583,7 +583,7 @@ static int das16m1_attach(struct comedi_device *dev,
 
 	/* Digital I/O subdevice (8255) */
 	s = &dev->subdevices[3];
-	ret = subdev_8255_init(dev, s, NULL, DAS16M1_8255_IOBASE);
+	ret = subdev_8255_io_init(dev, s, DAS16M1_8255_IOBASE);
 	if (ret)
 		return ret;
 
diff --git a/drivers/comedi/drivers/dmm32at.c b/drivers/comedi/drivers/dmm32at.c
index fe023c722aa3..644e3b643c79 100644
--- a/drivers/comedi/drivers/dmm32at.c
+++ b/drivers/comedi/drivers/dmm32at.c
@@ -599,7 +599,8 @@ static int dmm32at_attach(struct comedi_device *dev,
 
 	/* Digital I/O subdevice */
 	s = &dev->subdevices[2];
-	return subdev_8255_init(dev, s, dmm32at_8255_io, DMM32AT_8255_IOBASE);
+	return subdev_8255_cb_init(dev, s, dmm32at_8255_io,
+				   DMM32AT_8255_IOBASE);
 }
 
 static struct comedi_driver dmm32at_driver = {
diff --git a/drivers/comedi/drivers/ni_atmio16d.c b/drivers/comedi/drivers/ni_atmio16d.c
index 9fa902529a8e..e5e7cc423c87 100644
--- a/drivers/comedi/drivers/ni_atmio16d.c
+++ b/drivers/comedi/drivers/ni_atmio16d.c
@@ -677,7 +677,7 @@ static int atmio16d_attach(struct comedi_device *dev,
 	/* 8255 subdevice */
 	s = &dev->subdevices[3];
 	if (board->has_8255) {
-		ret = subdev_8255_init(dev, s, NULL, 0x00);
+		ret = subdev_8255_io_init(dev, s, 0x00);
 		if (ret)
 			return ret;
 	} else {
diff --git a/drivers/comedi/drivers/ni_daq_dio24.c b/drivers/comedi/drivers/ni_daq_dio24.c
index 487733111023..9419caf02edc 100644
--- a/drivers/comedi/drivers/ni_daq_dio24.c
+++ b/drivers/comedi/drivers/ni_daq_dio24.c
@@ -45,7 +45,7 @@ static int dio24_auto_attach(struct comedi_device *dev,
 
 	/* 8255 dio */
 	s = &dev->subdevices[0];
-	return subdev_8255_init(dev, s, NULL, 0x00);
+	return subdev_8255_io_init(dev, s, 0x00);
 }
 
 static struct comedi_driver driver_dio24 = {
diff --git a/drivers/comedi/drivers/ni_labpc_common.c b/drivers/comedi/drivers/ni_labpc_common.c
index eb8f6431276a..5d5c1d0e9cb6 100644
--- a/drivers/comedi/drivers/ni_labpc_common.c
+++ b/drivers/comedi/drivers/ni_labpc_common.c
@@ -1287,9 +1287,9 @@ int labpc_common_attach(struct comedi_device *dev,
 	/* 8255 dio */
 	s = &dev->subdevices[2];
 	if (dev->mmio)
-		ret = subdev_8255_mm_init(dev, s, NULL, DIO_BASE_REG);
+		ret = subdev_8255_mm_init(dev, s, DIO_BASE_REG);
 	else
-		ret = subdev_8255_init(dev, s, NULL, DIO_BASE_REG);
+		ret = subdev_8255_io_init(dev, s, DIO_BASE_REG);
 	if (ret)
 		return ret;
 
diff --git a/drivers/comedi/drivers/ni_mio_common.c b/drivers/comedi/drivers/ni_mio_common.c
index d39998565808..638be08b43e4 100644
--- a/drivers/comedi/drivers/ni_mio_common.c
+++ b/drivers/comedi/drivers/ni_mio_common.c
@@ -6137,8 +6137,8 @@ static int ni_E_init(struct comedi_device *dev,
 	/* 8255 device */
 	s = &dev->subdevices[NI_8255_DIO_SUBDEV];
 	if (board->has_8255) {
-		ret = subdev_8255_init(dev, s, ni_8255_callback,
-				       NI_E_8255_BASE);
+		ret = subdev_8255_cb_init(dev, s, ni_8255_callback,
+					  NI_E_8255_BASE);
 		if (ret)
 			return ret;
 	} else {
diff --git a/drivers/comedi/drivers/pcl724.c b/drivers/comedi/drivers/pcl724.c
index 948a0576c9ef..00474710b81f 100644
--- a/drivers/comedi/drivers/pcl724.c
+++ b/drivers/comedi/drivers/pcl724.c
@@ -124,10 +124,10 @@ static int pcl724_attach(struct comedi_device *dev,
 		s = &dev->subdevices[i];
 		if (board->is_pet48) {
 			iobase = dev->iobase + (i * 0x1000);
-			ret = subdev_8255_init(dev, s, pcl724_8255mapped_io,
-					       iobase);
+			ret = subdev_8255_cb_init(dev, s, pcl724_8255mapped_io,
+						  iobase);
 		} else {
-			ret = subdev_8255_init(dev, s, NULL, i * I8255_SIZE);
+			ret = subdev_8255_io_init(dev, s, i * I8255_SIZE);
 		}
 		if (ret)
 			return ret;
diff --git a/drivers/comedi/drivers/pcm3724.c b/drivers/comedi/drivers/pcm3724.c
index ca8bef54dacc..fb41de3baef8 100644
--- a/drivers/comedi/drivers/pcm3724.c
+++ b/drivers/comedi/drivers/pcm3724.c
@@ -204,7 +204,7 @@ static int pcm3724_attach(struct comedi_device *dev,
 
 	for (i = 0; i < dev->n_subdevices; i++) {
 		s = &dev->subdevices[i];
-		ret = subdev_8255_init(dev, s, NULL, i * I8255_SIZE);
+		ret = subdev_8255_io_init(dev, s, i * I8255_SIZE);
 		if (ret)
 			return ret;
 		s->insn_config = subdev_3724_insn_config;
diff --git a/include/linux/comedi/comedi_8255.h b/include/linux/comedi/comedi_8255.h
index b2a5bc6b3a49..b396fcfbf8b0 100644
--- a/include/linux/comedi/comedi_8255.h
+++ b/include/linux/comedi/comedi_8255.h
@@ -27,16 +27,17 @@
 struct comedi_device;
 struct comedi_subdevice;
 
-int subdev_8255_init(struct comedi_device *dev, struct comedi_subdevice *s,
-		     int (*io)(struct comedi_device *dev, int dir, int port,
-			       int data, unsigned long regbase),
-		     unsigned long regbase);
+int subdev_8255_io_init(struct comedi_device *dev, struct comedi_subdevice *s,
+			unsigned long regbase);
 
 int subdev_8255_mm_init(struct comedi_device *dev, struct comedi_subdevice *s,
-			int (*io)(struct comedi_device *dev, int dir, int port,
-				  int data, unsigned long regbase),
 			unsigned long regbase);
 
+int subdev_8255_cb_init(struct comedi_device *dev, struct comedi_subdevice *s,
+			int (*io)(struct comedi_device *dev, int dir, int port,
+				  int data, unsigned long context),
+			unsigned long context);
+
 unsigned long subdev_8255_regbase(struct comedi_subdevice *s);
 
 #endif
-- 
cgit v1.2.3


From 7187a0939a1773e6a2663a02a6c456047a5e6289 Mon Sep 17 00:00:00 2001
From: Ian Abbott <abbotti@mev.co.uk>
Date: Wed, 13 Sep 2023 18:07:06 +0100
Subject: comedi: comedi_8255: Conditionally remove I/O port support

In a future patch, the port I/O functions (`inb()`, `outb()`, and
friends will only be declared in the `HAS_IOPORT` configuration option
is enabled.

The comedi_8255 module supports both port I/O and memory-mapped I/O.

Conditionally compile the parts of the module that use port I/O if and
only if the `CONFIG_HAS_IOPORT` macro is defined so that it can still be
built if the port I/O functions have not been declared.  If the
`CONFIG_HAS_IOPORT` macro is undefined, replace the GPL-exported
`subdev_8255_io_init()` function with a dummy static inline version that
just returns `-ENXIO`.

Cc: Arnd Bergmann <arnd@kernel.org>
Cc: Niklas Schnelle <schnelle@linux.ibm.com>
Signed-off-by: Ian Abbott <abbotti@mev.co.uk>
Link: https://lore.kernel.org/r/20230913170712.111719-8-abbotti@mev.co.uk
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/comedi/drivers/comedi_8255.c |  8 ++++++++
 include/linux/comedi/comedi_8255.h   | 11 +++++++++++
 2 files changed, 19 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/comedi/drivers/comedi_8255.c b/drivers/comedi/drivers/comedi_8255.c
index 28fd9d8c95cc..e4974b508328 100644
--- a/drivers/comedi/drivers/comedi_8255.c
+++ b/drivers/comedi/drivers/comedi_8255.c
@@ -38,6 +38,8 @@ struct subdev_8255_private {
 		  unsigned long context);
 };
 
+#ifdef CONFIG_HAS_IOPORT
+
 static int subdev_8255_io(struct comedi_device *dev,
 			  int dir, int port, int data, unsigned long regbase)
 {
@@ -48,6 +50,8 @@ static int subdev_8255_io(struct comedi_device *dev,
 	return inb(dev->iobase + regbase + port);
 }
 
+#endif /* CONFIG_HAS_IOPORT */
+
 static int subdev_8255_mmio(struct comedi_device *dev,
 			    int dir, int port, int data, unsigned long regbase)
 {
@@ -169,6 +173,8 @@ static int __subdev_8255_init(struct comedi_device *dev,
 	return 0;
 }
 
+#ifdef CONFIG_HAS_IOPORT
+
 /**
  * subdev_8255_io_init - initialize DIO subdevice for driving I/O mapped 8255
  * @dev: comedi device owning subdevice
@@ -186,6 +192,8 @@ int subdev_8255_io_init(struct comedi_device *dev, struct comedi_subdevice *s,
 }
 EXPORT_SYMBOL_GPL(subdev_8255_io_init);
 
+#endif	/* CONFIG_HAS_IOPORT */
+
 /**
  * subdev_8255_mm_init - initialize DIO subdevice for driving mmio-mapped 8255
  * @dev: comedi device owning subdevice
diff --git a/include/linux/comedi/comedi_8255.h b/include/linux/comedi/comedi_8255.h
index b396fcfbf8b0..d24a69da389b 100644
--- a/include/linux/comedi/comedi_8255.h
+++ b/include/linux/comedi/comedi_8255.h
@@ -10,6 +10,8 @@
 #ifndef _COMEDI_8255_H
 #define _COMEDI_8255_H
 
+#include <linux/errno.h>
+
 #define I8255_SIZE		0x04
 
 #define I8255_DATA_A_REG	0x00
@@ -27,8 +29,17 @@
 struct comedi_device;
 struct comedi_subdevice;
 
+#ifdef CONFIG_HAS_IOPORT
 int subdev_8255_io_init(struct comedi_device *dev, struct comedi_subdevice *s,
 			unsigned long regbase);
+#else
+static inline int subdev_8255_io_init(struct comedi_device *dev,
+				      struct comedi_subdevice *s,
+				      unsigned long regbase)
+{
+	return -ENXIO;
+}
+#endif
 
 int subdev_8255_mm_init(struct comedi_device *dev, struct comedi_subdevice *s,
 			unsigned long regbase);
-- 
cgit v1.2.3


From 77f048bcbf07f7dc961f3b2b7815038b5405ec60 Mon Sep 17 00:00:00 2001
From: Christophe JAILLET <christophe.jaillet@wanadoo.fr>
Date: Sat, 30 Sep 2023 11:14:47 +0200
Subject: comedi: Annotate struct comedi_lrange with __counted_by

Prepare for the coming implementation by GCC and Clang of the __counted_by
attribute. Flexible array members annotated with __counted_by can have
their accesses bounds-checked at run-time checking via CONFIG_UBSAN_BOUNDS
(for array indexing) and CONFIG_FORTIFY_SOURCE (for strcpy/memcpy-family
functions).

Signed-off-by: Christophe JAILLET <christophe.jaillet@wanadoo.fr>
Reviewed-by: Kees Cook <keescook@chromium.org>
Reviewed-by: "Gustavo A. R. Silva" <gustavoars@kernel.org>
Link: https://lore.kernel.org/r/5c3b7459b820e22e2ac6ce892d4aadcc119cc919.1696065263.git.christophe.jaillet@wanadoo.fr
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 include/linux/comedi/comedidev.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/comedi/comedidev.h b/include/linux/comedi/comedidev.h
index 0a1150900ef3..c08416a7364b 100644
--- a/include/linux/comedi/comedidev.h
+++ b/include/linux/comedi/comedidev.h
@@ -633,7 +633,7 @@ extern const struct comedi_lrange range_unknown;
  */
 struct comedi_lrange {
 	int length;
-	struct comedi_krange range[];
+	struct comedi_krange range[] __counted_by(length);
 };
 
 /**
-- 
cgit v1.2.3


From 0fedefd4c4e33dd24f726b13b5d7c143e2b483be Mon Sep 17 00:00:00 2001
From: Valentine Sinitsyn <valesini@yandex-team.ru>
Date: Mon, 25 Sep 2023 11:40:12 +0300
Subject: kernfs: sysfs: support custom llseek method for sysfs entries

As of now, seeking in sysfs files is handled by generic_file_llseek().
There are situations where one may want to customize seeking logic:

- Many sysfs entries are fixed files while generic_file_llseek() accepts
  past-the-end positions. Not only being useless by itself, this
  also means a bug in userspace code will trigger not at lseek(), but at
  some later point making debugging harder.
- generic_file_llseek() relies on f_mapping->host to get the file size
  which might not be correct for all sysfs entries.
  See commit 636b21b50152 ("PCI: Revoke mappings like devmem") as an example.

Implement llseek method to override this behavior at sysfs attribute
level. The method is optional, and if it is absent,
generic_file_llseek() is called to preserve backwards compatibility.

Signed-off-by: Valentine Sinitsyn <valesini@yandex-team.ru>
Link: https://lore.kernel.org/r/20230925084013.309399-1-valesini@yandex-team.ru
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 fs/kernfs/file.c       | 29 ++++++++++++++++++++++++++++-
 fs/sysfs/file.c        | 13 +++++++++++++
 include/linux/kernfs.h |  1 +
 include/linux/sysfs.h  |  2 ++
 4 files changed, 44 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/fs/kernfs/file.c b/fs/kernfs/file.c
index 180906c36f51..855e3f9d8dcc 100644
--- a/fs/kernfs/file.c
+++ b/fs/kernfs/file.c
@@ -903,6 +903,33 @@ static __poll_t kernfs_fop_poll(struct file *filp, poll_table *wait)
 	return ret;
 }
 
+static loff_t kernfs_fop_llseek(struct file *file, loff_t offset, int whence)
+{
+	struct kernfs_open_file *of = kernfs_of(file);
+	const struct kernfs_ops *ops;
+	loff_t ret;
+
+	/*
+	 * @of->mutex nests outside active ref and is primarily to ensure that
+	 * the ops aren't called concurrently for the same open file.
+	 */
+	mutex_lock(&of->mutex);
+	if (!kernfs_get_active(of->kn)) {
+		mutex_unlock(&of->mutex);
+		return -ENODEV;
+	}
+
+	ops = kernfs_ops(of->kn);
+	if (ops->llseek)
+		ret = ops->llseek(of, offset, whence);
+	else
+		ret = generic_file_llseek(file, offset, whence);
+
+	kernfs_put_active(of->kn);
+	mutex_unlock(&of->mutex);
+	return ret;
+}
+
 static void kernfs_notify_workfn(struct work_struct *work)
 {
 	struct kernfs_node *kn;
@@ -1005,7 +1032,7 @@ EXPORT_SYMBOL_GPL(kernfs_notify);
 const struct file_operations kernfs_file_fops = {
 	.read_iter	= kernfs_fop_read_iter,
 	.write_iter	= kernfs_fop_write_iter,
-	.llseek		= generic_file_llseek,
+	.llseek		= kernfs_fop_llseek,
 	.mmap		= kernfs_fop_mmap,
 	.open		= kernfs_fop_open,
 	.release	= kernfs_fop_release,
diff --git a/fs/sysfs/file.c b/fs/sysfs/file.c
index a12ac0356c69..6b7652fb8050 100644
--- a/fs/sysfs/file.c
+++ b/fs/sysfs/file.c
@@ -167,6 +167,18 @@ static int sysfs_kf_bin_mmap(struct kernfs_open_file *of,
 	return battr->mmap(of->file, kobj, battr, vma);
 }
 
+static loff_t sysfs_kf_bin_llseek(struct kernfs_open_file *of, loff_t offset,
+				  int whence)
+{
+	struct bin_attribute *battr = of->kn->priv;
+	struct kobject *kobj = of->kn->parent->priv;
+
+	if (battr->llseek)
+		return battr->llseek(of->file, kobj, battr, offset, whence);
+	else
+		return generic_file_llseek(of->file, offset, whence);
+}
+
 static int sysfs_kf_bin_open(struct kernfs_open_file *of)
 {
 	struct bin_attribute *battr = of->kn->priv;
@@ -249,6 +261,7 @@ static const struct kernfs_ops sysfs_bin_kfops_mmap = {
 	.write		= sysfs_kf_bin_write,
 	.mmap		= sysfs_kf_bin_mmap,
 	.open		= sysfs_kf_bin_open,
+	.llseek		= sysfs_kf_bin_llseek,
 };
 
 int sysfs_add_file_mode_ns(struct kernfs_node *parent,
diff --git a/include/linux/kernfs.h b/include/linux/kernfs.h
index 2a36f3218b51..99aaa050ccb7 100644
--- a/include/linux/kernfs.h
+++ b/include/linux/kernfs.h
@@ -316,6 +316,7 @@ struct kernfs_ops {
 			 struct poll_table_struct *pt);
 
 	int (*mmap)(struct kernfs_open_file *of, struct vm_area_struct *vma);
+	loff_t (*llseek)(struct kernfs_open_file *of, loff_t offset, int whence);
 };
 
 /*
diff --git a/include/linux/sysfs.h b/include/linux/sysfs.h
index fd3fe5c8c17f..b717a70219f6 100644
--- a/include/linux/sysfs.h
+++ b/include/linux/sysfs.h
@@ -181,6 +181,8 @@ struct bin_attribute {
 			char *, loff_t, size_t);
 	ssize_t (*write)(struct file *, struct kobject *, struct bin_attribute *,
 			 char *, loff_t, size_t);
+	loff_t (*llseek)(struct file *, struct kobject *, struct bin_attribute *,
+			 loff_t, int);
 	int (*mmap)(struct file *, struct kobject *, struct bin_attribute *attr,
 		    struct vm_area_struct *vma);
 };
-- 
cgit v1.2.3


From eb7581deb4c2eef77f6368e1891e123b69349bb0 Mon Sep 17 00:00:00 2001
From: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Date: Tue, 12 Sep 2023 19:53:12 +0300
Subject: resource: Constify resource crosscheck APIs

Constify APIs: _contains(), _overlaps(), _intersection(), _union().

Signed-off-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Link: https://lore.kernel.org/r/20230912165312.402422-3-andriy.shevchenko@linux.intel.com
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 include/linux/ioport.h | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/ioport.h b/include/linux/ioport.h
index 25d768d48970..14f5cfabbbc8 100644
--- a/include/linux/ioport.h
+++ b/include/linux/ioport.h
@@ -229,7 +229,7 @@ static inline unsigned long resource_ext_type(const struct resource *res)
 	return res->flags & IORESOURCE_EXT_TYPE_BITS;
 }
 /* True iff r1 completely contains r2 */
-static inline bool resource_contains(struct resource *r1, struct resource *r2)
+static inline bool resource_contains(const struct resource *r1, const struct resource *r2)
 {
 	if (resource_type(r1) != resource_type(r2))
 		return false;
@@ -239,13 +239,13 @@ static inline bool resource_contains(struct resource *r1, struct resource *r2)
 }
 
 /* True if any part of r1 overlaps r2 */
-static inline bool resource_overlaps(struct resource *r1, struct resource *r2)
+static inline bool resource_overlaps(const struct resource *r1, const struct resource *r2)
 {
        return r1->start <= r2->end && r1->end >= r2->start;
 }
 
-static inline bool
-resource_intersection(struct resource *r1, struct resource *r2, struct resource *r)
+static inline bool resource_intersection(const struct resource *r1, const struct resource *r2,
+					 struct resource *r)
 {
 	if (!resource_overlaps(r1, r2))
 		return false;
@@ -254,8 +254,8 @@ resource_intersection(struct resource *r1, struct resource *r2, struct resource
 	return true;
 }
 
-static inline bool
-resource_union(struct resource *r1, struct resource *r2, struct resource *r)
+static inline bool resource_union(const struct resource *r1, const struct resource *r2,
+				  struct resource *r)
 {
 	if (!resource_overlaps(r1, r2))
 		return false;
-- 
cgit v1.2.3


From 8a76356e7db02ec7b1913db06605e70294d94672 Mon Sep 17 00:00:00 2001
From: Matti Vaittinen <mazziesaccount@gmail.com>
Date: Wed, 27 Sep 2023 11:27:41 +0300
Subject: iio: improve doc for available_scan_mask

The available_scan_mask is an array of bitmaps representing the channels
which can be simultaneously enabled by the driver. In many cases, the
hardware can offer more channels than what the user is interested in
obtaining. In such cases, it may be preferred that only a subset of
channels are enabled, and the driver reads only a subset of the channels
from the hardware.

Some devices can't support all channel combinations. For example, the
BM1390 pressure sensor must always read the pressure data in order to
acknowledge the watermark IRQ, while reading temperature can be omitted.
So, the available scan masks would be 'pressure and temperature' and
'pressure only'.

When IIO searches for the scan mask it asks the driver to use, it will
pick the first suitable one from the 'available_scan_mask' array. Hence,
ordering the masks in the array makes a difference. We should 'prefer'
reading just the pressure from the hardware (as it is a cheaper operation
than reading both pressure and temperature) over reading both pressure
and temperature. Hence, we should set the 'only pressure' as the first
scan mask in available_scan_mask array. If we set the 'pressure and
temperature' as first in the array, then the 'only temperature' will never
get used as 'pressure and temperature' can always serve the user's
needs.

Add (minimal) kerneldoc to the 'available_scan_mask' to hint the user
that the ordering of masks matters.

Signed-off-by: Matti Vaittinen <mazziesaccount@gmail.com>
Link: https://lore.kernel.org/r/4e43bf0186df5c8a56b470318b4827605f9cad6c.1695727471.git.mazziesaccount@gmail.com
Signed-off-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
---
 include/linux/iio/iio.h | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/iio/iio.h b/include/linux/iio/iio.h
index 202e55b0a28b..7bfa1b9bc8a2 100644
--- a/include/linux/iio/iio.h
+++ b/include/linux/iio/iio.h
@@ -556,7 +556,9 @@ struct iio_buffer_setup_ops {
  *			and owner
  * @buffer:		[DRIVER] any buffer present
  * @scan_bytes:		[INTERN] num bytes captured to be fed to buffer demux
- * @available_scan_masks: [DRIVER] optional array of allowed bitmasks
+ * @available_scan_masks: [DRIVER] optional array of allowed bitmasks. Sort the
+ *			   array in order of preference, the most preferred
+ *			   masks first.
  * @masklength:		[INTERN] the length of the mask established from
  *			channels
  * @active_scan_mask:	[INTERN] union of all scan masks requested by buffers
-- 
cgit v1.2.3


From b3a4dbc89d4021b3f90ff6a13537111a004f9d07 Mon Sep 17 00:00:00 2001
From: Gabriel Krisman Bertazi <krisman@suse.de>
Date: Wed, 4 Oct 2023 20:05:31 -0400
Subject: io_uring/kbuf: Use slab for struct io_buffer objects

The allocation of struct io_buffer for metadata of provided buffers is
done through a custom allocator that directly gets pages and
fragments them.  But, slab would do just fine, as this is not a hot path
(in fact, it is a deprecated feature) and, by keeping a custom allocator
implementation we lose benefits like tracking, poisoning,
sanitizers. Finally, the custom code is more complex and requires
keeping the list of pages in struct ctx for no good reason.  This patch
cleans this path up and just uses slab.

I microbenchmarked it by forcing the allocation of a large number of
objects with the least number of io_uring commands possible (keeping
nbufs=USHRT_MAX), with and without the patch.  There is a slight
increase in time spent in the allocation with slab, of course, but even
when allocating to system resources exhaustion, which is not very
realistic and happened around 1/2 billion provided buffers for me, it
wasn't a significant hit in system time.  Specially if we think of a
real-world scenario, an application doing register/unregister of
provided buffers will hit ctx->io_buffers_cache more often than actually
going to slab.

Signed-off-by: Gabriel Krisman Bertazi <krisman@suse.de>
Link: https://lore.kernel.org/r/20231005000531.30800-4-krisman@suse.de
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/io_uring_types.h |  2 --
 io_uring/io_uring.c            |  4 +++-
 io_uring/io_uring.h            |  1 +
 io_uring/kbuf.c                | 47 +++++++++++++++++++++++-------------------
 4 files changed, 30 insertions(+), 24 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h
index e178461fa513..e4e67899b134 100644
--- a/include/linux/io_uring_types.h
+++ b/include/linux/io_uring_types.h
@@ -350,8 +350,6 @@ struct io_ring_ctx {
 	struct wait_queue_head		rsrc_quiesce_wq;
 	unsigned			rsrc_quiesce;
 
-	struct list_head		io_buffers_pages;
-
 	#if defined(CONFIG_UNIX)
 		struct socket		*ring_sock;
 	#endif
diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index 08c9ea46bb95..b9e1af5772f3 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -339,7 +339,6 @@ static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
 	spin_lock_init(&ctx->completion_lock);
 	spin_lock_init(&ctx->timeout_lock);
 	INIT_WQ_LIST(&ctx->iopoll_list);
-	INIT_LIST_HEAD(&ctx->io_buffers_pages);
 	INIT_LIST_HEAD(&ctx->io_buffers_comp);
 	INIT_LIST_HEAD(&ctx->defer_list);
 	INIT_LIST_HEAD(&ctx->timeout_list);
@@ -4720,6 +4719,9 @@ static int __init io_uring_init(void)
 				SLAB_ACCOUNT | SLAB_TYPESAFE_BY_RCU,
 				offsetof(struct io_kiocb, cmd.data),
 				sizeof_field(struct io_kiocb, cmd.data), NULL);
+	io_buf_cachep = kmem_cache_create("io_buffer", sizeof(struct io_buffer), 0,
+					  SLAB_HWCACHE_ALIGN | SLAB_PANIC | SLAB_ACCOUNT,
+					  NULL);
 
 #ifdef CONFIG_SYSCTL
 	register_sysctl_init("kernel", kernel_io_uring_disabled_table);
diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h
index 547c30582fb8..2ff719ae1b57 100644
--- a/io_uring/io_uring.h
+++ b/io_uring/io_uring.h
@@ -330,6 +330,7 @@ static inline bool io_req_cache_empty(struct io_ring_ctx *ctx)
 }
 
 extern struct kmem_cache *req_cachep;
+extern struct kmem_cache *io_buf_cachep;
 
 static inline struct io_kiocb *io_extract_req(struct io_ring_ctx *ctx)
 {
diff --git a/io_uring/kbuf.c b/io_uring/kbuf.c
index 12a357348733..d5a04467666f 100644
--- a/io_uring/kbuf.c
+++ b/io_uring/kbuf.c
@@ -22,6 +22,8 @@
 /* BIDs are addressed by a 16-bit field in a CQE */
 #define MAX_BIDS_PER_BGID (1 << 16)
 
+struct kmem_cache *io_buf_cachep;
+
 struct io_provide_buf {
 	struct file			*file;
 	__u64				addr;
@@ -258,6 +260,8 @@ static int __io_remove_buffers(struct io_ring_ctx *ctx,
 void io_destroy_buffers(struct io_ring_ctx *ctx)
 {
 	struct io_buffer_list *bl;
+	struct list_head *item, *tmp;
+	struct io_buffer *buf;
 	unsigned long index;
 	int i;
 
@@ -273,12 +277,9 @@ void io_destroy_buffers(struct io_ring_ctx *ctx)
 		kfree(bl);
 	}
 
-	while (!list_empty(&ctx->io_buffers_pages)) {
-		struct page *page;
-
-		page = list_first_entry(&ctx->io_buffers_pages, struct page, lru);
-		list_del_init(&page->lru);
-		__free_page(page);
+	list_for_each_safe(item, tmp, &ctx->io_buffers_cache) {
+		buf = list_entry(item, struct io_buffer, list);
+		kmem_cache_free(io_buf_cachep, buf);
 	}
 }
 
@@ -361,11 +362,12 @@ int io_provide_buffers_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe
 	return 0;
 }
 
+#define IO_BUFFER_ALLOC_BATCH 64
+
 static int io_refill_buffer_cache(struct io_ring_ctx *ctx)
 {
-	struct io_buffer *buf;
-	struct page *page;
-	int bufs_in_page;
+	struct io_buffer *bufs[IO_BUFFER_ALLOC_BATCH];
+	int allocated;
 
 	/*
 	 * Completions that don't happen inline (eg not under uring_lock) will
@@ -385,22 +387,25 @@ static int io_refill_buffer_cache(struct io_ring_ctx *ctx)
 
 	/*
 	 * No free buffers and no completion entries either. Allocate a new
-	 * page worth of buffer entries and add those to our freelist.
+	 * batch of buffer entries and add those to our freelist.
 	 */
-	page = alloc_page(GFP_KERNEL_ACCOUNT);
-	if (!page)
-		return -ENOMEM;
 
-	list_add(&page->lru, &ctx->io_buffers_pages);
-
-	buf = page_address(page);
-	bufs_in_page = PAGE_SIZE / sizeof(*buf);
-	while (bufs_in_page) {
-		list_add_tail(&buf->list, &ctx->io_buffers_cache);
-		buf++;
-		bufs_in_page--;
+	allocated = kmem_cache_alloc_bulk(io_buf_cachep, GFP_KERNEL_ACCOUNT,
+					  ARRAY_SIZE(bufs), (void **) bufs);
+	if (unlikely(!allocated)) {
+		/*
+		 * Bulk alloc is all-or-nothing. If we fail to get a batch,
+		 * retry single alloc to be on the safe side.
+		 */
+		bufs[0] = kmem_cache_alloc(io_buf_cachep, GFP_KERNEL);
+		if (!bufs[0])
+			return -ENOMEM;
+		allocated = 1;
 	}
 
+	while (allocated)
+		list_add_tail(&bufs[--allocated]->list, &ctx->io_buffers_cache);
+
 	return 0;
 }
 
-- 
cgit v1.2.3


From 2819f23ac12ce93ff79ca7a54597df9a4a1f6331 Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (Google)" <rostedt@goodmis.org>
Date: Thu, 5 Oct 2023 09:13:48 -0400
Subject: eventfs: Use eventfs_remove_events_dir()

The update to removing the eventfs_file changed the way the events top
level directory was handled. Instead of returning a dentry, it now returns
the eventfs_inode. In this changed, the removing of the events top level
directory is not much different than removing any of the other
directories. Because of this, the removal just called eventfs_remove_dir()
instead of eventfs_remove_events_dir().

Although eventfs_remove_dir() does the clean up, it misses out on the
dget() of the ei->dentry done in eventfs_create_events_dir(). It makes
more sense to match eventfs_create_events_dir() with a specific function
eventfs_remove_events_dir() and this specific function can then perform
the dput() to the dentry that had the dget() when it was created.

Fixes: 5790b1fb3d67 ("eventfs: Remove eventfs_file and just use eventfs_inode")
Reported-by: kernel test robot <lkp@intel.com>
Closes: https://lore.kernel.org/oe-kbuild-all/202310051743.y9EobbUr-lkp@intel.com/
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
 fs/tracefs/event_inode.c    | 19 +++++++------------
 include/linux/tracefs.h     |  1 +
 kernel/trace/trace_events.c |  2 +-
 3 files changed, 9 insertions(+), 13 deletions(-)

(limited to 'include/linux')

diff --git a/fs/tracefs/event_inode.c b/fs/tracefs/event_inode.c
index eab18b157ef5..1ccd100bc565 100644
--- a/fs/tracefs/event_inode.c
+++ b/fs/tracefs/event_inode.c
@@ -901,22 +901,17 @@ void eventfs_remove_dir(struct eventfs_inode *ei)
 }
 
 /**
- * eventfs_remove_events_dir - remove eventfs dir or file from list
- * @dentry: events's dentry to be removed.
+ * eventfs_remove_events_dir - remove the top level eventfs directory
+ * @ei: the event_inode returned by eventfs_create_events_dir().
  *
- * This function remove events main directory
+ * This function removes the events main directory
  */
-void eventfs_remove_events_dir(struct dentry *dentry)
+void eventfs_remove_events_dir(struct eventfs_inode *ei)
 {
-	struct tracefs_inode *ti;
-
-	if (!dentry || !dentry->d_inode)
-		return;
+	struct dentry *dentry = ei->dentry;
 
-	ti = get_tracefs(dentry->d_inode);
-	if (!ti || !(ti->flags & TRACEFS_EVENT_INODE))
-		return;
+	eventfs_remove_dir(ei);
 
-	d_invalidate(dentry);
+	/* Matches the dget() from eventfs_create_events_dir() */
 	dput(dentry);
 }
diff --git a/include/linux/tracefs.h b/include/linux/tracefs.h
index 0c39704455d9..13359b1a35d1 100644
--- a/include/linux/tracefs.h
+++ b/include/linux/tracefs.h
@@ -41,6 +41,7 @@ struct eventfs_inode *eventfs_create_dir(const char *name, struct eventfs_inode
 					 const struct eventfs_entry *entries,
 					 int size, void *data);
 
+void eventfs_remove_events_dir(struct eventfs_inode *ei);
 void eventfs_remove_dir(struct eventfs_inode *ei);
 
 struct dentry *tracefs_create_file(const char *name, umode_t mode,
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index a3b9d9423824..0e3a1c70e410 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -3872,7 +3872,7 @@ int event_trace_del_tracer(struct trace_array *tr)
 
 	down_write(&trace_event_sem);
 	__trace_remove_event_dirs(tr);
-	eventfs_remove_dir(tr->event_dir);
+	eventfs_remove_events_dir(tr->event_dir);
 	up_write(&trace_event_sem);
 
 	tr->event_dir = NULL;
-- 
cgit v1.2.3


From 7e6f3b6d2c352b5fde37ce3fed83bdf6172eebd4 Mon Sep 17 00:00:00 2001
From: Vicki Pfau <vi@endrift.com>
Date: Wed, 27 Sep 2023 13:22:12 -0700
Subject: PCI: Prevent xHCI driver from claiming AMD VanGogh USB3 DRD device

The AMD VanGogh SoC contains a DesignWare USB3 Dual-Role Device that can be
operated as either a USB Host or a USB Device, similar to on the AMD Nolan
platform.

be6646bfbaec ("PCI: Prevent xHCI driver from claiming AMD Nolan USB3 DRD
device") added a quirk to let the dwc3 driver claim the Nolan device since
it provides more specific support.

Extend that quirk to include the VanGogh SoC USB3 device.

Link: https://lore.kernel.org/r/20230927202212.2388216-1-vi@endrift.com
Signed-off-by: Vicki Pfau <vi@endrift.com>
[bhelgaas: include be6646bfbaec reference, add stable tag]
Signed-off-by: Bjorn Helgaas <bhelgaas@google.com>
Cc: stable@vger.kernel.org	# v3.19+
---
 drivers/pci/quirks.c    | 8 +++++---
 include/linux/pci_ids.h | 1 +
 2 files changed, 6 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/pci/quirks.c b/drivers/pci/quirks.c
index eeec1d6f9023..e3e915329510 100644
--- a/drivers/pci/quirks.c
+++ b/drivers/pci/quirks.c
@@ -690,7 +690,7 @@ DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_ATI,	PCI_DEVICE_ID_ATI_RS100,   quirk_ati_
 /*
  * In the AMD NL platform, this device ([1022:7912]) has a class code of
  * PCI_CLASS_SERIAL_USB_XHCI (0x0c0330), which means the xhci driver will
- * claim it.
+ * claim it. The same applies on the VanGogh platform device ([1022:163a]).
  *
  * But the dwc3 driver is a more specific driver for this device, and we'd
  * prefer to use it instead of xhci. To prevent xhci from claiming the
@@ -698,7 +698,7 @@ DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_ATI,	PCI_DEVICE_ID_ATI_RS100,   quirk_ati_
  * defines as "USB device (not host controller)". The dwc3 driver can then
  * claim it based on its Vendor and Device ID.
  */
-static void quirk_amd_nl_class(struct pci_dev *pdev)
+static void quirk_amd_dwc_class(struct pci_dev *pdev)
 {
 	u32 class = pdev->class;
 
@@ -708,7 +708,9 @@ static void quirk_amd_nl_class(struct pci_dev *pdev)
 		 class, pdev->class);
 }
 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_NL_USB,
-		quirk_amd_nl_class);
+		quirk_amd_dwc_class);
+DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_VANGOGH_USB,
+		quirk_amd_dwc_class);
 
 /*
  * Synopsys USB 3.x host HAPS platform has a class code of
diff --git a/include/linux/pci_ids.h b/include/linux/pci_ids.h
index 5fb3d4c393a9..3a8e24e9a93f 100644
--- a/include/linux/pci_ids.h
+++ b/include/linux/pci_ids.h
@@ -579,6 +579,7 @@
 #define PCI_DEVICE_ID_AMD_1AH_M00H_DF_F3 0x12c3
 #define PCI_DEVICE_ID_AMD_1AH_M20H_DF_F3 0x16fb
 #define PCI_DEVICE_ID_AMD_MI200_DF_F3	0x14d3
+#define PCI_DEVICE_ID_AMD_VANGOGH_USB	0x163a
 #define PCI_DEVICE_ID_AMD_CNB17H_F3	0x1703
 #define PCI_DEVICE_ID_AMD_LANCE		0x2000
 #define PCI_DEVICE_ID_AMD_LANCE_HOME	0x2001
-- 
cgit v1.2.3


From 5b9ceb63c49b9934cf2ec70b3b76951927e50a24 Mon Sep 17 00:00:00 2001
From: John Sanpe <sanpeqf@gmail.com>
Date: Fri, 15 Sep 2023 22:06:50 +0800
Subject: logic_pio: Remove logic_outb(), _outw(), outl() duplicate
 declarations

Remove duplicate declarations of logic_out* functions.

Link: https://lore.kernel.org/r/20230915140650.3562504-1-sanpeqf@gmail.com
Signed-off-by: John Sanpe <sanpeqf@gmail.com>
Signed-off-by: Bjorn Helgaas <bhelgaas@google.com>
---
 include/linux/logic_pio.h | 3 ---
 1 file changed, 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/logic_pio.h b/include/linux/logic_pio.h
index 54945aa824b4..babf4e3c28ba 100644
--- a/include/linux/logic_pio.h
+++ b/include/linux/logic_pio.h
@@ -39,9 +39,6 @@ struct logic_pio_host_ops {
 
 #ifdef CONFIG_INDIRECT_PIO
 u8 logic_inb(unsigned long addr);
-void logic_outb(u8 value, unsigned long addr);
-void logic_outw(u16 value, unsigned long addr);
-void logic_outl(u32 value, unsigned long addr);
 u16 logic_inw(unsigned long addr);
 u32 logic_inl(unsigned long addr);
 void logic_outb(u8 value, unsigned long addr);
-- 
cgit v1.2.3


From 7151d87a175c6618fe81705755eb3dc4199cad4e Mon Sep 17 00:00:00 2001
From: Ivan Vecera <ivecera@redhat.com>
Date: Wed, 27 Sep 2023 10:31:30 +0200
Subject: virtchnl: Add header dependencies

The <linux/avf/virtchnl.h> uses BIT, struct_size and ETH_ALEN macros
but does not include appropriate header files that defines them.
Add these dependencies so this header file can be included anywhere.

Signed-off-by: Ivan Vecera <ivecera@redhat.com>
Reviewed-by: Przemek Kitszel <przemyslaw.kitszel@intel.com>
Reviewed-by: Jesse Brandeburg <jesse.brandeburg@intel.com>
Reviewed-by: Aleksandr Loktionov <aleksandr.loktionov@intel.com>
Signed-off-by: Tony Nguyen <anthony.l.nguyen@intel.com>
---
 include/linux/avf/virtchnl.h | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/avf/virtchnl.h b/include/linux/avf/virtchnl.h
index dd71d3009771..6b3acf15be5c 100644
--- a/include/linux/avf/virtchnl.h
+++ b/include/linux/avf/virtchnl.h
@@ -4,6 +4,10 @@
 #ifndef _VIRTCHNL_H_
 #define _VIRTCHNL_H_
 
+#include <linux/bitops.h>
+#include <linux/overflow.h>
+#include <uapi/linux/if_ether.h>
+
 /* Description:
  * This header file describes the Virtual Function (VF) - Physical Function
  * (PF) communication protocol used by the drivers for all devices starting
-- 
cgit v1.2.3


From 9beebc2b5d0038a65977a7a14909598c64ce070f Mon Sep 17 00:00:00 2001
From: Marc Kleine-Budde <mkl@pengutronix.de>
Date: Thu, 28 Sep 2023 09:24:28 +0200
Subject: can: dev: add can_state_get_by_berr_counter() to return the CAN state
 based on the current error counters

Some CAN controllers do not have a register that contains the current
CAN state, but only a register that contains the error counters.

Introduce a new function can_state_get_by_berr_counter() that returns
the current TX and RX state depending on the provided CAN bit error
counters.

Link: https://lore.kernel.org/all/20231005-at91_can-rx_offload-v2-1-9987d53600e0@pengutronix.de
Signed-off-by: Marc Kleine-Budde <mkl@pengutronix.de>
---
 drivers/net/can/dev/dev.c | 22 ++++++++++++++++++++++
 include/linux/can/dev.h   |  4 ++++
 2 files changed, 26 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/net/can/dev/dev.c b/drivers/net/can/dev/dev.c
index 82b12902fc35..3a3be5cdfc1f 100644
--- a/drivers/net/can/dev/dev.c
+++ b/drivers/net/can/dev/dev.c
@@ -90,6 +90,28 @@ const char *can_get_state_str(const enum can_state state)
 }
 EXPORT_SYMBOL_GPL(can_get_state_str);
 
+static enum can_state can_state_err_to_state(u16 err)
+{
+	if (err < CAN_ERROR_WARNING_THRESHOLD)
+		return CAN_STATE_ERROR_ACTIVE;
+	if (err < CAN_ERROR_PASSIVE_THRESHOLD)
+		return CAN_STATE_ERROR_WARNING;
+	if (err < CAN_BUS_OFF_THRESHOLD)
+		return CAN_STATE_ERROR_PASSIVE;
+
+	return CAN_STATE_BUS_OFF;
+}
+
+void can_state_get_by_berr_counter(const struct net_device *dev,
+				   const struct can_berr_counter *bec,
+				   enum can_state *tx_state,
+				   enum can_state *rx_state)
+{
+	*tx_state = can_state_err_to_state(bec->txerr);
+	*rx_state = can_state_err_to_state(bec->rxerr);
+}
+EXPORT_SYMBOL_GPL(can_state_get_by_berr_counter);
+
 void can_change_state(struct net_device *dev, struct can_frame *cf,
 		      enum can_state tx_state, enum can_state rx_state)
 {
diff --git a/include/linux/can/dev.h b/include/linux/can/dev.h
index 982ba245eb41..1b92aed49363 100644
--- a/include/linux/can/dev.h
+++ b/include/linux/can/dev.h
@@ -195,6 +195,10 @@ int can_restart_now(struct net_device *dev);
 void can_bus_off(struct net_device *dev);
 
 const char *can_get_state_str(const enum can_state state);
+void can_state_get_by_berr_counter(const struct net_device *dev,
+				   const struct can_berr_counter *bec,
+				   enum can_state *tx_state,
+				   enum can_state *rx_state);
 void can_change_state(struct net_device *dev, struct can_frame *cf,
 		      enum can_state tx_state, enum can_state rx_state);
 
-- 
cgit v1.2.3


From 9c8c3fa3a52bc55696ccc4dfcb8a49f969b5fb0e Mon Sep 17 00:00:00 2001
From: Akihiko Odaki <akihiko.odaki@daynix.com>
Date: Thu, 5 Oct 2023 16:21:36 +0900
Subject: bpf: Fix the comment for bpf_restore_data_end()

The comment used to say:
> Restore data saved by bpf_compute_data_pointers().

But bpf_compute_data_pointers() does not save the data;
bpf_compute_and_save_data_end() does.

Signed-off-by: Akihiko Odaki <akihiko.odaki@daynix.com>
Acked-by: Stanislav Fomichev <sdf@google.com>
Link: https://lore.kernel.org/r/20231005072137.29870-1-akihiko.odaki@daynix.com
Signed-off-by: Martin KaFai Lau <martin.lau@kernel.org>
---
 include/linux/filter.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/filter.h b/include/linux/filter.h
index 27406aee2d40..ff7ecc89d3dd 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -736,7 +736,7 @@ static inline void bpf_compute_and_save_data_end(
 	cb->data_end  = skb->data + skb_headlen(skb);
 }
 
-/* Restore data saved by bpf_compute_data_pointers(). */
+/* Restore data saved by bpf_compute_and_save_data_end(). */
 static inline void bpf_restore_data_end(
 	struct sk_buff *skb, void *saved_data_end)
 {
-- 
cgit v1.2.3


From 3fbc5c3b8522d655cf91d32c158261060fdc02fe Mon Sep 17 00:00:00 2001
From: Ulf Hansson <ulf.hansson@linaro.org>
Date: Mon, 25 Sep 2023 15:17:07 +0200
Subject: PM: domains: Introduce dev_pm_domain_set_performance_state()

The generic PM domain is currently the only PM domain variant that supports
performance scaling. To allow performance scaling to be supported through a
common interface, let's add an optional callback ->set_performance_state(),
in the struct dev_pm_domain.

Moreover, let's add a function, dev_pm_domain_set_performance_state(), that
may be called by consumers to request a new performance state for a device
through its PM domain.

Note that, in most cases it's preferred that a consumer use the OPP library
to request a new performance state for its device. Although, this requires
some additional changes to be supported, which are being implemented from
subsequent changes.

Signed-off-by: Ulf Hansson <ulf.hansson@linaro.org>
Acked-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Signed-off-by: Viresh Kumar <viresh.kumar@linaro.org>
---
 drivers/base/power/common.c | 21 +++++++++++++++++++++
 include/linux/pm.h          |  2 ++
 include/linux/pm_domain.h   |  6 ++++++
 3 files changed, 29 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/base/power/common.c b/drivers/base/power/common.c
index 72115917e0bd..44ec20918a4d 100644
--- a/drivers/base/power/common.c
+++ b/drivers/base/power/common.c
@@ -228,3 +228,24 @@ void dev_pm_domain_set(struct device *dev, struct dev_pm_domain *pd)
 	device_pm_check_callbacks(dev);
 }
 EXPORT_SYMBOL_GPL(dev_pm_domain_set);
+
+/**
+ * dev_pm_domain_set_performance_state - Request a new performance state.
+ * @dev: The device to make the request for.
+ * @state: Target performance state for the device.
+ *
+ * This function should be called when a new performance state needs to be
+ * requested for a device that is attached to a PM domain. Note that, the
+ * support for performance scaling for PM domains is optional.
+ *
+ * Returns 0 on success and when performance scaling isn't supported, negative
+ * error code on failure.
+ */
+int dev_pm_domain_set_performance_state(struct device *dev, unsigned int state)
+{
+	if (dev->pm_domain && dev->pm_domain->set_performance_state)
+		return dev->pm_domain->set_performance_state(dev, state);
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(dev_pm_domain_set_performance_state);
diff --git a/include/linux/pm.h b/include/linux/pm.h
index 1400c37b29c7..4c9f571609c8 100644
--- a/include/linux/pm.h
+++ b/include/linux/pm.h
@@ -719,6 +719,7 @@ extern void dev_pm_put_subsys_data(struct device *dev);
  * @activate: Called before executing probe routines for bus types and drivers.
  * @sync: Called after successful driver probe.
  * @dismiss: Called after unsuccessful driver probe and after driver removal.
+ * @set_performance_state: Called to request a new performance state.
  *
  * Power domains provide callbacks that are executed during system suspend,
  * hibernation, system resume and during runtime PM transitions instead of
@@ -731,6 +732,7 @@ struct dev_pm_domain {
 	int (*activate)(struct device *dev);
 	void (*sync)(struct device *dev);
 	void (*dismiss)(struct device *dev);
+	int (*set_performance_state)(struct device *dev, unsigned int state);
 };
 
 /*
diff --git a/include/linux/pm_domain.h b/include/linux/pm_domain.h
index f776fb93eaa0..bda2964a0a56 100644
--- a/include/linux/pm_domain.h
+++ b/include/linux/pm_domain.h
@@ -430,6 +430,7 @@ struct device *dev_pm_domain_attach_by_name(struct device *dev,
 void dev_pm_domain_detach(struct device *dev, bool power_off);
 int dev_pm_domain_start(struct device *dev);
 void dev_pm_domain_set(struct device *dev, struct dev_pm_domain *pd);
+int dev_pm_domain_set_performance_state(struct device *dev, unsigned int state);
 #else
 static inline int dev_pm_domain_attach(struct device *dev, bool power_on)
 {
@@ -452,6 +453,11 @@ static inline int dev_pm_domain_start(struct device *dev)
 }
 static inline void dev_pm_domain_set(struct device *dev,
 				     struct dev_pm_domain *pd) {}
+static inline int dev_pm_domain_set_performance_state(struct device *dev,
+						      unsigned int state)
+{
+	return 0;
+}
 #endif
 
 #endif /* _LINUX_PM_DOMAIN_H */
-- 
cgit v1.2.3


From 248a38d5cc3f3505e6cfbbc0514435c9f1ba00af Mon Sep 17 00:00:00 2001
From: Ulf Hansson <ulf.hansson@linaro.org>
Date: Mon, 25 Sep 2023 15:17:09 +0200
Subject: OPP: Add dev_pm_opp_add_dynamic() to allow more flexibility

The dev_pm_opp_add() API is limited to add dynamic OPPs with a frequency
and a voltage level. To enable more flexibility, let's add a new API,
dev_pm_opp_add_dynamic() that's takes a struct dev_pm_opp_data* instead of
a list of in-parameters.

Signed-off-by: Ulf Hansson <ulf.hansson@linaro.org>
Signed-off-by: Viresh Kumar <viresh.kumar@linaro.org>
---
 drivers/opp/core.c     | 22 ++++++++++------------
 drivers/opp/of.c       | 10 ++++++----
 drivers/opp/opp.h      |  2 +-
 include/linux/pm_opp.h | 29 +++++++++++++++++++++++++----
 4 files changed, 42 insertions(+), 21 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/opp/core.c b/drivers/opp/core.c
index 919cc53bc02e..54b6138e1189 100644
--- a/drivers/opp/core.c
+++ b/drivers/opp/core.c
@@ -2002,8 +2002,7 @@ int _opp_add(struct device *dev, struct dev_pm_opp *new_opp,
  * _opp_add_v1() - Allocate a OPP based on v1 bindings.
  * @opp_table:	OPP table
  * @dev:	device for which we do this operation
- * @freq:	Frequency in Hz for this OPP
- * @u_volt:	Voltage in uVolts for this OPP
+ * @data:	The OPP data for the OPP to add
  * @dynamic:	Dynamically added OPPs.
  *
  * This function adds an opp definition to the opp table and returns status.
@@ -2021,10 +2020,10 @@ int _opp_add(struct device *dev, struct dev_pm_opp *new_opp,
  * -ENOMEM	Memory allocation failure
  */
 int _opp_add_v1(struct opp_table *opp_table, struct device *dev,
-		unsigned long freq, long u_volt, bool dynamic)
+		struct dev_pm_opp_data *data, bool dynamic)
 {
 	struct dev_pm_opp *new_opp;
-	unsigned long tol;
+	unsigned long tol, u_volt = data->u_volt;
 	int ret;
 
 	if (!assert_single_clk(opp_table))
@@ -2035,7 +2034,7 @@ int _opp_add_v1(struct opp_table *opp_table, struct device *dev,
 		return -ENOMEM;
 
 	/* populate the opp table */
-	new_opp->rates[0] = freq;
+	new_opp->rates[0] = data->freq;
 	tol = u_volt * opp_table->voltage_tolerance_v1 / 100;
 	new_opp->supplies[0].u_volt = u_volt;
 	new_opp->supplies[0].u_volt_min = u_volt - tol;
@@ -2825,10 +2824,9 @@ unlock:
 }
 
 /**
- * dev_pm_opp_add()  - Add an OPP table from a table definitions
- * @dev:	device for which we do this operation
- * @freq:	Frequency in Hz for this OPP
- * @u_volt:	Voltage in uVolts for this OPP
+ * dev_pm_opp_add_dynamic()  - Add an OPP table from a table definitions
+ * @dev:	The device for which we do this operation
+ * @data:	The OPP data for the OPP to add
  *
  * This function adds an opp definition to the opp table and returns status.
  * The opp is made available by default and it can be controlled using
@@ -2841,7 +2839,7 @@ unlock:
  *		Duplicate OPPs (both freq and volt are same) and !opp->available
  * -ENOMEM	Memory allocation failure
  */
-int dev_pm_opp_add(struct device *dev, unsigned long freq, unsigned long u_volt)
+int dev_pm_opp_add_dynamic(struct device *dev, struct dev_pm_opp_data *data)
 {
 	struct opp_table *opp_table;
 	int ret;
@@ -2853,13 +2851,13 @@ int dev_pm_opp_add(struct device *dev, unsigned long freq, unsigned long u_volt)
 	/* Fix regulator count for dynamic OPPs */
 	opp_table->regulator_count = 1;
 
-	ret = _opp_add_v1(opp_table, dev, freq, u_volt, true);
+	ret = _opp_add_v1(opp_table, dev, data, true);
 	if (ret)
 		dev_pm_opp_put_opp_table(opp_table);
 
 	return ret;
 }
-EXPORT_SYMBOL_GPL(dev_pm_opp_add);
+EXPORT_SYMBOL_GPL(dev_pm_opp_add_dynamic);
 
 /**
  * _opp_set_availability() - helper to set the availability of an opp
diff --git a/drivers/opp/of.c b/drivers/opp/of.c
index ada4963c7cfa..ade6d42cae46 100644
--- a/drivers/opp/of.c
+++ b/drivers/opp/of.c
@@ -1077,13 +1077,15 @@ static int _of_add_opp_table_v1(struct device *dev, struct opp_table *opp_table)
 
 	val = prop->value;
 	while (nr) {
-		unsigned long freq = be32_to_cpup(val++) * 1000;
-		unsigned long volt = be32_to_cpup(val++);
+		struct dev_pm_opp_data data = {
+			.freq = be32_to_cpup(val++) * 1000,
+			.u_volt = be32_to_cpup(val++),
+		};
 
-		ret = _opp_add_v1(opp_table, dev, freq, volt, false);
+		ret = _opp_add_v1(opp_table, dev, &data, false);
 		if (ret) {
 			dev_err(dev, "%s: Failed to add OPP %ld (%d)\n",
-				__func__, freq, ret);
+				__func__, data.freq, ret);
 			goto remove_static_opp;
 		}
 		nr -= 2;
diff --git a/drivers/opp/opp.h b/drivers/opp/opp.h
index 8a5ea38f3a3d..fefdf9845692 100644
--- a/drivers/opp/opp.h
+++ b/drivers/opp/opp.h
@@ -251,7 +251,7 @@ struct dev_pm_opp *_opp_allocate(struct opp_table *opp_table);
 void _opp_free(struct dev_pm_opp *opp);
 int _opp_compare_key(struct opp_table *opp_table, struct dev_pm_opp *opp1, struct dev_pm_opp *opp2);
 int _opp_add(struct device *dev, struct dev_pm_opp *new_opp, struct opp_table *opp_table);
-int _opp_add_v1(struct opp_table *opp_table, struct device *dev, unsigned long freq, long u_volt, bool dynamic);
+int _opp_add_v1(struct opp_table *opp_table, struct device *dev, struct dev_pm_opp_data *data, bool dynamic);
 void _dev_pm_opp_cpumask_remove_table(const struct cpumask *cpumask, int last_cpu);
 struct opp_table *_add_opp_table_indexed(struct device *dev, int index, bool getclk);
 void _put_opp_list_kref(struct opp_table *opp_table);
diff --git a/include/linux/pm_opp.h b/include/linux/pm_opp.h
index 91f87d7e807c..a8ee93ba41d8 100644
--- a/include/linux/pm_opp.h
+++ b/include/linux/pm_opp.h
@@ -92,6 +92,16 @@ struct dev_pm_opp_config {
 	struct device ***virt_devs;
 };
 
+/**
+ * struct dev_pm_opp_data - The data to use to initialize an OPP.
+ * @freq: The clock rate in Hz for the OPP.
+ * @u_volt: The voltage in uV for the OPP.
+ */
+struct dev_pm_opp_data {
+	unsigned long freq;
+	unsigned long u_volt;
+};
+
 #if defined(CONFIG_PM_OPP)
 
 struct opp_table *dev_pm_opp_get_opp_table(struct device *dev);
@@ -152,8 +162,8 @@ struct dev_pm_opp *dev_pm_opp_find_bw_floor(struct device *dev,
 
 void dev_pm_opp_put(struct dev_pm_opp *opp);
 
-int dev_pm_opp_add(struct device *dev, unsigned long freq,
-		   unsigned long u_volt);
+int dev_pm_opp_add_dynamic(struct device *dev, struct dev_pm_opp_data *opp);
+
 void dev_pm_opp_remove(struct device *dev, unsigned long freq);
 void dev_pm_opp_remove_all_dynamic(struct device *dev);
 
@@ -322,8 +332,8 @@ static inline struct dev_pm_opp *dev_pm_opp_find_bw_floor(struct device *dev,
 
 static inline void dev_pm_opp_put(struct dev_pm_opp *opp) {}
 
-static inline int dev_pm_opp_add(struct device *dev, unsigned long freq,
-					unsigned long u_volt)
+static inline int
+dev_pm_opp_add_dynamic(struct device *dev, struct dev_pm_opp_data *opp)
 {
 	return -EOPNOTSUPP;
 }
@@ -519,6 +529,17 @@ static inline int dev_pm_opp_of_find_icc_paths(struct device *dev, struct opp_ta
 
 /* OPP Configuration helpers */
 
+static inline int dev_pm_opp_add(struct device *dev, unsigned long freq,
+				 unsigned long u_volt)
+{
+	struct dev_pm_opp_data data = {
+		.freq = freq,
+		.u_volt = u_volt,
+	};
+
+	return dev_pm_opp_add_dynamic(dev, &data);
+}
+
 /* Regulators helpers */
 static inline int dev_pm_opp_set_regulators(struct device *dev,
 					    const char * const names[])
-- 
cgit v1.2.3


From a0242c81bb759ef03184be8eddcc7d5bdf36cc16 Mon Sep 17 00:00:00 2001
From: Krishna chaitanya chundru <quic_krichai@quicinc.com>
Date: Thu, 7 Sep 2023 11:30:31 +0530
Subject: OPP: Add dev_pm_opp_find_level_floor()

Add dev_pm_opp_find_level_floor(), as is done for frequency and
bandwidth.

Signed-off-by: Krishna chaitanya chundru <quic_krichai@quicinc.com>
[ Viresh: Updated commit log and rearranged code ]
Signed-off-by: Viresh Kumar <viresh.kumar@linaro.org>
---
 drivers/opp/core.c     | 25 +++++++++++++++++++++++++
 include/linux/pm_opp.h |  9 +++++++++
 2 files changed, 34 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/opp/core.c b/drivers/opp/core.c
index 8978e94c9ca1..cdac46698021 100644
--- a/drivers/opp/core.c
+++ b/drivers/opp/core.c
@@ -813,6 +813,31 @@ struct dev_pm_opp *dev_pm_opp_find_level_ceil(struct device *dev,
 }
 EXPORT_SYMBOL_GPL(dev_pm_opp_find_level_ceil);
 
+/**
+ * dev_pm_opp_find_level_floor() - Search for a rounded floor level
+ * @dev:	device for which we do this operation
+ * @level:	Start level
+ *
+ * Search for the matching floor *available* OPP from a starting level
+ * for a device.
+ *
+ * Return: matching *opp and refreshes *level accordingly, else returns
+ * ERR_PTR in case of error and should be handled using IS_ERR. Error return
+ * values can be:
+ * EINVAL:	for bad pointer
+ * ERANGE:	no match found for search
+ * ENODEV:	if device not found in list of registered devices
+ *
+ * The callers are required to call dev_pm_opp_put() for the returned OPP after
+ * use.
+ */
+struct dev_pm_opp *dev_pm_opp_find_level_floor(struct device *dev,
+					       unsigned long *level)
+{
+	return _find_key_floor(dev, level, 0, true, _read_level, NULL);
+}
+EXPORT_SYMBOL_GPL(dev_pm_opp_find_level_floor);
+
 /**
  * dev_pm_opp_find_bw_ceil() - Search for a rounded ceil bandwidth
  * @dev:	device for which we do this operation
diff --git a/include/linux/pm_opp.h b/include/linux/pm_opp.h
index 9ad168f4cbf1..ccd97bcef269 100644
--- a/include/linux/pm_opp.h
+++ b/include/linux/pm_opp.h
@@ -156,6 +156,9 @@ struct dev_pm_opp *dev_pm_opp_find_level_exact(struct device *dev,
 struct dev_pm_opp *dev_pm_opp_find_level_ceil(struct device *dev,
 					      unsigned int *level);
 
+struct dev_pm_opp *dev_pm_opp_find_level_floor(struct device *dev,
+					       unsigned long *level);
+
 struct dev_pm_opp *dev_pm_opp_find_bw_ceil(struct device *dev,
 					   unsigned int *bw, int index);
 
@@ -320,6 +323,12 @@ static inline struct dev_pm_opp *dev_pm_opp_find_level_ceil(struct device *dev,
 	return ERR_PTR(-EOPNOTSUPP);
 }
 
+static inline struct dev_pm_opp *dev_pm_opp_find_level_floor(struct device *dev,
+							     unsigned long *level)
+{
+	return ERR_PTR(-EOPNOTSUPP);
+}
+
 static inline struct dev_pm_opp *dev_pm_opp_find_bw_ceil(struct device *dev,
 					unsigned int *bw, int index)
 {
-- 
cgit v1.2.3


From 3166383da081461244918aeed7ad028ef11b17cc Mon Sep 17 00:00:00 2001
From: Ulf Hansson <ulf.hansson@linaro.org>
Date: Mon, 25 Sep 2023 15:17:10 +0200
Subject: OPP: Extend dev_pm_opp_data with a level

Let's extend the dev_pm_opp_data with a level variable, to allow users to
specify a corresponding level (performance state) for a dynamically added
OPP.

Signed-off-by: Ulf Hansson <ulf.hansson@linaro.org>
Signed-off-by: Viresh Kumar <viresh.kumar@linaro.org>
---
 drivers/opp/core.c     | 1 +
 include/linux/pm_opp.h | 2 ++
 2 files changed, 3 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/opp/core.c b/drivers/opp/core.c
index 54b6138e1189..ca8d1304b508 100644
--- a/drivers/opp/core.c
+++ b/drivers/opp/core.c
@@ -2035,6 +2035,7 @@ int _opp_add_v1(struct opp_table *opp_table, struct device *dev,
 
 	/* populate the opp table */
 	new_opp->rates[0] = data->freq;
+	new_opp->level = data->level;
 	tol = u_volt * opp_table->voltage_tolerance_v1 / 100;
 	new_opp->supplies[0].u_volt = u_volt;
 	new_opp->supplies[0].u_volt_min = u_volt - tol;
diff --git a/include/linux/pm_opp.h b/include/linux/pm_opp.h
index a8ee93ba41d8..9ad168f4cbf1 100644
--- a/include/linux/pm_opp.h
+++ b/include/linux/pm_opp.h
@@ -94,10 +94,12 @@ struct dev_pm_opp_config {
 
 /**
  * struct dev_pm_opp_data - The data to use to initialize an OPP.
+ * @level: The performance level for the OPP.
  * @freq: The clock rate in Hz for the OPP.
  * @u_volt: The voltage in uV for the OPP.
  */
 struct dev_pm_opp_data {
+	unsigned int level;
 	unsigned long freq;
 	unsigned long u_volt;
 };
-- 
cgit v1.2.3


From 869b6ea1609f655a43251bf41757aa44e5350a8f Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Wed, 4 Oct 2023 15:32:01 +0200
Subject: quota: Fix slow quotaoff

Eric has reported that commit dabc8b207566 ("quota: fix dqput() to
follow the guarantees dquot_srcu should provide") heavily increases
runtime of generic/270 xfstest for ext4 in nojournal mode. The reason
for this is that ext4 in nojournal mode leaves dquots dirty until the last
dqput() and thus the cleanup done in quota_release_workfn() has to write
them all. Due to the way quota_release_workfn() is written this results
in synchronize_srcu() call for each dirty dquot which makes the dquot
cleanup when turning quotas off extremely slow.

To be able to avoid synchronize_srcu() for each dirty dquot we need to
rework how we track dquots to be cleaned up. Instead of keeping the last
dquot reference while it is on releasing_dquots list, we drop it right
away and mark the dquot with new DQ_RELEASING_B bit instead. This way we
can we can remove dquot from releasing_dquots list when new reference to
it is acquired and thus there's no need to call synchronize_srcu() each
time we drop dq_list_lock.

References: https://lore.kernel.org/all/ZRytn6CxFK2oECUt@debian-BULLSEYE-live-builder-AMD64
Reported-by: Eric Whitney <enwlinux@gmail.com>
Fixes: dabc8b207566 ("quota: fix dqput() to follow the guarantees dquot_srcu should provide")
CC: stable@vger.kernel.org
Signed-off-by: Jan Kara <jack@suse.cz>
---
 fs/quota/dquot.c         | 66 ++++++++++++++++++++++++++++--------------------
 include/linux/quota.h    |  4 ++-
 include/linux/quotaops.h |  2 +-
 3 files changed, 43 insertions(+), 29 deletions(-)

(limited to 'include/linux')

diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c
index 9e72bfe8bbad..31e897ad5e6a 100644
--- a/fs/quota/dquot.c
+++ b/fs/quota/dquot.c
@@ -233,19 +233,18 @@ static void put_quota_format(struct quota_format_type *fmt)
  * All dquots are placed to the end of inuse_list when first created, and this
  * list is used for invalidate operation, which must look at every dquot.
  *
- * When the last reference of a dquot will be dropped, the dquot will be
- * added to releasing_dquots. We'd then queue work item which would call
+ * When the last reference of a dquot is dropped, the dquot is added to
+ * releasing_dquots. We'll then queue work item which will call
  * synchronize_srcu() and after that perform the final cleanup of all the
- * dquots on the list. Both releasing_dquots and free_dquots use the
- * dq_free list_head in the dquot struct. When a dquot is removed from
- * releasing_dquots, a reference count is always subtracted, and if
- * dq_count == 0 at that point, the dquot will be added to the free_dquots.
+ * dquots on the list. Each cleaned up dquot is moved to free_dquots list.
+ * Both releasing_dquots and free_dquots use the dq_free list_head in the dquot
+ * struct.
  *
- * Unused dquots (dq_count == 0) are added to the free_dquots list when freed,
- * and this list is searched whenever we need an available dquot.  Dquots are
- * removed from the list as soon as they are used again, and
- * dqstats.free_dquots gives the number of dquots on the list. When
- * dquot is invalidated it's completely released from memory.
+ * Unused and cleaned up dquots are in the free_dquots list and this list is
+ * searched whenever we need an available dquot. Dquots are removed from the
+ * list as soon as they are used again and dqstats.free_dquots gives the number
+ * of dquots on the list. When dquot is invalidated it's completely released
+ * from memory.
  *
  * Dirty dquots are added to the dqi_dirty_list of quota_info when mark
  * dirtied, and this list is searched when writing dirty dquots back to
@@ -321,6 +320,7 @@ static inline void put_dquot_last(struct dquot *dquot)
 static inline void put_releasing_dquots(struct dquot *dquot)
 {
 	list_add_tail(&dquot->dq_free, &releasing_dquots);
+	set_bit(DQ_RELEASING_B, &dquot->dq_flags);
 }
 
 static inline void remove_free_dquot(struct dquot *dquot)
@@ -328,8 +328,10 @@ static inline void remove_free_dquot(struct dquot *dquot)
 	if (list_empty(&dquot->dq_free))
 		return;
 	list_del_init(&dquot->dq_free);
-	if (!atomic_read(&dquot->dq_count))
+	if (!test_bit(DQ_RELEASING_B, &dquot->dq_flags))
 		dqstats_dec(DQST_FREE_DQUOTS);
+	else
+		clear_bit(DQ_RELEASING_B, &dquot->dq_flags);
 }
 
 static inline void put_inuse(struct dquot *dquot)
@@ -581,12 +583,6 @@ restart:
 			continue;
 		/* Wait for dquot users */
 		if (atomic_read(&dquot->dq_count)) {
-			/* dquot in releasing_dquots, flush and retry */
-			if (!list_empty(&dquot->dq_free)) {
-				spin_unlock(&dq_list_lock);
-				goto restart;
-			}
-
 			atomic_inc(&dquot->dq_count);
 			spin_unlock(&dq_list_lock);
 			/*
@@ -605,6 +601,15 @@ restart:
 			 * restart. */
 			goto restart;
 		}
+		/*
+		 * The last user already dropped its reference but dquot didn't
+		 * get fully cleaned up yet. Restart the scan which flushes the
+		 * work cleaning up released dquots.
+		 */
+		if (test_bit(DQ_RELEASING_B, &dquot->dq_flags)) {
+			spin_unlock(&dq_list_lock);
+			goto restart;
+		}
 		/*
 		 * Quota now has no users and it has been written on last
 		 * dqput()
@@ -696,6 +701,13 @@ int dquot_writeback_dquots(struct super_block *sb, int type)
 						 dq_dirty);
 
 			WARN_ON(!dquot_active(dquot));
+			/* If the dquot is releasing we should not touch it */
+			if (test_bit(DQ_RELEASING_B, &dquot->dq_flags)) {
+				spin_unlock(&dq_list_lock);
+				flush_delayed_work(&quota_release_work);
+				spin_lock(&dq_list_lock);
+				continue;
+			}
 
 			/* Now we have active dquot from which someone is
  			 * holding reference so we can safely just increase
@@ -809,18 +821,18 @@ static void quota_release_workfn(struct work_struct *work)
 	/* Exchange the list head to avoid livelock. */
 	list_replace_init(&releasing_dquots, &rls_head);
 	spin_unlock(&dq_list_lock);
+	synchronize_srcu(&dquot_srcu);
 
 restart:
-	synchronize_srcu(&dquot_srcu);
 	spin_lock(&dq_list_lock);
 	while (!list_empty(&rls_head)) {
 		dquot = list_first_entry(&rls_head, struct dquot, dq_free);
-		/* Dquot got used again? */
-		if (atomic_read(&dquot->dq_count) > 1) {
-			remove_free_dquot(dquot);
-			atomic_dec(&dquot->dq_count);
-			continue;
-		}
+		WARN_ON_ONCE(atomic_read(&dquot->dq_count));
+		/*
+		 * Note that DQ_RELEASING_B protects us from racing with
+		 * invalidate_dquots() calls so we are safe to work with the
+		 * dquot even after we drop dq_list_lock.
+		 */
 		if (dquot_dirty(dquot)) {
 			spin_unlock(&dq_list_lock);
 			/* Commit dquot before releasing */
@@ -834,7 +846,6 @@ restart:
 		}
 		/* Dquot is inactive and clean, now move it to free list */
 		remove_free_dquot(dquot);
-		atomic_dec(&dquot->dq_count);
 		put_dquot_last(dquot);
 	}
 	spin_unlock(&dq_list_lock);
@@ -875,6 +886,7 @@ void dqput(struct dquot *dquot)
 	BUG_ON(!list_empty(&dquot->dq_free));
 #endif
 	put_releasing_dquots(dquot);
+	atomic_dec(&dquot->dq_count);
 	spin_unlock(&dq_list_lock);
 	queue_delayed_work(system_unbound_wq, &quota_release_work, 1);
 }
@@ -963,7 +975,7 @@ we_slept:
 		dqstats_inc(DQST_LOOKUPS);
 	}
 	/* Wait for dq_lock - after this we know that either dquot_release() is
-	 * already finished or it will be canceled due to dq_count > 1 test */
+	 * already finished or it will be canceled due to dq_count > 0 test */
 	wait_on_dquot(dquot);
 	/* Read the dquot / allocate space in quota file */
 	if (!dquot_active(dquot)) {
diff --git a/include/linux/quota.h b/include/linux/quota.h
index fd692b4a41d5..07071e64abf3 100644
--- a/include/linux/quota.h
+++ b/include/linux/quota.h
@@ -285,7 +285,9 @@ static inline void dqstats_dec(unsigned int type)
 #define DQ_FAKE_B	3	/* no limits only usage */
 #define DQ_READ_B	4	/* dquot was read into memory */
 #define DQ_ACTIVE_B	5	/* dquot is active (dquot_release not called) */
-#define DQ_LASTSET_B	6	/* Following 6 bits (see QIF_) are reserved\
+#define DQ_RELEASING_B	6	/* dquot is in releasing_dquots list waiting
+				 * to be cleaned up */
+#define DQ_LASTSET_B	7	/* Following 6 bits (see QIF_) are reserved\
 				 * for the mask of entries set via SETQUOTA\
 				 * quotactl. They are set under dq_data_lock\
 				 * and the quota format handling dquot can\
diff --git a/include/linux/quotaops.h b/include/linux/quotaops.h
index 11a4becff3a9..4fa4ef0a173a 100644
--- a/include/linux/quotaops.h
+++ b/include/linux/quotaops.h
@@ -57,7 +57,7 @@ static inline bool dquot_is_busy(struct dquot *dquot)
 {
 	if (test_bit(DQ_MOD_B, &dquot->dq_flags))
 		return true;
-	if (atomic_read(&dquot->dq_count) > 1)
+	if (atomic_read(&dquot->dq_count) > 0)
 		return true;
 	return false;
 }
-- 
cgit v1.2.3


From 5a0b11a180a9b82b4437a4be1cf73530053f139b Mon Sep 17 00:00:00 2001
From: Vasant Hegde <vasant.hegde@amd.com>
Date: Fri, 6 Oct 2023 09:57:02 +0000
Subject: iommu/amd: Remove iommu_v2 module

AMD GPU driver which was the only in-kernel user of iommu_v2 module
removed dependency on iommu_v2 module.

Also we are working on adding SVA support in AMD IOMMU driver. Device
drivers are expected to use common SVA framework to enable device
PASID/PRI features.

Removing iommu_v2 module and then adding SVA simplifies the development.
Hence remove iommu_v2 module.

Cc: Alex Deucher <alexander.deucher@amd.com>
Cc: Joerg Roedel <joro@8bytes.org>
Cc: Felix Kuehling <Felix.Kuehling@amd.com>
Signed-off-by: Vasant Hegde <vasant.hegde@amd.com>
Reviewed-by: Jason Gunthorpe <jgg@nvidia.com>
Reviewed-by: Jerry Snitselaar <jsnitsel@redhat.com>
Tested-by: Alex Deucher <alexander.deucher@amd.com>
Link: https://lore.kernel.org/r/20231006095706.5694-2-vasant.hegde@amd.com
Signed-off-by: Joerg Roedel <jroedel@suse.de>
---
 drivers/iommu/amd/Kconfig     |   9 -
 drivers/iommu/amd/Makefile    |   1 -
 drivers/iommu/amd/amd_iommu.h |   5 -
 drivers/iommu/amd/iommu.c     |  40 --
 drivers/iommu/amd/iommu_v2.c  | 996 ------------------------------------------
 include/linux/amd-iommu.h     |  94 ----
 6 files changed, 1145 deletions(-)
 delete mode 100644 drivers/iommu/amd/iommu_v2.c

(limited to 'include/linux')

diff --git a/drivers/iommu/amd/Kconfig b/drivers/iommu/amd/Kconfig
index 9b5fc3356bf2..75132ae861a2 100644
--- a/drivers/iommu/amd/Kconfig
+++ b/drivers/iommu/amd/Kconfig
@@ -22,15 +22,6 @@ config AMD_IOMMU
 	  your BIOS for an option to enable it or if you have an IVRS ACPI
 	  table.
 
-config AMD_IOMMU_V2
-	tristate "AMD IOMMU Version 2 driver"
-	depends on AMD_IOMMU
-	select MMU_NOTIFIER
-	help
-	  This option enables support for the AMD IOMMUv2 features of the IOMMU
-	  hardware. Select this option if you want to use devices that support
-	  the PCI PRI and PASID interface.
-
 config AMD_IOMMU_DEBUGFS
 	bool "Enable AMD IOMMU internals in DebugFS"
 	depends on AMD_IOMMU && IOMMU_DEBUGFS
diff --git a/drivers/iommu/amd/Makefile b/drivers/iommu/amd/Makefile
index 773d8aa00283..f454fbb1569e 100644
--- a/drivers/iommu/amd/Makefile
+++ b/drivers/iommu/amd/Makefile
@@ -1,4 +1,3 @@
 # SPDX-License-Identifier: GPL-2.0-only
 obj-$(CONFIG_AMD_IOMMU) += iommu.o init.o quirks.o io_pgtable.o io_pgtable_v2.o
 obj-$(CONFIG_AMD_IOMMU_DEBUGFS) += debugfs.o
-obj-$(CONFIG_AMD_IOMMU_V2) += iommu_v2.o
diff --git a/drivers/iommu/amd/amd_iommu.h b/drivers/iommu/amd/amd_iommu.h
index 9df53961d5ef..5b8a1e2dd3d0 100644
--- a/drivers/iommu/amd/amd_iommu.h
+++ b/drivers/iommu/amd/amd_iommu.h
@@ -38,9 +38,6 @@ extern int amd_iommu_guest_ir;
 extern enum io_pgtable_fmt amd_iommu_pgtable;
 extern int amd_iommu_gpt_level;
 
-/* IOMMUv2 specific functions */
-struct iommu_domain;
-
 bool amd_iommu_v2_supported(void);
 struct amd_iommu *get_amd_iommu(unsigned int idx);
 u8 amd_iommu_pc_get_max_banks(unsigned int idx);
@@ -57,8 +54,6 @@ void amd_iommu_pdev_disable_cap_pri(struct pci_dev *pdev);
 
 int amd_iommu_register_ppr_notifier(struct notifier_block *nb);
 int amd_iommu_unregister_ppr_notifier(struct notifier_block *nb);
-void amd_iommu_domain_direct_map(struct iommu_domain *dom);
-int amd_iommu_domain_enable_v2(struct iommu_domain *dom, int pasids);
 int amd_iommu_flush_page(struct iommu_domain *dom, u32 pasid, u64 address);
 void amd_iommu_update_and_flush_device_table(struct protection_domain *domain);
 void amd_iommu_domain_update(struct protection_domain *domain);
diff --git a/drivers/iommu/amd/iommu.c b/drivers/iommu/amd/iommu.c
index f3448a2b6c0e..fd0d7b2f30dc 100644
--- a/drivers/iommu/amd/iommu.c
+++ b/drivers/iommu/amd/iommu.c
@@ -2568,46 +2568,6 @@ int amd_iommu_unregister_ppr_notifier(struct notifier_block *nb)
 }
 EXPORT_SYMBOL(amd_iommu_unregister_ppr_notifier);
 
-void amd_iommu_domain_direct_map(struct iommu_domain *dom)
-{
-	struct protection_domain *domain = to_pdomain(dom);
-	unsigned long flags;
-
-	spin_lock_irqsave(&domain->lock, flags);
-
-	if (domain->iop.pgtbl_cfg.tlb)
-		free_io_pgtable_ops(&domain->iop.iop.ops);
-
-	spin_unlock_irqrestore(&domain->lock, flags);
-}
-EXPORT_SYMBOL(amd_iommu_domain_direct_map);
-
-int amd_iommu_domain_enable_v2(struct iommu_domain *dom, int pasids)
-{
-	struct protection_domain *pdom = to_pdomain(dom);
-	unsigned long flags;
-	int ret;
-
-	spin_lock_irqsave(&pdom->lock, flags);
-
-	/*
-	 * Save us all sanity checks whether devices already in the
-	 * domain support IOMMUv2. Just force that the domain has no
-	 * devices attached when it is switched into IOMMUv2 mode.
-	 */
-	ret = -EBUSY;
-	if (pdom->dev_cnt > 0 || pdom->flags & PD_IOMMUV2_MASK)
-		goto out;
-
-	if (!pdom->gcr3_tbl)
-		ret = setup_gcr3_table(pdom, pasids);
-
-out:
-	spin_unlock_irqrestore(&pdom->lock, flags);
-	return ret;
-}
-EXPORT_SYMBOL(amd_iommu_domain_enable_v2);
-
 static int __flush_pasid(struct protection_domain *domain, u32 pasid,
 			 u64 address, bool size)
 {
diff --git a/drivers/iommu/amd/iommu_v2.c b/drivers/iommu/amd/iommu_v2.c
deleted file mode 100644
index 57c2fb1146e2..000000000000
--- a/drivers/iommu/amd/iommu_v2.c
+++ /dev/null
@@ -1,996 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * Copyright (C) 2010-2012 Advanced Micro Devices, Inc.
- * Author: Joerg Roedel <jroedel@suse.de>
- */
-
-#define pr_fmt(fmt)     "AMD-Vi: " fmt
-
-#include <linux/refcount.h>
-#include <linux/mmu_notifier.h>
-#include <linux/amd-iommu.h>
-#include <linux/mm_types.h>
-#include <linux/profile.h>
-#include <linux/module.h>
-#include <linux/sched.h>
-#include <linux/sched/mm.h>
-#include <linux/wait.h>
-#include <linux/pci.h>
-#include <linux/gfp.h>
-#include <linux/cc_platform.h>
-
-#include "amd_iommu.h"
-
-MODULE_LICENSE("GPL v2");
-MODULE_AUTHOR("Joerg Roedel <jroedel@suse.de>");
-
-#define PRI_QUEUE_SIZE		512
-
-struct pri_queue {
-	atomic_t inflight;
-	bool finish;
-	int status;
-};
-
-struct pasid_state {
-	struct list_head list;			/* For global state-list */
-	refcount_t count;				/* Reference count */
-	unsigned mmu_notifier_count;		/* Counting nested mmu_notifier
-						   calls */
-	struct mm_struct *mm;			/* mm_struct for the faults */
-	struct mmu_notifier mn;                 /* mmu_notifier handle */
-	struct pri_queue pri[PRI_QUEUE_SIZE];	/* PRI tag states */
-	struct device_state *device_state;	/* Link to our device_state */
-	u32 pasid;				/* PASID index */
-	bool invalid;				/* Used during setup and
-						   teardown of the pasid */
-	spinlock_t lock;			/* Protect pri_queues and
-						   mmu_notifer_count */
-	wait_queue_head_t wq;			/* To wait for count == 0 */
-};
-
-struct device_state {
-	struct list_head list;
-	u32 sbdf;
-	atomic_t count;
-	struct pci_dev *pdev;
-	struct pasid_state **states;
-	struct iommu_domain *domain;
-	int pasid_levels;
-	int max_pasids;
-	amd_iommu_invalid_ppr_cb inv_ppr_cb;
-	amd_iommu_invalidate_ctx inv_ctx_cb;
-	spinlock_t lock;
-	wait_queue_head_t wq;
-};
-
-struct fault {
-	struct work_struct work;
-	struct device_state *dev_state;
-	struct pasid_state *state;
-	struct mm_struct *mm;
-	u64 address;
-	u32 pasid;
-	u16 tag;
-	u16 finish;
-	u16 flags;
-};
-
-static LIST_HEAD(state_list);
-static DEFINE_SPINLOCK(state_lock);
-
-static struct workqueue_struct *iommu_wq;
-
-static void free_pasid_states(struct device_state *dev_state);
-
-static struct device_state *__get_device_state(u32 sbdf)
-{
-	struct device_state *dev_state;
-
-	list_for_each_entry(dev_state, &state_list, list) {
-		if (dev_state->sbdf == sbdf)
-			return dev_state;
-	}
-
-	return NULL;
-}
-
-static struct device_state *get_device_state(u32 sbdf)
-{
-	struct device_state *dev_state;
-	unsigned long flags;
-
-	spin_lock_irqsave(&state_lock, flags);
-	dev_state = __get_device_state(sbdf);
-	if (dev_state != NULL)
-		atomic_inc(&dev_state->count);
-	spin_unlock_irqrestore(&state_lock, flags);
-
-	return dev_state;
-}
-
-static void free_device_state(struct device_state *dev_state)
-{
-	struct iommu_group *group;
-
-	/* Get rid of any remaining pasid states */
-	free_pasid_states(dev_state);
-
-	/*
-	 * Wait until the last reference is dropped before freeing
-	 * the device state.
-	 */
-	wait_event(dev_state->wq, !atomic_read(&dev_state->count));
-
-	/*
-	 * First detach device from domain - No more PRI requests will arrive
-	 * from that device after it is unbound from the IOMMUv2 domain.
-	 */
-	group = iommu_group_get(&dev_state->pdev->dev);
-	if (WARN_ON(!group))
-		return;
-
-	iommu_detach_group(dev_state->domain, group);
-
-	iommu_group_put(group);
-
-	/* Everything is down now, free the IOMMUv2 domain */
-	iommu_domain_free(dev_state->domain);
-
-	/* Finally get rid of the device-state */
-	kfree(dev_state);
-}
-
-static void put_device_state(struct device_state *dev_state)
-{
-	if (atomic_dec_and_test(&dev_state->count))
-		wake_up(&dev_state->wq);
-}
-
-/* Must be called under dev_state->lock */
-static struct pasid_state **__get_pasid_state_ptr(struct device_state *dev_state,
-						  u32 pasid, bool alloc)
-{
-	struct pasid_state **root, **ptr;
-	int level, index;
-
-	level = dev_state->pasid_levels;
-	root  = dev_state->states;
-
-	while (true) {
-
-		index = (pasid >> (9 * level)) & 0x1ff;
-		ptr   = &root[index];
-
-		if (level == 0)
-			break;
-
-		if (*ptr == NULL) {
-			if (!alloc)
-				return NULL;
-
-			*ptr = (void *)get_zeroed_page(GFP_ATOMIC);
-			if (*ptr == NULL)
-				return NULL;
-		}
-
-		root   = (struct pasid_state **)*ptr;
-		level -= 1;
-	}
-
-	return ptr;
-}
-
-static int set_pasid_state(struct device_state *dev_state,
-			   struct pasid_state *pasid_state,
-			   u32 pasid)
-{
-	struct pasid_state **ptr;
-	unsigned long flags;
-	int ret;
-
-	spin_lock_irqsave(&dev_state->lock, flags);
-	ptr = __get_pasid_state_ptr(dev_state, pasid, true);
-
-	ret = -ENOMEM;
-	if (ptr == NULL)
-		goto out_unlock;
-
-	ret = -ENOMEM;
-	if (*ptr != NULL)
-		goto out_unlock;
-
-	*ptr = pasid_state;
-
-	ret = 0;
-
-out_unlock:
-	spin_unlock_irqrestore(&dev_state->lock, flags);
-
-	return ret;
-}
-
-static void clear_pasid_state(struct device_state *dev_state, u32 pasid)
-{
-	struct pasid_state **ptr;
-	unsigned long flags;
-
-	spin_lock_irqsave(&dev_state->lock, flags);
-	ptr = __get_pasid_state_ptr(dev_state, pasid, true);
-
-	if (ptr == NULL)
-		goto out_unlock;
-
-	*ptr = NULL;
-
-out_unlock:
-	spin_unlock_irqrestore(&dev_state->lock, flags);
-}
-
-static struct pasid_state *get_pasid_state(struct device_state *dev_state,
-					   u32 pasid)
-{
-	struct pasid_state **ptr, *ret = NULL;
-	unsigned long flags;
-
-	spin_lock_irqsave(&dev_state->lock, flags);
-	ptr = __get_pasid_state_ptr(dev_state, pasid, false);
-
-	if (ptr == NULL)
-		goto out_unlock;
-
-	ret = *ptr;
-	if (ret)
-		refcount_inc(&ret->count);
-
-out_unlock:
-	spin_unlock_irqrestore(&dev_state->lock, flags);
-
-	return ret;
-}
-
-static void free_pasid_state(struct pasid_state *pasid_state)
-{
-	kfree(pasid_state);
-}
-
-static void put_pasid_state(struct pasid_state *pasid_state)
-{
-	if (refcount_dec_and_test(&pasid_state->count))
-		wake_up(&pasid_state->wq);
-}
-
-static void put_pasid_state_wait(struct pasid_state *pasid_state)
-{
-	if (!refcount_dec_and_test(&pasid_state->count))
-		wait_event(pasid_state->wq, !refcount_read(&pasid_state->count));
-	free_pasid_state(pasid_state);
-}
-
-static void unbind_pasid(struct pasid_state *pasid_state)
-{
-	struct iommu_domain *domain;
-
-	domain = pasid_state->device_state->domain;
-
-	/*
-	 * Mark pasid_state as invalid, no more faults will we added to the
-	 * work queue after this is visible everywhere.
-	 */
-	pasid_state->invalid = true;
-
-	/* Make sure this is visible */
-	smp_wmb();
-
-	/* After this the device/pasid can't access the mm anymore */
-	amd_iommu_domain_clear_gcr3(domain, pasid_state->pasid);
-
-	/* Make sure no more pending faults are in the queue */
-	flush_workqueue(iommu_wq);
-}
-
-static void free_pasid_states_level1(struct pasid_state **tbl)
-{
-	int i;
-
-	for (i = 0; i < 512; ++i) {
-		if (tbl[i] == NULL)
-			continue;
-
-		free_page((unsigned long)tbl[i]);
-	}
-}
-
-static void free_pasid_states_level2(struct pasid_state **tbl)
-{
-	struct pasid_state **ptr;
-	int i;
-
-	for (i = 0; i < 512; ++i) {
-		if (tbl[i] == NULL)
-			continue;
-
-		ptr = (struct pasid_state **)tbl[i];
-		free_pasid_states_level1(ptr);
-	}
-}
-
-static void free_pasid_states(struct device_state *dev_state)
-{
-	struct pasid_state *pasid_state;
-	int i;
-
-	for (i = 0; i < dev_state->max_pasids; ++i) {
-		pasid_state = get_pasid_state(dev_state, i);
-		if (pasid_state == NULL)
-			continue;
-
-		put_pasid_state(pasid_state);
-
-		/* Clear the pasid state so that the pasid can be re-used */
-		clear_pasid_state(dev_state, pasid_state->pasid);
-
-		/*
-		 * This will call the mn_release function and
-		 * unbind the PASID
-		 */
-		mmu_notifier_unregister(&pasid_state->mn, pasid_state->mm);
-
-		put_pasid_state_wait(pasid_state); /* Reference taken in
-						      amd_iommu_bind_pasid */
-
-		/* Drop reference taken in amd_iommu_bind_pasid */
-		put_device_state(dev_state);
-	}
-
-	if (dev_state->pasid_levels == 2)
-		free_pasid_states_level2(dev_state->states);
-	else if (dev_state->pasid_levels == 1)
-		free_pasid_states_level1(dev_state->states);
-	else
-		BUG_ON(dev_state->pasid_levels != 0);
-
-	free_page((unsigned long)dev_state->states);
-}
-
-static struct pasid_state *mn_to_state(struct mmu_notifier *mn)
-{
-	return container_of(mn, struct pasid_state, mn);
-}
-
-static void mn_arch_invalidate_secondary_tlbs(struct mmu_notifier *mn,
-					struct mm_struct *mm,
-					unsigned long start, unsigned long end)
-{
-	struct pasid_state *pasid_state;
-	struct device_state *dev_state;
-
-	pasid_state = mn_to_state(mn);
-	dev_state   = pasid_state->device_state;
-
-	if ((start ^ (end - 1)) < PAGE_SIZE)
-		amd_iommu_flush_page(dev_state->domain, pasid_state->pasid,
-				     start);
-	else
-		amd_iommu_flush_tlb(dev_state->domain, pasid_state->pasid);
-}
-
-static void mn_release(struct mmu_notifier *mn, struct mm_struct *mm)
-{
-	struct pasid_state *pasid_state;
-	struct device_state *dev_state;
-	bool run_inv_ctx_cb;
-
-	might_sleep();
-
-	pasid_state    = mn_to_state(mn);
-	dev_state      = pasid_state->device_state;
-	run_inv_ctx_cb = !pasid_state->invalid;
-
-	if (run_inv_ctx_cb && dev_state->inv_ctx_cb)
-		dev_state->inv_ctx_cb(dev_state->pdev, pasid_state->pasid);
-
-	unbind_pasid(pasid_state);
-}
-
-static const struct mmu_notifier_ops iommu_mn = {
-	.release			= mn_release,
-	.arch_invalidate_secondary_tlbs	= mn_arch_invalidate_secondary_tlbs,
-};
-
-static void set_pri_tag_status(struct pasid_state *pasid_state,
-			       u16 tag, int status)
-{
-	unsigned long flags;
-
-	spin_lock_irqsave(&pasid_state->lock, flags);
-	pasid_state->pri[tag].status = status;
-	spin_unlock_irqrestore(&pasid_state->lock, flags);
-}
-
-static void finish_pri_tag(struct device_state *dev_state,
-			   struct pasid_state *pasid_state,
-			   u16 tag)
-{
-	unsigned long flags;
-
-	spin_lock_irqsave(&pasid_state->lock, flags);
-	if (atomic_dec_and_test(&pasid_state->pri[tag].inflight) &&
-	    pasid_state->pri[tag].finish) {
-		amd_iommu_complete_ppr(dev_state->pdev, pasid_state->pasid,
-				       pasid_state->pri[tag].status, tag);
-		pasid_state->pri[tag].finish = false;
-		pasid_state->pri[tag].status = PPR_SUCCESS;
-	}
-	spin_unlock_irqrestore(&pasid_state->lock, flags);
-}
-
-static void handle_fault_error(struct fault *fault)
-{
-	int status;
-
-	if (!fault->dev_state->inv_ppr_cb) {
-		set_pri_tag_status(fault->state, fault->tag, PPR_INVALID);
-		return;
-	}
-
-	status = fault->dev_state->inv_ppr_cb(fault->dev_state->pdev,
-					      fault->pasid,
-					      fault->address,
-					      fault->flags);
-	switch (status) {
-	case AMD_IOMMU_INV_PRI_RSP_SUCCESS:
-		set_pri_tag_status(fault->state, fault->tag, PPR_SUCCESS);
-		break;
-	case AMD_IOMMU_INV_PRI_RSP_INVALID:
-		set_pri_tag_status(fault->state, fault->tag, PPR_INVALID);
-		break;
-	case AMD_IOMMU_INV_PRI_RSP_FAIL:
-		set_pri_tag_status(fault->state, fault->tag, PPR_FAILURE);
-		break;
-	default:
-		BUG();
-	}
-}
-
-static bool access_error(struct vm_area_struct *vma, struct fault *fault)
-{
-	unsigned long requested = 0;
-
-	if (fault->flags & PPR_FAULT_EXEC)
-		requested |= VM_EXEC;
-
-	if (fault->flags & PPR_FAULT_READ)
-		requested |= VM_READ;
-
-	if (fault->flags & PPR_FAULT_WRITE)
-		requested |= VM_WRITE;
-
-	return (requested & ~vma->vm_flags) != 0;
-}
-
-static void do_fault(struct work_struct *work)
-{
-	struct fault *fault = container_of(work, struct fault, work);
-	struct vm_area_struct *vma;
-	vm_fault_t ret = VM_FAULT_ERROR;
-	unsigned int flags = 0;
-	struct mm_struct *mm;
-	u64 address;
-
-	mm = fault->state->mm;
-	address = fault->address;
-
-	if (fault->flags & PPR_FAULT_USER)
-		flags |= FAULT_FLAG_USER;
-	if (fault->flags & PPR_FAULT_WRITE)
-		flags |= FAULT_FLAG_WRITE;
-	flags |= FAULT_FLAG_REMOTE;
-
-	mmap_read_lock(mm);
-	vma = vma_lookup(mm, address);
-	if (!vma)
-		/* failed to get a vma in the right range */
-		goto out;
-
-	/* Check if we have the right permissions on the vma */
-	if (access_error(vma, fault))
-		goto out;
-
-	ret = handle_mm_fault(vma, address, flags, NULL);
-out:
-	mmap_read_unlock(mm);
-
-	if (ret & VM_FAULT_ERROR)
-		/* failed to service fault */
-		handle_fault_error(fault);
-
-	finish_pri_tag(fault->dev_state, fault->state, fault->tag);
-
-	put_pasid_state(fault->state);
-
-	kfree(fault);
-}
-
-static int ppr_notifier(struct notifier_block *nb, unsigned long e, void *data)
-{
-	struct amd_iommu_fault *iommu_fault;
-	struct pasid_state *pasid_state;
-	struct device_state *dev_state;
-	struct pci_dev *pdev = NULL;
-	unsigned long flags;
-	struct fault *fault;
-	bool finish;
-	u16 tag, devid, seg_id;
-	int ret;
-
-	iommu_fault = data;
-	tag         = iommu_fault->tag & 0x1ff;
-	finish      = (iommu_fault->tag >> 9) & 1;
-
-	seg_id = PCI_SBDF_TO_SEGID(iommu_fault->sbdf);
-	devid = PCI_SBDF_TO_DEVID(iommu_fault->sbdf);
-	pdev = pci_get_domain_bus_and_slot(seg_id, PCI_BUS_NUM(devid),
-					   devid & 0xff);
-	if (!pdev)
-		return -ENODEV;
-
-	ret = NOTIFY_DONE;
-
-	/* In kdump kernel pci dev is not initialized yet -> send INVALID */
-	if (amd_iommu_is_attach_deferred(&pdev->dev)) {
-		amd_iommu_complete_ppr(pdev, iommu_fault->pasid,
-				       PPR_INVALID, tag);
-		goto out;
-	}
-
-	dev_state = get_device_state(iommu_fault->sbdf);
-	if (dev_state == NULL)
-		goto out;
-
-	pasid_state = get_pasid_state(dev_state, iommu_fault->pasid);
-	if (pasid_state == NULL || pasid_state->invalid) {
-		/* We know the device but not the PASID -> send INVALID */
-		amd_iommu_complete_ppr(dev_state->pdev, iommu_fault->pasid,
-				       PPR_INVALID, tag);
-		goto out_drop_state;
-	}
-
-	spin_lock_irqsave(&pasid_state->lock, flags);
-	atomic_inc(&pasid_state->pri[tag].inflight);
-	if (finish)
-		pasid_state->pri[tag].finish = true;
-	spin_unlock_irqrestore(&pasid_state->lock, flags);
-
-	fault = kzalloc(sizeof(*fault), GFP_ATOMIC);
-	if (fault == NULL) {
-		/* We are OOM - send success and let the device re-fault */
-		finish_pri_tag(dev_state, pasid_state, tag);
-		goto out_drop_state;
-	}
-
-	fault->dev_state = dev_state;
-	fault->address   = iommu_fault->address;
-	fault->state     = pasid_state;
-	fault->tag       = tag;
-	fault->finish    = finish;
-	fault->pasid     = iommu_fault->pasid;
-	fault->flags     = iommu_fault->flags;
-	INIT_WORK(&fault->work, do_fault);
-
-	queue_work(iommu_wq, &fault->work);
-
-	ret = NOTIFY_OK;
-
-out_drop_state:
-
-	if (ret != NOTIFY_OK && pasid_state)
-		put_pasid_state(pasid_state);
-
-	put_device_state(dev_state);
-
-out:
-	pci_dev_put(pdev);
-	return ret;
-}
-
-static struct notifier_block ppr_nb = {
-	.notifier_call = ppr_notifier,
-};
-
-int amd_iommu_bind_pasid(struct pci_dev *pdev, u32 pasid,
-			 struct task_struct *task)
-{
-	struct pasid_state *pasid_state;
-	struct device_state *dev_state;
-	struct mm_struct *mm;
-	u32 sbdf;
-	int ret;
-
-	might_sleep();
-
-	if (!amd_iommu_v2_supported())
-		return -ENODEV;
-
-	sbdf      = get_pci_sbdf_id(pdev);
-	dev_state = get_device_state(sbdf);
-
-	if (dev_state == NULL)
-		return -EINVAL;
-
-	ret = -EINVAL;
-	if (pasid >= dev_state->max_pasids)
-		goto out;
-
-	ret = -ENOMEM;
-	pasid_state = kzalloc(sizeof(*pasid_state), GFP_KERNEL);
-	if (pasid_state == NULL)
-		goto out;
-
-
-	refcount_set(&pasid_state->count, 1);
-	init_waitqueue_head(&pasid_state->wq);
-	spin_lock_init(&pasid_state->lock);
-
-	mm                        = get_task_mm(task);
-	pasid_state->mm           = mm;
-	pasid_state->device_state = dev_state;
-	pasid_state->pasid        = pasid;
-	pasid_state->invalid      = true; /* Mark as valid only if we are
-					     done with setting up the pasid */
-	pasid_state->mn.ops       = &iommu_mn;
-
-	if (pasid_state->mm == NULL)
-		goto out_free;
-
-	ret = mmu_notifier_register(&pasid_state->mn, mm);
-	if (ret)
-		goto out_free;
-
-	ret = set_pasid_state(dev_state, pasid_state, pasid);
-	if (ret)
-		goto out_unregister;
-
-	ret = amd_iommu_domain_set_gcr3(dev_state->domain, pasid,
-					__pa(pasid_state->mm->pgd));
-	if (ret)
-		goto out_clear_state;
-
-	/* Now we are ready to handle faults */
-	pasid_state->invalid = false;
-
-	/*
-	 * Drop the reference to the mm_struct here. We rely on the
-	 * mmu_notifier release call-back to inform us when the mm
-	 * is going away.
-	 */
-	mmput(mm);
-
-	return 0;
-
-out_clear_state:
-	clear_pasid_state(dev_state, pasid);
-
-out_unregister:
-	mmu_notifier_unregister(&pasid_state->mn, mm);
-	mmput(mm);
-
-out_free:
-	free_pasid_state(pasid_state);
-
-out:
-	put_device_state(dev_state);
-
-	return ret;
-}
-EXPORT_SYMBOL(amd_iommu_bind_pasid);
-
-void amd_iommu_unbind_pasid(struct pci_dev *pdev, u32 pasid)
-{
-	struct pasid_state *pasid_state;
-	struct device_state *dev_state;
-	u32 sbdf;
-
-	might_sleep();
-
-	if (!amd_iommu_v2_supported())
-		return;
-
-	sbdf = get_pci_sbdf_id(pdev);
-	dev_state = get_device_state(sbdf);
-	if (dev_state == NULL)
-		return;
-
-	if (pasid >= dev_state->max_pasids)
-		goto out;
-
-	pasid_state = get_pasid_state(dev_state, pasid);
-	if (pasid_state == NULL)
-		goto out;
-	/*
-	 * Drop reference taken here. We are safe because we still hold
-	 * the reference taken in the amd_iommu_bind_pasid function.
-	 */
-	put_pasid_state(pasid_state);
-
-	/* Clear the pasid state so that the pasid can be re-used */
-	clear_pasid_state(dev_state, pasid_state->pasid);
-
-	/*
-	 * Call mmu_notifier_unregister to drop our reference
-	 * to pasid_state->mm
-	 */
-	mmu_notifier_unregister(&pasid_state->mn, pasid_state->mm);
-
-	put_pasid_state_wait(pasid_state); /* Reference taken in
-					      amd_iommu_bind_pasid */
-out:
-	/* Drop reference taken in this function */
-	put_device_state(dev_state);
-
-	/* Drop reference taken in amd_iommu_bind_pasid */
-	put_device_state(dev_state);
-}
-EXPORT_SYMBOL(amd_iommu_unbind_pasid);
-
-int amd_iommu_init_device(struct pci_dev *pdev, int pasids)
-{
-	struct device_state *dev_state;
-	struct iommu_group *group;
-	unsigned long flags;
-	int ret, tmp;
-	u32 sbdf;
-
-	might_sleep();
-
-	/*
-	 * When memory encryption is active the device is likely not in a
-	 * direct-mapped domain. Forbid using IOMMUv2 functionality for now.
-	 */
-	if (cc_platform_has(CC_ATTR_MEM_ENCRYPT))
-		return -ENODEV;
-
-	if (!amd_iommu_v2_supported())
-		return -ENODEV;
-
-	if (pasids <= 0 || pasids > (PASID_MASK + 1))
-		return -EINVAL;
-
-	sbdf = get_pci_sbdf_id(pdev);
-
-	dev_state = kzalloc(sizeof(*dev_state), GFP_KERNEL);
-	if (dev_state == NULL)
-		return -ENOMEM;
-
-	spin_lock_init(&dev_state->lock);
-	init_waitqueue_head(&dev_state->wq);
-	dev_state->pdev  = pdev;
-	dev_state->sbdf = sbdf;
-
-	tmp = pasids;
-	for (dev_state->pasid_levels = 0; (tmp - 1) & ~0x1ff; tmp >>= 9)
-		dev_state->pasid_levels += 1;
-
-	atomic_set(&dev_state->count, 1);
-	dev_state->max_pasids = pasids;
-
-	ret = -ENOMEM;
-	dev_state->states = (void *)get_zeroed_page(GFP_KERNEL);
-	if (dev_state->states == NULL)
-		goto out_free_dev_state;
-
-	dev_state->domain = iommu_domain_alloc(&pci_bus_type);
-	if (dev_state->domain == NULL)
-		goto out_free_states;
-
-	/* See iommu_is_default_domain() */
-	dev_state->domain->type = IOMMU_DOMAIN_IDENTITY;
-	amd_iommu_domain_direct_map(dev_state->domain);
-
-	ret = amd_iommu_domain_enable_v2(dev_state->domain, pasids);
-	if (ret)
-		goto out_free_domain;
-
-	group = iommu_group_get(&pdev->dev);
-	if (!group) {
-		ret = -EINVAL;
-		goto out_free_domain;
-	}
-
-	ret = iommu_attach_group(dev_state->domain, group);
-	if (ret != 0)
-		goto out_drop_group;
-
-	iommu_group_put(group);
-
-	spin_lock_irqsave(&state_lock, flags);
-
-	if (__get_device_state(sbdf) != NULL) {
-		spin_unlock_irqrestore(&state_lock, flags);
-		ret = -EBUSY;
-		goto out_free_domain;
-	}
-
-	list_add_tail(&dev_state->list, &state_list);
-
-	spin_unlock_irqrestore(&state_lock, flags);
-
-	return 0;
-
-out_drop_group:
-	iommu_group_put(group);
-
-out_free_domain:
-	iommu_domain_free(dev_state->domain);
-
-out_free_states:
-	free_page((unsigned long)dev_state->states);
-
-out_free_dev_state:
-	kfree(dev_state);
-
-	return ret;
-}
-EXPORT_SYMBOL(amd_iommu_init_device);
-
-void amd_iommu_free_device(struct pci_dev *pdev)
-{
-	struct device_state *dev_state;
-	unsigned long flags;
-	u32 sbdf;
-
-	if (!amd_iommu_v2_supported())
-		return;
-
-	sbdf = get_pci_sbdf_id(pdev);
-
-	spin_lock_irqsave(&state_lock, flags);
-
-	dev_state = __get_device_state(sbdf);
-	if (dev_state == NULL) {
-		spin_unlock_irqrestore(&state_lock, flags);
-		return;
-	}
-
-	list_del(&dev_state->list);
-
-	spin_unlock_irqrestore(&state_lock, flags);
-
-	put_device_state(dev_state);
-	free_device_state(dev_state);
-}
-EXPORT_SYMBOL(amd_iommu_free_device);
-
-int amd_iommu_set_invalid_ppr_cb(struct pci_dev *pdev,
-				 amd_iommu_invalid_ppr_cb cb)
-{
-	struct device_state *dev_state;
-	unsigned long flags;
-	u32 sbdf;
-	int ret;
-
-	if (!amd_iommu_v2_supported())
-		return -ENODEV;
-
-	sbdf = get_pci_sbdf_id(pdev);
-
-	spin_lock_irqsave(&state_lock, flags);
-
-	ret = -EINVAL;
-	dev_state = __get_device_state(sbdf);
-	if (dev_state == NULL)
-		goto out_unlock;
-
-	dev_state->inv_ppr_cb = cb;
-
-	ret = 0;
-
-out_unlock:
-	spin_unlock_irqrestore(&state_lock, flags);
-
-	return ret;
-}
-EXPORT_SYMBOL(amd_iommu_set_invalid_ppr_cb);
-
-int amd_iommu_set_invalidate_ctx_cb(struct pci_dev *pdev,
-				    amd_iommu_invalidate_ctx cb)
-{
-	struct device_state *dev_state;
-	unsigned long flags;
-	u32 sbdf;
-	int ret;
-
-	if (!amd_iommu_v2_supported())
-		return -ENODEV;
-
-	sbdf = get_pci_sbdf_id(pdev);
-
-	spin_lock_irqsave(&state_lock, flags);
-
-	ret = -EINVAL;
-	dev_state = __get_device_state(sbdf);
-	if (dev_state == NULL)
-		goto out_unlock;
-
-	dev_state->inv_ctx_cb = cb;
-
-	ret = 0;
-
-out_unlock:
-	spin_unlock_irqrestore(&state_lock, flags);
-
-	return ret;
-}
-EXPORT_SYMBOL(amd_iommu_set_invalidate_ctx_cb);
-
-static int __init amd_iommu_v2_init(void)
-{
-	int ret;
-
-	if (!amd_iommu_v2_supported()) {
-		pr_info("AMD IOMMUv2 functionality not available on this system - This is not a bug.\n");
-		/*
-		 * Load anyway to provide the symbols to other modules
-		 * which may use AMD IOMMUv2 optionally.
-		 */
-		return 0;
-	}
-
-	ret = -ENOMEM;
-	iommu_wq = alloc_workqueue("amd_iommu_v2", WQ_MEM_RECLAIM, 0);
-	if (iommu_wq == NULL)
-		goto out;
-
-	amd_iommu_register_ppr_notifier(&ppr_nb);
-
-	pr_info("AMD IOMMUv2 loaded and initialized\n");
-
-	return 0;
-
-out:
-	return ret;
-}
-
-static void __exit amd_iommu_v2_exit(void)
-{
-	struct device_state *dev_state, *next;
-	unsigned long flags;
-	LIST_HEAD(freelist);
-
-	if (!amd_iommu_v2_supported())
-		return;
-
-	amd_iommu_unregister_ppr_notifier(&ppr_nb);
-
-	flush_workqueue(iommu_wq);
-
-	/*
-	 * The loop below might call flush_workqueue(), so call
-	 * destroy_workqueue() after it
-	 */
-	spin_lock_irqsave(&state_lock, flags);
-
-	list_for_each_entry_safe(dev_state, next, &state_list, list) {
-		WARN_ON_ONCE(1);
-
-		put_device_state(dev_state);
-		list_del(&dev_state->list);
-		list_add_tail(&dev_state->list, &freelist);
-	}
-
-	spin_unlock_irqrestore(&state_lock, flags);
-
-	/*
-	 * Since free_device_state waits on the count to be zero,
-	 * we need to free dev_state outside the spinlock.
-	 */
-	list_for_each_entry_safe(dev_state, next, &freelist, list) {
-		list_del(&dev_state->list);
-		free_device_state(dev_state);
-	}
-
-	destroy_workqueue(iommu_wq);
-}
-
-module_init(amd_iommu_v2_init);
-module_exit(amd_iommu_v2_exit);
diff --git a/include/linux/amd-iommu.h b/include/linux/amd-iommu.h
index 99a5201d9e62..35ca00b09384 100644
--- a/include/linux/amd-iommu.h
+++ b/include/linux/amd-iommu.h
@@ -33,84 +33,6 @@ struct pci_dev;
 
 extern int amd_iommu_detect(void);
 
-/**
- * amd_iommu_init_device() - Init device for use with IOMMUv2 driver
- * @pdev: The PCI device to initialize
- * @pasids: Number of PASIDs to support for this device
- *
- * This function does all setup for the device pdev so that it can be
- * used with IOMMUv2.
- * Returns 0 on success or negative value on error.
- */
-extern int amd_iommu_init_device(struct pci_dev *pdev, int pasids);
-
-/**
- * amd_iommu_free_device() - Free all IOMMUv2 related device resources
- *			     and disable IOMMUv2 usage for this device
- * @pdev: The PCI device to disable IOMMUv2 usage for'
- */
-extern void amd_iommu_free_device(struct pci_dev *pdev);
-
-/**
- * amd_iommu_bind_pasid() - Bind a given task to a PASID on a device
- * @pdev: The PCI device to bind the task to
- * @pasid: The PASID on the device the task should be bound to
- * @task: the task to bind
- *
- * The function returns 0 on success or a negative value on error.
- */
-extern int amd_iommu_bind_pasid(struct pci_dev *pdev, u32 pasid,
-				struct task_struct *task);
-
-/**
- * amd_iommu_unbind_pasid() - Unbind a PASID from its task on
- *			      a device
- * @pdev: The device of the PASID
- * @pasid: The PASID to unbind
- *
- * When this function returns the device is no longer using the PASID
- * and the PASID is no longer bound to its task.
- */
-extern void amd_iommu_unbind_pasid(struct pci_dev *pdev, u32 pasid);
-
-/**
- * amd_iommu_set_invalid_ppr_cb() - Register a call-back for failed
- *				    PRI requests
- * @pdev: The PCI device the call-back should be registered for
- * @cb: The call-back function
- *
- * The IOMMUv2 driver invokes this call-back when it is unable to
- * successfully handle a PRI request. The device driver can then decide
- * which PRI response the device should see. Possible return values for
- * the call-back are:
- *
- * - AMD_IOMMU_INV_PRI_RSP_SUCCESS - Send SUCCESS back to the device
- * - AMD_IOMMU_INV_PRI_RSP_INVALID - Send INVALID back to the device
- * - AMD_IOMMU_INV_PRI_RSP_FAIL    - Send Failure back to the device,
- *				     the device is required to disable
- *				     PRI when it receives this response
- *
- * The function returns 0 on success or negative value on error.
- */
-#define AMD_IOMMU_INV_PRI_RSP_SUCCESS	0
-#define AMD_IOMMU_INV_PRI_RSP_INVALID	1
-#define AMD_IOMMU_INV_PRI_RSP_FAIL	2
-
-typedef int (*amd_iommu_invalid_ppr_cb)(struct pci_dev *pdev,
-					u32 pasid,
-					unsigned long address,
-					u16);
-
-extern int amd_iommu_set_invalid_ppr_cb(struct pci_dev *pdev,
-					amd_iommu_invalid_ppr_cb cb);
-
-#define PPR_FAULT_EXEC	(1 << 1)
-#define PPR_FAULT_READ  (1 << 2)
-#define PPR_FAULT_WRITE (1 << 5)
-#define PPR_FAULT_USER  (1 << 6)
-#define PPR_FAULT_RSVD  (1 << 7)
-#define PPR_FAULT_GN    (1 << 8)
-
 /**
  * amd_iommu_device_info() - Get information about IOMMUv2 support of a
  *			     PCI device
@@ -137,22 +59,6 @@ struct amd_iommu_device_info {
 extern int amd_iommu_device_info(struct pci_dev *pdev,
 				 struct amd_iommu_device_info *info);
 
-/**
- * amd_iommu_set_invalidate_ctx_cb() - Register a call-back for invalidating
- *				       a pasid context. This call-back is
- *				       invoked when the IOMMUv2 driver needs to
- *				       invalidate a PASID context, for example
- *				       because the task that is bound to that
- *				       context is about to exit.
- *
- * @pdev: The PCI device the call-back should be registered for
- * @cb: The call-back function
- */
-
-typedef void (*amd_iommu_invalidate_ctx)(struct pci_dev *pdev, u32 pasid);
-
-extern int amd_iommu_set_invalidate_ctx_cb(struct pci_dev *pdev,
-					   amd_iommu_invalidate_ctx cb);
 #else /* CONFIG_AMD_IOMMU */
 
 static inline int amd_iommu_detect(void) { return -ENODEV; }
-- 
cgit v1.2.3


From 37b282fa04dd7b9bfa7c290fe07ef1730444debb Mon Sep 17 00:00:00 2001
From: Vasant Hegde <vasant.hegde@amd.com>
Date: Fri, 6 Oct 2023 09:57:04 +0000
Subject: iommu/amd: Remove amd_iommu_device_info()

No one is using this function. Hence remove it. Also move PCI device
feature detection flags to amd_iommu_types.h as its only used inside
AMD IOMMU driver.

Signed-off-by: Vasant Hegde <vasant.hegde@amd.com>
Reviewed-by: Jason Gunthorpe <jgg@nvidia.com>
Reviewed-by: Jerry Snitselaar <jsnitsel@redhat.com>
Tested-by: Alex Deucher <alexander.deucher@amd.com>
Link: https://lore.kernel.org/r/20231006095706.5694-4-vasant.hegde@amd.com
Signed-off-by: Joerg Roedel <jroedel@suse.de>
---
 drivers/iommu/amd/amd_iommu_types.h |  8 +++++++
 drivers/iommu/amd/iommu.c           | 42 -------------------------------------
 include/linux/amd-iommu.h           | 26 -----------------------
 3 files changed, 8 insertions(+), 68 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/iommu/amd/amd_iommu_types.h b/drivers/iommu/amd/amd_iommu_types.h
index 25b731f3d984..e742006f2885 100644
--- a/drivers/iommu/amd/amd_iommu_types.h
+++ b/drivers/iommu/amd/amd_iommu_types.h
@@ -786,6 +786,14 @@ struct devid_map {
 	bool cmd_line;
 };
 
+#define AMD_IOMMU_DEVICE_FLAG_ATS_SUP     0x1    /* ATS feature supported */
+#define AMD_IOMMU_DEVICE_FLAG_PRI_SUP     0x2    /* PRI feature supported */
+#define AMD_IOMMU_DEVICE_FLAG_PASID_SUP   0x4    /* PASID context supported */
+/* Device may request execution on memory pages */
+#define AMD_IOMMU_DEVICE_FLAG_EXEC_SUP    0x8
+/* Device may request super-user privileges */
+#define AMD_IOMMU_DEVICE_FLAG_PRIV_SUP   0x10
+
 /*
  * This struct contains device specific data for the IOMMU
  */
diff --git a/drivers/iommu/amd/iommu.c b/drivers/iommu/amd/iommu.c
index 5abf181f5ecd..8f43d89a98db 100644
--- a/drivers/iommu/amd/iommu.c
+++ b/drivers/iommu/amd/iommu.c
@@ -2741,48 +2741,6 @@ int amd_iommu_complete_ppr(struct pci_dev *pdev, u32 pasid,
 }
 EXPORT_SYMBOL(amd_iommu_complete_ppr);
 
-int amd_iommu_device_info(struct pci_dev *pdev,
-                          struct amd_iommu_device_info *info)
-{
-	int max_pasids;
-	int pos;
-
-	if (pdev == NULL || info == NULL)
-		return -EINVAL;
-
-	if (!amd_iommu_v2_supported())
-		return -EINVAL;
-
-	memset(info, 0, sizeof(*info));
-
-	if (pci_ats_supported(pdev))
-		info->flags |= AMD_IOMMU_DEVICE_FLAG_ATS_SUP;
-
-	pos = pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_PRI);
-	if (pos)
-		info->flags |= AMD_IOMMU_DEVICE_FLAG_PRI_SUP;
-
-	pos = pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_PASID);
-	if (pos) {
-		int features;
-
-		max_pasids = 1 << (9 * (amd_iommu_max_glx_val + 1));
-		max_pasids = min(max_pasids, (1 << 20));
-
-		info->flags |= AMD_IOMMU_DEVICE_FLAG_PASID_SUP;
-		info->max_pasids = min(pci_max_pasids(pdev), max_pasids);
-
-		features = pci_pasid_features(pdev);
-		if (features & PCI_PASID_CAP_EXEC)
-			info->flags |= AMD_IOMMU_DEVICE_FLAG_EXEC_SUP;
-		if (features & PCI_PASID_CAP_PRIV)
-			info->flags |= AMD_IOMMU_DEVICE_FLAG_PRIV_SUP;
-	}
-
-	return 0;
-}
-EXPORT_SYMBOL(amd_iommu_device_info);
-
 #ifdef CONFIG_IRQ_REMAP
 
 /*****************************************************************************
diff --git a/include/linux/amd-iommu.h b/include/linux/amd-iommu.h
index 35ca00b09384..dc7ed2f46886 100644
--- a/include/linux/amd-iommu.h
+++ b/include/linux/amd-iommu.h
@@ -33,32 +33,6 @@ struct pci_dev;
 
 extern int amd_iommu_detect(void);
 
-/**
- * amd_iommu_device_info() - Get information about IOMMUv2 support of a
- *			     PCI device
- * @pdev: PCI device to query information from
- * @info: A pointer to an amd_iommu_device_info structure which will contain
- *	  the information about the PCI device
- *
- * Returns 0 on success, negative value on error
- */
-
-#define AMD_IOMMU_DEVICE_FLAG_ATS_SUP     0x1    /* ATS feature supported */
-#define AMD_IOMMU_DEVICE_FLAG_PRI_SUP     0x2    /* PRI feature supported */
-#define AMD_IOMMU_DEVICE_FLAG_PASID_SUP   0x4    /* PASID context supported */
-#define AMD_IOMMU_DEVICE_FLAG_EXEC_SUP    0x8    /* Device may request execution
-						    on memory pages */
-#define AMD_IOMMU_DEVICE_FLAG_PRIV_SUP   0x10    /* Device may request
-						    super-user privileges */
-
-struct amd_iommu_device_info {
-	int max_pasids;
-	u32 flags;
-};
-
-extern int amd_iommu_device_info(struct pci_dev *pdev,
-				 struct amd_iommu_device_info *info);
-
 #else /* CONFIG_AMD_IOMMU */
 
 static inline int amd_iommu_detect(void) { return -ENODEV; }
-- 
cgit v1.2.3


From 1609626c32c4538439f6333d0b6c912af9f13b77 Mon Sep 17 00:00:00 2001
From: Sudeep Holla <sudeep.holla@arm.com>
Date: Thu, 5 Oct 2023 15:44:54 +0100
Subject: firmware: arm_ffa: Update the FF-A command list with v1.1 additions

Arm Firmware Framework for A-profile(FFA) v1.1 introduces notifications
and indirect messaging based upon notifications support and extends some
of the memory interfaces.

Let us add all the newly supported FF-A function IDs in the spec.
Also update to the error values and associated handling.

Link: https://lore.kernel.org/r/20231005-ffa_v1-1_notif-v4-1-cddd3237809c@arm.com
Signed-off-by: Sudeep Holla <sudeep.holla@arm.com>
---
 drivers/firmware/arm_ffa/driver.c |  1 +
 include/linux/arm_ffa.h           | 20 ++++++++++++++++++++
 2 files changed, 21 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/firmware/arm_ffa/driver.c b/drivers/firmware/arm_ffa/driver.c
index 7cd6b1564e80..a64512388ea5 100644
--- a/drivers/firmware/arm_ffa/driver.c
+++ b/drivers/firmware/arm_ffa/driver.c
@@ -64,6 +64,7 @@ static const int ffa_linux_errmap[] = {
 	-EACCES,	/* FFA_RET_DENIED */
 	-EAGAIN,	/* FFA_RET_RETRY */
 	-ECANCELED,	/* FFA_RET_ABORTED */
+	-ENODATA,	/* FFA_RET_NO_DATA */
 };
 
 static inline int ffa_to_linux_errno(int errno)
diff --git a/include/linux/arm_ffa.h b/include/linux/arm_ffa.h
index cc060da51bec..2ea1717a0825 100644
--- a/include/linux/arm_ffa.h
+++ b/include/linux/arm_ffa.h
@@ -20,6 +20,7 @@
 
 #define FFA_ERROR			FFA_SMC_32(0x60)
 #define FFA_SUCCESS			FFA_SMC_32(0x61)
+#define FFA_FN64_SUCCESS		FFA_SMC_64(0x61)
 #define FFA_INTERRUPT			FFA_SMC_32(0x62)
 #define FFA_VERSION			FFA_SMC_32(0x63)
 #define FFA_FEATURES			FFA_SMC_32(0x64)
@@ -54,6 +55,23 @@
 #define FFA_MEM_FRAG_RX			FFA_SMC_32(0x7A)
 #define FFA_MEM_FRAG_TX			FFA_SMC_32(0x7B)
 #define FFA_NORMAL_WORLD_RESUME		FFA_SMC_32(0x7C)
+#define FFA_NOTIFICATION_BITMAP_CREATE	FFA_SMC_32(0x7D)
+#define FFA_NOTIFICATION_BITMAP_DESTROY FFA_SMC_32(0x7E)
+#define FFA_NOTIFICATION_BIND		FFA_SMC_32(0x7F)
+#define FFA_NOTIFICATION_UNBIND		FFA_SMC_32(0x80)
+#define FFA_NOTIFICATION_SET		FFA_SMC_32(0x81)
+#define FFA_NOTIFICATION_GET		FFA_SMC_32(0x82)
+#define FFA_NOTIFICATION_INFO_GET	FFA_SMC_32(0x83)
+#define FFA_FN64_NOTIFICATION_INFO_GET	FFA_SMC_64(0x83)
+#define FFA_RX_ACQUIRE			FFA_SMC_32(0x84)
+#define FFA_SPM_ID_GET			FFA_SMC_32(0x85)
+#define FFA_MSG_SEND2			FFA_SMC_32(0x86)
+#define FFA_SECONDARY_EP_REGISTER	FFA_SMC_32(0x87)
+#define FFA_FN64_SECONDARY_EP_REGISTER	FFA_SMC_64(0x87)
+#define FFA_MEM_PERM_GET		FFA_SMC_32(0x88)
+#define FFA_FN64_MEM_PERM_GET		FFA_SMC_64(0x88)
+#define FFA_MEM_PERM_SET		FFA_SMC_32(0x89)
+#define FFA_FN64_MEM_PERM_SET		FFA_SMC_64(0x89)
 
 /*
  * For some calls it is necessary to use SMC64 to pass or return 64-bit values.
@@ -76,6 +94,7 @@
 #define FFA_RET_DENIED             (-6)
 #define FFA_RET_RETRY              (-7)
 #define FFA_RET_ABORTED            (-8)
+#define FFA_RET_NO_DATA            (-9)
 
 /* FFA version encoding */
 #define FFA_MAJOR_VERSION_MASK	GENMASK(30, 16)
@@ -86,6 +105,7 @@
 	(FIELD_PREP(FFA_MAJOR_VERSION_MASK, (major)) |		\
 	 FIELD_PREP(FFA_MINOR_VERSION_MASK, (minor)))
 #define FFA_VERSION_1_0		FFA_PACK_VERSION_INFO(1, 0)
+#define FFA_VERSION_1_1		FFA_PACK_VERSION_INFO(1, 1)
 
 /**
  * FF-A specification mentions explicitly about '4K pages'. This should
-- 
cgit v1.2.3


From fe2ddb6b42358ad25a6aed30512fb284522335f3 Mon Sep 17 00:00:00 2001
From: Sudeep Holla <sudeep.holla@arm.com>
Date: Thu, 5 Oct 2023 15:44:57 +0100
Subject: firmware: arm_ffa: Implement the FFA_RUN interface

FFA_RUN is used by a scheduler to allocate CPU cycles to a target
endpoint execution context specified in the target information parameter.

If the endpoint execution context is in the waiting/blocked state, it
transitions to the running state.

Expose the ability to call FFA_RUN in order to give any partition in the
system cpu cycles to perform IMPDEF functionality.

Link: https://lore.kernel.org/r/20231005-ffa_v1-1_notif-v4-4-cddd3237809c@arm.com
Signed-off-by: Sudeep Holla <sudeep.holla@arm.com>
---
 drivers/firmware/arm_ffa/driver.c | 22 ++++++++++++++++++++++
 include/linux/arm_ffa.h           |  5 +++++
 2 files changed, 27 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/firmware/arm_ffa/driver.c b/drivers/firmware/arm_ffa/driver.c
index 20f8f4ca8e89..82e54d231074 100644
--- a/drivers/firmware/arm_ffa/driver.c
+++ b/drivers/firmware/arm_ffa/driver.c
@@ -616,6 +616,23 @@ static int ffa_notification_bind_common(u16 dst_id, u64 bitmap,
 	return 0;
 }
 
+static int ffa_run(struct ffa_device *dev, u16 vcpu)
+{
+	ffa_value_t ret;
+	u32 target = dev->vm_id << 16 | vcpu;
+
+	invoke_ffa_fn((ffa_value_t){ .a0 = FFA_RUN, .a1 = target, }, &ret);
+
+	while (ret.a0 == FFA_INTERRUPT)
+		invoke_ffa_fn((ffa_value_t){ .a0 = FFA_RUN, .a1 = ret.a1, },
+			      &ret);
+
+	if (ret.a0 == FFA_ERROR)
+		return ffa_to_linux_errno((int)ret.a2);
+
+	return 0;
+}
+
 static void ffa_set_up_mem_ops_native_flag(void)
 {
 	if (!ffa_features(FFA_FN_NATIVE(MEM_LEND), 0, NULL, NULL) ||
@@ -700,10 +717,15 @@ static const struct ffa_mem_ops ffa_drv_mem_ops = {
 	.memory_lend = ffa_memory_lend,
 };
 
+static const struct ffa_cpu_ops ffa_drv_cpu_ops = {
+	.run = ffa_run,
+};
+
 static const struct ffa_ops ffa_drv_ops = {
 	.info_ops = &ffa_drv_info_ops,
 	.msg_ops = &ffa_drv_msg_ops,
 	.mem_ops = &ffa_drv_mem_ops,
+	.cpu_ops = &ffa_drv_cpu_ops,
 };
 
 void ffa_device_match_uuid(struct ffa_device *ffa_dev, const uuid_t *uuid)
diff --git a/include/linux/arm_ffa.h b/include/linux/arm_ffa.h
index 2ea1717a0825..12fd134bf670 100644
--- a/include/linux/arm_ffa.h
+++ b/include/linux/arm_ffa.h
@@ -387,10 +387,15 @@ struct ffa_mem_ops {
 	int (*memory_lend)(struct ffa_mem_ops_args *args);
 };
 
+struct ffa_cpu_ops {
+	int (*run)(struct ffa_device *dev, u16 vcpu);
+};
+
 struct ffa_ops {
 	const struct ffa_info_ops *info_ops;
 	const struct ffa_msg_ops *msg_ops;
 	const struct ffa_mem_ops *mem_ops;
+	const struct ffa_cpu_ops *cpu_ops;
 };
 
 #endif /* _LINUX_ARM_FFA_H */
-- 
cgit v1.2.3


From 0184450b8b1e7734110472616c4758839e1aff96 Mon Sep 17 00:00:00 2001
From: Sudeep Holla <sudeep.holla@arm.com>
Date: Thu, 5 Oct 2023 15:45:02 +0100
Subject: firmware: arm_ffa: Add schedule receiver callback mechanism

Enable client drivers to register a callback function that will be
called when one or more notifications are pending for a target
partition as part of schedule receiver interrupt handling.

Link: https://lore.kernel.org/r/20231005-ffa_v1-1_notif-v4-9-cddd3237809c@arm.com
Signed-off-by: Sudeep Holla <sudeep.holla@arm.com>
---
 drivers/firmware/arm_ffa/driver.c | 103 ++++++++++++++++++++++++++++++++++++--
 include/linux/arm_ffa.h           |   8 +++
 2 files changed, 108 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/firmware/arm_ffa/driver.c b/drivers/firmware/arm_ffa/driver.c
index f8d01840f5ec..04cdb49cc78b 100644
--- a/drivers/firmware/arm_ffa/driver.c
+++ b/drivers/firmware/arm_ffa/driver.c
@@ -37,6 +37,7 @@
 #include <linux/slab.h>
 #include <linux/smp.h>
 #include <linux/uuid.h>
+#include <linux/xarray.h>
 
 #include "common.h"
 
@@ -99,6 +100,8 @@ struct ffa_drv_info {
 	struct ffa_pcpu_irq __percpu *irq_pcpu;
 	struct workqueue_struct *notif_pcpu_wq;
 	struct work_struct irq_work;
+	struct xarray partition_info;
+	unsigned int partition_count;
 };
 
 static struct ffa_drv_info *drv_info;
@@ -694,9 +697,26 @@ static int ffa_notification_get(u32 flags, struct ffa_notify_bitmaps *notify)
 	return 0;
 }
 
-static void __do_sched_recv_cb(u16 partition_id, u16 vcpu, bool is_per_vcpu)
+struct ffa_dev_part_info {
+	ffa_sched_recv_cb callback;
+	void *cb_data;
+	rwlock_t rw_lock;
+};
+
+static void __do_sched_recv_cb(u16 part_id, u16 vcpu, bool is_per_vcpu)
 {
-	pr_err("Callback for partition 0x%x failed.\n", partition_id);
+	struct ffa_dev_part_info *partition;
+	ffa_sched_recv_cb callback;
+	void *cb_data;
+
+	partition = xa_load(&drv_info->partition_info, part_id);
+	read_lock(&partition->rw_lock);
+	callback = partition->callback;
+	cb_data = partition->cb_data;
+	read_unlock(&partition->rw_lock);
+
+	if (callback)
+		callback(vcpu, is_per_vcpu, cb_data);
 }
 
 static void ffa_notification_info_get(void)
@@ -845,6 +865,39 @@ static int ffa_memory_lend(struct ffa_mem_ops_args *args)
 	return ffa_memory_ops(FFA_MEM_LEND, args);
 }
 
+static int ffa_sched_recv_cb_update(u16 part_id, ffa_sched_recv_cb callback,
+				    void *cb_data, bool is_registration)
+{
+	struct ffa_dev_part_info *partition;
+	bool cb_valid;
+
+	partition = xa_load(&drv_info->partition_info, part_id);
+	write_lock(&partition->rw_lock);
+
+	cb_valid = !!partition->callback;
+	if (!(is_registration ^ cb_valid)) {
+		write_unlock(&partition->rw_lock);
+		return -EINVAL;
+	}
+
+	partition->callback = callback;
+	partition->cb_data = cb_data;
+
+	write_unlock(&partition->rw_lock);
+	return 0;
+}
+
+static int ffa_sched_recv_cb_register(struct ffa_device *dev,
+				      ffa_sched_recv_cb cb, void *cb_data)
+{
+	return ffa_sched_recv_cb_update(dev->vm_id, cb, cb_data, true);
+}
+
+static int ffa_sched_recv_cb_unregister(struct ffa_device *dev)
+{
+	return ffa_sched_recv_cb_update(dev->vm_id, NULL, NULL, false);
+}
+
 static const struct ffa_info_ops ffa_drv_info_ops = {
 	.api_version_get = ffa_api_version_get,
 	.partition_info_get = ffa_partition_info_get,
@@ -865,11 +918,17 @@ static const struct ffa_cpu_ops ffa_drv_cpu_ops = {
 	.run = ffa_run,
 };
 
+static const struct ffa_notifier_ops ffa_drv_notifier_ops = {
+	.sched_recv_cb_register = ffa_sched_recv_cb_register,
+	.sched_recv_cb_unregister = ffa_sched_recv_cb_unregister,
+};
+
 static const struct ffa_ops ffa_drv_ops = {
 	.info_ops = &ffa_drv_info_ops,
 	.msg_ops = &ffa_drv_msg_ops,
 	.mem_ops = &ffa_drv_mem_ops,
 	.cpu_ops = &ffa_drv_cpu_ops,
+	.notifier_ops = &ffa_drv_notifier_ops,
 };
 
 void ffa_device_match_uuid(struct ffa_device *ffa_dev, const uuid_t *uuid)
@@ -900,6 +959,7 @@ static void ffa_setup_partitions(void)
 	int count, idx;
 	uuid_t uuid;
 	struct ffa_device *ffa_dev;
+	struct ffa_dev_part_info *info;
 	struct ffa_partition_info *pbuf, *tpbuf;
 
 	count = ffa_partition_probe(&uuid_null, &pbuf);
@@ -908,6 +968,7 @@ static void ffa_setup_partitions(void)
 		return;
 	}
 
+	xa_init(&drv_info->partition_info);
 	for (idx = 0, tpbuf = pbuf; idx < count; idx++, tpbuf++) {
 		import_uuid(&uuid, (u8 *)tpbuf->uuid);
 
@@ -927,10 +988,42 @@ static void ffa_setup_partitions(void)
 		if (drv_info->version > FFA_VERSION_1_0 &&
 		    !(tpbuf->properties & FFA_PARTITION_AARCH64_EXEC))
 			ffa_mode_32bit_set(ffa_dev);
+
+		info = kzalloc(sizeof(*info), GFP_KERNEL);
+		if (!info) {
+			ffa_device_unregister(ffa_dev);
+			continue;
+		}
+		xa_store(&drv_info->partition_info, tpbuf->id, info, GFP_KERNEL);
 	}
+	drv_info->partition_count = count;
+
 	kfree(pbuf);
 }
 
+static void ffa_partitions_cleanup(void)
+{
+	struct ffa_dev_part_info **info;
+	int idx, count = drv_info->partition_count;
+
+	if (!count)
+		return;
+
+	info = kcalloc(count, sizeof(**info), GFP_KERNEL);
+	if (!info)
+		return;
+
+	xa_extract(&drv_info->partition_info, (void **)info, 0, VM_ID_MASK,
+		   count, XA_PRESENT);
+
+	for (idx = 0; idx < count; idx++)
+		kfree(info[idx]);
+	kfree(info);
+
+	drv_info->partition_count = 0;
+	xa_destroy(&drv_info->partition_info);
+}
+
 /* FFA FEATURE IDs */
 #define FFA_FEAT_NOTIFICATION_PENDING_INT	(1)
 #define FFA_FEAT_SCHEDULE_RECEIVER_INT		(2)
@@ -1164,9 +1257,11 @@ static int __init ffa_init(void)
 
 	ret = ffa_notifications_setup();
 	if (ret)
-		goto free_pages;
+		goto partitions_cleanup;
 
 	return 0;
+partitions_cleanup:
+	ffa_partitions_cleanup();
 free_pages:
 	if (drv_info->tx_buffer)
 		free_pages_exact(drv_info->tx_buffer, RXTX_BUFFER_SIZE);
@@ -1182,9 +1277,11 @@ subsys_initcall(ffa_init);
 static void __exit ffa_exit(void)
 {
 	ffa_notifications_cleanup();
+	ffa_partitions_cleanup();
 	ffa_rxtx_unmap(drv_info->vm_id);
 	free_pages_exact(drv_info->tx_buffer, RXTX_BUFFER_SIZE);
 	free_pages_exact(drv_info->rx_buffer, RXTX_BUFFER_SIZE);
+	xa_destroy(&drv_info->partition_info);
 	kfree(drv_info);
 	arm_ffa_bus_exit();
 }
diff --git a/include/linux/arm_ffa.h b/include/linux/arm_ffa.h
index 12fd134bf670..f9cf6114ef82 100644
--- a/include/linux/arm_ffa.h
+++ b/include/linux/arm_ffa.h
@@ -391,11 +391,19 @@ struct ffa_cpu_ops {
 	int (*run)(struct ffa_device *dev, u16 vcpu);
 };
 
+typedef void (*ffa_sched_recv_cb)(u16 vcpu, bool is_per_vcpu, void *cb_data);
+struct ffa_notifier_ops {
+	int (*sched_recv_cb_register)(struct ffa_device *dev,
+				      ffa_sched_recv_cb cb, void *cb_data);
+	int (*sched_recv_cb_unregister)(struct ffa_device *dev);
+};
+
 struct ffa_ops {
 	const struct ffa_info_ops *info_ops;
 	const struct ffa_msg_ops *msg_ops;
 	const struct ffa_mem_ops *mem_ops;
 	const struct ffa_cpu_ops *cpu_ops;
+	const struct ffa_notifier_ops *notifier_ops;
 };
 
 #endif /* _LINUX_ARM_FFA_H */
-- 
cgit v1.2.3


From e0573444edbf4ee7e3c191d3d08a4ccbd26628be Mon Sep 17 00:00:00 2001
From: Sudeep Holla <sudeep.holla@arm.com>
Date: Thu, 5 Oct 2023 15:45:03 +0100
Subject: firmware: arm_ffa: Add interfaces to request notification callbacks

Add interface to the FFA driver to allow for client drivers to request
and relinquish a notification as well as provide a callback for the
notification.

Link: https://lore.kernel.org/r/20231005-ffa_v1-1_notif-v4-10-cddd3237809c@arm.com
Signed-off-by: Sudeep Holla <sudeep.holla@arm.com>
---
 drivers/firmware/arm_ffa/driver.c | 145 ++++++++++++++++++++++++++++++++++++++
 include/linux/arm_ffa.h           |   5 ++
 2 files changed, 150 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/firmware/arm_ffa/driver.c b/drivers/firmware/arm_ffa/driver.c
index 04cdb49cc78b..1c3b34eb06e4 100644
--- a/drivers/firmware/arm_ffa/driver.c
+++ b/drivers/firmware/arm_ffa/driver.c
@@ -27,11 +27,13 @@
 #include <linux/bitfield.h>
 #include <linux/cpuhotplug.h>
 #include <linux/device.h>
+#include <linux/hashtable.h>
 #include <linux/interrupt.h>
 #include <linux/io.h>
 #include <linux/kernel.h>
 #include <linux/module.h>
 #include <linux/mm.h>
+#include <linux/mutex.h>
 #include <linux/of_irq.h>
 #include <linux/scatterlist.h>
 #include <linux/slab.h>
@@ -57,6 +59,8 @@
  */
 #define RXTX_BUFFER_SIZE	SZ_4K
 
+#define FFA_MAX_NOTIFICATIONS		64
+
 static ffa_fn *invoke_ffa_fn;
 
 static const int ffa_linux_errmap[] = {
@@ -102,6 +106,8 @@ struct ffa_drv_info {
 	struct work_struct irq_work;
 	struct xarray partition_info;
 	unsigned int partition_count;
+	DECLARE_HASHTABLE(notifier_hash, ilog2(FFA_MAX_NOTIFICATIONS));
+	struct mutex notify_lock; /* lock to protect notifier hashtable  */
 };
 
 static struct ffa_drv_info *drv_info;
@@ -626,6 +632,8 @@ static int ffa_notification_bitmap_destroy(void)
 #define MAX_IDS_64				20
 #define MAX_IDS_32				10
 
+#define PER_VCPU_NOTIFICATION_FLAG		BIT(0)
+
 static int ffa_notification_bind_common(u16 dst_id, u64 bitmap,
 					u32 flags, bool is_bind)
 {
@@ -865,6 +873,21 @@ static int ffa_memory_lend(struct ffa_mem_ops_args *args)
 	return ffa_memory_ops(FFA_MEM_LEND, args);
 }
 
+#define FFA_SECURE_PARTITION_ID_FLAG	BIT(15)
+
+enum notify_type {
+	NON_SECURE_VM,
+	SECURE_PARTITION,
+	FRAMEWORK,
+};
+
+struct notifier_cb_info {
+	struct hlist_node hnode;
+	ffa_notifier_cb cb;
+	void *cb_data;
+	enum notify_type type;
+};
+
 static int ffa_sched_recv_cb_update(u16 part_id, ffa_sched_recv_cb callback,
 				    void *cb_data, bool is_registration)
 {
@@ -898,6 +921,123 @@ static int ffa_sched_recv_cb_unregister(struct ffa_device *dev)
 	return ffa_sched_recv_cb_update(dev->vm_id, NULL, NULL, false);
 }
 
+static int ffa_notification_bind(u16 dst_id, u64 bitmap, u32 flags)
+{
+	return ffa_notification_bind_common(dst_id, bitmap, flags, true);
+}
+
+static int ffa_notification_unbind(u16 dst_id, u64 bitmap)
+{
+	return ffa_notification_bind_common(dst_id, bitmap, 0, false);
+}
+
+/* Should be called while the notify_lock is taken */
+static struct notifier_cb_info *
+notifier_hash_node_get(u16 notify_id, enum notify_type type)
+{
+	struct notifier_cb_info *node;
+
+	hash_for_each_possible(drv_info->notifier_hash, node, hnode, notify_id)
+		if (type == node->type)
+			return node;
+
+	return NULL;
+}
+
+static int
+update_notifier_cb(int notify_id, enum notify_type type, ffa_notifier_cb cb,
+		   void *cb_data, bool is_registration)
+{
+	struct notifier_cb_info *cb_info = NULL;
+	bool cb_found;
+
+	cb_info = notifier_hash_node_get(notify_id, type);
+	cb_found = !!cb_info;
+
+	if (!(is_registration ^ cb_found))
+		return -EINVAL;
+
+	if (is_registration) {
+		cb_info = kzalloc(sizeof(*cb_info), GFP_KERNEL);
+		if (!cb_info)
+			return -ENOMEM;
+
+		cb_info->type = type;
+		cb_info->cb = cb;
+		cb_info->cb_data = cb_data;
+
+		hash_add(drv_info->notifier_hash, &cb_info->hnode, notify_id);
+	} else {
+		hash_del(&cb_info->hnode);
+	}
+
+	return 0;
+}
+
+static enum notify_type ffa_notify_type_get(u16 vm_id)
+{
+	if (vm_id & FFA_SECURE_PARTITION_ID_FLAG)
+		return SECURE_PARTITION;
+	else
+		return NON_SECURE_VM;
+}
+
+static int ffa_notify_relinquish(struct ffa_device *dev, int notify_id)
+{
+	int rc;
+	enum notify_type type = ffa_notify_type_get(dev->vm_id);
+
+	if (notify_id >= FFA_MAX_NOTIFICATIONS)
+		return -EINVAL;
+
+	mutex_lock(&drv_info->notify_lock);
+
+	rc = update_notifier_cb(notify_id, type, NULL, NULL, false);
+	if (rc) {
+		pr_err("Could not unregister notification callback\n");
+		mutex_unlock(&drv_info->notify_lock);
+		return rc;
+	}
+
+	rc = ffa_notification_unbind(dev->vm_id, BIT(notify_id));
+
+	mutex_unlock(&drv_info->notify_lock);
+
+	return rc;
+}
+
+static int ffa_notify_request(struct ffa_device *dev, bool is_per_vcpu,
+			      ffa_notifier_cb cb, void *cb_data, int notify_id)
+{
+	int rc;
+	u32 flags = 0;
+	enum notify_type type = ffa_notify_type_get(dev->vm_id);
+
+	if (notify_id >= FFA_MAX_NOTIFICATIONS)
+		return -EINVAL;
+
+	mutex_lock(&drv_info->notify_lock);
+
+	if (is_per_vcpu)
+		flags = PER_VCPU_NOTIFICATION_FLAG;
+
+	rc = ffa_notification_bind(dev->vm_id, BIT(notify_id), flags);
+	if (rc) {
+		mutex_unlock(&drv_info->notify_lock);
+		return rc;
+	}
+
+	rc = update_notifier_cb(notify_id, type, cb, cb_data, true);
+	if (rc) {
+		pr_err("Failed to register callback for %d - %d\n",
+		       notify_id, rc);
+		ffa_notification_unbind(dev->vm_id, BIT(notify_id));
+	}
+	mutex_unlock(&drv_info->notify_lock);
+
+	return rc;
+}
+
 static const struct ffa_info_ops ffa_drv_info_ops = {
 	.api_version_get = ffa_api_version_get,
 	.partition_info_get = ffa_partition_info_get,
@@ -921,6 +1061,8 @@ static const struct ffa_cpu_ops ffa_drv_cpu_ops = {
 static const struct ffa_notifier_ops ffa_drv_notifier_ops = {
 	.sched_recv_cb_register = ffa_sched_recv_cb_register,
 	.sched_recv_cb_unregister = ffa_sched_recv_cb_unregister,
+	.notify_request = ffa_notify_request,
+	.notify_relinquish = ffa_notify_relinquish,
 };
 
 static const struct ffa_ops ffa_drv_ops = {
@@ -1194,6 +1336,9 @@ static int ffa_notifications_setup(void)
 	if (ret)
 		goto cleanup;
 
+	hash_init(drv_info->notifier_hash);
+	mutex_init(&drv_info->notify_lock);
+
 	return 0;
 cleanup:
 	ffa_notifications_cleanup();
diff --git a/include/linux/arm_ffa.h b/include/linux/arm_ffa.h
index f9cf6114ef82..fb6f388a3737 100644
--- a/include/linux/arm_ffa.h
+++ b/include/linux/arm_ffa.h
@@ -392,10 +392,15 @@ struct ffa_cpu_ops {
 };
 
 typedef void (*ffa_sched_recv_cb)(u16 vcpu, bool is_per_vcpu, void *cb_data);
+typedef void (*ffa_notifier_cb)(int notify_id, void *cb_data);
+
 struct ffa_notifier_ops {
 	int (*sched_recv_cb_register)(struct ffa_device *dev,
 				      ffa_sched_recv_cb cb, void *cb_data);
 	int (*sched_recv_cb_unregister)(struct ffa_device *dev);
+	int (*notify_request)(struct ffa_device *dev, bool per_vcpu,
+			      ffa_notifier_cb cb, void *cb_data, int notify_id);
+	int (*notify_relinquish)(struct ffa_device *dev, int notify_id);
 };
 
 struct ffa_ops {
-- 
cgit v1.2.3


From e5adb3b20e39dbf18651322a9bc24eb55050188f Mon Sep 17 00:00:00 2001
From: Sudeep Holla <sudeep.holla@arm.com>
Date: Thu, 5 Oct 2023 15:45:04 +0100
Subject: firmware: arm_ffa: Add interface to send a notification to a given
 partition

The framework provides an interface to the sender endpoint to specify
the notification to signal to the receiver endpoint. A sender signals
a notification by requesting its partition manager to set the
corresponding bit in the notifications bitmap of the receiver.

Expose the ability to send a notification to another partition.

Link: https://lore.kernel.org/r/20231005-ffa_v1-1_notif-v4-11-cddd3237809c@arm.com
Signed-off-by: Sudeep Holla <sudeep.holla@arm.com>
---
 drivers/firmware/arm_ffa/driver.c | 13 +++++++++++++
 include/linux/arm_ffa.h           |  2 ++
 2 files changed, 15 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/firmware/arm_ffa/driver.c b/drivers/firmware/arm_ffa/driver.c
index 1c3b34eb06e4..590005234073 100644
--- a/drivers/firmware/arm_ffa/driver.c
+++ b/drivers/firmware/arm_ffa/driver.c
@@ -1038,6 +1038,18 @@ static int ffa_notify_request(struct ffa_device *dev, bool is_per_vcpu,
 	return rc;
 }
 
+static int ffa_notify_send(struct ffa_device *dev, int notify_id,
+			   bool is_per_vcpu, u16 vcpu)
+{
+	u32 flags = 0;
+
+	if (is_per_vcpu)
+		flags |= (PER_VCPU_NOTIFICATION_FLAG | vcpu << 16);
+
+	return ffa_notification_set(dev->vm_id, drv_info->vm_id, flags,
+				    BIT(notify_id));
+}
+
 static const struct ffa_info_ops ffa_drv_info_ops = {
 	.api_version_get = ffa_api_version_get,
 	.partition_info_get = ffa_partition_info_get,
@@ -1063,6 +1075,7 @@ static const struct ffa_notifier_ops ffa_drv_notifier_ops = {
 	.sched_recv_cb_unregister = ffa_sched_recv_cb_unregister,
 	.notify_request = ffa_notify_request,
 	.notify_relinquish = ffa_notify_relinquish,
+	.notify_send = ffa_notify_send,
 };
 
 static const struct ffa_ops ffa_drv_ops = {
diff --git a/include/linux/arm_ffa.h b/include/linux/arm_ffa.h
index fb6f388a3737..f6df81f14b6d 100644
--- a/include/linux/arm_ffa.h
+++ b/include/linux/arm_ffa.h
@@ -401,6 +401,8 @@ struct ffa_notifier_ops {
 	int (*notify_request)(struct ffa_device *dev, bool per_vcpu,
 			      ffa_notifier_cb cb, void *cb_data, int notify_id);
 	int (*notify_relinquish)(struct ffa_device *dev, int notify_id);
+	int (*notify_send)(struct ffa_device *dev, int notify_id, bool per_vcpu,
+			   u16 vcpu);
 };
 
 struct ffa_ops {
-- 
cgit v1.2.3


From c9b21ef0d0a87695d7bfeee9a04b89760b49ccf5 Mon Sep 17 00:00:00 2001
From: Sudeep Holla <sudeep.holla@arm.com>
Date: Thu, 5 Oct 2023 15:45:06 +0100
Subject: firmware: arm_ffa: Simplify the computation of transmit and fragment
 length

The computation of endpoint memory access descriptor's composite memory
region descriptor offset is using COMPOSITE_CONSTITUENTS_OFFSET which is
unnecessary complicated. Composite memory region descriptor always follow
the endpoint memory access descriptor array and hence it is computed
accordingly. COMPOSITE_CONSTITUENTS_OFFSET is useless and wrong for any
input other than endpoint memory access descriptor count.

Let us drop the usage of COMPOSITE_CONSTITUENTS_OFFSET to simplify the
computation of total transmit and fragment length in the memory
transactions.

Link: https://lore.kernel.org/r/20231005-ffa_v1-1_notif-v4-13-cddd3237809c@arm.com
Signed-off-by: Sudeep Holla <sudeep.holla@arm.com>
---
 drivers/firmware/arm_ffa/driver.c | 14 ++++++++------
 include/linux/arm_ffa.h           |  2 --
 2 files changed, 8 insertions(+), 8 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/firmware/arm_ffa/driver.c b/drivers/firmware/arm_ffa/driver.c
index 6f688b4a4620..49fcbeb63eaa 100644
--- a/drivers/firmware/arm_ffa/driver.c
+++ b/drivers/firmware/arm_ffa/driver.c
@@ -442,23 +442,25 @@ ffa_setup_and_transmit(u32 func_id, void *buffer, u32 max_fragsize,
 {
 	int rc = 0;
 	bool first = true;
+	u32 composite_offset;
 	phys_addr_t addr = 0;
+	struct ffa_mem_region *mem_region = buffer;
 	struct ffa_composite_mem_region *composite;
 	struct ffa_mem_region_addr_range *constituents;
 	struct ffa_mem_region_attributes *ep_mem_access;
-	struct ffa_mem_region *mem_region = buffer;
 	u32 idx, frag_len, length, buf_sz = 0, num_entries = sg_nents(args->sg);
 
 	mem_region->tag = args->tag;
 	mem_region->flags = args->flags;
 	mem_region->sender_id = drv_info->vm_id;
 	mem_region->attributes = ffa_memory_attributes_get(func_id);
-	ep_mem_access = &mem_region->ep_mem_access[0];
+	ep_mem_access = buffer + COMPOSITE_OFFSET(0);
+	composite_offset = COMPOSITE_OFFSET(args->nattrs);
 
 	for (idx = 0; idx < args->nattrs; idx++, ep_mem_access++) {
 		ep_mem_access->receiver = args->attrs[idx].receiver;
 		ep_mem_access->attrs = args->attrs[idx].attrs;
-		ep_mem_access->composite_off = COMPOSITE_OFFSET(args->nattrs);
+		ep_mem_access->composite_off = composite_offset;
 		ep_mem_access->flag = 0;
 		ep_mem_access->reserved = 0;
 	}
@@ -467,13 +469,13 @@ ffa_setup_and_transmit(u32 func_id, void *buffer, u32 max_fragsize,
 	mem_region->reserved_1 = 0;
 	mem_region->ep_count = args->nattrs;
 
-	composite = buffer + COMPOSITE_OFFSET(args->nattrs);
+	composite = buffer + composite_offset;
 	composite->total_pg_cnt = ffa_get_num_pages_sg(args->sg);
 	composite->addr_range_cnt = num_entries;
 	composite->reserved = 0;
 
-	length = COMPOSITE_CONSTITUENTS_OFFSET(args->nattrs, num_entries);
-	frag_len = COMPOSITE_CONSTITUENTS_OFFSET(args->nattrs, 0);
+	length = composite_offset + CONSTITUENTS_OFFSET(num_entries);
+	frag_len = composite_offset + CONSTITUENTS_OFFSET(0);
 	if (frag_len > max_fragsize)
 		return -ENXIO;
 
diff --git a/include/linux/arm_ffa.h b/include/linux/arm_ffa.h
index f6df81f14b6d..748d0a83a4bc 100644
--- a/include/linux/arm_ffa.h
+++ b/include/linux/arm_ffa.h
@@ -356,8 +356,6 @@ struct ffa_mem_region {
 	(offsetof(struct ffa_mem_region, ep_mem_access[x]))
 #define CONSTITUENTS_OFFSET(x)	\
 	(offsetof(struct ffa_composite_mem_region, constituents[x]))
-#define COMPOSITE_CONSTITUENTS_OFFSET(x, y)	\
-	(COMPOSITE_OFFSET(x) + CONSTITUENTS_OFFSET(y))
 
 struct ffa_mem_ops_args {
 	bool use_txbuf;
-- 
cgit v1.2.3


From bc5bc309db45a7ab218ce8259ba9bc7659be61ca Mon Sep 17 00:00:00 2001
From: Yafang Shao <laoar.shao@gmail.com>
Date: Thu, 5 Oct 2023 08:41:23 +0000
Subject: bpf: Inherit system settings for CPU security mitigations

Currently, there exists a system-wide setting related to CPU security
mitigations, denoted as 'mitigations='. When set to 'mitigations=off', it
deactivates all optional CPU mitigations. Therefore, if we implement a
system-wide 'mitigations=off' setting, it should inherently bypass Spectre
v1 and Spectre v4 in the BPF subsystem.

Please note that there is also a more specific 'nospectre_v1' setting on
x86 and ppc architectures, though it is not currently exported. For the
time being, let's disregard more fine-grained options.

This idea emerged during our discussion about potential Spectre v1 attacks
with Luis [0].

  [0] https://lore.kernel.org/bpf/b4fc15f7-b204-767e-ebb9-fdb4233961fb@iogearbox.net

Signed-off-by: Yafang Shao <laoar.shao@gmail.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Stanislav Fomichev <sdf@google.com>
Acked-by: Song Liu <song@kernel.org>
Acked-by: KP Singh <kpsingh@kernel.org>
Cc: Luis Gerhorst <gerhorst@cs.fau.de>
Link: https://lore.kernel.org/bpf/20231005084123.1338-1-laoar.shao@gmail.com
---
 include/linux/bpf.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index a82efd34b741..61bde4520f5c 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -2164,12 +2164,12 @@ static inline bool bpf_allow_uninit_stack(void)
 
 static inline bool bpf_bypass_spec_v1(void)
 {
-	return perfmon_capable();
+	return perfmon_capable() || cpu_mitigations_off();
 }
 
 static inline bool bpf_bypass_spec_v4(void)
 {
-	return perfmon_capable();
+	return perfmon_capable() || cpu_mitigations_off();
 }
 
 int bpf_map_new_fd(struct bpf_map *map, int flags);
-- 
cgit v1.2.3


From e7a1b32e43b194bbf930281ae7f5149c420cd122 Mon Sep 17 00:00:00 2001
From: Pierre Gondois <pierre.gondois@arm.com>
Date: Thu, 5 Oct 2023 15:41:20 +0200
Subject: cpufreq: Rebuild sched-domains when removing cpufreq driver

The Energy Aware Scheduler (EAS) relies on the schedutil governor.
When moving to/from the schedutil governor, sched domains must be
rebuilt to allow re-evaluating the enablement conditions of EAS.
This is done through sched_cpufreq_governor_change().

Having a cpufreq governor assumes a cpufreq driver is running.
Inserting/removing a cpufreq driver should trigger a re-evaluation
of EAS enablement conditions, avoiding to see EAS enabled when
removing a running cpufreq driver.

Rebuild the sched domains in schedutil's sugov_init()/sugov_exit(),
allowing to check EAS's enablement condition whenever schedutil
governor is initialized/exited from.
Move relevant code up in schedutil.c to avoid a split and conditional
function declaration.
Rename sched_cpufreq_governor_change() to sugov_eas_rebuild_sd().

Signed-off-by: Pierre Gondois <pierre.gondois@arm.com>
Acked-by: Viresh Kumar <viresh.kumar@linaro.org>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/cpufreq/cpufreq.c        |  3 +--
 include/linux/cpufreq.h          |  8 ------
 kernel/sched/cpufreq_schedutil.c | 55 +++++++++++++++++++++-------------------
 3 files changed, 30 insertions(+), 36 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c
index 60ed89000e82..4bc15634d49c 100644
--- a/drivers/cpufreq/cpufreq.c
+++ b/drivers/cpufreq/cpufreq.c
@@ -1544,7 +1544,7 @@ static int cpufreq_online(unsigned int cpu)
 
 		/*
 		 * Register with the energy model before
-		 * sched_cpufreq_governor_change() is called, which will result
+		 * sugov_eas_rebuild_sd() is called, which will result
 		 * in rebuilding of the sched domains, which should only be done
 		 * once the energy model is properly initialized for the policy
 		 * first.
@@ -2652,7 +2652,6 @@ static int cpufreq_set_policy(struct cpufreq_policy *policy,
 		ret = cpufreq_start_governor(policy);
 		if (!ret) {
 			pr_debug("governor change\n");
-			sched_cpufreq_governor_change(policy, old_gov);
 			return 0;
 		}
 		cpufreq_exit_governor(policy);
diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h
index 71d186d6933a..1c5ca92a0555 100644
--- a/include/linux/cpufreq.h
+++ b/include/linux/cpufreq.h
@@ -1193,14 +1193,6 @@ static inline int of_perf_domain_get_sharing_cpumask(int pcpu, const char *list_
 }
 #endif
 
-#if defined(CONFIG_ENERGY_MODEL) && defined(CONFIG_CPU_FREQ_GOV_SCHEDUTIL)
-void sched_cpufreq_governor_change(struct cpufreq_policy *policy,
-			struct cpufreq_governor *old_gov);
-#else
-static inline void sched_cpufreq_governor_change(struct cpufreq_policy *policy,
-			struct cpufreq_governor *old_gov) { }
-#endif
-
 extern unsigned int arch_freq_get_on_cpu(int cpu);
 
 #ifndef arch_set_freq_scale
diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c
index f3a95def49cc..492ec650d48f 100644
--- a/kernel/sched/cpufreq_schedutil.c
+++ b/kernel/sched/cpufreq_schedutil.c
@@ -555,6 +555,31 @@ static const struct kobj_type sugov_tunables_ktype = {
 
 /********************** cpufreq governor interface *********************/
 
+#ifdef CONFIG_ENERGY_MODEL
+static void rebuild_sd_workfn(struct work_struct *work)
+{
+	rebuild_sched_domains_energy();
+}
+
+static DECLARE_WORK(rebuild_sd_work, rebuild_sd_workfn);
+
+/*
+ * EAS shouldn't be attempted without sugov, so rebuild the sched_domains
+ * on governor changes to make sure the scheduler knows about it.
+ */
+static void sugov_eas_rebuild_sd(void)
+{
+	/*
+	 * When called from the cpufreq_register_driver() path, the
+	 * cpu_hotplug_lock is already held, so use a work item to
+	 * avoid nested locking in rebuild_sched_domains().
+	 */
+	schedule_work(&rebuild_sd_work);
+}
+#else
+static inline void sugov_eas_rebuild_sd(void) { };
+#endif
+
 struct cpufreq_governor schedutil_gov;
 
 static struct sugov_policy *sugov_policy_alloc(struct cpufreq_policy *policy)
@@ -709,6 +734,8 @@ static int sugov_init(struct cpufreq_policy *policy)
 	if (ret)
 		goto fail;
 
+	sugov_eas_rebuild_sd();
+
 out:
 	mutex_unlock(&global_tunables_lock);
 	return 0;
@@ -750,6 +777,8 @@ static void sugov_exit(struct cpufreq_policy *policy)
 	sugov_kthread_stop(sg_policy);
 	sugov_policy_free(sg_policy);
 	cpufreq_disable_fast_switch(policy);
+
+	sugov_eas_rebuild_sd();
 }
 
 static int sugov_start(struct cpufreq_policy *policy)
@@ -833,29 +862,3 @@ struct cpufreq_governor *cpufreq_default_governor(void)
 #endif
 
 cpufreq_governor_init(schedutil_gov);
-
-#ifdef CONFIG_ENERGY_MODEL
-static void rebuild_sd_workfn(struct work_struct *work)
-{
-	rebuild_sched_domains_energy();
-}
-static DECLARE_WORK(rebuild_sd_work, rebuild_sd_workfn);
-
-/*
- * EAS shouldn't be attempted without sugov, so rebuild the sched_domains
- * on governor changes to make sure the scheduler knows about it.
- */
-void sched_cpufreq_governor_change(struct cpufreq_policy *policy,
-				  struct cpufreq_governor *old_gov)
-{
-	if (old_gov == &schedutil_gov || policy->governor == &schedutil_gov) {
-		/*
-		 * When called from the cpufreq_register_driver() path, the
-		 * cpu_hotplug_lock is already held, so use a work item to
-		 * avoid nested locking in rebuild_sched_domains().
-		 */
-		schedule_work(&rebuild_sd_work);
-	}
-
-}
-#endif
-- 
cgit v1.2.3


From 51a23b1be92046f0cc52384d30cf060700f1a54e Mon Sep 17 00:00:00 2001
From: Li Zhijian <lizhijian@cn.fujitsu.com>
Date: Wed, 2 Aug 2023 17:28:56 +0800
Subject: acpi,mm: fix typo sibiling -> sibling

First found this typo as reviewing memory tier code. Fix it by sed like:
$ sed -i 's/sibiling/sibling/g' $(git grep -l sibiling)

so the acpi one will be corrected as well.

Link: https://lkml.kernel.org/r/20230802092856.819328-1-lizhijian@cn.fujitsu.com
Signed-off-by: Li Zhijian <lizhijian@cn.fujitsu.com>
Cc: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
Cc: Huang, Ying <ying.huang@intel.com>
Cc: Len Brown <lenb@kernel.org>
Cc: "Rafael J. Wysocki" <rafael@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 drivers/acpi/acpi_pad.c      |  2 +-
 include/linux/memory-tiers.h |  2 +-
 mm/memory-tiers.c            | 10 +++++-----
 3 files changed, 7 insertions(+), 7 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/acpi/acpi_pad.c b/drivers/acpi/acpi_pad.c
index 7a453c5ff303..7f073ca64f0e 100644
--- a/drivers/acpi/acpi_pad.c
+++ b/drivers/acpi/acpi_pad.c
@@ -100,7 +100,7 @@ static void round_robin_cpu(unsigned int tsk_index)
 	for_each_cpu(cpu, pad_busy_cpus)
 		cpumask_or(tmp, tmp, topology_sibling_cpumask(cpu));
 	cpumask_andnot(tmp, cpu_online_mask, tmp);
-	/* avoid HT sibilings if possible */
+	/* avoid HT siblings if possible */
 	if (cpumask_empty(tmp))
 		cpumask_andnot(tmp, cpu_online_mask, pad_busy_cpus);
 	if (cpumask_empty(tmp)) {
diff --git a/include/linux/memory-tiers.h b/include/linux/memory-tiers.h
index 437441cdf78f..4fa178b50784 100644
--- a/include/linux/memory-tiers.h
+++ b/include/linux/memory-tiers.h
@@ -22,7 +22,7 @@
 struct memory_tier;
 struct memory_dev_type {
 	/* list of memory types that are part of same tier as this type */
-	struct list_head tier_sibiling;
+	struct list_head tier_sibling;
 	/* abstract distance for this specific memory type */
 	int adistance;
 	/* Nodes of same abstract distance */
diff --git a/mm/memory-tiers.c b/mm/memory-tiers.c
index 37a4f59d9585..876d8a5e210e 100644
--- a/mm/memory-tiers.c
+++ b/mm/memory-tiers.c
@@ -115,7 +115,7 @@ static __always_inline nodemask_t get_memtier_nodemask(struct memory_tier *memti
 	nodemask_t nodes = NODE_MASK_NONE;
 	struct memory_dev_type *memtype;
 
-	list_for_each_entry(memtype, &memtier->memory_types, tier_sibiling)
+	list_for_each_entry(memtype, &memtier->memory_types, tier_sibling)
 		nodes_or(nodes, nodes, memtype->nodes);
 
 	return nodes;
@@ -174,7 +174,7 @@ static struct memory_tier *find_create_memory_tier(struct memory_dev_type *memty
 	 * If the memtype is already part of a memory tier,
 	 * just return that.
 	 */
-	if (!list_empty(&memtype->tier_sibiling)) {
+	if (!list_empty(&memtype->tier_sibling)) {
 		list_for_each_entry(memtier, &memory_tiers, list) {
 			if (adistance == memtier->adistance_start)
 				return memtier;
@@ -218,7 +218,7 @@ static struct memory_tier *find_create_memory_tier(struct memory_dev_type *memty
 	memtier = new_memtier;
 
 link_memtype:
-	list_add(&memtype->tier_sibiling, &memtier->memory_types);
+	list_add(&memtype->tier_sibling, &memtier->memory_types);
 	return memtier;
 }
 
@@ -527,7 +527,7 @@ static bool clear_node_memory_tier(int node)
 		memtype = node_memory_types[node].memtype;
 		node_clear(node, memtype->nodes);
 		if (nodes_empty(memtype->nodes)) {
-			list_del_init(&memtype->tier_sibiling);
+			list_del_init(&memtype->tier_sibling);
 			if (list_empty(&memtier->memory_types))
 				destroy_memory_tier(memtier);
 		}
@@ -553,7 +553,7 @@ struct memory_dev_type *alloc_memory_type(int adistance)
 		return ERR_PTR(-ENOMEM);
 
 	memtype->adistance = adistance;
-	INIT_LIST_HEAD(&memtype->tier_sibiling);
+	INIT_LIST_HEAD(&memtype->tier_sibling);
 	memtype->nodes  = NODE_MASK_NONE;
 	kref_init(&memtype->kref);
 	return memtype;
-- 
cgit v1.2.3


From 55d2a0bd5eadaade850efa9d3a7ffbb0aeb67198 Mon Sep 17 00:00:00 2001
From: Baolin Wang <baolin.wang@linux.alibaba.com>
Date: Mon, 18 Sep 2023 14:31:42 +0800
Subject: mm: add statistics for PUD level pagetable

Recently, we found that cross-die access to pagetable pages on ARM64
machines can cause performance fluctuations in our business.  Currently,
there are no PMU events available to track this situation on our ARM64
machines, so accurate pagetable accounting can help to analyze this issue,
but now the PUD level pagetable accounting is missed.

So introduce pagetable_pud_ctor/dtor() to help to get accurate PUD
pagetable accounting, as well as converting the architectures which use
generic PUD pagetable allocation to add corresponding PUD pagetable
accounting.  Moreover this patch will mark the PUD level pagetable with
PG_table flag, which will help to do sanity validation in
unpoison_memory().

On my testing machine, I can see more pagetables statistics after the patch
with page-types tool:

Before patch:
        flags           page-count      MB  symbolic-flags                     long-symbolic-flags
0x0000000004000000           27326      106  __________________________g_________________       pgtable
After patch:
0x0000000004000000           27541      107  __________________________g_________________       pgtable

Link: https://lkml.kernel.org/r/876c71c03a7e69c17722a690e3225a4f7b172fb2.1695017383.git.baolin.wang@linux.alibaba.com
Signed-off-by: Baolin Wang <baolin.wang@linux.alibaba.com>
Acked-by: Mike Rapoport (IBM) <rppt@kernel.org>
Acked-by: Vishal Moola (Oracle) <vishal.moola@gmail.com>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: Huacai Chen <chenhuacai@kernel.org>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Nicholas Piggin <npiggin@gmail.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Bogendoerfer <tsbogend@alpha.franken.de>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Will Deacon <will@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 arch/arm64/include/asm/tlb.h         |  5 ++++-
 arch/loongarch/include/asm/pgalloc.h |  1 +
 arch/mips/include/asm/pgalloc.h      |  1 +
 arch/x86/mm/pgtable.c                |  3 +++
 include/asm-generic/pgalloc.h        |  7 ++++++-
 include/linux/mm.h                   | 16 ++++++++++++++++
 6 files changed, 31 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/arch/arm64/include/asm/tlb.h b/arch/arm64/include/asm/tlb.h
index 2c29239d05c3..846c563689a8 100644
--- a/arch/arm64/include/asm/tlb.h
+++ b/arch/arm64/include/asm/tlb.h
@@ -96,7 +96,10 @@ static inline void __pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmdp,
 static inline void __pud_free_tlb(struct mmu_gather *tlb, pud_t *pudp,
 				  unsigned long addr)
 {
-	tlb_remove_ptdesc(tlb, virt_to_ptdesc(pudp));
+	struct ptdesc *ptdesc = virt_to_ptdesc(pudp);
+
+	pagetable_pud_dtor(ptdesc);
+	tlb_remove_ptdesc(tlb, ptdesc);
 }
 #endif
 
diff --git a/arch/loongarch/include/asm/pgalloc.h b/arch/loongarch/include/asm/pgalloc.h
index 79470f0b4f1d..4e2d6b7ca2ee 100644
--- a/arch/loongarch/include/asm/pgalloc.h
+++ b/arch/loongarch/include/asm/pgalloc.h
@@ -84,6 +84,7 @@ static inline pud_t *pud_alloc_one(struct mm_struct *mm, unsigned long address)
 
 	if (!ptdesc)
 		return NULL;
+	pagetable_pud_ctor(ptdesc);
 	pud = ptdesc_address(ptdesc);
 
 	pud_init(pud);
diff --git a/arch/mips/include/asm/pgalloc.h b/arch/mips/include/asm/pgalloc.h
index 40e40a7eb94a..f4440edcd8fe 100644
--- a/arch/mips/include/asm/pgalloc.h
+++ b/arch/mips/include/asm/pgalloc.h
@@ -95,6 +95,7 @@ static inline pud_t *pud_alloc_one(struct mm_struct *mm, unsigned long address)
 
 	if (!ptdesc)
 		return NULL;
+	pagetable_pud_ctor(ptdesc);
 	pud = ptdesc_address(ptdesc);
 
 	pud_init(pud);
diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c
index 9deadf517f14..0cbc1b8e8e3d 100644
--- a/arch/x86/mm/pgtable.c
+++ b/arch/x86/mm/pgtable.c
@@ -76,6 +76,9 @@ void ___pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd)
 #if CONFIG_PGTABLE_LEVELS > 3
 void ___pud_free_tlb(struct mmu_gather *tlb, pud_t *pud)
 {
+	struct ptdesc *ptdesc = virt_to_ptdesc(pud);
+
+	pagetable_pud_dtor(ptdesc);
 	paravirt_release_pud(__pa(pud) >> PAGE_SHIFT);
 	paravirt_tlb_remove_table(tlb, virt_to_page(pud));
 }
diff --git a/include/asm-generic/pgalloc.h b/include/asm-generic/pgalloc.h
index c75d4a753849..879e5f8aa5e9 100644
--- a/include/asm-generic/pgalloc.h
+++ b/include/asm-generic/pgalloc.h
@@ -169,6 +169,8 @@ static inline pud_t *__pud_alloc_one(struct mm_struct *mm, unsigned long addr)
 	ptdesc = pagetable_alloc(gfp, 0);
 	if (!ptdesc)
 		return NULL;
+
+	pagetable_pud_ctor(ptdesc);
 	return ptdesc_address(ptdesc);
 }
 
@@ -190,8 +192,11 @@ static inline pud_t *pud_alloc_one(struct mm_struct *mm, unsigned long addr)
 
 static inline void __pud_free(struct mm_struct *mm, pud_t *pud)
 {
+	struct ptdesc *ptdesc = virt_to_ptdesc(pud);
+
 	BUG_ON((unsigned long)pud & (PAGE_SIZE-1));
-	pagetable_free(virt_to_ptdesc(pud));
+	pagetable_pud_dtor(ptdesc);
+	pagetable_free(ptdesc);
 }
 
 #ifndef __HAVE_ARCH_PUD_FREE
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 31dc25d3f6b5..126b54b45442 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -3049,6 +3049,22 @@ static inline spinlock_t *pud_lock(struct mm_struct *mm, pud_t *pud)
 	return ptl;
 }
 
+static inline void pagetable_pud_ctor(struct ptdesc *ptdesc)
+{
+	struct folio *folio = ptdesc_folio(ptdesc);
+
+	__folio_set_pgtable(folio);
+	lruvec_stat_add_folio(folio, NR_PAGETABLE);
+}
+
+static inline void pagetable_pud_dtor(struct ptdesc *ptdesc)
+{
+	struct folio *folio = ptdesc_folio(ptdesc);
+
+	__folio_clear_pgtable(folio);
+	lruvec_stat_sub_folio(folio, NR_PAGETABLE);
+}
+
 extern void __init pagecache_init(void);
 extern void free_initmem(void);
 
-- 
cgit v1.2.3


From 7ced098fcfe596feab3cea4f40128b0119c7bf1a Mon Sep 17 00:00:00 2001
From: Jann Horn <jannh@google.com>
Date: Mon, 18 Sep 2023 22:18:32 +0200
Subject: mm: document mmu_notifier_invalidate_range_start_nonblock()

Document what mmu_notifier_invalidate_range_start_nonblock() is for.  Also
add a __must_check annotation to signal that callers must bail out if a
notifier vetoes the operation.

Link: https://lkml.kernel.org/r/20230918201832.265108-1-jannh@google.com
Signed-off-by: Jann Horn <jannh@google.com>
Reviewed-by: Jason Gunthorpe <jgg@nvidia.com>
Reviewed-by: Alistair Popple <apopple@nvidia.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/mmu_notifier.h | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/mmu_notifier.h b/include/linux/mmu_notifier.h
index 6e3c857606f1..f349e08a9dfe 100644
--- a/include/linux/mmu_notifier.h
+++ b/include/linux/mmu_notifier.h
@@ -459,7 +459,14 @@ mmu_notifier_invalidate_range_start(struct mmu_notifier_range *range)
 	lock_map_release(&__mmu_notifier_invalidate_range_start_map);
 }
 
-static inline int
+/*
+ * This version of mmu_notifier_invalidate_range_start() avoids blocking, but it
+ * can return an error if a notifier can't proceed without blocking, in which
+ * case you're not allowed to modify PTEs in the specified range.
+ *
+ * This is mainly intended for OOM handling.
+ */
+static inline int __must_check
 mmu_notifier_invalidate_range_start_nonblock(struct mmu_notifier_range *range)
 {
 	int ret = 0;
-- 
cgit v1.2.3


From 24e41bf8a6b424c76c5902fb999e9eca61bdf83d Mon Sep 17 00:00:00 2001
From: Florent Revest <revest@chromium.org>
Date: Mon, 28 Aug 2023 17:08:57 +0200
Subject: mm: add a NO_INHERIT flag to the PR_SET_MDWE prctl

This extends the current PR_SET_MDWE prctl arg with a bit to indicate that
the process doesn't want MDWE protection to propagate to children.

To implement this no-inherit mode, the tag in current->mm->flags must be
absent from MMF_INIT_MASK.  This means that the encoding for "MDWE but
without inherit" is different in the prctl than in the mm flags.  This
leads to a bit of bit-mangling in the prctl implementation.

Link: https://lkml.kernel.org/r/20230828150858.393570-6-revest@chromium.org
Signed-off-by: Florent Revest <revest@chromium.org>
Reviewed-by: Kees Cook <keescook@chromium.org>
Reviewed-by: Catalin Marinas <catalin.marinas@arm.com>
Cc: Alexey Izbyshev <izbyshev@ispras.ru>
Cc: Anshuman Khandual <anshuman.khandual@arm.com>
Cc: Ayush Jain <ayush.jain3@amd.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: Greg Thelen <gthelen@google.com>
Cc: Joey Gouly <joey.gouly@arm.com>
Cc: KP Singh <kpsingh@kernel.org>
Cc: Mark Brown <broonie@kernel.org>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Peter Xu <peterx@redhat.com>
Cc: Ryan Roberts <ryan.roberts@arm.com>
Cc: Szabolcs Nagy <Szabolcs.Nagy@arm.com>
Cc: Topi Miettinen <toiwoton@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/sched/coredump.h   | 10 ++++++++++
 include/uapi/linux/prctl.h       |  1 +
 kernel/fork.c                    |  2 +-
 kernel/sys.c                     | 32 ++++++++++++++++++++++++++------
 tools/include/uapi/linux/prctl.h |  1 +
 5 files changed, 39 insertions(+), 7 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sched/coredump.h b/include/linux/sched/coredump.h
index 0ee96ea7a0e9..1b37fa8fc723 100644
--- a/include/linux/sched/coredump.h
+++ b/include/linux/sched/coredump.h
@@ -91,4 +91,14 @@ static inline int get_dumpable(struct mm_struct *mm)
 				 MMF_DISABLE_THP_MASK | MMF_HAS_MDWE_MASK)
 
 #define MMF_VM_MERGE_ANY	29
+#define MMF_HAS_MDWE_NO_INHERIT	30
+
+static inline unsigned long mmf_init_flags(unsigned long flags)
+{
+	if (flags & (1UL << MMF_HAS_MDWE_NO_INHERIT))
+		flags &= ~((1UL << MMF_HAS_MDWE) |
+			   (1UL << MMF_HAS_MDWE_NO_INHERIT));
+	return flags & MMF_INIT_MASK;
+}
+
 #endif /* _LINUX_SCHED_COREDUMP_H */
diff --git a/include/uapi/linux/prctl.h b/include/uapi/linux/prctl.h
index 9a85c69782bd..370ed14b1ae0 100644
--- a/include/uapi/linux/prctl.h
+++ b/include/uapi/linux/prctl.h
@@ -284,6 +284,7 @@ struct prctl_mm_map {
 /* Memory deny write / execute */
 #define PR_SET_MDWE			65
 # define PR_MDWE_REFUSE_EXEC_GAIN	(1UL << 0)
+# define PR_MDWE_NO_INHERIT		(1UL << 1)
 
 #define PR_GET_MDWE			66
 
diff --git a/kernel/fork.c b/kernel/fork.c
index 1779183a7cb3..e45a4457ba83 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1288,7 +1288,7 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p,
 	hugetlb_count_init(mm);
 
 	if (current->mm) {
-		mm->flags = current->mm->flags & MMF_INIT_MASK;
+		mm->flags = mmf_init_flags(current->mm->flags);
 		mm->def_flags = current->mm->def_flags & VM_INIT_DEF_MASK;
 	} else {
 		mm->flags = default_dump_filter;
diff --git a/kernel/sys.c b/kernel/sys.c
index 2410e3999ebe..4a8073c1b255 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -2368,19 +2368,41 @@ static int prctl_set_vma(unsigned long opt, unsigned long start,
 }
 #endif /* CONFIG_ANON_VMA_NAME */
 
+static inline unsigned long get_current_mdwe(void)
+{
+	unsigned long ret = 0;
+
+	if (test_bit(MMF_HAS_MDWE, &current->mm->flags))
+		ret |= PR_MDWE_REFUSE_EXEC_GAIN;
+	if (test_bit(MMF_HAS_MDWE_NO_INHERIT, &current->mm->flags))
+		ret |= PR_MDWE_NO_INHERIT;
+
+	return ret;
+}
+
 static inline int prctl_set_mdwe(unsigned long bits, unsigned long arg3,
 				 unsigned long arg4, unsigned long arg5)
 {
+	unsigned long current_bits;
+
 	if (arg3 || arg4 || arg5)
 		return -EINVAL;
 
-	if (bits & ~(PR_MDWE_REFUSE_EXEC_GAIN))
+	if (bits & ~(PR_MDWE_REFUSE_EXEC_GAIN | PR_MDWE_NO_INHERIT))
+		return -EINVAL;
+
+	/* NO_INHERIT only makes sense with REFUSE_EXEC_GAIN */
+	if (bits & PR_MDWE_NO_INHERIT && !(bits & PR_MDWE_REFUSE_EXEC_GAIN))
 		return -EINVAL;
 
+	current_bits = get_current_mdwe();
+	if (current_bits && current_bits != bits)
+		return -EPERM; /* Cannot unset the flags */
+
+	if (bits & PR_MDWE_NO_INHERIT)
+		set_bit(MMF_HAS_MDWE_NO_INHERIT, &current->mm->flags);
 	if (bits & PR_MDWE_REFUSE_EXEC_GAIN)
 		set_bit(MMF_HAS_MDWE, &current->mm->flags);
-	else if (test_bit(MMF_HAS_MDWE, &current->mm->flags))
-		return -EPERM; /* Cannot unset the flag */
 
 	return 0;
 }
@@ -2390,9 +2412,7 @@ static inline int prctl_get_mdwe(unsigned long arg2, unsigned long arg3,
 {
 	if (arg2 || arg3 || arg4 || arg5)
 		return -EINVAL;
-
-	return test_bit(MMF_HAS_MDWE, &current->mm->flags) ?
-		PR_MDWE_REFUSE_EXEC_GAIN : 0;
+	return get_current_mdwe();
 }
 
 static int prctl_get_auxv(void __user *addr, unsigned long len)
diff --git a/tools/include/uapi/linux/prctl.h b/tools/include/uapi/linux/prctl.h
index 9a85c69782bd..370ed14b1ae0 100644
--- a/tools/include/uapi/linux/prctl.h
+++ b/tools/include/uapi/linux/prctl.h
@@ -284,6 +284,7 @@ struct prctl_mm_map {
 /* Memory deny write / execute */
 #define PR_SET_MDWE			65
 # define PR_MDWE_REFUSE_EXEC_GAIN	(1UL << 0)
+# define PR_MDWE_NO_INHERIT		(1UL << 1)
 
 #define PR_GET_MDWE			66
 
-- 
cgit v1.2.3


From 7e845ecb2fbfa1bf800e703df29ee2e06592c2a0 Mon Sep 17 00:00:00 2001
From: Sui Jingfeng <suijingfeng@loongson.cn>
Date: Wed, 30 Aug 2023 19:15:28 +0800
Subject: PCI: Add pci_is_vga() helper

The PCI Code and ID Assignment spec, r1.15, secs 1.4 and 1.1, define
VGA Base Class and Sub-Classes:

  03 00  PCI_CLASS_DISPLAY_VGA      VGA-compatible or 8514-compatible
  00 01  PCI_CLASS_NOT_DEFINED_VGA  VGA-compatible (before Class Code)

Add a pci_is_vga() helper to return true if a device is in either category.

These VGA devices use the hardwired legacy VGA resources
([mem 0xa0000-0xbffff], [io 0x3b0-0x3bb], [io 0x3c0-0x3df] and aliases), so
they require special handling if more than one is present in the system.

Link: https://lore.kernel.org/r/20230830111532.444535-2-sui.jingfeng@linux.dev
Signed-off-by: Sui Jingfeng <suijingfeng@loongson.cn>
[bhelgaas: commit log, drop !pdev test]
Signed-off-by: Bjorn Helgaas <bhelgaas@google.com>
Cc: "Maciej W. Rozycki" <macro@orcam.me.uk>
---
 include/linux/pci.h | 24 ++++++++++++++++++++++++
 1 file changed, 24 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/pci.h b/include/linux/pci.h
index 8c7c2c3c6c65..7bab234391cb 100644
--- a/include/linux/pci.h
+++ b/include/linux/pci.h
@@ -713,6 +713,30 @@ static inline bool pci_is_bridge(struct pci_dev *dev)
 		dev->hdr_type == PCI_HEADER_TYPE_CARDBUS;
 }
 
+/**
+ * pci_is_vga - check if the PCI device is a VGA device
+ *
+ * The PCI Code and ID Assignment spec, r1.15, secs 1.4 and 1.1, define
+ * VGA Base Class and Sub-Classes:
+ *
+ *   03 00  PCI_CLASS_DISPLAY_VGA      VGA-compatible or 8514-compatible
+ *   00 01  PCI_CLASS_NOT_DEFINED_VGA  VGA-compatible (before Class Code)
+ *
+ * Return true if the PCI device is a VGA device and uses the legacy VGA
+ * resources ([mem 0xa0000-0xbffff], [io 0x3b0-0x3bb], [io 0x3c0-0x3df] and
+ * aliases).
+ */
+static inline bool pci_is_vga(struct pci_dev *pdev)
+{
+	if ((pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA)
+		return true;
+
+	if ((pdev->class >> 8) == PCI_CLASS_NOT_DEFINED_VGA)
+		return true;
+
+	return false;
+}
+
 #define for_each_pci_bridge(dev, bus)				\
 	list_for_each_entry(dev, &bus->devices, bus_list)	\
 		if (!pci_is_bridge(dev)) {} else
-- 
cgit v1.2.3


From 3abbd0699b678fc48e0100704338cff9180fe4bb Mon Sep 17 00:00:00 2001
From: Giulio Benetti <giulio.benetti+tekvox@benettiengineering.com>
Date: Thu, 5 Oct 2023 20:29:15 +0200
Subject: net: phy: broadcom: add support for BCM5221 phy

This patch adds the BCM5221 PHY support by reusing brcm_fet_*()
callbacks and adding quirks for BCM5221 when needed.

Cc: Jim Reinhart <jimr@tekvox.com>
Cc: James Autry <jautry@tekvox.com>
Cc: Matthew Maron <matthewm@tekvox.com>
Signed-off-by: Giulio Benetti <giulio.benetti+tekvox@benettiengineering.com>
Reviewed-by: Florian Fainelli <florian.fainelli@broadcom.com>
Link: https://lore.kernel.org/r/20231005182915.153815-1-giulio.benetti@benettiengineering.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 drivers/net/phy/broadcom.c | 154 +++++++++++++++++++++++++++++++++++----------
 include/linux/brcmphy.h    |  10 +++
 2 files changed, 131 insertions(+), 33 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/phy/broadcom.c b/drivers/net/phy/broadcom.c
index 04b2e6eeb195..3a627105675a 100644
--- a/drivers/net/phy/broadcom.c
+++ b/drivers/net/phy/broadcom.c
@@ -704,16 +704,21 @@ static int brcm_fet_config_init(struct phy_device *phydev)
 	if (err < 0 && err != -EIO)
 		return err;
 
+	/* Read to clear status bits */
 	reg = phy_read(phydev, MII_BRCM_FET_INTREG);
 	if (reg < 0)
 		return reg;
 
 	/* Unmask events we are interested in and mask interrupts globally. */
-	reg = MII_BRCM_FET_IR_DUPLEX_EN |
-	      MII_BRCM_FET_IR_SPEED_EN |
-	      MII_BRCM_FET_IR_LINK_EN |
-	      MII_BRCM_FET_IR_ENABLE |
-	      MII_BRCM_FET_IR_MASK;
+	if (phydev->phy_id == PHY_ID_BCM5221)
+		reg = MII_BRCM_FET_IR_ENABLE |
+		      MII_BRCM_FET_IR_MASK;
+	else
+		reg = MII_BRCM_FET_IR_DUPLEX_EN |
+		      MII_BRCM_FET_IR_SPEED_EN |
+		      MII_BRCM_FET_IR_LINK_EN |
+		      MII_BRCM_FET_IR_ENABLE |
+		      MII_BRCM_FET_IR_MASK;
 
 	err = phy_write(phydev, MII_BRCM_FET_INTREG, reg);
 	if (err < 0)
@@ -726,42 +731,49 @@ static int brcm_fet_config_init(struct phy_device *phydev)
 
 	reg = brcmtest | MII_BRCM_FET_BT_SRE;
 
-	err = phy_write(phydev, MII_BRCM_FET_BRCMTEST, reg);
-	if (err < 0)
-		return err;
+	phy_lock_mdio_bus(phydev);
 
-	/* Set the LED mode */
-	reg = phy_read(phydev, MII_BRCM_FET_SHDW_AUXMODE4);
-	if (reg < 0) {
-		err = reg;
-		goto done;
+	err = __phy_write(phydev, MII_BRCM_FET_BRCMTEST, reg);
+	if (err < 0) {
+		phy_unlock_mdio_bus(phydev);
+		return err;
 	}
 
-	reg &= ~MII_BRCM_FET_SHDW_AM4_LED_MASK;
-	reg |= MII_BRCM_FET_SHDW_AM4_LED_MODE1;
+	if (phydev->phy_id != PHY_ID_BCM5221) {
+		/* Set the LED mode */
+		reg = __phy_read(phydev, MII_BRCM_FET_SHDW_AUXMODE4);
+		if (reg < 0) {
+			err = reg;
+			goto done;
+		}
 
-	err = phy_write(phydev, MII_BRCM_FET_SHDW_AUXMODE4, reg);
-	if (err < 0)
-		goto done;
+		err = __phy_modify(phydev, MII_BRCM_FET_SHDW_AUXMODE4,
+				   MII_BRCM_FET_SHDW_AM4_LED_MASK,
+				   MII_BRCM_FET_SHDW_AM4_LED_MODE1);
+		if (err < 0)
+			goto done;
 
-	/* Enable auto MDIX */
-	err = phy_set_bits(phydev, MII_BRCM_FET_SHDW_MISCCTRL,
-			   MII_BRCM_FET_SHDW_MC_FAME);
-	if (err < 0)
-		goto done;
+		/* Enable auto MDIX */
+		err = __phy_set_bits(phydev, MII_BRCM_FET_SHDW_MISCCTRL,
+				     MII_BRCM_FET_SHDW_MC_FAME);
+		if (err < 0)
+			goto done;
+	}
 
 	if (phydev->dev_flags & PHY_BRCM_AUTO_PWRDWN_ENABLE) {
 		/* Enable auto power down */
-		err = phy_set_bits(phydev, MII_BRCM_FET_SHDW_AUXSTAT2,
-				   MII_BRCM_FET_SHDW_AS2_APDE);
+		err = __phy_set_bits(phydev, MII_BRCM_FET_SHDW_AUXSTAT2,
+				     MII_BRCM_FET_SHDW_AS2_APDE);
 	}
 
 done:
 	/* Disable shadow register access */
-	err2 = phy_write(phydev, MII_BRCM_FET_BRCMTEST, brcmtest);
+	err2 = __phy_write(phydev, MII_BRCM_FET_BRCMTEST, brcmtest);
 	if (!err)
 		err = err2;
 
+	phy_unlock_mdio_bus(phydev);
+
 	return err;
 }
 
@@ -840,23 +852,86 @@ static int brcm_fet_suspend(struct phy_device *phydev)
 
 	reg = brcmtest | MII_BRCM_FET_BT_SRE;
 
-	err = phy_write(phydev, MII_BRCM_FET_BRCMTEST, reg);
-	if (err < 0)
+	phy_lock_mdio_bus(phydev);
+
+	err = __phy_write(phydev, MII_BRCM_FET_BRCMTEST, reg);
+	if (err < 0) {
+		phy_unlock_mdio_bus(phydev);
 		return err;
+	}
+
+	if (phydev->phy_id == PHY_ID_BCM5221)
+		/* Force Low Power Mode with clock enabled */
+		reg = BCM5221_SHDW_AM4_EN_CLK_LPM | BCM5221_SHDW_AM4_FORCE_LPM;
+	else
+		/* Set standby mode */
+		reg = MII_BRCM_FET_SHDW_AM4_STANDBY;
 
-	/* Set standby mode */
-	err = phy_modify(phydev, MII_BRCM_FET_SHDW_AUXMODE4,
-			 MII_BRCM_FET_SHDW_AM4_STANDBY,
-			 MII_BRCM_FET_SHDW_AM4_STANDBY);
+	err = __phy_set_bits(phydev, MII_BRCM_FET_SHDW_AUXMODE4, reg);
 
 	/* Disable shadow register access */
-	err2 = phy_write(phydev, MII_BRCM_FET_BRCMTEST, brcmtest);
+	err2 = __phy_write(phydev, MII_BRCM_FET_BRCMTEST, brcmtest);
 	if (!err)
 		err = err2;
 
+	phy_unlock_mdio_bus(phydev);
+
 	return err;
 }
 
+static int bcm5221_config_aneg(struct phy_device *phydev)
+{
+	int ret, val;
+
+	ret = genphy_config_aneg(phydev);
+	if (ret)
+		return ret;
+
+	switch (phydev->mdix_ctrl) {
+	case ETH_TP_MDI:
+		val = BCM5221_AEGSR_MDIX_DIS;
+		break;
+	case ETH_TP_MDI_X:
+		val = BCM5221_AEGSR_MDIX_DIS | BCM5221_AEGSR_MDIX_MAN_SWAP;
+		break;
+	case ETH_TP_MDI_AUTO:
+		val = 0;
+		break;
+	default:
+		return 0;
+	}
+
+	return phy_modify(phydev, BCM5221_AEGSR, BCM5221_AEGSR_MDIX_MAN_SWAP |
+						 BCM5221_AEGSR_MDIX_DIS,
+						 val);
+}
+
+static int bcm5221_read_status(struct phy_device *phydev)
+{
+	int ret;
+
+	/* Read MDIX status */
+	ret = phy_read(phydev, BCM5221_AEGSR);
+	if (ret < 0)
+		return ret;
+
+	if (ret & BCM5221_AEGSR_MDIX_DIS) {
+		if (ret & BCM5221_AEGSR_MDIX_MAN_SWAP)
+			phydev->mdix_ctrl = ETH_TP_MDI_X;
+		else
+			phydev->mdix_ctrl = ETH_TP_MDI;
+	} else {
+		phydev->mdix_ctrl = ETH_TP_MDI_AUTO;
+	}
+
+	if (ret & BCM5221_AEGSR_MDIX_STATUS)
+		phydev->mdix = ETH_TP_MDI_X;
+	else
+		phydev->mdix = ETH_TP_MDI;
+
+	return genphy_read_status(phydev);
+}
+
 static void bcm54xx_phy_get_wol(struct phy_device *phydev,
 				struct ethtool_wolinfo *wol)
 {
@@ -1221,6 +1296,18 @@ static struct phy_driver broadcom_drivers[] = {
 	.handle_interrupt = brcm_fet_handle_interrupt,
 	.suspend	= brcm_fet_suspend,
 	.resume		= brcm_fet_config_init,
+}, {
+	.phy_id		= PHY_ID_BCM5221,
+	.phy_id_mask	= 0xfffffff0,
+	.name		= "Broadcom BCM5221",
+	/* PHY_BASIC_FEATURES */
+	.config_init	= brcm_fet_config_init,
+	.config_intr	= brcm_fet_config_intr,
+	.handle_interrupt = brcm_fet_handle_interrupt,
+	.suspend	= brcm_fet_suspend,
+	.resume		= brcm_fet_config_init,
+	.config_aneg	= bcm5221_config_aneg,
+	.read_status	= bcm5221_read_status,
 }, {
 	.phy_id		= PHY_ID_BCM5395,
 	.phy_id_mask	= 0xfffffff0,
@@ -1296,6 +1383,7 @@ static struct mdio_device_id __maybe_unused broadcom_tbl[] = {
 	{ PHY_ID_BCM50610M, 0xfffffff0 },
 	{ PHY_ID_BCM57780, 0xfffffff0 },
 	{ PHY_ID_BCMAC131, 0xfffffff0 },
+	{ PHY_ID_BCM5221, 0xfffffff0 },
 	{ PHY_ID_BCM5241, 0xfffffff0 },
 	{ PHY_ID_BCM5395, 0xfffffff0 },
 	{ PHY_ID_BCM53125, 0xfffffff0 },
diff --git a/include/linux/brcmphy.h b/include/linux/brcmphy.h
index c55810a43541..1394ba302367 100644
--- a/include/linux/brcmphy.h
+++ b/include/linux/brcmphy.h
@@ -11,6 +11,7 @@
 
 #define PHY_ID_BCM50610			0x0143bd60
 #define PHY_ID_BCM50610M		0x0143bd70
+#define PHY_ID_BCM5221			0x004061e0
 #define PHY_ID_BCM5241			0x0143bc30
 #define PHY_ID_BCMAC131			0x0143bc70
 #define PHY_ID_BCM5481			0x0143bca0
@@ -331,6 +332,15 @@
 
 #define BCM54XX_WOL_INT_STATUS		(MII_BCM54XX_EXP_SEL_WOL + 0x94)
 
+/* BCM5221 Registers */
+#define BCM5221_AEGSR			0x1C
+#define BCM5221_AEGSR_MDIX_STATUS	BIT(13)
+#define BCM5221_AEGSR_MDIX_MAN_SWAP	BIT(12)
+#define BCM5221_AEGSR_MDIX_DIS		BIT(11)
+
+#define BCM5221_SHDW_AM4_EN_CLK_LPM	BIT(2)
+#define BCM5221_SHDW_AM4_FORCE_LPM	BIT(1)
+
 /*****************************************************************************/
 /* Fast Ethernet Transceiver definitions. */
 /*****************************************************************************/
-- 
cgit v1.2.3


From 77bbfe607b1d306c88bf96fed00c030f6bf462f1 Mon Sep 17 00:00:00 2001
From: Peng Fan <peng.fan@nxp.com>
Date: Wed, 4 Oct 2023 07:42:23 +0800
Subject: firmware: arm_scmi: Add support for clock parents

SCMI v3.2 spec introduces CLOCK_POSSIBLE_PARENTS_GET, CLOCK_PARENT_SET
and CLOCK_PARENT_GET. Add support for these to enable clock parents
and use them in the clock driver.

Reviewed-by: Cristian Marussi <cristian.marussi@arm.com>
Signed-off-by: Peng Fan <peng.fan@nxp.com>
Link: https://lore.kernel.org/r/20231004-scmi-clock-v3-v5-1-1b8a1435673e@nxp.com
Signed-off-by: Sudeep Holla <sudeep.holla@arm.com>
---
 drivers/firmware/arm_scmi/clock.c | 181 ++++++++++++++++++++++++++++++++++++--
 include/linux/scmi_protocol.h     |   6 ++
 2 files changed, 181 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/firmware/arm_scmi/clock.c b/drivers/firmware/arm_scmi/clock.c
index 9c0e33c1efab..42b81c181d68 100644
--- a/drivers/firmware/arm_scmi/clock.c
+++ b/drivers/firmware/arm_scmi/clock.c
@@ -22,6 +22,9 @@ enum scmi_clock_protocol_cmd {
 	CLOCK_RATE_NOTIFY = 0x9,
 	CLOCK_RATE_CHANGE_REQUESTED_NOTIFY = 0xA,
 	CLOCK_CONFIG_GET = 0xB,
+	CLOCK_POSSIBLE_PARENTS_GET = 0xC,
+	CLOCK_PARENT_SET = 0xD,
+	CLOCK_PARENT_GET = 0xE,
 };
 
 enum clk_state {
@@ -42,10 +45,28 @@ struct scmi_msg_resp_clock_attributes {
 #define SUPPORTS_RATE_CHANGED_NOTIF(x)		((x) & BIT(31))
 #define SUPPORTS_RATE_CHANGE_REQUESTED_NOTIF(x)	((x) & BIT(30))
 #define SUPPORTS_EXTENDED_NAMES(x)		((x) & BIT(29))
+#define SUPPORTS_PARENT_CLOCK(x)		((x) & BIT(28))
 	u8 name[SCMI_SHORT_NAME_MAX_SIZE];
 	__le32 clock_enable_latency;
 };
 
+struct scmi_msg_clock_possible_parents {
+	__le32 id;
+	__le32 skip_parents;
+};
+
+struct scmi_msg_resp_clock_possible_parents {
+	__le32 num_parent_flags;
+#define NUM_PARENTS_RETURNED(x)		((x) & 0xff)
+#define NUM_PARENTS_REMAINING(x)	((x) >> 24)
+	__le32 possible_parents[];
+};
+
+struct scmi_msg_clock_set_parent {
+	__le32 id;
+	__le32 parent_id;
+};
+
 struct scmi_msg_clock_config_set {
 	__le32 id;
 	__le32 attributes;
@@ -168,6 +189,98 @@ scmi_clock_protocol_attributes_get(const struct scmi_protocol_handle *ph,
 	return ret;
 }
 
+struct scmi_clk_ipriv {
+	struct device *dev;
+	u32 clk_id;
+	struct scmi_clock_info *clk;
+};
+
+static void iter_clk_possible_parents_prepare_message(void *message, unsigned int desc_index,
+						      const void *priv)
+{
+	struct scmi_msg_clock_possible_parents *msg = message;
+	const struct scmi_clk_ipriv *p = priv;
+
+	msg->id = cpu_to_le32(p->clk_id);
+	/* Set the number of OPPs to be skipped/already read */
+	msg->skip_parents = cpu_to_le32(desc_index);
+}
+
+static int iter_clk_possible_parents_update_state(struct scmi_iterator_state *st,
+						  const void *response, void *priv)
+{
+	const struct scmi_msg_resp_clock_possible_parents *r = response;
+	struct scmi_clk_ipriv *p = priv;
+	struct device *dev = ((struct scmi_clk_ipriv *)p)->dev;
+	u32 flags;
+
+	flags = le32_to_cpu(r->num_parent_flags);
+	st->num_returned = NUM_PARENTS_RETURNED(flags);
+	st->num_remaining = NUM_PARENTS_REMAINING(flags);
+
+	/*
+	 * num parents is not declared previously anywhere so we
+	 * assume it's returned+remaining on first call.
+	 */
+	if (!st->max_resources) {
+		p->clk->num_parents = st->num_returned + st->num_remaining;
+		p->clk->parents = devm_kcalloc(dev, p->clk->num_parents,
+					       sizeof(*p->clk->parents),
+					       GFP_KERNEL);
+		if (!p->clk->parents) {
+			p->clk->num_parents = 0;
+			return -ENOMEM;
+		}
+		st->max_resources = st->num_returned + st->num_remaining;
+	}
+
+	return 0;
+}
+
+static int iter_clk_possible_parents_process_response(const struct scmi_protocol_handle *ph,
+						      const void *response,
+						      struct scmi_iterator_state *st,
+						      void *priv)
+{
+	const struct scmi_msg_resp_clock_possible_parents *r = response;
+	struct scmi_clk_ipriv *p = priv;
+
+	u32 *parent = &p->clk->parents[st->desc_index + st->loop_idx];
+
+	*parent = le32_to_cpu(r->possible_parents[st->loop_idx]);
+
+	return 0;
+}
+
+static int scmi_clock_possible_parents(const struct scmi_protocol_handle *ph, u32 clk_id,
+				       struct scmi_clock_info *clk)
+{
+	struct scmi_iterator_ops ops = {
+		.prepare_message = iter_clk_possible_parents_prepare_message,
+		.update_state = iter_clk_possible_parents_update_state,
+		.process_response = iter_clk_possible_parents_process_response,
+	};
+
+	struct scmi_clk_ipriv ppriv = {
+		.clk_id = clk_id,
+		.clk = clk,
+		.dev = ph->dev,
+	};
+	void *iter;
+	int ret;
+
+	iter = ph->hops->iter_response_init(ph, &ops, 0,
+					    CLOCK_POSSIBLE_PARENTS_GET,
+					    sizeof(struct scmi_msg_clock_possible_parents),
+					    &ppriv);
+	if (IS_ERR(iter))
+		return PTR_ERR(iter);
+
+	ret = ph->hops->iter_response_run(iter);
+
+	return ret;
+}
+
 static int scmi_clock_attributes_get(const struct scmi_protocol_handle *ph,
 				     u32 clk_id, struct scmi_clock_info *clk,
 				     u32 version)
@@ -212,6 +325,8 @@ static int scmi_clock_attributes_get(const struct scmi_protocol_handle *ph,
 			clk->rate_changed_notifications = true;
 		if (SUPPORTS_RATE_CHANGE_REQUESTED_NOTIF(attributes))
 			clk->rate_change_requested_notifications = true;
+		if (SUPPORTS_PARENT_CLOCK(attributes))
+			scmi_clock_possible_parents(ph, clk_id, clk);
 	}
 
 	return ret;
@@ -229,12 +344,6 @@ static int rate_cmp_func(const void *_r1, const void *_r2)
 		return 1;
 }
 
-struct scmi_clk_ipriv {
-	struct device *dev;
-	u32 clk_id;
-	struct scmi_clock_info *clk;
-};
-
 static void iter_clk_describe_prepare_message(void *message,
 					      const unsigned int desc_index,
 					      const void *priv)
@@ -458,6 +567,64 @@ scmi_clock_config_set(const struct scmi_protocol_handle *ph, u32 clk_id,
 	return ret;
 }
 
+static int
+scmi_clock_set_parent(const struct scmi_protocol_handle *ph, u32 clk_id,
+		      u32 parent_id)
+{
+	int ret;
+	struct scmi_xfer *t;
+	struct scmi_msg_clock_set_parent *cfg;
+	struct clock_info *ci = ph->get_priv(ph);
+	struct scmi_clock_info *clk;
+
+	if (clk_id >= ci->num_clocks)
+		return -EINVAL;
+
+	clk = ci->clk + clk_id;
+
+	if (parent_id >= clk->num_parents)
+		return -EINVAL;
+
+	ret = ph->xops->xfer_get_init(ph, CLOCK_PARENT_SET,
+				      sizeof(*cfg), 0, &t);
+	if (ret)
+		return ret;
+
+	t->hdr.poll_completion = false;
+
+	cfg = t->tx.buf;
+	cfg->id = cpu_to_le32(clk_id);
+	cfg->parent_id = cpu_to_le32(clk->parents[parent_id]);
+
+	ret = ph->xops->do_xfer(ph, t);
+
+	ph->xops->xfer_put(ph, t);
+
+	return ret;
+}
+
+static int
+scmi_clock_get_parent(const struct scmi_protocol_handle *ph, u32 clk_id,
+		      u32 *parent_id)
+{
+	int ret;
+	struct scmi_xfer *t;
+
+	ret = ph->xops->xfer_get_init(ph, CLOCK_PARENT_GET,
+				      sizeof(__le32), sizeof(u32), &t);
+	if (ret)
+		return ret;
+
+	put_unaligned_le32(clk_id, t->tx.buf);
+
+	ret = ph->xops->do_xfer(ph, t);
+	if (!ret)
+		*parent_id = get_unaligned_le32(t->rx.buf);
+
+	ph->xops->xfer_put(ph, t);
+	return ret;
+}
+
 /* For SCMI clock v2.1 and onwards */
 static int
 scmi_clock_config_set_v2(const struct scmi_protocol_handle *ph, u32 clk_id,
@@ -650,6 +817,8 @@ static const struct scmi_clk_proto_ops clk_proto_ops = {
 	.state_get = scmi_clock_state_get,
 	.config_oem_get = scmi_clock_config_oem_get,
 	.config_oem_set = scmi_clock_config_oem_set,
+	.parent_set = scmi_clock_set_parent,
+	.parent_get = scmi_clock_get_parent,
 };
 
 static int scmi_clk_rate_notify(const struct scmi_protocol_handle *ph,
diff --git a/include/linux/scmi_protocol.h b/include/linux/scmi_protocol.h
index 27bfa5a65b45..f2f05fb42d28 100644
--- a/include/linux/scmi_protocol.h
+++ b/include/linux/scmi_protocol.h
@@ -58,6 +58,8 @@ struct scmi_clock_info {
 			u64 step_size;
 		} range;
 	};
+	int num_parents;
+	u32 *parents;
 };
 
 enum scmi_power_scale {
@@ -83,6 +85,8 @@ struct scmi_protocol_handle;
  * @state_get: get the status of the specified clock
  * @config_oem_get: get the value of an OEM specific clock config
  * @config_oem_set: set the value of an OEM specific clock config
+ * @parent_get: get the parent id of a clk
+ * @parent_set: set the parent of a clock
  */
 struct scmi_clk_proto_ops {
 	int (*count_get)(const struct scmi_protocol_handle *ph);
@@ -104,6 +108,8 @@ struct scmi_clk_proto_ops {
 			      bool atomic);
 	int (*config_oem_set)(const struct scmi_protocol_handle *ph, u32 clk_id,
 			      u8 oem_type, u32 oem_val, bool atomic);
+	int (*parent_get)(const struct scmi_protocol_handle *ph, u32 clk_id, u32 *parent_id);
+	int (*parent_set)(const struct scmi_protocol_handle *ph, u32 clk_id, u32 parent_id);
 };
 
 struct scmi_perf_domain_info {
-- 
cgit v1.2.3


From 76cf932c95b9e7c07b065b5c71e56957e2826ae2 Mon Sep 17 00:00:00 2001
From: Sudeep Holla <sudeep.holla@arm.com>
Date: Thu, 5 Oct 2023 15:45:07 +0100
Subject: KVM: arm64: FFA: Remove access of endpoint memory access descriptor
 array

FF-A v1.1 removes the fixed location of endpoint memory access descriptor
array within the memory transaction descriptor structure. In preparation
to remove the ep_mem_access member from the ffa_mem_region structure,
provide the accessor to fetch the offset and use the same in FF-A proxy
implementation.

The accessor take the FF-A version as the argument from which the memory
access descriptor format can be determined. v1.0 uses the old format while
v1.1 onwards use the new format specified in the v1.1 specification.

Cc: Oliver Upton <oliver.upton@linux.dev>
Cc: Will Deacon <will@kernel.org>
Cc: Quentin Perret <qperret@google.com>
Acked-by: Marc Zyngier <maz@kernel.org>
Link: https://lore.kernel.org/r/20231005-ffa_v1-1_notif-v4-14-cddd3237809c@arm.com
Signed-off-by: Sudeep Holla <sudeep.holla@arm.com>
---
 arch/arm64/kvm/hyp/nvhe/ffa.c | 10 ++++++++--
 include/linux/arm_ffa.h       |  6 ++++++
 2 files changed, 14 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/arch/arm64/kvm/hyp/nvhe/ffa.c b/arch/arm64/kvm/hyp/nvhe/ffa.c
index ab4f5d160c58..5c7b345c2cd5 100644
--- a/arch/arm64/kvm/hyp/nvhe/ffa.c
+++ b/arch/arm64/kvm/hyp/nvhe/ffa.c
@@ -423,6 +423,7 @@ static __always_inline void do_ffa_mem_xfer(const u64 func_id,
 	DECLARE_REG(u32, fraglen, ctxt, 2);
 	DECLARE_REG(u64, addr_mbz, ctxt, 3);
 	DECLARE_REG(u32, npages_mbz, ctxt, 4);
+	struct ffa_mem_region_attributes *ep_mem_access;
 	struct ffa_composite_mem_region *reg;
 	struct ffa_mem_region *buf;
 	u32 offset, nr_ranges;
@@ -452,7 +453,9 @@ static __always_inline void do_ffa_mem_xfer(const u64 func_id,
 	buf = hyp_buffers.tx;
 	memcpy(buf, host_buffers.tx, fraglen);
 
-	offset = buf->ep_mem_access[0].composite_off;
+	ep_mem_access = (void *)buf +
+			ffa_mem_desc_offset(buf, 0, FFA_VERSION_1_0);
+	offset = ep_mem_access->composite_off;
 	if (!offset || buf->ep_count != 1 || buf->sender_id != HOST_FFA_ID) {
 		ret = FFA_RET_INVALID_PARAMETERS;
 		goto out_unlock;
@@ -504,6 +507,7 @@ static void do_ffa_mem_reclaim(struct arm_smccc_res *res,
 	DECLARE_REG(u32, handle_lo, ctxt, 1);
 	DECLARE_REG(u32, handle_hi, ctxt, 2);
 	DECLARE_REG(u32, flags, ctxt, 3);
+	struct ffa_mem_region_attributes *ep_mem_access;
 	struct ffa_composite_mem_region *reg;
 	u32 offset, len, fraglen, fragoff;
 	struct ffa_mem_region *buf;
@@ -528,7 +532,9 @@ static void do_ffa_mem_reclaim(struct arm_smccc_res *res,
 	len = res->a1;
 	fraglen = res->a2;
 
-	offset = buf->ep_mem_access[0].composite_off;
+	ep_mem_access = (void *)buf +
+			ffa_mem_desc_offset(buf, 0, FFA_VERSION_1_0);
+	offset = ep_mem_access->composite_off;
 	/*
 	 * We can trust the SPMD to get this right, but let's at least
 	 * check that we end up with something that doesn't look _completely_
diff --git a/include/linux/arm_ffa.h b/include/linux/arm_ffa.h
index 748d0a83a4bc..2444d596b703 100644
--- a/include/linux/arm_ffa.h
+++ b/include/linux/arm_ffa.h
@@ -357,6 +357,12 @@ struct ffa_mem_region {
 #define CONSTITUENTS_OFFSET(x)	\
 	(offsetof(struct ffa_composite_mem_region, constituents[x]))
 
+static inline u32
+ffa_mem_desc_offset(struct ffa_mem_region *buf, int count, u32 ffa_version)
+{
+	return COMPOSITE_OFFSET(0);
+}
+
 struct ffa_mem_ops_args {
 	bool use_txbuf;
 	u32 nattrs;
-- 
cgit v1.2.3


From 113580530ee7dc61e668b641d657920734533b9f Mon Sep 17 00:00:00 2001
From: Sudeep Holla <sudeep.holla@arm.com>
Date: Thu, 5 Oct 2023 15:45:09 +0100
Subject: firmware: arm_ffa: Update memory descriptor to support v1.1 format

Update memory transaction descriptor structure to accommodate couple of
new entries in v1.1 which were previously marked reserved and MBZ(must
be zero).

It also removes the flexible array member ep_mem_access in the memory
transaction descriptor structure as it need not be at fixed offset.
Also update ffa_mem_desc_offset() accessor to handle both old and new
formats of memory transaction descriptors.

The updated ffa_mem_region structure aligns with new format in v1.1 and
hence the driver/user must take care not to use members beyond and
including ep_mem_offset when using the old format.

Link: https://lore.kernel.org/r/20231005-ffa_v1-1_notif-v4-16-cddd3237809c@arm.com
Signed-off-by: Sudeep Holla <sudeep.holla@arm.com>
---
 drivers/firmware/arm_ffa/driver.c | 11 ++++++++---
 include/linux/arm_ffa.h           | 33 ++++++++++++++++++++++-----------
 2 files changed, 30 insertions(+), 14 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/firmware/arm_ffa/driver.c b/drivers/firmware/arm_ffa/driver.c
index c79067201487..6c5c7926b8ee 100644
--- a/drivers/firmware/arm_ffa/driver.c
+++ b/drivers/firmware/arm_ffa/driver.c
@@ -423,7 +423,7 @@ static u32 ffa_get_num_pages_sg(struct scatterlist *sg)
 	return num_pages;
 }
 
-static u8 ffa_memory_attributes_get(u32 func_id)
+static u16 ffa_memory_attributes_get(u32 func_id)
 {
 	/*
 	 * For the memory lend or donate operation, if the receiver is a PE or
@@ -467,9 +467,14 @@ ffa_setup_and_transmit(u32 func_id, void *buffer, u32 max_fragsize,
 		ep_mem_access->reserved = 0;
 	}
 	mem_region->handle = 0;
-	mem_region->reserved_0 = 0;
-	mem_region->reserved_1 = 0;
 	mem_region->ep_count = args->nattrs;
+	if (drv_info->version <= FFA_VERSION_1_0) {
+		mem_region->ep_mem_size = 0;
+	} else {
+		mem_region->ep_mem_size = sizeof(*ep_mem_access);
+		mem_region->ep_mem_offset = sizeof(*mem_region);
+		memset(mem_region->reserved, 0, 12);
+	}
 
 	composite = buffer + composite_offset;
 	composite->total_pg_cnt = ffa_get_num_pages_sg(args->sg);
diff --git a/include/linux/arm_ffa.h b/include/linux/arm_ffa.h
index 2444d596b703..1abedb5b2e48 100644
--- a/include/linux/arm_ffa.h
+++ b/include/linux/arm_ffa.h
@@ -6,6 +6,7 @@
 #ifndef _LINUX_ARM_FFA_H
 #define _LINUX_ARM_FFA_H
 
+#include <linux/bitfield.h>
 #include <linux/device.h>
 #include <linux/module.h>
 #include <linux/types.h>
@@ -298,8 +299,8 @@ struct ffa_mem_region {
 #define FFA_MEM_NON_SHAREABLE	(0)
 #define FFA_MEM_OUTER_SHAREABLE	(2)
 #define FFA_MEM_INNER_SHAREABLE	(3)
-	u8 attributes;
-	u8 reserved_0;
+	/* Memory region attributes, upper byte MBZ pre v1.1 */
+	u16 attributes;
 /*
  * Clear memory region contents after unmapping it from the sender and
  * before mapping it for any receiver.
@@ -337,30 +338,40 @@ struct ffa_mem_region {
 	 * memory region.
 	 */
 	u64 tag;
-	u32 reserved_1;
+	/* Size of each endpoint memory access descriptor, MBZ pre v1.1 */
+	u32 ep_mem_size;
 	/*
 	 * The number of `ffa_mem_region_attributes` entries included in this
 	 * transaction.
 	 */
 	u32 ep_count;
 	/*
-	 * An array of endpoint memory access descriptors.
-	 * Each one specifies a memory region offset, an endpoint and the
-	 * attributes with which this memory region should be mapped in that
-	 * endpoint's page table.
+	 * 16-byte aligned offset from the base address of this descriptor
+	 * to the first element of the endpoint memory access descriptor array
+	 * Valid only from v1.1
 	 */
-	struct ffa_mem_region_attributes ep_mem_access[];
+	u32 ep_mem_offset;
+	/* MBZ, valid only from v1.1 */
+	u32 reserved[3];
 };
 
-#define	COMPOSITE_OFFSET(x)	\
-	(offsetof(struct ffa_mem_region, ep_mem_access[x]))
 #define CONSTITUENTS_OFFSET(x)	\
 	(offsetof(struct ffa_composite_mem_region, constituents[x]))
 
 static inline u32
 ffa_mem_desc_offset(struct ffa_mem_region *buf, int count, u32 ffa_version)
 {
-	return COMPOSITE_OFFSET(0);
+	u32 offset = count * sizeof(struct ffa_mem_region_attributes);
+	/*
+	 * Earlier to v1.1, the endpoint memory descriptor array started at
+	 * offset 32(i.e. offset of ep_mem_offset in the current structure)
+	 */
+	if (ffa_version <= FFA_VERSION_1_0)
+		offset += offsetof(struct ffa_mem_region, ep_mem_offset);
+	else
+		offset += sizeof(struct ffa_mem_region);
+
+	return offset;
 }
 
 struct ffa_mem_ops_args {
-- 
cgit v1.2.3


From 3e7807d5a7d770c59837026e9967fe99ad043174 Mon Sep 17 00:00:00 2001
From: Josef Bacik <josef@toxicpanda.com>
Date: Wed, 4 Oct 2023 22:55:32 -0400
Subject: fscrypt: rename fscrypt_info => fscrypt_inode_info

We are going to track per-extent information, so it'll be necessary to
distinguish between inode infos and extent infos.  Rename fscrypt_info
to fscrypt_inode_info, adjusting any lines that now exceed 80
characters.

Signed-off-by: Josef Bacik <josef@toxicpanda.com>
[ebiggers: rebased onto fscrypt tree, renamed fscrypt_get_info(),
 adjusted two comments, and fixed some lines over 80 characters]
Link: https://lore.kernel.org/r/20231005025757.33521-1-ebiggers@kernel.org
Reviewed-by: Neal Gompa <neal@gompa.dev>
Signed-off-by: Eric Biggers <ebiggers@google.com>
---
 fs/crypto/bio.c             |  2 +-
 fs/crypto/crypto.c          | 21 +++++++++++----------
 fs/crypto/fname.c           |  6 +++---
 fs/crypto/fscrypt_private.h | 42 ++++++++++++++++++++++--------------------
 fs/crypto/hooks.c           |  2 +-
 fs/crypto/inline_crypt.c    | 13 +++++++------
 fs/crypto/keyring.c         |  4 ++--
 fs/crypto/keysetup.c        | 44 +++++++++++++++++++++++---------------------
 fs/crypto/keysetup_v1.c     | 15 +++++++++------
 fs/crypto/policy.c          | 10 +++++-----
 include/linux/fs.h          |  4 ++--
 include/linux/fscrypt.h     | 10 ++++++----
 12 files changed, 92 insertions(+), 81 deletions(-)

(limited to 'include/linux')

diff --git a/fs/crypto/bio.c b/fs/crypto/bio.c
index c8cf77065272..0ad8c30b8fa5 100644
--- a/fs/crypto/bio.c
+++ b/fs/crypto/bio.c
@@ -111,7 +111,7 @@ out:
 int fscrypt_zeroout_range(const struct inode *inode, pgoff_t lblk,
 			  sector_t pblk, unsigned int len)
 {
-	const struct fscrypt_info *ci = inode->i_crypt_info;
+	const struct fscrypt_inode_info *ci = inode->i_crypt_info;
 	const unsigned int du_bits = ci->ci_data_unit_bits;
 	const unsigned int du_size = 1U << du_bits;
 	const unsigned int du_per_page_bits = PAGE_SHIFT - du_bits;
diff --git a/fs/crypto/crypto.c b/fs/crypto/crypto.c
index 85e2f66dd663..328470d40dec 100644
--- a/fs/crypto/crypto.c
+++ b/fs/crypto/crypto.c
@@ -39,7 +39,7 @@ static mempool_t *fscrypt_bounce_page_pool = NULL;
 static struct workqueue_struct *fscrypt_read_workqueue;
 static DEFINE_MUTEX(fscrypt_init_mutex);
 
-struct kmem_cache *fscrypt_info_cachep;
+struct kmem_cache *fscrypt_inode_info_cachep;
 
 void fscrypt_enqueue_decrypt_work(struct work_struct *work)
 {
@@ -85,7 +85,7 @@ EXPORT_SYMBOL(fscrypt_free_bounce_page);
  * simply contain the data unit index (e.g., IV_INO_LBLK_32).
  */
 void fscrypt_generate_iv(union fscrypt_iv *iv, u64 index,
-			 const struct fscrypt_info *ci)
+			 const struct fscrypt_inode_info *ci)
 {
 	u8 flags = fscrypt_policy_flags(&ci->ci_policy);
 
@@ -105,7 +105,7 @@ void fscrypt_generate_iv(union fscrypt_iv *iv, u64 index,
 }
 
 /* Encrypt or decrypt a single "data unit" of file contents. */
-int fscrypt_crypt_data_unit(const struct fscrypt_info *ci,
+int fscrypt_crypt_data_unit(const struct fscrypt_inode_info *ci,
 			    fscrypt_direction_t rw, u64 index,
 			    struct page *src_page, struct page *dest_page,
 			    unsigned int len, unsigned int offs,
@@ -184,7 +184,7 @@ struct page *fscrypt_encrypt_pagecache_blocks(struct page *page,
 
 {
 	const struct inode *inode = page->mapping->host;
-	const struct fscrypt_info *ci = inode->i_crypt_info;
+	const struct fscrypt_inode_info *ci = inode->i_crypt_info;
 	const unsigned int du_bits = ci->ci_data_unit_bits;
 	const unsigned int du_size = 1U << du_bits;
 	struct page *ciphertext_page;
@@ -267,7 +267,7 @@ int fscrypt_decrypt_pagecache_blocks(struct folio *folio, size_t len,
 				     size_t offs)
 {
 	const struct inode *inode = folio->mapping->host;
-	const struct fscrypt_info *ci = inode->i_crypt_info;
+	const struct fscrypt_inode_info *ci = inode->i_crypt_info;
 	const unsigned int du_bits = ci->ci_data_unit_bits;
 	const unsigned int du_size = 1U << du_bits;
 	u64 index = ((u64)folio->index << (PAGE_SHIFT - du_bits)) +
@@ -409,18 +409,19 @@ static int __init fscrypt_init(void)
 	if (!fscrypt_read_workqueue)
 		goto fail;
 
-	fscrypt_info_cachep = KMEM_CACHE(fscrypt_info, SLAB_RECLAIM_ACCOUNT);
-	if (!fscrypt_info_cachep)
+	fscrypt_inode_info_cachep = KMEM_CACHE(fscrypt_inode_info,
+					       SLAB_RECLAIM_ACCOUNT);
+	if (!fscrypt_inode_info_cachep)
 		goto fail_free_queue;
 
 	err = fscrypt_init_keyring();
 	if (err)
-		goto fail_free_info;
+		goto fail_free_inode_info;
 
 	return 0;
 
-fail_free_info:
-	kmem_cache_destroy(fscrypt_info_cachep);
+fail_free_inode_info:
+	kmem_cache_destroy(fscrypt_inode_info_cachep);
 fail_free_queue:
 	destroy_workqueue(fscrypt_read_workqueue);
 fail:
diff --git a/fs/crypto/fname.c b/fs/crypto/fname.c
index 6eae3f12ad50..7b3fc189593a 100644
--- a/fs/crypto/fname.c
+++ b/fs/crypto/fname.c
@@ -100,7 +100,7 @@ int fscrypt_fname_encrypt(const struct inode *inode, const struct qstr *iname,
 {
 	struct skcipher_request *req = NULL;
 	DECLARE_CRYPTO_WAIT(wait);
-	const struct fscrypt_info *ci = inode->i_crypt_info;
+	const struct fscrypt_inode_info *ci = inode->i_crypt_info;
 	struct crypto_skcipher *tfm = ci->ci_enc_key.tfm;
 	union fscrypt_iv iv;
 	struct scatterlist sg;
@@ -157,7 +157,7 @@ static int fname_decrypt(const struct inode *inode,
 	struct skcipher_request *req = NULL;
 	DECLARE_CRYPTO_WAIT(wait);
 	struct scatterlist src_sg, dst_sg;
-	const struct fscrypt_info *ci = inode->i_crypt_info;
+	const struct fscrypt_inode_info *ci = inode->i_crypt_info;
 	struct crypto_skcipher *tfm = ci->ci_enc_key.tfm;
 	union fscrypt_iv iv;
 	int res;
@@ -568,7 +568,7 @@ EXPORT_SYMBOL_GPL(fscrypt_match_name);
  */
 u64 fscrypt_fname_siphash(const struct inode *dir, const struct qstr *name)
 {
-	const struct fscrypt_info *ci = dir->i_crypt_info;
+	const struct fscrypt_inode_info *ci = dir->i_crypt_info;
 
 	WARN_ON_ONCE(!ci->ci_dirhash_key_initialized);
 
diff --git a/fs/crypto/fscrypt_private.h b/fs/crypto/fscrypt_private.h
index 9c5e83baa3f1..2fb4ba435d27 100644
--- a/fs/crypto/fscrypt_private.h
+++ b/fs/crypto/fscrypt_private.h
@@ -210,18 +210,18 @@ struct fscrypt_prepared_key {
 };
 
 /*
- * fscrypt_info - the "encryption key" for an inode
+ * fscrypt_inode_info - the "encryption key" for an inode
  *
  * When an encrypted file's key is made available, an instance of this struct is
  * allocated and stored in ->i_crypt_info.  Once created, it remains until the
  * inode is evicted.
  */
-struct fscrypt_info {
+struct fscrypt_inode_info {
 
 	/* The key in a form prepared for actual encryption/decryption */
 	struct fscrypt_prepared_key ci_enc_key;
 
-	/* True if ci_enc_key should be freed when this fscrypt_info is freed */
+	/* True if ci_enc_key should be freed when this struct is freed */
 	bool ci_owns_key;
 
 #ifdef CONFIG_FS_ENCRYPTION_INLINE_CRYPT
@@ -294,9 +294,9 @@ typedef enum {
 } fscrypt_direction_t;
 
 /* crypto.c */
-extern struct kmem_cache *fscrypt_info_cachep;
+extern struct kmem_cache *fscrypt_inode_info_cachep;
 int fscrypt_initialize(struct super_block *sb);
-int fscrypt_crypt_data_unit(const struct fscrypt_info *ci,
+int fscrypt_crypt_data_unit(const struct fscrypt_inode_info *ci,
 			    fscrypt_direction_t rw, u64 index,
 			    struct page *src_page, struct page *dest_page,
 			    unsigned int len, unsigned int offs,
@@ -326,7 +326,7 @@ union fscrypt_iv {
 };
 
 void fscrypt_generate_iv(union fscrypt_iv *iv, u64 index,
-			 const struct fscrypt_info *ci);
+			 const struct fscrypt_inode_info *ci);
 
 /*
  * Return the number of bits used by the maximum file data unit index that is
@@ -374,17 +374,17 @@ void fscrypt_destroy_hkdf(struct fscrypt_hkdf *hkdf);
 
 /* inline_crypt.c */
 #ifdef CONFIG_FS_ENCRYPTION_INLINE_CRYPT
-int fscrypt_select_encryption_impl(struct fscrypt_info *ci);
+int fscrypt_select_encryption_impl(struct fscrypt_inode_info *ci);
 
 static inline bool
-fscrypt_using_inline_encryption(const struct fscrypt_info *ci)
+fscrypt_using_inline_encryption(const struct fscrypt_inode_info *ci)
 {
 	return ci->ci_inlinecrypt;
 }
 
 int fscrypt_prepare_inline_crypt_key(struct fscrypt_prepared_key *prep_key,
 				     const u8 *raw_key,
-				     const struct fscrypt_info *ci);
+				     const struct fscrypt_inode_info *ci);
 
 void fscrypt_destroy_inline_crypt_key(struct super_block *sb,
 				      struct fscrypt_prepared_key *prep_key);
@@ -395,7 +395,7 @@ void fscrypt_destroy_inline_crypt_key(struct super_block *sb,
  */
 static inline bool
 fscrypt_is_key_prepared(struct fscrypt_prepared_key *prep_key,
-			const struct fscrypt_info *ci)
+			const struct fscrypt_inode_info *ci)
 {
 	/*
 	 * The two smp_load_acquire()'s here pair with the smp_store_release()'s
@@ -412,13 +412,13 @@ fscrypt_is_key_prepared(struct fscrypt_prepared_key *prep_key,
 
 #else /* CONFIG_FS_ENCRYPTION_INLINE_CRYPT */
 
-static inline int fscrypt_select_encryption_impl(struct fscrypt_info *ci)
+static inline int fscrypt_select_encryption_impl(struct fscrypt_inode_info *ci)
 {
 	return 0;
 }
 
 static inline bool
-fscrypt_using_inline_encryption(const struct fscrypt_info *ci)
+fscrypt_using_inline_encryption(const struct fscrypt_inode_info *ci)
 {
 	return false;
 }
@@ -426,7 +426,7 @@ fscrypt_using_inline_encryption(const struct fscrypt_info *ci)
 static inline int
 fscrypt_prepare_inline_crypt_key(struct fscrypt_prepared_key *prep_key,
 				 const u8 *raw_key,
-				 const struct fscrypt_info *ci)
+				 const struct fscrypt_inode_info *ci)
 {
 	WARN_ON_ONCE(1);
 	return -EOPNOTSUPP;
@@ -440,7 +440,7 @@ fscrypt_destroy_inline_crypt_key(struct super_block *sb,
 
 static inline bool
 fscrypt_is_key_prepared(struct fscrypt_prepared_key *prep_key,
-			const struct fscrypt_info *ci)
+			const struct fscrypt_inode_info *ci)
 {
 	return smp_load_acquire(&prep_key->tfm) != NULL;
 }
@@ -640,17 +640,18 @@ struct fscrypt_mode {
 extern struct fscrypt_mode fscrypt_modes[];
 
 int fscrypt_prepare_key(struct fscrypt_prepared_key *prep_key,
-			const u8 *raw_key, const struct fscrypt_info *ci);
+			const u8 *raw_key, const struct fscrypt_inode_info *ci);
 
 void fscrypt_destroy_prepared_key(struct super_block *sb,
 				  struct fscrypt_prepared_key *prep_key);
 
-int fscrypt_set_per_file_enc_key(struct fscrypt_info *ci, const u8 *raw_key);
+int fscrypt_set_per_file_enc_key(struct fscrypt_inode_info *ci,
+				 const u8 *raw_key);
 
-int fscrypt_derive_dirhash_key(struct fscrypt_info *ci,
+int fscrypt_derive_dirhash_key(struct fscrypt_inode_info *ci,
 			       const struct fscrypt_master_key *mk);
 
-void fscrypt_hash_inode_number(struct fscrypt_info *ci,
+void fscrypt_hash_inode_number(struct fscrypt_inode_info *ci,
 			       const struct fscrypt_master_key *mk);
 
 int fscrypt_get_encryption_info(struct inode *inode, bool allow_unsupported);
@@ -685,10 +686,11 @@ static inline int fscrypt_require_key(struct inode *inode)
 
 void fscrypt_put_direct_key(struct fscrypt_direct_key *dk);
 
-int fscrypt_setup_v1_file_key(struct fscrypt_info *ci,
+int fscrypt_setup_v1_file_key(struct fscrypt_inode_info *ci,
 			      const u8 *raw_master_key);
 
-int fscrypt_setup_v1_file_key_via_subscribed_keyrings(struct fscrypt_info *ci);
+int fscrypt_setup_v1_file_key_via_subscribed_keyrings(
+				struct fscrypt_inode_info *ci);
 
 /* policy.c */
 
diff --git a/fs/crypto/hooks.c b/fs/crypto/hooks.c
index 6238dbcadcad..85d2975b69b7 100644
--- a/fs/crypto/hooks.c
+++ b/fs/crypto/hooks.c
@@ -169,7 +169,7 @@ EXPORT_SYMBOL_GPL(__fscrypt_prepare_setattr);
 int fscrypt_prepare_setflags(struct inode *inode,
 			     unsigned int oldflags, unsigned int flags)
 {
-	struct fscrypt_info *ci;
+	struct fscrypt_inode_info *ci;
 	struct fscrypt_master_key *mk;
 	int err;
 
diff --git a/fs/crypto/inline_crypt.c b/fs/crypto/inline_crypt.c
index 8c6d37d6225a..b4002aea7cdb 100644
--- a/fs/crypto/inline_crypt.c
+++ b/fs/crypto/inline_crypt.c
@@ -39,7 +39,7 @@ static struct block_device **fscrypt_get_devices(struct super_block *sb,
 	return devs;
 }
 
-static unsigned int fscrypt_get_dun_bytes(const struct fscrypt_info *ci)
+static unsigned int fscrypt_get_dun_bytes(const struct fscrypt_inode_info *ci)
 {
 	const struct super_block *sb = ci->ci_inode->i_sb;
 	unsigned int flags = fscrypt_policy_flags(&ci->ci_policy);
@@ -89,7 +89,7 @@ static void fscrypt_log_blk_crypto_impl(struct fscrypt_mode *mode,
 }
 
 /* Enable inline encryption for this file if supported. */
-int fscrypt_select_encryption_impl(struct fscrypt_info *ci)
+int fscrypt_select_encryption_impl(struct fscrypt_inode_info *ci)
 {
 	const struct inode *inode = ci->ci_inode;
 	struct super_block *sb = inode->i_sb;
@@ -151,7 +151,7 @@ out_free_devs:
 
 int fscrypt_prepare_inline_crypt_key(struct fscrypt_prepared_key *prep_key,
 				     const u8 *raw_key,
-				     const struct fscrypt_info *ci)
+				     const struct fscrypt_inode_info *ci)
 {
 	const struct inode *inode = ci->ci_inode;
 	struct super_block *sb = inode->i_sb;
@@ -232,7 +232,8 @@ bool __fscrypt_inode_uses_inline_crypto(const struct inode *inode)
 }
 EXPORT_SYMBOL_GPL(__fscrypt_inode_uses_inline_crypto);
 
-static void fscrypt_generate_dun(const struct fscrypt_info *ci, u64 lblk_num,
+static void fscrypt_generate_dun(const struct fscrypt_inode_info *ci,
+				 u64 lblk_num,
 				 u64 dun[BLK_CRYPTO_DUN_ARRAY_SIZE])
 {
 	u64 index = lblk_num << ci->ci_data_units_per_block_bits;
@@ -266,7 +267,7 @@ static void fscrypt_generate_dun(const struct fscrypt_info *ci, u64 lblk_num,
 void fscrypt_set_bio_crypt_ctx(struct bio *bio, const struct inode *inode,
 			       u64 first_lblk, gfp_t gfp_mask)
 {
-	const struct fscrypt_info *ci;
+	const struct fscrypt_inode_info *ci;
 	u64 dun[BLK_CRYPTO_DUN_ARRAY_SIZE];
 
 	if (!fscrypt_inode_uses_inline_crypto(inode))
@@ -457,7 +458,7 @@ EXPORT_SYMBOL_GPL(fscrypt_dio_supported);
  */
 u64 fscrypt_limit_io_blocks(const struct inode *inode, u64 lblk, u64 nr_blocks)
 {
-	const struct fscrypt_info *ci;
+	const struct fscrypt_inode_info *ci;
 	u32 dun;
 
 	if (!fscrypt_inode_uses_inline_crypto(inode))
diff --git a/fs/crypto/keyring.c b/fs/crypto/keyring.c
index 7cbb1fd872ac..a51fa6a33de1 100644
--- a/fs/crypto/keyring.c
+++ b/fs/crypto/keyring.c
@@ -867,7 +867,7 @@ static void shrink_dcache_inode(struct inode *inode)
 
 static void evict_dentries_for_decrypted_inodes(struct fscrypt_master_key *mk)
 {
-	struct fscrypt_info *ci;
+	struct fscrypt_inode_info *ci;
 	struct inode *inode;
 	struct inode *toput_inode = NULL;
 
@@ -917,7 +917,7 @@ static int check_for_busy_inodes(struct super_block *sb,
 		/* select an example file to show for debugging purposes */
 		struct inode *inode =
 			list_first_entry(&mk->mk_decrypted_inodes,
-					 struct fscrypt_info,
+					 struct fscrypt_inode_info,
 					 ci_master_key_link)->ci_inode;
 		ino = inode->i_ino;
 	}
diff --git a/fs/crypto/keysetup.c b/fs/crypto/keysetup.c
index 608599f8aa57..094d1b7a1ae6 100644
--- a/fs/crypto/keysetup.c
+++ b/fs/crypto/keysetup.c
@@ -148,7 +148,7 @@ err_free_tfm:
  * and IV generation method (@ci->ci_policy.flags).
  */
 int fscrypt_prepare_key(struct fscrypt_prepared_key *prep_key,
-			const u8 *raw_key, const struct fscrypt_info *ci)
+			const u8 *raw_key, const struct fscrypt_inode_info *ci)
 {
 	struct crypto_skcipher *tfm;
 
@@ -178,13 +178,14 @@ void fscrypt_destroy_prepared_key(struct super_block *sb,
 }
 
 /* Given a per-file encryption key, set up the file's crypto transform object */
-int fscrypt_set_per_file_enc_key(struct fscrypt_info *ci, const u8 *raw_key)
+int fscrypt_set_per_file_enc_key(struct fscrypt_inode_info *ci,
+				 const u8 *raw_key)
 {
 	ci->ci_owns_key = true;
 	return fscrypt_prepare_key(&ci->ci_enc_key, raw_key, ci);
 }
 
-static int setup_per_mode_enc_key(struct fscrypt_info *ci,
+static int setup_per_mode_enc_key(struct fscrypt_inode_info *ci,
 				  struct fscrypt_master_key *mk,
 				  struct fscrypt_prepared_key *keys,
 				  u8 hkdf_context, bool include_fs_uuid)
@@ -265,7 +266,7 @@ static int fscrypt_derive_siphash_key(const struct fscrypt_master_key *mk,
 	return 0;
 }
 
-int fscrypt_derive_dirhash_key(struct fscrypt_info *ci,
+int fscrypt_derive_dirhash_key(struct fscrypt_inode_info *ci,
 			       const struct fscrypt_master_key *mk)
 {
 	int err;
@@ -279,7 +280,7 @@ int fscrypt_derive_dirhash_key(struct fscrypt_info *ci,
 	return 0;
 }
 
-void fscrypt_hash_inode_number(struct fscrypt_info *ci,
+void fscrypt_hash_inode_number(struct fscrypt_inode_info *ci,
 			       const struct fscrypt_master_key *mk)
 {
 	WARN_ON_ONCE(ci->ci_inode->i_ino == 0);
@@ -289,7 +290,7 @@ void fscrypt_hash_inode_number(struct fscrypt_info *ci,
 					      &mk->mk_ino_hash_key);
 }
 
-static int fscrypt_setup_iv_ino_lblk_32_key(struct fscrypt_info *ci,
+static int fscrypt_setup_iv_ino_lblk_32_key(struct fscrypt_inode_info *ci,
 					    struct fscrypt_master_key *mk)
 {
 	int err;
@@ -329,7 +330,7 @@ unlock:
 	return 0;
 }
 
-static int fscrypt_setup_v2_file_key(struct fscrypt_info *ci,
+static int fscrypt_setup_v2_file_key(struct fscrypt_inode_info *ci,
 				     struct fscrypt_master_key *mk,
 				     bool need_dirhash_key)
 {
@@ -404,7 +405,7 @@ static int fscrypt_setup_v2_file_key(struct fscrypt_info *ci,
  * still allow 512-bit master keys if the user chooses to use them, though.)
  */
 static bool fscrypt_valid_master_key_size(const struct fscrypt_master_key *mk,
-					  const struct fscrypt_info *ci)
+					  const struct fscrypt_inode_info *ci)
 {
 	unsigned int min_keysize;
 
@@ -430,11 +431,12 @@ static bool fscrypt_valid_master_key_size(const struct fscrypt_master_key *mk,
  *
  * If the master key is found in the filesystem-level keyring, then it is
  * returned in *mk_ret with its semaphore read-locked.  This is needed to ensure
- * that only one task links the fscrypt_info into ->mk_decrypted_inodes (as
- * multiple tasks may race to create an fscrypt_info for the same inode), and to
- * synchronize the master key being removed with a new inode starting to use it.
+ * that only one task links the fscrypt_inode_info into ->mk_decrypted_inodes
+ * (as multiple tasks may race to create an fscrypt_inode_info for the same
+ * inode), and to synchronize the master key being removed with a new inode
+ * starting to use it.
  */
-static int setup_file_encryption_key(struct fscrypt_info *ci,
+static int setup_file_encryption_key(struct fscrypt_inode_info *ci,
 				     bool need_dirhash_key,
 				     struct fscrypt_master_key **mk_ret)
 {
@@ -519,7 +521,7 @@ out_release_key:
 	return err;
 }
 
-static void put_crypt_info(struct fscrypt_info *ci)
+static void put_crypt_info(struct fscrypt_inode_info *ci)
 {
 	struct fscrypt_master_key *mk;
 
@@ -546,7 +548,7 @@ static void put_crypt_info(struct fscrypt_info *ci)
 		fscrypt_put_master_key_activeref(ci->ci_inode->i_sb, mk);
 	}
 	memzero_explicit(ci, sizeof(*ci));
-	kmem_cache_free(fscrypt_info_cachep, ci);
+	kmem_cache_free(fscrypt_inode_info_cachep, ci);
 }
 
 static int
@@ -555,7 +557,7 @@ fscrypt_setup_encryption_info(struct inode *inode,
 			      const u8 nonce[FSCRYPT_FILE_NONCE_SIZE],
 			      bool need_dirhash_key)
 {
-	struct fscrypt_info *crypt_info;
+	struct fscrypt_inode_info *crypt_info;
 	struct fscrypt_mode *mode;
 	struct fscrypt_master_key *mk = NULL;
 	int res;
@@ -564,7 +566,7 @@ fscrypt_setup_encryption_info(struct inode *inode,
 	if (res)
 		return res;
 
-	crypt_info = kmem_cache_zalloc(fscrypt_info_cachep, GFP_KERNEL);
+	crypt_info = kmem_cache_zalloc(fscrypt_inode_info_cachep, GFP_KERNEL);
 	if (!crypt_info)
 		return -ENOMEM;
 
@@ -592,8 +594,8 @@ fscrypt_setup_encryption_info(struct inode *inode,
 	/*
 	 * For existing inodes, multiple tasks may race to set ->i_crypt_info.
 	 * So use cmpxchg_release().  This pairs with the smp_load_acquire() in
-	 * fscrypt_get_info().  I.e., here we publish ->i_crypt_info with a
-	 * RELEASE barrier so that other tasks can ACQUIRE it.
+	 * fscrypt_get_inode_info().  I.e., here we publish ->i_crypt_info with
+	 * a RELEASE barrier so that other tasks can ACQUIRE it.
 	 */
 	if (cmpxchg_release(&inode->i_crypt_info, NULL, crypt_info) == NULL) {
 		/*
@@ -740,8 +742,8 @@ EXPORT_SYMBOL_GPL(fscrypt_prepare_new_inode);
  * fscrypt_put_encryption_info() - free most of an inode's fscrypt data
  * @inode: an inode being evicted
  *
- * Free the inode's fscrypt_info.  Filesystems must call this when the inode is
- * being evicted.  An RCU grace period need not have elapsed yet.
+ * Free the inode's fscrypt_inode_info.  Filesystems must call this when the
+ * inode is being evicted.  An RCU grace period need not have elapsed yet.
  */
 void fscrypt_put_encryption_info(struct inode *inode)
 {
@@ -778,7 +780,7 @@ EXPORT_SYMBOL(fscrypt_free_inode);
  */
 int fscrypt_drop_inode(struct inode *inode)
 {
-	const struct fscrypt_info *ci = fscrypt_get_info(inode);
+	const struct fscrypt_inode_info *ci = fscrypt_get_inode_info(inode);
 
 	/*
 	 * If ci is NULL, then the inode doesn't have an encryption key set up
diff --git a/fs/crypto/keysetup_v1.c b/fs/crypto/keysetup_v1.c
index 86b48a2b47d1..a10710bc8123 100644
--- a/fs/crypto/keysetup_v1.c
+++ b/fs/crypto/keysetup_v1.c
@@ -178,7 +178,8 @@ void fscrypt_put_direct_key(struct fscrypt_direct_key *dk)
  */
 static struct fscrypt_direct_key *
 find_or_insert_direct_key(struct fscrypt_direct_key *to_insert,
-			  const u8 *raw_key, const struct fscrypt_info *ci)
+			  const u8 *raw_key,
+			  const struct fscrypt_inode_info *ci)
 {
 	unsigned long hash_key;
 	struct fscrypt_direct_key *dk;
@@ -218,7 +219,7 @@ find_or_insert_direct_key(struct fscrypt_direct_key *to_insert,
 
 /* Prepare to encrypt directly using the master key in the given mode */
 static struct fscrypt_direct_key *
-fscrypt_get_direct_key(const struct fscrypt_info *ci, const u8 *raw_key)
+fscrypt_get_direct_key(const struct fscrypt_inode_info *ci, const u8 *raw_key)
 {
 	struct fscrypt_direct_key *dk;
 	int err;
@@ -250,7 +251,7 @@ err_free_dk:
 }
 
 /* v1 policy, DIRECT_KEY: use the master key directly */
-static int setup_v1_file_key_direct(struct fscrypt_info *ci,
+static int setup_v1_file_key_direct(struct fscrypt_inode_info *ci,
 				    const u8 *raw_master_key)
 {
 	struct fscrypt_direct_key *dk;
@@ -264,7 +265,7 @@ static int setup_v1_file_key_direct(struct fscrypt_info *ci,
 }
 
 /* v1 policy, !DIRECT_KEY: derive the file's encryption key */
-static int setup_v1_file_key_derived(struct fscrypt_info *ci,
+static int setup_v1_file_key_derived(struct fscrypt_inode_info *ci,
 				     const u8 *raw_master_key)
 {
 	u8 *derived_key;
@@ -289,7 +290,8 @@ out:
 	return err;
 }
 
-int fscrypt_setup_v1_file_key(struct fscrypt_info *ci, const u8 *raw_master_key)
+int fscrypt_setup_v1_file_key(struct fscrypt_inode_info *ci,
+			      const u8 *raw_master_key)
 {
 	if (ci->ci_policy.v1.flags & FSCRYPT_POLICY_FLAG_DIRECT_KEY)
 		return setup_v1_file_key_direct(ci, raw_master_key);
@@ -297,7 +299,8 @@ int fscrypt_setup_v1_file_key(struct fscrypt_info *ci, const u8 *raw_master_key)
 		return setup_v1_file_key_derived(ci, raw_master_key);
 }
 
-int fscrypt_setup_v1_file_key_via_subscribed_keyrings(struct fscrypt_info *ci)
+int
+fscrypt_setup_v1_file_key_via_subscribed_keyrings(struct fscrypt_inode_info *ci)
 {
 	const struct super_block *sb = ci->ci_inode->i_sb;
 	struct key *key;
diff --git a/fs/crypto/policy.c b/fs/crypto/policy.c
index 2fb3f6a1258e..701259991277 100644
--- a/fs/crypto/policy.c
+++ b/fs/crypto/policy.c
@@ -432,11 +432,11 @@ int fscrypt_policy_from_context(union fscrypt_policy *policy_u,
 /* Retrieve an inode's encryption policy */
 static int fscrypt_get_policy(struct inode *inode, union fscrypt_policy *policy)
 {
-	const struct fscrypt_info *ci;
+	const struct fscrypt_inode_info *ci;
 	union fscrypt_context ctx;
 	int ret;
 
-	ci = fscrypt_get_info(inode);
+	ci = fscrypt_get_inode_info(inode);
 	if (ci) {
 		/* key available, use the cached policy */
 		*policy = ci->ci_policy;
@@ -674,7 +674,7 @@ int fscrypt_has_permitted_context(struct inode *parent, struct inode *child)
 
 	/*
 	 * Both parent and child are encrypted, so verify they use the same
-	 * encryption policy.  Compare the fscrypt_info structs if the keys are
+	 * encryption policy.  Compare the cached policies if the keys are
 	 * available, otherwise retrieve and compare the fscrypt_contexts.
 	 *
 	 * Note that the fscrypt_context retrieval will be required frequently
@@ -744,7 +744,7 @@ const union fscrypt_policy *fscrypt_policy_to_inherit(struct inode *dir)
  */
 int fscrypt_context_for_new_inode(void *ctx, struct inode *inode)
 {
-	struct fscrypt_info *ci = inode->i_crypt_info;
+	struct fscrypt_inode_info *ci = inode->i_crypt_info;
 
 	BUILD_BUG_ON(sizeof(union fscrypt_context) !=
 			FSCRYPT_SET_CONTEXT_MAX_SIZE);
@@ -769,7 +769,7 @@ EXPORT_SYMBOL_GPL(fscrypt_context_for_new_inode);
  */
 int fscrypt_set_context(struct inode *inode, void *fs_data)
 {
-	struct fscrypt_info *ci = inode->i_crypt_info;
+	struct fscrypt_inode_info *ci = inode->i_crypt_info;
 	union fscrypt_context ctx;
 	int ctxsize;
 
diff --git a/include/linux/fs.h b/include/linux/fs.h
index b528f063e8ff..a3df96736473 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -67,7 +67,7 @@ struct swap_info_struct;
 struct seq_file;
 struct workqueue_struct;
 struct iov_iter;
-struct fscrypt_info;
+struct fscrypt_inode_info;
 struct fscrypt_operations;
 struct fsverity_info;
 struct fsverity_operations;
@@ -738,7 +738,7 @@ struct inode {
 #endif
 
 #ifdef CONFIG_FS_ENCRYPTION
-	struct fscrypt_info	*i_crypt_info;
+	struct fscrypt_inode_info	*i_crypt_info;
 #endif
 
 #ifdef CONFIG_FS_VERITY
diff --git a/include/linux/fscrypt.h b/include/linux/fscrypt.h
index b559e6f77707..12f9e455d569 100644
--- a/include/linux/fscrypt.h
+++ b/include/linux/fscrypt.h
@@ -31,7 +31,7 @@
 #define FSCRYPT_CONTENTS_ALIGNMENT 16
 
 union fscrypt_policy;
-struct fscrypt_info;
+struct fscrypt_inode_info;
 struct fs_parameter;
 struct seq_file;
 
@@ -192,7 +192,8 @@ struct fscrypt_operations {
 					     unsigned int *num_devs);
 };
 
-static inline struct fscrypt_info *fscrypt_get_info(const struct inode *inode)
+static inline struct fscrypt_inode_info *
+fscrypt_get_inode_info(const struct inode *inode)
 {
 	/*
 	 * Pairs with the cmpxchg_release() in fscrypt_setup_encryption_info().
@@ -404,7 +405,8 @@ static inline void fscrypt_set_ops(struct super_block *sb,
 }
 #else  /* !CONFIG_FS_ENCRYPTION */
 
-static inline struct fscrypt_info *fscrypt_get_info(const struct inode *inode)
+static inline struct fscrypt_inode_info *
+fscrypt_get_inode_info(const struct inode *inode)
 {
 	return NULL;
 }
@@ -882,7 +884,7 @@ static inline bool fscrypt_inode_uses_fs_layer_crypto(const struct inode *inode)
  */
 static inline bool fscrypt_has_encryption_key(const struct inode *inode)
 {
-	return fscrypt_get_info(inode) != NULL;
+	return fscrypt_get_inode_info(inode) != NULL;
 }
 
 /**
-- 
cgit v1.2.3


From 4a530cb932af31b0c919a109bc107dd186653381 Mon Sep 17 00:00:00 2001
From: Kees Cook <keescook@chromium.org>
Date: Fri, 22 Sep 2023 10:50:53 -0700
Subject: hwmon: Annotate struct gsc_hwmon_platform_data with __counted_by

Prepare for the coming implementation by GCC and Clang of the __counted_by
attribute. Flexible array members annotated with __counted_by can have
their accesses bounds-checked at run-time checking via CONFIG_UBSAN_BOUNDS
(for array indexing) and CONFIG_FORTIFY_SOURCE (for strcpy/memcpy-family
functions).

As found with Coccinelle[1], add __counted_by for struct gsc_hwmon_platform_data.

[1] https://github.com/kees/kernel-tools/blob/trunk/coccinelle/examples/counted_by.cocci

Cc: Tim Harvey <tharvey@gateworks.com>
Reviewed-by: "Gustavo A. R. Silva" <gustavoars@kernel.org>
Link: https://lore.kernel.org/r/20230922175053.work.564-kees@kernel.org
Signed-off-by: Kees Cook <keescook@chromium.org>
---
 include/linux/platform_data/gsc_hwmon.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/platform_data/gsc_hwmon.h b/include/linux/platform_data/gsc_hwmon.h
index f2781aa7eff8..70e8a6bec0f6 100644
--- a/include/linux/platform_data/gsc_hwmon.h
+++ b/include/linux/platform_data/gsc_hwmon.h
@@ -40,6 +40,6 @@ struct gsc_hwmon_platform_data {
 	unsigned int resolution;
 	unsigned int vreference;
 	unsigned int fan_base;
-	struct gsc_hwmon_channel channels[];
+	struct gsc_hwmon_channel channels[] __counted_by(nchannels);
 };
 #endif
-- 
cgit v1.2.3


From a48e1f656b3c9b8192b6ca6fc92ef4daa30535fb Mon Sep 17 00:00:00 2001
From: Kees Cook <keescook@chromium.org>
Date: Fri, 22 Sep 2023 10:51:21 -0700
Subject: KVM: Annotate struct kvm_irq_routing_table with __counted_by

Prepare for the coming implementation by GCC and Clang of the __counted_by
attribute. Flexible array members annotated with __counted_by can have
their accesses bounds-checked at run-time checking via CONFIG_UBSAN_BOUNDS
(for array indexing) and CONFIG_FORTIFY_SOURCE (for strcpy/memcpy-family
functions).

As found with Coccinelle[1], add __counted_by for struct kvm_irq_routing_table.

[1] https://github.com/kees/kernel-tools/blob/trunk/coccinelle/examples/counted_by.cocci

Cc: Paolo Bonzini <pbonzini@redhat.com>
Cc: kvm@vger.kernel.org
Reviewed-by: "Gustavo A. R. Silva" <gustavoars@kernel.org>
Link: https://lore.kernel.org/r/20230922175121.work.660-kees@kernel.org
Signed-off-by: Kees Cook <keescook@chromium.org>
---
 include/linux/kvm_host.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index fb6c6109fdca..4944136efaa2 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -664,7 +664,7 @@ struct kvm_irq_routing_table {
 	 * Array indexed by gsi. Each entry contains list of irq chips
 	 * the gsi is connected to.
 	 */
-	struct hlist_head map[];
+	struct hlist_head map[] __counted_by(nr_rt_entries);
 };
 #endif
 
-- 
cgit v1.2.3


From 313ebe47d75558511aa1237b6e35c663b5c0ec6f Mon Sep 17 00:00:00 2001
From: Philipp Stanner <pstanner@redhat.com>
Date: Wed, 20 Sep 2023 14:36:09 +0200
Subject: string.h: add array-wrappers for (v)memdup_user()

Currently, user array duplications are sometimes done without an
overflow check. Sometimes the checks are done manually; sometimes the
array size is calculated with array_size() and sometimes by calculating
n * size directly in code.

Introduce wrappers for arrays for memdup_user() and vmemdup_user() to
provide a standardized and safe way for duplicating user arrays.

This is both for new code as well as replacing usage of (v)memdup_user()
in existing code that uses, e.g., n * size to calculate array sizes.

Suggested-by: David Airlie <airlied@redhat.com>
Signed-off-by: Philipp Stanner <pstanner@redhat.com>
Reviewed-by: Andy Shevchenko <andy.shevchenko@gmail.com>
Reviewed-by: Kees Cook <keescook@chromium.org>
Reviewed-by: Zack Rusin <zackr@vmware.com>
Signed-off-by: Dave Airlie <airlied@redhat.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20230920123612.16914-3-pstanner@redhat.com
---
 include/linux/string.h | 40 ++++++++++++++++++++++++++++++++++++++++
 1 file changed, 40 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/string.h b/include/linux/string.h
index dbfc66400050..debf4ef1098f 100644
--- a/include/linux/string.h
+++ b/include/linux/string.h
@@ -5,7 +5,9 @@
 #include <linux/compiler.h>	/* for inline */
 #include <linux/types.h>	/* for size_t */
 #include <linux/stddef.h>	/* for NULL */
+#include <linux/err.h>		/* for ERR_PTR() */
 #include <linux/errno.h>	/* for E2BIG */
+#include <linux/overflow.h>	/* for check_mul_overflow() */
 #include <linux/stdarg.h>
 #include <uapi/linux/string.h>
 
@@ -14,6 +16,44 @@ extern void *memdup_user(const void __user *, size_t);
 extern void *vmemdup_user(const void __user *, size_t);
 extern void *memdup_user_nul(const void __user *, size_t);
 
+/**
+ * memdup_array_user - duplicate array from user space
+ * @src: source address in user space
+ * @n: number of array members to copy
+ * @size: size of one array member
+ *
+ * Return: an ERR_PTR() on failure. Result is physically
+ * contiguous, to be freed by kfree().
+ */
+static inline void *memdup_array_user(const void __user *src, size_t n, size_t size)
+{
+	size_t nbytes;
+
+	if (check_mul_overflow(n, size, &nbytes))
+		return ERR_PTR(-EOVERFLOW);
+
+	return memdup_user(src, nbytes);
+}
+
+/**
+ * vmemdup_array_user - duplicate array from user space
+ * @src: source address in user space
+ * @n: number of array members to copy
+ * @size: size of one array member
+ *
+ * Return: an ERR_PTR() on failure. Result may be not
+ * physically contiguous. Use kvfree() to free.
+ */
+static inline void *vmemdup_array_user(const void __user *src, size_t n, size_t size)
+{
+	size_t nbytes;
+
+	if (check_mul_overflow(n, size, &nbytes))
+		return ERR_PTR(-EOVERFLOW);
+
+	return vmemdup_user(src, nbytes);
+}
+
 /*
  * Include machine specific inline routines
  */
-- 
cgit v1.2.3


From 6d0d419914286a1b255530812a38d992c6c4e608 Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Mon, 25 Sep 2023 13:03:06 +0100
Subject: iov_iter, net: Move csum_and_copy_to/from_iter() to net/

Move csum_and_copy_to/from_iter() to net code now that the iteration
framework can be #included.

Signed-off-by: David Howells <dhowells@redhat.com>
Link: https://lore.kernel.org/r/20230925120309.1731676-10-dhowells@redhat.com
cc: Alexander Viro <viro@zeniv.linux.org.uk>
cc: Jens Axboe <axboe@kernel.dk>
cc: Christoph Hellwig <hch@lst.de>
cc: Christian Brauner <christian@brauner.io>
cc: Matthew Wilcox <willy@infradead.org>
cc: Linus Torvalds <torvalds@linux-foundation.org>
cc: David Laight <David.Laight@ACULAB.COM>
cc: "David S. Miller" <davem@davemloft.net>
cc: Eric Dumazet <edumazet@google.com>
cc: Jakub Kicinski <kuba@kernel.org>
cc: Paolo Abeni <pabeni@redhat.com>
cc: linux-block@vger.kernel.org
cc: linux-fsdevel@vger.kernel.org
cc: linux-mm@kvack.org
cc: netdev@vger.kernel.org
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 include/linux/skbuff.h | 25 ++++++++++++++
 include/linux/uio.h    | 18 ----------
 lib/iov_iter.c         | 89 --------------------------------------------------
 net/core/datagram.c    | 50 +++++++++++++++++++++++++++-
 net/core/skbuff.c      | 33 +++++++++++++++++++
 5 files changed, 107 insertions(+), 108 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 4174c4b82d13..d0656cc11c16 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -3679,6 +3679,31 @@ static inline int __must_check skb_put_padto(struct sk_buff *skb, unsigned int l
 	return __skb_put_padto(skb, len, true);
 }
 
+static inline __wsum csum_and_memcpy(void *to, const void *from, size_t len,
+				     __wsum sum, size_t off)
+{
+	__wsum next = csum_partial_copy_nocheck(from, to, len);
+	return csum_block_add(sum, next, off);
+}
+
+struct csum_state {
+	__wsum csum;
+	size_t off;
+};
+
+size_t csum_and_copy_from_iter(void *addr, size_t bytes, __wsum *csum, struct iov_iter *i);
+
+static __always_inline __must_check
+bool csum_and_copy_from_iter_full(void *addr, size_t bytes,
+				  __wsum *csum, struct iov_iter *i)
+{
+	size_t copied = csum_and_copy_from_iter(addr, bytes, csum, i);
+	if (likely(copied == bytes))
+		return true;
+	iov_iter_revert(i, copied);
+	return false;
+}
+
 static inline int skb_add_data(struct sk_buff *skb,
 			       struct iov_iter *from, int copy)
 {
diff --git a/include/linux/uio.h b/include/linux/uio.h
index 65d9143f83c8..0a5426c97e02 100644
--- a/include/linux/uio.h
+++ b/include/linux/uio.h
@@ -338,24 +338,6 @@ iov_iter_npages_cap(struct iov_iter *i, int maxpages, size_t max_bytes)
 	return npages;
 }
 
-struct csum_state {
-	__wsum csum;
-	size_t off;
-};
-
-size_t csum_and_copy_to_iter(const void *addr, size_t bytes, void *csstate, struct iov_iter *i);
-size_t csum_and_copy_from_iter(void *addr, size_t bytes, __wsum *csum, struct iov_iter *i);
-
-static __always_inline __must_check
-bool csum_and_copy_from_iter_full(void *addr, size_t bytes,
-				  __wsum *csum, struct iov_iter *i)
-{
-	size_t copied = csum_and_copy_from_iter(addr, bytes, csum, i);
-	if (likely(copied == bytes))
-		return true;
-	iov_iter_revert(i, copied);
-	return false;
-}
 size_t hash_and_copy_to_iter(const void *addr, size_t bytes, void *hashp,
 		struct iov_iter *i);
 
diff --git a/lib/iov_iter.c b/lib/iov_iter.c
index 0a7160aa4e31..3f913616ce88 100644
--- a/lib/iov_iter.c
+++ b/lib/iov_iter.c
@@ -10,7 +10,6 @@
 #include <linux/vmalloc.h>
 #include <linux/splice.h>
 #include <linux/compat.h>
-#include <net/checksum.h>
 #include <linux/scatterlist.h>
 #include <linux/instrumented.h>
 #include <linux/iov_iter.h>
@@ -179,13 +178,6 @@ void iov_iter_init(struct iov_iter *i, unsigned int direction,
 }
 EXPORT_SYMBOL(iov_iter_init);
 
-static __wsum csum_and_memcpy(void *to, const void *from, size_t len,
-			      __wsum sum, size_t off)
-{
-	__wsum next = csum_partial_copy_nocheck(from, to, len);
-	return csum_block_add(sum, next, off);
-}
-
 size_t _copy_to_iter(const void *addr, size_t bytes, struct iov_iter *i)
 {
 	if (WARN_ON_ONCE(i->data_source))
@@ -1097,87 +1089,6 @@ ssize_t iov_iter_get_pages_alloc2(struct iov_iter *i,
 }
 EXPORT_SYMBOL(iov_iter_get_pages_alloc2);
 
-static __always_inline
-size_t copy_from_user_iter_csum(void __user *iter_from, size_t progress,
-				size_t len, void *to, void *priv2)
-{
-	__wsum next, *csum = priv2;
-
-	next = csum_and_copy_from_user(iter_from, to + progress, len);
-	*csum = csum_block_add(*csum, next, progress);
-	return next ? 0 : len;
-}
-
-static __always_inline
-size_t memcpy_from_iter_csum(void *iter_from, size_t progress,
-			     size_t len, void *to, void *priv2)
-{
-	__wsum *csum = priv2;
-
-	*csum = csum_and_memcpy(to + progress, iter_from, len, *csum, progress);
-	return 0;
-}
-
-size_t csum_and_copy_from_iter(void *addr, size_t bytes, __wsum *csum,
-			       struct iov_iter *i)
-{
-	if (WARN_ON_ONCE(!i->data_source))
-		return 0;
-	return iterate_and_advance2(i, bytes, addr, csum,
-				    copy_from_user_iter_csum,
-				    memcpy_from_iter_csum);
-}
-EXPORT_SYMBOL(csum_and_copy_from_iter);
-
-static __always_inline
-size_t copy_to_user_iter_csum(void __user *iter_to, size_t progress,
-			      size_t len, void *from, void *priv2)
-{
-	__wsum next, *csum = priv2;
-
-	next = csum_and_copy_to_user(from + progress, iter_to, len);
-	*csum = csum_block_add(*csum, next, progress);
-	return next ? 0 : len;
-}
-
-static __always_inline
-size_t memcpy_to_iter_csum(void *iter_to, size_t progress,
-			   size_t len, void *from, void *priv2)
-{
-	__wsum *csum = priv2;
-
-	*csum = csum_and_memcpy(iter_to, from + progress, len, *csum, progress);
-	return 0;
-}
-
-size_t csum_and_copy_to_iter(const void *addr, size_t bytes, void *_csstate,
-			     struct iov_iter *i)
-{
-	struct csum_state *csstate = _csstate;
-	__wsum sum;
-
-	if (WARN_ON_ONCE(i->data_source))
-		return 0;
-	if (unlikely(iov_iter_is_discard(i))) {
-		// can't use csum_memcpy() for that one - data is not copied
-		csstate->csum = csum_block_add(csstate->csum,
-					       csum_partial(addr, bytes, 0),
-					       csstate->off);
-		csstate->off += bytes;
-		return bytes;
-	}
-
-	sum = csum_shift(csstate->csum, csstate->off);
-	
-	bytes = iterate_and_advance2(i, bytes, (void *)addr, &sum,
-				     copy_to_user_iter_csum,
-				     memcpy_to_iter_csum);
-	csstate->csum = csum_shift(sum, csstate->off);
-	csstate->off += bytes;
-	return bytes;
-}
-EXPORT_SYMBOL(csum_and_copy_to_iter);
-
 size_t hash_and_copy_to_iter(const void *addr, size_t bytes, void *hashp,
 		struct iov_iter *i)
 {
diff --git a/net/core/datagram.c b/net/core/datagram.c
index 176eb5834746..37c89d0933b7 100644
--- a/net/core/datagram.c
+++ b/net/core/datagram.c
@@ -50,7 +50,7 @@
 #include <linux/spinlock.h>
 #include <linux/slab.h>
 #include <linux/pagemap.h>
-#include <linux/uio.h>
+#include <linux/iov_iter.h>
 #include <linux/indirect_call_wrapper.h>
 
 #include <net/protocol.h>
@@ -716,6 +716,54 @@ int zerocopy_sg_from_iter(struct sk_buff *skb, struct iov_iter *from)
 }
 EXPORT_SYMBOL(zerocopy_sg_from_iter);
 
+static __always_inline
+size_t copy_to_user_iter_csum(void __user *iter_to, size_t progress,
+			      size_t len, void *from, void *priv2)
+{
+	__wsum next, *csum = priv2;
+
+	next = csum_and_copy_to_user(from + progress, iter_to, len);
+	*csum = csum_block_add(*csum, next, progress);
+	return next ? 0 : len;
+}
+
+static __always_inline
+size_t memcpy_to_iter_csum(void *iter_to, size_t progress,
+			   size_t len, void *from, void *priv2)
+{
+	__wsum *csum = priv2;
+
+	*csum = csum_and_memcpy(iter_to, from + progress, len, *csum, progress);
+	return 0;
+}
+
+static size_t csum_and_copy_to_iter(const void *addr, size_t bytes, void *_csstate,
+				    struct iov_iter *i)
+{
+	struct csum_state *csstate = _csstate;
+	__wsum sum;
+
+	if (WARN_ON_ONCE(i->data_source))
+		return 0;
+	if (unlikely(iov_iter_is_discard(i))) {
+		// can't use csum_memcpy() for that one - data is not copied
+		csstate->csum = csum_block_add(csstate->csum,
+					       csum_partial(addr, bytes, 0),
+					       csstate->off);
+		csstate->off += bytes;
+		return bytes;
+	}
+
+	sum = csum_shift(csstate->csum, csstate->off);
+
+	bytes = iterate_and_advance2(i, bytes, (void *)addr, &sum,
+				     copy_to_user_iter_csum,
+				     memcpy_to_iter_csum);
+	csstate->csum = csum_shift(sum, csstate->off);
+	csstate->off += bytes;
+	return bytes;
+}
+
 /**
  *	skb_copy_and_csum_datagram - Copy datagram to an iovec iterator
  *          and update a checksum.
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 4eaf7ed0d1f4..5dbdfce2d05f 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -62,6 +62,7 @@
 #include <linux/if_vlan.h>
 #include <linux/mpls.h>
 #include <linux/kcov.h>
+#include <linux/iov_iter.h>
 
 #include <net/protocol.h>
 #include <net/dst.h>
@@ -6931,3 +6932,35 @@ out:
 	return spliced ?: ret;
 }
 EXPORT_SYMBOL(skb_splice_from_iter);
+
+static __always_inline
+size_t memcpy_from_iter_csum(void *iter_from, size_t progress,
+			     size_t len, void *to, void *priv2)
+{
+	__wsum *csum = priv2;
+
+	*csum = csum_and_memcpy(to + progress, iter_from, len, *csum, progress);
+	return 0;
+}
+
+static __always_inline
+size_t copy_from_user_iter_csum(void __user *iter_from, size_t progress,
+				size_t len, void *to, void *priv2)
+{
+	__wsum next, *csum = priv2;
+
+	next = csum_and_copy_from_user(iter_from, to + progress, len);
+	*csum = csum_block_add(*csum, next, progress);
+	return next ? 0 : len;
+}
+
+size_t csum_and_copy_from_iter(void *addr, size_t bytes, __wsum *csum,
+			       struct iov_iter *i)
+{
+	if (WARN_ON_ONCE(!i->data_source))
+		return 0;
+	return iterate_and_advance2(i, bytes, addr, csum,
+				    copy_from_user_iter_csum,
+				    memcpy_from_iter_csum);
+}
+EXPORT_SYMBOL(csum_and_copy_from_iter);
-- 
cgit v1.2.3


From dc32bff195b45e8571c442954beee259e9500dac Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Mon, 25 Sep 2023 13:03:07 +0100
Subject: iov_iter, net: Fold in csum_and_memcpy()

Fold csum_and_memcpy() in to its callers.

Signed-off-by: David Howells <dhowells@redhat.com>
Link: https://lore.kernel.org/r/20230925120309.1731676-11-dhowells@redhat.com
cc: Alexander Viro <viro@zeniv.linux.org.uk>
cc: Jens Axboe <axboe@kernel.dk>
cc: Christoph Hellwig <hch@lst.de>
cc: Christian Brauner <christian@brauner.io>
cc: Matthew Wilcox <willy@infradead.org>
cc: Linus Torvalds <torvalds@linux-foundation.org>
cc: David Laight <David.Laight@ACULAB.COM>
cc: "David S. Miller" <davem@davemloft.net>
cc: Eric Dumazet <edumazet@google.com>
cc: Jakub Kicinski <kuba@kernel.org>
cc: Paolo Abeni <pabeni@redhat.com>
cc: linux-block@vger.kernel.org
cc: linux-fsdevel@vger.kernel.org
cc: linux-mm@kvack.org
cc: netdev@vger.kernel.org
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 include/linux/skbuff.h | 7 -------
 net/core/datagram.c    | 3 ++-
 net/core/skbuff.c      | 3 ++-
 3 files changed, 4 insertions(+), 9 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index d0656cc11c16..c81ef5d76953 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -3679,13 +3679,6 @@ static inline int __must_check skb_put_padto(struct sk_buff *skb, unsigned int l
 	return __skb_put_padto(skb, len, true);
 }
 
-static inline __wsum csum_and_memcpy(void *to, const void *from, size_t len,
-				     __wsum sum, size_t off)
-{
-	__wsum next = csum_partial_copy_nocheck(from, to, len);
-	return csum_block_add(sum, next, off);
-}
-
 struct csum_state {
 	__wsum csum;
 	size_t off;
diff --git a/net/core/datagram.c b/net/core/datagram.c
index 37c89d0933b7..452620dd41e4 100644
--- a/net/core/datagram.c
+++ b/net/core/datagram.c
@@ -732,8 +732,9 @@ size_t memcpy_to_iter_csum(void *iter_to, size_t progress,
 			   size_t len, void *from, void *priv2)
 {
 	__wsum *csum = priv2;
+	__wsum next = csum_partial_copy_nocheck(from, iter_to, len);
 
-	*csum = csum_and_memcpy(iter_to, from + progress, len, *csum, progress);
+	*csum = csum_block_add(*csum, next, progress);
 	return 0;
 }
 
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 5dbdfce2d05f..3efed86321db 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -6938,8 +6938,9 @@ size_t memcpy_from_iter_csum(void *iter_from, size_t progress,
 			     size_t len, void *to, void *priv2)
 {
 	__wsum *csum = priv2;
+	__wsum next = csum_partial_copy_nocheck(iter_from, to + progress, len);
 
-	*csum = csum_and_memcpy(to + progress, iter_from, len, *csum, progress);
+	*csum = csum_block_add(*csum, next, progress);
 	return 0;
 }
 
-- 
cgit v1.2.3


From 7c6f353e8a73cbb8919a20e0912021a9f9d24170 Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Mon, 25 Sep 2023 13:03:08 +0100
Subject: iov_iter, net: Merge csum_and_copy_from_iter{,_full}() together

Move csum_and_copy_from_iter_full() out of line and then merge
csum_and_copy_from_iter() into its only caller.

Signed-off-by: David Howells <dhowells@redhat.com>
Link: https://lore.kernel.org/r/20230925120309.1731676-12-dhowells@redhat.com
cc: Alexander Viro <viro@zeniv.linux.org.uk>
cc: Jens Axboe <axboe@kernel.dk>
cc: Christoph Hellwig <hch@lst.de>
cc: Christian Brauner <christian@brauner.io>
cc: Matthew Wilcox <willy@infradead.org>
cc: Linus Torvalds <torvalds@linux-foundation.org>
cc: David Laight <David.Laight@ACULAB.COM>
cc: "David S. Miller" <davem@davemloft.net>
cc: Eric Dumazet <edumazet@google.com>
cc: Jakub Kicinski <kuba@kernel.org>
cc: Paolo Abeni <pabeni@redhat.com>
cc: linux-block@vger.kernel.org
cc: linux-fsdevel@vger.kernel.org
cc: linux-mm@kvack.org
cc: netdev@vger.kernel.org
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 include/linux/skbuff.h | 19 ++-----------------
 net/core/datagram.c    |  5 +++++
 net/core/skbuff.c      | 20 +++++++++++++-------
 3 files changed, 20 insertions(+), 24 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index c81ef5d76953..be402f55f6d6 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -3679,23 +3679,8 @@ static inline int __must_check skb_put_padto(struct sk_buff *skb, unsigned int l
 	return __skb_put_padto(skb, len, true);
 }
 
-struct csum_state {
-	__wsum csum;
-	size_t off;
-};
-
-size_t csum_and_copy_from_iter(void *addr, size_t bytes, __wsum *csum, struct iov_iter *i);
-
-static __always_inline __must_check
-bool csum_and_copy_from_iter_full(void *addr, size_t bytes,
-				  __wsum *csum, struct iov_iter *i)
-{
-	size_t copied = csum_and_copy_from_iter(addr, bytes, csum, i);
-	if (likely(copied == bytes))
-		return true;
-	iov_iter_revert(i, copied);
-	return false;
-}
+bool csum_and_copy_from_iter_full(void *addr, size_t bytes, __wsum *csum, struct iov_iter *i)
+	__must_check;
 
 static inline int skb_add_data(struct sk_buff *skb,
 			       struct iov_iter *from, int copy)
diff --git a/net/core/datagram.c b/net/core/datagram.c
index 452620dd41e4..722311eeee18 100644
--- a/net/core/datagram.c
+++ b/net/core/datagram.c
@@ -738,6 +738,11 @@ size_t memcpy_to_iter_csum(void *iter_to, size_t progress,
 	return 0;
 }
 
+struct csum_state {
+	__wsum csum;
+	size_t off;
+};
+
 static size_t csum_and_copy_to_iter(const void *addr, size_t bytes, void *_csstate,
 				    struct iov_iter *i)
 {
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 3efed86321db..2bfa6a7ba244 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -6955,13 +6955,19 @@ size_t copy_from_user_iter_csum(void __user *iter_from, size_t progress,
 	return next ? 0 : len;
 }
 
-size_t csum_and_copy_from_iter(void *addr, size_t bytes, __wsum *csum,
-			       struct iov_iter *i)
+bool csum_and_copy_from_iter_full(void *addr, size_t bytes,
+				  __wsum *csum, struct iov_iter *i)
 {
+	size_t copied;
+
 	if (WARN_ON_ONCE(!i->data_source))
-		return 0;
-	return iterate_and_advance2(i, bytes, addr, csum,
-				    copy_from_user_iter_csum,
-				    memcpy_from_iter_csum);
+		return false;
+	copied = iterate_and_advance2(i, bytes, addr, csum,
+				      copy_from_user_iter_csum,
+				      memcpy_from_iter_csum);
+	if (likely(copied == bytes))
+		return true;
+	iov_iter_revert(i, copied);
+	return false;
 }
-EXPORT_SYMBOL(csum_and_copy_from_iter);
+EXPORT_SYMBOL(csum_and_copy_from_iter_full);
-- 
cgit v1.2.3


From b5f0e20f444cd150121e0ce912ebd3f2dabd12bc Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Mon, 25 Sep 2023 13:03:09 +0100
Subject: iov_iter, net: Move hash_and_copy_to_iter() to net/

Move hash_and_copy_to_iter() to be with its only caller in networking code.

Signed-off-by: David Howells <dhowells@redhat.com>
Link: https://lore.kernel.org/r/20230925120309.1731676-13-dhowells@redhat.com
cc: Alexander Viro <viro@zeniv.linux.org.uk>
cc: Jens Axboe <axboe@kernel.dk>
cc: Christoph Hellwig <hch@lst.de>
cc: Christian Brauner <christian@brauner.io>
cc: Matthew Wilcox <willy@infradead.org>
cc: Linus Torvalds <torvalds@linux-foundation.org>
cc: David Laight <David.Laight@ACULAB.COM>
cc: "David S. Miller" <davem@davemloft.net>
cc: Eric Dumazet <edumazet@google.com>
cc: Jakub Kicinski <kuba@kernel.org>
cc: Paolo Abeni <pabeni@redhat.com>
cc: linux-block@vger.kernel.org
cc: linux-fsdevel@vger.kernel.org
cc: linux-mm@kvack.org
cc: netdev@vger.kernel.org
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 include/linux/uio.h |  3 ---
 lib/iov_iter.c      | 20 --------------------
 net/core/datagram.c | 19 +++++++++++++++++++
 3 files changed, 19 insertions(+), 23 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/uio.h b/include/linux/uio.h
index 0a5426c97e02..b6214cbf2a43 100644
--- a/include/linux/uio.h
+++ b/include/linux/uio.h
@@ -338,9 +338,6 @@ iov_iter_npages_cap(struct iov_iter *i, int maxpages, size_t max_bytes)
 	return npages;
 }
 
-size_t hash_and_copy_to_iter(const void *addr, size_t bytes, void *hashp,
-		struct iov_iter *i);
-
 struct iovec *iovec_from_user(const struct iovec __user *uvector,
 		unsigned long nr_segs, unsigned long fast_segs,
 		struct iovec *fast_iov, bool compat);
diff --git a/lib/iov_iter.c b/lib/iov_iter.c
index 3f913616ce88..de7d11cf4c63 100644
--- a/lib/iov_iter.c
+++ b/lib/iov_iter.c
@@ -1,5 +1,4 @@
 // SPDX-License-Identifier: GPL-2.0-only
-#include <crypto/hash.h>
 #include <linux/export.h>
 #include <linux/bvec.h>
 #include <linux/fault-inject-usercopy.h>
@@ -1089,25 +1088,6 @@ ssize_t iov_iter_get_pages_alloc2(struct iov_iter *i,
 }
 EXPORT_SYMBOL(iov_iter_get_pages_alloc2);
 
-size_t hash_and_copy_to_iter(const void *addr, size_t bytes, void *hashp,
-		struct iov_iter *i)
-{
-#ifdef CONFIG_CRYPTO_HASH
-	struct ahash_request *hash = hashp;
-	struct scatterlist sg;
-	size_t copied;
-
-	copied = copy_to_iter(addr, bytes, i);
-	sg_init_one(&sg, addr, copied);
-	ahash_request_set_crypt(hash, &sg, NULL, copied);
-	crypto_ahash_update(hash);
-	return copied;
-#else
-	return 0;
-#endif
-}
-EXPORT_SYMBOL(hash_and_copy_to_iter);
-
 static int iov_npages(const struct iov_iter *i, int maxpages)
 {
 	size_t skip = i->iov_offset, size = i->count;
diff --git a/net/core/datagram.c b/net/core/datagram.c
index 722311eeee18..103d46fa0eeb 100644
--- a/net/core/datagram.c
+++ b/net/core/datagram.c
@@ -61,6 +61,7 @@
 #include <net/tcp_states.h>
 #include <trace/events/skb.h>
 #include <net/busy_poll.h>
+#include <crypto/hash.h>
 
 /*
  *	Is a socket 'connection oriented' ?
@@ -489,6 +490,24 @@ short_copy:
 	return 0;
 }
 
+static size_t hash_and_copy_to_iter(const void *addr, size_t bytes, void *hashp,
+				    struct iov_iter *i)
+{
+#ifdef CONFIG_CRYPTO_HASH
+	struct ahash_request *hash = hashp;
+	struct scatterlist sg;
+	size_t copied;
+
+	copied = copy_to_iter(addr, bytes, i);
+	sg_init_one(&sg, addr, copied);
+	ahash_request_set_crypt(hash, &sg, NULL, copied);
+	crypto_ahash_update(hash);
+	return copied;
+#else
+	return 0;
+#endif
+}
+
 /**
  *	skb_copy_and_hash_datagram_iter - Copy datagram to an iovec iterator
  *          and update a hash.
-- 
cgit v1.2.3


From 2b76129c5ae710423cfb55806803341af6a403a7 Mon Sep 17 00:00:00 2001
From: David Meriin <dmeriin@habana.ai>
Date: Mon, 24 Jul 2023 23:30:44 +0300
Subject: accel/habanalabs: move cpucp interface to linux/habanalabs

The CPUCP interface is moved to a shared folder outside of accel as
a pre-requisite to upstream the NIC drivers that will also include
this file.

Signed-off-by: David Meriin <dmeriin@habana.ai>
Reviewed-by: Oded Gabbay <ogabbay@kernel.org>
Signed-off-by: Oded Gabbay <ogabbay@kernel.org>
---
 MAINTAINERS                                        |    1 +
 drivers/accel/habanalabs/common/firmware_if.c      |    2 +-
 drivers/accel/habanalabs/common/habanalabs.h       |    2 +-
 drivers/accel/habanalabs/gaudi/gaudiP.h            |    2 +-
 drivers/accel/habanalabs/gaudi2/gaudi2P.h          |    2 +-
 drivers/accel/habanalabs/goya/goyaP.h              |    2 +-
 drivers/accel/habanalabs/include/common/cpucp_if.h | 1402 -------------------
 .../accel/habanalabs/include/common/hl_boot_if.h   |  790 -----------
 include/linux/habanalabs/cpucp_if.h                | 1407 ++++++++++++++++++++
 include/linux/habanalabs/hl_boot_if.h              |  790 +++++++++++
 10 files changed, 2203 insertions(+), 2197 deletions(-)
 delete mode 100644 drivers/accel/habanalabs/include/common/cpucp_if.h
 delete mode 100644 drivers/accel/habanalabs/include/common/hl_boot_if.h
 create mode 100644 include/linux/habanalabs/cpucp_if.h
 create mode 100644 include/linux/habanalabs/hl_boot_if.h

(limited to 'include/linux')

diff --git a/MAINTAINERS b/MAINTAINERS
index e05506ea8917..c9ff335d2c8c 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -9071,6 +9071,7 @@ T:	git https://git.kernel.org/pub/scm/linux/kernel/git/ogabbay/linux.git
 F:	Documentation/ABI/testing/debugfs-driver-habanalabs
 F:	Documentation/ABI/testing/sysfs-driver-habanalabs
 F:	drivers/accel/habanalabs/
+F:	include/linux/habanalabs/
 F:	include/trace/events/habanalabs.h
 F:	include/uapi/drm/habanalabs_accel.h
 
diff --git a/drivers/accel/habanalabs/common/firmware_if.c b/drivers/accel/habanalabs/common/firmware_if.c
index 2bc775d29854..2a6dfea3d27d 100644
--- a/drivers/accel/habanalabs/common/firmware_if.c
+++ b/drivers/accel/habanalabs/common/firmware_if.c
@@ -6,7 +6,7 @@
  */
 
 #include "habanalabs.h"
-#include "../include/common/hl_boot_if.h"
+#include <linux/habanalabs/hl_boot_if.h>
 
 #include <linux/firmware.h>
 #include <linux/crc32.h>
diff --git a/drivers/accel/habanalabs/common/habanalabs.h b/drivers/accel/habanalabs/common/habanalabs.h
index efb046370f2e..8b5fd2b92676 100644
--- a/drivers/accel/habanalabs/common/habanalabs.h
+++ b/drivers/accel/habanalabs/common/habanalabs.h
@@ -8,7 +8,7 @@
 #ifndef HABANALABSP_H_
 #define HABANALABSP_H_
 
-#include "../include/common/cpucp_if.h"
+#include <linux/habanalabs/cpucp_if.h>
 #include "../include/common/qman_if.h"
 #include "../include/hw_ip/mmu/mmu_general.h"
 #include <uapi/drm/habanalabs_accel.h>
diff --git a/drivers/accel/habanalabs/gaudi/gaudiP.h b/drivers/accel/habanalabs/gaudi/gaudiP.h
index b8fa724be5a1..831be53bb9d7 100644
--- a/drivers/accel/habanalabs/gaudi/gaudiP.h
+++ b/drivers/accel/habanalabs/gaudi/gaudiP.h
@@ -10,7 +10,7 @@
 
 #include <uapi/drm/habanalabs_accel.h>
 #include "../common/habanalabs.h"
-#include "../include/common/hl_boot_if.h"
+#include <linux/habanalabs/hl_boot_if.h>
 #include "../include/gaudi/gaudi_packets.h"
 #include "../include/gaudi/gaudi.h"
 #include "../include/gaudi/gaudi_async_events.h"
diff --git a/drivers/accel/habanalabs/gaudi2/gaudi2P.h b/drivers/accel/habanalabs/gaudi2/gaudi2P.h
index 5f3ce086928e..4535aa5ab561 100644
--- a/drivers/accel/habanalabs/gaudi2/gaudi2P.h
+++ b/drivers/accel/habanalabs/gaudi2/gaudi2P.h
@@ -10,7 +10,7 @@
 
 #include <uapi/drm/habanalabs_accel.h>
 #include "../common/habanalabs.h"
-#include "../include/common/hl_boot_if.h"
+#include <linux/habanalabs/hl_boot_if.h>
 #include "../include/gaudi2/gaudi2.h"
 #include "../include/gaudi2/gaudi2_packets.h"
 #include "../include/gaudi2/gaudi2_fw_if.h"
diff --git a/drivers/accel/habanalabs/goya/goyaP.h b/drivers/accel/habanalabs/goya/goyaP.h
index 5df3d30b91fd..194c2ae157cd 100644
--- a/drivers/accel/habanalabs/goya/goyaP.h
+++ b/drivers/accel/habanalabs/goya/goyaP.h
@@ -9,8 +9,8 @@
 #define GOYAP_H_
 
 #include <uapi/drm/habanalabs_accel.h>
+#include <linux/habanalabs/hl_boot_if.h>
 #include "../common/habanalabs.h"
-#include "../include/common/hl_boot_if.h"
 #include "../include/goya/goya_packets.h"
 #include "../include/goya/goya.h"
 #include "../include/goya/goya_async_events.h"
diff --git a/drivers/accel/habanalabs/include/common/cpucp_if.h b/drivers/accel/habanalabs/include/common/cpucp_if.h
deleted file mode 100644
index ef7d32224066..000000000000
--- a/drivers/accel/habanalabs/include/common/cpucp_if.h
+++ /dev/null
@@ -1,1402 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0
- *
- * Copyright 2020-2022 HabanaLabs, Ltd.
- * All Rights Reserved.
- *
- */
-
-#ifndef CPUCP_IF_H
-#define CPUCP_IF_H
-
-#include <linux/types.h>
-#include <linux/if_ether.h>
-
-#include "hl_boot_if.h"
-
-#define NUM_HBM_PSEUDO_CH				2
-#define NUM_HBM_CH_PER_DEV				8
-#define CPUCP_PKT_HBM_ECC_INFO_WR_PAR_SHIFT		0
-#define CPUCP_PKT_HBM_ECC_INFO_WR_PAR_MASK		0x00000001
-#define CPUCP_PKT_HBM_ECC_INFO_RD_PAR_SHIFT		1
-#define CPUCP_PKT_HBM_ECC_INFO_RD_PAR_MASK		0x00000002
-#define CPUCP_PKT_HBM_ECC_INFO_CA_PAR_SHIFT		2
-#define CPUCP_PKT_HBM_ECC_INFO_CA_PAR_MASK		0x00000004
-#define CPUCP_PKT_HBM_ECC_INFO_DERR_SHIFT		3
-#define CPUCP_PKT_HBM_ECC_INFO_DERR_MASK		0x00000008
-#define CPUCP_PKT_HBM_ECC_INFO_SERR_SHIFT		4
-#define CPUCP_PKT_HBM_ECC_INFO_SERR_MASK		0x00000010
-#define CPUCP_PKT_HBM_ECC_INFO_TYPE_SHIFT		5
-#define CPUCP_PKT_HBM_ECC_INFO_TYPE_MASK		0x00000020
-#define CPUCP_PKT_HBM_ECC_INFO_HBM_CH_SHIFT		6
-#define CPUCP_PKT_HBM_ECC_INFO_HBM_CH_MASK		0x000007C0
-
-#define PLL_MAP_MAX_BITS	128
-#define PLL_MAP_LEN		(PLL_MAP_MAX_BITS / 8)
-
-/*
- * info of the pkt queue pointers in the first async occurrence
- */
-struct cpucp_pkt_sync_err {
-	__le32 pi;
-	__le32 ci;
-};
-
-struct hl_eq_hbm_ecc_data {
-	/* SERR counter */
-	__le32 sec_cnt;
-	/* DERR counter */
-	__le32 dec_cnt;
-	/* Supplemental Information according to the mask bits */
-	__le32 hbm_ecc_info;
-	/* Address in hbm where the ecc happened */
-	__le32 first_addr;
-	/* SERR continuous address counter */
-	__le32 sec_cont_cnt;
-	__le32 pad;
-};
-
-/*
- * EVENT QUEUE
- */
-
-struct hl_eq_header {
-	__le32 reserved;
-	__le32 ctl;
-};
-
-struct hl_eq_ecc_data {
-	__le64 ecc_address;
-	__le64 ecc_syndrom;
-	__u8 memory_wrapper_idx;
-	__u8 is_critical;
-	__le16 block_id;
-	__u8 pad[4];
-};
-
-enum hl_sm_sei_cause {
-	SM_SEI_SO_OVERFLOW,
-	SM_SEI_LBW_4B_UNALIGNED,
-	SM_SEI_AXI_RESPONSE_ERR
-};
-
-struct hl_eq_sm_sei_data {
-	__le32 sei_log;
-	/* enum hl_sm_sei_cause */
-	__u8 sei_cause;
-	__u8 pad[3];
-};
-
-enum hl_fw_alive_severity {
-	FW_ALIVE_SEVERITY_MINOR,
-	FW_ALIVE_SEVERITY_CRITICAL
-};
-
-struct hl_eq_fw_alive {
-	__le64 uptime_seconds;
-	__le32 process_id;
-	__le32 thread_id;
-	/* enum hl_fw_alive_severity */
-	__u8 severity;
-	__u8 pad[7];
-};
-
-struct hl_eq_intr_cause {
-	__le64 intr_cause_data;
-};
-
-struct hl_eq_pcie_drain_ind_data {
-	struct hl_eq_intr_cause intr_cause;
-	__le64 drain_wr_addr_lbw;
-	__le64 drain_rd_addr_lbw;
-	__le64 drain_wr_addr_hbw;
-	__le64 drain_rd_addr_hbw;
-};
-
-struct hl_eq_razwi_lbw_info_regs {
-	__le32 rr_aw_razwi_reg;
-	__le32 rr_aw_razwi_id_reg;
-	__le32 rr_ar_razwi_reg;
-	__le32 rr_ar_razwi_id_reg;
-};
-
-struct hl_eq_razwi_hbw_info_regs {
-	__le32 rr_aw_razwi_hi_reg;
-	__le32 rr_aw_razwi_lo_reg;
-	__le32 rr_aw_razwi_id_reg;
-	__le32 rr_ar_razwi_hi_reg;
-	__le32 rr_ar_razwi_lo_reg;
-	__le32 rr_ar_razwi_id_reg;
-};
-
-/* razwi_happened masks */
-#define RAZWI_HAPPENED_HBW	0x1
-#define RAZWI_HAPPENED_LBW	0x2
-#define RAZWI_HAPPENED_AW	0x4
-#define RAZWI_HAPPENED_AR	0x8
-
-struct hl_eq_razwi_info {
-	__le32 razwi_happened_mask;
-	union {
-		struct hl_eq_razwi_lbw_info_regs lbw;
-		struct hl_eq_razwi_hbw_info_regs hbw;
-	};
-	__le32 pad;
-};
-
-struct hl_eq_razwi_with_intr_cause {
-	struct hl_eq_razwi_info razwi_info;
-	struct hl_eq_intr_cause intr_cause;
-};
-
-#define HBM_CA_ERR_CMD_LIFO_LEN		8
-#define HBM_RD_ERR_DATA_LIFO_LEN	8
-#define HBM_WR_PAR_CMD_LIFO_LEN		11
-
-enum hl_hbm_sei_cause {
-	/* Command/address parity error event is split into 2 events due to
-	 * size limitation: ODD suffix for odd HBM CK_t cycles and EVEN  suffix
-	 * for even HBM CK_t cycles
-	 */
-	HBM_SEI_CMD_PARITY_EVEN,
-	HBM_SEI_CMD_PARITY_ODD,
-	/* Read errors can be reflected as a combination of SERR/DERR/parity
-	 * errors. Therefore, we define one event for all read error types.
-	 * LKD will perform further proccessing.
-	 */
-	HBM_SEI_READ_ERR,
-	HBM_SEI_WRITE_DATA_PARITY_ERR,
-	HBM_SEI_CATTRIP,
-	HBM_SEI_MEM_BIST_FAIL,
-	HBM_SEI_DFI,
-	HBM_SEI_INV_TEMP_READ_OUT,
-	HBM_SEI_BIST_FAIL,
-};
-
-/* Masks for parsing hl_hbm_sei_headr fields */
-#define HBM_ECC_SERR_CNTR_MASK		0xFF
-#define HBM_ECC_DERR_CNTR_MASK		0xFF00
-#define HBM_RD_PARITY_CNTR_MASK		0xFF0000
-
-/* HBM index and MC index are known by the event_id */
-struct hl_hbm_sei_header {
-	union {
-		/* relevant only in case of HBM read error */
-		struct {
-			__u8 ecc_serr_cnt;
-			__u8 ecc_derr_cnt;
-			__u8 read_par_cnt;
-			__u8 reserved;
-		};
-		/* All other cases */
-		__le32 cnt;
-	};
-	__u8 sei_cause;		/* enum hl_hbm_sei_cause */
-	__u8 mc_channel;		/* range: 0-3 */
-	__u8 mc_pseudo_channel;	/* range: 0-7 */
-	__u8 is_critical;
-};
-
-#define HBM_RD_ADDR_SID_SHIFT		0
-#define HBM_RD_ADDR_SID_MASK		0x1
-#define HBM_RD_ADDR_BG_SHIFT		1
-#define HBM_RD_ADDR_BG_MASK		0x6
-#define HBM_RD_ADDR_BA_SHIFT		3
-#define HBM_RD_ADDR_BA_MASK		0x18
-#define HBM_RD_ADDR_COL_SHIFT		5
-#define HBM_RD_ADDR_COL_MASK		0x7E0
-#define HBM_RD_ADDR_ROW_SHIFT		11
-#define HBM_RD_ADDR_ROW_MASK		0x3FFF800
-
-struct hbm_rd_addr {
-	union {
-		/* bit fields are only for FW use */
-		struct {
-			u32 dbg_rd_err_addr_sid:1;
-			u32 dbg_rd_err_addr_bg:2;
-			u32 dbg_rd_err_addr_ba:2;
-			u32 dbg_rd_err_addr_col:6;
-			u32 dbg_rd_err_addr_row:15;
-			u32 reserved:6;
-		};
-		__le32 rd_addr_val;
-	};
-};
-
-#define HBM_RD_ERR_BEAT_SHIFT		2
-/* dbg_rd_err_misc fields: */
-/* Read parity is calculated per DW on every beat */
-#define HBM_RD_ERR_PAR_ERR_BEAT0_SHIFT	0
-#define HBM_RD_ERR_PAR_ERR_BEAT0_MASK	0x3
-#define HBM_RD_ERR_PAR_DATA_BEAT0_SHIFT	8
-#define HBM_RD_ERR_PAR_DATA_BEAT0_MASK	0x300
-/* ECC is calculated per PC on every beat */
-#define HBM_RD_ERR_SERR_BEAT0_SHIFT	16
-#define HBM_RD_ERR_SERR_BEAT0_MASK	0x10000
-#define HBM_RD_ERR_DERR_BEAT0_SHIFT	24
-#define HBM_RD_ERR_DERR_BEAT0_MASK	0x100000
-
-struct hl_eq_hbm_sei_read_err_intr_info {
-	/* DFI_RD_ERR_REP_ADDR */
-	struct hbm_rd_addr dbg_rd_err_addr;
-	/* DFI_RD_ERR_REP_ERR */
-	union {
-		struct {
-			/* bit fields are only for FW use */
-			u32 dbg_rd_err_par:8;
-			u32 dbg_rd_err_par_data:8;
-			u32 dbg_rd_err_serr:4;
-			u32 dbg_rd_err_derr:4;
-			u32 reserved:8;
-		};
-		__le32 dbg_rd_err_misc;
-	};
-	/* DFI_RD_ERR_REP_DM */
-	__le32 dbg_rd_err_dm;
-	/* DFI_RD_ERR_REP_SYNDROME */
-	__le32 dbg_rd_err_syndrome;
-	/* DFI_RD_ERR_REP_DATA */
-	__le32 dbg_rd_err_data[HBM_RD_ERR_DATA_LIFO_LEN];
-};
-
-struct hl_eq_hbm_sei_ca_par_intr_info {
-	/* 14 LSBs */
-	__le16 dbg_row[HBM_CA_ERR_CMD_LIFO_LEN];
-	/* 18 LSBs */
-	__le32 dbg_col[HBM_CA_ERR_CMD_LIFO_LEN];
-};
-
-#define WR_PAR_LAST_CMD_COL_SHIFT	0
-#define WR_PAR_LAST_CMD_COL_MASK	0x3F
-#define WR_PAR_LAST_CMD_BG_SHIFT	6
-#define WR_PAR_LAST_CMD_BG_MASK		0xC0
-#define WR_PAR_LAST_CMD_BA_SHIFT	8
-#define WR_PAR_LAST_CMD_BA_MASK		0x300
-#define WR_PAR_LAST_CMD_SID_SHIFT	10
-#define WR_PAR_LAST_CMD_SID_MASK	0x400
-
-/* Row address isn't latched */
-struct hbm_sei_wr_cmd_address {
-	/* DFI_DERR_LAST_CMD */
-	union {
-		struct {
-			/* bit fields are only for FW use */
-			u32 col:6;
-			u32 bg:2;
-			u32 ba:2;
-			u32 sid:1;
-			u32 reserved:21;
-		};
-		__le32 dbg_wr_cmd_addr;
-	};
-};
-
-struct hl_eq_hbm_sei_wr_par_intr_info {
-	/* entry 0: WR command address from the 1st cycle prior to the error
-	 * entry 1: WR command address from the 2nd cycle prior to the error
-	 * and so on...
-	 */
-	struct hbm_sei_wr_cmd_address dbg_last_wr_cmds[HBM_WR_PAR_CMD_LIFO_LEN];
-	/* derr[0:1] - 1st HBM cycle DERR output
-	 * derr[2:3] - 2nd HBM cycle DERR output
-	 */
-	__u8 dbg_derr;
-	/* extend to reach 8B */
-	__u8 pad[3];
-};
-
-/*
- * this struct represents the following sei causes:
- * command parity, ECC double error, ECC single error, dfi error, cattrip,
- * temperature read-out, read parity error and write parity error.
- * some only use the header while some have extra data.
- */
-struct hl_eq_hbm_sei_data {
-	struct hl_hbm_sei_header hdr;
-	union {
-		struct hl_eq_hbm_sei_ca_par_intr_info ca_parity_even_info;
-		struct hl_eq_hbm_sei_ca_par_intr_info ca_parity_odd_info;
-		struct hl_eq_hbm_sei_read_err_intr_info read_err_info;
-		struct hl_eq_hbm_sei_wr_par_intr_info wr_parity_info;
-	};
-};
-
-/* Engine/farm arc interrupt type */
-enum hl_engine_arc_interrupt_type {
-	/* Qman/farm ARC DCCM QUEUE FULL interrupt type */
-	ENGINE_ARC_DCCM_QUEUE_FULL_IRQ = 1
-};
-
-/* Data structure specifies details of payload of DCCM QUEUE FULL interrupt */
-struct hl_engine_arc_dccm_queue_full_irq {
-	/* Queue index value which caused DCCM QUEUE FULL */
-	__le32 queue_index;
-	__le32 pad;
-};
-
-/* Data structure specifies details of QM/FARM ARC interrupt */
-struct hl_eq_engine_arc_intr_data {
-	/* ARC engine id e.g.  DCORE0_TPC0_QM_ARC, DCORE0_TCP1_QM_ARC */
-	__le32 engine_id;
-	__le32 intr_type; /* enum hl_engine_arc_interrupt_type */
-	/* More info related to the interrupt e.g. queue index
-	 * incase of DCCM_QUEUE_FULL interrupt.
-	 */
-	__le64 payload;
-	__le64 pad[5];
-};
-
-#define ADDR_DEC_ADDRESS_COUNT_MAX 4
-
-/* Data structure specifies details of ADDR_DEC interrupt */
-struct hl_eq_addr_dec_intr_data {
-	struct hl_eq_intr_cause intr_cause;
-	__le64 addr[ADDR_DEC_ADDRESS_COUNT_MAX];
-	__u8 addr_cnt;
-	__u8 pad[7];
-};
-
-struct hl_eq_entry {
-	struct hl_eq_header hdr;
-	union {
-		__le64 data_placeholder;
-		struct hl_eq_ecc_data ecc_data;
-		struct hl_eq_hbm_ecc_data hbm_ecc_data;	/* Obsolete */
-		struct hl_eq_sm_sei_data sm_sei_data;
-		struct cpucp_pkt_sync_err pkt_sync_err;
-		struct hl_eq_fw_alive fw_alive;
-		struct hl_eq_intr_cause intr_cause;
-		struct hl_eq_pcie_drain_ind_data pcie_drain_ind_data;
-		struct hl_eq_razwi_info razwi_info;
-		struct hl_eq_razwi_with_intr_cause razwi_with_intr_cause;
-		struct hl_eq_hbm_sei_data sei_data;	/* Gaudi2 HBM */
-		struct hl_eq_engine_arc_intr_data arc_data;
-		struct hl_eq_addr_dec_intr_data addr_dec;
-		__le64 data[7];
-	};
-};
-
-#define HL_EQ_ENTRY_SIZE		sizeof(struct hl_eq_entry)
-
-#define EQ_CTL_READY_SHIFT		31
-#define EQ_CTL_READY_MASK		0x80000000
-
-#define EQ_CTL_EVENT_TYPE_SHIFT		16
-#define EQ_CTL_EVENT_TYPE_MASK		0x0FFF0000
-
-#define EQ_CTL_INDEX_SHIFT		0
-#define EQ_CTL_INDEX_MASK		0x0000FFFF
-
-enum pq_init_status {
-	PQ_INIT_STATUS_NA = 0,
-	PQ_INIT_STATUS_READY_FOR_CP,
-	PQ_INIT_STATUS_READY_FOR_HOST,
-	PQ_INIT_STATUS_READY_FOR_CP_SINGLE_MSI,
-	PQ_INIT_STATUS_LEN_NOT_POWER_OF_TWO_ERR,
-	PQ_INIT_STATUS_ILLEGAL_Q_ADDR_ERR
-};
-
-/*
- * CpuCP Primary Queue Packets
- *
- * During normal operation, the host's kernel driver needs to send various
- * messages to CpuCP, usually either to SET some value into a H/W periphery or
- * to GET the current value of some H/W periphery. For example, SET the
- * frequency of MME/TPC and GET the value of the thermal sensor.
- *
- * These messages can be initiated either by the User application or by the
- * host's driver itself, e.g. power management code. In either case, the
- * communication from the host's driver to CpuCP will *always* be in
- * synchronous mode, meaning that the host will send a single message and poll
- * until the message was acknowledged and the results are ready (if results are
- * needed).
- *
- * This means that only a single message can be sent at a time and the host's
- * driver must wait for its result before sending the next message. Having said
- * that, because these are control messages which are sent in a relatively low
- * frequency, this limitation seems acceptable. It's important to note that
- * in case of multiple devices, messages to different devices *can* be sent
- * at the same time.
- *
- * The message, inputs/outputs (if relevant) and fence object will be located
- * on the device DDR at an address that will be determined by the host's driver.
- * During device initialization phase, the host will pass to CpuCP that address.
- * Most of the message types will contain inputs/outputs inside the message
- * itself. The common part of each message will contain the opcode of the
- * message (its type) and a field representing a fence object.
- *
- * When the host's driver wishes to send a message to CPU CP, it will write the
- * message contents to the device DDR, clear the fence object and then write to
- * the PSOC_ARC1_AUX_SW_INTR, to issue interrupt 121 to ARC Management CPU.
- *
- * Upon receiving the interrupt (#121), CpuCP will read the message from the
- * DDR. In case the message is a SET operation, CpuCP will first perform the
- * operation and then write to the fence object on the device DDR. In case the
- * message is a GET operation, CpuCP will first fill the results section on the
- * device DDR and then write to the fence object. If an error occurred, CpuCP
- * will fill the rc field with the right error code.
- *
- * In the meantime, the host's driver will poll on the fence object. Once the
- * host sees that the fence object is signaled, it will read the results from
- * the device DDR (if relevant) and resume the code execution in the host's
- * driver.
- *
- * To use QMAN packets, the opcode must be the QMAN opcode, shifted by 8
- * so the value being put by the host's driver matches the value read by CpuCP
- *
- * Non-QMAN packets should be limited to values 1 through (2^8 - 1)
- *
- * Detailed description:
- *
- * CPUCP_PACKET_DISABLE_PCI_ACCESS -
- *       After receiving this packet the embedded CPU must NOT issue PCI
- *       transactions (read/write) towards the Host CPU. This also include
- *       sending MSI-X interrupts.
- *       This packet is usually sent before the device is moved to D3Hot state.
- *
- * CPUCP_PACKET_ENABLE_PCI_ACCESS -
- *       After receiving this packet the embedded CPU is allowed to issue PCI
- *       transactions towards the Host CPU, including sending MSI-X interrupts.
- *       This packet is usually send after the device is moved to D0 state.
- *
- * CPUCP_PACKET_TEMPERATURE_GET -
- *       Fetch the current temperature / Max / Max Hyst / Critical /
- *       Critical Hyst of a specified thermal sensor. The packet's
- *       arguments specify the desired sensor and the field to get.
- *
- * CPUCP_PACKET_VOLTAGE_GET -
- *       Fetch the voltage / Max / Min of a specified sensor. The packet's
- *       arguments specify the sensor and type.
- *
- * CPUCP_PACKET_CURRENT_GET -
- *       Fetch the current / Max / Min of a specified sensor. The packet's
- *       arguments specify the sensor and type.
- *
- * CPUCP_PACKET_FAN_SPEED_GET -
- *       Fetch the speed / Max / Min of a specified fan. The packet's
- *       arguments specify the sensor and type.
- *
- * CPUCP_PACKET_PWM_GET -
- *       Fetch the pwm value / mode of a specified pwm. The packet's
- *       arguments specify the sensor and type.
- *
- * CPUCP_PACKET_PWM_SET -
- *       Set the pwm value / mode of a specified pwm. The packet's
- *       arguments specify the sensor, type and value.
- *
- * CPUCP_PACKET_FREQUENCY_SET -
- *       Set the frequency of a specified PLL. The packet's arguments specify
- *       the PLL and the desired frequency. The actual frequency in the device
- *       might differ from the requested frequency.
- *
- * CPUCP_PACKET_FREQUENCY_GET -
- *       Fetch the frequency of a specified PLL. The packet's arguments specify
- *       the PLL.
- *
- * CPUCP_PACKET_LED_SET -
- *       Set the state of a specified led. The packet's arguments
- *       specify the led and the desired state.
- *
- * CPUCP_PACKET_I2C_WR -
- *       Write 32-bit value to I2C device. The packet's arguments specify the
- *       I2C bus, address and value.
- *
- * CPUCP_PACKET_I2C_RD -
- *       Read 32-bit value from I2C device. The packet's arguments specify the
- *       I2C bus and address.
- *
- * CPUCP_PACKET_INFO_GET -
- *       Fetch information from the device as specified in the packet's
- *       structure. The host's driver passes the max size it allows the CpuCP to
- *       write to the structure, to prevent data corruption in case of
- *       mismatched driver/FW versions.
- *
- * CPUCP_PACKET_FLASH_PROGRAM_REMOVED - this packet was removed
- *
- * CPUCP_PACKET_UNMASK_RAZWI_IRQ -
- *       Unmask the given IRQ. The IRQ number is specified in the value field.
- *       The packet is sent after receiving an interrupt and printing its
- *       relevant information.
- *
- * CPUCP_PACKET_UNMASK_RAZWI_IRQ_ARRAY -
- *       Unmask the given IRQs. The IRQs numbers are specified in an array right
- *       after the cpucp_packet structure, where its first element is the array
- *       length. The packet is sent after a soft reset was done in order to
- *       handle any interrupts that were sent during the reset process.
- *
- * CPUCP_PACKET_TEST -
- *       Test packet for CpuCP connectivity. The CPU will put the fence value
- *       in the result field.
- *
- * CPUCP_PACKET_FREQUENCY_CURR_GET -
- *       Fetch the current frequency of a specified PLL. The packet's arguments
- *       specify the PLL.
- *
- * CPUCP_PACKET_MAX_POWER_GET -
- *       Fetch the maximal power of the device.
- *
- * CPUCP_PACKET_MAX_POWER_SET -
- *       Set the maximal power of the device. The packet's arguments specify
- *       the power.
- *
- * CPUCP_PACKET_EEPROM_DATA_GET -
- *       Get EEPROM data from the CpuCP kernel. The buffer is specified in the
- *       addr field. The CPU will put the returned data size in the result
- *       field. In addition, the host's driver passes the max size it allows the
- *       CpuCP to write to the structure, to prevent data corruption in case of
- *       mismatched driver/FW versions.
- *
- * CPUCP_PACKET_NIC_INFO_GET -
- *       Fetch information from the device regarding the NIC. the host's driver
- *       passes the max size it allows the CpuCP to write to the structure, to
- *       prevent data corruption in case of mismatched driver/FW versions.
- *
- * CPUCP_PACKET_TEMPERATURE_SET -
- *       Set the value of the offset property of a specified thermal sensor.
- *       The packet's arguments specify the desired sensor and the field to
- *       set.
- *
- * CPUCP_PACKET_VOLTAGE_SET -
- *       Trigger the reset_history property of a specified voltage sensor.
- *       The packet's arguments specify the desired sensor and the field to
- *       set.
- *
- * CPUCP_PACKET_CURRENT_SET -
- *       Trigger the reset_history property of a specified current sensor.
- *       The packet's arguments specify the desired sensor and the field to
- *       set.
- *
- * CPUCP_PACKET_PCIE_THROUGHPUT_GET -
- *       Get throughput of PCIe.
- *       The packet's arguments specify the transaction direction (TX/RX).
- *       The window measurement is 10[msec], and the return value is in KB/sec.
- *
- * CPUCP_PACKET_PCIE_REPLAY_CNT_GET
- *       Replay count measures number of "replay" events, which is basicly
- *       number of retries done by PCIe.
- *
- * CPUCP_PACKET_TOTAL_ENERGY_GET -
- *       Total Energy is measurement of energy from the time FW Linux
- *       is loaded. It is calculated by multiplying the average power
- *       by time (passed from armcp start). The units are in MilliJouls.
- *
- * CPUCP_PACKET_PLL_INFO_GET -
- *       Fetch frequencies of PLL from the required PLL IP.
- *       The packet's arguments specify the device PLL type
- *       Pll type is the PLL from device pll_index enum.
- *       The result is composed of 4 outputs, each is 16-bit
- *       frequency in MHz.
- *
- * CPUCP_PACKET_POWER_GET -
- *       Fetch the present power consumption of the device (Current * Voltage).
- *
- * CPUCP_PACKET_NIC_PFC_SET -
- *       Enable/Disable the NIC PFC feature. The packet's arguments specify the
- *       NIC port, relevant lanes to configure and one bit indication for
- *       enable/disable.
- *
- * CPUCP_PACKET_NIC_FAULT_GET -
- *       Fetch the current indication for local/remote faults from the NIC MAC.
- *       The result is 32-bit value of the relevant register.
- *
- * CPUCP_PACKET_NIC_LPBK_SET -
- *       Enable/Disable the MAC loopback feature. The packet's arguments specify
- *       the NIC port, relevant lanes to configure and one bit indication for
- *       enable/disable.
- *
- * CPUCP_PACKET_NIC_MAC_INIT -
- *       Configure the NIC MAC channels. The packet's arguments specify the
- *       NIC port and the speed.
- *
- * CPUCP_PACKET_MSI_INFO_SET -
- *       set the index number for each supported msi type going from
- *       host to device
- *
- * CPUCP_PACKET_NIC_XPCS91_REGS_GET -
- *       Fetch the un/correctable counters values from the NIC MAC.
- *
- * CPUCP_PACKET_NIC_STAT_REGS_GET -
- *       Fetch various NIC MAC counters from the NIC STAT.
- *
- * CPUCP_PACKET_NIC_STAT_REGS_CLR -
- *       Clear the various NIC MAC counters in the NIC STAT.
- *
- * CPUCP_PACKET_NIC_STAT_REGS_ALL_GET -
- *       Fetch all NIC MAC counters from the NIC STAT.
- *
- * CPUCP_PACKET_IS_IDLE_CHECK -
- *       Check if the device is IDLE in regard to the DMA/compute engines
- *       and QMANs. The f/w will return a bitmask where each bit represents
- *       a different engine or QMAN according to enum cpucp_idle_mask.
- *       The bit will be 1 if the engine is NOT idle.
- *
- * CPUCP_PACKET_HBM_REPLACED_ROWS_INFO_GET -
- *       Fetch all HBM replaced-rows and prending to be replaced rows data.
- *
- * CPUCP_PACKET_HBM_PENDING_ROWS_STATUS -
- *       Fetch status of HBM rows pending replacement and need a reboot to
- *       be replaced.
- *
- * CPUCP_PACKET_POWER_SET -
- *       Resets power history of device to 0
- *
- * CPUCP_PACKET_ENGINE_CORE_ASID_SET -
- *       Packet to perform engine core ASID configuration
- *
- * CPUCP_PACKET_SEC_ATTEST_GET -
- *       Get the attestaion data that is collected during various stages of the
- *       boot sequence. the attestation data is also hashed with some unique
- *       number (nonce) provided by the host to prevent replay attacks.
- *       public key and certificate also provided as part of the FW response.
- *
- * CPUCP_PACKET_MONITOR_DUMP_GET -
- *       Get monitors registers dump from the CpuCP kernel.
- *       The CPU will put the registers dump in the a buffer allocated by the driver
- *       which address is passed via the CpuCp packet. In addition, the host's driver
- *       passes the max size it allows the CpuCP to write to the structure, to prevent
- *       data corruption in case of mismatched driver/FW versions.
- *       Obsolete.
- *
- * CPUCP_PACKET_GENERIC_PASSTHROUGH -
- *      Generic opcode for all firmware info that is only passed to host
- *      through the LKD, without getting parsed there.
- *
- * CPUCP_PACKET_ACTIVE_STATUS_SET -
- *       LKD sends FW indication whether device is free or in use, this indication is reported
- *       also to the BMC.
- *
- * CPUCP_PACKET_REGISTER_INTERRUPTS -
- *       Packet to register interrupts indicating LKD is ready to receive events from FW.
- *
- * CPUCP_PACKET_SOFT_RESET -
- *	 Packet to perform soft-reset.
- */
-
-enum cpucp_packet_id {
-	CPUCP_PACKET_DISABLE_PCI_ACCESS = 1,	/* internal */
-	CPUCP_PACKET_ENABLE_PCI_ACCESS,		/* internal */
-	CPUCP_PACKET_TEMPERATURE_GET,		/* sysfs */
-	CPUCP_PACKET_VOLTAGE_GET,		/* sysfs */
-	CPUCP_PACKET_CURRENT_GET,		/* sysfs */
-	CPUCP_PACKET_FAN_SPEED_GET,		/* sysfs */
-	CPUCP_PACKET_PWM_GET,			/* sysfs */
-	CPUCP_PACKET_PWM_SET,			/* sysfs */
-	CPUCP_PACKET_FREQUENCY_SET,		/* sysfs */
-	CPUCP_PACKET_FREQUENCY_GET,		/* sysfs */
-	CPUCP_PACKET_LED_SET,			/* debugfs */
-	CPUCP_PACKET_I2C_WR,			/* debugfs */
-	CPUCP_PACKET_I2C_RD,			/* debugfs */
-	CPUCP_PACKET_INFO_GET,			/* IOCTL */
-	CPUCP_PACKET_FLASH_PROGRAM_REMOVED,
-	CPUCP_PACKET_UNMASK_RAZWI_IRQ,		/* internal */
-	CPUCP_PACKET_UNMASK_RAZWI_IRQ_ARRAY,	/* internal */
-	CPUCP_PACKET_TEST,			/* internal */
-	CPUCP_PACKET_FREQUENCY_CURR_GET,	/* sysfs */
-	CPUCP_PACKET_MAX_POWER_GET,		/* sysfs */
-	CPUCP_PACKET_MAX_POWER_SET,		/* sysfs */
-	CPUCP_PACKET_EEPROM_DATA_GET,		/* sysfs */
-	CPUCP_PACKET_NIC_INFO_GET,		/* internal */
-	CPUCP_PACKET_TEMPERATURE_SET,		/* sysfs */
-	CPUCP_PACKET_VOLTAGE_SET,		/* sysfs */
-	CPUCP_PACKET_CURRENT_SET,		/* sysfs */
-	CPUCP_PACKET_PCIE_THROUGHPUT_GET,	/* internal */
-	CPUCP_PACKET_PCIE_REPLAY_CNT_GET,	/* internal */
-	CPUCP_PACKET_TOTAL_ENERGY_GET,		/* internal */
-	CPUCP_PACKET_PLL_INFO_GET,		/* internal */
-	CPUCP_PACKET_NIC_STATUS,		/* internal */
-	CPUCP_PACKET_POWER_GET,			/* internal */
-	CPUCP_PACKET_NIC_PFC_SET,		/* internal */
-	CPUCP_PACKET_NIC_FAULT_GET,		/* internal */
-	CPUCP_PACKET_NIC_LPBK_SET,		/* internal */
-	CPUCP_PACKET_NIC_MAC_CFG,		/* internal */
-	CPUCP_PACKET_MSI_INFO_SET,		/* internal */
-	CPUCP_PACKET_NIC_XPCS91_REGS_GET,	/* internal */
-	CPUCP_PACKET_NIC_STAT_REGS_GET,		/* internal */
-	CPUCP_PACKET_NIC_STAT_REGS_CLR,		/* internal */
-	CPUCP_PACKET_NIC_STAT_REGS_ALL_GET,	/* internal */
-	CPUCP_PACKET_IS_IDLE_CHECK,		/* internal */
-	CPUCP_PACKET_HBM_REPLACED_ROWS_INFO_GET,/* internal */
-	CPUCP_PACKET_HBM_PENDING_ROWS_STATUS,	/* internal */
-	CPUCP_PACKET_POWER_SET,			/* internal */
-	CPUCP_PACKET_RESERVED,			/* not used */
-	CPUCP_PACKET_ENGINE_CORE_ASID_SET,	/* internal */
-	CPUCP_PACKET_RESERVED2,			/* not used */
-	CPUCP_PACKET_SEC_ATTEST_GET,		/* internal */
-	CPUCP_PACKET_RESERVED3,			/* not used */
-	CPUCP_PACKET_RESERVED4,			/* not used */
-	CPUCP_PACKET_MONITOR_DUMP_GET,		/* debugfs */
-	CPUCP_PACKET_RESERVED5,			/* not used */
-	CPUCP_PACKET_RESERVED6,			/* not used */
-	CPUCP_PACKET_RESERVED7,			/* not used */
-	CPUCP_PACKET_GENERIC_PASSTHROUGH,	/* IOCTL */
-	CPUCP_PACKET_RESERVED8,			/* not used */
-	CPUCP_PACKET_ACTIVE_STATUS_SET,		/* internal */
-	CPUCP_PACKET_RESERVED9,			/* not used */
-	CPUCP_PACKET_RESERVED10,		/* not used */
-	CPUCP_PACKET_RESERVED11,		/* not used */
-	CPUCP_PACKET_RESERVED12,		/* internal */
-	CPUCP_PACKET_REGISTER_INTERRUPTS,	/* internal */
-	CPUCP_PACKET_SOFT_RESET,		/* internal */
-	CPUCP_PACKET_ID_MAX			/* must be last */
-};
-
-#define CPUCP_PACKET_FENCE_VAL	0xFE8CE7A5
-
-#define CPUCP_PKT_CTL_RC_SHIFT		12
-#define CPUCP_PKT_CTL_RC_MASK		0x0000F000
-
-#define CPUCP_PKT_CTL_OPCODE_SHIFT	16
-#define CPUCP_PKT_CTL_OPCODE_MASK	0x1FFF0000
-
-#define CPUCP_PKT_RES_PLL_OUT0_SHIFT	0
-#define CPUCP_PKT_RES_PLL_OUT0_MASK	0x000000000000FFFFull
-#define CPUCP_PKT_RES_PLL_OUT1_SHIFT	16
-#define CPUCP_PKT_RES_PLL_OUT1_MASK	0x00000000FFFF0000ull
-#define CPUCP_PKT_RES_PLL_OUT2_SHIFT	32
-#define CPUCP_PKT_RES_PLL_OUT2_MASK	0x0000FFFF00000000ull
-#define CPUCP_PKT_RES_PLL_OUT3_SHIFT	48
-#define CPUCP_PKT_RES_PLL_OUT3_MASK	0xFFFF000000000000ull
-
-#define CPUCP_PKT_RES_EEPROM_OUT0_SHIFT	0
-#define CPUCP_PKT_RES_EEPROM_OUT0_MASK	0x000000000000FFFFull
-#define CPUCP_PKT_RES_EEPROM_OUT1_SHIFT	16
-#define CPUCP_PKT_RES_EEPROM_OUT1_MASK	0x0000000000FF0000ull
-
-#define CPUCP_PKT_VAL_PFC_IN1_SHIFT	0
-#define CPUCP_PKT_VAL_PFC_IN1_MASK	0x0000000000000001ull
-#define CPUCP_PKT_VAL_PFC_IN2_SHIFT	1
-#define CPUCP_PKT_VAL_PFC_IN2_MASK	0x000000000000001Eull
-
-#define CPUCP_PKT_VAL_LPBK_IN1_SHIFT	0
-#define CPUCP_PKT_VAL_LPBK_IN1_MASK	0x0000000000000001ull
-#define CPUCP_PKT_VAL_LPBK_IN2_SHIFT	1
-#define CPUCP_PKT_VAL_LPBK_IN2_MASK	0x000000000000001Eull
-
-#define CPUCP_PKT_VAL_MAC_CNT_IN1_SHIFT	0
-#define CPUCP_PKT_VAL_MAC_CNT_IN1_MASK	0x0000000000000001ull
-#define CPUCP_PKT_VAL_MAC_CNT_IN2_SHIFT	1
-#define CPUCP_PKT_VAL_MAC_CNT_IN2_MASK	0x00000000FFFFFFFEull
-
-/* heartbeat status bits */
-#define CPUCP_PKT_HB_STATUS_EQ_FAULT_SHIFT		0
-#define CPUCP_PKT_HB_STATUS_EQ_FAULT_MASK		0x00000001
-
-struct cpucp_packet {
-	union {
-		__le64 value;	/* For SET packets */
-		__le64 result;	/* For GET packets */
-		__le64 addr;	/* For PQ */
-	};
-
-	__le32 ctl;
-
-	__le32 fence;		/* Signal to host that message is completed */
-
-	union {
-		struct {/* For temperature/current/voltage/fan/pwm get/set */
-			__le16 sensor_index;
-			__le16 type;
-		};
-
-		struct {	/* For I2C read/write */
-			__u8 i2c_bus;
-			__u8 i2c_addr;
-			__u8 i2c_reg;
-			/*
-			 * In legacy implemetations, i2c_len was not present,
-			 * was unused and just added as pad.
-			 * So if i2c_len is 0, it is treated as legacy
-			 * and r/w 1 Byte, else if i2c_len is specified,
-			 * its treated as new multibyte r/w support.
-			 */
-			__u8 i2c_len;
-		};
-
-		struct {/* For PLL info fetch */
-			__le16 pll_type;
-			/* TODO pll_reg is kept temporary before removal */
-			__le16 pll_reg;
-		};
-
-		/* For any general request */
-		__le32 index;
-
-		/* For frequency get/set */
-		__le32 pll_index;
-
-		/* For led set */
-		__le32 led_index;
-
-		/* For get CpuCP info/EEPROM data/NIC info */
-		__le32 data_max_size;
-
-		/*
-		 * For any general status bitmask. Shall be used whenever the
-		 * result cannot be used to hold general purpose data.
-		 */
-		__le32 status_mask;
-
-		/* random, used once number, for security packets */
-		__le32 nonce;
-	};
-
-	union {
-		/* For NIC requests */
-		__le32 port_index;
-
-		/* For Generic packet sub index */
-		__le32 pkt_subidx;
-	};
-};
-
-struct cpucp_unmask_irq_arr_packet {
-	struct cpucp_packet cpucp_pkt;
-	__le32 length;
-	__le32 irqs[];
-};
-
-struct cpucp_nic_status_packet {
-	struct cpucp_packet cpucp_pkt;
-	__le32 length;
-	__le32 data[];
-};
-
-struct cpucp_array_data_packet {
-	struct cpucp_packet cpucp_pkt;
-	__le32 length;
-	__le32 data[];
-};
-
-enum cpucp_led_index {
-	CPUCP_LED0_INDEX = 0,
-	CPUCP_LED1_INDEX,
-	CPUCP_LED2_INDEX,
-	CPUCP_LED_MAX_INDEX = CPUCP_LED2_INDEX
-};
-
-/*
- * enum cpucp_packet_rc - Error return code
- * @cpucp_packet_success	-> in case of success.
- * @cpucp_packet_invalid	-> this is to support first generation platforms.
- * @cpucp_packet_fault		-> in case of processing error like failing to
- *                                 get device binding or semaphore etc.
- * @cpucp_packet_invalid_pkt	-> when cpucp packet is un-supported.
- * @cpucp_packet_invalid_params	-> when checking parameter like length of buffer
- *				   or attribute value etc.
- * @cpucp_packet_rc_max		-> It indicates size of enum so should be at last.
- */
-enum cpucp_packet_rc {
-	cpucp_packet_success,
-	cpucp_packet_invalid,
-	cpucp_packet_fault,
-	cpucp_packet_invalid_pkt,
-	cpucp_packet_invalid_params,
-	cpucp_packet_rc_max
-};
-
-/*
- * cpucp_temp_type should adhere to hwmon_temp_attributes
- * defined in Linux kernel hwmon.h file
- */
-enum cpucp_temp_type {
-	cpucp_temp_input,
-	cpucp_temp_min = 4,
-	cpucp_temp_min_hyst,
-	cpucp_temp_max = 6,
-	cpucp_temp_max_hyst,
-	cpucp_temp_crit,
-	cpucp_temp_crit_hyst,
-	cpucp_temp_offset = 19,
-	cpucp_temp_lowest = 21,
-	cpucp_temp_highest = 22,
-	cpucp_temp_reset_history = 23,
-	cpucp_temp_warn = 24,
-	cpucp_temp_max_crit = 25,
-	cpucp_temp_max_warn = 26,
-};
-
-enum cpucp_in_attributes {
-	cpucp_in_input,
-	cpucp_in_min,
-	cpucp_in_max,
-	cpucp_in_lowest = 6,
-	cpucp_in_highest = 7,
-	cpucp_in_reset_history,
-	cpucp_in_intr_alarm_a,
-	cpucp_in_intr_alarm_b,
-};
-
-enum cpucp_curr_attributes {
-	cpucp_curr_input,
-	cpucp_curr_min,
-	cpucp_curr_max,
-	cpucp_curr_lowest = 6,
-	cpucp_curr_highest = 7,
-	cpucp_curr_reset_history
-};
-
-enum cpucp_fan_attributes {
-	cpucp_fan_input,
-	cpucp_fan_min = 2,
-	cpucp_fan_max
-};
-
-enum cpucp_pwm_attributes {
-	cpucp_pwm_input,
-	cpucp_pwm_enable
-};
-
-enum cpucp_pcie_throughput_attributes {
-	cpucp_pcie_throughput_tx,
-	cpucp_pcie_throughput_rx
-};
-
-/* TODO temporary kept before removal */
-enum cpucp_pll_reg_attributes {
-	cpucp_pll_nr_reg,
-	cpucp_pll_nf_reg,
-	cpucp_pll_od_reg,
-	cpucp_pll_div_factor_reg,
-	cpucp_pll_div_sel_reg
-};
-
-/* TODO temporary kept before removal */
-enum cpucp_pll_type_attributes {
-	cpucp_pll_cpu,
-	cpucp_pll_pci,
-};
-
-/*
- * cpucp_power_type aligns with hwmon_power_attributes
- * defined in Linux kernel hwmon.h file
- */
-enum cpucp_power_type {
-	CPUCP_POWER_INPUT = 8,
-	CPUCP_POWER_INPUT_HIGHEST = 9,
-	CPUCP_POWER_RESET_INPUT_HISTORY = 11
-};
-
-/*
- * MSI type enumeration table for all ASICs and future SW versions.
- * For future ASIC-LKD compatibility, we can only add new enumerations.
- * at the end of the table (before CPUCP_NUM_OF_MSI_TYPES).
- * Changing the order of entries or removing entries is not allowed.
- */
-enum cpucp_msi_type {
-	CPUCP_EVENT_QUEUE_MSI_TYPE,
-	CPUCP_NIC_PORT1_MSI_TYPE,
-	CPUCP_NIC_PORT3_MSI_TYPE,
-	CPUCP_NIC_PORT5_MSI_TYPE,
-	CPUCP_NIC_PORT7_MSI_TYPE,
-	CPUCP_NIC_PORT9_MSI_TYPE,
-	CPUCP_NUM_OF_MSI_TYPES
-};
-
-/*
- * PLL enumeration table used for all ASICs and future SW versions.
- * For future ASIC-LKD compatibility, we can only add new enumerations.
- * at the end of the table.
- * Changing the order of entries or removing entries is not allowed.
- */
-enum pll_index {
-	CPU_PLL = 0,
-	PCI_PLL = 1,
-	NIC_PLL = 2,
-	DMA_PLL = 3,
-	MESH_PLL = 4,
-	MME_PLL = 5,
-	TPC_PLL = 6,
-	IF_PLL = 7,
-	SRAM_PLL = 8,
-	NS_PLL = 9,
-	HBM_PLL = 10,
-	MSS_PLL = 11,
-	DDR_PLL = 12,
-	VID_PLL = 13,
-	BANK_PLL = 14,
-	MMU_PLL = 15,
-	IC_PLL = 16,
-	MC_PLL = 17,
-	EMMC_PLL = 18,
-	D2D_PLL = 19,
-	CS_PLL = 20,
-	C2C_PLL = 21,
-	NCH_PLL = 22,
-	C2M_PLL = 23,
-	PLL_MAX
-};
-
-enum rl_index {
-	TPC_RL = 0,
-	MME_RL,
-	EDMA_RL,
-};
-
-enum pvt_index {
-	PVT_SW,
-	PVT_SE,
-	PVT_NW,
-	PVT_NE
-};
-
-/* Event Queue Packets */
-
-struct eq_generic_event {
-	__le64 data[7];
-};
-
-/*
- * CpuCP info
- */
-
-#define CARD_NAME_MAX_LEN		16
-#define CPUCP_MAX_SENSORS		128
-#define CPUCP_MAX_NICS			128
-#define CPUCP_LANES_PER_NIC		4
-#define CPUCP_NIC_QSFP_EEPROM_MAX_LEN	1024
-#define CPUCP_MAX_NIC_LANES		(CPUCP_MAX_NICS * CPUCP_LANES_PER_NIC)
-#define CPUCP_NIC_MASK_ARR_LEN		((CPUCP_MAX_NICS + 63) / 64)
-#define CPUCP_NIC_POLARITY_ARR_LEN	((CPUCP_MAX_NIC_LANES + 63) / 64)
-#define CPUCP_HBM_ROW_REPLACE_MAX	32
-
-struct cpucp_sensor {
-	__le32 type;
-	__le32 flags;
-};
-
-/**
- * struct cpucp_card_types - ASIC card type.
- * @cpucp_card_type_pci: PCI card.
- * @cpucp_card_type_pmc: PCI Mezzanine Card.
- */
-enum cpucp_card_types {
-	cpucp_card_type_pci,
-	cpucp_card_type_pmc
-};
-
-#define CPUCP_SEC_CONF_ENABLED_SHIFT	0
-#define CPUCP_SEC_CONF_ENABLED_MASK	0x00000001
-
-#define CPUCP_SEC_CONF_FLASH_WP_SHIFT	1
-#define CPUCP_SEC_CONF_FLASH_WP_MASK	0x00000002
-
-#define CPUCP_SEC_CONF_EEPROM_WP_SHIFT	2
-#define CPUCP_SEC_CONF_EEPROM_WP_MASK	0x00000004
-
-/**
- * struct cpucp_security_info - Security information.
- * @config: configuration bit field
- * @keys_num: number of stored keys
- * @revoked_keys: revoked keys bit field
- * @min_svn: minimal security version
- */
-struct cpucp_security_info {
-	__u8 config;
-	__u8 keys_num;
-	__u8 revoked_keys;
-	__u8 min_svn;
-};
-
-/**
- * struct cpucp_info - Info from CpuCP that is necessary to the host's driver
- * @sensors: available sensors description.
- * @kernel_version: CpuCP linux kernel version.
- * @reserved: reserved field.
- * @card_type: card configuration type.
- * @card_location: in a server, each card has different connections topology
- *                 depending on its location (relevant for PMC card type)
- * @cpld_version: CPLD programmed F/W version.
- * @infineon_version: Infineon main DC-DC version.
- * @fuse_version: silicon production FUSE information.
- * @thermal_version: thermald S/W version.
- * @cpucp_version: CpuCP S/W version.
- * @infineon_second_stage_version: Infineon 2nd stage DC-DC version.
- * @dram_size: available DRAM size.
- * @card_name: card name that will be displayed in HWMON subsystem on the host
- * @tpc_binning_mask: TPC binning mask, 1 bit per TPC instance
- *                    (0 = functional, 1 = binned)
- * @decoder_binning_mask: Decoder binning mask, 1 bit per decoder instance
- *                        (0 = functional, 1 = binned), maximum 1 per dcore
- * @sram_binning: Categorize SRAM functionality
- *                (0 = fully functional, 1 = lower-half is not functional,
- *                 2 = upper-half is not functional)
- * @sec_info: security information
- * @pll_map: Bit map of supported PLLs for current ASIC version.
- * @mme_binning_mask: MME binning mask,
- *                    bits [0:6]   <==> dcore0 mme fma
- *                    bits [7:13]  <==> dcore1 mme fma
- *                    bits [14:20] <==> dcore0 mme ima
- *                    bits [21:27] <==> dcore1 mme ima
- *                    For each group, if the 6th bit is set then first 5 bits
- *                    represent the col's idx [0-31], otherwise these bits are
- *                    ignored, and col idx 32 is binned. 7th bit is don't care.
- * @dram_binning_mask: DRAM binning mask, 1 bit per dram instance
- *                     (0 = functional 1 = binned)
- * @memory_repair_flag: eFuse flag indicating memory repair
- * @edma_binning_mask: EDMA binning mask, 1 bit per EDMA instance
- *                     (0 = functional 1 = binned)
- * @xbar_binning_mask: Xbar binning mask, 1 bit per Xbar instance
- *                     (0 = functional 1 = binned)
- * @interposer_version: Interposer version programmed in eFuse
- * @substrate_version: Substrate version programmed in eFuse
- * @fw_hbm_region_size: Size in bytes of FW reserved region in HBM.
- * @fw_os_version: Firmware OS Version
- */
-struct cpucp_info {
-	struct cpucp_sensor sensors[CPUCP_MAX_SENSORS];
-	__u8 kernel_version[VERSION_MAX_LEN];
-	__le32 reserved;
-	__le32 card_type;
-	__le32 card_location;
-	__le32 cpld_version;
-	__le32 infineon_version;
-	__u8 fuse_version[VERSION_MAX_LEN];
-	__u8 thermal_version[VERSION_MAX_LEN];
-	__u8 cpucp_version[VERSION_MAX_LEN];
-	__le32 infineon_second_stage_version;
-	__le64 dram_size;
-	char card_name[CARD_NAME_MAX_LEN];
-	__le64 tpc_binning_mask;
-	__le64 decoder_binning_mask;
-	__u8 sram_binning;
-	__u8 dram_binning_mask;
-	__u8 memory_repair_flag;
-	__u8 edma_binning_mask;
-	__u8 xbar_binning_mask;
-	__u8 interposer_version;
-	__u8 substrate_version;
-	__u8 reserved2;
-	struct cpucp_security_info sec_info;
-	__le32 fw_hbm_region_size;
-	__u8 pll_map[PLL_MAP_LEN];
-	__le64 mme_binning_mask;
-	__u8 fw_os_version[VERSION_MAX_LEN];
-};
-
-struct cpucp_mac_addr {
-	__u8 mac_addr[ETH_ALEN];
-};
-
-enum cpucp_serdes_type {
-	TYPE_1_SERDES_TYPE,
-	TYPE_2_SERDES_TYPE,
-	HLS1_SERDES_TYPE,
-	HLS1H_SERDES_TYPE,
-	HLS2_SERDES_TYPE,
-	HLS2_TYPE_1_SERDES_TYPE,
-	MAX_NUM_SERDES_TYPE,		/* number of types */
-	UNKNOWN_SERDES_TYPE = 0xFFFF	/* serdes_type is u16 */
-};
-
-struct cpucp_nic_info {
-	struct cpucp_mac_addr mac_addrs[CPUCP_MAX_NICS];
-	__le64 link_mask[CPUCP_NIC_MASK_ARR_LEN];
-	__le64 pol_tx_mask[CPUCP_NIC_POLARITY_ARR_LEN];
-	__le64 pol_rx_mask[CPUCP_NIC_POLARITY_ARR_LEN];
-	__le64 link_ext_mask[CPUCP_NIC_MASK_ARR_LEN];
-	__u8 qsfp_eeprom[CPUCP_NIC_QSFP_EEPROM_MAX_LEN];
-	__le64 auto_neg_mask[CPUCP_NIC_MASK_ARR_LEN];
-	__le16 serdes_type; /* enum cpucp_serdes_type */
-	__le16 tx_swap_map[CPUCP_MAX_NICS];
-	__u8 reserved[6];
-};
-
-#define PAGE_DISCARD_MAX	64
-
-struct page_discard_info {
-	__u8 num_entries;
-	__u8 reserved[7];
-	__le32 mmu_page_idx[PAGE_DISCARD_MAX];
-};
-
-/*
- * struct frac_val - fracture value represented by "integer.frac".
- * @integer: the integer part of the fracture value;
- * @frac: the fracture part of the fracture value.
- */
-struct frac_val {
-	union {
-		struct {
-			__le16 integer;
-			__le16 frac;
-		};
-		__le32 val;
-	};
-};
-
-/*
- * struct ser_val - the SER (symbol error rate) value is represented by "integer * 10 ^ -exp".
- * @integer: the integer part of the SER value;
- * @exp: the exponent part of the SER value.
- */
-struct ser_val {
-	__le16 integer;
-	__le16 exp;
-};
-
-/*
- * struct cpucp_nic_status - describes the status of a NIC port.
- * @port: NIC port index.
- * @bad_format_cnt: e.g. CRC.
- * @responder_out_of_sequence_psn_cnt: e.g NAK.
- * @high_ber_reinit_cnt: link reinit due to high BER.
- * @correctable_err_cnt: e.g. bit-flip.
- * @uncorrectable_err_cnt: e.g. MAC errors.
- * @retraining_cnt: re-training counter.
- * @up: is port up.
- * @pcs_link: has PCS link.
- * @phy_ready: is PHY ready.
- * @auto_neg: is Autoneg enabled.
- * @timeout_retransmission_cnt: timeout retransmission events.
- * @high_ber_cnt: high ber events.
- * @pre_fec_ser: pre FEC SER value.
- * @post_fec_ser: post FEC SER value.
- * @throughput: measured throughput.
- * @latency: measured latency.
- */
-struct cpucp_nic_status {
-	__le32 port;
-	__le32 bad_format_cnt;
-	__le32 responder_out_of_sequence_psn_cnt;
-	__le32 high_ber_reinit;
-	__le32 correctable_err_cnt;
-	__le32 uncorrectable_err_cnt;
-	__le32 retraining_cnt;
-	__u8 up;
-	__u8 pcs_link;
-	__u8 phy_ready;
-	__u8 auto_neg;
-	__le32 timeout_retransmission_cnt;
-	__le32 high_ber_cnt;
-	struct ser_val pre_fec_ser;
-	struct ser_val post_fec_ser;
-	struct frac_val bandwidth;
-	struct frac_val lat;
-};
-
-enum cpucp_hbm_row_replace_cause {
-	REPLACE_CAUSE_DOUBLE_ECC_ERR,
-	REPLACE_CAUSE_MULTI_SINGLE_ECC_ERR,
-};
-
-struct cpucp_hbm_row_info {
-	__u8 hbm_idx;
-	__u8 pc;
-	__u8 sid;
-	__u8 bank_idx;
-	__le16 row_addr;
-	__u8 replaced_row_cause; /* enum cpucp_hbm_row_replace_cause */
-	__u8 pad;
-};
-
-struct cpucp_hbm_row_replaced_rows_info {
-	__le16 num_replaced_rows;
-	__u8 pad[6];
-	struct cpucp_hbm_row_info replaced_rows[CPUCP_HBM_ROW_REPLACE_MAX];
-};
-
-enum cpu_reset_status {
-	CPU_RST_STATUS_NA = 0,
-	CPU_RST_STATUS_SOFT_RST_DONE = 1,
-};
-
-#define SEC_PCR_DATA_BUF_SZ	256
-#define SEC_PCR_QUOTE_BUF_SZ	510	/* (512 - 2) 2 bytes used for size */
-#define SEC_SIGNATURE_BUF_SZ	255	/* (256 - 1) 1 byte used for size */
-#define SEC_PUB_DATA_BUF_SZ	510	/* (512 - 2) 2 bytes used for size */
-#define SEC_CERTIFICATE_BUF_SZ	2046	/* (2048 - 2) 2 bytes used for size */
-
-/*
- * struct cpucp_sec_attest_info - attestation report of the boot
- * @pcr_data: raw values of the PCR registers
- * @pcr_num_reg: number of PCR registers in the pcr_data array
- * @pcr_reg_len: length of each PCR register in the pcr_data array (bytes)
- * @nonce: number only used once. random number provided by host. this also
- *	    passed to the quote command as a qualifying data.
- * @pcr_quote_len: length of the attestation quote data (bytes)
- * @pcr_quote: attestation report data structure
- * @quote_sig_len: length of the attestation report signature (bytes)
- * @quote_sig: signature structure of the attestation report
- * @pub_data_len: length of the public data (bytes)
- * @public_data: public key for the signed attestation
- *		 (outPublic + name + qualifiedName)
- * @certificate_len: length of the certificate (bytes)
- * @certificate: certificate for the attestation signing key
- */
-struct cpucp_sec_attest_info {
-	__u8 pcr_data[SEC_PCR_DATA_BUF_SZ];
-	__u8 pcr_num_reg;
-	__u8 pcr_reg_len;
-	__le16 pad0;
-	__le32 nonce;
-	__le16 pcr_quote_len;
-	__u8 pcr_quote[SEC_PCR_QUOTE_BUF_SZ];
-	__u8 quote_sig_len;
-	__u8 quote_sig[SEC_SIGNATURE_BUF_SZ];
-	__le16 pub_data_len;
-	__u8 public_data[SEC_PUB_DATA_BUF_SZ];
-	__le16 certificate_len;
-	__u8 certificate[SEC_CERTIFICATE_BUF_SZ];
-};
-
-/*
- * struct cpucp_dev_info_signed - device information signed by a secured device
- * @info: device information structure as defined above
- * @nonce: number only used once. random number provided by host. this number is
- *	   hashed and signed along with the device information.
- * @info_sig_len: length of the attestation signature (bytes)
- * @info_sig: signature of the info + nonce data.
- * @pub_data_len: length of the public data (bytes)
- * @public_data: public key info signed info data
- *		 (outPublic + name + qualifiedName)
- * @certificate_len: length of the certificate (bytes)
- * @certificate: certificate for the signing key
- */
-struct cpucp_dev_info_signed {
-	struct cpucp_info info;	/* assumed to be 64bit aligned */
-	__le32 nonce;
-	__le32 pad0;
-	__u8 info_sig_len;
-	__u8 info_sig[SEC_SIGNATURE_BUF_SZ];
-	__le16 pub_data_len;
-	__u8 public_data[SEC_PUB_DATA_BUF_SZ];
-	__le16 certificate_len;
-	__u8 certificate[SEC_CERTIFICATE_BUF_SZ];
-};
-
-#define DCORE_MON_REGS_SZ	512
-/*
- * struct dcore_monitor_regs_data - DCORE monitor regs data.
- * the structure follows sync manager block layout. Obsolete.
- * @mon_pay_addrl: array of payload address low bits.
- * @mon_pay_addrh: array of payload address high bits.
- * @mon_pay_data: array of payload data.
- * @mon_arm: array of monitor arm.
- * @mon_status: array of monitor status.
- */
-struct dcore_monitor_regs_data {
-	__le32 mon_pay_addrl[DCORE_MON_REGS_SZ];
-	__le32 mon_pay_addrh[DCORE_MON_REGS_SZ];
-	__le32 mon_pay_data[DCORE_MON_REGS_SZ];
-	__le32 mon_arm[DCORE_MON_REGS_SZ];
-	__le32 mon_status[DCORE_MON_REGS_SZ];
-};
-
-/* contains SM data for each SYNC_MNGR (Obsolete) */
-struct cpucp_monitor_dump {
-	struct dcore_monitor_regs_data sync_mngr_w_s;
-	struct dcore_monitor_regs_data sync_mngr_e_s;
-	struct dcore_monitor_regs_data sync_mngr_w_n;
-	struct dcore_monitor_regs_data sync_mngr_e_n;
-};
-
-/*
- * The Type of the generic request (and other input arguments) will be fetched from user by reading
- * from "pkt_subidx" field in struct cpucp_packet.
- *
- * HL_PASSTHROUGHT_VERSIONS	- Fetch all firmware versions.
- */
-enum hl_passthrough_type {
-	HL_PASSTHROUGH_VERSIONS,
-};
-
-#endif /* CPUCP_IF_H */
diff --git a/drivers/accel/habanalabs/include/common/hl_boot_if.h b/drivers/accel/habanalabs/include/common/hl_boot_if.h
deleted file mode 100644
index 7de8a5786a36..000000000000
--- a/drivers/accel/habanalabs/include/common/hl_boot_if.h
+++ /dev/null
@@ -1,790 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0
- *
- * Copyright 2018-2020 HabanaLabs, Ltd.
- * All Rights Reserved.
- *
- */
-
-#ifndef HL_BOOT_IF_H
-#define HL_BOOT_IF_H
-
-#define LKD_HARD_RESET_MAGIC		0xED7BD694 /* deprecated - do not use */
-#define HL_POWER9_HOST_MAGIC		0x1DA30009
-
-#define BOOT_FIT_SRAM_OFFSET		0x200000
-
-#define VERSION_MAX_LEN			128
-
-enum cpu_boot_err {
-	CPU_BOOT_ERR_DRAM_INIT_FAIL = 0,
-	CPU_BOOT_ERR_FIT_CORRUPTED = 1,
-	CPU_BOOT_ERR_TS_INIT_FAIL = 2,
-	CPU_BOOT_ERR_DRAM_SKIPPED = 3,
-	CPU_BOOT_ERR_BMC_WAIT_SKIPPED = 4,
-	CPU_BOOT_ERR_NIC_DATA_NOT_RDY = 5,
-	CPU_BOOT_ERR_NIC_FW_FAIL = 6,
-	CPU_BOOT_ERR_SECURITY_NOT_RDY = 7,
-	CPU_BOOT_ERR_SECURITY_FAIL = 8,
-	CPU_BOOT_ERR_EFUSE_FAIL = 9,
-	CPU_BOOT_ERR_PRI_IMG_VER_FAIL = 10,
-	CPU_BOOT_ERR_SEC_IMG_VER_FAIL = 11,
-	CPU_BOOT_ERR_PLL_FAIL = 12,
-	CPU_BOOT_ERR_DEVICE_UNUSABLE_FAIL = 13,
-	CPU_BOOT_ERR_BOOT_FW_CRIT_ERR = 18,
-	CPU_BOOT_ERR_BINNING_FAIL = 19,
-	CPU_BOOT_ERR_TPM_FAIL = 20,
-	CPU_BOOT_ERR_TMP_THRESH_INIT_FAIL = 21,
-	CPU_BOOT_ERR_EEPROM_FAIL = 22,
-	CPU_BOOT_ERR_ENG_ARC_MEM_SCRUB_FAIL = 23,
-	CPU_BOOT_ERR_ENABLED = 31,
-	CPU_BOOT_ERR_SCND_EN = 63,
-	CPU_BOOT_ERR_LAST = 64 /* we have 2 registers of 32 bits */
-};
-
-/*
- * Mask for fatal failures
- * This mask contains all possible fatal failures, and a dynamic code
- * will clear the non-relevant ones.
- */
-#define CPU_BOOT_ERR_FATAL_MASK					\
-		((1 << CPU_BOOT_ERR_DRAM_INIT_FAIL) |		\
-		 (1 << CPU_BOOT_ERR_PLL_FAIL) |			\
-		 (1 << CPU_BOOT_ERR_DEVICE_UNUSABLE_FAIL) |	\
-		 (1 << CPU_BOOT_ERR_BINNING_FAIL) |		\
-		 (1 << CPU_BOOT_ERR_DRAM_SKIPPED) |		\
-		 (1 << CPU_BOOT_ERR_ENG_ARC_MEM_SCRUB_FAIL) |	\
-		 (1 << CPU_BOOT_ERR_EEPROM_FAIL))
-
-/*
- * CPU error bits in BOOT_ERROR registers
- *
- * CPU_BOOT_ERR0_DRAM_INIT_FAIL		DRAM initialization failed.
- *					DRAM is not reliable to use.
- *
- * CPU_BOOT_ERR0_FIT_CORRUPTED		FIT data integrity verification of the
- *					image provided by the host has failed.
- *
- * CPU_BOOT_ERR0_TS_INIT_FAIL		Thermal Sensor initialization failed.
- *					Boot continues as usual, but keep in
- *					mind this is a warning.
- *
- * CPU_BOOT_ERR0_DRAM_SKIPPED		DRAM initialization has been skipped.
- *					Skipping DRAM initialization has been
- *					requested (e.g. strap, command, etc.)
- *					and FW skipped the DRAM initialization.
- *					Host can initialize the DRAM.
- *
- * CPU_BOOT_ERR0_BMC_WAIT_SKIPPED	Waiting for BMC data will be skipped.
- *					Meaning the BMC data might not be
- *					available until reset.
- *
- * CPU_BOOT_ERR0_NIC_DATA_NOT_RDY	NIC data from BMC is not ready.
- *					BMC has not provided the NIC data yet.
- *					Once provided this bit will be cleared.
- *
- * CPU_BOOT_ERR0_NIC_FW_FAIL		NIC FW loading failed.
- *					The NIC FW loading and initialization
- *					failed. This means NICs are not usable.
- *
- * CPU_BOOT_ERR0_SECURITY_NOT_RDY	Chip security initialization has been
- *					started, but is not ready yet - chip
- *					cannot be accessed.
- *
- * CPU_BOOT_ERR0_SECURITY_FAIL		Security related tasks have failed.
- *					The tasks are security init (root of
- *					trust), boot authentication (chain of
- *					trust), data packets authentication.
- *
- * CPU_BOOT_ERR0_EFUSE_FAIL		Reading from eFuse failed.
- *					The PCI device ID might be wrong.
- *
- * CPU_BOOT_ERR0_PRI_IMG_VER_FAIL	Verification of primary image failed.
- *					It mean that ppboot checksum
- *					verification for the preboot primary
- *					image has failed to match expected
- *					checksum. Trying to program image again
- *					might solve this.
- *
- * CPU_BOOT_ERR0_SEC_IMG_VER_FAIL	Verification of secondary image failed.
- *					It mean that ppboot checksum
- *					verification for the preboot secondary
- *					image has failed to match expected
- *					checksum. Trying to program image again
- *					might solve this.
- *
- * CPU_BOOT_ERR0_PLL_FAIL		PLL settings failed, meaning that one
- *					of the PLLs remains in REF_CLK
- *
- * CPU_BOOT_ERR0_DEVICE_UNUSABLE_FAIL	Device is unusable and customer support
- *					should be contacted.
- *
- * CPU_BOOT_ERR0_BOOT_FW_CRIT_ERR	Critical error was detected during
- *					the execution of ppboot or preboot.
- *					for example: stack overflow.
- *
- * CPU_BOOT_ERR0_BINNING_FAIL		Binning settings failed, meaning
- *					malfunctioning components might still be
- *					in use.
- *
- * CPU_BOOT_ERR0_TPM_FAIL		TPM verification flow failed.
- *
- * CPU_BOOT_ERR0_TMP_THRESH_INIT_FAIL	Failed to set threshold for tmperature
- *					sensor.
- *
- * CPU_BOOT_ERR_EEPROM_FAIL		Failed reading EEPROM data. Defaults
- *					are used.
- *
- * CPU_BOOT_ERR_ENG_ARC_MEM_SCRUB_FAIL	Failed scrubbing the Engines/ARCFarm
- *					memories. Boot disabled until reset.
- *
- * CPU_BOOT_ERR0_ENABLED		Error registers enabled.
- *					This is a main indication that the
- *					running FW populates the error
- *					registers. Meaning the error bits are
- *					not garbage, but actual error statuses.
- */
-#define CPU_BOOT_ERR0_DRAM_INIT_FAIL		(1 << CPU_BOOT_ERR_DRAM_INIT_FAIL)
-#define CPU_BOOT_ERR0_FIT_CORRUPTED		(1 << CPU_BOOT_ERR_FIT_CORRUPTED)
-#define CPU_BOOT_ERR0_TS_INIT_FAIL		(1 << CPU_BOOT_ERR_TS_INIT_FAIL)
-#define CPU_BOOT_ERR0_DRAM_SKIPPED		(1 << CPU_BOOT_ERR_DRAM_SKIPPED)
-#define CPU_BOOT_ERR0_BMC_WAIT_SKIPPED		(1 << CPU_BOOT_ERR_BMC_WAIT_SKIPPED)
-#define CPU_BOOT_ERR0_NIC_DATA_NOT_RDY		(1 << CPU_BOOT_ERR_NIC_DATA_NOT_RDY)
-#define CPU_BOOT_ERR0_NIC_FW_FAIL		(1 << CPU_BOOT_ERR_NIC_FW_FAIL)
-#define CPU_BOOT_ERR0_SECURITY_NOT_RDY		(1 << CPU_BOOT_ERR_SECURITY_NOT_RDY)
-#define CPU_BOOT_ERR0_SECURITY_FAIL		(1 << CPU_BOOT_ERR_SECURITY_FAIL)
-#define CPU_BOOT_ERR0_EFUSE_FAIL		(1 << CPU_BOOT_ERR_EFUSE_FAIL)
-#define CPU_BOOT_ERR0_PRI_IMG_VER_FAIL		(1 << CPU_BOOT_ERR_PRI_IMG_VER_FAIL)
-#define CPU_BOOT_ERR0_SEC_IMG_VER_FAIL		(1 << CPU_BOOT_ERR_SEC_IMG_VER_FAIL)
-#define CPU_BOOT_ERR0_PLL_FAIL			(1 << CPU_BOOT_ERR_PLL_FAIL)
-#define CPU_BOOT_ERR0_DEVICE_UNUSABLE_FAIL	(1 << CPU_BOOT_ERR_DEVICE_UNUSABLE_FAIL)
-#define CPU_BOOT_ERR0_BOOT_FW_CRIT_ERR		(1 << CPU_BOOT_ERR_BOOT_FW_CRIT_ERR)
-#define CPU_BOOT_ERR0_BINNING_FAIL		(1 << CPU_BOOT_ERR_BINNING_FAIL)
-#define CPU_BOOT_ERR0_TPM_FAIL			(1 << CPU_BOOT_ERR_TPM_FAIL)
-#define CPU_BOOT_ERR0_TMP_THRESH_INIT_FAIL	(1 << CPU_BOOT_ERR_TMP_THRESH_INIT_FAIL)
-#define CPU_BOOT_ERR0_EEPROM_FAIL		(1 << CPU_BOOT_ERR_EEPROM_FAIL)
-#define CPU_BOOT_ERR0_ENG_ARC_MEM_SCRUB_FAIL	(1 << CPU_BOOT_ERR_ENG_ARC_MEM_SCRUB_FAIL)
-#define CPU_BOOT_ERR0_ENABLED			(1 << CPU_BOOT_ERR_ENABLED)
-#define CPU_BOOT_ERR1_ENABLED			(1 << CPU_BOOT_ERR_ENABLED)
-
-enum cpu_boot_dev_sts {
-	CPU_BOOT_DEV_STS_SECURITY_EN = 0,
-	CPU_BOOT_DEV_STS_DEBUG_EN = 1,
-	CPU_BOOT_DEV_STS_WATCHDOG_EN = 2,
-	CPU_BOOT_DEV_STS_DRAM_INIT_EN = 3,
-	CPU_BOOT_DEV_STS_BMC_WAIT_EN = 4,
-	CPU_BOOT_DEV_STS_E2E_CRED_EN = 5,
-	CPU_BOOT_DEV_STS_HBM_CRED_EN = 6,
-	CPU_BOOT_DEV_STS_RL_EN = 7,
-	CPU_BOOT_DEV_STS_SRAM_SCR_EN = 8,
-	CPU_BOOT_DEV_STS_DRAM_SCR_EN = 9,
-	CPU_BOOT_DEV_STS_FW_HARD_RST_EN = 10,
-	CPU_BOOT_DEV_STS_PLL_INFO_EN = 11,
-	CPU_BOOT_DEV_STS_SP_SRAM_EN = 12,
-	CPU_BOOT_DEV_STS_CLK_GATE_EN = 13,
-	CPU_BOOT_DEV_STS_HBM_ECC_EN = 14,
-	CPU_BOOT_DEV_STS_PKT_PI_ACK_EN = 15,
-	CPU_BOOT_DEV_STS_FW_LD_COM_EN = 16,
-	CPU_BOOT_DEV_STS_FW_IATU_CONF_EN = 17,
-	CPU_BOOT_DEV_STS_FW_NIC_MAC_EN = 18,
-	CPU_BOOT_DEV_STS_DYN_PLL_EN = 19,
-	CPU_BOOT_DEV_STS_GIC_PRIVILEGED_EN = 20,
-	CPU_BOOT_DEV_STS_EQ_INDEX_EN = 21,
-	CPU_BOOT_DEV_STS_MULTI_IRQ_POLL_EN = 22,
-	CPU_BOOT_DEV_STS_FW_NIC_STAT_XPCS91_EN = 23,
-	CPU_BOOT_DEV_STS_FW_NIC_STAT_EXT_EN = 24,
-	CPU_BOOT_DEV_STS_IS_IDLE_CHECK_EN = 25,
-	CPU_BOOT_DEV_STS_MAP_HWMON_EN = 26,
-	CPU_BOOT_DEV_STS_ENABLED = 31,
-	CPU_BOOT_DEV_STS_SCND_EN = 63,
-	CPU_BOOT_DEV_STS_LAST = 64 /* we have 2 registers of 32 bits */
-};
-
-/*
- * BOOT DEVICE STATUS bits in BOOT_DEVICE_STS registers
- *
- * CPU_BOOT_DEV_STS0_SECURITY_EN	Security is Enabled.
- *					This is an indication for security
- *					enabled in FW, which means that
- *					all conditions for security are met:
- *					device is indicated as security enabled,
- *					registers are protected, and device
- *					uses keys for image verification.
- *					Initialized in: preboot
- *
- * CPU_BOOT_DEV_STS0_DEBUG_EN		Debug is enabled.
- *					Enabled when JTAG or DEBUG is enabled
- *					in FW.
- *					Initialized in: preboot
- *
- * CPU_BOOT_DEV_STS0_WATCHDOG_EN	Watchdog is enabled.
- *					Watchdog is enabled in FW.
- *					Initialized in: preboot
- *
- * CPU_BOOT_DEV_STS0_DRAM_INIT_EN	DRAM initialization is enabled.
- *					DRAM initialization has been done in FW.
- *					Initialized in: u-boot
- *
- * CPU_BOOT_DEV_STS0_BMC_WAIT_EN	Waiting for BMC data enabled.
- *					If set, it means that during boot,
- *					FW waited for BMC data.
- *					Initialized in: u-boot
- *
- * CPU_BOOT_DEV_STS0_E2E_CRED_EN	E2E credits initialized.
- *					FW initialized E2E credits.
- *					Initialized in: u-boot
- *
- * CPU_BOOT_DEV_STS0_HBM_CRED_EN	HBM credits initialized.
- *					FW initialized HBM credits.
- *					Initialized in: u-boot
- *
- * CPU_BOOT_DEV_STS0_RL_EN		Rate limiter initialized.
- *					FW initialized rate limiter.
- *					Initialized in: u-boot
- *
- * CPU_BOOT_DEV_STS0_SRAM_SCR_EN	SRAM scrambler enabled.
- *					FW initialized SRAM scrambler.
- *					Initialized in: linux
- *
- * CPU_BOOT_DEV_STS0_DRAM_SCR_EN	DRAM scrambler enabled.
- *					FW initialized DRAM scrambler.
- *					Initialized in: u-boot
- *
- * CPU_BOOT_DEV_STS0_FW_HARD_RST_EN	FW hard reset procedure is enabled.
- *					FW has the hard reset procedure
- *					implemented. This means that FW will
- *					perform hard reset procedure on
- *					receiving the halt-machine event.
- *					Initialized in: preboot, u-boot, linux
- *
- * CPU_BOOT_DEV_STS0_PLL_INFO_EN	FW retrieval of PLL info is enabled.
- *					Initialized in: linux
- *
- * CPU_BOOT_DEV_STS0_SP_SRAM_EN		SP SRAM is initialized and available
- *					for use.
- *					Initialized in: preboot
- *
- * CPU_BOOT_DEV_STS0_CLK_GATE_EN	Clock Gating enabled.
- *					FW initialized Clock Gating.
- *					Initialized in: preboot
- *
- * CPU_BOOT_DEV_STS0_HBM_ECC_EN		HBM ECC handling Enabled.
- *					FW handles HBM ECC indications.
- *					Initialized in: linux
- *
- * CPU_BOOT_DEV_STS0_PKT_PI_ACK_EN	Packets ack value used in the armcpd
- *					is set to the PI counter.
- *					Initialized in: linux
- *
- * CPU_BOOT_DEV_STS0_FW_LD_COM_EN	Flexible FW loading communication
- *					protocol is enabled.
- *					Initialized in: preboot
- *
- * CPU_BOOT_DEV_STS0_FW_IATU_CONF_EN	FW iATU configuration is enabled.
- *					This bit if set, means the iATU has been
- *					configured and is ready for use.
- *					Initialized in: ppboot
- *
- * CPU_BOOT_DEV_STS0_FW_NIC_MAC_EN	NIC MAC channels init is done by FW and
- *					any access to them is done via the FW.
- *					Initialized in: linux
- *
- * CPU_BOOT_DEV_STS0_DYN_PLL_EN		Dynamic PLL configuration is enabled.
- *					FW sends to host a bitmap of supported
- *					PLLs.
- *					Initialized in: linux
- *
- * CPU_BOOT_DEV_STS0_GIC_PRIVILEGED_EN	GIC access permission only from
- *					previleged entity. FW sets this status
- *					bit for host. If this bit is set then
- *					GIC can not be accessed from host.
- *					Initialized in: linux
- *
- * CPU_BOOT_DEV_STS0_EQ_INDEX_EN	Event Queue (EQ) index is a running
- *					index for each new event sent to host.
- *					This is used as a method in host to
- *					identify that the waiting event in
- *					queue is actually a new event which
- *					was not served before.
- *					Initialized in: linux
- *
- * CPU_BOOT_DEV_STS0_MULTI_IRQ_POLL_EN	Use multiple scratchpad interfaces to
- *					prevent IRQs overriding each other.
- *					Initialized in: linux
- *
- * CPU_BOOT_DEV_STS0_FW_NIC_STAT_XPCS91_EN
- *					NIC STAT and XPCS91 access is restricted
- *					and is done via FW only.
- *					Initialized in: linux
- *
- * CPU_BOOT_DEV_STS0_FW_NIC_STAT_EXT_EN
- *					NIC STAT get all is supported.
- *					Initialized in: linux
- *
- * CPU_BOOT_DEV_STS0_IS_IDLE_CHECK_EN
- *					F/W checks if the device is idle by reading defined set
- *					of registers. It returns a bitmask of all the engines,
- *					where a bit is set if the engine is not idle.
- *					Initialized in: linux
- *
- * CPU_BOOT_DEV_STS0_MAP_HWMON_EN
- *					If set, means f/w supports proprietary
- *					HWMON enum mapping to cpucp enums.
- *					Initialized in: linux
- *
- * CPU_BOOT_DEV_STS0_ENABLED		Device status register enabled.
- *					This is a main indication that the
- *					running FW populates the device status
- *					register. Meaning the device status
- *					bits are not garbage, but actual
- *					statuses.
- *					Initialized in: preboot
- *
- */
-#define CPU_BOOT_DEV_STS0_SECURITY_EN		(1 << CPU_BOOT_DEV_STS_SECURITY_EN)
-#define CPU_BOOT_DEV_STS0_DEBUG_EN		(1 << CPU_BOOT_DEV_STS_DEBUG_EN)
-#define CPU_BOOT_DEV_STS0_WATCHDOG_EN		(1 << CPU_BOOT_DEV_STS_WATCHDOG_EN)
-#define CPU_BOOT_DEV_STS0_DRAM_INIT_EN		(1 << CPU_BOOT_DEV_STS_DRAM_INIT_EN)
-#define CPU_BOOT_DEV_STS0_BMC_WAIT_EN		(1 << CPU_BOOT_DEV_STS_BMC_WAIT_EN)
-#define CPU_BOOT_DEV_STS0_E2E_CRED_EN		(1 << CPU_BOOT_DEV_STS_E2E_CRED_EN)
-#define CPU_BOOT_DEV_STS0_HBM_CRED_EN		(1 << CPU_BOOT_DEV_STS_HBM_CRED_EN)
-#define CPU_BOOT_DEV_STS0_RL_EN			(1 << CPU_BOOT_DEV_STS_RL_EN)
-#define CPU_BOOT_DEV_STS0_SRAM_SCR_EN		(1 << CPU_BOOT_DEV_STS_SRAM_SCR_EN)
-#define CPU_BOOT_DEV_STS0_DRAM_SCR_EN		(1 << CPU_BOOT_DEV_STS_DRAM_SCR_EN)
-#define CPU_BOOT_DEV_STS0_FW_HARD_RST_EN	(1 << CPU_BOOT_DEV_STS_FW_HARD_RST_EN)
-#define CPU_BOOT_DEV_STS0_PLL_INFO_EN		(1 << CPU_BOOT_DEV_STS_PLL_INFO_EN)
-#define CPU_BOOT_DEV_STS0_SP_SRAM_EN		(1 << CPU_BOOT_DEV_STS_SP_SRAM_EN)
-#define CPU_BOOT_DEV_STS0_CLK_GATE_EN		(1 << CPU_BOOT_DEV_STS_CLK_GATE_EN)
-#define CPU_BOOT_DEV_STS0_HBM_ECC_EN		(1 << CPU_BOOT_DEV_STS_HBM_ECC_EN)
-#define CPU_BOOT_DEV_STS0_PKT_PI_ACK_EN		(1 << CPU_BOOT_DEV_STS_PKT_PI_ACK_EN)
-#define CPU_BOOT_DEV_STS0_FW_LD_COM_EN		(1 << CPU_BOOT_DEV_STS_FW_LD_COM_EN)
-#define CPU_BOOT_DEV_STS0_FW_IATU_CONF_EN	(1 << CPU_BOOT_DEV_STS_FW_IATU_CONF_EN)
-#define CPU_BOOT_DEV_STS0_FW_NIC_MAC_EN		(1 << CPU_BOOT_DEV_STS_FW_NIC_MAC_EN)
-#define CPU_BOOT_DEV_STS0_DYN_PLL_EN		(1 << CPU_BOOT_DEV_STS_DYN_PLL_EN)
-#define CPU_BOOT_DEV_STS0_GIC_PRIVILEGED_EN	(1 << CPU_BOOT_DEV_STS_GIC_PRIVILEGED_EN)
-#define CPU_BOOT_DEV_STS0_EQ_INDEX_EN		(1 << CPU_BOOT_DEV_STS_EQ_INDEX_EN)
-#define CPU_BOOT_DEV_STS0_MULTI_IRQ_POLL_EN	(1 << CPU_BOOT_DEV_STS_MULTI_IRQ_POLL_EN)
-#define CPU_BOOT_DEV_STS0_FW_NIC_STAT_XPCS91_EN	(1 << CPU_BOOT_DEV_STS_FW_NIC_STAT_XPCS91_EN)
-#define CPU_BOOT_DEV_STS0_FW_NIC_STAT_EXT_EN	(1 << CPU_BOOT_DEV_STS_FW_NIC_STAT_EXT_EN)
-#define CPU_BOOT_DEV_STS0_IS_IDLE_CHECK_EN	(1 << CPU_BOOT_DEV_STS_IS_IDLE_CHECK_EN)
-#define CPU_BOOT_DEV_STS0_MAP_HWMON_EN		(1 << CPU_BOOT_DEV_STS_MAP_HWMON_EN)
-#define CPU_BOOT_DEV_STS0_ENABLED		(1 << CPU_BOOT_DEV_STS_ENABLED)
-#define CPU_BOOT_DEV_STS1_ENABLED		(1 << CPU_BOOT_DEV_STS_ENABLED)
-
-enum cpu_boot_status {
-	CPU_BOOT_STATUS_NA = 0,		/* Default value after reset of chip */
-	CPU_BOOT_STATUS_IN_WFE = 1,
-	CPU_BOOT_STATUS_DRAM_RDY = 2,
-	CPU_BOOT_STATUS_SRAM_AVAIL = 3,
-	CPU_BOOT_STATUS_IN_BTL = 4,	/* BTL is H/W FSM */
-	CPU_BOOT_STATUS_IN_PREBOOT = 5,
-	CPU_BOOT_STATUS_IN_SPL,		/* deprecated - not reported */
-	CPU_BOOT_STATUS_IN_UBOOT = 7,
-	CPU_BOOT_STATUS_DRAM_INIT_FAIL,	/* deprecated - will be removed */
-	CPU_BOOT_STATUS_FIT_CORRUPTED,	/* deprecated - will be removed */
-	/* U-Boot console prompt activated, commands are not processed */
-	CPU_BOOT_STATUS_UBOOT_NOT_READY = 10,
-	/* Finished NICs init, reported after DRAM and NICs */
-	CPU_BOOT_STATUS_NIC_FW_RDY = 11,
-	CPU_BOOT_STATUS_TS_INIT_FAIL,	/* deprecated - will be removed */
-	CPU_BOOT_STATUS_DRAM_SKIPPED,	/* deprecated - will be removed */
-	CPU_BOOT_STATUS_BMC_WAITING_SKIPPED, /* deprecated - will be removed */
-	/* Last boot loader progress status, ready to receive commands */
-	CPU_BOOT_STATUS_READY_TO_BOOT = 15,
-	/* Internal Boot finished, ready for boot-fit */
-	CPU_BOOT_STATUS_WAITING_FOR_BOOT_FIT = 16,
-	/* Internal Security has been initialized, device can be accessed */
-	CPU_BOOT_STATUS_SECURITY_READY = 17,
-};
-
-enum kmd_msg {
-	KMD_MSG_NA = 0,
-	KMD_MSG_GOTO_WFE,
-	KMD_MSG_FIT_RDY,
-	KMD_MSG_SKIP_BMC,
-	RESERVED,
-	KMD_MSG_RST_DEV,
-	KMD_MSG_LAST
-};
-
-enum cpu_msg_status {
-	CPU_MSG_CLR = 0,
-	CPU_MSG_OK,
-	CPU_MSG_ERR,
-};
-
-/* communication registers mapping - consider ABI when changing */
-struct cpu_dyn_regs {
-	__le32 cpu_pq_base_addr_low;
-	__le32 cpu_pq_base_addr_high;
-	__le32 cpu_pq_length;
-	__le32 cpu_pq_init_status;
-	__le32 cpu_eq_base_addr_low;
-	__le32 cpu_eq_base_addr_high;
-	__le32 cpu_eq_length;
-	__le32 cpu_eq_ci;
-	__le32 cpu_cq_base_addr_low;
-	__le32 cpu_cq_base_addr_high;
-	__le32 cpu_cq_length;
-	__le32 cpu_pf_pq_pi;
-	__le32 cpu_boot_dev_sts0;
-	__le32 cpu_boot_dev_sts1;
-	__le32 cpu_boot_err0;
-	__le32 cpu_boot_err1;
-	__le32 cpu_boot_status;
-	__le32 fw_upd_sts;
-	__le32 fw_upd_cmd;
-	__le32 fw_upd_pending_sts;
-	__le32 fuse_ver_offset;
-	__le32 preboot_ver_offset;
-	__le32 uboot_ver_offset;
-	__le32 hw_state;
-	__le32 kmd_msg_to_cpu;
-	__le32 cpu_cmd_status_to_host;
-	__le32 gic_host_pi_upd_irq;
-	__le32 gic_tpc_qm_irq_ctrl;
-	__le32 gic_mme_qm_irq_ctrl;
-	__le32 gic_dma_qm_irq_ctrl;
-	__le32 gic_nic_qm_irq_ctrl;
-	__le32 gic_dma_core_irq_ctrl;
-	__le32 gic_host_halt_irq;
-	__le32 gic_host_ints_irq;
-	__le32 gic_host_soft_rst_irq;
-	__le32 gic_rot_qm_irq_ctrl;
-	__le32 cpu_rst_status;
-	__le32 eng_arc_irq_ctrl;
-	__le32 reserved1[20];		/* reserve for future use */
-};
-
-/* TODO: remove the desc magic after the code is updated to use message */
-/* HCDM - Habana Communications Descriptor Magic */
-#define HL_COMMS_DESC_MAGIC	0x4843444D
-#define HL_COMMS_DESC_VER	3
-
-/* HCMv - Habana Communications Message + header version */
-#define HL_COMMS_MSG_MAGIC_VALUE	0x48434D00
-#define HL_COMMS_MSG_MAGIC_MASK		0xFFFFFF00
-#define HL_COMMS_MSG_MAGIC_VER_MASK	0xFF
-
-#define HL_COMMS_MSG_MAGIC_VER(ver)	(HL_COMMS_MSG_MAGIC_VALUE |	\
-					((ver) & HL_COMMS_MSG_MAGIC_VER_MASK))
-#define HL_COMMS_MSG_MAGIC_V0		HL_COMMS_DESC_MAGIC
-#define HL_COMMS_MSG_MAGIC_V1		HL_COMMS_MSG_MAGIC_VER(1)
-#define HL_COMMS_MSG_MAGIC_V2		HL_COMMS_MSG_MAGIC_VER(2)
-#define HL_COMMS_MSG_MAGIC_V3		HL_COMMS_MSG_MAGIC_VER(3)
-
-#define HL_COMMS_MSG_MAGIC		HL_COMMS_MSG_MAGIC_V3
-
-#define HL_COMMS_MSG_MAGIC_VALIDATE_MAGIC(magic)			\
-		(((magic) & HL_COMMS_MSG_MAGIC_MASK) ==			\
-		HL_COMMS_MSG_MAGIC_VALUE)
-
-#define HL_COMMS_MSG_MAGIC_VALIDATE_VERSION(magic, ver)			\
-		(((magic) & HL_COMMS_MSG_MAGIC_VER_MASK) >=		\
-		((ver) & HL_COMMS_MSG_MAGIC_VER_MASK))
-
-#define HL_COMMS_MSG_MAGIC_VALIDATE(magic, ver)				\
-		(HL_COMMS_MSG_MAGIC_VALIDATE_MAGIC((magic)) &&		\
-		HL_COMMS_MSG_MAGIC_VALIDATE_VERSION((magic), (ver)))
-
-enum comms_msg_type {
-	HL_COMMS_DESC_TYPE = 0,
-	HL_COMMS_RESET_CAUSE_TYPE = 1,
-	HL_COMMS_FW_CFG_SKIP_TYPE = 2,
-	HL_COMMS_BINNING_CONF_TYPE = 3,
-};
-
-/*
- * Binning information shared between LKD and FW
- * @tpc_mask_l - TPC binning information lower 64 bit
- * @dec_mask - Decoder binning information
- * @dram_mask - DRAM binning information
- * @edma_mask - EDMA binning information
- * @mme_mask_l - MME binning information lower 32
- * @mme_mask_h - MME binning information upper 32
- * @rot_mask - Rotator binning information
- * @xbar_mask - xBAR binning information
- * @reserved - reserved field for future binning info w/o ABI change
- * @tpc_mask_h - TPC binning information upper 64 bit
- * @nic_mask - NIC binning information
- */
-struct lkd_fw_binning_info {
-	__le64 tpc_mask_l;
-	__le32 dec_mask;
-	__le32 dram_mask;
-	__le32 edma_mask;
-	__le32 mme_mask_l;
-	__le32 mme_mask_h;
-	__le32 rot_mask;
-	__le32 xbar_mask;
-	__le32 reserved0;
-	__le64 tpc_mask_h;
-	__le64 nic_mask;
-	__le32 reserved1[8];
-};
-
-/* TODO: remove this struct after the code is updated to use message */
-/* this is the comms descriptor header - meta data */
-struct comms_desc_header {
-	__le32 magic;		/* magic for validation */
-	__le32 crc32;		/* CRC32 of the descriptor w/o header */
-	__le16 size;		/* size of the descriptor w/o header */
-	__u8 version;	/* descriptor version */
-	__u8 reserved[5];	/* pad to 64 bit */
-};
-
-/* this is the comms message header - meta data */
-struct comms_msg_header {
-	__le32 magic;		/* magic for validation */
-	__le32 crc32;		/* CRC32 of the message w/o header */
-	__le16 size;		/* size of the message w/o header */
-	__u8 version;	/* message payload version */
-	__u8 type;		/* message type */
-	__u8 reserved[4];	/* pad to 64 bit */
-};
-
-enum lkd_fw_ascii_msg_lvls {
-	LKD_FW_ASCII_MSG_ERR = 0,
-	LKD_FW_ASCII_MSG_WRN = 1,
-	LKD_FW_ASCII_MSG_INF = 2,
-	LKD_FW_ASCII_MSG_DBG = 3,
-};
-
-#define LKD_FW_ASCII_MSG_MAX_LEN	128
-#define LKD_FW_ASCII_MSG_MAX		4	/* consider ABI when changing */
-
-struct lkd_fw_ascii_msg {
-	__u8 valid;
-	__u8 msg_lvl;
-	__u8 reserved[6];
-	char msg[LKD_FW_ASCII_MSG_MAX_LEN];
-};
-
-/* this is the main FW descriptor - consider ABI when changing */
-struct lkd_fw_comms_desc {
-	struct comms_desc_header header;
-	struct cpu_dyn_regs cpu_dyn_regs;
-	char fuse_ver[VERSION_MAX_LEN];
-	char cur_fw_ver[VERSION_MAX_LEN];
-	/* can be used for 1 more version w/o ABI change */
-	char reserved0[VERSION_MAX_LEN];
-	__le64 img_addr;	/* address for next FW component load */
-	struct lkd_fw_binning_info binning_info;
-	struct lkd_fw_ascii_msg ascii_msg[LKD_FW_ASCII_MSG_MAX];
-	__le32 rsvd_mem_size_mb; /* reserved memory size [MB] for FW/SVE */
-	char reserved1[4];
-};
-
-enum comms_reset_cause {
-	HL_RESET_CAUSE_UNKNOWN = 0,
-	HL_RESET_CAUSE_HEARTBEAT = 1,
-	HL_RESET_CAUSE_TDR = 2,
-};
-
-/* TODO: remove define after struct name is aligned on all projects */
-#define lkd_msg_comms lkd_fw_comms_msg
-
-/* this is the comms message descriptor */
-struct lkd_fw_comms_msg {
-	struct comms_msg_header header;
-	/* union for future expantions of new messages */
-	union {
-		struct {
-			struct cpu_dyn_regs cpu_dyn_regs;
-			char fuse_ver[VERSION_MAX_LEN];
-			char cur_fw_ver[VERSION_MAX_LEN];
-			/* can be used for 1 more version w/o ABI change */
-			char reserved0[VERSION_MAX_LEN];
-			/* address for next FW component load */
-			__le64 img_addr;
-			struct lkd_fw_binning_info binning_info;
-			struct lkd_fw_ascii_msg ascii_msg[LKD_FW_ASCII_MSG_MAX];
-			/* reserved memory size [MB] for FW/SVE */
-			__le32 rsvd_mem_size_mb;
-			char reserved1[4];
-		};
-		struct {
-			__u8 reset_cause;
-		};
-		struct {
-			__u8 fw_cfg_skip; /* 1 - skip, 0 - don't skip */
-		};
-		struct lkd_fw_binning_info binning_conf;
-	};
-};
-
-/*
- * LKD commands:
- *
- * COMMS_NOOP			Used to clear the command register and no actual
- *				command is send.
- *
- * COMMS_CLR_STS		Clear status command - FW should clear the
- *				status register. Used for synchronization
- *				between the commands as part of the race free
- *				protocol.
- *
- * COMMS_RST_STATE		Reset the current communication state which is
- *				kept by FW for proper responses.
- *				Should be used in the beginning of the
- *				communication cycle to clean any leftovers from
- *				previous communication attempts.
- *
- * COMMS_PREP_DESC		Prepare descriptor for setting up the
- *				communication and other dynamic data:
- *				struct lkd_fw_comms_desc.
- *				This command has a parameter stating the next FW
- *				component size, so the FW can actually prepare a
- *				space for it and in the status response provide
- *				the descriptor offset. The Offset of the next FW
- *				data component is a part of the descriptor
- *				structure.
- *
- * COMMS_DATA_RDY		The FW data has been uploaded and is ready for
- *				validation.
- *
- * COMMS_EXEC			Execute the next FW component.
- *
- * COMMS_RST_DEV		Reset the device.
- *
- * COMMS_GOTO_WFE		Execute WFE command. Allowed only on non-secure
- *				devices.
- *
- * COMMS_SKIP_BMC		Perform actions required for BMC-less servers.
- *				Do not wait for BMC response.
- *
- * COMMS_PREP_DESC_ELBI		Same as COMMS_PREP_DESC only that the memory
- *				space is allocated in a ELBI access only
- *				address range.
- *
- */
-enum comms_cmd {
-	COMMS_NOOP = 0,
-	COMMS_CLR_STS = 1,
-	COMMS_RST_STATE = 2,
-	COMMS_PREP_DESC = 3,
-	COMMS_DATA_RDY = 4,
-	COMMS_EXEC = 5,
-	COMMS_RST_DEV = 6,
-	COMMS_GOTO_WFE = 7,
-	COMMS_SKIP_BMC = 8,
-	COMMS_PREP_DESC_ELBI = 10,
-	COMMS_INVLD_LAST
-};
-
-#define COMMS_COMMAND_SIZE_SHIFT	0
-#define COMMS_COMMAND_SIZE_MASK		0x1FFFFFF
-#define COMMS_COMMAND_CMD_SHIFT		27
-#define COMMS_COMMAND_CMD_MASK		0xF8000000
-
-/*
- * LKD command to FW register structure
- * @size	- FW component size
- * @cmd		- command from enum comms_cmd
- */
-struct comms_command {
-	union {		/* bit fields are only for FW use */
-		struct {
-			u32 size :25;		/* 32MB max. */
-			u32 reserved :2;
-			enum comms_cmd cmd :5;		/* 32 commands */
-		};
-		__le32 val;
-	};
-};
-
-/*
- * FW status
- *
- * COMMS_STS_NOOP		Used to clear the status register and no actual
- *				status is provided.
- *
- * COMMS_STS_ACK		Command has been received and recognized.
- *
- * COMMS_STS_OK			Command execution has finished successfully.
- *
- * COMMS_STS_ERR		Command execution was unsuccessful and resulted
- *				in error.
- *
- * COMMS_STS_VALID_ERR		FW validation has failed.
- *
- * COMMS_STS_TIMEOUT_ERR	Command execution has timed out.
- */
-enum comms_sts {
-	COMMS_STS_NOOP = 0,
-	COMMS_STS_ACK = 1,
-	COMMS_STS_OK = 2,
-	COMMS_STS_ERR = 3,
-	COMMS_STS_VALID_ERR = 4,
-	COMMS_STS_TIMEOUT_ERR = 5,
-	COMMS_STS_INVLD_LAST
-};
-
-/* RAM types for FW components loading - defines the base address */
-enum comms_ram_types {
-	COMMS_SRAM = 0,
-	COMMS_DRAM = 1,
-};
-
-#define COMMS_STATUS_OFFSET_SHIFT	0
-#define COMMS_STATUS_OFFSET_MASK	0x03FFFFFF
-#define COMMS_STATUS_OFFSET_ALIGN_SHIFT	2
-#define COMMS_STATUS_RAM_TYPE_SHIFT	26
-#define COMMS_STATUS_RAM_TYPE_MASK	0x0C000000
-#define COMMS_STATUS_STATUS_SHIFT	28
-#define COMMS_STATUS_STATUS_MASK	0xF0000000
-
-/*
- * FW status to LKD register structure
- * @offset	- an offset from the base of the ram_type shifted right by
- *		  2 bits (always aligned to 32 bits).
- *		  Allows a maximum addressable offset of 256MB from RAM base.
- *		  Example: for real offset in RAM of 0x800000 (8MB), the value
- *		  in offset field is (0x800000 >> 2) = 0x200000.
- * @ram_type	- the RAM type that should be used for offset from
- *		  enum comms_ram_types
- * @status	- status from enum comms_sts
- */
-struct comms_status {
-	union {		/* bit fields are only for FW use */
-		struct {
-			u32 offset :26;
-			enum comms_ram_types ram_type :2;
-			enum comms_sts status :4;	/* 16 statuses */
-		};
-		__le32 val;
-	};
-};
-
-#define NAME_MAX_LEN	32 /* bytes */
-struct hl_module_data {
-	__u8 name[NAME_MAX_LEN];
-	__u8 version[VERSION_MAX_LEN];
-};
-
-/**
- * struct hl_component_versions - versions associated with hl component.
- * @struct_size: size of all the struct (including dynamic size of modules).
- * @modules_offset: offset of the modules field in this struct.
- * @component: version of the component itself.
- * @fw_os: Firmware OS Version.
- * @comp_name: Name of the component.
- * @modules_counter: number of set bits in modules_mask.
- * @reserved: reserved for future use.
- * @modules: versions of the component's modules. Elborated explanation in
- *              struct cpucp_versions.
- */
-struct hl_component_versions {
-	__le16 struct_size;
-	__le16 modules_offset;
-	__u8 component[VERSION_MAX_LEN];
-	__u8 fw_os[VERSION_MAX_LEN];
-	__u8 comp_name[NAME_MAX_LEN];
-	__u8 modules_counter;
-	__u8 reserved[3];
-	struct hl_module_data modules[];
-};
-
-/* Max size of fit size */
-#define HL_FW_VERSIONS_FIT_SIZE	4096
-
-#endif /* HL_BOOT_IF_H */
diff --git a/include/linux/habanalabs/cpucp_if.h b/include/linux/habanalabs/cpucp_if.h
new file mode 100644
index 000000000000..4cdedb603ecb
--- /dev/null
+++ b/include/linux/habanalabs/cpucp_if.h
@@ -0,0 +1,1407 @@
+/* SPDX-License-Identifier: GPL-2.0
+ *
+ * Copyright 2020-2022 HabanaLabs, Ltd.
+ * All Rights Reserved.
+ *
+ */
+
+#ifndef CPUCP_IF_H
+#define CPUCP_IF_H
+
+#include <linux/types.h>
+#include <linux/if_ether.h>
+
+#include "hl_boot_if.h"
+
+#define NUM_HBM_PSEUDO_CH				2
+#define NUM_HBM_CH_PER_DEV				8
+#define CPUCP_PKT_HBM_ECC_INFO_WR_PAR_SHIFT		0
+#define CPUCP_PKT_HBM_ECC_INFO_WR_PAR_MASK		0x00000001
+#define CPUCP_PKT_HBM_ECC_INFO_RD_PAR_SHIFT		1
+#define CPUCP_PKT_HBM_ECC_INFO_RD_PAR_MASK		0x00000002
+#define CPUCP_PKT_HBM_ECC_INFO_CA_PAR_SHIFT		2
+#define CPUCP_PKT_HBM_ECC_INFO_CA_PAR_MASK		0x00000004
+#define CPUCP_PKT_HBM_ECC_INFO_DERR_SHIFT		3
+#define CPUCP_PKT_HBM_ECC_INFO_DERR_MASK		0x00000008
+#define CPUCP_PKT_HBM_ECC_INFO_SERR_SHIFT		4
+#define CPUCP_PKT_HBM_ECC_INFO_SERR_MASK		0x00000010
+#define CPUCP_PKT_HBM_ECC_INFO_TYPE_SHIFT		5
+#define CPUCP_PKT_HBM_ECC_INFO_TYPE_MASK		0x00000020
+#define CPUCP_PKT_HBM_ECC_INFO_HBM_CH_SHIFT		6
+#define CPUCP_PKT_HBM_ECC_INFO_HBM_CH_MASK		0x000007C0
+
+#define PLL_MAP_MAX_BITS	128
+#define PLL_MAP_LEN		(PLL_MAP_MAX_BITS / 8)
+
+/*
+ * info of the pkt queue pointers in the first async occurrence
+ */
+struct cpucp_pkt_sync_err {
+	__le32 pi;
+	__le32 ci;
+};
+
+struct hl_eq_hbm_ecc_data {
+	/* SERR counter */
+	__le32 sec_cnt;
+	/* DERR counter */
+	__le32 dec_cnt;
+	/* Supplemental Information according to the mask bits */
+	__le32 hbm_ecc_info;
+	/* Address in hbm where the ecc happened */
+	__le32 first_addr;
+	/* SERR continuous address counter */
+	__le32 sec_cont_cnt;
+	__le32 pad;
+};
+
+/*
+ * EVENT QUEUE
+ */
+
+struct hl_eq_header {
+	__le32 reserved;
+	__le32 ctl;
+};
+
+struct hl_eq_ecc_data {
+	__le64 ecc_address;
+	__le64 ecc_syndrom;
+	__u8 memory_wrapper_idx;
+	__u8 is_critical;
+	__le16 block_id;
+	__u8 pad[4];
+};
+
+enum hl_sm_sei_cause {
+	SM_SEI_SO_OVERFLOW,
+	SM_SEI_LBW_4B_UNALIGNED,
+	SM_SEI_AXI_RESPONSE_ERR
+};
+
+struct hl_eq_sm_sei_data {
+	__le32 sei_log;
+	/* enum hl_sm_sei_cause */
+	__u8 sei_cause;
+	__u8 pad[3];
+};
+
+enum hl_fw_alive_severity {
+	FW_ALIVE_SEVERITY_MINOR,
+	FW_ALIVE_SEVERITY_CRITICAL
+};
+
+struct hl_eq_fw_alive {
+	__le64 uptime_seconds;
+	__le32 process_id;
+	__le32 thread_id;
+	/* enum hl_fw_alive_severity */
+	__u8 severity;
+	__u8 pad[7];
+};
+
+struct hl_eq_intr_cause {
+	__le64 intr_cause_data;
+};
+
+struct hl_eq_pcie_drain_ind_data {
+	struct hl_eq_intr_cause intr_cause;
+	__le64 drain_wr_addr_lbw;
+	__le64 drain_rd_addr_lbw;
+	__le64 drain_wr_addr_hbw;
+	__le64 drain_rd_addr_hbw;
+};
+
+struct hl_eq_razwi_lbw_info_regs {
+	__le32 rr_aw_razwi_reg;
+	__le32 rr_aw_razwi_id_reg;
+	__le32 rr_ar_razwi_reg;
+	__le32 rr_ar_razwi_id_reg;
+};
+
+struct hl_eq_razwi_hbw_info_regs {
+	__le32 rr_aw_razwi_hi_reg;
+	__le32 rr_aw_razwi_lo_reg;
+	__le32 rr_aw_razwi_id_reg;
+	__le32 rr_ar_razwi_hi_reg;
+	__le32 rr_ar_razwi_lo_reg;
+	__le32 rr_ar_razwi_id_reg;
+};
+
+/* razwi_happened masks */
+#define RAZWI_HAPPENED_HBW	0x1
+#define RAZWI_HAPPENED_LBW	0x2
+#define RAZWI_HAPPENED_AW	0x4
+#define RAZWI_HAPPENED_AR	0x8
+
+struct hl_eq_razwi_info {
+	__le32 razwi_happened_mask;
+	union {
+		struct hl_eq_razwi_lbw_info_regs lbw;
+		struct hl_eq_razwi_hbw_info_regs hbw;
+	};
+	__le32 pad;
+};
+
+struct hl_eq_razwi_with_intr_cause {
+	struct hl_eq_razwi_info razwi_info;
+	struct hl_eq_intr_cause intr_cause;
+};
+
+#define HBM_CA_ERR_CMD_LIFO_LEN		8
+#define HBM_RD_ERR_DATA_LIFO_LEN	8
+#define HBM_WR_PAR_CMD_LIFO_LEN		11
+
+enum hl_hbm_sei_cause {
+	/* Command/address parity error event is split into 2 events due to
+	 * size limitation: ODD suffix for odd HBM CK_t cycles and EVEN  suffix
+	 * for even HBM CK_t cycles
+	 */
+	HBM_SEI_CMD_PARITY_EVEN,
+	HBM_SEI_CMD_PARITY_ODD,
+	/* Read errors can be reflected as a combination of SERR/DERR/parity
+	 * errors. Therefore, we define one event for all read error types.
+	 * LKD will perform further proccessing.
+	 */
+	HBM_SEI_READ_ERR,
+	HBM_SEI_WRITE_DATA_PARITY_ERR,
+	HBM_SEI_CATTRIP,
+	HBM_SEI_MEM_BIST_FAIL,
+	HBM_SEI_DFI,
+	HBM_SEI_INV_TEMP_READ_OUT,
+	HBM_SEI_BIST_FAIL,
+};
+
+/* Masks for parsing hl_hbm_sei_headr fields */
+#define HBM_ECC_SERR_CNTR_MASK		0xFF
+#define HBM_ECC_DERR_CNTR_MASK		0xFF00
+#define HBM_RD_PARITY_CNTR_MASK		0xFF0000
+
+/* HBM index and MC index are known by the event_id */
+struct hl_hbm_sei_header {
+	union {
+		/* relevant only in case of HBM read error */
+		struct {
+			__u8 ecc_serr_cnt;
+			__u8 ecc_derr_cnt;
+			__u8 read_par_cnt;
+			__u8 reserved;
+		};
+		/* All other cases */
+		__le32 cnt;
+	};
+	__u8 sei_cause;		/* enum hl_hbm_sei_cause */
+	__u8 mc_channel;		/* range: 0-3 */
+	__u8 mc_pseudo_channel;	/* range: 0-7 */
+	__u8 is_critical;
+};
+
+#define HBM_RD_ADDR_SID_SHIFT		0
+#define HBM_RD_ADDR_SID_MASK		0x1
+#define HBM_RD_ADDR_BG_SHIFT		1
+#define HBM_RD_ADDR_BG_MASK		0x6
+#define HBM_RD_ADDR_BA_SHIFT		3
+#define HBM_RD_ADDR_BA_MASK		0x18
+#define HBM_RD_ADDR_COL_SHIFT		5
+#define HBM_RD_ADDR_COL_MASK		0x7E0
+#define HBM_RD_ADDR_ROW_SHIFT		11
+#define HBM_RD_ADDR_ROW_MASK		0x3FFF800
+
+struct hbm_rd_addr {
+	union {
+		/* bit fields are only for FW use */
+		struct {
+			u32 dbg_rd_err_addr_sid:1;
+			u32 dbg_rd_err_addr_bg:2;
+			u32 dbg_rd_err_addr_ba:2;
+			u32 dbg_rd_err_addr_col:6;
+			u32 dbg_rd_err_addr_row:15;
+			u32 reserved:6;
+		};
+		__le32 rd_addr_val;
+	};
+};
+
+#define HBM_RD_ERR_BEAT_SHIFT		2
+/* dbg_rd_err_misc fields: */
+/* Read parity is calculated per DW on every beat */
+#define HBM_RD_ERR_PAR_ERR_BEAT0_SHIFT	0
+#define HBM_RD_ERR_PAR_ERR_BEAT0_MASK	0x3
+#define HBM_RD_ERR_PAR_DATA_BEAT0_SHIFT	8
+#define HBM_RD_ERR_PAR_DATA_BEAT0_MASK	0x300
+/* ECC is calculated per PC on every beat */
+#define HBM_RD_ERR_SERR_BEAT0_SHIFT	16
+#define HBM_RD_ERR_SERR_BEAT0_MASK	0x10000
+#define HBM_RD_ERR_DERR_BEAT0_SHIFT	24
+#define HBM_RD_ERR_DERR_BEAT0_MASK	0x100000
+
+struct hl_eq_hbm_sei_read_err_intr_info {
+	/* DFI_RD_ERR_REP_ADDR */
+	struct hbm_rd_addr dbg_rd_err_addr;
+	/* DFI_RD_ERR_REP_ERR */
+	union {
+		struct {
+			/* bit fields are only for FW use */
+			u32 dbg_rd_err_par:8;
+			u32 dbg_rd_err_par_data:8;
+			u32 dbg_rd_err_serr:4;
+			u32 dbg_rd_err_derr:4;
+			u32 reserved:8;
+		};
+		__le32 dbg_rd_err_misc;
+	};
+	/* DFI_RD_ERR_REP_DM */
+	__le32 dbg_rd_err_dm;
+	/* DFI_RD_ERR_REP_SYNDROME */
+	__le32 dbg_rd_err_syndrome;
+	/* DFI_RD_ERR_REP_DATA */
+	__le32 dbg_rd_err_data[HBM_RD_ERR_DATA_LIFO_LEN];
+};
+
+struct hl_eq_hbm_sei_ca_par_intr_info {
+	/* 14 LSBs */
+	__le16 dbg_row[HBM_CA_ERR_CMD_LIFO_LEN];
+	/* 18 LSBs */
+	__le32 dbg_col[HBM_CA_ERR_CMD_LIFO_LEN];
+};
+
+#define WR_PAR_LAST_CMD_COL_SHIFT	0
+#define WR_PAR_LAST_CMD_COL_MASK	0x3F
+#define WR_PAR_LAST_CMD_BG_SHIFT	6
+#define WR_PAR_LAST_CMD_BG_MASK		0xC0
+#define WR_PAR_LAST_CMD_BA_SHIFT	8
+#define WR_PAR_LAST_CMD_BA_MASK		0x300
+#define WR_PAR_LAST_CMD_SID_SHIFT	10
+#define WR_PAR_LAST_CMD_SID_MASK	0x400
+
+/* Row address isn't latched */
+struct hbm_sei_wr_cmd_address {
+	/* DFI_DERR_LAST_CMD */
+	union {
+		struct {
+			/* bit fields are only for FW use */
+			u32 col:6;
+			u32 bg:2;
+			u32 ba:2;
+			u32 sid:1;
+			u32 reserved:21;
+		};
+		__le32 dbg_wr_cmd_addr;
+	};
+};
+
+struct hl_eq_hbm_sei_wr_par_intr_info {
+	/* entry 0: WR command address from the 1st cycle prior to the error
+	 * entry 1: WR command address from the 2nd cycle prior to the error
+	 * and so on...
+	 */
+	struct hbm_sei_wr_cmd_address dbg_last_wr_cmds[HBM_WR_PAR_CMD_LIFO_LEN];
+	/* derr[0:1] - 1st HBM cycle DERR output
+	 * derr[2:3] - 2nd HBM cycle DERR output
+	 */
+	__u8 dbg_derr;
+	/* extend to reach 8B */
+	__u8 pad[3];
+};
+
+/*
+ * this struct represents the following sei causes:
+ * command parity, ECC double error, ECC single error, dfi error, cattrip,
+ * temperature read-out, read parity error and write parity error.
+ * some only use the header while some have extra data.
+ */
+struct hl_eq_hbm_sei_data {
+	struct hl_hbm_sei_header hdr;
+	union {
+		struct hl_eq_hbm_sei_ca_par_intr_info ca_parity_even_info;
+		struct hl_eq_hbm_sei_ca_par_intr_info ca_parity_odd_info;
+		struct hl_eq_hbm_sei_read_err_intr_info read_err_info;
+		struct hl_eq_hbm_sei_wr_par_intr_info wr_parity_info;
+	};
+};
+
+/* Engine/farm arc interrupt type */
+enum hl_engine_arc_interrupt_type {
+	/* Qman/farm ARC DCCM QUEUE FULL interrupt type */
+	ENGINE_ARC_DCCM_QUEUE_FULL_IRQ = 1
+};
+
+/* Data structure specifies details of payload of DCCM QUEUE FULL interrupt */
+struct hl_engine_arc_dccm_queue_full_irq {
+	/* Queue index value which caused DCCM QUEUE FULL */
+	__le32 queue_index;
+	__le32 pad;
+};
+
+/* Data structure specifies details of QM/FARM ARC interrupt */
+struct hl_eq_engine_arc_intr_data {
+	/* ARC engine id e.g.  DCORE0_TPC0_QM_ARC, DCORE0_TCP1_QM_ARC */
+	__le32 engine_id;
+	__le32 intr_type; /* enum hl_engine_arc_interrupt_type */
+	/* More info related to the interrupt e.g. queue index
+	 * incase of DCCM_QUEUE_FULL interrupt.
+	 */
+	__le64 payload;
+	__le64 pad[5];
+};
+
+#define ADDR_DEC_ADDRESS_COUNT_MAX 4
+
+/* Data structure specifies details of ADDR_DEC interrupt */
+struct hl_eq_addr_dec_intr_data {
+	struct hl_eq_intr_cause intr_cause;
+	__le64 addr[ADDR_DEC_ADDRESS_COUNT_MAX];
+	__u8 addr_cnt;
+	__u8 pad[7];
+};
+
+struct hl_eq_entry {
+	struct hl_eq_header hdr;
+	union {
+		__le64 data_placeholder;
+		struct hl_eq_ecc_data ecc_data;
+		struct hl_eq_hbm_ecc_data hbm_ecc_data;	/* Obsolete */
+		struct hl_eq_sm_sei_data sm_sei_data;
+		struct cpucp_pkt_sync_err pkt_sync_err;
+		struct hl_eq_fw_alive fw_alive;
+		struct hl_eq_intr_cause intr_cause;
+		struct hl_eq_pcie_drain_ind_data pcie_drain_ind_data;
+		struct hl_eq_razwi_info razwi_info;
+		struct hl_eq_razwi_with_intr_cause razwi_with_intr_cause;
+		struct hl_eq_hbm_sei_data sei_data;	/* Gaudi2 HBM */
+		struct hl_eq_engine_arc_intr_data arc_data;
+		struct hl_eq_addr_dec_intr_data addr_dec;
+		__le64 data[7];
+	};
+};
+
+#define HL_EQ_ENTRY_SIZE		sizeof(struct hl_eq_entry)
+
+#define EQ_CTL_READY_SHIFT		31
+#define EQ_CTL_READY_MASK		0x80000000
+
+#define EQ_CTL_EVENT_TYPE_SHIFT		16
+#define EQ_CTL_EVENT_TYPE_MASK		0x0FFF0000
+
+#define EQ_CTL_INDEX_SHIFT		0
+#define EQ_CTL_INDEX_MASK		0x0000FFFF
+
+enum pq_init_status {
+	PQ_INIT_STATUS_NA = 0,
+	PQ_INIT_STATUS_READY_FOR_CP,
+	PQ_INIT_STATUS_READY_FOR_HOST,
+	PQ_INIT_STATUS_READY_FOR_CP_SINGLE_MSI,
+	PQ_INIT_STATUS_LEN_NOT_POWER_OF_TWO_ERR,
+	PQ_INIT_STATUS_ILLEGAL_Q_ADDR_ERR
+};
+
+/*
+ * CpuCP Primary Queue Packets
+ *
+ * During normal operation, the host's kernel driver needs to send various
+ * messages to CpuCP, usually either to SET some value into a H/W periphery or
+ * to GET the current value of some H/W periphery. For example, SET the
+ * frequency of MME/TPC and GET the value of the thermal sensor.
+ *
+ * These messages can be initiated either by the User application or by the
+ * host's driver itself, e.g. power management code. In either case, the
+ * communication from the host's driver to CpuCP will *always* be in
+ * synchronous mode, meaning that the host will send a single message and poll
+ * until the message was acknowledged and the results are ready (if results are
+ * needed).
+ *
+ * This means that only a single message can be sent at a time and the host's
+ * driver must wait for its result before sending the next message. Having said
+ * that, because these are control messages which are sent in a relatively low
+ * frequency, this limitation seems acceptable. It's important to note that
+ * in case of multiple devices, messages to different devices *can* be sent
+ * at the same time.
+ *
+ * The message, inputs/outputs (if relevant) and fence object will be located
+ * on the device DDR at an address that will be determined by the host's driver.
+ * During device initialization phase, the host will pass to CpuCP that address.
+ * Most of the message types will contain inputs/outputs inside the message
+ * itself. The common part of each message will contain the opcode of the
+ * message (its type) and a field representing a fence object.
+ *
+ * When the host's driver wishes to send a message to CPU CP, it will write the
+ * message contents to the device DDR, clear the fence object and then write to
+ * the PSOC_ARC1_AUX_SW_INTR, to issue interrupt 121 to ARC Management CPU.
+ *
+ * Upon receiving the interrupt (#121), CpuCP will read the message from the
+ * DDR. In case the message is a SET operation, CpuCP will first perform the
+ * operation and then write to the fence object on the device DDR. In case the
+ * message is a GET operation, CpuCP will first fill the results section on the
+ * device DDR and then write to the fence object. If an error occurred, CpuCP
+ * will fill the rc field with the right error code.
+ *
+ * In the meantime, the host's driver will poll on the fence object. Once the
+ * host sees that the fence object is signaled, it will read the results from
+ * the device DDR (if relevant) and resume the code execution in the host's
+ * driver.
+ *
+ * To use QMAN packets, the opcode must be the QMAN opcode, shifted by 8
+ * so the value being put by the host's driver matches the value read by CpuCP
+ *
+ * Non-QMAN packets should be limited to values 1 through (2^8 - 1)
+ *
+ * Detailed description:
+ *
+ * CPUCP_PACKET_DISABLE_PCI_ACCESS -
+ *       After receiving this packet the embedded CPU must NOT issue PCI
+ *       transactions (read/write) towards the Host CPU. This also include
+ *       sending MSI-X interrupts.
+ *       This packet is usually sent before the device is moved to D3Hot state.
+ *
+ * CPUCP_PACKET_ENABLE_PCI_ACCESS -
+ *       After receiving this packet the embedded CPU is allowed to issue PCI
+ *       transactions towards the Host CPU, including sending MSI-X interrupts.
+ *       This packet is usually send after the device is moved to D0 state.
+ *
+ * CPUCP_PACKET_TEMPERATURE_GET -
+ *       Fetch the current temperature / Max / Max Hyst / Critical /
+ *       Critical Hyst of a specified thermal sensor. The packet's
+ *       arguments specify the desired sensor and the field to get.
+ *
+ * CPUCP_PACKET_VOLTAGE_GET -
+ *       Fetch the voltage / Max / Min of a specified sensor. The packet's
+ *       arguments specify the sensor and type.
+ *
+ * CPUCP_PACKET_CURRENT_GET -
+ *       Fetch the current / Max / Min of a specified sensor. The packet's
+ *       arguments specify the sensor and type.
+ *
+ * CPUCP_PACKET_FAN_SPEED_GET -
+ *       Fetch the speed / Max / Min of a specified fan. The packet's
+ *       arguments specify the sensor and type.
+ *
+ * CPUCP_PACKET_PWM_GET -
+ *       Fetch the pwm value / mode of a specified pwm. The packet's
+ *       arguments specify the sensor and type.
+ *
+ * CPUCP_PACKET_PWM_SET -
+ *       Set the pwm value / mode of a specified pwm. The packet's
+ *       arguments specify the sensor, type and value.
+ *
+ * CPUCP_PACKET_FREQUENCY_SET -
+ *       Set the frequency of a specified PLL. The packet's arguments specify
+ *       the PLL and the desired frequency. The actual frequency in the device
+ *       might differ from the requested frequency.
+ *
+ * CPUCP_PACKET_FREQUENCY_GET -
+ *       Fetch the frequency of a specified PLL. The packet's arguments specify
+ *       the PLL.
+ *
+ * CPUCP_PACKET_LED_SET -
+ *       Set the state of a specified led. The packet's arguments
+ *       specify the led and the desired state.
+ *
+ * CPUCP_PACKET_I2C_WR -
+ *       Write 32-bit value to I2C device. The packet's arguments specify the
+ *       I2C bus, address and value.
+ *
+ * CPUCP_PACKET_I2C_RD -
+ *       Read 32-bit value from I2C device. The packet's arguments specify the
+ *       I2C bus and address.
+ *
+ * CPUCP_PACKET_INFO_GET -
+ *       Fetch information from the device as specified in the packet's
+ *       structure. The host's driver passes the max size it allows the CpuCP to
+ *       write to the structure, to prevent data corruption in case of
+ *       mismatched driver/FW versions.
+ *
+ * CPUCP_PACKET_FLASH_PROGRAM_REMOVED - this packet was removed
+ *
+ * CPUCP_PACKET_UNMASK_RAZWI_IRQ -
+ *       Unmask the given IRQ. The IRQ number is specified in the value field.
+ *       The packet is sent after receiving an interrupt and printing its
+ *       relevant information.
+ *
+ * CPUCP_PACKET_UNMASK_RAZWI_IRQ_ARRAY -
+ *       Unmask the given IRQs. The IRQs numbers are specified in an array right
+ *       after the cpucp_packet structure, where its first element is the array
+ *       length. The packet is sent after a soft reset was done in order to
+ *       handle any interrupts that were sent during the reset process.
+ *
+ * CPUCP_PACKET_TEST -
+ *       Test packet for CpuCP connectivity. The CPU will put the fence value
+ *       in the result field.
+ *
+ * CPUCP_PACKET_FREQUENCY_CURR_GET -
+ *       Fetch the current frequency of a specified PLL. The packet's arguments
+ *       specify the PLL.
+ *
+ * CPUCP_PACKET_MAX_POWER_GET -
+ *       Fetch the maximal power of the device.
+ *
+ * CPUCP_PACKET_MAX_POWER_SET -
+ *       Set the maximal power of the device. The packet's arguments specify
+ *       the power.
+ *
+ * CPUCP_PACKET_EEPROM_DATA_GET -
+ *       Get EEPROM data from the CpuCP kernel. The buffer is specified in the
+ *       addr field. The CPU will put the returned data size in the result
+ *       field. In addition, the host's driver passes the max size it allows the
+ *       CpuCP to write to the structure, to prevent data corruption in case of
+ *       mismatched driver/FW versions.
+ *
+ * CPUCP_PACKET_NIC_INFO_GET -
+ *       Fetch information from the device regarding the NIC. the host's driver
+ *       passes the max size it allows the CpuCP to write to the structure, to
+ *       prevent data corruption in case of mismatched driver/FW versions.
+ *
+ * CPUCP_PACKET_TEMPERATURE_SET -
+ *       Set the value of the offset property of a specified thermal sensor.
+ *       The packet's arguments specify the desired sensor and the field to
+ *       set.
+ *
+ * CPUCP_PACKET_VOLTAGE_SET -
+ *       Trigger the reset_history property of a specified voltage sensor.
+ *       The packet's arguments specify the desired sensor and the field to
+ *       set.
+ *
+ * CPUCP_PACKET_CURRENT_SET -
+ *       Trigger the reset_history property of a specified current sensor.
+ *       The packet's arguments specify the desired sensor and the field to
+ *       set.
+ *
+ * CPUCP_PACKET_PCIE_THROUGHPUT_GET -
+ *       Get throughput of PCIe.
+ *       The packet's arguments specify the transaction direction (TX/RX).
+ *       The window measurement is 10[msec], and the return value is in KB/sec.
+ *
+ * CPUCP_PACKET_PCIE_REPLAY_CNT_GET
+ *       Replay count measures number of "replay" events, which is basicly
+ *       number of retries done by PCIe.
+ *
+ * CPUCP_PACKET_TOTAL_ENERGY_GET -
+ *       Total Energy is measurement of energy from the time FW Linux
+ *       is loaded. It is calculated by multiplying the average power
+ *       by time (passed from armcp start). The units are in MilliJouls.
+ *
+ * CPUCP_PACKET_PLL_INFO_GET -
+ *       Fetch frequencies of PLL from the required PLL IP.
+ *       The packet's arguments specify the device PLL type
+ *       Pll type is the PLL from device pll_index enum.
+ *       The result is composed of 4 outputs, each is 16-bit
+ *       frequency in MHz.
+ *
+ * CPUCP_PACKET_POWER_GET -
+ *       Fetch the present power consumption of the device (Current * Voltage).
+ *
+ * CPUCP_PACKET_NIC_PFC_SET -
+ *       Enable/Disable the NIC PFC feature. The packet's arguments specify the
+ *       NIC port, relevant lanes to configure and one bit indication for
+ *       enable/disable.
+ *
+ * CPUCP_PACKET_NIC_FAULT_GET -
+ *       Fetch the current indication for local/remote faults from the NIC MAC.
+ *       The result is 32-bit value of the relevant register.
+ *
+ * CPUCP_PACKET_NIC_LPBK_SET -
+ *       Enable/Disable the MAC loopback feature. The packet's arguments specify
+ *       the NIC port, relevant lanes to configure and one bit indication for
+ *       enable/disable.
+ *
+ * CPUCP_PACKET_NIC_MAC_INIT -
+ *       Configure the NIC MAC channels. The packet's arguments specify the
+ *       NIC port and the speed.
+ *
+ * CPUCP_PACKET_MSI_INFO_SET -
+ *       set the index number for each supported msi type going from
+ *       host to device
+ *
+ * CPUCP_PACKET_NIC_XPCS91_REGS_GET -
+ *       Fetch the un/correctable counters values from the NIC MAC.
+ *
+ * CPUCP_PACKET_NIC_STAT_REGS_GET -
+ *       Fetch various NIC MAC counters from the NIC STAT.
+ *
+ * CPUCP_PACKET_NIC_STAT_REGS_CLR -
+ *       Clear the various NIC MAC counters in the NIC STAT.
+ *
+ * CPUCP_PACKET_NIC_STAT_REGS_ALL_GET -
+ *       Fetch all NIC MAC counters from the NIC STAT.
+ *
+ * CPUCP_PACKET_IS_IDLE_CHECK -
+ *       Check if the device is IDLE in regard to the DMA/compute engines
+ *       and QMANs. The f/w will return a bitmask where each bit represents
+ *       a different engine or QMAN according to enum cpucp_idle_mask.
+ *       The bit will be 1 if the engine is NOT idle.
+ *
+ * CPUCP_PACKET_HBM_REPLACED_ROWS_INFO_GET -
+ *       Fetch all HBM replaced-rows and prending to be replaced rows data.
+ *
+ * CPUCP_PACKET_HBM_PENDING_ROWS_STATUS -
+ *       Fetch status of HBM rows pending replacement and need a reboot to
+ *       be replaced.
+ *
+ * CPUCP_PACKET_POWER_SET -
+ *       Resets power history of device to 0
+ *
+ * CPUCP_PACKET_ENGINE_CORE_ASID_SET -
+ *       Packet to perform engine core ASID configuration
+ *
+ * CPUCP_PACKET_SEC_ATTEST_GET -
+ *       Get the attestaion data that is collected during various stages of the
+ *       boot sequence. the attestation data is also hashed with some unique
+ *       number (nonce) provided by the host to prevent replay attacks.
+ *       public key and certificate also provided as part of the FW response.
+ *
+ * CPUCP_PACKET_MONITOR_DUMP_GET -
+ *       Get monitors registers dump from the CpuCP kernel.
+ *       The CPU will put the registers dump in the a buffer allocated by the driver
+ *       which address is passed via the CpuCp packet. In addition, the host's driver
+ *       passes the max size it allows the CpuCP to write to the structure, to prevent
+ *       data corruption in case of mismatched driver/FW versions.
+ *       Obsolete.
+ *
+ * CPUCP_PACKET_GENERIC_PASSTHROUGH -
+ *      Generic opcode for all firmware info that is only passed to host
+ *      through the LKD, without getting parsed there.
+ *
+ * CPUCP_PACKET_ACTIVE_STATUS_SET -
+ *       LKD sends FW indication whether device is free or in use, this indication is reported
+ *       also to the BMC.
+ *
+ * CPUCP_PACKET_REGISTER_INTERRUPTS -
+ *       Packet to register interrupts indicating LKD is ready to receive events from FW.
+ *
+ * CPUCP_PACKET_SOFT_RESET -
+ *      Packet to perform soft-reset.
+ *
+ * CPUCP_PACKET_INTS_REGISTER -
+ *       Packet to inform FW that queues have been established and LKD is ready to receive
+ *       EQ events.
+ */
+
+enum cpucp_packet_id {
+	CPUCP_PACKET_DISABLE_PCI_ACCESS = 1,	/* internal */
+	CPUCP_PACKET_ENABLE_PCI_ACCESS,		/* internal */
+	CPUCP_PACKET_TEMPERATURE_GET,		/* sysfs */
+	CPUCP_PACKET_VOLTAGE_GET,		/* sysfs */
+	CPUCP_PACKET_CURRENT_GET,		/* sysfs */
+	CPUCP_PACKET_FAN_SPEED_GET,		/* sysfs */
+	CPUCP_PACKET_PWM_GET,			/* sysfs */
+	CPUCP_PACKET_PWM_SET,			/* sysfs */
+	CPUCP_PACKET_FREQUENCY_SET,		/* sysfs */
+	CPUCP_PACKET_FREQUENCY_GET,		/* sysfs */
+	CPUCP_PACKET_LED_SET,			/* debugfs */
+	CPUCP_PACKET_I2C_WR,			/* debugfs */
+	CPUCP_PACKET_I2C_RD,			/* debugfs */
+	CPUCP_PACKET_INFO_GET,			/* IOCTL */
+	CPUCP_PACKET_FLASH_PROGRAM_REMOVED,
+	CPUCP_PACKET_UNMASK_RAZWI_IRQ,		/* internal */
+	CPUCP_PACKET_UNMASK_RAZWI_IRQ_ARRAY,	/* internal */
+	CPUCP_PACKET_TEST,			/* internal */
+	CPUCP_PACKET_FREQUENCY_CURR_GET,	/* sysfs */
+	CPUCP_PACKET_MAX_POWER_GET,		/* sysfs */
+	CPUCP_PACKET_MAX_POWER_SET,		/* sysfs */
+	CPUCP_PACKET_EEPROM_DATA_GET,		/* sysfs */
+	CPUCP_PACKET_NIC_INFO_GET,		/* internal */
+	CPUCP_PACKET_TEMPERATURE_SET,		/* sysfs */
+	CPUCP_PACKET_VOLTAGE_SET,		/* sysfs */
+	CPUCP_PACKET_CURRENT_SET,		/* sysfs */
+	CPUCP_PACKET_PCIE_THROUGHPUT_GET,	/* internal */
+	CPUCP_PACKET_PCIE_REPLAY_CNT_GET,	/* internal */
+	CPUCP_PACKET_TOTAL_ENERGY_GET,		/* internal */
+	CPUCP_PACKET_PLL_INFO_GET,		/* internal */
+	CPUCP_PACKET_NIC_STATUS,		/* internal */
+	CPUCP_PACKET_POWER_GET,			/* internal */
+	CPUCP_PACKET_NIC_PFC_SET,		/* internal */
+	CPUCP_PACKET_NIC_FAULT_GET,		/* internal */
+	CPUCP_PACKET_NIC_LPBK_SET,		/* internal */
+	CPUCP_PACKET_NIC_MAC_CFG,		/* internal */
+	CPUCP_PACKET_MSI_INFO_SET,		/* internal */
+	CPUCP_PACKET_NIC_XPCS91_REGS_GET,	/* internal */
+	CPUCP_PACKET_NIC_STAT_REGS_GET,		/* internal */
+	CPUCP_PACKET_NIC_STAT_REGS_CLR,		/* internal */
+	CPUCP_PACKET_NIC_STAT_REGS_ALL_GET,	/* internal */
+	CPUCP_PACKET_IS_IDLE_CHECK,		/* internal */
+	CPUCP_PACKET_HBM_REPLACED_ROWS_INFO_GET,/* internal */
+	CPUCP_PACKET_HBM_PENDING_ROWS_STATUS,	/* internal */
+	CPUCP_PACKET_POWER_SET,			/* internal */
+	CPUCP_PACKET_RESERVED,			/* not used */
+	CPUCP_PACKET_ENGINE_CORE_ASID_SET,	/* internal */
+	CPUCP_PACKET_RESERVED2,			/* not used */
+	CPUCP_PACKET_SEC_ATTEST_GET,		/* internal */
+	CPUCP_PACKET_RESERVED3,			/* not used */
+	CPUCP_PACKET_RESERVED4,			/* not used */
+	CPUCP_PACKET_MONITOR_DUMP_GET,		/* debugfs */
+	CPUCP_PACKET_RESERVED5,			/* not used */
+	CPUCP_PACKET_RESERVED6,			/* not used */
+	CPUCP_PACKET_RESERVED7,			/* not used */
+	CPUCP_PACKET_GENERIC_PASSTHROUGH,	/* IOCTL */
+	CPUCP_PACKET_RESERVED8,			/* not used */
+	CPUCP_PACKET_ACTIVE_STATUS_SET,		/* internal */
+	CPUCP_PACKET_RESERVED9,			/* not used */
+	CPUCP_PACKET_RESERVED10,		/* not used */
+	CPUCP_PACKET_RESERVED11,		/* not used */
+	CPUCP_PACKET_RESERVED12,		/* internal */
+	CPUCP_PACKET_RESERVED13,                /* internal */
+	CPUCP_PACKET_SOFT_RESET,                /* internal */
+	CPUCP_PACKET_INTS_REGISTER,             /* internal */
+	CPUCP_PACKET_ID_MAX                     /* must be last */
+};
+
+#define CPUCP_PACKET_FENCE_VAL	0xFE8CE7A5
+
+#define CPUCP_PKT_CTL_RC_SHIFT		12
+#define CPUCP_PKT_CTL_RC_MASK		0x0000F000
+
+#define CPUCP_PKT_CTL_OPCODE_SHIFT	16
+#define CPUCP_PKT_CTL_OPCODE_MASK	0x1FFF0000
+
+#define CPUCP_PKT_RES_PLL_OUT0_SHIFT	0
+#define CPUCP_PKT_RES_PLL_OUT0_MASK	0x000000000000FFFFull
+#define CPUCP_PKT_RES_PLL_OUT1_SHIFT	16
+#define CPUCP_PKT_RES_PLL_OUT1_MASK	0x00000000FFFF0000ull
+#define CPUCP_PKT_RES_PLL_OUT2_SHIFT	32
+#define CPUCP_PKT_RES_PLL_OUT2_MASK	0x0000FFFF00000000ull
+#define CPUCP_PKT_RES_PLL_OUT3_SHIFT	48
+#define CPUCP_PKT_RES_PLL_OUT3_MASK	0xFFFF000000000000ull
+
+#define CPUCP_PKT_RES_EEPROM_OUT0_SHIFT	0
+#define CPUCP_PKT_RES_EEPROM_OUT0_MASK	0x000000000000FFFFull
+#define CPUCP_PKT_RES_EEPROM_OUT1_SHIFT	16
+#define CPUCP_PKT_RES_EEPROM_OUT1_MASK	0x0000000000FF0000ull
+
+#define CPUCP_PKT_VAL_PFC_IN1_SHIFT	0
+#define CPUCP_PKT_VAL_PFC_IN1_MASK	0x0000000000000001ull
+#define CPUCP_PKT_VAL_PFC_IN2_SHIFT	1
+#define CPUCP_PKT_VAL_PFC_IN2_MASK	0x000000000000001Eull
+
+#define CPUCP_PKT_VAL_LPBK_IN1_SHIFT	0
+#define CPUCP_PKT_VAL_LPBK_IN1_MASK	0x0000000000000001ull
+#define CPUCP_PKT_VAL_LPBK_IN2_SHIFT	1
+#define CPUCP_PKT_VAL_LPBK_IN2_MASK	0x000000000000001Eull
+
+#define CPUCP_PKT_VAL_MAC_CNT_IN1_SHIFT	0
+#define CPUCP_PKT_VAL_MAC_CNT_IN1_MASK	0x0000000000000001ull
+#define CPUCP_PKT_VAL_MAC_CNT_IN2_SHIFT	1
+#define CPUCP_PKT_VAL_MAC_CNT_IN2_MASK	0x00000000FFFFFFFEull
+
+/* heartbeat status bits */
+#define CPUCP_PKT_HB_STATUS_EQ_FAULT_SHIFT		0
+#define CPUCP_PKT_HB_STATUS_EQ_FAULT_MASK		0x00000001
+
+struct cpucp_packet {
+	union {
+		__le64 value;	/* For SET packets */
+		__le64 result;	/* For GET packets */
+		__le64 addr;	/* For PQ */
+	};
+
+	__le32 ctl;
+
+	__le32 fence;		/* Signal to host that message is completed */
+
+	union {
+		struct {/* For temperature/current/voltage/fan/pwm get/set */
+			__le16 sensor_index;
+			__le16 type;
+		};
+
+		struct {	/* For I2C read/write */
+			__u8 i2c_bus;
+			__u8 i2c_addr;
+			__u8 i2c_reg;
+			/*
+			 * In legacy implemetations, i2c_len was not present,
+			 * was unused and just added as pad.
+			 * So if i2c_len is 0, it is treated as legacy
+			 * and r/w 1 Byte, else if i2c_len is specified,
+			 * its treated as new multibyte r/w support.
+			 */
+			__u8 i2c_len;
+		};
+
+		struct {/* For PLL info fetch */
+			__le16 pll_type;
+			/* TODO pll_reg is kept temporary before removal */
+			__le16 pll_reg;
+		};
+
+		/* For any general request */
+		__le32 index;
+
+		/* For frequency get/set */
+		__le32 pll_index;
+
+		/* For led set */
+		__le32 led_index;
+
+		/* For get CpuCP info/EEPROM data/NIC info */
+		__le32 data_max_size;
+
+		/*
+		 * For any general status bitmask. Shall be used whenever the
+		 * result cannot be used to hold general purpose data.
+		 */
+		__le32 status_mask;
+
+		/* random, used once number, for security packets */
+		__le32 nonce;
+	};
+
+	union {
+		/* For NIC requests */
+		__le32 port_index;
+
+		/* For Generic packet sub index */
+		__le32 pkt_subidx;
+	};
+};
+
+struct cpucp_unmask_irq_arr_packet {
+	struct cpucp_packet cpucp_pkt;
+	__le32 length;
+	__le32 irqs[];
+};
+
+struct cpucp_nic_status_packet {
+	struct cpucp_packet cpucp_pkt;
+	__le32 length;
+	__le32 data[];
+};
+
+struct cpucp_array_data_packet {
+	struct cpucp_packet cpucp_pkt;
+	__le32 length;
+	__le32 data[];
+};
+
+enum cpucp_led_index {
+	CPUCP_LED0_INDEX = 0,
+	CPUCP_LED1_INDEX,
+	CPUCP_LED2_INDEX,
+	CPUCP_LED_MAX_INDEX = CPUCP_LED2_INDEX
+};
+
+/*
+ * enum cpucp_packet_rc - Error return code
+ * @cpucp_packet_success	-> in case of success.
+ * @cpucp_packet_invalid	-> this is to support first generation platforms.
+ * @cpucp_packet_fault		-> in case of processing error like failing to
+ *                                 get device binding or semaphore etc.
+ * @cpucp_packet_invalid_pkt	-> when cpucp packet is un-supported.
+ * @cpucp_packet_invalid_params	-> when checking parameter like length of buffer
+ *				   or attribute value etc.
+ * @cpucp_packet_rc_max		-> It indicates size of enum so should be at last.
+ */
+enum cpucp_packet_rc {
+	cpucp_packet_success,
+	cpucp_packet_invalid,
+	cpucp_packet_fault,
+	cpucp_packet_invalid_pkt,
+	cpucp_packet_invalid_params,
+	cpucp_packet_rc_max
+};
+
+/*
+ * cpucp_temp_type should adhere to hwmon_temp_attributes
+ * defined in Linux kernel hwmon.h file
+ */
+enum cpucp_temp_type {
+	cpucp_temp_input,
+	cpucp_temp_min = 4,
+	cpucp_temp_min_hyst,
+	cpucp_temp_max = 6,
+	cpucp_temp_max_hyst,
+	cpucp_temp_crit,
+	cpucp_temp_crit_hyst,
+	cpucp_temp_offset = 19,
+	cpucp_temp_lowest = 21,
+	cpucp_temp_highest = 22,
+	cpucp_temp_reset_history = 23,
+	cpucp_temp_warn = 24,
+	cpucp_temp_max_crit = 25,
+	cpucp_temp_max_warn = 26,
+};
+
+enum cpucp_in_attributes {
+	cpucp_in_input,
+	cpucp_in_min,
+	cpucp_in_max,
+	cpucp_in_lowest = 6,
+	cpucp_in_highest = 7,
+	cpucp_in_reset_history,
+	cpucp_in_intr_alarm_a,
+	cpucp_in_intr_alarm_b,
+};
+
+enum cpucp_curr_attributes {
+	cpucp_curr_input,
+	cpucp_curr_min,
+	cpucp_curr_max,
+	cpucp_curr_lowest = 6,
+	cpucp_curr_highest = 7,
+	cpucp_curr_reset_history
+};
+
+enum cpucp_fan_attributes {
+	cpucp_fan_input,
+	cpucp_fan_min = 2,
+	cpucp_fan_max
+};
+
+enum cpucp_pwm_attributes {
+	cpucp_pwm_input,
+	cpucp_pwm_enable
+};
+
+enum cpucp_pcie_throughput_attributes {
+	cpucp_pcie_throughput_tx,
+	cpucp_pcie_throughput_rx
+};
+
+/* TODO temporary kept before removal */
+enum cpucp_pll_reg_attributes {
+	cpucp_pll_nr_reg,
+	cpucp_pll_nf_reg,
+	cpucp_pll_od_reg,
+	cpucp_pll_div_factor_reg,
+	cpucp_pll_div_sel_reg
+};
+
+/* TODO temporary kept before removal */
+enum cpucp_pll_type_attributes {
+	cpucp_pll_cpu,
+	cpucp_pll_pci,
+};
+
+/*
+ * cpucp_power_type aligns with hwmon_power_attributes
+ * defined in Linux kernel hwmon.h file
+ */
+enum cpucp_power_type {
+	CPUCP_POWER_INPUT = 8,
+	CPUCP_POWER_INPUT_HIGHEST = 9,
+	CPUCP_POWER_RESET_INPUT_HISTORY = 11
+};
+
+/*
+ * MSI type enumeration table for all ASICs and future SW versions.
+ * For future ASIC-LKD compatibility, we can only add new enumerations.
+ * at the end of the table (before CPUCP_NUM_OF_MSI_TYPES).
+ * Changing the order of entries or removing entries is not allowed.
+ */
+enum cpucp_msi_type {
+	CPUCP_EVENT_QUEUE_MSI_TYPE,
+	CPUCP_NIC_PORT1_MSI_TYPE,
+	CPUCP_NIC_PORT3_MSI_TYPE,
+	CPUCP_NIC_PORT5_MSI_TYPE,
+	CPUCP_NIC_PORT7_MSI_TYPE,
+	CPUCP_NIC_PORT9_MSI_TYPE,
+	CPUCP_NUM_OF_MSI_TYPES
+};
+
+/*
+ * PLL enumeration table used for all ASICs and future SW versions.
+ * For future ASIC-LKD compatibility, we can only add new enumerations.
+ * at the end of the table.
+ * Changing the order of entries or removing entries is not allowed.
+ */
+enum pll_index {
+	CPU_PLL = 0,
+	PCI_PLL = 1,
+	NIC_PLL = 2,
+	DMA_PLL = 3,
+	MESH_PLL = 4,
+	MME_PLL = 5,
+	TPC_PLL = 6,
+	IF_PLL = 7,
+	SRAM_PLL = 8,
+	NS_PLL = 9,
+	HBM_PLL = 10,
+	MSS_PLL = 11,
+	DDR_PLL = 12,
+	VID_PLL = 13,
+	BANK_PLL = 14,
+	MMU_PLL = 15,
+	IC_PLL = 16,
+	MC_PLL = 17,
+	EMMC_PLL = 18,
+	D2D_PLL = 19,
+	CS_PLL = 20,
+	C2C_PLL = 21,
+	NCH_PLL = 22,
+	C2M_PLL = 23,
+	PLL_MAX
+};
+
+enum rl_index {
+	TPC_RL = 0,
+	MME_RL,
+	EDMA_RL,
+};
+
+enum pvt_index {
+	PVT_SW,
+	PVT_SE,
+	PVT_NW,
+	PVT_NE
+};
+
+/* Event Queue Packets */
+
+struct eq_generic_event {
+	__le64 data[7];
+};
+
+/*
+ * CpuCP info
+ */
+
+#define CARD_NAME_MAX_LEN		16
+#define CPUCP_MAX_SENSORS		128
+#define CPUCP_MAX_NICS			128
+#define CPUCP_LANES_PER_NIC		4
+#define CPUCP_NIC_QSFP_EEPROM_MAX_LEN	1024
+#define CPUCP_MAX_NIC_LANES		(CPUCP_MAX_NICS * CPUCP_LANES_PER_NIC)
+#define CPUCP_NIC_MASK_ARR_LEN		((CPUCP_MAX_NICS + 63) / 64)
+#define CPUCP_NIC_POLARITY_ARR_LEN	((CPUCP_MAX_NIC_LANES + 63) / 64)
+#define CPUCP_HBM_ROW_REPLACE_MAX	32
+
+struct cpucp_sensor {
+	__le32 type;
+	__le32 flags;
+};
+
+/**
+ * struct cpucp_card_types - ASIC card type.
+ * @cpucp_card_type_pci: PCI card.
+ * @cpucp_card_type_pmc: PCI Mezzanine Card.
+ */
+enum cpucp_card_types {
+	cpucp_card_type_pci,
+	cpucp_card_type_pmc
+};
+
+#define CPUCP_SEC_CONF_ENABLED_SHIFT	0
+#define CPUCP_SEC_CONF_ENABLED_MASK	0x00000001
+
+#define CPUCP_SEC_CONF_FLASH_WP_SHIFT	1
+#define CPUCP_SEC_CONF_FLASH_WP_MASK	0x00000002
+
+#define CPUCP_SEC_CONF_EEPROM_WP_SHIFT	2
+#define CPUCP_SEC_CONF_EEPROM_WP_MASK	0x00000004
+
+/**
+ * struct cpucp_security_info - Security information.
+ * @config: configuration bit field
+ * @keys_num: number of stored keys
+ * @revoked_keys: revoked keys bit field
+ * @min_svn: minimal security version
+ */
+struct cpucp_security_info {
+	__u8 config;
+	__u8 keys_num;
+	__u8 revoked_keys;
+	__u8 min_svn;
+};
+
+/**
+ * struct cpucp_info - Info from CpuCP that is necessary to the host's driver
+ * @sensors: available sensors description.
+ * @kernel_version: CpuCP linux kernel version.
+ * @reserved: reserved field.
+ * @card_type: card configuration type.
+ * @card_location: in a server, each card has different connections topology
+ *                 depending on its location (relevant for PMC card type)
+ * @cpld_version: CPLD programmed F/W version.
+ * @infineon_version: Infineon main DC-DC version.
+ * @fuse_version: silicon production FUSE information.
+ * @thermal_version: thermald S/W version.
+ * @cpucp_version: CpuCP S/W version.
+ * @infineon_second_stage_version: Infineon 2nd stage DC-DC version.
+ * @dram_size: available DRAM size.
+ * @card_name: card name that will be displayed in HWMON subsystem on the host
+ * @tpc_binning_mask: TPC binning mask, 1 bit per TPC instance
+ *                    (0 = functional, 1 = binned)
+ * @decoder_binning_mask: Decoder binning mask, 1 bit per decoder instance
+ *                        (0 = functional, 1 = binned), maximum 1 per dcore
+ * @sram_binning: Categorize SRAM functionality
+ *                (0 = fully functional, 1 = lower-half is not functional,
+ *                 2 = upper-half is not functional)
+ * @sec_info: security information
+ * @pll_map: Bit map of supported PLLs for current ASIC version.
+ * @mme_binning_mask: MME binning mask,
+ *                    bits [0:6]   <==> dcore0 mme fma
+ *                    bits [7:13]  <==> dcore1 mme fma
+ *                    bits [14:20] <==> dcore0 mme ima
+ *                    bits [21:27] <==> dcore1 mme ima
+ *                    For each group, if the 6th bit is set then first 5 bits
+ *                    represent the col's idx [0-31], otherwise these bits are
+ *                    ignored, and col idx 32 is binned. 7th bit is don't care.
+ * @dram_binning_mask: DRAM binning mask, 1 bit per dram instance
+ *                     (0 = functional 1 = binned)
+ * @memory_repair_flag: eFuse flag indicating memory repair
+ * @edma_binning_mask: EDMA binning mask, 1 bit per EDMA instance
+ *                     (0 = functional 1 = binned)
+ * @xbar_binning_mask: Xbar binning mask, 1 bit per Xbar instance
+ *                     (0 = functional 1 = binned)
+ * @interposer_version: Interposer version programmed in eFuse
+ * @substrate_version: Substrate version programmed in eFuse
+ * @fw_hbm_region_size: Size in bytes of FW reserved region in HBM.
+ * @fw_os_version: Firmware OS Version
+ */
+struct cpucp_info {
+	struct cpucp_sensor sensors[CPUCP_MAX_SENSORS];
+	__u8 kernel_version[VERSION_MAX_LEN];
+	__le32 reserved;
+	__le32 card_type;
+	__le32 card_location;
+	__le32 cpld_version;
+	__le32 infineon_version;
+	__u8 fuse_version[VERSION_MAX_LEN];
+	__u8 thermal_version[VERSION_MAX_LEN];
+	__u8 cpucp_version[VERSION_MAX_LEN];
+	__le32 infineon_second_stage_version;
+	__le64 dram_size;
+	char card_name[CARD_NAME_MAX_LEN];
+	__le64 tpc_binning_mask;
+	__le64 decoder_binning_mask;
+	__u8 sram_binning;
+	__u8 dram_binning_mask;
+	__u8 memory_repair_flag;
+	__u8 edma_binning_mask;
+	__u8 xbar_binning_mask;
+	__u8 interposer_version;
+	__u8 substrate_version;
+	__u8 reserved2;
+	struct cpucp_security_info sec_info;
+	__le32 fw_hbm_region_size;
+	__u8 pll_map[PLL_MAP_LEN];
+	__le64 mme_binning_mask;
+	__u8 fw_os_version[VERSION_MAX_LEN];
+};
+
+struct cpucp_mac_addr {
+	__u8 mac_addr[ETH_ALEN];
+};
+
+enum cpucp_serdes_type {
+	TYPE_1_SERDES_TYPE,
+	TYPE_2_SERDES_TYPE,
+	HLS1_SERDES_TYPE,
+	HLS1H_SERDES_TYPE,
+	HLS2_SERDES_TYPE,
+	HLS2_TYPE_1_SERDES_TYPE,
+	MAX_NUM_SERDES_TYPE,		/* number of types */
+	UNKNOWN_SERDES_TYPE = 0xFFFF	/* serdes_type is u16 */
+};
+
+struct cpucp_nic_info {
+	struct cpucp_mac_addr mac_addrs[CPUCP_MAX_NICS];
+	__le64 link_mask[CPUCP_NIC_MASK_ARR_LEN];
+	__le64 pol_tx_mask[CPUCP_NIC_POLARITY_ARR_LEN];
+	__le64 pol_rx_mask[CPUCP_NIC_POLARITY_ARR_LEN];
+	__le64 link_ext_mask[CPUCP_NIC_MASK_ARR_LEN];
+	__u8 qsfp_eeprom[CPUCP_NIC_QSFP_EEPROM_MAX_LEN];
+	__le64 auto_neg_mask[CPUCP_NIC_MASK_ARR_LEN];
+	__le16 serdes_type; /* enum cpucp_serdes_type */
+	__le16 tx_swap_map[CPUCP_MAX_NICS];
+	__u8 reserved[6];
+};
+
+#define PAGE_DISCARD_MAX	64
+
+struct page_discard_info {
+	__u8 num_entries;
+	__u8 reserved[7];
+	__le32 mmu_page_idx[PAGE_DISCARD_MAX];
+};
+
+/*
+ * struct frac_val - fracture value represented by "integer.frac".
+ * @integer: the integer part of the fracture value;
+ * @frac: the fracture part of the fracture value.
+ */
+struct frac_val {
+	union {
+		struct {
+			__le16 integer;
+			__le16 frac;
+		};
+		__le32 val;
+	};
+};
+
+/*
+ * struct ser_val - the SER (symbol error rate) value is represented by "integer * 10 ^ -exp".
+ * @integer: the integer part of the SER value;
+ * @exp: the exponent part of the SER value.
+ */
+struct ser_val {
+	__le16 integer;
+	__le16 exp;
+};
+
+/*
+ * struct cpucp_nic_status - describes the status of a NIC port.
+ * @port: NIC port index.
+ * @bad_format_cnt: e.g. CRC.
+ * @responder_out_of_sequence_psn_cnt: e.g NAK.
+ * @high_ber_reinit_cnt: link reinit due to high BER.
+ * @correctable_err_cnt: e.g. bit-flip.
+ * @uncorrectable_err_cnt: e.g. MAC errors.
+ * @retraining_cnt: re-training counter.
+ * @up: is port up.
+ * @pcs_link: has PCS link.
+ * @phy_ready: is PHY ready.
+ * @auto_neg: is Autoneg enabled.
+ * @timeout_retransmission_cnt: timeout retransmission events.
+ * @high_ber_cnt: high ber events.
+ * @pre_fec_ser: pre FEC SER value.
+ * @post_fec_ser: post FEC SER value.
+ * @throughput: measured throughput.
+ * @latency: measured latency.
+ */
+struct cpucp_nic_status {
+	__le32 port;
+	__le32 bad_format_cnt;
+	__le32 responder_out_of_sequence_psn_cnt;
+	__le32 high_ber_reinit;
+	__le32 correctable_err_cnt;
+	__le32 uncorrectable_err_cnt;
+	__le32 retraining_cnt;
+	__u8 up;
+	__u8 pcs_link;
+	__u8 phy_ready;
+	__u8 auto_neg;
+	__le32 timeout_retransmission_cnt;
+	__le32 high_ber_cnt;
+	struct ser_val pre_fec_ser;
+	struct ser_val post_fec_ser;
+	struct frac_val bandwidth;
+	struct frac_val lat;
+};
+
+enum cpucp_hbm_row_replace_cause {
+	REPLACE_CAUSE_DOUBLE_ECC_ERR,
+	REPLACE_CAUSE_MULTI_SINGLE_ECC_ERR,
+};
+
+struct cpucp_hbm_row_info {
+	__u8 hbm_idx;
+	__u8 pc;
+	__u8 sid;
+	__u8 bank_idx;
+	__le16 row_addr;
+	__u8 replaced_row_cause; /* enum cpucp_hbm_row_replace_cause */
+	__u8 pad;
+};
+
+struct cpucp_hbm_row_replaced_rows_info {
+	__le16 num_replaced_rows;
+	__u8 pad[6];
+	struct cpucp_hbm_row_info replaced_rows[CPUCP_HBM_ROW_REPLACE_MAX];
+};
+
+enum cpu_reset_status {
+	CPU_RST_STATUS_NA = 0,
+	CPU_RST_STATUS_SOFT_RST_DONE = 1,
+};
+
+#define SEC_PCR_DATA_BUF_SZ	256
+#define SEC_PCR_QUOTE_BUF_SZ	510	/* (512 - 2) 2 bytes used for size */
+#define SEC_SIGNATURE_BUF_SZ	255	/* (256 - 1) 1 byte used for size */
+#define SEC_PUB_DATA_BUF_SZ	510	/* (512 - 2) 2 bytes used for size */
+#define SEC_CERTIFICATE_BUF_SZ	2046	/* (2048 - 2) 2 bytes used for size */
+
+/*
+ * struct cpucp_sec_attest_info - attestation report of the boot
+ * @pcr_data: raw values of the PCR registers
+ * @pcr_num_reg: number of PCR registers in the pcr_data array
+ * @pcr_reg_len: length of each PCR register in the pcr_data array (bytes)
+ * @nonce: number only used once. random number provided by host. this also
+ *	    passed to the quote command as a qualifying data.
+ * @pcr_quote_len: length of the attestation quote data (bytes)
+ * @pcr_quote: attestation report data structure
+ * @quote_sig_len: length of the attestation report signature (bytes)
+ * @quote_sig: signature structure of the attestation report
+ * @pub_data_len: length of the public data (bytes)
+ * @public_data: public key for the signed attestation
+ *		 (outPublic + name + qualifiedName)
+ * @certificate_len: length of the certificate (bytes)
+ * @certificate: certificate for the attestation signing key
+ */
+struct cpucp_sec_attest_info {
+	__u8 pcr_data[SEC_PCR_DATA_BUF_SZ];
+	__u8 pcr_num_reg;
+	__u8 pcr_reg_len;
+	__le16 pad0;
+	__le32 nonce;
+	__le16 pcr_quote_len;
+	__u8 pcr_quote[SEC_PCR_QUOTE_BUF_SZ];
+	__u8 quote_sig_len;
+	__u8 quote_sig[SEC_SIGNATURE_BUF_SZ];
+	__le16 pub_data_len;
+	__u8 public_data[SEC_PUB_DATA_BUF_SZ];
+	__le16 certificate_len;
+	__u8 certificate[SEC_CERTIFICATE_BUF_SZ];
+};
+
+/*
+ * struct cpucp_dev_info_signed - device information signed by a secured device
+ * @info: device information structure as defined above
+ * @nonce: number only used once. random number provided by host. this number is
+ *	   hashed and signed along with the device information.
+ * @info_sig_len: length of the attestation signature (bytes)
+ * @info_sig: signature of the info + nonce data.
+ * @pub_data_len: length of the public data (bytes)
+ * @public_data: public key info signed info data
+ *		 (outPublic + name + qualifiedName)
+ * @certificate_len: length of the certificate (bytes)
+ * @certificate: certificate for the signing key
+ */
+struct cpucp_dev_info_signed {
+	struct cpucp_info info;	/* assumed to be 64bit aligned */
+	__le32 nonce;
+	__le32 pad0;
+	__u8 info_sig_len;
+	__u8 info_sig[SEC_SIGNATURE_BUF_SZ];
+	__le16 pub_data_len;
+	__u8 public_data[SEC_PUB_DATA_BUF_SZ];
+	__le16 certificate_len;
+	__u8 certificate[SEC_CERTIFICATE_BUF_SZ];
+};
+
+#define DCORE_MON_REGS_SZ	512
+/*
+ * struct dcore_monitor_regs_data - DCORE monitor regs data.
+ * the structure follows sync manager block layout. Obsolete.
+ * @mon_pay_addrl: array of payload address low bits.
+ * @mon_pay_addrh: array of payload address high bits.
+ * @mon_pay_data: array of payload data.
+ * @mon_arm: array of monitor arm.
+ * @mon_status: array of monitor status.
+ */
+struct dcore_monitor_regs_data {
+	__le32 mon_pay_addrl[DCORE_MON_REGS_SZ];
+	__le32 mon_pay_addrh[DCORE_MON_REGS_SZ];
+	__le32 mon_pay_data[DCORE_MON_REGS_SZ];
+	__le32 mon_arm[DCORE_MON_REGS_SZ];
+	__le32 mon_status[DCORE_MON_REGS_SZ];
+};
+
+/* contains SM data for each SYNC_MNGR (Obsolete) */
+struct cpucp_monitor_dump {
+	struct dcore_monitor_regs_data sync_mngr_w_s;
+	struct dcore_monitor_regs_data sync_mngr_e_s;
+	struct dcore_monitor_regs_data sync_mngr_w_n;
+	struct dcore_monitor_regs_data sync_mngr_e_n;
+};
+
+/*
+ * The Type of the generic request (and other input arguments) will be fetched from user by reading
+ * from "pkt_subidx" field in struct cpucp_packet.
+ *
+ * HL_PASSTHROUGHT_VERSIONS	- Fetch all firmware versions.
+ */
+enum hl_passthrough_type {
+	HL_PASSTHROUGH_VERSIONS,
+};
+
+#endif /* CPUCP_IF_H */
diff --git a/include/linux/habanalabs/hl_boot_if.h b/include/linux/habanalabs/hl_boot_if.h
new file mode 100644
index 000000000000..7de8a5786a36
--- /dev/null
+++ b/include/linux/habanalabs/hl_boot_if.h
@@ -0,0 +1,790 @@
+/* SPDX-License-Identifier: GPL-2.0
+ *
+ * Copyright 2018-2020 HabanaLabs, Ltd.
+ * All Rights Reserved.
+ *
+ */
+
+#ifndef HL_BOOT_IF_H
+#define HL_BOOT_IF_H
+
+#define LKD_HARD_RESET_MAGIC		0xED7BD694 /* deprecated - do not use */
+#define HL_POWER9_HOST_MAGIC		0x1DA30009
+
+#define BOOT_FIT_SRAM_OFFSET		0x200000
+
+#define VERSION_MAX_LEN			128
+
+enum cpu_boot_err {
+	CPU_BOOT_ERR_DRAM_INIT_FAIL = 0,
+	CPU_BOOT_ERR_FIT_CORRUPTED = 1,
+	CPU_BOOT_ERR_TS_INIT_FAIL = 2,
+	CPU_BOOT_ERR_DRAM_SKIPPED = 3,
+	CPU_BOOT_ERR_BMC_WAIT_SKIPPED = 4,
+	CPU_BOOT_ERR_NIC_DATA_NOT_RDY = 5,
+	CPU_BOOT_ERR_NIC_FW_FAIL = 6,
+	CPU_BOOT_ERR_SECURITY_NOT_RDY = 7,
+	CPU_BOOT_ERR_SECURITY_FAIL = 8,
+	CPU_BOOT_ERR_EFUSE_FAIL = 9,
+	CPU_BOOT_ERR_PRI_IMG_VER_FAIL = 10,
+	CPU_BOOT_ERR_SEC_IMG_VER_FAIL = 11,
+	CPU_BOOT_ERR_PLL_FAIL = 12,
+	CPU_BOOT_ERR_DEVICE_UNUSABLE_FAIL = 13,
+	CPU_BOOT_ERR_BOOT_FW_CRIT_ERR = 18,
+	CPU_BOOT_ERR_BINNING_FAIL = 19,
+	CPU_BOOT_ERR_TPM_FAIL = 20,
+	CPU_BOOT_ERR_TMP_THRESH_INIT_FAIL = 21,
+	CPU_BOOT_ERR_EEPROM_FAIL = 22,
+	CPU_BOOT_ERR_ENG_ARC_MEM_SCRUB_FAIL = 23,
+	CPU_BOOT_ERR_ENABLED = 31,
+	CPU_BOOT_ERR_SCND_EN = 63,
+	CPU_BOOT_ERR_LAST = 64 /* we have 2 registers of 32 bits */
+};
+
+/*
+ * Mask for fatal failures
+ * This mask contains all possible fatal failures, and a dynamic code
+ * will clear the non-relevant ones.
+ */
+#define CPU_BOOT_ERR_FATAL_MASK					\
+		((1 << CPU_BOOT_ERR_DRAM_INIT_FAIL) |		\
+		 (1 << CPU_BOOT_ERR_PLL_FAIL) |			\
+		 (1 << CPU_BOOT_ERR_DEVICE_UNUSABLE_FAIL) |	\
+		 (1 << CPU_BOOT_ERR_BINNING_FAIL) |		\
+		 (1 << CPU_BOOT_ERR_DRAM_SKIPPED) |		\
+		 (1 << CPU_BOOT_ERR_ENG_ARC_MEM_SCRUB_FAIL) |	\
+		 (1 << CPU_BOOT_ERR_EEPROM_FAIL))
+
+/*
+ * CPU error bits in BOOT_ERROR registers
+ *
+ * CPU_BOOT_ERR0_DRAM_INIT_FAIL		DRAM initialization failed.
+ *					DRAM is not reliable to use.
+ *
+ * CPU_BOOT_ERR0_FIT_CORRUPTED		FIT data integrity verification of the
+ *					image provided by the host has failed.
+ *
+ * CPU_BOOT_ERR0_TS_INIT_FAIL		Thermal Sensor initialization failed.
+ *					Boot continues as usual, but keep in
+ *					mind this is a warning.
+ *
+ * CPU_BOOT_ERR0_DRAM_SKIPPED		DRAM initialization has been skipped.
+ *					Skipping DRAM initialization has been
+ *					requested (e.g. strap, command, etc.)
+ *					and FW skipped the DRAM initialization.
+ *					Host can initialize the DRAM.
+ *
+ * CPU_BOOT_ERR0_BMC_WAIT_SKIPPED	Waiting for BMC data will be skipped.
+ *					Meaning the BMC data might not be
+ *					available until reset.
+ *
+ * CPU_BOOT_ERR0_NIC_DATA_NOT_RDY	NIC data from BMC is not ready.
+ *					BMC has not provided the NIC data yet.
+ *					Once provided this bit will be cleared.
+ *
+ * CPU_BOOT_ERR0_NIC_FW_FAIL		NIC FW loading failed.
+ *					The NIC FW loading and initialization
+ *					failed. This means NICs are not usable.
+ *
+ * CPU_BOOT_ERR0_SECURITY_NOT_RDY	Chip security initialization has been
+ *					started, but is not ready yet - chip
+ *					cannot be accessed.
+ *
+ * CPU_BOOT_ERR0_SECURITY_FAIL		Security related tasks have failed.
+ *					The tasks are security init (root of
+ *					trust), boot authentication (chain of
+ *					trust), data packets authentication.
+ *
+ * CPU_BOOT_ERR0_EFUSE_FAIL		Reading from eFuse failed.
+ *					The PCI device ID might be wrong.
+ *
+ * CPU_BOOT_ERR0_PRI_IMG_VER_FAIL	Verification of primary image failed.
+ *					It mean that ppboot checksum
+ *					verification for the preboot primary
+ *					image has failed to match expected
+ *					checksum. Trying to program image again
+ *					might solve this.
+ *
+ * CPU_BOOT_ERR0_SEC_IMG_VER_FAIL	Verification of secondary image failed.
+ *					It mean that ppboot checksum
+ *					verification for the preboot secondary
+ *					image has failed to match expected
+ *					checksum. Trying to program image again
+ *					might solve this.
+ *
+ * CPU_BOOT_ERR0_PLL_FAIL		PLL settings failed, meaning that one
+ *					of the PLLs remains in REF_CLK
+ *
+ * CPU_BOOT_ERR0_DEVICE_UNUSABLE_FAIL	Device is unusable and customer support
+ *					should be contacted.
+ *
+ * CPU_BOOT_ERR0_BOOT_FW_CRIT_ERR	Critical error was detected during
+ *					the execution of ppboot or preboot.
+ *					for example: stack overflow.
+ *
+ * CPU_BOOT_ERR0_BINNING_FAIL		Binning settings failed, meaning
+ *					malfunctioning components might still be
+ *					in use.
+ *
+ * CPU_BOOT_ERR0_TPM_FAIL		TPM verification flow failed.
+ *
+ * CPU_BOOT_ERR0_TMP_THRESH_INIT_FAIL	Failed to set threshold for tmperature
+ *					sensor.
+ *
+ * CPU_BOOT_ERR_EEPROM_FAIL		Failed reading EEPROM data. Defaults
+ *					are used.
+ *
+ * CPU_BOOT_ERR_ENG_ARC_MEM_SCRUB_FAIL	Failed scrubbing the Engines/ARCFarm
+ *					memories. Boot disabled until reset.
+ *
+ * CPU_BOOT_ERR0_ENABLED		Error registers enabled.
+ *					This is a main indication that the
+ *					running FW populates the error
+ *					registers. Meaning the error bits are
+ *					not garbage, but actual error statuses.
+ */
+#define CPU_BOOT_ERR0_DRAM_INIT_FAIL		(1 << CPU_BOOT_ERR_DRAM_INIT_FAIL)
+#define CPU_BOOT_ERR0_FIT_CORRUPTED		(1 << CPU_BOOT_ERR_FIT_CORRUPTED)
+#define CPU_BOOT_ERR0_TS_INIT_FAIL		(1 << CPU_BOOT_ERR_TS_INIT_FAIL)
+#define CPU_BOOT_ERR0_DRAM_SKIPPED		(1 << CPU_BOOT_ERR_DRAM_SKIPPED)
+#define CPU_BOOT_ERR0_BMC_WAIT_SKIPPED		(1 << CPU_BOOT_ERR_BMC_WAIT_SKIPPED)
+#define CPU_BOOT_ERR0_NIC_DATA_NOT_RDY		(1 << CPU_BOOT_ERR_NIC_DATA_NOT_RDY)
+#define CPU_BOOT_ERR0_NIC_FW_FAIL		(1 << CPU_BOOT_ERR_NIC_FW_FAIL)
+#define CPU_BOOT_ERR0_SECURITY_NOT_RDY		(1 << CPU_BOOT_ERR_SECURITY_NOT_RDY)
+#define CPU_BOOT_ERR0_SECURITY_FAIL		(1 << CPU_BOOT_ERR_SECURITY_FAIL)
+#define CPU_BOOT_ERR0_EFUSE_FAIL		(1 << CPU_BOOT_ERR_EFUSE_FAIL)
+#define CPU_BOOT_ERR0_PRI_IMG_VER_FAIL		(1 << CPU_BOOT_ERR_PRI_IMG_VER_FAIL)
+#define CPU_BOOT_ERR0_SEC_IMG_VER_FAIL		(1 << CPU_BOOT_ERR_SEC_IMG_VER_FAIL)
+#define CPU_BOOT_ERR0_PLL_FAIL			(1 << CPU_BOOT_ERR_PLL_FAIL)
+#define CPU_BOOT_ERR0_DEVICE_UNUSABLE_FAIL	(1 << CPU_BOOT_ERR_DEVICE_UNUSABLE_FAIL)
+#define CPU_BOOT_ERR0_BOOT_FW_CRIT_ERR		(1 << CPU_BOOT_ERR_BOOT_FW_CRIT_ERR)
+#define CPU_BOOT_ERR0_BINNING_FAIL		(1 << CPU_BOOT_ERR_BINNING_FAIL)
+#define CPU_BOOT_ERR0_TPM_FAIL			(1 << CPU_BOOT_ERR_TPM_FAIL)
+#define CPU_BOOT_ERR0_TMP_THRESH_INIT_FAIL	(1 << CPU_BOOT_ERR_TMP_THRESH_INIT_FAIL)
+#define CPU_BOOT_ERR0_EEPROM_FAIL		(1 << CPU_BOOT_ERR_EEPROM_FAIL)
+#define CPU_BOOT_ERR0_ENG_ARC_MEM_SCRUB_FAIL	(1 << CPU_BOOT_ERR_ENG_ARC_MEM_SCRUB_FAIL)
+#define CPU_BOOT_ERR0_ENABLED			(1 << CPU_BOOT_ERR_ENABLED)
+#define CPU_BOOT_ERR1_ENABLED			(1 << CPU_BOOT_ERR_ENABLED)
+
+enum cpu_boot_dev_sts {
+	CPU_BOOT_DEV_STS_SECURITY_EN = 0,
+	CPU_BOOT_DEV_STS_DEBUG_EN = 1,
+	CPU_BOOT_DEV_STS_WATCHDOG_EN = 2,
+	CPU_BOOT_DEV_STS_DRAM_INIT_EN = 3,
+	CPU_BOOT_DEV_STS_BMC_WAIT_EN = 4,
+	CPU_BOOT_DEV_STS_E2E_CRED_EN = 5,
+	CPU_BOOT_DEV_STS_HBM_CRED_EN = 6,
+	CPU_BOOT_DEV_STS_RL_EN = 7,
+	CPU_BOOT_DEV_STS_SRAM_SCR_EN = 8,
+	CPU_BOOT_DEV_STS_DRAM_SCR_EN = 9,
+	CPU_BOOT_DEV_STS_FW_HARD_RST_EN = 10,
+	CPU_BOOT_DEV_STS_PLL_INFO_EN = 11,
+	CPU_BOOT_DEV_STS_SP_SRAM_EN = 12,
+	CPU_BOOT_DEV_STS_CLK_GATE_EN = 13,
+	CPU_BOOT_DEV_STS_HBM_ECC_EN = 14,
+	CPU_BOOT_DEV_STS_PKT_PI_ACK_EN = 15,
+	CPU_BOOT_DEV_STS_FW_LD_COM_EN = 16,
+	CPU_BOOT_DEV_STS_FW_IATU_CONF_EN = 17,
+	CPU_BOOT_DEV_STS_FW_NIC_MAC_EN = 18,
+	CPU_BOOT_DEV_STS_DYN_PLL_EN = 19,
+	CPU_BOOT_DEV_STS_GIC_PRIVILEGED_EN = 20,
+	CPU_BOOT_DEV_STS_EQ_INDEX_EN = 21,
+	CPU_BOOT_DEV_STS_MULTI_IRQ_POLL_EN = 22,
+	CPU_BOOT_DEV_STS_FW_NIC_STAT_XPCS91_EN = 23,
+	CPU_BOOT_DEV_STS_FW_NIC_STAT_EXT_EN = 24,
+	CPU_BOOT_DEV_STS_IS_IDLE_CHECK_EN = 25,
+	CPU_BOOT_DEV_STS_MAP_HWMON_EN = 26,
+	CPU_BOOT_DEV_STS_ENABLED = 31,
+	CPU_BOOT_DEV_STS_SCND_EN = 63,
+	CPU_BOOT_DEV_STS_LAST = 64 /* we have 2 registers of 32 bits */
+};
+
+/*
+ * BOOT DEVICE STATUS bits in BOOT_DEVICE_STS registers
+ *
+ * CPU_BOOT_DEV_STS0_SECURITY_EN	Security is Enabled.
+ *					This is an indication for security
+ *					enabled in FW, which means that
+ *					all conditions for security are met:
+ *					device is indicated as security enabled,
+ *					registers are protected, and device
+ *					uses keys for image verification.
+ *					Initialized in: preboot
+ *
+ * CPU_BOOT_DEV_STS0_DEBUG_EN		Debug is enabled.
+ *					Enabled when JTAG or DEBUG is enabled
+ *					in FW.
+ *					Initialized in: preboot
+ *
+ * CPU_BOOT_DEV_STS0_WATCHDOG_EN	Watchdog is enabled.
+ *					Watchdog is enabled in FW.
+ *					Initialized in: preboot
+ *
+ * CPU_BOOT_DEV_STS0_DRAM_INIT_EN	DRAM initialization is enabled.
+ *					DRAM initialization has been done in FW.
+ *					Initialized in: u-boot
+ *
+ * CPU_BOOT_DEV_STS0_BMC_WAIT_EN	Waiting for BMC data enabled.
+ *					If set, it means that during boot,
+ *					FW waited for BMC data.
+ *					Initialized in: u-boot
+ *
+ * CPU_BOOT_DEV_STS0_E2E_CRED_EN	E2E credits initialized.
+ *					FW initialized E2E credits.
+ *					Initialized in: u-boot
+ *
+ * CPU_BOOT_DEV_STS0_HBM_CRED_EN	HBM credits initialized.
+ *					FW initialized HBM credits.
+ *					Initialized in: u-boot
+ *
+ * CPU_BOOT_DEV_STS0_RL_EN		Rate limiter initialized.
+ *					FW initialized rate limiter.
+ *					Initialized in: u-boot
+ *
+ * CPU_BOOT_DEV_STS0_SRAM_SCR_EN	SRAM scrambler enabled.
+ *					FW initialized SRAM scrambler.
+ *					Initialized in: linux
+ *
+ * CPU_BOOT_DEV_STS0_DRAM_SCR_EN	DRAM scrambler enabled.
+ *					FW initialized DRAM scrambler.
+ *					Initialized in: u-boot
+ *
+ * CPU_BOOT_DEV_STS0_FW_HARD_RST_EN	FW hard reset procedure is enabled.
+ *					FW has the hard reset procedure
+ *					implemented. This means that FW will
+ *					perform hard reset procedure on
+ *					receiving the halt-machine event.
+ *					Initialized in: preboot, u-boot, linux
+ *
+ * CPU_BOOT_DEV_STS0_PLL_INFO_EN	FW retrieval of PLL info is enabled.
+ *					Initialized in: linux
+ *
+ * CPU_BOOT_DEV_STS0_SP_SRAM_EN		SP SRAM is initialized and available
+ *					for use.
+ *					Initialized in: preboot
+ *
+ * CPU_BOOT_DEV_STS0_CLK_GATE_EN	Clock Gating enabled.
+ *					FW initialized Clock Gating.
+ *					Initialized in: preboot
+ *
+ * CPU_BOOT_DEV_STS0_HBM_ECC_EN		HBM ECC handling Enabled.
+ *					FW handles HBM ECC indications.
+ *					Initialized in: linux
+ *
+ * CPU_BOOT_DEV_STS0_PKT_PI_ACK_EN	Packets ack value used in the armcpd
+ *					is set to the PI counter.
+ *					Initialized in: linux
+ *
+ * CPU_BOOT_DEV_STS0_FW_LD_COM_EN	Flexible FW loading communication
+ *					protocol is enabled.
+ *					Initialized in: preboot
+ *
+ * CPU_BOOT_DEV_STS0_FW_IATU_CONF_EN	FW iATU configuration is enabled.
+ *					This bit if set, means the iATU has been
+ *					configured and is ready for use.
+ *					Initialized in: ppboot
+ *
+ * CPU_BOOT_DEV_STS0_FW_NIC_MAC_EN	NIC MAC channels init is done by FW and
+ *					any access to them is done via the FW.
+ *					Initialized in: linux
+ *
+ * CPU_BOOT_DEV_STS0_DYN_PLL_EN		Dynamic PLL configuration is enabled.
+ *					FW sends to host a bitmap of supported
+ *					PLLs.
+ *					Initialized in: linux
+ *
+ * CPU_BOOT_DEV_STS0_GIC_PRIVILEGED_EN	GIC access permission only from
+ *					previleged entity. FW sets this status
+ *					bit for host. If this bit is set then
+ *					GIC can not be accessed from host.
+ *					Initialized in: linux
+ *
+ * CPU_BOOT_DEV_STS0_EQ_INDEX_EN	Event Queue (EQ) index is a running
+ *					index for each new event sent to host.
+ *					This is used as a method in host to
+ *					identify that the waiting event in
+ *					queue is actually a new event which
+ *					was not served before.
+ *					Initialized in: linux
+ *
+ * CPU_BOOT_DEV_STS0_MULTI_IRQ_POLL_EN	Use multiple scratchpad interfaces to
+ *					prevent IRQs overriding each other.
+ *					Initialized in: linux
+ *
+ * CPU_BOOT_DEV_STS0_FW_NIC_STAT_XPCS91_EN
+ *					NIC STAT and XPCS91 access is restricted
+ *					and is done via FW only.
+ *					Initialized in: linux
+ *
+ * CPU_BOOT_DEV_STS0_FW_NIC_STAT_EXT_EN
+ *					NIC STAT get all is supported.
+ *					Initialized in: linux
+ *
+ * CPU_BOOT_DEV_STS0_IS_IDLE_CHECK_EN
+ *					F/W checks if the device is idle by reading defined set
+ *					of registers. It returns a bitmask of all the engines,
+ *					where a bit is set if the engine is not idle.
+ *					Initialized in: linux
+ *
+ * CPU_BOOT_DEV_STS0_MAP_HWMON_EN
+ *					If set, means f/w supports proprietary
+ *					HWMON enum mapping to cpucp enums.
+ *					Initialized in: linux
+ *
+ * CPU_BOOT_DEV_STS0_ENABLED		Device status register enabled.
+ *					This is a main indication that the
+ *					running FW populates the device status
+ *					register. Meaning the device status
+ *					bits are not garbage, but actual
+ *					statuses.
+ *					Initialized in: preboot
+ *
+ */
+#define CPU_BOOT_DEV_STS0_SECURITY_EN		(1 << CPU_BOOT_DEV_STS_SECURITY_EN)
+#define CPU_BOOT_DEV_STS0_DEBUG_EN		(1 << CPU_BOOT_DEV_STS_DEBUG_EN)
+#define CPU_BOOT_DEV_STS0_WATCHDOG_EN		(1 << CPU_BOOT_DEV_STS_WATCHDOG_EN)
+#define CPU_BOOT_DEV_STS0_DRAM_INIT_EN		(1 << CPU_BOOT_DEV_STS_DRAM_INIT_EN)
+#define CPU_BOOT_DEV_STS0_BMC_WAIT_EN		(1 << CPU_BOOT_DEV_STS_BMC_WAIT_EN)
+#define CPU_BOOT_DEV_STS0_E2E_CRED_EN		(1 << CPU_BOOT_DEV_STS_E2E_CRED_EN)
+#define CPU_BOOT_DEV_STS0_HBM_CRED_EN		(1 << CPU_BOOT_DEV_STS_HBM_CRED_EN)
+#define CPU_BOOT_DEV_STS0_RL_EN			(1 << CPU_BOOT_DEV_STS_RL_EN)
+#define CPU_BOOT_DEV_STS0_SRAM_SCR_EN		(1 << CPU_BOOT_DEV_STS_SRAM_SCR_EN)
+#define CPU_BOOT_DEV_STS0_DRAM_SCR_EN		(1 << CPU_BOOT_DEV_STS_DRAM_SCR_EN)
+#define CPU_BOOT_DEV_STS0_FW_HARD_RST_EN	(1 << CPU_BOOT_DEV_STS_FW_HARD_RST_EN)
+#define CPU_BOOT_DEV_STS0_PLL_INFO_EN		(1 << CPU_BOOT_DEV_STS_PLL_INFO_EN)
+#define CPU_BOOT_DEV_STS0_SP_SRAM_EN		(1 << CPU_BOOT_DEV_STS_SP_SRAM_EN)
+#define CPU_BOOT_DEV_STS0_CLK_GATE_EN		(1 << CPU_BOOT_DEV_STS_CLK_GATE_EN)
+#define CPU_BOOT_DEV_STS0_HBM_ECC_EN		(1 << CPU_BOOT_DEV_STS_HBM_ECC_EN)
+#define CPU_BOOT_DEV_STS0_PKT_PI_ACK_EN		(1 << CPU_BOOT_DEV_STS_PKT_PI_ACK_EN)
+#define CPU_BOOT_DEV_STS0_FW_LD_COM_EN		(1 << CPU_BOOT_DEV_STS_FW_LD_COM_EN)
+#define CPU_BOOT_DEV_STS0_FW_IATU_CONF_EN	(1 << CPU_BOOT_DEV_STS_FW_IATU_CONF_EN)
+#define CPU_BOOT_DEV_STS0_FW_NIC_MAC_EN		(1 << CPU_BOOT_DEV_STS_FW_NIC_MAC_EN)
+#define CPU_BOOT_DEV_STS0_DYN_PLL_EN		(1 << CPU_BOOT_DEV_STS_DYN_PLL_EN)
+#define CPU_BOOT_DEV_STS0_GIC_PRIVILEGED_EN	(1 << CPU_BOOT_DEV_STS_GIC_PRIVILEGED_EN)
+#define CPU_BOOT_DEV_STS0_EQ_INDEX_EN		(1 << CPU_BOOT_DEV_STS_EQ_INDEX_EN)
+#define CPU_BOOT_DEV_STS0_MULTI_IRQ_POLL_EN	(1 << CPU_BOOT_DEV_STS_MULTI_IRQ_POLL_EN)
+#define CPU_BOOT_DEV_STS0_FW_NIC_STAT_XPCS91_EN	(1 << CPU_BOOT_DEV_STS_FW_NIC_STAT_XPCS91_EN)
+#define CPU_BOOT_DEV_STS0_FW_NIC_STAT_EXT_EN	(1 << CPU_BOOT_DEV_STS_FW_NIC_STAT_EXT_EN)
+#define CPU_BOOT_DEV_STS0_IS_IDLE_CHECK_EN	(1 << CPU_BOOT_DEV_STS_IS_IDLE_CHECK_EN)
+#define CPU_BOOT_DEV_STS0_MAP_HWMON_EN		(1 << CPU_BOOT_DEV_STS_MAP_HWMON_EN)
+#define CPU_BOOT_DEV_STS0_ENABLED		(1 << CPU_BOOT_DEV_STS_ENABLED)
+#define CPU_BOOT_DEV_STS1_ENABLED		(1 << CPU_BOOT_DEV_STS_ENABLED)
+
+enum cpu_boot_status {
+	CPU_BOOT_STATUS_NA = 0,		/* Default value after reset of chip */
+	CPU_BOOT_STATUS_IN_WFE = 1,
+	CPU_BOOT_STATUS_DRAM_RDY = 2,
+	CPU_BOOT_STATUS_SRAM_AVAIL = 3,
+	CPU_BOOT_STATUS_IN_BTL = 4,	/* BTL is H/W FSM */
+	CPU_BOOT_STATUS_IN_PREBOOT = 5,
+	CPU_BOOT_STATUS_IN_SPL,		/* deprecated - not reported */
+	CPU_BOOT_STATUS_IN_UBOOT = 7,
+	CPU_BOOT_STATUS_DRAM_INIT_FAIL,	/* deprecated - will be removed */
+	CPU_BOOT_STATUS_FIT_CORRUPTED,	/* deprecated - will be removed */
+	/* U-Boot console prompt activated, commands are not processed */
+	CPU_BOOT_STATUS_UBOOT_NOT_READY = 10,
+	/* Finished NICs init, reported after DRAM and NICs */
+	CPU_BOOT_STATUS_NIC_FW_RDY = 11,
+	CPU_BOOT_STATUS_TS_INIT_FAIL,	/* deprecated - will be removed */
+	CPU_BOOT_STATUS_DRAM_SKIPPED,	/* deprecated - will be removed */
+	CPU_BOOT_STATUS_BMC_WAITING_SKIPPED, /* deprecated - will be removed */
+	/* Last boot loader progress status, ready to receive commands */
+	CPU_BOOT_STATUS_READY_TO_BOOT = 15,
+	/* Internal Boot finished, ready for boot-fit */
+	CPU_BOOT_STATUS_WAITING_FOR_BOOT_FIT = 16,
+	/* Internal Security has been initialized, device can be accessed */
+	CPU_BOOT_STATUS_SECURITY_READY = 17,
+};
+
+enum kmd_msg {
+	KMD_MSG_NA = 0,
+	KMD_MSG_GOTO_WFE,
+	KMD_MSG_FIT_RDY,
+	KMD_MSG_SKIP_BMC,
+	RESERVED,
+	KMD_MSG_RST_DEV,
+	KMD_MSG_LAST
+};
+
+enum cpu_msg_status {
+	CPU_MSG_CLR = 0,
+	CPU_MSG_OK,
+	CPU_MSG_ERR,
+};
+
+/* communication registers mapping - consider ABI when changing */
+struct cpu_dyn_regs {
+	__le32 cpu_pq_base_addr_low;
+	__le32 cpu_pq_base_addr_high;
+	__le32 cpu_pq_length;
+	__le32 cpu_pq_init_status;
+	__le32 cpu_eq_base_addr_low;
+	__le32 cpu_eq_base_addr_high;
+	__le32 cpu_eq_length;
+	__le32 cpu_eq_ci;
+	__le32 cpu_cq_base_addr_low;
+	__le32 cpu_cq_base_addr_high;
+	__le32 cpu_cq_length;
+	__le32 cpu_pf_pq_pi;
+	__le32 cpu_boot_dev_sts0;
+	__le32 cpu_boot_dev_sts1;
+	__le32 cpu_boot_err0;
+	__le32 cpu_boot_err1;
+	__le32 cpu_boot_status;
+	__le32 fw_upd_sts;
+	__le32 fw_upd_cmd;
+	__le32 fw_upd_pending_sts;
+	__le32 fuse_ver_offset;
+	__le32 preboot_ver_offset;
+	__le32 uboot_ver_offset;
+	__le32 hw_state;
+	__le32 kmd_msg_to_cpu;
+	__le32 cpu_cmd_status_to_host;
+	__le32 gic_host_pi_upd_irq;
+	__le32 gic_tpc_qm_irq_ctrl;
+	__le32 gic_mme_qm_irq_ctrl;
+	__le32 gic_dma_qm_irq_ctrl;
+	__le32 gic_nic_qm_irq_ctrl;
+	__le32 gic_dma_core_irq_ctrl;
+	__le32 gic_host_halt_irq;
+	__le32 gic_host_ints_irq;
+	__le32 gic_host_soft_rst_irq;
+	__le32 gic_rot_qm_irq_ctrl;
+	__le32 cpu_rst_status;
+	__le32 eng_arc_irq_ctrl;
+	__le32 reserved1[20];		/* reserve for future use */
+};
+
+/* TODO: remove the desc magic after the code is updated to use message */
+/* HCDM - Habana Communications Descriptor Magic */
+#define HL_COMMS_DESC_MAGIC	0x4843444D
+#define HL_COMMS_DESC_VER	3
+
+/* HCMv - Habana Communications Message + header version */
+#define HL_COMMS_MSG_MAGIC_VALUE	0x48434D00
+#define HL_COMMS_MSG_MAGIC_MASK		0xFFFFFF00
+#define HL_COMMS_MSG_MAGIC_VER_MASK	0xFF
+
+#define HL_COMMS_MSG_MAGIC_VER(ver)	(HL_COMMS_MSG_MAGIC_VALUE |	\
+					((ver) & HL_COMMS_MSG_MAGIC_VER_MASK))
+#define HL_COMMS_MSG_MAGIC_V0		HL_COMMS_DESC_MAGIC
+#define HL_COMMS_MSG_MAGIC_V1		HL_COMMS_MSG_MAGIC_VER(1)
+#define HL_COMMS_MSG_MAGIC_V2		HL_COMMS_MSG_MAGIC_VER(2)
+#define HL_COMMS_MSG_MAGIC_V3		HL_COMMS_MSG_MAGIC_VER(3)
+
+#define HL_COMMS_MSG_MAGIC		HL_COMMS_MSG_MAGIC_V3
+
+#define HL_COMMS_MSG_MAGIC_VALIDATE_MAGIC(magic)			\
+		(((magic) & HL_COMMS_MSG_MAGIC_MASK) ==			\
+		HL_COMMS_MSG_MAGIC_VALUE)
+
+#define HL_COMMS_MSG_MAGIC_VALIDATE_VERSION(magic, ver)			\
+		(((magic) & HL_COMMS_MSG_MAGIC_VER_MASK) >=		\
+		((ver) & HL_COMMS_MSG_MAGIC_VER_MASK))
+
+#define HL_COMMS_MSG_MAGIC_VALIDATE(magic, ver)				\
+		(HL_COMMS_MSG_MAGIC_VALIDATE_MAGIC((magic)) &&		\
+		HL_COMMS_MSG_MAGIC_VALIDATE_VERSION((magic), (ver)))
+
+enum comms_msg_type {
+	HL_COMMS_DESC_TYPE = 0,
+	HL_COMMS_RESET_CAUSE_TYPE = 1,
+	HL_COMMS_FW_CFG_SKIP_TYPE = 2,
+	HL_COMMS_BINNING_CONF_TYPE = 3,
+};
+
+/*
+ * Binning information shared between LKD and FW
+ * @tpc_mask_l - TPC binning information lower 64 bit
+ * @dec_mask - Decoder binning information
+ * @dram_mask - DRAM binning information
+ * @edma_mask - EDMA binning information
+ * @mme_mask_l - MME binning information lower 32
+ * @mme_mask_h - MME binning information upper 32
+ * @rot_mask - Rotator binning information
+ * @xbar_mask - xBAR binning information
+ * @reserved - reserved field for future binning info w/o ABI change
+ * @tpc_mask_h - TPC binning information upper 64 bit
+ * @nic_mask - NIC binning information
+ */
+struct lkd_fw_binning_info {
+	__le64 tpc_mask_l;
+	__le32 dec_mask;
+	__le32 dram_mask;
+	__le32 edma_mask;
+	__le32 mme_mask_l;
+	__le32 mme_mask_h;
+	__le32 rot_mask;
+	__le32 xbar_mask;
+	__le32 reserved0;
+	__le64 tpc_mask_h;
+	__le64 nic_mask;
+	__le32 reserved1[8];
+};
+
+/* TODO: remove this struct after the code is updated to use message */
+/* this is the comms descriptor header - meta data */
+struct comms_desc_header {
+	__le32 magic;		/* magic for validation */
+	__le32 crc32;		/* CRC32 of the descriptor w/o header */
+	__le16 size;		/* size of the descriptor w/o header */
+	__u8 version;	/* descriptor version */
+	__u8 reserved[5];	/* pad to 64 bit */
+};
+
+/* this is the comms message header - meta data */
+struct comms_msg_header {
+	__le32 magic;		/* magic for validation */
+	__le32 crc32;		/* CRC32 of the message w/o header */
+	__le16 size;		/* size of the message w/o header */
+	__u8 version;	/* message payload version */
+	__u8 type;		/* message type */
+	__u8 reserved[4];	/* pad to 64 bit */
+};
+
+enum lkd_fw_ascii_msg_lvls {
+	LKD_FW_ASCII_MSG_ERR = 0,
+	LKD_FW_ASCII_MSG_WRN = 1,
+	LKD_FW_ASCII_MSG_INF = 2,
+	LKD_FW_ASCII_MSG_DBG = 3,
+};
+
+#define LKD_FW_ASCII_MSG_MAX_LEN	128
+#define LKD_FW_ASCII_MSG_MAX		4	/* consider ABI when changing */
+
+struct lkd_fw_ascii_msg {
+	__u8 valid;
+	__u8 msg_lvl;
+	__u8 reserved[6];
+	char msg[LKD_FW_ASCII_MSG_MAX_LEN];
+};
+
+/* this is the main FW descriptor - consider ABI when changing */
+struct lkd_fw_comms_desc {
+	struct comms_desc_header header;
+	struct cpu_dyn_regs cpu_dyn_regs;
+	char fuse_ver[VERSION_MAX_LEN];
+	char cur_fw_ver[VERSION_MAX_LEN];
+	/* can be used for 1 more version w/o ABI change */
+	char reserved0[VERSION_MAX_LEN];
+	__le64 img_addr;	/* address for next FW component load */
+	struct lkd_fw_binning_info binning_info;
+	struct lkd_fw_ascii_msg ascii_msg[LKD_FW_ASCII_MSG_MAX];
+	__le32 rsvd_mem_size_mb; /* reserved memory size [MB] for FW/SVE */
+	char reserved1[4];
+};
+
+enum comms_reset_cause {
+	HL_RESET_CAUSE_UNKNOWN = 0,
+	HL_RESET_CAUSE_HEARTBEAT = 1,
+	HL_RESET_CAUSE_TDR = 2,
+};
+
+/* TODO: remove define after struct name is aligned on all projects */
+#define lkd_msg_comms lkd_fw_comms_msg
+
+/* this is the comms message descriptor */
+struct lkd_fw_comms_msg {
+	struct comms_msg_header header;
+	/* union for future expantions of new messages */
+	union {
+		struct {
+			struct cpu_dyn_regs cpu_dyn_regs;
+			char fuse_ver[VERSION_MAX_LEN];
+			char cur_fw_ver[VERSION_MAX_LEN];
+			/* can be used for 1 more version w/o ABI change */
+			char reserved0[VERSION_MAX_LEN];
+			/* address for next FW component load */
+			__le64 img_addr;
+			struct lkd_fw_binning_info binning_info;
+			struct lkd_fw_ascii_msg ascii_msg[LKD_FW_ASCII_MSG_MAX];
+			/* reserved memory size [MB] for FW/SVE */
+			__le32 rsvd_mem_size_mb;
+			char reserved1[4];
+		};
+		struct {
+			__u8 reset_cause;
+		};
+		struct {
+			__u8 fw_cfg_skip; /* 1 - skip, 0 - don't skip */
+		};
+		struct lkd_fw_binning_info binning_conf;
+	};
+};
+
+/*
+ * LKD commands:
+ *
+ * COMMS_NOOP			Used to clear the command register and no actual
+ *				command is send.
+ *
+ * COMMS_CLR_STS		Clear status command - FW should clear the
+ *				status register. Used for synchronization
+ *				between the commands as part of the race free
+ *				protocol.
+ *
+ * COMMS_RST_STATE		Reset the current communication state which is
+ *				kept by FW for proper responses.
+ *				Should be used in the beginning of the
+ *				communication cycle to clean any leftovers from
+ *				previous communication attempts.
+ *
+ * COMMS_PREP_DESC		Prepare descriptor for setting up the
+ *				communication and other dynamic data:
+ *				struct lkd_fw_comms_desc.
+ *				This command has a parameter stating the next FW
+ *				component size, so the FW can actually prepare a
+ *				space for it and in the status response provide
+ *				the descriptor offset. The Offset of the next FW
+ *				data component is a part of the descriptor
+ *				structure.
+ *
+ * COMMS_DATA_RDY		The FW data has been uploaded and is ready for
+ *				validation.
+ *
+ * COMMS_EXEC			Execute the next FW component.
+ *
+ * COMMS_RST_DEV		Reset the device.
+ *
+ * COMMS_GOTO_WFE		Execute WFE command. Allowed only on non-secure
+ *				devices.
+ *
+ * COMMS_SKIP_BMC		Perform actions required for BMC-less servers.
+ *				Do not wait for BMC response.
+ *
+ * COMMS_PREP_DESC_ELBI		Same as COMMS_PREP_DESC only that the memory
+ *				space is allocated in a ELBI access only
+ *				address range.
+ *
+ */
+enum comms_cmd {
+	COMMS_NOOP = 0,
+	COMMS_CLR_STS = 1,
+	COMMS_RST_STATE = 2,
+	COMMS_PREP_DESC = 3,
+	COMMS_DATA_RDY = 4,
+	COMMS_EXEC = 5,
+	COMMS_RST_DEV = 6,
+	COMMS_GOTO_WFE = 7,
+	COMMS_SKIP_BMC = 8,
+	COMMS_PREP_DESC_ELBI = 10,
+	COMMS_INVLD_LAST
+};
+
+#define COMMS_COMMAND_SIZE_SHIFT	0
+#define COMMS_COMMAND_SIZE_MASK		0x1FFFFFF
+#define COMMS_COMMAND_CMD_SHIFT		27
+#define COMMS_COMMAND_CMD_MASK		0xF8000000
+
+/*
+ * LKD command to FW register structure
+ * @size	- FW component size
+ * @cmd		- command from enum comms_cmd
+ */
+struct comms_command {
+	union {		/* bit fields are only for FW use */
+		struct {
+			u32 size :25;		/* 32MB max. */
+			u32 reserved :2;
+			enum comms_cmd cmd :5;		/* 32 commands */
+		};
+		__le32 val;
+	};
+};
+
+/*
+ * FW status
+ *
+ * COMMS_STS_NOOP		Used to clear the status register and no actual
+ *				status is provided.
+ *
+ * COMMS_STS_ACK		Command has been received and recognized.
+ *
+ * COMMS_STS_OK			Command execution has finished successfully.
+ *
+ * COMMS_STS_ERR		Command execution was unsuccessful and resulted
+ *				in error.
+ *
+ * COMMS_STS_VALID_ERR		FW validation has failed.
+ *
+ * COMMS_STS_TIMEOUT_ERR	Command execution has timed out.
+ */
+enum comms_sts {
+	COMMS_STS_NOOP = 0,
+	COMMS_STS_ACK = 1,
+	COMMS_STS_OK = 2,
+	COMMS_STS_ERR = 3,
+	COMMS_STS_VALID_ERR = 4,
+	COMMS_STS_TIMEOUT_ERR = 5,
+	COMMS_STS_INVLD_LAST
+};
+
+/* RAM types for FW components loading - defines the base address */
+enum comms_ram_types {
+	COMMS_SRAM = 0,
+	COMMS_DRAM = 1,
+};
+
+#define COMMS_STATUS_OFFSET_SHIFT	0
+#define COMMS_STATUS_OFFSET_MASK	0x03FFFFFF
+#define COMMS_STATUS_OFFSET_ALIGN_SHIFT	2
+#define COMMS_STATUS_RAM_TYPE_SHIFT	26
+#define COMMS_STATUS_RAM_TYPE_MASK	0x0C000000
+#define COMMS_STATUS_STATUS_SHIFT	28
+#define COMMS_STATUS_STATUS_MASK	0xF0000000
+
+/*
+ * FW status to LKD register structure
+ * @offset	- an offset from the base of the ram_type shifted right by
+ *		  2 bits (always aligned to 32 bits).
+ *		  Allows a maximum addressable offset of 256MB from RAM base.
+ *		  Example: for real offset in RAM of 0x800000 (8MB), the value
+ *		  in offset field is (0x800000 >> 2) = 0x200000.
+ * @ram_type	- the RAM type that should be used for offset from
+ *		  enum comms_ram_types
+ * @status	- status from enum comms_sts
+ */
+struct comms_status {
+	union {		/* bit fields are only for FW use */
+		struct {
+			u32 offset :26;
+			enum comms_ram_types ram_type :2;
+			enum comms_sts status :4;	/* 16 statuses */
+		};
+		__le32 val;
+	};
+};
+
+#define NAME_MAX_LEN	32 /* bytes */
+struct hl_module_data {
+	__u8 name[NAME_MAX_LEN];
+	__u8 version[VERSION_MAX_LEN];
+};
+
+/**
+ * struct hl_component_versions - versions associated with hl component.
+ * @struct_size: size of all the struct (including dynamic size of modules).
+ * @modules_offset: offset of the modules field in this struct.
+ * @component: version of the component itself.
+ * @fw_os: Firmware OS Version.
+ * @comp_name: Name of the component.
+ * @modules_counter: number of set bits in modules_mask.
+ * @reserved: reserved for future use.
+ * @modules: versions of the component's modules. Elborated explanation in
+ *              struct cpucp_versions.
+ */
+struct hl_component_versions {
+	__le16 struct_size;
+	__le16 modules_offset;
+	__u8 component[VERSION_MAX_LEN];
+	__u8 fw_os[VERSION_MAX_LEN];
+	__u8 comp_name[NAME_MAX_LEN];
+	__u8 modules_counter;
+	__u8 reserved[3];
+	struct hl_module_data modules[];
+};
+
+/* Max size of fit size */
+#define HL_FW_VERSIONS_FIT_SIZE	4096
+
+#endif /* HL_BOOT_IF_H */
-- 
cgit v1.2.3


From 7c4130e6ddd709be2033a6635c91d445cb2baea5 Mon Sep 17 00:00:00 2001
From: farah kassabri <fkassabri@habana.ai>
Date: Tue, 8 Aug 2023 12:56:47 +0300
Subject: accel/habanalabs/gaudi2: handle eq health heartbeat check

Add mechanism for fw eq health check. this will be done using two flows:
using the heartbeat mechanism and raising a dedicated interrupt to
indicate an eq failure like EQ full.
This patch will add implementation for the eq heartbeat for gaudi2 asic.

More info about the heartbeat mechanism:
Expand the heartbeat mechanism to monitor a new event that
will be sent from FW upon receiving heartbeat message.
that way driver can know that the eq is working or not.

Signed-off-by: farah kassabri <fkassabri@habana.ai>
Reviewed-by: Oded Gabbay <ogabbay@kernel.org>
Signed-off-by: Oded Gabbay <ogabbay@kernel.org>
---
 drivers/accel/habanalabs/common/device.c           | 37 ++++++++++++++++++++--
 drivers/accel/habanalabs/common/habanalabs.h       |  2 ++
 drivers/accel/habanalabs/gaudi2/gaudi2.c           | 10 ++++++
 .../include/gaudi2/gaudi2_async_ids_map_extended.h | 14 ++++----
 include/linux/habanalabs/cpucp_if.h                | 14 +++++++-
 5 files changed, 68 insertions(+), 9 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/accel/habanalabs/common/device.c b/drivers/accel/habanalabs/common/device.c
index e217ee6d1768..1d1ccd8d5c75 100644
--- a/drivers/accel/habanalabs/common/device.c
+++ b/drivers/accel/habanalabs/common/device.c
@@ -989,6 +989,25 @@ static bool is_pci_link_healthy(struct hl_device *hdev)
 	return (vendor_id == PCI_VENDOR_ID_HABANALABS);
 }
 
+static void hl_device_eq_heartbeat(struct hl_device *hdev)
+{
+	u64 event_mask = HL_NOTIFIER_EVENT_DEVICE_RESET | HL_NOTIFIER_EVENT_DEVICE_UNAVAILABLE;
+	struct asic_fixed_properties *prop = &hdev->asic_prop;
+
+	 /*
+	  * This feature supported in FW version 1.12.0 45.2.0 and above,
+	  * only on those FW versions eq_health_check_supported will be set.
+	  * Start checking eq health only after driver has enabled events from FW.
+	  */
+	if (!prop->cpucp_info.eq_health_check_supported || !hdev->init_done)
+		return;
+
+	if (hdev->eq_heartbeat_received)
+		hdev->eq_heartbeat_received = false;
+	else
+		hl_device_cond_reset(hdev, HL_DRV_RESET_HARD, event_mask);
+}
+
 static void hl_device_heartbeat(struct work_struct *work)
 {
 	struct hl_device *hdev = container_of(work, struct hl_device,
@@ -999,6 +1018,12 @@ static void hl_device_heartbeat(struct work_struct *work)
 	if (!hl_device_operational(hdev, NULL))
 		goto reschedule;
 
+	/*
+	 * For EQ health check need to check if driver received the heartbeat eq event
+	 * in order to validate the eq is working.
+	 */
+	hl_device_eq_heartbeat(hdev);
+
 	if (!hdev->asic_funcs->send_heartbeat(hdev))
 		goto reschedule;
 
@@ -1055,7 +1080,15 @@ static int device_late_init(struct hl_device *hdev)
 	hdev->high_pll = hdev->asic_prop.high_pll;
 
 	if (hdev->heartbeat) {
+		/*
+		 * Before scheduling the heartbeat driver will check if eq event has received.
+		 * for the first schedule we need to set the indication as true then for the next
+		 * one this indication will be true only if eq event was sent by FW.
+		 */
+		hdev->eq_heartbeat_received = true;
+
 		INIT_DELAYED_WORK(&hdev->work_heartbeat, hl_device_heartbeat);
+
 		schedule_delayed_work(&hdev->work_heartbeat,
 				usecs_to_jiffies(HL_HEARTBEAT_PER_USEC));
 	}
@@ -2235,8 +2268,6 @@ int hl_device_init(struct hl_device *hdev)
 		"Successfully added device %s to habanalabs driver\n",
 		dev_name(&(hdev)->pdev->dev));
 
-	hdev->init_done = true;
-
 	/* After initialization is done, we are ready to receive events from
 	 * the F/W. We can't do it before because we will ignore events and if
 	 * those events are fatal, we won't know about it and the device will
@@ -2244,6 +2275,8 @@ int hl_device_init(struct hl_device *hdev)
 	 */
 	hdev->asic_funcs->enable_events_from_fw(hdev);
 
+	hdev->init_done = true;
+
 	return 0;
 
 cb_pool_fini:
diff --git a/drivers/accel/habanalabs/common/habanalabs.h b/drivers/accel/habanalabs/common/habanalabs.h
index f8c597903cac..e5b416852996 100644
--- a/drivers/accel/habanalabs/common/habanalabs.h
+++ b/drivers/accel/habanalabs/common/habanalabs.h
@@ -3314,6 +3314,7 @@ struct hl_reset_info {
  *                             device.
  * @supports_ctx_switch: true if a ctx switch is required upon first submission.
  * @support_preboot_binning: true if we support read binning info from preboot.
+ * @eq_heartbeat_received: indication that eq heartbeat event has received from FW.
  * @nic_ports_mask: Controls which NIC ports are enabled. Used only for testing.
  * @fw_components: Controls which f/w components to load to the device. There are multiple f/w
  *                 stages and sometimes we want to stop at a certain stage. Used only for testing.
@@ -3474,6 +3475,7 @@ struct hl_device {
 	u8				reset_upon_device_release;
 	u8				supports_ctx_switch;
 	u8				support_preboot_binning;
+	u8				eq_heartbeat_received;
 
 	/* Parameters for bring-up to be upstreamed */
 	u64				nic_ports_mask;
diff --git a/drivers/accel/habanalabs/gaudi2/gaudi2.c b/drivers/accel/habanalabs/gaudi2/gaudi2.c
index 677900e18519..e507847bf460 100644
--- a/drivers/accel/habanalabs/gaudi2/gaudi2.c
+++ b/drivers/accel/habanalabs/gaudi2/gaudi2.c
@@ -7804,6 +7804,7 @@ static inline bool is_info_event(u32 event)
 	 * an indication to an error.
 	 */
 	case GAUDI2_EVENT_CPU0_STATUS_NIC0_ENG0 ... GAUDI2_EVENT_CPU11_STATUS_NIC11_ENG1:
+	case GAUDI2_EVENT_ARC_EQ_HEARTBEAT:
 		return true;
 	default:
 		return false;
@@ -9765,6 +9766,11 @@ static u16 event_id_to_engine_id(struct hl_device *hdev, u16 event_type)
 	return U16_MAX;
 }
 
+static void hl_eq_heartbeat_event_handle(struct hl_device *hdev)
+{
+	hdev->eq_heartbeat_received = true;
+}
+
 static void gaudi2_handle_eqe(struct hl_device *hdev, struct hl_eq_entry *eq_entry)
 {
 	struct gaudi2_device *gaudi2 = hdev->asic_specific;
@@ -10190,6 +10196,10 @@ static void gaudi2_handle_eqe(struct hl_device *hdev, struct hl_eq_entry *eq_ent
 					gaudi2_irq_map_table[event_type].name);
 		break;
 
+	case GAUDI2_EVENT_ARC_EQ_HEARTBEAT:
+		hl_eq_heartbeat_event_handle(hdev);
+		error_count = GAUDI2_NA_EVENT_CAUSE;
+		break;
 	default:
 		if (gaudi2_irq_map_table[event_type].valid) {
 			dev_err_ratelimited(hdev->dev, "Cannot find handler for event %d\n",
diff --git a/drivers/accel/habanalabs/include/gaudi2/gaudi2_async_ids_map_extended.h b/drivers/accel/habanalabs/include/gaudi2/gaudi2_async_ids_map_extended.h
index 6cb0f615fc3e..57e661771b6c 100644
--- a/drivers/accel/habanalabs/include/gaudi2/gaudi2_async_ids_map_extended.h
+++ b/drivers/accel/habanalabs/include/gaudi2/gaudi2_async_ids_map_extended.h
@@ -2674,17 +2674,19 @@ static struct gaudi2_async_events_ids_map gaudi2_irq_map_table[] = {
 	{ .fc_id = 1321, .cpu_id = 627, .valid = 1, .msg = 1, .reset = EVENT_RESET_TYPE_HARD,
 		 .name = "DEV_RESET_REQ" },
 	{ .fc_id = 1322, .cpu_id = 628, .valid = 1, .msg = 1, .reset = EVENT_RESET_TYPE_NONE,
-		 .name = "PWR_BRK_ENTRY" },
+		 .name = "ARC_PWR_BRK_ENTRY" },
 	{ .fc_id = 1323, .cpu_id = 629, .valid = 1, .msg = 1, .reset = EVENT_RESET_TYPE_NONE,
-		 .name = "PWR_BRK_EXT" },
+		 .name = "ARC_PWR_BRK_EXT" },
 	{ .fc_id = 1324, .cpu_id = 630, .valid = 1, .msg = 1, .reset = EVENT_RESET_TYPE_NONE,
-		 .name = "PWR_RD_MODE0" },
+		 .name = "ARC_PWR_RD_MODE0" },
 	{ .fc_id = 1325, .cpu_id = 631, .valid = 1, .msg = 1, .reset = EVENT_RESET_TYPE_NONE,
-		 .name = "PWR_RD_MODE1" },
+		 .name = "ARC_PWR_RD_MODE1" },
 	{ .fc_id = 1326, .cpu_id = 632, .valid = 1, .msg = 1, .reset = EVENT_RESET_TYPE_NONE,
-		 .name = "PWR_RD_MODE2" },
+		 .name = "ARC_PWR_RD_MODE2" },
 	{ .fc_id = 1327, .cpu_id = 633, .valid = 1, .msg = 1, .reset = EVENT_RESET_TYPE_NONE,
-		 .name = "PWR_RD_MODE3" },
+		 .name = "ARC_PWR_RD_MODE3" },
+	{ .fc_id = 1328, .cpu_id = 634, .valid = 1, .msg = 1, .reset = EVENT_RESET_TYPE_NONE,
+		 .name = "ARC_EQ_HEARTBEAT" },
 };
 
 #endif /* __GAUDI2_ASYNC_IDS_MAP_EVENTS_EXT_H_ */
diff --git a/include/linux/habanalabs/cpucp_if.h b/include/linux/habanalabs/cpucp_if.h
index 4cdedb603ecb..a18fa81aad1f 100644
--- a/include/linux/habanalabs/cpucp_if.h
+++ b/include/linux/habanalabs/cpucp_if.h
@@ -33,6 +33,17 @@
 #define PLL_MAP_MAX_BITS	128
 #define PLL_MAP_LEN		(PLL_MAP_MAX_BITS / 8)
 
+enum eq_event_id {
+	EQ_EVENT_NIC_STS_REQUEST = 0,
+	EQ_EVENT_PWR_MODE_0,
+	EQ_EVENT_PWR_MODE_1,
+	EQ_EVENT_PWR_MODE_2,
+	EQ_EVENT_PWR_MODE_3,
+	EQ_EVENT_PWR_BRK_ENTRY,
+	EQ_EVENT_PWR_BRK_EXIT,
+	EQ_EVENT_HEARTBEAT,
+};
+
 /*
  * info of the pkt queue pointers in the first async occurrence
  */
@@ -1143,6 +1154,7 @@ struct cpucp_security_info {
  *                     (0 = functional 1 = binned)
  * @interposer_version: Interposer version programmed in eFuse
  * @substrate_version: Substrate version programmed in eFuse
+ * @eq_health_check_supported: eq health check feature supported in FW.
  * @fw_hbm_region_size: Size in bytes of FW reserved region in HBM.
  * @fw_os_version: Firmware OS Version
  */
@@ -1169,7 +1181,7 @@ struct cpucp_info {
 	__u8 xbar_binning_mask;
 	__u8 interposer_version;
 	__u8 substrate_version;
-	__u8 reserved2;
+	__u8 eq_health_check_supported;
 	struct cpucp_security_info sec_info;
 	__le32 fw_hbm_region_size;
 	__u8 pll_map[PLL_MAP_LEN];
-- 
cgit v1.2.3


From 764bfd138f359423b299b7bf3fcbabb56b981ef5 Mon Sep 17 00:00:00 2001
From: farah kassabri <fkassabri@habana.ai>
Date: Wed, 23 Aug 2023 12:36:25 +0300
Subject: accel/habanalabs/gaudi2: add eq health check using irq

This is the second patch for applying the eq health check mechanism
which will add support for the interrupt flow for gaudi2 asic.

More info about the interrupt mechanism:
set a dedicated msix for the eq error interrupt, and add
interrupt handler for it.
when FW detects some issue with EQ like EQ_FULL, it'll
raise that interrupt and driver should reset the device.
Driver will inform the FW which msix index to use through
the already existing handshake mechanism which will
send msix info message to fw.

Signed-off-by: farah kassabri <fkassabri@habana.ai>
Reviewed-by: Oded Gabbay <ogabbay@kernel.org>
Signed-off-by: Oded Gabbay <ogabbay@kernel.org>
---
 drivers/accel/habanalabs/common/habanalabs.h |  1 +
 drivers/accel/habanalabs/common/irq.c        | 12 ++++++++++++
 drivers/accel/habanalabs/gaudi2/gaudi2.c     | 16 ++++++++++++++++
 drivers/accel/habanalabs/gaudi2/gaudi2P.h    |  1 +
 include/linux/habanalabs/cpucp_if.h          |  1 +
 5 files changed, 31 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/accel/habanalabs/common/habanalabs.h b/drivers/accel/habanalabs/common/habanalabs.h
index e5b416852996..6f2cbd3c2e95 100644
--- a/drivers/accel/habanalabs/common/habanalabs.h
+++ b/drivers/accel/habanalabs/common/habanalabs.h
@@ -3689,6 +3689,7 @@ irqreturn_t hl_irq_handler_eq(int irq, void *arg);
 irqreturn_t hl_irq_handler_dec_abnrm(int irq, void *arg);
 irqreturn_t hl_irq_handler_user_interrupt(int irq, void *arg);
 irqreturn_t hl_irq_user_interrupt_thread_handler(int irq, void *arg);
+irqreturn_t hl_irq_eq_error_interrupt_thread_handler(int irq, void *arg);
 u32 hl_cq_inc_ptr(u32 ptr);
 
 int hl_asid_init(struct hl_device *hdev);
diff --git a/drivers/accel/habanalabs/common/irq.c b/drivers/accel/habanalabs/common/irq.c
index 10ac100bf9e2..f6b6c54bc868 100644
--- a/drivers/accel/habanalabs/common/irq.c
+++ b/drivers/accel/habanalabs/common/irq.c
@@ -401,6 +401,18 @@ irqreturn_t hl_irq_user_interrupt_thread_handler(int irq, void *arg)
 	return IRQ_HANDLED;
 }
 
+irqreturn_t hl_irq_eq_error_interrupt_thread_handler(int irq, void *arg)
+{
+	u64 event_mask = HL_NOTIFIER_EVENT_DEVICE_RESET | HL_NOTIFIER_EVENT_DEVICE_UNAVAILABLE;
+	struct hl_device *hdev = arg;
+
+	dev_err(hdev->dev, "EQ error interrupt received\n");
+
+	hl_device_cond_reset(hdev, HL_DRV_RESET_HARD, event_mask);
+
+	return IRQ_HANDLED;
+}
+
 /**
  * hl_irq_handler_eq - irq handler for event queue
  *
diff --git a/drivers/accel/habanalabs/gaudi2/gaudi2.c b/drivers/accel/habanalabs/gaudi2/gaudi2.c
index e507847bf460..b0ba62b691ec 100644
--- a/drivers/accel/habanalabs/gaudi2/gaudi2.c
+++ b/drivers/accel/habanalabs/gaudi2/gaudi2.c
@@ -4175,6 +4175,8 @@ static const char *gaudi2_irq_name(u16 irq_number)
 		return "gaudi2 unexpected error";
 	case GAUDI2_IRQ_NUM_USER_FIRST ... GAUDI2_IRQ_NUM_USER_LAST:
 		return "gaudi2 user completion";
+	case GAUDI2_IRQ_NUM_EQ_ERROR:
+		return "gaudi2 eq error";
 	default:
 		return "invalid";
 	}
@@ -4317,6 +4319,15 @@ static int gaudi2_enable_msix(struct hl_device *hdev)
 		}
 	}
 
+	irq = pci_irq_vector(hdev->pdev, GAUDI2_IRQ_NUM_EQ_ERROR);
+	rc = request_threaded_irq(irq, NULL, hl_irq_eq_error_interrupt_thread_handler,
+					IRQF_ONESHOT, gaudi2_irq_name(GAUDI2_IRQ_NUM_EQ_ERROR),
+					hdev);
+	if (rc) {
+		dev_err(hdev->dev, "Failed to request IRQ %d", irq);
+		goto free_user_irq;
+	}
+
 	gaudi2->hw_cap_initialized |= HW_CAP_MSIX;
 
 	return 0;
@@ -4376,6 +4387,7 @@ static void gaudi2_sync_irqs(struct hl_device *hdev)
 	}
 
 	synchronize_irq(pci_irq_vector(hdev->pdev, GAUDI2_IRQ_NUM_EVENT_QUEUE));
+	synchronize_irq(pci_irq_vector(hdev->pdev, GAUDI2_IRQ_NUM_EQ_ERROR));
 }
 
 static void gaudi2_disable_msix(struct hl_device *hdev)
@@ -4412,6 +4424,9 @@ static void gaudi2_disable_msix(struct hl_device *hdev)
 	cq = &hdev->completion_queue[GAUDI2_RESERVED_CQ_CS_COMPLETION];
 	free_irq(irq, cq);
 
+	irq = pci_irq_vector(hdev->pdev, GAUDI2_IRQ_NUM_EQ_ERROR);
+	free_irq(irq, hdev);
+
 	pci_free_irq_vectors(hdev->pdev);
 
 	gaudi2->hw_cap_initialized &= ~HW_CAP_MSIX;
@@ -11345,6 +11360,7 @@ static int gaudi2_ack_mmu_page_fault_or_access_error(struct hl_device *hdev, u64
 static void gaudi2_get_msi_info(__le32 *table)
 {
 	table[CPUCP_EVENT_QUEUE_MSI_TYPE] = cpu_to_le32(GAUDI2_EVENT_QUEUE_MSIX_IDX);
+	table[CPUCP_EVENT_QUEUE_ERR_MSI_TYPE] = cpu_to_le32(GAUDI2_IRQ_NUM_EQ_ERROR);
 }
 
 static int gaudi2_map_pll_idx_to_fw_idx(u32 pll_idx)
diff --git a/drivers/accel/habanalabs/gaudi2/gaudi2P.h b/drivers/accel/habanalabs/gaudi2/gaudi2P.h
index 4535aa5ab561..14e281fd9895 100644
--- a/drivers/accel/habanalabs/gaudi2/gaudi2P.h
+++ b/drivers/accel/habanalabs/gaudi2/gaudi2P.h
@@ -419,6 +419,7 @@ enum gaudi2_irq_num {
 	GAUDI2_IRQ_NUM_NIC_PORT_FIRST,
 	GAUDI2_IRQ_NUM_NIC_PORT_LAST = (GAUDI2_IRQ_NUM_NIC_PORT_FIRST + NIC_NUMBER_OF_PORTS - 1),
 	GAUDI2_IRQ_NUM_TPC_ASSERT,
+	GAUDI2_IRQ_NUM_EQ_ERROR,
 	GAUDI2_IRQ_NUM_RESERVED_FIRST,
 	GAUDI2_IRQ_NUM_RESERVED_LAST = (GAUDI2_MSIX_ENTRIES - GAUDI2_TOTAL_USER_INTERRUPTS - 1),
 	GAUDI2_IRQ_NUM_UNEXPECTED_ERROR = RESERVED_MSIX_UNEXPECTED_USER_ERROR_INTERRUPT,
diff --git a/include/linux/habanalabs/cpucp_if.h b/include/linux/habanalabs/cpucp_if.h
index a18fa81aad1f..84d74c4ee4d3 100644
--- a/include/linux/habanalabs/cpucp_if.h
+++ b/include/linux/habanalabs/cpucp_if.h
@@ -1004,6 +1004,7 @@ enum cpucp_msi_type {
 	CPUCP_NIC_PORT5_MSI_TYPE,
 	CPUCP_NIC_PORT7_MSI_TYPE,
 	CPUCP_NIC_PORT9_MSI_TYPE,
+	CPUCP_EVENT_QUEUE_ERR_MSI_TYPE,
 	CPUCP_NUM_OF_MSI_TYPES
 };
 
-- 
cgit v1.2.3


From 9dca13141332e69fd657873194e77a1960fc9ab2 Mon Sep 17 00:00:00 2001
From: Dafna Hirschfeld <dhirschfeld@habana.ai>
Date: Thu, 7 Sep 2023 14:43:01 +0300
Subject: accel/habanalabs: add fw status SHUTDOWN_PREP

update hl_boot_if.h from specs to include
CPU_BOOT_STATUS_FW_SHUTDOWN_PREP

Signed-off-by: Dafna Hirschfeld <dhirschfeld@habana.ai>
Reviewed-by: Oded Gabbay <ogabbay@kernel.org>
Signed-off-by: Oded Gabbay <ogabbay@kernel.org>
---
 include/linux/habanalabs/hl_boot_if.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/habanalabs/hl_boot_if.h b/include/linux/habanalabs/hl_boot_if.h
index 7de8a5786a36..93366d5621fd 100644
--- a/include/linux/habanalabs/hl_boot_if.h
+++ b/include/linux/habanalabs/hl_boot_if.h
@@ -394,6 +394,8 @@ enum cpu_boot_status {
 	CPU_BOOT_STATUS_WAITING_FOR_BOOT_FIT = 16,
 	/* Internal Security has been initialized, device can be accessed */
 	CPU_BOOT_STATUS_SECURITY_READY = 17,
+	/* FW component is preparing to shutdown and communication with host is not available */
+	CPU_BOOT_STATUS_FW_SHUTDOWN_PREP = 18,
 };
 
 enum kmd_msg {
-- 
cgit v1.2.3


From 7f1cd6fdd5872160da05098a84a94dee5e709e54 Mon Sep 17 00:00:00 2001
From: Oded Gabbay <ogabbay@kernel.org>
Date: Thu, 21 Sep 2023 14:52:01 +0300
Subject: accel/habanalabs: minor cosmetics update to cpucp_if.h

- Update copyright years
- Align comments

Signed-off-by: Oded Gabbay <ogabbay@kernel.org>
Reviewed-by: Ofir Bitton <obitton@habana.ai>
---
 include/linux/habanalabs/cpucp_if.h | 17 +++++++----------
 1 file changed, 7 insertions(+), 10 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/habanalabs/cpucp_if.h b/include/linux/habanalabs/cpucp_if.h
index 84d74c4ee4d3..86ea7c63a0d2 100644
--- a/include/linux/habanalabs/cpucp_if.h
+++ b/include/linux/habanalabs/cpucp_if.h
@@ -1,6 +1,6 @@
 /* SPDX-License-Identifier: GPL-2.0
  *
- * Copyright 2020-2022 HabanaLabs, Ltd.
+ * Copyright 2020-2023 HabanaLabs, Ltd.
  * All Rights Reserved.
  *
  */
@@ -668,18 +668,15 @@ enum pq_init_status {
  *       Obsolete.
  *
  * CPUCP_PACKET_GENERIC_PASSTHROUGH -
- *      Generic opcode for all firmware info that is only passed to host
- *      through the LKD, without getting parsed there.
+ *       Generic opcode for all firmware info that is only passed to host
+ *       through the LKD, without getting parsed there.
  *
  * CPUCP_PACKET_ACTIVE_STATUS_SET -
  *       LKD sends FW indication whether device is free or in use, this indication is reported
  *       also to the BMC.
  *
- * CPUCP_PACKET_REGISTER_INTERRUPTS -
- *       Packet to register interrupts indicating LKD is ready to receive events from FW.
- *
  * CPUCP_PACKET_SOFT_RESET -
- *      Packet to perform soft-reset.
+ *       Packet to perform soft-reset.
  *
  * CPUCP_PACKET_INTS_REGISTER -
  *       Packet to inform FW that queues have been established and LKD is ready to receive
@@ -750,9 +747,9 @@ enum cpucp_packet_id {
 	CPUCP_PACKET_RESERVED11,		/* not used */
 	CPUCP_PACKET_RESERVED12,		/* internal */
 	CPUCP_PACKET_RESERVED13,                /* internal */
-	CPUCP_PACKET_SOFT_RESET,                /* internal */
-	CPUCP_PACKET_INTS_REGISTER,             /* internal */
-	CPUCP_PACKET_ID_MAX                     /* must be last */
+	CPUCP_PACKET_SOFT_RESET,		/* internal */
+	CPUCP_PACKET_INTS_REGISTER,		/* internal */
+	CPUCP_PACKET_ID_MAX			/* must be last */
 };
 
 #define CPUCP_PACKET_FENCE_VAL	0xFE8CE7A5
-- 
cgit v1.2.3


From 74975b4f2836e3cc10eeb6c38a6da54311e1de5b Mon Sep 17 00:00:00 2001
From: Bartosz Golaszewski <bartosz.golaszewski@linaro.org>
Date: Tue, 26 Sep 2023 16:59:43 +0200
Subject: gpio: acpi: remove acpi_get_and_request_gpiod()

With no more users, we can remove acpi_get_and_request_gpiod().

Signed-off-by: Bartosz Golaszewski <bartosz.golaszewski@linaro.org>
Reviewed-by: From: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Reviewed-by: Mika Westerberg <mika.westerberg@linux.intel.com>
---
 drivers/gpio/gpiolib-acpi.c   | 28 ----------------------------
 include/linux/gpio/consumer.h |  8 --------
 2 files changed, 36 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/gpio/gpiolib-acpi.c b/drivers/gpio/gpiolib-acpi.c
index 2ad21f34ee62..25a5f07c8755 100644
--- a/drivers/gpio/gpiolib-acpi.c
+++ b/drivers/gpio/gpiolib-acpi.c
@@ -162,34 +162,6 @@ static struct gpio_desc *acpi_get_gpiod(char *path, unsigned int pin)
 	return gpio_device_get_desc(gdev, pin);
 }
 
-/**
- * acpi_get_and_request_gpiod - Translate ACPI GPIO pin to GPIO descriptor and
- *                              hold a refcount to the GPIO device.
- * @path:      ACPI GPIO controller full path name, (e.g. "\\_SB.GPO1")
- * @pin:       ACPI GPIO pin number (0-based, controller-relative)
- * @label:     Label to pass to gpiod_request()
- *
- * This function is a simple pass-through to acpi_get_gpiod(), except that
- * as it is intended for use outside of the GPIO layer (in a similar fashion to
- * gpiod_get_index() for example) it also holds a reference to the GPIO device.
- */
-struct gpio_desc *acpi_get_and_request_gpiod(char *path, unsigned int pin, char *label)
-{
-	struct gpio_desc *gpio;
-	int ret;
-
-	gpio = acpi_get_gpiod(path, pin);
-	if (IS_ERR(gpio))
-		return gpio;
-
-	ret = gpiod_request(gpio, label);
-	if (ret)
-		return ERR_PTR(ret);
-
-	return gpio;
-}
-EXPORT_SYMBOL_GPL(acpi_get_and_request_gpiod);
-
 static irqreturn_t acpi_gpio_irq_handler(int irq, void *data)
 {
 	struct acpi_gpio_event *event = data;
diff --git a/include/linux/gpio/consumer.h b/include/linux/gpio/consumer.h
index 6cc345440a5b..db2dfbae8edb 100644
--- a/include/linux/gpio/consumer.h
+++ b/include/linux/gpio/consumer.h
@@ -606,8 +606,6 @@ void acpi_dev_remove_driver_gpios(struct acpi_device *adev);
 int devm_acpi_dev_add_driver_gpios(struct device *dev,
 				   const struct acpi_gpio_mapping *gpios);
 
-struct gpio_desc *acpi_get_and_request_gpiod(char *path, unsigned int pin, char *label);
-
 #else  /* CONFIG_GPIOLIB && CONFIG_ACPI */
 
 #include <linux/err.h>
@@ -625,12 +623,6 @@ static inline int devm_acpi_dev_add_driver_gpios(struct device *dev,
 	return -ENXIO;
 }
 
-static inline struct gpio_desc *acpi_get_and_request_gpiod(char *path, unsigned int pin,
-							   char *label)
-{
-	return ERR_PTR(-ENOSYS);
-}
-
 #endif /* CONFIG_GPIOLIB && CONFIG_ACPI */
 
 
-- 
cgit v1.2.3


From a0fddaa0b5a587cc8d185f8802fe7e48493c43ed Mon Sep 17 00:00:00 2001
From: Guenter Roeck <linux@roeck-us.net>
Date: Fri, 15 Sep 2023 08:22:37 -0700
Subject: rtc: Add API function to return alarm time bound by hardware limit

Add rtc_bound_alarmtime() to return the requested alarm timeout bound by
the maxmum alarm timeout that is supported by a given RTC.

Signed-off-by: Guenter Roeck <linux@roeck-us.net>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Link: https://lore.kernel.org/r/20230915152238.1144706-2-linux@roeck-us.net
---
 include/linux/rtc.h | 17 +++++++++++++++++
 1 file changed, 17 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/rtc.h b/include/linux/rtc.h
index 4c0bcbeb1f00..5f8e438a0312 100644
--- a/include/linux/rtc.h
+++ b/include/linux/rtc.h
@@ -225,6 +225,23 @@ static inline bool is_leap_year(unsigned int year)
 	return (!(year % 4) && (year % 100)) || !(year % 400);
 }
 
+/**
+ * rtc_bound_alarmtime() - Return alarm time bound by rtc limit
+ * @rtc: Pointer to rtc device structure
+ * @requested: Requested alarm timeout
+ *
+ * Return: Alarm timeout bound by maximum alarm time supported by rtc.
+ */
+static inline ktime_t rtc_bound_alarmtime(struct rtc_device *rtc,
+					  ktime_t requested)
+{
+	if (rtc->alarm_offset_max &&
+	    rtc->alarm_offset_max * MSEC_PER_SEC < ktime_to_ms(requested))
+		return ms_to_ktime(rtc->alarm_offset_max * MSEC_PER_SEC);
+
+	return requested;
+}
+
 #define devm_rtc_register_device(device) \
 	__devm_rtc_register_device(THIS_MODULE, device)
 
-- 
cgit v1.2.3


From e346fb6d774abf1d9a87d39b1e3eef0b7397d154 Mon Sep 17 00:00:00 2001
From: Wedson Almeida Filho <walmeida@microsoft.com>
Date: Sat, 30 Sep 2023 02:00:05 -0300
Subject: xattr: make the xattr array itself const

As it is currently declared, the xattr_handler structs are const but the
array containing their pointers is not. This patch makes it so that fs
modules can place them in .rodata, which makes it harder for
accidental/malicious modifications at runtime.

Signed-off-by: Wedson Almeida Filho <walmeida@microsoft.com>
Link: https://lore.kernel.org/r/20230930050033.41174-2-wedsonaf@gmail.com
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 fs/xattr.c         | 6 +++---
 include/linux/fs.h | 2 +-
 2 files changed, 4 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/fs/xattr.c b/fs/xattr.c
index efd4736bc94b..09d927603433 100644
--- a/fs/xattr.c
+++ b/fs/xattr.c
@@ -56,7 +56,7 @@ strcmp_prefix(const char *a, const char *a_prefix)
 static const struct xattr_handler *
 xattr_resolve_name(struct inode *inode, const char **name)
 {
-	const struct xattr_handler **handlers = inode->i_sb->s_xattr;
+	const struct xattr_handler * const *handlers = inode->i_sb->s_xattr;
 	const struct xattr_handler *handler;
 
 	if (!(inode->i_opflags & IOP_XATTR)) {
@@ -162,7 +162,7 @@ xattr_permission(struct mnt_idmap *idmap, struct inode *inode,
 int
 xattr_supports_user_prefix(struct inode *inode)
 {
-	const struct xattr_handler **handlers = inode->i_sb->s_xattr;
+	const struct xattr_handler * const *handlers = inode->i_sb->s_xattr;
 	const struct xattr_handler *handler;
 
 	if (!(inode->i_opflags & IOP_XATTR)) {
@@ -999,7 +999,7 @@ int xattr_list_one(char **buffer, ssize_t *remaining_size, const char *name)
 ssize_t
 generic_listxattr(struct dentry *dentry, char *buffer, size_t buffer_size)
 {
-	const struct xattr_handler *handler, **handlers = dentry->d_sb->s_xattr;
+	const struct xattr_handler *handler, * const *handlers = dentry->d_sb->s_xattr;
 	ssize_t remaining_size = buffer_size;
 	int err = 0;
 
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 4aeb3fa11927..bba22e25664d 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1206,7 +1206,7 @@ struct super_block {
 #ifdef CONFIG_SECURITY
 	void                    *s_security;
 #endif
-	const struct xattr_handler **s_xattr;
+	const struct xattr_handler * const *s_xattr;
 #ifdef CONFIG_FS_ENCRYPTION
 	const struct fscrypt_operations	*s_cop;
 	struct fscrypt_keyring	*s_master_keys; /* master crypto keys in use */
-- 
cgit v1.2.3


From e01cc1e8c2ad73cebb980878ede5584e0f2688f7 Mon Sep 17 00:00:00 2001
From: Uros Bizjak <ubizjak@gmail.com>
Date: Mon, 25 Sep 2023 16:50:23 +0200
Subject: locking/atomic: Add generic support for sync_try_cmpxchg() and its
 fallback

Provide the generic sync_try_cmpxchg() function from the
raw_ prefixed version, also adding explicit instrumentation.

The patch amends existing scripts to generate sync_try_cmpxchg()
locking primitive and its raw_sync_try_cmpxchg() fallback, while
leaving existing macros from the try_cmpxchg() family unchanged.

The target can define its own arch_sync_try_cmpxchg() to override the
generic version of raw_sync_try_cmpxchg(). This allows the target
to generate more optimal assembly than the generic version.

Additionally, the patch renames two scripts to better reflect
whet they really do.

Signed-off-by: Uros Bizjak <ubizjak@gmail.com>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Cc: Will Deacon <will@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Boqun Feng <boqun.feng@gmail.com>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: linux-kernel@vger.kernel.org
---
 include/linux/atomic/atomic-arch-fallback.h | 15 ++++++++++++-
 include/linux/atomic/atomic-instrumented.h  | 10 ++++++++-
 scripts/atomic/gen-atomic-fallback.sh       | 33 ++++++++++++++++-------------
 scripts/atomic/gen-atomic-instrumented.sh   |  3 ++-
 4 files changed, 43 insertions(+), 18 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/atomic/atomic-arch-fallback.h b/include/linux/atomic/atomic-arch-fallback.h
index b83ef19da13d..5e95faa959c4 100644
--- a/include/linux/atomic/atomic-arch-fallback.h
+++ b/include/linux/atomic/atomic-arch-fallback.h
@@ -428,6 +428,19 @@ extern void raw_cmpxchg128_relaxed_not_implemented(void);
 
 #define raw_sync_cmpxchg arch_sync_cmpxchg
 
+#ifdef arch_sync_try_cmpxchg
+#define raw_sync_try_cmpxchg arch_sync_try_cmpxchg
+#else
+#define raw_sync_try_cmpxchg(_ptr, _oldp, _new) \
+({ \
+	typeof(*(_ptr)) *___op = (_oldp), ___o = *___op, ___r; \
+	___r = raw_sync_cmpxchg((_ptr), ___o, (_new)); \
+	if (unlikely(___r != ___o)) \
+		*___op = ___r; \
+	likely(___r == ___o); \
+})
+#endif
+
 /**
  * raw_atomic_read() - atomic load with relaxed ordering
  * @v: pointer to atomic_t
@@ -4649,4 +4662,4 @@ raw_atomic64_dec_if_positive(atomic64_t *v)
 }
 
 #endif /* _LINUX_ATOMIC_FALLBACK_H */
-// 2fdd6702823fa842f9cea57a002e6e4476ae780c
+// eec048affea735b8464f58e6d96992101f8f85f1
diff --git a/include/linux/atomic/atomic-instrumented.h b/include/linux/atomic/atomic-instrumented.h
index d401b406ef7c..54d7bbe0aeaa 100644
--- a/include/linux/atomic/atomic-instrumented.h
+++ b/include/linux/atomic/atomic-instrumented.h
@@ -4998,6 +4998,14 @@ atomic_long_dec_if_positive(atomic_long_t *v)
 	raw_try_cmpxchg128_local(__ai_ptr, __ai_oldp, __VA_ARGS__); \
 })
 
+#define sync_try_cmpxchg(ptr, ...) \
+({ \
+	typeof(ptr) __ai_ptr = (ptr); \
+	kcsan_mb(); \
+	instrument_atomic_read_write(__ai_ptr, sizeof(*__ai_ptr)); \
+	raw_sync_try_cmpxchg(__ai_ptr, __VA_ARGS__); \
+})
+
 
 #endif /* _LINUX_ATOMIC_INSTRUMENTED_H */
-// 1568f875fef72097413caab8339120c065a39aa4
+// 2cc4bc990fef44d3836ec108f11b610f3f438184
diff --git a/scripts/atomic/gen-atomic-fallback.sh b/scripts/atomic/gen-atomic-fallback.sh
index a45154cefa48..f80d69cfeb1f 100755
--- a/scripts/atomic/gen-atomic-fallback.sh
+++ b/scripts/atomic/gen-atomic-fallback.sh
@@ -223,14 +223,15 @@ gen_xchg_fallbacks()
 
 gen_try_cmpxchg_fallback()
 {
+	local prefix="$1"; shift
 	local cmpxchg="$1"; shift;
-	local order="$1"; shift;
+	local suffix="$1"; shift;
 
 cat <<EOF
-#define raw_try_${cmpxchg}${order}(_ptr, _oldp, _new) \\
+#define raw_${prefix}try_${cmpxchg}${suffix}(_ptr, _oldp, _new) \\
 ({ \\
 	typeof(*(_ptr)) *___op = (_oldp), ___o = *___op, ___r; \\
-	___r = raw_${cmpxchg}${order}((_ptr), ___o, (_new)); \\
+	___r = raw_${prefix}${cmpxchg}${suffix}((_ptr), ___o, (_new)); \\
 	if (unlikely(___r != ___o)) \\
 		*___op = ___r; \\
 	likely(___r == ___o); \\
@@ -259,11 +260,11 @@ gen_try_cmpxchg_order_fallback()
 	fi
 
 	printf "#else\n"
-	gen_try_cmpxchg_fallback "${cmpxchg}" "${order}"
+	gen_try_cmpxchg_fallback "" "${cmpxchg}" "${order}"
 	printf "#endif\n\n"
 }
 
-gen_try_cmpxchg_fallbacks()
+gen_try_cmpxchg_order_fallbacks()
 {
 	local cmpxchg="$1"; shift;
 
@@ -272,15 +273,17 @@ gen_try_cmpxchg_fallbacks()
 	done
 }
 
-gen_cmpxchg_local_fallbacks()
+gen_def_and_try_cmpxchg_fallback()
 {
+	local prefix="$1"; shift
 	local cmpxchg="$1"; shift
+	local suffix="$1"; shift
 
-	printf "#define raw_${cmpxchg} arch_${cmpxchg}\n\n"
-	printf "#ifdef arch_try_${cmpxchg}\n"
-	printf "#define raw_try_${cmpxchg} arch_try_${cmpxchg}\n"
+	printf "#define raw_${prefix}${cmpxchg}${suffix} arch_${prefix}${cmpxchg}${suffix}\n\n"
+	printf "#ifdef arch_${prefix}try_${cmpxchg}${suffix}\n"
+	printf "#define raw_${prefix}try_${cmpxchg}${suffix} arch_${prefix}try_${cmpxchg}${suffix}\n"
 	printf "#else\n"
-	gen_try_cmpxchg_fallback "${cmpxchg}" ""
+	gen_try_cmpxchg_fallback "${prefix}" "${cmpxchg}" "${suffix}"
 	printf "#endif\n\n"
 }
 
@@ -302,15 +305,15 @@ for xchg in "xchg" "cmpxchg" "cmpxchg64" "cmpxchg128"; do
 done
 
 for cmpxchg in "cmpxchg" "cmpxchg64" "cmpxchg128"; do
-	gen_try_cmpxchg_fallbacks "${cmpxchg}"
+	gen_try_cmpxchg_order_fallbacks "${cmpxchg}"
 done
 
-for cmpxchg in "cmpxchg_local" "cmpxchg64_local" "cmpxchg128_local"; do
-	gen_cmpxchg_local_fallbacks "${cmpxchg}" ""
+for cmpxchg in "cmpxchg" "cmpxchg64" "cmpxchg128"; do
+	gen_def_and_try_cmpxchg_fallback "" "${cmpxchg}" "_local"
 done
 
-for cmpxchg in "sync_cmpxchg"; do
-	printf "#define raw_${cmpxchg} arch_${cmpxchg}\n\n"
+for cmpxchg in "cmpxchg"; do
+	gen_def_and_try_cmpxchg_fallback "sync_" "${cmpxchg}" ""
 done
 
 grep '^[a-z]' "$1" | while read name meta args; do
diff --git a/scripts/atomic/gen-atomic-instrumented.sh b/scripts/atomic/gen-atomic-instrumented.sh
index 8f8f8e3b20f9..592f3ec89b5f 100755
--- a/scripts/atomic/gen-atomic-instrumented.sh
+++ b/scripts/atomic/gen-atomic-instrumented.sh
@@ -169,7 +169,8 @@ for xchg in "xchg" "cmpxchg" "cmpxchg64" "cmpxchg128" "try_cmpxchg" "try_cmpxchg
 	done
 done
 
-for xchg in "cmpxchg_local" "cmpxchg64_local" "cmpxchg128_local" "sync_cmpxchg" "try_cmpxchg_local" "try_cmpxchg64_local" "try_cmpxchg128_local"; do
+for xchg in "cmpxchg_local" "cmpxchg64_local" "cmpxchg128_local" "sync_cmpxchg" \
+	    "try_cmpxchg_local" "try_cmpxchg64_local" "try_cmpxchg128_local" "sync_try_cmpxchg"; do
 	gen_xchg "${xchg}" ""
 	printf "\n"
 done
-- 
cgit v1.2.3


From 9ae5c00ea2e600a8b823f9b95606dd244f3096bf Mon Sep 17 00:00:00 2001
From: Mel Gorman <mgorman@techsingularity.net>
Date: Tue, 10 Oct 2023 09:31:38 +0100
Subject: sched/numa: Document vma_numab_state fields

Document the intended usage of the fields.

[ mingo: Reformatted to take less vertical space & tidied it up. ]

Signed-off-by: Mel Gorman <mgorman@techsingularity.net>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Link: https://lore.kernel.org/r/20231010083143.19593-2-mgorman@techsingularity.net
---
 include/linux/mm_types.h | 21 +++++++++++++++++++++
 1 file changed, 21 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 36c5b43999e6..d7f042ec1f33 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -551,8 +551,29 @@ struct vma_lock {
 };
 
 struct vma_numab_state {
+	/*
+	 * Initialised as time in 'jiffies' after which VMA
+	 * should be scanned.  Delays first scan of new VMA by at
+	 * least sysctl_numa_balancing_scan_delay:
+	 */
 	unsigned long next_scan;
+
+	/*
+	 * Time in jiffies when access_pids[] is reset to
+	 * detect phase change behaviour:
+	 */
 	unsigned long next_pid_reset;
+
+	/*
+	 * Approximate tracking of PIDs that trapped a NUMA hinting
+	 * fault. May produce false positives due to hash collisions.
+	 *
+	 *   [0] Previous PID tracking
+	 *   [1] Current PID tracking
+	 *
+	 * Window moves after next_pid_reset has expired approximately
+	 * every VMA_PID_RESET_PERIOD jiffies:
+	 */
 	unsigned long access_pids[2];
 };
 
-- 
cgit v1.2.3


From f3a6c97940fbd25d6c84c2d5642338fc99a9b35b Mon Sep 17 00:00:00 2001
From: Mel Gorman <mgorman@techsingularity.net>
Date: Tue, 10 Oct 2023 09:31:39 +0100
Subject: sched/numa: Rename vma_numab_state::access_pids[] => ::pids_active[],
 ::next_pid_reset => ::pids_active_reset

The access_pids[] field name is somewhat ambiguous as no PIDs are accessed.
Similarly, it's not clear that next_pid_reset is related to access_pids[].
Rename the fields to more accurately reflect their purpose.

[ mingo: Rename in the comments too. ]

Signed-off-by: Mel Gorman <mgorman@techsingularity.net>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Link: https://lore.kernel.org/r/20231010083143.19593-3-mgorman@techsingularity.net
---
 include/linux/mm.h       |  4 ++--
 include/linux/mm_types.h |  6 +++---
 kernel/sched/fair.c      | 12 ++++++------
 3 files changed, 11 insertions(+), 11 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mm.h b/include/linux/mm.h
index bf5d0b1b16f4..19fc73b02c9f 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1726,8 +1726,8 @@ static inline void vma_set_access_pid_bit(struct vm_area_struct *vma)
 	unsigned int pid_bit;
 
 	pid_bit = hash_32(current->pid, ilog2(BITS_PER_LONG));
-	if (vma->numab_state && !test_bit(pid_bit, &vma->numab_state->access_pids[1])) {
-		__set_bit(pid_bit, &vma->numab_state->access_pids[1]);
+	if (vma->numab_state && !test_bit(pid_bit, &vma->numab_state->pids_active[1])) {
+		__set_bit(pid_bit, &vma->numab_state->pids_active[1]);
 	}
 }
 #else /* !CONFIG_NUMA_BALANCING */
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index d7f042ec1f33..e7571eca1131 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -559,10 +559,10 @@ struct vma_numab_state {
 	unsigned long next_scan;
 
 	/*
-	 * Time in jiffies when access_pids[] is reset to
+	 * Time in jiffies when pids_active[] is reset to
 	 * detect phase change behaviour:
 	 */
-	unsigned long next_pid_reset;
+	unsigned long pids_active_reset;
 
 	/*
 	 * Approximate tracking of PIDs that trapped a NUMA hinting
@@ -574,7 +574,7 @@ struct vma_numab_state {
 	 * Window moves after next_pid_reset has expired approximately
 	 * every VMA_PID_RESET_PERIOD jiffies:
 	 */
-	unsigned long access_pids[2];
+	unsigned long pids_active[2];
 };
 
 /*
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index e7c1bafc0460..6b47edcbe834 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -3125,7 +3125,7 @@ static bool vma_is_accessed(struct vm_area_struct *vma)
 	if (READ_ONCE(current->mm->numa_scan_seq) < 2)
 		return true;
 
-	pids = vma->numab_state->access_pids[0] | vma->numab_state->access_pids[1];
+	pids = vma->numab_state->pids_active[0] | vma->numab_state->pids_active[1];
 	return test_bit(hash_32(current->pid, ilog2(BITS_PER_LONG)), &pids);
 }
 
@@ -3241,7 +3241,7 @@ static void task_numa_work(struct callback_head *work)
 				msecs_to_jiffies(sysctl_numa_balancing_scan_delay);
 
 			/* Reset happens after 4 times scan delay of scan start */
-			vma->numab_state->next_pid_reset =  vma->numab_state->next_scan +
+			vma->numab_state->pids_active_reset =  vma->numab_state->next_scan +
 				msecs_to_jiffies(VMA_PID_RESET_PERIOD);
 		}
 
@@ -3262,11 +3262,11 @@ static void task_numa_work(struct callback_head *work)
 		 * vma for recent access to avoid clearing PID info before access..
 		 */
 		if (mm->numa_scan_seq &&
-				time_after(jiffies, vma->numab_state->next_pid_reset)) {
-			vma->numab_state->next_pid_reset = vma->numab_state->next_pid_reset +
+				time_after(jiffies, vma->numab_state->pids_active_reset)) {
+			vma->numab_state->pids_active_reset = vma->numab_state->pids_active_reset +
 				msecs_to_jiffies(VMA_PID_RESET_PERIOD);
-			vma->numab_state->access_pids[0] = READ_ONCE(vma->numab_state->access_pids[1]);
-			vma->numab_state->access_pids[1] = 0;
+			vma->numab_state->pids_active[0] = READ_ONCE(vma->numab_state->pids_active[1]);
+			vma->numab_state->pids_active[1] = 0;
 		}
 
 		do {
-- 
cgit v1.2.3


From ed2da8b725b932b1e2b2f4835bb664d47ed03031 Mon Sep 17 00:00:00 2001
From: Mel Gorman <mgorman@techsingularity.net>
Date: Tue, 10 Oct 2023 09:31:40 +0100
Subject: sched/numa: Trace decisions related to skipping VMAs

NUMA balancing skips or scans VMAs for a variety of reasons. In preparation
for completing scans of VMAs regardless of PID access, trace the reasons
why a VMA was skipped. In a later patch, the tracing will be used to track
if a VMA was forcibly scanned.

Signed-off-by: Mel Gorman <mgorman@techsingularity.net>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Link: https://lore.kernel.org/r/20231010083143.19593-4-mgorman@techsingularity.net
---
 include/linux/sched/numa_balancing.h |  8 ++++++
 include/trace/events/sched.h         | 50 ++++++++++++++++++++++++++++++++++++
 kernel/sched/fair.c                  | 17 +++++++++---
 3 files changed, 71 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sched/numa_balancing.h b/include/linux/sched/numa_balancing.h
index 3988762efe15..c127a1509e2f 100644
--- a/include/linux/sched/numa_balancing.h
+++ b/include/linux/sched/numa_balancing.h
@@ -15,6 +15,14 @@
 #define TNF_FAULT_LOCAL	0x08
 #define TNF_MIGRATE_FAIL 0x10
 
+enum numa_vmaskip_reason {
+	NUMAB_SKIP_UNSUITABLE,
+	NUMAB_SKIP_SHARED_RO,
+	NUMAB_SKIP_INACCESSIBLE,
+	NUMAB_SKIP_SCAN_DELAY,
+	NUMAB_SKIP_PID_INACTIVE,
+};
+
 #ifdef CONFIG_NUMA_BALANCING
 extern void task_numa_fault(int last_node, int node, int pages, int flags);
 extern pid_t task_numa_group_id(struct task_struct *p);
diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h
index a13d5d06be9d..d82a04d6a1bc 100644
--- a/include/trace/events/sched.h
+++ b/include/trace/events/sched.h
@@ -664,6 +664,56 @@ DEFINE_EVENT(sched_numa_pair_template, sched_swap_numa,
 	TP_ARGS(src_tsk, src_cpu, dst_tsk, dst_cpu)
 );
 
+#ifdef CONFIG_NUMA_BALANCING
+#define NUMAB_SKIP_REASON					\
+	EM( NUMAB_SKIP_UNSUITABLE,		"unsuitable" )	\
+	EM( NUMAB_SKIP_SHARED_RO,		"shared_ro" )	\
+	EM( NUMAB_SKIP_INACCESSIBLE,		"inaccessible" )	\
+	EM( NUMAB_SKIP_SCAN_DELAY,		"scan_delay" )	\
+	EMe(NUMAB_SKIP_PID_INACTIVE,		"pid_inactive" )
+
+/* Redefine for export. */
+#undef EM
+#undef EMe
+#define EM(a, b)	TRACE_DEFINE_ENUM(a);
+#define EMe(a, b)	TRACE_DEFINE_ENUM(a);
+
+NUMAB_SKIP_REASON
+
+/* Redefine for symbolic printing. */
+#undef EM
+#undef EMe
+#define EM(a, b)	{ a, b },
+#define EMe(a, b)	{ a, b }
+
+TRACE_EVENT(sched_skip_vma_numa,
+
+	TP_PROTO(struct mm_struct *mm, struct vm_area_struct *vma,
+		 enum numa_vmaskip_reason reason),
+
+	TP_ARGS(mm, vma, reason),
+
+	TP_STRUCT__entry(
+		__field(unsigned long, numa_scan_offset)
+		__field(unsigned long, vm_start)
+		__field(unsigned long, vm_end)
+		__field(enum numa_vmaskip_reason, reason)
+	),
+
+	TP_fast_assign(
+		__entry->numa_scan_offset	= mm->numa_scan_offset;
+		__entry->vm_start		= vma->vm_start;
+		__entry->vm_end			= vma->vm_end;
+		__entry->reason			= reason;
+	),
+
+	TP_printk("numa_scan_offset=%lX vm_start=%lX vm_end=%lX reason=%s",
+		  __entry->numa_scan_offset,
+		  __entry->vm_start,
+		  __entry->vm_end,
+		  __print_symbolic(__entry->reason, NUMAB_SKIP_REASON))
+);
+#endif /* CONFIG_NUMA_BALANCING */
 
 /*
  * Tracepoint for waking a polling cpu without an IPI.
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 6b47edcbe834..31cfdb0794fb 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -3210,6 +3210,7 @@ static void task_numa_work(struct callback_head *work)
 	do {
 		if (!vma_migratable(vma) || !vma_policy_mof(vma) ||
 			is_vm_hugetlb_page(vma) || (vma->vm_flags & VM_MIXEDMAP)) {
+			trace_sched_skip_vma_numa(mm, vma, NUMAB_SKIP_UNSUITABLE);
 			continue;
 		}
 
@@ -3220,15 +3221,19 @@ static void task_numa_work(struct callback_head *work)
 		 * as migrating the pages will be of marginal benefit.
 		 */
 		if (!vma->vm_mm ||
-		    (vma->vm_file && (vma->vm_flags & (VM_READ|VM_WRITE)) == (VM_READ)))
+		    (vma->vm_file && (vma->vm_flags & (VM_READ|VM_WRITE)) == (VM_READ))) {
+			trace_sched_skip_vma_numa(mm, vma, NUMAB_SKIP_SHARED_RO);
 			continue;
+		}
 
 		/*
 		 * Skip inaccessible VMAs to avoid any confusion between
 		 * PROT_NONE and NUMA hinting ptes
 		 */
-		if (!vma_is_accessible(vma))
+		if (!vma_is_accessible(vma)) {
+			trace_sched_skip_vma_numa(mm, vma, NUMAB_SKIP_INACCESSIBLE);
 			continue;
+		}
 
 		/* Initialise new per-VMA NUMAB state. */
 		if (!vma->numab_state) {
@@ -3250,12 +3255,16 @@ static void task_numa_work(struct callback_head *work)
 		 * delay the scan for new VMAs.
 		 */
 		if (mm->numa_scan_seq && time_before(jiffies,
-						vma->numab_state->next_scan))
+						vma->numab_state->next_scan)) {
+			trace_sched_skip_vma_numa(mm, vma, NUMAB_SKIP_SCAN_DELAY);
 			continue;
+		}
 
 		/* Do not scan the VMA if task has not accessed */
-		if (!vma_is_accessed(vma))
+		if (!vma_is_accessed(vma)) {
+			trace_sched_skip_vma_numa(mm, vma, NUMAB_SKIP_PID_INACTIVE);
 			continue;
+		}
 
 		/*
 		 * RESET access PIDs regularly for old VMAs. Resetting after checking
-- 
cgit v1.2.3


From 57ec42b9a1b7e4db4a1c2aa4fcc4eefe6d31bcb8 Mon Sep 17 00:00:00 2001
From: Matt Johnston <matt@codeconstruct.com.au>
Date: Tue, 3 Oct 2023 15:53:39 +0800
Subject: i3c: Fix typo "Provisional ID" to "Provisioned ID"

The MIPI I3C spec refers to a Provisioned ID, since it is (sometimes)
provisioned at device manufacturing.

Signed-off-by: Matt Johnston <matt@codeconstruct.com.au>
Acked-by: Rob Herring <robh@kernel.org>
Reviewed-by: Miquel Raynal <miquel.raynal@bootlin.com>
Link: https://lore.kernel.org/r/20231003075339.197099-1-matt@codeconstruct.com.au
Signed-off-by: Alexandre Belloni <alexandre.belloni@bootlin.com>
---
 Documentation/ABI/testing/sysfs-bus-i3c        | 4 ++--
 Documentation/devicetree/bindings/i3c/i3c.yaml | 4 ++--
 Documentation/driver-api/i3c/protocol.rst      | 4 ++--
 drivers/i3c/master/svc-i3c-master.c            | 2 +-
 include/linux/i3c/device.h                     | 2 +-
 include/linux/i3c/master.h                     | 2 +-
 6 files changed, 9 insertions(+), 9 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/ABI/testing/sysfs-bus-i3c b/Documentation/ABI/testing/sysfs-bus-i3c
index 1f4a2662335b..e5248fd67a56 100644
--- a/Documentation/ABI/testing/sysfs-bus-i3c
+++ b/Documentation/ABI/testing/sysfs-bus-i3c
@@ -67,7 +67,7 @@ What:		/sys/bus/i3c/devices/i3c-<bus-id>/pid
 KernelVersion:  5.0
 Contact:	linux-i3c@vger.kernel.org
 Description:
-		PID stands for Provisional ID and is used to uniquely identify
+		PID stands for Provisioned ID and is used to uniquely identify
 		a device on a bus. This PID contains information about the
 		vendor, the part and an instance ID so that several devices of
 		the same type can be connected on the same bus.
@@ -123,7 +123,7 @@ What:		/sys/bus/i3c/devices/i3c-<bus-id>/<bus-id>-<device-pid>/pid
 KernelVersion:  5.0
 Contact:	linux-i3c@vger.kernel.org
 Description:
-		PID stands for Provisional ID and is used to uniquely identify
+		PID stands for Provisioned ID and is used to uniquely identify
 		a device on a bus. This PID contains information about the
 		vendor, the part and an instance ID so that several devices of
 		the same type can be connected on the same bus.
diff --git a/Documentation/devicetree/bindings/i3c/i3c.yaml b/Documentation/devicetree/bindings/i3c/i3c.yaml
index ab69f4115de4..f8ac7a3e3123 100644
--- a/Documentation/devicetree/bindings/i3c/i3c.yaml
+++ b/Documentation/devicetree/bindings/i3c/i3c.yaml
@@ -119,12 +119,12 @@ patternProperties:
                 minimum: 0
                 maximum: 0x7f
               - description: |
-                  First half of the Provisional ID (following the PID
+                  First half of the Provisioned ID (following the PID
                   definition provided by the I3C specification).
 
                   Contains the manufacturer ID left-shifted by 1.
               - description: |
-                  Second half of the Provisional ID (following the PID
+                  Second half of the Provisioned ID (following the PID
                   definition provided by the I3C specification).
 
                   Contains the ORing of the part ID left-shifted by 16,
diff --git a/Documentation/driver-api/i3c/protocol.rst b/Documentation/driver-api/i3c/protocol.rst
index 02653defa011..23a0b93c62b1 100644
--- a/Documentation/driver-api/i3c/protocol.rst
+++ b/Documentation/driver-api/i3c/protocol.rst
@@ -71,8 +71,8 @@ During DAA, each I3C device reports 3 important things:
   related capabilities
 * DCR: Device Characteristic Register. This 8-bit register describes the
   functionalities provided by the device
-* Provisional ID: A 48-bit unique identifier. On a given bus there should be no
-  Provisional ID collision, otherwise the discovery mechanism may fail.
+* Provisioned ID: A 48-bit unique identifier. On a given bus there should be no
+  Provisioned ID collision, otherwise the discovery mechanism may fail.
 
 I3C slave events
 ================
diff --git a/drivers/i3c/master/svc-i3c-master.c b/drivers/i3c/master/svc-i3c-master.c
index 32eca2d6caf0..e23d7900c5a1 100644
--- a/drivers/i3c/master/svc-i3c-master.c
+++ b/drivers/i3c/master/svc-i3c-master.c
@@ -765,7 +765,7 @@ static int svc_i3c_master_do_daa_locked(struct svc_i3c_master *master,
 			u8 data[6];
 
 			/*
-			 * We only care about the 48-bit provisional ID yet to
+			 * We only care about the 48-bit provisioned ID yet to
 			 * be sure a device does not nack an address twice.
 			 * Otherwise, we would just need to flush the RX FIFO.
 			 */
diff --git a/include/linux/i3c/device.h b/include/linux/i3c/device.h
index 90fa83464f00..84ed77c04940 100644
--- a/include/linux/i3c/device.h
+++ b/include/linux/i3c/device.h
@@ -96,7 +96,7 @@ enum i3c_dcr {
 
 /**
  * struct i3c_device_info - I3C device information
- * @pid: Provisional ID
+ * @pid: Provisioned ID
  * @bcr: Bus Characteristic Register
  * @dcr: Device Characteristic Register
  * @static_addr: static/I2C address
diff --git a/include/linux/i3c/master.h b/include/linux/i3c/master.h
index 0b52da4f2346..4fd6a777150f 100644
--- a/include/linux/i3c/master.h
+++ b/include/linux/i3c/master.h
@@ -166,7 +166,7 @@ struct i3c_device_ibi_info {
  *		 assigned a dynamic address by the master. Will be used during
  *		 bus initialization to assign it a specific dynamic address
  *		 before starting DAA (Dynamic Address Assignment)
- * @pid: I3C Provisional ID exposed by the device. This is a unique identifier
+ * @pid: I3C Provisioned ID exposed by the device. This is a unique identifier
  *	 that may be used to attach boardinfo to i3c_dev_desc when the device
  *	 does not have a static address
  * @of_node: optional DT node in case the device has been described in the DT
-- 
cgit v1.2.3


From 295d3c441226d004d1ed59c4fcf62d5dba18d9e1 Mon Sep 17 00:00:00 2001
From: Wedson Almeida Filho <walmeida@microsoft.com>
Date: Sat, 30 Sep 2023 02:00:33 -0300
Subject: net: move sockfs_xattr_handlers to .rodata

This makes it harder for accidental or malicious changes to
sockfs_xattr_handlers at runtime.

Cc: "David S. Miller" <davem@davemloft.net>
Cc: Eric Dumazet <edumazet@google.com>
Cc: Jakub Kicinski <kuba@kernel.org>
Cc: Paolo Abeni <pabeni@redhat.com>
Cc: Richard Cochran <richardcochran@gmail.com>
Cc: netdev@vger.kernel.org
Signed-off-by: Wedson Almeida Filho <walmeida@microsoft.com>
Link: https://lore.kernel.org/r/20230930050033.41174-30-wedsonaf@gmail.com
Acked-by: Jakub Kicinski <kuba@kernel.org>
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 include/linux/pseudo_fs.h | 2 +-
 net/socket.c              | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/pseudo_fs.h b/include/linux/pseudo_fs.h
index eceda1d1407a..730f77381d55 100644
--- a/include/linux/pseudo_fs.h
+++ b/include/linux/pseudo_fs.h
@@ -5,7 +5,7 @@
 
 struct pseudo_fs_context {
 	const struct super_operations *ops;
-	const struct xattr_handler **xattr;
+	const struct xattr_handler * const *xattr;
 	const struct dentry_operations *dops;
 	unsigned long magic;
 };
diff --git a/net/socket.c b/net/socket.c
index c8b08b32f097..be301d523679 100644
--- a/net/socket.c
+++ b/net/socket.c
@@ -403,7 +403,7 @@ static const struct xattr_handler sockfs_security_xattr_handler = {
 	.set = sockfs_security_xattr_set,
 };
 
-static const struct xattr_handler *sockfs_xattr_handlers[] = {
+static const struct xattr_handler * const sockfs_xattr_handlers[] = {
 	&sockfs_xattr_handler,
 	&sockfs_security_xattr_handler,
 	NULL
-- 
cgit v1.2.3


From 909f4abd1097769d024c3a9c2e59c2fbe5d2d0c0 Mon Sep 17 00:00:00 2001
From: Yi Liu <yi.l.liu@intel.com>
Date: Thu, 28 Sep 2023 00:15:23 -0700
Subject: iommu: Add new iommu op to create domains owned by userspace

Introduce a new iommu_domain op to create domains owned by userspace,
e.g. through IOMMUFD. These domains have a few different properties
compares to kernel owned domains:

 - They may be PAGING domains, but created with special parameters.
   For instance aperture size changes/number of levels, different
   IOPTE formats, or other things necessary to make a vIOMMU work

 - We have to track all the memory allocations with GFP_KERNEL_ACCOUNT
   to make the cgroup sandbox stronger

 - Device-specialty domains, such as NESTED domains can be created by
   IOMMUFD.

The new op clearly says the domain is being created by IOMMUFD, that the
domain is intended for userspace use, and it provides a way to pass user
flags or a driver specific uAPI structure to customize the created domain
to exactly what the vIOMMU userspace driver requires.

iommu drivers that cannot support VFIO/IOMMUFD should not support this
op. This includes any driver that cannot provide a fully functional PAGING
domain.

This new op for now is only supposed to be used by IOMMUFD, hence no
wrapper for it. IOMMUFD would call the callback directly. As for domain
free, IOMMUFD would use iommu_domain_free().

Link: https://lore.kernel.org/r/20230928071528.26258-2-yi.l.liu@intel.com
Suggested-by: Jason Gunthorpe <jgg@nvidia.com>
Signed-off-by: Lu Baolu <baolu.lu@linux.intel.com>
Co-developed-by: Nicolin Chen <nicolinc@nvidia.com>
Signed-off-by: Nicolin Chen <nicolinc@nvidia.com>
Signed-off-by: Yi Liu <yi.l.liu@intel.com>
Reviewed-by: Kevin Tian <kevin.tian@intel.com>
Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
---
 include/linux/iommu.h | 11 ++++++++++-
 1 file changed, 10 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/iommu.h b/include/linux/iommu.h
index c50a769d569a..3861d66b65c1 100644
--- a/include/linux/iommu.h
+++ b/include/linux/iommu.h
@@ -234,7 +234,15 @@ struct iommu_iotlb_gather {
  *           op is allocated in the iommu driver and freed by the caller after
  *           use. The information type is one of enum iommu_hw_info_type defined
  *           in include/uapi/linux/iommufd.h.
- * @domain_alloc: allocate iommu domain
+ * @domain_alloc: allocate and return an iommu domain if success. Otherwise
+ *                NULL is returned. The domain is not fully initialized until
+ *                the caller iommu_domain_alloc() returns.
+ * @domain_alloc_user: Allocate an iommu domain corresponding to the input
+ *                     parameters as defined in include/uapi/linux/iommufd.h.
+ *                     Unlike @domain_alloc, it is called only by IOMMUFD and
+ *                     must fully initialize the new domain before return.
+ *                     Upon success, a domain is returned. Upon failure,
+ *                     ERR_PTR must be returned.
  * @probe_device: Add device to iommu driver handling
  * @release_device: Remove device from iommu driver handling
  * @probe_finalize: Do final setup work after the device is added to an IOMMU
@@ -267,6 +275,7 @@ struct iommu_ops {
 
 	/* Domain allocation and freeing by the iommu driver */
 	struct iommu_domain *(*domain_alloc)(unsigned iommu_domain_type);
+	struct iommu_domain *(*domain_alloc_user)(struct device *dev, u32 flags);
 
 	struct iommu_device *(*probe_device)(struct device *dev);
 	void (*release_device)(struct device *dev);
-- 
cgit v1.2.3


From b7a5b537c55c088d891ae554103d1b281abef781 Mon Sep 17 00:00:00 2001
From: Mel Gorman <mgorman@techsingularity.net>
Date: Tue, 10 Oct 2023 09:31:42 +0100
Subject: sched/numa: Complete scanning of partial VMAs regardless of PID
 activity

NUMA Balancing skips VMAs when the current task has not trapped a NUMA
fault within the VMA. If the VMA is skipped then mm->numa_scan_offset
advances and a task that is trapping faults within the VMA may never
fully update PTEs within the VMA.

Force tasks to update PTEs for partially scanned PTEs. The VMA will
be tagged for NUMA hints by some task but this removes some of the
benefit of tracking PID activity within a VMA. A follow-on patch
will mitigate this problem.

The test cases and machines evaluated did not trigger the corner case so
the performance results are neutral with only small changes within the
noise from normal test-to-test variance. However, the next patch makes
the corner case easier to trigger.

Signed-off-by: Mel Gorman <mgorman@techsingularity.net>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Tested-by: Raghavendra K T <raghavendra.kt@amd.com>
Link: https://lore.kernel.org/r/20231010083143.19593-6-mgorman@techsingularity.net
---
 include/linux/sched/numa_balancing.h |  1 +
 include/trace/events/sched.h         |  3 ++-
 kernel/sched/fair.c                  | 18 +++++++++++++++---
 3 files changed, 18 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sched/numa_balancing.h b/include/linux/sched/numa_balancing.h
index c127a1509e2f..7dcc0bdfddbb 100644
--- a/include/linux/sched/numa_balancing.h
+++ b/include/linux/sched/numa_balancing.h
@@ -21,6 +21,7 @@ enum numa_vmaskip_reason {
 	NUMAB_SKIP_INACCESSIBLE,
 	NUMAB_SKIP_SCAN_DELAY,
 	NUMAB_SKIP_PID_INACTIVE,
+	NUMAB_SKIP_IGNORE_PID,
 };
 
 #ifdef CONFIG_NUMA_BALANCING
diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h
index d82a04d6a1bc..bfc07c10541a 100644
--- a/include/trace/events/sched.h
+++ b/include/trace/events/sched.h
@@ -670,7 +670,8 @@ DEFINE_EVENT(sched_numa_pair_template, sched_swap_numa,
 	EM( NUMAB_SKIP_SHARED_RO,		"shared_ro" )	\
 	EM( NUMAB_SKIP_INACCESSIBLE,		"inaccessible" )	\
 	EM( NUMAB_SKIP_SCAN_DELAY,		"scan_delay" )	\
-	EMe(NUMAB_SKIP_PID_INACTIVE,		"pid_inactive" )
+	EM( NUMAB_SKIP_PID_INACTIVE,		"pid_inactive" )	\
+	EMe(NUMAB_SKIP_IGNORE_PID,		"ignore_pid_inactive" )
 
 /* Redefine for export. */
 #undef EM
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index ce36969625bd..ab79013f6e91 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -3113,7 +3113,7 @@ static void reset_ptenuma_scan(struct task_struct *p)
 	p->mm->numa_scan_offset = 0;
 }
 
-static bool vma_is_accessed(struct vm_area_struct *vma)
+static bool vma_is_accessed(struct mm_struct *mm, struct vm_area_struct *vma)
 {
 	unsigned long pids;
 	/*
@@ -3126,7 +3126,19 @@ static bool vma_is_accessed(struct vm_area_struct *vma)
 		return true;
 
 	pids = vma->numab_state->pids_active[0] | vma->numab_state->pids_active[1];
-	return test_bit(hash_32(current->pid, ilog2(BITS_PER_LONG)), &pids);
+	if (test_bit(hash_32(current->pid, ilog2(BITS_PER_LONG)), &pids))
+		return true;
+
+	/*
+	 * Complete a scan that has already started regardless of PID access, or
+	 * some VMAs may never be scanned in multi-threaded applications:
+	 */
+	if (mm->numa_scan_offset > vma->vm_start) {
+		trace_sched_skip_vma_numa(mm, vma, NUMAB_SKIP_IGNORE_PID);
+		return true;
+	}
+
+	return false;
 }
 
 #define VMA_PID_RESET_PERIOD (4 * sysctl_numa_balancing_scan_delay)
@@ -3270,7 +3282,7 @@ static void task_numa_work(struct callback_head *work)
 		}
 
 		/* Do not scan the VMA if task has not accessed */
-		if (!vma_is_accessed(vma)) {
+		if (!vma_is_accessed(mm, vma)) {
 			trace_sched_skip_vma_numa(mm, vma, NUMAB_SKIP_PID_INACTIVE);
 			continue;
 		}
-- 
cgit v1.2.3


From f169c62ff7cd1acf8bac8ae17bfeafa307d9e6fa Mon Sep 17 00:00:00 2001
From: Mel Gorman <mgorman@techsingularity.net>
Date: Tue, 10 Oct 2023 09:31:43 +0100
Subject: sched/numa: Complete scanning of inactive VMAs when there is no
 alternative

VMAs are skipped if there is no recent fault activity but this represents
a chicken-and-egg problem as there may be no fault activity if the PTEs
are never updated to trap NUMA hints. There is an indirect reliance on
scanning to be forced early in the lifetime of a task but this may fail
to detect changes in phase behaviour. Force inactive VMAs to be scanned
when all other eligible VMAs have been updated within the same scan
sequence.

Test results in general look good with some changes in performance, both
negative and positive, depending on whether the additional scanning and
faulting was beneficial or not to the workload. The autonuma benchmark
workload NUMA01_THREADLOCAL was picked for closer examination. The workload
creates two processes with numerous threads and thread-local storage that
is zero-filled in a loop. It exercises the corner case where unrelated
threads may skip VMAs that are thread-local to another thread and still
has some VMAs that inactive while the workload executes.

The VMA skipping activity frequency with and without the patch:

	6.6.0-rc2-sched-numabtrace-v1
	=============================
	    649 reason=scan_delay
	  9,094 reason=unsuitable
	 48,915 reason=shared_ro
	143,919 reason=inaccessible
	193,050 reason=pid_inactive

	6.6.0-rc2-sched-numabselective-v1
	=============================
	    146 reason=seq_completed
	    622 reason=ignore_pid_inactive

	    624 reason=scan_delay
	  6,570 reason=unsuitable
	 16,101 reason=shared_ro
	 27,608 reason=inaccessible
	 41,939 reason=pid_inactive

Note that with the patch applied, the PID activity is ignored
(ignore_pid_inactive) to ensure a VMA with some activity is completely
scanned. In addition, a small number of VMAs are scanned when no other
eligible VMA is available during a single scan window (seq_completed).
The number of times a VMA is skipped due to no PID activity from the
scanning task (pid_inactive) drops dramatically. It is expected that
this will increase the number of PTEs updated for NUMA hinting faults
as well as hinting faults but these represent PTEs that would otherwise
have been missed. The tradeoff is scan+fault overhead versus improving
locality due to migration.

On a 2-socket Cascade Lake test machine, the time to complete the
workload is as follows;

                                                 6.6.0-rc2              6.6.0-rc2
                                       sched-numabtrace-v1 sched-numabselective-v1
  Min       elsp-NUMA01_THREADLOCAL      174.22 (   0.00%)      117.64 (  32.48%)
  Amean     elsp-NUMA01_THREADLOCAL      175.68 (   0.00%)      123.34 *  29.79%*
  Stddev    elsp-NUMA01_THREADLOCAL        1.20 (   0.00%)        4.06 (-238.20%)
  CoeffVar  elsp-NUMA01_THREADLOCAL        0.68 (   0.00%)        3.29 (-381.70%)
  Max       elsp-NUMA01_THREADLOCAL      177.18 (   0.00%)      128.03 (  27.74%)

The time to complete the workload is reduced by almost 30%:

                     6.6.0-rc2   6.6.0-rc2
                  sched-numabtrace-v1 sched-numabselective-v1 /
  Duration User       91201.80    63506.64
  Duration System      2015.53     1819.78
  Duration Elapsed     1234.77      868.37

In this specific case, system CPU time was not increased but it's not
universally true.

From vmstat, the NUMA scanning and fault activity is as follows;

                                        6.6.0-rc2      6.6.0-rc2
                              sched-numabtrace-v1 sched-numabselective-v1
  Ops NUMA base-page range updates       64272.00    26374386.00
  Ops NUMA PTE updates                   36624.00       55538.00
  Ops NUMA PMD updates                      54.00       51404.00
  Ops NUMA hint faults                   15504.00       75786.00
  Ops NUMA hint local faults %           14860.00       56763.00
  Ops NUMA hint local percent               95.85          74.90
  Ops NUMA pages migrated                 1629.00     6469222.00

Both the number of PTE updates and hint faults is dramatically
increased. While this is superficially unfortunate, it represents
ranges that were simply skipped without the patch. As a result
of the scanning and hinting faults, many more pages were also
migrated but as the time to completion is reduced, the overhead
is offset by the gain.

Signed-off-by: Mel Gorman <mgorman@techsingularity.net>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Tested-by: Raghavendra K T <raghavendra.kt@amd.com>
Link: https://lore.kernel.org/r/20231010083143.19593-7-mgorman@techsingularity.net
---
 include/linux/mm_types.h             |  6 ++++
 include/linux/sched/numa_balancing.h |  1 +
 include/trace/events/sched.h         |  3 +-
 kernel/sched/fair.c                  | 55 ++++++++++++++++++++++++++++++++++--
 4 files changed, 61 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index e7571eca1131..589f31ef2e84 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -575,6 +575,12 @@ struct vma_numab_state {
 	 * every VMA_PID_RESET_PERIOD jiffies:
 	 */
 	unsigned long pids_active[2];
+
+	/*
+	 * MM scan sequence ID when the VMA was last completely scanned.
+	 * A VMA is not eligible for scanning if prev_scan_seq == numa_scan_seq
+	 */
+	int prev_scan_seq;
 };
 
 /*
diff --git a/include/linux/sched/numa_balancing.h b/include/linux/sched/numa_balancing.h
index 7dcc0bdfddbb..b69afb8630db 100644
--- a/include/linux/sched/numa_balancing.h
+++ b/include/linux/sched/numa_balancing.h
@@ -22,6 +22,7 @@ enum numa_vmaskip_reason {
 	NUMAB_SKIP_SCAN_DELAY,
 	NUMAB_SKIP_PID_INACTIVE,
 	NUMAB_SKIP_IGNORE_PID,
+	NUMAB_SKIP_SEQ_COMPLETED,
 };
 
 #ifdef CONFIG_NUMA_BALANCING
diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h
index bfc07c10541a..6188ad0d9e0d 100644
--- a/include/trace/events/sched.h
+++ b/include/trace/events/sched.h
@@ -671,7 +671,8 @@ DEFINE_EVENT(sched_numa_pair_template, sched_swap_numa,
 	EM( NUMAB_SKIP_INACCESSIBLE,		"inaccessible" )	\
 	EM( NUMAB_SKIP_SCAN_DELAY,		"scan_delay" )	\
 	EM( NUMAB_SKIP_PID_INACTIVE,		"pid_inactive" )	\
-	EMe(NUMAB_SKIP_IGNORE_PID,		"ignore_pid_inactive" )
+	EM( NUMAB_SKIP_IGNORE_PID,		"ignore_pid_inactive" )		\
+	EMe(NUMAB_SKIP_SEQ_COMPLETED,		"seq_completed" )
 
 /* Redefine for export. */
 #undef EM
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index ab79013f6e91..922905194c0c 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -3158,6 +3158,8 @@ static void task_numa_work(struct callback_head *work)
 	unsigned long nr_pte_updates = 0;
 	long pages, virtpages;
 	struct vma_iterator vmi;
+	bool vma_pids_skipped;
+	bool vma_pids_forced = false;
 
 	SCHED_WARN_ON(p != container_of(work, struct task_struct, numa_work));
 
@@ -3200,7 +3202,6 @@ static void task_numa_work(struct callback_head *work)
 	 */
 	p->node_stamp += 2 * TICK_NSEC;
 
-	start = mm->numa_scan_offset;
 	pages = sysctl_numa_balancing_scan_size;
 	pages <<= 20 - PAGE_SHIFT; /* MB in pages */
 	virtpages = pages * 8;	   /* Scan up to this much virtual space */
@@ -3210,6 +3211,16 @@ static void task_numa_work(struct callback_head *work)
 
 	if (!mmap_read_trylock(mm))
 		return;
+
+	/*
+	 * VMAs are skipped if the current PID has not trapped a fault within
+	 * the VMA recently. Allow scanning to be forced if there is no
+	 * suitable VMA remaining.
+	 */
+	vma_pids_skipped = false;
+
+retry_pids:
+	start = mm->numa_scan_offset;
 	vma_iter_init(&vmi, mm, start);
 	vma = vma_next(&vmi);
 	if (!vma) {
@@ -3260,6 +3271,13 @@ static void task_numa_work(struct callback_head *work)
 			/* Reset happens after 4 times scan delay of scan start */
 			vma->numab_state->pids_active_reset =  vma->numab_state->next_scan +
 				msecs_to_jiffies(VMA_PID_RESET_PERIOD);
+
+			/*
+			 * Ensure prev_scan_seq does not match numa_scan_seq,
+			 * to prevent VMAs being skipped prematurely on the
+			 * first scan:
+			 */
+			 vma->numab_state->prev_scan_seq = mm->numa_scan_seq - 1;
 		}
 
 		/*
@@ -3281,8 +3299,19 @@ static void task_numa_work(struct callback_head *work)
 			vma->numab_state->pids_active[1] = 0;
 		}
 
-		/* Do not scan the VMA if task has not accessed */
-		if (!vma_is_accessed(mm, vma)) {
+		/* Do not rescan VMAs twice within the same sequence. */
+		if (vma->numab_state->prev_scan_seq == mm->numa_scan_seq) {
+			mm->numa_scan_offset = vma->vm_end;
+			trace_sched_skip_vma_numa(mm, vma, NUMAB_SKIP_SEQ_COMPLETED);
+			continue;
+		}
+
+		/*
+		 * Do not scan the VMA if task has not accessed it, unless no other
+		 * VMA candidate exists.
+		 */
+		if (!vma_pids_forced && !vma_is_accessed(mm, vma)) {
+			vma_pids_skipped = true;
 			trace_sched_skip_vma_numa(mm, vma, NUMAB_SKIP_PID_INACTIVE);
 			continue;
 		}
@@ -3311,8 +3340,28 @@ static void task_numa_work(struct callback_head *work)
 
 			cond_resched();
 		} while (end != vma->vm_end);
+
+		/* VMA scan is complete, do not scan until next sequence. */
+		vma->numab_state->prev_scan_seq = mm->numa_scan_seq;
+
+		/*
+		 * Only force scan within one VMA at a time, to limit the
+		 * cost of scanning a potentially uninteresting VMA.
+		 */
+		if (vma_pids_forced)
+			break;
 	} for_each_vma(vmi, vma);
 
+	/*
+	 * If no VMAs are remaining and VMAs were skipped due to the PID
+	 * not accessing the VMA previously, then force a scan to ensure
+	 * forward progress:
+	 */
+	if (!vma && !vma_pids_forced && vma_pids_skipped) {
+		vma_pids_forced = true;
+		goto retry_pids;
+	}
+
 out:
 	/*
 	 * It is possible to reach the end of the VMA list but the last few
-- 
cgit v1.2.3


From 8527ca7735ef4cdad32c45853b0138f46ab2df58 Mon Sep 17 00:00:00 2001
From: Randy Dunlap <rdunlap@infradead.org>
Date: Sun, 8 Oct 2023 14:41:21 -0700
Subject: net: skbuff: fix kernel-doc typos

Correct punctuation and drop an extraneous word.

Signed-off-by: Randy Dunlap <rdunlap@infradead.org>
Reviewed-by: Simon Horman <horms@kernel.org>
Link: https://lore.kernel.org/r/20231008214121.25940-1-rdunlap@infradead.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/linux/skbuff.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 4174c4b82d13..97bfef071255 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -1309,7 +1309,7 @@ struct sk_buff_fclones {
  *
  * Returns true if skb is a fast clone, and its clone is not freed.
  * Some drivers call skb_orphan() in their ndo_start_xmit(),
- * so we also check that this didnt happen.
+ * so we also check that didn't happen.
  */
 static inline bool skb_fclone_busy(const struct sock *sk,
 				   const struct sk_buff *skb)
@@ -2016,7 +2016,7 @@ static inline struct sk_buff *skb_share_check(struct sk_buff *skb, gfp_t pri)
  *	Copy shared buffers into a new sk_buff. We effectively do COW on
  *	packets to handle cases where we have a local reader and forward
  *	and a couple of other messy ones. The normal one is tcpdumping
- *	a packet thats being forwarded.
+ *	a packet that's being forwarded.
  */
 
 /**
-- 
cgit v1.2.3


From 5247dbf16cee4e83eb89e4d3b87bd5e79c5d1655 Mon Sep 17 00:00:00 2001
From: Yajun Deng <yajun.deng@linux.dev>
Date: Mon, 9 Oct 2023 19:16:33 +0800
Subject: net/core: Introduce netdev_core_stats_inc()

Although there is a kfree_skb_reason() helper function that can be used to
find the reason why this skb is dropped, but most callers didn't increase
one of rx_dropped, tx_dropped, rx_nohandler and rx_otherhost_dropped.

For the users, people are more concerned about why the dropped in ip
is increasing.

Introduce netdev_core_stats_inc() for trace the caller of
dev_core_stats_*_inc().

Also, add __code to netdev_core_stats_alloc(), as it's called with small
probability. And add noinline make sure netdev_core_stats_inc was never
inlined.

Signed-off-by: Yajun Deng <yajun.deng@linux.dev>
Suggested-by: Alexander Lobakin <aleksander.lobakin@intel.com>
Reviewed-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netdevice.h | 21 ++++-----------------
 net/core/dev.c            | 21 +++++++++++++++++++--
 2 files changed, 23 insertions(+), 19 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index e070a4540fba..11d704bfec9b 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -4002,32 +4002,19 @@ static __always_inline bool __is_skb_forwardable(const struct net_device *dev,
 	return false;
 }
 
-struct net_device_core_stats __percpu *netdev_core_stats_alloc(struct net_device *dev);
-
-static inline struct net_device_core_stats __percpu *dev_core_stats(struct net_device *dev)
-{
-	/* This READ_ONCE() pairs with the write in netdev_core_stats_alloc() */
-	struct net_device_core_stats __percpu *p = READ_ONCE(dev->core_stats);
-
-	if (likely(p))
-		return p;
-
-	return netdev_core_stats_alloc(dev);
-}
+void netdev_core_stats_inc(struct net_device *dev, u32 offset);
 
 #define DEV_CORE_STATS_INC(FIELD)						\
 static inline void dev_core_stats_##FIELD##_inc(struct net_device *dev)		\
 {										\
-	struct net_device_core_stats __percpu *p;				\
-										\
-	p = dev_core_stats(dev);						\
-	if (p)									\
-		this_cpu_inc(p->FIELD);						\
+	netdev_core_stats_inc(dev,						\
+			offsetof(struct net_device_core_stats, FIELD));		\
 }
 DEV_CORE_STATS_INC(rx_dropped)
 DEV_CORE_STATS_INC(tx_dropped)
 DEV_CORE_STATS_INC(rx_nohandler)
 DEV_CORE_STATS_INC(rx_otherhost_dropped)
+#undef DEV_CORE_STATS_INC
 
 static __always_inline int ____dev_forward_skb(struct net_device *dev,
 					       struct sk_buff *skb,
diff --git a/net/core/dev.c b/net/core/dev.c
index 606a366cc209..02949a929e7f 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -10497,7 +10497,8 @@ void netdev_stats_to_stats64(struct rtnl_link_stats64 *stats64,
 }
 EXPORT_SYMBOL(netdev_stats_to_stats64);
 
-struct net_device_core_stats __percpu *netdev_core_stats_alloc(struct net_device *dev)
+static __cold struct net_device_core_stats __percpu *netdev_core_stats_alloc(
+		struct net_device *dev)
 {
 	struct net_device_core_stats __percpu *p;
 
@@ -10510,7 +10511,23 @@ struct net_device_core_stats __percpu *netdev_core_stats_alloc(struct net_device
 	/* This READ_ONCE() pairs with the cmpxchg() above */
 	return READ_ONCE(dev->core_stats);
 }
-EXPORT_SYMBOL(netdev_core_stats_alloc);
+
+noinline void netdev_core_stats_inc(struct net_device *dev, u32 offset)
+{
+	/* This READ_ONCE() pairs with the write in netdev_core_stats_alloc() */
+	struct net_device_core_stats __percpu *p = READ_ONCE(dev->core_stats);
+	unsigned long __percpu *field;
+
+	if (unlikely(!p)) {
+		p = netdev_core_stats_alloc(dev);
+		if (!p)
+			return;
+	}
+
+	field = (__force unsigned long __percpu *)((__force void *)p + offset);
+	this_cpu_inc(*field);
+}
+EXPORT_SYMBOL_GPL(netdev_core_stats_inc);
 
 /**
  *	dev_get_stats	- get network device statistics
-- 
cgit v1.2.3


From acd6199f195d6de814ac4090ce0864a613b1580e Mon Sep 17 00:00:00 2001
From: Wentong Wu <wentong.wu@intel.com>
Date: Mon, 9 Oct 2023 14:33:22 +0800
Subject: usb: Add support for Intel LJCA device

Implements the USB part of Intel USB-I2C/GPIO/SPI adapter device
named "La Jolla Cove Adapter" (LJCA).

The communication between the various LJCA module drivers and the
hardware will be muxed/demuxed by this driver. Three modules (
I2C, GPIO, and SPI) are supported currently.

Each sub-module of LJCA device is identified by type field within
the LJCA message header.

The sub-modules of LJCA can use ljca_transfer() to issue a transfer
between host and hardware. And ljca_register_event_cb is exported
to LJCA sub-module drivers for hardware event subscription.

The minimum code in ASL that covers this board is
Scope (\_SB.PCI0.DWC3.RHUB.HS01)
    {
        Device (GPIO)
        {
            Name (_ADR, Zero)
            Name (_STA, 0x0F)
        }

        Device (I2C)
        {
            Name (_ADR, One)
            Name (_STA, 0x0F)
        }

        Device (SPI)
        {
            Name (_ADR, 0x02)
            Name (_STA, 0x0F)
        }
    }

Signed-off-by: Wentong Wu <wentong.wu@intel.com>
Reviewed-by: Sakari Ailus <sakari.ailus@linux.intel.com>
Reviewed-by: Andi Shyti <andi.shyti@linux.intel.com>
Tested-by: Hans de Goede <hdegoede@redhat.com>
Reviewed-by: Oliver Neukum <oneukum@suse.com>
Link: https://lore.kernel.org/r/1696833205-16716-2-git-send-email-wentong.wu@intel.com
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/usb/misc/Kconfig    |  13 +
 drivers/usb/misc/Makefile   |   1 +
 drivers/usb/misc/usb-ljca.c | 902 ++++++++++++++++++++++++++++++++++++++++++++
 include/linux/usb/ljca.h    | 145 +++++++
 4 files changed, 1061 insertions(+)
 create mode 100644 drivers/usb/misc/usb-ljca.c
 create mode 100644 include/linux/usb/ljca.h

(limited to 'include/linux')

diff --git a/drivers/usb/misc/Kconfig b/drivers/usb/misc/Kconfig
index 99b15b77dfd5..c510af7baa0d 100644
--- a/drivers/usb/misc/Kconfig
+++ b/drivers/usb/misc/Kconfig
@@ -165,6 +165,19 @@ config APPLE_MFI_FASTCHARGE
 
 	  It is safe to say M here.
 
+config USB_LJCA
+	tristate "Intel La Jolla Cove Adapter support"
+	select AUXILIARY_BUS
+	depends on USB && ACPI
+	help
+	  This adds support for Intel La Jolla Cove USB-I2C/SPI/GPIO
+	  Master Adapter (LJCA). Additional drivers such as I2C_LJCA,
+	  GPIO_LJCA and SPI_LJCA must be enabled in order to use the
+	  functionality of the device.
+
+	  This driver can also be built as a module. If so, the module
+	  will be called usb-ljca.
+
 source "drivers/usb/misc/sisusbvga/Kconfig"
 
 config USB_LD
diff --git a/drivers/usb/misc/Makefile b/drivers/usb/misc/Makefile
index 1992cc284d8a..0bc732bcb162 100644
--- a/drivers/usb/misc/Makefile
+++ b/drivers/usb/misc/Makefile
@@ -11,6 +11,7 @@ obj-$(CONFIG_USB_EMI26)			+= emi26.o
 obj-$(CONFIG_USB_EMI62)			+= emi62.o
 obj-$(CONFIG_USB_EZUSB_FX2)		+= ezusb.o
 obj-$(CONFIG_APPLE_MFI_FASTCHARGE)	+= apple-mfi-fastcharge.o
+obj-$(CONFIG_USB_LJCA)			+= usb-ljca.o
 obj-$(CONFIG_USB_IDMOUSE)		+= idmouse.o
 obj-$(CONFIG_USB_IOWARRIOR)		+= iowarrior.o
 obj-$(CONFIG_USB_ISIGHTFW)		+= isight_firmware.o
diff --git a/drivers/usb/misc/usb-ljca.c b/drivers/usb/misc/usb-ljca.c
new file mode 100644
index 000000000000..c9decd0396d4
--- /dev/null
+++ b/drivers/usb/misc/usb-ljca.c
@@ -0,0 +1,902 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Intel La Jolla Cove Adapter USB driver
+ *
+ * Copyright (c) 2023, Intel Corporation.
+ */
+
+#include <linux/acpi.h>
+#include <linux/auxiliary_bus.h>
+#include <linux/dev_printk.h>
+#include <linux/kernel.h>
+#include <linux/mod_devicetable.h>
+#include <linux/module.h>
+#include <linux/mutex.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <linux/types.h>
+#include <linux/usb.h>
+#include <linux/usb/ljca.h>
+
+#include <asm/unaligned.h>
+
+/* command flags */
+#define LJCA_ACK_FLAG			BIT(0)
+#define LJCA_RESP_FLAG			BIT(1)
+#define LJCA_CMPL_FLAG			BIT(2)
+
+#define LJCA_MAX_PACKET_SIZE		64u
+#define LJCA_MAX_PAYLOAD_SIZE		\
+		(LJCA_MAX_PACKET_SIZE - sizeof(struct ljca_msg))
+
+#define LJCA_WRITE_TIMEOUT_MS		200
+#define LJCA_WRITE_ACK_TIMEOUT_MS	500
+#define LJCA_ENUM_CLIENT_TIMEOUT_MS	20
+
+/* ljca client type */
+enum ljca_client_type {
+	LJCA_CLIENT_MNG = 1,
+	LJCA_CLIENT_GPIO = 3,
+	LJCA_CLIENT_I2C = 4,
+	LJCA_CLIENT_SPI = 5,
+};
+
+/* MNG client commands */
+enum ljca_mng_cmd {
+	LJCA_MNG_RESET = 2,
+	LJCA_MNG_ENUM_GPIO = 4,
+	LJCA_MNG_ENUM_I2C = 5,
+	LJCA_MNG_ENUM_SPI = 8,
+};
+
+/* ljca client acpi _ADR */
+enum ljca_client_acpi_adr {
+	LJCA_GPIO_ACPI_ADR,
+	LJCA_I2C1_ACPI_ADR,
+	LJCA_I2C2_ACPI_ADR,
+	LJCA_SPI1_ACPI_ADR,
+	LJCA_SPI2_ACPI_ADR,
+	LJCA_CLIENT_ACPI_ADR_MAX,
+};
+
+/* ljca cmd message structure */
+struct ljca_msg {
+	u8 type;
+	u8 cmd;
+	u8 flags;
+	u8 len;
+	u8 data[] __counted_by(len);
+} __packed;
+
+struct ljca_i2c_ctr_info {
+	u8 id;
+	u8 capacity;
+	u8 intr_pin;
+} __packed;
+
+struct ljca_i2c_descriptor {
+	u8 num;
+	struct ljca_i2c_ctr_info info[] __counted_by(num);
+} __packed;
+
+struct ljca_spi_ctr_info {
+	u8 id;
+	u8 capacity;
+	u8 intr_pin;
+} __packed;
+
+struct ljca_spi_descriptor {
+	u8 num;
+	struct ljca_spi_ctr_info info[] __counted_by(num);
+} __packed;
+
+struct ljca_bank_descriptor {
+	u8 bank_id;
+	u8 pin_num;
+
+	/* 1 bit for each gpio, 1 means valid */
+	__le32 valid_pins;
+} __packed;
+
+struct ljca_gpio_descriptor {
+	u8 pins_per_bank;
+	u8 bank_num;
+	struct ljca_bank_descriptor bank_desc[] __counted_by(bank_num);
+} __packed;
+
+/**
+ * struct ljca_adapter - represent a ljca adapter
+ *
+ * @intf: the usb interface for this ljca adapter
+ * @usb_dev: the usb device for this ljca adapter
+ * @dev: the specific device info of the usb interface
+ * @rx_pipe: bulk in pipe for receive data from firmware
+ * @tx_pipe: bulk out pipe for send data to firmware
+ * @rx_urb: urb used for the bulk in pipe
+ * @rx_buf: buffer used to receive command response and event
+ * @rx_len: length of rx buffer
+ * @ex_buf: external buffer to save command response
+ * @ex_buf_len: length of external buffer
+ * @actual_length: actual length of data copied to external buffer
+ * @tx_buf: buffer used to download command to firmware
+ * @tx_buf_len: length of tx buffer
+ * @lock: spinlock to protect tx_buf and ex_buf
+ * @cmd_completion: completion object as the command receives ack
+ * @mutex: mutex to avoid command download concurrently
+ * @client_list: client device list
+ * @disconnect: usb disconnect ongoing or not
+ * @reset_id: used to reset firmware
+ */
+struct ljca_adapter {
+	struct usb_interface *intf;
+	struct usb_device *usb_dev;
+	struct device *dev;
+
+	unsigned int rx_pipe;
+	unsigned int tx_pipe;
+
+	struct urb *rx_urb;
+	void *rx_buf;
+	unsigned int rx_len;
+
+	u8 *ex_buf;
+	u8 ex_buf_len;
+	u8 actual_length;
+
+	void *tx_buf;
+	u8 tx_buf_len;
+
+	spinlock_t lock;
+
+	struct completion cmd_completion;
+	struct mutex mutex;
+
+	struct list_head client_list;
+
+	bool disconnect;
+
+	u32 reset_id;
+};
+
+struct ljca_match_ids_walk_data {
+	const struct acpi_device_id *ids;
+	const char *uid;
+	struct acpi_device *adev;
+};
+
+static const struct acpi_device_id ljca_gpio_hids[] = {
+	{ "INTC1074" },
+	{ "INTC1096" },
+	{ "INTC100B" },
+	{ "INTC10D1" },
+	{},
+};
+
+static const struct acpi_device_id ljca_i2c_hids[] = {
+	{ "INTC1075" },
+	{ "INTC1097" },
+	{ "INTC100C" },
+	{ "INTC10D2" },
+	{},
+};
+
+static const struct acpi_device_id ljca_spi_hids[] = {
+	{ "INTC1091" },
+	{ "INTC1098" },
+	{ "INTC100D" },
+	{ "INTC10D3" },
+	{},
+};
+
+static void ljca_handle_event(struct ljca_adapter *adap,
+			      struct ljca_msg *header)
+{
+	struct ljca_client *client;
+
+	list_for_each_entry(client, &adap->client_list, link) {
+		/*
+		 * Currently only GPIO register event callback, but
+		 * firmware message structure should include id when
+		 * multiple same type clients register event callback.
+		 */
+		if (client->type == header->type) {
+			unsigned long flags;
+
+			spin_lock_irqsave(&client->event_cb_lock, flags);
+			client->event_cb(client->context, header->cmd,
+					 header->data, header->len);
+			spin_unlock_irqrestore(&client->event_cb_lock, flags);
+
+			break;
+		}
+	}
+}
+
+/* process command ack and received data if available */
+static void ljca_handle_cmd_ack(struct ljca_adapter *adap, struct ljca_msg *header)
+{
+	struct ljca_msg *tx_header = adap->tx_buf;
+	u8 ibuf_len, actual_len = 0;
+	unsigned long flags;
+	u8 *ibuf;
+
+	spin_lock_irqsave(&adap->lock, flags);
+
+	if (tx_header->type != header->type || tx_header->cmd != header->cmd) {
+		spin_unlock_irqrestore(&adap->lock, flags);
+		dev_err(adap->dev, "cmd ack mismatch error\n");
+		return;
+	}
+
+	ibuf_len = adap->ex_buf_len;
+	ibuf = adap->ex_buf;
+
+	if (ibuf && ibuf_len) {
+		actual_len = min(header->len, ibuf_len);
+
+		/* copy received data to external buffer */
+		memcpy(ibuf, header->data, actual_len);
+	}
+	/* update copied data length */
+	adap->actual_length = actual_len;
+
+	spin_unlock_irqrestore(&adap->lock, flags);
+
+	complete(&adap->cmd_completion);
+}
+
+static void ljca_recv(struct urb *urb)
+{
+	struct ljca_msg *header = urb->transfer_buffer;
+	struct ljca_adapter *adap = urb->context;
+	int ret;
+
+	switch (urb->status) {
+	case 0:
+		/* success */
+		break;
+	case -ENOENT:
+		/*
+		 * directly complete the possible ongoing transfer
+		 * during disconnect
+		 */
+		if (adap->disconnect)
+			complete(&adap->cmd_completion);
+		return;
+	case -ECONNRESET:
+	case -ESHUTDOWN:
+	case -EPIPE:
+		/* rx urb is terminated */
+		dev_dbg(adap->dev, "rx urb terminated with status: %d\n",
+			urb->status);
+		return;
+	default:
+		dev_dbg(adap->dev, "rx urb error: %d\n", urb->status);
+		goto resubmit;
+	}
+
+	if (header->len + sizeof(*header) != urb->actual_length)
+		goto resubmit;
+
+	if (header->flags & LJCA_ACK_FLAG)
+		ljca_handle_cmd_ack(adap, header);
+	else
+		ljca_handle_event(adap, header);
+
+resubmit:
+	ret = usb_submit_urb(urb, GFP_ATOMIC);
+	if (ret && ret != -EPERM)
+		dev_err(adap->dev, "resubmit rx urb error %d\n", ret);
+}
+
+static int ljca_send(struct ljca_adapter *adap, u8 type, u8 cmd,
+		     const u8 *obuf, u8 obuf_len, u8 *ibuf, u8 ibuf_len,
+		     bool ack, unsigned long timeout)
+{
+	unsigned int msg_len = sizeof(struct ljca_msg) + obuf_len;
+	struct ljca_msg *header = adap->tx_buf;
+	unsigned int transferred;
+	unsigned long flags;
+	int ret;
+
+	if (adap->disconnect)
+		return -ENODEV;
+
+	if (msg_len > adap->tx_buf_len)
+		return -EINVAL;
+
+	mutex_lock(&adap->mutex);
+
+	spin_lock_irqsave(&adap->lock, flags);
+
+	header->type = type;
+	header->cmd = cmd;
+	header->len = obuf_len;
+	if (obuf)
+		memcpy(header->data, obuf, obuf_len);
+
+	header->flags = LJCA_CMPL_FLAG | (ack ? LJCA_ACK_FLAG : 0);
+
+	adap->ex_buf = ibuf;
+	adap->ex_buf_len = ibuf_len;
+	adap->actual_length = 0;
+
+	spin_unlock_irqrestore(&adap->lock, flags);
+
+	reinit_completion(&adap->cmd_completion);
+
+	ret = usb_autopm_get_interface(adap->intf);
+	if (ret < 0)
+		goto out;
+
+	ret = usb_bulk_msg(adap->usb_dev, adap->tx_pipe, header,
+			   msg_len, &transferred, LJCA_WRITE_TIMEOUT_MS);
+
+	usb_autopm_put_interface(adap->intf);
+
+	if (ret < 0)
+		goto out;
+	if (transferred != msg_len) {
+		ret = -EIO;
+		goto out;
+	}
+
+	if (ack) {
+		ret = wait_for_completion_timeout(&adap->cmd_completion,
+						  timeout);
+		if (!ret) {
+			ret = -ETIMEDOUT;
+			goto out;
+		}
+	}
+	ret = adap->actual_length;
+
+out:
+	spin_lock_irqsave(&adap->lock, flags);
+	adap->ex_buf = NULL;
+	adap->ex_buf_len = 0;
+
+	memset(header, 0, sizeof(*header));
+	spin_unlock_irqrestore(&adap->lock, flags);
+
+	mutex_unlock(&adap->mutex);
+
+	return ret;
+}
+
+int ljca_transfer(struct ljca_client *client, u8 cmd, const u8 *obuf,
+		  u8 obuf_len, u8 *ibuf, u8 ibuf_len)
+{
+	return ljca_send(client->adapter, client->type, cmd,
+			 obuf, obuf_len, ibuf, ibuf_len, true,
+			 LJCA_WRITE_ACK_TIMEOUT_MS);
+}
+EXPORT_SYMBOL_NS_GPL(ljca_transfer, LJCA);
+
+int ljca_transfer_noack(struct ljca_client *client, u8 cmd, const u8 *obuf,
+			u8 obuf_len)
+{
+	return ljca_send(client->adapter, client->type, cmd, obuf,
+			 obuf_len, NULL, 0, false, LJCA_WRITE_ACK_TIMEOUT_MS);
+}
+EXPORT_SYMBOL_NS_GPL(ljca_transfer_noack, LJCA);
+
+int ljca_register_event_cb(struct ljca_client *client, ljca_event_cb_t event_cb,
+			   void *context)
+{
+	unsigned long flags;
+
+	if (!event_cb)
+		return -EINVAL;
+
+	spin_lock_irqsave(&client->event_cb_lock, flags);
+
+	if (client->event_cb) {
+		spin_unlock_irqrestore(&client->event_cb_lock, flags);
+		return -EALREADY;
+	}
+
+	client->event_cb = event_cb;
+	client->context = context;
+
+	spin_unlock_irqrestore(&client->event_cb_lock, flags);
+
+	return 0;
+}
+EXPORT_SYMBOL_NS_GPL(ljca_register_event_cb, LJCA);
+
+void ljca_unregister_event_cb(struct ljca_client *client)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&client->event_cb_lock, flags);
+
+	client->event_cb = NULL;
+	client->context = NULL;
+
+	spin_unlock_irqrestore(&client->event_cb_lock, flags);
+}
+EXPORT_SYMBOL_NS_GPL(ljca_unregister_event_cb, LJCA);
+
+static int ljca_match_device_ids(struct acpi_device *adev, void *data)
+{
+	struct ljca_match_ids_walk_data *wd = data;
+	const char *uid = acpi_device_uid(adev);
+
+	if (acpi_match_device_ids(adev, wd->ids))
+		return 0;
+
+	if (!wd->uid)
+		goto match;
+
+	if (!uid)
+		/*
+		 * Some DSDTs have only one ACPI companion for the two I2C
+		 * controllers and they don't set a UID at all (e.g. Dell
+		 * Latitude 9420). On these platforms only the first I2C
+		 * controller is used, so if a HID match has no UID we use
+		 * "0" as the UID and assign ACPI companion to the first
+		 * I2C controller.
+		 */
+		uid = "0";
+	else
+		uid = strchr(uid, wd->uid[0]);
+
+	if (!uid || strcmp(uid, wd->uid))
+		return 0;
+
+match:
+	wd->adev = adev;
+
+	return 1;
+}
+
+/* bind auxiliary device to acpi device */
+static void ljca_auxdev_acpi_bind(struct ljca_adapter *adap,
+				  struct auxiliary_device *auxdev,
+				  u64 adr, u8 id)
+{
+	struct ljca_match_ids_walk_data wd = { 0 };
+	struct acpi_device *parent, *adev;
+	struct device *dev = adap->dev;
+	char uid[4];
+
+	parent = ACPI_COMPANION(dev);
+	if (!parent)
+		return;
+
+	/*
+	 * get auxdev ACPI handle from the ACPI device directly
+	 * under the parent that matches _ADR.
+	 */
+	adev = acpi_find_child_device(parent, adr, false);
+	if (adev) {
+		ACPI_COMPANION_SET(&auxdev->dev, adev);
+		return;
+	}
+
+	/*
+	 * _ADR is a grey area in the ACPI specification, some
+	 * platforms use _HID to distinguish children devices.
+	 */
+	switch (adr) {
+	case LJCA_GPIO_ACPI_ADR:
+		wd.ids = ljca_gpio_hids;
+		break;
+	case LJCA_I2C1_ACPI_ADR:
+	case LJCA_I2C2_ACPI_ADR:
+		snprintf(uid, sizeof(uid), "%d", id);
+		wd.uid = uid;
+		wd.ids = ljca_i2c_hids;
+		break;
+	case LJCA_SPI1_ACPI_ADR:
+	case LJCA_SPI2_ACPI_ADR:
+		wd.ids = ljca_spi_hids;
+		break;
+	default:
+		dev_warn(dev, "unsupported _ADR\n");
+		return;
+	}
+
+	acpi_dev_for_each_child(parent, ljca_match_device_ids, &wd);
+	if (wd.adev) {
+		ACPI_COMPANION_SET(&auxdev->dev, wd.adev);
+		return;
+	}
+
+	parent = ACPI_COMPANION(dev->parent->parent);
+	if (!parent)
+		return;
+
+	acpi_dev_for_each_child(parent, ljca_match_device_ids, &wd);
+	if (wd.adev)
+		ACPI_COMPANION_SET(&auxdev->dev, wd.adev);
+}
+
+static void ljca_auxdev_release(struct device *dev)
+{
+	struct auxiliary_device *auxdev = to_auxiliary_dev(dev);
+
+	kfree(auxdev->dev.platform_data);
+}
+
+static int ljca_new_client_device(struct ljca_adapter *adap, u8 type, u8 id,
+				  char *name, void *data, u64 adr)
+{
+	struct auxiliary_device *auxdev;
+	struct ljca_client *client;
+	int ret;
+
+	client = kzalloc(sizeof *client, GFP_KERNEL);
+	if (!client)
+		return -ENOMEM;
+
+	client->type = type;
+	client->id = id;
+	client->adapter = adap;
+	spin_lock_init(&client->event_cb_lock);
+
+	auxdev = &client->auxdev;
+	auxdev->name = name;
+	auxdev->id = id;
+
+	auxdev->dev.parent = adap->dev;
+	auxdev->dev.platform_data = data;
+	auxdev->dev.release = ljca_auxdev_release;
+
+	ret = auxiliary_device_init(auxdev);
+	if (ret)
+		goto err_free;
+
+	ljca_auxdev_acpi_bind(adap, auxdev, adr, id);
+
+	ret = auxiliary_device_add(auxdev);
+	if (ret)
+		goto err_uninit;
+
+	list_add_tail(&client->link, &adap->client_list);
+
+	return 0;
+
+err_uninit:
+	auxiliary_device_uninit(auxdev);
+
+err_free:
+	kfree(client);
+
+	return ret;
+}
+
+static int ljca_enumerate_gpio(struct ljca_adapter *adap)
+{
+	u32 valid_pin[LJCA_MAX_GPIO_NUM / BITS_PER_TYPE(u32)];
+	struct ljca_gpio_descriptor *desc;
+	struct ljca_gpio_info *gpio_info;
+	u8 buf[LJCA_MAX_PAYLOAD_SIZE];
+	int ret, gpio_num;
+	unsigned int i;
+
+	ret = ljca_send(adap, LJCA_CLIENT_MNG, LJCA_MNG_ENUM_GPIO, NULL, 0, buf,
+			sizeof(buf), true, LJCA_ENUM_CLIENT_TIMEOUT_MS);
+	if (ret < 0)
+		return ret;
+
+	/* check firmware response */
+	desc = (struct ljca_gpio_descriptor *)buf;
+	if (ret != struct_size(desc, bank_desc, desc->bank_num))
+		return -EINVAL;
+
+	gpio_num = desc->pins_per_bank * desc->bank_num;
+	if (gpio_num > LJCA_MAX_GPIO_NUM)
+		return -EINVAL;
+
+	/* construct platform data */
+	gpio_info = kzalloc(sizeof *gpio_info, GFP_KERNEL);
+	if (!gpio_info)
+		return -ENOMEM;
+	gpio_info->num = gpio_num;
+
+	for (i = 0; i < desc->bank_num; i++)
+		valid_pin[i] = get_unaligned_le32(&desc->bank_desc[i].valid_pins);
+	bitmap_from_arr32(gpio_info->valid_pin_map, valid_pin, gpio_num);
+
+	ret = ljca_new_client_device(adap, LJCA_CLIENT_GPIO, 0, "ljca-gpio",
+				     gpio_info, LJCA_GPIO_ACPI_ADR);
+	if (ret)
+		kfree(gpio_info);
+
+	return ret;
+}
+
+static int ljca_enumerate_i2c(struct ljca_adapter *adap)
+{
+	struct ljca_i2c_descriptor *desc;
+	struct ljca_i2c_info *i2c_info;
+	u8 buf[LJCA_MAX_PAYLOAD_SIZE];
+	unsigned int i;
+	int ret;
+
+	ret = ljca_send(adap, LJCA_CLIENT_MNG, LJCA_MNG_ENUM_I2C, NULL, 0, buf,
+			sizeof(buf), true, LJCA_ENUM_CLIENT_TIMEOUT_MS);
+	if (ret < 0)
+		return ret;
+
+	/* check firmware response */
+	desc = (struct ljca_i2c_descriptor *)buf;
+	if (ret != struct_size(desc, info, desc->num))
+		return -EINVAL;
+
+	for (i = 0; i < desc->num; i++) {
+		/* construct platform data */
+		i2c_info = kzalloc(sizeof *i2c_info, GFP_KERNEL);
+		if (!i2c_info)
+			return -ENOMEM;
+
+		i2c_info->id = desc->info[i].id;
+		i2c_info->capacity = desc->info[i].capacity;
+		i2c_info->intr_pin = desc->info[i].intr_pin;
+
+		ret = ljca_new_client_device(adap, LJCA_CLIENT_I2C, i,
+					     "ljca-i2c", i2c_info,
+					     LJCA_I2C1_ACPI_ADR + i);
+		if (ret) {
+			kfree(i2c_info);
+			return ret;
+		}
+	}
+
+	return 0;
+}
+
+static int ljca_enumerate_spi(struct ljca_adapter *adap)
+{
+	struct ljca_spi_descriptor *desc;
+	struct ljca_spi_info *spi_info;
+	u8 buf[LJCA_MAX_PAYLOAD_SIZE];
+	unsigned int i;
+	int ret;
+
+	ret = ljca_send(adap, LJCA_CLIENT_MNG, LJCA_MNG_ENUM_SPI, NULL, 0, buf,
+			sizeof(buf), true, LJCA_ENUM_CLIENT_TIMEOUT_MS);
+	if (ret < 0)
+		return ret;
+
+	/* check firmware response */
+	desc = (struct ljca_spi_descriptor *)buf;
+	if (ret != struct_size(desc, info, desc->num))
+		return -EINVAL;
+
+	for (i = 0; i < desc->num; i++) {
+		/* construct platform data */
+		spi_info = kzalloc(sizeof *spi_info, GFP_KERNEL);
+		if (!spi_info)
+			return -ENOMEM;
+
+		spi_info->id = desc->info[i].id;
+		spi_info->capacity = desc->info[i].capacity;
+
+		ret = ljca_new_client_device(adap, LJCA_CLIENT_SPI, i,
+					     "ljca-spi", spi_info,
+					     LJCA_SPI1_ACPI_ADR + i);
+		if (ret) {
+			kfree(spi_info);
+			return ret;
+		}
+	}
+
+	return 0;
+}
+
+static int ljca_reset_handshake(struct ljca_adapter *adap)
+{
+	__le32 reset_id = cpu_to_le32(adap->reset_id);
+	__le32 reset_id_ret = 0;
+	int ret;
+
+	adap->reset_id++;
+
+	ret = ljca_send(adap, LJCA_CLIENT_MNG, LJCA_MNG_RESET, (u8 *)&reset_id,
+			sizeof(__le32), (u8 *)&reset_id_ret, sizeof(__le32),
+			true, LJCA_WRITE_ACK_TIMEOUT_MS);
+	if (ret < 0)
+		return ret;
+
+	if (reset_id_ret != reset_id)
+		return -EINVAL;
+
+	return 0;
+}
+
+static int ljca_enumerate_clients(struct ljca_adapter *adap)
+{
+	struct ljca_client *client, *next;
+	int ret;
+
+	ret = ljca_reset_handshake(adap);
+	if (ret)
+		goto err_kill;
+
+	ret = ljca_enumerate_gpio(adap);
+	if (ret) {
+		dev_err(adap->dev, "enumerate GPIO error\n");
+		goto err_kill;
+	}
+
+	ret = ljca_enumerate_i2c(adap);
+	if (ret) {
+		dev_err(adap->dev, "enumerate I2C error\n");
+		goto err_kill;
+	}
+
+	ret = ljca_enumerate_spi(adap);
+	if (ret) {
+		dev_err(adap->dev, "enumerate SPI error\n");
+		goto err_kill;
+	}
+
+	return 0;
+
+err_kill:
+	adap->disconnect = true;
+
+	usb_kill_urb(adap->rx_urb);
+
+	list_for_each_entry_safe_reverse(client, next, &adap->client_list, link) {
+		auxiliary_device_delete(&client->auxdev);
+		auxiliary_device_uninit(&client->auxdev);
+
+		list_del_init(&client->link);
+		kfree(client);
+	}
+
+	return ret;
+}
+
+static int ljca_probe(struct usb_interface *interface,
+		      const struct usb_device_id *id)
+{
+	struct usb_device *usb_dev = interface_to_usbdev(interface);
+	struct usb_host_interface *alt = interface->cur_altsetting;
+	struct usb_endpoint_descriptor *ep_in, *ep_out;
+	struct device *dev = &interface->dev;
+	struct ljca_adapter *adap;
+	int ret;
+
+	adap = devm_kzalloc(dev, sizeof(*adap), GFP_KERNEL);
+	if (!adap)
+		return -ENOMEM;
+
+	/* separate tx buffer allocation for alignment */
+	adap->tx_buf = devm_kzalloc(dev, LJCA_MAX_PACKET_SIZE, GFP_KERNEL);
+	if (!adap->tx_buf)
+		return -ENOMEM;
+	adap->tx_buf_len = LJCA_MAX_PACKET_SIZE;
+
+	mutex_init(&adap->mutex);
+	spin_lock_init(&adap->lock);
+	init_completion(&adap->cmd_completion);
+	INIT_LIST_HEAD(&adap->client_list);
+
+	adap->intf = usb_get_intf(interface);
+	adap->usb_dev = usb_dev;
+	adap->dev = dev;
+
+	/*
+	 * find the first bulk in and out endpoints.
+	 * ignore any others.
+	 */
+	ret = usb_find_common_endpoints(alt, &ep_in, &ep_out, NULL, NULL);
+	if (ret) {
+		dev_err(dev, "bulk endpoints not found\n");
+		goto err_put;
+	}
+	adap->rx_pipe = usb_rcvbulkpipe(usb_dev, usb_endpoint_num(ep_in));
+	adap->tx_pipe = usb_sndbulkpipe(usb_dev, usb_endpoint_num(ep_out));
+
+	/* setup rx buffer */
+	adap->rx_len = usb_endpoint_maxp(ep_in);
+	adap->rx_buf = devm_kzalloc(dev, adap->rx_len, GFP_KERNEL);
+	if (!adap->rx_buf) {
+		ret = -ENOMEM;
+		goto err_put;
+	}
+
+	/* alloc rx urb */
+	adap->rx_urb = usb_alloc_urb(0, GFP_KERNEL);
+	if (!adap->rx_urb) {
+		ret = -ENOMEM;
+		goto err_put;
+	}
+	usb_fill_bulk_urb(adap->rx_urb, usb_dev, adap->rx_pipe,
+			  adap->rx_buf, adap->rx_len, ljca_recv, adap);
+
+	usb_set_intfdata(interface, adap);
+
+	/* submit rx urb before enumerate clients */
+	ret = usb_submit_urb(adap->rx_urb, GFP_KERNEL);
+	if (ret) {
+		dev_err(dev, "submit rx urb failed: %d\n", ret);
+		goto err_free;
+	}
+
+	ret = ljca_enumerate_clients(adap);
+	if (ret)
+		goto err_free;
+
+	usb_enable_autosuspend(usb_dev);
+
+	return 0;
+
+err_free:
+	usb_free_urb(adap->rx_urb);
+
+err_put:
+	usb_put_intf(adap->intf);
+
+	mutex_destroy(&adap->mutex);
+
+	return ret;
+}
+
+static void ljca_disconnect(struct usb_interface *interface)
+{
+	struct ljca_adapter *adap = usb_get_intfdata(interface);
+	struct ljca_client *client, *next;
+
+	adap->disconnect = true;
+
+	usb_kill_urb(adap->rx_urb);
+
+	list_for_each_entry_safe_reverse(client, next, &adap->client_list, link) {
+		auxiliary_device_delete(&client->auxdev);
+		auxiliary_device_uninit(&client->auxdev);
+
+		list_del_init(&client->link);
+		kfree(client);
+	}
+
+	usb_free_urb(adap->rx_urb);
+
+	usb_put_intf(adap->intf);
+
+	mutex_destroy(&adap->mutex);
+}
+
+static int ljca_suspend(struct usb_interface *interface, pm_message_t message)
+{
+	struct ljca_adapter *adap = usb_get_intfdata(interface);
+
+	usb_kill_urb(adap->rx_urb);
+
+	return 0;
+}
+
+static int ljca_resume(struct usb_interface *interface)
+{
+	struct ljca_adapter *adap = usb_get_intfdata(interface);
+
+	return usb_submit_urb(adap->rx_urb, GFP_KERNEL);
+}
+
+static const struct usb_device_id ljca_table[] = {
+	{ USB_DEVICE(0x8086, 0x0b63) },
+	{ /* sentinel */ }
+};
+MODULE_DEVICE_TABLE(usb, ljca_table);
+
+static struct usb_driver ljca_driver = {
+	.name = "ljca",
+	.id_table = ljca_table,
+	.probe = ljca_probe,
+	.disconnect = ljca_disconnect,
+	.suspend = ljca_suspend,
+	.resume = ljca_resume,
+	.supports_autosuspend = 1,
+};
+module_usb_driver(ljca_driver);
+
+MODULE_AUTHOR("Wentong Wu <wentong.wu@intel.com>");
+MODULE_AUTHOR("Zhifeng Wang <zhifeng.wang@intel.com>");
+MODULE_AUTHOR("Lixu Zhang <lixu.zhang@intel.com>");
+MODULE_DESCRIPTION("Intel La Jolla Cove Adapter USB driver");
+MODULE_LICENSE("GPL");
diff --git a/include/linux/usb/ljca.h b/include/linux/usb/ljca.h
new file mode 100644
index 000000000000..47661feda96c
--- /dev/null
+++ b/include/linux/usb/ljca.h
@@ -0,0 +1,145 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright (c) 2023, Intel Corporation. All rights reserved.
+ */
+#ifndef _LINUX_USB_LJCA_H_
+#define _LINUX_USB_LJCA_H_
+
+#include <linux/auxiliary_bus.h>
+#include <linux/list.h>
+#include <linux/spinlock.h>
+#include <linux/types.h>
+
+#define LJCA_MAX_GPIO_NUM 64
+
+#define auxiliary_dev_to_ljca_client(auxiliary_dev)			\
+		container_of(auxiliary_dev, struct ljca_client, auxdev)
+
+struct ljca_adapter;
+
+/**
+ * typedef ljca_event_cb_t - event callback function signature
+ *
+ * @context: the execution context of who registered this callback
+ * @cmd: the command from device for this event
+ * @evt_data: the event data payload
+ * @len: the event data payload length
+ *
+ * The callback function is called in interrupt context and the data payload is
+ * only valid during the call. If the user needs later access of the data, it
+ * must copy it.
+ */
+typedef void (*ljca_event_cb_t)(void *context, u8 cmd, const void *evt_data, int len);
+
+/**
+ * struct ljca_client - represent a ljca client device
+ *
+ * @type: ljca client type
+ * @id: ljca client id within same client type
+ * @link: ljca client on the same ljca adapter
+ * @auxdev: auxiliary device object
+ * @adapter: ljca adapter the ljca client sit on
+ * @context: the execution context of the event callback
+ * @event_cb: ljca client driver register this callback to get
+ *	firmware asynchronous rx buffer pending notifications
+ * @event_cb_lock: spinlock to protect event callback
+ */
+struct ljca_client {
+	u8 type;
+	u8 id;
+	struct list_head link;
+	struct auxiliary_device auxdev;
+	struct ljca_adapter *adapter;
+
+	void *context;
+	ljca_event_cb_t event_cb;
+	/* lock to protect event_cb */
+	spinlock_t event_cb_lock;
+};
+
+/**
+ * struct ljca_gpio_info - ljca gpio client device info
+ *
+ * @num: ljca gpio client device pin number
+ * @valid_pin_map: ljca gpio client device valid pin mapping
+ */
+struct ljca_gpio_info {
+	unsigned int num;
+	DECLARE_BITMAP(valid_pin_map, LJCA_MAX_GPIO_NUM);
+};
+
+/**
+ * struct ljca_i2c_info - ljca i2c client device info
+ *
+ * @id: ljca i2c client device identification number
+ * @capacity: ljca i2c client device capacity
+ * @intr_pin: ljca i2c client device interrupt pin number if exists
+ */
+struct ljca_i2c_info {
+	u8 id;
+	u8 capacity;
+	u8 intr_pin;
+};
+
+/**
+ * struct ljca_spi_info - ljca spi client device info
+ *
+ * @id: ljca spi client device identification number
+ * @capacity: ljca spi client device capacity
+ */
+struct ljca_spi_info {
+	u8 id;
+	u8 capacity;
+};
+
+/**
+ * ljca_register_event_cb - register a callback function to receive events
+ *
+ * @client: ljca client device
+ * @event_cb: callback function
+ * @context: execution context of event callback
+ *
+ * Return: 0 in case of success, negative value in case of error
+ */
+int ljca_register_event_cb(struct ljca_client *client, ljca_event_cb_t event_cb, void *context);
+
+/**
+ * ljca_unregister_event_cb - unregister the callback function for an event
+ *
+ * @client: ljca client device
+ */
+void ljca_unregister_event_cb(struct ljca_client *client);
+
+/**
+ * ljca_transfer - issue a LJCA command and wait for a response
+ *
+ * @client: ljca client device
+ * @cmd: the command to be sent to the device
+ * @obuf: the buffer to be sent to the device; it can be NULL if the user
+ *	doesn't need to transmit data with this command
+ * @obuf_len: the size of the buffer to be sent to the device; it should
+ *	be 0 when obuf is NULL
+ * @ibuf: any data associated with the response will be copied here; it can be
+ *	NULL if the user doesn't need the response data
+ * @ibuf_len: must be initialized to the input buffer size
+ *
+ * Return: the actual length of response data for success, negative value for errors
+ */
+int ljca_transfer(struct ljca_client *client, u8 cmd, const u8 *obuf,
+		  u8 obuf_len, u8 *ibuf, u8 ibuf_len);
+
+/**
+ * ljca_transfer_noack - issue a LJCA command without a response
+ *
+ * @client: ljca client device
+ * @cmd: the command to be sent to the device
+ * @obuf: the buffer to be sent to the device; it can be NULL if the user
+ *	doesn't need to transmit data with this command
+ * @obuf_len: the size of the buffer to be sent to the device
+ *
+ * Return: 0 for success, negative value for errors
+ */
+int ljca_transfer_noack(struct ljca_client *client, u8 cmd, const u8 *obuf,
+			u8 obuf_len);
+
+#endif
-- 
cgit v1.2.3


From c4dd854f740c21ae8dd9903fc67969c5497cb14b Mon Sep 17 00:00:00 2001
From: "Russell King (Oracle)" <rmk+kernel@armlinux.org.uk>
Date: Mon, 25 Sep 2023 17:28:39 +0100
Subject: cpu-hotplug: Provide prototypes for arch CPU registration

Provide common prototypes for arch_register_cpu() and
arch_unregister_cpu(). These are called by acpi_processor.c, with weak
versions, so the prototype for this is already set. It is generally not
necessary for function prototypes to be conditional on preprocessor macros.

Some architectures (e.g. Loongarch) are missing the prototype for this, and
rather than add it to Loongarch's asm/cpu.h, do the job once for everyone.

Since this covers everyone, remove the now unnecessary prototypes in
asm/cpu.h, and therefore remove the 'static' from one of ia64's
arch_register_cpu() definitions.

[ tglx: Bring back the ia64 part and remove the ACPI prototypes ]

Signed-off-by: Russell King (Oracle) <rmk+kernel@armlinux.org.uk>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Link: https://lore.kernel.org/r/E1qkoRr-0088Q8-Da@rmk-PC.armlinux.org.uk
---
 arch/ia64/include/asm/cpu.h   | 5 -----
 arch/ia64/kernel/topology.c   | 2 +-
 arch/x86/include/asm/cpu.h    | 2 --
 arch/x86/kernel/topology.c    | 2 +-
 drivers/acpi/acpi_processor.c | 1 +
 include/acpi/processor.h      | 5 -----
 include/linux/cpu.h           | 2 ++
 7 files changed, 5 insertions(+), 14 deletions(-)

(limited to 'include/linux')

diff --git a/arch/ia64/include/asm/cpu.h b/arch/ia64/include/asm/cpu.h
index db125df9e088..642d71675ddb 100644
--- a/arch/ia64/include/asm/cpu.h
+++ b/arch/ia64/include/asm/cpu.h
@@ -15,9 +15,4 @@ DECLARE_PER_CPU(struct ia64_cpu, cpu_devices);
 
 DECLARE_PER_CPU(int, cpu_state);
 
-#ifdef CONFIG_HOTPLUG_CPU
-extern int arch_register_cpu(int num);
-extern void arch_unregister_cpu(int);
-#endif
-
 #endif /* _ASM_IA64_CPU_H_ */
diff --git a/arch/ia64/kernel/topology.c b/arch/ia64/kernel/topology.c
index 94a848b06f15..741863a187a6 100644
--- a/arch/ia64/kernel/topology.c
+++ b/arch/ia64/kernel/topology.c
@@ -59,7 +59,7 @@ void __ref arch_unregister_cpu(int num)
 }
 EXPORT_SYMBOL(arch_unregister_cpu);
 #else
-static int __init arch_register_cpu(int num)
+int __init arch_register_cpu(int num)
 {
 	return register_cpu(&sysfs_cpus[num].cpu, num);
 }
diff --git a/arch/x86/include/asm/cpu.h b/arch/x86/include/asm/cpu.h
index 3a233ebff712..25050d953eee 100644
--- a/arch/x86/include/asm/cpu.h
+++ b/arch/x86/include/asm/cpu.h
@@ -28,8 +28,6 @@ struct x86_cpu {
 };
 
 #ifdef CONFIG_HOTPLUG_CPU
-extern int arch_register_cpu(int num);
-extern void arch_unregister_cpu(int);
 extern void soft_restart_cpu(void);
 #endif
 
diff --git a/arch/x86/kernel/topology.c b/arch/x86/kernel/topology.c
index ca004e2e4469..0bab03130033 100644
--- a/arch/x86/kernel/topology.c
+++ b/arch/x86/kernel/topology.c
@@ -54,7 +54,7 @@ void arch_unregister_cpu(int num)
 EXPORT_SYMBOL(arch_unregister_cpu);
 #else /* CONFIG_HOTPLUG_CPU */
 
-static int __init arch_register_cpu(int num)
+int __init arch_register_cpu(int num)
 {
 	return register_cpu(&per_cpu(cpu_devices, num).cpu, num);
 }
diff --git a/drivers/acpi/acpi_processor.c b/drivers/acpi/acpi_processor.c
index c711db8a9c33..0f5218e361df 100644
--- a/drivers/acpi/acpi_processor.c
+++ b/drivers/acpi/acpi_processor.c
@@ -12,6 +12,7 @@
 #define pr_fmt(fmt) "ACPI: " fmt
 
 #include <linux/acpi.h>
+#include <linux/cpu.h>
 #include <linux/device.h>
 #include <linux/dmi.h>
 #include <linux/kernel.h>
diff --git a/include/acpi/processor.h b/include/acpi/processor.h
index 94181fe9780a..3f34ebb27525 100644
--- a/include/acpi/processor.h
+++ b/include/acpi/processor.h
@@ -465,9 +465,4 @@ extern int acpi_processor_ffh_lpi_probe(unsigned int cpu);
 extern int acpi_processor_ffh_lpi_enter(struct acpi_lpi_state *lpi);
 #endif
 
-#ifdef CONFIG_ACPI_HOTPLUG_CPU
-extern int arch_register_cpu(int cpu);
-extern void arch_unregister_cpu(int cpu);
-#endif
-
 #endif
diff --git a/include/linux/cpu.h b/include/linux/cpu.h
index 0abd60a7987b..eb768a866fe3 100644
--- a/include/linux/cpu.h
+++ b/include/linux/cpu.h
@@ -80,6 +80,8 @@ extern __printf(4, 5)
 struct device *cpu_device_create(struct device *parent, void *drvdata,
 				 const struct attribute_group **groups,
 				 const char *fmt, ...);
+extern int arch_register_cpu(int cpu);
+extern void arch_unregister_cpu(int cpu);
 #ifdef CONFIG_HOTPLUG_CPU
 extern void unregister_cpu(struct cpu *cpu);
 extern ssize_t arch_cpu_probe(const char *, size_t);
-- 
cgit v1.2.3


From 5987279373446e97206a7078b2229446ba871ea0 Mon Sep 17 00:00:00 2001
From: David Lechner <dlechner@baylibre.com>
Date: Thu, 5 Oct 2023 19:50:29 -0500
Subject: iio: event: add optional event label support

This adds a new optional field to struct iio_info to allow drivers to
specify a label for the event. This is useful for cases where there are
many events or the event attribute name is not descriptive enough or
where an event doesn't have any other attributes.

The implementation is based on the existing label support for channels.
So either all events of a device have a label attribute or none do.

Signed-off-by: David Lechner <dlechner@baylibre.com>
Link: https://lore.kernel.org/r/20231005-ad2s1210-mainline-v4-12-ec00746840fc@baylibre.com
Signed-off-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
---
 drivers/iio/industrialio-event.c | 55 ++++++++++++++++++++++++++++++++++++++++
 include/linux/iio/iio.h          |  8 ++++++
 2 files changed, 63 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/iio/industrialio-event.c b/drivers/iio/industrialio-event.c
index 19f7a91157ee..910c1f14abd5 100644
--- a/drivers/iio/industrialio-event.c
+++ b/drivers/iio/industrialio-event.c
@@ -355,6 +355,21 @@ static ssize_t iio_ev_value_store(struct device *dev,
 	return len;
 }
 
+static ssize_t iio_ev_label_show(struct device *dev,
+				 struct device_attribute *attr,
+				 char *buf)
+{
+	struct iio_dev *indio_dev = dev_to_iio_dev(dev);
+	struct iio_dev_attr *this_attr = to_iio_dev_attr(attr);
+
+	if (indio_dev->info->read_event_label)
+		return indio_dev->info->read_event_label(indio_dev,
+				 this_attr->c, iio_ev_attr_type(this_attr),
+				 iio_ev_attr_dir(this_attr), buf);
+
+	return -EINVAL;
+}
+
 static int iio_device_add_event(struct iio_dev *indio_dev,
 	const struct iio_chan_spec *chan, unsigned int spec_index,
 	enum iio_event_type type, enum iio_event_direction dir,
@@ -411,6 +426,41 @@ static int iio_device_add_event(struct iio_dev *indio_dev,
 	return attrcount;
 }
 
+static int iio_device_add_event_label(struct iio_dev *indio_dev,
+				      const struct iio_chan_spec *chan,
+				      unsigned int spec_index,
+				      enum iio_event_type type,
+				      enum iio_event_direction dir)
+{
+	struct iio_dev_opaque *iio_dev_opaque = to_iio_dev_opaque(indio_dev);
+	char *postfix;
+	int ret;
+
+	if (!indio_dev->info->read_event_label)
+		return 0;
+
+	if (dir != IIO_EV_DIR_NONE)
+		postfix = kasprintf(GFP_KERNEL, "%s_%s_label",
+				iio_ev_type_text[type],
+				iio_ev_dir_text[dir]);
+	else
+		postfix = kasprintf(GFP_KERNEL, "%s_label",
+				iio_ev_type_text[type]);
+	if (postfix == NULL)
+		return -ENOMEM;
+
+	ret = __iio_add_chan_devattr(postfix, chan, &iio_ev_label_show, NULL,
+				spec_index, IIO_SEPARATE, &indio_dev->dev, NULL,
+				&iio_dev_opaque->event_interface->dev_attr_list);
+
+	kfree(postfix);
+
+	if (ret < 0)
+		return ret;
+
+	return 1;
+}
+
 static int iio_device_add_event_sysfs(struct iio_dev *indio_dev,
 	struct iio_chan_spec const *chan)
 {
@@ -448,6 +498,11 @@ static int iio_device_add_event_sysfs(struct iio_dev *indio_dev,
 		if (ret < 0)
 			return ret;
 		attrcount += ret;
+
+		ret = iio_device_add_event_label(indio_dev, chan, i, type, dir);
+		if (ret < 0)
+			return ret;
+		attrcount += ret;
 	}
 	ret = attrcount;
 	return ret;
diff --git a/include/linux/iio/iio.h b/include/linux/iio/iio.h
index 7bfa1b9bc8a2..d0ce3b71106a 100644
--- a/include/linux/iio/iio.h
+++ b/include/linux/iio/iio.h
@@ -427,6 +427,8 @@ struct iio_trigger; /* forward declaration */
  * @write_event_config:	set if the event is enabled.
  * @read_event_value:	read a configuration value associated with the event.
  * @write_event_value:	write a configuration value for the event.
+ * @read_event_label:	function to request label name for a specified label,
+ *			for better event identification.
  * @validate_trigger:	function to validate the trigger when the
  *			current trigger gets changed.
  * @update_scan_mode:	function to configure device and scan buffer when
@@ -511,6 +513,12 @@ struct iio_info {
 				 enum iio_event_direction dir,
 				 enum iio_event_info info, int val, int val2);
 
+	int (*read_event_label)(struct iio_dev *indio_dev,
+				struct iio_chan_spec const *chan,
+				enum iio_event_type type,
+				enum iio_event_direction dir,
+				char *label);
+
 	int (*validate_trigger)(struct iio_dev *indio_dev,
 				struct iio_trigger *trig);
 	int (*update_scan_mode)(struct iio_dev *indio_dev,
-- 
cgit v1.2.3


From 21ca59b365c091d583f36ac753eaa8baf947be6f Mon Sep 17 00:00:00 2001
From: Christian Brauner <christian.brauner@ubuntu.com>
Date: Thu, 28 Oct 2021 12:31:14 +0200
Subject: binfmt_misc: enable sandboxed mounts

Enable unprivileged sandboxes to create their own binfmt_misc mounts.
This is based on Laurent's work in [1] but has been significantly
reworked to fix various issues we identified in earlier versions.

While binfmt_misc can currently only be mounted in the initial user
namespace, binary types registered in this binfmt_misc instance are
available to all sandboxes (Either by having them installed in the
sandbox or by registering the binary type with the F flag causing the
interpreter to be opened right away). So binfmt_misc binary types are
already delegated to sandboxes implicitly.

However, while a sandbox has access to all registered binary types in
binfmt_misc a sandbox cannot currently register its own binary types
in binfmt_misc. This has prevented various use-cases some of which were
already outlined in [1] but we have a range of issues associated with
this (cf. [3]-[5] below which are just a small sample).

Extend binfmt_misc to be mountable in non-initial user namespaces.
Similar to other filesystem such as nfsd, mqueue, and sunrpc we use
keyed superblock management. The key determines whether we need to
create a new superblock or can reuse an already existing one. We use the
user namespace of the mount as key. This means a new binfmt_misc
superblock is created once per user namespace creation. Subsequent
mounts of binfmt_misc in the same user namespace will mount the same
binfmt_misc instance. We explicitly do not create a new binfmt_misc
superblock on every binfmt_misc mount as the semantics for
load_misc_binary() line up with the keying model. This also allows us to
retrieve the relevant binfmt_misc instance based on the caller's user
namespace which can be done in a simple (bounded to 32 levels) loop.

Similar to the current binfmt_misc semantics allowing access to the
binary types in the initial binfmt_misc instance we do allow sandboxes
access to their parent's binfmt_misc mounts if they do not have created
a separate binfmt_misc instance.

Overall, this will unblock the use-cases mentioned below and in general
will also allow to support and harden execution of another
architecture's binaries in tight sandboxes. For instance, using the
unshare binary it possible to start a chroot of another architecture and
configure the binfmt_misc interpreter without being root to run the
binaries in this chroot and without requiring the host to modify its
binary type handlers.

Henning had already posted a few experiments in the cover letter at [1].
But here's an additional example where an unprivileged container
registers qemu-user-static binary handlers for various binary types in
its separate binfmt_misc mount and is then seamlessly able to start
containers with a different architecture without affecting the host:

root    [lxc monitor] /var/snap/lxd/common/lxd/containers f1
1000000  \_ /sbin/init
1000000      \_ /lib/systemd/systemd-journald
1000000      \_ /lib/systemd/systemd-udevd
1000100      \_ /lib/systemd/systemd-networkd
1000101      \_ /lib/systemd/systemd-resolved
1000000      \_ /usr/sbin/cron -f
1000103      \_ /usr/bin/dbus-daemon --system --address=systemd: --nofork --nopidfile --systemd-activation --syslog-only
1000000      \_ /usr/bin/python3 /usr/bin/networkd-dispatcher --run-startup-triggers
1000104      \_ /usr/sbin/rsyslogd -n -iNONE
1000000      \_ /lib/systemd/systemd-logind
1000000      \_ /sbin/agetty -o -p -- \u --noclear --keep-baud console 115200,38400,9600 vt220
1000107      \_ dnsmasq --conf-file=/dev/null -u lxc-dnsmasq --strict-order --bind-interfaces --pid-file=/run/lxc/dnsmasq.pid --liste
1000000      \_ [lxc monitor] /var/lib/lxc f1-s390x
1100000          \_ /usr/bin/qemu-s390x-static /sbin/init
1100000              \_ /usr/bin/qemu-s390x-static /lib/systemd/systemd-journald
1100000              \_ /usr/bin/qemu-s390x-static /usr/sbin/cron -f
1100103              \_ /usr/bin/qemu-s390x-static /usr/bin/dbus-daemon --system --address=systemd: --nofork --nopidfile --systemd-ac
1100000              \_ /usr/bin/qemu-s390x-static /usr/bin/python3 /usr/bin/networkd-dispatcher --run-startup-triggers
1100104              \_ /usr/bin/qemu-s390x-static /usr/sbin/rsyslogd -n -iNONE
1100000              \_ /usr/bin/qemu-s390x-static /lib/systemd/systemd-logind
1100000              \_ /usr/bin/qemu-s390x-static /sbin/agetty -o -p -- \u --noclear --keep-baud console 115200,38400,9600 vt220
1100000              \_ /usr/bin/qemu-s390x-static /sbin/agetty -o -p -- \u --noclear --keep-baud pts/0 115200,38400,9600 vt220
1100000              \_ /usr/bin/qemu-s390x-static /sbin/agetty -o -p -- \u --noclear --keep-baud pts/1 115200,38400,9600 vt220
1100000              \_ /usr/bin/qemu-s390x-static /sbin/agetty -o -p -- \u --noclear --keep-baud pts/2 115200,38400,9600 vt220
1100000              \_ /usr/bin/qemu-s390x-static /sbin/agetty -o -p -- \u --noclear --keep-baud pts/3 115200,38400,9600 vt220
1100000              \_ /usr/bin/qemu-s390x-static /lib/systemd/systemd-udevd

[1]: https://lore.kernel.org/all/20191216091220.465626-1-laurent@vivier.eu
[2]: https://discuss.linuxcontainers.org/t/binfmt-misc-permission-denied
[3]: https://discuss.linuxcontainers.org/t/lxd-binfmt-support-for-qemu-static-interpreters
[4]: https://discuss.linuxcontainers.org/t/3-1-0-binfmt-support-service-in-unprivileged-guest-requires-write-access-on-hosts-proc-sys-fs-binfmt-misc
[5]: https://discuss.linuxcontainers.org/t/qemu-user-static-not-working-4-11

Link: https://lore.kernel.org/r/20191216091220.465626-2-laurent@vivier.eu (origin)
Link: https://lore.kernel.org/r/20211028103114.2849140-2-brauner@kernel.org (v1)
Cc: Sargun Dhillon <sargun@sargun.me>
Cc: Serge Hallyn <serge@hallyn.com>
Cc: Jann Horn <jannh@google.com>
Cc: Henning Schild <henning.schild@siemens.com>
Cc: Andrei Vagin <avagin@gmail.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Laurent Vivier <laurent@vivier.eu>
Cc: linux-fsdevel@vger.kernel.org
Signed-off-by: Laurent Vivier <laurent@vivier.eu>
Signed-off-by: Christian Brauner <christian.brauner@ubuntu.com>
Signed-off-by: Christian Brauner <brauner@kernel.org>
Signed-off-by: Kees Cook <keescook@chromium.org>
---
/* v2 */
- Serge Hallyn <serge@hallyn.com>:
  - Use GFP_KERNEL_ACCOUNT for userspace triggered allocations when a
    new binary type handler is registered.
- Christian Brauner <christian.brauner@ubuntu.com>:
  - Switch authorship to me. I refused to do that earlier even though
    Laurent said I should do so because I think it's genuinely bad form.
    But by now I have changed so many things that it'd be unfair to
    blame Laurent for any potential bugs in here.
  - Add more comments that explain what's going on.
  - Rename functions while changing them to better reflect what they are
    doing to make the code easier to understand.
  - In the first version when a specific binary type handler was removed
    either through a write to the entry's file or all binary type
    handlers were removed by a write to the binfmt_misc mount's status
    file all cleanup work happened during inode eviction.
    That includes removal of the relevant entries from entry list. While
    that works fine I disliked that model after thinking about it for a
    bit. Because it means that there was a window were someone has
    already removed a or all binary handlers but they could still be
    safely reached from load_misc_binary() when it has managed to take
    the read_lock() on the entries list while inode eviction was already
    happening. Again, that perfectly benign but it's cleaner to remove
    the binary handler from the list immediately meaning that ones the
    write to then entry's file or the binfmt_misc status file returns
    the binary type cannot be executed anymore. That gives stronger
    guarantees to the user.
---
 fs/binfmt_misc.c               | 202 ++++++++++++++++++++++++++++++++++-------
 include/linux/binfmts.h        |  10 ++
 include/linux/user_namespace.h |   8 ++
 kernel/user.c                  |  13 +++
 kernel/user_namespace.c        |   3 +
 5 files changed, 202 insertions(+), 34 deletions(-)

(limited to 'include/linux')

diff --git a/fs/binfmt_misc.c b/fs/binfmt_misc.c
index cf5ed5cd4102..deacc105119d 100644
--- a/fs/binfmt_misc.c
+++ b/fs/binfmt_misc.c
@@ -40,9 +40,6 @@ enum {
 	VERBOSE_STATUS = 1 /* make it zero to save 400 bytes kernel memory */
 };
 
-static LIST_HEAD(entries);
-static int enabled = 1;
-
 enum {Enabled, Magic};
 #define MISC_FMT_PRESERVE_ARGV0 (1UL << 31)
 #define MISC_FMT_OPEN_BINARY (1UL << 30)
@@ -63,7 +60,6 @@ typedef struct {
 	refcount_t users;		/* sync removal with load_misc_binary() */
 } Node;
 
-static DEFINE_RWLOCK(entries_lock);
 static struct file_system_type bm_fs_type;
 
 /*
@@ -91,13 +87,14 @@ static struct file_system_type bm_fs_type;
  *
  * Return: binary type list entry on success, NULL on failure
  */
-static Node *search_binfmt_handler(struct linux_binprm *bprm)
+static Node *search_binfmt_handler(struct binfmt_misc *misc,
+				   struct linux_binprm *bprm)
 {
 	char *p = strrchr(bprm->interp, '.');
 	Node *e;
 
 	/* Walk all the registered handlers. */
-	list_for_each_entry(e, &entries, list) {
+	list_for_each_entry(e, &misc->entries, list) {
 		char *s;
 		int j;
 
@@ -140,15 +137,16 @@ static Node *search_binfmt_handler(struct linux_binprm *bprm)
  *
  * Return: binary type list entry on success, NULL on failure
  */
-static Node *get_binfmt_handler(struct linux_binprm *bprm)
+static Node *get_binfmt_handler(struct binfmt_misc *misc,
+				struct linux_binprm *bprm)
 {
 	Node *e;
 
-	read_lock(&entries_lock);
-	e = search_binfmt_handler(bprm);
+	read_lock(&misc->entries_lock);
+	e = search_binfmt_handler(misc, bprm);
 	if (e)
 		refcount_inc(&e->users);
-	read_unlock(&entries_lock);
+	read_unlock(&misc->entries_lock);
 	return e;
 }
 
@@ -169,6 +167,35 @@ static void put_binfmt_handler(Node *e)
 	}
 }
 
+/**
+ * load_binfmt_misc - load the binfmt_misc of the caller's user namespace
+ *
+ * To be called in load_misc_binary() to load the relevant struct binfmt_misc.
+ * If a user namespace doesn't have its own binfmt_misc mount it can make use
+ * of its ancestor's binfmt_misc handlers. This mimicks the behavior of
+ * pre-namespaced binfmt_misc where all registered binfmt_misc handlers where
+ * available to all user and user namespaces on the system.
+ *
+ * Return: the binfmt_misc instance of the caller's user namespace
+ */
+static struct binfmt_misc *load_binfmt_misc(void)
+{
+	const struct user_namespace *user_ns;
+	struct binfmt_misc *misc;
+
+	user_ns = current_user_ns();
+	while (user_ns) {
+		/* Pairs with smp_store_release() in bm_fill_super(). */
+		misc = smp_load_acquire(&user_ns->binfmt_misc);
+		if (misc)
+			return misc;
+
+		user_ns = user_ns->parent;
+	}
+
+	return &init_binfmt_misc;
+}
+
 /*
  * the loader itself
  */
@@ -176,13 +203,14 @@ static int load_misc_binary(struct linux_binprm *bprm)
 {
 	Node *fmt;
 	struct file *interp_file = NULL;
-	int retval;
+	int retval = -ENOEXEC;
+	struct binfmt_misc *misc;
 
-	retval = -ENOEXEC;
-	if (!enabled)
+	misc = load_binfmt_misc();
+	if (!misc->enabled)
 		return retval;
 
-	fmt = get_binfmt_handler(bprm);
+	fmt = get_binfmt_handler(misc, bprm);
 	if (!fmt)
 		return retval;
 
@@ -240,9 +268,9 @@ ret:
 	/*
 	 * If we actually put the node here all concurrent calls to
 	 * load_misc_binary() will have finished. We also know
-	 * that for the refcount to be zero ->evict_inode() must have removed
-	 * the node to be deleted from the list. All that is left for us is to
-	 * close and free.
+	 * that for the refcount to be zero someone must have concurently
+	 * removed the binary type handler from the list and it's our job to
+	 * free it.
 	 */
 	put_binfmt_handler(fmt);
 
@@ -334,7 +362,7 @@ static Node *create_entry(const char __user *buffer, size_t count)
 
 	err = -ENOMEM;
 	memsize = sizeof(Node) + count + 8;
-	e = kmalloc(memsize, GFP_KERNEL);
+	e = kmalloc(memsize, GFP_KERNEL_ACCOUNT);
 	if (!e)
 		goto out;
 
@@ -446,7 +474,7 @@ static Node *create_entry(const char __user *buffer, size_t count)
 
 			if (e->mask) {
 				int i;
-				char *masked = kmalloc(e->size, GFP_KERNEL);
+				char *masked = kmalloc(e->size, GFP_KERNEL_ACCOUNT);
 
 				print_hex_dump_bytes(
 					KBUILD_MODNAME ": register:  mask[decoded]: ",
@@ -599,6 +627,22 @@ static struct inode *bm_get_inode(struct super_block *sb, int mode)
 	return inode;
 }
 
+/**
+ * i_binfmt_misc - retrieve struct binfmt_misc from a binfmt_misc inode
+ * @inode: inode of the relevant binfmt_misc instance
+ *
+ * This helper retrieves struct binfmt_misc from a binfmt_misc inode. This can
+ * be done without any memory barriers because we are guaranteed that
+ * user_ns->binfmt_misc is fully initialized. It was fully initialized when the
+ * binfmt_misc mount was first created.
+ *
+ * Return: struct binfmt_misc of the relevant binfmt_misc instance
+ */
+static struct binfmt_misc *i_binfmt_misc(struct inode *inode)
+{
+	return inode->i_sb->s_user_ns->binfmt_misc;
+}
+
 /**
  * bm_evict_inode - cleanup data associated with @inode
  * @inode: inode to which the data is attached
@@ -619,10 +663,13 @@ static void bm_evict_inode(struct inode *inode)
 	clear_inode(inode);
 
 	if (e) {
-		write_lock(&entries_lock);
+		struct binfmt_misc *misc;
+
+		misc = i_binfmt_misc(inode);
+		write_lock(&misc->entries_lock);
 		if (!list_empty(&e->list))
 			list_del_init(&e->list);
-		write_unlock(&entries_lock);
+		write_unlock(&misc->entries_lock);
 		put_binfmt_handler(e);
 	}
 }
@@ -677,11 +724,11 @@ static void unlink_binfmt_dentry(struct dentry *dentry)
  * to use writes to files in order to delete binary type handlers. But it has
  * worked for so long that it's not a pressing issue.
  */
-static void remove_binfmt_handler(Node *e)
+static void remove_binfmt_handler(struct binfmt_misc *misc, Node *e)
 {
-	write_lock(&entries_lock);
+	write_lock(&misc->entries_lock);
 	list_del_init(&e->list);
-	write_unlock(&entries_lock);
+	write_unlock(&misc->entries_lock);
 	unlink_binfmt_dentry(e->dentry);
 }
 
@@ -737,7 +784,7 @@ static ssize_t bm_entry_write(struct file *file, const char __user *buffer,
 		 * actually remove the entry from the list.
 		 */
 		if (!list_empty(&e->list))
-			remove_binfmt_handler(e);
+			remove_binfmt_handler(i_binfmt_misc(inode), e);
 
 		inode_unlock(inode);
 		break;
@@ -763,6 +810,7 @@ static ssize_t bm_register_write(struct file *file, const char __user *buffer,
 	struct inode *inode;
 	struct super_block *sb = file_inode(file)->i_sb;
 	struct dentry *root = sb->s_root, *dentry;
+	struct binfmt_misc *misc;
 	int err = 0;
 	struct file *f = NULL;
 
@@ -772,7 +820,18 @@ static ssize_t bm_register_write(struct file *file, const char __user *buffer,
 		return PTR_ERR(e);
 
 	if (e->flags & MISC_FMT_OPEN_FILE) {
+		const struct cred *old_cred;
+
+		/*
+		 * Now that we support unprivileged binfmt_misc mounts make
+		 * sure we use the credentials that the register @file was
+		 * opened with to also open the interpreter. Before that this
+		 * didn't matter much as only a privileged process could open
+		 * the register file.
+		 */
+		old_cred = override_creds(file->f_cred);
 		f = open_exec(e->interpreter);
+		revert_creds(old_cred);
 		if (IS_ERR(f)) {
 			pr_notice("register: failed to install interpreter file %s\n",
 				 e->interpreter);
@@ -804,9 +863,10 @@ static ssize_t bm_register_write(struct file *file, const char __user *buffer,
 	inode->i_fop = &bm_entry_operations;
 
 	d_instantiate(dentry, inode);
-	write_lock(&entries_lock);
-	list_add(&e->list, &entries);
-	write_unlock(&entries_lock);
+	misc = i_binfmt_misc(inode);
+	write_lock(&misc->entries_lock);
+	list_add(&e->list, &misc->entries);
+	write_unlock(&misc->entries_lock);
 
 	err = 0;
 out2:
@@ -833,26 +893,31 @@ static const struct file_operations bm_register_operations = {
 static ssize_t
 bm_status_read(struct file *file, char __user *buf, size_t nbytes, loff_t *ppos)
 {
-	char *s = enabled ? "enabled\n" : "disabled\n";
+	struct binfmt_misc *misc;
+	char *s;
 
+	misc = i_binfmt_misc(file_inode(file));
+	s = misc->enabled ? "enabled\n" : "disabled\n";
 	return simple_read_from_buffer(buf, nbytes, ppos, s, strlen(s));
 }
 
 static ssize_t bm_status_write(struct file *file, const char __user *buffer,
 		size_t count, loff_t *ppos)
 {
+	struct binfmt_misc *misc;
 	int res = parse_command(buffer, count);
 	Node *e, *next;
 	struct inode *inode;
 
+	misc = i_binfmt_misc(file_inode(file));
 	switch (res) {
 	case 1:
 		/* Disable all handlers. */
-		enabled = 0;
+		misc->enabled = false;
 		break;
 	case 2:
 		/* Enable all handlers. */
-		enabled = 1;
+		misc->enabled = true;
 		break;
 	case 3:
 		/* Delete all handlers. */
@@ -868,8 +933,8 @@ static ssize_t bm_status_write(struct file *file, const char __user *buffer,
 		 * read-only. So we only need to take the write lock when we
 		 * actually remove the entry from the list.
 		 */
-		list_for_each_entry_safe(e, next, &entries, list)
-			remove_binfmt_handler(e);
+		list_for_each_entry_safe(e, next, &misc->entries, list)
+			remove_binfmt_handler(misc, e);
 
 		inode_unlock(inode);
 		break;
@@ -888,32 +953,100 @@ static const struct file_operations bm_status_operations = {
 
 /* Superblock handling */
 
+static void bm_put_super(struct super_block *sb)
+{
+	struct user_namespace *user_ns = sb->s_fs_info;
+
+	sb->s_fs_info = NULL;
+	put_user_ns(user_ns);
+}
+
 static const struct super_operations s_ops = {
 	.statfs		= simple_statfs,
 	.evict_inode	= bm_evict_inode,
+	.put_super	= bm_put_super,
 };
 
 static int bm_fill_super(struct super_block *sb, struct fs_context *fc)
 {
 	int err;
+	struct user_namespace *user_ns = sb->s_user_ns;
+	struct binfmt_misc *misc;
 	static const struct tree_descr bm_files[] = {
 		[2] = {"status", &bm_status_operations, S_IWUSR|S_IRUGO},
 		[3] = {"register", &bm_register_operations, S_IWUSR},
 		/* last one */ {""}
 	};
 
+	if (WARN_ON(user_ns != current_user_ns()))
+		return -EINVAL;
+
+	/*
+	 * Lazily allocate a new binfmt_misc instance for this namespace, i.e.
+	 * do it here during the first mount of binfmt_misc. We don't need to
+	 * waste memory for every user namespace allocation. It's likely much
+	 * more common to not mount a separate binfmt_misc instance than it is
+	 * to mount one.
+	 *
+	 * While multiple superblocks can exist they are keyed by userns in
+	 * s_fs_info for binfmt_misc. Hence, the vfs guarantees that
+	 * bm_fill_super() is called exactly once whenever a binfmt_misc
+	 * superblock for a userns is created. This in turn lets us conclude
+	 * that when a binfmt_misc superblock is created for the first time for
+	 * a userns there's no one racing us. Therefore we don't need any
+	 * barriers when we dereference binfmt_misc.
+	 */
+	misc = user_ns->binfmt_misc;
+	if (!misc) {
+		/*
+		 * If it turns out that most user namespaces actually want to
+		 * register their own binary type handler and therefore all
+		 * create their own separate binfm_misc mounts we should
+		 * consider turning this into a kmem cache.
+		 */
+		misc = kzalloc(sizeof(struct binfmt_misc), GFP_KERNEL);
+		if (!misc)
+			return -ENOMEM;
+
+		INIT_LIST_HEAD(&misc->entries);
+		rwlock_init(&misc->entries_lock);
+
+		/* Pairs with smp_load_acquire() in load_binfmt_misc(). */
+		smp_store_release(&user_ns->binfmt_misc, misc);
+	}
+
+	/*
+	 * When the binfmt_misc superblock for this userns is shutdown
+	 * ->enabled might have been set to false and we don't reinitialize
+	 * ->enabled again in put_super() as someone might already be mounting
+	 * binfmt_misc again. It also would be pointless since by the time
+	 * ->put_super() is called we know that the binary type list for this
+	 * bintfmt_misc mount is empty making load_misc_binary() return
+	 * -ENOEXEC independent of whether ->enabled is true. Instead, if
+	 * someone mounts binfmt_misc for the first time or again we simply
+	 * reset ->enabled to true.
+	 */
+	misc->enabled = true;
+
 	err = simple_fill_super(sb, BINFMTFS_MAGIC, bm_files);
 	if (!err)
 		sb->s_op = &s_ops;
 	return err;
 }
 
+static void bm_free(struct fs_context *fc)
+{
+	if (fc->s_fs_info)
+		put_user_ns(fc->s_fs_info);
+}
+
 static int bm_get_tree(struct fs_context *fc)
 {
-	return get_tree_single(fc, bm_fill_super);
+	return get_tree_keyed(fc, bm_fill_super, get_user_ns(fc->user_ns));
 }
 
 static const struct fs_context_operations bm_context_ops = {
+	.free		= bm_free,
 	.get_tree	= bm_get_tree,
 };
 
@@ -932,6 +1065,7 @@ static struct file_system_type bm_fs_type = {
 	.owner		= THIS_MODULE,
 	.name		= "binfmt_misc",
 	.init_fs_context = bm_init_fs_context,
+	.fs_flags	= FS_USERNS_MOUNT,
 	.kill_sb	= kill_litter_super,
 };
 MODULE_ALIAS_FS("binfmt_misc");
diff --git a/include/linux/binfmts.h b/include/linux/binfmts.h
index 8d51f69f9f5e..70f97f685bff 100644
--- a/include/linux/binfmts.h
+++ b/include/linux/binfmts.h
@@ -90,6 +90,16 @@ struct linux_binfmt {
 #endif
 } __randomize_layout;
 
+#if IS_ENABLED(CONFIG_BINFMT_MISC)
+struct binfmt_misc {
+	struct list_head entries;
+	rwlock_t entries_lock;
+	bool enabled;
+} __randomize_layout;
+
+extern struct binfmt_misc init_binfmt_misc;
+#endif
+
 extern void __register_binfmt(struct linux_binfmt *fmt, int insert);
 
 /* Registration of default binfmt handlers */
diff --git a/include/linux/user_namespace.h b/include/linux/user_namespace.h
index 45f09bec02c4..6030a8235617 100644
--- a/include/linux/user_namespace.h
+++ b/include/linux/user_namespace.h
@@ -65,6 +65,10 @@ enum rlimit_type {
 	UCOUNT_RLIMIT_COUNTS,
 };
 
+#if IS_ENABLED(CONFIG_BINFMT_MISC)
+struct binfmt_misc;
+#endif
+
 struct user_namespace {
 	struct uid_gid_map	uid_map;
 	struct uid_gid_map	gid_map;
@@ -102,6 +106,10 @@ struct user_namespace {
 	struct ucounts		*ucounts;
 	long ucount_max[UCOUNT_COUNTS];
 	long rlimit_max[UCOUNT_RLIMIT_COUNTS];
+
+#if IS_ENABLED(CONFIG_BINFMT_MISC)
+	struct binfmt_misc *binfmt_misc;
+#endif
 } __randomize_layout;
 
 struct ucounts {
diff --git a/kernel/user.c b/kernel/user.c
index d667debeafd6..03cedc366dc9 100644
--- a/kernel/user.c
+++ b/kernel/user.c
@@ -18,8 +18,18 @@
 #include <linux/interrupt.h>
 #include <linux/export.h>
 #include <linux/user_namespace.h>
+#include <linux/binfmts.h>
 #include <linux/proc_ns.h>
 
+#if IS_ENABLED(CONFIG_BINFMT_MISC)
+struct binfmt_misc init_binfmt_misc = {
+	.entries = LIST_HEAD_INIT(init_binfmt_misc.entries),
+	.enabled = true,
+	.entries_lock = __RW_LOCK_UNLOCKED(init_binfmt_misc.entries_lock),
+};
+EXPORT_SYMBOL_GPL(init_binfmt_misc);
+#endif
+
 /*
  * userns count is 1 for root user, 1 for init_uts_ns,
  * and 1 for... ?
@@ -67,6 +77,9 @@ struct user_namespace init_user_ns = {
 	.keyring_name_list = LIST_HEAD_INIT(init_user_ns.keyring_name_list),
 	.keyring_sem = __RWSEM_INITIALIZER(init_user_ns.keyring_sem),
 #endif
+#if IS_ENABLED(CONFIG_BINFMT_MISC)
+	.binfmt_misc = &init_binfmt_misc,
+#endif
 };
 EXPORT_SYMBOL_GPL(init_user_ns);
 
diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c
index 1d8e47bed3f1..d52a894ecf57 100644
--- a/kernel/user_namespace.c
+++ b/kernel/user_namespace.c
@@ -213,6 +213,9 @@ static void free_user_ns(struct work_struct *work)
 			kfree(ns->projid_map.forward);
 			kfree(ns->projid_map.reverse);
 		}
+#if IS_ENABLED(CONFIG_BINFMT_MISC)
+		kfree(ns->binfmt_misc);
+#endif
 		retire_userns_sysctls(ns);
 		key_free_user_ns(ns);
 		ns_free_inum(&ns->ns);
-- 
cgit v1.2.3


From f6d7f050e258e3c71e310f5167c4d65bbefaeb31 Mon Sep 17 00:00:00 2001
From: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Date: Tue, 10 Oct 2023 19:31:00 +0300
Subject: spi: Don't use flexible array in struct spi_message definition

The struct spi_message can be embedded into another structures.
With that the flexible array might be problematic as sparse
complains about it, although there is no real issue in the code
because when the message is embedded it doesn't use flexible array
member. That memeber is a private to spi_message_alloc() API, so
move it to that API in a form of an inherited data type.

Reported-by: Marc Kleine-Budde <mkl@pengutronix.de>
Fixes: 75e308ffc4f0 ("spi: Use struct_size() helper"))
Closes: https://lore.kernel.org/r/20231009-onshore-underage-c58415adfd92-mkl@pengutronix.de
Signed-off-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Reviewed-by: Marc Kleine-Budde <mkl@pengutronix.de>
Link: https://lore.kernel.org/r/20231010163100.89734-1-andriy.shevchenko@linux.intel.com
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 include/linux/spi/spi.h | 27 +++++++++++++--------------
 1 file changed, 13 insertions(+), 14 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/spi/spi.h b/include/linux/spi/spi.h
index 7f8b478fdeb3..487da1f6e4b7 100644
--- a/include/linux/spi/spi.h
+++ b/include/linux/spi/spi.h
@@ -1086,8 +1086,6 @@ struct spi_transfer {
  * @state: for use by whichever driver currently owns the message
  * @resources: for resource management when the SPI message is processed
  * @prepared: spi_prepare_message was called for the this message
- * @t: for use with spi_message_alloc() when message and transfers have
- *	been allocated together
  *
  * A @spi_message is used to execute an atomic sequence of data transfers,
  * each represented by a struct spi_transfer.  The sequence is "atomic"
@@ -1142,9 +1140,6 @@ struct spi_message {
 
 	/* List of spi_res resources when the SPI message is processed */
 	struct list_head        resources;
-
-	/* For embedding transfers into the memory of the message */
-	struct spi_transfer	t[];
 };
 
 static inline void spi_message_init_no_memset(struct spi_message *m)
@@ -1203,17 +1198,21 @@ struct spi_transfer *xfers, unsigned int num_xfers)
  */
 static inline struct spi_message *spi_message_alloc(unsigned ntrans, gfp_t flags)
 {
-	struct spi_message *m;
+	struct spi_message_with_transfers {
+		struct spi_message m;
+		struct spi_transfer t[];
+	} *mwt;
+	unsigned i;
+
+	mwt = kzalloc(struct_size(mwt, t, ntrans), flags);
+	if (!mwt)
+		return NULL;
 
-	m = kzalloc(struct_size(m, t, ntrans), flags);
-	if (m) {
-		unsigned i;
+	spi_message_init_no_memset(&mwt->m);
+	for (i = 0; i < ntrans; i++)
+		spi_message_add_tail(&mwt->t[i], &mwt->m);
 
-		spi_message_init_no_memset(m);
-		for (i = 0; i < ntrans; i++)
-			spi_message_add_tail(&m->t[i], m);
-	}
-	return m;
+	return &mwt->m;
 }
 
 static inline void spi_message_free(struct spi_message *m)
-- 
cgit v1.2.3


From 9d77eb52778499a97cab662aa96de4e2e4fa72d3 Mon Sep 17 00:00:00 2001
From: Hannes Reinecke <hare@suse.de>
Date: Thu, 24 Aug 2023 16:39:08 +0200
Subject: nvme-keyring: register '.nvme' keyring

Register a '.nvme' keyring to hold keys for TLS and DH-HMAC-CHAP and
add a new config option NVME_KEYRING.
We need a separate keyring for NVMe as the configuration is done
via individual commands (eg for configfs), and the usual per-session
or per-process keyrings can't be used.

Signed-off-by: Hannes Reinecke <hare@suse.de>
Reviewed-by: Sagi Grimberg <sagi@grimberg.me>
Signed-off-by: Keith Busch <kbusch@kernel.org>
---
 drivers/nvme/common/Kconfig   |  4 ++++
 drivers/nvme/common/Makefile  |  3 ++-
 drivers/nvme/common/keyring.c | 40 ++++++++++++++++++++++++++++++++++++++++
 drivers/nvme/host/core.c      | 10 ++++++++--
 include/linux/nvme-keyring.h  | 28 ++++++++++++++++++++++++++++
 5 files changed, 82 insertions(+), 3 deletions(-)
 create mode 100644 drivers/nvme/common/keyring.c
 create mode 100644 include/linux/nvme-keyring.h

(limited to 'include/linux')

diff --git a/drivers/nvme/common/Kconfig b/drivers/nvme/common/Kconfig
index 4514f44362dd..641b27adb047 100644
--- a/drivers/nvme/common/Kconfig
+++ b/drivers/nvme/common/Kconfig
@@ -2,3 +2,7 @@
 
 config NVME_COMMON
        tristate
+
+config NVME_KEYRING
+       bool
+       select KEYS
diff --git a/drivers/nvme/common/Makefile b/drivers/nvme/common/Makefile
index 720c625b8a52..0cbd0b0b8d49 100644
--- a/drivers/nvme/common/Makefile
+++ b/drivers/nvme/common/Makefile
@@ -4,4 +4,5 @@ ccflags-y			+= -I$(src)
 
 obj-$(CONFIG_NVME_COMMON)	+= nvme-common.o
 
-nvme-common-y			+= auth.o
+nvme-common-$(CONFIG_NVME_AUTH)	+= auth.o
+nvme-common-$(CONFIG_NVME_KEYRING) += keyring.o
diff --git a/drivers/nvme/common/keyring.c b/drivers/nvme/common/keyring.c
new file mode 100644
index 000000000000..5cf64b278119
--- /dev/null
+++ b/drivers/nvme/common/keyring.c
@@ -0,0 +1,40 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2023 Hannes Reinecke, SUSE Labs
+ */
+
+#include <linux/module.h>
+#include <linux/seq_file.h>
+#include <linux/key-type.h>
+#include <keys/user-type.h>
+#include <linux/nvme.h>
+
+static struct key *nvme_keyring;
+
+key_serial_t nvme_keyring_id(void)
+{
+	return nvme_keyring->serial;
+}
+EXPORT_SYMBOL_GPL(nvme_keyring_id);
+
+int nvme_keyring_init(void)
+{
+	nvme_keyring = keyring_alloc(".nvme",
+				     GLOBAL_ROOT_UID, GLOBAL_ROOT_GID,
+				     current_cred(),
+				     (KEY_POS_ALL & ~KEY_POS_SETATTR) |
+				     (KEY_USR_ALL & ~KEY_USR_SETATTR),
+				     KEY_ALLOC_NOT_IN_QUOTA, NULL, NULL);
+	if (IS_ERR(nvme_keyring))
+		return PTR_ERR(nvme_keyring);
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(nvme_keyring_init);
+
+void nvme_keyring_exit(void)
+{
+	key_revoke(nvme_keyring);
+	key_put(nvme_keyring);
+}
+EXPORT_SYMBOL_GPL(nvme_keyring_exit);
diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index 21783aa2ee8e..a49b65d34cda 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -25,6 +25,7 @@
 #include "nvme.h"
 #include "fabrics.h"
 #include <linux/nvme-auth.h>
+#include <linux/nvme-keyring.h>
 
 #define CREATE_TRACE_POINTS
 #include "trace.h"
@@ -4723,12 +4724,16 @@ static int __init nvme_core_init(void)
 		result = PTR_ERR(nvme_ns_chr_class);
 		goto unregister_generic_ns;
 	}
-
-	result = nvme_init_auth();
+	result = nvme_keyring_init();
 	if (result)
 		goto destroy_ns_chr;
+	result = nvme_init_auth();
+	if (result)
+		goto keyring_exit;
 	return 0;
 
+keyring_exit:
+	nvme_keyring_exit();
 destroy_ns_chr:
 	class_destroy(nvme_ns_chr_class);
 unregister_generic_ns:
@@ -4752,6 +4757,7 @@ out:
 static void __exit nvme_core_exit(void)
 {
 	nvme_exit_auth();
+	nvme_keyring_exit();
 	class_destroy(nvme_ns_chr_class);
 	class_destroy(nvme_subsys_class);
 	class_destroy(nvme_class);
diff --git a/include/linux/nvme-keyring.h b/include/linux/nvme-keyring.h
new file mode 100644
index 000000000000..32bd264a71e6
--- /dev/null
+++ b/include/linux/nvme-keyring.h
@@ -0,0 +1,28 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright (c) 2023 Hannes Reinecke, SUSE Labs
+ */
+
+#ifndef _NVME_KEYRING_H
+#define _NVME_KEYRING_H
+
+#ifdef CONFIG_NVME_KEYRING
+
+key_serial_t nvme_keyring_id(void);
+int nvme_keyring_init(void);
+void nvme_keyring_exit(void);
+
+#else
+
+static inline key_serial_t nvme_keyring_id(void)
+{
+	return 0;
+}
+static inline int nvme_keyring_init(void)
+{
+	return 0;
+}
+static inline void nvme_keyring_exit(void) {}
+
+#endif /* !CONFIG_NVME_KEYRING */
+#endif /* _NVME_KEYRING_H */
-- 
cgit v1.2.3


From 646f45b23218c7a97a84259d8eeb22dad5711fc8 Mon Sep 17 00:00:00 2001
From: Hannes Reinecke <hare@suse.de>
Date: Thu, 24 Aug 2023 16:39:10 +0200
Subject: nvme: add TCP TSAS definitions

Signed-off-by: Hannes Reinecke <hare@suse.de>
Reviewed-by: Sagi Grimberg <sagi@grimberg.me>
Signed-off-by: Keith Busch <kbusch@kernel.org>
---
 include/linux/nvme.h | 10 ++++++++++
 1 file changed, 10 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/nvme.h b/include/linux/nvme.h
index 26dd3f859d9d..a7ba74babad7 100644
--- a/include/linux/nvme.h
+++ b/include/linux/nvme.h
@@ -108,6 +108,13 @@ enum {
 	NVMF_RDMA_CMS_RDMA_CM	= 1, /* Sockets based endpoint addressing */
 };
 
+/* TSAS SECTYPE for TCP transport */
+enum {
+	NVMF_TCP_SECTYPE_NONE = 0, /* No Security */
+	NVMF_TCP_SECTYPE_TLS12 = 1, /* TLSv1.2, NVMe-oF 1.1 and NVMe-TCP 3.6.1.1 */
+	NVMF_TCP_SECTYPE_TLS13 = 2, /* TLSv1.3, NVMe-oF 1.1 and NVMe-TCP 3.6.1.1 */
+};
+
 #define NVME_AQ_DEPTH		32
 #define NVME_NR_AEN_COMMANDS	1
 #define NVME_AQ_BLK_MQ_DEPTH	(NVME_AQ_DEPTH - NVME_NR_AEN_COMMANDS)
@@ -1493,6 +1500,9 @@ struct nvmf_disc_rsp_page_entry {
 			__u16	pkey;
 			__u8	resv10[246];
 		} rdma;
+		struct tcp {
+			__u8	sectype;
+		} tcp;
 	} tsas;
 };
 
-- 
cgit v1.2.3


From a86062aac34d100a3117c0fff91ee1892ebfb460 Mon Sep 17 00:00:00 2001
From: Hannes Reinecke <hare@suse.de>
Date: Thu, 24 Aug 2023 16:39:11 +0200
Subject: nvme-tcp: add definitions for TLS cipher suites

Signed-off-by: Hannes Reinecke <hare@suse.de>
Reviewed-by: Sagi Grimberg <sagi@grimberg.me>
Signed-off-by: Keith Busch <kbusch@kernel.org>
---
 include/linux/nvme-tcp.h | 6 ++++++
 1 file changed, 6 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/nvme-tcp.h b/include/linux/nvme-tcp.h
index 57ebe1267f7f..e07e8978d691 100644
--- a/include/linux/nvme-tcp.h
+++ b/include/linux/nvme-tcp.h
@@ -18,6 +18,12 @@ enum nvme_tcp_pfv {
 	NVME_TCP_PFV_1_0 = 0x0,
 };
 
+enum nvme_tcp_tls_cipher {
+	NVME_TCP_TLS_CIPHER_INVALID     = 0,
+	NVME_TCP_TLS_CIPHER_SHA256      = 1,
+	NVME_TCP_TLS_CIPHER_SHA384      = 2,
+};
+
 enum nvme_tcp_fatal_error_status {
 	NVME_TCP_FES_INVALID_PDU_HDR		= 0x01,
 	NVME_TCP_FES_PDU_SEQ_ERR		= 0x02,
-- 
cgit v1.2.3


From 501cc6f4aca8dc0958c4d9716f0233ba7cff4830 Mon Sep 17 00:00:00 2001
From: Hannes Reinecke <hare@suse.de>
Date: Thu, 24 Aug 2023 16:39:12 +0200
Subject: nvme-keyring: implement nvme_tls_psk_default()

Implement a function to select the preferred PSK for TLS.
A 'retained' PSK should be preferred over a 'generated' PSK,
and SHA-384 should be preferred to SHA-256.

Signed-off-by: Hannes Reinecke <hare@suse.de>
Reviewed-by: Sagi Grimberg <sagi@grimberg.me>
Signed-off-by: Keith Busch <kbusch@kernel.org>
---
 drivers/nvme/common/keyring.c | 48 +++++++++++++++++++++++++++++++++++++++++++
 include/linux/nvme-keyring.h  |  8 ++++++++
 2 files changed, 56 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/nvme/common/keyring.c b/drivers/nvme/common/keyring.c
index 494dd365052e..f8d9a208397b 100644
--- a/drivers/nvme/common/keyring.c
+++ b/drivers/nvme/common/keyring.c
@@ -5,6 +5,7 @@
 
 #include <linux/module.h>
 #include <linux/seq_file.h>
+#include <linux/key.h>
 #include <linux/key-type.h>
 #include <keys/user-type.h>
 #include <linux/nvme.h>
@@ -103,6 +104,53 @@ static struct key *nvme_tls_psk_lookup(struct key *keyring,
 	return key_ref_to_ptr(keyref);
 }
 
+/*
+ * NVMe PSK priority list
+ *
+ * 'Retained' PSKs (ie 'generated == false')
+ * should be preferred to 'generated' PSKs,
+ * and SHA-384 should be preferred to SHA-256.
+ */
+struct nvme_tls_psk_priority_list {
+	bool generated;
+	enum nvme_tcp_tls_cipher cipher;
+} nvme_tls_psk_prio[] = {
+	{ .generated = false,
+	  .cipher = NVME_TCP_TLS_CIPHER_SHA384, },
+	{ .generated = false,
+	  .cipher = NVME_TCP_TLS_CIPHER_SHA256, },
+	{ .generated = true,
+	  .cipher = NVME_TCP_TLS_CIPHER_SHA384, },
+	{ .generated = true,
+	  .cipher = NVME_TCP_TLS_CIPHER_SHA256, },
+};
+
+/*
+ * nvme_tls_psk_default - Return the preferred PSK to use for TLS ClientHello
+ */
+key_serial_t nvme_tls_psk_default(struct key *keyring,
+		      const char *hostnqn, const char *subnqn)
+{
+	struct key *tls_key;
+	key_serial_t tls_key_id;
+	int prio;
+
+	for (prio = 0; prio < ARRAY_SIZE(nvme_tls_psk_prio); prio++) {
+		bool generated = nvme_tls_psk_prio[prio].generated;
+		enum nvme_tcp_tls_cipher cipher = nvme_tls_psk_prio[prio].cipher;
+
+		tls_key = nvme_tls_psk_lookup(keyring, hostnqn, subnqn,
+					      cipher, generated);
+		if (!IS_ERR(tls_key)) {
+			tls_key_id = tls_key->serial;
+			key_put(tls_key);
+			return tls_key_id;
+		}
+	}
+	return 0;
+}
+EXPORT_SYMBOL_GPL(nvme_tls_psk_default);
+
 int nvme_keyring_init(void)
 {
 	int err;
diff --git a/include/linux/nvme-keyring.h b/include/linux/nvme-keyring.h
index 32bd264a71e6..4efea9dd967c 100644
--- a/include/linux/nvme-keyring.h
+++ b/include/linux/nvme-keyring.h
@@ -8,12 +8,20 @@
 
 #ifdef CONFIG_NVME_KEYRING
 
+key_serial_t nvme_tls_psk_default(struct key *keyring,
+		const char *hostnqn, const char *subnqn);
+
 key_serial_t nvme_keyring_id(void);
 int nvme_keyring_init(void);
 void nvme_keyring_exit(void);
 
 #else
 
+static inline key_serial_t nvme_tls_psk_default(struct key *keyring,
+		const char *hostnqn, const char *subnqn)
+{
+	return 0;
+}
 static inline key_serial_t nvme_keyring_id(void)
 {
 	return 0;
-- 
cgit v1.2.3


From 037c34318a479294cdb98dc8018edd5d191b68c0 Mon Sep 17 00:00:00 2001
From: Hannes Reinecke <hare@suse.de>
Date: Thu, 24 Aug 2023 16:39:13 +0200
Subject: security/keys: export key_lookup()

For in-kernel consumers one cannot readily assign a user (eg when
running from a workqueue), so the normal key search permissions
cannot be applied.
This patch exports the 'key_lookup()' function for a simple lookup
of keys without checking for permissions.

Signed-off-by: Hannes Reinecke <hare@suse.de>
Reviewed-by: Sagi Grimberg <sagi@grimberg.me>
Acked-by: David Howells <dhowells@redhat.com>
Signed-off-by: Keith Busch <kbusch@kernel.org>
---
 include/linux/key.h | 1 +
 security/keys/key.c | 1 +
 2 files changed, 2 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/key.h b/include/linux/key.h
index 938d7ecfb495..943a432da3ae 100644
--- a/include/linux/key.h
+++ b/include/linux/key.h
@@ -515,6 +515,7 @@ extern void key_init(void);
 #define key_init()			do { } while(0)
 #define key_free_user_ns(ns)		do { } while(0)
 #define key_remove_domain(d)		do { } while(0)
+#define key_lookup(k)			NULL
 
 #endif /* CONFIG_KEYS */
 #endif /* __KERNEL__ */
diff --git a/security/keys/key.c b/security/keys/key.c
index 5c0c7df833f8..0260a1902922 100644
--- a/security/keys/key.c
+++ b/security/keys/key.c
@@ -693,6 +693,7 @@ error:
 	spin_unlock(&key_serial_lock);
 	return key;
 }
+EXPORT_SYMBOL(key_lookup);
 
 /*
  * Find and lock the specified key type against removal.
-- 
cgit v1.2.3


From 39c6eed1f61594f737160e498d29673edbd9eefd Mon Sep 17 00:00:00 2001
From: Maciej Wieczor-Retman <maciej.wieczor-retman@intel.com>
Date: Tue, 10 Oct 2023 12:42:36 +0200
Subject: x86/resctrl: Rename arch_has_sparse_bitmaps
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Rename arch_has_sparse_bitmaps to arch_has_sparse_bitmasks to ensure
consistent terminology throughout resctrl.

Suggested-by: Reinette Chatre <reinette.chatre@intel.com>
Signed-off-by: Maciej Wieczor-Retman <maciej.wieczor-retman@intel.com>
Signed-off-by: Borislav Petkov (AMD) <bp@alien8.de>
Reviewed-by: Ilpo Järvinen <ilpo.jarvinen@linux.intel.com>
Reviewed-by: Peter Newman <peternewman@google.com>
Reviewed-by: Reinette Chatre <reinette.chatre@intel.com>
Reviewed-by: Babu Moger <babu.moger@amd.com>
Tested-by: Peter Newman <peternewman@google.com>
Link: https://lore.kernel.org/r/e330fcdae873ef1a831e707025a4b70fa346666e.1696934091.git.maciej.wieczor-retman@intel.com
---
 arch/x86/kernel/cpu/resctrl/core.c        | 4 ++--
 arch/x86/kernel/cpu/resctrl/ctrlmondata.c | 4 ++--
 include/linux/resctrl.h                   | 4 ++--
 3 files changed, 6 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/arch/x86/kernel/cpu/resctrl/core.c b/arch/x86/kernel/cpu/resctrl/core.c
index 030d3b409768..c09e4fdded3c 100644
--- a/arch/x86/kernel/cpu/resctrl/core.c
+++ b/arch/x86/kernel/cpu/resctrl/core.c
@@ -872,7 +872,7 @@ static __init void rdt_init_res_defs_intel(void)
 
 		if (r->rid == RDT_RESOURCE_L3 ||
 		    r->rid == RDT_RESOURCE_L2) {
-			r->cache.arch_has_sparse_bitmaps = false;
+			r->cache.arch_has_sparse_bitmasks = false;
 			r->cache.arch_has_per_cpu_cfg = false;
 			r->cache.min_cbm_bits = 1;
 		} else if (r->rid == RDT_RESOURCE_MBA) {
@@ -892,7 +892,7 @@ static __init void rdt_init_res_defs_amd(void)
 
 		if (r->rid == RDT_RESOURCE_L3 ||
 		    r->rid == RDT_RESOURCE_L2) {
-			r->cache.arch_has_sparse_bitmaps = true;
+			r->cache.arch_has_sparse_bitmasks = true;
 			r->cache.arch_has_per_cpu_cfg = true;
 			r->cache.min_cbm_bits = 0;
 		} else if (r->rid == RDT_RESOURCE_MBA) {
diff --git a/arch/x86/kernel/cpu/resctrl/ctrlmondata.c b/arch/x86/kernel/cpu/resctrl/ctrlmondata.c
index b44c487727d4..ab45012288bb 100644
--- a/arch/x86/kernel/cpu/resctrl/ctrlmondata.c
+++ b/arch/x86/kernel/cpu/resctrl/ctrlmondata.c
@@ -113,8 +113,8 @@ static bool cbm_validate(char *buf, u32 *data, struct rdt_resource *r)
 	first_bit = find_first_bit(&val, cbm_len);
 	zero_bit = find_next_zero_bit(&val, cbm_len, first_bit);
 
-	/* Are non-contiguous bitmaps allowed? */
-	if (!r->cache.arch_has_sparse_bitmaps &&
+	/* Are non-contiguous bitmasks allowed? */
+	if (!r->cache.arch_has_sparse_bitmasks &&
 	    (find_next_bit(&val, cbm_len, zero_bit) < cbm_len)) {
 		rdt_last_cmd_printf("The mask %lx has non-consecutive 1-bits\n", val);
 		return false;
diff --git a/include/linux/resctrl.h b/include/linux/resctrl.h
index 8334eeacfec5..66942d7fba7f 100644
--- a/include/linux/resctrl.h
+++ b/include/linux/resctrl.h
@@ -94,7 +94,7 @@ struct rdt_domain {
  *			zero CBM.
  * @shareable_bits:	Bitmask of shareable resource with other
  *			executing entities
- * @arch_has_sparse_bitmaps:	True if a bitmap like f00f is valid.
+ * @arch_has_sparse_bitmasks:	True if a bitmask like f00f is valid.
  * @arch_has_per_cpu_cfg:	True if QOS_CFG register for this cache
  *				level has CPU scope.
  */
@@ -102,7 +102,7 @@ struct resctrl_cache {
 	unsigned int	cbm_len;
 	unsigned int	min_cbm_bits;
 	unsigned int	shareable_bits;
-	bool		arch_has_sparse_bitmaps;
+	bool		arch_has_sparse_bitmasks;
 	bool		arch_has_per_cpu_cfg;
 };
 
-- 
cgit v1.2.3


From fefba7d1ae198dcbf8b3b432de46a4e29f8dbd8c Mon Sep 17 00:00:00 2001
From: Daan De Meyer <daan.j.demeyer@gmail.com>
Date: Wed, 11 Oct 2023 20:51:04 +0200
Subject: bpf: Propagate modified uaddrlen from cgroup sockaddr programs

As prep for adding unix socket support to the cgroup sockaddr hooks,
let's propagate the sockaddr length back to the caller after running
a bpf cgroup sockaddr hook program. While not important for AF_INET or
AF_INET6, the sockaddr length is important when working with AF_UNIX
sockaddrs as the size of the sockaddr cannot be determined just from the
address family or the sockaddr's contents.

__cgroup_bpf_run_filter_sock_addr() is modified to take the uaddrlen as
an input/output argument. After running the program, the modified sockaddr
length is stored in the uaddrlen pointer.

Signed-off-by: Daan De Meyer <daan.j.demeyer@gmail.com>
Link: https://lore.kernel.org/r/20231011185113.140426-3-daan.j.demeyer@gmail.com
Signed-off-by: Martin KaFai Lau <martin.lau@kernel.org>
---
 include/linux/bpf-cgroup.h | 73 +++++++++++++++++++++++-----------------------
 include/linux/filter.h     |  1 +
 kernel/bpf/cgroup.c        | 17 +++++++++--
 net/ipv4/af_inet.c         |  7 +++--
 net/ipv4/ping.c            |  2 +-
 net/ipv4/tcp_ipv4.c        |  2 +-
 net/ipv4/udp.c             |  9 ++++--
 net/ipv6/af_inet6.c        |  9 +++---
 net/ipv6/ping.c            |  2 +-
 net/ipv6/tcp_ipv6.c        |  2 +-
 net/ipv6/udp.c             |  6 ++--
 11 files changed, 76 insertions(+), 54 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h
index 8506690dbb9c..31561e789715 100644
--- a/include/linux/bpf-cgroup.h
+++ b/include/linux/bpf-cgroup.h
@@ -120,6 +120,7 @@ int __cgroup_bpf_run_filter_sk(struct sock *sk,
 
 int __cgroup_bpf_run_filter_sock_addr(struct sock *sk,
 				      struct sockaddr *uaddr,
+				      int *uaddrlen,
 				      enum cgroup_bpf_attach_type atype,
 				      void *t_ctx,
 				      u32 *flags);
@@ -230,22 +231,22 @@ static inline bool cgroup_bpf_sock_enabled(struct sock *sk,
 #define BPF_CGROUP_RUN_PROG_INET6_POST_BIND(sk)				       \
 	BPF_CGROUP_RUN_SK_PROG(sk, CGROUP_INET6_POST_BIND)
 
-#define BPF_CGROUP_RUN_SA_PROG(sk, uaddr, atype)				       \
+#define BPF_CGROUP_RUN_SA_PROG(sk, uaddr, uaddrlen, atype)		       \
 ({									       \
 	int __ret = 0;							       \
 	if (cgroup_bpf_enabled(atype))					       \
-		__ret = __cgroup_bpf_run_filter_sock_addr(sk, uaddr, atype,     \
-							  NULL, NULL);	       \
+		__ret = __cgroup_bpf_run_filter_sock_addr(sk, uaddr, uaddrlen, \
+							  atype, NULL, NULL);  \
 	__ret;								       \
 })
 
-#define BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, atype, t_ctx)		       \
+#define BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, uaddrlen, atype, t_ctx)	       \
 ({									       \
 	int __ret = 0;							       \
 	if (cgroup_bpf_enabled(atype))	{				       \
 		lock_sock(sk);						       \
-		__ret = __cgroup_bpf_run_filter_sock_addr(sk, uaddr, atype,     \
-							  t_ctx, NULL);	       \
+		__ret = __cgroup_bpf_run_filter_sock_addr(sk, uaddr, uaddrlen, \
+							  atype, t_ctx, NULL); \
 		release_sock(sk);					       \
 	}								       \
 	__ret;								       \
@@ -256,14 +257,14 @@ static inline bool cgroup_bpf_sock_enabled(struct sock *sk,
  * (at bit position 0) is to indicate CAP_NET_BIND_SERVICE capability check
  * should be bypassed (BPF_RET_BIND_NO_CAP_NET_BIND_SERVICE).
  */
-#define BPF_CGROUP_RUN_PROG_INET_BIND_LOCK(sk, uaddr, atype, bind_flags)	       \
+#define BPF_CGROUP_RUN_PROG_INET_BIND_LOCK(sk, uaddr, uaddrlen, atype, bind_flags) \
 ({									       \
 	u32 __flags = 0;						       \
 	int __ret = 0;							       \
 	if (cgroup_bpf_enabled(atype))	{				       \
 		lock_sock(sk);						       \
-		__ret = __cgroup_bpf_run_filter_sock_addr(sk, uaddr, atype,     \
-							  NULL, &__flags);     \
+		__ret = __cgroup_bpf_run_filter_sock_addr(sk, uaddr, uaddrlen, \
+							  atype, NULL, &__flags); \
 		release_sock(sk);					       \
 		if (__flags & BPF_RET_BIND_NO_CAP_NET_BIND_SERVICE)	       \
 			*bind_flags |= BIND_NO_CAP_NET_BIND_SERVICE;	       \
@@ -276,29 +277,29 @@ static inline bool cgroup_bpf_sock_enabled(struct sock *sk,
 	  cgroup_bpf_enabled(CGROUP_INET6_CONNECT)) &&		       \
 	 (sk)->sk_prot->pre_connect)
 
-#define BPF_CGROUP_RUN_PROG_INET4_CONNECT(sk, uaddr)			       \
-	BPF_CGROUP_RUN_SA_PROG(sk, uaddr, CGROUP_INET4_CONNECT)
+#define BPF_CGROUP_RUN_PROG_INET4_CONNECT(sk, uaddr, uaddrlen)			\
+	BPF_CGROUP_RUN_SA_PROG(sk, uaddr, uaddrlen, CGROUP_INET4_CONNECT)
 
-#define BPF_CGROUP_RUN_PROG_INET6_CONNECT(sk, uaddr)			       \
-	BPF_CGROUP_RUN_SA_PROG(sk, uaddr, CGROUP_INET6_CONNECT)
+#define BPF_CGROUP_RUN_PROG_INET6_CONNECT(sk, uaddr, uaddrlen)			\
+	BPF_CGROUP_RUN_SA_PROG(sk, uaddr, uaddrlen, CGROUP_INET6_CONNECT)
 
-#define BPF_CGROUP_RUN_PROG_INET4_CONNECT_LOCK(sk, uaddr)		       \
-	BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, CGROUP_INET4_CONNECT, NULL)
+#define BPF_CGROUP_RUN_PROG_INET4_CONNECT_LOCK(sk, uaddr, uaddrlen)		\
+	BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, uaddrlen, CGROUP_INET4_CONNECT, NULL)
 
-#define BPF_CGROUP_RUN_PROG_INET6_CONNECT_LOCK(sk, uaddr)		       \
-	BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, CGROUP_INET6_CONNECT, NULL)
+#define BPF_CGROUP_RUN_PROG_INET6_CONNECT_LOCK(sk, uaddr, uaddrlen)		\
+	BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, uaddrlen, CGROUP_INET6_CONNECT, NULL)
 
-#define BPF_CGROUP_RUN_PROG_UDP4_SENDMSG_LOCK(sk, uaddr, t_ctx)		       \
-	BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, CGROUP_UDP4_SENDMSG, t_ctx)
+#define BPF_CGROUP_RUN_PROG_UDP4_SENDMSG_LOCK(sk, uaddr, uaddrlen, t_ctx)	\
+	BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, uaddrlen, CGROUP_UDP4_SENDMSG, t_ctx)
 
-#define BPF_CGROUP_RUN_PROG_UDP6_SENDMSG_LOCK(sk, uaddr, t_ctx)		       \
-	BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, CGROUP_UDP6_SENDMSG, t_ctx)
+#define BPF_CGROUP_RUN_PROG_UDP6_SENDMSG_LOCK(sk, uaddr, uaddrlen, t_ctx)	\
+	BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, uaddrlen, CGROUP_UDP6_SENDMSG, t_ctx)
 
-#define BPF_CGROUP_RUN_PROG_UDP4_RECVMSG_LOCK(sk, uaddr)			\
-	BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, CGROUP_UDP4_RECVMSG, NULL)
+#define BPF_CGROUP_RUN_PROG_UDP4_RECVMSG_LOCK(sk, uaddr, uaddrlen)		\
+	BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, uaddrlen, CGROUP_UDP4_RECVMSG, NULL)
 
-#define BPF_CGROUP_RUN_PROG_UDP6_RECVMSG_LOCK(sk, uaddr)			\
-	BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, CGROUP_UDP6_RECVMSG, NULL)
+#define BPF_CGROUP_RUN_PROG_UDP6_RECVMSG_LOCK(sk, uaddr, uaddrlen)		\
+	BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, uaddrlen, CGROUP_UDP6_RECVMSG, NULL)
 
 /* The SOCK_OPS"_SK" macro should be used when sock_ops->sk is not a
  * fullsock and its parent fullsock cannot be traced by
@@ -477,24 +478,24 @@ static inline int bpf_percpu_cgroup_storage_update(struct bpf_map *map,
 }
 
 #define cgroup_bpf_enabled(atype) (0)
-#define BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, atype, t_ctx) ({ 0; })
-#define BPF_CGROUP_RUN_SA_PROG(sk, uaddr, atype) ({ 0; })
+#define BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, uaddrlen, atype, t_ctx) ({ 0; })
+#define BPF_CGROUP_RUN_SA_PROG(sk, uaddr, uaddrlen, atype) ({ 0; })
 #define BPF_CGROUP_PRE_CONNECT_ENABLED(sk) (0)
 #define BPF_CGROUP_RUN_PROG_INET_INGRESS(sk,skb) ({ 0; })
 #define BPF_CGROUP_RUN_PROG_INET_EGRESS(sk,skb) ({ 0; })
 #define BPF_CGROUP_RUN_PROG_INET_SOCK(sk) ({ 0; })
 #define BPF_CGROUP_RUN_PROG_INET_SOCK_RELEASE(sk) ({ 0; })
-#define BPF_CGROUP_RUN_PROG_INET_BIND_LOCK(sk, uaddr, atype, flags) ({ 0; })
+#define BPF_CGROUP_RUN_PROG_INET_BIND_LOCK(sk, uaddr, uaddrlen, atype, flags) ({ 0; })
 #define BPF_CGROUP_RUN_PROG_INET4_POST_BIND(sk) ({ 0; })
 #define BPF_CGROUP_RUN_PROG_INET6_POST_BIND(sk) ({ 0; })
-#define BPF_CGROUP_RUN_PROG_INET4_CONNECT(sk, uaddr) ({ 0; })
-#define BPF_CGROUP_RUN_PROG_INET4_CONNECT_LOCK(sk, uaddr) ({ 0; })
-#define BPF_CGROUP_RUN_PROG_INET6_CONNECT(sk, uaddr) ({ 0; })
-#define BPF_CGROUP_RUN_PROG_INET6_CONNECT_LOCK(sk, uaddr) ({ 0; })
-#define BPF_CGROUP_RUN_PROG_UDP4_SENDMSG_LOCK(sk, uaddr, t_ctx) ({ 0; })
-#define BPF_CGROUP_RUN_PROG_UDP6_SENDMSG_LOCK(sk, uaddr, t_ctx) ({ 0; })
-#define BPF_CGROUP_RUN_PROG_UDP4_RECVMSG_LOCK(sk, uaddr) ({ 0; })
-#define BPF_CGROUP_RUN_PROG_UDP6_RECVMSG_LOCK(sk, uaddr) ({ 0; })
+#define BPF_CGROUP_RUN_PROG_INET4_CONNECT(sk, uaddr, uaddrlen) ({ 0; })
+#define BPF_CGROUP_RUN_PROG_INET4_CONNECT_LOCK(sk, uaddr, uaddrlen) ({ 0; })
+#define BPF_CGROUP_RUN_PROG_INET6_CONNECT(sk, uaddr, uaddrlen) ({ 0; })
+#define BPF_CGROUP_RUN_PROG_INET6_CONNECT_LOCK(sk, uaddr, uaddrlen) ({ 0; })
+#define BPF_CGROUP_RUN_PROG_UDP4_SENDMSG_LOCK(sk, uaddr, uaddrlen, t_ctx) ({ 0; })
+#define BPF_CGROUP_RUN_PROG_UDP6_SENDMSG_LOCK(sk, uaddr, uaddrlen, t_ctx) ({ 0; })
+#define BPF_CGROUP_RUN_PROG_UDP4_RECVMSG_LOCK(sk, uaddr, uaddrlen) ({ 0; })
+#define BPF_CGROUP_RUN_PROG_UDP6_RECVMSG_LOCK(sk, uaddr, uaddrlen) ({ 0; })
 #define BPF_CGROUP_RUN_PROG_SOCK_OPS(sock_ops) ({ 0; })
 #define BPF_CGROUP_RUN_PROG_DEVICE_CGROUP(atype, major, minor, access) ({ 0; })
 #define BPF_CGROUP_RUN_PROG_SYSCTL(head,table,write,buf,count,pos) ({ 0; })
diff --git a/include/linux/filter.h b/include/linux/filter.h
index ff7ecc89d3dd..bcd2bc15ff56 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -1335,6 +1335,7 @@ struct bpf_sock_addr_kern {
 	 */
 	u64 tmp_reg;
 	void *t_ctx;	/* Attach type specific context. */
+	u32 uaddrlen;
 };
 
 struct bpf_sock_ops_kern {
diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c
index 03b3d4492980..ac37bd53aee0 100644
--- a/kernel/bpf/cgroup.c
+++ b/kernel/bpf/cgroup.c
@@ -1450,6 +1450,9 @@ EXPORT_SYMBOL(__cgroup_bpf_run_filter_sk);
  *                                       provided by user sockaddr
  * @sk: sock struct that will use sockaddr
  * @uaddr: sockaddr struct provided by user
+ * @uaddrlen: Pointer to the size of the sockaddr struct provided by user. It is
+ *            read-only for AF_INET[6] uaddr but can be modified for AF_UNIX
+ *            uaddr.
  * @atype: The type of program to be executed
  * @t_ctx: Pointer to attach type specific context
  * @flags: Pointer to u32 which contains higher bits of BPF program
@@ -1462,6 +1465,7 @@ EXPORT_SYMBOL(__cgroup_bpf_run_filter_sk);
  */
 int __cgroup_bpf_run_filter_sock_addr(struct sock *sk,
 				      struct sockaddr *uaddr,
+				      int *uaddrlen,
 				      enum cgroup_bpf_attach_type atype,
 				      void *t_ctx,
 				      u32 *flags)
@@ -1473,6 +1477,7 @@ int __cgroup_bpf_run_filter_sock_addr(struct sock *sk,
 	};
 	struct sockaddr_storage unspec;
 	struct cgroup *cgrp;
+	int ret;
 
 	/* Check socket family since not all sockets represent network
 	 * endpoint (e.g. AF_UNIX).
@@ -1483,11 +1488,19 @@ int __cgroup_bpf_run_filter_sock_addr(struct sock *sk,
 	if (!ctx.uaddr) {
 		memset(&unspec, 0, sizeof(unspec));
 		ctx.uaddr = (struct sockaddr *)&unspec;
+		ctx.uaddrlen = 0;
+	} else {
+		ctx.uaddrlen = *uaddrlen;
 	}
 
 	cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data);
-	return bpf_prog_run_array_cg(&cgrp->bpf, atype, &ctx, bpf_prog_run,
-				     0, flags);
+	ret = bpf_prog_run_array_cg(&cgrp->bpf, atype, &ctx, bpf_prog_run,
+				    0, flags);
+
+	if (!ret && uaddr)
+		*uaddrlen = ctx.uaddrlen;
+
+	return ret;
 }
 EXPORT_SYMBOL(__cgroup_bpf_run_filter_sock_addr);
 
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index 3d2e30e20473..7e27ad37b939 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -452,7 +452,7 @@ int inet_bind_sk(struct sock *sk, struct sockaddr *uaddr, int addr_len)
 	/* BPF prog is run before any checks are done so that if the prog
 	 * changes context in a wrong way it will be caught.
 	 */
-	err = BPF_CGROUP_RUN_PROG_INET_BIND_LOCK(sk, uaddr,
+	err = BPF_CGROUP_RUN_PROG_INET_BIND_LOCK(sk, uaddr, &addr_len,
 						 CGROUP_INET4_BIND, &flags);
 	if (err)
 		return err;
@@ -788,6 +788,7 @@ int inet_getname(struct socket *sock, struct sockaddr *uaddr,
 	struct sock *sk		= sock->sk;
 	struct inet_sock *inet	= inet_sk(sk);
 	DECLARE_SOCKADDR(struct sockaddr_in *, sin, uaddr);
+	int sin_addr_len = sizeof(*sin);
 
 	sin->sin_family = AF_INET;
 	lock_sock(sk);
@@ -800,7 +801,7 @@ int inet_getname(struct socket *sock, struct sockaddr *uaddr,
 		}
 		sin->sin_port = inet->inet_dport;
 		sin->sin_addr.s_addr = inet->inet_daddr;
-		BPF_CGROUP_RUN_SA_PROG(sk, (struct sockaddr *)sin,
+		BPF_CGROUP_RUN_SA_PROG(sk, (struct sockaddr *)sin, &sin_addr_len,
 				       CGROUP_INET4_GETPEERNAME);
 	} else {
 		__be32 addr = inet->inet_rcv_saddr;
@@ -808,7 +809,7 @@ int inet_getname(struct socket *sock, struct sockaddr *uaddr,
 			addr = inet->inet_saddr;
 		sin->sin_port = inet->inet_sport;
 		sin->sin_addr.s_addr = addr;
-		BPF_CGROUP_RUN_SA_PROG(sk, (struct sockaddr *)sin,
+		BPF_CGROUP_RUN_SA_PROG(sk, (struct sockaddr *)sin, &sin_addr_len,
 				       CGROUP_INET4_GETSOCKNAME);
 	}
 	release_sock(sk);
diff --git a/net/ipv4/ping.c b/net/ipv4/ping.c
index 4dd809b7b188..2887177822c9 100644
--- a/net/ipv4/ping.c
+++ b/net/ipv4/ping.c
@@ -301,7 +301,7 @@ static int ping_pre_connect(struct sock *sk, struct sockaddr *uaddr,
 	if (addr_len < sizeof(struct sockaddr_in))
 		return -EINVAL;
 
-	return BPF_CGROUP_RUN_PROG_INET4_CONNECT_LOCK(sk, uaddr);
+	return BPF_CGROUP_RUN_PROG_INET4_CONNECT_LOCK(sk, uaddr, &addr_len);
 }
 
 /* Checks the bind address and possibly modifies sk->sk_bound_dev_if. */
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index f13eb7e23d03..7c18dd3ce011 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -194,7 +194,7 @@ static int tcp_v4_pre_connect(struct sock *sk, struct sockaddr *uaddr,
 
 	sock_owned_by_me(sk);
 
-	return BPF_CGROUP_RUN_PROG_INET4_CONNECT(sk, uaddr);
+	return BPF_CGROUP_RUN_PROG_INET4_CONNECT(sk, uaddr, &addr_len);
 }
 
 /* This will initiate an outgoing connection. */
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index c3ff984b6354..7b21a51dd25a 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -1143,7 +1143,9 @@ int udp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
 
 	if (cgroup_bpf_enabled(CGROUP_UDP4_SENDMSG) && !connected) {
 		err = BPF_CGROUP_RUN_PROG_UDP4_SENDMSG_LOCK(sk,
-					    (struct sockaddr *)usin, &ipc.addr);
+					    (struct sockaddr *)usin,
+					    &msg->msg_namelen,
+					    &ipc.addr);
 		if (err)
 			goto out_free;
 		if (usin) {
@@ -1865,7 +1867,8 @@ try_again:
 		*addr_len = sizeof(*sin);
 
 		BPF_CGROUP_RUN_PROG_UDP4_RECVMSG_LOCK(sk,
-						      (struct sockaddr *)sin);
+						      (struct sockaddr *)sin,
+						      addr_len);
 	}
 
 	if (udp_test_bit(GRO_ENABLED, sk))
@@ -1904,7 +1907,7 @@ int udp_pre_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
 	if (addr_len < sizeof(struct sockaddr_in))
 		return -EINVAL;
 
-	return BPF_CGROUP_RUN_PROG_INET4_CONNECT_LOCK(sk, uaddr);
+	return BPF_CGROUP_RUN_PROG_INET4_CONNECT_LOCK(sk, uaddr, &addr_len);
 }
 EXPORT_SYMBOL(udp_pre_connect);
 
diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c
index 6337fb4504fd..c35d302a3da9 100644
--- a/net/ipv6/af_inet6.c
+++ b/net/ipv6/af_inet6.c
@@ -454,7 +454,7 @@ int inet6_bind_sk(struct sock *sk, struct sockaddr *uaddr, int addr_len)
 	/* BPF prog is run before any checks are done so that if the prog
 	 * changes context in a wrong way it will be caught.
 	 */
-	err = BPF_CGROUP_RUN_PROG_INET_BIND_LOCK(sk, uaddr,
+	err = BPF_CGROUP_RUN_PROG_INET_BIND_LOCK(sk, uaddr, &addr_len,
 						 CGROUP_INET6_BIND, &flags);
 	if (err)
 		return err;
@@ -520,6 +520,7 @@ int inet6_getname(struct socket *sock, struct sockaddr *uaddr,
 		  int peer)
 {
 	struct sockaddr_in6 *sin = (struct sockaddr_in6 *)uaddr;
+	int sin_addr_len = sizeof(*sin);
 	struct sock *sk = sock->sk;
 	struct inet_sock *inet = inet_sk(sk);
 	struct ipv6_pinfo *np = inet6_sk(sk);
@@ -539,7 +540,7 @@ int inet6_getname(struct socket *sock, struct sockaddr *uaddr,
 		sin->sin6_addr = sk->sk_v6_daddr;
 		if (inet6_test_bit(SNDFLOW, sk))
 			sin->sin6_flowinfo = np->flow_label;
-		BPF_CGROUP_RUN_SA_PROG(sk, (struct sockaddr *)sin,
+		BPF_CGROUP_RUN_SA_PROG(sk, (struct sockaddr *)sin, &sin_addr_len,
 				       CGROUP_INET6_GETPEERNAME);
 	} else {
 		if (ipv6_addr_any(&sk->sk_v6_rcv_saddr))
@@ -547,13 +548,13 @@ int inet6_getname(struct socket *sock, struct sockaddr *uaddr,
 		else
 			sin->sin6_addr = sk->sk_v6_rcv_saddr;
 		sin->sin6_port = inet->inet_sport;
-		BPF_CGROUP_RUN_SA_PROG(sk, (struct sockaddr *)sin,
+		BPF_CGROUP_RUN_SA_PROG(sk, (struct sockaddr *)sin, &sin_addr_len,
 				       CGROUP_INET6_GETSOCKNAME);
 	}
 	sin->sin6_scope_id = ipv6_iface_scope_id(&sin->sin6_addr,
 						 sk->sk_bound_dev_if);
 	release_sock(sk);
-	return sizeof(*sin);
+	return sin_addr_len;
 }
 EXPORT_SYMBOL(inet6_getname);
 
diff --git a/net/ipv6/ping.c b/net/ipv6/ping.c
index e8fb0d275cc2..d2098dd4ceae 100644
--- a/net/ipv6/ping.c
+++ b/net/ipv6/ping.c
@@ -56,7 +56,7 @@ static int ping_v6_pre_connect(struct sock *sk, struct sockaddr *uaddr,
 	if (addr_len < SIN6_LEN_RFC2133)
 		return -EINVAL;
 
-	return BPF_CGROUP_RUN_PROG_INET6_CONNECT_LOCK(sk, uaddr);
+	return BPF_CGROUP_RUN_PROG_INET6_CONNECT_LOCK(sk, uaddr, &addr_len);
 }
 
 static int ping_v6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index 94afb8d0f2d0..3a1e76a2d33e 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -135,7 +135,7 @@ static int tcp_v6_pre_connect(struct sock *sk, struct sockaddr *uaddr,
 
 	sock_owned_by_me(sk);
 
-	return BPF_CGROUP_RUN_PROG_INET6_CONNECT(sk, uaddr);
+	return BPF_CGROUP_RUN_PROG_INET6_CONNECT(sk, uaddr, &addr_len);
 }
 
 static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr,
diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c
index 5e9312eefed0..622b10a549f7 100644
--- a/net/ipv6/udp.c
+++ b/net/ipv6/udp.c
@@ -410,7 +410,8 @@ try_again:
 		*addr_len = sizeof(*sin6);
 
 		BPF_CGROUP_RUN_PROG_UDP6_RECVMSG_LOCK(sk,
-						      (struct sockaddr *)sin6);
+						      (struct sockaddr *)sin6,
+						      addr_len);
 	}
 
 	if (udp_test_bit(GRO_ENABLED, sk))
@@ -1157,7 +1158,7 @@ static int udpv6_pre_connect(struct sock *sk, struct sockaddr *uaddr,
 	if (addr_len < SIN6_LEN_RFC2133)
 		return -EINVAL;
 
-	return BPF_CGROUP_RUN_PROG_INET6_CONNECT_LOCK(sk, uaddr);
+	return BPF_CGROUP_RUN_PROG_INET6_CONNECT_LOCK(sk, uaddr, &addr_len);
 }
 
 /**
@@ -1510,6 +1511,7 @@ do_udp_sendmsg:
 	if (cgroup_bpf_enabled(CGROUP_UDP6_SENDMSG) && !connected) {
 		err = BPF_CGROUP_RUN_PROG_UDP6_SENDMSG_LOCK(sk,
 					   (struct sockaddr *)sin6,
+					   &addr_len,
 					   &fl6->saddr);
 		if (err)
 			goto out_no_dst;
-- 
cgit v1.2.3


From 859051dd165ec6cc915f0f2114699021144fd249 Mon Sep 17 00:00:00 2001
From: Daan De Meyer <daan.j.demeyer@gmail.com>
Date: Wed, 11 Oct 2023 20:51:06 +0200
Subject: bpf: Implement cgroup sockaddr hooks for unix sockets

These hooks allows intercepting connect(), getsockname(),
getpeername(), sendmsg() and recvmsg() for unix sockets. The unix
socket hooks get write access to the address length because the
address length is not fixed when dealing with unix sockets and
needs to be modified when a unix socket address is modified by
the hook. Because abstract socket unix addresses start with a
NUL byte, we cannot recalculate the socket address in kernelspace
after running the hook by calculating the length of the unix socket
path using strlen().

These hooks can be used when users want to multiplex syscall to a
single unix socket to multiple different processes behind the scenes
by redirecting the connect() and other syscalls to process specific
sockets.

We do not implement support for intercepting bind() because when
using bind() with unix sockets with a pathname address, this creates
an inode in the filesystem which must be cleaned up. If we rewrite
the address, the user might try to clean up the wrong file, leaking
the socket in the filesystem where it is never cleaned up. Until we
figure out a solution for this (and a use case for intercepting bind()),
we opt to not allow rewriting the sockaddr in bind() calls.

We also implement recvmsg() support for connected streams so that
after a connect() that is modified by a sockaddr hook, any corresponding
recmvsg() on the connected socket can also be modified to make the
connected program think it is connected to the "intended" remote.

Reviewed-by: Kuniyuki Iwashima <kuniyu@amazon.com>
Signed-off-by: Daan De Meyer <daan.j.demeyer@gmail.com>
Link: https://lore.kernel.org/r/20231011185113.140426-5-daan.j.demeyer@gmail.com
Signed-off-by: Martin KaFai Lau <martin.lau@kernel.org>
---
 include/linux/bpf-cgroup-defs.h |  5 +++++
 include/linux/bpf-cgroup.h      | 17 +++++++++++++++++
 include/uapi/linux/bpf.h        | 13 +++++++++----
 kernel/bpf/cgroup.c             | 11 +++++++++--
 kernel/bpf/syscall.c            | 15 +++++++++++++++
 kernel/bpf/verifier.c           |  5 ++++-
 net/core/filter.c               | 14 ++++++++++++--
 net/unix/af_unix.c              | 35 ++++++++++++++++++++++++++++++++++-
 tools/include/uapi/linux/bpf.h  | 13 +++++++++----
 9 files changed, 114 insertions(+), 14 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/bpf-cgroup-defs.h b/include/linux/bpf-cgroup-defs.h
index 7b121bd780eb..0985221d5478 100644
--- a/include/linux/bpf-cgroup-defs.h
+++ b/include/linux/bpf-cgroup-defs.h
@@ -28,19 +28,24 @@ enum cgroup_bpf_attach_type {
 	CGROUP_INET6_BIND,
 	CGROUP_INET4_CONNECT,
 	CGROUP_INET6_CONNECT,
+	CGROUP_UNIX_CONNECT,
 	CGROUP_INET4_POST_BIND,
 	CGROUP_INET6_POST_BIND,
 	CGROUP_UDP4_SENDMSG,
 	CGROUP_UDP6_SENDMSG,
+	CGROUP_UNIX_SENDMSG,
 	CGROUP_SYSCTL,
 	CGROUP_UDP4_RECVMSG,
 	CGROUP_UDP6_RECVMSG,
+	CGROUP_UNIX_RECVMSG,
 	CGROUP_GETSOCKOPT,
 	CGROUP_SETSOCKOPT,
 	CGROUP_INET4_GETPEERNAME,
 	CGROUP_INET6_GETPEERNAME,
+	CGROUP_UNIX_GETPEERNAME,
 	CGROUP_INET4_GETSOCKNAME,
 	CGROUP_INET6_GETSOCKNAME,
+	CGROUP_UNIX_GETSOCKNAME,
 	CGROUP_INET_SOCK_RELEASE,
 	CGROUP_LSM_START,
 	CGROUP_LSM_END = CGROUP_LSM_START + CGROUP_LSM_NUM - 1,
diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h
index 31561e789715..98b8cea904fe 100644
--- a/include/linux/bpf-cgroup.h
+++ b/include/linux/bpf-cgroup.h
@@ -48,19 +48,24 @@ to_cgroup_bpf_attach_type(enum bpf_attach_type attach_type)
 	CGROUP_ATYPE(CGROUP_INET6_BIND);
 	CGROUP_ATYPE(CGROUP_INET4_CONNECT);
 	CGROUP_ATYPE(CGROUP_INET6_CONNECT);
+	CGROUP_ATYPE(CGROUP_UNIX_CONNECT);
 	CGROUP_ATYPE(CGROUP_INET4_POST_BIND);
 	CGROUP_ATYPE(CGROUP_INET6_POST_BIND);
 	CGROUP_ATYPE(CGROUP_UDP4_SENDMSG);
 	CGROUP_ATYPE(CGROUP_UDP6_SENDMSG);
+	CGROUP_ATYPE(CGROUP_UNIX_SENDMSG);
 	CGROUP_ATYPE(CGROUP_SYSCTL);
 	CGROUP_ATYPE(CGROUP_UDP4_RECVMSG);
 	CGROUP_ATYPE(CGROUP_UDP6_RECVMSG);
+	CGROUP_ATYPE(CGROUP_UNIX_RECVMSG);
 	CGROUP_ATYPE(CGROUP_GETSOCKOPT);
 	CGROUP_ATYPE(CGROUP_SETSOCKOPT);
 	CGROUP_ATYPE(CGROUP_INET4_GETPEERNAME);
 	CGROUP_ATYPE(CGROUP_INET6_GETPEERNAME);
+	CGROUP_ATYPE(CGROUP_UNIX_GETPEERNAME);
 	CGROUP_ATYPE(CGROUP_INET4_GETSOCKNAME);
 	CGROUP_ATYPE(CGROUP_INET6_GETSOCKNAME);
+	CGROUP_ATYPE(CGROUP_UNIX_GETSOCKNAME);
 	CGROUP_ATYPE(CGROUP_INET_SOCK_RELEASE);
 	default:
 		return CGROUP_BPF_ATTACH_TYPE_INVALID;
@@ -289,18 +294,27 @@ static inline bool cgroup_bpf_sock_enabled(struct sock *sk,
 #define BPF_CGROUP_RUN_PROG_INET6_CONNECT_LOCK(sk, uaddr, uaddrlen)		\
 	BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, uaddrlen, CGROUP_INET6_CONNECT, NULL)
 
+#define BPF_CGROUP_RUN_PROG_UNIX_CONNECT_LOCK(sk, uaddr, uaddrlen)		\
+	BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, uaddrlen, CGROUP_UNIX_CONNECT, NULL)
+
 #define BPF_CGROUP_RUN_PROG_UDP4_SENDMSG_LOCK(sk, uaddr, uaddrlen, t_ctx)	\
 	BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, uaddrlen, CGROUP_UDP4_SENDMSG, t_ctx)
 
 #define BPF_CGROUP_RUN_PROG_UDP6_SENDMSG_LOCK(sk, uaddr, uaddrlen, t_ctx)	\
 	BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, uaddrlen, CGROUP_UDP6_SENDMSG, t_ctx)
 
+#define BPF_CGROUP_RUN_PROG_UNIX_SENDMSG_LOCK(sk, uaddr, uaddrlen, t_ctx)	\
+	BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, uaddrlen, CGROUP_UNIX_SENDMSG, t_ctx)
+
 #define BPF_CGROUP_RUN_PROG_UDP4_RECVMSG_LOCK(sk, uaddr, uaddrlen)		\
 	BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, uaddrlen, CGROUP_UDP4_RECVMSG, NULL)
 
 #define BPF_CGROUP_RUN_PROG_UDP6_RECVMSG_LOCK(sk, uaddr, uaddrlen)		\
 	BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, uaddrlen, CGROUP_UDP6_RECVMSG, NULL)
 
+#define BPF_CGROUP_RUN_PROG_UNIX_RECVMSG_LOCK(sk, uaddr, uaddrlen)		\
+	BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, uaddrlen, CGROUP_UNIX_RECVMSG, NULL)
+
 /* The SOCK_OPS"_SK" macro should be used when sock_ops->sk is not a
  * fullsock and its parent fullsock cannot be traced by
  * sk_to_full_sk().
@@ -492,10 +506,13 @@ static inline int bpf_percpu_cgroup_storage_update(struct bpf_map *map,
 #define BPF_CGROUP_RUN_PROG_INET4_CONNECT_LOCK(sk, uaddr, uaddrlen) ({ 0; })
 #define BPF_CGROUP_RUN_PROG_INET6_CONNECT(sk, uaddr, uaddrlen) ({ 0; })
 #define BPF_CGROUP_RUN_PROG_INET6_CONNECT_LOCK(sk, uaddr, uaddrlen) ({ 0; })
+#define BPF_CGROUP_RUN_PROG_UNIX_CONNECT_LOCK(sk, uaddr, uaddrlen) ({ 0; })
 #define BPF_CGROUP_RUN_PROG_UDP4_SENDMSG_LOCK(sk, uaddr, uaddrlen, t_ctx) ({ 0; })
 #define BPF_CGROUP_RUN_PROG_UDP6_SENDMSG_LOCK(sk, uaddr, uaddrlen, t_ctx) ({ 0; })
+#define BPF_CGROUP_RUN_PROG_UNIX_SENDMSG_LOCK(sk, uaddr, uaddrlen, t_ctx) ({ 0; })
 #define BPF_CGROUP_RUN_PROG_UDP4_RECVMSG_LOCK(sk, uaddr, uaddrlen) ({ 0; })
 #define BPF_CGROUP_RUN_PROG_UDP6_RECVMSG_LOCK(sk, uaddr, uaddrlen) ({ 0; })
+#define BPF_CGROUP_RUN_PROG_UNIX_RECVMSG_LOCK(sk, uaddr, uaddrlen) ({ 0; })
 #define BPF_CGROUP_RUN_PROG_SOCK_OPS(sock_ops) ({ 0; })
 #define BPF_CGROUP_RUN_PROG_DEVICE_CGROUP(atype, major, minor, access) ({ 0; })
 #define BPF_CGROUP_RUN_PROG_SYSCTL(head,table,write,buf,count,pos) ({ 0; })
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index e0aa457f94a9..7ba61b75bc0e 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -1047,6 +1047,11 @@ enum bpf_attach_type {
 	BPF_TCX_INGRESS,
 	BPF_TCX_EGRESS,
 	BPF_TRACE_UPROBE_MULTI,
+	BPF_CGROUP_UNIX_CONNECT,
+	BPF_CGROUP_UNIX_SENDMSG,
+	BPF_CGROUP_UNIX_RECVMSG,
+	BPF_CGROUP_UNIX_GETPEERNAME,
+	BPF_CGROUP_UNIX_GETSOCKNAME,
 	__MAX_BPF_ATTACH_TYPE
 };
 
@@ -2704,8 +2709,8 @@ union bpf_attr {
  * 		*bpf_socket* should be one of the following:
  *
  * 		* **struct bpf_sock_ops** for **BPF_PROG_TYPE_SOCK_OPS**.
- * 		* **struct bpf_sock_addr** for **BPF_CGROUP_INET4_CONNECT**
- * 		  and **BPF_CGROUP_INET6_CONNECT**.
+ *		* **struct bpf_sock_addr** for **BPF_CGROUP_INET4_CONNECT**,
+ *		  **BPF_CGROUP_INET6_CONNECT** and **BPF_CGROUP_UNIX_CONNECT**.
  *
  * 		This helper actually implements a subset of **setsockopt()**.
  * 		It supports the following *level*\ s:
@@ -2943,8 +2948,8 @@ union bpf_attr {
  * 		*bpf_socket* should be one of the following:
  *
  * 		* **struct bpf_sock_ops** for **BPF_PROG_TYPE_SOCK_OPS**.
- * 		* **struct bpf_sock_addr** for **BPF_CGROUP_INET4_CONNECT**
- * 		  and **BPF_CGROUP_INET6_CONNECT**.
+ *		* **struct bpf_sock_addr** for **BPF_CGROUP_INET4_CONNECT**,
+ *		  **BPF_CGROUP_INET6_CONNECT** and **BPF_CGROUP_UNIX_CONNECT**.
  *
  * 		This helper actually implements a subset of **getsockopt()**.
  * 		It supports the same set of *optname*\ s that is supported by
diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c
index ac37bd53aee0..74ad2215e1ba 100644
--- a/kernel/bpf/cgroup.c
+++ b/kernel/bpf/cgroup.c
@@ -1458,7 +1458,7 @@ EXPORT_SYMBOL(__cgroup_bpf_run_filter_sk);
  * @flags: Pointer to u32 which contains higher bits of BPF program
  *         return value (OR'ed together).
  *
- * socket is expected to be of type INET or INET6.
+ * socket is expected to be of type INET, INET6 or UNIX.
  *
  * This function will return %-EPERM if an attached program is found and
  * returned value != 1 during execution. In all other cases, 0 is returned.
@@ -1482,7 +1482,8 @@ int __cgroup_bpf_run_filter_sock_addr(struct sock *sk,
 	/* Check socket family since not all sockets represent network
 	 * endpoint (e.g. AF_UNIX).
 	 */
-	if (sk->sk_family != AF_INET && sk->sk_family != AF_INET6)
+	if (sk->sk_family != AF_INET && sk->sk_family != AF_INET6 &&
+	    sk->sk_family != AF_UNIX)
 		return 0;
 
 	if (!ctx.uaddr) {
@@ -2533,10 +2534,13 @@ cgroup_common_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 		case BPF_CGROUP_SOCK_OPS:
 		case BPF_CGROUP_UDP4_RECVMSG:
 		case BPF_CGROUP_UDP6_RECVMSG:
+		case BPF_CGROUP_UNIX_RECVMSG:
 		case BPF_CGROUP_INET4_GETPEERNAME:
 		case BPF_CGROUP_INET6_GETPEERNAME:
+		case BPF_CGROUP_UNIX_GETPEERNAME:
 		case BPF_CGROUP_INET4_GETSOCKNAME:
 		case BPF_CGROUP_INET6_GETSOCKNAME:
+		case BPF_CGROUP_UNIX_GETSOCKNAME:
 			return NULL;
 		default:
 			return &bpf_get_retval_proto;
@@ -2548,10 +2552,13 @@ cgroup_common_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 		case BPF_CGROUP_SOCK_OPS:
 		case BPF_CGROUP_UDP4_RECVMSG:
 		case BPF_CGROUP_UDP6_RECVMSG:
+		case BPF_CGROUP_UNIX_RECVMSG:
 		case BPF_CGROUP_INET4_GETPEERNAME:
 		case BPF_CGROUP_INET6_GETPEERNAME:
+		case BPF_CGROUP_UNIX_GETPEERNAME:
 		case BPF_CGROUP_INET4_GETSOCKNAME:
 		case BPF_CGROUP_INET6_GETSOCKNAME:
+		case BPF_CGROUP_UNIX_GETSOCKNAME:
 			return NULL;
 		default:
 			return &bpf_set_retval_proto;
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 6b5280f14a53..8677837f3deb 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -2446,14 +2446,19 @@ bpf_prog_load_check_attach(enum bpf_prog_type prog_type,
 		case BPF_CGROUP_INET6_BIND:
 		case BPF_CGROUP_INET4_CONNECT:
 		case BPF_CGROUP_INET6_CONNECT:
+		case BPF_CGROUP_UNIX_CONNECT:
 		case BPF_CGROUP_INET4_GETPEERNAME:
 		case BPF_CGROUP_INET6_GETPEERNAME:
+		case BPF_CGROUP_UNIX_GETPEERNAME:
 		case BPF_CGROUP_INET4_GETSOCKNAME:
 		case BPF_CGROUP_INET6_GETSOCKNAME:
+		case BPF_CGROUP_UNIX_GETSOCKNAME:
 		case BPF_CGROUP_UDP4_SENDMSG:
 		case BPF_CGROUP_UDP6_SENDMSG:
+		case BPF_CGROUP_UNIX_SENDMSG:
 		case BPF_CGROUP_UDP4_RECVMSG:
 		case BPF_CGROUP_UDP6_RECVMSG:
+		case BPF_CGROUP_UNIX_RECVMSG:
 			return 0;
 		default:
 			return -EINVAL;
@@ -3678,14 +3683,19 @@ attach_type_to_prog_type(enum bpf_attach_type attach_type)
 	case BPF_CGROUP_INET6_BIND:
 	case BPF_CGROUP_INET4_CONNECT:
 	case BPF_CGROUP_INET6_CONNECT:
+	case BPF_CGROUP_UNIX_CONNECT:
 	case BPF_CGROUP_INET4_GETPEERNAME:
 	case BPF_CGROUP_INET6_GETPEERNAME:
+	case BPF_CGROUP_UNIX_GETPEERNAME:
 	case BPF_CGROUP_INET4_GETSOCKNAME:
 	case BPF_CGROUP_INET6_GETSOCKNAME:
+	case BPF_CGROUP_UNIX_GETSOCKNAME:
 	case BPF_CGROUP_UDP4_SENDMSG:
 	case BPF_CGROUP_UDP6_SENDMSG:
+	case BPF_CGROUP_UNIX_SENDMSG:
 	case BPF_CGROUP_UDP4_RECVMSG:
 	case BPF_CGROUP_UDP6_RECVMSG:
+	case BPF_CGROUP_UNIX_RECVMSG:
 		return BPF_PROG_TYPE_CGROUP_SOCK_ADDR;
 	case BPF_CGROUP_SOCK_OPS:
 		return BPF_PROG_TYPE_SOCK_OPS;
@@ -3942,14 +3952,19 @@ static int bpf_prog_query(const union bpf_attr *attr,
 	case BPF_CGROUP_INET6_POST_BIND:
 	case BPF_CGROUP_INET4_CONNECT:
 	case BPF_CGROUP_INET6_CONNECT:
+	case BPF_CGROUP_UNIX_CONNECT:
 	case BPF_CGROUP_INET4_GETPEERNAME:
 	case BPF_CGROUP_INET6_GETPEERNAME:
+	case BPF_CGROUP_UNIX_GETPEERNAME:
 	case BPF_CGROUP_INET4_GETSOCKNAME:
 	case BPF_CGROUP_INET6_GETSOCKNAME:
+	case BPF_CGROUP_UNIX_GETSOCKNAME:
 	case BPF_CGROUP_UDP4_SENDMSG:
 	case BPF_CGROUP_UDP6_SENDMSG:
+	case BPF_CGROUP_UNIX_SENDMSG:
 	case BPF_CGROUP_UDP4_RECVMSG:
 	case BPF_CGROUP_UDP6_RECVMSG:
+	case BPF_CGROUP_UNIX_RECVMSG:
 	case BPF_CGROUP_SOCK_OPS:
 	case BPF_CGROUP_DEVICE:
 	case BPF_CGROUP_SYSCTL:
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index eed7350e15f4..e777f50401b6 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -14797,10 +14797,13 @@ static int check_return_code(struct bpf_verifier_env *env, int regno)
 	case BPF_PROG_TYPE_CGROUP_SOCK_ADDR:
 		if (env->prog->expected_attach_type == BPF_CGROUP_UDP4_RECVMSG ||
 		    env->prog->expected_attach_type == BPF_CGROUP_UDP6_RECVMSG ||
+		    env->prog->expected_attach_type == BPF_CGROUP_UNIX_RECVMSG ||
 		    env->prog->expected_attach_type == BPF_CGROUP_INET4_GETPEERNAME ||
 		    env->prog->expected_attach_type == BPF_CGROUP_INET6_GETPEERNAME ||
+		    env->prog->expected_attach_type == BPF_CGROUP_UNIX_GETPEERNAME ||
 		    env->prog->expected_attach_type == BPF_CGROUP_INET4_GETSOCKNAME ||
-		    env->prog->expected_attach_type == BPF_CGROUP_INET6_GETSOCKNAME)
+		    env->prog->expected_attach_type == BPF_CGROUP_INET6_GETSOCKNAME ||
+		    env->prog->expected_attach_type == BPF_CGROUP_UNIX_GETSOCKNAME)
 			range = tnum_range(1, 1);
 		if (env->prog->expected_attach_type == BPF_CGROUP_INET4_BIND ||
 		    env->prog->expected_attach_type == BPF_CGROUP_INET6_BIND)
diff --git a/net/core/filter.c b/net/core/filter.c
index ff0bd9f20b95..cc2e4babc85f 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -7875,14 +7875,19 @@ sock_addr_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 		case BPF_CGROUP_INET6_BIND:
 		case BPF_CGROUP_INET4_CONNECT:
 		case BPF_CGROUP_INET6_CONNECT:
+		case BPF_CGROUP_UNIX_CONNECT:
 		case BPF_CGROUP_UDP4_RECVMSG:
 		case BPF_CGROUP_UDP6_RECVMSG:
+		case BPF_CGROUP_UNIX_RECVMSG:
 		case BPF_CGROUP_UDP4_SENDMSG:
 		case BPF_CGROUP_UDP6_SENDMSG:
+		case BPF_CGROUP_UNIX_SENDMSG:
 		case BPF_CGROUP_INET4_GETPEERNAME:
 		case BPF_CGROUP_INET6_GETPEERNAME:
+		case BPF_CGROUP_UNIX_GETPEERNAME:
 		case BPF_CGROUP_INET4_GETSOCKNAME:
 		case BPF_CGROUP_INET6_GETSOCKNAME:
+		case BPF_CGROUP_UNIX_GETSOCKNAME:
 			return &bpf_sock_addr_setsockopt_proto;
 		default:
 			return NULL;
@@ -7893,14 +7898,19 @@ sock_addr_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 		case BPF_CGROUP_INET6_BIND:
 		case BPF_CGROUP_INET4_CONNECT:
 		case BPF_CGROUP_INET6_CONNECT:
+		case BPF_CGROUP_UNIX_CONNECT:
 		case BPF_CGROUP_UDP4_RECVMSG:
 		case BPF_CGROUP_UDP6_RECVMSG:
+		case BPF_CGROUP_UNIX_RECVMSG:
 		case BPF_CGROUP_UDP4_SENDMSG:
 		case BPF_CGROUP_UDP6_SENDMSG:
+		case BPF_CGROUP_UNIX_SENDMSG:
 		case BPF_CGROUP_INET4_GETPEERNAME:
 		case BPF_CGROUP_INET6_GETPEERNAME:
+		case BPF_CGROUP_UNIX_GETPEERNAME:
 		case BPF_CGROUP_INET4_GETSOCKNAME:
 		case BPF_CGROUP_INET6_GETSOCKNAME:
+		case BPF_CGROUP_UNIX_GETSOCKNAME:
 			return &bpf_sock_addr_getsockopt_proto;
 		default:
 			return NULL;
@@ -8948,8 +8958,8 @@ static bool sock_addr_is_valid_access(int off, int size,
 	if (off % size != 0)
 		return false;
 
-	/* Disallow access to IPv6 fields from IPv4 contex and vise
-	 * versa.
+	/* Disallow access to fields not belonging to the attach type's address
+	 * family.
 	 */
 	switch (off) {
 	case bpf_ctx_range(struct bpf_sock_addr, user_ip4):
diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c
index 3e8a04a13668..e10d07c76044 100644
--- a/net/unix/af_unix.c
+++ b/net/unix/af_unix.c
@@ -116,6 +116,7 @@
 #include <linux/freezer.h>
 #include <linux/file.h>
 #include <linux/btf_ids.h>
+#include <linux/bpf-cgroup.h>
 
 #include "scm.h"
 
@@ -1381,6 +1382,10 @@ static int unix_dgram_connect(struct socket *sock, struct sockaddr *addr,
 		if (err)
 			goto out;
 
+		err = BPF_CGROUP_RUN_PROG_UNIX_CONNECT_LOCK(sk, addr, &alen);
+		if (err)
+			goto out;
+
 		if ((test_bit(SOCK_PASSCRED, &sock->flags) ||
 		     test_bit(SOCK_PASSPIDFD, &sock->flags)) &&
 		    !unix_sk(sk)->addr) {
@@ -1490,6 +1495,10 @@ static int unix_stream_connect(struct socket *sock, struct sockaddr *uaddr,
 	if (err)
 		goto out;
 
+	err = BPF_CGROUP_RUN_PROG_UNIX_CONNECT_LOCK(sk, uaddr, &addr_len);
+	if (err)
+		goto out;
+
 	if ((test_bit(SOCK_PASSCRED, &sock->flags) ||
 	     test_bit(SOCK_PASSPIDFD, &sock->flags)) && !u->addr) {
 		err = unix_autobind(sk);
@@ -1770,6 +1779,13 @@ static int unix_getname(struct socket *sock, struct sockaddr *uaddr, int peer)
 	} else {
 		err = addr->len;
 		memcpy(sunaddr, addr->name, addr->len);
+
+		if (peer)
+			BPF_CGROUP_RUN_SA_PROG(sk, uaddr, &err,
+					       CGROUP_UNIX_GETPEERNAME);
+		else
+			BPF_CGROUP_RUN_SA_PROG(sk, uaddr, &err,
+					       CGROUP_UNIX_GETSOCKNAME);
 	}
 	sock_put(sk);
 out:
@@ -1922,6 +1938,13 @@ static int unix_dgram_sendmsg(struct socket *sock, struct msghdr *msg,
 		err = unix_validate_addr(sunaddr, msg->msg_namelen);
 		if (err)
 			goto out;
+
+		err = BPF_CGROUP_RUN_PROG_UNIX_SENDMSG_LOCK(sk,
+							    msg->msg_name,
+							    &msg->msg_namelen,
+							    NULL);
+		if (err)
+			goto out;
 	} else {
 		sunaddr = NULL;
 		err = -ENOTCONN;
@@ -2390,9 +2413,14 @@ int __unix_dgram_recvmsg(struct sock *sk, struct msghdr *msg, size_t size,
 						EPOLLOUT | EPOLLWRNORM |
 						EPOLLWRBAND);
 
-	if (msg->msg_name)
+	if (msg->msg_name) {
 		unix_copy_addr(msg, skb->sk);
 
+		BPF_CGROUP_RUN_PROG_UNIX_RECVMSG_LOCK(sk,
+						      msg->msg_name,
+						      &msg->msg_namelen);
+	}
+
 	if (size > skb->len - skip)
 		size = skb->len - skip;
 	else if (size < skb->len - skip)
@@ -2744,6 +2772,11 @@ unlock:
 			DECLARE_SOCKADDR(struct sockaddr_un *, sunaddr,
 					 state->msg->msg_name);
 			unix_copy_addr(state->msg, skb->sk);
+
+			BPF_CGROUP_RUN_PROG_UNIX_RECVMSG_LOCK(sk,
+							      state->msg->msg_name,
+							      &state->msg->msg_namelen);
+
 			sunaddr = NULL;
 		}
 
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index e0aa457f94a9..7ba61b75bc0e 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -1047,6 +1047,11 @@ enum bpf_attach_type {
 	BPF_TCX_INGRESS,
 	BPF_TCX_EGRESS,
 	BPF_TRACE_UPROBE_MULTI,
+	BPF_CGROUP_UNIX_CONNECT,
+	BPF_CGROUP_UNIX_SENDMSG,
+	BPF_CGROUP_UNIX_RECVMSG,
+	BPF_CGROUP_UNIX_GETPEERNAME,
+	BPF_CGROUP_UNIX_GETSOCKNAME,
 	__MAX_BPF_ATTACH_TYPE
 };
 
@@ -2704,8 +2709,8 @@ union bpf_attr {
  * 		*bpf_socket* should be one of the following:
  *
  * 		* **struct bpf_sock_ops** for **BPF_PROG_TYPE_SOCK_OPS**.
- * 		* **struct bpf_sock_addr** for **BPF_CGROUP_INET4_CONNECT**
- * 		  and **BPF_CGROUP_INET6_CONNECT**.
+ *		* **struct bpf_sock_addr** for **BPF_CGROUP_INET4_CONNECT**,
+ *		  **BPF_CGROUP_INET6_CONNECT** and **BPF_CGROUP_UNIX_CONNECT**.
  *
  * 		This helper actually implements a subset of **setsockopt()**.
  * 		It supports the following *level*\ s:
@@ -2943,8 +2948,8 @@ union bpf_attr {
  * 		*bpf_socket* should be one of the following:
  *
  * 		* **struct bpf_sock_ops** for **BPF_PROG_TYPE_SOCK_OPS**.
- * 		* **struct bpf_sock_addr** for **BPF_CGROUP_INET4_CONNECT**
- * 		  and **BPF_CGROUP_INET6_CONNECT**.
+ *		* **struct bpf_sock_addr** for **BPF_CGROUP_INET4_CONNECT**,
+ *		  **BPF_CGROUP_INET6_CONNECT** and **BPF_CGROUP_UNIX_CONNECT**.
  *
  * 		This helper actually implements a subset of **getsockopt()**.
  * 		It supports the same set of *optname*\ s that is supported by
-- 
cgit v1.2.3


From 0a779003213b589d7b3eef72f69e19a30f603ebc Mon Sep 17 00:00:00 2001
From: Christian Marangi <ansuelsmth@gmail.com>
Date: Mon, 9 Oct 2023 15:37:51 +0200
Subject: netdev: make napi_schedule return bool on NAPI successful schedule

Change napi_schedule to return a bool on NAPI successful schedule.
This might be useful for some driver to do additional steps after a
NAPI has been scheduled.

Suggested-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: Christian Marangi <ansuelsmth@gmail.com>
Reviewed-by: Eric Dumazet <edumazet@google.com>
Link: https://lore.kernel.org/r/20231009133754.9834-2-ansuelsmth@gmail.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/linux/netdevice.h | 11 +++++++++--
 1 file changed, 9 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 11d704bfec9b..2874770a0a74 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -490,11 +490,18 @@ bool napi_schedule_prep(struct napi_struct *n);
  *
  * Schedule NAPI poll routine to be called if it is not already
  * running.
+ * Return true if we schedule a NAPI or false if not.
+ * Refer to napi_schedule_prep() for additional reason on why
+ * a NAPI might not be scheduled.
  */
-static inline void napi_schedule(struct napi_struct *n)
+static inline bool napi_schedule(struct napi_struct *n)
 {
-	if (napi_schedule_prep(n))
+	if (napi_schedule_prep(n)) {
 		__napi_schedule(n);
+		return true;
+	}
+
+	return false;
 }
 
 /**
-- 
cgit v1.2.3


From 73382e919f3d938554dadd01d95760f90d1c25c1 Mon Sep 17 00:00:00 2001
From: Christian Marangi <ansuelsmth@gmail.com>
Date: Mon, 9 Oct 2023 15:37:52 +0200
Subject: netdev: replace napi_reschedule with napi_schedule

Now that napi_schedule return a bool, we can drop napi_reschedule that
does the same exact function. The function comes from a very old commit
bfe13f54f502 ("ibm_emac: Convert to use napi_struct independent of struct
net_device") and the purpose is actually deprecated in favour of
different logic.

Convert every user of napi_reschedule to napi_schedule.

Signed-off-by: Christian Marangi <ansuelsmth@gmail.com>
Acked-by: Jeff Johnson <quic_jjohnson@quicinc.com> # ath10k
Acked-by: Nick Child <nnac123@linux.ibm.com> # ibm
Acked-by: Marc Kleine-Budde <mkl@pengutronix.de> # for can/dev/rx-offload.c
Reviewed-by: Eric Dumazet <edumazet@google.com>
Acked-by: Tariq Toukan <tariqt@nvidia.com>
Link: https://lore.kernel.org/r/20231009133754.9834-3-ansuelsmth@gmail.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 drivers/infiniband/ulp/ipoib/ipoib_ib.c                |  4 ++--
 drivers/net/can/dev/rx-offload.c                       |  2 +-
 drivers/net/ethernet/chelsio/cxgb4/sge.c               |  2 +-
 drivers/net/ethernet/chelsio/cxgb4vf/sge.c             |  2 +-
 drivers/net/ethernet/ezchip/nps_enet.c                 |  2 +-
 drivers/net/ethernet/google/gve/gve_main.c             |  2 +-
 drivers/net/ethernet/ibm/ehea/ehea_main.c              |  2 +-
 drivers/net/ethernet/ibm/emac/mal.c                    |  2 +-
 drivers/net/ethernet/ibm/ibmveth.c                     |  2 +-
 drivers/net/ethernet/ibm/ibmvnic.c                     |  2 +-
 drivers/net/ethernet/mellanox/mlx4/en_rx.c             |  2 +-
 drivers/net/ethernet/ni/nixge.c                        |  2 +-
 drivers/net/ethernet/stmicro/stmmac/stmmac_selftests.c |  2 +-
 drivers/net/ethernet/xscale/ixp4xx_eth.c               |  4 ++--
 drivers/net/fjes/fjes_main.c                           |  2 +-
 drivers/net/wan/ixp4xx_hss.c                           |  4 ++--
 drivers/net/wireless/ath/ath10k/pci.c                  |  2 +-
 drivers/net/wwan/t7xx/t7xx_hif_dpmaif_rx.c             |  2 +-
 include/linux/netdevice.h                              | 10 ----------
 19 files changed, 21 insertions(+), 31 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/infiniband/ulp/ipoib/ipoib_ib.c b/drivers/infiniband/ulp/ipoib/ipoib_ib.c
index ed25061fac62..7f84d9866cef 100644
--- a/drivers/infiniband/ulp/ipoib/ipoib_ib.c
+++ b/drivers/infiniband/ulp/ipoib/ipoib_ib.c
@@ -488,7 +488,7 @@ poll_more:
 		if (unlikely(ib_req_notify_cq(priv->recv_cq,
 					      IB_CQ_NEXT_COMP |
 					      IB_CQ_REPORT_MISSED_EVENTS)) &&
-		    napi_reschedule(napi))
+		    napi_schedule(napi))
 			goto poll_more;
 	}
 
@@ -518,7 +518,7 @@ poll_more:
 		napi_complete(napi);
 		if (unlikely(ib_req_notify_cq(priv->send_cq, IB_CQ_NEXT_COMP |
 					      IB_CQ_REPORT_MISSED_EVENTS)) &&
-		    napi_reschedule(napi))
+		    napi_schedule(napi))
 			goto poll_more;
 	}
 	return n < 0 ? 0 : n;
diff --git a/drivers/net/can/dev/rx-offload.c b/drivers/net/can/dev/rx-offload.c
index 77091f7d1fa7..46e7b6db4a1e 100644
--- a/drivers/net/can/dev/rx-offload.c
+++ b/drivers/net/can/dev/rx-offload.c
@@ -67,7 +67,7 @@ static int can_rx_offload_napi_poll(struct napi_struct *napi, int quota)
 
 		/* Check if there was another interrupt */
 		if (!skb_queue_empty(&offload->skb_queue))
-			napi_reschedule(&offload->napi);
+			napi_schedule(&offload->napi);
 	}
 
 	return work_done;
diff --git a/drivers/net/ethernet/chelsio/cxgb4/sge.c b/drivers/net/ethernet/chelsio/cxgb4/sge.c
index 98dd78551d89..b5ff2e1a9975 100644
--- a/drivers/net/ethernet/chelsio/cxgb4/sge.c
+++ b/drivers/net/ethernet/chelsio/cxgb4/sge.c
@@ -4261,7 +4261,7 @@ static void sge_rx_timer_cb(struct timer_list *t)
 
 			if (fl_starving(adap, fl)) {
 				rxq = container_of(fl, struct sge_eth_rxq, fl);
-				if (napi_reschedule(&rxq->rspq.napi))
+				if (napi_schedule(&rxq->rspq.napi))
 					fl->starving++;
 				else
 					set_bit(id, s->starving_fl);
diff --git a/drivers/net/ethernet/chelsio/cxgb4vf/sge.c b/drivers/net/ethernet/chelsio/cxgb4vf/sge.c
index 2d0cf76fb3c5..5b1d746e6563 100644
--- a/drivers/net/ethernet/chelsio/cxgb4vf/sge.c
+++ b/drivers/net/ethernet/chelsio/cxgb4vf/sge.c
@@ -2094,7 +2094,7 @@ static void sge_rx_timer_cb(struct timer_list *t)
 				struct sge_eth_rxq *rxq;
 
 				rxq = container_of(fl, struct sge_eth_rxq, fl);
-				if (napi_reschedule(&rxq->rspq.napi))
+				if (napi_schedule(&rxq->rspq.napi))
 					fl->starving++;
 				else
 					set_bit(id, s->starving_fl);
diff --git a/drivers/net/ethernet/ezchip/nps_enet.c b/drivers/net/ethernet/ezchip/nps_enet.c
index edf000e7bab4..4d7184d46824 100644
--- a/drivers/net/ethernet/ezchip/nps_enet.c
+++ b/drivers/net/ethernet/ezchip/nps_enet.c
@@ -198,7 +198,7 @@ static int nps_enet_poll(struct napi_struct *napi, int budget)
 		 */
 		if (nps_enet_is_tx_pending(priv)) {
 			nps_enet_reg_set(priv, NPS_ENET_REG_BUF_INT_ENABLE, 0);
-			napi_reschedule(napi);
+			napi_schedule(napi);
 		}
 	}
 
diff --git a/drivers/net/ethernet/google/gve/gve_main.c b/drivers/net/ethernet/google/gve/gve_main.c
index 83b09dcfafc4..276f996f95dc 100644
--- a/drivers/net/ethernet/google/gve/gve_main.c
+++ b/drivers/net/ethernet/google/gve/gve_main.c
@@ -281,7 +281,7 @@ static int gve_napi_poll(struct napi_struct *napi, int budget)
 		if (block->rx)
 			reschedule |= gve_rx_work_pending(block->rx);
 
-		if (reschedule && napi_reschedule(napi))
+		if (reschedule && napi_schedule(napi))
 			iowrite32be(GVE_IRQ_MASK, irq_doorbell);
 	}
 	return work_done;
diff --git a/drivers/net/ethernet/ibm/ehea/ehea_main.c b/drivers/net/ethernet/ibm/ehea/ehea_main.c
index 251dedd55cfb..1e29e5c9a2df 100644
--- a/drivers/net/ethernet/ibm/ehea/ehea_main.c
+++ b/drivers/net/ethernet/ibm/ehea/ehea_main.c
@@ -900,7 +900,7 @@ static int ehea_poll(struct napi_struct *napi, int budget)
 		if (!cqe && !cqe_skb)
 			return rx;
 
-		if (!napi_reschedule(napi))
+		if (!napi_schedule(napi))
 			return rx;
 
 		cqe_skb = ehea_proc_cqes(pr, EHEA_POLL_MAX_CQES);
diff --git a/drivers/net/ethernet/ibm/emac/mal.c b/drivers/net/ethernet/ibm/emac/mal.c
index 462646d1b817..2439f7e96e05 100644
--- a/drivers/net/ethernet/ibm/emac/mal.c
+++ b/drivers/net/ethernet/ibm/emac/mal.c
@@ -442,7 +442,7 @@ static int mal_poll(struct napi_struct *napi, int budget)
 		if (unlikely(mc->ops->peek_rx(mc->dev) ||
 			     test_bit(MAL_COMMAC_RX_STOPPED, &mc->flags))) {
 			MAL_DBG2(mal, "rotting packet" NL);
-			if (!napi_reschedule(napi))
+			if (!napi_schedule(napi))
 				goto more_work;
 
 			spin_lock_irqsave(&mal->lock, flags);
diff --git a/drivers/net/ethernet/ibm/ibmveth.c b/drivers/net/ethernet/ibm/ibmveth.c
index a8d79ee350f8..b5aef0b29efe 100644
--- a/drivers/net/ethernet/ibm/ibmveth.c
+++ b/drivers/net/ethernet/ibm/ibmveth.c
@@ -1432,7 +1432,7 @@ static int ibmveth_poll(struct napi_struct *napi, int budget)
 		BUG_ON(lpar_rc != H_SUCCESS);
 
 		if (ibmveth_rxq_pending_buffer(adapter) &&
-		    napi_reschedule(napi)) {
+		    napi_schedule(napi)) {
 			lpar_rc = h_vio_signal(adapter->vdev->unit_address,
 					       VIO_IRQ_DISABLE);
 		}
diff --git a/drivers/net/ethernet/ibm/ibmvnic.c b/drivers/net/ethernet/ibm/ibmvnic.c
index cdf5251e5679..2094f413cbe4 100644
--- a/drivers/net/ethernet/ibm/ibmvnic.c
+++ b/drivers/net/ethernet/ibm/ibmvnic.c
@@ -3519,7 +3519,7 @@ restart_poll:
 		if (napi_complete_done(napi, frames_processed)) {
 			enable_scrq_irq(adapter, rx_scrq);
 			if (pending_scrq(adapter, rx_scrq)) {
-				if (napi_reschedule(napi)) {
+				if (napi_schedule(napi)) {
 					disable_scrq_irq(adapter, rx_scrq);
 					goto restart_poll;
 				}
diff --git a/drivers/net/ethernet/mellanox/mlx4/en_rx.c b/drivers/net/ethernet/mellanox/mlx4/en_rx.c
index 332472fe4990..a09b6e05337d 100644
--- a/drivers/net/ethernet/mellanox/mlx4/en_rx.c
+++ b/drivers/net/ethernet/mellanox/mlx4/en_rx.c
@@ -400,7 +400,7 @@ void mlx4_en_recover_from_oom(struct mlx4_en_priv *priv)
 	for (ring = 0; ring < priv->rx_ring_num; ring++) {
 		if (mlx4_en_is_ring_empty(priv->rx_ring[ring])) {
 			local_bh_disable();
-			napi_reschedule(&priv->rx_cq[ring]->napi);
+			napi_schedule(&priv->rx_cq[ring]->napi);
 			local_bh_enable();
 		}
 	}
diff --git a/drivers/net/ethernet/ni/nixge.c b/drivers/net/ethernet/ni/nixge.c
index f71a4f8bbb89..fa1f78b03cb2 100644
--- a/drivers/net/ethernet/ni/nixge.c
+++ b/drivers/net/ethernet/ni/nixge.c
@@ -683,7 +683,7 @@ static int nixge_poll(struct napi_struct *napi, int budget)
 		if (status & (XAXIDMA_IRQ_IOC_MASK | XAXIDMA_IRQ_DELAY_MASK)) {
 			/* If there's more, reschedule, but clear */
 			nixge_dma_write_reg(priv, XAXIDMA_RX_SR_OFFSET, status);
-			napi_reschedule(napi);
+			napi_schedule(napi);
 		} else {
 			/* if not, turn on RX IRQs again ... */
 			cr = nixge_dma_read_reg(priv, XAXIDMA_RX_CR_OFFSET);
diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_selftests.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_selftests.c
index f9e43fc32ee8..3ca1c2a816ff 100644
--- a/drivers/net/ethernet/stmicro/stmmac/stmmac_selftests.c
+++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_selftests.c
@@ -802,7 +802,7 @@ static int stmmac_test_flowctrl(struct stmmac_priv *priv)
 		stmmac_start_rx(priv, priv->ioaddr, i);
 
 		local_bh_disable();
-		napi_reschedule(&ch->rx_napi);
+		napi_schedule(&ch->rx_napi);
 		local_bh_enable();
 	}
 
diff --git a/drivers/net/ethernet/xscale/ixp4xx_eth.c b/drivers/net/ethernet/xscale/ixp4xx_eth.c
index c47108f2a0a0..531bf919aef5 100644
--- a/drivers/net/ethernet/xscale/ixp4xx_eth.c
+++ b/drivers/net/ethernet/xscale/ixp4xx_eth.c
@@ -723,9 +723,9 @@ static int eth_poll(struct napi_struct *napi, int budget)
 			napi_complete(napi);
 			qmgr_enable_irq(rxq);
 			if (!qmgr_stat_below_low_watermark(rxq) &&
-			    napi_reschedule(napi)) { /* not empty again */
+			    napi_schedule(napi)) { /* not empty again */
 #if DEBUG_RX
-				netdev_debug(dev, "eth_poll napi_reschedule succeeded\n");
+				netdev_debug(dev, "eth_poll napi_schedule succeeded\n");
 #endif
 				qmgr_disable_irq(rxq);
 				continue;
diff --git a/drivers/net/fjes/fjes_main.c b/drivers/net/fjes/fjes_main.c
index 2513be6d4e11..cd8cf08477ec 100644
--- a/drivers/net/fjes/fjes_main.c
+++ b/drivers/net/fjes/fjes_main.c
@@ -1030,7 +1030,7 @@ static int fjes_poll(struct napi_struct *napi, int budget)
 		}
 
 		if (((long)jiffies - (long)adapter->rx_last_jiffies) < 3) {
-			napi_reschedule(napi);
+			napi_schedule(napi);
 		} else {
 			spin_lock(&hw->rx_status_lock);
 			for (epidx = 0; epidx < hw->max_epid; epidx++) {
diff --git a/drivers/net/wan/ixp4xx_hss.c b/drivers/net/wan/ixp4xx_hss.c
index e46b7f5ee49e..b09f4c235142 100644
--- a/drivers/net/wan/ixp4xx_hss.c
+++ b/drivers/net/wan/ixp4xx_hss.c
@@ -687,10 +687,10 @@ static int hss_hdlc_poll(struct napi_struct *napi, int budget)
 			napi_complete(napi);
 			qmgr_enable_irq(rxq);
 			if (!qmgr_stat_empty(rxq) &&
-			    napi_reschedule(napi)) {
+			    napi_schedule(napi)) {
 #if DEBUG_RX
 				printk(KERN_DEBUG "%s: hss_hdlc_poll"
-				       " napi_reschedule succeeded\n",
+				       " napi_schedule succeeded\n",
 				       dev->name);
 #endif
 				qmgr_disable_irq(rxq);
diff --git a/drivers/net/wireless/ath/ath10k/pci.c b/drivers/net/wireless/ath/ath10k/pci.c
index 23f366221939..2f8c785277af 100644
--- a/drivers/net/wireless/ath/ath10k/pci.c
+++ b/drivers/net/wireless/ath/ath10k/pci.c
@@ -3148,7 +3148,7 @@ static int ath10k_pci_napi_poll(struct napi_struct *ctx, int budget)
 		 * immediate servicing.
 		 */
 		if (ath10k_ce_interrupt_summary(ar)) {
-			napi_reschedule(ctx);
+			napi_schedule(ctx);
 			goto out;
 		}
 		ath10k_pci_enable_legacy_irq(ar);
diff --git a/drivers/net/wwan/t7xx/t7xx_hif_dpmaif_rx.c b/drivers/net/wwan/t7xx/t7xx_hif_dpmaif_rx.c
index f4ff2198b5ef..210d84c67ef9 100644
--- a/drivers/net/wwan/t7xx/t7xx_hif_dpmaif_rx.c
+++ b/drivers/net/wwan/t7xx/t7xx_hif_dpmaif_rx.c
@@ -852,7 +852,7 @@ int t7xx_dpmaif_napi_rx_poll(struct napi_struct *napi, const int budget)
 	if (!ret) {
 		napi_complete_done(napi, work_done);
 		rxq->sleep_lock_pending = true;
-		napi_reschedule(napi);
+		napi_schedule(napi);
 		return work_done;
 	}
 
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 2874770a0a74..ae553f886796 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -516,16 +516,6 @@ static inline void napi_schedule_irqoff(struct napi_struct *n)
 		__napi_schedule_irqoff(n);
 }
 
-/* Try to reschedule poll. Called by dev->poll() after napi_complete().  */
-static inline bool napi_reschedule(struct napi_struct *napi)
-{
-	if (napi_schedule_prep(napi)) {
-		__napi_schedule(napi);
-		return true;
-	}
-	return false;
-}
-
 /**
  * napi_complete_done - NAPI processing complete
  * @n: NAPI context
-- 
cgit v1.2.3


From b9f29205c0182a2059b4dfa2883db5ef423574d4 Mon Sep 17 00:00:00 2001
From: Randy Dunlap <rdunlap@infradead.org>
Date: Sat, 30 Sep 2023 15:14:27 -0700
Subject: iosys-map: fix kernel-doc typos

Correct spelling of "beginning".

Signed-off-by: Randy Dunlap <rdunlap@infradead.org>
Cc: Thomas Zimmermann <tzimmermann@suse.de>
Cc: dri-devel@lists.freedesktop.org
Reviewed-by: Thomas Zimmermann <tzimmermann@suse.de>
Signed-off-by: Thomas Zimmermann <tzimmermann@suse.de>
Link: https://patchwork.freedesktop.org/patch/msgid/20230930221428.18463-2-rdunlap@infradead.org
---
 include/linux/iosys-map.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/iosys-map.h b/include/linux/iosys-map.h
index cb71aa616bd3..1b06d074ade0 100644
--- a/include/linux/iosys-map.h
+++ b/include/linux/iosys-map.h
@@ -426,7 +426,7 @@ static inline void iosys_map_memset(struct iosys_map *dst, size_t offset,
  * iosys_map_rd_field - Read a member from a struct in the iosys_map
  *
  * @map__:		The iosys_map structure
- * @struct_offset__:	Offset from the beggining of the map, where the struct
+ * @struct_offset__:	Offset from the beginning of the map, where the struct
  *			is located
  * @struct_type__:	The struct describing the layout of the mapping
  * @field__:		Member of the struct to read
@@ -494,7 +494,7 @@ static inline void iosys_map_memset(struct iosys_map *dst, size_t offset,
  * iosys_map_wr_field - Write to a member of a struct in the iosys_map
  *
  * @map__:		The iosys_map structure
- * @struct_offset__:	Offset from the beggining of the map, where the struct
+ * @struct_offset__:	Offset from the beginning of the map, where the struct
  *			is located
  * @struct_type__:	The struct describing the layout of the mapping
  * @field__:		Member of the struct to read
-- 
cgit v1.2.3


From 668706b10c9b8181a53bd8881a77bb81b328ab33 Mon Sep 17 00:00:00 2001
From: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Date: Wed, 11 Oct 2023 23:26:35 +0300
Subject: gpiolib: provide gpio_device_find_by_fwnode()

One of the ways of looking up GPIO devices is using their fwnode.
Provide a helper for that to avoid every user implementing their
own matching function.

Reviewed-by: Dipen Patel <dipenp@nvidia.com>
Tested-by: Dipen Patel <dipenp@nvidia.com>
Reviewed-by: Linus Walleij <linus.walleij@linaro.org>
Link: https://lore.kernel.org/r/20231010151709.4104747-2-andriy.shevchenko@linux.intel.com
Signed-off-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Signed-off-by: Bartosz Golaszewski <bartosz.golaszewski@linaro.org>
---
 drivers/gpio/gpiolib.c      | 20 ++++++++++++++++++++
 include/linux/gpio/driver.h |  1 +
 2 files changed, 21 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/gpio/gpiolib.c b/drivers/gpio/gpiolib.c
index 191f9c87b4d0..dd18dd56c33c 100644
--- a/drivers/gpio/gpiolib.c
+++ b/drivers/gpio/gpiolib.c
@@ -1145,6 +1145,26 @@ struct gpio_device *gpio_device_find_by_label(const char *label)
 }
 EXPORT_SYMBOL_GPL(gpio_device_find_by_label);
 
+static int gpio_chip_match_by_fwnode(struct gpio_chip *gc, void *fwnode)
+{
+	return device_match_fwnode(&gc->gpiodev->dev, fwnode);
+}
+
+/**
+ * gpio_device_find_by_fwnode() - wrapper around gpio_device_find() finding
+ *                                the GPIO device by its fwnode
+ * @fwnode: Firmware node to lookup
+ *
+ * Returns:
+ * Reference to the GPIO device or NULL. Reference must be released with
+ * gpio_device_put().
+ */
+struct gpio_device *gpio_device_find_by_fwnode(const struct fwnode_handle *fwnode)
+{
+	return gpio_device_find((void *)fwnode, gpio_chip_match_by_fwnode);
+}
+EXPORT_SYMBOL_GPL(gpio_device_find_by_fwnode);
+
 /**
  * gpio_device_get() - Increase the reference count of this GPIO device
  * @gdev: GPIO device to increase the refcount for
diff --git a/include/linux/gpio/driver.h b/include/linux/gpio/driver.h
index f8ad7f40100c..ae4162d3f1d3 100644
--- a/include/linux/gpio/driver.h
+++ b/include/linux/gpio/driver.h
@@ -611,6 +611,7 @@ struct gpio_chip *gpiochip_find(void *data,
 struct gpio_device *gpio_device_find(void *data,
 				int (*match)(struct gpio_chip *gc, void *data));
 struct gpio_device *gpio_device_find_by_label(const char *label);
+struct gpio_device *gpio_device_find_by_fwnode(const struct fwnode_handle *fwnode);
 
 struct gpio_device *gpio_device_get(struct gpio_device *gdev);
 void gpio_device_put(struct gpio_device *gdev);
-- 
cgit v1.2.3


From e001d1447cd4585d7f23a44ff668ba2bc624badb Mon Sep 17 00:00:00 2001
From: Amir Goldstein <amir73il@gmail.com>
Date: Thu, 12 Oct 2023 15:24:17 +0300
Subject: fs: factor out vfs_parse_monolithic_sep() helper

Factor out vfs_parse_monolithic_sep() from generic_parse_monolithic(),
so filesystems could use it with a custom option separator callback.

Acked-by: Christian Brauner <brauner@kernel.org>
Signed-off-by: Amir Goldstein <amir73il@gmail.com>
---
 fs/fs_context.c            | 34 +++++++++++++++++++++++++++++-----
 include/linux/fs_context.h |  2 ++
 2 files changed, 31 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/fs/fs_context.c b/fs/fs_context.c
index a0ad7a0c4680..98589aae5208 100644
--- a/fs/fs_context.c
+++ b/fs/fs_context.c
@@ -192,17 +192,19 @@ int vfs_parse_fs_string(struct fs_context *fc, const char *key,
 EXPORT_SYMBOL(vfs_parse_fs_string);
 
 /**
- * generic_parse_monolithic - Parse key[=val][,key[=val]]* mount data
+ * vfs_parse_monolithic_sep - Parse key[=val][,key[=val]]* mount data
  * @fc: The superblock configuration to fill in.
  * @data: The data to parse
+ * @sep: callback for separating next option
  *
- * Parse a blob of data that's in key[=val][,key[=val]]* form.  This can be
- * called from the ->monolithic_mount_data() fs_context operation.
+ * Parse a blob of data that's in key[=val][,key[=val]]* form with a custom
+ * option separator callback.
  *
  * Returns 0 on success or the error returned by the ->parse_option() fs_context
  * operation on failure.
  */
-int generic_parse_monolithic(struct fs_context *fc, void *data)
+int vfs_parse_monolithic_sep(struct fs_context *fc, void *data,
+			     char *(*sep)(char **))
 {
 	char *options = data, *key;
 	int ret = 0;
@@ -214,7 +216,7 @@ int generic_parse_monolithic(struct fs_context *fc, void *data)
 	if (ret)
 		return ret;
 
-	while ((key = strsep(&options, ",")) != NULL) {
+	while ((key = sep(&options)) != NULL) {
 		if (*key) {
 			size_t v_len = 0;
 			char *value = strchr(key, '=');
@@ -233,6 +235,28 @@ int generic_parse_monolithic(struct fs_context *fc, void *data)
 
 	return ret;
 }
+EXPORT_SYMBOL(vfs_parse_monolithic_sep);
+
+static char *vfs_parse_comma_sep(char **s)
+{
+	return strsep(s, ",");
+}
+
+/**
+ * generic_parse_monolithic - Parse key[=val][,key[=val]]* mount data
+ * @fc: The superblock configuration to fill in.
+ * @data: The data to parse
+ *
+ * Parse a blob of data that's in key[=val][,key[=val]]* form.  This can be
+ * called from the ->monolithic_mount_data() fs_context operation.
+ *
+ * Returns 0 on success or the error returned by the ->parse_option() fs_context
+ * operation on failure.
+ */
+int generic_parse_monolithic(struct fs_context *fc, void *data)
+{
+	return vfs_parse_monolithic_sep(fc, data, vfs_parse_comma_sep);
+}
 EXPORT_SYMBOL(generic_parse_monolithic);
 
 /**
diff --git a/include/linux/fs_context.h b/include/linux/fs_context.h
index 96332db693d5..c13e99cbbf81 100644
--- a/include/linux/fs_context.h
+++ b/include/linux/fs_context.h
@@ -136,6 +136,8 @@ extern struct fs_context *vfs_dup_fs_context(struct fs_context *fc);
 extern int vfs_parse_fs_param(struct fs_context *fc, struct fs_parameter *param);
 extern int vfs_parse_fs_string(struct fs_context *fc, const char *key,
 			       const char *value, size_t v_size);
+int vfs_parse_monolithic_sep(struct fs_context *fc, void *data,
+			     char *(*sep)(char **));
 extern int generic_parse_monolithic(struct fs_context *fc, void *data);
 extern int vfs_get_tree(struct fs_context *fc);
 extern void put_fs_context(struct fs_context *fc);
-- 
cgit v1.2.3


From 13cc9ee8f8ed58e563294d87d74a62006be40f21 Mon Sep 17 00:00:00 2001
From: Waiman Long <longman@redhat.com>
Date: Thu, 12 Oct 2023 13:09:02 -0400
Subject: cgroup: Fix incorrect css_set_rwsem reference in comment

Since commit f0d9a5f17575 ("cgroup: make css_set_rwsem a spinlock
and rename it to css_set_lock"), css_set_rwsem has been replaced by
css_set_lock. That commit, however, missed the css_set_rwsem reference
in include/linux/cgroup-defs.h. Fix that by changing it to css_set_lock
as well.

Signed-off-by: Waiman Long <longman@redhat.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
---
 include/linux/cgroup-defs.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h
index f1b3151ac30b..265da00a1a8b 100644
--- a/include/linux/cgroup-defs.h
+++ b/include/linux/cgroup-defs.h
@@ -238,7 +238,7 @@ struct css_set {
 	 * Lists running through all tasks using this cgroup group.
 	 * mg_tasks lists tasks which belong to this cset but are in the
 	 * process of being migrated out or in.  Protected by
-	 * css_set_rwsem, but, during migration, once tasks are moved to
+	 * css_set_lock, but, during migration, once tasks are moved to
 	 * mg_tasks, it can be read safely while holding cgroup_mutex.
 	 */
 	struct list_head tasks;
-- 
cgit v1.2.3


From f06cc667f79909e9175460b167c277b7c64d3df0 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Mon, 9 Oct 2023 23:04:25 +0200
Subject: perf: Optimize perf_cgroup_switch()

Namhyung reported that bd2756811766 ("perf: Rewrite core context handling")
regresses context switch overhead when perf-cgroup is in use together
with 'slow' PMUs like uncore.

Specifically, perf_cgroup_switch()'s perf_ctx_disable() /
ctx_sched_out() etc.. all iterate the full list of active PMUs for
that CPU, even if they don't have cgroup events.

Previously there was cgrp_cpuctx_list which linked the relevant PMUs
together, but that got lost in the rework. Instead of re-instruducing
a similar list, let the perf_event_pmu_context iteration skip those
that do not have cgroup events. This avoids growing multiple versions
of the perf_event_pmu_context iteration.

Measured performance (on a slightly different patch):

Before)

  $ taskset -c 0 ./perf bench sched pipe -l 10000 -G AAA,BBB
  # Running 'sched/pipe' benchmark:
  # Executed 10000 pipe operations between two processes

       Total time: 0.901 [sec]

        90.128700 usecs/op
            11095 ops/sec

After)

  $ taskset -c 0 ./perf bench sched pipe -l 10000 -G AAA,BBB
  # Running 'sched/pipe' benchmark:
  # Executed 10000 pipe operations between two processes

       Total time: 0.065 [sec]

         6.560100 usecs/op
           152436 ops/sec

Fixes: bd2756811766 ("perf: Rewrite core context handling")
Reported-by: Namhyung Kim <namhyung@kernel.org>
Debugged-by: Namhyung Kim <namhyung@kernel.org>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lkml.kernel.org/r/20231009210425.GC6307@noisy.programming.kicks-ass.net
---
 include/linux/perf_event.h |   1 +
 kernel/events/core.c       | 115 +++++++++++++++++++++++----------------------
 2 files changed, 61 insertions(+), 55 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index f31f962a6445..0367d748fae0 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -878,6 +878,7 @@ struct perf_event_pmu_context {
 	unsigned int			embedded : 1;
 
 	unsigned int			nr_events;
+	unsigned int			nr_cgroups;
 
 	atomic_t			refcount; /* event <-> epc */
 	struct rcu_head			rcu_head;
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 708d474c2ede..3eb26c2c6e65 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -375,6 +375,7 @@ enum event_type_t {
 	EVENT_TIME = 0x4,
 	/* see ctx_resched() for details */
 	EVENT_CPU = 0x8,
+	EVENT_CGROUP = 0x10,
 	EVENT_ALL = EVENT_FLEXIBLE | EVENT_PINNED,
 };
 
@@ -684,20 +685,26 @@ do {									\
 	___p;								\
 })
 
-static void perf_ctx_disable(struct perf_event_context *ctx)
+static void perf_ctx_disable(struct perf_event_context *ctx, bool cgroup)
 {
 	struct perf_event_pmu_context *pmu_ctx;
 
-	list_for_each_entry(pmu_ctx, &ctx->pmu_ctx_list, pmu_ctx_entry)
+	list_for_each_entry(pmu_ctx, &ctx->pmu_ctx_list, pmu_ctx_entry) {
+		if (cgroup && !pmu_ctx->nr_cgroups)
+			continue;
 		perf_pmu_disable(pmu_ctx->pmu);
+	}
 }
 
-static void perf_ctx_enable(struct perf_event_context *ctx)
+static void perf_ctx_enable(struct perf_event_context *ctx, bool cgroup)
 {
 	struct perf_event_pmu_context *pmu_ctx;
 
-	list_for_each_entry(pmu_ctx, &ctx->pmu_ctx_list, pmu_ctx_entry)
+	list_for_each_entry(pmu_ctx, &ctx->pmu_ctx_list, pmu_ctx_entry) {
+		if (cgroup && !pmu_ctx->nr_cgroups)
+			continue;
 		perf_pmu_enable(pmu_ctx->pmu);
+	}
 }
 
 static void ctx_sched_out(struct perf_event_context *ctx, enum event_type_t event_type);
@@ -856,9 +863,9 @@ static void perf_cgroup_switch(struct task_struct *task)
 		return;
 
 	perf_ctx_lock(cpuctx, cpuctx->task_ctx);
-	perf_ctx_disable(&cpuctx->ctx);
+	perf_ctx_disable(&cpuctx->ctx, true);
 
-	ctx_sched_out(&cpuctx->ctx, EVENT_ALL);
+	ctx_sched_out(&cpuctx->ctx, EVENT_ALL|EVENT_CGROUP);
 	/*
 	 * must not be done before ctxswout due
 	 * to update_cgrp_time_from_cpuctx() in
@@ -870,9 +877,9 @@ static void perf_cgroup_switch(struct task_struct *task)
 	 * perf_cgroup_set_timestamp() in ctx_sched_in()
 	 * to not have to pass task around
 	 */
-	ctx_sched_in(&cpuctx->ctx, EVENT_ALL);
+	ctx_sched_in(&cpuctx->ctx, EVENT_ALL|EVENT_CGROUP);
 
-	perf_ctx_enable(&cpuctx->ctx);
+	perf_ctx_enable(&cpuctx->ctx, true);
 	perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
 }
 
@@ -965,6 +972,8 @@ perf_cgroup_event_enable(struct perf_event *event, struct perf_event_context *ct
 	if (!is_cgroup_event(event))
 		return;
 
+	event->pmu_ctx->nr_cgroups++;
+
 	/*
 	 * Because cgroup events are always per-cpu events,
 	 * @ctx == &cpuctx->ctx.
@@ -985,6 +994,8 @@ perf_cgroup_event_disable(struct perf_event *event, struct perf_event_context *c
 	if (!is_cgroup_event(event))
 		return;
 
+	event->pmu_ctx->nr_cgroups--;
+
 	/*
 	 * Because cgroup events are always per-cpu events,
 	 * @ctx == &cpuctx->ctx.
@@ -2677,9 +2688,9 @@ static void ctx_resched(struct perf_cpu_context *cpuctx,
 
 	event_type &= EVENT_ALL;
 
-	perf_ctx_disable(&cpuctx->ctx);
+	perf_ctx_disable(&cpuctx->ctx, false);
 	if (task_ctx) {
-		perf_ctx_disable(task_ctx);
+		perf_ctx_disable(task_ctx, false);
 		task_ctx_sched_out(task_ctx, event_type);
 	}
 
@@ -2697,9 +2708,9 @@ static void ctx_resched(struct perf_cpu_context *cpuctx,
 
 	perf_event_sched_in(cpuctx, task_ctx);
 
-	perf_ctx_enable(&cpuctx->ctx);
+	perf_ctx_enable(&cpuctx->ctx, false);
 	if (task_ctx)
-		perf_ctx_enable(task_ctx);
+		perf_ctx_enable(task_ctx, false);
 }
 
 void perf_pmu_resched(struct pmu *pmu)
@@ -3244,6 +3255,9 @@ ctx_sched_out(struct perf_event_context *ctx, enum event_type_t event_type)
 	struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);
 	struct perf_event_pmu_context *pmu_ctx;
 	int is_active = ctx->is_active;
+	bool cgroup = event_type & EVENT_CGROUP;
+
+	event_type &= ~EVENT_CGROUP;
 
 	lockdep_assert_held(&ctx->lock);
 
@@ -3290,8 +3304,11 @@ ctx_sched_out(struct perf_event_context *ctx, enum event_type_t event_type)
 
 	is_active ^= ctx->is_active; /* changed bits */
 
-	list_for_each_entry(pmu_ctx, &ctx->pmu_ctx_list, pmu_ctx_entry)
+	list_for_each_entry(pmu_ctx, &ctx->pmu_ctx_list, pmu_ctx_entry) {
+		if (cgroup && !pmu_ctx->nr_cgroups)
+			continue;
 		__pmu_ctx_sched_out(pmu_ctx, is_active);
+	}
 }
 
 /*
@@ -3482,7 +3499,7 @@ perf_event_context_sched_out(struct task_struct *task, struct task_struct *next)
 		raw_spin_lock_nested(&next_ctx->lock, SINGLE_DEPTH_NESTING);
 		if (context_equiv(ctx, next_ctx)) {
 
-			perf_ctx_disable(ctx);
+			perf_ctx_disable(ctx, false);
 
 			/* PMIs are disabled; ctx->nr_pending is stable. */
 			if (local_read(&ctx->nr_pending) ||
@@ -3502,7 +3519,7 @@ perf_event_context_sched_out(struct task_struct *task, struct task_struct *next)
 			perf_ctx_sched_task_cb(ctx, false);
 			perf_event_swap_task_ctx_data(ctx, next_ctx);
 
-			perf_ctx_enable(ctx);
+			perf_ctx_enable(ctx, false);
 
 			/*
 			 * RCU_INIT_POINTER here is safe because we've not
@@ -3526,13 +3543,13 @@ unlock:
 
 	if (do_switch) {
 		raw_spin_lock(&ctx->lock);
-		perf_ctx_disable(ctx);
+		perf_ctx_disable(ctx, false);
 
 inside_switch:
 		perf_ctx_sched_task_cb(ctx, false);
 		task_ctx_sched_out(ctx, EVENT_ALL);
 
-		perf_ctx_enable(ctx);
+		perf_ctx_enable(ctx, false);
 		raw_spin_unlock(&ctx->lock);
 	}
 }
@@ -3818,47 +3835,32 @@ static int merge_sched_in(struct perf_event *event, void *data)
 	return 0;
 }
 
-static void ctx_pinned_sched_in(struct perf_event_context *ctx, struct pmu *pmu)
+static void pmu_groups_sched_in(struct perf_event_context *ctx,
+				struct perf_event_groups *groups,
+				struct pmu *pmu)
 {
-	struct perf_event_pmu_context *pmu_ctx;
 	int can_add_hw = 1;
-
-	if (pmu) {
-		visit_groups_merge(ctx, &ctx->pinned_groups,
-				   smp_processor_id(), pmu,
-				   merge_sched_in, &can_add_hw);
-	} else {
-		list_for_each_entry(pmu_ctx, &ctx->pmu_ctx_list, pmu_ctx_entry) {
-			can_add_hw = 1;
-			visit_groups_merge(ctx, &ctx->pinned_groups,
-					   smp_processor_id(), pmu_ctx->pmu,
-					   merge_sched_in, &can_add_hw);
-		}
-	}
+	visit_groups_merge(ctx, groups, smp_processor_id(), pmu,
+			   merge_sched_in, &can_add_hw);
 }
 
-static void ctx_flexible_sched_in(struct perf_event_context *ctx, struct pmu *pmu)
+static void ctx_groups_sched_in(struct perf_event_context *ctx,
+				struct perf_event_groups *groups,
+				bool cgroup)
 {
 	struct perf_event_pmu_context *pmu_ctx;
-	int can_add_hw = 1;
 
-	if (pmu) {
-		visit_groups_merge(ctx, &ctx->flexible_groups,
-				   smp_processor_id(), pmu,
-				   merge_sched_in, &can_add_hw);
-	} else {
-		list_for_each_entry(pmu_ctx, &ctx->pmu_ctx_list, pmu_ctx_entry) {
-			can_add_hw = 1;
-			visit_groups_merge(ctx, &ctx->flexible_groups,
-					   smp_processor_id(), pmu_ctx->pmu,
-					   merge_sched_in, &can_add_hw);
-		}
+	list_for_each_entry(pmu_ctx, &ctx->pmu_ctx_list, pmu_ctx_entry) {
+		if (cgroup && !pmu_ctx->nr_cgroups)
+			continue;
+		pmu_groups_sched_in(ctx, groups, pmu_ctx->pmu);
 	}
 }
 
-static void __pmu_ctx_sched_in(struct perf_event_context *ctx, struct pmu *pmu)
+static void __pmu_ctx_sched_in(struct perf_event_context *ctx,
+			       struct pmu *pmu)
 {
-	ctx_flexible_sched_in(ctx, pmu);
+	pmu_groups_sched_in(ctx, &ctx->flexible_groups, pmu);
 }
 
 static void
@@ -3866,6 +3868,9 @@ ctx_sched_in(struct perf_event_context *ctx, enum event_type_t event_type)
 {
 	struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);
 	int is_active = ctx->is_active;
+	bool cgroup = event_type & EVENT_CGROUP;
+
+	event_type &= ~EVENT_CGROUP;
 
 	lockdep_assert_held(&ctx->lock);
 
@@ -3898,11 +3903,11 @@ ctx_sched_in(struct perf_event_context *ctx, enum event_type_t event_type)
 	 * in order to give them the best chance of going on.
 	 */
 	if (is_active & EVENT_PINNED)
-		ctx_pinned_sched_in(ctx, NULL);
+		ctx_groups_sched_in(ctx, &ctx->pinned_groups, cgroup);
 
 	/* Then walk through the lower prio flexible groups */
 	if (is_active & EVENT_FLEXIBLE)
-		ctx_flexible_sched_in(ctx, NULL);
+		ctx_groups_sched_in(ctx, &ctx->flexible_groups, cgroup);
 }
 
 static void perf_event_context_sched_in(struct task_struct *task)
@@ -3917,11 +3922,11 @@ static void perf_event_context_sched_in(struct task_struct *task)
 
 	if (cpuctx->task_ctx == ctx) {
 		perf_ctx_lock(cpuctx, ctx);
-		perf_ctx_disable(ctx);
+		perf_ctx_disable(ctx, false);
 
 		perf_ctx_sched_task_cb(ctx, true);
 
-		perf_ctx_enable(ctx);
+		perf_ctx_enable(ctx, false);
 		perf_ctx_unlock(cpuctx, ctx);
 		goto rcu_unlock;
 	}
@@ -3934,7 +3939,7 @@ static void perf_event_context_sched_in(struct task_struct *task)
 	if (!ctx->nr_events)
 		goto unlock;
 
-	perf_ctx_disable(ctx);
+	perf_ctx_disable(ctx, false);
 	/*
 	 * We want to keep the following priority order:
 	 * cpu pinned (that don't need to move), task pinned,
@@ -3944,7 +3949,7 @@ static void perf_event_context_sched_in(struct task_struct *task)
 	 * events, no need to flip the cpuctx's events around.
 	 */
 	if (!RB_EMPTY_ROOT(&ctx->pinned_groups.tree)) {
-		perf_ctx_disable(&cpuctx->ctx);
+		perf_ctx_disable(&cpuctx->ctx, false);
 		ctx_sched_out(&cpuctx->ctx, EVENT_FLEXIBLE);
 	}
 
@@ -3953,9 +3958,9 @@ static void perf_event_context_sched_in(struct task_struct *task)
 	perf_ctx_sched_task_cb(cpuctx->task_ctx, true);
 
 	if (!RB_EMPTY_ROOT(&ctx->pinned_groups.tree))
-		perf_ctx_enable(&cpuctx->ctx);
+		perf_ctx_enable(&cpuctx->ctx, false);
 
-	perf_ctx_enable(ctx);
+	perf_ctx_enable(ctx, false);
 
 unlock:
 	perf_ctx_unlock(cpuctx, ctx);
-- 
cgit v1.2.3


From f995443f01b4dbcce723539b99050ce69b319e58 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@redhat.com>
Date: Thu, 12 Oct 2023 16:31:58 +0200
Subject: locking/seqlock: Simplify SEQCOUNT_LOCKNAME()

1. Kill the "lockmember" argument. It is always s->lock plus
   __seqprop_##lockname##_sequence() already uses s->lock and
   ignores "lockmember".

2. Kill the "lock_acquire" argument. __seqprop_##lockname##_sequence()
   can use the same "lockbase" prefix for _lock and _unlock.

Apart from line numbers, gcc -E outputs the same code.

Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Waiman Long <longman@redhat.com>
Cc: Will Deacon <will.deacon@arm.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Paul E. McKenney <paulmck@kernel.org>
Link: https://lore.kernel.org/r/20231012143158.GA16133@redhat.com
---
 include/linux/seqlock.h | 18 ++++++++----------
 1 file changed, 8 insertions(+), 10 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/seqlock.h b/include/linux/seqlock.h
index af518e4d0c6b..7e7109dbc3f5 100644
--- a/include/linux/seqlock.h
+++ b/include/linux/seqlock.h
@@ -191,11 +191,9 @@ static inline void seqcount_lockdep_reader_access(const seqcount_t *s)
  * @lockname:		"LOCKNAME" part of seqcount_LOCKNAME_t
  * @locktype:		LOCKNAME canonical C data type
  * @preemptible:	preemptibility of above locktype
- * @lockmember:		argument for lockdep_assert_held()
- * @lockbase:		associated lock release function (prefix only)
- * @lock_acquire:	associated lock acquisition function (full call)
+ * @lockbase:		prefix for associated lock/unlock
  */
-#define SEQCOUNT_LOCKNAME(lockname, locktype, preemptible, lockmember, lockbase, lock_acquire) \
+#define SEQCOUNT_LOCKNAME(lockname, locktype, preemptible, lockbase)	\
 typedef struct seqcount_##lockname {					\
 	seqcount_t		seqcount;				\
 	__SEQ_LOCK(locktype	*lock);					\
@@ -216,7 +214,7 @@ __seqprop_##lockname##_sequence(const seqcount_##lockname##_t *s)	\
 		return seq;						\
 									\
 	if (preemptible && unlikely(seq & 1)) {				\
-		__SEQ_LOCK(lock_acquire);				\
+		__SEQ_LOCK(lockbase##_lock(s->lock));			\
 		__SEQ_LOCK(lockbase##_unlock(s->lock));			\
 									\
 		/*							\
@@ -242,7 +240,7 @@ __seqprop_##lockname##_preemptible(const seqcount_##lockname##_t *s)	\
 static __always_inline void						\
 __seqprop_##lockname##_assert(const seqcount_##lockname##_t *s)		\
 {									\
-	__SEQ_LOCK(lockdep_assert_held(lockmember));			\
+	__SEQ_LOCK(lockdep_assert_held(s->lock));			\
 }
 
 /*
@@ -271,10 +269,10 @@ static inline void __seqprop_assert(const seqcount_t *s)
 
 #define __SEQ_RT	IS_ENABLED(CONFIG_PREEMPT_RT)
 
-SEQCOUNT_LOCKNAME(raw_spinlock, raw_spinlock_t,  false,    s->lock,        raw_spin, raw_spin_lock(s->lock))
-SEQCOUNT_LOCKNAME(spinlock,     spinlock_t,      __SEQ_RT, s->lock,        spin,     spin_lock(s->lock))
-SEQCOUNT_LOCKNAME(rwlock,       rwlock_t,        __SEQ_RT, s->lock,        read,     read_lock(s->lock))
-SEQCOUNT_LOCKNAME(mutex,        struct mutex,    true,     s->lock,        mutex,    mutex_lock(s->lock))
+SEQCOUNT_LOCKNAME(raw_spinlock, raw_spinlock_t,  false,    raw_spin)
+SEQCOUNT_LOCKNAME(spinlock,     spinlock_t,      __SEQ_RT, spin)
+SEQCOUNT_LOCKNAME(rwlock,       rwlock_t,        __SEQ_RT, read)
+SEQCOUNT_LOCKNAME(mutex,        struct mutex,    true,     mutex)
 
 /*
  * SEQCNT_LOCKNAME_ZERO - static initializer for seqcount_LOCKNAME_t
-- 
cgit v1.2.3


From e6115c6f7a0ce3388cc60b69a284facf78b5dbfd Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@redhat.com>
Date: Thu, 12 Oct 2023 16:32:27 +0200
Subject: locking/seqlock: Change __seqprop() to return the function pointer

This simplifies the macro and makes it easy to add the new seqprop's
with 2 or more args.

Plus this way we do not lose the type info, the (void*) type cast is
no longer needed.

And the latter reveals the problem: a lot of seqcount_t helpers pass
the "const seqcount_t *s" argument to __seqprop_ptr(seqcount_t *s)
but (before this patch) "(void *)(s)" masked the problem.

So this patch changes __seqprop_ptr() and __seqprop_##lockname##_ptr()
to accept the "const LOCKNAME *s" argument. This is not nice either,
they need to drop the constness on return because these helpers are used
by both the readers and writers, but at least it is clear what's going on.

Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Waiman Long <longman@redhat.com>
Cc: Will Deacon <will.deacon@arm.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Paul E. McKenney <paulmck@kernel.org>
Link: https://lore.kernel.org/r/20231012143227.GA16143@redhat.com
---
 include/linux/seqlock.h | 20 ++++++++++----------
 1 file changed, 10 insertions(+), 10 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/seqlock.h b/include/linux/seqlock.h
index 7e7109dbc3f5..4b8dcd3a0d93 100644
--- a/include/linux/seqlock.h
+++ b/include/linux/seqlock.h
@@ -200,9 +200,9 @@ typedef struct seqcount_##lockname {					\
 } seqcount_##lockname##_t;						\
 									\
 static __always_inline seqcount_t *					\
-__seqprop_##lockname##_ptr(seqcount_##lockname##_t *s)			\
+__seqprop_##lockname##_ptr(const seqcount_##lockname##_t *s)		\
 {									\
-	return &s->seqcount;						\
+	return (void *)&s->seqcount; /* drop const */			\
 }									\
 									\
 static __always_inline unsigned						\
@@ -247,9 +247,9 @@ __seqprop_##lockname##_assert(const seqcount_##lockname##_t *s)		\
  * __seqprop() for seqcount_t
  */
 
-static inline seqcount_t *__seqprop_ptr(seqcount_t *s)
+static inline seqcount_t *__seqprop_ptr(const seqcount_t *s)
 {
-	return s;
+	return (void *)s; /* drop const */
 }
 
 static inline unsigned __seqprop_sequence(const seqcount_t *s)
@@ -292,19 +292,19 @@ SEQCOUNT_LOCKNAME(mutex,        struct mutex,    true,     mutex)
 #define SEQCNT_WW_MUTEX_ZERO(name, lock) 	SEQCOUNT_LOCKNAME_ZERO(name, lock)
 
 #define __seqprop_case(s, lockname, prop)				\
-	seqcount_##lockname##_t: __seqprop_##lockname##_##prop((void *)(s))
+	seqcount_##lockname##_t: __seqprop_##lockname##_##prop
 
 #define __seqprop(s, prop) _Generic(*(s),				\
-	seqcount_t:		__seqprop_##prop((void *)(s)),		\
+	seqcount_t:		__seqprop_##prop,			\
 	__seqprop_case((s),	raw_spinlock,	prop),			\
 	__seqprop_case((s),	spinlock,	prop),			\
 	__seqprop_case((s),	rwlock,		prop),			\
 	__seqprop_case((s),	mutex,		prop))
 
-#define seqprop_ptr(s)			__seqprop(s, ptr)
-#define seqprop_sequence(s)		__seqprop(s, sequence)
-#define seqprop_preemptible(s)		__seqprop(s, preemptible)
-#define seqprop_assert(s)		__seqprop(s, assert)
+#define seqprop_ptr(s)			__seqprop(s, ptr)(s)
+#define seqprop_sequence(s)		__seqprop(s, sequence)(s)
+#define seqprop_preemptible(s)		__seqprop(s, preemptible)(s)
+#define seqprop_assert(s)		__seqprop(s, assert)(s)
 
 /**
  * __read_seqcount_begin() - begin a seqcount_t read section w/o barrier
-- 
cgit v1.2.3


From 7cbabed16464ad980c8d80b601559b36e1bc0b0c Mon Sep 17 00:00:00 2001
From: Mika Westerberg <mika.westerberg@linux.intel.com>
Date: Mon, 31 Jul 2023 05:25:36 +0300
Subject: thunderbolt: Fix typo in enum tb_link_width kernel-doc

Typo trasmitters -> transmitters.

Signed-off-by: Mika Westerberg <mika.westerberg@linux.intel.com>
Signed-off-by: Gil Fine <gil.fine@linux.intel.com>
---
 include/linux/thunderbolt.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/thunderbolt.h b/include/linux/thunderbolt.h
index 02333f47c994..6151c210d987 100644
--- a/include/linux/thunderbolt.h
+++ b/include/linux/thunderbolt.h
@@ -175,7 +175,7 @@ void tb_unregister_property_dir(const char *key, struct tb_property_dir *dir);
  * enum tb_link_width - Thunderbolt/USB4 link width
  * @TB_LINK_WIDTH_SINGLE: Single lane link
  * @TB_LINK_WIDTH_DUAL: Dual lane symmetric link
- * @TB_LINK_WIDTH_ASYM_TX: Dual lane asymmetric Gen 4 link with 3 trasmitters
+ * @TB_LINK_WIDTH_ASYM_TX: Dual lane asymmetric Gen 4 link with 3 transmitters
  * @TB_LINK_WIDTH_ASYM_RX: Dual lane asymmetric Gen 4 link with 3 receivers
  */
 enum tb_link_width {
-- 
cgit v1.2.3


From 1559d14977b694570f010854b8192e6de034bc27 Mon Sep 17 00:00:00 2001
From: Bartosz Golaszewski <bartosz.golaszewski@linaro.org>
Date: Wed, 11 Oct 2023 15:02:02 +0200
Subject: gpiolib: provide gpio_device_to_device()

There are users in the kernel who need to retrieve the address of the
struct device backing the GPIO device. Currently they needlessly poke in
the internals of GPIOLIB. Add a dedicated getter function.

Signed-off-by: Bartosz Golaszewski <bartosz.golaszewski@linaro.org>
Reviewed-by: Peter Rosin <peda@axentia.se>
Reviewed-by: Andy Shevchenko <andy.shevchenko@gmail.com>
Acked-by: Linus Walleij <linus.walleij@linaro.org>
---
 drivers/gpio/gpiolib.c      | 17 +++++++++++++++++
 include/linux/gpio/driver.h |  2 ++
 2 files changed, 19 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/gpio/gpiolib.c b/drivers/gpio/gpiolib.c
index dd18dd56c33c..7e297ae35f8c 100644
--- a/drivers/gpio/gpiolib.c
+++ b/drivers/gpio/gpiolib.c
@@ -1189,6 +1189,23 @@ void gpio_device_put(struct gpio_device *gdev)
 }
 EXPORT_SYMBOL_GPL(gpio_device_put);
 
+/**
+ * gpio_device_to_device() - Retrieve the address of the underlying struct
+ *                           device.
+ * @gdev: GPIO device for which to return the address.
+ *
+ * This does not increase the reference count of the GPIO device nor the
+ * underlying struct device.
+ *
+ * Returns:
+ * Address of struct device backing this GPIO device.
+ */
+struct device *gpio_device_to_device(struct gpio_device *gdev)
+{
+	return &gdev->dev;
+}
+EXPORT_SYMBOL_GPL(gpio_device_to_device);
+
 #ifdef CONFIG_GPIOLIB_IRQCHIP
 
 /*
diff --git a/include/linux/gpio/driver.h b/include/linux/gpio/driver.h
index ae4162d3f1d3..0e40098aa283 100644
--- a/include/linux/gpio/driver.h
+++ b/include/linux/gpio/driver.h
@@ -619,6 +619,8 @@ void gpio_device_put(struct gpio_device *gdev);
 DEFINE_FREE(gpio_device_put, struct gpio_device *,
 	    if (IS_ERR_OR_NULL(_T)) gpio_device_put(_T));
 
+struct device *gpio_device_to_device(struct gpio_device *gdev);
+
 bool gpiochip_line_is_irq(struct gpio_chip *gc, unsigned int offset);
 int gpiochip_reqres_irq(struct gpio_chip *gc, unsigned int offset);
 void gpiochip_relres_irq(struct gpio_chip *gc, unsigned int offset);
-- 
cgit v1.2.3


From 370232d096e3fe188e4596f77bc6560636bd40c1 Mon Sep 17 00:00:00 2001
From: Bartosz Golaszewski <bartosz.golaszewski@linaro.org>
Date: Wed, 11 Oct 2023 15:02:03 +0200
Subject: gpiolib: provide gpiod_to_gpio_device()

Accessing struct gpio_chip backing a GPIO device is only allowed for the
actual providers of that chip.

Similarly to how we introduced gpio_device_find() in order to replace
the abused gpiochip_find(), let's introduce a counterpart to
gpiod_to_chip() that returns a reference to the GPIO device owning the
descriptor. This is done in order to later remove gpiod_to_chip()
entirely.

Signed-off-by: Bartosz Golaszewski <bartosz.golaszewski@linaro.org>
Reviewed-by: Peter Rosin <peda@axentia.se>
Acked-by: Linus Walleij <linus.walleij@linaro.org>
---
 drivers/gpio/gpiolib.c      | 21 +++++++++++++++++++++
 include/linux/gpio/driver.h |  1 +
 2 files changed, 22 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/gpio/gpiolib.c b/drivers/gpio/gpiolib.c
index 7e297ae35f8c..9febaef6767d 100644
--- a/drivers/gpio/gpiolib.c
+++ b/drivers/gpio/gpiolib.c
@@ -220,6 +220,27 @@ struct gpio_chip *gpiod_to_chip(const struct gpio_desc *desc)
 }
 EXPORT_SYMBOL_GPL(gpiod_to_chip);
 
+/**
+ * gpiod_to_gpio_device() - Return the GPIO device to which this descriptor
+ *                          belongs.
+ * @desc: Descriptor for which to return the GPIO device.
+ *
+ * This *DOES NOT* increase the reference count of the GPIO device as it's
+ * expected that the descriptor is requested and the users already holds a
+ * reference to the device.
+ *
+ * Returns:
+ * Address of the GPIO device owning this descriptor.
+ */
+struct gpio_device *gpiod_to_gpio_device(struct gpio_desc *desc)
+{
+	if (!desc)
+		return NULL;
+
+	return desc->gdev;
+}
+EXPORT_SYMBOL_GPL(gpiod_to_gpio_device);
+
 /**
  * gpio_device_get_chip() - Get the gpio_chip implementation of this GPIO device
  * @gdev: GPIO device
diff --git a/include/linux/gpio/driver.h b/include/linux/gpio/driver.h
index 0e40098aa283..d231c4f31cb4 100644
--- a/include/linux/gpio/driver.h
+++ b/include/linux/gpio/driver.h
@@ -785,6 +785,7 @@ int gpiochip_lock_as_irq(struct gpio_chip *gc, unsigned int offset);
 void gpiochip_unlock_as_irq(struct gpio_chip *gc, unsigned int offset);
 
 struct gpio_chip *gpiod_to_chip(const struct gpio_desc *desc);
+struct gpio_device *gpiod_to_gpio_device(struct gpio_desc *desc);
 
 #else /* CONFIG_GPIOLIB */
 
-- 
cgit v1.2.3


From 8c85a102fc4e5c0c942c10677fa43f7a19baa92f Mon Sep 17 00:00:00 2001
From: Bartosz Golaszewski <bartosz.golaszewski@linaro.org>
Date: Tue, 5 Sep 2023 20:52:55 +0200
Subject: gpiolib: provide gpio_device_get_base()

Let's start adding getters for the opaque struct gpio_device. Start with
a function allowing to retrieve the base GPIO number.

Signed-off-by: Bartosz Golaszewski <bartosz.golaszewski@linaro.org>
Acked-by: Linus Walleij <linus.walleij@linaro.org>
---
 drivers/gpio/gpiolib.c      | 13 +++++++++++++
 include/linux/gpio/driver.h |  3 +++
 2 files changed, 16 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/gpio/gpiolib.c b/drivers/gpio/gpiolib.c
index 9febaef6767d..cbafcd95243e 100644
--- a/drivers/gpio/gpiolib.c
+++ b/drivers/gpio/gpiolib.c
@@ -241,6 +241,19 @@ struct gpio_device *gpiod_to_gpio_device(struct gpio_desc *desc)
 }
 EXPORT_SYMBOL_GPL(gpiod_to_gpio_device);
 
+/**
+ * gpio_device_get_base() - Get the base GPIO number allocated by this device
+ * @gdev: GPIO device
+ *
+ * Returns:
+ * First GPIO number in the global GPIO numberspace for this device.
+ */
+int gpio_device_get_base(struct gpio_device *gdev)
+{
+	return gdev->base;
+}
+EXPORT_SYMBOL_GPL(gpio_device_get_base);
+
 /**
  * gpio_device_get_chip() - Get the gpio_chip implementation of this GPIO device
  * @gdev: GPIO device
diff --git a/include/linux/gpio/driver.h b/include/linux/gpio/driver.h
index d231c4f31cb4..1d454dc944b3 100644
--- a/include/linux/gpio/driver.h
+++ b/include/linux/gpio/driver.h
@@ -787,6 +787,9 @@ void gpiochip_unlock_as_irq(struct gpio_chip *gc, unsigned int offset);
 struct gpio_chip *gpiod_to_chip(const struct gpio_desc *desc);
 struct gpio_device *gpiod_to_gpio_device(struct gpio_desc *desc);
 
+/* struct gpio_device getters */
+int gpio_device_get_base(struct gpio_device *gdev);
+
 #else /* CONFIG_GPIOLIB */
 
 #include <linux/err.h>
-- 
cgit v1.2.3


From 384461abcab6602abc06c2dfb8fb99beeeaa12b0 Mon Sep 17 00:00:00 2001
From: Uwe Kleine-König <u.kleine-koenig@pengutronix.de>
Date: Fri, 4 Aug 2023 16:27:06 +0200
Subject: pwm: Manage owner assignment implicitly for drivers
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Instead of requiring each driver to care for assigning the owner member
of struct pwm_ops, handle that implicitly using a macro. Note that the
owner member has to be moved to struct pwm_chip, as the ops structure
usually lives in read-only memory and so cannot be modified.

The upside is that new low level drivers cannot forget the assignment and
save one line each. The pwm-crc driver didn't assign .owner, that's not
a problem in practice though as the driver cannot be compiled as a
module.

Acked-by: Andy Shevchenko <andy.shevchenko@gmail.com> # Intel LPSS
Reviewed-by: Florian Fainelli <florian.fainelli@broadcom.com> # pwm-{bcm,brcm}*.c
Acked-by: Jernej Skrabec <jernej.skrabec@gmail.com> # sun4i
Acked-by: Andi Shyti <andi.shyti@kernel.org>
Acked-by: Nobuhiro Iwamatsu <nobuhiro1.iwamatsu@toshiba.co.jp> # pwm-visconti
Acked-by: Heiko Stuebner <heiko@sntech.de> # pwm-rockchip
Acked-by: Michael Walle <michael@walle.cc> # pwm-sl28cpld
Acked-by: Neil Armstrong <neil.armstrong@linaro.org> # pwm-meson
Reviewed-by: Linus Walleij <linus.walleij@linaro.org>
Link: https://lore.kernel.org/r/20230804142707.412137-2-u.kleine-koenig@pengutronix.de
Signed-off-by: Uwe Kleine-König <u.kleine-koenig@pengutronix.de>
Signed-off-by: Thierry Reding <thierry.reding@gmail.com>
---
 drivers/gpio/gpio-mvebu.c             |  1 -
 drivers/gpu/drm/bridge/ti-sn65dsi86.c |  1 -
 drivers/leds/rgb/leds-qcom-lpg.c      |  1 -
 drivers/pwm/core.c                    | 24 ++++++++++++++----------
 drivers/pwm/pwm-ab8500.c              |  1 -
 drivers/pwm/pwm-apple.c               |  1 -
 drivers/pwm/pwm-atmel-hlcdc.c         |  1 -
 drivers/pwm/pwm-atmel-tcb.c           |  1 -
 drivers/pwm/pwm-atmel.c               |  1 -
 drivers/pwm/pwm-bcm-iproc.c           |  1 -
 drivers/pwm/pwm-bcm-kona.c            |  1 -
 drivers/pwm/pwm-bcm2835.c             |  1 -
 drivers/pwm/pwm-berlin.c              |  1 -
 drivers/pwm/pwm-brcmstb.c             |  1 -
 drivers/pwm/pwm-clk.c                 |  1 -
 drivers/pwm/pwm-clps711x.c            |  1 -
 drivers/pwm/pwm-cros-ec.c             |  1 -
 drivers/pwm/pwm-dwc.c                 |  1 -
 drivers/pwm/pwm-ep93xx.c              |  1 -
 drivers/pwm/pwm-fsl-ftm.c             |  1 -
 drivers/pwm/pwm-hibvt.c               |  1 -
 drivers/pwm/pwm-img.c                 |  1 -
 drivers/pwm/pwm-imx-tpm.c             |  1 -
 drivers/pwm/pwm-imx1.c                |  1 -
 drivers/pwm/pwm-imx27.c               |  1 -
 drivers/pwm/pwm-intel-lgm.c           |  1 -
 drivers/pwm/pwm-iqs620a.c             |  1 -
 drivers/pwm/pwm-jz4740.c              |  1 -
 drivers/pwm/pwm-keembay.c             |  1 -
 drivers/pwm/pwm-lp3943.c              |  1 -
 drivers/pwm/pwm-lpc18xx-sct.c         |  1 -
 drivers/pwm/pwm-lpc32xx.c             |  1 -
 drivers/pwm/pwm-lpss.c                |  1 -
 drivers/pwm/pwm-mediatek.c            |  1 -
 drivers/pwm/pwm-meson.c               |  1 -
 drivers/pwm/pwm-microchip-core.c      |  1 -
 drivers/pwm/pwm-mtk-disp.c            |  1 -
 drivers/pwm/pwm-mxs.c                 |  1 -
 drivers/pwm/pwm-ntxec.c               |  1 -
 drivers/pwm/pwm-omap-dmtimer.c        |  1 -
 drivers/pwm/pwm-pca9685.c             |  1 -
 drivers/pwm/pwm-pxa.c                 |  1 -
 drivers/pwm/pwm-raspberrypi-poe.c     |  1 -
 drivers/pwm/pwm-rcar.c                |  1 -
 drivers/pwm/pwm-renesas-tpu.c         |  1 -
 drivers/pwm/pwm-rockchip.c            |  1 -
 drivers/pwm/pwm-rz-mtu3.c             |  1 -
 drivers/pwm/pwm-samsung.c             |  1 -
 drivers/pwm/pwm-sifive.c              |  1 -
 drivers/pwm/pwm-sl28cpld.c            |  1 -
 drivers/pwm/pwm-spear.c               |  1 -
 drivers/pwm/pwm-sprd.c                |  1 -
 drivers/pwm/pwm-sti.c                 |  1 -
 drivers/pwm/pwm-stm32-lp.c            |  1 -
 drivers/pwm/pwm-stm32.c               |  1 -
 drivers/pwm/pwm-stmpe.c               |  1 -
 drivers/pwm/pwm-sun4i.c               |  1 -
 drivers/pwm/pwm-sunplus.c             |  1 -
 drivers/pwm/pwm-tegra.c               |  1 -
 drivers/pwm/pwm-tiecap.c              |  1 -
 drivers/pwm/pwm-tiehrpwm.c            |  1 -
 drivers/pwm/pwm-twl-led.c             |  2 --
 drivers/pwm/pwm-twl.c                 |  2 --
 drivers/pwm/pwm-visconti.c            |  1 -
 drivers/pwm/pwm-vt8500.c              |  1 -
 drivers/pwm/pwm-xilinx.c              |  1 -
 drivers/staging/greybus/pwm.c         |  1 -
 include/linux/pwm.h                   | 10 ++++++----
 68 files changed, 20 insertions(+), 82 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/gpio/gpio-mvebu.c b/drivers/gpio/gpio-mvebu.c
index 67497116ce27..64acd497a341 100644
--- a/drivers/gpio/gpio-mvebu.c
+++ b/drivers/gpio/gpio-mvebu.c
@@ -756,7 +756,6 @@ static const struct pwm_ops mvebu_pwm_ops = {
 	.free = mvebu_pwm_free,
 	.get_state = mvebu_pwm_get_state,
 	.apply = mvebu_pwm_apply,
-	.owner = THIS_MODULE,
 };
 
 static void __maybe_unused mvebu_pwm_suspend(struct mvebu_gpio_chip *mvchip)
diff --git a/drivers/gpu/drm/bridge/ti-sn65dsi86.c b/drivers/gpu/drm/bridge/ti-sn65dsi86.c
index f448b903e190..08e8c4f333e3 100644
--- a/drivers/gpu/drm/bridge/ti-sn65dsi86.c
+++ b/drivers/gpu/drm/bridge/ti-sn65dsi86.c
@@ -1580,7 +1580,6 @@ static const struct pwm_ops ti_sn_pwm_ops = {
 	.free = ti_sn_pwm_free,
 	.apply = ti_sn_pwm_apply,
 	.get_state = ti_sn_pwm_get_state,
-	.owner = THIS_MODULE,
 };
 
 static int ti_sn_pwm_probe(struct auxiliary_device *adev,
diff --git a/drivers/leds/rgb/leds-qcom-lpg.c b/drivers/leds/rgb/leds-qcom-lpg.c
index df469aaa7e6e..c8525f59748c 100644
--- a/drivers/leds/rgb/leds-qcom-lpg.c
+++ b/drivers/leds/rgb/leds-qcom-lpg.c
@@ -1085,7 +1085,6 @@ static const struct pwm_ops lpg_pwm_ops = {
 	.request = lpg_pwm_request,
 	.apply = lpg_pwm_apply,
 	.get_state = lpg_pwm_get_state,
-	.owner = THIS_MODULE,
 };
 
 static int lpg_add_pwm(struct lpg *lpg)
diff --git a/drivers/pwm/core.c b/drivers/pwm/core.c
index dc66e3405bf5..a2824eb4e236 100644
--- a/drivers/pwm/core.c
+++ b/drivers/pwm/core.c
@@ -89,13 +89,13 @@ static int pwm_device_request(struct pwm_device *pwm, const char *label)
 	if (test_bit(PWMF_REQUESTED, &pwm->flags))
 		return -EBUSY;
 
-	if (!try_module_get(pwm->chip->ops->owner))
+	if (!try_module_get(pwm->chip->owner))
 		return -ENODEV;
 
 	if (pwm->chip->ops->request) {
 		err = pwm->chip->ops->request(pwm->chip, pwm);
 		if (err) {
-			module_put(pwm->chip->ops->owner);
+			module_put(pwm->chip->owner);
 			return err;
 		}
 	}
@@ -253,14 +253,16 @@ static bool pwm_ops_check(const struct pwm_chip *chip)
 }
 
 /**
- * pwmchip_add() - register a new PWM chip
+ * __pwmchip_add() - register a new PWM chip
  * @chip: the PWM chip to add
+ * @owner: reference to the module providing the chip.
  *
- * Register a new PWM chip.
+ * Register a new PWM chip. @owner is supposed to be THIS_MODULE, use the
+ * pwmchip_add wrapper to do this right.
  *
  * Returns: 0 on success or a negative error code on failure.
  */
-int pwmchip_add(struct pwm_chip *chip)
+int __pwmchip_add(struct pwm_chip *chip, struct module *owner)
 {
 	struct pwm_device *pwm;
 	unsigned int i;
@@ -272,6 +274,8 @@ int pwmchip_add(struct pwm_chip *chip)
 	if (!pwm_ops_check(chip))
 		return -EINVAL;
 
+	chip->owner = owner;
+
 	chip->pwms = kcalloc(chip->npwm, sizeof(*pwm), GFP_KERNEL);
 	if (!chip->pwms)
 		return -ENOMEM;
@@ -306,7 +310,7 @@ int pwmchip_add(struct pwm_chip *chip)
 
 	return 0;
 }
-EXPORT_SYMBOL_GPL(pwmchip_add);
+EXPORT_SYMBOL_GPL(__pwmchip_add);
 
 /**
  * pwmchip_remove() - remove a PWM chip
@@ -338,17 +342,17 @@ static void devm_pwmchip_remove(void *data)
 	pwmchip_remove(chip);
 }
 
-int devm_pwmchip_add(struct device *dev, struct pwm_chip *chip)
+int __devm_pwmchip_add(struct device *dev, struct pwm_chip *chip, struct module *owner)
 {
 	int ret;
 
-	ret = pwmchip_add(chip);
+	ret = __pwmchip_add(chip, owner);
 	if (ret)
 		return ret;
 
 	return devm_add_action_or_reset(dev, devm_pwmchip_remove, chip);
 }
-EXPORT_SYMBOL_GPL(devm_pwmchip_add);
+EXPORT_SYMBOL_GPL(__devm_pwmchip_add);
 
 /**
  * pwm_request_from_chip() - request a PWM device relative to a PWM chip
@@ -979,7 +983,7 @@ void pwm_put(struct pwm_device *pwm)
 	pwm_set_chip_data(pwm, NULL);
 	pwm->label = NULL;
 
-	module_put(pwm->chip->ops->owner);
+	module_put(pwm->chip->owner);
 out:
 	mutex_unlock(&pwm_lock);
 }
diff --git a/drivers/pwm/pwm-ab8500.c b/drivers/pwm/pwm-ab8500.c
index 583a7d69c741..670d33daea84 100644
--- a/drivers/pwm/pwm-ab8500.c
+++ b/drivers/pwm/pwm-ab8500.c
@@ -181,7 +181,6 @@ static int ab8500_pwm_get_state(struct pwm_chip *chip, struct pwm_device *pwm,
 static const struct pwm_ops ab8500_pwm_ops = {
 	.apply = ab8500_pwm_apply,
 	.get_state = ab8500_pwm_get_state,
-	.owner = THIS_MODULE,
 };
 
 static int ab8500_pwm_probe(struct platform_device *pdev)
diff --git a/drivers/pwm/pwm-apple.c b/drivers/pwm/pwm-apple.c
index 8e7d67fb5fbe..4d755b628d9e 100644
--- a/drivers/pwm/pwm-apple.c
+++ b/drivers/pwm/pwm-apple.c
@@ -99,7 +99,6 @@ static int apple_pwm_get_state(struct pwm_chip *chip, struct pwm_device *pwm,
 static const struct pwm_ops apple_pwm_ops = {
 	.apply = apple_pwm_apply,
 	.get_state = apple_pwm_get_state,
-	.owner = THIS_MODULE,
 };
 
 static int apple_pwm_probe(struct platform_device *pdev)
diff --git a/drivers/pwm/pwm-atmel-hlcdc.c b/drivers/pwm/pwm-atmel-hlcdc.c
index e271d920151e..07920e034757 100644
--- a/drivers/pwm/pwm-atmel-hlcdc.c
+++ b/drivers/pwm/pwm-atmel-hlcdc.c
@@ -170,7 +170,6 @@ static int atmel_hlcdc_pwm_apply(struct pwm_chip *chip, struct pwm_device *pwm,
 
 static const struct pwm_ops atmel_hlcdc_pwm_ops = {
 	.apply = atmel_hlcdc_pwm_apply,
-	.owner = THIS_MODULE,
 };
 
 static const struct atmel_hlcdc_pwm_errata atmel_hlcdc_pwm_at91sam9x5_errata = {
diff --git a/drivers/pwm/pwm-atmel-tcb.c b/drivers/pwm/pwm-atmel-tcb.c
index c00dd37c5fbd..98b33c016c3c 100644
--- a/drivers/pwm/pwm-atmel-tcb.c
+++ b/drivers/pwm/pwm-atmel-tcb.c
@@ -364,7 +364,6 @@ static const struct pwm_ops atmel_tcb_pwm_ops = {
 	.request = atmel_tcb_pwm_request,
 	.free = atmel_tcb_pwm_free,
 	.apply = atmel_tcb_pwm_apply,
-	.owner = THIS_MODULE,
 };
 
 static struct atmel_tcb_config tcb_rm9200_config = {
diff --git a/drivers/pwm/pwm-atmel.c b/drivers/pwm/pwm-atmel.c
index 35ae876bf2ea..47bcc8a3bf9d 100644
--- a/drivers/pwm/pwm-atmel.c
+++ b/drivers/pwm/pwm-atmel.c
@@ -402,7 +402,6 @@ static int atmel_pwm_get_state(struct pwm_chip *chip, struct pwm_device *pwm,
 static const struct pwm_ops atmel_pwm_ops = {
 	.apply = atmel_pwm_apply,
 	.get_state = atmel_pwm_get_state,
-	.owner = THIS_MODULE,
 };
 
 static const struct atmel_pwm_data atmel_sam9rl_pwm_data = {
diff --git a/drivers/pwm/pwm-bcm-iproc.c b/drivers/pwm/pwm-bcm-iproc.c
index 7d70b6f186a6..1da902440df2 100644
--- a/drivers/pwm/pwm-bcm-iproc.c
+++ b/drivers/pwm/pwm-bcm-iproc.c
@@ -183,7 +183,6 @@ static int iproc_pwmc_apply(struct pwm_chip *chip, struct pwm_device *pwm,
 static const struct pwm_ops iproc_pwm_ops = {
 	.apply = iproc_pwmc_apply,
 	.get_state = iproc_pwmc_get_state,
-	.owner = THIS_MODULE,
 };
 
 static int iproc_pwmc_probe(struct platform_device *pdev)
diff --git a/drivers/pwm/pwm-bcm-kona.c b/drivers/pwm/pwm-bcm-kona.c
index e5b00cc9f7a7..15d6ed03c3ce 100644
--- a/drivers/pwm/pwm-bcm-kona.c
+++ b/drivers/pwm/pwm-bcm-kona.c
@@ -269,7 +269,6 @@ static int kona_pwmc_apply(struct pwm_chip *chip, struct pwm_device *pwm,
 
 static const struct pwm_ops kona_pwm_ops = {
 	.apply = kona_pwmc_apply,
-	.owner = THIS_MODULE,
 };
 
 static int kona_pwmc_probe(struct platform_device *pdev)
diff --git a/drivers/pwm/pwm-bcm2835.c b/drivers/pwm/pwm-bcm2835.c
index bdfc2a5ec0d6..af318a35d510 100644
--- a/drivers/pwm/pwm-bcm2835.c
+++ b/drivers/pwm/pwm-bcm2835.c
@@ -129,7 +129,6 @@ static const struct pwm_ops bcm2835_pwm_ops = {
 	.request = bcm2835_pwm_request,
 	.free = bcm2835_pwm_free,
 	.apply = bcm2835_pwm_apply,
-	.owner = THIS_MODULE,
 };
 
 static int bcm2835_pwm_probe(struct platform_device *pdev)
diff --git a/drivers/pwm/pwm-berlin.c b/drivers/pwm/pwm-berlin.c
index 0971c666afd1..6a6641fc4fa1 100644
--- a/drivers/pwm/pwm-berlin.c
+++ b/drivers/pwm/pwm-berlin.c
@@ -205,7 +205,6 @@ static const struct pwm_ops berlin_pwm_ops = {
 	.request = berlin_pwm_request,
 	.free = berlin_pwm_free,
 	.apply = berlin_pwm_apply,
-	.owner = THIS_MODULE,
 };
 
 static const struct of_device_id berlin_pwm_match[] = {
diff --git a/drivers/pwm/pwm-brcmstb.c b/drivers/pwm/pwm-brcmstb.c
index a3faa9a3de7c..96967f8a8dad 100644
--- a/drivers/pwm/pwm-brcmstb.c
+++ b/drivers/pwm/pwm-brcmstb.c
@@ -220,7 +220,6 @@ static int brcmstb_pwm_apply(struct pwm_chip *chip, struct pwm_device *pwm,
 
 static const struct pwm_ops brcmstb_pwm_ops = {
 	.apply = brcmstb_pwm_apply,
-	.owner = THIS_MODULE,
 };
 
 static const struct of_device_id brcmstb_pwm_of_match[] = {
diff --git a/drivers/pwm/pwm-clk.c b/drivers/pwm/pwm-clk.c
index 0ee4d2aee4df..9dd88b386907 100644
--- a/drivers/pwm/pwm-clk.c
+++ b/drivers/pwm/pwm-clk.c
@@ -77,7 +77,6 @@ static int pwm_clk_apply(struct pwm_chip *chip, struct pwm_device *pwm,
 
 static const struct pwm_ops pwm_clk_ops = {
 	.apply = pwm_clk_apply,
-	.owner = THIS_MODULE,
 };
 
 static int pwm_clk_probe(struct platform_device *pdev)
diff --git a/drivers/pwm/pwm-clps711x.c b/drivers/pwm/pwm-clps711x.c
index b0d91142da8d..42179b3f7ec3 100644
--- a/drivers/pwm/pwm-clps711x.c
+++ b/drivers/pwm/pwm-clps711x.c
@@ -72,7 +72,6 @@ static int clps711x_pwm_apply(struct pwm_chip *chip, struct pwm_device *pwm,
 static const struct pwm_ops clps711x_pwm_ops = {
 	.request = clps711x_pwm_request,
 	.apply = clps711x_pwm_apply,
-	.owner = THIS_MODULE,
 };
 
 static struct pwm_device *clps711x_pwm_xlate(struct pwm_chip *chip,
diff --git a/drivers/pwm/pwm-cros-ec.c b/drivers/pwm/pwm-cros-ec.c
index baaac0c33aa0..500504c7d459 100644
--- a/drivers/pwm/pwm-cros-ec.c
+++ b/drivers/pwm/pwm-cros-ec.c
@@ -241,7 +241,6 @@ static const struct pwm_ops cros_ec_pwm_ops = {
 	.free = cros_ec_pwm_free,
 	.get_state	= cros_ec_pwm_get_state,
 	.apply		= cros_ec_pwm_apply,
-	.owner		= THIS_MODULE,
 };
 
 /*
diff --git a/drivers/pwm/pwm-dwc.c b/drivers/pwm/pwm-dwc.c
index 3bbb26c862c3..53fe00ccd47e 100644
--- a/drivers/pwm/pwm-dwc.c
+++ b/drivers/pwm/pwm-dwc.c
@@ -195,7 +195,6 @@ static int dwc_pwm_get_state(struct pwm_chip *chip, struct pwm_device *pwm,
 static const struct pwm_ops dwc_pwm_ops = {
 	.apply = dwc_pwm_apply,
 	.get_state = dwc_pwm_get_state,
-	.owner = THIS_MODULE,
 };
 
 static struct dwc_pwm *dwc_pwm_alloc(struct device *dev)
diff --git a/drivers/pwm/pwm-ep93xx.c b/drivers/pwm/pwm-ep93xx.c
index c45a75e65c86..51e072572a87 100644
--- a/drivers/pwm/pwm-ep93xx.c
+++ b/drivers/pwm/pwm-ep93xx.c
@@ -159,7 +159,6 @@ static const struct pwm_ops ep93xx_pwm_ops = {
 	.request = ep93xx_pwm_request,
 	.free = ep93xx_pwm_free,
 	.apply = ep93xx_pwm_apply,
-	.owner = THIS_MODULE,
 };
 
 static int ep93xx_pwm_probe(struct platform_device *pdev)
diff --git a/drivers/pwm/pwm-fsl-ftm.c b/drivers/pwm/pwm-fsl-ftm.c
index b7c6045c5d08..d1b6d1aa4773 100644
--- a/drivers/pwm/pwm-fsl-ftm.c
+++ b/drivers/pwm/pwm-fsl-ftm.c
@@ -350,7 +350,6 @@ static const struct pwm_ops fsl_pwm_ops = {
 	.request = fsl_pwm_request,
 	.free = fsl_pwm_free,
 	.apply = fsl_pwm_apply,
-	.owner = THIS_MODULE,
 };
 
 static int fsl_pwm_init(struct fsl_pwm_chip *fpc)
diff --git a/drivers/pwm/pwm-hibvt.c b/drivers/pwm/pwm-hibvt.c
index f7ba6fe9a349..c435776e2f78 100644
--- a/drivers/pwm/pwm-hibvt.c
+++ b/drivers/pwm/pwm-hibvt.c
@@ -185,7 +185,6 @@ static const struct pwm_ops hibvt_pwm_ops = {
 	.get_state = hibvt_pwm_get_state,
 	.apply = hibvt_pwm_apply,
 
-	.owner = THIS_MODULE,
 };
 
 static int hibvt_pwm_probe(struct platform_device *pdev)
diff --git a/drivers/pwm/pwm-img.c b/drivers/pwm/pwm-img.c
index 326af85888e7..116fa060e302 100644
--- a/drivers/pwm/pwm-img.c
+++ b/drivers/pwm/pwm-img.c
@@ -208,7 +208,6 @@ static int img_pwm_apply(struct pwm_chip *chip, struct pwm_device *pwm,
 
 static const struct pwm_ops img_pwm_ops = {
 	.apply = img_pwm_apply,
-	.owner = THIS_MODULE,
 };
 
 static const struct img_pwm_soc_data pistachio_pwm = {
diff --git a/drivers/pwm/pwm-imx-tpm.c b/drivers/pwm/pwm-imx-tpm.c
index 98ab65c89685..fe8bf598d388 100644
--- a/drivers/pwm/pwm-imx-tpm.c
+++ b/drivers/pwm/pwm-imx-tpm.c
@@ -332,7 +332,6 @@ static const struct pwm_ops imx_tpm_pwm_ops = {
 	.free = pwm_imx_tpm_free,
 	.get_state = pwm_imx_tpm_get_state,
 	.apply = pwm_imx_tpm_apply,
-	.owner = THIS_MODULE,
 };
 
 static int pwm_imx_tpm_probe(struct platform_device *pdev)
diff --git a/drivers/pwm/pwm-imx1.c b/drivers/pwm/pwm-imx1.c
index 0651983bed19..d175d895f22a 100644
--- a/drivers/pwm/pwm-imx1.c
+++ b/drivers/pwm/pwm-imx1.c
@@ -146,7 +146,6 @@ static int pwm_imx1_apply(struct pwm_chip *chip, struct pwm_device *pwm,
 
 static const struct pwm_ops pwm_imx1_ops = {
 	.apply = pwm_imx1_apply,
-	.owner = THIS_MODULE,
 };
 
 static const struct of_device_id pwm_imx1_dt_ids[] = {
diff --git a/drivers/pwm/pwm-imx27.c b/drivers/pwm/pwm-imx27.c
index 29a3089c534c..7d9bc43f12b0 100644
--- a/drivers/pwm/pwm-imx27.c
+++ b/drivers/pwm/pwm-imx27.c
@@ -296,7 +296,6 @@ static int pwm_imx27_apply(struct pwm_chip *chip, struct pwm_device *pwm,
 static const struct pwm_ops pwm_imx27_ops = {
 	.apply = pwm_imx27_apply,
 	.get_state = pwm_imx27_get_state,
-	.owner = THIS_MODULE,
 };
 
 static const struct of_device_id pwm_imx27_dt_ids[] = {
diff --git a/drivers/pwm/pwm-intel-lgm.c b/drivers/pwm/pwm-intel-lgm.c
index 0cd7dd548e82..54ecae7f937e 100644
--- a/drivers/pwm/pwm-intel-lgm.c
+++ b/drivers/pwm/pwm-intel-lgm.c
@@ -107,7 +107,6 @@ static int lgm_pwm_get_state(struct pwm_chip *chip, struct pwm_device *pwm,
 static const struct pwm_ops lgm_pwm_ops = {
 	.get_state = lgm_pwm_get_state,
 	.apply = lgm_pwm_apply,
-	.owner = THIS_MODULE,
 };
 
 static void lgm_pwm_init(struct lgm_pwm_chip *pc)
diff --git a/drivers/pwm/pwm-iqs620a.c b/drivers/pwm/pwm-iqs620a.c
index 47b3141135f3..378ab036edfe 100644
--- a/drivers/pwm/pwm-iqs620a.c
+++ b/drivers/pwm/pwm-iqs620a.c
@@ -166,7 +166,6 @@ static int iqs620_pwm_notifier(struct notifier_block *notifier,
 static const struct pwm_ops iqs620_pwm_ops = {
 	.apply = iqs620_pwm_apply,
 	.get_state = iqs620_pwm_get_state,
-	.owner = THIS_MODULE,
 };
 
 static void iqs620_pwm_notifier_unregister(void *context)
diff --git a/drivers/pwm/pwm-jz4740.c b/drivers/pwm/pwm-jz4740.c
index 6b2124266273..49a6e28d1d98 100644
--- a/drivers/pwm/pwm-jz4740.c
+++ b/drivers/pwm/pwm-jz4740.c
@@ -216,7 +216,6 @@ static const struct pwm_ops jz4740_pwm_ops = {
 	.request = jz4740_pwm_request,
 	.free = jz4740_pwm_free,
 	.apply = jz4740_pwm_apply,
-	.owner = THIS_MODULE,
 };
 
 static int jz4740_pwm_probe(struct platform_device *pdev)
diff --git a/drivers/pwm/pwm-keembay.c b/drivers/pwm/pwm-keembay.c
index ac02d8bb4a0b..ac824ecc3f64 100644
--- a/drivers/pwm/pwm-keembay.c
+++ b/drivers/pwm/pwm-keembay.c
@@ -178,7 +178,6 @@ static int keembay_pwm_apply(struct pwm_chip *chip, struct pwm_device *pwm,
 }
 
 static const struct pwm_ops keembay_pwm_ops = {
-	.owner = THIS_MODULE,
 	.apply = keembay_pwm_apply,
 	.get_state = keembay_pwm_get_state,
 };
diff --git a/drivers/pwm/pwm-lp3943.c b/drivers/pwm/pwm-lp3943.c
index 4b133a17f4be..a1a106747372 100644
--- a/drivers/pwm/pwm-lp3943.c
+++ b/drivers/pwm/pwm-lp3943.c
@@ -216,7 +216,6 @@ static const struct pwm_ops lp3943_pwm_ops = {
 	.request	= lp3943_pwm_request,
 	.free		= lp3943_pwm_free,
 	.apply		= lp3943_pwm_apply,
-	.owner		= THIS_MODULE,
 };
 
 static int lp3943_pwm_parse_dt(struct device *dev,
diff --git a/drivers/pwm/pwm-lpc18xx-sct.c b/drivers/pwm/pwm-lpc18xx-sct.c
index 7a19a840bca5..ef7d0da137ed 100644
--- a/drivers/pwm/pwm-lpc18xx-sct.c
+++ b/drivers/pwm/pwm-lpc18xx-sct.c
@@ -341,7 +341,6 @@ static const struct pwm_ops lpc18xx_pwm_ops = {
 	.apply = lpc18xx_pwm_apply,
 	.request = lpc18xx_pwm_request,
 	.free = lpc18xx_pwm_free,
-	.owner = THIS_MODULE,
 };
 
 static const struct of_device_id lpc18xx_pwm_of_match[] = {
diff --git a/drivers/pwm/pwm-lpc32xx.c b/drivers/pwm/pwm-lpc32xx.c
index 806f0bb3ad6d..78f664e41e6e 100644
--- a/drivers/pwm/pwm-lpc32xx.c
+++ b/drivers/pwm/pwm-lpc32xx.c
@@ -115,7 +115,6 @@ static int lpc32xx_pwm_apply(struct pwm_chip *chip, struct pwm_device *pwm,
 
 static const struct pwm_ops lpc32xx_pwm_ops = {
 	.apply = lpc32xx_pwm_apply,
-	.owner = THIS_MODULE,
 };
 
 static int lpc32xx_pwm_probe(struct platform_device *pdev)
diff --git a/drivers/pwm/pwm-lpss.c b/drivers/pwm/pwm-lpss.c
index 23fe332b2394..a6ea3ce7e019 100644
--- a/drivers/pwm/pwm-lpss.c
+++ b/drivers/pwm/pwm-lpss.c
@@ -243,7 +243,6 @@ static int pwm_lpss_get_state(struct pwm_chip *chip, struct pwm_device *pwm,
 static const struct pwm_ops pwm_lpss_ops = {
 	.apply = pwm_lpss_apply,
 	.get_state = pwm_lpss_get_state,
-	.owner = THIS_MODULE,
 };
 
 struct pwm_lpss_chip *devm_pwm_lpss_probe(struct device *dev, void __iomem *base,
diff --git a/drivers/pwm/pwm-mediatek.c b/drivers/pwm/pwm-mediatek.c
index 6adb0ed01906..373abfd25acb 100644
--- a/drivers/pwm/pwm-mediatek.c
+++ b/drivers/pwm/pwm-mediatek.c
@@ -229,7 +229,6 @@ static int pwm_mediatek_apply(struct pwm_chip *chip, struct pwm_device *pwm,
 
 static const struct pwm_ops pwm_mediatek_ops = {
 	.apply = pwm_mediatek_apply,
-	.owner = THIS_MODULE,
 };
 
 static int pwm_mediatek_probe(struct platform_device *pdev)
diff --git a/drivers/pwm/pwm-meson.c b/drivers/pwm/pwm-meson.c
index 25519cddc2a9..5bea53243ed2 100644
--- a/drivers/pwm/pwm-meson.c
+++ b/drivers/pwm/pwm-meson.c
@@ -335,7 +335,6 @@ static const struct pwm_ops meson_pwm_ops = {
 	.free = meson_pwm_free,
 	.apply = meson_pwm_apply,
 	.get_state = meson_pwm_get_state,
-	.owner = THIS_MODULE,
 };
 
 static const char * const pwm_meson8b_parent_names[] = {
diff --git a/drivers/pwm/pwm-microchip-core.c b/drivers/pwm/pwm-microchip-core.c
index e7525c98105e..c0c53968f3e9 100644
--- a/drivers/pwm/pwm-microchip-core.c
+++ b/drivers/pwm/pwm-microchip-core.c
@@ -435,7 +435,6 @@ static int mchp_core_pwm_get_state(struct pwm_chip *chip, struct pwm_device *pwm
 static const struct pwm_ops mchp_core_pwm_ops = {
 	.apply = mchp_core_pwm_apply,
 	.get_state = mchp_core_pwm_get_state,
-	.owner = THIS_MODULE,
 };
 
 static const struct of_device_id mchp_core_of_match[] = {
diff --git a/drivers/pwm/pwm-mtk-disp.c b/drivers/pwm/pwm-mtk-disp.c
index a83bd6e18b07..7748e3eaa818 100644
--- a/drivers/pwm/pwm-mtk-disp.c
+++ b/drivers/pwm/pwm-mtk-disp.c
@@ -227,7 +227,6 @@ static int mtk_disp_pwm_get_state(struct pwm_chip *chip,
 static const struct pwm_ops mtk_disp_pwm_ops = {
 	.apply = mtk_disp_pwm_apply,
 	.get_state = mtk_disp_pwm_get_state,
-	.owner = THIS_MODULE,
 };
 
 static int mtk_disp_pwm_probe(struct platform_device *pdev)
diff --git a/drivers/pwm/pwm-mxs.c b/drivers/pwm/pwm-mxs.c
index 766dbc58dad8..1b5e787d78f1 100644
--- a/drivers/pwm/pwm-mxs.c
+++ b/drivers/pwm/pwm-mxs.c
@@ -115,7 +115,6 @@ static int mxs_pwm_apply(struct pwm_chip *chip, struct pwm_device *pwm,
 
 static const struct pwm_ops mxs_pwm_ops = {
 	.apply = mxs_pwm_apply,
-	.owner = THIS_MODULE,
 };
 
 static int mxs_pwm_probe(struct platform_device *pdev)
diff --git a/drivers/pwm/pwm-ntxec.c b/drivers/pwm/pwm-ntxec.c
index 7514ea384ec5..78606039eda2 100644
--- a/drivers/pwm/pwm-ntxec.c
+++ b/drivers/pwm/pwm-ntxec.c
@@ -126,7 +126,6 @@ static int ntxec_pwm_apply(struct pwm_chip *chip, struct pwm_device *pwm_dev,
 }
 
 static const struct pwm_ops ntxec_pwm_ops = {
-	.owner = THIS_MODULE,
 	.apply = ntxec_pwm_apply,
 	/*
 	 * No .get_state callback, because the current state cannot be read
diff --git a/drivers/pwm/pwm-omap-dmtimer.c b/drivers/pwm/pwm-omap-dmtimer.c
index 94faa4650686..13161e08dd6e 100644
--- a/drivers/pwm/pwm-omap-dmtimer.c
+++ b/drivers/pwm/pwm-omap-dmtimer.c
@@ -311,7 +311,6 @@ unlock_mutex:
 
 static const struct pwm_ops pwm_omap_dmtimer_ops = {
 	.apply = pwm_omap_dmtimer_apply,
-	.owner = THIS_MODULE,
 };
 
 static int pwm_omap_dmtimer_probe(struct platform_device *pdev)
diff --git a/drivers/pwm/pwm-pca9685.c b/drivers/pwm/pwm-pca9685.c
index 3038a68412a7..e79b1de8c4d8 100644
--- a/drivers/pwm/pwm-pca9685.c
+++ b/drivers/pwm/pwm-pca9685.c
@@ -505,7 +505,6 @@ static const struct pwm_ops pca9685_pwm_ops = {
 	.get_state = pca9685_pwm_get_state,
 	.request = pca9685_pwm_request,
 	.free = pca9685_pwm_free,
-	.owner = THIS_MODULE,
 };
 
 static const struct regmap_config pca9685_regmap_i2c_config = {
diff --git a/drivers/pwm/pwm-pxa.c b/drivers/pwm/pwm-pxa.c
index 1e475ed10180..faf6b35b9336 100644
--- a/drivers/pwm/pwm-pxa.c
+++ b/drivers/pwm/pwm-pxa.c
@@ -135,7 +135,6 @@ static int pxa_pwm_apply(struct pwm_chip *chip, struct pwm_device *pwm,
 
 static const struct pwm_ops pxa_pwm_ops = {
 	.apply = pxa_pwm_apply,
-	.owner = THIS_MODULE,
 };
 
 #ifdef CONFIG_OF
diff --git a/drivers/pwm/pwm-raspberrypi-poe.c b/drivers/pwm/pwm-raspberrypi-poe.c
index 2939b71a7ba7..1ad814fdec6b 100644
--- a/drivers/pwm/pwm-raspberrypi-poe.c
+++ b/drivers/pwm/pwm-raspberrypi-poe.c
@@ -135,7 +135,6 @@ static int raspberrypi_pwm_apply(struct pwm_chip *chip, struct pwm_device *pwm,
 static const struct pwm_ops raspberrypi_pwm_ops = {
 	.get_state = raspberrypi_pwm_get_state,
 	.apply = raspberrypi_pwm_apply,
-	.owner = THIS_MODULE,
 };
 
 static int raspberrypi_pwm_probe(struct platform_device *pdev)
diff --git a/drivers/pwm/pwm-rcar.c b/drivers/pwm/pwm-rcar.c
index 5b5f357c44de..13269f55fccf 100644
--- a/drivers/pwm/pwm-rcar.c
+++ b/drivers/pwm/pwm-rcar.c
@@ -198,7 +198,6 @@ static const struct pwm_ops rcar_pwm_ops = {
 	.request = rcar_pwm_request,
 	.free = rcar_pwm_free,
 	.apply = rcar_pwm_apply,
-	.owner = THIS_MODULE,
 };
 
 static int rcar_pwm_probe(struct platform_device *pdev)
diff --git a/drivers/pwm/pwm-renesas-tpu.c b/drivers/pwm/pwm-renesas-tpu.c
index d7311614c846..1b004e01829a 100644
--- a/drivers/pwm/pwm-renesas-tpu.c
+++ b/drivers/pwm/pwm-renesas-tpu.c
@@ -431,7 +431,6 @@ static const struct pwm_ops tpu_pwm_ops = {
 	.request = tpu_pwm_request,
 	.free = tpu_pwm_free,
 	.apply = tpu_pwm_apply,
-	.owner = THIS_MODULE,
 };
 
 /* -----------------------------------------------------------------------------
diff --git a/drivers/pwm/pwm-rockchip.c b/drivers/pwm/pwm-rockchip.c
index 03ee18fb82d5..cce4381e188a 100644
--- a/drivers/pwm/pwm-rockchip.c
+++ b/drivers/pwm/pwm-rockchip.c
@@ -228,7 +228,6 @@ out:
 static const struct pwm_ops rockchip_pwm_ops = {
 	.get_state = rockchip_pwm_get_state,
 	.apply = rockchip_pwm_apply,
-	.owner = THIS_MODULE,
 };
 
 static const struct rockchip_pwm_data pwm_data_v1 = {
diff --git a/drivers/pwm/pwm-rz-mtu3.c b/drivers/pwm/pwm-rz-mtu3.c
index a56cecb0e46e..bdda315b3bd3 100644
--- a/drivers/pwm/pwm-rz-mtu3.c
+++ b/drivers/pwm/pwm-rz-mtu3.c
@@ -438,7 +438,6 @@ static const struct pwm_ops rz_mtu3_pwm_ops = {
 	.free = rz_mtu3_pwm_free,
 	.get_state = rz_mtu3_pwm_get_state,
 	.apply = rz_mtu3_pwm_apply,
-	.owner = THIS_MODULE,
 };
 
 static int rz_mtu3_pwm_pm_runtime_suspend(struct device *dev)
diff --git a/drivers/pwm/pwm-samsung.c b/drivers/pwm/pwm-samsung.c
index e8828f57ab15..08a903257383 100644
--- a/drivers/pwm/pwm-samsung.c
+++ b/drivers/pwm/pwm-samsung.c
@@ -475,7 +475,6 @@ static const struct pwm_ops pwm_samsung_ops = {
 	.request	= pwm_samsung_request,
 	.free		= pwm_samsung_free,
 	.apply		= pwm_samsung_apply,
-	.owner		= THIS_MODULE,
 };
 
 #ifdef CONFIG_OF
diff --git a/drivers/pwm/pwm-sifive.c b/drivers/pwm/pwm-sifive.c
index eabddb7c7820..089e50bdbbf0 100644
--- a/drivers/pwm/pwm-sifive.c
+++ b/drivers/pwm/pwm-sifive.c
@@ -203,7 +203,6 @@ static const struct pwm_ops pwm_sifive_ops = {
 	.free = pwm_sifive_free,
 	.get_state = pwm_sifive_get_state,
 	.apply = pwm_sifive_apply,
-	.owner = THIS_MODULE,
 };
 
 static int pwm_sifive_clock_notifier(struct notifier_block *nb,
diff --git a/drivers/pwm/pwm-sl28cpld.c b/drivers/pwm/pwm-sl28cpld.c
index 9e42e3a74ad6..88b01ff9e460 100644
--- a/drivers/pwm/pwm-sl28cpld.c
+++ b/drivers/pwm/pwm-sl28cpld.c
@@ -200,7 +200,6 @@ static int sl28cpld_pwm_apply(struct pwm_chip *chip, struct pwm_device *pwm,
 static const struct pwm_ops sl28cpld_pwm_ops = {
 	.apply = sl28cpld_pwm_apply,
 	.get_state = sl28cpld_pwm_get_state,
-	.owner = THIS_MODULE,
 };
 
 static int sl28cpld_pwm_probe(struct platform_device *pdev)
diff --git a/drivers/pwm/pwm-spear.c b/drivers/pwm/pwm-spear.c
index 4e1cfd8d7c03..2cbc34cf6799 100644
--- a/drivers/pwm/pwm-spear.c
+++ b/drivers/pwm/pwm-spear.c
@@ -189,7 +189,6 @@ static int spear_pwm_apply(struct pwm_chip *chip, struct pwm_device *pwm,
 
 static const struct pwm_ops spear_pwm_ops = {
 	.apply = spear_pwm_apply,
-	.owner = THIS_MODULE,
 };
 
 static int spear_pwm_probe(struct platform_device *pdev)
diff --git a/drivers/pwm/pwm-sprd.c b/drivers/pwm/pwm-sprd.c
index 1499c8c1fe37..dfda2152a7c1 100644
--- a/drivers/pwm/pwm-sprd.c
+++ b/drivers/pwm/pwm-sprd.c
@@ -210,7 +210,6 @@ static int sprd_pwm_apply(struct pwm_chip *chip, struct pwm_device *pwm,
 static const struct pwm_ops sprd_pwm_ops = {
 	.apply = sprd_pwm_apply,
 	.get_state = sprd_pwm_get_state,
-	.owner = THIS_MODULE,
 };
 
 static int sprd_pwm_clk_init(struct sprd_pwm_chip *spc)
diff --git a/drivers/pwm/pwm-sti.c b/drivers/pwm/pwm-sti.c
index b1d1373648a3..5756f2e3b3c0 100644
--- a/drivers/pwm/pwm-sti.c
+++ b/drivers/pwm/pwm-sti.c
@@ -420,7 +420,6 @@ static const struct pwm_ops sti_pwm_ops = {
 	.capture = sti_pwm_capture,
 	.apply = sti_pwm_apply,
 	.free = sti_pwm_free,
-	.owner = THIS_MODULE,
 };
 
 static irqreturn_t sti_pwm_interrupt(int irq, void *data)
diff --git a/drivers/pwm/pwm-stm32-lp.c b/drivers/pwm/pwm-stm32-lp.c
index bb3a045a7334..b67974cc1872 100644
--- a/drivers/pwm/pwm-stm32-lp.c
+++ b/drivers/pwm/pwm-stm32-lp.c
@@ -189,7 +189,6 @@ static int stm32_pwm_lp_get_state(struct pwm_chip *chip,
 }
 
 static const struct pwm_ops stm32_pwm_lp_ops = {
-	.owner = THIS_MODULE,
 	.apply = stm32_pwm_lp_apply,
 	.get_state = stm32_pwm_lp_get_state,
 };
diff --git a/drivers/pwm/pwm-stm32.c b/drivers/pwm/pwm-stm32.c
index 3d6be7749e23..3303a754ea02 100644
--- a/drivers/pwm/pwm-stm32.c
+++ b/drivers/pwm/pwm-stm32.c
@@ -487,7 +487,6 @@ static int stm32_pwm_apply_locked(struct pwm_chip *chip, struct pwm_device *pwm,
 }
 
 static const struct pwm_ops stm32pwm_ops = {
-	.owner = THIS_MODULE,
 	.apply = stm32_pwm_apply_locked,
 	.capture = IS_ENABLED(CONFIG_DMA_ENGINE) ? stm32_pwm_capture : NULL,
 };
diff --git a/drivers/pwm/pwm-stmpe.c b/drivers/pwm/pwm-stmpe.c
index e205405c4828..a46f5b4dd816 100644
--- a/drivers/pwm/pwm-stmpe.c
+++ b/drivers/pwm/pwm-stmpe.c
@@ -287,7 +287,6 @@ static int stmpe_24xx_pwm_apply(struct pwm_chip *chip, struct pwm_device *pwm,
 
 static const struct pwm_ops stmpe_24xx_pwm_ops = {
 	.apply = stmpe_24xx_pwm_apply,
-	.owner = THIS_MODULE,
 };
 
 static int __init stmpe_pwm_probe(struct platform_device *pdev)
diff --git a/drivers/pwm/pwm-sun4i.c b/drivers/pwm/pwm-sun4i.c
index c84fcf1a13dc..1a439025540d 100644
--- a/drivers/pwm/pwm-sun4i.c
+++ b/drivers/pwm/pwm-sun4i.c
@@ -325,7 +325,6 @@ static int sun4i_pwm_apply(struct pwm_chip *chip, struct pwm_device *pwm,
 static const struct pwm_ops sun4i_pwm_ops = {
 	.apply = sun4i_pwm_apply,
 	.get_state = sun4i_pwm_get_state,
-	.owner = THIS_MODULE,
 };
 
 static const struct sun4i_pwm_data sun4i_pwm_dual_nobypass = {
diff --git a/drivers/pwm/pwm-sunplus.c b/drivers/pwm/pwm-sunplus.c
index 7705c7b86c3a..773e2f80526e 100644
--- a/drivers/pwm/pwm-sunplus.c
+++ b/drivers/pwm/pwm-sunplus.c
@@ -163,7 +163,6 @@ static int sunplus_pwm_get_state(struct pwm_chip *chip, struct pwm_device *pwm,
 static const struct pwm_ops sunplus_pwm_ops = {
 	.apply = sunplus_pwm_apply,
 	.get_state = sunplus_pwm_get_state,
-	.owner = THIS_MODULE,
 };
 
 static void sunplus_pwm_clk_release(void *data)
diff --git a/drivers/pwm/pwm-tegra.c b/drivers/pwm/pwm-tegra.c
index a169a34e0778..39ea51e08c94 100644
--- a/drivers/pwm/pwm-tegra.c
+++ b/drivers/pwm/pwm-tegra.c
@@ -268,7 +268,6 @@ static int tegra_pwm_apply(struct pwm_chip *chip, struct pwm_device *pwm,
 
 static const struct pwm_ops tegra_pwm_ops = {
 	.apply = tegra_pwm_apply,
-	.owner = THIS_MODULE,
 };
 
 static int tegra_pwm_probe(struct platform_device *pdev)
diff --git a/drivers/pwm/pwm-tiecap.c b/drivers/pwm/pwm-tiecap.c
index 8c94b266c1b2..11e3549cf103 100644
--- a/drivers/pwm/pwm-tiecap.c
+++ b/drivers/pwm/pwm-tiecap.c
@@ -205,7 +205,6 @@ static int ecap_pwm_apply(struct pwm_chip *chip, struct pwm_device *pwm,
 
 static const struct pwm_ops ecap_pwm_ops = {
 	.apply = ecap_pwm_apply,
-	.owner = THIS_MODULE,
 };
 
 static const struct of_device_id ecap_of_match[] = {
diff --git a/drivers/pwm/pwm-tiehrpwm.c b/drivers/pwm/pwm-tiehrpwm.c
index ecbfd7e954ec..66ac2655845f 100644
--- a/drivers/pwm/pwm-tiehrpwm.c
+++ b/drivers/pwm/pwm-tiehrpwm.c
@@ -437,7 +437,6 @@ static int ehrpwm_pwm_apply(struct pwm_chip *chip, struct pwm_device *pwm,
 static const struct pwm_ops ehrpwm_pwm_ops = {
 	.free = ehrpwm_pwm_free,
 	.apply = ehrpwm_pwm_apply,
-	.owner = THIS_MODULE,
 };
 
 static const struct of_device_id ehrpwm_of_match[] = {
diff --git a/drivers/pwm/pwm-twl-led.c b/drivers/pwm/pwm-twl-led.c
index 8fb84b441853..625233f4703a 100644
--- a/drivers/pwm/pwm-twl-led.c
+++ b/drivers/pwm/pwm-twl-led.c
@@ -189,7 +189,6 @@ static int twl4030_pwmled_apply(struct pwm_chip *chip, struct pwm_device *pwm,
 
 static const struct pwm_ops twl4030_pwmled_ops = {
 	.apply = twl4030_pwmled_apply,
-	.owner = THIS_MODULE,
 };
 
 static int twl6030_pwmled_config(struct pwm_chip *chip, struct pwm_device *pwm,
@@ -342,7 +341,6 @@ static const struct pwm_ops twl6030_pwmled_ops = {
 	.apply = twl6030_pwmled_apply,
 	.request = twl6030_pwmled_request,
 	.free = twl6030_pwmled_free,
-	.owner = THIS_MODULE,
 };
 
 static int twl_pwmled_probe(struct platform_device *pdev)
diff --git a/drivers/pwm/pwm-twl.c b/drivers/pwm/pwm-twl.c
index 86567add79db..603d31f27470 100644
--- a/drivers/pwm/pwm-twl.c
+++ b/drivers/pwm/pwm-twl.c
@@ -333,12 +333,10 @@ static const struct pwm_ops twl4030_pwm_ops = {
 	.apply = twl4030_pwm_apply,
 	.request = twl4030_pwm_request,
 	.free = twl4030_pwm_free,
-	.owner = THIS_MODULE,
 };
 
 static const struct pwm_ops twl6030_pwm_ops = {
 	.apply = twl6030_pwm_apply,
-	.owner = THIS_MODULE,
 };
 
 static int twl_pwm_probe(struct platform_device *pdev)
diff --git a/drivers/pwm/pwm-visconti.c b/drivers/pwm/pwm-visconti.c
index 7f7591a2384c..8d736d558122 100644
--- a/drivers/pwm/pwm-visconti.c
+++ b/drivers/pwm/pwm-visconti.c
@@ -129,7 +129,6 @@ static int visconti_pwm_get_state(struct pwm_chip *chip, struct pwm_device *pwm,
 static const struct pwm_ops visconti_pwm_ops = {
 	.apply = visconti_pwm_apply,
 	.get_state = visconti_pwm_get_state,
-	.owner = THIS_MODULE,
 };
 
 static int visconti_pwm_probe(struct platform_device *pdev)
diff --git a/drivers/pwm/pwm-vt8500.c b/drivers/pwm/pwm-vt8500.c
index 6d46db51daac..a96c7f5d9099 100644
--- a/drivers/pwm/pwm-vt8500.c
+++ b/drivers/pwm/pwm-vt8500.c
@@ -221,7 +221,6 @@ static int vt8500_pwm_apply(struct pwm_chip *chip, struct pwm_device *pwm,
 
 static const struct pwm_ops vt8500_pwm_ops = {
 	.apply = vt8500_pwm_apply,
-	.owner = THIS_MODULE,
 };
 
 static const struct of_device_id vt8500_pwm_dt_ids[] = {
diff --git a/drivers/pwm/pwm-xilinx.c b/drivers/pwm/pwm-xilinx.c
index 85153ee90809..5f3c2a6fed11 100644
--- a/drivers/pwm/pwm-xilinx.c
+++ b/drivers/pwm/pwm-xilinx.c
@@ -198,7 +198,6 @@ static int xilinx_pwm_get_state(struct pwm_chip *chip,
 static const struct pwm_ops xilinx_pwm_ops = {
 	.apply = xilinx_pwm_apply,
 	.get_state = xilinx_pwm_get_state,
-	.owner = THIS_MODULE,
 };
 
 static const struct regmap_config xilinx_pwm_regmap_config = {
diff --git a/drivers/staging/greybus/pwm.c b/drivers/staging/greybus/pwm.c
index 57cc1960d059..a3cb68cfa0f9 100644
--- a/drivers/staging/greybus/pwm.c
+++ b/drivers/staging/greybus/pwm.c
@@ -258,7 +258,6 @@ static const struct pwm_ops gb_pwm_ops = {
 	.request = gb_pwm_request,
 	.free = gb_pwm_free,
 	.apply = gb_pwm_apply,
-	.owner = THIS_MODULE,
 };
 
 static int gb_pwm_probe(struct gbphy_device *gbphy_dev,
diff --git a/include/linux/pwm.h b/include/linux/pwm.h
index d2f9f690a9c1..56e3b7a09824 100644
--- a/include/linux/pwm.h
+++ b/include/linux/pwm.h
@@ -267,7 +267,6 @@ struct pwm_capture {
  * @get_state: get the current PWM state. This function is only
  *	       called once per PWM device when the PWM chip is
  *	       registered.
- * @owner: helps prevent removal of modules exporting active PWMs
  */
 struct pwm_ops {
 	int (*request)(struct pwm_chip *chip, struct pwm_device *pwm);
@@ -278,13 +277,13 @@ struct pwm_ops {
 		     const struct pwm_state *state);
 	int (*get_state)(struct pwm_chip *chip, struct pwm_device *pwm,
 			 struct pwm_state *state);
-	struct module *owner;
 };
 
 /**
  * struct pwm_chip - abstract a PWM controller
  * @dev: device providing the PWMs
  * @ops: callbacks for this PWM controller
+ * @owner: module providing this chip
  * @base: number of first PWM controlled by this chip
  * @npwm: number of PWMs controlled by this chip
  * @of_xlate: request a PWM device given a device tree PWM specifier
@@ -295,6 +294,7 @@ struct pwm_ops {
 struct pwm_chip {
 	struct device *dev;
 	const struct pwm_ops *ops;
+	struct module *owner;
 	int base;
 	unsigned int npwm;
 
@@ -386,10 +386,12 @@ int pwm_capture(struct pwm_device *pwm, struct pwm_capture *result,
 int pwm_set_chip_data(struct pwm_device *pwm, void *data);
 void *pwm_get_chip_data(struct pwm_device *pwm);
 
-int pwmchip_add(struct pwm_chip *chip);
+int __pwmchip_add(struct pwm_chip *chip, struct module *owner);
+#define pwmchip_add(chip) __pwmchip_add(chip, THIS_MODULE)
 void pwmchip_remove(struct pwm_chip *chip);
 
-int devm_pwmchip_add(struct device *dev, struct pwm_chip *chip);
+int __devm_pwmchip_add(struct device *dev, struct pwm_chip *chip, struct module *owner);
+#define devm_pwmchip_add(dev, chip) __devm_pwmchip_add(dev, chip, THIS_MODULE)
 
 struct pwm_device *pwm_request_from_chip(struct pwm_chip *chip,
 					 unsigned int index,
-- 
cgit v1.2.3


From a6e5654e0b8b53e2d0e316bc7cecb81dd8371f18 Mon Sep 17 00:00:00 2001
From: Uwe Kleine-König <u.kleine-koenig@pengutronix.de>
Date: Wed, 5 Jul 2023 10:06:50 +0200
Subject: pwm: Drop pwm_[sg]et_chip_data()
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The semantic of chip_data is a bit surprising as it's cleared when
pwm_put() is called. Also there is a big overlap with the standard driver
data.

All drivers were adapted to not make use of chip_data any more, so it can
go away.

Link: https://lore.kernel.org/r/20230705080650.2353391-9-u.kleine-koenig@pengutronix.de
Signed-off-by: Uwe Kleine-König <u.kleine-koenig@pengutronix.de>
Signed-off-by: Thierry Reding <thierry.reding@gmail.com>
---
 drivers/pwm/core.c  | 31 -------------------------------
 include/linux/pwm.h | 14 --------------
 2 files changed, 45 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/pwm/core.c b/drivers/pwm/core.c
index a2824eb4e236..29078486534d 100644
--- a/drivers/pwm/core.c
+++ b/drivers/pwm/core.c
@@ -208,36 +208,6 @@ static void of_pwmchip_remove(struct pwm_chip *chip)
 		of_node_put(chip->dev->of_node);
 }
 
-/**
- * pwm_set_chip_data() - set private chip data for a PWM
- * @pwm: PWM device
- * @data: pointer to chip-specific data
- *
- * Returns: 0 on success or a negative error code on failure.
- */
-int pwm_set_chip_data(struct pwm_device *pwm, void *data)
-{
-	if (!pwm)
-		return -EINVAL;
-
-	pwm->chip_data = data;
-
-	return 0;
-}
-EXPORT_SYMBOL_GPL(pwm_set_chip_data);
-
-/**
- * pwm_get_chip_data() - get private chip data for a PWM
- * @pwm: PWM device
- *
- * Returns: A pointer to the chip-private data for the PWM device.
- */
-void *pwm_get_chip_data(struct pwm_device *pwm)
-{
-	return pwm ? pwm->chip_data : NULL;
-}
-EXPORT_SYMBOL_GPL(pwm_get_chip_data);
-
 static bool pwm_ops_check(const struct pwm_chip *chip)
 {
 	const struct pwm_ops *ops = chip->ops;
@@ -980,7 +950,6 @@ void pwm_put(struct pwm_device *pwm)
 	if (pwm->chip->ops->free)
 		pwm->chip->ops->free(pwm->chip, pwm);
 
-	pwm_set_chip_data(pwm, NULL);
 	pwm->label = NULL;
 
 	module_put(pwm->chip->owner);
diff --git a/include/linux/pwm.h b/include/linux/pwm.h
index 56e3b7a09824..e3b437587b32 100644
--- a/include/linux/pwm.h
+++ b/include/linux/pwm.h
@@ -71,7 +71,6 @@ struct pwm_state {
  * @hwpwm: per-chip relative index of the PWM device
  * @pwm: global index of the PWM device
  * @chip: PWM chip providing this PWM device
- * @chip_data: chip-private data associated with the PWM device
  * @args: PWM arguments
  * @state: last applied state
  * @last: last implemented state (for PWM_DEBUG)
@@ -82,7 +81,6 @@ struct pwm_device {
 	unsigned int hwpwm;
 	unsigned int pwm;
 	struct pwm_chip *chip;
-	void *chip_data;
 
 	struct pwm_args args;
 	struct pwm_state state;
@@ -383,8 +381,6 @@ static inline void pwm_disable(struct pwm_device *pwm)
 /* PWM provider APIs */
 int pwm_capture(struct pwm_device *pwm, struct pwm_capture *result,
 		unsigned long timeout);
-int pwm_set_chip_data(struct pwm_device *pwm, void *data);
-void *pwm_get_chip_data(struct pwm_device *pwm);
 
 int __pwmchip_add(struct pwm_chip *chip, struct module *owner);
 #define pwmchip_add(chip) __pwmchip_add(chip, THIS_MODULE)
@@ -447,16 +443,6 @@ static inline int pwm_capture(struct pwm_device *pwm,
 	return -EINVAL;
 }
 
-static inline int pwm_set_chip_data(struct pwm_device *pwm, void *data)
-{
-	return -EINVAL;
-}
-
-static inline void *pwm_get_chip_data(struct pwm_device *pwm)
-{
-	return NULL;
-}
-
 static inline int pwmchip_add(struct pwm_chip *chip)
 {
 	return -EINVAL;
-- 
cgit v1.2.3


From 38985e8c278b82e6d4d62d4acd57c761cc23ce63 Mon Sep 17 00:00:00 2001
From: Amit Cohen <amcohen@nvidia.com>
Date: Mon, 9 Oct 2023 13:06:08 +0300
Subject: net: Handle bulk delete policy in bridge driver

The merge commit 92716869375b ("Merge branch 'br-flush-filtering'")
added support for FDB flushing in bridge driver. The following patches
will extend VXLAN driver to support FDB flushing as well. The netlink
message for bulk delete is shared between the drivers. With the existing
implementation, there is no way to prevent user from flushing with
attributes that are not supported per driver. For example, when VNI will
be added, user will not get an error for flush FDB entries in bridge
with VNI, although this attribute is not relevant for bridge.

As preparation for support of FDB flush in VXLAN driver, move the policy
to be handled in bridge driver, later a new policy for VXLAN will be
added in VXLAN driver. Do not pass 'vid' as part of ndo_fdb_del_bulk(),
as this field is relevant only for bridge.

Signed-off-by: Amit Cohen <amcohen@nvidia.com>
Reviewed-by: Ido Schimmel <idosch@nvidia.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netdevice.h |  8 ++------
 net/bridge/br_fdb.c       | 29 ++++++++++++++++++++++++-----
 net/bridge/br_private.h   |  3 +--
 net/core/rtnetlink.c      | 27 ++++++++++-----------------
 4 files changed, 37 insertions(+), 30 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index ae553f886796..1c7681263d30 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -1287,9 +1287,7 @@ struct netdev_net_notifier {
  *		      struct net_device *dev,
  *		      const unsigned char *addr, u16 vid)
  *	Deletes the FDB entry from dev coresponding to addr.
- * int (*ndo_fdb_del_bulk)(struct ndmsg *ndm, struct nlattr *tb[],
- *			   struct net_device *dev,
- *			   u16 vid,
+ * int (*ndo_fdb_del_bulk)(struct nlmsghdr *nlh, struct net_device *dev,
  *			   struct netlink_ext_ack *extack);
  * int (*ndo_fdb_dump)(struct sk_buff *skb, struct netlink_callback *cb,
  *		       struct net_device *dev, struct net_device *filter_dev,
@@ -1564,10 +1562,8 @@ struct net_device_ops {
 					       struct net_device *dev,
 					       const unsigned char *addr,
 					       u16 vid, struct netlink_ext_ack *extack);
-	int			(*ndo_fdb_del_bulk)(struct ndmsg *ndm,
-						    struct nlattr *tb[],
+	int			(*ndo_fdb_del_bulk)(struct nlmsghdr *nlh,
 						    struct net_device *dev,
-						    u16 vid,
 						    struct netlink_ext_ack *extack);
 	int			(*ndo_fdb_dump)(struct sk_buff *skb,
 						struct netlink_callback *cb,
diff --git a/net/bridge/br_fdb.c b/net/bridge/br_fdb.c
index e69a872bfc1d..a98ad763b368 100644
--- a/net/bridge/br_fdb.c
+++ b/net/bridge/br_fdb.c
@@ -661,14 +661,30 @@ static int __fdb_flush_validate_ifindex(const struct net_bridge *br,
 	return 0;
 }
 
-int br_fdb_delete_bulk(struct ndmsg *ndm, struct nlattr *tb[],
-		       struct net_device *dev, u16 vid,
+static const struct nla_policy br_fdb_del_bulk_policy[NDA_MAX + 1] = {
+	[NDA_VLAN]	= NLA_POLICY_RANGE(NLA_U16, 1, VLAN_N_VID - 2),
+	[NDA_IFINDEX]	= NLA_POLICY_MIN(NLA_S32, 1),
+	[NDA_NDM_STATE_MASK]	= { .type = NLA_U16 },
+	[NDA_NDM_FLAGS_MASK]	= { .type = NLA_U8 },
+};
+
+int br_fdb_delete_bulk(struct nlmsghdr *nlh, struct net_device *dev,
 		       struct netlink_ext_ack *extack)
 {
-	u8 ndm_flags = ndm->ndm_flags & ~FDB_FLUSH_IGNORED_NDM_FLAGS;
-	struct net_bridge_fdb_flush_desc desc = { .vlan_id = vid };
+	struct net_bridge_fdb_flush_desc desc = {};
+	struct ndmsg *ndm = nlmsg_data(nlh);
 	struct net_bridge_port *p = NULL;
+	struct nlattr *tb[NDA_MAX + 1];
 	struct net_bridge *br;
+	u8 ndm_flags;
+	int err;
+
+	ndm_flags = ndm->ndm_flags & ~FDB_FLUSH_IGNORED_NDM_FLAGS;
+
+	err = nlmsg_parse(nlh, sizeof(*ndm), tb, NDA_MAX,
+			  br_fdb_del_bulk_policy, extack);
+	if (err)
+		return err;
 
 	if (netif_is_bridge_master(dev)) {
 		br = netdev_priv(dev);
@@ -681,6 +697,9 @@ int br_fdb_delete_bulk(struct ndmsg *ndm, struct nlattr *tb[],
 		br = p->br;
 	}
 
+	if (tb[NDA_VLAN])
+		desc.vlan_id = nla_get_u16(tb[NDA_VLAN]);
+
 	if (ndm_flags & ~FDB_FLUSH_ALLOWED_NDM_FLAGS) {
 		NL_SET_ERR_MSG(extack, "Unsupported fdb flush ndm flag bits set");
 		return -EINVAL;
@@ -703,7 +722,7 @@ int br_fdb_delete_bulk(struct ndmsg *ndm, struct nlattr *tb[],
 		desc.flags_mask |= __ndm_flags_to_fdb_flags(ndm_flags_mask);
 	}
 	if (tb[NDA_IFINDEX]) {
-		int err, ifidx = nla_get_s32(tb[NDA_IFINDEX]);
+		int ifidx = nla_get_s32(tb[NDA_IFINDEX]);
 
 		err = __fdb_flush_validate_ifindex(br, ifidx, extack);
 		if (err)
diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h
index a1f4acfa6994..cbbe35278459 100644
--- a/net/bridge/br_private.h
+++ b/net/bridge/br_private.h
@@ -847,8 +847,7 @@ void br_fdb_update(struct net_bridge *br, struct net_bridge_port *source,
 int br_fdb_delete(struct ndmsg *ndm, struct nlattr *tb[],
 		  struct net_device *dev, const unsigned char *addr, u16 vid,
 		  struct netlink_ext_ack *extack);
-int br_fdb_delete_bulk(struct ndmsg *ndm, struct nlattr *tb[],
-		       struct net_device *dev, u16 vid,
+int br_fdb_delete_bulk(struct nlmsghdr *nlh, struct net_device *dev,
 		       struct netlink_ext_ack *extack);
 int br_fdb_add(struct ndmsg *nlh, struct nlattr *tb[], struct net_device *dev,
 	       const unsigned char *addr, u16 vid, u16 nlh_flags,
diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index 7452a6d190c5..eef7f7788996 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -4367,13 +4367,6 @@ int ndo_dflt_fdb_del(struct ndmsg *ndm,
 }
 EXPORT_SYMBOL(ndo_dflt_fdb_del);
 
-static const struct nla_policy fdb_del_bulk_policy[NDA_MAX + 1] = {
-	[NDA_VLAN]	= { .type = NLA_U16 },
-	[NDA_IFINDEX]	= NLA_POLICY_MIN(NLA_S32, 1),
-	[NDA_NDM_STATE_MASK]	= { .type = NLA_U16  },
-	[NDA_NDM_FLAGS_MASK]	= { .type = NLA_U8 },
-};
-
 static int rtnl_fdb_del(struct sk_buff *skb, struct nlmsghdr *nlh,
 			struct netlink_ext_ack *extack)
 {
@@ -4394,8 +4387,10 @@ static int rtnl_fdb_del(struct sk_buff *skb, struct nlmsghdr *nlh,
 		err = nlmsg_parse_deprecated(nlh, sizeof(*ndm), tb, NDA_MAX,
 					     NULL, extack);
 	} else {
-		err = nlmsg_parse(nlh, sizeof(*ndm), tb, NDA_MAX,
-				  fdb_del_bulk_policy, extack);
+		/* For bulk delete, the drivers will parse the message with
+		 * policy.
+		 */
+		err = nlmsg_parse(nlh, sizeof(*ndm), tb, NDA_MAX, NULL, extack);
 	}
 	if (err < 0)
 		return err;
@@ -4418,6 +4413,10 @@ static int rtnl_fdb_del(struct sk_buff *skb, struct nlmsghdr *nlh,
 			return -EINVAL;
 		}
 		addr = nla_data(tb[NDA_LLADDR]);
+
+		err = fdb_vid_parse(tb[NDA_VLAN], &vid, extack);
+		if (err)
+			return err;
 	}
 
 	if (dev->type != ARPHRD_ETHER) {
@@ -4425,10 +4424,6 @@ static int rtnl_fdb_del(struct sk_buff *skb, struct nlmsghdr *nlh,
 		return -EINVAL;
 	}
 
-	err = fdb_vid_parse(tb[NDA_VLAN], &vid, extack);
-	if (err)
-		return err;
-
 	err = -EOPNOTSUPP;
 
 	/* Support fdb on master device the net/bridge default case */
@@ -4442,8 +4437,7 @@ static int rtnl_fdb_del(struct sk_buff *skb, struct nlmsghdr *nlh,
 				err = ops->ndo_fdb_del(ndm, tb, dev, addr, vid, extack);
 		} else {
 			if (ops->ndo_fdb_del_bulk)
-				err = ops->ndo_fdb_del_bulk(ndm, tb, dev, vid,
-							    extack);
+				err = ops->ndo_fdb_del_bulk(nlh, dev, extack);
 		}
 
 		if (err)
@@ -4464,8 +4458,7 @@ static int rtnl_fdb_del(struct sk_buff *skb, struct nlmsghdr *nlh,
 			/* in case err was cleared by NTF_MASTER call */
 			err = -EOPNOTSUPP;
 			if (ops->ndo_fdb_del_bulk)
-				err = ops->ndo_fdb_del_bulk(ndm, tb, dev, vid,
-							    extack);
+				err = ops->ndo_fdb_del_bulk(nlh, dev, extack);
 		}
 
 		if (!err) {
-- 
cgit v1.2.3


From 03adc61edad49e1bbecfb53f7ea5d78f398fe368 Mon Sep 17 00:00:00 2001
From: Dan Clash <daclash@linux.microsoft.com>
Date: Thu, 12 Oct 2023 14:55:18 -0700
Subject: audit,io_uring: io_uring openat triggers audit reference count
 underflow

An io_uring openat operation can update an audit reference count
from multiple threads resulting in the call trace below.

A call to io_uring_submit() with a single openat op with a flag of
IOSQE_ASYNC results in the following reference count updates.

These first part of the system call performs two increments that do not race.

do_syscall_64()
  __do_sys_io_uring_enter()
    io_submit_sqes()
      io_openat_prep()
        __io_openat_prep()
          getname()
            getname_flags()       /* update 1 (increment) */
              __audit_getname()   /* update 2 (increment) */

The openat op is queued to an io_uring worker thread which starts the
opportunity for a race.  The system call exit performs one decrement.

do_syscall_64()
  syscall_exit_to_user_mode()
    syscall_exit_to_user_mode_prepare()
      __audit_syscall_exit()
        audit_reset_context()
           putname()              /* update 3 (decrement) */

The io_uring worker thread performs one increment and two decrements.
These updates can race with the system call decrement.

io_wqe_worker()
  io_worker_handle_work()
    io_wq_submit_work()
      io_issue_sqe()
        io_openat()
          io_openat2()
            do_filp_open()
              path_openat()
                __audit_inode()   /* update 4 (increment) */
            putname()             /* update 5 (decrement) */
        __audit_uring_exit()
          audit_reset_context()
            putname()             /* update 6 (decrement) */

The fix is to change the refcnt member of struct audit_names
from int to atomic_t.

kernel BUG at fs/namei.c:262!
Call Trace:
...
 ? putname+0x68/0x70
 audit_reset_context.part.0.constprop.0+0xe1/0x300
 __audit_uring_exit+0xda/0x1c0
 io_issue_sqe+0x1f3/0x450
 ? lock_timer_base+0x3b/0xd0
 io_wq_submit_work+0x8d/0x2b0
 ? __try_to_del_timer_sync+0x67/0xa0
 io_worker_handle_work+0x17c/0x2b0
 io_wqe_worker+0x10a/0x350

Cc: stable@vger.kernel.org
Link: https://lore.kernel.org/lkml/MW2PR2101MB1033FFF044A258F84AEAA584F1C9A@MW2PR2101MB1033.namprd21.prod.outlook.com/
Fixes: 5bd2182d58e9 ("audit,io_uring,io-wq: add some basic audit support to io_uring")
Signed-off-by: Dan Clash <daclash@linux.microsoft.com>
Link: https://lore.kernel.org/r/20231012215518.GA4048@linuxonhyperv3.guj3yctzbm1etfxqx2vob5hsef.xx.internal.cloudapp.net
Reviewed-by: Jens Axboe <axboe@kernel.dk>
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 fs/namei.c         | 9 +++++----
 include/linux/fs.h | 2 +-
 kernel/auditsc.c   | 8 ++++----
 3 files changed, 10 insertions(+), 9 deletions(-)

(limited to 'include/linux')

diff --git a/fs/namei.c b/fs/namei.c
index 567ee547492b..94565bd7e73f 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -188,7 +188,7 @@ getname_flags(const char __user *filename, int flags, int *empty)
 		}
 	}
 
-	result->refcnt = 1;
+	atomic_set(&result->refcnt, 1);
 	/* The empty path is special. */
 	if (unlikely(!len)) {
 		if (empty)
@@ -249,7 +249,7 @@ getname_kernel(const char * filename)
 	memcpy((char *)result->name, filename, len);
 	result->uptr = NULL;
 	result->aname = NULL;
-	result->refcnt = 1;
+	atomic_set(&result->refcnt, 1);
 	audit_getname(result);
 
 	return result;
@@ -261,9 +261,10 @@ void putname(struct filename *name)
 	if (IS_ERR(name))
 		return;
 
-	BUG_ON(name->refcnt <= 0);
+	if (WARN_ON_ONCE(!atomic_read(&name->refcnt)))
+		return;
 
-	if (--name->refcnt > 0)
+	if (!atomic_dec_and_test(&name->refcnt))
 		return;
 
 	if (name->name != name->iname) {
diff --git a/include/linux/fs.h b/include/linux/fs.h
index b528f063e8ff..4a40823c3c67 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -2403,7 +2403,7 @@ struct audit_names;
 struct filename {
 	const char		*name;	/* pointer to actual string */
 	const __user char	*uptr;	/* original userland pointer */
-	int			refcnt;
+	atomic_t		refcnt;
 	struct audit_names	*aname;
 	const char		iname[];
 };
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index 21d2fa815e78..6f0d6fb6523f 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -2212,7 +2212,7 @@ __audit_reusename(const __user char *uptr)
 		if (!n->name)
 			continue;
 		if (n->name->uptr == uptr) {
-			n->name->refcnt++;
+			atomic_inc(&n->name->refcnt);
 			return n->name;
 		}
 	}
@@ -2241,7 +2241,7 @@ void __audit_getname(struct filename *name)
 	n->name = name;
 	n->name_len = AUDIT_NAME_FULL;
 	name->aname = n;
-	name->refcnt++;
+	atomic_inc(&name->refcnt);
 }
 
 static inline int audit_copy_fcaps(struct audit_names *name,
@@ -2373,7 +2373,7 @@ out_alloc:
 		return;
 	if (name) {
 		n->name = name;
-		name->refcnt++;
+		atomic_inc(&name->refcnt);
 	}
 
 out:
@@ -2500,7 +2500,7 @@ void __audit_inode_child(struct inode *parent,
 		if (found_parent) {
 			found_child->name = found_parent->name;
 			found_child->name_len = AUDIT_NAME_FULL;
-			found_child->name->refcnt++;
+			atomic_inc(&found_child->name->refcnt);
 		}
 	}
 
-- 
cgit v1.2.3


From 236334aeec0f93217cf9235f2004e61a0a1a5985 Mon Sep 17 00:00:00 2001
From: Yafang Shao <laoar.shao@gmail.com>
Date: Fri, 13 Oct 2023 08:39:16 +0000
Subject: bpf: Avoid unnecessary audit log for CPU security mitigations

Check cpu_mitigations_off() first to avoid calling capable() if it is off.
This can avoid unnecessary audit log.

Fixes: bc5bc309db45 ("bpf: Inherit system settings for CPU security mitigations")
Suggested-by: Andrii Nakryiko <andrii.nakryiko@gmail.com>
Signed-off-by: Yafang Shao <laoar.shao@gmail.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/bpf/CAEf4Bza6UVUWqcWQ-66weZ-nMDr+TFU3Mtq=dumZFD-pSqU7Ow@mail.gmail.com/
Link: https://lore.kernel.org/bpf/20231013083916.4199-1-laoar.shao@gmail.com
---
 include/linux/bpf.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 61bde4520f5c..f0891ba24cb1 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -2164,12 +2164,12 @@ static inline bool bpf_allow_uninit_stack(void)
 
 static inline bool bpf_bypass_spec_v1(void)
 {
-	return perfmon_capable() || cpu_mitigations_off();
+	return cpu_mitigations_off() || perfmon_capable();
 }
 
 static inline bool bpf_bypass_spec_v4(void)
 {
-	return perfmon_capable() || cpu_mitigations_off();
+	return cpu_mitigations_off() || perfmon_capable();
 }
 
 int bpf_map_new_fd(struct bpf_map *map, int flags);
-- 
cgit v1.2.3


From 84aefafe6b294041b7fa0757414c4a29c1bdeea2 Mon Sep 17 00:00:00 2001
From: Randy Dunlap <rdunlap@infradead.org>
Date: Sat, 30 Sep 2023 15:14:26 -0700
Subject: clk: linux/clk-provider.h: fix kernel-doc warnings and typos

Fix spelling of "Structure".

Fix multiple kernel-doc warnings:

clk-provider.h:269: warning: Function parameter or member 'recalc_rate' not described in 'clk_ops'
clk-provider.h:468: warning: Function parameter or member 'parent_data' not described in 'clk_hw_register_fixed_rate_with_accuracy_parent_data'
clk-provider.h:468: warning: Excess function parameter 'parent_name' description in 'clk_hw_register_fixed_rate_with_accuracy_parent_data'
clk-provider.h:482: warning: Function parameter or member 'parent_data' not described in 'clk_hw_register_fixed_rate_parent_accuracy'
clk-provider.h:482: warning: Excess function parameter 'parent_name' description in 'clk_hw_register_fixed_rate_parent_accuracy'
clk-provider.h:687: warning: Function parameter or member 'flags' not described in 'clk_divider'
clk-provider.h:1164: warning: Function parameter or member 'flags' not described in 'clk_fractional_divider'
clk-provider.h:1164: warning: Function parameter or member 'approximation' not described in 'clk_fractional_divider'
clk-provider.h:1213: warning: Function parameter or member 'flags' not described in 'clk_multiplier'

Fixes: 9fba738a53dd ("clk: add duty cycle support")
Fixes: b2476490ef11 ("clk: introduce the common clock framework")
Fixes: 2d34f09e79c9 ("clk: fixed-rate: Add support for specifying parents via DT/pointers")
Fixes: f5290d8e4f0c ("clk: asm9260: use parent index to link the reference clock")
Fixes: 9d9f78ed9af0 ("clk: basic clock hardware types")
Fixes: e2d0e90fae82 ("clk: new basic clk type for fractional divider")
Fixes: f2e0a53271a4 ("clk: Add a basic multiplier clock")

Signed-off-by: Randy Dunlap <rdunlap@infradead.org>
Cc: Michael Turquette <mturquette@baylibre.com>
Cc: Stephen Boyd <sboyd@kernel.org>
Cc: linux-clk@vger.kernel.org
Link: https://lore.kernel.org/r/20230930221428.18463-1-rdunlap@infradead.org
Signed-off-by: Stephen Boyd <sboyd@kernel.org>
---
 include/linux/clk-provider.h | 15 ++++++++-------
 1 file changed, 8 insertions(+), 7 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/clk-provider.h b/include/linux/clk-provider.h
index ec32ec58c59f..ace3a4ce2fc9 100644
--- a/include/linux/clk-provider.h
+++ b/include/linux/clk-provider.h
@@ -74,7 +74,7 @@ void clk_hw_forward_rate_request(const struct clk_hw *core,
 				 unsigned long parent_rate);
 
 /**
- * struct clk_duty - Struture encoding the duty cycle ratio of a clock
+ * struct clk_duty - Structure encoding the duty cycle ratio of a clock
  *
  * @num:	Numerator of the duty cycle ratio
  * @den:	Denominator of the duty cycle ratio
@@ -129,7 +129,7 @@ struct clk_duty {
  * @restore_context: Restore the context of the clock after a restoration
  *		of power.
  *
- * @recalc_rate	Recalculate the rate of this clock, by querying hardware. The
+ * @recalc_rate: Recalculate the rate of this clock, by querying hardware. The
  *		parent rate is an input parameter.  It is up to the caller to
  *		ensure that the prepare_mutex is held across this call. If the
  *		driver cannot figure out a rate for this clock, it must return
@@ -456,7 +456,7 @@ struct clk *clk_register_fixed_rate(struct device *dev, const char *name,
  * clock with the clock framework
  * @dev: device that is registering this clock
  * @name: name of this clock
- * @parent_name: name of clock's parent
+ * @parent_data: name of clock's parent
  * @flags: framework-specific flags
  * @fixed_rate: non-adjustable clock rate
  * @fixed_accuracy: non-adjustable clock accuracy
@@ -471,7 +471,7 @@ struct clk *clk_register_fixed_rate(struct device *dev, const char *name,
  * the clock framework
  * @dev: device that is registering this clock
  * @name: name of this clock
- * @parent_name: name of clock's parent
+ * @parent_data: name of clock's parent
  * @flags: framework-specific flags
  * @fixed_rate: non-adjustable clock rate
  */
@@ -649,7 +649,7 @@ struct clk_div_table {
  * Clock with an adjustable divider affecting its output frequency.  Implements
  * .recalc_rate, .set_rate and .round_rate
  *
- * Flags:
+ * @flags:
  * CLK_DIVIDER_ONE_BASED - by default the divisor is the value read from the
  *	register plus one.  If CLK_DIVIDER_ONE_BASED is set then the divider is
  *	the raw value read from the register, with the value of zero considered
@@ -1130,11 +1130,12 @@ struct clk_hw *clk_hw_register_fixed_factor_parent_hw(struct device *dev,
  * @mwidth:	width of the numerator bit field
  * @nshift:	shift to the denominator bit field
  * @nwidth:	width of the denominator bit field
+ * @approximation: clk driver's callback for calculating the divider clock
  * @lock:	register lock
  *
  * Clock with adjustable fractional divider affecting its output frequency.
  *
- * Flags:
+ * @flags:
  * CLK_FRAC_DIVIDER_ZERO_BASED - by default the numerator and denominator
  *	is the value read from the register. If CLK_FRAC_DIVIDER_ZERO_BASED
  *	is set then the numerator and denominator are both the value read
@@ -1191,7 +1192,7 @@ void clk_hw_unregister_fractional_divider(struct clk_hw *hw);
  * Clock with an adjustable multiplier affecting its output frequency.
  * Implements .recalc_rate, .set_rate and .round_rate
  *
- * Flags:
+ * @flags:
  * CLK_MULTIPLIER_ZERO_BYPASS - By default, the multiplier is the value read
  *	from the register, with 0 being a valid value effectively
  *	zeroing the output clock rate. If CLK_MULTIPLIER_ZERO_BYPASS is
-- 
cgit v1.2.3


From 787650cc335201a0489905c5504a9179470ebc51 Mon Sep 17 00:00:00 2001
From: Kees Cook <keescook@chromium.org>
Date: Fri, 13 Oct 2023 20:04:12 -0700
Subject: Input: Annotate struct ff_device with __counted_by

Prepare for the coming implementation by GCC and Clang of the __counted_by
attribute. Flexible array members annotated with __counted_by can have
their accesses bounds-checked at run-time via CONFIG_UBSAN_BOUNDS (for
array indexing) and CONFIG_FORTIFY_SOURCE (for strcpy/memcpy-family
functions).

As found with Coccinelle[1], add __counted_by for struct ff_device.

Signed-off-by: Kees Cook <keescook@chromium.org>
Reviewed-by: Gustavo A. R. Silva <gustavoars@kernel.org>
Link: https://lore.kernel.org/r/20231006201739.work.350-kees@kernel.org
Signed-off-by: Dmitry Torokhov <dmitry.torokhov@gmail.com>
---
 include/linux/input.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/input.h b/include/linux/input.h
index 49790c1bd2c4..de6503c0edb8 100644
--- a/include/linux/input.h
+++ b/include/linux/input.h
@@ -562,7 +562,7 @@ struct ff_device {
 
 	int max_effects;
 	struct ff_effect *effects;
-	struct file *effect_owners[];
+	struct file *effect_owners[] __counted_by(max_effects);
 };
 
 int input_ff_create(struct input_dev *dev, unsigned int max_effects);
-- 
cgit v1.2.3


From 5b90073defd1a52aa8120403d79f6e0fc10c87ee Mon Sep 17 00:00:00 2001
From: Weili Qian <qianweili@huawei.com>
Date: Sun, 8 Oct 2023 20:36:17 +0800
Subject: crypto: hisilicon/qm - alloc buffer to set and get xqc

If the temporarily applied memory is used to set or get the xqc
information, the driver releases the memory immediately after the
hardware mailbox operation time exceeds the driver waiting time.
However, the hardware does not cancel the operation, so the hardware
may write data to released memory.

Therefore, when the driver is bound to a device, the driver reserves
memory for the xqc configuration. The subsequent xqc configuration
uses the reserved memory to prevent hardware from accessing the
released memory.

Signed-off-by: Weili Qian <qianweili@huawei.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 drivers/crypto/hisilicon/debugfs.c   |  75 ++++----
 drivers/crypto/hisilicon/qm.c        | 332 ++++++++++++++++-------------------
 drivers/crypto/hisilicon/qm_common.h |   5 +-
 include/linux/hisi_acc_qm.h          |  13 ++
 4 files changed, 191 insertions(+), 234 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/crypto/hisilicon/debugfs.c b/drivers/crypto/hisilicon/debugfs.c
index 2cc1591949db..7e8186fe0512 100644
--- a/drivers/crypto/hisilicon/debugfs.c
+++ b/drivers/crypto/hisilicon/debugfs.c
@@ -137,8 +137,8 @@ static void dump_show(struct hisi_qm *qm, void *info,
 static int qm_sqc_dump(struct hisi_qm *qm, char *s, char *name)
 {
 	struct device *dev = &qm->pdev->dev;
-	struct qm_sqc *sqc, *sqc_curr;
-	dma_addr_t sqc_dma;
+	struct qm_sqc *sqc_curr;
+	struct qm_sqc sqc;
 	u32 qp_id;
 	int ret;
 
@@ -151,35 +151,29 @@ static int qm_sqc_dump(struct hisi_qm *qm, char *s, char *name)
 		return -EINVAL;
 	}
 
-	sqc = hisi_qm_ctx_alloc(qm, sizeof(*sqc), &sqc_dma);
-	if (IS_ERR(sqc))
-		return PTR_ERR(sqc);
+	ret = qm_set_and_get_xqc(qm, QM_MB_CMD_SQC, &sqc, qp_id, 1);
+	if (!ret) {
+		dump_show(qm, &sqc, sizeof(struct qm_sqc), name);
 
-	ret = hisi_qm_mb(qm, QM_MB_CMD_SQC, sqc_dma, qp_id, 1);
-	if (ret) {
-		down_read(&qm->qps_lock);
-		if (qm->sqc) {
-			sqc_curr = qm->sqc + qp_id;
+		return 0;
+	}
 
-			dump_show(qm, sqc_curr, sizeof(*sqc), "SOFT SQC");
-		}
-		up_read(&qm->qps_lock);
+	down_read(&qm->qps_lock);
+	if (qm->sqc) {
+		sqc_curr = qm->sqc + qp_id;
 
-		goto free_ctx;
+		dump_show(qm, sqc_curr, sizeof(*sqc_curr), "SOFT SQC");
 	}
+	up_read(&qm->qps_lock);
 
-	dump_show(qm, sqc, sizeof(*sqc), name);
-
-free_ctx:
-	hisi_qm_ctx_free(qm, sizeof(*sqc), sqc, &sqc_dma);
 	return 0;
 }
 
 static int qm_cqc_dump(struct hisi_qm *qm, char *s, char *name)
 {
 	struct device *dev = &qm->pdev->dev;
-	struct qm_cqc *cqc, *cqc_curr;
-	dma_addr_t cqc_dma;
+	struct qm_cqc *cqc_curr;
+	struct qm_cqc cqc;
 	u32 qp_id;
 	int ret;
 
@@ -192,34 +186,29 @@ static int qm_cqc_dump(struct hisi_qm *qm, char *s, char *name)
 		return -EINVAL;
 	}
 
-	cqc = hisi_qm_ctx_alloc(qm, sizeof(*cqc), &cqc_dma);
-	if (IS_ERR(cqc))
-		return PTR_ERR(cqc);
+	ret = qm_set_and_get_xqc(qm, QM_MB_CMD_CQC, &cqc, qp_id, 1);
+	if (!ret) {
+		dump_show(qm, &cqc, sizeof(struct qm_cqc), name);
 
-	ret = hisi_qm_mb(qm, QM_MB_CMD_CQC, cqc_dma, qp_id, 1);
-	if (ret) {
-		down_read(&qm->qps_lock);
-		if (qm->cqc) {
-			cqc_curr = qm->cqc + qp_id;
+		return 0;
+	}
 
-			dump_show(qm, cqc_curr, sizeof(*cqc), "SOFT CQC");
-		}
-		up_read(&qm->qps_lock);
+	down_read(&qm->qps_lock);
+	if (qm->cqc) {
+		cqc_curr = qm->cqc + qp_id;
 
-		goto free_ctx;
+		dump_show(qm, cqc_curr, sizeof(*cqc_curr), "SOFT CQC");
 	}
+	up_read(&qm->qps_lock);
 
-	dump_show(qm, cqc, sizeof(*cqc), name);
-
-free_ctx:
-	hisi_qm_ctx_free(qm, sizeof(*cqc), cqc, &cqc_dma);
 	return 0;
 }
 
 static int qm_eqc_aeqc_dump(struct hisi_qm *qm, char *s, char *name)
 {
 	struct device *dev = &qm->pdev->dev;
-	dma_addr_t xeqc_dma;
+	struct qm_aeqc aeqc;
+	struct qm_eqc eqc;
 	size_t size;
 	void *xeqc;
 	int ret;
@@ -233,23 +222,19 @@ static int qm_eqc_aeqc_dump(struct hisi_qm *qm, char *s, char *name)
 	if (!strcmp(name, "EQC")) {
 		cmd = QM_MB_CMD_EQC;
 		size = sizeof(struct qm_eqc);
+		xeqc = &eqc;
 	} else {
 		cmd = QM_MB_CMD_AEQC;
 		size = sizeof(struct qm_aeqc);
+		xeqc = &aeqc;
 	}
 
-	xeqc = hisi_qm_ctx_alloc(qm, size, &xeqc_dma);
-	if (IS_ERR(xeqc))
-		return PTR_ERR(xeqc);
-
-	ret = hisi_qm_mb(qm, cmd, xeqc_dma, 0, 1);
+	ret = qm_set_and_get_xqc(qm, cmd, xeqc, 0, 1);
 	if (ret)
-		goto err_free_ctx;
+		return ret;
 
 	dump_show(qm, xeqc, size, name);
 
-err_free_ctx:
-	hisi_qm_ctx_free(qm, size, xeqc, &xeqc_dma);
 	return ret;
 }
 
diff --git a/drivers/crypto/hisilicon/qm.c b/drivers/crypto/hisilicon/qm.c
index f3b55c044dd3..a1d0473f1931 100644
--- a/drivers/crypto/hisilicon/qm.c
+++ b/drivers/crypto/hisilicon/qm.c
@@ -46,7 +46,7 @@
 #define QM_QC_PASID_ENABLE_SHIFT	7
 
 #define QM_SQ_TYPE_MASK			GENMASK(3, 0)
-#define QM_SQ_TAIL_IDX(sqc)		((le16_to_cpu((sqc)->w11) >> 6) & 0x1)
+#define QM_SQ_TAIL_IDX(sqc)		((le16_to_cpu((sqc).w11) >> 6) & 0x1)
 
 /* cqc shift */
 #define QM_CQ_HOP_NUM_SHIFT		0
@@ -58,7 +58,7 @@
 
 #define QM_CQE_PHASE(cqe)		(le16_to_cpu((cqe)->w7) & 0x1)
 #define QM_QC_CQE_SIZE			4
-#define QM_CQ_TAIL_IDX(cqc)		((le16_to_cpu((cqc)->w11) >> 6) & 0x1)
+#define QM_CQ_TAIL_IDX(cqc)		((le16_to_cpu((cqc).w11) >> 6) & 0x1)
 
 /* eqc shift */
 #define QM_EQE_AEQE_SIZE		(2UL << 12)
@@ -252,19 +252,6 @@
 #define QM_MK_SQC_DW3_V2(sqe_sz, sq_depth) \
 	((((u32)sq_depth) - 1) | ((u32)ilog2(sqe_sz) << QM_SQ_SQE_SIZE_SHIFT))
 
-#define INIT_QC_COMMON(qc, base, pasid) do {			\
-	(qc)->head = 0;						\
-	(qc)->tail = 0;						\
-	(qc)->base_l = cpu_to_le32(lower_32_bits(base));	\
-	(qc)->base_h = cpu_to_le32(upper_32_bits(base));	\
-	(qc)->dw3 = 0;						\
-	(qc)->w8 = 0;						\
-	(qc)->rsvd0 = 0;					\
-	(qc)->pasid = cpu_to_le16(pasid);			\
-	(qc)->w11 = 0;						\
-	(qc)->rsvd1 = 0;					\
-} while (0)
-
 enum vft_type {
 	SQC_VFT = 0,
 	CQC_VFT,
@@ -686,6 +673,59 @@ int hisi_qm_mb(struct hisi_qm *qm, u8 cmd, dma_addr_t dma_addr, u16 queue,
 }
 EXPORT_SYMBOL_GPL(hisi_qm_mb);
 
+/* op 0: set xqc information to hardware, 1: get xqc information from hardware. */
+int qm_set_and_get_xqc(struct hisi_qm *qm, u8 cmd, void *xqc, u32 qp_id, bool op)
+{
+	struct hisi_qm *pf_qm = pci_get_drvdata(pci_physfn(qm->pdev));
+	struct qm_mailbox mailbox;
+	dma_addr_t xqc_dma;
+	void *tmp_xqc;
+	size_t size;
+	int ret;
+
+	switch (cmd) {
+	case QM_MB_CMD_SQC:
+		size = sizeof(struct qm_sqc);
+		tmp_xqc = qm->xqc_buf.sqc;
+		xqc_dma = qm->xqc_buf.sqc_dma;
+		break;
+	case QM_MB_CMD_CQC:
+		size = sizeof(struct qm_cqc);
+		tmp_xqc = qm->xqc_buf.cqc;
+		xqc_dma = qm->xqc_buf.cqc_dma;
+		break;
+	case QM_MB_CMD_EQC:
+		size = sizeof(struct qm_eqc);
+		tmp_xqc = qm->xqc_buf.eqc;
+		xqc_dma = qm->xqc_buf.eqc_dma;
+		break;
+	case QM_MB_CMD_AEQC:
+		size = sizeof(struct qm_aeqc);
+		tmp_xqc = qm->xqc_buf.aeqc;
+		xqc_dma = qm->xqc_buf.aeqc_dma;
+		break;
+	}
+
+	/* Setting xqc will fail if master OOO is blocked. */
+	if (qm_check_dev_error(pf_qm)) {
+		dev_err(&qm->pdev->dev, "failed to send mailbox since qm is stop!\n");
+		return -EIO;
+	}
+
+	mutex_lock(&qm->mailbox_lock);
+	if (!op)
+		memcpy(tmp_xqc, xqc, size);
+
+	qm_mb_pre_init(&mailbox, cmd, xqc_dma, qp_id, op);
+	ret = qm_mb_nolock(qm, &mailbox);
+	if (!ret && op)
+		memcpy(xqc, tmp_xqc, size);
+
+	mutex_unlock(&qm->mailbox_lock);
+
+	return ret;
+}
+
 static void qm_db_v1(struct hisi_qm *qm, u16 qn, u8 cmd, u16 index, u8 priority)
 {
 	u64 doorbell;
@@ -1321,45 +1361,6 @@ static int qm_get_vft_v2(struct hisi_qm *qm, u32 *base, u32 *number)
 	return 0;
 }
 
-void *hisi_qm_ctx_alloc(struct hisi_qm *qm, size_t ctx_size,
-			  dma_addr_t *dma_addr)
-{
-	struct device *dev = &qm->pdev->dev;
-	void *ctx_addr;
-
-	ctx_addr = kzalloc(ctx_size, GFP_KERNEL);
-	if (!ctx_addr)
-		return ERR_PTR(-ENOMEM);
-
-	*dma_addr = dma_map_single(dev, ctx_addr, ctx_size, DMA_FROM_DEVICE);
-	if (dma_mapping_error(dev, *dma_addr)) {
-		dev_err(dev, "DMA mapping error!\n");
-		kfree(ctx_addr);
-		return ERR_PTR(-ENOMEM);
-	}
-
-	return ctx_addr;
-}
-
-void hisi_qm_ctx_free(struct hisi_qm *qm, size_t ctx_size,
-			const void *ctx_addr, dma_addr_t *dma_addr)
-{
-	struct device *dev = &qm->pdev->dev;
-
-	dma_unmap_single(dev, *dma_addr, ctx_size, DMA_FROM_DEVICE);
-	kfree(ctx_addr);
-}
-
-static int qm_dump_sqc_raw(struct hisi_qm *qm, dma_addr_t dma_addr, u16 qp_id)
-{
-	return hisi_qm_mb(qm, QM_MB_CMD_SQC, dma_addr, qp_id, 1);
-}
-
-static int qm_dump_cqc_raw(struct hisi_qm *qm, dma_addr_t dma_addr, u16 qp_id)
-{
-	return hisi_qm_mb(qm, QM_MB_CMD_CQC, dma_addr, qp_id, 1);
-}
-
 static void qm_hw_error_init_v1(struct hisi_qm *qm)
 {
 	writel(QM_ABNORMAL_INT_MASK_VALUE, qm->io_base + QM_ABNORMAL_INT_MASK);
@@ -1952,84 +1953,51 @@ static void hisi_qm_release_qp(struct hisi_qp *qp)
 static int qm_sq_ctx_cfg(struct hisi_qp *qp, int qp_id, u32 pasid)
 {
 	struct hisi_qm *qm = qp->qm;
-	struct device *dev = &qm->pdev->dev;
 	enum qm_hw_ver ver = qm->ver;
-	struct qm_sqc *sqc;
-	dma_addr_t sqc_dma;
-	int ret;
+	struct qm_sqc sqc = {0};
 
-	sqc = kzalloc(sizeof(struct qm_sqc), GFP_KERNEL);
-	if (!sqc)
-		return -ENOMEM;
-
-	INIT_QC_COMMON(sqc, qp->sqe_dma, pasid);
 	if (ver == QM_HW_V1) {
-		sqc->dw3 = cpu_to_le32(QM_MK_SQC_DW3_V1(0, 0, 0, qm->sqe_size));
-		sqc->w8 = cpu_to_le16(qp->sq_depth - 1);
+		sqc.dw3 = cpu_to_le32(QM_MK_SQC_DW3_V1(0, 0, 0, qm->sqe_size));
+		sqc.w8 = cpu_to_le16(qp->sq_depth - 1);
 	} else {
-		sqc->dw3 = cpu_to_le32(QM_MK_SQC_DW3_V2(qm->sqe_size, qp->sq_depth));
-		sqc->w8 = 0; /* rand_qc */
+		sqc.dw3 = cpu_to_le32(QM_MK_SQC_DW3_V2(qm->sqe_size, qp->sq_depth));
+		sqc.w8 = 0; /* rand_qc */
 	}
-	sqc->cq_num = cpu_to_le16(qp_id);
-	sqc->w13 = cpu_to_le16(QM_MK_SQC_W13(0, 1, qp->alg_type));
+	sqc.w13 = cpu_to_le16(QM_MK_SQC_W13(0, 1, qp->alg_type));
+	sqc.base_l = cpu_to_le32(lower_32_bits(qp->sqe_dma));
+	sqc.base_h = cpu_to_le32(upper_32_bits(qp->sqe_dma));
+	sqc.cq_num = cpu_to_le16(qp_id);
+	sqc.pasid = cpu_to_le16(pasid);
 
 	if (ver >= QM_HW_V3 && qm->use_sva && !qp->is_in_kernel)
-		sqc->w11 = cpu_to_le16(QM_QC_PASID_ENABLE <<
-				       QM_QC_PASID_ENABLE_SHIFT);
-
-	sqc_dma = dma_map_single(dev, sqc, sizeof(struct qm_sqc),
-				 DMA_TO_DEVICE);
-	if (dma_mapping_error(dev, sqc_dma)) {
-		kfree(sqc);
-		return -ENOMEM;
-	}
+		sqc.w11 = cpu_to_le16(QM_QC_PASID_ENABLE <<
+				      QM_QC_PASID_ENABLE_SHIFT);
 
-	ret = hisi_qm_mb(qm, QM_MB_CMD_SQC, sqc_dma, qp_id, 0);
-	dma_unmap_single(dev, sqc_dma, sizeof(struct qm_sqc), DMA_TO_DEVICE);
-	kfree(sqc);
-
-	return ret;
+	return qm_set_and_get_xqc(qm, QM_MB_CMD_SQC, &sqc, qp_id, 0);
 }
 
 static int qm_cq_ctx_cfg(struct hisi_qp *qp, int qp_id, u32 pasid)
 {
 	struct hisi_qm *qm = qp->qm;
-	struct device *dev = &qm->pdev->dev;
 	enum qm_hw_ver ver = qm->ver;
-	struct qm_cqc *cqc;
-	dma_addr_t cqc_dma;
-	int ret;
-
-	cqc = kzalloc(sizeof(struct qm_cqc), GFP_KERNEL);
-	if (!cqc)
-		return -ENOMEM;
+	struct qm_cqc cqc = {0};
 
-	INIT_QC_COMMON(cqc, qp->cqe_dma, pasid);
 	if (ver == QM_HW_V1) {
-		cqc->dw3 = cpu_to_le32(QM_MK_CQC_DW3_V1(0, 0, 0,
-							QM_QC_CQE_SIZE));
-		cqc->w8 = cpu_to_le16(qp->cq_depth - 1);
+		cqc.dw3 = cpu_to_le32(QM_MK_CQC_DW3_V1(0, 0, 0, QM_QC_CQE_SIZE));
+		cqc.w8 = cpu_to_le16(qp->cq_depth - 1);
 	} else {
-		cqc->dw3 = cpu_to_le32(QM_MK_CQC_DW3_V2(QM_QC_CQE_SIZE, qp->cq_depth));
-		cqc->w8 = 0; /* rand_qc */
+		cqc.dw3 = cpu_to_le32(QM_MK_CQC_DW3_V2(QM_QC_CQE_SIZE, qp->cq_depth));
+		cqc.w8 = 0; /* rand_qc */
 	}
-	cqc->dw6 = cpu_to_le32(1 << QM_CQ_PHASE_SHIFT | 1 << QM_CQ_FLAG_SHIFT);
+	cqc.dw6 = cpu_to_le32(1 << QM_CQ_PHASE_SHIFT | 1 << QM_CQ_FLAG_SHIFT);
+	cqc.base_l = cpu_to_le32(lower_32_bits(qp->cqe_dma));
+	cqc.base_h = cpu_to_le32(upper_32_bits(qp->cqe_dma));
+	cqc.pasid = cpu_to_le16(pasid);
 
 	if (ver >= QM_HW_V3 && qm->use_sva && !qp->is_in_kernel)
-		cqc->w11 = cpu_to_le16(QM_QC_PASID_ENABLE);
-
-	cqc_dma = dma_map_single(dev, cqc, sizeof(struct qm_cqc),
-				 DMA_TO_DEVICE);
-	if (dma_mapping_error(dev, cqc_dma)) {
-		kfree(cqc);
-		return -ENOMEM;
-	}
+		cqc.w11 = cpu_to_le16(QM_QC_PASID_ENABLE);
 
-	ret = hisi_qm_mb(qm, QM_MB_CMD_CQC, cqc_dma, qp_id, 0);
-	dma_unmap_single(dev, cqc_dma, sizeof(struct qm_cqc), DMA_TO_DEVICE);
-	kfree(cqc);
-
-	return ret;
+	return qm_set_and_get_xqc(qm, QM_MB_CMD_CQC, &cqc, qp_id, 0);
 }
 
 static int qm_qp_ctx_cfg(struct hisi_qp *qp, int qp_id, u32 pasid)
@@ -2119,14 +2087,11 @@ static void qp_stop_fail_cb(struct hisi_qp *qp)
  */
 static int qm_drain_qp(struct hisi_qp *qp)
 {
-	size_t size = sizeof(struct qm_sqc) + sizeof(struct qm_cqc);
 	struct hisi_qm *qm = qp->qm;
 	struct device *dev = &qm->pdev->dev;
-	struct qm_sqc *sqc;
-	struct qm_cqc *cqc;
-	dma_addr_t dma_addr;
-	int ret = 0, i = 0;
-	void *addr;
+	struct qm_sqc sqc;
+	struct qm_cqc cqc;
+	int ret, i = 0;
 
 	/* No need to judge if master OOO is blocked. */
 	if (qm_check_dev_error(qm))
@@ -2140,44 +2105,32 @@ static int qm_drain_qp(struct hisi_qp *qp)
 		return ret;
 	}
 
-	addr = hisi_qm_ctx_alloc(qm, size, &dma_addr);
-	if (IS_ERR(addr)) {
-		dev_err(dev, "Failed to alloc ctx for sqc and cqc!\n");
-		return -ENOMEM;
-	}
-
 	while (++i) {
-		ret = qm_dump_sqc_raw(qm, dma_addr, qp->qp_id);
+		ret = qm_set_and_get_xqc(qm, QM_MB_CMD_SQC, &sqc, qp->qp_id, 1);
 		if (ret) {
 			dev_err_ratelimited(dev, "Failed to dump sqc!\n");
-			break;
+			return ret;
 		}
-		sqc = addr;
 
-		ret = qm_dump_cqc_raw(qm, (dma_addr + sizeof(struct qm_sqc)),
-				      qp->qp_id);
+		ret = qm_set_and_get_xqc(qm, QM_MB_CMD_CQC, &cqc, qp->qp_id, 1);
 		if (ret) {
 			dev_err_ratelimited(dev, "Failed to dump cqc!\n");
-			break;
+			return ret;
 		}
-		cqc = addr + sizeof(struct qm_sqc);
 
-		if ((sqc->tail == cqc->tail) &&
+		if ((sqc.tail == cqc.tail) &&
 		    (QM_SQ_TAIL_IDX(sqc) == QM_CQ_TAIL_IDX(cqc)))
 			break;
 
 		if (i == MAX_WAIT_COUNTS) {
 			dev_err(dev, "Fail to empty queue %u!\n", qp->qp_id);
-			ret = -EBUSY;
-			break;
+			return -EBUSY;
 		}
 
 		usleep_range(WAIT_PERIOD_US_MIN, WAIT_PERIOD_US_MAX);
 	}
 
-	hisi_qm_ctx_free(qm, size, addr, &dma_addr);
-
-	return ret;
+	return 0;
 }
 
 static int qm_stop_qp_nolock(struct hisi_qp *qp)
@@ -2889,11 +2842,20 @@ static void hisi_qm_unint_work(struct hisi_qm *qm)
 	destroy_workqueue(qm->wq);
 }
 
+static void hisi_qm_free_rsv_buf(struct hisi_qm *qm)
+{
+	struct qm_dma *xqc_dma = &qm->xqc_buf.qcdma;
+	struct device *dev = &qm->pdev->dev;
+
+	dma_free_coherent(dev, xqc_dma->size, xqc_dma->va, xqc_dma->dma);
+}
+
 static void hisi_qm_memory_uninit(struct hisi_qm *qm)
 {
 	struct device *dev = &qm->pdev->dev;
 
 	hisi_qp_memory_uninit(qm, qm->qp_num);
+	hisi_qm_free_rsv_buf(qm);
 	if (qm->qdma.va) {
 		hisi_qm_cache_wb(qm);
 		dma_free_coherent(dev, qm->qdma.size,
@@ -3015,62 +2977,26 @@ static void qm_disable_eq_aeq_interrupts(struct hisi_qm *qm)
 
 static int qm_eq_ctx_cfg(struct hisi_qm *qm)
 {
-	struct device *dev = &qm->pdev->dev;
-	struct qm_eqc *eqc;
-	dma_addr_t eqc_dma;
-	int ret;
-
-	eqc = kzalloc(sizeof(struct qm_eqc), GFP_KERNEL);
-	if (!eqc)
-		return -ENOMEM;
+	struct qm_eqc eqc = {0};
 
-	eqc->base_l = cpu_to_le32(lower_32_bits(qm->eqe_dma));
-	eqc->base_h = cpu_to_le32(upper_32_bits(qm->eqe_dma));
+	eqc.base_l = cpu_to_le32(lower_32_bits(qm->eqe_dma));
+	eqc.base_h = cpu_to_le32(upper_32_bits(qm->eqe_dma));
 	if (qm->ver == QM_HW_V1)
-		eqc->dw3 = cpu_to_le32(QM_EQE_AEQE_SIZE);
-	eqc->dw6 = cpu_to_le32(((u32)qm->eq_depth - 1) | (1 << QM_EQC_PHASE_SHIFT));
+		eqc.dw3 = cpu_to_le32(QM_EQE_AEQE_SIZE);
+	eqc.dw6 = cpu_to_le32(((u32)qm->eq_depth - 1) | (1 << QM_EQC_PHASE_SHIFT));
 
-	eqc_dma = dma_map_single(dev, eqc, sizeof(struct qm_eqc),
-				 DMA_TO_DEVICE);
-	if (dma_mapping_error(dev, eqc_dma)) {
-		kfree(eqc);
-		return -ENOMEM;
-	}
-
-	ret = hisi_qm_mb(qm, QM_MB_CMD_EQC, eqc_dma, 0, 0);
-	dma_unmap_single(dev, eqc_dma, sizeof(struct qm_eqc), DMA_TO_DEVICE);
-	kfree(eqc);
-
-	return ret;
+	return qm_set_and_get_xqc(qm, QM_MB_CMD_EQC, &eqc, 0, 0);
 }
 
 static int qm_aeq_ctx_cfg(struct hisi_qm *qm)
 {
-	struct device *dev = &qm->pdev->dev;
-	struct qm_aeqc *aeqc;
-	dma_addr_t aeqc_dma;
-	int ret;
+	struct qm_aeqc aeqc = {0};
 
-	aeqc = kzalloc(sizeof(struct qm_aeqc), GFP_KERNEL);
-	if (!aeqc)
-		return -ENOMEM;
-
-	aeqc->base_l = cpu_to_le32(lower_32_bits(qm->aeqe_dma));
-	aeqc->base_h = cpu_to_le32(upper_32_bits(qm->aeqe_dma));
-	aeqc->dw6 = cpu_to_le32(((u32)qm->aeq_depth - 1) | (1 << QM_EQC_PHASE_SHIFT));
+	aeqc.base_l = cpu_to_le32(lower_32_bits(qm->aeqe_dma));
+	aeqc.base_h = cpu_to_le32(upper_32_bits(qm->aeqe_dma));
+	aeqc.dw6 = cpu_to_le32(((u32)qm->aeq_depth - 1) | (1 << QM_EQC_PHASE_SHIFT));
 
-	aeqc_dma = dma_map_single(dev, aeqc, sizeof(struct qm_aeqc),
-				  DMA_TO_DEVICE);
-	if (dma_mapping_error(dev, aeqc_dma)) {
-		kfree(aeqc);
-		return -ENOMEM;
-	}
-
-	ret = hisi_qm_mb(qm, QM_MB_CMD_AEQC, aeqc_dma, 0, 0);
-	dma_unmap_single(dev, aeqc_dma, sizeof(struct qm_aeqc), DMA_TO_DEVICE);
-	kfree(aeqc);
-
-	return ret;
+	return qm_set_and_get_xqc(qm, QM_MB_CMD_AEQC, &aeqc, 0, 0);
 }
 
 static int qm_eq_aeq_ctx_cfg(struct hisi_qm *qm)
@@ -5296,6 +5222,36 @@ err_init_qp_mem:
 	return ret;
 }
 
+static int hisi_qm_alloc_rsv_buf(struct hisi_qm *qm)
+{
+	struct qm_rsv_buf *xqc_buf = &qm->xqc_buf;
+	struct qm_dma *xqc_dma = &xqc_buf->qcdma;
+	struct device *dev = &qm->pdev->dev;
+	size_t off = 0;
+
+#define QM_XQC_BUF_INIT(xqc_buf, type) do { \
+	(xqc_buf)->type = ((xqc_buf)->qcdma.va + (off)); \
+	(xqc_buf)->type##_dma = (xqc_buf)->qcdma.dma + (off); \
+	off += QMC_ALIGN(sizeof(struct qm_##type)); \
+} while (0)
+
+	xqc_dma->size = QMC_ALIGN(sizeof(struct qm_eqc)) +
+			QMC_ALIGN(sizeof(struct qm_aeqc)) +
+			QMC_ALIGN(sizeof(struct qm_sqc)) +
+			QMC_ALIGN(sizeof(struct qm_cqc));
+	xqc_dma->va = dma_alloc_coherent(dev, xqc_dma->size,
+					 &xqc_dma->dma, GFP_KERNEL);
+	if (!xqc_dma->va)
+		return -ENOMEM;
+
+	QM_XQC_BUF_INIT(xqc_buf, eqc);
+	QM_XQC_BUF_INIT(xqc_buf, aeqc);
+	QM_XQC_BUF_INIT(xqc_buf, sqc);
+	QM_XQC_BUF_INIT(xqc_buf, cqc);
+
+	return 0;
+}
+
 static int hisi_qm_memory_init(struct hisi_qm *qm)
 {
 	struct device *dev = &qm->pdev->dev;
@@ -5337,13 +5293,19 @@ static int hisi_qm_memory_init(struct hisi_qm *qm)
 	QM_INIT_BUF(qm, sqc, qm->qp_num);
 	QM_INIT_BUF(qm, cqc, qm->qp_num);
 
+	ret = hisi_qm_alloc_rsv_buf(qm);
+	if (ret)
+		goto err_free_qdma;
+
 	ret = hisi_qp_alloc_memory(qm);
 	if (ret)
-		goto err_alloc_qp_array;
+		goto err_free_reserve_buf;
 
 	return 0;
 
-err_alloc_qp_array:
+err_free_reserve_buf:
+	hisi_qm_free_rsv_buf(qm);
+err_free_qdma:
 	dma_free_coherent(dev, qm->qdma.size, qm->qdma.va, qm->qdma.dma);
 err_destroy_idr:
 	idr_destroy(&qm->qp_idr);
diff --git a/drivers/crypto/hisilicon/qm_common.h b/drivers/crypto/hisilicon/qm_common.h
index 8e36aa9c681b..7b0b15c83ec1 100644
--- a/drivers/crypto/hisilicon/qm_common.h
+++ b/drivers/crypto/hisilicon/qm_common.h
@@ -76,10 +76,7 @@ static const char * const qm_s[] = {
 	"init", "start", "close", "stop",
 };
 
-void *hisi_qm_ctx_alloc(struct hisi_qm *qm, size_t ctx_size,
-			dma_addr_t *dma_addr);
-void hisi_qm_ctx_free(struct hisi_qm *qm, size_t ctx_size,
-		      const void *ctx_addr, dma_addr_t *dma_addr);
+int qm_set_and_get_xqc(struct hisi_qm *qm, u8 cmd, void *xqc, u32 qp_id, bool op);
 void hisi_qm_show_last_dfx_regs(struct hisi_qm *qm);
 void hisi_qm_set_algqos_init(struct hisi_qm *qm);
 
diff --git a/include/linux/hisi_acc_qm.h b/include/linux/hisi_acc_qm.h
index 34c64a02712c..44e0c44a2e20 100644
--- a/include/linux/hisi_acc_qm.h
+++ b/include/linux/hisi_acc_qm.h
@@ -292,6 +292,18 @@ struct qm_err_isolate {
 	struct list_head qm_hw_errs;
 };
 
+struct qm_rsv_buf {
+	struct qm_sqc *sqc;
+	struct qm_cqc *cqc;
+	struct qm_eqc *eqc;
+	struct qm_aeqc *aeqc;
+	dma_addr_t sqc_dma;
+	dma_addr_t cqc_dma;
+	dma_addr_t eqc_dma;
+	dma_addr_t aeqc_dma;
+	struct qm_dma qcdma;
+};
+
 struct hisi_qm {
 	enum qm_hw_ver ver;
 	enum qm_fun_type fun_type;
@@ -324,6 +336,7 @@ struct hisi_qm {
 	dma_addr_t cqc_dma;
 	dma_addr_t eqe_dma;
 	dma_addr_t aeqe_dma;
+	struct qm_rsv_buf xqc_buf;
 
 	struct hisi_qm_status status;
 	const struct hisi_qm_err_ini *err_ini;
-- 
cgit v1.2.3


From 886ee55eabac0d46faf8bc0b22207ca2740847ba Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@kernel.org>
Date: Fri, 13 Oct 2023 10:15:46 +0200
Subject: locking/seqlock: Propagate 'const' pointers within read-only methods,
 remove forced type casts

Currently __seqprop_ptr() is an inline function that must chose to either
use 'const' or non-const seqcount related pointers - but this results in
the undesirable loss of 'const' propagation, via a forced type cast.

The easiest solution would be to turn the pointer wrappers into macros that
pass through whatever type is passed to them - but the clever maze of
seqlock API instantiation macros relies on the GCC CPP '##' macro
extension, which isn't recursive, so inline functions must be used here.

So create two wrapper variants instead: 'ptr' and 'const_ptr', and pick the
right one for the codepaths that are const: read_seqcount_begin() and
read_seqcount_retry().

This cleans up type handling and allows the removal of all type forcing.

No change in functionality.

Signed-off-by: Ingo Molnar <mingo@kernel.org>
Reviewed-by: Oleg Nesterov <oleg@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Waiman Long <longman@redhat.com>
Cc: Will Deacon <will.deacon@arm.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Paul E. McKenney <paulmck@kernel.org>
---
 include/linux/seqlock.h | 26 +++++++++++++++++++-------
 1 file changed, 19 insertions(+), 7 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/seqlock.h b/include/linux/seqlock.h
index 4b8dcd3a0d93..80f21d2ca2aa 100644
--- a/include/linux/seqlock.h
+++ b/include/linux/seqlock.h
@@ -200,9 +200,15 @@ typedef struct seqcount_##lockname {					\
 } seqcount_##lockname##_t;						\
 									\
 static __always_inline seqcount_t *					\
-__seqprop_##lockname##_ptr(const seqcount_##lockname##_t *s)		\
+__seqprop_##lockname##_ptr(seqcount_##lockname##_t *s)			\
 {									\
-	return (void *)&s->seqcount; /* drop const */			\
+	return &s->seqcount;						\
+}									\
+									\
+static __always_inline const seqcount_t *				\
+__seqprop_##lockname##_const_ptr(const seqcount_##lockname##_t *s)	\
+{									\
+	return &s->seqcount;						\
 }									\
 									\
 static __always_inline unsigned						\
@@ -247,9 +253,14 @@ __seqprop_##lockname##_assert(const seqcount_##lockname##_t *s)		\
  * __seqprop() for seqcount_t
  */
 
-static inline seqcount_t *__seqprop_ptr(const seqcount_t *s)
+static inline seqcount_t *__seqprop_ptr(seqcount_t *s)
+{
+	return s;
+}
+
+static inline const seqcount_t *__seqprop_const_ptr(const seqcount_t *s)
 {
-	return (void *)s; /* drop const */
+	return s;
 }
 
 static inline unsigned __seqprop_sequence(const seqcount_t *s)
@@ -302,6 +313,7 @@ SEQCOUNT_LOCKNAME(mutex,        struct mutex,    true,     mutex)
 	__seqprop_case((s),	mutex,		prop))
 
 #define seqprop_ptr(s)			__seqprop(s, ptr)(s)
+#define seqprop_const_ptr(s)		__seqprop(s, const_ptr)(s)
 #define seqprop_sequence(s)		__seqprop(s, sequence)(s)
 #define seqprop_preemptible(s)		__seqprop(s, preemptible)(s)
 #define seqprop_assert(s)		__seqprop(s, assert)(s)
@@ -353,7 +365,7 @@ SEQCOUNT_LOCKNAME(mutex,        struct mutex,    true,     mutex)
  */
 #define read_seqcount_begin(s)						\
 ({									\
-	seqcount_lockdep_reader_access(seqprop_ptr(s));			\
+	seqcount_lockdep_reader_access(seqprop_const_ptr(s));		\
 	raw_read_seqcount_begin(s);					\
 })
 
@@ -419,7 +431,7 @@ SEQCOUNT_LOCKNAME(mutex,        struct mutex,    true,     mutex)
  * Return: true if a read section retry is required, else false
  */
 #define __read_seqcount_retry(s, start)					\
-	do___read_seqcount_retry(seqprop_ptr(s), start)
+	do___read_seqcount_retry(seqprop_const_ptr(s), start)
 
 static inline int do___read_seqcount_retry(const seqcount_t *s, unsigned start)
 {
@@ -439,7 +451,7 @@ static inline int do___read_seqcount_retry(const seqcount_t *s, unsigned start)
  * Return: true if a read section retry is required, else false
  */
 #define read_seqcount_retry(s, start)					\
-	do_read_seqcount_retry(seqprop_ptr(s), start)
+	do_read_seqcount_retry(seqprop_const_ptr(s), start)
 
 static inline int do_read_seqcount_retry(const seqcount_t *s, unsigned start)
 {
-- 
cgit v1.2.3


From 3f7f31fff2510272334f3d0374c432bdaa4f1536 Mon Sep 17 00:00:00 2001
From: Wei Zhang <weizhang@nvidia.com>
Date: Thu, 12 Oct 2023 12:27:36 -0700
Subject: net/mlx5: Parallelize vhca event handling

At present, mlx5 driver have a general purpose
event handler which not only handles vhca event
but also many other events. This incurs a huge
bottleneck because the event handler is
implemented by single threaded workqueue and all
events are forced to be handled in serial manner
even though application tries to create multiple
SFs simultaneously.

Introduce a dedicated vhca event handler which
manages SFs parallel creation.

Signed-off-by: Wei Zhang <weizhang@nvidia.com>
Reviewed-by: Moshe Shemesh <moshe@nvidia.com>
Reviewed-by: Shay Drory <shayd@nvidia.com>
Reviewed-by: Jacob Keller <jacob.e.keller@intel.com>
Signed-off-by: Saeed Mahameed <saeedm@nvidia.com>
---
 drivers/net/ethernet/mellanox/mlx5/core/events.c   |  5 --
 .../net/ethernet/mellanox/mlx5/core/mlx5_core.h    |  3 +-
 .../ethernet/mellanox/mlx5/core/sf/vhca_event.c    | 57 ++++++++++++++++++++--
 include/linux/mlx5/driver.h                        |  1 +
 4 files changed, 57 insertions(+), 9 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/events.c b/drivers/net/ethernet/mellanox/mlx5/core/events.c
index 3ec892d51f57..d91ea53eb394 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/events.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/events.c
@@ -441,8 +441,3 @@ int mlx5_blocking_notifier_call_chain(struct mlx5_core_dev *dev, unsigned int ev
 
 	return blocking_notifier_call_chain(&events->sw_nh, event, data);
 }
-
-void mlx5_events_work_enqueue(struct mlx5_core_dev *dev, struct work_struct *work)
-{
-	queue_work(dev->priv.events->wq, work);
-}
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h b/drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h
index 19ffd1816474..d348a7f9511f 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h
@@ -159,6 +159,8 @@ enum mlx5_semaphore_space_address {
 
 #define MLX5_DEFAULT_PROF       2
 #define MLX5_SF_PROF		3
+#define MLX5_NUM_FW_CMD_THREADS 8
+#define MLX5_DEV_MAX_WQS	MLX5_NUM_FW_CMD_THREADS
 
 static inline int mlx5_flexible_inlen(struct mlx5_core_dev *dev, size_t fixed,
 				      size_t item_size, size_t num_items,
@@ -347,7 +349,6 @@ int mlx5_vport_set_other_func_cap(struct mlx5_core_dev *dev, const void *hca_cap
 #define mlx5_vport_get_other_func_general_cap(dev, vport, out)		\
 	mlx5_vport_get_other_func_cap(dev, vport, out, MLX5_CAP_GENERAL)
 
-void mlx5_events_work_enqueue(struct mlx5_core_dev *dev, struct work_struct *work);
 static inline u32 mlx5_sriov_get_vf_total_msix(struct pci_dev *pdev)
 {
 	struct mlx5_core_dev *dev = pci_get_drvdata(pdev);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/sf/vhca_event.c b/drivers/net/ethernet/mellanox/mlx5/core/sf/vhca_event.c
index d908fba968f0..c6fd729de8b2 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/sf/vhca_event.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/sf/vhca_event.c
@@ -21,6 +21,15 @@ struct mlx5_vhca_event_work {
 	struct mlx5_vhca_state_event event;
 };
 
+struct mlx5_vhca_event_handler {
+	struct workqueue_struct *wq;
+};
+
+struct mlx5_vhca_events {
+	struct mlx5_core_dev *dev;
+	struct mlx5_vhca_event_handler handler[MLX5_DEV_MAX_WQS];
+};
+
 int mlx5_cmd_query_vhca_state(struct mlx5_core_dev *dev, u16 function_id, u32 *out, u32 outlen)
 {
 	u32 in[MLX5_ST_SZ_DW(query_vhca_state_in)] = {};
@@ -99,6 +108,12 @@ static void mlx5_vhca_state_work_handler(struct work_struct *_work)
 	kfree(work);
 }
 
+static void
+mlx5_vhca_events_work_enqueue(struct mlx5_core_dev *dev, int idx, struct work_struct *work)
+{
+	queue_work(dev->priv.vhca_events->handler[idx].wq, work);
+}
+
 static int
 mlx5_vhca_state_change_notifier(struct notifier_block *nb, unsigned long type, void *data)
 {
@@ -106,6 +121,7 @@ mlx5_vhca_state_change_notifier(struct notifier_block *nb, unsigned long type, v
 				mlx5_nb_cof(nb, struct mlx5_vhca_state_notifier, nb);
 	struct mlx5_vhca_event_work *work;
 	struct mlx5_eqe *eqe = data;
+	int wq_idx;
 
 	work = kzalloc(sizeof(*work), GFP_ATOMIC);
 	if (!work)
@@ -113,7 +129,8 @@ mlx5_vhca_state_change_notifier(struct notifier_block *nb, unsigned long type, v
 	INIT_WORK(&work->work, &mlx5_vhca_state_work_handler);
 	work->notifier = notifier;
 	work->event.function_id = be16_to_cpu(eqe->data.vhca_state.function_id);
-	mlx5_events_work_enqueue(notifier->dev, &work->work);
+	wq_idx = work->event.function_id % MLX5_DEV_MAX_WQS;
+	mlx5_vhca_events_work_enqueue(notifier->dev, wq_idx, &work->work);
 	return NOTIFY_OK;
 }
 
@@ -132,28 +149,62 @@ void mlx5_vhca_state_cap_handle(struct mlx5_core_dev *dev, void *set_hca_cap)
 int mlx5_vhca_event_init(struct mlx5_core_dev *dev)
 {
 	struct mlx5_vhca_state_notifier *notifier;
+	char wq_name[MLX5_CMD_WQ_MAX_NAME];
+	struct mlx5_vhca_events *events;
+	int err, i;
 
 	if (!mlx5_vhca_event_supported(dev))
 		return 0;
 
-	notifier = kzalloc(sizeof(*notifier), GFP_KERNEL);
-	if (!notifier)
+	events = kzalloc(sizeof(*events), GFP_KERNEL);
+	if (!events)
 		return -ENOMEM;
 
+	events->dev = dev;
+	dev->priv.vhca_events = events;
+	for (i = 0; i < MLX5_DEV_MAX_WQS; i++) {
+		snprintf(wq_name, MLX5_CMD_WQ_MAX_NAME, "mlx5_vhca_event%d", i);
+		events->handler[i].wq = create_singlethread_workqueue(wq_name);
+		if (!events->handler[i].wq) {
+			err = -ENOMEM;
+			goto err_create_wq;
+		}
+	}
+
+	notifier = kzalloc(sizeof(*notifier), GFP_KERNEL);
+	if (!notifier) {
+		err = -ENOMEM;
+		goto err_notifier;
+	}
+
 	dev->priv.vhca_state_notifier = notifier;
 	notifier->dev = dev;
 	BLOCKING_INIT_NOTIFIER_HEAD(&notifier->n_head);
 	MLX5_NB_INIT(&notifier->nb, mlx5_vhca_state_change_notifier, VHCA_STATE_CHANGE);
 	return 0;
+
+err_notifier:
+err_create_wq:
+	for (--i; i >= 0; i--)
+		destroy_workqueue(events->handler[i].wq);
+	kfree(events);
+	return err;
 }
 
 void mlx5_vhca_event_cleanup(struct mlx5_core_dev *dev)
 {
+	struct mlx5_vhca_events *vhca_events;
+	int i;
+
 	if (!mlx5_vhca_event_supported(dev))
 		return;
 
 	kfree(dev->priv.vhca_state_notifier);
 	dev->priv.vhca_state_notifier = NULL;
+	vhca_events = dev->priv.vhca_events;
+	for (i = 0; i < MLX5_DEV_MAX_WQS; i++)
+		destroy_workqueue(vhca_events->handler[i].wq);
+	kvfree(vhca_events);
 }
 
 void mlx5_vhca_event_start(struct mlx5_core_dev *dev)
diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h
index 52e982bc0f50..7968c5ee85c4 100644
--- a/include/linux/mlx5/driver.h
+++ b/include/linux/mlx5/driver.h
@@ -615,6 +615,7 @@ struct mlx5_priv {
 	int			adev_idx;
 	int			sw_vhca_id;
 	struct mlx5_events      *events;
+	struct mlx5_vhca_events *vhca_events;
 
 	struct mlx5_flow_steering *steering;
 	struct mlx5_mpfs        *mpfs;
-- 
cgit v1.2.3


From e534552c92a44690e48593f9567fe689545ded73 Mon Sep 17 00:00:00 2001
From: Shay Drory <shayd@nvidia.com>
Date: Thu, 12 Oct 2023 12:27:39 -0700
Subject: net/mlx5: Refactor LAG peer device lookout bus logic to mlx5 devcom

LAG peer device lookout bus logic required the usage of global lock,
mlx5_intf_mutex.
As part of the effort to remove this global lock, refactor LAG peer
device lookout to use mlx5 devcom layer.

Signed-off-by: Shay Drory <shayd@nvidia.com>
Reviewed-by: Mark Bloch <mbloch@nvidia.com>
Reviewed-by: Jacob Keller <jacob.e.keller@intel.com>
Signed-off-by: Saeed Mahameed <saeedm@nvidia.com>
---
 drivers/net/ethernet/mellanox/mlx5/core/dev.c      | 68 ----------------------
 drivers/net/ethernet/mellanox/mlx5/core/lag/lag.c  | 12 ++--
 .../net/ethernet/mellanox/mlx5/core/lib/devcom.c   | 14 +++++
 .../net/ethernet/mellanox/mlx5/core/lib/devcom.h   |  4 ++
 drivers/net/ethernet/mellanox/mlx5/core/main.c     | 25 ++++++++
 .../net/ethernet/mellanox/mlx5/core/mlx5_core.h    |  1 -
 include/linux/mlx5/driver.h                        |  1 +
 7 files changed, 52 insertions(+), 73 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/dev.c b/drivers/net/ethernet/mellanox/mlx5/core/dev.c
index 1fc03480c2ff..6e3a8c22881f 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/dev.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/dev.c
@@ -566,74 +566,6 @@ bool mlx5_same_hw_devs(struct mlx5_core_dev *dev, struct mlx5_core_dev *peer_dev
 	return (fsystem_guid && psystem_guid && fsystem_guid == psystem_guid);
 }
 
-static u32 mlx5_gen_pci_id(const struct mlx5_core_dev *dev)
-{
-	return (u32)((pci_domain_nr(dev->pdev->bus) << 16) |
-		     (dev->pdev->bus->number << 8) |
-		     PCI_SLOT(dev->pdev->devfn));
-}
-
-static int _next_phys_dev(struct mlx5_core_dev *mdev,
-			  const struct mlx5_core_dev *curr)
-{
-	if (!mlx5_core_is_pf(mdev))
-		return 0;
-
-	if (mdev == curr)
-		return 0;
-
-	if (!mlx5_same_hw_devs(mdev, (struct mlx5_core_dev *)curr) &&
-	    mlx5_gen_pci_id(mdev) != mlx5_gen_pci_id(curr))
-		return 0;
-
-	return 1;
-}
-
-static void *pci_get_other_drvdata(struct device *this, struct device *other)
-{
-	if (this->driver != other->driver)
-		return NULL;
-
-	return pci_get_drvdata(to_pci_dev(other));
-}
-
-static int next_phys_dev_lag(struct device *dev, const void *data)
-{
-	struct mlx5_core_dev *mdev, *this = (struct mlx5_core_dev *)data;
-
-	mdev = pci_get_other_drvdata(this->device, dev);
-	if (!mdev)
-		return 0;
-
-	if (!mlx5_lag_is_supported(mdev))
-		return 0;
-
-	return _next_phys_dev(mdev, data);
-}
-
-static struct mlx5_core_dev *mlx5_get_next_dev(struct mlx5_core_dev *dev,
-					       int (*match)(struct device *dev, const void *data))
-{
-	struct device *next;
-
-	if (!mlx5_core_is_pf(dev))
-		return NULL;
-
-	next = bus_find_device(&pci_bus_type, NULL, dev, match);
-	if (!next)
-		return NULL;
-
-	put_device(next);
-	return pci_get_drvdata(to_pci_dev(next));
-}
-
-/* Must be called with intf_mutex held */
-struct mlx5_core_dev *mlx5_get_next_phys_dev_lag(struct mlx5_core_dev *dev)
-{
-	lockdep_assert_held(&mlx5_intf_mutex);
-	return mlx5_get_next_dev(dev, &next_phys_dev_lag);
-}
-
 void mlx5_dev_list_lock(void)
 {
 	mutex_lock(&mlx5_intf_mutex);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lag/lag.c b/drivers/net/ethernet/mellanox/mlx5/core/lag/lag.c
index af3fac090b82..f0b57f97739f 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/lag/lag.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/lag/lag.c
@@ -1212,13 +1212,14 @@ static void mlx5_ldev_remove_mdev(struct mlx5_lag *ldev,
 	dev->priv.lag = NULL;
 }
 
-/* Must be called with intf_mutex held */
+/* Must be called with HCA devcom component lock held */
 static int __mlx5_lag_dev_add_mdev(struct mlx5_core_dev *dev)
 {
+	struct mlx5_devcom_comp_dev *pos = NULL;
 	struct mlx5_lag *ldev = NULL;
 	struct mlx5_core_dev *tmp_dev;
 
-	tmp_dev = mlx5_get_next_phys_dev_lag(dev);
+	tmp_dev = mlx5_devcom_get_next_peer_data(dev->priv.hca_devcom_comp, &pos);
 	if (tmp_dev)
 		ldev = mlx5_lag_dev(tmp_dev);
 
@@ -1275,10 +1276,13 @@ void mlx5_lag_add_mdev(struct mlx5_core_dev *dev)
 	if (!mlx5_lag_is_supported(dev))
 		return;
 
+	if (IS_ERR_OR_NULL(dev->priv.hca_devcom_comp))
+		return;
+
 recheck:
-	mlx5_dev_list_lock();
+	mlx5_devcom_comp_lock(dev->priv.hca_devcom_comp);
 	err = __mlx5_lag_dev_add_mdev(dev);
-	mlx5_dev_list_unlock();
+	mlx5_devcom_comp_unlock(dev->priv.hca_devcom_comp);
 
 	if (err) {
 		msleep(100);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lib/devcom.c b/drivers/net/ethernet/mellanox/mlx5/core/lib/devcom.c
index 89ac3209277e..f4d5c300ddd6 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/lib/devcom.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/lib/devcom.c
@@ -387,3 +387,17 @@ void *mlx5_devcom_get_next_peer_data_rcu(struct mlx5_devcom_comp_dev *devcom,
 	*pos = tmp;
 	return data;
 }
+
+void mlx5_devcom_comp_lock(struct mlx5_devcom_comp_dev *devcom)
+{
+	if (IS_ERR_OR_NULL(devcom))
+		return;
+	down_write(&devcom->comp->sem);
+}
+
+void mlx5_devcom_comp_unlock(struct mlx5_devcom_comp_dev *devcom)
+{
+	if (IS_ERR_OR_NULL(devcom))
+		return;
+	up_write(&devcom->comp->sem);
+}
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lib/devcom.h b/drivers/net/ethernet/mellanox/mlx5/core/lib/devcom.h
index 8220d180e33c..f06529cc3c61 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/lib/devcom.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/lib/devcom.h
@@ -9,6 +9,7 @@
 enum mlx5_devcom_component {
 	MLX5_DEVCOM_ESW_OFFLOADS,
 	MLX5_DEVCOM_MPV,
+	MLX5_DEVCOM_HCA_PORTS,
 	MLX5_DEVCOM_NUM_COMPONENTS,
 };
 
@@ -52,4 +53,7 @@ void *mlx5_devcom_get_next_peer_data_rcu(struct mlx5_devcom_comp_dev *devcom,
 	     data;								  \
 	     data = mlx5_devcom_get_next_peer_data_rcu(devcom, &pos))
 
+void mlx5_devcom_comp_lock(struct mlx5_devcom_comp_dev *devcom);
+void mlx5_devcom_comp_unlock(struct mlx5_devcom_comp_dev *devcom);
+
 #endif /* __LIB_MLX5_DEVCOM_H__ */
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/main.c b/drivers/net/ethernet/mellanox/mlx5/core/main.c
index 307ffe6300f8..a17152c1cbb2 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/main.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/main.c
@@ -73,6 +73,7 @@
 #include "sf/sf.h"
 #include "mlx5_irq.h"
 #include "hwmon.h"
+#include "lag/lag.h"
 
 MODULE_AUTHOR("Eli Cohen <eli@mellanox.com>");
 MODULE_DESCRIPTION("Mellanox 5th generation network adapters (ConnectX series) core driver");
@@ -952,6 +953,27 @@ static void mlx5_pci_close(struct mlx5_core_dev *dev)
 	mlx5_pci_disable_device(dev);
 }
 
+static void mlx5_register_hca_devcom_comp(struct mlx5_core_dev *dev)
+{
+	/* This component is use to sync adding core_dev to lag_dev and to sync
+	 * changes of mlx5_adev_devices between LAG layer and other layers.
+	 */
+	if (!mlx5_lag_is_supported(dev))
+		return;
+
+	dev->priv.hca_devcom_comp =
+		mlx5_devcom_register_component(dev->priv.devc, MLX5_DEVCOM_HCA_PORTS,
+					       mlx5_query_nic_system_image_guid(dev),
+					       NULL, dev);
+	if (IS_ERR_OR_NULL(dev->priv.hca_devcom_comp))
+		mlx5_core_err(dev, "Failed to register devcom HCA component\n");
+}
+
+static void mlx5_unregister_hca_devcom_comp(struct mlx5_core_dev *dev)
+{
+	mlx5_devcom_unregister_component(dev->priv.hca_devcom_comp);
+}
+
 static int mlx5_init_once(struct mlx5_core_dev *dev)
 {
 	int err;
@@ -960,6 +982,7 @@ static int mlx5_init_once(struct mlx5_core_dev *dev)
 	if (IS_ERR(dev->priv.devc))
 		mlx5_core_warn(dev, "failed to register devcom device %ld\n",
 			       PTR_ERR(dev->priv.devc));
+	mlx5_register_hca_devcom_comp(dev);
 
 	err = mlx5_query_board_id(dev);
 	if (err) {
@@ -1094,6 +1117,7 @@ err_eq_cleanup:
 err_irq_cleanup:
 	mlx5_irq_table_cleanup(dev);
 err_devcom:
+	mlx5_unregister_hca_devcom_comp(dev);
 	mlx5_devcom_unregister_device(dev->priv.devc);
 
 	return err;
@@ -1123,6 +1147,7 @@ static void mlx5_cleanup_once(struct mlx5_core_dev *dev)
 	mlx5_events_cleanup(dev);
 	mlx5_eq_table_cleanup(dev);
 	mlx5_irq_table_cleanup(dev);
+	mlx5_unregister_hca_devcom_comp(dev);
 	mlx5_devcom_unregister_device(dev->priv.devc);
 }
 
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h b/drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h
index d348a7f9511f..f191a90f337b 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h
@@ -266,7 +266,6 @@ int mlx5_register_device(struct mlx5_core_dev *dev);
 void mlx5_unregister_device(struct mlx5_core_dev *dev);
 void mlx5_dev_set_lightweight(struct mlx5_core_dev *dev);
 bool mlx5_dev_is_lightweight(struct mlx5_core_dev *dev);
-struct mlx5_core_dev *mlx5_get_next_phys_dev_lag(struct mlx5_core_dev *dev);
 void mlx5_dev_list_lock(void);
 void mlx5_dev_list_unlock(void);
 int mlx5_dev_list_trylock(void);
diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h
index 7968c5ee85c4..f60cdc9bd40f 100644
--- a/include/linux/mlx5/driver.h
+++ b/include/linux/mlx5/driver.h
@@ -624,6 +624,7 @@ struct mlx5_priv {
 	struct mlx5_lag		*lag;
 	u32			flags;
 	struct mlx5_devcom_dev	*devc;
+	struct mlx5_devcom_comp_dev *hca_devcom_comp;
 	struct mlx5_fw_reset	*fw_reset;
 	struct mlx5_core_roce	roce;
 	struct mlx5_fc_stats		fc_stats;
-- 
cgit v1.2.3


From 0d2d6bc7e74fee586f587d33484b797bb78f334c Mon Sep 17 00:00:00 2001
From: Yue Haibing <yuehaibing@huawei.com>
Date: Thu, 12 Oct 2023 12:27:41 -0700
Subject: net/mlx5: Remove unused declaration

Commit 2ac9cfe78223 ("net/mlx5e: IPSec, Add Innova IPSec offload TX data path")
declared mlx5e_ipsec_inverse_table_init() but never implemented it.
Commit f52f2faee581 ("net/mlx5e: Introduce flow steering API")
declared mlx5e_fs_set_tc() but never implemented it.
Commit f2f3df550139 ("net/mlx5: EQ, Privatize eq_table and friends")
declared mlx5_eq_comp_cpumask() but never implemented it.
Commit cac1eb2cf2e3 ("net/mlx5: Lag, properly lock eswitch if needed")
removed mlx5_lag_update() but not its declaration.
Commit 35ba005d820b ("net/mlx5: DR, Set flex parser for TNL_MPLS dynamically")
removed mlx5dr_ste_build_tnl_mpls() but not its declaration.

Commit e126ba97dba9 ("mlx5: Add driver for Mellanox Connect-IB adapters")
declared but never implemented mlx5_alloc_cmd_mailbox_chain() and mlx5_free_cmd_mailbox_chain().
Commit 0cf53c124756 ("net/mlx5: FWPage, Use async events chain")
removed mlx5_core_req_pages_handler() but not its declaration.
Commit 938fe83c8dcb ("net/mlx5_core: New device capabilities handling")
removed mlx5_query_odp_caps() but not its declaration.
Commit f6a8a19bb11b ("RDMA/netdev: Hoist alloc_netdev_mqs out of the driver")
removed mlx5_rdma_netdev_alloc() but not its declaration.

Signed-off-by: Yue Haibing <yuehaibing@huawei.com>
Reviewed-by: Leon Romanovsky <leonro@nvidia.com>
Reviewed-by: Simon Horman <horms@kernel.org>
Reviewed-by: Jacob Keller <jacob.e.keller@intel.com>
Signed-off-by: Saeed Mahameed <saeedm@nvidia.com>
---
 drivers/net/ethernet/mellanox/mlx5/core/en/fs.h            |  1 -
 .../net/ethernet/mellanox/mlx5/core/en_accel/ipsec_rxtx.h  |  1 -
 drivers/net/ethernet/mellanox/mlx5/core/lib/eq.h           |  1 -
 drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h        |  2 --
 .../net/ethernet/mellanox/mlx5/core/steering/dr_types.h    |  4 ----
 include/linux/mlx5/driver.h                                | 14 --------------
 6 files changed, 23 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/fs.h b/drivers/net/ethernet/mellanox/mlx5/core/en/fs.h
index e5a44b0b9616..4d6225e0eec7 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en/fs.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en/fs.h
@@ -150,7 +150,6 @@ struct mlx5e_flow_steering *mlx5e_fs_init(const struct mlx5e_profile *profile,
 					  struct dentry *dfs_root);
 void mlx5e_fs_cleanup(struct mlx5e_flow_steering *fs);
 struct mlx5e_vlan_table *mlx5e_fs_get_vlan(struct mlx5e_flow_steering *fs);
-void mlx5e_fs_set_tc(struct mlx5e_flow_steering *fs, struct mlx5e_tc_table *tc);
 struct mlx5e_tc_table *mlx5e_fs_get_tc(struct mlx5e_flow_steering *fs);
 struct mlx5e_l2_table *mlx5e_fs_get_l2(struct mlx5e_flow_steering *fs);
 struct mlx5_flow_namespace *mlx5e_fs_get_ns(struct mlx5e_flow_steering *fs, bool egress);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ipsec_rxtx.h b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ipsec_rxtx.h
index 9ee014a8ad24..2ed99772f168 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ipsec_rxtx.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ipsec_rxtx.h
@@ -54,7 +54,6 @@ struct mlx5e_accel_tx_ipsec_state {
 
 #ifdef CONFIG_MLX5_EN_IPSEC
 
-void mlx5e_ipsec_inverse_table_init(void);
 void mlx5e_ipsec_set_iv_esn(struct sk_buff *skb, struct xfrm_state *x,
 			    struct xfrm_offload *xo);
 void mlx5e_ipsec_set_iv(struct sk_buff *skb, struct xfrm_state *x,
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lib/eq.h b/drivers/net/ethernet/mellanox/mlx5/core/lib/eq.h
index 69a75459775d..4b7f7131c560 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/lib/eq.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/lib/eq.h
@@ -85,7 +85,6 @@ void mlx5_eq_del_cq(struct mlx5_eq *eq, struct mlx5_core_cq *cq);
 struct mlx5_eq_comp *mlx5_eqn2comp_eq(struct mlx5_core_dev *dev, int eqn);
 struct mlx5_eq *mlx5_get_async_eq(struct mlx5_core_dev *dev);
 void mlx5_cq_tasklet_cb(struct tasklet_struct *t);
-struct cpumask *mlx5_eq_comp_cpumask(struct mlx5_core_dev *dev, int ix);
 
 u32 mlx5_eq_poll_irq_disabled(struct mlx5_eq_comp *eq);
 void mlx5_cmd_eq_recover(struct mlx5_core_dev *dev);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h b/drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h
index 13af3ae2cce5..6b14e347d914 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h
@@ -311,8 +311,6 @@ static inline int mlx5_rescan_drivers(struct mlx5_core_dev *dev)
 	return ret;
 }
 
-void mlx5_lag_update(struct mlx5_core_dev *dev);
-
 enum {
 	MLX5_NIC_IFC_FULL		= 0,
 	MLX5_NIC_IFC_DISABLED		= 1,
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_types.h b/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_types.h
index 55dc7383477c..81eff6c410ce 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_types.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_types.h
@@ -436,10 +436,6 @@ void mlx5dr_ste_build_mpls(struct mlx5dr_ste_ctx *ste_ctx,
 			   struct mlx5dr_ste_build *sb,
 			   struct mlx5dr_match_param *mask,
 			   bool inner, bool rx);
-void mlx5dr_ste_build_tnl_mpls(struct mlx5dr_ste_ctx *ste_ctx,
-			       struct mlx5dr_ste_build *sb,
-			       struct mlx5dr_match_param *mask,
-			       bool inner, bool rx);
 void mlx5dr_ste_build_tnl_mpls_over_gre(struct mlx5dr_ste_ctx *ste_ctx,
 					struct mlx5dr_ste_build *sb,
 					struct mlx5dr_match_param *mask,
diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h
index f60cdc9bd40f..d2b8d4a74a30 100644
--- a/include/linux/mlx5/driver.h
+++ b/include/linux/mlx5/driver.h
@@ -1043,10 +1043,6 @@ void mlx5_trigger_health_work(struct mlx5_core_dev *dev);
 int mlx5_frag_buf_alloc_node(struct mlx5_core_dev *dev, int size,
 			     struct mlx5_frag_buf *buf, int node);
 void mlx5_frag_buf_free(struct mlx5_core_dev *dev, struct mlx5_frag_buf *buf);
-struct mlx5_cmd_mailbox *mlx5_alloc_cmd_mailbox_chain(struct mlx5_core_dev *dev,
-						      gfp_t flags, int npages);
-void mlx5_free_cmd_mailbox_chain(struct mlx5_core_dev *dev,
-				 struct mlx5_cmd_mailbox *head);
 int mlx5_core_create_mkey(struct mlx5_core_dev *dev, u32 *mkey, u32 *in,
 			  int inlen);
 int mlx5_core_destroy_mkey(struct mlx5_core_dev *dev, u32 mkey);
@@ -1060,8 +1056,6 @@ void mlx5_pagealloc_start(struct mlx5_core_dev *dev);
 void mlx5_pagealloc_stop(struct mlx5_core_dev *dev);
 void mlx5_pages_debugfs_init(struct mlx5_core_dev *dev);
 void mlx5_pages_debugfs_cleanup(struct mlx5_core_dev *dev);
-void mlx5_core_req_pages_handler(struct mlx5_core_dev *dev, u16 func_id,
-				 s32 npages, bool ec_function);
 int mlx5_satisfy_startup_pages(struct mlx5_core_dev *dev, int boot);
 int mlx5_reclaim_startup_pages(struct mlx5_core_dev *dev);
 void mlx5_register_debugfs(void);
@@ -1101,8 +1095,6 @@ int mlx5_core_create_psv(struct mlx5_core_dev *dev, u32 pdn,
 int mlx5_core_destroy_psv(struct mlx5_core_dev *dev, int psv_num);
 __be32 mlx5_core_get_terminate_scatter_list_mkey(struct mlx5_core_dev *dev);
 void mlx5_core_put_rsc(struct mlx5_core_rsc_common *common);
-int mlx5_query_odp_caps(struct mlx5_core_dev *dev,
-			struct mlx5_odp_caps *odp_caps);
 
 int mlx5_init_rl_table(struct mlx5_core_dev *dev);
 void mlx5_cleanup_rl_table(struct mlx5_core_dev *dev);
@@ -1204,12 +1196,6 @@ int mlx5_sriov_blocking_notifier_register(struct mlx5_core_dev *mdev,
 void mlx5_sriov_blocking_notifier_unregister(struct mlx5_core_dev *mdev,
 					     int vf_id,
 					     struct notifier_block *nb);
-#ifdef CONFIG_MLX5_CORE_IPOIB
-struct net_device *mlx5_rdma_netdev_alloc(struct mlx5_core_dev *mdev,
-					  struct ib_device *ibdev,
-					  const char *name,
-					  void (*setup)(struct net_device *));
-#endif /* CONFIG_MLX5_CORE_IPOIB */
 int mlx5_rdma_rn_get_params(struct mlx5_core_dev *mdev,
 			    struct ib_device *device,
 			    struct rdma_netdev_alloc_params *params);
-- 
cgit v1.2.3


From 57f728d59f005dffdbb52a03531e480a71599bc5 Mon Sep 17 00:00:00 2001
From: Randy Dunlap <rdunlap@infradead.org>
Date: Mon, 31 Jul 2023 22:08:17 -0700
Subject: cpumask: kernel-doc cleanups and additions

Clean up some punctutation and abbreviations.
Add kernel-doc notation for one function and function return value
for 39 functions.

cpumask.h:
Fix some punctuation (plural vs. possessive).
Fix some abbreviations (ie. -> i.e., id -> ID).

Fix 35 warnings like this:
include/linux/cpumask.h:161: warning: No description found for return value of 'cpumask_first'

cpumask.c:
Add Return: value for 4 functions.
Add kernel-doc for cpumask_any_distribute().

Signed-off-by: Randy Dunlap <rdunlap@infradead.org>
Reviewed-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Signed-off-by: Yury Norov <yury.norov@gmail.com>
---
 include/linux/cpumask.h | 113 +++++++++++++++++++++++++++++-------------------
 lib/cpumask.c           |  17 +++++---
 2 files changed, 81 insertions(+), 49 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/cpumask.h b/include/linux/cpumask.h
index f10fb87d49db..cfb545841a2c 100644
--- a/include/linux/cpumask.h
+++ b/include/linux/cpumask.h
@@ -4,7 +4,7 @@
 
 /*
  * Cpumasks provide a bitmap suitable for representing the
- * set of CPU's in a system, one bit position per CPU number.  In general,
+ * set of CPUs in a system, one bit position per CPU number.  In general,
  * only nr_cpu_ids (<= NR_CPUS) bits are valid.
  */
 #include <linux/kernel.h>
@@ -97,7 +97,7 @@ static inline void set_nr_cpu_ids(unsigned int nr)
  *
  *  If !CONFIG_HOTPLUG_CPU, present == possible, and active == online.
  *
- *  The cpu_possible_mask is fixed at boot time, as the set of CPU id's
+ *  The cpu_possible_mask is fixed at boot time, as the set of CPU IDs
  *  that it is possible might ever be plugged in at anytime during the
  *  life of that system boot.  The cpu_present_mask is dynamic(*),
  *  representing which CPUs are currently plugged in.  And
@@ -112,7 +112,7 @@ static inline void set_nr_cpu_ids(unsigned int nr)
  *      hotplug, it's a copy of cpu_possible_mask, hence fixed at boot.
  *
  * Subtleties:
- * 1) UP arch's (NR_CPUS == 1, CONFIG_SMP not defined) hardcode
+ * 1) UP ARCHes (NR_CPUS == 1, CONFIG_SMP not defined) hardcode
  *    assumption that their single CPU is online.  The UP
  *    cpu_{online,possible,present}_masks are placebos.  Changing them
  *    will have no useful affect on the following num_*_cpus()
@@ -155,7 +155,7 @@ static __always_inline unsigned int cpumask_check(unsigned int cpu)
  * cpumask_first - get the first cpu in a cpumask
  * @srcp: the cpumask pointer
  *
- * Returns >= nr_cpu_ids if no cpus set.
+ * Return: >= nr_cpu_ids if no cpus set.
  */
 static inline unsigned int cpumask_first(const struct cpumask *srcp)
 {
@@ -166,7 +166,7 @@ static inline unsigned int cpumask_first(const struct cpumask *srcp)
  * cpumask_first_zero - get the first unset cpu in a cpumask
  * @srcp: the cpumask pointer
  *
- * Returns >= nr_cpu_ids if all cpus are set.
+ * Return: >= nr_cpu_ids if all cpus are set.
  */
 static inline unsigned int cpumask_first_zero(const struct cpumask *srcp)
 {
@@ -178,7 +178,7 @@ static inline unsigned int cpumask_first_zero(const struct cpumask *srcp)
  * @srcp1: the first input
  * @srcp2: the second input
  *
- * Returns >= nr_cpu_ids if no cpus set in both.  See also cpumask_next_and().
+ * Return: >= nr_cpu_ids if no cpus set in both.  See also cpumask_next_and().
  */
 static inline
 unsigned int cpumask_first_and(const struct cpumask *srcp1, const struct cpumask *srcp2)
@@ -190,7 +190,7 @@ unsigned int cpumask_first_and(const struct cpumask *srcp1, const struct cpumask
  * cpumask_last - get the last CPU in a cpumask
  * @srcp:	- the cpumask pointer
  *
- * Returns	>= nr_cpumask_bits if no CPUs set.
+ * Return:	>= nr_cpumask_bits if no CPUs set.
  */
 static inline unsigned int cpumask_last(const struct cpumask *srcp)
 {
@@ -199,10 +199,10 @@ static inline unsigned int cpumask_last(const struct cpumask *srcp)
 
 /**
  * cpumask_next - get the next cpu in a cpumask
- * @n: the cpu prior to the place to search (ie. return will be > @n)
+ * @n: the cpu prior to the place to search (i.e. return will be > @n)
  * @srcp: the cpumask pointer
  *
- * Returns >= nr_cpu_ids if no further cpus set.
+ * Return: >= nr_cpu_ids if no further cpus set.
  */
 static inline
 unsigned int cpumask_next(int n, const struct cpumask *srcp)
@@ -215,10 +215,10 @@ unsigned int cpumask_next(int n, const struct cpumask *srcp)
 
 /**
  * cpumask_next_zero - get the next unset cpu in a cpumask
- * @n: the cpu prior to the place to search (ie. return will be > @n)
+ * @n: the cpu prior to the place to search (i.e. return will be > @n)
  * @srcp: the cpumask pointer
  *
- * Returns >= nr_cpu_ids if no further cpus unset.
+ * Return: >= nr_cpu_ids if no further cpus unset.
  */
 static inline unsigned int cpumask_next_zero(int n, const struct cpumask *srcp)
 {
@@ -254,11 +254,11 @@ unsigned int cpumask_any_distribute(const struct cpumask *srcp);
 
 /**
  * cpumask_next_and - get the next cpu in *src1p & *src2p
- * @n: the cpu prior to the place to search (ie. return will be > @n)
+ * @n: the cpu prior to the place to search (i.e. return will be > @n)
  * @src1p: the first cpumask pointer
  * @src2p: the second cpumask pointer
  *
- * Returns >= nr_cpu_ids if no further cpus set in both.
+ * Return: >= nr_cpu_ids if no further cpus set in both.
  */
 static inline
 unsigned int cpumask_next_and(int n, const struct cpumask *src1p,
@@ -373,7 +373,7 @@ unsigned int __pure cpumask_next_wrap(int n, const struct cpumask *mask, int sta
  * @cpu: the cpu to ignore.
  *
  * Often used to find any cpu but smp_processor_id() in a mask.
- * Returns >= nr_cpu_ids if no cpus set.
+ * Return: >= nr_cpu_ids if no cpus set.
  */
 static inline
 unsigned int cpumask_any_but(const struct cpumask *mask, unsigned int cpu)
@@ -388,11 +388,11 @@ unsigned int cpumask_any_but(const struct cpumask *mask, unsigned int cpu)
 }
 
 /**
- * cpumask_nth - get the first cpu in a cpumask
+ * cpumask_nth - get the Nth cpu in a cpumask
  * @srcp: the cpumask pointer
- * @cpu: the N'th cpu to find, starting from 0
+ * @cpu: the Nth cpu to find, starting from 0
  *
- * Returns >= nr_cpu_ids if such cpu doesn't exist.
+ * Return: >= nr_cpu_ids if such cpu doesn't exist.
  */
 static inline unsigned int cpumask_nth(unsigned int cpu, const struct cpumask *srcp)
 {
@@ -400,12 +400,12 @@ static inline unsigned int cpumask_nth(unsigned int cpu, const struct cpumask *s
 }
 
 /**
- * cpumask_nth_and - get the first cpu in 2 cpumasks
+ * cpumask_nth_and - get the Nth cpu in 2 cpumasks
  * @srcp1: the cpumask pointer
  * @srcp2: the cpumask pointer
- * @cpu: the N'th cpu to find, starting from 0
+ * @cpu: the Nth cpu to find, starting from 0
  *
- * Returns >= nr_cpu_ids if such cpu doesn't exist.
+ * Return: >= nr_cpu_ids if such cpu doesn't exist.
  */
 static inline
 unsigned int cpumask_nth_and(unsigned int cpu, const struct cpumask *srcp1,
@@ -416,12 +416,12 @@ unsigned int cpumask_nth_and(unsigned int cpu, const struct cpumask *srcp1,
 }
 
 /**
- * cpumask_nth_andnot - get the first cpu set in 1st cpumask, and clear in 2nd.
+ * cpumask_nth_andnot - get the Nth cpu set in 1st cpumask, and clear in 2nd.
  * @srcp1: the cpumask pointer
  * @srcp2: the cpumask pointer
- * @cpu: the N'th cpu to find, starting from 0
+ * @cpu: the Nth cpu to find, starting from 0
  *
- * Returns >= nr_cpu_ids if such cpu doesn't exist.
+ * Return: >= nr_cpu_ids if such cpu doesn't exist.
  */
 static inline
 unsigned int cpumask_nth_andnot(unsigned int cpu, const struct cpumask *srcp1,
@@ -436,9 +436,9 @@ unsigned int cpumask_nth_andnot(unsigned int cpu, const struct cpumask *srcp1,
  * @srcp1: the cpumask pointer
  * @srcp2: the cpumask pointer
  * @srcp3: the cpumask pointer
- * @cpu: the N'th cpu to find, starting from 0
+ * @cpu: the Nth cpu to find, starting from 0
  *
- * Returns >= nr_cpu_ids if such cpu doesn't exist.
+ * Return: >= nr_cpu_ids if such cpu doesn't exist.
  */
 static __always_inline
 unsigned int cpumask_nth_and_andnot(unsigned int cpu, const struct cpumask *srcp1,
@@ -497,7 +497,7 @@ static __always_inline void __cpumask_clear_cpu(int cpu, struct cpumask *dstp)
  * @cpu: cpu number (< nr_cpu_ids)
  * @cpumask: the cpumask pointer
  *
- * Returns true if @cpu is set in @cpumask, else returns false
+ * Return: true if @cpu is set in @cpumask, else returns false
  */
 static __always_inline bool cpumask_test_cpu(int cpu, const struct cpumask *cpumask)
 {
@@ -509,9 +509,9 @@ static __always_inline bool cpumask_test_cpu(int cpu, const struct cpumask *cpum
  * @cpu: cpu number (< nr_cpu_ids)
  * @cpumask: the cpumask pointer
  *
- * Returns true if @cpu is set in old bitmap of @cpumask, else returns false
- *
  * test_and_set_bit wrapper for cpumasks.
+ *
+ * Return: true if @cpu is set in old bitmap of @cpumask, else returns false
  */
 static __always_inline bool cpumask_test_and_set_cpu(int cpu, struct cpumask *cpumask)
 {
@@ -523,9 +523,9 @@ static __always_inline bool cpumask_test_and_set_cpu(int cpu, struct cpumask *cp
  * @cpu: cpu number (< nr_cpu_ids)
  * @cpumask: the cpumask pointer
  *
- * Returns true if @cpu is set in old bitmap of @cpumask, else returns false
- *
  * test_and_clear_bit wrapper for cpumasks.
+ *
+ * Return: true if @cpu is set in old bitmap of @cpumask, else returns false
  */
 static __always_inline bool cpumask_test_and_clear_cpu(int cpu, struct cpumask *cpumask)
 {
@@ -560,7 +560,7 @@ static inline void cpumask_clear(struct cpumask *dstp)
  * @src1p: the first input
  * @src2p: the second input
  *
- * If *@dstp is empty, returns false, else returns true
+ * Return: false if *@dstp is empty, else returns true
  */
 static inline bool cpumask_and(struct cpumask *dstp,
 			       const struct cpumask *src1p,
@@ -603,7 +603,7 @@ static inline void cpumask_xor(struct cpumask *dstp,
  * @src1p: the first input
  * @src2p: the second input
  *
- * If *@dstp is empty, returns false, else returns true
+ * Return: false if *@dstp is empty, else returns true
  */
 static inline bool cpumask_andnot(struct cpumask *dstp,
 				  const struct cpumask *src1p,
@@ -617,6 +617,8 @@ static inline bool cpumask_andnot(struct cpumask *dstp,
  * cpumask_equal - *src1p == *src2p
  * @src1p: the first input
  * @src2p: the second input
+ *
+ * Return: true if the cpumasks are equal, false if not
  */
 static inline bool cpumask_equal(const struct cpumask *src1p,
 				const struct cpumask *src2p)
@@ -630,6 +632,9 @@ static inline bool cpumask_equal(const struct cpumask *src1p,
  * @src1p: the first input
  * @src2p: the second input
  * @src3p: the third input
+ *
+ * Return: true if first cpumask ORed with second cpumask == third cpumask,
+ *	   otherwise false
  */
 static inline bool cpumask_or_equal(const struct cpumask *src1p,
 				    const struct cpumask *src2p,
@@ -643,6 +648,9 @@ static inline bool cpumask_or_equal(const struct cpumask *src1p,
  * cpumask_intersects - (*src1p & *src2p) != 0
  * @src1p: the first input
  * @src2p: the second input
+ *
+ * Return: true if first cpumask ANDed with second cpumask is non-empty,
+ *	   otherwise false
  */
 static inline bool cpumask_intersects(const struct cpumask *src1p,
 				     const struct cpumask *src2p)
@@ -656,7 +664,7 @@ static inline bool cpumask_intersects(const struct cpumask *src1p,
  * @src1p: the first input
  * @src2p: the second input
  *
- * Returns true if *@src1p is a subset of *@src2p, else returns false
+ * Return: true if *@src1p is a subset of *@src2p, else returns false
  */
 static inline bool cpumask_subset(const struct cpumask *src1p,
 				 const struct cpumask *src2p)
@@ -668,6 +676,8 @@ static inline bool cpumask_subset(const struct cpumask *src1p,
 /**
  * cpumask_empty - *srcp == 0
  * @srcp: the cpumask to that all cpus < nr_cpu_ids are clear.
+ *
+ * Return: true if srcp is empty (has no bits set), else false
  */
 static inline bool cpumask_empty(const struct cpumask *srcp)
 {
@@ -677,6 +687,8 @@ static inline bool cpumask_empty(const struct cpumask *srcp)
 /**
  * cpumask_full - *srcp == 0xFFFFFFFF...
  * @srcp: the cpumask to that all cpus < nr_cpu_ids are set.
+ *
+ * Return: true if srcp is full (has all bits set), else false
  */
 static inline bool cpumask_full(const struct cpumask *srcp)
 {
@@ -686,6 +698,8 @@ static inline bool cpumask_full(const struct cpumask *srcp)
 /**
  * cpumask_weight - Count of bits in *srcp
  * @srcp: the cpumask to count bits (< nr_cpu_ids) in.
+ *
+ * Return: count of bits set in *srcp
  */
 static inline unsigned int cpumask_weight(const struct cpumask *srcp)
 {
@@ -696,6 +710,8 @@ static inline unsigned int cpumask_weight(const struct cpumask *srcp)
  * cpumask_weight_and - Count of bits in (*srcp1 & *srcp2)
  * @srcp1: the cpumask to count bits (< nr_cpu_ids) in.
  * @srcp2: the cpumask to count bits (< nr_cpu_ids) in.
+ *
+ * Return: count of bits set in both *srcp1 and *srcp2
  */
 static inline unsigned int cpumask_weight_and(const struct cpumask *srcp1,
 						const struct cpumask *srcp2)
@@ -744,7 +760,7 @@ static inline void cpumask_copy(struct cpumask *dstp,
  * cpumask_any - pick a "random" cpu from *srcp
  * @srcp: the input cpumask
  *
- * Returns >= nr_cpu_ids if no cpus set.
+ * Return: >= nr_cpu_ids if no cpus set.
  */
 #define cpumask_any(srcp) cpumask_first(srcp)
 
@@ -753,7 +769,7 @@ static inline void cpumask_copy(struct cpumask *dstp,
  * @mask1: the first input cpumask
  * @mask2: the second input cpumask
  *
- * Returns >= nr_cpu_ids if no cpus set.
+ * Return: >= nr_cpu_ids if no cpus set.
  */
 #define cpumask_any_and(mask1, mask2) cpumask_first_and((mask1), (mask2))
 
@@ -769,7 +785,7 @@ static inline void cpumask_copy(struct cpumask *dstp,
  * @len: the length of the buffer
  * @dstp: the cpumask to set.
  *
- * Returns -errno, or 0 for success.
+ * Return: -errno, or 0 for success.
  */
 static inline int cpumask_parse_user(const char __user *buf, int len,
 				     struct cpumask *dstp)
@@ -783,7 +799,7 @@ static inline int cpumask_parse_user(const char __user *buf, int len,
  * @len: the length of the buffer
  * @dstp: the cpumask to set.
  *
- * Returns -errno, or 0 for success.
+ * Return: -errno, or 0 for success.
  */
 static inline int cpumask_parselist_user(const char __user *buf, int len,
 				     struct cpumask *dstp)
@@ -797,7 +813,7 @@ static inline int cpumask_parselist_user(const char __user *buf, int len,
  * @buf: the buffer to extract from
  * @dstp: the cpumask to set.
  *
- * Returns -errno, or 0 for success.
+ * Return: -errno, or 0 for success.
  */
 static inline int cpumask_parse(const char *buf, struct cpumask *dstp)
 {
@@ -809,7 +825,7 @@ static inline int cpumask_parse(const char *buf, struct cpumask *dstp)
  * @buf: the buffer to extract from
  * @dstp: the cpumask to set.
  *
- * Returns -errno, or 0 for success.
+ * Return: -errno, or 0 for success.
  */
 static inline int cpulist_parse(const char *buf, struct cpumask *dstp)
 {
@@ -817,7 +833,9 @@ static inline int cpulist_parse(const char *buf, struct cpumask *dstp)
 }
 
 /**
- * cpumask_size - size to allocate for a 'struct cpumask' in bytes
+ * cpumask_size - calculate size to allocate for a 'struct cpumask' in bytes
+ *
+ * Return: size to allocate for a &struct cpumask in bytes
  */
 static inline unsigned int cpumask_size(void)
 {
@@ -831,7 +849,7 @@ static inline unsigned int cpumask_size(void)
  * little more difficult, we typedef cpumask_var_t to an array or a
  * pointer: doing &mask on an array is a noop, so it still works.
  *
- * ie.
+ * i.e.
  *	cpumask_var_t tmpmask;
  *	if (!alloc_cpumask_var(&tmpmask, GFP_KERNEL))
  *		return -ENOMEM;
@@ -887,6 +905,8 @@ bool zalloc_cpumask_var_node(cpumask_var_t *mask, gfp_t flags, int node)
  * a nop returning a constant 1 (in <linux/cpumask.h>).
  *
  * See alloc_cpumask_var_node.
+ *
+ * Return: %true if allocation succeeded, %false if not
  */
 static inline
 bool alloc_cpumask_var(cpumask_var_t *mask, gfp_t flags)
@@ -1025,7 +1045,7 @@ set_cpu_dying(unsigned int cpu, bool dying)
 }
 
 /**
- * to_cpumask - convert an NR_CPUS bitmap to a struct cpumask *
+ * to_cpumask - convert a NR_CPUS bitmap to a struct cpumask *
  * @bitmap: the bitmap
  *
  * There are a few places where cpumask_var_t isn't appropriate and
@@ -1068,6 +1088,8 @@ static inline const struct cpumask *get_cpu_mask(unsigned int cpu)
  * interface gives only a momentary snapshot and is not protected against
  * concurrent CPU hotplug operations unless invoked from a cpuhp_lock held
  * region.
+ *
+ * Return: momentary snapshot of the number of online CPUs
  */
 static __always_inline unsigned int num_online_cpus(void)
 {
@@ -1160,7 +1182,7 @@ static inline bool cpu_dying(unsigned int cpu)
  * @mask: the cpumask to copy
  * @buf: the buffer to copy into
  *
- * Returns the length of the (null-terminated) @buf string, zero if
+ * Return: the length of the (null-terminated) @buf string, zero if
  * nothing is copied.
  */
 static inline ssize_t
@@ -1183,7 +1205,7 @@ cpumap_print_to_pagebuf(bool list, char *buf, const struct cpumask *mask)
  * cpumask; Typically used by bin_attribute to export cpumask bitmask
  * ABI.
  *
- * Returns the length of how many bytes have been copied, excluding
+ * Return: the length of how many bytes have been copied, excluding
  * terminating '\0'.
  */
 static inline ssize_t
@@ -1204,6 +1226,9 @@ cpumap_print_bitmask_to_buf(char *buf, const struct cpumask *mask,
  *
  * Everything is same with the above cpumap_print_bitmask_to_buf()
  * except the print format.
+ *
+ * Return: the length of how many bytes have been copied, excluding
+ * terminating '\0'.
  */
 static inline ssize_t
 cpumap_print_list_to_buf(char *buf, const struct cpumask *mask,
diff --git a/lib/cpumask.c b/lib/cpumask.c
index a7fd02b5ae26..19277c6d551f 100644
--- a/lib/cpumask.c
+++ b/lib/cpumask.c
@@ -14,7 +14,7 @@
  * @start: the start point of the iteration
  * @wrap: assume @n crossing @start terminates the iteration
  *
- * Returns >= nr_cpu_ids on completion
+ * Return: >= nr_cpu_ids on completion
  *
  * Note: the @wrap argument is required for the start condition when
  * we cannot assume @start is set in @mask.
@@ -48,8 +48,9 @@ EXPORT_SYMBOL(cpumask_next_wrap);
  * @node: memory node from which to allocate or %NUMA_NO_NODE
  *
  * Only defined when CONFIG_CPUMASK_OFFSTACK=y, otherwise is
- * a nop returning a constant 1 (in <linux/cpumask.h>)
- * Returns TRUE if memory allocation succeeded, FALSE otherwise.
+ * a nop returning a constant 1 (in <linux/cpumask.h>).
+ *
+ * Return: TRUE if memory allocation succeeded, FALSE otherwise.
  *
  * In addition, mask will be NULL if this fails.  Note that gcc is
  * usually smart enough to know that mask can never be NULL if
@@ -115,7 +116,7 @@ void __init free_bootmem_cpumask_var(cpumask_var_t mask)
  * @i: index number
  * @node: local numa_node
  *
- * Returns online CPU according to a numa aware policy; local cpus are returned
+ * Return: online CPU according to a numa aware policy; local cpus are returned
  * first, followed by non-local ones, then it wraps around.
  *
  * For those who wants to enumerate all CPUs based on their NUMA distances,
@@ -165,7 +166,7 @@ static DEFINE_PER_CPU(int, distribute_cpu_mask_prev);
  * Iterated calls using the same srcp1 and srcp2 will be distributed within
  * their intersection.
  *
- * Returns >= nr_cpu_ids if the intersection is empty.
+ * Return: >= nr_cpu_ids if the intersection is empty.
  */
 unsigned int cpumask_any_and_distribute(const struct cpumask *src1p,
 			       const struct cpumask *src2p)
@@ -184,6 +185,12 @@ unsigned int cpumask_any_and_distribute(const struct cpumask *src1p,
 }
 EXPORT_SYMBOL(cpumask_any_and_distribute);
 
+/**
+ * cpumask_any_distribute - Return an arbitrary cpu from srcp
+ * @srcp: &cpumask for selection
+ *
+ * Return: >= nr_cpu_ids if the intersection is empty.
+ */
 unsigned int cpumask_any_distribute(const struct cpumask *srcp)
 {
 	unsigned int next, prev;
-- 
cgit v1.2.3


From 7733aa893847f021c674d0d30b723d892109369d Mon Sep 17 00:00:00 2001
From: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Date: Thu, 17 Aug 2023 19:20:54 +0300
Subject: bitmap: Remove dead code, i.e. bitmap_copy_le()

Besides the fact it's not used anywhere it should be implemented
differently, i.e. via helpers from linux/byteorder/generic.h.
Yet the helpers themselves need to be introduced first.

Also note, the function lacks of the test cases, they must be provided.

Hence, drop the current dead code for good.

Signed-off-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Signed-off-by: Yury Norov <yury.norov@gmail.com>
---
 include/linux/bitmap.h |  5 -----
 lib/bitmap.c           | 23 -----------------------
 2 files changed, 28 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/bitmap.h b/include/linux/bitmap.h
index 03644237e1ef..1516ff979315 100644
--- a/include/linux/bitmap.h
+++ b/include/linux/bitmap.h
@@ -220,11 +220,6 @@ int bitmap_find_free_region(unsigned long *bitmap, unsigned int bits, int order)
 void bitmap_release_region(unsigned long *bitmap, unsigned int pos, int order);
 int bitmap_allocate_region(unsigned long *bitmap, unsigned int pos, int order);
 
-#ifdef __BIG_ENDIAN
-void bitmap_copy_le(unsigned long *dst, const unsigned long *src, unsigned int nbits);
-#else
-#define bitmap_copy_le bitmap_copy
-#endif
 int bitmap_print_to_pagebuf(bool list, char *buf,
 				   const unsigned long *maskp, int nmaskbits);
 
diff --git a/lib/bitmap.c b/lib/bitmap.c
index 24284caadbcc..935e0f96e785 100644
--- a/lib/bitmap.c
+++ b/lib/bitmap.c
@@ -1360,29 +1360,6 @@ int bitmap_allocate_region(unsigned long *bitmap, unsigned int pos, int order)
 }
 EXPORT_SYMBOL(bitmap_allocate_region);
 
-/**
- * bitmap_copy_le - copy a bitmap, putting the bits into little-endian order.
- * @dst:   destination buffer
- * @src:   bitmap to copy
- * @nbits: number of bits in the bitmap
- *
- * Require nbits % BITS_PER_LONG == 0.
- */
-#ifdef __BIG_ENDIAN
-void bitmap_copy_le(unsigned long *dst, const unsigned long *src, unsigned int nbits)
-{
-	unsigned int i;
-
-	for (i = 0; i < nbits/BITS_PER_LONG; i++) {
-		if (BITS_PER_LONG == 64)
-			dst[i] = cpu_to_le64(src[i]);
-		else
-			dst[i] = cpu_to_le32(src[i]);
-	}
-}
-EXPORT_SYMBOL(bitmap_copy_le);
-#endif
-
 unsigned long *bitmap_alloc(unsigned int nbits, gfp_t flags)
 {
 	return kmalloc_array(BITS_TO_LONGS(nbits), sizeof(unsigned long),
-- 
cgit v1.2.3


From aae06fc1b5a2e4b52f8504a1f12f9b8b98e80641 Mon Sep 17 00:00:00 2001
From: Yury Norov <yury.norov@gmail.com>
Date: Sat, 7 Oct 2023 16:35:10 -0700
Subject: lib/bitmap: split-out string-related operations to a separate files

lib/bitmap.c and corresponding include/linux/bitmap.h are intended to
hold functions related to operations on bitmaps, like bitmap_shift or
bitmap_set. Historically, some string-related operations like
bitmap_parse are also reside in lib/bitmap.c.

Now that the subsystem evolves, string-related bitmap operations became a
significant part of the file. Because they are quite different from the
other bitmap functions by nature, it's worth to split them to a separate
source/header files.

CC: Andrew Morton <akpm@linux-foundation.org>
CC: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
CC: Rasmus Villemoes <linux@rasmusvillemoes.dk>
Signed-off-by: Yury Norov <yury.norov@gmail.com>
---
 MAINTAINERS                |   2 +
 include/linux/bitmap-str.h |  16 ++
 include/linux/bitmap.h     |  18 +-
 lib/Makefile               |   2 +-
 lib/bitmap-str.c           | 510 ++++++++++++++++++++++++++++++++++++++++++++
 lib/bitmap.c               | 512 ---------------------------------------------
 6 files changed, 530 insertions(+), 530 deletions(-)
 create mode 100644 include/linux/bitmap-str.h
 create mode 100644 lib/bitmap-str.c

(limited to 'include/linux')

diff --git a/MAINTAINERS b/MAINTAINERS
index 6c4cce45a09d..561def8239f9 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -3506,12 +3506,14 @@ R:	Andy Shevchenko <andriy.shevchenko@linux.intel.com>
 R:	Rasmus Villemoes <linux@rasmusvillemoes.dk>
 S:	Maintained
 F:	include/linux/bitfield.h
+F:	include/linux/bitmap-str.h
 F:	include/linux/bitmap.h
 F:	include/linux/bits.h
 F:	include/linux/cpumask.h
 F:	include/linux/find.h
 F:	include/linux/nodemask.h
 F:	include/vdso/bits.h
+F:	lib/bitmap-str.c
 F:	lib/bitmap.c
 F:	lib/cpumask.c
 F:	lib/cpumask_kunit.c
diff --git a/include/linux/bitmap-str.h b/include/linux/bitmap-str.h
new file mode 100644
index 000000000000..17caeca94cab
--- /dev/null
+++ b/include/linux/bitmap-str.h
@@ -0,0 +1,16 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef __LINUX_BITMAP_STR_H
+#define __LINUX_BITMAP_STR_H
+
+int bitmap_parse_user(const char __user *ubuf, unsigned int ulen, unsigned long *dst, int nbits);
+int bitmap_print_to_pagebuf(bool list, char *buf, const unsigned long *maskp, int nmaskbits);
+extern int bitmap_print_bitmask_to_buf(char *buf, const unsigned long *maskp,
+					int nmaskbits, loff_t off, size_t count);
+extern int bitmap_print_list_to_buf(char *buf, const unsigned long *maskp,
+					int nmaskbits, loff_t off, size_t count);
+int bitmap_parse(const char *buf, unsigned int buflen, unsigned long *dst, int nbits);
+int bitmap_parselist(const char *buf, unsigned long *maskp, int nmaskbits);
+int bitmap_parselist_user(const char __user *ubuf, unsigned int ulen,
+					unsigned long *dst, int nbits);
+
+#endif /* __LINUX_BITMAP_STR_H */
diff --git a/include/linux/bitmap.h b/include/linux/bitmap.h
index 1516ff979315..1cca950a54ae 100644
--- a/include/linux/bitmap.h
+++ b/include/linux/bitmap.h
@@ -10,6 +10,7 @@
 #include <linux/limits.h>
 #include <linux/string.h>
 #include <linux/types.h>
+#include <linux/bitmap-str.h>
 
 struct device;
 
@@ -200,14 +201,6 @@ bitmap_find_next_zero_area(unsigned long *map,
 					      align_mask, 0);
 }
 
-int bitmap_parse(const char *buf, unsigned int buflen,
-			unsigned long *dst, int nbits);
-int bitmap_parse_user(const char __user *ubuf, unsigned int ulen,
-			unsigned long *dst, int nbits);
-int bitmap_parselist(const char *buf, unsigned long *maskp,
-			int nmaskbits);
-int bitmap_parselist_user(const char __user *ubuf, unsigned int ulen,
-			unsigned long *dst, int nbits);
 void bitmap_remap(unsigned long *dst, const unsigned long *src,
 		const unsigned long *old, const unsigned long *new, unsigned int nbits);
 int bitmap_bitremap(int oldbit,
@@ -220,15 +213,6 @@ int bitmap_find_free_region(unsigned long *bitmap, unsigned int bits, int order)
 void bitmap_release_region(unsigned long *bitmap, unsigned int pos, int order);
 int bitmap_allocate_region(unsigned long *bitmap, unsigned int pos, int order);
 
-int bitmap_print_to_pagebuf(bool list, char *buf,
-				   const unsigned long *maskp, int nmaskbits);
-
-extern int bitmap_print_bitmask_to_buf(char *buf, const unsigned long *maskp,
-				      int nmaskbits, loff_t off, size_t count);
-
-extern int bitmap_print_list_to_buf(char *buf, const unsigned long *maskp,
-				      int nmaskbits, loff_t off, size_t count);
-
 #define BITMAP_FIRST_WORD_MASK(start) (~0UL << ((start) & (BITS_PER_LONG - 1)))
 #define BITMAP_LAST_WORD_MASK(nbits) (~0UL >> (-(nbits) & (BITS_PER_LONG - 1)))
 
diff --git a/lib/Makefile b/lib/Makefile
index 740109b6e2c8..9e8f9f6dd3b2 100644
--- a/lib/Makefile
+++ b/lib/Makefile
@@ -48,7 +48,7 @@ obj-y += bcd.o sort.o parser.o debug_locks.o random32.o \
 	 bsearch.o find_bit.o llist.o memweight.o kfifo.o \
 	 percpu-refcount.o rhashtable.o base64.o \
 	 once.o refcount.o rcuref.o usercopy.o errseq.o bucket_locks.o \
-	 generic-radix-tree.o
+	 generic-radix-tree.o bitmap-str.o
 obj-$(CONFIG_STRING_SELFTEST) += test_string.o
 obj-y += string_helpers.o
 obj-$(CONFIG_TEST_STRING_HELPERS) += test-string_helpers.o
diff --git a/lib/bitmap-str.c b/lib/bitmap-str.c
new file mode 100644
index 000000000000..be745209507a
--- /dev/null
+++ b/lib/bitmap-str.c
@@ -0,0 +1,510 @@
+// SPDX-License-Identifier: GPL-2.0-only
+
+#include <linux/bitmap.h>
+#include <linux/ctype.h>
+#include <linux/errno.h>
+#include <linux/err.h>
+#include <linux/export.h>
+#include <linux/hex.h>
+#include <linux/kernel.h>
+#include <linux/mm.h>
+#include <linux/string.h>
+
+#include "kstrtox.h"
+
+/**
+ * bitmap_parse_user - convert an ASCII hex string in a user buffer into a bitmap
+ *
+ * @ubuf: pointer to user buffer containing string.
+ * @ulen: buffer size in bytes.  If string is smaller than this
+ *    then it must be terminated with a \0.
+ * @maskp: pointer to bitmap array that will contain result.
+ * @nmaskbits: size of bitmap, in bits.
+ */
+int bitmap_parse_user(const char __user *ubuf,
+			unsigned int ulen, unsigned long *maskp,
+			int nmaskbits)
+{
+	char *buf;
+	int ret;
+
+	buf = memdup_user_nul(ubuf, ulen);
+	if (IS_ERR(buf))
+		return PTR_ERR(buf);
+
+	ret = bitmap_parse(buf, UINT_MAX, maskp, nmaskbits);
+
+	kfree(buf);
+	return ret;
+}
+EXPORT_SYMBOL(bitmap_parse_user);
+
+/**
+ * bitmap_print_to_pagebuf - convert bitmap to list or hex format ASCII string
+ * @list: indicates whether the bitmap must be list
+ * @buf: page aligned buffer into which string is placed
+ * @maskp: pointer to bitmap to convert
+ * @nmaskbits: size of bitmap, in bits
+ *
+ * Output format is a comma-separated list of decimal numbers and
+ * ranges if list is specified or hex digits grouped into comma-separated
+ * sets of 8 digits/set. Returns the number of characters written to buf.
+ *
+ * It is assumed that @buf is a pointer into a PAGE_SIZE, page-aligned
+ * area and that sufficient storage remains at @buf to accommodate the
+ * bitmap_print_to_pagebuf() output. Returns the number of characters
+ * actually printed to @buf, excluding terminating '\0'.
+ */
+int bitmap_print_to_pagebuf(bool list, char *buf, const unsigned long *maskp,
+			    int nmaskbits)
+{
+	ptrdiff_t len = PAGE_SIZE - offset_in_page(buf);
+
+	return list ? scnprintf(buf, len, "%*pbl\n", nmaskbits, maskp) :
+		      scnprintf(buf, len, "%*pb\n", nmaskbits, maskp);
+}
+EXPORT_SYMBOL(bitmap_print_to_pagebuf);
+
+/**
+ * bitmap_print_to_buf  - convert bitmap to list or hex format ASCII string
+ * @list: indicates whether the bitmap must be list
+ *      true:  print in decimal list format
+ *      false: print in hexadecimal bitmask format
+ * @buf: buffer into which string is placed
+ * @maskp: pointer to bitmap to convert
+ * @nmaskbits: size of bitmap, in bits
+ * @off: in the string from which we are copying, We copy to @buf
+ * @count: the maximum number of bytes to print
+ */
+static int bitmap_print_to_buf(bool list, char *buf, const unsigned long *maskp,
+		int nmaskbits, loff_t off, size_t count)
+{
+	const char *fmt = list ? "%*pbl\n" : "%*pb\n";
+	ssize_t size;
+	void *data;
+
+	data = kasprintf(GFP_KERNEL, fmt, nmaskbits, maskp);
+	if (!data)
+		return -ENOMEM;
+
+	size = memory_read_from_buffer(buf, count, &off, data, strlen(data) + 1);
+	kfree(data);
+
+	return size;
+}
+
+/**
+ * bitmap_print_bitmask_to_buf  - convert bitmap to hex bitmask format ASCII string
+ * @buf: buffer into which string is placed
+ * @maskp: pointer to bitmap to convert
+ * @nmaskbits: size of bitmap, in bits
+ * @off: in the string from which we are copying, We copy to @buf
+ * @count: the maximum number of bytes to print
+ *
+ * The bitmap_print_to_pagebuf() is used indirectly via its cpumap wrapper
+ * cpumap_print_to_pagebuf() or directly by drivers to export hexadecimal
+ * bitmask and decimal list to userspace by sysfs ABI.
+ * Drivers might be using a normal attribute for this kind of ABIs. A
+ * normal attribute typically has show entry as below::
+ *
+ *   static ssize_t example_attribute_show(struct device *dev,
+ *		struct device_attribute *attr, char *buf)
+ *   {
+ *	...
+ *	return bitmap_print_to_pagebuf(true, buf, &mask, nr_trig_max);
+ *   }
+ *
+ * show entry of attribute has no offset and count parameters and this
+ * means the file is limited to one page only.
+ * bitmap_print_to_pagebuf() API works terribly well for this kind of
+ * normal attribute with buf parameter and without offset, count::
+ *
+ *   bitmap_print_to_pagebuf(bool list, char *buf, const unsigned long *maskp,
+ *			   int nmaskbits)
+ *   {
+ *   }
+ *
+ * The problem is once we have a large bitmap, we have a chance to get a
+ * bitmask or list more than one page. Especially for list, it could be
+ * as complex as 0,3,5,7,9,... We have no simple way to know it exact size.
+ * It turns out bin_attribute is a way to break this limit. bin_attribute
+ * has show entry as below::
+ *
+ *   static ssize_t
+ *   example_bin_attribute_show(struct file *filp, struct kobject *kobj,
+ *		struct bin_attribute *attr, char *buf,
+ *		loff_t offset, size_t count)
+ *   {
+ *	...
+ *   }
+ *
+ * With the new offset and count parameters, this makes sysfs ABI be able
+ * to support file size more than one page. For example, offset could be
+ * >= 4096.
+ * bitmap_print_bitmask_to_buf(), bitmap_print_list_to_buf() wit their
+ * cpumap wrapper cpumap_print_bitmask_to_buf(), cpumap_print_list_to_buf()
+ * make those drivers be able to support large bitmask and list after they
+ * move to use bin_attribute. In result, we have to pass the corresponding
+ * parameters such as off, count from bin_attribute show entry to this API.
+ *
+ * The role of cpumap_print_bitmask_to_buf() and cpumap_print_list_to_buf()
+ * is similar with cpumap_print_to_pagebuf(),  the difference is that
+ * bitmap_print_to_pagebuf() mainly serves sysfs attribute with the assumption
+ * the destination buffer is exactly one page and won't be more than one page.
+ * cpumap_print_bitmask_to_buf() and cpumap_print_list_to_buf(), on the other
+ * hand, mainly serves bin_attribute which doesn't work with exact one page,
+ * and it can break the size limit of converted decimal list and hexadecimal
+ * bitmask.
+ *
+ * WARNING!
+ *
+ * This function is not a replacement for sprintf() or bitmap_print_to_pagebuf().
+ * It is intended to workaround sysfs limitations discussed above and should be
+ * used carefully in general case for the following reasons:
+ *
+ *  - Time complexity is O(nbits^2/count), comparing to O(nbits) for snprintf().
+ *  - Memory complexity is O(nbits), comparing to O(1) for snprintf().
+ *  - @off and @count are NOT offset and number of bits to print.
+ *  - If printing part of bitmap as list, the resulting string is not a correct
+ *    list representation of bitmap. Particularly, some bits within or out of
+ *    related interval may be erroneously set or unset. The format of the string
+ *    may be broken, so bitmap_parselist-like parser may fail parsing it.
+ *  - If printing the whole bitmap as list by parts, user must ensure the order
+ *    of calls of the function such that the offset is incremented linearly.
+ *  - If printing the whole bitmap as list by parts, user must keep bitmap
+ *    unchanged between the very first and very last call. Otherwise concatenated
+ *    result may be incorrect, and format may be broken.
+ *
+ * Returns the number of characters actually printed to @buf
+ */
+int bitmap_print_bitmask_to_buf(char *buf, const unsigned long *maskp,
+				int nmaskbits, loff_t off, size_t count)
+{
+	return bitmap_print_to_buf(false, buf, maskp, nmaskbits, off, count);
+}
+EXPORT_SYMBOL(bitmap_print_bitmask_to_buf);
+
+/**
+ * bitmap_print_list_to_buf  - convert bitmap to decimal list format ASCII string
+ * @buf: buffer into which string is placed
+ * @maskp: pointer to bitmap to convert
+ * @nmaskbits: size of bitmap, in bits
+ * @off: in the string from which we are copying, We copy to @buf
+ * @count: the maximum number of bytes to print
+ *
+ * Everything is same with the above bitmap_print_bitmask_to_buf() except
+ * the print format.
+ */
+int bitmap_print_list_to_buf(char *buf, const unsigned long *maskp,
+			     int nmaskbits, loff_t off, size_t count)
+{
+	return bitmap_print_to_buf(true, buf, maskp, nmaskbits, off, count);
+}
+EXPORT_SYMBOL(bitmap_print_list_to_buf);
+
+/*
+ * Region 9-38:4/10 describes the following bitmap structure:
+ * 0	   9  12    18			38	     N
+ * .........****......****......****..................
+ *	    ^  ^     ^			 ^	     ^
+ *      start  off   group_len	       end	 nbits
+ */
+struct region {
+	unsigned int start;
+	unsigned int off;
+	unsigned int group_len;
+	unsigned int end;
+	unsigned int nbits;
+};
+
+static void bitmap_set_region(const struct region *r, unsigned long *bitmap)
+{
+	unsigned int start;
+
+	for (start = r->start; start <= r->end; start += r->group_len)
+		bitmap_set(bitmap, start, min(r->end - start + 1, r->off));
+}
+
+static int bitmap_check_region(const struct region *r)
+{
+	if (r->start > r->end || r->group_len == 0 || r->off > r->group_len)
+		return -EINVAL;
+
+	if (r->end >= r->nbits)
+		return -ERANGE;
+
+	return 0;
+}
+
+static const char *bitmap_getnum(const char *str, unsigned int *num,
+				 unsigned int lastbit)
+{
+	unsigned long long n;
+	unsigned int len;
+
+	if (str[0] == 'N') {
+		*num = lastbit;
+		return str + 1;
+	}
+
+	len = _parse_integer(str, 10, &n);
+	if (!len)
+		return ERR_PTR(-EINVAL);
+	if (len & KSTRTOX_OVERFLOW || n != (unsigned int)n)
+		return ERR_PTR(-EOVERFLOW);
+
+	*num = n;
+	return str + len;
+}
+
+static inline bool end_of_str(char c)
+{
+	return c == '\0' || c == '\n';
+}
+
+static inline bool __end_of_region(char c)
+{
+	return isspace(c) || c == ',';
+}
+
+static inline bool end_of_region(char c)
+{
+	return __end_of_region(c) || end_of_str(c);
+}
+
+/*
+ * The format allows commas and whitespaces at the beginning
+ * of the region.
+ */
+static const char *bitmap_find_region(const char *str)
+{
+	while (__end_of_region(*str))
+		str++;
+
+	return end_of_str(*str) ? NULL : str;
+}
+
+static const char *bitmap_find_region_reverse(const char *start, const char *end)
+{
+	while (start <= end && __end_of_region(*end))
+		end--;
+
+	return end;
+}
+
+static const char *bitmap_parse_region(const char *str, struct region *r)
+{
+	unsigned int lastbit = r->nbits - 1;
+
+	if (!strncasecmp(str, "all", 3)) {
+		r->start = 0;
+		r->end = lastbit;
+		str += 3;
+
+		goto check_pattern;
+	}
+
+	str = bitmap_getnum(str, &r->start, lastbit);
+	if (IS_ERR(str))
+		return str;
+
+	if (end_of_region(*str))
+		goto no_end;
+
+	if (*str != '-')
+		return ERR_PTR(-EINVAL);
+
+	str = bitmap_getnum(str + 1, &r->end, lastbit);
+	if (IS_ERR(str))
+		return str;
+
+check_pattern:
+	if (end_of_region(*str))
+		goto no_pattern;
+
+	if (*str != ':')
+		return ERR_PTR(-EINVAL);
+
+	str = bitmap_getnum(str + 1, &r->off, lastbit);
+	if (IS_ERR(str))
+		return str;
+
+	if (*str != '/')
+		return ERR_PTR(-EINVAL);
+
+	return bitmap_getnum(str + 1, &r->group_len, lastbit);
+
+no_end:
+	r->end = r->start;
+no_pattern:
+	r->off = r->end + 1;
+	r->group_len = r->end + 1;
+
+	return end_of_str(*str) ? NULL : str;
+}
+
+/**
+ * bitmap_parselist - convert list format ASCII string to bitmap
+ * @buf: read user string from this buffer; must be terminated
+ *    with a \0 or \n.
+ * @maskp: write resulting mask here
+ * @nmaskbits: number of bits in mask to be written
+ *
+ * Input format is a comma-separated list of decimal numbers and
+ * ranges.  Consecutively set bits are shown as two hyphen-separated
+ * decimal numbers, the smallest and largest bit numbers set in
+ * the range.
+ * Optionally each range can be postfixed to denote that only parts of it
+ * should be set. The range will divided to groups of specific size.
+ * From each group will be used only defined amount of bits.
+ * Syntax: range:used_size/group_size
+ * Example: 0-1023:2/256 ==> 0,1,256,257,512,513,768,769
+ * The value 'N' can be used as a dynamically substituted token for the
+ * maximum allowed value; i.e (nmaskbits - 1).  Keep in mind that it is
+ * dynamic, so if system changes cause the bitmap width to change, such
+ * as more cores in a CPU list, then any ranges using N will also change.
+ *
+ * Returns: 0 on success, -errno on invalid input strings. Error values:
+ *
+ *   - ``-EINVAL``: wrong region format
+ *   - ``-EINVAL``: invalid character in string
+ *   - ``-ERANGE``: bit number specified too large for mask
+ *   - ``-EOVERFLOW``: integer overflow in the input parameters
+ */
+int bitmap_parselist(const char *buf, unsigned long *maskp, int nmaskbits)
+{
+	struct region r;
+	long ret;
+
+	r.nbits = nmaskbits;
+	bitmap_zero(maskp, r.nbits);
+
+	while (buf) {
+		buf = bitmap_find_region(buf);
+		if (buf == NULL)
+			return 0;
+
+		buf = bitmap_parse_region(buf, &r);
+		if (IS_ERR(buf))
+			return PTR_ERR(buf);
+
+		ret = bitmap_check_region(&r);
+		if (ret)
+			return ret;
+
+		bitmap_set_region(&r, maskp);
+	}
+
+	return 0;
+}
+EXPORT_SYMBOL(bitmap_parselist);
+
+
+/**
+ * bitmap_parselist_user() - convert user buffer's list format ASCII
+ * string to bitmap
+ *
+ * @ubuf: pointer to user buffer containing string.
+ * @ulen: buffer size in bytes.  If string is smaller than this
+ *    then it must be terminated with a \0.
+ * @maskp: pointer to bitmap array that will contain result.
+ * @nmaskbits: size of bitmap, in bits.
+ *
+ * Wrapper for bitmap_parselist(), providing it with user buffer.
+ */
+int bitmap_parselist_user(const char __user *ubuf,
+			unsigned int ulen, unsigned long *maskp,
+			int nmaskbits)
+{
+	char *buf;
+	int ret;
+
+	buf = memdup_user_nul(ubuf, ulen);
+	if (IS_ERR(buf))
+		return PTR_ERR(buf);
+
+	ret = bitmap_parselist(buf, maskp, nmaskbits);
+
+	kfree(buf);
+	return ret;
+}
+EXPORT_SYMBOL(bitmap_parselist_user);
+
+static const char *bitmap_get_x32_reverse(const char *start,
+					const char *end, u32 *num)
+{
+	u32 ret = 0;
+	int c, i;
+
+	for (i = 0; i < 32; i += 4) {
+		c = hex_to_bin(*end--);
+		if (c < 0)
+			return ERR_PTR(-EINVAL);
+
+		ret |= c << i;
+
+		if (start > end || __end_of_region(*end))
+			goto out;
+	}
+
+	if (hex_to_bin(*end--) >= 0)
+		return ERR_PTR(-EOVERFLOW);
+out:
+	*num = ret;
+	return end;
+}
+
+/**
+ * bitmap_parse - convert an ASCII hex string into a bitmap.
+ * @start: pointer to buffer containing string.
+ * @buflen: buffer size in bytes.  If string is smaller than this
+ *    then it must be terminated with a \0 or \n. In that case,
+ *    UINT_MAX may be provided instead of string length.
+ * @maskp: pointer to bitmap array that will contain result.
+ * @nmaskbits: size of bitmap, in bits.
+ *
+ * Commas group hex digits into chunks.  Each chunk defines exactly 32
+ * bits of the resultant bitmask.  No chunk may specify a value larger
+ * than 32 bits (%-EOVERFLOW), and if a chunk specifies a smaller value
+ * then leading 0-bits are prepended.  %-EINVAL is returned for illegal
+ * characters. Grouping such as "1,,5", ",44", "," or "" is allowed.
+ * Leading, embedded and trailing whitespace accepted.
+ */
+int bitmap_parse(const char *start, unsigned int buflen,
+		unsigned long *maskp, int nmaskbits)
+{
+	const char *end = strnchrnul(start, buflen, '\n') - 1;
+	int chunks = BITS_TO_U32(nmaskbits);
+	u32 *bitmap = (u32 *)maskp;
+	int unset_bit;
+	int chunk;
+
+	for (chunk = 0; ; chunk++) {
+		end = bitmap_find_region_reverse(start, end);
+		if (start > end)
+			break;
+
+		if (!chunks--)
+			return -EOVERFLOW;
+
+#if defined(CONFIG_64BIT) && defined(__BIG_ENDIAN)
+		end = bitmap_get_x32_reverse(start, end, &bitmap[chunk ^ 1]);
+#else
+		end = bitmap_get_x32_reverse(start, end, &bitmap[chunk]);
+#endif
+		if (IS_ERR(end))
+			return PTR_ERR(end);
+	}
+
+	unset_bit = (BITS_TO_U32(nmaskbits) - chunks) * 32;
+	if (unset_bit < nmaskbits) {
+		bitmap_clear(maskp, unset_bit, nmaskbits - unset_bit);
+		return 0;
+	}
+
+	if (find_next_bit(maskp, unset_bit, nmaskbits) != unset_bit)
+		return -EOVERFLOW;
+
+	return 0;
+}
+EXPORT_SYMBOL(bitmap_parse);
diff --git a/lib/bitmap.c b/lib/bitmap.c
index 935e0f96e785..abc5579768e9 100644
--- a/lib/bitmap.c
+++ b/lib/bitmap.c
@@ -6,21 +6,11 @@
 
 #include <linux/bitmap.h>
 #include <linux/bitops.h>
-#include <linux/bug.h>
 #include <linux/ctype.h>
 #include <linux/device.h>
 #include <linux/errno.h>
 #include <linux/export.h>
-#include <linux/kernel.h>
-#include <linux/mm.h>
 #include <linux/slab.h>
-#include <linux/string.h>
-#include <linux/thread_info.h>
-#include <linux/uaccess.h>
-
-#include <asm/page.h>
-
-#include "kstrtox.h"
 
 /**
  * DOC: bitmap introduction
@@ -440,508 +430,6 @@ again:
 }
 EXPORT_SYMBOL(bitmap_find_next_zero_area_off);
 
-/*
- * Bitmap printing & parsing functions: first version by Nadia Yvette Chambers,
- * second version by Paul Jackson, third by Joe Korty.
- */
-
-/**
- * bitmap_parse_user - convert an ASCII hex string in a user buffer into a bitmap
- *
- * @ubuf: pointer to user buffer containing string.
- * @ulen: buffer size in bytes.  If string is smaller than this
- *    then it must be terminated with a \0.
- * @maskp: pointer to bitmap array that will contain result.
- * @nmaskbits: size of bitmap, in bits.
- */
-int bitmap_parse_user(const char __user *ubuf,
-			unsigned int ulen, unsigned long *maskp,
-			int nmaskbits)
-{
-	char *buf;
-	int ret;
-
-	buf = memdup_user_nul(ubuf, ulen);
-	if (IS_ERR(buf))
-		return PTR_ERR(buf);
-
-	ret = bitmap_parse(buf, UINT_MAX, maskp, nmaskbits);
-
-	kfree(buf);
-	return ret;
-}
-EXPORT_SYMBOL(bitmap_parse_user);
-
-/**
- * bitmap_print_to_pagebuf - convert bitmap to list or hex format ASCII string
- * @list: indicates whether the bitmap must be list
- * @buf: page aligned buffer into which string is placed
- * @maskp: pointer to bitmap to convert
- * @nmaskbits: size of bitmap, in bits
- *
- * Output format is a comma-separated list of decimal numbers and
- * ranges if list is specified or hex digits grouped into comma-separated
- * sets of 8 digits/set. Returns the number of characters written to buf.
- *
- * It is assumed that @buf is a pointer into a PAGE_SIZE, page-aligned
- * area and that sufficient storage remains at @buf to accommodate the
- * bitmap_print_to_pagebuf() output. Returns the number of characters
- * actually printed to @buf, excluding terminating '\0'.
- */
-int bitmap_print_to_pagebuf(bool list, char *buf, const unsigned long *maskp,
-			    int nmaskbits)
-{
-	ptrdiff_t len = PAGE_SIZE - offset_in_page(buf);
-
-	return list ? scnprintf(buf, len, "%*pbl\n", nmaskbits, maskp) :
-		      scnprintf(buf, len, "%*pb\n", nmaskbits, maskp);
-}
-EXPORT_SYMBOL(bitmap_print_to_pagebuf);
-
-/**
- * bitmap_print_to_buf  - convert bitmap to list or hex format ASCII string
- * @list: indicates whether the bitmap must be list
- *      true:  print in decimal list format
- *      false: print in hexadecimal bitmask format
- * @buf: buffer into which string is placed
- * @maskp: pointer to bitmap to convert
- * @nmaskbits: size of bitmap, in bits
- * @off: in the string from which we are copying, We copy to @buf
- * @count: the maximum number of bytes to print
- */
-static int bitmap_print_to_buf(bool list, char *buf, const unsigned long *maskp,
-		int nmaskbits, loff_t off, size_t count)
-{
-	const char *fmt = list ? "%*pbl\n" : "%*pb\n";
-	ssize_t size;
-	void *data;
-
-	data = kasprintf(GFP_KERNEL, fmt, nmaskbits, maskp);
-	if (!data)
-		return -ENOMEM;
-
-	size = memory_read_from_buffer(buf, count, &off, data, strlen(data) + 1);
-	kfree(data);
-
-	return size;
-}
-
-/**
- * bitmap_print_bitmask_to_buf  - convert bitmap to hex bitmask format ASCII string
- * @buf: buffer into which string is placed
- * @maskp: pointer to bitmap to convert
- * @nmaskbits: size of bitmap, in bits
- * @off: in the string from which we are copying, We copy to @buf
- * @count: the maximum number of bytes to print
- *
- * The bitmap_print_to_pagebuf() is used indirectly via its cpumap wrapper
- * cpumap_print_to_pagebuf() or directly by drivers to export hexadecimal
- * bitmask and decimal list to userspace by sysfs ABI.
- * Drivers might be using a normal attribute for this kind of ABIs. A
- * normal attribute typically has show entry as below::
- *
- *   static ssize_t example_attribute_show(struct device *dev,
- * 		struct device_attribute *attr, char *buf)
- *   {
- * 	...
- * 	return bitmap_print_to_pagebuf(true, buf, &mask, nr_trig_max);
- *   }
- *
- * show entry of attribute has no offset and count parameters and this
- * means the file is limited to one page only.
- * bitmap_print_to_pagebuf() API works terribly well for this kind of
- * normal attribute with buf parameter and without offset, count::
- *
- *   bitmap_print_to_pagebuf(bool list, char *buf, const unsigned long *maskp,
- * 			   int nmaskbits)
- *   {
- *   }
- *
- * The problem is once we have a large bitmap, we have a chance to get a
- * bitmask or list more than one page. Especially for list, it could be
- * as complex as 0,3,5,7,9,... We have no simple way to know it exact size.
- * It turns out bin_attribute is a way to break this limit. bin_attribute
- * has show entry as below::
- *
- *   static ssize_t
- *   example_bin_attribute_show(struct file *filp, struct kobject *kobj,
- * 		struct bin_attribute *attr, char *buf,
- * 		loff_t offset, size_t count)
- *   {
- * 	...
- *   }
- *
- * With the new offset and count parameters, this makes sysfs ABI be able
- * to support file size more than one page. For example, offset could be
- * >= 4096.
- * bitmap_print_bitmask_to_buf(), bitmap_print_list_to_buf() wit their
- * cpumap wrapper cpumap_print_bitmask_to_buf(), cpumap_print_list_to_buf()
- * make those drivers be able to support large bitmask and list after they
- * move to use bin_attribute. In result, we have to pass the corresponding
- * parameters such as off, count from bin_attribute show entry to this API.
- *
- * The role of cpumap_print_bitmask_to_buf() and cpumap_print_list_to_buf()
- * is similar with cpumap_print_to_pagebuf(),  the difference is that
- * bitmap_print_to_pagebuf() mainly serves sysfs attribute with the assumption
- * the destination buffer is exactly one page and won't be more than one page.
- * cpumap_print_bitmask_to_buf() and cpumap_print_list_to_buf(), on the other
- * hand, mainly serves bin_attribute which doesn't work with exact one page,
- * and it can break the size limit of converted decimal list and hexadecimal
- * bitmask.
- *
- * WARNING!
- *
- * This function is not a replacement for sprintf() or bitmap_print_to_pagebuf().
- * It is intended to workaround sysfs limitations discussed above and should be
- * used carefully in general case for the following reasons:
- *
- *  - Time complexity is O(nbits^2/count), comparing to O(nbits) for snprintf().
- *  - Memory complexity is O(nbits), comparing to O(1) for snprintf().
- *  - @off and @count are NOT offset and number of bits to print.
- *  - If printing part of bitmap as list, the resulting string is not a correct
- *    list representation of bitmap. Particularly, some bits within or out of
- *    related interval may be erroneously set or unset. The format of the string
- *    may be broken, so bitmap_parselist-like parser may fail parsing it.
- *  - If printing the whole bitmap as list by parts, user must ensure the order
- *    of calls of the function such that the offset is incremented linearly.
- *  - If printing the whole bitmap as list by parts, user must keep bitmap
- *    unchanged between the very first and very last call. Otherwise concatenated
- *    result may be incorrect, and format may be broken.
- *
- * Returns the number of characters actually printed to @buf
- */
-int bitmap_print_bitmask_to_buf(char *buf, const unsigned long *maskp,
-				int nmaskbits, loff_t off, size_t count)
-{
-	return bitmap_print_to_buf(false, buf, maskp, nmaskbits, off, count);
-}
-EXPORT_SYMBOL(bitmap_print_bitmask_to_buf);
-
-/**
- * bitmap_print_list_to_buf  - convert bitmap to decimal list format ASCII string
- * @buf: buffer into which string is placed
- * @maskp: pointer to bitmap to convert
- * @nmaskbits: size of bitmap, in bits
- * @off: in the string from which we are copying, We copy to @buf
- * @count: the maximum number of bytes to print
- *
- * Everything is same with the above bitmap_print_bitmask_to_buf() except
- * the print format.
- */
-int bitmap_print_list_to_buf(char *buf, const unsigned long *maskp,
-			     int nmaskbits, loff_t off, size_t count)
-{
-	return bitmap_print_to_buf(true, buf, maskp, nmaskbits, off, count);
-}
-EXPORT_SYMBOL(bitmap_print_list_to_buf);
-
-/*
- * Region 9-38:4/10 describes the following bitmap structure:
- * 0	   9  12    18			38	     N
- * .........****......****......****..................
- *	    ^  ^     ^			 ^	     ^
- *      start  off   group_len	       end	 nbits
- */
-struct region {
-	unsigned int start;
-	unsigned int off;
-	unsigned int group_len;
-	unsigned int end;
-	unsigned int nbits;
-};
-
-static void bitmap_set_region(const struct region *r, unsigned long *bitmap)
-{
-	unsigned int start;
-
-	for (start = r->start; start <= r->end; start += r->group_len)
-		bitmap_set(bitmap, start, min(r->end - start + 1, r->off));
-}
-
-static int bitmap_check_region(const struct region *r)
-{
-	if (r->start > r->end || r->group_len == 0 || r->off > r->group_len)
-		return -EINVAL;
-
-	if (r->end >= r->nbits)
-		return -ERANGE;
-
-	return 0;
-}
-
-static const char *bitmap_getnum(const char *str, unsigned int *num,
-				 unsigned int lastbit)
-{
-	unsigned long long n;
-	unsigned int len;
-
-	if (str[0] == 'N') {
-		*num = lastbit;
-		return str + 1;
-	}
-
-	len = _parse_integer(str, 10, &n);
-	if (!len)
-		return ERR_PTR(-EINVAL);
-	if (len & KSTRTOX_OVERFLOW || n != (unsigned int)n)
-		return ERR_PTR(-EOVERFLOW);
-
-	*num = n;
-	return str + len;
-}
-
-static inline bool end_of_str(char c)
-{
-	return c == '\0' || c == '\n';
-}
-
-static inline bool __end_of_region(char c)
-{
-	return isspace(c) || c == ',';
-}
-
-static inline bool end_of_region(char c)
-{
-	return __end_of_region(c) || end_of_str(c);
-}
-
-/*
- * The format allows commas and whitespaces at the beginning
- * of the region.
- */
-static const char *bitmap_find_region(const char *str)
-{
-	while (__end_of_region(*str))
-		str++;
-
-	return end_of_str(*str) ? NULL : str;
-}
-
-static const char *bitmap_find_region_reverse(const char *start, const char *end)
-{
-	while (start <= end && __end_of_region(*end))
-		end--;
-
-	return end;
-}
-
-static const char *bitmap_parse_region(const char *str, struct region *r)
-{
-	unsigned int lastbit = r->nbits - 1;
-
-	if (!strncasecmp(str, "all", 3)) {
-		r->start = 0;
-		r->end = lastbit;
-		str += 3;
-
-		goto check_pattern;
-	}
-
-	str = bitmap_getnum(str, &r->start, lastbit);
-	if (IS_ERR(str))
-		return str;
-
-	if (end_of_region(*str))
-		goto no_end;
-
-	if (*str != '-')
-		return ERR_PTR(-EINVAL);
-
-	str = bitmap_getnum(str + 1, &r->end, lastbit);
-	if (IS_ERR(str))
-		return str;
-
-check_pattern:
-	if (end_of_region(*str))
-		goto no_pattern;
-
-	if (*str != ':')
-		return ERR_PTR(-EINVAL);
-
-	str = bitmap_getnum(str + 1, &r->off, lastbit);
-	if (IS_ERR(str))
-		return str;
-
-	if (*str != '/')
-		return ERR_PTR(-EINVAL);
-
-	return bitmap_getnum(str + 1, &r->group_len, lastbit);
-
-no_end:
-	r->end = r->start;
-no_pattern:
-	r->off = r->end + 1;
-	r->group_len = r->end + 1;
-
-	return end_of_str(*str) ? NULL : str;
-}
-
-/**
- * bitmap_parselist - convert list format ASCII string to bitmap
- * @buf: read user string from this buffer; must be terminated
- *    with a \0 or \n.
- * @maskp: write resulting mask here
- * @nmaskbits: number of bits in mask to be written
- *
- * Input format is a comma-separated list of decimal numbers and
- * ranges.  Consecutively set bits are shown as two hyphen-separated
- * decimal numbers, the smallest and largest bit numbers set in
- * the range.
- * Optionally each range can be postfixed to denote that only parts of it
- * should be set. The range will divided to groups of specific size.
- * From each group will be used only defined amount of bits.
- * Syntax: range:used_size/group_size
- * Example: 0-1023:2/256 ==> 0,1,256,257,512,513,768,769
- * The value 'N' can be used as a dynamically substituted token for the
- * maximum allowed value; i.e (nmaskbits - 1).  Keep in mind that it is
- * dynamic, so if system changes cause the bitmap width to change, such
- * as more cores in a CPU list, then any ranges using N will also change.
- *
- * Returns: 0 on success, -errno on invalid input strings. Error values:
- *
- *   - ``-EINVAL``: wrong region format
- *   - ``-EINVAL``: invalid character in string
- *   - ``-ERANGE``: bit number specified too large for mask
- *   - ``-EOVERFLOW``: integer overflow in the input parameters
- */
-int bitmap_parselist(const char *buf, unsigned long *maskp, int nmaskbits)
-{
-	struct region r;
-	long ret;
-
-	r.nbits = nmaskbits;
-	bitmap_zero(maskp, r.nbits);
-
-	while (buf) {
-		buf = bitmap_find_region(buf);
-		if (buf == NULL)
-			return 0;
-
-		buf = bitmap_parse_region(buf, &r);
-		if (IS_ERR(buf))
-			return PTR_ERR(buf);
-
-		ret = bitmap_check_region(&r);
-		if (ret)
-			return ret;
-
-		bitmap_set_region(&r, maskp);
-	}
-
-	return 0;
-}
-EXPORT_SYMBOL(bitmap_parselist);
-
-
-/**
- * bitmap_parselist_user() - convert user buffer's list format ASCII
- * string to bitmap
- *
- * @ubuf: pointer to user buffer containing string.
- * @ulen: buffer size in bytes.  If string is smaller than this
- *    then it must be terminated with a \0.
- * @maskp: pointer to bitmap array that will contain result.
- * @nmaskbits: size of bitmap, in bits.
- *
- * Wrapper for bitmap_parselist(), providing it with user buffer.
- */
-int bitmap_parselist_user(const char __user *ubuf,
-			unsigned int ulen, unsigned long *maskp,
-			int nmaskbits)
-{
-	char *buf;
-	int ret;
-
-	buf = memdup_user_nul(ubuf, ulen);
-	if (IS_ERR(buf))
-		return PTR_ERR(buf);
-
-	ret = bitmap_parselist(buf, maskp, nmaskbits);
-
-	kfree(buf);
-	return ret;
-}
-EXPORT_SYMBOL(bitmap_parselist_user);
-
-static const char *bitmap_get_x32_reverse(const char *start,
-					const char *end, u32 *num)
-{
-	u32 ret = 0;
-	int c, i;
-
-	for (i = 0; i < 32; i += 4) {
-		c = hex_to_bin(*end--);
-		if (c < 0)
-			return ERR_PTR(-EINVAL);
-
-		ret |= c << i;
-
-		if (start > end || __end_of_region(*end))
-			goto out;
-	}
-
-	if (hex_to_bin(*end--) >= 0)
-		return ERR_PTR(-EOVERFLOW);
-out:
-	*num = ret;
-	return end;
-}
-
-/**
- * bitmap_parse - convert an ASCII hex string into a bitmap.
- * @start: pointer to buffer containing string.
- * @buflen: buffer size in bytes.  If string is smaller than this
- *    then it must be terminated with a \0 or \n. In that case,
- *    UINT_MAX may be provided instead of string length.
- * @maskp: pointer to bitmap array that will contain result.
- * @nmaskbits: size of bitmap, in bits.
- *
- * Commas group hex digits into chunks.  Each chunk defines exactly 32
- * bits of the resultant bitmask.  No chunk may specify a value larger
- * than 32 bits (%-EOVERFLOW), and if a chunk specifies a smaller value
- * then leading 0-bits are prepended.  %-EINVAL is returned for illegal
- * characters. Grouping such as "1,,5", ",44", "," or "" is allowed.
- * Leading, embedded and trailing whitespace accepted.
- */
-int bitmap_parse(const char *start, unsigned int buflen,
-		unsigned long *maskp, int nmaskbits)
-{
-	const char *end = strnchrnul(start, buflen, '\n') - 1;
-	int chunks = BITS_TO_U32(nmaskbits);
-	u32 *bitmap = (u32 *)maskp;
-	int unset_bit;
-	int chunk;
-
-	for (chunk = 0; ; chunk++) {
-		end = bitmap_find_region_reverse(start, end);
-		if (start > end)
-			break;
-
-		if (!chunks--)
-			return -EOVERFLOW;
-
-#if defined(CONFIG_64BIT) && defined(__BIG_ENDIAN)
-		end = bitmap_get_x32_reverse(start, end, &bitmap[chunk ^ 1]);
-#else
-		end = bitmap_get_x32_reverse(start, end, &bitmap[chunk]);
-#endif
-		if (IS_ERR(end))
-			return PTR_ERR(end);
-	}
-
-	unset_bit = (BITS_TO_U32(nmaskbits) - chunks) * 32;
-	if (unset_bit < nmaskbits) {
-		bitmap_clear(maskp, unset_bit, nmaskbits - unset_bit);
-		return 0;
-	}
-
-	if (find_next_bit(maskp, unset_bit, nmaskbits) != unset_bit)
-		return -EOVERFLOW;
-
-	return 0;
-}
-EXPORT_SYMBOL(bitmap_parse);
-
 /**
  * bitmap_pos_to_ord - find ordinal of set bit at given position in bitmap
  *	@buf: pointer to a bitmap
-- 
cgit v1.2.3


From 49dbe25adac42d3e06f65d1420946bec65896222 Mon Sep 17 00:00:00 2001
From: Arseniy Krasnov <avkrasnov@salutedevices.com>
Date: Tue, 10 Oct 2023 22:15:14 +0300
Subject: vsock: read from socket's error queue

This adds handling of MSG_ERRQUEUE input flag in receive call. This flag
is used to read socket's error queue instead of data queue. Possible
scenario of error queue usage is receiving completions for transmission
with MSG_ZEROCOPY flag. This patch also adds new defines: 'SOL_VSOCK'
and 'VSOCK_RECVERR'.

Signed-off-by: Arseniy Krasnov <avkrasnov@salutedevices.com>
Reviewed-by: Stefano Garzarella <sgarzare@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/socket.h          |  1 +
 include/uapi/linux/vm_sockets.h | 17 +++++++++++++++++
 net/vmw_vsock/af_vsock.c        |  6 ++++++
 3 files changed, 24 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/socket.h b/include/linux/socket.h
index 39b74d83c7c4..cfcb7e2c3813 100644
--- a/include/linux/socket.h
+++ b/include/linux/socket.h
@@ -383,6 +383,7 @@ struct ucred {
 #define SOL_MPTCP	284
 #define SOL_MCTP	285
 #define SOL_SMC		286
+#define SOL_VSOCK	287
 
 /* IPX options */
 #define IPX_TYPE	1
diff --git a/include/uapi/linux/vm_sockets.h b/include/uapi/linux/vm_sockets.h
index c60ca33eac59..ed07181d4eff 100644
--- a/include/uapi/linux/vm_sockets.h
+++ b/include/uapi/linux/vm_sockets.h
@@ -191,4 +191,21 @@ struct sockaddr_vm {
 
 #define IOCTL_VM_SOCKETS_GET_LOCAL_CID		_IO(7, 0xb9)
 
+/* MSG_ZEROCOPY notifications are encoded in the standard error format,
+ * sock_extended_err. See Documentation/networking/msg_zerocopy.rst in
+ * kernel source tree for more details.
+ */
+
+/* 'cmsg_level' field value of 'struct cmsghdr' for notification parsing
+ * when MSG_ZEROCOPY flag is used on transmissions.
+ */
+
+#define SOL_VSOCK	287
+
+/* 'cmsg_type' field value of 'struct cmsghdr' for notification parsing
+ * when MSG_ZEROCOPY flag is used on transmissions.
+ */
+
+#define VSOCK_RECVERR	1
+
 #endif /* _UAPI_VM_SOCKETS_H */
diff --git a/net/vmw_vsock/af_vsock.c b/net/vmw_vsock/af_vsock.c
index d841f4de33b0..38486efd3d05 100644
--- a/net/vmw_vsock/af_vsock.c
+++ b/net/vmw_vsock/af_vsock.c
@@ -89,6 +89,7 @@
 #include <linux/types.h>
 #include <linux/bitops.h>
 #include <linux/cred.h>
+#include <linux/errqueue.h>
 #include <linux/init.h>
 #include <linux/io.h>
 #include <linux/kernel.h>
@@ -110,6 +111,7 @@
 #include <linux/workqueue.h>
 #include <net/sock.h>
 #include <net/af_vsock.h>
+#include <uapi/linux/vm_sockets.h>
 
 static int __vsock_bind(struct sock *sk, struct sockaddr_vm *addr);
 static void vsock_sk_destruct(struct sock *sk);
@@ -2137,6 +2139,10 @@ vsock_connectible_recvmsg(struct socket *sock, struct msghdr *msg, size_t len,
 	int err;
 
 	sk = sock->sk;
+
+	if (unlikely(flags & MSG_ERRQUEUE))
+		return sock_recv_errqueue(sk, msg, len, SOL_VSOCK, VSOCK_RECVERR);
+
 	vsk = vsock_sk(sk);
 	err = 0;
 
-- 
cgit v1.2.3


From d7fbc0b7e846e9e0e70ae766d274b8720fbab412 Mon Sep 17 00:00:00 2001
From: Arkadiusz Kubalewski <arkadiusz.kubalewski@intel.com>
Date: Wed, 11 Oct 2023 12:12:34 +0200
Subject: dpll: netlink/core: add support for pin-dpll signal phase
 offset/adjust

Add callback ops for pin-dpll phase measurement.
Add callback for pin signal phase adjustment.
Add min and max phase adjustment values to pin proprties.
Invoke callbacks in dpll_netlink.c when filling the pin details to
provide user with phase related attribute values.

Signed-off-by: Arkadiusz Kubalewski <arkadiusz.kubalewski@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/dpll/dpll_netlink.c | 138 +++++++++++++++++++++++++++++++++++++++++++-
 include/linux/dpll.h        |  18 ++++++
 2 files changed, 155 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/drivers/dpll/dpll_netlink.c b/drivers/dpll/dpll_netlink.c
index e20daba6896a..09a6c2a1ea92 100644
--- a/drivers/dpll/dpll_netlink.c
+++ b/drivers/dpll/dpll_netlink.c
@@ -212,6 +212,53 @@ dpll_msg_add_pin_direction(struct sk_buff *msg, struct dpll_pin *pin,
 	return 0;
 }
 
+static int
+dpll_msg_add_pin_phase_adjust(struct sk_buff *msg, struct dpll_pin *pin,
+			      struct dpll_pin_ref *ref,
+			      struct netlink_ext_ack *extack)
+{
+	const struct dpll_pin_ops *ops = dpll_pin_ops(ref);
+	struct dpll_device *dpll = ref->dpll;
+	s32 phase_adjust;
+	int ret;
+
+	if (!ops->phase_adjust_get)
+		return 0;
+	ret = ops->phase_adjust_get(pin, dpll_pin_on_dpll_priv(dpll, pin),
+				    dpll, dpll_priv(dpll),
+				    &phase_adjust, extack);
+	if (ret)
+		return ret;
+	if (nla_put_s32(msg, DPLL_A_PIN_PHASE_ADJUST, phase_adjust))
+		return -EMSGSIZE;
+
+	return 0;
+}
+
+static int
+dpll_msg_add_phase_offset(struct sk_buff *msg, struct dpll_pin *pin,
+			  struct dpll_pin_ref *ref,
+			  struct netlink_ext_ack *extack)
+{
+	const struct dpll_pin_ops *ops = dpll_pin_ops(ref);
+	struct dpll_device *dpll = ref->dpll;
+	s64 phase_offset;
+	int ret;
+
+	if (!ops->phase_offset_get)
+		return 0;
+	ret = ops->phase_offset_get(pin, dpll_pin_on_dpll_priv(dpll, pin),
+				    dpll, dpll_priv(dpll), &phase_offset,
+				    extack);
+	if (ret)
+		return ret;
+	if (nla_put_64bit(msg, DPLL_A_PIN_PHASE_OFFSET, sizeof(phase_offset),
+			  &phase_offset, DPLL_A_PIN_PAD))
+		return -EMSGSIZE;
+
+	return 0;
+}
+
 static int
 dpll_msg_add_pin_freq(struct sk_buff *msg, struct dpll_pin *pin,
 		      struct dpll_pin_ref *ref, struct netlink_ext_ack *extack)
@@ -330,6 +377,9 @@ dpll_msg_add_pin_dplls(struct sk_buff *msg, struct dpll_pin *pin,
 		if (ret)
 			goto nest_cancel;
 		ret = dpll_msg_add_pin_direction(msg, pin, ref, extack);
+		if (ret)
+			goto nest_cancel;
+		ret = dpll_msg_add_phase_offset(msg, pin, ref, extack);
 		if (ret)
 			goto nest_cancel;
 		nla_nest_end(msg, attr);
@@ -377,6 +427,15 @@ dpll_cmd_pin_get_one(struct sk_buff *msg, struct dpll_pin *pin,
 	if (nla_put_u32(msg, DPLL_A_PIN_CAPABILITIES, prop->capabilities))
 		return -EMSGSIZE;
 	ret = dpll_msg_add_pin_freq(msg, pin, ref, extack);
+	if (ret)
+		return ret;
+	if (nla_put_s32(msg, DPLL_A_PIN_PHASE_ADJUST_MIN,
+			prop->phase_range.min))
+		return -EMSGSIZE;
+	if (nla_put_s32(msg, DPLL_A_PIN_PHASE_ADJUST_MAX,
+			prop->phase_range.max))
+		return -EMSGSIZE;
+	ret = dpll_msg_add_pin_phase_adjust(msg, pin, ref, extack);
 	if (ret)
 		return ret;
 	if (xa_empty(&pin->parent_refs))
@@ -416,7 +475,7 @@ dpll_device_get_one(struct dpll_device *dpll, struct sk_buff *msg,
 	if (nla_put_u32(msg, DPLL_A_TYPE, dpll->type))
 		return -EMSGSIZE;
 
-	return ret;
+	return 0;
 }
 
 static int
@@ -705,6 +764,78 @@ dpll_pin_direction_set(struct dpll_pin *pin, struct dpll_device *dpll,
 	return 0;
 }
 
+static int
+dpll_pin_phase_adj_set(struct dpll_pin *pin, struct nlattr *phase_adj_attr,
+		       struct netlink_ext_ack *extack)
+{
+	struct dpll_pin_ref *ref, *failed;
+	const struct dpll_pin_ops *ops;
+	s32 phase_adj, old_phase_adj;
+	struct dpll_device *dpll;
+	unsigned long i;
+	int ret;
+
+	phase_adj = nla_get_s32(phase_adj_attr);
+	if (phase_adj > pin->prop->phase_range.max ||
+	    phase_adj < pin->prop->phase_range.min) {
+		NL_SET_ERR_MSG_ATTR(extack, phase_adj_attr,
+				    "phase adjust value not supported");
+		return -EINVAL;
+	}
+
+	xa_for_each(&pin->dpll_refs, i, ref) {
+		ops = dpll_pin_ops(ref);
+		if (!ops->phase_adjust_set || !ops->phase_adjust_get) {
+			NL_SET_ERR_MSG(extack, "phase adjust not supported");
+			return -EOPNOTSUPP;
+		}
+	}
+	ref = dpll_xa_ref_dpll_first(&pin->dpll_refs);
+	ops = dpll_pin_ops(ref);
+	dpll = ref->dpll;
+	ret = ops->phase_adjust_get(pin, dpll_pin_on_dpll_priv(dpll, pin),
+				    dpll, dpll_priv(dpll), &old_phase_adj,
+				    extack);
+	if (ret) {
+		NL_SET_ERR_MSG(extack, "unable to get old phase adjust value");
+		return ret;
+	}
+	if (phase_adj == old_phase_adj)
+		return 0;
+
+	xa_for_each(&pin->dpll_refs, i, ref) {
+		ops = dpll_pin_ops(ref);
+		dpll = ref->dpll;
+		ret = ops->phase_adjust_set(pin,
+					    dpll_pin_on_dpll_priv(dpll, pin),
+					    dpll, dpll_priv(dpll), phase_adj,
+					    extack);
+		if (ret) {
+			failed = ref;
+			NL_SET_ERR_MSG_FMT(extack,
+					   "phase adjust set failed for dpll_id:%u",
+					   dpll->id);
+			goto rollback;
+		}
+	}
+	__dpll_pin_change_ntf(pin);
+
+	return 0;
+
+rollback:
+	xa_for_each(&pin->dpll_refs, i, ref) {
+		if (ref == failed)
+			break;
+		ops = dpll_pin_ops(ref);
+		dpll = ref->dpll;
+		if (ops->phase_adjust_set(pin, dpll_pin_on_dpll_priv(dpll, pin),
+					  dpll, dpll_priv(dpll), old_phase_adj,
+					  extack))
+			NL_SET_ERR_MSG(extack, "set phase adjust rollback failed");
+	}
+	return ret;
+}
+
 static int
 dpll_pin_parent_device_set(struct dpll_pin *pin, struct nlattr *parent_nest,
 			   struct netlink_ext_ack *extack)
@@ -793,6 +924,11 @@ dpll_pin_set_from_nlattr(struct dpll_pin *pin, struct genl_info *info)
 			if (ret)
 				return ret;
 			break;
+		case DPLL_A_PIN_PHASE_ADJUST:
+			ret = dpll_pin_phase_adj_set(pin, a, info->extack);
+			if (ret)
+				return ret;
+			break;
 		case DPLL_A_PIN_PARENT_DEVICE:
 			ret = dpll_pin_parent_device_set(pin, a, info->extack);
 			if (ret)
diff --git a/include/linux/dpll.h b/include/linux/dpll.h
index bbc480cd2932..578fc5fa3750 100644
--- a/include/linux/dpll.h
+++ b/include/linux/dpll.h
@@ -68,6 +68,18 @@ struct dpll_pin_ops {
 	int (*prio_set)(const struct dpll_pin *pin, void *pin_priv,
 			const struct dpll_device *dpll, void *dpll_priv,
 			const u32 prio, struct netlink_ext_ack *extack);
+	int (*phase_offset_get)(const struct dpll_pin *pin, void *pin_priv,
+				const struct dpll_device *dpll, void *dpll_priv,
+				s64 *phase_offset,
+				struct netlink_ext_ack *extack);
+	int (*phase_adjust_get)(const struct dpll_pin *pin, void *pin_priv,
+				const struct dpll_device *dpll, void *dpll_priv,
+				s32 *phase_adjust,
+				struct netlink_ext_ack *extack);
+	int (*phase_adjust_set)(const struct dpll_pin *pin, void *pin_priv,
+				const struct dpll_device *dpll, void *dpll_priv,
+				const s32 phase_adjust,
+				struct netlink_ext_ack *extack);
 };
 
 struct dpll_pin_frequency {
@@ -91,6 +103,11 @@ struct dpll_pin_frequency {
 #define DPLL_PIN_FREQUENCY_DCF77 \
 	DPLL_PIN_FREQUENCY(DPLL_PIN_FREQUENCY_77_5_KHZ)
 
+struct dpll_pin_phase_adjust_range {
+	s32 min;
+	s32 max;
+};
+
 struct dpll_pin_properties {
 	const char *board_label;
 	const char *panel_label;
@@ -99,6 +116,7 @@ struct dpll_pin_properties {
 	unsigned long capabilities;
 	u32 freq_supported_num;
 	struct dpll_pin_frequency *freq_supported;
+	struct dpll_pin_phase_adjust_range phase_range;
 };
 
 #if IS_ENABLED(CONFIG_DPLL)
-- 
cgit v1.2.3


From fc8b2a619469378717e7270d2a4e1ef93c585f7a Mon Sep 17 00:00:00 2001
From: Willem de Bruijn <willemb@google.com>
Date: Wed, 11 Oct 2023 10:01:14 -0400
Subject: net: more strict VIRTIO_NET_HDR_GSO_UDP_L4 validation

Syzbot reported two new paths to hit an internal WARNING using the
new virtio gso type VIRTIO_NET_HDR_GSO_UDP_L4.

    RIP: 0010:skb_checksum_help+0x4a2/0x600 net/core/dev.c:3260
    skb len=64521 gso_size=344
and

    RIP: 0010:skb_warn_bad_offload+0x118/0x240 net/core/dev.c:3262

Older virtio types have historically had loose restrictions, leading
to many entirely impractical fuzzer generated packets causing
problems deep in the kernel stack. Ideally, we would have had strict
validation for all types from the start.

New virtio types can have tighter validation. Limit UDP GSO packets
inserted via virtio to the same limits imposed by the UDP_SEGMENT
socket interface:

1. must use checksum offload
2. checksum offload matches UDP header
3. no more segments than UDP_MAX_SEGMENTS
4. UDP GSO does not take modifier flags, notably SKB_GSO_TCP_ECN

Fixes: 860b7f27b8f7 ("linux/virtio_net.h: Support USO offload in vnet header.")
Reported-by: syzbot+01cdbc31e9c0ae9b33ac@syzkaller.appspotmail.com
Closes: https://lore.kernel.org/netdev/0000000000005039270605eb0b7f@google.com/
Reported-by: syzbot+c99d835ff081ca30f986@syzkaller.appspotmail.com
Closes: https://lore.kernel.org/netdev/0000000000005426680605eb0b9f@google.com/
Signed-off-by: Willem de Bruijn <willemb@google.com>
Reviewed-by: Eric Dumazet <edumazet@google.com>
Acked-by: Jason Wang <jasowang@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/virtio_net.h | 19 ++++++++++++++++---
 1 file changed, 16 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/virtio_net.h b/include/linux/virtio_net.h
index 7b4dd69555e4..27cc1d464321 100644
--- a/include/linux/virtio_net.h
+++ b/include/linux/virtio_net.h
@@ -3,8 +3,8 @@
 #define _LINUX_VIRTIO_NET_H
 
 #include <linux/if_vlan.h>
+#include <linux/udp.h>
 #include <uapi/linux/tcp.h>
-#include <uapi/linux/udp.h>
 #include <uapi/linux/virtio_net.h>
 
 static inline bool virtio_net_hdr_match_proto(__be16 protocol, __u8 gso_type)
@@ -151,9 +151,22 @@ retry:
 		unsigned int nh_off = p_off;
 		struct skb_shared_info *shinfo = skb_shinfo(skb);
 
-		/* UFO may not include transport header in gso_size. */
-		if (gso_type & SKB_GSO_UDP)
+		switch (gso_type & ~SKB_GSO_TCP_ECN) {
+		case SKB_GSO_UDP:
+			/* UFO may not include transport header in gso_size. */
 			nh_off -= thlen;
+			break;
+		case SKB_GSO_UDP_L4:
+			if (!(hdr->flags & VIRTIO_NET_HDR_F_NEEDS_CSUM))
+				return -EINVAL;
+			if (skb->csum_offset != offsetof(struct udphdr, check))
+				return -EINVAL;
+			if (skb->len - p_off > gso_size * UDP_MAX_SEGMENTS)
+				return -EINVAL;
+			if (gso_type != SKB_GSO_UDP_L4)
+				return -EINVAL;
+			break;
+		}
 
 		/* Kernel has a special handling for GSO_BY_FRAGS. */
 		if (gso_size == GSO_BY_FRAGS)
-- 
cgit v1.2.3


From 60c6946675fc06dd2fd2b7a4b6fd1c1f046f1056 Mon Sep 17 00:00:00 2001
From: Xabier Marquiegui <reibax@gmail.com>
Date: Thu, 12 Oct 2023 00:39:53 +0200
Subject: posix-clock: introduce posix_clock_context concept

Add the necessary structure to support custom private-data per
posix-clock user.

The previous implementation of posix-clock assumed all file open
instances need access to the same clock structure on private_data.

The need for individual data structures per file open instance has been
identified when developing support for multiple timestamp event queue
users for ptp_clock.

Signed-off-by: Xabier Marquiegui <reibax@gmail.com>
Suggested-by: Richard Cochran <richardcochran@gmail.com>
Suggested-by: Vinicius Costa Gomes <vinicius.gomes@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/ptp/ptp_chardev.c   | 21 +++++++++++++--------
 drivers/ptp/ptp_private.h   | 16 +++++++++-------
 include/linux/posix-clock.h | 35 +++++++++++++++++++++++++++--------
 kernel/time/posix-clock.c   | 36 +++++++++++++++++++++++++++---------
 4 files changed, 76 insertions(+), 32 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/ptp/ptp_chardev.c b/drivers/ptp/ptp_chardev.c
index 362bf756e6b7..0ba3e7064df2 100644
--- a/drivers/ptp/ptp_chardev.c
+++ b/drivers/ptp/ptp_chardev.c
@@ -101,14 +101,16 @@ int ptp_set_pinfunc(struct ptp_clock *ptp, unsigned int pin,
 	return 0;
 }
 
-int ptp_open(struct posix_clock *pc, fmode_t fmode)
+int ptp_open(struct posix_clock_context *pccontext, fmode_t fmode)
 {
 	return 0;
 }
 
-long ptp_ioctl(struct posix_clock *pc, unsigned int cmd, unsigned long arg)
+long ptp_ioctl(struct posix_clock_context *pccontext, unsigned int cmd,
+	       unsigned long arg)
 {
-	struct ptp_clock *ptp = container_of(pc, struct ptp_clock, clock);
+	struct ptp_clock *ptp =
+		container_of(pccontext->clk, struct ptp_clock, clock);
 	struct ptp_sys_offset_extended *extoff = NULL;
 	struct ptp_sys_offset_precise precise_offset;
 	struct system_device_crosststamp xtstamp;
@@ -432,9 +434,11 @@ out:
 	return err;
 }
 
-__poll_t ptp_poll(struct posix_clock *pc, struct file *fp, poll_table *wait)
+__poll_t ptp_poll(struct posix_clock_context *pccontext, struct file *fp,
+		  poll_table *wait)
 {
-	struct ptp_clock *ptp = container_of(pc, struct ptp_clock, clock);
+	struct ptp_clock *ptp =
+		container_of(pccontext->clk, struct ptp_clock, clock);
 
 	poll_wait(fp, &ptp->tsev_wq, wait);
 
@@ -443,10 +447,11 @@ __poll_t ptp_poll(struct posix_clock *pc, struct file *fp, poll_table *wait)
 
 #define EXTTS_BUFSIZE (PTP_BUF_TIMESTAMPS * sizeof(struct ptp_extts_event))
 
-ssize_t ptp_read(struct posix_clock *pc,
-		 uint rdflags, char __user *buf, size_t cnt)
+ssize_t ptp_read(struct posix_clock_context *pccontext, uint rdflags,
+		 char __user *buf, size_t cnt)
 {
-	struct ptp_clock *ptp = container_of(pc, struct ptp_clock, clock);
+	struct ptp_clock *ptp =
+		container_of(pccontext->clk, struct ptp_clock, clock);
 	struct timestamp_event_queue *queue = &ptp->tsevq;
 	struct ptp_extts_event *event;
 	unsigned long flags;
diff --git a/drivers/ptp/ptp_private.h b/drivers/ptp/ptp_private.h
index 75f58fc468a7..a3110c85f694 100644
--- a/drivers/ptp/ptp_private.h
+++ b/drivers/ptp/ptp_private.h
@@ -117,16 +117,18 @@ extern struct class *ptp_class;
 int ptp_set_pinfunc(struct ptp_clock *ptp, unsigned int pin,
 		    enum ptp_pin_function func, unsigned int chan);
 
-long ptp_ioctl(struct posix_clock *pc,
-	       unsigned int cmd, unsigned long arg);
+long ptp_ioctl(struct posix_clock_context *pccontext, unsigned int cmd,
+	       unsigned long arg);
 
-int ptp_open(struct posix_clock *pc, fmode_t fmode);
+int ptp_open(struct posix_clock_context *pccontext, fmode_t fmode);
 
-ssize_t ptp_read(struct posix_clock *pc,
-		 uint flags, char __user *buf, size_t cnt);
+int ptp_release(struct posix_clock_context *pccontext);
 
-__poll_t ptp_poll(struct posix_clock *pc,
-	      struct file *fp, poll_table *wait);
+ssize_t ptp_read(struct posix_clock_context *pccontext, uint flags, char __user *buf,
+		 size_t cnt);
+
+__poll_t ptp_poll(struct posix_clock_context *pccontext, struct file *fp,
+		  poll_table *wait);
 
 /*
  * see ptp_sysfs.c
diff --git a/include/linux/posix-clock.h b/include/linux/posix-clock.h
index 468328b1e1dd..ef8619f48920 100644
--- a/include/linux/posix-clock.h
+++ b/include/linux/posix-clock.h
@@ -14,6 +14,7 @@
 #include <linux/rwsem.h>
 
 struct posix_clock;
+struct posix_clock_context;
 
 /**
  * struct posix_clock_operations - functional interface to the clock
@@ -50,18 +51,18 @@ struct posix_clock_operations {
 	/*
 	 * Optional character device methods:
 	 */
-	long    (*ioctl)   (struct posix_clock *pc,
-			    unsigned int cmd, unsigned long arg);
+	long (*ioctl)(struct posix_clock_context *pccontext, unsigned int cmd,
+		      unsigned long arg);
 
-	int     (*open)    (struct posix_clock *pc, fmode_t f_mode);
+	int (*open)(struct posix_clock_context *pccontext, fmode_t f_mode);
 
-	__poll_t (*poll)   (struct posix_clock *pc,
-			    struct file *file, poll_table *wait);
+	__poll_t (*poll)(struct posix_clock_context *pccontext, struct file *file,
+			 poll_table *wait);
 
-	int     (*release) (struct posix_clock *pc);
+	int (*release)(struct posix_clock_context *pccontext);
 
-	ssize_t (*read)    (struct posix_clock *pc,
-			    uint flags, char __user *buf, size_t cnt);
+	ssize_t (*read)(struct posix_clock_context *pccontext, uint flags,
+			char __user *buf, size_t cnt);
 };
 
 /**
@@ -90,6 +91,24 @@ struct posix_clock {
 	bool zombie;
 };
 
+/**
+ * struct posix_clock_context - represents clock file operations context
+ *
+ * @clk:              Pointer to the clock
+ * @private_clkdata:  Pointer to user data
+ *
+ * Drivers should use struct posix_clock_context during specific character
+ * device file operation methods to access the posix clock.
+ *
+ * Drivers can store a private data structure during the open operation
+ * if they have specific information that is required in other file
+ * operations.
+ */
+struct posix_clock_context {
+	struct posix_clock *clk;
+	void *private_clkdata;
+};
+
 /**
  * posix_clock_register() - register a new clock
  * @clk:   Pointer to the clock. Caller must provide 'ops' field
diff --git a/kernel/time/posix-clock.c b/kernel/time/posix-clock.c
index 77c0c2370b6d..9de66bbbb3d1 100644
--- a/kernel/time/posix-clock.c
+++ b/kernel/time/posix-clock.c
@@ -19,7 +19,8 @@
  */
 static struct posix_clock *get_posix_clock(struct file *fp)
 {
-	struct posix_clock *clk = fp->private_data;
+	struct posix_clock_context *pccontext = fp->private_data;
+	struct posix_clock *clk = pccontext->clk;
 
 	down_read(&clk->rwsem);
 
@@ -39,6 +40,7 @@ static void put_posix_clock(struct posix_clock *clk)
 static ssize_t posix_clock_read(struct file *fp, char __user *buf,
 				size_t count, loff_t *ppos)
 {
+	struct posix_clock_context *pccontext = fp->private_data;
 	struct posix_clock *clk = get_posix_clock(fp);
 	int err = -EINVAL;
 
@@ -46,7 +48,7 @@ static ssize_t posix_clock_read(struct file *fp, char __user *buf,
 		return -ENODEV;
 
 	if (clk->ops.read)
-		err = clk->ops.read(clk, fp->f_flags, buf, count);
+		err = clk->ops.read(pccontext, fp->f_flags, buf, count);
 
 	put_posix_clock(clk);
 
@@ -55,6 +57,7 @@ static ssize_t posix_clock_read(struct file *fp, char __user *buf,
 
 static __poll_t posix_clock_poll(struct file *fp, poll_table *wait)
 {
+	struct posix_clock_context *pccontext = fp->private_data;
 	struct posix_clock *clk = get_posix_clock(fp);
 	__poll_t result = 0;
 
@@ -62,7 +65,7 @@ static __poll_t posix_clock_poll(struct file *fp, poll_table *wait)
 		return EPOLLERR;
 
 	if (clk->ops.poll)
-		result = clk->ops.poll(clk, fp, wait);
+		result = clk->ops.poll(pccontext, fp, wait);
 
 	put_posix_clock(clk);
 
@@ -72,6 +75,7 @@ static __poll_t posix_clock_poll(struct file *fp, poll_table *wait)
 static long posix_clock_ioctl(struct file *fp,
 			      unsigned int cmd, unsigned long arg)
 {
+	struct posix_clock_context *pccontext = fp->private_data;
 	struct posix_clock *clk = get_posix_clock(fp);
 	int err = -ENOTTY;
 
@@ -79,7 +83,7 @@ static long posix_clock_ioctl(struct file *fp,
 		return -ENODEV;
 
 	if (clk->ops.ioctl)
-		err = clk->ops.ioctl(clk, cmd, arg);
+		err = clk->ops.ioctl(pccontext, cmd, arg);
 
 	put_posix_clock(clk);
 
@@ -90,6 +94,7 @@ static long posix_clock_ioctl(struct file *fp,
 static long posix_clock_compat_ioctl(struct file *fp,
 				     unsigned int cmd, unsigned long arg)
 {
+	struct posix_clock_context *pccontext = fp->private_data;
 	struct posix_clock *clk = get_posix_clock(fp);
 	int err = -ENOTTY;
 
@@ -97,7 +102,7 @@ static long posix_clock_compat_ioctl(struct file *fp,
 		return -ENODEV;
 
 	if (clk->ops.ioctl)
-		err = clk->ops.ioctl(clk, cmd, arg);
+		err = clk->ops.ioctl(pccontext, cmd, arg);
 
 	put_posix_clock(clk);
 
@@ -110,6 +115,7 @@ static int posix_clock_open(struct inode *inode, struct file *fp)
 	int err;
 	struct posix_clock *clk =
 		container_of(inode->i_cdev, struct posix_clock, cdev);
+	struct posix_clock_context *pccontext;
 
 	down_read(&clk->rwsem);
 
@@ -117,14 +123,20 @@ static int posix_clock_open(struct inode *inode, struct file *fp)
 		err = -ENODEV;
 		goto out;
 	}
+	pccontext = kzalloc(sizeof(*pccontext), GFP_KERNEL);
+	if (!pccontext) {
+		err = -ENOMEM;
+		goto out;
+	}
+	pccontext->clk = clk;
+	fp->private_data = pccontext;
 	if (clk->ops.open)
-		err = clk->ops.open(clk, fp->f_mode);
+		err = clk->ops.open(pccontext, fp->f_mode);
 	else
 		err = 0;
 
 	if (!err) {
 		get_device(clk->dev);
-		fp->private_data = clk;
 	}
 out:
 	up_read(&clk->rwsem);
@@ -133,14 +145,20 @@ out:
 
 static int posix_clock_release(struct inode *inode, struct file *fp)
 {
-	struct posix_clock *clk = fp->private_data;
+	struct posix_clock_context *pccontext = fp->private_data;
+	struct posix_clock *clk;
 	int err = 0;
 
+	if (!pccontext)
+		return -ENODEV;
+	clk = pccontext->clk;
+
 	if (clk->ops.release)
-		err = clk->ops.release(clk);
+		err = clk->ops.release(pccontext);
 
 	put_device(clk->dev);
 
+	kfree(pccontext);
 	fp->private_data = NULL;
 
 	return err;
-- 
cgit v1.2.3


From 058b4d9de86b3f77cd23fbd43a0f5ee4ea8e0aeb Mon Sep 17 00:00:00 2001
From: Dan Carpenter <dan.carpenter@linaro.org>
Date: Wed, 11 Oct 2023 11:01:21 +0300
Subject: iommu: change iommu_map_sgtable to return signed values

The iommu_map_sgtable() function returns ssize_t and negative error
codes but it's declared as size_t instead.  I think that static checkers
would have complained if this caused a bug, but even though it doesn't
cause a bug, it's definitely worth fixing.

Signed-off-by: Dan Carpenter <dan.carpenter@linaro.org>
Reviewed-by: Lu Baolu <baolu.lu@linux.intel.com>
Link: https://lore.kernel.org/r/06672b96-23fd-424c-8880-1626e7bf119c@moroto.mountain
Signed-off-by: Joerg Roedel <jroedel@suse.de>
---
 include/linux/iommu.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/iommu.h b/include/linux/iommu.h
index 64bd20142cbe..c28178f3690a 100644
--- a/include/linux/iommu.h
+++ b/include/linux/iommu.h
@@ -1116,7 +1116,7 @@ static inline void iommu_free_global_pasid(ioasid_t pasid) {}
  * Creates a mapping at @iova for the buffer described by a scatterlist
  * stored in the given sg_table object in the provided IOMMU domain.
  */
-static inline size_t iommu_map_sgtable(struct iommu_domain *domain,
+static inline ssize_t iommu_map_sgtable(struct iommu_domain *domain,
 			unsigned long iova, struct sg_table *sgt, int prot)
 {
 	return iommu_map_sg(domain, iova, sgt->sgl, sgt->orig_nents, prot,
-- 
cgit v1.2.3


From f6ca3fb6978f94d95ee79f95085fc22e71ca17cc Mon Sep 17 00:00:00 2001
From: Rouven Czerwinski <r.czerwinski@pengutronix.de>
Date: Fri, 22 Sep 2023 16:17:16 +0200
Subject: mtd: rawnand: Ensure the nand chip supports cached reads

Both the JEDEC and ONFI specification say that read cache sequential
support is an optional command. This means that we not only need to
check whether the individual controller supports the command, we also
need to check the parameter pages for both ONFI and JEDEC NAND flashes
before enabling sequential cache reads.

This fixes support for NAND flashes which don't support enabling cache
reads, i.e. Samsung K9F4G08U0F or Toshiba TC58NVG0S3HTA00.

Sequential cache reads are now only available for ONFI and JEDEC
devices, if individual vendors implement this, it needs to be enabled
per vendor.

Tested on i.MX6Q with a Samsung NAND flash chip that doesn't support
sequential reads.

Fixes: 003fe4b9545b ("mtd: rawnand: Support for sequential cache reads")
Cc: stable@vger.kernel.org
Signed-off-by: Rouven Czerwinski <r.czerwinski@pengutronix.de>
Signed-off-by: Miquel Raynal <miquel.raynal@bootlin.com>
Link: https://lore.kernel.org/linux-mtd/20230922141717.35977-1-r.czerwinski@pengutronix.de
---
 drivers/mtd/nand/raw/nand_base.c  | 3 +++
 drivers/mtd/nand/raw/nand_jedec.c | 3 +++
 drivers/mtd/nand/raw/nand_onfi.c  | 3 +++
 include/linux/mtd/jedec.h         | 3 +++
 include/linux/mtd/onfi.h          | 1 +
 include/linux/mtd/rawnand.h       | 2 ++
 6 files changed, 15 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/mtd/nand/raw/nand_base.c b/drivers/mtd/nand/raw/nand_base.c
index d4b55155aeae..1fcac403cee6 100644
--- a/drivers/mtd/nand/raw/nand_base.c
+++ b/drivers/mtd/nand/raw/nand_base.c
@@ -5110,6 +5110,9 @@ static void rawnand_check_cont_read_support(struct nand_chip *chip)
 {
 	struct mtd_info *mtd = nand_to_mtd(chip);
 
+	if (!chip->parameters.supports_read_cache)
+		return;
+
 	if (chip->read_retries)
 		return;
 
diff --git a/drivers/mtd/nand/raw/nand_jedec.c b/drivers/mtd/nand/raw/nand_jedec.c
index 836757717660..b3cc8f360529 100644
--- a/drivers/mtd/nand/raw/nand_jedec.c
+++ b/drivers/mtd/nand/raw/nand_jedec.c
@@ -94,6 +94,9 @@ int nand_jedec_detect(struct nand_chip *chip)
 		goto free_jedec_param_page;
 	}
 
+	if (p->opt_cmd[0] & JEDEC_OPT_CMD_READ_CACHE)
+		chip->parameters.supports_read_cache = true;
+
 	memorg->pagesize = le32_to_cpu(p->byte_per_page);
 	mtd->writesize = memorg->pagesize;
 
diff --git a/drivers/mtd/nand/raw/nand_onfi.c b/drivers/mtd/nand/raw/nand_onfi.c
index f15ef90aec8c..861975e44b55 100644
--- a/drivers/mtd/nand/raw/nand_onfi.c
+++ b/drivers/mtd/nand/raw/nand_onfi.c
@@ -303,6 +303,9 @@ int nand_onfi_detect(struct nand_chip *chip)
 			   ONFI_FEATURE_ADDR_TIMING_MODE, 1);
 	}
 
+	if (le16_to_cpu(p->opt_cmd) & ONFI_OPT_CMD_READ_CACHE)
+		chip->parameters.supports_read_cache = true;
+
 	onfi = kzalloc(sizeof(*onfi), GFP_KERNEL);
 	if (!onfi) {
 		ret = -ENOMEM;
diff --git a/include/linux/mtd/jedec.h b/include/linux/mtd/jedec.h
index 0b6b59f7cfbd..56047a4e54c9 100644
--- a/include/linux/mtd/jedec.h
+++ b/include/linux/mtd/jedec.h
@@ -21,6 +21,9 @@ struct jedec_ecc_info {
 /* JEDEC features */
 #define JEDEC_FEATURE_16_BIT_BUS	(1 << 0)
 
+/* JEDEC Optional Commands */
+#define JEDEC_OPT_CMD_READ_CACHE	BIT(1)
+
 struct nand_jedec_params {
 	/* rev info and features block */
 	/* 'J' 'E' 'S' 'D'  */
diff --git a/include/linux/mtd/onfi.h b/include/linux/mtd/onfi.h
index a7376f9beddf..55ab2e4d62f9 100644
--- a/include/linux/mtd/onfi.h
+++ b/include/linux/mtd/onfi.h
@@ -55,6 +55,7 @@
 #define ONFI_SUBFEATURE_PARAM_LEN	4
 
 /* ONFI optional commands SET/GET FEATURES supported? */
+#define ONFI_OPT_CMD_READ_CACHE		BIT(1)
 #define ONFI_OPT_CMD_SET_GET_FEATURES	BIT(2)
 
 struct nand_onfi_params {
diff --git a/include/linux/mtd/rawnand.h b/include/linux/mtd/rawnand.h
index 90a141ba2a5a..c29ace15a053 100644
--- a/include/linux/mtd/rawnand.h
+++ b/include/linux/mtd/rawnand.h
@@ -225,6 +225,7 @@ struct gpio_desc;
  * struct nand_parameters - NAND generic parameters from the parameter page
  * @model: Model name
  * @supports_set_get_features: The NAND chip supports setting/getting features
+ * @supports_read_cache: The NAND chip supports read cache operations
  * @set_feature_list: Bitmap of features that can be set
  * @get_feature_list: Bitmap of features that can be get
  * @onfi: ONFI specific parameters
@@ -233,6 +234,7 @@ struct nand_parameters {
 	/* Generic parameters */
 	const char *model;
 	bool supports_set_get_features;
+	bool supports_read_cache;
 	DECLARE_BITMAP(set_feature_list, ONFI_FEATURE_NUMBER);
 	DECLARE_BITMAP(get_feature_list, ONFI_FEATURE_NUMBER);
 
-- 
cgit v1.2.3


From f447318fb1d156b4b6da79266724c7ee347d1b59 Mon Sep 17 00:00:00 2001
From: Martin Kurbanov <mmkurbanov@salutedevices.com>
Date: Mon, 2 Oct 2023 17:04:58 +0300
Subject: mtd: spinand: add support for FORESEE F35SQA002G

Add support for FORESEE F35SQA002G SPI NAND.
Datasheet:
  https://www.longsys.com/uploads/LM-00006FORESEEF35SQA002GDatasheet_1650183701.pdf

Signed-off-by: Martin Kurbanov <mmkurbanov@salutedevices.com>
Signed-off-by: Miquel Raynal <miquel.raynal@bootlin.com>
Link: https://lore.kernel.org/linux-mtd/20231002140458.147605-1-mmkurbanov@salutedevices.com
---
 drivers/mtd/nand/spi/Makefile  |  2 +-
 drivers/mtd/nand/spi/core.c    |  1 +
 drivers/mtd/nand/spi/foresee.c | 95 ++++++++++++++++++++++++++++++++++++++++++
 include/linux/mtd/spinand.h    |  1 +
 4 files changed, 98 insertions(+), 1 deletion(-)
 create mode 100644 drivers/mtd/nand/spi/foresee.c

(limited to 'include/linux')

diff --git a/drivers/mtd/nand/spi/Makefile b/drivers/mtd/nand/spi/Makefile
index cd8b66bf7740..19cc77288ebb 100644
--- a/drivers/mtd/nand/spi/Makefile
+++ b/drivers/mtd/nand/spi/Makefile
@@ -1,4 +1,4 @@
 # SPDX-License-Identifier: GPL-2.0
-spinand-objs := core.o alliancememory.o ato.o esmt.o gigadevice.o macronix.o
+spinand-objs := core.o alliancememory.o ato.o esmt.o foresee.o gigadevice.o macronix.o
 spinand-objs += micron.o paragon.o toshiba.o winbond.o xtx.o
 obj-$(CONFIG_MTD_SPI_NAND) += spinand.o
diff --git a/drivers/mtd/nand/spi/core.c b/drivers/mtd/nand/spi/core.c
index 393ff37f0d23..849ccfedbc72 100644
--- a/drivers/mtd/nand/spi/core.c
+++ b/drivers/mtd/nand/spi/core.c
@@ -940,6 +940,7 @@ static const struct spinand_manufacturer *spinand_manufacturers[] = {
 	&alliancememory_spinand_manufacturer,
 	&ato_spinand_manufacturer,
 	&esmt_c8_spinand_manufacturer,
+	&foresee_spinand_manufacturer,
 	&gigadevice_spinand_manufacturer,
 	&macronix_spinand_manufacturer,
 	&micron_spinand_manufacturer,
diff --git a/drivers/mtd/nand/spi/foresee.c b/drivers/mtd/nand/spi/foresee.c
new file mode 100644
index 000000000000..e0d2d9257045
--- /dev/null
+++ b/drivers/mtd/nand/spi/foresee.c
@@ -0,0 +1,95 @@
+// SPDX-License-Identifier: (GPL-2.0+ OR MIT)
+/*
+ * Copyright (c) 2023, SberDevices. All Rights Reserved.
+ *
+ * Author: Martin Kurbanov <mmkurbanov@salutedevices.com>
+ */
+
+#include <linux/device.h>
+#include <linux/kernel.h>
+#include <linux/mtd/spinand.h>
+
+#define SPINAND_MFR_FORESEE		0xCD
+
+static SPINAND_OP_VARIANTS(read_cache_variants,
+		SPINAND_PAGE_READ_FROM_CACHE_X4_OP(0, 1, NULL, 0),
+		SPINAND_PAGE_READ_FROM_CACHE_X2_OP(0, 1, NULL, 0),
+		SPINAND_PAGE_READ_FROM_CACHE_OP(true, 0, 1, NULL, 0),
+		SPINAND_PAGE_READ_FROM_CACHE_OP(false, 0, 1, NULL, 0));
+
+static SPINAND_OP_VARIANTS(write_cache_variants,
+		SPINAND_PROG_LOAD_X4(true, 0, NULL, 0),
+		SPINAND_PROG_LOAD(true, 0, NULL, 0));
+
+static SPINAND_OP_VARIANTS(update_cache_variants,
+		SPINAND_PROG_LOAD_X4(false, 0, NULL, 0),
+		SPINAND_PROG_LOAD(false, 0, NULL, 0));
+
+static int f35sqa002g_ooblayout_ecc(struct mtd_info *mtd, int section,
+				    struct mtd_oob_region *region)
+{
+	return -ERANGE;
+}
+
+static int f35sqa002g_ooblayout_free(struct mtd_info *mtd, int section,
+				     struct mtd_oob_region *region)
+{
+	if (section)
+		return -ERANGE;
+
+	/* Reserve 2 bytes for the BBM. */
+	region->offset = 2;
+	region->length = 62;
+
+	return 0;
+}
+
+static const struct mtd_ooblayout_ops f35sqa002g_ooblayout = {
+	.ecc = f35sqa002g_ooblayout_ecc,
+	.free = f35sqa002g_ooblayout_free,
+};
+
+static int f35sqa002g_ecc_get_status(struct spinand_device *spinand, u8 status)
+{
+	struct nand_device *nand = spinand_to_nand(spinand);
+
+	switch (status & STATUS_ECC_MASK) {
+	case STATUS_ECC_NO_BITFLIPS:
+		return 0;
+
+	case STATUS_ECC_HAS_BITFLIPS:
+		return nanddev_get_ecc_conf(nand)->strength;
+
+	default:
+		break;
+	}
+
+	/* More than 1-bit error was detected in one or more sectors and
+	 * cannot be corrected.
+	 */
+	return -EBADMSG;
+}
+
+static const struct spinand_info foresee_spinand_table[] = {
+	SPINAND_INFO("F35SQA002G",
+		     SPINAND_ID(SPINAND_READID_METHOD_OPCODE_DUMMY, 0x72, 0x72),
+		     NAND_MEMORG(1, 2048, 64, 64, 2048, 40, 1, 1, 1),
+		     NAND_ECCREQ(1, 512),
+		     SPINAND_INFO_OP_VARIANTS(&read_cache_variants,
+					      &write_cache_variants,
+					      &update_cache_variants),
+		     SPINAND_HAS_QE_BIT,
+		     SPINAND_ECCINFO(&f35sqa002g_ooblayout,
+				     f35sqa002g_ecc_get_status)),
+};
+
+static const struct spinand_manufacturer_ops foresee_spinand_manuf_ops = {
+};
+
+const struct spinand_manufacturer foresee_spinand_manufacturer = {
+	.id = SPINAND_MFR_FORESEE,
+	.name = "FORESEE",
+	.chips = foresee_spinand_table,
+	.nchips = ARRAY_SIZE(foresee_spinand_table),
+	.ops = &foresee_spinand_manuf_ops,
+};
diff --git a/include/linux/mtd/spinand.h b/include/linux/mtd/spinand.h
index 3e285c09d16d..badb4c1ac079 100644
--- a/include/linux/mtd/spinand.h
+++ b/include/linux/mtd/spinand.h
@@ -263,6 +263,7 @@ struct spinand_manufacturer {
 extern const struct spinand_manufacturer alliancememory_spinand_manufacturer;
 extern const struct spinand_manufacturer ato_spinand_manufacturer;
 extern const struct spinand_manufacturer esmt_c8_spinand_manufacturer;
+extern const struct spinand_manufacturer foresee_spinand_manufacturer;
 extern const struct spinand_manufacturer gigadevice_spinand_manufacturer;
 extern const struct spinand_manufacturer macronix_spinand_manufacturer;
 extern const struct spinand_manufacturer micron_spinand_manufacturer;
-- 
cgit v1.2.3


From 0da28d5fc808dfcfbc910870b4b0277c1a7ccb6c Mon Sep 17 00:00:00 2001
From: Laurent Pinchart <laurent.pinchart+renesas@ideasonboard.com>
Date: Fri, 15 Sep 2023 10:53:27 +0200
Subject: drm: renesas: shmobile: Remove backlight support

Backlight support should be implemented by panels, not by the LCDC
driver.  As the feature is currently unused anyway, remove it.

Signed-off-by: Laurent Pinchart <laurent.pinchart+renesas@ideasonboard.com>
[geert: Cleanups]
Reviewed-by: Laurent Pinchart <laurent.pinchart+renesas@ideasonboard.com>
Signed-off-by: Geert Uytterhoeven <geert+renesas@glider.be>
Link: https://lore.kernel.org/r/69707650245bc2193d072f24723d4d5482ea590b.1694767209.git.geert+renesas@glider.be
---
 drivers/gpu/drm/renesas/shmobile/Makefile          |  3 +-
 .../gpu/drm/renesas/shmobile/shmob_drm_backlight.c | 82 ----------------------
 .../gpu/drm/renesas/shmobile/shmob_drm_backlight.h | 19 -----
 drivers/gpu/drm/renesas/shmobile/shmob_drm_crtc.c  | 33 ++-------
 drivers/gpu/drm/renesas/shmobile/shmob_drm_crtc.h  |  8 ---
 drivers/gpu/drm/renesas/shmobile/shmob_drm_drv.h   |  2 +-
 drivers/gpu/drm/renesas/shmobile/shmob_drm_kms.c   |  2 +-
 include/linux/platform_data/shmob_drm.h            |  8 ---
 8 files changed, 7 insertions(+), 150 deletions(-)
 delete mode 100644 drivers/gpu/drm/renesas/shmobile/shmob_drm_backlight.c
 delete mode 100644 drivers/gpu/drm/renesas/shmobile/shmob_drm_backlight.h

(limited to 'include/linux')

diff --git a/drivers/gpu/drm/renesas/shmobile/Makefile b/drivers/gpu/drm/renesas/shmobile/Makefile
index 861edafed856..2679555d61a7 100644
--- a/drivers/gpu/drm/renesas/shmobile/Makefile
+++ b/drivers/gpu/drm/renesas/shmobile/Makefile
@@ -1,6 +1,5 @@
 # SPDX-License-Identifier: GPL-2.0
-shmob-drm-y := shmob_drm_backlight.o \
-	       shmob_drm_crtc.o \
+shmob-drm-y := shmob_drm_crtc.o \
 	       shmob_drm_drv.o \
 	       shmob_drm_kms.o \
 	       shmob_drm_plane.o
diff --git a/drivers/gpu/drm/renesas/shmobile/shmob_drm_backlight.c b/drivers/gpu/drm/renesas/shmobile/shmob_drm_backlight.c
deleted file mode 100644
index 794573badfe8..000000000000
--- a/drivers/gpu/drm/renesas/shmobile/shmob_drm_backlight.c
+++ /dev/null
@@ -1,82 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0+
-/*
- * shmob_drm_backlight.c  --  SH Mobile DRM Backlight
- *
- * Copyright (C) 2012 Renesas Electronics Corporation
- *
- * Laurent Pinchart (laurent.pinchart@ideasonboard.com)
- */
-
-#include <linux/backlight.h>
-
-#include "shmob_drm_backlight.h"
-#include "shmob_drm_crtc.h"
-#include "shmob_drm_drv.h"
-
-static int shmob_drm_backlight_update(struct backlight_device *bdev)
-{
-	struct shmob_drm_connector *scon = bl_get_data(bdev);
-	struct shmob_drm_device *sdev = scon->connector.dev->dev_private;
-	const struct shmob_drm_backlight_data *bdata = &sdev->pdata->backlight;
-	int brightness = backlight_get_brightness(bdev);
-
-	return bdata->set_brightness(brightness);
-}
-
-static int shmob_drm_backlight_get_brightness(struct backlight_device *bdev)
-{
-	struct shmob_drm_connector *scon = bl_get_data(bdev);
-	struct shmob_drm_device *sdev = scon->connector.dev->dev_private;
-	const struct shmob_drm_backlight_data *bdata = &sdev->pdata->backlight;
-
-	return bdata->get_brightness();
-}
-
-static const struct backlight_ops shmob_drm_backlight_ops = {
-	.options	= BL_CORE_SUSPENDRESUME,
-	.update_status	= shmob_drm_backlight_update,
-	.get_brightness	= shmob_drm_backlight_get_brightness,
-};
-
-void shmob_drm_backlight_dpms(struct shmob_drm_connector *scon, int mode)
-{
-	if (scon->backlight == NULL)
-		return;
-
-	scon->backlight->props.power = mode == DRM_MODE_DPMS_ON
-				     ? FB_BLANK_UNBLANK : FB_BLANK_POWERDOWN;
-	backlight_update_status(scon->backlight);
-}
-
-int shmob_drm_backlight_init(struct shmob_drm_connector *scon)
-{
-	struct shmob_drm_device *sdev = scon->connector.dev->dev_private;
-	const struct shmob_drm_backlight_data *bdata = &sdev->pdata->backlight;
-	struct drm_connector *connector = &scon->connector;
-	struct drm_device *dev = connector->dev;
-	struct backlight_device *backlight;
-
-	if (!bdata->max_brightness)
-		return 0;
-
-	backlight = backlight_device_register(bdata->name, dev->dev, scon,
-					      &shmob_drm_backlight_ops, NULL);
-	if (IS_ERR(backlight)) {
-		dev_err(dev->dev, "unable to register backlight device: %ld\n",
-			PTR_ERR(backlight));
-		return PTR_ERR(backlight);
-	}
-
-	backlight->props.max_brightness = bdata->max_brightness;
-	backlight->props.brightness = bdata->max_brightness;
-	backlight->props.power = FB_BLANK_POWERDOWN;
-	backlight_update_status(backlight);
-
-	scon->backlight = backlight;
-	return 0;
-}
-
-void shmob_drm_backlight_exit(struct shmob_drm_connector *scon)
-{
-	backlight_device_unregister(scon->backlight);
-}
diff --git a/drivers/gpu/drm/renesas/shmobile/shmob_drm_backlight.h b/drivers/gpu/drm/renesas/shmobile/shmob_drm_backlight.h
deleted file mode 100644
index d9abb7a60be5..000000000000
--- a/drivers/gpu/drm/renesas/shmobile/shmob_drm_backlight.h
+++ /dev/null
@@ -1,19 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0+ */
-/*
- * shmob_drm_backlight.h  --  SH Mobile DRM Backlight
- *
- * Copyright (C) 2012 Renesas Electronics Corporation
- *
- * Laurent Pinchart (laurent.pinchart@ideasonboard.com)
- */
-
-#ifndef __SHMOB_DRM_BACKLIGHT_H__
-#define __SHMOB_DRM_BACKLIGHT_H__
-
-struct shmob_drm_connector;
-
-void shmob_drm_backlight_dpms(struct shmob_drm_connector *scon, int mode);
-int shmob_drm_backlight_init(struct shmob_drm_connector *scon);
-void shmob_drm_backlight_exit(struct shmob_drm_connector *scon);
-
-#endif /* __SHMOB_DRM_BACKLIGHT_H__ */
diff --git a/drivers/gpu/drm/renesas/shmobile/shmob_drm_crtc.c b/drivers/gpu/drm/renesas/shmobile/shmob_drm_crtc.c
index 2cdf8f9b06e5..db9d8d440144 100644
--- a/drivers/gpu/drm/renesas/shmobile/shmob_drm_crtc.c
+++ b/drivers/gpu/drm/renesas/shmobile/shmob_drm_crtc.c
@@ -7,7 +7,6 @@
  * Laurent Pinchart (laurent.pinchart@ideasonboard.com)
  */
 
-#include <linux/backlight.h>
 #include <linux/clk.h>
 #include <linux/pm_runtime.h>
 
@@ -24,7 +23,6 @@
 #include <drm/drm_simple_kms_helper.h>
 #include <drm/drm_vblank.h>
 
-#include "shmob_drm_backlight.h"
 #include "shmob_drm_crtc.h"
 #include "shmob_drm_drv.h"
 #include "shmob_drm_kms.h"
@@ -487,21 +485,9 @@ int shmob_drm_crtc_create(struct shmob_drm_device *sdev)
  * Encoder
  */
 
-#define to_shmob_encoder(e) \
-	container_of(e, struct shmob_drm_encoder, encoder)
-
 static void shmob_drm_encoder_dpms(struct drm_encoder *encoder, int mode)
 {
-	struct shmob_drm_encoder *senc = to_shmob_encoder(encoder);
-	struct shmob_drm_device *sdev = encoder->dev->dev_private;
-	struct shmob_drm_connector *scon = &sdev->connector;
-
-	if (senc->dpms == mode)
-		return;
-
-	shmob_drm_backlight_dpms(scon, mode);
-
-	senc->dpms = mode;
+	/* No-op, everything is handled in the CRTC code. */
 }
 
 static bool shmob_drm_encoder_mode_fixup(struct drm_encoder *encoder,
@@ -553,11 +539,9 @@ static const struct drm_encoder_helper_funcs encoder_helper_funcs = {
 
 int shmob_drm_encoder_create(struct shmob_drm_device *sdev)
 {
-	struct drm_encoder *encoder = &sdev->encoder.encoder;
+	struct drm_encoder *encoder = &sdev->encoder;
 	int ret;
 
-	sdev->encoder.dpms = DRM_MODE_DPMS_OFF;
-
 	encoder->possible_crtcs = 1;
 
 	ret = drm_simple_encoder_init(sdev->ddev, encoder,
@@ -622,9 +606,6 @@ static const struct drm_connector_helper_funcs connector_helper_funcs = {
 
 static void shmob_drm_connector_destroy(struct drm_connector *connector)
 {
-	struct shmob_drm_connector *scon = to_shmob_connector(connector);
-
-	shmob_drm_backlight_exit(scon);
 	drm_connector_unregister(connector);
 	drm_connector_cleanup(connector);
 }
@@ -653,13 +634,9 @@ int shmob_drm_connector_create(struct shmob_drm_device *sdev,
 
 	drm_connector_helper_add(connector, &connector_helper_funcs);
 
-	ret = shmob_drm_backlight_init(&sdev->connector);
-	if (ret < 0)
-		goto err_cleanup;
-
 	ret = drm_connector_attach_encoder(connector, encoder);
 	if (ret < 0)
-		goto err_backlight;
+		goto error;
 
 	drm_helper_connector_dpms(connector, DRM_MODE_DPMS_OFF);
 	drm_object_property_set_value(&connector->base,
@@ -667,9 +644,7 @@ int shmob_drm_connector_create(struct shmob_drm_device *sdev,
 
 	return 0;
 
-err_backlight:
-	shmob_drm_backlight_exit(&sdev->connector);
-err_cleanup:
+error:
 	drm_connector_cleanup(connector);
 	return ret;
 }
diff --git a/drivers/gpu/drm/renesas/shmobile/shmob_drm_crtc.h b/drivers/gpu/drm/renesas/shmobile/shmob_drm_crtc.h
index 21718843f46d..bce692626945 100644
--- a/drivers/gpu/drm/renesas/shmobile/shmob_drm_crtc.h
+++ b/drivers/gpu/drm/renesas/shmobile/shmob_drm_crtc.h
@@ -14,7 +14,6 @@
 #include <drm/drm_connector.h>
 #include <drm/drm_encoder.h>
 
-struct backlight_device;
 struct drm_pending_vblank_event;
 struct shmob_drm_device;
 struct shmob_drm_format_info;
@@ -31,16 +30,9 @@ struct shmob_drm_crtc {
 	bool started;
 };
 
-struct shmob_drm_encoder {
-	struct drm_encoder encoder;
-	int dpms;
-};
-
 struct shmob_drm_connector {
 	struct drm_connector connector;
 	struct drm_encoder *encoder;
-
-	struct backlight_device *backlight;
 };
 
 int shmob_drm_crtc_create(struct shmob_drm_device *sdev);
diff --git a/drivers/gpu/drm/renesas/shmobile/shmob_drm_drv.h b/drivers/gpu/drm/renesas/shmobile/shmob_drm_drv.h
index 4964ddd5ab74..16d830168b2a 100644
--- a/drivers/gpu/drm/renesas/shmobile/shmob_drm_drv.h
+++ b/drivers/gpu/drm/renesas/shmobile/shmob_drm_drv.h
@@ -35,7 +35,7 @@ struct shmob_drm_device {
 	struct drm_device *ddev;
 
 	struct shmob_drm_crtc crtc;
-	struct shmob_drm_encoder encoder;
+	struct drm_encoder encoder;
 	struct shmob_drm_connector connector;
 };
 
diff --git a/drivers/gpu/drm/renesas/shmobile/shmob_drm_kms.c b/drivers/gpu/drm/renesas/shmobile/shmob_drm_kms.c
index 8fd360149743..3051318ddc79 100644
--- a/drivers/gpu/drm/renesas/shmobile/shmob_drm_kms.c
+++ b/drivers/gpu/drm/renesas/shmobile/shmob_drm_kms.c
@@ -159,7 +159,7 @@ int shmob_drm_modeset_init(struct shmob_drm_device *sdev)
 
 	shmob_drm_crtc_create(sdev);
 	shmob_drm_encoder_create(sdev);
-	shmob_drm_connector_create(sdev, &sdev->encoder.encoder);
+	shmob_drm_connector_create(sdev, &sdev->encoder);
 
 	drm_kms_helper_poll_init(sdev->ddev);
 
diff --git a/include/linux/platform_data/shmob_drm.h b/include/linux/platform_data/shmob_drm.h
index d661399b217d..b6b5b6607fb5 100644
--- a/include/linux/platform_data/shmob_drm.h
+++ b/include/linux/platform_data/shmob_drm.h
@@ -40,13 +40,6 @@ enum shmob_drm_interface {
 	SHMOB_DRM_IFACE_SYS24,		/* 24bpp */
 };
 
-struct shmob_drm_backlight_data {
-	const char *name;
-	int max_brightness;
-	int (*get_brightness)(void);
-	int (*set_brightness)(int brightness);
-};
-
 struct shmob_drm_panel_data {
 	unsigned int width_mm;		/* Panel width in mm */
 	unsigned int height_mm;		/* Panel height in mm */
@@ -83,7 +76,6 @@ struct shmob_drm_platform_data {
 	enum shmob_drm_clk_source clk_source;
 	struct shmob_drm_interface_data iface;
 	struct shmob_drm_panel_data panel;
-	struct shmob_drm_backlight_data backlight;
 };
 
 #endif /* __SHMOB_DRM_H__ */
-- 
cgit v1.2.3


From 04ed052f3ab4b3c4c3e8451522ffaa84479bf0fb Mon Sep 17 00:00:00 2001
From: Laurent Pinchart <laurent.pinchart+renesas@ideasonboard.com>
Date: Fri, 15 Sep 2023 10:53:30 +0200
Subject: drm: renesas: shmobile: Remove support for SYS panels

SYS panels are not used, and have no defined DT bindings.  Remove their
support to avoid impeding DT support.  It can always be added back
later.

Signed-off-by: Laurent Pinchart <laurent.pinchart+renesas@ideasonboard.com>
Signed-off-by: Geert Uytterhoeven <geert+renesas@glider.be>
Link: https://lore.kernel.org/r/4ccca2a5ac05c73ea9fd6e44b8bc443fd9d14e0d.1694767209.git.geert+renesas@glider.be
---
 drivers/gpu/drm/renesas/shmobile/shmob_drm_crtc.c | 20 ------------------
 drivers/gpu/drm/renesas/shmobile/shmob_drm_drv.c  | 11 ----------
 include/linux/platform_data/shmob_drm.h           | 25 -----------------------
 3 files changed, 56 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/gpu/drm/renesas/shmobile/shmob_drm_crtc.c b/drivers/gpu/drm/renesas/shmobile/shmob_drm_crtc.c
index 2ccb2fbfea26..015263ac2100 100644
--- a/drivers/gpu/drm/renesas/shmobile/shmob_drm_crtc.c
+++ b/drivers/gpu/drm/renesas/shmobile/shmob_drm_crtc.c
@@ -55,24 +55,6 @@ static void shmob_drm_crtc_setup_geometry(struct shmob_drm_crtc *scrtc)
 	      | ((idata->flags & SHMOB_DRM_IFACE_FL_DWCNT) ? LDMT1R_DWCNT : 0);
 	lcdc_write(sdev, LDMT1R, value);
 
-	if (idata->interface >= SHMOB_DRM_IFACE_SYS8A &&
-	    idata->interface <= SHMOB_DRM_IFACE_SYS24) {
-		/* Setup SYS bus. */
-		value = (idata->sys.cs_setup << LDMT2R_CSUP_SHIFT)
-		      | (idata->sys.vsync_active_high ? LDMT2R_RSV : 0)
-		      | (idata->sys.vsync_dir_input ? LDMT2R_VSEL : 0)
-		      | (idata->sys.write_setup << LDMT2R_WCSC_SHIFT)
-		      | (idata->sys.write_cycle << LDMT2R_WCEC_SHIFT)
-		      | (idata->sys.write_strobe << LDMT2R_WCLW_SHIFT);
-		lcdc_write(sdev, LDMT2R, value);
-
-		value = (idata->sys.read_latch << LDMT3R_RDLC_SHIFT)
-		      | (idata->sys.read_setup << LDMT3R_RCSC_SHIFT)
-		      | (idata->sys.read_cycle << LDMT3R_RCEC_SHIFT)
-		      | (idata->sys.read_strobe << LDMT3R_RCLW_SHIFT);
-		lcdc_write(sdev, LDMT3R, value);
-	}
-
 	value = ((mode->hdisplay / 8) << 16)			/* HDCN */
 	      | (mode->htotal / 8);				/* HTCN */
 	lcdc_write(sdev, LDHCNR, value);
@@ -180,8 +162,6 @@ static void shmob_drm_crtc_start(struct shmob_drm_crtc *scrtc)
 	lcdc_write(sdev, LDDCKSTPR, 0);
 	lcdc_wait_bit(sdev, LDDCKSTPR, ~0, 0);
 
-	/* TODO: Setup SYS panel */
-
 	/* Setup geometry, format, frame buffer memory and operation mode. */
 	shmob_drm_crtc_setup_geometry(scrtc);
 
diff --git a/drivers/gpu/drm/renesas/shmobile/shmob_drm_drv.c b/drivers/gpu/drm/renesas/shmobile/shmob_drm_drv.c
index 5f3061e88e21..9c3d8b3cf57d 100644
--- a/drivers/gpu/drm/renesas/shmobile/shmob_drm_drv.c
+++ b/drivers/gpu/drm/renesas/shmobile/shmob_drm_drv.c
@@ -43,17 +43,6 @@ static int shmob_drm_init_interface(struct shmob_drm_device *sdev)
 		[SHMOB_DRM_IFACE_RGB18] = LDMT1R_MIFTYP_RGB18,
 		[SHMOB_DRM_IFACE_RGB24] = LDMT1R_MIFTYP_RGB24,
 		[SHMOB_DRM_IFACE_YUV422] = LDMT1R_MIFTYP_YCBCR,
-		[SHMOB_DRM_IFACE_SYS8A] = LDMT1R_IFM | LDMT1R_MIFTYP_SYS8A,
-		[SHMOB_DRM_IFACE_SYS8B] = LDMT1R_IFM | LDMT1R_MIFTYP_SYS8B,
-		[SHMOB_DRM_IFACE_SYS8C] = LDMT1R_IFM | LDMT1R_MIFTYP_SYS8C,
-		[SHMOB_DRM_IFACE_SYS8D] = LDMT1R_IFM | LDMT1R_MIFTYP_SYS8D,
-		[SHMOB_DRM_IFACE_SYS9] = LDMT1R_IFM | LDMT1R_MIFTYP_SYS9,
-		[SHMOB_DRM_IFACE_SYS12] = LDMT1R_IFM | LDMT1R_MIFTYP_SYS12,
-		[SHMOB_DRM_IFACE_SYS16A] = LDMT1R_IFM | LDMT1R_MIFTYP_SYS16A,
-		[SHMOB_DRM_IFACE_SYS16B] = LDMT1R_IFM | LDMT1R_MIFTYP_SYS16B,
-		[SHMOB_DRM_IFACE_SYS16C] = LDMT1R_IFM | LDMT1R_MIFTYP_SYS16C,
-		[SHMOB_DRM_IFACE_SYS18] = LDMT1R_IFM | LDMT1R_MIFTYP_SYS18,
-		[SHMOB_DRM_IFACE_SYS24] = LDMT1R_IFM | LDMT1R_MIFTYP_SYS24,
 	};
 
 	if (sdev->pdata->iface.interface >= ARRAY_SIZE(ldmt1r)) {
diff --git a/include/linux/platform_data/shmob_drm.h b/include/linux/platform_data/shmob_drm.h
index b6b5b6607fb5..b728e24222d9 100644
--- a/include/linux/platform_data/shmob_drm.h
+++ b/include/linux/platform_data/shmob_drm.h
@@ -27,17 +27,6 @@ enum shmob_drm_interface {
 	SHMOB_DRM_IFACE_RGB18,		/* 18bpp */
 	SHMOB_DRM_IFACE_RGB24,		/* 24bpp */
 	SHMOB_DRM_IFACE_YUV422,		/* 16bpp */
-	SHMOB_DRM_IFACE_SYS8A,		/* 24bpp, 8:8:8 */
-	SHMOB_DRM_IFACE_SYS8B,		/* 18bpp, 8:8:2 */
-	SHMOB_DRM_IFACE_SYS8C,		/* 18bpp, 2:8:8 */
-	SHMOB_DRM_IFACE_SYS8D,		/* 16bpp, 8:8 */
-	SHMOB_DRM_IFACE_SYS9,		/* 18bpp, 9:9 */
-	SHMOB_DRM_IFACE_SYS12,		/* 24bpp, 12:12 */
-	SHMOB_DRM_IFACE_SYS16A,		/* 16bpp */
-	SHMOB_DRM_IFACE_SYS16B,		/* 18bpp, 16:2 */
-	SHMOB_DRM_IFACE_SYS16C,		/* 18bpp, 2:16 */
-	SHMOB_DRM_IFACE_SYS18,		/* 18bpp */
-	SHMOB_DRM_IFACE_SYS24,		/* 24bpp */
 };
 
 struct shmob_drm_panel_data {
@@ -46,19 +35,6 @@ struct shmob_drm_panel_data {
 	struct drm_mode_modeinfo mode;
 };
 
-struct shmob_drm_sys_interface_data {
-	unsigned int read_latch:6;
-	unsigned int read_setup:8;
-	unsigned int read_cycle:8;
-	unsigned int read_strobe:8;
-	unsigned int write_setup:8;
-	unsigned int write_cycle:8;
-	unsigned int write_strobe:8;
-	unsigned int cs_setup:3;
-	unsigned int vsync_active_high:1;
-	unsigned int vsync_dir_input:1;
-};
-
 #define SHMOB_DRM_IFACE_FL_DWPOL (1 << 0) /* Rising edge dot clock data latch */
 #define SHMOB_DRM_IFACE_FL_DIPOL (1 << 1) /* Active low display enable */
 #define SHMOB_DRM_IFACE_FL_DAPOL (1 << 2) /* Active low display data */
@@ -67,7 +43,6 @@ struct shmob_drm_sys_interface_data {
 
 struct shmob_drm_interface_data {
 	enum shmob_drm_interface interface;
-	struct shmob_drm_sys_interface_data sys;
 	unsigned int clk_div;
 	unsigned int flags;
 };
-- 
cgit v1.2.3


From 6a6ab0c7162b4b10ce74347e282e1bc81103a48f Mon Sep 17 00:00:00 2001
From: Laurent Pinchart <laurent.pinchart+renesas@ideasonboard.com>
Date: Fri, 15 Sep 2023 10:53:38 +0200
Subject: drm: renesas: shmobile: Use struct videomode in platform data

Replace the drm_mode_modeinfo field with videomode that includes more
signal polarity flags.  This simplifies driver handling of panel modes
and prepares for DT support.

Signed-off-by: Laurent Pinchart <laurent.pinchart+renesas@ideasonboard.com>
[geert: Simplify]
Reviewed-by: Laurent Pinchart <laurent.pinchart+renesas@ideasonboard.com>
Signed-off-by: Geert Uytterhoeven <geert+renesas@glider.be>
Link: https://lore.kernel.org/r/4312e56de424d94399c6105e7159317eae86c9d5.1694767209.git.geert+renesas@glider.be
---
 drivers/gpu/drm/renesas/shmobile/Kconfig          |  1 +
 drivers/gpu/drm/renesas/shmobile/shmob_drm_crtc.c | 35 +++++++++--------------
 drivers/gpu/drm/renesas/shmobile/shmob_drm_crtc.h |  3 ++
 include/linux/platform_data/shmob_drm.h           | 11 ++-----
 4 files changed, 20 insertions(+), 30 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/gpu/drm/renesas/shmobile/Kconfig b/drivers/gpu/drm/renesas/shmobile/Kconfig
index ba941587ca70..027220b8fe1c 100644
--- a/drivers/gpu/drm/renesas/shmobile/Kconfig
+++ b/drivers/gpu/drm/renesas/shmobile/Kconfig
@@ -6,6 +6,7 @@ config DRM_SHMOBILE
 	select BACKLIGHT_CLASS_DEVICE
 	select DRM_KMS_HELPER
 	select DRM_GEM_DMA_HELPER
+	select VIDEOMODE_HELPERS
 	help
 	  Choose this option if you have an SH Mobile chipset.
 	  If M is selected the module will be called shmob-drm.
diff --git a/drivers/gpu/drm/renesas/shmobile/shmob_drm_crtc.c b/drivers/gpu/drm/renesas/shmobile/shmob_drm_crtc.c
index f62ae047a48f..b3ef10b7828d 100644
--- a/drivers/gpu/drm/renesas/shmobile/shmob_drm_crtc.c
+++ b/drivers/gpu/drm/renesas/shmobile/shmob_drm_crtc.c
@@ -23,6 +23,8 @@
 #include <drm/drm_simple_kms_helper.h>
 #include <drm/drm_vblank.h>
 
+#include <video/videomode.h>
+
 #include "shmob_drm_crtc.h"
 #include "shmob_drm_drv.h"
 #include "shmob_drm_kms.h"
@@ -41,18 +43,16 @@ static void shmob_drm_crtc_setup_geometry(struct shmob_drm_crtc *scrtc)
 {
 	struct drm_crtc *crtc = &scrtc->crtc;
 	struct shmob_drm_device *sdev = to_shmob_device(crtc->dev);
-	const struct shmob_drm_interface_data *idata = &sdev->pdata->iface;
+	enum display_flags dpy_flags = sdev->connector.mode->flags;
 	const struct drm_display_mode *mode = &crtc->mode;
 	u32 value;
 
 	value = sdev->ldmt1r
 	      | ((mode->flags & DRM_MODE_FLAG_PVSYNC) ? 0 : LDMT1R_VPOL)
 	      | ((mode->flags & DRM_MODE_FLAG_PHSYNC) ? 0 : LDMT1R_HPOL)
-	      | ((idata->flags & SHMOB_DRM_IFACE_FL_DWPOL) ? LDMT1R_DWPOL : 0)
-	      | ((idata->flags & SHMOB_DRM_IFACE_FL_DIPOL) ? LDMT1R_DIPOL : 0)
-	      | ((idata->flags & SHMOB_DRM_IFACE_FL_DAPOL) ? LDMT1R_DAPOL : 0)
-	      | ((idata->flags & SHMOB_DRM_IFACE_FL_HSCNT) ? LDMT1R_HSCNT : 0)
-	      | ((idata->flags & SHMOB_DRM_IFACE_FL_DWCNT) ? LDMT1R_DWCNT : 0);
+	      | ((dpy_flags & DISPLAY_FLAGS_PIXDATA_POSEDGE) ? LDMT1R_DWPOL : 0)
+	      | ((dpy_flags & DISPLAY_FLAGS_DE_LOW) ? LDMT1R_DIPOL : 0);
+
 	lcdc_write(sdev, LDMT1R, value);
 
 	value = ((mode->hdisplay / 8) << 16)			/* HDCN */
@@ -548,7 +548,7 @@ static inline struct shmob_drm_connector *to_shmob_connector(struct drm_connecto
 
 static int shmob_drm_connector_get_modes(struct drm_connector *connector)
 {
-	struct shmob_drm_device *sdev = to_shmob_device(connector->dev);
+	struct shmob_drm_connector *scon = to_shmob_connector(connector);
 	struct drm_display_mode *mode;
 
 	mode = drm_mode_create(connector->dev);
@@ -556,18 +556,9 @@ static int shmob_drm_connector_get_modes(struct drm_connector *connector)
 		return 0;
 
 	mode->type = DRM_MODE_TYPE_PREFERRED | DRM_MODE_TYPE_DRIVER;
-	mode->clock = sdev->pdata->panel.mode.clock;
-	mode->hdisplay = sdev->pdata->panel.mode.hdisplay;
-	mode->hsync_start = sdev->pdata->panel.mode.hsync_start;
-	mode->hsync_end = sdev->pdata->panel.mode.hsync_end;
-	mode->htotal = sdev->pdata->panel.mode.htotal;
-	mode->vdisplay = sdev->pdata->panel.mode.vdisplay;
-	mode->vsync_start = sdev->pdata->panel.mode.vsync_start;
-	mode->vsync_end = sdev->pdata->panel.mode.vsync_end;
-	mode->vtotal = sdev->pdata->panel.mode.vtotal;
-	mode->flags = sdev->pdata->panel.mode.flags;
-
-	drm_mode_set_name(mode);
+
+	drm_display_mode_from_videomode(scon->mode, mode);
+
 	drm_mode_probed_add(connector, mode);
 
 	return 1;
@@ -601,10 +592,12 @@ static const struct drm_connector_funcs connector_funcs = {
 int shmob_drm_connector_create(struct shmob_drm_device *sdev,
 			       struct drm_encoder *encoder)
 {
-	struct drm_connector *connector = &sdev->connector.connector;
+	struct shmob_drm_connector *scon = &sdev->connector;
+	struct drm_connector *connector = &scon->connector;
 	int ret;
 
-	sdev->connector.encoder = encoder;
+	scon->encoder = encoder;
+	scon->mode = &sdev->pdata->panel.mode;
 
 	connector->display_info.width_mm = sdev->pdata->panel.width_mm;
 	connector->display_info.height_mm = sdev->pdata->panel.height_mm;
diff --git a/drivers/gpu/drm/renesas/shmobile/shmob_drm_crtc.h b/drivers/gpu/drm/renesas/shmobile/shmob_drm_crtc.h
index bce692626945..f507eaf912e1 100644
--- a/drivers/gpu/drm/renesas/shmobile/shmob_drm_crtc.h
+++ b/drivers/gpu/drm/renesas/shmobile/shmob_drm_crtc.h
@@ -14,6 +14,8 @@
 #include <drm/drm_connector.h>
 #include <drm/drm_encoder.h>
 
+#include <video/videomode.h>
+
 struct drm_pending_vblank_event;
 struct shmob_drm_device;
 struct shmob_drm_format_info;
@@ -33,6 +35,7 @@ struct shmob_drm_crtc {
 struct shmob_drm_connector {
 	struct drm_connector connector;
 	struct drm_encoder *encoder;
+	const struct videomode *mode;
 };
 
 int shmob_drm_crtc_create(struct shmob_drm_device *sdev);
diff --git a/include/linux/platform_data/shmob_drm.h b/include/linux/platform_data/shmob_drm.h
index b728e24222d9..f3cb19ff9f81 100644
--- a/include/linux/platform_data/shmob_drm.h
+++ b/include/linux/platform_data/shmob_drm.h
@@ -10,7 +10,7 @@
 #ifndef __SHMOB_DRM_H__
 #define __SHMOB_DRM_H__
 
-#include <drm/drm_mode.h>
+#include <video/videomode.h>
 
 enum shmob_drm_clk_source {
 	SHMOB_DRM_CLK_BUS,
@@ -32,19 +32,12 @@ enum shmob_drm_interface {
 struct shmob_drm_panel_data {
 	unsigned int width_mm;		/* Panel width in mm */
 	unsigned int height_mm;		/* Panel height in mm */
-	struct drm_mode_modeinfo mode;
+	struct videomode mode;
 };
 
-#define SHMOB_DRM_IFACE_FL_DWPOL (1 << 0) /* Rising edge dot clock data latch */
-#define SHMOB_DRM_IFACE_FL_DIPOL (1 << 1) /* Active low display enable */
-#define SHMOB_DRM_IFACE_FL_DAPOL (1 << 2) /* Active low display data */
-#define SHMOB_DRM_IFACE_FL_HSCNT (1 << 3) /* Disable HSYNC during VBLANK */
-#define SHMOB_DRM_IFACE_FL_DWCNT (1 << 4) /* Disable dotclock during blanking */
-
 struct shmob_drm_interface_data {
 	enum shmob_drm_interface interface;
 	unsigned int clk_div;
-	unsigned int flags;
 };
 
 struct shmob_drm_platform_data {
-- 
cgit v1.2.3


From fa32c6bc92813a76419d1c306e35394cae5028eb Mon Sep 17 00:00:00 2001
From: Geert Uytterhoeven <geert+renesas@glider.be>
Date: Fri, 15 Sep 2023 10:53:39 +0200
Subject: drm: renesas: shmobile: Use media bus formats in platform data

Replace the custom shmob_drm_interface enumeration values with standard
media bus formats.  This simplifies driver handling of bus formats and
prepares for DT support.

Signed-off-by: Geert Uytterhoeven <geert+renesas@glider.be>
Link: https://lore.kernel.org/r/0a15e5100ca30d14953c93550eb1d4c2e18de939.1694767209.git.geert+renesas@glider.be
---
 drivers/gpu/drm/renesas/shmobile/shmob_drm_drv.c | 37 ++++++++++++++----------
 include/linux/platform_data/shmob_drm.h          | 13 +--------
 2 files changed, 23 insertions(+), 27 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/gpu/drm/renesas/shmobile/shmob_drm_drv.c b/drivers/gpu/drm/renesas/shmobile/shmob_drm_drv.c
index 7e5977d3e256..db4d081fa3d9 100644
--- a/drivers/gpu/drm/renesas/shmobile/shmob_drm_drv.c
+++ b/drivers/gpu/drm/renesas/shmobile/shmob_drm_drv.c
@@ -9,6 +9,7 @@
 
 #include <linux/clk.h>
 #include <linux/io.h>
+#include <linux/media-bus-format.h>
 #include <linux/mm.h>
 #include <linux/module.h>
 #include <linux/platform_device.h>
@@ -34,25 +35,31 @@
 
 static int shmob_drm_init_interface(struct shmob_drm_device *sdev)
 {
-	static const u32 ldmt1r[] = {
-		[SHMOB_DRM_IFACE_RGB8] = LDMT1R_MIFTYP_RGB8,
-		[SHMOB_DRM_IFACE_RGB9] = LDMT1R_MIFTYP_RGB9,
-		[SHMOB_DRM_IFACE_RGB12A] = LDMT1R_MIFTYP_RGB12A,
-		[SHMOB_DRM_IFACE_RGB12B] = LDMT1R_MIFTYP_RGB12B,
-		[SHMOB_DRM_IFACE_RGB16] = LDMT1R_MIFTYP_RGB16,
-		[SHMOB_DRM_IFACE_RGB18] = LDMT1R_MIFTYP_RGB18,
-		[SHMOB_DRM_IFACE_RGB24] = LDMT1R_MIFTYP_RGB24,
-		[SHMOB_DRM_IFACE_YUV422] = LDMT1R_MIFTYP_YCBCR,
+	static const struct {
+		u32 fmt;
+		u32 ldmt1r;
+	} bus_fmts[] = {
+		{ MEDIA_BUS_FMT_RGB888_3X8,	 LDMT1R_MIFTYP_RGB8 },
+		{ MEDIA_BUS_FMT_RGB666_2X9_BE,	 LDMT1R_MIFTYP_RGB9 },
+		{ MEDIA_BUS_FMT_RGB888_2X12_BE,	 LDMT1R_MIFTYP_RGB12A },
+		{ MEDIA_BUS_FMT_RGB444_1X12,	 LDMT1R_MIFTYP_RGB12B },
+		{ MEDIA_BUS_FMT_RGB565_1X16,	 LDMT1R_MIFTYP_RGB16 },
+		{ MEDIA_BUS_FMT_RGB666_1X18,	 LDMT1R_MIFTYP_RGB18 },
+		{ MEDIA_BUS_FMT_RGB888_1X24,	 LDMT1R_MIFTYP_RGB24 },
+		{ MEDIA_BUS_FMT_UYVY8_1X16,	 LDMT1R_MIFTYP_YCBCR },
 	};
+	unsigned int i;
 
-	if (sdev->pdata->iface.interface >= ARRAY_SIZE(ldmt1r)) {
-		dev_err(sdev->dev, "invalid interface type %u\n",
-			sdev->pdata->iface.interface);
-		return -EINVAL;
+	for (i = 0; i < ARRAY_SIZE(bus_fmts); i++) {
+		if (bus_fmts[i].fmt == sdev->pdata->iface.bus_fmt) {
+			sdev->ldmt1r = bus_fmts[i].ldmt1r;
+			return 0;
+		}
 	}
 
-	sdev->ldmt1r = ldmt1r[sdev->pdata->iface.interface];
-	return 0;
+	dev_err(sdev->dev, "unsupported bus format 0x%x\n",
+		sdev->pdata->iface.bus_fmt);
+	return -EINVAL;
 }
 
 static int shmob_drm_setup_clocks(struct shmob_drm_device *sdev,
diff --git a/include/linux/platform_data/shmob_drm.h b/include/linux/platform_data/shmob_drm.h
index f3cb19ff9f81..6c19d4fbbe39 100644
--- a/include/linux/platform_data/shmob_drm.h
+++ b/include/linux/platform_data/shmob_drm.h
@@ -18,17 +18,6 @@ enum shmob_drm_clk_source {
 	SHMOB_DRM_CLK_EXTERNAL,
 };
 
-enum shmob_drm_interface {
-	SHMOB_DRM_IFACE_RGB8,		/* 24bpp, 8:8:8 */
-	SHMOB_DRM_IFACE_RGB9,		/* 18bpp, 9:9 */
-	SHMOB_DRM_IFACE_RGB12A,		/* 24bpp, 12:12 */
-	SHMOB_DRM_IFACE_RGB12B,		/* 12bpp */
-	SHMOB_DRM_IFACE_RGB16,		/* 16bpp */
-	SHMOB_DRM_IFACE_RGB18,		/* 18bpp */
-	SHMOB_DRM_IFACE_RGB24,		/* 24bpp */
-	SHMOB_DRM_IFACE_YUV422,		/* 16bpp */
-};
-
 struct shmob_drm_panel_data {
 	unsigned int width_mm;		/* Panel width in mm */
 	unsigned int height_mm;		/* Panel height in mm */
@@ -36,7 +25,7 @@ struct shmob_drm_panel_data {
 };
 
 struct shmob_drm_interface_data {
-	enum shmob_drm_interface interface;
+	unsigned int bus_fmt;		/* MEDIA_BUS_FMT_* */
 	unsigned int clk_div;
 };
 
-- 
cgit v1.2.3


From 166b76a073be8e1ffdc2c4db60f3abbbc974a3a3 Mon Sep 17 00:00:00 2001
From: Mark Rutland <mark.rutland@arm.com>
Date: Mon, 16 Oct 2023 11:24:24 +0100
Subject: clocksource/drivers/arm_arch_timer: Initialize evtstrm after
 finalizing cpucaps

We attempt to initialize each CPU's arch_timer event stream in
arch_timer_evtstrm_enable(), which we call from the
arch_timer_starting_cpu() cpu hotplug callback which is registered early
in boot. As this is registered before we initialize the system cpucaps,
the test for ARM64_HAS_ECV will always be false for CPUs present at boot
time, and will only be taken into account for CPUs onlined late
(including those which are hotplugged out and in again).

Due to this, CPUs present and boot time may not use the intended divider
and scale factor to generate the event stream, and may differ from other
CPUs.

Correct this by only initializing the event stream after cpucaps have been
finalized, registering a separate CPU hotplug callback for the event stream
configuration. Since the caps must be finalized by this point, use
cpus_have_final_cap() to verify this.

Signed-off-by: Mark Rutland <mark.rutland@arm.com>
Acked-by: Marc Zyngier <maz@kernel.org>
Acked-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Daniel Lezcano <daniel.lezcano@linaro.org>
Cc: Suzuki K Poulose <suzuki.poulose@arm.com>
Cc: Will Deacon <will@kernel.org>
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
---
 drivers/clocksource/arm_arch_timer.c | 31 ++++++++++++++++++++++++++-----
 include/linux/cpuhotplug.h           |  1 +
 2 files changed, 27 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/clocksource/arm_arch_timer.c b/drivers/clocksource/arm_arch_timer.c
index 7dd2c615bce2..d1e9e556da81 100644
--- a/drivers/clocksource/arm_arch_timer.c
+++ b/drivers/clocksource/arm_arch_timer.c
@@ -917,7 +917,7 @@ static void arch_timer_evtstrm_enable(unsigned int divider)
 
 #ifdef CONFIG_ARM64
 	/* ECV is likely to require a large divider. Use the EVNTIS flag. */
-	if (cpus_have_const_cap(ARM64_HAS_ECV) && divider > 15) {
+	if (cpus_have_final_cap(ARM64_HAS_ECV) && divider > 15) {
 		cntkctl |= ARCH_TIMER_EVT_INTERVAL_SCALE;
 		divider -= 8;
 	}
@@ -955,6 +955,30 @@ static void arch_timer_configure_evtstream(void)
 	arch_timer_evtstrm_enable(max(0, lsb));
 }
 
+static int arch_timer_evtstrm_starting_cpu(unsigned int cpu)
+{
+	arch_timer_configure_evtstream();
+	return 0;
+}
+
+static int arch_timer_evtstrm_dying_cpu(unsigned int cpu)
+{
+	cpumask_clear_cpu(smp_processor_id(), &evtstrm_available);
+	return 0;
+}
+
+static int __init arch_timer_evtstrm_register(void)
+{
+	if (!arch_timer_evt || !evtstrm_enable)
+		return 0;
+
+	return cpuhp_setup_state(CPUHP_AP_ARM_ARCH_TIMER_EVTSTRM_STARTING,
+				 "clockevents/arm/arch_timer_evtstrm:starting",
+				 arch_timer_evtstrm_starting_cpu,
+				 arch_timer_evtstrm_dying_cpu);
+}
+core_initcall(arch_timer_evtstrm_register);
+
 static void arch_counter_set_user_access(void)
 {
 	u32 cntkctl = arch_timer_get_cntkctl();
@@ -1016,8 +1040,6 @@ static int arch_timer_starting_cpu(unsigned int cpu)
 	}
 
 	arch_counter_set_user_access();
-	if (evtstrm_enable)
-		arch_timer_configure_evtstream();
 
 	return 0;
 }
@@ -1164,8 +1186,6 @@ static int arch_timer_dying_cpu(unsigned int cpu)
 {
 	struct clock_event_device *clk = this_cpu_ptr(arch_timer_evt);
 
-	cpumask_clear_cpu(smp_processor_id(), &evtstrm_available);
-
 	arch_timer_stop(clk);
 	return 0;
 }
@@ -1279,6 +1299,7 @@ out_unreg_notify:
 
 out_free:
 	free_percpu(arch_timer_evt);
+	arch_timer_evt = NULL;
 out:
 	return err;
 }
diff --git a/include/linux/cpuhotplug.h b/include/linux/cpuhotplug.h
index 068f7738be22..bb4d0bcac81b 100644
--- a/include/linux/cpuhotplug.h
+++ b/include/linux/cpuhotplug.h
@@ -172,6 +172,7 @@ enum cpuhp_state {
 	CPUHP_AP_ARM_L2X0_STARTING,
 	CPUHP_AP_EXYNOS4_MCT_TIMER_STARTING,
 	CPUHP_AP_ARM_ARCH_TIMER_STARTING,
+	CPUHP_AP_ARM_ARCH_TIMER_EVTSTRM_STARTING,
 	CPUHP_AP_ARM_GLOBAL_TIMER_STARTING,
 	CPUHP_AP_JCORE_TIMER_STARTING,
 	CPUHP_AP_ARM_TWD_STARTING,
-- 
cgit v1.2.3


From 20f3b8eafe0ba5d3c69d5011a9b07739e9645132 Mon Sep 17 00:00:00 2001
From: Mark Rutland <mark.rutland@arm.com>
Date: Mon, 16 Oct 2023 11:24:25 +0100
Subject: arm64/arm: xen: enlighten: Fix KPTI checks

When KPTI is in use, we cannot register a runstate region as XEN
requires that this is always a valid VA, which we cannot guarantee. Due
to this, xen_starting_cpu() must avoid registering each CPU's runstate
region, and xen_guest_init() must avoid setting up features that depend
upon it.

We tried to ensure that in commit:

  f88af7229f6f22ce (" xen/arm: do not setup the runstate info page if kpti is enabled")

... where we added checks for xen_kernel_unmapped_at_usr(), which wraps
arm64_kernel_unmapped_at_el0() on arm64 and is always false on 32-bit
arm.

Unfortunately, as xen_guest_init() is an early_initcall, this happens
before secondary CPUs are booted and arm64 has finalized the
ARM64_UNMAP_KERNEL_AT_EL0 cpucap which backs
arm64_kernel_unmapped_at_el0(), and so this can subsequently be set as
secondary CPUs are onlined. On a big.LITTLE system where the boot CPU
does not require KPTI but some secondary CPUs do, this will result in
xen_guest_init() intializing features that depend on the runstate
region, and xen_starting_cpu() registering the runstate region on some
CPUs before KPTI is subsequent enabled, resulting the the problems the
aforementioned commit tried to avoid.

Handle this more robsutly by deferring the initialization of the
runstate region until secondary CPUs have been initialized and the
ARM64_UNMAP_KERNEL_AT_EL0 cpucap has been finalized. The per-cpu work is
moved into a new hotplug starting function which is registered later
when we're certain that KPTI will not be used.

Fixes: f88af7229f6f ("xen/arm: do not setup the runstate info page if kpti is enabled")
Signed-off-by: Mark Rutland <mark.rutland@arm.com>
Cc: Bertrand Marquis <bertrand.marquis@arm.com>
Cc: Boris Ostrovsky <boris.ostrovsky@oracle.com>
Cc: Juergen Gross <jgross@suse.com>
Cc: Stefano Stabellini <sstabellini@kernel.org>
Cc: Suzuki K Poulose <suzuki.poulose@arm.com>
Cc: Will Deacon <will@kernel.org>
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
---
 arch/arm/xen/enlighten.c   | 25 ++++++++++++++++---------
 include/linux/cpuhotplug.h |  1 +
 2 files changed, 17 insertions(+), 9 deletions(-)

(limited to 'include/linux')

diff --git a/arch/arm/xen/enlighten.c b/arch/arm/xen/enlighten.c
index c392e18f1e43..9afdc4c4a5dc 100644
--- a/arch/arm/xen/enlighten.c
+++ b/arch/arm/xen/enlighten.c
@@ -164,9 +164,6 @@ static int xen_starting_cpu(unsigned int cpu)
 	BUG_ON(err);
 	per_cpu(xen_vcpu, cpu) = vcpup;
 
-	if (!xen_kernel_unmapped_at_usr())
-		xen_setup_runstate_info(cpu);
-
 after_register_vcpu_info:
 	enable_percpu_irq(xen_events_irq, 0);
 	return 0;
@@ -523,9 +520,6 @@ static int __init xen_guest_init(void)
 		return -EINVAL;
 	}
 
-	if (!xen_kernel_unmapped_at_usr())
-		xen_time_setup_guest();
-
 	if (xen_initial_domain())
 		pvclock_gtod_register_notifier(&xen_pvclock_gtod_notifier);
 
@@ -535,7 +529,13 @@ static int __init xen_guest_init(void)
 }
 early_initcall(xen_guest_init);
 
-static int __init xen_pm_init(void)
+static int xen_starting_runstate_cpu(unsigned int cpu)
+{
+	xen_setup_runstate_info(cpu);
+	return 0;
+}
+
+static int __init xen_late_init(void)
 {
 	if (!xen_domain())
 		return -ENODEV;
@@ -548,9 +548,16 @@ static int __init xen_pm_init(void)
 		do_settimeofday64(&ts);
 	}
 
-	return 0;
+	if (xen_kernel_unmapped_at_usr())
+		return 0;
+
+	xen_time_setup_guest();
+
+	return cpuhp_setup_state(CPUHP_AP_ARM_XEN_RUNSTATE_STARTING,
+				 "arm/xen_runstate:starting",
+				 xen_starting_runstate_cpu, NULL);
 }
-late_initcall(xen_pm_init);
+late_initcall(xen_late_init);
 
 
 /* empty stubs */
diff --git a/include/linux/cpuhotplug.h b/include/linux/cpuhotplug.h
index bb4d0bcac81b..d06e31e03a5e 100644
--- a/include/linux/cpuhotplug.h
+++ b/include/linux/cpuhotplug.h
@@ -190,6 +190,7 @@ enum cpuhp_state {
 	/* Must be the last timer callback */
 	CPUHP_AP_DUMMY_TIMER_STARTING,
 	CPUHP_AP_ARM_XEN_STARTING,
+	CPUHP_AP_ARM_XEN_RUNSTATE_STARTING,
 	CPUHP_AP_ARM_CORESIGHT_STARTING,
 	CPUHP_AP_ARM_CORESIGHT_CTI_STARTING,
 	CPUHP_AP_ARM64_ISNDEP_STARTING,
-- 
cgit v1.2.3


From ada1682d60ac6017037305166d02eb0cd5ee50fa Mon Sep 17 00:00:00 2001
From: Bartosz Golaszewski <bartosz.golaszewski@linaro.org>
Date: Wed, 4 Oct 2023 20:57:32 +0200
Subject: firmware: qcom: qseecom: add missing include guards

The qseecom header does not contain ifdef guards against multiple
inclusion. Add them.

Fixes: 00b1248606ba ("firmware: qcom_scm: Add support for Qualcomm Secure Execution Environment SCM interface")
Signed-off-by: Bartosz Golaszewski <bartosz.golaszewski@linaro.org>
Reviewed-by: Maximilian Luz <luzmaximilian@gmail.com>
Link: https://lore.kernel.org/r/20231004185732.98621-1-brgl@bgdev.pl
Signed-off-by: Bjorn Andersson <andersson@kernel.org>
---
 include/linux/firmware/qcom/qcom_qseecom.h | 6 ++++++
 1 file changed, 6 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/firmware/qcom/qcom_qseecom.h b/include/linux/firmware/qcom/qcom_qseecom.h
index b531547e1dc9..5c28298a98be 100644
--- a/include/linux/firmware/qcom/qcom_qseecom.h
+++ b/include/linux/firmware/qcom/qcom_qseecom.h
@@ -5,6 +5,10 @@
  *
  * Copyright (C) 2023 Maximilian Luz <luzmaximilian@gmail.com>
  */
+
+#ifndef __QCOM_QSEECOM_H
+#define __QCOM_QSEECOM_H
+
 #include <linux/auxiliary_bus.h>
 #include <linux/types.h>
 
@@ -44,3 +48,5 @@ static inline int qcom_qseecom_app_send(struct qseecom_client *client, void *req
 {
 	return qcom_scm_qseecom_app_send(client->app_id, req, req_size, rsp, rsp_size);
 }
+
+#endif /* __QCOM_QSEECOM_H */
-- 
cgit v1.2.3


From a8ecbc54165fca767e75a82372a7be3810c667cf Mon Sep 17 00:00:00 2001
From: Hans de Goede <hdegoede@redhat.com>
Date: Sat, 14 Oct 2023 22:53:11 +0200
Subject: spi: Export acpi_spi_find_controller_by_adev()

Export acpi_spi_find_controller_by_adev() so that ACPI glue code which
wants to dynamically create a spi_device using acpi_spi_device_alloc() or
spi_new_device() on a controller, to which the code does not already have
a reference, can find the controller.

Signed-off-by: Hans de Goede <hdegoede@redhat.com>
Link: https://lore.kernel.org/r/20231014205314.59333-2-hdegoede@redhat.com
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 drivers/spi/spi.c       | 5 ++---
 include/linux/spi/spi.h | 1 +
 2 files changed, 3 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/spi/spi.c b/drivers/spi/spi.c
index ac518be34917..791df0e69105 100644
--- a/drivers/spi/spi.c
+++ b/drivers/spi/spi.c
@@ -2523,8 +2523,6 @@ static void acpi_spi_parse_apple_properties(struct acpi_device *dev,
 		lookup->mode |= SPI_CPHA;
 }
 
-static struct spi_controller *acpi_spi_find_controller_by_adev(struct acpi_device *adev);
-
 static int acpi_spi_add_resource(struct acpi_resource *ares, void *data)
 {
 	struct acpi_spi_lookup *lookup = data;
@@ -4519,7 +4517,7 @@ static int spi_acpi_controller_match(struct device *dev, const void *data)
 	return ACPI_COMPANION(dev->parent) == data;
 }
 
-static struct spi_controller *acpi_spi_find_controller_by_adev(struct acpi_device *adev)
+struct spi_controller *acpi_spi_find_controller_by_adev(struct acpi_device *adev)
 {
 	struct device *dev;
 
@@ -4533,6 +4531,7 @@ static struct spi_controller *acpi_spi_find_controller_by_adev(struct acpi_devic
 
 	return container_of(dev, struct spi_controller, dev);
 }
+EXPORT_SYMBOL_GPL(acpi_spi_find_controller_by_adev);
 
 static struct spi_device *acpi_spi_find_device_by_adev(struct acpi_device *adev)
 {
diff --git a/include/linux/spi/spi.h b/include/linux/spi/spi.h
index 487da1f6e4b7..86825c88b576 100644
--- a/include/linux/spi/spi.h
+++ b/include/linux/spi/spi.h
@@ -867,6 +867,7 @@ extern int devm_spi_register_controller(struct device *dev,
 extern void spi_unregister_controller(struct spi_controller *ctlr);
 
 #if IS_ENABLED(CONFIG_ACPI)
+extern struct spi_controller *acpi_spi_find_controller_by_adev(struct acpi_device *adev);
 extern struct spi_device *acpi_spi_device_alloc(struct spi_controller *ctlr,
 						struct acpi_device *adev,
 						int index);
-- 
cgit v1.2.3


From 2dd10de8e6bcbacf85ad758b904543c294820c63 Mon Sep 17 00:00:00 2001
From: Alexander Aring <aahringo@redhat.com>
Date: Tue, 12 Sep 2023 17:53:18 -0400
Subject: lockd: introduce safe async lock op

This patch reverts mostly commit 40595cdc93ed ("nfs: block notification
on fs with its own ->lock") and introduces an EXPORT_OP_ASYNC_LOCK
export flag to signal that the "own ->lock" implementation supports
async lock requests. The only main user is DLM that is used by GFS2 and
OCFS2 filesystem. Those implement their own lock() implementation and
return FILE_LOCK_DEFERRED as return value. Since commit 40595cdc93ed
("nfs: block notification on fs with its own ->lock") the DLM
implementation were never updated. This patch should prepare for DLM
to set the EXPORT_OP_ASYNC_LOCK export flag and update the DLM
plock implementation regarding to it.

Acked-by: Jeff Layton <jlayton@kernel.org>
Signed-off-by: Alexander Aring <aahringo@redhat.com>
Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
---
 Documentation/filesystems/nfs/exporting.rst |  7 +++++++
 fs/lockd/svclock.c                          |  4 +---
 fs/nfsd/nfs4state.c                         | 10 +++++++---
 include/linux/exportfs.h                    | 14 ++++++++++++++
 4 files changed, 29 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/filesystems/nfs/exporting.rst b/Documentation/filesystems/nfs/exporting.rst
index 4b30daee399a..198d805d611c 100644
--- a/Documentation/filesystems/nfs/exporting.rst
+++ b/Documentation/filesystems/nfs/exporting.rst
@@ -241,3 +241,10 @@ following flags are defined:
     all of an inode's dirty data on last close. Exports that behave this
     way should set EXPORT_OP_FLUSH_ON_CLOSE so that NFSD knows to skip
     waiting for writeback when closing such files.
+
+  EXPORT_OP_ASYNC_LOCK - Indicates a capable filesystem to do async lock
+    requests from lockd. Only set EXPORT_OP_ASYNC_LOCK if the filesystem has
+    it's own ->lock() functionality as core posix_lock_file() implementation
+    has no async lock request handling yet. For more information about how to
+    indicate an async lock request from a ->lock() file_operations struct, see
+    fs/locks.c and comment for the function vfs_lock_file().
diff --git a/fs/lockd/svclock.c b/fs/lockd/svclock.c
index 43aeba9de55c..d500e32ebb18 100644
--- a/fs/lockd/svclock.c
+++ b/fs/lockd/svclock.c
@@ -481,9 +481,7 @@ nlmsvc_lock(struct svc_rqst *rqstp, struct nlm_file *file,
 	    struct nlm_host *host, struct nlm_lock *lock, int wait,
 	    struct nlm_cookie *cookie, int reclaim)
 {
-#if IS_ENABLED(CONFIG_SUNRPC_DEBUG)
 	struct inode		*inode = nlmsvc_file_inode(file);
-#endif
 	struct nlm_block	*block = NULL;
 	int			error;
 	int			mode;
@@ -497,7 +495,7 @@ nlmsvc_lock(struct svc_rqst *rqstp, struct nlm_file *file,
 				(long long)lock->fl.fl_end,
 				wait);
 
-	if (nlmsvc_file_file(file)->f_op->lock) {
+	if (!exportfs_lock_op_is_async(inode->i_sb->s_export_op)) {
 		async_block = wait;
 		wait = 0;
 	}
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index 8534693eb6a4..7cabe882724e 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -7487,6 +7487,7 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 	struct nfsd4_blocked_lock *nbl = NULL;
 	struct file_lock *file_lock = NULL;
 	struct file_lock *conflock = NULL;
+	struct super_block *sb;
 	__be32 status = 0;
 	int lkflg;
 	int err;
@@ -7508,6 +7509,7 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 		dprintk("NFSD: nfsd4_lock: permission denied!\n");
 		return status;
 	}
+	sb = cstate->current_fh.fh_dentry->d_sb;
 
 	if (lock->lk_is_new) {
 		if (nfsd4_has_session(cstate))
@@ -7559,7 +7561,8 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 	fp = lock_stp->st_stid.sc_file;
 	switch (lock->lk_type) {
 		case NFS4_READW_LT:
-			if (nfsd4_has_session(cstate))
+			if (nfsd4_has_session(cstate) ||
+			    exportfs_lock_op_is_async(sb->s_export_op))
 				fl_flags |= FL_SLEEP;
 			fallthrough;
 		case NFS4_READ_LT:
@@ -7571,7 +7574,8 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 			fl_type = F_RDLCK;
 			break;
 		case NFS4_WRITEW_LT:
-			if (nfsd4_has_session(cstate))
+			if (nfsd4_has_session(cstate) ||
+			    exportfs_lock_op_is_async(sb->s_export_op))
 				fl_flags |= FL_SLEEP;
 			fallthrough;
 		case NFS4_WRITE_LT:
@@ -7599,7 +7603,7 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 	 * for file locks), so don't attempt blocking lock notifications
 	 * on those filesystems:
 	 */
-	if (nf->nf_file->f_op->lock)
+	if (!exportfs_lock_op_is_async(sb->s_export_op))
 		fl_flags &= ~FL_SLEEP;
 
 	nbl = find_or_allocate_block(lock_sop, &fp->fi_fhandle, nn);
diff --git a/include/linux/exportfs.h b/include/linux/exportfs.h
index 11fbd0ee1370..6dd993240fcc 100644
--- a/include/linux/exportfs.h
+++ b/include/linux/exportfs.h
@@ -224,9 +224,23 @@ struct export_operations {
 						  atomic attribute updates
 						*/
 #define EXPORT_OP_FLUSH_ON_CLOSE	(0x20) /* fs flushes file data on close */
+#define EXPORT_OP_ASYNC_LOCK		(0x40) /* fs can do async lock request */
 	unsigned long	flags;
 };
 
+/**
+ * exportfs_lock_op_is_async() - export op supports async lock operation
+ * @export_ops:	the nfs export operations to check
+ *
+ * Returns true if the nfs export_operations structure has
+ * EXPORT_OP_ASYNC_LOCK in their flags set
+ */
+static inline bool
+exportfs_lock_op_is_async(const struct export_operations *export_ops)
+{
+	return export_ops->flags & EXPORT_OP_ASYNC_LOCK;
+}
+
 extern int exportfs_encode_inode_fh(struct inode *inode, struct fid *fid,
 				    int *max_len, struct inode *parent,
 				    int flags);
-- 
cgit v1.2.3


From 6ed8cdf967f7e9fc96cd1c129719ef99db2f9afc Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Mon, 11 Sep 2023 10:38:51 -0400
Subject: SUNRPC: Clean up bc_svc_process()

The test robot complained that, in some build configurations, the
@error variable in bc_svc_process's only caller is set but never
used. This happens because dprintk() is the only consumer of that
value.

 - Remove the dprintk() call sites in favor of the svc_process
   tracepoint
 - The @error variable and the return value of bc_svc_process() are
   now unused, so get rid of them.
 - The @serv parameter is set to rqstp->rq_serv by the only caller,
   and bc_svc_process() then uses it only to set rqstp->rq_serv. It
   can be removed.
 - Rename bc_svc_process() according to the convention that
   globally-visible RPC server functions have names that begin with
   "svc_"; and because it is globally-visible, give it a proper
   kdoc comment.

Reported-by: kernel test robot <lkp@intel.com>
Closes: https://lore.kernel.org/oe-kbuild-all/202308121314.HA8Rq2XG-lkp@intel.com/
Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
---
 fs/nfs/callback.c          |  6 +-----
 include/linux/sunrpc/svc.h |  3 +--
 net/sunrpc/svc.c           | 38 ++++++++++++--------------------------
 3 files changed, 14 insertions(+), 33 deletions(-)

(limited to 'include/linux')

diff --git a/fs/nfs/callback.c b/fs/nfs/callback.c
index 466ebf1d41b2..272e6d2bb478 100644
--- a/fs/nfs/callback.c
+++ b/fs/nfs/callback.c
@@ -95,7 +95,6 @@ nfs41_callback_svc(void *vrqstp)
 	struct svc_rqst *rqstp = vrqstp;
 	struct svc_serv *serv = rqstp->rq_server;
 	struct rpc_rqst *req;
-	int error;
 	DEFINE_WAIT(wq);
 
 	set_freezable();
@@ -109,10 +108,7 @@ nfs41_callback_svc(void *vrqstp)
 			list_del(&req->rq_bc_list);
 			spin_unlock_bh(&serv->sv_cb_lock);
 			finish_wait(&serv->sv_cb_waitq, &wq);
-			dprintk("Invoking bc_svc_process()\n");
-			error = bc_svc_process(serv, req, rqstp);
-			dprintk("bc_svc_process() returned w/ error code= %d\n",
-				error);
+			svc_process_bc(req, rqstp);
 		} else {
 			spin_unlock_bh(&serv->sv_cb_lock);
 			if (!kthread_should_stop())
diff --git a/include/linux/sunrpc/svc.h b/include/linux/sunrpc/svc.h
index dbf5b21feafe..0cdca5960171 100644
--- a/include/linux/sunrpc/svc.h
+++ b/include/linux/sunrpc/svc.h
@@ -413,8 +413,7 @@ struct svc_serv *  svc_create_pooled(struct svc_program *, unsigned int,
 int		   svc_set_num_threads(struct svc_serv *, struct svc_pool *, int);
 int		   svc_pool_stats_open(struct svc_serv *serv, struct file *file);
 void		   svc_process(struct svc_rqst *rqstp);
-int		   bc_svc_process(struct svc_serv *, struct rpc_rqst *,
-			struct svc_rqst *);
+void		   svc_process_bc(struct rpc_rqst *req, struct svc_rqst *rqstp);
 int		   svc_register(const struct svc_serv *, struct net *, const int,
 				const unsigned short, const unsigned short);
 
diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c
index 812fda9d45dd..a3d031deb1ec 100644
--- a/net/sunrpc/svc.c
+++ b/net/sunrpc/svc.c
@@ -1544,24 +1544,20 @@ out_drop:
 }
 
 #if defined(CONFIG_SUNRPC_BACKCHANNEL)
-/*
- * Process a backchannel RPC request that arrived over an existing
- * outbound connection
+/**
+ * svc_process_bc - process a reverse-direction RPC request
+ * @req: RPC request to be used for client-side processing
+ * @rqstp: server-side execution context
+ *
  */
-int
-bc_svc_process(struct svc_serv *serv, struct rpc_rqst *req,
-	       struct svc_rqst *rqstp)
+void svc_process_bc(struct rpc_rqst *req, struct svc_rqst *rqstp)
 {
 	struct rpc_task *task;
 	int proc_error;
-	int error;
-
-	dprintk("svc: %s(%p)\n", __func__, req);
 
 	/* Build the svc_rqst used by the common processing routine */
 	rqstp->rq_xid = req->rq_xid;
 	rqstp->rq_prot = req->rq_xprt->prot;
-	rqstp->rq_server = serv;
 	rqstp->rq_bc_net = req->rq_xprt->xprt_net;
 
 	rqstp->rq_addrlen = sizeof(req->rq_xprt->addr);
@@ -1590,10 +1586,8 @@ bc_svc_process(struct svc_serv *serv, struct rpc_rqst *req,
 	 * been processed by the caller.
 	 */
 	svcxdr_init_decode(rqstp);
-	if (!xdr_inline_decode(&rqstp->rq_arg_stream, XDR_UNIT * 2)) {
-		error = -EINVAL;
-		goto out;
-	}
+	if (!xdr_inline_decode(&rqstp->rq_arg_stream, XDR_UNIT * 2))
+		return;
 
 	/* Parse and execute the bc call */
 	proc_error = svc_process_common(rqstp);
@@ -1602,26 +1596,18 @@ bc_svc_process(struct svc_serv *serv, struct rpc_rqst *req,
 	if (!proc_error) {
 		/* Processing error: drop the request */
 		xprt_free_bc_request(req);
-		error = -EINVAL;
-		goto out;
+		return;
 	}
 	/* Finally, send the reply synchronously */
 	memcpy(&req->rq_snd_buf, &rqstp->rq_res, sizeof(req->rq_snd_buf));
 	task = rpc_run_bc_task(req);
-	if (IS_ERR(task)) {
-		error = PTR_ERR(task);
-		goto out;
-	}
+	if (IS_ERR(task))
+		return;
 
 	WARN_ON_ONCE(atomic_read(&task->tk_count) != 1);
-	error = task->tk_status;
 	rpc_put_task(task);
-
-out:
-	dprintk("svc: %s(), error=%d\n", __func__, error);
-	return error;
 }
-EXPORT_SYMBOL_GPL(bc_svc_process);
+EXPORT_SYMBOL_GPL(svc_process_bc);
 #endif /* CONFIG_SUNRPC_BACKCHANNEL */
 
 /**
-- 
cgit v1.2.3


From 063ab935a48b3a2854f433957adbb2bde396ed22 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Mon, 11 Sep 2023 10:38:58 -0400
Subject: SUNRPC: integrate back-channel processing with svc_recv()

Using svc_recv() for (NFSv4.1) back-channel handling means we have just
one mechanism for waking threads.

Also change kthread_freezable_should_stop() in nfs4_callback_svc() to
kthread_should_stop() as used elsewhere.
kthread_freezable_should_stop() effectively adds a try_to_freeze() call,
and svc_recv() already contains that at an appropriate place.

Signed-off-by: NeilBrown <neilb@suse.de>
Cc: Trond Myklebust <trond.myklebust@hammerspace.com>
Cc: Anna Schumaker <Anna.Schumaker@Netapp.com>
Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
---
 fs/nfs/callback.c                 | 42 ++-------------------------------------
 include/linux/sunrpc/svc.h        |  2 --
 net/sunrpc/backchannel_rqst.c     |  8 +++-----
 net/sunrpc/svc.c                  |  2 +-
 net/sunrpc/svc_xprt.c             | 27 +++++++++++++++++++++++++
 net/sunrpc/xprtrdma/backchannel.c |  2 +-
 6 files changed, 34 insertions(+), 49 deletions(-)

(limited to 'include/linux')

diff --git a/fs/nfs/callback.c b/fs/nfs/callback.c
index 272e6d2bb478..42a0c2f1e785 100644
--- a/fs/nfs/callback.c
+++ b/fs/nfs/callback.c
@@ -78,7 +78,7 @@ nfs4_callback_svc(void *vrqstp)
 
 	set_freezable();
 
-	while (!kthread_freezable_should_stop(NULL))
+	while (!kthread_should_stop())
 		svc_recv(rqstp);
 
 	svc_exit_thread(rqstp);
@@ -86,41 +86,6 @@ nfs4_callback_svc(void *vrqstp)
 }
 
 #if defined(CONFIG_NFS_V4_1)
-/*
- * The callback service for NFSv4.1 callbacks
- */
-static int
-nfs41_callback_svc(void *vrqstp)
-{
-	struct svc_rqst *rqstp = vrqstp;
-	struct svc_serv *serv = rqstp->rq_server;
-	struct rpc_rqst *req;
-	DEFINE_WAIT(wq);
-
-	set_freezable();
-
-	while (!kthread_freezable_should_stop(NULL)) {
-		prepare_to_wait(&serv->sv_cb_waitq, &wq, TASK_IDLE);
-		spin_lock_bh(&serv->sv_cb_lock);
-		if (!list_empty(&serv->sv_cb_list)) {
-			req = list_first_entry(&serv->sv_cb_list,
-					struct rpc_rqst, rq_bc_list);
-			list_del(&req->rq_bc_list);
-			spin_unlock_bh(&serv->sv_cb_lock);
-			finish_wait(&serv->sv_cb_waitq, &wq);
-			svc_process_bc(req, rqstp);
-		} else {
-			spin_unlock_bh(&serv->sv_cb_lock);
-			if (!kthread_should_stop())
-				schedule();
-			finish_wait(&serv->sv_cb_waitq, &wq);
-		}
-	}
-
-	svc_exit_thread(rqstp);
-	return 0;
-}
-
 static inline void nfs_callback_bc_serv(u32 minorversion, struct rpc_xprt *xprt,
 		struct svc_serv *serv)
 {
@@ -233,10 +198,7 @@ static struct svc_serv *nfs_callback_create_svc(int minorversion)
 			cb_info->users);
 
 	threadfn = nfs4_callback_svc;
-#if defined(CONFIG_NFS_V4_1)
-	if (minorversion)
-		threadfn = nfs41_callback_svc;
-#else
+#if !defined(CONFIG_NFS_V4_1)
 	if (minorversion)
 		return ERR_PTR(-ENOTSUPP);
 #endif
diff --git a/include/linux/sunrpc/svc.h b/include/linux/sunrpc/svc.h
index 0cdca5960171..acbe1314febd 100644
--- a/include/linux/sunrpc/svc.h
+++ b/include/linux/sunrpc/svc.h
@@ -92,8 +92,6 @@ struct svc_serv {
 						 * that arrive over the same
 						 * connection */
 	spinlock_t		sv_cb_lock;	/* protects the svc_cb_list */
-	wait_queue_head_t	sv_cb_waitq;	/* sleep here if there are no
-						 * entries in the svc_cb_list */
 	bool			sv_bc_enabled;	/* service uses backchannel */
 #endif /* CONFIG_SUNRPC_BACKCHANNEL */
 };
diff --git a/net/sunrpc/backchannel_rqst.c b/net/sunrpc/backchannel_rqst.c
index 65a6c6429a53..44b7c89a635f 100644
--- a/net/sunrpc/backchannel_rqst.c
+++ b/net/sunrpc/backchannel_rqst.c
@@ -349,10 +349,8 @@ found:
 }
 
 /*
- * Add callback request to callback list.  The callback
- * service sleeps on the sv_cb_waitq waiting for new
- * requests.  Wake it up after adding enqueing the
- * request.
+ * Add callback request to callback list.  Wake a thread
+ * on the first pool (usually the only pool) to handle it.
  */
 void xprt_complete_bc_request(struct rpc_rqst *req, uint32_t copied)
 {
@@ -371,6 +369,6 @@ void xprt_complete_bc_request(struct rpc_rqst *req, uint32_t copied)
 	xprt_get(xprt);
 	spin_lock(&bc_serv->sv_cb_lock);
 	list_add(&req->rq_bc_list, &bc_serv->sv_cb_list);
-	wake_up(&bc_serv->sv_cb_waitq);
 	spin_unlock(&bc_serv->sv_cb_lock);
+	svc_pool_wake_idle_thread(&bc_serv->sv_pools[0]);
 }
diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c
index a3d031deb1ec..b98a159eb17f 100644
--- a/net/sunrpc/svc.c
+++ b/net/sunrpc/svc.c
@@ -440,7 +440,6 @@ __svc_init_bc(struct svc_serv *serv)
 {
 	INIT_LIST_HEAD(&serv->sv_cb_list);
 	spin_lock_init(&serv->sv_cb_lock);
-	init_waitqueue_head(&serv->sv_cb_waitq);
 }
 #else
 static void
@@ -718,6 +717,7 @@ void svc_pool_wake_idle_thread(struct svc_pool *pool)
 
 	set_bit(SP_CONGESTED, &pool->sp_flags);
 }
+EXPORT_SYMBOL_GPL(svc_pool_wake_idle_thread);
 
 static struct svc_pool *
 svc_pool_next(struct svc_serv *serv, struct svc_pool *pool, unsigned int *state)
diff --git a/net/sunrpc/svc_xprt.c b/net/sunrpc/svc_xprt.c
index 835160da3ad4..b057f1cbe7a1 100644
--- a/net/sunrpc/svc_xprt.c
+++ b/net/sunrpc/svc_xprt.c
@@ -17,6 +17,7 @@
 #include <linux/sunrpc/svc_xprt.h>
 #include <linux/sunrpc/svcsock.h>
 #include <linux/sunrpc/xprt.h>
+#include <linux/sunrpc/bc_xprt.h>
 #include <linux/module.h>
 #include <linux/netdevice.h>
 #include <trace/events/sunrpc.h>
@@ -719,6 +720,13 @@ rqst_should_sleep(struct svc_rqst *rqstp)
 	if (freezing(current))
 		return false;
 
+#if defined(CONFIG_SUNRPC_BACKCHANNEL)
+	if (svc_is_backchannel(rqstp)) {
+		if (!list_empty(&rqstp->rq_server->sv_cb_list))
+			return false;
+	}
+#endif
+
 	return true;
 }
 
@@ -868,6 +876,25 @@ void svc_recv(struct svc_rqst *rqstp)
 		trace_svc_xprt_dequeue(rqstp);
 		svc_handle_xprt(rqstp, xprt);
 	}
+
+#if defined(CONFIG_SUNRPC_BACKCHANNEL)
+	if (svc_is_backchannel(rqstp)) {
+		struct svc_serv *serv = rqstp->rq_server;
+		struct rpc_rqst *req;
+
+		spin_lock_bh(&serv->sv_cb_lock);
+		req = list_first_entry_or_null(&serv->sv_cb_list,
+					       struct rpc_rqst, rq_bc_list);
+		if (req) {
+			list_del(&req->rq_bc_list);
+			spin_unlock_bh(&serv->sv_cb_lock);
+
+			svc_process_bc(req, rqstp);
+			return;
+		}
+		spin_unlock_bh(&serv->sv_cb_lock);
+	}
+#endif
 }
 EXPORT_SYMBOL_GPL(svc_recv);
 
diff --git a/net/sunrpc/xprtrdma/backchannel.c b/net/sunrpc/xprtrdma/backchannel.c
index e4d84a13c566..bfc434ec52a7 100644
--- a/net/sunrpc/xprtrdma/backchannel.c
+++ b/net/sunrpc/xprtrdma/backchannel.c
@@ -267,7 +267,7 @@ void rpcrdma_bc_receive_call(struct rpcrdma_xprt *r_xprt,
 	list_add(&rqst->rq_bc_list, &bc_serv->sv_cb_list);
 	spin_unlock(&bc_serv->sv_cb_lock);
 
-	wake_up(&bc_serv->sv_cb_waitq);
+	svc_pool_wake_idle_thread(&bc_serv->sv_pools[0]);
 
 	r_xprt->rx_stats.bcall_count++;
 	return;
-- 
cgit v1.2.3


From fa341560ca7458f4396d5a0771cb5f2358d8535d Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Mon, 11 Sep 2023 10:39:04 -0400
Subject: SUNRPC: change how svc threads are asked to exit.

svc threads are currently stopped using kthread_stop().  This requires
identifying a specific thread.  However we don't care which thread
stops, just as long as one does.

So instead, set a flag in the svc_pool to say that a thread needs to
die, and have each thread check this flag instead of calling
kthread_should_stop().  The first thread to find and clear this flag
then moves towards exiting.

This removes an explicit dependency on sp_all_threads which will make a
future patch simpler.

Signed-off-by: NeilBrown <neilb@suse.de>
Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
---
 fs/lockd/svc.c              |  5 ++---
 fs/lockd/svclock.c          |  5 ++---
 fs/nfs/callback.c           |  2 +-
 fs/nfsd/nfs4proc.c          |  8 +++++---
 fs/nfsd/nfssvc.c            |  2 +-
 include/linux/lockd/lockd.h |  2 +-
 include/linux/sunrpc/svc.h  | 26 +++++++++++++++++++++++++-
 net/sunrpc/svc.c            | 43 +++++++++++++++++++++----------------------
 net/sunrpc/svc_xprt.c       |  7 +++----
 9 files changed, 61 insertions(+), 39 deletions(-)

(limited to 'include/linux')

diff --git a/fs/lockd/svc.c b/fs/lockd/svc.c
index 365cc7adff66..81be07c1d3d1 100644
--- a/fs/lockd/svc.c
+++ b/fs/lockd/svc.c
@@ -24,7 +24,6 @@
 #include <linux/uio.h>
 #include <linux/smp.h>
 #include <linux/mutex.h>
-#include <linux/kthread.h>
 #include <linux/freezer.h>
 #include <linux/inetdevice.h>
 
@@ -135,11 +134,11 @@ lockd(void *vrqstp)
 	 * The main request loop. We don't terminate until the last
 	 * NFS mount or NFS daemon has gone away.
 	 */
-	while (!kthread_should_stop()) {
+	while (!svc_thread_should_stop(rqstp)) {
 		/* update sv_maxconn if it has changed */
 		rqstp->rq_server->sv_maxconn = nlm_max_connections;
 
-		nlmsvc_retry_blocked();
+		nlmsvc_retry_blocked(rqstp);
 		svc_recv(rqstp);
 	}
 	if (nlmsvc_ops)
diff --git a/fs/lockd/svclock.c b/fs/lockd/svclock.c
index 993999297e31..2dc10900ad1c 100644
--- a/fs/lockd/svclock.c
+++ b/fs/lockd/svclock.c
@@ -30,7 +30,6 @@
 #include <linux/sunrpc/svc_xprt.h>
 #include <linux/lockd/nlm.h>
 #include <linux/lockd/lockd.h>
-#include <linux/kthread.h>
 #include <linux/exportfs.h>
 
 #define NLMDBG_FACILITY		NLMDBG_SVCLOCK
@@ -1032,13 +1031,13 @@ retry_deferred_block(struct nlm_block *block)
  * be retransmitted.
  */
 void
-nlmsvc_retry_blocked(void)
+nlmsvc_retry_blocked(struct svc_rqst *rqstp)
 {
 	unsigned long	timeout = MAX_SCHEDULE_TIMEOUT;
 	struct nlm_block *block;
 
 	spin_lock(&nlm_blocked_lock);
-	while (!list_empty(&nlm_blocked) && !kthread_should_stop()) {
+	while (!list_empty(&nlm_blocked) && !svc_thread_should_stop(rqstp)) {
 		block = list_entry(nlm_blocked.next, struct nlm_block, b_list);
 
 		if (block->b_when == NLM_NEVER)
diff --git a/fs/nfs/callback.c b/fs/nfs/callback.c
index 42a0c2f1e785..4ffa1f469e90 100644
--- a/fs/nfs/callback.c
+++ b/fs/nfs/callback.c
@@ -78,7 +78,7 @@ nfs4_callback_svc(void *vrqstp)
 
 	set_freezable();
 
-	while (!kthread_should_stop())
+	while (!svc_thread_should_stop(rqstp))
 		svc_recv(rqstp);
 
 	svc_exit_thread(rqstp);
diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c
index 580c5f592408..d7e88c7beba3 100644
--- a/fs/nfsd/nfs4proc.c
+++ b/fs/nfsd/nfs4proc.c
@@ -1329,7 +1329,8 @@ extern void nfs_sb_deactive(struct super_block *sb);
  * setup a work entry in the ssc delayed unmount list.
  */
 static __be32 nfsd4_ssc_setup_dul(struct nfsd_net *nn, char *ipaddr,
-				  struct nfsd4_ssc_umount_item **nsui)
+				  struct nfsd4_ssc_umount_item **nsui,
+				  struct svc_rqst *rqstp)
 {
 	struct nfsd4_ssc_umount_item *ni = NULL;
 	struct nfsd4_ssc_umount_item *work = NULL;
@@ -1351,7 +1352,7 @@ try_again:
 			spin_unlock(&nn->nfsd_ssc_lock);
 
 			/* allow 20secs for mount/unmount for now - revisit */
-			if (kthread_should_stop() ||
+			if (svc_thread_should_stop(rqstp) ||
 					(schedule_timeout(20*HZ) == 0)) {
 				finish_wait(&nn->nfsd_ssc_waitq, &wait);
 				kfree(work);
@@ -1467,7 +1468,7 @@ nfsd4_interssc_connect(struct nl4_server *nss, struct svc_rqst *rqstp,
 		goto out_free_rawdata;
 	snprintf(dev_name, len + 5, "%s%s%s:/", startsep, ipaddr, endsep);
 
-	status = nfsd4_ssc_setup_dul(nn, ipaddr, nsui);
+	status = nfsd4_ssc_setup_dul(nn, ipaddr, nsui, rqstp);
 	if (status)
 		goto out_free_devname;
 	if ((*nsui)->nsui_vfsmount)
@@ -1642,6 +1643,7 @@ static ssize_t _nfsd_copy_file_range(struct nfsd4_copy *copy,
 	if (bytes_total == 0)
 		bytes_total = ULLONG_MAX;
 	do {
+		/* Only async copies can be stopped here */
 		if (kthread_should_stop())
 			break;
 		bytes_copied = nfsd_copy_file_range(src, src_pos, dst, dst_pos,
diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c
index c7af1095f6b5..0b03a2e50dee 100644
--- a/fs/nfsd/nfssvc.c
+++ b/fs/nfsd/nfssvc.c
@@ -957,7 +957,7 @@ nfsd(void *vrqstp)
 	/*
 	 * The main request loop
 	 */
-	while (!kthread_should_stop()) {
+	while (!svc_thread_should_stop(rqstp)) {
 		/* Update sv_maxconn if it has changed */
 		rqstp->rq_server->sv_maxconn = nn->max_connections;
 
diff --git a/include/linux/lockd/lockd.h b/include/linux/lockd/lockd.h
index 0f016d69c996..9f565416d186 100644
--- a/include/linux/lockd/lockd.h
+++ b/include/linux/lockd/lockd.h
@@ -282,7 +282,7 @@ __be32		  nlmsvc_testlock(struct svc_rqst *, struct nlm_file *,
 			struct nlm_host *, struct nlm_lock *,
 			struct nlm_lock *, struct nlm_cookie *);
 __be32		  nlmsvc_cancel_blocked(struct net *net, struct nlm_file *, struct nlm_lock *);
-void		  nlmsvc_retry_blocked(void);
+void		  nlmsvc_retry_blocked(struct svc_rqst *rqstp);
 void		  nlmsvc_traverse_blocks(struct nlm_host *, struct nlm_file *,
 					nlm_host_match_fn_t match);
 void		  nlmsvc_grant_reply(struct nlm_cookie *, __be32);
diff --git a/include/linux/sunrpc/svc.h b/include/linux/sunrpc/svc.h
index acbe1314febd..0ec691070e27 100644
--- a/include/linux/sunrpc/svc.h
+++ b/include/linux/sunrpc/svc.h
@@ -50,6 +50,8 @@ struct svc_pool {
 enum {
 	SP_TASK_PENDING,	/* still work to do even if no xprt is queued */
 	SP_CONGESTED,		/* all threads are busy, none idle */
+	SP_NEED_VICTIM,		/* One thread needs to agree to exit */
+	SP_VICTIM_REMAINS,	/* One thread needs to actually exit */
 };
 
 
@@ -259,7 +261,7 @@ enum {
 	RQ_DROPME,		/* drop current reply */
 	RQ_SPLICE_OK,		/* turned off in gss privacy to prevent
 				 * encrypting page cache pages */
-	RQ_VICTIM,		/* about to be shut down */
+	RQ_VICTIM,		/* Have agreed to shut down */
 	RQ_BUSY,		/* request is busy */
 	RQ_DATA,		/* request has data */
 };
@@ -299,6 +301,28 @@ static inline struct sockaddr *svc_daddr(const struct svc_rqst *rqst)
 	return (struct sockaddr *) &rqst->rq_daddr;
 }
 
+/**
+ * svc_thread_should_stop - check if this thread should stop
+ * @rqstp: the thread that might need to stop
+ *
+ * To stop an svc thread, the pool flags SP_NEED_VICTIM and SP_VICTIM_REMAINS
+ * are set.  The first thread which sees SP_NEED_VICTIM clears it, becoming
+ * the victim using this function.  It should then promptly call
+ * svc_exit_thread() to complete the process, clearing SP_VICTIM_REMAINS
+ * so the task waiting for a thread to exit can wake and continue.
+ *
+ * Return values:
+ *   %true: caller should invoke svc_exit_thread()
+ *   %false: caller should do nothing
+ */
+static inline bool svc_thread_should_stop(struct svc_rqst *rqstp)
+{
+	if (test_and_clear_bit(SP_NEED_VICTIM, &rqstp->rq_pool->sp_flags))
+		set_bit(RQ_VICTIM, &rqstp->rq_flags);
+
+	return test_bit(RQ_VICTIM, &rqstp->rq_flags);
+}
+
 struct svc_deferred_req {
 	u32			prot;	/* protocol (UDP or TCP) */
 	struct svc_xprt		*xprt;
diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c
index b98a159eb17f..db579bbc0a0a 100644
--- a/net/sunrpc/svc.c
+++ b/net/sunrpc/svc.c
@@ -725,19 +725,22 @@ svc_pool_next(struct svc_serv *serv, struct svc_pool *pool, unsigned int *state)
 	return pool ? pool : &serv->sv_pools[(*state)++ % serv->sv_nrpools];
 }
 
-static struct task_struct *
+static struct svc_pool *
 svc_pool_victim(struct svc_serv *serv, struct svc_pool *pool, unsigned int *state)
 {
 	unsigned int i;
-	struct task_struct *task = NULL;
 
 	if (pool != NULL) {
 		spin_lock_bh(&pool->sp_lock);
+		if (pool->sp_nrthreads)
+			goto found_pool;
+		spin_unlock_bh(&pool->sp_lock);
+		return NULL;
 	} else {
 		for (i = 0; i < serv->sv_nrpools; i++) {
 			pool = &serv->sv_pools[--(*state) % serv->sv_nrpools];
 			spin_lock_bh(&pool->sp_lock);
-			if (!list_empty(&pool->sp_all_threads))
+			if (pool->sp_nrthreads)
 				goto found_pool;
 			spin_unlock_bh(&pool->sp_lock);
 		}
@@ -745,16 +748,10 @@ svc_pool_victim(struct svc_serv *serv, struct svc_pool *pool, unsigned int *stat
 	}
 
 found_pool:
-	if (!list_empty(&pool->sp_all_threads)) {
-		struct svc_rqst *rqstp;
-
-		rqstp = list_entry(pool->sp_all_threads.next, struct svc_rqst, rq_all);
-		set_bit(RQ_VICTIM, &rqstp->rq_flags);
-		list_del_rcu(&rqstp->rq_all);
-		task = rqstp->rq_task;
-	}
+	set_bit(SP_VICTIM_REMAINS, &pool->sp_flags);
+	set_bit(SP_NEED_VICTIM, &pool->sp_flags);
 	spin_unlock_bh(&pool->sp_lock);
-	return task;
+	return pool;
 }
 
 static int
@@ -795,18 +792,16 @@ svc_start_kthreads(struct svc_serv *serv, struct svc_pool *pool, int nrservs)
 static int
 svc_stop_kthreads(struct svc_serv *serv, struct svc_pool *pool, int nrservs)
 {
-	struct svc_rqst	*rqstp;
-	struct task_struct *task;
 	unsigned int state = serv->sv_nrthreads-1;
+	struct svc_pool *victim;
 
 	do {
-		task = svc_pool_victim(serv, pool, &state);
-		if (task == NULL)
+		victim = svc_pool_victim(serv, pool, &state);
+		if (!victim)
 			break;
-		rqstp = kthread_data(task);
-		/* Did we lose a race to svo_function threadfn? */
-		if (kthread_stop(task) == -EINTR)
-			svc_exit_thread(rqstp);
+		svc_pool_wake_idle_thread(victim);
+		wait_on_bit(&victim->sp_flags, SP_VICTIM_REMAINS,
+			    TASK_IDLE);
 		nrservs++;
 	} while (nrservs < 0);
 	return 0;
@@ -926,8 +921,7 @@ svc_exit_thread(struct svc_rqst *rqstp)
 
 	spin_lock_bh(&pool->sp_lock);
 	pool->sp_nrthreads--;
-	if (!test_and_set_bit(RQ_VICTIM, &rqstp->rq_flags))
-		list_del_rcu(&rqstp->rq_all);
+	list_del_rcu(&rqstp->rq_all);
 	spin_unlock_bh(&pool->sp_lock);
 
 	spin_lock_bh(&serv->sv_lock);
@@ -938,6 +932,11 @@ svc_exit_thread(struct svc_rqst *rqstp)
 	svc_rqst_free(rqstp);
 
 	svc_put(serv);
+	/* That svc_put() cannot be the last, because the thread
+	 * waiting for SP_VICTIM_REMAINS to clear must hold
+	 * a reference. So it is still safe to access pool.
+	 */
+	clear_and_wake_up_bit(SP_VICTIM_REMAINS, &pool->sp_flags);
 }
 EXPORT_SYMBOL_GPL(svc_exit_thread);
 
diff --git a/net/sunrpc/svc_xprt.c b/net/sunrpc/svc_xprt.c
index b057f1cbe7a1..b8539545fefd 100644
--- a/net/sunrpc/svc_xprt.c
+++ b/net/sunrpc/svc_xprt.c
@@ -9,7 +9,6 @@
 #include <linux/sched/mm.h>
 #include <linux/errno.h>
 #include <linux/freezer.h>
-#include <linux/kthread.h>
 #include <linux/slab.h>
 #include <net/sock.h>
 #include <linux/sunrpc/addr.h>
@@ -675,7 +674,7 @@ static bool svc_alloc_arg(struct svc_rqst *rqstp)
 			continue;
 
 		set_current_state(TASK_IDLE);
-		if (kthread_should_stop()) {
+		if (svc_thread_should_stop(rqstp)) {
 			set_current_state(TASK_RUNNING);
 			return false;
 		}
@@ -713,7 +712,7 @@ rqst_should_sleep(struct svc_rqst *rqstp)
 		return false;
 
 	/* are we shutting down? */
-	if (kthread_should_stop())
+	if (svc_thread_should_stop(rqstp))
 		return false;
 
 	/* are we freezing? */
@@ -858,7 +857,7 @@ void svc_recv(struct svc_rqst *rqstp)
 
 	clear_bit(SP_TASK_PENDING, &pool->sp_flags);
 
-	if (kthread_should_stop())
+	if (svc_thread_should_stop(rqstp))
 		return;
 
 	rqstp->rq_xprt = svc_xprt_dequeue(pool);
-- 
cgit v1.2.3


From 5ff817b23534dd3942f881ab01dd5050505517aa Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Mon, 11 Sep 2023 10:39:11 -0400
Subject: SUNRPC: add list of idle threads

Rather than searching a list of threads to find an idle one, having a
list of idle threads allows an idle thread to be found immediately.

This adds some spin_lock calls which is not ideal, but as the hold-time
is tiny it is still faster than searching a list.  A future patch will
remove them using llist.h.  This involves some subtlety and so is left
to a separate patch.

This removes the need for the RQ_BUSY flag.  The rqst is "busy"
precisely when it is not on the "idle" list.

Signed-off-by: NeilBrown <neilb@suse.de>
Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
---
 include/linux/sunrpc/svc.h    | 25 ++++++++++++++++++++++++-
 include/trace/events/sunrpc.h |  1 -
 net/sunrpc/svc.c              | 14 +++++++++-----
 net/sunrpc/svc_xprt.c         | 15 +++++++++++----
 4 files changed, 44 insertions(+), 11 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sunrpc/svc.h b/include/linux/sunrpc/svc.h
index 0ec691070e27..e9c34e99bc88 100644
--- a/include/linux/sunrpc/svc.h
+++ b/include/linux/sunrpc/svc.h
@@ -37,6 +37,7 @@ struct svc_pool {
 	struct list_head	sp_sockets;	/* pending sockets */
 	unsigned int		sp_nrthreads;	/* # of threads in pool */
 	struct list_head	sp_all_threads;	/* all server threads */
+	struct list_head	sp_idle_threads; /* idle server threads */
 
 	/* statistics on pool operation */
 	struct percpu_counter	sp_messages_arrived;
@@ -186,6 +187,7 @@ extern u32 svc_max_payload(const struct svc_rqst *rqstp);
  */
 struct svc_rqst {
 	struct list_head	rq_all;		/* all threads list */
+	struct list_head	rq_idle;	/* On the idle list */
 	struct rcu_head		rq_rcu_head;	/* for RCU deferred kfree */
 	struct svc_xprt *	rq_xprt;	/* transport ptr */
 
@@ -262,10 +264,31 @@ enum {
 	RQ_SPLICE_OK,		/* turned off in gss privacy to prevent
 				 * encrypting page cache pages */
 	RQ_VICTIM,		/* Have agreed to shut down */
-	RQ_BUSY,		/* request is busy */
 	RQ_DATA,		/* request has data */
 };
 
+/**
+ * svc_thread_set_busy - mark a thread as busy
+ * @rqstp: the thread which is now busy
+ *
+ * If rq_idle is "empty", the thread must be busy.
+ */
+static inline void svc_thread_set_busy(struct svc_rqst *rqstp)
+{
+	INIT_LIST_HEAD(&rqstp->rq_idle);
+}
+
+/**
+ * svc_thread_busy - check if a thread as busy
+ * @rqstp: the thread which might be busy
+ *
+ * If rq_idle is "empty", the thread must be busy.
+ */
+static inline bool svc_thread_busy(struct svc_rqst *rqstp)
+{
+	return list_empty(&rqstp->rq_idle);
+}
+
 #define SVC_NET(rqst) (rqst->rq_xprt ? rqst->rq_xprt->xpt_net : rqst->rq_bc_net)
 
 /*
diff --git a/include/trace/events/sunrpc.h b/include/trace/events/sunrpc.h
index 6beb38c1dcb5..337c90787fb1 100644
--- a/include/trace/events/sunrpc.h
+++ b/include/trace/events/sunrpc.h
@@ -1677,7 +1677,6 @@ DEFINE_SVCXDRBUF_EVENT(sendto);
 	svc_rqst_flag(DROPME)						\
 	svc_rqst_flag(SPLICE_OK)					\
 	svc_rqst_flag(VICTIM)						\
-	svc_rqst_flag(BUSY)						\
 	svc_rqst_flag_end(DATA)
 
 #undef svc_rqst_flag
diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c
index db579bbc0a0a..9d080fe2dcdf 100644
--- a/net/sunrpc/svc.c
+++ b/net/sunrpc/svc.c
@@ -510,6 +510,7 @@ __svc_create(struct svc_program *prog, unsigned int bufsize, int npools,
 		pool->sp_id = i;
 		INIT_LIST_HEAD(&pool->sp_sockets);
 		INIT_LIST_HEAD(&pool->sp_all_threads);
+		INIT_LIST_HEAD(&pool->sp_idle_threads);
 		spin_lock_init(&pool->sp_lock);
 
 		percpu_counter_init(&pool->sp_messages_arrived, 0, GFP_KERNEL);
@@ -641,7 +642,7 @@ svc_rqst_alloc(struct svc_serv *serv, struct svc_pool *pool, int node)
 
 	folio_batch_init(&rqstp->rq_fbatch);
 
-	__set_bit(RQ_BUSY, &rqstp->rq_flags);
+	svc_thread_set_busy(rqstp);
 	rqstp->rq_server = serv;
 	rqstp->rq_pool = pool;
 
@@ -702,10 +703,13 @@ void svc_pool_wake_idle_thread(struct svc_pool *pool)
 	struct svc_rqst	*rqstp;
 
 	rcu_read_lock();
-	list_for_each_entry_rcu(rqstp, &pool->sp_all_threads, rq_all) {
-		if (test_and_set_bit(RQ_BUSY, &rqstp->rq_flags))
-			continue;
-
+	spin_lock_bh(&pool->sp_lock);
+	rqstp = list_first_entry_or_null(&pool->sp_idle_threads,
+					 struct svc_rqst, rq_idle);
+	if (rqstp)
+		list_del_init(&rqstp->rq_idle);
+	spin_unlock_bh(&pool->sp_lock);
+	if (rqstp) {
 		WRITE_ONCE(rqstp->rq_qtime, ktime_get());
 		wake_up_process(rqstp->rq_task);
 		rcu_read_unlock();
diff --git a/net/sunrpc/svc_xprt.c b/net/sunrpc/svc_xprt.c
index b8539545fefd..ebfeeb504a79 100644
--- a/net/sunrpc/svc_xprt.c
+++ b/net/sunrpc/svc_xprt.c
@@ -737,8 +737,9 @@ static void svc_rqst_wait_for_work(struct svc_rqst *rqstp)
 		set_current_state(TASK_IDLE);
 		smp_mb__before_atomic();
 		clear_bit(SP_CONGESTED, &pool->sp_flags);
-		clear_bit(RQ_BUSY, &rqstp->rq_flags);
-		smp_mb__after_atomic();
+		spin_lock_bh(&pool->sp_lock);
+		list_add(&rqstp->rq_idle, &pool->sp_idle_threads);
+		spin_unlock_bh(&pool->sp_lock);
 
 		/* Need to check should_sleep() again after
 		 * setting task state in case a wakeup happened
@@ -751,8 +752,14 @@ static void svc_rqst_wait_for_work(struct svc_rqst *rqstp)
 			cond_resched();
 		}
 
-		set_bit(RQ_BUSY, &rqstp->rq_flags);
-		smp_mb__after_atomic();
+		/* We *must* be removed from the list before we can continue.
+		 * If we were woken, this is already done
+		 */
+		if (!svc_thread_busy(rqstp)) {
+			spin_lock_bh(&pool->sp_lock);
+			list_del_init(&rqstp->rq_idle);
+			spin_unlock_bh(&pool->sp_lock);
+		}
 	} else {
 		cond_resched();
 	}
-- 
cgit v1.2.3


From 2b65a226840c0e86db0e7926856a0a017b3390f2 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Mon, 11 Sep 2023 10:39:17 -0400
Subject: SUNRPC: discard SP_CONGESTED

We can tell if a pool is congested by checking if the idle list is
empty.  We don't need a separate flag.

Signed-off-by: NeilBrown <neilb@suse.de>
Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
---
 include/linux/sunrpc/svc.h | 1 -
 net/sunrpc/svc.c           | 1 -
 net/sunrpc/svc_xprt.c      | 4 +---
 3 files changed, 1 insertion(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sunrpc/svc.h b/include/linux/sunrpc/svc.h
index e9c34e99bc88..22b3018ebf62 100644
--- a/include/linux/sunrpc/svc.h
+++ b/include/linux/sunrpc/svc.h
@@ -50,7 +50,6 @@ struct svc_pool {
 /* bits for sp_flags */
 enum {
 	SP_TASK_PENDING,	/* still work to do even if no xprt is queued */
-	SP_CONGESTED,		/* all threads are busy, none idle */
 	SP_NEED_VICTIM,		/* One thread needs to agree to exit */
 	SP_VICTIM_REMAINS,	/* One thread needs to actually exit */
 };
diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c
index 9d080fe2dcdf..db4674211f36 100644
--- a/net/sunrpc/svc.c
+++ b/net/sunrpc/svc.c
@@ -719,7 +719,6 @@ void svc_pool_wake_idle_thread(struct svc_pool *pool)
 	}
 	rcu_read_unlock();
 
-	set_bit(SP_CONGESTED, &pool->sp_flags);
 }
 EXPORT_SYMBOL_GPL(svc_pool_wake_idle_thread);
 
diff --git a/net/sunrpc/svc_xprt.c b/net/sunrpc/svc_xprt.c
index ebfeeb504a79..fa0d854a5596 100644
--- a/net/sunrpc/svc_xprt.c
+++ b/net/sunrpc/svc_xprt.c
@@ -735,8 +735,6 @@ static void svc_rqst_wait_for_work(struct svc_rqst *rqstp)
 
 	if (rqst_should_sleep(rqstp)) {
 		set_current_state(TASK_IDLE);
-		smp_mb__before_atomic();
-		clear_bit(SP_CONGESTED, &pool->sp_flags);
 		spin_lock_bh(&pool->sp_lock);
 		list_add(&rqstp->rq_idle, &pool->sp_idle_threads);
 		spin_unlock_bh(&pool->sp_lock);
@@ -874,7 +872,7 @@ void svc_recv(struct svc_rqst *rqstp)
 		/* Normally we will wait up to 5 seconds for any required
 		 * cache information to be provided.
 		 */
-		if (!test_bit(SP_CONGESTED, &pool->sp_flags))
+		if (!list_empty(&pool->sp_idle_threads))
 			rqstp->rq_chandle.thread_wait = 5 * HZ;
 		else
 			rqstp->rq_chandle.thread_wait = 1 * HZ;
-- 
cgit v1.2.3


From d6b3358a2813bb14791259a2227d9af1e7019ca0 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Mon, 11 Sep 2023 10:39:24 -0400
Subject: llist: add interface to check if a node is on a list.

With list.h lists, it is easy to test if a node is on a list, providing
it was initialised and that it is removed with list_del_init().

This patch provides similar functionality for llist.h lists.

 init_llist_node()
marks a node as being not-on-any-list be setting the ->next pointer to
the node itself.
 llist_on_list()
tests if the node is on any list.
 llist_del_first_init()
remove the first element from a llist, and marks it as being off-list.

Signed-off-by: NeilBrown <neilb@suse.de>
Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
---
 include/linux/llist.h | 42 ++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 42 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/llist.h b/include/linux/llist.h
index 85bda2d02d65..dcb91e3bac1c 100644
--- a/include/linux/llist.h
+++ b/include/linux/llist.h
@@ -73,6 +73,33 @@ static inline void init_llist_head(struct llist_head *list)
 	list->first = NULL;
 }
 
+/**
+ * init_llist_node - initialize lock-less list node
+ * @node:	the node to be initialised
+ *
+ * In cases where there is a need to test if a node is on
+ * a list or not, this initialises the node to clearly
+ * not be on any list.
+ */
+static inline void init_llist_node(struct llist_node *node)
+{
+	node->next = node;
+}
+
+/**
+ * llist_on_list - test if a lock-list list node is on a list
+ * @node:	the node to test
+ *
+ * When a node is on a list the ->next pointer will be NULL or
+ * some other node.  It can never point to itself.  We use that
+ * in init_llist_node() to record that a node is not on any list,
+ * and here to test whether it is on any list.
+ */
+static inline bool llist_on_list(const struct llist_node *node)
+{
+	return node->next != node;
+}
+
 /**
  * llist_entry - get the struct of this entry
  * @ptr:	the &struct llist_node pointer.
@@ -249,6 +276,21 @@ static inline struct llist_node *__llist_del_all(struct llist_head *head)
 
 extern struct llist_node *llist_del_first(struct llist_head *head);
 
+/**
+ * llist_del_first_init - delete first entry from lock-list and mark is as being off-list
+ * @head:	the head of lock-less list to delete from.
+ *
+ * This behave the same as llist_del_first() except that llist_init_node() is called
+ * on the returned node so that llist_on_list() will report false for the node.
+ */
+static inline struct llist_node *llist_del_first_init(struct llist_head *head)
+{
+	struct llist_node *n = llist_del_first(head);
+
+	if (n)
+		init_llist_node(n);
+	return n;
+}
 struct llist_node *llist_reverse_order(struct llist_node *head);
 
 #endif /* LLIST_H */
-- 
cgit v1.2.3


From 9bd4161c591710f152a8cd3ed85ea928c61e26ca Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Mon, 11 Sep 2023 10:39:30 -0400
Subject: SUNRPC: change service idle list to be an llist

With an llist we don't need to take a lock to add a thread to the list,
though we still need a lock to remove it.  That will go in the next
patch.

Unlike double-linked lists, a thread cannot reliably remove itself from
the list.  Only the first thread can be removed, and that can change
asynchronously.  So some care is needed.

We already check if there is pending work to do, so we are unlikely to
add ourselves to the idle list and then want to remove ourselves again.

If we DO find something needs to be done after adding ourselves to the
list, we simply wake up the first thread on the list.  If that was us,
we successfully removed ourselves and can continue.  If it was some
other thread, they will do the work that needs to be done.  We can
safely sleep until woken.

We also remove the test on freezing() from rqst_should_sleep().  Instead
we set TASK_FREEZABLE before scheduling.  This makes is safe to
schedule() when a freeze is pending.  As we now loop waiting to be
removed from the idle queue, this is a cleaner way to handle freezing.

Signed-off-by: NeilBrown <neilb@suse.de>
Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
---
 include/linux/sunrpc/svc.h | 21 +++++----------------
 net/sunrpc/svc.c           | 14 +++++++-------
 net/sunrpc/svc_xprt.c      | 45 +++++++++++++++++++--------------------------
 3 files changed, 31 insertions(+), 49 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sunrpc/svc.h b/include/linux/sunrpc/svc.h
index 22b3018ebf62..ad4572630335 100644
--- a/include/linux/sunrpc/svc.h
+++ b/include/linux/sunrpc/svc.h
@@ -37,7 +37,7 @@ struct svc_pool {
 	struct list_head	sp_sockets;	/* pending sockets */
 	unsigned int		sp_nrthreads;	/* # of threads in pool */
 	struct list_head	sp_all_threads;	/* all server threads */
-	struct list_head	sp_idle_threads; /* idle server threads */
+	struct llist_head	sp_idle_threads; /* idle server threads */
 
 	/* statistics on pool operation */
 	struct percpu_counter	sp_messages_arrived;
@@ -186,7 +186,7 @@ extern u32 svc_max_payload(const struct svc_rqst *rqstp);
  */
 struct svc_rqst {
 	struct list_head	rq_all;		/* all threads list */
-	struct list_head	rq_idle;	/* On the idle list */
+	struct llist_node	rq_idle;	/* On the idle list */
 	struct rcu_head		rq_rcu_head;	/* for RCU deferred kfree */
 	struct svc_xprt *	rq_xprt;	/* transport ptr */
 
@@ -266,26 +266,15 @@ enum {
 	RQ_DATA,		/* request has data */
 };
 
-/**
- * svc_thread_set_busy - mark a thread as busy
- * @rqstp: the thread which is now busy
- *
- * If rq_idle is "empty", the thread must be busy.
- */
-static inline void svc_thread_set_busy(struct svc_rqst *rqstp)
-{
-	INIT_LIST_HEAD(&rqstp->rq_idle);
-}
-
 /**
  * svc_thread_busy - check if a thread as busy
  * @rqstp: the thread which might be busy
  *
- * If rq_idle is "empty", the thread must be busy.
+ * A thread is only busy when it is not an the idle list.
  */
-static inline bool svc_thread_busy(struct svc_rqst *rqstp)
+static inline bool svc_thread_busy(const struct svc_rqst *rqstp)
 {
-	return list_empty(&rqstp->rq_idle);
+	return !llist_on_list(&rqstp->rq_idle);
 }
 
 #define SVC_NET(rqst) (rqst->rq_xprt ? rqst->rq_xprt->xpt_net : rqst->rq_bc_net)
diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c
index db4674211f36..54ae6a569f6a 100644
--- a/net/sunrpc/svc.c
+++ b/net/sunrpc/svc.c
@@ -510,7 +510,7 @@ __svc_create(struct svc_program *prog, unsigned int bufsize, int npools,
 		pool->sp_id = i;
 		INIT_LIST_HEAD(&pool->sp_sockets);
 		INIT_LIST_HEAD(&pool->sp_all_threads);
-		INIT_LIST_HEAD(&pool->sp_idle_threads);
+		init_llist_head(&pool->sp_idle_threads);
 		spin_lock_init(&pool->sp_lock);
 
 		percpu_counter_init(&pool->sp_messages_arrived, 0, GFP_KERNEL);
@@ -642,7 +642,7 @@ svc_rqst_alloc(struct svc_serv *serv, struct svc_pool *pool, int node)
 
 	folio_batch_init(&rqstp->rq_fbatch);
 
-	svc_thread_set_busy(rqstp);
+	init_llist_node(&rqstp->rq_idle);
 	rqstp->rq_server = serv;
 	rqstp->rq_pool = pool;
 
@@ -701,15 +701,15 @@ svc_prepare_thread(struct svc_serv *serv, struct svc_pool *pool, int node)
 void svc_pool_wake_idle_thread(struct svc_pool *pool)
 {
 	struct svc_rqst	*rqstp;
+	struct llist_node *ln;
 
 	rcu_read_lock();
 	spin_lock_bh(&pool->sp_lock);
-	rqstp = list_first_entry_or_null(&pool->sp_idle_threads,
-					 struct svc_rqst, rq_idle);
-	if (rqstp)
-		list_del_init(&rqstp->rq_idle);
+	ln = llist_del_first_init(&pool->sp_idle_threads);
 	spin_unlock_bh(&pool->sp_lock);
-	if (rqstp) {
+	if (ln) {
+		rqstp = llist_entry(ln, struct svc_rqst, rq_idle);
+
 		WRITE_ONCE(rqstp->rq_qtime, ktime_get());
 		wake_up_process(rqstp->rq_task);
 		rcu_read_unlock();
diff --git a/net/sunrpc/svc_xprt.c b/net/sunrpc/svc_xprt.c
index fa0d854a5596..17c43bde35c9 100644
--- a/net/sunrpc/svc_xprt.c
+++ b/net/sunrpc/svc_xprt.c
@@ -715,10 +715,6 @@ rqst_should_sleep(struct svc_rqst *rqstp)
 	if (svc_thread_should_stop(rqstp))
 		return false;
 
-	/* are we freezing? */
-	if (freezing(current))
-		return false;
-
 #if defined(CONFIG_SUNRPC_BACKCHANNEL)
 	if (svc_is_backchannel(rqstp)) {
 		if (!list_empty(&rqstp->rq_server->sv_cb_list))
@@ -734,30 +730,26 @@ static void svc_rqst_wait_for_work(struct svc_rqst *rqstp)
 	struct svc_pool *pool = rqstp->rq_pool;
 
 	if (rqst_should_sleep(rqstp)) {
-		set_current_state(TASK_IDLE);
-		spin_lock_bh(&pool->sp_lock);
-		list_add(&rqstp->rq_idle, &pool->sp_idle_threads);
-		spin_unlock_bh(&pool->sp_lock);
+		set_current_state(TASK_IDLE | TASK_FREEZABLE);
+		llist_add(&rqstp->rq_idle, &pool->sp_idle_threads);
+
+		if (unlikely(!rqst_should_sleep(rqstp)))
+			/* Work just became available.  This thread cannot simply
+			 * choose not to sleep as it *must* wait until removed.
+			 * So wake the first waiter - whether it is this
+			 * thread or some other, it will get the work done.
+			 */
+			svc_pool_wake_idle_thread(pool);
 
-		/* Need to check should_sleep() again after
-		 * setting task state in case a wakeup happened
-		 * between testing and setting.
+		/* Since a thread cannot remove itself from an llist,
+		 * schedule until someone else removes @rqstp from
+		 * the idle list.
 		 */
-		if (rqst_should_sleep(rqstp)) {
+		while (!svc_thread_busy(rqstp)) {
 			schedule();
-		} else {
-			__set_current_state(TASK_RUNNING);
-			cond_resched();
-		}
-
-		/* We *must* be removed from the list before we can continue.
-		 * If we were woken, this is already done
-		 */
-		if (!svc_thread_busy(rqstp)) {
-			spin_lock_bh(&pool->sp_lock);
-			list_del_init(&rqstp->rq_idle);
-			spin_unlock_bh(&pool->sp_lock);
+			set_current_state(TASK_IDLE | TASK_FREEZABLE);
 		}
+		__set_current_state(TASK_RUNNING);
 	} else {
 		cond_resched();
 	}
@@ -870,9 +862,10 @@ void svc_recv(struct svc_rqst *rqstp)
 		struct svc_xprt *xprt = rqstp->rq_xprt;
 
 		/* Normally we will wait up to 5 seconds for any required
-		 * cache information to be provided.
+		 * cache information to be provided.  When there are no
+		 * idle threads, we reduce the wait time.
 		 */
-		if (!list_empty(&pool->sp_idle_threads))
+		if (pool->sp_idle_threads.first)
 			rqstp->rq_chandle.thread_wait = 5 * HZ;
 		else
 			rqstp->rq_chandle.thread_wait = 1 * HZ;
-- 
cgit v1.2.3


From 8a3e5975ed11dd16d81dc501b514ab01986db94e Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Mon, 11 Sep 2023 10:39:37 -0400
Subject: llist: add llist_del_first_this()

llist_del_first_this() deletes a specific entry from an llist, providing
it is at the head of the list.  Multiple threads can call this
concurrently providing they each offer a different entry.

This can be uses for a set of worker threads which are on the llist when
they are idle.  The head can always be woken, and when it is woken it
can remove itself, and possibly wake the next if there is an excess of
work to do.

Signed-off-by: NeilBrown <neilb@suse.de>
Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
---
 include/linux/llist.h |  4 ++++
 lib/llist.c           | 28 ++++++++++++++++++++++++++++
 2 files changed, 32 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/llist.h b/include/linux/llist.h
index dcb91e3bac1c..2c982ff7475a 100644
--- a/include/linux/llist.h
+++ b/include/linux/llist.h
@@ -291,6 +291,10 @@ static inline struct llist_node *llist_del_first_init(struct llist_head *head)
 		init_llist_node(n);
 	return n;
 }
+
+extern bool llist_del_first_this(struct llist_head *head,
+				 struct llist_node *this);
+
 struct llist_node *llist_reverse_order(struct llist_node *head);
 
 #endif /* LLIST_H */
diff --git a/lib/llist.c b/lib/llist.c
index 6e668fa5a2c6..f21d0cfbbaaa 100644
--- a/lib/llist.c
+++ b/lib/llist.c
@@ -65,6 +65,34 @@ struct llist_node *llist_del_first(struct llist_head *head)
 }
 EXPORT_SYMBOL_GPL(llist_del_first);
 
+/**
+ * llist_del_first_this - delete given entry of lock-less list if it is first
+ * @head:	the head for your lock-less list
+ * @this:	a list entry.
+ *
+ * If head of the list is given entry, delete and return %true else
+ * return %false.
+ *
+ * Multiple callers can safely call this concurrently with multiple
+ * llist_add() callers, providing all the callers offer a different @this.
+ */
+bool llist_del_first_this(struct llist_head *head,
+			  struct llist_node *this)
+{
+	struct llist_node *entry, *next;
+
+	/* acquire ensures orderig wrt try_cmpxchg() is llist_del_first() */
+	entry = smp_load_acquire(&head->first);
+	do {
+		if (entry != this)
+			return false;
+		next = READ_ONCE(entry->next);
+	} while (!try_cmpxchg(&head->first, &entry, next));
+
+	return true;
+}
+EXPORT_SYMBOL_GPL(llist_del_first_this);
+
 /**
  * llist_reverse_order - reverse order of a llist chain
  * @head:	first item of the list to be reversed
-- 
cgit v1.2.3


From de9e82c355f2ae73c04aec84f73fc2657cf7dfdd Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Mon, 11 Sep 2023 10:39:43 -0400
Subject: lib: add light-weight queuing mechanism.

lwq is a FIFO single-linked queue that only requires a spinlock
for dequeueing, which happens in process context.  Enqueueing is atomic
with no spinlock and can happen in any context.

This is particularly useful when work items are queued from BH or IRQ
context, and when they are handled one at a time by dedicated threads.

Avoiding any locking when enqueueing means there is no need to disable
BH or interrupts, which is generally best avoided (particularly when
there are any RT tasks on the machine).

This solution is superior to using "list_head" links because we need
half as many pointers in the data structures, and because list_head
lists would need locking to add items to the queue.

This solution is superior to a bespoke solution as all locking and
container_of casting is integrated, so the interface is simple.

Despite the similar name, this solution meets a distinctly different
need to kfifo.  kfifo provides a fixed sized circular buffer to which
data can be added at one end and removed at the other, and does not
provide any locking.  lwq does not have any size limit and works with
data structures (objects?) rather than data (bytes).

A unit test for basic functionality, which runs at boot time, is included.

Signed-off-by: NeilBrown <neilb@suse.de>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: "Liam R. Howlett" <Liam.Howlett@oracle.com>
Cc: Kees Cook <keescook@chromium.org>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: David Gow <davidgow@google.com>
Cc: linux-kernel@vger.kernel.org
Message-Id: <20230911111333.4d1a872330e924a00acb905b@linux-foundation.org>
Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
---
 include/linux/lwq.h | 124 +++++++++++++++++++++++++++++++++++++++++
 lib/Kconfig         |   5 ++
 lib/Makefile        |   2 +-
 lib/lwq.c           | 158 ++++++++++++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 288 insertions(+), 1 deletion(-)
 create mode 100644 include/linux/lwq.h
 create mode 100644 lib/lwq.c

(limited to 'include/linux')

diff --git a/include/linux/lwq.h b/include/linux/lwq.h
new file mode 100644
index 000000000000..d081d5cf8e33
--- /dev/null
+++ b/include/linux/lwq.h
@@ -0,0 +1,124 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+
+#ifndef LWQ_H
+#define LWQ_H
+/*
+ * Light-weight single-linked queue built from llist
+ *
+ * Entries can be enqueued from any context with no locking.
+ * Entries can be dequeued from process context with integrated locking.
+ *
+ * This is particularly suitable when work items are queued in
+ * BH or IRQ context, and where work items are handled one at a time
+ * by dedicated threads.
+ */
+#include <linux/container_of.h>
+#include <linux/spinlock.h>
+#include <linux/llist.h>
+
+struct lwq_node {
+	struct llist_node node;
+};
+
+struct lwq {
+	spinlock_t		lock;
+	struct llist_node	*ready;		/* entries to be dequeued */
+	struct llist_head	new;		/* entries being enqueued */
+};
+
+/**
+ * lwq_init - initialise a lwq
+ * @q:	the lwq object
+ */
+static inline void lwq_init(struct lwq *q)
+{
+	spin_lock_init(&q->lock);
+	q->ready = NULL;
+	init_llist_head(&q->new);
+}
+
+/**
+ * lwq_empty - test if lwq contains any entry
+ * @q:	the lwq object
+ *
+ * This empty test contains an acquire barrier so that if a wakeup
+ * is sent when lwq_dequeue returns true, it is safe to go to sleep after
+ * a test on lwq_empty().
+ */
+static inline bool lwq_empty(struct lwq *q)
+{
+	/* acquire ensures ordering wrt lwq_enqueue() */
+	return smp_load_acquire(&q->ready) == NULL && llist_empty(&q->new);
+}
+
+struct llist_node *__lwq_dequeue(struct lwq *q);
+/**
+ * lwq_dequeue - dequeue first (oldest) entry from lwq
+ * @q:		the queue to dequeue from
+ * @type:	the type of object to return
+ * @member:	them member in returned object which is an lwq_node.
+ *
+ * Remove a single object from the lwq and return it.  This will take
+ * a spinlock and so must always be called in the same context, typcially
+ * process contet.
+ */
+#define lwq_dequeue(q, type, member)					\
+	({ struct llist_node *_n = __lwq_dequeue(q);			\
+	  _n ? container_of(_n, type, member.node) : NULL; })
+
+struct llist_node *lwq_dequeue_all(struct lwq *q);
+
+/**
+ * lwq_for_each_safe - iterate over detached queue allowing deletion
+ * @_n:		iterator variable
+ * @_t1:	temporary struct llist_node **
+ * @_t2:	temporary struct llist_node *
+ * @_l:		address of llist_node pointer from lwq_dequeue_all()
+ * @_member:	member in _n where lwq_node is found.
+ *
+ * Iterate over members in a dequeued list.  If the iterator variable
+ * is set to NULL, the iterator removes that entry from the queue.
+ */
+#define lwq_for_each_safe(_n, _t1, _t2, _l, _member)			\
+	for (_t1 = (_l);						\
+	     *(_t1) ? (_n = container_of(*(_t1), typeof(*(_n)), _member.node),\
+		       _t2 = ((*_t1)->next),				\
+		       true)						\
+	     : false;							\
+	     (_n) ? (_t1 = &(_n)->_member.node.next, 0)			\
+	     : ((*(_t1) = (_t2)),  0))
+
+/**
+ * lwq_enqueue - add a new item to the end of the queue
+ * @n	- the lwq_node embedded in the item to be added
+ * @q	- the lwq to append to.
+ *
+ * No locking is needed to append to the queue so this can
+ * be called from any context.
+ * Return %true is the list may have previously been empty.
+ */
+static inline bool lwq_enqueue(struct lwq_node *n, struct lwq *q)
+{
+	/* acquire enqures ordering wrt lwq_dequeue */
+	return llist_add(&n->node, &q->new) &&
+		smp_load_acquire(&q->ready) == NULL;
+}
+
+/**
+ * lwq_enqueue_batch - add a list of new items to the end of the queue
+ * @n	- the lwq_node embedded in the first item to be added
+ * @q	- the lwq to append to.
+ *
+ * No locking is needed to append to the queue so this can
+ * be called from any context.
+ * Return %true is the list may have previously been empty.
+ */
+static inline bool lwq_enqueue_batch(struct llist_node *n, struct lwq *q)
+{
+	struct llist_node *e = n;
+
+	/* acquire enqures ordering wrt lwq_dequeue */
+	return llist_add_batch(llist_reverse_order(n), e, &q->new) &&
+		smp_load_acquire(&q->ready) == NULL;
+}
+#endif /* LWQ_H */
diff --git a/lib/Kconfig b/lib/Kconfig
index c686f4adc124..76fe64f933fc 100644
--- a/lib/Kconfig
+++ b/lib/Kconfig
@@ -729,6 +729,11 @@ config PARMAN
 config OBJAGG
 	tristate "objagg" if COMPILE_TEST
 
+config LWQ_TEST
+	bool "Boot-time test for lwq queuing"
+	help
+          Run boot-time test of light-weight queuing.
+
 endmenu
 
 config GENERIC_IOREMAP
diff --git a/lib/Makefile b/lib/Makefile
index 740109b6e2c8..d0c116b706e6 100644
--- a/lib/Makefile
+++ b/lib/Makefile
@@ -45,7 +45,7 @@ obj-y	+= lockref.o
 obj-y += bcd.o sort.o parser.o debug_locks.o random32.o \
 	 bust_spinlocks.o kasprintf.o bitmap.o scatterlist.o \
 	 list_sort.o uuid.o iov_iter.o clz_ctz.o \
-	 bsearch.o find_bit.o llist.o memweight.o kfifo.o \
+	 bsearch.o find_bit.o llist.o lwq.o memweight.o kfifo.o \
 	 percpu-refcount.o rhashtable.o base64.o \
 	 once.o refcount.o rcuref.o usercopy.o errseq.o bucket_locks.o \
 	 generic-radix-tree.o
diff --git a/lib/lwq.c b/lib/lwq.c
new file mode 100644
index 000000000000..57d080a4d53d
--- /dev/null
+++ b/lib/lwq.c
@@ -0,0 +1,158 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Light-weight single-linked queue.
+ *
+ * Entries are enqueued to the head of an llist, with no blocking.
+ * This can happen in any context.
+ *
+ * Entries are dequeued using a spinlock to protect against multiple
+ * access.  The llist is staged in reverse order, and refreshed
+ * from the llist when it exhausts.
+ *
+ * This is particularly suitable when work items are queued in BH or
+ * IRQ context, and where work items are handled one at a time by
+ * dedicated threads.
+ */
+#include <linux/rcupdate.h>
+#include <linux/lwq.h>
+
+struct llist_node *__lwq_dequeue(struct lwq *q)
+{
+	struct llist_node *this;
+
+	if (lwq_empty(q))
+		return NULL;
+	spin_lock(&q->lock);
+	this = q->ready;
+	if (!this && !llist_empty(&q->new)) {
+		/* ensure queue doesn't appear transiently lwq_empty */
+		smp_store_release(&q->ready, (void *)1);
+		this = llist_reverse_order(llist_del_all(&q->new));
+		if (!this)
+			q->ready = NULL;
+	}
+	if (this)
+		q->ready = llist_next(this);
+	spin_unlock(&q->lock);
+	return this;
+}
+EXPORT_SYMBOL_GPL(__lwq_dequeue);
+
+/**
+ * lwq_dequeue_all - dequeue all currently enqueued objects
+ * @q:	the queue to dequeue from
+ *
+ * Remove and return a linked list of llist_nodes of all the objects that were
+ * in the queue. The first on the list will be the object that was least
+ * recently enqueued.
+ */
+struct llist_node *lwq_dequeue_all(struct lwq *q)
+{
+	struct llist_node *r, *t, **ep;
+
+	if (lwq_empty(q))
+		return NULL;
+
+	spin_lock(&q->lock);
+	r = q->ready;
+	q->ready = NULL;
+	t = llist_del_all(&q->new);
+	spin_unlock(&q->lock);
+	ep = &r;
+	while (*ep)
+		ep = &(*ep)->next;
+	*ep = llist_reverse_order(t);
+	return r;
+}
+EXPORT_SYMBOL_GPL(lwq_dequeue_all);
+
+#if IS_ENABLED(CONFIG_LWQ_TEST)
+
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/wait_bit.h>
+#include <linux/kthread.h>
+#include <linux/delay.h>
+struct tnode {
+	struct lwq_node n;
+	int i;
+	int c;
+};
+
+static int lwq_exercise(void *qv)
+{
+	struct lwq *q = qv;
+	int cnt;
+	struct tnode *t;
+
+	for (cnt = 0; cnt < 10000; cnt++) {
+		wait_var_event(q, (t = lwq_dequeue(q, struct tnode, n)) != NULL);
+		t->c++;
+		if (lwq_enqueue(&t->n, q))
+			wake_up_var(q);
+	}
+	while (!kthread_should_stop())
+		schedule_timeout_idle(1);
+	return 0;
+}
+
+static int lwq_test(void)
+{
+	int i;
+	struct lwq q;
+	struct llist_node *l, **t1, *t2;
+	struct tnode *t;
+	struct task_struct *threads[8];
+
+	printk(KERN_INFO "testing lwq....\n");
+	lwq_init(&q);
+	printk(KERN_INFO " lwq: run some threads\n");
+	for (i = 0; i < ARRAY_SIZE(threads); i++)
+		threads[i] = kthread_run(lwq_exercise, &q, "lwq-test-%d", i);
+	for (i = 0; i < 100; i++) {
+		t = kmalloc(sizeof(*t), GFP_KERNEL);
+		if (!t)
+			break;
+		t->i = i;
+		t->c = 0;
+		if (lwq_enqueue(&t->n, &q))
+			wake_up_var(&q);
+	}
+	/* wait for threads to exit */
+	for (i = 0; i < ARRAY_SIZE(threads); i++)
+		if (!IS_ERR_OR_NULL(threads[i]))
+			kthread_stop(threads[i]);
+	printk(KERN_INFO " lwq: dequeue first 50:");
+	for (i = 0; i < 50 ; i++) {
+		if (i && (i % 10) == 0) {
+			printk(KERN_CONT "\n");
+			printk(KERN_INFO " lwq: ... ");
+		}
+		t = lwq_dequeue(&q, struct tnode, n);
+		if (t)
+			printk(KERN_CONT " %d(%d)", t->i, t->c);
+		kfree(t);
+	}
+	printk(KERN_CONT "\n");
+	l = lwq_dequeue_all(&q);
+	printk(KERN_INFO " lwq: delete the multiples of 3 (test lwq_for_each_safe())\n");
+	lwq_for_each_safe(t, t1, t2, &l, n) {
+		if ((t->i % 3) == 0) {
+			t->i = -1;
+			kfree(t);
+			t = NULL;
+		}
+	}
+	if (l)
+		lwq_enqueue_batch(l, &q);
+	printk(KERN_INFO " lwq: dequeue remaining:");
+	while ((t = lwq_dequeue(&q, struct tnode, n)) != NULL) {
+		printk(KERN_CONT " %d", t->i);
+		kfree(t);
+	}
+	printk(KERN_CONT "\n");
+	return 0;
+}
+
+module_init(lwq_test);
+#endif /* CONFIG_LWQ_TEST*/
-- 
cgit v1.2.3


From 5b80147e0c70181654e8e54eae99f69b2bf891b1 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Mon, 11 Sep 2023 10:39:56 -0400
Subject: SUNRPC: only have one thread waking up at a time

Currently if several items of work become available in quick succession,
that number of threads (if available) will be woken.  By the time some
of them wake up another thread that was already cache-warm might have
come along and completed the work.  Anecdotal evidence suggests as many
as 15% of wakes find nothing to do once they get to the point of
looking.

This patch changes svc_pool_wake_idle_thread() to wake the first thread
on the queue but NOT remove it.  Subsequent calls will wake the same
thread.  Once that thread starts it will dequeue itself and after
dequeueing some work to do, it will wake the next thread if there is more
work ready.  This results in a more orderly increase in the number of
busy threads.

As a bonus, this allows us to reduce locking around the idle queue.
svc_pool_wake_idle_thread() no longer needs to take a lock (beyond
rcu_read_lock()) as it doesn't manipulate the queue, it just looks at
the first item.

The thread itself can avoid locking by using the new
llist_del_first_this() interface.  This will safely remove the thread
itself if it is the head.  If it isn't the head, it will do nothing.
If multiple threads call this concurrently only one will succeed.  The
others will do nothing, so no corruption can result.

If a thread wakes up and finds that it cannot dequeue itself that means
either
- that it wasn't woken because it was the head of the queue.  Maybe the
  freezer woke it.  In that case it can go back to sleep (after trying
  to freeze of course).
- some other thread found there was nothing to do very recently, and
  placed itself on the head of the queue in front of this thread.
  It must check again after placing itself there, so it can be deemed to
  be responsible for any pending work, and this thread can go back to
  sleep until woken.

No code ever tests for busy threads any more.  Only each thread itself
cares if it is busy.  So svc_thread_busy() is no longer needed.

Signed-off-by: NeilBrown <neilb@suse.de>
Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
---
 include/linux/sunrpc/svc.h | 11 -----------
 net/sunrpc/svc.c           | 14 ++++++--------
 net/sunrpc/svc_xprt.c      | 38 +++++++++++++++++++++++++-------------
 3 files changed, 31 insertions(+), 32 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sunrpc/svc.h b/include/linux/sunrpc/svc.h
index ad4572630335..dafa362b4fdd 100644
--- a/include/linux/sunrpc/svc.h
+++ b/include/linux/sunrpc/svc.h
@@ -266,17 +266,6 @@ enum {
 	RQ_DATA,		/* request has data */
 };
 
-/**
- * svc_thread_busy - check if a thread as busy
- * @rqstp: the thread which might be busy
- *
- * A thread is only busy when it is not an the idle list.
- */
-static inline bool svc_thread_busy(const struct svc_rqst *rqstp)
-{
-	return !llist_on_list(&rqstp->rq_idle);
-}
-
 #define SVC_NET(rqst) (rqst->rq_xprt ? rqst->rq_xprt->xpt_net : rqst->rq_bc_net)
 
 /*
diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c
index 54ae6a569f6a..326592162af1 100644
--- a/net/sunrpc/svc.c
+++ b/net/sunrpc/svc.c
@@ -642,7 +642,6 @@ svc_rqst_alloc(struct svc_serv *serv, struct svc_pool *pool, int node)
 
 	folio_batch_init(&rqstp->rq_fbatch);
 
-	init_llist_node(&rqstp->rq_idle);
 	rqstp->rq_server = serv;
 	rqstp->rq_pool = pool;
 
@@ -704,17 +703,16 @@ void svc_pool_wake_idle_thread(struct svc_pool *pool)
 	struct llist_node *ln;
 
 	rcu_read_lock();
-	spin_lock_bh(&pool->sp_lock);
-	ln = llist_del_first_init(&pool->sp_idle_threads);
-	spin_unlock_bh(&pool->sp_lock);
+	ln = READ_ONCE(pool->sp_idle_threads.first);
 	if (ln) {
 		rqstp = llist_entry(ln, struct svc_rqst, rq_idle);
-
 		WRITE_ONCE(rqstp->rq_qtime, ktime_get());
-		wake_up_process(rqstp->rq_task);
+		if (!task_is_running(rqstp->rq_task)) {
+			wake_up_process(rqstp->rq_task);
+			trace_svc_wake_up(rqstp->rq_task->pid);
+			percpu_counter_inc(&pool->sp_threads_woken);
+		}
 		rcu_read_unlock();
-		percpu_counter_inc(&pool->sp_threads_woken);
-		trace_svc_wake_up(rqstp->rq_task->pid);
 		return;
 	}
 	rcu_read_unlock();
diff --git a/net/sunrpc/svc_xprt.c b/net/sunrpc/svc_xprt.c
index 1b300a7889eb..75f66714e3a7 100644
--- a/net/sunrpc/svc_xprt.c
+++ b/net/sunrpc/svc_xprt.c
@@ -732,20 +732,19 @@ static void svc_thread_wait_for_work(struct svc_rqst *rqstp)
 	if (svc_thread_should_sleep(rqstp)) {
 		set_current_state(TASK_IDLE | TASK_FREEZABLE);
 		llist_add(&rqstp->rq_idle, &pool->sp_idle_threads);
+		if (likely(svc_thread_should_sleep(rqstp)))
+			schedule();
 
-		if (unlikely(!svc_thread_should_sleep(rqstp)))
-			/* Work just became available.  This thread cannot simply
-			 * choose not to sleep as it *must* wait until removed.
-			 * So wake the first waiter - whether it is this
-			 * thread or some other, it will get the work done.
+		while (!llist_del_first_this(&pool->sp_idle_threads,
+					     &rqstp->rq_idle)) {
+			/* Work just became available.  This thread can only
+			 * handle it after removing rqstp from the idle
+			 * list. If that attempt failed, some other thread
+			 * must have queued itself after finding no
+			 * work to do, so that thread has taken responsibly
+			 * for this new work.  This thread can safely sleep
+			 * until woken again.
 			 */
-			svc_pool_wake_idle_thread(pool);
-
-		/* Since a thread cannot remove itself from an llist,
-		 * schedule until someone else removes @rqstp from
-		 * the idle list.
-		 */
-		while (!svc_thread_busy(rqstp)) {
 			schedule();
 			set_current_state(TASK_IDLE | TASK_FREEZABLE);
 		}
@@ -835,6 +834,15 @@ out:
 	svc_xprt_release(rqstp);
 }
 
+static void svc_thread_wake_next(struct svc_rqst *rqstp)
+{
+	if (!svc_thread_should_sleep(rqstp))
+		/* More work pending after I dequeued some,
+		 * wake another worker
+		 */
+		svc_pool_wake_idle_thread(rqstp->rq_pool);
+}
+
 /**
  * svc_recv - Receive and process the next request on any transport
  * @rqstp: an idle RPC service thread
@@ -854,13 +862,16 @@ void svc_recv(struct svc_rqst *rqstp)
 
 	clear_bit(SP_TASK_PENDING, &pool->sp_flags);
 
-	if (svc_thread_should_stop(rqstp))
+	if (svc_thread_should_stop(rqstp)) {
+		svc_thread_wake_next(rqstp);
 		return;
+	}
 
 	rqstp->rq_xprt = svc_xprt_dequeue(pool);
 	if (rqstp->rq_xprt) {
 		struct svc_xprt *xprt = rqstp->rq_xprt;
 
+		svc_thread_wake_next(rqstp);
 		/* Normally we will wait up to 5 seconds for any required
 		 * cache information to be provided.  When there are no
 		 * idle threads, we reduce the wait time.
@@ -885,6 +896,7 @@ void svc_recv(struct svc_rqst *rqstp)
 		if (req) {
 			list_del(&req->rq_bc_list);
 			spin_unlock_bh(&serv->sv_cb_lock);
+			svc_thread_wake_next(rqstp);
 
 			svc_process_bc(req, rqstp);
 			return;
-- 
cgit v1.2.3


From 9a0e6accc0a8c3adf72f1b43be8019961b68663a Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Mon, 11 Sep 2023 10:40:02 -0400
Subject: SUNRPC: use lwq for sp_sockets - renamed to sp_xprts

lwq avoids using back pointers in lists, and uses less locking.
This introduces a new spinlock, but the other one will be removed in a
future patch.

For svc_clean_up_xprts(), we now dequeue the entire queue, walk it to
remove and process the xprts that need cleaning up, then re-enqueue the
remaining queue.

Signed-off-by: NeilBrown <neilb@suse.de>
Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
---
 include/linux/sunrpc/svc.h      |  3 ++-
 include/linux/sunrpc/svc_xprt.h |  2 +-
 net/sunrpc/svc.c                |  2 +-
 net/sunrpc/svc_xprt.c           | 57 ++++++++++++-----------------------------
 4 files changed, 21 insertions(+), 43 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sunrpc/svc.h b/include/linux/sunrpc/svc.h
index dafa362b4fdd..7ff9fe785e49 100644
--- a/include/linux/sunrpc/svc.h
+++ b/include/linux/sunrpc/svc.h
@@ -17,6 +17,7 @@
 #include <linux/sunrpc/xdr.h>
 #include <linux/sunrpc/auth.h>
 #include <linux/sunrpc/svcauth.h>
+#include <linux/lwq.h>
 #include <linux/wait.h>
 #include <linux/mm.h>
 #include <linux/pagevec.h>
@@ -34,7 +35,7 @@
 struct svc_pool {
 	unsigned int		sp_id;	    	/* pool id; also node id on NUMA */
 	spinlock_t		sp_lock;	/* protects all fields */
-	struct list_head	sp_sockets;	/* pending sockets */
+	struct lwq		sp_xprts;	/* pending transports */
 	unsigned int		sp_nrthreads;	/* # of threads in pool */
 	struct list_head	sp_all_threads;	/* all server threads */
 	struct llist_head	sp_idle_threads; /* idle server threads */
diff --git a/include/linux/sunrpc/svc_xprt.h b/include/linux/sunrpc/svc_xprt.h
index fa55d12dc765..8e20cd60e2e7 100644
--- a/include/linux/sunrpc/svc_xprt.h
+++ b/include/linux/sunrpc/svc_xprt.h
@@ -54,7 +54,7 @@ struct svc_xprt {
 	const struct svc_xprt_ops *xpt_ops;
 	struct kref		xpt_ref;
 	struct list_head	xpt_list;
-	struct list_head	xpt_ready;
+	struct lwq_node		xpt_ready;
 	unsigned long		xpt_flags;
 
 	struct svc_serv		*xpt_server;	/* service for transport */
diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c
index 326592162af1..244b5b9eba4d 100644
--- a/net/sunrpc/svc.c
+++ b/net/sunrpc/svc.c
@@ -508,7 +508,7 @@ __svc_create(struct svc_program *prog, unsigned int bufsize, int npools,
 				i, serv->sv_name);
 
 		pool->sp_id = i;
-		INIT_LIST_HEAD(&pool->sp_sockets);
+		lwq_init(&pool->sp_xprts);
 		INIT_LIST_HEAD(&pool->sp_all_threads);
 		init_llist_head(&pool->sp_idle_threads);
 		spin_lock_init(&pool->sp_lock);
diff --git a/net/sunrpc/svc_xprt.c b/net/sunrpc/svc_xprt.c
index 75f66714e3a7..28ca7db55da1 100644
--- a/net/sunrpc/svc_xprt.c
+++ b/net/sunrpc/svc_xprt.c
@@ -201,7 +201,6 @@ void svc_xprt_init(struct net *net, struct svc_xprt_class *xcl,
 	kref_init(&xprt->xpt_ref);
 	xprt->xpt_server = serv;
 	INIT_LIST_HEAD(&xprt->xpt_list);
-	INIT_LIST_HEAD(&xprt->xpt_ready);
 	INIT_LIST_HEAD(&xprt->xpt_deferred);
 	INIT_LIST_HEAD(&xprt->xpt_users);
 	mutex_init(&xprt->xpt_mutex);
@@ -472,9 +471,7 @@ void svc_xprt_enqueue(struct svc_xprt *xprt)
 	pool = svc_pool_for_cpu(xprt->xpt_server);
 
 	percpu_counter_inc(&pool->sp_sockets_queued);
-	spin_lock_bh(&pool->sp_lock);
-	list_add_tail(&xprt->xpt_ready, &pool->sp_sockets);
-	spin_unlock_bh(&pool->sp_lock);
+	lwq_enqueue(&xprt->xpt_ready, &pool->sp_xprts);
 
 	svc_pool_wake_idle_thread(pool);
 }
@@ -487,18 +484,9 @@ static struct svc_xprt *svc_xprt_dequeue(struct svc_pool *pool)
 {
 	struct svc_xprt	*xprt = NULL;
 
-	if (list_empty(&pool->sp_sockets))
-		goto out;
-
-	spin_lock_bh(&pool->sp_lock);
-	if (likely(!list_empty(&pool->sp_sockets))) {
-		xprt = list_first_entry(&pool->sp_sockets,
-					struct svc_xprt, xpt_ready);
-		list_del_init(&xprt->xpt_ready);
+	xprt = lwq_dequeue(&pool->sp_xprts, struct svc_xprt, xpt_ready);
+	if (xprt)
 		svc_xprt_get(xprt);
-	}
-	spin_unlock_bh(&pool->sp_lock);
-out:
 	return xprt;
 }
 
@@ -708,7 +696,7 @@ svc_thread_should_sleep(struct svc_rqst *rqstp)
 		return false;
 
 	/* was a socket queued? */
-	if (!list_empty(&pool->sp_sockets))
+	if (!lwq_empty(&pool->sp_xprts))
 		return false;
 
 	/* are we shutting down? */
@@ -1050,7 +1038,6 @@ static void svc_delete_xprt(struct svc_xprt *xprt)
 
 	spin_lock_bh(&serv->sv_lock);
 	list_del_init(&xprt->xpt_list);
-	WARN_ON_ONCE(!list_empty(&xprt->xpt_ready));
 	if (test_bit(XPT_TEMP, &xprt->xpt_flags))
 		serv->sv_tmpcnt--;
 	spin_unlock_bh(&serv->sv_lock);
@@ -1101,36 +1088,26 @@ static int svc_close_list(struct svc_serv *serv, struct list_head *xprt_list, st
 	return ret;
 }
 
-static struct svc_xprt *svc_dequeue_net(struct svc_serv *serv, struct net *net)
+static void svc_clean_up_xprts(struct svc_serv *serv, struct net *net)
 {
-	struct svc_pool *pool;
 	struct svc_xprt *xprt;
-	struct svc_xprt *tmp;
 	int i;
 
 	for (i = 0; i < serv->sv_nrpools; i++) {
-		pool = &serv->sv_pools[i];
-
-		spin_lock_bh(&pool->sp_lock);
-		list_for_each_entry_safe(xprt, tmp, &pool->sp_sockets, xpt_ready) {
-			if (xprt->xpt_net != net)
-				continue;
-			list_del_init(&xprt->xpt_ready);
-			spin_unlock_bh(&pool->sp_lock);
-			return xprt;
+		struct svc_pool *pool = &serv->sv_pools[i];
+		struct llist_node *q, **t1, *t2;
+
+		q = lwq_dequeue_all(&pool->sp_xprts);
+		lwq_for_each_safe(xprt, t1, t2, &q, xpt_ready) {
+			if (xprt->xpt_net == net) {
+				set_bit(XPT_CLOSE, &xprt->xpt_flags);
+				svc_delete_xprt(xprt);
+				xprt = NULL;
+			}
 		}
-		spin_unlock_bh(&pool->sp_lock);
-	}
-	return NULL;
-}
 
-static void svc_clean_up_xprts(struct svc_serv *serv, struct net *net)
-{
-	struct svc_xprt *xprt;
-
-	while ((xprt = svc_dequeue_net(serv, net))) {
-		set_bit(XPT_CLOSE, &xprt->xpt_flags);
-		svc_delete_xprt(xprt);
+		if (q)
+			lwq_enqueue_batch(q, &pool->sp_xprts);
 	}
 }
 
-- 
cgit v1.2.3


From 2e8fc923fe476db8cab9b6458027eccb22f3b6e6 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Mon, 11 Sep 2023 10:40:09 -0400
Subject: SUNRPC: change sp_nrthreads to atomic_t

Using an atomic_t avoids the need to take a spinlock (which can soon be
removed).

Choosing a thread to kill needs to be careful as we cannot set the "die
now" bit atomically with the test on the count.  Instead we temporarily
increase the count.

Signed-off-by: NeilBrown <neilb@suse.de>
Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
---
 fs/nfsd/nfssvc.c           | 11 +++++------
 include/linux/sunrpc/svc.h |  2 +-
 net/sunrpc/svc.c           | 37 ++++++++++++++++++++-----------------
 3 files changed, 26 insertions(+), 24 deletions(-)

(limited to 'include/linux')

diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c
index 0b03a2e50dee..433154b9eee0 100644
--- a/fs/nfsd/nfssvc.c
+++ b/fs/nfsd/nfssvc.c
@@ -713,14 +713,13 @@ int nfsd_nrpools(struct net *net)
 
 int nfsd_get_nrthreads(int n, int *nthreads, struct net *net)
 {
-	int i = 0;
 	struct nfsd_net *nn = net_generic(net, nfsd_net_id);
+	struct svc_serv *serv = nn->nfsd_serv;
+	int i;
 
-	if (nn->nfsd_serv != NULL) {
-		for (i = 0; i < nn->nfsd_serv->sv_nrpools && i < n; i++)
-			nthreads[i] = nn->nfsd_serv->sv_pools[i].sp_nrthreads;
-	}
-
+	if (serv)
+		for (i = 0; i < serv->sv_nrpools && i < n; i++)
+			nthreads[i] = atomic_read(&serv->sv_pools[i].sp_nrthreads);
 	return 0;
 }
 
diff --git a/include/linux/sunrpc/svc.h b/include/linux/sunrpc/svc.h
index 7ff9fe785e49..9d0fcd6148ae 100644
--- a/include/linux/sunrpc/svc.h
+++ b/include/linux/sunrpc/svc.h
@@ -36,7 +36,7 @@ struct svc_pool {
 	unsigned int		sp_id;	    	/* pool id; also node id on NUMA */
 	spinlock_t		sp_lock;	/* protects all fields */
 	struct lwq		sp_xprts;	/* pending transports */
-	unsigned int		sp_nrthreads;	/* # of threads in pool */
+	atomic_t		sp_nrthreads;	/* # of threads in pool */
 	struct list_head	sp_all_threads;	/* all server threads */
 	struct llist_head	sp_idle_threads; /* idle server threads */
 
diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c
index 244b5b9eba4d..0928d3f918b0 100644
--- a/net/sunrpc/svc.c
+++ b/net/sunrpc/svc.c
@@ -681,8 +681,8 @@ svc_prepare_thread(struct svc_serv *serv, struct svc_pool *pool, int node)
 	serv->sv_nrthreads += 1;
 	spin_unlock_bh(&serv->sv_lock);
 
+	atomic_inc(&pool->sp_nrthreads);
 	spin_lock_bh(&pool->sp_lock);
-	pool->sp_nrthreads++;
 	list_add_rcu(&rqstp->rq_all, &pool->sp_all_threads);
 	spin_unlock_bh(&pool->sp_lock);
 	return rqstp;
@@ -727,23 +727,24 @@ svc_pool_next(struct svc_serv *serv, struct svc_pool *pool, unsigned int *state)
 }
 
 static struct svc_pool *
-svc_pool_victim(struct svc_serv *serv, struct svc_pool *pool, unsigned int *state)
+svc_pool_victim(struct svc_serv *serv, struct svc_pool *target_pool,
+		unsigned int *state)
 {
+	struct svc_pool *pool;
 	unsigned int i;
 
+retry:
+	pool = target_pool;
+
 	if (pool != NULL) {
-		spin_lock_bh(&pool->sp_lock);
-		if (pool->sp_nrthreads)
+		if (atomic_inc_not_zero(&pool->sp_nrthreads))
 			goto found_pool;
-		spin_unlock_bh(&pool->sp_lock);
 		return NULL;
 	} else {
 		for (i = 0; i < serv->sv_nrpools; i++) {
 			pool = &serv->sv_pools[--(*state) % serv->sv_nrpools];
-			spin_lock_bh(&pool->sp_lock);
-			if (pool->sp_nrthreads)
+			if (atomic_inc_not_zero(&pool->sp_nrthreads))
 				goto found_pool;
-			spin_unlock_bh(&pool->sp_lock);
 		}
 		return NULL;
 	}
@@ -751,8 +752,12 @@ svc_pool_victim(struct svc_serv *serv, struct svc_pool *pool, unsigned int *stat
 found_pool:
 	set_bit(SP_VICTIM_REMAINS, &pool->sp_flags);
 	set_bit(SP_NEED_VICTIM, &pool->sp_flags);
-	spin_unlock_bh(&pool->sp_lock);
-	return pool;
+	if (!atomic_dec_and_test(&pool->sp_nrthreads))
+		return pool;
+	/* Nothing left in this pool any more */
+	clear_bit(SP_NEED_VICTIM, &pool->sp_flags);
+	clear_bit(SP_VICTIM_REMAINS, &pool->sp_flags);
+	goto retry;
 }
 
 static int
@@ -828,13 +833,10 @@ svc_stop_kthreads(struct svc_serv *serv, struct svc_pool *pool, int nrservs)
 int
 svc_set_num_threads(struct svc_serv *serv, struct svc_pool *pool, int nrservs)
 {
-	if (pool == NULL) {
+	if (!pool)
 		nrservs -= serv->sv_nrthreads;
-	} else {
-		spin_lock_bh(&pool->sp_lock);
-		nrservs -= pool->sp_nrthreads;
-		spin_unlock_bh(&pool->sp_lock);
-	}
+	else
+		nrservs -= atomic_read(&pool->sp_nrthreads);
 
 	if (nrservs > 0)
 		return svc_start_kthreads(serv, pool, nrservs);
@@ -921,10 +923,11 @@ svc_exit_thread(struct svc_rqst *rqstp)
 	struct svc_pool	*pool = rqstp->rq_pool;
 
 	spin_lock_bh(&pool->sp_lock);
-	pool->sp_nrthreads--;
 	list_del_rcu(&rqstp->rq_all);
 	spin_unlock_bh(&pool->sp_lock);
 
+	atomic_dec(&pool->sp_nrthreads);
+
 	spin_lock_bh(&serv->sv_lock);
 	serv->sv_nrthreads -= 1;
 	spin_unlock_bh(&serv->sv_lock);
-- 
cgit v1.2.3


From 580a25756a9f639180b29a508f3bdd24c50a936a Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Mon, 11 Sep 2023 10:40:15 -0400
Subject: SUNRPC: discard sp_lock

sp_lock is now only used to protect sp_all_threads.  This isn't needed
as sp_all_threads is only manipulated through svc_set_num_threads(),
which is already serialized.  Read-acccess only requires rcu_read_lock().
So no more locking is needed.

Signed-off-by: NeilBrown <neilb@suse.de>
Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
---
 include/linux/sunrpc/svc.h |  1 -
 net/sunrpc/svc.c           | 10 +++++-----
 2 files changed, 5 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sunrpc/svc.h b/include/linux/sunrpc/svc.h
index 9d0fcd6148ae..8ce1392c1a35 100644
--- a/include/linux/sunrpc/svc.h
+++ b/include/linux/sunrpc/svc.h
@@ -34,7 +34,6 @@
  */
 struct svc_pool {
 	unsigned int		sp_id;	    	/* pool id; also node id on NUMA */
-	spinlock_t		sp_lock;	/* protects all fields */
 	struct lwq		sp_xprts;	/* pending transports */
 	atomic_t		sp_nrthreads;	/* # of threads in pool */
 	struct list_head	sp_all_threads;	/* all server threads */
diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c
index 0928d3f918b0..efe7a58ccbdc 100644
--- a/net/sunrpc/svc.c
+++ b/net/sunrpc/svc.c
@@ -511,7 +511,6 @@ __svc_create(struct svc_program *prog, unsigned int bufsize, int npools,
 		lwq_init(&pool->sp_xprts);
 		INIT_LIST_HEAD(&pool->sp_all_threads);
 		init_llist_head(&pool->sp_idle_threads);
-		spin_lock_init(&pool->sp_lock);
 
 		percpu_counter_init(&pool->sp_messages_arrived, 0, GFP_KERNEL);
 		percpu_counter_init(&pool->sp_sockets_queued, 0, GFP_KERNEL);
@@ -682,9 +681,12 @@ svc_prepare_thread(struct svc_serv *serv, struct svc_pool *pool, int node)
 	spin_unlock_bh(&serv->sv_lock);
 
 	atomic_inc(&pool->sp_nrthreads);
-	spin_lock_bh(&pool->sp_lock);
+
+	/* Protected by whatever lock the service uses when calling
+	 * svc_set_num_threads()
+	 */
 	list_add_rcu(&rqstp->rq_all, &pool->sp_all_threads);
-	spin_unlock_bh(&pool->sp_lock);
+
 	return rqstp;
 }
 
@@ -922,9 +924,7 @@ svc_exit_thread(struct svc_rqst *rqstp)
 	struct svc_serv	*serv = rqstp->rq_server;
 	struct svc_pool	*pool = rqstp->rq_pool;
 
-	spin_lock_bh(&pool->sp_lock);
 	list_del_rcu(&rqstp->rq_all);
-	spin_unlock_bh(&pool->sp_lock);
 
 	atomic_dec(&pool->sp_nrthreads);
 
-- 
cgit v1.2.3


From 15d39883ee7dfc023d8a24f5d4b58100e1d04ad9 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Mon, 11 Sep 2023 10:40:22 -0400
Subject: SUNRPC: change the back-channel queue to lwq

This removes the need to store and update back-links in the list.
It also remove the need for the _bh version of spin_lock().

Signed-off-by: NeilBrown <neilb@suse.de>
Cc: Trond Myklebust <trond.myklebust@hammerspace.com>
Cc: Anna Schumaker <Anna.Schumaker@Netapp.com>
Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
---
 include/linux/sunrpc/svc.h        |  3 +--
 include/linux/sunrpc/xprt.h       |  3 ++-
 net/sunrpc/backchannel_rqst.c     |  5 +----
 net/sunrpc/svc.c                  |  3 +--
 net/sunrpc/svc_xprt.c             | 12 +++---------
 net/sunrpc/xprtrdma/backchannel.c |  4 +---
 6 files changed, 9 insertions(+), 21 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sunrpc/svc.h b/include/linux/sunrpc/svc.h
index 8ce1392c1a35..c1feaf0d1542 100644
--- a/include/linux/sunrpc/svc.h
+++ b/include/linux/sunrpc/svc.h
@@ -90,10 +90,9 @@ struct svc_serv {
 	int			(*sv_threadfn)(void *data);
 
 #if defined(CONFIG_SUNRPC_BACKCHANNEL)
-	struct list_head	sv_cb_list;	/* queue for callback requests
+	struct lwq		sv_cb_list;	/* queue for callback requests
 						 * that arrive over the same
 						 * connection */
-	spinlock_t		sv_cb_lock;	/* protects the svc_cb_list */
 	bool			sv_bc_enabled;	/* service uses backchannel */
 #endif /* CONFIG_SUNRPC_BACKCHANNEL */
 };
diff --git a/include/linux/sunrpc/xprt.h b/include/linux/sunrpc/xprt.h
index 4ecc89301eb7..f85d3a0daca2 100644
--- a/include/linux/sunrpc/xprt.h
+++ b/include/linux/sunrpc/xprt.h
@@ -57,6 +57,7 @@ struct xprt_class;
 struct seq_file;
 struct svc_serv;
 struct net;
+#include <linux/lwq.h>
 
 /*
  * This describes a complete RPC request
@@ -121,7 +122,7 @@ struct rpc_rqst {
 	int			rq_ntrans;
 
 #if defined(CONFIG_SUNRPC_BACKCHANNEL)
-	struct list_head	rq_bc_list;	/* Callback service list */
+	struct lwq_node		rq_bc_list;	/* Callback service list */
 	unsigned long		rq_bc_pa_state;	/* Backchannel prealloc state */
 	struct list_head	rq_bc_pa_list;	/* Backchannel prealloc list */
 #endif /* CONFIG_SUNRPC_BACKCHANEL */
diff --git a/net/sunrpc/backchannel_rqst.c b/net/sunrpc/backchannel_rqst.c
index 44b7c89a635f..caa94cf57123 100644
--- a/net/sunrpc/backchannel_rqst.c
+++ b/net/sunrpc/backchannel_rqst.c
@@ -83,7 +83,6 @@ static struct rpc_rqst *xprt_alloc_bc_req(struct rpc_xprt *xprt)
 		return NULL;
 
 	req->rq_xprt = xprt;
-	INIT_LIST_HEAD(&req->rq_bc_list);
 
 	/* Preallocate one XDR receive buffer */
 	if (xprt_alloc_xdr_buf(&req->rq_rcv_buf, gfp_flags) < 0) {
@@ -367,8 +366,6 @@ void xprt_complete_bc_request(struct rpc_rqst *req, uint32_t copied)
 
 	dprintk("RPC:       add callback request to list\n");
 	xprt_get(xprt);
-	spin_lock(&bc_serv->sv_cb_lock);
-	list_add(&req->rq_bc_list, &bc_serv->sv_cb_list);
-	spin_unlock(&bc_serv->sv_cb_lock);
+	lwq_enqueue(&req->rq_bc_list, &bc_serv->sv_cb_list);
 	svc_pool_wake_idle_thread(&bc_serv->sv_pools[0]);
 }
diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c
index efe7a58ccbdc..7d4c10e609dc 100644
--- a/net/sunrpc/svc.c
+++ b/net/sunrpc/svc.c
@@ -438,8 +438,7 @@ EXPORT_SYMBOL_GPL(svc_bind);
 static void
 __svc_init_bc(struct svc_serv *serv)
 {
-	INIT_LIST_HEAD(&serv->sv_cb_list);
-	spin_lock_init(&serv->sv_cb_lock);
+	lwq_init(&serv->sv_cb_list);
 }
 #else
 static void
diff --git a/net/sunrpc/svc_xprt.c b/net/sunrpc/svc_xprt.c
index 28ca7db55da1..fee83d1024bc 100644
--- a/net/sunrpc/svc_xprt.c
+++ b/net/sunrpc/svc_xprt.c
@@ -705,7 +705,7 @@ svc_thread_should_sleep(struct svc_rqst *rqstp)
 
 #if defined(CONFIG_SUNRPC_BACKCHANNEL)
 	if (svc_is_backchannel(rqstp)) {
-		if (!list_empty(&rqstp->rq_server->sv_cb_list))
+		if (!lwq_empty(&rqstp->rq_server->sv_cb_list))
 			return false;
 	}
 #endif
@@ -878,18 +878,12 @@ void svc_recv(struct svc_rqst *rqstp)
 		struct svc_serv *serv = rqstp->rq_server;
 		struct rpc_rqst *req;
 
-		spin_lock_bh(&serv->sv_cb_lock);
-		req = list_first_entry_or_null(&serv->sv_cb_list,
-					       struct rpc_rqst, rq_bc_list);
+		req = lwq_dequeue(&serv->sv_cb_list,
+				  struct rpc_rqst, rq_bc_list);
 		if (req) {
-			list_del(&req->rq_bc_list);
-			spin_unlock_bh(&serv->sv_cb_lock);
 			svc_thread_wake_next(rqstp);
-
 			svc_process_bc(req, rqstp);
-			return;
 		}
-		spin_unlock_bh(&serv->sv_cb_lock);
 	}
 #endif
 }
diff --git a/net/sunrpc/xprtrdma/backchannel.c b/net/sunrpc/xprtrdma/backchannel.c
index bfc434ec52a7..8c817e755262 100644
--- a/net/sunrpc/xprtrdma/backchannel.c
+++ b/net/sunrpc/xprtrdma/backchannel.c
@@ -263,9 +263,7 @@ void rpcrdma_bc_receive_call(struct rpcrdma_xprt *r_xprt,
 	/* Queue rqst for ULP's callback service */
 	bc_serv = xprt->bc_serv;
 	xprt_get(xprt);
-	spin_lock(&bc_serv->sv_cb_lock);
-	list_add(&rqst->rq_bc_list, &bc_serv->sv_cb_list);
-	spin_unlock(&bc_serv->sv_cb_lock);
+	lwq_enqueue(&rqst->rq_bc_list, &bc_serv->sv_cb_list);
 
 	svc_pool_wake_idle_thread(&bc_serv->sv_pools[0]);
 
-- 
cgit v1.2.3


From bd9d6a3efa9709e653aafbeb859289feccb8e70c Mon Sep 17 00:00:00 2001
From: Lorenzo Bianconi <lorenzo@kernel.org>
Date: Mon, 11 Sep 2023 14:49:46 +0200
Subject: NFSD: add rpc_status netlink support

Introduce rpc_status netlink support for NFSD in order to dump pending
RPC requests debugging information from userspace.

Closes: https://bugzilla.linux-nfs.org/show_bug.cgi?id=366
Tested-by: Jeff Layton <jlayton@kernel.org>
Signed-off-by: Lorenzo Bianconi <lorenzo@kernel.org>
Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
---
 fs/nfsd/nfsctl.c           | 188 ++++++++++++++++++++++++++++++++++++++++++++-
 fs/nfsd/nfsd.h             |  17 ++++
 fs/nfsd/nfssvc.c           |  15 ++++
 fs/nfsd/state.h            |   2 -
 include/linux/sunrpc/svc.h |   1 +
 5 files changed, 220 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c
index 364ccd0be1c1..739ed5bf71cd 100644
--- a/fs/nfsd/nfsctl.c
+++ b/fs/nfsd/nfsctl.c
@@ -1496,19 +1496,200 @@ static int create_proc_exports_entry(void)
 
 unsigned int nfsd_net_id;
 
+/**
+ * nfsd_nl_rpc_status_get_start - Prepare rpc_status_get dumpit
+ * @cb: netlink metadata and command arguments
+ *
+ * Return values:
+ *   %0: The rpc_status_get command may proceed
+ *   %-ENODEV: There is no NFSD running in this namespace
+ */
 int nfsd_nl_rpc_status_get_start(struct netlink_callback *cb)
 {
+	struct nfsd_net *nn = net_generic(sock_net(cb->skb->sk), nfsd_net_id);
+	int ret = -ENODEV;
+
+	mutex_lock(&nfsd_mutex);
+	if (nn->nfsd_serv) {
+		svc_get(nn->nfsd_serv);
+		ret = 0;
+	}
+	mutex_unlock(&nfsd_mutex);
+
+	return ret;
+}
+
+static int nfsd_genl_rpc_status_compose_msg(struct sk_buff *skb,
+					    struct netlink_callback *cb,
+					    struct nfsd_genl_rqstp *rqstp)
+{
+	void *hdr;
+	u32 i;
+
+	hdr = genlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq,
+			  &nfsd_nl_family, 0, NFSD_CMD_RPC_STATUS_GET);
+	if (!hdr)
+		return -ENOBUFS;
+
+	if (nla_put_be32(skb, NFSD_A_RPC_STATUS_XID, rqstp->rq_xid) ||
+	    nla_put_u32(skb, NFSD_A_RPC_STATUS_FLAGS, rqstp->rq_flags) ||
+	    nla_put_u32(skb, NFSD_A_RPC_STATUS_PROG, rqstp->rq_prog) ||
+	    nla_put_u32(skb, NFSD_A_RPC_STATUS_PROC, rqstp->rq_proc) ||
+	    nla_put_u8(skb, NFSD_A_RPC_STATUS_VERSION, rqstp->rq_vers) ||
+	    nla_put_s64(skb, NFSD_A_RPC_STATUS_SERVICE_TIME,
+			ktime_to_us(rqstp->rq_stime),
+			NFSD_A_RPC_STATUS_PAD))
+		return -ENOBUFS;
+
+	switch (rqstp->rq_saddr.sa_family) {
+	case AF_INET: {
+		const struct sockaddr_in *s_in, *d_in;
+
+		s_in = (const struct sockaddr_in *)&rqstp->rq_saddr;
+		d_in = (const struct sockaddr_in *)&rqstp->rq_daddr;
+		if (nla_put_in_addr(skb, NFSD_A_RPC_STATUS_SADDR4,
+				    s_in->sin_addr.s_addr) ||
+		    nla_put_in_addr(skb, NFSD_A_RPC_STATUS_DADDR4,
+				    d_in->sin_addr.s_addr) ||
+		    nla_put_be16(skb, NFSD_A_RPC_STATUS_SPORT,
+				 s_in->sin_port) ||
+		    nla_put_be16(skb, NFSD_A_RPC_STATUS_DPORT,
+				 d_in->sin_port))
+			return -ENOBUFS;
+		break;
+	}
+	case AF_INET6: {
+		const struct sockaddr_in6 *s_in, *d_in;
+
+		s_in = (const struct sockaddr_in6 *)&rqstp->rq_saddr;
+		d_in = (const struct sockaddr_in6 *)&rqstp->rq_daddr;
+		if (nla_put_in6_addr(skb, NFSD_A_RPC_STATUS_SADDR6,
+				     &s_in->sin6_addr) ||
+		    nla_put_in6_addr(skb, NFSD_A_RPC_STATUS_DADDR6,
+				     &d_in->sin6_addr) ||
+		    nla_put_be16(skb, NFSD_A_RPC_STATUS_SPORT,
+				 s_in->sin6_port) ||
+		    nla_put_be16(skb, NFSD_A_RPC_STATUS_DPORT,
+				 d_in->sin6_port))
+			return -ENOBUFS;
+		break;
+	}
+	}
+
+	for (i = 0; i < rqstp->rq_opcnt; i++)
+		if (nla_put_u32(skb, NFSD_A_RPC_STATUS_COMPOUND_OPS,
+				rqstp->rq_opnum[i]))
+			return -ENOBUFS;
+
+	genlmsg_end(skb, hdr);
 	return 0;
 }
 
+/**
+ * nfsd_nl_rpc_status_get_dumpit - Handle rpc_status_get dumpit
+ * @skb: reply buffer
+ * @cb: netlink metadata and command arguments
+ *
+ * Returns the size of the reply or a negative errno.
+ */
 int nfsd_nl_rpc_status_get_dumpit(struct sk_buff *skb,
 				  struct netlink_callback *cb)
 {
-	return 0;
+	struct nfsd_net *nn = net_generic(sock_net(skb->sk), nfsd_net_id);
+	int i, ret, rqstp_index = 0;
+
+	rcu_read_lock();
+
+	for (i = 0; i < nn->nfsd_serv->sv_nrpools; i++) {
+		struct svc_rqst *rqstp;
+
+		if (i < cb->args[0]) /* already consumed */
+			continue;
+
+		rqstp_index = 0;
+		list_for_each_entry_rcu(rqstp,
+				&nn->nfsd_serv->sv_pools[i].sp_all_threads,
+				rq_all) {
+			struct nfsd_genl_rqstp genl_rqstp;
+			unsigned int status_counter;
+
+			if (rqstp_index++ < cb->args[1]) /* already consumed */
+				continue;
+			/*
+			 * Acquire rq_status_counter before parsing the rqst
+			 * fields. rq_status_counter is set to an odd value in
+			 * order to notify the consumers the rqstp fields are
+			 * meaningful.
+			 */
+			status_counter =
+				smp_load_acquire(&rqstp->rq_status_counter);
+			if (!(status_counter & 1))
+				continue;
+
+			genl_rqstp.rq_xid = rqstp->rq_xid;
+			genl_rqstp.rq_flags = rqstp->rq_flags;
+			genl_rqstp.rq_vers = rqstp->rq_vers;
+			genl_rqstp.rq_prog = rqstp->rq_prog;
+			genl_rqstp.rq_proc = rqstp->rq_proc;
+			genl_rqstp.rq_stime = rqstp->rq_stime;
+			genl_rqstp.rq_opcnt = 0;
+			memcpy(&genl_rqstp.rq_daddr, svc_daddr(rqstp),
+			       sizeof(struct sockaddr));
+			memcpy(&genl_rqstp.rq_saddr, svc_addr(rqstp),
+			       sizeof(struct sockaddr));
+
+#ifdef CONFIG_NFSD_V4
+			if (rqstp->rq_vers == NFS4_VERSION &&
+			    rqstp->rq_proc == NFSPROC4_COMPOUND) {
+				/* NFSv4 compound */
+				struct nfsd4_compoundargs *args;
+				int j;
+
+				args = rqstp->rq_argp;
+				genl_rqstp.rq_opcnt = args->opcnt;
+				for (j = 0; j < genl_rqstp.rq_opcnt; j++)
+					genl_rqstp.rq_opnum[j] =
+						args->ops[j].opnum;
+			}
+#endif /* CONFIG_NFSD_V4 */
+
+			/*
+			 * Acquire rq_status_counter before reporting the rqst
+			 * fields to the user.
+			 */
+			if (smp_load_acquire(&rqstp->rq_status_counter) !=
+			    status_counter)
+				continue;
+
+			ret = nfsd_genl_rpc_status_compose_msg(skb, cb,
+							       &genl_rqstp);
+			if (ret)
+				goto out;
+		}
+	}
+
+	cb->args[0] = i;
+	cb->args[1] = rqstp_index;
+	ret = skb->len;
+out:
+	rcu_read_unlock();
+
+	return ret;
 }
 
+/**
+ * nfsd_nl_rpc_status_get_done - rpc_status_get dumpit post-processing
+ * @cb: netlink metadata and command arguments
+ *
+ * Return values:
+ *   %0: Success
+ */
 int nfsd_nl_rpc_status_get_done(struct netlink_callback *cb)
 {
+	mutex_lock(&nfsd_mutex);
+	nfsd_put(sock_net(cb->skb->sk));
+	mutex_unlock(&nfsd_mutex);
+
 	return 0;
 }
 
@@ -1606,6 +1787,10 @@ static int __init init_nfsd(void)
 	retval = register_filesystem(&nfsd_fs_type);
 	if (retval)
 		goto out_free_all;
+	retval = genl_register_family(&nfsd_nl_family);
+	if (retval)
+		goto out_free_all;
+
 	return 0;
 out_free_all:
 	nfsd4_destroy_laundry_wq();
@@ -1630,6 +1815,7 @@ out_free_slabs:
 
 static void __exit exit_nfsd(void)
 {
+	genl_unregister_family(&nfsd_nl_family);
 	unregister_filesystem(&nfsd_fs_type);
 	nfsd4_destroy_laundry_wq();
 	unregister_cld_notifier();
diff --git a/fs/nfsd/nfsd.h b/fs/nfsd/nfsd.h
index 11c14faa6c67..f5ff42f41ee7 100644
--- a/fs/nfsd/nfsd.h
+++ b/fs/nfsd/nfsd.h
@@ -62,6 +62,23 @@ struct readdir_cd {
 	__be32			err;	/* 0, nfserr, or nfserr_eof */
 };
 
+/* Maximum number of operations per session compound */
+#define NFSD_MAX_OPS_PER_COMPOUND	50
+
+struct nfsd_genl_rqstp {
+	struct sockaddr		rq_daddr;
+	struct sockaddr		rq_saddr;
+	unsigned long		rq_flags;
+	ktime_t			rq_stime;
+	__be32			rq_xid;
+	u32			rq_vers;
+	u32			rq_prog;
+	u32			rq_proc;
+
+	/* NFSv4 compound */
+	u32			rq_opcnt;
+	u32			rq_opnum[NFSD_MAX_OPS_PER_COMPOUND];
+};
 
 extern struct svc_program	nfsd_program;
 extern const struct svc_version	nfsd_version2, nfsd_version3, nfsd_version4;
diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c
index 433154b9eee0..c5890cdfe97b 100644
--- a/fs/nfsd/nfssvc.c
+++ b/fs/nfsd/nfssvc.c
@@ -997,6 +997,15 @@ int nfsd_dispatch(struct svc_rqst *rqstp)
 	if (!proc->pc_decode(rqstp, &rqstp->rq_arg_stream))
 		goto out_decode_err;
 
+	/*
+	 * Release rq_status_counter setting it to an odd value after the rpc
+	 * request has been properly parsed. rq_status_counter is used to
+	 * notify the consumers if the rqstp fields are stable
+	 * (rq_status_counter is odd) or not meaningful (rq_status_counter
+	 * is even).
+	 */
+	smp_store_release(&rqstp->rq_status_counter, rqstp->rq_status_counter | 1);
+
 	rp = NULL;
 	switch (nfsd_cache_lookup(rqstp, &rp)) {
 	case RC_DOIT:
@@ -1014,6 +1023,12 @@ int nfsd_dispatch(struct svc_rqst *rqstp)
 	if (!proc->pc_encode(rqstp, &rqstp->rq_res_stream))
 		goto out_encode_err;
 
+	/*
+	 * Release rq_status_counter setting it to an even value after the rpc
+	 * request has been properly processed.
+	 */
+	smp_store_release(&rqstp->rq_status_counter, rqstp->rq_status_counter + 1);
+
 	nfsd_cache_update(rqstp, rp, rqstp->rq_cachetype, statp + 1);
 out_cached_reply:
 	return 1;
diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h
index 6bbb1d0276b4..f96eaa8e9413 100644
--- a/fs/nfsd/state.h
+++ b/fs/nfsd/state.h
@@ -195,8 +195,6 @@ static inline struct nfs4_delegation *delegstateid(struct nfs4_stid *s)
 
 /* Maximum number of slots per session. 160 is useful for long haul TCP */
 #define NFSD_MAX_SLOTS_PER_SESSION     160
-/* Maximum number of operations per session compound */
-#define NFSD_MAX_OPS_PER_COMPOUND	50
 /* Maximum  session per slot cache size */
 #define NFSD_SLOT_CACHE_SIZE		2048
 /* Maximum number of NFSD_SLOT_CACHE_SIZE slots per session */
diff --git a/include/linux/sunrpc/svc.h b/include/linux/sunrpc/svc.h
index c1feaf0d1542..b10f987509cc 100644
--- a/include/linux/sunrpc/svc.h
+++ b/include/linux/sunrpc/svc.h
@@ -251,6 +251,7 @@ struct svc_rqst {
 						 * net namespace
 						 */
 	void **			rq_lease_breaker; /* The v4 client breaking a lease */
+	unsigned int		rq_status_counter; /* RPC processing counter */
 };
 
 /* bits for rq_flags */
-- 
cgit v1.2.3


From 2929ba9b460c13625cb47ed4e2de134f966bdf17 Mon Sep 17 00:00:00 2001
From: Kinglong Mee <kinglongmee@gmail.com>
Date: Mon, 18 Sep 2023 14:59:06 +0800
Subject: nfs: fix the typo of rfc number about xattr in NFSv4

Signed-off-by: Kinglong Mee <kinglongmee@gmail.com>
Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
---
 include/linux/nfs4.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/nfs4.h b/include/linux/nfs4.h
index 730003c4f4af..b6fa923a3111 100644
--- a/include/linux/nfs4.h
+++ b/include/linux/nfs4.h
@@ -150,7 +150,7 @@ enum nfs_opnum4 {
 	OP_WRITE_SAME = 70,
 	OP_CLONE = 71,
 
-	/* xattr support (RFC8726) */
+	/* xattr support (RFC8276) */
 	OP_GETXATTR                = 72,
 	OP_SETXATTR                = 73,
 	OP_LISTXATTRS              = 74,
-- 
cgit v1.2.3


From 263453d9bb46ad42f03a0f86aafe580f1b0e9291 Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Mon, 18 Sep 2023 09:57:51 -0400
Subject: NFSD: Add nfsd4_encode_fattr4_change()

Refactor the encoder for FATTR4_CHANGE into a helper. In a
subsequent patch, this helper will be called from a bitmask loop.

The code is restructured a bit to use the modern xdr_stream flow,
and the encoded cinfo value is made const so that callers of the
encoders can be passed a const cinfo.

Reviewed-by: Jeff Layton <jlayton@kernel.org>
Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
---
 fs/nfsd/nfs4xdr.c        | 56 ++++++++++++++++++++++++++++--------------------
 fs/nfsd/nfsfh.c          |  2 +-
 fs/nfsd/nfsfh.h          |  3 ++-
 fs/nfsd/xdr4.h           |  2 ++
 include/linux/iversion.h |  2 +-
 5 files changed, 39 insertions(+), 26 deletions(-)

(limited to 'include/linux')

diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
index 622f74b36d5e..c557d8ac0d38 100644
--- a/fs/nfsd/nfs4xdr.c
+++ b/fs/nfsd/nfs4xdr.c
@@ -2530,17 +2530,6 @@ nfsd4_decode_compound(struct nfsd4_compoundargs *argp)
 	return true;
 }
 
-static __be32 *encode_change(__be32 *p, struct kstat *stat, struct inode *inode,
-			     struct svc_export *exp)
-{
-	if (exp->ex_flags & NFSEXP_V4ROOT) {
-		*p++ = cpu_to_be32(convert_to_wallclock(exp->cd->flush_time));
-		*p++ = 0;
-	} else
-		p = xdr_encode_hyper(p, nfsd4_change_attribute(stat, inode));
-	return p;
-}
-
 static __be32 nfsd4_encode_nfstime4(struct xdr_stream *xdr,
 				    struct timespec64 *tv)
 {
@@ -2581,15 +2570,17 @@ static __be32 *encode_time_delta(__be32 *p, struct inode *inode)
 }
 
 static __be32
-nfsd4_encode_change_info4(struct xdr_stream *xdr, struct nfsd4_change_info *c)
+nfsd4_encode_change_info4(struct xdr_stream *xdr, const struct nfsd4_change_info *c)
 {
-	if (xdr_stream_encode_bool(xdr, c->atomic) < 0)
-		return nfserr_resource;
-	if (xdr_stream_encode_u64(xdr, c->before_change) < 0)
-		return nfserr_resource;
-	if (xdr_stream_encode_u64(xdr, c->after_change) < 0)
-		return nfserr_resource;
-	return nfs_ok;
+	__be32 status;
+
+	status = nfsd4_encode_bool(xdr, c->atomic);
+	if (status != nfs_ok)
+		return status;
+	status = nfsd4_encode_changeid4(xdr, c->before_change);
+	if (status != nfs_ok)
+		return status;
+	return nfsd4_encode_changeid4(xdr, c->after_change);
 }
 
 /* Encode as an array of strings the string given with components
@@ -3014,6 +3005,26 @@ static __be32 nfsd4_encode_fattr4_fh_expire_type(struct xdr_stream *xdr,
 	return nfsd4_encode_uint32_t(xdr, mask);
 }
 
+static __be32 nfsd4_encode_fattr4_change(struct xdr_stream *xdr,
+					 const struct nfsd4_fattr_args *args)
+{
+	const struct svc_export *exp = args->exp;
+	u64 c;
+
+	if (unlikely(exp->ex_flags & NFSEXP_V4ROOT)) {
+		u32 flush_time = convert_to_wallclock(exp->cd->flush_time);
+
+		if (xdr_stream_encode_u32(xdr, flush_time) != XDR_UNIT)
+			return nfserr_resource;
+		if (xdr_stream_encode_u32(xdr, 0) != XDR_UNIT)
+			return nfserr_resource;
+		return nfs_ok;
+	}
+
+	c = nfsd4_change_attribute(&args->stat, d_inode(args->dentry));
+	return nfsd4_encode_changeid4(xdr, c);
+}
+
 /*
  * Note: @fhp can be NULL; in this case, we might have to compose the filehandle
  * ourselves.
@@ -3159,10 +3170,9 @@ nfsd4_encode_fattr(struct xdr_stream *xdr, struct svc_fh *fhp,
 			goto out;
 	}
 	if (bmval0 & FATTR4_WORD0_CHANGE) {
-		p = xdr_reserve_space(xdr, 8);
-		if (!p)
-			goto out_resource;
-		p = encode_change(p, &args.stat, d_inode(dentry), exp);
+		status = nfsd4_encode_fattr4_change(xdr, &args);
+		if (status != nfs_ok)
+			goto out;
 	}
 	if (bmval0 & FATTR4_WORD0_SIZE) {
 		p = xdr_reserve_space(xdr, 8);
diff --git a/fs/nfsd/nfsfh.c b/fs/nfsd/nfsfh.c
index 355bf0db3235..dbfa0ac13564 100644
--- a/fs/nfsd/nfsfh.c
+++ b/fs/nfsd/nfsfh.c
@@ -771,7 +771,7 @@ enum fsid_source fsid_source(const struct svc_fh *fhp)
  * assume that the new change attr is always logged to stable storage in some
  * fashion before the results can be seen.
  */
-u64 nfsd4_change_attribute(struct kstat *stat, struct inode *inode)
+u64 nfsd4_change_attribute(const struct kstat *stat, const struct inode *inode)
 {
 	u64 chattr;
 
diff --git a/fs/nfsd/nfsfh.h b/fs/nfsd/nfsfh.h
index 40426f899e76..6ebdf7ea27bf 100644
--- a/fs/nfsd/nfsfh.h
+++ b/fs/nfsd/nfsfh.h
@@ -293,7 +293,8 @@ static inline void fh_clear_pre_post_attrs(struct svc_fh *fhp)
 	fhp->fh_pre_saved = false;
 }
 
-u64 nfsd4_change_attribute(struct kstat *stat, struct inode *inode);
+u64 nfsd4_change_attribute(const struct kstat *stat,
+			   const struct inode *inode);
 __be32 __must_check fh_fill_pre_attrs(struct svc_fh *fhp);
 __be32 fh_fill_post_attrs(struct svc_fh *fhp);
 __be32 __must_check fh_fill_both_attrs(struct svc_fh *fhp);
diff --git a/fs/nfsd/xdr4.h b/fs/nfsd/xdr4.h
index 5c3eb3691f8b..d6059b0549f5 100644
--- a/fs/nfsd/xdr4.h
+++ b/fs/nfsd/xdr4.h
@@ -110,6 +110,8 @@ nfsd4_encode_uint64_t(struct xdr_stream *xdr, u64 val)
 	return nfs_ok;
 }
 
+#define nfsd4_encode_changeid4(x, v)	nfsd4_encode_uint64_t(x, v)
+
 /**
  * nfsd4_encode_opaque_fixed - Encode a fixed-length XDR opaque type result
  * @xdr: target XDR stream
diff --git a/include/linux/iversion.h b/include/linux/iversion.h
index f174ff1b59ee..8f972eaca2ed 100644
--- a/include/linux/iversion.h
+++ b/include/linux/iversion.h
@@ -256,7 +256,7 @@ inode_peek_iversion(const struct inode *inode)
  * For filesystems without any sort of change attribute, the best we can
  * do is fake one up from the ctime:
  */
-static inline u64 time_to_chattr(struct timespec64 *t)
+static inline u64 time_to_chattr(const struct timespec64 *t)
 {
 	u64 chattr = t->tv_sec;
 
-- 
cgit v1.2.3


From be46e695fb95c8acfb342f3182595aaad45301d5 Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Mon, 18 Sep 2023 10:02:06 -0400
Subject: NFSD: Copy FATTR4 bit number definitions from RFCs

I'd like to convert nfsd4_encode_fattr() to rotate through the
attrmask using for_each_bit() instead of explicitly testing the
bitmask for each bit value. This means I need the bit numbers, as
defined in the specs, instead of our internal bitmask constants.

As a clean up, use the new spec-derived values to define the WORD#
bitmask constants.

Reviewed-by: Jeff Layton <jlayton@kernel.org>
Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
---
 include/linux/nfs4.h | 260 +++++++++++++++++++++++++++++++++++++--------------
 1 file changed, 192 insertions(+), 68 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/nfs4.h b/include/linux/nfs4.h
index b6fa923a3111..c11c4db34639 100644
--- a/include/linux/nfs4.h
+++ b/include/linux/nfs4.h
@@ -389,79 +389,203 @@ enum lock_type4 {
 	NFS4_WRITEW_LT = 4
 };
 
+/*
+ * Symbol names and values are from RFC 7531 Section 2.
+ * "XDR Description of NFSv4.0"
+ */
+enum {
+	FATTR4_SUPPORTED_ATTRS		= 0,
+	FATTR4_TYPE			= 1,
+	FATTR4_FH_EXPIRE_TYPE		= 2,
+	FATTR4_CHANGE			= 3,
+	FATTR4_SIZE			= 4,
+	FATTR4_LINK_SUPPORT		= 5,
+	FATTR4_SYMLINK_SUPPORT		= 6,
+	FATTR4_NAMED_ATTR		= 7,
+	FATTR4_FSID			= 8,
+	FATTR4_UNIQUE_HANDLES		= 9,
+	FATTR4_LEASE_TIME		= 10,
+	FATTR4_RDATTR_ERROR		= 11,
+	FATTR4_ACL			= 12,
+	FATTR4_ACLSUPPORT		= 13,
+	FATTR4_ARCHIVE			= 14,
+	FATTR4_CANSETTIME		= 15,
+	FATTR4_CASE_INSENSITIVE		= 16,
+	FATTR4_CASE_PRESERVING		= 17,
+	FATTR4_CHOWN_RESTRICTED		= 18,
+	FATTR4_FILEHANDLE		= 19,
+	FATTR4_FILEID			= 20,
+	FATTR4_FILES_AVAIL		= 21,
+	FATTR4_FILES_FREE		= 22,
+	FATTR4_FILES_TOTAL		= 23,
+	FATTR4_FS_LOCATIONS		= 24,
+	FATTR4_HIDDEN			= 25,
+	FATTR4_HOMOGENEOUS		= 26,
+	FATTR4_MAXFILESIZE		= 27,
+	FATTR4_MAXLINK			= 28,
+	FATTR4_MAXNAME			= 29,
+	FATTR4_MAXREAD			= 30,
+	FATTR4_MAXWRITE			= 31,
+	FATTR4_MIMETYPE			= 32,
+	FATTR4_MODE			= 33,
+	FATTR4_NO_TRUNC			= 34,
+	FATTR4_NUMLINKS			= 35,
+	FATTR4_OWNER			= 36,
+	FATTR4_OWNER_GROUP		= 37,
+	FATTR4_QUOTA_AVAIL_HARD		= 38,
+	FATTR4_QUOTA_AVAIL_SOFT		= 39,
+	FATTR4_QUOTA_USED		= 40,
+	FATTR4_RAWDEV			= 41,
+	FATTR4_SPACE_AVAIL		= 42,
+	FATTR4_SPACE_FREE		= 43,
+	FATTR4_SPACE_TOTAL		= 44,
+	FATTR4_SPACE_USED		= 45,
+	FATTR4_SYSTEM			= 46,
+	FATTR4_TIME_ACCESS		= 47,
+	FATTR4_TIME_ACCESS_SET		= 48,
+	FATTR4_TIME_BACKUP		= 49,
+	FATTR4_TIME_CREATE		= 50,
+	FATTR4_TIME_DELTA		= 51,
+	FATTR4_TIME_METADATA		= 52,
+	FATTR4_TIME_MODIFY		= 53,
+	FATTR4_TIME_MODIFY_SET		= 54,
+	FATTR4_MOUNTED_ON_FILEID	= 55,
+};
+
+/*
+ * Symbol names and values are from RFC 5662 Section 2.
+ * "XDR Description of NFSv4.1"
+ */
+enum {
+	FATTR4_DIR_NOTIF_DELAY		= 56,
+	FATTR4_DIRENT_NOTIF_DELAY	= 57,
+	FATTR4_DACL			= 58,
+	FATTR4_SACL			= 59,
+	FATTR4_CHANGE_POLICY		= 60,
+	FATTR4_FS_STATUS		= 61,
+	FATTR4_FS_LAYOUT_TYPES		= 62,
+	FATTR4_LAYOUT_HINT		= 63,
+	FATTR4_LAYOUT_TYPES		= 64,
+	FATTR4_LAYOUT_BLKSIZE		= 65,
+	FATTR4_LAYOUT_ALIGNMENT		= 66,
+	FATTR4_FS_LOCATIONS_INFO	= 67,
+	FATTR4_MDSTHRESHOLD		= 68,
+	FATTR4_RETENTION_GET		= 69,
+	FATTR4_RETENTION_SET		= 70,
+	FATTR4_RETENTEVT_GET		= 71,
+	FATTR4_RETENTEVT_SET		= 72,
+	FATTR4_RETENTION_HOLD		= 73,
+	FATTR4_MODE_SET_MASKED		= 74,
+	FATTR4_SUPPATTR_EXCLCREAT	= 75,
+	FATTR4_FS_CHARSET_CAP		= 76,
+};
+
+/*
+ * Symbol names and values are from RFC 7863 Section 2.
+ * "XDR Description of NFSv4.2"
+ */
+enum {
+	FATTR4_CLONE_BLKSIZE		= 77,
+	FATTR4_SPACE_FREED		= 78,
+	FATTR4_CHANGE_ATTR_TYPE		= 79,
+	FATTR4_SEC_LABEL		= 80,
+};
+
+/*
+ * Symbol names and values are from RFC 8275 Section 5.
+ * "The mode_umask Attribute"
+ */
+enum {
+	FATTR4_MODE_UMASK		= 81,
+};
+
+/*
+ * Symbol names and values are from RFC 8276 Section 8.6.
+ * "Numeric Values Assigned to Protocol Extensions"
+ */
+enum {
+	FATTR4_XATTR_SUPPORT		= 82,
+};
+
+/*
+ * The following internal definitions enable processing the above
+ * attribute bits within 32-bit word boundaries.
+ */
 
 /* Mandatory Attributes */
-#define FATTR4_WORD0_SUPPORTED_ATTRS    (1UL << 0)
-#define FATTR4_WORD0_TYPE               (1UL << 1)
-#define FATTR4_WORD0_FH_EXPIRE_TYPE     (1UL << 2)
-#define FATTR4_WORD0_CHANGE             (1UL << 3)
-#define FATTR4_WORD0_SIZE               (1UL << 4)
-#define FATTR4_WORD0_LINK_SUPPORT       (1UL << 5)
-#define FATTR4_WORD0_SYMLINK_SUPPORT    (1UL << 6)
-#define FATTR4_WORD0_NAMED_ATTR         (1UL << 7)
-#define FATTR4_WORD0_FSID               (1UL << 8)
-#define FATTR4_WORD0_UNIQUE_HANDLES     (1UL << 9)
-#define FATTR4_WORD0_LEASE_TIME         (1UL << 10)
-#define FATTR4_WORD0_RDATTR_ERROR       (1UL << 11)
+#define FATTR4_WORD0_SUPPORTED_ATTRS    BIT(FATTR4_SUPPORTED_ATTRS)
+#define FATTR4_WORD0_TYPE               BIT(FATTR4_TYPE)
+#define FATTR4_WORD0_FH_EXPIRE_TYPE     BIT(FATTR4_FH_EXPIRE_TYPE)
+#define FATTR4_WORD0_CHANGE             BIT(FATTR4_CHANGE)
+#define FATTR4_WORD0_SIZE               BIT(FATTR4_SIZE)
+#define FATTR4_WORD0_LINK_SUPPORT       BIT(FATTR4_LINK_SUPPORT)
+#define FATTR4_WORD0_SYMLINK_SUPPORT    BIT(FATTR4_SYMLINK_SUPPORT)
+#define FATTR4_WORD0_NAMED_ATTR         BIT(FATTR4_NAMED_ATTR)
+#define FATTR4_WORD0_FSID               BIT(FATTR4_FSID)
+#define FATTR4_WORD0_UNIQUE_HANDLES     BIT(FATTR4_UNIQUE_HANDLES)
+#define FATTR4_WORD0_LEASE_TIME         BIT(FATTR4_LEASE_TIME)
+#define FATTR4_WORD0_RDATTR_ERROR       BIT(FATTR4_RDATTR_ERROR)
 /* Mandatory in NFSv4.1 */
-#define FATTR4_WORD2_SUPPATTR_EXCLCREAT (1UL << 11)
+#define FATTR4_WORD2_SUPPATTR_EXCLCREAT BIT(FATTR4_SUPPATTR_EXCLCREAT - 64)
 
 /* Recommended Attributes */
-#define FATTR4_WORD0_ACL                (1UL << 12)
-#define FATTR4_WORD0_ACLSUPPORT         (1UL << 13)
-#define FATTR4_WORD0_ARCHIVE            (1UL << 14)
-#define FATTR4_WORD0_CANSETTIME         (1UL << 15)
-#define FATTR4_WORD0_CASE_INSENSITIVE   (1UL << 16)
-#define FATTR4_WORD0_CASE_PRESERVING    (1UL << 17)
-#define FATTR4_WORD0_CHOWN_RESTRICTED   (1UL << 18)
-#define FATTR4_WORD0_FILEHANDLE         (1UL << 19)
-#define FATTR4_WORD0_FILEID             (1UL << 20)
-#define FATTR4_WORD0_FILES_AVAIL        (1UL << 21)
-#define FATTR4_WORD0_FILES_FREE         (1UL << 22)
-#define FATTR4_WORD0_FILES_TOTAL        (1UL << 23)
-#define FATTR4_WORD0_FS_LOCATIONS       (1UL << 24)
-#define FATTR4_WORD0_HIDDEN             (1UL << 25)
-#define FATTR4_WORD0_HOMOGENEOUS        (1UL << 26)
-#define FATTR4_WORD0_MAXFILESIZE        (1UL << 27)
-#define FATTR4_WORD0_MAXLINK            (1UL << 28)
-#define FATTR4_WORD0_MAXNAME            (1UL << 29)
-#define FATTR4_WORD0_MAXREAD            (1UL << 30)
-#define FATTR4_WORD0_MAXWRITE           (1UL << 31)
-#define FATTR4_WORD1_MIMETYPE           (1UL << 0)
-#define FATTR4_WORD1_MODE               (1UL << 1)
-#define FATTR4_WORD1_NO_TRUNC           (1UL << 2)
-#define FATTR4_WORD1_NUMLINKS           (1UL << 3)
-#define FATTR4_WORD1_OWNER              (1UL << 4)
-#define FATTR4_WORD1_OWNER_GROUP        (1UL << 5)
-#define FATTR4_WORD1_QUOTA_HARD         (1UL << 6)
-#define FATTR4_WORD1_QUOTA_SOFT         (1UL << 7)
-#define FATTR4_WORD1_QUOTA_USED         (1UL << 8)
-#define FATTR4_WORD1_RAWDEV             (1UL << 9)
-#define FATTR4_WORD1_SPACE_AVAIL        (1UL << 10)
-#define FATTR4_WORD1_SPACE_FREE         (1UL << 11)
-#define FATTR4_WORD1_SPACE_TOTAL        (1UL << 12)
-#define FATTR4_WORD1_SPACE_USED         (1UL << 13)
-#define FATTR4_WORD1_SYSTEM             (1UL << 14)
-#define FATTR4_WORD1_TIME_ACCESS        (1UL << 15)
-#define FATTR4_WORD1_TIME_ACCESS_SET    (1UL << 16)
-#define FATTR4_WORD1_TIME_BACKUP        (1UL << 17)
-#define FATTR4_WORD1_TIME_CREATE        (1UL << 18)
-#define FATTR4_WORD1_TIME_DELTA         (1UL << 19)
-#define FATTR4_WORD1_TIME_METADATA      (1UL << 20)
-#define FATTR4_WORD1_TIME_MODIFY        (1UL << 21)
-#define FATTR4_WORD1_TIME_MODIFY_SET    (1UL << 22)
-#define FATTR4_WORD1_MOUNTED_ON_FILEID  (1UL << 23)
-#define FATTR4_WORD1_DACL               (1UL << 26)
-#define FATTR4_WORD1_SACL               (1UL << 27)
-#define FATTR4_WORD1_FS_LAYOUT_TYPES    (1UL << 30)
-#define FATTR4_WORD2_LAYOUT_TYPES       (1UL << 0)
-#define FATTR4_WORD2_LAYOUT_BLKSIZE     (1UL << 1)
-#define FATTR4_WORD2_MDSTHRESHOLD       (1UL << 4)
-#define FATTR4_WORD2_CLONE_BLKSIZE	(1UL << 13)
-#define FATTR4_WORD2_CHANGE_ATTR_TYPE	(1UL << 15)
-#define FATTR4_WORD2_SECURITY_LABEL     (1UL << 16)
-#define FATTR4_WORD2_MODE_UMASK		(1UL << 17)
-#define FATTR4_WORD2_XATTR_SUPPORT	(1UL << 18)
+#define FATTR4_WORD0_ACL                BIT(FATTR4_ACL)
+#define FATTR4_WORD0_ACLSUPPORT         BIT(FATTR4_ACLSUPPORT)
+#define FATTR4_WORD0_ARCHIVE            BIT(FATTR4_ARCHIVE)
+#define FATTR4_WORD0_CANSETTIME         BIT(FATTR4_CANSETTIME)
+#define FATTR4_WORD0_CASE_INSENSITIVE   BIT(FATTR4_CASE_INSENSITIVE)
+#define FATTR4_WORD0_CASE_PRESERVING    BIT(FATTR4_CASE_PRESERVING)
+#define FATTR4_WORD0_CHOWN_RESTRICTED   BIT(FATTR4_CHOWN_RESTRICTED)
+#define FATTR4_WORD0_FILEHANDLE         BIT(FATTR4_FILEHANDLE)
+#define FATTR4_WORD0_FILEID             BIT(FATTR4_FILEID)
+#define FATTR4_WORD0_FILES_AVAIL        BIT(FATTR4_FILES_AVAIL)
+#define FATTR4_WORD0_FILES_FREE         BIT(FATTR4_FILES_FREE)
+#define FATTR4_WORD0_FILES_TOTAL        BIT(FATTR4_FILES_TOTAL)
+#define FATTR4_WORD0_FS_LOCATIONS       BIT(FATTR4_FS_LOCATIONS)
+#define FATTR4_WORD0_HIDDEN             BIT(FATTR4_HIDDEN)
+#define FATTR4_WORD0_HOMOGENEOUS        BIT(FATTR4_HOMOGENEOUS)
+#define FATTR4_WORD0_MAXFILESIZE        BIT(FATTR4_MAXFILESIZE)
+#define FATTR4_WORD0_MAXLINK            BIT(FATTR4_MAXLINK)
+#define FATTR4_WORD0_MAXNAME            BIT(FATTR4_MAXNAME)
+#define FATTR4_WORD0_MAXREAD            BIT(FATTR4_MAXREAD)
+#define FATTR4_WORD0_MAXWRITE           BIT(FATTR4_MAXWRITE)
+
+#define FATTR4_WORD1_MIMETYPE           BIT(FATTR4_MIMETYPE - 32)
+#define FATTR4_WORD1_MODE               BIT(FATTR4_MODE	- 32)
+#define FATTR4_WORD1_NO_TRUNC           BIT(FATTR4_NO_TRUNC - 32)
+#define FATTR4_WORD1_NUMLINKS           BIT(FATTR4_NUMLINKS - 32)
+#define FATTR4_WORD1_OWNER              BIT(FATTR4_OWNER - 32)
+#define FATTR4_WORD1_OWNER_GROUP        BIT(FATTR4_OWNER_GROUP - 32)
+#define FATTR4_WORD1_QUOTA_HARD         BIT(FATTR4_QUOTA_AVAIL_HARD - 32)
+#define FATTR4_WORD1_QUOTA_SOFT         BIT(FATTR4_QUOTA_AVAIL_SOFT - 32)
+#define FATTR4_WORD1_QUOTA_USED         BIT(FATTR4_QUOTA_USED - 32)
+#define FATTR4_WORD1_RAWDEV             BIT(FATTR4_RAWDEV - 32)
+#define FATTR4_WORD1_SPACE_AVAIL        BIT(FATTR4_SPACE_AVAIL - 32)
+#define FATTR4_WORD1_SPACE_FREE         BIT(FATTR4_SPACE_FREE - 32)
+#define FATTR4_WORD1_SPACE_TOTAL        BIT(FATTR4_SPACE_TOTAL - 32)
+#define FATTR4_WORD1_SPACE_USED         BIT(FATTR4_SPACE_USED - 32)
+#define FATTR4_WORD1_SYSTEM             BIT(FATTR4_SYSTEM - 32)
+#define FATTR4_WORD1_TIME_ACCESS        BIT(FATTR4_TIME_ACCESS - 32)
+#define FATTR4_WORD1_TIME_ACCESS_SET    BIT(FATTR4_TIME_ACCESS_SET - 32)
+#define FATTR4_WORD1_TIME_BACKUP        BIT(FATTR4_TIME_BACKUP - 32)
+#define FATTR4_WORD1_TIME_CREATE        BIT(FATTR4_TIME_CREATE - 32)
+#define FATTR4_WORD1_TIME_DELTA         BIT(FATTR4_TIME_DELTA - 32)
+#define FATTR4_WORD1_TIME_METADATA      BIT(FATTR4_TIME_METADATA - 32)
+#define FATTR4_WORD1_TIME_MODIFY        BIT(FATTR4_TIME_MODIFY - 32)
+#define FATTR4_WORD1_TIME_MODIFY_SET    BIT(FATTR4_TIME_MODIFY_SET - 32)
+#define FATTR4_WORD1_MOUNTED_ON_FILEID  BIT(FATTR4_MOUNTED_ON_FILEID - 32)
+#define FATTR4_WORD1_DACL               BIT(FATTR4_DACL	- 32)
+#define FATTR4_WORD1_SACL               BIT(FATTR4_SACL	- 32)
+#define FATTR4_WORD1_FS_LAYOUT_TYPES    BIT(FATTR4_FS_LAYOUT_TYPES - 32)
+
+#define FATTR4_WORD2_LAYOUT_TYPES       BIT(FATTR4_LAYOUT_TYPES - 64)
+#define FATTR4_WORD2_LAYOUT_BLKSIZE     BIT(FATTR4_LAYOUT_BLKSIZE - 64)
+#define FATTR4_WORD2_MDSTHRESHOLD       BIT(FATTR4_MDSTHRESHOLD	- 64)
+#define FATTR4_WORD2_CLONE_BLKSIZE	BIT(FATTR4_CLONE_BLKSIZE - 64)
+#define FATTR4_WORD2_CHANGE_ATTR_TYPE	BIT(FATTR4_CHANGE_ATTR_TYPE - 64)
+#define FATTR4_WORD2_SECURITY_LABEL     BIT(FATTR4_SEC_LABEL - 64)
+#define FATTR4_WORD2_MODE_UMASK		BIT(FATTR4_MODE_UMASK - 64)
+#define FATTR4_WORD2_XATTR_SUPPORT	BIT(FATTR4_XATTR_SUPPORT - 64)
 
 /* MDS threshold bitmap bits */
 #define THRESHOLD_RD                    (1UL << 0)
-- 
cgit v1.2.3


From 59de2a56d127890cc610f3896d5fc31887c54ac2 Mon Sep 17 00:00:00 2001
From: Heikki Krogerus <heikki.krogerus@linux.intel.com>
Date: Wed, 11 Oct 2023 13:58:24 +0300
Subject: usb: typec: Link enumerated USB devices with Type-C partner

Adding functions that USB hub code can use to inform the
Type-C class about connected USB devices.

Once taken into use, it will allow the Type-C port drivers
to power off components that are not needed, for example if
USB2 device is enumerated, everything that is only relevant
for USB3 (retimers, etc.), can be powered off.

This will also create a symlink "typec" for the USB devices
pointing to the USB Type-C partner device.

Suggested-by: Benson Leung <bleung@chromium.org>
Tested-by: Benson Leung <bleung@chromium.org>
Signed-off-by: Heikki Krogerus <heikki.krogerus@linux.intel.com>
Link: https://lore.kernel.org/r/20231011105825.320062-2-heikki.krogerus@linux.intel.com
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 Documentation/ABI/testing/sysfs-bus-usb |   9 +++
 drivers/usb/typec/class.c               | 108 +++++++++++++++++++++++++++++---
 drivers/usb/typec/class.h               |  16 +++++
 drivers/usb/typec/port-mapper.c         |   9 ++-
 include/linux/usb/typec.h               |  37 +++++++++++
 5 files changed, 169 insertions(+), 10 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/ABI/testing/sysfs-bus-usb b/Documentation/ABI/testing/sysfs-bus-usb
index a44bfe020061..2b7108e21977 100644
--- a/Documentation/ABI/testing/sysfs-bus-usb
+++ b/Documentation/ABI/testing/sysfs-bus-usb
@@ -313,6 +313,15 @@ Description:
 		Inter-Chip SSIC devices support asymmetric lanes up to 4 lanes per
 		direction. Devices before USB 3.2 are single lane (tx_lanes = 1)
 
+What:		/sys/bus/usb/devices/.../typec
+Date:		November 2023
+Contact:	Heikki Krogerus <heikki.krogerus@linux.intel.com>
+Description:
+		Symlink to the USB Type-C partner device. USB Type-C partner
+		represents the component that communicates over the
+		Configuration Channel (CC signal on USB Type-C connectors and
+		cables) with the local port.
+
 What:		/sys/bus/usb/devices/usbX/bAlternateSetting
 Description:
 		The current interface alternate setting number, in decimal.
diff --git a/drivers/usb/typec/class.c b/drivers/usb/typec/class.c
index 9c1dbf3c00e0..2e0451bd336e 100644
--- a/drivers/usb/typec/class.c
+++ b/drivers/usb/typec/class.c
@@ -13,6 +13,7 @@
 #include <linux/usb/pd_vdo.h>
 #include <linux/usb/typec_mux.h>
 #include <linux/usb/typec_retimer.h>
+#include <linux/usb.h>
 
 #include "bus.h"
 #include "class.h"
@@ -681,6 +682,33 @@ const struct device_type typec_partner_dev_type = {
 	.release = typec_partner_release,
 };
 
+static void typec_partner_link_device(struct typec_partner *partner, struct device *dev)
+{
+	int ret;
+
+	ret = sysfs_create_link(&dev->kobj, &partner->dev.kobj, "typec");
+	if (ret)
+		return;
+
+	ret = sysfs_create_link(&partner->dev.kobj, &dev->kobj, dev_name(dev));
+	if (ret) {
+		sysfs_remove_link(&dev->kobj, "typec");
+		return;
+	}
+
+	if (partner->attach)
+		partner->attach(partner, dev);
+}
+
+static void typec_partner_unlink_device(struct typec_partner *partner, struct device *dev)
+{
+	sysfs_remove_link(&partner->dev.kobj, dev_name(dev));
+	sysfs_remove_link(&dev->kobj, "typec");
+
+	if (partner->deattach)
+		partner->deattach(partner, dev);
+}
+
 /**
  * typec_partner_set_identity - Report result from Discover Identity command
  * @partner: The partner updated identity values
@@ -865,6 +893,8 @@ struct typec_partner *typec_register_partner(struct typec_port *port,
 	partner->num_altmodes = -1;
 	partner->pd_revision = desc->pd_revision;
 	partner->svdm_version = port->cap->svdm_version;
+	partner->attach = desc->attach;
+	partner->deattach = desc->deattach;
 
 	if (desc->identity) {
 		/*
@@ -887,6 +917,11 @@ struct typec_partner *typec_register_partner(struct typec_port *port,
 		return ERR_PTR(ret);
 	}
 
+	if (port->usb2_dev)
+		typec_partner_link_device(partner, port->usb2_dev);
+	if (port->usb3_dev)
+		typec_partner_link_device(partner, port->usb3_dev);
+
 	return partner;
 }
 EXPORT_SYMBOL_GPL(typec_register_partner);
@@ -899,8 +934,19 @@ EXPORT_SYMBOL_GPL(typec_register_partner);
  */
 void typec_unregister_partner(struct typec_partner *partner)
 {
-	if (!IS_ERR_OR_NULL(partner))
-		device_unregister(&partner->dev);
+	struct typec_port *port;
+
+	if (IS_ERR_OR_NULL(partner))
+		return;
+
+	port = to_typec_port(partner->dev.parent);
+
+	if (port->usb2_dev)
+		typec_partner_unlink_device(partner, port->usb2_dev);
+	if (port->usb3_dev)
+		typec_partner_unlink_device(partner, port->usb3_dev);
+
+	device_unregister(&partner->dev);
 }
 EXPORT_SYMBOL_GPL(typec_unregister_partner);
 
@@ -1775,6 +1821,50 @@ static int partner_match(struct device *dev, void *data)
 	return is_typec_partner(dev);
 }
 
+static struct typec_partner *typec_get_partner(struct typec_port *port)
+{
+	struct device *dev;
+
+	dev = device_find_child(&port->dev, NULL, partner_match);
+	if (!dev)
+		return NULL;
+
+	return to_typec_partner(dev);
+}
+
+static void typec_partner_attach(struct typec_connector *con, struct device *dev)
+{
+	struct typec_port *port = container_of(con, struct typec_port, con);
+	struct typec_partner *partner = typec_get_partner(port);
+	struct usb_device *udev = to_usb_device(dev);
+
+	if (udev->speed < USB_SPEED_SUPER)
+		port->usb2_dev = dev;
+	else
+		port->usb3_dev = dev;
+
+	if (partner) {
+		typec_partner_link_device(partner, dev);
+		put_device(&partner->dev);
+	}
+}
+
+static void typec_partner_deattach(struct typec_connector *con, struct device *dev)
+{
+	struct typec_port *port = container_of(con, struct typec_port, con);
+	struct typec_partner *partner = typec_get_partner(port);
+
+	if (partner) {
+		typec_partner_unlink_device(partner, dev);
+		put_device(&partner->dev);
+	}
+
+	if (port->usb2_dev == dev)
+		port->usb2_dev = NULL;
+	else if (port->usb3_dev == dev)
+		port->usb3_dev = NULL;
+}
+
 /**
  * typec_set_data_role - Report data role change
  * @port: The USB Type-C Port where the role was changed
@@ -1784,7 +1874,7 @@ static int partner_match(struct device *dev, void *data)
  */
 void typec_set_data_role(struct typec_port *port, enum typec_data_role role)
 {
-	struct device *partner_dev;
+	struct typec_partner *partner;
 
 	if (port->data_role == role)
 		return;
@@ -1793,14 +1883,14 @@ void typec_set_data_role(struct typec_port *port, enum typec_data_role role)
 	sysfs_notify(&port->dev.kobj, NULL, "data_role");
 	kobject_uevent(&port->dev.kobj, KOBJ_CHANGE);
 
-	partner_dev = device_find_child(&port->dev, NULL, partner_match);
-	if (!partner_dev)
+	partner = typec_get_partner(port);
+	if (!partner)
 		return;
 
-	if (to_typec_partner(partner_dev)->identity)
-		typec_product_type_notify(partner_dev);
+	if (partner->identity)
+		typec_product_type_notify(&partner->dev);
 
-	put_device(partner_dev);
+	put_device(&partner->dev);
 }
 EXPORT_SYMBOL_GPL(typec_set_data_role);
 
@@ -2251,6 +2341,8 @@ struct typec_port *typec_register_port(struct device *parent,
 	port->ops = cap->ops;
 	port->port_type = cap->type;
 	port->prefer_role = cap->prefer_role;
+	port->con.attach = typec_partner_attach;
+	port->con.deattach = typec_partner_deattach;
 
 	device_initialize(&port->dev);
 	port->dev.class = &typec_class;
diff --git a/drivers/usb/typec/class.h b/drivers/usb/typec/class.h
index 673b2952b074..c36761ba3f59 100644
--- a/drivers/usb/typec/class.h
+++ b/drivers/usb/typec/class.h
@@ -8,6 +8,7 @@
 
 struct typec_mux;
 struct typec_switch;
+struct usb_device;
 
 struct typec_plug {
 	struct device			dev;
@@ -35,6 +36,9 @@ struct typec_partner {
 	enum usb_pd_svdm_ver		svdm_version;
 
 	struct usb_power_delivery	*pd;
+
+	void (*attach)(struct typec_partner *partner, struct device *dev);
+	void (*deattach)(struct typec_partner *partner, struct device *dev);
 };
 
 struct typec_port {
@@ -59,6 +63,18 @@ struct typec_port {
 
 	const struct typec_capability	*cap;
 	const struct typec_operations   *ops;
+
+	struct typec_connector		con;
+
+	/*
+	 * REVISIT: Only USB devices for now. If there are others, these need to
+	 * be converted into a list.
+	 *
+	 * NOTE: These may be registered first before the typec_partner, so they
+	 * will always have to be kept here instead of struct typec_partner.
+	 */
+	struct device			*usb2_dev;
+	struct device			*usb3_dev;
 };
 
 #define to_typec_port(_dev_) container_of(_dev_, struct typec_port, dev)
diff --git a/drivers/usb/typec/port-mapper.c b/drivers/usb/typec/port-mapper.c
index a929e000d0e2..d42da5720a25 100644
--- a/drivers/usb/typec/port-mapper.c
+++ b/drivers/usb/typec/port-mapper.c
@@ -8,17 +8,22 @@
 
 #include <linux/acpi.h>
 #include <linux/component.h>
+#include <linux/usb.h>
 
 #include "class.h"
 
 static int typec_aggregate_bind(struct device *dev)
 {
-	return component_bind_all(dev, NULL);
+	struct typec_port *port = to_typec_port(dev);
+
+	return component_bind_all(dev, &port->con);
 }
 
 static void typec_aggregate_unbind(struct device *dev)
 {
-	component_unbind_all(dev, NULL);
+	struct typec_port *port = to_typec_port(dev);
+
+	component_unbind_all(dev, &port->con);
 }
 
 static const struct component_master_ops typec_aggregate_ops = {
diff --git a/include/linux/usb/typec.h b/include/linux/usb/typec.h
index 8fa781207970..a05d6f6f2536 100644
--- a/include/linux/usb/typec.h
+++ b/include/linux/usb/typec.h
@@ -202,6 +202,8 @@ struct typec_cable_desc {
  * @accessory: Audio, Debug or none.
  * @identity: Discover Identity command data
  * @pd_revision: USB Power Delivery Specification Revision if supported
+ * @attach: Notification about attached USB device
+ * @deattach: Notification about removed USB device
  *
  * Details about a partner that is attached to USB Type-C port. If @identity
  * member exists when partner is registered, a directory named "identity" is
@@ -217,6 +219,9 @@ struct typec_partner_desc {
 	enum typec_accessory	accessory;
 	struct usb_pd_identity	*identity;
 	u16			pd_revision; /* 0300H = "3.0" */
+
+	void (*attach)(struct typec_partner *partner, struct device *dev);
+	void (*deattach)(struct typec_partner *partner, struct device *dev);
 };
 
 /**
@@ -335,4 +340,36 @@ int typec_port_set_usb_power_delivery(struct typec_port *port, struct usb_power_
 int typec_partner_set_usb_power_delivery(struct typec_partner *partner,
 					 struct usb_power_delivery *pd);
 
+/**
+ * struct typec_connector - Representation of Type-C port for external drivers
+ * @attach: notification about device removal
+ * @deattach: notification about device removal
+ *
+ * Drivers that control the USB and other ports (DisplayPorts, etc.), that are
+ * connected to the Type-C connectors, can use these callbacks to inform the
+ * Type-C connector class about connections and disconnections. That information
+ * can then be used by the typec-port drivers to power on or off parts that are
+ * needed or not needed - as an example, in USB mode if USB2 device is
+ * enumerated, USB3 components (retimers, phys, and what have you) do not need
+ * to be powered on.
+ *
+ * The attached (enumerated) devices will be liked with the typec-partner device.
+ */
+struct typec_connector {
+	void (*attach)(struct typec_connector *con, struct device *dev);
+	void (*deattach)(struct typec_connector *con, struct device *dev);
+};
+
+static inline void typec_attach(struct typec_connector *con, struct device *dev)
+{
+	if (con && con->attach)
+		con->attach(con, dev);
+}
+
+static inline void typec_deattach(struct typec_connector *con, struct device *dev)
+{
+	if (con && con->deattach)
+		con->deattach(con, dev);
+}
+
 #endif /* __LINUX_USB_TYPEC_H */
-- 
cgit v1.2.3


From a5e80e18f268ea7c7a36bc4159de0deb3b5a2171 Mon Sep 17 00:00:00 2001
From: Zhiguo Niu <zhiguo.niu@unisoc.com>
Date: Mon, 16 Oct 2023 19:27:31 +0800
Subject: f2fs: fix error path of __f2fs_build_free_nids

If NAT is corrupted, let scan_nat_page() return EFSCORRUPTED, so that,
caller can set SBI_NEED_FSCK flag into checkpoint for later repair by
fsck.

Also, this patch introduces a new fscorrupted error flag, and in above
scenario, it will persist the error flag into superblock synchronously
to avoid it has no luck to trigger a checkpoint to record SBI_NEED_FSCK

Signed-off-by: Zhiguo Niu <zhiguo.niu@unisoc.com>
Signed-off-by: Chao Yu <chao@kernel.org>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
---
 fs/f2fs/node.c          | 11 +++++++++--
 include/linux/f2fs_fs.h |  1 +
 2 files changed, 10 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c
index a2b2c6c7f66d..1c8bf56c834c 100644
--- a/fs/f2fs/node.c
+++ b/fs/f2fs/node.c
@@ -2389,7 +2389,7 @@ static int scan_nat_page(struct f2fs_sb_info *sbi,
 		blk_addr = le32_to_cpu(nat_blk->entries[i].block_addr);
 
 		if (blk_addr == NEW_ADDR)
-			return -EINVAL;
+			return -EFSCORRUPTED;
 
 		if (blk_addr == NULL_ADDR) {
 			add_free_nid(sbi, start_nid, true, true);
@@ -2504,7 +2504,14 @@ static int __f2fs_build_free_nids(struct f2fs_sb_info *sbi,
 
 			if (ret) {
 				f2fs_up_read(&nm_i->nat_tree_lock);
-				f2fs_err(sbi, "NAT is corrupt, run fsck to fix it");
+
+				if (ret == -EFSCORRUPTED) {
+					f2fs_err(sbi, "NAT is corrupt, run fsck to fix it");
+					set_sbi_flag(sbi, SBI_NEED_FSCK);
+					f2fs_handle_error(sbi,
+						ERROR_INCONSISTENT_NAT);
+				}
+
 				return ret;
 			}
 		}
diff --git a/include/linux/f2fs_fs.h b/include/linux/f2fs_fs.h
index 07ed69c2840d..039fe0ce8d83 100644
--- a/include/linux/f2fs_fs.h
+++ b/include/linux/f2fs_fs.h
@@ -104,6 +104,7 @@ enum f2fs_error {
 	ERROR_CORRUPTED_VERITY_XATTR,
 	ERROR_CORRUPTED_XATTR,
 	ERROR_INVALID_NODE_REFERENCE,
+	ERROR_INCONSISTENT_NAT,
 	ERROR_MAX,
 };
 
-- 
cgit v1.2.3


From 65610453459f9048678a0daef89d592e412ec00a Mon Sep 17 00:00:00 2001
From: Kefeng Wang <wangkefeng.wang@huawei.com>
Date: Thu, 21 Sep 2023 15:44:12 +0800
Subject: mm: memory: add vm_normal_folio_pmd()

Patch series "mm: convert numa balancing functions to use a folio", v2.

do_numa_pages() only handles non-compound pages, and only PMD-mapped THPs
are handled in do_huge_pmd_numa_page().  But a large, PTE-mapped folio
will be supported so let's convert more numa balancing functions to
use/take a folio in preparation for that, no functional change intended
for now.


This patch (of 6):

The new vm_normal_folio_pmd() wrapper is similar to vm_normal_folio(),
which allow them to completely replace the struct page variables with
struct folio variables.

Link: https://lkml.kernel.org/r/20230921074417.24004-1-wangkefeng.wang@huawei.com
Link: https://lkml.kernel.org/r/20230921074417.24004-2-wangkefeng.wang@huawei.com
Signed-off-by: Kefeng Wang <wangkefeng.wang@huawei.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: "Huang, Ying" <ying.huang@intel.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Mike Kravetz <mike.kravetz@oracle.com>
Cc: Zi Yan <ziy@nvidia.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/mm.h |  2 ++
 mm/memory.c        | 10 ++++++++++
 2 files changed, 12 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 126b54b45442..52c40b3d0813 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -2327,6 +2327,8 @@ struct folio *vm_normal_folio(struct vm_area_struct *vma, unsigned long addr,
 			     pte_t pte);
 struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr,
 			     pte_t pte);
+struct folio *vm_normal_folio_pmd(struct vm_area_struct *vma,
+				  unsigned long addr, pmd_t pmd);
 struct page *vm_normal_page_pmd(struct vm_area_struct *vma, unsigned long addr,
 				pmd_t pmd);
 
diff --git a/mm/memory.c b/mm/memory.c
index d956b231e835..311e862c6404 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -689,6 +689,16 @@ struct page *vm_normal_page_pmd(struct vm_area_struct *vma, unsigned long addr,
 out:
 	return pfn_to_page(pfn);
 }
+
+struct folio *vm_normal_folio_pmd(struct vm_area_struct *vma,
+				  unsigned long addr, pmd_t pmd)
+{
+	struct page *page = vm_normal_page_pmd(vma, addr, pmd);
+
+	if (page)
+		return page_folio(page);
+	return NULL;
+}
 #endif
 
 static void restore_exclusive_pte(struct vm_area_struct *vma,
-- 
cgit v1.2.3


From 75c70128a67311070115b90d826a229d4bbbb2b5 Mon Sep 17 00:00:00 2001
From: Kefeng Wang <wangkefeng.wang@huawei.com>
Date: Thu, 21 Sep 2023 15:44:16 +0800
Subject: mm: mempolicy: make mpol_misplaced() to take a folio

In preparation for large folio numa balancing, make mpol_misplaced() to
take a folio, no functional change intended.

Link: https://lkml.kernel.org/r/20230921074417.24004-6-wangkefeng.wang@huawei.com
Signed-off-by: Kefeng Wang <wangkefeng.wang@huawei.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: "Huang, Ying" <ying.huang@intel.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Mike Kravetz <mike.kravetz@oracle.com>
Cc: Zi Yan <ziy@nvidia.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/mempolicy.h |  5 +++--
 mm/memory.c               |  2 +-
 mm/mempolicy.c            | 22 ++++++++++++----------
 3 files changed, 16 insertions(+), 13 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mempolicy.h b/include/linux/mempolicy.h
index d232de7cdc56..6c2754d7bfed 100644
--- a/include/linux/mempolicy.h
+++ b/include/linux/mempolicy.h
@@ -174,7 +174,7 @@ extern void mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol);
 /* Check if a vma is migratable */
 extern bool vma_migratable(struct vm_area_struct *vma);
 
-extern int mpol_misplaced(struct page *, struct vm_area_struct *, unsigned long);
+int mpol_misplaced(struct folio *, struct vm_area_struct *, unsigned long);
 extern void mpol_put_task_policy(struct task_struct *);
 
 static inline bool mpol_is_preferred_many(struct mempolicy *pol)
@@ -278,7 +278,8 @@ static inline int mpol_parse_str(char *str, struct mempolicy **mpol)
 }
 #endif
 
-static inline int mpol_misplaced(struct page *page, struct vm_area_struct *vma,
+static inline int mpol_misplaced(struct folio *folio,
+				 struct vm_area_struct *vma,
 				 unsigned long address)
 {
 	return -1; /* no node preference */
diff --git a/mm/memory.c b/mm/memory.c
index 20b290c9dc87..7ca16ed67634 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -4738,7 +4738,7 @@ int numa_migrate_prep(struct folio *folio, struct vm_area_struct *vma,
 		*flags |= TNF_FAULT_LOCAL;
 	}
 
-	return mpol_misplaced(&folio->page, vma, addr);
+	return mpol_misplaced(folio, vma, addr);
 }
 
 static vm_fault_t do_numa_page(struct vm_fault *vmf)
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index f1b00d6ac7ee..69c0eac7292c 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -2564,24 +2564,25 @@ static void sp_free(struct sp_node *n)
 }
 
 /**
- * mpol_misplaced - check whether current page node is valid in policy
+ * mpol_misplaced - check whether current folio node is valid in policy
  *
- * @page: page to be checked
- * @vma: vm area where page mapped
- * @addr: virtual address where page mapped
+ * @folio: folio to be checked
+ * @vma: vm area where folio mapped
+ * @addr: virtual address in @vma for shared policy lookup and interleave policy
  *
- * Lookup current policy node id for vma,addr and "compare to" page's
+ * Lookup current policy node id for vma,addr and "compare to" folio's
  * node id.  Policy determination "mimics" alloc_page_vma().
  * Called from fault path where we know the vma and faulting address.
  *
  * Return: NUMA_NO_NODE if the page is in a node that is valid for this
- * policy, or a suitable node ID to allocate a replacement page from.
+ * policy, or a suitable node ID to allocate a replacement folio from.
  */
-int mpol_misplaced(struct page *page, struct vm_area_struct *vma, unsigned long addr)
+int mpol_misplaced(struct folio *folio, struct vm_area_struct *vma,
+		   unsigned long addr)
 {
 	struct mempolicy *pol;
 	struct zoneref *z;
-	int curnid = page_to_nid(page);
+	int curnid = folio_nid(folio);
 	unsigned long pgoff;
 	int thiscpu = raw_smp_processor_id();
 	int thisnid = cpu_to_node(thiscpu);
@@ -2637,11 +2638,12 @@ int mpol_misplaced(struct page *page, struct vm_area_struct *vma, unsigned long
 		BUG();
 	}
 
-	/* Migrate the page towards the node whose CPU is referencing it */
+	/* Migrate the folio towards the node whose CPU is referencing it */
 	if (pol->flags & MPOL_F_MORON) {
 		polnid = thisnid;
 
-		if (!should_numa_migrate_memory(current, page, curnid, thiscpu))
+		if (!should_numa_migrate_memory(current, &folio->page, curnid,
+						thiscpu))
 			goto out;
 	}
 
-- 
cgit v1.2.3


From 8c9ae56dc73b5ae48a14000b96292bd4f2aeb710 Mon Sep 17 00:00:00 2001
From: Kefeng Wang <wangkefeng.wang@huawei.com>
Date: Thu, 21 Sep 2023 15:44:17 +0800
Subject: sched/numa, mm: make numa migrate functions to take a folio

The cpupid (or access time) is stored in the head page for THP, so it is
safely to make should_numa_migrate_memory() and numa_hint_fault_latency()
to take a folio.  This is in preparation for large folio numa balancing.

Link: https://lkml.kernel.org/r/20230921074417.24004-7-wangkefeng.wang@huawei.com
Signed-off-by: Kefeng Wang <wangkefeng.wang@huawei.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: "Huang, Ying" <ying.huang@intel.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Mike Kravetz <mike.kravetz@oracle.com>
Cc: Zi Yan <ziy@nvidia.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/sched/numa_balancing.h |  6 +++---
 kernel/sched/fair.c                  | 12 ++++++------
 mm/mempolicy.c                       |  2 +-
 3 files changed, 10 insertions(+), 10 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sched/numa_balancing.h b/include/linux/sched/numa_balancing.h
index 3988762efe15..06a9d35650f0 100644
--- a/include/linux/sched/numa_balancing.h
+++ b/include/linux/sched/numa_balancing.h
@@ -20,8 +20,8 @@ extern void task_numa_fault(int last_node, int node, int pages, int flags);
 extern pid_t task_numa_group_id(struct task_struct *p);
 extern void set_numabalancing_state(bool enabled);
 extern void task_numa_free(struct task_struct *p, bool final);
-extern bool should_numa_migrate_memory(struct task_struct *p, struct page *page,
-					int src_nid, int dst_cpu);
+bool should_numa_migrate_memory(struct task_struct *p, struct folio *folio,
+				int src_nid, int dst_cpu);
 #else
 static inline void task_numa_fault(int last_node, int node, int pages,
 				   int flags)
@@ -38,7 +38,7 @@ static inline void task_numa_free(struct task_struct *p, bool final)
 {
 }
 static inline bool should_numa_migrate_memory(struct task_struct *p,
-				struct page *page, int src_nid, int dst_cpu)
+				struct folio *folio, int src_nid, int dst_cpu)
 {
 	return true;
 }
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index cb225921bbca..42aefe7e6fdc 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -1722,12 +1722,12 @@ static bool pgdat_free_space_enough(struct pglist_data *pgdat)
  * The smaller the hint page fault latency, the higher the possibility
  * for the page to be hot.
  */
-static int numa_hint_fault_latency(struct page *page)
+static int numa_hint_fault_latency(struct folio *folio)
 {
 	int last_time, time;
 
 	time = jiffies_to_msecs(jiffies);
-	last_time = xchg_page_access_time(page, time);
+	last_time = xchg_page_access_time(&folio->page, time);
 
 	return (time - last_time) & PAGE_ACCESS_TIME_MASK;
 }
@@ -1784,7 +1784,7 @@ static void numa_promotion_adjust_threshold(struct pglist_data *pgdat,
 	}
 }
 
-bool should_numa_migrate_memory(struct task_struct *p, struct page * page,
+bool should_numa_migrate_memory(struct task_struct *p, struct folio *folio,
 				int src_nid, int dst_cpu)
 {
 	struct numa_group *ng = deref_curr_numa_group(p);
@@ -1814,16 +1814,16 @@ bool should_numa_migrate_memory(struct task_struct *p, struct page * page,
 		numa_promotion_adjust_threshold(pgdat, rate_limit, def_th);
 
 		th = pgdat->nbp_threshold ? : def_th;
-		latency = numa_hint_fault_latency(page);
+		latency = numa_hint_fault_latency(folio);
 		if (latency >= th)
 			return false;
 
 		return !numa_promotion_rate_limit(pgdat, rate_limit,
-						  thp_nr_pages(page));
+						  folio_nr_pages(folio));
 	}
 
 	this_cpupid = cpu_pid_to_cpupid(dst_cpu, current->pid);
-	last_cpupid = page_cpupid_xchg_last(page, this_cpupid);
+	last_cpupid = page_cpupid_xchg_last(&folio->page, this_cpupid);
 
 	if (!(sysctl_numa_balancing_mode & NUMA_BALANCING_MEMORY_TIERING) &&
 	    !node_is_toptier(src_nid) && !cpupid_valid(last_cpupid))
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 69c0eac7292c..abd94f4c7f6b 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -2642,7 +2642,7 @@ int mpol_misplaced(struct folio *folio, struct vm_area_struct *vma,
 	if (pol->flags & MPOL_F_MORON) {
 		polnid = thisnid;
 
-		if (!should_numa_migrate_memory(current, &folio->page, curnid,
+		if (!should_numa_migrate_memory(current, folio, curnid,
 						thiscpu))
 			goto out;
 	}
-- 
cgit v1.2.3


From 3c6f33b7273a7e2f2b2497b62c8400bd957b2fbe Mon Sep 17 00:00:00 2001
From: Stefan Roesch <shr@devkernel.io>
Date: Fri, 22 Sep 2023 14:11:40 -0700
Subject: mm/ksm: support fork/exec for prctl

Patch series "mm/ksm: add fork-exec support for prctl", v4.

A process can enable KSM with the prctl system call.  When the process is
forked the KSM flag is inherited by the child process.  However if the
process is executing an exec system call directly after the fork, the KSM
setting is cleared.  This patch series addresses this problem.

1) Change the mask in coredump.h for execing a new process
2) Add a new test case in ksm_functional_tests


This patch (of 2):

Today we have two ways to enable KSM:

1) madvise system call
   This allows to enable KSM for a memory region for a long time.

2) prctl system call
   This is a recent addition to enable KSM for the complete process.
   In addition when a process is forked, the KSM setting is inherited.

This change only affects the second case.

One of the use cases for (2) was to support the ability to enable
KSM for cgroups. This allows systemd to enable KSM for the seed
process. By enabling it in the seed process all child processes inherit
the setting.

This works correctly when the process is forked. However it doesn't
support fork/exec workflow.

From the previous cover letter:

....
Use case 3:
With the madvise call sharing opportunities are only enabled for the
current process: it is a workload-local decision. A considerable number
of sharing opportunities may exist across multiple workloads or jobs
(if they are part of the same security domain). Only a higler level
entity like a job scheduler or container can know for certain if its
running one or more instances of a job. That job scheduler however
doesn't have the necessary internal workload knowledge to make targeted
madvise calls.
....

In addition it can also be a bit surprising that fork keeps the KSM
setting and fork/exec does not.

Link: https://lkml.kernel.org/r/20230922211141.320789-1-shr@devkernel.io
Link: https://lkml.kernel.org/r/20230922211141.320789-2-shr@devkernel.io
Signed-off-by: Stefan Roesch <shr@devkernel.io>
Fixes: d7597f59d1d3 ("mm: add new api to enable ksm per process")
Reviewed-by: David Hildenbrand <david@redhat.com>
Reported-by: Carl Klemm <carl@uvos.xyz>
Tested-by: Carl Klemm <carl@uvos.xyz>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Rik van Riel <riel@surriel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/sched/coredump.h | 13 ++++++++-----
 1 file changed, 8 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sched/coredump.h b/include/linux/sched/coredump.h
index 1b37fa8fc723..02f5090ffea2 100644
--- a/include/linux/sched/coredump.h
+++ b/include/linux/sched/coredump.h
@@ -71,6 +71,7 @@ static inline int get_dumpable(struct mm_struct *mm)
 #define MMF_UNSTABLE		22	/* mm is unstable for copy_from_user */
 #define MMF_HUGE_ZERO_PAGE	23      /* mm has ever used the global huge zero page */
 #define MMF_DISABLE_THP		24	/* disable THP for all VMAs */
+#define MMF_DISABLE_THP_MASK	(1 << MMF_DISABLE_THP)
 #define MMF_OOM_REAP_QUEUED	25	/* mm was queued for oom_reaper */
 #define MMF_MULTIPROCESS	26	/* mm is shared between processes */
 /*
@@ -85,13 +86,15 @@ static inline int get_dumpable(struct mm_struct *mm)
 #define MMF_HAS_MDWE		28
 #define MMF_HAS_MDWE_MASK	(1 << MMF_HAS_MDWE)
 
-#define MMF_DISABLE_THP_MASK	(1 << MMF_DISABLE_THP)
 
-#define MMF_INIT_MASK		(MMF_DUMPABLE_MASK | MMF_DUMP_FILTER_MASK |\
-				 MMF_DISABLE_THP_MASK | MMF_HAS_MDWE_MASK)
+#define MMF_HAS_MDWE_NO_INHERIT	29
 
-#define MMF_VM_MERGE_ANY	29
-#define MMF_HAS_MDWE_NO_INHERIT	30
+#define MMF_VM_MERGE_ANY	30
+#define MMF_VM_MERGE_ANY_MASK	(1 << MMF_VM_MERGE_ANY)
+
+#define MMF_INIT_MASK		(MMF_DUMPABLE_MASK | MMF_DUMP_FILTER_MASK |\
+				 MMF_DISABLE_THP_MASK | MMF_HAS_MDWE_MASK |\
+				 MMF_VM_MERGE_ANY_MASK)
 
 static inline unsigned long mmf_init_flags(unsigned long flags)
 {
-- 
cgit v1.2.3


From a08c7193e4f18dc8508f2d07d0de2c5b94cb39a3 Mon Sep 17 00:00:00 2001
From: Sidhartha Kumar <sidhartha.kumar@oracle.com>
Date: Tue, 26 Sep 2023 12:20:17 -0700
Subject: mm/filemap: remove hugetlb special casing in filemap.c

Remove special cased hugetlb handling code within the page cache by
changing the granularity of ->index to the base page size rather than the
huge page size.  The motivation of this patch is to reduce complexity
within the filemap code while also increasing performance by removing
branches that are evaluated on every page cache lookup.

To support the change in index, new wrappers for hugetlb page cache
interactions are added.  These wrappers perform the conversion to a linear
index which is now expected by the page cache for huge pages.

========================= PERFORMANCE ======================================

Perf was used to check the performance differences after the patch.
Overall the performance is similar to mainline with a very small larger
overhead that occurs in __filemap_add_folio() and
hugetlb_add_to_page_cache().  This is because of the larger overhead that
occurs in xa_load() and xa_store() as the xarray is now using more entries
to store hugetlb folios in the page cache.

Timing

aarch64
    2MB Page Size
        6.5-rc3 + this patch:
            [root@sidhakum-ol9-1 hugepages]# time fallocate -l 700GB test.txt
            real    1m49.568s
            user    0m0.000s
            sys     1m49.461s

        6.5-rc3:
            [root]# time fallocate -l 700GB test.txt
            real    1m47.495s
            user    0m0.000s
            sys     1m47.370s
    1GB Page Size
        6.5-rc3 + this patch:
            [root@sidhakum-ol9-1 hugepages1G]# time fallocate -l 700GB test.txt
            real    1m47.024s
            user    0m0.000s
            sys     1m46.921s

        6.5-rc3:
            [root@sidhakum-ol9-1 hugepages1G]# time fallocate -l 700GB test.txt
            real    1m44.551s
            user    0m0.000s
            sys     1m44.438s

x86
    2MB Page Size
        6.5-rc3 + this patch:
            [root@sidhakum-ol9-2 hugepages]# time fallocate -l 100GB test.txt
            real    0m22.383s
            user    0m0.000s
            sys     0m22.255s

        6.5-rc3:
            [opc@sidhakum-ol9-2 hugepages]$ time sudo fallocate -l 100GB /dev/hugepages/test.txt
            real    0m22.735s
            user    0m0.038s
            sys     0m22.567s

    1GB Page Size
        6.5-rc3 + this patch:
            [root@sidhakum-ol9-2 hugepages1GB]# time fallocate -l 100GB test.txt
            real    0m25.786s
            user    0m0.001s
            sys     0m25.589s

        6.5-rc3:
            [root@sidhakum-ol9-2 hugepages1G]# time fallocate -l 100GB test.txt
            real    0m33.454s
            user    0m0.001s
            sys     0m33.193s

aarch64:
    workload - fallocate a 700GB file backed by huge pages

    6.5-rc3 + this patch:
        2MB Page Size:
            --100.00%--__arm64_sys_fallocate
                          ksys_fallocate
                          vfs_fallocate
                          hugetlbfs_fallocate
                          |
                          |--95.04%--__pi_clear_page
                          |
                          |--3.57%--clear_huge_page
                          |          |
                          |          |--2.63%--rcu_all_qs
                          |          |
                          |           --0.91%--__cond_resched
                          |
                           --0.67%--__cond_resched
            0.17%     0.00%             0  fallocate  [kernel.vmlinux]       [k] hugetlb_add_to_page_cache
            0.14%     0.10%            11  fallocate  [kernel.vmlinux]       [k] __filemap_add_folio

    6.5-rc3
        2MB Page Size:
                --100.00%--__arm64_sys_fallocate
                          ksys_fallocate
                          vfs_fallocate
                          hugetlbfs_fallocate
                          |
                          |--94.91%--__pi_clear_page
                          |
                          |--4.11%--clear_huge_page
                          |          |
                          |          |--3.00%--rcu_all_qs
                          |          |
                          |           --1.10%--__cond_resched
                          |
                           --0.59%--__cond_resched
            0.08%     0.01%             1  fallocate  [kernel.kallsyms]  [k] hugetlb_add_to_page_cache
            0.05%     0.03%             3  fallocate  [kernel.kallsyms]  [k] __filemap_add_folio

x86
    workload - fallocate a 100GB file backed by huge pages

    6.5-rc3 + this patch:
        2MB Page Size:
            hugetlbfs_fallocate
            |
            --99.57%--clear_huge_page
                |
                --98.47%--clear_page_erms
                    |
                    --0.53%--asm_sysvec_apic_timer_interrupt

            0.04%     0.04%             1  fallocate  [kernel.kallsyms]     [k] xa_load
            0.04%     0.00%             0  fallocate  [kernel.kallsyms]     [k] hugetlb_add_to_page_cache
            0.04%     0.00%             0  fallocate  [kernel.kallsyms]     [k] __filemap_add_folio
            0.04%     0.00%             0  fallocate  [kernel.kallsyms]     [k] xas_store

    6.5-rc3
        2MB Page Size:
                --99.93%--__x64_sys_fallocate
                          vfs_fallocate
                          hugetlbfs_fallocate
                          |
                           --99.38%--clear_huge_page
                                     |
                                     |--98.40%--clear_page_erms
                                     |
                                      --0.59%--__cond_resched
            0.03%     0.03%             1  fallocate  [kernel.kallsyms]  [k] __filemap_add_folio

========================= TESTING ======================================

This patch passes libhugetlbfs tests and LTP hugetlb tests

********** TEST SUMMARY
*                      2M
*                      32-bit 64-bit
*     Total testcases:   110    113
*             Skipped:     0      0
*                PASS:   107    113
*                FAIL:     0      0
*    Killed by signal:     3      0
*   Bad configuration:     0      0
*       Expected FAIL:     0      0
*     Unexpected PASS:     0      0
*    Test not present:     0      0
* Strange test result:     0      0
**********

    Done executing testcases.
    LTP Version:  20220527-178-g2761a81c4

page migration was also tested using Mike Kravetz's test program.[8]

[dan.carpenter@linaro.org: fix an NULL vs IS_ERR() bug]
  Link: https://lkml.kernel.org/r/1772c296-1417-486f-8eef-171af2192681@moroto.mountain
Link: https://lkml.kernel.org/r/20230926192017.98183-1-sidhartha.kumar@oracle.com
Signed-off-by: Sidhartha Kumar <sidhartha.kumar@oracle.com>
Signed-off-by: Dan Carpenter <dan.carpenter@linaro.org>
Reported-and-tested-by: syzbot+c225dea486da4d5592bd@syzkaller.appspotmail.com
Closes: https://syzkaller.appspot.com/bug?extid=c225dea486da4d5592bd
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Mike Kravetz <mike.kravetz@oracle.com>
Cc: Muchun Song <songmuchun@bytedance.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 fs/hugetlbfs/inode.c    | 37 +++++++++++++++++++------------------
 include/linux/hugetlb.h | 12 ++++++++++++
 include/linux/pagemap.h | 32 ++------------------------------
 mm/filemap.c            | 34 ++++++++++------------------------
 mm/hugetlb.c            | 32 ++++++--------------------------
 mm/migrate.c            |  6 +++---
 6 files changed, 52 insertions(+), 101 deletions(-)

(limited to 'include/linux')

diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index 60fce26ff937..926d01c493fb 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -334,7 +334,7 @@ static ssize_t hugetlbfs_read_iter(struct kiocb *iocb, struct iov_iter *to)
 	ssize_t retval = 0;
 
 	while (iov_iter_count(to)) {
-		struct page *page;
+		struct folio *folio;
 		size_t nr, copied, want;
 
 		/* nr is the maximum number of bytes to copy from this page */
@@ -352,18 +352,18 @@ static ssize_t hugetlbfs_read_iter(struct kiocb *iocb, struct iov_iter *to)
 		}
 		nr = nr - offset;
 
-		/* Find the page */
-		page = find_lock_page(mapping, index);
-		if (unlikely(page == NULL)) {
+		/* Find the folio */
+		folio = filemap_lock_hugetlb_folio(h, mapping, index);
+		if (IS_ERR(folio)) {
 			/*
 			 * We have a HOLE, zero out the user-buffer for the
 			 * length of the hole or request.
 			 */
 			copied = iov_iter_zero(nr, to);
 		} else {
-			unlock_page(page);
+			folio_unlock(folio);
 
-			if (!PageHWPoison(page))
+			if (!folio_test_has_hwpoisoned(folio))
 				want = nr;
 			else {
 				/*
@@ -371,19 +371,19 @@ static ssize_t hugetlbfs_read_iter(struct kiocb *iocb, struct iov_iter *to)
 				 * touching the 1st raw HWPOISON subpage after
 				 * offset.
 				 */
-				want = adjust_range_hwpoison(page, offset, nr);
+				want = adjust_range_hwpoison(&folio->page, offset, nr);
 				if (want == 0) {
-					put_page(page);
+					folio_put(folio);
 					retval = -EIO;
 					break;
 				}
 			}
 
 			/*
-			 * We have the page, copy it to user space buffer.
+			 * We have the folio, copy it to user space buffer.
 			 */
-			copied = copy_page_to_iter(page, offset, want, to);
-			put_page(page);
+			copied = copy_folio_to_iter(folio, offset, want, to);
+			folio_put(folio);
 		}
 		offset += copied;
 		retval += copied;
@@ -661,21 +661,20 @@ static void remove_inode_hugepages(struct inode *inode, loff_t lstart,
 {
 	struct hstate *h = hstate_inode(inode);
 	struct address_space *mapping = &inode->i_data;
-	const pgoff_t start = lstart >> huge_page_shift(h);
-	const pgoff_t end = lend >> huge_page_shift(h);
+	const pgoff_t end = lend >> PAGE_SHIFT;
 	struct folio_batch fbatch;
 	pgoff_t next, index;
 	int i, freed = 0;
 	bool truncate_op = (lend == LLONG_MAX);
 
 	folio_batch_init(&fbatch);
-	next = start;
+	next = lstart >> PAGE_SHIFT;
 	while (filemap_get_folios(mapping, &next, end - 1, &fbatch)) {
 		for (i = 0; i < folio_batch_count(&fbatch); ++i) {
 			struct folio *folio = fbatch.folios[i];
 			u32 hash = 0;
 
-			index = folio->index;
+			index = folio->index >> huge_page_order(h);
 			hash = hugetlb_fault_mutex_hash(mapping, index);
 			mutex_lock(&hugetlb_fault_mutex_table[hash]);
 
@@ -693,7 +692,9 @@ static void remove_inode_hugepages(struct inode *inode, loff_t lstart,
 	}
 
 	if (truncate_op)
-		(void)hugetlb_unreserve_pages(inode, start, LONG_MAX, freed);
+		(void)hugetlb_unreserve_pages(inode,
+				lstart >> huge_page_shift(h),
+				LONG_MAX, freed);
 }
 
 static void hugetlbfs_evict_inode(struct inode *inode)
@@ -741,7 +742,7 @@ static void hugetlbfs_zero_partial_page(struct hstate *h,
 	pgoff_t idx = start >> huge_page_shift(h);
 	struct folio *folio;
 
-	folio = filemap_lock_folio(mapping, idx);
+	folio = filemap_lock_hugetlb_folio(h, mapping, idx);
 	if (IS_ERR(folio))
 		return;
 
@@ -886,7 +887,7 @@ static long hugetlbfs_fallocate(struct file *file, int mode, loff_t offset,
 		mutex_lock(&hugetlb_fault_mutex_table[hash]);
 
 		/* See if already present in mapping to avoid alloc/free */
-		folio = filemap_get_folio(mapping, index);
+		folio = filemap_get_folio(mapping, index << huge_page_order(h));
 		if (!IS_ERR(folio)) {
 			folio_put(folio);
 			mutex_unlock(&hugetlb_fault_mutex_table[hash]);
diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index a30686e649f7..d935b0584f1d 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -812,6 +812,12 @@ static inline unsigned int blocks_per_huge_page(struct hstate *h)
 	return huge_page_size(h) / 512;
 }
 
+static inline struct folio *filemap_lock_hugetlb_folio(struct hstate *h,
+				struct address_space *mapping, pgoff_t idx)
+{
+	return filemap_lock_folio(mapping, idx << huge_page_order(h));
+}
+
 #include <asm/hugetlb.h>
 
 #ifndef is_hugepage_only_range
@@ -1008,6 +1014,12 @@ static inline struct hugepage_subpool *hugetlb_folio_subpool(struct folio *folio
 	return NULL;
 }
 
+static inline struct folio *filemap_lock_hugetlb_folio(struct hstate *h,
+				struct address_space *mapping, pgoff_t idx)
+{
+	return NULL;
+}
+
 static inline int isolate_or_dissolve_huge_page(struct page *page,
 						struct list_head *list)
 {
diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
index 351c3b7f93a1..759b29d9a69a 100644
--- a/include/linux/pagemap.h
+++ b/include/linux/pagemap.h
@@ -789,9 +789,6 @@ static inline pgoff_t folio_next_index(struct folio *folio)
  */
 static inline struct page *folio_file_page(struct folio *folio, pgoff_t index)
 {
-	/* HugeTLBfs indexes the page cache in units of hpage_size */
-	if (folio_test_hugetlb(folio))
-		return &folio->page;
 	return folio_page(folio, index & (folio_nr_pages(folio) - 1));
 }
 
@@ -807,9 +804,6 @@ static inline struct page *folio_file_page(struct folio *folio, pgoff_t index)
  */
 static inline bool folio_contains(struct folio *folio, pgoff_t index)
 {
-	/* HugeTLBfs indexes the page cache in units of hpage_size */
-	if (folio_test_hugetlb(folio))
-		return folio->index == index;
 	return index - folio_index(folio) < folio_nr_pages(folio);
 }
 
@@ -867,10 +861,9 @@ static inline struct folio *read_mapping_folio(struct address_space *mapping,
 }
 
 /*
- * Get index of the page within radix-tree (but not for hugetlb pages).
- * (TODO: remove once hugetlb pages will have ->index in PAGE_SIZE)
+ * Get the offset in PAGE_SIZE (even for hugetlb pages).
  */
-static inline pgoff_t page_to_index(struct page *page)
+static inline pgoff_t page_to_pgoff(struct page *page)
 {
 	struct page *head;
 
@@ -885,19 +878,6 @@ static inline pgoff_t page_to_index(struct page *page)
 	return head->index + page - head;
 }
 
-extern pgoff_t hugetlb_basepage_index(struct page *page);
-
-/*
- * Get the offset in PAGE_SIZE (even for hugetlb pages).
- * (TODO: hugetlb pages should have ->index in PAGE_SIZE)
- */
-static inline pgoff_t page_to_pgoff(struct page *page)
-{
-	if (unlikely(PageHuge(page)))
-		return hugetlb_basepage_index(page);
-	return page_to_index(page);
-}
-
 /*
  * Return byte-offset into filesystem object for page.
  */
@@ -934,24 +914,16 @@ static inline loff_t folio_file_pos(struct folio *folio)
 
 /*
  * Get the offset in PAGE_SIZE (even for hugetlb folios).
- * (TODO: hugetlb folios should have ->index in PAGE_SIZE)
  */
 static inline pgoff_t folio_pgoff(struct folio *folio)
 {
-	if (unlikely(folio_test_hugetlb(folio)))
-		return hugetlb_basepage_index(&folio->page);
 	return folio->index;
 }
 
-extern pgoff_t linear_hugepage_index(struct vm_area_struct *vma,
-				     unsigned long address);
-
 static inline pgoff_t linear_page_index(struct vm_area_struct *vma,
 					unsigned long address)
 {
 	pgoff_t pgoff;
-	if (unlikely(is_vm_hugetlb_page(vma)))
-		return linear_hugepage_index(vma, address);
 	pgoff = (address - vma->vm_start) >> PAGE_SHIFT;
 	pgoff += vma->vm_pgoff;
 	return pgoff;
diff --git a/mm/filemap.c b/mm/filemap.c
index 9481ffaf24e6..340c693112a9 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -131,11 +131,8 @@ static void page_cache_delete(struct address_space *mapping,
 
 	mapping_set_update(&xas, mapping);
 
-	/* hugetlb pages are represented by a single entry in the xarray */
-	if (!folio_test_hugetlb(folio)) {
-		xas_set_order(&xas, folio->index, folio_order(folio));
-		nr = folio_nr_pages(folio);
-	}
+	xas_set_order(&xas, folio->index, folio_order(folio));
+	nr = folio_nr_pages(folio);
 
 	VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
 
@@ -234,7 +231,7 @@ void filemap_free_folio(struct address_space *mapping, struct folio *folio)
 	if (free_folio)
 		free_folio(folio);
 
-	if (folio_test_large(folio) && !folio_test_hugetlb(folio))
+	if (folio_test_large(folio))
 		refs = folio_nr_pages(folio);
 	folio_put_refs(folio, refs);
 }
@@ -855,14 +852,15 @@ noinline int __filemap_add_folio(struct address_space *mapping,
 
 	if (!huge) {
 		int error = mem_cgroup_charge(folio, NULL, gfp);
-		VM_BUG_ON_FOLIO(index & (folio_nr_pages(folio) - 1), folio);
 		if (error)
 			return error;
 		charged = true;
-		xas_set_order(&xas, index, folio_order(folio));
-		nr = folio_nr_pages(folio);
 	}
 
+	VM_BUG_ON_FOLIO(index & (folio_nr_pages(folio) - 1), folio);
+	xas_set_order(&xas, index, folio_order(folio));
+	nr = folio_nr_pages(folio);
+
 	gfp &= GFP_RECLAIM_MASK;
 	folio_ref_add(folio, nr);
 	folio->mapping = mapping;
@@ -2040,7 +2038,7 @@ unsigned find_get_entries(struct address_space *mapping, pgoff_t *start,
 		int idx = folio_batch_count(fbatch) - 1;
 
 		folio = fbatch->folios[idx];
-		if (!xa_is_value(folio) && !folio_test_hugetlb(folio))
+		if (!xa_is_value(folio))
 			nr = folio_nr_pages(folio);
 		*start = indices[idx] + nr;
 	}
@@ -2104,7 +2102,7 @@ put:
 		int idx = folio_batch_count(fbatch) - 1;
 
 		folio = fbatch->folios[idx];
-		if (!xa_is_value(folio) && !folio_test_hugetlb(folio))
+		if (!xa_is_value(folio))
 			nr = folio_nr_pages(folio);
 		*start = indices[idx] + nr;
 	}
@@ -2145,9 +2143,6 @@ unsigned filemap_get_folios(struct address_space *mapping, pgoff_t *start,
 			continue;
 		if (!folio_batch_add(fbatch, folio)) {
 			unsigned long nr = folio_nr_pages(folio);
-
-			if (folio_test_hugetlb(folio))
-				nr = 1;
 			*start = folio->index + nr;
 			goto out;
 		}
@@ -2213,9 +2208,6 @@ unsigned filemap_get_folios_contig(struct address_space *mapping,
 
 		if (!folio_batch_add(fbatch, folio)) {
 			nr = folio_nr_pages(folio);
-
-			if (folio_test_hugetlb(folio))
-				nr = 1;
 			*start = folio->index + nr;
 			goto out;
 		}
@@ -2232,10 +2224,7 @@ update_start:
 
 	if (nr) {
 		folio = fbatch->folios[nr - 1];
-		if (folio_test_hugetlb(folio))
-			*start = folio->index + 1;
-		else
-			*start = folio_next_index(folio);
+		*start = folio->index + folio_nr_pages(folio);
 	}
 out:
 	rcu_read_unlock();
@@ -2273,9 +2262,6 @@ unsigned filemap_get_folios_tag(struct address_space *mapping, pgoff_t *start,
 			continue;
 		if (!folio_batch_add(fbatch, folio)) {
 			unsigned long nr = folio_nr_pages(folio);
-
-			if (folio_test_hugetlb(folio))
-				nr = 1;
 			*start = folio->index + nr;
 			goto out;
 		}
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index de220e3ff8be..0aaf28ce32e5 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -952,7 +952,7 @@ static long region_count(struct resv_map *resv, long f, long t)
 
 /*
  * Convert the address within this vma to the page offset within
- * the mapping, in pagecache page units; huge pages here.
+ * the mapping, huge page units here.
  */
 static pgoff_t vma_hugecache_offset(struct hstate *h,
 			struct vm_area_struct *vma, unsigned long address)
@@ -961,13 +961,6 @@ static pgoff_t vma_hugecache_offset(struct hstate *h,
 			(vma->vm_pgoff >> huge_page_order(h));
 }
 
-pgoff_t linear_hugepage_index(struct vm_area_struct *vma,
-				     unsigned long address)
-{
-	return vma_hugecache_offset(hstate_vma(vma), vma, address);
-}
-EXPORT_SYMBOL_GPL(linear_hugepage_index);
-
 /**
  * vma_kernel_pagesize - Page size granularity for this VMA.
  * @vma: The user mapping.
@@ -2074,20 +2067,6 @@ struct address_space *hugetlb_page_mapping_lock_write(struct page *hpage)
 	return NULL;
 }
 
-pgoff_t hugetlb_basepage_index(struct page *page)
-{
-	struct page *page_head = compound_head(page);
-	pgoff_t index = page_index(page_head);
-	unsigned long compound_idx;
-
-	if (compound_order(page_head) > MAX_ORDER)
-		compound_idx = page_to_pfn(page) - page_to_pfn(page_head);
-	else
-		compound_idx = page - page_head;
-
-	return (index << compound_order(page_head)) + compound_idx;
-}
-
 static struct folio *alloc_buddy_hugetlb_folio(struct hstate *h,
 		gfp_t gfp_mask, int nid, nodemask_t *nmask,
 		nodemask_t *node_alloc_noretry)
@@ -5772,7 +5751,7 @@ static bool hugetlbfs_pagecache_present(struct hstate *h,
 			struct vm_area_struct *vma, unsigned long address)
 {
 	struct address_space *mapping = vma->vm_file->f_mapping;
-	pgoff_t idx = vma_hugecache_offset(h, vma, address);
+	pgoff_t idx = linear_page_index(vma, address);
 	struct folio *folio;
 
 	folio = filemap_get_folio(mapping, idx);
@@ -5789,6 +5768,7 @@ int hugetlb_add_to_page_cache(struct folio *folio, struct address_space *mapping
 	struct hstate *h = hstate_inode(inode);
 	int err;
 
+	idx <<= huge_page_order(h);
 	__folio_set_locked(folio);
 	err = __filemap_add_folio(mapping, folio, idx, GFP_KERNEL, NULL);
 
@@ -5896,7 +5876,7 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm,
 	 * before we get page_table_lock.
 	 */
 	new_folio = false;
-	folio = filemap_lock_folio(mapping, idx);
+	folio = filemap_lock_hugetlb_folio(h, mapping, idx);
 	if (IS_ERR(folio)) {
 		size = i_size_read(mapping->host) >> huge_page_shift(h);
 		if (idx >= size)
@@ -6205,7 +6185,7 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 		/* Just decrements count, does not deallocate */
 		vma_end_reservation(h, vma, haddr);
 
-		pagecache_folio = filemap_lock_folio(mapping, idx);
+		pagecache_folio = filemap_lock_hugetlb_folio(h, mapping, idx);
 		if (IS_ERR(pagecache_folio))
 			pagecache_folio = NULL;
 	}
@@ -6338,7 +6318,7 @@ int hugetlb_mfill_atomic_pte(pte_t *dst_pte,
 
 	if (is_continue) {
 		ret = -EFAULT;
-		folio = filemap_lock_folio(mapping, idx);
+		folio = filemap_lock_hugetlb_folio(h, mapping, idx);
 		if (IS_ERR(folio))
 			goto out;
 		folio_in_pagecache = true;
diff --git a/mm/migrate.c b/mm/migrate.c
index 7d1804c4a5d9..942f8c9cd068 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -524,7 +524,7 @@ int migrate_huge_page_move_mapping(struct address_space *mapping,
 	int expected_count;
 
 	xas_lock_irq(&xas);
-	expected_count = 2 + folio_has_private(src);
+	expected_count = folio_expected_refs(mapping, src);
 	if (!folio_ref_freeze(src, expected_count)) {
 		xas_unlock_irq(&xas);
 		return -EAGAIN;
@@ -533,11 +533,11 @@ int migrate_huge_page_move_mapping(struct address_space *mapping,
 	dst->index = src->index;
 	dst->mapping = src->mapping;
 
-	folio_get(dst);
+	folio_ref_add(dst, folio_nr_pages(dst));
 
 	xas_store(&xas, dst);
 
-	folio_ref_unfreeze(src, expected_count - 1);
+	folio_ref_unfreeze(src, expected_count - folio_nr_pages(src));
 
 	xas_unlock_irq(&xas);
 
-- 
cgit v1.2.3


From 07a8bdd4120ced3490ef9adf51b8086af0aaa8e7 Mon Sep 17 00:00:00 2001
From: Huang Ying <ying.huang@intel.com>
Date: Tue, 26 Sep 2023 14:06:25 +0800
Subject: memory tiering: add abstract distance calculation algorithms
 management

Patch series "memory tiering: calculate abstract distance based on ACPI
HMAT", v4.

We have the explicit memory tiers framework to manage systems with
multiple types of memory, e.g., DRAM in DIMM slots and CXL memory devices.
Where, same kind of memory devices will be grouped into memory types,
then put into memory tiers.  To describe the performance of a memory type,
abstract distance is defined.  Which is in direct proportion to the memory
latency and inversely proportional to the memory bandwidth.  To keep the
code as simple as possible, fixed abstract distance is used in dax/kmem to
describe slow memory such as Optane DCPMM.

To support more memory types, in this series, we added the abstract
distance calculation algorithm management mechanism, provided a algorithm
implementation based on ACPI HMAT, and used the general abstract distance
calculation interface in dax/kmem driver.  So, dax/kmem can support HBM
(high bandwidth memory) in addition to the original Optane DCPMM.


This patch (of 4):

The abstract distance may be calculated by various drivers, such as ACPI
HMAT, CXL CDAT, etc.  While it may be used by various code which hot-add
memory node, such as dax/kmem etc.  To decouple the algorithm users and
the providers, the abstract distance calculation algorithms management
mechanism is implemented in this patch.  It provides interface for the
providers to register the implementation, and interface for the users.

Multiple algorithm implementations can cooperate via calculating abstract
distance for different memory nodes.  The preference of algorithm
implementations can be specified via priority (notifier_block.priority).

Link: https://lkml.kernel.org/r/20230926060628.265989-1-ying.huang@intel.com
Link: https://lkml.kernel.org/r/20230926060628.265989-2-ying.huang@intel.com
Signed-off-by: "Huang, Ying" <ying.huang@intel.com>
Tested-by: Bharata B Rao <bharata@amd.com>
Reviewed-by: Alistair Popple <apopple@nvidia.com>
Reviewed-by: Dave Jiang <dave.jiang@intel.com>
Cc: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
Cc: Wei Xu <weixugc@google.com>
Cc: Dan Williams <dan.j.williams@intel.com>
Cc: Dave Hansen <dave.hansen@intel.com>
Cc: Davidlohr Bueso <dave@stgolabs.net>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Jonathan Cameron <Jonathan.Cameron@huawei.com>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Yang Shi <shy828301@gmail.com>
Cc: Rafael J Wysocki <rafael.j.wysocki@intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/memory-tiers.h | 19 ++++++++++++++
 mm/memory-tiers.c            | 59 ++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 78 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/memory-tiers.h b/include/linux/memory-tiers.h
index 4fa178b50784..fddc59990644 100644
--- a/include/linux/memory-tiers.h
+++ b/include/linux/memory-tiers.h
@@ -6,6 +6,7 @@
 #include <linux/nodemask.h>
 #include <linux/kref.h>
 #include <linux/mmzone.h>
+#include <linux/notifier.h>
 /*
  * Each tier cover a abstrace distance chunk size of 128
  */
@@ -36,6 +37,9 @@ struct memory_dev_type *alloc_memory_type(int adistance);
 void put_memory_type(struct memory_dev_type *memtype);
 void init_node_memory_type(int node, struct memory_dev_type *default_type);
 void clear_node_memory_type(int node, struct memory_dev_type *memtype);
+int register_mt_adistance_algorithm(struct notifier_block *nb);
+int unregister_mt_adistance_algorithm(struct notifier_block *nb);
+int mt_calc_adistance(int node, int *adist);
 #ifdef CONFIG_MIGRATION
 int next_demotion_node(int node);
 void node_get_allowed_targets(pg_data_t *pgdat, nodemask_t *targets);
@@ -97,5 +101,20 @@ static inline bool node_is_toptier(int node)
 {
 	return true;
 }
+
+static inline int register_mt_adistance_algorithm(struct notifier_block *nb)
+{
+	return 0;
+}
+
+static inline int unregister_mt_adistance_algorithm(struct notifier_block *nb)
+{
+	return 0;
+}
+
+static inline int mt_calc_adistance(int node, int *adist)
+{
+	return NOTIFY_DONE;
+}
 #endif	/* CONFIG_NUMA */
 #endif  /* _LINUX_MEMORY_TIERS_H */
diff --git a/mm/memory-tiers.c b/mm/memory-tiers.c
index 876d8a5e210e..4301e7e89223 100644
--- a/mm/memory-tiers.c
+++ b/mm/memory-tiers.c
@@ -5,6 +5,7 @@
 #include <linux/kobject.h>
 #include <linux/memory.h>
 #include <linux/memory-tiers.h>
+#include <linux/notifier.h>
 
 #include "internal.h"
 
@@ -105,6 +106,8 @@ static int top_tier_adistance;
 static struct demotion_nodes *node_demotion __read_mostly;
 #endif /* CONFIG_MIGRATION */
 
+static BLOCKING_NOTIFIER_HEAD(mt_adistance_algorithms);
+
 static inline struct memory_tier *to_memory_tier(struct device *device)
 {
 	return container_of(device, struct memory_tier, dev);
@@ -592,6 +595,62 @@ void clear_node_memory_type(int node, struct memory_dev_type *memtype)
 }
 EXPORT_SYMBOL_GPL(clear_node_memory_type);
 
+/**
+ * register_mt_adistance_algorithm() - Register memory tiering abstract distance algorithm
+ * @nb: The notifier block which describe the algorithm
+ *
+ * Return: 0 on success, errno on error.
+ *
+ * Every memory tiering abstract distance algorithm provider needs to
+ * register the algorithm with register_mt_adistance_algorithm().  To
+ * calculate the abstract distance for a specified memory node, the
+ * notifier function will be called unless some high priority
+ * algorithm has provided result.  The prototype of the notifier
+ * function is as follows,
+ *
+ *   int (*algorithm_notifier)(struct notifier_block *nb,
+ *                             unsigned long nid, void *data);
+ *
+ * Where "nid" specifies the memory node, "data" is the pointer to the
+ * returned abstract distance (that is, "int *adist").  If the
+ * algorithm provides the result, NOTIFY_STOP should be returned.
+ * Otherwise, return_value & %NOTIFY_STOP_MASK == 0 to allow the next
+ * algorithm in the chain to provide the result.
+ */
+int register_mt_adistance_algorithm(struct notifier_block *nb)
+{
+	return blocking_notifier_chain_register(&mt_adistance_algorithms, nb);
+}
+EXPORT_SYMBOL_GPL(register_mt_adistance_algorithm);
+
+/**
+ * unregister_mt_adistance_algorithm() - Unregister memory tiering abstract distance algorithm
+ * @nb: the notifier block which describe the algorithm
+ *
+ * Return: 0 on success, errno on error.
+ */
+int unregister_mt_adistance_algorithm(struct notifier_block *nb)
+{
+	return blocking_notifier_chain_unregister(&mt_adistance_algorithms, nb);
+}
+EXPORT_SYMBOL_GPL(unregister_mt_adistance_algorithm);
+
+/**
+ * mt_calc_adistance() - Calculate abstract distance with registered algorithms
+ * @node: the node to calculate abstract distance for
+ * @adist: the returned abstract distance
+ *
+ * Return: if return_value & %NOTIFY_STOP_MASK != 0, then some
+ * abstract distance algorithm provides the result, and return it via
+ * @adist.  Otherwise, no algorithm can provide the result and @adist
+ * will be kept as it is.
+ */
+int mt_calc_adistance(int node, int *adist)
+{
+	return blocking_notifier_call_chain(&mt_adistance_algorithms, node, adist);
+}
+EXPORT_SYMBOL_GPL(mt_calc_adistance);
+
 static int __meminit memtier_hotplug_callback(struct notifier_block *self,
 					      unsigned long action, void *_arg)
 {
-- 
cgit v1.2.3


From 3718c02dbd4c88d47b5af003acdb3d1112604ea3 Mon Sep 17 00:00:00 2001
From: Huang Ying <ying.huang@intel.com>
Date: Tue, 26 Sep 2023 14:06:27 +0800
Subject: acpi, hmat: calculate abstract distance with HMAT

A memory tiering abstract distance calculation algorithm based on ACPI
HMAT is implemented.  The basic idea is as follows.

The performance attributes of system default DRAM nodes are recorded as
the base line.  Whose abstract distance is MEMTIER_ADISTANCE_DRAM.  Then,
the ratio of the abstract distance of a memory node (target) to
MEMTIER_ADISTANCE_DRAM is scaled based on the ratio of the performance
attributes of the node to that of the default DRAM nodes.

The functions to record the read/write latency/bandwidth of the default
DRAM nodes and calculate abstract distance according to read/write
latency/bandwidth ratio will be used by CXL CDAT (Coherent Device
Attribute Table) and other memory device drivers.  So, they are put in
memory-tiers.c.

Link: https://lkml.kernel.org/r/20230926060628.265989-4-ying.huang@intel.com
Signed-off-by: "Huang, Ying" <ying.huang@intel.com>
Tested-by: Bharata B Rao <bharata@amd.com>
Reviewed-by: Dave Jiang <dave.jiang@intel.com>
Reviewed-by: Alistair Popple <apopple@nvidia.com>
Cc: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
Cc: Wei Xu <weixugc@google.com>
Cc: Dan Williams <dan.j.williams@intel.com>
Cc: Dave Hansen <dave.hansen@intel.com>
Cc: Davidlohr Bueso <dave@stgolabs.net>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Jonathan Cameron <Jonathan.Cameron@huawei.com>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Yang Shi <shy828301@gmail.com>
Cc: Rafael J Wysocki <rafael.j.wysocki@intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 drivers/acpi/numa/hmat.c     |  65 ++++++++++++++++++++++++++-
 include/linux/memory-tiers.h |  18 ++++++++
 mm/memory-tiers.c            | 103 ++++++++++++++++++++++++++++++++++++++++++-
 3 files changed, 183 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/acpi/numa/hmat.c b/drivers/acpi/numa/hmat.c
index 2dee0098f1a9..9ef5f1bdcfdb 100644
--- a/drivers/acpi/numa/hmat.c
+++ b/drivers/acpi/numa/hmat.c
@@ -24,6 +24,7 @@
 #include <linux/node.h>
 #include <linux/sysfs.h>
 #include <linux/dax.h>
+#include <linux/memory-tiers.h>
 
 static u8 hmat_revision;
 static int hmat_disable __initdata;
@@ -759,6 +760,61 @@ static int hmat_callback(struct notifier_block *self,
 	return NOTIFY_OK;
 }
 
+static int hmat_set_default_dram_perf(void)
+{
+	int rc;
+	int nid, pxm;
+	struct memory_target *target;
+	struct node_hmem_attrs *attrs;
+
+	if (!default_dram_type)
+		return -EIO;
+
+	for_each_node_mask(nid, default_dram_type->nodes) {
+		pxm = node_to_pxm(nid);
+		target = find_mem_target(pxm);
+		if (!target)
+			continue;
+		attrs = &target->hmem_attrs[1];
+		rc = mt_set_default_dram_perf(nid, attrs, "ACPI HMAT");
+		if (rc)
+			return rc;
+	}
+
+	return 0;
+}
+
+static int hmat_calculate_adistance(struct notifier_block *self,
+				    unsigned long nid, void *data)
+{
+	static DECLARE_BITMAP(p_nodes, MAX_NUMNODES);
+	struct memory_target *target;
+	struct node_hmem_attrs *perf;
+	int *adist = data;
+	int pxm;
+
+	pxm = node_to_pxm(nid);
+	target = find_mem_target(pxm);
+	if (!target)
+		return NOTIFY_OK;
+
+	mutex_lock(&target_lock);
+	hmat_update_target_attrs(target, p_nodes, 1);
+	mutex_unlock(&target_lock);
+
+	perf = &target->hmem_attrs[1];
+
+	if (mt_perf_to_adistance(perf, adist))
+		return NOTIFY_OK;
+
+	return NOTIFY_STOP;
+}
+
+static struct notifier_block hmat_adist_nb __meminitdata = {
+	.notifier_call = hmat_calculate_adistance,
+	.priority = 100,
+};
+
 static __init void hmat_free_structures(void)
 {
 	struct memory_target *target, *tnext;
@@ -841,8 +897,13 @@ static __init int hmat_init(void)
 	hmat_register_targets();
 
 	/* Keep the table and structures if the notifier may use them */
-	if (!hotplug_memory_notifier(hmat_callback, HMAT_CALLBACK_PRI))
-		return 0;
+	if (hotplug_memory_notifier(hmat_callback, HMAT_CALLBACK_PRI))
+		goto out_put;
+
+	if (!hmat_set_default_dram_perf())
+		register_mt_adistance_algorithm(&hmat_adist_nb);
+
+	return 0;
 out_put:
 	hmat_free_structures();
 	acpi_put_table(tbl);
diff --git a/include/linux/memory-tiers.h b/include/linux/memory-tiers.h
index fddc59990644..59d15c6d3c0d 100644
--- a/include/linux/memory-tiers.h
+++ b/include/linux/memory-tiers.h
@@ -31,8 +31,11 @@ struct memory_dev_type {
 	struct kref kref;
 };
 
+struct node_hmem_attrs;
+
 #ifdef CONFIG_NUMA
 extern bool numa_demotion_enabled;
+extern struct memory_dev_type *default_dram_type;
 struct memory_dev_type *alloc_memory_type(int adistance);
 void put_memory_type(struct memory_dev_type *memtype);
 void init_node_memory_type(int node, struct memory_dev_type *default_type);
@@ -40,6 +43,9 @@ void clear_node_memory_type(int node, struct memory_dev_type *memtype);
 int register_mt_adistance_algorithm(struct notifier_block *nb);
 int unregister_mt_adistance_algorithm(struct notifier_block *nb);
 int mt_calc_adistance(int node, int *adist);
+int mt_set_default_dram_perf(int nid, struct node_hmem_attrs *perf,
+			     const char *source);
+int mt_perf_to_adistance(struct node_hmem_attrs *perf, int *adist);
 #ifdef CONFIG_MIGRATION
 int next_demotion_node(int node);
 void node_get_allowed_targets(pg_data_t *pgdat, nodemask_t *targets);
@@ -64,6 +70,7 @@ static inline bool node_is_toptier(int node)
 #else
 
 #define numa_demotion_enabled	false
+#define default_dram_type	NULL
 /*
  * CONFIG_NUMA implementation returns non NULL error.
  */
@@ -116,5 +123,16 @@ static inline int mt_calc_adistance(int node, int *adist)
 {
 	return NOTIFY_DONE;
 }
+
+static inline int mt_set_default_dram_perf(int nid, struct node_hmem_attrs *perf,
+					   const char *source)
+{
+	return -EIO;
+}
+
+static inline int mt_perf_to_adistance(struct node_hmem_attrs *perf, int *adist)
+{
+	return -EIO;
+}
 #endif	/* CONFIG_NUMA */
 #endif  /* _LINUX_MEMORY_TIERS_H */
diff --git a/mm/memory-tiers.c b/mm/memory-tiers.c
index 4301e7e89223..085321c77123 100644
--- a/mm/memory-tiers.c
+++ b/mm/memory-tiers.c
@@ -37,7 +37,7 @@ struct node_memory_type_map {
 static DEFINE_MUTEX(memory_tier_lock);
 static LIST_HEAD(memory_tiers);
 static struct node_memory_type_map node_memory_types[MAX_NUMNODES];
-static struct memory_dev_type *default_dram_type;
+struct memory_dev_type *default_dram_type;
 
 static struct bus_type memory_tier_subsys = {
 	.name = "memory_tiering",
@@ -108,6 +108,11 @@ static struct demotion_nodes *node_demotion __read_mostly;
 
 static BLOCKING_NOTIFIER_HEAD(mt_adistance_algorithms);
 
+static bool default_dram_perf_error;
+static struct node_hmem_attrs default_dram_perf;
+static int default_dram_perf_ref_nid = NUMA_NO_NODE;
+static const char *default_dram_perf_ref_source;
+
 static inline struct memory_tier *to_memory_tier(struct device *device)
 {
 	return container_of(device, struct memory_tier, dev);
@@ -595,6 +600,102 @@ void clear_node_memory_type(int node, struct memory_dev_type *memtype)
 }
 EXPORT_SYMBOL_GPL(clear_node_memory_type);
 
+static void dump_hmem_attrs(struct node_hmem_attrs *attrs, const char *prefix)
+{
+	pr_info(
+"%sread_latency: %u, write_latency: %u, read_bandwidth: %u, write_bandwidth: %u\n",
+		prefix, attrs->read_latency, attrs->write_latency,
+		attrs->read_bandwidth, attrs->write_bandwidth);
+}
+
+int mt_set_default_dram_perf(int nid, struct node_hmem_attrs *perf,
+			     const char *source)
+{
+	int rc = 0;
+
+	mutex_lock(&memory_tier_lock);
+	if (default_dram_perf_error) {
+		rc = -EIO;
+		goto out;
+	}
+
+	if (perf->read_latency + perf->write_latency == 0 ||
+	    perf->read_bandwidth + perf->write_bandwidth == 0) {
+		rc = -EINVAL;
+		goto out;
+	}
+
+	if (default_dram_perf_ref_nid == NUMA_NO_NODE) {
+		default_dram_perf = *perf;
+		default_dram_perf_ref_nid = nid;
+		default_dram_perf_ref_source = kstrdup(source, GFP_KERNEL);
+		goto out;
+	}
+
+	/*
+	 * The performance of all default DRAM nodes is expected to be
+	 * same (that is, the variation is less than 10%).  And it
+	 * will be used as base to calculate the abstract distance of
+	 * other memory nodes.
+	 */
+	if (abs(perf->read_latency - default_dram_perf.read_latency) * 10 >
+	    default_dram_perf.read_latency ||
+	    abs(perf->write_latency - default_dram_perf.write_latency) * 10 >
+	    default_dram_perf.write_latency ||
+	    abs(perf->read_bandwidth - default_dram_perf.read_bandwidth) * 10 >
+	    default_dram_perf.read_bandwidth ||
+	    abs(perf->write_bandwidth - default_dram_perf.write_bandwidth) * 10 >
+	    default_dram_perf.write_bandwidth) {
+		pr_info(
+"memory-tiers: the performance of DRAM node %d mismatches that of the reference\n"
+"DRAM node %d.\n", nid, default_dram_perf_ref_nid);
+		pr_info("  performance of reference DRAM node %d:\n",
+			default_dram_perf_ref_nid);
+		dump_hmem_attrs(&default_dram_perf, "    ");
+		pr_info("  performance of DRAM node %d:\n", nid);
+		dump_hmem_attrs(perf, "    ");
+		pr_info(
+"  disable default DRAM node performance based abstract distance algorithm.\n");
+		default_dram_perf_error = true;
+		rc = -EINVAL;
+	}
+
+out:
+	mutex_unlock(&memory_tier_lock);
+	return rc;
+}
+
+int mt_perf_to_adistance(struct node_hmem_attrs *perf, int *adist)
+{
+	if (default_dram_perf_error)
+		return -EIO;
+
+	if (default_dram_perf_ref_nid == NUMA_NO_NODE)
+		return -ENOENT;
+
+	if (perf->read_latency + perf->write_latency == 0 ||
+	    perf->read_bandwidth + perf->write_bandwidth == 0)
+		return -EINVAL;
+
+	mutex_lock(&memory_tier_lock);
+	/*
+	 * The abstract distance of a memory node is in direct proportion to
+	 * its memory latency (read + write) and inversely proportional to its
+	 * memory bandwidth (read + write).  The abstract distance, memory
+	 * latency, and memory bandwidth of the default DRAM nodes are used as
+	 * the base.
+	 */
+	*adist = MEMTIER_ADISTANCE_DRAM *
+		(perf->read_latency + perf->write_latency) /
+		(default_dram_perf.read_latency + default_dram_perf.write_latency) *
+		(default_dram_perf.read_bandwidth + default_dram_perf.write_bandwidth) /
+		(perf->read_bandwidth + perf->write_bandwidth);
+	mutex_unlock(&memory_tier_lock);
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(mt_perf_to_adistance);
+
 /**
  * register_mt_adistance_algorithm() - Register memory tiering abstract distance algorithm
  * @nb: The notifier block which describe the algorithm
-- 
cgit v1.2.3


From 6bc2cfdf82d56863b7cf5e86e37a662b2ae5d47e Mon Sep 17 00:00:00 2001
From: Huang Ying <ying.huang@intel.com>
Date: Tue, 26 Sep 2023 14:06:28 +0800
Subject: dax, kmem: calculate abstract distance with general interface

Previously, a fixed abstract distance MEMTIER_DEFAULT_DAX_ADISTANCE is
used for slow memory type in kmem driver.  This limits the usage of kmem
driver, for example, it cannot be used for HBM (high bandwidth memory).

So, we use the general abstract distance calculation mechanism in kmem
drivers to get more accurate abstract distance on systems with proper
support.  The original MEMTIER_DEFAULT_DAX_ADISTANCE is used as fallback
only.

Now, multiple memory types may be managed by kmem.  These memory types are
put into the "kmem_memory_types" list and protected by
kmem_memory_type_lock.

Link: https://lkml.kernel.org/r/20230926060628.265989-5-ying.huang@intel.com
Signed-off-by: "Huang, Ying" <ying.huang@intel.com>
Tested-by: Bharata B Rao <bharata@amd.com>
Reviewed-by: Dave Jiang <dave.jiang@intel.com>
Reviewed-by: Alistair Popple <apopple@nvidia.com>
Cc: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
Cc: Wei Xu <weixugc@google.com>
Cc: Dan Williams <dan.j.williams@intel.com>
Cc: Dave Hansen <dave.hansen@intel.com>
Cc: Davidlohr Bueso <dave@stgolabs.net>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Jonathan Cameron <Jonathan.Cameron@huawei.com>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Yang Shi <shy828301@gmail.com>
Cc: Rafael J Wysocki <rafael.j.wysocki@intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 drivers/dax/kmem.c           | 62 ++++++++++++++++++++++++++++++++++----------
 include/linux/memory-tiers.h |  2 ++
 mm/memory-tiers.c            |  3 ++-
 3 files changed, 53 insertions(+), 14 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/dax/kmem.c b/drivers/dax/kmem.c
index c57acb73e3db..369c698b7706 100644
--- a/drivers/dax/kmem.c
+++ b/drivers/dax/kmem.c
@@ -49,14 +49,52 @@ struct dax_kmem_data {
 	struct resource *res[];
 };
 
-static struct memory_dev_type *dax_slowmem_type;
+static DEFINE_MUTEX(kmem_memory_type_lock);
+static LIST_HEAD(kmem_memory_types);
+
+static struct memory_dev_type *kmem_find_alloc_memory_type(int adist)
+{
+	bool found = false;
+	struct memory_dev_type *mtype;
+
+	mutex_lock(&kmem_memory_type_lock);
+	list_for_each_entry(mtype, &kmem_memory_types, list) {
+		if (mtype->adistance == adist) {
+			found = true;
+			break;
+		}
+	}
+	if (!found) {
+		mtype = alloc_memory_type(adist);
+		if (!IS_ERR(mtype))
+			list_add(&mtype->list, &kmem_memory_types);
+	}
+	mutex_unlock(&kmem_memory_type_lock);
+
+	return mtype;
+}
+
+static void kmem_put_memory_types(void)
+{
+	struct memory_dev_type *mtype, *mtn;
+
+	mutex_lock(&kmem_memory_type_lock);
+	list_for_each_entry_safe(mtype, mtn, &kmem_memory_types, list) {
+		list_del(&mtype->list);
+		put_memory_type(mtype);
+	}
+	mutex_unlock(&kmem_memory_type_lock);
+}
+
 static int dev_dax_kmem_probe(struct dev_dax *dev_dax)
 {
 	struct device *dev = &dev_dax->dev;
 	unsigned long total_len = 0;
 	struct dax_kmem_data *data;
+	struct memory_dev_type *mtype;
 	int i, rc, mapped = 0;
 	int numa_node;
+	int adist = MEMTIER_DEFAULT_DAX_ADISTANCE;
 
 	/*
 	 * Ensure good NUMA information for the persistent memory.
@@ -71,6 +109,11 @@ static int dev_dax_kmem_probe(struct dev_dax *dev_dax)
 		return -EINVAL;
 	}
 
+	mt_calc_adistance(numa_node, &adist);
+	mtype = kmem_find_alloc_memory_type(adist);
+	if (IS_ERR(mtype))
+		return PTR_ERR(mtype);
+
 	for (i = 0; i < dev_dax->nr_range; i++) {
 		struct range range;
 
@@ -88,7 +131,7 @@ static int dev_dax_kmem_probe(struct dev_dax *dev_dax)
 		return -EINVAL;
 	}
 
-	init_node_memory_type(numa_node, dax_slowmem_type);
+	init_node_memory_type(numa_node, mtype);
 
 	rc = -ENOMEM;
 	data = kzalloc(struct_size(data, res, dev_dax->nr_range), GFP_KERNEL);
@@ -167,7 +210,7 @@ err_reg_mgid:
 err_res_name:
 	kfree(data);
 err_dax_kmem_data:
-	clear_node_memory_type(numa_node, dax_slowmem_type);
+	clear_node_memory_type(numa_node, mtype);
 	return rc;
 }
 
@@ -219,7 +262,7 @@ static void dev_dax_kmem_remove(struct dev_dax *dev_dax)
 		 * for that. This implies this reference will be around
 		 * till next reboot.
 		 */
-		clear_node_memory_type(node, dax_slowmem_type);
+		clear_node_memory_type(node, NULL);
 	}
 }
 #else
@@ -251,12 +294,6 @@ static int __init dax_kmem_init(void)
 	if (!kmem_name)
 		return -ENOMEM;
 
-	dax_slowmem_type = alloc_memory_type(MEMTIER_DEFAULT_DAX_ADISTANCE);
-	if (IS_ERR(dax_slowmem_type)) {
-		rc = PTR_ERR(dax_slowmem_type);
-		goto err_dax_slowmem_type;
-	}
-
 	rc = dax_driver_register(&device_dax_kmem_driver);
 	if (rc)
 		goto error_dax_driver;
@@ -264,8 +301,7 @@ static int __init dax_kmem_init(void)
 	return rc;
 
 error_dax_driver:
-	put_memory_type(dax_slowmem_type);
-err_dax_slowmem_type:
+	kmem_put_memory_types();
 	kfree_const(kmem_name);
 	return rc;
 }
@@ -275,7 +311,7 @@ static void __exit dax_kmem_exit(void)
 	dax_driver_unregister(&device_dax_kmem_driver);
 	if (!any_hotremove_failed)
 		kfree_const(kmem_name);
-	put_memory_type(dax_slowmem_type);
+	kmem_put_memory_types();
 }
 
 MODULE_AUTHOR("Intel Corporation");
diff --git a/include/linux/memory-tiers.h b/include/linux/memory-tiers.h
index 59d15c6d3c0d..1e39d27bee41 100644
--- a/include/linux/memory-tiers.h
+++ b/include/linux/memory-tiers.h
@@ -24,6 +24,8 @@ struct memory_tier;
 struct memory_dev_type {
 	/* list of memory types that are part of same tier as this type */
 	struct list_head tier_sibling;
+	/* list of memory types that are managed by one driver */
+	struct list_head list;
 	/* abstract distance for this specific memory type */
 	int adistance;
 	/* Nodes of same abstract distance */
diff --git a/mm/memory-tiers.c b/mm/memory-tiers.c
index 085321c77123..8d5291add2bc 100644
--- a/mm/memory-tiers.c
+++ b/mm/memory-tiers.c
@@ -586,13 +586,14 @@ EXPORT_SYMBOL_GPL(init_node_memory_type);
 void clear_node_memory_type(int node, struct memory_dev_type *memtype)
 {
 	mutex_lock(&memory_tier_lock);
-	if (node_memory_types[node].memtype == memtype)
+	if (node_memory_types[node].memtype == memtype || !memtype)
 		node_memory_types[node].map_count--;
 	/*
 	 * If we umapped all the attached devices to this node,
 	 * clear the node memory type.
 	 */
 	if (!node_memory_types[node].map_count) {
+		memtype = node_memory_types[node].memtype;
 		node_memory_types[node].memtype = NULL;
 		put_memory_type(memtype);
 	}
-- 
cgit v1.2.3


From 6cb42f91aa6dfd10fd847c469caebe63b35141ff Mon Sep 17 00:00:00 2001
From: Yury Norov <yury.norov@gmail.com>
Date: Sun, 24 Sep 2023 19:38:17 -0700
Subject: bitmap: move bitmap_*_region() functions to bitmap.h

Now that bitmap_*_region() functions are implemented as thin wrappers
around others, it's worth to move them to the header, as it opens room
for compile-time optimizations.

CC: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
CC: Rasmus Villemoes <linux@rasmusvillemoes.dk>
CC: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Signed-off-by: Yury Norov <yury.norov@gmail.com>
---
 include/linux/bitmap.h | 64 +++++++++++++++++++++++++++++++++++++++++++++++---
 lib/bitmap.c           | 64 --------------------------------------------------
 2 files changed, 61 insertions(+), 67 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/bitmap.h b/include/linux/bitmap.h
index 1cca950a54ae..99451431e4d6 100644
--- a/include/linux/bitmap.h
+++ b/include/linux/bitmap.h
@@ -6,6 +6,7 @@
 
 #include <linux/align.h>
 #include <linux/bitops.h>
+#include <linux/errno.h>
 #include <linux/find.h>
 #include <linux/limits.h>
 #include <linux/string.h>
@@ -209,9 +210,6 @@ void bitmap_onto(unsigned long *dst, const unsigned long *orig,
 		const unsigned long *relmap, unsigned int bits);
 void bitmap_fold(unsigned long *dst, const unsigned long *orig,
 		unsigned int sz, unsigned int nbits);
-int bitmap_find_free_region(unsigned long *bitmap, unsigned int bits, int order);
-void bitmap_release_region(unsigned long *bitmap, unsigned int pos, int order);
-int bitmap_allocate_region(unsigned long *bitmap, unsigned int pos, int order);
 
 #define BITMAP_FIRST_WORD_MASK(start) (~0UL << ((start) & (BITS_PER_LONG - 1)))
 #define BITMAP_LAST_WORD_MASK(nbits) (~0UL >> (-(nbits) & (BITS_PER_LONG - 1)))
@@ -497,6 +495,66 @@ static inline void bitmap_next_set_region(unsigned long *bitmap,
 	*re = find_next_zero_bit(bitmap, end, *rs + 1);
 }
 
+/**
+ * bitmap_release_region - release allocated bitmap region
+ *	@bitmap: array of unsigned longs corresponding to the bitmap
+ *	@pos: beginning of bit region to release
+ *	@order: region size (log base 2 of number of bits) to release
+ *
+ * This is the complement to __bitmap_find_free_region() and releases
+ * the found region (by clearing it in the bitmap).
+ */
+static inline void bitmap_release_region(unsigned long *bitmap, unsigned int pos, int order)
+{
+	bitmap_clear(bitmap, pos, BIT(order));
+}
+
+/**
+ * bitmap_allocate_region - allocate bitmap region
+ *	@bitmap: array of unsigned longs corresponding to the bitmap
+ *	@pos: beginning of bit region to allocate
+ *	@order: region size (log base 2 of number of bits) to allocate
+ *
+ * Allocate (set bits in) a specified region of a bitmap.
+ *
+ * Returns: 0 on success, or %-EBUSY if specified region wasn't
+ * free (not all bits were zero).
+ */
+static inline int bitmap_allocate_region(unsigned long *bitmap, unsigned int pos, int order)
+{
+	unsigned int len = BIT(order);
+
+	if (find_next_bit(bitmap, pos + len, pos) < pos + len)
+		return -EBUSY;
+	bitmap_set(bitmap, pos, len);
+	return 0;
+}
+
+/**
+ * bitmap_find_free_region - find a contiguous aligned mem region
+ *	@bitmap: array of unsigned longs corresponding to the bitmap
+ *	@bits: number of bits in the bitmap
+ *	@order: region size (log base 2 of number of bits) to find
+ *
+ * Find a region of free (zero) bits in a @bitmap of @bits bits and
+ * allocate them (set them to one).  Only consider regions of length
+ * a power (@order) of two, aligned to that power of two, which
+ * makes the search algorithm much faster.
+ *
+ * Returns: the bit offset in bitmap of the allocated region,
+ * or -errno on failure.
+ */
+static inline int bitmap_find_free_region(unsigned long *bitmap, unsigned int bits, int order)
+{
+	unsigned int pos, end;		/* scans bitmap by regions of size order */
+
+	for (pos = 0; (end = pos + BIT(order)) <= bits; pos = end) {
+		if (!bitmap_allocate_region(bitmap, pos, order))
+			return pos;
+	}
+	return -ENOMEM;
+}
+
 /**
  * BITMAP_FROM_U64() - Represent u64 value in the format suitable for bitmap.
  * @n: u64 value
diff --git a/lib/bitmap.c b/lib/bitmap.c
index 66da8c33fb1b..09522af227f1 100644
--- a/lib/bitmap.c
+++ b/lib/bitmap.c
@@ -8,7 +8,6 @@
 #include <linux/bitops.h>
 #include <linux/ctype.h>
 #include <linux/device.h>
-#include <linux/errno.h>
 #include <linux/export.h>
 #include <linux/slab.h>
 
@@ -708,69 +707,6 @@ void bitmap_fold(unsigned long *dst, const unsigned long *orig,
 }
 #endif /* CONFIG_NUMA */
 
-/**
- * bitmap_find_free_region - find a contiguous aligned mem region
- *	@bitmap: array of unsigned longs corresponding to the bitmap
- *	@bits: number of bits in the bitmap
- *	@order: region size (log base 2 of number of bits) to find
- *
- * Find a region of free (zero) bits in a @bitmap of @bits bits and
- * allocate them (set them to one).  Only consider regions of length
- * a power (@order) of two, aligned to that power of two, which
- * makes the search algorithm much faster.
- *
- * Return: the bit offset in bitmap of the allocated region,
- * or -errno on failure.
- */
-int bitmap_find_free_region(unsigned long *bitmap, unsigned int bits, int order)
-{
-	unsigned int pos, end;		/* scans bitmap by regions of size order */
-
-	for (pos = 0; (end = pos + BIT(order)) <= bits; pos = end) {
-		if (!bitmap_allocate_region(bitmap, pos, order))
-			return pos;
-	}
-	return -ENOMEM;
-}
-EXPORT_SYMBOL(bitmap_find_free_region);
-
-/**
- * bitmap_release_region - release allocated bitmap region
- *	@bitmap: array of unsigned longs corresponding to the bitmap
- *	@pos: beginning of bit region to release
- *	@order: region size (log base 2 of number of bits) to release
- *
- * This is the complement to __bitmap_find_free_region() and releases
- * the found region (by clearing it in the bitmap).
- */
-void bitmap_release_region(unsigned long *bitmap, unsigned int pos, int order)
-{
-	bitmap_clear(bitmap, pos, BIT(order));
-}
-EXPORT_SYMBOL(bitmap_release_region);
-
-/**
- * bitmap_allocate_region - allocate bitmap region
- *	@bitmap: array of unsigned longs corresponding to the bitmap
- *	@pos: beginning of bit region to allocate
- *	@order: region size (log base 2 of number of bits) to allocate
- *
- * Allocate (set bits in) a specified region of a bitmap.
- *
- * Return: 0 on success, or %-EBUSY if specified region wasn't
- * free (not all bits were zero).
- */
-int bitmap_allocate_region(unsigned long *bitmap, unsigned int pos, int order)
-{
-	unsigned int len = BIT(order);
-
-	if (find_next_bit(bitmap, pos + len, pos) < pos + len)
-		return -EBUSY;
-	bitmap_set(bitmap, pos, len);
-	return 0;
-}
-EXPORT_SYMBOL(bitmap_allocate_region);
-
 unsigned long *bitmap_alloc(unsigned int nbits, gfp_t flags)
 {
 	return kmalloc_array(BITS_TO_LONGS(nbits), sizeof(unsigned long),
-- 
cgit v1.2.3


From 90de47f020db086f7929e09f64efd0cf627d6869 Mon Sep 17 00:00:00 2001
From: Yunsheng Lin <linyunsheng@huawei.com>
Date: Fri, 13 Oct 2023 14:48:21 +0800
Subject: page_pool: fragment API support for 32-bit arch with 64-bit DMA

Currently page_pool_alloc_frag() is not supported in 32-bit
arch with 64-bit DMA because of the overlap issue between
pp_frag_count and dma_addr_upper in 'struct page' for those
arches, which seems to be quite common, see [1], which means
driver may need to handle it when using fragment API.

It is assumed that the combination of the above arch with an
address space >16TB does not exist, as all those arches have
64b equivalent, it seems logical to use the 64b version for a
system with a large address space. It is also assumed that dma
address is page aligned when we are dma mapping a page aligned
buffer, see [2].

That means we're storing 12 bits of 0 at the lower end for a
dma address, we can reuse those bits for the above arches to
support 32b+12b, which is 16TB of memory.

If we make a wrong assumption, a warning is emitted so that
user can report to us.

1. https://lore.kernel.org/all/20211117075652.58299-1-linyunsheng@huawei.com/
2. https://lore.kernel.org/all/20230818145145.4b357c89@kernel.org/

Tested-by: Alexander Lobakin <aleksander.lobakin@intel.com>
Signed-off-by: Yunsheng Lin <linyunsheng@huawei.com>
CC: Lorenzo Bianconi <lorenzo@kernel.org>
CC: Alexander Duyck <alexander.duyck@gmail.com>
CC: Liang Chen <liangchen.linux@gmail.com>
CC: Guillaume Tucker <guillaume.tucker@collabora.com>
CC: Matthew Wilcox <willy@infradead.org>
CC: Linux-MM <linux-mm@kvack.org>
Link: https://lore.kernel.org/r/20231013064827.61135-2-linyunsheng@huawei.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/linux/mm_types.h        | 13 +------------
 include/net/page_pool/helpers.h | 20 ++++++++++++++------
 net/core/page_pool.c            | 14 +++++++++-----
 3 files changed, 24 insertions(+), 23 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 36c5b43999e6..74b49c4c7a52 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -125,18 +125,7 @@ struct page {
 			struct page_pool *pp;
 			unsigned long _pp_mapping_pad;
 			unsigned long dma_addr;
-			union {
-				/**
-				 * dma_addr_upper: might require a 64-bit
-				 * value on 32-bit architectures.
-				 */
-				unsigned long dma_addr_upper;
-				/**
-				 * For frag page support, not supported in
-				 * 32-bit architectures with 64-bit DMA.
-				 */
-				atomic_long_t pp_frag_count;
-			};
+			atomic_long_t pp_frag_count;
 		};
 		struct {	/* Tail pages of compound page */
 			unsigned long compound_head;	/* Bit zero is set */
diff --git a/include/net/page_pool/helpers.h b/include/net/page_pool/helpers.h
index 8e7751464ff5..8f64adf86f5b 100644
--- a/include/net/page_pool/helpers.h
+++ b/include/net/page_pool/helpers.h
@@ -197,7 +197,7 @@ static inline void page_pool_recycle_direct(struct page_pool *pool,
 	page_pool_put_full_page(pool, page, true);
 }
 
-#define PAGE_POOL_DMA_USE_PP_FRAG_COUNT	\
+#define PAGE_POOL_32BIT_ARCH_WITH_64BIT_DMA	\
 		(sizeof(dma_addr_t) > sizeof(unsigned long))
 
 /**
@@ -211,17 +211,25 @@ static inline dma_addr_t page_pool_get_dma_addr(struct page *page)
 {
 	dma_addr_t ret = page->dma_addr;
 
-	if (PAGE_POOL_DMA_USE_PP_FRAG_COUNT)
-		ret |= (dma_addr_t)page->dma_addr_upper << 16 << 16;
+	if (PAGE_POOL_32BIT_ARCH_WITH_64BIT_DMA)
+		ret <<= PAGE_SHIFT;
 
 	return ret;
 }
 
-static inline void page_pool_set_dma_addr(struct page *page, dma_addr_t addr)
+static inline bool page_pool_set_dma_addr(struct page *page, dma_addr_t addr)
 {
+	if (PAGE_POOL_32BIT_ARCH_WITH_64BIT_DMA) {
+		page->dma_addr = addr >> PAGE_SHIFT;
+
+		/* We assume page alignment to shave off bottom bits,
+		 * if this "compression" doesn't work we need to drop.
+		 */
+		return addr != (dma_addr_t)page->dma_addr << PAGE_SHIFT;
+	}
+
 	page->dma_addr = addr;
-	if (PAGE_POOL_DMA_USE_PP_FRAG_COUNT)
-		page->dma_addr_upper = upper_32_bits(addr);
+	return false;
 }
 
 static inline bool page_pool_put(struct page_pool *pool)
diff --git a/net/core/page_pool.c b/net/core/page_pool.c
index 77cb75e63aca..8a9868ea5067 100644
--- a/net/core/page_pool.c
+++ b/net/core/page_pool.c
@@ -211,10 +211,6 @@ static int page_pool_init(struct page_pool *pool,
 		 */
 	}
 
-	if (PAGE_POOL_DMA_USE_PP_FRAG_COUNT &&
-	    pool->p.flags & PP_FLAG_PAGE_FRAG)
-		return -EINVAL;
-
 #ifdef CONFIG_PAGE_POOL_STATS
 	pool->recycle_stats = alloc_percpu(struct page_pool_recycle_stats);
 	if (!pool->recycle_stats)
@@ -359,12 +355,20 @@ static bool page_pool_dma_map(struct page_pool *pool, struct page *page)
 	if (dma_mapping_error(pool->p.dev, dma))
 		return false;
 
-	page_pool_set_dma_addr(page, dma);
+	if (page_pool_set_dma_addr(page, dma))
+		goto unmap_failed;
 
 	if (pool->p.flags & PP_FLAG_DMA_SYNC_DEV)
 		page_pool_dma_sync_for_device(pool, page, pool->p.max_len);
 
 	return true;
+
+unmap_failed:
+	WARN_ON_ONCE("unexpected DMA address, please report to netdev@");
+	dma_unmap_page_attrs(pool->p.dev, dma,
+			     PAGE_SIZE << pool->p.order, pool->p.dma_dir,
+			     DMA_ATTR_SKIP_CPU_SYNC | DMA_ATTR_WEAK_ORDERING);
+	return false;
 }
 
 static void page_pool_set_pp_info(struct page_pool *pool,
-- 
cgit v1.2.3


From acfc788233263fee9413434d39d4201a8de592ba Mon Sep 17 00:00:00 2001
From: Arnd Bergmann <arnd@arndb.de>
Date: Mon, 9 Oct 2023 23:18:41 +0200
Subject: vgacon: remove screen_info dependency

The vga console driver is fairly self-contained, and only used by
architectures that explicitly initialize the screen_info settings.

Chance every instance that picks the vga console by setting conswitchp
to call a function instead, and pass a reference to the screen_info
there.

Reviewed-by: Javier Martinez Canillas <javierm@redhat.com>
Acked-by: Khalid Azzi <khalid@gonehiking.org>
Acked-by: Helge Deller <deller@gmx.de>
Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Link: https://lore.kernel.org/r/20231009211845.3136536-6-arnd@kernel.org
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 arch/alpha/kernel/setup.c      |  2 +-
 arch/arm/kernel/setup.c        |  2 +-
 arch/ia64/kernel/setup.c       |  2 +-
 arch/mips/kernel/setup.c       |  2 +-
 arch/x86/kernel/setup.c        |  2 +-
 drivers/firmware/pcdp.c        |  2 +-
 drivers/video/console/vgacon.c | 68 +++++++++++++++++++++++++-----------------
 include/linux/console.h        |  7 +++++
 8 files changed, 53 insertions(+), 34 deletions(-)

(limited to 'include/linux')

diff --git a/arch/alpha/kernel/setup.c b/arch/alpha/kernel/setup.c
index 85a679ce061c..c00493346860 100644
--- a/arch/alpha/kernel/setup.c
+++ b/arch/alpha/kernel/setup.c
@@ -654,7 +654,7 @@ setup_arch(char **cmdline_p)
 
 #ifdef CONFIG_VT
 #if defined(CONFIG_VGA_CONSOLE)
-	conswitchp = &vga_con;
+	vgacon_register_screen(&screen_info);
 #endif
 #endif
 
diff --git a/arch/arm/kernel/setup.c b/arch/arm/kernel/setup.c
index 40326a35a179..5d8a7fb3eba4 100644
--- a/arch/arm/kernel/setup.c
+++ b/arch/arm/kernel/setup.c
@@ -1192,7 +1192,7 @@ void __init setup_arch(char **cmdline_p)
 
 #ifdef CONFIG_VT
 #if defined(CONFIG_VGA_CONSOLE)
-	conswitchp = &vga_con;
+	vgacon_register_screen(&screen_info);
 #endif
 #endif
 
diff --git a/arch/ia64/kernel/setup.c b/arch/ia64/kernel/setup.c
index d2c66efdde56..2c9283fcd375 100644
--- a/arch/ia64/kernel/setup.c
+++ b/arch/ia64/kernel/setup.c
@@ -619,7 +619,7 @@ setup_arch (char **cmdline_p)
 		 * memory so we can avoid this problem.
 		 */
 		if (efi_mem_type(0xA0000) != EFI_CONVENTIONAL_MEMORY)
-			conswitchp = &vga_con;
+			vgacon_register_screen(&screen_info);
 # endif
 	}
 #endif
diff --git a/arch/mips/kernel/setup.c b/arch/mips/kernel/setup.c
index 1aba7dc95132..6c3fae62a9f6 100644
--- a/arch/mips/kernel/setup.c
+++ b/arch/mips/kernel/setup.c
@@ -794,7 +794,7 @@ void __init setup_arch(char **cmdline_p)
 
 #if defined(CONFIG_VT)
 #if defined(CONFIG_VGA_CONSOLE)
-	conswitchp = &vga_con;
+	vgacon_register_screen(&screen_info);
 #endif
 #endif
 
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index b098b1fa2470..d626e13c921a 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -1290,7 +1290,7 @@ void __init setup_arch(char **cmdline_p)
 #ifdef CONFIG_VT
 #if defined(CONFIG_VGA_CONSOLE)
 	if (!efi_enabled(EFI_BOOT) || (efi_mem_type(0xa0000) != EFI_CONVENTIONAL_MEMORY))
-		conswitchp = &vga_con;
+		vgacon_register_screen(&screen_info);
 #endif
 #endif
 	x86_init.oem.banner();
diff --git a/drivers/firmware/pcdp.c b/drivers/firmware/pcdp.c
index 715a45442d1c..667a595373b2 100644
--- a/drivers/firmware/pcdp.c
+++ b/drivers/firmware/pcdp.c
@@ -72,7 +72,7 @@ setup_vga_console(struct pcdp_device *dev)
 		return -ENODEV;
 	}
 
-	conswitchp = &vga_con;
+	vgacon_register_screen(&screen_info);
 	printk(KERN_INFO "PCDP: VGA console\n");
 	return 0;
 #else
diff --git a/drivers/video/console/vgacon.c b/drivers/video/console/vgacon.c
index 7ad047bcae17..8ef1579fa57f 100644
--- a/drivers/video/console/vgacon.c
+++ b/drivers/video/console/vgacon.c
@@ -89,6 +89,8 @@ static int 		vga_video_font_height;
 static int 		vga_scan_lines		__read_mostly;
 static unsigned int 	vga_rolled_over; /* last vc_origin offset before wrap */
 
+static struct screen_info *vga_si;
+
 static bool vga_hardscroll_enabled;
 static bool vga_hardscroll_user_enable = true;
 
@@ -153,8 +155,9 @@ static const char *vgacon_startup(void)
 	u16 saved1, saved2;
 	volatile u16 *p;
 
-	if (screen_info.orig_video_isVGA == VIDEO_TYPE_VLFB ||
-	    screen_info.orig_video_isVGA == VIDEO_TYPE_EFI) {
+	if (!vga_si ||
+	    vga_si->orig_video_isVGA == VIDEO_TYPE_VLFB ||
+	    vga_si->orig_video_isVGA == VIDEO_TYPE_EFI) {
 	      no_vga:
 #ifdef CONFIG_DUMMY_CONSOLE
 		conswitchp = &dummy_con;
@@ -164,29 +167,29 @@ static const char *vgacon_startup(void)
 #endif
 	}
 
-	/* boot_params.screen_info reasonably initialized? */
-	if ((screen_info.orig_video_lines == 0) ||
-	    (screen_info.orig_video_cols  == 0))
+	/* vga_si reasonably initialized? */
+	if ((vga_si->orig_video_lines == 0) ||
+	    (vga_si->orig_video_cols  == 0))
 		goto no_vga;
 
 	/* VGA16 modes are not handled by VGACON */
-	if ((screen_info.orig_video_mode == 0x0D) ||	/* 320x200/4 */
-	    (screen_info.orig_video_mode == 0x0E) ||	/* 640x200/4 */
-	    (screen_info.orig_video_mode == 0x10) ||	/* 640x350/4 */
-	    (screen_info.orig_video_mode == 0x12) ||	/* 640x480/4 */
-	    (screen_info.orig_video_mode == 0x6A))	/* 800x600/4 (VESA) */
+	if ((vga_si->orig_video_mode == 0x0D) ||	/* 320x200/4 */
+	    (vga_si->orig_video_mode == 0x0E) ||	/* 640x200/4 */
+	    (vga_si->orig_video_mode == 0x10) ||	/* 640x350/4 */
+	    (vga_si->orig_video_mode == 0x12) ||	/* 640x480/4 */
+	    (vga_si->orig_video_mode == 0x6A))	/* 800x600/4 (VESA) */
 		goto no_vga;
 
-	vga_video_num_lines = screen_info.orig_video_lines;
-	vga_video_num_columns = screen_info.orig_video_cols;
+	vga_video_num_lines = vga_si->orig_video_lines;
+	vga_video_num_columns = vga_si->orig_video_cols;
 	vgastate.vgabase = NULL;
 
-	if (screen_info.orig_video_mode == 7) {
+	if (vga_si->orig_video_mode == 7) {
 		/* Monochrome display */
 		vga_vram_base = 0xb0000;
 		vga_video_port_reg = VGA_CRT_IM;
 		vga_video_port_val = VGA_CRT_DM;
-		if ((screen_info.orig_video_ega_bx & 0xff) != 0x10) {
+		if ((vga_si->orig_video_ega_bx & 0xff) != 0x10) {
 			static struct resource ega_console_resource =
 			    { .name	= "ega",
 			      .flags	= IORESOURCE_IO,
@@ -223,12 +226,12 @@ static const char *vgacon_startup(void)
 		vga_vram_base = 0xb8000;
 		vga_video_port_reg = VGA_CRT_IC;
 		vga_video_port_val = VGA_CRT_DC;
-		if ((screen_info.orig_video_ega_bx & 0xff) != 0x10) {
+		if ((vga_si->orig_video_ega_bx & 0xff) != 0x10) {
 			int i;
 
 			vga_vram_size = 0x8000;
 
-			if (!screen_info.orig_video_isVGA) {
+			if (!vga_si->orig_video_isVGA) {
 				static struct resource ega_console_resource =
 				    { .name	= "ega",
 				      .flags	= IORESOURCE_IO,
@@ -319,14 +322,14 @@ static const char *vgacon_startup(void)
 	    || vga_video_type == VIDEO_TYPE_VGAC
 	    || vga_video_type == VIDEO_TYPE_EGAM) {
 		vga_hardscroll_enabled = vga_hardscroll_user_enable;
-		vga_default_font_height = screen_info.orig_video_points;
-		vga_video_font_height = screen_info.orig_video_points;
+		vga_default_font_height = vga_si->orig_video_points;
+		vga_video_font_height = vga_si->orig_video_points;
 		/* This may be suboptimal but is a safe bet - go with it */
 		vga_scan_lines =
 		    vga_video_font_height * vga_video_num_lines;
 	}
 
-	vgacon_xres = screen_info.orig_video_cols * VGA_FONTWIDTH;
+	vgacon_xres = vga_si->orig_video_cols * VGA_FONTWIDTH;
 	vgacon_yres = vga_scan_lines;
 
 	return display_desc;
@@ -371,7 +374,7 @@ static void vgacon_init(struct vc_data *c, int init)
 	/* Only set the default if the user didn't deliberately override it */
 	if (global_cursor_default == -1)
 		global_cursor_default =
-			!(screen_info.flags & VIDEO_FLAGS_NOCURSOR);
+			!(vga_si->flags & VIDEO_FLAGS_NOCURSOR);
 }
 
 static void vgacon_deinit(struct vc_data *c)
@@ -589,7 +592,7 @@ static int vgacon_switch(struct vc_data *c)
 {
 	int x = c->vc_cols * VGA_FONTWIDTH;
 	int y = c->vc_rows * c->vc_cell_height;
-	int rows = screen_info.orig_video_lines * vga_default_font_height/
+	int rows = vga_si->orig_video_lines * vga_default_font_height/
 		c->vc_cell_height;
 	/*
 	 * We need to save screen size here as it's the only way
@@ -609,7 +612,7 @@ static int vgacon_switch(struct vc_data *c)
 
 		if ((vgacon_xres != x || vgacon_yres != y) &&
 		    (!(vga_video_num_columns % 2) &&
-		     vga_video_num_columns <= screen_info.orig_video_cols &&
+		     vga_video_num_columns <= vga_si->orig_video_cols &&
 		     vga_video_num_lines <= rows))
 			vgacon_doresize(c, c->vc_cols, c->vc_rows);
 	}
@@ -1056,13 +1059,13 @@ static int vgacon_resize(struct vc_data *c, unsigned int width,
 		 * Ho ho!  Someone (svgatextmode, eh?) may have reprogrammed
 		 * the video mode!  Set the new defaults then and go away.
 		 */
-		screen_info.orig_video_cols = width;
-		screen_info.orig_video_lines = height;
+		vga_si->orig_video_cols = width;
+		vga_si->orig_video_lines = height;
 		vga_default_font_height = c->vc_cell_height;
 		return 0;
 	}
-	if (width % 2 || width > screen_info.orig_video_cols ||
-	    height > (screen_info.orig_video_lines * vga_default_font_height)/
+	if (width % 2 || width > vga_si->orig_video_cols ||
+	    height > (vga_si->orig_video_lines * vga_default_font_height)/
 	    c->vc_cell_height)
 		return -EINVAL;
 
@@ -1092,8 +1095,8 @@ static void vgacon_save_screen(struct vc_data *c)
 		 * console initialization routines.
 		 */
 		vga_bootup_console = 1;
-		c->state.x = screen_info.orig_x;
-		c->state.y = screen_info.orig_y;
+		c->state.x = vga_si->orig_x;
+		c->state.y = vga_si->orig_y;
 	}
 
 	/* We can't copy in more than the size of the video buffer,
@@ -1186,4 +1189,13 @@ const struct consw vga_con = {
 };
 EXPORT_SYMBOL(vga_con);
 
+void vgacon_register_screen(struct screen_info *si)
+{
+	if (!si || vga_si)
+		return;
+
+	conswitchp = &vga_con;
+	vga_si = si;
+}
+
 MODULE_LICENSE("GPL");
diff --git a/include/linux/console.h b/include/linux/console.h
index 7de11c763eb3..5ff6f11c47b1 100644
--- a/include/linux/console.h
+++ b/include/linux/console.h
@@ -101,6 +101,13 @@ extern const struct consw dummy_con;	/* dummy console buffer */
 extern const struct consw vga_con;	/* VGA text console */
 extern const struct consw newport_con;	/* SGI Newport console  */
 
+struct screen_info;
+#ifdef CONFIG_VGA_CONSOLE
+void vgacon_register_screen(struct screen_info *si);
+#else
+static inline void vgacon_register_screen(struct screen_info *si) { }
+#endif
+
 int con_is_bound(const struct consw *csw);
 int do_unregister_con_driver(const struct consw *csw);
 int do_take_over_console(const struct consw *sw, int first, int last, int deflt);
-- 
cgit v1.2.3


From 545a4f89cad5bd349522d17558b3a4208648e20e Mon Sep 17 00:00:00 2001
From: Tony Lindgren <tony@atomide.com>
Date: Thu, 12 Oct 2023 09:42:56 +0300
Subject: printk: Check valid console index for preferred console

Let's check for valid console index values for preferred console to avoid
bogus console index numbers from kernel command line.

Let's also return an error for negative index numbers for the preferred
console. Unlike for device drivers, a negative index is not valid for the
preferred console.

Let's also constify idx while at it.

Signed-off-by: Tony Lindgren <tony@atomide.com>
Link: https://lore.kernel.org/r/20231012064300.50221-1-tony@atomide.com
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 include/linux/console.h |  2 +-
 kernel/printk/printk.c  | 12 ++++++++++--
 2 files changed, 11 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/console.h b/include/linux/console.h
index 5ff6f11c47b1..1be13c9695e0 100644
--- a/include/linux/console.h
+++ b/include/linux/console.h
@@ -347,7 +347,7 @@ enum con_flush_mode {
 	CONSOLE_REPLAY_ALL,
 };
 
-extern int add_preferred_console(char *name, int idx, char *options);
+extern int add_preferred_console(char *name, const short idx, char *options);
 extern void console_force_preferred_locked(struct console *con);
 extern void register_console(struct console *);
 extern int unregister_console(struct console *);
diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
index 0b3af1529778..b16c0bab88c6 100644
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -2404,12 +2404,20 @@ static void set_user_specified(struct console_cmdline *c, bool user_specified)
 	console_set_on_cmdline = 1;
 }
 
-static int __add_preferred_console(char *name, int idx, char *options,
+static int __add_preferred_console(char *name, const short idx, char *options,
 				   char *brl_options, bool user_specified)
 {
 	struct console_cmdline *c;
 	int i;
 
+	/*
+	 * We use a signed short index for struct console for device drivers to
+	 * indicate a not yet assigned index or port. However, a negative index
+	 * value is not valid for preferred console.
+	 */
+	if (idx < 0)
+		return -EINVAL;
+
 	/*
 	 *	See if this tty is not yet registered, and
 	 *	if we have a slot free.
@@ -2513,7 +2521,7 @@ __setup("console=", console_setup);
  * commonly to provide a default console (ie from PROM variables) when
  * the user has not supplied one.
  */
-int add_preferred_console(char *name, int idx, char *options)
+int add_preferred_console(char *name, const short idx, char *options)
 {
 	return __add_preferred_console(name, idx, options, NULL, false);
 }
-- 
cgit v1.2.3


From 1e3c8526918403a4cebbd67bcd18443bf68df939 Mon Sep 17 00:00:00 2001
From: Tony Lindgren <tony@atomide.com>
Date: Thu, 12 Oct 2023 09:42:57 +0300
Subject: printk: Constify name for add_preferred_console()

While adding a preferred console handling for serial_core for serial port
hardware based device addressing, Jiri suggested we constify name for
add_preferred_console(). The name gets copied anyways. This allows serial
core to add a preferred console using serial drv->dev_name without copying
it.

Note that constifying options causes changes all over the place because of
struct console for match().

Suggested-by: Jiri Slaby <jirislaby@kernel.org>
Reviewed-by: Petr Mladek <pmladek@suse.com>
Signed-off-by: Tony Lindgren <tony@atomide.com>
Link: https://lore.kernel.org/r/20231012064300.50221-2-tony@atomide.com
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 include/linux/console.h | 2 +-
 kernel/printk/printk.c  | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/console.h b/include/linux/console.h
index 1be13c9695e0..eddf4c5279b5 100644
--- a/include/linux/console.h
+++ b/include/linux/console.h
@@ -347,7 +347,7 @@ enum con_flush_mode {
 	CONSOLE_REPLAY_ALL,
 };
 
-extern int add_preferred_console(char *name, const short idx, char *options);
+extern int add_preferred_console(const char *name, const short idx, char *options);
 extern void console_force_preferred_locked(struct console *con);
 extern void register_console(struct console *);
 extern int unregister_console(struct console *);
diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
index b16c0bab88c6..122cb6e83f42 100644
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -2404,7 +2404,7 @@ static void set_user_specified(struct console_cmdline *c, bool user_specified)
 	console_set_on_cmdline = 1;
 }
 
-static int __add_preferred_console(char *name, const short idx, char *options,
+static int __add_preferred_console(const char *name, const short idx, char *options,
 				   char *brl_options, bool user_specified)
 {
 	struct console_cmdline *c;
@@ -2521,7 +2521,7 @@ __setup("console=", console_setup);
  * commonly to provide a default console (ie from PROM variables) when
  * the user has not supplied one.
  */
-int add_preferred_console(char *name, const short idx, char *options)
+int add_preferred_console(const char *name, const short idx, char *options)
 {
 	return __add_preferred_console(name, idx, options, NULL, false);
 }
-- 
cgit v1.2.3


From c5b5831f3c168a1bf1172159c4956e721a0064cf Mon Sep 17 00:00:00 2001
From: Fabien Parent <fparent@baylibre.com>
Date: Mon, 18 Sep 2023 11:37:52 +0200
Subject: pmdomain: mediatek: Add support for MT8365

Add the needed board data to support MT8365 SoC.

Signed-off-by: Fabien Parent <fparent@baylibre.com>
Signed-off-by: Markus Schneider-Pargmann <msp@baylibre.com>
Reviewed-by: AngeloGioacchino Del Regno <angelogioacchino.delregno@collabora.com>
Reviewed-by: Alexandre Mergnat <amergnat@baylibre.com>
Tested-by: Alexandre Mergnat <amergnat@baylibre.com>
Link: https://lore.kernel.org/r/20230918093751.1188668-9-msp@baylibre.com
Signed-off-by: Ulf Hansson <ulf.hansson@linaro.org>
---
 drivers/pmdomain/mediatek/mt8365-pm-domains.h | 197 ++++++++++++++++++++++++++
 drivers/pmdomain/mediatek/mtk-pm-domains.c    |   5 +
 include/linux/soc/mediatek/infracfg.h         |  41 ++++++
 3 files changed, 243 insertions(+)
 create mode 100644 drivers/pmdomain/mediatek/mt8365-pm-domains.h

(limited to 'include/linux')

diff --git a/drivers/pmdomain/mediatek/mt8365-pm-domains.h b/drivers/pmdomain/mediatek/mt8365-pm-domains.h
new file mode 100644
index 000000000000..3d83d49eaa7c
--- /dev/null
+++ b/drivers/pmdomain/mediatek/mt8365-pm-domains.h
@@ -0,0 +1,197 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+
+#ifndef __SOC_MEDIATEK_MT8365_PM_DOMAINS_H
+#define __SOC_MEDIATEK_MT8365_PM_DOMAINS_H
+
+#include "mtk-pm-domains.h"
+#include <dt-bindings/power/mediatek,mt8365-power.h>
+
+/*
+ * MT8365 power domain support
+ */
+
+#define MT8365_BUS_PROT_INFRA_WR_TOPAXI(_mask)				\
+		BUS_PROT_WR(INFRA, _mask,				\
+			    MT8365_INFRA_TOPAXI_PROTECTEN_SET,		\
+			    MT8365_INFRA_TOPAXI_PROTECTEN_CLR,		\
+			    MT8365_INFRA_TOPAXI_PROTECTEN_STA1)
+
+#define MT8365_BUS_PROT_INFRA_WR_TOPAXI_1(_mask)			\
+		BUS_PROT_WR(INFRA, _mask,				\
+			    MT8365_INFRA_TOPAXI_PROTECTEN_1_SET,	\
+			    MT8365_INFRA_TOPAXI_PROTECTEN_1_CLR,	\
+			    MT8365_INFRA_TOPAXI_PROTECTEN_1_STA1)
+
+#define MT8365_BUS_PROT_SMI_WR_CLAMP_EN_PORT(port)			\
+		BUS_PROT_WR(SMI, BIT(port),				\
+			    MT8365_SMI_COMMON_CLAMP_EN_SET,		\
+			    MT8365_SMI_COMMON_CLAMP_EN_CLR,		\
+			    MT8365_SMI_COMMON_CLAMP_EN)
+
+#define MT8365_BUS_PROT_WAY_EN(_set_mask, _set, _sta_mask, _sta)	\
+		_BUS_PROT(_set_mask, _set, _set, _sta_mask, _sta,	\
+			  BUS_PROT_COMPONENT_INFRA |			\
+			  BUS_PROT_STA_COMPONENT_INFRA_NAO |		\
+			  BUS_PROT_INVERTED |				\
+			  BUS_PROT_REG_UPDATE)
+
+static const struct scpsys_domain_data scpsys_domain_data_mt8365[] = {
+	[MT8365_POWER_DOMAIN_MM] = {
+		.name = "mm",
+		.sta_mask = PWR_STATUS_DISP,
+		.ctl_offs = 0x30c,
+		.pwr_sta_offs = 0x0180,
+		.pwr_sta2nd_offs = 0x0184,
+		.sram_pdn_bits = GENMASK(8, 8),
+		.sram_pdn_ack_bits = GENMASK(12, 12),
+		.bp_cfg = {
+			MT8365_BUS_PROT_INFRA_WR_TOPAXI_1(
+				MT8365_INFRA_TOPAXI_PROTECTEN_1_MM2INFRA_AXI_GALS_MST_0 |
+				MT8365_INFRA_TOPAXI_PROTECTEN_1_MM2INFRA_AXI_GALS_MST_1),
+			MT8365_BUS_PROT_INFRA_WR_TOPAXI(
+				MT8365_INFRA_TOPAXI_PROTECTEN_MM_M0 |
+				MT8365_INFRA_TOPAXI_PROTECTEN_MDMCU_M1 |
+				MT8365_INFRA_TOPAXI_PROTECTEN_MM2INFRA_AXI_GALS_SLV_0 |
+				MT8365_INFRA_TOPAXI_PROTECTEN_MM2INFRA_AXI_GALS_SLV_1),
+			MT8365_BUS_PROT_WAY_EN(
+				MT8365_INFRA_TOPAXI_SI0_WAY_EN_MMAPB_S,
+				MT8365_INFRA_TOPAXI_SI0_CTL,
+				MT8365_INFRA_NAO_TOPAXI_SI0_CTRL_UPDATED,
+				MT8365_INFRA_NAO_TOPAXI_SI0_STA),
+			MT8365_BUS_PROT_WAY_EN(
+				MT8365_INFRA_TOPAXI_SI2_WAY_EN_PERI_M1,
+				MT8365_INFRA_TOPAXI_SI2_CTL,
+				MT8365_INFRA_NAO_TOPAXI_SI2_CTRL_UPDATED,
+				MT8365_INFRA_NAO_TOPAXI_SI2_STA),
+			MT8365_BUS_PROT_INFRA_WR_TOPAXI(
+				MT8365_INFRA_TOPAXI_PROTECTEN_MMAPB_S),
+		},
+		.caps = MTK_SCPD_STRICT_BUS_PROTECTION | MTK_SCPD_HAS_INFRA_NAO,
+	},
+	[MT8365_POWER_DOMAIN_VENC] = {
+		.name = "venc",
+		.sta_mask = PWR_STATUS_VENC,
+		.ctl_offs = 0x0304,
+		.pwr_sta_offs = 0x0180,
+		.pwr_sta2nd_offs = 0x0184,
+		.sram_pdn_bits = GENMASK(8, 8),
+		.sram_pdn_ack_bits = GENMASK(12, 12),
+		.bp_cfg = {
+			MT8365_BUS_PROT_SMI_WR_CLAMP_EN_PORT(1),
+		},
+	},
+	[MT8365_POWER_DOMAIN_AUDIO] = {
+		.name = "audio",
+		.sta_mask = PWR_STATUS_AUDIO,
+		.ctl_offs = 0x0314,
+		.pwr_sta_offs = 0x0180,
+		.pwr_sta2nd_offs = 0x0184,
+		.sram_pdn_bits = GENMASK(12, 8),
+		.sram_pdn_ack_bits = GENMASK(17, 13),
+		.bp_cfg = {
+			MT8365_BUS_PROT_INFRA_WR_TOPAXI_1(
+				MT8365_INFRA_TOPAXI_PROTECTEN_1_PWRDNREQ_MP1_L2C_AFIFO |
+				MT8365_INFRA_TOPAXI_PROTECTEN_1_AUDIO_BUS_AUDIO_M),
+		},
+		.caps = MTK_SCPD_ACTIVE_WAKEUP,
+	},
+	[MT8365_POWER_DOMAIN_CONN] = {
+		.name = "conn",
+		.sta_mask = PWR_STATUS_CONN,
+		.ctl_offs = 0x032c,
+		.pwr_sta_offs = 0x0180,
+		.pwr_sta2nd_offs = 0x0184,
+		.sram_pdn_bits = 0,
+		.sram_pdn_ack_bits = 0,
+		.bp_cfg = {
+			MT8365_BUS_PROT_INFRA_WR_TOPAXI(
+				MT8365_INFRA_TOPAXI_PROTECTEN_AP2CONN_AHB),
+			MT8365_BUS_PROT_INFRA_WR_TOPAXI_1(
+				MT8365_INFRA_TOPAXI_PROTECTEN_1_CONN2INFRA_AXI_GALS_MST),
+			MT8365_BUS_PROT_INFRA_WR_TOPAXI(
+				MT8365_INFRA_TOPAXI_PROTECTEN_CONN2INFRA_AHB),
+			MT8365_BUS_PROT_INFRA_WR_TOPAXI_1(
+				MT8365_INFRA_TOPAXI_PROTECTEN_1_INFRA2CONN_AHB_GALS_SLV),
+		},
+		.caps = MTK_SCPD_ACTIVE_WAKEUP | MTK_SCPD_KEEP_DEFAULT_OFF,
+	},
+	[MT8365_POWER_DOMAIN_MFG] = {
+		.name = "mfg",
+		.sta_mask = PWR_STATUS_MFG,
+		.ctl_offs = 0x0338,
+		.pwr_sta_offs = 0x0180,
+		.pwr_sta2nd_offs = 0x0184,
+		.sram_pdn_bits = GENMASK(9, 8),
+		.sram_pdn_ack_bits = GENMASK(13, 12),
+		.bp_cfg = {
+			MT8365_BUS_PROT_INFRA_WR_TOPAXI(BIT(25)),
+			MT8365_BUS_PROT_INFRA_WR_TOPAXI(
+				MT8365_INFRA_TOPAXI_PROTECTEN_MFG_M0 |
+				MT8365_INFRA_TOPAXI_PROTECTEN_INFRA2MFG),
+		},
+	},
+	[MT8365_POWER_DOMAIN_CAM] = {
+		.name = "cam",
+		.sta_mask = BIT(25),
+		.ctl_offs = 0x0344,
+		.pwr_sta_offs = 0x0180,
+		.pwr_sta2nd_offs = 0x0184,
+		.sram_pdn_bits = GENMASK(9, 8),
+		.sram_pdn_ack_bits = GENMASK(13, 12),
+		.bp_cfg = {
+			MT8365_BUS_PROT_INFRA_WR_TOPAXI_1(
+				MT8365_INFRA_TOPAXI_PROTECTEN_1_CAM2MM_AXI_GALS_MST),
+			MT8365_BUS_PROT_SMI_WR_CLAMP_EN_PORT(2),
+		},
+	},
+	[MT8365_POWER_DOMAIN_VDEC] = {
+		.name = "vdec",
+		.sta_mask = BIT(31),
+		.ctl_offs = 0x0370,
+		.pwr_sta_offs = 0x0180,
+		.pwr_sta2nd_offs = 0x0184,
+		.sram_pdn_bits = GENMASK(8, 8),
+		.sram_pdn_ack_bits = GENMASK(12, 12),
+		.bp_cfg = {
+			MT8365_BUS_PROT_SMI_WR_CLAMP_EN_PORT(3),
+		},
+	},
+	[MT8365_POWER_DOMAIN_APU] = {
+		.name = "apu",
+		.sta_mask = BIT(16),
+		.ctl_offs = 0x0378,
+		.pwr_sta_offs = 0x0180,
+		.pwr_sta2nd_offs = 0x0184,
+		.sram_pdn_bits = GENMASK(14, 8),
+		.sram_pdn_ack_bits = GENMASK(21, 15),
+		.bp_cfg = {
+			MT8365_BUS_PROT_INFRA_WR_TOPAXI_1(
+				MT8365_INFRA_TOPAXI_PROTECTEN_1_APU2AP |
+				MT8365_INFRA_TOPAXI_PROTECTEN_1_APU_CBIP_GALS_MST),
+			MT8365_BUS_PROT_SMI_WR_CLAMP_EN_PORT(4),
+		},
+	},
+	[MT8365_POWER_DOMAIN_DSP] = {
+		.name = "dsp",
+		.sta_mask = BIT(17),
+		.ctl_offs = 0x037C,
+		.pwr_sta_offs = 0x0180,
+		.pwr_sta2nd_offs = 0x0184,
+		.sram_pdn_bits = GENMASK(11, 8),
+		.sram_pdn_ack_bits = GENMASK(15, 12),
+		.bp_cfg = {
+			MT8365_BUS_PROT_INFRA_WR_TOPAXI_1(
+				MT8365_INFRA_TOPAXI_PROTECTEN_1_PWRDNREQ_INFRA_GALS_ADB |
+				MT8365_INFRA_TOPAXI_PROTECTEN_1_AUDIO_BUS_DSP_M |
+				MT8365_INFRA_TOPAXI_PROTECTEN_1_AUDIO_BUS_DSP_S),
+		},
+		.caps = MTK_SCPD_ACTIVE_WAKEUP,
+	},
+};
+
+static const struct scpsys_soc_data mt8365_scpsys_data = {
+	.domains_data = scpsys_domain_data_mt8365,
+	.num_domains = ARRAY_SIZE(scpsys_domain_data_mt8365),
+};
+
+#endif /* __SOC_MEDIATEK_MT8365_PM_DOMAINS_H */
diff --git a/drivers/pmdomain/mediatek/mtk-pm-domains.c b/drivers/pmdomain/mediatek/mtk-pm-domains.c
index 69cdf6ff00b8..e26dc17d07ad 100644
--- a/drivers/pmdomain/mediatek/mtk-pm-domains.c
+++ b/drivers/pmdomain/mediatek/mtk-pm-domains.c
@@ -24,6 +24,7 @@
 #include "mt8188-pm-domains.h"
 #include "mt8192-pm-domains.h"
 #include "mt8195-pm-domains.h"
+#include "mt8365-pm-domains.h"
 
 #define MTK_POLL_DELAY_US		10
 #define MTK_POLL_TIMEOUT		USEC_PER_SEC
@@ -652,6 +653,10 @@ static const struct of_device_id scpsys_of_match[] = {
 		.compatible = "mediatek,mt8195-power-controller",
 		.data = &mt8195_scpsys_data,
 	},
+	{
+		.compatible = "mediatek,mt8365-power-controller",
+		.data = &mt8365_scpsys_data,
+	},
 	{ }
 };
 
diff --git a/include/linux/soc/mediatek/infracfg.h b/include/linux/soc/mediatek/infracfg.h
index 07f67b3d8e97..6c6cccc848f4 100644
--- a/include/linux/soc/mediatek/infracfg.h
+++ b/include/linux/soc/mediatek/infracfg.h
@@ -2,6 +2,47 @@
 #ifndef __SOC_MEDIATEK_INFRACFG_H
 #define __SOC_MEDIATEK_INFRACFG_H
 
+#define MT8365_INFRA_TOPAXI_PROTECTEN_STA1				0x228
+#define MT8365_INFRA_TOPAXI_PROTECTEN_SET				0x2a0
+#define MT8365_INFRA_TOPAXI_PROTECTEN_CLR				0x2a4
+#define MT8365_INFRA_TOPAXI_PROTECTEN_MM_M0				BIT(1)
+#define MT8365_INFRA_TOPAXI_PROTECTEN_MDMCU_M1				BIT(2)
+#define MT8365_INFRA_TOPAXI_PROTECTEN_MMAPB_S				BIT(6)
+#define MT8365_INFRA_TOPAXI_PROTECTEN_MM2INFRA_AXI_GALS_SLV_0		BIT(10)
+#define MT8365_INFRA_TOPAXI_PROTECTEN_MM2INFRA_AXI_GALS_SLV_1		BIT(11)
+#define MT8365_INFRA_TOPAXI_PROTECTEN_AP2CONN_AHB			BIT(13)
+#define MT8365_INFRA_TOPAXI_PROTECTEN_CONN2INFRA_AHB			BIT(14)
+#define MT8365_INFRA_TOPAXI_PROTECTEN_MFG_M0				BIT(21)
+#define MT8365_INFRA_TOPAXI_PROTECTEN_INFRA2MFG			BIT(22)
+#define MT8365_INFRA_TOPAXI_PROTECTEN_1_STA1				0x258
+#define MT8365_INFRA_TOPAXI_PROTECTEN_1_SET				0x2a8
+#define MT8365_INFRA_TOPAXI_PROTECTEN_1_CLR				0x2ac
+#define MT8365_INFRA_TOPAXI_PROTECTEN_1_APU2AP				BIT(2)
+#define MT8365_INFRA_TOPAXI_PROTECTEN_1_MM2INFRA_AXI_GALS_MST_0	BIT(16)
+#define MT8365_INFRA_TOPAXI_PROTECTEN_1_MM2INFRA_AXI_GALS_MST_1	BIT(17)
+#define MT8365_INFRA_TOPAXI_PROTECTEN_1_CONN2INFRA_AXI_GALS_MST	BIT(18)
+#define MT8365_INFRA_TOPAXI_PROTECTEN_1_CAM2MM_AXI_GALS_MST		BIT(19)
+#define MT8365_INFRA_TOPAXI_PROTECTEN_1_APU_CBIP_GALS_MST		BIT(20)
+#define MT8365_INFRA_TOPAXI_PROTECTEN_1_INFRA2CONN_AHB_GALS_SLV	BIT(21)
+#define MT8365_INFRA_TOPAXI_PROTECTEN_1_PWRDNREQ_INFRA_GALS_ADB	BIT(24)
+#define MT8365_INFRA_TOPAXI_PROTECTEN_1_PWRDNREQ_MP1_L2C_AFIFO		BIT(27)
+#define MT8365_INFRA_TOPAXI_PROTECTEN_1_AUDIO_BUS_AUDIO_M		BIT(28)
+#define MT8365_INFRA_TOPAXI_PROTECTEN_1_AUDIO_BUS_DSP_M		BIT(30)
+#define MT8365_INFRA_TOPAXI_PROTECTEN_1_AUDIO_BUS_DSP_S		BIT(31)
+
+#define MT8365_INFRA_NAO_TOPAXI_SI0_STA					0x0
+#define MT8365_INFRA_NAO_TOPAXI_SI0_CTRL_UPDATED			BIT(24)
+#define MT8365_INFRA_NAO_TOPAXI_SI2_STA					0x28
+#define MT8365_INFRA_NAO_TOPAXI_SI2_CTRL_UPDATED			BIT(14)
+#define MT8365_INFRA_TOPAXI_SI0_CTL					0x200
+#define MT8365_INFRA_TOPAXI_SI0_WAY_EN_MMAPB_S				BIT(6)
+#define MT8365_INFRA_TOPAXI_SI2_CTL					0x234
+#define MT8365_INFRA_TOPAXI_SI2_WAY_EN_PERI_M1				BIT(5)
+
+#define MT8365_SMI_COMMON_CLAMP_EN			0x3c0
+#define MT8365_SMI_COMMON_CLAMP_EN_SET			0x3c4
+#define MT8365_SMI_COMMON_CLAMP_EN_CLR			0x3c8
+
 #define MT8195_TOP_AXI_PROT_EN_STA1                     0x228
 #define MT8195_TOP_AXI_PROT_EN_1_STA1                   0x258
 #define MT8195_TOP_AXI_PROT_EN_SET			0x2a0
-- 
cgit v1.2.3


From 0ac6486e5cbd1af682d3b52fcd3615247e13cb57 Mon Sep 17 00:00:00 2001
From: Jeremy Kerr <jk@codeconstruct.com.au>
Date: Fri, 13 Oct 2023 12:06:24 +0800
Subject: i3c: Add support for bus enumeration & notification

This allows other drivers to be notified when new i3c busses are
attached, referring to a whole i3c bus as opposed to individual
devices.

Signed-off-by: Jeremy Kerr <jk@codeconstruct.com.au>
Signed-off-by: Matt Johnston <matt@codeconstruct.com.au>
Acked-by: Alexandre Belloni <alexandre.belloni@bootlin.com>
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
 drivers/i3c/master.c       | 35 +++++++++++++++++++++++++++++++++++
 include/linux/i3c/master.h | 11 +++++++++++
 2 files changed, 46 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/i3c/master.c b/drivers/i3c/master.c
index 87283e4a4607..959ec5269376 100644
--- a/drivers/i3c/master.c
+++ b/drivers/i3c/master.c
@@ -22,6 +22,7 @@
 static DEFINE_IDR(i3c_bus_idr);
 static DEFINE_MUTEX(i3c_core_lock);
 static int __i3c_first_dynamic_bus_num;
+static BLOCKING_NOTIFIER_HEAD(i3c_bus_notifier);
 
 /**
  * i3c_bus_maintenance_lock - Lock the bus for a maintenance operation
@@ -453,6 +454,36 @@ static int i3c_bus_init(struct i3c_bus *i3cbus, struct device_node *np)
 	return 0;
 }
 
+void i3c_for_each_bus_locked(int (*fn)(struct i3c_bus *bus, void *data),
+			     void *data)
+{
+	struct i3c_bus *bus;
+	int id;
+
+	mutex_lock(&i3c_core_lock);
+	idr_for_each_entry(&i3c_bus_idr, bus, id)
+		fn(bus, data);
+	mutex_unlock(&i3c_core_lock);
+}
+EXPORT_SYMBOL_GPL(i3c_for_each_bus_locked);
+
+int i3c_register_notifier(struct notifier_block *nb)
+{
+	return blocking_notifier_chain_register(&i3c_bus_notifier, nb);
+}
+EXPORT_SYMBOL_GPL(i3c_register_notifier);
+
+int i3c_unregister_notifier(struct notifier_block *nb)
+{
+	return blocking_notifier_chain_unregister(&i3c_bus_notifier, nb);
+}
+EXPORT_SYMBOL_GPL(i3c_unregister_notifier);
+
+static void i3c_bus_notify(struct i3c_bus *bus, unsigned int action)
+{
+	blocking_notifier_call_chain(&i3c_bus_notifier, action, bus);
+}
+
 static const char * const i3c_bus_mode_strings[] = {
 	[I3C_BUS_MODE_PURE] = "pure",
 	[I3C_BUS_MODE_MIXED_FAST] = "mixed-fast",
@@ -2682,6 +2713,8 @@ int i3c_master_register(struct i3c_master_controller *master,
 	if (ret)
 		goto err_del_dev;
 
+	i3c_bus_notify(i3cbus, I3C_NOTIFY_BUS_ADD);
+
 	/*
 	 * We're done initializing the bus and the controller, we can now
 	 * register I3C devices discovered during the initial DAA.
@@ -2714,6 +2747,8 @@ EXPORT_SYMBOL_GPL(i3c_master_register);
  */
 void i3c_master_unregister(struct i3c_master_controller *master)
 {
+	i3c_bus_notify(&master->bus, I3C_NOTIFY_BUS_REMOVE);
+
 	i3c_master_i2c_adapter_cleanup(master);
 	i3c_master_unregister_i3c_devs(master);
 	i3c_master_bus_cleanup(master);
diff --git a/include/linux/i3c/master.h b/include/linux/i3c/master.h
index 0b52da4f2346..db909ef79be4 100644
--- a/include/linux/i3c/master.h
+++ b/include/linux/i3c/master.h
@@ -24,6 +24,12 @@
 
 struct i2c_client;
 
+/* notifier actions. notifier call data is the struct i3c_bus */
+enum {
+	I3C_NOTIFY_BUS_ADD,
+	I3C_NOTIFY_BUS_REMOVE,
+};
+
 struct i3c_master_controller;
 struct i3c_bus;
 struct i3c_device;
@@ -652,4 +658,9 @@ void i3c_master_queue_ibi(struct i3c_dev_desc *dev, struct i3c_ibi_slot *slot);
 
 struct i3c_ibi_slot *i3c_master_get_free_ibi_slot(struct i3c_dev_desc *dev);
 
+void i3c_for_each_bus_locked(int (*fn)(struct i3c_bus *bus, void *data),
+			     void *data);
+int i3c_register_notifier(struct notifier_block *nb);
+int i3c_unregister_notifier(struct notifier_block *nb);
+
 #endif /* I3C_MASTER_H */
-- 
cgit v1.2.3


From 184fdf9fc7ae6ae7155768faa48fc609d1a24b7e Mon Sep 17 00:00:00 2001
From: Cuda-Chen <clh960524@gmail.com>
Date: Tue, 17 Oct 2023 13:37:03 +0800
Subject: locking/seqlock: Fix grammar in comment

The "neither writes before and after ..." for the description
of do_write_seqcount_end() should be "neither writes before nor after".

Signed-off-by: Cuda-Chen <clh960524@gmail.com>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Link: https://lore.kernel.org/r/20231017053703.11312-1-clh960524@gmail.com
---
 include/linux/seqlock.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/seqlock.h b/include/linux/seqlock.h
index 80f21d2ca2aa..e92f9d5577ba 100644
--- a/include/linux/seqlock.h
+++ b/include/linux/seqlock.h
@@ -584,7 +584,7 @@ static inline void do_write_seqcount_end(seqcount_t *s)
  * via WRITE_ONCE): a) to ensure the writes become visible to other threads
  * atomically, avoiding compiler optimizations; b) to document which writes are
  * meant to propagate to the reader critical section. This is necessary because
- * neither writes before and after the barrier are enclosed in a seq-writer
+ * neither writes before nor after the barrier are enclosed in a seq-writer
  * critical section that would ensure readers are aware of ongoing writes::
  *
  *	seqcount_t seq;
-- 
cgit v1.2.3


From 9a675ba55a96a45a9fb69e6a5c43f80c6682e541 Mon Sep 17 00:00:00 2001
From: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Date: Mon, 16 Oct 2023 14:57:38 +0200
Subject: net, bpf: Add a warning if NAPI cb missed xdp_do_flush().
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

A few drivers were missing a xdp_do_flush() invocation after
XDP_REDIRECT.

Add three helper functions each for one of the per-CPU lists. Return
true if the per-CPU list is non-empty and flush the list.

Add xdp_do_check_flushed() which invokes each helper functions and
creates a warning if one of the functions had a non-empty list.

Hide everything behind CONFIG_DEBUG_NET.

Suggested-by: Jesper Dangaard Brouer <hawk@kernel.org>
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Reviewed-by: Toke Høiland-Jørgensen <toke@redhat.com>
Acked-by: Jakub Kicinski <kuba@kernel.org>
Acked-by: John Fastabend <john.fastabend@gmail.com>
Link: https://lore.kernel.org/bpf/20231016125738.Yt79p1uF@linutronix.de
---
 include/linux/bpf.h    |  3 +++
 include/net/xdp_sock.h |  9 +++++++++
 kernel/bpf/cpumap.c    | 10 ++++++++++
 kernel/bpf/devmap.c    | 10 ++++++++++
 net/core/dev.c         |  2 ++
 net/core/dev.h         |  6 ++++++
 net/core/filter.c      | 16 ++++++++++++++++
 net/xdp/xsk.c          | 10 ++++++++++
 8 files changed, 66 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index d3c51a507508..b4b40b45962b 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -2478,6 +2478,9 @@ void bpf_dynptr_init(struct bpf_dynptr_kern *ptr, void *data,
 		     enum bpf_dynptr_type type, u32 offset, u32 size);
 void bpf_dynptr_set_null(struct bpf_dynptr_kern *ptr);
 void bpf_dynptr_set_rdonly(struct bpf_dynptr_kern *ptr);
+
+bool dev_check_flush(void);
+bool cpu_map_check_flush(void);
 #else /* !CONFIG_BPF_SYSCALL */
 static inline struct bpf_prog *bpf_prog_get(u32 ufd)
 {
diff --git a/include/net/xdp_sock.h b/include/net/xdp_sock.h
index 69b472604b86..7dd0df2f6f8e 100644
--- a/include/net/xdp_sock.h
+++ b/include/net/xdp_sock.h
@@ -109,4 +109,13 @@ static inline void __xsk_map_flush(void)
 
 #endif /* CONFIG_XDP_SOCKETS */
 
+#if defined(CONFIG_XDP_SOCKETS) && defined(CONFIG_DEBUG_NET)
+bool xsk_map_check_flush(void);
+#else
+static inline bool xsk_map_check_flush(void)
+{
+	return false;
+}
+#endif
+
 #endif /* _LINUX_XDP_SOCK_H */
diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c
index e42a1bdb7f53..8a0bb80fe48a 100644
--- a/kernel/bpf/cpumap.c
+++ b/kernel/bpf/cpumap.c
@@ -764,6 +764,16 @@ void __cpu_map_flush(void)
 	}
 }
 
+#ifdef CONFIG_DEBUG_NET
+bool cpu_map_check_flush(void)
+{
+	if (list_empty(this_cpu_ptr(&cpu_map_flush_list)))
+		return false;
+	__cpu_map_flush();
+	return true;
+}
+#endif
+
 static int __init cpu_map_init(void)
 {
 	int cpu;
diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c
index 4d42f6ed6c11..a936c704d4e7 100644
--- a/kernel/bpf/devmap.c
+++ b/kernel/bpf/devmap.c
@@ -418,6 +418,16 @@ void __dev_flush(void)
 	}
 }
 
+#ifdef CONFIG_DEBUG_NET
+bool dev_check_flush(void)
+{
+	if (list_empty(this_cpu_ptr(&dev_flush_list)))
+		return false;
+	__dev_flush();
+	return true;
+}
+#endif
+
 /* Elements are kept alive by RCU; either by rcu_read_lock() (from syscall) or
  * by local_bh_disable() (from XDP calls inside NAPI). The
  * rcu_read_lock_bh_held() below makes lockdep accept both.
diff --git a/net/core/dev.c b/net/core/dev.c
index 97e7b9833db9..4420831180c6 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -6535,6 +6535,8 @@ static int __napi_poll(struct napi_struct *n, bool *repoll)
 	if (test_bit(NAPI_STATE_SCHED, &n->state)) {
 		work = n->poll(n, weight);
 		trace_napi_poll(n, work, weight);
+
+		xdp_do_check_flushed(n);
 	}
 
 	if (unlikely(work > weight))
diff --git a/net/core/dev.h b/net/core/dev.h
index e075e198092c..f66125857af7 100644
--- a/net/core/dev.h
+++ b/net/core/dev.h
@@ -136,4 +136,10 @@ static inline void netif_set_gro_ipv4_max_size(struct net_device *dev,
 }
 
 int rps_cpumask_housekeeping(struct cpumask *mask);
+
+#if defined(CONFIG_DEBUG_NET) && defined(CONFIG_BPF_SYSCALL)
+void xdp_do_check_flushed(struct napi_struct *napi);
+#else
+static inline void xdp_do_check_flushed(struct napi_struct *napi) { }
+#endif
 #endif
diff --git a/net/core/filter.c b/net/core/filter.c
index cc2e4babc85f..21d75108c2e9 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -83,6 +83,8 @@
 #include <net/netfilter/nf_conntrack_bpf.h>
 #include <linux/un.h>
 
+#include "dev.h"
+
 static const struct bpf_func_proto *
 bpf_sk_base_func_proto(enum bpf_func_id func_id);
 
@@ -4208,6 +4210,20 @@ void xdp_do_flush(void)
 }
 EXPORT_SYMBOL_GPL(xdp_do_flush);
 
+#if defined(CONFIG_DEBUG_NET) && defined(CONFIG_BPF_SYSCALL)
+void xdp_do_check_flushed(struct napi_struct *napi)
+{
+	bool ret;
+
+	ret = dev_check_flush();
+	ret |= cpu_map_check_flush();
+	ret |= xsk_map_check_flush();
+
+	WARN_ONCE(ret, "Missing xdp_do_flush() invocation after NAPI by %ps\n",
+		  napi->poll);
+}
+#endif
+
 void bpf_clear_redirect_map(struct bpf_map *map)
 {
 	struct bpf_redirect_info *ri;
diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c
index f5e96e0d6e01..ba070fd37d24 100644
--- a/net/xdp/xsk.c
+++ b/net/xdp/xsk.c
@@ -391,6 +391,16 @@ void __xsk_map_flush(void)
 	}
 }
 
+#ifdef CONFIG_DEBUG_NET
+bool xsk_map_check_flush(void)
+{
+	if (list_empty(this_cpu_ptr(&xskmap_flush_list)))
+		return false;
+	__xsk_map_flush();
+	return true;
+}
+#endif
+
 void xsk_tx_completed(struct xsk_buff_pool *pool, u32 nb_entries)
 {
 	xskq_prod_submit_n(pool->cq, nb_entries);
-- 
cgit v1.2.3


From 96ff37ceb203426b1bcebbae42399686110b0130 Mon Sep 17 00:00:00 2001
From: Greg Joyce <gjoyce@linux.vnet.ibm.com>
Date: Wed, 4 Oct 2023 15:19:55 -0500
Subject: block:sed-opal: SED Opal keystore

Add read and write functions that allow SED Opal keys to stored
in a permanent keystore.

Signed-off-by: Greg Joyce <gjoyce@linux.vnet.ibm.com>
Reviewed-by: Jonathan Derrick <jonathan.derrick@linux.dev>
Link: https://lore.kernel.org/r/20231004201957.1451669-2-gjoyce@linux.vnet.ibm.com
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/sed-opal-key.h | 26 ++++++++++++++++++++++++++
 1 file changed, 26 insertions(+)
 create mode 100644 include/linux/sed-opal-key.h

(limited to 'include/linux')

diff --git a/include/linux/sed-opal-key.h b/include/linux/sed-opal-key.h
new file mode 100644
index 000000000000..0ca03054e8f6
--- /dev/null
+++ b/include/linux/sed-opal-key.h
@@ -0,0 +1,26 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * SED key operations.
+ *
+ * Copyright (C) 2023 IBM Corporation
+ *
+ * These are the accessor functions (read/write) for SED Opal
+ * keys. Specific keystores can provide overrides.
+ *
+ */
+
+#include <linux/kernel.h>
+
+#ifdef CONFIG_PSERIES_PLPKS_SED
+int sed_read_key(char *keyname, char *key, u_int *keylen);
+int sed_write_key(char *keyname, char *key, u_int keylen);
+#else
+static inline
+int sed_read_key(char *keyname, char *key, u_int *keylen) {
+	return -EOPNOTSUPP;
+}
+static inline
+int sed_write_key(char *keyname, char *key, u_int keylen) {
+	return -EOPNOTSUPP;
+}
+#endif
-- 
cgit v1.2.3


From 41efa431244f6498833ff8ee8dde28c4924c5479 Mon Sep 17 00:00:00 2001
From: Reinette Chatre <reinette.chatre@intel.com>
Date: Tue, 17 Oct 2023 10:56:38 -0700
Subject: PCI/MSI: Provide stubs for IMS functions

The IMS related functions (pci_create_ims_domain(), pci_ims_alloc_irq(),
and pci_ims_free_irq()) are not declared when CONFIG_PCI_MSI is disabled.

Provide definitions of these functions for use when callers are compiled
with CONFIG_PCI_MSI disabled.

Fixes: 0194425af0c8 ("PCI/MSI: Provide IMS (Interrupt Message Store) support")
Fixes: c9e5bea27383 ("PCI/MSI: Provide pci_ims_alloc/free_irq()")
Signed-off-by: Reinette Chatre <reinette.chatre@intel.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Link: https://lore.kernel.org/r/14ff656899a3757453f8584c1109d7a9b98fa258.1697564731.git.reinette.chatre@intel.com
---
 include/linux/pci.h | 34 ++++++++++++++++++++++++++--------
 1 file changed, 26 insertions(+), 8 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/pci.h b/include/linux/pci.h
index 8c7c2c3c6c65..b56417276042 100644
--- a/include/linux/pci.h
+++ b/include/linux/pci.h
@@ -1624,6 +1624,8 @@ struct msix_entry {
 	u16	entry;	/* Driver uses to specify entry, OS writes */
 };
 
+struct msi_domain_template;
+
 #ifdef CONFIG_PCI_MSI
 int pci_msi_vec_count(struct pci_dev *dev);
 void pci_disable_msi(struct pci_dev *dev);
@@ -1656,6 +1658,11 @@ void pci_msix_free_irq(struct pci_dev *pdev, struct msi_map map);
 void pci_free_irq_vectors(struct pci_dev *dev);
 int pci_irq_vector(struct pci_dev *dev, unsigned int nr);
 const struct cpumask *pci_irq_get_affinity(struct pci_dev *pdev, int vec);
+bool pci_create_ims_domain(struct pci_dev *pdev, const struct msi_domain_template *template,
+			   unsigned int hwsize, void *data);
+struct msi_map pci_ims_alloc_irq(struct pci_dev *pdev, union msi_instance_cookie *icookie,
+				 const struct irq_affinity_desc *affdesc);
+void pci_ims_free_irq(struct pci_dev *pdev, struct msi_map map);
 
 #else
 static inline int pci_msi_vec_count(struct pci_dev *dev) { return -ENOSYS; }
@@ -1719,6 +1726,25 @@ static inline const struct cpumask *pci_irq_get_affinity(struct pci_dev *pdev,
 {
 	return cpu_possible_mask;
 }
+
+static inline bool pci_create_ims_domain(struct pci_dev *pdev,
+					 const struct msi_domain_template *template,
+					 unsigned int hwsize, void *data)
+{ return false; }
+
+static inline struct msi_map pci_ims_alloc_irq(struct pci_dev *pdev,
+					       union msi_instance_cookie *icookie,
+					       const struct irq_affinity_desc *affdesc)
+{
+	struct msi_map map = { .index = -ENOSYS, };
+
+	return map;
+}
+
+static inline void pci_ims_free_irq(struct pci_dev *pdev, struct msi_map map)
+{
+}
+
 #endif
 
 /**
@@ -2616,14 +2642,6 @@ static inline bool pci_is_thunderbolt_attached(struct pci_dev *pdev)
 void pci_uevent_ers(struct pci_dev *pdev, enum  pci_ers_result err_type);
 #endif
 
-struct msi_domain_template;
-
-bool pci_create_ims_domain(struct pci_dev *pdev, const struct msi_domain_template *template,
-			   unsigned int hwsize, void *data);
-struct msi_map pci_ims_alloc_irq(struct pci_dev *pdev, union msi_instance_cookie *icookie,
-				 const struct irq_affinity_desc *affdesc);
-void pci_ims_free_irq(struct pci_dev *pdev, struct msi_map map);
-
 #include <linux/dma-mapping.h>
 
 #define pci_printk(level, pdev, fmt, arg...) \
-- 
cgit v1.2.3


From 3ebed3749f1767927229d568eea29daaf9f272d5 Mon Sep 17 00:00:00 2001
From: Mark O'Donovan <shiftee@posteo.net>
Date: Tue, 17 Oct 2023 17:09:17 +0000
Subject: nvme-auth: alloc nvme_dhchap_key as single buffer

Co-developed-by: Akash Appaiah <Akash.Appaiah@dell.com>
Signed-off-by: Akash Appaiah <Akash.Appaiah@dell.com>
Signed-off-by: Mark O'Donovan <shiftee@posteo.net>
Reviewed-by: Hannes Reinecke <hare@suse.de>
Signed-off-by: Keith Busch <kbusch@kernel.org>
---
 drivers/nvme/common/auth.c | 37 +++++++++++++++++++++++++------------
 include/linux/nvme-auth.h  |  4 +++-
 2 files changed, 28 insertions(+), 13 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/nvme/common/auth.c b/drivers/nvme/common/auth.c
index d90e4f0c08b7..16a071448d54 100644
--- a/drivers/nvme/common/auth.c
+++ b/drivers/nvme/common/auth.c
@@ -150,6 +150,14 @@ size_t nvme_auth_hmac_hash_len(u8 hmac_id)
 }
 EXPORT_SYMBOL_GPL(nvme_auth_hmac_hash_len);
 
+u32 nvme_auth_key_struct_size(u32 key_len)
+{
+	struct nvme_dhchap_key key;
+
+	return struct_size(&key, key, key_len);
+}
+EXPORT_SYMBOL_GPL(nvme_auth_key_struct_size);
+
 struct nvme_dhchap_key *nvme_auth_extract_key(unsigned char *secret,
 					      u8 key_hash)
 {
@@ -163,14 +171,9 @@ struct nvme_dhchap_key *nvme_auth_extract_key(unsigned char *secret,
 	p = strrchr(secret, ':');
 	if (p)
 		allocated_len = p - secret;
-	key = kzalloc(sizeof(*key), GFP_KERNEL);
+	key = nvme_auth_alloc_key(allocated_len, 0);
 	if (!key)
 		return ERR_PTR(-ENOMEM);
-	key->key = kzalloc(allocated_len, GFP_KERNEL);
-	if (!key->key) {
-		ret = -ENOMEM;
-		goto out_free_key;
-	}
 
 	key_len = base64_decode(secret, allocated_len, key->key);
 	if (key_len < 0) {
@@ -213,19 +216,29 @@ struct nvme_dhchap_key *nvme_auth_extract_key(unsigned char *secret,
 	key->hash = key_hash;
 	return key;
 out_free_secret:
-	kfree_sensitive(key->key);
-out_free_key:
-	kfree(key);
+	nvme_auth_free_key(key);
 	return ERR_PTR(ret);
 }
 EXPORT_SYMBOL_GPL(nvme_auth_extract_key);
 
+struct nvme_dhchap_key *nvme_auth_alloc_key(u32 len, u8 hash)
+{
+	u32 num_bytes = nvme_auth_key_struct_size(len);
+	struct nvme_dhchap_key *key = kzalloc(num_bytes, GFP_KERNEL);
+
+	if (key) {
+		key->len = len;
+		key->hash = hash;
+	}
+	return key;
+}
+EXPORT_SYMBOL_GPL(nvme_auth_alloc_key);
+
 void nvme_auth_free_key(struct nvme_dhchap_key *key)
 {
 	if (!key)
 		return;
-	kfree_sensitive(key->key);
-	kfree(key);
+	kfree_sensitive(key);
 }
 EXPORT_SYMBOL_GPL(nvme_auth_free_key);
 
@@ -237,7 +250,7 @@ u8 *nvme_auth_transform_key(struct nvme_dhchap_key *key, char *nqn)
 	u8 *transformed_key;
 	int ret;
 
-	if (!key || !key->key) {
+	if (!key) {
 		pr_warn("No key specified\n");
 		return ERR_PTR(-ENOKEY);
 	}
diff --git a/include/linux/nvme-auth.h b/include/linux/nvme-auth.h
index dcb8030062dd..a5ae9abe1ef6 100644
--- a/include/linux/nvme-auth.h
+++ b/include/linux/nvme-auth.h
@@ -9,9 +9,9 @@
 #include <crypto/kpp.h>
 
 struct nvme_dhchap_key {
-	u8 *key;
 	size_t len;
 	u8 hash;
+	u8 key[];
 };
 
 u32 nvme_auth_get_seqnum(void);
@@ -24,9 +24,11 @@ const char *nvme_auth_digest_name(u8 hmac_id);
 size_t nvme_auth_hmac_hash_len(u8 hmac_id);
 u8 nvme_auth_hmac_id(const char *hmac_name);
 
+u32 nvme_auth_key_struct_size(u32 key_len);
 struct nvme_dhchap_key *nvme_auth_extract_key(unsigned char *secret,
 					      u8 key_hash);
 void nvme_auth_free_key(struct nvme_dhchap_key *key);
+struct nvme_dhchap_key *nvme_auth_alloc_key(u32 len, u8 hash);
 u8 *nvme_auth_transform_key(struct nvme_dhchap_key *key, char *nqn);
 int nvme_auth_generate_key(u8 *secret, struct nvme_dhchap_key **ret_key);
 int nvme_auth_augmented_challenge(u8 hmac_id, u8 *skey, size_t skey_len,
-- 
cgit v1.2.3


From f047daed179a451657d1e66b5fe4030a593a000c Mon Sep 17 00:00:00 2001
From: Mark O'Donovan <shiftee@posteo.net>
Date: Tue, 17 Oct 2023 17:09:18 +0000
Subject: nvme-auth: use transformed key size to create resp

This does not change current behaviour as the driver currently
verifies that the secret size is the same size as the length of
the transformation hash.

Co-developed-by: Akash Appaiah <Akash.Appaiah@dell.com>
Signed-off-by: Akash Appaiah <Akash.Appaiah@dell.com>
Signed-off-by: Mark O'Donovan <shiftee@posteo.net>
Reviewed-by: Hannes Reinecke <hare@suse.de>
Signed-off-by: Keith Busch <kbusch@kernel.org>
---
 drivers/nvme/common/auth.c | 23 ++++++++++++++---------
 drivers/nvme/host/auth.c   | 30 +++++++++++++++---------------
 drivers/nvme/target/auth.c | 31 +++++++++++++++++--------------
 include/linux/nvme-auth.h  |  3 ++-
 4 files changed, 48 insertions(+), 39 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/nvme/common/auth.c b/drivers/nvme/common/auth.c
index 16a071448d54..f954aeb647a5 100644
--- a/drivers/nvme/common/auth.c
+++ b/drivers/nvme/common/auth.c
@@ -242,21 +242,25 @@ void nvme_auth_free_key(struct nvme_dhchap_key *key)
 }
 EXPORT_SYMBOL_GPL(nvme_auth_free_key);
 
-u8 *nvme_auth_transform_key(struct nvme_dhchap_key *key, char *nqn)
+struct nvme_dhchap_key *nvme_auth_transform_key(
+		struct nvme_dhchap_key *key, char *nqn)
 {
 	const char *hmac_name;
 	struct crypto_shash *key_tfm;
 	struct shash_desc *shash;
-	u8 *transformed_key;
-	int ret;
+	struct nvme_dhchap_key *transformed_key;
+	int ret, key_len;
 
 	if (!key) {
 		pr_warn("No key specified\n");
 		return ERR_PTR(-ENOKEY);
 	}
 	if (key->hash == 0) {
-		transformed_key = kmemdup(key->key, key->len, GFP_KERNEL);
-		return transformed_key ? transformed_key : ERR_PTR(-ENOMEM);
+		key_len = nvme_auth_key_struct_size(key->len);
+		transformed_key = kmemdup(key, key_len, GFP_KERNEL);
+		if (!transformed_key)
+			return ERR_PTR(-ENOMEM);
+		return transformed_key;
 	}
 	hmac_name = nvme_auth_hmac_name(key->hash);
 	if (!hmac_name) {
@@ -266,7 +270,7 @@ u8 *nvme_auth_transform_key(struct nvme_dhchap_key *key, char *nqn)
 
 	key_tfm = crypto_alloc_shash(hmac_name, 0, 0);
 	if (IS_ERR(key_tfm))
-		return (u8 *)key_tfm;
+		return ERR_CAST(key_tfm);
 
 	shash = kmalloc(sizeof(struct shash_desc) +
 			crypto_shash_descsize(key_tfm),
@@ -276,7 +280,8 @@ u8 *nvme_auth_transform_key(struct nvme_dhchap_key *key, char *nqn)
 		goto out_free_key;
 	}
 
-	transformed_key = kzalloc(crypto_shash_digestsize(key_tfm), GFP_KERNEL);
+	key_len = crypto_shash_digestsize(key_tfm);
+	transformed_key = nvme_auth_alloc_key(key_len, key->hash);
 	if (!transformed_key) {
 		ret = -ENOMEM;
 		goto out_free_shash;
@@ -295,7 +300,7 @@ u8 *nvme_auth_transform_key(struct nvme_dhchap_key *key, char *nqn)
 	ret = crypto_shash_update(shash, "NVMe-over-Fabrics", 17);
 	if (ret < 0)
 		goto out_free_transformed_key;
-	ret = crypto_shash_final(shash, transformed_key);
+	ret = crypto_shash_final(shash, transformed_key->key);
 	if (ret < 0)
 		goto out_free_transformed_key;
 
@@ -305,7 +310,7 @@ u8 *nvme_auth_transform_key(struct nvme_dhchap_key *key, char *nqn)
 	return transformed_key;
 
 out_free_transformed_key:
-	kfree_sensitive(transformed_key);
+	nvme_auth_free_key(transformed_key);
 out_free_shash:
 	kfree(shash);
 out_free_key:
diff --git a/drivers/nvme/host/auth.c b/drivers/nvme/host/auth.c
index daf5d144a8ea..de1390d705dc 100644
--- a/drivers/nvme/host/auth.c
+++ b/drivers/nvme/host/auth.c
@@ -23,6 +23,7 @@ struct nvme_dhchap_queue_context {
 	struct nvme_ctrl *ctrl;
 	struct crypto_shash *shash_tfm;
 	struct crypto_kpp *dh_tfm;
+	struct nvme_dhchap_key *transformed_key;
 	void *buf;
 	int qid;
 	int error;
@@ -36,7 +37,6 @@ struct nvme_dhchap_queue_context {
 	u8 c1[64];
 	u8 c2[64];
 	u8 response[64];
-	u8 *host_response;
 	u8 *ctrl_key;
 	u8 *host_key;
 	u8 *sess_key;
@@ -428,12 +428,12 @@ static int nvme_auth_dhchap_setup_host_response(struct nvme_ctrl *ctrl,
 	dev_dbg(ctrl->device, "%s: qid %d host response seq %u transaction %d\n",
 		__func__, chap->qid, chap->s1, chap->transaction);
 
-	if (!chap->host_response) {
-		chap->host_response = nvme_auth_transform_key(ctrl->host_key,
+	if (!chap->transformed_key) {
+		chap->transformed_key = nvme_auth_transform_key(ctrl->host_key,
 						ctrl->opts->host->nqn);
-		if (IS_ERR(chap->host_response)) {
-			ret = PTR_ERR(chap->host_response);
-			chap->host_response = NULL;
+		if (IS_ERR(chap->transformed_key)) {
+			ret = PTR_ERR(chap->transformed_key);
+			chap->transformed_key = NULL;
 			return ret;
 		}
 	} else {
@@ -442,7 +442,7 @@ static int nvme_auth_dhchap_setup_host_response(struct nvme_ctrl *ctrl,
 	}
 
 	ret = crypto_shash_setkey(chap->shash_tfm,
-			chap->host_response, ctrl->host_key->len);
+			chap->transformed_key->key, chap->transformed_key->len);
 	if (ret) {
 		dev_warn(ctrl->device, "qid %d: failed to set key, error %d\n",
 			 chap->qid, ret);
@@ -508,19 +508,19 @@ static int nvme_auth_dhchap_setup_ctrl_response(struct nvme_ctrl *ctrl,
 		struct nvme_dhchap_queue_context *chap)
 {
 	SHASH_DESC_ON_STACK(shash, chap->shash_tfm);
-	u8 *ctrl_response;
+	struct nvme_dhchap_key *transformed_key;
 	u8 buf[4], *challenge = chap->c2;
 	int ret;
 
-	ctrl_response = nvme_auth_transform_key(ctrl->ctrl_key,
+	transformed_key = nvme_auth_transform_key(ctrl->ctrl_key,
 				ctrl->opts->subsysnqn);
-	if (IS_ERR(ctrl_response)) {
-		ret = PTR_ERR(ctrl_response);
+	if (IS_ERR(transformed_key)) {
+		ret = PTR_ERR(transformed_key);
 		return ret;
 	}
 
 	ret = crypto_shash_setkey(chap->shash_tfm,
-			ctrl_response, ctrl->ctrl_key->len);
+			transformed_key->key, transformed_key->len);
 	if (ret) {
 		dev_warn(ctrl->device, "qid %d: failed to set key, error %d\n",
 			 chap->qid, ret);
@@ -586,7 +586,7 @@ static int nvme_auth_dhchap_setup_ctrl_response(struct nvme_ctrl *ctrl,
 out:
 	if (challenge != chap->c2)
 		kfree(challenge);
-	kfree(ctrl_response);
+	nvme_auth_free_key(transformed_key);
 	return ret;
 }
 
@@ -648,8 +648,8 @@ gen_sesskey:
 
 static void nvme_auth_reset_dhchap(struct nvme_dhchap_queue_context *chap)
 {
-	kfree_sensitive(chap->host_response);
-	chap->host_response = NULL;
+	nvme_auth_free_key(chap->transformed_key);
+	chap->transformed_key = NULL;
 	kfree_sensitive(chap->host_key);
 	chap->host_key = NULL;
 	chap->host_key_len = 0;
diff --git a/drivers/nvme/target/auth.c b/drivers/nvme/target/auth.c
index 4dcddcf95279..3ddbc3880cac 100644
--- a/drivers/nvme/target/auth.c
+++ b/drivers/nvme/target/auth.c
@@ -267,7 +267,8 @@ int nvmet_auth_host_hash(struct nvmet_req *req, u8 *response,
 	struct shash_desc *shash;
 	struct nvmet_ctrl *ctrl = req->sq->ctrl;
 	const char *hash_name;
-	u8 *challenge = req->sq->dhchap_c1, *host_response;
+	u8 *challenge = req->sq->dhchap_c1;
+	struct nvme_dhchap_key *transformed_key;
 	u8 buf[4];
 	int ret;
 
@@ -291,14 +292,15 @@ int nvmet_auth_host_hash(struct nvmet_req *req, u8 *response,
 		goto out_free_tfm;
 	}
 
-	host_response = nvme_auth_transform_key(ctrl->host_key, ctrl->hostnqn);
-	if (IS_ERR(host_response)) {
-		ret = PTR_ERR(host_response);
+	transformed_key = nvme_auth_transform_key(ctrl->host_key,
+						  ctrl->hostnqn);
+	if (IS_ERR(transformed_key)) {
+		ret = PTR_ERR(transformed_key);
 		goto out_free_tfm;
 	}
 
-	ret = crypto_shash_setkey(shash_tfm, host_response,
-				  ctrl->host_key->len);
+	ret = crypto_shash_setkey(shash_tfm, transformed_key->key,
+				  transformed_key->len);
 	if (ret)
 		goto out_free_response;
 
@@ -365,7 +367,7 @@ out:
 		kfree(challenge);
 	kfree(shash);
 out_free_response:
-	kfree_sensitive(host_response);
+	nvme_auth_free_key(transformed_key);
 out_free_tfm:
 	crypto_free_shash(shash_tfm);
 	return 0;
@@ -378,7 +380,8 @@ int nvmet_auth_ctrl_hash(struct nvmet_req *req, u8 *response,
 	struct shash_desc *shash;
 	struct nvmet_ctrl *ctrl = req->sq->ctrl;
 	const char *hash_name;
-	u8 *challenge = req->sq->dhchap_c2, *ctrl_response;
+	u8 *challenge = req->sq->dhchap_c2;
+	struct nvme_dhchap_key *transformed_key;
 	u8 buf[4];
 	int ret;
 
@@ -402,15 +405,15 @@ int nvmet_auth_ctrl_hash(struct nvmet_req *req, u8 *response,
 		goto out_free_tfm;
 	}
 
-	ctrl_response = nvme_auth_transform_key(ctrl->ctrl_key,
+	transformed_key = nvme_auth_transform_key(ctrl->ctrl_key,
 						ctrl->subsysnqn);
-	if (IS_ERR(ctrl_response)) {
-		ret = PTR_ERR(ctrl_response);
+	if (IS_ERR(transformed_key)) {
+		ret = PTR_ERR(transformed_key);
 		goto out_free_tfm;
 	}
 
-	ret = crypto_shash_setkey(shash_tfm, ctrl_response,
-				  ctrl->ctrl_key->len);
+	ret = crypto_shash_setkey(shash_tfm, transformed_key->key,
+				  transformed_key->len);
 	if (ret)
 		goto out_free_response;
 
@@ -474,7 +477,7 @@ out:
 		kfree(challenge);
 	kfree(shash);
 out_free_response:
-	kfree_sensitive(ctrl_response);
+	nvme_auth_free_key(transformed_key);
 out_free_tfm:
 	crypto_free_shash(shash_tfm);
 	return 0;
diff --git a/include/linux/nvme-auth.h b/include/linux/nvme-auth.h
index a5ae9abe1ef6..c1d0bc5d9624 100644
--- a/include/linux/nvme-auth.h
+++ b/include/linux/nvme-auth.h
@@ -29,7 +29,8 @@ struct nvme_dhchap_key *nvme_auth_extract_key(unsigned char *secret,
 					      u8 key_hash);
 void nvme_auth_free_key(struct nvme_dhchap_key *key);
 struct nvme_dhchap_key *nvme_auth_alloc_key(u32 len, u8 hash);
-u8 *nvme_auth_transform_key(struct nvme_dhchap_key *key, char *nqn);
+struct nvme_dhchap_key *nvme_auth_transform_key(
+				struct nvme_dhchap_key *key, char *nqn);
 int nvme_auth_generate_key(u8 *secret, struct nvme_dhchap_key **ret_key);
 int nvme_auth_augmented_challenge(u8 hmac_id, u8 *skey, size_t skey_len,
 				  u8 *challenge, u8 *aug, size_t hlen);
-- 
cgit v1.2.3


From b6f9774719e5601b32f47021b40fee446b356490 Mon Sep 17 00:00:00 2001
From: "Russell King (Oracle)" <rmk+kernel@armlinux.org.uk>
Date: Mon, 16 Oct 2023 16:42:53 +0100
Subject: net: phylink: provide mac_get_caps() method

Provide a new method, mac_get_caps() to get the MAC capabilities for
the specified interface mode. This is for MACs which have special
requirements, such as not supporting half-duplex in certain interface
modes, and will replace the validate() method.

Signed-off-by: Russell King (Oracle) <rmk+kernel@armlinux.org.uk>
Link: https://lore.kernel.org/r/E1qsPk5-009wiX-G5@rmk-PC.armlinux.org.uk
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 Documentation/networking/sfp-phylink.rst |  7 +++++++
 drivers/net/phy/phylink.c                | 14 +++++++++++---
 include/linux/phylink.h                  | 15 +++++++++++++++
 3 files changed, 33 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/networking/sfp-phylink.rst b/Documentation/networking/sfp-phylink.rst
index 55b65f607a64..b069d34d7f5c 100644
--- a/Documentation/networking/sfp-phylink.rst
+++ b/Documentation/networking/sfp-phylink.rst
@@ -200,6 +200,13 @@ this documentation.
    when the in-band link state changes - otherwise the link will never
    come up.
 
+   The :c:func:`mac_get_caps` method is optional, and if provided should
+   return the phylink MAC capabilities that are supported for the passed
+   ``interface`` mode. In general, there is no need to implement this method.
+   Phylink will use these capabilities in combination with permissible
+   capabilities for ``interface`` to determine the allowable ethtool link
+   modes.
+
    The :c:func:`validate` method should mask the supplied supported mask,
    and ``state->advertising`` with the supported ethtool link modes.
    These are the new ethtool link modes, so bitmask operations must be
diff --git a/drivers/net/phy/phylink.c b/drivers/net/phy/phylink.c
index 0d7354955d62..f5c2ba15d701 100644
--- a/drivers/net/phy/phylink.c
+++ b/drivers/net/phy/phylink.c
@@ -657,6 +657,7 @@ static int phylink_validate_mac_and_pcs(struct phylink *pl,
 					unsigned long *supported,
 					struct phylink_link_state *state)
 {
+	unsigned long capabilities;
 	struct phylink_pcs *pcs;
 	int ret;
 
@@ -696,10 +697,17 @@ static int phylink_validate_mac_and_pcs(struct phylink *pl,
 	}
 
 	/* Then validate the link parameters with the MAC */
-	if (pl->mac_ops->validate)
+	if (pl->mac_ops->validate) {
 		pl->mac_ops->validate(pl->config, supported, state);
-	else
-		phylink_generic_validate(pl->config, supported, state);
+	} else {
+		if (pl->mac_ops->mac_get_caps)
+			capabilities = pl->mac_ops->mac_get_caps(pl->config,
+							state->interface);
+		else
+			capabilities = pl->config->mac_capabilities;
+
+		phylink_validate_mask_caps(supported, state, capabilities);
+	}
 
 	return phylink_is_empty_linkmode(supported) ? -EINVAL : 0;
 }
diff --git a/include/linux/phylink.h b/include/linux/phylink.h
index 2b886ea654bb..0798198a09ef 100644
--- a/include/linux/phylink.h
+++ b/include/linux/phylink.h
@@ -228,6 +228,7 @@ void phylink_limit_mac_speed(struct phylink_config *config, u32 max_speed);
 /**
  * struct phylink_mac_ops - MAC operations structure.
  * @validate: Validate and update the link configuration.
+ * @mac_get_caps: Get MAC capabilities for interface mode.
  * @mac_select_pcs: Select a PCS for the interface mode.
  * @mac_prepare: prepare for a major reconfiguration of the interface.
  * @mac_config: configure the MAC for the selected mode and state.
@@ -241,6 +242,8 @@ struct phylink_mac_ops {
 	void (*validate)(struct phylink_config *config,
 			 unsigned long *supported,
 			 struct phylink_link_state *state);
+	unsigned long (*mac_get_caps)(struct phylink_config *config,
+				      phy_interface_t interface);
 	struct phylink_pcs *(*mac_select_pcs)(struct phylink_config *config,
 					      phy_interface_t interface);
 	int (*mac_prepare)(struct phylink_config *config, unsigned int mode,
@@ -292,6 +295,18 @@ struct phylink_mac_ops {
  */
 void validate(struct phylink_config *config, unsigned long *supported,
 	      struct phylink_link_state *state);
+/**
+ * mac_get_caps: Get MAC capabilities for interface mode.
+ * @config: a pointer to a &struct phylink_config.
+ * @interface: PHY interface mode.
+ *
+ * Optional method. When not provided, config->mac_capabilities will be used.
+ * When implemented, this returns the MAC capabilities for the specified
+ * interface mode where there is some special handling required by the MAC
+ * driver (e.g. not supporting half-duplex in certain interface modes.)
+ */
+unsigned long mac_get_caps(struct phylink_config *config,
+			   phy_interface_t interface);
 /**
  * mac_select_pcs: Select a PCS for the interface mode.
  * @config: a pointer to a &struct phylink_config.
-- 
cgit v1.2.3


From da5f6b80ad6417f7cf50d273c1c0ecb885e511e8 Mon Sep 17 00:00:00 2001
From: "Russell King (Oracle)" <rmk+kernel@armlinux.org.uk>
Date: Mon, 16 Oct 2023 16:43:03 +0100
Subject: net: phylink: remove .validate() method

The MAC .validate() method is no longer used, so remove it from the
phylink_mac_ops structure, and remove the callsite in
phylink_validate_mac_and_pcs().

Signed-off-by: Russell King (Oracle) <rmk+kernel@armlinux.org.uk>
Link: https://lore.kernel.org/r/E1qsPkF-009wij-QM@rmk-PC.armlinux.org.uk
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 Documentation/networking/sfp-phylink.rst |  5 -----
 drivers/net/phy/phylink.c                | 16 +++++---------
 include/linux/phylink.h                  | 38 --------------------------------
 3 files changed, 6 insertions(+), 53 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/networking/sfp-phylink.rst b/Documentation/networking/sfp-phylink.rst
index b069d34d7f5c..8054d33f449f 100644
--- a/Documentation/networking/sfp-phylink.rst
+++ b/Documentation/networking/sfp-phylink.rst
@@ -207,11 +207,6 @@ this documentation.
    capabilities for ``interface`` to determine the allowable ethtool link
    modes.
 
-   The :c:func:`validate` method should mask the supplied supported mask,
-   and ``state->advertising`` with the supported ethtool link modes.
-   These are the new ethtool link modes, so bitmask operations must be
-   used. For an example, see ``drivers/net/ethernet/marvell/mvneta.c``.
-
    The :c:func:`mac_link_state` method is used to read the link state
    from the MAC, and report back the settings that the MAC is currently
    using. This is particularly important for in-band negotiation
diff --git a/drivers/net/phy/phylink.c b/drivers/net/phy/phylink.c
index f5c2ba15d701..1c7e73fa58e4 100644
--- a/drivers/net/phy/phylink.c
+++ b/drivers/net/phy/phylink.c
@@ -697,17 +697,13 @@ static int phylink_validate_mac_and_pcs(struct phylink *pl,
 	}
 
 	/* Then validate the link parameters with the MAC */
-	if (pl->mac_ops->validate) {
-		pl->mac_ops->validate(pl->config, supported, state);
-	} else {
-		if (pl->mac_ops->mac_get_caps)
-			capabilities = pl->mac_ops->mac_get_caps(pl->config,
-							state->interface);
-		else
-			capabilities = pl->config->mac_capabilities;
+	if (pl->mac_ops->mac_get_caps)
+		capabilities = pl->mac_ops->mac_get_caps(pl->config,
+							 state->interface);
+	else
+		capabilities = pl->config->mac_capabilities;
 
-		phylink_validate_mask_caps(supported, state, capabilities);
-	}
+	phylink_validate_mask_caps(supported, state, capabilities);
 
 	return phylink_is_empty_linkmode(supported) ? -EINVAL : 0;
 }
diff --git a/include/linux/phylink.h b/include/linux/phylink.h
index 0798198a09ef..0cf559bae1ff 100644
--- a/include/linux/phylink.h
+++ b/include/linux/phylink.h
@@ -227,7 +227,6 @@ void phylink_limit_mac_speed(struct phylink_config *config, u32 max_speed);
 
 /**
  * struct phylink_mac_ops - MAC operations structure.
- * @validate: Validate and update the link configuration.
  * @mac_get_caps: Get MAC capabilities for interface mode.
  * @mac_select_pcs: Select a PCS for the interface mode.
  * @mac_prepare: prepare for a major reconfiguration of the interface.
@@ -239,9 +238,6 @@ void phylink_limit_mac_speed(struct phylink_config *config, u32 max_speed);
  * The individual methods are described more fully below.
  */
 struct phylink_mac_ops {
-	void (*validate)(struct phylink_config *config,
-			 unsigned long *supported,
-			 struct phylink_link_state *state);
 	unsigned long (*mac_get_caps)(struct phylink_config *config,
 				      phy_interface_t interface);
 	struct phylink_pcs *(*mac_select_pcs)(struct phylink_config *config,
@@ -261,40 +257,6 @@ struct phylink_mac_ops {
 };
 
 #if 0 /* For kernel-doc purposes only. */
-/**
- * validate - Validate and update the link configuration
- * @config: a pointer to a &struct phylink_config.
- * @supported: ethtool bitmask for supported link modes.
- * @state: a pointer to a &struct phylink_link_state.
- *
- * Clear bits in the @supported and @state->advertising masks that
- * are not supportable by the MAC.
- *
- * Note that the PHY may be able to transform from one connection
- * technology to another, so, eg, don't clear 1000BaseX just
- * because the MAC is unable to BaseX mode. This is more about
- * clearing unsupported speeds and duplex settings. The port modes
- * should not be cleared; phylink_set_port_modes() will help with this.
- *
- * When @config->supported_interfaces has been set, phylink will iterate
- * over the supported interfaces to determine the full capability of the
- * MAC. The validation function must not print errors if @state->interface
- * is set to an unexpected value.
- *
- * When @config->supported_interfaces is empty, phylink will call this
- * function with @state->interface set to %PHY_INTERFACE_MODE_NA, and
- * expects the MAC driver to return all supported link modes.
- *
- * If the @state->interface mode is not supported, then the @supported
- * mask must be cleared.
- *
- * This member is optional; if not set, the generic validator will be
- * used making use of @config->mac_capabilities and
- * @config->supported_interfaces to determine which link modes are
- * supported.
- */
-void validate(struct phylink_config *config, unsigned long *supported,
-	      struct phylink_link_state *state);
 /**
  * mac_get_caps: Get MAC capabilities for interface mode.
  * @config: a pointer to a &struct phylink_config.
-- 
cgit v1.2.3


From 743f6397623edc708eda76ecc20e70032166e573 Mon Sep 17 00:00:00 2001
From: "Russell King (Oracle)" <rmk+kernel@armlinux.org.uk>
Date: Mon, 16 Oct 2023 16:43:08 +0100
Subject: net: phylink: remove a bunch of unused validation methods

Remove exports for phylink_caps_to_linkmodes(),
phylink_get_capabilities(), phylink_validate_mask_caps() and
phylink_generic_validate(). Also, as phylink_generic_validate() is no
longer called, we can remove its implementation as well.

Signed-off-by: Russell King (Oracle) <rmk+kernel@armlinux.org.uk>
Link: https://lore.kernel.org/r/E1qsPkK-009wip-W9@rmk-PC.armlinux.org.uk
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 drivers/net/phy/phylink.c | 35 ++++++++---------------------------
 include/linux/phylink.h   | 11 -----------
 2 files changed, 8 insertions(+), 38 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/phy/phylink.c b/drivers/net/phy/phylink.c
index 1c7e73fa58e4..6712883498bb 100644
--- a/drivers/net/phy/phylink.c
+++ b/drivers/net/phy/phylink.c
@@ -257,7 +257,8 @@ static int phylink_interface_max_speed(phy_interface_t interface)
  * Set all possible pause, speed and duplex linkmodes in @linkmodes that are
  * supported by the @caps. @linkmodes must have been initialised previously.
  */
-void phylink_caps_to_linkmodes(unsigned long *linkmodes, unsigned long caps)
+static void phylink_caps_to_linkmodes(unsigned long *linkmodes,
+				      unsigned long caps)
 {
 	if (caps & MAC_SYM_PAUSE)
 		__set_bit(ETHTOOL_LINK_MODE_Pause_BIT, linkmodes);
@@ -400,7 +401,6 @@ void phylink_caps_to_linkmodes(unsigned long *linkmodes, unsigned long caps)
 		__set_bit(ETHTOOL_LINK_MODE_400000baseCR4_Full_BIT, linkmodes);
 	}
 }
-EXPORT_SYMBOL_GPL(phylink_caps_to_linkmodes);
 
 static struct {
 	unsigned long mask;
@@ -477,9 +477,9 @@ static unsigned long phylink_cap_from_speed_duplex(int speed,
  * Get the MAC capabilities that are supported by the @interface mode and
  * @mac_capabilities.
  */
-unsigned long phylink_get_capabilities(phy_interface_t interface,
-				       unsigned long mac_capabilities,
-				       int rate_matching)
+static unsigned long phylink_get_capabilities(phy_interface_t interface,
+					      unsigned long mac_capabilities,
+					      int rate_matching)
 {
 	int max_speed = phylink_interface_max_speed(interface);
 	unsigned long caps = MAC_SYM_PAUSE | MAC_ASYM_PAUSE;
@@ -606,7 +606,6 @@ unsigned long phylink_get_capabilities(phy_interface_t interface,
 
 	return (caps & mac_capabilities) | matched_caps;
 }
-EXPORT_SYMBOL_GPL(phylink_get_capabilities);
 
 /**
  * phylink_validate_mask_caps() - Restrict link modes based on caps
@@ -618,9 +617,9 @@ EXPORT_SYMBOL_GPL(phylink_get_capabilities);
  * @supported and @state based on that. Use this function if your capabiliies
  * aren't constant, such as if they vary depending on the interface.
  */
-void phylink_validate_mask_caps(unsigned long *supported,
-				struct phylink_link_state *state,
-				unsigned long mac_capabilities)
+static void phylink_validate_mask_caps(unsigned long *supported,
+				       struct phylink_link_state *state,
+				       unsigned long mac_capabilities)
 {
 	__ETHTOOL_DECLARE_LINK_MODE_MASK(mask) = { 0, };
 	unsigned long caps;
@@ -634,24 +633,6 @@ void phylink_validate_mask_caps(unsigned long *supported,
 	linkmode_and(supported, supported, mask);
 	linkmode_and(state->advertising, state->advertising, mask);
 }
-EXPORT_SYMBOL_GPL(phylink_validate_mask_caps);
-
-/**
- * phylink_generic_validate() - generic validate() callback implementation
- * @config: a pointer to a &struct phylink_config.
- * @supported: ethtool bitmask for supported link modes.
- * @state: a pointer to a &struct phylink_link_state.
- *
- * Generic implementation of the validate() callback that MAC drivers can
- * use when they pass the range of supported interfaces and MAC capabilities.
- */
-void phylink_generic_validate(struct phylink_config *config,
-			      unsigned long *supported,
-			      struct phylink_link_state *state)
-{
-	phylink_validate_mask_caps(supported, state, config->mac_capabilities);
-}
-EXPORT_SYMBOL_GPL(phylink_generic_validate);
 
 static int phylink_validate_mac_and_pcs(struct phylink *pl,
 					unsigned long *supported,
diff --git a/include/linux/phylink.h b/include/linux/phylink.h
index 0cf559bae1ff..875439ab45de 100644
--- a/include/linux/phylink.h
+++ b/include/linux/phylink.h
@@ -613,17 +613,6 @@ void pcs_link_up(struct phylink_pcs *pcs, unsigned int neg_mode,
 		 phy_interface_t interface, int speed, int duplex);
 #endif
 
-void phylink_caps_to_linkmodes(unsigned long *linkmodes, unsigned long caps);
-unsigned long phylink_get_capabilities(phy_interface_t interface,
-				       unsigned long mac_capabilities,
-				       int rate_matching);
-void phylink_validate_mask_caps(unsigned long *supported,
-				struct phylink_link_state *state,
-				unsigned long caps);
-void phylink_generic_validate(struct phylink_config *config,
-			      unsigned long *supported,
-			      struct phylink_link_state *state);
-
 struct phylink *phylink_create(struct phylink_config *,
 			       const struct fwnode_handle *,
 			       phy_interface_t,
-- 
cgit v1.2.3


From cf439721f66719c6aa73b5ea23320c2f8cba100f Mon Sep 17 00:00:00 2001
From: Alexander Usyskin <alexander.usyskin@intel.com>
Date: Wed, 11 Oct 2023 14:01:54 +0300
Subject: mei: bus: add send and recv api with timeout

Add variation of the send and recv functions on bus
that define timeout.
Caller can use such functions in flow that can stuck
to bail out and not to put down the whole system.

Signed-off-by: Alexander Usyskin <alexander.usyskin@intel.com>
Signed-off-by: Alan Previn <alan.previn.teres.alexis@intel.com>
Signed-off-by: Tomas Winkler <tomas.winkler@intel.com>
Link: https://lore.kernel.org/r/20231011110157.247552-2-tomas.winkler@intel.com
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/misc/mei/bus.c     | 88 ++++++++++++++++++++++++++++++++++++++++++++--
 include/linux/mei_cl_bus.h |  8 +++++
 2 files changed, 94 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/misc/mei/bus.c b/drivers/misc/mei/bus.c
index 2e65ce6bdec7..7f4ee9a47404 100644
--- a/drivers/misc/mei/bus.c
+++ b/drivers/misc/mei/bus.c
@@ -257,7 +257,7 @@ out:
 }
 
 /**
- * mei_cldev_send_vtag - me device send with vtag  (write)
+ * mei_cldev_send_vtag - me device send with vtag (write)
  *
  * @cldev: me client device
  * @buf: buffer to send
@@ -278,6 +278,29 @@ ssize_t mei_cldev_send_vtag(struct mei_cl_device *cldev, const u8 *buf,
 }
 EXPORT_SYMBOL_GPL(mei_cldev_send_vtag);
 
+/**
+ * mei_cldev_send_vtag_timeout - me device send with vtag and timeout (write)
+ *
+ * @cldev: me client device
+ * @buf: buffer to send
+ * @length: buffer length
+ * @vtag: virtual tag
+ * @timeout: send timeout in milliseconds, 0 for infinite timeout
+ *
+ * Return:
+ *  * written size in bytes
+ *  * < 0 on error
+ */
+
+ssize_t mei_cldev_send_vtag_timeout(struct mei_cl_device *cldev, const u8 *buf,
+				    size_t length, u8 vtag, unsigned long timeout)
+{
+	struct mei_cl *cl = cldev->cl;
+
+	return __mei_cl_send_timeout(cl, buf, length, vtag, MEI_CL_IO_TX_BLOCKING, timeout);
+}
+EXPORT_SYMBOL_GPL(mei_cldev_send_vtag_timeout);
+
 /**
  * mei_cldev_recv_vtag - client receive with vtag (read)
  *
@@ -323,7 +346,49 @@ ssize_t mei_cldev_recv_nonblock_vtag(struct mei_cl_device *cldev, u8 *buf,
 EXPORT_SYMBOL_GPL(mei_cldev_recv_nonblock_vtag);
 
 /**
- * mei_cldev_send - me device send  (write)
+ * mei_cldev_recv_timeout - client receive with timeout (read)
+ *
+ * @cldev: me client device
+ * @buf: buffer to receive
+ * @length: buffer length
+ * @timeout: send timeout in milliseconds, 0 for infinite timeout
+ *
+ * Return:
+ * * read size in bytes
+ * *  < 0 on error
+ */
+ssize_t mei_cldev_recv_timeout(struct mei_cl_device *cldev, u8 *buf, size_t length,
+			       unsigned long timeout)
+{
+	return mei_cldev_recv_vtag_timeout(cldev, buf, length, NULL, timeout);
+}
+EXPORT_SYMBOL_GPL(mei_cldev_recv_timeout);
+
+/**
+ * mei_cldev_recv_vtag_timeout - client receive with vtag (read)
+ *
+ * @cldev: me client device
+ * @buf: buffer to receive
+ * @length: buffer length
+ * @vtag: virtual tag
+ * @timeout: recv timeout in milliseconds, 0 for infinite timeout
+ *
+ * Return:
+ * * read size in bytes
+ * *  < 0 on error
+ */
+
+ssize_t mei_cldev_recv_vtag_timeout(struct mei_cl_device *cldev, u8 *buf, size_t length,
+				    u8 *vtag, unsigned long timeout)
+{
+	struct mei_cl *cl = cldev->cl;
+
+	return __mei_cl_recv(cl, buf, length, vtag, 0, timeout);
+}
+EXPORT_SYMBOL_GPL(mei_cldev_recv_vtag_timeout);
+
+/**
+ * mei_cldev_send - me device send (write)
  *
  * @cldev: me client device
  * @buf: buffer to send
@@ -339,6 +404,25 @@ ssize_t mei_cldev_send(struct mei_cl_device *cldev, const u8 *buf, size_t length
 }
 EXPORT_SYMBOL_GPL(mei_cldev_send);
 
+/**
+ * mei_cldev_send_timeout - me device send with timeout (write)
+ *
+ * @cldev: me client device
+ * @buf: buffer to send
+ * @length: buffer length
+ * @timeout: send timeout in milliseconds, 0 for infinite timeout
+ *
+ * Return:
+ *  * written size in bytes
+ *  * < 0 on error
+ */
+ssize_t mei_cldev_send_timeout(struct mei_cl_device *cldev, const u8 *buf, size_t length,
+			       unsigned long timeout)
+{
+	return mei_cldev_send_vtag_timeout(cldev, buf, length, 0, timeout);
+}
+EXPORT_SYMBOL_GPL(mei_cldev_send_timeout);
+
 /**
  * mei_cldev_recv - client receive (read)
  *
diff --git a/include/linux/mei_cl_bus.h b/include/linux/mei_cl_bus.h
index fd6e0620658d..b77cefd4f0c0 100644
--- a/include/linux/mei_cl_bus.h
+++ b/include/linux/mei_cl_bus.h
@@ -94,15 +94,23 @@ void mei_cldev_driver_unregister(struct mei_cl_driver *cldrv);
 
 ssize_t mei_cldev_send(struct mei_cl_device *cldev, const u8 *buf,
 		       size_t length);
+ssize_t mei_cldev_send_timeout(struct mei_cl_device *cldev, const u8 *buf,
+			       size_t length, unsigned long timeout);
 ssize_t mei_cldev_recv(struct mei_cl_device *cldev, u8 *buf, size_t length);
 ssize_t mei_cldev_recv_nonblock(struct mei_cl_device *cldev, u8 *buf,
 				size_t length);
+ssize_t mei_cldev_recv_timeout(struct mei_cl_device *cldev, u8 *buf, size_t length,
+			       unsigned long timeout);
 ssize_t mei_cldev_send_vtag(struct mei_cl_device *cldev, const u8 *buf,
 			    size_t length, u8 vtag);
+ssize_t mei_cldev_send_vtag_timeout(struct mei_cl_device *cldev, const u8 *buf,
+				    size_t length, u8 vtag, unsigned long timeout);
 ssize_t mei_cldev_recv_vtag(struct mei_cl_device *cldev, u8 *buf, size_t length,
 			    u8 *vtag);
 ssize_t mei_cldev_recv_nonblock_vtag(struct mei_cl_device *cldev, u8 *buf,
 				     size_t length, u8 *vtag);
+ssize_t mei_cldev_recv_vtag_timeout(struct mei_cl_device *cldev, u8 *buf, size_t length,
+			    u8 *vtag, unsigned long timeout);
 
 int mei_cldev_register_rx_cb(struct mei_cl_device *cldev, mei_cldev_cb_t rx_cb);
 int mei_cldev_register_notif_cb(struct mei_cl_device *cldev,
-- 
cgit v1.2.3


From ae4cb6bd5039a659d71632dbc2e176bf9227e294 Mon Sep 17 00:00:00 2001
From: Tomas Winkler <tomas.winkler@intel.com>
Date: Wed, 11 Oct 2023 10:43:01 +0300
Subject: mei: docs: fix spelling errors

Fix spelling errors in the mei code base.

Signed-off-by: Tomas Winkler <tomas.winkler@intel.com>
Reviewed-by: Randy Dunlap <rdunlap@infradead.org>
Link: https://lore.kernel.org/r/20231011074301.223879-4-tomas.winkler@intel.com
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/misc/mei/bus.c       | 4 ++--
 drivers/misc/mei/dma-ring.c  | 2 +-
 drivers/misc/mei/hbm.c       | 4 ++--
 drivers/misc/mei/interrupt.c | 2 +-
 drivers/misc/mei/mei_dev.h   | 4 ++--
 include/linux/mei_cl_bus.h   | 4 ++--
 6 files changed, 10 insertions(+), 10 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/misc/mei/bus.c b/drivers/misc/mei/bus.c
index 7f4ee9a47404..f9bcff197615 100644
--- a/drivers/misc/mei/bus.c
+++ b/drivers/misc/mei/bus.c
@@ -1385,7 +1385,7 @@ static inline void mei_cl_bus_set_name(struct mei_cl_device *cldev)
  * @bus: mei device
  * @me_cl: me client
  *
- * Return: allocated device structur or NULL on allocation failure
+ * Return: allocated device structure or NULL on allocation failure
  */
 static struct mei_cl_device *mei_cl_bus_dev_alloc(struct mei_device *bus,
 						  struct mei_me_client *me_cl)
@@ -1445,7 +1445,7 @@ static bool mei_cl_bus_dev_setup(struct mei_device *bus,
  *
  * @cldev: me client device
  *
- * Return: 0 on success; < 0 on failre
+ * Return: 0 on success; < 0 on failure
  */
 static int mei_cl_bus_dev_add(struct mei_cl_device *cldev)
 {
diff --git a/drivers/misc/mei/dma-ring.c b/drivers/misc/mei/dma-ring.c
index ef56f849b251..e5d800e68cb1 100644
--- a/drivers/misc/mei/dma-ring.c
+++ b/drivers/misc/mei/dma-ring.c
@@ -161,7 +161,7 @@ static size_t mei_dma_copy_to(struct mei_device *dev, unsigned char *buf,
 /**
  * mei_dma_ring_read() - read data from the ring
  * @dev: mei device
- * @buf: buffer to read into: may be NULL in case of droping the data.
+ * @buf: buffer to read into: may be NULL in case of dropping the data.
  * @len: length to read.
  */
 void mei_dma_ring_read(struct mei_device *dev, unsigned char *buf, u32 len)
diff --git a/drivers/misc/mei/hbm.c b/drivers/misc/mei/hbm.c
index 12a62a911e42..15737655c896 100644
--- a/drivers/misc/mei/hbm.c
+++ b/drivers/misc/mei/hbm.c
@@ -111,7 +111,7 @@ void mei_hbm_idle(struct mei_device *dev)
 }
 
 /**
- * mei_hbm_reset - reset hbm counters and book keeping data structurs
+ * mei_hbm_reset - reset hbm counters and book keeping data structures
  *
  * @dev: the device structure
  */
@@ -907,7 +907,7 @@ int mei_hbm_cl_disconnect_req(struct mei_device *dev, struct mei_cl *cl)
 }
 
 /**
- * mei_hbm_cl_disconnect_rsp - sends disconnect respose to the FW
+ * mei_hbm_cl_disconnect_rsp - sends disconnect response to the FW
  *
  * @dev: the device structure
  * @cl: a client to disconnect from
diff --git a/drivers/misc/mei/interrupt.c b/drivers/misc/mei/interrupt.c
index 0a0e984e5673..5a050f50f33e 100644
--- a/drivers/misc/mei/interrupt.c
+++ b/drivers/misc/mei/interrupt.c
@@ -72,7 +72,7 @@ static void mei_irq_discard_msg(struct mei_device *dev, struct mei_msg_hdr *hdr,
 		discard_len = 0;
 	}
 	/*
-	 * no need to check for size as it is guarantied
+	 * no need to check for size as it is guaranteed
 	 * that length fits into rd_msg_buf
 	 */
 	mei_read_slots(dev, dev->rd_msg_buf, discard_len);
diff --git a/drivers/misc/mei/mei_dev.h b/drivers/misc/mei/mei_dev.h
index cdf8a2edf0b3..fca0094a2310 100644
--- a/drivers/misc/mei/mei_dev.h
+++ b/drivers/misc/mei/mei_dev.h
@@ -70,9 +70,9 @@ enum mei_dev_state {
 /**
  * enum mei_dev_pxp_mode - MEI PXP mode state
  *
- * @MEI_DEV_PXP_DEFAULT: PCH based device, no initailization required
+ * @MEI_DEV_PXP_DEFAULT: PCH based device, no initialization required
  * @MEI_DEV_PXP_INIT:    device requires initialization, send setup message to firmware
- * @MEI_DEV_PXP_SETUP:   device is in setup stage, waiting for firmware repsonse
+ * @MEI_DEV_PXP_SETUP:   device is in setup stage, waiting for firmware response
  * @MEI_DEV_PXP_READY:   device initialized
  */
 enum mei_dev_pxp_mode {
diff --git a/include/linux/mei_cl_bus.h b/include/linux/mei_cl_bus.h
index b77cefd4f0c0..b38a56a13f39 100644
--- a/include/linux/mei_cl_bus.h
+++ b/include/linux/mei_cl_bus.h
@@ -31,11 +31,11 @@ typedef void (*mei_cldev_cb_t)(struct mei_cl_device *cldev);
  * @rx_work: async work to execute Rx event callback
  * @rx_cb: Drivers register this callback to get asynchronous ME
  *	Rx buffer pending notifications.
- * @notif_work: async work to execute FW notif event callback
+ * @notif_work: async work to execute FW notify event callback
  * @notif_cb: Drivers register this callback to get asynchronous ME
  *	FW notification pending notifications.
  *
- * @do_match: wheather device can be matched with a driver
+ * @do_match: whether the device can be matched with a driver
  * @is_added: device is already scanned
  * @priv_data: client private data
  */
-- 
cgit v1.2.3


From 8d8ae17eb0de1fcdff6e7ddee3b641a16eefe8f6 Mon Sep 17 00:00:00 2001
From: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Date: Mon, 16 Oct 2023 16:31:33 +0300
Subject: parport: Use kasprintf() instead of fixed buffer formatting

Improve readability and maintainability by replacing a hardcoded string
allocation and formatting by the use of the kasprintf() helper.

Signed-off-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Link: https://lore.kernel.org/r/20231016133135.1203643-2-andriy.shevchenko@linux.intel.com
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/parport/procfs.c | 53 ++++++++----------------------------------------
 drivers/parport/share.c  | 15 ++++++--------
 include/linux/parport.h  |  2 --
 3 files changed, 15 insertions(+), 55 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/parport/procfs.c b/drivers/parport/procfs.c
index 4e5b972c3e26..7aa99c65b934 100644
--- a/drivers/parport/procfs.c
+++ b/drivers/parport/procfs.c
@@ -32,13 +32,6 @@
 #define PARPORT_MAX_TIMESLICE_VALUE ((unsigned long) HZ)
 #define PARPORT_MIN_SPINTIME_VALUE 1
 #define PARPORT_MAX_SPINTIME_VALUE 1000
-/*
- * PARPORT_BASE_* is the size of the known parts of the sysctl path
- * in dev/partport/%s/devices/%s. "dev/parport/"(12), "/devices/"(9
- * and null char(1).
- */
-#define PARPORT_BASE_PATH_SIZE 13
-#define PARPORT_BASE_DEVICES_PATH_SIZE 22
 
 static int do_active_device(struct ctl_table *table, int write,
 		      void *result, size_t *lenp, loff_t *ppos)
@@ -431,8 +424,7 @@ int parport_proc_register(struct parport *port)
 {
 	struct parport_sysctl_table *t;
 	char *tmp_dir_path;
-	size_t tmp_path_len, port_name_len;
-	int bytes_written, i, err = 0;
+	int i, err = 0;
 
 	t = kmemdup(&parport_sysctl_template, sizeof(*t), GFP_KERNEL);
 	if (t == NULL)
@@ -446,35 +438,23 @@ int parport_proc_register(struct parport *port)
 		t->vars[5 + i].extra2 = &port->probe_info[i];
 	}
 
-	port_name_len = strnlen(port->name, PARPORT_NAME_MAX_LEN);
-	/*
-	 * Allocate a buffer for two paths: dev/parport/PORT and dev/parport/PORT/devices.
-	 * We calculate for the second as that will give us enough for the first.
-	 */
-	tmp_path_len = PARPORT_BASE_DEVICES_PATH_SIZE + port_name_len;
-	tmp_dir_path = kzalloc(tmp_path_len, GFP_KERNEL);
+	tmp_dir_path = kasprintf(GFP_KERNEL, "dev/parport/%s/devices", port->name);
 	if (!tmp_dir_path) {
 		err = -ENOMEM;
 		goto exit_free_t;
 	}
 
-	bytes_written = snprintf(tmp_dir_path, tmp_path_len,
-				 "dev/parport/%s/devices", port->name);
-	if (tmp_path_len <= bytes_written) {
-		err = -ENOENT;
-		goto exit_free_tmp_dir_path;
-	}
 	t->devices_header = register_sysctl(tmp_dir_path, t->device_dir);
 	if (t->devices_header == NULL) {
 		err = -ENOENT;
 		goto  exit_free_tmp_dir_path;
 	}
 
-	tmp_path_len = PARPORT_BASE_PATH_SIZE + port_name_len;
-	bytes_written = snprintf(tmp_dir_path, tmp_path_len,
-				 "dev/parport/%s", port->name);
-	if (tmp_path_len <= bytes_written) {
-		err = -ENOENT;
+	kfree(tmp_dir_path);
+
+	tmp_dir_path = kasprintf(GFP_KERNEL, "dev/parport/%s", port->name);
+	if (!tmp_dir_path) {
+		err = -ENOMEM;
 		goto unregister_devices_h;
 	}
 
@@ -514,34 +494,22 @@ int parport_proc_unregister(struct parport *port)
 
 int parport_device_proc_register(struct pardevice *device)
 {
-	int bytes_written, err = 0;
 	struct parport_device_sysctl_table *t;
 	struct parport * port = device->port;
-	size_t port_name_len, device_name_len, tmp_dir_path_len;
 	char *tmp_dir_path;
+	int err = 0;
 	
 	t = kmemdup(&parport_device_sysctl_template, sizeof(*t), GFP_KERNEL);
 	if (t == NULL)
 		return -ENOMEM;
 
-	port_name_len = strnlen(port->name, PARPORT_NAME_MAX_LEN);
-	device_name_len = strnlen(device->name, PATH_MAX);
-
 	/* Allocate a buffer for two paths: dev/parport/PORT/devices/DEVICE. */
-	tmp_dir_path_len = PARPORT_BASE_DEVICES_PATH_SIZE + port_name_len + device_name_len;
-	tmp_dir_path = kzalloc(tmp_dir_path_len, GFP_KERNEL);
+	tmp_dir_path = kasprintf(GFP_KERNEL, "dev/parport/%s/devices/%s", port->name, device->name);
 	if (!tmp_dir_path) {
 		err = -ENOMEM;
 		goto exit_free_t;
 	}
 
-	bytes_written = snprintf(tmp_dir_path, tmp_dir_path_len, "dev/parport/%s/devices/%s",
-				 port->name, device->name);
-	if (tmp_dir_path_len <= bytes_written) {
-		err = -ENOENT;
-		goto exit_free_path;
-	}
-
 	t->vars[0].data = &device->timeslice;
 
 	t->sysctl_header = register_sysctl(tmp_dir_path, t->vars);
@@ -554,9 +522,6 @@ int parport_device_proc_register(struct pardevice *device)
 	kfree(tmp_dir_path);
 	return 0;
 
-exit_free_path:
-	kfree(tmp_dir_path);
-
 exit_free_t:
 	kfree(t);
 
diff --git a/drivers/parport/share.c b/drivers/parport/share.c
index 2d46b1d4fd69..8037bcd07bcf 100644
--- a/drivers/parport/share.c
+++ b/drivers/parport/share.c
@@ -438,7 +438,6 @@ struct parport *parport_register_port(unsigned long base, int irq, int dma,
 	struct parport *tmp;
 	int num;
 	int device;
-	char *name;
 	int ret;
 
 	tmp = kzalloc(sizeof(struct parport), GFP_KERNEL);
@@ -467,11 +466,6 @@ struct parport *parport_register_port(unsigned long base, int irq, int dma,
 	atomic_set(&tmp->ref_count, 1);
 	INIT_LIST_HEAD(&tmp->full_list);
 
-	name = kmalloc(PARPORT_NAME_MAX_LEN, GFP_KERNEL);
-	if (!name) {
-		kfree(tmp);
-		return NULL;
-	}
 	/* Search for the lowest free parport number. */
 
 	spin_lock(&full_list_lock);
@@ -487,11 +481,14 @@ struct parport *parport_register_port(unsigned long base, int irq, int dma,
 	/*
 	 * Now that the portnum is known finish doing the Init.
 	 */
-	sprintf(name, "parport%d", tmp->portnum = tmp->number);
-	tmp->name = name;
+	tmp->name = kasprintf(GFP_KERNEL, "parport%d", tmp->portnum);
+	if (!tmp->name) {
+		kfree(tmp);
+		return NULL;
+	}
+	dev_set_name(&tmp->bus_dev, tmp->name);
 	tmp->bus_dev.bus = &parport_bus_type;
 	tmp->bus_dev.release = free_port;
-	dev_set_name(&tmp->bus_dev, name);
 	tmp->bus_dev.type = &parport_device_type;
 
 	for (device = 0; device < 5; device++)
diff --git a/include/linux/parport.h b/include/linux/parport.h
index 999eddd619b7..fff39bc30629 100644
--- a/include/linux/parport.h
+++ b/include/linux/parport.h
@@ -180,8 +180,6 @@ struct ieee1284_info {
 	struct semaphore irq;
 };
 
-#define PARPORT_NAME_MAX_LEN 15
-
 /* A parallel port */
 struct parport {
 	unsigned long base;	/* base address */
-- 
cgit v1.2.3


From e0d4593140b01b8da513a0c88c26da28b4906413 Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Wed, 11 Oct 2023 09:59:38 +0200
Subject: netfilter: make nftables drops visible in net dropmonitor

net_dropmonitor blames core.c:nf_hook_slow.
Add NF_DROP_REASON() helper and use it in nft_do_chain().

The helper releases the skb, so exact drop location becomes
available. Calling code will observe the NF_STOLEN verdict
instead.

Adjust nf_hook_slow so we can embed an erro value wih
NF_STOLEN verdicts, just like we do for NF_DROP.

After this, drop in nftables can be pinpointed to a drop due
to a rule or the chain policy.

Signed-off-by: Florian Westphal <fw@strlen.de>
---
 include/linux/netfilter.h      | 10 ++++++++++
 net/netfilter/core.c           |  6 +++---
 net/netfilter/nf_tables_core.c |  6 +++++-
 3 files changed, 18 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/netfilter.h b/include/linux/netfilter.h
index d68644b7c299..80900d910992 100644
--- a/include/linux/netfilter.h
+++ b/include/linux/netfilter.h
@@ -22,6 +22,16 @@ static inline int NF_DROP_GETERR(int verdict)
 	return -(verdict >> NF_VERDICT_QBITS);
 }
 
+static __always_inline int
+NF_DROP_REASON(struct sk_buff *skb, enum skb_drop_reason reason, u32 err)
+{
+	BUILD_BUG_ON(err > 0xffff);
+
+	kfree_skb_reason(skb, reason);
+
+	return ((err << 16) | NF_STOLEN);
+}
+
 static inline int nf_inet_addr_cmp(const union nf_inet_addr *a1,
 				   const union nf_inet_addr *a2)
 {
diff --git a/net/netfilter/core.c b/net/netfilter/core.c
index ef4e76e5aef9..3126911f5042 100644
--- a/net/netfilter/core.c
+++ b/net/netfilter/core.c
@@ -639,10 +639,10 @@ int nf_hook_slow(struct sk_buff *skb, struct nf_hook_state *state,
 			if (ret == 1)
 				continue;
 			return ret;
+		case NF_STOLEN:
+			return NF_DROP_GETERR(verdict);
 		default:
-			/* Implicit handling for NF_STOLEN, as well as any other
-			 * non conventional verdicts.
-			 */
+			WARN_ON_ONCE(1);
 			return 0;
 		}
 	}
diff --git a/net/netfilter/nf_tables_core.c b/net/netfilter/nf_tables_core.c
index 6009b423f60a..8b536d7ef6c2 100644
--- a/net/netfilter/nf_tables_core.c
+++ b/net/netfilter/nf_tables_core.c
@@ -308,10 +308,11 @@ next_rule:
 
 	switch (regs.verdict.code & NF_VERDICT_MASK) {
 	case NF_ACCEPT:
-	case NF_DROP:
 	case NF_QUEUE:
 	case NF_STOLEN:
 		return regs.verdict.code;
+	case NF_DROP:
+		return NF_DROP_REASON(pkt->skb, SKB_DROP_REASON_NETFILTER_DROP, EPERM);
 	}
 
 	switch (regs.verdict.code) {
@@ -342,6 +343,9 @@ next_rule:
 	if (static_branch_unlikely(&nft_counters_enabled))
 		nft_update_chain_stats(basechain, pkt);
 
+	if (nft_base_chain(basechain)->policy == NF_DROP)
+		return NF_DROP_REASON(pkt->skb, SKB_DROP_REASON_NETFILTER_DROP, EPERM);
+
 	return nft_base_chain(basechain)->policy;
 }
 EXPORT_SYMBOL_GPL(nft_do_chain);
-- 
cgit v1.2.3


From 26c5334d344db7a9bffa1f5ce87d9045b5e90383 Mon Sep 17 00:00:00 2001
From: Paul Greenwalt <paul.greenwalt@intel.com>
Date: Sun, 15 Oct 2023 19:43:02 -0400
Subject: ethtool: Add forced speed to supported link modes maps

The need to map Ethtool forced speeds to Ethtool supported link modes is
common among drivers. To support this, add a common structure for forced
speed maps and a function to init them.  This is solution was originally
introduced in commit 1d4e4ecccb11 ("qede: populate supported link modes
maps on module init") for qede driver.

ethtool_forced_speed_maps_init() should be called during driver init
with an array of struct ethtool_forced_speed_map to populate the mapping.

Definitions for maps themselves are left in the driver code, as the sets
of supported link modes may vary between the devices.

Suggested-by: Andrew Lunn <andrew@lunn.ch>
Reviewed-by: Jacob Keller <jacob.e.keller@intel.com>
Reviewed-by: Przemek Kitszel <przemyslaw.kitszel@intel.com>
Signed-off-by: Pawel Chmielewski <pawel.chmielewski@intel.com>
Signed-off-by: Paul Greenwalt <paul.greenwalt@intel.com>
Reviewed-by: Simon Horman <horms@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/ethtool.h  | 37 +++++++++++++++++++++++++++++++++++++
 include/linux/linkmode.h | 29 +++++++++++++++--------------
 2 files changed, 52 insertions(+), 14 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/ethtool.h b/include/linux/ethtool.h
index 62b61527bcc4..8e91e8b8a693 100644
--- a/include/linux/ethtool.h
+++ b/include/linux/ethtool.h
@@ -13,6 +13,7 @@
 #ifndef _LINUX_ETHTOOL_H
 #define _LINUX_ETHTOOL_H
 
+#include <linux/linkmode.h>
 #include <linux/bitmap.h>
 #include <linux/compat.h>
 #include <linux/if_ether.h>
@@ -1052,4 +1053,40 @@ static inline int ethtool_mm_frag_size_min_to_add(u32 val_min, u32 *val_add,
  * next string.
  */
 extern __printf(2, 3) void ethtool_sprintf(u8 **data, const char *fmt, ...);
+
+/* Link mode to forced speed capabilities maps */
+struct ethtool_forced_speed_map {
+	u32		speed;
+	__ETHTOOL_DECLARE_LINK_MODE_MASK(caps);
+
+	const u32	*cap_arr;
+	u32		arr_size;
+};
+
+#define ETHTOOL_FORCED_SPEED_MAP(prefix, value)				\
+{									\
+	.speed		= SPEED_##value,				\
+	.cap_arr	= prefix##_##value,				\
+	.arr_size	= ARRAY_SIZE(prefix##_##value),			\
+}
+
+/**
+ * ethtool_forced_speed_maps_init
+ * @maps: Pointer to an array of Ethtool forced speed map
+ * @size: Array size
+ *
+ * Initialize an array of Ethtool forced speed map to Ethtool link modes. This
+ * should be called during driver module init.
+ */
+static inline void
+ethtool_forced_speed_maps_init(struct ethtool_forced_speed_map *maps, u32 size)
+{
+	for (u32 i = 0; i < size; i++) {
+		struct ethtool_forced_speed_map *map = &maps[i];
+
+		linkmode_set_bit_array(map->cap_arr, map->arr_size, map->caps);
+		map->cap_arr = NULL;
+		map->arr_size = 0;
+	}
+}
 #endif /* _LINUX_ETHTOOL_H */
diff --git a/include/linux/linkmode.h b/include/linux/linkmode.h
index 15e0e0209da4..cd38f89553e6 100644
--- a/include/linux/linkmode.h
+++ b/include/linux/linkmode.h
@@ -2,6 +2,21 @@
 #define __LINKMODE_H
 
 #include <linux/bitmap.h>
+
+static inline void linkmode_set_bit(int nr, volatile unsigned long *addr)
+{
+	__set_bit(nr, addr);
+}
+
+static inline void linkmode_set_bit_array(const int *array, int array_size,
+					  unsigned long *addr)
+{
+	int i;
+
+	for (i = 0; i < array_size; i++)
+		linkmode_set_bit(array[i], addr);
+}
+
 #include <linux/ethtool.h>
 #include <uapi/linux/ethtool.h>
 
@@ -38,20 +53,6 @@ static inline int linkmode_andnot(unsigned long *dst, const unsigned long *src1,
 	return bitmap_andnot(dst, src1, src2,  __ETHTOOL_LINK_MODE_MASK_NBITS);
 }
 
-static inline void linkmode_set_bit(int nr, volatile unsigned long *addr)
-{
-	__set_bit(nr, addr);
-}
-
-static inline void linkmode_set_bit_array(const int *array, int array_size,
-					  unsigned long *addr)
-{
-	int i;
-
-	for (i = 0; i < array_size; i++)
-		linkmode_set_bit(array[i], addr);
-}
-
 static inline void linkmode_clear_bit(int nr, volatile unsigned long *addr)
 {
 	__clear_bit(nr, addr);
-- 
cgit v1.2.3


From 265f3ed077036f053981f5eea0b5b43e7c5b39ff Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <frederic@kernel.org>
Date: Sun, 24 Sep 2023 17:07:02 +0200
Subject: workqueue: Provide one lock class key per work_on_cpu() callsite

All callers of work_on_cpu() share the same lock class key for all the
functions queued. As a result the workqueue related locking scenario for
a function A may be spuriously accounted as an inversion against the
locking scenario of function B such as in the following model:

	long A(void *arg)
	{
		mutex_lock(&mutex);
		mutex_unlock(&mutex);
	}

	long B(void *arg)
	{
	}

	void launchA(void)
	{
		work_on_cpu(0, A, NULL);
	}

	void launchB(void)
	{
		mutex_lock(&mutex);
		work_on_cpu(1, B, NULL);
		mutex_unlock(&mutex);
	}

launchA and launchB running concurrently have no chance to deadlock.
However the above can be reported by lockdep as a possible locking
inversion because the works containing A() and B() are treated as
belonging to the same locking class.

The following shows an existing example of such a spurious lockdep splat:

	 ======================================================
	 WARNING: possible circular locking dependency detected
	 6.6.0-rc1-00065-g934ebd6e5359 #35409 Not tainted
	 ------------------------------------------------------
	 kworker/0:1/9 is trying to acquire lock:
	 ffffffff9bc72f30 (cpu_hotplug_lock){++++}-{0:0}, at: _cpu_down+0x57/0x2b0

	 but task is already holding lock:
	 ffff9e3bc0057e60 ((work_completion)(&wfc.work)){+.+.}-{0:0}, at: process_scheduled_works+0x216/0x500

	 which lock already depends on the new lock.

	 the existing dependency chain (in reverse order) is:

	 -> #2 ((work_completion)(&wfc.work)){+.+.}-{0:0}:
			__flush_work+0x83/0x4e0
			work_on_cpu+0x97/0xc0
			rcu_nocb_cpu_offload+0x62/0xb0
			rcu_nocb_toggle+0xd0/0x1d0
			kthread+0xe6/0x120
			ret_from_fork+0x2f/0x40
			ret_from_fork_asm+0x1b/0x30

	 -> #1 (rcu_state.barrier_mutex){+.+.}-{3:3}:
			__mutex_lock+0x81/0xc80
			rcu_nocb_cpu_deoffload+0x38/0xb0
			rcu_nocb_toggle+0x144/0x1d0
			kthread+0xe6/0x120
			ret_from_fork+0x2f/0x40
			ret_from_fork_asm+0x1b/0x30

	 -> #0 (cpu_hotplug_lock){++++}-{0:0}:
			__lock_acquire+0x1538/0x2500
			lock_acquire+0xbf/0x2a0
			percpu_down_write+0x31/0x200
			_cpu_down+0x57/0x2b0
			__cpu_down_maps_locked+0x10/0x20
			work_for_cpu_fn+0x15/0x20
			process_scheduled_works+0x2a7/0x500
			worker_thread+0x173/0x330
			kthread+0xe6/0x120
			ret_from_fork+0x2f/0x40
			ret_from_fork_asm+0x1b/0x30

	 other info that might help us debug this:

	 Chain exists of:
	   cpu_hotplug_lock --> rcu_state.barrier_mutex --> (work_completion)(&wfc.work)

	  Possible unsafe locking scenario:

			CPU0                    CPU1
			----                    ----
	   lock((work_completion)(&wfc.work));
									lock(rcu_state.barrier_mutex);
									lock((work_completion)(&wfc.work));
	   lock(cpu_hotplug_lock);

	  *** DEADLOCK ***

	 2 locks held by kworker/0:1/9:
	  #0: ffff900481068b38 ((wq_completion)events){+.+.}-{0:0}, at: process_scheduled_works+0x212/0x500
	  #1: ffff9e3bc0057e60 ((work_completion)(&wfc.work)){+.+.}-{0:0}, at: process_scheduled_works+0x216/0x500

	 stack backtrace:
	 CPU: 0 PID: 9 Comm: kworker/0:1 Not tainted 6.6.0-rc1-00065-g934ebd6e5359 #35409
	 Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS rel-1.12.0-59-gc9ba5276e321-prebuilt.qemu.org 04/01/2014
	 Workqueue: events work_for_cpu_fn
	 Call Trace:
	 rcu-torture: rcu_torture_read_exit: Start of episode
	  <TASK>
	  dump_stack_lvl+0x4a/0x80
	  check_noncircular+0x132/0x150
	  __lock_acquire+0x1538/0x2500
	  lock_acquire+0xbf/0x2a0
	  ? _cpu_down+0x57/0x2b0
	  percpu_down_write+0x31/0x200
	  ? _cpu_down+0x57/0x2b0
	  _cpu_down+0x57/0x2b0
	  __cpu_down_maps_locked+0x10/0x20
	  work_for_cpu_fn+0x15/0x20
	  process_scheduled_works+0x2a7/0x500
	  worker_thread+0x173/0x330
	  ? __pfx_worker_thread+0x10/0x10
	  kthread+0xe6/0x120
	  ? __pfx_kthread+0x10/0x10
	  ret_from_fork+0x2f/0x40
	  ? __pfx_kthread+0x10/0x10
	  ret_from_fork_asm+0x1b/0x30
	  </TASK

Fix this with providing one lock class key per work_on_cpu() caller.

Reported-and-tested-by: Paul E. McKenney <paulmck@kernel.org>
Signed-off-by: Frederic Weisbecker <frederic@kernel.org>
Signed-off-by: Tejun Heo <tj@kernel.org>
---
 include/linux/workqueue.h | 46 +++++++++++++++++++++++++++++++++++++++-------
 kernel/workqueue.c        | 20 ++++++++++++--------
 2 files changed, 51 insertions(+), 15 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h
index 1c1d06804d45..24b1e5070f4d 100644
--- a/include/linux/workqueue.h
+++ b/include/linux/workqueue.h
@@ -274,18 +274,16 @@ static inline unsigned int work_static(struct work_struct *work) { return 0; }
  * to generate better code.
  */
 #ifdef CONFIG_LOCKDEP
-#define __INIT_WORK(_work, _func, _onstack)				\
+#define __INIT_WORK_KEY(_work, _func, _onstack, _key)			\
 	do {								\
-		static struct lock_class_key __key;			\
-									\
 		__init_work((_work), _onstack);				\
 		(_work)->data = (atomic_long_t) WORK_DATA_INIT();	\
-		lockdep_init_map(&(_work)->lockdep_map, "(work_completion)"#_work, &__key, 0); \
+		lockdep_init_map(&(_work)->lockdep_map, "(work_completion)"#_work, (_key), 0); \
 		INIT_LIST_HEAD(&(_work)->entry);			\
 		(_work)->func = (_func);				\
 	} while (0)
 #else
-#define __INIT_WORK(_work, _func, _onstack)				\
+#define __INIT_WORK_KEY(_work, _func, _onstack, _key)			\
 	do {								\
 		__init_work((_work), _onstack);				\
 		(_work)->data = (atomic_long_t) WORK_DATA_INIT();	\
@@ -294,12 +292,22 @@ static inline unsigned int work_static(struct work_struct *work) { return 0; }
 	} while (0)
 #endif
 
+#define __INIT_WORK(_work, _func, _onstack)				\
+	do {								\
+		static __maybe_unused struct lock_class_key __key;	\
+									\
+		__INIT_WORK_KEY(_work, _func, _onstack, &__key);	\
+	} while (0)
+
 #define INIT_WORK(_work, _func)						\
 	__INIT_WORK((_work), (_func), 0)
 
 #define INIT_WORK_ONSTACK(_work, _func)					\
 	__INIT_WORK((_work), (_func), 1)
 
+#define INIT_WORK_ONSTACK_KEY(_work, _func, _key)			\
+	__INIT_WORK_KEY((_work), (_func), 1, _key)
+
 #define __INIT_DELAYED_WORK(_work, _func, _tflags)			\
 	do {								\
 		INIT_WORK(&(_work)->work, (_func));			\
@@ -693,8 +701,32 @@ static inline long work_on_cpu_safe(int cpu, long (*fn)(void *), void *arg)
 	return fn(arg);
 }
 #else
-long work_on_cpu(int cpu, long (*fn)(void *), void *arg);
-long work_on_cpu_safe(int cpu, long (*fn)(void *), void *arg);
+long work_on_cpu_key(int cpu, long (*fn)(void *),
+		     void *arg, struct lock_class_key *key);
+/*
+ * A new key is defined for each caller to make sure the work
+ * associated with the function doesn't share its locking class.
+ */
+#define work_on_cpu(_cpu, _fn, _arg)			\
+({							\
+	static struct lock_class_key __key;		\
+							\
+	work_on_cpu_key(_cpu, _fn, _arg, &__key);	\
+})
+
+long work_on_cpu_safe_key(int cpu, long (*fn)(void *),
+			  void *arg, struct lock_class_key *key);
+
+/*
+ * A new key is defined for each caller to make sure the work
+ * associated with the function doesn't share its locking class.
+ */
+#define work_on_cpu_safe(_cpu, _fn, _arg)		\
+({							\
+	static struct lock_class_key __key;		\
+							\
+	work_on_cpu_safe_key(_cpu, _fn, _arg, &__key);	\
+})
 #endif /* CONFIG_SMP */
 
 #ifdef CONFIG_FREEZER
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index a3522b70218d..0f682da96e1c 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -5622,50 +5622,54 @@ static void work_for_cpu_fn(struct work_struct *work)
 }
 
 /**
- * work_on_cpu - run a function in thread context on a particular cpu
+ * work_on_cpu_key - run a function in thread context on a particular cpu
  * @cpu: the cpu to run on
  * @fn: the function to run
  * @arg: the function arg
+ * @key: The lock class key for lock debugging purposes
  *
  * It is up to the caller to ensure that the cpu doesn't go offline.
  * The caller must not hold any locks which would prevent @fn from completing.
  *
  * Return: The value @fn returns.
  */
-long work_on_cpu(int cpu, long (*fn)(void *), void *arg)
+long work_on_cpu_key(int cpu, long (*fn)(void *),
+		     void *arg, struct lock_class_key *key)
 {
 	struct work_for_cpu wfc = { .fn = fn, .arg = arg };
 
-	INIT_WORK_ONSTACK(&wfc.work, work_for_cpu_fn);
+	INIT_WORK_ONSTACK_KEY(&wfc.work, work_for_cpu_fn, key);
 	schedule_work_on(cpu, &wfc.work);
 	flush_work(&wfc.work);
 	destroy_work_on_stack(&wfc.work);
 	return wfc.ret;
 }
-EXPORT_SYMBOL_GPL(work_on_cpu);
+EXPORT_SYMBOL_GPL(work_on_cpu_key);
 
 /**
- * work_on_cpu_safe - run a function in thread context on a particular cpu
+ * work_on_cpu_safe_key - run a function in thread context on a particular cpu
  * @cpu: the cpu to run on
  * @fn:  the function to run
  * @arg: the function argument
+ * @key: The lock class key for lock debugging purposes
  *
  * Disables CPU hotplug and calls work_on_cpu(). The caller must not hold
  * any locks which would prevent @fn from completing.
  *
  * Return: The value @fn returns.
  */
-long work_on_cpu_safe(int cpu, long (*fn)(void *), void *arg)
+long work_on_cpu_safe_key(int cpu, long (*fn)(void *),
+			  void *arg, struct lock_class_key *key)
 {
 	long ret = -ENODEV;
 
 	cpus_read_lock();
 	if (cpu_online(cpu))
-		ret = work_on_cpu(cpu, fn, arg);
+		ret = work_on_cpu_key(cpu, fn, arg, key);
 	cpus_read_unlock();
 	return ret;
 }
-EXPORT_SYMBOL_GPL(work_on_cpu_safe);
+EXPORT_SYMBOL_GPL(work_on_cpu_safe_key);
 #endif /* CONFIG_SMP */
 
 #ifdef CONFIG_FREEZER
-- 
cgit v1.2.3


From 077c212f0344ae4198b2b51af128a94b614ccdf4 Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@kernel.org>
Date: Wed, 4 Oct 2023 14:52:37 -0400
Subject: fs: new accessor methods for atime and mtime

Recently, we converted the ctime accesses in the kernel to use new
accessor functions. Linus recently pointed out though that if we add
accessors for the atime and mtime, then that would allow us to
seamlessly change how these timestamps are stored in the inode.

Add new accessor functions for the atime and mtime that mirror the
accessors for the ctime.

Signed-off-by: Jeff Layton <jlayton@kernel.org>
Link: https://lore.kernel.org/r/20231004185239.80830-1-jlayton@kernel.org
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 fs/libfs.c         | 41 +++++++++++++++++++-------
 include/linux/fs.h | 85 +++++++++++++++++++++++++++++++++++++++++++++---------
 2 files changed, 102 insertions(+), 24 deletions(-)

(limited to 'include/linux')

diff --git a/fs/libfs.c b/fs/libfs.c
index 37f2d34ee090..abe2b5a40ba1 100644
--- a/fs/libfs.c
+++ b/fs/libfs.c
@@ -541,7 +541,8 @@ void simple_recursive_removal(struct dentry *dentry,
 				dput(victim);		// unpin it
 			}
 			if (victim == dentry) {
-				inode->i_mtime = inode_set_ctime_current(inode);
+				inode_set_mtime_to_ts(inode,
+						      inode_set_ctime_current(inode));
 				if (d_is_dir(dentry))
 					drop_nlink(inode);
 				inode_unlock(inode);
@@ -582,7 +583,7 @@ static int pseudo_fs_fill_super(struct super_block *s, struct fs_context *fc)
 	 */
 	root->i_ino = 1;
 	root->i_mode = S_IFDIR | S_IRUSR | S_IWUSR;
-	root->i_atime = root->i_mtime = inode_set_ctime_current(root);
+	simple_inode_init_ts(root);
 	s->s_root = d_make_root(root);
 	if (!s->s_root)
 		return -ENOMEM;
@@ -638,8 +639,8 @@ int simple_link(struct dentry *old_dentry, struct inode *dir, struct dentry *den
 {
 	struct inode *inode = d_inode(old_dentry);
 
-	dir->i_mtime = inode_set_ctime_to_ts(dir,
-					     inode_set_ctime_current(inode));
+	inode_set_mtime_to_ts(dir,
+			      inode_set_ctime_to_ts(dir, inode_set_ctime_current(inode)));
 	inc_nlink(inode);
 	ihold(inode);
 	dget(dentry);
@@ -673,8 +674,8 @@ int simple_unlink(struct inode *dir, struct dentry *dentry)
 {
 	struct inode *inode = d_inode(dentry);
 
-	dir->i_mtime = inode_set_ctime_to_ts(dir,
-					     inode_set_ctime_current(inode));
+	inode_set_mtime_to_ts(dir,
+			      inode_set_ctime_to_ts(dir, inode_set_ctime_current(inode)));
 	drop_nlink(inode);
 	dput(dentry);
 	return 0;
@@ -709,9 +710,10 @@ void simple_rename_timestamp(struct inode *old_dir, struct dentry *old_dentry,
 {
 	struct inode *newino = d_inode(new_dentry);
 
-	old_dir->i_mtime = inode_set_ctime_current(old_dir);
+	inode_set_mtime_to_ts(old_dir, inode_set_ctime_current(old_dir));
 	if (new_dir != old_dir)
-		new_dir->i_mtime = inode_set_ctime_current(new_dir);
+		inode_set_mtime_to_ts(new_dir,
+				      inode_set_ctime_current(new_dir));
 	inode_set_ctime_current(d_inode(old_dentry));
 	if (newino)
 		inode_set_ctime_current(newino);
@@ -926,7 +928,7 @@ int simple_fill_super(struct super_block *s, unsigned long magic,
 	 */
 	inode->i_ino = 1;
 	inode->i_mode = S_IFDIR | 0755;
-	inode->i_atime = inode->i_mtime = inode_set_ctime_current(inode);
+	simple_inode_init_ts(inode);
 	inode->i_op = &simple_dir_inode_operations;
 	inode->i_fop = &simple_dir_operations;
 	set_nlink(inode, 2);
@@ -952,7 +954,7 @@ int simple_fill_super(struct super_block *s, unsigned long magic,
 			goto out;
 		}
 		inode->i_mode = S_IFREG | files->mode;
-		inode->i_atime = inode->i_mtime = inode_set_ctime_current(inode);
+		simple_inode_init_ts(inode);
 		inode->i_fop = files->ops;
 		inode->i_ino = i;
 		d_add(dentry, inode);
@@ -1520,7 +1522,7 @@ struct inode *alloc_anon_inode(struct super_block *s)
 	inode->i_uid = current_fsuid();
 	inode->i_gid = current_fsgid();
 	inode->i_flags |= S_PRIVATE;
-	inode->i_atime = inode->i_mtime = inode_set_ctime_current(inode);
+	simple_inode_init_ts(inode);
 	return inode;
 }
 EXPORT_SYMBOL(alloc_anon_inode);
@@ -1912,3 +1914,20 @@ ssize_t direct_write_fallback(struct kiocb *iocb, struct iov_iter *iter,
 	return direct_written + buffered_written;
 }
 EXPORT_SYMBOL_GPL(direct_write_fallback);
+
+/**
+ * simple_inode_init_ts - initialize the timestamps for a new inode
+ * @inode: inode to be initialized
+ *
+ * When a new inode is created, most filesystems set the timestamps to the
+ * current time. Add a helper to do this.
+ */
+struct timespec64 simple_inode_init_ts(struct inode *inode)
+{
+	struct timespec64 ts = inode_set_ctime_current(inode);
+
+	inode_set_atime_to_ts(inode, ts);
+	inode_set_mtime_to_ts(inode, ts);
+	return ts;
+}
+EXPORT_SYMBOL(simple_inode_init_ts);
diff --git a/include/linux/fs.h b/include/linux/fs.h
index b528f063e8ff..79cc11139115 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1511,24 +1511,81 @@ static inline bool fsuidgid_has_mapping(struct super_block *sb,
 struct timespec64 current_time(struct inode *inode);
 struct timespec64 inode_set_ctime_current(struct inode *inode);
 
-/**
- * inode_get_ctime - fetch the current ctime from the inode
- * @inode: inode from which to fetch ctime
- *
- * Grab the current ctime from the inode and return it.
- */
+static inline time64_t inode_get_atime_sec(const struct inode *inode)
+{
+	return inode->i_atime.tv_sec;
+}
+
+static inline long inode_get_atime_nsec(const struct inode *inode)
+{
+	return inode->i_atime.tv_nsec;
+}
+
+static inline struct timespec64 inode_get_atime(const struct inode *inode)
+{
+	return inode->i_atime;
+}
+
+static inline struct timespec64 inode_set_atime_to_ts(struct inode *inode,
+						      struct timespec64 ts)
+{
+	inode->i_atime = ts;
+	return ts;
+}
+
+static inline struct timespec64 inode_set_atime(struct inode *inode,
+						time64_t sec, long nsec)
+{
+	struct timespec64 ts = { .tv_sec  = sec,
+				 .tv_nsec = nsec };
+	return inode_set_atime_to_ts(inode, ts);
+}
+
+static inline time64_t inode_get_mtime_sec(const struct inode *inode)
+{
+	return inode->i_mtime.tv_sec;
+}
+
+static inline long inode_get_mtime_nsec(const struct inode *inode)
+{
+	return inode->i_mtime.tv_nsec;
+}
+
+static inline struct timespec64 inode_get_mtime(const struct inode *inode)
+{
+	return inode->i_mtime;
+}
+
+static inline struct timespec64 inode_set_mtime_to_ts(struct inode *inode,
+						      struct timespec64 ts)
+{
+	inode->i_mtime = ts;
+	return ts;
+}
+
+static inline struct timespec64 inode_set_mtime(struct inode *inode,
+						time64_t sec, long nsec)
+{
+	struct timespec64 ts = { .tv_sec  = sec,
+				 .tv_nsec = nsec };
+	return inode_set_mtime_to_ts(inode, ts);
+}
+
+static inline time64_t inode_get_ctime_sec(const struct inode *inode)
+{
+	return inode->__i_ctime.tv_sec;
+}
+
+static inline long inode_get_ctime_nsec(const struct inode *inode)
+{
+	return inode->__i_ctime.tv_nsec;
+}
+
 static inline struct timespec64 inode_get_ctime(const struct inode *inode)
 {
 	return inode->__i_ctime;
 }
 
-/**
- * inode_set_ctime_to_ts - set the ctime in the inode
- * @inode: inode in which to set the ctime
- * @ts: value to set in the ctime field
- *
- * Set the ctime in @inode to @ts
- */
 static inline struct timespec64 inode_set_ctime_to_ts(struct inode *inode,
 						      struct timespec64 ts)
 {
@@ -1553,6 +1610,8 @@ static inline struct timespec64 inode_set_ctime(struct inode *inode,
 	return inode_set_ctime_to_ts(inode, ts);
 }
 
+struct timespec64 simple_inode_init_ts(struct inode *inode);
+
 /*
  * Snapshotting support.
  */
-- 
cgit v1.2.3


From 1f693269cbb4cd9cd1044299c14b2464590a5f28 Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@kernel.org>
Date: Wed, 4 Oct 2023 14:53:04 -0400
Subject: linux: convert to new timestamp accessors

Convert to using the new inode timestamp accessor functions.

Signed-off-by: Jeff Layton <jlayton@kernel.org>
Link: https://lore.kernel.org/r/20231004185347.80880-77-jlayton@kernel.org
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 include/linux/fs_stack.h | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/fs_stack.h b/include/linux/fs_stack.h
index 010d39d0dc1c..2b1f74b24070 100644
--- a/include/linux/fs_stack.h
+++ b/include/linux/fs_stack.h
@@ -16,14 +16,14 @@ extern void fsstack_copy_inode_size(struct inode *dst, struct inode *src);
 static inline void fsstack_copy_attr_atime(struct inode *dest,
 					   const struct inode *src)
 {
-	dest->i_atime = src->i_atime;
+	inode_set_atime_to_ts(dest, inode_get_atime(src));
 }
 
 static inline void fsstack_copy_attr_times(struct inode *dest,
 					   const struct inode *src)
 {
-	dest->i_atime = src->i_atime;
-	dest->i_mtime = src->i_mtime;
+	inode_set_atime_to_ts(dest, inode_get_atime(src));
+	inode_set_mtime_to_ts(dest, inode_get_mtime(src));
 	inode_set_ctime_to_ts(dest, inode_get_ctime(src));
 }
 
-- 
cgit v1.2.3


From 12cd44023651666bd44baa36a5c999698890debb Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@kernel.org>
Date: Fri, 29 Sep 2023 09:05:52 -0400
Subject: fs: rename inode i_atime and i_mtime fields

Rename these two fields to discourage direct access (and to help ensure
that we mop up any leftover direct accesses).

Signed-off-by: Jeff Layton <jlayton@kernel.org>
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 include/linux/fs.h | 20 ++++++++++----------
 1 file changed, 10 insertions(+), 10 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/fs.h b/include/linux/fs.h
index 79cc11139115..cc29518517f7 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -671,8 +671,8 @@ struct inode {
 	};
 	dev_t			i_rdev;
 	loff_t			i_size;
-	struct timespec64	i_atime;
-	struct timespec64	i_mtime;
+	struct timespec64	__i_atime;
+	struct timespec64	__i_mtime;
 	struct timespec64	__i_ctime; /* use inode_*_ctime accessors! */
 	spinlock_t		i_lock;	/* i_blocks, i_bytes, maybe i_size */
 	unsigned short          i_bytes;
@@ -1513,23 +1513,23 @@ struct timespec64 inode_set_ctime_current(struct inode *inode);
 
 static inline time64_t inode_get_atime_sec(const struct inode *inode)
 {
-	return inode->i_atime.tv_sec;
+	return inode->__i_atime.tv_sec;
 }
 
 static inline long inode_get_atime_nsec(const struct inode *inode)
 {
-	return inode->i_atime.tv_nsec;
+	return inode->__i_atime.tv_nsec;
 }
 
 static inline struct timespec64 inode_get_atime(const struct inode *inode)
 {
-	return inode->i_atime;
+	return inode->__i_atime;
 }
 
 static inline struct timespec64 inode_set_atime_to_ts(struct inode *inode,
 						      struct timespec64 ts)
 {
-	inode->i_atime = ts;
+	inode->__i_atime = ts;
 	return ts;
 }
 
@@ -1543,23 +1543,23 @@ static inline struct timespec64 inode_set_atime(struct inode *inode,
 
 static inline time64_t inode_get_mtime_sec(const struct inode *inode)
 {
-	return inode->i_mtime.tv_sec;
+	return inode->__i_mtime.tv_sec;
 }
 
 static inline long inode_get_mtime_nsec(const struct inode *inode)
 {
-	return inode->i_mtime.tv_nsec;
+	return inode->__i_mtime.tv_nsec;
 }
 
 static inline struct timespec64 inode_get_mtime(const struct inode *inode)
 {
-	return inode->i_mtime;
+	return inode->__i_mtime;
 }
 
 static inline struct timespec64 inode_set_mtime_to_ts(struct inode *inode,
 						      struct timespec64 ts)
 {
-	inode->i_mtime = ts;
+	inode->__i_mtime = ts;
 	return ts;
 }
 
-- 
cgit v1.2.3


From b4edb8d2d4647a71a246d91fe34ff25c2c5f3481 Mon Sep 17 00:00:00 2001
From: "wuqiang.matt" <wuqiang.matt@bytedance.com>
Date: Tue, 17 Oct 2023 21:56:50 +0800
Subject: lib: objpool added: ring-array based lockless MPMC

objpool is a scalable implementation of high performance queue for
object allocation and reclamation, such as kretprobe instances.

With leveraging percpu ring-array to mitigate hot spots of memory
contention, it delivers near-linear scalability for high parallel
scenarios. The objpool is best suited for the following cases:
1) Memory allocation or reclamation are prohibited or too expensive
2) Consumers are of different priorities, such as irqs and threads

Limitations:
1) Maximum objects (capacity) is fixed after objpool creation
2) All pre-allocated objects are managed in percpu ring array,
   which consumes more memory than linked lists

Link: https://lore.kernel.org/all/20231017135654.82270-2-wuqiang.matt@bytedance.com/

Signed-off-by: wuqiang.matt <wuqiang.matt@bytedance.com>
Acked-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>
Signed-off-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>
---
 include/linux/objpool.h | 181 +++++++++++++++++++++++++++++++
 lib/Makefile            |   2 +-
 lib/objpool.c           | 280 ++++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 462 insertions(+), 1 deletion(-)
 create mode 100644 include/linux/objpool.h
 create mode 100644 lib/objpool.c

(limited to 'include/linux')

diff --git a/include/linux/objpool.h b/include/linux/objpool.h
new file mode 100644
index 000000000000..15aff4a17f0c
--- /dev/null
+++ b/include/linux/objpool.h
@@ -0,0 +1,181 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#ifndef _LINUX_OBJPOOL_H
+#define _LINUX_OBJPOOL_H
+
+#include <linux/types.h>
+#include <linux/refcount.h>
+
+/*
+ * objpool: ring-array based lockless MPMC queue
+ *
+ * Copyright: wuqiang.matt@bytedance.com,mhiramat@kernel.org
+ *
+ * objpool is a scalable implementation of high performance queue for
+ * object allocation and reclamation, such as kretprobe instances.
+ *
+ * With leveraging percpu ring-array to mitigate hot spots of memory
+ * contention, it delivers near-linear scalability for high parallel
+ * scenarios. The objpool is best suited for the following cases:
+ * 1) Memory allocation or reclamation are prohibited or too expensive
+ * 2) Consumers are of different priorities, such as irqs and threads
+ *
+ * Limitations:
+ * 1) Maximum objects (capacity) is fixed after objpool creation
+ * 2) All pre-allocated objects are managed in percpu ring array,
+ *    which consumes more memory than linked lists
+ */
+
+/**
+ * struct objpool_slot - percpu ring array of objpool
+ * @head: head sequence of the local ring array (to retrieve at)
+ * @tail: tail sequence of the local ring array (to append at)
+ * @last: the last sequence number marked as ready for retrieve
+ * @mask: bits mask for modulo capacity to compute array indexes
+ * @entries: object entries on this slot
+ *
+ * Represents a cpu-local array-based ring buffer, its size is specialized
+ * during initialization of object pool. The percpu objpool node is to be
+ * allocated from local memory for NUMA system, and to be kept compact in
+ * continuous memory: CPU assigned number of objects are stored just after
+ * the body of objpool_node.
+ *
+ * Real size of the ring array is far too smaller than the value range of
+ * head and tail, typed as uint32_t: [0, 2^32), so only lower bits (mask)
+ * of head and tail are used as the actual position in the ring array. In
+ * general the ring array is acting like a small sliding window, which is
+ * always moving forward in the loop of [0, 2^32).
+ */
+struct objpool_slot {
+	uint32_t            head;
+	uint32_t            tail;
+	uint32_t            last;
+	uint32_t            mask;
+	void               *entries[];
+} __packed;
+
+struct objpool_head;
+
+/*
+ * caller-specified callback for object initial setup, it's only called
+ * once for each object (just after the memory allocation of the object)
+ */
+typedef int (*objpool_init_obj_cb)(void *obj, void *context);
+
+/* caller-specified cleanup callback for objpool destruction */
+typedef int (*objpool_fini_cb)(struct objpool_head *head, void *context);
+
+/**
+ * struct objpool_head - object pooling metadata
+ * @obj_size:   object size, aligned to sizeof(void *)
+ * @nr_objs:    total objs (to be pre-allocated with objpool)
+ * @nr_cpus:    local copy of nr_cpu_ids
+ * @capacity:   max objs can be managed by one objpool_slot
+ * @gfp:        gfp flags for kmalloc & vmalloc
+ * @ref:        refcount of objpool
+ * @flags:      flags for objpool management
+ * @cpu_slots:  pointer to the array of objpool_slot
+ * @release:    resource cleanup callback
+ * @context:    caller-provided context
+ */
+struct objpool_head {
+	int                     obj_size;
+	int                     nr_objs;
+	int                     nr_cpus;
+	int                     capacity;
+	gfp_t                   gfp;
+	refcount_t              ref;
+	unsigned long           flags;
+	struct objpool_slot   **cpu_slots;
+	objpool_fini_cb         release;
+	void                   *context;
+};
+
+#define OBJPOOL_NR_OBJECT_MAX	(1UL << 24) /* maximum numbers of total objects */
+#define OBJPOOL_OBJECT_SIZE_MAX	(1UL << 16) /* maximum size of an object */
+
+/**
+ * objpool_init() - initialize objpool and pre-allocated objects
+ * @pool:    the object pool to be initialized, declared by caller
+ * @nr_objs: total objects to be pre-allocated by this object pool
+ * @object_size: size of an object (should be > 0)
+ * @gfp:     flags for memory allocation (via kmalloc or vmalloc)
+ * @context: user context for object initialization callback
+ * @objinit: object initialization callback for extra setup
+ * @release: cleanup callback for extra cleanup task
+ *
+ * return value: 0 for success, otherwise error code
+ *
+ * All pre-allocated objects are to be zeroed after memory allocation.
+ * Caller could do extra initialization in objinit callback. objinit()
+ * will be called just after slot allocation and called only once for
+ * each object. After that the objpool won't touch any content of the
+ * objects. It's caller's duty to perform reinitialization after each
+ * pop (object allocation) or do clearance before each push (object
+ * reclamation).
+ */
+int objpool_init(struct objpool_head *pool, int nr_objs, int object_size,
+		 gfp_t gfp, void *context, objpool_init_obj_cb objinit,
+		 objpool_fini_cb release);
+
+/**
+ * objpool_pop() - allocate an object from objpool
+ * @pool: object pool
+ *
+ * return value: object ptr or NULL if failed
+ */
+void *objpool_pop(struct objpool_head *pool);
+
+/**
+ * objpool_push() - reclaim the object and return back to objpool
+ * @obj:  object ptr to be pushed to objpool
+ * @pool: object pool
+ *
+ * return: 0 or error code (it fails only when user tries to push
+ * the same object multiple times or wrong "objects" into objpool)
+ */
+int objpool_push(void *obj, struct objpool_head *pool);
+
+/**
+ * objpool_drop() - discard the object and deref objpool
+ * @obj:  object ptr to be discarded
+ * @pool: object pool
+ *
+ * return: 0 if objpool was released; -EAGAIN if there are still
+ *         outstanding objects
+ *
+ * objpool_drop is normally for the release of outstanding objects
+ * after objpool cleanup (objpool_fini). Thinking of this example:
+ * kretprobe is unregistered and objpool_fini() is called to release
+ * all remained objects, but there are still objects being used by
+ * unfinished kretprobes (like blockable function: sys_accept). So
+ * only when the last outstanding object is dropped could the whole
+ * objpool be released along with the call of objpool_drop()
+ */
+int objpool_drop(void *obj, struct objpool_head *pool);
+
+/**
+ * objpool_free() - release objpool forcely (all objects to be freed)
+ * @pool: object pool to be released
+ */
+void objpool_free(struct objpool_head *pool);
+
+/**
+ * objpool_fini() - deref object pool (also releasing unused objects)
+ * @pool: object pool to be dereferenced
+ *
+ * objpool_fini() will try to release all remained free objects and
+ * then drop an extra reference of the objpool. If all objects are
+ * already returned to objpool (so called synchronous use cases),
+ * the objpool itself will be freed together. But if there are still
+ * outstanding objects (so called asynchronous use cases, such like
+ * blockable kretprobe), the objpool won't be released until all
+ * the outstanding objects are dropped, but the caller must assure
+ * there are no concurrent objpool_push() on the fly. Normally RCU
+ * is being required to make sure all ongoing objpool_push() must
+ * be finished before calling objpool_fini(), so does test_objpool,
+ * kretprobe or rethook
+ */
+void objpool_fini(struct objpool_head *pool);
+
+#endif /* _LINUX_OBJPOOL_H */
diff --git a/lib/Makefile b/lib/Makefile
index 740109b6e2c8..d0385909de8d 100644
--- a/lib/Makefile
+++ b/lib/Makefile
@@ -34,7 +34,7 @@ lib-y := ctype.o string.o vsprintf.o cmdline.o \
 	 is_single_threaded.o plist.o decompress.o kobject_uevent.o \
 	 earlycpio.o seq_buf.o siphash.o dec_and_lock.o \
 	 nmi_backtrace.o win_minmax.o memcat_p.o \
-	 buildid.o
+	 buildid.o objpool.o
 
 lib-$(CONFIG_PRINTK) += dump_stack.o
 lib-$(CONFIG_SMP) += cpumask.o
diff --git a/lib/objpool.c b/lib/objpool.c
new file mode 100644
index 000000000000..ce0087f64400
--- /dev/null
+++ b/lib/objpool.c
@@ -0,0 +1,280 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include <linux/objpool.h>
+#include <linux/slab.h>
+#include <linux/vmalloc.h>
+#include <linux/atomic.h>
+#include <linux/irqflags.h>
+#include <linux/cpumask.h>
+#include <linux/log2.h>
+
+/*
+ * objpool: ring-array based lockless MPMC/FIFO queues
+ *
+ * Copyright: wuqiang.matt@bytedance.com,mhiramat@kernel.org
+ */
+
+/* initialize percpu objpool_slot */
+static int
+objpool_init_percpu_slot(struct objpool_head *pool,
+			 struct objpool_slot *slot,
+			 int nodes, void *context,
+			 objpool_init_obj_cb objinit)
+{
+	void *obj = (void *)&slot->entries[pool->capacity];
+	int i;
+
+	/* initialize elements of percpu objpool_slot */
+	slot->mask = pool->capacity - 1;
+
+	for (i = 0; i < nodes; i++) {
+		if (objinit) {
+			int rc = objinit(obj, context);
+			if (rc)
+				return rc;
+		}
+		slot->entries[slot->tail & slot->mask] = obj;
+		obj = obj + pool->obj_size;
+		slot->tail++;
+		slot->last = slot->tail;
+		pool->nr_objs++;
+	}
+
+	return 0;
+}
+
+/* allocate and initialize percpu slots */
+static int
+objpool_init_percpu_slots(struct objpool_head *pool, int nr_objs,
+			  void *context, objpool_init_obj_cb objinit)
+{
+	int i, cpu_count = 0;
+
+	for (i = 0; i < pool->nr_cpus; i++) {
+
+		struct objpool_slot *slot;
+		int nodes, size, rc;
+
+		/* skip the cpu node which could never be present */
+		if (!cpu_possible(i))
+			continue;
+
+		/* compute how many objects to be allocated with this slot */
+		nodes = nr_objs / num_possible_cpus();
+		if (cpu_count < (nr_objs % num_possible_cpus()))
+			nodes++;
+		cpu_count++;
+
+		size = struct_size(slot, entries, pool->capacity) +
+			pool->obj_size * nodes;
+
+		/*
+		 * here we allocate percpu-slot & objs together in a single
+		 * allocation to make it more compact, taking advantage of
+		 * warm caches and TLB hits. in default vmalloc is used to
+		 * reduce the pressure of kernel slab system. as we know,
+		 * mimimal size of vmalloc is one page since vmalloc would
+		 * always align the requested size to page size
+		 */
+		if (pool->gfp & GFP_ATOMIC)
+			slot = kmalloc_node(size, pool->gfp, cpu_to_node(i));
+		else
+			slot = __vmalloc_node(size, sizeof(void *), pool->gfp,
+				cpu_to_node(i), __builtin_return_address(0));
+		if (!slot)
+			return -ENOMEM;
+		memset(slot, 0, size);
+		pool->cpu_slots[i] = slot;
+
+		/* initialize the objpool_slot of cpu node i */
+		rc = objpool_init_percpu_slot(pool, slot, nodes, context, objinit);
+		if (rc)
+			return rc;
+	}
+
+	return 0;
+}
+
+/* cleanup all percpu slots of the object pool */
+static void objpool_fini_percpu_slots(struct objpool_head *pool)
+{
+	int i;
+
+	if (!pool->cpu_slots)
+		return;
+
+	for (i = 0; i < pool->nr_cpus; i++)
+		kvfree(pool->cpu_slots[i]);
+	kfree(pool->cpu_slots);
+}
+
+/* initialize object pool and pre-allocate objects */
+int objpool_init(struct objpool_head *pool, int nr_objs, int object_size,
+		gfp_t gfp, void *context, objpool_init_obj_cb objinit,
+		objpool_fini_cb release)
+{
+	int rc, capacity, slot_size;
+
+	/* check input parameters */
+	if (nr_objs <= 0 || nr_objs > OBJPOOL_NR_OBJECT_MAX ||
+	    object_size <= 0 || object_size > OBJPOOL_OBJECT_SIZE_MAX)
+		return -EINVAL;
+
+	/* align up to unsigned long size */
+	object_size = ALIGN(object_size, sizeof(long));
+
+	/* calculate capacity of percpu objpool_slot */
+	capacity = roundup_pow_of_two(nr_objs);
+	if (!capacity)
+		return -EINVAL;
+
+	/* initialize objpool pool */
+	memset(pool, 0, sizeof(struct objpool_head));
+	pool->nr_cpus = nr_cpu_ids;
+	pool->obj_size = object_size;
+	pool->capacity = capacity;
+	pool->gfp = gfp & ~__GFP_ZERO;
+	pool->context = context;
+	pool->release = release;
+	slot_size = pool->nr_cpus * sizeof(struct objpool_slot);
+	pool->cpu_slots = kzalloc(slot_size, pool->gfp);
+	if (!pool->cpu_slots)
+		return -ENOMEM;
+
+	/* initialize per-cpu slots */
+	rc = objpool_init_percpu_slots(pool, nr_objs, context, objinit);
+	if (rc)
+		objpool_fini_percpu_slots(pool);
+	else
+		refcount_set(&pool->ref, pool->nr_objs + 1);
+
+	return rc;
+}
+EXPORT_SYMBOL_GPL(objpool_init);
+
+/* adding object to slot, abort if the slot was already full */
+static inline int
+objpool_try_add_slot(void *obj, struct objpool_head *pool, int cpu)
+{
+	struct objpool_slot *slot = pool->cpu_slots[cpu];
+	uint32_t head, tail;
+
+	/* loading tail and head as a local snapshot, tail first */
+	tail = READ_ONCE(slot->tail);
+
+	do {
+		head = READ_ONCE(slot->head);
+		/* fault caught: something must be wrong */
+		WARN_ON_ONCE(tail - head > pool->nr_objs);
+	} while (!try_cmpxchg_acquire(&slot->tail, &tail, tail + 1));
+
+	/* now the tail position is reserved for the given obj */
+	WRITE_ONCE(slot->entries[tail & slot->mask], obj);
+	/* update sequence to make this obj available for pop() */
+	smp_store_release(&slot->last, tail + 1);
+
+	return 0;
+}
+
+/* reclaim an object to object pool */
+int objpool_push(void *obj, struct objpool_head *pool)
+{
+	unsigned long flags;
+	int rc;
+
+	/* disable local irq to avoid preemption & interruption */
+	raw_local_irq_save(flags);
+	rc = objpool_try_add_slot(obj, pool, raw_smp_processor_id());
+	raw_local_irq_restore(flags);
+
+	return rc;
+}
+EXPORT_SYMBOL_GPL(objpool_push);
+
+/* try to retrieve object from slot */
+static inline void *objpool_try_get_slot(struct objpool_head *pool, int cpu)
+{
+	struct objpool_slot *slot = pool->cpu_slots[cpu];
+	/* load head snapshot, other cpus may change it */
+	uint32_t head = smp_load_acquire(&slot->head);
+
+	while (head != READ_ONCE(slot->last)) {
+		void *obj;
+
+		/* obj must be retrieved before moving forward head */
+		obj = READ_ONCE(slot->entries[head & slot->mask]);
+
+		/* move head forward to mark it's consumption */
+		if (try_cmpxchg_release(&slot->head, &head, head + 1))
+			return obj;
+	}
+
+	return NULL;
+}
+
+/* allocate an object from object pool */
+void *objpool_pop(struct objpool_head *pool)
+{
+	void *obj = NULL;
+	unsigned long flags;
+	int i, cpu;
+
+	/* disable local irq to avoid preemption & interruption */
+	raw_local_irq_save(flags);
+
+	cpu = raw_smp_processor_id();
+	for (i = 0; i < num_possible_cpus(); i++) {
+		obj = objpool_try_get_slot(pool, cpu);
+		if (obj)
+			break;
+		cpu = cpumask_next_wrap(cpu, cpu_possible_mask, -1, 1);
+	}
+	raw_local_irq_restore(flags);
+
+	return obj;
+}
+EXPORT_SYMBOL_GPL(objpool_pop);
+
+/* release whole objpool forcely */
+void objpool_free(struct objpool_head *pool)
+{
+	if (!pool->cpu_slots)
+		return;
+
+	/* release percpu slots */
+	objpool_fini_percpu_slots(pool);
+
+	/* call user's cleanup callback if provided */
+	if (pool->release)
+		pool->release(pool, pool->context);
+}
+EXPORT_SYMBOL_GPL(objpool_free);
+
+/* drop the allocated object, rather reclaim it to objpool */
+int objpool_drop(void *obj, struct objpool_head *pool)
+{
+	if (!obj || !pool)
+		return -EINVAL;
+
+	if (refcount_dec_and_test(&pool->ref)) {
+		objpool_free(pool);
+		return 0;
+	}
+
+	return -EAGAIN;
+}
+EXPORT_SYMBOL_GPL(objpool_drop);
+
+/* drop unused objects and defref objpool for releasing */
+void objpool_fini(struct objpool_head *pool)
+{
+	int count = 1; /* extra ref for objpool itself */
+
+	/* drop all remained objects from objpool */
+	while (objpool_pop(pool))
+		count++;
+
+	if (refcount_sub_and_test(count, &pool->ref))
+		objpool_free(pool);
+}
+EXPORT_SYMBOL_GPL(objpool_fini);
-- 
cgit v1.2.3


From 4bbd9345565933823f38a419df65661f12adbe5e Mon Sep 17 00:00:00 2001
From: "wuqiang.matt" <wuqiang.matt@bytedance.com>
Date: Tue, 17 Oct 2023 21:56:52 +0800
Subject: kprobes: kretprobe scalability improvement

kretprobe is using freelist to manage return-instances, but freelist,
as LIFO queue based on singly linked list, scales badly and reduces
the overall throughput of kretprobed routines, especially for high
contention scenarios.

Here's a typical throughput test of sys_prctl (counts in 10 seconds,
measured with perf stat -a -I 10000 -e syscalls:sys_enter_prctl):

OS: Debian 10 X86_64, Linux 6.5rc7 with freelist
HW: XEON 8336C x 2, 64 cores/128 threads, DDR4 3200MT/s

         1T       2T       4T       8T      16T      24T
   24150045 29317964 15446741 12494489 18287272 17708768
        32T      48T      64T      72T      96T     128T
   16200682 13737658 11645677 11269858 10470118  9931051

This patch introduces objpool to replace freelist. objpool is a
high performance queue, which can bring near-linear scalability
to kretprobed routines. Tests of kretprobe throughput show the
biggest ratio as 159x of original freelist. Here's the result:

                  1T         2T         4T         8T        16T
native:     41186213   82336866  164250978  328662645  658810299
freelist:   24150045   29317964   15446741   12494489   18287272
objpool:    23926730   48010314   96125218  191782984  385091769
                 32T        48T        64T        96T       128T
native:   1330338351 1969957941 2512291791 2615754135 2671040914
freelist:   16200682   13737658   11645677   10470118    9931051
objpool:   764481096 1147149781 1456220214 1502109662 1579015050

Testings on 96-core ARM64 output similarly, but with the biggest
ratio up to 448x:

OS: Debian 10 AARCH64, Linux 6.5rc7
HW: Kunpeng-920 96 cores/2 sockets/4 NUMA nodes, DDR4 2933 MT/s

                  1T         2T         4T         8T        16T
native: .   30066096   63569843  126194076  257447289  505800181
freelist:   16152090   11064397   11124068    7215768    5663013
objpool:    13997541   28032100   55726624  110099926  221498787
                 24T        32T        48T        64T        96T
native:    763305277 1015925192 1521075123 2033009392 3021013752
freelist:    5015810    4602893    3766792    3382478    2945292
objpool:   328192025  439439564  668534502  887401381 1319972072

Link: https://lore.kernel.org/all/20231017135654.82270-4-wuqiang.matt@bytedance.com/

Signed-off-by: wuqiang.matt <wuqiang.matt@bytedance.com>
Acked-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>
Signed-off-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>
---
 include/linux/kprobes.h | 11 ++----
 include/linux/rethook.h | 16 +++------
 kernel/kprobes.c        | 91 +++++++++++++++++++++----------------------------
 kernel/trace/fprobe.c   | 28 ++++++---------
 kernel/trace/rethook.c  | 90 ++++++++++++++++++++++--------------------------
 5 files changed, 96 insertions(+), 140 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/kprobes.h b/include/linux/kprobes.h
index 85a64cb95d75..365eb092e9c4 100644
--- a/include/linux/kprobes.h
+++ b/include/linux/kprobes.h
@@ -26,8 +26,7 @@
 #include <linux/rcupdate.h>
 #include <linux/mutex.h>
 #include <linux/ftrace.h>
-#include <linux/refcount.h>
-#include <linux/freelist.h>
+#include <linux/objpool.h>
 #include <linux/rethook.h>
 #include <asm/kprobes.h>
 
@@ -141,7 +140,7 @@ static inline bool kprobe_ftrace(struct kprobe *p)
  */
 struct kretprobe_holder {
 	struct kretprobe	*rp;
-	refcount_t		ref;
+	struct objpool_head	pool;
 };
 
 struct kretprobe {
@@ -154,7 +153,6 @@ struct kretprobe {
 #ifdef CONFIG_KRETPROBE_ON_RETHOOK
 	struct rethook *rh;
 #else
-	struct freelist_head freelist;
 	struct kretprobe_holder *rph;
 #endif
 };
@@ -165,10 +163,7 @@ struct kretprobe_instance {
 #ifdef CONFIG_KRETPROBE_ON_RETHOOK
 	struct rethook_node node;
 #else
-	union {
-		struct freelist_node freelist;
-		struct rcu_head rcu;
-	};
+	struct rcu_head rcu;
 	struct llist_node llist;
 	struct kretprobe_holder *rph;
 	kprobe_opcode_t *ret_addr;
diff --git a/include/linux/rethook.h b/include/linux/rethook.h
index 26b6f3c81a76..ce69b2b7bc35 100644
--- a/include/linux/rethook.h
+++ b/include/linux/rethook.h
@@ -6,11 +6,10 @@
 #define _LINUX_RETHOOK_H
 
 #include <linux/compiler.h>
-#include <linux/freelist.h>
+#include <linux/objpool.h>
 #include <linux/kallsyms.h>
 #include <linux/llist.h>
 #include <linux/rcupdate.h>
-#include <linux/refcount.h>
 
 struct rethook_node;
 
@@ -30,14 +29,12 @@ typedef void (*rethook_handler_t) (struct rethook_node *, void *, unsigned long,
 struct rethook {
 	void			*data;
 	rethook_handler_t	handler;
-	struct freelist_head	pool;
-	refcount_t		ref;
+	struct objpool_head	pool;
 	struct rcu_head		rcu;
 };
 
 /**
  * struct rethook_node - The rethook shadow-stack entry node.
- * @freelist: The freelist, linked to struct rethook::pool.
  * @rcu: The rcu_head for deferred freeing.
  * @llist: The llist, linked to a struct task_struct::rethooks.
  * @rethook: The pointer to the struct rethook.
@@ -48,20 +45,16 @@ struct rethook {
  * on each entry of the shadow stack.
  */
 struct rethook_node {
-	union {
-		struct freelist_node freelist;
-		struct rcu_head      rcu;
-	};
+	struct rcu_head		rcu;
 	struct llist_node	llist;
 	struct rethook		*rethook;
 	unsigned long		ret_addr;
 	unsigned long		frame;
 };
 
-struct rethook *rethook_alloc(void *data, rethook_handler_t handler);
+struct rethook *rethook_alloc(void *data, rethook_handler_t handler, int size, int num);
 void rethook_stop(struct rethook *rh);
 void rethook_free(struct rethook *rh);
-void rethook_add_node(struct rethook *rh, struct rethook_node *node);
 struct rethook_node *rethook_try_get(struct rethook *rh);
 void rethook_recycle(struct rethook_node *node);
 void rethook_hook(struct rethook_node *node, struct pt_regs *regs, bool mcount);
@@ -98,4 +91,3 @@ void rethook_flush_task(struct task_struct *tk);
 #endif
 
 #endif
-
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index 0c6185aefaef..075a632e6c7c 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -1877,13 +1877,27 @@ static struct notifier_block kprobe_exceptions_nb = {
 #ifdef CONFIG_KRETPROBES
 
 #if !defined(CONFIG_KRETPROBE_ON_RETHOOK)
+
+/* callbacks for objpool of kretprobe instances */
+static int kretprobe_init_inst(void *nod, void *context)
+{
+	struct kretprobe_instance *ri = nod;
+
+	ri->rph = context;
+	return 0;
+}
+static int kretprobe_fini_pool(struct objpool_head *head, void *context)
+{
+	kfree(context);
+	return 0;
+}
+
 static void free_rp_inst_rcu(struct rcu_head *head)
 {
 	struct kretprobe_instance *ri = container_of(head, struct kretprobe_instance, rcu);
+	struct kretprobe_holder *rph = ri->rph;
 
-	if (refcount_dec_and_test(&ri->rph->ref))
-		kfree(ri->rph);
-	kfree(ri);
+	objpool_drop(ri, &rph->pool);
 }
 NOKPROBE_SYMBOL(free_rp_inst_rcu);
 
@@ -1892,7 +1906,7 @@ static void recycle_rp_inst(struct kretprobe_instance *ri)
 	struct kretprobe *rp = get_kretprobe(ri);
 
 	if (likely(rp))
-		freelist_add(&ri->freelist, &rp->freelist);
+		objpool_push(ri, &rp->rph->pool);
 	else
 		call_rcu(&ri->rcu, free_rp_inst_rcu);
 }
@@ -1929,23 +1943,12 @@ NOKPROBE_SYMBOL(kprobe_flush_task);
 
 static inline void free_rp_inst(struct kretprobe *rp)
 {
-	struct kretprobe_instance *ri;
-	struct freelist_node *node;
-	int count = 0;
-
-	node = rp->freelist.head;
-	while (node) {
-		ri = container_of(node, struct kretprobe_instance, freelist);
-		node = node->next;
-
-		kfree(ri);
-		count++;
-	}
+	struct kretprobe_holder *rph = rp->rph;
 
-	if (refcount_sub_and_test(count, &rp->rph->ref)) {
-		kfree(rp->rph);
-		rp->rph = NULL;
-	}
+	if (!rph)
+		return;
+	rp->rph = NULL;
+	objpool_fini(&rph->pool);
 }
 
 /* This assumes the 'tsk' is the current task or the is not running. */
@@ -2087,19 +2090,17 @@ NOKPROBE_SYMBOL(__kretprobe_trampoline_handler)
 static int pre_handler_kretprobe(struct kprobe *p, struct pt_regs *regs)
 {
 	struct kretprobe *rp = container_of(p, struct kretprobe, kp);
+	struct kretprobe_holder *rph = rp->rph;
 	struct kretprobe_instance *ri;
-	struct freelist_node *fn;
 
-	fn = freelist_try_get(&rp->freelist);
-	if (!fn) {
+	ri = objpool_pop(&rph->pool);
+	if (!ri) {
 		rp->nmissed++;
 		return 0;
 	}
 
-	ri = container_of(fn, struct kretprobe_instance, freelist);
-
 	if (rp->entry_handler && rp->entry_handler(ri, regs)) {
-		freelist_add(&ri->freelist, &rp->freelist);
+		objpool_push(ri, &rph->pool);
 		return 0;
 	}
 
@@ -2193,7 +2194,6 @@ int kprobe_on_func_entry(kprobe_opcode_t *addr, const char *sym, unsigned long o
 int register_kretprobe(struct kretprobe *rp)
 {
 	int ret;
-	struct kretprobe_instance *inst;
 	int i;
 	void *addr;
 
@@ -2227,19 +2227,12 @@ int register_kretprobe(struct kretprobe *rp)
 		rp->maxactive = max_t(unsigned int, 10, 2*num_possible_cpus());
 
 #ifdef CONFIG_KRETPROBE_ON_RETHOOK
-	rp->rh = rethook_alloc((void *)rp, kretprobe_rethook_handler);
-	if (!rp->rh)
-		return -ENOMEM;
+	rp->rh = rethook_alloc((void *)rp, kretprobe_rethook_handler,
+				sizeof(struct kretprobe_instance) +
+				rp->data_size, rp->maxactive);
+	if (IS_ERR(rp->rh))
+		return PTR_ERR(rp->rh);
 
-	for (i = 0; i < rp->maxactive; i++) {
-		inst = kzalloc(struct_size(inst, data, rp->data_size), GFP_KERNEL);
-		if (inst == NULL) {
-			rethook_free(rp->rh);
-			rp->rh = NULL;
-			return -ENOMEM;
-		}
-		rethook_add_node(rp->rh, &inst->node);
-	}
 	rp->nmissed = 0;
 	/* Establish function entry probe point */
 	ret = register_kprobe(&rp->kp);
@@ -2248,24 +2241,18 @@ int register_kretprobe(struct kretprobe *rp)
 		rp->rh = NULL;
 	}
 #else	/* !CONFIG_KRETPROBE_ON_RETHOOK */
-	rp->freelist.head = NULL;
 	rp->rph = kzalloc(sizeof(struct kretprobe_holder), GFP_KERNEL);
 	if (!rp->rph)
 		return -ENOMEM;
 
-	rp->rph->rp = rp;
-	for (i = 0; i < rp->maxactive; i++) {
-		inst = kzalloc(struct_size(inst, data, rp->data_size), GFP_KERNEL);
-		if (inst == NULL) {
-			refcount_set(&rp->rph->ref, i);
-			free_rp_inst(rp);
-			return -ENOMEM;
-		}
-		inst->rph = rp->rph;
-		freelist_add(&inst->freelist, &rp->freelist);
+	if (objpool_init(&rp->rph->pool, rp->maxactive, rp->data_size +
+			sizeof(struct kretprobe_instance), GFP_KERNEL,
+			rp->rph, kretprobe_init_inst, kretprobe_fini_pool)) {
+		kfree(rp->rph);
+		rp->rph = NULL;
+		return -ENOMEM;
 	}
-	refcount_set(&rp->rph->ref, i);
-
+	rp->rph->rp = rp;
 	rp->nmissed = 0;
 	/* Establish function entry probe point */
 	ret = register_kprobe(&rp->kp);
diff --git a/kernel/trace/fprobe.c b/kernel/trace/fprobe.c
index 3b21f4063258..6890f4fd17fc 100644
--- a/kernel/trace/fprobe.c
+++ b/kernel/trace/fprobe.c
@@ -187,7 +187,7 @@ static void fprobe_init(struct fprobe *fp)
 
 static int fprobe_init_rethook(struct fprobe *fp, int num)
 {
-	int i, size;
+	int size;
 
 	if (num < 0)
 		return -EINVAL;
@@ -205,26 +205,18 @@ static int fprobe_init_rethook(struct fprobe *fp, int num)
 	if (size < 0)
 		return -E2BIG;
 
-	fp->rethook = rethook_alloc((void *)fp, fprobe_exit_handler);
-	if (!fp->rethook)
-		return -ENOMEM;
-	for (i = 0; i < size; i++) {
-		struct fprobe_rethook_node *node;
-
-		node = kzalloc(sizeof(*node) + fp->entry_data_size, GFP_KERNEL);
-		if (!node) {
-			rethook_free(fp->rethook);
-			fp->rethook = NULL;
-			return -ENOMEM;
-		}
-		rethook_add_node(fp->rethook, &node->node);
-	}
+	/* Initialize rethook */
+	fp->rethook = rethook_alloc((void *)fp, fprobe_exit_handler,
+				sizeof(struct fprobe_rethook_node), size);
+	if (IS_ERR(fp->rethook))
+		return PTR_ERR(fp->rethook);
+
 	return 0;
 }
 
 static void fprobe_fail_cleanup(struct fprobe *fp)
 {
-	if (fp->rethook) {
+	if (!IS_ERR_OR_NULL(fp->rethook)) {
 		/* Don't need to cleanup rethook->handler because this is not used. */
 		rethook_free(fp->rethook);
 		fp->rethook = NULL;
@@ -379,14 +371,14 @@ int unregister_fprobe(struct fprobe *fp)
 	if (!fprobe_is_registered(fp))
 		return -EINVAL;
 
-	if (fp->rethook)
+	if (!IS_ERR_OR_NULL(fp->rethook))
 		rethook_stop(fp->rethook);
 
 	ret = unregister_ftrace_function(&fp->ops);
 	if (ret < 0)
 		return ret;
 
-	if (fp->rethook)
+	if (!IS_ERR_OR_NULL(fp->rethook))
 		rethook_free(fp->rethook);
 
 	ftrace_free_filter(&fp->ops);
diff --git a/kernel/trace/rethook.c b/kernel/trace/rethook.c
index 5eb9b598f4e9..13c8e6773892 100644
--- a/kernel/trace/rethook.c
+++ b/kernel/trace/rethook.c
@@ -9,6 +9,7 @@
 #include <linux/rethook.h>
 #include <linux/slab.h>
 #include <linux/sort.h>
+#include <linux/smp.h>
 
 /* Return hook list (shadow stack by list) */
 
@@ -36,21 +37,7 @@ void rethook_flush_task(struct task_struct *tk)
 static void rethook_free_rcu(struct rcu_head *head)
 {
 	struct rethook *rh = container_of(head, struct rethook, rcu);
-	struct rethook_node *rhn;
-	struct freelist_node *node;
-	int count = 1;
-
-	node = rh->pool.head;
-	while (node) {
-		rhn = container_of(node, struct rethook_node, freelist);
-		node = node->next;
-		kfree(rhn);
-		count++;
-	}
-
-	/* The rh->ref is the number of pooled node + 1 */
-	if (refcount_sub_and_test(count, &rh->ref))
-		kfree(rh);
+	objpool_fini(&rh->pool);
 }
 
 /**
@@ -83,54 +70,62 @@ void rethook_free(struct rethook *rh)
 	call_rcu(&rh->rcu, rethook_free_rcu);
 }
 
+static int rethook_init_node(void *nod, void *context)
+{
+	struct rethook_node *node = nod;
+
+	node->rethook = context;
+	return 0;
+}
+
+static int rethook_fini_pool(struct objpool_head *head, void *context)
+{
+	kfree(context);
+	return 0;
+}
+
 /**
  * rethook_alloc() - Allocate struct rethook.
  * @data: a data to pass the @handler when hooking the return.
- * @handler: the return hook callback function.
+ * @handler: the return hook callback function, must NOT be NULL
+ * @size: node size: rethook node and additional data
+ * @num: number of rethook nodes to be preallocated
  *
  * Allocate and initialize a new rethook with @data and @handler.
- * Return NULL if memory allocation fails or @handler is NULL.
+ * Return pointer of new rethook, or error codes for failures.
+ *
  * Note that @handler == NULL means this rethook is going to be freed.
  */
-struct rethook *rethook_alloc(void *data, rethook_handler_t handler)
+struct rethook *rethook_alloc(void *data, rethook_handler_t handler,
+			      int size, int num)
 {
-	struct rethook *rh = kzalloc(sizeof(struct rethook), GFP_KERNEL);
+	struct rethook *rh;
 
-	if (!rh || !handler) {
-		kfree(rh);
-		return NULL;
-	}
+	if (!handler || num <= 0 || size < sizeof(struct rethook_node))
+		return ERR_PTR(-EINVAL);
+
+	rh = kzalloc(sizeof(struct rethook), GFP_KERNEL);
+	if (!rh)
+		return ERR_PTR(-ENOMEM);
 
 	rh->data = data;
 	rh->handler = handler;
-	rh->pool.head = NULL;
-	refcount_set(&rh->ref, 1);
 
+	/* initialize the objpool for rethook nodes */
+	if (objpool_init(&rh->pool, num, size, GFP_KERNEL, rh,
+			 rethook_init_node, rethook_fini_pool)) {
+		kfree(rh);
+		return ERR_PTR(-ENOMEM);
+	}
 	return rh;
 }
 
-/**
- * rethook_add_node() - Add a new node to the rethook.
- * @rh: the struct rethook.
- * @node: the struct rethook_node to be added.
- *
- * Add @node to @rh. User must allocate @node (as a part of user's
- * data structure.) The @node fields are initialized in this function.
- */
-void rethook_add_node(struct rethook *rh, struct rethook_node *node)
-{
-	node->rethook = rh;
-	freelist_add(&node->freelist, &rh->pool);
-	refcount_inc(&rh->ref);
-}
-
 static void free_rethook_node_rcu(struct rcu_head *head)
 {
 	struct rethook_node *node = container_of(head, struct rethook_node, rcu);
+	struct rethook *rh = node->rethook;
 
-	if (refcount_dec_and_test(&node->rethook->ref))
-		kfree(node->rethook);
-	kfree(node);
+	objpool_drop(node, &rh->pool);
 }
 
 /**
@@ -145,7 +140,7 @@ void rethook_recycle(struct rethook_node *node)
 	lockdep_assert_preemption_disabled();
 
 	if (likely(READ_ONCE(node->rethook->handler)))
-		freelist_add(&node->freelist, &node->rethook->pool);
+		objpool_push(node, &node->rethook->pool);
 	else
 		call_rcu(&node->rcu, free_rethook_node_rcu);
 }
@@ -161,7 +156,6 @@ NOKPROBE_SYMBOL(rethook_recycle);
 struct rethook_node *rethook_try_get(struct rethook *rh)
 {
 	rethook_handler_t handler = READ_ONCE(rh->handler);
-	struct freelist_node *fn;
 
 	lockdep_assert_preemption_disabled();
 
@@ -178,11 +172,7 @@ struct rethook_node *rethook_try_get(struct rethook *rh)
 	if (unlikely(!rcu_is_watching()))
 		return NULL;
 
-	fn = freelist_try_get(&rh->pool);
-	if (!fn)
-		return NULL;
-
-	return container_of(fn, struct rethook_node, freelist);
+	return (struct rethook_node *)objpool_pop(&rh->pool);
 }
 NOKPROBE_SYMBOL(rethook_try_get);
 
-- 
cgit v1.2.3


From f46e8939de80bf029787111bf488ee86355b111c Mon Sep 17 00:00:00 2001
From: "wuqiang.matt" <wuqiang.matt@bytedance.com>
Date: Tue, 17 Oct 2023 21:56:53 +0800
Subject: kprobes: freelist.h removed

This patch will remove freelist.h from kernel source tree, since the
only use cases (kretprobe and rethook) are converted to objpool.

Link: https://lore.kernel.org/all/20231017135654.82270-5-wuqiang.matt@bytedance.com/

Signed-off-by: wuqiang.matt <wuqiang.matt@bytedance.com>
Acked-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>
Signed-off-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>
---
 include/linux/freelist.h | 129 -----------------------------------------------
 1 file changed, 129 deletions(-)
 delete mode 100644 include/linux/freelist.h

(limited to 'include/linux')

diff --git a/include/linux/freelist.h b/include/linux/freelist.h
deleted file mode 100644
index fc1842b96469..000000000000
--- a/include/linux/freelist.h
+++ /dev/null
@@ -1,129 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-only OR BSD-2-Clause */
-#ifndef FREELIST_H
-#define FREELIST_H
-
-#include <linux/atomic.h>
-
-/*
- * Copyright: cameron@moodycamel.com
- *
- * A simple CAS-based lock-free free list. Not the fastest thing in the world
- * under heavy contention, but simple and correct (assuming nodes are never
- * freed until after the free list is destroyed), and fairly speedy under low
- * contention.
- *
- * Adapted from: https://moodycamel.com/blog/2014/solving-the-aba-problem-for-lock-free-free-lists
- */
-
-struct freelist_node {
-	atomic_t		refs;
-	struct freelist_node	*next;
-};
-
-struct freelist_head {
-	struct freelist_node	*head;
-};
-
-#define REFS_ON_FREELIST 0x80000000
-#define REFS_MASK	 0x7FFFFFFF
-
-static inline void __freelist_add(struct freelist_node *node, struct freelist_head *list)
-{
-	/*
-	 * Since the refcount is zero, and nobody can increase it once it's
-	 * zero (except us, and we run only one copy of this method per node at
-	 * a time, i.e. the single thread case), then we know we can safely
-	 * change the next pointer of the node; however, once the refcount is
-	 * back above zero, then other threads could increase it (happens under
-	 * heavy contention, when the refcount goes to zero in between a load
-	 * and a refcount increment of a node in try_get, then back up to
-	 * something non-zero, then the refcount increment is done by the other
-	 * thread) -- so if the CAS to add the node to the actual list fails,
-	 * decrese the refcount and leave the add operation to the next thread
-	 * who puts the refcount back to zero (which could be us, hence the
-	 * loop).
-	 */
-	struct freelist_node *head = READ_ONCE(list->head);
-
-	for (;;) {
-		WRITE_ONCE(node->next, head);
-		atomic_set_release(&node->refs, 1);
-
-		if (!try_cmpxchg_release(&list->head, &head, node)) {
-			/*
-			 * Hmm, the add failed, but we can only try again when
-			 * the refcount goes back to zero.
-			 */
-			if (atomic_fetch_add_release(REFS_ON_FREELIST - 1, &node->refs) == 1)
-				continue;
-		}
-		return;
-	}
-}
-
-static inline void freelist_add(struct freelist_node *node, struct freelist_head *list)
-{
-	/*
-	 * We know that the should-be-on-freelist bit is 0 at this point, so
-	 * it's safe to set it using a fetch_add.
-	 */
-	if (!atomic_fetch_add_release(REFS_ON_FREELIST, &node->refs)) {
-		/*
-		 * Oh look! We were the last ones referencing this node, and we
-		 * know we want to add it to the free list, so let's do it!
-		 */
-		__freelist_add(node, list);
-	}
-}
-
-static inline struct freelist_node *freelist_try_get(struct freelist_head *list)
-{
-	struct freelist_node *prev, *next, *head = smp_load_acquire(&list->head);
-	unsigned int refs;
-
-	while (head) {
-		prev = head;
-		refs = atomic_read(&head->refs);
-		if ((refs & REFS_MASK) == 0 ||
-		    !atomic_try_cmpxchg_acquire(&head->refs, &refs, refs+1)) {
-			head = smp_load_acquire(&list->head);
-			continue;
-		}
-
-		/*
-		 * Good, reference count has been incremented (it wasn't at
-		 * zero), which means we can read the next and not worry about
-		 * it changing between now and the time we do the CAS.
-		 */
-		next = READ_ONCE(head->next);
-		if (try_cmpxchg_acquire(&list->head, &head, next)) {
-			/*
-			 * Yay, got the node. This means it was on the list,
-			 * which means should-be-on-freelist must be false no
-			 * matter the refcount (because nobody else knows it's
-			 * been taken off yet, it can't have been put back on).
-			 */
-			WARN_ON_ONCE(atomic_read(&head->refs) & REFS_ON_FREELIST);
-
-			/*
-			 * Decrease refcount twice, once for our ref, and once
-			 * for the list's ref.
-			 */
-			atomic_fetch_add(-2, &head->refs);
-
-			return head;
-		}
-
-		/*
-		 * OK, the head must have changed on us, but we still need to decrement
-		 * the refcount we increased.
-		 */
-		refs = atomic_fetch_add(-1, &prev->refs);
-		if (refs == REFS_ON_FREELIST + 1)
-			__freelist_add(prev, list);
-	}
-
-	return NULL;
-}
-
-#endif /* FREELIST_H */
-- 
cgit v1.2.3


From bf4916922c60f43efaa329744b3eef539aa6a2b2 Mon Sep 17 00:00:00 2001
From: Rik van Riel <riel@surriel.com>
Date: Thu, 5 Oct 2023 23:59:07 -0400
Subject: hugetlbfs: extend hugetlb_vma_lock to private VMAs

Extend the locking scheme used to protect shared hugetlb mappings from
truncate vs page fault races, in order to protect private hugetlb mappings
(with resv_map) against MADV_DONTNEED.

Add a read-write semaphore to the resv_map data structure, and use that
from the hugetlb_vma_(un)lock_* functions, in preparation for closing the
race between MADV_DONTNEED and page faults.

Link: https://lkml.kernel.org/r/20231006040020.3677377-3-riel@surriel.com
Fixes: 04ada095dcfc ("hugetlb: don't delete vma_lock in hugetlb MADV_DONTNEED processing")
Signed-off-by: Rik van Riel <riel@surriel.com>
Reviewed-by: Mike Kravetz <mike.kravetz@oracle.com>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Muchun Song <muchun.song@linux.dev>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/hugetlb.h |  6 ++++++
 mm/hugetlb.c            | 41 +++++++++++++++++++++++++++++++++++++----
 2 files changed, 43 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index a30686e649f7..065ec022cbb0 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -60,6 +60,7 @@ struct resv_map {
 	long adds_in_progress;
 	struct list_head region_cache;
 	long region_cache_count;
+	struct rw_semaphore rw_sema;
 #ifdef CONFIG_CGROUP_HUGETLB
 	/*
 	 * On private mappings, the counter to uncharge reservations is stored
@@ -1233,6 +1234,11 @@ static inline bool __vma_shareable_lock(struct vm_area_struct *vma)
 	return (vma->vm_flags & VM_MAYSHARE) && vma->vm_private_data;
 }
 
+static inline bool __vma_private_lock(struct vm_area_struct *vma)
+{
+	return (!(vma->vm_flags & VM_MAYSHARE)) && vma->vm_private_data;
+}
+
 /*
  * Safe version of huge_pte_offset() to check the locks.  See comments
  * above huge_pte_offset().
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 467fedd42453..d218ee2b6a99 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -97,6 +97,7 @@ static void hugetlb_vma_lock_alloc(struct vm_area_struct *vma);
 static void __hugetlb_vma_unlock_write_free(struct vm_area_struct *vma);
 static void hugetlb_unshare_pmds(struct vm_area_struct *vma,
 		unsigned long start, unsigned long end);
+static struct resv_map *vma_resv_map(struct vm_area_struct *vma);
 
 static inline bool subpool_is_free(struct hugepage_subpool *spool)
 {
@@ -267,6 +268,10 @@ void hugetlb_vma_lock_read(struct vm_area_struct *vma)
 		struct hugetlb_vma_lock *vma_lock = vma->vm_private_data;
 
 		down_read(&vma_lock->rw_sema);
+	} else if (__vma_private_lock(vma)) {
+		struct resv_map *resv_map = vma_resv_map(vma);
+
+		down_read(&resv_map->rw_sema);
 	}
 }
 
@@ -276,6 +281,10 @@ void hugetlb_vma_unlock_read(struct vm_area_struct *vma)
 		struct hugetlb_vma_lock *vma_lock = vma->vm_private_data;
 
 		up_read(&vma_lock->rw_sema);
+	} else if (__vma_private_lock(vma)) {
+		struct resv_map *resv_map = vma_resv_map(vma);
+
+		up_read(&resv_map->rw_sema);
 	}
 }
 
@@ -285,6 +294,10 @@ void hugetlb_vma_lock_write(struct vm_area_struct *vma)
 		struct hugetlb_vma_lock *vma_lock = vma->vm_private_data;
 
 		down_write(&vma_lock->rw_sema);
+	} else if (__vma_private_lock(vma)) {
+		struct resv_map *resv_map = vma_resv_map(vma);
+
+		down_write(&resv_map->rw_sema);
 	}
 }
 
@@ -294,17 +307,27 @@ void hugetlb_vma_unlock_write(struct vm_area_struct *vma)
 		struct hugetlb_vma_lock *vma_lock = vma->vm_private_data;
 
 		up_write(&vma_lock->rw_sema);
+	} else if (__vma_private_lock(vma)) {
+		struct resv_map *resv_map = vma_resv_map(vma);
+
+		up_write(&resv_map->rw_sema);
 	}
 }
 
 int hugetlb_vma_trylock_write(struct vm_area_struct *vma)
 {
-	struct hugetlb_vma_lock *vma_lock = vma->vm_private_data;
 
-	if (!__vma_shareable_lock(vma))
-		return 1;
+	if (__vma_shareable_lock(vma)) {
+		struct hugetlb_vma_lock *vma_lock = vma->vm_private_data;
 
-	return down_write_trylock(&vma_lock->rw_sema);
+		return down_write_trylock(&vma_lock->rw_sema);
+	} else if (__vma_private_lock(vma)) {
+		struct resv_map *resv_map = vma_resv_map(vma);
+
+		return down_write_trylock(&resv_map->rw_sema);
+	}
+
+	return 1;
 }
 
 void hugetlb_vma_assert_locked(struct vm_area_struct *vma)
@@ -313,6 +336,10 @@ void hugetlb_vma_assert_locked(struct vm_area_struct *vma)
 		struct hugetlb_vma_lock *vma_lock = vma->vm_private_data;
 
 		lockdep_assert_held(&vma_lock->rw_sema);
+	} else if (__vma_private_lock(vma)) {
+		struct resv_map *resv_map = vma_resv_map(vma);
+
+		lockdep_assert_held(&resv_map->rw_sema);
 	}
 }
 
@@ -345,6 +372,11 @@ static void __hugetlb_vma_unlock_write_free(struct vm_area_struct *vma)
 		struct hugetlb_vma_lock *vma_lock = vma->vm_private_data;
 
 		__hugetlb_vma_unlock_write_put(vma_lock);
+	} else if (__vma_private_lock(vma)) {
+		struct resv_map *resv_map = vma_resv_map(vma);
+
+		/* no free for anon vmas, but still need to unlock */
+		up_write(&resv_map->rw_sema);
 	}
 }
 
@@ -1068,6 +1100,7 @@ struct resv_map *resv_map_alloc(void)
 	kref_init(&resv_map->refs);
 	spin_lock_init(&resv_map->lock);
 	INIT_LIST_HEAD(&resv_map->regions);
+	init_rwsem(&resv_map->rw_sema);
 
 	resv_map->adds_in_progress = 0;
 	/*
-- 
cgit v1.2.3


From 2820b0f09be99f6406784b03a22dfc83e858449d Mon Sep 17 00:00:00 2001
From: Rik van Riel <riel@surriel.com>
Date: Thu, 5 Oct 2023 23:59:08 -0400
Subject: hugetlbfs: close race between MADV_DONTNEED and page fault

Malloc libraries, like jemalloc and tcalloc, take decisions on when to
call madvise independently from the code in the main application.

This sometimes results in the application page faulting on an address,
right after the malloc library has shot down the backing memory with
MADV_DONTNEED.

Usually this is harmless, because we always have some 4kB pages sitting
around to satisfy a page fault.  However, with hugetlbfs systems often
allocate only the exact number of huge pages that the application wants.

Due to TLB batching, hugetlbfs MADV_DONTNEED will free pages outside of
any lock taken on the page fault path, which can open up the following
race condition:

       CPU 1                            CPU 2

       MADV_DONTNEED
       unmap page
       shoot down TLB entry
                                       page fault
                                       fail to allocate a huge page
                                       killed with SIGBUS
       free page

Fix that race by pulling the locking from __unmap_hugepage_final_range
into helper functions called from zap_page_range_single.  This ensures
page faults stay locked out of the MADV_DONTNEED VMA until the huge pages
have actually been freed.

Link: https://lkml.kernel.org/r/20231006040020.3677377-4-riel@surriel.com
Fixes: 04ada095dcfc ("hugetlb: don't delete vma_lock in hugetlb MADV_DONTNEED processing")
Signed-off-by: Rik van Riel <riel@surriel.com>
Reviewed-by: Mike Kravetz <mike.kravetz@oracle.com>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Muchun Song <muchun.song@linux.dev>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/hugetlb.h | 35 +++++++++++++++++++++++++++++++++--
 mm/hugetlb.c            | 34 ++++++++++++++++++++++------------
 mm/memory.c             | 13 ++++++++-----
 3 files changed, 63 insertions(+), 19 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index 065ec022cbb0..47d25a5e1933 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -139,7 +139,7 @@ struct page *hugetlb_follow_page_mask(struct vm_area_struct *vma,
 void unmap_hugepage_range(struct vm_area_struct *,
 			  unsigned long, unsigned long, struct page *,
 			  zap_flags_t);
-void __unmap_hugepage_range_final(struct mmu_gather *tlb,
+void __unmap_hugepage_range(struct mmu_gather *tlb,
 			  struct vm_area_struct *vma,
 			  unsigned long start, unsigned long end,
 			  struct page *ref_page, zap_flags_t zap_flags);
@@ -246,6 +246,25 @@ int huge_pmd_unshare(struct mm_struct *mm, struct vm_area_struct *vma,
 void adjust_range_if_pmd_sharing_possible(struct vm_area_struct *vma,
 				unsigned long *start, unsigned long *end);
 
+extern void __hugetlb_zap_begin(struct vm_area_struct *vma,
+				unsigned long *begin, unsigned long *end);
+extern void __hugetlb_zap_end(struct vm_area_struct *vma,
+			      struct zap_details *details);
+
+static inline void hugetlb_zap_begin(struct vm_area_struct *vma,
+				     unsigned long *start, unsigned long *end)
+{
+	if (is_vm_hugetlb_page(vma))
+		__hugetlb_zap_begin(vma, start, end);
+}
+
+static inline void hugetlb_zap_end(struct vm_area_struct *vma,
+				   struct zap_details *details)
+{
+	if (is_vm_hugetlb_page(vma))
+		__hugetlb_zap_end(vma, details);
+}
+
 void hugetlb_vma_lock_read(struct vm_area_struct *vma);
 void hugetlb_vma_unlock_read(struct vm_area_struct *vma);
 void hugetlb_vma_lock_write(struct vm_area_struct *vma);
@@ -297,6 +316,18 @@ static inline void adjust_range_if_pmd_sharing_possible(
 {
 }
 
+static inline void hugetlb_zap_begin(
+				struct vm_area_struct *vma,
+				unsigned long *start, unsigned long *end)
+{
+}
+
+static inline void hugetlb_zap_end(
+				struct vm_area_struct *vma,
+				struct zap_details *details)
+{
+}
+
 static inline struct page *hugetlb_follow_page_mask(
     struct vm_area_struct *vma, unsigned long address, unsigned int flags,
     unsigned int *page_mask)
@@ -442,7 +473,7 @@ static inline long hugetlb_change_protection(
 	return 0;
 }
 
-static inline void __unmap_hugepage_range_final(struct mmu_gather *tlb,
+static inline void __unmap_hugepage_range(struct mmu_gather *tlb,
 			struct vm_area_struct *vma, unsigned long start,
 			unsigned long end, struct page *ref_page,
 			zap_flags_t zap_flags)
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index d218ee2b6a99..1301ba7b2c9a 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -5306,9 +5306,9 @@ int move_hugetlb_page_tables(struct vm_area_struct *vma,
 	return len + old_addr - old_end;
 }
 
-static void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma,
-				   unsigned long start, unsigned long end,
-				   struct page *ref_page, zap_flags_t zap_flags)
+void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma,
+			    unsigned long start, unsigned long end,
+			    struct page *ref_page, zap_flags_t zap_flags)
 {
 	struct mm_struct *mm = vma->vm_mm;
 	unsigned long address;
@@ -5437,16 +5437,25 @@ static void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct
 		tlb_flush_mmu_tlbonly(tlb);
 }
 
-void __unmap_hugepage_range_final(struct mmu_gather *tlb,
-			  struct vm_area_struct *vma, unsigned long start,
-			  unsigned long end, struct page *ref_page,
-			  zap_flags_t zap_flags)
+void __hugetlb_zap_begin(struct vm_area_struct *vma,
+			 unsigned long *start, unsigned long *end)
 {
+	if (!vma->vm_file)	/* hugetlbfs_file_mmap error */
+		return;
+
+	adjust_range_if_pmd_sharing_possible(vma, start, end);
 	hugetlb_vma_lock_write(vma);
-	i_mmap_lock_write(vma->vm_file->f_mapping);
+	if (vma->vm_file)
+		i_mmap_lock_write(vma->vm_file->f_mapping);
+}
 
-	/* mmu notification performed in caller */
-	__unmap_hugepage_range(tlb, vma, start, end, ref_page, zap_flags);
+void __hugetlb_zap_end(struct vm_area_struct *vma,
+		       struct zap_details *details)
+{
+	zap_flags_t zap_flags = details ? details->zap_flags : 0;
+
+	if (!vma->vm_file)	/* hugetlbfs_file_mmap error */
+		return;
 
 	if (zap_flags & ZAP_FLAG_UNMAP) {	/* final unmap */
 		/*
@@ -5459,11 +5468,12 @@ void __unmap_hugepage_range_final(struct mmu_gather *tlb,
 		 * someone else.
 		 */
 		__hugetlb_vma_unlock_write_free(vma);
-		i_mmap_unlock_write(vma->vm_file->f_mapping);
 	} else {
-		i_mmap_unlock_write(vma->vm_file->f_mapping);
 		hugetlb_vma_unlock_write(vma);
 	}
+
+	if (vma->vm_file)
+		i_mmap_unlock_write(vma->vm_file->f_mapping);
 }
 
 void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
diff --git a/mm/memory.c b/mm/memory.c
index 6c264d2f969c..517221f01303 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1683,7 +1683,7 @@ static void unmap_single_vma(struct mmu_gather *tlb,
 			if (vma->vm_file) {
 				zap_flags_t zap_flags = details ?
 				    details->zap_flags : 0;
-				__unmap_hugepage_range_final(tlb, vma, start, end,
+				__unmap_hugepage_range(tlb, vma, start, end,
 							     NULL, zap_flags);
 			}
 		} else
@@ -1728,8 +1728,12 @@ void unmap_vmas(struct mmu_gather *tlb, struct ma_state *mas,
 				start_addr, end_addr);
 	mmu_notifier_invalidate_range_start(&range);
 	do {
-		unmap_single_vma(tlb, vma, start_addr, end_addr, &details,
+		unsigned long start = start_addr;
+		unsigned long end = end_addr;
+		hugetlb_zap_begin(vma, &start, &end);
+		unmap_single_vma(tlb, vma, start, end, &details,
 				 mm_wr_locked);
+		hugetlb_zap_end(vma, &details);
 	} while ((vma = mas_find(mas, tree_end - 1)) != NULL);
 	mmu_notifier_invalidate_range_end(&range);
 }
@@ -1753,9 +1757,7 @@ void zap_page_range_single(struct vm_area_struct *vma, unsigned long address,
 	lru_add_drain();
 	mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma->vm_mm,
 				address, end);
-	if (is_vm_hugetlb_page(vma))
-		adjust_range_if_pmd_sharing_possible(vma, &range.start,
-						     &range.end);
+	hugetlb_zap_begin(vma, &range.start, &range.end);
 	tlb_gather_mmu(&tlb, vma->vm_mm);
 	update_hiwater_rss(vma->vm_mm);
 	mmu_notifier_invalidate_range_start(&range);
@@ -1766,6 +1768,7 @@ void zap_page_range_single(struct vm_area_struct *vma, unsigned long address,
 	unmap_single_vma(&tlb, vma, address, end, details, false);
 	mmu_notifier_invalidate_range_end(&range);
 	tlb_finish_mmu(&tlb);
+	hugetlb_zap_end(vma, details);
 }
 
 /**
-- 
cgit v1.2.3


From babddbfb7d7d70ae7f10fedd75a45d8ad75fdddf Mon Sep 17 00:00:00 2001
From: Haibo Li <haibo.li@mediatek.com>
Date: Mon, 9 Oct 2023 15:37:48 +0800
Subject: kasan: print the original fault addr when access invalid shadow

when the checked address is illegal,the corresponding shadow address from
kasan_mem_to_shadow may have no mapping in mmu table.  Access such shadow
address causes kernel oops.  Here is a sample about oops on arm64(VA
39bit) with KASAN_SW_TAGS and KASAN_OUTLINE on:

[ffffffb80aaaaaaa] pgd=000000005d3ce003, p4d=000000005d3ce003,
    pud=000000005d3ce003, pmd=0000000000000000
Internal error: Oops: 0000000096000006 [#1] PREEMPT SMP
Modules linked in:
CPU: 3 PID: 100 Comm: sh Not tainted 6.6.0-rc1-dirty #43
Hardware name: linux,dummy-virt (DT)
pstate: 80000005 (Nzcv daif -PAN -UAO -TCO -DIT -SSBS BTYPE=--)
pc : __hwasan_load8_noabort+0x5c/0x90
lr : do_ib_ob+0xf4/0x110
ffffffb80aaaaaaa is the shadow address for efffff80aaaaaaaa.
The problem is reading invalid shadow in kasan_check_range.

The generic kasan also has similar oops.

It only reports the shadow address which causes oops but not
the original address.

Commit 2f004eea0fc8("x86/kasan: Print original address on #GP")
introduce to kasan_non_canonical_hook but limit it to KASAN_INLINE.

This patch extends it to KASAN_OUTLINE mode.

Link: https://lkml.kernel.org/r/20231009073748.159228-1-haibo.li@mediatek.com
Fixes: 2f004eea0fc8("x86/kasan: Print original address on #GP")
Signed-off-by: Haibo Li <haibo.li@mediatek.com>
Reviewed-by: Andrey Konovalov <andreyknvl@gmail.com>
Cc: Alexander Potapenko <glider@google.com>
Cc: Andrey Ryabinin <ryabinin.a.a@gmail.com>
Cc: AngeloGioacchino Del Regno <angelogioacchino.delregno@collabora.com>
Cc: Dmitry Vyukov <dvyukov@google.com>
Cc: Haibo Li <haibo.li@mediatek.com>
Cc: Matthias Brugger <matthias.bgg@gmail.com>
Cc: Vincenzo Frascino <vincenzo.frascino@arm.com>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Kees Cook <keescook@chromium.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/kasan.h | 6 +++---
 mm/kasan/report.c     | 4 +---
 2 files changed, 4 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/kasan.h b/include/linux/kasan.h
index 842623d708c2..6dbb5ded3cf9 100644
--- a/include/linux/kasan.h
+++ b/include/linux/kasan.h
@@ -466,10 +466,10 @@ static inline void kasan_free_module_shadow(const struct vm_struct *vm) {}
 
 #endif /* (CONFIG_KASAN_GENERIC || CONFIG_KASAN_SW_TAGS) && !CONFIG_KASAN_VMALLOC */
 
-#ifdef CONFIG_KASAN_INLINE
+#ifdef CONFIG_KASAN
 void kasan_non_canonical_hook(unsigned long addr);
-#else /* CONFIG_KASAN_INLINE */
+#else /* CONFIG_KASAN */
 static inline void kasan_non_canonical_hook(unsigned long addr) { }
-#endif /* CONFIG_KASAN_INLINE */
+#endif /* CONFIG_KASAN */
 
 #endif /* LINUX_KASAN_H */
diff --git a/mm/kasan/report.c b/mm/kasan/report.c
index ca4b6ff080a6..3974e4549c3e 100644
--- a/mm/kasan/report.c
+++ b/mm/kasan/report.c
@@ -621,9 +621,8 @@ void kasan_report_async(void)
 }
 #endif /* CONFIG_KASAN_HW_TAGS */
 
-#ifdef CONFIG_KASAN_INLINE
 /*
- * With CONFIG_KASAN_INLINE, accesses to bogus pointers (outside the high
+ * With CONFIG_KASAN, accesses to bogus pointers (outside the high
  * canonical half of the address space) cause out-of-bounds shadow memory reads
  * before the actual access. For addresses in the low canonical half of the
  * address space, as well as most non-canonical addresses, that out-of-bounds
@@ -659,4 +658,3 @@ void kasan_non_canonical_hook(unsigned long addr)
 	pr_alert("KASAN: %s in range [0x%016lx-0x%016lx]\n", bug_type,
 		 orig_addr, orig_addr + KASAN_GRANULE_SIZE - 1);
 }
-#endif
-- 
cgit v1.2.3


From 17c17567fe510857b18fe01b7a88027600e76ac6 Mon Sep 17 00:00:00 2001
From: Arnd Bergmann <arnd@arndb.de>
Date: Mon, 16 Oct 2023 22:08:38 +0200
Subject: kasan: disable kasan_non_canonical_hook() for HW tags

On arm64, building with CONFIG_KASAN_HW_TAGS now causes a compile-time
error:

mm/kasan/report.c: In function 'kasan_non_canonical_hook':
mm/kasan/report.c:637:20: error: 'KASAN_SHADOW_OFFSET' undeclared (first use in this function)
  637 |         if (addr < KASAN_SHADOW_OFFSET)
      |                    ^~~~~~~~~~~~~~~~~~~
mm/kasan/report.c:637:20: note: each undeclared identifier is reported only once for each function it appears in
mm/kasan/report.c:640:77: error: expected expression before ';' token
  640 |         orig_addr = (addr - KASAN_SHADOW_OFFSET) << KASAN_SHADOW_SCALE_SHIFT;

This was caused by removing the dependency on CONFIG_KASAN_INLINE that
used to prevent this from happening. Use the more specific dependency
on KASAN_SW_TAGS || KASAN_GENERIC to only ignore the function for hwasan
mode.

Link: https://lkml.kernel.org/r/20231016200925.984439-1-arnd@kernel.org
Fixes: 12ec6a919b0f ("kasan: print the original fault addr when access invalid shadow")
Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Cc: Alexander Potapenko <glider@google.com>
Cc: Andrey Konovalov <andreyknvl@gmail.com>
Cc: Andrey Ryabinin <ryabinin.a.a@gmail.com>
Cc: Dmitry Vyukov <dvyukov@google.com>
Cc: Haibo Li <haibo.li@mediatek.com>
Cc: Kees Cook <keescook@chromium.org>
Cc: Vincenzo Frascino <vincenzo.frascino@arm.com>
Cc: AngeloGioacchino Del Regno <angelogioacchino.delregno@collabora.com>
Cc: Matthias Brugger <matthias.bgg@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/kasan.h | 6 +++---
 mm/kasan/report.c     | 4 +++-
 2 files changed, 6 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/kasan.h b/include/linux/kasan.h
index 6dbb5ded3cf9..71fa9a40fb5a 100644
--- a/include/linux/kasan.h
+++ b/include/linux/kasan.h
@@ -466,10 +466,10 @@ static inline void kasan_free_module_shadow(const struct vm_struct *vm) {}
 
 #endif /* (CONFIG_KASAN_GENERIC || CONFIG_KASAN_SW_TAGS) && !CONFIG_KASAN_VMALLOC */
 
-#ifdef CONFIG_KASAN
+#if defined(CONFIG_KASAN_GENERIC) || defined(CONFIG_KASAN_SW_TAGS)
 void kasan_non_canonical_hook(unsigned long addr);
-#else /* CONFIG_KASAN */
+#else /* CONFIG_KASAN_GENERIC || CONFIG_KASAN_SW_TAGS */
 static inline void kasan_non_canonical_hook(unsigned long addr) { }
-#endif /* CONFIG_KASAN */
+#endif /* CONFIG_KASAN_GENERIC || CONFIG_KASAN_SW_TAGS */
 
 #endif /* LINUX_KASAN_H */
diff --git a/mm/kasan/report.c b/mm/kasan/report.c
index 3974e4549c3e..6e3cb118d20e 100644
--- a/mm/kasan/report.c
+++ b/mm/kasan/report.c
@@ -621,8 +621,9 @@ void kasan_report_async(void)
 }
 #endif /* CONFIG_KASAN_HW_TAGS */
 
+#if defined(CONFIG_KASAN_GENERIC) || defined(CONFIG_KASAN_SW_TAGS)
 /*
- * With CONFIG_KASAN, accesses to bogus pointers (outside the high
+ * With CONFIG_KASAN_INLINE, accesses to bogus pointers (outside the high
  * canonical half of the address space) cause out-of-bounds shadow memory reads
  * before the actual access. For addresses in the low canonical half of the
  * address space, as well as most non-canonical addresses, that out-of-bounds
@@ -658,3 +659,4 @@ void kasan_non_canonical_hook(unsigned long addr)
 	pr_alert("KASAN: %s in range [0x%016lx-0x%016lx]\n", bug_type,
 		 orig_addr, orig_addr + KASAN_GRANULE_SIZE - 1);
 }
+#endif
-- 
cgit v1.2.3


From b7c67206594a56be2407ae4da54a114c90609e53 Mon Sep 17 00:00:00 2001
From: Kees Cook <keescook@chromium.org>
Date: Fri, 22 Sep 2023 10:53:28 -0700
Subject: mm/memcg: annotate struct mem_cgroup_threshold_ary with __counted_by

Prepare for the coming implementation by GCC and Clang of the __counted_by
attribute.  Flexible array members annotated with __counted_by can have
their accesses bounds-checked at run-time checking via CONFIG_UBSAN_BOUNDS
(for array indexing) and CONFIG_FORTIFY_SOURCE (for strcpy/memcpy-family
functions).

As found with Coccinelle[1], add __counted_by for struct
mem_cgroup_threshold_ary.

[1] https://github.com/kees/kernel-tools/blob/trunk/coccinelle/examples/counted_by.cocci

Link: https://lkml.kernel.org/r/20230922175327.work.985-kees@kernel.org
Signed-off-by: Kees Cook <keescook@chromium.org>
Acked-by: Shakeel Butt <shakeelb@google.com>
Acked-by: Roman Gushchin <roman.gushchin@linux.dev>
Reviewed-by: Gustavo A. R. Silva <gustavoars@kernel.org>
Acked-by: Michal Hocko <mhocko@suse.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: "Matthew Wilcox (Oracle)" <willy@infradead.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/memcontrol.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 031102ac9311..c6029aeaa268 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -143,7 +143,7 @@ struct mem_cgroup_threshold_ary {
 	/* Size of entries[] */
 	unsigned int size;
 	/* Array of thresholds */
-	struct mem_cgroup_threshold entries[];
+	struct mem_cgroup_threshold entries[] __counted_by(size);
 };
 
 struct mem_cgroup_thresholds {
-- 
cgit v1.2.3


From d61ea1cb009532dcbd77a9d44b812704cec60146 Mon Sep 17 00:00:00 2001
From: Peter Xu <peterx@redhat.com>
Date: Mon, 21 Aug 2023 19:15:13 +0500
Subject: userfaultfd: UFFD_FEATURE_WP_ASYNC
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Patch series "Implement IOCTL to get and optionally clear info about
PTEs", v33.

*Motivation*
The real motivation for adding PAGEMAP_SCAN IOCTL is to emulate Windows
GetWriteWatch() and ResetWriteWatch() syscalls [1].  The GetWriteWatch()
retrieves the addresses of the pages that are written to in a region of
virtual memory.

This syscall is used in Windows applications and games etc.  This syscall
is being emulated in pretty slow manner in userspace.  Our purpose is to
enhance the kernel such that we translate it efficiently in a better way.
Currently some out of tree hack patches are being used to efficiently
emulate it in some kernels.  We intend to replace those with these
patches.  So the whole gaming on Linux can effectively get benefit from
this.  It means there would be tons of users of this code.

CRIU use case [2] was mentioned by Andrei and Danylo:
> Use cases for migrating sparse VMAs are binaries sanitized with ASAN,
> MSAN or TSAN [3]. All of these sanitizers produce sparse mappings of
> shadow memory [4]. Being able to migrate such binaries allows to highly
> reduce the amount of work needed to identify and fix post-migration
> crashes, which happen constantly.

Andrei defines the following uses of this code:
* it is more granular and allows us to track changed pages more
  effectively. The current interface can clear dirty bits for the entire
  process only. In addition, reading info about pages is a separate
  operation. It means we must freeze the process to read information
  about all its pages, reset dirty bits, only then we can start dumping
  pages. The information about pages becomes more and more outdated,
  while we are processing pages. The new interface solves both these
  downsides. First, it allows us to read pte bits and clear the
  soft-dirty bit atomically. It means that CRIU will not need to freeze
  processes to pre-dump their memory. Second, it clears soft-dirty bits
  for a specified region of memory. It means CRIU will have actual info
  about pages to the moment of dumping them.
* The new interface has to be much faster because basic page filtering
  is happening in the kernel. With the old interface, we have to read
  pagemap for each page.

*Implementation Evolution (Short Summary)*
From the definition of GetWriteWatch(), we feel like kernel's soft-dirty
feature can be used under the hood with some additions like:
* reset soft-dirty flag for only a specific region of memory instead of
clearing the flag for the entire process
* get and clear soft-dirty flag for a specific region atomically

So we decided to use ioctl on pagemap file to read or/and reset soft-dirty
flag. But using soft-dirty flag, sometimes we get extra pages which weren't
even written. They had become soft-dirty because of VMA merging and
VM_SOFTDIRTY flag. This breaks the definition of GetWriteWatch(). We were
able to by-pass this short coming by ignoring VM_SOFTDIRTY until David
reported that mprotect etc messes up the soft-dirty flag while ignoring
VM_SOFTDIRTY [5]. This wasn't happening until [6] got introduced. We
discussed if we can revert these patches. But we could not reach to any
conclusion. So at this point, I made couple of tries to solve this whole
VM_SOFTDIRTY issue by correcting the soft-dirty implementation:
* [7] Correct the bug fixed wrongly back in 2014. It had potential to cause
regression. We left it behind.
* [8] Keep a list of soft-dirty part of a VMA across splits and merges. I
got the reply don't increase the size of the VMA by 8 bytes.

At this point, we left soft-dirty considering it is too much delicate and
userfaultfd [9] seemed like the only way forward. From there onward, we
have been basing soft-dirty emulation on userfaultfd wp feature where
kernel resolves the faults itself when WP_ASYNC feature is used. It was
straight forward to add WP_ASYNC feature in userfautlfd. Now we get only
those pages dirty or written-to which are really written in reality. (PS
There is another WP_UNPOPULATED userfautfd feature is required which is
needed to avoid pre-faulting memory before write-protecting [9].)

All the different masks were added on the request of CRIU devs to create
interface more generic and better.

[1] https://learn.microsoft.com/en-us/windows/win32/api/memoryapi/nf-memoryapi-getwritewatch
[2] https://lore.kernel.org/all/20221014134802.1361436-1-mdanylo@google.com
[3] https://github.com/google/sanitizers
[4] https://github.com/google/sanitizers/wiki/AddressSanitizerAlgorithm#64-bit
[5] https://lore.kernel.org/all/bfcae708-db21-04b4-0bbe-712badd03071@redhat.com
[6] https://lore.kernel.org/all/20220725142048.30450-1-peterx@redhat.com/
[7] https://lore.kernel.org/all/20221122115007.2787017-1-usama.anjum@collabora.com
[8] https://lore.kernel.org/all/20221220162606.1595355-1-usama.anjum@collabora.com
[9] https://lore.kernel.org/all/20230306213925.617814-1-peterx@redhat.com
[10] https://lore.kernel.org/all/20230125144529.1630917-1-mdanylo@google.com


This patch (of 6):

Add a new userfaultfd-wp feature UFFD_FEATURE_WP_ASYNC, that allows
userfaultfd wr-protect faults to be resolved by the kernel directly.

It can be used like a high accuracy version of soft-dirty, without vma
modifications during tracking, and also with ranged support by default
rather than for a whole mm when reset the protections due to existence of
ioctl(UFFDIO_WRITEPROTECT).

Several goals of such a dirty tracking interface:

1. All types of memory should be supported and tracable. This is nature
   for soft-dirty but should mention when the context is userfaultfd,
   because it used to only support anon/shmem/hugetlb. The problem is for
   a dirty tracking purpose these three types may not be enough, and it's
   legal to track anything e.g. any page cache writes from mmap.

2. Protections can be applied to partial of a memory range, without vma
   split/merge fuss.  The hope is that the tracking itself should not
   affect any vma layout change.  It also helps when reset happens because
   the reset will not need mmap write lock which can block the tracee.

3. Accuracy needs to be maintained.  This means we need pte markers to work
   on any type of VMA.

One could question that, the whole concept of async dirty tracking is not
really close to fundamentally what userfaultfd used to be: it's not "a
fault to be serviced by userspace" anymore. However, using userfaultfd-wp
here as a framework is convenient for us in at least:

1. VM_UFFD_WP vma flag, which has a very good name to suite something like
   this, so we don't need VM_YET_ANOTHER_SOFT_DIRTY. Just use a new
   feature bit to identify from a sync version of uffd-wp registration.

2. PTE markers logic can be leveraged across the whole kernel to maintain
   the uffd-wp bit as long as an arch supports, this also applies to this
   case where uffd-wp bit will be a hint to dirty information and it will
   not go lost easily (e.g. when some page cache ptes got zapped).

3. Reuse ioctl(UFFDIO_WRITEPROTECT) interface for either starting or
   resetting a range of memory, while there's no counterpart in the old
   soft-dirty world, hence if this is wanted in a new design we'll need a
   new interface otherwise.

We can somehow understand that commonality because uffd-wp was
fundamentally a similar idea of write-protecting pages just like
soft-dirty.

This implementation allows WP_ASYNC to imply WP_UNPOPULATED, because so
far WP_ASYNC seems to not usable if without WP_UNPOPULATE.  This also
gives us chance to modify impl of WP_ASYNC just in case it could be not
depending on WP_UNPOPULATED anymore in the future kernels.  It's also fine
to imply that because both features will rely on PTE_MARKER_UFFD_WP config
option, so they'll show up together (or both missing) in an UFFDIO_API
probe.

vma_can_userfault() now allows any VMA if the userfaultfd registration is
only about async uffd-wp.  So we can track dirty for all kinds of memory
including generic file systems (like XFS, EXT4 or BTRFS).

One trick worth mention in do_wp_page() is that we need to manually update
vmf->orig_pte here because it can be used later with a pte_same() check -
this path always has FAULT_FLAG_ORIG_PTE_VALID set in the flags.

The major defect of this approach of dirty tracking is we need to populate
the pgtables when tracking starts.  Soft-dirty doesn't do it like that.
It's unwanted in the case where the range of memory to track is huge and
unpopulated (e.g., tracking updates on a 10G file with mmap() on top,
without having any page cache installed yet).  One way to improve this is
to allow pte markers exist for larger than PTE level for PMD+.  That will
not change the interface if to implemented, so we can leave that for
later.

Link: https://lkml.kernel.org/r/20230821141518.870589-1-usama.anjum@collabora.com
Link: https://lkml.kernel.org/r/20230821141518.870589-2-usama.anjum@collabora.com
Signed-off-by: Peter Xu <peterx@redhat.com>
Co-developed-by: Muhammad Usama Anjum <usama.anjum@collabora.com>
Signed-off-by: Muhammad Usama Anjum <usama.anjum@collabora.com>
Cc: Alex Sierra <alex.sierra@amd.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Andrei Vagin <avagin@gmail.com>
Cc: Axel Rasmussen <axelrasmussen@google.com>
Cc: Christian Brauner <brauner@kernel.org>
Cc: Cyrill Gorcunov <gorcunov@gmail.com>
Cc: Dan Williams <dan.j.williams@intel.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: Gustavo A. R. Silva <gustavoars@kernel.org>
Cc: "Liam R. Howlett" <Liam.Howlett@oracle.com>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: Michal Miroslaw <emmir@google.com>
Cc: Mike Rapoport (IBM) <rppt@kernel.org>
Cc: Nadav Amit <namit@vmware.com>
Cc: Pasha Tatashin <pasha.tatashin@soleen.com>
Cc: Paul Gofman <pgofman@codeweavers.com>
Cc: Shuah Khan <shuah@kernel.org>
Cc: Suren Baghdasaryan <surenb@google.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Yang Shi <shy828301@gmail.com>
Cc: Yun Zhou <yun.zhou@windriver.com>
Cc: Michał Mirosław <mirq-linux@rere.qmqm.pl>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 Documentation/admin-guide/mm/userfaultfd.rst | 35 ++++++++++++++++++++++++++++
 fs/userfaultfd.c                             | 26 +++++++++++++++++----
 include/linux/userfaultfd_k.h                | 21 ++++++++++++++++-
 include/uapi/linux/userfaultfd.h             |  9 ++++++-
 mm/hugetlb.c                                 | 32 ++++++++++++++-----------
 mm/memory.c                                  | 28 +++++++++++++++++++---
 6 files changed, 129 insertions(+), 22 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/admin-guide/mm/userfaultfd.rst b/Documentation/admin-guide/mm/userfaultfd.rst
index 4349a8c2b978..203e26da5f92 100644
--- a/Documentation/admin-guide/mm/userfaultfd.rst
+++ b/Documentation/admin-guide/mm/userfaultfd.rst
@@ -244,6 +244,41 @@ write-protected (so future writes will also result in a WP fault). These ioctls
 support a mode flag (``UFFDIO_COPY_MODE_WP`` or ``UFFDIO_CONTINUE_MODE_WP``
 respectively) to configure the mapping this way.
 
+If the userfaultfd context has ``UFFD_FEATURE_WP_ASYNC`` feature bit set,
+any vma registered with write-protection will work in async mode rather
+than the default sync mode.
+
+In async mode, there will be no message generated when a write operation
+happens, meanwhile the write-protection will be resolved automatically by
+the kernel.  It can be seen as a more accurate version of soft-dirty
+tracking and it can be different in a few ways:
+
+  - The dirty result will not be affected by vma changes (e.g. vma
+    merging) because the dirty is only tracked by the pte.
+
+  - It supports range operations by default, so one can enable tracking on
+    any range of memory as long as page aligned.
+
+  - Dirty information will not get lost if the pte was zapped due to
+    various reasons (e.g. during split of a shmem transparent huge page).
+
+  - Due to a reverted meaning of soft-dirty (page clean when uffd-wp bit
+    set; dirty when uffd-wp bit cleared), it has different semantics on
+    some of the memory operations.  For example: ``MADV_DONTNEED`` on
+    anonymous (or ``MADV_REMOVE`` on a file mapping) will be treated as
+    dirtying of memory by dropping uffd-wp bit during the procedure.
+
+The user app can collect the "written/dirty" status by looking up the
+uffd-wp bit for the pages being interested in /proc/pagemap.
+
+The page will not be under track of uffd-wp async mode until the page is
+explicitly write-protected by ``ioctl(UFFDIO_WRITEPROTECT)`` with the mode
+flag ``UFFDIO_WRITEPROTECT_MODE_WP`` set.  Trying to resolve a page fault
+that was tracked by async mode userfaultfd-wp is invalid.
+
+When userfaultfd-wp async mode is used alone, it can be applied to all
+kinds of memory.
+
 Memory Poisioning Emulation
 ---------------------------
 
diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c
index 56eaae9dac1a..a7c6ef764e63 100644
--- a/fs/userfaultfd.c
+++ b/fs/userfaultfd.c
@@ -123,6 +123,11 @@ static bool userfaultfd_is_initialized(struct userfaultfd_ctx *ctx)
 	return ctx->features & UFFD_FEATURE_INITIALIZED;
 }
 
+static bool userfaultfd_wp_async_ctx(struct userfaultfd_ctx *ctx)
+{
+	return ctx && (ctx->features & UFFD_FEATURE_WP_ASYNC);
+}
+
 /*
  * Whether WP_UNPOPULATED is enabled on the uffd context.  It is only
  * meaningful when userfaultfd_wp()==true on the vma and when it's
@@ -1325,6 +1330,7 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx,
 	bool basic_ioctls;
 	unsigned long start, end, vma_end;
 	struct vma_iterator vmi;
+	bool wp_async = userfaultfd_wp_async_ctx(ctx);
 	pgoff_t pgoff;
 
 	user_uffdio_register = (struct uffdio_register __user *) arg;
@@ -1399,7 +1405,7 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx,
 
 		/* check not compatible vmas */
 		ret = -EINVAL;
-		if (!vma_can_userfault(cur, vm_flags))
+		if (!vma_can_userfault(cur, vm_flags, wp_async))
 			goto out_unlock;
 
 		/*
@@ -1460,7 +1466,7 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx,
 	for_each_vma_range(vmi, vma, end) {
 		cond_resched();
 
-		BUG_ON(!vma_can_userfault(vma, vm_flags));
+		BUG_ON(!vma_can_userfault(vma, vm_flags, wp_async));
 		BUG_ON(vma->vm_userfaultfd_ctx.ctx &&
 		       vma->vm_userfaultfd_ctx.ctx != ctx);
 		WARN_ON(!(vma->vm_flags & VM_MAYWRITE));
@@ -1561,6 +1567,7 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx,
 	unsigned long start, end, vma_end;
 	const void __user *buf = (void __user *)arg;
 	struct vma_iterator vmi;
+	bool wp_async = userfaultfd_wp_async_ctx(ctx);
 	pgoff_t pgoff;
 
 	ret = -EFAULT;
@@ -1615,7 +1622,7 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx,
 		 * provides for more strict behavior to notice
 		 * unregistration errors.
 		 */
-		if (!vma_can_userfault(cur, cur->vm_flags))
+		if (!vma_can_userfault(cur, cur->vm_flags, wp_async))
 			goto out_unlock;
 
 		found = true;
@@ -1631,7 +1638,7 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx,
 	for_each_vma_range(vmi, vma, end) {
 		cond_resched();
 
-		BUG_ON(!vma_can_userfault(vma, vma->vm_flags));
+		BUG_ON(!vma_can_userfault(vma, vma->vm_flags, wp_async));
 
 		/*
 		 * Nothing to do: this vma is already registered into this
@@ -2018,6 +2025,11 @@ out:
 	return ret;
 }
 
+bool userfaultfd_wp_async(struct vm_area_struct *vma)
+{
+	return userfaultfd_wp_async_ctx(vma->vm_userfaultfd_ctx.ctx);
+}
+
 static inline unsigned int uffd_ctx_features(__u64 user_features)
 {
 	/*
@@ -2051,6 +2063,11 @@ static int userfaultfd_api(struct userfaultfd_ctx *ctx,
 	ret = -EPERM;
 	if ((features & UFFD_FEATURE_EVENT_FORK) && !capable(CAP_SYS_PTRACE))
 		goto err_out;
+
+	/* WP_ASYNC relies on WP_UNPOPULATED, choose it unconditionally */
+	if (features & UFFD_FEATURE_WP_ASYNC)
+		features |= UFFD_FEATURE_WP_UNPOPULATED;
+
 	/* report all available features and ioctls to userland */
 	uffdio_api.features = UFFD_API_FEATURES;
 #ifndef CONFIG_HAVE_ARCH_USERFAULTFD_MINOR
@@ -2063,6 +2080,7 @@ static int userfaultfd_api(struct userfaultfd_ctx *ctx,
 #ifndef CONFIG_PTE_MARKER_UFFD_WP
 	uffdio_api.features &= ~UFFD_FEATURE_WP_HUGETLBFS_SHMEM;
 	uffdio_api.features &= ~UFFD_FEATURE_WP_UNPOPULATED;
+	uffdio_api.features &= ~UFFD_FEATURE_WP_ASYNC;
 #endif
 	uffdio_api.ioctls = UFFD_API_IOCTLS;
 	ret = -EFAULT;
diff --git a/include/linux/userfaultfd_k.h b/include/linux/userfaultfd_k.h
index ac8c6854097c..c98df391bfd8 100644
--- a/include/linux/userfaultfd_k.h
+++ b/include/linux/userfaultfd_k.h
@@ -161,11 +161,22 @@ static inline bool userfaultfd_armed(struct vm_area_struct *vma)
 }
 
 static inline bool vma_can_userfault(struct vm_area_struct *vma,
-				     unsigned long vm_flags)
+				     unsigned long vm_flags,
+				     bool wp_async)
 {
+	vm_flags &= __VM_UFFD_FLAGS;
+
 	if ((vm_flags & VM_UFFD_MINOR) &&
 	    (!is_vm_hugetlb_page(vma) && !vma_is_shmem(vma)))
 		return false;
+
+	/*
+	 * If wp async enabled, and WP is the only mode enabled, allow any
+	 * memory type.
+	 */
+	if (wp_async && (vm_flags == VM_UFFD_WP))
+		return true;
+
 #ifndef CONFIG_PTE_MARKER_UFFD_WP
 	/*
 	 * If user requested uffd-wp but not enabled pte markers for
@@ -175,6 +186,8 @@ static inline bool vma_can_userfault(struct vm_area_struct *vma,
 	if ((vm_flags & VM_UFFD_WP) && !vma_is_anonymous(vma))
 		return false;
 #endif
+
+	/* By default, allow any of anon|shmem|hugetlb */
 	return vma_is_anonymous(vma) || is_vm_hugetlb_page(vma) ||
 	    vma_is_shmem(vma);
 }
@@ -197,6 +210,7 @@ extern int userfaultfd_unmap_prep(struct vm_area_struct *vma,
 extern void userfaultfd_unmap_complete(struct mm_struct *mm,
 				       struct list_head *uf);
 extern bool userfaultfd_wp_unpopulated(struct vm_area_struct *vma);
+extern bool userfaultfd_wp_async(struct vm_area_struct *vma);
 
 #else /* CONFIG_USERFAULTFD */
 
@@ -297,6 +311,11 @@ static inline bool userfaultfd_wp_unpopulated(struct vm_area_struct *vma)
 	return false;
 }
 
+static inline bool userfaultfd_wp_async(struct vm_area_struct *vma)
+{
+	return false;
+}
+
 #endif /* CONFIG_USERFAULTFD */
 
 static inline bool userfaultfd_wp_use_markers(struct vm_area_struct *vma)
diff --git a/include/uapi/linux/userfaultfd.h b/include/uapi/linux/userfaultfd.h
index 62151706c5a3..0dbc81015018 100644
--- a/include/uapi/linux/userfaultfd.h
+++ b/include/uapi/linux/userfaultfd.h
@@ -40,7 +40,8 @@
 			   UFFD_FEATURE_EXACT_ADDRESS |		\
 			   UFFD_FEATURE_WP_HUGETLBFS_SHMEM |	\
 			   UFFD_FEATURE_WP_UNPOPULATED |	\
-			   UFFD_FEATURE_POISON)
+			   UFFD_FEATURE_POISON |		\
+			   UFFD_FEATURE_WP_ASYNC)
 #define UFFD_API_IOCTLS				\
 	((__u64)1 << _UFFDIO_REGISTER |		\
 	 (__u64)1 << _UFFDIO_UNREGISTER |	\
@@ -216,6 +217,11 @@ struct uffdio_api {
 	 * (i.e. empty ptes).  This will be the default behavior for shmem
 	 * & hugetlbfs, so this flag only affects anonymous memory behavior
 	 * when userfault write-protection mode is registered.
+	 *
+	 * UFFD_FEATURE_WP_ASYNC indicates that userfaultfd write-protection
+	 * asynchronous mode is supported in which the write fault is
+	 * automatically resolved and write-protection is un-set.
+	 * It implies UFFD_FEATURE_WP_UNPOPULATED.
 	 */
 #define UFFD_FEATURE_PAGEFAULT_FLAG_WP		(1<<0)
 #define UFFD_FEATURE_EVENT_FORK			(1<<1)
@@ -232,6 +238,7 @@ struct uffdio_api {
 #define UFFD_FEATURE_WP_HUGETLBFS_SHMEM		(1<<12)
 #define UFFD_FEATURE_WP_UNPOPULATED		(1<<13)
 #define UFFD_FEATURE_POISON			(1<<14)
+#define UFFD_FEATURE_WP_ASYNC			(1<<15)
 	__u64 features;
 
 	__u64 ioctls;
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index dfb4534834f5..bc654b36df9f 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -6247,21 +6247,27 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 	/* Handle userfault-wp first, before trying to lock more pages */
 	if (userfaultfd_wp(vma) && huge_pte_uffd_wp(huge_ptep_get(ptep)) &&
 	    (flags & FAULT_FLAG_WRITE) && !huge_pte_write(entry)) {
-		struct vm_fault vmf = {
-			.vma = vma,
-			.address = haddr,
-			.real_address = address,
-			.flags = flags,
-		};
+		if (!userfaultfd_wp_async(vma)) {
+			struct vm_fault vmf = {
+				.vma = vma,
+				.address = haddr,
+				.real_address = address,
+				.flags = flags,
+			};
 
-		spin_unlock(ptl);
-		if (pagecache_folio) {
-			folio_unlock(pagecache_folio);
-			folio_put(pagecache_folio);
+			spin_unlock(ptl);
+			if (pagecache_folio) {
+				folio_unlock(pagecache_folio);
+				folio_put(pagecache_folio);
+			}
+			hugetlb_vma_unlock_read(vma);
+			mutex_unlock(&hugetlb_fault_mutex_table[hash]);
+			return handle_userfault(&vmf, VM_UFFD_WP);
 		}
-		hugetlb_vma_unlock_read(vma);
-		mutex_unlock(&hugetlb_fault_mutex_table[hash]);
-		return handle_userfault(&vmf, VM_UFFD_WP);
+
+		entry = huge_pte_clear_uffd_wp(entry);
+		set_huge_pte_at(mm, haddr, ptep, entry);
+		/* Fallthrough to CoW */
 	}
 
 	/*
diff --git a/mm/memory.c b/mm/memory.c
index 2d209bc4d18c..d0f61d729fb4 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1,3 +1,4 @@
+
 // SPDX-License-Identifier: GPL-2.0-only
 /*
  *  linux/mm/memory.c
@@ -3349,11 +3350,28 @@ static vm_fault_t do_wp_page(struct vm_fault *vmf)
 	const bool unshare = vmf->flags & FAULT_FLAG_UNSHARE;
 	struct vm_area_struct *vma = vmf->vma;
 	struct folio *folio = NULL;
+	pte_t pte;
 
 	if (likely(!unshare)) {
 		if (userfaultfd_pte_wp(vma, ptep_get(vmf->pte))) {
-			pte_unmap_unlock(vmf->pte, vmf->ptl);
-			return handle_userfault(vmf, VM_UFFD_WP);
+			if (!userfaultfd_wp_async(vma)) {
+				pte_unmap_unlock(vmf->pte, vmf->ptl);
+				return handle_userfault(vmf, VM_UFFD_WP);
+			}
+
+			/*
+			 * Nothing needed (cache flush, TLB invalidations,
+			 * etc.) because we're only removing the uffd-wp bit,
+			 * which is completely invisible to the user.
+			 */
+			pte = pte_clear_uffd_wp(ptep_get(vmf->pte));
+
+			set_pte_at(vma->vm_mm, vmf->address, vmf->pte, pte);
+			/*
+			 * Update this to be prepared for following up CoW
+			 * handling
+			 */
+			vmf->orig_pte = pte;
 		}
 
 		/*
@@ -4879,8 +4897,11 @@ static inline vm_fault_t wp_huge_pmd(struct vm_fault *vmf)
 
 	if (vma_is_anonymous(vma)) {
 		if (likely(!unshare) &&
-		    userfaultfd_huge_pmd_wp(vma, vmf->orig_pmd))
+		    userfaultfd_huge_pmd_wp(vma, vmf->orig_pmd)) {
+			if (userfaultfd_wp_async(vmf->vma))
+				goto split;
 			return handle_userfault(vmf, VM_UFFD_WP);
+		}
 		return do_huge_pmd_wp_page(vmf);
 	}
 
@@ -4892,6 +4913,7 @@ static inline vm_fault_t wp_huge_pmd(struct vm_fault *vmf)
 		}
 	}
 
+split:
 	/* COW or write-notify handled on pte level: split pmd. */
 	__split_huge_pmd(vma, vmf->pmd, vmf->address, false, NULL);
 
-- 
cgit v1.2.3


From 52526ca7fdb905a768a93f8faa418e9b988fc34b Mon Sep 17 00:00:00 2001
From: Muhammad Usama Anjum <usama.anjum@collabora.com>
Date: Mon, 21 Aug 2023 19:15:14 +0500
Subject: fs/proc/task_mmu: implement IOCTL to get and optionally clear info
 about PTEs
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The PAGEMAP_SCAN IOCTL on the pagemap file can be used to get or optionally
clear the info about page table entries. The following operations are
supported in this IOCTL:
- Scan the address range and get the memory ranges matching the provided
  criteria. This is performed when the output buffer is specified.
- Write-protect the pages. The PM_SCAN_WP_MATCHING is used to write-protect
  the pages of interest. The PM_SCAN_CHECK_WPASYNC aborts the operation if
  non-Async Write Protected pages are found. The ``PM_SCAN_WP_MATCHING``
  can be used with or without PM_SCAN_CHECK_WPASYNC.
- Both of those operations can be combined into one atomic operation where
  we can get and write protect the pages as well.

Following flags about pages are currently supported:
- PAGE_IS_WPALLOWED - Page has async-write-protection enabled
- PAGE_IS_WRITTEN - Page has been written to from the time it was write protected
- PAGE_IS_FILE - Page is file backed
- PAGE_IS_PRESENT - Page is present in the memory
- PAGE_IS_SWAPPED - Page is in swapped
- PAGE_IS_PFNZERO - Page has zero PFN
- PAGE_IS_HUGE - Page is THP or Hugetlb backed

This IOCTL can be extended to get information about more PTE bits. The
entire address range passed by user [start, end) is scanned until either
the user provided buffer is full or max_pages have been found.

[akpm@linux-foundation.org: update it for "mm: hugetlb: add huge page size param to set_huge_pte_at()"]
[akpm@linux-foundation.org: fix CONFIG_HUGETLB_PAGE=n warning]
[arnd@arndb.de: hide unused pagemap_scan_backout_range() function]
  Link: https://lkml.kernel.org/r/20230927060257.2975412-1-arnd@kernel.org
[sfr@canb.auug.org.au: fix "fs/proc/task_mmu: hide unused pagemap_scan_backout_range() function"]
  Link: https://lkml.kernel.org/r/20230928092223.0625c6bf@canb.auug.org.au
Link: https://lkml.kernel.org/r/20230821141518.870589-3-usama.anjum@collabora.com
Signed-off-by: Muhammad Usama Anjum <usama.anjum@collabora.com>
Signed-off-by: Michał Mirosław <mirq-linux@rere.qmqm.pl>
Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Signed-off-by: Stephen Rothwell <sfr@canb.auug.org.au>
Reviewed-by: Andrei Vagin <avagin@gmail.com>
Reviewed-by: Michał Mirosław <mirq-linux@rere.qmqm.pl>
Cc: Alex Sierra <alex.sierra@amd.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Axel Rasmussen <axelrasmussen@google.com>
Cc: Christian Brauner <brauner@kernel.org>
Cc: Cyrill Gorcunov <gorcunov@gmail.com>
Cc: Dan Williams <dan.j.williams@intel.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: Gustavo A. R. Silva <gustavoars@kernel.org>
Cc: "Liam R. Howlett" <Liam.Howlett@oracle.com>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: Michal Miroslaw <emmir@google.com>
Cc: Mike Rapoport (IBM) <rppt@kernel.org>
Cc: Nadav Amit <namit@vmware.com>
Cc: Pasha Tatashin <pasha.tatashin@soleen.com>
Cc: Paul Gofman <pgofman@codeweavers.com>
Cc: Peter Xu <peterx@redhat.com>
Cc: Shuah Khan <shuah@kernel.org>
Cc: Suren Baghdasaryan <surenb@google.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Yang Shi <shy828301@gmail.com>
Cc: Yun Zhou <yun.zhou@windriver.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 fs/proc/task_mmu.c            | 692 ++++++++++++++++++++++++++++++++++++++++++
 include/linux/hugetlb.h       |   1 +
 include/linux/userfaultfd_k.h |   7 +
 include/uapi/linux/fs.h       |  59 ++++
 mm/hugetlb.c                  |   5 +-
 5 files changed, 762 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 3dd5be96691b..d4ef9a2bf95d 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -20,6 +20,8 @@
 #include <linux/shmem_fs.h>
 #include <linux/uaccess.h>
 #include <linux/pkeys.h>
+#include <linux/minmax.h>
+#include <linux/overflow.h>
 
 #include <asm/elf.h>
 #include <asm/tlb.h>
@@ -1761,11 +1763,701 @@ static int pagemap_release(struct inode *inode, struct file *file)
 	return 0;
 }
 
+#define PM_SCAN_CATEGORIES	(PAGE_IS_WPALLOWED | PAGE_IS_WRITTEN |	\
+				 PAGE_IS_FILE |	PAGE_IS_PRESENT |	\
+				 PAGE_IS_SWAPPED | PAGE_IS_PFNZERO |	\
+				 PAGE_IS_HUGE)
+#define PM_SCAN_FLAGS		(PM_SCAN_WP_MATCHING | PM_SCAN_CHECK_WPASYNC)
+
+struct pagemap_scan_private {
+	struct pm_scan_arg arg;
+	unsigned long masks_of_interest, cur_vma_category;
+	struct page_region *vec_buf;
+	unsigned long vec_buf_len, vec_buf_index, found_pages;
+	struct page_region __user *vec_out;
+};
+
+static unsigned long pagemap_page_category(struct pagemap_scan_private *p,
+					   struct vm_area_struct *vma,
+					   unsigned long addr, pte_t pte)
+{
+	unsigned long categories = 0;
+
+	if (pte_present(pte)) {
+		struct page *page;
+
+		categories |= PAGE_IS_PRESENT;
+		if (!pte_uffd_wp(pte))
+			categories |= PAGE_IS_WRITTEN;
+
+		if (p->masks_of_interest & PAGE_IS_FILE) {
+			page = vm_normal_page(vma, addr, pte);
+			if (page && !PageAnon(page))
+				categories |= PAGE_IS_FILE;
+		}
+
+		if (is_zero_pfn(pte_pfn(pte)))
+			categories |= PAGE_IS_PFNZERO;
+	} else if (is_swap_pte(pte)) {
+		swp_entry_t swp;
+
+		categories |= PAGE_IS_SWAPPED;
+		if (!pte_swp_uffd_wp_any(pte))
+			categories |= PAGE_IS_WRITTEN;
+
+		if (p->masks_of_interest & PAGE_IS_FILE) {
+			swp = pte_to_swp_entry(pte);
+			if (is_pfn_swap_entry(swp) &&
+			    !PageAnon(pfn_swap_entry_to_page(swp)))
+				categories |= PAGE_IS_FILE;
+		}
+	}
+
+	return categories;
+}
+
+static void make_uffd_wp_pte(struct vm_area_struct *vma,
+			     unsigned long addr, pte_t *pte)
+{
+	pte_t ptent = ptep_get(pte);
+
+	if (pte_present(ptent)) {
+		pte_t old_pte;
+
+		old_pte = ptep_modify_prot_start(vma, addr, pte);
+		ptent = pte_mkuffd_wp(ptent);
+		ptep_modify_prot_commit(vma, addr, pte, old_pte, ptent);
+	} else if (is_swap_pte(ptent)) {
+		ptent = pte_swp_mkuffd_wp(ptent);
+		set_pte_at(vma->vm_mm, addr, pte, ptent);
+	} else {
+		set_pte_at(vma->vm_mm, addr, pte,
+			   make_pte_marker(PTE_MARKER_UFFD_WP));
+	}
+}
+
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+static unsigned long pagemap_thp_category(struct pagemap_scan_private *p,
+					  struct vm_area_struct *vma,
+					  unsigned long addr, pmd_t pmd)
+{
+	unsigned long categories = PAGE_IS_HUGE;
+
+	if (pmd_present(pmd)) {
+		struct page *page;
+
+		categories |= PAGE_IS_PRESENT;
+		if (!pmd_uffd_wp(pmd))
+			categories |= PAGE_IS_WRITTEN;
+
+		if (p->masks_of_interest & PAGE_IS_FILE) {
+			page = vm_normal_page_pmd(vma, addr, pmd);
+			if (page && !PageAnon(page))
+				categories |= PAGE_IS_FILE;
+		}
+
+		if (is_zero_pfn(pmd_pfn(pmd)))
+			categories |= PAGE_IS_PFNZERO;
+	} else if (is_swap_pmd(pmd)) {
+		swp_entry_t swp;
+
+		categories |= PAGE_IS_SWAPPED;
+		if (!pmd_swp_uffd_wp(pmd))
+			categories |= PAGE_IS_WRITTEN;
+
+		if (p->masks_of_interest & PAGE_IS_FILE) {
+			swp = pmd_to_swp_entry(pmd);
+			if (is_pfn_swap_entry(swp) &&
+			    !PageAnon(pfn_swap_entry_to_page(swp)))
+				categories |= PAGE_IS_FILE;
+		}
+	}
+
+	return categories;
+}
+
+static void make_uffd_wp_pmd(struct vm_area_struct *vma,
+			     unsigned long addr, pmd_t *pmdp)
+{
+	pmd_t old, pmd = *pmdp;
+
+	if (pmd_present(pmd)) {
+		old = pmdp_invalidate_ad(vma, addr, pmdp);
+		pmd = pmd_mkuffd_wp(old);
+		set_pmd_at(vma->vm_mm, addr, pmdp, pmd);
+	} else if (is_migration_entry(pmd_to_swp_entry(pmd))) {
+		pmd = pmd_swp_mkuffd_wp(pmd);
+		set_pmd_at(vma->vm_mm, addr, pmdp, pmd);
+	}
+}
+#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
+
+#ifdef CONFIG_HUGETLB_PAGE
+static unsigned long pagemap_hugetlb_category(pte_t pte)
+{
+	unsigned long categories = PAGE_IS_HUGE;
+
+	/*
+	 * According to pagemap_hugetlb_range(), file-backed HugeTLB
+	 * page cannot be swapped. So PAGE_IS_FILE is not checked for
+	 * swapped pages.
+	 */
+	if (pte_present(pte)) {
+		categories |= PAGE_IS_PRESENT;
+		if (!huge_pte_uffd_wp(pte))
+			categories |= PAGE_IS_WRITTEN;
+		if (!PageAnon(pte_page(pte)))
+			categories |= PAGE_IS_FILE;
+		if (is_zero_pfn(pte_pfn(pte)))
+			categories |= PAGE_IS_PFNZERO;
+	} else if (is_swap_pte(pte)) {
+		categories |= PAGE_IS_SWAPPED;
+		if (!pte_swp_uffd_wp_any(pte))
+			categories |= PAGE_IS_WRITTEN;
+	}
+
+	return categories;
+}
+
+static void make_uffd_wp_huge_pte(struct vm_area_struct *vma,
+				  unsigned long addr, pte_t *ptep,
+				  pte_t ptent)
+{
+	unsigned long psize;
+
+	if (is_hugetlb_entry_hwpoisoned(ptent) || is_pte_marker(ptent))
+		return;
+
+	psize = huge_page_size(hstate_vma(vma));
+
+	if (is_hugetlb_entry_migration(ptent))
+		set_huge_pte_at(vma->vm_mm, addr, ptep,
+				pte_swp_mkuffd_wp(ptent), psize);
+	else if (!huge_pte_none(ptent))
+		huge_ptep_modify_prot_commit(vma, addr, ptep, ptent,
+					     huge_pte_mkuffd_wp(ptent));
+	else
+		set_huge_pte_at(vma->vm_mm, addr, ptep,
+				make_pte_marker(PTE_MARKER_UFFD_WP), psize);
+}
+#endif /* CONFIG_HUGETLB_PAGE */
+
+#if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_HUGETLB_PAGE)
+static void pagemap_scan_backout_range(struct pagemap_scan_private *p,
+				       unsigned long addr, unsigned long end)
+{
+	struct page_region *cur_buf = &p->vec_buf[p->vec_buf_index];
+
+	if (cur_buf->start != addr)
+		cur_buf->end = addr;
+	else
+		cur_buf->start = cur_buf->end = 0;
+
+	p->found_pages -= (end - addr) / PAGE_SIZE;
+}
+#endif
+
+static bool pagemap_scan_is_interesting_page(unsigned long categories,
+					     const struct pagemap_scan_private *p)
+{
+	categories ^= p->arg.category_inverted;
+	if ((categories & p->arg.category_mask) != p->arg.category_mask)
+		return false;
+	if (p->arg.category_anyof_mask && !(categories & p->arg.category_anyof_mask))
+		return false;
+
+	return true;
+}
+
+static bool pagemap_scan_is_interesting_vma(unsigned long categories,
+					    const struct pagemap_scan_private *p)
+{
+	unsigned long required = p->arg.category_mask & PAGE_IS_WPALLOWED;
+
+	categories ^= p->arg.category_inverted;
+	if ((categories & required) != required)
+		return false;
+
+	return true;
+}
+
+static int pagemap_scan_test_walk(unsigned long start, unsigned long end,
+				  struct mm_walk *walk)
+{
+	struct pagemap_scan_private *p = walk->private;
+	struct vm_area_struct *vma = walk->vma;
+	unsigned long vma_category = 0;
+
+	if (userfaultfd_wp_async(vma) && userfaultfd_wp_use_markers(vma))
+		vma_category |= PAGE_IS_WPALLOWED;
+	else if (p->arg.flags & PM_SCAN_CHECK_WPASYNC)
+		return -EPERM;
+
+	if (vma->vm_flags & VM_PFNMAP)
+		return 1;
+
+	if (!pagemap_scan_is_interesting_vma(vma_category, p))
+		return 1;
+
+	p->cur_vma_category = vma_category;
+
+	return 0;
+}
+
+static bool pagemap_scan_push_range(unsigned long categories,
+				    struct pagemap_scan_private *p,
+				    unsigned long addr, unsigned long end)
+{
+	struct page_region *cur_buf = &p->vec_buf[p->vec_buf_index];
+
+	/*
+	 * When there is no output buffer provided at all, the sentinel values
+	 * won't match here. There is no other way for `cur_buf->end` to be
+	 * non-zero other than it being non-empty.
+	 */
+	if (addr == cur_buf->end && categories == cur_buf->categories) {
+		cur_buf->end = end;
+		return true;
+	}
+
+	if (cur_buf->end) {
+		if (p->vec_buf_index >= p->vec_buf_len - 1)
+			return false;
+
+		cur_buf = &p->vec_buf[++p->vec_buf_index];
+	}
+
+	cur_buf->start = addr;
+	cur_buf->end = end;
+	cur_buf->categories = categories;
+
+	return true;
+}
+
+static int pagemap_scan_output(unsigned long categories,
+			       struct pagemap_scan_private *p,
+			       unsigned long addr, unsigned long *end)
+{
+	unsigned long n_pages, total_pages;
+	int ret = 0;
+
+	if (!p->vec_buf)
+		return 0;
+
+	categories &= p->arg.return_mask;
+
+	n_pages = (*end - addr) / PAGE_SIZE;
+	if (check_add_overflow(p->found_pages, n_pages, &total_pages) ||
+	    total_pages > p->arg.max_pages) {
+		size_t n_too_much = total_pages - p->arg.max_pages;
+		*end -= n_too_much * PAGE_SIZE;
+		n_pages -= n_too_much;
+		ret = -ENOSPC;
+	}
+
+	if (!pagemap_scan_push_range(categories, p, addr, *end)) {
+		*end = addr;
+		n_pages = 0;
+		ret = -ENOSPC;
+	}
+
+	p->found_pages += n_pages;
+	if (ret)
+		p->arg.walk_end = *end;
+
+	return ret;
+}
+
+static int pagemap_scan_thp_entry(pmd_t *pmd, unsigned long start,
+				  unsigned long end, struct mm_walk *walk)
+{
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+	struct pagemap_scan_private *p = walk->private;
+	struct vm_area_struct *vma = walk->vma;
+	unsigned long categories;
+	spinlock_t *ptl;
+	int ret = 0;
+
+	ptl = pmd_trans_huge_lock(pmd, vma);
+	if (!ptl)
+		return -ENOENT;
+
+	categories = p->cur_vma_category |
+		     pagemap_thp_category(p, vma, start, *pmd);
+
+	if (!pagemap_scan_is_interesting_page(categories, p))
+		goto out_unlock;
+
+	ret = pagemap_scan_output(categories, p, start, &end);
+	if (start == end)
+		goto out_unlock;
+
+	if (~p->arg.flags & PM_SCAN_WP_MATCHING)
+		goto out_unlock;
+	if (~categories & PAGE_IS_WRITTEN)
+		goto out_unlock;
+
+	/*
+	 * Break huge page into small pages if the WP operation
+	 * needs to be performed on a portion of the huge page.
+	 */
+	if (end != start + HPAGE_SIZE) {
+		spin_unlock(ptl);
+		split_huge_pmd(vma, pmd, start);
+		pagemap_scan_backout_range(p, start, end);
+		/* Report as if there was no THP */
+		return -ENOENT;
+	}
+
+	make_uffd_wp_pmd(vma, start, pmd);
+	flush_tlb_range(vma, start, end);
+out_unlock:
+	spin_unlock(ptl);
+	return ret;
+#else /* !CONFIG_TRANSPARENT_HUGEPAGE */
+	return -ENOENT;
+#endif
+}
+
+static int pagemap_scan_pmd_entry(pmd_t *pmd, unsigned long start,
+				  unsigned long end, struct mm_walk *walk)
+{
+	struct pagemap_scan_private *p = walk->private;
+	struct vm_area_struct *vma = walk->vma;
+	unsigned long addr, flush_end = 0;
+	pte_t *pte, *start_pte;
+	spinlock_t *ptl;
+	int ret;
+
+	arch_enter_lazy_mmu_mode();
+
+	ret = pagemap_scan_thp_entry(pmd, start, end, walk);
+	if (ret != -ENOENT) {
+		arch_leave_lazy_mmu_mode();
+		return ret;
+	}
+
+	ret = 0;
+	start_pte = pte = pte_offset_map_lock(vma->vm_mm, pmd, start, &ptl);
+	if (!pte) {
+		arch_leave_lazy_mmu_mode();
+		walk->action = ACTION_AGAIN;
+		return 0;
+	}
+
+	for (addr = start; addr != end; pte++, addr += PAGE_SIZE) {
+		unsigned long categories = p->cur_vma_category |
+					   pagemap_page_category(p, vma, addr, ptep_get(pte));
+		unsigned long next = addr + PAGE_SIZE;
+
+		if (!pagemap_scan_is_interesting_page(categories, p))
+			continue;
+
+		ret = pagemap_scan_output(categories, p, addr, &next);
+		if (next == addr)
+			break;
+
+		if (~p->arg.flags & PM_SCAN_WP_MATCHING)
+			continue;
+		if (~categories & PAGE_IS_WRITTEN)
+			continue;
+
+		make_uffd_wp_pte(vma, addr, pte);
+		if (!flush_end)
+			start = addr;
+		flush_end = next;
+	}
+
+	if (flush_end)
+		flush_tlb_range(vma, start, addr);
+
+	pte_unmap_unlock(start_pte, ptl);
+	arch_leave_lazy_mmu_mode();
+
+	cond_resched();
+	return ret;
+}
+
+#ifdef CONFIG_HUGETLB_PAGE
+static int pagemap_scan_hugetlb_entry(pte_t *ptep, unsigned long hmask,
+				      unsigned long start, unsigned long end,
+				      struct mm_walk *walk)
+{
+	struct pagemap_scan_private *p = walk->private;
+	struct vm_area_struct *vma = walk->vma;
+	unsigned long categories;
+	spinlock_t *ptl;
+	int ret = 0;
+	pte_t pte;
+
+	if (~p->arg.flags & PM_SCAN_WP_MATCHING) {
+		/* Go the short route when not write-protecting pages. */
+
+		pte = huge_ptep_get(ptep);
+		categories = p->cur_vma_category | pagemap_hugetlb_category(pte);
+
+		if (!pagemap_scan_is_interesting_page(categories, p))
+			return 0;
+
+		return pagemap_scan_output(categories, p, start, &end);
+	}
+
+	i_mmap_lock_write(vma->vm_file->f_mapping);
+	ptl = huge_pte_lock(hstate_vma(vma), vma->vm_mm, ptep);
+
+	pte = huge_ptep_get(ptep);
+	categories = p->cur_vma_category | pagemap_hugetlb_category(pte);
+
+	if (!pagemap_scan_is_interesting_page(categories, p))
+		goto out_unlock;
+
+	ret = pagemap_scan_output(categories, p, start, &end);
+	if (start == end)
+		goto out_unlock;
+
+	if (~categories & PAGE_IS_WRITTEN)
+		goto out_unlock;
+
+	if (end != start + HPAGE_SIZE) {
+		/* Partial HugeTLB page WP isn't possible. */
+		pagemap_scan_backout_range(p, start, end);
+		p->arg.walk_end = start;
+		ret = 0;
+		goto out_unlock;
+	}
+
+	make_uffd_wp_huge_pte(vma, start, ptep, pte);
+	flush_hugetlb_tlb_range(vma, start, end);
+
+out_unlock:
+	spin_unlock(ptl);
+	i_mmap_unlock_write(vma->vm_file->f_mapping);
+
+	return ret;
+}
+#else
+#define pagemap_scan_hugetlb_entry NULL
+#endif
+
+static int pagemap_scan_pte_hole(unsigned long addr, unsigned long end,
+				 int depth, struct mm_walk *walk)
+{
+	struct pagemap_scan_private *p = walk->private;
+	struct vm_area_struct *vma = walk->vma;
+	int ret, err;
+
+	if (!vma || !pagemap_scan_is_interesting_page(p->cur_vma_category, p))
+		return 0;
+
+	ret = pagemap_scan_output(p->cur_vma_category, p, addr, &end);
+	if (addr == end)
+		return ret;
+
+	if (~p->arg.flags & PM_SCAN_WP_MATCHING)
+		return ret;
+
+	err = uffd_wp_range(vma, addr, end - addr, true);
+	if (err < 0)
+		ret = err;
+
+	return ret;
+}
+
+static const struct mm_walk_ops pagemap_scan_ops = {
+	.test_walk = pagemap_scan_test_walk,
+	.pmd_entry = pagemap_scan_pmd_entry,
+	.pte_hole = pagemap_scan_pte_hole,
+	.hugetlb_entry = pagemap_scan_hugetlb_entry,
+};
+
+static int pagemap_scan_get_args(struct pm_scan_arg *arg,
+				 unsigned long uarg)
+{
+	if (copy_from_user(arg, (void __user *)uarg, sizeof(*arg)))
+		return -EFAULT;
+
+	if (arg->size != sizeof(struct pm_scan_arg))
+		return -EINVAL;
+
+	/* Validate requested features */
+	if (arg->flags & ~PM_SCAN_FLAGS)
+		return -EINVAL;
+	if ((arg->category_inverted | arg->category_mask |
+	     arg->category_anyof_mask | arg->return_mask) & ~PM_SCAN_CATEGORIES)
+		return -EINVAL;
+
+	arg->start = untagged_addr((unsigned long)arg->start);
+	arg->end = untagged_addr((unsigned long)arg->end);
+	arg->vec = untagged_addr((unsigned long)arg->vec);
+
+	/* Validate memory pointers */
+	if (!IS_ALIGNED(arg->start, PAGE_SIZE))
+		return -EINVAL;
+	if (!access_ok((void __user *)(long)arg->start, arg->end - arg->start))
+		return -EFAULT;
+	if (!arg->vec && arg->vec_len)
+		return -EINVAL;
+	if (arg->vec && !access_ok((void __user *)(long)arg->vec,
+			      arg->vec_len * sizeof(struct page_region)))
+		return -EFAULT;
+
+	/* Fixup default values */
+	arg->end = ALIGN(arg->end, PAGE_SIZE);
+	arg->walk_end = 0;
+	if (!arg->max_pages)
+		arg->max_pages = ULONG_MAX;
+
+	return 0;
+}
+
+static int pagemap_scan_writeback_args(struct pm_scan_arg *arg,
+				       unsigned long uargl)
+{
+	struct pm_scan_arg __user *uarg	= (void __user *)uargl;
+
+	if (copy_to_user(&uarg->walk_end, &arg->walk_end, sizeof(arg->walk_end)))
+		return -EFAULT;
+
+	return 0;
+}
+
+static int pagemap_scan_init_bounce_buffer(struct pagemap_scan_private *p)
+{
+	if (!p->arg.vec_len)
+		return 0;
+
+	p->vec_buf_len = min_t(size_t, PAGEMAP_WALK_SIZE >> PAGE_SHIFT,
+			       p->arg.vec_len);
+	p->vec_buf = kmalloc_array(p->vec_buf_len, sizeof(*p->vec_buf),
+				   GFP_KERNEL);
+	if (!p->vec_buf)
+		return -ENOMEM;
+
+	p->vec_buf->start = p->vec_buf->end = 0;
+	p->vec_out = (struct page_region __user *)(long)p->arg.vec;
+
+	return 0;
+}
+
+static long pagemap_scan_flush_buffer(struct pagemap_scan_private *p)
+{
+	const struct page_region *buf = p->vec_buf;
+	long n = p->vec_buf_index;
+
+	if (!p->vec_buf)
+		return 0;
+
+	if (buf[n].end != buf[n].start)
+		n++;
+
+	if (!n)
+		return 0;
+
+	if (copy_to_user(p->vec_out, buf, n * sizeof(*buf)))
+		return -EFAULT;
+
+	p->arg.vec_len -= n;
+	p->vec_out += n;
+
+	p->vec_buf_index = 0;
+	p->vec_buf_len = min_t(size_t, p->vec_buf_len, p->arg.vec_len);
+	p->vec_buf->start = p->vec_buf->end = 0;
+
+	return n;
+}
+
+static long do_pagemap_scan(struct mm_struct *mm, unsigned long uarg)
+{
+	struct mmu_notifier_range range;
+	struct pagemap_scan_private p = {0};
+	unsigned long walk_start;
+	size_t n_ranges_out = 0;
+	int ret;
+
+	ret = pagemap_scan_get_args(&p.arg, uarg);
+	if (ret)
+		return ret;
+
+	p.masks_of_interest = p.arg.category_mask | p.arg.category_anyof_mask |
+			      p.arg.return_mask;
+	ret = pagemap_scan_init_bounce_buffer(&p);
+	if (ret)
+		return ret;
+
+	/* Protection change for the range is going to happen. */
+	if (p.arg.flags & PM_SCAN_WP_MATCHING) {
+		mmu_notifier_range_init(&range, MMU_NOTIFY_PROTECTION_VMA, 0,
+					mm, p.arg.start, p.arg.end);
+		mmu_notifier_invalidate_range_start(&range);
+	}
+
+	for (walk_start = p.arg.start; walk_start < p.arg.end;
+			walk_start = p.arg.walk_end) {
+		long n_out;
+
+		if (fatal_signal_pending(current)) {
+			ret = -EINTR;
+			break;
+		}
+
+		ret = mmap_read_lock_killable(mm);
+		if (ret)
+			break;
+		ret = walk_page_range(mm, walk_start, p.arg.end,
+				      &pagemap_scan_ops, &p);
+		mmap_read_unlock(mm);
+
+		n_out = pagemap_scan_flush_buffer(&p);
+		if (n_out < 0)
+			ret = n_out;
+		else
+			n_ranges_out += n_out;
+
+		if (ret != -ENOSPC)
+			break;
+
+		if (p.arg.vec_len == 0 || p.found_pages == p.arg.max_pages)
+			break;
+	}
+
+	/* ENOSPC signifies early stop (buffer full) from the walk. */
+	if (!ret || ret == -ENOSPC)
+		ret = n_ranges_out;
+
+	/* The walk_end isn't set when ret is zero */
+	if (!p.arg.walk_end)
+		p.arg.walk_end = p.arg.end;
+	if (pagemap_scan_writeback_args(&p.arg, uarg))
+		ret = -EFAULT;
+
+	if (p.arg.flags & PM_SCAN_WP_MATCHING)
+		mmu_notifier_invalidate_range_end(&range);
+
+	kfree(p.vec_buf);
+	return ret;
+}
+
+static long do_pagemap_cmd(struct file *file, unsigned int cmd,
+			   unsigned long arg)
+{
+	struct mm_struct *mm = file->private_data;
+
+	switch (cmd) {
+	case PAGEMAP_SCAN:
+		return do_pagemap_scan(mm, arg);
+
+	default:
+		return -EINVAL;
+	}
+}
+
 const struct file_operations proc_pagemap_operations = {
 	.llseek		= mem_lseek, /* borrow this */
 	.read		= pagemap_read,
 	.open		= pagemap_open,
 	.release	= pagemap_release,
+	.unlocked_ioctl = do_pagemap_cmd,
+	.compat_ioctl	= do_pagemap_cmd,
 };
 #endif /* CONFIG_PROC_PAGE_MONITOR */
 
diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index e5b9f7e62eeb..205469aa0613 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -280,6 +280,7 @@ long hugetlb_change_protection(struct vm_area_struct *vma,
 		unsigned long cp_flags);
 
 bool is_hugetlb_entry_migration(pte_t pte);
+bool is_hugetlb_entry_hwpoisoned(pte_t pte);
 void hugetlb_unshare_all_pmds(struct vm_area_struct *vma);
 
 #else /* !CONFIG_HUGETLB_PAGE */
diff --git a/include/linux/userfaultfd_k.h b/include/linux/userfaultfd_k.h
index c98df391bfd8..f2dc19f40d05 100644
--- a/include/linux/userfaultfd_k.h
+++ b/include/linux/userfaultfd_k.h
@@ -221,6 +221,13 @@ static inline vm_fault_t handle_userfault(struct vm_fault *vmf,
 	return VM_FAULT_SIGBUS;
 }
 
+static inline long uffd_wp_range(struct vm_area_struct *vma,
+				 unsigned long start, unsigned long len,
+				 bool enable_wp)
+{
+	return false;
+}
+
 static inline bool is_mergeable_vm_userfaultfd_ctx(struct vm_area_struct *vma,
 					struct vm_userfaultfd_ctx vm_ctx)
 {
diff --git a/include/uapi/linux/fs.h b/include/uapi/linux/fs.h
index b7b56871029c..da43810b7485 100644
--- a/include/uapi/linux/fs.h
+++ b/include/uapi/linux/fs.h
@@ -305,4 +305,63 @@ typedef int __bitwise __kernel_rwf_t;
 #define RWF_SUPPORTED	(RWF_HIPRI | RWF_DSYNC | RWF_SYNC | RWF_NOWAIT |\
 			 RWF_APPEND)
 
+/* Pagemap ioctl */
+#define PAGEMAP_SCAN	_IOWR('f', 16, struct pm_scan_arg)
+
+/* Bitmasks provided in pm_scan_args masks and reported in page_region.categories. */
+#define PAGE_IS_WPALLOWED	(1 << 0)
+#define PAGE_IS_WRITTEN		(1 << 1)
+#define PAGE_IS_FILE		(1 << 2)
+#define PAGE_IS_PRESENT		(1 << 3)
+#define PAGE_IS_SWAPPED		(1 << 4)
+#define PAGE_IS_PFNZERO		(1 << 5)
+#define PAGE_IS_HUGE		(1 << 6)
+
+/*
+ * struct page_region - Page region with flags
+ * @start:	Start of the region
+ * @end:	End of the region (exclusive)
+ * @categories:	PAGE_IS_* category bitmask for the region
+ */
+struct page_region {
+	__u64 start;
+	__u64 end;
+	__u64 categories;
+};
+
+/* Flags for PAGEMAP_SCAN ioctl */
+#define PM_SCAN_WP_MATCHING	(1 << 0)	/* Write protect the pages matched. */
+#define PM_SCAN_CHECK_WPASYNC	(1 << 1)	/* Abort the scan when a non-WP-enabled page is found. */
+
+/*
+ * struct pm_scan_arg - Pagemap ioctl argument
+ * @size:		Size of the structure
+ * @flags:		Flags for the IOCTL
+ * @start:		Starting address of the region
+ * @end:		Ending address of the region
+ * @walk_end		Address where the scan stopped (written by kernel).
+ *			walk_end == end (address tags cleared) informs that the scan completed on entire range.
+ * @vec:		Address of page_region struct array for output
+ * @vec_len:		Length of the page_region struct array
+ * @max_pages:		Optional limit for number of returned pages (0 = disabled)
+ * @category_inverted:	PAGE_IS_* categories which values match if 0 instead of 1
+ * @category_mask:	Skip pages for which any category doesn't match
+ * @category_anyof_mask: Skip pages for which no category matches
+ * @return_mask:	PAGE_IS_* categories that are to be reported in `page_region`s returned
+ */
+struct pm_scan_arg {
+	__u64 size;
+	__u64 flags;
+	__u64 start;
+	__u64 end;
+	__u64 walk_end;
+	__u64 vec;
+	__u64 vec_len;
+	__u64 max_pages;
+	__u64 category_inverted;
+	__u64 category_mask;
+	__u64 category_anyof_mask;
+	__u64 return_mask;
+};
+
 #endif /* _UAPI_LINUX_FS_H */
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index bc654b36df9f..2878e0e6bac5 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -5044,7 +5044,7 @@ bool is_hugetlb_entry_migration(pte_t pte)
 		return false;
 }
 
-static bool is_hugetlb_entry_hwpoisoned(pte_t pte)
+bool is_hugetlb_entry_hwpoisoned(pte_t pte)
 {
 	swp_entry_t swp;
 
@@ -6266,7 +6266,8 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 		}
 
 		entry = huge_pte_clear_uffd_wp(entry);
-		set_huge_pte_at(mm, haddr, ptep, entry);
+		set_huge_pte_at(mm, haddr, ptep, entry,
+				huge_page_size(hstate_vma(vma)));
 		/* Fallthrough to CoW */
 	}
 
-- 
cgit v1.2.3


From ee615d4585cfc305bf6c218a62123c3051f8b4a3 Mon Sep 17 00:00:00 2001
From: Hugh Dickins <hughd@google.com>
Date: Fri, 29 Sep 2023 20:25:38 -0700
Subject: shmem: shrink shmem_inode_info: dir_offsets in a union

Patch series "shmem,tmpfs: general maintenance".

Mostly just cosmetic mods in mm/shmem.c, but the last two enforcing the
"size=" limit better.  8/8 goes into percpu counter territory, and could
stand alone.


This patch (of 8):

Shave 32 bytes off (the 64-bit) shmem_inode_info.  There was a 4-byte
pahole after stop_eviction, better filled by fsflags.  And the 24-byte
dir_offsets can only be used by directories, whereas shrinklist and
swaplist only by shmem_mapping() inodes (regular files or long symlinks):
so put those into a union.  No change in mm/shmem.c is required for this.

Link: https://lkml.kernel.org/r/c7441dc6-f3bb-dd60-c670-9f5cbd9f266@google.com
Link: https://lkml.kernel.org/r/86ebb4b-c571-b9e8-27f5-cb82ec50357e@google.com
Signed-off-by: Hugh Dickins <hughd@google.com>
Reviewed-by: Chuck Lever <chuck.lever@oracle.com>
Reviewed-by: Jan Kara <jack@suse.cz>
Cc: Axel Rasmussen <axelrasmussen@google.com>
Cc: Carlos Maiolino <cem@kernel.org>
Cc: Christian Brauner <brauner@kernel.org>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Darrick J. Wong <djwong@kernel.org>
Cc: Dave Chinner <dchinner@redhat.com>
Cc: Tim Chen <tim.c.chen@intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/shmem_fs.h | 16 ++++++++++------
 1 file changed, 10 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/shmem_fs.h b/include/linux/shmem_fs.h
index 6b0c626620f5..2caa6b86106a 100644
--- a/include/linux/shmem_fs.h
+++ b/include/linux/shmem_fs.h
@@ -23,18 +23,22 @@ struct shmem_inode_info {
 	unsigned long		flags;
 	unsigned long		alloced;	/* data pages alloced to file */
 	unsigned long		swapped;	/* subtotal assigned to swap */
-	pgoff_t			fallocend;	/* highest fallocate endindex */
-	struct list_head        shrinklist;     /* shrinkable hpage inodes */
-	struct list_head	swaplist;	/* chain of maybes on swap */
+	union {
+	    struct offset_ctx	dir_offsets;	/* stable directory offsets */
+	    struct {
+		struct list_head shrinklist;	/* shrinkable hpage inodes */
+		struct list_head swaplist;	/* chain of maybes on swap */
+	    };
+	};
+	struct timespec64	i_crtime;	/* file creation time */
 	struct shared_policy	policy;		/* NUMA memory alloc policy */
 	struct simple_xattrs	xattrs;		/* list of xattrs */
+	pgoff_t			fallocend;	/* highest fallocate endindex */
+	unsigned int		fsflags;	/* for FS_IOC_[SG]ETFLAGS */
 	atomic_t		stop_eviction;	/* hold when working on inode */
-	struct timespec64	i_crtime;	/* file creation time */
-	unsigned int		fsflags;	/* flags for FS_IOC_[SG]ETFLAGS */
 #ifdef CONFIG_TMPFS_QUOTA
 	struct dquot		*i_dquot[MAXQUOTAS];
 #endif
-	struct offset_ctx	dir_offsets;	/* stable entry offsets */
 	struct inode		vfs_inode;
 };
 
-- 
cgit v1.2.3


From beb9868628445306958fd7b2da1cd369a4a381cc Mon Sep 17 00:00:00 2001
From: Hugh Dickins <hughd@google.com>
Date: Fri, 29 Sep 2023 20:42:45 -0700
Subject: shmem,percpu_counter: add _limited_add(fbc, limit, amount)

Percpu counter's compare and add are separate functions: without locking
around them (which would defeat their purpose), it has been possible to
overflow the intended limit.  Imagine all the other CPUs fallocating tmpfs
huge pages to the limit, in between this CPU's compare and its add.

I have not seen reports of that happening; but tmpfs's recent addition of
dquot_alloc_block_nodirty() in between the compare and the add makes it
even more likely, and I'd be uncomfortable to leave it unfixed.

Introduce percpu_counter_limited_add(fbc, limit, amount) to prevent it.

I believe this implementation is correct, and slightly more efficient than
the combination of compare and add (taking the lock once rather than twice
when nearing full - the last 128MiB of a tmpfs volume on a machine with
128 CPUs and 4KiB pages); but it does beg for a better design - when
nearing full, there is no new batching, but the costly percpu counter sum
across CPUs still has to be done, while locked.

Follow __percpu_counter_sum()'s example, including cpu_dying_mask as well
as cpu_online_mask: but shouldn't __percpu_counter_compare() and
__percpu_counter_limited_add() then be adding a num_dying_cpus() to
num_online_cpus(), when they calculate the maximum which could be held
across CPUs?  But the times when it matters would be vanishingly rare.

Link: https://lkml.kernel.org/r/bb817848-2d19-bcc8-39ca-ea179af0f0b4@google.com
Signed-off-by: Hugh Dickins <hughd@google.com>
Reviewed-by: Jan Kara <jack@suse.cz>
Cc: Tim Chen <tim.c.chen@intel.com>
Cc: Dave Chinner <dchinner@redhat.com>
Cc: Darrick J. Wong <djwong@kernel.org>
Cc: Axel Rasmussen <axelrasmussen@google.com>
Cc: Carlos Maiolino <cem@kernel.org>
Cc: Christian Brauner <brauner@kernel.org>
Cc: Chuck Lever <chuck.lever@oracle.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/percpu_counter.h | 23 ++++++++++++++++++
 lib/percpu_counter.c           | 53 ++++++++++++++++++++++++++++++++++++++++++
 mm/shmem.c                     | 10 ++++----
 3 files changed, 81 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/percpu_counter.h b/include/linux/percpu_counter.h
index d01351b1526f..8cb7c071bd5c 100644
--- a/include/linux/percpu_counter.h
+++ b/include/linux/percpu_counter.h
@@ -57,6 +57,8 @@ void percpu_counter_add_batch(struct percpu_counter *fbc, s64 amount,
 			      s32 batch);
 s64 __percpu_counter_sum(struct percpu_counter *fbc);
 int __percpu_counter_compare(struct percpu_counter *fbc, s64 rhs, s32 batch);
+bool __percpu_counter_limited_add(struct percpu_counter *fbc, s64 limit,
+				  s64 amount, s32 batch);
 void percpu_counter_sync(struct percpu_counter *fbc);
 
 static inline int percpu_counter_compare(struct percpu_counter *fbc, s64 rhs)
@@ -69,6 +71,13 @@ static inline void percpu_counter_add(struct percpu_counter *fbc, s64 amount)
 	percpu_counter_add_batch(fbc, amount, percpu_counter_batch);
 }
 
+static inline bool
+percpu_counter_limited_add(struct percpu_counter *fbc, s64 limit, s64 amount)
+{
+	return __percpu_counter_limited_add(fbc, limit, amount,
+					    percpu_counter_batch);
+}
+
 /*
  * With percpu_counter_add_local() and percpu_counter_sub_local(), counts
  * are accumulated in local per cpu counter and not in fbc->count until
@@ -185,6 +194,20 @@ percpu_counter_add(struct percpu_counter *fbc, s64 amount)
 	local_irq_restore(flags);
 }
 
+static inline bool
+percpu_counter_limited_add(struct percpu_counter *fbc, s64 limit, s64 amount)
+{
+	unsigned long flags;
+	s64 count;
+
+	local_irq_save(flags);
+	count = fbc->count + amount;
+	if (count <= limit)
+		fbc->count = count;
+	local_irq_restore(flags);
+	return count <= limit;
+}
+
 /* non-SMP percpu_counter_add_local is the same with percpu_counter_add */
 static inline void
 percpu_counter_add_local(struct percpu_counter *fbc, s64 amount)
diff --git a/lib/percpu_counter.c b/lib/percpu_counter.c
index 9073430dc865..58a3392f471b 100644
--- a/lib/percpu_counter.c
+++ b/lib/percpu_counter.c
@@ -278,6 +278,59 @@ int __percpu_counter_compare(struct percpu_counter *fbc, s64 rhs, s32 batch)
 }
 EXPORT_SYMBOL(__percpu_counter_compare);
 
+/*
+ * Compare counter, and add amount if the total is within limit.
+ * Return true if amount was added, false if it would exceed limit.
+ */
+bool __percpu_counter_limited_add(struct percpu_counter *fbc,
+				  s64 limit, s64 amount, s32 batch)
+{
+	s64 count;
+	s64 unknown;
+	unsigned long flags;
+	bool good;
+
+	if (amount > limit)
+		return false;
+
+	local_irq_save(flags);
+	unknown = batch * num_online_cpus();
+	count = __this_cpu_read(*fbc->counters);
+
+	/* Skip taking the lock when safe */
+	if (abs(count + amount) <= batch &&
+	    fbc->count + unknown <= limit) {
+		this_cpu_add(*fbc->counters, amount);
+		local_irq_restore(flags);
+		return true;
+	}
+
+	raw_spin_lock(&fbc->lock);
+	count = fbc->count + amount;
+
+	/* Skip percpu_counter_sum() when safe */
+	if (count + unknown > limit) {
+		s32 *pcount;
+		int cpu;
+
+		for_each_cpu_or(cpu, cpu_online_mask, cpu_dying_mask) {
+			pcount = per_cpu_ptr(fbc->counters, cpu);
+			count += *pcount;
+		}
+	}
+
+	good = count <= limit;
+	if (good) {
+		count = __this_cpu_read(*fbc->counters);
+		fbc->count += count + amount;
+		__this_cpu_sub(*fbc->counters, count);
+	}
+
+	raw_spin_unlock(&fbc->lock);
+	local_irq_restore(flags);
+	return good;
+}
+
 static int __init percpu_counter_startup(void)
 {
 	int ret;
diff --git a/mm/shmem.c b/mm/shmem.c
index 269cd3c1110f..61b170324e5c 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -217,15 +217,15 @@ static int shmem_inode_acct_blocks(struct inode *inode, long pages)
 
 	might_sleep();	/* when quotas */
 	if (sbinfo->max_blocks) {
-		if (percpu_counter_compare(&sbinfo->used_blocks,
-					   sbinfo->max_blocks - pages) > 0)
+		if (!percpu_counter_limited_add(&sbinfo->used_blocks,
+						sbinfo->max_blocks, pages))
 			goto unacct;
 
 		err = dquot_alloc_block_nodirty(inode, pages);
-		if (err)
+		if (err) {
+			percpu_counter_sub(&sbinfo->used_blocks, pages);
 			goto unacct;
-
-		percpu_counter_add(&sbinfo->used_blocks, pages);
+		}
 	} else {
 		err = dquot_alloc_block_nodirty(inode, pages);
 		if (err)
-- 
cgit v1.2.3


From 1431996bf9088ee59f8017637ab9a7f89909ae63 Mon Sep 17 00:00:00 2001
From: Hugh Dickins <hughd@google.com>
Date: Wed, 11 Oct 2023 21:40:09 -0700
Subject: percpu_counter: extend _limited_add() to negative amounts

Though tmpfs does not need it, percpu_counter_limited_add() can be twice
as useful if it works sensibly with negative amounts (subs) - typically
decrements towards a limit of 0 or nearby: as suggested by Dave Chinner.

And in the course of that reworking, skip the percpu counter sum if it is
already obvious that the limit would be passed: as suggested by Tim Chen.

Extend the comment above __percpu_counter_limited_add(), defining the
behaviour with positive and negative amounts, allowing negative limits,
but not bothering about overflow beyond S64_MAX.

Link: https://lkml.kernel.org/r/8f86083b-c452-95d4-365b-f16a2e4ebcd4@google.com
Signed-off-by: Hugh Dickins <hughd@google.com>
Cc: Axel Rasmussen <axelrasmussen@google.com>
Cc: Carlos Maiolino <cem@kernel.org>
Cc: Christian Brauner <brauner@kernel.org>
Cc: Chuck Lever <chuck.lever@oracle.com>
Cc: Darrick J. Wong <djwong@kernel.org>
Cc: Dave Chinner <dchinner@redhat.com>
Cc: Jan Kara <jack@suse.cz>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Tim Chen <tim.c.chen@intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/percpu_counter.h | 11 +++++++--
 lib/percpu_counter.c           | 54 +++++++++++++++++++++++++++++++-----------
 2 files changed, 49 insertions(+), 16 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/percpu_counter.h b/include/linux/percpu_counter.h
index 8cb7c071bd5c..3a44dd1e33d2 100644
--- a/include/linux/percpu_counter.h
+++ b/include/linux/percpu_counter.h
@@ -198,14 +198,21 @@ static inline bool
 percpu_counter_limited_add(struct percpu_counter *fbc, s64 limit, s64 amount)
 {
 	unsigned long flags;
+	bool good = false;
 	s64 count;
 
+	if (amount == 0)
+		return true;
+
 	local_irq_save(flags);
 	count = fbc->count + amount;
-	if (count <= limit)
+	if ((amount > 0 && count <= limit) ||
+	    (amount < 0 && count >= limit)) {
 		fbc->count = count;
+		good = true;
+	}
 	local_irq_restore(flags);
-	return count <= limit;
+	return good;
 }
 
 /* non-SMP percpu_counter_add_local is the same with percpu_counter_add */
diff --git a/lib/percpu_counter.c b/lib/percpu_counter.c
index 58a3392f471b..44dd133594d4 100644
--- a/lib/percpu_counter.c
+++ b/lib/percpu_counter.c
@@ -279,8 +279,16 @@ int __percpu_counter_compare(struct percpu_counter *fbc, s64 rhs, s32 batch)
 EXPORT_SYMBOL(__percpu_counter_compare);
 
 /*
- * Compare counter, and add amount if the total is within limit.
- * Return true if amount was added, false if it would exceed limit.
+ * Compare counter, and add amount if total is: less than or equal to limit if
+ * amount is positive, or greater than or equal to limit if amount is negative.
+ * Return true if amount is added, or false if total would be beyond the limit.
+ *
+ * Negative limit is allowed, but unusual.
+ * When negative amounts (subs) are given to percpu_counter_limited_add(),
+ * the limit would most naturally be 0 - but other limits are also allowed.
+ *
+ * Overflow beyond S64_MAX is not allowed for: counter, limit and amount
+ * are all assumed to be sane (far from S64_MIN and S64_MAX).
  */
 bool __percpu_counter_limited_add(struct percpu_counter *fbc,
 				  s64 limit, s64 amount, s32 batch)
@@ -288,10 +296,10 @@ bool __percpu_counter_limited_add(struct percpu_counter *fbc,
 	s64 count;
 	s64 unknown;
 	unsigned long flags;
-	bool good;
+	bool good = false;
 
-	if (amount > limit)
-		return false;
+	if (amount == 0)
+		return true;
 
 	local_irq_save(flags);
 	unknown = batch * num_online_cpus();
@@ -299,7 +307,8 @@ bool __percpu_counter_limited_add(struct percpu_counter *fbc,
 
 	/* Skip taking the lock when safe */
 	if (abs(count + amount) <= batch &&
-	    fbc->count + unknown <= limit) {
+	    ((amount > 0 && fbc->count + unknown <= limit) ||
+	     (amount < 0 && fbc->count - unknown >= limit))) {
 		this_cpu_add(*fbc->counters, amount);
 		local_irq_restore(flags);
 		return true;
@@ -309,7 +318,19 @@ bool __percpu_counter_limited_add(struct percpu_counter *fbc,
 	count = fbc->count + amount;
 
 	/* Skip percpu_counter_sum() when safe */
-	if (count + unknown > limit) {
+	if (amount > 0) {
+		if (count - unknown > limit)
+			goto out;
+		if (count + unknown <= limit)
+			good = true;
+	} else {
+		if (count + unknown < limit)
+			goto out;
+		if (count - unknown >= limit)
+			good = true;
+	}
+
+	if (!good) {
 		s32 *pcount;
 		int cpu;
 
@@ -317,15 +338,20 @@ bool __percpu_counter_limited_add(struct percpu_counter *fbc,
 			pcount = per_cpu_ptr(fbc->counters, cpu);
 			count += *pcount;
 		}
+		if (amount > 0) {
+			if (count > limit)
+				goto out;
+		} else {
+			if (count < limit)
+				goto out;
+		}
+		good = true;
 	}
 
-	good = count <= limit;
-	if (good) {
-		count = __this_cpu_read(*fbc->counters);
-		fbc->count += count + amount;
-		__this_cpu_sub(*fbc->counters, count);
-	}
-
+	count = __this_cpu_read(*fbc->counters);
+	fbc->count += count + amount;
+	__this_cpu_sub(*fbc->counters, count);
+out:
 	raw_spin_unlock(&fbc->lock);
 	local_irq_restore(flags);
 	return good;
-- 
cgit v1.2.3


From 069686255c16a75b6a796e42df47f5af27b496a4 Mon Sep 17 00:00:00 2001
From: David Hildenbrand <david@redhat.com>
Date: Mon, 2 Oct 2023 16:29:48 +0200
Subject: mm/rmap: convert page_move_anon_rmap() to folio_move_anon_rmap()

Let's convert it to consume a folio.

[akpm@linux-foundation.org: fix kerneldoc]
Link: https://lkml.kernel.org/r/20231002142949.235104-3-david@redhat.com
Signed-off-by: David Hildenbrand <david@redhat.com>
Reviewed-by: Suren Baghdasaryan <surenb@google.com>
Reviewed-by: Vishal Moola (Oracle) <vishal.moola@gmail.com>
Cc: Mike Kravetz <mike.kravetz@oracle.com>
Cc: Muchun Song <muchun.song@linux.dev>
Cc: Matthew Wilcox <willy@infradead.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/rmap.h |  2 +-
 mm/huge_memory.c     |  2 +-
 mm/hugetlb.c         |  2 +-
 mm/memory.c          |  2 +-
 mm/rmap.c            | 16 +++++++---------
 5 files changed, 11 insertions(+), 13 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/rmap.h b/include/linux/rmap.h
index d22f4d21a11c..b26fe858fd44 100644
--- a/include/linux/rmap.h
+++ b/include/linux/rmap.h
@@ -189,7 +189,7 @@ typedef int __bitwise rmap_t;
 /*
  * rmap interfaces called when adding or removing pte of page
  */
-void page_move_anon_rmap(struct page *, struct vm_area_struct *);
+void folio_move_anon_rmap(struct folio *, struct vm_area_struct *);
 void page_add_anon_rmap(struct page *, struct vm_area_struct *,
 		unsigned long address, rmap_t flags);
 void page_add_new_anon_rmap(struct page *, struct vm_area_struct *,
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 76ead290f1c8..51a66eb48938 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -1376,7 +1376,7 @@ vm_fault_t do_huge_pmd_wp_page(struct vm_fault *vmf)
 	if (folio_ref_count(folio) == 1) {
 		pmd_t entry;
 
-		page_move_anon_rmap(page, vma);
+		folio_move_anon_rmap(folio, vma);
 		SetPageAnonExclusive(page);
 		folio_unlock(folio);
 reuse:
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 35d924f74972..7ad9d2159da4 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -5653,7 +5653,7 @@ retry_avoidcopy:
 	 */
 	if (folio_mapcount(old_folio) == 1 && folio_test_anon(old_folio)) {
 		if (!PageAnonExclusive(&old_folio->page)) {
-			page_move_anon_rmap(&old_folio->page, vma);
+			folio_move_anon_rmap(old_folio, vma);
 			SetPageAnonExclusive(&old_folio->page);
 		}
 		if (likely(!unshare))
diff --git a/mm/memory.c b/mm/memory.c
index 21fba1c9a6c7..6c67828f934c 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -3480,7 +3480,7 @@ static vm_fault_t do_wp_page(struct vm_fault *vmf)
 		 * and the folio is locked, it's dark out, and we're wearing
 		 * sunglasses. Hit it.
 		 */
-		page_move_anon_rmap(vmf->page, vma);
+		folio_move_anon_rmap(folio, vma);
 		SetPageAnonExclusive(vmf->page);
 		folio_unlock(folio);
 reuse:
diff --git a/mm/rmap.c b/mm/rmap.c
index 6f1ea1491118..7a27a2b41802 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -1128,19 +1128,17 @@ int folio_total_mapcount(struct folio *folio)
 }
 
 /**
- * page_move_anon_rmap - move a page to our anon_vma
- * @page:	the page to move to our anon_vma
- * @vma:	the vma the page belongs to
+ * folio_move_anon_rmap - move a folio to our anon_vma
+ * @folio:	The folio to move to our anon_vma
+ * @vma:	The vma the folio belongs to
  *
- * When a page belongs exclusively to one process after a COW event,
- * that page can be moved into the anon_vma that belongs to just that
- * process, so the rmap code will not search the parent or sibling
- * processes.
+ * When a folio belongs exclusively to one process after a COW event,
+ * that folio can be moved into the anon_vma that belongs to just that
+ * process, so the rmap code will not search the parent or sibling processes.
  */
-void page_move_anon_rmap(struct page *page, struct vm_area_struct *vma)
+void folio_move_anon_rmap(struct folio *folio, struct vm_area_struct *vma)
 {
 	void *anon_vma = vma->anon_vma;
-	struct folio *folio = page_folio(page);
 
 	VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
 	VM_BUG_ON_VMA(!anon_vma, vma);
-- 
cgit v1.2.3


From c43cfa42541c04a3a94312e39ab81c41ba431277 Mon Sep 17 00:00:00 2001
From: Lorenzo Stoakes <lstoakes@gmail.com>
Date: Tue, 3 Oct 2023 00:14:51 +0100
Subject: mm: make __access_remote_vm() static

Patch series "various improvements to the GUP interface", v2.

A series of fixes to simplify and improve the GUP interface with an eye to
providing groundwork to future improvements:-

* __access_remote_vm() and access_remote_vm() are functionally identical,
  so make the former static such that in future we can potentially change
  the external-facing implementation details of this function.

* Extend is_valid_gup_args() to cover the missing FOLL_TOUCH case, and
  simplify things by defining INTERNAL_GUP_FLAGS to check against.

* Adjust __get_user_pages_locked() to explicitly treat a failure to pin any
  pages as an error in all circumstances other than FOLL_NOWAIT being
  specified, bringing it in line with the nommu implementation of this
  function.

* (With many thanks to Arnd who suggested this in the first instance)
  Update get_user_page_vma_remote() to explicitly only return a page or an
  error, simplifying the interface and avoiding the questionable
  IS_ERR_OR_NULL() pattern.


This patch (of 4):

access_remote_vm() passes through parameters to __access_remote_vm()
directly, so remove the __access_remote_vm() function from mm.h and use
access_remote_vm() in the one caller that needs it (ptrace_access_vm()).

This allows future adjustments to the GUP-internal __access_remote_vm()
function while keeping the access_remote_vm() function stable.

Link: https://lkml.kernel.org/r/cover.1696288092.git.lstoakes@gmail.com
Link: https://lkml.kernel.org/r/f7877c5039ce1c202a514a8aeeefc5cdd5e32d19.1696288092.git.lstoakes@gmail.com
Signed-off-by: Lorenzo Stoakes <lstoakes@gmail.com>
Reviewed-by: Arnd Bergmann <arnd@arndb.de>
Reviewed-by: David Hildenbrand <david@redhat.com>
Reviewed-by: Jason Gunthorpe <jgg@nvidia.com>
Cc: Adrian Hunter <adrian.hunter@intel.com>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Arnaldo Carvalho de Melo <acme@kernel.org>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Ian Rogers <irogers@google.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: John Hubbard <jhubbard@nvidia.com>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Richard Cochran <richardcochran@gmail.com>
Cc: Will Deacon <will@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/mm.h | 2 --
 kernel/ptrace.c    | 2 +-
 mm/memory.c        | 4 ++--
 mm/nommu.c         | 4 ++--
 4 files changed, 5 insertions(+), 7 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 52c40b3d0813..7b89f7bd420d 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -2415,8 +2415,6 @@ extern int access_process_vm(struct task_struct *tsk, unsigned long addr,
 		void *buf, int len, unsigned int gup_flags);
 extern int access_remote_vm(struct mm_struct *mm, unsigned long addr,
 		void *buf, int len, unsigned int gup_flags);
-extern int __access_remote_vm(struct mm_struct *mm, unsigned long addr,
-			      void *buf, int len, unsigned int gup_flags);
 
 long get_user_pages_remote(struct mm_struct *mm,
 			   unsigned long start, unsigned long nr_pages,
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index 443057bee87c..d8b5e13a2229 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -59,7 +59,7 @@ int ptrace_access_vm(struct task_struct *tsk, unsigned long addr,
 		return 0;
 	}
 
-	ret = __access_remote_vm(mm, addr, buf, len, gup_flags);
+	ret = access_remote_vm(mm, addr, buf, len, gup_flags);
 	mmput(mm);
 
 	return ret;
diff --git a/mm/memory.c b/mm/memory.c
index 22784d3b886d..e47c36c0aef0 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -5791,8 +5791,8 @@ EXPORT_SYMBOL_GPL(generic_access_phys);
 /*
  * Access another process' address space as given in mm.
  */
-int __access_remote_vm(struct mm_struct *mm, unsigned long addr, void *buf,
-		       int len, unsigned int gup_flags)
+static int __access_remote_vm(struct mm_struct *mm, unsigned long addr,
+			      void *buf, int len, unsigned int gup_flags)
 {
 	void *old_buf = buf;
 	int write = gup_flags & FOLL_WRITE;
diff --git a/mm/nommu.c b/mm/nommu.c
index 7f9e9e5a0e12..f9553579389b 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -1651,8 +1651,8 @@ vm_fault_t filemap_map_pages(struct vm_fault *vmf,
 }
 EXPORT_SYMBOL(filemap_map_pages);
 
-int __access_remote_vm(struct mm_struct *mm, unsigned long addr, void *buf,
-		       int len, unsigned int gup_flags)
+static int __access_remote_vm(struct mm_struct *mm, unsigned long addr,
+			      void *buf, int len, unsigned int gup_flags)
 {
 	struct vm_area_struct *vma;
 	int write = gup_flags & FOLL_WRITE;
-- 
cgit v1.2.3


From 6a1960b8a8773324d870fa32ba68ff3106523a95 Mon Sep 17 00:00:00 2001
From: Lorenzo Stoakes <lstoakes@gmail.com>
Date: Tue, 3 Oct 2023 00:14:54 +0100
Subject: mm/gup: adapt get_user_page_vma_remote() to never return NULL

get_user_pages_remote() will never return 0 except in the case of
FOLL_NOWAIT being specified, which we explicitly disallow.

This simplifies error handling for the caller and avoids the awkwardness
of dealing with both errors and failing to pin.  Failing to pin here is an
error.

Link: https://lkml.kernel.org/r/00319ce292d27b3aae76a0eb220ce3f528187508.1696288092.git.lstoakes@gmail.com
Signed-off-by: Lorenzo Stoakes <lstoakes@gmail.com>
Suggested-by: Arnd Bergmann <arnd@arndb.de>
Reviewed-by: Arnd Bergmann <arnd@arndb.de>
Acked-by: Catalin Marinas <catalin.marinas@arm.com>
Reviewed-by: David Hildenbrand <david@redhat.com>
Reviewed-by: Jason Gunthorpe <jgg@nvidia.com>
Cc: Adrian Hunter <adrian.hunter@intel.com>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Arnaldo Carvalho de Melo <acme@kernel.org>
Cc: Ian Rogers <irogers@google.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: John Hubbard <jhubbard@nvidia.com>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Richard Cochran <richardcochran@gmail.com>
Cc: Will Deacon <will@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 arch/arm64/kernel/mte.c |  4 ++--
 include/linux/mm.h      | 12 +++++++++---
 kernel/events/uprobes.c |  4 ++--
 mm/memory.c             |  3 +--
 4 files changed, 14 insertions(+), 9 deletions(-)

(limited to 'include/linux')

diff --git a/arch/arm64/kernel/mte.c b/arch/arm64/kernel/mte.c
index 4edecaac8f91..8878b392df58 100644
--- a/arch/arm64/kernel/mte.c
+++ b/arch/arm64/kernel/mte.c
@@ -411,8 +411,8 @@ static int __access_remote_tags(struct mm_struct *mm, unsigned long addr,
 		struct page *page = get_user_page_vma_remote(mm, addr,
 							     gup_flags, &vma);
 
-		if (IS_ERR_OR_NULL(page)) {
-			err = page == NULL ? -EIO : PTR_ERR(page);
+		if (IS_ERR(page)) {
+			err = PTR_ERR(page);
 			break;
 		}
 
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 7b89f7bd420d..fa608cba041f 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -2425,6 +2425,9 @@ long pin_user_pages_remote(struct mm_struct *mm,
 			   unsigned int gup_flags, struct page **pages,
 			   int *locked);
 
+/*
+ * Retrieves a single page alongside its VMA. Does not support FOLL_NOWAIT.
+ */
 static inline struct page *get_user_page_vma_remote(struct mm_struct *mm,
 						    unsigned long addr,
 						    int gup_flags,
@@ -2432,12 +2435,15 @@ static inline struct page *get_user_page_vma_remote(struct mm_struct *mm,
 {
 	struct page *page;
 	struct vm_area_struct *vma;
-	int got = get_user_pages_remote(mm, addr, 1, gup_flags, &page, NULL);
+	int got;
+
+	if (WARN_ON_ONCE(unlikely(gup_flags & FOLL_NOWAIT)))
+		return ERR_PTR(-EINVAL);
+
+	got = get_user_pages_remote(mm, addr, 1, gup_flags, &page, NULL);
 
 	if (got < 0)
 		return ERR_PTR(got);
-	if (got == 0)
-		return NULL;
 
 	vma = vma_lookup(mm, addr);
 	if (WARN_ON_ONCE(!vma)) {
diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
index 3048589e2e85..435aac1d8c27 100644
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@ -474,8 +474,8 @@ retry:
 		gup_flags |= FOLL_SPLIT_PMD;
 	/* Read the page with vaddr into memory */
 	old_page = get_user_page_vma_remote(mm, vaddr, gup_flags, &vma);
-	if (IS_ERR_OR_NULL(old_page))
-		return old_page ? PTR_ERR(old_page) : 0;
+	if (IS_ERR(old_page))
+		return PTR_ERR(old_page);
 
 	ret = verify_opcode(old_page, vaddr, &opcode);
 	if (ret <= 0)
diff --git a/mm/memory.c b/mm/memory.c
index e47c36c0aef0..1f88e4f6fbf2 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -5815,7 +5815,7 @@ static int __access_remote_vm(struct mm_struct *mm, unsigned long addr,
 		struct page *page = get_user_page_vma_remote(mm, addr,
 							     gup_flags, &vma);
 
-		if (IS_ERR_OR_NULL(page)) {
+		if (IS_ERR(page)) {
 			/* We might need to expand the stack to access it */
 			vma = vma_lookup(mm, addr);
 			if (!vma) {
@@ -5829,7 +5829,6 @@ static int __access_remote_vm(struct mm_struct *mm, unsigned long addr,
 				continue;
 			}
 
-
 			/*
 			 * Check if this is a VM_IO | VM_PFNMAP VMA, which
 			 * we can access using slightly different code.
-- 
cgit v1.2.3


From 0b237047d5a72ffe06c0bdf2f4536f669dcd31c9 Mon Sep 17 00:00:00 2001
From: "Matthew Wilcox (Oracle)" <willy@infradead.org>
Date: Wed, 4 Oct 2023 17:53:03 +0100
Subject: mm: add folio_end_read()

Provide a function for filesystems to call when they have finished reading
an entire folio.

Link: https://lkml.kernel.org/r/20231004165317.1061855-4-willy@infradead.org
Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Albert Ou <aou@eecs.berkeley.edu>
Cc: Alexander Gordeev <agordeev@linux.ibm.com>
Cc: Andreas Dilger <adilger.kernel@dilger.ca>
Cc: Christian Borntraeger <borntraeger@linux.ibm.com>
Cc: Christophe Leroy <christophe.leroy@csgroup.eu>
Cc: Geert Uytterhoeven <geert@linux-m68k.org>
Cc: Heiko Carstens <hca@linux.ibm.com>
Cc: Ivan Kokshaysky <ink@jurassic.park.msu.ru>
Cc: Matt Turner <mattst88@gmail.com>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: Nicholas Piggin <npiggin@gmail.com>
Cc: Palmer Dabbelt <palmer@dabbelt.com>
Cc: Paul Walmsley <paul.walmsley@sifive.com>
Cc: Richard Henderson <richard.henderson@linaro.org>
Cc: Sven Schnelle <svens@linux.ibm.com>
Cc: "Theodore Ts'o" <tytso@mit.edu>
Cc: Thomas Bogendoerfer <tsbogend@alpha.franken.de>
Cc: Vasily Gorbik <gor@linux.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/pagemap.h |  1 +
 mm/filemap.c            | 22 ++++++++++++++++++++++
 2 files changed, 23 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
index 759b29d9a69a..bcc1ea44b4e8 100644
--- a/include/linux/pagemap.h
+++ b/include/linux/pagemap.h
@@ -1101,6 +1101,7 @@ static inline void wait_on_page_locked(struct page *page)
 	folio_wait_locked(page_folio(page));
 }
 
+void folio_end_read(struct folio *folio, bool success);
 void wait_on_page_writeback(struct page *page);
 void folio_wait_writeback(struct folio *folio);
 int folio_wait_writeback_killable(struct folio *folio);
diff --git a/mm/filemap.c b/mm/filemap.c
index cd872fbc6086..1ce78c619294 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -1525,6 +1525,28 @@ void folio_unlock(struct folio *folio)
 }
 EXPORT_SYMBOL(folio_unlock);
 
+/**
+ * folio_end_read - End read on a folio.
+ * @folio: The folio.
+ * @success: True if all reads completed successfully.
+ *
+ * When all reads against a folio have completed, filesystems should
+ * call this function to let the pagecache know that no more reads
+ * are outstanding.  This will unlock the folio and wake up any thread
+ * sleeping on the lock.  The folio will also be marked uptodate if all
+ * reads succeeded.
+ *
+ * Context: May be called from interrupt or process context.  May not be
+ * called from NMI context.
+ */
+void folio_end_read(struct folio *folio, bool success)
+{
+	if (likely(success))
+		folio_mark_uptodate(folio);
+	folio_unlock(folio);
+}
+EXPORT_SYMBOL(folio_end_read);
+
 /**
  * folio_end_private_2 - Clear PG_private_2 and wake any waiters.
  * @folio: The folio.
-- 
cgit v1.2.3


From 0410cd844ed0af3db3cb510d877d62c66d26e5cc Mon Sep 17 00:00:00 2001
From: "Matthew Wilcox (Oracle)" <willy@infradead.org>
Date: Wed, 4 Oct 2023 17:53:15 +0100
Subject: mm: add folio_xor_flags_has_waiters()

Optimise folio_end_read() by setting the uptodate bit at the same time we
clear the unlock bit.  This saves at least one memory barrier and one
write-after-write hazard.

Link: https://lkml.kernel.org/r/20231004165317.1061855-16-willy@infradead.org
Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Albert Ou <aou@eecs.berkeley.edu>
Cc: Alexander Gordeev <agordeev@linux.ibm.com>
Cc: Andreas Dilger <adilger.kernel@dilger.ca>
Cc: Christian Borntraeger <borntraeger@linux.ibm.com>
Cc: Christophe Leroy <christophe.leroy@csgroup.eu>
Cc: Geert Uytterhoeven <geert@linux-m68k.org>
Cc: Heiko Carstens <hca@linux.ibm.com>
Cc: Ivan Kokshaysky <ink@jurassic.park.msu.ru>
Cc: Matt Turner <mattst88@gmail.com>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: Nicholas Piggin <npiggin@gmail.com>
Cc: Palmer Dabbelt <palmer@dabbelt.com>
Cc: Paul Walmsley <paul.walmsley@sifive.com>
Cc: Richard Henderson <richard.henderson@linaro.org>
Cc: Sven Schnelle <svens@linux.ibm.com>
Cc: "Theodore Ts'o" <tytso@mit.edu>
Cc: Thomas Bogendoerfer <tsbogend@alpha.franken.de>
Cc: Vasily Gorbik <gor@linux.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/page-flags.h | 19 +++++++++++++++++++
 mm/filemap.c               | 14 +++++++++++---
 2 files changed, 30 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h
index 5c02720c53a5..a88e64acebfe 100644
--- a/include/linux/page-flags.h
+++ b/include/linux/page-flags.h
@@ -692,6 +692,25 @@ TESTPAGEFLAG_FALSE(Ksm, ksm)
 
 u64 stable_page_flags(struct page *page);
 
+/**
+ * folio_xor_flags_has_waiters - Change some folio flags.
+ * @folio: The folio.
+ * @mask: Bits set in this word will be changed.
+ *
+ * This must only be used for flags which are changed with the folio
+ * lock held.  For example, it is unsafe to use for PG_dirty as that
+ * can be set without the folio lock held.  It can also only be used
+ * on flags which are in the range 0-6 as some of the implementations
+ * only affect those bits.
+ *
+ * Return: Whether there are tasks waiting on the folio.
+ */
+static inline bool folio_xor_flags_has_waiters(struct folio *folio,
+		unsigned long mask)
+{
+	return xor_unlock_is_negative_byte(mask, folio_flags(folio, 0));
+}
+
 /**
  * folio_test_uptodate - Is this folio up to date?
  * @folio: The folio.
diff --git a/mm/filemap.c b/mm/filemap.c
index 458377e9a184..e9c636f57777 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -1497,7 +1497,7 @@ void folio_unlock(struct folio *folio)
 	BUILD_BUG_ON(PG_waiters != 7);
 	BUILD_BUG_ON(PG_locked > 7);
 	VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
-	if (xor_unlock_is_negative_byte(1 << PG_locked, folio_flags(folio, 0)))
+	if (folio_xor_flags_has_waiters(folio, 1 << PG_locked))
 		folio_wake_bit(folio, PG_locked);
 }
 EXPORT_SYMBOL(folio_unlock);
@@ -1518,9 +1518,17 @@ EXPORT_SYMBOL(folio_unlock);
  */
 void folio_end_read(struct folio *folio, bool success)
 {
+	unsigned long mask = 1 << PG_locked;
+
+	/* Must be in bottom byte for x86 to work */
+	BUILD_BUG_ON(PG_uptodate > 7);
+	VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
+	VM_BUG_ON_FOLIO(folio_test_uptodate(folio), folio);
+
 	if (likely(success))
-		folio_mark_uptodate(folio);
-	folio_unlock(folio);
+		mask |= 1 << PG_uptodate;
+	if (folio_xor_flags_has_waiters(folio, mask))
+		folio_wake_bit(folio, PG_locked);
 }
 EXPORT_SYMBOL(folio_end_read);
 
-- 
cgit v1.2.3


From 59838b2566f6d03099743675a2e0f425813078c6 Mon Sep 17 00:00:00 2001
From: Frank van der Linden <fvdl@google.com>
Date: Wed, 4 Oct 2023 15:32:48 +0000
Subject: mm, hugetlb: remove HUGETLB_CGROUP_MIN_ORDER

Originally, hugetlb_cgroup was the only hugetlb user of tail page
structure fields.  So, the code defined and checked against
HUGETLB_CGROUP_MIN_ORDER to make sure pages weren't too small to use.

However, by now, tail page #2 is used to store hugetlb hwpoison and
subpool information as well.  In other words, without that tail page
hugetlb doesn't work.

Acknowledge this fact by getting rid of HUGETLB_CGROUP_MIN_ORDER and
checks against it.  Instead, just check for the minimum viable page order
at hstate creation time.

Link: https://lkml.kernel.org/r/20231004153248.3842997-1-fvdl@google.com
Signed-off-by: Frank van der Linden <fvdl@google.com>
Reviewed-by: Mike Kravetz <mike.kravetz@oracle.com>
Cc: Muchun Song <muchun.song@linux.dev>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/hugetlb_cgroup.h | 11 -----------
 mm/hugetlb.c                   |  2 +-
 mm/hugetlb_cgroup.c            | 20 ++------------------
 3 files changed, 3 insertions(+), 30 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/hugetlb_cgroup.h b/include/linux/hugetlb_cgroup.h
index 3d82d91f49ac..e5d64b8b59c2 100644
--- a/include/linux/hugetlb_cgroup.h
+++ b/include/linux/hugetlb_cgroup.h
@@ -22,13 +22,6 @@ struct resv_map;
 struct file_region;
 
 #ifdef CONFIG_CGROUP_HUGETLB
-/*
- * Minimum page order trackable by hugetlb cgroup.
- * At least 3 pages are necessary for all the tracking information.
- * The second tail page contains all of the hugetlb-specific fields.
- */
-#define HUGETLB_CGROUP_MIN_ORDER order_base_2(__NR_USED_SUBPAGE)
-
 enum hugetlb_memory_event {
 	HUGETLB_MAX,
 	HUGETLB_NR_MEMORY_EVENTS,
@@ -68,8 +61,6 @@ static inline struct hugetlb_cgroup *
 __hugetlb_cgroup_from_folio(struct folio *folio, bool rsvd)
 {
 	VM_BUG_ON_FOLIO(!folio_test_hugetlb(folio), folio);
-	if (folio_order(folio) < HUGETLB_CGROUP_MIN_ORDER)
-		return NULL;
 	if (rsvd)
 		return folio->_hugetlb_cgroup_rsvd;
 	else
@@ -91,8 +82,6 @@ static inline void __set_hugetlb_cgroup(struct folio *folio,
 				       struct hugetlb_cgroup *h_cg, bool rsvd)
 {
 	VM_BUG_ON_FOLIO(!folio_test_hugetlb(folio), folio);
-	if (folio_order(folio) < HUGETLB_CGROUP_MIN_ORDER)
-		return;
 	if (rsvd)
 		folio->_hugetlb_cgroup_rsvd = h_cg;
 	else
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 7ad9d2159da4..e2b1c417b90a 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -4361,7 +4361,7 @@ void __init hugetlb_add_hstate(unsigned int order)
 		return;
 	}
 	BUG_ON(hugetlb_max_hstate >= HUGE_MAX_HSTATE);
-	BUG_ON(order == 0);
+	BUG_ON(order < order_base_2(__NR_USED_SUBPAGE));
 	h = &hstates[hugetlb_max_hstate++];
 	mutex_init(&h->resize_lock);
 	h->order = order;
diff --git a/mm/hugetlb_cgroup.c b/mm/hugetlb_cgroup.c
index dedd2edb076e..aa4486bd3904 100644
--- a/mm/hugetlb_cgroup.c
+++ b/mm/hugetlb_cgroup.c
@@ -262,12 +262,6 @@ static int __hugetlb_cgroup_charge_cgroup(int idx, unsigned long nr_pages,
 
 	if (hugetlb_cgroup_disabled())
 		goto done;
-	/*
-	 * We don't charge any cgroup if the compound page have less
-	 * than 3 pages.
-	 */
-	if (huge_page_order(&hstates[idx]) < HUGETLB_CGROUP_MIN_ORDER)
-		goto done;
 again:
 	rcu_read_lock();
 	h_cg = hugetlb_cgroup_from_task(current);
@@ -397,9 +391,6 @@ static void __hugetlb_cgroup_uncharge_cgroup(int idx, unsigned long nr_pages,
 	if (hugetlb_cgroup_disabled() || !h_cg)
 		return;
 
-	if (huge_page_order(&hstates[idx]) < HUGETLB_CGROUP_MIN_ORDER)
-		return;
-
 	page_counter_uncharge(__hugetlb_cgroup_counter_from_cgroup(h_cg, idx,
 								   rsvd),
 			      nr_pages);
@@ -869,15 +860,8 @@ void __init hugetlb_cgroup_file_init(void)
 {
 	struct hstate *h;
 
-	for_each_hstate(h) {
-		/*
-		 * Add cgroup control files only if the huge page consists
-		 * of more than two normal pages. This is because we use
-		 * page[2].private for storing cgroup details.
-		 */
-		if (huge_page_order(h) >= HUGETLB_CGROUP_MIN_ORDER)
-			__hugetlb_cgroup_file_init(hstate_index(h));
-	}
+	for_each_hstate(h)
+		__hugetlb_cgroup_file_init(hstate_index(h));
 }
 
 /*
-- 
cgit v1.2.3


From 4b569387c0d566db288e7c3e1b484b43df797bdb Mon Sep 17 00:00:00 2001
From: Nhat Pham <nphamcs@gmail.com>
Date: Fri, 6 Oct 2023 11:46:26 -0700
Subject: memcontrol: add helpers for hugetlb memcg accounting

Patch series "hugetlb memcg accounting", v4.

Currently, hugetlb memory usage is not acounted for in the memory
controller, which could lead to memory overprotection for cgroups with
hugetlb-backed memory.  This has been observed in our production system.

For instance, here is one of our usecases: suppose there are two 32G
containers.  The machine is booted with hugetlb_cma=6G, and each container
may or may not use up to 3 gigantic page, depending on the workload within
it.  The rest is anon, cache, slab, etc.  We can set the hugetlb cgroup
limit of each cgroup to 3G to enforce hugetlb fairness.  But it is very
difficult to configure memory.max to keep overall consumption, including
anon, cache, slab etcetera fair.

What we have had to resort to is to constantly poll hugetlb usage and
readjust memory.max.  Similar procedure is done to other memory limits
(memory.low for e.g).  However, this is rather cumbersome and buggy.
Furthermore, when there is a delay in memory limits correction, (for e.g
when hugetlb usage changes within consecutive runs of the userspace
agent), the system could be in an over/underprotected state.

This patch series rectifies this issue by charging the memcg when the
hugetlb folio is allocated, and uncharging when the folio is freed.  In
addition, a new selftest is added to demonstrate and verify this new
behavior.


This patch (of 4):

This patch exposes charge committing and cancelling as parts of the memory
controller interface.  These functionalities are useful when the
try_charge() and commit_charge() stages have to be separated by other
actions in between (which can fail).  One such example is the new hugetlb
accounting behavior in the following patch.

The patch also adds a helper function to obtain a reference to the
current task's memcg.

Link: https://lkml.kernel.org/r/20231006184629.155543-1-nphamcs@gmail.com
Link: https://lkml.kernel.org/r/20231006184629.155543-2-nphamcs@gmail.com
Signed-off-by: Nhat Pham <nphamcs@gmail.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Cc: Frank van der Linden <fvdl@google.com>
Cc: Mike Kravetz <mike.kravetz@oracle.com>
Cc: Muchun Song <muchun.song@linux.dev>
Cc: Rik van Riel <riel@surriel.com>
Cc: Roman Gushchin <roman.gushchin@linux.dev>
Cc: Shakeel Butt <shakeelb@google.com>
Cc: Shuah Khan <shuah@kernel.org>
Cc: Tejun heo <tj@kernel.org>
Cc: Yosry Ahmed <yosryahmed@google.com>
Cc: Zefan Li <lizefan.x@bytedance.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/memcontrol.h | 21 +++++++++++++++++
 mm/memcontrol.c            | 59 ++++++++++++++++++++++++++++++++++++----------
 2 files changed, 68 insertions(+), 12 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index c6029aeaa268..bea3d997b076 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -652,6 +652,8 @@ static inline bool mem_cgroup_below_min(struct mem_cgroup *target,
 		page_counter_read(&memcg->memory);
 }
 
+void mem_cgroup_commit_charge(struct folio *folio, struct mem_cgroup *memcg);
+
 int __mem_cgroup_charge(struct folio *folio, struct mm_struct *mm, gfp_t gfp);
 
 /**
@@ -703,6 +705,8 @@ static inline void mem_cgroup_uncharge_list(struct list_head *page_list)
 	__mem_cgroup_uncharge_list(page_list);
 }
 
+void mem_cgroup_cancel_charge(struct mem_cgroup *memcg, unsigned int nr_pages);
+
 void mem_cgroup_migrate(struct folio *old, struct folio *new);
 
 /**
@@ -759,6 +763,8 @@ struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p);
 
 struct mem_cgroup *get_mem_cgroup_from_mm(struct mm_struct *mm);
 
+struct mem_cgroup *get_mem_cgroup_from_current(void);
+
 struct lruvec *folio_lruvec_lock(struct folio *folio);
 struct lruvec *folio_lruvec_lock_irq(struct folio *folio);
 struct lruvec *folio_lruvec_lock_irqsave(struct folio *folio,
@@ -1239,6 +1245,11 @@ static inline bool mem_cgroup_below_min(struct mem_cgroup *target,
 	return false;
 }
 
+static inline void mem_cgroup_commit_charge(struct folio *folio,
+		struct mem_cgroup *memcg)
+{
+}
+
 static inline int mem_cgroup_charge(struct folio *folio,
 		struct mm_struct *mm, gfp_t gfp)
 {
@@ -1263,6 +1274,11 @@ static inline void mem_cgroup_uncharge_list(struct list_head *page_list)
 {
 }
 
+static inline void mem_cgroup_cancel_charge(struct mem_cgroup *memcg,
+		unsigned int nr_pages)
+{
+}
+
 static inline void mem_cgroup_migrate(struct folio *old, struct folio *new)
 {
 }
@@ -1300,6 +1316,11 @@ static inline struct mem_cgroup *get_mem_cgroup_from_mm(struct mm_struct *mm)
 	return NULL;
 }
 
+static inline struct mem_cgroup *get_mem_cgroup_from_current(void)
+{
+	return NULL;
+}
+
 static inline
 struct mem_cgroup *mem_cgroup_from_css(struct cgroup_subsys_state *css)
 {
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 8539f2037168..b3e40a59f732 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -1099,6 +1099,27 @@ static __always_inline bool memcg_kmem_bypass(void)
 	return false;
 }
 
+/**
+ * get_mem_cgroup_from_current - Obtain a reference on current task's memcg.
+ */
+struct mem_cgroup *get_mem_cgroup_from_current(void)
+{
+	struct mem_cgroup *memcg;
+
+	if (mem_cgroup_disabled())
+		return NULL;
+
+again:
+	rcu_read_lock();
+	memcg = mem_cgroup_from_task(current);
+	if (!css_tryget(&memcg->css)) {
+		rcu_read_unlock();
+		goto again;
+	}
+	rcu_read_unlock();
+	return memcg;
+}
+
 /**
  * mem_cgroup_iter - iterate over memory cgroup hierarchy
  * @root: hierarchy root
@@ -2873,7 +2894,12 @@ static inline int try_charge(struct mem_cgroup *memcg, gfp_t gfp_mask,
 	return try_charge_memcg(memcg, gfp_mask, nr_pages);
 }
 
-static inline void cancel_charge(struct mem_cgroup *memcg, unsigned int nr_pages)
+/**
+ * mem_cgroup_cancel_charge() - cancel an uncommitted try_charge() call.
+ * @memcg: memcg previously charged.
+ * @nr_pages: number of pages previously charged.
+ */
+void mem_cgroup_cancel_charge(struct mem_cgroup *memcg, unsigned int nr_pages)
 {
 	if (mem_cgroup_is_root(memcg))
 		return;
@@ -2898,6 +2924,22 @@ static void commit_charge(struct folio *folio, struct mem_cgroup *memcg)
 	folio->memcg_data = (unsigned long)memcg;
 }
 
+/**
+ * mem_cgroup_commit_charge - commit a previously successful try_charge().
+ * @folio: folio to commit the charge to.
+ * @memcg: memcg previously charged.
+ */
+void mem_cgroup_commit_charge(struct folio *folio, struct mem_cgroup *memcg)
+{
+	css_get(&memcg->css);
+	commit_charge(folio, memcg);
+
+	local_irq_disable();
+	mem_cgroup_charge_statistics(memcg, folio_nr_pages(folio));
+	memcg_check_events(memcg, folio_nid(folio));
+	local_irq_enable();
+}
+
 #ifdef CONFIG_MEMCG_KMEM
 /*
  * The allocated objcg pointers array is not accounted directly.
@@ -6116,7 +6158,7 @@ static void __mem_cgroup_clear_mc(void)
 
 	/* we must uncharge all the leftover precharges from mc.to */
 	if (mc.precharge) {
-		cancel_charge(mc.to, mc.precharge);
+		mem_cgroup_cancel_charge(mc.to, mc.precharge);
 		mc.precharge = 0;
 	}
 	/*
@@ -6124,7 +6166,7 @@ static void __mem_cgroup_clear_mc(void)
 	 * we must uncharge here.
 	 */
 	if (mc.moved_charge) {
-		cancel_charge(mc.from, mc.moved_charge);
+		mem_cgroup_cancel_charge(mc.from, mc.moved_charge);
 		mc.moved_charge = 0;
 	}
 	/* we must fixup refcnts and charges */
@@ -7031,20 +7073,13 @@ void mem_cgroup_calculate_protection(struct mem_cgroup *root,
 static int charge_memcg(struct folio *folio, struct mem_cgroup *memcg,
 			gfp_t gfp)
 {
-	long nr_pages = folio_nr_pages(folio);
 	int ret;
 
-	ret = try_charge(memcg, gfp, nr_pages);
+	ret = try_charge(memcg, gfp, folio_nr_pages(folio));
 	if (ret)
 		goto out;
 
-	css_get(&memcg->css);
-	commit_charge(folio, memcg);
-
-	local_irq_disable();
-	mem_cgroup_charge_statistics(memcg, nr_pages);
-	memcg_check_events(memcg, folio_nid(folio));
-	local_irq_enable();
+	mem_cgroup_commit_charge(folio, memcg);
 out:
 	return ret;
 }
-- 
cgit v1.2.3


From 85ce2c517ade0d51b7ad95f2e88be9bbe294379a Mon Sep 17 00:00:00 2001
From: Nhat Pham <nphamcs@gmail.com>
Date: Fri, 6 Oct 2023 11:46:27 -0700
Subject: memcontrol: only transfer the memcg data for migration

For most migration use cases, only transfer the memcg data from the old
folio to the new folio, and clear the old folio's memcg data.  No charging
and uncharging will be done.

This shaves off some work on the migration path, and avoids the temporary
double charging of a folio during its migration.

The only exception is replace_page_cache_folio(), which will use the old
mem_cgroup_migrate() (now renamed to mem_cgroup_replace_folio).  In that
context, the isolation of the old page isn't quite as thorough as with
migration, so we cannot use our new implementation directly.

This patch is the result of the following discussion on the new hugetlb
memcg accounting behavior:

https://lore.kernel.org/lkml/20231003171329.GB314430@monkey/

Link: https://lkml.kernel.org/r/20231006184629.155543-3-nphamcs@gmail.com
Signed-off-by: Nhat Pham <nphamcs@gmail.com>
Suggested-by: Johannes Weiner <hannes@cmpxchg.org>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Cc: Frank van der Linden <fvdl@google.com>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Mike Kravetz <mike.kravetz@oracle.com>
Cc: Muchun Song <muchun.song@linux.dev>
Cc: Rik van Riel <riel@surriel.com>
Cc: Roman Gushchin <roman.gushchin@linux.dev>
Cc: Shakeel Butt <shakeelb@google.com>
Cc: Shuah Khan <shuah@kernel.org>
Cc: Tejun heo <tj@kernel.org>
Cc: Yosry Ahmed <yosryahmed@google.com>
Cc: Zefan Li <lizefan.x@bytedance.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/memcontrol.h |  7 +++++++
 mm/filemap.c               |  2 +-
 mm/memcontrol.c            | 40 +++++++++++++++++++++++++++++++++++++---
 3 files changed, 45 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index bea3d997b076..a3fce570122e 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -707,6 +707,8 @@ static inline void mem_cgroup_uncharge_list(struct list_head *page_list)
 
 void mem_cgroup_cancel_charge(struct mem_cgroup *memcg, unsigned int nr_pages);
 
+void mem_cgroup_replace_folio(struct folio *old, struct folio *new);
+
 void mem_cgroup_migrate(struct folio *old, struct folio *new);
 
 /**
@@ -1279,6 +1281,11 @@ static inline void mem_cgroup_cancel_charge(struct mem_cgroup *memcg,
 {
 }
 
+static inline void mem_cgroup_replace_folio(struct folio *old,
+		struct folio *new)
+{
+}
+
 static inline void mem_cgroup_migrate(struct folio *old, struct folio *new)
 {
 }
diff --git a/mm/filemap.c b/mm/filemap.c
index b04ba896aac9..48cd16c54e86 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -816,7 +816,7 @@ void replace_page_cache_folio(struct folio *old, struct folio *new)
 	new->mapping = mapping;
 	new->index = offset;
 
-	mem_cgroup_migrate(old, new);
+	mem_cgroup_replace_folio(old, new);
 
 	xas_lock_irq(&xas);
 	xas_store(&xas, new);
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index b3e40a59f732..135d6637afe5 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -7292,16 +7292,17 @@ void __mem_cgroup_uncharge_list(struct list_head *page_list)
 }
 
 /**
- * mem_cgroup_migrate - Charge a folio's replacement.
+ * mem_cgroup_replace_folio - Charge a folio's replacement.
  * @old: Currently circulating folio.
  * @new: Replacement folio.
  *
  * Charge @new as a replacement folio for @old. @old will
- * be uncharged upon free.
+ * be uncharged upon free. This is only used by the page cache
+ * (in replace_page_cache_folio()).
  *
  * Both folios must be locked, @new->mapping must be set up.
  */
-void mem_cgroup_migrate(struct folio *old, struct folio *new)
+void mem_cgroup_replace_folio(struct folio *old, struct folio *new)
 {
 	struct mem_cgroup *memcg;
 	long nr_pages = folio_nr_pages(new);
@@ -7340,6 +7341,39 @@ void mem_cgroup_migrate(struct folio *old, struct folio *new)
 	local_irq_restore(flags);
 }
 
+/**
+ * mem_cgroup_migrate - Transfer the memcg data from the old to the new folio.
+ * @old: Currently circulating folio.
+ * @new: Replacement folio.
+ *
+ * Transfer the memcg data from the old folio to the new folio for migration.
+ * The old folio's data info will be cleared. Note that the memory counters
+ * will remain unchanged throughout the process.
+ *
+ * Both folios must be locked, @new->mapping must be set up.
+ */
+void mem_cgroup_migrate(struct folio *old, struct folio *new)
+{
+	struct mem_cgroup *memcg;
+
+	VM_BUG_ON_FOLIO(!folio_test_locked(old), old);
+	VM_BUG_ON_FOLIO(!folio_test_locked(new), new);
+	VM_BUG_ON_FOLIO(folio_test_anon(old) != folio_test_anon(new), new);
+	VM_BUG_ON_FOLIO(folio_nr_pages(old) != folio_nr_pages(new), new);
+
+	if (mem_cgroup_disabled())
+		return;
+
+	memcg = folio_memcg(old);
+	VM_WARN_ON_ONCE_FOLIO(!memcg, old);
+	if (!memcg)
+		return;
+
+	/* Transfer the charge and the css ref */
+	commit_charge(new, memcg);
+	old->memcg_data = 0;
+}
+
 DEFINE_STATIC_KEY_FALSE(memcg_sockets_enabled_key);
 EXPORT_SYMBOL(memcg_sockets_enabled_key);
 
-- 
cgit v1.2.3


From 8cba9576df601c384abd334a503c3f6e1e29eefb Mon Sep 17 00:00:00 2001
From: Nhat Pham <nphamcs@gmail.com>
Date: Fri, 6 Oct 2023 11:46:28 -0700
Subject: hugetlb: memcg: account hugetlb-backed memory in memory controller

Currently, hugetlb memory usage is not acounted for in the memory
controller, which could lead to memory overprotection for cgroups with
hugetlb-backed memory.  This has been observed in our production system.

For instance, here is one of our usecases: suppose there are two 32G
containers.  The machine is booted with hugetlb_cma=6G, and each container
may or may not use up to 3 gigantic page, depending on the workload within
it.  The rest is anon, cache, slab, etc.  We can set the hugetlb cgroup
limit of each cgroup to 3G to enforce hugetlb fairness.  But it is very
difficult to configure memory.max to keep overall consumption, including
anon, cache, slab etc.  fair.

What we have had to resort to is to constantly poll hugetlb usage and
readjust memory.max.  Similar procedure is done to other memory limits
(memory.low for e.g).  However, this is rather cumbersome and buggy.
Furthermore, when there is a delay in memory limits correction, (for e.g
when hugetlb usage changes within consecutive runs of the userspace
agent), the system could be in an over/underprotected state.

This patch rectifies this issue by charging the memcg when the hugetlb
folio is utilized, and uncharging when the folio is freed (analogous to
the hugetlb controller).  Note that we do not charge when the folio is
allocated to the hugetlb pool, because at this point it is not owned by
any memcg.

Some caveats to consider:
  * This feature is only available on cgroup v2.
  * There is no hugetlb pool management involved in the memory
    controller. As stated above, hugetlb folios are only charged towards
    the memory controller when it is used. Host overcommit management
    has to consider it when configuring hard limits.
  * Failure to charge towards the memcg results in SIGBUS. This could
    happen even if the hugetlb pool still has pages (but the cgroup
    limit is hit and reclaim attempt fails).
  * When this feature is enabled, hugetlb pages contribute to memory
    reclaim protection. low, min limits tuning must take into account
    hugetlb memory.
  * Hugetlb pages utilized while this option is not selected will not
    be tracked by the memory controller (even if cgroup v2 is remounted
    later on).

Link: https://lkml.kernel.org/r/20231006184629.155543-4-nphamcs@gmail.com
Signed-off-by: Nhat Pham <nphamcs@gmail.com>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Cc: Frank van der Linden <fvdl@google.com>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Mike Kravetz <mike.kravetz@oracle.com>
Cc: Muchun Song <muchun.song@linux.dev>
Cc: Rik van Riel <riel@surriel.com>
Cc: Roman Gushchin <roman.gushchin@linux.dev>
Cc: Shakeel Butt <shakeelb@google.com>
Cc: Shuah Khan <shuah@kernel.org>
Cc: Tejun heo <tj@kernel.org>
Cc: Yosry Ahmed <yosryahmed@google.com>
Cc: Zefan Li <lizefan.x@bytedance.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 Documentation/admin-guide/cgroup-v2.rst | 29 +++++++++++++++++++++++
 include/linux/cgroup-defs.h             |  5 ++++
 include/linux/memcontrol.h              |  9 +++++++
 kernel/cgroup/cgroup.c                  | 15 +++++++++++-
 mm/hugetlb.c                            | 35 +++++++++++++++++++++------
 mm/memcontrol.c                         | 42 ++++++++++++++++++++++++++++++++-
 mm/migrate.c                            |  3 +--
 7 files changed, 127 insertions(+), 11 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/admin-guide/cgroup-v2.rst b/Documentation/admin-guide/cgroup-v2.rst
index 622a7f28db1f..606b2e0eac4b 100644
--- a/Documentation/admin-guide/cgroup-v2.rst
+++ b/Documentation/admin-guide/cgroup-v2.rst
@@ -210,6 +210,35 @@ cgroup v2 currently supports the following mount options.
         relying on the original semantics (e.g. specifying bogusly
         high 'bypass' protection values at higher tree levels).
 
+  memory_hugetlb_accounting
+        Count HugeTLB memory usage towards the cgroup's overall
+        memory usage for the memory controller (for the purpose of
+        statistics reporting and memory protetion). This is a new
+        behavior that could regress existing setups, so it must be
+        explicitly opted in with this mount option.
+
+        A few caveats to keep in mind:
+
+        * There is no HugeTLB pool management involved in the memory
+          controller. The pre-allocated pool does not belong to anyone.
+          Specifically, when a new HugeTLB folio is allocated to
+          the pool, it is not accounted for from the perspective of the
+          memory controller. It is only charged to a cgroup when it is
+          actually used (for e.g at page fault time). Host memory
+          overcommit management has to consider this when configuring
+          hard limits. In general, HugeTLB pool management should be
+          done via other mechanisms (such as the HugeTLB controller).
+        * Failure to charge a HugeTLB folio to the memory controller
+          results in SIGBUS. This could happen even if the HugeTLB pool
+          still has pages available (but the cgroup limit is hit and
+          reclaim attempt fails).
+        * Charging HugeTLB memory towards the memory controller affects
+          memory protection and reclaim dynamics. Any userspace tuning
+          (of low, min limits for e.g) needs to take this into account.
+        * HugeTLB pages utilized while this option is not selected
+          will not be tracked by the memory controller (even if cgroup
+          v2 is remounted later on).
+
 
 Organizing Processes and Threads
 --------------------------------
diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h
index f1b3151ac30b..8641f4320c98 100644
--- a/include/linux/cgroup-defs.h
+++ b/include/linux/cgroup-defs.h
@@ -115,6 +115,11 @@ enum {
 	 * Enable recursive subtree protection
 	 */
 	CGRP_ROOT_MEMORY_RECURSIVE_PROT = (1 << 18),
+
+	/*
+	 * Enable hugetlb accounting for the memory controller.
+	 */
+	 CGRP_ROOT_MEMORY_HUGETLB_ACCOUNTING = (1 << 19),
 };
 
 /* cftype->flags */
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index a3fce570122e..6674c12725d5 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -678,6 +678,9 @@ static inline int mem_cgroup_charge(struct folio *folio, struct mm_struct *mm,
 	return __mem_cgroup_charge(folio, mm, gfp);
 }
 
+int mem_cgroup_hugetlb_try_charge(struct mem_cgroup *memcg, gfp_t gfp,
+		long nr_pages);
+
 int mem_cgroup_swapin_charge_folio(struct folio *folio, struct mm_struct *mm,
 				  gfp_t gfp, swp_entry_t entry);
 void mem_cgroup_swapin_uncharge_swap(swp_entry_t entry);
@@ -1258,6 +1261,12 @@ static inline int mem_cgroup_charge(struct folio *folio,
 	return 0;
 }
 
+static inline int mem_cgroup_hugetlb_try_charge(struct mem_cgroup *memcg,
+		gfp_t gfp, long nr_pages)
+{
+	return 0;
+}
+
 static inline int mem_cgroup_swapin_charge_folio(struct folio *folio,
 			struct mm_struct *mm, gfp_t gfp, swp_entry_t entry)
 {
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index 1fb7f562289d..f11488b18ceb 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -1902,6 +1902,7 @@ enum cgroup2_param {
 	Opt_favordynmods,
 	Opt_memory_localevents,
 	Opt_memory_recursiveprot,
+	Opt_memory_hugetlb_accounting,
 	nr__cgroup2_params
 };
 
@@ -1910,6 +1911,7 @@ static const struct fs_parameter_spec cgroup2_fs_parameters[] = {
 	fsparam_flag("favordynmods",		Opt_favordynmods),
 	fsparam_flag("memory_localevents",	Opt_memory_localevents),
 	fsparam_flag("memory_recursiveprot",	Opt_memory_recursiveprot),
+	fsparam_flag("memory_hugetlb_accounting", Opt_memory_hugetlb_accounting),
 	{}
 };
 
@@ -1936,6 +1938,9 @@ static int cgroup2_parse_param(struct fs_context *fc, struct fs_parameter *param
 	case Opt_memory_recursiveprot:
 		ctx->flags |= CGRP_ROOT_MEMORY_RECURSIVE_PROT;
 		return 0;
+	case Opt_memory_hugetlb_accounting:
+		ctx->flags |= CGRP_ROOT_MEMORY_HUGETLB_ACCOUNTING;
+		return 0;
 	}
 	return -EINVAL;
 }
@@ -1960,6 +1965,11 @@ static void apply_cgroup_root_flags(unsigned int root_flags)
 			cgrp_dfl_root.flags |= CGRP_ROOT_MEMORY_RECURSIVE_PROT;
 		else
 			cgrp_dfl_root.flags &= ~CGRP_ROOT_MEMORY_RECURSIVE_PROT;
+
+		if (root_flags & CGRP_ROOT_MEMORY_HUGETLB_ACCOUNTING)
+			cgrp_dfl_root.flags |= CGRP_ROOT_MEMORY_HUGETLB_ACCOUNTING;
+		else
+			cgrp_dfl_root.flags &= ~CGRP_ROOT_MEMORY_HUGETLB_ACCOUNTING;
 	}
 }
 
@@ -1973,6 +1983,8 @@ static int cgroup_show_options(struct seq_file *seq, struct kernfs_root *kf_root
 		seq_puts(seq, ",memory_localevents");
 	if (cgrp_dfl_root.flags & CGRP_ROOT_MEMORY_RECURSIVE_PROT)
 		seq_puts(seq, ",memory_recursiveprot");
+	if (cgrp_dfl_root.flags & CGRP_ROOT_MEMORY_HUGETLB_ACCOUNTING)
+		seq_puts(seq, ",memory_hugetlb_accounting");
 	return 0;
 }
 
@@ -7050,7 +7062,8 @@ static ssize_t features_show(struct kobject *kobj, struct kobj_attribute *attr,
 			"nsdelegate\n"
 			"favordynmods\n"
 			"memory_localevents\n"
-			"memory_recursiveprot\n");
+			"memory_recursiveprot\n"
+			"memory_hugetlb_accounting\n");
 }
 static struct kobj_attribute cgroup_features_attr = __ATTR_RO(features);
 
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index e2b1c417b90a..da6f85b7db88 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -1927,6 +1927,7 @@ void free_huge_folio(struct folio *folio)
 				     pages_per_huge_page(h), folio);
 	hugetlb_cgroup_uncharge_folio_rsvd(hstate_index(h),
 					  pages_per_huge_page(h), folio);
+	mem_cgroup_uncharge(folio);
 	if (restore_reserve)
 		h->resv_huge_pages++;
 
@@ -3026,11 +3027,20 @@ struct folio *alloc_hugetlb_folio(struct vm_area_struct *vma,
 	struct hugepage_subpool *spool = subpool_vma(vma);
 	struct hstate *h = hstate_vma(vma);
 	struct folio *folio;
-	long map_chg, map_commit;
+	long map_chg, map_commit, nr_pages = pages_per_huge_page(h);
 	long gbl_chg;
-	int ret, idx;
+	int memcg_charge_ret, ret, idx;
 	struct hugetlb_cgroup *h_cg = NULL;
+	struct mem_cgroup *memcg;
 	bool deferred_reserve;
+	gfp_t gfp = htlb_alloc_mask(h) | __GFP_RETRY_MAYFAIL;
+
+	memcg = get_mem_cgroup_from_current();
+	memcg_charge_ret = mem_cgroup_hugetlb_try_charge(memcg, gfp, nr_pages);
+	if (memcg_charge_ret == -ENOMEM) {
+		mem_cgroup_put(memcg);
+		return ERR_PTR(-ENOMEM);
+	}
 
 	idx = hstate_index(h);
 	/*
@@ -3039,8 +3049,12 @@ struct folio *alloc_hugetlb_folio(struct vm_area_struct *vma,
 	 * code of zero indicates a reservation exists (no change).
 	 */
 	map_chg = gbl_chg = vma_needs_reservation(h, vma, addr);
-	if (map_chg < 0)
+	if (map_chg < 0) {
+		if (!memcg_charge_ret)
+			mem_cgroup_cancel_charge(memcg, nr_pages);
+		mem_cgroup_put(memcg);
 		return ERR_PTR(-ENOMEM);
+	}
 
 	/*
 	 * Processes that did not create the mapping will have no
@@ -3051,10 +3065,8 @@ struct folio *alloc_hugetlb_folio(struct vm_area_struct *vma,
 	 */
 	if (map_chg || avoid_reserve) {
 		gbl_chg = hugepage_subpool_get_pages(spool, 1);
-		if (gbl_chg < 0) {
-			vma_end_reservation(h, vma, addr);
-			return ERR_PTR(-ENOSPC);
-		}
+		if (gbl_chg < 0)
+			goto out_end_reservation;
 
 		/*
 		 * Even though there was no reservation in the region/reserve
@@ -3136,6 +3148,11 @@ struct folio *alloc_hugetlb_folio(struct vm_area_struct *vma,
 			hugetlb_cgroup_uncharge_folio_rsvd(hstate_index(h),
 					pages_per_huge_page(h), folio);
 	}
+
+	if (!memcg_charge_ret)
+		mem_cgroup_commit_charge(folio, memcg);
+	mem_cgroup_put(memcg);
+
 	return folio;
 
 out_uncharge_cgroup:
@@ -3147,7 +3164,11 @@ out_uncharge_cgroup_reservation:
 out_subpool_put:
 	if (map_chg || avoid_reserve)
 		hugepage_subpool_put_pages(spool, 1);
+out_end_reservation:
 	vma_end_reservation(h, vma, addr);
+	if (!memcg_charge_ret)
+		mem_cgroup_cancel_charge(memcg, nr_pages);
+	mem_cgroup_put(memcg);
 	return ERR_PTR(-ENOSPC);
 }
 
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 135d6637afe5..a86e7b445800 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -7096,6 +7096,41 @@ int __mem_cgroup_charge(struct folio *folio, struct mm_struct *mm, gfp_t gfp)
 	return ret;
 }
 
+/**
+ * mem_cgroup_hugetlb_try_charge - try to charge the memcg for a hugetlb folio
+ * @memcg: memcg to charge.
+ * @gfp: reclaim mode.
+ * @nr_pages: number of pages to charge.
+ *
+ * This function is called when allocating a huge page folio to determine if
+ * the memcg has the capacity for it. It does not commit the charge yet,
+ * as the hugetlb folio itself has not been obtained from the hugetlb pool.
+ *
+ * Once we have obtained the hugetlb folio, we can call
+ * mem_cgroup_commit_charge() to commit the charge. If we fail to obtain the
+ * folio, we should instead call mem_cgroup_cancel_charge() to undo the effect
+ * of try_charge().
+ *
+ * Returns 0 on success. Otherwise, an error code is returned.
+ */
+int mem_cgroup_hugetlb_try_charge(struct mem_cgroup *memcg, gfp_t gfp,
+			long nr_pages)
+{
+	/*
+	 * If hugetlb memcg charging is not enabled, do not fail hugetlb allocation,
+	 * but do not attempt to commit charge later (or cancel on error) either.
+	 */
+	if (mem_cgroup_disabled() || !memcg ||
+		!cgroup_subsys_on_dfl(memory_cgrp_subsys) ||
+		!(cgrp_dfl_root.flags & CGRP_ROOT_MEMORY_HUGETLB_ACCOUNTING))
+		return -EOPNOTSUPP;
+
+	if (try_charge(memcg, gfp, nr_pages))
+		return -ENOMEM;
+
+	return 0;
+}
+
 /**
  * mem_cgroup_swapin_charge_folio - Charge a newly allocated folio for swapin.
  * @folio: folio to charge.
@@ -7365,7 +7400,12 @@ void mem_cgroup_migrate(struct folio *old, struct folio *new)
 		return;
 
 	memcg = folio_memcg(old);
-	VM_WARN_ON_ONCE_FOLIO(!memcg, old);
+	/*
+	 * Note that it is normal to see !memcg for a hugetlb folio.
+	 * For e.g, itt could have been allocated when memory_hugetlb_accounting
+	 * was not selected.
+	 */
+	VM_WARN_ON_ONCE_FOLIO(!folio_test_hugetlb(old) && !memcg, old);
 	if (!memcg)
 		return;
 
diff --git a/mm/migrate.c b/mm/migrate.c
index 9d9eee5322d1..c602bf6dec97 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -633,8 +633,7 @@ void folio_migrate_flags(struct folio *newfolio, struct folio *folio)
 
 	folio_copy_owner(newfolio, folio);
 
-	if (!folio_test_hugetlb(folio))
-		mem_cgroup_migrate(folio, newfolio);
+	mem_cgroup_migrate(folio, newfolio);
 }
 EXPORT_SYMBOL(folio_migrate_flags);
 
-- 
cgit v1.2.3


From f04eba134e598d4a5af2eeecff0562df5042cbfc Mon Sep 17 00:00:00 2001
From: Lucy Mielke <lucymielke@icloud.com>
Date: Fri, 6 Oct 2023 22:30:51 +0200
Subject: mm: add printf attribute to shrinker_debugfs_name_alloc

This fixes a compiler warning when compiling an allyesconfig with W=1:

mm/internal.h:1235:9: error: function might be a candidate for `gnu_printf'
format attribute [-Werror=suggest-attribute=format]

[akpm@linux-foundation.org: fix shrinker_alloc() as welll per Qi Zheng]
  Link: https://lkml.kernel.org/r/822387b7-4895-4e64-5806-0f56b5d6c447@bytedance.com
Link: https://lkml.kernel.org/r/ZSBue-3kM6gI6jCr@mainframe
Fixes: c42d50aefd17 ("mm: shrinker: add infrastructure for dynamically allocating shrinker")
Signed-off-by: Lucy Mielke <lucymielke@icloud.com>
Cc: Qi Zheng <zhengqi.arch@bytedance.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/shrinker.h | 1 +
 mm/internal.h            | 4 ++--
 2 files changed, 3 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/shrinker.h b/include/linux/shrinker.h
index e4f93120e0ab..1a00be90d93a 100644
--- a/include/linux/shrinker.h
+++ b/include/linux/shrinker.h
@@ -131,6 +131,7 @@ struct shrinker {
  */
 #define SHRINKER_NONSLAB	BIT(4)
 
+__printf(2, 3)
 struct shrinker *shrinker_alloc(unsigned int flags, const char *fmt, ...);
 void shrinker_register(struct shrinker *shrinker);
 void shrinker_free(struct shrinker *shrinker);
diff --git a/mm/internal.h b/mm/internal.h
index 2b79da9deb64..d1f7a3c612fe 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -1219,8 +1219,8 @@ unsigned long shrink_slab(gfp_t gfp_mask, int nid, struct mem_cgroup *memcg,
 			  int priority);
 
 #ifdef CONFIG_SHRINKER_DEBUG
-static inline int shrinker_debugfs_name_alloc(struct shrinker *shrinker,
-					      const char *fmt, va_list ap)
+static inline __printf(2, 0) int shrinker_debugfs_name_alloc(
+			struct shrinker *shrinker, const char *fmt, va_list ap)
 {
 	shrinker->name = kvasprintf_const(GFP_KERNEL, fmt, ap);
 
-- 
cgit v1.2.3


From 37acade0ce8938f00d6979bd02b8043b5b7089ae Mon Sep 17 00:00:00 2001
From: "Matthew Wilcox (Oracle)" <willy@infradead.org>
Date: Tue, 10 Oct 2023 04:58:29 +0100
Subject: sched: remove wait bookmarks

There are no users of wait bookmarks left, so simplify the wait
code by removing them.

Link: https://lkml.kernel.org/r/20231010035829.544242-2-willy@infradead.org
Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Acked-by: Ingo Molnar <mingo@kernel.org>
Cc: Benjamin Segall <bsegall@google.com>
Cc: Bin Lai <sclaibin@gmail.com>
Cc: Daniel Bristot de Oliveira <bristot@redhat.com>
Cc: Dietmar Eggemann <dietmar.eggemann@arm.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Juri Lelli <juri.lelli@redhat.com>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Steven Rostedt (Google) <rostedt@goodmis.org>
Cc: Valentin Schneider <vschneid@redhat.com>
Cc: Vincent Guittot <vincent.guittot@linaro.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/wait.h |  9 +++-----
 kernel/sched/wait.c  | 60 +++++++++-------------------------------------------
 2 files changed, 13 insertions(+), 56 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/wait.h b/include/linux/wait.h
index 5ec7739400f4..3473b663176f 100644
--- a/include/linux/wait.h
+++ b/include/linux/wait.h
@@ -19,10 +19,9 @@ int default_wake_function(struct wait_queue_entry *wq_entry, unsigned mode, int
 /* wait_queue_entry::flags */
 #define WQ_FLAG_EXCLUSIVE	0x01
 #define WQ_FLAG_WOKEN		0x02
-#define WQ_FLAG_BOOKMARK	0x04
-#define WQ_FLAG_CUSTOM		0x08
-#define WQ_FLAG_DONE		0x10
-#define WQ_FLAG_PRIORITY	0x20
+#define WQ_FLAG_CUSTOM		0x04
+#define WQ_FLAG_DONE		0x08
+#define WQ_FLAG_PRIORITY	0x10
 
 /*
  * A single wait-queue entry structure:
@@ -212,8 +211,6 @@ __remove_wait_queue(struct wait_queue_head *wq_head, struct wait_queue_entry *wq
 int __wake_up(struct wait_queue_head *wq_head, unsigned int mode, int nr, void *key);
 void __wake_up_on_current_cpu(struct wait_queue_head *wq_head, unsigned int mode, void *key);
 void __wake_up_locked_key(struct wait_queue_head *wq_head, unsigned int mode, void *key);
-void __wake_up_locked_key_bookmark(struct wait_queue_head *wq_head,
-		unsigned int mode, void *key, wait_queue_entry_t *bookmark);
 void __wake_up_sync_key(struct wait_queue_head *wq_head, unsigned int mode, void *key);
 void __wake_up_locked_sync_key(struct wait_queue_head *wq_head, unsigned int mode, void *key);
 void __wake_up_locked(struct wait_queue_head *wq_head, unsigned int mode, int nr);
diff --git a/kernel/sched/wait.c b/kernel/sched/wait.c
index 802d98cf2de3..51e38f5f4701 100644
--- a/kernel/sched/wait.c
+++ b/kernel/sched/wait.c
@@ -57,13 +57,6 @@ void remove_wait_queue(struct wait_queue_head *wq_head, struct wait_queue_entry
 }
 EXPORT_SYMBOL(remove_wait_queue);
 
-/*
- * Scan threshold to break wait queue walk.
- * This allows a waker to take a break from holding the
- * wait queue lock during the wait queue walk.
- */
-#define WAITQUEUE_WALK_BREAK_CNT 64
-
 /*
  * The core wakeup function. Non-exclusive wakeups (nr_exclusive == 0) just
  * wake everything up. If it's an exclusive wakeup (nr_exclusive == small +ve
@@ -78,21 +71,13 @@ EXPORT_SYMBOL(remove_wait_queue);
  * zero in this (rare) case, and we handle it by continuing to scan the queue.
  */
 static int __wake_up_common(struct wait_queue_head *wq_head, unsigned int mode,
-			int nr_exclusive, int wake_flags, void *key,
-			wait_queue_entry_t *bookmark)
+			int nr_exclusive, int wake_flags, void *key)
 {
 	wait_queue_entry_t *curr, *next;
-	int cnt = 0;
 
 	lockdep_assert_held(&wq_head->lock);
 
-	if (bookmark && (bookmark->flags & WQ_FLAG_BOOKMARK)) {
-		curr = list_next_entry(bookmark, entry);
-
-		list_del(&bookmark->entry);
-		bookmark->flags = 0;
-	} else
-		curr = list_first_entry(&wq_head->head, wait_queue_entry_t, entry);
+	curr = list_first_entry(&wq_head->head, wait_queue_entry_t, entry);
 
 	if (&curr->entry == &wq_head->head)
 		return nr_exclusive;
@@ -101,21 +86,11 @@ static int __wake_up_common(struct wait_queue_head *wq_head, unsigned int mode,
 		unsigned flags = curr->flags;
 		int ret;
 
-		if (flags & WQ_FLAG_BOOKMARK)
-			continue;
-
 		ret = curr->func(curr, mode, wake_flags, key);
 		if (ret < 0)
 			break;
 		if (ret && (flags & WQ_FLAG_EXCLUSIVE) && !--nr_exclusive)
 			break;
-
-		if (bookmark && (++cnt > WAITQUEUE_WALK_BREAK_CNT) &&
-				(&next->entry != &wq_head->head)) {
-			bookmark->flags = WQ_FLAG_BOOKMARK;
-			list_add_tail(&bookmark->entry, &next->entry);
-			break;
-		}
 	}
 
 	return nr_exclusive;
@@ -125,20 +100,12 @@ static int __wake_up_common_lock(struct wait_queue_head *wq_head, unsigned int m
 			int nr_exclusive, int wake_flags, void *key)
 {
 	unsigned long flags;
-	wait_queue_entry_t bookmark;
-	int remaining = nr_exclusive;
+	int remaining;
 
-	bookmark.flags = 0;
-	bookmark.private = NULL;
-	bookmark.func = NULL;
-	INIT_LIST_HEAD(&bookmark.entry);
-
-	do {
-		spin_lock_irqsave(&wq_head->lock, flags);
-		remaining = __wake_up_common(wq_head, mode, remaining,
-						wake_flags, key, &bookmark);
-		spin_unlock_irqrestore(&wq_head->lock, flags);
-	} while (bookmark.flags & WQ_FLAG_BOOKMARK);
+	spin_lock_irqsave(&wq_head->lock, flags);
+	remaining = __wake_up_common(wq_head, mode, nr_exclusive, wake_flags,
+			key);
+	spin_unlock_irqrestore(&wq_head->lock, flags);
 
 	return nr_exclusive - remaining;
 }
@@ -171,23 +138,16 @@ void __wake_up_on_current_cpu(struct wait_queue_head *wq_head, unsigned int mode
  */
 void __wake_up_locked(struct wait_queue_head *wq_head, unsigned int mode, int nr)
 {
-	__wake_up_common(wq_head, mode, nr, 0, NULL, NULL);
+	__wake_up_common(wq_head, mode, nr, 0, NULL);
 }
 EXPORT_SYMBOL_GPL(__wake_up_locked);
 
 void __wake_up_locked_key(struct wait_queue_head *wq_head, unsigned int mode, void *key)
 {
-	__wake_up_common(wq_head, mode, 1, 0, key, NULL);
+	__wake_up_common(wq_head, mode, 1, 0, key);
 }
 EXPORT_SYMBOL_GPL(__wake_up_locked_key);
 
-void __wake_up_locked_key_bookmark(struct wait_queue_head *wq_head,
-		unsigned int mode, void *key, wait_queue_entry_t *bookmark)
-{
-	__wake_up_common(wq_head, mode, 1, 0, key, bookmark);
-}
-EXPORT_SYMBOL_GPL(__wake_up_locked_key_bookmark);
-
 /**
  * __wake_up_sync_key - wake up threads blocked on a waitqueue.
  * @wq_head: the waitqueue
@@ -233,7 +193,7 @@ EXPORT_SYMBOL_GPL(__wake_up_sync_key);
 void __wake_up_locked_sync_key(struct wait_queue_head *wq_head,
 			       unsigned int mode, void *key)
 {
-        __wake_up_common(wq_head, mode, 1, WF_SYNC, key, NULL);
+        __wake_up_common(wq_head, mode, 1, WF_SYNC, key);
 }
 EXPORT_SYMBOL_GPL(__wake_up_locked_sync_key);
 
-- 
cgit v1.2.3


From 3657fdc2451abf135c2d20949acf57d78cc50338 Mon Sep 17 00:00:00 2001
From: Lorenzo Stoakes <lstoakes@gmail.com>
Date: Wed, 11 Oct 2023 18:04:27 +0100
Subject: mm: move vma_policy() and anon_vma_name() decls to mm_types.h

Patch series "Abstract vma_merge() and split_vma()", v4.

The vma_merge() interface is very confusing and its implementation has led
to numerous bugs as a result of that confusion.

In addition there is duplication both in invocation of vma_merge(), but
also in the common mprotect()-style pattern of attempting a merge, then if
this fails, splitting the portion of a VMA about to have its attributes
changed.

This pattern has been copy/pasted around the kernel in each instance where
such an operation has been required, each very slightly modified from the
last to make it even harder to decipher what is going on.

Simplify the whole thing by dividing the actual uses of vma_merge() and
split_vma() into specific and abstracted functions and de-duplicate the
vma_merge()/split_vma() pattern altogether.

Doing so also opens the door to changing how vma_merge() is implemented -
by knowing precisely what cases a caller is invoking rather than having a
central interface where anything might happen we can untangle the brittle
and confusing vma_merge() implementation into something more workable.

For mprotect()-like cases we introduce vma_modify() which performs the
vma_merge()/split_vma() pattern, returning a pointer to either the merged
or split VMA or an ERR_PTR(err) if the splits fail.

We provide a number of inline helper functions to make things even clearer:-

* vma_modify_flags()      - Prepare to modify the VMA's flags.
* vma_modify_flags_name() - Prepare to modify the VMA's flags/anon_vma_name
* vma_modify_policy()     - Prepare to modify the VMA's mempolicy.
* vma_modify_flags_uffd() - Prepare to modify the VMA's flags/uffd context.

For cases where a new VMA is attempted to be merged with adjacent VMAs we
add:-

* vma_merge_new_vma() - Prepare to merge a new VMA.
* vma_merge_extend()  - Prepare to extend the end of a new VMA.


This patch (of 5):

The vma_policy() define is a helper specifically for a VMA field so it
makes sense to host it in the memory management types header.

The anon_vma_name(), anon_vma_name_alloc() and anon_vma_name_free()
functions are a little out of place in mm_inline.h as they define external
functions, and so it makes sense to locate them in mm_types.h.

The purpose of these relocations is to make it possible to abstract static
inline wrappers which invoke both of these helpers.

Link: https://lkml.kernel.org/r/cover.1697043508.git.lstoakes@gmail.com
Link: https://lkml.kernel.org/r/24bfc6c9e382fffbcb0ea8d424392c27d56cc8ca.1697043508.git.lstoakes@gmail.com
Signed-off-by: Lorenzo Stoakes <lstoakes@gmail.com>
Reviewed-by: Vlastimil Babka <vbabka@suse.cz>
Cc: Alexander Viro <viro@zeniv.linux.org.uk>
Cc: Christian Brauner <brauner@kernel.org>
Cc: Liam R. Howlett <Liam.Howlett@oracle.com>
Cc: Lorenzo Stoakes <lstoakes@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/mempolicy.h |  4 ----
 include/linux/mm_inline.h | 20 +-------------------
 include/linux/mm_types.h  | 27 +++++++++++++++++++++++++++
 3 files changed, 28 insertions(+), 23 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mempolicy.h b/include/linux/mempolicy.h
index 6c2754d7bfed..1d7f4ec2614c 100644
--- a/include/linux/mempolicy.h
+++ b/include/linux/mempolicy.h
@@ -89,8 +89,6 @@ static inline struct mempolicy *mpol_dup(struct mempolicy *pol)
 	return pol;
 }
 
-#define vma_policy(vma) ((vma)->vm_policy)
-
 static inline void mpol_get(struct mempolicy *pol)
 {
 	if (pol)
@@ -222,8 +220,6 @@ mpol_shared_policy_lookup(struct shared_policy *sp, unsigned long idx)
 	return NULL;
 }
 
-#define vma_policy(vma) NULL
-
 static inline int
 vma_dup_policy(struct vm_area_struct *src, struct vm_area_struct *dst)
 {
diff --git a/include/linux/mm_inline.h b/include/linux/mm_inline.h
index 8148b30a9df1..9ae7def16cb2 100644
--- a/include/linux/mm_inline.h
+++ b/include/linux/mm_inline.h
@@ -4,6 +4,7 @@
 
 #include <linux/atomic.h>
 #include <linux/huge_mm.h>
+#include <linux/mm_types.h>
 #include <linux/swap.h>
 #include <linux/string.h>
 #include <linux/userfaultfd_k.h>
@@ -352,15 +353,6 @@ void lruvec_del_folio(struct lruvec *lruvec, struct folio *folio)
 }
 
 #ifdef CONFIG_ANON_VMA_NAME
-/*
- * mmap_lock should be read-locked when calling anon_vma_name(). Caller should
- * either keep holding the lock while using the returned pointer or it should
- * raise anon_vma_name refcount before releasing the lock.
- */
-extern struct anon_vma_name *anon_vma_name(struct vm_area_struct *vma);
-extern struct anon_vma_name *anon_vma_name_alloc(const char *name);
-extern void anon_vma_name_free(struct kref *kref);
-
 /* mmap_lock should be read-locked */
 static inline void anon_vma_name_get(struct anon_vma_name *anon_name)
 {
@@ -415,16 +407,6 @@ static inline bool anon_vma_name_eq(struct anon_vma_name *anon_name1,
 }
 
 #else /* CONFIG_ANON_VMA_NAME */
-static inline struct anon_vma_name *anon_vma_name(struct vm_area_struct *vma)
-{
-	return NULL;
-}
-
-static inline struct anon_vma_name *anon_vma_name_alloc(const char *name)
-{
-	return NULL;
-}
-
 static inline void anon_vma_name_get(struct anon_vma_name *anon_name) {}
 static inline void anon_vma_name_put(struct anon_vma_name *anon_name) {}
 static inline void dup_anon_vma_name(struct vm_area_struct *orig_vma,
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 36c5b43999e6..21eb56145f57 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -546,6 +546,27 @@ struct anon_vma_name {
 	char name[];
 };
 
+#ifdef CONFIG_ANON_VMA_NAME
+/*
+ * mmap_lock should be read-locked when calling anon_vma_name(). Caller should
+ * either keep holding the lock while using the returned pointer or it should
+ * raise anon_vma_name refcount before releasing the lock.
+ */
+struct anon_vma_name *anon_vma_name(struct vm_area_struct *vma);
+struct anon_vma_name *anon_vma_name_alloc(const char *name);
+void anon_vma_name_free(struct kref *kref);
+#else /* CONFIG_ANON_VMA_NAME */
+static inline struct anon_vma_name *anon_vma_name(struct vm_area_struct *vma)
+{
+	return NULL;
+}
+
+static inline struct anon_vma_name *anon_vma_name_alloc(const char *name)
+{
+	return NULL;
+}
+#endif
+
 struct vma_lock {
 	struct rw_semaphore lock;
 };
@@ -662,6 +683,12 @@ struct vm_area_struct {
 	struct vm_userfaultfd_ctx vm_userfaultfd_ctx;
 } __randomize_layout;
 
+#ifdef CONFIG_NUMA
+#define vma_policy(vma) ((vma)->vm_policy)
+#else
+#define vma_policy(vma) NULL
+#endif
+
 #ifdef CONFIG_SCHED_MM_CID
 struct mm_cid {
 	u64 time;
-- 
cgit v1.2.3


From 94d7d923395129b9248777e575c877e40007f9dc Mon Sep 17 00:00:00 2001
From: Lorenzo Stoakes <lstoakes@gmail.com>
Date: Wed, 11 Oct 2023 18:04:28 +0100
Subject: mm: abstract the vma_merge()/split_vma() pattern for mprotect() et
 al.

mprotect() and other functions which change VMA parameters over a range
each employ a pattern of:-

1. Attempt to merge the range with adjacent VMAs.
2. If this fails, and the range spans a subset of the VMA, split it
   accordingly.

This is open-coded and duplicated in each case. Also in each case most of
the parameters passed to vma_merge() remain the same.

Create a new function, vma_modify(), which abstracts this operation,
accepting only those parameters which can be changed.

To avoid the mess of invoking each function call with unnecessary
parameters, create inline wrapper functions for each of the modify
operations, parameterised only by what is required to perform the action.

We can also significantly simplify the logic - by returning the VMA if we
split (or merged VMA if we do not) we no longer need specific handling for
merge/split cases in any of the call sites.

Note that the userfaultfd_release() case works even though it does not
split VMAs - since start is set to vma->vm_start and end is set to
vma->vm_end, the split logic does not trigger.

In addition, since we calculate pgoff to be equal to vma->vm_pgoff + (start
- vma->vm_start) >> PAGE_SHIFT, and start - vma->vm_start will be 0 in this
instance, this invocation will remain unchanged.

We eliminate a VM_WARN_ON() in mprotect_fixup() as this simply asserts that
vma_merge() correctly ensures that flags remain the same, something that is
already checked in is_mergeable_vma() and elsewhere, and in any case is not
specific to mprotect().

Link: https://lkml.kernel.org/r/0dfa9368f37199a423674bf0ee312e8ea0619044.1697043508.git.lstoakes@gmail.com
Signed-off-by: Lorenzo Stoakes <lstoakes@gmail.com>
Reviewed-by: Vlastimil Babka <vbabka@suse.cz>
Cc: Alexander Viro <viro@zeniv.linux.org.uk>
Cc: Christian Brauner <brauner@kernel.org>
Cc: Liam R. Howlett <Liam.Howlett@oracle.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 fs/userfaultfd.c   | 70 ++++++++++++++----------------------------------------
 include/linux/mm.h | 60 ++++++++++++++++++++++++++++++++++++++++++++++
 mm/madvise.c       | 26 ++++----------------
 mm/mempolicy.c     | 26 +++-----------------
 mm/mlock.c         | 25 ++++---------------
 mm/mmap.c          | 48 +++++++++++++++++++++++++++++++++++++
 mm/mprotect.c      | 29 ++++------------------
 7 files changed, 141 insertions(+), 143 deletions(-)

(limited to 'include/linux')

diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c
index a7c6ef764e63..ac616cfbacf5 100644
--- a/fs/userfaultfd.c
+++ b/fs/userfaultfd.c
@@ -927,20 +927,15 @@ static int userfaultfd_release(struct inode *inode, struct file *file)
 			continue;
 		}
 		new_flags = vma->vm_flags & ~__VM_UFFD_FLAGS;
-		prev = vma_merge(&vmi, mm, prev, vma->vm_start, vma->vm_end,
-				 new_flags, vma->anon_vma,
-				 vma->vm_file, vma->vm_pgoff,
-				 vma_policy(vma),
-				 NULL_VM_UFFD_CTX, anon_vma_name(vma));
-		if (prev) {
-			vma = prev;
-		} else {
-			prev = vma;
-		}
+		vma = vma_modify_flags_uffd(&vmi, prev, vma, vma->vm_start,
+					    vma->vm_end, new_flags,
+					    NULL_VM_UFFD_CTX);
 
 		vma_start_write(vma);
 		userfaultfd_set_vm_flags(vma, new_flags);
 		vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX;
+
+		prev = vma;
 	}
 	mmap_write_unlock(mm);
 	mmput(mm);
@@ -1331,7 +1326,6 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx,
 	unsigned long start, end, vma_end;
 	struct vma_iterator vmi;
 	bool wp_async = userfaultfd_wp_async_ctx(ctx);
-	pgoff_t pgoff;
 
 	user_uffdio_register = (struct uffdio_register __user *) arg;
 
@@ -1484,28 +1478,14 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx,
 		vma_end = min(end, vma->vm_end);
 
 		new_flags = (vma->vm_flags & ~__VM_UFFD_FLAGS) | vm_flags;
-		pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT);
-		prev = vma_merge(&vmi, mm, prev, start, vma_end, new_flags,
-				 vma->anon_vma, vma->vm_file, pgoff,
-				 vma_policy(vma),
-				 ((struct vm_userfaultfd_ctx){ ctx }),
-				 anon_vma_name(vma));
-		if (prev) {
-			/* vma_merge() invalidated the mas */
-			vma = prev;
-			goto next;
-		}
-		if (vma->vm_start < start) {
-			ret = split_vma(&vmi, vma, start, 1);
-			if (ret)
-				break;
-		}
-		if (vma->vm_end > end) {
-			ret = split_vma(&vmi, vma, end, 0);
-			if (ret)
-				break;
+		vma = vma_modify_flags_uffd(&vmi, prev, vma, start, vma_end,
+					    new_flags,
+					    (struct vm_userfaultfd_ctx){ctx});
+		if (IS_ERR(vma)) {
+			ret = PTR_ERR(vma);
+			break;
 		}
-	next:
+
 		/*
 		 * In the vma_merge() successful mprotect-like case 8:
 		 * the next vma was merged into the current one and
@@ -1568,7 +1548,6 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx,
 	const void __user *buf = (void __user *)arg;
 	struct vma_iterator vmi;
 	bool wp_async = userfaultfd_wp_async_ctx(ctx);
-	pgoff_t pgoff;
 
 	ret = -EFAULT;
 	if (copy_from_user(&uffdio_unregister, buf, sizeof(uffdio_unregister)))
@@ -1671,26 +1650,13 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx,
 			uffd_wp_range(vma, start, vma_end - start, false);
 
 		new_flags = vma->vm_flags & ~__VM_UFFD_FLAGS;
-		pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT);
-		prev = vma_merge(&vmi, mm, prev, start, vma_end, new_flags,
-				 vma->anon_vma, vma->vm_file, pgoff,
-				 vma_policy(vma),
-				 NULL_VM_UFFD_CTX, anon_vma_name(vma));
-		if (prev) {
-			vma = prev;
-			goto next;
-		}
-		if (vma->vm_start < start) {
-			ret = split_vma(&vmi, vma, start, 1);
-			if (ret)
-				break;
-		}
-		if (vma->vm_end > end) {
-			ret = split_vma(&vmi, vma, end, 0);
-			if (ret)
-				break;
+		vma = vma_modify_flags_uffd(&vmi, prev, vma, start, vma_end,
+					    new_flags, NULL_VM_UFFD_CTX);
+		if (IS_ERR(vma)) {
+			ret = PTR_ERR(vma);
+			break;
 		}
-	next:
+
 		/*
 		 * In the vma_merge() successful mprotect-like case 8:
 		 * the next vma was merged into the current one and
diff --git a/include/linux/mm.h b/include/linux/mm.h
index fa608cba041f..895feb814620 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -3251,6 +3251,66 @@ extern struct vm_area_struct *copy_vma(struct vm_area_struct **,
 	unsigned long addr, unsigned long len, pgoff_t pgoff,
 	bool *need_rmap_locks);
 extern void exit_mmap(struct mm_struct *);
+struct vm_area_struct *vma_modify(struct vma_iterator *vmi,
+				  struct vm_area_struct *prev,
+				  struct vm_area_struct *vma,
+				  unsigned long start, unsigned long end,
+				  unsigned long vm_flags,
+				  struct mempolicy *policy,
+				  struct vm_userfaultfd_ctx uffd_ctx,
+				  struct anon_vma_name *anon_name);
+
+/* We are about to modify the VMA's flags. */
+static inline struct vm_area_struct
+*vma_modify_flags(struct vma_iterator *vmi,
+		  struct vm_area_struct *prev,
+		  struct vm_area_struct *vma,
+		  unsigned long start, unsigned long end,
+		  unsigned long new_flags)
+{
+	return vma_modify(vmi, prev, vma, start, end, new_flags,
+			  vma_policy(vma), vma->vm_userfaultfd_ctx,
+			  anon_vma_name(vma));
+}
+
+/* We are about to modify the VMA's flags and/or anon_name. */
+static inline struct vm_area_struct
+*vma_modify_flags_name(struct vma_iterator *vmi,
+		       struct vm_area_struct *prev,
+		       struct vm_area_struct *vma,
+		       unsigned long start,
+		       unsigned long end,
+		       unsigned long new_flags,
+		       struct anon_vma_name *new_name)
+{
+	return vma_modify(vmi, prev, vma, start, end, new_flags,
+			  vma_policy(vma), vma->vm_userfaultfd_ctx, new_name);
+}
+
+/* We are about to modify the VMA's memory policy. */
+static inline struct vm_area_struct
+*vma_modify_policy(struct vma_iterator *vmi,
+		   struct vm_area_struct *prev,
+		   struct vm_area_struct *vma,
+		   unsigned long start, unsigned long end,
+		   struct mempolicy *new_pol)
+{
+	return vma_modify(vmi, prev, vma, start, end, vma->vm_flags,
+			  new_pol, vma->vm_userfaultfd_ctx, anon_vma_name(vma));
+}
+
+/* We are about to modify the VMA's flags and/or uffd context. */
+static inline struct vm_area_struct
+*vma_modify_flags_uffd(struct vma_iterator *vmi,
+		       struct vm_area_struct *prev,
+		       struct vm_area_struct *vma,
+		       unsigned long start, unsigned long end,
+		       unsigned long new_flags,
+		       struct vm_userfaultfd_ctx new_ctx)
+{
+	return vma_modify(vmi, prev, vma, start, end, new_flags,
+			  vma_policy(vma), new_ctx, anon_vma_name(vma));
+}
 
 static inline int check_data_rlimit(unsigned long rlim,
 				    unsigned long new,
diff --git a/mm/madvise.c b/mm/madvise.c
index 59e8860c86af..a6b48d4796ba 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -141,7 +141,6 @@ static int madvise_update_vma(struct vm_area_struct *vma,
 {
 	struct mm_struct *mm = vma->vm_mm;
 	int error;
-	pgoff_t pgoff;
 	VMA_ITERATOR(vmi, mm, start);
 
 	if (new_flags == vma->vm_flags && anon_vma_name_eq(anon_vma_name(vma), anon_name)) {
@@ -149,30 +148,13 @@ static int madvise_update_vma(struct vm_area_struct *vma,
 		return 0;
 	}
 
-	pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT);
-	*prev = vma_merge(&vmi, mm, *prev, start, end, new_flags,
-			  vma->anon_vma, vma->vm_file, pgoff, vma_policy(vma),
-			  vma->vm_userfaultfd_ctx, anon_name);
-	if (*prev) {
-		vma = *prev;
-		goto success;
-	}
+	vma = vma_modify_flags_name(&vmi, *prev, vma, start, end, new_flags,
+				    anon_name);
+	if (IS_ERR(vma))
+		return PTR_ERR(vma);
 
 	*prev = vma;
 
-	if (start != vma->vm_start) {
-		error = split_vma(&vmi, vma, start, 1);
-		if (error)
-			return error;
-	}
-
-	if (end != vma->vm_end) {
-		error = split_vma(&vmi, vma, end, 0);
-		if (error)
-			return error;
-	}
-
-success:
 	/* vm_flags is protected by the mmap_lock held in write mode. */
 	vma_start_write(vma);
 	vm_flags_reset(vma, new_flags);
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 38a47fa33ef4..8bb5b4b4b395 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -811,10 +811,7 @@ static int mbind_range(struct vma_iterator *vmi, struct vm_area_struct *vma,
 		struct vm_area_struct **prev, unsigned long start,
 		unsigned long end, struct mempolicy *new_pol)
 {
-	struct vm_area_struct *merged;
 	unsigned long vmstart, vmend;
-	pgoff_t pgoff;
-	int err;
 
 	vmend = min(end, vma->vm_end);
 	if (start > vma->vm_start) {
@@ -829,26 +826,9 @@ static int mbind_range(struct vma_iterator *vmi, struct vm_area_struct *vma,
 		return 0;
 	}
 
-	pgoff = vma->vm_pgoff + ((vmstart - vma->vm_start) >> PAGE_SHIFT);
-	merged = vma_merge(vmi, vma->vm_mm, *prev, vmstart, vmend, vma->vm_flags,
-			 vma->anon_vma, vma->vm_file, pgoff, new_pol,
-			 vma->vm_userfaultfd_ctx, anon_vma_name(vma));
-	if (merged) {
-		*prev = merged;
-		return vma_replace_policy(merged, new_pol);
-	}
-
-	if (vma->vm_start != vmstart) {
-		err = split_vma(vmi, vma, vmstart, 1);
-		if (err)
-			return err;
-	}
-
-	if (vma->vm_end != vmend) {
-		err = split_vma(vmi, vma, vmend, 0);
-		if (err)
-			return err;
-	}
+	vma =  vma_modify_policy(vmi, *prev, vma, vmstart, vmend, new_pol);
+	if (IS_ERR(vma))
+		return PTR_ERR(vma);
 
 	*prev = vma;
 	return vma_replace_policy(vma, new_pol);
diff --git a/mm/mlock.c b/mm/mlock.c
index 42b6865f8f82..aa44456200e3 100644
--- a/mm/mlock.c
+++ b/mm/mlock.c
@@ -476,7 +476,6 @@ static int mlock_fixup(struct vma_iterator *vmi, struct vm_area_struct *vma,
 	       unsigned long end, vm_flags_t newflags)
 {
 	struct mm_struct *mm = vma->vm_mm;
-	pgoff_t pgoff;
 	int nr_pages;
 	int ret = 0;
 	vm_flags_t oldflags = vma->vm_flags;
@@ -487,28 +486,12 @@ static int mlock_fixup(struct vma_iterator *vmi, struct vm_area_struct *vma,
 		/* don't set VM_LOCKED or VM_LOCKONFAULT and don't count */
 		goto out;
 
-	pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT);
-	*prev = vma_merge(vmi, mm, *prev, start, end, newflags,
-			vma->anon_vma, vma->vm_file, pgoff, vma_policy(vma),
-			vma->vm_userfaultfd_ctx, anon_vma_name(vma));
-	if (*prev) {
-		vma = *prev;
-		goto success;
-	}
-
-	if (start != vma->vm_start) {
-		ret = split_vma(vmi, vma, start, 1);
-		if (ret)
-			goto out;
-	}
-
-	if (end != vma->vm_end) {
-		ret = split_vma(vmi, vma, end, 0);
-		if (ret)
-			goto out;
+	vma = vma_modify_flags(vmi, *prev, vma, start, end, newflags);
+	if (IS_ERR(vma)) {
+		ret = PTR_ERR(vma);
+		goto out;
 	}
 
-success:
 	/*
 	 * Keep track of amount of locked VM.
 	 */
diff --git a/mm/mmap.c b/mm/mmap.c
index 673429ee8a9e..bca685820763 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -2437,6 +2437,54 @@ int split_vma(struct vma_iterator *vmi, struct vm_area_struct *vma,
 	return __split_vma(vmi, vma, addr, new_below);
 }
 
+/*
+ * We are about to modify one or multiple of a VMA's flags, policy, userfaultfd
+ * context and anonymous VMA name within the range [start, end).
+ *
+ * As a result, we might be able to merge the newly modified VMA range with an
+ * adjacent VMA with identical properties.
+ *
+ * If no merge is possible and the range does not span the entirety of the VMA,
+ * we then need to split the VMA to accommodate the change.
+ *
+ * The function returns either the merged VMA, the original VMA if a split was
+ * required instead, or an error if the split failed.
+ */
+struct vm_area_struct *vma_modify(struct vma_iterator *vmi,
+				  struct vm_area_struct *prev,
+				  struct vm_area_struct *vma,
+				  unsigned long start, unsigned long end,
+				  unsigned long vm_flags,
+				  struct mempolicy *policy,
+				  struct vm_userfaultfd_ctx uffd_ctx,
+				  struct anon_vma_name *anon_name)
+{
+	pgoff_t pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT);
+	struct vm_area_struct *merged;
+
+	merged = vma_merge(vmi, vma->vm_mm, prev, start, end, vm_flags,
+			   vma->anon_vma, vma->vm_file, pgoff, policy,
+			   uffd_ctx, anon_name);
+	if (merged)
+		return merged;
+
+	if (vma->vm_start < start) {
+		int err = split_vma(vmi, vma, start, 1);
+
+		if (err)
+			return ERR_PTR(err);
+	}
+
+	if (vma->vm_end > end) {
+		int err = split_vma(vmi, vma, end, 0);
+
+		if (err)
+			return ERR_PTR(err);
+	}
+
+	return vma;
+}
+
 /*
  * do_vmi_align_munmap() - munmap the aligned region from @start to @end.
  * @vmi: The vma iterator
diff --git a/mm/mprotect.c b/mm/mprotect.c
index 03e2cec3e669..f1dc8f8c84ef 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -581,7 +581,6 @@ mprotect_fixup(struct vma_iterator *vmi, struct mmu_gather *tlb,
 	long nrpages = (end - start) >> PAGE_SHIFT;
 	unsigned int mm_cp_flags = 0;
 	unsigned long charged = 0;
-	pgoff_t pgoff;
 	int error;
 
 	if (newflags == oldflags) {
@@ -631,34 +630,14 @@ mprotect_fixup(struct vma_iterator *vmi, struct mmu_gather *tlb,
 		newflags &= ~VM_ACCOUNT;
 	}
 
-	/*
-	 * First try to merge with previous and/or next vma.
-	 */
-	pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT);
-	*pprev = vma_merge(vmi, mm, *pprev, start, end, newflags,
-			   vma->anon_vma, vma->vm_file, pgoff, vma_policy(vma),
-			   vma->vm_userfaultfd_ctx, anon_vma_name(vma));
-	if (*pprev) {
-		vma = *pprev;
-		VM_WARN_ON((vma->vm_flags ^ newflags) & ~VM_SOFTDIRTY);
-		goto success;
+	vma = vma_modify_flags(vmi, *pprev, vma, start, end, newflags);
+	if (IS_ERR(vma)) {
+		error = PTR_ERR(vma);
+		goto fail;
 	}
 
 	*pprev = vma;
 
-	if (start != vma->vm_start) {
-		error = split_vma(vmi, vma, start, 1);
-		if (error)
-			goto fail;
-	}
-
-	if (end != vma->vm_end) {
-		error = split_vma(vmi, vma, end, 0);
-		if (error)
-			goto fail;
-	}
-
-success:
 	/*
 	 * vm_flags and vm_page_prot are protected by the mmap_lock
 	 * held in write mode.
-- 
cgit v1.2.3


From adb20b0c785e9e8d5e4d4a07b9c3340d7de0e5dc Mon Sep 17 00:00:00 2001
From: Lorenzo Stoakes <lstoakes@gmail.com>
Date: Wed, 11 Oct 2023 18:04:29 +0100
Subject: mm: make vma_merge() and split_vma() internal

Now the common pattern of - attempting a merge via vma_merge() and should
this fail splitting VMAs via split_vma() - has been abstracted, the former
can be placed into mm/internal.h and the latter made static.

In addition, the split_vma() nommu variant also need not be exported.

Link: https://lkml.kernel.org/r/405f2be10e20c4e9fbcc9fe6b2dfea105f6642e0.1697043508.git.lstoakes@gmail.com
Signed-off-by: Lorenzo Stoakes <lstoakes@gmail.com>
Reviewed-by: Vlastimil Babka <vbabka@suse.cz>
Cc: Alexander Viro <viro@zeniv.linux.org.uk>
Cc: Christian Brauner <brauner@kernel.org>
Cc: Liam R. Howlett <Liam.Howlett@oracle.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/mm.h | 9 ---------
 mm/internal.h      | 9 +++++++++
 mm/mmap.c          | 8 ++++----
 mm/nommu.c         | 4 ++--
 4 files changed, 15 insertions(+), 15 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 895feb814620..1bddb151cd5c 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -3235,16 +3235,7 @@ extern int vma_expand(struct vma_iterator *vmi, struct vm_area_struct *vma,
 		      struct vm_area_struct *next);
 extern int vma_shrink(struct vma_iterator *vmi, struct vm_area_struct *vma,
 		       unsigned long start, unsigned long end, pgoff_t pgoff);
-extern struct vm_area_struct *vma_merge(struct vma_iterator *vmi,
-	struct mm_struct *, struct vm_area_struct *prev, unsigned long addr,
-	unsigned long end, unsigned long vm_flags, struct anon_vma *,
-	struct file *, pgoff_t, struct mempolicy *, struct vm_userfaultfd_ctx,
-	struct anon_vma_name *);
 extern struct anon_vma *find_mergeable_anon_vma(struct vm_area_struct *);
-extern int __split_vma(struct vma_iterator *vmi, struct vm_area_struct *,
-		       unsigned long addr, int new_below);
-extern int split_vma(struct vma_iterator *vmi, struct vm_area_struct *,
-			 unsigned long addr, int new_below);
 extern int insert_vm_struct(struct mm_struct *, struct vm_area_struct *);
 extern void unlink_file_vma(struct vm_area_struct *);
 extern struct vm_area_struct *copy_vma(struct vm_area_struct **,
diff --git a/mm/internal.h b/mm/internal.h
index d1f7a3c612fe..2ab81c68be6c 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -1002,6 +1002,15 @@ struct page *follow_trans_huge_pmd(struct vm_area_struct *vma,
 				   unsigned long addr, pmd_t *pmd,
 				   unsigned int flags);
 
+/*
+ * mm/mmap.c
+ */
+struct vm_area_struct *vma_merge(struct vma_iterator *vmi,
+	struct mm_struct *, struct vm_area_struct *prev, unsigned long addr,
+	unsigned long end, unsigned long vm_flags, struct anon_vma *,
+	struct file *, pgoff_t, struct mempolicy *, struct vm_userfaultfd_ctx,
+	struct anon_vma_name *);
+
 enum {
 	/* mark page accessed */
 	FOLL_TOUCH = 1 << 16,
diff --git a/mm/mmap.c b/mm/mmap.c
index bca685820763..a516f2412f79 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -2346,8 +2346,8 @@ static void unmap_region(struct mm_struct *mm, struct ma_state *mas,
  * has already been checked or doesn't make sense to fail.
  * VMA Iterator will point to the end VMA.
  */
-int __split_vma(struct vma_iterator *vmi, struct vm_area_struct *vma,
-		unsigned long addr, int new_below)
+static int __split_vma(struct vma_iterator *vmi, struct vm_area_struct *vma,
+		       unsigned long addr, int new_below)
 {
 	struct vma_prepare vp;
 	struct vm_area_struct *new;
@@ -2428,8 +2428,8 @@ out_free_vma:
  * Split a vma into two pieces at address 'addr', a new vma is allocated
  * either for the first part or the tail.
  */
-int split_vma(struct vma_iterator *vmi, struct vm_area_struct *vma,
-	      unsigned long addr, int new_below)
+static int split_vma(struct vma_iterator *vmi, struct vm_area_struct *vma,
+		     unsigned long addr, int new_below)
 {
 	if (vma->vm_mm->map_count >= sysctl_max_map_count)
 		return -ENOMEM;
diff --git a/mm/nommu.c b/mm/nommu.c
index f9553579389b..fc4afe924ad5 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -1305,8 +1305,8 @@ SYSCALL_DEFINE1(old_mmap, struct mmap_arg_struct __user *, arg)
  * split a vma into two pieces at address 'addr', a new vma is allocated either
  * for the first part or the tail.
  */
-int split_vma(struct vma_iterator *vmi, struct vm_area_struct *vma,
-	      unsigned long addr, int new_below)
+static int split_vma(struct vma_iterator *vmi, struct vm_area_struct *vma,
+		     unsigned long addr, int new_below)
 {
 	struct vm_area_struct *new;
 	struct vm_region *region;
-- 
cgit v1.2.3


From e8e17ee90eaf650c855adb0a3e5e965fd6692ff1 Mon Sep 17 00:00:00 2001
From: Lorenzo Stoakes <lstoakes@gmail.com>
Date: Thu, 12 Oct 2023 18:04:28 +0100
Subject: mm: drop the assumption that VM_SHARED always implies writable

Patch series "permit write-sealed memfd read-only shared mappings", v4.

The man page for fcntl() describing memfd file seals states the following
about F_SEAL_WRITE:-

    Furthermore, trying to create new shared, writable memory-mappings via
    mmap(2) will also fail with EPERM.

With emphasis on 'writable'.  In turns out in fact that currently the
kernel simply disallows all new shared memory mappings for a memfd with
F_SEAL_WRITE applied, rendering this documentation inaccurate.

This matters because users are therefore unable to obtain a shared mapping
to a memfd after write sealing altogether, which limits their usefulness.
This was reported in the discussion thread [1] originating from a bug
report [2].

This is a product of both using the struct address_space->i_mmap_writable
atomic counter to determine whether writing may be permitted, and the
kernel adjusting this counter when any VM_SHARED mapping is performed and
more generally implicitly assuming VM_SHARED implies writable.

It seems sensible that we should only update this mapping if VM_MAYWRITE
is specified, i.e.  whether it is possible that this mapping could at any
point be written to.

If we do so then all we need to do to permit write seals to function as
documented is to clear VM_MAYWRITE when mapping read-only.  It turns out
this functionality already exists for F_SEAL_FUTURE_WRITE - we can
therefore simply adapt this logic to do the same for F_SEAL_WRITE.

We then hit a chicken and egg situation in mmap_region() where the check
for VM_MAYWRITE occurs before we are able to clear this flag.  To work
around this, perform this check after we invoke call_mmap(), with careful
consideration of error paths.

Thanks to Andy Lutomirski for the suggestion!

[1]:https://lore.kernel.org/all/20230324133646.16101dfa666f253c4715d965@linux-foundation.org/
[2]:https://bugzilla.kernel.org/show_bug.cgi?id=217238


This patch (of 3):

There is a general assumption that VMAs with the VM_SHARED flag set are
writable.  If the VM_MAYWRITE flag is not set, then this is simply not the
case.

Update those checks which affect the struct address_space->i_mmap_writable
field to explicitly test for this by introducing
[vma_]is_shared_maywrite() helper functions.

This remains entirely conservative, as the lack of VM_MAYWRITE guarantees
that the VMA cannot be written to.

Link: https://lkml.kernel.org/r/cover.1697116581.git.lstoakes@gmail.com
Link: https://lkml.kernel.org/r/d978aefefa83ec42d18dfa964ad180dbcde34795.1697116581.git.lstoakes@gmail.com
Signed-off-by: Lorenzo Stoakes <lstoakes@gmail.com>
Suggested-by: Andy Lutomirski <luto@kernel.org>
Reviewed-by: Jan Kara <jack@suse.cz>
Cc: Alexander Viro <viro@zeniv.linux.org.uk>
Cc: Christian Brauner <brauner@kernel.org>
Cc: Hugh Dickins <hughd@google.com>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Mike Kravetz <mike.kravetz@oracle.com>
Cc: Muchun Song <muchun.song@linux.dev>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/fs.h |  4 ++--
 include/linux/mm.h | 11 +++++++++++
 kernel/fork.c      |  2 +-
 mm/filemap.c       |  2 +-
 mm/madvise.c       |  2 +-
 mm/mmap.c          | 12 ++++++------
 6 files changed, 22 insertions(+), 11 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/fs.h b/include/linux/fs.h
index fd539c9fef8e..5265186da0e2 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -454,7 +454,7 @@ extern const struct address_space_operations empty_aops;
  *   It is also used to block modification of page cache contents through
  *   memory mappings.
  * @gfp_mask: Memory allocation flags to use for allocating pages.
- * @i_mmap_writable: Number of VM_SHARED mappings.
+ * @i_mmap_writable: Number of VM_SHARED, VM_MAYWRITE mappings.
  * @nr_thps: Number of THPs in the pagecache (non-shmem only).
  * @i_mmap: Tree of private and shared mappings.
  * @i_mmap_rwsem: Protects @i_mmap and @i_mmap_writable.
@@ -557,7 +557,7 @@ static inline int mapping_mapped(struct address_space *mapping)
 
 /*
  * Might pages of this file have been modified in userspace?
- * Note that i_mmap_writable counts all VM_SHARED vmas: do_mmap
+ * Note that i_mmap_writable counts all VM_SHARED, VM_MAYWRITE vmas: do_mmap
  * marks vma as VM_SHARED if it is shared, and the file was opened for
  * writing i.e. vma may be mprotected writable even if now readonly.
  *
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 1bddb151cd5c..c3b5749ede9d 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -937,6 +937,17 @@ static inline bool vma_is_accessible(struct vm_area_struct *vma)
 	return vma->vm_flags & VM_ACCESS_FLAGS;
 }
 
+static inline bool is_shared_maywrite(vm_flags_t vm_flags)
+{
+	return (vm_flags & (VM_SHARED | VM_MAYWRITE)) ==
+		(VM_SHARED | VM_MAYWRITE);
+}
+
+static inline bool vma_is_shared_maywrite(struct vm_area_struct *vma)
+{
+	return is_shared_maywrite(vma->vm_flags);
+}
+
 static inline
 struct vm_area_struct *vma_find(struct vma_iterator *vmi, unsigned long max)
 {
diff --git a/kernel/fork.c b/kernel/fork.c
index e45a4457ba83..1e6c656e0857 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -733,7 +733,7 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm,
 
 			get_file(file);
 			i_mmap_lock_write(mapping);
-			if (tmp->vm_flags & VM_SHARED)
+			if (vma_is_shared_maywrite(tmp))
 				mapping_allow_writable(mapping);
 			flush_dcache_mmap_lock(mapping);
 			/* insert tmp into the share list, just after mpnt */
diff --git a/mm/filemap.c b/mm/filemap.c
index 9ef49255f1a5..9710f43a89ac 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -3618,7 +3618,7 @@ int generic_file_mmap(struct file *file, struct vm_area_struct *vma)
  */
 int generic_file_readonly_mmap(struct file *file, struct vm_area_struct *vma)
 {
-	if ((vma->vm_flags & VM_SHARED) && (vma->vm_flags & VM_MAYWRITE))
+	if (vma_is_shared_maywrite(vma))
 		return -EINVAL;
 	return generic_file_mmap(file, vma);
 }
diff --git a/mm/madvise.c b/mm/madvise.c
index a6b48d4796ba..cf4d694280e9 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -970,7 +970,7 @@ static long madvise_remove(struct vm_area_struct *vma,
 			return -EINVAL;
 	}
 
-	if ((vma->vm_flags & (VM_SHARED|VM_WRITE)) != (VM_SHARED|VM_WRITE))
+	if (!vma_is_shared_maywrite(vma))
 		return -EACCES;
 
 	offset = (loff_t)(start - vma->vm_start)
diff --git a/mm/mmap.c b/mm/mmap.c
index 3ea52451623b..0041e3631f6c 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -107,7 +107,7 @@ void vma_set_page_prot(struct vm_area_struct *vma)
 static void __remove_shared_vm_struct(struct vm_area_struct *vma,
 		struct file *file, struct address_space *mapping)
 {
-	if (vma->vm_flags & VM_SHARED)
+	if (vma_is_shared_maywrite(vma))
 		mapping_unmap_writable(mapping);
 
 	flush_dcache_mmap_lock(mapping);
@@ -384,7 +384,7 @@ static unsigned long count_vma_pages_range(struct mm_struct *mm,
 static void __vma_link_file(struct vm_area_struct *vma,
 			    struct address_space *mapping)
 {
-	if (vma->vm_flags & VM_SHARED)
+	if (vma_is_shared_maywrite(vma))
 		mapping_allow_writable(mapping);
 
 	flush_dcache_mmap_lock(mapping);
@@ -2846,7 +2846,7 @@ cannot_expand:
 	vma->vm_pgoff = pgoff;
 
 	if (file) {
-		if (vm_flags & VM_SHARED) {
+		if (is_shared_maywrite(vm_flags)) {
 			error = mapping_map_writable(file->f_mapping);
 			if (error)
 				goto free_vma;
@@ -2920,7 +2920,7 @@ cannot_expand:
 	mm->map_count++;
 	if (vma->vm_file) {
 		i_mmap_lock_write(vma->vm_file->f_mapping);
-		if (vma->vm_flags & VM_SHARED)
+		if (vma_is_shared_maywrite(vma))
 			mapping_allow_writable(vma->vm_file->f_mapping);
 
 		flush_dcache_mmap_lock(vma->vm_file->f_mapping);
@@ -2937,7 +2937,7 @@ cannot_expand:
 
 	/* Once vma denies write, undo our temporary denial count */
 unmap_writable:
-	if (file && vm_flags & VM_SHARED)
+	if (file && is_shared_maywrite(vm_flags))
 		mapping_unmap_writable(file->f_mapping);
 	file = vma->vm_file;
 	ksm_add_vma(vma);
@@ -2985,7 +2985,7 @@ unmap_and_free_vma:
 		unmap_region(mm, &vmi.mas, vma, prev, next, vma->vm_start,
 			     vma->vm_end, vma->vm_end, true);
 	}
-	if (file && (vm_flags & VM_SHARED))
+	if (file && is_shared_maywrite(vm_flags))
 		mapping_unmap_writable(file->f_mapping);
 free_vma:
 	vm_area_free(vma);
-- 
cgit v1.2.3


From 28464bbb2ddc199433383994bcb9600c8034afa1 Mon Sep 17 00:00:00 2001
From: Lorenzo Stoakes <lstoakes@gmail.com>
Date: Thu, 12 Oct 2023 18:04:29 +0100
Subject: mm: update memfd seal write check to include F_SEAL_WRITE

The seal_check_future_write() function is called by shmem_mmap() or
hugetlbfs_file_mmap() to disallow any future writable mappings of an memfd
sealed this way.

The F_SEAL_WRITE flag is not checked here, as that is handled via the
mapping->i_mmap_writable mechanism and so any attempt at a mapping would
fail before this could be run.

However we intend to change this, meaning this check can be performed for
F_SEAL_WRITE mappings also.

The logic here is equally applicable to both flags, so update this
function to accommodate both and rename it accordingly.

Link: https://lkml.kernel.org/r/913628168ce6cce77df7d13a63970bae06a526e0.1697116581.git.lstoakes@gmail.com
Signed-off-by: Lorenzo Stoakes <lstoakes@gmail.com>
Reviewed-by: Jan Kara <jack@suse.cz>
Cc: Alexander Viro <viro@zeniv.linux.org.uk>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Christian Brauner <brauner@kernel.org>
Cc: Hugh Dickins <hughd@google.com>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Mike Kravetz <mike.kravetz@oracle.com>
Cc: Muchun Song <muchun.song@linux.dev>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 fs/hugetlbfs/inode.c |  2 +-
 include/linux/mm.h   | 15 ++++++++-------
 mm/shmem.c           |  2 +-
 3 files changed, 10 insertions(+), 9 deletions(-)

(limited to 'include/linux')

diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index 926d01c493fb..85cd418d2340 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -135,7 +135,7 @@ static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma)
 	vm_flags_set(vma, VM_HUGETLB | VM_DONTEXPAND);
 	vma->vm_ops = &hugetlb_vm_ops;
 
-	ret = seal_check_future_write(info->seals, vma);
+	ret = seal_check_write(info->seals, vma);
 	if (ret)
 		return ret;
 
diff --git a/include/linux/mm.h b/include/linux/mm.h
index c3b5749ede9d..86e040e886e4 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -4076,25 +4076,26 @@ static inline void mem_dump_obj(void *object) {}
 #endif
 
 /**
- * seal_check_future_write - Check for F_SEAL_FUTURE_WRITE flag and handle it
+ * seal_check_write - Check for F_SEAL_WRITE or F_SEAL_FUTURE_WRITE flags and
+ *                    handle them.
  * @seals: the seals to check
  * @vma: the vma to operate on
  *
- * Check whether F_SEAL_FUTURE_WRITE is set; if so, do proper check/handling on
- * the vma flags.  Return 0 if check pass, or <0 for errors.
+ * Check whether F_SEAL_WRITE or F_SEAL_FUTURE_WRITE are set; if so, do proper
+ * check/handling on the vma flags.  Return 0 if check pass, or <0 for errors.
  */
-static inline int seal_check_future_write(int seals, struct vm_area_struct *vma)
+static inline int seal_check_write(int seals, struct vm_area_struct *vma)
 {
-	if (seals & F_SEAL_FUTURE_WRITE) {
+	if (seals & (F_SEAL_WRITE | F_SEAL_FUTURE_WRITE)) {
 		/*
 		 * New PROT_WRITE and MAP_SHARED mmaps are not allowed when
-		 * "future write" seal active.
+		 * write seals are active.
 		 */
 		if ((vma->vm_flags & VM_SHARED) && (vma->vm_flags & VM_WRITE))
 			return -EPERM;
 
 		/*
-		 * Since an F_SEAL_FUTURE_WRITE sealed memfd can be mapped as
+		 * Since an F_SEAL_[FUTURE_]WRITE sealed memfd can be mapped as
 		 * MAP_SHARED and read-only, take care to not allow mprotect to
 		 * revert protections on such mappings. Do this only for shared
 		 * mappings. For private mappings, don't need to mask
diff --git a/mm/shmem.c b/mm/shmem.c
index 61b170324e5c..bcbe9dbb9f5f 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -2378,7 +2378,7 @@ static int shmem_mmap(struct file *file, struct vm_area_struct *vma)
 	struct shmem_inode_info *info = SHMEM_I(inode);
 	int ret;
 
-	ret = seal_check_future_write(info->seals, vma);
+	ret = seal_check_write(info->seals, vma);
 	if (ret)
 		return ret;
 
-- 
cgit v1.2.3


From 5097a69d676f7e562240dbe81c21b1c62aaf5950 Mon Sep 17 00:00:00 2001
From: Alexey Dobriyan <adobriyan@gmail.com>
Date: Sat, 16 Sep 2023 21:21:31 +0300
Subject: extract and use FILE_LINE macro

Extract nifty FILE_LINE useful for printk style debugging:

	printk("%s\n", FILE_LINE);

It should not be used en mass probably because __FILE__ string literals
can be merged while FILE_LINE's won't. But for debugging it is what
the doctor ordered.

Don't add leading and trailing underscores, they're painful to type.
Trust me, I've tried both versions.

Link: https://lkml.kernel.org/r/ebf12ac4-5a61-4b12-b8b0-1253eb371332@p183
Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
Cc: Kees Cook <keescook@chromium.org>
Cc: Takashi Iwai <tiwai@suse.de>
Cc: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/fortify-string.h | 2 +-
 include/linux/stringify.h      | 2 ++
 include/linux/timer.h          | 3 +--
 sound/pci/asihpi/hpidebug.h    | 9 ++++-----
 4 files changed, 8 insertions(+), 8 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/fortify-string.h b/include/linux/fortify-string.h
index da51a83b2829..442ee9170259 100644
--- a/include/linux/fortify-string.h
+++ b/include/linux/fortify-string.h
@@ -643,7 +643,7 @@ __FORTIFY_INLINE bool fortify_memcpy_chk(__kernel_size_t size,
 				     __q_size_field, #op),		\
 		  #op ": detected field-spanning write (size %zu) of single %s (size %zu)\n", \
 		  __fortify_size,					\
-		  "field \"" #p "\" at " __FILE__ ":" __stringify(__LINE__), \
+		  "field \"" #p "\" at " FILE_LINE,			\
 		  __p_size_field);					\
 	__underlying_##op(p, q, __fortify_size);			\
 })
diff --git a/include/linux/stringify.h b/include/linux/stringify.h
index 841cec8ed525..0e84cbe65270 100644
--- a/include/linux/stringify.h
+++ b/include/linux/stringify.h
@@ -9,4 +9,6 @@
 #define __stringify_1(x...)	#x
 #define __stringify(x...)	__stringify_1(x)
 
+#define FILE_LINE	__FILE__ ":" __stringify(__LINE__)
+
 #endif	/* !__LINUX_STRINGIFY_H */
diff --git a/include/linux/timer.h b/include/linux/timer.h
index 9162f275819a..26a545bb0153 100644
--- a/include/linux/timer.h
+++ b/include/linux/timer.h
@@ -77,8 +77,7 @@ struct timer_list {
 		.entry = { .next = TIMER_ENTRY_STATIC },	\
 		.function = (_function),			\
 		.flags = (_flags),				\
-		__TIMER_LOCKDEP_MAP_INITIALIZER(		\
-			__FILE__ ":" __stringify(__LINE__))	\
+		__TIMER_LOCKDEP_MAP_INITIALIZER(FILE_LINE)	\
 	}
 
 #define DEFINE_TIMER(_name, _function)				\
diff --git a/sound/pci/asihpi/hpidebug.h b/sound/pci/asihpi/hpidebug.h
index c24ed69eb743..c6dfc229213d 100644
--- a/sound/pci/asihpi/hpidebug.h
+++ b/sound/pci/asihpi/hpidebug.h
@@ -29,16 +29,15 @@ enum { HPI_DEBUG_LEVEL_ERROR = 0,	/* always log errors */
    the start of each message, eg see linux kernel hpios.h */
 
 #ifdef SOURCEFILE_NAME
+#undef FILE_LINE
 #define FILE_LINE  SOURCEFILE_NAME ":" __stringify(__LINE__) " "
-#else
-#define FILE_LINE  __FILE__ ":" __stringify(__LINE__) " "
 #endif
 
 #define HPI_DEBUG_ASSERT(expression) \
 	do { \
 		if (!(expression)) { \
 			printk(KERN_ERR  FILE_LINE \
-				"ASSERT " __stringify(expression)); \
+				" ASSERT " __stringify(expression)); \
 		} \
 	} while (0)
 
@@ -46,7 +45,7 @@ enum { HPI_DEBUG_LEVEL_ERROR = 0,	/* always log errors */
 	do { \
 		if (hpi_debug_level >= HPI_DEBUG_LEVEL_##level) { \
 			printk(HPI_DEBUG_FLAG_##level \
-			FILE_LINE  __VA_ARGS__); \
+			FILE_LINE " " __VA_ARGS__); \
 		} \
 	} while (0)
 
@@ -70,7 +69,7 @@ void hpi_debug_data(u16 *pdata, u32 len);
 	do { \
 		if (hpi_debug_level >= HPI_DEBUG_LEVEL_##level) { \
 			hpi_debug_message(phm, HPI_DEBUG_FLAG_##level \
-				FILE_LINE __stringify(level)); \
+				FILE_LINE " " __stringify(level)); \
 		} \
 	} while (0)
 
-- 
cgit v1.2.3


From 9bf2850c9170b52a6ab052085feced55effa78ef Mon Sep 17 00:00:00 2001
From: Christophe JAILLET <christophe.jaillet@wanadoo.fr>
Date: Sat, 16 Sep 2023 17:50:11 +0200
Subject: kstrtox: remove strtobool()

The conversion from strtobool() to kstrtobool() is completed.  So
strtobool() can now be removed.

Link: https://lkml.kernel.org/r/87e3cc2547df174cd5af1fadbf866be4ef9e8e45.1694878151.git.christophe.jaillet@wanadoo.fr
Signed-off-by: Christophe JAILLET <christophe.jaillet@wanadoo.fr>
Reviewed-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/kstrtox.h | 5 -----
 1 file changed, 5 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/kstrtox.h b/include/linux/kstrtox.h
index 529974e22ea7..7fcf29a4e0de 100644
--- a/include/linux/kstrtox.h
+++ b/include/linux/kstrtox.h
@@ -147,9 +147,4 @@ extern long simple_strtol(const char *,char **,unsigned int);
 extern unsigned long long simple_strtoull(const char *,char **,unsigned int);
 extern long long simple_strtoll(const char *,char **,unsigned int);
 
-static inline int strtobool(const char *s, bool *res)
-{
-	return kstrtobool(s, res);
-}
-
 #endif	/* _LINUX_KSTRTOX_H */
-- 
cgit v1.2.3


From 80fcac55385ccb710d33a20dc1caaef29bd5a921 Mon Sep 17 00:00:00 2001
From: David Laight <David.Laight@ACULAB.COM>
Date: Mon, 18 Sep 2023 08:16:30 +0000
Subject: minmax: add umin(a, b) and umax(a, b)

Patch series "minmax: Relax type checks in min() and max()", v4.

The min() (etc) functions in minmax.h require that the arguments have
exactly the same types.

However when the type check fails, rather than look at the types and fix
the type of a variable/constant, everyone seems to jump on min_t().  In
reality min_t() ought to be rare - when something unusual is being done,
not normality.

The orginal min() (added in 2.4.9) replaced several inline functions and
included the type - so matched the implicit casting of the function call.
This was renamed min_t() in 2.4.10 and the current min() added.  There is
no actual indication that the conversion of negatve values to large
unsigned values has ever been an actual problem.

A quick grep shows 5734 min() and 4597 min_t().  Having the casts on
almost half of the calls shows that something is clearly wrong.

If the wrong type is picked (and it is far too easy to pick the type of
the result instead of the larger input) then significant bits can get
discarded.

Pretty much the worst example is in the derived clamp_val(), consider:
        unsigned char x = 200u;
        y = clamp_val(x, 10u, 300u);

I also suspect that many of the min_t(u16, ...) are actually wrong.  For
example copy_data() in printk_ringbuffer.c contains:

        data_size = min_t(u16, buf_size, len);

Here buf_size is 'unsigned int' and len 'u16', pass a 64k buffer (can you
prove that doesn't happen?) and no data is returned.  Apparantly it did -
and has since been fixed.

The only reason that most of the min_t() are 'fine' is that pretty much
all the values in the kernel are between 0 and INT_MAX.

Patch 1 adds umin(), this uses integer promotions to convert both
arguments to 'unsigned long long'.  It can be used to compare a signed
type that is known to contain a non-negative value with an unsigned type.
The compiler typically optimises it all away.  Added first so that it can
be referred to in patch 2.

Patch 2 replaces the 'same type' check with a 'same signedness' one.  This
makes min(unsigned_int_var, sizeof()) be ok.  The error message is also
improved and will contain the expanded form of both arguments (useful for
seeing how constants are defined).

Patch 3 just fixes some whitespace.

Patch 4 allows comparisons of 'unsigned char' and 'unsigned short' to
signed types.  The integer promotion rules convert them both to 'signed
int' prior to the comparison so they can never cause a negative value be
converted to a large positive one.

Patch 5 (rewritted for v4) allows comparisons of unsigned values against
non-negative constant integer expressions.  This makes
min(unsigned_int_var, 4) be ok.

The only common case that is still errored is the comparison of signed
values against unsigned constant integer expressions below __INT_MAX__.
Typcally min(int_val, sizeof (foo)), the real fix for this is casting the
constant: min(int_var, (int)sizeof (foo)).

With all the patches applied pretty much all the min_t() could be replaced
by min(), and most of the rest by umin().  However they all need careful
inspection due to code like:

        sz = min_t(unsigned char, sz - 1, LIM - 1) + 1;

which converts 0 to LIM.


This patch (of 6):

umin() and umax() can be used when min()/max() errors a signed v unsigned
compare when the signed value is known to be non-negative.

Unlike min_t(some_unsigned_type, a, b) umin() will never mask off high
bits if an inappropriate type is selected.

The '+ 0u + 0ul + 0ull' may look strange.
The '+ 0u' is needed for 'signed int' on 64bit systems.
The '+ 0ul' is needed for 'signed long' on 32bit systems.
The '+ 0ull' is needed for 'signed long long'.

Link: https://lkml.kernel.org/r/b97faef60ad24922b530241c5d7c933c@AcuMS.aculab.com
Link: https://lkml.kernel.org/r/41d93ca827a248698ec64bf57e0c05a5@AcuMS.aculab.com
Signed-off-by: David Laight <david.laight@aculab.com>
Cc: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Cc: Christoph Hellwig <hch@infradead.org>
Cc: Jason A. Donenfeld <Jason@zx2c4.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/minmax.h | 17 +++++++++++++++++
 1 file changed, 17 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/minmax.h b/include/linux/minmax.h
index ca69abd6151e..2a197f54fe05 100644
--- a/include/linux/minmax.h
+++ b/include/linux/minmax.h
@@ -76,6 +76,23 @@
  */
 #define max(x, y)	__careful_cmp(x, y, >)
 
+/**
+ * umin - return minimum of two non-negative values
+ *   Signed types are zero extended to match a larger unsigned type.
+ * @x: first value
+ * @y: second value
+ */
+#define umin(x, y)	\
+	__careful_cmp((x) + 0u + 0ul + 0ull, (y) + 0u + 0ul + 0ull, <)
+
+/**
+ * umax - return maximum of two non-negative values
+ * @x: first value
+ * @y: second value
+ */
+#define umax(x, y)	\
+	__careful_cmp((x) + 0u + 0ul + 0ull, (y) + 0u + 0ul + 0ull, >)
+
 /**
  * min3 - return minimum of three values
  * @x: first value
-- 
cgit v1.2.3


From d03eba99f5bf7cbc6e2fdde3b6fa36954ad58e09 Mon Sep 17 00:00:00 2001
From: David Laight <David.Laight@ACULAB.COM>
Date: Mon, 18 Sep 2023 08:17:15 +0000
Subject: minmax: allow min()/max()/clamp() if the arguments have the same
 signedness.

The type-check in min()/max() is there to stop unexpected results if a
negative value gets converted to a large unsigned value.  However it also
rejects 'unsigned int' v 'unsigned long' compares which are common and
never problematc.

Replace the 'same type' check with a 'same signedness' check.

The new test isn't itself a compile time error, so use static_assert() to
report the error and give a meaningful error message.

Due to the way builtin_choose_expr() works detecting the error in the
'non-constant' side (where static_assert() can be used) also detects
errors when the arguments are constant.

Link: https://lkml.kernel.org/r/fe7e6c542e094bfca655abcd323c1c98@AcuMS.aculab.com
Signed-off-by: David Laight <david.laight@aculab.com>
Cc: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Cc: Christoph Hellwig <hch@infradead.org>
Cc: Jason A. Donenfeld <Jason@zx2c4.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/minmax.h | 60 +++++++++++++++++++++++++++-----------------------
 1 file changed, 32 insertions(+), 28 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/minmax.h b/include/linux/minmax.h
index 2a197f54fe05..8718fd71a793 100644
--- a/include/linux/minmax.h
+++ b/include/linux/minmax.h
@@ -12,9 +12,8 @@
  *
  * - avoid multiple evaluations of the arguments (so side-effects like
  *   "x++" happen only once) when non-constant.
- * - perform strict type-checking (to generate warnings instead of
- *   nasty runtime surprises). See the "unnecessary" pointer comparison
- *   in __typecheck().
+ * - perform signed v unsigned type-checking (to generate compile
+ *   errors instead of nasty runtime surprises).
  * - retain result as a constant expressions when called with only
  *   constant expressions (to avoid tripping VLA warnings in stack
  *   allocation usage).
@@ -22,23 +21,30 @@
 #define __typecheck(x, y) \
 	(!!(sizeof((typeof(x) *)1 == (typeof(y) *)1)))
 
-#define __no_side_effects(x, y) \
-		(__is_constexpr(x) && __is_constexpr(y))
+/* is_signed_type() isn't a constexpr for pointer types */
+#define __is_signed(x) 								\
+	__builtin_choose_expr(__is_constexpr(is_signed_type(typeof(x))),	\
+		is_signed_type(typeof(x)), 0)
 
-#define __safe_cmp(x, y) \
-		(__typecheck(x, y) && __no_side_effects(x, y))
+#define __types_ok(x, y) \
+	(__is_signed(x) == __is_signed(y))
 
-#define __cmp(x, y, op)	((x) op (y) ? (x) : (y))
+#define __cmp_op_min <
+#define __cmp_op_max >
 
-#define __cmp_once(x, y, unique_x, unique_y, op) ({	\
+#define __cmp(op, x, y)	((x) __cmp_op_##op (y) ? (x) : (y))
+
+#define __cmp_once(op, x, y, unique_x, unique_y) ({	\
 		typeof(x) unique_x = (x);		\
 		typeof(y) unique_y = (y);		\
-		__cmp(unique_x, unique_y, op); })
+		static_assert(__types_ok(x, y),		\
+			#op "(" #x ", " #y ") signedness error, fix types or consider u" #op "() before " #op "_t()"); \
+		__cmp(op, unique_x, unique_y); })
 
-#define __careful_cmp(x, y, op) \
-	__builtin_choose_expr(__safe_cmp(x, y), \
-		__cmp(x, y, op), \
-		__cmp_once(x, y, __UNIQUE_ID(__x), __UNIQUE_ID(__y), op))
+#define __careful_cmp(op, x, y)					\
+	__builtin_choose_expr(__is_constexpr((x) - (y)),	\
+		__cmp(op, x, y),				\
+		__cmp_once(op, x, y, __UNIQUE_ID(__x), __UNIQUE_ID(__y)))
 
 #define __clamp(val, lo, hi)	\
 	((val) >= (hi) ? (hi) : ((val) <= (lo) ? (lo) : (val)))
@@ -47,17 +53,15 @@
 		typeof(val) unique_val = (val);				\
 		typeof(lo) unique_lo = (lo);				\
 		typeof(hi) unique_hi = (hi);				\
+		static_assert(__builtin_choose_expr(__is_constexpr((lo) > (hi)), 	\
+				(lo) <= (hi), true),					\
+			"clamp() low limit " #lo " greater than high limit " #hi);	\
+		static_assert(__types_ok(val, lo), "clamp() 'lo' signedness error");	\
+		static_assert(__types_ok(val, hi), "clamp() 'hi' signedness error");	\
 		__clamp(unique_val, unique_lo, unique_hi); })
 
-#define __clamp_input_check(lo, hi)					\
-        (BUILD_BUG_ON_ZERO(__builtin_choose_expr(			\
-                __is_constexpr((lo) > (hi)), (lo) > (hi), false)))
-
 #define __careful_clamp(val, lo, hi) ({					\
-	__clamp_input_check(lo, hi) +					\
-	__builtin_choose_expr(__typecheck(val, lo) && __typecheck(val, hi) && \
-			      __typecheck(hi, lo) && __is_constexpr(val) && \
-			      __is_constexpr(lo) && __is_constexpr(hi),	\
+	__builtin_choose_expr(__is_constexpr((val) - (lo) + (hi)),	\
 		__clamp(val, lo, hi),					\
 		__clamp_once(val, lo, hi, __UNIQUE_ID(__val),		\
 			     __UNIQUE_ID(__lo), __UNIQUE_ID(__hi))); })
@@ -67,14 +71,14 @@
  * @x: first value
  * @y: second value
  */
-#define min(x, y)	__careful_cmp(x, y, <)
+#define min(x, y)	__careful_cmp(min, x, y)
 
 /**
  * max - return maximum of two values of the same or compatible types
  * @x: first value
  * @y: second value
  */
-#define max(x, y)	__careful_cmp(x, y, >)
+#define max(x, y)	__careful_cmp(max, x, y)
 
 /**
  * umin - return minimum of two non-negative values
@@ -83,7 +87,7 @@
  * @y: second value
  */
 #define umin(x, y)	\
-	__careful_cmp((x) + 0u + 0ul + 0ull, (y) + 0u + 0ul + 0ull, <)
+	__careful_cmp(min, (x) + 0u + 0ul + 0ull, (y) + 0u + 0ul + 0ull)
 
 /**
  * umax - return maximum of two non-negative values
@@ -91,7 +95,7 @@
  * @y: second value
  */
 #define umax(x, y)	\
-	__careful_cmp((x) + 0u + 0ul + 0ull, (y) + 0u + 0ul + 0ull, >)
+	__careful_cmp(max, (x) + 0u + 0ul + 0ull, (y) + 0u + 0ul + 0ull)
 
 /**
  * min3 - return minimum of three values
@@ -143,7 +147,7 @@
  * @x: first value
  * @y: second value
  */
-#define min_t(type, x, y)	__careful_cmp((type)(x), (type)(y), <)
+#define min_t(type, x, y)	__careful_cmp(min, (type)(x), (type)(y))
 
 /**
  * max_t - return maximum of two values, using the specified type
@@ -151,7 +155,7 @@
  * @x: first value
  * @y: second value
  */
-#define max_t(type, x, y)	__careful_cmp((type)(x), (type)(y), >)
+#define max_t(type, x, y)	__careful_cmp(max, (type)(x), (type)(y))
 
 /*
  * Do not check the array parameter using __must_be_array().
-- 
cgit v1.2.3


From f4b84b2ff851f01d0fac619eadef47eb41648534 Mon Sep 17 00:00:00 2001
From: David Laight <David.Laight@ACULAB.COM>
Date: Mon, 18 Sep 2023 08:17:57 +0000
Subject: minmax: fix indentation of __cmp_once() and __clamp_once()

Remove the extra indentation and align continuation markers.

Link: https://lkml.kernel.org/r/bed41317a05c498ea0209eafbcab45a5@AcuMS.aculab.com
Signed-off-by: David Laight <david.laight@aculab.com>
Cc: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Cc: Christoph Hellwig <hch@infradead.org>
Cc: Jason A. Donenfeld <Jason@zx2c4.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/minmax.h | 30 +++++++++++++++---------------
 1 file changed, 15 insertions(+), 15 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/minmax.h b/include/linux/minmax.h
index 8718fd71a793..c0e738eacefa 100644
--- a/include/linux/minmax.h
+++ b/include/linux/minmax.h
@@ -35,11 +35,11 @@
 #define __cmp(op, x, y)	((x) __cmp_op_##op (y) ? (x) : (y))
 
 #define __cmp_once(op, x, y, unique_x, unique_y) ({	\
-		typeof(x) unique_x = (x);		\
-		typeof(y) unique_y = (y);		\
-		static_assert(__types_ok(x, y),		\
-			#op "(" #x ", " #y ") signedness error, fix types or consider u" #op "() before " #op "_t()"); \
-		__cmp(op, unique_x, unique_y); })
+	typeof(x) unique_x = (x);			\
+	typeof(y) unique_y = (y);			\
+	static_assert(__types_ok(x, y),			\
+		#op "(" #x ", " #y ") signedness error, fix types or consider u" #op "() before " #op "_t()"); \
+	__cmp(op, unique_x, unique_y); })
 
 #define __careful_cmp(op, x, y)					\
 	__builtin_choose_expr(__is_constexpr((x) - (y)),	\
@@ -49,16 +49,16 @@
 #define __clamp(val, lo, hi)	\
 	((val) >= (hi) ? (hi) : ((val) <= (lo) ? (lo) : (val)))
 
-#define __clamp_once(val, lo, hi, unique_val, unique_lo, unique_hi) ({	\
-		typeof(val) unique_val = (val);				\
-		typeof(lo) unique_lo = (lo);				\
-		typeof(hi) unique_hi = (hi);				\
-		static_assert(__builtin_choose_expr(__is_constexpr((lo) > (hi)), 	\
-				(lo) <= (hi), true),					\
-			"clamp() low limit " #lo " greater than high limit " #hi);	\
-		static_assert(__types_ok(val, lo), "clamp() 'lo' signedness error");	\
-		static_assert(__types_ok(val, hi), "clamp() 'hi' signedness error");	\
-		__clamp(unique_val, unique_lo, unique_hi); })
+#define __clamp_once(val, lo, hi, unique_val, unique_lo, unique_hi) ({		\
+	typeof(val) unique_val = (val);						\
+	typeof(lo) unique_lo = (lo);						\
+	typeof(hi) unique_hi = (hi);						\
+	static_assert(__builtin_choose_expr(__is_constexpr((lo) > (hi)), 	\
+			(lo) <= (hi), true),					\
+		"clamp() low limit " #lo " greater than high limit " #hi);	\
+	static_assert(__types_ok(val, lo), "clamp() 'lo' signedness error");	\
+	static_assert(__types_ok(val, hi), "clamp() 'hi' signedness error");	\
+	__clamp(unique_val, unique_lo, unique_hi); })
 
 #define __careful_clamp(val, lo, hi) ({					\
 	__builtin_choose_expr(__is_constexpr((val) - (lo) + (hi)),	\
-- 
cgit v1.2.3


From 4ead534fba42fc4fd41163297528d2aa731cd121 Mon Sep 17 00:00:00 2001
From: David Laight <David.Laight@ACULAB.COM>
Date: Mon, 18 Sep 2023 08:18:40 +0000
Subject: minmax: allow comparisons of 'int' against 'unsigned char/short'

Since 'unsigned char/short' get promoted to 'signed int' it is safe to
compare them against an 'int' value.

Link: https://lkml.kernel.org/r/8732ef5f809c47c28a7be47c938b28d4@AcuMS.aculab.com
Signed-off-by: David Laight <david.laight@aculab.com>
Cc: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Cc: Christoph Hellwig <hch@infradead.org>
Cc: Jason A. Donenfeld <Jason@zx2c4.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/minmax.h | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/minmax.h b/include/linux/minmax.h
index c0e738eacefa..842c1db62ffe 100644
--- a/include/linux/minmax.h
+++ b/include/linux/minmax.h
@@ -26,8 +26,9 @@
 	__builtin_choose_expr(__is_constexpr(is_signed_type(typeof(x))),	\
 		is_signed_type(typeof(x)), 0)
 
-#define __types_ok(x, y) \
-	(__is_signed(x) == __is_signed(y))
+#define __types_ok(x, y) 			\
+	(__is_signed(x) == __is_signed(y) ||	\
+		__is_signed((x) + 0) == __is_signed((y) + 0))
 
 #define __cmp_op_min <
 #define __cmp_op_max >
-- 
cgit v1.2.3


From 867046cc7027703f60a46339ffde91a1970f2901 Mon Sep 17 00:00:00 2001
From: David Laight <David.Laight@ACULAB.COM>
Date: Mon, 18 Sep 2023 08:19:25 +0000
Subject: minmax: relax check to allow comparison between unsigned arguments
 and signed constants

Allow (for example) min(unsigned_var, 20).

The opposite min(signed_var, 20u) is still errored.

Since a comparison between signed and unsigned never makes the unsigned
value negative it is only necessary to adjust the __types_ok() test.

Link: https://lkml.kernel.org/r/633b64e2f39e46bb8234809c5595b8c7@AcuMS.aculab.com
Signed-off-by: David Laight <david.laight@aculab.com>
Cc: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Cc: Christoph Hellwig <hch@infradead.org>
Cc: Jason A. Donenfeld <Jason@zx2c4.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/minmax.h | 24 +++++++++++++++++-------
 1 file changed, 17 insertions(+), 7 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/minmax.h b/include/linux/minmax.h
index 842c1db62ffe..2ec559284a9f 100644
--- a/include/linux/minmax.h
+++ b/include/linux/minmax.h
@@ -10,13 +10,18 @@
 /*
  * min()/max()/clamp() macros must accomplish three things:
  *
- * - avoid multiple evaluations of the arguments (so side-effects like
+ * - Avoid multiple evaluations of the arguments (so side-effects like
  *   "x++" happen only once) when non-constant.
- * - perform signed v unsigned type-checking (to generate compile
- *   errors instead of nasty runtime surprises).
- * - retain result as a constant expressions when called with only
+ * - Retain result as a constant expressions when called with only
  *   constant expressions (to avoid tripping VLA warnings in stack
  *   allocation usage).
+ * - Perform signed v unsigned type-checking (to generate compile
+ *   errors instead of nasty runtime surprises).
+ * - Unsigned char/short are always promoted to signed int and can be
+ *   compared against signed or unsigned arguments.
+ * - Unsigned arguments can be compared against non-negative signed constants.
+ * - Comparison of a signed argument against an unsigned constant fails
+ *   even if the constant is below __INT_MAX__ and could be cast to int.
  */
 #define __typecheck(x, y) \
 	(!!(sizeof((typeof(x) *)1 == (typeof(y) *)1)))
@@ -26,9 +31,14 @@
 	__builtin_choose_expr(__is_constexpr(is_signed_type(typeof(x))),	\
 		is_signed_type(typeof(x)), 0)
 
-#define __types_ok(x, y) 			\
-	(__is_signed(x) == __is_signed(y) ||	\
-		__is_signed((x) + 0) == __is_signed((y) + 0))
+/* True for a non-negative signed int constant */
+#define __is_noneg_int(x)	\
+	(__builtin_choose_expr(__is_constexpr(x) && __is_signed(x), x, -1) >= 0)
+
+#define __types_ok(x, y) 					\
+	(__is_signed(x) == __is_signed(y) ||			\
+		__is_signed((x) + 0) == __is_signed((y) + 0) ||	\
+		__is_noneg_int(x) || __is_noneg_int(y))
 
 #define __cmp_op_min <
 #define __cmp_op_max >
-- 
cgit v1.2.3


From 598f0ac1500d7145c696de4d38797b1dd2c651de Mon Sep 17 00:00:00 2001
From: David Laight <David.Laight@ACULAB.COM>
Date: Thu, 5 Oct 2023 11:39:54 +0000
Subject: compiler.h: move __is_constexpr() to compiler.h

Prior to f747e6667ebb2 __is_constexpr() was in its only user minmax.h.
That commit moved it to const.h - but that file just defines ULL(x) and
UL(x) so that constants can be defined for .S and .c files.

So apart from the word 'const' it wasn't really a good location.  Instead
move the definition to compiler.h just before the similar

  is_signed_type() and is_unsigned_type().

This may not be a good long-term home, but the three definitions belong
together.

Link: https://lkml.kernel.org/r/2a6680bbe2e84459816a113730426782@AcuMS.aculab.com
Signed-off-by: David Laight <david.laight@aculab.com>
Reviewed-by: Kees Cook <keescook@chromium.org>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Bart Van Assche <bvanassche@acm.org>
Cc: Nathan Chancellor <nathan@kernel.org>
Cc: Nick Desaulniers <ndesaulniers@google.com>
Cc: Rasmus Villemoes <linux@rasmusvillemoes.dk>
Cc: Steven Rostedt (Google) <rostedt@goodmis.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/compiler.h       | 8 ++++++++
 include/linux/const.h          | 8 --------
 tools/include/linux/compiler.h | 8 ++++++++
 tools/include/linux/const.h    | 8 --------
 4 files changed, 16 insertions(+), 16 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/compiler.h b/include/linux/compiler.h
index 174099fdc485..bb1339c7057b 100644
--- a/include/linux/compiler.h
+++ b/include/linux/compiler.h
@@ -227,6 +227,14 @@ static inline void *offset_to_ptr(const int *off)
 /* &a[0] degrades to a pointer: a different type from an array */
 #define __must_be_array(a)	BUILD_BUG_ON_ZERO(__same_type((a), &(a)[0]))
 
+/*
+ * This returns a constant expression while determining if an argument is
+ * a constant expression, most importantly without evaluating the argument.
+ * Glory to Martin Uecker <Martin.Uecker@med.uni-goettingen.de>
+ */
+#define __is_constexpr(x) \
+	(sizeof(int) == sizeof(*(8 ? ((void *)((long)(x) * 0l)) : (int *)8)))
+
 /*
  * Whether 'type' is a signed type or an unsigned type. Supports scalar types,
  * bool and also pointer types.
diff --git a/include/linux/const.h b/include/linux/const.h
index 435ddd72d2c4..81b8aae5a855 100644
--- a/include/linux/const.h
+++ b/include/linux/const.h
@@ -3,12 +3,4 @@
 
 #include <vdso/const.h>
 
-/*
- * This returns a constant expression while determining if an argument is
- * a constant expression, most importantly without evaluating the argument.
- * Glory to Martin Uecker <Martin.Uecker@med.uni-goettingen.de>
- */
-#define __is_constexpr(x) \
-	(sizeof(int) == sizeof(*(8 ? ((void *)((long)(x) * 0l)) : (int *)8)))
-
 #endif /* _LINUX_CONST_H */
diff --git a/tools/include/linux/compiler.h b/tools/include/linux/compiler.h
index 1684216e826a..7b65566f3e42 100644
--- a/tools/include/linux/compiler.h
+++ b/tools/include/linux/compiler.h
@@ -63,6 +63,14 @@
 # define __same_type(a, b) __builtin_types_compatible_p(typeof(a), typeof(b))
 #endif
 
+/*
+ * This returns a constant expression while determining if an argument is
+ * a constant expression, most importantly without evaluating the argument.
+ * Glory to Martin Uecker <Martin.Uecker@med.uni-goettingen.de>
+ */
+#define __is_constexpr(x) \
+	(sizeof(int) == sizeof(*(8 ? ((void *)((long)(x) * 0l)) : (int *)8)))
+
 #ifdef __ANDROID__
 /*
  * FIXME: Big hammer to get rid of tons of:
diff --git a/tools/include/linux/const.h b/tools/include/linux/const.h
index 435ddd72d2c4..81b8aae5a855 100644
--- a/tools/include/linux/const.h
+++ b/tools/include/linux/const.h
@@ -3,12 +3,4 @@
 
 #include <vdso/const.h>
 
-/*
- * This returns a constant expression while determining if an argument is
- * a constant expression, most importantly without evaluating the argument.
- * Glory to Martin Uecker <Martin.Uecker@med.uni-goettingen.de>
- */
-#define __is_constexpr(x) \
-	(sizeof(int) == sizeof(*(8 ? ((void *)((long)(x) * 0l)) : (int *)8)))
-
 #endif /* _LINUX_CONST_H */
-- 
cgit v1.2.3


From 0e108725f6cc5b3be9e607f89c9fbcbb236367b7 Mon Sep 17 00:00:00 2001
From: Kees Cook <keescook@chromium.org>
Date: Wed, 18 Oct 2023 10:53:58 -0700
Subject: string: Adjust strtomem() logic to allow for smaller sources

Arnd noticed we have a case where a shorter source string is being copied
into a destination byte array, but this results in a strnlen() call that
exceeds the size of the source. This is seen with -Wstringop-overread:

In file included from ../include/linux/uuid.h:11,
                 from ../include/linux/mod_devicetable.h:14,
                 from ../include/linux/cpufeature.h:12,
                 from ../arch/x86/coco/tdx/tdx.c:7:
../arch/x86/coco/tdx/tdx.c: In function 'tdx_panic.constprop':
../include/linux/string.h:284:9: error: 'strnlen' specified bound 64 exceeds source size 60 [-Werror=stringop-overread]
  284 |         memcpy_and_pad(dest, _dest_len, src, strnlen(src, _dest_len), pad); \
      |         ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
../arch/x86/coco/tdx/tdx.c:124:9: note: in expansion of macro 'strtomem_pad'
  124 |         strtomem_pad(message.str, msg, '\0');
      |         ^~~~~~~~~~~~

Use the smaller of the two buffer sizes when calling strnlen(). When
src length is unknown (SIZE_MAX), it is adjusted to use dest length,
which is what the original code did.

Reported-by: Arnd Bergmann <arnd@arndb.de>
Fixes: dfbafa70bde2 ("string: Introduce strtomem() and strtomem_pad()")
Tested-by: Arnd Bergmann <arnd@arndb.de>
Cc: Andy Shevchenko <andy@kernel.org>
Cc: linux-hardening@vger.kernel.org
Signed-off-by: Kees Cook <keescook@chromium.org>
---
 include/linux/string.h | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/string.h b/include/linux/string.h
index dbfc66400050..9e3cb6923b0e 100644
--- a/include/linux/string.h
+++ b/include/linux/string.h
@@ -277,10 +277,12 @@ void memcpy_and_pad(void *dest, size_t dest_len, const void *src, size_t count,
  */
 #define strtomem_pad(dest, src, pad)	do {				\
 	const size_t _dest_len = __builtin_object_size(dest, 1);	\
+	const size_t _src_len = __builtin_object_size(src, 1);		\
 									\
 	BUILD_BUG_ON(!__builtin_constant_p(_dest_len) ||		\
 		     _dest_len == (size_t)-1);				\
-	memcpy_and_pad(dest, _dest_len, src, strnlen(src, _dest_len), pad); \
+	memcpy_and_pad(dest, _dest_len, src,				\
+		       strnlen(src, min(_src_len, _dest_len)), pad);	\
 } while (0)
 
 /**
@@ -298,10 +300,11 @@ void memcpy_and_pad(void *dest, size_t dest_len, const void *src, size_t count,
  */
 #define strtomem(dest, src)	do {					\
 	const size_t _dest_len = __builtin_object_size(dest, 1);	\
+	const size_t _src_len = __builtin_object_size(src, 1);		\
 									\
 	BUILD_BUG_ON(!__builtin_constant_p(_dest_len) ||		\
 		     _dest_len == (size_t)-1);				\
-	memcpy(dest, src, min(_dest_len, strnlen(src, _dest_len)));	\
+	memcpy(dest, src, strnlen(src, min(_src_len, _dest_len)));	\
 } while (0)
 
 /**
-- 
cgit v1.2.3


From bdcb37a5d8de3253da48b120e3f10863696fb654 Mon Sep 17 00:00:00 2001
From: Arnd Bergmann <arnd@arndb.de>
Date: Tue, 17 Oct 2023 16:33:58 +0200
Subject: buildid: reduce header file dependencies for module

The vmlinux decompressor code intentionally has only a limited set of
included header files, but this started running into a build failure
because of the bitmap logic needing linux/errno.h:

In file included from include/linux/cpumask.h:12,
                 from include/linux/mm_types_task.h:14,
                 from include/linux/mm_types.h:5,
                 from include/linux/buildid.h:5,
                 from include/linux/module.h:14,
                 from arch/arm/boot/compressed/../../../../lib/lz4/lz4_decompress.c:39,
                 from arch/arm/boot/compressed/../../../../lib/decompress_unlz4.c:10,
                 from arch/arm/boot/compressed/decompress.c:60:
include/linux/bitmap.h: In function 'bitmap_allocate_region':
include/linux/bitmap.h:527:25: error: 'EBUSY' undeclared (first use in this function)
  527 |                 return -EBUSY;
      |                         ^~~~~
include/linux/bitmap.h:527:25: note: each undeclared identifier is reported only once for each function it appears in
include/linux/bitmap.h: In function 'bitmap_find_free_region':
include/linux/bitmap.h:554:17: error: 'ENOMEM' undeclared (first use in this function)
  554 |         return -ENOMEM;
      |                 ^~~~~~

This is easily avoided by changing linux/buildid.h to no longer depend on
linux/mm_types.h, a header that pulls in a huge number of indirect dependencies.

Fixes: b9c957f554442 ("bitmap: move bitmap_*_region() functions to bitmap.h")
Fixes: bd7525dacd7e2 ("bpf: Move stack_map_get_build_id into lib")
Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Signed-off-by: Yury Norov <yury.norov@gmail.com>
---
 include/linux/buildid.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/buildid.h b/include/linux/buildid.h
index 3b7a0ff4642f..8a582d242f06 100644
--- a/include/linux/buildid.h
+++ b/include/linux/buildid.h
@@ -2,10 +2,11 @@
 #ifndef _LINUX_BUILDID_H
 #define _LINUX_BUILDID_H
 
-#include <linux/mm_types.h>
+#include <linux/types.h>
 
 #define BUILD_ID_SIZE_MAX 20
 
+struct vm_area_struct;
 int build_id_parse(struct vm_area_struct *vma, unsigned char *build_id,
 		   __u32 *size);
 int build_id_parse_buf(const void *buf, unsigned char *build_id, u32 buf_size);
-- 
cgit v1.2.3


From 32671e3799ca2e4590773fd0e63aaa4229e50c06 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Wed, 18 Oct 2023 13:56:54 +0200
Subject: perf: Disallow mis-matched inherited group reads

Because group consistency is non-atomic between parent (filedesc) and children
(inherited) events, it is possible for PERF_FORMAT_GROUP read() to try and sum
non-matching counter groups -- with non-sensical results.

Add group_generation to distinguish the case where a parent group removes and
adds an event and thus has the same number, but a different configuration of
events as inherited groups.

This became a problem when commit fa8c269353d5 ("perf/core: Invert
perf_read_group() loops") flipped the order of child_list and sibling_list.
Previously it would iterate the group (sibling_list) first, and for each
sibling traverse the child_list. In this order, only the group composition of
the parent is relevant. By flipping the order the group composition of the
child (inherited) events becomes an issue and the mis-match in group
composition becomes evident.

That said; even prior to this commit, while reading of a group that is not
equally inherited was not broken, it still made no sense.

(Ab)use ECHILD as error return to indicate issues with child process group
composition.

Fixes: fa8c269353d5 ("perf/core: Invert perf_read_group() loops")
Reported-by: Budimir Markovic <markovicbudimir@gmail.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lkml.kernel.org/r/20231018115654.GK33217@noisy.programming.kicks-ass.net
---
 include/linux/perf_event.h |  1 +
 kernel/events/core.c       | 39 +++++++++++++++++++++++++++++++++------
 2 files changed, 34 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index e85cd1c0eaf3..7b5406e3288d 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -704,6 +704,7 @@ struct perf_event {
 	/* The cumulative AND of all event_caps for events in this group. */
 	int				group_caps;
 
+	unsigned int			group_generation;
 	struct perf_event		*group_leader;
 	/*
 	 * event->pmu will always point to pmu in which this event belongs.
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 4c72a41f11af..d0663b9324e7 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -1954,6 +1954,7 @@ static void perf_group_attach(struct perf_event *event)
 
 	list_add_tail(&event->sibling_list, &group_leader->sibling_list);
 	group_leader->nr_siblings++;
+	group_leader->group_generation++;
 
 	perf_event__header_size(group_leader);
 
@@ -2144,6 +2145,7 @@ static void perf_group_detach(struct perf_event *event)
 	if (leader != event) {
 		list_del_init(&event->sibling_list);
 		event->group_leader->nr_siblings--;
+		event->group_leader->group_generation++;
 		goto out;
 	}
 
@@ -5440,7 +5442,7 @@ static int __perf_read_group_add(struct perf_event *leader,
 					u64 read_format, u64 *values)
 {
 	struct perf_event_context *ctx = leader->ctx;
-	struct perf_event *sub;
+	struct perf_event *sub, *parent;
 	unsigned long flags;
 	int n = 1; /* skip @nr */
 	int ret;
@@ -5450,6 +5452,33 @@ static int __perf_read_group_add(struct perf_event *leader,
 		return ret;
 
 	raw_spin_lock_irqsave(&ctx->lock, flags);
+	/*
+	 * Verify the grouping between the parent and child (inherited)
+	 * events is still in tact.
+	 *
+	 * Specifically:
+	 *  - leader->ctx->lock pins leader->sibling_list
+	 *  - parent->child_mutex pins parent->child_list
+	 *  - parent->ctx->mutex pins parent->sibling_list
+	 *
+	 * Because parent->ctx != leader->ctx (and child_list nests inside
+	 * ctx->mutex), group destruction is not atomic between children, also
+	 * see perf_event_release_kernel(). Additionally, parent can grow the
+	 * group.
+	 *
+	 * Therefore it is possible to have parent and child groups in a
+	 * different configuration and summing over such a beast makes no sense
+	 * what so ever.
+	 *
+	 * Reject this.
+	 */
+	parent = leader->parent;
+	if (parent &&
+	    (parent->group_generation != leader->group_generation ||
+	     parent->nr_siblings != leader->nr_siblings)) {
+		ret = -ECHILD;
+		goto unlock;
+	}
 
 	/*
 	 * Since we co-schedule groups, {enabled,running} times of siblings
@@ -5483,8 +5512,9 @@ static int __perf_read_group_add(struct perf_event *leader,
 			values[n++] = atomic64_read(&sub->lost_samples);
 	}
 
+unlock:
 	raw_spin_unlock_irqrestore(&ctx->lock, flags);
-	return 0;
+	return ret;
 }
 
 static int perf_read_group(struct perf_event *event,
@@ -5503,10 +5533,6 @@ static int perf_read_group(struct perf_event *event,
 
 	values[0] = 1 + leader->nr_siblings;
 
-	/*
-	 * By locking the child_mutex of the leader we effectively
-	 * lock the child list of all siblings.. XXX explain how.
-	 */
 	mutex_lock(&leader->child_mutex);
 
 	ret = __perf_read_group_add(leader, read_format, values);
@@ -13346,6 +13372,7 @@ static int inherit_group(struct perf_event *parent_event,
 		    !perf_get_aux_event(child_ctr, leader))
 			return -EINVAL;
 	}
+	leader->group_generation = parent_event->group_generation;
 	return 0;
 }
 
-- 
cgit v1.2.3


From 9a6c782158f703fa65c7d52e2156b7d4e7cd41f5 Mon Sep 17 00:00:00 2001
From: Wolfram Sang <wsa+renesas@sang-engineering.com>
Date: Fri, 22 Sep 2023 08:15:24 +0200
Subject: vlynq: remove bus driver

There are no users with a vlynq_driver in the Kernel tree. Also, only
the AR7 platform ever initialized a VLYNQ bus, but AR7 is going to be
removed from the Kernel. OpenWRT had some out-of-tree drivers which they
probably intended to upport, but AR7 devices are even there not
supported anymore because they are "stuck with Kernel
3.18" [1]. This code can go.

[1] https://openwrt.org/docs/techref/targets/ar7

Signed-off-by: Wolfram Sang <wsa+renesas@sang-engineering.com>
Acked-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Acked-by: Florian Fainelli <f.fainelli@gmail.com>
Signed-off-by: Thomas Bogendoerfer <tsbogend@alpha.franken.de>
---
 MAINTAINERS            |   7 -
 drivers/Kconfig        |   2 -
 drivers/Makefile       |   1 -
 drivers/vlynq/Kconfig  |  21 --
 drivers/vlynq/Makefile |   6 -
 drivers/vlynq/vlynq.c  | 799 -------------------------------------------------
 include/linux/vlynq.h  | 149 ---------
 7 files changed, 985 deletions(-)
 delete mode 100644 drivers/vlynq/Kconfig
 delete mode 100644 drivers/vlynq/Makefile
 delete mode 100644 drivers/vlynq/vlynq.c
 delete mode 100644 include/linux/vlynq.h

(limited to 'include/linux')

diff --git a/MAINTAINERS b/MAINTAINERS
index 35977b269d5e..1cf130fce761 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -22963,13 +22963,6 @@ W:	https://linuxtv.org
 T:	git git://linuxtv.org/media_tree.git
 F:	drivers/media/test-drivers/vivid/*
 
-VLYNQ BUS
-M:	Florian Fainelli <f.fainelli@gmail.com>
-L:	openwrt-devel@lists.openwrt.org (subscribers-only)
-S:	Maintained
-F:	drivers/vlynq/vlynq.c
-F:	include/linux/vlynq.h
-
 VM SOCKETS (AF_VSOCK)
 M:	Stefano Garzarella <sgarzare@redhat.com>
 L:	virtualization@lists.linux-foundation.org
diff --git a/drivers/Kconfig b/drivers/Kconfig
index efb66e25fa2d..9c8b82ddebfe 100644
--- a/drivers/Kconfig
+++ b/drivers/Kconfig
@@ -135,8 +135,6 @@ source "drivers/uio/Kconfig"
 
 source "drivers/vfio/Kconfig"
 
-source "drivers/vlynq/Kconfig"
-
 source "drivers/virt/Kconfig"
 
 source "drivers/virtio/Kconfig"
diff --git a/drivers/Makefile b/drivers/Makefile
index 1bec7819a837..f7eeec79ef66 100644
--- a/drivers/Makefile
+++ b/drivers/Makefile
@@ -151,7 +151,6 @@ obj-$(CONFIG_BCMA)		+= bcma/
 obj-$(CONFIG_VHOST_RING)	+= vhost/
 obj-$(CONFIG_VHOST_IOTLB)	+= vhost/
 obj-$(CONFIG_VHOST)		+= vhost/
-obj-$(CONFIG_VLYNQ)		+= vlynq/
 obj-$(CONFIG_GREYBUS)		+= greybus/
 obj-$(CONFIG_COMEDI)		+= comedi/
 obj-$(CONFIG_STAGING)		+= staging/
diff --git a/drivers/vlynq/Kconfig b/drivers/vlynq/Kconfig
deleted file mode 100644
index e7f9492a0b04..000000000000
--- a/drivers/vlynq/Kconfig
+++ /dev/null
@@ -1,21 +0,0 @@
-# SPDX-License-Identifier: GPL-2.0-only
-menu "TI VLYNQ"
-	depends on AR7
-
-config VLYNQ
-	bool "TI VLYNQ bus support"
-	help
-	  Support for Texas Instruments(R) VLYNQ bus.
-	  The VLYNQ bus is a high-speed, serial and packetized
-	  data bus which allows external peripherals of a SoC
-	  to appear into the system's main memory.
-
-	  If unsure, say N
-
-config VLYNQ_DEBUG
-	bool "VLYNQ bus debug"
-	depends on VLYNQ && DEBUG_KERNEL
-	help
-	  Turn on VLYNQ bus debugging.
-
-endmenu
diff --git a/drivers/vlynq/Makefile b/drivers/vlynq/Makefile
deleted file mode 100644
index d9ce5b2b5ce0..000000000000
--- a/drivers/vlynq/Makefile
+++ /dev/null
@@ -1,6 +0,0 @@
-# SPDX-License-Identifier: GPL-2.0-only
-#
-# Makefile for kernel vlynq drivers
-#
-
-obj-$(CONFIG_VLYNQ) += vlynq.o
diff --git a/drivers/vlynq/vlynq.c b/drivers/vlynq/vlynq.c
deleted file mode 100644
index 4af6615808cc..000000000000
--- a/drivers/vlynq/vlynq.c
+++ /dev/null
@@ -1,799 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-or-later
-/*
- * Copyright (C) 2006, 2007 Eugene Konev <ejka@openwrt.org>
- *
- * Parts of the VLYNQ specification can be found here:
- * http://www.ti.com/litv/pdf/sprue36a
- */
-
-#include <linux/init.h>
-#include <linux/types.h>
-#include <linux/kernel.h>
-#include <linux/string.h>
-#include <linux/device.h>
-#include <linux/module.h>
-#include <linux/errno.h>
-#include <linux/platform_device.h>
-#include <linux/interrupt.h>
-#include <linux/delay.h>
-#include <linux/io.h>
-#include <linux/slab.h>
-#include <linux/irq.h>
-
-#include <linux/vlynq.h>
-
-#define VLYNQ_CTRL_PM_ENABLE		0x80000000
-#define VLYNQ_CTRL_CLOCK_INT		0x00008000
-#define VLYNQ_CTRL_CLOCK_DIV(x)		(((x) & 7) << 16)
-#define VLYNQ_CTRL_INT_LOCAL		0x00004000
-#define VLYNQ_CTRL_INT_ENABLE		0x00002000
-#define VLYNQ_CTRL_INT_VECTOR(x)	(((x) & 0x1f) << 8)
-#define VLYNQ_CTRL_INT2CFG		0x00000080
-#define VLYNQ_CTRL_RESET		0x00000001
-
-#define VLYNQ_CTRL_CLOCK_MASK          (0x7 << 16)
-
-#define VLYNQ_INT_OFFSET		0x00000014
-#define VLYNQ_REMOTE_OFFSET		0x00000080
-
-#define VLYNQ_STATUS_LINK		0x00000001
-#define VLYNQ_STATUS_LERROR		0x00000080
-#define VLYNQ_STATUS_RERROR		0x00000100
-
-#define VINT_ENABLE			0x00000100
-#define VINT_TYPE_EDGE			0x00000080
-#define VINT_LEVEL_LOW			0x00000040
-#define VINT_VECTOR(x)			((x) & 0x1f)
-#define VINT_OFFSET(irq)		(8 * ((irq) % 4))
-
-#define VLYNQ_AUTONEGO_V2		0x00010000
-
-struct vlynq_regs {
-	u32 revision;
-	u32 control;
-	u32 status;
-	u32 int_prio;
-	u32 int_status;
-	u32 int_pending;
-	u32 int_ptr;
-	u32 tx_offset;
-	struct vlynq_mapping rx_mapping[4];
-	u32 chip;
-	u32 autonego;
-	u32 unused[6];
-	u32 int_device[8];
-};
-
-#ifdef CONFIG_VLYNQ_DEBUG
-static void vlynq_dump_regs(struct vlynq_device *dev)
-{
-	int i;
-
-	printk(KERN_DEBUG "VLYNQ local=%p remote=%p\n",
-			dev->local, dev->remote);
-	for (i = 0; i < 32; i++) {
-		printk(KERN_DEBUG "VLYNQ: local %d: %08x\n",
-			i + 1, ((u32 *)dev->local)[i]);
-		printk(KERN_DEBUG "VLYNQ: remote %d: %08x\n",
-			i + 1, ((u32 *)dev->remote)[i]);
-	}
-}
-
-static void vlynq_dump_mem(u32 *base, int count)
-{
-	int i;
-
-	for (i = 0; i < (count + 3) / 4; i++) {
-		if (i % 4 == 0)
-			printk(KERN_DEBUG "\nMEM[0x%04x]:", i * 4);
-		printk(KERN_DEBUG " 0x%08x", *(base + i));
-	}
-	printk(KERN_DEBUG "\n");
-}
-#endif
-
-/* Check the VLYNQ link status with a given device */
-static int vlynq_linked(struct vlynq_device *dev)
-{
-	int i;
-
-	for (i = 0; i < 100; i++)
-		if (readl(&dev->local->status) & VLYNQ_STATUS_LINK)
-			return 1;
-		else
-			cpu_relax();
-
-	return 0;
-}
-
-static void vlynq_reset(struct vlynq_device *dev)
-{
-	writel(readl(&dev->local->control) | VLYNQ_CTRL_RESET,
-			&dev->local->control);
-
-	/* Wait for the devices to finish resetting */
-	msleep(5);
-
-	/* Remove reset bit */
-	writel(readl(&dev->local->control) & ~VLYNQ_CTRL_RESET,
-			&dev->local->control);
-
-	/* Give some time for the devices to settle */
-	msleep(5);
-}
-
-static void vlynq_irq_unmask(struct irq_data *d)
-{
-	struct vlynq_device *dev = irq_data_get_irq_chip_data(d);
-	int virq;
-	u32 val;
-
-	BUG_ON(!dev);
-	virq = d->irq - dev->irq_start;
-	val = readl(&dev->remote->int_device[virq >> 2]);
-	val |= (VINT_ENABLE | virq) << VINT_OFFSET(virq);
-	writel(val, &dev->remote->int_device[virq >> 2]);
-}
-
-static void vlynq_irq_mask(struct irq_data *d)
-{
-	struct vlynq_device *dev = irq_data_get_irq_chip_data(d);
-	int virq;
-	u32 val;
-
-	BUG_ON(!dev);
-	virq = d->irq - dev->irq_start;
-	val = readl(&dev->remote->int_device[virq >> 2]);
-	val &= ~(VINT_ENABLE << VINT_OFFSET(virq));
-	writel(val, &dev->remote->int_device[virq >> 2]);
-}
-
-static int vlynq_irq_type(struct irq_data *d, unsigned int flow_type)
-{
-	struct vlynq_device *dev = irq_data_get_irq_chip_data(d);
-	int virq;
-	u32 val;
-
-	BUG_ON(!dev);
-	virq = d->irq - dev->irq_start;
-	val = readl(&dev->remote->int_device[virq >> 2]);
-	switch (flow_type & IRQ_TYPE_SENSE_MASK) {
-	case IRQ_TYPE_EDGE_RISING:
-	case IRQ_TYPE_EDGE_FALLING:
-	case IRQ_TYPE_EDGE_BOTH:
-		val |= VINT_TYPE_EDGE << VINT_OFFSET(virq);
-		val &= ~(VINT_LEVEL_LOW << VINT_OFFSET(virq));
-		break;
-	case IRQ_TYPE_LEVEL_HIGH:
-		val &= ~(VINT_TYPE_EDGE << VINT_OFFSET(virq));
-		val &= ~(VINT_LEVEL_LOW << VINT_OFFSET(virq));
-		break;
-	case IRQ_TYPE_LEVEL_LOW:
-		val &= ~(VINT_TYPE_EDGE << VINT_OFFSET(virq));
-		val |= VINT_LEVEL_LOW << VINT_OFFSET(virq);
-		break;
-	default:
-		return -EINVAL;
-	}
-	writel(val, &dev->remote->int_device[virq >> 2]);
-	return 0;
-}
-
-static void vlynq_local_ack(struct irq_data *d)
-{
-	struct vlynq_device *dev = irq_data_get_irq_chip_data(d);
-	u32 status = readl(&dev->local->status);
-
-	pr_debug("%s: local status: 0x%08x\n",
-		       dev_name(&dev->dev), status);
-	writel(status, &dev->local->status);
-}
-
-static void vlynq_remote_ack(struct irq_data *d)
-{
-	struct vlynq_device *dev = irq_data_get_irq_chip_data(d);
-	u32 status = readl(&dev->remote->status);
-
-	pr_debug("%s: remote status: 0x%08x\n",
-		       dev_name(&dev->dev), status);
-	writel(status, &dev->remote->status);
-}
-
-static irqreturn_t vlynq_irq(int irq, void *dev_id)
-{
-	struct vlynq_device *dev = dev_id;
-	u32 status;
-	int virq = 0;
-
-	status = readl(&dev->local->int_status);
-	writel(status, &dev->local->int_status);
-
-	if (unlikely(!status))
-		spurious_interrupt();
-
-	while (status) {
-		if (status & 1)
-			do_IRQ(dev->irq_start + virq);
-		status >>= 1;
-		virq++;
-	}
-
-	return IRQ_HANDLED;
-}
-
-static struct irq_chip vlynq_irq_chip = {
-	.name = "vlynq",
-	.irq_unmask = vlynq_irq_unmask,
-	.irq_mask = vlynq_irq_mask,
-	.irq_set_type = vlynq_irq_type,
-};
-
-static struct irq_chip vlynq_local_chip = {
-	.name = "vlynq local error",
-	.irq_unmask = vlynq_irq_unmask,
-	.irq_mask = vlynq_irq_mask,
-	.irq_ack = vlynq_local_ack,
-};
-
-static struct irq_chip vlynq_remote_chip = {
-	.name = "vlynq local error",
-	.irq_unmask = vlynq_irq_unmask,
-	.irq_mask = vlynq_irq_mask,
-	.irq_ack = vlynq_remote_ack,
-};
-
-static int vlynq_setup_irq(struct vlynq_device *dev)
-{
-	u32 val;
-	int i, virq;
-
-	if (dev->local_irq == dev->remote_irq) {
-		printk(KERN_ERR
-		       "%s: local vlynq irq should be different from remote\n",
-		       dev_name(&dev->dev));
-		return -EINVAL;
-	}
-
-	/* Clear local and remote error bits */
-	writel(readl(&dev->local->status), &dev->local->status);
-	writel(readl(&dev->remote->status), &dev->remote->status);
-
-	/* Now setup interrupts */
-	val = VLYNQ_CTRL_INT_VECTOR(dev->local_irq);
-	val |= VLYNQ_CTRL_INT_ENABLE | VLYNQ_CTRL_INT_LOCAL |
-		VLYNQ_CTRL_INT2CFG;
-	val |= readl(&dev->local->control);
-	writel(VLYNQ_INT_OFFSET, &dev->local->int_ptr);
-	writel(val, &dev->local->control);
-
-	val = VLYNQ_CTRL_INT_VECTOR(dev->remote_irq);
-	val |= VLYNQ_CTRL_INT_ENABLE;
-	val |= readl(&dev->remote->control);
-	writel(VLYNQ_INT_OFFSET, &dev->remote->int_ptr);
-	writel(val, &dev->remote->int_ptr);
-	writel(val, &dev->remote->control);
-
-	for (i = dev->irq_start; i <= dev->irq_end; i++) {
-		virq = i - dev->irq_start;
-		if (virq == dev->local_irq) {
-			irq_set_chip_and_handler(i, &vlynq_local_chip,
-						 handle_level_irq);
-			irq_set_chip_data(i, dev);
-		} else if (virq == dev->remote_irq) {
-			irq_set_chip_and_handler(i, &vlynq_remote_chip,
-						 handle_level_irq);
-			irq_set_chip_data(i, dev);
-		} else {
-			irq_set_chip_and_handler(i, &vlynq_irq_chip,
-						 handle_simple_irq);
-			irq_set_chip_data(i, dev);
-			writel(0, &dev->remote->int_device[virq >> 2]);
-		}
-	}
-
-	if (request_irq(dev->irq, vlynq_irq, IRQF_SHARED, "vlynq", dev)) {
-		printk(KERN_ERR "%s: request_irq failed\n",
-					dev_name(&dev->dev));
-		return -EAGAIN;
-	}
-
-	return 0;
-}
-
-static void vlynq_device_release(struct device *dev)
-{
-	struct vlynq_device *vdev = to_vlynq_device(dev);
-	kfree(vdev);
-}
-
-static int vlynq_device_match(struct device *dev,
-			      struct device_driver *drv)
-{
-	struct vlynq_device *vdev = to_vlynq_device(dev);
-	struct vlynq_driver *vdrv = to_vlynq_driver(drv);
-	struct vlynq_device_id *ids = vdrv->id_table;
-
-	while (ids->id) {
-		if (ids->id == vdev->dev_id) {
-			vdev->divisor = ids->divisor;
-			vlynq_set_drvdata(vdev, ids);
-			printk(KERN_INFO "Driver found for VLYNQ "
-				"device: %08x\n", vdev->dev_id);
-			return 1;
-		}
-		printk(KERN_DEBUG "Not using the %08x VLYNQ device's driver"
-			" for VLYNQ device: %08x\n", ids->id, vdev->dev_id);
-		ids++;
-	}
-	return 0;
-}
-
-static int vlynq_device_probe(struct device *dev)
-{
-	struct vlynq_device *vdev = to_vlynq_device(dev);
-	struct vlynq_driver *drv = to_vlynq_driver(dev->driver);
-	struct vlynq_device_id *id = vlynq_get_drvdata(vdev);
-	int result = -ENODEV;
-
-	if (drv->probe)
-		result = drv->probe(vdev, id);
-	if (result)
-		put_device(dev);
-	return result;
-}
-
-static void vlynq_device_remove(struct device *dev)
-{
-	struct vlynq_driver *drv = to_vlynq_driver(dev->driver);
-
-	if (drv->remove)
-		drv->remove(to_vlynq_device(dev));
-}
-
-int __vlynq_register_driver(struct vlynq_driver *driver, struct module *owner)
-{
-	driver->driver.name = driver->name;
-	driver->driver.bus = &vlynq_bus_type;
-	return driver_register(&driver->driver);
-}
-EXPORT_SYMBOL(__vlynq_register_driver);
-
-void vlynq_unregister_driver(struct vlynq_driver *driver)
-{
-	driver_unregister(&driver->driver);
-}
-EXPORT_SYMBOL(vlynq_unregister_driver);
-
-/*
- * A VLYNQ remote device can clock the VLYNQ bus master
- * using a dedicated clock line. In that case, both the
- * remove device and the bus master should have the same
- * serial clock dividers configured. Iterate through the
- * 8 possible dividers until we actually link with the
- * device.
- */
-static int __vlynq_try_remote(struct vlynq_device *dev)
-{
-	int i;
-
-	vlynq_reset(dev);
-	for (i = dev->dev_id ? vlynq_rdiv2 : vlynq_rdiv8; dev->dev_id ?
-			i <= vlynq_rdiv8 : i >= vlynq_rdiv2;
-		dev->dev_id ? i++ : i--) {
-
-		if (!vlynq_linked(dev))
-			break;
-
-		writel((readl(&dev->remote->control) &
-				~VLYNQ_CTRL_CLOCK_MASK) |
-				VLYNQ_CTRL_CLOCK_INT |
-				VLYNQ_CTRL_CLOCK_DIV(i - vlynq_rdiv1),
-				&dev->remote->control);
-		writel((readl(&dev->local->control)
-				& ~(VLYNQ_CTRL_CLOCK_INT |
-				VLYNQ_CTRL_CLOCK_MASK)) |
-				VLYNQ_CTRL_CLOCK_DIV(i - vlynq_rdiv1),
-				&dev->local->control);
-
-		if (vlynq_linked(dev)) {
-			printk(KERN_DEBUG
-				"%s: using remote clock divisor %d\n",
-				dev_name(&dev->dev), i - vlynq_rdiv1 + 1);
-			dev->divisor = i;
-			return 0;
-		} else {
-			vlynq_reset(dev);
-		}
-	}
-
-	return -ENODEV;
-}
-
-/*
- * A VLYNQ remote device can be clocked by the VLYNQ bus
- * master using a dedicated clock line. In that case, only
- * the bus master configures the serial clock divider.
- * Iterate through the 8 possible dividers until we
- * actually get a link with the device.
- */
-static int __vlynq_try_local(struct vlynq_device *dev)
-{
-	int i;
-
-	vlynq_reset(dev);
-
-	for (i = dev->dev_id ? vlynq_ldiv2 : vlynq_ldiv8; dev->dev_id ?
-			i <= vlynq_ldiv8 : i >= vlynq_ldiv2;
-		dev->dev_id ? i++ : i--) {
-
-		writel((readl(&dev->local->control) &
-				~VLYNQ_CTRL_CLOCK_MASK) |
-				VLYNQ_CTRL_CLOCK_INT |
-				VLYNQ_CTRL_CLOCK_DIV(i - vlynq_ldiv1),
-				&dev->local->control);
-
-		if (vlynq_linked(dev)) {
-			printk(KERN_DEBUG
-				"%s: using local clock divisor %d\n",
-				dev_name(&dev->dev), i - vlynq_ldiv1 + 1);
-			dev->divisor = i;
-			return 0;
-		} else {
-			vlynq_reset(dev);
-		}
-	}
-
-	return -ENODEV;
-}
-
-/*
- * When using external clocking method, serial clock
- * is supplied by an external oscillator, therefore we
- * should mask the local clock bit in the clock control
- * register for both the bus master and the remote device.
- */
-static int __vlynq_try_external(struct vlynq_device *dev)
-{
-	vlynq_reset(dev);
-	if (!vlynq_linked(dev))
-		return -ENODEV;
-
-	writel((readl(&dev->remote->control) &
-			~VLYNQ_CTRL_CLOCK_INT),
-			&dev->remote->control);
-
-	writel((readl(&dev->local->control) &
-			~VLYNQ_CTRL_CLOCK_INT),
-			&dev->local->control);
-
-	if (vlynq_linked(dev)) {
-		printk(KERN_DEBUG "%s: using external clock\n",
-			dev_name(&dev->dev));
-			dev->divisor = vlynq_div_external;
-		return 0;
-	}
-
-	return -ENODEV;
-}
-
-static int __vlynq_enable_device(struct vlynq_device *dev)
-{
-	int result;
-	struct plat_vlynq_ops *ops = dev->dev.platform_data;
-
-	result = ops->on(dev);
-	if (result)
-		return result;
-
-	switch (dev->divisor) {
-	case vlynq_div_external:
-	case vlynq_div_auto:
-		/* When the device is brought from reset it should have clock
-		 * generation negotiated by hardware.
-		 * Check which device is generating clocks and perform setup
-		 * accordingly */
-		if (vlynq_linked(dev) && readl(&dev->remote->control) &
-		   VLYNQ_CTRL_CLOCK_INT) {
-			if (!__vlynq_try_remote(dev) ||
-				!__vlynq_try_local(dev)  ||
-				!__vlynq_try_external(dev))
-				return 0;
-		} else {
-			if (!__vlynq_try_external(dev) ||
-				!__vlynq_try_local(dev)    ||
-				!__vlynq_try_remote(dev))
-				return 0;
-		}
-		break;
-	case vlynq_ldiv1:
-	case vlynq_ldiv2:
-	case vlynq_ldiv3:
-	case vlynq_ldiv4:
-	case vlynq_ldiv5:
-	case vlynq_ldiv6:
-	case vlynq_ldiv7:
-	case vlynq_ldiv8:
-		writel(VLYNQ_CTRL_CLOCK_INT |
-			VLYNQ_CTRL_CLOCK_DIV(dev->divisor -
-			vlynq_ldiv1), &dev->local->control);
-		writel(0, &dev->remote->control);
-		if (vlynq_linked(dev)) {
-			printk(KERN_DEBUG
-				"%s: using local clock divisor %d\n",
-				dev_name(&dev->dev),
-				dev->divisor - vlynq_ldiv1 + 1);
-			return 0;
-		}
-		break;
-	case vlynq_rdiv1:
-	case vlynq_rdiv2:
-	case vlynq_rdiv3:
-	case vlynq_rdiv4:
-	case vlynq_rdiv5:
-	case vlynq_rdiv6:
-	case vlynq_rdiv7:
-	case vlynq_rdiv8:
-		writel(0, &dev->local->control);
-		writel(VLYNQ_CTRL_CLOCK_INT |
-			VLYNQ_CTRL_CLOCK_DIV(dev->divisor -
-			vlynq_rdiv1), &dev->remote->control);
-		if (vlynq_linked(dev)) {
-			printk(KERN_DEBUG
-				"%s: using remote clock divisor %d\n",
-				dev_name(&dev->dev),
-				dev->divisor - vlynq_rdiv1 + 1);
-			return 0;
-		}
-		break;
-	}
-
-	ops->off(dev);
-	return -ENODEV;
-}
-
-int vlynq_enable_device(struct vlynq_device *dev)
-{
-	struct plat_vlynq_ops *ops = dev->dev.platform_data;
-	int result = -ENODEV;
-
-	result = __vlynq_enable_device(dev);
-	if (result)
-		return result;
-
-	result = vlynq_setup_irq(dev);
-	if (result)
-		ops->off(dev);
-
-	dev->enabled = !result;
-	return result;
-}
-EXPORT_SYMBOL(vlynq_enable_device);
-
-
-void vlynq_disable_device(struct vlynq_device *dev)
-{
-	struct plat_vlynq_ops *ops = dev->dev.platform_data;
-
-	dev->enabled = 0;
-	free_irq(dev->irq, dev);
-	ops->off(dev);
-}
-EXPORT_SYMBOL(vlynq_disable_device);
-
-int vlynq_set_local_mapping(struct vlynq_device *dev, u32 tx_offset,
-			    struct vlynq_mapping *mapping)
-{
-	int i;
-
-	if (!dev->enabled)
-		return -ENXIO;
-
-	writel(tx_offset, &dev->local->tx_offset);
-	for (i = 0; i < 4; i++) {
-		writel(mapping[i].offset, &dev->local->rx_mapping[i].offset);
-		writel(mapping[i].size, &dev->local->rx_mapping[i].size);
-	}
-	return 0;
-}
-EXPORT_SYMBOL(vlynq_set_local_mapping);
-
-int vlynq_set_remote_mapping(struct vlynq_device *dev, u32 tx_offset,
-			     struct vlynq_mapping *mapping)
-{
-	int i;
-
-	if (!dev->enabled)
-		return -ENXIO;
-
-	writel(tx_offset, &dev->remote->tx_offset);
-	for (i = 0; i < 4; i++) {
-		writel(mapping[i].offset, &dev->remote->rx_mapping[i].offset);
-		writel(mapping[i].size, &dev->remote->rx_mapping[i].size);
-	}
-	return 0;
-}
-EXPORT_SYMBOL(vlynq_set_remote_mapping);
-
-int vlynq_set_local_irq(struct vlynq_device *dev, int virq)
-{
-	int irq = dev->irq_start + virq;
-	if (dev->enabled)
-		return -EBUSY;
-
-	if ((irq < dev->irq_start) || (irq > dev->irq_end))
-		return -EINVAL;
-
-	if (virq == dev->remote_irq)
-		return -EINVAL;
-
-	dev->local_irq = virq;
-
-	return 0;
-}
-EXPORT_SYMBOL(vlynq_set_local_irq);
-
-int vlynq_set_remote_irq(struct vlynq_device *dev, int virq)
-{
-	int irq = dev->irq_start + virq;
-	if (dev->enabled)
-		return -EBUSY;
-
-	if ((irq < dev->irq_start) || (irq > dev->irq_end))
-		return -EINVAL;
-
-	if (virq == dev->local_irq)
-		return -EINVAL;
-
-	dev->remote_irq = virq;
-
-	return 0;
-}
-EXPORT_SYMBOL(vlynq_set_remote_irq);
-
-static int vlynq_probe(struct platform_device *pdev)
-{
-	struct vlynq_device *dev;
-	struct resource *regs_res, *mem_res, *irq_res;
-	int len, result;
-
-	regs_res = platform_get_resource_byname(pdev, IORESOURCE_MEM, "regs");
-	if (!regs_res)
-		return -ENODEV;
-
-	mem_res = platform_get_resource_byname(pdev, IORESOURCE_MEM, "mem");
-	if (!mem_res)
-		return -ENODEV;
-
-	irq_res = platform_get_resource_byname(pdev, IORESOURCE_IRQ, "devirq");
-	if (!irq_res)
-		return -ENODEV;
-
-	dev = kzalloc(sizeof(*dev), GFP_KERNEL);
-	if (!dev) {
-		printk(KERN_ERR
-		       "vlynq: failed to allocate device structure\n");
-		return -ENOMEM;
-	}
-
-	dev->id = pdev->id;
-	dev->dev.bus = &vlynq_bus_type;
-	dev->dev.parent = &pdev->dev;
-	dev_set_name(&dev->dev, "vlynq%d", dev->id);
-	dev->dev.platform_data = pdev->dev.platform_data;
-	dev->dev.release = vlynq_device_release;
-
-	dev->regs_start = regs_res->start;
-	dev->regs_end = regs_res->end;
-	dev->mem_start = mem_res->start;
-	dev->mem_end = mem_res->end;
-
-	len = resource_size(regs_res);
-	if (!request_mem_region(regs_res->start, len, dev_name(&dev->dev))) {
-		printk(KERN_ERR "%s: Can't request vlynq registers\n",
-		       dev_name(&dev->dev));
-		result = -ENXIO;
-		goto fail_request;
-	}
-
-	dev->local = ioremap(regs_res->start, len);
-	if (!dev->local) {
-		printk(KERN_ERR "%s: Can't remap vlynq registers\n",
-		       dev_name(&dev->dev));
-		result = -ENXIO;
-		goto fail_remap;
-	}
-
-	dev->remote = (struct vlynq_regs *)((void *)dev->local +
-					    VLYNQ_REMOTE_OFFSET);
-
-	dev->irq = platform_get_irq_byname(pdev, "irq");
-	dev->irq_start = irq_res->start;
-	dev->irq_end = irq_res->end;
-	dev->local_irq = dev->irq_end - dev->irq_start;
-	dev->remote_irq = dev->local_irq - 1;
-
-	if (device_register(&dev->dev))
-		goto fail_register;
-	platform_set_drvdata(pdev, dev);
-
-	printk(KERN_INFO "%s: regs 0x%p, irq %d, mem 0x%p\n",
-	       dev_name(&dev->dev), (void *)dev->regs_start, dev->irq,
-	       (void *)dev->mem_start);
-
-	dev->dev_id = 0;
-	dev->divisor = vlynq_div_auto;
-	result = __vlynq_enable_device(dev);
-	if (result == 0) {
-		dev->dev_id = readl(&dev->remote->chip);
-		((struct plat_vlynq_ops *)(dev->dev.platform_data))->off(dev);
-	}
-	if (dev->dev_id)
-		printk(KERN_INFO "Found a VLYNQ device: %08x\n", dev->dev_id);
-
-	return 0;
-
-fail_register:
-	iounmap(dev->local);
-fail_remap:
-fail_request:
-	release_mem_region(regs_res->start, len);
-	kfree(dev);
-	return result;
-}
-
-static int vlynq_remove(struct platform_device *pdev)
-{
-	struct vlynq_device *dev = platform_get_drvdata(pdev);
-
-	device_unregister(&dev->dev);
-	iounmap(dev->local);
-	release_mem_region(dev->regs_start,
-			   dev->regs_end - dev->regs_start + 1);
-
-	kfree(dev);
-
-	return 0;
-}
-
-static struct platform_driver vlynq_platform_driver = {
-	.driver.name = "vlynq",
-	.probe = vlynq_probe,
-	.remove = vlynq_remove,
-};
-
-struct bus_type vlynq_bus_type = {
-	.name = "vlynq",
-	.match = vlynq_device_match,
-	.probe = vlynq_device_probe,
-	.remove = vlynq_device_remove,
-};
-EXPORT_SYMBOL(vlynq_bus_type);
-
-static int vlynq_init(void)
-{
-	int res = 0;
-
-	res = bus_register(&vlynq_bus_type);
-	if (res)
-		goto fail_bus;
-
-	res = platform_driver_register(&vlynq_platform_driver);
-	if (res)
-		goto fail_platform;
-
-	return 0;
-
-fail_platform:
-	bus_unregister(&vlynq_bus_type);
-fail_bus:
-	return res;
-}
-
-static void vlynq_exit(void)
-{
-	platform_driver_unregister(&vlynq_platform_driver);
-	bus_unregister(&vlynq_bus_type);
-}
-
-module_init(vlynq_init);
-module_exit(vlynq_exit);
diff --git a/include/linux/vlynq.h b/include/linux/vlynq.h
deleted file mode 100644
index e9c0cd36c48a..000000000000
--- a/include/linux/vlynq.h
+++ /dev/null
@@ -1,149 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-or-later */
-/*
- * Copyright (C) 2006, 2007 Eugene Konev <ejka@openwrt.org>
- */
-
-#ifndef __VLYNQ_H__
-#define __VLYNQ_H__
-
-#include <linux/device.h>
-#include <linux/types.h>
-
-struct module;
-
-#define VLYNQ_NUM_IRQS 32
-
-struct vlynq_mapping {
-	u32 size;
-	u32 offset;
-};
-
-enum vlynq_divisor {
-	vlynq_div_auto = 0,
-	vlynq_ldiv1,
-	vlynq_ldiv2,
-	vlynq_ldiv3,
-	vlynq_ldiv4,
-	vlynq_ldiv5,
-	vlynq_ldiv6,
-	vlynq_ldiv7,
-	vlynq_ldiv8,
-	vlynq_rdiv1,
-	vlynq_rdiv2,
-	vlynq_rdiv3,
-	vlynq_rdiv4,
-	vlynq_rdiv5,
-	vlynq_rdiv6,
-	vlynq_rdiv7,
-	vlynq_rdiv8,
-	vlynq_div_external
-};
-
-struct vlynq_device_id {
-	u32 id;
-	enum vlynq_divisor divisor;
-	unsigned long driver_data;
-};
-
-struct vlynq_regs;
-struct vlynq_device {
-	u32 id, dev_id;
-	int local_irq;
-	int remote_irq;
-	enum vlynq_divisor divisor;
-	u32 regs_start, regs_end;
-	u32 mem_start, mem_end;
-	u32 irq_start, irq_end;
-	int irq;
-	int enabled;
-	struct vlynq_regs *local;
-	struct vlynq_regs *remote;
-	struct device dev;
-};
-
-struct vlynq_driver {
-	char *name;
-	struct vlynq_device_id *id_table;
-	int (*probe)(struct vlynq_device *dev, struct vlynq_device_id *id);
-	void (*remove)(struct vlynq_device *dev);
-	struct device_driver driver;
-};
-
-struct plat_vlynq_ops {
-	int (*on)(struct vlynq_device *dev);
-	void (*off)(struct vlynq_device *dev);
-};
-
-static inline struct vlynq_driver *to_vlynq_driver(struct device_driver *drv)
-{
-	return container_of(drv, struct vlynq_driver, driver);
-}
-
-static inline struct vlynq_device *to_vlynq_device(struct device *device)
-{
-	return container_of(device, struct vlynq_device, dev);
-}
-
-extern struct bus_type vlynq_bus_type;
-
-extern int __vlynq_register_driver(struct vlynq_driver *driver,
-				   struct module *owner);
-
-static inline int vlynq_register_driver(struct vlynq_driver *driver)
-{
-	return __vlynq_register_driver(driver, THIS_MODULE);
-}
-
-static inline void *vlynq_get_drvdata(struct vlynq_device *dev)
-{
-	return dev_get_drvdata(&dev->dev);
-}
-
-static inline void vlynq_set_drvdata(struct vlynq_device *dev, void *data)
-{
-	dev_set_drvdata(&dev->dev, data);
-}
-
-static inline u32 vlynq_mem_start(struct vlynq_device *dev)
-{
-	return dev->mem_start;
-}
-
-static inline u32 vlynq_mem_end(struct vlynq_device *dev)
-{
-	return dev->mem_end;
-}
-
-static inline u32 vlynq_mem_len(struct vlynq_device *dev)
-{
-	return dev->mem_end - dev->mem_start + 1;
-}
-
-static inline int vlynq_virq_to_irq(struct vlynq_device *dev, int virq)
-{
-	int irq = dev->irq_start + virq;
-	if ((irq < dev->irq_start) || (irq > dev->irq_end))
-		return -EINVAL;
-
-	return irq;
-}
-
-static inline int vlynq_irq_to_virq(struct vlynq_device *dev, int irq)
-{
-	if ((irq < dev->irq_start) || (irq > dev->irq_end))
-		return -EINVAL;
-
-	return irq - dev->irq_start;
-}
-
-extern void vlynq_unregister_driver(struct vlynq_driver *driver);
-extern int vlynq_enable_device(struct vlynq_device *dev);
-extern void vlynq_disable_device(struct vlynq_device *dev);
-extern int vlynq_set_local_mapping(struct vlynq_device *dev, u32 tx_offset,
-				   struct vlynq_mapping *mapping);
-extern int vlynq_set_remote_mapping(struct vlynq_device *dev, u32 tx_offset,
-				    struct vlynq_mapping *mapping);
-extern int vlynq_set_local_irq(struct vlynq_device *dev, int virq);
-extern int vlynq_set_remote_irq(struct vlynq_device *dev, int virq);
-
-#endif /* __VLYNQ_H__ */
-- 
cgit v1.2.3


From 5aa8fd9cea2ee0d42c5d92c5eacf0a14bbc4c293 Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@kernel.org>
Date: Mon, 11 Sep 2023 20:25:50 -0400
Subject: fs: add a new SB_I_NOUMASK flag

SB_POSIXACL must be set when a filesystem supports POSIX ACLs, but NFSv4
also sets this flag to prevent the VFS from applying the umask on
newly-created files. NFSv4 doesn't support POSIX ACLs however, which
causes confusion when other subsystems try to test for them.

Add a new SB_I_NOUMASK flag that allows filesystems to opt-in to umask
stripping without advertising support for POSIX ACLs. Set the new flag
on NFSv4 instead of SB_POSIXACL.

Also, move mode_strip_umask to namei.h and convert init_mknod and
init_mkdir to use it.

Signed-off-by: Jeff Layton <jlayton@kernel.org>
Message-Id: <20230911-acl-fix-v3-1-b25315333f6c@kernel.org>
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 fs/init.c             |  6 ++----
 fs/namei.c            | 19 -------------------
 fs/nfs/super.c        |  2 +-
 include/linux/fs.h    |  3 ++-
 include/linux/namei.h | 24 ++++++++++++++++++++++++
 5 files changed, 29 insertions(+), 25 deletions(-)

(limited to 'include/linux')

diff --git a/fs/init.c b/fs/init.c
index 9684406a8416..e9387b6c4f30 100644
--- a/fs/init.c
+++ b/fs/init.c
@@ -153,8 +153,7 @@ int __init init_mknod(const char *filename, umode_t mode, unsigned int dev)
 	if (IS_ERR(dentry))
 		return PTR_ERR(dentry);
 
-	if (!IS_POSIXACL(path.dentry->d_inode))
-		mode &= ~current_umask();
+	mode = mode_strip_umask(d_inode(path.dentry), mode);
 	error = security_path_mknod(&path, dentry, mode, dev);
 	if (!error)
 		error = vfs_mknod(mnt_idmap(path.mnt), path.dentry->d_inode,
@@ -229,8 +228,7 @@ int __init init_mkdir(const char *pathname, umode_t mode)
 	dentry = kern_path_create(AT_FDCWD, pathname, &path, LOOKUP_DIRECTORY);
 	if (IS_ERR(dentry))
 		return PTR_ERR(dentry);
-	if (!IS_POSIXACL(path.dentry->d_inode))
-		mode &= ~current_umask();
+	mode = mode_strip_umask(d_inode(path.dentry), mode);
 	error = security_path_mkdir(&path, dentry, mode);
 	if (!error)
 		error = vfs_mkdir(mnt_idmap(path.mnt), path.dentry->d_inode,
diff --git a/fs/namei.c b/fs/namei.c
index 567ee547492b..94b27370f468 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -3103,25 +3103,6 @@ void unlock_rename(struct dentry *p1, struct dentry *p2)
 }
 EXPORT_SYMBOL(unlock_rename);
 
-/**
- * mode_strip_umask - handle vfs umask stripping
- * @dir:	parent directory of the new inode
- * @mode:	mode of the new inode to be created in @dir
- *
- * Umask stripping depends on whether or not the filesystem supports POSIX
- * ACLs. If the filesystem doesn't support it umask stripping is done directly
- * in here. If the filesystem does support POSIX ACLs umask stripping is
- * deferred until the filesystem calls posix_acl_create().
- *
- * Returns: mode
- */
-static inline umode_t mode_strip_umask(const struct inode *dir, umode_t mode)
-{
-	if (!IS_POSIXACL(dir))
-		mode &= ~current_umask();
-	return mode;
-}
-
 /**
  * vfs_prepare_mode - prepare the mode to be used for a new inode
  * @idmap:	idmap of the mount the inode was found from
diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index 0d6473cb00cb..9b1cfca8112a 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -1071,7 +1071,7 @@ static void nfs_fill_super(struct super_block *sb, struct nfs_fs_context *ctx)
 		sb->s_export_op = &nfs_export_ops;
 		break;
 	case 4:
-		sb->s_flags |= SB_POSIXACL;
+		sb->s_iflags |= SB_I_NOUMASK;
 		sb->s_time_gran = 1;
 		sb->s_time_min = S64_MIN;
 		sb->s_time_max = S64_MAX;
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 4aeb3fa11927..58dea591a341 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1119,7 +1119,7 @@ extern int send_sigurg(struct fown_struct *fown);
 #define SB_NOATIME      BIT(10)	/* Do not update access times. */
 #define SB_NODIRATIME   BIT(11)	/* Do not update directory access times */
 #define SB_SILENT       BIT(15)
-#define SB_POSIXACL     BIT(16)	/* VFS does not apply the umask */
+#define SB_POSIXACL     BIT(16)	/* Supports POSIX ACLs */
 #define SB_INLINECRYPT  BIT(17)	/* Use blk-crypto for encrypted files */
 #define SB_KERNMOUNT    BIT(22)	/* this is a kern_mount call */
 #define SB_I_VERSION    BIT(23)	/* Update inode I_version field */
@@ -1166,6 +1166,7 @@ extern int send_sigurg(struct fown_struct *fown);
 #define SB_I_PERSB_BDI	0x00000200	/* has a per-sb bdi */
 #define SB_I_TS_EXPIRY_WARNED 0x00000400 /* warned about timestamp range expiry */
 #define SB_I_RETIRED	0x00000800	/* superblock shouldn't be reused */
+#define SB_I_NOUMASK	0x00001000	/* VFS does not apply umask */
 
 /* Possible states of 'frozen' field */
 enum {
diff --git a/include/linux/namei.h b/include/linux/namei.h
index 1463cbda4888..e3619920f9f0 100644
--- a/include/linux/namei.h
+++ b/include/linux/namei.h
@@ -92,6 +92,30 @@ extern struct dentry *lock_rename(struct dentry *, struct dentry *);
 extern struct dentry *lock_rename_child(struct dentry *, struct dentry *);
 extern void unlock_rename(struct dentry *, struct dentry *);
 
+/**
+ * mode_strip_umask - handle vfs umask stripping
+ * @dir:	parent directory of the new inode
+ * @mode:	mode of the new inode to be created in @dir
+ *
+ * In most filesystems, umask stripping depends on whether or not the
+ * filesystem supports POSIX ACLs. If the filesystem doesn't support it umask
+ * stripping is done directly in here. If the filesystem does support POSIX
+ * ACLs umask stripping is deferred until the filesystem calls
+ * posix_acl_create().
+ *
+ * Some filesystems (like NFSv4) also want to avoid umask stripping by the
+ * VFS, but don't support POSIX ACLs. Those filesystems can set SB_I_NOUMASK
+ * to get this effect without declaring that they support POSIX ACLs.
+ *
+ * Returns: mode
+ */
+static inline umode_t __must_check mode_strip_umask(const struct inode *dir, umode_t mode)
+{
+	if (!IS_POSIXACL(dir) && !(dir->i_sb->s_iflags & SB_I_NOUMASK))
+		mode &= ~current_umask();
+	return mode;
+}
+
 extern int __must_check nd_jump_link(const struct path *path);
 
 static inline void nd_terminate_link(void *name, size_t len, size_t maxlen)
-- 
cgit v1.2.3


From 61105aab4edb59bf8177f005eb2923fe5c4deb3c Mon Sep 17 00:00:00 2001
From: Max Kellermann <max.kellermann@ionos.com>
Date: Thu, 21 Sep 2023 09:57:52 +0200
Subject: pipe: reduce padding in struct pipe_inode_info

This has no effect on 64 bit because there are 10 32-bit integers
surrounding the two bools, but on 32 bit architectures, this reduces
the struct size by 4 bytes by merging the two bools into one word.

Signed-off-by: Max Kellermann <max.kellermann@ionos.com>
Message-Id: <20230921075755.1378787-1-max.kellermann@ionos.com>
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 include/linux/pipe_fs_i.h | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/pipe_fs_i.h b/include/linux/pipe_fs_i.h
index 608a9eb86bff..598a411d7da2 100644
--- a/include/linux/pipe_fs_i.h
+++ b/include/linux/pipe_fs_i.h
@@ -62,9 +62,6 @@ struct pipe_inode_info {
 	unsigned int tail;
 	unsigned int max_usage;
 	unsigned int ring_size;
-#ifdef CONFIG_WATCH_QUEUE
-	bool note_loss;
-#endif
 	unsigned int nr_accounted;
 	unsigned int readers;
 	unsigned int writers;
@@ -72,6 +69,9 @@ struct pipe_inode_info {
 	unsigned int r_counter;
 	unsigned int w_counter;
 	bool poll_usage;
+#ifdef CONFIG_WATCH_QUEUE
+	bool note_loss;
+#endif
 	struct page *tmp_page;
 	struct fasync_struct *fasync_readers;
 	struct fasync_struct *fasync_writers;
-- 
cgit v1.2.3


From b4bd6b4bac8edd61eb8f7b836969d12c0c6af165 Mon Sep 17 00:00:00 2001
From: Max Kellermann <max.kellermann@ionos.com>
Date: Thu, 21 Sep 2023 09:57:53 +0200
Subject: fs/pipe: move check to pipe_has_watch_queue()

This declutters the code by reducing the number of #ifdefs and makes
the watch_queue checks simpler.  This has no runtime effect; the
machine code is identical.

Signed-off-by: Max Kellermann <max.kellermann@ionos.com>
Message-Id: <20230921075755.1378787-2-max.kellermann@ionos.com>
Reviewed-by: David Howells <dhowells@redhat.com>
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 fs/pipe.c                 | 12 +++---------
 include/linux/pipe_fs_i.h | 16 ++++++++++++++++
 2 files changed, 19 insertions(+), 9 deletions(-)

(limited to 'include/linux')

diff --git a/fs/pipe.c b/fs/pipe.c
index 6c1a9b1db907..6ecaccb48738 100644
--- a/fs/pipe.c
+++ b/fs/pipe.c
@@ -437,12 +437,10 @@ pipe_write(struct kiocb *iocb, struct iov_iter *from)
 		goto out;
 	}
 
-#ifdef CONFIG_WATCH_QUEUE
-	if (pipe->watch_queue) {
+	if (pipe_has_watch_queue(pipe)) {
 		ret = -EXDEV;
 		goto out;
 	}
-#endif
 
 	/*
 	 * If it wasn't empty we try to merge new data into
@@ -1325,10 +1323,8 @@ static long pipe_set_size(struct pipe_inode_info *pipe, unsigned int arg)
 	unsigned int nr_slots, size;
 	long ret = 0;
 
-#ifdef CONFIG_WATCH_QUEUE
-	if (pipe->watch_queue)
+	if (pipe_has_watch_queue(pipe))
 		return -EBUSY;
-#endif
 
 	size = round_pipe_size(arg);
 	nr_slots = size >> PAGE_SHIFT;
@@ -1380,10 +1376,8 @@ struct pipe_inode_info *get_pipe_info(struct file *file, bool for_splice)
 
 	if (file->f_op != &pipefifo_fops || !pipe)
 		return NULL;
-#ifdef CONFIG_WATCH_QUEUE
-	if (for_splice && pipe->watch_queue)
+	if (for_splice && pipe_has_watch_queue(pipe))
 		return NULL;
-#endif
 	return pipe;
 }
 
diff --git a/include/linux/pipe_fs_i.h b/include/linux/pipe_fs_i.h
index 598a411d7da2..8ff23bf5a819 100644
--- a/include/linux/pipe_fs_i.h
+++ b/include/linux/pipe_fs_i.h
@@ -124,6 +124,22 @@ struct pipe_buf_operations {
 	bool (*get)(struct pipe_inode_info *, struct pipe_buffer *);
 };
 
+/**
+ * pipe_has_watch_queue - Check whether the pipe is a watch_queue,
+ * i.e. it was created with O_NOTIFICATION_PIPE
+ * @pipe: The pipe to check
+ *
+ * Return: true if pipe is a watch queue, false otherwise.
+ */
+static inline bool pipe_has_watch_queue(const struct pipe_inode_info *pipe)
+{
+#ifdef CONFIG_WATCH_QUEUE
+	return pipe->watch_queue != NULL;
+#else
+	return false;
+#endif
+}
+
 /**
  * pipe_empty - Return true if the pipe is empty
  * @head: The pipe ring head pointer
-- 
cgit v1.2.3


From 85fadf89e5708d74c4923e9ceeca4b9df0e000bb Mon Sep 17 00:00:00 2001
From: Kees Cook <keescook@chromium.org>
Date: Fri, 22 Sep 2023 10:54:08 -0700
Subject: watch_queue: Annotate struct watch_filter with __counted_by

Prepare for the coming implementation by GCC and Clang of the __counted_by
attribute. Flexible array members annotated with __counted_by can have
their accesses bounds-checked at run-time checking via CONFIG_UBSAN_BOUNDS
(for array indexing) and CONFIG_FORTIFY_SOURCE (for strcpy/memcpy-family
functions).

As found with Coccinelle[1], add __counted_by for struct watch_filter.

[1] https://github.com/kees/kernel-tools/blob/trunk/coccinelle/examples/counted_by.cocci

Cc: David Howells <dhowells@redhat.com>
Cc: Randy Dunlap <rdunlap@infradead.org>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Christian Brauner <brauner@kernel.org>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Siddh Raman Pant <code@siddh.me>
Cc: Mauro Carvalho Chehab <mchehab@kernel.org>
Cc: Qian Cai <cai@redhat.com>
Signed-off-by: Kees Cook <keescook@chromium.org>
Tested-by: Siddh Raman Pant <code@siddh.me>
Reviewed-by: "Gustavo A. R. Silva" <gustavoars@kernel.org>
Message-Id: <20230922175407.work.754-kees@kernel.org>
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 include/linux/watch_queue.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/watch_queue.h b/include/linux/watch_queue.h
index 45cd42f55d49..429c7b6afead 100644
--- a/include/linux/watch_queue.h
+++ b/include/linux/watch_queue.h
@@ -32,7 +32,7 @@ struct watch_filter {
 		DECLARE_BITMAP(type_filter, WATCH_TYPE__NR);
 	};
 	u32			nr_filters;	/* Number of filters */
-	struct watch_type_filter filters[];
+	struct watch_type_filter filters[] __counted_by(nr_filters);
 };
 
 struct watch_queue {
-- 
cgit v1.2.3


From 0ede61d8589cc2d93aa78230d74ac58b5b8d0244 Mon Sep 17 00:00:00 2001
From: Christian Brauner <brauner@kernel.org>
Date: Fri, 29 Sep 2023 08:45:59 +0200
Subject: file: convert to SLAB_TYPESAFE_BY_RCU

In recent discussions around some performance improvements in the file
handling area we discussed switching the file cache to rely on
SLAB_TYPESAFE_BY_RCU which allows us to get rid of call_rcu() based
freeing for files completely. This is a pretty sensitive change overall
but it might actually be worth doing.

The main downside is the subtlety. The other one is that we should
really wait for Jann's patch to land that enables KASAN to handle
SLAB_TYPESAFE_BY_RCU UAFs. Currently it doesn't but a patch for this
exists.

With SLAB_TYPESAFE_BY_RCU objects may be freed and reused multiple times
which requires a few changes. So it isn't sufficient anymore to just
acquire a reference to the file in question under rcu using
atomic_long_inc_not_zero() since the file might have already been
recycled and someone else might have bumped the reference.

In other words, callers might see reference count bumps from newer
users. For this reason it is necessary to verify that the pointer is the
same before and after the reference count increment. This pattern can be
seen in get_file_rcu() and __files_get_rcu().

In addition, it isn't possible to access or check fields in struct file
without first aqcuiring a reference on it. Not doing that was always
very dodgy and it was only usable for non-pointer data in struct file.
With SLAB_TYPESAFE_BY_RCU it is necessary that callers first acquire a
reference under rcu or they must hold the files_lock of the fdtable.
Failing to do either one of this is a bug.

Thanks to Jann for pointing out that we need to ensure memory ordering
between reallocations and pointer check by ensuring that all subsequent
loads have a dependency on the second load in get_file_rcu() and
providing a fixup that was folded into this patch.

Cc: Jann Horn <jannh@google.com>
Suggested-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 Documentation/filesystems/files.rst          |  53 +++++-------
 arch/powerpc/platforms/cell/spufs/coredump.c |  11 ++-
 drivers/gpu/drm/i915/gem/i915_gem_mman.c     |   4 +-
 fs/file.c                                    | 125 +++++++++++++++++++++++----
 fs/file_table.c                              |  40 +++++----
 fs/gfs2/glock.c                              |  11 ++-
 fs/notify/dnotify/dnotify.c                  |   6 +-
 fs/proc/fd.c                                 |  11 ++-
 include/linux/fdtable.h                      |  17 +---
 include/linux/fs.h                           |   4 +-
 kernel/bpf/task_iter.c                       |   4 +-
 kernel/fork.c                                |   4 +-
 kernel/kcmp.c                                |   4 +-
 13 files changed, 191 insertions(+), 103 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/filesystems/files.rst b/Documentation/filesystems/files.rst
index bcf84459917f..9e38e4c221ca 100644
--- a/Documentation/filesystems/files.rst
+++ b/Documentation/filesystems/files.rst
@@ -62,7 +62,7 @@ the fdtable structure -
    be held.
 
 4. To look up the file structure given an fd, a reader
-   must use either lookup_fd_rcu() or files_lookup_fd_rcu() APIs. These
+   must use either lookup_fdget_rcu() or files_lookup_fdget_rcu() APIs. These
    take care of barrier requirements due to lock-free lookup.
 
    An example::
@@ -70,43 +70,22 @@ the fdtable structure -
 	struct file *file;
 
 	rcu_read_lock();
-	file = lookup_fd_rcu(fd);
-	if (file) {
-		...
-	}
-	....
+	file = lookup_fdget_rcu(fd);
 	rcu_read_unlock();
-
-5. Handling of the file structures is special. Since the look-up
-   of the fd (fget()/fget_light()) are lock-free, it is possible
-   that look-up may race with the last put() operation on the
-   file structure. This is avoided using atomic_long_inc_not_zero()
-   on ->f_count::
-
-	rcu_read_lock();
-	file = files_lookup_fd_rcu(files, fd);
 	if (file) {
-		if (atomic_long_inc_not_zero(&file->f_count))
-			*fput_needed = 1;
-		else
-		/* Didn't get the reference, someone's freed */
-			file = NULL;
+		...
+                fput(file);
 	}
-	rcu_read_unlock();
 	....
-	return file;
-
-   atomic_long_inc_not_zero() detects if refcounts is already zero or
-   goes to zero during increment. If it does, we fail
-   fget()/fget_light().
 
-6. Since both fdtable and file structures can be looked up
+5. Since both fdtable and file structures can be looked up
    lock-free, they must be installed using rcu_assign_pointer()
    API. If they are looked up lock-free, rcu_dereference()
    must be used. However it is advisable to use files_fdtable()
-   and lookup_fd_rcu()/files_lookup_fd_rcu() which take care of these issues.
+   and lookup_fdget_rcu()/files_lookup_fdget_rcu() which take care of these
+   issues.
 
-7. While updating, the fdtable pointer must be looked up while
+6. While updating, the fdtable pointer must be looked up while
    holding files->file_lock. If ->file_lock is dropped, then
    another thread expand the files thereby creating a new
    fdtable and making the earlier fdtable pointer stale.
@@ -126,3 +105,19 @@ the fdtable structure -
    Since locate_fd() can drop ->file_lock (and reacquire ->file_lock),
    the fdtable pointer (fdt) must be loaded after locate_fd().
 
+On newer kernels rcu based file lookup has been switched to rely on
+SLAB_TYPESAFE_BY_RCU instead of call_rcu(). It isn't sufficient anymore
+to just acquire a reference to the file in question under rcu using
+atomic_long_inc_not_zero() since the file might have already been
+recycled and someone else might have bumped the reference. In other
+words, callers might see reference count bumps from newer users. For
+this is reason it is necessary to verify that the pointer is the same
+before and after the reference count increment. This pattern can be seen
+in get_file_rcu() and __files_get_rcu().
+
+In addition, it isn't possible to access or check fields in struct file
+without first aqcuiring a reference on it under rcu lookup. Not doing
+that was always very dodgy and it was only usable for non-pointer data
+in struct file. With SLAB_TYPESAFE_BY_RCU it is necessary that callers
+either first acquire a reference or they must hold the files_lock of the
+fdtable.
diff --git a/arch/powerpc/platforms/cell/spufs/coredump.c b/arch/powerpc/platforms/cell/spufs/coredump.c
index 1a587618015c..18daafbe2e65 100644
--- a/arch/powerpc/platforms/cell/spufs/coredump.c
+++ b/arch/powerpc/platforms/cell/spufs/coredump.c
@@ -66,7 +66,7 @@ static int match_context(const void *v, struct file *file, unsigned fd)
  */
 static struct spu_context *coredump_next_context(int *fd)
 {
-	struct spu_context *ctx;
+	struct spu_context *ctx = NULL;
 	struct file *file;
 	int n = iterate_fd(current->files, *fd, match_context, NULL);
 	if (!n)
@@ -74,10 +74,13 @@ static struct spu_context *coredump_next_context(int *fd)
 	*fd = n - 1;
 
 	rcu_read_lock();
-	file = lookup_fd_rcu(*fd);
-	ctx = SPUFS_I(file_inode(file))->i_ctx;
-	get_spu_context(ctx);
+	file = lookup_fdget_rcu(*fd);
 	rcu_read_unlock();
+	if (file) {
+		ctx = SPUFS_I(file_inode(file))->i_ctx;
+		get_spu_context(ctx);
+		fput(file);
+	}
 
 	return ctx;
 }
diff --git a/drivers/gpu/drm/i915/gem/i915_gem_mman.c b/drivers/gpu/drm/i915/gem/i915_gem_mman.c
index aa4d842d4c5a..97bf10861cad 100644
--- a/drivers/gpu/drm/i915/gem/i915_gem_mman.c
+++ b/drivers/gpu/drm/i915/gem/i915_gem_mman.c
@@ -916,9 +916,7 @@ static struct file *mmap_singleton(struct drm_i915_private *i915)
 	struct file *file;
 
 	rcu_read_lock();
-	file = READ_ONCE(i915->gem.mmap_singleton);
-	if (file && !get_file_rcu(file))
-		file = NULL;
+	file = get_file_rcu(&i915->gem.mmap_singleton);
 	rcu_read_unlock();
 	if (file)
 		return file;
diff --git a/fs/file.c b/fs/file.c
index 3e4a4dfa38fc..4eb026631673 100644
--- a/fs/file.c
+++ b/fs/file.c
@@ -853,8 +853,79 @@ void do_close_on_exec(struct files_struct *files)
 	spin_unlock(&files->file_lock);
 }
 
+static struct file *__get_file_rcu(struct file __rcu **f)
+{
+	struct file __rcu *file;
+	struct file __rcu *file_reloaded;
+	struct file __rcu *file_reloaded_cmp;
+
+	file = rcu_dereference_raw(*f);
+	if (!file)
+		return NULL;
+
+	if (unlikely(!atomic_long_inc_not_zero(&file->f_count)))
+		return ERR_PTR(-EAGAIN);
+
+	file_reloaded = rcu_dereference_raw(*f);
+
+	/*
+	 * Ensure that all accesses have a dependency on the load from
+	 * rcu_dereference_raw() above so we get correct ordering
+	 * between reuse/allocation and the pointer check below.
+	 */
+	file_reloaded_cmp = file_reloaded;
+	OPTIMIZER_HIDE_VAR(file_reloaded_cmp);
+
+	/*
+	 * atomic_long_inc_not_zero() above provided a full memory
+	 * barrier when we acquired a reference.
+	 *
+	 * This is paired with the write barrier from assigning to the
+	 * __rcu protected file pointer so that if that pointer still
+	 * matches the current file, we know we have successfully
+	 * acquired a reference to the right file.
+	 *
+	 * If the pointers don't match the file has been reallocated by
+	 * SLAB_TYPESAFE_BY_RCU.
+	 */
+	if (file == file_reloaded_cmp)
+		return file_reloaded;
+
+	fput(file);
+	return ERR_PTR(-EAGAIN);
+}
+
+/**
+ * get_file_rcu - try go get a reference to a file under rcu
+ * @f: the file to get a reference on
+ *
+ * This function tries to get a reference on @f carefully verifying that
+ * @f hasn't been reused.
+ *
+ * This function should rarely have to be used and only by users who
+ * understand the implications of SLAB_TYPESAFE_BY_RCU. Try to avoid it.
+ *
+ * Return: Returns @f with the reference count increased or NULL.
+ */
+struct file *get_file_rcu(struct file __rcu **f)
+{
+	for (;;) {
+		struct file __rcu *file;
+
+		file = __get_file_rcu(f);
+		if (unlikely(!file))
+			return NULL;
+
+		if (unlikely(IS_ERR(file)))
+			continue;
+
+		return file;
+	}
+}
+EXPORT_SYMBOL_GPL(get_file_rcu);
+
 static inline struct file *__fget_files_rcu(struct files_struct *files,
-	unsigned int fd, fmode_t mask)
+       unsigned int fd, fmode_t mask)
 {
 	for (;;) {
 		struct file *file;
@@ -865,12 +936,6 @@ static inline struct file *__fget_files_rcu(struct files_struct *files,
 			return NULL;
 
 		fdentry = fdt->fd + array_index_nospec(fd, fdt->max_fds);
-		file = rcu_dereference_raw(*fdentry);
-		if (unlikely(!file))
-			return NULL;
-
-		if (unlikely(file->f_mode & mask))
-			return NULL;
 
 		/*
 		 * Ok, we have a file pointer. However, because we do
@@ -879,10 +944,15 @@ static inline struct file *__fget_files_rcu(struct files_struct *files,
 		 *
 		 * Such a race can take two forms:
 		 *
-		 *  (a) the file ref already went down to zero,
-		 *      and get_file_rcu() fails. Just try again:
+		 *  (a) the file ref already went down to zero and the
+		 *      file hasn't been reused yet or the file count
+		 *      isn't zero but the file has already been reused.
 		 */
-		if (unlikely(!get_file_rcu(file)))
+		file = __get_file_rcu(fdentry);
+		if (unlikely(!file))
+			return NULL;
+
+		if (unlikely(IS_ERR(file)))
 			continue;
 
 		/*
@@ -893,12 +963,20 @@ static inline struct file *__fget_files_rcu(struct files_struct *files,
 		 *
 		 * If so, we need to put our ref and try again.
 		 */
-		if (unlikely(rcu_dereference_raw(files->fdt) != fdt) ||
-		    unlikely(rcu_dereference_raw(*fdentry) != file)) {
+		if (unlikely(rcu_dereference_raw(files->fdt) != fdt)) {
 			fput(file);
 			continue;
 		}
 
+		/*
+		 * This isn't the file we're looking for or we're not
+		 * allowed to get a reference to it.
+		 */
+		if (unlikely(file->f_mode & mask)) {
+			fput(file);
+			return NULL;
+		}
+
 		/*
 		 * Ok, we have a ref to the file, and checked that it
 		 * still exists.
@@ -948,7 +1026,14 @@ struct file *fget_task(struct task_struct *task, unsigned int fd)
 	return file;
 }
 
-struct file *task_lookup_fd_rcu(struct task_struct *task, unsigned int fd)
+struct file *lookup_fdget_rcu(unsigned int fd)
+{
+	return __fget_files_rcu(current->files, fd, 0);
+
+}
+EXPORT_SYMBOL_GPL(lookup_fdget_rcu);
+
+struct file *task_lookup_fdget_rcu(struct task_struct *task, unsigned int fd)
 {
 	/* Must be called with rcu_read_lock held */
 	struct files_struct *files;
@@ -957,13 +1042,13 @@ struct file *task_lookup_fd_rcu(struct task_struct *task, unsigned int fd)
 	task_lock(task);
 	files = task->files;
 	if (files)
-		file = files_lookup_fd_rcu(files, fd);
+		file = __fget_files_rcu(files, fd, 0);
 	task_unlock(task);
 
 	return file;
 }
 
-struct file *task_lookup_next_fd_rcu(struct task_struct *task, unsigned int *ret_fd)
+struct file *task_lookup_next_fdget_rcu(struct task_struct *task, unsigned int *ret_fd)
 {
 	/* Must be called with rcu_read_lock held */
 	struct files_struct *files;
@@ -974,7 +1059,7 @@ struct file *task_lookup_next_fd_rcu(struct task_struct *task, unsigned int *ret
 	files = task->files;
 	if (files) {
 		for (; fd < files_fdtable(files)->max_fds; fd++) {
-			file = files_lookup_fd_rcu(files, fd);
+			file = __fget_files_rcu(files, fd, 0);
 			if (file)
 				break;
 		}
@@ -983,7 +1068,7 @@ struct file *task_lookup_next_fd_rcu(struct task_struct *task, unsigned int *ret
 	*ret_fd = fd;
 	return file;
 }
-EXPORT_SYMBOL(task_lookup_next_fd_rcu);
+EXPORT_SYMBOL(task_lookup_next_fdget_rcu);
 
 /*
  * Lightweight file lookup - no refcnt increment if fd table isn't shared.
@@ -1272,12 +1357,16 @@ SYSCALL_DEFINE2(dup2, unsigned int, oldfd, unsigned int, newfd)
 {
 	if (unlikely(newfd == oldfd)) { /* corner case */
 		struct files_struct *files = current->files;
+		struct file *f;
 		int retval = oldfd;
 
 		rcu_read_lock();
-		if (!files_lookup_fd_rcu(files, oldfd))
+		f = __fget_files_rcu(files, oldfd, 0);
+		if (!f)
 			retval = -EBADF;
 		rcu_read_unlock();
+		if (f)
+			fput(f);
 		return retval;
 	}
 	return ksys_dup3(oldfd, newfd, 0);
diff --git a/fs/file_table.c b/fs/file_table.c
index e68e97d4f00a..a79a80031343 100644
--- a/fs/file_table.c
+++ b/fs/file_table.c
@@ -65,33 +65,33 @@ static void file_free_rcu(struct rcu_head *head)
 {
 	struct file *f = container_of(head, struct file, f_rcuhead);
 
-	put_cred(f->f_cred);
-	if (unlikely(f->f_mode & FMODE_BACKING))
-		kfree(backing_file(f));
-	else
-		kmem_cache_free(filp_cachep, f);
+	kfree(backing_file(f));
 }
 
 static inline void file_free(struct file *f)
 {
 	security_file_free(f);
-	if (unlikely(f->f_mode & FMODE_BACKING))
-		path_put(backing_file_real_path(f));
 	if (likely(!(f->f_mode & FMODE_NOACCOUNT)))
 		percpu_counter_dec(&nr_files);
-	call_rcu(&f->f_rcuhead, file_free_rcu);
+	put_cred(f->f_cred);
+	if (unlikely(f->f_mode & FMODE_BACKING)) {
+		path_put(backing_file_real_path(f));
+		call_rcu(&f->f_rcuhead, file_free_rcu);
+	} else {
+		kmem_cache_free(filp_cachep, f);
+	}
 }
 
 void release_empty_file(struct file *f)
 {
 	WARN_ON_ONCE(f->f_mode & (FMODE_BACKING | FMODE_OPENED));
-	/* Uhm, we better find out who grabs references to an unopened file. */
-	WARN_ON_ONCE(atomic_long_cmpxchg(&f->f_count, 1, 0) != 1);
-	security_file_free(f);
-	put_cred(f->f_cred);
-	if (likely(!(f->f_mode & FMODE_NOACCOUNT)))
-		percpu_counter_dec(&nr_files);
-	kmem_cache_free(filp_cachep, f);
+	if (atomic_long_dec_and_test(&f->f_count)) {
+		security_file_free(f);
+		put_cred(f->f_cred);
+		if (likely(!(f->f_mode & FMODE_NOACCOUNT)))
+			percpu_counter_dec(&nr_files);
+		kmem_cache_free(filp_cachep, f);
+	}
 }
 
 /*
@@ -176,7 +176,6 @@ static int init_file(struct file *f, int flags, const struct cred *cred)
 		return error;
 	}
 
-	atomic_long_set(&f->f_count, 1);
 	rwlock_init(&f->f_owner.lock);
 	spin_lock_init(&f->f_lock);
 	mutex_init(&f->f_pos_lock);
@@ -184,6 +183,12 @@ static int init_file(struct file *f, int flags, const struct cred *cred)
 	f->f_mode = OPEN_FMODE(flags);
 	/* f->f_version: 0 */
 
+	/*
+	 * We're SLAB_TYPESAFE_BY_RCU so initialize f_count last. While
+	 * fget-rcu pattern users need to be able to handle spurious
+	 * refcount bumps we should reinitialize the reused file first.
+	 */
+	atomic_long_set(&f->f_count, 1);
 	return 0;
 }
 
@@ -483,7 +488,8 @@ EXPORT_SYMBOL(__fput_sync);
 void __init files_init(void)
 {
 	filp_cachep = kmem_cache_create("filp", sizeof(struct file), 0,
-			SLAB_HWCACHE_ALIGN | SLAB_PANIC | SLAB_ACCOUNT, NULL);
+				SLAB_TYPESAFE_BY_RCU | SLAB_HWCACHE_ALIGN |
+				SLAB_PANIC | SLAB_ACCOUNT, NULL);
 	percpu_counter_init(&nr_files, 0, GFP_KERNEL);
 }
 
diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c
index 9cbf8d98489a..b4bc873aab7d 100644
--- a/fs/gfs2/glock.c
+++ b/fs/gfs2/glock.c
@@ -2717,16 +2717,19 @@ static struct file *gfs2_glockfd_next_file(struct gfs2_glockfd_iter *i)
 	for(;; i->fd++) {
 		struct inode *inode;
 
-		i->file = task_lookup_next_fd_rcu(i->task, &i->fd);
+		i->file = task_lookup_next_fdget_rcu(i->task, &i->fd);
 		if (!i->file) {
 			i->fd = 0;
 			break;
 		}
+
 		inode = file_inode(i->file);
-		if (inode->i_sb != i->sb)
-			continue;
-		if (get_file_rcu(i->file))
+		if (inode->i_sb == i->sb)
 			break;
+
+		rcu_read_unlock();
+		fput(i->file);
+		rcu_read_lock();
 	}
 	rcu_read_unlock();
 	return i->file;
diff --git a/fs/notify/dnotify/dnotify.c b/fs/notify/dnotify/dnotify.c
index ebdcc25df0f7..869b016014d2 100644
--- a/fs/notify/dnotify/dnotify.c
+++ b/fs/notify/dnotify/dnotify.c
@@ -265,7 +265,7 @@ int fcntl_dirnotify(int fd, struct file *filp, unsigned int arg)
 	struct dnotify_struct *dn;
 	struct inode *inode;
 	fl_owner_t id = current->files;
-	struct file *f;
+	struct file *f = NULL;
 	int destroy = 0, error = 0;
 	__u32 mask;
 
@@ -345,7 +345,7 @@ int fcntl_dirnotify(int fd, struct file *filp, unsigned int arg)
 	}
 
 	rcu_read_lock();
-	f = lookup_fd_rcu(fd);
+	f = lookup_fdget_rcu(fd);
 	rcu_read_unlock();
 
 	/* if (f != filp) means that we lost a race and another task/thread
@@ -392,6 +392,8 @@ out_err:
 		fsnotify_put_mark(new_fsn_mark);
 	if (dn)
 		kmem_cache_free(dnotify_struct_cache, dn);
+	if (f)
+		fput(f);
 	return error;
 }
 
diff --git a/fs/proc/fd.c b/fs/proc/fd.c
index 6276b3938842..6e72e5ad42bc 100644
--- a/fs/proc/fd.c
+++ b/fs/proc/fd.c
@@ -113,10 +113,12 @@ static bool tid_fd_mode(struct task_struct *task, unsigned fd, fmode_t *mode)
 	struct file *file;
 
 	rcu_read_lock();
-	file = task_lookup_fd_rcu(task, fd);
-	if (file)
-		*mode = file->f_mode;
+	file = task_lookup_fdget_rcu(task, fd);
 	rcu_read_unlock();
+	if (file) {
+		*mode = file->f_mode;
+		fput(file);
+	}
 	return !!file;
 }
 
@@ -259,12 +261,13 @@ static int proc_readfd_common(struct file *file, struct dir_context *ctx,
 		char name[10 + 1];
 		unsigned int len;
 
-		f = task_lookup_next_fd_rcu(p, &fd);
+		f = task_lookup_next_fdget_rcu(p, &fd);
 		ctx->pos = fd + 2LL;
 		if (!f)
 			break;
 		data.mode = f->f_mode;
 		rcu_read_unlock();
+		fput(f);
 		data.fd = fd;
 
 		len = snprintf(name, sizeof(name), "%u", fd);
diff --git a/include/linux/fdtable.h b/include/linux/fdtable.h
index e066816f3519..bc4c3287a65e 100644
--- a/include/linux/fdtable.h
+++ b/include/linux/fdtable.h
@@ -98,20 +98,9 @@ static inline struct file *files_lookup_fd_locked(struct files_struct *files, un
 	return files_lookup_fd_raw(files, fd);
 }
 
-static inline struct file *files_lookup_fd_rcu(struct files_struct *files, unsigned int fd)
-{
-	RCU_LOCKDEP_WARN(!rcu_read_lock_held(),
-			   "suspicious rcu_dereference_check() usage");
-	return files_lookup_fd_raw(files, fd);
-}
-
-static inline struct file *lookup_fd_rcu(unsigned int fd)
-{
-	return files_lookup_fd_rcu(current->files, fd);
-}
-
-struct file *task_lookup_fd_rcu(struct task_struct *task, unsigned int fd);
-struct file *task_lookup_next_fd_rcu(struct task_struct *task, unsigned int *fd);
+struct file *lookup_fdget_rcu(unsigned int fd);
+struct file *task_lookup_fdget_rcu(struct task_struct *task, unsigned int fd);
+struct file *task_lookup_next_fdget_rcu(struct task_struct *task, unsigned int *fd);
 
 struct task_struct;
 
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 58dea591a341..ceafc40cc25f 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1042,7 +1042,9 @@ static inline struct file *get_file(struct file *f)
 	atomic_long_inc(&f->f_count);
 	return f;
 }
-#define get_file_rcu(x) atomic_long_inc_not_zero(&(x)->f_count)
+
+struct file *get_file_rcu(struct file __rcu **f);
+
 #define file_count(x)	atomic_long_read(&(x)->f_count)
 
 #define	MAX_NON_LFS	((1UL<<31) - 1)
diff --git a/kernel/bpf/task_iter.c b/kernel/bpf/task_iter.c
index c4ab9d6cdbe9..82ad23b1d257 100644
--- a/kernel/bpf/task_iter.c
+++ b/kernel/bpf/task_iter.c
@@ -308,11 +308,9 @@ again:
 	rcu_read_lock();
 	for (;; curr_fd++) {
 		struct file *f;
-		f = task_lookup_next_fd_rcu(curr_task, &curr_fd);
+		f = task_lookup_next_fdget_rcu(curr_task, &curr_fd);
 		if (!f)
 			break;
-		if (!get_file_rcu(f))
-			continue;
 
 		/* set info->fd */
 		info->fd = curr_fd;
diff --git a/kernel/fork.c b/kernel/fork.c
index 3b6d20dfb9a8..640123767726 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1492,9 +1492,7 @@ struct file *get_mm_exe_file(struct mm_struct *mm)
 	struct file *exe_file;
 
 	rcu_read_lock();
-	exe_file = rcu_dereference(mm->exe_file);
-	if (exe_file && !get_file_rcu(exe_file))
-		exe_file = NULL;
+	exe_file = get_file_rcu(&mm->exe_file);
 	rcu_read_unlock();
 	return exe_file;
 }
diff --git a/kernel/kcmp.c b/kernel/kcmp.c
index 5353edfad8e1..b0639f21041f 100644
--- a/kernel/kcmp.c
+++ b/kernel/kcmp.c
@@ -64,8 +64,10 @@ get_file_raw_ptr(struct task_struct *task, unsigned int idx)
 	struct file *file;
 
 	rcu_read_lock();
-	file = task_lookup_fd_rcu(task, idx);
+	file = task_lookup_fdget_rcu(task, idx);
 	rcu_read_unlock();
+	if (file)
+		fput(file);
 
 	return file;
 }
-- 
cgit v1.2.3


From 95e93d17cb113a3b097774874572b69c058acab5 Mon Sep 17 00:00:00 2001
From: Mateusz Guzik <mjguzik@gmail.com>
Date: Wed, 4 Oct 2023 13:19:15 +0200
Subject: vfs: predict the error in retry_estale as unlikely

Signed-off-by: Mateusz Guzik <mjguzik@gmail.com>
Link: https://lore.kernel.org/r/20231004111916.728135-2-mjguzik@gmail.com
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 include/linux/namei.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/namei.h b/include/linux/namei.h
index e3619920f9f0..3100371b5e32 100644
--- a/include/linux/namei.h
+++ b/include/linux/namei.h
@@ -136,7 +136,7 @@ static inline void nd_terminate_link(void *name, size_t len, size_t maxlen)
 static inline bool
 retry_estale(const long error, const unsigned int flags)
 {
-	return error == -ESTALE && !(flags & LOOKUP_REVAL);
+	return unlikely(error == -ESTALE && !(flags & LOOKUP_REVAL));
 }
 
 #endif /* _LINUX_NAMEI_H */
-- 
cgit v1.2.3


From 08582d678fcf11fc86188f0b92239d3d49667d8e Mon Sep 17 00:00:00 2001
From: Amir Goldstein <amir73il@gmail.com>
Date: Mon, 9 Oct 2023 18:37:11 +0300
Subject: fs: create helper file_user_path() for user displayed mapped file
 path

Overlayfs uses backing files with "fake" overlayfs f_path and "real"
underlying f_inode, in order to use underlying inode aops for mapped
files and to display the overlayfs path in /proc/<pid>/maps.

In preparation for storing the overlayfs "fake" path instead of the
underlying "real" path in struct backing_file, define a noop helper
file_user_path() that returns f_path for now.

Use the new helper in procfs and kernel logs whenever a path of a
mapped file is displayed to users.

Signed-off-by: Amir Goldstein <amir73il@gmail.com>
Link: https://lore.kernel.org/r/20231009153712.1566422-3-amir73il@gmail.com
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 arch/arc/kernel/troubleshoot.c |  6 ++++--
 fs/proc/base.c                 |  2 +-
 fs/proc/nommu.c                |  2 +-
 fs/proc/task_mmu.c             |  4 ++--
 fs/proc/task_nommu.c           |  2 +-
 include/linux/fs.h             | 14 ++++++++++++++
 kernel/trace/trace_output.c    |  2 +-
 7 files changed, 24 insertions(+), 8 deletions(-)

(limited to 'include/linux')

diff --git a/arch/arc/kernel/troubleshoot.c b/arch/arc/kernel/troubleshoot.c
index d5b3ed2c58f5..c380d8c30704 100644
--- a/arch/arc/kernel/troubleshoot.c
+++ b/arch/arc/kernel/troubleshoot.c
@@ -90,10 +90,12 @@ static void show_faulting_vma(unsigned long address)
 	 */
 	if (vma) {
 		char buf[ARC_PATH_MAX];
-		char *nm = "?";
+		char *nm = "anon";
 
 		if (vma->vm_file) {
-			nm = file_path(vma->vm_file, buf, ARC_PATH_MAX-1);
+			/* XXX: can we use %pD below and get rid of buf? */
+			nm = d_path(file_user_path(vma->vm_file), buf,
+				    ARC_PATH_MAX-1);
 			if (IS_ERR(nm))
 				nm = "?";
 		}
diff --git a/fs/proc/base.c b/fs/proc/base.c
index ffd54617c354..20695c928ee6 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -2218,7 +2218,7 @@ static int map_files_get_link(struct dentry *dentry, struct path *path)
 	rc = -ENOENT;
 	vma = find_exact_vma(mm, vm_start, vm_end);
 	if (vma && vma->vm_file) {
-		*path = vma->vm_file->f_path;
+		*path = *file_user_path(vma->vm_file);
 		path_get(path);
 		rc = 0;
 	}
diff --git a/fs/proc/nommu.c b/fs/proc/nommu.c
index 4d3493579458..c6e7ebc63756 100644
--- a/fs/proc/nommu.c
+++ b/fs/proc/nommu.c
@@ -58,7 +58,7 @@ static int nommu_region_show(struct seq_file *m, struct vm_region *region)
 
 	if (file) {
 		seq_pad(m, ' ');
-		seq_file_path(m, file, "");
+		seq_path(m, file_user_path(file), "");
 	}
 
 	seq_putc(m, '\n');
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 3dd5be96691b..1593940ca01e 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -296,7 +296,7 @@ show_map_vma(struct seq_file *m, struct vm_area_struct *vma)
 		if (anon_name)
 			seq_printf(m, "[anon_shmem:%s]", anon_name->name);
 		else
-			seq_file_path(m, file, "\n");
+			seq_path(m, file_user_path(file), "\n");
 		goto done;
 	}
 
@@ -1967,7 +1967,7 @@ static int show_numa_map(struct seq_file *m, void *v)
 
 	if (file) {
 		seq_puts(m, " file=");
-		seq_file_path(m, file, "\n\t= ");
+		seq_path(m, file_user_path(file), "\n\t= ");
 	} else if (vma_is_initial_heap(vma)) {
 		seq_puts(m, " heap");
 	} else if (vma_is_initial_stack(vma)) {
diff --git a/fs/proc/task_nommu.c b/fs/proc/task_nommu.c
index a8ac0dd8041e..2d5862c13399 100644
--- a/fs/proc/task_nommu.c
+++ b/fs/proc/task_nommu.c
@@ -157,7 +157,7 @@ static int nommu_vma_show(struct seq_file *m, struct vm_area_struct *vma)
 
 	if (file) {
 		seq_pad(m, ' ');
-		seq_file_path(m, file, "");
+		seq_path(m, file_user_path(file), "");
 	} else if (mm && vma_is_initial_stack(vma)) {
 		seq_pad(m, ' ');
 		seq_puts(m, "[stack]");
diff --git a/include/linux/fs.h b/include/linux/fs.h
index ceafc40cc25f..057a3bbd4d27 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -2513,6 +2513,20 @@ static inline const struct path *file_real_path(struct file *f)
 	return &f->f_path;
 }
 
+/*
+ * file_user_path - get the path to display for memory mapped file
+ *
+ * When mmapping a file on a stackable filesystem (e.g., overlayfs), the file
+ * stored in ->vm_file is a backing file whose f_inode is on the underlying
+ * filesystem.  When the mapped file path is displayed to user (e.g. via
+ * /proc/<pid>/maps), this helper should be used to get the path to display
+ * to the user, which is the path of the fd that user has requested to map.
+ */
+static inline const struct path *file_user_path(struct file *f)
+{
+	return &f->f_path;
+}
+
 static inline struct file *file_clone_open(struct file *file)
 {
 	return dentry_open(&file->f_path, file->f_flags, file->f_cred);
diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
index db575094c498..d8b302d01083 100644
--- a/kernel/trace/trace_output.c
+++ b/kernel/trace/trace_output.c
@@ -404,7 +404,7 @@ static int seq_print_user_ip(struct trace_seq *s, struct mm_struct *mm,
 			vmstart = vma->vm_start;
 		}
 		if (file) {
-			ret = trace_seq_path(s, &file->f_path);
+			ret = trace_seq_path(s, file_user_path(file));
 			if (ret)
 				trace_seq_printf(s, "[+0x%lx]",
 						 ip - vmstart);
-- 
cgit v1.2.3


From def3ae83da02f87005210fa3d448c5dd37ba4105 Mon Sep 17 00:00:00 2001
From: Amir Goldstein <amir73il@gmail.com>
Date: Mon, 9 Oct 2023 18:37:12 +0300
Subject: fs: store real path instead of fake path in backing file f_path

A backing file struct stores two path's, one "real" path that is referring
to f_inode and one "fake" path, which should be displayed to users in
/proc/<pid>/maps.

There is a lot more potential code that needs to know the "real" path, then
code that needs to know the "fake" path.

Instead of code having to request the "real" path with file_real_path(),
store the "real" path in f_path and require code that needs to know the
"fake" path request it with file_user_path().
Replace the file_real_path() helper with a simple const accessor f_path().

After this change, file_dentry() is not expected to observe any files
with overlayfs f_path and real f_inode, so the call to ->d_real() should
not be needed.  Leave the ->d_real() call for now and add an assertion
in ovl_d_real() to catch if we made wrong assumptions.

Suggested-by: Miklos Szeredi <miklos@szeredi.hu>
Link: https://lore.kernel.org/r/CAJfpegtt48eXhhjDFA1ojcHPNKj3Go6joryCPtEFAKpocyBsnw@mail.gmail.com/
Signed-off-by: Amir Goldstein <amir73il@gmail.com>
Link: https://lore.kernel.org/r/20231009153712.1566422-4-amir73il@gmail.com
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 fs/file_table.c          | 12 ++++++------
 fs/internal.h            |  2 +-
 fs/open.c                | 23 +++++++++++------------
 fs/overlayfs/super.c     | 16 ++++++++++++----
 include/linux/fs.h       | 22 ++++------------------
 include/linux/fsnotify.h |  3 +--
 6 files changed, 35 insertions(+), 43 deletions(-)

(limited to 'include/linux')

diff --git a/fs/file_table.c b/fs/file_table.c
index 08fd1dd6d863..fa92743ba6a9 100644
--- a/fs/file_table.c
+++ b/fs/file_table.c
@@ -44,10 +44,10 @@ static struct kmem_cache *filp_cachep __read_mostly;
 
 static struct percpu_counter nr_files __cacheline_aligned_in_smp;
 
-/* Container for backing file with optional real path */
+/* Container for backing file with optional user path */
 struct backing_file {
 	struct file file;
-	struct path real_path;
+	struct path user_path;
 };
 
 static inline struct backing_file *backing_file(struct file *f)
@@ -55,11 +55,11 @@ static inline struct backing_file *backing_file(struct file *f)
 	return container_of(f, struct backing_file, file);
 }
 
-struct path *backing_file_real_path(struct file *f)
+struct path *backing_file_user_path(struct file *f)
 {
-	return &backing_file(f)->real_path;
+	return &backing_file(f)->user_path;
 }
-EXPORT_SYMBOL_GPL(backing_file_real_path);
+EXPORT_SYMBOL_GPL(backing_file_user_path);
 
 static inline void file_free(struct file *f)
 {
@@ -68,7 +68,7 @@ static inline void file_free(struct file *f)
 		percpu_counter_dec(&nr_files);
 	put_cred(f->f_cred);
 	if (unlikely(f->f_mode & FMODE_BACKING)) {
-		path_put(backing_file_real_path(f));
+		path_put(backing_file_user_path(f));
 		kfree(backing_file(f));
 	} else {
 		kmem_cache_free(filp_cachep, f);
diff --git a/fs/internal.h b/fs/internal.h
index 4e93a685bdaa..58e43341aebf 100644
--- a/fs/internal.h
+++ b/fs/internal.h
@@ -101,7 +101,7 @@ static inline void file_put_write_access(struct file *file)
 	put_write_access(file->f_inode);
 	mnt_put_write_access(file->f_path.mnt);
 	if (unlikely(file->f_mode & FMODE_BACKING))
-		mnt_put_write_access(backing_file_real_path(file)->mnt);
+		mnt_put_write_access(backing_file_user_path(file)->mnt);
 }
 
 static inline void put_file_access(struct file *file)
diff --git a/fs/open.c b/fs/open.c
index fe63e236da22..02dc608d40d8 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -881,7 +881,7 @@ static inline int file_get_write_access(struct file *f)
 	if (unlikely(error))
 		goto cleanup_inode;
 	if (unlikely(f->f_mode & FMODE_BACKING)) {
-		error = mnt_get_write_access(backing_file_real_path(f)->mnt);
+		error = mnt_get_write_access(backing_file_user_path(f)->mnt);
 		if (unlikely(error))
 			goto cleanup_mnt;
 	}
@@ -1182,20 +1182,19 @@ EXPORT_SYMBOL_GPL(kernel_file_open);
 
 /**
  * backing_file_open - open a backing file for kernel internal use
- * @path:	path of the file to open
+ * @user_path:	path that the user reuqested to open
  * @flags:	open flags
  * @real_path:	path of the backing file
  * @cred:	credentials for open
  *
  * Open a backing file for a stackable filesystem (e.g., overlayfs).
- * @path may be on the stackable filesystem and backing inode on the
- * underlying filesystem. In this case, we want to be able to return
- * the @real_path of the backing inode. This is done by embedding the
- * returned file into a container structure that also stores the path of
- * the backing inode on the underlying filesystem, which can be
- * retrieved using backing_file_real_path().
+ * @user_path may be on the stackable filesystem and @real_path on the
+ * underlying filesystem.  In this case, we want to be able to return the
+ * @user_path of the stackable filesystem. This is done by embedding the
+ * returned file into a container structure that also stores the stacked
+ * file's path, which can be retrieved using backing_file_user_path().
  */
-struct file *backing_file_open(const struct path *path, int flags,
+struct file *backing_file_open(const struct path *user_path, int flags,
 			       const struct path *real_path,
 			       const struct cred *cred)
 {
@@ -1206,9 +1205,9 @@ struct file *backing_file_open(const struct path *path, int flags,
 	if (IS_ERR(f))
 		return f;
 
-	f->f_path = *path;
-	path_get(real_path);
-	*backing_file_real_path(f) = *real_path;
+	path_get(user_path);
+	*backing_file_user_path(f) = *user_path;
+	f->f_path = *real_path;
 	error = do_dentry_open(f, d_inode(real_path->dentry), NULL);
 	if (error) {
 		fput(f);
diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c
index def266b5e2a3..9f43f0d303ad 100644
--- a/fs/overlayfs/super.c
+++ b/fs/overlayfs/super.c
@@ -34,14 +34,22 @@ static struct dentry *ovl_d_real(struct dentry *dentry,
 	struct dentry *real = NULL, *lower;
 	int err;
 
-	/* It's an overlay file */
+	/*
+	 * vfs is only expected to call d_real() with NULL from d_real_inode()
+	 * and with overlay inode from file_dentry() on an overlay file.
+	 *
+	 * TODO: remove @inode argument from d_real() API, remove code in this
+	 * function that deals with non-NULL @inode and remove d_real() call
+	 * from file_dentry().
+	 */
 	if (inode && d_inode(dentry) == inode)
 		return dentry;
+	else if (inode)
+		goto bug;
 
 	if (!d_is_reg(dentry)) {
-		if (!inode || inode == d_inode(dentry))
-			return dentry;
-		goto bug;
+		/* d_real_inode() is only relevant for regular files */
+		return dentry;
 	}
 
 	real = ovl_dentry_upper(dentry);
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 057a3bbd4d27..b37dccfb4534 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -2492,26 +2492,10 @@ struct file *dentry_open(const struct path *path, int flags,
 			 const struct cred *creds);
 struct file *dentry_create(const struct path *path, int flags, umode_t mode,
 			   const struct cred *cred);
-struct file *backing_file_open(const struct path *path, int flags,
+struct file *backing_file_open(const struct path *user_path, int flags,
 			       const struct path *real_path,
 			       const struct cred *cred);
-struct path *backing_file_real_path(struct file *f);
-
-/*
- * file_real_path - get the path corresponding to f_inode
- *
- * When opening a backing file for a stackable filesystem (e.g.,
- * overlayfs) f_path may be on the stackable filesystem and f_inode on
- * the underlying filesystem.  When the path associated with f_inode is
- * needed, this helper should be used instead of accessing f_path
- * directly.
-*/
-static inline const struct path *file_real_path(struct file *f)
-{
-	if (unlikely(f->f_mode & FMODE_BACKING))
-		return backing_file_real_path(f);
-	return &f->f_path;
-}
+struct path *backing_file_user_path(struct file *f);
 
 /*
  * file_user_path - get the path to display for memory mapped file
@@ -2524,6 +2508,8 @@ static inline const struct path *file_real_path(struct file *f)
  */
 static inline const struct path *file_user_path(struct file *f)
 {
+	if (unlikely(f->f_mode & FMODE_BACKING))
+		return backing_file_user_path(f);
 	return &f->f_path;
 }
 
diff --git a/include/linux/fsnotify.h b/include/linux/fsnotify.h
index ed48e4f1e755..bcb6609b54b3 100644
--- a/include/linux/fsnotify.h
+++ b/include/linux/fsnotify.h
@@ -96,8 +96,7 @@ static inline int fsnotify_file(struct file *file, __u32 mask)
 	if (file->f_mode & FMODE_NONOTIFY)
 		return 0;
 
-	/* Overlayfs internal files have fake f_path */
-	path = file_real_path(file);
+	path = &file->f_path;
 	return fsnotify_parent(path->dentry, mask, path, FSNOTIFY_EVENT_PATH);
 }
 
-- 
cgit v1.2.3


From e4e8b47a34a432c3f65534d12d5c132b6639da71 Mon Sep 17 00:00:00 2001
From: Max Kellermann <mk@cm4all.com>
Date: Mon, 15 Jan 2018 18:30:46 +0100
Subject: fs: fix umask on NFS with CONFIG_FS_POSIX_ACL=n

Make IS_POSIXACL() return false if POSIX ACL support is disabled.

Never skip applying the umask in namei.c and never bother to do any
ACL specific checks if the filesystem falsely indicates it has ACLs
enabled when the feature is completely disabled in the kernel.

This fixes a problem where the umask is always ignored in the NFS
client when compiled without CONFIG_FS_POSIX_ACL.  This is a 4 year
old regression caused by commit 013cdf1088d723 which itself was not
completely wrong, but failed to consider all the side effects by
misdesigned VFS code.

Prior to that commit, there were two places where the umask could be
applied, for example when creating a directory:

 1. in the VFS layer in SYSCALL_DEFINE3(mkdirat), but only if
    !IS_POSIXACL()

 2. again (unconditionally) in nfs3_proc_mkdir()

The first one does not apply, because even without
CONFIG_FS_POSIX_ACL, the NFS client sets SB_POSIXACL in
nfs_fill_super().

After that commit, (2.) was replaced by:

 2b. in posix_acl_create(), called by nfs3_proc_mkdir()

There's one branch in posix_acl_create() which applies the umask;
however, without CONFIG_FS_POSIX_ACL, posix_acl_create() is an empty
dummy function which does not apply the umask.

The approach chosen by this patch is to make IS_POSIXACL() always
return false when POSIX ACL support is disabled, so the umask always
gets applied by the VFS layer.  This is consistent with the (regular)
behavior of posix_acl_create(): that function returns early if
IS_POSIXACL() is false, before applying the umask.

Therefore, posix_acl_create() is responsible for applying the umask if
there is ACL support enabled in the file system (SB_POSIXACL), and the
VFS layer is responsible for all other cases (no SB_POSIXACL or no
CONFIG_FS_POSIX_ACL).

Signed-off-by: Max Kellermann <mk@cm4all.com>
Link: https://lore.kernel.org/r/151603744662.29035.4910161264124875658.stgit@rabbit.intern.cm-ag
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 include/linux/fs.h | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/fs.h b/include/linux/fs.h
index b37dccfb4534..cb8bfa1f8ecb 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -2113,7 +2113,12 @@ static inline bool sb_rdonly(const struct super_block *sb) { return sb->s_flags
 #define IS_NOQUOTA(inode)	((inode)->i_flags & S_NOQUOTA)
 #define IS_APPEND(inode)	((inode)->i_flags & S_APPEND)
 #define IS_IMMUTABLE(inode)	((inode)->i_flags & S_IMMUTABLE)
+
+#ifdef CONFIG_FS_POSIX_ACL
 #define IS_POSIXACL(inode)	__IS_FLG(inode, SB_POSIXACL)
+#else
+#define IS_POSIXACL(inode)	0
+#endif
 
 #define IS_DEADDIR(inode)	((inode)->i_flags & S_DEAD)
 #define IS_NOCMTIME(inode)	((inode)->i_flags & S_NOCMTIME)
-- 
cgit v1.2.3


From 1dbfe73bd648bdc3a2d8c9e5b23ae70e3333b592 Mon Sep 17 00:00:00 2001
From: Johannes Zink <j.zink@pengutronix.de>
Date: Wed, 18 Oct 2023 09:09:55 +0200
Subject: net: stmmac: intel: remove unnecessary field struct
 plat_stmmacenet_data::ext_snapshot_num

Do not store bitmask for enabling AUX_SNAPSHOT0. The previous commit
("net: stmmac: fix PPS capture input index") takes care of calculating
the proper bit mask from the request data's extts.index field, which is
0 if not explicitly specified otherwise.

Signed-off-by: Johannes Zink <j.zink@pengutronix.de>
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
 drivers/net/ethernet/stmicro/stmmac/dwmac-intel.c | 1 -
 include/linux/stmmac.h                            | 1 -
 2 files changed, 2 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac-intel.c b/drivers/net/ethernet/stmicro/stmmac/dwmac-intel.c
index a3a249c63598..60283543ffc8 100644
--- a/drivers/net/ethernet/stmicro/stmmac/dwmac-intel.c
+++ b/drivers/net/ethernet/stmicro/stmmac/dwmac-intel.c
@@ -605,7 +605,6 @@ static int intel_mgbe_common_data(struct pci_dev *pdev,
 	plat->mdio_bus_data->phy_mask |= 1 << INTEL_MGBE_XPCS_ADDR;
 
 	plat->int_snapshot_num = AUX_SNAPSHOT1;
-	plat->ext_snapshot_num = AUX_SNAPSHOT0;
 
 	plat->crosststamp = intel_crosststamp;
 	plat->flags &= ~STMMAC_FLAG_INT_SNAPSHOT_EN;
diff --git a/include/linux/stmmac.h b/include/linux/stmmac.h
index c0079a7574ae..0b4658a7eceb 100644
--- a/include/linux/stmmac.h
+++ b/include/linux/stmmac.h
@@ -303,7 +303,6 @@ struct plat_stmmacenet_data {
 	unsigned int eee_usecs_rate;
 	struct pci_dev *pdev;
 	int int_snapshot_num;
-	int ext_snapshot_num;
 	int msi_mac_vec;
 	int msi_wol_vec;
 	int msi_lpi_vec;
-- 
cgit v1.2.3


From 7f3eb2174512fe6c9c0f062e96eccb0d3cc6d5cd Mon Sep 17 00:00:00 2001
From: Christian Marangi <ansuelsmth@gmail.com>
Date: Wed, 18 Oct 2023 14:35:47 +0200
Subject: net: introduce napi_is_scheduled helper

We currently have napi_if_scheduled_mark_missed that can be used to
check if napi is scheduled but that does more thing than simply checking
it and return a bool. Some driver already implement custom function to
check if napi is scheduled.

Drop these custom function and introduce napi_is_scheduled that simply
check if napi is scheduled atomically.

Update any driver and code that implement a similar check and instead
use this new helper.

Signed-off-by: Christian Marangi <ansuelsmth@gmail.com>
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
 drivers/net/ethernet/chelsio/cxgb3/sge.c  |  8 --------
 drivers/net/wireless/realtek/rtw89/core.c |  2 +-
 include/linux/netdevice.h                 | 23 +++++++++++++++++++++++
 net/core/dev.c                            |  2 +-
 4 files changed, 25 insertions(+), 10 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/ethernet/chelsio/cxgb3/sge.c b/drivers/net/ethernet/chelsio/cxgb3/sge.c
index dfe4e0102960..6268f96cb4aa 100644
--- a/drivers/net/ethernet/chelsio/cxgb3/sge.c
+++ b/drivers/net/ethernet/chelsio/cxgb3/sge.c
@@ -2501,14 +2501,6 @@ static int napi_rx_handler(struct napi_struct *napi, int budget)
 	return work_done;
 }
 
-/*
- * Returns true if the device is already scheduled for polling.
- */
-static inline int napi_is_scheduled(struct napi_struct *napi)
-{
-	return test_bit(NAPI_STATE_SCHED, &napi->state);
-}
-
 /**
  *	process_pure_responses - process pure responses from a response queue
  *	@adap: the adapter
diff --git a/drivers/net/wireless/realtek/rtw89/core.c b/drivers/net/wireless/realtek/rtw89/core.c
index 4bfb4188de72..3d75165e48be 100644
--- a/drivers/net/wireless/realtek/rtw89/core.c
+++ b/drivers/net/wireless/realtek/rtw89/core.c
@@ -2005,7 +2005,7 @@ static void rtw89_core_rx_to_mac80211(struct rtw89_dev *rtwdev,
 	struct napi_struct *napi = &rtwdev->napi;
 
 	/* In low power mode, napi isn't scheduled. Receive it to netif. */
-	if (unlikely(!test_bit(NAPI_STATE_SCHED, &napi->state)))
+	if (unlikely(!napi_is_scheduled(napi)))
 		napi = NULL;
 
 	rtw89_core_hw_to_sband_rate(rx_status);
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 1c7681263d30..b8bf669212cc 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -482,6 +482,29 @@ static inline bool napi_prefer_busy_poll(struct napi_struct *n)
 	return test_bit(NAPI_STATE_PREFER_BUSY_POLL, &n->state);
 }
 
+/**
+ * napi_is_scheduled - test if NAPI is scheduled
+ * @n: NAPI context
+ *
+ * This check is "best-effort". With no locking implemented,
+ * a NAPI can be scheduled or terminate right after this check
+ * and produce not precise results.
+ *
+ * NAPI_STATE_SCHED is an internal state, napi_is_scheduled
+ * should not be used normally and napi_schedule should be
+ * used instead.
+ *
+ * Use only if the driver really needs to check if a NAPI
+ * is scheduled for example in the context of delayed timer
+ * that can be skipped if a NAPI is already scheduled.
+ *
+ * Return True if NAPI is scheduled, False otherwise.
+ */
+static inline bool napi_is_scheduled(struct napi_struct *n)
+{
+	return test_bit(NAPI_STATE_SCHED, &n->state);
+}
+
 bool napi_schedule_prep(struct napi_struct *n);
 
 /**
diff --git a/net/core/dev.c b/net/core/dev.c
index 97e7b9833db9..e7f61f5a5322 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -6532,7 +6532,7 @@ static int __napi_poll(struct napi_struct *n, bool *repoll)
 	 * accidentally calling ->poll() when NAPI is not scheduled.
 	 */
 	work = 0;
-	if (test_bit(NAPI_STATE_SCHED, &n->state)) {
+	if (napi_is_scheduled(n)) {
 		work = n->poll(n, weight);
 		trace_napi_poll(n, work, weight);
 	}
-- 
cgit v1.2.3


From 8c8d2d9670e813d623d8a2cbc881cb57344f4d37 Mon Sep 17 00:00:00 2001
From: Kent Overstreet <kent.overstreet@gmail.com>
Date: Fri, 17 Mar 2017 16:35:23 -0800
Subject: bcache: move closures to lib/

Prep work for bcachefs - being a fork of bcache it also uses closures

Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
Acked-by: Coly Li <colyli@suse.de>
Reviewed-by: Randy Dunlap <rdunlap@infradead.org>
---
 drivers/md/bcache/Kconfig   |  10 +-
 drivers/md/bcache/Makefile  |   4 +-
 drivers/md/bcache/bcache.h  |   2 +-
 drivers/md/bcache/closure.c | 207 ------------------------
 drivers/md/bcache/closure.h | 378 --------------------------------------------
 drivers/md/bcache/super.c   |   1 -
 drivers/md/bcache/util.h    |   3 +-
 include/linux/closure.h     | 377 +++++++++++++++++++++++++++++++++++++++++++
 lib/Kconfig                 |   3 +
 lib/Kconfig.debug           |   9 ++
 lib/Makefile                |   2 +
 lib/closure.c               | 204 ++++++++++++++++++++++++
 12 files changed, 600 insertions(+), 600 deletions(-)
 delete mode 100644 drivers/md/bcache/closure.c
 delete mode 100644 drivers/md/bcache/closure.h
 create mode 100644 include/linux/closure.h
 create mode 100644 lib/closure.c

(limited to 'include/linux')

diff --git a/drivers/md/bcache/Kconfig b/drivers/md/bcache/Kconfig
index 529c9d04e9a4..b2d10063d35f 100644
--- a/drivers/md/bcache/Kconfig
+++ b/drivers/md/bcache/Kconfig
@@ -4,6 +4,7 @@ config BCACHE
 	tristate "Block device as cache"
 	select BLOCK_HOLDER_DEPRECATED if SYSFS
 	select CRC64
+	select CLOSURES
 	help
 	Allows a block device to be used as cache for other devices; uses
 	a btree for indexing and the layout is optimized for SSDs.
@@ -19,15 +20,6 @@ config BCACHE_DEBUG
 	Enables extra debugging tools, allows expensive runtime checks to be
 	turned on.
 
-config BCACHE_CLOSURES_DEBUG
-	bool "Debug closures"
-	depends on BCACHE
-	select DEBUG_FS
-	help
-	Keeps all active closures in a linked list and provides a debugfs
-	interface to list them, which makes it possible to see asynchronous
-	operations that get stuck.
-
 config BCACHE_ASYNC_REGISTRATION
 	bool "Asynchronous device registration"
 	depends on BCACHE
diff --git a/drivers/md/bcache/Makefile b/drivers/md/bcache/Makefile
index 5b87e59676b8..054e8a33a7ab 100644
--- a/drivers/md/bcache/Makefile
+++ b/drivers/md/bcache/Makefile
@@ -2,6 +2,6 @@
 
 obj-$(CONFIG_BCACHE)	+= bcache.o
 
-bcache-y		:= alloc.o bset.o btree.o closure.o debug.o extents.o\
-	io.o journal.o movinggc.o request.o stats.o super.o sysfs.o trace.o\
+bcache-y		:= alloc.o bset.o btree.o debug.o extents.o io.o\
+	journal.o movinggc.o request.o stats.o super.o sysfs.o trace.o\
 	util.o writeback.o features.o
diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h
index 5a79bb3c272f..7c0d00432612 100644
--- a/drivers/md/bcache/bcache.h
+++ b/drivers/md/bcache/bcache.h
@@ -179,6 +179,7 @@
 #define pr_fmt(fmt) "bcache: %s() " fmt, __func__
 
 #include <linux/bio.h>
+#include <linux/closure.h>
 #include <linux/kobject.h>
 #include <linux/list.h>
 #include <linux/mutex.h>
@@ -192,7 +193,6 @@
 #include "bcache_ondisk.h"
 #include "bset.h"
 #include "util.h"
-#include "closure.h"
 
 struct bucket {
 	atomic_t	pin;
diff --git a/drivers/md/bcache/closure.c b/drivers/md/bcache/closure.c
deleted file mode 100644
index d8d9394a6beb..000000000000
--- a/drivers/md/bcache/closure.c
+++ /dev/null
@@ -1,207 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * Asynchronous refcounty things
- *
- * Copyright 2010, 2011 Kent Overstreet <kent.overstreet@gmail.com>
- * Copyright 2012 Google, Inc.
- */
-
-#include <linux/debugfs.h>
-#include <linux/module.h>
-#include <linux/seq_file.h>
-#include <linux/sched/debug.h>
-
-#include "closure.h"
-
-static inline void closure_put_after_sub(struct closure *cl, int flags)
-{
-	int r = flags & CLOSURE_REMAINING_MASK;
-
-	BUG_ON(flags & CLOSURE_GUARD_MASK);
-	BUG_ON(!r && (flags & ~CLOSURE_DESTRUCTOR));
-
-	if (!r) {
-		if (cl->fn && !(flags & CLOSURE_DESTRUCTOR)) {
-			atomic_set(&cl->remaining,
-				   CLOSURE_REMAINING_INITIALIZER);
-			closure_queue(cl);
-		} else {
-			struct closure *parent = cl->parent;
-			closure_fn *destructor = cl->fn;
-
-			closure_debug_destroy(cl);
-
-			if (destructor)
-				destructor(cl);
-
-			if (parent)
-				closure_put(parent);
-		}
-	}
-}
-
-/* For clearing flags with the same atomic op as a put */
-void closure_sub(struct closure *cl, int v)
-{
-	closure_put_after_sub(cl, atomic_sub_return(v, &cl->remaining));
-}
-
-/*
- * closure_put - decrement a closure's refcount
- */
-void closure_put(struct closure *cl)
-{
-	closure_put_after_sub(cl, atomic_dec_return(&cl->remaining));
-}
-
-/*
- * closure_wake_up - wake up all closures on a wait list, without memory barrier
- */
-void __closure_wake_up(struct closure_waitlist *wait_list)
-{
-	struct llist_node *list;
-	struct closure *cl, *t;
-	struct llist_node *reverse = NULL;
-
-	list = llist_del_all(&wait_list->list);
-
-	/* We first reverse the list to preserve FIFO ordering and fairness */
-	reverse = llist_reverse_order(list);
-
-	/* Then do the wakeups */
-	llist_for_each_entry_safe(cl, t, reverse, list) {
-		closure_set_waiting(cl, 0);
-		closure_sub(cl, CLOSURE_WAITING + 1);
-	}
-}
-
-/**
- * closure_wait - add a closure to a waitlist
- * @waitlist: will own a ref on @cl, which will be released when
- * closure_wake_up() is called on @waitlist.
- * @cl: closure pointer.
- *
- */
-bool closure_wait(struct closure_waitlist *waitlist, struct closure *cl)
-{
-	if (atomic_read(&cl->remaining) & CLOSURE_WAITING)
-		return false;
-
-	closure_set_waiting(cl, _RET_IP_);
-	atomic_add(CLOSURE_WAITING + 1, &cl->remaining);
-	llist_add(&cl->list, &waitlist->list);
-
-	return true;
-}
-
-struct closure_syncer {
-	struct task_struct	*task;
-	int			done;
-};
-
-static void closure_sync_fn(struct closure *cl)
-{
-	struct closure_syncer *s = cl->s;
-	struct task_struct *p;
-
-	rcu_read_lock();
-	p = READ_ONCE(s->task);
-	s->done = 1;
-	wake_up_process(p);
-	rcu_read_unlock();
-}
-
-void __sched __closure_sync(struct closure *cl)
-{
-	struct closure_syncer s = { .task = current };
-
-	cl->s = &s;
-	continue_at(cl, closure_sync_fn, NULL);
-
-	while (1) {
-		set_current_state(TASK_UNINTERRUPTIBLE);
-		if (s.done)
-			break;
-		schedule();
-	}
-
-	__set_current_state(TASK_RUNNING);
-}
-
-#ifdef CONFIG_BCACHE_CLOSURES_DEBUG
-
-static LIST_HEAD(closure_list);
-static DEFINE_SPINLOCK(closure_list_lock);
-
-void closure_debug_create(struct closure *cl)
-{
-	unsigned long flags;
-
-	BUG_ON(cl->magic == CLOSURE_MAGIC_ALIVE);
-	cl->magic = CLOSURE_MAGIC_ALIVE;
-
-	spin_lock_irqsave(&closure_list_lock, flags);
-	list_add(&cl->all, &closure_list);
-	spin_unlock_irqrestore(&closure_list_lock, flags);
-}
-
-void closure_debug_destroy(struct closure *cl)
-{
-	unsigned long flags;
-
-	BUG_ON(cl->magic != CLOSURE_MAGIC_ALIVE);
-	cl->magic = CLOSURE_MAGIC_DEAD;
-
-	spin_lock_irqsave(&closure_list_lock, flags);
-	list_del(&cl->all);
-	spin_unlock_irqrestore(&closure_list_lock, flags);
-}
-
-static struct dentry *closure_debug;
-
-static int debug_show(struct seq_file *f, void *data)
-{
-	struct closure *cl;
-
-	spin_lock_irq(&closure_list_lock);
-
-	list_for_each_entry(cl, &closure_list, all) {
-		int r = atomic_read(&cl->remaining);
-
-		seq_printf(f, "%p: %pS -> %pS p %p r %i ",
-			   cl, (void *) cl->ip, cl->fn, cl->parent,
-			   r & CLOSURE_REMAINING_MASK);
-
-		seq_printf(f, "%s%s\n",
-			   test_bit(WORK_STRUCT_PENDING_BIT,
-				    work_data_bits(&cl->work)) ? "Q" : "",
-			   r & CLOSURE_RUNNING	? "R" : "");
-
-		if (r & CLOSURE_WAITING)
-			seq_printf(f, " W %pS\n",
-				   (void *) cl->waiting_on);
-
-		seq_printf(f, "\n");
-	}
-
-	spin_unlock_irq(&closure_list_lock);
-	return 0;
-}
-
-DEFINE_SHOW_ATTRIBUTE(debug);
-
-void  __init closure_debug_init(void)
-{
-	if (!IS_ERR_OR_NULL(bcache_debug))
-		/*
-		 * it is unnecessary to check return value of
-		 * debugfs_create_file(), we should not care
-		 * about this.
-		 */
-		closure_debug = debugfs_create_file(
-			"closures", 0400, bcache_debug, NULL, &debug_fops);
-}
-#endif
-
-MODULE_AUTHOR("Kent Overstreet <koverstreet@google.com>");
-MODULE_LICENSE("GPL");
diff --git a/drivers/md/bcache/closure.h b/drivers/md/bcache/closure.h
deleted file mode 100644
index c88cdc4ae4ec..000000000000
--- a/drivers/md/bcache/closure.h
+++ /dev/null
@@ -1,378 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _LINUX_CLOSURE_H
-#define _LINUX_CLOSURE_H
-
-#include <linux/llist.h>
-#include <linux/sched.h>
-#include <linux/sched/task_stack.h>
-#include <linux/workqueue.h>
-
-/*
- * Closure is perhaps the most overused and abused term in computer science, but
- * since I've been unable to come up with anything better you're stuck with it
- * again.
- *
- * What are closures?
- *
- * They embed a refcount. The basic idea is they count "things that are in
- * progress" - in flight bios, some other thread that's doing something else -
- * anything you might want to wait on.
- *
- * The refcount may be manipulated with closure_get() and closure_put().
- * closure_put() is where many of the interesting things happen, when it causes
- * the refcount to go to 0.
- *
- * Closures can be used to wait on things both synchronously and asynchronously,
- * and synchronous and asynchronous use can be mixed without restriction. To
- * wait synchronously, use closure_sync() - you will sleep until your closure's
- * refcount hits 1.
- *
- * To wait asynchronously, use
- *   continue_at(cl, next_function, workqueue);
- *
- * passing it, as you might expect, the function to run when nothing is pending
- * and the workqueue to run that function out of.
- *
- * continue_at() also, critically, requires a 'return' immediately following the
- * location where this macro is referenced, to return to the calling function.
- * There's good reason for this.
- *
- * To use safely closures asynchronously, they must always have a refcount while
- * they are running owned by the thread that is running them. Otherwise, suppose
- * you submit some bios and wish to have a function run when they all complete:
- *
- * foo_endio(struct bio *bio)
- * {
- *	closure_put(cl);
- * }
- *
- * closure_init(cl);
- *
- * do_stuff();
- * closure_get(cl);
- * bio1->bi_endio = foo_endio;
- * bio_submit(bio1);
- *
- * do_more_stuff();
- * closure_get(cl);
- * bio2->bi_endio = foo_endio;
- * bio_submit(bio2);
- *
- * continue_at(cl, complete_some_read, system_wq);
- *
- * If closure's refcount started at 0, complete_some_read() could run before the
- * second bio was submitted - which is almost always not what you want! More
- * importantly, it wouldn't be possible to say whether the original thread or
- * complete_some_read()'s thread owned the closure - and whatever state it was
- * associated with!
- *
- * So, closure_init() initializes a closure's refcount to 1 - and when a
- * closure_fn is run, the refcount will be reset to 1 first.
- *
- * Then, the rule is - if you got the refcount with closure_get(), release it
- * with closure_put() (i.e, in a bio->bi_endio function). If you have a refcount
- * on a closure because you called closure_init() or you were run out of a
- * closure - _always_ use continue_at(). Doing so consistently will help
- * eliminate an entire class of particularly pernicious races.
- *
- * Lastly, you might have a wait list dedicated to a specific event, and have no
- * need for specifying the condition - you just want to wait until someone runs
- * closure_wake_up() on the appropriate wait list. In that case, just use
- * closure_wait(). It will return either true or false, depending on whether the
- * closure was already on a wait list or not - a closure can only be on one wait
- * list at a time.
- *
- * Parents:
- *
- * closure_init() takes two arguments - it takes the closure to initialize, and
- * a (possibly null) parent.
- *
- * If parent is non null, the new closure will have a refcount for its lifetime;
- * a closure is considered to be "finished" when its refcount hits 0 and the
- * function to run is null. Hence
- *
- * continue_at(cl, NULL, NULL);
- *
- * returns up the (spaghetti) stack of closures, precisely like normal return
- * returns up the C stack. continue_at() with non null fn is better thought of
- * as doing a tail call.
- *
- * All this implies that a closure should typically be embedded in a particular
- * struct (which its refcount will normally control the lifetime of), and that
- * struct can very much be thought of as a stack frame.
- */
-
-struct closure;
-struct closure_syncer;
-typedef void (closure_fn) (struct closure *);
-extern struct dentry *bcache_debug;
-
-struct closure_waitlist {
-	struct llist_head	list;
-};
-
-enum closure_state {
-	/*
-	 * CLOSURE_WAITING: Set iff the closure is on a waitlist. Must be set by
-	 * the thread that owns the closure, and cleared by the thread that's
-	 * waking up the closure.
-	 *
-	 * The rest are for debugging and don't affect behaviour:
-	 *
-	 * CLOSURE_RUNNING: Set when a closure is running (i.e. by
-	 * closure_init() and when closure_put() runs then next function), and
-	 * must be cleared before remaining hits 0. Primarily to help guard
-	 * against incorrect usage and accidentally transferring references.
-	 * continue_at() and closure_return() clear it for you, if you're doing
-	 * something unusual you can use closure_set_dead() which also helps
-	 * annotate where references are being transferred.
-	 */
-
-	CLOSURE_BITS_START	= (1U << 26),
-	CLOSURE_DESTRUCTOR	= (1U << 26),
-	CLOSURE_WAITING		= (1U << 28),
-	CLOSURE_RUNNING		= (1U << 30),
-};
-
-#define CLOSURE_GUARD_MASK					\
-	((CLOSURE_DESTRUCTOR|CLOSURE_WAITING|CLOSURE_RUNNING) << 1)
-
-#define CLOSURE_REMAINING_MASK		(CLOSURE_BITS_START - 1)
-#define CLOSURE_REMAINING_INITIALIZER	(1|CLOSURE_RUNNING)
-
-struct closure {
-	union {
-		struct {
-			struct workqueue_struct *wq;
-			struct closure_syncer	*s;
-			struct llist_node	list;
-			closure_fn		*fn;
-		};
-		struct work_struct	work;
-	};
-
-	struct closure		*parent;
-
-	atomic_t		remaining;
-
-#ifdef CONFIG_BCACHE_CLOSURES_DEBUG
-#define CLOSURE_MAGIC_DEAD	0xc054dead
-#define CLOSURE_MAGIC_ALIVE	0xc054a11e
-
-	unsigned int		magic;
-	struct list_head	all;
-	unsigned long		ip;
-	unsigned long		waiting_on;
-#endif
-};
-
-void closure_sub(struct closure *cl, int v);
-void closure_put(struct closure *cl);
-void __closure_wake_up(struct closure_waitlist *list);
-bool closure_wait(struct closure_waitlist *list, struct closure *cl);
-void __closure_sync(struct closure *cl);
-
-/**
- * closure_sync - sleep until a closure a closure has nothing left to wait on
- *
- * Sleeps until the refcount hits 1 - the thread that's running the closure owns
- * the last refcount.
- */
-static inline void closure_sync(struct closure *cl)
-{
-	if ((atomic_read(&cl->remaining) & CLOSURE_REMAINING_MASK) != 1)
-		__closure_sync(cl);
-}
-
-#ifdef CONFIG_BCACHE_CLOSURES_DEBUG
-
-void closure_debug_init(void);
-void closure_debug_create(struct closure *cl);
-void closure_debug_destroy(struct closure *cl);
-
-#else
-
-static inline void closure_debug_init(void) {}
-static inline void closure_debug_create(struct closure *cl) {}
-static inline void closure_debug_destroy(struct closure *cl) {}
-
-#endif
-
-static inline void closure_set_ip(struct closure *cl)
-{
-#ifdef CONFIG_BCACHE_CLOSURES_DEBUG
-	cl->ip = _THIS_IP_;
-#endif
-}
-
-static inline void closure_set_ret_ip(struct closure *cl)
-{
-#ifdef CONFIG_BCACHE_CLOSURES_DEBUG
-	cl->ip = _RET_IP_;
-#endif
-}
-
-static inline void closure_set_waiting(struct closure *cl, unsigned long f)
-{
-#ifdef CONFIG_BCACHE_CLOSURES_DEBUG
-	cl->waiting_on = f;
-#endif
-}
-
-static inline void closure_set_stopped(struct closure *cl)
-{
-	atomic_sub(CLOSURE_RUNNING, &cl->remaining);
-}
-
-static inline void set_closure_fn(struct closure *cl, closure_fn *fn,
-				  struct workqueue_struct *wq)
-{
-	closure_set_ip(cl);
-	cl->fn = fn;
-	cl->wq = wq;
-	/* between atomic_dec() in closure_put() */
-	smp_mb__before_atomic();
-}
-
-static inline void closure_queue(struct closure *cl)
-{
-	struct workqueue_struct *wq = cl->wq;
-	/**
-	 * Changes made to closure, work_struct, or a couple of other structs
-	 * may cause work.func not pointing to the right location.
-	 */
-	BUILD_BUG_ON(offsetof(struct closure, fn)
-		     != offsetof(struct work_struct, func));
-	if (wq) {
-		INIT_WORK(&cl->work, cl->work.func);
-		BUG_ON(!queue_work(wq, &cl->work));
-	} else
-		cl->fn(cl);
-}
-
-/**
- * closure_get - increment a closure's refcount
- */
-static inline void closure_get(struct closure *cl)
-{
-#ifdef CONFIG_BCACHE_CLOSURES_DEBUG
-	BUG_ON((atomic_inc_return(&cl->remaining) &
-		CLOSURE_REMAINING_MASK) <= 1);
-#else
-	atomic_inc(&cl->remaining);
-#endif
-}
-
-/**
- * closure_init - Initialize a closure, setting the refcount to 1
- * @cl:		closure to initialize
- * @parent:	parent of the new closure. cl will take a refcount on it for its
- *		lifetime; may be NULL.
- */
-static inline void closure_init(struct closure *cl, struct closure *parent)
-{
-	memset(cl, 0, sizeof(struct closure));
-	cl->parent = parent;
-	if (parent)
-		closure_get(parent);
-
-	atomic_set(&cl->remaining, CLOSURE_REMAINING_INITIALIZER);
-
-	closure_debug_create(cl);
-	closure_set_ip(cl);
-}
-
-static inline void closure_init_stack(struct closure *cl)
-{
-	memset(cl, 0, sizeof(struct closure));
-	atomic_set(&cl->remaining, CLOSURE_REMAINING_INITIALIZER);
-}
-
-/**
- * closure_wake_up - wake up all closures on a wait list,
- *		     with memory barrier
- */
-static inline void closure_wake_up(struct closure_waitlist *list)
-{
-	/* Memory barrier for the wait list */
-	smp_mb();
-	__closure_wake_up(list);
-}
-
-/**
- * continue_at - jump to another function with barrier
- *
- * After @cl is no longer waiting on anything (i.e. all outstanding refs have
- * been dropped with closure_put()), it will resume execution at @fn running out
- * of @wq (or, if @wq is NULL, @fn will be called by closure_put() directly).
- *
- * This is because after calling continue_at() you no longer have a ref on @cl,
- * and whatever @cl owns may be freed out from under you - a running closure fn
- * has a ref on its own closure which continue_at() drops.
- *
- * Note you are expected to immediately return after using this macro.
- */
-#define continue_at(_cl, _fn, _wq)					\
-do {									\
-	set_closure_fn(_cl, _fn, _wq);					\
-	closure_sub(_cl, CLOSURE_RUNNING + 1);				\
-} while (0)
-
-/**
- * closure_return - finish execution of a closure
- *
- * This is used to indicate that @cl is finished: when all outstanding refs on
- * @cl have been dropped @cl's ref on its parent closure (as passed to
- * closure_init()) will be dropped, if one was specified - thus this can be
- * thought of as returning to the parent closure.
- */
-#define closure_return(_cl)	continue_at((_cl), NULL, NULL)
-
-/**
- * continue_at_nobarrier - jump to another function without barrier
- *
- * Causes @fn to be executed out of @cl, in @wq context (or called directly if
- * @wq is NULL).
- *
- * The ref the caller of continue_at_nobarrier() had on @cl is now owned by @fn,
- * thus it's not safe to touch anything protected by @cl after a
- * continue_at_nobarrier().
- */
-#define continue_at_nobarrier(_cl, _fn, _wq)				\
-do {									\
-	set_closure_fn(_cl, _fn, _wq);					\
-	closure_queue(_cl);						\
-} while (0)
-
-/**
- * closure_return_with_destructor - finish execution of a closure,
- *				    with destructor
- *
- * Works like closure_return(), except @destructor will be called when all
- * outstanding refs on @cl have been dropped; @destructor may be used to safely
- * free the memory occupied by @cl, and it is called with the ref on the parent
- * closure still held - so @destructor could safely return an item to a
- * freelist protected by @cl's parent.
- */
-#define closure_return_with_destructor(_cl, _destructor)		\
-do {									\
-	set_closure_fn(_cl, _destructor, NULL);				\
-	closure_sub(_cl, CLOSURE_RUNNING - CLOSURE_DESTRUCTOR + 1);	\
-} while (0)
-
-/**
- * closure_call - execute @fn out of a new, uninitialized closure
- *
- * Typically used when running out of one closure, and we want to run @fn
- * asynchronously out of a new closure - @parent will then wait for @cl to
- * finish.
- */
-static inline void closure_call(struct closure *cl, closure_fn fn,
-				struct workqueue_struct *wq,
-				struct closure *parent)
-{
-	closure_init(cl, parent);
-	continue_at_nobarrier(cl, fn, wq);
-}
-
-#endif /* _LINUX_CLOSURE_H */
diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c
index 0ae2b3676293..4affe587586c 100644
--- a/drivers/md/bcache/super.c
+++ b/drivers/md/bcache/super.c
@@ -2905,7 +2905,6 @@ static int __init bcache_init(void)
 		goto err;
 
 	bch_debug_init();
-	closure_debug_init();
 
 	bcache_is_reboot = false;
 
diff --git a/drivers/md/bcache/util.h b/drivers/md/bcache/util.h
index 6f3cb7c92130..f61ab1bada6c 100644
--- a/drivers/md/bcache/util.h
+++ b/drivers/md/bcache/util.h
@@ -4,6 +4,7 @@
 #define _BCACHE_UTIL_H
 
 #include <linux/blkdev.h>
+#include <linux/closure.h>
 #include <linux/errno.h>
 #include <linux/kernel.h>
 #include <linux/sched/clock.h>
@@ -13,8 +14,6 @@
 #include <linux/workqueue.h>
 #include <linux/crc64.h>
 
-#include "closure.h"
-
 struct closure;
 
 #ifdef CONFIG_BCACHE_DEBUG
diff --git a/include/linux/closure.h b/include/linux/closure.h
new file mode 100644
index 000000000000..0ec9e7bc8d97
--- /dev/null
+++ b/include/linux/closure.h
@@ -0,0 +1,377 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _LINUX_CLOSURE_H
+#define _LINUX_CLOSURE_H
+
+#include <linux/llist.h>
+#include <linux/sched.h>
+#include <linux/sched/task_stack.h>
+#include <linux/workqueue.h>
+
+/*
+ * Closure is perhaps the most overused and abused term in computer science, but
+ * since I've been unable to come up with anything better you're stuck with it
+ * again.
+ *
+ * What are closures?
+ *
+ * They embed a refcount. The basic idea is they count "things that are in
+ * progress" - in flight bios, some other thread that's doing something else -
+ * anything you might want to wait on.
+ *
+ * The refcount may be manipulated with closure_get() and closure_put().
+ * closure_put() is where many of the interesting things happen, when it causes
+ * the refcount to go to 0.
+ *
+ * Closures can be used to wait on things both synchronously and asynchronously,
+ * and synchronous and asynchronous use can be mixed without restriction. To
+ * wait synchronously, use closure_sync() - you will sleep until your closure's
+ * refcount hits 1.
+ *
+ * To wait asynchronously, use
+ *   continue_at(cl, next_function, workqueue);
+ *
+ * passing it, as you might expect, the function to run when nothing is pending
+ * and the workqueue to run that function out of.
+ *
+ * continue_at() also, critically, requires a 'return' immediately following the
+ * location where this macro is referenced, to return to the calling function.
+ * There's good reason for this.
+ *
+ * To use safely closures asynchronously, they must always have a refcount while
+ * they are running owned by the thread that is running them. Otherwise, suppose
+ * you submit some bios and wish to have a function run when they all complete:
+ *
+ * foo_endio(struct bio *bio)
+ * {
+ *	closure_put(cl);
+ * }
+ *
+ * closure_init(cl);
+ *
+ * do_stuff();
+ * closure_get(cl);
+ * bio1->bi_endio = foo_endio;
+ * bio_submit(bio1);
+ *
+ * do_more_stuff();
+ * closure_get(cl);
+ * bio2->bi_endio = foo_endio;
+ * bio_submit(bio2);
+ *
+ * continue_at(cl, complete_some_read, system_wq);
+ *
+ * If closure's refcount started at 0, complete_some_read() could run before the
+ * second bio was submitted - which is almost always not what you want! More
+ * importantly, it wouldn't be possible to say whether the original thread or
+ * complete_some_read()'s thread owned the closure - and whatever state it was
+ * associated with!
+ *
+ * So, closure_init() initializes a closure's refcount to 1 - and when a
+ * closure_fn is run, the refcount will be reset to 1 first.
+ *
+ * Then, the rule is - if you got the refcount with closure_get(), release it
+ * with closure_put() (i.e, in a bio->bi_endio function). If you have a refcount
+ * on a closure because you called closure_init() or you were run out of a
+ * closure - _always_ use continue_at(). Doing so consistently will help
+ * eliminate an entire class of particularly pernicious races.
+ *
+ * Lastly, you might have a wait list dedicated to a specific event, and have no
+ * need for specifying the condition - you just want to wait until someone runs
+ * closure_wake_up() on the appropriate wait list. In that case, just use
+ * closure_wait(). It will return either true or false, depending on whether the
+ * closure was already on a wait list or not - a closure can only be on one wait
+ * list at a time.
+ *
+ * Parents:
+ *
+ * closure_init() takes two arguments - it takes the closure to initialize, and
+ * a (possibly null) parent.
+ *
+ * If parent is non null, the new closure will have a refcount for its lifetime;
+ * a closure is considered to be "finished" when its refcount hits 0 and the
+ * function to run is null. Hence
+ *
+ * continue_at(cl, NULL, NULL);
+ *
+ * returns up the (spaghetti) stack of closures, precisely like normal return
+ * returns up the C stack. continue_at() with non null fn is better thought of
+ * as doing a tail call.
+ *
+ * All this implies that a closure should typically be embedded in a particular
+ * struct (which its refcount will normally control the lifetime of), and that
+ * struct can very much be thought of as a stack frame.
+ */
+
+struct closure;
+struct closure_syncer;
+typedef void (closure_fn) (struct closure *);
+extern struct dentry *bcache_debug;
+
+struct closure_waitlist {
+	struct llist_head	list;
+};
+
+enum closure_state {
+	/*
+	 * CLOSURE_WAITING: Set iff the closure is on a waitlist. Must be set by
+	 * the thread that owns the closure, and cleared by the thread that's
+	 * waking up the closure.
+	 *
+	 * The rest are for debugging and don't affect behaviour:
+	 *
+	 * CLOSURE_RUNNING: Set when a closure is running (i.e. by
+	 * closure_init() and when closure_put() runs then next function), and
+	 * must be cleared before remaining hits 0. Primarily to help guard
+	 * against incorrect usage and accidentally transferring references.
+	 * continue_at() and closure_return() clear it for you, if you're doing
+	 * something unusual you can use closure_set_dead() which also helps
+	 * annotate where references are being transferred.
+	 */
+
+	CLOSURE_BITS_START	= (1U << 26),
+	CLOSURE_DESTRUCTOR	= (1U << 26),
+	CLOSURE_WAITING		= (1U << 28),
+	CLOSURE_RUNNING		= (1U << 30),
+};
+
+#define CLOSURE_GUARD_MASK					\
+	((CLOSURE_DESTRUCTOR|CLOSURE_WAITING|CLOSURE_RUNNING) << 1)
+
+#define CLOSURE_REMAINING_MASK		(CLOSURE_BITS_START - 1)
+#define CLOSURE_REMAINING_INITIALIZER	(1|CLOSURE_RUNNING)
+
+struct closure {
+	union {
+		struct {
+			struct workqueue_struct *wq;
+			struct closure_syncer	*s;
+			struct llist_node	list;
+			closure_fn		*fn;
+		};
+		struct work_struct	work;
+	};
+
+	struct closure		*parent;
+
+	atomic_t		remaining;
+
+#ifdef CONFIG_DEBUG_CLOSURES
+#define CLOSURE_MAGIC_DEAD	0xc054dead
+#define CLOSURE_MAGIC_ALIVE	0xc054a11e
+
+	unsigned int		magic;
+	struct list_head	all;
+	unsigned long		ip;
+	unsigned long		waiting_on;
+#endif
+};
+
+void closure_sub(struct closure *cl, int v);
+void closure_put(struct closure *cl);
+void __closure_wake_up(struct closure_waitlist *list);
+bool closure_wait(struct closure_waitlist *list, struct closure *cl);
+void __closure_sync(struct closure *cl);
+
+/**
+ * closure_sync - sleep until a closure a closure has nothing left to wait on
+ *
+ * Sleeps until the refcount hits 1 - the thread that's running the closure owns
+ * the last refcount.
+ */
+static inline void closure_sync(struct closure *cl)
+{
+	if ((atomic_read(&cl->remaining) & CLOSURE_REMAINING_MASK) != 1)
+		__closure_sync(cl);
+}
+
+#ifdef CONFIG_DEBUG_CLOSURES
+
+void closure_debug_create(struct closure *cl);
+void closure_debug_destroy(struct closure *cl);
+
+#else
+
+static inline void closure_debug_create(struct closure *cl) {}
+static inline void closure_debug_destroy(struct closure *cl) {}
+
+#endif
+
+static inline void closure_set_ip(struct closure *cl)
+{
+#ifdef CONFIG_DEBUG_CLOSURES
+	cl->ip = _THIS_IP_;
+#endif
+}
+
+static inline void closure_set_ret_ip(struct closure *cl)
+{
+#ifdef CONFIG_DEBUG_CLOSURES
+	cl->ip = _RET_IP_;
+#endif
+}
+
+static inline void closure_set_waiting(struct closure *cl, unsigned long f)
+{
+#ifdef CONFIG_DEBUG_CLOSURES
+	cl->waiting_on = f;
+#endif
+}
+
+static inline void closure_set_stopped(struct closure *cl)
+{
+	atomic_sub(CLOSURE_RUNNING, &cl->remaining);
+}
+
+static inline void set_closure_fn(struct closure *cl, closure_fn *fn,
+				  struct workqueue_struct *wq)
+{
+	closure_set_ip(cl);
+	cl->fn = fn;
+	cl->wq = wq;
+	/* between atomic_dec() in closure_put() */
+	smp_mb__before_atomic();
+}
+
+static inline void closure_queue(struct closure *cl)
+{
+	struct workqueue_struct *wq = cl->wq;
+	/**
+	 * Changes made to closure, work_struct, or a couple of other structs
+	 * may cause work.func not pointing to the right location.
+	 */
+	BUILD_BUG_ON(offsetof(struct closure, fn)
+		     != offsetof(struct work_struct, func));
+
+	if (wq) {
+		INIT_WORK(&cl->work, cl->work.func);
+		BUG_ON(!queue_work(wq, &cl->work));
+	} else
+		cl->fn(cl);
+}
+
+/**
+ * closure_get - increment a closure's refcount
+ */
+static inline void closure_get(struct closure *cl)
+{
+#ifdef CONFIG_DEBUG_CLOSURES
+	BUG_ON((atomic_inc_return(&cl->remaining) &
+		CLOSURE_REMAINING_MASK) <= 1);
+#else
+	atomic_inc(&cl->remaining);
+#endif
+}
+
+/**
+ * closure_init - Initialize a closure, setting the refcount to 1
+ * @cl:		closure to initialize
+ * @parent:	parent of the new closure. cl will take a refcount on it for its
+ *		lifetime; may be NULL.
+ */
+static inline void closure_init(struct closure *cl, struct closure *parent)
+{
+	cl->fn = NULL;
+	cl->parent = parent;
+	if (parent)
+		closure_get(parent);
+
+	atomic_set(&cl->remaining, CLOSURE_REMAINING_INITIALIZER);
+
+	closure_debug_create(cl);
+	closure_set_ip(cl);
+}
+
+static inline void closure_init_stack(struct closure *cl)
+{
+	memset(cl, 0, sizeof(struct closure));
+	atomic_set(&cl->remaining, CLOSURE_REMAINING_INITIALIZER);
+}
+
+/**
+ * closure_wake_up - wake up all closures on a wait list,
+ *		     with memory barrier
+ */
+static inline void closure_wake_up(struct closure_waitlist *list)
+{
+	/* Memory barrier for the wait list */
+	smp_mb();
+	__closure_wake_up(list);
+}
+
+/**
+ * continue_at - jump to another function with barrier
+ *
+ * After @cl is no longer waiting on anything (i.e. all outstanding refs have
+ * been dropped with closure_put()), it will resume execution at @fn running out
+ * of @wq (or, if @wq is NULL, @fn will be called by closure_put() directly).
+ *
+ * This is because after calling continue_at() you no longer have a ref on @cl,
+ * and whatever @cl owns may be freed out from under you - a running closure fn
+ * has a ref on its own closure which continue_at() drops.
+ *
+ * Note you are expected to immediately return after using this macro.
+ */
+#define continue_at(_cl, _fn, _wq)					\
+do {									\
+	set_closure_fn(_cl, _fn, _wq);					\
+	closure_sub(_cl, CLOSURE_RUNNING + 1);				\
+} while (0)
+
+/**
+ * closure_return - finish execution of a closure
+ *
+ * This is used to indicate that @cl is finished: when all outstanding refs on
+ * @cl have been dropped @cl's ref on its parent closure (as passed to
+ * closure_init()) will be dropped, if one was specified - thus this can be
+ * thought of as returning to the parent closure.
+ */
+#define closure_return(_cl)	continue_at((_cl), NULL, NULL)
+
+/**
+ * continue_at_nobarrier - jump to another function without barrier
+ *
+ * Causes @fn to be executed out of @cl, in @wq context (or called directly if
+ * @wq is NULL).
+ *
+ * The ref the caller of continue_at_nobarrier() had on @cl is now owned by @fn,
+ * thus it's not safe to touch anything protected by @cl after a
+ * continue_at_nobarrier().
+ */
+#define continue_at_nobarrier(_cl, _fn, _wq)				\
+do {									\
+	set_closure_fn(_cl, _fn, _wq);					\
+	closure_queue(_cl);						\
+} while (0)
+
+/**
+ * closure_return_with_destructor - finish execution of a closure,
+ *				    with destructor
+ *
+ * Works like closure_return(), except @destructor will be called when all
+ * outstanding refs on @cl have been dropped; @destructor may be used to safely
+ * free the memory occupied by @cl, and it is called with the ref on the parent
+ * closure still held - so @destructor could safely return an item to a
+ * freelist protected by @cl's parent.
+ */
+#define closure_return_with_destructor(_cl, _destructor)		\
+do {									\
+	set_closure_fn(_cl, _destructor, NULL);				\
+	closure_sub(_cl, CLOSURE_RUNNING - CLOSURE_DESTRUCTOR + 1);	\
+} while (0)
+
+/**
+ * closure_call - execute @fn out of a new, uninitialized closure
+ *
+ * Typically used when running out of one closure, and we want to run @fn
+ * asynchronously out of a new closure - @parent will then wait for @cl to
+ * finish.
+ */
+static inline void closure_call(struct closure *cl, closure_fn fn,
+				struct workqueue_struct *wq,
+				struct closure *parent)
+{
+	closure_init(cl, parent);
+	continue_at_nobarrier(cl, fn, wq);
+}
+
+#endif /* _LINUX_CLOSURE_H */
diff --git a/lib/Kconfig b/lib/Kconfig
index c686f4adc124..263aa6ae8d7c 100644
--- a/lib/Kconfig
+++ b/lib/Kconfig
@@ -506,6 +506,9 @@ config ASSOCIATIVE_ARRAY
 
 	  for more information.
 
+config CLOSURES
+	bool
+
 config HAS_IOMEM
 	bool
 	depends on !NO_IOMEM
diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
index fa307f93fa2e..ce3a4abf40f8 100644
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -1720,6 +1720,15 @@ config DEBUG_NOTIFIERS
 	  This is a relatively cheap check but if you care about maximum
 	  performance, say N.
 
+config DEBUG_CLOSURES
+	bool "Debug closures (bcache async widgits)"
+	depends on CLOSURES
+	select DEBUG_FS
+	help
+	  Keeps all active closures in a linked list and provides a debugfs
+	  interface to list them, which makes it possible to see asynchronous
+	  operations that get stuck.
+
 config DEBUG_MAPLE_TREE
 	bool "Debug maple trees"
 	depends on DEBUG_KERNEL
diff --git a/lib/Makefile b/lib/Makefile
index 740109b6e2c8..57d394575919 100644
--- a/lib/Makefile
+++ b/lib/Makefile
@@ -255,6 +255,8 @@ obj-$(CONFIG_ATOMIC64_SELFTEST) += atomic64_test.o
 
 obj-$(CONFIG_CPU_RMAP) += cpu_rmap.o
 
+obj-$(CONFIG_CLOSURES) += closure.o
+
 obj-$(CONFIG_DQL) += dynamic_queue_limits.o
 
 obj-$(CONFIG_GLOB) += glob.o
diff --git a/lib/closure.c b/lib/closure.c
new file mode 100644
index 000000000000..b38ded00b9b0
--- /dev/null
+++ b/lib/closure.c
@@ -0,0 +1,204 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Asynchronous refcounty things
+ *
+ * Copyright 2010, 2011 Kent Overstreet <kent.overstreet@gmail.com>
+ * Copyright 2012 Google, Inc.
+ */
+
+#include <linux/closure.h>
+#include <linux/debugfs.h>
+#include <linux/export.h>
+#include <linux/seq_file.h>
+#include <linux/sched/debug.h>
+
+static inline void closure_put_after_sub(struct closure *cl, int flags)
+{
+	int r = flags & CLOSURE_REMAINING_MASK;
+
+	BUG_ON(flags & CLOSURE_GUARD_MASK);
+	BUG_ON(!r && (flags & ~CLOSURE_DESTRUCTOR));
+
+	if (!r) {
+		if (cl->fn && !(flags & CLOSURE_DESTRUCTOR)) {
+			atomic_set(&cl->remaining,
+				   CLOSURE_REMAINING_INITIALIZER);
+			closure_queue(cl);
+		} else {
+			struct closure *parent = cl->parent;
+			closure_fn *destructor = cl->fn;
+
+			closure_debug_destroy(cl);
+
+			if (destructor)
+				destructor(cl);
+
+			if (parent)
+				closure_put(parent);
+		}
+	}
+}
+
+/* For clearing flags with the same atomic op as a put */
+void closure_sub(struct closure *cl, int v)
+{
+	closure_put_after_sub(cl, atomic_sub_return(v, &cl->remaining));
+}
+EXPORT_SYMBOL(closure_sub);
+
+/*
+ * closure_put - decrement a closure's refcount
+ */
+void closure_put(struct closure *cl)
+{
+	closure_put_after_sub(cl, atomic_dec_return(&cl->remaining));
+}
+EXPORT_SYMBOL(closure_put);
+
+/*
+ * closure_wake_up - wake up all closures on a wait list, without memory barrier
+ */
+void __closure_wake_up(struct closure_waitlist *wait_list)
+{
+	struct llist_node *list;
+	struct closure *cl, *t;
+	struct llist_node *reverse = NULL;
+
+	list = llist_del_all(&wait_list->list);
+
+	/* We first reverse the list to preserve FIFO ordering and fairness */
+	reverse = llist_reverse_order(list);
+
+	/* Then do the wakeups */
+	llist_for_each_entry_safe(cl, t, reverse, list) {
+		closure_set_waiting(cl, 0);
+		closure_sub(cl, CLOSURE_WAITING + 1);
+	}
+}
+EXPORT_SYMBOL(__closure_wake_up);
+
+/**
+ * closure_wait - add a closure to a waitlist
+ * @waitlist: will own a ref on @cl, which will be released when
+ * closure_wake_up() is called on @waitlist.
+ * @cl: closure pointer.
+ *
+ */
+bool closure_wait(struct closure_waitlist *waitlist, struct closure *cl)
+{
+	if (atomic_read(&cl->remaining) & CLOSURE_WAITING)
+		return false;
+
+	closure_set_waiting(cl, _RET_IP_);
+	atomic_add(CLOSURE_WAITING + 1, &cl->remaining);
+	llist_add(&cl->list, &waitlist->list);
+
+	return true;
+}
+EXPORT_SYMBOL(closure_wait);
+
+struct closure_syncer {
+	struct task_struct	*task;
+	int			done;
+};
+
+static void closure_sync_fn(struct closure *cl)
+{
+	struct closure_syncer *s = cl->s;
+	struct task_struct *p;
+
+	rcu_read_lock();
+	p = READ_ONCE(s->task);
+	s->done = 1;
+	wake_up_process(p);
+	rcu_read_unlock();
+}
+
+void __sched __closure_sync(struct closure *cl)
+{
+	struct closure_syncer s = { .task = current };
+
+	cl->s = &s;
+	continue_at(cl, closure_sync_fn, NULL);
+
+	while (1) {
+		set_current_state(TASK_UNINTERRUPTIBLE);
+		if (s.done)
+			break;
+		schedule();
+	}
+
+	__set_current_state(TASK_RUNNING);
+}
+EXPORT_SYMBOL(__closure_sync);
+
+#ifdef CONFIG_DEBUG_CLOSURES
+
+static LIST_HEAD(closure_list);
+static DEFINE_SPINLOCK(closure_list_lock);
+
+void closure_debug_create(struct closure *cl)
+{
+	unsigned long flags;
+
+	BUG_ON(cl->magic == CLOSURE_MAGIC_ALIVE);
+	cl->magic = CLOSURE_MAGIC_ALIVE;
+
+	spin_lock_irqsave(&closure_list_lock, flags);
+	list_add(&cl->all, &closure_list);
+	spin_unlock_irqrestore(&closure_list_lock, flags);
+}
+EXPORT_SYMBOL(closure_debug_create);
+
+void closure_debug_destroy(struct closure *cl)
+{
+	unsigned long flags;
+
+	BUG_ON(cl->magic != CLOSURE_MAGIC_ALIVE);
+	cl->magic = CLOSURE_MAGIC_DEAD;
+
+	spin_lock_irqsave(&closure_list_lock, flags);
+	list_del(&cl->all);
+	spin_unlock_irqrestore(&closure_list_lock, flags);
+}
+EXPORT_SYMBOL(closure_debug_destroy);
+
+static int debug_show(struct seq_file *f, void *data)
+{
+	struct closure *cl;
+
+	spin_lock_irq(&closure_list_lock);
+
+	list_for_each_entry(cl, &closure_list, all) {
+		int r = atomic_read(&cl->remaining);
+
+		seq_printf(f, "%p: %pS -> %pS p %p r %i ",
+			   cl, (void *) cl->ip, cl->fn, cl->parent,
+			   r & CLOSURE_REMAINING_MASK);
+
+		seq_printf(f, "%s%s\n",
+			   test_bit(WORK_STRUCT_PENDING_BIT,
+				    work_data_bits(&cl->work)) ? "Q" : "",
+			   r & CLOSURE_RUNNING	? "R" : "");
+
+		if (r & CLOSURE_WAITING)
+			seq_printf(f, " W %pS\n",
+				   (void *) cl->waiting_on);
+
+		seq_puts(f, "\n");
+	}
+
+	spin_unlock_irq(&closure_list_lock);
+	return 0;
+}
+
+DEFINE_SHOW_ATTRIBUTE(debug);
+
+static int __init closure_debug_init(void)
+{
+	debugfs_create_file("closures", 0400, NULL, NULL, &debug_fops);
+	return 0;
+}
+late_initcall(closure_debug_init)
+
+#endif
-- 
cgit v1.2.3


From ced58fc7ab9f2d4b3d8aaeb859321a61aa884f66 Mon Sep 17 00:00:00 2001
From: Kent Overstreet <kent.overstreet@gmail.com>
Date: Sat, 9 Dec 2017 12:42:44 -0500
Subject: closures: closure_wait_event()

Like wait_event() - except, because it uses closures and closure
waitlists it doesn't have the restriction on modifying task state inside
the condition check, like wait_event() does.

Signed-off-by: Kent Overstreet <kent.overstreet@gmail.com>
Acked-by: Coly Li <colyli@suse.de>
---
 include/linux/closure.h | 22 ++++++++++++++++++++++
 1 file changed, 22 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/closure.h b/include/linux/closure.h
index 0ec9e7bc8d97..36b4a83f9b77 100644
--- a/include/linux/closure.h
+++ b/include/linux/closure.h
@@ -374,4 +374,26 @@ static inline void closure_call(struct closure *cl, closure_fn fn,
 	continue_at_nobarrier(cl, fn, wq);
 }
 
+#define __closure_wait_event(waitlist, _cond)				\
+do {									\
+	struct closure cl;						\
+									\
+	closure_init_stack(&cl);					\
+									\
+	while (1) {							\
+		closure_wait(waitlist, &cl);				\
+		if (_cond)						\
+			break;						\
+		closure_sync(&cl);					\
+	}								\
+	closure_wake_up(waitlist);					\
+	closure_sync(&cl);						\
+} while (0)
+
+#define closure_wait_event(waitlist, _cond)				\
+do {									\
+	if (!(_cond))							\
+		__closure_wait_event(waitlist, _cond);			\
+} while (0)
+
 #endif /* _LINUX_CLOSURE_H */
-- 
cgit v1.2.3


From 48b7935722b8403fec0469ba3dafc5a0af08225d Mon Sep 17 00:00:00 2001
From: Kent Overstreet <kent.overstreet@linux.dev>
Date: Sat, 4 Mar 2023 02:39:39 -0500
Subject: closures: closure_nr_remaining()

Factor out a new helper, which returns the number of events outstanding.

Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
---
 include/linux/closure.h | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/closure.h b/include/linux/closure.h
index 36b4a83f9b77..722a586bb224 100644
--- a/include/linux/closure.h
+++ b/include/linux/closure.h
@@ -172,6 +172,11 @@ void __closure_wake_up(struct closure_waitlist *list);
 bool closure_wait(struct closure_waitlist *list, struct closure *cl);
 void __closure_sync(struct closure *cl);
 
+static inline unsigned closure_nr_remaining(struct closure *cl)
+{
+	return atomic_read(&cl->remaining) & CLOSURE_REMAINING_MASK;
+}
+
 /**
  * closure_sync - sleep until a closure a closure has nothing left to wait on
  *
@@ -180,7 +185,7 @@ void __closure_sync(struct closure *cl);
  */
 static inline void closure_sync(struct closure *cl)
 {
-	if ((atomic_read(&cl->remaining) & CLOSURE_REMAINING_MASK) != 1)
+	if (closure_nr_remaining(cl) != 1)
 		__closure_sync(cl);
 }
 
-- 
cgit v1.2.3


From 9492261ff2460252cf2d8de89cdf854c7e2b28a0 Mon Sep 17 00:00:00 2001
From: Kent Overstreet <kent.overstreet@gmail.com>
Date: Fri, 12 Feb 2021 20:11:25 -0500
Subject: lib/generic-radix-tree.c: Don't overflow in peek()

When we started spreading new inode numbers throughout most of the 64
bit inode space, that triggered some corner case bugs, in particular
some integer overflows related to the radix tree code. Oops.

Signed-off-by: Kent Overstreet <kent.overstreet@gmail.com>
---
 include/linux/generic-radix-tree.h |  7 +++++++
 lib/generic-radix-tree.c           | 17 ++++++++++++++---
 2 files changed, 21 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/generic-radix-tree.h b/include/linux/generic-radix-tree.h
index 107613f7d792..f6cd0f909d9f 100644
--- a/include/linux/generic-radix-tree.h
+++ b/include/linux/generic-radix-tree.h
@@ -38,6 +38,7 @@
 
 #include <asm/page.h>
 #include <linux/bug.h>
+#include <linux/limits.h>
 #include <linux/log2.h>
 #include <linux/math.h>
 #include <linux/types.h>
@@ -184,6 +185,12 @@ void *__genradix_iter_peek(struct genradix_iter *, struct __genradix *, size_t);
 static inline void __genradix_iter_advance(struct genradix_iter *iter,
 					   size_t obj_size)
 {
+	if (iter->offset + obj_size < iter->offset) {
+		iter->offset	= SIZE_MAX;
+		iter->pos	= SIZE_MAX;
+		return;
+	}
+
 	iter->offset += obj_size;
 
 	if (!is_power_of_2(obj_size) &&
diff --git a/lib/generic-radix-tree.c b/lib/generic-radix-tree.c
index f25eb111c051..7dfa88282b00 100644
--- a/lib/generic-radix-tree.c
+++ b/lib/generic-radix-tree.c
@@ -166,6 +166,10 @@ void *__genradix_iter_peek(struct genradix_iter *iter,
 	struct genradix_root *r;
 	struct genradix_node *n;
 	unsigned level, i;
+
+	if (iter->offset == SIZE_MAX)
+		return NULL;
+
 restart:
 	r = READ_ONCE(radix->root);
 	if (!r)
@@ -184,10 +188,17 @@ restart:
 			(GENRADIX_ARY - 1);
 
 		while (!n->children[i]) {
+			size_t objs_per_ptr = genradix_depth_size(level);
+
+			if (iter->offset + objs_per_ptr < iter->offset) {
+				iter->offset	= SIZE_MAX;
+				iter->pos	= SIZE_MAX;
+				return NULL;
+			}
+
 			i++;
-			iter->offset = round_down(iter->offset +
-					   genradix_depth_size(level),
-					   genradix_depth_size(level));
+			iter->offset = round_down(iter->offset + objs_per_ptr,
+						  objs_per_ptr);
 			iter->pos = (iter->offset >> PAGE_SHIFT) *
 				objs_per_page;
 			if (i == GENRADIX_ARY)
-- 
cgit v1.2.3


From 73badee4280ce30a927b82eedc919fb23ee549ae Mon Sep 17 00:00:00 2001
From: Kent Overstreet <kent.overstreet@linux.dev>
Date: Tue, 12 Sep 2023 01:17:22 -0400
Subject: lib/generic-radix-tree.c: Add peek_prev()

This patch adds genradix_peek_prev(), genradix_iter_rewind(), and
genradix_for_each_reverse(), for iterating backwards over a generic
radix tree.

Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
---
 include/linux/generic-radix-tree.h | 61 +++++++++++++++++++++++++++++++++++++-
 lib/generic-radix-tree.c           | 59 ++++++++++++++++++++++++++++++++++++
 2 files changed, 119 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/generic-radix-tree.h b/include/linux/generic-radix-tree.h
index f6cd0f909d9f..847413164738 100644
--- a/include/linux/generic-radix-tree.h
+++ b/include/linux/generic-radix-tree.h
@@ -117,6 +117,11 @@ static inline size_t __idx_to_offset(size_t idx, size_t obj_size)
 
 #define __genradix_cast(_radix)		(typeof((_radix)->type[0]) *)
 #define __genradix_obj_size(_radix)	sizeof((_radix)->type[0])
+#define __genradix_objs_per_page(_radix)			\
+	(PAGE_SIZE / sizeof((_radix)->type[0]))
+#define __genradix_page_remainder(_radix)			\
+	(PAGE_SIZE % sizeof((_radix)->type[0]))
+
 #define __genradix_idx_to_offset(_radix, _idx)			\
 	__idx_to_offset(_idx, __genradix_obj_size(_radix))
 
@@ -180,7 +185,25 @@ void *__genradix_iter_peek(struct genradix_iter *, struct __genradix *, size_t);
 #define genradix_iter_peek(_iter, _radix)			\
 	(__genradix_cast(_radix)				\
 	 __genradix_iter_peek(_iter, &(_radix)->tree,		\
-			      PAGE_SIZE / __genradix_obj_size(_radix)))
+			__genradix_objs_per_page(_radix)))
+
+void *__genradix_iter_peek_prev(struct genradix_iter *, struct __genradix *,
+				size_t, size_t);
+
+/**
+ * genradix_iter_peek_prev - get first entry at or below iterator's current
+ *			     position
+ * @_iter:	a genradix_iter
+ * @_radix:	genradix being iterated over
+ *
+ * If no more entries exist at or below @_iter's current position, returns NULL
+ */
+#define genradix_iter_peek_prev(_iter, _radix)			\
+	(__genradix_cast(_radix)				\
+	 __genradix_iter_peek_prev(_iter, &(_radix)->tree,	\
+			__genradix_objs_per_page(_radix),	\
+			__genradix_obj_size(_radix) +		\
+			__genradix_page_remainder(_radix)))
 
 static inline void __genradix_iter_advance(struct genradix_iter *iter,
 					   size_t obj_size)
@@ -203,6 +226,25 @@ static inline void __genradix_iter_advance(struct genradix_iter *iter,
 #define genradix_iter_advance(_iter, _radix)			\
 	__genradix_iter_advance(_iter, __genradix_obj_size(_radix))
 
+static inline void __genradix_iter_rewind(struct genradix_iter *iter,
+					  size_t obj_size)
+{
+	if (iter->offset == 0 ||
+	    iter->offset == SIZE_MAX) {
+		iter->offset = SIZE_MAX;
+		return;
+	}
+
+	if ((iter->offset & (PAGE_SIZE - 1)) == 0)
+		iter->offset -= PAGE_SIZE % obj_size;
+
+	iter->offset -= obj_size;
+	iter->pos--;
+}
+
+#define genradix_iter_rewind(_iter, _radix)			\
+	__genradix_iter_rewind(_iter, __genradix_obj_size(_radix))
+
 #define genradix_for_each_from(_radix, _iter, _p, _start)	\
 	for (_iter = genradix_iter_init(_radix, _start);	\
 	     (_p = genradix_iter_peek(&_iter, _radix)) != NULL;	\
@@ -220,6 +262,23 @@ static inline void __genradix_iter_advance(struct genradix_iter *iter,
 #define genradix_for_each(_radix, _iter, _p)			\
 	genradix_for_each_from(_radix, _iter, _p, 0)
 
+#define genradix_last_pos(_radix)				\
+	(SIZE_MAX / PAGE_SIZE * __genradix_objs_per_page(_radix) - 1)
+
+/**
+ * genradix_for_each_reverse - iterate over entry in a genradix, reverse order
+ * @_radix:	genradix to iterate over
+ * @_iter:	a genradix_iter to track current position
+ * @_p:		pointer to genradix entry type
+ *
+ * On every iteration, @_p will point to the current entry, and @_iter.pos
+ * will be the current entry's index.
+ */
+#define genradix_for_each_reverse(_radix, _iter, _p)		\
+	for (_iter = genradix_iter_init(_radix,	genradix_last_pos(_radix));\
+	     (_p = genradix_iter_peek_prev(&_iter, _radix)) != NULL;\
+	     genradix_iter_rewind(&_iter, _radix))
+
 int __genradix_prealloc(struct __genradix *, size_t, gfp_t);
 
 /**
diff --git a/lib/generic-radix-tree.c b/lib/generic-radix-tree.c
index 7dfa88282b00..41f1bcdc4488 100644
--- a/lib/generic-radix-tree.c
+++ b/lib/generic-radix-tree.c
@@ -1,4 +1,5 @@
 
+#include <linux/atomic.h>
 #include <linux/export.h>
 #include <linux/generic-radix-tree.h>
 #include <linux/gfp.h>
@@ -212,6 +213,64 @@ restart:
 }
 EXPORT_SYMBOL(__genradix_iter_peek);
 
+void *__genradix_iter_peek_prev(struct genradix_iter *iter,
+				struct __genradix *radix,
+				size_t objs_per_page,
+				size_t obj_size_plus_page_remainder)
+{
+	struct genradix_root *r;
+	struct genradix_node *n;
+	unsigned level, i;
+
+	if (iter->offset == SIZE_MAX)
+		return NULL;
+
+restart:
+	r = READ_ONCE(radix->root);
+	if (!r)
+		return NULL;
+
+	n	= genradix_root_to_node(r);
+	level	= genradix_root_to_depth(r);
+
+	if (ilog2(iter->offset) >= genradix_depth_shift(level)) {
+		iter->offset = genradix_depth_size(level);
+		iter->pos = (iter->offset >> PAGE_SHIFT) * objs_per_page;
+
+		iter->offset -= obj_size_plus_page_remainder;
+		iter->pos--;
+	}
+
+	while (level) {
+		level--;
+
+		i = (iter->offset >> genradix_depth_shift(level)) &
+			(GENRADIX_ARY - 1);
+
+		while (!n->children[i]) {
+			size_t objs_per_ptr = genradix_depth_size(level);
+
+			iter->offset = round_down(iter->offset, objs_per_ptr);
+			iter->pos = (iter->offset >> PAGE_SHIFT) * objs_per_page;
+
+			if (!iter->offset)
+				return NULL;
+
+			iter->offset -= obj_size_plus_page_remainder;
+			iter->pos--;
+
+			if (!i)
+				goto restart;
+			--i;
+		}
+
+		n = n->children[i];
+	}
+
+	return &n->data[iter->offset & (PAGE_SIZE - 1)];
+}
+EXPORT_SYMBOL(__genradix_iter_peek_prev);
+
 static void genradix_free_recurse(struct genradix_node *n, unsigned level)
 {
 	if (level) {
-- 
cgit v1.2.3


From a615f67e1a426f35366b8398c11f31c148e7df48 Mon Sep 17 00:00:00 2001
From: Breno Leitao <leitao@debian.org>
Date: Mon, 16 Oct 2023 06:47:39 -0700
Subject: bpf: Add sockptr support for getsockopt

The whole network stack uses sockptr, and while it doesn't move to
something more modern, let's use sockptr in getsockptr BPF hooks, so, it
could be used by other callers.

The main motivation for this change is to use it in the io_uring
{g,s}etsockopt(), which will use a userspace pointer for *optval, but, a
kernel value for optlen.

Link: https://lore.kernel.org/all/ZSArfLaaGcfd8LH8@gmail.com/

Signed-off-by: Breno Leitao <leitao@debian.org>
Acked-by: Martin KaFai Lau <martin.lau@kernel.org>
Link: https://lore.kernel.org/r/20231016134750.1381153-2-leitao@debian.org
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/bpf-cgroup.h |  5 +++--
 kernel/bpf/cgroup.c        | 20 +++++++++++---------
 net/socket.c               |  5 +++--
 3 files changed, 17 insertions(+), 13 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h
index 8506690dbb9c..f5b4fb6ed8c6 100644
--- a/include/linux/bpf-cgroup.h
+++ b/include/linux/bpf-cgroup.h
@@ -139,9 +139,10 @@ int __cgroup_bpf_run_filter_sysctl(struct ctl_table_header *head,
 int __cgroup_bpf_run_filter_setsockopt(struct sock *sock, int *level,
 				       int *optname, char __user *optval,
 				       int *optlen, char **kernel_optval);
+
 int __cgroup_bpf_run_filter_getsockopt(struct sock *sk, int level,
-				       int optname, char __user *optval,
-				       int __user *optlen, int max_optlen,
+				       int optname, sockptr_t optval,
+				       sockptr_t optlen, int max_optlen,
 				       int retval);
 
 int __cgroup_bpf_run_filter_getsockopt_kern(struct sock *sk, int level,
diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c
index 5b2741aa0d9b..ebc8c58f7e46 100644
--- a/kernel/bpf/cgroup.c
+++ b/kernel/bpf/cgroup.c
@@ -1875,8 +1875,8 @@ out:
 }
 
 int __cgroup_bpf_run_filter_getsockopt(struct sock *sk, int level,
-				       int optname, char __user *optval,
-				       int __user *optlen, int max_optlen,
+				       int optname, sockptr_t optval,
+				       sockptr_t optlen, int max_optlen,
 				       int retval)
 {
 	struct cgroup *cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data);
@@ -1903,8 +1903,8 @@ int __cgroup_bpf_run_filter_getsockopt(struct sock *sk, int level,
 		 * one that kernel returned as well to let
 		 * BPF programs inspect the value.
 		 */
-
-		if (get_user(ctx.optlen, optlen)) {
+		if (copy_from_sockptr(&ctx.optlen, optlen,
+				      sizeof(ctx.optlen))) {
 			ret = -EFAULT;
 			goto out;
 		}
@@ -1915,8 +1915,8 @@ int __cgroup_bpf_run_filter_getsockopt(struct sock *sk, int level,
 		}
 		orig_optlen = ctx.optlen;
 
-		if (copy_from_user(ctx.optval, optval,
-				   min(ctx.optlen, max_optlen)) != 0) {
+		if (copy_from_sockptr(ctx.optval, optval,
+				      min(ctx.optlen, max_optlen))) {
 			ret = -EFAULT;
 			goto out;
 		}
@@ -1930,7 +1930,8 @@ int __cgroup_bpf_run_filter_getsockopt(struct sock *sk, int level,
 	if (ret < 0)
 		goto out;
 
-	if (optval && (ctx.optlen > max_optlen || ctx.optlen < 0)) {
+	if (!sockptr_is_null(optval) &&
+	    (ctx.optlen > max_optlen || ctx.optlen < 0)) {
 		if (orig_optlen > PAGE_SIZE && ctx.optlen >= 0) {
 			pr_info_once("bpf getsockopt: ignoring program buffer with optlen=%d (max_optlen=%d)\n",
 				     ctx.optlen, max_optlen);
@@ -1942,11 +1943,12 @@ int __cgroup_bpf_run_filter_getsockopt(struct sock *sk, int level,
 	}
 
 	if (ctx.optlen != 0) {
-		if (optval && copy_to_user(optval, ctx.optval, ctx.optlen)) {
+		if (!sockptr_is_null(optval) &&
+		    copy_to_sockptr(optval, ctx.optval, ctx.optlen)) {
 			ret = -EFAULT;
 			goto out;
 		}
-		if (put_user(ctx.optlen, optlen)) {
+		if (copy_to_sockptr(optlen, &ctx.optlen, sizeof(ctx.optlen))) {
 			ret = -EFAULT;
 			goto out;
 		}
diff --git a/net/socket.c b/net/socket.c
index c8b08b32f097..82cd6890a4f0 100644
--- a/net/socket.c
+++ b/net/socket.c
@@ -2356,8 +2356,9 @@ int __sys_getsockopt(int fd, int level, int optname, char __user *optval,
 
 	if (!in_compat_syscall())
 		err = BPF_CGROUP_RUN_PROG_GETSOCKOPT(sock->sk, level, optname,
-						     optval, optlen, max_optlen,
-						     err);
+						     USER_SOCKPTR(optval),
+						     USER_SOCKPTR(optlen),
+						     max_optlen, err);
 out_put:
 	fput_light(sock->file, fput_needed);
 	return err;
-- 
cgit v1.2.3


From 3f31e0d14d44ad491a81b7c1f83f32fbc300a867 Mon Sep 17 00:00:00 2001
From: Breno Leitao <leitao@debian.org>
Date: Mon, 16 Oct 2023 06:47:40 -0700
Subject: bpf: Add sockptr support for setsockopt

The whole network stack uses sockptr, and while it doesn't move to
something more modern, let's use sockptr in setsockptr BPF hooks, so, it
could be used by other callers.

The main motivation for this change is to use it in the io_uring
{g,s}etsockopt(), which will use a userspace pointer for *optval, but, a
kernel value for optlen.

Link: https://lore.kernel.org/all/ZSArfLaaGcfd8LH8@gmail.com/

Signed-off-by: Breno Leitao <leitao@debian.org>
Acked-by: Martin KaFai Lau <martin.lau@kernel.org>
Link: https://lore.kernel.org/r/20231016134750.1381153-3-leitao@debian.org
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/bpf-cgroup.h | 2 +-
 kernel/bpf/cgroup.c        | 5 +++--
 net/socket.c               | 2 +-
 3 files changed, 5 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h
index f5b4fb6ed8c6..cecfe8c99f28 100644
--- a/include/linux/bpf-cgroup.h
+++ b/include/linux/bpf-cgroup.h
@@ -137,7 +137,7 @@ int __cgroup_bpf_run_filter_sysctl(struct ctl_table_header *head,
 				   enum cgroup_bpf_attach_type atype);
 
 int __cgroup_bpf_run_filter_setsockopt(struct sock *sock, int *level,
-				       int *optname, char __user *optval,
+				       int *optname, sockptr_t optval,
 				       int *optlen, char **kernel_optval);
 
 int __cgroup_bpf_run_filter_getsockopt(struct sock *sk, int level,
diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c
index ebc8c58f7e46..f0dedd4f7f2e 100644
--- a/kernel/bpf/cgroup.c
+++ b/kernel/bpf/cgroup.c
@@ -1785,7 +1785,7 @@ static bool sockopt_buf_allocated(struct bpf_sockopt_kern *ctx,
 }
 
 int __cgroup_bpf_run_filter_setsockopt(struct sock *sk, int *level,
-				       int *optname, char __user *optval,
+				       int *optname, sockptr_t optval,
 				       int *optlen, char **kernel_optval)
 {
 	struct cgroup *cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data);
@@ -1808,7 +1808,8 @@ int __cgroup_bpf_run_filter_setsockopt(struct sock *sk, int *level,
 
 	ctx.optlen = *optlen;
 
-	if (copy_from_user(ctx.optval, optval, min(*optlen, max_optlen)) != 0) {
+	if (copy_from_sockptr(ctx.optval, optval,
+			      min(*optlen, max_optlen))) {
 		ret = -EFAULT;
 		goto out;
 	}
diff --git a/net/socket.c b/net/socket.c
index 82cd6890a4f0..9a43f6e850c7 100644
--- a/net/socket.c
+++ b/net/socket.c
@@ -2288,7 +2288,7 @@ int __sys_setsockopt(int fd, int level, int optname, char __user *user_optval,
 
 	if (!in_compat_syscall())
 		err = BPF_CGROUP_RUN_PROG_SETSOCKOPT(sock->sk, &level, &optname,
-						     user_optval, &optlen,
+						     optval, &optlen,
 						     &kernel_optval);
 	if (err < 0)
 		goto out_put;
-- 
cgit v1.2.3


From 0b05b0cd78c92371fdde6333d006f39eaf9e0860 Mon Sep 17 00:00:00 2001
From: Breno Leitao <leitao@debian.org>
Date: Mon, 16 Oct 2023 06:47:42 -0700
Subject: net/socket: Break down __sys_getsockopt

Split __sys_getsockopt() into two functions by removing the core
logic into a sub-function (do_sock_getsockopt()). This will avoid
code duplication when doing the same operation in other callers, for
instance.

do_sock_getsockopt() will be called by io_uring getsockopt() command
operation in the following patch.

The same was done for the setsockopt pair.

Suggested-by: Martin KaFai Lau <martin.lau@linux.dev>
Signed-off-by: Breno Leitao <leitao@debian.org>
Acked-by: Jakub Kicinski <kuba@kernel.org>
Acked-by: Martin KaFai Lau <martin.lau@kernel.org>
Link: https://lore.kernel.org/r/20231016134750.1381153-5-leitao@debian.org
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/bpf-cgroup.h |  2 +-
 include/net/sock.h         |  4 +--
 net/core/sock.c            |  8 ------
 net/socket.c               | 64 +++++++++++++++++++++++++++++-----------------
 4 files changed, 44 insertions(+), 34 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h
index cecfe8c99f28..ffaca1ab5e8d 100644
--- a/include/linux/bpf-cgroup.h
+++ b/include/linux/bpf-cgroup.h
@@ -378,7 +378,7 @@ static inline bool cgroup_bpf_sock_enabled(struct sock *sk,
 ({									       \
 	int __ret = 0;							       \
 	if (cgroup_bpf_enabled(CGROUP_GETSOCKOPT))			       \
-		get_user(__ret, optlen);				       \
+		copy_from_sockptr(&__ret, optlen, sizeof(int));		       \
 	__ret;								       \
 })
 
diff --git a/include/net/sock.h b/include/net/sock.h
index 24fbf7182386..cffb77aa71ab 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -1869,11 +1869,11 @@ int sock_setsockopt(struct socket *sock, int level, int op,
 		    sockptr_t optval, unsigned int optlen);
 int do_sock_setsockopt(struct socket *sock, bool compat, int level,
 		       int optname, sockptr_t optval, int optlen);
+int do_sock_getsockopt(struct socket *sock, bool compat, int level,
+		       int optname, sockptr_t optval, sockptr_t optlen);
 
 int sk_getsockopt(struct sock *sk, int level, int optname,
 		  sockptr_t optval, sockptr_t optlen);
-int sock_getsockopt(struct socket *sock, int level, int op,
-		    char __user *optval, int __user *optlen);
 int sock_gettstamp(struct socket *sock, void __user *userstamp,
 		   bool timeval, bool time32);
 struct sk_buff *sock_alloc_send_pskb(struct sock *sk, unsigned long header_len,
diff --git a/net/core/sock.c b/net/core/sock.c
index 16584e2dd648..223eed53fb7d 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -2010,14 +2010,6 @@ lenout:
 	return 0;
 }
 
-int sock_getsockopt(struct socket *sock, int level, int optname,
-		    char __user *optval, int __user *optlen)
-{
-	return sk_getsockopt(sock->sk, level, optname,
-			     USER_SOCKPTR(optval),
-			     USER_SOCKPTR(optlen));
-}
-
 /*
  * Initialize an sk_lock.
  *
diff --git a/net/socket.c b/net/socket.c
index 63be25d762f6..eb6960958026 100644
--- a/net/socket.c
+++ b/net/socket.c
@@ -2333,6 +2333,43 @@ SYSCALL_DEFINE5(setsockopt, int, fd, int, level, int, optname,
 INDIRECT_CALLABLE_DECLARE(bool tcp_bpf_bypass_getsockopt(int level,
 							 int optname));
 
+int do_sock_getsockopt(struct socket *sock, bool compat, int level,
+		       int optname, sockptr_t optval, sockptr_t optlen)
+{
+	int max_optlen __maybe_unused;
+	const struct proto_ops *ops;
+	int err;
+
+	err = security_socket_getsockopt(sock, level, optname);
+	if (err)
+		return err;
+
+	if (!compat)
+		max_optlen = BPF_CGROUP_GETSOCKOPT_MAX_OPTLEN(optlen);
+
+	ops = READ_ONCE(sock->ops);
+	if (level == SOL_SOCKET) {
+		err = sk_getsockopt(sock->sk, level, optname, optval, optlen);
+	} else if (unlikely(!ops->getsockopt)) {
+		err = -EOPNOTSUPP;
+	} else {
+		if (WARN_ONCE(optval.is_kernel || optlen.is_kernel,
+			      "Invalid argument type"))
+			return -EOPNOTSUPP;
+
+		err = ops->getsockopt(sock, level, optname, optval.user,
+				      optlen.user);
+	}
+
+	if (!compat)
+		err = BPF_CGROUP_RUN_PROG_GETSOCKOPT(sock->sk, level, optname,
+						     optval, optlen, max_optlen,
+						     err);
+
+	return err;
+}
+EXPORT_SYMBOL(do_sock_getsockopt);
+
 /*
  *	Get a socket option. Because we don't know the option lengths we have
  *	to pass a user mode parameter for the protocols to sort out.
@@ -2340,37 +2377,18 @@ INDIRECT_CALLABLE_DECLARE(bool tcp_bpf_bypass_getsockopt(int level,
 int __sys_getsockopt(int fd, int level, int optname, char __user *optval,
 		int __user *optlen)
 {
-	int max_optlen __maybe_unused;
-	const struct proto_ops *ops;
 	int err, fput_needed;
 	struct socket *sock;
+	bool compat;
 
 	sock = sockfd_lookup_light(fd, &err, &fput_needed);
 	if (!sock)
 		return err;
 
-	err = security_socket_getsockopt(sock, level, optname);
-	if (err)
-		goto out_put;
+	compat = in_compat_syscall();
+	err = do_sock_getsockopt(sock, compat, level, optname,
+				 USER_SOCKPTR(optval), USER_SOCKPTR(optlen));
 
-	if (!in_compat_syscall())
-		max_optlen = BPF_CGROUP_GETSOCKOPT_MAX_OPTLEN(optlen);
-
-	ops = READ_ONCE(sock->ops);
-	if (level == SOL_SOCKET)
-		err = sock_getsockopt(sock, level, optname, optval, optlen);
-	else if (unlikely(!ops->getsockopt))
-		err = -EOPNOTSUPP;
-	else
-		err = ops->getsockopt(sock, level, optname, optval,
-					    optlen);
-
-	if (!in_compat_syscall())
-		err = BPF_CGROUP_RUN_PROG_GETSOCKOPT(sock->sk, level, optname,
-						     USER_SOCKPTR(optval),
-						     USER_SOCKPTR(optlen),
-						     max_optlen, err);
-out_put:
 	fput_light(sock->file, fput_needed);
 	return err;
 }
-- 
cgit v1.2.3


From 5fea44a6e05b86bf49019fbbf2ab30098d03e0dc Mon Sep 17 00:00:00 2001
From: Breno Leitao <leitao@debian.org>
Date: Mon, 16 Oct 2023 06:47:43 -0700
Subject: io_uring/cmd: Pass compat mode in issue_flags

Create a new flag to track if the operation is running compat mode.
This basically check the context->compat and pass it to the issue_flags,
so, it could be queried later in the callbacks.

Signed-off-by: Breno Leitao <leitao@debian.org>
Reviewed-by: Gabriel Krisman Bertazi <krisman@suse.de>
Link: https://lore.kernel.org/r/20231016134750.1381153-6-leitao@debian.org
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/io_uring.h | 1 +
 io_uring/uring_cmd.c     | 2 ++
 2 files changed, 3 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/io_uring.h b/include/linux/io_uring.h
index b4391e0a9bc8..aefb73eeeebf 100644
--- a/include/linux/io_uring.h
+++ b/include/linux/io_uring.h
@@ -23,6 +23,7 @@ enum io_uring_cmd_flags {
 
 	/* set when uring wants to cancel a previously issued command */
 	IO_URING_F_CANCEL		= (1 << 11),
+	IO_URING_F_COMPAT		= (1 << 12),
 };
 
 /* only top 8 bits of sqe->uring_cmd_flags for kernel internal use */
diff --git a/io_uring/uring_cmd.c b/io_uring/uring_cmd.c
index 00a5e5621a28..4bedd633c08c 100644
--- a/io_uring/uring_cmd.c
+++ b/io_uring/uring_cmd.c
@@ -175,6 +175,8 @@ int io_uring_cmd(struct io_kiocb *req, unsigned int issue_flags)
 		issue_flags |= IO_URING_F_SQE128;
 	if (ctx->flags & IORING_SETUP_CQE32)
 		issue_flags |= IO_URING_F_CQE32;
+	if (ctx->compat)
+		issue_flags |= IO_URING_F_COMPAT;
 	if (ctx->flags & IORING_SETUP_IOPOLL) {
 		if (!file->f_op->uring_cmd_iopoll)
 			return -EOPNOTSUPP;
-- 
cgit v1.2.3


From 6da88306811b40a207c94c9da9faf07bdb20776e Mon Sep 17 00:00:00 2001
From: Chuyi Zhou <zhouchuyi@bytedance.com>
Date: Wed, 18 Oct 2023 14:17:39 +0800
Subject: cgroup: Prepare for using css_task_iter_*() in BPF

This patch makes some preparations for using css_task_iter_*() in BPF
Program.

1. Flags CSS_TASK_ITER_* are #define-s and it's not easy for bpf prog to
use them. Convert them to enum so bpf prog can take them from vmlinux.h.

2. In the next patch we will add css_task_iter_*() in common kfuncs which
is not safe. Since css_task_iter_*() does spin_unlock_irq() which might
screw up irq flags depending on the context where bpf prog is running.
So we should use irqsave/irqrestore here and the switching is harmless.

Suggested-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: Chuyi Zhou <zhouchuyi@bytedance.com>
Acked-by: Tejun Heo <tj@kernel.org>
Link: https://lore.kernel.org/r/20231018061746.111364-2-zhouchuyi@bytedance.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/cgroup.h | 12 +++++-------
 kernel/cgroup/cgroup.c | 18 ++++++++++++------
 2 files changed, 17 insertions(+), 13 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index b307013b9c6c..0ef0af66080e 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -40,13 +40,11 @@ struct kernel_clone_args;
 #define CGROUP_WEIGHT_DFL		100
 #define CGROUP_WEIGHT_MAX		10000
 
-/* walk only threadgroup leaders */
-#define CSS_TASK_ITER_PROCS		(1U << 0)
-/* walk all threaded css_sets in the domain */
-#define CSS_TASK_ITER_THREADED		(1U << 1)
-
-/* internal flags */
-#define CSS_TASK_ITER_SKIPPED		(1U << 16)
+enum {
+	CSS_TASK_ITER_PROCS    = (1U << 0),  /* walk only threadgroup leaders */
+	CSS_TASK_ITER_THREADED = (1U << 1),  /* walk all threaded css_sets in the domain */
+	CSS_TASK_ITER_SKIPPED  = (1U << 16), /* internal flags */
+};
 
 /* a css_task_iter should be treated as an opaque object */
 struct css_task_iter {
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index 1fb7f562289d..b6d64f3b8888 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -4917,9 +4917,11 @@ repeat:
 void css_task_iter_start(struct cgroup_subsys_state *css, unsigned int flags,
 			 struct css_task_iter *it)
 {
+	unsigned long irqflags;
+
 	memset(it, 0, sizeof(*it));
 
-	spin_lock_irq(&css_set_lock);
+	spin_lock_irqsave(&css_set_lock, irqflags);
 
 	it->ss = css->ss;
 	it->flags = flags;
@@ -4933,7 +4935,7 @@ void css_task_iter_start(struct cgroup_subsys_state *css, unsigned int flags,
 
 	css_task_iter_advance(it);
 
-	spin_unlock_irq(&css_set_lock);
+	spin_unlock_irqrestore(&css_set_lock, irqflags);
 }
 
 /**
@@ -4946,12 +4948,14 @@ void css_task_iter_start(struct cgroup_subsys_state *css, unsigned int flags,
  */
 struct task_struct *css_task_iter_next(struct css_task_iter *it)
 {
+	unsigned long irqflags;
+
 	if (it->cur_task) {
 		put_task_struct(it->cur_task);
 		it->cur_task = NULL;
 	}
 
-	spin_lock_irq(&css_set_lock);
+	spin_lock_irqsave(&css_set_lock, irqflags);
 
 	/* @it may be half-advanced by skips, finish advancing */
 	if (it->flags & CSS_TASK_ITER_SKIPPED)
@@ -4964,7 +4968,7 @@ struct task_struct *css_task_iter_next(struct css_task_iter *it)
 		css_task_iter_advance(it);
 	}
 
-	spin_unlock_irq(&css_set_lock);
+	spin_unlock_irqrestore(&css_set_lock, irqflags);
 
 	return it->cur_task;
 }
@@ -4977,11 +4981,13 @@ struct task_struct *css_task_iter_next(struct css_task_iter *it)
  */
 void css_task_iter_end(struct css_task_iter *it)
 {
+	unsigned long irqflags;
+
 	if (it->cur_cset) {
-		spin_lock_irq(&css_set_lock);
+		spin_lock_irqsave(&css_set_lock, irqflags);
 		list_del(&it->iters_node);
 		put_css_set_locked(it->cur_cset);
-		spin_unlock_irq(&css_set_lock);
+		spin_unlock_irqrestore(&css_set_lock, irqflags);
 	}
 
 	if (it->cur_dcset)
-- 
cgit v1.2.3


From dfab99df147b0d364f0c199f832ff2aedfb2265a Mon Sep 17 00:00:00 2001
From: Chuyi Zhou <zhouchuyi@bytedance.com>
Date: Wed, 18 Oct 2023 14:17:43 +0800
Subject: bpf: teach the verifier to enforce css_iter and task_iter in RCU CS

css_iter and task_iter should be used in rcu section. Specifically, in
sleepable progs explicit bpf_rcu_read_lock() is needed before use these
iters. In normal bpf progs that have implicit rcu_read_lock(), it's OK to
use them directly.

This patch adds a new a KF flag KF_RCU_PROTECTED for bpf_iter_task_new and
bpf_iter_css_new. It means the kfunc should be used in RCU CS. We check
whether we are in rcu cs before we want to invoke this kfunc. If the rcu
protection is guaranteed, we would let st->type = PTR_TO_STACK | MEM_RCU.
Once user do rcu_unlock during the iteration, state MEM_RCU of regs would
be cleared. is_iter_reg_valid_init() will reject if reg->type is UNTRUSTED.

It is worth noting that currently, bpf_rcu_read_unlock does not
clear the state of the STACK_ITER reg, since bpf_for_each_spilled_reg
only considers STACK_SPILL. This patch also let bpf_for_each_spilled_reg
search STACK_ITER.

Signed-off-by: Chuyi Zhou <zhouchuyi@bytedance.com>
Acked-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/r/20231018061746.111364-6-zhouchuyi@bytedance.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/bpf_verifier.h | 19 ++++++++++-------
 include/linux/btf.h          |  1 +
 kernel/bpf/helpers.c         |  4 ++--
 kernel/bpf/verifier.c        | 50 ++++++++++++++++++++++++++++++++++----------
 4 files changed, 53 insertions(+), 21 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h
index 94ec766432f5..e67cd45a85be 100644
--- a/include/linux/bpf_verifier.h
+++ b/include/linux/bpf_verifier.h
@@ -386,19 +386,18 @@ struct bpf_verifier_state {
 	u32 jmp_history_cnt;
 };
 
-#define bpf_get_spilled_reg(slot, frame)				\
+#define bpf_get_spilled_reg(slot, frame, mask)				\
 	(((slot < frame->allocated_stack / BPF_REG_SIZE) &&		\
-	  (frame->stack[slot].slot_type[0] == STACK_SPILL))		\
+	  ((1 << frame->stack[slot].slot_type[0]) & (mask))) \
 	 ? &frame->stack[slot].spilled_ptr : NULL)
 
 /* Iterate over 'frame', setting 'reg' to either NULL or a spilled register. */
-#define bpf_for_each_spilled_reg(iter, frame, reg)			\
-	for (iter = 0, reg = bpf_get_spilled_reg(iter, frame);		\
+#define bpf_for_each_spilled_reg(iter, frame, reg, mask)			\
+	for (iter = 0, reg = bpf_get_spilled_reg(iter, frame, mask);		\
 	     iter < frame->allocated_stack / BPF_REG_SIZE;		\
-	     iter++, reg = bpf_get_spilled_reg(iter, frame))
+	     iter++, reg = bpf_get_spilled_reg(iter, frame, mask))
 
-/* Invoke __expr over regsiters in __vst, setting __state and __reg */
-#define bpf_for_each_reg_in_vstate(__vst, __state, __reg, __expr)   \
+#define bpf_for_each_reg_in_vstate_mask(__vst, __state, __reg, __mask, __expr)   \
 	({                                                               \
 		struct bpf_verifier_state *___vstate = __vst;            \
 		int ___i, ___j;                                          \
@@ -410,7 +409,7 @@ struct bpf_verifier_state {
 				__reg = &___regs[___j];                  \
 				(void)(__expr);                          \
 			}                                                \
-			bpf_for_each_spilled_reg(___j, __state, __reg) { \
+			bpf_for_each_spilled_reg(___j, __state, __reg, __mask) { \
 				if (!__reg)                              \
 					continue;                        \
 				(void)(__expr);                          \
@@ -418,6 +417,10 @@ struct bpf_verifier_state {
 		}                                                        \
 	})
 
+/* Invoke __expr over regsiters in __vst, setting __state and __reg */
+#define bpf_for_each_reg_in_vstate(__vst, __state, __reg, __expr) \
+	bpf_for_each_reg_in_vstate_mask(__vst, __state, __reg, 1 << STACK_SPILL, __expr)
+
 /* linked list of verifier states used to prune search */
 struct bpf_verifier_state_list {
 	struct bpf_verifier_state state;
diff --git a/include/linux/btf.h b/include/linux/btf.h
index 928113a80a95..c2231c64d60b 100644
--- a/include/linux/btf.h
+++ b/include/linux/btf.h
@@ -74,6 +74,7 @@
 #define KF_ITER_NEW     (1 << 8) /* kfunc implements BPF iter constructor */
 #define KF_ITER_NEXT    (1 << 9) /* kfunc implements BPF iter next method */
 #define KF_ITER_DESTROY (1 << 10) /* kfunc implements BPF iter destructor */
+#define KF_RCU_PROTECTED (1 << 11) /* kfunc should be protected by rcu cs when they are invoked */
 
 /*
  * Tag marking a kernel function as a kfunc. This is meant to minimize the
diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
index b1d285ed4796..da058aead20c 100644
--- a/kernel/bpf/helpers.c
+++ b/kernel/bpf/helpers.c
@@ -2563,10 +2563,10 @@ BTF_ID_FLAGS(func, bpf_iter_task_vma_destroy, KF_ITER_DESTROY)
 BTF_ID_FLAGS(func, bpf_iter_css_task_new, KF_ITER_NEW | KF_TRUSTED_ARGS)
 BTF_ID_FLAGS(func, bpf_iter_css_task_next, KF_ITER_NEXT | KF_RET_NULL)
 BTF_ID_FLAGS(func, bpf_iter_css_task_destroy, KF_ITER_DESTROY)
-BTF_ID_FLAGS(func, bpf_iter_task_new, KF_ITER_NEW | KF_TRUSTED_ARGS)
+BTF_ID_FLAGS(func, bpf_iter_task_new, KF_ITER_NEW | KF_TRUSTED_ARGS | KF_RCU_PROTECTED)
 BTF_ID_FLAGS(func, bpf_iter_task_next, KF_ITER_NEXT | KF_RET_NULL)
 BTF_ID_FLAGS(func, bpf_iter_task_destroy, KF_ITER_DESTROY)
-BTF_ID_FLAGS(func, bpf_iter_css_new, KF_ITER_NEW | KF_TRUSTED_ARGS)
+BTF_ID_FLAGS(func, bpf_iter_css_new, KF_ITER_NEW | KF_TRUSTED_ARGS | KF_RCU_PROTECTED)
 BTF_ID_FLAGS(func, bpf_iter_css_next, KF_ITER_NEXT | KF_RET_NULL)
 BTF_ID_FLAGS(func, bpf_iter_css_destroy, KF_ITER_DESTROY)
 BTF_ID_FLAGS(func, bpf_dynptr_adjust)
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 974713185269..fcdf2382153a 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -1173,7 +1173,12 @@ static bool is_dynptr_type_expected(struct bpf_verifier_env *env, struct bpf_reg
 
 static void __mark_reg_known_zero(struct bpf_reg_state *reg);
 
+static bool in_rcu_cs(struct bpf_verifier_env *env);
+
+static bool is_kfunc_rcu_protected(struct bpf_kfunc_call_arg_meta *meta);
+
 static int mark_stack_slots_iter(struct bpf_verifier_env *env,
+				 struct bpf_kfunc_call_arg_meta *meta,
 				 struct bpf_reg_state *reg, int insn_idx,
 				 struct btf *btf, u32 btf_id, int nr_slots)
 {
@@ -1194,6 +1199,12 @@ static int mark_stack_slots_iter(struct bpf_verifier_env *env,
 
 		__mark_reg_known_zero(st);
 		st->type = PTR_TO_STACK; /* we don't have dedicated reg type */
+		if (is_kfunc_rcu_protected(meta)) {
+			if (in_rcu_cs(env))
+				st->type |= MEM_RCU;
+			else
+				st->type |= PTR_UNTRUSTED;
+		}
 		st->live |= REG_LIVE_WRITTEN;
 		st->ref_obj_id = i == 0 ? id : 0;
 		st->iter.btf = btf;
@@ -1268,7 +1279,7 @@ static bool is_iter_reg_valid_uninit(struct bpf_verifier_env *env,
 	return true;
 }
 
-static bool is_iter_reg_valid_init(struct bpf_verifier_env *env, struct bpf_reg_state *reg,
+static int is_iter_reg_valid_init(struct bpf_verifier_env *env, struct bpf_reg_state *reg,
 				   struct btf *btf, u32 btf_id, int nr_slots)
 {
 	struct bpf_func_state *state = func(env, reg);
@@ -1276,26 +1287,28 @@ static bool is_iter_reg_valid_init(struct bpf_verifier_env *env, struct bpf_reg_
 
 	spi = iter_get_spi(env, reg, nr_slots);
 	if (spi < 0)
-		return false;
+		return -EINVAL;
 
 	for (i = 0; i < nr_slots; i++) {
 		struct bpf_stack_state *slot = &state->stack[spi - i];
 		struct bpf_reg_state *st = &slot->spilled_ptr;
 
+		if (st->type & PTR_UNTRUSTED)
+			return -EPROTO;
 		/* only main (first) slot has ref_obj_id set */
 		if (i == 0 && !st->ref_obj_id)
-			return false;
+			return -EINVAL;
 		if (i != 0 && st->ref_obj_id)
-			return false;
+			return -EINVAL;
 		if (st->iter.btf != btf || st->iter.btf_id != btf_id)
-			return false;
+			return -EINVAL;
 
 		for (j = 0; j < BPF_REG_SIZE; j++)
 			if (slot->slot_type[j] != STACK_ITER)
-				return false;
+				return -EINVAL;
 	}
 
-	return true;
+	return 0;
 }
 
 /* Check if given stack slot is "special":
@@ -7640,15 +7653,24 @@ static int process_iter_arg(struct bpf_verifier_env *env, int regno, int insn_id
 				return err;
 		}
 
-		err = mark_stack_slots_iter(env, reg, insn_idx, meta->btf, btf_id, nr_slots);
+		err = mark_stack_slots_iter(env, meta, reg, insn_idx, meta->btf, btf_id, nr_slots);
 		if (err)
 			return err;
 	} else {
 		/* iter_next() or iter_destroy() expect initialized iter state*/
-		if (!is_iter_reg_valid_init(env, reg, meta->btf, btf_id, nr_slots)) {
+		err = is_iter_reg_valid_init(env, reg, meta->btf, btf_id, nr_slots);
+		switch (err) {
+		case 0:
+			break;
+		case -EINVAL:
 			verbose(env, "expected an initialized iter_%s as arg #%d\n",
 				iter_type_str(meta->btf, btf_id), regno);
-			return -EINVAL;
+			return err;
+		case -EPROTO:
+			verbose(env, "expected an RCU CS when using %s\n", meta->func_name);
+			return err;
+		default:
+			return err;
 		}
 
 		spi = iter_get_spi(env, reg, nr_slots);
@@ -10231,6 +10253,11 @@ static bool is_kfunc_rcu(struct bpf_kfunc_call_arg_meta *meta)
 	return meta->kfunc_flags & KF_RCU;
 }
 
+static bool is_kfunc_rcu_protected(struct bpf_kfunc_call_arg_meta *meta)
+{
+	return meta->kfunc_flags & KF_RCU_PROTECTED;
+}
+
 static bool __kfunc_param_match_suffix(const struct btf *btf,
 				       const struct btf_param *arg,
 				       const char *suffix)
@@ -11582,6 +11609,7 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
 	if (env->cur_state->active_rcu_lock) {
 		struct bpf_func_state *state;
 		struct bpf_reg_state *reg;
+		u32 clear_mask = (1 << STACK_SPILL) | (1 << STACK_ITER);
 
 		if (in_rbtree_lock_required_cb(env) && (rcu_lock || rcu_unlock)) {
 			verbose(env, "Calling bpf_rcu_read_{lock,unlock} in unnecessary rbtree callback\n");
@@ -11592,7 +11620,7 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
 			verbose(env, "nested rcu read lock (kernel function %s)\n", func_name);
 			return -EINVAL;
 		} else if (rcu_unlock) {
-			bpf_for_each_reg_in_vstate(env->cur_state, state, reg, ({
+			bpf_for_each_reg_in_vstate_mask(env->cur_state, state, reg, clear_mask, ({
 				if (reg->type & MEM_RCU) {
 					reg->type &= ~(MEM_RCU | PTR_MAYBE_NULL);
 					reg->type |= PTR_UNTRUSTED;
-- 
cgit v1.2.3


From 70e6f7e2b98575621019aa40ac616be58ff984e0 Mon Sep 17 00:00:00 2001
From: Dan Williams <dan.j.williams@intel.com>
Date: Mon, 25 Sep 2023 20:13:29 -0700
Subject: configfs-tsm: Introduce a shared ABI for attestation reports

One of the common operations of a TSM (Trusted Security Module) is to
provide a way for a TVM (confidential computing guest execution
environment) to take a measurement of its launch state, sign it and
submit it to a verifying party. Upon successful attestation that
verifies the integrity of the TVM additional secrets may be deployed.
The concept is common across TSMs, but the implementations are
unfortunately vendor specific. While the industry grapples with a common
definition of this attestation format [1], Linux need not make this
problem worse by defining a new ABI per TSM that wants to perform a
similar operation. The current momentum has been to invent new ioctl-ABI
per TSM per function which at best is an abdication of the kernel's
responsibility to make common infrastructure concepts share common ABI.

The proposal, targeted to conceptually work with TDX, SEV-SNP, COVE if
not more, is to define a configfs interface to retrieve the TSM-specific
blob.

    report=/sys/kernel/config/tsm/report/report0
    mkdir $report
    dd if=binary_userdata_plus_nonce > $report/inblob
    hexdump $report/outblob

This approach later allows for the standardization of the attestation
blob format without needing to invent a new ABI. Once standardization
happens the standard format can be emitted by $report/outblob and
indicated by $report/provider, or a new attribute like
"$report/tcg_coco_report" can emit the standard format alongside the
vendor format.

Review of previous iterations of this interface identified that there is
a need to scale report generation for multiple container environments
[2]. Configfs enables a model where each container can bind mount one or
more report generation item instances. Still, within a container only a
single thread can be manipulating a given configuration instance at a
time. A 'generation' count is provided to detect conflicts between
multiple threads racing to configure a report instance.

The SEV-SNP concepts of "extended reports" and "privilege levels" are
optionally enabled by selecting 'tsm_report_ext_type' at register_tsm()
time. The expectation is that those concepts are generic enough that
they may be adopted by other TSM implementations. In other words,
configfs-tsm aims to address a superset of TSM specific functionality
with a common ABI where attributes may appear, or not appear, based on
the set of concepts the implementation supports.

Link: http://lore.kernel.org/r/64961c3baf8ce_142af829436@dwillia2-xfh.jf.intel.com.notmuch [1]
Link: http://lore.kernel.org/r/57f3a05e-8fcd-4656-beea-56bb8365ae64@linux.microsoft.com [2]
Cc: Kuppuswamy Sathyanarayanan <sathyanarayanan.kuppuswamy@linux.intel.com>
Cc: Dionna Amalie Glaze <dionnaglaze@google.com>
Cc: James Bottomley <James.Bottomley@HansenPartnership.com>
Cc: Peter Gonda <pgonda@google.com>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: Samuel Ortiz <sameo@rivosinc.com>
Acked-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Acked-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Kuppuswamy Sathyanarayanan <sathyanarayanan.kuppuswamy@linux.intel.com>
Tested-by: Kuppuswamy Sathyanarayanan <sathyanarayanan.kuppuswamy@linux.intel.com>
Reviewed-by: Tom Lendacky <thomas.lendacky@amd.com>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 Documentation/ABI/testing/configfs-tsm |  82 +++++++
 MAINTAINERS                            |   8 +
 drivers/virt/coco/Kconfig              |   5 +
 drivers/virt/coco/Makefile             |   1 +
 drivers/virt/coco/tsm.c                | 425 +++++++++++++++++++++++++++++++++
 include/linux/tsm.h                    |  69 ++++++
 6 files changed, 590 insertions(+)
 create mode 100644 Documentation/ABI/testing/configfs-tsm
 create mode 100644 drivers/virt/coco/tsm.c
 create mode 100644 include/linux/tsm.h

(limited to 'include/linux')

diff --git a/Documentation/ABI/testing/configfs-tsm b/Documentation/ABI/testing/configfs-tsm
new file mode 100644
index 000000000000..dd24202b5ba5
--- /dev/null
+++ b/Documentation/ABI/testing/configfs-tsm
@@ -0,0 +1,82 @@
+What:		/sys/kernel/config/tsm/report/$name/inblob
+Date:		September, 2023
+KernelVersion:	v6.7
+Contact:	linux-coco@lists.linux.dev
+Description:
+		(WO) Up to 64 bytes of user specified binary data. For replay
+		protection this should include a nonce, but the kernel does not
+		place any restrictions on the content.
+
+What:		/sys/kernel/config/tsm/report/$name/outblob
+Date:		September, 2023
+KernelVersion:	v6.7
+Contact:	linux-coco@lists.linux.dev
+Description:
+		(RO) Binary attestation report generated from @inblob and other
+		options The format of the report is implementation specific
+		where the implementation is conveyed via the @provider
+		attribute.
+
+What:		/sys/kernel/config/tsm/report/$name/auxblob
+Date:		October, 2023
+KernelVersion:	v6.7
+Contact:	linux-coco@lists.linux.dev
+Description:
+		(RO) Optional supplemental data that a TSM may emit, visibility
+		of this attribute depends on TSM, and may be empty if no
+		auxiliary data is available.
+
+		When @provider is "sev_guest" this file contains the
+		"cert_table" from SEV-ES Guest-Hypervisor Communication Block
+		Standardization v2.03 Section 4.1.8.1 MSG_REPORT_REQ.
+		https://www.amd.com/content/dam/amd/en/documents/epyc-technical-docs/specifications/56421.pdf
+
+What:		/sys/kernel/config/tsm/report/$name/provider
+Date:		September, 2023
+KernelVersion:	v6.7
+Contact:	linux-coco@lists.linux.dev
+Description:
+		(RO) A name for the format-specification of @outblob like
+		"sev_guest" [1] or "tdx_guest" [2] in the near term, or a
+		common standard format in the future.
+
+		[1]: SEV Secure Nested Paging Firmware ABI Specification
+		Revision 1.55 Table 22
+		https://www.amd.com/content/dam/amd/en/documents/epyc-technical-docs/specifications/56860.pdf
+
+		[2]: Intel® Trust Domain Extensions Data Center Attestation
+		Primitives : Quote Generation Library and Quote Verification
+		Library Revision 0.8 Appendix 4,5
+		https://download.01.org/intel-sgx/latest/dcap-latest/linux/docs/Intel_TDX_DCAP_Quoting_Library_API.pdf
+
+What:		/sys/kernel/config/tsm/report/$name/generation
+Date:		September, 2023
+KernelVersion:	v6.7
+Contact:	linux-coco@lists.linux.dev
+Description:
+		(RO) The value in this attribute increments each time @inblob or
+		any option is written. Userspace can detect conflicts by
+		checking generation before writing to any attribute and making
+		sure the number of writes matches expectations after reading
+		@outblob, or it can prevent conflicts by creating a report
+		instance per requesting context.
+
+What:		/sys/kernel/config/tsm/report/$name/privlevel
+Date:		September, 2023
+KernelVersion:	v6.7
+Contact:	linux-coco@lists.linux.dev
+Description:
+		(WO) Attribute is visible if a TSM implementation provider
+		supports the concept of attestation reports for TVMs running at
+		different privilege levels, like SEV-SNP "VMPL", specify the
+		privilege level via this attribute.  The minimum acceptable
+		value is conveyed via @privlevel_floor and the maximum
+		acceptable value is TSM_PRIVLEVEL_MAX (3).
+
+What:		/sys/kernel/config/tsm/report/$name/privlevel_floor
+Date:		September, 2023
+KernelVersion:	v6.7
+Contact:	linux-coco@lists.linux.dev
+Description:
+		(RO) Indicates the minimum permissible value that can be written
+		to @privlevel.
diff --git a/MAINTAINERS b/MAINTAINERS
index b19995690904..8acbeb029ba1 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -21889,6 +21889,14 @@ W:	https://github.com/srcres258/linux-doc
 T:	git git://github.com/srcres258/linux-doc.git doc-zh-tw
 F:	Documentation/translations/zh_TW/
 
+TRUSTED SECURITY MODULE (TSM) ATTESTATION REPORTS
+M:	Dan Williams <dan.j.williams@intel.com>
+L:	linux-coco@lists.linux.dev
+S:	Maintained
+F:	Documentation/ABI/testing/configfs-tsm
+F:	drivers/virt/coco/tsm.c
+F:	include/linux/tsm.h
+
 TTY LAYER AND SERIAL DRIVERS
 M:	Greg Kroah-Hartman <gregkh@linuxfoundation.org>
 M:	Jiri Slaby <jirislaby@kernel.org>
diff --git a/drivers/virt/coco/Kconfig b/drivers/virt/coco/Kconfig
index fc5c64f04c4a..87d142c1f932 100644
--- a/drivers/virt/coco/Kconfig
+++ b/drivers/virt/coco/Kconfig
@@ -2,6 +2,11 @@
 #
 # Confidential computing related collateral
 #
+
+config TSM_REPORTS
+	select CONFIGFS_FS
+	tristate
+
 source "drivers/virt/coco/efi_secret/Kconfig"
 
 source "drivers/virt/coco/sev-guest/Kconfig"
diff --git a/drivers/virt/coco/Makefile b/drivers/virt/coco/Makefile
index 55302ef719ad..18c1aba5edb7 100644
--- a/drivers/virt/coco/Makefile
+++ b/drivers/virt/coco/Makefile
@@ -2,6 +2,7 @@
 #
 # Confidential computing related collateral
 #
+obj-$(CONFIG_TSM_REPORTS)	+= tsm.o
 obj-$(CONFIG_EFI_SECRET)	+= efi_secret/
 obj-$(CONFIG_SEV_GUEST)		+= sev-guest/
 obj-$(CONFIG_INTEL_TDX_GUEST)	+= tdx-guest/
diff --git a/drivers/virt/coco/tsm.c b/drivers/virt/coco/tsm.c
new file mode 100644
index 000000000000..d1c2db83a8ca
--- /dev/null
+++ b/drivers/virt/coco/tsm.c
@@ -0,0 +1,425 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/* Copyright(c) 2023 Intel Corporation. All rights reserved. */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/tsm.h>
+#include <linux/err.h>
+#include <linux/slab.h>
+#include <linux/rwsem.h>
+#include <linux/string.h>
+#include <linux/module.h>
+#include <linux/cleanup.h>
+#include <linux/configfs.h>
+
+static struct tsm_provider {
+	const struct tsm_ops *ops;
+	const struct config_item_type *type;
+	void *data;
+} provider;
+static DECLARE_RWSEM(tsm_rwsem);
+
+/**
+ * DOC: Trusted Security Module (TSM) Attestation Report Interface
+ *
+ * The TSM report interface is a common provider of blobs that facilitate
+ * attestation of a TVM (confidential computing guest) by an attestation
+ * service. A TSM report combines a user-defined blob (likely a public-key with
+ * a nonce for a key-exchange protocol) with a signed attestation report. That
+ * combined blob is then used to obtain secrets provided by an agent that can
+ * validate the attestation report. The expectation is that this interface is
+ * invoked infrequently, however configfs allows for multiple agents to
+ * own their own report generation instances to generate reports as
+ * often as needed.
+ *
+ * The attestation report format is TSM provider specific, when / if a standard
+ * materializes that can be published instead of the vendor layout. Until then
+ * the 'provider' attribute indicates the format of 'outblob', and optionally
+ * 'auxblob'.
+ */
+
+struct tsm_report_state {
+	struct tsm_report report;
+	unsigned long write_generation;
+	unsigned long read_generation;
+	struct config_item cfg;
+};
+
+enum tsm_data_select {
+	TSM_REPORT,
+	TSM_CERTS,
+};
+
+static struct tsm_report *to_tsm_report(struct config_item *cfg)
+{
+	struct tsm_report_state *state =
+		container_of(cfg, struct tsm_report_state, cfg);
+
+	return &state->report;
+}
+
+static struct tsm_report_state *to_state(struct tsm_report *report)
+{
+	return container_of(report, struct tsm_report_state, report);
+}
+
+static int try_advance_write_generation(struct tsm_report *report)
+{
+	struct tsm_report_state *state = to_state(report);
+
+	lockdep_assert_held_write(&tsm_rwsem);
+
+	/*
+	 * Malicious or broken userspace has written enough times for
+	 * read_generation == write_generation by modular arithmetic without an
+	 * interim read. Stop accepting updates until the current report
+	 * configuration is read.
+	 */
+	if (state->write_generation == state->read_generation - 1)
+		return -EBUSY;
+	state->write_generation++;
+	return 0;
+}
+
+static ssize_t tsm_report_privlevel_store(struct config_item *cfg,
+					  const char *buf, size_t len)
+{
+	struct tsm_report *report = to_tsm_report(cfg);
+	unsigned int val;
+	int rc;
+
+	rc = kstrtouint(buf, 0, &val);
+	if (rc)
+		return rc;
+
+	/*
+	 * The valid privilege levels that a TSM might accept, if it accepts a
+	 * privilege level setting at all, are a max of TSM_PRIVLEVEL_MAX (see
+	 * SEV-SNP GHCB) and a minimum of a TSM selected floor value no less
+	 * than 0.
+	 */
+	if (provider.ops->privlevel_floor > val || val > TSM_PRIVLEVEL_MAX)
+		return -EINVAL;
+
+	guard(rwsem_write)(&tsm_rwsem);
+	rc = try_advance_write_generation(report);
+	if (rc)
+		return rc;
+	report->desc.privlevel = val;
+
+	return len;
+}
+CONFIGFS_ATTR_WO(tsm_report_, privlevel);
+
+static ssize_t tsm_report_privlevel_floor_show(struct config_item *cfg,
+					       char *buf)
+{
+	guard(rwsem_read)(&tsm_rwsem);
+	return sysfs_emit(buf, "%u\n", provider.ops->privlevel_floor);
+}
+CONFIGFS_ATTR_RO(tsm_report_, privlevel_floor);
+
+static ssize_t tsm_report_inblob_write(struct config_item *cfg,
+				       const void *buf, size_t count)
+{
+	struct tsm_report *report = to_tsm_report(cfg);
+	int rc;
+
+	guard(rwsem_write)(&tsm_rwsem);
+	rc = try_advance_write_generation(report);
+	if (rc)
+		return rc;
+
+	report->desc.inblob_len = count;
+	memcpy(report->desc.inblob, buf, count);
+	return count;
+}
+CONFIGFS_BIN_ATTR_WO(tsm_report_, inblob, NULL, TSM_INBLOB_MAX);
+
+static ssize_t tsm_report_generation_show(struct config_item *cfg, char *buf)
+{
+	struct tsm_report *report = to_tsm_report(cfg);
+	struct tsm_report_state *state = to_state(report);
+
+	guard(rwsem_read)(&tsm_rwsem);
+	return sysfs_emit(buf, "%lu\n", state->write_generation);
+}
+CONFIGFS_ATTR_RO(tsm_report_, generation);
+
+static ssize_t tsm_report_provider_show(struct config_item *cfg, char *buf)
+{
+	guard(rwsem_read)(&tsm_rwsem);
+	return sysfs_emit(buf, "%s\n", provider.ops->name);
+}
+CONFIGFS_ATTR_RO(tsm_report_, provider);
+
+static ssize_t __read_report(struct tsm_report *report, void *buf, size_t count,
+			     enum tsm_data_select select)
+{
+	loff_t offset = 0;
+	ssize_t len;
+	u8 *out;
+
+	if (select == TSM_REPORT) {
+		out = report->outblob;
+		len = report->outblob_len;
+	} else {
+		out = report->auxblob;
+		len = report->auxblob_len;
+	}
+
+	/*
+	 * Recall that a NULL @buf is configfs requesting the size of
+	 * the buffer.
+	 */
+	if (!buf)
+		return len;
+	return memory_read_from_buffer(buf, count, &offset, out, len);
+}
+
+static ssize_t read_cached_report(struct tsm_report *report, void *buf,
+				  size_t count, enum tsm_data_select select)
+{
+	struct tsm_report_state *state = to_state(report);
+
+	guard(rwsem_read)(&tsm_rwsem);
+	if (!report->desc.inblob_len)
+		return -EINVAL;
+
+	/*
+	 * A given TSM backend always fills in ->outblob regardless of
+	 * whether the report includes an auxblob or not.
+	 */
+	if (!report->outblob ||
+	    state->read_generation != state->write_generation)
+		return -EWOULDBLOCK;
+
+	return __read_report(report, buf, count, select);
+}
+
+static ssize_t tsm_report_read(struct tsm_report *report, void *buf,
+			       size_t count, enum tsm_data_select select)
+{
+	struct tsm_report_state *state = to_state(report);
+	const struct tsm_ops *ops;
+	ssize_t rc;
+
+	/* try to read from the existing report if present and valid... */
+	rc = read_cached_report(report, buf, count, select);
+	if (rc >= 0 || rc != -EWOULDBLOCK)
+		return rc;
+
+	/* slow path, report may need to be regenerated... */
+	guard(rwsem_write)(&tsm_rwsem);
+	ops = provider.ops;
+	if (!ops)
+		return -ENOTTY;
+	if (!report->desc.inblob_len)
+		return -EINVAL;
+
+	/* did another thread already generate this report? */
+	if (report->outblob &&
+	    state->read_generation == state->write_generation)
+		goto out;
+
+	kvfree(report->outblob);
+	kvfree(report->auxblob);
+	report->outblob = NULL;
+	report->auxblob = NULL;
+	rc = ops->report_new(report, provider.data);
+	if (rc < 0)
+		return rc;
+	state->read_generation = state->write_generation;
+out:
+	return __read_report(report, buf, count, select);
+}
+
+static ssize_t tsm_report_outblob_read(struct config_item *cfg, void *buf,
+				       size_t count)
+{
+	struct tsm_report *report = to_tsm_report(cfg);
+
+	return tsm_report_read(report, buf, count, TSM_REPORT);
+}
+CONFIGFS_BIN_ATTR_RO(tsm_report_, outblob, NULL, TSM_OUTBLOB_MAX);
+
+static ssize_t tsm_report_auxblob_read(struct config_item *cfg, void *buf,
+				       size_t count)
+{
+	struct tsm_report *report = to_tsm_report(cfg);
+
+	return tsm_report_read(report, buf, count, TSM_CERTS);
+}
+CONFIGFS_BIN_ATTR_RO(tsm_report_, auxblob, NULL, TSM_OUTBLOB_MAX);
+
+#define TSM_DEFAULT_ATTRS() \
+	&tsm_report_attr_generation, \
+	&tsm_report_attr_provider
+
+static struct configfs_attribute *tsm_report_attrs[] = {
+	TSM_DEFAULT_ATTRS(),
+	NULL,
+};
+
+static struct configfs_attribute *tsm_report_extra_attrs[] = {
+	TSM_DEFAULT_ATTRS(),
+	&tsm_report_attr_privlevel,
+	&tsm_report_attr_privlevel_floor,
+	NULL,
+};
+
+#define TSM_DEFAULT_BIN_ATTRS() \
+	&tsm_report_attr_inblob, \
+	&tsm_report_attr_outblob
+
+static struct configfs_bin_attribute *tsm_report_bin_attrs[] = {
+	TSM_DEFAULT_BIN_ATTRS(),
+	NULL,
+};
+
+static struct configfs_bin_attribute *tsm_report_bin_extra_attrs[] = {
+	TSM_DEFAULT_BIN_ATTRS(),
+	&tsm_report_attr_auxblob,
+	NULL,
+};
+
+static void tsm_report_item_release(struct config_item *cfg)
+{
+	struct tsm_report *report = to_tsm_report(cfg);
+	struct tsm_report_state *state = to_state(report);
+
+	kvfree(report->auxblob);
+	kvfree(report->outblob);
+	kfree(state);
+}
+
+static struct configfs_item_operations tsm_report_item_ops = {
+	.release = tsm_report_item_release,
+};
+
+const struct config_item_type tsm_report_default_type = {
+	.ct_owner = THIS_MODULE,
+	.ct_bin_attrs = tsm_report_bin_attrs,
+	.ct_attrs = tsm_report_attrs,
+	.ct_item_ops = &tsm_report_item_ops,
+};
+EXPORT_SYMBOL_GPL(tsm_report_default_type);
+
+const struct config_item_type tsm_report_extra_type = {
+	.ct_owner = THIS_MODULE,
+	.ct_bin_attrs = tsm_report_bin_extra_attrs,
+	.ct_attrs = tsm_report_extra_attrs,
+	.ct_item_ops = &tsm_report_item_ops,
+};
+EXPORT_SYMBOL_GPL(tsm_report_extra_type);
+
+static struct config_item *tsm_report_make_item(struct config_group *group,
+						const char *name)
+{
+	struct tsm_report_state *state;
+
+	guard(rwsem_read)(&tsm_rwsem);
+	if (!provider.ops)
+		return ERR_PTR(-ENXIO);
+
+	state = kzalloc(sizeof(*state), GFP_KERNEL);
+	if (!state)
+		return ERR_PTR(-ENOMEM);
+
+	config_item_init_type_name(&state->cfg, name, provider.type);
+	return &state->cfg;
+}
+
+static struct configfs_group_operations tsm_report_group_ops = {
+	.make_item = tsm_report_make_item,
+};
+
+static const struct config_item_type tsm_reports_type = {
+	.ct_owner = THIS_MODULE,
+	.ct_group_ops = &tsm_report_group_ops,
+};
+
+static const struct config_item_type tsm_root_group_type = {
+	.ct_owner = THIS_MODULE,
+};
+
+static struct configfs_subsystem tsm_configfs = {
+	.su_group = {
+		.cg_item = {
+			.ci_namebuf = "tsm",
+			.ci_type = &tsm_root_group_type,
+		},
+	},
+	.su_mutex = __MUTEX_INITIALIZER(tsm_configfs.su_mutex),
+};
+
+int tsm_register(const struct tsm_ops *ops, void *priv,
+		 const struct config_item_type *type)
+{
+	const struct tsm_ops *conflict;
+
+	if (!type)
+		type = &tsm_report_default_type;
+	if (!(type == &tsm_report_default_type || type == &tsm_report_extra_type))
+		return -EINVAL;
+
+	guard(rwsem_write)(&tsm_rwsem);
+	conflict = provider.ops;
+	if (conflict) {
+		pr_err("\"%s\" ops already registered\n", conflict->name);
+		return -EBUSY;
+	}
+
+	provider.ops = ops;
+	provider.data = priv;
+	provider.type = type;
+	return 0;
+}
+EXPORT_SYMBOL_GPL(tsm_register);
+
+int tsm_unregister(const struct tsm_ops *ops)
+{
+	guard(rwsem_write)(&tsm_rwsem);
+	if (ops != provider.ops)
+		return -EBUSY;
+	provider.ops = NULL;
+	provider.data = NULL;
+	provider.type = NULL;
+	return 0;
+}
+EXPORT_SYMBOL_GPL(tsm_unregister);
+
+static struct config_group *tsm_report_group;
+
+static int __init tsm_init(void)
+{
+	struct config_group *root = &tsm_configfs.su_group;
+	struct config_group *tsm;
+	int rc;
+
+	config_group_init(root);
+	rc = configfs_register_subsystem(&tsm_configfs);
+	if (rc)
+		return rc;
+
+	tsm = configfs_register_default_group(root, "report",
+					      &tsm_reports_type);
+	if (IS_ERR(tsm)) {
+		configfs_unregister_subsystem(&tsm_configfs);
+		return PTR_ERR(tsm);
+	}
+	tsm_report_group = tsm;
+
+	return 0;
+}
+module_init(tsm_init);
+
+static void __exit tsm_exit(void)
+{
+	configfs_unregister_default_group(tsm_report_group);
+	configfs_unregister_subsystem(&tsm_configfs);
+}
+module_exit(tsm_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("Provide Trusted Security Module attestation reports via configfs");
diff --git a/include/linux/tsm.h b/include/linux/tsm.h
new file mode 100644
index 000000000000..de8324a2223c
--- /dev/null
+++ b/include/linux/tsm.h
@@ -0,0 +1,69 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef __TSM_H
+#define __TSM_H
+
+#include <linux/sizes.h>
+#include <linux/types.h>
+
+#define TSM_INBLOB_MAX 64
+#define TSM_OUTBLOB_MAX SZ_32K
+
+/*
+ * Privilege level is a nested permission concept to allow confidential
+ * guests to partition address space, 4-levels are supported.
+ */
+#define TSM_PRIVLEVEL_MAX 3
+
+/**
+ * struct tsm_desc - option descriptor for generating tsm report blobs
+ * @privlevel: optional privilege level to associate with @outblob
+ * @inblob_len: sizeof @inblob
+ * @inblob: arbitrary input data
+ */
+struct tsm_desc {
+	unsigned int privlevel;
+	size_t inblob_len;
+	u8 inblob[TSM_INBLOB_MAX];
+};
+
+/**
+ * struct tsm_report - track state of report generation relative to options
+ * @desc: input parameters to @report_new()
+ * @outblob_len: sizeof(@outblob)
+ * @outblob: generated evidence to provider to the attestation agent
+ * @auxblob_len: sizeof(@auxblob)
+ * @auxblob: (optional) auxiliary data to the report (e.g. certificate data)
+ */
+struct tsm_report {
+	struct tsm_desc desc;
+	size_t outblob_len;
+	u8 *outblob;
+	size_t auxblob_len;
+	u8 *auxblob;
+};
+
+/**
+ * struct tsm_ops - attributes and operations for tsm instances
+ * @name: tsm id reflected in /sys/kernel/config/tsm/report/$report/provider
+ * @privlevel_floor: convey base privlevel for nested scenarios
+ * @report_new: Populate @report with the report blob and auxblob
+ * (optional), return 0 on successful population, or -errno otherwise
+ *
+ * Implementation specific ops, only one is expected to be registered at
+ * a time i.e. only one of "sev-guest", "tdx-guest", etc.
+ */
+struct tsm_ops {
+	const char *name;
+	const unsigned int privlevel_floor;
+	int (*report_new)(struct tsm_report *report, void *data);
+};
+
+extern const struct config_item_type tsm_report_default_type;
+
+/* publish @privlevel, @privlevel_floor, and @auxblob attributes */
+extern const struct config_item_type tsm_report_extra_type;
+
+int tsm_register(const struct tsm_ops *ops, void *priv,
+		 const struct config_item_type *type);
+int tsm_unregister(const struct tsm_ops *ops);
+#endif /* __TSM_H */
-- 
cgit v1.2.3


From a67d74a4b163878a3c0537033ed1b20db92ebfc5 Mon Sep 17 00:00:00 2001
From: Dan Williams <dan.j.williams@intel.com>
Date: Wed, 19 Jul 2023 18:14:13 -0700
Subject: mm/slab: Add __free() support for kvfree

Allow for the declaration of variables that trigger kvfree() when they
go out of scope. The check for NULL and call to kvfree() can be elided
by the compiler in most cases, otherwise without the NULL check an
unnecessary call to kvfree() may be emitted. Peter proposed a comment
for this detail [1].

Link: http://lore.kernel.org/r/20230816103102.GF980931@hirez.programming.kicks-ass.net [1]
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Acked-by: Pankaj Gupta <pankaj.gupta@amd.com>
Acked-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Reviewed-by: Kuppuswamy Sathyanarayanan <sathyanarayanan.kuppuswamy@linux.intel.com>
Tested-by: Kuppuswamy Sathyanarayanan <sathyanarayanan.kuppuswamy@linux.intel.com>
Reviewed-by: Tom Lendacky <thomas.lendacky@amd.com>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 include/linux/slab.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/slab.h b/include/linux/slab.h
index 8228d1276a2f..df4c2d45bb86 100644
--- a/include/linux/slab.h
+++ b/include/linux/slab.h
@@ -763,6 +763,8 @@ static inline __alloc_size(1, 2) void *kvcalloc(size_t n, size_t size, gfp_t fla
 extern void *kvrealloc(const void *p, size_t oldsize, size_t newsize, gfp_t flags)
 		      __realloc_size(3);
 extern void kvfree(const void *addr);
+DEFINE_FREE(kvfree, void *, if (_T) kvfree(_T))
+
 extern void kvfree_sensitive(const void *addr, size_t len);
 
 unsigned int kmem_cache_size(struct kmem_cache *s);
-- 
cgit v1.2.3


From 16ab7cb5825fc3425c16ad2c6e53d827f382d7c6 Mon Sep 17 00:00:00 2001
From: Dimitri John Ledkov <dimitri.ledkov@canonical.com>
Date: Tue, 10 Oct 2023 22:22:38 +0100
Subject: crypto: pkcs7 - remove sha1 support

Removes support for sha1 signed kernel modules, importing sha1 signed
x.509 certificates.

rsa-pkcs1pad keeps sha1 padding support, which seems to be used by
virtio driver.

sha1 remains available as there are many drivers and subsystems using
it. Note only hmac(sha1) with secret keys remains cryptographically
secure.

In the kernel there are filesystems, IMA, tpm/pcr that appear to be
using sha1. Maybe they can all start to be slowly upgraded to
something else i.e. blake3, ParallelHash, SHAKE256 as needed.

Signed-off-by: Dimitri John Ledkov <dimitri.ledkov@canonical.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 crypto/asymmetric_keys/mscode_parser.c    |  3 --
 crypto/asymmetric_keys/pkcs7_parser.c     |  4 --
 crypto/asymmetric_keys/public_key.c       |  3 +-
 crypto/asymmetric_keys/signature.c        |  2 +-
 crypto/asymmetric_keys/x509_cert_parser.c |  8 ----
 crypto/testmgr.h                          | 80 -------------------------------
 include/linux/oid_registry.h              |  4 --
 kernel/module/Kconfig                     |  5 --
 8 files changed, 2 insertions(+), 107 deletions(-)

(limited to 'include/linux')

diff --git a/crypto/asymmetric_keys/mscode_parser.c b/crypto/asymmetric_keys/mscode_parser.c
index 690405ebe77b..6416bded0e07 100644
--- a/crypto/asymmetric_keys/mscode_parser.c
+++ b/crypto/asymmetric_keys/mscode_parser.c
@@ -75,9 +75,6 @@ int mscode_note_digest_algo(void *context, size_t hdrlen,
 
 	oid = look_up_OID(value, vlen);
 	switch (oid) {
-	case OID_sha1:
-		ctx->digest_algo = "sha1";
-		break;
 	case OID_sha256:
 		ctx->digest_algo = "sha256";
 		break;
diff --git a/crypto/asymmetric_keys/pkcs7_parser.c b/crypto/asymmetric_keys/pkcs7_parser.c
index cf4caab9620f..ab647cb4d766 100644
--- a/crypto/asymmetric_keys/pkcs7_parser.c
+++ b/crypto/asymmetric_keys/pkcs7_parser.c
@@ -227,9 +227,6 @@ int pkcs7_sig_note_digest_algo(void *context, size_t hdrlen,
 	struct pkcs7_parse_context *ctx = context;
 
 	switch (ctx->last_oid) {
-	case OID_sha1:
-		ctx->sinfo->sig->hash_algo = "sha1";
-		break;
 	case OID_sha256:
 		ctx->sinfo->sig->hash_algo = "sha256";
 		break;
@@ -272,7 +269,6 @@ int pkcs7_sig_note_pkey_algo(void *context, size_t hdrlen,
 		ctx->sinfo->sig->pkey_algo = "rsa";
 		ctx->sinfo->sig->encoding = "pkcs1";
 		break;
-	case OID_id_ecdsa_with_sha1:
 	case OID_id_ecdsa_with_sha224:
 	case OID_id_ecdsa_with_sha256:
 	case OID_id_ecdsa_with_sha384:
diff --git a/crypto/asymmetric_keys/public_key.c b/crypto/asymmetric_keys/public_key.c
index abeecb8329b3..5bf0452c17af 100644
--- a/crypto/asymmetric_keys/public_key.c
+++ b/crypto/asymmetric_keys/public_key.c
@@ -116,8 +116,7 @@ software_key_determine_akcipher(const struct public_key *pkey,
 		 */
 		if (!hash_algo)
 			return -EINVAL;
-		if (strcmp(hash_algo, "sha1") != 0 &&
-		    strcmp(hash_algo, "sha224") != 0 &&
+		if (strcmp(hash_algo, "sha224") != 0 &&
 		    strcmp(hash_algo, "sha256") != 0 &&
 		    strcmp(hash_algo, "sha384") != 0 &&
 		    strcmp(hash_algo, "sha512") != 0)
diff --git a/crypto/asymmetric_keys/signature.c b/crypto/asymmetric_keys/signature.c
index 2deff81f8af5..398983be77e8 100644
--- a/crypto/asymmetric_keys/signature.c
+++ b/crypto/asymmetric_keys/signature.c
@@ -115,7 +115,7 @@ EXPORT_SYMBOL_GPL(decrypt_blob);
  * Sign the specified data blob using the private key specified by params->key.
  * The signature is wrapped in an encoding if params->encoding is specified
  * (eg. "pkcs1").  If the encoding needs to know the digest type, this can be
- * passed through params->hash_algo (eg. "sha1").
+ * passed through params->hash_algo (eg. "sha512").
  *
  * Returns the length of the data placed in the signature buffer or an error.
  */
diff --git a/crypto/asymmetric_keys/x509_cert_parser.c b/crypto/asymmetric_keys/x509_cert_parser.c
index 2c30928621b7..68ef1ffbbef6 100644
--- a/crypto/asymmetric_keys/x509_cert_parser.c
+++ b/crypto/asymmetric_keys/x509_cert_parser.c
@@ -198,10 +198,6 @@ int x509_note_sig_algo(void *context, size_t hdrlen, unsigned char tag,
 	default:
 		return -ENOPKG; /* Unsupported combination */
 
-	case OID_sha1WithRSAEncryption:
-		ctx->cert->sig->hash_algo = "sha1";
-		goto rsa_pkcs1;
-
 	case OID_sha256WithRSAEncryption:
 		ctx->cert->sig->hash_algo = "sha256";
 		goto rsa_pkcs1;
@@ -218,10 +214,6 @@ int x509_note_sig_algo(void *context, size_t hdrlen, unsigned char tag,
 		ctx->cert->sig->hash_algo = "sha224";
 		goto rsa_pkcs1;
 
-	case OID_id_ecdsa_with_sha1:
-		ctx->cert->sig->hash_algo = "sha1";
-		goto ecdsa;
-
 	case OID_id_ecdsa_with_sha224:
 		ctx->cert->sig->hash_algo = "sha224";
 		goto ecdsa;
diff --git a/crypto/testmgr.h b/crypto/testmgr.h
index 0cd6e0600255..d7e98397549b 100644
--- a/crypto/testmgr.h
+++ b/crypto/testmgr.h
@@ -653,30 +653,6 @@ static const struct akcipher_testvec rsa_tv_template[] = {
 static const struct akcipher_testvec ecdsa_nist_p192_tv_template[] = {
 	{
 	.key =
-	"\x04\xf7\x46\xf8\x2f\x15\xf6\x22\x8e\xd7\x57\x4f\xcc\xe7\xbb\xc1"
-	"\xd4\x09\x73\xcf\xea\xd0\x15\x07\x3d\xa5\x8a\x8a\x95\x43\xe4\x68"
-	"\xea\xc6\x25\xc1\xc1\x01\x25\x4c\x7e\xc3\x3c\xa6\x04\x0a\xe7\x08"
-	"\x98",
-	.key_len = 49,
-	.params =
-	"\x30\x13\x06\x07\x2a\x86\x48\xce\x3d\x02\x01\x06\x08\x2a\x86\x48"
-	"\xce\x3d\x03\x01\x01",
-	.param_len = 21,
-	.m =
-	"\xcd\xb9\xd2\x1c\xb7\x6f\xcd\x44\xb3\xfd\x63\xea\xa3\x66\x7f\xae"
-	"\x63\x85\xe7\x82",
-	.m_size = 20,
-	.algo = OID_id_ecdsa_with_sha1,
-	.c =
-	"\x30\x35\x02\x19\x00\xba\xe5\x93\x83\x6e\xb6\x3b\x63\xa0\x27\x91"
-	"\xc6\xf6\x7f\xc3\x09\xad\x59\xad\x88\x27\xd6\x92\x6b\x02\x18\x10"
-	"\x68\x01\x9d\xba\xce\x83\x08\xef\x95\x52\x7b\xa0\x0f\xe4\x18\x86"
-	"\x80\x6f\xa5\x79\x77\xda\xd0",
-	.c_size = 55,
-	.public_key_vec = true,
-	.siggen_sigver_test = true,
-	}, {
-	.key =
 	"\x04\xb6\x4b\xb1\xd1\xac\xba\x24\x8f\x65\xb2\x60\x00\x90\xbf\xbd"
 	"\x78\x05\x73\xe9\x79\x1d\x6f\x7c\x0b\xd2\xc3\x93\xa7\x28\xe1\x75"
 	"\xf7\xd5\x95\x1d\x28\x10\xc0\x75\x50\x5c\x1a\x4f\x3f\x8f\xa5\xee"
@@ -780,32 +756,6 @@ static const struct akcipher_testvec ecdsa_nist_p192_tv_template[] = {
 static const struct akcipher_testvec ecdsa_nist_p256_tv_template[] = {
 	{
 	.key =
-	"\x04\xb9\x7b\xbb\xd7\x17\x64\xd2\x7e\xfc\x81\x5d\x87\x06\x83\x41"
-	"\x22\xd6\x9a\xaa\x87\x17\xec\x4f\x63\x55\x2f\x94\xba\xdd\x83\xe9"
-	"\x34\x4b\xf3\xe9\x91\x13\x50\xb6\xcb\xca\x62\x08\xe7\x3b\x09\xdc"
-	"\xc3\x63\x4b\x2d\xb9\x73\x53\xe4\x45\xe6\x7c\xad\xe7\x6b\xb0\xe8"
-	"\xaf",
-	.key_len = 65,
-	.params =
-	"\x30\x13\x06\x07\x2a\x86\x48\xce\x3d\x02\x01\x06\x08\x2a\x86\x48"
-	"\xce\x3d\x03\x01\x07",
-	.param_len = 21,
-	.m =
-	"\xc2\x2b\x5f\x91\x78\x34\x26\x09\x42\x8d\x6f\x51\xb2\xc5\xaf\x4c"
-	"\x0b\xde\x6a\x42",
-	.m_size = 20,
-	.algo = OID_id_ecdsa_with_sha1,
-	.c =
-	"\x30\x46\x02\x21\x00\xf9\x25\xce\x9f\x3a\xa6\x35\x81\xcf\xd4\xe7"
-	"\xb7\xf0\x82\x56\x41\xf7\xd4\xad\x8d\x94\x5a\x69\x89\xee\xca\x6a"
-	"\x52\x0e\x48\x4d\xcc\x02\x21\x00\xd7\xe4\xef\x52\x66\xd3\x5b\x9d"
-	"\x8a\xfa\x54\x93\x29\xa7\x70\x86\xf1\x03\x03\xf3\x3b\xe2\x73\xf7"
-	"\xfb\x9d\x8b\xde\xd4\x8d\x6f\xad",
-	.c_size = 72,
-	.public_key_vec = true,
-	.siggen_sigver_test = true,
-	}, {
-	.key =
 	"\x04\x8b\x6d\xc0\x33\x8e\x2d\x8b\x67\xf5\xeb\xc4\x7f\xa0\xf5\xd9"
 	"\x7b\x03\xa5\x78\x9a\xb5\xea\x14\xe4\x23\xd0\xaf\xd7\x0e\x2e\xa0"
 	"\xc9\x8b\xdb\x95\xf8\xb3\xaf\xac\x00\x2c\x2c\x1f\x7a\xfd\x95\x88"
@@ -916,36 +866,6 @@ static const struct akcipher_testvec ecdsa_nist_p256_tv_template[] = {
 
 static const struct akcipher_testvec ecdsa_nist_p384_tv_template[] = {
 	{
-	.key = /* secp384r1(sha1) */
-	"\x04\x89\x25\xf3\x97\x88\xcb\xb0\x78\xc5\x72\x9a\x14\x6e\x7a\xb1"
-	"\x5a\xa5\x24\xf1\x95\x06\x9e\x28\xfb\xc4\xb9\xbe\x5a\x0d\xd9\x9f"
-	"\xf3\xd1\x4d\x2d\x07\x99\xbd\xda\xa7\x66\xec\xbb\xea\xba\x79\x42"
-	"\xc9\x34\x89\x6a\xe7\x0b\xc3\xf2\xfe\x32\x30\xbe\xba\xf9\xdf\x7e"
-	"\x4b\x6a\x07\x8e\x26\x66\x3f\x1d\xec\xa2\x57\x91\x51\xdd\x17\x0e"
-	"\x0b\x25\xd6\x80\x5c\x3b\xe6\x1a\x98\x48\x91\x45\x7a\x73\xb0\xc3"
-	"\xf1",
-	.key_len = 97,
-	.params =
-	"\x30\x10\x06\x07\x2a\x86\x48\xce\x3d\x02\x01\x06\x05\x2b\x81\x04"
-	"\x00\x22",
-	.param_len = 18,
-	.m =
-	"\x12\x55\x28\xf0\x77\xd5\xb6\x21\x71\x32\x48\xcd\x28\xa8\x25\x22"
-	"\x3a\x69\xc1\x93",
-	.m_size = 20,
-	.algo = OID_id_ecdsa_with_sha1,
-	.c =
-	"\x30\x66\x02\x31\x00\xf5\x0f\x24\x4c\x07\x93\x6f\x21\x57\x55\x07"
-	"\x20\x43\x30\xde\xa0\x8d\x26\x8e\xae\x63\x3f\xbc\x20\x3a\xc6\xf1"
-	"\x32\x3c\xce\x70\x2b\x78\xf1\x4c\x26\xe6\x5b\x86\xcf\xec\x7c\x7e"
-	"\xd0\x87\xd7\xd7\x6e\x02\x31\x00\xcd\xbb\x7e\x81\x5d\x8f\x63\xc0"
-	"\x5f\x63\xb1\xbe\x5e\x4c\x0e\xa1\xdf\x28\x8c\x1b\xfa\xf9\x95\x88"
-	"\x74\xa0\x0f\xbf\xaf\xc3\x36\x76\x4a\xa1\x59\xf1\x1c\xa4\x58\x26"
-	"\x79\x12\x2a\xb7\xc5\x15\x92\xc5",
-	.c_size = 104,
-	.public_key_vec = true,
-	.siggen_sigver_test = true,
-	}, {
 	.key = /* secp384r1(sha224) */
 	"\x04\x69\x6c\xcf\x62\xee\xd0\x0d\xe5\xb5\x2f\x70\x54\xcf\x26\xa0"
 	"\xd9\x98\x8d\x92\x2a\xab\x9b\x11\xcb\x48\x18\xa1\xa9\x0d\xd5\x18"
diff --git a/include/linux/oid_registry.h b/include/linux/oid_registry.h
index 4d04fa5d1eec..8b79e55cfcec 100644
--- a/include/linux/oid_registry.h
+++ b/include/linux/oid_registry.h
@@ -17,12 +17,10 @@
  *	  build_OID_registry.pl to generate the data for look_up_OID().
  */
 enum OID {
-	OID_id_dsa_with_sha1,		/* 1.2.840.10030.4.3 */
 	OID_id_dsa,			/* 1.2.840.10040.4.1 */
 	OID_id_ecPublicKey,		/* 1.2.840.10045.2.1 */
 	OID_id_prime192v1,		/* 1.2.840.10045.3.1.1 */
 	OID_id_prime256v1,		/* 1.2.840.10045.3.1.7 */
-	OID_id_ecdsa_with_sha1,		/* 1.2.840.10045.4.1 */
 	OID_id_ecdsa_with_sha224,	/* 1.2.840.10045.4.3.1 */
 	OID_id_ecdsa_with_sha256,	/* 1.2.840.10045.4.3.2 */
 	OID_id_ecdsa_with_sha384,	/* 1.2.840.10045.4.3.3 */
@@ -30,7 +28,6 @@ enum OID {
 
 	/* PKCS#1 {iso(1) member-body(2) us(840) rsadsi(113549) pkcs(1) pkcs-1(1)} */
 	OID_rsaEncryption,		/* 1.2.840.113549.1.1.1 */
-	OID_sha1WithRSAEncryption,	/* 1.2.840.113549.1.1.5 */
 	OID_sha256WithRSAEncryption,	/* 1.2.840.113549.1.1.11 */
 	OID_sha384WithRSAEncryption,	/* 1.2.840.113549.1.1.12 */
 	OID_sha512WithRSAEncryption,	/* 1.2.840.113549.1.1.13 */
@@ -67,7 +64,6 @@ enum OID {
 	OID_PKU2U,			/* 1.3.5.1.5.2.7 */
 	OID_Scram,			/* 1.3.6.1.5.5.14 */
 	OID_certAuthInfoAccess,		/* 1.3.6.1.5.5.7.1.1 */
-	OID_sha1,			/* 1.3.14.3.2.26 */
 	OID_id_ansip384r1,		/* 1.3.132.0.34 */
 	OID_sha256,			/* 2.16.840.1.101.3.4.2.1 */
 	OID_sha384,			/* 2.16.840.1.101.3.4.2.2 */
diff --git a/kernel/module/Kconfig b/kernel/module/Kconfig
index 33a2e991f608..19a53d5e7736 100644
--- a/kernel/module/Kconfig
+++ b/kernel/module/Kconfig
@@ -236,10 +236,6 @@ choice
 	  possible to load a signed module containing the algorithm to check
 	  the signature on that module.
 
-config MODULE_SIG_SHA1
-	bool "Sign modules with SHA-1"
-	select CRYPTO_SHA1
-
 config MODULE_SIG_SHA224
 	bool "Sign modules with SHA-224"
 	select CRYPTO_SHA256
@@ -261,7 +257,6 @@ endchoice
 config MODULE_SIG_HASH
 	string
 	depends on MODULE_SIG || IMA_APPRAISE_MODSIG
-	default "sha1" if MODULE_SIG_SHA1
 	default "sha224" if MODULE_SIG_SHA224
 	default "sha256" if MODULE_SIG_SHA256
 	default "sha384" if MODULE_SIG_SHA384
-- 
cgit v1.2.3


From 5acab6eb592387191c1bb745ba9b815e1e076db5 Mon Sep 17 00:00:00 2001
From: Longfang Liu <liulongfang@huawei.com>
Date: Fri, 13 Oct 2023 11:49:57 +0800
Subject: crypto: hisilicon/qm - fix EQ/AEQ interrupt issue

During hisilicon accelerator live migration operation. In order to
prevent the problem of EQ/AEQ interrupt loss. Migration driver will
trigger an EQ/AEQ doorbell at the end of the migration.

This operation may cause double interruption of EQ/AEQ events.
To ensure that the EQ/AEQ interrupt processing function is normal.
The interrupt handling functionality of EQ/AEQ needs to be updated.
Used to handle repeated interrupts event.

Fixes: b0eed085903e ("hisi_acc_vfio_pci: Add support for VFIO live migration")
Signed-off-by: Longfang Liu <liulongfang@huawei.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 drivers/crypto/hisilicon/qm.c | 105 ++++++++++++++++--------------------------
 include/linux/hisi_acc_qm.h   |   1 +
 2 files changed, 41 insertions(+), 65 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/crypto/hisilicon/qm.c b/drivers/crypto/hisilicon/qm.c
index a1d0473f1931..6ee24313d851 100644
--- a/drivers/crypto/hisilicon/qm.c
+++ b/drivers/crypto/hisilicon/qm.c
@@ -894,47 +894,15 @@ static void qm_poll_req_cb(struct hisi_qp *qp)
 	qm_db(qm, qp->qp_id, QM_DOORBELL_CMD_CQ, qp->qp_status.cq_head, 1);
 }
 
-static int qm_get_complete_eqe_num(struct hisi_qm_poll_data *poll_data)
-{
-	struct hisi_qm *qm = poll_data->qm;
-	struct qm_eqe *eqe = qm->eqe + qm->status.eq_head;
-	u16 eq_depth = qm->eq_depth;
-	int eqe_num = 0;
-	u16 cqn;
-
-	while (QM_EQE_PHASE(eqe) == qm->status.eqc_phase) {
-		cqn = le32_to_cpu(eqe->dw0) & QM_EQE_CQN_MASK;
-		poll_data->qp_finish_id[eqe_num] = cqn;
-		eqe_num++;
-
-		if (qm->status.eq_head == eq_depth - 1) {
-			qm->status.eqc_phase = !qm->status.eqc_phase;
-			eqe = qm->eqe;
-			qm->status.eq_head = 0;
-		} else {
-			eqe++;
-			qm->status.eq_head++;
-		}
-
-		if (eqe_num == (eq_depth >> 1) - 1)
-			break;
-	}
-
-	qm_db(qm, 0, QM_DOORBELL_CMD_EQ, qm->status.eq_head, 0);
-
-	return eqe_num;
-}
-
 static void qm_work_process(struct work_struct *work)
 {
 	struct hisi_qm_poll_data *poll_data =
 		container_of(work, struct hisi_qm_poll_data, work);
 	struct hisi_qm *qm = poll_data->qm;
+	u16 eqe_num = poll_data->eqe_num;
 	struct hisi_qp *qp;
-	int eqe_num, i;
+	int i;
 
-	/* Get qp id of completed tasks and re-enable the interrupt. */
-	eqe_num = qm_get_complete_eqe_num(poll_data);
 	for (i = eqe_num - 1; i >= 0; i--) {
 		qp = &qm->qp_array[poll_data->qp_finish_id[i]];
 		if (unlikely(atomic_read(&qp->qp_status.flags) == QP_STOP))
@@ -950,39 +918,55 @@ static void qm_work_process(struct work_struct *work)
 	}
 }
 
-static bool do_qm_eq_irq(struct hisi_qm *qm)
+static void qm_get_complete_eqe_num(struct hisi_qm *qm)
 {
 	struct qm_eqe *eqe = qm->eqe + qm->status.eq_head;
-	struct hisi_qm_poll_data *poll_data;
-	u16 cqn;
+	struct hisi_qm_poll_data *poll_data = NULL;
+	u16 eq_depth = qm->eq_depth;
+	u16 cqn, eqe_num = 0;
 
-	if (!readl(qm->io_base + QM_VF_EQ_INT_SOURCE))
-		return false;
+	if (QM_EQE_PHASE(eqe) != qm->status.eqc_phase) {
+		atomic64_inc(&qm->debug.dfx.err_irq_cnt);
+		qm_db(qm, 0, QM_DOORBELL_CMD_EQ, qm->status.eq_head, 0);
+		return;
+	}
 
-	if (QM_EQE_PHASE(eqe) == qm->status.eqc_phase) {
+	cqn = le32_to_cpu(eqe->dw0) & QM_EQE_CQN_MASK;
+	if (unlikely(cqn >= qm->qp_num))
+		return;
+	poll_data = &qm->poll_data[cqn];
+
+	while (QM_EQE_PHASE(eqe) == qm->status.eqc_phase) {
 		cqn = le32_to_cpu(eqe->dw0) & QM_EQE_CQN_MASK;
-		poll_data = &qm->poll_data[cqn];
-		queue_work(qm->wq, &poll_data->work);
+		poll_data->qp_finish_id[eqe_num] = cqn;
+		eqe_num++;
+
+		if (qm->status.eq_head == eq_depth - 1) {
+			qm->status.eqc_phase = !qm->status.eqc_phase;
+			eqe = qm->eqe;
+			qm->status.eq_head = 0;
+		} else {
+			eqe++;
+			qm->status.eq_head++;
+		}
 
-		return true;
+		if (eqe_num == (eq_depth >> 1) - 1)
+			break;
 	}
 
-	return false;
+	poll_data->eqe_num = eqe_num;
+	queue_work(qm->wq, &poll_data->work);
+	qm_db(qm, 0, QM_DOORBELL_CMD_EQ, qm->status.eq_head, 0);
 }
 
 static irqreturn_t qm_eq_irq(int irq, void *data)
 {
 	struct hisi_qm *qm = data;
-	bool ret;
-
-	ret = do_qm_eq_irq(qm);
-	if (ret)
-		return IRQ_HANDLED;
 
-	atomic64_inc(&qm->debug.dfx.err_irq_cnt);
-	qm_db(qm, 0, QM_DOORBELL_CMD_EQ, qm->status.eq_head, 0);
+	/* Get qp id of completed tasks and re-enable the interrupt */
+	qm_get_complete_eqe_num(qm);
 
-	return IRQ_NONE;
+	return IRQ_HANDLED;
 }
 
 static irqreturn_t qm_mb_cmd_irq(int irq, void *data)
@@ -1064,6 +1048,8 @@ static irqreturn_t qm_aeq_thread(int irq, void *data)
 	u16 aeq_depth = qm->aeq_depth;
 	u32 type, qp_id;
 
+	atomic64_inc(&qm->debug.dfx.aeq_irq_cnt);
+
 	while (QM_AEQE_PHASE(aeqe) == qm->status.aeqc_phase) {
 		type = (le32_to_cpu(aeqe->dw0) >> QM_AEQE_TYPE_SHIFT) &
 			QM_AEQE_TYPE_MASK;
@@ -1102,17 +1088,6 @@ static irqreturn_t qm_aeq_thread(int irq, void *data)
 	return IRQ_HANDLED;
 }
 
-static irqreturn_t qm_aeq_irq(int irq, void *data)
-{
-	struct hisi_qm *qm = data;
-
-	atomic64_inc(&qm->debug.dfx.aeq_irq_cnt);
-	if (!readl(qm->io_base + QM_VF_AEQ_INT_SOURCE))
-		return IRQ_NONE;
-
-	return IRQ_WAKE_THREAD;
-}
-
 static void qm_init_qp_status(struct hisi_qp *qp)
 {
 	struct hisi_qp_status *qp_status = &qp->qp_status;
@@ -4923,8 +4898,8 @@ static int qm_register_aeq_irq(struct hisi_qm *qm)
 		return 0;
 
 	irq_vector = val & QM_IRQ_VECTOR_MASK;
-	ret = request_threaded_irq(pci_irq_vector(pdev, irq_vector), qm_aeq_irq,
-						   qm_aeq_thread, 0, qm->dev_name, qm);
+	ret = request_threaded_irq(pci_irq_vector(pdev, irq_vector), NULL,
+						   qm_aeq_thread, IRQF_ONESHOT, qm->dev_name, qm);
 	if (ret)
 		dev_err(&pdev->dev, "failed to request eq irq, ret = %d", ret);
 
diff --git a/include/linux/hisi_acc_qm.h b/include/linux/hisi_acc_qm.h
index 44e0c44a2e20..ddc7ebb70523 100644
--- a/include/linux/hisi_acc_qm.h
+++ b/include/linux/hisi_acc_qm.h
@@ -276,6 +276,7 @@ struct hisi_qm_poll_data {
 	struct hisi_qm *qm;
 	struct work_struct work;
 	u16 *qp_finish_id;
+	u16 eqe_num;
 };
 
 /**
-- 
cgit v1.2.3


From eeb9f34df065f42f0c9195b322ba6df420c9fc92 Mon Sep 17 00:00:00 2001
From: Josh Poimboeuf <jpoimboe@kernel.org>
Date: Mon, 4 Sep 2023 22:04:53 -0700
Subject: x86/srso: Fix unret validation dependencies

CONFIG_CPU_SRSO isn't dependent on CONFIG_CPU_UNRET_ENTRY (AMD
Retbleed), so the two features are independently configurable.  Fix
several issues for the (presumably rare) case where CONFIG_CPU_SRSO is
enabled but CONFIG_CPU_UNRET_ENTRY isn't.

Fixes: fb3bd914b3ec ("x86/srso: Add a Speculative RAS Overflow mitigation")
Signed-off-by: Josh Poimboeuf <jpoimboe@kernel.org>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Signed-off-by: Borislav Petkov (AMD) <bp@alien8.de>
Acked-by: Borislav Petkov (AMD) <bp@alien8.de>
Link: https://lore.kernel.org/r/299fb7740174d0f2335e91c58af0e9c242b4bac1.1693889988.git.jpoimboe@kernel.org
---
 arch/x86/include/asm/nospec-branch.h | 4 ++--
 include/linux/objtool.h              | 3 ++-
 scripts/Makefile.vmlinux_o           | 3 ++-
 3 files changed, 6 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/arch/x86/include/asm/nospec-branch.h b/arch/x86/include/asm/nospec-branch.h
index c55cc243592e..197ff4f4d1ce 100644
--- a/arch/x86/include/asm/nospec-branch.h
+++ b/arch/x86/include/asm/nospec-branch.h
@@ -271,7 +271,7 @@
 .Lskip_rsb_\@:
 .endm
 
-#ifdef CONFIG_CPU_UNRET_ENTRY
+#if defined(CONFIG_CPU_UNRET_ENTRY) || defined(CONFIG_CPU_SRSO)
 #define CALL_UNTRAIN_RET	"call entry_untrain_ret"
 #else
 #define CALL_UNTRAIN_RET	""
@@ -312,7 +312,7 @@
 
 .macro UNTRAIN_RET_FROM_CALL
 #if defined(CONFIG_CPU_UNRET_ENTRY) || defined(CONFIG_CPU_IBPB_ENTRY) || \
-	defined(CONFIG_CALL_DEPTH_TRACKING)
+	defined(CONFIG_CALL_DEPTH_TRACKING) || defined(CONFIG_CPU_SRSO)
 	VALIDATE_UNRET_END
 	ALTERNATIVE_3 "",						\
 		      CALL_UNTRAIN_RET, X86_FEATURE_UNRET,		\
diff --git a/include/linux/objtool.h b/include/linux/objtool.h
index 03f82c2c2ebf..b5440e7da55b 100644
--- a/include/linux/objtool.h
+++ b/include/linux/objtool.h
@@ -130,7 +130,8 @@
  * it will be ignored.
  */
 .macro VALIDATE_UNRET_BEGIN
-#if defined(CONFIG_NOINSTR_VALIDATION) && defined(CONFIG_CPU_UNRET_ENTRY)
+#if defined(CONFIG_NOINSTR_VALIDATION) && \
+	(defined(CONFIG_CPU_UNRET_ENTRY) || defined(CONFIG_CPU_SRSO))
 .Lhere_\@:
 	.pushsection .discard.validate_unret
 	.long	.Lhere_\@ - .
diff --git a/scripts/Makefile.vmlinux_o b/scripts/Makefile.vmlinux_o
index 0edfdb40364b..25b3b587d37c 100644
--- a/scripts/Makefile.vmlinux_o
+++ b/scripts/Makefile.vmlinux_o
@@ -37,7 +37,8 @@ objtool-enabled := $(or $(delay-objtool),$(CONFIG_NOINSTR_VALIDATION))
 
 vmlinux-objtool-args-$(delay-objtool)			+= $(objtool-args-y)
 vmlinux-objtool-args-$(CONFIG_GCOV_KERNEL)		+= --no-unreachable
-vmlinux-objtool-args-$(CONFIG_NOINSTR_VALIDATION)	+= --noinstr $(if $(CONFIG_CPU_UNRET_ENTRY), --unret)
+vmlinux-objtool-args-$(CONFIG_NOINSTR_VALIDATION)	+= --noinstr \
+							   $(if $(or $(CONFIG_CPU_UNRET_ENTRY),$(CONFIG_CPU_SRSO)), --unret)
 
 objtool-args = $(vmlinux-objtool-args-y) --link
 
-- 
cgit v1.2.3


From f600bb612b06adf70ccdefbd3294c71275b650c2 Mon Sep 17 00:00:00 2001
From: Oleksij Rempel <o.rempel@pengutronix.de>
Date: Thu, 19 Oct 2023 13:14:58 +0200
Subject: net: dsa: microchip: ksz8: Enable MIIM PHY Control reg access

Provide access to MIIM PHY Control register (Reg. 31) through
ksz8_r_phy_ctrl() and ksz8_w_phy_ctrl() functions. Necessary for
upcoming micrel.c patch to address forced link mode configuration.

Closes: https://lore.kernel.org/oe-kbuild-all/202310112224.iYgvjBUy-lkp@intel.com/
Reported-by: kernel test robot <lkp@intel.com>
Signed-off-by: Oleksij Rempel <o.rempel@pengutronix.de>
Reviewed-by: Vladimir Oltean <vladimir.oltean@nxp.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/dsa/microchip/ksz8795.c | 86 +++++++++++++++++++++++++++++++++++--
 include/linux/micrel_phy.h          |  4 ++
 2 files changed, 87 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/dsa/microchip/ksz8795.c b/drivers/net/dsa/microchip/ksz8795.c
index 91aba470fb2f..4bf4d67557dc 100644
--- a/drivers/net/dsa/microchip/ksz8795.c
+++ b/drivers/net/dsa/microchip/ksz8795.c
@@ -632,6 +632,50 @@ static void ksz8_w_vlan_table(struct ksz_device *dev, u16 vid, u16 vlan)
 	ksz8_w_table(dev, TABLE_VLAN, addr, buf);
 }
 
+/**
+ * ksz8_r_phy_ctrl - Translates and reads from the SMI interface to a MIIM PHY
+ *		     Control register (Reg. 31).
+ * @dev: The KSZ device instance.
+ * @port: The port number to be read.
+ * @val: The value read from the SMI interface.
+ *
+ * This function reads the SMI interface and translates the hardware register
+ * bit values into their corresponding control settings for a MIIM PHY Control
+ * register.
+ *
+ * Return: 0 on success, error code on failure.
+ */
+static int ksz8_r_phy_ctrl(struct ksz_device *dev, int port, u16 *val)
+{
+	const u16 *regs = dev->info->regs;
+	u8 reg_val;
+	int ret;
+
+	*val = 0;
+
+	ret = ksz_pread8(dev, port, regs[P_LINK_STATUS], &reg_val);
+	if (ret < 0)
+		return ret;
+
+	if (reg_val & PORT_MDIX_STATUS)
+		*val |= KSZ886X_CTRL_MDIX_STAT;
+
+	ret = ksz_pread8(dev, port, REG_PORT_LINK_MD_CTRL, &reg_val);
+	if (ret < 0)
+		return ret;
+
+	if (reg_val & PORT_FORCE_LINK)
+		*val |= KSZ886X_CTRL_FORCE_LINK;
+
+	if (reg_val & PORT_POWER_SAVING)
+		*val |= KSZ886X_CTRL_PWRSAVE;
+
+	if (reg_val & PORT_PHY_REMOTE_LOOPBACK)
+		*val |= KSZ886X_CTRL_REMOTE_LOOPBACK;
+
+	return 0;
+}
+
 int ksz8_r_phy(struct ksz_device *dev, u16 phy, u16 reg, u16 *val)
 {
 	u8 restart, speed, ctrl, link;
@@ -769,12 +813,10 @@ int ksz8_r_phy(struct ksz_device *dev, u16 phy, u16 reg, u16 *val)
 				FIELD_GET(PORT_CABLE_FAULT_COUNTER_L, val2));
 		break;
 	case PHY_REG_PHY_CTRL:
-		ret = ksz_pread8(dev, p, regs[P_LINK_STATUS], &link);
+		ret = ksz8_r_phy_ctrl(dev, p, &data);
 		if (ret)
 			return ret;
 
-		if (link & PORT_MDIX_STATUS)
-			data |= KSZ886X_CTRL_MDIX_STAT;
 		break;
 	default:
 		processed = false;
@@ -786,6 +828,38 @@ int ksz8_r_phy(struct ksz_device *dev, u16 phy, u16 reg, u16 *val)
 	return 0;
 }
 
+/**
+ * ksz8_w_phy_ctrl - Translates and writes to the SMI interface from a MIIM PHY
+ *		     Control register (Reg. 31).
+ * @dev: The KSZ device instance.
+ * @port: The port number to be configured.
+ * @val: The register value to be written.
+ *
+ * This function translates control settings from a MIIM PHY Control register
+ * into their corresponding hardware register bit values for the SMI
+ * interface.
+ *
+ * Return: 0 on success, error code on failure.
+ */
+static int ksz8_w_phy_ctrl(struct ksz_device *dev, int port, u16 val)
+{
+	u8 reg_val = 0;
+	int ret;
+
+	if (val & KSZ886X_CTRL_FORCE_LINK)
+		reg_val |= PORT_FORCE_LINK;
+
+	if (val & KSZ886X_CTRL_PWRSAVE)
+		reg_val |= PORT_POWER_SAVING;
+
+	if (val & KSZ886X_CTRL_REMOTE_LOOPBACK)
+		reg_val |= PORT_PHY_REMOTE_LOOPBACK;
+
+	ret = ksz_prmw8(dev, port, REG_PORT_LINK_MD_CTRL, PORT_FORCE_LINK |
+			PORT_POWER_SAVING | PORT_PHY_REMOTE_LOOPBACK, reg_val);
+	return ret;
+}
+
 int ksz8_w_phy(struct ksz_device *dev, u16 phy, u16 reg, u16 val)
 {
 	u8 restart, speed, ctrl, data;
@@ -926,6 +1000,12 @@ int ksz8_w_phy(struct ksz_device *dev, u16 phy, u16 reg, u16 val)
 		if (val & PHY_START_CABLE_DIAG)
 			ksz_port_cfg(dev, p, REG_PORT_LINK_MD_CTRL, PORT_START_CABLE_DIAG, true);
 		break;
+
+	case PHY_REG_PHY_CTRL:
+		ret = ksz8_w_phy_ctrl(dev, p, val);
+		if (ret)
+			return ret;
+		break;
 	default:
 		break;
 	}
diff --git a/include/linux/micrel_phy.h b/include/linux/micrel_phy.h
index 4e27ca7c49de..591bf5b5e8dc 100644
--- a/include/linux/micrel_phy.h
+++ b/include/linux/micrel_phy.h
@@ -64,6 +64,10 @@
 #define KSZ886X_BMCR_DISABLE_TRANSMIT		BIT(1)
 #define KSZ886X_BMCR_DISABLE_LED		BIT(0)
 
+/* PHY Special Control/Status Register (Reg 31) */
 #define KSZ886X_CTRL_MDIX_STAT			BIT(4)
+#define KSZ886X_CTRL_FORCE_LINK			BIT(3)
+#define KSZ886X_CTRL_PWRSAVE			BIT(2)
+#define KSZ886X_CTRL_REMOTE_LOOPBACK		BIT(1)
 
 #endif /* _MICREL_PHY_H */
-- 
cgit v1.2.3


From 20c6e05bd33deaa6fa890252d7ffc5ad54a0942c Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <kuba@kernel.org>
Date: Thu, 19 Oct 2023 08:28:15 -0700
Subject: ethtool: untangle the linkmode and ethtool headers

Commit 26c5334d344d ("ethtool: Add forced speed to supported link
modes maps") added a dependency between ethtool.h and linkmode.h.
The dependency in the opposite direction already exists so the
new code was inserted in an awkward place.

The reason for ethtool.h to include linkmode.h, is that
ethtool_forced_speed_maps_init() is a static inline helper.
That's not really necessary.

Signed-off-by: Jakub Kicinski <kuba@kernel.org>
Reviewed-by: Paul Greenwalt <paul.greenwalt@intel.com>
Reviewed-by: Russell King (Oracle) <rmk+kernel@armlinux.org.uk>
Reviewed-by: Vladimir Oltean <vladimir.oltean@nxp.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/ethtool.h  | 22 ++--------------------
 include/linux/linkmode.h | 29 ++++++++++++++---------------
 net/ethtool/common.c     | 21 +++++++++++++++++++++
 3 files changed, 37 insertions(+), 35 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/ethtool.h b/include/linux/ethtool.h
index 8e91e8b8a693..226a36ed5aa1 100644
--- a/include/linux/ethtool.h
+++ b/include/linux/ethtool.h
@@ -13,7 +13,6 @@
 #ifndef _LINUX_ETHTOOL_H
 #define _LINUX_ETHTOOL_H
 
-#include <linux/linkmode.h>
 #include <linux/bitmap.h>
 #include <linux/compat.h>
 #include <linux/if_ether.h>
@@ -1070,23 +1069,6 @@ struct ethtool_forced_speed_map {
 	.arr_size	= ARRAY_SIZE(prefix##_##value),			\
 }
 
-/**
- * ethtool_forced_speed_maps_init
- * @maps: Pointer to an array of Ethtool forced speed map
- * @size: Array size
- *
- * Initialize an array of Ethtool forced speed map to Ethtool link modes. This
- * should be called during driver module init.
- */
-static inline void
-ethtool_forced_speed_maps_init(struct ethtool_forced_speed_map *maps, u32 size)
-{
-	for (u32 i = 0; i < size; i++) {
-		struct ethtool_forced_speed_map *map = &maps[i];
-
-		linkmode_set_bit_array(map->cap_arr, map->arr_size, map->caps);
-		map->cap_arr = NULL;
-		map->arr_size = 0;
-	}
-}
+void
+ethtool_forced_speed_maps_init(struct ethtool_forced_speed_map *maps, u32 size);
 #endif /* _LINUX_ETHTOOL_H */
diff --git a/include/linux/linkmode.h b/include/linux/linkmode.h
index cd38f89553e6..7303b4bc2ce0 100644
--- a/include/linux/linkmode.h
+++ b/include/linux/linkmode.h
@@ -2,21 +2,6 @@
 #define __LINKMODE_H
 
 #include <linux/bitmap.h>
-
-static inline void linkmode_set_bit(int nr, volatile unsigned long *addr)
-{
-	__set_bit(nr, addr);
-}
-
-static inline void linkmode_set_bit_array(const int *array, int array_size,
-					  unsigned long *addr)
-{
-	int i;
-
-	for (i = 0; i < array_size; i++)
-		linkmode_set_bit(array[i], addr);
-}
-
 #include <linux/ethtool.h>
 #include <uapi/linux/ethtool.h>
 
@@ -53,6 +38,11 @@ static inline int linkmode_andnot(unsigned long *dst, const unsigned long *src1,
 	return bitmap_andnot(dst, src1, src2,  __ETHTOOL_LINK_MODE_MASK_NBITS);
 }
 
+static inline void linkmode_set_bit(int nr, volatile unsigned long *addr)
+{
+	__set_bit(nr, addr);
+}
+
 static inline void linkmode_clear_bit(int nr, volatile unsigned long *addr)
 {
 	__clear_bit(nr, addr);
@@ -72,6 +62,15 @@ static inline int linkmode_test_bit(int nr, const volatile unsigned long *addr)
 	return test_bit(nr, addr);
 }
 
+static inline void linkmode_set_bit_array(const int *array, int array_size,
+					  unsigned long *addr)
+{
+	int i;
+
+	for (i = 0; i < array_size; i++)
+		linkmode_set_bit(array[i], addr);
+}
+
 static inline int linkmode_equal(const unsigned long *src1,
 				 const unsigned long *src2)
 {
diff --git a/net/ethtool/common.c b/net/ethtool/common.c
index f5598c5f50de..b4419fb6df6a 100644
--- a/net/ethtool/common.c
+++ b/net/ethtool/common.c
@@ -685,3 +685,24 @@ ethtool_params_from_link_mode(struct ethtool_link_ksettings *link_ksettings,
 	link_ksettings->base.duplex = link_info->duplex;
 }
 EXPORT_SYMBOL_GPL(ethtool_params_from_link_mode);
+
+/**
+ * ethtool_forced_speed_maps_init
+ * @maps: Pointer to an array of Ethtool forced speed map
+ * @size: Array size
+ *
+ * Initialize an array of Ethtool forced speed map to Ethtool link modes. This
+ * should be called during driver module init.
+ */
+void
+ethtool_forced_speed_maps_init(struct ethtool_forced_speed_map *maps, u32 size)
+{
+	for (u32 i = 0; i < size; i++) {
+		struct ethtool_forced_speed_map *map = &maps[i];
+
+		linkmode_set_bit_array(map->cap_arr, map->arr_size, map->caps);
+		map->cap_arr = NULL;
+		map->arr_size = 0;
+	}
+}
+EXPORT_SYMBOL_GPL(ethtool_forced_speed_maps_init);
-- 
cgit v1.2.3


From d0ed46b60396cfa7e0056f55e1ce0b43c7db57b6 Mon Sep 17 00:00:00 2001
From: "Matthew Wilcox (Oracle)" <willy@infradead.org>
Date: Fri, 20 Oct 2023 04:35:45 +0100
Subject: tracing: Move readpos from seq_buf to trace_seq

To make seq_buf more lightweight as a string buf, move the readpos member
from seq_buf to its container, trace_seq.  That puts the responsibility
of maintaining the readpos entirely in the tracing code.  If some future
users want to package up the readpos with a seq_buf, we can define a
new struct then.

Link: https://lore.kernel.org/linux-trace-kernel/20231020033545.2587554-2-willy@infradead.org

Cc: Kees Cook <keescook@chromium.org>
Cc: Justin Stitt <justinstitt@google.com>
Cc: Kent Overstreet <kent.overstreet@linux.dev>
Cc: Petr Mladek <pmladek@suse.com>
Cc: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Cc: Rasmus Villemoes <linux@rasmusvillemoes.dk>
Cc: Sergey Senozhatsky <senozhatsky@chromium.org>
Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Acked-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
 include/linux/seq_buf.h   |  5 +----
 include/linux/trace_seq.h |  2 ++
 kernel/trace/trace.c      | 10 +++++-----
 kernel/trace/trace_seq.c  |  6 +++++-
 lib/seq_buf.c             | 22 ++++++++++------------
 5 files changed, 23 insertions(+), 22 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/seq_buf.h b/include/linux/seq_buf.h
index 515d7fcb9634..a0fb013cebdf 100644
--- a/include/linux/seq_buf.h
+++ b/include/linux/seq_buf.h
@@ -14,19 +14,16 @@
  * @buffer:	pointer to the buffer
  * @size:	size of the buffer
  * @len:	the amount of data inside the buffer
- * @readpos:	The next position to read in the buffer.
  */
 struct seq_buf {
 	char			*buffer;
 	size_t			size;
 	size_t			len;
-	loff_t			readpos;
 };
 
 static inline void seq_buf_clear(struct seq_buf *s)
 {
 	s->len = 0;
-	s->readpos = 0;
 }
 
 static inline void
@@ -143,7 +140,7 @@ extern __printf(2, 0)
 int seq_buf_vprintf(struct seq_buf *s, const char *fmt, va_list args);
 extern int seq_buf_print_seq(struct seq_file *m, struct seq_buf *s);
 extern int seq_buf_to_user(struct seq_buf *s, char __user *ubuf,
-			   int cnt);
+			   size_t start, int cnt);
 extern int seq_buf_puts(struct seq_buf *s, const char *str);
 extern int seq_buf_putc(struct seq_buf *s, unsigned char c);
 extern int seq_buf_putmem(struct seq_buf *s, const void *mem, unsigned int len);
diff --git a/include/linux/trace_seq.h b/include/linux/trace_seq.h
index 6be92bf559fe..3691e0e76a1a 100644
--- a/include/linux/trace_seq.h
+++ b/include/linux/trace_seq.h
@@ -14,6 +14,7 @@
 struct trace_seq {
 	char			buffer[PAGE_SIZE];
 	struct seq_buf		seq;
+	size_t			readpos;
 	int			full;
 };
 
@@ -22,6 +23,7 @@ trace_seq_init(struct trace_seq *s)
 {
 	seq_buf_init(&s->seq, s->buffer, PAGE_SIZE);
 	s->full = 0;
+	s->readpos = 0;
 }
 
 /**
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 4383be8fa1b0..d629065c2383 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -1731,15 +1731,15 @@ static ssize_t trace_seq_to_buffer(struct trace_seq *s, void *buf, size_t cnt)
 {
 	int len;
 
-	if (trace_seq_used(s) <= s->seq.readpos)
+	if (trace_seq_used(s) <= s->readpos)
 		return -EBUSY;
 
-	len = trace_seq_used(s) - s->seq.readpos;
+	len = trace_seq_used(s) - s->readpos;
 	if (cnt > len)
 		cnt = len;
-	memcpy(buf, s->buffer + s->seq.readpos, cnt);
+	memcpy(buf, s->buffer + s->readpos, cnt);
 
-	s->seq.readpos += cnt;
+	s->readpos += cnt;
 	return cnt;
 }
 
@@ -7008,7 +7008,7 @@ waitagain:
 
 	/* Now copy what we have to the user */
 	sret = trace_seq_to_user(&iter->seq, ubuf, cnt);
-	if (iter->seq.seq.readpos >= trace_seq_used(&iter->seq))
+	if (iter->seq.readpos >= trace_seq_used(&iter->seq))
 		trace_seq_init(&iter->seq);
 
 	/*
diff --git a/kernel/trace/trace_seq.c b/kernel/trace/trace_seq.c
index bac06ee3b98b..7be97229ddf8 100644
--- a/kernel/trace/trace_seq.c
+++ b/kernel/trace/trace_seq.c
@@ -370,8 +370,12 @@ EXPORT_SYMBOL_GPL(trace_seq_path);
  */
 int trace_seq_to_user(struct trace_seq *s, char __user *ubuf, int cnt)
 {
+	int ret;
 	__trace_seq_init(s);
-	return seq_buf_to_user(&s->seq, ubuf, cnt);
+	ret = seq_buf_to_user(&s->seq, ubuf, s->readpos, cnt);
+	if (ret > 0)
+		s->readpos += ret;
+	return ret;
 }
 EXPORT_SYMBOL_GPL(trace_seq_to_user);
 
diff --git a/lib/seq_buf.c b/lib/seq_buf.c
index 45c450f423fa..b7477aefff53 100644
--- a/lib/seq_buf.c
+++ b/lib/seq_buf.c
@@ -324,23 +324,24 @@ int seq_buf_path(struct seq_buf *s, const struct path *path, const char *esc)
  * seq_buf_to_user - copy the sequence buffer to user space
  * @s: seq_buf descriptor
  * @ubuf: The userspace memory location to copy to
+ * @start: The first byte in the buffer to copy
  * @cnt: The amount to copy
  *
  * Copies the sequence buffer into the userspace memory pointed to
- * by @ubuf. It starts from the last read position (@s->readpos)
- * and writes up to @cnt characters or till it reaches the end of
- * the content in the buffer (@s->len), which ever comes first.
+ * by @ubuf. It starts from @start and writes up to @cnt characters
+ * or until it reaches the end of the content in the buffer (@s->len),
+ * whichever comes first.
  *
  * On success, it returns a positive number of the number of bytes
  * it copied.
  *
  * On failure it returns -EBUSY if all of the content in the
  * sequence has been already read, which includes nothing in the
- * sequence (@s->len == @s->readpos).
+ * sequence (@s->len == @start).
  *
  * Returns -EFAULT if the copy to userspace fails.
  */
-int seq_buf_to_user(struct seq_buf *s, char __user *ubuf, int cnt)
+int seq_buf_to_user(struct seq_buf *s, char __user *ubuf, size_t start, int cnt)
 {
 	int len;
 	int ret;
@@ -350,20 +351,17 @@ int seq_buf_to_user(struct seq_buf *s, char __user *ubuf, int cnt)
 
 	len = seq_buf_used(s);
 
-	if (len <= s->readpos)
+	if (len <= start)
 		return -EBUSY;
 
-	len -= s->readpos;
+	len -= start;
 	if (cnt > len)
 		cnt = len;
-	ret = copy_to_user(ubuf, s->buffer + s->readpos, cnt);
+	ret = copy_to_user(ubuf, s->buffer + start, cnt);
 	if (ret == cnt)
 		return -EFAULT;
 
-	cnt -= ret;
-
-	s->readpos += cnt;
-	return cnt;
+	return cnt - ret;
 }
 
 /**
-- 
cgit v1.2.3


From 8c35b1f472533b0df1b8f1f1afcaf4395cdb2256 Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rafael.j.wysocki@intel.com>
Date: Thu, 12 Oct 2023 20:34:50 +0200
Subject: thermal: core: Pass trip pointer to governor throttle callback

Modify the governor .throttle() callback definition so that it takes a
trip pointer instead of a trip index as its second argument, adjust the
governors accordingly and update the core code invoking .throttle().

This causes the governors to become independent of the representation
of the list of trips in the thermal zone structure.

This change is not expected to alter the general functionality.

Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Reviewed-by: Daniel Lezcano <daniel.lezcano@linaro.org>
---
 drivers/thermal/gov_bang_bang.c       |  8 +++---
 drivers/thermal/gov_fair_share.c      |  6 ++---
 drivers/thermal/gov_power_allocator.c |  4 +--
 drivers/thermal/gov_step_wise.c       | 12 +++++----
 drivers/thermal/gov_user_space.c      |  8 +++---
 drivers/thermal/thermal_core.c        | 50 +++++++++++++++++------------------
 drivers/thermal/thermal_core.h        |  2 +-
 drivers/thermal/thermal_helpers.c     |  3 +--
 include/linux/thermal.h               |  3 ++-
 9 files changed, 51 insertions(+), 45 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/thermal/gov_bang_bang.c b/drivers/thermal/gov_bang_bang.c
index 49cdfaa3a927..6ddf0accdc98 100644
--- a/drivers/thermal/gov_bang_bang.c
+++ b/drivers/thermal/gov_bang_bang.c
@@ -13,9 +13,10 @@
 
 #include "thermal_core.h"
 
-static int thermal_zone_trip_update(struct thermal_zone_device *tz, int trip_index)
+static int thermal_zone_trip_update(struct thermal_zone_device *tz,
+				    const struct thermal_trip *trip)
 {
-	const struct thermal_trip *trip = &tz->trips[trip_index];
+	int trip_index = thermal_zone_trip_id(tz, trip);
 	struct thermal_instance *instance;
 
 	if (!trip->hysteresis)
@@ -89,7 +90,8 @@ static int thermal_zone_trip_update(struct thermal_zone_device *tz, int trip_ind
  *     (trip_temp - hyst) so that the fan gets turned off again.
  *
  */
-static int bang_bang_control(struct thermal_zone_device *tz, int trip)
+static int bang_bang_control(struct thermal_zone_device *tz,
+			     const struct thermal_trip *trip)
 {
 	struct thermal_instance *instance;
 	int ret;
diff --git a/drivers/thermal/gov_fair_share.c b/drivers/thermal/gov_fair_share.c
index 41effa87d3cd..538abb7de4e2 100644
--- a/drivers/thermal/gov_fair_share.c
+++ b/drivers/thermal/gov_fair_share.c
@@ -47,7 +47,7 @@ static long get_target_state(struct thermal_zone_device *tz,
 /**
  * fair_share_throttle - throttles devices associated with the given zone
  * @tz: thermal_zone_device
- * @trip_index: trip point index
+ * @trip: trip point
  *
  * Throttling Logic: This uses three parameters to calculate the new
  * throttle state of the cooling devices associated with the given zone.
@@ -63,9 +63,9 @@ static long get_target_state(struct thermal_zone_device *tz,
  *	(Heavily assumes the trip points are in ascending order)
  * new_state of cooling device = P3 * P2 * P1
  */
-static int fair_share_throttle(struct thermal_zone_device *tz, int trip_index)
+static int fair_share_throttle(struct thermal_zone_device *tz,
+			       const struct thermal_trip *trip)
 {
-	const struct thermal_trip *trip = &tz->trips[trip_index];
 	struct thermal_instance *instance;
 	int total_weight = 0;
 	int total_instance = 0;
diff --git a/drivers/thermal/gov_power_allocator.c b/drivers/thermal/gov_power_allocator.c
index c0d29d286374..83d4f451b1a9 100644
--- a/drivers/thermal/gov_power_allocator.c
+++ b/drivers/thermal/gov_power_allocator.c
@@ -676,10 +676,10 @@ static void power_allocator_unbind(struct thermal_zone_device *tz)
 	tz->governor_data = NULL;
 }
 
-static int power_allocator_throttle(struct thermal_zone_device *tz, int trip_index)
+static int power_allocator_throttle(struct thermal_zone_device *tz,
+				    const struct thermal_trip *trip)
 {
 	struct power_allocator_params *params = tz->governor_data;
-	const struct thermal_trip *trip = &tz->trips[trip_index];
 	bool update;
 
 	lockdep_assert_held(&tz->lock);
diff --git a/drivers/thermal/gov_step_wise.c b/drivers/thermal/gov_step_wise.c
index 8bc63a1d361e..5436aa58d41e 100644
--- a/drivers/thermal/gov_step_wise.c
+++ b/drivers/thermal/gov_step_wise.c
@@ -68,15 +68,16 @@ static unsigned long get_target_state(struct thermal_instance *instance,
 	return next_target;
 }
 
-static void thermal_zone_trip_update(struct thermal_zone_device *tz, int trip_id)
+static void thermal_zone_trip_update(struct thermal_zone_device *tz,
+				     const struct thermal_trip *trip)
 {
-	const struct thermal_trip *trip = &tz->trips[trip_id];
+	int trip_id = thermal_zone_trip_id(tz, trip);
 	enum thermal_trend trend;
 	struct thermal_instance *instance;
 	bool throttle = false;
 	int old_target;
 
-	trend = get_tz_trend(tz, trip_id);
+	trend = get_tz_trend(tz, trip);
 
 	if (tz->temperature >= trip->temperature) {
 		throttle = true;
@@ -120,7 +121,7 @@ static void thermal_zone_trip_update(struct thermal_zone_device *tz, int trip_id
 /**
  * step_wise_throttle - throttles devices associated with the given zone
  * @tz: thermal_zone_device
- * @trip: trip point index
+ * @trip: trip point
  *
  * Throttling Logic: This uses the trend of the thermal zone to throttle.
  * If the thermal zone is 'heating up' this throttles all the cooling
@@ -128,7 +129,8 @@ static void thermal_zone_trip_update(struct thermal_zone_device *tz, int trip_id
  * step. If the zone is 'cooling down' it brings back the performance of
  * the devices by one step.
  */
-static int step_wise_throttle(struct thermal_zone_device *tz, int trip)
+static int step_wise_throttle(struct thermal_zone_device *tz,
+			      const struct thermal_trip *trip)
 {
 	struct thermal_instance *instance;
 
diff --git a/drivers/thermal/gov_user_space.c b/drivers/thermal/gov_user_space.c
index 8bc1c22aaf03..7a1790b7e8f5 100644
--- a/drivers/thermal/gov_user_space.c
+++ b/drivers/thermal/gov_user_space.c
@@ -25,11 +25,12 @@ static int user_space_bind(struct thermal_zone_device *tz)
 /**
  * notify_user_space - Notifies user space about thermal events
  * @tz: thermal_zone_device
- * @trip: trip point index
+ * @trip: trip point
  *
  * This function notifies the user space through UEvents.
  */
-static int notify_user_space(struct thermal_zone_device *tz, int trip)
+static int notify_user_space(struct thermal_zone_device *tz,
+			     const struct thermal_trip *trip)
 {
 	char *thermal_prop[5];
 	int i;
@@ -38,7 +39,8 @@ static int notify_user_space(struct thermal_zone_device *tz, int trip)
 
 	thermal_prop[0] = kasprintf(GFP_KERNEL, "NAME=%s", tz->type);
 	thermal_prop[1] = kasprintf(GFP_KERNEL, "TEMP=%d", tz->temperature);
-	thermal_prop[2] = kasprintf(GFP_KERNEL, "TRIP=%d", trip);
+	thermal_prop[2] = kasprintf(GFP_KERNEL, "TRIP=%d",
+				    thermal_zone_trip_id(tz, trip));
 	thermal_prop[3] = kasprintf(GFP_KERNEL, "EVENT=%d", tz->notify_event);
 	thermal_prop[4] = NULL;
 	kobject_uevent_env(&tz->device.kobj, KOBJ_CHANGE, thermal_prop);
diff --git a/drivers/thermal/thermal_core.c b/drivers/thermal/thermal_core.c
index 2b345f25dfa4..9c17d35ccbbd 100644
--- a/drivers/thermal/thermal_core.c
+++ b/drivers/thermal/thermal_core.c
@@ -307,7 +307,8 @@ static void monitor_thermal_zone(struct thermal_zone_device *tz)
 		thermal_zone_device_set_polling(tz, tz->polling_delay_jiffies);
 }
 
-static void handle_non_critical_trips(struct thermal_zone_device *tz, int trip)
+static void handle_non_critical_trips(struct thermal_zone_device *tz,
+				      const struct thermal_trip *trip)
 {
 	tz->governor ? tz->governor->throttle(tz, trip) :
 		       def_governor->throttle(tz, trip);
@@ -329,44 +330,43 @@ void thermal_zone_device_critical(struct thermal_zone_device *tz)
 EXPORT_SYMBOL(thermal_zone_device_critical);
 
 static void handle_critical_trips(struct thermal_zone_device *tz,
-				  int trip, int trip_temp, enum thermal_trip_type trip_type)
+				  const struct thermal_trip *trip)
 {
 	/* If we have not crossed the trip_temp, we do not care. */
-	if (trip_temp <= 0 || tz->temperature < trip_temp)
+	if (trip->temperature <= 0 || tz->temperature < trip->temperature)
 		return;
 
-	trace_thermal_zone_trip(tz, trip, trip_type);
+	trace_thermal_zone_trip(tz, thermal_zone_trip_id(tz, trip), trip->type);
 
-	if (trip_type == THERMAL_TRIP_HOT && tz->ops->hot)
-		tz->ops->hot(tz);
-	else if (trip_type == THERMAL_TRIP_CRITICAL)
+	if (trip->type == THERMAL_TRIP_CRITICAL)
 		tz->ops->critical(tz);
+	else if (tz->ops->hot)
+		tz->ops->hot(tz);
 }
 
-static void handle_thermal_trip(struct thermal_zone_device *tz, int trip_id)
+static void handle_thermal_trip(struct thermal_zone_device *tz,
+				const struct thermal_trip *trip)
 {
-	struct thermal_trip trip;
-
-	__thermal_zone_get_trip(tz, trip_id, &trip);
-
-	if (trip.temperature == THERMAL_TEMP_INVALID)
+	if (trip->temperature == THERMAL_TEMP_INVALID)
 		return;
 
 	if (tz->last_temperature != THERMAL_TEMP_INVALID) {
-		if (tz->last_temperature < trip.temperature &&
-		    tz->temperature >= trip.temperature)
-			thermal_notify_tz_trip_up(tz->id, trip_id,
+		if (tz->last_temperature < trip->temperature &&
+		    tz->temperature >= trip->temperature)
+			thermal_notify_tz_trip_up(tz->id,
+						  thermal_zone_trip_id(tz, trip),
 						  tz->temperature);
-		if (tz->last_temperature >= trip.temperature &&
-		    tz->temperature < (trip.temperature - trip.hysteresis))
-			thermal_notify_tz_trip_down(tz->id, trip_id,
+		if (tz->last_temperature >= trip->temperature &&
+		    tz->temperature < trip->temperature - trip->hysteresis)
+			thermal_notify_tz_trip_down(tz->id,
+						    thermal_zone_trip_id(tz, trip),
 						    tz->temperature);
 	}
 
-	if (trip.type == THERMAL_TRIP_CRITICAL || trip.type == THERMAL_TRIP_HOT)
-		handle_critical_trips(tz, trip_id, trip.temperature, trip.type);
+	if (trip->type == THERMAL_TRIP_CRITICAL || trip->type == THERMAL_TRIP_HOT)
+		handle_critical_trips(tz, trip);
 	else
-		handle_non_critical_trips(tz, trip_id);
+		handle_non_critical_trips(tz, trip);
 }
 
 static void update_temperature(struct thermal_zone_device *tz)
@@ -403,7 +403,7 @@ static void thermal_zone_device_init(struct thermal_zone_device *tz)
 void __thermal_zone_device_update(struct thermal_zone_device *tz,
 				  enum thermal_notify_event event)
 {
-	int count;
+	const struct thermal_trip *trip;
 
 	if (atomic_read(&in_suspend))
 		return;
@@ -422,8 +422,8 @@ void __thermal_zone_device_update(struct thermal_zone_device *tz,
 
 	tz->notify_event = event;
 
-	for (count = 0; count < tz->num_trips; count++)
-		handle_thermal_trip(tz, count);
+	for_each_trip(tz, trip)
+		handle_thermal_trip(tz, trip);
 
 	monitor_thermal_zone(tz);
 }
diff --git a/drivers/thermal/thermal_core.h b/drivers/thermal/thermal_core.h
index 4702d3a152f9..0a3b3ec5120b 100644
--- a/drivers/thermal/thermal_core.h
+++ b/drivers/thermal/thermal_core.h
@@ -70,7 +70,7 @@ static inline bool cdev_is_power_actor(struct thermal_cooling_device *cdev)
 void thermal_cdev_update(struct thermal_cooling_device *);
 void __thermal_cdev_update(struct thermal_cooling_device *cdev);
 
-int get_tz_trend(struct thermal_zone_device *tz, int trip_index);
+int get_tz_trend(struct thermal_zone_device *tz, const struct thermal_trip *trip);
 
 struct thermal_instance *
 get_thermal_instance(struct thermal_zone_device *tz,
diff --git a/drivers/thermal/thermal_helpers.c b/drivers/thermal/thermal_helpers.c
index c1d0af73c85d..69e8ea4aa908 100644
--- a/drivers/thermal/thermal_helpers.c
+++ b/drivers/thermal/thermal_helpers.c
@@ -22,9 +22,8 @@
 #include "thermal_core.h"
 #include "thermal_trace.h"
 
-int get_tz_trend(struct thermal_zone_device *tz, int trip_index)
+int get_tz_trend(struct thermal_zone_device *tz, const struct thermal_trip *trip)
 {
-	struct thermal_trip *trip = tz->trips ? &tz->trips[trip_index] : NULL;
 	enum thermal_trend trend;
 
 	if (tz->emul_temperature || !tz->ops->get_trend ||
diff --git a/include/linux/thermal.h b/include/linux/thermal.h
index c8600e313909..cee814d5d1ac 100644
--- a/include/linux/thermal.h
+++ b/include/linux/thermal.h
@@ -199,7 +199,8 @@ struct thermal_governor {
 	char name[THERMAL_NAME_LENGTH];
 	int (*bind_to_tz)(struct thermal_zone_device *tz);
 	void (*unbind_from_tz)(struct thermal_zone_device *tz);
-	int (*throttle)(struct thermal_zone_device *tz, int trip);
+	int (*throttle)(struct thermal_zone_device *tz,
+			const struct thermal_trip *trip);
 	struct list_head	governor_list;
 };
 
-- 
cgit v1.2.3


From 845e31e1101fc8533be52aff42d8f1ff48636024 Mon Sep 17 00:00:00 2001
From: Jonathan Corbet <corbet@lwn.net>
Date: Fri, 20 Oct 2023 14:38:49 -0600
Subject: seq_buf: fix a misleading comment

The comment for seq_buf_has_overflowed() says that an overflow condition is
marked by len == size, but that's not what the code is testing.  Make the
comment match reality.

Link: https://lkml.kernel.org/r/87pm19kp0m.fsf@meer.lwn.net

Fixes: 8cd709ae7658a ("tracing: Have seq_buf use full buffer")
Signed-off-by: Jonathan Corbet <corbet@lwn.net>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
 include/linux/seq_buf.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/seq_buf.h b/include/linux/seq_buf.h
index a0fb013cebdf..8483e4b2d0d2 100644
--- a/include/linux/seq_buf.h
+++ b/include/linux/seq_buf.h
@@ -36,7 +36,7 @@ seq_buf_init(struct seq_buf *s, char *buf, unsigned int size)
 
 /*
  * seq_buf have a buffer that might overflow. When this happens
- * the len and size are set to be equal.
+ * len is set to be greater than size.
  */
 static inline bool
 seq_buf_has_overflowed(struct seq_buf *s)
-- 
cgit v1.2.3


From b460bc8302f222d346f0c15bba980eb8c36d6278 Mon Sep 17 00:00:00 2001
From: Hou Tao <houtao1@huawei.com>
Date: Fri, 20 Oct 2023 21:31:57 +0800
Subject: mm/percpu.c: introduce pcpu_alloc_size()

Introduce pcpu_alloc_size() to get the size of the dynamic per-cpu
area. It will be used by bpf memory allocator in the following patches.
BPF memory allocator maintains per-cpu area caches for multiple area
sizes and its free API only has the to-be-freed per-cpu pointer, so it
needs the size of dynamic per-cpu area to select the corresponding cache
when bpf program frees the dynamic per-cpu pointer.

Acked-by: Dennis Zhou <dennis@kernel.org>
Signed-off-by: Hou Tao <houtao1@huawei.com>
Link: https://lore.kernel.org/r/20231020133202.4043247-3-houtao@huaweicloud.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/percpu.h |  1 +
 mm/percpu.c            | 31 +++++++++++++++++++++++++++++++
 2 files changed, 32 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/percpu.h b/include/linux/percpu.h
index 68fac2e7cbe6..8c677f185901 100644
--- a/include/linux/percpu.h
+++ b/include/linux/percpu.h
@@ -132,6 +132,7 @@ extern void __init setup_per_cpu_areas(void);
 extern void __percpu *__alloc_percpu_gfp(size_t size, size_t align, gfp_t gfp) __alloc_size(1);
 extern void __percpu *__alloc_percpu(size_t size, size_t align) __alloc_size(1);
 extern void free_percpu(void __percpu *__pdata);
+extern size_t pcpu_alloc_size(void __percpu *__pdata);
 
 DEFINE_FREE(free_percpu, void __percpu *, free_percpu(_T))
 
diff --git a/mm/percpu.c b/mm/percpu.c
index ea607078368d..60ed078e4cd0 100644
--- a/mm/percpu.c
+++ b/mm/percpu.c
@@ -2244,6 +2244,37 @@ static void pcpu_balance_workfn(struct work_struct *work)
 	mutex_unlock(&pcpu_alloc_mutex);
 }
 
+/**
+ * pcpu_alloc_size - the size of the dynamic percpu area
+ * @ptr: pointer to the dynamic percpu area
+ *
+ * Returns the size of the @ptr allocation.  This is undefined for statically
+ * defined percpu variables as there is no corresponding chunk->bound_map.
+ *
+ * RETURNS:
+ * The size of the dynamic percpu area.
+ *
+ * CONTEXT:
+ * Can be called from atomic context.
+ */
+size_t pcpu_alloc_size(void __percpu *ptr)
+{
+	struct pcpu_chunk *chunk;
+	unsigned long bit_off, end;
+	void *addr;
+
+	if (!ptr)
+		return 0;
+
+	addr = __pcpu_ptr_to_addr(ptr);
+	/* No pcpu_lock here: ptr has not been freed, so chunk is still alive */
+	chunk = pcpu_chunk_addr_search(addr);
+	bit_off = (addr - chunk->base_addr) / PCPU_MIN_ALLOC_SIZE;
+	end = find_next_bit(chunk->bound_map, pcpu_chunk_map_bits(chunk),
+			    bit_off + 1);
+	return (end - bit_off) * PCPU_MIN_ALLOC_SIZE;
+}
+
 /**
  * free_percpu - free percpu area
  * @ptr: pointer to area to free
-- 
cgit v1.2.3


From 3f2189e4f77b7a3e979d143dc4ff586488c7e8a5 Mon Sep 17 00:00:00 2001
From: Hou Tao <houtao1@huawei.com>
Date: Fri, 20 Oct 2023 21:31:59 +0800
Subject: bpf: Use pcpu_alloc_size() in bpf_mem_free{_rcu}()

For bpf_global_percpu_ma, the pointer passed to bpf_mem_free_rcu() is
allocated by kmalloc() and its size is fixed (16-bytes on x86-64). So
no matter which cache allocates the dynamic per-cpu area, on x86-64
cache[2] will always be used to free the per-cpu area.

Fix the unbalance by checking whether the bpf memory allocator is
per-cpu or not and use pcpu_alloc_size() instead of ksize() to
find the correct cache for per-cpu free.

Signed-off-by: Hou Tao <houtao1@huawei.com>
Link: https://lore.kernel.org/r/20231020133202.4043247-5-houtao@huaweicloud.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/bpf_mem_alloc.h |  1 +
 kernel/bpf/memalloc.c         | 16 ++++++++++++++--
 2 files changed, 15 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/bpf_mem_alloc.h b/include/linux/bpf_mem_alloc.h
index d644bbb298af..bb1223b21308 100644
--- a/include/linux/bpf_mem_alloc.h
+++ b/include/linux/bpf_mem_alloc.h
@@ -11,6 +11,7 @@ struct bpf_mem_caches;
 struct bpf_mem_alloc {
 	struct bpf_mem_caches __percpu *caches;
 	struct bpf_mem_cache __percpu *cache;
+	bool percpu;
 	struct work_struct work;
 };
 
diff --git a/kernel/bpf/memalloc.c b/kernel/bpf/memalloc.c
index 776bdf5ffd80..5308e386380a 100644
--- a/kernel/bpf/memalloc.c
+++ b/kernel/bpf/memalloc.c
@@ -525,6 +525,7 @@ int bpf_mem_alloc_init(struct bpf_mem_alloc *ma, int size, bool percpu)
 	/* room for llist_node and per-cpu pointer */
 	if (percpu)
 		percpu_size = LLIST_NODE_SZ + sizeof(void *);
+	ma->percpu = percpu;
 
 	if (size) {
 		pc = __alloc_percpu_gfp(sizeof(*pc), 8, GFP_KERNEL);
@@ -874,6 +875,17 @@ void notrace *bpf_mem_alloc(struct bpf_mem_alloc *ma, size_t size)
 	return !ret ? NULL : ret + LLIST_NODE_SZ;
 }
 
+static notrace int bpf_mem_free_idx(void *ptr, bool percpu)
+{
+	size_t size;
+
+	if (percpu)
+		size = pcpu_alloc_size(*((void **)ptr));
+	else
+		size = ksize(ptr - LLIST_NODE_SZ);
+	return bpf_mem_cache_idx(size);
+}
+
 void notrace bpf_mem_free(struct bpf_mem_alloc *ma, void *ptr)
 {
 	int idx;
@@ -881,7 +893,7 @@ void notrace bpf_mem_free(struct bpf_mem_alloc *ma, void *ptr)
 	if (!ptr)
 		return;
 
-	idx = bpf_mem_cache_idx(ksize(ptr - LLIST_NODE_SZ));
+	idx = bpf_mem_free_idx(ptr, ma->percpu);
 	if (idx < 0)
 		return;
 
@@ -895,7 +907,7 @@ void notrace bpf_mem_free_rcu(struct bpf_mem_alloc *ma, void *ptr)
 	if (!ptr)
 		return;
 
-	idx = bpf_mem_cache_idx(ksize(ptr - LLIST_NODE_SZ));
+	idx = bpf_mem_free_idx(ptr, ma->percpu);
 	if (idx < 0)
 		return;
 
-- 
cgit v1.2.3


From e581a3461de3f129cfe888a67d9f31086328271f Mon Sep 17 00:00:00 2001
From: Hou Tao <houtao1@huawei.com>
Date: Fri, 20 Oct 2023 21:32:00 +0800
Subject: bpf: Move the declaration of __bpf_obj_drop_impl() to bpf.h

both syscall.c and helpers.c have the declaration of
__bpf_obj_drop_impl(), so just move it to a common header file.

Signed-off-by: Hou Tao <houtao1@huawei.com>
Link: https://lore.kernel.org/r/20231020133202.4043247-6-houtao@huaweicloud.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/bpf.h  | 1 +
 kernel/bpf/helpers.c | 2 --
 kernel/bpf/syscall.c | 2 --
 3 files changed, 1 insertion(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index b4b40b45962b..ebd412179771 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -2058,6 +2058,7 @@ struct btf_record *btf_record_dup(const struct btf_record *rec);
 bool btf_record_equal(const struct btf_record *rec_a, const struct btf_record *rec_b);
 void bpf_obj_free_timer(const struct btf_record *rec, void *obj);
 void bpf_obj_free_fields(const struct btf_record *rec, void *obj);
+void __bpf_obj_drop_impl(void *p, const struct btf_record *rec);
 
 struct bpf_map *bpf_map_get(u32 ufd);
 struct bpf_map *bpf_map_get_with_uref(u32 ufd);
diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
index da058aead20c..c814bb44d2d1 100644
--- a/kernel/bpf/helpers.c
+++ b/kernel/bpf/helpers.c
@@ -1811,8 +1811,6 @@ bpf_base_func_proto(enum bpf_func_id func_id)
 	}
 }
 
-void __bpf_obj_drop_impl(void *p, const struct btf_record *rec);
-
 void bpf_list_head_free(const struct btf_field *field, void *list_head,
 			struct bpf_spin_lock *spin_lock)
 {
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 341f8cb4405c..69998f84f7c8 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -626,8 +626,6 @@ void bpf_obj_free_timer(const struct btf_record *rec, void *obj)
 	bpf_timer_cancel_and_free(obj + rec->timer_off);
 }
 
-extern void __bpf_obj_drop_impl(void *p, const struct btf_record *rec);
-
 void bpf_obj_free_fields(const struct btf_record *rec, void *obj)
 {
 	const struct btf_field *fields;
-- 
cgit v1.2.3


From e383a45902337356d9ccad797094a27c6b2150f9 Mon Sep 17 00:00:00 2001
From: Hou Tao <houtao1@huawei.com>
Date: Fri, 20 Oct 2023 21:32:01 +0800
Subject: bpf: Use bpf_global_percpu_ma for per-cpu kptr in
 __bpf_obj_drop_impl()

The following warning was reported when running "./test_progs -t
test_bpf_ma/percpu_free_through_map_free":

  ------------[ cut here ]------------
  WARNING: CPU: 1 PID: 68 at kernel/bpf/memalloc.c:342
  CPU: 1 PID: 68 Comm: kworker/u16:2 Not tainted 6.6.0-rc2+ #222
  Hardware name: QEMU Standard PC (i440FX + PIIX, 1996)
  Workqueue: events_unbound bpf_map_free_deferred
  RIP: 0010:bpf_mem_refill+0x21c/0x2a0
  ......
  Call Trace:
   <IRQ>
   ? bpf_mem_refill+0x21c/0x2a0
   irq_work_single+0x27/0x70
   irq_work_run_list+0x2a/0x40
   irq_work_run+0x18/0x40
   __sysvec_irq_work+0x1c/0xc0
   sysvec_irq_work+0x73/0x90
   </IRQ>
   <TASK>
   asm_sysvec_irq_work+0x1b/0x20
  RIP: 0010:unit_free+0x50/0x80
   ......
   bpf_mem_free+0x46/0x60
   __bpf_obj_drop_impl+0x40/0x90
   bpf_obj_free_fields+0x17d/0x1a0
   array_map_free+0x6b/0x170
   bpf_map_free_deferred+0x54/0xa0
   process_scheduled_works+0xba/0x370
   worker_thread+0x16d/0x2e0
   kthread+0x105/0x140
   ret_from_fork+0x39/0x60
   ret_from_fork_asm+0x1b/0x30
   </TASK>
  ---[ end trace 0000000000000000 ]---

The reason is simple: __bpf_obj_drop_impl() does not know the freeing
field is a per-cpu pointer and it uses bpf_global_ma to free the
pointer. Because bpf_global_ma is not a per-cpu allocator, so ksize() is
used to select the corresponding cache. The bpf_mem_cache with 16-bytes
unit_size will always be selected to do the unmatched free and it will
trigger the warning in free_bulk() eventually.

Because per-cpu kptr doesn't support list or rb-tree now, so fix the
problem by only checking whether or not the type of kptr is per-cpu in
bpf_obj_free_fields(), and using bpf_global_percpu_ma to these kptrs.

Signed-off-by: Hou Tao <houtao1@huawei.com>
Link: https://lore.kernel.org/r/20231020133202.4043247-7-houtao@huaweicloud.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/bpf.h  |  2 +-
 kernel/bpf/helpers.c | 22 ++++++++++++++--------
 kernel/bpf/syscall.c |  4 ++--
 3 files changed, 17 insertions(+), 11 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index ebd412179771..b4825d3cdb29 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -2058,7 +2058,7 @@ struct btf_record *btf_record_dup(const struct btf_record *rec);
 bool btf_record_equal(const struct btf_record *rec_a, const struct btf_record *rec_b);
 void bpf_obj_free_timer(const struct btf_record *rec, void *obj);
 void bpf_obj_free_fields(const struct btf_record *rec, void *obj);
-void __bpf_obj_drop_impl(void *p, const struct btf_record *rec);
+void __bpf_obj_drop_impl(void *p, const struct btf_record *rec, bool percpu);
 
 struct bpf_map *bpf_map_get(u32 ufd);
 struct bpf_map *bpf_map_get_with_uref(u32 ufd);
diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
index c814bb44d2d1..e46ac288a108 100644
--- a/kernel/bpf/helpers.c
+++ b/kernel/bpf/helpers.c
@@ -1842,7 +1842,7 @@ unlock:
 		 * bpf_list_head which needs to be freed.
 		 */
 		migrate_disable();
-		__bpf_obj_drop_impl(obj, field->graph_root.value_rec);
+		__bpf_obj_drop_impl(obj, field->graph_root.value_rec, false);
 		migrate_enable();
 	}
 }
@@ -1881,7 +1881,7 @@ void bpf_rb_root_free(const struct btf_field *field, void *rb_root,
 
 
 		migrate_disable();
-		__bpf_obj_drop_impl(obj, field->graph_root.value_rec);
+		__bpf_obj_drop_impl(obj, field->graph_root.value_rec, false);
 		migrate_enable();
 	}
 }
@@ -1913,8 +1913,10 @@ __bpf_kfunc void *bpf_percpu_obj_new_impl(u64 local_type_id__k, void *meta__ign)
 }
 
 /* Must be called under migrate_disable(), as required by bpf_mem_free */
-void __bpf_obj_drop_impl(void *p, const struct btf_record *rec)
+void __bpf_obj_drop_impl(void *p, const struct btf_record *rec, bool percpu)
 {
+	struct bpf_mem_alloc *ma;
+
 	if (rec && rec->refcount_off >= 0 &&
 	    !refcount_dec_and_test((refcount_t *)(p + rec->refcount_off))) {
 		/* Object is refcounted and refcount_dec didn't result in 0
@@ -1926,10 +1928,14 @@ void __bpf_obj_drop_impl(void *p, const struct btf_record *rec)
 	if (rec)
 		bpf_obj_free_fields(rec, p);
 
+	if (percpu)
+		ma = &bpf_global_percpu_ma;
+	else
+		ma = &bpf_global_ma;
 	if (rec && rec->refcount_off >= 0)
-		bpf_mem_free_rcu(&bpf_global_ma, p);
+		bpf_mem_free_rcu(ma, p);
 	else
-		bpf_mem_free(&bpf_global_ma, p);
+		bpf_mem_free(ma, p);
 }
 
 __bpf_kfunc void bpf_obj_drop_impl(void *p__alloc, void *meta__ign)
@@ -1937,7 +1943,7 @@ __bpf_kfunc void bpf_obj_drop_impl(void *p__alloc, void *meta__ign)
 	struct btf_struct_meta *meta = meta__ign;
 	void *p = p__alloc;
 
-	__bpf_obj_drop_impl(p, meta ? meta->record : NULL);
+	__bpf_obj_drop_impl(p, meta ? meta->record : NULL, false);
 }
 
 __bpf_kfunc void bpf_percpu_obj_drop_impl(void *p__alloc, void *meta__ign)
@@ -1981,7 +1987,7 @@ static int __bpf_list_add(struct bpf_list_node_kern *node,
 	 */
 	if (cmpxchg(&node->owner, NULL, BPF_PTR_POISON)) {
 		/* Only called from BPF prog, no need to migrate_disable */
-		__bpf_obj_drop_impl((void *)n - off, rec);
+		__bpf_obj_drop_impl((void *)n - off, rec, false);
 		return -EINVAL;
 	}
 
@@ -2080,7 +2086,7 @@ static int __bpf_rbtree_add(struct bpf_rb_root *root,
 	 */
 	if (cmpxchg(&node->owner, NULL, BPF_PTR_POISON)) {
 		/* Only called from BPF prog, no need to migrate_disable */
-		__bpf_obj_drop_impl((void *)n - off, rec);
+		__bpf_obj_drop_impl((void *)n - off, rec, false);
 		return -EINVAL;
 	}
 
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 69998f84f7c8..a9b2cb500bf7 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -660,8 +660,8 @@ void bpf_obj_free_fields(const struct btf_record *rec, void *obj)
 									   field->kptr.btf_id);
 				migrate_disable();
 				__bpf_obj_drop_impl(xchgd_field, pointee_struct_meta ?
-								 pointee_struct_meta->record :
-								 NULL);
+								 pointee_struct_meta->record : NULL,
+								 fields[i].type == BPF_KPTR_PERCPU);
 				migrate_enable();
 			} else {
 				field->kptr.dtor(xchgd_field);
-- 
cgit v1.2.3


From 1fa05877588a02fdad995385eaafea4798022cd4 Mon Sep 17 00:00:00 2001
From: Umang Jain <umang.jain@ideasonboard.com>
Date: Thu, 19 Oct 2023 14:31:26 +0530
Subject: staging: vc04_services: Support module autoloading using
 MODULE_DEVICE_TABLE

VC04 has now a independent bus vchiq_bus to register its devices.
However, the module auto-loading for bcm2835-audio and bcm2835-camera
currently happens through MODULE_ALIAS() macro specified explicitly.

The correct way to auto-load a module, is when the alias is picked
out from MODULE_DEVICE_TABLE(). In order to get there, we need to
introduce  vchiq_device_id and add relevant entries in file2alias.c
infrastructure so that aliases can be generated. This patch targets
adding vchiq_device_id and do_vchiq_entry, in order to
generate those alias using the /script/mod/file2alias.c.

Going forward the MODULE_ALIAS() from bcm2835-camera and bcm2835-audio
will be dropped, in favour of MODULE_DEVICE_TABLE being used there.

The alias format for vchiq_bus devices will be "vchiq:<dev_name>".
Adjust the vchiq_bus_uevent() to reflect that.

Signed-off-by: Umang Jain <umang.jain@ideasonboard.com>
Reviewed-by: Laurent Pinchart <laurent.pinchart@ideasonboard.com>
Reviewed-by: Kieran Bingham <kieran.bingham@ideasonboard.com>
Link: https://lore.kernel.org/r/20231019090128.430297-2-umang.jain@ideasonboard.com
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/staging/vc04_services/interface/vchiq_arm/vchiq_bus.c | 2 +-
 drivers/staging/vc04_services/interface/vchiq_arm/vchiq_bus.h | 3 +++
 include/linux/mod_devicetable.h                               | 4 ++++
 scripts/mod/devicetable-offsets.c                             | 3 +++
 scripts/mod/file2alias.c                                      | 9 +++++++++
 5 files changed, 20 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/drivers/staging/vc04_services/interface/vchiq_arm/vchiq_bus.c b/drivers/staging/vc04_services/interface/vchiq_arm/vchiq_bus.c
index 0076d091b9b2..933027e0011e 100644
--- a/drivers/staging/vc04_services/interface/vchiq_arm/vchiq_bus.c
+++ b/drivers/staging/vc04_services/interface/vchiq_arm/vchiq_bus.c
@@ -26,7 +26,7 @@ static int vchiq_bus_uevent(const struct device *dev, struct kobj_uevent_env *en
 {
 	const struct vchiq_device *device = container_of_const(dev, struct vchiq_device, dev);
 
-	return add_uevent_var(env, "MODALIAS=%s", dev_name(&device->dev));
+	return add_uevent_var(env, "MODALIAS=vchiq:%s", dev_name(&device->dev));
 }
 
 static int vchiq_bus_probe(struct device *dev)
diff --git a/drivers/staging/vc04_services/interface/vchiq_arm/vchiq_bus.h b/drivers/staging/vc04_services/interface/vchiq_arm/vchiq_bus.h
index 7eaaf9a91cda..caa6fdf25bb1 100644
--- a/drivers/staging/vc04_services/interface/vchiq_arm/vchiq_bus.h
+++ b/drivers/staging/vc04_services/interface/vchiq_arm/vchiq_bus.h
@@ -7,6 +7,7 @@
 #define _VCHIQ_DEVICE_H
 
 #include <linux/device.h>
+#include <linux/mod_devicetable.h>
 
 struct vchiq_device {
 	struct device dev;
@@ -18,6 +19,8 @@ struct vchiq_driver {
 	int		(*resume)(struct vchiq_device *device);
 	int		(*suspend)(struct vchiq_device *device,
 				   pm_message_t state);
+
+	const struct vchiq_device_id *id_table;
 	struct device_driver driver;
 };
 
diff --git a/include/linux/mod_devicetable.h b/include/linux/mod_devicetable.h
index b0678b093cb2..700cbf34c3b8 100644
--- a/include/linux/mod_devicetable.h
+++ b/include/linux/mod_devicetable.h
@@ -946,4 +946,8 @@ struct cdx_device_id {
 	__u32 override_only;
 };
 
+struct vchiq_device_id {
+	char name[32];
+};
+
 #endif /* LINUX_MOD_DEVICETABLE_H */
diff --git a/scripts/mod/devicetable-offsets.c b/scripts/mod/devicetable-offsets.c
index abe65f8968dd..e8aadc780da5 100644
--- a/scripts/mod/devicetable-offsets.c
+++ b/scripts/mod/devicetable-offsets.c
@@ -267,5 +267,8 @@ int main(void)
 	DEVID_FIELD(cdx_device_id, device);
 	DEVID_FIELD(cdx_device_id, override_only);
 
+	DEVID(vchiq_device_id);
+	DEVID_FIELD(vchiq_device_id, name);
+
 	return 0;
 }
diff --git a/scripts/mod/file2alias.c b/scripts/mod/file2alias.c
index 38120f932b0d..78dfae5a8720 100644
--- a/scripts/mod/file2alias.c
+++ b/scripts/mod/file2alias.c
@@ -1478,6 +1478,14 @@ static int do_cdx_entry(const char *filename, void *symval,
 	return 1;
 }
 
+static int do_vchiq_entry(const char *filename, void *symval, char *alias)
+{
+	DEF_FIELD_ADDR(symval, vchiq_device_id, name);
+	sprintf(alias, "vchiq:%s", *name);
+
+	return 1;
+}
+
 /* Does namelen bytes of name exactly match the symbol? */
 static bool sym_is(const char *name, unsigned namelen, const char *symbol)
 {
@@ -1558,6 +1566,7 @@ static const struct devtable devtable[] = {
 	{"dfl", SIZE_dfl_device_id, do_dfl_entry},
 	{"ishtp", SIZE_ishtp_device_id, do_ishtp_entry},
 	{"cdx", SIZE_cdx_device_id, do_cdx_entry},
+	{"vchiq", SIZE_vchiq_device_id, do_vchiq_entry},
 };
 
 /* Create MODULE_ALIAS() statements.
-- 
cgit v1.2.3


From 2978cc1f285390c1bd4d9bfc665747adc6e4b19c Mon Sep 17 00:00:00 2001
From: Tomer Maimon <tmaimon77@gmail.com>
Date: Tue, 17 Oct 2023 22:59:01 +0300
Subject: usb: chipidea: add CI_HDRC_FORCE_VBUS_ACTIVE_ALWAYS flag

Adding CI_HDRC_FORCE_VBUS_ACTIVE_ALWAYS flag to modify the vbus_active
parameter to active in case the ChipIdea USB IP role is device-only and
there is no otgsc register.

Signed-off-by: Tomer Maimon <tmaimon77@gmail.com>
Acked-by: Peter Chen <peter.chen@kernel.org>
Link: https://lore.kernel.org/r/20231017195903.1665260-2-tmaimon77@gmail.com
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/usb/chipidea/otg.c   | 5 ++++-
 include/linux/usb/chipidea.h | 1 +
 2 files changed, 5 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/drivers/usb/chipidea/otg.c b/drivers/usb/chipidea/otg.c
index f5490f2a5b6b..647e98f4e351 100644
--- a/drivers/usb/chipidea/otg.c
+++ b/drivers/usb/chipidea/otg.c
@@ -130,8 +130,11 @@ enum ci_role ci_otg_role(struct ci_hdrc *ci)
 
 void ci_handle_vbus_change(struct ci_hdrc *ci)
 {
-	if (!ci->is_otg)
+	if (!ci->is_otg) {
+		if (ci->platdata->flags & CI_HDRC_FORCE_VBUS_ACTIVE_ALWAYS)
+			usb_gadget_vbus_connect(&ci->gadget);
 		return;
+	}
 
 	if (hw_read_otgsc(ci, OTGSC_BSV) && !ci->vbus_active)
 		usb_gadget_vbus_connect(&ci->gadget);
diff --git a/include/linux/usb/chipidea.h b/include/linux/usb/chipidea.h
index 0b4f2d5faa08..5a7f96684ea2 100644
--- a/include/linux/usb/chipidea.h
+++ b/include/linux/usb/chipidea.h
@@ -64,6 +64,7 @@ struct ci_hdrc_platform_data {
 #define CI_HDRC_PMQOS			BIT(15)
 #define CI_HDRC_PHY_VBUS_CONTROL	BIT(16)
 #define CI_HDRC_HAS_PORTSC_PEC_MISSED	BIT(17)
+#define CI_HDRC_FORCE_VBUS_ACTIVE_ALWAYS	BIT(18)
 	enum usb_dr_mode	dr_mode;
 #define CI_HDRC_CONTROLLER_RESET_EVENT		0
 #define CI_HDRC_CONTROLLER_STOPPED_EVENT	1
-- 
cgit v1.2.3


From 2cc3b37f5b6df8189d55d0e812d9658ce256dfec Mon Sep 17 00:00:00 2001
From: Rafał Miłecki <rafal@milecki.pl>
Date: Fri, 20 Oct 2023 11:55:41 +0100
Subject: nvmem: add explicit config option to read old syntax fixed OF cells
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Binding for fixed NVMEM cells defined directly as NVMEM device subnodes
has been deprecated. It has been replaced by the "fixed-layout" NVMEM
layout binding.

New syntax is meant to be clearer and should help avoiding imprecise
bindings.

NVMEM subsystem already supports the new binding. It should be a good
idea to limit support for old syntax to existing drivers that actually
support & use it (we can't break backward compatibility!). That way we
additionally encourage new bindings & drivers to ignore deprecated
binding.

It wasn't clear (to me) if rtc and w1 code actually uses old syntax
fixed cells. I enabled them to don't risk any breakage.

Signed-off-by: Rafał Miłecki <rafal@milecki.pl>
[for meson-{efuse,mx-efuse}.c]
Acked-by: Martin Blumenstingl <martin.blumenstingl@googlemail.com>
[for mtk-efuse.c, nvmem/core.c, nvmem-provider.h]
Reviewed-by: AngeloGioacchino Del Regno <angelogioacchino.delregno@collabora.com>
[MT8192, MT8195 Chromebooks]
Tested-by: AngeloGioacchino Del Regno <angelogioacchino.delregno@collabora.com>
[for microchip-otpc.c]
Reviewed-by: Claudiu Beznea <claudiu.beznea@microchip.com>
[SAMA7G5-EK]
Tested-by: Claudiu Beznea <claudiu.beznea@microchip.com>
Acked-by: Jernej Skrabec <jernej.skrabec@gmail.com>
Signed-off-by: Srinivas Kandagatla <srinivas.kandagatla@linaro.org>
Link: https://lore.kernel.org/r/20231020105545.216052-3-srinivas.kandagatla@linaro.org
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/mtd/mtdcore.c          | 2 ++
 drivers/nvmem/apple-efuses.c   | 1 +
 drivers/nvmem/core.c           | 8 +++++---
 drivers/nvmem/imx-ocotp-scu.c  | 1 +
 drivers/nvmem/imx-ocotp.c      | 1 +
 drivers/nvmem/meson-efuse.c    | 1 +
 drivers/nvmem/meson-mx-efuse.c | 1 +
 drivers/nvmem/microchip-otpc.c | 1 +
 drivers/nvmem/mtk-efuse.c      | 1 +
 drivers/nvmem/qcom-spmi-sdam.c | 1 +
 drivers/nvmem/qfprom.c         | 1 +
 drivers/nvmem/rave-sp-eeprom.c | 1 +
 drivers/nvmem/rockchip-efuse.c | 1 +
 drivers/nvmem/sc27xx-efuse.c   | 1 +
 drivers/nvmem/sec-qfprom.c     | 1 +
 drivers/nvmem/sprd-efuse.c     | 1 +
 drivers/nvmem/stm32-romem.c    | 1 +
 drivers/nvmem/sunplus-ocotp.c  | 1 +
 drivers/nvmem/sunxi_sid.c      | 1 +
 drivers/nvmem/uniphier-efuse.c | 1 +
 drivers/nvmem/zynqmp_nvmem.c   | 1 +
 drivers/rtc/nvmem.c            | 1 +
 drivers/w1/slaves/w1_ds250x.c  | 1 +
 include/linux/nvmem-provider.h | 2 ++
 24 files changed, 30 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/mtd/mtdcore.c b/drivers/mtd/mtdcore.c
index 9bd661be3ae9..fbf60d1364f0 100644
--- a/drivers/mtd/mtdcore.c
+++ b/drivers/mtd/mtdcore.c
@@ -552,6 +552,7 @@ static int mtd_nvmem_add(struct mtd_info *mtd)
 	config.dev = &mtd->dev;
 	config.name = dev_name(&mtd->dev);
 	config.owner = THIS_MODULE;
+	config.add_legacy_fixed_of_cells = of_device_is_compatible(node, "nvmem-cells");
 	config.reg_read = mtd_nvmem_reg_read;
 	config.size = mtd->size;
 	config.word_size = 1;
@@ -898,6 +899,7 @@ static struct nvmem_device *mtd_otp_nvmem_register(struct mtd_info *mtd,
 	config.name = compatible;
 	config.id = NVMEM_DEVID_AUTO;
 	config.owner = THIS_MODULE;
+	config.add_legacy_fixed_of_cells = true;
 	config.type = NVMEM_TYPE_OTP;
 	config.root_only = true;
 	config.ignore_wp = true;
diff --git a/drivers/nvmem/apple-efuses.c b/drivers/nvmem/apple-efuses.c
index 9b7c87102104..d3d49d22338b 100644
--- a/drivers/nvmem/apple-efuses.c
+++ b/drivers/nvmem/apple-efuses.c
@@ -36,6 +36,7 @@ static int apple_efuses_probe(struct platform_device *pdev)
 	struct resource *res;
 	struct nvmem_config config = {
 		.dev = &pdev->dev,
+		.add_legacy_fixed_of_cells = true,
 		.read_only = true,
 		.reg_read = apple_efuses_read,
 		.stride = sizeof(u32),
diff --git a/drivers/nvmem/core.c b/drivers/nvmem/core.c
index eaf6a3fe8ca6..2710943f53c4 100644
--- a/drivers/nvmem/core.c
+++ b/drivers/nvmem/core.c
@@ -997,9 +997,11 @@ struct nvmem_device *nvmem_register(const struct nvmem_config *config)
 	if (rval)
 		goto err_remove_cells;
 
-	rval = nvmem_add_cells_from_legacy_of(nvmem);
-	if (rval)
-		goto err_remove_cells;
+	if (config->add_legacy_fixed_of_cells) {
+		rval = nvmem_add_cells_from_legacy_of(nvmem);
+		if (rval)
+			goto err_remove_cells;
+	}
 
 	rval = nvmem_add_cells_from_fixed_layout(nvmem);
 	if (rval)
diff --git a/drivers/nvmem/imx-ocotp-scu.c b/drivers/nvmem/imx-ocotp-scu.c
index c38d9c1c3f48..517d83e11af2 100644
--- a/drivers/nvmem/imx-ocotp-scu.c
+++ b/drivers/nvmem/imx-ocotp-scu.c
@@ -220,6 +220,7 @@ static int imx_scu_ocotp_write(void *context, unsigned int offset,
 
 static struct nvmem_config imx_scu_ocotp_nvmem_config = {
 	.name = "imx-scu-ocotp",
+	.add_legacy_fixed_of_cells = true,
 	.read_only = false,
 	.word_size = 4,
 	.stride = 1,
diff --git a/drivers/nvmem/imx-ocotp.c b/drivers/nvmem/imx-ocotp.c
index a223d9537f22..434f197e27bf 100644
--- a/drivers/nvmem/imx-ocotp.c
+++ b/drivers/nvmem/imx-ocotp.c
@@ -615,6 +615,7 @@ static int imx_ocotp_probe(struct platform_device *pdev)
 		return PTR_ERR(priv->clk);
 
 	priv->params = of_device_get_match_data(&pdev->dev);
+	imx_ocotp_nvmem_config.add_legacy_fixed_of_cells = true;
 	imx_ocotp_nvmem_config.size = 4 * priv->params->nregs;
 	imx_ocotp_nvmem_config.dev = dev;
 	imx_ocotp_nvmem_config.priv = priv;
diff --git a/drivers/nvmem/meson-efuse.c b/drivers/nvmem/meson-efuse.c
index d6b533497ce1..b922df99f9bc 100644
--- a/drivers/nvmem/meson-efuse.c
+++ b/drivers/nvmem/meson-efuse.c
@@ -93,6 +93,7 @@ static int meson_efuse_probe(struct platform_device *pdev)
 
 	econfig->dev = dev;
 	econfig->name = dev_name(dev);
+	econfig->add_legacy_fixed_of_cells = true;
 	econfig->stride = 1;
 	econfig->word_size = 1;
 	econfig->reg_read = meson_efuse_read;
diff --git a/drivers/nvmem/meson-mx-efuse.c b/drivers/nvmem/meson-mx-efuse.c
index d6d7aeda31f9..3ff04d5ca8f8 100644
--- a/drivers/nvmem/meson-mx-efuse.c
+++ b/drivers/nvmem/meson-mx-efuse.c
@@ -210,6 +210,7 @@ static int meson_mx_efuse_probe(struct platform_device *pdev)
 	efuse->config.owner = THIS_MODULE;
 	efuse->config.dev = &pdev->dev;
 	efuse->config.priv = efuse;
+	efuse->config.add_legacy_fixed_of_cells = true;
 	efuse->config.stride = drvdata->word_size;
 	efuse->config.word_size = drvdata->word_size;
 	efuse->config.size = SZ_512;
diff --git a/drivers/nvmem/microchip-otpc.c b/drivers/nvmem/microchip-otpc.c
index 436e0dc4f337..7cf81738a3e0 100644
--- a/drivers/nvmem/microchip-otpc.c
+++ b/drivers/nvmem/microchip-otpc.c
@@ -261,6 +261,7 @@ static int mchp_otpc_probe(struct platform_device *pdev)
 		return ret;
 
 	mchp_nvmem_config.dev = otpc->dev;
+	mchp_nvmem_config.add_legacy_fixed_of_cells = true;
 	mchp_nvmem_config.size = size;
 	mchp_nvmem_config.priv = otpc;
 	nvmem = devm_nvmem_register(&pdev->dev, &mchp_nvmem_config);
diff --git a/drivers/nvmem/mtk-efuse.c b/drivers/nvmem/mtk-efuse.c
index b36cd0dcc8c7..87c94686cfd2 100644
--- a/drivers/nvmem/mtk-efuse.c
+++ b/drivers/nvmem/mtk-efuse.c
@@ -83,6 +83,7 @@ static int mtk_efuse_probe(struct platform_device *pdev)
 		return PTR_ERR(priv->base);
 
 	pdata = device_get_match_data(dev);
+	econfig.add_legacy_fixed_of_cells = true;
 	econfig.stride = 1;
 	econfig.word_size = 1;
 	econfig.reg_read = mtk_reg_read;
diff --git a/drivers/nvmem/qcom-spmi-sdam.c b/drivers/nvmem/qcom-spmi-sdam.c
index 70f2d4f2efbf..9aa8f42faa4c 100644
--- a/drivers/nvmem/qcom-spmi-sdam.c
+++ b/drivers/nvmem/qcom-spmi-sdam.c
@@ -142,6 +142,7 @@ static int sdam_probe(struct platform_device *pdev)
 	sdam->sdam_config.name = "spmi_sdam";
 	sdam->sdam_config.id = NVMEM_DEVID_AUTO;
 	sdam->sdam_config.owner = THIS_MODULE;
+	sdam->sdam_config.add_legacy_fixed_of_cells = true;
 	sdam->sdam_config.stride = 1;
 	sdam->sdam_config.word_size = 1;
 	sdam->sdam_config.reg_read = sdam_read;
diff --git a/drivers/nvmem/qfprom.c b/drivers/nvmem/qfprom.c
index 525be03b7bba..116a39e804c7 100644
--- a/drivers/nvmem/qfprom.c
+++ b/drivers/nvmem/qfprom.c
@@ -357,6 +357,7 @@ static int qfprom_probe(struct platform_device *pdev)
 {
 	struct nvmem_config econfig = {
 		.name = "qfprom",
+		.add_legacy_fixed_of_cells = true,
 		.stride = 1,
 		.word_size = 1,
 		.id = NVMEM_DEVID_AUTO,
diff --git a/drivers/nvmem/rave-sp-eeprom.c b/drivers/nvmem/rave-sp-eeprom.c
index df6a1c594b78..9ecf3873cbb7 100644
--- a/drivers/nvmem/rave-sp-eeprom.c
+++ b/drivers/nvmem/rave-sp-eeprom.c
@@ -328,6 +328,7 @@ static int rave_sp_eeprom_probe(struct platform_device *pdev)
 	of_property_read_string(np, "zii,eeprom-name", &config.name);
 	config.priv		= eeprom;
 	config.dev		= dev;
+	config.add_legacy_fixed_of_cells	= true;
 	config.size		= size;
 	config.reg_read		= rave_sp_eeprom_reg_read;
 	config.reg_write	= rave_sp_eeprom_reg_write;
diff --git a/drivers/nvmem/rockchip-efuse.c b/drivers/nvmem/rockchip-efuse.c
index 4004c5bece42..2b40978ddb18 100644
--- a/drivers/nvmem/rockchip-efuse.c
+++ b/drivers/nvmem/rockchip-efuse.c
@@ -205,6 +205,7 @@ static int rockchip_rk3399_efuse_read(void *context, unsigned int offset,
 
 static struct nvmem_config econfig = {
 	.name = "rockchip-efuse",
+	.add_legacy_fixed_of_cells = true,
 	.stride = 1,
 	.word_size = 1,
 	.read_only = true,
diff --git a/drivers/nvmem/sc27xx-efuse.c b/drivers/nvmem/sc27xx-efuse.c
index 2210da40dfbd..bff27011f4ff 100644
--- a/drivers/nvmem/sc27xx-efuse.c
+++ b/drivers/nvmem/sc27xx-efuse.c
@@ -247,6 +247,7 @@ static int sc27xx_efuse_probe(struct platform_device *pdev)
 	econfig.reg_read = sc27xx_efuse_read;
 	econfig.priv = efuse;
 	econfig.dev = &pdev->dev;
+	econfig.add_legacy_fixed_of_cells = true;
 	nvmem = devm_nvmem_register(&pdev->dev, &econfig);
 	if (IS_ERR(nvmem)) {
 		dev_err(&pdev->dev, "failed to register nvmem config\n");
diff --git a/drivers/nvmem/sec-qfprom.c b/drivers/nvmem/sec-qfprom.c
index e48c2dc0c44b..19799b3fe00a 100644
--- a/drivers/nvmem/sec-qfprom.c
+++ b/drivers/nvmem/sec-qfprom.c
@@ -47,6 +47,7 @@ static int sec_qfprom_probe(struct platform_device *pdev)
 {
 	struct nvmem_config econfig = {
 		.name = "sec-qfprom",
+		.add_legacy_fixed_of_cells = true,
 		.stride = 1,
 		.word_size = 1,
 		.id = NVMEM_DEVID_AUTO,
diff --git a/drivers/nvmem/sprd-efuse.c b/drivers/nvmem/sprd-efuse.c
index 7e6e31db4baa..bb3105f3291f 100644
--- a/drivers/nvmem/sprd-efuse.c
+++ b/drivers/nvmem/sprd-efuse.c
@@ -408,6 +408,7 @@ static int sprd_efuse_probe(struct platform_device *pdev)
 	econfig.read_only = false;
 	econfig.name = "sprd-efuse";
 	econfig.size = efuse->data->blk_nums * SPRD_EFUSE_BLOCK_WIDTH;
+	econfig.add_legacy_fixed_of_cells = true;
 	econfig.reg_read = sprd_efuse_read;
 	econfig.reg_write = sprd_efuse_write;
 	econfig.priv = efuse;
diff --git a/drivers/nvmem/stm32-romem.c b/drivers/nvmem/stm32-romem.c
index 0f84044bd1ad..1541c20709d2 100644
--- a/drivers/nvmem/stm32-romem.c
+++ b/drivers/nvmem/stm32-romem.c
@@ -207,6 +207,7 @@ static int stm32_romem_probe(struct platform_device *pdev)
 	priv->cfg.priv = priv;
 	priv->cfg.owner = THIS_MODULE;
 	priv->cfg.type = NVMEM_TYPE_OTP;
+	priv->cfg.add_legacy_fixed_of_cells = true;
 
 	priv->lower = 0;
 
diff --git a/drivers/nvmem/sunplus-ocotp.c b/drivers/nvmem/sunplus-ocotp.c
index f3a18aa0a6c7..38f5d9df39cd 100644
--- a/drivers/nvmem/sunplus-ocotp.c
+++ b/drivers/nvmem/sunplus-ocotp.c
@@ -145,6 +145,7 @@ disable_clk:
 
 static struct nvmem_config sp_ocotp_nvmem_config = {
 	.name = "sp-ocotp",
+	.add_legacy_fixed_of_cells = true,
 	.read_only = true,
 	.word_size = 1,
 	.size = QAC628_OTP_SIZE,
diff --git a/drivers/nvmem/sunxi_sid.c b/drivers/nvmem/sunxi_sid.c
index 5d364d85347f..ba14a76208ab 100644
--- a/drivers/nvmem/sunxi_sid.c
+++ b/drivers/nvmem/sunxi_sid.c
@@ -153,6 +153,7 @@ static int sunxi_sid_probe(struct platform_device *pdev)
 	nvmem_cfg->dev = dev;
 	nvmem_cfg->name = "sunxi-sid";
 	nvmem_cfg->type = NVMEM_TYPE_OTP;
+	nvmem_cfg->add_legacy_fixed_of_cells = true;
 	nvmem_cfg->read_only = true;
 	nvmem_cfg->size = cfg->size;
 	nvmem_cfg->word_size = 1;
diff --git a/drivers/nvmem/uniphier-efuse.c b/drivers/nvmem/uniphier-efuse.c
index 0a1dbb80537e..6ad3295d3195 100644
--- a/drivers/nvmem/uniphier-efuse.c
+++ b/drivers/nvmem/uniphier-efuse.c
@@ -52,6 +52,7 @@ static int uniphier_efuse_probe(struct platform_device *pdev)
 	econfig.size = resource_size(res);
 	econfig.priv = priv;
 	econfig.dev = dev;
+	econfig.add_legacy_fixed_of_cells = true;
 	nvmem = devm_nvmem_register(dev, &econfig);
 
 	return PTR_ERR_OR_ZERO(nvmem);
diff --git a/drivers/nvmem/zynqmp_nvmem.c b/drivers/nvmem/zynqmp_nvmem.c
index f49bb9a26d05..7f15aa89a9d0 100644
--- a/drivers/nvmem/zynqmp_nvmem.c
+++ b/drivers/nvmem/zynqmp_nvmem.c
@@ -58,6 +58,7 @@ static int zynqmp_nvmem_probe(struct platform_device *pdev)
 
 	priv->dev = dev;
 	econfig.dev = dev;
+	econfig.add_legacy_fixed_of_cells = true;
 	econfig.reg_read = zynqmp_nvmem_read;
 	econfig.priv = priv;
 
diff --git a/drivers/rtc/nvmem.c b/drivers/rtc/nvmem.c
index 07ede21cee34..37df7e80525b 100644
--- a/drivers/rtc/nvmem.c
+++ b/drivers/rtc/nvmem.c
@@ -21,6 +21,7 @@ int devm_rtc_nvmem_register(struct rtc_device *rtc,
 
 	nvmem_config->dev = dev;
 	nvmem_config->owner = rtc->owner;
+	nvmem_config->add_legacy_fixed_of_cells = true;
 	nvmem = devm_nvmem_register(dev, nvmem_config);
 	if (IS_ERR(nvmem))
 		dev_err(dev, "failed to register nvmem device for RTC\n");
diff --git a/drivers/w1/slaves/w1_ds250x.c b/drivers/w1/slaves/w1_ds250x.c
index 7592c7050d1d..cb426f7dd23d 100644
--- a/drivers/w1/slaves/w1_ds250x.c
+++ b/drivers/w1/slaves/w1_ds250x.c
@@ -168,6 +168,7 @@ static int w1_eprom_add_slave(struct w1_slave *sl)
 	struct nvmem_device *nvmem;
 	struct nvmem_config nvmem_cfg = {
 		.dev = &sl->dev,
+		.add_legacy_fixed_of_cells = true,
 		.reg_read = w1_nvmem_read,
 		.type = NVMEM_TYPE_OTP,
 		.read_only = true,
diff --git a/include/linux/nvmem-provider.h b/include/linux/nvmem-provider.h
index dae26295e6be..1b81adebdb8b 100644
--- a/include/linux/nvmem-provider.h
+++ b/include/linux/nvmem-provider.h
@@ -82,6 +82,7 @@ struct nvmem_cell_info {
  * @owner:	Pointer to exporter module. Used for refcounting.
  * @cells:	Optional array of pre-defined NVMEM cells.
  * @ncells:	Number of elements in cells.
+ * @add_legacy_fixed_of_cells:	Read fixed NVMEM cells from old OF syntax.
  * @keepout:	Optional array of keepout ranges (sorted ascending by start).
  * @nkeepout:	Number of elements in the keepout array.
  * @type:	Type of the nvmem storage
@@ -112,6 +113,7 @@ struct nvmem_config {
 	struct module		*owner;
 	const struct nvmem_cell_info	*cells;
 	int			ncells;
+	bool			add_legacy_fixed_of_cells;
 	const struct nvmem_keepout *keepout;
 	unsigned int		nkeepout;
 	enum nvmem_type		type;
-- 
cgit v1.2.3


From 85e95ca7cc48c23f772387b069d794f69116192b Mon Sep 17 00:00:00 2001
From: Kent Overstreet <kent.overstreet@gmail.com>
Date: Sat, 13 Nov 2021 19:49:14 -0500
Subject: bcachefs: Update export_operations for snapshots

When support for snapshots was merged, export operations weren't
updated yet. This patch adds new filehandle types for bcachefs that
include the subvolume ID and updates export operations for subvolumes -
and also .get_parent, support for which was added just prior to
snapshots.

Signed-off-by: Kent Overstreet <kent.overstreet@gmail.com>
---
 fs/bcachefs/dirent.c     |   4 +-
 fs/bcachefs/dirent.h     |   3 +
 fs/bcachefs/fs.c         | 230 ++++++++++++++++++++++++++++++++++++++++++-----
 include/linux/exportfs.h |   6 ++
 4 files changed, 218 insertions(+), 25 deletions(-)

(limited to 'include/linux')

diff --git a/fs/bcachefs/dirent.c b/fs/bcachefs/dirent.c
index 5db1426faaf3..4dfcc955675b 100644
--- a/fs/bcachefs/dirent.c
+++ b/fs/bcachefs/dirent.c
@@ -197,8 +197,8 @@ static void dirent_copy_target(struct bkey_i_dirent *dst,
 	dst->v.d_type = src.v->d_type;
 }
 
-static int bch2_dirent_read_target(struct btree_trans *trans, subvol_inum dir,
-				   struct bkey_s_c_dirent d, subvol_inum *target)
+int bch2_dirent_read_target(struct btree_trans *trans, subvol_inum dir,
+			    struct bkey_s_c_dirent d, subvol_inum *target)
 {
 	struct bch_subvolume s;
 	int ret = 0;
diff --git a/fs/bcachefs/dirent.h b/fs/bcachefs/dirent.h
index 8ae407765fe4..1bb4d802bc1d 100644
--- a/fs/bcachefs/dirent.h
+++ b/fs/bcachefs/dirent.h
@@ -29,6 +29,9 @@ static inline unsigned dirent_val_u64s(unsigned len)
 			    sizeof(u64));
 }
 
+int bch2_dirent_read_target(struct btree_trans *, subvol_inum,
+			    struct bkey_s_c_dirent, subvol_inum *);
+
 int bch2_dirent_create(struct btree_trans *, subvol_inum,
 		       const struct bch_hash_info *, u8,
 		       const struct qstr *, u64, u64 *, int);
diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c
index 4561c60c95e3..61027d349cd8 100644
--- a/fs/bcachefs/fs.c
+++ b/fs/bcachefs/fs.c
@@ -1124,46 +1124,230 @@ static const struct address_space_operations bch_address_space_operations = {
 	.error_remove_page = generic_error_remove_page,
 };
 
-#if 0
-static struct inode *bch2_nfs_get_inode(struct super_block *sb,
-		u64 ino, u32 generation)
+struct bcachefs_fid {
+	u64		inum;
+	u32		subvol;
+	u32		gen;
+} __packed;
+
+struct bcachefs_fid_with_parent {
+	struct bcachefs_fid	fid;
+	struct bcachefs_fid	dir;
+} __packed;
+
+static int bcachefs_fid_valid(int fh_len, int fh_type)
 {
-	struct bch_fs *c = sb->s_fs_info;
-	struct inode *vinode;
+	switch (fh_type) {
+	case FILEID_BCACHEFS_WITHOUT_PARENT:
+		return fh_len == sizeof(struct bcachefs_fid) / sizeof(u32);
+	case FILEID_BCACHEFS_WITH_PARENT:
+		return fh_len == sizeof(struct bcachefs_fid_with_parent) / sizeof(u32);
+	default:
+		return false;
+	}
+}
+
+static struct bcachefs_fid bch2_inode_to_fid(struct bch_inode_info *inode)
+{
+	return (struct bcachefs_fid) {
+		.inum	= inode->ei_inode.bi_inum,
+		.subvol	= inode->ei_subvol,
+		.gen	= inode->ei_inode.bi_generation,
+	};
+}
+
+static int bch2_encode_fh(struct inode *vinode, u32 *fh, int *len,
+			  struct inode *vdir)
+{
+	struct bch_inode_info *inode	= to_bch_ei(vinode);
+	struct bch_inode_info *dir	= to_bch_ei(vdir);
+
+	if (*len < sizeof(struct bcachefs_fid_with_parent) / sizeof(u32))
+		return FILEID_INVALID;
+
+	if (!S_ISDIR(inode->v.i_mode) && dir) {
+		struct bcachefs_fid_with_parent *fid = (void *) fh;
+
+		fid->fid = bch2_inode_to_fid(inode);
+		fid->dir = bch2_inode_to_fid(dir);
+
+		*len = sizeof(*fid) / sizeof(u32);
+		return FILEID_BCACHEFS_WITH_PARENT;
+	} else {
+		struct bcachefs_fid *fid = (void *) fh;
 
-	if (ino < BCACHEFS_ROOT_INO)
-		return ERR_PTR(-ESTALE);
+		*fid = bch2_inode_to_fid(inode);
 
-	vinode = bch2_vfs_inode_get(c, ino);
-	if (IS_ERR(vinode))
-		return ERR_CAST(vinode);
-	if (generation && vinode->i_generation != generation) {
-		/* we didn't find the right inode.. */
+		*len = sizeof(*fid) / sizeof(u32);
+		return FILEID_BCACHEFS_WITHOUT_PARENT;
+	}
+}
+
+static struct inode *bch2_nfs_get_inode(struct super_block *sb,
+					struct bcachefs_fid fid)
+{
+	struct bch_fs *c = sb->s_fs_info;
+	struct inode *vinode = bch2_vfs_inode_get(c, (subvol_inum) {
+				    .subvol = fid.subvol,
+				    .inum = fid.inum,
+	});
+	if (!IS_ERR(vinode) && vinode->i_generation != fid.gen) {
 		iput(vinode);
-		return ERR_PTR(-ESTALE);
+		vinode = ERR_PTR(-ESTALE);
 	}
 	return vinode;
 }
 
-static struct dentry *bch2_fh_to_dentry(struct super_block *sb, struct fid *fid,
+static struct dentry *bch2_fh_to_dentry(struct super_block *sb, struct fid *_fid,
 		int fh_len, int fh_type)
 {
-	return generic_fh_to_dentry(sb, fid, fh_len, fh_type,
-				    bch2_nfs_get_inode);
+	struct bcachefs_fid *fid = (void *) _fid;
+
+	if (!bcachefs_fid_valid(fh_len, fh_type))
+		return NULL;
+
+	return d_obtain_alias(bch2_nfs_get_inode(sb, *fid));
 }
 
-static struct dentry *bch2_fh_to_parent(struct super_block *sb, struct fid *fid,
+static struct dentry *bch2_fh_to_parent(struct super_block *sb, struct fid *_fid,
 		int fh_len, int fh_type)
 {
-	return generic_fh_to_parent(sb, fid, fh_len, fh_type,
-				    bch2_nfs_get_inode);
+	struct bcachefs_fid_with_parent *fid = (void *) _fid;
+
+	if (!bcachefs_fid_valid(fh_len, fh_type) ||
+	    fh_type != FILEID_BCACHEFS_WITH_PARENT)
+		return NULL;
+
+	return d_obtain_alias(bch2_nfs_get_inode(sb, fid->dir));
+}
+
+static struct dentry *bch2_get_parent(struct dentry *child)
+{
+	struct bch_inode_info *inode = to_bch_ei(child->d_inode);
+	struct bch_fs *c = inode->v.i_sb->s_fs_info;
+	subvol_inum parent_inum = {
+		.subvol = inode->ei_inode.bi_parent_subvol ?:
+			inode->ei_subvol,
+		.inum = inode->ei_inode.bi_dir,
+	};
+
+	if (!parent_inum.inum)
+		return NULL;
+
+	return d_obtain_alias(bch2_vfs_inode_get(c, parent_inum));
+}
+
+static int bch2_get_name(struct dentry *parent, char *name, struct dentry *child)
+{
+	struct bch_inode_info *inode	= to_bch_ei(child->d_inode);
+	struct bch_inode_info *dir	= to_bch_ei(parent->d_inode);
+	struct bch_fs *c = inode->v.i_sb->s_fs_info;
+	struct btree_trans trans;
+	struct btree_iter iter1;
+	struct btree_iter iter2;
+	struct bkey_s_c k;
+	struct bkey_s_c_dirent d;
+	struct bch_inode_unpacked inode_u;
+	subvol_inum target;
+	u32 snapshot;
+	unsigned name_len;
+	int ret;
+
+	if (!S_ISDIR(dir->v.i_mode))
+		return -EINVAL;
+
+	bch2_trans_init(&trans, c, 0, 0);
+
+	bch2_trans_iter_init(&trans, &iter1, BTREE_ID_dirents,
+			     POS(dir->ei_inode.bi_inum, 0), 0);
+	bch2_trans_iter_init(&trans, &iter2, BTREE_ID_dirents,
+			     POS(dir->ei_inode.bi_inum, 0), 0);
+retry:
+	bch2_trans_begin(&trans);
+
+	ret = bch2_subvolume_get_snapshot(&trans, dir->ei_subvol, &snapshot);
+	if (ret)
+		goto err;
+
+	bch2_btree_iter_set_snapshot(&iter1, snapshot);
+	bch2_btree_iter_set_snapshot(&iter2, snapshot);
+
+	ret = bch2_inode_find_by_inum_trans(&trans, inode_inum(inode), &inode_u);
+	if (ret)
+		goto err;
+
+	if (inode_u.bi_dir == dir->ei_inode.bi_inum) {
+		bch2_btree_iter_set_pos(&iter1, POS(inode_u.bi_dir, inode_u.bi_dir_offset));
+
+		k = bch2_btree_iter_peek_slot(&iter1);
+		ret = bkey_err(k);
+		if (ret)
+			goto err;
+
+		if (k.k->type != KEY_TYPE_dirent) {
+			ret = -ENOENT;
+			goto err;
+		}
+
+		d = bkey_s_c_to_dirent(k);
+		ret = bch2_dirent_read_target(&trans, inode_inum(dir), d, &target);
+		if (ret > 0)
+			ret = -ENOENT;
+		if (ret)
+			goto err;
+
+		if (target.subvol	== inode->ei_subvol &&
+		    target.inum		== inode->ei_inode.bi_inum)
+			goto found;
+	} else {
+		/*
+		 * File with multiple hardlinks and our backref is to the wrong
+		 * directory - linear search:
+		 */
+		for_each_btree_key_continue_norestart(iter2, 0, k, ret) {
+			if (k.k->p.inode > dir->ei_inode.bi_inum)
+				break;
+
+			if (k.k->type != KEY_TYPE_dirent)
+				continue;
+
+			d = bkey_s_c_to_dirent(k);
+			ret = bch2_dirent_read_target(&trans, inode_inum(dir), d, &target);
+			if (ret < 0)
+				break;
+			if (ret)
+				continue;
+
+			if (target.subvol	== inode->ei_subvol &&
+			    target.inum		== inode->ei_inode.bi_inum)
+				goto found;
+		}
+	}
+
+	ret = -ENOENT;
+	goto err;
+found:
+	name_len = min_t(unsigned, bch2_dirent_name_bytes(d), NAME_MAX);
+
+	memcpy(name, d.v->d_name, name_len);
+	name[name_len] = '\0';
+err:
+	if (ret == -EINTR)
+		goto retry;
+
+	bch2_trans_iter_exit(&trans, &iter1);
+	bch2_trans_iter_exit(&trans, &iter2);
+	bch2_trans_exit(&trans);
+
+	return ret;
 }
-#endif
 
 static const struct export_operations bch_export_ops = {
-	//.fh_to_dentry	= bch2_fh_to_dentry,
-	//.fh_to_parent	= bch2_fh_to_parent,
-	//.get_parent	= bch2_get_parent,
+	.encode_fh	= bch2_encode_fh,
+	.fh_to_dentry	= bch2_fh_to_dentry,
+	.fh_to_parent	= bch2_fh_to_parent,
+	.get_parent	= bch2_get_parent,
+	.get_name	= bch2_get_name,
 };
 
 static void bch2_vfs_inode_init(struct btree_trans *trans, subvol_inum inum,
diff --git a/include/linux/exportfs.h b/include/linux/exportfs.h
index 11fbd0ee1370..f49a7d31167e 100644
--- a/include/linux/exportfs.h
+++ b/include/linux/exportfs.h
@@ -98,6 +98,12 @@ enum fid_type {
 	 */
 	FILEID_FAT_WITH_PARENT = 0x72,
 
+	/*
+	 * 64 bit inode number, 32 bit subvolume, 32 bit generation number:
+	 */
+	FILEID_BCACHEFS_WITHOUT_PARENT = 0x80,
+	FILEID_BCACHEFS_WITH_PARENT = 0x81,
+
 	/*
 	 * 128 bit child FID (struct lu_fid)
 	 * 128 bit parent FID (struct lu_fid)
-- 
cgit v1.2.3


From 3d44de9a10ea2b1658dfaed8ea6d3d7b6e0defbb Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Fri, 20 Oct 2023 12:57:45 +0000
Subject: tcp: add RTAX_FEATURE_TCP_USEC_TS

This new dst feature flag will be used to allow TCP to use usec
based timestamps instead of msec ones.

ip route .... feature tcp_usec_ts

Also document that RTAX_FEATURE_SACK and RTAX_FEATURE_TIMESTAMP
are unused.

RTAX_FEATURE_ALLFRAG is also going away soon.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/tcp.h            |  5 +++++
 include/uapi/linux/rtnetlink.h | 18 +++++++++++-------
 2 files changed, 16 insertions(+), 7 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/tcp.h b/include/linux/tcp.h
index e15452df9804..04a0e647ef74 100644
--- a/include/linux/tcp.h
+++ b/include/linux/tcp.h
@@ -576,4 +576,9 @@ void tcp_sock_set_quickack(struct sock *sk, int val);
 int tcp_sock_set_syncnt(struct sock *sk, int val);
 int tcp_sock_set_user_timeout(struct sock *sk, int val);
 
+static inline bool dst_tcp_usec_ts(const struct dst_entry *dst)
+{
+	return dst_feature(dst, RTAX_FEATURE_TCP_USEC_TS);
+}
+
 #endif	/* _LINUX_TCP_H */
diff --git a/include/uapi/linux/rtnetlink.h b/include/uapi/linux/rtnetlink.h
index 51c13cf9c5ae..aa2482a0614a 100644
--- a/include/uapi/linux/rtnetlink.h
+++ b/include/uapi/linux/rtnetlink.h
@@ -502,13 +502,17 @@ enum {
 
 #define RTAX_MAX (__RTAX_MAX - 1)
 
-#define RTAX_FEATURE_ECN	(1 << 0)
-#define RTAX_FEATURE_SACK	(1 << 1)
-#define RTAX_FEATURE_TIMESTAMP	(1 << 2)
-#define RTAX_FEATURE_ALLFRAG	(1 << 3)
-
-#define RTAX_FEATURE_MASK	(RTAX_FEATURE_ECN | RTAX_FEATURE_SACK | \
-				 RTAX_FEATURE_TIMESTAMP | RTAX_FEATURE_ALLFRAG)
+#define RTAX_FEATURE_ECN		(1 << 0)
+#define RTAX_FEATURE_SACK		(1 << 1) /* unused */
+#define RTAX_FEATURE_TIMESTAMP		(1 << 2) /* unused */
+#define RTAX_FEATURE_ALLFRAG		(1 << 3)
+#define RTAX_FEATURE_TCP_USEC_TS	(1 << 4)
+
+#define RTAX_FEATURE_MASK	(RTAX_FEATURE_ECN |		\
+				 RTAX_FEATURE_SACK |		\
+				 RTAX_FEATURE_TIMESTAMP |	\
+				 RTAX_FEATURE_ALLFRAG |		\
+				 RTAX_FEATURE_TCP_USEC_TS)
 
 struct rta_session {
 	__u8	proto;
-- 
cgit v1.2.3


From 614e8316aa4cafba3e204cb8ee48bd12b92f3d93 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Fri, 20 Oct 2023 12:57:47 +0000
Subject: tcp: add support for usec resolution in TCP TS values

Back in 2015, Van Jacobson suggested to use usec resolution in TCP TS values.
This has been implemented in our private kernels.

Goals were :

1) better observability of delays in networking stacks.
2) better disambiguation of events based on TSval/ecr values.
3) building block for congestion control modules needing usec resolution.

Back then we implemented a schem based on private SYN options
to negotiate the feature.

For upstream submission, we chose to use a route attribute,
because this feature is probably going to be used in private
networks [1] [2].

ip route add 10/8 ... features tcp_usec_ts

Note that RFC 7323 recommends a
  "timestamp clock frequency in the range 1 ms to 1 sec per tick.",
but also mentions
  "the maximum acceptable clock frequency is one tick every 59 ns."

[1] Unfortunately RFC 7323 5.5 (Outdated Timestamps) suggests
to invalidate TS.Recent values after a flow was idle for more
than 24 days. This is the part making usec_ts a problem
for peers following this recommendation for long living
idle flows.

[2] Attempts to standardize usec ts went nowhere:

https://www.ietf.org/proceedings/97/slides/slides-97-tcpm-tcp-options-for-low-latency-00.pdf
https://datatracker.ietf.org/doc/draft-wang-tcpm-low-latency-opt/

Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/tcp.h              |  4 +++-
 include/net/inet_timewait_sock.h |  3 ++-
 include/net/tcp.h                |  6 ++++--
 net/ipv4/syncookies.c            |  6 +++++-
 net/ipv4/tcp.c                   | 18 ++++++++++++++----
 net/ipv4/tcp_input.c             |  5 ++++-
 net/ipv4/tcp_ipv4.c              |  1 +
 net/ipv4/tcp_minisocks.c         | 19 ++++++++++++++-----
 net/ipv4/tcp_output.c            | 12 ++++++++----
 net/ipv4/tcp_timer.c             | 40 ++++++++++++++++++++++++++--------------
 net/ipv6/tcp_ipv6.c              |  1 +
 11 files changed, 82 insertions(+), 33 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/tcp.h b/include/linux/tcp.h
index 04a0e647ef74..6df715b6e51d 100644
--- a/include/linux/tcp.h
+++ b/include/linux/tcp.h
@@ -152,6 +152,7 @@ struct tcp_request_sock {
 	u64				snt_synack; /* first SYNACK sent time */
 	bool				tfo_listener;
 	bool				is_mptcp;
+	s8				req_usec_ts;
 #if IS_ENABLED(CONFIG_MPTCP)
 	bool				drop_req;
 #endif
@@ -257,7 +258,8 @@ struct tcp_sock {
 	u8	compressed_ack;
 	u8	dup_ack_counter:2,
 		tlp_retrans:1,	/* TLP is a retransmission */
-		unused:5;
+		tcp_usec_ts:1, /* TSval values in usec */
+		unused:4;
 	u32	chrono_start;	/* Start time in jiffies of a TCP chrono */
 	u32	chrono_stat[3];	/* Time in jiffies for chrono_stat stats */
 	u8	chrono_type:2,	/* current chronograph type */
diff --git a/include/net/inet_timewait_sock.h b/include/net/inet_timewait_sock.h
index 4a8e578405cb..b14999ff55db 100644
--- a/include/net/inet_timewait_sock.h
+++ b/include/net/inet_timewait_sock.h
@@ -67,7 +67,8 @@ struct inet_timewait_sock {
 	/* And these are ours. */
 	unsigned int		tw_transparent  : 1,
 				tw_flowlabel	: 20,
-				tw_pad		: 3,	/* 3 bits hole */
+				tw_usec_ts	: 1,
+				tw_pad		: 2,	/* 2 bits hole */
 				tw_tos		: 8;
 	u32			tw_txhash;
 	u32			tw_priority;
diff --git a/include/net/tcp.h b/include/net/tcp.h
index 0ab577869d7a..39b731c900dd 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -825,6 +825,8 @@ static inline u32 tcp_time_stamp_ms(const struct tcp_sock *tp)
 
 static inline u32 tcp_time_stamp_ts(const struct tcp_sock *tp)
 {
+	if (tp->tcp_usec_ts)
+		return tp->tcp_mstamp;
 	return tcp_time_stamp_ms(tp);
 }
 
@@ -852,12 +854,12 @@ static inline u32 tcp_skb_timestamp_ts(bool usec_ts, const struct sk_buff *skb)
 
 static inline u32 tcp_tw_tsval(const struct tcp_timewait_sock *tcptw)
 {
-	return tcp_clock_ts(false) + tcptw->tw_ts_offset;
+	return tcp_clock_ts(tcptw->tw_sk.tw_usec_ts) + tcptw->tw_ts_offset;
 }
 
 static inline u32 tcp_rsk_tsval(const struct tcp_request_sock *treq)
 {
-	return tcp_clock_ts(false) + treq->ts_off;
+	return tcp_clock_ts(treq->req_usec_ts) + treq->ts_off;
 }
 
 #define tcp_flag_byte(th) (((u_int8_t *)th)[13])
diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c
index 62395fdb0ca5..c64334363230 100644
--- a/net/ipv4/syncookies.c
+++ b/net/ipv4/syncookies.c
@@ -84,7 +84,9 @@ u64 cookie_init_timestamp(struct request_sock *req, u64 now)
 	if (ts > ts_now)
 		ts -= (1UL << TSBITS);
 
-	return ts * (NSEC_PER_SEC / TCP_TS_HZ);
+	if (tcp_rsk(req)->req_usec_ts)
+		return ts * NSEC_PER_USEC;
+	return ts * NSEC_PER_MSEC;
 }
 
 
@@ -304,6 +306,8 @@ struct request_sock *cookie_tcp_reqsk_alloc(const struct request_sock_ops *ops,
 	treq->af_specific = af_ops;
 
 	treq->syn_tos = TCP_SKB_CB(skb)->ip_dsfield;
+	treq->req_usec_ts = -1;
+
 #if IS_ENABLED(CONFIG_MPTCP)
 	treq->is_mptcp = sk_is_mptcp(sk);
 	if (treq->is_mptcp) {
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 805f8341064f..b961364b4961 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -3629,10 +3629,16 @@ int do_tcp_setsockopt(struct sock *sk, int level, int optname,
 			tp->fastopen_no_cookie = val;
 		break;
 	case TCP_TIMESTAMP:
-		if (!tp->repair)
+		if (!tp->repair) {
 			err = -EPERM;
-		else
-			WRITE_ONCE(tp->tsoffset, val - tcp_clock_ts(false));
+			break;
+		}
+		/* val is an opaque field,
+		 * and low order bit contains usec_ts enable bit.
+		 * Its a best effort, and we do not care if user makes an error.
+		 */
+		tp->tcp_usec_ts = val & 1;
+		WRITE_ONCE(tp->tsoffset, val - tcp_clock_ts(tp->tcp_usec_ts));
 		break;
 	case TCP_REPAIR_WINDOW:
 		err = tcp_repair_set_window(tp, optval, optlen);
@@ -4143,7 +4149,11 @@ int do_tcp_getsockopt(struct sock *sk, int level,
 		break;
 
 	case TCP_TIMESTAMP:
-		val = tcp_clock_ts(false) + READ_ONCE(tp->tsoffset);
+		val = tcp_clock_ts(tp->tcp_usec_ts) + READ_ONCE(tp->tsoffset);
+		if (tp->tcp_usec_ts)
+			val |= 1;
+		else
+			val &= ~1;
 		break;
 	case TCP_NOTSENT_LOWAT:
 		val = READ_ONCE(tp->notsent_lowat);
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 5666f6137167..18b858597af4 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -698,6 +698,8 @@ static s32 tcp_rtt_tsopt_us(const struct tcp_sock *tp)
 	u32 delta, delta_us;
 
 	delta = tcp_time_stamp_ts(tp) - tp->rx_opt.rcv_tsecr;
+	if (tp->tcp_usec_ts)
+		return delta;
 
 	if (likely(delta < INT_MAX / (USEC_PER_SEC / TCP_TS_HZ))) {
 		if (!delta)
@@ -2452,7 +2454,7 @@ static bool tcp_skb_spurious_retrans(const struct tcp_sock *tp,
 				     const struct sk_buff *skb)
 {
 	return (TCP_SKB_CB(skb)->sacked & TCPCB_RETRANS) &&
-	       tcp_tsopt_ecr_before(tp, tcp_skb_timestamp_ts(false, skb));
+	       tcp_tsopt_ecr_before(tp, tcp_skb_timestamp_ts(tp->tcp_usec_ts, skb));
 }
 
 /* Nothing was retransmitted or returned timestamp is less
@@ -7045,6 +7047,7 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops,
 	req->syncookie = want_cookie;
 	tcp_rsk(req)->af_specific = af_ops;
 	tcp_rsk(req)->ts_off = 0;
+	tcp_rsk(req)->req_usec_ts = -1;
 #if IS_ENABLED(CONFIG_MPTCP)
 	tcp_rsk(req)->is_mptcp = 0;
 #endif
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index cdd65cc594bc..7583d4e34c8c 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -296,6 +296,7 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
 		rt = NULL;
 		goto failure;
 	}
+	tp->tcp_usec_ts = dst_tcp_usec_ts(&rt->dst);
 	/* OK, now commit destination to socket.  */
 	sk->sk_gso_type = SKB_GSO_TCPV4;
 	sk_setup_caps(sk, &rt->dst);
diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
index a9fdba897a28..ace806c5bd0c 100644
--- a/net/ipv4/tcp_minisocks.c
+++ b/net/ipv4/tcp_minisocks.c
@@ -300,6 +300,7 @@ void tcp_time_wait(struct sock *sk, int state, int timeo)
 		tcptw->tw_ts_recent	= tp->rx_opt.ts_recent;
 		tcptw->tw_ts_recent_stamp = tp->rx_opt.ts_recent_stamp;
 		tcptw->tw_ts_offset	= tp->tsoffset;
+		tw->tw_usec_ts		= tp->tcp_usec_ts;
 		tcptw->tw_last_oow_ack_time = 0;
 		tcptw->tw_tx_delay	= tp->tcp_tx_delay;
 		tw->tw_txhash		= sk->sk_txhash;
@@ -554,21 +555,29 @@ struct sock *tcp_create_openreq_child(const struct sock *sk,
 	newtp->max_window = newtp->snd_wnd;
 
 	if (newtp->rx_opt.tstamp_ok) {
+		newtp->tcp_usec_ts = treq->req_usec_ts;
 		newtp->rx_opt.ts_recent = READ_ONCE(req->ts_recent);
 		newtp->rx_opt.ts_recent_stamp = ktime_get_seconds();
 		newtp->tcp_header_len = sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED;
 	} else {
+		newtp->tcp_usec_ts = 0;
 		newtp->rx_opt.ts_recent_stamp = 0;
 		newtp->tcp_header_len = sizeof(struct tcphdr);
 	}
 	if (req->num_timeout) {
-		newtp->undo_marker = treq->snt_isn;
-		newtp->retrans_stamp = div_u64(treq->snt_synack,
-					       USEC_PER_SEC / TCP_TS_HZ);
 		newtp->total_rto = req->num_timeout;
+		newtp->undo_marker = treq->snt_isn;
+		if (newtp->tcp_usec_ts) {
+			newtp->retrans_stamp = treq->snt_synack;
+			newtp->total_rto_time = (u32)(tcp_clock_us() -
+						      newtp->retrans_stamp) / USEC_PER_MSEC;
+		} else {
+			newtp->retrans_stamp = div_u64(treq->snt_synack,
+						       USEC_PER_SEC / TCP_TS_HZ);
+			newtp->total_rto_time = tcp_clock_ms() -
+						newtp->retrans_stamp;
+		}
 		newtp->total_rto_recoveries = 1;
-		newtp->total_rto_time = tcp_clock_ms() -
-					newtp->retrans_stamp;
 	}
 	newtp->tsoffset = treq->ts_off;
 #ifdef CONFIG_TCP_MD5SIG
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index a1fec8be9ac3..2866ccbccde0 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -799,7 +799,7 @@ static unsigned int tcp_syn_options(struct sock *sk, struct sk_buff *skb,
 
 	if (likely(READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_timestamps) && !*md5)) {
 		opts->options |= OPTION_TS;
-		opts->tsval = tcp_skb_timestamp_ts(false, skb) + tp->tsoffset;
+		opts->tsval = tcp_skb_timestamp_ts(tp->tcp_usec_ts, skb) + tp->tsoffset;
 		opts->tsecr = tp->rx_opt.ts_recent;
 		remaining -= TCPOLEN_TSTAMP_ALIGNED;
 	}
@@ -884,7 +884,8 @@ static unsigned int tcp_synack_options(const struct sock *sk,
 	}
 	if (likely(ireq->tstamp_ok)) {
 		opts->options |= OPTION_TS;
-		opts->tsval = tcp_skb_timestamp_ts(false, skb) + tcp_rsk(req)->ts_off;
+		opts->tsval = tcp_skb_timestamp_ts(tcp_rsk(req)->req_usec_ts, skb) +
+			      tcp_rsk(req)->ts_off;
 		opts->tsecr = READ_ONCE(req->ts_recent);
 		remaining -= TCPOLEN_TSTAMP_ALIGNED;
 	}
@@ -943,7 +944,8 @@ static unsigned int tcp_established_options(struct sock *sk, struct sk_buff *skb
 
 	if (likely(tp->rx_opt.tstamp_ok)) {
 		opts->options |= OPTION_TS;
-		opts->tsval = skb ? tcp_skb_timestamp_ts(false, skb) + tp->tsoffset : 0;
+		opts->tsval = skb ? tcp_skb_timestamp_ts(tp->tcp_usec_ts, skb) +
+				tp->tsoffset : 0;
 		opts->tsecr = tp->rx_opt.ts_recent;
 		size += TCPOLEN_TSTAMP_ALIGNED;
 	}
@@ -3379,7 +3381,7 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb, int segs)
 
 	/* Save stamp of the first (attempted) retransmit. */
 	if (!tp->retrans_stamp)
-		tp->retrans_stamp = tcp_skb_timestamp_ts(false, skb);
+		tp->retrans_stamp = tcp_skb_timestamp_ts(tp->tcp_usec_ts, skb);
 
 	if (tp->undo_retrans < 0)
 		tp->undo_retrans = 0;
@@ -3665,6 +3667,8 @@ struct sk_buff *tcp_make_synack(const struct sock *sk, struct dst_entry *dst,
 	mss = tcp_mss_clamp(tp, dst_metric_advmss(dst));
 
 	memset(&opts, 0, sizeof(opts));
+	if (tcp_rsk(req)->req_usec_ts < 0)
+		tcp_rsk(req)->req_usec_ts = dst_tcp_usec_ts(dst);
 	now = tcp_clock_ns();
 #ifdef CONFIG_SYN_COOKIES
 	if (unlikely(synack_type == TCP_SYNACK_COOKIE && ireq->tstamp_ok))
diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c
index bfcf3fe44c72..1f9f6c1c196b 100644
--- a/net/ipv4/tcp_timer.c
+++ b/net/ipv4/tcp_timer.c
@@ -26,14 +26,18 @@
 static u32 tcp_clamp_rto_to_user_timeout(const struct sock *sk)
 {
 	struct inet_connection_sock *icsk = inet_csk(sk);
-	u32 elapsed, start_ts, user_timeout;
+	const struct tcp_sock *tp = tcp_sk(sk);
+	u32 elapsed, user_timeout;
 	s32 remaining;
 
-	start_ts = tcp_sk(sk)->retrans_stamp;
 	user_timeout = READ_ONCE(icsk->icsk_user_timeout);
 	if (!user_timeout)
 		return icsk->icsk_rto;
-	elapsed = tcp_time_stamp_ts(tcp_sk(sk)) - start_ts;
+
+	elapsed = tcp_time_stamp_ts(tp) - tp->retrans_stamp;
+	if (tp->tcp_usec_ts)
+		elapsed /= USEC_PER_MSEC;
+
 	remaining = user_timeout - elapsed;
 	if (remaining <= 0)
 		return 1; /* user timeout has passed; fire ASAP */
@@ -212,12 +216,13 @@ static bool retransmits_timed_out(struct sock *sk,
 				  unsigned int boundary,
 				  unsigned int timeout)
 {
-	unsigned int start_ts;
+	struct tcp_sock *tp = tcp_sk(sk);
+	unsigned int start_ts, delta;
 
 	if (!inet_csk(sk)->icsk_retransmits)
 		return false;
 
-	start_ts = tcp_sk(sk)->retrans_stamp;
+	start_ts = tp->retrans_stamp;
 	if (likely(timeout == 0)) {
 		unsigned int rto_base = TCP_RTO_MIN;
 
@@ -226,7 +231,12 @@ static bool retransmits_timed_out(struct sock *sk,
 		timeout = tcp_model_timeout(sk, boundary, rto_base);
 	}
 
-	return (s32)(tcp_time_stamp_ts(tcp_sk(sk)) - start_ts - timeout) >= 0;
+	if (tp->tcp_usec_ts) {
+		/* delta maybe off up to a jiffy due to timer granularity. */
+		delta = tp->tcp_mstamp - start_ts + jiffies_to_usecs(1);
+		return (s32)(delta - timeout * USEC_PER_MSEC) >= 0;
+	}
+	return (s32)(tcp_time_stamp_ts(tp) - start_ts - timeout) >= 0;
 }
 
 /* A write timeout has occurred. Process the after effects. */
@@ -468,20 +478,18 @@ static void tcp_fastopen_synack_timer(struct sock *sk, struct request_sock *req)
 }
 
 static bool tcp_rtx_probe0_timed_out(const struct sock *sk,
-				     const struct sk_buff *skb)
+				     const struct sk_buff *skb,
+				     u32 rtx_delta)
 {
 	const struct tcp_sock *tp = tcp_sk(sk);
 	const int timeout = TCP_RTO_MAX * 2;
-	u32 rcv_delta, rtx_delta;
+	u32 rcv_delta;
 
 	rcv_delta = inet_csk(sk)->icsk_timeout - tp->rcv_tstamp;
 	if (rcv_delta <= timeout)
 		return false;
 
-	rtx_delta = (u32)msecs_to_jiffies(tcp_time_stamp_ts(tp) -
-			(tp->retrans_stamp ?: tcp_skb_timestamp_ts(false, skb)));
-
-	return rtx_delta > timeout;
+	return msecs_to_jiffies(rtx_delta) > timeout;
 }
 
 /**
@@ -534,7 +542,11 @@ void tcp_retransmit_timer(struct sock *sk)
 		struct inet_sock *inet = inet_sk(sk);
 		u32 rtx_delta;
 
-		rtx_delta = tcp_time_stamp_ts(tp) - (tp->retrans_stamp ?: tcp_skb_timestamp_ts(false, skb));
+		rtx_delta = tcp_time_stamp_ts(tp) - (tp->retrans_stamp ?: 
+				tcp_skb_timestamp_ts(tp->tcp_usec_ts, skb));
+		if (tp->tcp_usec_ts)
+			rtx_delta /= USEC_PER_MSEC;
+
 		if (sk->sk_family == AF_INET) {
 			net_dbg_ratelimited("Probing zero-window on %pI4:%u/%u, seq=%u:%u, recv %ums ago, lasting %ums\n",
 				&inet->inet_daddr, ntohs(inet->inet_dport),
@@ -551,7 +563,7 @@ void tcp_retransmit_timer(struct sock *sk)
 				rtx_delta);
 		}
 #endif
-		if (tcp_rtx_probe0_timed_out(sk, skb)) {
+		if (tcp_rtx_probe0_timed_out(sk, skb, rtx_delta)) {
 			tcp_write_err(sk);
 			goto out;
 		}
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index 1ee6517e9b2f..0c8a14ba104f 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -286,6 +286,7 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr,
 		goto failure;
 	}
 
+	tp->tcp_usec_ts = dst_tcp_usec_ts(dst);
 	tcp_death_row = &sock_net(sk)->ipv4.tcp_death_row;
 
 	if (!saddr) {
-- 
cgit v1.2.3


From 89141f965325de9aac3805cd506df788179b2809 Mon Sep 17 00:00:00 2001
From: Dmitry Antipov <dmantipov@yandex.ru>
Date: Wed, 4 Oct 2023 18:30:29 +0300
Subject: wifi: remove unused argument of ieee80211_get_tdls_action()

Remove unused 'hdr_size' argument of 'ieee80211_get_tdls_action()'
and adjust 'ieee80211_report_used_skb()' accordingly.

Signed-off-by: Dmitry Antipov <dmantipov@yandex.ru>
Link: https://lore.kernel.org/r/20231004153032.206134-1-dmantipov@yandex.ru
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/linux/ieee80211.h | 3 +--
 net/mac80211/status.c     | 5 +----
 2 files changed, 2 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h
index 62b4469c6866..045a776ee547 100644
--- a/include/linux/ieee80211.h
+++ b/include/linux/ieee80211.h
@@ -4490,12 +4490,11 @@ static inline bool ieee80211_check_tim(const struct ieee80211_tim_ie *tim,
 /**
  * ieee80211_get_tdls_action - get tdls packet action (or -1, if not tdls packet)
  * @skb: the skb containing the frame, length will not be checked
- * @hdr_size: the size of the ieee80211_hdr that starts at skb->data
  *
  * This function assumes the frame is a data frame, and that the network header
  * is in the correct place.
  */
-static inline int ieee80211_get_tdls_action(struct sk_buff *skb, u32 hdr_size)
+static inline int ieee80211_get_tdls_action(struct sk_buff *skb)
 {
 	if (!skb_is_nonlinear(skb) &&
 	    skb->len > (skb_network_offset(skb) + 2)) {
diff --git a/net/mac80211/status.c b/net/mac80211/status.c
index f67eafada741..807cdab38d5e 100644
--- a/net/mac80211/status.c
+++ b/net/mac80211/status.c
@@ -731,12 +731,9 @@ static void ieee80211_report_used_skb(struct ieee80211_local *local,
 		if (!sdata) {
 			skb->dev = NULL;
 		} else if (!dropped) {
-			unsigned int hdr_size =
-				ieee80211_hdrlen(hdr->frame_control);
-
 			/* Check to see if packet is a TDLS teardown packet */
 			if (ieee80211_is_data(hdr->frame_control) &&
-			    (ieee80211_get_tdls_action(skb, hdr_size) ==
+			    (ieee80211_get_tdls_action(skb) ==
 			     WLAN_TDLS_TEARDOWN)) {
 				ieee80211_tdls_td_tx_handle(local, sdata, skb,
 							    info->flags);
-- 
cgit v1.2.3


From 91535613b6090fc968c601d11d4e2f16b333713c Mon Sep 17 00:00:00 2001
From: Avraham Stern <avraham.stern@intel.com>
Date: Mon, 16 Oct 2023 14:52:48 +0300
Subject: wifi: mac80211: don't drop all unprotected public action frames

Not all public action frames have a protected variant. When MFP is
enabled drop only public action frames that have a dual protected
variant.

Fixes: 76a3059cf124 ("wifi: mac80211: drop some unprotected action frames")
Signed-off-by: Avraham Stern <avraham.stern@intel.com>
Signed-off-by: Gregory Greenman <gregory.greenman@intel.com>
Link: https://lore.kernel.org/r/20231016145213.2973e3c8d3bb.I6198b8d3b04cf4a97b06660d346caec3032f232a@changeid
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/linux/ieee80211.h | 29 +++++++++++++++++++++++++++++
 net/mac80211/rx.c         |  3 +--
 2 files changed, 30 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h
index bd2f6e19c357..b24fb80782c5 100644
--- a/include/linux/ieee80211.h
+++ b/include/linux/ieee80211.h
@@ -4355,6 +4355,35 @@ static inline bool ieee80211_is_public_action(struct ieee80211_hdr *hdr,
 	return mgmt->u.action.category == WLAN_CATEGORY_PUBLIC;
 }
 
+/**
+ * ieee80211_is_protected_dual_of_public_action - check if skb contains a
+ * protected dual of public action management frame
+ * @skb: the skb containing the frame, length will be checked
+ *
+ * Return: true if the skb contains a protected dual of public action
+ * management frame, false otherwise.
+ */
+static inline bool
+ieee80211_is_protected_dual_of_public_action(struct sk_buff *skb)
+{
+	u8 action;
+
+	if (!ieee80211_is_public_action((void *)skb->data, skb->len) ||
+	    skb->len < IEEE80211_MIN_ACTION_SIZE + 1)
+		return false;
+
+	action = *(u8 *)(skb->data + IEEE80211_MIN_ACTION_SIZE);
+
+	return action != WLAN_PUB_ACTION_20_40_BSS_COEX &&
+		action != WLAN_PUB_ACTION_DSE_REG_LOC_ANN &&
+		action != WLAN_PUB_ACTION_MSMT_PILOT &&
+		action != WLAN_PUB_ACTION_TDLS_DISCOVER_RES &&
+		action != WLAN_PUB_ACTION_LOC_TRACK_NOTI &&
+		action != WLAN_PUB_ACTION_FTM_REQUEST &&
+		action != WLAN_PUB_ACTION_FTM_RESPONSE &&
+		action != WLAN_PUB_ACTION_FILS_DISCOVERY;
+}
+
 /**
  * _ieee80211_is_group_privacy_action - check if frame is a group addressed
  * privacy action frame
diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c
index e751cda5eef6..8f6b6f56b65b 100644
--- a/net/mac80211/rx.c
+++ b/net/mac80211/rx.c
@@ -2468,8 +2468,7 @@ static int ieee80211_drop_unencrypted_mgmt(struct ieee80211_rx_data *rx)
 
 		/* drop unicast public action frames when using MPF */
 		if (is_unicast_ether_addr(mgmt->da) &&
-		    ieee80211_is_public_action((void *)rx->skb->data,
-					       rx->skb->len))
+		    ieee80211_is_protected_dual_of_public_action(rx->skb))
 			return -EACCES;
 	}
 
-- 
cgit v1.2.3


From 6f15b178cd6315c997981f76c6ebed7ad39144c5 Mon Sep 17 00:00:00 2001
From: Shubhrajyoti Datta <shubhrajyoti.datta@amd.com>
Date: Thu, 5 Oct 2023 15:42:42 +0530
Subject: EDAC/versal: Add a Xilinx Versal memory controller driver

Add a EDAC driver for the RAS capabilities on the Xilinx integrated DDR
Memory Controllers (DDRMCs) which support both DDR4 and LPDDR4/4X memory
interfaces. It has four programmable Network-on-Chip (NoC) interface
ports and is designed to handle multiple streams of traffic. The driver
reports correctable and uncorrectable errors, and also creates debugfs
entries for testing through error injection.

  [ bp:
   - Add a pointer to the documentation about the register unlock code.
   - Squash in a fix for a Smatch static checker issue as reported by
     Dan Carpenter:
     https://lore.kernel.org/r/a4db6f93-8e5f-4d55-a7b8-b5a987d48a58@moroto.mountain
  ]

Co-developed-by: Sai Krishna Potthuri <sai.krishna.potthuri@amd.com>
Signed-off-by: Sai Krishna Potthuri <sai.krishna.potthuri@amd.com>
Signed-off-by: Shubhrajyoti Datta <shubhrajyoti.datta@amd.com>
Signed-off-by: Borislav Petkov (AMD) <bp@alien8.de>
Link: https://lore.kernel.org/r/20231005101242.14621-3-shubhrajyoti.datta@amd.com
---
 MAINTAINERS                          |    7 +
 drivers/edac/Kconfig                 |   12 +
 drivers/edac/Makefile                |    1 +
 drivers/edac/versal_edac.c           | 1069 ++++++++++++++++++++++++++++++++++
 include/linux/firmware/xlnx-zynqmp.h |   12 +
 5 files changed, 1101 insertions(+)
 create mode 100644 drivers/edac/versal_edac.c

(limited to 'include/linux')

diff --git a/MAINTAINERS b/MAINTAINERS
index 35977b269d5e..628d02fa549a 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -23719,6 +23719,13 @@ F:	Documentation/devicetree/bindings/media/xilinx/
 F:	drivers/media/platform/xilinx/
 F:	include/uapi/linux/xilinx-v4l2-controls.h
 
+XILINX VERSAL EDAC DRIVER
+M:	Shubhrajyoti Datta <shubhrajyoti.datta@amd.com>
+M:	Sai Krishna Potthuri <sai.krishna.potthuri@amd.com>
+S:	Maintained
+F:	Documentation/devicetree/bindings/memory-controllers/xlnx,versal-ddrmc-edac.yaml
+F:	drivers/edac/versal_edac.c
+
 XILINX WATCHDOG DRIVER
 M:	Srinivas Neeli <srinivas.neeli@amd.com>
 R:	Shubhrajyoti Datta <shubhrajyoti.datta@amd.com>
diff --git a/drivers/edac/Kconfig b/drivers/edac/Kconfig
index 110e99b86a66..5a7f3fabee22 100644
--- a/drivers/edac/Kconfig
+++ b/drivers/edac/Kconfig
@@ -561,4 +561,16 @@ config EDAC_NPCM
 	  error detection (in-line ECC in which a section 1/8th of the memory
 	  device used to store data is used for ECC storage).
 
+config EDAC_VERSAL
+	tristate "Xilinx Versal DDR Memory Controller"
+	depends on ARCH_ZYNQMP || COMPILE_TEST
+	help
+	  Support for error detection and correction on the Xilinx Versal DDR
+	  memory controller.
+
+	  Report both single bit errors (CE) and double bit errors (UE).
+	  Support injecting both correctable and uncorrectable errors
+	  for debugging purposes.
+
+
 endif # EDAC
diff --git a/drivers/edac/Makefile b/drivers/edac/Makefile
index 61945d3113cc..9c09893695b7 100644
--- a/drivers/edac/Makefile
+++ b/drivers/edac/Makefile
@@ -86,3 +86,4 @@ obj-$(CONFIG_EDAC_BLUEFIELD)		+= bluefield_edac.o
 obj-$(CONFIG_EDAC_DMC520)		+= dmc520_edac.o
 obj-$(CONFIG_EDAC_NPCM)			+= npcm_edac.o
 obj-$(CONFIG_EDAC_ZYNQMP)		+= zynqmp_edac.o
+obj-$(CONFIG_EDAC_VERSAL)		+= versal_edac.o
diff --git a/drivers/edac/versal_edac.c b/drivers/edac/versal_edac.c
new file mode 100644
index 000000000000..87e730dfefa0
--- /dev/null
+++ b/drivers/edac/versal_edac.c
@@ -0,0 +1,1069 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Xilinx Versal memory controller driver
+ * Copyright (C) 2023 Advanced Micro Devices, Inc.
+ */
+#include <linux/bitfield.h>
+#include <linux/edac.h>
+#include <linux/interrupt.h>
+#include <linux/module.h>
+#include <linux/of.h>
+#include <linux/of_address.h>
+#include <linux/of_device.h>
+#include <linux/platform_device.h>
+#include <linux/sizes.h>
+#include <linux/firmware/xlnx-zynqmp.h>
+#include <linux/firmware/xlnx-event-manager.h>
+
+#include "edac_module.h"
+
+/* Granularity of reported error in bytes */
+#define XDDR_EDAC_ERR_GRAIN			1
+
+#define XDDR_EDAC_MSG_SIZE			256
+#define EVENT					2
+
+#define XDDR_PCSR_OFFSET			0xC
+#define XDDR_ISR_OFFSET				0x14
+#define XDDR_IRQ_EN_OFFSET			0x20
+#define XDDR_IRQ1_EN_OFFSET			0x2C
+#define XDDR_IRQ_DIS_OFFSET			0x24
+#define XDDR_IRQ_CE_MASK			GENMASK(18, 15)
+#define XDDR_IRQ_UE_MASK			GENMASK(14, 11)
+
+#define XDDR_REG_CONFIG0_OFFSET			0x258
+#define XDDR_REG_CONFIG0_BUS_WIDTH_MASK		GENMASK(19, 18)
+#define XDDR_REG_CONFIG0_NUM_CHANS_MASK		BIT(17)
+#define XDDR_REG_CONFIG0_NUM_RANKS_MASK		GENMASK(15, 14)
+#define XDDR_REG_CONFIG0_SIZE_MASK		GENMASK(10, 8)
+
+#define XDDR_REG_PINOUT_OFFSET			0x25C
+#define XDDR_REG_PINOUT_ECC_EN_MASK		GENMASK(7, 5)
+
+#define ECCW0_FLIP_CTRL				0x109C
+#define ECCW0_FLIP0_OFFSET			0x10A0
+#define ECCW1_FLIP_CTRL				0x10AC
+#define ECCW1_FLIP0_OFFSET			0x10B0
+#define ECCR0_CERR_STAT_OFFSET			0x10BC
+#define ECCR0_CE_ADDR_LO_OFFSET			0x10C0
+#define ECCR0_CE_ADDR_HI_OFFSET			0x10C4
+#define ECCR0_CE_DATA_LO_OFFSET			0x10C8
+#define ECCR0_CE_DATA_HI_OFFSET			0x10CC
+#define ECCR0_CE_DATA_PAR_OFFSET		0x10D0
+
+#define ECCR0_UERR_STAT_OFFSET			0x10D4
+#define ECCR0_UE_ADDR_LO_OFFSET			0x10D8
+#define ECCR0_UE_ADDR_HI_OFFSET			0x10DC
+#define ECCR0_UE_DATA_LO_OFFSET			0x10E0
+#define ECCR0_UE_DATA_HI_OFFSET			0x10E4
+#define ECCR0_UE_DATA_PAR_OFFSET		0x10E8
+
+#define ECCR1_CERR_STAT_OFFSET			0x10F4
+#define ECCR1_CE_ADDR_LO_OFFSET			0x10F8
+#define ECCR1_CE_ADDR_HI_OFFSET			0x10FC
+#define ECCR1_CE_DATA_LO_OFFSET			0x1100
+#define ECCR1_CE_DATA_HI_OFFSET			0x110C
+#define ECCR1_CE_DATA_PAR_OFFSET		0x1108
+
+#define ECCR1_UERR_STAT_OFFSET			0x110C
+#define ECCR1_UE_ADDR_LO_OFFSET			0x1110
+#define ECCR1_UE_ADDR_HI_OFFSET			0x1114
+#define ECCR1_UE_DATA_LO_OFFSET			0x1118
+#define ECCR1_UE_DATA_HI_OFFSET			0x111C
+#define ECCR1_UE_DATA_PAR_OFFSET		0x1120
+
+#define XDDR_NOC_REG_ADEC4_OFFSET		0x44
+#define RANK_1_MASK				GENMASK(11, 6)
+#define LRANK_0_MASK				GENMASK(17, 12)
+#define LRANK_1_MASK				GENMASK(23, 18)
+#define MASK_24					GENMASK(29, 24)
+
+#define XDDR_NOC_REG_ADEC5_OFFSET		0x48
+#define XDDR_NOC_REG_ADEC6_OFFSET		0x4C
+#define XDDR_NOC_REG_ADEC7_OFFSET		0x50
+#define XDDR_NOC_REG_ADEC8_OFFSET		0x54
+#define XDDR_NOC_REG_ADEC9_OFFSET		0x58
+#define XDDR_NOC_REG_ADEC10_OFFSET		0x5C
+
+#define XDDR_NOC_REG_ADEC11_OFFSET		0x60
+#define MASK_0					GENMASK(5, 0)
+#define GRP_0_MASK				GENMASK(11, 6)
+#define GRP_1_MASK				GENMASK(17, 12)
+#define CH_0_MASK				GENMASK(23, 18)
+
+#define XDDR_NOC_REG_ADEC12_OFFSET		0x71C
+#define XDDR_NOC_REG_ADEC13_OFFSET		0x720
+
+#define XDDR_NOC_REG_ADEC14_OFFSET		0x724
+#define XDDR_NOC_ROW_MATCH_MASK			GENMASK(17, 0)
+#define XDDR_NOC_COL_MATCH_MASK			GENMASK(27, 18)
+#define XDDR_NOC_BANK_MATCH_MASK		GENMASK(29, 28)
+#define XDDR_NOC_GRP_MATCH_MASK			GENMASK(31, 30)
+
+#define XDDR_NOC_REG_ADEC15_OFFSET		0x728
+#define XDDR_NOC_RANK_MATCH_MASK		GENMASK(1, 0)
+#define XDDR_NOC_LRANK_MATCH_MASK		GENMASK(4, 2)
+#define XDDR_NOC_CH_MATCH_MASK			BIT(5)
+#define XDDR_NOC_MOD_SEL_MASK			BIT(6)
+#define XDDR_NOC_MATCH_EN_MASK			BIT(8)
+
+#define ECCR_UE_CE_ADDR_HI_ROW_MASK		GENMASK(7, 0)
+
+#define XDDR_EDAC_NR_CSROWS			1
+#define XDDR_EDAC_NR_CHANS			1
+
+#define XDDR_BUS_WIDTH_64			0
+#define XDDR_BUS_WIDTH_32			1
+#define XDDR_BUS_WIDTH_16			2
+
+#define ECC_CEPOISON_MASK			0x1
+#define ECC_UEPOISON_MASK			0x3
+
+#define XDDR_MAX_ROW_CNT			18
+#define XDDR_MAX_COL_CNT			10
+#define XDDR_MAX_RANK_CNT			2
+#define XDDR_MAX_LRANK_CNT			3
+#define XDDR_MAX_BANK_CNT			2
+#define XDDR_MAX_GRP_CNT			2
+
+/*
+ * Config and system registers are usually locked. This is the
+ * code which unlocks them in order to accept writes. See
+ *
+ * https://docs.xilinx.com/r/en-US/am012-versal-register-reference/PCSR_LOCK-XRAM_SLCR-Register
+ */
+#define PCSR_UNLOCK_VAL				0xF9E8D7C6
+#define XDDR_ERR_TYPE_CE			0
+#define XDDR_ERR_TYPE_UE			1
+
+#define XILINX_DRAM_SIZE_4G			0
+#define XILINX_DRAM_SIZE_6G			1
+#define XILINX_DRAM_SIZE_8G			2
+#define XILINX_DRAM_SIZE_12G			3
+#define XILINX_DRAM_SIZE_16G			4
+#define XILINX_DRAM_SIZE_32G			5
+
+/**
+ * struct ecc_error_info - ECC error log information.
+ * @burstpos:		Burst position.
+ * @lrank:		Logical Rank number.
+ * @rank:		Rank number.
+ * @group:		Group number.
+ * @bank:		Bank number.
+ * @col:		Column number.
+ * @row:		Row number.
+ * @rowhi:		Row number higher bits.
+ * @i:			ECC error info.
+ */
+union ecc_error_info {
+	struct {
+		u32 burstpos:3;
+		u32 lrank:3;
+		u32 rank:2;
+		u32 group:2;
+		u32 bank:2;
+		u32 col:10;
+		u32 row:10;
+		u32 rowhi;
+	};
+	u64 i;
+} __packed;
+
+union edac_info {
+	struct {
+		u32 row0:6;
+		u32 row1:6;
+		u32 row2:6;
+		u32 row3:6;
+		u32 row4:6;
+		u32 reserved:2;
+	};
+	struct {
+		u32 col1:6;
+		u32 col2:6;
+		u32 col3:6;
+		u32 col4:6;
+		u32 col5:6;
+		u32 reservedcol:2;
+	};
+	u32 i;
+} __packed;
+
+/**
+ * struct ecc_status - ECC status information to report.
+ * @ceinfo:	Correctable error log information.
+ * @ueinfo:	Uncorrectable error log information.
+ * @channel:	Channel number.
+ * @error_type:	Error type information.
+ */
+struct ecc_status {
+	union ecc_error_info ceinfo[2];
+	union ecc_error_info ueinfo[2];
+	u8 channel;
+	u8 error_type;
+};
+
+/**
+ * struct edac_priv - DDR memory controller private instance data.
+ * @ddrmc_baseaddr:	Base address of the DDR controller.
+ * @ddrmc_noc_baseaddr:	Base address of the DDRMC NOC.
+ * @message:		Buffer for framing the event specific info.
+ * @mc_id:		Memory controller ID.
+ * @ce_cnt:		Correctable error count.
+ * @ue_cnt:		UnCorrectable error count.
+ * @stat:		ECC status information.
+ * @lrank_bit:		Bit shifts for lrank bit.
+ * @rank_bit:		Bit shifts for rank bit.
+ * @row_bit:		Bit shifts for row bit.
+ * @col_bit:		Bit shifts for column bit.
+ * @bank_bit:		Bit shifts for bank bit.
+ * @grp_bit:		Bit shifts for group bit.
+ * @ch_bit:		Bit shifts for channel bit.
+ * @err_inject_addr:	Data poison address.
+ * @debugfs:		Debugfs handle.
+ */
+struct edac_priv {
+	void __iomem *ddrmc_baseaddr;
+	void __iomem *ddrmc_noc_baseaddr;
+	char message[XDDR_EDAC_MSG_SIZE];
+	u32 mc_id;
+	u32 ce_cnt;
+	u32 ue_cnt;
+	struct ecc_status stat;
+	u32 lrank_bit[3];
+	u32 rank_bit[2];
+	u32 row_bit[18];
+	u32 col_bit[10];
+	u32 bank_bit[2];
+	u32 grp_bit[2];
+	u32 ch_bit;
+#ifdef CONFIG_EDAC_DEBUG
+	u64 err_inject_addr;
+	struct dentry *debugfs;
+#endif
+};
+
+static void get_ce_error_info(struct edac_priv *priv)
+{
+	void __iomem *ddrmc_base;
+	struct ecc_status *p;
+	u32  regval;
+	u64  reghi;
+
+	ddrmc_base = priv->ddrmc_baseaddr;
+	p = &priv->stat;
+
+	p->error_type = XDDR_ERR_TYPE_CE;
+	regval = readl(ddrmc_base + ECCR0_CE_ADDR_LO_OFFSET);
+	reghi = regval & ECCR_UE_CE_ADDR_HI_ROW_MASK;
+	p->ceinfo[0].i = regval | reghi << 32;
+	regval = readl(ddrmc_base + ECCR0_CE_ADDR_HI_OFFSET);
+
+	edac_dbg(2, "ERR DATA: 0x%08X%08X ERR DATA PARITY: 0x%08X\n",
+		 readl(ddrmc_base + ECCR0_CE_DATA_LO_OFFSET),
+		 readl(ddrmc_base + ECCR0_CE_DATA_HI_OFFSET),
+		 readl(ddrmc_base + ECCR0_CE_DATA_PAR_OFFSET));
+
+	regval = readl(ddrmc_base + ECCR1_CE_ADDR_LO_OFFSET);
+	reghi = readl(ddrmc_base + ECCR1_CE_ADDR_HI_OFFSET);
+	p->ceinfo[1].i = regval | reghi << 32;
+	regval = readl(ddrmc_base + ECCR1_CE_ADDR_HI_OFFSET);
+
+	edac_dbg(2, "ERR DATA: 0x%08X%08X ERR DATA PARITY: 0x%08X\n",
+		 readl(ddrmc_base + ECCR1_CE_DATA_LO_OFFSET),
+		 readl(ddrmc_base + ECCR1_CE_DATA_HI_OFFSET),
+		 readl(ddrmc_base + ECCR1_CE_DATA_PAR_OFFSET));
+}
+
+static void get_ue_error_info(struct edac_priv *priv)
+{
+	void __iomem *ddrmc_base;
+	struct ecc_status *p;
+	u32  regval;
+	u64 reghi;
+
+	ddrmc_base = priv->ddrmc_baseaddr;
+	p = &priv->stat;
+
+	p->error_type = XDDR_ERR_TYPE_UE;
+	regval = readl(ddrmc_base + ECCR0_UE_ADDR_LO_OFFSET);
+	reghi = readl(ddrmc_base + ECCR0_UE_ADDR_HI_OFFSET);
+
+	p->ueinfo[0].i = regval | reghi << 32;
+	regval = readl(ddrmc_base + ECCR0_UE_ADDR_HI_OFFSET);
+
+	edac_dbg(2, "ERR DATA: 0x%08X%08X ERR DATA PARITY: 0x%08X\n",
+		 readl(ddrmc_base + ECCR0_UE_DATA_LO_OFFSET),
+		 readl(ddrmc_base + ECCR0_UE_DATA_HI_OFFSET),
+		 readl(ddrmc_base + ECCR0_UE_DATA_PAR_OFFSET));
+
+	regval = readl(ddrmc_base + ECCR1_UE_ADDR_LO_OFFSET);
+	reghi = readl(ddrmc_base + ECCR1_UE_ADDR_HI_OFFSET);
+	p->ueinfo[1].i = regval | reghi << 32;
+
+	edac_dbg(2, "ERR DATA: 0x%08X%08X ERR DATA PARITY: 0x%08X\n",
+		 readl(ddrmc_base + ECCR1_UE_DATA_LO_OFFSET),
+		 readl(ddrmc_base + ECCR1_UE_DATA_HI_OFFSET),
+		 readl(ddrmc_base + ECCR1_UE_DATA_PAR_OFFSET));
+}
+
+static bool get_error_info(struct edac_priv *priv)
+{
+	u32 eccr0_ceval, eccr1_ceval, eccr0_ueval, eccr1_ueval;
+	void __iomem *ddrmc_base;
+	struct ecc_status *p;
+
+	ddrmc_base = priv->ddrmc_baseaddr;
+	p = &priv->stat;
+
+	eccr0_ceval = readl(ddrmc_base + ECCR0_CERR_STAT_OFFSET);
+	eccr1_ceval = readl(ddrmc_base + ECCR1_CERR_STAT_OFFSET);
+	eccr0_ueval = readl(ddrmc_base + ECCR0_UERR_STAT_OFFSET);
+	eccr1_ueval = readl(ddrmc_base + ECCR1_UERR_STAT_OFFSET);
+
+	if (!eccr0_ceval && !eccr1_ceval && !eccr0_ueval && !eccr1_ueval)
+		return 1;
+	if (!eccr0_ceval)
+		p->channel = 1;
+	else
+		p->channel = 0;
+
+	if (eccr0_ceval || eccr1_ceval)
+		get_ce_error_info(priv);
+
+	if (eccr0_ueval || eccr1_ueval) {
+		if (!eccr0_ueval)
+			p->channel = 1;
+		else
+			p->channel = 0;
+		get_ue_error_info(priv);
+	}
+
+	/* Unlock the PCSR registers */
+	writel(PCSR_UNLOCK_VAL, ddrmc_base + XDDR_PCSR_OFFSET);
+
+	writel(0, ddrmc_base + ECCR0_CERR_STAT_OFFSET);
+	writel(0, ddrmc_base + ECCR1_CERR_STAT_OFFSET);
+	writel(0, ddrmc_base + ECCR0_UERR_STAT_OFFSET);
+	writel(0, ddrmc_base + ECCR1_UERR_STAT_OFFSET);
+
+	/* Lock the PCSR registers */
+	writel(1, ddrmc_base + XDDR_PCSR_OFFSET);
+
+	return 0;
+}
+
+/**
+ * convert_to_physical - Convert to physical address.
+ * @priv:	DDR memory controller private instance data.
+ * @pinf:	ECC error info structure.
+ *
+ * Return: Physical address of the DDR memory.
+ */
+static unsigned long convert_to_physical(struct edac_priv *priv, union ecc_error_info pinf)
+{
+	unsigned long err_addr = 0;
+	u32 index;
+	u32 row;
+
+	row = pinf.rowhi << 10 | pinf.row;
+	for (index = 0; index < XDDR_MAX_ROW_CNT; index++) {
+		err_addr |= (row & BIT(0)) << priv->row_bit[index];
+		row >>= 1;
+	}
+
+	for (index = 0; index < XDDR_MAX_COL_CNT; index++) {
+		err_addr |= (pinf.col & BIT(0)) << priv->col_bit[index];
+		pinf.col >>= 1;
+	}
+
+	for (index = 0; index < XDDR_MAX_BANK_CNT; index++) {
+		err_addr |= (pinf.bank & BIT(0)) << priv->bank_bit[index];
+		pinf.bank >>= 1;
+	}
+
+	for (index = 0; index < XDDR_MAX_GRP_CNT; index++) {
+		err_addr |= (pinf.group & BIT(0)) << priv->grp_bit[index];
+		pinf.group >>= 1;
+	}
+
+	for (index = 0; index < XDDR_MAX_RANK_CNT; index++) {
+		err_addr |= (pinf.rank & BIT(0)) << priv->rank_bit[index];
+		pinf.rank >>= 1;
+	}
+
+	for (index = 0; index < XDDR_MAX_LRANK_CNT; index++) {
+		err_addr |= (pinf.lrank & BIT(0)) << priv->lrank_bit[index];
+		pinf.lrank >>= 1;
+	}
+
+	err_addr |= (priv->stat.channel & BIT(0)) << priv->ch_bit;
+
+	return err_addr;
+}
+
+/**
+ * handle_error - Handle Correctable and Uncorrectable errors.
+ * @mci:	EDAC memory controller instance.
+ * @stat:	ECC status structure.
+ *
+ * Handles ECC correctable and uncorrectable errors.
+ */
+static void handle_error(struct mem_ctl_info *mci, struct ecc_status *stat)
+{
+	struct edac_priv *priv = mci->pvt_info;
+	union ecc_error_info pinf;
+
+	if (stat->error_type == XDDR_ERR_TYPE_CE) {
+		priv->ce_cnt++;
+		pinf = stat->ceinfo[stat->channel];
+		snprintf(priv->message, XDDR_EDAC_MSG_SIZE,
+			 "Error type:%s MC ID: %d Addr at %lx Burst Pos: %d\n",
+			 "CE", priv->mc_id,
+			 convert_to_physical(priv, pinf), pinf.burstpos);
+
+		edac_mc_handle_error(HW_EVENT_ERR_CORRECTED, mci,
+				     priv->ce_cnt, 0, 0, 0, 0, 0, -1,
+				     priv->message, "");
+	}
+
+	if (stat->error_type == XDDR_ERR_TYPE_UE) {
+		priv->ue_cnt++;
+		pinf = stat->ueinfo[stat->channel];
+		snprintf(priv->message, XDDR_EDAC_MSG_SIZE,
+			 "Error type:%s MC ID: %d Addr at %lx Burst Pos: %d\n",
+			 "UE", priv->mc_id,
+			 convert_to_physical(priv, pinf), pinf.burstpos);
+
+		edac_mc_handle_error(HW_EVENT_ERR_UNCORRECTED, mci,
+				     priv->ue_cnt, 0, 0, 0, 0, 0, -1,
+				     priv->message, "");
+	}
+
+	memset(stat, 0, sizeof(*stat));
+}
+
+/**
+ * err_callback - Handle Correctable and Uncorrectable errors.
+ * @payload:	payload data.
+ * @data:	mci controller data.
+ *
+ * Handles ECC correctable and uncorrectable errors.
+ */
+static void err_callback(const u32 *payload, void *data)
+{
+	struct mem_ctl_info *mci = (struct mem_ctl_info *)data;
+	struct edac_priv *priv;
+	struct ecc_status *p;
+	int regval;
+
+	priv = mci->pvt_info;
+	p = &priv->stat;
+
+	regval = readl(priv->ddrmc_baseaddr + XDDR_ISR_OFFSET);
+
+	if (payload[EVENT] == XPM_EVENT_ERROR_MASK_DDRMC_CR)
+		p->error_type = XDDR_ERR_TYPE_CE;
+	if (payload[EVENT] == XPM_EVENT_ERROR_MASK_DDRMC_NCR)
+		p->error_type = XDDR_ERR_TYPE_UE;
+
+	if (get_error_info(priv))
+		return;
+
+	handle_error(mci, &priv->stat);
+
+	/* Unlock the PCSR registers */
+	writel(PCSR_UNLOCK_VAL, priv->ddrmc_baseaddr + XDDR_PCSR_OFFSET);
+
+	/* Clear the ISR */
+	writel(regval, priv->ddrmc_baseaddr + XDDR_ISR_OFFSET);
+
+	/* Lock the PCSR registers */
+	writel(1, priv->ddrmc_baseaddr + XDDR_PCSR_OFFSET);
+	edac_dbg(3, "Total error count CE %d UE %d\n",
+		 priv->ce_cnt, priv->ue_cnt);
+}
+
+/**
+ * get_dwidth - Return the controller memory width.
+ * @base:	DDR memory controller base address.
+ *
+ * Get the EDAC device type width appropriate for the controller
+ * configuration.
+ *
+ * Return: a device type width enumeration.
+ */
+static enum dev_type get_dwidth(const void __iomem *base)
+{
+	enum dev_type dt;
+	u32 regval;
+	u32 width;
+
+	regval = readl(base + XDDR_REG_CONFIG0_OFFSET);
+	width  = FIELD_GET(XDDR_REG_CONFIG0_BUS_WIDTH_MASK, regval);
+
+	switch (width) {
+	case XDDR_BUS_WIDTH_16:
+		dt = DEV_X2;
+		break;
+	case XDDR_BUS_WIDTH_32:
+		dt = DEV_X4;
+		break;
+	case XDDR_BUS_WIDTH_64:
+		dt = DEV_X8;
+		break;
+	default:
+		dt = DEV_UNKNOWN;
+	}
+
+	return dt;
+}
+
+/**
+ * get_ecc_state - Return the controller ECC enable/disable status.
+ * @base:	DDR memory controller base address.
+ *
+ * Get the ECC enable/disable status for the controller.
+ *
+ * Return: a ECC status boolean i.e true/false - enabled/disabled.
+ */
+static bool get_ecc_state(void __iomem *base)
+{
+	enum dev_type dt;
+	u32 ecctype;
+
+	dt = get_dwidth(base);
+	if (dt == DEV_UNKNOWN)
+		return false;
+
+	ecctype = readl(base + XDDR_REG_PINOUT_OFFSET);
+	ecctype &= XDDR_REG_PINOUT_ECC_EN_MASK;
+
+	return !!ecctype;
+}
+
+/**
+ * get_memsize - Get the size of the attached memory device.
+ * @priv:	DDR memory controller private instance data.
+ *
+ * Return: the memory size in bytes.
+ */
+static u64 get_memsize(struct edac_priv *priv)
+{
+	u32 regval;
+	u64 size;
+
+	regval = readl(priv->ddrmc_baseaddr + XDDR_REG_CONFIG0_OFFSET);
+	regval  = FIELD_GET(XDDR_REG_CONFIG0_SIZE_MASK, regval);
+
+	switch (regval) {
+	case XILINX_DRAM_SIZE_4G:
+		size = 4U;      break;
+	case XILINX_DRAM_SIZE_6G:
+		size = 6U;      break;
+	case XILINX_DRAM_SIZE_8G:
+		size = 8U;      break;
+	case XILINX_DRAM_SIZE_12G:
+		size = 12U;     break;
+	case XILINX_DRAM_SIZE_16G:
+		size = 16U;     break;
+	case XILINX_DRAM_SIZE_32G:
+		size = 32U;     break;
+	/* Invalid configuration */
+	default:
+		size = 0;	break;
+	}
+
+	size *= SZ_1G;
+	return size;
+}
+
+/**
+ * init_csrows - Initialize the csrow data.
+ * @mci:	EDAC memory controller instance.
+ *
+ * Initialize the chip select rows associated with the EDAC memory
+ * controller instance.
+ */
+static void init_csrows(struct mem_ctl_info *mci)
+{
+	struct edac_priv *priv = mci->pvt_info;
+	struct csrow_info *csi;
+	struct dimm_info *dimm;
+	unsigned long size;
+	u32 row;
+	int ch;
+
+	size = get_memsize(priv);
+	for (row = 0; row < mci->nr_csrows; row++) {
+		csi = mci->csrows[row];
+		for (ch = 0; ch < csi->nr_channels; ch++) {
+			dimm = csi->channels[ch]->dimm;
+			dimm->edac_mode	= EDAC_SECDED;
+			dimm->mtype = MEM_DDR4;
+			dimm->nr_pages = (size >> PAGE_SHIFT) / csi->nr_channels;
+			dimm->grain = XDDR_EDAC_ERR_GRAIN;
+			dimm->dtype = get_dwidth(priv->ddrmc_baseaddr);
+		}
+	}
+}
+
+/**
+ * mc_init - Initialize one driver instance.
+ * @mci:	EDAC memory controller instance.
+ * @pdev:	platform device.
+ *
+ * Perform initialization of the EDAC memory controller instance and
+ * related driver-private data associated with the memory controller the
+ * instance is bound to.
+ */
+static void mc_init(struct mem_ctl_info *mci, struct platform_device *pdev)
+{
+	mci->pdev = &pdev->dev;
+	platform_set_drvdata(pdev, mci);
+
+	/* Initialize controller capabilities and configuration */
+	mci->mtype_cap = MEM_FLAG_DDR4;
+	mci->edac_ctl_cap = EDAC_FLAG_NONE | EDAC_FLAG_SECDED;
+	mci->scrub_cap = SCRUB_HW_SRC;
+	mci->scrub_mode = SCRUB_NONE;
+
+	mci->edac_cap = EDAC_FLAG_SECDED;
+	mci->ctl_name = "xlnx_ddr_controller";
+	mci->dev_name = dev_name(&pdev->dev);
+	mci->mod_name = "xlnx_edac";
+
+	edac_op_state = EDAC_OPSTATE_INT;
+
+	init_csrows(mci);
+}
+
+static void enable_intr(struct edac_priv *priv)
+{
+	/* Unlock the PCSR registers */
+	writel(PCSR_UNLOCK_VAL, priv->ddrmc_baseaddr + XDDR_PCSR_OFFSET);
+
+	/* Enable UE and CE Interrupts to support the interrupt case */
+	writel(XDDR_IRQ_CE_MASK | XDDR_IRQ_UE_MASK,
+	       priv->ddrmc_baseaddr + XDDR_IRQ_EN_OFFSET);
+
+	writel(XDDR_IRQ_UE_MASK,
+	       priv->ddrmc_baseaddr + XDDR_IRQ1_EN_OFFSET);
+	/* Lock the PCSR registers */
+	writel(1, priv->ddrmc_baseaddr + XDDR_PCSR_OFFSET);
+}
+
+static void disable_intr(struct edac_priv *priv)
+{
+	/* Unlock the PCSR registers */
+	writel(PCSR_UNLOCK_VAL, priv->ddrmc_baseaddr + XDDR_PCSR_OFFSET);
+
+	/* Disable UE/CE Interrupts */
+	writel(XDDR_IRQ_CE_MASK | XDDR_IRQ_UE_MASK,
+	       priv->ddrmc_baseaddr + XDDR_IRQ_DIS_OFFSET);
+
+	/* Lock the PCSR registers */
+	writel(1, priv->ddrmc_baseaddr + XDDR_PCSR_OFFSET);
+}
+
+#define to_mci(k) container_of(k, struct mem_ctl_info, dev)
+
+#ifdef CONFIG_EDAC_DEBUG
+/**
+ * poison_setup - Update poison registers.
+ * @priv:	DDR memory controller private instance data.
+ *
+ * Update poison registers as per DDR mapping upon write of the address
+ * location the fault is injected.
+ * Return: none.
+ */
+static void poison_setup(struct edac_priv *priv)
+{
+	u32 col = 0, row = 0, bank = 0, grp = 0, rank = 0, lrank = 0, ch = 0;
+	u32 index, regval;
+
+	for (index = 0; index < XDDR_MAX_ROW_CNT; index++) {
+		row |= (((priv->err_inject_addr >> priv->row_bit[index]) &
+						BIT(0)) << index);
+	}
+
+	for (index = 0; index < XDDR_MAX_COL_CNT; index++) {
+		col |= (((priv->err_inject_addr >> priv->col_bit[index]) &
+						BIT(0)) << index);
+	}
+
+	for (index = 0; index < XDDR_MAX_BANK_CNT; index++) {
+		bank |= (((priv->err_inject_addr >> priv->bank_bit[index]) &
+						BIT(0)) << index);
+	}
+
+	for (index = 0; index < XDDR_MAX_GRP_CNT; index++) {
+		grp |= (((priv->err_inject_addr >> priv->grp_bit[index]) &
+						BIT(0)) << index);
+	}
+
+	for (index = 0; index < XDDR_MAX_RANK_CNT; index++) {
+		rank |= (((priv->err_inject_addr >> priv->rank_bit[index]) &
+						BIT(0)) << index);
+	}
+
+	for (index = 0; index < XDDR_MAX_LRANK_CNT; index++) {
+		lrank |= (((priv->err_inject_addr >> priv->lrank_bit[index]) &
+						BIT(0)) << index);
+	}
+
+	ch = (priv->err_inject_addr >> priv->ch_bit) & BIT(0);
+	if (ch)
+		writel(0xFF, priv->ddrmc_baseaddr + ECCW1_FLIP_CTRL);
+	else
+		writel(0xFF, priv->ddrmc_baseaddr + ECCW0_FLIP_CTRL);
+
+	writel(0, priv->ddrmc_noc_baseaddr + XDDR_NOC_REG_ADEC12_OFFSET);
+	writel(0, priv->ddrmc_noc_baseaddr + XDDR_NOC_REG_ADEC13_OFFSET);
+
+	regval = row & XDDR_NOC_ROW_MATCH_MASK;
+	regval |= FIELD_PREP(XDDR_NOC_COL_MATCH_MASK, col);
+	regval |= FIELD_PREP(XDDR_NOC_BANK_MATCH_MASK, bank);
+	regval |= FIELD_PREP(XDDR_NOC_GRP_MATCH_MASK, grp);
+	writel(regval, priv->ddrmc_noc_baseaddr + XDDR_NOC_REG_ADEC14_OFFSET);
+
+	regval = rank & XDDR_NOC_RANK_MATCH_MASK;
+	regval |= FIELD_PREP(XDDR_NOC_LRANK_MATCH_MASK, lrank);
+	regval |= FIELD_PREP(XDDR_NOC_CH_MATCH_MASK, ch);
+	regval |= (XDDR_NOC_MOD_SEL_MASK | XDDR_NOC_MATCH_EN_MASK);
+	writel(regval, priv->ddrmc_noc_baseaddr + XDDR_NOC_REG_ADEC15_OFFSET);
+}
+
+static ssize_t xddr_inject_data_poison_store(struct mem_ctl_info *mci,
+					     const char __user *data)
+{
+	struct edac_priv *priv = mci->pvt_info;
+
+	writel(0, priv->ddrmc_baseaddr + ECCW0_FLIP0_OFFSET);
+	writel(0, priv->ddrmc_baseaddr + ECCW1_FLIP0_OFFSET);
+
+	if (strncmp(data, "CE", 2) == 0) {
+		writel(ECC_CEPOISON_MASK, priv->ddrmc_baseaddr +
+		       ECCW0_FLIP0_OFFSET);
+		writel(ECC_CEPOISON_MASK, priv->ddrmc_baseaddr +
+		       ECCW1_FLIP0_OFFSET);
+	} else {
+		writel(ECC_UEPOISON_MASK, priv->ddrmc_baseaddr +
+		       ECCW0_FLIP0_OFFSET);
+		writel(ECC_UEPOISON_MASK, priv->ddrmc_baseaddr +
+		       ECCW1_FLIP0_OFFSET);
+	}
+
+	/* Lock the PCSR registers */
+	writel(1, priv->ddrmc_baseaddr + XDDR_PCSR_OFFSET);
+
+	return 0;
+}
+
+static ssize_t inject_data_poison_store(struct file *file, const char __user *data,
+					size_t count, loff_t *ppos)
+{
+	struct device *dev = file->private_data;
+	struct mem_ctl_info *mci = to_mci(dev);
+	struct edac_priv *priv = mci->pvt_info;
+
+	/* Unlock the PCSR registers */
+	writel(PCSR_UNLOCK_VAL, priv->ddrmc_baseaddr + XDDR_PCSR_OFFSET);
+	writel(PCSR_UNLOCK_VAL, priv->ddrmc_noc_baseaddr + XDDR_PCSR_OFFSET);
+
+	poison_setup(priv);
+
+	/* Lock the PCSR registers */
+	writel(1, priv->ddrmc_noc_baseaddr + XDDR_PCSR_OFFSET);
+
+	xddr_inject_data_poison_store(mci, data);
+
+	return count;
+}
+
+static const struct file_operations xddr_inject_enable_fops = {
+	.open = simple_open,
+	.write = inject_data_poison_store,
+	.llseek = generic_file_llseek,
+};
+
+static void create_debugfs_attributes(struct mem_ctl_info *mci)
+{
+	struct edac_priv *priv = mci->pvt_info;
+
+	priv->debugfs = edac_debugfs_create_dir(mci->dev_name);
+	if (!priv->debugfs)
+		return;
+
+	edac_debugfs_create_file("inject_error", 0200, priv->debugfs,
+				 &mci->dev, &xddr_inject_enable_fops);
+	debugfs_create_x64("address", 0600, priv->debugfs,
+			   &priv->err_inject_addr);
+	mci->debugfs = priv->debugfs;
+}
+
+static inline void process_bit(struct edac_priv *priv, unsigned int start, u32 regval)
+{
+	union edac_info rows;
+
+	rows.i  = regval;
+	priv->row_bit[start]	 = rows.row0;
+	priv->row_bit[start + 1] = rows.row1;
+	priv->row_bit[start + 2] = rows.row2;
+	priv->row_bit[start + 3] = rows.row3;
+	priv->row_bit[start + 4] = rows.row4;
+}
+
+static void setup_row_address_map(struct edac_priv *priv)
+{
+	u32 regval;
+	union edac_info rows;
+
+	regval = readl(priv->ddrmc_noc_baseaddr + XDDR_NOC_REG_ADEC5_OFFSET);
+	process_bit(priv, 0, regval);
+
+	regval = readl(priv->ddrmc_noc_baseaddr + XDDR_NOC_REG_ADEC6_OFFSET);
+	process_bit(priv, 5, regval);
+
+	regval = readl(priv->ddrmc_noc_baseaddr + XDDR_NOC_REG_ADEC7_OFFSET);
+	process_bit(priv, 10, regval);
+
+	regval = readl(priv->ddrmc_noc_baseaddr + XDDR_NOC_REG_ADEC8_OFFSET);
+	rows.i  = regval;
+
+	priv->row_bit[15] = rows.row0;
+	priv->row_bit[16] = rows.row1;
+	priv->row_bit[17] = rows.row2;
+}
+
+static void setup_column_address_map(struct edac_priv *priv)
+{
+	u32 regval;
+	union edac_info cols;
+
+	regval = readl(priv->ddrmc_noc_baseaddr + XDDR_NOC_REG_ADEC8_OFFSET);
+	priv->col_bit[0] = FIELD_GET(MASK_24, regval);
+
+	regval = readl(priv->ddrmc_noc_baseaddr + XDDR_NOC_REG_ADEC9_OFFSET);
+	cols.i  = regval;
+	priv->col_bit[1] = cols.col1;
+	priv->col_bit[2] = cols.col2;
+	priv->col_bit[3] = cols.col3;
+	priv->col_bit[4] = cols.col4;
+	priv->col_bit[5] = cols.col5;
+
+	regval = readl(priv->ddrmc_noc_baseaddr + XDDR_NOC_REG_ADEC10_OFFSET);
+	cols.i  = regval;
+	priv->col_bit[6] = cols.col1;
+	priv->col_bit[7] = cols.col2;
+	priv->col_bit[8] = cols.col3;
+	priv->col_bit[9] = cols.col4;
+}
+
+static void setup_bank_grp_ch_address_map(struct edac_priv *priv)
+{
+	u32 regval;
+
+	regval = readl(priv->ddrmc_noc_baseaddr + XDDR_NOC_REG_ADEC10_OFFSET);
+	priv->bank_bit[0] = FIELD_GET(MASK_24, regval);
+
+	regval = readl(priv->ddrmc_noc_baseaddr + XDDR_NOC_REG_ADEC11_OFFSET);
+	priv->bank_bit[1] = (regval & MASK_0);
+	priv->grp_bit[0] = FIELD_GET(GRP_0_MASK, regval);
+	priv->grp_bit[1] = FIELD_GET(GRP_1_MASK, regval);
+	priv->ch_bit = FIELD_GET(CH_0_MASK, regval);
+}
+
+static void setup_rank_lrank_address_map(struct edac_priv *priv)
+{
+	u32 regval;
+
+	regval = readl(priv->ddrmc_noc_baseaddr + XDDR_NOC_REG_ADEC4_OFFSET);
+	priv->rank_bit[0] = (regval & MASK_0);
+	priv->rank_bit[1] = FIELD_GET(RANK_1_MASK, regval);
+	priv->lrank_bit[0] = FIELD_GET(LRANK_0_MASK, regval);
+	priv->lrank_bit[1] = FIELD_GET(LRANK_1_MASK, regval);
+	priv->lrank_bit[2] = FIELD_GET(MASK_24, regval);
+}
+
+/**
+ * setup_address_map - Set Address Map by querying ADDRMAP registers.
+ * @priv:	DDR memory controller private instance data.
+ *
+ * Set Address Map by querying ADDRMAP registers.
+ *
+ * Return: none.
+ */
+static void setup_address_map(struct edac_priv *priv)
+{
+	setup_row_address_map(priv);
+
+	setup_column_address_map(priv);
+
+	setup_bank_grp_ch_address_map(priv);
+
+	setup_rank_lrank_address_map(priv);
+}
+#endif /* CONFIG_EDAC_DEBUG */
+
+static const struct of_device_id xlnx_edac_match[] = {
+	{ .compatible = "xlnx,versal-ddrmc", },
+	{
+		/* end of table */
+	}
+};
+
+MODULE_DEVICE_TABLE(of, xlnx_edac_match);
+static u32 emif_get_id(struct device_node *node)
+{
+	u32 addr, my_addr, my_id = 0;
+	struct device_node *np;
+	const __be32 *addrp;
+
+	addrp = of_get_address(node, 0, NULL, NULL);
+	my_addr = (u32)of_translate_address(node, addrp);
+
+	for_each_matching_node(np, xlnx_edac_match) {
+		if (np == node)
+			continue;
+
+		addrp = of_get_address(np, 0, NULL, NULL);
+		addr = (u32)of_translate_address(np, addrp);
+
+		edac_printk(KERN_INFO, EDAC_MC,
+			    "addr=%x, my_addr=%x\n",
+			    addr, my_addr);
+
+		if (addr < my_addr)
+			my_id++;
+	}
+
+	return my_id;
+}
+
+static int mc_probe(struct platform_device *pdev)
+{
+	void __iomem *ddrmc_baseaddr, *ddrmc_noc_baseaddr;
+	struct edac_mc_layer layers[2];
+	struct mem_ctl_info *mci;
+	u8 num_chans, num_csrows;
+	struct edac_priv *priv;
+	u32 edac_mc_id, regval;
+	int rc;
+
+	ddrmc_baseaddr = devm_platform_ioremap_resource_byname(pdev, "base");
+	if (IS_ERR(ddrmc_baseaddr))
+		return PTR_ERR(ddrmc_baseaddr);
+
+	ddrmc_noc_baseaddr = devm_platform_ioremap_resource_byname(pdev, "noc");
+	if (IS_ERR(ddrmc_noc_baseaddr))
+		return PTR_ERR(ddrmc_noc_baseaddr);
+
+	if (!get_ecc_state(ddrmc_baseaddr))
+		return -ENXIO;
+
+	/* Allocate ID number for the EMIF controller */
+	edac_mc_id = emif_get_id(pdev->dev.of_node);
+
+	regval = readl(ddrmc_baseaddr + XDDR_REG_CONFIG0_OFFSET);
+	num_chans = FIELD_PREP(XDDR_REG_CONFIG0_NUM_CHANS_MASK, regval);
+	num_chans++;
+
+	num_csrows = FIELD_PREP(XDDR_REG_CONFIG0_NUM_RANKS_MASK, regval);
+	num_csrows *= 2;
+	if (!num_csrows)
+		num_csrows = 1;
+
+	layers[0].type = EDAC_MC_LAYER_CHIP_SELECT;
+	layers[0].size = num_csrows;
+	layers[0].is_virt_csrow = true;
+	layers[1].type = EDAC_MC_LAYER_CHANNEL;
+	layers[1].size = num_chans;
+	layers[1].is_virt_csrow = false;
+
+	mci = edac_mc_alloc(edac_mc_id, ARRAY_SIZE(layers), layers,
+			    sizeof(struct edac_priv));
+	if (!mci) {
+		edac_printk(KERN_ERR, EDAC_MC,
+			    "Failed memory allocation for mc instance\n");
+		return -ENOMEM;
+	}
+
+	priv = mci->pvt_info;
+	priv->ddrmc_baseaddr = ddrmc_baseaddr;
+	priv->ddrmc_noc_baseaddr = ddrmc_noc_baseaddr;
+	priv->ce_cnt = 0;
+	priv->ue_cnt = 0;
+	priv->mc_id = edac_mc_id;
+
+	mc_init(mci, pdev);
+
+	rc = edac_mc_add_mc(mci);
+	if (rc) {
+		edac_printk(KERN_ERR, EDAC_MC,
+			    "Failed to register with EDAC core\n");
+		goto free_edac_mc;
+	}
+
+	rc = xlnx_register_event(PM_NOTIFY_CB, EVENT_ERROR_PMC_ERR1,
+				 XPM_EVENT_ERROR_MASK_DDRMC_CR | XPM_EVENT_ERROR_MASK_DDRMC_NCR |
+				 XPM_EVENT_ERROR_MASK_NOC_CR | XPM_EVENT_ERROR_MASK_NOC_NCR,
+				 false, err_callback, mci);
+	if (rc) {
+		if (rc == -EACCES)
+			rc = -EPROBE_DEFER;
+
+		goto del_mc;
+	}
+
+#ifdef CONFIG_EDAC_DEBUG
+	create_debugfs_attributes(mci);
+	setup_address_map(priv);
+#endif
+	enable_intr(priv);
+	return rc;
+
+del_mc:
+	edac_mc_del_mc(&pdev->dev);
+free_edac_mc:
+	edac_mc_free(mci);
+
+	return rc;
+}
+
+static int mc_remove(struct platform_device *pdev)
+{
+	struct mem_ctl_info *mci = platform_get_drvdata(pdev);
+	struct edac_priv *priv = mci->pvt_info;
+
+	disable_intr(priv);
+
+#ifdef CONFIG_EDAC_DEBUG
+	debugfs_remove_recursive(priv->debugfs);
+#endif
+
+	xlnx_unregister_event(PM_NOTIFY_CB, EVENT_ERROR_PMC_ERR1,
+			      XPM_EVENT_ERROR_MASK_DDRMC_CR |
+			      XPM_EVENT_ERROR_MASK_NOC_CR |
+			      XPM_EVENT_ERROR_MASK_NOC_NCR |
+			      XPM_EVENT_ERROR_MASK_DDRMC_NCR, err_callback, mci);
+	edac_mc_del_mc(&pdev->dev);
+	edac_mc_free(mci);
+
+	return 0;
+}
+
+static struct platform_driver xilinx_ddr_edac_mc_driver = {
+	.driver = {
+		.name = "xilinx-ddrmc-edac",
+		.of_match_table = xlnx_edac_match,
+	},
+	.probe = mc_probe,
+	.remove = mc_remove,
+};
+
+module_platform_driver(xilinx_ddr_edac_mc_driver);
+
+MODULE_AUTHOR("AMD Inc");
+MODULE_DESCRIPTION("Xilinx DDRMC ECC driver");
+MODULE_LICENSE("GPL");
diff --git a/include/linux/firmware/xlnx-zynqmp.h b/include/linux/firmware/xlnx-zynqmp.h
index e8b12ec8b060..d1ea3898564c 100644
--- a/include/linux/firmware/xlnx-zynqmp.h
+++ b/include/linux/firmware/xlnx-zynqmp.h
@@ -100,6 +100,18 @@
 #define SD_ITAPDLY	0xFF180314
 #define SD_OTAPDLYSEL	0xFF180318
 
+/**
+ * XPM_EVENT_ERROR_MASK_DDRMC_CR: Error event mask for DDRMC MC Correctable ECC Error.
+ */
+#define XPM_EVENT_ERROR_MASK_DDRMC_CR		BIT(18)
+
+/**
+ * XPM_EVENT_ERROR_MASK_DDRMC_NCR: Error event mask for DDRMC MC Non-Correctable ECC Error.
+ */
+#define XPM_EVENT_ERROR_MASK_DDRMC_NCR		BIT(19)
+#define XPM_EVENT_ERROR_MASK_NOC_NCR		BIT(13)
+#define XPM_EVENT_ERROR_MASK_NOC_CR		BIT(12)
+
 enum pm_api_cb_id {
 	PM_INIT_SUSPEND_CB = 30,
 	PM_ACKNOWLEDGE_CB = 31,
-- 
cgit v1.2.3


From 2793a8b015f7f1caadb9bce9c63dc659f7522676 Mon Sep 17 00:00:00 2001
From: Eduard Zingerman <eddyz87@gmail.com>
Date: Tue, 24 Oct 2023 03:09:13 +0300
Subject: bpf: exact states comparison for iterator convergence checks

Convergence for open coded iterators is computed in is_state_visited()
by examining states with branches count > 1 and using states_equal().
states_equal() computes sub-state relation using read and precision marks.
Read and precision marks are propagated from children states,
thus are not guaranteed to be complete inside a loop when branches
count > 1. This could be demonstrated using the following unsafe program:

     1. r7 = -16
     2. r6 = bpf_get_prandom_u32()
     3. while (bpf_iter_num_next(&fp[-8])) {
     4.   if (r6 != 42) {
     5.     r7 = -32
     6.     r6 = bpf_get_prandom_u32()
     7.     continue
     8.   }
     9.   r0 = r10
    10.   r0 += r7
    11.   r8 = *(u64 *)(r0 + 0)
    12.   r6 = bpf_get_prandom_u32()
    13. }

Here verifier would first visit path 1-3, create a checkpoint at 3
with r7=-16, continue to 4-7,3 with r7=-32.

Because instructions at 9-12 had not been visitied yet existing
checkpoint at 3 does not have read or precision mark for r7.
Thus states_equal() would return true and verifier would discard
current state, thus unsafe memory access at 11 would not be caught.

This commit fixes this loophole by introducing exact state comparisons
for iterator convergence logic:
- registers are compared using regs_exact() regardless of read or
  precision marks;
- stack slots have to have identical type.

Unfortunately, this is too strict even for simple programs like below:

    i = 0;
    while(iter_next(&it))
      i++;

At each iteration step i++ would produce a new distinct state and
eventually instruction processing limit would be reached.

To avoid such behavior speculatively forget (widen) range for
imprecise scalar registers, if those registers were not precise at the
end of the previous iteration and do not match exactly.

This a conservative heuristic that allows to verify wide range of
programs, however it precludes verification of programs that conjure
an imprecise value on the first loop iteration and use it as precise
on the second.

Test case iter_task_vma_for_each() presents one of such cases:

        unsigned int seen = 0;
        ...
        bpf_for_each(task_vma, vma, task, 0) {
                if (seen >= 1000)
                        break;
                ...
                seen++;
        }

Here clang generates the following code:

<LBB0_4>:
      24:       r8 = r6                          ; stash current value of
                ... body ...                       'seen'
      29:       r1 = r10
      30:       r1 += -0x8
      31:       call bpf_iter_task_vma_next
      32:       r6 += 0x1                        ; seen++;
      33:       if r0 == 0x0 goto +0x2 <LBB0_6>  ; exit on next() == NULL
      34:       r7 += 0x10
      35:       if r8 < 0x3e7 goto -0xc <LBB0_4> ; loop on seen < 1000

<LBB0_6>:
      ... exit ...

Note that counter in r6 is copied to r8 and then incremented,
conditional jump is done using r8. Because of this precision mark for
r6 lags one state behind of precision mark on r8 and widening logic
kicks in.

Adding barrier_var(seen) after conditional is sufficient to force
clang use the same register for both counting and conditional jump.

This issue was discussed in the thread [1] which was started by
Andrew Werner <awerner32@gmail.com> demonstrating a similar bug
in callback functions handling. The callbacks would be addressed
in a followup patch.

[1] https://lore.kernel.org/bpf/97a90da09404c65c8e810cf83c94ac703705dc0e.camel@gmail.com/

Co-developed-by: Andrii Nakryiko <andrii.nakryiko@gmail.com>
Co-developed-by: Alexei Starovoitov <alexei.starovoitov@gmail.com>
Signed-off-by: Eduard Zingerman <eddyz87@gmail.com>
Link: https://lore.kernel.org/r/20231024000917.12153-4-eddyz87@gmail.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/bpf_verifier.h                       |   1 +
 kernel/bpf/verifier.c                              | 218 ++++++++++++++++++---
 tools/testing/selftests/bpf/progs/iters_task_vma.c |   1 +
 3 files changed, 189 insertions(+), 31 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h
index e67cd45a85be..38b788228594 100644
--- a/include/linux/bpf_verifier.h
+++ b/include/linux/bpf_verifier.h
@@ -384,6 +384,7 @@ struct bpf_verifier_state {
 	 */
 	struct bpf_idx_pair *jmp_history;
 	u32 jmp_history_cnt;
+	u32 dfs_depth;
 };
 
 #define bpf_get_spilled_reg(slot, frame, mask)				\
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 366029e484a0..6573e8c77329 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -1802,6 +1802,7 @@ static int copy_verifier_state(struct bpf_verifier_state *dst_state,
 	dst_state->parent = src->parent;
 	dst_state->first_insn_idx = src->first_insn_idx;
 	dst_state->last_insn_idx = src->last_insn_idx;
+	dst_state->dfs_depth = src->dfs_depth;
 	for (i = 0; i <= src->curframe; i++) {
 		dst = dst_state->frame[i];
 		if (!dst) {
@@ -7723,6 +7724,81 @@ static int process_iter_arg(struct bpf_verifier_env *env, int regno, int insn_id
 	return 0;
 }
 
+/* Look for a previous loop entry at insn_idx: nearest parent state
+ * stopped at insn_idx with callsites matching those in cur->frame.
+ */
+static struct bpf_verifier_state *find_prev_entry(struct bpf_verifier_env *env,
+						  struct bpf_verifier_state *cur,
+						  int insn_idx)
+{
+	struct bpf_verifier_state_list *sl;
+	struct bpf_verifier_state *st;
+
+	/* Explored states are pushed in stack order, most recent states come first */
+	sl = *explored_state(env, insn_idx);
+	for (; sl; sl = sl->next) {
+		/* If st->branches != 0 state is a part of current DFS verification path,
+		 * hence cur & st for a loop.
+		 */
+		st = &sl->state;
+		if (st->insn_idx == insn_idx && st->branches && same_callsites(st, cur) &&
+		    st->dfs_depth < cur->dfs_depth)
+			return st;
+	}
+
+	return NULL;
+}
+
+static void reset_idmap_scratch(struct bpf_verifier_env *env);
+static bool regs_exact(const struct bpf_reg_state *rold,
+		       const struct bpf_reg_state *rcur,
+		       struct bpf_idmap *idmap);
+
+static void maybe_widen_reg(struct bpf_verifier_env *env,
+			    struct bpf_reg_state *rold, struct bpf_reg_state *rcur,
+			    struct bpf_idmap *idmap)
+{
+	if (rold->type != SCALAR_VALUE)
+		return;
+	if (rold->type != rcur->type)
+		return;
+	if (rold->precise || rcur->precise || regs_exact(rold, rcur, idmap))
+		return;
+	__mark_reg_unknown(env, rcur);
+}
+
+static int widen_imprecise_scalars(struct bpf_verifier_env *env,
+				   struct bpf_verifier_state *old,
+				   struct bpf_verifier_state *cur)
+{
+	struct bpf_func_state *fold, *fcur;
+	int i, fr;
+
+	reset_idmap_scratch(env);
+	for (fr = old->curframe; fr >= 0; fr--) {
+		fold = old->frame[fr];
+		fcur = cur->frame[fr];
+
+		for (i = 0; i < MAX_BPF_REG; i++)
+			maybe_widen_reg(env,
+					&fold->regs[i],
+					&fcur->regs[i],
+					&env->idmap_scratch);
+
+		for (i = 0; i < fold->allocated_stack / BPF_REG_SIZE; i++) {
+			if (!is_spilled_reg(&fold->stack[i]) ||
+			    !is_spilled_reg(&fcur->stack[i]))
+				continue;
+
+			maybe_widen_reg(env,
+					&fold->stack[i].spilled_ptr,
+					&fcur->stack[i].spilled_ptr,
+					&env->idmap_scratch);
+		}
+	}
+	return 0;
+}
+
 /* process_iter_next_call() is called when verifier gets to iterator's next
  * "method" (e.g., bpf_iter_num_next() for numbers iterator) call. We'll refer
  * to it as just "iter_next()" in comments below.
@@ -7764,25 +7840,47 @@ static int process_iter_arg(struct bpf_verifier_env *env, int regno, int insn_id
  * is some statically known limit on number of iterations (e.g., if there is
  * an explicit `if n > 100 then break;` statement somewhere in the loop).
  *
- * One very subtle but very important aspect is that we *always* simulate NULL
- * condition first (as the current state) before we simulate non-NULL case.
- * This has to do with intricacies of scalar precision tracking. By simulating
- * "exit condition" of iter_next() returning NULL first, we make sure all the
- * relevant precision marks *that will be set **after** we exit iterator loop*
- * are propagated backwards to common parent state of NULL and non-NULL
- * branches. Thanks to that, state equivalence checks done later in forked
- * state, when reaching iter_next() for ACTIVE iterator, can assume that
- * precision marks are finalized and won't change. Because simulating another
- * ACTIVE iterator iteration won't change them (because given same input
- * states we'll end up with exactly same output states which we are currently
- * comparing; and verification after the loop already propagated back what
- * needs to be **additionally** tracked as precise). It's subtle, grok
- * precision tracking for more intuitive understanding.
+ * Iteration convergence logic in is_state_visited() relies on exact
+ * states comparison, which ignores read and precision marks.
+ * This is necessary because read and precision marks are not finalized
+ * while in the loop. Exact comparison might preclude convergence for
+ * simple programs like below:
+ *
+ *     i = 0;
+ *     while(iter_next(&it))
+ *       i++;
+ *
+ * At each iteration step i++ would produce a new distinct state and
+ * eventually instruction processing limit would be reached.
+ *
+ * To avoid such behavior speculatively forget (widen) range for
+ * imprecise scalar registers, if those registers were not precise at the
+ * end of the previous iteration and do not match exactly.
+ *
+ * This is a conservative heuristic that allows to verify wide range of programs,
+ * however it precludes verification of programs that conjure an
+ * imprecise value on the first loop iteration and use it as precise on a second.
+ * For example, the following safe program would fail to verify:
+ *
+ *     struct bpf_num_iter it;
+ *     int arr[10];
+ *     int i = 0, a = 0;
+ *     bpf_iter_num_new(&it, 0, 10);
+ *     while (bpf_iter_num_next(&it)) {
+ *       if (a == 0) {
+ *         a = 1;
+ *         i = 7; // Because i changed verifier would forget
+ *                // it's range on second loop entry.
+ *       } else {
+ *         arr[i] = 42; // This would fail to verify.
+ *       }
+ *     }
+ *     bpf_iter_num_destroy(&it);
  */
 static int process_iter_next_call(struct bpf_verifier_env *env, int insn_idx,
 				  struct bpf_kfunc_call_arg_meta *meta)
 {
-	struct bpf_verifier_state *cur_st = env->cur_state, *queued_st;
+	struct bpf_verifier_state *cur_st = env->cur_state, *queued_st, *prev_st;
 	struct bpf_func_state *cur_fr = cur_st->frame[cur_st->curframe], *queued_fr;
 	struct bpf_reg_state *cur_iter, *queued_iter;
 	int iter_frameno = meta->iter.frameno;
@@ -7800,6 +7898,19 @@ static int process_iter_next_call(struct bpf_verifier_env *env, int insn_idx,
 	}
 
 	if (cur_iter->iter.state == BPF_ITER_STATE_ACTIVE) {
+		/* Because iter_next() call is a checkpoint is_state_visitied()
+		 * should guarantee parent state with same call sites and insn_idx.
+		 */
+		if (!cur_st->parent || cur_st->parent->insn_idx != insn_idx ||
+		    !same_callsites(cur_st->parent, cur_st)) {
+			verbose(env, "bug: bad parent state for iter next call");
+			return -EFAULT;
+		}
+		/* Note cur_st->parent in the call below, it is necessary to skip
+		 * checkpoint created for cur_st by is_state_visited()
+		 * right at this instruction.
+		 */
+		prev_st = find_prev_entry(env, cur_st->parent, insn_idx);
 		/* branch out active iter state */
 		queued_st = push_stack(env, insn_idx + 1, insn_idx, false);
 		if (!queued_st)
@@ -7808,6 +7919,8 @@ static int process_iter_next_call(struct bpf_verifier_env *env, int insn_idx,
 		queued_iter = &queued_st->frame[iter_frameno]->stack[iter_spi].spilled_ptr;
 		queued_iter->iter.state = BPF_ITER_STATE_ACTIVE;
 		queued_iter->iter.depth++;
+		if (prev_st)
+			widen_imprecise_scalars(env, prev_st, queued_st);
 
 		queued_fr = queued_st->frame[queued_st->curframe];
 		mark_ptr_not_null_reg(&queued_fr->regs[BPF_REG_0]);
@@ -15948,8 +16061,11 @@ static bool regs_exact(const struct bpf_reg_state *rold,
 
 /* Returns true if (rold safe implies rcur safe) */
 static bool regsafe(struct bpf_verifier_env *env, struct bpf_reg_state *rold,
-		    struct bpf_reg_state *rcur, struct bpf_idmap *idmap)
+		    struct bpf_reg_state *rcur, struct bpf_idmap *idmap, bool exact)
 {
+	if (exact)
+		return regs_exact(rold, rcur, idmap);
+
 	if (!(rold->live & REG_LIVE_READ))
 		/* explored state didn't use this */
 		return true;
@@ -16066,7 +16182,7 @@ static bool regsafe(struct bpf_verifier_env *env, struct bpf_reg_state *rold,
 }
 
 static bool stacksafe(struct bpf_verifier_env *env, struct bpf_func_state *old,
-		      struct bpf_func_state *cur, struct bpf_idmap *idmap)
+		      struct bpf_func_state *cur, struct bpf_idmap *idmap, bool exact)
 {
 	int i, spi;
 
@@ -16079,7 +16195,12 @@ static bool stacksafe(struct bpf_verifier_env *env, struct bpf_func_state *old,
 
 		spi = i / BPF_REG_SIZE;
 
-		if (!(old->stack[spi].spilled_ptr.live & REG_LIVE_READ)) {
+		if (exact &&
+		    old->stack[spi].slot_type[i % BPF_REG_SIZE] !=
+		    cur->stack[spi].slot_type[i % BPF_REG_SIZE])
+			return false;
+
+		if (!(old->stack[spi].spilled_ptr.live & REG_LIVE_READ) && !exact) {
 			i += BPF_REG_SIZE - 1;
 			/* explored state didn't use this */
 			continue;
@@ -16129,7 +16250,7 @@ static bool stacksafe(struct bpf_verifier_env *env, struct bpf_func_state *old,
 			 * return false to continue verification of this path
 			 */
 			if (!regsafe(env, &old->stack[spi].spilled_ptr,
-				     &cur->stack[spi].spilled_ptr, idmap))
+				     &cur->stack[spi].spilled_ptr, idmap, exact))
 				return false;
 			break;
 		case STACK_DYNPTR:
@@ -16211,16 +16332,16 @@ static bool refsafe(struct bpf_func_state *old, struct bpf_func_state *cur,
  * the current state will reach 'bpf_exit' instruction safely
  */
 static bool func_states_equal(struct bpf_verifier_env *env, struct bpf_func_state *old,
-			      struct bpf_func_state *cur)
+			      struct bpf_func_state *cur, bool exact)
 {
 	int i;
 
 	for (i = 0; i < MAX_BPF_REG; i++)
 		if (!regsafe(env, &old->regs[i], &cur->regs[i],
-			     &env->idmap_scratch))
+			     &env->idmap_scratch, exact))
 			return false;
 
-	if (!stacksafe(env, old, cur, &env->idmap_scratch))
+	if (!stacksafe(env, old, cur, &env->idmap_scratch, exact))
 		return false;
 
 	if (!refsafe(old, cur, &env->idmap_scratch))
@@ -16229,17 +16350,23 @@ static bool func_states_equal(struct bpf_verifier_env *env, struct bpf_func_stat
 	return true;
 }
 
+static void reset_idmap_scratch(struct bpf_verifier_env *env)
+{
+	env->idmap_scratch.tmp_id_gen = env->id_gen;
+	memset(&env->idmap_scratch.map, 0, sizeof(env->idmap_scratch.map));
+}
+
 static bool states_equal(struct bpf_verifier_env *env,
 			 struct bpf_verifier_state *old,
-			 struct bpf_verifier_state *cur)
+			 struct bpf_verifier_state *cur,
+			 bool exact)
 {
 	int i;
 
 	if (old->curframe != cur->curframe)
 		return false;
 
-	env->idmap_scratch.tmp_id_gen = env->id_gen;
-	memset(&env->idmap_scratch.map, 0, sizeof(env->idmap_scratch.map));
+	reset_idmap_scratch(env);
 
 	/* Verification state from speculative execution simulation
 	 * must never prune a non-speculative execution one.
@@ -16269,7 +16396,7 @@ static bool states_equal(struct bpf_verifier_env *env,
 	for (i = 0; i <= old->curframe; i++) {
 		if (old->frame[i]->callsite != cur->frame[i]->callsite)
 			return false;
-		if (!func_states_equal(env, old->frame[i], cur->frame[i]))
+		if (!func_states_equal(env, old->frame[i], cur->frame[i], exact))
 			return false;
 	}
 	return true;
@@ -16524,7 +16651,7 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx)
 	struct bpf_verifier_state_list *new_sl;
 	struct bpf_verifier_state_list *sl, **pprev;
 	struct bpf_verifier_state *cur = env->cur_state, *new;
-	int i, j, err, states_cnt = 0;
+	int i, j, n, err, states_cnt = 0;
 	bool force_new_state = env->test_state_freq || is_force_checkpoint(env, insn_idx);
 	bool add_new_state = force_new_state;
 
@@ -16579,9 +16706,33 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx)
 			 * It's safe to assume that iterator loop will finish, taking into
 			 * account iter_next() contract of eventually returning
 			 * sticky NULL result.
+			 *
+			 * Note, that states have to be compared exactly in this case because
+			 * read and precision marks might not be finalized inside the loop.
+			 * E.g. as in the program below:
+			 *
+			 *     1. r7 = -16
+			 *     2. r6 = bpf_get_prandom_u32()
+			 *     3. while (bpf_iter_num_next(&fp[-8])) {
+			 *     4.   if (r6 != 42) {
+			 *     5.     r7 = -32
+			 *     6.     r6 = bpf_get_prandom_u32()
+			 *     7.     continue
+			 *     8.   }
+			 *     9.   r0 = r10
+			 *    10.   r0 += r7
+			 *    11.   r8 = *(u64 *)(r0 + 0)
+			 *    12.   r6 = bpf_get_prandom_u32()
+			 *    13. }
+			 *
+			 * Here verifier would first visit path 1-3, create a checkpoint at 3
+			 * with r7=-16, continue to 4-7,3. Existing checkpoint at 3 does
+			 * not have read or precision mark for r7 yet, thus inexact states
+			 * comparison would discard current state with r7=-32
+			 * => unsafe memory access at 11 would not be caught.
 			 */
 			if (is_iter_next_insn(env, insn_idx)) {
-				if (states_equal(env, &sl->state, cur)) {
+				if (states_equal(env, &sl->state, cur, true)) {
 					struct bpf_func_state *cur_frame;
 					struct bpf_reg_state *iter_state, *iter_reg;
 					int spi;
@@ -16604,7 +16755,7 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx)
 			}
 			/* attempt to detect infinite loop to avoid unnecessary doomed work */
 			if (states_maybe_looping(&sl->state, cur) &&
-			    states_equal(env, &sl->state, cur) &&
+			    states_equal(env, &sl->state, cur, false) &&
 			    !iter_active_depths_differ(&sl->state, cur)) {
 				verbose_linfo(env, insn_idx, "; ");
 				verbose(env, "infinite loop detected at insn %d\n", insn_idx);
@@ -16629,7 +16780,7 @@ skip_inf_loop_check:
 				add_new_state = false;
 			goto miss;
 		}
-		if (states_equal(env, &sl->state, cur)) {
+		if (states_equal(env, &sl->state, cur, false)) {
 hit:
 			sl->hit_cnt++;
 			/* reached equivalent register/stack state,
@@ -16668,8 +16819,12 @@ miss:
 		 * to keep checking from state equivalence point of view.
 		 * Higher numbers increase max_states_per_insn and verification time,
 		 * but do not meaningfully decrease insn_processed.
+		 * 'n' controls how many times state could miss before eviction.
+		 * Use bigger 'n' for checkpoints because evicting checkpoint states
+		 * too early would hinder iterator convergence.
 		 */
-		if (sl->miss_cnt > sl->hit_cnt * 3 + 3) {
+		n = is_force_checkpoint(env, insn_idx) && sl->state.branches > 0 ? 64 : 3;
+		if (sl->miss_cnt > sl->hit_cnt * n + n) {
 			/* the state is unlikely to be useful. Remove it to
 			 * speed up verification
 			 */
@@ -16743,6 +16898,7 @@ next:
 
 	cur->parent = new;
 	cur->first_insn_idx = insn_idx;
+	cur->dfs_depth = new->dfs_depth + 1;
 	clear_jmp_history(cur);
 	new_sl->next = *explored_state(env, insn_idx);
 	*explored_state(env, insn_idx) = new_sl;
diff --git a/tools/testing/selftests/bpf/progs/iters_task_vma.c b/tools/testing/selftests/bpf/progs/iters_task_vma.c
index 44edecfdfaee..e085a51d153e 100644
--- a/tools/testing/selftests/bpf/progs/iters_task_vma.c
+++ b/tools/testing/selftests/bpf/progs/iters_task_vma.c
@@ -30,6 +30,7 @@ int iter_task_vma_for_each(const void *ctx)
 	bpf_for_each(task_vma, vma, task, 0) {
 		if (seen >= 1000)
 			break;
+		barrier_var(seen);
 
 		vm_ranges[seen].vm_start = vma->vm_start;
 		vm_ranges[seen].vm_end = vma->vm_end;
-- 
cgit v1.2.3


From 2a0992829ea3864939d917a5c7b48be6629c6217 Mon Sep 17 00:00:00 2001
From: Eduard Zingerman <eddyz87@gmail.com>
Date: Tue, 24 Oct 2023 03:09:15 +0300
Subject: bpf: correct loop detection for iterators convergence

It turns out that .branches > 0 in is_state_visited() is not a
sufficient condition to identify if two verifier states form a loop
when iterators convergence is computed. This commit adds logic to
distinguish situations like below:

 (I)            initial       (II)            initial
                  |                             |
                  V                             V
     .---------> hdr                           ..
     |            |                             |
     |            V                             V
     |    .------...                    .------..
     |    |       |                     |       |
     |    V       V                     V       V
     |   ...     ...               .-> hdr     ..
     |    |       |                |    |       |
     |    V       V                |    V       V
     |   succ <- cur               |   succ <- cur
     |    |                        |    |
     |    V                        |    V
     |   ...                       |   ...
     |    |                        |    |
     '----'                        '----'

For both (I) and (II) successor 'succ' of the current state 'cur' was
previously explored and has branches count at 0. However, loop entry
'hdr' corresponding to 'succ' might be a part of current DFS path.
If that is the case 'succ' and 'cur' are members of the same loop
and have to be compared exactly.

Co-developed-by: Andrii Nakryiko <andrii.nakryiko@gmail.com>
Co-developed-by: Alexei Starovoitov <alexei.starovoitov@gmail.com>
Reviewed-by: Andrii Nakryiko <andrii@kernel.org>
Signed-off-by: Eduard Zingerman <eddyz87@gmail.com>
Link: https://lore.kernel.org/r/20231024000917.12153-6-eddyz87@gmail.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/bpf_verifier.h |  15 ++++
 kernel/bpf/verifier.c        | 207 ++++++++++++++++++++++++++++++++++++++++++-
 2 files changed, 218 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h
index 38b788228594..24213a99cc79 100644
--- a/include/linux/bpf_verifier.h
+++ b/include/linux/bpf_verifier.h
@@ -373,10 +373,25 @@ struct bpf_verifier_state {
 	struct bpf_active_lock active_lock;
 	bool speculative;
 	bool active_rcu_lock;
+	/* If this state was ever pointed-to by other state's loop_entry field
+	 * this flag would be set to true. Used to avoid freeing such states
+	 * while they are still in use.
+	 */
+	bool used_as_loop_entry;
 
 	/* first and last insn idx of this verifier state */
 	u32 first_insn_idx;
 	u32 last_insn_idx;
+	/* If this state is a part of states loop this field points to some
+	 * parent of this state such that:
+	 * - it is also a member of the same states loop;
+	 * - DFS states traversal starting from initial state visits loop_entry
+	 *   state before this state.
+	 * Used to compute topmost loop entry for state loops.
+	 * State loops might appear because of open coded iterators logic.
+	 * See get_loop_entry() for more information.
+	 */
+	struct bpf_verifier_state *loop_entry;
 	/* jmp history recorded from first to last.
 	 * backtracking is using it to go from last to first.
 	 * For most states jmp_history_cnt is [0-3].
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 6573e8c77329..f23fbfe82c59 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -1803,6 +1803,7 @@ static int copy_verifier_state(struct bpf_verifier_state *dst_state,
 	dst_state->first_insn_idx = src->first_insn_idx;
 	dst_state->last_insn_idx = src->last_insn_idx;
 	dst_state->dfs_depth = src->dfs_depth;
+	dst_state->used_as_loop_entry = src->used_as_loop_entry;
 	for (i = 0; i <= src->curframe; i++) {
 		dst = dst_state->frame[i];
 		if (!dst) {
@@ -1845,11 +1846,176 @@ static bool same_callsites(struct bpf_verifier_state *a, struct bpf_verifier_sta
 	return true;
 }
 
+/* Open coded iterators allow back-edges in the state graph in order to
+ * check unbounded loops that iterators.
+ *
+ * In is_state_visited() it is necessary to know if explored states are
+ * part of some loops in order to decide whether non-exact states
+ * comparison could be used:
+ * - non-exact states comparison establishes sub-state relation and uses
+ *   read and precision marks to do so, these marks are propagated from
+ *   children states and thus are not guaranteed to be final in a loop;
+ * - exact states comparison just checks if current and explored states
+ *   are identical (and thus form a back-edge).
+ *
+ * Paper "A New Algorithm for Identifying Loops in Decompilation"
+ * by Tao Wei, Jian Mao, Wei Zou and Yu Chen [1] presents a convenient
+ * algorithm for loop structure detection and gives an overview of
+ * relevant terminology. It also has helpful illustrations.
+ *
+ * [1] https://api.semanticscholar.org/CorpusID:15784067
+ *
+ * We use a similar algorithm but because loop nested structure is
+ * irrelevant for verifier ours is significantly simpler and resembles
+ * strongly connected components algorithm from Sedgewick's textbook.
+ *
+ * Define topmost loop entry as a first node of the loop traversed in a
+ * depth first search starting from initial state. The goal of the loop
+ * tracking algorithm is to associate topmost loop entries with states
+ * derived from these entries.
+ *
+ * For each step in the DFS states traversal algorithm needs to identify
+ * the following situations:
+ *
+ *          initial                     initial                   initial
+ *            |                           |                         |
+ *            V                           V                         V
+ *           ...                         ...           .---------> hdr
+ *            |                           |            |            |
+ *            V                           V            |            V
+ *           cur                     .-> succ          |    .------...
+ *            |                      |    |            |    |       |
+ *            V                      |    V            |    V       V
+ *           succ                    '-- cur           |   ...     ...
+ *                                                     |    |       |
+ *                                                     |    V       V
+ *                                                     |   succ <- cur
+ *                                                     |    |
+ *                                                     |    V
+ *                                                     |   ...
+ *                                                     |    |
+ *                                                     '----'
+ *
+ *  (A) successor state of cur   (B) successor state of cur or it's entry
+ *      not yet traversed            are in current DFS path, thus cur and succ
+ *                                   are members of the same outermost loop
+ *
+ *                      initial                  initial
+ *                        |                        |
+ *                        V                        V
+ *                       ...                      ...
+ *                        |                        |
+ *                        V                        V
+ *                .------...               .------...
+ *                |       |                |       |
+ *                V       V                V       V
+ *           .-> hdr     ...              ...     ...
+ *           |    |       |                |       |
+ *           |    V       V                V       V
+ *           |   succ <- cur              succ <- cur
+ *           |    |                        |
+ *           |    V                        V
+ *           |   ...                      ...
+ *           |    |                        |
+ *           '----'                       exit
+ *
+ * (C) successor state of cur is a part of some loop but this loop
+ *     does not include cur or successor state is not in a loop at all.
+ *
+ * Algorithm could be described as the following python code:
+ *
+ *     traversed = set()   # Set of traversed nodes
+ *     entries = {}        # Mapping from node to loop entry
+ *     depths = {}         # Depth level assigned to graph node
+ *     path = set()        # Current DFS path
+ *
+ *     # Find outermost loop entry known for n
+ *     def get_loop_entry(n):
+ *         h = entries.get(n, None)
+ *         while h in entries and entries[h] != h:
+ *             h = entries[h]
+ *         return h
+ *
+ *     # Update n's loop entry if h's outermost entry comes
+ *     # before n's outermost entry in current DFS path.
+ *     def update_loop_entry(n, h):
+ *         n1 = get_loop_entry(n) or n
+ *         h1 = get_loop_entry(h) or h
+ *         if h1 in path and depths[h1] <= depths[n1]:
+ *             entries[n] = h1
+ *
+ *     def dfs(n, depth):
+ *         traversed.add(n)
+ *         path.add(n)
+ *         depths[n] = depth
+ *         for succ in G.successors(n):
+ *             if succ not in traversed:
+ *                 # Case A: explore succ and update cur's loop entry
+ *                 #         only if succ's entry is in current DFS path.
+ *                 dfs(succ, depth + 1)
+ *                 h = get_loop_entry(succ)
+ *                 update_loop_entry(n, h)
+ *             else:
+ *                 # Case B or C depending on `h1 in path` check in update_loop_entry().
+ *                 update_loop_entry(n, succ)
+ *         path.remove(n)
+ *
+ * To adapt this algorithm for use with verifier:
+ * - use st->branch == 0 as a signal that DFS of succ had been finished
+ *   and cur's loop entry has to be updated (case A), handle this in
+ *   update_branch_counts();
+ * - use st->branch > 0 as a signal that st is in the current DFS path;
+ * - handle cases B and C in is_state_visited();
+ * - update topmost loop entry for intermediate states in get_loop_entry().
+ */
+static struct bpf_verifier_state *get_loop_entry(struct bpf_verifier_state *st)
+{
+	struct bpf_verifier_state *topmost = st->loop_entry, *old;
+
+	while (topmost && topmost->loop_entry && topmost != topmost->loop_entry)
+		topmost = topmost->loop_entry;
+	/* Update loop entries for intermediate states to avoid this
+	 * traversal in future get_loop_entry() calls.
+	 */
+	while (st && st->loop_entry != topmost) {
+		old = st->loop_entry;
+		st->loop_entry = topmost;
+		st = old;
+	}
+	return topmost;
+}
+
+static void update_loop_entry(struct bpf_verifier_state *cur, struct bpf_verifier_state *hdr)
+{
+	struct bpf_verifier_state *cur1, *hdr1;
+
+	cur1 = get_loop_entry(cur) ?: cur;
+	hdr1 = get_loop_entry(hdr) ?: hdr;
+	/* The head1->branches check decides between cases B and C in
+	 * comment for get_loop_entry(). If hdr1->branches == 0 then
+	 * head's topmost loop entry is not in current DFS path,
+	 * hence 'cur' and 'hdr' are not in the same loop and there is
+	 * no need to update cur->loop_entry.
+	 */
+	if (hdr1->branches && hdr1->dfs_depth <= cur1->dfs_depth) {
+		cur->loop_entry = hdr;
+		hdr->used_as_loop_entry = true;
+	}
+}
+
 static void update_branch_counts(struct bpf_verifier_env *env, struct bpf_verifier_state *st)
 {
 	while (st) {
 		u32 br = --st->branches;
 
+		/* br == 0 signals that DFS exploration for 'st' is finished,
+		 * thus it is necessary to update parent's loop entry if it
+		 * turned out that st is a part of some loop.
+		 * This is a part of 'case A' in get_loop_entry() comment.
+		 */
+		if (br == 0 && st->parent && st->loop_entry)
+			update_loop_entry(st->parent, st->loop_entry);
+
 		/* WARN_ON(br > 1) technically makes sense here,
 		 * but see comment in push_stack(), hence:
 		 */
@@ -16650,10 +16816,11 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx)
 {
 	struct bpf_verifier_state_list *new_sl;
 	struct bpf_verifier_state_list *sl, **pprev;
-	struct bpf_verifier_state *cur = env->cur_state, *new;
+	struct bpf_verifier_state *cur = env->cur_state, *new, *loop_entry;
 	int i, j, n, err, states_cnt = 0;
 	bool force_new_state = env->test_state_freq || is_force_checkpoint(env, insn_idx);
 	bool add_new_state = force_new_state;
+	bool force_exact;
 
 	/* bpf progs typically have pruning point every 4 instructions
 	 * http://vger.kernel.org/bpfconf2019.html#session-1
@@ -16748,8 +16915,10 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx)
 					 */
 					spi = __get_spi(iter_reg->off + iter_reg->var_off.value);
 					iter_state = &func(env, iter_reg)->stack[spi].spilled_ptr;
-					if (iter_state->iter.state == BPF_ITER_STATE_ACTIVE)
+					if (iter_state->iter.state == BPF_ITER_STATE_ACTIVE) {
+						update_loop_entry(cur, &sl->state);
 						goto hit;
+					}
 				}
 				goto skip_inf_loop_check;
 			}
@@ -16780,7 +16949,36 @@ skip_inf_loop_check:
 				add_new_state = false;
 			goto miss;
 		}
-		if (states_equal(env, &sl->state, cur, false)) {
+		/* If sl->state is a part of a loop and this loop's entry is a part of
+		 * current verification path then states have to be compared exactly.
+		 * 'force_exact' is needed to catch the following case:
+		 *
+		 *                initial     Here state 'succ' was processed first,
+		 *                  |         it was eventually tracked to produce a
+		 *                  V         state identical to 'hdr'.
+		 *     .---------> hdr        All branches from 'succ' had been explored
+		 *     |            |         and thus 'succ' has its .branches == 0.
+		 *     |            V
+		 *     |    .------...        Suppose states 'cur' and 'succ' correspond
+		 *     |    |       |         to the same instruction + callsites.
+		 *     |    V       V         In such case it is necessary to check
+		 *     |   ...     ...        if 'succ' and 'cur' are states_equal().
+		 *     |    |       |         If 'succ' and 'cur' are a part of the
+		 *     |    V       V         same loop exact flag has to be set.
+		 *     |   succ <- cur        To check if that is the case, verify
+		 *     |    |                 if loop entry of 'succ' is in current
+		 *     |    V                 DFS path.
+		 *     |   ...
+		 *     |    |
+		 *     '----'
+		 *
+		 * Additional details are in the comment before get_loop_entry().
+		 */
+		loop_entry = get_loop_entry(&sl->state);
+		force_exact = loop_entry && loop_entry->branches > 0;
+		if (states_equal(env, &sl->state, cur, force_exact)) {
+			if (force_exact)
+				update_loop_entry(cur, loop_entry);
 hit:
 			sl->hit_cnt++;
 			/* reached equivalent register/stack state,
@@ -16829,7 +17027,8 @@ miss:
 			 * speed up verification
 			 */
 			*pprev = sl->next;
-			if (sl->state.frame[0]->regs[0].live & REG_LIVE_DONE) {
+			if (sl->state.frame[0]->regs[0].live & REG_LIVE_DONE &&
+			    !sl->state.used_as_loop_entry) {
 				u32 br = sl->state.branches;
 
 				WARN_ONCE(br,
-- 
cgit v1.2.3


From b95303e0aeaf446b65169dd4142cacdaeb7d4c8b Mon Sep 17 00:00:00 2001
From: Barry Song <song.bao.hua@hisilicon.com>
Date: Thu, 19 Oct 2023 11:33:21 +0800
Subject: sched: Add cpus_share_resources API

Add cpus_share_resources() API. This is the preparation for the
optimization of select_idle_cpu() on platforms with cluster scheduler
level.

On a machine with clusters cpus_share_resources() will test whether
two cpus are within the same cluster. On a non-cluster machine it
will behaves the same as cpus_share_cache(). So we use "resources"
here for cache resources.

Signed-off-by: Barry Song <song.bao.hua@hisilicon.com>
Signed-off-by: Yicong Yang <yangyicong@hisilicon.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Gautham R. Shenoy <gautham.shenoy@amd.com>
Reviewed-by: Tim Chen <tim.c.chen@linux.intel.com>
Reviewed-by: Vincent Guittot <vincent.guittot@linaro.org>
Tested-and-reviewed-by: Chen Yu <yu.c.chen@intel.com>
Tested-by: K Prateek Nayak <kprateek.nayak@amd.com>
Link: https://lkml.kernel.org/r/20231019033323.54147-2-yangyicong@huawei.com
---
 include/linux/sched/sd_flags.h |  7 +++++++
 include/linux/sched/topology.h |  8 +++++++-
 kernel/sched/core.c            | 12 ++++++++++++
 kernel/sched/sched.h           |  1 +
 kernel/sched/topology.c        | 13 +++++++++++++
 5 files changed, 40 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/sched/sd_flags.h b/include/linux/sched/sd_flags.h
index fad77b5172e2..a8b28647aafc 100644
--- a/include/linux/sched/sd_flags.h
+++ b/include/linux/sched/sd_flags.h
@@ -109,6 +109,13 @@ SD_FLAG(SD_ASYM_CPUCAPACITY_FULL, SDF_SHARED_PARENT | SDF_NEEDS_GROUPS)
  */
 SD_FLAG(SD_SHARE_CPUCAPACITY, SDF_SHARED_CHILD | SDF_NEEDS_GROUPS)
 
+/*
+ * Domain members share CPU cluster (LLC tags or L2 cache)
+ *
+ * NEEDS_GROUPS: Clusters are shared between groups.
+ */
+SD_FLAG(SD_CLUSTER, SDF_NEEDS_GROUPS)
+
 /*
  * Domain members share CPU package resources (i.e. caches)
  *
diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h
index 67b573d5bf28..4c14fe127223 100644
--- a/include/linux/sched/topology.h
+++ b/include/linux/sched/topology.h
@@ -45,7 +45,7 @@ static inline int cpu_smt_flags(void)
 #ifdef CONFIG_SCHED_CLUSTER
 static inline int cpu_cluster_flags(void)
 {
-	return SD_SHARE_PKG_RESOURCES;
+	return SD_CLUSTER | SD_SHARE_PKG_RESOURCES;
 }
 #endif
 
@@ -179,6 +179,7 @@ cpumask_var_t *alloc_sched_domains(unsigned int ndoms);
 void free_sched_domains(cpumask_var_t doms[], unsigned int ndoms);
 
 bool cpus_share_cache(int this_cpu, int that_cpu);
+bool cpus_share_resources(int this_cpu, int that_cpu);
 
 typedef const struct cpumask *(*sched_domain_mask_f)(int cpu);
 typedef int (*sched_domain_flags_f)(void);
@@ -232,6 +233,11 @@ static inline bool cpus_share_cache(int this_cpu, int that_cpu)
 	return true;
 }
 
+static inline bool cpus_share_resources(int this_cpu, int that_cpu)
+{
+	return true;
+}
+
 #endif	/* !CONFIG_SMP */
 
 #if defined(CONFIG_ENERGY_MODEL) && defined(CONFIG_CPU_FREQ_GOV_SCHEDUTIL)
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index dc724f59e495..5e1fb8a63b2e 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -3939,6 +3939,18 @@ bool cpus_share_cache(int this_cpu, int that_cpu)
 	return per_cpu(sd_llc_id, this_cpu) == per_cpu(sd_llc_id, that_cpu);
 }
 
+/*
+ * Whether CPUs are share cache resources, which means LLC on non-cluster
+ * machines and LLC tag or L2 on machines with clusters.
+ */
+bool cpus_share_resources(int this_cpu, int that_cpu)
+{
+	if (this_cpu == that_cpu)
+		return true;
+
+	return per_cpu(sd_share_id, this_cpu) == per_cpu(sd_share_id, that_cpu);
+}
+
 static inline bool ttwu_queue_cond(struct task_struct *p, int cpu)
 {
 	/*
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 65cad0e5729e..998f03d02de0 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1853,6 +1853,7 @@ static inline struct sched_domain *lowest_flag_domain(int cpu, int flag)
 DECLARE_PER_CPU(struct sched_domain __rcu *, sd_llc);
 DECLARE_PER_CPU(int, sd_llc_size);
 DECLARE_PER_CPU(int, sd_llc_id);
+DECLARE_PER_CPU(int, sd_share_id);
 DECLARE_PER_CPU(struct sched_domain_shared __rcu *, sd_llc_shared);
 DECLARE_PER_CPU(struct sched_domain __rcu *, sd_numa);
 DECLARE_PER_CPU(struct sched_domain __rcu *, sd_asym_packing);
diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
index a63729f87c21..dbb8c328e8ad 100644
--- a/kernel/sched/topology.c
+++ b/kernel/sched/topology.c
@@ -668,6 +668,7 @@ static void destroy_sched_domains(struct sched_domain *sd)
 DEFINE_PER_CPU(struct sched_domain __rcu *, sd_llc);
 DEFINE_PER_CPU(int, sd_llc_size);
 DEFINE_PER_CPU(int, sd_llc_id);
+DEFINE_PER_CPU(int, sd_share_id);
 DEFINE_PER_CPU(struct sched_domain_shared __rcu *, sd_llc_shared);
 DEFINE_PER_CPU(struct sched_domain __rcu *, sd_numa);
 DEFINE_PER_CPU(struct sched_domain __rcu *, sd_asym_packing);
@@ -693,6 +694,17 @@ static void update_top_cache_domain(int cpu)
 	per_cpu(sd_llc_id, cpu) = id;
 	rcu_assign_pointer(per_cpu(sd_llc_shared, cpu), sds);
 
+	sd = lowest_flag_domain(cpu, SD_CLUSTER);
+	if (sd)
+		id = cpumask_first(sched_domain_span(sd));
+
+	/*
+	 * This assignment should be placed after the sd_llc_id as
+	 * we want this id equals to cluster id on cluster machines
+	 * but equals to LLC id on non-Cluster machines.
+	 */
+	per_cpu(sd_share_id, cpu) = id;
+
 	sd = lowest_flag_domain(cpu, SD_NUMA);
 	rcu_assign_pointer(per_cpu(sd_numa, cpu), sd);
 
@@ -1550,6 +1562,7 @@ static struct cpumask		***sched_domains_numa_masks;
  */
 #define TOPOLOGY_SD_FLAGS		\
 	(SD_SHARE_CPUCAPACITY	|	\
+	 SD_CLUSTER		|	\
 	 SD_SHARE_PKG_RESOURCES |	\
 	 SD_NUMA		|	\
 	 SD_ASYM_PACKING)
-- 
cgit v1.2.3


From 984ffb6a4366752c949f7b39640aecdce222607f Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Fri, 20 Oct 2023 12:35:33 +0200
Subject: sched/fair: Remove SIS_PROP

SIS_UTIL seems to work well, lets remove the old thing.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Acked-by: Vincent Guittot <vincent.guittot@linaro.org>
Link: https://lkml.kernel.org/r/20231020134337.GD33965@noisy.programming.kicks-ass.net
---
 include/linux/sched/topology.h |  2 --
 kernel/sched/core.c            |  5 -----
 kernel/sched/fair.c            | 48 ------------------------------------------
 kernel/sched/features.h        |  1 -
 kernel/sched/sched.h           |  3 ---
 5 files changed, 59 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h
index 4c14fe127223..de545ba85218 100644
--- a/include/linux/sched/topology.h
+++ b/include/linux/sched/topology.h
@@ -109,8 +109,6 @@ struct sched_domain {
 	u64 max_newidle_lb_cost;
 	unsigned long last_decay_max_lb_cost;
 
-	u64 avg_scan_cost;		/* select_idle_sibling */
-
 #ifdef CONFIG_SCHEDSTATS
 	/* load_balance() stats */
 	unsigned int lb_count[CPU_MAX_IDLE_TYPES];
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 5e1fb8a63b2e..7a0c16115b79 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -3792,9 +3792,6 @@ ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags,
 		if (rq->avg_idle > max)
 			rq->avg_idle = max;
 
-		rq->wake_stamp = jiffies;
-		rq->wake_avg_idle = rq->avg_idle / 2;
-
 		rq->idle_stamp = 0;
 	}
 #endif
@@ -9953,8 +9950,6 @@ void __init sched_init(void)
 		rq->online = 0;
 		rq->idle_stamp = 0;
 		rq->avg_idle = 2*sysctl_sched_migration_cost;
-		rq->wake_stamp = jiffies;
-		rq->wake_avg_idle = rq->avg_idle;
 		rq->max_idle_balance_cost = sysctl_sched_migration_cost;
 
 		INIT_LIST_HEAD(&rq->cfs_tasks);
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 523b5aee2d6a..8767988242ee 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -7209,45 +7209,9 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, bool
 	struct cpumask *cpus = this_cpu_cpumask_var_ptr(select_rq_mask);
 	int i, cpu, idle_cpu = -1, nr = INT_MAX;
 	struct sched_domain_shared *sd_share;
-	struct rq *this_rq = this_rq();
-	int this = smp_processor_id();
-	struct sched_domain *this_sd = NULL;
-	u64 time = 0;
 
 	cpumask_and(cpus, sched_domain_span(sd), p->cpus_ptr);
 
-	if (sched_feat(SIS_PROP) && !has_idle_core) {
-		u64 avg_cost, avg_idle, span_avg;
-		unsigned long now = jiffies;
-
-		this_sd = rcu_dereference(*this_cpu_ptr(&sd_llc));
-		if (!this_sd)
-			return -1;
-
-		/*
-		 * If we're busy, the assumption that the last idle period
-		 * predicts the future is flawed; age away the remaining
-		 * predicted idle time.
-		 */
-		if (unlikely(this_rq->wake_stamp < now)) {
-			while (this_rq->wake_stamp < now && this_rq->wake_avg_idle) {
-				this_rq->wake_stamp++;
-				this_rq->wake_avg_idle >>= 1;
-			}
-		}
-
-		avg_idle = this_rq->wake_avg_idle;
-		avg_cost = this_sd->avg_scan_cost + 1;
-
-		span_avg = sd->span_weight * avg_idle;
-		if (span_avg > 4*avg_cost)
-			nr = div_u64(span_avg, avg_cost);
-		else
-			nr = 4;
-
-		time = cpu_clock(this);
-	}
-
 	if (sched_feat(SIS_UTIL)) {
 		sd_share = rcu_dereference(per_cpu(sd_llc_shared, target));
 		if (sd_share) {
@@ -7301,18 +7265,6 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, bool
 	if (has_idle_core)
 		set_idle_cores(target, false);
 
-	if (sched_feat(SIS_PROP) && this_sd && !has_idle_core) {
-		time = cpu_clock(this) - time;
-
-		/*
-		 * Account for the scan cost of wakeups against the average
-		 * idle time.
-		 */
-		this_rq->wake_avg_idle -= min(this_rq->wake_avg_idle, time);
-
-		update_avg(&this_sd->avg_scan_cost, time);
-	}
-
 	return idle_cpu;
 }
 
diff --git a/kernel/sched/features.h b/kernel/sched/features.h
index f770168230ae..a3ddf84de430 100644
--- a/kernel/sched/features.h
+++ b/kernel/sched/features.h
@@ -49,7 +49,6 @@ SCHED_FEAT(TTWU_QUEUE, true)
 /*
  * When doing wakeups, attempt to limit superfluous scans of the LLC domain.
  */
-SCHED_FEAT(SIS_PROP, false)
 SCHED_FEAT(SIS_UTIL, true)
 
 /*
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index ef4fe7bcf740..2e5a95486a42 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1059,9 +1059,6 @@ struct rq {
 	u64			idle_stamp;
 	u64			avg_idle;
 
-	unsigned long		wake_stamp;
-	u64			wake_avg_idle;
-
 	/* This is used to determine avg_idle's max value */
 	u64			max_idle_balance_cost;
 
-- 
cgit v1.2.3


From c54e52f84d7aa590e90e1f73f462517ac40051e1 Mon Sep 17 00:00:00 2001
From: James Morse <james.morse@arm.com>
Date: Mon, 23 Oct 2023 14:35:03 +0100
Subject: arm64, irqchip/gic-v3, ACPI: Move MADT GICC enabled check into a
 helper

ACPI, irqchip and the architecture code all inspect the MADT
enabled bit for a GICC entry in the MADT.

The addition of an 'online capable' bit means all these sites need
updating.

Move the current checks behind a helper to make future updates easier.

Signed-off-by: James Morse <james.morse@arm.com>
Reviewed-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
Reviewed-by: Gavin Shan <gshan@redhat.com>
Signed-off-by: "Russell King (Oracle)" <rmk+kernel@armlinux.org.uk>
Acked-by: "Rafael J. Wysocki" <rafael.j.wysocki@intel.com>
Reviewed-by: Sudeep Holla <sudeep.holla@arm.com>
Link: https://lore.kernel.org/r/E1quv5D-00AeNJ-U8@rmk-PC.armlinux.org.uk
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
---
 arch/arm64/kernel/smp.c       |  2 +-
 drivers/acpi/processor_core.c |  2 +-
 drivers/irqchip/irq-gic-v3.c  | 10 ++++------
 include/linux/acpi.h          |  5 +++++
 4 files changed, 11 insertions(+), 8 deletions(-)

(limited to 'include/linux')

diff --git a/arch/arm64/kernel/smp.c b/arch/arm64/kernel/smp.c
index 960b98b43506..8c8f55721786 100644
--- a/arch/arm64/kernel/smp.c
+++ b/arch/arm64/kernel/smp.c
@@ -520,7 +520,7 @@ acpi_map_gic_cpu_interface(struct acpi_madt_generic_interrupt *processor)
 {
 	u64 hwid = processor->arm_mpidr;
 
-	if (!(processor->flags & ACPI_MADT_ENABLED)) {
+	if (!acpi_gicc_is_usable(processor)) {
 		pr_debug("skipping disabled CPU entry with 0x%llx MPIDR\n", hwid);
 		return;
 	}
diff --git a/drivers/acpi/processor_core.c b/drivers/acpi/processor_core.c
index 7dd6dbaa98c3..b203cfe28550 100644
--- a/drivers/acpi/processor_core.c
+++ b/drivers/acpi/processor_core.c
@@ -90,7 +90,7 @@ static int map_gicc_mpidr(struct acpi_subtable_header *entry,
 	struct acpi_madt_generic_interrupt *gicc =
 	    container_of(entry, struct acpi_madt_generic_interrupt, header);
 
-	if (!(gicc->flags & ACPI_MADT_ENABLED))
+	if (!acpi_gicc_is_usable(gicc))
 		return -ENODEV;
 
 	/* device_declaration means Device object in DSDT, in the
diff --git a/drivers/irqchip/irq-gic-v3.c b/drivers/irqchip/irq-gic-v3.c
index eedfa8e9f077..72d3cdebdad1 100644
--- a/drivers/irqchip/irq-gic-v3.c
+++ b/drivers/irqchip/irq-gic-v3.c
@@ -2367,8 +2367,7 @@ gic_acpi_parse_madt_gicc(union acpi_subtable_headers *header,
 	u32 size = reg == GIC_PIDR2_ARCH_GICv4 ? SZ_64K * 4 : SZ_64K * 2;
 	void __iomem *redist_base;
 
-	/* GICC entry which has !ACPI_MADT_ENABLED is not unusable so skip */
-	if (!(gicc->flags & ACPI_MADT_ENABLED))
+	if (!acpi_gicc_is_usable(gicc))
 		return 0;
 
 	redist_base = ioremap(gicc->gicr_base_address, size);
@@ -2418,7 +2417,7 @@ static int __init gic_acpi_match_gicc(union acpi_subtable_headers *header,
 	 * If GICC is enabled and has valid gicr base address, then it means
 	 * GICR base is presented via GICC
 	 */
-	if ((gicc->flags & ACPI_MADT_ENABLED) && gicc->gicr_base_address) {
+	if (acpi_gicc_is_usable(gicc) && gicc->gicr_base_address) {
 		acpi_data.enabled_rdists++;
 		return 0;
 	}
@@ -2427,7 +2426,7 @@ static int __init gic_acpi_match_gicc(union acpi_subtable_headers *header,
 	 * It's perfectly valid firmware can pass disabled GICC entry, driver
 	 * should not treat as errors, skip the entry instead of probe fail.
 	 */
-	if (!(gicc->flags & ACPI_MADT_ENABLED))
+	if (!acpi_gicc_is_usable(gicc))
 		return 0;
 
 	return -ENODEV;
@@ -2486,8 +2485,7 @@ static int __init gic_acpi_parse_virt_madt_gicc(union acpi_subtable_headers *hea
 	int maint_irq_mode;
 	static int first_madt = true;
 
-	/* Skip unusable CPUs */
-	if (!(gicc->flags & ACPI_MADT_ENABLED))
+	if (!acpi_gicc_is_usable(gicc))
 		return 0;
 
 	maint_irq_mode = (gicc->flags & ACPI_MADT_VGIC_IRQ_MODE) ?
diff --git a/include/linux/acpi.h b/include/linux/acpi.h
index a73246c3c35e..fae2aa0b5e8b 100644
--- a/include/linux/acpi.h
+++ b/include/linux/acpi.h
@@ -256,6 +256,11 @@ acpi_table_parse_cedt(enum acpi_cedt_type id,
 int acpi_parse_mcfg (struct acpi_table_header *header);
 void acpi_table_print_madt_entry (struct acpi_subtable_header *madt);
 
+static inline bool acpi_gicc_is_usable(struct acpi_madt_generic_interrupt *gicc)
+{
+	return gicc->flags & ACPI_MADT_ENABLED;
+}
+
 /* the following numa functions are architecture-dependent */
 void acpi_numa_slit_init (struct acpi_table_slit *slit);
 
-- 
cgit v1.2.3


From 8c9c727b6142325ed5697240fceb99cbeb4ac2ec Mon Sep 17 00:00:00 2001
From: Joao Martins <joao.m.martins@oracle.com>
Date: Tue, 24 Oct 2023 14:50:53 +0100
Subject: vfio: Move iova_bitmap into iommufd

Both VFIO and IOMMUFD will need iova bitmap for storing dirties and walking
the user bitmaps, so move to the common dependency into IOMMUFD.  In doing
so, create the symbol IOMMUFD_DRIVER which designates the builtin code that
will be used by drivers when selected. Today this means MLX5_VFIO_PCI and
PDS_VFIO_PCI. IOMMU drivers will do the same (in future patches) when
supporting dirty tracking and select IOMMUFD_DRIVER accordingly.

Given that the symbol maybe be disabled, add header definitions in
iova_bitmap.h for when IOMMUFD_DRIVER=n

Link: https://lore.kernel.org/r/20231024135109.73787-3-joao.m.martins@oracle.com
Signed-off-by: Joao Martins <joao.m.martins@oracle.com>
Reviewed-by: Jason Gunthorpe <jgg@nvidia.com>
Reviewed-by: Brett Creeley <brett.creeley@amd.com>
Reviewed-by: Kevin Tian <kevin.tian@intel.com>
Reviewed-by: Alex Williamson <alex.williamson@redhat.com>
Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
---
 drivers/iommu/Kconfig               |   4 +
 drivers/iommu/iommufd/Makefile      |   1 +
 drivers/iommu/iommufd/iova_bitmap.c | 426 ++++++++++++++++++++++++++++++++++++
 drivers/vfio/Makefile               |   3 +-
 drivers/vfio/iova_bitmap.c          | 426 ------------------------------------
 drivers/vfio/pci/mlx5/Kconfig       |   1 +
 drivers/vfio/pci/pds/Kconfig        |   1 +
 include/linux/iova_bitmap.h         |  26 +++
 8 files changed, 460 insertions(+), 428 deletions(-)
 create mode 100644 drivers/iommu/iommufd/iova_bitmap.c
 delete mode 100644 drivers/vfio/iova_bitmap.c

(limited to 'include/linux')

diff --git a/drivers/iommu/Kconfig b/drivers/iommu/Kconfig
index 2b12b583ef4b..5cc869db1b79 100644
--- a/drivers/iommu/Kconfig
+++ b/drivers/iommu/Kconfig
@@ -7,6 +7,10 @@ config IOMMU_IOVA
 config IOMMU_API
 	bool
 
+config IOMMUFD_DRIVER
+	bool
+	default n
+
 menuconfig IOMMU_SUPPORT
 	bool "IOMMU Hardware Support"
 	depends on MMU
diff --git a/drivers/iommu/iommufd/Makefile b/drivers/iommu/iommufd/Makefile
index 8aeba81800c5..34b446146961 100644
--- a/drivers/iommu/iommufd/Makefile
+++ b/drivers/iommu/iommufd/Makefile
@@ -11,3 +11,4 @@ iommufd-y := \
 iommufd-$(CONFIG_IOMMUFD_TEST) += selftest.o
 
 obj-$(CONFIG_IOMMUFD) += iommufd.o
+obj-$(CONFIG_IOMMUFD_DRIVER) += iova_bitmap.o
diff --git a/drivers/iommu/iommufd/iova_bitmap.c b/drivers/iommu/iommufd/iova_bitmap.c
new file mode 100644
index 000000000000..f54b56388e00
--- /dev/null
+++ b/drivers/iommu/iommufd/iova_bitmap.c
@@ -0,0 +1,426 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2022, Oracle and/or its affiliates.
+ * Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved
+ */
+#include <linux/iova_bitmap.h>
+#include <linux/mm.h>
+#include <linux/slab.h>
+#include <linux/highmem.h>
+
+#define BITS_PER_PAGE (PAGE_SIZE * BITS_PER_BYTE)
+
+/*
+ * struct iova_bitmap_map - A bitmap representing an IOVA range
+ *
+ * Main data structure for tracking mapped user pages of bitmap data.
+ *
+ * For example, for something recording dirty IOVAs, it will be provided a
+ * struct iova_bitmap structure, as a general structure for iterating the
+ * total IOVA range. The struct iova_bitmap_map, though, represents the
+ * subset of said IOVA space that is pinned by its parent structure (struct
+ * iova_bitmap).
+ *
+ * The user does not need to exact location of the bits in the bitmap.
+ * From user perspective the only API available is iova_bitmap_set() which
+ * records the IOVA *range* in the bitmap by setting the corresponding
+ * bits.
+ *
+ * The bitmap is an array of u64 whereas each bit represents an IOVA of
+ * range of (1 << pgshift). Thus formula for the bitmap data to be set is:
+ *
+ *   data[(iova / page_size) / 64] & (1ULL << (iova % 64))
+ */
+struct iova_bitmap_map {
+	/* base IOVA representing bit 0 of the first page */
+	unsigned long iova;
+
+	/* page size order that each bit granules to */
+	unsigned long pgshift;
+
+	/* page offset of the first user page pinned */
+	unsigned long pgoff;
+
+	/* number of pages pinned */
+	unsigned long npages;
+
+	/* pinned pages representing the bitmap data */
+	struct page **pages;
+};
+
+/*
+ * struct iova_bitmap - The IOVA bitmap object
+ *
+ * Main data structure for iterating over the bitmap data.
+ *
+ * Abstracts the pinning work and iterates in IOVA ranges.
+ * It uses a windowing scheme and pins the bitmap in relatively
+ * big ranges e.g.
+ *
+ * The bitmap object uses one base page to store all the pinned pages
+ * pointers related to the bitmap. For sizeof(struct page*) == 8 it stores
+ * 512 struct page pointers which, if the base page size is 4K, it means
+ * 2M of bitmap data is pinned at a time. If the iova_bitmap page size is
+ * also 4K then the range window to iterate is 64G.
+ *
+ * For example iterating on a total IOVA range of 4G..128G, it will walk
+ * through this set of ranges:
+ *
+ *    4G  -  68G-1 (64G)
+ *    68G - 128G-1 (64G)
+ *
+ * An example of the APIs on how to use/iterate over the IOVA bitmap:
+ *
+ *   bitmap = iova_bitmap_alloc(iova, length, page_size, data);
+ *   if (IS_ERR(bitmap))
+ *       return PTR_ERR(bitmap);
+ *
+ *   ret = iova_bitmap_for_each(bitmap, arg, dirty_reporter_fn);
+ *
+ *   iova_bitmap_free(bitmap);
+ *
+ * Each iteration of the @dirty_reporter_fn is called with a unique @iova
+ * and @length argument, indicating the current range available through the
+ * iova_bitmap. The @dirty_reporter_fn uses iova_bitmap_set() to mark dirty
+ * areas (@iova_length) within that provided range, as following:
+ *
+ *   iova_bitmap_set(bitmap, iova, iova_length);
+ *
+ * The internals of the object uses an index @mapped_base_index that indexes
+ * which u64 word of the bitmap is mapped, up to @mapped_total_index.
+ * Those keep being incremented until @mapped_total_index is reached while
+ * mapping up to PAGE_SIZE / sizeof(struct page*) maximum of pages.
+ *
+ * The IOVA bitmap is usually located on what tracks DMA mapped ranges or
+ * some form of IOVA range tracking that co-relates to the user passed
+ * bitmap.
+ */
+struct iova_bitmap {
+	/* IOVA range representing the currently mapped bitmap data */
+	struct iova_bitmap_map mapped;
+
+	/* userspace address of the bitmap */
+	u64 __user *bitmap;
+
+	/* u64 index that @mapped points to */
+	unsigned long mapped_base_index;
+
+	/* how many u64 can we walk in total */
+	unsigned long mapped_total_index;
+
+	/* base IOVA of the whole bitmap */
+	unsigned long iova;
+
+	/* length of the IOVA range for the whole bitmap */
+	size_t length;
+};
+
+/*
+ * Converts a relative IOVA to a bitmap index.
+ * This function provides the index into the u64 array (bitmap::bitmap)
+ * for a given IOVA offset.
+ * Relative IOVA means relative to the bitmap::mapped base IOVA
+ * (stored in mapped::iova). All computations in this file are done using
+ * relative IOVAs and thus avoid an extra subtraction against mapped::iova.
+ * The user API iova_bitmap_set() always uses a regular absolute IOVAs.
+ */
+static unsigned long iova_bitmap_offset_to_index(struct iova_bitmap *bitmap,
+						 unsigned long iova)
+{
+	unsigned long pgsize = 1 << bitmap->mapped.pgshift;
+
+	return iova / (BITS_PER_TYPE(*bitmap->bitmap) * pgsize);
+}
+
+/*
+ * Converts a bitmap index to a *relative* IOVA.
+ */
+static unsigned long iova_bitmap_index_to_offset(struct iova_bitmap *bitmap,
+						 unsigned long index)
+{
+	unsigned long pgshift = bitmap->mapped.pgshift;
+
+	return (index * BITS_PER_TYPE(*bitmap->bitmap)) << pgshift;
+}
+
+/*
+ * Returns the base IOVA of the mapped range.
+ */
+static unsigned long iova_bitmap_mapped_iova(struct iova_bitmap *bitmap)
+{
+	unsigned long skip = bitmap->mapped_base_index;
+
+	return bitmap->iova + iova_bitmap_index_to_offset(bitmap, skip);
+}
+
+/*
+ * Pins the bitmap user pages for the current range window.
+ * This is internal to IOVA bitmap and called when advancing the
+ * index (@mapped_base_index) or allocating the bitmap.
+ */
+static int iova_bitmap_get(struct iova_bitmap *bitmap)
+{
+	struct iova_bitmap_map *mapped = &bitmap->mapped;
+	unsigned long npages;
+	u64 __user *addr;
+	long ret;
+
+	/*
+	 * @mapped_base_index is the index of the currently mapped u64 words
+	 * that we have access. Anything before @mapped_base_index is not
+	 * mapped. The range @mapped_base_index .. @mapped_total_index-1 is
+	 * mapped but capped at a maximum number of pages.
+	 */
+	npages = DIV_ROUND_UP((bitmap->mapped_total_index -
+			       bitmap->mapped_base_index) *
+			       sizeof(*bitmap->bitmap), PAGE_SIZE);
+
+	/*
+	 * We always cap at max number of 'struct page' a base page can fit.
+	 * This is, for example, on x86 means 2M of bitmap data max.
+	 */
+	npages = min(npages,  PAGE_SIZE / sizeof(struct page *));
+
+	/*
+	 * Bitmap address to be pinned is calculated via pointer arithmetic
+	 * with bitmap u64 word index.
+	 */
+	addr = bitmap->bitmap + bitmap->mapped_base_index;
+
+	ret = pin_user_pages_fast((unsigned long)addr, npages,
+				  FOLL_WRITE, mapped->pages);
+	if (ret <= 0)
+		return -EFAULT;
+
+	mapped->npages = (unsigned long)ret;
+	/* Base IOVA where @pages point to i.e. bit 0 of the first page */
+	mapped->iova = iova_bitmap_mapped_iova(bitmap);
+
+	/*
+	 * offset of the page where pinned pages bit 0 is located.
+	 * This handles the case where the bitmap is not PAGE_SIZE
+	 * aligned.
+	 */
+	mapped->pgoff = offset_in_page(addr);
+	return 0;
+}
+
+/*
+ * Unpins the bitmap user pages and clears @npages
+ * (un)pinning is abstracted from API user and it's done when advancing
+ * the index or freeing the bitmap.
+ */
+static void iova_bitmap_put(struct iova_bitmap *bitmap)
+{
+	struct iova_bitmap_map *mapped = &bitmap->mapped;
+
+	if (mapped->npages) {
+		unpin_user_pages(mapped->pages, mapped->npages);
+		mapped->npages = 0;
+	}
+}
+
+/**
+ * iova_bitmap_alloc() - Allocates an IOVA bitmap object
+ * @iova: Start address of the IOVA range
+ * @length: Length of the IOVA range
+ * @page_size: Page size of the IOVA bitmap. It defines what each bit
+ *             granularity represents
+ * @data: Userspace address of the bitmap
+ *
+ * Allocates an IOVA object and initializes all its fields including the
+ * first user pages of @data.
+ *
+ * Return: A pointer to a newly allocated struct iova_bitmap
+ * or ERR_PTR() on error.
+ */
+struct iova_bitmap *iova_bitmap_alloc(unsigned long iova, size_t length,
+				      unsigned long page_size, u64 __user *data)
+{
+	struct iova_bitmap_map *mapped;
+	struct iova_bitmap *bitmap;
+	int rc;
+
+	bitmap = kzalloc(sizeof(*bitmap), GFP_KERNEL);
+	if (!bitmap)
+		return ERR_PTR(-ENOMEM);
+
+	mapped = &bitmap->mapped;
+	mapped->pgshift = __ffs(page_size);
+	bitmap->bitmap = data;
+	bitmap->mapped_total_index =
+		iova_bitmap_offset_to_index(bitmap, length - 1) + 1;
+	bitmap->iova = iova;
+	bitmap->length = length;
+	mapped->iova = iova;
+	mapped->pages = (struct page **)__get_free_page(GFP_KERNEL);
+	if (!mapped->pages) {
+		rc = -ENOMEM;
+		goto err;
+	}
+
+	rc = iova_bitmap_get(bitmap);
+	if (rc)
+		goto err;
+	return bitmap;
+
+err:
+	iova_bitmap_free(bitmap);
+	return ERR_PTR(rc);
+}
+EXPORT_SYMBOL_GPL(iova_bitmap_alloc);
+
+/**
+ * iova_bitmap_free() - Frees an IOVA bitmap object
+ * @bitmap: IOVA bitmap to free
+ *
+ * It unpins and releases pages array memory and clears any leftover
+ * state.
+ */
+void iova_bitmap_free(struct iova_bitmap *bitmap)
+{
+	struct iova_bitmap_map *mapped = &bitmap->mapped;
+
+	iova_bitmap_put(bitmap);
+
+	if (mapped->pages) {
+		free_page((unsigned long)mapped->pages);
+		mapped->pages = NULL;
+	}
+
+	kfree(bitmap);
+}
+EXPORT_SYMBOL_GPL(iova_bitmap_free);
+
+/*
+ * Returns the remaining bitmap indexes from mapped_total_index to process for
+ * the currently pinned bitmap pages.
+ */
+static unsigned long iova_bitmap_mapped_remaining(struct iova_bitmap *bitmap)
+{
+	unsigned long remaining, bytes;
+
+	bytes = (bitmap->mapped.npages << PAGE_SHIFT) - bitmap->mapped.pgoff;
+
+	remaining = bitmap->mapped_total_index - bitmap->mapped_base_index;
+	remaining = min_t(unsigned long, remaining,
+			  bytes / sizeof(*bitmap->bitmap));
+
+	return remaining;
+}
+
+/*
+ * Returns the length of the mapped IOVA range.
+ */
+static unsigned long iova_bitmap_mapped_length(struct iova_bitmap *bitmap)
+{
+	unsigned long max_iova = bitmap->iova + bitmap->length - 1;
+	unsigned long iova = iova_bitmap_mapped_iova(bitmap);
+	unsigned long remaining;
+
+	/*
+	 * iova_bitmap_mapped_remaining() returns a number of indexes which
+	 * when converted to IOVA gives us a max length that the bitmap
+	 * pinned data can cover. Afterwards, that is capped to
+	 * only cover the IOVA range in @bitmap::iova .. @bitmap::length.
+	 */
+	remaining = iova_bitmap_index_to_offset(bitmap,
+			iova_bitmap_mapped_remaining(bitmap));
+
+	if (iova + remaining - 1 > max_iova)
+		remaining -= ((iova + remaining - 1) - max_iova);
+
+	return remaining;
+}
+
+/*
+ * Returns true if there's not more data to iterate.
+ */
+static bool iova_bitmap_done(struct iova_bitmap *bitmap)
+{
+	return bitmap->mapped_base_index >= bitmap->mapped_total_index;
+}
+
+/*
+ * Advances to the next range, releases the current pinned
+ * pages and pins the next set of bitmap pages.
+ * Returns 0 on success or otherwise errno.
+ */
+static int iova_bitmap_advance(struct iova_bitmap *bitmap)
+{
+	unsigned long iova = iova_bitmap_mapped_length(bitmap) - 1;
+	unsigned long count = iova_bitmap_offset_to_index(bitmap, iova) + 1;
+
+	bitmap->mapped_base_index += count;
+
+	iova_bitmap_put(bitmap);
+	if (iova_bitmap_done(bitmap))
+		return 0;
+
+	/* When advancing the index we pin the next set of bitmap pages */
+	return iova_bitmap_get(bitmap);
+}
+
+/**
+ * iova_bitmap_for_each() - Iterates over the bitmap
+ * @bitmap: IOVA bitmap to iterate
+ * @opaque: Additional argument to pass to the callback
+ * @fn: Function that gets called for each IOVA range
+ *
+ * Helper function to iterate over bitmap data representing a portion of IOVA
+ * space. It hides the complexity of iterating bitmaps and translating the
+ * mapped bitmap user pages into IOVA ranges to process.
+ *
+ * Return: 0 on success, and an error on failure either upon
+ * iteration or when the callback returns an error.
+ */
+int iova_bitmap_for_each(struct iova_bitmap *bitmap, void *opaque,
+			 iova_bitmap_fn_t fn)
+{
+	int ret = 0;
+
+	for (; !iova_bitmap_done(bitmap) && !ret;
+	     ret = iova_bitmap_advance(bitmap)) {
+		ret = fn(bitmap, iova_bitmap_mapped_iova(bitmap),
+			 iova_bitmap_mapped_length(bitmap), opaque);
+		if (ret)
+			break;
+	}
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(iova_bitmap_for_each);
+
+/**
+ * iova_bitmap_set() - Records an IOVA range in bitmap
+ * @bitmap: IOVA bitmap
+ * @iova: IOVA to start
+ * @length: IOVA range length
+ *
+ * Set the bits corresponding to the range [iova .. iova+length-1] in
+ * the user bitmap.
+ *
+ */
+void iova_bitmap_set(struct iova_bitmap *bitmap,
+		     unsigned long iova, size_t length)
+{
+	struct iova_bitmap_map *mapped = &bitmap->mapped;
+	unsigned long cur_bit = ((iova - mapped->iova) >>
+			mapped->pgshift) + mapped->pgoff * BITS_PER_BYTE;
+	unsigned long last_bit = (((iova + length - 1) - mapped->iova) >>
+			mapped->pgshift) + mapped->pgoff * BITS_PER_BYTE;
+
+	do {
+		unsigned int page_idx = cur_bit / BITS_PER_PAGE;
+		unsigned int offset = cur_bit % BITS_PER_PAGE;
+		unsigned int nbits = min(BITS_PER_PAGE - offset,
+					 last_bit - cur_bit + 1);
+		void *kaddr;
+
+		kaddr = kmap_local_page(mapped->pages[page_idx]);
+		bitmap_set(kaddr, offset, nbits);
+		kunmap_local(kaddr);
+		cur_bit += nbits;
+	} while (cur_bit <= last_bit);
+}
+EXPORT_SYMBOL_GPL(iova_bitmap_set);
diff --git a/drivers/vfio/Makefile b/drivers/vfio/Makefile
index c82ea032d352..68c05705200f 100644
--- a/drivers/vfio/Makefile
+++ b/drivers/vfio/Makefile
@@ -1,8 +1,7 @@
 # SPDX-License-Identifier: GPL-2.0
 obj-$(CONFIG_VFIO) += vfio.o
 
-vfio-y += vfio_main.o \
-	  iova_bitmap.o
+vfio-y += vfio_main.o
 vfio-$(CONFIG_VFIO_DEVICE_CDEV) += device_cdev.o
 vfio-$(CONFIG_VFIO_GROUP) += group.o
 vfio-$(CONFIG_IOMMUFD) += iommufd.o
diff --git a/drivers/vfio/iova_bitmap.c b/drivers/vfio/iova_bitmap.c
deleted file mode 100644
index f54b56388e00..000000000000
--- a/drivers/vfio/iova_bitmap.c
+++ /dev/null
@@ -1,426 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * Copyright (c) 2022, Oracle and/or its affiliates.
- * Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved
- */
-#include <linux/iova_bitmap.h>
-#include <linux/mm.h>
-#include <linux/slab.h>
-#include <linux/highmem.h>
-
-#define BITS_PER_PAGE (PAGE_SIZE * BITS_PER_BYTE)
-
-/*
- * struct iova_bitmap_map - A bitmap representing an IOVA range
- *
- * Main data structure for tracking mapped user pages of bitmap data.
- *
- * For example, for something recording dirty IOVAs, it will be provided a
- * struct iova_bitmap structure, as a general structure for iterating the
- * total IOVA range. The struct iova_bitmap_map, though, represents the
- * subset of said IOVA space that is pinned by its parent structure (struct
- * iova_bitmap).
- *
- * The user does not need to exact location of the bits in the bitmap.
- * From user perspective the only API available is iova_bitmap_set() which
- * records the IOVA *range* in the bitmap by setting the corresponding
- * bits.
- *
- * The bitmap is an array of u64 whereas each bit represents an IOVA of
- * range of (1 << pgshift). Thus formula for the bitmap data to be set is:
- *
- *   data[(iova / page_size) / 64] & (1ULL << (iova % 64))
- */
-struct iova_bitmap_map {
-	/* base IOVA representing bit 0 of the first page */
-	unsigned long iova;
-
-	/* page size order that each bit granules to */
-	unsigned long pgshift;
-
-	/* page offset of the first user page pinned */
-	unsigned long pgoff;
-
-	/* number of pages pinned */
-	unsigned long npages;
-
-	/* pinned pages representing the bitmap data */
-	struct page **pages;
-};
-
-/*
- * struct iova_bitmap - The IOVA bitmap object
- *
- * Main data structure for iterating over the bitmap data.
- *
- * Abstracts the pinning work and iterates in IOVA ranges.
- * It uses a windowing scheme and pins the bitmap in relatively
- * big ranges e.g.
- *
- * The bitmap object uses one base page to store all the pinned pages
- * pointers related to the bitmap. For sizeof(struct page*) == 8 it stores
- * 512 struct page pointers which, if the base page size is 4K, it means
- * 2M of bitmap data is pinned at a time. If the iova_bitmap page size is
- * also 4K then the range window to iterate is 64G.
- *
- * For example iterating on a total IOVA range of 4G..128G, it will walk
- * through this set of ranges:
- *
- *    4G  -  68G-1 (64G)
- *    68G - 128G-1 (64G)
- *
- * An example of the APIs on how to use/iterate over the IOVA bitmap:
- *
- *   bitmap = iova_bitmap_alloc(iova, length, page_size, data);
- *   if (IS_ERR(bitmap))
- *       return PTR_ERR(bitmap);
- *
- *   ret = iova_bitmap_for_each(bitmap, arg, dirty_reporter_fn);
- *
- *   iova_bitmap_free(bitmap);
- *
- * Each iteration of the @dirty_reporter_fn is called with a unique @iova
- * and @length argument, indicating the current range available through the
- * iova_bitmap. The @dirty_reporter_fn uses iova_bitmap_set() to mark dirty
- * areas (@iova_length) within that provided range, as following:
- *
- *   iova_bitmap_set(bitmap, iova, iova_length);
- *
- * The internals of the object uses an index @mapped_base_index that indexes
- * which u64 word of the bitmap is mapped, up to @mapped_total_index.
- * Those keep being incremented until @mapped_total_index is reached while
- * mapping up to PAGE_SIZE / sizeof(struct page*) maximum of pages.
- *
- * The IOVA bitmap is usually located on what tracks DMA mapped ranges or
- * some form of IOVA range tracking that co-relates to the user passed
- * bitmap.
- */
-struct iova_bitmap {
-	/* IOVA range representing the currently mapped bitmap data */
-	struct iova_bitmap_map mapped;
-
-	/* userspace address of the bitmap */
-	u64 __user *bitmap;
-
-	/* u64 index that @mapped points to */
-	unsigned long mapped_base_index;
-
-	/* how many u64 can we walk in total */
-	unsigned long mapped_total_index;
-
-	/* base IOVA of the whole bitmap */
-	unsigned long iova;
-
-	/* length of the IOVA range for the whole bitmap */
-	size_t length;
-};
-
-/*
- * Converts a relative IOVA to a bitmap index.
- * This function provides the index into the u64 array (bitmap::bitmap)
- * for a given IOVA offset.
- * Relative IOVA means relative to the bitmap::mapped base IOVA
- * (stored in mapped::iova). All computations in this file are done using
- * relative IOVAs and thus avoid an extra subtraction against mapped::iova.
- * The user API iova_bitmap_set() always uses a regular absolute IOVAs.
- */
-static unsigned long iova_bitmap_offset_to_index(struct iova_bitmap *bitmap,
-						 unsigned long iova)
-{
-	unsigned long pgsize = 1 << bitmap->mapped.pgshift;
-
-	return iova / (BITS_PER_TYPE(*bitmap->bitmap) * pgsize);
-}
-
-/*
- * Converts a bitmap index to a *relative* IOVA.
- */
-static unsigned long iova_bitmap_index_to_offset(struct iova_bitmap *bitmap,
-						 unsigned long index)
-{
-	unsigned long pgshift = bitmap->mapped.pgshift;
-
-	return (index * BITS_PER_TYPE(*bitmap->bitmap)) << pgshift;
-}
-
-/*
- * Returns the base IOVA of the mapped range.
- */
-static unsigned long iova_bitmap_mapped_iova(struct iova_bitmap *bitmap)
-{
-	unsigned long skip = bitmap->mapped_base_index;
-
-	return bitmap->iova + iova_bitmap_index_to_offset(bitmap, skip);
-}
-
-/*
- * Pins the bitmap user pages for the current range window.
- * This is internal to IOVA bitmap and called when advancing the
- * index (@mapped_base_index) or allocating the bitmap.
- */
-static int iova_bitmap_get(struct iova_bitmap *bitmap)
-{
-	struct iova_bitmap_map *mapped = &bitmap->mapped;
-	unsigned long npages;
-	u64 __user *addr;
-	long ret;
-
-	/*
-	 * @mapped_base_index is the index of the currently mapped u64 words
-	 * that we have access. Anything before @mapped_base_index is not
-	 * mapped. The range @mapped_base_index .. @mapped_total_index-1 is
-	 * mapped but capped at a maximum number of pages.
-	 */
-	npages = DIV_ROUND_UP((bitmap->mapped_total_index -
-			       bitmap->mapped_base_index) *
-			       sizeof(*bitmap->bitmap), PAGE_SIZE);
-
-	/*
-	 * We always cap at max number of 'struct page' a base page can fit.
-	 * This is, for example, on x86 means 2M of bitmap data max.
-	 */
-	npages = min(npages,  PAGE_SIZE / sizeof(struct page *));
-
-	/*
-	 * Bitmap address to be pinned is calculated via pointer arithmetic
-	 * with bitmap u64 word index.
-	 */
-	addr = bitmap->bitmap + bitmap->mapped_base_index;
-
-	ret = pin_user_pages_fast((unsigned long)addr, npages,
-				  FOLL_WRITE, mapped->pages);
-	if (ret <= 0)
-		return -EFAULT;
-
-	mapped->npages = (unsigned long)ret;
-	/* Base IOVA where @pages point to i.e. bit 0 of the first page */
-	mapped->iova = iova_bitmap_mapped_iova(bitmap);
-
-	/*
-	 * offset of the page where pinned pages bit 0 is located.
-	 * This handles the case where the bitmap is not PAGE_SIZE
-	 * aligned.
-	 */
-	mapped->pgoff = offset_in_page(addr);
-	return 0;
-}
-
-/*
- * Unpins the bitmap user pages and clears @npages
- * (un)pinning is abstracted from API user and it's done when advancing
- * the index or freeing the bitmap.
- */
-static void iova_bitmap_put(struct iova_bitmap *bitmap)
-{
-	struct iova_bitmap_map *mapped = &bitmap->mapped;
-
-	if (mapped->npages) {
-		unpin_user_pages(mapped->pages, mapped->npages);
-		mapped->npages = 0;
-	}
-}
-
-/**
- * iova_bitmap_alloc() - Allocates an IOVA bitmap object
- * @iova: Start address of the IOVA range
- * @length: Length of the IOVA range
- * @page_size: Page size of the IOVA bitmap. It defines what each bit
- *             granularity represents
- * @data: Userspace address of the bitmap
- *
- * Allocates an IOVA object and initializes all its fields including the
- * first user pages of @data.
- *
- * Return: A pointer to a newly allocated struct iova_bitmap
- * or ERR_PTR() on error.
- */
-struct iova_bitmap *iova_bitmap_alloc(unsigned long iova, size_t length,
-				      unsigned long page_size, u64 __user *data)
-{
-	struct iova_bitmap_map *mapped;
-	struct iova_bitmap *bitmap;
-	int rc;
-
-	bitmap = kzalloc(sizeof(*bitmap), GFP_KERNEL);
-	if (!bitmap)
-		return ERR_PTR(-ENOMEM);
-
-	mapped = &bitmap->mapped;
-	mapped->pgshift = __ffs(page_size);
-	bitmap->bitmap = data;
-	bitmap->mapped_total_index =
-		iova_bitmap_offset_to_index(bitmap, length - 1) + 1;
-	bitmap->iova = iova;
-	bitmap->length = length;
-	mapped->iova = iova;
-	mapped->pages = (struct page **)__get_free_page(GFP_KERNEL);
-	if (!mapped->pages) {
-		rc = -ENOMEM;
-		goto err;
-	}
-
-	rc = iova_bitmap_get(bitmap);
-	if (rc)
-		goto err;
-	return bitmap;
-
-err:
-	iova_bitmap_free(bitmap);
-	return ERR_PTR(rc);
-}
-EXPORT_SYMBOL_GPL(iova_bitmap_alloc);
-
-/**
- * iova_bitmap_free() - Frees an IOVA bitmap object
- * @bitmap: IOVA bitmap to free
- *
- * It unpins and releases pages array memory and clears any leftover
- * state.
- */
-void iova_bitmap_free(struct iova_bitmap *bitmap)
-{
-	struct iova_bitmap_map *mapped = &bitmap->mapped;
-
-	iova_bitmap_put(bitmap);
-
-	if (mapped->pages) {
-		free_page((unsigned long)mapped->pages);
-		mapped->pages = NULL;
-	}
-
-	kfree(bitmap);
-}
-EXPORT_SYMBOL_GPL(iova_bitmap_free);
-
-/*
- * Returns the remaining bitmap indexes from mapped_total_index to process for
- * the currently pinned bitmap pages.
- */
-static unsigned long iova_bitmap_mapped_remaining(struct iova_bitmap *bitmap)
-{
-	unsigned long remaining, bytes;
-
-	bytes = (bitmap->mapped.npages << PAGE_SHIFT) - bitmap->mapped.pgoff;
-
-	remaining = bitmap->mapped_total_index - bitmap->mapped_base_index;
-	remaining = min_t(unsigned long, remaining,
-			  bytes / sizeof(*bitmap->bitmap));
-
-	return remaining;
-}
-
-/*
- * Returns the length of the mapped IOVA range.
- */
-static unsigned long iova_bitmap_mapped_length(struct iova_bitmap *bitmap)
-{
-	unsigned long max_iova = bitmap->iova + bitmap->length - 1;
-	unsigned long iova = iova_bitmap_mapped_iova(bitmap);
-	unsigned long remaining;
-
-	/*
-	 * iova_bitmap_mapped_remaining() returns a number of indexes which
-	 * when converted to IOVA gives us a max length that the bitmap
-	 * pinned data can cover. Afterwards, that is capped to
-	 * only cover the IOVA range in @bitmap::iova .. @bitmap::length.
-	 */
-	remaining = iova_bitmap_index_to_offset(bitmap,
-			iova_bitmap_mapped_remaining(bitmap));
-
-	if (iova + remaining - 1 > max_iova)
-		remaining -= ((iova + remaining - 1) - max_iova);
-
-	return remaining;
-}
-
-/*
- * Returns true if there's not more data to iterate.
- */
-static bool iova_bitmap_done(struct iova_bitmap *bitmap)
-{
-	return bitmap->mapped_base_index >= bitmap->mapped_total_index;
-}
-
-/*
- * Advances to the next range, releases the current pinned
- * pages and pins the next set of bitmap pages.
- * Returns 0 on success or otherwise errno.
- */
-static int iova_bitmap_advance(struct iova_bitmap *bitmap)
-{
-	unsigned long iova = iova_bitmap_mapped_length(bitmap) - 1;
-	unsigned long count = iova_bitmap_offset_to_index(bitmap, iova) + 1;
-
-	bitmap->mapped_base_index += count;
-
-	iova_bitmap_put(bitmap);
-	if (iova_bitmap_done(bitmap))
-		return 0;
-
-	/* When advancing the index we pin the next set of bitmap pages */
-	return iova_bitmap_get(bitmap);
-}
-
-/**
- * iova_bitmap_for_each() - Iterates over the bitmap
- * @bitmap: IOVA bitmap to iterate
- * @opaque: Additional argument to pass to the callback
- * @fn: Function that gets called for each IOVA range
- *
- * Helper function to iterate over bitmap data representing a portion of IOVA
- * space. It hides the complexity of iterating bitmaps and translating the
- * mapped bitmap user pages into IOVA ranges to process.
- *
- * Return: 0 on success, and an error on failure either upon
- * iteration or when the callback returns an error.
- */
-int iova_bitmap_for_each(struct iova_bitmap *bitmap, void *opaque,
-			 iova_bitmap_fn_t fn)
-{
-	int ret = 0;
-
-	for (; !iova_bitmap_done(bitmap) && !ret;
-	     ret = iova_bitmap_advance(bitmap)) {
-		ret = fn(bitmap, iova_bitmap_mapped_iova(bitmap),
-			 iova_bitmap_mapped_length(bitmap), opaque);
-		if (ret)
-			break;
-	}
-
-	return ret;
-}
-EXPORT_SYMBOL_GPL(iova_bitmap_for_each);
-
-/**
- * iova_bitmap_set() - Records an IOVA range in bitmap
- * @bitmap: IOVA bitmap
- * @iova: IOVA to start
- * @length: IOVA range length
- *
- * Set the bits corresponding to the range [iova .. iova+length-1] in
- * the user bitmap.
- *
- */
-void iova_bitmap_set(struct iova_bitmap *bitmap,
-		     unsigned long iova, size_t length)
-{
-	struct iova_bitmap_map *mapped = &bitmap->mapped;
-	unsigned long cur_bit = ((iova - mapped->iova) >>
-			mapped->pgshift) + mapped->pgoff * BITS_PER_BYTE;
-	unsigned long last_bit = (((iova + length - 1) - mapped->iova) >>
-			mapped->pgshift) + mapped->pgoff * BITS_PER_BYTE;
-
-	do {
-		unsigned int page_idx = cur_bit / BITS_PER_PAGE;
-		unsigned int offset = cur_bit % BITS_PER_PAGE;
-		unsigned int nbits = min(BITS_PER_PAGE - offset,
-					 last_bit - cur_bit + 1);
-		void *kaddr;
-
-		kaddr = kmap_local_page(mapped->pages[page_idx]);
-		bitmap_set(kaddr, offset, nbits);
-		kunmap_local(kaddr);
-		cur_bit += nbits;
-	} while (cur_bit <= last_bit);
-}
-EXPORT_SYMBOL_GPL(iova_bitmap_set);
diff --git a/drivers/vfio/pci/mlx5/Kconfig b/drivers/vfio/pci/mlx5/Kconfig
index 7088edc4fb28..c3ced56b7787 100644
--- a/drivers/vfio/pci/mlx5/Kconfig
+++ b/drivers/vfio/pci/mlx5/Kconfig
@@ -3,6 +3,7 @@ config MLX5_VFIO_PCI
 	tristate "VFIO support for MLX5 PCI devices"
 	depends on MLX5_CORE
 	select VFIO_PCI_CORE
+	select IOMMUFD_DRIVER
 	help
 	  This provides migration support for MLX5 devices using the VFIO
 	  framework.
diff --git a/drivers/vfio/pci/pds/Kconfig b/drivers/vfio/pci/pds/Kconfig
index 407b3fd32733..fff368a8183b 100644
--- a/drivers/vfio/pci/pds/Kconfig
+++ b/drivers/vfio/pci/pds/Kconfig
@@ -5,6 +5,7 @@ config PDS_VFIO_PCI
 	tristate "VFIO support for PDS PCI devices"
 	depends on PDS_CORE
 	select VFIO_PCI_CORE
+	select IOMMUFD_DRIVER
 	help
 	  This provides generic PCI support for PDS devices using the VFIO
 	  framework.
diff --git a/include/linux/iova_bitmap.h b/include/linux/iova_bitmap.h
index c006cf0a25f3..1c338f5e5b7a 100644
--- a/include/linux/iova_bitmap.h
+++ b/include/linux/iova_bitmap.h
@@ -7,6 +7,7 @@
 #define _IOVA_BITMAP_H_
 
 #include <linux/types.h>
+#include <linux/errno.h>
 
 struct iova_bitmap;
 
@@ -14,6 +15,7 @@ typedef int (*iova_bitmap_fn_t)(struct iova_bitmap *bitmap,
 				unsigned long iova, size_t length,
 				void *opaque);
 
+#if IS_ENABLED(CONFIG_IOMMUFD_DRIVER)
 struct iova_bitmap *iova_bitmap_alloc(unsigned long iova, size_t length,
 				      unsigned long page_size,
 				      u64 __user *data);
@@ -22,5 +24,29 @@ int iova_bitmap_for_each(struct iova_bitmap *bitmap, void *opaque,
 			 iova_bitmap_fn_t fn);
 void iova_bitmap_set(struct iova_bitmap *bitmap,
 		     unsigned long iova, size_t length);
+#else
+static inline struct iova_bitmap *iova_bitmap_alloc(unsigned long iova,
+						    size_t length,
+						    unsigned long page_size,
+						    u64 __user *data)
+{
+	return NULL;
+}
+
+static inline void iova_bitmap_free(struct iova_bitmap *bitmap)
+{
+}
+
+static inline int iova_bitmap_for_each(struct iova_bitmap *bitmap, void *opaque,
+				       iova_bitmap_fn_t fn)
+{
+	return -EOPNOTSUPP;
+}
+
+static inline void iova_bitmap_set(struct iova_bitmap *bitmap,
+				   unsigned long iova, size_t length)
+{
+}
+#endif
 
 #endif
-- 
cgit v1.2.3


From 750e2e902b7180cb82d2f9b1e372e32087bb8b1b Mon Sep 17 00:00:00 2001
From: Joao Martins <joao.m.martins@oracle.com>
Date: Tue, 24 Oct 2023 14:50:55 +0100
Subject: iommu: Add iommu_domain ops for dirty tracking

Add to iommu domain operations a set of callbacks to perform dirty
tracking, particulary to start and stop tracking and to read and clear the
dirty data.

Drivers are generally expected to dynamically change its translation
structures to toggle the tracking and flush some form of control state
structure that stands in the IOVA translation path. Though it's not
mandatory, as drivers can also enable dirty tracking at boot, and just
clear the dirty bits before setting dirty tracking. For each of the newly
added IOMMU core APIs:

iommu_cap::IOMMU_CAP_DIRTY_TRACKING: new device iommu_capable value when
probing for capabilities of the device.

.set_dirty_tracking(): an iommu driver is expected to change its
translation structures and enable dirty tracking for the devices in the
iommu_domain. For drivers making dirty tracking always-enabled, it should
just return 0.

.read_and_clear_dirty(): an iommu driver is expected to walk the pagetables
for the iova range passed in and use iommu_dirty_bitmap_record() to record
dirty info per IOVA. When detecting that a given IOVA is dirty it should
also clear its dirty state from the PTE, *unless* the flag
IOMMU_DIRTY_NO_CLEAR is passed in -- flushing is steered from the caller of
the domain_op via iotlb_gather. The iommu core APIs use the same data
structure in use for dirty tracking for VFIO device dirty (struct
iova_bitmap) abstracted by iommu_dirty_bitmap_record() helper function.

domain::dirty_ops: IOMMU domains will store the dirty ops depending on
whether the iommu device supports dirty tracking or not. iommu drivers can
then use this field to figure if the dirty tracking is supported+enforced
on attach. The enforcement is enable via domain_alloc_user() which is done
via IOMMUFD hwpt flag introduced later.

Link: https://lore.kernel.org/r/20231024135109.73787-5-joao.m.martins@oracle.com
Signed-off-by: Joao Martins <joao.m.martins@oracle.com>
Reviewed-by: Jason Gunthorpe <jgg@nvidia.com>
Reviewed-by: Lu Baolu <baolu.lu@linux.intel.com>
Reviewed-by: Kevin Tian <kevin.tian@intel.com>
Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
---
 include/linux/io-pgtable.h |  4 +++
 include/linux/iommu.h      | 70 ++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 74 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/io-pgtable.h b/include/linux/io-pgtable.h
index 1b7a44b35616..25142a0e2fc2 100644
--- a/include/linux/io-pgtable.h
+++ b/include/linux/io-pgtable.h
@@ -166,6 +166,10 @@ struct io_pgtable_ops {
 			      struct iommu_iotlb_gather *gather);
 	phys_addr_t (*iova_to_phys)(struct io_pgtable_ops *ops,
 				    unsigned long iova);
+	int (*read_and_clear_dirty)(struct io_pgtable_ops *ops,
+				    unsigned long iova, size_t size,
+				    unsigned long flags,
+				    struct iommu_dirty_bitmap *dirty);
 };
 
 /**
diff --git a/include/linux/iommu.h b/include/linux/iommu.h
index 3861d66b65c1..1d42bdb37cbc 100644
--- a/include/linux/iommu.h
+++ b/include/linux/iommu.h
@@ -13,6 +13,7 @@
 #include <linux/errno.h>
 #include <linux/err.h>
 #include <linux/of.h>
+#include <linux/iova_bitmap.h>
 #include <uapi/linux/iommu.h>
 
 #define IOMMU_READ	(1 << 0)
@@ -37,6 +38,7 @@ struct bus_type;
 struct device;
 struct iommu_domain;
 struct iommu_domain_ops;
+struct iommu_dirty_ops;
 struct notifier_block;
 struct iommu_sva;
 struct iommu_fault_event;
@@ -95,6 +97,8 @@ struct iommu_domain_geometry {
 struct iommu_domain {
 	unsigned type;
 	const struct iommu_domain_ops *ops;
+	const struct iommu_dirty_ops *dirty_ops;
+
 	unsigned long pgsize_bitmap;	/* Bitmap of page sizes in use */
 	struct iommu_domain_geometry geometry;
 	struct iommu_dma_cookie *iova_cookie;
@@ -133,6 +137,7 @@ enum iommu_cap {
 	 * usefully support the non-strict DMA flush queue.
 	 */
 	IOMMU_CAP_DEFERRED_FLUSH,
+	IOMMU_CAP_DIRTY_TRACKING,	/* IOMMU supports dirty tracking */
 };
 
 /* These are the possible reserved region types */
@@ -227,6 +232,35 @@ struct iommu_iotlb_gather {
 	bool			queued;
 };
 
+/**
+ * struct iommu_dirty_bitmap - Dirty IOVA bitmap state
+ * @bitmap: IOVA bitmap
+ * @gather: Range information for a pending IOTLB flush
+ */
+struct iommu_dirty_bitmap {
+	struct iova_bitmap *bitmap;
+	struct iommu_iotlb_gather *gather;
+};
+
+/* Read but do not clear any dirty bits */
+#define IOMMU_DIRTY_NO_CLEAR (1 << 0)
+
+/**
+ * struct iommu_dirty_ops - domain specific dirty tracking operations
+ * @set_dirty_tracking: Enable or Disable dirty tracking on the iommu domain
+ * @read_and_clear_dirty: Walk IOMMU page tables for dirtied PTEs marshalled
+ *                        into a bitmap, with a bit represented as a page.
+ *                        Reads the dirty PTE bits and clears it from IO
+ *                        pagetables.
+ */
+struct iommu_dirty_ops {
+	int (*set_dirty_tracking)(struct iommu_domain *domain, bool enabled);
+	int (*read_and_clear_dirty)(struct iommu_domain *domain,
+				    unsigned long iova, size_t size,
+				    unsigned long flags,
+				    struct iommu_dirty_bitmap *dirty);
+};
+
 /**
  * struct iommu_ops - iommu ops and capabilities
  * @capable: check capability
@@ -641,6 +675,28 @@ static inline bool iommu_iotlb_gather_queued(struct iommu_iotlb_gather *gather)
 	return gather && gather->queued;
 }
 
+static inline void iommu_dirty_bitmap_init(struct iommu_dirty_bitmap *dirty,
+					   struct iova_bitmap *bitmap,
+					   struct iommu_iotlb_gather *gather)
+{
+	if (gather)
+		iommu_iotlb_gather_init(gather);
+
+	dirty->bitmap = bitmap;
+	dirty->gather = gather;
+}
+
+static inline void iommu_dirty_bitmap_record(struct iommu_dirty_bitmap *dirty,
+					     unsigned long iova,
+					     unsigned long length)
+{
+	if (dirty->bitmap)
+		iova_bitmap_set(dirty->bitmap, iova, length);
+
+	if (dirty->gather)
+		iommu_iotlb_gather_add_range(dirty->gather, iova, length);
+}
+
 /* PCI device grouping function */
 extern struct iommu_group *pci_device_group(struct device *dev);
 /* Generic device grouping function */
@@ -746,6 +802,8 @@ struct iommu_fwspec {};
 struct iommu_device {};
 struct iommu_fault_param {};
 struct iommu_iotlb_gather {};
+struct iommu_dirty_bitmap {};
+struct iommu_dirty_ops {};
 
 static inline bool iommu_present(const struct bus_type *bus)
 {
@@ -978,6 +1036,18 @@ static inline bool iommu_iotlb_gather_queued(struct iommu_iotlb_gather *gather)
 	return false;
 }
 
+static inline void iommu_dirty_bitmap_init(struct iommu_dirty_bitmap *dirty,
+					   struct iova_bitmap *bitmap,
+					   struct iommu_iotlb_gather *gather)
+{
+}
+
+static inline void iommu_dirty_bitmap_record(struct iommu_dirty_bitmap *dirty,
+					     unsigned long iova,
+					     unsigned long length)
+{
+}
+
 static inline void iommu_device_unregister(struct iommu_device *iommu)
 {
 }
-- 
cgit v1.2.3


From 66c62769bcf6aa142e2309278980a2e52f4b08db Mon Sep 17 00:00:00 2001
From: Amir Goldstein <amir73il@gmail.com>
Date: Mon, 23 Oct 2023 21:07:58 +0300
Subject: exportfs: add helpers to check if filesystem can encode/decode file
 handles

The logic of whether filesystem can encode/decode file handles is open
coded in many places.

In preparation to changing the logic, move the open coded logic into
inline helpers.

Reviewed-by: Jan Kara <jack@suse.cz>
Reviewed-by: Jeff Layton <jlayton@kernel.org>
Signed-off-by: Amir Goldstein <amir73il@gmail.com>
Link: https://lore.kernel.org/r/20231023180801.2953446-2-amir73il@gmail.com
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 fs/exportfs/expfs.c                |  8 ++------
 fs/fhandle.c                       |  6 +-----
 fs/nfsd/export.c                   |  3 +--
 fs/notify/fanotify/fanotify_user.c |  4 ++--
 fs/overlayfs/util.c                |  2 +-
 include/linux/exportfs.h           | 27 +++++++++++++++++++++++++++
 6 files changed, 34 insertions(+), 16 deletions(-)

(limited to 'include/linux')

diff --git a/fs/exportfs/expfs.c b/fs/exportfs/expfs.c
index c20704aa21b3..9ee205df8fa7 100644
--- a/fs/exportfs/expfs.c
+++ b/fs/exportfs/expfs.c
@@ -396,11 +396,7 @@ int exportfs_encode_inode_fh(struct inode *inode, struct fid *fid,
 {
 	const struct export_operations *nop = inode->i_sb->s_export_op;
 
-	/*
-	 * If a decodeable file handle was requested, we need to make sure that
-	 * filesystem can decode file handles.
-	 */
-	if (nop && !(flags & EXPORT_FH_FID) && !nop->fh_to_dentry)
+	if (!exportfs_can_encode_fh(nop, flags))
 		return -EOPNOTSUPP;
 
 	if (nop && nop->encode_fh)
@@ -456,7 +452,7 @@ exportfs_decode_fh_raw(struct vfsmount *mnt, struct fid *fid, int fh_len,
 	/*
 	 * Try to get any dentry for the given file handle from the filesystem.
 	 */
-	if (!nop || !nop->fh_to_dentry)
+	if (!exportfs_can_decode_fh(nop))
 		return ERR_PTR(-ESTALE);
 	result = nop->fh_to_dentry(mnt->mnt_sb, fid, fh_len, fileid_type);
 	if (IS_ERR_OR_NULL(result))
diff --git a/fs/fhandle.c b/fs/fhandle.c
index 6ea8d35a9382..18b3ba8dc8ea 100644
--- a/fs/fhandle.c
+++ b/fs/fhandle.c
@@ -26,12 +26,8 @@ static long do_sys_name_to_handle(const struct path *path,
 	/*
 	 * We need to make sure whether the file system support decoding of
 	 * the file handle if decodeable file handle was requested.
-	 * Otherwise, even empty export_operations are sufficient to opt-in
-	 * to encoding FIDs.
 	 */
-	if (!path->dentry->d_sb->s_export_op ||
-	    (!(fh_flags & EXPORT_FH_FID) &&
-	     !path->dentry->d_sb->s_export_op->fh_to_dentry))
+	if (!exportfs_can_encode_fh(path->dentry->d_sb->s_export_op, fh_flags))
 		return -EOPNOTSUPP;
 
 	if (copy_from_user(&f_handle, ufh, sizeof(struct file_handle)))
diff --git a/fs/nfsd/export.c b/fs/nfsd/export.c
index 11a0eaa2f914..dc99dfc1d411 100644
--- a/fs/nfsd/export.c
+++ b/fs/nfsd/export.c
@@ -421,8 +421,7 @@ static int check_export(struct path *path, int *flags, unsigned char *uuid)
 		return -EINVAL;
 	}
 
-	if (!inode->i_sb->s_export_op ||
-	    !inode->i_sb->s_export_op->fh_to_dentry) {
+	if (!exportfs_can_decode_fh(inode->i_sb->s_export_op)) {
 		dprintk("exp_export: export of invalid fs type.\n");
 		return -EINVAL;
 	}
diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c
index 62fe0b679e58..0eb9622e8a9f 100644
--- a/fs/notify/fanotify/fanotify_user.c
+++ b/fs/notify/fanotify/fanotify_user.c
@@ -1595,7 +1595,7 @@ static int fanotify_test_fid(struct dentry *dentry, unsigned int flags)
 	 * file handles so user can use name_to_handle_at() to compare fids
 	 * reported with events to the file handle of watched objects.
 	 */
-	if (!nop)
+	if (!exportfs_can_encode_fid(nop))
 		return -EOPNOTSUPP;
 
 	/*
@@ -1603,7 +1603,7 @@ static int fanotify_test_fid(struct dentry *dentry, unsigned int flags)
 	 * supports decoding file handles, so user has a way to map back the
 	 * reported fids to filesystem objects.
 	 */
-	if (mark_type != FAN_MARK_INODE && !nop->fh_to_dentry)
+	if (mark_type != FAN_MARK_INODE && !exportfs_can_decode_fh(nop))
 		return -EOPNOTSUPP;
 
 	return 0;
diff --git a/fs/overlayfs/util.c b/fs/overlayfs/util.c
index 89e0d60d35b6..f0a712214ec2 100644
--- a/fs/overlayfs/util.c
+++ b/fs/overlayfs/util.c
@@ -55,7 +55,7 @@ int ovl_can_decode_fh(struct super_block *sb)
 	if (!capable(CAP_DAC_READ_SEARCH))
 		return 0;
 
-	if (!sb->s_export_op || !sb->s_export_op->fh_to_dentry)
+	if (!exportfs_can_decode_fh(sb->s_export_op))
 		return 0;
 
 	return sb->s_export_op->encode_fh ? -1 : FILEID_INO32_GEN;
diff --git a/include/linux/exportfs.h b/include/linux/exportfs.h
index 11fbd0ee1370..5b3c9f30b422 100644
--- a/include/linux/exportfs.h
+++ b/include/linux/exportfs.h
@@ -233,6 +233,33 @@ extern int exportfs_encode_inode_fh(struct inode *inode, struct fid *fid,
 extern int exportfs_encode_fh(struct dentry *dentry, struct fid *fid,
 			      int *max_len, int flags);
 
+static inline bool exportfs_can_encode_fid(const struct export_operations *nop)
+{
+	return nop;
+}
+
+static inline bool exportfs_can_decode_fh(const struct export_operations *nop)
+{
+	return nop && nop->fh_to_dentry;
+}
+
+static inline bool exportfs_can_encode_fh(const struct export_operations *nop,
+					  int fh_flags)
+{
+	/*
+	 * If a non-decodeable file handle was requested, we only need to make
+	 * sure that filesystem can encode file handles.
+	 */
+	if (fh_flags & EXPORT_FH_FID)
+		return exportfs_can_encode_fid(nop);
+
+	/*
+	 * If a decodeable file handle was requested, we need to make sure that
+	 * filesystem can also decode file handles.
+	 */
+	return exportfs_can_decode_fh(nop);
+}
+
 static inline int exportfs_encode_fid(struct inode *inode, struct fid *fid,
 				      int *max_len)
 {
-- 
cgit v1.2.3


From ae8d3522e5b7208d238c893a00957864536983dc Mon Sep 17 00:00:00 2001
From: Oliver Upton <oliver.upton@linux.dev>
Date: Thu, 19 Oct 2023 18:56:18 +0000
Subject: KVM: arm64: Add PMU event filter bits required if EL3 is implemented

Suzuki noticed that KVM's PMU emulation is oblivious to the NSU and NSK
event filter bits. On systems that have EL3 these bits modify the
filter behavior in non-secure EL0 and EL1, respectively. Even though the
kernel doesn't use these bits, it is entirely possible some other guest
OS does. Additionally, it would appear that these and the M bit are
required by the architecture if EL3 is implemented.

Allow the EL3 event filter bits to be set if EL3 is advertised in the
guest's ID register. Implement the behavior of NSU and NSK according to
the pseudocode, and entirely ignore the M bit for perf event creation.

Reported-by: Suzuki K Poulose <suzuki.poulose@arm.com>
Reviewed-by: Suzuki K Poulose <suzuki.poulose@arm.com>
Link: https://lore.kernel.org/r/20231019185618.3442949-3-oliver.upton@linux.dev
Signed-off-by: Oliver Upton <oliver.upton@linux.dev>
---
 arch/arm64/kvm/pmu-emul.c      | 15 +++++++++++++--
 include/linux/perf/arm_pmuv3.h |  9 ++++++---
 2 files changed, 19 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/arch/arm64/kvm/pmu-emul.c b/arch/arm64/kvm/pmu-emul.c
index 32d83db9674e..e6be14e38cee 100644
--- a/arch/arm64/kvm/pmu-emul.c
+++ b/arch/arm64/kvm/pmu-emul.c
@@ -69,6 +69,11 @@ u64 kvm_pmu_evtyper_mask(struct kvm *kvm)
 	if (SYS_FIELD_GET(ID_AA64PFR0_EL1, EL2, pfr0))
 		mask |= ARMV8_PMU_INCLUDE_EL2;
 
+	if (SYS_FIELD_GET(ID_AA64PFR0_EL1, EL3, pfr0))
+		mask |= ARMV8_PMU_EXCLUDE_NS_EL0 |
+			ARMV8_PMU_EXCLUDE_NS_EL1 |
+			ARMV8_PMU_EXCLUDE_EL3;
+
 	return mask;
 }
 
@@ -596,6 +601,7 @@ static void kvm_pmu_create_perf_event(struct kvm_pmc *pmc)
 	struct perf_event *event;
 	struct perf_event_attr attr;
 	u64 eventsel, reg, data;
+	bool p, u, nsk, nsu;
 
 	reg = counter_index_to_evtreg(pmc->idx);
 	data = __vcpu_sys_reg(vcpu, reg);
@@ -622,13 +628,18 @@ static void kvm_pmu_create_perf_event(struct kvm_pmc *pmc)
 	    !test_bit(eventsel, vcpu->kvm->arch.pmu_filter))
 		return;
 
+	p = data & ARMV8_PMU_EXCLUDE_EL1;
+	u = data & ARMV8_PMU_EXCLUDE_EL0;
+	nsk = data & ARMV8_PMU_EXCLUDE_NS_EL1;
+	nsu = data & ARMV8_PMU_EXCLUDE_NS_EL0;
+
 	memset(&attr, 0, sizeof(struct perf_event_attr));
 	attr.type = arm_pmu->pmu.type;
 	attr.size = sizeof(attr);
 	attr.pinned = 1;
 	attr.disabled = !kvm_pmu_counter_is_enabled(pmc);
-	attr.exclude_user = data & ARMV8_PMU_EXCLUDE_EL0 ? 1 : 0;
-	attr.exclude_kernel = data & ARMV8_PMU_EXCLUDE_EL1 ? 1 : 0;
+	attr.exclude_user = (u != nsu);
+	attr.exclude_kernel = (p != nsk);
 	attr.exclude_hv = 1; /* Don't count EL2 events */
 	attr.exclude_host = 1; /* Don't count host events */
 	attr.config = eventsel;
diff --git a/include/linux/perf/arm_pmuv3.h b/include/linux/perf/arm_pmuv3.h
index e3899bd77f5c..9c226adf938a 100644
--- a/include/linux/perf/arm_pmuv3.h
+++ b/include/linux/perf/arm_pmuv3.h
@@ -234,9 +234,12 @@
 /*
  * Event filters for PMUv3
  */
-#define ARMV8_PMU_EXCLUDE_EL1	(1U << 31)
-#define ARMV8_PMU_EXCLUDE_EL0	(1U << 30)
-#define ARMV8_PMU_INCLUDE_EL2	(1U << 27)
+#define ARMV8_PMU_EXCLUDE_EL1		(1U << 31)
+#define ARMV8_PMU_EXCLUDE_EL0		(1U << 30)
+#define ARMV8_PMU_EXCLUDE_NS_EL1	(1U << 29)
+#define ARMV8_PMU_EXCLUDE_NS_EL0	(1U << 28)
+#define ARMV8_PMU_INCLUDE_EL2		(1U << 27)
+#define ARMV8_PMU_EXCLUDE_EL3		(1U << 26)
 
 /*
  * PMUSERENR: user enable reg
-- 
cgit v1.2.3


From a3a62ca256ab1d1ba771ee243ec4a8e8b7856f08 Mon Sep 17 00:00:00 2001
From: Raag Jadav <raag.jadav@intel.com>
Date: Tue, 24 Oct 2023 11:50:13 +0530
Subject: ACPI: utils: Introduce acpi_dev_uid_match() for matching _UID

Introduce acpi_dev_uid_match() helper that matches the device with
supplied _UID string.

Signed-off-by: Raag Jadav <raag.jadav@intel.com>
Reviewed-by: Mika Westerberg <mika.westerberg@linux.intel.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/acpi/utils.c    | 31 +++++++++++++++++++++++++++----
 include/acpi/acpi_bus.h |  1 +
 include/linux/acpi.h    |  5 +++++
 3 files changed, 33 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/acpi/utils.c b/drivers/acpi/utils.c
index 2ea14648a661..07ee0d23a188 100644
--- a/drivers/acpi/utils.c
+++ b/drivers/acpi/utils.c
@@ -768,20 +768,43 @@ bool acpi_check_dsm(acpi_handle handle, const guid_t *guid, u64 rev, u64 funcs)
 }
 EXPORT_SYMBOL(acpi_check_dsm);
 
+/**
+ * acpi_dev_uid_match - Match device by supplied UID
+ * @adev: ACPI device to match.
+ * @uid2: Unique ID of the device.
+ *
+ * Matches UID in @adev with given @uid2.
+ *
+ * Returns:
+ *  - %true if matches.
+ *  - %false otherwise.
+ */
+bool acpi_dev_uid_match(struct acpi_device *adev, const char *uid2)
+{
+	const char *uid1 = acpi_device_uid(adev);
+
+	return uid1 && uid2 && !strcmp(uid1, uid2);
+}
+EXPORT_SYMBOL_GPL(acpi_dev_uid_match);
+
 /**
  * acpi_dev_hid_uid_match - Match device by supplied HID and UID
  * @adev: ACPI device to match.
  * @hid2: Hardware ID of the device.
  * @uid2: Unique ID of the device, pass NULL to not check _UID.
  *
- * Matches HID and UID in @adev with given @hid2 and @uid2.
- * Returns true if matches.
+ * Matches HID and UID in @adev with given @hid2 and @uid2. Absence of @uid2
+ * will be treated as a match. If user wants to validate @uid2, it should be
+ * done before calling this function.
+ *
+ * Returns:
+ *  - %true if matches or @uid2 is NULL.
+ *  - %false otherwise.
  */
 bool acpi_dev_hid_uid_match(struct acpi_device *adev,
 			    const char *hid2, const char *uid2)
 {
 	const char *hid1 = acpi_device_hid(adev);
-	const char *uid1 = acpi_device_uid(adev);
 
 	if (strcmp(hid1, hid2))
 		return false;
@@ -789,7 +812,7 @@ bool acpi_dev_hid_uid_match(struct acpi_device *adev,
 	if (!uid2)
 		return true;
 
-	return uid1 && !strcmp(uid1, uid2);
+	return acpi_dev_uid_match(adev, uid2);
 }
 EXPORT_SYMBOL(acpi_dev_hid_uid_match);
 
diff --git a/include/acpi/acpi_bus.h b/include/acpi/acpi_bus.h
index 254685085c82..d1fe6446ffe0 100644
--- a/include/acpi/acpi_bus.h
+++ b/include/acpi/acpi_bus.h
@@ -760,6 +760,7 @@ static inline bool acpi_device_can_poweroff(struct acpi_device *adev)
 		adev->power.states[ACPI_STATE_D3_HOT].flags.explicit_set);
 }
 
+bool acpi_dev_uid_match(struct acpi_device *adev, const char *uid2);
 bool acpi_dev_hid_uid_match(struct acpi_device *adev, const char *hid2, const char *uid2);
 int acpi_dev_uid_to_integer(struct acpi_device *adev, u64 *integer);
 
diff --git a/include/linux/acpi.h b/include/linux/acpi.h
index afd94c9b8b8a..db3a33e19c97 100644
--- a/include/linux/acpi.h
+++ b/include/linux/acpi.h
@@ -787,6 +787,11 @@ static inline bool acpi_dev_present(const char *hid, const char *uid, s64 hrv)
 
 struct acpi_device;
 
+static inline bool acpi_dev_uid_match(struct acpi_device *adev, const char *uid2)
+{
+	return false;
+}
+
 static inline bool
 acpi_dev_hid_uid_match(struct acpi_device *adev, const char *hid2, const char *uid2)
 {
-- 
cgit v1.2.3


From 6ca80638b90cec66547011ee1ef79e534589989a Mon Sep 17 00:00:00 2001
From: Florian Fainelli <florian.fainelli@broadcom.com>
Date: Mon, 23 Oct 2023 11:17:28 -0700
Subject: net: dsa: Use conduit and user terms

Use more inclusive terms throughout the DSA subsystem by moving away
from "master" which is replaced by "conduit" and "slave" which is
replaced by "user". No functional changes.

Acked-by: Rob Herring <robh@kernel.org>
Acked-by: Stephen Hemminger <stephen@networkplumber.org>
Reviewed-by: Vladimir Oltean <vladimir.oltean@nxp.com>
Signed-off-by: Florian Fainelli <florian.fainelli@broadcom.com>
Link: https://lore.kernel.org/r/20231023181729.1191071-2-florian.fainelli@broadcom.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 .../bindings/net/dsa/mediatek,mt7530.yaml          |    2 +-
 Documentation/networking/dsa/b53.rst               |   14 +-
 Documentation/networking/dsa/bcm_sf2.rst           |    2 +-
 Documentation/networking/dsa/configuration.rst     |  100 +-
 Documentation/networking/dsa/dsa.rst               |  162 +-
 Documentation/networking/dsa/lan9303.rst           |    2 +-
 Documentation/networking/dsa/sja1105.rst           |    6 +-
 .../boot/dts/marvell/armada-3720-espressobin.dtsi  |    2 +-
 drivers/net/dsa/b53/b53_common.c                   |    4 +-
 drivers/net/dsa/b53/b53_mdio.c                     |    2 +-
 drivers/net/dsa/bcm_sf2.c                          |   41 +-
 drivers/net/dsa/bcm_sf2.h                          |    2 +-
 drivers/net/dsa/bcm_sf2_cfp.c                      |    4 +-
 drivers/net/dsa/lan9303-core.c                     |    4 +-
 drivers/net/dsa/lantiq_gswip.c                     |   34 +-
 drivers/net/dsa/microchip/ksz9477.c                |    6 +-
 drivers/net/dsa/microchip/ksz_common.c             |   20 +-
 drivers/net/dsa/microchip/ksz_ptp.c                |    2 +-
 drivers/net/dsa/mt7530.c                           |   18 +-
 drivers/net/dsa/mv88e6xxx/chip.c                   |    4 +-
 drivers/net/dsa/ocelot/felix.c                     |   68 +-
 drivers/net/dsa/ocelot/felix.h                     |    6 +-
 drivers/net/dsa/qca/qca8k-8xxx.c                   |   50 +-
 drivers/net/dsa/qca/qca8k-common.c                 |    4 +-
 drivers/net/dsa/qca/qca8k-leds.c                   |    6 +-
 drivers/net/dsa/qca/qca8k.h                        |    2 +-
 drivers/net/dsa/realtek/realtek-smi.c              |   28 +-
 drivers/net/dsa/realtek/realtek.h                  |    2 +-
 drivers/net/dsa/realtek/rtl8365mb.c                |    2 +-
 drivers/net/dsa/sja1105/sja1105_main.c             |    4 +-
 drivers/net/dsa/xrs700x/xrs700x.c                  |   12 +-
 drivers/net/ethernet/broadcom/bcmsysport.c         |    2 +-
 drivers/net/ethernet/mediatek/mtk_eth_soc.c        |    2 +-
 drivers/net/ethernet/mediatek/mtk_ppe_offload.c    |    2 +-
 include/linux/dsa/sja1105.h                        |    2 +-
 include/net/dsa.h                                  |   56 +-
 include/net/dsa_stubs.h                            |   22 +-
 net/core/dev_ioctl.c                               |    2 +-
 net/dsa/Makefile                                   |    6 +-
 net/dsa/conduit.c                                  |  475 +++
 net/dsa/conduit.h                                  |   22 +
 net/dsa/dsa.c                                      |  224 +-
 net/dsa/dsa.h                                      |   12 +-
 net/dsa/master.c                                   |  475 ---
 net/dsa/master.h                                   |   22 -
 net/dsa/netlink.c                                  |   14 +-
 net/dsa/port.c                                     |  124 +-
 net/dsa/port.h                                     |    4 +-
 net/dsa/slave.c                                    | 3727 --------------------
 net/dsa/slave.h                                    |   69 -
 net/dsa/switch.c                                   |   20 +-
 net/dsa/switch.h                                   |    8 +-
 net/dsa/tag.c                                      |   10 +-
 net/dsa/tag.h                                      |   26 +-
 net/dsa/tag_8021q.c                                |   22 +-
 net/dsa/tag_8021q.h                                |    2 +-
 net/dsa/tag_ar9331.c                               |    4 +-
 net/dsa/tag_brcm.c                                 |   14 +-
 net/dsa/tag_dsa.c                                  |    6 +-
 net/dsa/tag_gswip.c                                |    4 +-
 net/dsa/tag_hellcreek.c                            |    4 +-
 net/dsa/tag_ksz.c                                  |   12 +-
 net/dsa/tag_lan9303.c                              |    4 +-
 net/dsa/tag_mtk.c                                  |    4 +-
 net/dsa/tag_none.c                                 |    6 +-
 net/dsa/tag_ocelot.c                               |   22 +-
 net/dsa/tag_ocelot_8021q.c                         |   12 +-
 net/dsa/tag_qca.c                                  |    6 +-
 net/dsa/tag_rtl4_a.c                               |    6 +-
 net/dsa/tag_rtl8_4.c                               |    6 +-
 net/dsa/tag_rzn1_a5psw.c                           |    4 +-
 net/dsa/tag_sja1105.c                              |   30 +-
 net/dsa/tag_trailer.c                              |    4 +-
 net/dsa/tag_xrs700x.c                              |    4 +-
 net/dsa/user.c                                     | 3727 ++++++++++++++++++++
 net/dsa/user.h                                     |   69 +
 76 files changed, 4958 insertions(+), 4955 deletions(-)
 create mode 100644 net/dsa/conduit.c
 create mode 100644 net/dsa/conduit.h
 delete mode 100644 net/dsa/master.c
 delete mode 100644 net/dsa/master.h
 delete mode 100644 net/dsa/slave.c
 delete mode 100644 net/dsa/slave.h
 create mode 100644 net/dsa/user.c
 create mode 100644 net/dsa/user.h

(limited to 'include/linux')

diff --git a/Documentation/devicetree/bindings/net/dsa/mediatek,mt7530.yaml b/Documentation/devicetree/bindings/net/dsa/mediatek,mt7530.yaml
index 5038818e9f2e..1c2444121e60 100644
--- a/Documentation/devicetree/bindings/net/dsa/mediatek,mt7530.yaml
+++ b/Documentation/devicetree/bindings/net/dsa/mediatek,mt7530.yaml
@@ -60,7 +60,7 @@ description: |
 
       Check out example 6.
 
-    - Port 5 can be wired to an external phy. Port 5 becomes a DSA slave.
+    - Port 5 can be wired to an external phy. Port 5 becomes a DSA user port.
 
       For the multi-chip module MT7530, the external phy must be wired TX to TX
       to gmac1 of the SoC for this to work. Ubiquiti EdgeRouter X SFP is wired
diff --git a/Documentation/networking/dsa/b53.rst b/Documentation/networking/dsa/b53.rst
index b41637cdb82b..1cb3ff648f88 100644
--- a/Documentation/networking/dsa/b53.rst
+++ b/Documentation/networking/dsa/b53.rst
@@ -52,7 +52,7 @@ VLAN programming would basically change the CPU port's default PVID and make
 it untagged, undesirable.
 
 In difference to the configuration described in :ref:`dsa-vlan-configuration`
-the default VLAN 1 has to be removed from the slave interface configuration in
+the default VLAN 1 has to be removed from the user interface configuration in
 single port and gateway configuration, while there is no need to add an extra
 VLAN configuration in the bridge showcase.
 
@@ -68,13 +68,13 @@ By default packages are tagged with vid 1:
   ip link add link eth0 name eth0.2 type vlan id 2
   ip link add link eth0 name eth0.3 type vlan id 3
 
-  # The master interface needs to be brought up before the slave ports.
+  # The conduit interface needs to be brought up before the user ports.
   ip link set eth0 up
   ip link set eth0.1 up
   ip link set eth0.2 up
   ip link set eth0.3 up
 
-  # bring up the slave interfaces
+  # bring up the user interfaces
   ip link set wan up
   ip link set lan1 up
   ip link set lan2 up
@@ -113,11 +113,11 @@ bridge
   # tag traffic on CPU port
   ip link add link eth0 name eth0.1 type vlan id 1
 
-  # The master interface needs to be brought up before the slave ports.
+  # The conduit interface needs to be brought up before the user ports.
   ip link set eth0 up
   ip link set eth0.1 up
 
-  # bring up the slave interfaces
+  # bring up the user interfaces
   ip link set wan up
   ip link set lan1 up
   ip link set lan2 up
@@ -149,12 +149,12 @@ gateway
   ip link add link eth0 name eth0.1 type vlan id 1
   ip link add link eth0 name eth0.2 type vlan id 2
 
-  # The master interface needs to be brought up before the slave ports.
+  # The conduit interface needs to be brought up before the user ports.
   ip link set eth0 up
   ip link set eth0.1 up
   ip link set eth0.2 up
 
-  # bring up the slave interfaces
+  # bring up the user interfaces
   ip link set wan up
   ip link set lan1 up
   ip link set lan2 up
diff --git a/Documentation/networking/dsa/bcm_sf2.rst b/Documentation/networking/dsa/bcm_sf2.rst
index dee234039e1e..d2571435696f 100644
--- a/Documentation/networking/dsa/bcm_sf2.rst
+++ b/Documentation/networking/dsa/bcm_sf2.rst
@@ -67,7 +67,7 @@ MDIO indirect accesses
 ----------------------
 
 Due to a limitation in how Broadcom switches have been designed, external
-Broadcom switches connected to a SF2 require the use of the DSA slave MDIO bus
+Broadcom switches connected to a SF2 require the use of the DSA user MDIO bus
 in order to properly configure them. By default, the SF2 pseudo-PHY address, and
 an external switch pseudo-PHY address will both be snooping for incoming MDIO
 transactions, since they are at the same address (30), resulting in some kind of
diff --git a/Documentation/networking/dsa/configuration.rst b/Documentation/networking/dsa/configuration.rst
index d2934c40f0f1..e6c9719874b0 100644
--- a/Documentation/networking/dsa/configuration.rst
+++ b/Documentation/networking/dsa/configuration.rst
@@ -31,38 +31,38 @@ at https://www.kernel.org/pub/linux/utils/net/iproute2/
 
 Through DSA every port of a switch is handled like a normal linux Ethernet
 interface. The CPU port is the switch port connected to an Ethernet MAC chip.
-The corresponding linux Ethernet interface is called the master interface.
-All other corresponding linux interfaces are called slave interfaces.
+The corresponding linux Ethernet interface is called the conduit interface.
+All other corresponding linux interfaces are called user interfaces.
 
-The slave interfaces depend on the master interface being up in order for them
-to send or receive traffic. Prior to kernel v5.12, the state of the master
+The user interfaces depend on the conduit interface being up in order for them
+to send or receive traffic. Prior to kernel v5.12, the state of the conduit
 interface had to be managed explicitly by the user. Starting with kernel v5.12,
 the behavior is as follows:
 
-- when a DSA slave interface is brought up, the master interface is
+- when a DSA user interface is brought up, the conduit interface is
   automatically brought up.
-- when the master interface is brought down, all DSA slave interfaces are
+- when the conduit interface is brought down, all DSA user interfaces are
   automatically brought down.
 
 In this documentation the following Ethernet interfaces are used:
 
 *eth0*
-  the master interface
+  the conduit interface
 
 *eth1*
-  another master interface
+  another conduit interface
 
 *lan1*
-  a slave interface
+  a user interface
 
 *lan2*
-  another slave interface
+  another user interface
 
 *lan3*
-  a third slave interface
+  a third user interface
 
 *wan*
-  A slave interface dedicated for upstream traffic
+  A user interface dedicated for upstream traffic
 
 Further Ethernet interfaces can be configured similar.
 The configured IPs and networks are:
@@ -96,11 +96,11 @@ without using a VLAN based configuration.
     ip addr add 192.0.2.5/30 dev lan2
     ip addr add 192.0.2.9/30 dev lan3
 
-    # For kernels earlier than v5.12, the master interface needs to be
-    # brought up manually before the slave ports.
+    # For kernels earlier than v5.12, the conduit interface needs to be
+    # brought up manually before the user ports.
     ip link set eth0 up
 
-    # bring up the slave interfaces
+    # bring up the user interfaces
     ip link set lan1 up
     ip link set lan2 up
     ip link set lan3 up
@@ -108,11 +108,11 @@ without using a VLAN based configuration.
 *bridge*
   .. code-block:: sh
 
-    # For kernels earlier than v5.12, the master interface needs to be
-    # brought up manually before the slave ports.
+    # For kernels earlier than v5.12, the conduit interface needs to be
+    # brought up manually before the user ports.
     ip link set eth0 up
 
-    # bring up the slave interfaces
+    # bring up the user interfaces
     ip link set lan1 up
     ip link set lan2 up
     ip link set lan3 up
@@ -134,11 +134,11 @@ without using a VLAN based configuration.
 *gateway*
   .. code-block:: sh
 
-    # For kernels earlier than v5.12, the master interface needs to be
-    # brought up manually before the slave ports.
+    # For kernels earlier than v5.12, the conduit interface needs to be
+    # brought up manually before the user ports.
     ip link set eth0 up
 
-    # bring up the slave interfaces
+    # bring up the user interfaces
     ip link set wan up
     ip link set lan1 up
     ip link set lan2 up
@@ -178,14 +178,14 @@ configuration.
     ip link add link eth0 name eth0.2 type vlan id 2
     ip link add link eth0 name eth0.3 type vlan id 3
 
-    # For kernels earlier than v5.12, the master interface needs to be
-    # brought up manually before the slave ports.
+    # For kernels earlier than v5.12, the conduit interface needs to be
+    # brought up manually before the user ports.
     ip link set eth0 up
     ip link set eth0.1 up
     ip link set eth0.2 up
     ip link set eth0.3 up
 
-    # bring up the slave interfaces
+    # bring up the user interfaces
     ip link set lan1 up
     ip link set lan2 up
     ip link set lan3 up
@@ -221,12 +221,12 @@ configuration.
     # tag traffic on CPU port
     ip link add link eth0 name eth0.1 type vlan id 1
 
-    # For kernels earlier than v5.12, the master interface needs to be
-    # brought up manually before the slave ports.
+    # For kernels earlier than v5.12, the conduit interface needs to be
+    # brought up manually before the user ports.
     ip link set eth0 up
     ip link set eth0.1 up
 
-    # bring up the slave interfaces
+    # bring up the user interfaces
     ip link set lan1 up
     ip link set lan2 up
     ip link set lan3 up
@@ -261,13 +261,13 @@ configuration.
     ip link add link eth0 name eth0.1 type vlan id 1
     ip link add link eth0 name eth0.2 type vlan id 2
 
-    # For kernels earlier than v5.12, the master interface needs to be
-    # brought up manually before the slave ports.
+    # For kernels earlier than v5.12, the conduit interface needs to be
+    # brought up manually before the user ports.
     ip link set eth0 up
     ip link set eth0.1 up
     ip link set eth0.2 up
 
-    # bring up the slave interfaces
+    # bring up the user interfaces
     ip link set wan up
     ip link set lan1 up
     ip link set lan2 up
@@ -380,22 +380,22 @@ affinities according to the available CPU ports.
 
 Secondly, it is possible to perform load balancing between CPU ports on a per
 packet basis, rather than statically assigning user ports to CPU ports.
-This can be achieved by placing the DSA masters under a LAG interface (bonding
+This can be achieved by placing the DSA conduits under a LAG interface (bonding
 or team). DSA monitors this operation and creates a mirror of this software LAG
-on the CPU ports facing the physical DSA masters that constitute the LAG slave
+on the CPU ports facing the physical DSA conduits that constitute the LAG slave
 devices.
 
 To make use of multiple CPU ports, the firmware (device tree) description of
-the switch must mark all the links between CPU ports and their DSA masters
+the switch must mark all the links between CPU ports and their DSA conduits
 using the ``ethernet`` reference/phandle. At startup, only a single CPU port
-and DSA master will be used - the numerically first port from the firmware
+and DSA conduit will be used - the numerically first port from the firmware
 description which has an ``ethernet`` property. It is up to the user to
-configure the system for the switch to use other masters.
+configure the system for the switch to use other conduits.
 
 DSA uses the ``rtnl_link_ops`` mechanism (with a "dsa" ``kind``) to allow
-changing the DSA master of a user port. The ``IFLA_DSA_MASTER`` u32 netlink
-attribute contains the ifindex of the master device that handles each slave
-device. The DSA master must be a valid candidate based on firmware node
+changing the DSA conduit of a user port. The ``IFLA_DSA_MASTER`` u32 netlink
+attribute contains the ifindex of the conduit device that handles each user
+device. The DSA conduit must be a valid candidate based on firmware node
 information, or a LAG interface which contains only slaves which are valid
 candidates.
 
@@ -403,7 +403,7 @@ Using iproute2, the following manipulations are possible:
 
   .. code-block:: sh
 
-    # See the DSA master in current use
+    # See the DSA conduit in current use
     ip -d link show dev swp0
         (...)
         dsa master eth0
@@ -414,7 +414,7 @@ Using iproute2, the following manipulations are possible:
     ip link set swp2 type dsa master eth1
     ip link set swp3 type dsa master eth0
 
-    # CPU ports in LAG, using explicit assignment of the DSA master
+    # CPU ports in LAG, using explicit assignment of the DSA conduit
     ip link add bond0 type bond mode balance-xor && ip link set bond0 up
     ip link set eth1 down && ip link set eth1 master bond0
     ip link set swp0 type dsa master bond0
@@ -426,7 +426,7 @@ Using iproute2, the following manipulations are possible:
         (...)
         dsa master bond0
 
-    # CPU ports in LAG, relying on implicit migration of the DSA master
+    # CPU ports in LAG, relying on implicit migration of the DSA conduit
     ip link add bond0 type bond mode balance-xor && ip link set bond0 up
     ip link set eth0 down && ip link set eth0 master bond0
     ip link set eth1 down && ip link set eth1 master bond0
@@ -436,23 +436,23 @@ Using iproute2, the following manipulations are possible:
 
 Notice that in the case of CPU ports under a LAG, the use of the
 ``IFLA_DSA_MASTER`` netlink attribute is not strictly needed, but rather, DSA
-reacts to the ``IFLA_MASTER`` attribute change of its present master (``eth0``)
+reacts to the ``IFLA_MASTER`` attribute change of its present conduit (``eth0``)
 and migrates all user ports to the new upper of ``eth0``, ``bond0``. Similarly,
 when ``bond0`` is destroyed using ``RTM_DELLINK``, DSA migrates the user ports
-that were assigned to this interface to the first physical DSA master which is
+that were assigned to this interface to the first physical DSA conduit which is
 eligible, based on the firmware description (it effectively reverts to the
 startup configuration).
 
 In a setup with more than 2 physical CPU ports, it is therefore possible to mix
-static user to CPU port assignment with LAG between DSA masters. It is not
-possible to statically assign a user port towards a DSA master that has any
-upper interfaces (this includes LAG devices - the master must always be the LAG
+static user to CPU port assignment with LAG between DSA conduits. It is not
+possible to statically assign a user port towards a DSA conduit that has any
+upper interfaces (this includes LAG devices - the conduit must always be the LAG
 in this case).
 
-Live changing of the DSA master (and thus CPU port) affinity of a user port is
+Live changing of the DSA conduit (and thus CPU port) affinity of a user port is
 permitted, in order to allow dynamic redistribution in response to traffic.
 
-Physical DSA masters are allowed to join and leave at any time a LAG interface
-used as a DSA master; however, DSA will reject a LAG interface as a valid
-candidate for being a DSA master unless it has at least one physical DSA master
+Physical DSA conduits are allowed to join and leave at any time a LAG interface
+used as a DSA conduit; however, DSA will reject a LAG interface as a valid
+candidate for being a DSA conduit unless it has at least one physical DSA conduit
 as a slave device.
diff --git a/Documentation/networking/dsa/dsa.rst b/Documentation/networking/dsa/dsa.rst
index a94ddf83348a..7b2e69cd7ef0 100644
--- a/Documentation/networking/dsa/dsa.rst
+++ b/Documentation/networking/dsa/dsa.rst
@@ -25,7 +25,7 @@ presence of a management port connected to an Ethernet controller capable of
 receiving Ethernet frames from the switch. This is a very common setup for all
 kinds of Ethernet switches found in Small Home and Office products: routers,
 gateways, or even top-of-rack switches. This host Ethernet controller will
-be later referred to as "master" and "cpu" in DSA terminology and code.
+be later referred to as "conduit" and "cpu" in DSA terminology and code.
 
 The D in DSA stands for Distributed, because the subsystem has been designed
 with the ability to configure and manage cascaded switches on top of each other
@@ -35,7 +35,7 @@ of multiple switches connected to each other is called a "switch tree".
 
 For each front-panel port, DSA creates specialized network devices which are
 used as controlling and data-flowing endpoints for use by the Linux networking
-stack. These specialized network interfaces are referred to as "slave" network
+stack. These specialized network interfaces are referred to as "user" network
 interfaces in DSA terminology and code.
 
 The ideal case for using DSA is when an Ethernet switch supports a "switch tag"
@@ -56,12 +56,16 @@ Note that DSA does not currently create network interfaces for the "cpu" and
 
 - the "cpu" port is the Ethernet switch facing side of the management
   controller, and as such, would create a duplication of feature, since you
-  would get two interfaces for the same conduit: master netdev, and "cpu" netdev
+  would get two interfaces for the same conduit: conduit netdev, and "cpu" netdev
 
 - the "dsa" port(s) are just conduits between two or more switches, and as such
   cannot really be used as proper network interfaces either, only the
   downstream, or the top-most upstream interface makes sense with that model
 
+NB: for the past 15 years, the DSA subsystem had been making use of the terms
+"master" (rather than "conduit") and "slave" (rather than "user"). These terms
+have been removed from the DSA codebase and phased out of the uAPI.
+
 Switch tagging protocols
 ------------------------
 
@@ -80,14 +84,14 @@ methods of the ``struct dsa_device_ops`` structure, which are detailed below.
 Tagging protocols generally fall in one of three categories:
 
 1. The switch-specific frame header is located before the Ethernet header,
-   shifting to the right (from the perspective of the DSA master's frame
+   shifting to the right (from the perspective of the DSA conduit's frame
    parser) the MAC DA, MAC SA, EtherType and the entire L2 payload.
 2. The switch-specific frame header is located before the EtherType, keeping
-   the MAC DA and MAC SA in place from the DSA master's perspective, but
+   the MAC DA and MAC SA in place from the DSA conduit's perspective, but
    shifting the 'real' EtherType and L2 payload to the right.
 3. The switch-specific frame header is located at the tail of the packet,
    keeping all frame headers in place and not altering the view of the packet
-   that the DSA master's frame parser has.
+   that the DSA conduit's frame parser has.
 
 A tagging protocol may tag all packets with switch tags of the same length, or
 the tag length might vary (for example packets with PTP timestamps might
@@ -95,7 +99,7 @@ require an extended switch tag, or there might be one tag length on TX and a
 different one on RX). Either way, the tagging protocol driver must populate the
 ``struct dsa_device_ops::needed_headroom`` and/or ``struct dsa_device_ops::needed_tailroom``
 with the length in octets of the longest switch frame header/trailer. The DSA
-framework will automatically adjust the MTU of the master interface to
+framework will automatically adjust the MTU of the conduit interface to
 accommodate for this extra size in order for DSA user ports to support the
 standard MTU (L2 payload length) of 1500 octets. The ``needed_headroom`` and
 ``needed_tailroom`` properties are also used to request from the network stack,
@@ -140,18 +144,18 @@ adding or removing the ``ETH_P_EDSA`` EtherType and some padding octets).
 It is possible to construct cascaded setups of DSA switches even if their
 tagging protocols are not compatible with one another. In this case, there are
 no DSA links in this fabric, and each switch constitutes a disjoint DSA switch
-tree. The DSA links are viewed as simply a pair of a DSA master (the out-facing
+tree. The DSA links are viewed as simply a pair of a DSA conduit (the out-facing
 port of the upstream DSA switch) and a CPU port (the in-facing port of the
 downstream DSA switch).
 
 The tagging protocol of the attached DSA switch tree can be viewed through the
-``dsa/tagging`` sysfs attribute of the DSA master::
+``dsa/tagging`` sysfs attribute of the DSA conduit::
 
     cat /sys/class/net/eth0/dsa/tagging
 
 If the hardware and driver are capable, the tagging protocol of the DSA switch
 tree can be changed at runtime. This is done by writing the new tagging
-protocol name to the same sysfs device attribute as above (the DSA master and
+protocol name to the same sysfs device attribute as above (the DSA conduit and
 all attached switch ports must be down while doing this).
 
 It is desirable that all tagging protocols are testable with the ``dsa_loop``
@@ -159,7 +163,7 @@ mockup driver, which can be attached to any network interface. The goal is that
 any network interface should be capable of transmitting the same packet in the
 same way, and the tagger should decode the same received packet in the same way
 regardless of the driver used for the switch control path, and the driver used
-for the DSA master.
+for the DSA conduit.
 
 The transmission of a packet goes through the tagger's ``xmit`` function.
 The passed ``struct sk_buff *skb`` has ``skb->data`` pointing at
@@ -183,44 +187,44 @@ virtual DSA user network interface corresponding to the physical front-facing
 switch port that the packet was received on.
 
 Since tagging protocols in category 1 and 2 break software (and most often also
-hardware) packet dissection on the DSA master, features such as RPS (Receive
-Packet Steering) on the DSA master would be broken. The DSA framework deals
+hardware) packet dissection on the DSA conduit, features such as RPS (Receive
+Packet Steering) on the DSA conduit would be broken. The DSA framework deals
 with this by hooking into the flow dissector and shifting the offset at which
-the IP header is to be found in the tagged frame as seen by the DSA master.
+the IP header is to be found in the tagged frame as seen by the DSA conduit.
 This behavior is automatic based on the ``overhead`` value of the tagging
 protocol. If not all packets are of equal size, the tagger can implement the
 ``flow_dissect`` method of the ``struct dsa_device_ops`` and override this
 default behavior by specifying the correct offset incurred by each individual
 RX packet. Tail taggers do not cause issues to the flow dissector.
 
-Checksum offload should work with category 1 and 2 taggers when the DSA master
+Checksum offload should work with category 1 and 2 taggers when the DSA conduit
 driver declares NETIF_F_HW_CSUM in vlan_features and looks at csum_start and
 csum_offset. For those cases, DSA will shift the checksum start and offset by
-the tag size. If the DSA master driver still uses the legacy NETIF_F_IP_CSUM
+the tag size. If the DSA conduit driver still uses the legacy NETIF_F_IP_CSUM
 or NETIF_F_IPV6_CSUM in vlan_features, the offload might only work if the
 offload hardware already expects that specific tag (perhaps due to matching
-vendors). DSA slaves inherit those flags from the master port, and it is up to
+vendors). DSA user ports inherit those flags from the conduit, and it is up to
 the driver to correctly fall back to software checksum when the IP header is not
 where the hardware expects. If that check is ineffective, the packets might go
 to the network without a proper checksum (the checksum field will have the
 pseudo IP header sum). For category 3, when the offload hardware does not
 already expect the switch tag in use, the checksum must be calculated before any
-tag is inserted (i.e. inside the tagger). Otherwise, the DSA master would
+tag is inserted (i.e. inside the tagger). Otherwise, the DSA conduit would
 include the tail tag in the (software or hardware) checksum calculation. Then,
 when the tag gets stripped by the switch during transmission, it will leave an
 incorrect IP checksum in place.
 
 Due to various reasons (most common being category 1 taggers being associated
-with DSA-unaware masters, mangling what the master perceives as MAC DA), the
-tagging protocol may require the DSA master to operate in promiscuous mode, to
+with DSA-unaware conduits, mangling what the conduit perceives as MAC DA), the
+tagging protocol may require the DSA conduit to operate in promiscuous mode, to
 receive all frames regardless of the value of the MAC DA. This can be done by
-setting the ``promisc_on_master`` property of the ``struct dsa_device_ops``.
-Note that this assumes a DSA-unaware master driver, which is the norm.
+setting the ``promisc_on_conduit`` property of the ``struct dsa_device_ops``.
+Note that this assumes a DSA-unaware conduit driver, which is the norm.
 
-Master network devices
-----------------------
+Conduit network devices
+-----------------------
 
-Master network devices are regular, unmodified Linux network device drivers for
+Conduit network devices are regular, unmodified Linux network device drivers for
 the CPU/management Ethernet interface. Such a driver might occasionally need to
 know whether DSA is enabled (e.g.: to enable/disable specific offload features),
 but the DSA subsystem has been proven to work with industry standard drivers:
@@ -232,14 +236,14 @@ Ethernet switch.
 Networking stack hooks
 ----------------------
 
-When a master netdev is used with DSA, a small hook is placed in the
+When a conduit netdev is used with DSA, a small hook is placed in the
 networking stack is in order to have the DSA subsystem process the Ethernet
 switch specific tagging protocol. DSA accomplishes this by registering a
 specific (and fake) Ethernet type (later becoming ``skb->protocol``) with the
 networking stack, this is also known as a ``ptype`` or ``packet_type``. A typical
 Ethernet Frame receive sequence looks like this:
 
-Master network device (e.g.: e1000e):
+Conduit network device (e.g.: e1000e):
 
 1. Receive interrupt fires:
 
@@ -269,16 +273,16 @@ Master network device (e.g.: e1000e):
 
         - inspect and strip switch tag protocol to determine originating port
         - locate per-port network device
-        - invoke ``eth_type_trans()`` with the DSA slave network device
+        - invoke ``eth_type_trans()`` with the DSA user network device
         - invoked ``netif_receive_skb()``
 
-Past this point, the DSA slave network devices get delivered regular Ethernet
+Past this point, the DSA user network devices get delivered regular Ethernet
 frames that can be processed by the networking stack.
 
-Slave network devices
----------------------
+User network devices
+--------------------
 
-Slave network devices created by DSA are stacked on top of their master network
+User network devices created by DSA are stacked on top of their conduit network
 device, each of these network interfaces will be responsible for being a
 controlling and data-flowing end-point for each front-panel port of the switch.
 These interfaces are specialized in order to:
@@ -289,31 +293,31 @@ These interfaces are specialized in order to:
   Wake-on-LAN, register dumps...
 - manage external/internal PHY: link, auto-negotiation, etc.
 
-These slave network devices have custom net_device_ops and ethtool_ops function
+These user network devices have custom net_device_ops and ethtool_ops function
 pointers which allow DSA to introduce a level of layering between the networking
 stack/ethtool and the switch driver implementation.
 
-Upon frame transmission from these slave network devices, DSA will look up which
+Upon frame transmission from these user network devices, DSA will look up which
 switch tagging protocol is currently registered with these network devices and
 invoke a specific transmit routine which takes care of adding the relevant
 switch tag in the Ethernet frames.
 
-These frames are then queued for transmission using the master network device
+These frames are then queued for transmission using the conduit network device
 ``ndo_start_xmit()`` function. Since they contain the appropriate switch tag, the
 Ethernet switch will be able to process these incoming frames from the
 management interface and deliver them to the physical switch port.
 
 When using multiple CPU ports, it is possible to stack a LAG (bonding/team)
-device between the DSA slave devices and the physical DSA masters. The LAG
-device is thus also a DSA master, but the LAG slave devices continue to be DSA
-masters as well (just with no user port assigned to them; this is needed for
-recovery in case the LAG DSA master disappears). Thus, the data path of the LAG
-DSA master is used asymmetrically. On RX, the ``ETH_P_XDSA`` handler, which
-calls ``dsa_switch_rcv()``, is invoked early (on the physical DSA master;
-LAG slave). Therefore, the RX data path of the LAG DSA master is not used.
-On the other hand, TX takes place linearly: ``dsa_slave_xmit`` calls
-``dsa_enqueue_skb``, which calls ``dev_queue_xmit`` towards the LAG DSA master.
-The latter calls ``dev_queue_xmit`` towards one physical DSA master or the
+device between the DSA user devices and the physical DSA conduits. The LAG
+device is thus also a DSA conduit, but the LAG slave devices continue to be DSA
+conduits as well (just with no user port assigned to them; this is needed for
+recovery in case the LAG DSA conduit disappears). Thus, the data path of the LAG
+DSA conduit is used asymmetrically. On RX, the ``ETH_P_XDSA`` handler, which
+calls ``dsa_switch_rcv()``, is invoked early (on the physical DSA conduit;
+LAG slave). Therefore, the RX data path of the LAG DSA conduit is not used.
+On the other hand, TX takes place linearly: ``dsa_user_xmit`` calls
+``dsa_enqueue_skb``, which calls ``dev_queue_xmit`` towards the LAG DSA conduit.
+The latter calls ``dev_queue_xmit`` towards one physical DSA conduit or the
 other, and in both cases, the packet exits the system through a hardware path
 towards the switch.
 
@@ -352,11 +356,11 @@ perspective::
            || swp0 | | swp1 | | swp2 | | swp3 ||
            ++------+-+------+-+------+-+------++
 
-Slave MDIO bus
---------------
+User MDIO bus
+-------------
 
-In order to be able to read to/from a switch PHY built into it, DSA creates a
-slave MDIO bus which allows a specific switch driver to divert and intercept
+In order to be able to read to/from a switch PHY built into it, DSA creates an
+user MDIO bus which allows a specific switch driver to divert and intercept
 MDIO reads/writes towards specific PHY addresses. In most MDIO-connected
 switches, these functions would utilize direct or indirect PHY addressing mode
 to return standard MII registers from the switch builtin PHYs, allowing the PHY
@@ -364,7 +368,7 @@ library and/or to return link status, link partner pages, auto-negotiation
 results, etc.
 
 For Ethernet switches which have both external and internal MDIO buses, the
-slave MII bus can be utilized to mux/demux MDIO reads and writes towards either
+user MII bus can be utilized to mux/demux MDIO reads and writes towards either
 internal or external MDIO devices this switch might be connected to: internal
 PHYs, external PHYs, or even external switches.
 
@@ -381,10 +385,10 @@ DSA data structures are defined in ``include/net/dsa.h`` as well as
 
 - ``dsa_platform_data``: platform device configuration data which can reference
   a collection of dsa_chip_data structures if multiple switches are cascaded,
-  the master network device this switch tree is attached to needs to be
+  the conduit network device this switch tree is attached to needs to be
   referenced
 
-- ``dsa_switch_tree``: structure assigned to the master network device under
+- ``dsa_switch_tree``: structure assigned to the conduit network device under
   ``dsa_ptr``, this structure references a dsa_platform_data structure as well as
   the tagging protocol supported by the switch tree, and which receive/transmit
   function hooks should be invoked, information about the directly attached
@@ -392,7 +396,7 @@ DSA data structures are defined in ``include/net/dsa.h`` as well as
   referenced to address individual switches in the tree.
 
 - ``dsa_switch``: structure describing a switch device in the tree, referencing
-  a ``dsa_switch_tree`` as a backpointer, slave network devices, master network
+  a ``dsa_switch_tree`` as a backpointer, user network devices, conduit network
   device, and a reference to the backing``dsa_switch_ops``
 
 - ``dsa_switch_ops``: structure referencing function pointers, see below for a
@@ -404,7 +408,7 @@ Design limitations
 Lack of CPU/DSA network devices
 -------------------------------
 
-DSA does not currently create slave network devices for the CPU or DSA ports, as
+DSA does not currently create user network devices for the CPU or DSA ports, as
 described before. This might be an issue in the following cases:
 
 - inability to fetch switch CPU port statistics counters using ethtool, which
@@ -419,7 +423,7 @@ described before. This might be an issue in the following cases:
 Common pitfalls using DSA setups
 --------------------------------
 
-Once a master network device is configured to use DSA (dev->dsa_ptr becomes
+Once a conduit network device is configured to use DSA (dev->dsa_ptr becomes
 non-NULL), and the switch behind it expects a tagging protocol, this network
 interface can only exclusively be used as a conduit interface. Sending packets
 directly through this interface (e.g.: opening a socket using this interface)
@@ -440,7 +444,7 @@ DSA currently leverages the following subsystems:
 MDIO/PHY library
 ----------------
 
-Slave network devices exposed by DSA may or may not be interfacing with PHY
+User network devices exposed by DSA may or may not be interfacing with PHY
 devices (``struct phy_device`` as defined in ``include/linux/phy.h)``, but the DSA
 subsystem deals with all possible combinations:
 
@@ -450,7 +454,7 @@ subsystem deals with all possible combinations:
 - special, non-autonegotiated or non MDIO-managed PHY devices: SFPs, MoCA; a.k.a
   fixed PHYs
 
-The PHY configuration is done by the ``dsa_slave_phy_setup()`` function and the
+The PHY configuration is done by the ``dsa_user_phy_setup()`` function and the
 logic basically looks like this:
 
 - if Device Tree is used, the PHY device is looked up using the standard
@@ -463,7 +467,7 @@ logic basically looks like this:
   and connected transparently using the special fixed MDIO bus driver
 
 - finally, if the PHY is built into the switch, as is very common with
-  standalone switch packages, the PHY is probed using the slave MII bus created
+  standalone switch packages, the PHY is probed using the user MII bus created
   by DSA
 
 
@@ -472,7 +476,7 @@ SWITCHDEV
 
 DSA directly utilizes SWITCHDEV when interfacing with the bridge layer, and
 more specifically with its VLAN filtering portion when configuring VLANs on top
-of per-port slave network devices. As of today, the only SWITCHDEV objects
+of per-port user network devices. As of today, the only SWITCHDEV objects
 supported by DSA are the FDB and VLAN objects.
 
 Devlink
@@ -589,8 +593,8 @@ is torn down when the first switch unregisters.
 It is mandatory for DSA switch drivers to implement the ``shutdown()`` callback
 of their respective bus, and call ``dsa_switch_shutdown()`` from it (a minimal
 version of the full teardown performed by ``dsa_unregister_switch()``).
-The reason is that DSA keeps a reference on the master net device, and if the
-driver for the master device decides to unbind on shutdown, DSA's reference
+The reason is that DSA keeps a reference on the conduit net device, and if the
+driver for the conduit device decides to unbind on shutdown, DSA's reference
 will block that operation from finalizing.
 
 Either ``dsa_switch_shutdown()`` or ``dsa_unregister_switch()`` must be called,
@@ -615,7 +619,7 @@ Switch configuration
   tag formats.
 
 - ``change_tag_protocol``: when the default tagging protocol has compatibility
-  problems with the master or other issues, the driver may support changing it
+  problems with the conduit or other issues, the driver may support changing it
   at runtime, either through a device tree property or through sysfs. In that
   case, further calls to ``get_tag_protocol`` should report the protocol in
   current use.
@@ -643,22 +647,22 @@ Switch configuration
   PHY cannot be found. In this case, probing of the DSA switch continues
   without that particular port.
 
-- ``port_change_master``: method through which the affinity (association used
+- ``port_change_conduit``: method through which the affinity (association used
   for traffic termination purposes) between a user port and a CPU port can be
   changed. By default all user ports from a tree are assigned to the first
   available CPU port that makes sense for them (most of the times this means
   the user ports of a tree are all assigned to the same CPU port, except for H
   topologies as described in commit 2c0b03258b8b). The ``port`` argument
-  represents the index of the user port, and the ``master`` argument represents
-  the new DSA master ``net_device``. The CPU port associated with the new
-  master can be retrieved by looking at ``struct dsa_port *cpu_dp =
-  master->dsa_ptr``. Additionally, the master can also be a LAG device where
-  all the slave devices are physical DSA masters. LAG DSA masters also have a
-  valid ``master->dsa_ptr`` pointer, however this is not unique, but rather a
-  duplicate of the first physical DSA master's (LAG slave) ``dsa_ptr``. In case
-  of a LAG DSA master, a further call to ``port_lag_join`` will be emitted
+  represents the index of the user port, and the ``conduit`` argument represents
+  the new DSA conduit ``net_device``. The CPU port associated with the new
+  conduit can be retrieved by looking at ``struct dsa_port *cpu_dp =
+  conduit->dsa_ptr``. Additionally, the conduit can also be a LAG device where
+  all the slave devices are physical DSA conduits. LAG DSA  also have a
+  valid ``conduit->dsa_ptr`` pointer, however this is not unique, but rather a
+  duplicate of the first physical DSA conduit's (LAG slave) ``dsa_ptr``. In case
+  of a LAG DSA conduit, a further call to ``port_lag_join`` will be emitted
   separately for the physical CPU ports associated with the physical DSA
-  masters, requesting them to create a hardware LAG associated with the LAG
+  conduits, requesting them to create a hardware LAG associated with the LAG
   interface.
 
 PHY devices and link management
@@ -670,16 +674,16 @@ PHY devices and link management
   should return a 32-bit bitmask of "flags" that is private between the switch
   driver and the Ethernet PHY driver in ``drivers/net/phy/\*``.
 
-- ``phy_read``: Function invoked by the DSA slave MDIO bus when attempting to read
+- ``phy_read``: Function invoked by the DSA user MDIO bus when attempting to read
   the switch port MDIO registers. If unavailable, return 0xffff for each read.
   For builtin switch Ethernet PHYs, this function should allow reading the link
   status, auto-negotiation results, link partner pages, etc.
 
-- ``phy_write``: Function invoked by the DSA slave MDIO bus when attempting to write
+- ``phy_write``: Function invoked by the DSA user MDIO bus when attempting to write
   to the switch port MDIO registers. If unavailable return a negative error
   code.
 
-- ``adjust_link``: Function invoked by the PHY library when a slave network device
+- ``adjust_link``: Function invoked by the PHY library when a user network device
   is attached to a PHY device. This function is responsible for appropriately
   configuring the switch port link parameters: speed, duplex, pause based on
   what the ``phy_device`` is providing.
@@ -698,14 +702,14 @@ Ethtool operations
   typically return statistics strings, private flags strings, etc.
 
 - ``get_ethtool_stats``: ethtool function used to query per-port statistics and
-  return their values. DSA overlays slave network devices general statistics:
+  return their values. DSA overlays user network devices general statistics:
   RX/TX counters from the network device, with switch driver specific statistics
   per port
 
 - ``get_sset_count``: ethtool function used to query the number of statistics items
 
 - ``get_wol``: ethtool function used to obtain Wake-on-LAN settings per-port, this
-  function may for certain implementations also query the master network device
+  function may for certain implementations also query the conduit network device
   Wake-on-LAN settings if this interface needs to participate in Wake-on-LAN
 
 - ``set_wol``: ethtool function used to configure Wake-on-LAN settings per-port,
@@ -747,13 +751,13 @@ Power management
   should resume all Ethernet switch activities and re-configure the switch to be
   in a fully active state
 
-- ``port_enable``: function invoked by the DSA slave network device ndo_open
+- ``port_enable``: function invoked by the DSA user network device ndo_open
   function when a port is administratively brought up, this function should
   fully enable a given switch port. DSA takes care of marking the port with
   ``BR_STATE_BLOCKING`` if the port is a bridge member, or ``BR_STATE_FORWARDING`` if it
   was not, and propagating these changes down to the hardware
 
-- ``port_disable``: function invoked by the DSA slave network device ndo_close
+- ``port_disable``: function invoked by the DSA user network device ndo_close
   function when a port is administratively brought down, this function should
   fully disable a given switch port. DSA takes care of marking the port with
   ``BR_STATE_DISABLED`` and propagating changes to the hardware if this port is
diff --git a/Documentation/networking/dsa/lan9303.rst b/Documentation/networking/dsa/lan9303.rst
index e3c820db28ad..ab81b4e0139e 100644
--- a/Documentation/networking/dsa/lan9303.rst
+++ b/Documentation/networking/dsa/lan9303.rst
@@ -4,7 +4,7 @@ LAN9303 Ethernet switch driver
 
 The LAN9303 is a three port 10/100 Mbps ethernet switch with integrated phys for
 the two external ethernet ports. The third port is an RMII/MII interface to a
-host master network interface (e.g. fixed link).
+host conduit network interface (e.g. fixed link).
 
 
 Driver details
diff --git a/Documentation/networking/dsa/sja1105.rst b/Documentation/networking/dsa/sja1105.rst
index e0219c1452ab..8ab60eef07d4 100644
--- a/Documentation/networking/dsa/sja1105.rst
+++ b/Documentation/networking/dsa/sja1105.rst
@@ -79,7 +79,7 @@ The hardware tags all traffic internally with a port-based VLAN (pvid), or it
 decodes the VLAN information from the 802.1Q tag. Advanced VLAN classification
 is not possible. Once attributed a VLAN tag, frames are checked against the
 port's membership rules and dropped at ingress if they don't match any VLAN.
-This behavior is available when switch ports are enslaved to a bridge with
+This behavior is available when switch ports join a bridge with
 ``vlan_filtering 1``.
 
 Normally the hardware is not configurable with respect to VLAN awareness, but
@@ -122,7 +122,7 @@ on egress. Using ``vlan_filtering=1``, the behavior is the other way around:
 offloaded flows can be steered to TX queues based on the VLAN PCP, but the DSA
 net devices are no longer able to do that. To inject frames into a hardware TX
 queue with VLAN awareness active, it is necessary to create a VLAN
-sub-interface on the DSA master port, and send normal (0x8100) VLAN-tagged
+sub-interface on the DSA conduit port, and send normal (0x8100) VLAN-tagged
 towards the switch, with the VLAN PCP bits set appropriately.
 
 Management traffic (having DMAC 01-80-C2-xx-xx-xx or 01-19-1B-xx-xx-xx) is the
@@ -389,7 +389,7 @@ MDIO bus and PHY management
 The SJA1105 does not have an MDIO bus and does not perform in-band AN either.
 Therefore there is no link state notification coming from the switch device.
 A board would need to hook up the PHYs connected to the switch to any other
-MDIO bus available to Linux within the system (e.g. to the DSA master's MDIO
+MDIO bus available to Linux within the system (e.g. to the DSA conduit's MDIO
 bus). Link state management then works by the driver manually keeping in sync
 (over SPI commands) the MAC link speed with the settings negotiated by the PHY.
 
diff --git a/arch/arm64/boot/dts/marvell/armada-3720-espressobin.dtsi b/arch/arm64/boot/dts/marvell/armada-3720-espressobin.dtsi
index 5fc613d24151..49cbdb55b4b3 100644
--- a/arch/arm64/boot/dts/marvell/armada-3720-espressobin.dtsi
+++ b/arch/arm64/boot/dts/marvell/armada-3720-espressobin.dtsi
@@ -13,7 +13,7 @@
 / {
 	aliases {
 		ethernet0 = &eth0;
-		/* for dsa slave device */
+		/* for DSA user port device */
 		ethernet1 = &switch0port1;
 		ethernet2 = &switch0port2;
 		ethernet3 = &switch0port3;
diff --git a/drivers/net/dsa/b53/b53_common.c b/drivers/net/dsa/b53/b53_common.c
index 4e27dc913cf7..0d628b35fd5c 100644
--- a/drivers/net/dsa/b53/b53_common.c
+++ b/drivers/net/dsa/b53/b53_common.c
@@ -757,7 +757,7 @@ int b53_configure_vlan(struct dsa_switch *ds)
 
 	/* Create an untagged VLAN entry for the default PVID in case
 	 * CONFIG_VLAN_8021Q is disabled and there are no calls to
-	 * dsa_slave_vlan_rx_add_vid() to create the default VLAN
+	 * dsa_user_vlan_rx_add_vid() to create the default VLAN
 	 * entry. Do this only when the tagging protocol is not
 	 * DSA_TAG_PROTO_NONE
 	 */
@@ -958,7 +958,7 @@ static struct phy_device *b53_get_phy_device(struct dsa_switch *ds, int port)
 		return NULL;
 	}
 
-	return mdiobus_get_phy(ds->slave_mii_bus, port);
+	return mdiobus_get_phy(ds->user_mii_bus, port);
 }
 
 void b53_get_strings(struct dsa_switch *ds, int port, u32 stringset,
diff --git a/drivers/net/dsa/b53/b53_mdio.c b/drivers/net/dsa/b53/b53_mdio.c
index 4d55d8d18376..897e5e8b3d69 100644
--- a/drivers/net/dsa/b53/b53_mdio.c
+++ b/drivers/net/dsa/b53/b53_mdio.c
@@ -329,7 +329,7 @@ static int b53_mdio_probe(struct mdio_device *mdiodev)
 	 * layer setup
 	 */
 	if (of_machine_is_compatible("brcm,bcm7445d0") &&
-	    strcmp(mdiodev->bus->name, "sf2 slave mii"))
+	    strcmp(mdiodev->bus->name, "sf2 user mii"))
 		return -EPROBE_DEFER;
 
 	dev = b53_switch_alloc(&mdiodev->dev, &b53_mdio_ops, mdiodev->bus);
diff --git a/drivers/net/dsa/bcm_sf2.c b/drivers/net/dsa/bcm_sf2.c
index 326937e91f52..cadee5505c29 100644
--- a/drivers/net/dsa/bcm_sf2.c
+++ b/drivers/net/dsa/bcm_sf2.c
@@ -623,19 +623,19 @@ static int bcm_sf2_mdio_register(struct dsa_switch *ds)
 
 	priv->master_mii_dn = dn;
 
-	priv->slave_mii_bus = mdiobus_alloc();
-	if (!priv->slave_mii_bus) {
+	priv->user_mii_bus = mdiobus_alloc();
+	if (!priv->user_mii_bus) {
 		err = -ENOMEM;
 		goto err_put_master_mii_bus_dev;
 	}
 
-	priv->slave_mii_bus->priv = priv;
-	priv->slave_mii_bus->name = "sf2 slave mii";
-	priv->slave_mii_bus->read = bcm_sf2_sw_mdio_read;
-	priv->slave_mii_bus->write = bcm_sf2_sw_mdio_write;
-	snprintf(priv->slave_mii_bus->id, MII_BUS_ID_SIZE, "sf2-%d",
+	priv->user_mii_bus->priv = priv;
+	priv->user_mii_bus->name = "sf2 user mii";
+	priv->user_mii_bus->read = bcm_sf2_sw_mdio_read;
+	priv->user_mii_bus->write = bcm_sf2_sw_mdio_write;
+	snprintf(priv->user_mii_bus->id, MII_BUS_ID_SIZE, "sf2-%d",
 		 index++);
-	priv->slave_mii_bus->dev.of_node = dn;
+	priv->user_mii_bus->dev.of_node = dn;
 
 	/* Include the pseudo-PHY address to divert reads towards our
 	 * workaround. This is only required for 7445D0, since 7445E0
@@ -653,9 +653,9 @@ static int bcm_sf2_mdio_register(struct dsa_switch *ds)
 		priv->indir_phy_mask = 0;
 
 	ds->phys_mii_mask = priv->indir_phy_mask;
-	ds->slave_mii_bus = priv->slave_mii_bus;
-	priv->slave_mii_bus->parent = ds->dev->parent;
-	priv->slave_mii_bus->phy_mask = ~priv->indir_phy_mask;
+	ds->user_mii_bus = priv->user_mii_bus;
+	priv->user_mii_bus->parent = ds->dev->parent;
+	priv->user_mii_bus->phy_mask = ~priv->indir_phy_mask;
 
 	/* We need to make sure that of_phy_connect() will not work by
 	 * removing the 'phandle' and 'linux,phandle' properties and
@@ -682,14 +682,14 @@ static int bcm_sf2_mdio_register(struct dsa_switch *ds)
 			phy_device_remove(phydev);
 	}
 
-	err = mdiobus_register(priv->slave_mii_bus);
+	err = mdiobus_register(priv->user_mii_bus);
 	if (err && dn)
-		goto err_free_slave_mii_bus;
+		goto err_free_user_mii_bus;
 
 	return 0;
 
-err_free_slave_mii_bus:
-	mdiobus_free(priv->slave_mii_bus);
+err_free_user_mii_bus:
+	mdiobus_free(priv->user_mii_bus);
 err_put_master_mii_bus_dev:
 	put_device(&priv->master_mii_bus->dev);
 err_of_node_put:
@@ -699,10 +699,9 @@ err_of_node_put:
 
 static void bcm_sf2_mdio_unregister(struct bcm_sf2_priv *priv)
 {
-	mdiobus_unregister(priv->slave_mii_bus);
-	mdiobus_free(priv->slave_mii_bus);
+	mdiobus_unregister(priv->user_mii_bus);
+	mdiobus_free(priv->user_mii_bus);
 	put_device(&priv->master_mii_bus->dev);
-	of_node_put(priv->master_mii_dn);
 }
 
 static u32 bcm_sf2_sw_get_phy_flags(struct dsa_switch *ds, int port)
@@ -915,7 +914,7 @@ static void bcm_sf2_sw_fixed_state(struct dsa_switch *ds, int port,
 		 * state machine and make it go in PHY_FORCING state instead.
 		 */
 		if (!status->link)
-			netif_carrier_off(dsa_to_port(ds, port)->slave);
+			netif_carrier_off(dsa_to_port(ds, port)->user);
 		status->duplex = DUPLEX_FULL;
 	} else {
 		status->link = true;
@@ -989,7 +988,7 @@ static int bcm_sf2_sw_resume(struct dsa_switch *ds)
 static void bcm_sf2_sw_get_wol(struct dsa_switch *ds, int port,
 			       struct ethtool_wolinfo *wol)
 {
-	struct net_device *p = dsa_port_to_master(dsa_to_port(ds, port));
+	struct net_device *p = dsa_port_to_conduit(dsa_to_port(ds, port));
 	struct bcm_sf2_priv *priv = bcm_sf2_to_priv(ds);
 	struct ethtool_wolinfo pwol = { };
 
@@ -1013,7 +1012,7 @@ static void bcm_sf2_sw_get_wol(struct dsa_switch *ds, int port,
 static int bcm_sf2_sw_set_wol(struct dsa_switch *ds, int port,
 			      struct ethtool_wolinfo *wol)
 {
-	struct net_device *p = dsa_port_to_master(dsa_to_port(ds, port));
+	struct net_device *p = dsa_port_to_conduit(dsa_to_port(ds, port));
 	struct bcm_sf2_priv *priv = bcm_sf2_to_priv(ds);
 	s8 cpu_port = dsa_to_port(ds, port)->cpu_dp->index;
 	struct ethtool_wolinfo pwol =  { };
diff --git a/drivers/net/dsa/bcm_sf2.h b/drivers/net/dsa/bcm_sf2.h
index 00afc94ce522..424f896b5a6f 100644
--- a/drivers/net/dsa/bcm_sf2.h
+++ b/drivers/net/dsa/bcm_sf2.h
@@ -108,7 +108,7 @@ struct bcm_sf2_priv {
 	/* Master and slave MDIO bus controller */
 	unsigned int			indir_phy_mask;
 	struct device_node		*master_mii_dn;
-	struct mii_bus			*slave_mii_bus;
+	struct mii_bus			*user_mii_bus;
 	struct mii_bus			*master_mii_bus;
 
 	/* Bitmask of ports needing BRCM tags */
diff --git a/drivers/net/dsa/bcm_sf2_cfp.c b/drivers/net/dsa/bcm_sf2_cfp.c
index c4010b7bf089..c88ee3dd4299 100644
--- a/drivers/net/dsa/bcm_sf2_cfp.c
+++ b/drivers/net/dsa/bcm_sf2_cfp.c
@@ -1102,7 +1102,7 @@ static int bcm_sf2_cfp_rule_get_all(struct bcm_sf2_priv *priv,
 int bcm_sf2_get_rxnfc(struct dsa_switch *ds, int port,
 		      struct ethtool_rxnfc *nfc, u32 *rule_locs)
 {
-	struct net_device *p = dsa_port_to_master(dsa_to_port(ds, port));
+	struct net_device *p = dsa_port_to_conduit(dsa_to_port(ds, port));
 	struct bcm_sf2_priv *priv = bcm_sf2_to_priv(ds);
 	int ret = 0;
 
@@ -1145,7 +1145,7 @@ int bcm_sf2_get_rxnfc(struct dsa_switch *ds, int port,
 int bcm_sf2_set_rxnfc(struct dsa_switch *ds, int port,
 		      struct ethtool_rxnfc *nfc)
 {
-	struct net_device *p = dsa_port_to_master(dsa_to_port(ds, port));
+	struct net_device *p = dsa_port_to_conduit(dsa_to_port(ds, port));
 	struct bcm_sf2_priv *priv = bcm_sf2_to_priv(ds);
 	int ret = 0;
 
diff --git a/drivers/net/dsa/lan9303-core.c b/drivers/net/dsa/lan9303-core.c
index ee67adeb2cdb..fcb20eac332a 100644
--- a/drivers/net/dsa/lan9303-core.c
+++ b/drivers/net/dsa/lan9303-core.c
@@ -1084,7 +1084,7 @@ static int lan9303_port_enable(struct dsa_switch *ds, int port,
 	if (!dsa_port_is_user(dp))
 		return 0;
 
-	vlan_vid_add(dsa_port_to_master(dp), htons(ETH_P_8021Q), port);
+	vlan_vid_add(dsa_port_to_conduit(dp), htons(ETH_P_8021Q), port);
 
 	return lan9303_enable_processing_port(chip, port);
 }
@@ -1097,7 +1097,7 @@ static void lan9303_port_disable(struct dsa_switch *ds, int port)
 	if (!dsa_port_is_user(dp))
 		return;
 
-	vlan_vid_del(dsa_port_to_master(dp), htons(ETH_P_8021Q), port);
+	vlan_vid_del(dsa_port_to_conduit(dp), htons(ETH_P_8021Q), port);
 
 	lan9303_disable_processing_port(chip, port);
 	lan9303_phy_write(ds, chip->phy_addr_base + port, MII_BMCR, BMCR_PDOWN);
diff --git a/drivers/net/dsa/lantiq_gswip.c b/drivers/net/dsa/lantiq_gswip.c
index 1a2d5797bf98..9c185c9f0963 100644
--- a/drivers/net/dsa/lantiq_gswip.c
+++ b/drivers/net/dsa/lantiq_gswip.c
@@ -510,22 +510,22 @@ static int gswip_mdio(struct gswip_priv *priv, struct device_node *mdio_np)
 	struct dsa_switch *ds = priv->ds;
 	int err;
 
-	ds->slave_mii_bus = mdiobus_alloc();
-	if (!ds->slave_mii_bus)
+	ds->user_mii_bus = mdiobus_alloc();
+	if (!ds->user_mii_bus)
 		return -ENOMEM;
 
-	ds->slave_mii_bus->priv = priv;
-	ds->slave_mii_bus->read = gswip_mdio_rd;
-	ds->slave_mii_bus->write = gswip_mdio_wr;
-	ds->slave_mii_bus->name = "lantiq,xrx200-mdio";
-	snprintf(ds->slave_mii_bus->id, MII_BUS_ID_SIZE, "%s-mii",
+	ds->user_mii_bus->priv = priv;
+	ds->user_mii_bus->read = gswip_mdio_rd;
+	ds->user_mii_bus->write = gswip_mdio_wr;
+	ds->user_mii_bus->name = "lantiq,xrx200-mdio";
+	snprintf(ds->user_mii_bus->id, MII_BUS_ID_SIZE, "%s-mii",
 		 dev_name(priv->dev));
-	ds->slave_mii_bus->parent = priv->dev;
-	ds->slave_mii_bus->phy_mask = ~ds->phys_mii_mask;
+	ds->user_mii_bus->parent = priv->dev;
+	ds->user_mii_bus->phy_mask = ~ds->phys_mii_mask;
 
-	err = of_mdiobus_register(ds->slave_mii_bus, mdio_np);
+	err = of_mdiobus_register(ds->user_mii_bus, mdio_np);
 	if (err)
-		mdiobus_free(ds->slave_mii_bus);
+		mdiobus_free(ds->user_mii_bus);
 
 	return err;
 }
@@ -2196,8 +2196,8 @@ disable_switch:
 	dsa_unregister_switch(priv->ds);
 mdio_bus:
 	if (mdio_np) {
-		mdiobus_unregister(priv->ds->slave_mii_bus);
-		mdiobus_free(priv->ds->slave_mii_bus);
+		mdiobus_unregister(priv->ds->user_mii_bus);
+		mdiobus_free(priv->ds->user_mii_bus);
 	}
 put_mdio_node:
 	of_node_put(mdio_np);
@@ -2219,10 +2219,10 @@ static void gswip_remove(struct platform_device *pdev)
 
 	dsa_unregister_switch(priv->ds);
 
-	if (priv->ds->slave_mii_bus) {
-		mdiobus_unregister(priv->ds->slave_mii_bus);
-		of_node_put(priv->ds->slave_mii_bus->dev.of_node);
-		mdiobus_free(priv->ds->slave_mii_bus);
+	if (priv->ds->user_mii_bus) {
+		mdiobus_unregister(priv->ds->user_mii_bus);
+		of_node_put(priv->ds->user_mii_bus->dev.of_node);
+		mdiobus_free(priv->ds->user_mii_bus);
 	}
 
 	for (i = 0; i < priv->num_gphy_fw; i++)
diff --git a/drivers/net/dsa/microchip/ksz9477.c b/drivers/net/dsa/microchip/ksz9477.c
index cde8ef33d029..a8b0e528b804 100644
--- a/drivers/net/dsa/microchip/ksz9477.c
+++ b/drivers/net/dsa/microchip/ksz9477.c
@@ -1170,7 +1170,7 @@ int ksz9477_tc_cbs_set_cinc(struct ksz_device *dev, int port, u32 val)
 void ksz9477_hsr_join(struct dsa_switch *ds, int port, struct net_device *hsr)
 {
 	struct ksz_device *dev = ds->priv;
-	struct net_device *slave;
+	struct net_device *user;
 	struct dsa_port *hsr_dp;
 	u8 data, hsr_ports = 0;
 
@@ -1202,8 +1202,8 @@ void ksz9477_hsr_join(struct dsa_switch *ds, int port, struct net_device *hsr)
 	ksz_port_cfg(dev, port, REG_PORT_LUE_CTRL, PORT_SRC_ADDR_FILTER, true);
 
 	/* Setup HW supported features for lan HSR ports */
-	slave = dsa_to_port(ds, port)->slave;
-	slave->features |= KSZ9477_SUPPORTED_HSR_FEATURES;
+	user = dsa_to_port(ds, port)->user;
+	user->features |= KSZ9477_SUPPORTED_HSR_FEATURES;
 }
 
 void ksz9477_hsr_leave(struct dsa_switch *ds, int port, struct net_device *hsr)
diff --git a/drivers/net/dsa/microchip/ksz_common.c b/drivers/net/dsa/microchip/ksz_common.c
index b800ace40ce1..a8025ff988ec 100644
--- a/drivers/net/dsa/microchip/ksz_common.c
+++ b/drivers/net/dsa/microchip/ksz_common.c
@@ -1945,14 +1945,14 @@ static int ksz_irq_phy_setup(struct ksz_device *dev)
 				ret = irq;
 				goto out;
 			}
-			ds->slave_mii_bus->irq[phy] = irq;
+			ds->user_mii_bus->irq[phy] = irq;
 		}
 	}
 	return 0;
 out:
 	while (phy--)
 		if (BIT(phy) & ds->phys_mii_mask)
-			irq_dispose_mapping(ds->slave_mii_bus->irq[phy]);
+			irq_dispose_mapping(ds->user_mii_bus->irq[phy]);
 
 	return ret;
 }
@@ -1964,7 +1964,7 @@ static void ksz_irq_phy_free(struct ksz_device *dev)
 
 	for (phy = 0; phy < KSZ_MAX_NUM_PORTS; phy++)
 		if (BIT(phy) & ds->phys_mii_mask)
-			irq_dispose_mapping(ds->slave_mii_bus->irq[phy]);
+			irq_dispose_mapping(ds->user_mii_bus->irq[phy]);
 }
 
 static int ksz_mdio_register(struct ksz_device *dev)
@@ -1987,12 +1987,12 @@ static int ksz_mdio_register(struct ksz_device *dev)
 	bus->priv = dev;
 	bus->read = ksz_sw_mdio_read;
 	bus->write = ksz_sw_mdio_write;
-	bus->name = "ksz slave smi";
+	bus->name = "ksz user smi";
 	snprintf(bus->id, MII_BUS_ID_SIZE, "SMI-%d", ds->index);
 	bus->parent = ds->dev;
 	bus->phy_mask = ~ds->phys_mii_mask;
 
-	ds->slave_mii_bus = bus;
+	ds->user_mii_bus = bus;
 
 	if (dev->irq > 0) {
 		ret = ksz_irq_phy_setup(dev);
@@ -2344,7 +2344,7 @@ static void ksz_mib_read_work(struct work_struct *work)
 		if (!p->read) {
 			const struct dsa_port *dp = dsa_to_port(dev->ds, i);
 
-			if (!netif_carrier_ok(dp->slave))
+			if (!netif_carrier_ok(dp->user))
 				mib->cnt_ptr = dev->info->reg_mib_cnt;
 		}
 		port_r_cnt(dev, i);
@@ -2464,7 +2464,7 @@ static void ksz_get_ethtool_stats(struct dsa_switch *ds, int port,
 	mutex_lock(&mib->cnt_mutex);
 
 	/* Only read dropped counters if no link. */
-	if (!netif_carrier_ok(dp->slave))
+	if (!netif_carrier_ok(dp->user))
 		mib->cnt_ptr = dev->info->reg_mib_cnt;
 	port_r_cnt(dev, port);
 	memcpy(buf, mib->counters, dev->info->mib_cnt * sizeof(u64));
@@ -2574,7 +2574,7 @@ static int ksz_port_setup(struct dsa_switch *ds, int port)
 	if (!dsa_is_user_port(ds, port))
 		return 0;
 
-	/* setup slave port */
+	/* setup user port */
 	dev->dev_ops->port_setup(dev, port, false);
 
 	/* port_stp_state_set() will be called after to enable the port so
@@ -3567,8 +3567,8 @@ static int ksz_port_set_mac_address(struct dsa_switch *ds, int port,
 static int ksz_switch_macaddr_get(struct dsa_switch *ds, int port,
 				  struct netlink_ext_ack *extack)
 {
-	struct net_device *slave = dsa_to_port(ds, port)->slave;
-	const unsigned char *addr = slave->dev_addr;
+	struct net_device *user = dsa_to_port(ds, port)->user;
+	const unsigned char *addr = user->dev_addr;
 	struct ksz_switch_macaddr *switch_macaddr;
 	struct ksz_device *dev = ds->priv;
 	const u16 *regs = dev->info->regs;
diff --git a/drivers/net/dsa/microchip/ksz_ptp.c b/drivers/net/dsa/microchip/ksz_ptp.c
index 4e22a695a64c..1fe105913c75 100644
--- a/drivers/net/dsa/microchip/ksz_ptp.c
+++ b/drivers/net/dsa/microchip/ksz_ptp.c
@@ -557,7 +557,7 @@ static void ksz_ptp_txtstamp_skb(struct ksz_device *dev,
 	struct skb_shared_hwtstamps hwtstamps = {};
 	int ret;
 
-	/* timeout must include DSA master to transmit data, tstamp latency,
+	/* timeout must include DSA conduit to transmit data, tstamp latency,
 	 * IRQ latency and time for reading the time stamp.
 	 */
 	ret = wait_for_completion_timeout(&prt->tstamp_msg_comp,
diff --git a/drivers/net/dsa/mt7530.c b/drivers/net/dsa/mt7530.c
index ecf5d3deb36e..d27c6b70a2f6 100644
--- a/drivers/net/dsa/mt7530.c
+++ b/drivers/net/dsa/mt7530.c
@@ -1113,7 +1113,7 @@ mt7530_port_change_mtu(struct dsa_switch *ds, int port, int new_mtu)
 	u32 val;
 
 	/* When a new MTU is set, DSA always set the CPU port's MTU to the
-	 * largest MTU of the slave ports. Because the switch only has a global
+	 * largest MTU of the user ports. Because the switch only has a global
 	 * RX length register, only allowing CPU port here is enough.
 	 */
 	if (!dsa_is_cpu_port(ds, port))
@@ -2069,7 +2069,7 @@ mt7530_setup_mdio_irq(struct mt7530_priv *priv)
 			unsigned int irq;
 
 			irq = irq_create_mapping(priv->irq_domain, p);
-			ds->slave_mii_bus->irq[p] = irq;
+			ds->user_mii_bus->irq[p] = irq;
 		}
 	}
 }
@@ -2163,7 +2163,7 @@ mt7530_setup_mdio(struct mt7530_priv *priv)
 	if (!bus)
 		return -ENOMEM;
 
-	ds->slave_mii_bus = bus;
+	ds->user_mii_bus = bus;
 	bus->priv = priv;
 	bus->name = KBUILD_MODNAME "-mii";
 	snprintf(bus->id, MII_BUS_ID_SIZE, KBUILD_MODNAME "-%d", idx++);
@@ -2200,20 +2200,20 @@ mt7530_setup(struct dsa_switch *ds)
 	u32 id, val;
 	int ret, i;
 
-	/* The parent node of master netdev which holds the common system
+	/* The parent node of conduit netdev which holds the common system
 	 * controller also is the container for two GMACs nodes representing
 	 * as two netdev instances.
 	 */
 	dsa_switch_for_each_cpu_port(cpu_dp, ds) {
-		dn = cpu_dp->master->dev.of_node->parent;
+		dn = cpu_dp->conduit->dev.of_node->parent;
 		/* It doesn't matter which CPU port is found first,
-		 * their masters should share the same parent OF node
+		 * their conduits should share the same parent OF node
 		 */
 		break;
 	}
 
 	if (!dn) {
-		dev_err(ds->dev, "parent OF node of DSA master not found");
+		dev_err(ds->dev, "parent OF node of DSA conduit not found");
 		return -EINVAL;
 	}
 
@@ -2488,7 +2488,7 @@ mt7531_setup(struct dsa_switch *ds)
 	if (mt7531_dual_sgmii_supported(priv)) {
 		priv->p5_intf_sel = P5_INTF_SEL_GMAC5_SGMII;
 
-		/* Let ds->slave_mii_bus be able to access external phy. */
+		/* Let ds->user_mii_bus be able to access external phy. */
 		mt7530_rmw(priv, MT7531_GPIO_MODE1, MT7531_GPIO11_RG_RXD2_MASK,
 			   MT7531_EXT_P_MDC_11);
 		mt7530_rmw(priv, MT7531_GPIO_MODE1, MT7531_GPIO12_RG_RXD3_MASK,
@@ -2717,7 +2717,7 @@ mt7531_mac_config(struct dsa_switch *ds, int port, unsigned int mode,
 	case PHY_INTERFACE_MODE_RGMII_RXID:
 	case PHY_INTERFACE_MODE_RGMII_TXID:
 		dp = dsa_to_port(ds, port);
-		phydev = dp->slave->phydev;
+		phydev = dp->user->phydev;
 		return mt7531_rgmii_setup(priv, port, interface, phydev);
 	case PHY_INTERFACE_MODE_SGMII:
 	case PHY_INTERFACE_MODE_NA:
diff --git a/drivers/net/dsa/mv88e6xxx/chip.c b/drivers/net/dsa/mv88e6xxx/chip.c
index ab434a77b059..42b1acaca33a 100644
--- a/drivers/net/dsa/mv88e6xxx/chip.c
+++ b/drivers/net/dsa/mv88e6xxx/chip.c
@@ -2486,7 +2486,7 @@ static int mv88e6xxx_port_vlan_add(struct dsa_switch *ds, int port,
 	else
 		member = MV88E6XXX_G1_VTU_DATA_MEMBER_TAG_TAGGED;
 
-	/* net/dsa/slave.c will call dsa_port_vlan_add() for the affected port
+	/* net/dsa/user.c will call dsa_port_vlan_add() for the affected port
 	 * and then the CPU port. Do not warn for duplicates for the CPU port.
 	 */
 	warn = !dsa_is_cpu_port(ds, port) && !dsa_is_dsa_port(ds, port);
@@ -3719,7 +3719,7 @@ static int mv88e6xxx_setup(struct dsa_switch *ds)
 		return err;
 
 	chip->ds = ds;
-	ds->slave_mii_bus = mv88e6xxx_default_mdio_bus(chip);
+	ds->user_mii_bus = mv88e6xxx_default_mdio_bus(chip);
 
 	/* Since virtual bridges are mapped in the PVT, the number we support
 	 * depends on the physical switch topology. We need to let DSA figure
diff --git a/drivers/net/dsa/ocelot/felix.c b/drivers/net/dsa/ocelot/felix.c
index 9a3e5ec16972..61e95487732d 100644
--- a/drivers/net/dsa/ocelot/felix.c
+++ b/drivers/net/dsa/ocelot/felix.c
@@ -42,22 +42,22 @@ static struct net_device *felix_classify_db(struct dsa_db db)
 	}
 }
 
-static int felix_cpu_port_for_master(struct dsa_switch *ds,
-				     struct net_device *master)
+static int felix_cpu_port_for_conduit(struct dsa_switch *ds,
+				      struct net_device *conduit)
 {
 	struct ocelot *ocelot = ds->priv;
 	struct dsa_port *cpu_dp;
 	int lag;
 
-	if (netif_is_lag_master(master)) {
+	if (netif_is_lag_master(conduit)) {
 		mutex_lock(&ocelot->fwd_domain_lock);
-		lag = ocelot_bond_get_id(ocelot, master);
+		lag = ocelot_bond_get_id(ocelot, conduit);
 		mutex_unlock(&ocelot->fwd_domain_lock);
 
 		return lag;
 	}
 
-	cpu_dp = master->dsa_ptr;
+	cpu_dp = conduit->dsa_ptr;
 	return cpu_dp->index;
 }
 
@@ -366,7 +366,7 @@ static int felix_update_trapping_destinations(struct dsa_switch *ds,
  * is the mode through which frames can be injected from and extracted to an
  * external CPU, over Ethernet. In NXP SoCs, the "external CPU" is the ARM CPU
  * running Linux, and this forms a DSA setup together with the enetc or fman
- * DSA master.
+ * DSA conduit.
  */
 static void felix_npi_port_init(struct ocelot *ocelot, int port)
 {
@@ -441,16 +441,16 @@ static unsigned long felix_tag_npi_get_host_fwd_mask(struct dsa_switch *ds)
 	return BIT(ocelot->num_phys_ports);
 }
 
-static int felix_tag_npi_change_master(struct dsa_switch *ds, int port,
-				       struct net_device *master,
-				       struct netlink_ext_ack *extack)
+static int felix_tag_npi_change_conduit(struct dsa_switch *ds, int port,
+					struct net_device *conduit,
+					struct netlink_ext_ack *extack)
 {
 	struct dsa_port *dp = dsa_to_port(ds, port), *other_dp;
 	struct ocelot *ocelot = ds->priv;
 
-	if (netif_is_lag_master(master)) {
+	if (netif_is_lag_master(conduit)) {
 		NL_SET_ERR_MSG_MOD(extack,
-				   "LAG DSA master only supported using ocelot-8021q");
+				   "LAG DSA conduit only supported using ocelot-8021q");
 		return -EOPNOTSUPP;
 	}
 
@@ -459,24 +459,24 @@ static int felix_tag_npi_change_master(struct dsa_switch *ds, int port,
 	 * come back up until they're all changed to the new one.
 	 */
 	dsa_switch_for_each_user_port(other_dp, ds) {
-		struct net_device *slave = other_dp->slave;
+		struct net_device *user = other_dp->user;
 
-		if (other_dp != dp && (slave->flags & IFF_UP) &&
-		    dsa_port_to_master(other_dp) != master) {
+		if (other_dp != dp && (user->flags & IFF_UP) &&
+		    dsa_port_to_conduit(other_dp) != conduit) {
 			NL_SET_ERR_MSG_MOD(extack,
-					   "Cannot change while old master still has users");
+					   "Cannot change while old conduit still has users");
 			return -EOPNOTSUPP;
 		}
 	}
 
 	felix_npi_port_deinit(ocelot, ocelot->npi);
-	felix_npi_port_init(ocelot, felix_cpu_port_for_master(ds, master));
+	felix_npi_port_init(ocelot, felix_cpu_port_for_conduit(ds, conduit));
 
 	return 0;
 }
 
 /* Alternatively to using the NPI functionality, that same hardware MAC
- * connected internally to the enetc or fman DSA master can be configured to
+ * connected internally to the enetc or fman DSA conduit can be configured to
  * use the software-defined tag_8021q frame format. As far as the hardware is
  * concerned, it thinks it is a "dumb switch" - the queues of the CPU port
  * module are now disconnected from it, but can still be accessed through
@@ -486,7 +486,7 @@ static const struct felix_tag_proto_ops felix_tag_npi_proto_ops = {
 	.setup			= felix_tag_npi_setup,
 	.teardown		= felix_tag_npi_teardown,
 	.get_host_fwd_mask	= felix_tag_npi_get_host_fwd_mask,
-	.change_master		= felix_tag_npi_change_master,
+	.change_conduit		= felix_tag_npi_change_conduit,
 };
 
 static int felix_tag_8021q_setup(struct dsa_switch *ds)
@@ -561,11 +561,11 @@ static unsigned long felix_tag_8021q_get_host_fwd_mask(struct dsa_switch *ds)
 	return dsa_cpu_ports(ds);
 }
 
-static int felix_tag_8021q_change_master(struct dsa_switch *ds, int port,
-					 struct net_device *master,
-					 struct netlink_ext_ack *extack)
+static int felix_tag_8021q_change_conduit(struct dsa_switch *ds, int port,
+					  struct net_device *conduit,
+					  struct netlink_ext_ack *extack)
 {
-	int cpu = felix_cpu_port_for_master(ds, master);
+	int cpu = felix_cpu_port_for_conduit(ds, conduit);
 	struct ocelot *ocelot = ds->priv;
 
 	ocelot_port_unassign_dsa_8021q_cpu(ocelot, port);
@@ -578,7 +578,7 @@ static const struct felix_tag_proto_ops felix_tag_8021q_proto_ops = {
 	.setup			= felix_tag_8021q_setup,
 	.teardown		= felix_tag_8021q_teardown,
 	.get_host_fwd_mask	= felix_tag_8021q_get_host_fwd_mask,
-	.change_master		= felix_tag_8021q_change_master,
+	.change_conduit		= felix_tag_8021q_change_conduit,
 };
 
 static void felix_set_host_flood(struct dsa_switch *ds, unsigned long mask,
@@ -741,14 +741,14 @@ static void felix_port_set_host_flood(struct dsa_switch *ds, int port,
 			     !!felix->host_flood_mc_mask, true);
 }
 
-static int felix_port_change_master(struct dsa_switch *ds, int port,
-				    struct net_device *master,
-				    struct netlink_ext_ack *extack)
+static int felix_port_change_conduit(struct dsa_switch *ds, int port,
+				     struct net_device *conduit,
+				     struct netlink_ext_ack *extack)
 {
 	struct ocelot *ocelot = ds->priv;
 	struct felix *felix = ocelot_to_felix(ocelot);
 
-	return felix->tag_proto_ops->change_master(ds, port, master, extack);
+	return felix->tag_proto_ops->change_conduit(ds, port, conduit, extack);
 }
 
 static int felix_set_ageing_time(struct dsa_switch *ds,
@@ -953,7 +953,7 @@ static int felix_lag_join(struct dsa_switch *ds, int port,
 	if (!dsa_is_cpu_port(ds, port))
 		return 0;
 
-	return felix_port_change_master(ds, port, lag.dev, extack);
+	return felix_port_change_conduit(ds, port, lag.dev, extack);
 }
 
 static int felix_lag_leave(struct dsa_switch *ds, int port,
@@ -967,7 +967,7 @@ static int felix_lag_leave(struct dsa_switch *ds, int port,
 	if (!dsa_is_cpu_port(ds, port))
 		return 0;
 
-	return felix_port_change_master(ds, port, lag.dev, NULL);
+	return felix_port_change_conduit(ds, port, lag.dev, NULL);
 }
 
 static int felix_lag_change(struct dsa_switch *ds, int port)
@@ -1116,10 +1116,10 @@ static int felix_port_enable(struct dsa_switch *ds, int port,
 		return 0;
 
 	if (ocelot->npi >= 0) {
-		struct net_device *master = dsa_port_to_master(dp);
+		struct net_device *conduit = dsa_port_to_conduit(dp);
 
-		if (felix_cpu_port_for_master(ds, master) != ocelot->npi) {
-			dev_err(ds->dev, "Multiple masters are not allowed\n");
+		if (felix_cpu_port_for_conduit(ds, conduit) != ocelot->npi) {
+			dev_err(ds->dev, "Multiple conduits are not allowed\n");
 			return -EINVAL;
 		}
 	}
@@ -2164,7 +2164,7 @@ const struct dsa_switch_ops felix_switch_ops = {
 	.port_add_dscp_prio		= felix_port_add_dscp_prio,
 	.port_del_dscp_prio		= felix_port_del_dscp_prio,
 	.port_set_host_flood		= felix_port_set_host_flood,
-	.port_change_master		= felix_port_change_master,
+	.port_change_conduit		= felix_port_change_conduit,
 };
 EXPORT_SYMBOL_GPL(felix_switch_ops);
 
@@ -2176,7 +2176,7 @@ struct net_device *felix_port_to_netdev(struct ocelot *ocelot, int port)
 	if (!dsa_is_user_port(ds, port))
 		return NULL;
 
-	return dsa_to_port(ds, port)->slave;
+	return dsa_to_port(ds, port)->user;
 }
 EXPORT_SYMBOL_GPL(felix_port_to_netdev);
 
diff --git a/drivers/net/dsa/ocelot/felix.h b/drivers/net/dsa/ocelot/felix.h
index 1d4befe7cfe8..dbf5872fe367 100644
--- a/drivers/net/dsa/ocelot/felix.h
+++ b/drivers/net/dsa/ocelot/felix.h
@@ -77,9 +77,9 @@ struct felix_tag_proto_ops {
 	int (*setup)(struct dsa_switch *ds);
 	void (*teardown)(struct dsa_switch *ds);
 	unsigned long (*get_host_fwd_mask)(struct dsa_switch *ds);
-	int (*change_master)(struct dsa_switch *ds, int port,
-			     struct net_device *master,
-			     struct netlink_ext_ack *extack);
+	int (*change_conduit)(struct dsa_switch *ds, int port,
+			      struct net_device *conduit,
+			      struct netlink_ext_ack *extack);
 };
 
 extern const struct dsa_switch_ops felix_switch_ops;
diff --git a/drivers/net/dsa/qca/qca8k-8xxx.c b/drivers/net/dsa/qca/qca8k-8xxx.c
index 4ce68e655a63..ec57d9d52072 100644
--- a/drivers/net/dsa/qca/qca8k-8xxx.c
+++ b/drivers/net/dsa/qca/qca8k-8xxx.c
@@ -323,14 +323,14 @@ static int qca8k_read_eth(struct qca8k_priv *priv, u32 reg, u32 *val, int len)
 
 	mutex_lock(&mgmt_eth_data->mutex);
 
-	/* Check mgmt_master if is operational */
-	if (!priv->mgmt_master) {
+	/* Check if the mgmt_conduit if is operational */
+	if (!priv->mgmt_conduit) {
 		kfree_skb(skb);
 		mutex_unlock(&mgmt_eth_data->mutex);
 		return -EINVAL;
 	}
 
-	skb->dev = priv->mgmt_master;
+	skb->dev = priv->mgmt_conduit;
 
 	reinit_completion(&mgmt_eth_data->rw_done);
 
@@ -375,14 +375,14 @@ static int qca8k_write_eth(struct qca8k_priv *priv, u32 reg, u32 *val, int len)
 
 	mutex_lock(&mgmt_eth_data->mutex);
 
-	/* Check mgmt_master if is operational */
-	if (!priv->mgmt_master) {
+	/* Check if the mgmt_conduit if is operational */
+	if (!priv->mgmt_conduit) {
 		kfree_skb(skb);
 		mutex_unlock(&mgmt_eth_data->mutex);
 		return -EINVAL;
 	}
 
-	skb->dev = priv->mgmt_master;
+	skb->dev = priv->mgmt_conduit;
 
 	reinit_completion(&mgmt_eth_data->rw_done);
 
@@ -508,7 +508,7 @@ qca8k_bulk_read(void *ctx, const void *reg_buf, size_t reg_len,
 	struct qca8k_priv *priv = ctx;
 	u32 reg = *(u16 *)reg_buf;
 
-	if (priv->mgmt_master &&
+	if (priv->mgmt_conduit &&
 	    !qca8k_read_eth(priv, reg, val_buf, val_len))
 		return 0;
 
@@ -531,7 +531,7 @@ qca8k_bulk_gather_write(void *ctx, const void *reg_buf, size_t reg_len,
 	u32 reg = *(u16 *)reg_buf;
 	u32 *val = (u32 *)val_buf;
 
-	if (priv->mgmt_master &&
+	if (priv->mgmt_conduit &&
 	    !qca8k_write_eth(priv, reg, val, val_len))
 		return 0;
 
@@ -626,7 +626,7 @@ qca8k_phy_eth_command(struct qca8k_priv *priv, bool read, int phy,
 	struct sk_buff *write_skb, *clear_skb, *read_skb;
 	struct qca8k_mgmt_eth_data *mgmt_eth_data;
 	u32 write_val, clear_val = 0, val;
-	struct net_device *mgmt_master;
+	struct net_device *mgmt_conduit;
 	int ret, ret1;
 	bool ack;
 
@@ -683,18 +683,18 @@ qca8k_phy_eth_command(struct qca8k_priv *priv, bool read, int phy,
 	 */
 	mutex_lock(&mgmt_eth_data->mutex);
 
-	/* Check if mgmt_master is operational */
-	mgmt_master = priv->mgmt_master;
-	if (!mgmt_master) {
+	/* Check if mgmt_conduit is operational */
+	mgmt_conduit = priv->mgmt_conduit;
+	if (!mgmt_conduit) {
 		mutex_unlock(&mgmt_eth_data->mutex);
 		mutex_unlock(&priv->bus->mdio_lock);
 		ret = -EINVAL;
-		goto err_mgmt_master;
+		goto err_mgmt_conduit;
 	}
 
-	read_skb->dev = mgmt_master;
-	clear_skb->dev = mgmt_master;
-	write_skb->dev = mgmt_master;
+	read_skb->dev = mgmt_conduit;
+	clear_skb->dev = mgmt_conduit;
+	write_skb->dev = mgmt_conduit;
 
 	reinit_completion(&mgmt_eth_data->rw_done);
 
@@ -780,7 +780,7 @@ exit:
 	return ret;
 
 	/* Error handling before lock */
-err_mgmt_master:
+err_mgmt_conduit:
 	kfree_skb(read_skb);
 err_read_skb:
 	kfree_skb(clear_skb);
@@ -959,12 +959,12 @@ qca8k_mdio_register(struct qca8k_priv *priv)
 		 ds->dst->index, ds->index);
 	bus->parent = ds->dev;
 	bus->phy_mask = ~ds->phys_mii_mask;
-	ds->slave_mii_bus = bus;
+	ds->user_mii_bus = bus;
 
 	/* Check if the devicetree declare the port:phy mapping */
 	mdio = of_get_child_by_name(priv->dev->of_node, "mdio");
 	if (of_device_is_available(mdio)) {
-		bus->name = "qca8k slave mii";
+		bus->name = "qca8k user mii";
 		bus->read = qca8k_internal_mdio_read;
 		bus->write = qca8k_internal_mdio_write;
 		return devm_of_mdiobus_register(priv->dev, bus, mdio);
@@ -973,7 +973,7 @@ qca8k_mdio_register(struct qca8k_priv *priv)
 	/* If a mapping can't be found the legacy mapping is used,
 	 * using the qca8k_port_to_phy function
 	 */
-	bus->name = "qca8k-legacy slave mii";
+	bus->name = "qca8k-legacy user mii";
 	bus->read = qca8k_legacy_mdio_read;
 	bus->write = qca8k_legacy_mdio_write;
 	return devm_mdiobus_register(priv->dev, bus);
@@ -1728,10 +1728,10 @@ qca8k_get_tag_protocol(struct dsa_switch *ds, int port,
 }
 
 static void
-qca8k_master_change(struct dsa_switch *ds, const struct net_device *master,
-		    bool operational)
+qca8k_conduit_change(struct dsa_switch *ds, const struct net_device *conduit,
+		     bool operational)
 {
-	struct dsa_port *dp = master->dsa_ptr;
+	struct dsa_port *dp = conduit->dsa_ptr;
 	struct qca8k_priv *priv = ds->priv;
 
 	/* Ethernet MIB/MDIO is only supported for CPU port 0 */
@@ -1741,7 +1741,7 @@ qca8k_master_change(struct dsa_switch *ds, const struct net_device *master,
 	mutex_lock(&priv->mgmt_eth_data.mutex);
 	mutex_lock(&priv->mib_eth_data.mutex);
 
-	priv->mgmt_master = operational ? (struct net_device *)master : NULL;
+	priv->mgmt_conduit = operational ? (struct net_device *)conduit : NULL;
 
 	mutex_unlock(&priv->mib_eth_data.mutex);
 	mutex_unlock(&priv->mgmt_eth_data.mutex);
@@ -2016,7 +2016,7 @@ static const struct dsa_switch_ops qca8k_switch_ops = {
 	.get_phy_flags		= qca8k_get_phy_flags,
 	.port_lag_join		= qca8k_port_lag_join,
 	.port_lag_leave		= qca8k_port_lag_leave,
-	.master_state_change	= qca8k_master_change,
+	.conduit_state_change	= qca8k_conduit_change,
 	.connect_tag_protocol	= qca8k_connect_tag_protocol,
 };
 
diff --git a/drivers/net/dsa/qca/qca8k-common.c b/drivers/net/dsa/qca/qca8k-common.c
index 9ff0a3c1cb91..9243eff8918d 100644
--- a/drivers/net/dsa/qca/qca8k-common.c
+++ b/drivers/net/dsa/qca/qca8k-common.c
@@ -499,7 +499,7 @@ void qca8k_get_ethtool_stats(struct dsa_switch *ds, int port,
 	u32 hi = 0;
 	int ret;
 
-	if (priv->mgmt_master && priv->info->ops->autocast_mib &&
+	if (priv->mgmt_conduit && priv->info->ops->autocast_mib &&
 	    priv->info->ops->autocast_mib(ds, port, data) > 0)
 		return;
 
@@ -761,7 +761,7 @@ int qca8k_port_change_mtu(struct dsa_switch *ds, int port, int new_mtu)
 	int ret;
 
 	/* We have only have a general MTU setting.
-	 * DSA always set the CPU port's MTU to the largest MTU of the slave
+	 * DSA always set the CPU port's MTU to the largest MTU of the user
 	 * ports.
 	 * Setting MTU just for the CPU port is sufficient to correctly set a
 	 * value for every port.
diff --git a/drivers/net/dsa/qca/qca8k-leds.c b/drivers/net/dsa/qca/qca8k-leds.c
index e8c16e76e34b..90e30c2909e4 100644
--- a/drivers/net/dsa/qca/qca8k-leds.c
+++ b/drivers/net/dsa/qca/qca8k-leds.c
@@ -356,8 +356,8 @@ static struct device *qca8k_cled_hw_control_get_device(struct led_classdev *ldev
 	dp = dsa_to_port(priv->ds, qca8k_phy_to_port(led->port_num));
 	if (!dp)
 		return NULL;
-	if (dp->slave)
-		return &dp->slave->dev;
+	if (dp->user)
+		return &dp->user->dev;
 	return NULL;
 }
 
@@ -429,7 +429,7 @@ qca8k_parse_port_leds(struct qca8k_priv *priv, struct fwnode_handle *port, int p
 		init_data.default_label = ":port";
 		init_data.fwnode = led;
 		init_data.devname_mandatory = true;
-		init_data.devicename = kasprintf(GFP_KERNEL, "%s:0%d", ds->slave_mii_bus->id,
+		init_data.devicename = kasprintf(GFP_KERNEL, "%s:0%d", ds->user_mii_bus->id,
 						 port_num);
 		if (!init_data.devicename)
 			return -ENOMEM;
diff --git a/drivers/net/dsa/qca/qca8k.h b/drivers/net/dsa/qca/qca8k.h
index 8f88b7db384d..2ac7e88f8da5 100644
--- a/drivers/net/dsa/qca/qca8k.h
+++ b/drivers/net/dsa/qca/qca8k.h
@@ -458,7 +458,7 @@ struct qca8k_priv {
 	struct mutex reg_mutex;
 	struct device *dev;
 	struct gpio_desc *reset_gpio;
-	struct net_device *mgmt_master; /* Track if mdio/mib Ethernet is available */
+	struct net_device *mgmt_conduit; /* Track if mdio/mib Ethernet is available */
 	struct qca8k_mgmt_eth_data mgmt_eth_data;
 	struct qca8k_mib_eth_data mib_eth_data;
 	struct qca8k_mdio_cache mdio_cache;
diff --git a/drivers/net/dsa/realtek/realtek-smi.c b/drivers/net/dsa/realtek/realtek-smi.c
index bfd11591faf4..755546ed8db6 100644
--- a/drivers/net/dsa/realtek/realtek-smi.c
+++ b/drivers/net/dsa/realtek/realtek-smi.c
@@ -378,25 +378,25 @@ static int realtek_smi_setup_mdio(struct dsa_switch *ds)
 		return -ENODEV;
 	}
 
-	priv->slave_mii_bus = devm_mdiobus_alloc(priv->dev);
-	if (!priv->slave_mii_bus) {
+	priv->user_mii_bus = devm_mdiobus_alloc(priv->dev);
+	if (!priv->user_mii_bus) {
 		ret = -ENOMEM;
 		goto err_put_node;
 	}
-	priv->slave_mii_bus->priv = priv;
-	priv->slave_mii_bus->name = "SMI slave MII";
-	priv->slave_mii_bus->read = realtek_smi_mdio_read;
-	priv->slave_mii_bus->write = realtek_smi_mdio_write;
-	snprintf(priv->slave_mii_bus->id, MII_BUS_ID_SIZE, "SMI-%d",
+	priv->user_mii_bus->priv = priv;
+	priv->user_mii_bus->name = "SMI user MII";
+	priv->user_mii_bus->read = realtek_smi_mdio_read;
+	priv->user_mii_bus->write = realtek_smi_mdio_write;
+	snprintf(priv->user_mii_bus->id, MII_BUS_ID_SIZE, "SMI-%d",
 		 ds->index);
-	priv->slave_mii_bus->dev.of_node = mdio_np;
-	priv->slave_mii_bus->parent = priv->dev;
-	ds->slave_mii_bus = priv->slave_mii_bus;
+	priv->user_mii_bus->dev.of_node = mdio_np;
+	priv->user_mii_bus->parent = priv->dev;
+	ds->user_mii_bus = priv->user_mii_bus;
 
-	ret = devm_of_mdiobus_register(priv->dev, priv->slave_mii_bus, mdio_np);
+	ret = devm_of_mdiobus_register(priv->dev, priv->user_mii_bus, mdio_np);
 	if (ret) {
 		dev_err(priv->dev, "unable to register MDIO bus %s\n",
-			priv->slave_mii_bus->id);
+			priv->user_mii_bus->id);
 		goto err_put_node;
 	}
 
@@ -514,8 +514,8 @@ static void realtek_smi_remove(struct platform_device *pdev)
 		return;
 
 	dsa_unregister_switch(priv->ds);
-	if (priv->slave_mii_bus)
-		of_node_put(priv->slave_mii_bus->dev.of_node);
+	if (priv->user_mii_bus)
+		of_node_put(priv->user_mii_bus->dev.of_node);
 
 	/* leave the device reset asserted */
 	if (priv->reset)
diff --git a/drivers/net/dsa/realtek/realtek.h b/drivers/net/dsa/realtek/realtek.h
index 4fa7c6ba874a..790488e9c667 100644
--- a/drivers/net/dsa/realtek/realtek.h
+++ b/drivers/net/dsa/realtek/realtek.h
@@ -54,7 +54,7 @@ struct realtek_priv {
 	struct regmap		*map;
 	struct regmap		*map_nolock;
 	struct mutex		map_lock;
-	struct mii_bus		*slave_mii_bus;
+	struct mii_bus		*user_mii_bus;
 	struct mii_bus		*bus;
 	int			mdio_addr;
 
diff --git a/drivers/net/dsa/realtek/rtl8365mb.c b/drivers/net/dsa/realtek/rtl8365mb.c
index d171c18dd354..0875e4fc9f57 100644
--- a/drivers/net/dsa/realtek/rtl8365mb.c
+++ b/drivers/net/dsa/realtek/rtl8365mb.c
@@ -1144,7 +1144,7 @@ static int rtl8365mb_port_change_mtu(struct dsa_switch *ds, int port,
 	int frame_size;
 
 	/* When a new MTU is set, DSA always sets the CPU port's MTU to the
-	 * largest MTU of the slave ports. Because the switch only has a global
+	 * largest MTU of the user ports. Because the switch only has a global
 	 * RX length register, only allowing CPU port here is enough.
 	 */
 	if (!dsa_is_cpu_port(ds, port))
diff --git a/drivers/net/dsa/sja1105/sja1105_main.c b/drivers/net/dsa/sja1105/sja1105_main.c
index 1a367e64bc3b..74cee39d73df 100644
--- a/drivers/net/dsa/sja1105/sja1105_main.c
+++ b/drivers/net/dsa/sja1105/sja1105_main.c
@@ -2688,7 +2688,7 @@ static int sja1105_mgmt_xmit(struct dsa_switch *ds, int port, int slot,
 	}
 
 	/* Transfer skb to the host port. */
-	dsa_enqueue_skb(skb, dsa_to_port(ds, port)->slave);
+	dsa_enqueue_skb(skb, dsa_to_port(ds, port)->user);
 
 	/* Wait until the switch has processed the frame */
 	do {
@@ -3081,7 +3081,7 @@ static int sja1105_port_bridge_flags(struct dsa_switch *ds, int port,
  * ref_clk pin. So port clocking needs to be initialized early, before
  * connecting to PHYs is attempted, otherwise they won't respond through MDIO.
  * Setting correct PHY link speed does not matter now.
- * But dsa_slave_phy_setup is called later than sja1105_setup, so the PHY
+ * But dsa_user_phy_setup is called later than sja1105_setup, so the PHY
  * bindings are not yet parsed by DSA core. We need to parse early so that we
  * can populate the xMII mode parameters table.
  */
diff --git a/drivers/net/dsa/xrs700x/xrs700x.c b/drivers/net/dsa/xrs700x/xrs700x.c
index 5b02e9e426fd..96db032b478f 100644
--- a/drivers/net/dsa/xrs700x/xrs700x.c
+++ b/drivers/net/dsa/xrs700x/xrs700x.c
@@ -554,7 +554,7 @@ static int xrs700x_hsr_join(struct dsa_switch *ds, int port,
 	unsigned int val = XRS_HSR_CFG_HSR_PRP;
 	struct dsa_port *partner = NULL, *dp;
 	struct xrs700x *priv = ds->priv;
-	struct net_device *slave;
+	struct net_device *user;
 	int ret, i, hsr_pair[2];
 	enum hsr_version ver;
 	bool fwd = false;
@@ -638,8 +638,8 @@ static int xrs700x_hsr_join(struct dsa_switch *ds, int port,
 	hsr_pair[0] = port;
 	hsr_pair[1] = partner->index;
 	for (i = 0; i < ARRAY_SIZE(hsr_pair); i++) {
-		slave = dsa_to_port(ds, hsr_pair[i])->slave;
-		slave->features |= XRS7000X_SUPPORTED_HSR_FEATURES;
+		user = dsa_to_port(ds, hsr_pair[i])->user;
+		user->features |= XRS7000X_SUPPORTED_HSR_FEATURES;
 	}
 
 	return 0;
@@ -650,7 +650,7 @@ static int xrs700x_hsr_leave(struct dsa_switch *ds, int port,
 {
 	struct dsa_port *partner = NULL, *dp;
 	struct xrs700x *priv = ds->priv;
-	struct net_device *slave;
+	struct net_device *user;
 	int i, hsr_pair[2];
 	unsigned int val;
 
@@ -692,8 +692,8 @@ static int xrs700x_hsr_leave(struct dsa_switch *ds, int port,
 	hsr_pair[0] = port;
 	hsr_pair[1] = partner->index;
 	for (i = 0; i < ARRAY_SIZE(hsr_pair); i++) {
-		slave = dsa_to_port(ds, hsr_pair[i])->slave;
-		slave->features &= ~XRS7000X_SUPPORTED_HSR_FEATURES;
+		user = dsa_to_port(ds, hsr_pair[i])->user;
+		user->features &= ~XRS7000X_SUPPORTED_HSR_FEATURES;
 	}
 
 	return 0;
diff --git a/drivers/net/ethernet/broadcom/bcmsysport.c b/drivers/net/ethernet/broadcom/bcmsysport.c
index ab096795e805..c9faa8540859 100644
--- a/drivers/net/ethernet/broadcom/bcmsysport.c
+++ b/drivers/net/ethernet/broadcom/bcmsysport.c
@@ -2430,7 +2430,7 @@ static int bcm_sysport_netdevice_event(struct notifier_block *nb,
 		if (dev->netdev_ops != &bcm_sysport_netdev_ops)
 			return NOTIFY_DONE;
 
-		if (!dsa_slave_dev_check(info->upper_dev))
+		if (!dsa_user_dev_check(info->upper_dev))
 			return NOTIFY_DONE;
 
 		if (info->linking)
diff --git a/drivers/net/ethernet/mediatek/mtk_eth_soc.c b/drivers/net/ethernet/mediatek/mtk_eth_soc.c
index 60d49b0f595f..3cf6589cfdac 100644
--- a/drivers/net/ethernet/mediatek/mtk_eth_soc.c
+++ b/drivers/net/ethernet/mediatek/mtk_eth_soc.c
@@ -3329,7 +3329,7 @@ static int mtk_device_event(struct notifier_block *n, unsigned long event, void
 	return NOTIFY_DONE;
 
 found:
-	if (!dsa_slave_dev_check(dev))
+	if (!dsa_user_dev_check(dev))
 		return NOTIFY_DONE;
 
 	if (__ethtool_get_link_ksettings(dev, &s))
diff --git a/drivers/net/ethernet/mediatek/mtk_ppe_offload.c b/drivers/net/ethernet/mediatek/mtk_ppe_offload.c
index e073d2b5542c..fbb5e9d5af13 100644
--- a/drivers/net/ethernet/mediatek/mtk_ppe_offload.c
+++ b/drivers/net/ethernet/mediatek/mtk_ppe_offload.c
@@ -175,7 +175,7 @@ mtk_flow_get_dsa_port(struct net_device **dev)
 	if (dp->cpu_dp->tag_ops->proto != DSA_TAG_PROTO_MTK)
 		return -ENODEV;
 
-	*dev = dsa_port_to_master(dp);
+	*dev = dsa_port_to_conduit(dp);
 
 	return dp->index;
 #else
diff --git a/include/linux/dsa/sja1105.h b/include/linux/dsa/sja1105.h
index c177322f793d..b9dd35d4b8f5 100644
--- a/include/linux/dsa/sja1105.h
+++ b/include/linux/dsa/sja1105.h
@@ -28,7 +28,7 @@
 /* Source and Destination MAC of follow-up meta frames.
  * Whereas the choice of SMAC only affects the unique identification of the
  * switch as sender of meta frames, the DMAC must be an address that is present
- * in the DSA master port's multicast MAC filter.
+ * in the DSA conduit port's multicast MAC filter.
  * 01-80-C2-00-00-0E is a good choice for this, as all profiles of IEEE 1588
  * over L2 use this address for some purpose already.
  */
diff --git a/include/net/dsa.h b/include/net/dsa.h
index d98439ea6146..82135fbdb1e6 100644
--- a/include/net/dsa.h
+++ b/include/net/dsa.h
@@ -102,11 +102,11 @@ struct dsa_device_ops {
 	const char *name;
 	enum dsa_tag_protocol proto;
 	/* Some tagging protocols either mangle or shift the destination MAC
-	 * address, in which case the DSA master would drop packets on ingress
+	 * address, in which case the DSA conduit would drop packets on ingress
 	 * if what it understands out of the destination MAC address is not in
 	 * its RX filter.
 	 */
-	bool promisc_on_master;
+	bool promisc_on_conduit;
 };
 
 struct dsa_lag {
@@ -236,12 +236,12 @@ struct dsa_bridge {
 };
 
 struct dsa_port {
-	/* A CPU port is physically connected to a master device.
-	 * A user port exposed to userspace has a slave device.
+	/* A CPU port is physically connected to a conduit device. A user port
+	 * exposes a network device to user-space, called 'user' here.
 	 */
 	union {
-		struct net_device *master;
-		struct net_device *slave;
+		struct net_device *conduit;
+		struct net_device *user;
 	};
 
 	/* Copy of the tagging protocol operations, for quicker access
@@ -249,7 +249,7 @@ struct dsa_port {
 	 */
 	const struct dsa_device_ops *tag_ops;
 
-	/* Copies for faster access in master receive hot path */
+	/* Copies for faster access in conduit receive hot path */
 	struct dsa_switch_tree *dst;
 	struct sk_buff *(*rcv)(struct sk_buff *skb, struct net_device *dev);
 
@@ -281,9 +281,9 @@ struct dsa_port {
 
 	u8			lag_tx_enabled:1;
 
-	/* Master state bits, valid only on CPU ports */
-	u8			master_admin_up:1;
-	u8			master_oper_up:1;
+	/* conduit state bits, valid only on CPU ports */
+	u8			conduit_admin_up:1;
+	u8			conduit_oper_up:1;
 
 	/* Valid only on user ports */
 	u8			cpu_port_in_lag:1;
@@ -303,7 +303,7 @@ struct dsa_port {
 	struct list_head list;
 
 	/*
-	 * Original copy of the master netdev ethtool_ops
+	 * Original copy of the conduit netdev ethtool_ops
 	 */
 	const struct ethtool_ops *orig_ethtool_ops;
 
@@ -452,10 +452,10 @@ struct dsa_switch {
 	const struct dsa_switch_ops	*ops;
 
 	/*
-	 * Slave mii_bus and devices for the individual ports.
+	 * User mii_bus and devices for the individual ports.
 	 */
 	u32			phys_mii_mask;
-	struct mii_bus		*slave_mii_bus;
+	struct mii_bus		*user_mii_bus;
 
 	/* Ageing Time limits in msecs */
 	unsigned int ageing_time_min;
@@ -520,10 +520,10 @@ static inline bool dsa_port_is_unused(struct dsa_port *dp)
 	return dp->type == DSA_PORT_TYPE_UNUSED;
 }
 
-static inline bool dsa_port_master_is_operational(struct dsa_port *dp)
+static inline bool dsa_port_conduit_is_operational(struct dsa_port *dp)
 {
-	return dsa_port_is_cpu(dp) && dp->master_admin_up &&
-	       dp->master_oper_up;
+	return dsa_port_is_cpu(dp) && dp->conduit_admin_up &&
+	       dp->conduit_oper_up;
 }
 
 static inline bool dsa_is_unused_port(struct dsa_switch *ds, int p)
@@ -713,12 +713,12 @@ static inline bool dsa_port_offloads_lag(struct dsa_port *dp,
 	return dsa_port_lag_dev_get(dp) == lag->dev;
 }
 
-static inline struct net_device *dsa_port_to_master(const struct dsa_port *dp)
+static inline struct net_device *dsa_port_to_conduit(const struct dsa_port *dp)
 {
 	if (dp->cpu_port_in_lag)
 		return dsa_port_lag_dev_get(dp->cpu_dp);
 
-	return dp->cpu_dp->master;
+	return dp->cpu_dp->conduit;
 }
 
 static inline
@@ -732,7 +732,7 @@ struct net_device *dsa_port_to_bridge_port(const struct dsa_port *dp)
 	else if (dp->hsr_dev)
 		return dp->hsr_dev;
 
-	return dp->slave;
+	return dp->user;
 }
 
 static inline struct net_device *
@@ -834,9 +834,9 @@ struct dsa_switch_ops {
 	int	(*connect_tag_protocol)(struct dsa_switch *ds,
 					enum dsa_tag_protocol proto);
 
-	int	(*port_change_master)(struct dsa_switch *ds, int port,
-				      struct net_device *master,
-				      struct netlink_ext_ack *extack);
+	int	(*port_change_conduit)(struct dsa_switch *ds, int port,
+				       struct net_device *conduit,
+				       struct netlink_ext_ack *extack);
 
 	/* Optional switch-wide initialization and destruction methods */
 	int	(*setup)(struct dsa_switch *ds);
@@ -1233,11 +1233,11 @@ struct dsa_switch_ops {
 	int	(*tag_8021q_vlan_del)(struct dsa_switch *ds, int port, u16 vid);
 
 	/*
-	 * DSA master tracking operations
+	 * DSA conduit tracking operations
 	 */
-	void	(*master_state_change)(struct dsa_switch *ds,
-				       const struct net_device *master,
-				       bool operational);
+	void	(*conduit_state_change)(struct dsa_switch *ds,
+					const struct net_device *conduit,
+					bool operational);
 };
 
 #define DSA_DEVLINK_PARAM_DRIVER(_id, _name, _type, _cmodes)		\
@@ -1374,9 +1374,9 @@ static inline int dsa_switch_resume(struct dsa_switch *ds)
 #endif /* CONFIG_PM_SLEEP */
 
 #if IS_ENABLED(CONFIG_NET_DSA)
-bool dsa_slave_dev_check(const struct net_device *dev);
+bool dsa_user_dev_check(const struct net_device *dev);
 #else
-static inline bool dsa_slave_dev_check(const struct net_device *dev)
+static inline bool dsa_user_dev_check(const struct net_device *dev)
 {
 	return false;
 }
diff --git a/include/net/dsa_stubs.h b/include/net/dsa_stubs.h
index 361811750a54..6f384897f287 100644
--- a/include/net/dsa_stubs.h
+++ b/include/net/dsa_stubs.h
@@ -13,14 +13,14 @@
 extern const struct dsa_stubs *dsa_stubs;
 
 struct dsa_stubs {
-	int (*master_hwtstamp_validate)(struct net_device *dev,
-					const struct kernel_hwtstamp_config *config,
-					struct netlink_ext_ack *extack);
+	int (*conduit_hwtstamp_validate)(struct net_device *dev,
+					 const struct kernel_hwtstamp_config *config,
+					 struct netlink_ext_ack *extack);
 };
 
-static inline int dsa_master_hwtstamp_validate(struct net_device *dev,
-					       const struct kernel_hwtstamp_config *config,
-					       struct netlink_ext_ack *extack)
+static inline int dsa_conduit_hwtstamp_validate(struct net_device *dev,
+						const struct kernel_hwtstamp_config *config,
+						struct netlink_ext_ack *extack)
 {
 	if (!netdev_uses_dsa(dev))
 		return 0;
@@ -29,18 +29,18 @@ static inline int dsa_master_hwtstamp_validate(struct net_device *dev,
 	 * netdev_uses_dsa() returns true, the dsa_core module is still
 	 * registered, and so, dsa_unregister_stubs() couldn't have run.
 	 * For netdev_uses_dsa() to start returning false, it would imply that
-	 * dsa_master_teardown() has executed, which requires rtnl_lock().
+	 * dsa_conduit_teardown() has executed, which requires rtnl_lock().
 	 */
 	ASSERT_RTNL();
 
-	return dsa_stubs->master_hwtstamp_validate(dev, config, extack);
+	return dsa_stubs->conduit_hwtstamp_validate(dev, config, extack);
 }
 
 #else
 
-static inline int dsa_master_hwtstamp_validate(struct net_device *dev,
-					       const struct kernel_hwtstamp_config *config,
-					       struct netlink_ext_ack *extack)
+static inline int dsa_conduit_hwtstamp_validate(struct net_device *dev,
+						const struct kernel_hwtstamp_config *config,
+						struct netlink_ext_ack *extack)
 {
 	return 0;
 }
diff --git a/net/core/dev_ioctl.c b/net/core/dev_ioctl.c
index b46aedc36939..feeddf95f450 100644
--- a/net/core/dev_ioctl.c
+++ b/net/core/dev_ioctl.c
@@ -382,7 +382,7 @@ static int dev_set_hwtstamp(struct net_device *dev, struct ifreq *ifr)
 	if (err)
 		return err;
 
-	err = dsa_master_hwtstamp_validate(dev, &kernel_cfg, &extack);
+	err = dsa_conduit_hwtstamp_validate(dev, &kernel_cfg, &extack);
 	if (err) {
 		if (extack._msg)
 			netdev_err(dev, "%s\n", extack._msg);
diff --git a/net/dsa/Makefile b/net/dsa/Makefile
index 12e305824a96..8a1894a42552 100644
--- a/net/dsa/Makefile
+++ b/net/dsa/Makefile
@@ -8,16 +8,16 @@ endif
 # the core
 obj-$(CONFIG_NET_DSA) += dsa_core.o
 dsa_core-y += \
+	conduit.o \
 	devlink.o \
 	dsa.o \
-	master.o \
 	netlink.o \
 	port.o \
-	slave.o \
 	switch.o \
 	tag.o \
 	tag_8021q.o \
-	trace.o
+	trace.o \
+	user.o
 
 # tagging formats
 obj-$(CONFIG_NET_DSA_TAG_AR9331) += tag_ar9331.o
diff --git a/net/dsa/conduit.c b/net/dsa/conduit.c
new file mode 100644
index 000000000000..3dfdb3cb47dc
--- /dev/null
+++ b/net/dsa/conduit.c
@@ -0,0 +1,475 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Handling of a conduit device, switching frames via its switch fabric CPU port
+ *
+ * Copyright (c) 2017 Savoir-faire Linux Inc.
+ *	Vivien Didelot <vivien.didelot@savoirfairelinux.com>
+ */
+
+#include <linux/ethtool.h>
+#include <linux/netdevice.h>
+#include <linux/netlink.h>
+#include <net/dsa.h>
+
+#include "conduit.h"
+#include "dsa.h"
+#include "port.h"
+#include "tag.h"
+
+static int dsa_conduit_get_regs_len(struct net_device *dev)
+{
+	struct dsa_port *cpu_dp = dev->dsa_ptr;
+	const struct ethtool_ops *ops = cpu_dp->orig_ethtool_ops;
+	struct dsa_switch *ds = cpu_dp->ds;
+	int port = cpu_dp->index;
+	int ret = 0;
+	int len;
+
+	if (ops->get_regs_len) {
+		len = ops->get_regs_len(dev);
+		if (len < 0)
+			return len;
+		ret += len;
+	}
+
+	ret += sizeof(struct ethtool_drvinfo);
+	ret += sizeof(struct ethtool_regs);
+
+	if (ds->ops->get_regs_len) {
+		len = ds->ops->get_regs_len(ds, port);
+		if (len < 0)
+			return len;
+		ret += len;
+	}
+
+	return ret;
+}
+
+static void dsa_conduit_get_regs(struct net_device *dev,
+				 struct ethtool_regs *regs, void *data)
+{
+	struct dsa_port *cpu_dp = dev->dsa_ptr;
+	const struct ethtool_ops *ops = cpu_dp->orig_ethtool_ops;
+	struct dsa_switch *ds = cpu_dp->ds;
+	struct ethtool_drvinfo *cpu_info;
+	struct ethtool_regs *cpu_regs;
+	int port = cpu_dp->index;
+	int len;
+
+	if (ops->get_regs_len && ops->get_regs) {
+		len = ops->get_regs_len(dev);
+		if (len < 0)
+			return;
+		regs->len = len;
+		ops->get_regs(dev, regs, data);
+		data += regs->len;
+	}
+
+	cpu_info = (struct ethtool_drvinfo *)data;
+	strscpy(cpu_info->driver, "dsa", sizeof(cpu_info->driver));
+	data += sizeof(*cpu_info);
+	cpu_regs = (struct ethtool_regs *)data;
+	data += sizeof(*cpu_regs);
+
+	if (ds->ops->get_regs_len && ds->ops->get_regs) {
+		len = ds->ops->get_regs_len(ds, port);
+		if (len < 0)
+			return;
+		cpu_regs->len = len;
+		ds->ops->get_regs(ds, port, cpu_regs, data);
+	}
+}
+
+static void dsa_conduit_get_ethtool_stats(struct net_device *dev,
+					  struct ethtool_stats *stats,
+					  uint64_t *data)
+{
+	struct dsa_port *cpu_dp = dev->dsa_ptr;
+	const struct ethtool_ops *ops = cpu_dp->orig_ethtool_ops;
+	struct dsa_switch *ds = cpu_dp->ds;
+	int port = cpu_dp->index;
+	int count = 0;
+
+	if (ops->get_sset_count && ops->get_ethtool_stats) {
+		count = ops->get_sset_count(dev, ETH_SS_STATS);
+		ops->get_ethtool_stats(dev, stats, data);
+	}
+
+	if (ds->ops->get_ethtool_stats)
+		ds->ops->get_ethtool_stats(ds, port, data + count);
+}
+
+static void dsa_conduit_get_ethtool_phy_stats(struct net_device *dev,
+					      struct ethtool_stats *stats,
+					      uint64_t *data)
+{
+	struct dsa_port *cpu_dp = dev->dsa_ptr;
+	const struct ethtool_ops *ops = cpu_dp->orig_ethtool_ops;
+	struct dsa_switch *ds = cpu_dp->ds;
+	int port = cpu_dp->index;
+	int count = 0;
+
+	if (dev->phydev && !ops->get_ethtool_phy_stats) {
+		count = phy_ethtool_get_sset_count(dev->phydev);
+		if (count >= 0)
+			phy_ethtool_get_stats(dev->phydev, stats, data);
+	} else if (ops->get_sset_count && ops->get_ethtool_phy_stats) {
+		count = ops->get_sset_count(dev, ETH_SS_PHY_STATS);
+		ops->get_ethtool_phy_stats(dev, stats, data);
+	}
+
+	if (count < 0)
+		count = 0;
+
+	if (ds->ops->get_ethtool_phy_stats)
+		ds->ops->get_ethtool_phy_stats(ds, port, data + count);
+}
+
+static int dsa_conduit_get_sset_count(struct net_device *dev, int sset)
+{
+	struct dsa_port *cpu_dp = dev->dsa_ptr;
+	const struct ethtool_ops *ops = cpu_dp->orig_ethtool_ops;
+	struct dsa_switch *ds = cpu_dp->ds;
+	int count = 0;
+
+	if (sset == ETH_SS_PHY_STATS && dev->phydev &&
+	    !ops->get_ethtool_phy_stats)
+		count = phy_ethtool_get_sset_count(dev->phydev);
+	else if (ops->get_sset_count)
+		count = ops->get_sset_count(dev, sset);
+
+	if (count < 0)
+		count = 0;
+
+	if (ds->ops->get_sset_count)
+		count += ds->ops->get_sset_count(ds, cpu_dp->index, sset);
+
+	return count;
+}
+
+static void dsa_conduit_get_strings(struct net_device *dev, uint32_t stringset,
+				    uint8_t *data)
+{
+	struct dsa_port *cpu_dp = dev->dsa_ptr;
+	const struct ethtool_ops *ops = cpu_dp->orig_ethtool_ops;
+	struct dsa_switch *ds = cpu_dp->ds;
+	int port = cpu_dp->index;
+	int len = ETH_GSTRING_LEN;
+	int mcount = 0, count, i;
+	uint8_t pfx[4];
+	uint8_t *ndata;
+
+	snprintf(pfx, sizeof(pfx), "p%.2d", port);
+	/* We do not want to be NULL-terminated, since this is a prefix */
+	pfx[sizeof(pfx) - 1] = '_';
+
+	if (stringset == ETH_SS_PHY_STATS && dev->phydev &&
+	    !ops->get_ethtool_phy_stats) {
+		mcount = phy_ethtool_get_sset_count(dev->phydev);
+		if (mcount < 0)
+			mcount = 0;
+		else
+			phy_ethtool_get_strings(dev->phydev, data);
+	} else if (ops->get_sset_count && ops->get_strings) {
+		mcount = ops->get_sset_count(dev, stringset);
+		if (mcount < 0)
+			mcount = 0;
+		ops->get_strings(dev, stringset, data);
+	}
+
+	if (ds->ops->get_strings) {
+		ndata = data + mcount * len;
+		/* This function copies ETH_GSTRINGS_LEN bytes, we will mangle
+		 * the output after to prepend our CPU port prefix we
+		 * constructed earlier
+		 */
+		ds->ops->get_strings(ds, port, stringset, ndata);
+		count = ds->ops->get_sset_count(ds, port, stringset);
+		if (count < 0)
+			return;
+		for (i = 0; i < count; i++) {
+			memmove(ndata + (i * len + sizeof(pfx)),
+				ndata + i * len, len - sizeof(pfx));
+			memcpy(ndata + i * len, pfx, sizeof(pfx));
+		}
+	}
+}
+
+/* Deny PTP operations on conduit if there is at least one switch in the tree
+ * that is PTP capable.
+ */
+int __dsa_conduit_hwtstamp_validate(struct net_device *dev,
+				    const struct kernel_hwtstamp_config *config,
+				    struct netlink_ext_ack *extack)
+{
+	struct dsa_port *cpu_dp = dev->dsa_ptr;
+	struct dsa_switch *ds = cpu_dp->ds;
+	struct dsa_switch_tree *dst;
+	struct dsa_port *dp;
+
+	dst = ds->dst;
+
+	list_for_each_entry(dp, &dst->ports, list) {
+		if (dsa_port_supports_hwtstamp(dp)) {
+			NL_SET_ERR_MSG(extack,
+				       "HW timestamping not allowed on DSA conduit when switch supports the operation");
+			return -EBUSY;
+		}
+	}
+
+	return 0;
+}
+
+static int dsa_conduit_ethtool_setup(struct net_device *dev)
+{
+	struct dsa_port *cpu_dp = dev->dsa_ptr;
+	struct dsa_switch *ds = cpu_dp->ds;
+	struct ethtool_ops *ops;
+
+	if (netif_is_lag_master(dev))
+		return 0;
+
+	ops = devm_kzalloc(ds->dev, sizeof(*ops), GFP_KERNEL);
+	if (!ops)
+		return -ENOMEM;
+
+	cpu_dp->orig_ethtool_ops = dev->ethtool_ops;
+	if (cpu_dp->orig_ethtool_ops)
+		memcpy(ops, cpu_dp->orig_ethtool_ops, sizeof(*ops));
+
+	ops->get_regs_len = dsa_conduit_get_regs_len;
+	ops->get_regs = dsa_conduit_get_regs;
+	ops->get_sset_count = dsa_conduit_get_sset_count;
+	ops->get_ethtool_stats = dsa_conduit_get_ethtool_stats;
+	ops->get_strings = dsa_conduit_get_strings;
+	ops->get_ethtool_phy_stats = dsa_conduit_get_ethtool_phy_stats;
+
+	dev->ethtool_ops = ops;
+
+	return 0;
+}
+
+static void dsa_conduit_ethtool_teardown(struct net_device *dev)
+{
+	struct dsa_port *cpu_dp = dev->dsa_ptr;
+
+	if (netif_is_lag_master(dev))
+		return;
+
+	dev->ethtool_ops = cpu_dp->orig_ethtool_ops;
+	cpu_dp->orig_ethtool_ops = NULL;
+}
+
+/* Keep the conduit always promiscuous if the tagging protocol requires that
+ * (garbles MAC DA) or if it doesn't support unicast filtering, case in which
+ * it would revert to promiscuous mode as soon as we call dev_uc_add() on it
+ * anyway.
+ */
+static void dsa_conduit_set_promiscuity(struct net_device *dev, int inc)
+{
+	const struct dsa_device_ops *ops = dev->dsa_ptr->tag_ops;
+
+	if ((dev->priv_flags & IFF_UNICAST_FLT) && !ops->promisc_on_conduit)
+		return;
+
+	ASSERT_RTNL();
+
+	dev_set_promiscuity(dev, inc);
+}
+
+static ssize_t tagging_show(struct device *d, struct device_attribute *attr,
+			    char *buf)
+{
+	struct net_device *dev = to_net_dev(d);
+	struct dsa_port *cpu_dp = dev->dsa_ptr;
+
+	return sysfs_emit(buf, "%s\n",
+		       dsa_tag_protocol_to_str(cpu_dp->tag_ops));
+}
+
+static ssize_t tagging_store(struct device *d, struct device_attribute *attr,
+			     const char *buf, size_t count)
+{
+	const struct dsa_device_ops *new_tag_ops, *old_tag_ops;
+	const char *end = strchrnul(buf, '\n'), *name;
+	struct net_device *dev = to_net_dev(d);
+	struct dsa_port *cpu_dp = dev->dsa_ptr;
+	size_t len = end - buf;
+	int err;
+
+	/* Empty string passed */
+	if (!len)
+		return -ENOPROTOOPT;
+
+	name = kstrndup(buf, len, GFP_KERNEL);
+	if (!name)
+		return -ENOMEM;
+
+	old_tag_ops = cpu_dp->tag_ops;
+	new_tag_ops = dsa_tag_driver_get_by_name(name);
+	kfree(name);
+	/* Bad tagger name? */
+	if (IS_ERR(new_tag_ops))
+		return PTR_ERR(new_tag_ops);
+
+	if (new_tag_ops == old_tag_ops)
+		/* Drop the temporarily held duplicate reference, since
+		 * the DSA switch tree uses this tagger.
+		 */
+		goto out;
+
+	err = dsa_tree_change_tag_proto(cpu_dp->ds->dst, new_tag_ops,
+					old_tag_ops);
+	if (err) {
+		/* On failure the old tagger is restored, so we don't need the
+		 * driver for the new one.
+		 */
+		dsa_tag_driver_put(new_tag_ops);
+		return err;
+	}
+
+	/* On success we no longer need the module for the old tagging protocol
+	 */
+out:
+	dsa_tag_driver_put(old_tag_ops);
+	return count;
+}
+static DEVICE_ATTR_RW(tagging);
+
+static struct attribute *dsa_user_attrs[] = {
+	&dev_attr_tagging.attr,
+	NULL
+};
+
+static const struct attribute_group dsa_group = {
+	.name	= "dsa",
+	.attrs	= dsa_user_attrs,
+};
+
+static void dsa_conduit_reset_mtu(struct net_device *dev)
+{
+	int err;
+
+	err = dev_set_mtu(dev, ETH_DATA_LEN);
+	if (err)
+		netdev_dbg(dev,
+			   "Unable to reset MTU to exclude DSA overheads\n");
+}
+
+int dsa_conduit_setup(struct net_device *dev, struct dsa_port *cpu_dp)
+{
+	const struct dsa_device_ops *tag_ops = cpu_dp->tag_ops;
+	struct dsa_switch *ds = cpu_dp->ds;
+	struct device_link *consumer_link;
+	int mtu, ret;
+
+	mtu = ETH_DATA_LEN + dsa_tag_protocol_overhead(tag_ops);
+
+	/* The DSA conduit must use SET_NETDEV_DEV for this to work. */
+	if (!netif_is_lag_master(dev)) {
+		consumer_link = device_link_add(ds->dev, dev->dev.parent,
+						DL_FLAG_AUTOREMOVE_CONSUMER);
+		if (!consumer_link)
+			netdev_err(dev,
+				   "Failed to create a device link to DSA switch %s\n",
+				   dev_name(ds->dev));
+	}
+
+	/* The switch driver may not implement ->port_change_mtu(), case in
+	 * which dsa_user_change_mtu() will not update the conduit MTU either,
+	 * so we need to do that here.
+	 */
+	ret = dev_set_mtu(dev, mtu);
+	if (ret)
+		netdev_warn(dev, "error %d setting MTU to %d to include DSA overhead\n",
+			    ret, mtu);
+
+	/* If we use a tagging format that doesn't have an ethertype
+	 * field, make sure that all packets from this point on get
+	 * sent to the tag format's receive function.
+	 */
+	wmb();
+
+	dev->dsa_ptr = cpu_dp;
+
+	dsa_conduit_set_promiscuity(dev, 1);
+
+	ret = dsa_conduit_ethtool_setup(dev);
+	if (ret)
+		goto out_err_reset_promisc;
+
+	ret = sysfs_create_group(&dev->dev.kobj, &dsa_group);
+	if (ret)
+		goto out_err_ethtool_teardown;
+
+	return ret;
+
+out_err_ethtool_teardown:
+	dsa_conduit_ethtool_teardown(dev);
+out_err_reset_promisc:
+	dsa_conduit_set_promiscuity(dev, -1);
+	return ret;
+}
+
+void dsa_conduit_teardown(struct net_device *dev)
+{
+	sysfs_remove_group(&dev->dev.kobj, &dsa_group);
+	dsa_conduit_ethtool_teardown(dev);
+	dsa_conduit_reset_mtu(dev);
+	dsa_conduit_set_promiscuity(dev, -1);
+
+	dev->dsa_ptr = NULL;
+
+	/* If we used a tagging format that doesn't have an ethertype
+	 * field, make sure that all packets from this point get sent
+	 * without the tag and go through the regular receive path.
+	 */
+	wmb();
+}
+
+int dsa_conduit_lag_setup(struct net_device *lag_dev, struct dsa_port *cpu_dp,
+			  struct netdev_lag_upper_info *uinfo,
+			  struct netlink_ext_ack *extack)
+{
+	bool conduit_setup = false;
+	int err;
+
+	if (!netdev_uses_dsa(lag_dev)) {
+		err = dsa_conduit_setup(lag_dev, cpu_dp);
+		if (err)
+			return err;
+
+		conduit_setup = true;
+	}
+
+	err = dsa_port_lag_join(cpu_dp, lag_dev, uinfo, extack);
+	if (err) {
+		NL_SET_ERR_MSG_WEAK_MOD(extack, "CPU port failed to join LAG");
+		goto out_conduit_teardown;
+	}
+
+	return 0;
+
+out_conduit_teardown:
+	if (conduit_setup)
+		dsa_conduit_teardown(lag_dev);
+	return err;
+}
+
+/* Tear down a conduit if there isn't any other user port on it,
+ * optionally also destroying LAG information.
+ */
+void dsa_conduit_lag_teardown(struct net_device *lag_dev,
+			      struct dsa_port *cpu_dp)
+{
+	struct net_device *upper;
+	struct list_head *iter;
+
+	dsa_port_lag_leave(cpu_dp, lag_dev);
+
+	netdev_for_each_upper_dev_rcu(lag_dev, upper, iter)
+		if (dsa_user_dev_check(upper))
+			return;
+
+	dsa_conduit_teardown(lag_dev);
+}
diff --git a/net/dsa/conduit.h b/net/dsa/conduit.h
new file mode 100644
index 000000000000..31f8834f54bb
--- /dev/null
+++ b/net/dsa/conduit.h
@@ -0,0 +1,22 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+
+#ifndef __DSA_CONDUIT_H
+#define __DSA_CONDUIT_H
+
+struct dsa_port;
+struct net_device;
+struct netdev_lag_upper_info;
+struct netlink_ext_ack;
+
+int dsa_conduit_setup(struct net_device *dev, struct dsa_port *cpu_dp);
+void dsa_conduit_teardown(struct net_device *dev);
+int dsa_conduit_lag_setup(struct net_device *lag_dev, struct dsa_port *cpu_dp,
+			  struct netdev_lag_upper_info *uinfo,
+			  struct netlink_ext_ack *extack);
+void dsa_conduit_lag_teardown(struct net_device *lag_dev,
+			      struct dsa_port *cpu_dp);
+int __dsa_conduit_hwtstamp_validate(struct net_device *dev,
+				    const struct kernel_hwtstamp_config *config,
+				    struct netlink_ext_ack *extack);
+
+#endif
diff --git a/net/dsa/dsa.c b/net/dsa/dsa.c
index ccbdb98109f8..ac7be864e80d 100644
--- a/net/dsa/dsa.c
+++ b/net/dsa/dsa.c
@@ -20,14 +20,14 @@
 #include <net/dsa_stubs.h>
 #include <net/sch_generic.h>
 
+#include "conduit.h"
 #include "devlink.h"
 #include "dsa.h"
-#include "master.h"
 #include "netlink.h"
 #include "port.h"
-#include "slave.h"
 #include "switch.h"
 #include "tag.h"
+#include "user.h"
 
 #define DSA_MAX_NUM_OFFLOADING_BRIDGES		BITS_PER_LONG
 
@@ -365,18 +365,18 @@ static struct dsa_port *dsa_tree_find_first_cpu(struct dsa_switch_tree *dst)
 	return NULL;
 }
 
-struct net_device *dsa_tree_find_first_master(struct dsa_switch_tree *dst)
+struct net_device *dsa_tree_find_first_conduit(struct dsa_switch_tree *dst)
 {
 	struct device_node *ethernet;
-	struct net_device *master;
+	struct net_device *conduit;
 	struct dsa_port *cpu_dp;
 
 	cpu_dp = dsa_tree_find_first_cpu(dst);
 	ethernet = of_parse_phandle(cpu_dp->dn, "ethernet", 0);
-	master = of_find_net_device_by_node(ethernet);
+	conduit = of_find_net_device_by_node(ethernet);
 	of_node_put(ethernet);
 
-	return master;
+	return conduit;
 }
 
 /* Assign the default CPU port (the first one in the tree) to all ports of the
@@ -517,7 +517,7 @@ static int dsa_port_setup(struct dsa_port *dp)
 		break;
 	case DSA_PORT_TYPE_USER:
 		of_get_mac_address(dp->dn, dp->mac);
-		err = dsa_slave_create(dp);
+		err = dsa_user_create(dp);
 		break;
 	}
 
@@ -554,9 +554,9 @@ static void dsa_port_teardown(struct dsa_port *dp)
 			dsa_shared_port_link_unregister_of(dp);
 		break;
 	case DSA_PORT_TYPE_USER:
-		if (dp->slave) {
-			dsa_slave_destroy(dp->slave);
-			dp->slave = NULL;
+		if (dp->user) {
+			dsa_user_destroy(dp->user);
+			dp->user = NULL;
 		}
 		break;
 	}
@@ -632,9 +632,9 @@ static int dsa_switch_setup(struct dsa_switch *ds)
 	if (ds->setup)
 		return 0;
 
-	/* Initialize ds->phys_mii_mask before registering the slave MDIO bus
+	/* Initialize ds->phys_mii_mask before registering the user MDIO bus
 	 * driver and before ops->setup() has run, since the switch drivers and
-	 * the slave MDIO bus driver rely on these values for probing PHY
+	 * the user MDIO bus driver rely on these values for probing PHY
 	 * devices or not
 	 */
 	ds->phys_mii_mask |= dsa_user_ports(ds);
@@ -657,21 +657,21 @@ static int dsa_switch_setup(struct dsa_switch *ds)
 	if (err)
 		goto teardown;
 
-	if (!ds->slave_mii_bus && ds->ops->phy_read) {
-		ds->slave_mii_bus = mdiobus_alloc();
-		if (!ds->slave_mii_bus) {
+	if (!ds->user_mii_bus && ds->ops->phy_read) {
+		ds->user_mii_bus = mdiobus_alloc();
+		if (!ds->user_mii_bus) {
 			err = -ENOMEM;
 			goto teardown;
 		}
 
-		dsa_slave_mii_bus_init(ds);
+		dsa_user_mii_bus_init(ds);
 
 		dn = of_get_child_by_name(ds->dev->of_node, "mdio");
 
-		err = of_mdiobus_register(ds->slave_mii_bus, dn);
+		err = of_mdiobus_register(ds->user_mii_bus, dn);
 		of_node_put(dn);
 		if (err < 0)
-			goto free_slave_mii_bus;
+			goto free_user_mii_bus;
 	}
 
 	dsa_switch_devlink_register(ds);
@@ -679,9 +679,9 @@ static int dsa_switch_setup(struct dsa_switch *ds)
 	ds->setup = true;
 	return 0;
 
-free_slave_mii_bus:
-	if (ds->slave_mii_bus && ds->ops->phy_read)
-		mdiobus_free(ds->slave_mii_bus);
+free_user_mii_bus:
+	if (ds->user_mii_bus && ds->ops->phy_read)
+		mdiobus_free(ds->user_mii_bus);
 teardown:
 	if (ds->ops->teardown)
 		ds->ops->teardown(ds);
@@ -699,10 +699,10 @@ static void dsa_switch_teardown(struct dsa_switch *ds)
 
 	dsa_switch_devlink_unregister(ds);
 
-	if (ds->slave_mii_bus && ds->ops->phy_read) {
-		mdiobus_unregister(ds->slave_mii_bus);
-		mdiobus_free(ds->slave_mii_bus);
-		ds->slave_mii_bus = NULL;
+	if (ds->user_mii_bus && ds->ops->phy_read) {
+		mdiobus_unregister(ds->user_mii_bus);
+		mdiobus_free(ds->user_mii_bus);
+		ds->user_mii_bus = NULL;
 	}
 
 	dsa_switch_teardown_tag_protocol(ds);
@@ -793,7 +793,7 @@ static int dsa_tree_setup_switches(struct dsa_switch_tree *dst)
 	return err;
 }
 
-static int dsa_tree_setup_master(struct dsa_switch_tree *dst)
+static int dsa_tree_setup_conduit(struct dsa_switch_tree *dst)
 {
 	struct dsa_port *cpu_dp;
 	int err = 0;
@@ -801,18 +801,18 @@ static int dsa_tree_setup_master(struct dsa_switch_tree *dst)
 	rtnl_lock();
 
 	dsa_tree_for_each_cpu_port(cpu_dp, dst) {
-		struct net_device *master = cpu_dp->master;
-		bool admin_up = (master->flags & IFF_UP) &&
-				!qdisc_tx_is_noop(master);
+		struct net_device *conduit = cpu_dp->conduit;
+		bool admin_up = (conduit->flags & IFF_UP) &&
+				!qdisc_tx_is_noop(conduit);
 
-		err = dsa_master_setup(master, cpu_dp);
+		err = dsa_conduit_setup(conduit, cpu_dp);
 		if (err)
 			break;
 
-		/* Replay master state event */
-		dsa_tree_master_admin_state_change(dst, master, admin_up);
-		dsa_tree_master_oper_state_change(dst, master,
-						  netif_oper_up(master));
+		/* Replay conduit state event */
+		dsa_tree_conduit_admin_state_change(dst, conduit, admin_up);
+		dsa_tree_conduit_oper_state_change(dst, conduit,
+						   netif_oper_up(conduit));
 	}
 
 	rtnl_unlock();
@@ -820,22 +820,22 @@ static int dsa_tree_setup_master(struct dsa_switch_tree *dst)
 	return err;
 }
 
-static void dsa_tree_teardown_master(struct dsa_switch_tree *dst)
+static void dsa_tree_teardown_conduit(struct dsa_switch_tree *dst)
 {
 	struct dsa_port *cpu_dp;
 
 	rtnl_lock();
 
 	dsa_tree_for_each_cpu_port(cpu_dp, dst) {
-		struct net_device *master = cpu_dp->master;
+		struct net_device *conduit = cpu_dp->conduit;
 
 		/* Synthesizing an "admin down" state is sufficient for
-		 * the switches to get a notification if the master is
+		 * the switches to get a notification if the conduit is
 		 * currently up and running.
 		 */
-		dsa_tree_master_admin_state_change(dst, master, false);
+		dsa_tree_conduit_admin_state_change(dst, conduit, false);
 
-		dsa_master_teardown(master);
+		dsa_conduit_teardown(conduit);
 	}
 
 	rtnl_unlock();
@@ -894,13 +894,13 @@ static int dsa_tree_setup(struct dsa_switch_tree *dst)
 	if (err)
 		goto teardown_switches;
 
-	err = dsa_tree_setup_master(dst);
+	err = dsa_tree_setup_conduit(dst);
 	if (err)
 		goto teardown_ports;
 
 	err = dsa_tree_setup_lags(dst);
 	if (err)
-		goto teardown_master;
+		goto teardown_conduit;
 
 	dst->setup = true;
 
@@ -908,8 +908,8 @@ static int dsa_tree_setup(struct dsa_switch_tree *dst)
 
 	return 0;
 
-teardown_master:
-	dsa_tree_teardown_master(dst);
+teardown_conduit:
+	dsa_tree_teardown_conduit(dst);
 teardown_ports:
 	dsa_tree_teardown_ports(dst);
 teardown_switches:
@@ -929,7 +929,7 @@ static void dsa_tree_teardown(struct dsa_switch_tree *dst)
 
 	dsa_tree_teardown_lags(dst);
 
-	dsa_tree_teardown_master(dst);
+	dsa_tree_teardown_conduit(dst);
 
 	dsa_tree_teardown_ports(dst);
 
@@ -978,7 +978,7 @@ out_disconnect:
 	return err;
 }
 
-/* Since the dsa/tagging sysfs device attribute is per master, the assumption
+/* Since the dsa/tagging sysfs device attribute is per conduit, the assumption
  * is that all DSA switches within a tree share the same tagger, otherwise
  * they would have formed disjoint trees (different "dsa,member" values).
  */
@@ -999,10 +999,10 @@ int dsa_tree_change_tag_proto(struct dsa_switch_tree *dst,
 	 * restriction, there needs to be another mutex which serializes this.
 	 */
 	dsa_tree_for_each_user_port(dp, dst) {
-		if (dsa_port_to_master(dp)->flags & IFF_UP)
+		if (dsa_port_to_conduit(dp)->flags & IFF_UP)
 			goto out_unlock;
 
-		if (dp->slave->flags & IFF_UP)
+		if (dp->user->flags & IFF_UP)
 			goto out_unlock;
 	}
 
@@ -1028,62 +1028,62 @@ out_unlock:
 	return err;
 }
 
-static void dsa_tree_master_state_change(struct dsa_switch_tree *dst,
-					 struct net_device *master)
+static void dsa_tree_conduit_state_change(struct dsa_switch_tree *dst,
+					  struct net_device *conduit)
 {
-	struct dsa_notifier_master_state_info info;
-	struct dsa_port *cpu_dp = master->dsa_ptr;
+	struct dsa_notifier_conduit_state_info info;
+	struct dsa_port *cpu_dp = conduit->dsa_ptr;
 
-	info.master = master;
-	info.operational = dsa_port_master_is_operational(cpu_dp);
+	info.conduit = conduit;
+	info.operational = dsa_port_conduit_is_operational(cpu_dp);
 
-	dsa_tree_notify(dst, DSA_NOTIFIER_MASTER_STATE_CHANGE, &info);
+	dsa_tree_notify(dst, DSA_NOTIFIER_CONDUIT_STATE_CHANGE, &info);
 }
 
-void dsa_tree_master_admin_state_change(struct dsa_switch_tree *dst,
-					struct net_device *master,
-					bool up)
+void dsa_tree_conduit_admin_state_change(struct dsa_switch_tree *dst,
+					 struct net_device *conduit,
+					 bool up)
 {
-	struct dsa_port *cpu_dp = master->dsa_ptr;
+	struct dsa_port *cpu_dp = conduit->dsa_ptr;
 	bool notify = false;
 
-	/* Don't keep track of admin state on LAG DSA masters,
-	 * but rather just of physical DSA masters
+	/* Don't keep track of admin state on LAG DSA conduits,
+	 * but rather just of physical DSA conduits
 	 */
-	if (netif_is_lag_master(master))
+	if (netif_is_lag_master(conduit))
 		return;
 
-	if ((dsa_port_master_is_operational(cpu_dp)) !=
-	    (up && cpu_dp->master_oper_up))
+	if ((dsa_port_conduit_is_operational(cpu_dp)) !=
+	    (up && cpu_dp->conduit_oper_up))
 		notify = true;
 
-	cpu_dp->master_admin_up = up;
+	cpu_dp->conduit_admin_up = up;
 
 	if (notify)
-		dsa_tree_master_state_change(dst, master);
+		dsa_tree_conduit_state_change(dst, conduit);
 }
 
-void dsa_tree_master_oper_state_change(struct dsa_switch_tree *dst,
-				       struct net_device *master,
-				       bool up)
+void dsa_tree_conduit_oper_state_change(struct dsa_switch_tree *dst,
+					struct net_device *conduit,
+					bool up)
 {
-	struct dsa_port *cpu_dp = master->dsa_ptr;
+	struct dsa_port *cpu_dp = conduit->dsa_ptr;
 	bool notify = false;
 
-	/* Don't keep track of oper state on LAG DSA masters,
-	 * but rather just of physical DSA masters
+	/* Don't keep track of oper state on LAG DSA conduits,
+	 * but rather just of physical DSA conduits
 	 */
-	if (netif_is_lag_master(master))
+	if (netif_is_lag_master(conduit))
 		return;
 
-	if ((dsa_port_master_is_operational(cpu_dp)) !=
-	    (cpu_dp->master_admin_up && up))
+	if ((dsa_port_conduit_is_operational(cpu_dp)) !=
+	    (cpu_dp->conduit_admin_up && up))
 		notify = true;
 
-	cpu_dp->master_oper_up = up;
+	cpu_dp->conduit_oper_up = up;
 
 	if (notify)
-		dsa_tree_master_state_change(dst, master);
+		dsa_tree_conduit_state_change(dst, conduit);
 }
 
 static struct dsa_port *dsa_port_touch(struct dsa_switch *ds, int index)
@@ -1129,7 +1129,7 @@ static int dsa_port_parse_dsa(struct dsa_port *dp)
 }
 
 static enum dsa_tag_protocol dsa_get_tag_protocol(struct dsa_port *dp,
-						  struct net_device *master)
+						  struct net_device *conduit)
 {
 	enum dsa_tag_protocol tag_protocol = DSA_TAG_PROTO_NONE;
 	struct dsa_switch *mds, *ds = dp->ds;
@@ -1140,21 +1140,21 @@ static enum dsa_tag_protocol dsa_get_tag_protocol(struct dsa_port *dp,
 	 * happens the switch driver may want to know if its tagging protocol
 	 * is going to work in such a configuration.
 	 */
-	if (dsa_slave_dev_check(master)) {
-		mdp = dsa_slave_to_port(master);
+	if (dsa_user_dev_check(conduit)) {
+		mdp = dsa_user_to_port(conduit);
 		mds = mdp->ds;
 		mdp_upstream = dsa_upstream_port(mds, mdp->index);
 		tag_protocol = mds->ops->get_tag_protocol(mds, mdp_upstream,
 							  DSA_TAG_PROTO_NONE);
 	}
 
-	/* If the master device is not itself a DSA slave in a disjoint DSA
+	/* If the conduit device is not itself a DSA user in a disjoint DSA
 	 * tree, then return immediately.
 	 */
 	return ds->ops->get_tag_protocol(ds, dp->index, tag_protocol);
 }
 
-static int dsa_port_parse_cpu(struct dsa_port *dp, struct net_device *master,
+static int dsa_port_parse_cpu(struct dsa_port *dp, struct net_device *conduit,
 			      const char *user_protocol)
 {
 	const struct dsa_device_ops *tag_ops = NULL;
@@ -1163,7 +1163,7 @@ static int dsa_port_parse_cpu(struct dsa_port *dp, struct net_device *master,
 	enum dsa_tag_protocol default_proto;
 
 	/* Find out which protocol the switch would prefer. */
-	default_proto = dsa_get_tag_protocol(dp, master);
+	default_proto = dsa_get_tag_protocol(dp, conduit);
 	if (dst->default_proto) {
 		if (dst->default_proto != default_proto) {
 			dev_err(ds->dev,
@@ -1218,7 +1218,7 @@ static int dsa_port_parse_cpu(struct dsa_port *dp, struct net_device *master,
 		dst->tag_ops = tag_ops;
 	}
 
-	dp->master = master;
+	dp->conduit = conduit;
 	dp->type = DSA_PORT_TYPE_CPU;
 	dsa_port_set_tag_protocol(dp, dst->tag_ops);
 	dp->dst = dst;
@@ -1248,16 +1248,16 @@ static int dsa_port_parse_of(struct dsa_port *dp, struct device_node *dn)
 	dp->dn = dn;
 
 	if (ethernet) {
-		struct net_device *master;
+		struct net_device *conduit;
 		const char *user_protocol;
 
-		master = of_find_net_device_by_node(ethernet);
+		conduit = of_find_net_device_by_node(ethernet);
 		of_node_put(ethernet);
-		if (!master)
+		if (!conduit)
 			return -EPROBE_DEFER;
 
 		user_protocol = of_get_property(dn, "dsa-tag-protocol", NULL);
-		return dsa_port_parse_cpu(dp, master, user_protocol);
+		return dsa_port_parse_cpu(dp, conduit, user_protocol);
 	}
 
 	if (link)
@@ -1412,15 +1412,15 @@ static int dsa_port_parse(struct dsa_port *dp, const char *name,
 			  struct device *dev)
 {
 	if (!strcmp(name, "cpu")) {
-		struct net_device *master;
+		struct net_device *conduit;
 
-		master = dsa_dev_to_net_device(dev);
-		if (!master)
+		conduit = dsa_dev_to_net_device(dev);
+		if (!conduit)
 			return -EPROBE_DEFER;
 
-		dev_put(master);
+		dev_put(conduit);
 
-		return dsa_port_parse_cpu(dp, master, NULL);
+		return dsa_port_parse_cpu(dp, conduit, NULL);
 	}
 
 	if (!strcmp(name, "dsa"))
@@ -1566,14 +1566,14 @@ void dsa_unregister_switch(struct dsa_switch *ds)
 }
 EXPORT_SYMBOL_GPL(dsa_unregister_switch);
 
-/* If the DSA master chooses to unregister its net_device on .shutdown, DSA is
+/* If the DSA conduit chooses to unregister its net_device on .shutdown, DSA is
  * blocking that operation from completion, due to the dev_hold taken inside
- * netdev_upper_dev_link. Unlink the DSA slave interfaces from being uppers of
- * the DSA master, so that the system can reboot successfully.
+ * netdev_upper_dev_link. Unlink the DSA user interfaces from being uppers of
+ * the DSA conduit, so that the system can reboot successfully.
  */
 void dsa_switch_shutdown(struct dsa_switch *ds)
 {
-	struct net_device *master, *slave_dev;
+	struct net_device *conduit, *user_dev;
 	struct dsa_port *dp;
 
 	mutex_lock(&dsa2_mutex);
@@ -1584,17 +1584,17 @@ void dsa_switch_shutdown(struct dsa_switch *ds)
 	rtnl_lock();
 
 	dsa_switch_for_each_user_port(dp, ds) {
-		master = dsa_port_to_master(dp);
-		slave_dev = dp->slave;
+		conduit = dsa_port_to_conduit(dp);
+		user_dev = dp->user;
 
-		netdev_upper_dev_unlink(master, slave_dev);
+		netdev_upper_dev_unlink(conduit, user_dev);
 	}
 
-	/* Disconnect from further netdevice notifiers on the master,
+	/* Disconnect from further netdevice notifiers on the conduit,
 	 * since netdev_uses_dsa() will now return false.
 	 */
 	dsa_switch_for_each_cpu_port(dp, ds)
-		dp->master->dsa_ptr = NULL;
+		dp->conduit->dsa_ptr = NULL;
 
 	rtnl_unlock();
 out:
@@ -1605,7 +1605,7 @@ EXPORT_SYMBOL_GPL(dsa_switch_shutdown);
 #ifdef CONFIG_PM_SLEEP
 static bool dsa_port_is_initialized(const struct dsa_port *dp)
 {
-	return dp->type == DSA_PORT_TYPE_USER && dp->slave;
+	return dp->type == DSA_PORT_TYPE_USER && dp->user;
 }
 
 int dsa_switch_suspend(struct dsa_switch *ds)
@@ -1613,12 +1613,12 @@ int dsa_switch_suspend(struct dsa_switch *ds)
 	struct dsa_port *dp;
 	int ret = 0;
 
-	/* Suspend slave network devices */
+	/* Suspend user network devices */
 	dsa_switch_for_each_port(dp, ds) {
 		if (!dsa_port_is_initialized(dp))
 			continue;
 
-		ret = dsa_slave_suspend(dp->slave);
+		ret = dsa_user_suspend(dp->user);
 		if (ret)
 			return ret;
 	}
@@ -1641,12 +1641,12 @@ int dsa_switch_resume(struct dsa_switch *ds)
 	if (ret)
 		return ret;
 
-	/* Resume slave network devices */
+	/* Resume user network devices */
 	dsa_switch_for_each_port(dp, ds) {
 		if (!dsa_port_is_initialized(dp))
 			continue;
 
-		ret = dsa_slave_resume(dp->slave);
+		ret = dsa_user_resume(dp->user);
 		if (ret)
 			return ret;
 	}
@@ -1658,10 +1658,10 @@ EXPORT_SYMBOL_GPL(dsa_switch_resume);
 
 struct dsa_port *dsa_port_from_netdev(struct net_device *netdev)
 {
-	if (!netdev || !dsa_slave_dev_check(netdev))
+	if (!netdev || !dsa_user_dev_check(netdev))
 		return ERR_PTR(-ENODEV);
 
-	return dsa_slave_to_port(netdev);
+	return dsa_user_to_port(netdev);
 }
 EXPORT_SYMBOL_GPL(dsa_port_from_netdev);
 
@@ -1726,7 +1726,7 @@ bool dsa_mdb_present_in_other_db(struct dsa_switch *ds, int port,
 EXPORT_SYMBOL_GPL(dsa_mdb_present_in_other_db);
 
 static const struct dsa_stubs __dsa_stubs = {
-	.master_hwtstamp_validate = __dsa_master_hwtstamp_validate,
+	.conduit_hwtstamp_validate = __dsa_conduit_hwtstamp_validate,
 };
 
 static void dsa_register_stubs(void)
@@ -1748,7 +1748,7 @@ static int __init dsa_init_module(void)
 	if (!dsa_owq)
 		return -ENOMEM;
 
-	rc = dsa_slave_register_notifier();
+	rc = dsa_user_register_notifier();
 	if (rc)
 		goto register_notifier_fail;
 
@@ -1763,7 +1763,7 @@ static int __init dsa_init_module(void)
 	return 0;
 
 netlink_register_fail:
-	dsa_slave_unregister_notifier();
+	dsa_user_unregister_notifier();
 	dev_remove_pack(&dsa_pack_type);
 register_notifier_fail:
 	destroy_workqueue(dsa_owq);
@@ -1778,7 +1778,7 @@ static void __exit dsa_cleanup_module(void)
 
 	rtnl_link_unregister(&dsa_link_ops);
 
-	dsa_slave_unregister_notifier();
+	dsa_user_unregister_notifier();
 	dev_remove_pack(&dsa_pack_type);
 	destroy_workqueue(dsa_owq);
 }
diff --git a/net/dsa/dsa.h b/net/dsa/dsa.h
index b7e17ae1094d..3cc7823e9ef3 100644
--- a/net/dsa/dsa.h
+++ b/net/dsa/dsa.h
@@ -21,16 +21,16 @@ void dsa_lag_map(struct dsa_switch_tree *dst, struct dsa_lag *lag);
 void dsa_lag_unmap(struct dsa_switch_tree *dst, struct dsa_lag *lag);
 struct dsa_lag *dsa_tree_lag_find(struct dsa_switch_tree *dst,
 				  const struct net_device *lag_dev);
-struct net_device *dsa_tree_find_first_master(struct dsa_switch_tree *dst);
+struct net_device *dsa_tree_find_first_conduit(struct dsa_switch_tree *dst);
 int dsa_tree_change_tag_proto(struct dsa_switch_tree *dst,
 			      const struct dsa_device_ops *tag_ops,
 			      const struct dsa_device_ops *old_tag_ops);
-void dsa_tree_master_admin_state_change(struct dsa_switch_tree *dst,
-					struct net_device *master,
+void dsa_tree_conduit_admin_state_change(struct dsa_switch_tree *dst,
+					 struct net_device *conduit,
+					 bool up);
+void dsa_tree_conduit_oper_state_change(struct dsa_switch_tree *dst,
+					struct net_device *conduit,
 					bool up);
-void dsa_tree_master_oper_state_change(struct dsa_switch_tree *dst,
-				       struct net_device *master,
-				       bool up);
 unsigned int dsa_bridge_num_get(const struct net_device *bridge_dev, int max);
 void dsa_bridge_num_put(const struct net_device *bridge_dev,
 			unsigned int bridge_num);
diff --git a/net/dsa/master.c b/net/dsa/master.c
deleted file mode 100644
index 6be89ab0cc01..000000000000
--- a/net/dsa/master.c
+++ /dev/null
@@ -1,475 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-or-later
-/*
- * Handling of a master device, switching frames via its switch fabric CPU port
- *
- * Copyright (c) 2017 Savoir-faire Linux Inc.
- *	Vivien Didelot <vivien.didelot@savoirfairelinux.com>
- */
-
-#include <linux/ethtool.h>
-#include <linux/netdevice.h>
-#include <linux/netlink.h>
-#include <net/dsa.h>
-
-#include "dsa.h"
-#include "master.h"
-#include "port.h"
-#include "tag.h"
-
-static int dsa_master_get_regs_len(struct net_device *dev)
-{
-	struct dsa_port *cpu_dp = dev->dsa_ptr;
-	const struct ethtool_ops *ops = cpu_dp->orig_ethtool_ops;
-	struct dsa_switch *ds = cpu_dp->ds;
-	int port = cpu_dp->index;
-	int ret = 0;
-	int len;
-
-	if (ops->get_regs_len) {
-		len = ops->get_regs_len(dev);
-		if (len < 0)
-			return len;
-		ret += len;
-	}
-
-	ret += sizeof(struct ethtool_drvinfo);
-	ret += sizeof(struct ethtool_regs);
-
-	if (ds->ops->get_regs_len) {
-		len = ds->ops->get_regs_len(ds, port);
-		if (len < 0)
-			return len;
-		ret += len;
-	}
-
-	return ret;
-}
-
-static void dsa_master_get_regs(struct net_device *dev,
-				struct ethtool_regs *regs, void *data)
-{
-	struct dsa_port *cpu_dp = dev->dsa_ptr;
-	const struct ethtool_ops *ops = cpu_dp->orig_ethtool_ops;
-	struct dsa_switch *ds = cpu_dp->ds;
-	struct ethtool_drvinfo *cpu_info;
-	struct ethtool_regs *cpu_regs;
-	int port = cpu_dp->index;
-	int len;
-
-	if (ops->get_regs_len && ops->get_regs) {
-		len = ops->get_regs_len(dev);
-		if (len < 0)
-			return;
-		regs->len = len;
-		ops->get_regs(dev, regs, data);
-		data += regs->len;
-	}
-
-	cpu_info = (struct ethtool_drvinfo *)data;
-	strscpy(cpu_info->driver, "dsa", sizeof(cpu_info->driver));
-	data += sizeof(*cpu_info);
-	cpu_regs = (struct ethtool_regs *)data;
-	data += sizeof(*cpu_regs);
-
-	if (ds->ops->get_regs_len && ds->ops->get_regs) {
-		len = ds->ops->get_regs_len(ds, port);
-		if (len < 0)
-			return;
-		cpu_regs->len = len;
-		ds->ops->get_regs(ds, port, cpu_regs, data);
-	}
-}
-
-static void dsa_master_get_ethtool_stats(struct net_device *dev,
-					 struct ethtool_stats *stats,
-					 uint64_t *data)
-{
-	struct dsa_port *cpu_dp = dev->dsa_ptr;
-	const struct ethtool_ops *ops = cpu_dp->orig_ethtool_ops;
-	struct dsa_switch *ds = cpu_dp->ds;
-	int port = cpu_dp->index;
-	int count = 0;
-
-	if (ops->get_sset_count && ops->get_ethtool_stats) {
-		count = ops->get_sset_count(dev, ETH_SS_STATS);
-		ops->get_ethtool_stats(dev, stats, data);
-	}
-
-	if (ds->ops->get_ethtool_stats)
-		ds->ops->get_ethtool_stats(ds, port, data + count);
-}
-
-static void dsa_master_get_ethtool_phy_stats(struct net_device *dev,
-					     struct ethtool_stats *stats,
-					     uint64_t *data)
-{
-	struct dsa_port *cpu_dp = dev->dsa_ptr;
-	const struct ethtool_ops *ops = cpu_dp->orig_ethtool_ops;
-	struct dsa_switch *ds = cpu_dp->ds;
-	int port = cpu_dp->index;
-	int count = 0;
-
-	if (dev->phydev && !ops->get_ethtool_phy_stats) {
-		count = phy_ethtool_get_sset_count(dev->phydev);
-		if (count >= 0)
-			phy_ethtool_get_stats(dev->phydev, stats, data);
-	} else if (ops->get_sset_count && ops->get_ethtool_phy_stats) {
-		count = ops->get_sset_count(dev, ETH_SS_PHY_STATS);
-		ops->get_ethtool_phy_stats(dev, stats, data);
-	}
-
-	if (count < 0)
-		count = 0;
-
-	if (ds->ops->get_ethtool_phy_stats)
-		ds->ops->get_ethtool_phy_stats(ds, port, data + count);
-}
-
-static int dsa_master_get_sset_count(struct net_device *dev, int sset)
-{
-	struct dsa_port *cpu_dp = dev->dsa_ptr;
-	const struct ethtool_ops *ops = cpu_dp->orig_ethtool_ops;
-	struct dsa_switch *ds = cpu_dp->ds;
-	int count = 0;
-
-	if (sset == ETH_SS_PHY_STATS && dev->phydev &&
-	    !ops->get_ethtool_phy_stats)
-		count = phy_ethtool_get_sset_count(dev->phydev);
-	else if (ops->get_sset_count)
-		count = ops->get_sset_count(dev, sset);
-
-	if (count < 0)
-		count = 0;
-
-	if (ds->ops->get_sset_count)
-		count += ds->ops->get_sset_count(ds, cpu_dp->index, sset);
-
-	return count;
-}
-
-static void dsa_master_get_strings(struct net_device *dev, uint32_t stringset,
-				   uint8_t *data)
-{
-	struct dsa_port *cpu_dp = dev->dsa_ptr;
-	const struct ethtool_ops *ops = cpu_dp->orig_ethtool_ops;
-	struct dsa_switch *ds = cpu_dp->ds;
-	int port = cpu_dp->index;
-	int len = ETH_GSTRING_LEN;
-	int mcount = 0, count, i;
-	uint8_t pfx[4];
-	uint8_t *ndata;
-
-	snprintf(pfx, sizeof(pfx), "p%.2d", port);
-	/* We do not want to be NULL-terminated, since this is a prefix */
-	pfx[sizeof(pfx) - 1] = '_';
-
-	if (stringset == ETH_SS_PHY_STATS && dev->phydev &&
-	    !ops->get_ethtool_phy_stats) {
-		mcount = phy_ethtool_get_sset_count(dev->phydev);
-		if (mcount < 0)
-			mcount = 0;
-		else
-			phy_ethtool_get_strings(dev->phydev, data);
-	} else if (ops->get_sset_count && ops->get_strings) {
-		mcount = ops->get_sset_count(dev, stringset);
-		if (mcount < 0)
-			mcount = 0;
-		ops->get_strings(dev, stringset, data);
-	}
-
-	if (ds->ops->get_strings) {
-		ndata = data + mcount * len;
-		/* This function copies ETH_GSTRINGS_LEN bytes, we will mangle
-		 * the output after to prepend our CPU port prefix we
-		 * constructed earlier
-		 */
-		ds->ops->get_strings(ds, port, stringset, ndata);
-		count = ds->ops->get_sset_count(ds, port, stringset);
-		if (count < 0)
-			return;
-		for (i = 0; i < count; i++) {
-			memmove(ndata + (i * len + sizeof(pfx)),
-				ndata + i * len, len - sizeof(pfx));
-			memcpy(ndata + i * len, pfx, sizeof(pfx));
-		}
-	}
-}
-
-/* Deny PTP operations on master if there is at least one switch in the tree
- * that is PTP capable.
- */
-int __dsa_master_hwtstamp_validate(struct net_device *dev,
-				   const struct kernel_hwtstamp_config *config,
-				   struct netlink_ext_ack *extack)
-{
-	struct dsa_port *cpu_dp = dev->dsa_ptr;
-	struct dsa_switch *ds = cpu_dp->ds;
-	struct dsa_switch_tree *dst;
-	struct dsa_port *dp;
-
-	dst = ds->dst;
-
-	list_for_each_entry(dp, &dst->ports, list) {
-		if (dsa_port_supports_hwtstamp(dp)) {
-			NL_SET_ERR_MSG(extack,
-				       "HW timestamping not allowed on DSA master when switch supports the operation");
-			return -EBUSY;
-		}
-	}
-
-	return 0;
-}
-
-static int dsa_master_ethtool_setup(struct net_device *dev)
-{
-	struct dsa_port *cpu_dp = dev->dsa_ptr;
-	struct dsa_switch *ds = cpu_dp->ds;
-	struct ethtool_ops *ops;
-
-	if (netif_is_lag_master(dev))
-		return 0;
-
-	ops = devm_kzalloc(ds->dev, sizeof(*ops), GFP_KERNEL);
-	if (!ops)
-		return -ENOMEM;
-
-	cpu_dp->orig_ethtool_ops = dev->ethtool_ops;
-	if (cpu_dp->orig_ethtool_ops)
-		memcpy(ops, cpu_dp->orig_ethtool_ops, sizeof(*ops));
-
-	ops->get_regs_len = dsa_master_get_regs_len;
-	ops->get_regs = dsa_master_get_regs;
-	ops->get_sset_count = dsa_master_get_sset_count;
-	ops->get_ethtool_stats = dsa_master_get_ethtool_stats;
-	ops->get_strings = dsa_master_get_strings;
-	ops->get_ethtool_phy_stats = dsa_master_get_ethtool_phy_stats;
-
-	dev->ethtool_ops = ops;
-
-	return 0;
-}
-
-static void dsa_master_ethtool_teardown(struct net_device *dev)
-{
-	struct dsa_port *cpu_dp = dev->dsa_ptr;
-
-	if (netif_is_lag_master(dev))
-		return;
-
-	dev->ethtool_ops = cpu_dp->orig_ethtool_ops;
-	cpu_dp->orig_ethtool_ops = NULL;
-}
-
-/* Keep the master always promiscuous if the tagging protocol requires that
- * (garbles MAC DA) or if it doesn't support unicast filtering, case in which
- * it would revert to promiscuous mode as soon as we call dev_uc_add() on it
- * anyway.
- */
-static void dsa_master_set_promiscuity(struct net_device *dev, int inc)
-{
-	const struct dsa_device_ops *ops = dev->dsa_ptr->tag_ops;
-
-	if ((dev->priv_flags & IFF_UNICAST_FLT) && !ops->promisc_on_master)
-		return;
-
-	ASSERT_RTNL();
-
-	dev_set_promiscuity(dev, inc);
-}
-
-static ssize_t tagging_show(struct device *d, struct device_attribute *attr,
-			    char *buf)
-{
-	struct net_device *dev = to_net_dev(d);
-	struct dsa_port *cpu_dp = dev->dsa_ptr;
-
-	return sysfs_emit(buf, "%s\n",
-		       dsa_tag_protocol_to_str(cpu_dp->tag_ops));
-}
-
-static ssize_t tagging_store(struct device *d, struct device_attribute *attr,
-			     const char *buf, size_t count)
-{
-	const struct dsa_device_ops *new_tag_ops, *old_tag_ops;
-	const char *end = strchrnul(buf, '\n'), *name;
-	struct net_device *dev = to_net_dev(d);
-	struct dsa_port *cpu_dp = dev->dsa_ptr;
-	size_t len = end - buf;
-	int err;
-
-	/* Empty string passed */
-	if (!len)
-		return -ENOPROTOOPT;
-
-	name = kstrndup(buf, len, GFP_KERNEL);
-	if (!name)
-		return -ENOMEM;
-
-	old_tag_ops = cpu_dp->tag_ops;
-	new_tag_ops = dsa_tag_driver_get_by_name(name);
-	kfree(name);
-	/* Bad tagger name? */
-	if (IS_ERR(new_tag_ops))
-		return PTR_ERR(new_tag_ops);
-
-	if (new_tag_ops == old_tag_ops)
-		/* Drop the temporarily held duplicate reference, since
-		 * the DSA switch tree uses this tagger.
-		 */
-		goto out;
-
-	err = dsa_tree_change_tag_proto(cpu_dp->ds->dst, new_tag_ops,
-					old_tag_ops);
-	if (err) {
-		/* On failure the old tagger is restored, so we don't need the
-		 * driver for the new one.
-		 */
-		dsa_tag_driver_put(new_tag_ops);
-		return err;
-	}
-
-	/* On success we no longer need the module for the old tagging protocol
-	 */
-out:
-	dsa_tag_driver_put(old_tag_ops);
-	return count;
-}
-static DEVICE_ATTR_RW(tagging);
-
-static struct attribute *dsa_slave_attrs[] = {
-	&dev_attr_tagging.attr,
-	NULL
-};
-
-static const struct attribute_group dsa_group = {
-	.name	= "dsa",
-	.attrs	= dsa_slave_attrs,
-};
-
-static void dsa_master_reset_mtu(struct net_device *dev)
-{
-	int err;
-
-	err = dev_set_mtu(dev, ETH_DATA_LEN);
-	if (err)
-		netdev_dbg(dev,
-			   "Unable to reset MTU to exclude DSA overheads\n");
-}
-
-int dsa_master_setup(struct net_device *dev, struct dsa_port *cpu_dp)
-{
-	const struct dsa_device_ops *tag_ops = cpu_dp->tag_ops;
-	struct dsa_switch *ds = cpu_dp->ds;
-	struct device_link *consumer_link;
-	int mtu, ret;
-
-	mtu = ETH_DATA_LEN + dsa_tag_protocol_overhead(tag_ops);
-
-	/* The DSA master must use SET_NETDEV_DEV for this to work. */
-	if (!netif_is_lag_master(dev)) {
-		consumer_link = device_link_add(ds->dev, dev->dev.parent,
-						DL_FLAG_AUTOREMOVE_CONSUMER);
-		if (!consumer_link)
-			netdev_err(dev,
-				   "Failed to create a device link to DSA switch %s\n",
-				   dev_name(ds->dev));
-	}
-
-	/* The switch driver may not implement ->port_change_mtu(), case in
-	 * which dsa_slave_change_mtu() will not update the master MTU either,
-	 * so we need to do that here.
-	 */
-	ret = dev_set_mtu(dev, mtu);
-	if (ret)
-		netdev_warn(dev, "error %d setting MTU to %d to include DSA overhead\n",
-			    ret, mtu);
-
-	/* If we use a tagging format that doesn't have an ethertype
-	 * field, make sure that all packets from this point on get
-	 * sent to the tag format's receive function.
-	 */
-	wmb();
-
-	dev->dsa_ptr = cpu_dp;
-
-	dsa_master_set_promiscuity(dev, 1);
-
-	ret = dsa_master_ethtool_setup(dev);
-	if (ret)
-		goto out_err_reset_promisc;
-
-	ret = sysfs_create_group(&dev->dev.kobj, &dsa_group);
-	if (ret)
-		goto out_err_ethtool_teardown;
-
-	return ret;
-
-out_err_ethtool_teardown:
-	dsa_master_ethtool_teardown(dev);
-out_err_reset_promisc:
-	dsa_master_set_promiscuity(dev, -1);
-	return ret;
-}
-
-void dsa_master_teardown(struct net_device *dev)
-{
-	sysfs_remove_group(&dev->dev.kobj, &dsa_group);
-	dsa_master_ethtool_teardown(dev);
-	dsa_master_reset_mtu(dev);
-	dsa_master_set_promiscuity(dev, -1);
-
-	dev->dsa_ptr = NULL;
-
-	/* If we used a tagging format that doesn't have an ethertype
-	 * field, make sure that all packets from this point get sent
-	 * without the tag and go through the regular receive path.
-	 */
-	wmb();
-}
-
-int dsa_master_lag_setup(struct net_device *lag_dev, struct dsa_port *cpu_dp,
-			 struct netdev_lag_upper_info *uinfo,
-			 struct netlink_ext_ack *extack)
-{
-	bool master_setup = false;
-	int err;
-
-	if (!netdev_uses_dsa(lag_dev)) {
-		err = dsa_master_setup(lag_dev, cpu_dp);
-		if (err)
-			return err;
-
-		master_setup = true;
-	}
-
-	err = dsa_port_lag_join(cpu_dp, lag_dev, uinfo, extack);
-	if (err) {
-		NL_SET_ERR_MSG_WEAK_MOD(extack, "CPU port failed to join LAG");
-		goto out_master_teardown;
-	}
-
-	return 0;
-
-out_master_teardown:
-	if (master_setup)
-		dsa_master_teardown(lag_dev);
-	return err;
-}
-
-/* Tear down a master if there isn't any other user port on it,
- * optionally also destroying LAG information.
- */
-void dsa_master_lag_teardown(struct net_device *lag_dev,
-			     struct dsa_port *cpu_dp)
-{
-	struct net_device *upper;
-	struct list_head *iter;
-
-	dsa_port_lag_leave(cpu_dp, lag_dev);
-
-	netdev_for_each_upper_dev_rcu(lag_dev, upper, iter)
-		if (dsa_slave_dev_check(upper))
-			return;
-
-	dsa_master_teardown(lag_dev);
-}
diff --git a/net/dsa/master.h b/net/dsa/master.h
deleted file mode 100644
index 76e39d3ec909..000000000000
--- a/net/dsa/master.h
+++ /dev/null
@@ -1,22 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-or-later */
-
-#ifndef __DSA_MASTER_H
-#define __DSA_MASTER_H
-
-struct dsa_port;
-struct net_device;
-struct netdev_lag_upper_info;
-struct netlink_ext_ack;
-
-int dsa_master_setup(struct net_device *dev, struct dsa_port *cpu_dp);
-void dsa_master_teardown(struct net_device *dev);
-int dsa_master_lag_setup(struct net_device *lag_dev, struct dsa_port *cpu_dp,
-			 struct netdev_lag_upper_info *uinfo,
-			 struct netlink_ext_ack *extack);
-void dsa_master_lag_teardown(struct net_device *lag_dev,
-			     struct dsa_port *cpu_dp);
-int __dsa_master_hwtstamp_validate(struct net_device *dev,
-				   const struct kernel_hwtstamp_config *config,
-				   struct netlink_ext_ack *extack);
-
-#endif
diff --git a/net/dsa/netlink.c b/net/dsa/netlink.c
index bd4bbaf851de..f56f90a25b99 100644
--- a/net/dsa/netlink.c
+++ b/net/dsa/netlink.c
@@ -5,7 +5,7 @@
 #include <net/rtnetlink.h>
 
 #include "netlink.h"
-#include "slave.h"
+#include "user.h"
 
 static const struct nla_policy dsa_policy[IFLA_DSA_MAX + 1] = {
 	[IFLA_DSA_MASTER]	= { .type = NLA_U32 },
@@ -22,13 +22,13 @@ static int dsa_changelink(struct net_device *dev, struct nlattr *tb[],
 
 	if (data[IFLA_DSA_MASTER]) {
 		u32 ifindex = nla_get_u32(data[IFLA_DSA_MASTER]);
-		struct net_device *master;
+		struct net_device *conduit;
 
-		master = __dev_get_by_index(dev_net(dev), ifindex);
-		if (!master)
+		conduit = __dev_get_by_index(dev_net(dev), ifindex);
+		if (!conduit)
 			return -EINVAL;
 
-		err = dsa_slave_change_master(dev, master, extack);
+		err = dsa_user_change_conduit(dev, conduit, extack);
 		if (err)
 			return err;
 	}
@@ -44,9 +44,9 @@ static size_t dsa_get_size(const struct net_device *dev)
 
 static int dsa_fill_info(struct sk_buff *skb, const struct net_device *dev)
 {
-	struct net_device *master = dsa_slave_to_master(dev);
+	struct net_device *conduit = dsa_user_to_conduit(dev);
 
-	if (nla_put_u32(skb, IFLA_DSA_MASTER, master->ifindex))
+	if (nla_put_u32(skb, IFLA_DSA_MASTER, conduit->ifindex))
 		return -EMSGSIZE;
 
 	return 0;
diff --git a/net/dsa/port.c b/net/dsa/port.c
index 6e0d000a97c4..c42dac87671b 100644
--- a/net/dsa/port.c
+++ b/net/dsa/port.c
@@ -14,9 +14,9 @@
 
 #include "dsa.h"
 #include "port.h"
-#include "slave.h"
 #include "switch.h"
 #include "tag_8021q.h"
+#include "user.h"
 
 /**
  * dsa_port_notify - Notify the switching fabric of changes to a port
@@ -289,7 +289,7 @@ static void dsa_port_reset_vlan_filtering(struct dsa_port *dp,
 	}
 
 	/* If the bridge was vlan_filtering, the bridge core doesn't trigger an
-	 * event for changing vlan_filtering setting upon slave ports leaving
+	 * event for changing vlan_filtering setting upon user ports leaving
 	 * it. That is a good thing, because that lets us handle it and also
 	 * handle the case where the switch's vlan_filtering setting is global
 	 * (not per port). When that happens, the correct moment to trigger the
@@ -489,7 +489,7 @@ int dsa_port_bridge_join(struct dsa_port *dp, struct net_device *br,
 		.dp = dp,
 		.extack = extack,
 	};
-	struct net_device *dev = dp->slave;
+	struct net_device *dev = dp->user;
 	struct net_device *brport_dev;
 	int err;
 
@@ -514,8 +514,8 @@ int dsa_port_bridge_join(struct dsa_port *dp, struct net_device *br,
 	dp->bridge->tx_fwd_offload = info.tx_fwd_offload;
 
 	err = switchdev_bridge_port_offload(brport_dev, dev, dp,
-					    &dsa_slave_switchdev_notifier,
-					    &dsa_slave_switchdev_blocking_notifier,
+					    &dsa_user_switchdev_notifier,
+					    &dsa_user_switchdev_blocking_notifier,
 					    dp->bridge->tx_fwd_offload, extack);
 	if (err)
 		goto out_rollback_unbridge;
@@ -528,8 +528,8 @@ int dsa_port_bridge_join(struct dsa_port *dp, struct net_device *br,
 
 out_rollback_unoffload:
 	switchdev_bridge_port_unoffload(brport_dev, dp,
-					&dsa_slave_switchdev_notifier,
-					&dsa_slave_switchdev_blocking_notifier);
+					&dsa_user_switchdev_notifier,
+					&dsa_user_switchdev_blocking_notifier);
 	dsa_flush_workqueue();
 out_rollback_unbridge:
 	dsa_broadcast(DSA_NOTIFIER_BRIDGE_LEAVE, &info);
@@ -547,8 +547,8 @@ void dsa_port_pre_bridge_leave(struct dsa_port *dp, struct net_device *br)
 		return;
 
 	switchdev_bridge_port_unoffload(brport_dev, dp,
-					&dsa_slave_switchdev_notifier,
-					&dsa_slave_switchdev_blocking_notifier);
+					&dsa_user_switchdev_notifier,
+					&dsa_user_switchdev_blocking_notifier);
 
 	dsa_flush_workqueue();
 }
@@ -741,10 +741,10 @@ static bool dsa_port_can_apply_vlan_filtering(struct dsa_port *dp,
 	 */
 	if (vlan_filtering && dsa_port_is_user(dp)) {
 		struct net_device *br = dsa_port_bridge_dev_get(dp);
-		struct net_device *upper_dev, *slave = dp->slave;
+		struct net_device *upper_dev, *user = dp->user;
 		struct list_head *iter;
 
-		netdev_for_each_upper_dev_rcu(slave, upper_dev, iter) {
+		netdev_for_each_upper_dev_rcu(user, upper_dev, iter) {
 			struct bridge_vlan_info br_info;
 			u16 vid;
 
@@ -803,9 +803,9 @@ int dsa_port_vlan_filtering(struct dsa_port *dp, bool vlan_filtering,
 	if (!ds->ops->port_vlan_filtering)
 		return -EOPNOTSUPP;
 
-	/* We are called from dsa_slave_switchdev_blocking_event(),
+	/* We are called from dsa_user_switchdev_blocking_event(),
 	 * which is not under rcu_read_lock(), unlike
-	 * dsa_slave_switchdev_event().
+	 * dsa_user_switchdev_event().
 	 */
 	rcu_read_lock();
 	apply = dsa_port_can_apply_vlan_filtering(dp, vlan_filtering, extack);
@@ -827,24 +827,24 @@ int dsa_port_vlan_filtering(struct dsa_port *dp, bool vlan_filtering,
 		ds->vlan_filtering = vlan_filtering;
 
 		dsa_switch_for_each_user_port(other_dp, ds) {
-			struct net_device *slave = other_dp->slave;
+			struct net_device *user = other_dp->user;
 
 			/* We might be called in the unbind path, so not
-			 * all slave devices might still be registered.
+			 * all user devices might still be registered.
 			 */
-			if (!slave)
+			if (!user)
 				continue;
 
-			err = dsa_slave_manage_vlan_filtering(slave,
-							      vlan_filtering);
+			err = dsa_user_manage_vlan_filtering(user,
+							     vlan_filtering);
 			if (err)
 				goto restore;
 		}
 	} else {
 		dp->vlan_filtering = vlan_filtering;
 
-		err = dsa_slave_manage_vlan_filtering(dp->slave,
-						      vlan_filtering);
+		err = dsa_user_manage_vlan_filtering(dp->user,
+						     vlan_filtering);
 		if (err)
 			goto restore;
 	}
@@ -863,7 +863,7 @@ restore:
 }
 
 /* This enforces legacy behavior for switch drivers which assume they can't
- * receive VLAN configuration when enslaved to a bridge with vlan_filtering=0
+ * receive VLAN configuration when joining a bridge with vlan_filtering=0
  */
 bool dsa_port_skip_vlan_configuration(struct dsa_port *dp)
 {
@@ -1047,7 +1047,7 @@ int dsa_port_standalone_host_fdb_add(struct dsa_port *dp,
 int dsa_port_bridge_host_fdb_add(struct dsa_port *dp,
 				 const unsigned char *addr, u16 vid)
 {
-	struct net_device *master = dsa_port_to_master(dp);
+	struct net_device *conduit = dsa_port_to_conduit(dp);
 	struct dsa_db db = {
 		.type = DSA_DB_BRIDGE,
 		.bridge = *dp->bridge,
@@ -1057,12 +1057,12 @@ int dsa_port_bridge_host_fdb_add(struct dsa_port *dp,
 	if (!dp->ds->fdb_isolation)
 		db.bridge.num = 0;
 
-	/* Avoid a call to __dev_set_promiscuity() on the master, which
+	/* Avoid a call to __dev_set_promiscuity() on the conduit, which
 	 * requires rtnl_lock(), since we can't guarantee that is held here,
 	 * and we can't take it either.
 	 */
-	if (master->priv_flags & IFF_UNICAST_FLT) {
-		err = dev_uc_add(master, addr);
+	if (conduit->priv_flags & IFF_UNICAST_FLT) {
+		err = dev_uc_add(conduit, addr);
 		if (err)
 			return err;
 	}
@@ -1098,7 +1098,7 @@ int dsa_port_standalone_host_fdb_del(struct dsa_port *dp,
 int dsa_port_bridge_host_fdb_del(struct dsa_port *dp,
 				 const unsigned char *addr, u16 vid)
 {
-	struct net_device *master = dsa_port_to_master(dp);
+	struct net_device *conduit = dsa_port_to_conduit(dp);
 	struct dsa_db db = {
 		.type = DSA_DB_BRIDGE,
 		.bridge = *dp->bridge,
@@ -1108,8 +1108,8 @@ int dsa_port_bridge_host_fdb_del(struct dsa_port *dp,
 	if (!dp->ds->fdb_isolation)
 		db.bridge.num = 0;
 
-	if (master->priv_flags & IFF_UNICAST_FLT) {
-		err = dev_uc_del(master, addr);
+	if (conduit->priv_flags & IFF_UNICAST_FLT) {
+		err = dev_uc_del(conduit, addr);
 		if (err)
 			return err;
 	}
@@ -1229,7 +1229,7 @@ int dsa_port_standalone_host_mdb_add(const struct dsa_port *dp,
 int dsa_port_bridge_host_mdb_add(const struct dsa_port *dp,
 				 const struct switchdev_obj_port_mdb *mdb)
 {
-	struct net_device *master = dsa_port_to_master(dp);
+	struct net_device *conduit = dsa_port_to_conduit(dp);
 	struct dsa_db db = {
 		.type = DSA_DB_BRIDGE,
 		.bridge = *dp->bridge,
@@ -1239,7 +1239,7 @@ int dsa_port_bridge_host_mdb_add(const struct dsa_port *dp,
 	if (!dp->ds->fdb_isolation)
 		db.bridge.num = 0;
 
-	err = dev_mc_add(master, mdb->addr);
+	err = dev_mc_add(conduit, mdb->addr);
 	if (err)
 		return err;
 
@@ -1273,7 +1273,7 @@ int dsa_port_standalone_host_mdb_del(const struct dsa_port *dp,
 int dsa_port_bridge_host_mdb_del(const struct dsa_port *dp,
 				 const struct switchdev_obj_port_mdb *mdb)
 {
-	struct net_device *master = dsa_port_to_master(dp);
+	struct net_device *conduit = dsa_port_to_conduit(dp);
 	struct dsa_db db = {
 		.type = DSA_DB_BRIDGE,
 		.bridge = *dp->bridge,
@@ -1283,7 +1283,7 @@ int dsa_port_bridge_host_mdb_del(const struct dsa_port *dp,
 	if (!dp->ds->fdb_isolation)
 		db.bridge.num = 0;
 
-	err = dev_mc_del(master, mdb->addr);
+	err = dev_mc_del(conduit, mdb->addr);
 	if (err)
 		return err;
 
@@ -1318,7 +1318,7 @@ int dsa_port_host_vlan_add(struct dsa_port *dp,
 			   const struct switchdev_obj_port_vlan *vlan,
 			   struct netlink_ext_ack *extack)
 {
-	struct net_device *master = dsa_port_to_master(dp);
+	struct net_device *conduit = dsa_port_to_conduit(dp);
 	struct dsa_notifier_vlan_info info = {
 		.dp = dp,
 		.vlan = vlan,
@@ -1330,7 +1330,7 @@ int dsa_port_host_vlan_add(struct dsa_port *dp,
 	if (err && err != -EOPNOTSUPP)
 		return err;
 
-	vlan_vid_add(master, htons(ETH_P_8021Q), vlan->vid);
+	vlan_vid_add(conduit, htons(ETH_P_8021Q), vlan->vid);
 
 	return err;
 }
@@ -1338,7 +1338,7 @@ int dsa_port_host_vlan_add(struct dsa_port *dp,
 int dsa_port_host_vlan_del(struct dsa_port *dp,
 			   const struct switchdev_obj_port_vlan *vlan)
 {
-	struct net_device *master = dsa_port_to_master(dp);
+	struct net_device *conduit = dsa_port_to_conduit(dp);
 	struct dsa_notifier_vlan_info info = {
 		.dp = dp,
 		.vlan = vlan,
@@ -1349,7 +1349,7 @@ int dsa_port_host_vlan_del(struct dsa_port *dp,
 	if (err && err != -EOPNOTSUPP)
 		return err;
 
-	vlan_vid_del(master, htons(ETH_P_8021Q), vlan->vid);
+	vlan_vid_del(conduit, htons(ETH_P_8021Q), vlan->vid);
 
 	return err;
 }
@@ -1398,24 +1398,24 @@ int dsa_port_mrp_del_ring_role(const struct dsa_port *dp,
 	return ds->ops->port_mrp_del_ring_role(ds, dp->index, mrp);
 }
 
-static int dsa_port_assign_master(struct dsa_port *dp,
-				  struct net_device *master,
-				  struct netlink_ext_ack *extack,
-				  bool fail_on_err)
+static int dsa_port_assign_conduit(struct dsa_port *dp,
+				   struct net_device *conduit,
+				   struct netlink_ext_ack *extack,
+				   bool fail_on_err)
 {
 	struct dsa_switch *ds = dp->ds;
 	int port = dp->index, err;
 
-	err = ds->ops->port_change_master(ds, port, master, extack);
+	err = ds->ops->port_change_conduit(ds, port, conduit, extack);
 	if (err && !fail_on_err)
-		dev_err(ds->dev, "port %d failed to assign master %s: %pe\n",
-			port, master->name, ERR_PTR(err));
+		dev_err(ds->dev, "port %d failed to assign conduit %s: %pe\n",
+			port, conduit->name, ERR_PTR(err));
 
 	if (err && fail_on_err)
 		return err;
 
-	dp->cpu_dp = master->dsa_ptr;
-	dp->cpu_port_in_lag = netif_is_lag_master(master);
+	dp->cpu_dp = conduit->dsa_ptr;
+	dp->cpu_port_in_lag = netif_is_lag_master(conduit);
 
 	return 0;
 }
@@ -1428,12 +1428,12 @@ static int dsa_port_assign_master(struct dsa_port *dp,
  * the old CPU port before changing it, and restore it on errors during the
  * bringup of the new one.
  */
-int dsa_port_change_master(struct dsa_port *dp, struct net_device *master,
-			   struct netlink_ext_ack *extack)
+int dsa_port_change_conduit(struct dsa_port *dp, struct net_device *conduit,
+			    struct netlink_ext_ack *extack)
 {
 	struct net_device *bridge_dev = dsa_port_bridge_dev_get(dp);
-	struct net_device *old_master = dsa_port_to_master(dp);
-	struct net_device *dev = dp->slave;
+	struct net_device *old_conduit = dsa_port_to_conduit(dp);
+	struct net_device *dev = dp->user;
 	struct dsa_switch *ds = dp->ds;
 	bool vlan_filtering;
 	int err, tmp;
@@ -1454,7 +1454,7 @@ int dsa_port_change_master(struct dsa_port *dp, struct net_device *master,
 	 */
 	vlan_filtering = dsa_port_is_vlan_filtering(dp);
 	if (vlan_filtering) {
-		err = dsa_slave_manage_vlan_filtering(dev, false);
+		err = dsa_user_manage_vlan_filtering(dev, false);
 		if (err) {
 			NL_SET_ERR_MSG_MOD(extack,
 					   "Failed to remove standalone VLANs");
@@ -1465,16 +1465,16 @@ int dsa_port_change_master(struct dsa_port *dp, struct net_device *master,
 	/* Standalone addresses, and addresses of upper interfaces like
 	 * VLAN, LAG, HSR need to be migrated.
 	 */
-	dsa_slave_unsync_ha(dev);
+	dsa_user_unsync_ha(dev);
 
-	err = dsa_port_assign_master(dp, master, extack, true);
+	err = dsa_port_assign_conduit(dp, conduit, extack, true);
 	if (err)
 		goto rewind_old_addrs;
 
-	dsa_slave_sync_ha(dev);
+	dsa_user_sync_ha(dev);
 
 	if (vlan_filtering) {
-		err = dsa_slave_manage_vlan_filtering(dev, true);
+		err = dsa_user_manage_vlan_filtering(dev, true);
 		if (err) {
 			NL_SET_ERR_MSG_MOD(extack,
 					   "Failed to restore standalone VLANs");
@@ -1495,19 +1495,19 @@ int dsa_port_change_master(struct dsa_port *dp, struct net_device *master,
 
 rewind_new_vlan:
 	if (vlan_filtering)
-		dsa_slave_manage_vlan_filtering(dev, false);
+		dsa_user_manage_vlan_filtering(dev, false);
 
 rewind_new_addrs:
-	dsa_slave_unsync_ha(dev);
+	dsa_user_unsync_ha(dev);
 
-	dsa_port_assign_master(dp, old_master, NULL, false);
+	dsa_port_assign_conduit(dp, old_conduit, NULL, false);
 
 /* Restore the objects on the old CPU port */
 rewind_old_addrs:
-	dsa_slave_sync_ha(dev);
+	dsa_user_sync_ha(dev);
 
 	if (vlan_filtering) {
-		tmp = dsa_slave_manage_vlan_filtering(dev, true);
+		tmp = dsa_user_manage_vlan_filtering(dev, true);
 		if (tmp) {
 			dev_err(ds->dev,
 				"port %d failed to restore standalone VLANs: %pe\n",
@@ -1620,7 +1620,7 @@ static void dsa_port_phylink_mac_link_down(struct phylink_config *config,
 	struct dsa_switch *ds = dp->ds;
 
 	if (dsa_port_is_user(dp))
-		phydev = dp->slave->phydev;
+		phydev = dp->user->phydev;
 
 	if (!ds->ops->phylink_mac_link_down) {
 		if (ds->ops->adjust_link && phydev)
@@ -1808,7 +1808,7 @@ err_phy_connect:
  * their type.
  *
  * User ports with no phy-handle or fixed-link are expected to connect to an
- * internal PHY located on the ds->slave_mii_bus at an MDIO address equal to
+ * internal PHY located on the ds->user_mii_bus at an MDIO address equal to
  * the port number. This description is still actively supported.
  *
  * Shared (CPU and DSA) ports with no phy-handle or fixed-link are expected to
@@ -1829,7 +1829,7 @@ err_phy_connect:
  * a fixed-link, a phy-handle, or a managed = "in-band-status" property.
  * It becomes the responsibility of the driver to ensure that these ports
  * operate at the maximum speed (whatever this means) and will interoperate
- * with the DSA master or other cascade port, since phylink methods will not be
+ * with the DSA conduit or other cascade port, since phylink methods will not be
  * invoked for them.
  *
  * If you are considering expanding this table for newly introduced switches,
diff --git a/net/dsa/port.h b/net/dsa/port.h
index 334879964e2c..6bc3291573c0 100644
--- a/net/dsa/port.h
+++ b/net/dsa/port.h
@@ -109,7 +109,7 @@ void dsa_port_hsr_leave(struct dsa_port *dp, struct net_device *hsr);
 int dsa_port_tag_8021q_vlan_add(struct dsa_port *dp, u16 vid, bool broadcast);
 void dsa_port_tag_8021q_vlan_del(struct dsa_port *dp, u16 vid, bool broadcast);
 void dsa_port_set_host_flood(struct dsa_port *dp, bool uc, bool mc);
-int dsa_port_change_master(struct dsa_port *dp, struct net_device *master,
-			   struct netlink_ext_ack *extack);
+int dsa_port_change_conduit(struct dsa_port *dp, struct net_device *conduit,
+			    struct netlink_ext_ack *extack);
 
 #endif
diff --git a/net/dsa/slave.c b/net/dsa/slave.c
deleted file mode 100644
index 4c3e502d7e16..000000000000
--- a/net/dsa/slave.c
+++ /dev/null
@@ -1,3727 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-or-later
-/*
- * net/dsa/slave.c - Slave device handling
- * Copyright (c) 2008-2009 Marvell Semiconductor
- */
-
-#include <linux/list.h>
-#include <linux/etherdevice.h>
-#include <linux/netdevice.h>
-#include <linux/phy.h>
-#include <linux/phy_fixed.h>
-#include <linux/phylink.h>
-#include <linux/of_net.h>
-#include <linux/of_mdio.h>
-#include <linux/mdio.h>
-#include <net/rtnetlink.h>
-#include <net/pkt_cls.h>
-#include <net/selftests.h>
-#include <net/tc_act/tc_mirred.h>
-#include <linux/if_bridge.h>
-#include <linux/if_hsr.h>
-#include <net/dcbnl.h>
-#include <linux/netpoll.h>
-#include <linux/string.h>
-
-#include "dsa.h"
-#include "port.h"
-#include "master.h"
-#include "netlink.h"
-#include "slave.h"
-#include "switch.h"
-#include "tag.h"
-
-struct dsa_switchdev_event_work {
-	struct net_device *dev;
-	struct net_device *orig_dev;
-	struct work_struct work;
-	unsigned long event;
-	/* Specific for SWITCHDEV_FDB_ADD_TO_DEVICE and
-	 * SWITCHDEV_FDB_DEL_TO_DEVICE
-	 */
-	unsigned char addr[ETH_ALEN];
-	u16 vid;
-	bool host_addr;
-};
-
-enum dsa_standalone_event {
-	DSA_UC_ADD,
-	DSA_UC_DEL,
-	DSA_MC_ADD,
-	DSA_MC_DEL,
-};
-
-struct dsa_standalone_event_work {
-	struct work_struct work;
-	struct net_device *dev;
-	enum dsa_standalone_event event;
-	unsigned char addr[ETH_ALEN];
-	u16 vid;
-};
-
-struct dsa_host_vlan_rx_filtering_ctx {
-	struct net_device *dev;
-	const unsigned char *addr;
-	enum dsa_standalone_event event;
-};
-
-static bool dsa_switch_supports_uc_filtering(struct dsa_switch *ds)
-{
-	return ds->ops->port_fdb_add && ds->ops->port_fdb_del &&
-	       ds->fdb_isolation && !ds->vlan_filtering_is_global &&
-	       !ds->needs_standalone_vlan_filtering;
-}
-
-static bool dsa_switch_supports_mc_filtering(struct dsa_switch *ds)
-{
-	return ds->ops->port_mdb_add && ds->ops->port_mdb_del &&
-	       ds->fdb_isolation && !ds->vlan_filtering_is_global &&
-	       !ds->needs_standalone_vlan_filtering;
-}
-
-static void dsa_slave_standalone_event_work(struct work_struct *work)
-{
-	struct dsa_standalone_event_work *standalone_work =
-		container_of(work, struct dsa_standalone_event_work, work);
-	const unsigned char *addr = standalone_work->addr;
-	struct net_device *dev = standalone_work->dev;
-	struct dsa_port *dp = dsa_slave_to_port(dev);
-	struct switchdev_obj_port_mdb mdb;
-	struct dsa_switch *ds = dp->ds;
-	u16 vid = standalone_work->vid;
-	int err;
-
-	switch (standalone_work->event) {
-	case DSA_UC_ADD:
-		err = dsa_port_standalone_host_fdb_add(dp, addr, vid);
-		if (err) {
-			dev_err(ds->dev,
-				"port %d failed to add %pM vid %d to fdb: %d\n",
-				dp->index, addr, vid, err);
-			break;
-		}
-		break;
-
-	case DSA_UC_DEL:
-		err = dsa_port_standalone_host_fdb_del(dp, addr, vid);
-		if (err) {
-			dev_err(ds->dev,
-				"port %d failed to delete %pM vid %d from fdb: %d\n",
-				dp->index, addr, vid, err);
-		}
-
-		break;
-	case DSA_MC_ADD:
-		ether_addr_copy(mdb.addr, addr);
-		mdb.vid = vid;
-
-		err = dsa_port_standalone_host_mdb_add(dp, &mdb);
-		if (err) {
-			dev_err(ds->dev,
-				"port %d failed to add %pM vid %d to mdb: %d\n",
-				dp->index, addr, vid, err);
-			break;
-		}
-		break;
-	case DSA_MC_DEL:
-		ether_addr_copy(mdb.addr, addr);
-		mdb.vid = vid;
-
-		err = dsa_port_standalone_host_mdb_del(dp, &mdb);
-		if (err) {
-			dev_err(ds->dev,
-				"port %d failed to delete %pM vid %d from mdb: %d\n",
-				dp->index, addr, vid, err);
-		}
-
-		break;
-	}
-
-	kfree(standalone_work);
-}
-
-static int dsa_slave_schedule_standalone_work(struct net_device *dev,
-					      enum dsa_standalone_event event,
-					      const unsigned char *addr,
-					      u16 vid)
-{
-	struct dsa_standalone_event_work *standalone_work;
-
-	standalone_work = kzalloc(sizeof(*standalone_work), GFP_ATOMIC);
-	if (!standalone_work)
-		return -ENOMEM;
-
-	INIT_WORK(&standalone_work->work, dsa_slave_standalone_event_work);
-	standalone_work->event = event;
-	standalone_work->dev = dev;
-
-	ether_addr_copy(standalone_work->addr, addr);
-	standalone_work->vid = vid;
-
-	dsa_schedule_work(&standalone_work->work);
-
-	return 0;
-}
-
-static int dsa_slave_host_vlan_rx_filtering(void *arg, int vid)
-{
-	struct dsa_host_vlan_rx_filtering_ctx *ctx = arg;
-
-	return dsa_slave_schedule_standalone_work(ctx->dev, ctx->event,
-						  ctx->addr, vid);
-}
-
-static int dsa_slave_vlan_for_each(struct net_device *dev,
-				   int (*cb)(void *arg, int vid), void *arg)
-{
-	struct dsa_port *dp = dsa_slave_to_port(dev);
-	struct dsa_vlan *v;
-	int err;
-
-	lockdep_assert_held(&dev->addr_list_lock);
-
-	err = cb(arg, 0);
-	if (err)
-		return err;
-
-	list_for_each_entry(v, &dp->user_vlans, list) {
-		err = cb(arg, v->vid);
-		if (err)
-			return err;
-	}
-
-	return 0;
-}
-
-static int dsa_slave_sync_uc(struct net_device *dev,
-			     const unsigned char *addr)
-{
-	struct net_device *master = dsa_slave_to_master(dev);
-	struct dsa_port *dp = dsa_slave_to_port(dev);
-	struct dsa_host_vlan_rx_filtering_ctx ctx = {
-		.dev = dev,
-		.addr = addr,
-		.event = DSA_UC_ADD,
-	};
-
-	dev_uc_add(master, addr);
-
-	if (!dsa_switch_supports_uc_filtering(dp->ds))
-		return 0;
-
-	return dsa_slave_vlan_for_each(dev, dsa_slave_host_vlan_rx_filtering,
-				       &ctx);
-}
-
-static int dsa_slave_unsync_uc(struct net_device *dev,
-			       const unsigned char *addr)
-{
-	struct net_device *master = dsa_slave_to_master(dev);
-	struct dsa_port *dp = dsa_slave_to_port(dev);
-	struct dsa_host_vlan_rx_filtering_ctx ctx = {
-		.dev = dev,
-		.addr = addr,
-		.event = DSA_UC_DEL,
-	};
-
-	dev_uc_del(master, addr);
-
-	if (!dsa_switch_supports_uc_filtering(dp->ds))
-		return 0;
-
-	return dsa_slave_vlan_for_each(dev, dsa_slave_host_vlan_rx_filtering,
-				       &ctx);
-}
-
-static int dsa_slave_sync_mc(struct net_device *dev,
-			     const unsigned char *addr)
-{
-	struct net_device *master = dsa_slave_to_master(dev);
-	struct dsa_port *dp = dsa_slave_to_port(dev);
-	struct dsa_host_vlan_rx_filtering_ctx ctx = {
-		.dev = dev,
-		.addr = addr,
-		.event = DSA_MC_ADD,
-	};
-
-	dev_mc_add(master, addr);
-
-	if (!dsa_switch_supports_mc_filtering(dp->ds))
-		return 0;
-
-	return dsa_slave_vlan_for_each(dev, dsa_slave_host_vlan_rx_filtering,
-				       &ctx);
-}
-
-static int dsa_slave_unsync_mc(struct net_device *dev,
-			       const unsigned char *addr)
-{
-	struct net_device *master = dsa_slave_to_master(dev);
-	struct dsa_port *dp = dsa_slave_to_port(dev);
-	struct dsa_host_vlan_rx_filtering_ctx ctx = {
-		.dev = dev,
-		.addr = addr,
-		.event = DSA_MC_DEL,
-	};
-
-	dev_mc_del(master, addr);
-
-	if (!dsa_switch_supports_mc_filtering(dp->ds))
-		return 0;
-
-	return dsa_slave_vlan_for_each(dev, dsa_slave_host_vlan_rx_filtering,
-				       &ctx);
-}
-
-void dsa_slave_sync_ha(struct net_device *dev)
-{
-	struct dsa_port *dp = dsa_slave_to_port(dev);
-	struct dsa_switch *ds = dp->ds;
-	struct netdev_hw_addr *ha;
-
-	netif_addr_lock_bh(dev);
-
-	netdev_for_each_synced_mc_addr(ha, dev)
-		dsa_slave_sync_mc(dev, ha->addr);
-
-	netdev_for_each_synced_uc_addr(ha, dev)
-		dsa_slave_sync_uc(dev, ha->addr);
-
-	netif_addr_unlock_bh(dev);
-
-	if (dsa_switch_supports_uc_filtering(ds) ||
-	    dsa_switch_supports_mc_filtering(ds))
-		dsa_flush_workqueue();
-}
-
-void dsa_slave_unsync_ha(struct net_device *dev)
-{
-	struct dsa_port *dp = dsa_slave_to_port(dev);
-	struct dsa_switch *ds = dp->ds;
-	struct netdev_hw_addr *ha;
-
-	netif_addr_lock_bh(dev);
-
-	netdev_for_each_synced_uc_addr(ha, dev)
-		dsa_slave_unsync_uc(dev, ha->addr);
-
-	netdev_for_each_synced_mc_addr(ha, dev)
-		dsa_slave_unsync_mc(dev, ha->addr);
-
-	netif_addr_unlock_bh(dev);
-
-	if (dsa_switch_supports_uc_filtering(ds) ||
-	    dsa_switch_supports_mc_filtering(ds))
-		dsa_flush_workqueue();
-}
-
-/* slave mii_bus handling ***************************************************/
-static int dsa_slave_phy_read(struct mii_bus *bus, int addr, int reg)
-{
-	struct dsa_switch *ds = bus->priv;
-
-	if (ds->phys_mii_mask & (1 << addr))
-		return ds->ops->phy_read(ds, addr, reg);
-
-	return 0xffff;
-}
-
-static int dsa_slave_phy_write(struct mii_bus *bus, int addr, int reg, u16 val)
-{
-	struct dsa_switch *ds = bus->priv;
-
-	if (ds->phys_mii_mask & (1 << addr))
-		return ds->ops->phy_write(ds, addr, reg, val);
-
-	return 0;
-}
-
-void dsa_slave_mii_bus_init(struct dsa_switch *ds)
-{
-	ds->slave_mii_bus->priv = (void *)ds;
-	ds->slave_mii_bus->name = "dsa slave smi";
-	ds->slave_mii_bus->read = dsa_slave_phy_read;
-	ds->slave_mii_bus->write = dsa_slave_phy_write;
-	snprintf(ds->slave_mii_bus->id, MII_BUS_ID_SIZE, "dsa-%d.%d",
-		 ds->dst->index, ds->index);
-	ds->slave_mii_bus->parent = ds->dev;
-	ds->slave_mii_bus->phy_mask = ~ds->phys_mii_mask;
-}
-
-
-/* slave device handling ****************************************************/
-static int dsa_slave_get_iflink(const struct net_device *dev)
-{
-	return dsa_slave_to_master(dev)->ifindex;
-}
-
-static int dsa_slave_open(struct net_device *dev)
-{
-	struct net_device *master = dsa_slave_to_master(dev);
-	struct dsa_port *dp = dsa_slave_to_port(dev);
-	struct dsa_switch *ds = dp->ds;
-	int err;
-
-	err = dev_open(master, NULL);
-	if (err < 0) {
-		netdev_err(dev, "failed to open master %s\n", master->name);
-		goto out;
-	}
-
-	if (dsa_switch_supports_uc_filtering(ds)) {
-		err = dsa_port_standalone_host_fdb_add(dp, dev->dev_addr, 0);
-		if (err)
-			goto out;
-	}
-
-	if (!ether_addr_equal(dev->dev_addr, master->dev_addr)) {
-		err = dev_uc_add(master, dev->dev_addr);
-		if (err < 0)
-			goto del_host_addr;
-	}
-
-	err = dsa_port_enable_rt(dp, dev->phydev);
-	if (err)
-		goto del_unicast;
-
-	return 0;
-
-del_unicast:
-	if (!ether_addr_equal(dev->dev_addr, master->dev_addr))
-		dev_uc_del(master, dev->dev_addr);
-del_host_addr:
-	if (dsa_switch_supports_uc_filtering(ds))
-		dsa_port_standalone_host_fdb_del(dp, dev->dev_addr, 0);
-out:
-	return err;
-}
-
-static int dsa_slave_close(struct net_device *dev)
-{
-	struct net_device *master = dsa_slave_to_master(dev);
-	struct dsa_port *dp = dsa_slave_to_port(dev);
-	struct dsa_switch *ds = dp->ds;
-
-	dsa_port_disable_rt(dp);
-
-	if (!ether_addr_equal(dev->dev_addr, master->dev_addr))
-		dev_uc_del(master, dev->dev_addr);
-
-	if (dsa_switch_supports_uc_filtering(ds))
-		dsa_port_standalone_host_fdb_del(dp, dev->dev_addr, 0);
-
-	return 0;
-}
-
-static void dsa_slave_manage_host_flood(struct net_device *dev)
-{
-	bool mc = dev->flags & (IFF_PROMISC | IFF_ALLMULTI);
-	struct dsa_port *dp = dsa_slave_to_port(dev);
-	bool uc = dev->flags & IFF_PROMISC;
-
-	dsa_port_set_host_flood(dp, uc, mc);
-}
-
-static void dsa_slave_change_rx_flags(struct net_device *dev, int change)
-{
-	struct net_device *master = dsa_slave_to_master(dev);
-	struct dsa_port *dp = dsa_slave_to_port(dev);
-	struct dsa_switch *ds = dp->ds;
-
-	if (change & IFF_ALLMULTI)
-		dev_set_allmulti(master,
-				 dev->flags & IFF_ALLMULTI ? 1 : -1);
-	if (change & IFF_PROMISC)
-		dev_set_promiscuity(master,
-				    dev->flags & IFF_PROMISC ? 1 : -1);
-
-	if (dsa_switch_supports_uc_filtering(ds) &&
-	    dsa_switch_supports_mc_filtering(ds))
-		dsa_slave_manage_host_flood(dev);
-}
-
-static void dsa_slave_set_rx_mode(struct net_device *dev)
-{
-	__dev_mc_sync(dev, dsa_slave_sync_mc, dsa_slave_unsync_mc);
-	__dev_uc_sync(dev, dsa_slave_sync_uc, dsa_slave_unsync_uc);
-}
-
-static int dsa_slave_set_mac_address(struct net_device *dev, void *a)
-{
-	struct net_device *master = dsa_slave_to_master(dev);
-	struct dsa_port *dp = dsa_slave_to_port(dev);
-	struct dsa_switch *ds = dp->ds;
-	struct sockaddr *addr = a;
-	int err;
-
-	if (!is_valid_ether_addr(addr->sa_data))
-		return -EADDRNOTAVAIL;
-
-	if (ds->ops->port_set_mac_address) {
-		err = ds->ops->port_set_mac_address(ds, dp->index,
-						    addr->sa_data);
-		if (err)
-			return err;
-	}
-
-	/* If the port is down, the address isn't synced yet to hardware or
-	 * to the DSA master, so there is nothing to change.
-	 */
-	if (!(dev->flags & IFF_UP))
-		goto out_change_dev_addr;
-
-	if (dsa_switch_supports_uc_filtering(ds)) {
-		err = dsa_port_standalone_host_fdb_add(dp, addr->sa_data, 0);
-		if (err)
-			return err;
-	}
-
-	if (!ether_addr_equal(addr->sa_data, master->dev_addr)) {
-		err = dev_uc_add(master, addr->sa_data);
-		if (err < 0)
-			goto del_unicast;
-	}
-
-	if (!ether_addr_equal(dev->dev_addr, master->dev_addr))
-		dev_uc_del(master, dev->dev_addr);
-
-	if (dsa_switch_supports_uc_filtering(ds))
-		dsa_port_standalone_host_fdb_del(dp, dev->dev_addr, 0);
-
-out_change_dev_addr:
-	eth_hw_addr_set(dev, addr->sa_data);
-
-	return 0;
-
-del_unicast:
-	if (dsa_switch_supports_uc_filtering(ds))
-		dsa_port_standalone_host_fdb_del(dp, addr->sa_data, 0);
-
-	return err;
-}
-
-struct dsa_slave_dump_ctx {
-	struct net_device *dev;
-	struct sk_buff *skb;
-	struct netlink_callback *cb;
-	int idx;
-};
-
-static int
-dsa_slave_port_fdb_do_dump(const unsigned char *addr, u16 vid,
-			   bool is_static, void *data)
-{
-	struct dsa_slave_dump_ctx *dump = data;
-	u32 portid = NETLINK_CB(dump->cb->skb).portid;
-	u32 seq = dump->cb->nlh->nlmsg_seq;
-	struct nlmsghdr *nlh;
-	struct ndmsg *ndm;
-
-	if (dump->idx < dump->cb->args[2])
-		goto skip;
-
-	nlh = nlmsg_put(dump->skb, portid, seq, RTM_NEWNEIGH,
-			sizeof(*ndm), NLM_F_MULTI);
-	if (!nlh)
-		return -EMSGSIZE;
-
-	ndm = nlmsg_data(nlh);
-	ndm->ndm_family  = AF_BRIDGE;
-	ndm->ndm_pad1    = 0;
-	ndm->ndm_pad2    = 0;
-	ndm->ndm_flags   = NTF_SELF;
-	ndm->ndm_type    = 0;
-	ndm->ndm_ifindex = dump->dev->ifindex;
-	ndm->ndm_state   = is_static ? NUD_NOARP : NUD_REACHABLE;
-
-	if (nla_put(dump->skb, NDA_LLADDR, ETH_ALEN, addr))
-		goto nla_put_failure;
-
-	if (vid && nla_put_u16(dump->skb, NDA_VLAN, vid))
-		goto nla_put_failure;
-
-	nlmsg_end(dump->skb, nlh);
-
-skip:
-	dump->idx++;
-	return 0;
-
-nla_put_failure:
-	nlmsg_cancel(dump->skb, nlh);
-	return -EMSGSIZE;
-}
-
-static int
-dsa_slave_fdb_dump(struct sk_buff *skb, struct netlink_callback *cb,
-		   struct net_device *dev, struct net_device *filter_dev,
-		   int *idx)
-{
-	struct dsa_port *dp = dsa_slave_to_port(dev);
-	struct dsa_slave_dump_ctx dump = {
-		.dev = dev,
-		.skb = skb,
-		.cb = cb,
-		.idx = *idx,
-	};
-	int err;
-
-	err = dsa_port_fdb_dump(dp, dsa_slave_port_fdb_do_dump, &dump);
-	*idx = dump.idx;
-
-	return err;
-}
-
-static int dsa_slave_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd)
-{
-	struct dsa_slave_priv *p = netdev_priv(dev);
-	struct dsa_switch *ds = p->dp->ds;
-	int port = p->dp->index;
-
-	/* Pass through to switch driver if it supports timestamping */
-	switch (cmd) {
-	case SIOCGHWTSTAMP:
-		if (ds->ops->port_hwtstamp_get)
-			return ds->ops->port_hwtstamp_get(ds, port, ifr);
-		break;
-	case SIOCSHWTSTAMP:
-		if (ds->ops->port_hwtstamp_set)
-			return ds->ops->port_hwtstamp_set(ds, port, ifr);
-		break;
-	}
-
-	return phylink_mii_ioctl(p->dp->pl, ifr, cmd);
-}
-
-static int dsa_slave_port_attr_set(struct net_device *dev, const void *ctx,
-				   const struct switchdev_attr *attr,
-				   struct netlink_ext_ack *extack)
-{
-	struct dsa_port *dp = dsa_slave_to_port(dev);
-	int ret;
-
-	if (ctx && ctx != dp)
-		return 0;
-
-	switch (attr->id) {
-	case SWITCHDEV_ATTR_ID_PORT_STP_STATE:
-		if (!dsa_port_offloads_bridge_port(dp, attr->orig_dev))
-			return -EOPNOTSUPP;
-
-		ret = dsa_port_set_state(dp, attr->u.stp_state, true);
-		break;
-	case SWITCHDEV_ATTR_ID_PORT_MST_STATE:
-		if (!dsa_port_offloads_bridge_port(dp, attr->orig_dev))
-			return -EOPNOTSUPP;
-
-		ret = dsa_port_set_mst_state(dp, &attr->u.mst_state, extack);
-		break;
-	case SWITCHDEV_ATTR_ID_BRIDGE_VLAN_FILTERING:
-		if (!dsa_port_offloads_bridge_dev(dp, attr->orig_dev))
-			return -EOPNOTSUPP;
-
-		ret = dsa_port_vlan_filtering(dp, attr->u.vlan_filtering,
-					      extack);
-		break;
-	case SWITCHDEV_ATTR_ID_BRIDGE_AGEING_TIME:
-		if (!dsa_port_offloads_bridge_dev(dp, attr->orig_dev))
-			return -EOPNOTSUPP;
-
-		ret = dsa_port_ageing_time(dp, attr->u.ageing_time);
-		break;
-	case SWITCHDEV_ATTR_ID_BRIDGE_MST:
-		if (!dsa_port_offloads_bridge_dev(dp, attr->orig_dev))
-			return -EOPNOTSUPP;
-
-		ret = dsa_port_mst_enable(dp, attr->u.mst, extack);
-		break;
-	case SWITCHDEV_ATTR_ID_PORT_PRE_BRIDGE_FLAGS:
-		if (!dsa_port_offloads_bridge_port(dp, attr->orig_dev))
-			return -EOPNOTSUPP;
-
-		ret = dsa_port_pre_bridge_flags(dp, attr->u.brport_flags,
-						extack);
-		break;
-	case SWITCHDEV_ATTR_ID_PORT_BRIDGE_FLAGS:
-		if (!dsa_port_offloads_bridge_port(dp, attr->orig_dev))
-			return -EOPNOTSUPP;
-
-		ret = dsa_port_bridge_flags(dp, attr->u.brport_flags, extack);
-		break;
-	case SWITCHDEV_ATTR_ID_VLAN_MSTI:
-		if (!dsa_port_offloads_bridge_dev(dp, attr->orig_dev))
-			return -EOPNOTSUPP;
-
-		ret = dsa_port_vlan_msti(dp, &attr->u.vlan_msti);
-		break;
-	default:
-		ret = -EOPNOTSUPP;
-		break;
-	}
-
-	return ret;
-}
-
-/* Must be called under rcu_read_lock() */
-static int
-dsa_slave_vlan_check_for_8021q_uppers(struct net_device *slave,
-				      const struct switchdev_obj_port_vlan *vlan)
-{
-	struct net_device *upper_dev;
-	struct list_head *iter;
-
-	netdev_for_each_upper_dev_rcu(slave, upper_dev, iter) {
-		u16 vid;
-
-		if (!is_vlan_dev(upper_dev))
-			continue;
-
-		vid = vlan_dev_vlan_id(upper_dev);
-		if (vid == vlan->vid)
-			return -EBUSY;
-	}
-
-	return 0;
-}
-
-static int dsa_slave_vlan_add(struct net_device *dev,
-			      const struct switchdev_obj *obj,
-			      struct netlink_ext_ack *extack)
-{
-	struct dsa_port *dp = dsa_slave_to_port(dev);
-	struct switchdev_obj_port_vlan *vlan;
-	int err;
-
-	if (dsa_port_skip_vlan_configuration(dp)) {
-		NL_SET_ERR_MSG_MOD(extack, "skipping configuration of VLAN");
-		return 0;
-	}
-
-	vlan = SWITCHDEV_OBJ_PORT_VLAN(obj);
-
-	/* Deny adding a bridge VLAN when there is already an 802.1Q upper with
-	 * the same VID.
-	 */
-	if (br_vlan_enabled(dsa_port_bridge_dev_get(dp))) {
-		rcu_read_lock();
-		err = dsa_slave_vlan_check_for_8021q_uppers(dev, vlan);
-		rcu_read_unlock();
-		if (err) {
-			NL_SET_ERR_MSG_MOD(extack,
-					   "Port already has a VLAN upper with this VID");
-			return err;
-		}
-	}
-
-	return dsa_port_vlan_add(dp, vlan, extack);
-}
-
-/* Offload a VLAN installed on the bridge or on a foreign interface by
- * installing it as a VLAN towards the CPU port.
- */
-static int dsa_slave_host_vlan_add(struct net_device *dev,
-				   const struct switchdev_obj *obj,
-				   struct netlink_ext_ack *extack)
-{
-	struct dsa_port *dp = dsa_slave_to_port(dev);
-	struct switchdev_obj_port_vlan vlan;
-
-	/* Do nothing if this is a software bridge */
-	if (!dp->bridge)
-		return -EOPNOTSUPP;
-
-	if (dsa_port_skip_vlan_configuration(dp)) {
-		NL_SET_ERR_MSG_MOD(extack, "skipping configuration of VLAN");
-		return 0;
-	}
-
-	vlan = *SWITCHDEV_OBJ_PORT_VLAN(obj);
-
-	/* Even though drivers often handle CPU membership in special ways,
-	 * it doesn't make sense to program a PVID, so clear this flag.
-	 */
-	vlan.flags &= ~BRIDGE_VLAN_INFO_PVID;
-
-	return dsa_port_host_vlan_add(dp, &vlan, extack);
-}
-
-static int dsa_slave_port_obj_add(struct net_device *dev, const void *ctx,
-				  const struct switchdev_obj *obj,
-				  struct netlink_ext_ack *extack)
-{
-	struct dsa_port *dp = dsa_slave_to_port(dev);
-	int err;
-
-	if (ctx && ctx != dp)
-		return 0;
-
-	switch (obj->id) {
-	case SWITCHDEV_OBJ_ID_PORT_MDB:
-		if (!dsa_port_offloads_bridge_port(dp, obj->orig_dev))
-			return -EOPNOTSUPP;
-
-		err = dsa_port_mdb_add(dp, SWITCHDEV_OBJ_PORT_MDB(obj));
-		break;
-	case SWITCHDEV_OBJ_ID_HOST_MDB:
-		if (!dsa_port_offloads_bridge_dev(dp, obj->orig_dev))
-			return -EOPNOTSUPP;
-
-		err = dsa_port_bridge_host_mdb_add(dp, SWITCHDEV_OBJ_PORT_MDB(obj));
-		break;
-	case SWITCHDEV_OBJ_ID_PORT_VLAN:
-		if (dsa_port_offloads_bridge_port(dp, obj->orig_dev))
-			err = dsa_slave_vlan_add(dev, obj, extack);
-		else
-			err = dsa_slave_host_vlan_add(dev, obj, extack);
-		break;
-	case SWITCHDEV_OBJ_ID_MRP:
-		if (!dsa_port_offloads_bridge_dev(dp, obj->orig_dev))
-			return -EOPNOTSUPP;
-
-		err = dsa_port_mrp_add(dp, SWITCHDEV_OBJ_MRP(obj));
-		break;
-	case SWITCHDEV_OBJ_ID_RING_ROLE_MRP:
-		if (!dsa_port_offloads_bridge_dev(dp, obj->orig_dev))
-			return -EOPNOTSUPP;
-
-		err = dsa_port_mrp_add_ring_role(dp,
-						 SWITCHDEV_OBJ_RING_ROLE_MRP(obj));
-		break;
-	default:
-		err = -EOPNOTSUPP;
-		break;
-	}
-
-	return err;
-}
-
-static int dsa_slave_vlan_del(struct net_device *dev,
-			      const struct switchdev_obj *obj)
-{
-	struct dsa_port *dp = dsa_slave_to_port(dev);
-	struct switchdev_obj_port_vlan *vlan;
-
-	if (dsa_port_skip_vlan_configuration(dp))
-		return 0;
-
-	vlan = SWITCHDEV_OBJ_PORT_VLAN(obj);
-
-	return dsa_port_vlan_del(dp, vlan);
-}
-
-static int dsa_slave_host_vlan_del(struct net_device *dev,
-				   const struct switchdev_obj *obj)
-{
-	struct dsa_port *dp = dsa_slave_to_port(dev);
-	struct switchdev_obj_port_vlan *vlan;
-
-	/* Do nothing if this is a software bridge */
-	if (!dp->bridge)
-		return -EOPNOTSUPP;
-
-	if (dsa_port_skip_vlan_configuration(dp))
-		return 0;
-
-	vlan = SWITCHDEV_OBJ_PORT_VLAN(obj);
-
-	return dsa_port_host_vlan_del(dp, vlan);
-}
-
-static int dsa_slave_port_obj_del(struct net_device *dev, const void *ctx,
-				  const struct switchdev_obj *obj)
-{
-	struct dsa_port *dp = dsa_slave_to_port(dev);
-	int err;
-
-	if (ctx && ctx != dp)
-		return 0;
-
-	switch (obj->id) {
-	case SWITCHDEV_OBJ_ID_PORT_MDB:
-		if (!dsa_port_offloads_bridge_port(dp, obj->orig_dev))
-			return -EOPNOTSUPP;
-
-		err = dsa_port_mdb_del(dp, SWITCHDEV_OBJ_PORT_MDB(obj));
-		break;
-	case SWITCHDEV_OBJ_ID_HOST_MDB:
-		if (!dsa_port_offloads_bridge_dev(dp, obj->orig_dev))
-			return -EOPNOTSUPP;
-
-		err = dsa_port_bridge_host_mdb_del(dp, SWITCHDEV_OBJ_PORT_MDB(obj));
-		break;
-	case SWITCHDEV_OBJ_ID_PORT_VLAN:
-		if (dsa_port_offloads_bridge_port(dp, obj->orig_dev))
-			err = dsa_slave_vlan_del(dev, obj);
-		else
-			err = dsa_slave_host_vlan_del(dev, obj);
-		break;
-	case SWITCHDEV_OBJ_ID_MRP:
-		if (!dsa_port_offloads_bridge_dev(dp, obj->orig_dev))
-			return -EOPNOTSUPP;
-
-		err = dsa_port_mrp_del(dp, SWITCHDEV_OBJ_MRP(obj));
-		break;
-	case SWITCHDEV_OBJ_ID_RING_ROLE_MRP:
-		if (!dsa_port_offloads_bridge_dev(dp, obj->orig_dev))
-			return -EOPNOTSUPP;
-
-		err = dsa_port_mrp_del_ring_role(dp,
-						 SWITCHDEV_OBJ_RING_ROLE_MRP(obj));
-		break;
-	default:
-		err = -EOPNOTSUPP;
-		break;
-	}
-
-	return err;
-}
-
-static inline netdev_tx_t dsa_slave_netpoll_send_skb(struct net_device *dev,
-						     struct sk_buff *skb)
-{
-#ifdef CONFIG_NET_POLL_CONTROLLER
-	struct dsa_slave_priv *p = netdev_priv(dev);
-
-	return netpoll_send_skb(p->netpoll, skb);
-#else
-	BUG();
-	return NETDEV_TX_OK;
-#endif
-}
-
-static void dsa_skb_tx_timestamp(struct dsa_slave_priv *p,
-				 struct sk_buff *skb)
-{
-	struct dsa_switch *ds = p->dp->ds;
-
-	if (!(skb_shinfo(skb)->tx_flags & SKBTX_HW_TSTAMP))
-		return;
-
-	if (!ds->ops->port_txtstamp)
-		return;
-
-	ds->ops->port_txtstamp(ds, p->dp->index, skb);
-}
-
-netdev_tx_t dsa_enqueue_skb(struct sk_buff *skb, struct net_device *dev)
-{
-	/* SKB for netpoll still need to be mangled with the protocol-specific
-	 * tag to be successfully transmitted
-	 */
-	if (unlikely(netpoll_tx_running(dev)))
-		return dsa_slave_netpoll_send_skb(dev, skb);
-
-	/* Queue the SKB for transmission on the parent interface, but
-	 * do not modify its EtherType
-	 */
-	skb->dev = dsa_slave_to_master(dev);
-	dev_queue_xmit(skb);
-
-	return NETDEV_TX_OK;
-}
-EXPORT_SYMBOL_GPL(dsa_enqueue_skb);
-
-static int dsa_realloc_skb(struct sk_buff *skb, struct net_device *dev)
-{
-	int needed_headroom = dev->needed_headroom;
-	int needed_tailroom = dev->needed_tailroom;
-
-	/* For tail taggers, we need to pad short frames ourselves, to ensure
-	 * that the tail tag does not fail at its role of being at the end of
-	 * the packet, once the master interface pads the frame. Account for
-	 * that pad length here, and pad later.
-	 */
-	if (unlikely(needed_tailroom && skb->len < ETH_ZLEN))
-		needed_tailroom += ETH_ZLEN - skb->len;
-	/* skb_headroom() returns unsigned int... */
-	needed_headroom = max_t(int, needed_headroom - skb_headroom(skb), 0);
-	needed_tailroom = max_t(int, needed_tailroom - skb_tailroom(skb), 0);
-
-	if (likely(!needed_headroom && !needed_tailroom && !skb_cloned(skb)))
-		/* No reallocation needed, yay! */
-		return 0;
-
-	return pskb_expand_head(skb, needed_headroom, needed_tailroom,
-				GFP_ATOMIC);
-}
-
-static netdev_tx_t dsa_slave_xmit(struct sk_buff *skb, struct net_device *dev)
-{
-	struct dsa_slave_priv *p = netdev_priv(dev);
-	struct sk_buff *nskb;
-
-	dev_sw_netstats_tx_add(dev, 1, skb->len);
-
-	memset(skb->cb, 0, sizeof(skb->cb));
-
-	/* Handle tx timestamp if any */
-	dsa_skb_tx_timestamp(p, skb);
-
-	if (dsa_realloc_skb(skb, dev)) {
-		dev_kfree_skb_any(skb);
-		return NETDEV_TX_OK;
-	}
-
-	/* needed_tailroom should still be 'warm' in the cache line from
-	 * dsa_realloc_skb(), which has also ensured that padding is safe.
-	 */
-	if (dev->needed_tailroom)
-		eth_skb_pad(skb);
-
-	/* Transmit function may have to reallocate the original SKB,
-	 * in which case it must have freed it. Only free it here on error.
-	 */
-	nskb = p->xmit(skb, dev);
-	if (!nskb) {
-		kfree_skb(skb);
-		return NETDEV_TX_OK;
-	}
-
-	return dsa_enqueue_skb(nskb, dev);
-}
-
-/* ethtool operations *******************************************************/
-
-static void dsa_slave_get_drvinfo(struct net_device *dev,
-				  struct ethtool_drvinfo *drvinfo)
-{
-	strscpy(drvinfo->driver, "dsa", sizeof(drvinfo->driver));
-	strscpy(drvinfo->fw_version, "N/A", sizeof(drvinfo->fw_version));
-	strscpy(drvinfo->bus_info, "platform", sizeof(drvinfo->bus_info));
-}
-
-static int dsa_slave_get_regs_len(struct net_device *dev)
-{
-	struct dsa_port *dp = dsa_slave_to_port(dev);
-	struct dsa_switch *ds = dp->ds;
-
-	if (ds->ops->get_regs_len)
-		return ds->ops->get_regs_len(ds, dp->index);
-
-	return -EOPNOTSUPP;
-}
-
-static void
-dsa_slave_get_regs(struct net_device *dev, struct ethtool_regs *regs, void *_p)
-{
-	struct dsa_port *dp = dsa_slave_to_port(dev);
-	struct dsa_switch *ds = dp->ds;
-
-	if (ds->ops->get_regs)
-		ds->ops->get_regs(ds, dp->index, regs, _p);
-}
-
-static int dsa_slave_nway_reset(struct net_device *dev)
-{
-	struct dsa_port *dp = dsa_slave_to_port(dev);
-
-	return phylink_ethtool_nway_reset(dp->pl);
-}
-
-static int dsa_slave_get_eeprom_len(struct net_device *dev)
-{
-	struct dsa_port *dp = dsa_slave_to_port(dev);
-	struct dsa_switch *ds = dp->ds;
-
-	if (ds->cd && ds->cd->eeprom_len)
-		return ds->cd->eeprom_len;
-
-	if (ds->ops->get_eeprom_len)
-		return ds->ops->get_eeprom_len(ds);
-
-	return 0;
-}
-
-static int dsa_slave_get_eeprom(struct net_device *dev,
-				struct ethtool_eeprom *eeprom, u8 *data)
-{
-	struct dsa_port *dp = dsa_slave_to_port(dev);
-	struct dsa_switch *ds = dp->ds;
-
-	if (ds->ops->get_eeprom)
-		return ds->ops->get_eeprom(ds, eeprom, data);
-
-	return -EOPNOTSUPP;
-}
-
-static int dsa_slave_set_eeprom(struct net_device *dev,
-				struct ethtool_eeprom *eeprom, u8 *data)
-{
-	struct dsa_port *dp = dsa_slave_to_port(dev);
-	struct dsa_switch *ds = dp->ds;
-
-	if (ds->ops->set_eeprom)
-		return ds->ops->set_eeprom(ds, eeprom, data);
-
-	return -EOPNOTSUPP;
-}
-
-static void dsa_slave_get_strings(struct net_device *dev,
-				  uint32_t stringset, uint8_t *data)
-{
-	struct dsa_port *dp = dsa_slave_to_port(dev);
-	struct dsa_switch *ds = dp->ds;
-
-	if (stringset == ETH_SS_STATS) {
-		int len = ETH_GSTRING_LEN;
-
-		strscpy_pad(data, "tx_packets", len);
-		strscpy_pad(data + len, "tx_bytes", len);
-		strscpy_pad(data + 2 * len, "rx_packets", len);
-		strscpy_pad(data + 3 * len, "rx_bytes", len);
-		if (ds->ops->get_strings)
-			ds->ops->get_strings(ds, dp->index, stringset,
-					     data + 4 * len);
-	} else if (stringset ==  ETH_SS_TEST) {
-		net_selftest_get_strings(data);
-	}
-
-}
-
-static void dsa_slave_get_ethtool_stats(struct net_device *dev,
-					struct ethtool_stats *stats,
-					uint64_t *data)
-{
-	struct dsa_port *dp = dsa_slave_to_port(dev);
-	struct dsa_switch *ds = dp->ds;
-	struct pcpu_sw_netstats *s;
-	unsigned int start;
-	int i;
-
-	for_each_possible_cpu(i) {
-		u64 tx_packets, tx_bytes, rx_packets, rx_bytes;
-
-		s = per_cpu_ptr(dev->tstats, i);
-		do {
-			start = u64_stats_fetch_begin(&s->syncp);
-			tx_packets = u64_stats_read(&s->tx_packets);
-			tx_bytes = u64_stats_read(&s->tx_bytes);
-			rx_packets = u64_stats_read(&s->rx_packets);
-			rx_bytes = u64_stats_read(&s->rx_bytes);
-		} while (u64_stats_fetch_retry(&s->syncp, start));
-		data[0] += tx_packets;
-		data[1] += tx_bytes;
-		data[2] += rx_packets;
-		data[3] += rx_bytes;
-	}
-	if (ds->ops->get_ethtool_stats)
-		ds->ops->get_ethtool_stats(ds, dp->index, data + 4);
-}
-
-static int dsa_slave_get_sset_count(struct net_device *dev, int sset)
-{
-	struct dsa_port *dp = dsa_slave_to_port(dev);
-	struct dsa_switch *ds = dp->ds;
-
-	if (sset == ETH_SS_STATS) {
-		int count = 0;
-
-		if (ds->ops->get_sset_count) {
-			count = ds->ops->get_sset_count(ds, dp->index, sset);
-			if (count < 0)
-				return count;
-		}
-
-		return count + 4;
-	} else if (sset ==  ETH_SS_TEST) {
-		return net_selftest_get_count();
-	}
-
-	return -EOPNOTSUPP;
-}
-
-static void dsa_slave_get_eth_phy_stats(struct net_device *dev,
-					struct ethtool_eth_phy_stats *phy_stats)
-{
-	struct dsa_port *dp = dsa_slave_to_port(dev);
-	struct dsa_switch *ds = dp->ds;
-
-	if (ds->ops->get_eth_phy_stats)
-		ds->ops->get_eth_phy_stats(ds, dp->index, phy_stats);
-}
-
-static void dsa_slave_get_eth_mac_stats(struct net_device *dev,
-					struct ethtool_eth_mac_stats *mac_stats)
-{
-	struct dsa_port *dp = dsa_slave_to_port(dev);
-	struct dsa_switch *ds = dp->ds;
-
-	if (ds->ops->get_eth_mac_stats)
-		ds->ops->get_eth_mac_stats(ds, dp->index, mac_stats);
-}
-
-static void
-dsa_slave_get_eth_ctrl_stats(struct net_device *dev,
-			     struct ethtool_eth_ctrl_stats *ctrl_stats)
-{
-	struct dsa_port *dp = dsa_slave_to_port(dev);
-	struct dsa_switch *ds = dp->ds;
-
-	if (ds->ops->get_eth_ctrl_stats)
-		ds->ops->get_eth_ctrl_stats(ds, dp->index, ctrl_stats);
-}
-
-static void
-dsa_slave_get_rmon_stats(struct net_device *dev,
-			 struct ethtool_rmon_stats *rmon_stats,
-			 const struct ethtool_rmon_hist_range **ranges)
-{
-	struct dsa_port *dp = dsa_slave_to_port(dev);
-	struct dsa_switch *ds = dp->ds;
-
-	if (ds->ops->get_rmon_stats)
-		ds->ops->get_rmon_stats(ds, dp->index, rmon_stats, ranges);
-}
-
-static void dsa_slave_net_selftest(struct net_device *ndev,
-				   struct ethtool_test *etest, u64 *buf)
-{
-	struct dsa_port *dp = dsa_slave_to_port(ndev);
-	struct dsa_switch *ds = dp->ds;
-
-	if (ds->ops->self_test) {
-		ds->ops->self_test(ds, dp->index, etest, buf);
-		return;
-	}
-
-	net_selftest(ndev, etest, buf);
-}
-
-static int dsa_slave_get_mm(struct net_device *dev,
-			    struct ethtool_mm_state *state)
-{
-	struct dsa_port *dp = dsa_slave_to_port(dev);
-	struct dsa_switch *ds = dp->ds;
-
-	if (!ds->ops->get_mm)
-		return -EOPNOTSUPP;
-
-	return ds->ops->get_mm(ds, dp->index, state);
-}
-
-static int dsa_slave_set_mm(struct net_device *dev, struct ethtool_mm_cfg *cfg,
-			    struct netlink_ext_ack *extack)
-{
-	struct dsa_port *dp = dsa_slave_to_port(dev);
-	struct dsa_switch *ds = dp->ds;
-
-	if (!ds->ops->set_mm)
-		return -EOPNOTSUPP;
-
-	return ds->ops->set_mm(ds, dp->index, cfg, extack);
-}
-
-static void dsa_slave_get_mm_stats(struct net_device *dev,
-				   struct ethtool_mm_stats *stats)
-{
-	struct dsa_port *dp = dsa_slave_to_port(dev);
-	struct dsa_switch *ds = dp->ds;
-
-	if (ds->ops->get_mm_stats)
-		ds->ops->get_mm_stats(ds, dp->index, stats);
-}
-
-static void dsa_slave_get_wol(struct net_device *dev, struct ethtool_wolinfo *w)
-{
-	struct dsa_port *dp = dsa_slave_to_port(dev);
-	struct dsa_switch *ds = dp->ds;
-
-	phylink_ethtool_get_wol(dp->pl, w);
-
-	if (ds->ops->get_wol)
-		ds->ops->get_wol(ds, dp->index, w);
-}
-
-static int dsa_slave_set_wol(struct net_device *dev, struct ethtool_wolinfo *w)
-{
-	struct dsa_port *dp = dsa_slave_to_port(dev);
-	struct dsa_switch *ds = dp->ds;
-	int ret = -EOPNOTSUPP;
-
-	phylink_ethtool_set_wol(dp->pl, w);
-
-	if (ds->ops->set_wol)
-		ret = ds->ops->set_wol(ds, dp->index, w);
-
-	return ret;
-}
-
-static int dsa_slave_set_eee(struct net_device *dev, struct ethtool_eee *e)
-{
-	struct dsa_port *dp = dsa_slave_to_port(dev);
-	struct dsa_switch *ds = dp->ds;
-	int ret;
-
-	/* Port's PHY and MAC both need to be EEE capable */
-	if (!dev->phydev || !dp->pl)
-		return -ENODEV;
-
-	if (!ds->ops->set_mac_eee)
-		return -EOPNOTSUPP;
-
-	ret = ds->ops->set_mac_eee(ds, dp->index, e);
-	if (ret)
-		return ret;
-
-	return phylink_ethtool_set_eee(dp->pl, e);
-}
-
-static int dsa_slave_get_eee(struct net_device *dev, struct ethtool_eee *e)
-{
-	struct dsa_port *dp = dsa_slave_to_port(dev);
-	struct dsa_switch *ds = dp->ds;
-	int ret;
-
-	/* Port's PHY and MAC both need to be EEE capable */
-	if (!dev->phydev || !dp->pl)
-		return -ENODEV;
-
-	if (!ds->ops->get_mac_eee)
-		return -EOPNOTSUPP;
-
-	ret = ds->ops->get_mac_eee(ds, dp->index, e);
-	if (ret)
-		return ret;
-
-	return phylink_ethtool_get_eee(dp->pl, e);
-}
-
-static int dsa_slave_get_link_ksettings(struct net_device *dev,
-					struct ethtool_link_ksettings *cmd)
-{
-	struct dsa_port *dp = dsa_slave_to_port(dev);
-
-	return phylink_ethtool_ksettings_get(dp->pl, cmd);
-}
-
-static int dsa_slave_set_link_ksettings(struct net_device *dev,
-					const struct ethtool_link_ksettings *cmd)
-{
-	struct dsa_port *dp = dsa_slave_to_port(dev);
-
-	return phylink_ethtool_ksettings_set(dp->pl, cmd);
-}
-
-static void dsa_slave_get_pause_stats(struct net_device *dev,
-				  struct ethtool_pause_stats *pause_stats)
-{
-	struct dsa_port *dp = dsa_slave_to_port(dev);
-	struct dsa_switch *ds = dp->ds;
-
-	if (ds->ops->get_pause_stats)
-		ds->ops->get_pause_stats(ds, dp->index, pause_stats);
-}
-
-static void dsa_slave_get_pauseparam(struct net_device *dev,
-				     struct ethtool_pauseparam *pause)
-{
-	struct dsa_port *dp = dsa_slave_to_port(dev);
-
-	phylink_ethtool_get_pauseparam(dp->pl, pause);
-}
-
-static int dsa_slave_set_pauseparam(struct net_device *dev,
-				    struct ethtool_pauseparam *pause)
-{
-	struct dsa_port *dp = dsa_slave_to_port(dev);
-
-	return phylink_ethtool_set_pauseparam(dp->pl, pause);
-}
-
-#ifdef CONFIG_NET_POLL_CONTROLLER
-static int dsa_slave_netpoll_setup(struct net_device *dev,
-				   struct netpoll_info *ni)
-{
-	struct net_device *master = dsa_slave_to_master(dev);
-	struct dsa_slave_priv *p = netdev_priv(dev);
-	struct netpoll *netpoll;
-	int err = 0;
-
-	netpoll = kzalloc(sizeof(*netpoll), GFP_KERNEL);
-	if (!netpoll)
-		return -ENOMEM;
-
-	err = __netpoll_setup(netpoll, master);
-	if (err) {
-		kfree(netpoll);
-		goto out;
-	}
-
-	p->netpoll = netpoll;
-out:
-	return err;
-}
-
-static void dsa_slave_netpoll_cleanup(struct net_device *dev)
-{
-	struct dsa_slave_priv *p = netdev_priv(dev);
-	struct netpoll *netpoll = p->netpoll;
-
-	if (!netpoll)
-		return;
-
-	p->netpoll = NULL;
-
-	__netpoll_free(netpoll);
-}
-
-static void dsa_slave_poll_controller(struct net_device *dev)
-{
-}
-#endif
-
-static struct dsa_mall_tc_entry *
-dsa_slave_mall_tc_entry_find(struct net_device *dev, unsigned long cookie)
-{
-	struct dsa_slave_priv *p = netdev_priv(dev);
-	struct dsa_mall_tc_entry *mall_tc_entry;
-
-	list_for_each_entry(mall_tc_entry, &p->mall_tc_list, list)
-		if (mall_tc_entry->cookie == cookie)
-			return mall_tc_entry;
-
-	return NULL;
-}
-
-static int
-dsa_slave_add_cls_matchall_mirred(struct net_device *dev,
-				  struct tc_cls_matchall_offload *cls,
-				  bool ingress)
-{
-	struct netlink_ext_ack *extack = cls->common.extack;
-	struct dsa_port *dp = dsa_slave_to_port(dev);
-	struct dsa_slave_priv *p = netdev_priv(dev);
-	struct dsa_mall_mirror_tc_entry *mirror;
-	struct dsa_mall_tc_entry *mall_tc_entry;
-	struct dsa_switch *ds = dp->ds;
-	struct flow_action_entry *act;
-	struct dsa_port *to_dp;
-	int err;
-
-	if (!ds->ops->port_mirror_add)
-		return -EOPNOTSUPP;
-
-	if (!flow_action_basic_hw_stats_check(&cls->rule->action,
-					      cls->common.extack))
-		return -EOPNOTSUPP;
-
-	act = &cls->rule->action.entries[0];
-
-	if (!act->dev)
-		return -EINVAL;
-
-	if (!dsa_slave_dev_check(act->dev))
-		return -EOPNOTSUPP;
-
-	mall_tc_entry = kzalloc(sizeof(*mall_tc_entry), GFP_KERNEL);
-	if (!mall_tc_entry)
-		return -ENOMEM;
-
-	mall_tc_entry->cookie = cls->cookie;
-	mall_tc_entry->type = DSA_PORT_MALL_MIRROR;
-	mirror = &mall_tc_entry->mirror;
-
-	to_dp = dsa_slave_to_port(act->dev);
-
-	mirror->to_local_port = to_dp->index;
-	mirror->ingress = ingress;
-
-	err = ds->ops->port_mirror_add(ds, dp->index, mirror, ingress, extack);
-	if (err) {
-		kfree(mall_tc_entry);
-		return err;
-	}
-
-	list_add_tail(&mall_tc_entry->list, &p->mall_tc_list);
-
-	return err;
-}
-
-static int
-dsa_slave_add_cls_matchall_police(struct net_device *dev,
-				  struct tc_cls_matchall_offload *cls,
-				  bool ingress)
-{
-	struct netlink_ext_ack *extack = cls->common.extack;
-	struct dsa_port *dp = dsa_slave_to_port(dev);
-	struct dsa_slave_priv *p = netdev_priv(dev);
-	struct dsa_mall_policer_tc_entry *policer;
-	struct dsa_mall_tc_entry *mall_tc_entry;
-	struct dsa_switch *ds = dp->ds;
-	struct flow_action_entry *act;
-	int err;
-
-	if (!ds->ops->port_policer_add) {
-		NL_SET_ERR_MSG_MOD(extack,
-				   "Policing offload not implemented");
-		return -EOPNOTSUPP;
-	}
-
-	if (!ingress) {
-		NL_SET_ERR_MSG_MOD(extack,
-				   "Only supported on ingress qdisc");
-		return -EOPNOTSUPP;
-	}
-
-	if (!flow_action_basic_hw_stats_check(&cls->rule->action,
-					      cls->common.extack))
-		return -EOPNOTSUPP;
-
-	list_for_each_entry(mall_tc_entry, &p->mall_tc_list, list) {
-		if (mall_tc_entry->type == DSA_PORT_MALL_POLICER) {
-			NL_SET_ERR_MSG_MOD(extack,
-					   "Only one port policer allowed");
-			return -EEXIST;
-		}
-	}
-
-	act = &cls->rule->action.entries[0];
-
-	mall_tc_entry = kzalloc(sizeof(*mall_tc_entry), GFP_KERNEL);
-	if (!mall_tc_entry)
-		return -ENOMEM;
-
-	mall_tc_entry->cookie = cls->cookie;
-	mall_tc_entry->type = DSA_PORT_MALL_POLICER;
-	policer = &mall_tc_entry->policer;
-	policer->rate_bytes_per_sec = act->police.rate_bytes_ps;
-	policer->burst = act->police.burst;
-
-	err = ds->ops->port_policer_add(ds, dp->index, policer);
-	if (err) {
-		kfree(mall_tc_entry);
-		return err;
-	}
-
-	list_add_tail(&mall_tc_entry->list, &p->mall_tc_list);
-
-	return err;
-}
-
-static int dsa_slave_add_cls_matchall(struct net_device *dev,
-				      struct tc_cls_matchall_offload *cls,
-				      bool ingress)
-{
-	int err = -EOPNOTSUPP;
-
-	if (cls->common.protocol == htons(ETH_P_ALL) &&
-	    flow_offload_has_one_action(&cls->rule->action) &&
-	    cls->rule->action.entries[0].id == FLOW_ACTION_MIRRED)
-		err = dsa_slave_add_cls_matchall_mirred(dev, cls, ingress);
-	else if (flow_offload_has_one_action(&cls->rule->action) &&
-		 cls->rule->action.entries[0].id == FLOW_ACTION_POLICE)
-		err = dsa_slave_add_cls_matchall_police(dev, cls, ingress);
-
-	return err;
-}
-
-static void dsa_slave_del_cls_matchall(struct net_device *dev,
-				       struct tc_cls_matchall_offload *cls)
-{
-	struct dsa_port *dp = dsa_slave_to_port(dev);
-	struct dsa_mall_tc_entry *mall_tc_entry;
-	struct dsa_switch *ds = dp->ds;
-
-	mall_tc_entry = dsa_slave_mall_tc_entry_find(dev, cls->cookie);
-	if (!mall_tc_entry)
-		return;
-
-	list_del(&mall_tc_entry->list);
-
-	switch (mall_tc_entry->type) {
-	case DSA_PORT_MALL_MIRROR:
-		if (ds->ops->port_mirror_del)
-			ds->ops->port_mirror_del(ds, dp->index,
-						 &mall_tc_entry->mirror);
-		break;
-	case DSA_PORT_MALL_POLICER:
-		if (ds->ops->port_policer_del)
-			ds->ops->port_policer_del(ds, dp->index);
-		break;
-	default:
-		WARN_ON(1);
-	}
-
-	kfree(mall_tc_entry);
-}
-
-static int dsa_slave_setup_tc_cls_matchall(struct net_device *dev,
-					   struct tc_cls_matchall_offload *cls,
-					   bool ingress)
-{
-	if (cls->common.chain_index)
-		return -EOPNOTSUPP;
-
-	switch (cls->command) {
-	case TC_CLSMATCHALL_REPLACE:
-		return dsa_slave_add_cls_matchall(dev, cls, ingress);
-	case TC_CLSMATCHALL_DESTROY:
-		dsa_slave_del_cls_matchall(dev, cls);
-		return 0;
-	default:
-		return -EOPNOTSUPP;
-	}
-}
-
-static int dsa_slave_add_cls_flower(struct net_device *dev,
-				    struct flow_cls_offload *cls,
-				    bool ingress)
-{
-	struct dsa_port *dp = dsa_slave_to_port(dev);
-	struct dsa_switch *ds = dp->ds;
-	int port = dp->index;
-
-	if (!ds->ops->cls_flower_add)
-		return -EOPNOTSUPP;
-
-	return ds->ops->cls_flower_add(ds, port, cls, ingress);
-}
-
-static int dsa_slave_del_cls_flower(struct net_device *dev,
-				    struct flow_cls_offload *cls,
-				    bool ingress)
-{
-	struct dsa_port *dp = dsa_slave_to_port(dev);
-	struct dsa_switch *ds = dp->ds;
-	int port = dp->index;
-
-	if (!ds->ops->cls_flower_del)
-		return -EOPNOTSUPP;
-
-	return ds->ops->cls_flower_del(ds, port, cls, ingress);
-}
-
-static int dsa_slave_stats_cls_flower(struct net_device *dev,
-				      struct flow_cls_offload *cls,
-				      bool ingress)
-{
-	struct dsa_port *dp = dsa_slave_to_port(dev);
-	struct dsa_switch *ds = dp->ds;
-	int port = dp->index;
-
-	if (!ds->ops->cls_flower_stats)
-		return -EOPNOTSUPP;
-
-	return ds->ops->cls_flower_stats(ds, port, cls, ingress);
-}
-
-static int dsa_slave_setup_tc_cls_flower(struct net_device *dev,
-					 struct flow_cls_offload *cls,
-					 bool ingress)
-{
-	switch (cls->command) {
-	case FLOW_CLS_REPLACE:
-		return dsa_slave_add_cls_flower(dev, cls, ingress);
-	case FLOW_CLS_DESTROY:
-		return dsa_slave_del_cls_flower(dev, cls, ingress);
-	case FLOW_CLS_STATS:
-		return dsa_slave_stats_cls_flower(dev, cls, ingress);
-	default:
-		return -EOPNOTSUPP;
-	}
-}
-
-static int dsa_slave_setup_tc_block_cb(enum tc_setup_type type, void *type_data,
-				       void *cb_priv, bool ingress)
-{
-	struct net_device *dev = cb_priv;
-
-	if (!tc_can_offload(dev))
-		return -EOPNOTSUPP;
-
-	switch (type) {
-	case TC_SETUP_CLSMATCHALL:
-		return dsa_slave_setup_tc_cls_matchall(dev, type_data, ingress);
-	case TC_SETUP_CLSFLOWER:
-		return dsa_slave_setup_tc_cls_flower(dev, type_data, ingress);
-	default:
-		return -EOPNOTSUPP;
-	}
-}
-
-static int dsa_slave_setup_tc_block_cb_ig(enum tc_setup_type type,
-					  void *type_data, void *cb_priv)
-{
-	return dsa_slave_setup_tc_block_cb(type, type_data, cb_priv, true);
-}
-
-static int dsa_slave_setup_tc_block_cb_eg(enum tc_setup_type type,
-					  void *type_data, void *cb_priv)
-{
-	return dsa_slave_setup_tc_block_cb(type, type_data, cb_priv, false);
-}
-
-static LIST_HEAD(dsa_slave_block_cb_list);
-
-static int dsa_slave_setup_tc_block(struct net_device *dev,
-				    struct flow_block_offload *f)
-{
-	struct flow_block_cb *block_cb;
-	flow_setup_cb_t *cb;
-
-	if (f->binder_type == FLOW_BLOCK_BINDER_TYPE_CLSACT_INGRESS)
-		cb = dsa_slave_setup_tc_block_cb_ig;
-	else if (f->binder_type == FLOW_BLOCK_BINDER_TYPE_CLSACT_EGRESS)
-		cb = dsa_slave_setup_tc_block_cb_eg;
-	else
-		return -EOPNOTSUPP;
-
-	f->driver_block_list = &dsa_slave_block_cb_list;
-
-	switch (f->command) {
-	case FLOW_BLOCK_BIND:
-		if (flow_block_cb_is_busy(cb, dev, &dsa_slave_block_cb_list))
-			return -EBUSY;
-
-		block_cb = flow_block_cb_alloc(cb, dev, dev, NULL);
-		if (IS_ERR(block_cb))
-			return PTR_ERR(block_cb);
-
-		flow_block_cb_add(block_cb, f);
-		list_add_tail(&block_cb->driver_list, &dsa_slave_block_cb_list);
-		return 0;
-	case FLOW_BLOCK_UNBIND:
-		block_cb = flow_block_cb_lookup(f->block, cb, dev);
-		if (!block_cb)
-			return -ENOENT;
-
-		flow_block_cb_remove(block_cb, f);
-		list_del(&block_cb->driver_list);
-		return 0;
-	default:
-		return -EOPNOTSUPP;
-	}
-}
-
-static int dsa_slave_setup_ft_block(struct dsa_switch *ds, int port,
-				    void *type_data)
-{
-	struct net_device *master = dsa_port_to_master(dsa_to_port(ds, port));
-
-	if (!master->netdev_ops->ndo_setup_tc)
-		return -EOPNOTSUPP;
-
-	return master->netdev_ops->ndo_setup_tc(master, TC_SETUP_FT, type_data);
-}
-
-static int dsa_slave_setup_tc(struct net_device *dev, enum tc_setup_type type,
-			      void *type_data)
-{
-	struct dsa_port *dp = dsa_slave_to_port(dev);
-	struct dsa_switch *ds = dp->ds;
-
-	switch (type) {
-	case TC_SETUP_BLOCK:
-		return dsa_slave_setup_tc_block(dev, type_data);
-	case TC_SETUP_FT:
-		return dsa_slave_setup_ft_block(ds, dp->index, type_data);
-	default:
-		break;
-	}
-
-	if (!ds->ops->port_setup_tc)
-		return -EOPNOTSUPP;
-
-	return ds->ops->port_setup_tc(ds, dp->index, type, type_data);
-}
-
-static int dsa_slave_get_rxnfc(struct net_device *dev,
-			       struct ethtool_rxnfc *nfc, u32 *rule_locs)
-{
-	struct dsa_port *dp = dsa_slave_to_port(dev);
-	struct dsa_switch *ds = dp->ds;
-
-	if (!ds->ops->get_rxnfc)
-		return -EOPNOTSUPP;
-
-	return ds->ops->get_rxnfc(ds, dp->index, nfc, rule_locs);
-}
-
-static int dsa_slave_set_rxnfc(struct net_device *dev,
-			       struct ethtool_rxnfc *nfc)
-{
-	struct dsa_port *dp = dsa_slave_to_port(dev);
-	struct dsa_switch *ds = dp->ds;
-
-	if (!ds->ops->set_rxnfc)
-		return -EOPNOTSUPP;
-
-	return ds->ops->set_rxnfc(ds, dp->index, nfc);
-}
-
-static int dsa_slave_get_ts_info(struct net_device *dev,
-				 struct ethtool_ts_info *ts)
-{
-	struct dsa_slave_priv *p = netdev_priv(dev);
-	struct dsa_switch *ds = p->dp->ds;
-
-	if (!ds->ops->get_ts_info)
-		return -EOPNOTSUPP;
-
-	return ds->ops->get_ts_info(ds, p->dp->index, ts);
-}
-
-static int dsa_slave_vlan_rx_add_vid(struct net_device *dev, __be16 proto,
-				     u16 vid)
-{
-	struct dsa_port *dp = dsa_slave_to_port(dev);
-	struct switchdev_obj_port_vlan vlan = {
-		.obj.id = SWITCHDEV_OBJ_ID_PORT_VLAN,
-		.vid = vid,
-		/* This API only allows programming tagged, non-PVID VIDs */
-		.flags = 0,
-	};
-	struct netlink_ext_ack extack = {0};
-	struct dsa_switch *ds = dp->ds;
-	struct netdev_hw_addr *ha;
-	struct dsa_vlan *v;
-	int ret;
-
-	/* User port... */
-	ret = dsa_port_vlan_add(dp, &vlan, &extack);
-	if (ret) {
-		if (extack._msg)
-			netdev_err(dev, "%s\n", extack._msg);
-		return ret;
-	}
-
-	/* And CPU port... */
-	ret = dsa_port_host_vlan_add(dp, &vlan, &extack);
-	if (ret) {
-		if (extack._msg)
-			netdev_err(dev, "CPU port %d: %s\n", dp->cpu_dp->index,
-				   extack._msg);
-		return ret;
-	}
-
-	if (!dsa_switch_supports_uc_filtering(ds) &&
-	    !dsa_switch_supports_mc_filtering(ds))
-		return 0;
-
-	v = kzalloc(sizeof(*v), GFP_KERNEL);
-	if (!v) {
-		ret = -ENOMEM;
-		goto rollback;
-	}
-
-	netif_addr_lock_bh(dev);
-
-	v->vid = vid;
-	list_add_tail(&v->list, &dp->user_vlans);
-
-	if (dsa_switch_supports_mc_filtering(ds)) {
-		netdev_for_each_synced_mc_addr(ha, dev) {
-			dsa_slave_schedule_standalone_work(dev, DSA_MC_ADD,
-							   ha->addr, vid);
-		}
-	}
-
-	if (dsa_switch_supports_uc_filtering(ds)) {
-		netdev_for_each_synced_uc_addr(ha, dev) {
-			dsa_slave_schedule_standalone_work(dev, DSA_UC_ADD,
-							   ha->addr, vid);
-		}
-	}
-
-	netif_addr_unlock_bh(dev);
-
-	dsa_flush_workqueue();
-
-	return 0;
-
-rollback:
-	dsa_port_host_vlan_del(dp, &vlan);
-	dsa_port_vlan_del(dp, &vlan);
-
-	return ret;
-}
-
-static int dsa_slave_vlan_rx_kill_vid(struct net_device *dev, __be16 proto,
-				      u16 vid)
-{
-	struct dsa_port *dp = dsa_slave_to_port(dev);
-	struct switchdev_obj_port_vlan vlan = {
-		.vid = vid,
-		/* This API only allows programming tagged, non-PVID VIDs */
-		.flags = 0,
-	};
-	struct dsa_switch *ds = dp->ds;
-	struct netdev_hw_addr *ha;
-	struct dsa_vlan *v;
-	int err;
-
-	err = dsa_port_vlan_del(dp, &vlan);
-	if (err)
-		return err;
-
-	err = dsa_port_host_vlan_del(dp, &vlan);
-	if (err)
-		return err;
-
-	if (!dsa_switch_supports_uc_filtering(ds) &&
-	    !dsa_switch_supports_mc_filtering(ds))
-		return 0;
-
-	netif_addr_lock_bh(dev);
-
-	v = dsa_vlan_find(&dp->user_vlans, &vlan);
-	if (!v) {
-		netif_addr_unlock_bh(dev);
-		return -ENOENT;
-	}
-
-	list_del(&v->list);
-	kfree(v);
-
-	if (dsa_switch_supports_mc_filtering(ds)) {
-		netdev_for_each_synced_mc_addr(ha, dev) {
-			dsa_slave_schedule_standalone_work(dev, DSA_MC_DEL,
-							   ha->addr, vid);
-		}
-	}
-
-	if (dsa_switch_supports_uc_filtering(ds)) {
-		netdev_for_each_synced_uc_addr(ha, dev) {
-			dsa_slave_schedule_standalone_work(dev, DSA_UC_DEL,
-							   ha->addr, vid);
-		}
-	}
-
-	netif_addr_unlock_bh(dev);
-
-	dsa_flush_workqueue();
-
-	return 0;
-}
-
-static int dsa_slave_restore_vlan(struct net_device *vdev, int vid, void *arg)
-{
-	__be16 proto = vdev ? vlan_dev_vlan_proto(vdev) : htons(ETH_P_8021Q);
-
-	return dsa_slave_vlan_rx_add_vid(arg, proto, vid);
-}
-
-static int dsa_slave_clear_vlan(struct net_device *vdev, int vid, void *arg)
-{
-	__be16 proto = vdev ? vlan_dev_vlan_proto(vdev) : htons(ETH_P_8021Q);
-
-	return dsa_slave_vlan_rx_kill_vid(arg, proto, vid);
-}
-
-/* Keep the VLAN RX filtering list in sync with the hardware only if VLAN
- * filtering is enabled. The baseline is that only ports that offload a
- * VLAN-aware bridge are VLAN-aware, and standalone ports are VLAN-unaware,
- * but there are exceptions for quirky hardware.
- *
- * If ds->vlan_filtering_is_global = true, then standalone ports which share
- * the same switch with other ports that offload a VLAN-aware bridge are also
- * inevitably VLAN-aware.
- *
- * To summarize, a DSA switch port offloads:
- *
- * - If standalone (this includes software bridge, software LAG):
- *     - if ds->needs_standalone_vlan_filtering = true, OR if
- *       (ds->vlan_filtering_is_global = true AND there are bridges spanning
- *       this switch chip which have vlan_filtering=1)
- *         - the 8021q upper VLANs
- *     - else (standalone VLAN filtering is not needed, VLAN filtering is not
- *       global, or it is, but no port is under a VLAN-aware bridge):
- *         - no VLAN (any 8021q upper is a software VLAN)
- *
- * - If under a vlan_filtering=0 bridge which it offload:
- *     - if ds->configure_vlan_while_not_filtering = true (default):
- *         - the bridge VLANs. These VLANs are committed to hardware but inactive.
- *     - else (deprecated):
- *         - no VLAN. The bridge VLANs are not restored when VLAN awareness is
- *           enabled, so this behavior is broken and discouraged.
- *
- * - If under a vlan_filtering=1 bridge which it offload:
- *     - the bridge VLANs
- *     - the 8021q upper VLANs
- */
-int dsa_slave_manage_vlan_filtering(struct net_device *slave,
-				    bool vlan_filtering)
-{
-	int err;
-
-	if (vlan_filtering) {
-		slave->features |= NETIF_F_HW_VLAN_CTAG_FILTER;
-
-		err = vlan_for_each(slave, dsa_slave_restore_vlan, slave);
-		if (err) {
-			vlan_for_each(slave, dsa_slave_clear_vlan, slave);
-			slave->features &= ~NETIF_F_HW_VLAN_CTAG_FILTER;
-			return err;
-		}
-	} else {
-		err = vlan_for_each(slave, dsa_slave_clear_vlan, slave);
-		if (err)
-			return err;
-
-		slave->features &= ~NETIF_F_HW_VLAN_CTAG_FILTER;
-	}
-
-	return 0;
-}
-
-struct dsa_hw_port {
-	struct list_head list;
-	struct net_device *dev;
-	int old_mtu;
-};
-
-static int dsa_hw_port_list_set_mtu(struct list_head *hw_port_list, int mtu)
-{
-	const struct dsa_hw_port *p;
-	int err;
-
-	list_for_each_entry(p, hw_port_list, list) {
-		if (p->dev->mtu == mtu)
-			continue;
-
-		err = dev_set_mtu(p->dev, mtu);
-		if (err)
-			goto rollback;
-	}
-
-	return 0;
-
-rollback:
-	list_for_each_entry_continue_reverse(p, hw_port_list, list) {
-		if (p->dev->mtu == p->old_mtu)
-			continue;
-
-		if (dev_set_mtu(p->dev, p->old_mtu))
-			netdev_err(p->dev, "Failed to restore MTU\n");
-	}
-
-	return err;
-}
-
-static void dsa_hw_port_list_free(struct list_head *hw_port_list)
-{
-	struct dsa_hw_port *p, *n;
-
-	list_for_each_entry_safe(p, n, hw_port_list, list)
-		kfree(p);
-}
-
-/* Make the hardware datapath to/from @dev limited to a common MTU */
-static void dsa_bridge_mtu_normalization(struct dsa_port *dp)
-{
-	struct list_head hw_port_list;
-	struct dsa_switch_tree *dst;
-	int min_mtu = ETH_MAX_MTU;
-	struct dsa_port *other_dp;
-	int err;
-
-	if (!dp->ds->mtu_enforcement_ingress)
-		return;
-
-	if (!dp->bridge)
-		return;
-
-	INIT_LIST_HEAD(&hw_port_list);
-
-	/* Populate the list of ports that are part of the same bridge
-	 * as the newly added/modified port
-	 */
-	list_for_each_entry(dst, &dsa_tree_list, list) {
-		list_for_each_entry(other_dp, &dst->ports, list) {
-			struct dsa_hw_port *hw_port;
-			struct net_device *slave;
-
-			if (other_dp->type != DSA_PORT_TYPE_USER)
-				continue;
-
-			if (!dsa_port_bridge_same(dp, other_dp))
-				continue;
-
-			if (!other_dp->ds->mtu_enforcement_ingress)
-				continue;
-
-			slave = other_dp->slave;
-
-			if (min_mtu > slave->mtu)
-				min_mtu = slave->mtu;
-
-			hw_port = kzalloc(sizeof(*hw_port), GFP_KERNEL);
-			if (!hw_port)
-				goto out;
-
-			hw_port->dev = slave;
-			hw_port->old_mtu = slave->mtu;
-
-			list_add(&hw_port->list, &hw_port_list);
-		}
-	}
-
-	/* Attempt to configure the entire hardware bridge to the newly added
-	 * interface's MTU first, regardless of whether the intention of the
-	 * user was to raise or lower it.
-	 */
-	err = dsa_hw_port_list_set_mtu(&hw_port_list, dp->slave->mtu);
-	if (!err)
-		goto out;
-
-	/* Clearly that didn't work out so well, so just set the minimum MTU on
-	 * all hardware bridge ports now. If this fails too, then all ports will
-	 * still have their old MTU rolled back anyway.
-	 */
-	dsa_hw_port_list_set_mtu(&hw_port_list, min_mtu);
-
-out:
-	dsa_hw_port_list_free(&hw_port_list);
-}
-
-int dsa_slave_change_mtu(struct net_device *dev, int new_mtu)
-{
-	struct net_device *master = dsa_slave_to_master(dev);
-	struct dsa_port *dp = dsa_slave_to_port(dev);
-	struct dsa_port *cpu_dp = dp->cpu_dp;
-	struct dsa_switch *ds = dp->ds;
-	struct dsa_port *other_dp;
-	int largest_mtu = 0;
-	int new_master_mtu;
-	int old_master_mtu;
-	int mtu_limit;
-	int overhead;
-	int cpu_mtu;
-	int err;
-
-	if (!ds->ops->port_change_mtu)
-		return -EOPNOTSUPP;
-
-	dsa_tree_for_each_user_port(other_dp, ds->dst) {
-		int slave_mtu;
-
-		/* During probe, this function will be called for each slave
-		 * device, while not all of them have been allocated. That's
-		 * ok, it doesn't change what the maximum is, so ignore it.
-		 */
-		if (!other_dp->slave)
-			continue;
-
-		/* Pretend that we already applied the setting, which we
-		 * actually haven't (still haven't done all integrity checks)
-		 */
-		if (dp == other_dp)
-			slave_mtu = new_mtu;
-		else
-			slave_mtu = other_dp->slave->mtu;
-
-		if (largest_mtu < slave_mtu)
-			largest_mtu = slave_mtu;
-	}
-
-	overhead = dsa_tag_protocol_overhead(cpu_dp->tag_ops);
-	mtu_limit = min_t(int, master->max_mtu, dev->max_mtu + overhead);
-	old_master_mtu = master->mtu;
-	new_master_mtu = largest_mtu + overhead;
-	if (new_master_mtu > mtu_limit)
-		return -ERANGE;
-
-	/* If the master MTU isn't over limit, there's no need to check the CPU
-	 * MTU, since that surely isn't either.
-	 */
-	cpu_mtu = largest_mtu;
-
-	/* Start applying stuff */
-	if (new_master_mtu != old_master_mtu) {
-		err = dev_set_mtu(master, new_master_mtu);
-		if (err < 0)
-			goto out_master_failed;
-
-		/* We only need to propagate the MTU of the CPU port to
-		 * upstream switches, so emit a notifier which updates them.
-		 */
-		err = dsa_port_mtu_change(cpu_dp, cpu_mtu);
-		if (err)
-			goto out_cpu_failed;
-	}
-
-	err = ds->ops->port_change_mtu(ds, dp->index, new_mtu);
-	if (err)
-		goto out_port_failed;
-
-	dev->mtu = new_mtu;
-
-	dsa_bridge_mtu_normalization(dp);
-
-	return 0;
-
-out_port_failed:
-	if (new_master_mtu != old_master_mtu)
-		dsa_port_mtu_change(cpu_dp, old_master_mtu - overhead);
-out_cpu_failed:
-	if (new_master_mtu != old_master_mtu)
-		dev_set_mtu(master, old_master_mtu);
-out_master_failed:
-	return err;
-}
-
-static int __maybe_unused
-dsa_slave_dcbnl_set_default_prio(struct net_device *dev, struct dcb_app *app)
-{
-	struct dsa_port *dp = dsa_slave_to_port(dev);
-	struct dsa_switch *ds = dp->ds;
-	unsigned long mask, new_prio;
-	int err, port = dp->index;
-
-	if (!ds->ops->port_set_default_prio)
-		return -EOPNOTSUPP;
-
-	err = dcb_ieee_setapp(dev, app);
-	if (err)
-		return err;
-
-	mask = dcb_ieee_getapp_mask(dev, app);
-	new_prio = __fls(mask);
-
-	err = ds->ops->port_set_default_prio(ds, port, new_prio);
-	if (err) {
-		dcb_ieee_delapp(dev, app);
-		return err;
-	}
-
-	return 0;
-}
-
-static int __maybe_unused
-dsa_slave_dcbnl_add_dscp_prio(struct net_device *dev, struct dcb_app *app)
-{
-	struct dsa_port *dp = dsa_slave_to_port(dev);
-	struct dsa_switch *ds = dp->ds;
-	unsigned long mask, new_prio;
-	int err, port = dp->index;
-	u8 dscp = app->protocol;
-
-	if (!ds->ops->port_add_dscp_prio)
-		return -EOPNOTSUPP;
-
-	if (dscp >= 64) {
-		netdev_err(dev, "DSCP APP entry with protocol value %u is invalid\n",
-			   dscp);
-		return -EINVAL;
-	}
-
-	err = dcb_ieee_setapp(dev, app);
-	if (err)
-		return err;
-
-	mask = dcb_ieee_getapp_mask(dev, app);
-	new_prio = __fls(mask);
-
-	err = ds->ops->port_add_dscp_prio(ds, port, dscp, new_prio);
-	if (err) {
-		dcb_ieee_delapp(dev, app);
-		return err;
-	}
-
-	return 0;
-}
-
-static int __maybe_unused dsa_slave_dcbnl_ieee_setapp(struct net_device *dev,
-						      struct dcb_app *app)
-{
-	switch (app->selector) {
-	case IEEE_8021QAZ_APP_SEL_ETHERTYPE:
-		switch (app->protocol) {
-		case 0:
-			return dsa_slave_dcbnl_set_default_prio(dev, app);
-		default:
-			return -EOPNOTSUPP;
-		}
-		break;
-	case IEEE_8021QAZ_APP_SEL_DSCP:
-		return dsa_slave_dcbnl_add_dscp_prio(dev, app);
-	default:
-		return -EOPNOTSUPP;
-	}
-}
-
-static int __maybe_unused
-dsa_slave_dcbnl_del_default_prio(struct net_device *dev, struct dcb_app *app)
-{
-	struct dsa_port *dp = dsa_slave_to_port(dev);
-	struct dsa_switch *ds = dp->ds;
-	unsigned long mask, new_prio;
-	int err, port = dp->index;
-
-	if (!ds->ops->port_set_default_prio)
-		return -EOPNOTSUPP;
-
-	err = dcb_ieee_delapp(dev, app);
-	if (err)
-		return err;
-
-	mask = dcb_ieee_getapp_mask(dev, app);
-	new_prio = mask ? __fls(mask) : 0;
-
-	err = ds->ops->port_set_default_prio(ds, port, new_prio);
-	if (err) {
-		dcb_ieee_setapp(dev, app);
-		return err;
-	}
-
-	return 0;
-}
-
-static int __maybe_unused
-dsa_slave_dcbnl_del_dscp_prio(struct net_device *dev, struct dcb_app *app)
-{
-	struct dsa_port *dp = dsa_slave_to_port(dev);
-	struct dsa_switch *ds = dp->ds;
-	int err, port = dp->index;
-	u8 dscp = app->protocol;
-
-	if (!ds->ops->port_del_dscp_prio)
-		return -EOPNOTSUPP;
-
-	err = dcb_ieee_delapp(dev, app);
-	if (err)
-		return err;
-
-	err = ds->ops->port_del_dscp_prio(ds, port, dscp, app->priority);
-	if (err) {
-		dcb_ieee_setapp(dev, app);
-		return err;
-	}
-
-	return 0;
-}
-
-static int __maybe_unused dsa_slave_dcbnl_ieee_delapp(struct net_device *dev,
-						      struct dcb_app *app)
-{
-	switch (app->selector) {
-	case IEEE_8021QAZ_APP_SEL_ETHERTYPE:
-		switch (app->protocol) {
-		case 0:
-			return dsa_slave_dcbnl_del_default_prio(dev, app);
-		default:
-			return -EOPNOTSUPP;
-		}
-		break;
-	case IEEE_8021QAZ_APP_SEL_DSCP:
-		return dsa_slave_dcbnl_del_dscp_prio(dev, app);
-	default:
-		return -EOPNOTSUPP;
-	}
-}
-
-/* Pre-populate the DCB application priority table with the priorities
- * configured during switch setup, which we read from hardware here.
- */
-static int dsa_slave_dcbnl_init(struct net_device *dev)
-{
-	struct dsa_port *dp = dsa_slave_to_port(dev);
-	struct dsa_switch *ds = dp->ds;
-	int port = dp->index;
-	int err;
-
-	if (ds->ops->port_get_default_prio) {
-		int prio = ds->ops->port_get_default_prio(ds, port);
-		struct dcb_app app = {
-			.selector = IEEE_8021QAZ_APP_SEL_ETHERTYPE,
-			.protocol = 0,
-			.priority = prio,
-		};
-
-		if (prio < 0)
-			return prio;
-
-		err = dcb_ieee_setapp(dev, &app);
-		if (err)
-			return err;
-	}
-
-	if (ds->ops->port_get_dscp_prio) {
-		int protocol;
-
-		for (protocol = 0; protocol < 64; protocol++) {
-			struct dcb_app app = {
-				.selector = IEEE_8021QAZ_APP_SEL_DSCP,
-				.protocol = protocol,
-			};
-			int prio;
-
-			prio = ds->ops->port_get_dscp_prio(ds, port, protocol);
-			if (prio == -EOPNOTSUPP)
-				continue;
-			if (prio < 0)
-				return prio;
-
-			app.priority = prio;
-
-			err = dcb_ieee_setapp(dev, &app);
-			if (err)
-				return err;
-		}
-	}
-
-	return 0;
-}
-
-static const struct ethtool_ops dsa_slave_ethtool_ops = {
-	.get_drvinfo		= dsa_slave_get_drvinfo,
-	.get_regs_len		= dsa_slave_get_regs_len,
-	.get_regs		= dsa_slave_get_regs,
-	.nway_reset		= dsa_slave_nway_reset,
-	.get_link		= ethtool_op_get_link,
-	.get_eeprom_len		= dsa_slave_get_eeprom_len,
-	.get_eeprom		= dsa_slave_get_eeprom,
-	.set_eeprom		= dsa_slave_set_eeprom,
-	.get_strings		= dsa_slave_get_strings,
-	.get_ethtool_stats	= dsa_slave_get_ethtool_stats,
-	.get_sset_count		= dsa_slave_get_sset_count,
-	.get_eth_phy_stats	= dsa_slave_get_eth_phy_stats,
-	.get_eth_mac_stats	= dsa_slave_get_eth_mac_stats,
-	.get_eth_ctrl_stats	= dsa_slave_get_eth_ctrl_stats,
-	.get_rmon_stats		= dsa_slave_get_rmon_stats,
-	.set_wol		= dsa_slave_set_wol,
-	.get_wol		= dsa_slave_get_wol,
-	.set_eee		= dsa_slave_set_eee,
-	.get_eee		= dsa_slave_get_eee,
-	.get_link_ksettings	= dsa_slave_get_link_ksettings,
-	.set_link_ksettings	= dsa_slave_set_link_ksettings,
-	.get_pause_stats	= dsa_slave_get_pause_stats,
-	.get_pauseparam		= dsa_slave_get_pauseparam,
-	.set_pauseparam		= dsa_slave_set_pauseparam,
-	.get_rxnfc		= dsa_slave_get_rxnfc,
-	.set_rxnfc		= dsa_slave_set_rxnfc,
-	.get_ts_info		= dsa_slave_get_ts_info,
-	.self_test		= dsa_slave_net_selftest,
-	.get_mm			= dsa_slave_get_mm,
-	.set_mm			= dsa_slave_set_mm,
-	.get_mm_stats		= dsa_slave_get_mm_stats,
-};
-
-static const struct dcbnl_rtnl_ops __maybe_unused dsa_slave_dcbnl_ops = {
-	.ieee_setapp		= dsa_slave_dcbnl_ieee_setapp,
-	.ieee_delapp		= dsa_slave_dcbnl_ieee_delapp,
-};
-
-static void dsa_slave_get_stats64(struct net_device *dev,
-				  struct rtnl_link_stats64 *s)
-{
-	struct dsa_port *dp = dsa_slave_to_port(dev);
-	struct dsa_switch *ds = dp->ds;
-
-	if (ds->ops->get_stats64)
-		ds->ops->get_stats64(ds, dp->index, s);
-	else
-		dev_get_tstats64(dev, s);
-}
-
-static int dsa_slave_fill_forward_path(struct net_device_path_ctx *ctx,
-				       struct net_device_path *path)
-{
-	struct dsa_port *dp = dsa_slave_to_port(ctx->dev);
-	struct net_device *master = dsa_port_to_master(dp);
-	struct dsa_port *cpu_dp = dp->cpu_dp;
-
-	path->dev = ctx->dev;
-	path->type = DEV_PATH_DSA;
-	path->dsa.proto = cpu_dp->tag_ops->proto;
-	path->dsa.port = dp->index;
-	ctx->dev = master;
-
-	return 0;
-}
-
-static const struct net_device_ops dsa_slave_netdev_ops = {
-	.ndo_open	 	= dsa_slave_open,
-	.ndo_stop		= dsa_slave_close,
-	.ndo_start_xmit		= dsa_slave_xmit,
-	.ndo_change_rx_flags	= dsa_slave_change_rx_flags,
-	.ndo_set_rx_mode	= dsa_slave_set_rx_mode,
-	.ndo_set_mac_address	= dsa_slave_set_mac_address,
-	.ndo_fdb_dump		= dsa_slave_fdb_dump,
-	.ndo_eth_ioctl		= dsa_slave_ioctl,
-	.ndo_get_iflink		= dsa_slave_get_iflink,
-#ifdef CONFIG_NET_POLL_CONTROLLER
-	.ndo_netpoll_setup	= dsa_slave_netpoll_setup,
-	.ndo_netpoll_cleanup	= dsa_slave_netpoll_cleanup,
-	.ndo_poll_controller	= dsa_slave_poll_controller,
-#endif
-	.ndo_setup_tc		= dsa_slave_setup_tc,
-	.ndo_get_stats64	= dsa_slave_get_stats64,
-	.ndo_vlan_rx_add_vid	= dsa_slave_vlan_rx_add_vid,
-	.ndo_vlan_rx_kill_vid	= dsa_slave_vlan_rx_kill_vid,
-	.ndo_change_mtu		= dsa_slave_change_mtu,
-	.ndo_fill_forward_path	= dsa_slave_fill_forward_path,
-};
-
-static struct device_type dsa_type = {
-	.name	= "dsa",
-};
-
-void dsa_port_phylink_mac_change(struct dsa_switch *ds, int port, bool up)
-{
-	const struct dsa_port *dp = dsa_to_port(ds, port);
-
-	if (dp->pl)
-		phylink_mac_change(dp->pl, up);
-}
-EXPORT_SYMBOL_GPL(dsa_port_phylink_mac_change);
-
-static void dsa_slave_phylink_fixed_state(struct phylink_config *config,
-					  struct phylink_link_state *state)
-{
-	struct dsa_port *dp = container_of(config, struct dsa_port, pl_config);
-	struct dsa_switch *ds = dp->ds;
-
-	/* No need to check that this operation is valid, the callback would
-	 * not be called if it was not.
-	 */
-	ds->ops->phylink_fixed_state(ds, dp->index, state);
-}
-
-/* slave device setup *******************************************************/
-static int dsa_slave_phy_connect(struct net_device *slave_dev, int addr,
-				 u32 flags)
-{
-	struct dsa_port *dp = dsa_slave_to_port(slave_dev);
-	struct dsa_switch *ds = dp->ds;
-
-	slave_dev->phydev = mdiobus_get_phy(ds->slave_mii_bus, addr);
-	if (!slave_dev->phydev) {
-		netdev_err(slave_dev, "no phy at %d\n", addr);
-		return -ENODEV;
-	}
-
-	slave_dev->phydev->dev_flags |= flags;
-
-	return phylink_connect_phy(dp->pl, slave_dev->phydev);
-}
-
-static int dsa_slave_phy_setup(struct net_device *slave_dev)
-{
-	struct dsa_port *dp = dsa_slave_to_port(slave_dev);
-	struct device_node *port_dn = dp->dn;
-	struct dsa_switch *ds = dp->ds;
-	u32 phy_flags = 0;
-	int ret;
-
-	dp->pl_config.dev = &slave_dev->dev;
-	dp->pl_config.type = PHYLINK_NETDEV;
-
-	/* The get_fixed_state callback takes precedence over polling the
-	 * link GPIO in PHYLINK (see phylink_get_fixed_state).  Only set
-	 * this if the switch provides such a callback.
-	 */
-	if (ds->ops->phylink_fixed_state) {
-		dp->pl_config.get_fixed_state = dsa_slave_phylink_fixed_state;
-		dp->pl_config.poll_fixed_state = true;
-	}
-
-	ret = dsa_port_phylink_create(dp);
-	if (ret)
-		return ret;
-
-	if (ds->ops->get_phy_flags)
-		phy_flags = ds->ops->get_phy_flags(ds, dp->index);
-
-	ret = phylink_of_phy_connect(dp->pl, port_dn, phy_flags);
-	if (ret == -ENODEV && ds->slave_mii_bus) {
-		/* We could not connect to a designated PHY or SFP, so try to
-		 * use the switch internal MDIO bus instead
-		 */
-		ret = dsa_slave_phy_connect(slave_dev, dp->index, phy_flags);
-	}
-	if (ret) {
-		netdev_err(slave_dev, "failed to connect to PHY: %pe\n",
-			   ERR_PTR(ret));
-		dsa_port_phylink_destroy(dp);
-	}
-
-	return ret;
-}
-
-void dsa_slave_setup_tagger(struct net_device *slave)
-{
-	struct dsa_port *dp = dsa_slave_to_port(slave);
-	struct net_device *master = dsa_port_to_master(dp);
-	struct dsa_slave_priv *p = netdev_priv(slave);
-	const struct dsa_port *cpu_dp = dp->cpu_dp;
-	const struct dsa_switch *ds = dp->ds;
-
-	slave->needed_headroom = cpu_dp->tag_ops->needed_headroom;
-	slave->needed_tailroom = cpu_dp->tag_ops->needed_tailroom;
-	/* Try to save one extra realloc later in the TX path (in the master)
-	 * by also inheriting the master's needed headroom and tailroom.
-	 * The 8021q driver also does this.
-	 */
-	slave->needed_headroom += master->needed_headroom;
-	slave->needed_tailroom += master->needed_tailroom;
-
-	p->xmit = cpu_dp->tag_ops->xmit;
-
-	slave->features = master->vlan_features | NETIF_F_HW_TC;
-	slave->hw_features |= NETIF_F_HW_TC;
-	slave->features |= NETIF_F_LLTX;
-	if (slave->needed_tailroom)
-		slave->features &= ~(NETIF_F_SG | NETIF_F_FRAGLIST);
-	if (ds->needs_standalone_vlan_filtering)
-		slave->features |= NETIF_F_HW_VLAN_CTAG_FILTER;
-}
-
-int dsa_slave_suspend(struct net_device *slave_dev)
-{
-	struct dsa_port *dp = dsa_slave_to_port(slave_dev);
-
-	if (!netif_running(slave_dev))
-		return 0;
-
-	netif_device_detach(slave_dev);
-
-	rtnl_lock();
-	phylink_stop(dp->pl);
-	rtnl_unlock();
-
-	return 0;
-}
-
-int dsa_slave_resume(struct net_device *slave_dev)
-{
-	struct dsa_port *dp = dsa_slave_to_port(slave_dev);
-
-	if (!netif_running(slave_dev))
-		return 0;
-
-	netif_device_attach(slave_dev);
-
-	rtnl_lock();
-	phylink_start(dp->pl);
-	rtnl_unlock();
-
-	return 0;
-}
-
-int dsa_slave_create(struct dsa_port *port)
-{
-	struct net_device *master = dsa_port_to_master(port);
-	struct dsa_switch *ds = port->ds;
-	struct net_device *slave_dev;
-	struct dsa_slave_priv *p;
-	const char *name;
-	int assign_type;
-	int ret;
-
-	if (!ds->num_tx_queues)
-		ds->num_tx_queues = 1;
-
-	if (port->name) {
-		name = port->name;
-		assign_type = NET_NAME_PREDICTABLE;
-	} else {
-		name = "eth%d";
-		assign_type = NET_NAME_ENUM;
-	}
-
-	slave_dev = alloc_netdev_mqs(sizeof(struct dsa_slave_priv), name,
-				     assign_type, ether_setup,
-				     ds->num_tx_queues, 1);
-	if (slave_dev == NULL)
-		return -ENOMEM;
-
-	slave_dev->rtnl_link_ops = &dsa_link_ops;
-	slave_dev->ethtool_ops = &dsa_slave_ethtool_ops;
-#if IS_ENABLED(CONFIG_DCB)
-	slave_dev->dcbnl_ops = &dsa_slave_dcbnl_ops;
-#endif
-	if (!is_zero_ether_addr(port->mac))
-		eth_hw_addr_set(slave_dev, port->mac);
-	else
-		eth_hw_addr_inherit(slave_dev, master);
-	slave_dev->priv_flags |= IFF_NO_QUEUE;
-	if (dsa_switch_supports_uc_filtering(ds))
-		slave_dev->priv_flags |= IFF_UNICAST_FLT;
-	slave_dev->netdev_ops = &dsa_slave_netdev_ops;
-	if (ds->ops->port_max_mtu)
-		slave_dev->max_mtu = ds->ops->port_max_mtu(ds, port->index);
-	SET_NETDEV_DEVTYPE(slave_dev, &dsa_type);
-
-	SET_NETDEV_DEV(slave_dev, port->ds->dev);
-	SET_NETDEV_DEVLINK_PORT(slave_dev, &port->devlink_port);
-	slave_dev->dev.of_node = port->dn;
-	slave_dev->vlan_features = master->vlan_features;
-
-	p = netdev_priv(slave_dev);
-	slave_dev->tstats = netdev_alloc_pcpu_stats(struct pcpu_sw_netstats);
-	if (!slave_dev->tstats) {
-		free_netdev(slave_dev);
-		return -ENOMEM;
-	}
-
-	ret = gro_cells_init(&p->gcells, slave_dev);
-	if (ret)
-		goto out_free;
-
-	p->dp = port;
-	INIT_LIST_HEAD(&p->mall_tc_list);
-	port->slave = slave_dev;
-	dsa_slave_setup_tagger(slave_dev);
-
-	netif_carrier_off(slave_dev);
-
-	ret = dsa_slave_phy_setup(slave_dev);
-	if (ret) {
-		netdev_err(slave_dev,
-			   "error %d setting up PHY for tree %d, switch %d, port %d\n",
-			   ret, ds->dst->index, ds->index, port->index);
-		goto out_gcells;
-	}
-
-	rtnl_lock();
-
-	ret = dsa_slave_change_mtu(slave_dev, ETH_DATA_LEN);
-	if (ret && ret != -EOPNOTSUPP)
-		dev_warn(ds->dev, "nonfatal error %d setting MTU to %d on port %d\n",
-			 ret, ETH_DATA_LEN, port->index);
-
-	ret = register_netdevice(slave_dev);
-	if (ret) {
-		netdev_err(master, "error %d registering interface %s\n",
-			   ret, slave_dev->name);
-		rtnl_unlock();
-		goto out_phy;
-	}
-
-	if (IS_ENABLED(CONFIG_DCB)) {
-		ret = dsa_slave_dcbnl_init(slave_dev);
-		if (ret) {
-			netdev_err(slave_dev,
-				   "failed to initialize DCB: %pe\n",
-				   ERR_PTR(ret));
-			rtnl_unlock();
-			goto out_unregister;
-		}
-	}
-
-	ret = netdev_upper_dev_link(master, slave_dev, NULL);
-
-	rtnl_unlock();
-
-	if (ret)
-		goto out_unregister;
-
-	return 0;
-
-out_unregister:
-	unregister_netdev(slave_dev);
-out_phy:
-	rtnl_lock();
-	phylink_disconnect_phy(p->dp->pl);
-	rtnl_unlock();
-	dsa_port_phylink_destroy(p->dp);
-out_gcells:
-	gro_cells_destroy(&p->gcells);
-out_free:
-	free_percpu(slave_dev->tstats);
-	free_netdev(slave_dev);
-	port->slave = NULL;
-	return ret;
-}
-
-void dsa_slave_destroy(struct net_device *slave_dev)
-{
-	struct net_device *master = dsa_slave_to_master(slave_dev);
-	struct dsa_port *dp = dsa_slave_to_port(slave_dev);
-	struct dsa_slave_priv *p = netdev_priv(slave_dev);
-
-	netif_carrier_off(slave_dev);
-	rtnl_lock();
-	netdev_upper_dev_unlink(master, slave_dev);
-	unregister_netdevice(slave_dev);
-	phylink_disconnect_phy(dp->pl);
-	rtnl_unlock();
-
-	dsa_port_phylink_destroy(dp);
-	gro_cells_destroy(&p->gcells);
-	free_percpu(slave_dev->tstats);
-	free_netdev(slave_dev);
-}
-
-int dsa_slave_change_master(struct net_device *dev, struct net_device *master,
-			    struct netlink_ext_ack *extack)
-{
-	struct net_device *old_master = dsa_slave_to_master(dev);
-	struct dsa_port *dp = dsa_slave_to_port(dev);
-	struct dsa_switch *ds = dp->ds;
-	struct net_device *upper;
-	struct list_head *iter;
-	int err;
-
-	if (master == old_master)
-		return 0;
-
-	if (!ds->ops->port_change_master) {
-		NL_SET_ERR_MSG_MOD(extack,
-				   "Driver does not support changing DSA master");
-		return -EOPNOTSUPP;
-	}
-
-	if (!netdev_uses_dsa(master)) {
-		NL_SET_ERR_MSG_MOD(extack,
-				   "Interface not eligible as DSA master");
-		return -EOPNOTSUPP;
-	}
-
-	netdev_for_each_upper_dev_rcu(master, upper, iter) {
-		if (dsa_slave_dev_check(upper))
-			continue;
-		if (netif_is_bridge_master(upper))
-			continue;
-		NL_SET_ERR_MSG_MOD(extack, "Cannot join master with unknown uppers");
-		return -EOPNOTSUPP;
-	}
-
-	/* Since we allow live-changing the DSA master, plus we auto-open the
-	 * DSA master when the user port opens => we need to ensure that the
-	 * new DSA master is open too.
-	 */
-	if (dev->flags & IFF_UP) {
-		err = dev_open(master, extack);
-		if (err)
-			return err;
-	}
-
-	netdev_upper_dev_unlink(old_master, dev);
-
-	err = netdev_upper_dev_link(master, dev, extack);
-	if (err)
-		goto out_revert_old_master_unlink;
-
-	err = dsa_port_change_master(dp, master, extack);
-	if (err)
-		goto out_revert_master_link;
-
-	/* Update the MTU of the new CPU port through cross-chip notifiers */
-	err = dsa_slave_change_mtu(dev, dev->mtu);
-	if (err && err != -EOPNOTSUPP) {
-		netdev_warn(dev,
-			    "nonfatal error updating MTU with new master: %pe\n",
-			    ERR_PTR(err));
-	}
-
-	/* If the port doesn't have its own MAC address and relies on the DSA
-	 * master's one, inherit it again from the new DSA master.
-	 */
-	if (is_zero_ether_addr(dp->mac))
-		eth_hw_addr_inherit(dev, master);
-
-	return 0;
-
-out_revert_master_link:
-	netdev_upper_dev_unlink(master, dev);
-out_revert_old_master_unlink:
-	netdev_upper_dev_link(old_master, dev, NULL);
-	return err;
-}
-
-bool dsa_slave_dev_check(const struct net_device *dev)
-{
-	return dev->netdev_ops == &dsa_slave_netdev_ops;
-}
-EXPORT_SYMBOL_GPL(dsa_slave_dev_check);
-
-static int dsa_slave_changeupper(struct net_device *dev,
-				 struct netdev_notifier_changeupper_info *info)
-{
-	struct dsa_port *dp = dsa_slave_to_port(dev);
-	struct netlink_ext_ack *extack;
-	int err = NOTIFY_DONE;
-
-	if (!dsa_slave_dev_check(dev))
-		return err;
-
-	extack = netdev_notifier_info_to_extack(&info->info);
-
-	if (netif_is_bridge_master(info->upper_dev)) {
-		if (info->linking) {
-			err = dsa_port_bridge_join(dp, info->upper_dev, extack);
-			if (!err)
-				dsa_bridge_mtu_normalization(dp);
-			if (err == -EOPNOTSUPP) {
-				NL_SET_ERR_MSG_WEAK_MOD(extack,
-							"Offloading not supported");
-				err = 0;
-			}
-			err = notifier_from_errno(err);
-		} else {
-			dsa_port_bridge_leave(dp, info->upper_dev);
-			err = NOTIFY_OK;
-		}
-	} else if (netif_is_lag_master(info->upper_dev)) {
-		if (info->linking) {
-			err = dsa_port_lag_join(dp, info->upper_dev,
-						info->upper_info, extack);
-			if (err == -EOPNOTSUPP) {
-				NL_SET_ERR_MSG_WEAK_MOD(extack,
-							"Offloading not supported");
-				err = 0;
-			}
-			err = notifier_from_errno(err);
-		} else {
-			dsa_port_lag_leave(dp, info->upper_dev);
-			err = NOTIFY_OK;
-		}
-	} else if (is_hsr_master(info->upper_dev)) {
-		if (info->linking) {
-			err = dsa_port_hsr_join(dp, info->upper_dev, extack);
-			if (err == -EOPNOTSUPP) {
-				NL_SET_ERR_MSG_WEAK_MOD(extack,
-							"Offloading not supported");
-				err = 0;
-			}
-			err = notifier_from_errno(err);
-		} else {
-			dsa_port_hsr_leave(dp, info->upper_dev);
-			err = NOTIFY_OK;
-		}
-	}
-
-	return err;
-}
-
-static int dsa_slave_prechangeupper(struct net_device *dev,
-				    struct netdev_notifier_changeupper_info *info)
-{
-	struct dsa_port *dp = dsa_slave_to_port(dev);
-
-	if (!dsa_slave_dev_check(dev))
-		return NOTIFY_DONE;
-
-	if (netif_is_bridge_master(info->upper_dev) && !info->linking)
-		dsa_port_pre_bridge_leave(dp, info->upper_dev);
-	else if (netif_is_lag_master(info->upper_dev) && !info->linking)
-		dsa_port_pre_lag_leave(dp, info->upper_dev);
-	/* dsa_port_pre_hsr_leave is not yet necessary since hsr cannot be
-	 * meaningfully enslaved to a bridge yet
-	 */
-
-	return NOTIFY_DONE;
-}
-
-static int
-dsa_slave_lag_changeupper(struct net_device *dev,
-			  struct netdev_notifier_changeupper_info *info)
-{
-	struct net_device *lower;
-	struct list_head *iter;
-	int err = NOTIFY_DONE;
-	struct dsa_port *dp;
-
-	if (!netif_is_lag_master(dev))
-		return err;
-
-	netdev_for_each_lower_dev(dev, lower, iter) {
-		if (!dsa_slave_dev_check(lower))
-			continue;
-
-		dp = dsa_slave_to_port(lower);
-		if (!dp->lag)
-			/* Software LAG */
-			continue;
-
-		err = dsa_slave_changeupper(lower, info);
-		if (notifier_to_errno(err))
-			break;
-	}
-
-	return err;
-}
-
-/* Same as dsa_slave_lag_changeupper() except that it calls
- * dsa_slave_prechangeupper()
- */
-static int
-dsa_slave_lag_prechangeupper(struct net_device *dev,
-			     struct netdev_notifier_changeupper_info *info)
-{
-	struct net_device *lower;
-	struct list_head *iter;
-	int err = NOTIFY_DONE;
-	struct dsa_port *dp;
-
-	if (!netif_is_lag_master(dev))
-		return err;
-
-	netdev_for_each_lower_dev(dev, lower, iter) {
-		if (!dsa_slave_dev_check(lower))
-			continue;
-
-		dp = dsa_slave_to_port(lower);
-		if (!dp->lag)
-			/* Software LAG */
-			continue;
-
-		err = dsa_slave_prechangeupper(lower, info);
-		if (notifier_to_errno(err))
-			break;
-	}
-
-	return err;
-}
-
-static int
-dsa_prevent_bridging_8021q_upper(struct net_device *dev,
-				 struct netdev_notifier_changeupper_info *info)
-{
-	struct netlink_ext_ack *ext_ack;
-	struct net_device *slave, *br;
-	struct dsa_port *dp;
-
-	ext_ack = netdev_notifier_info_to_extack(&info->info);
-
-	if (!is_vlan_dev(dev))
-		return NOTIFY_DONE;
-
-	slave = vlan_dev_real_dev(dev);
-	if (!dsa_slave_dev_check(slave))
-		return NOTIFY_DONE;
-
-	dp = dsa_slave_to_port(slave);
-	br = dsa_port_bridge_dev_get(dp);
-	if (!br)
-		return NOTIFY_DONE;
-
-	/* Deny enslaving a VLAN device into a VLAN-aware bridge */
-	if (br_vlan_enabled(br) &&
-	    netif_is_bridge_master(info->upper_dev) && info->linking) {
-		NL_SET_ERR_MSG_MOD(ext_ack,
-				   "Cannot enslave VLAN device into VLAN aware bridge");
-		return notifier_from_errno(-EINVAL);
-	}
-
-	return NOTIFY_DONE;
-}
-
-static int
-dsa_slave_check_8021q_upper(struct net_device *dev,
-			    struct netdev_notifier_changeupper_info *info)
-{
-	struct dsa_port *dp = dsa_slave_to_port(dev);
-	struct net_device *br = dsa_port_bridge_dev_get(dp);
-	struct bridge_vlan_info br_info;
-	struct netlink_ext_ack *extack;
-	int err = NOTIFY_DONE;
-	u16 vid;
-
-	if (!br || !br_vlan_enabled(br))
-		return NOTIFY_DONE;
-
-	extack = netdev_notifier_info_to_extack(&info->info);
-	vid = vlan_dev_vlan_id(info->upper_dev);
-
-	/* br_vlan_get_info() returns -EINVAL or -ENOENT if the
-	 * device, respectively the VID is not found, returning
-	 * 0 means success, which is a failure for us here.
-	 */
-	err = br_vlan_get_info(br, vid, &br_info);
-	if (err == 0) {
-		NL_SET_ERR_MSG_MOD(extack,
-				   "This VLAN is already configured by the bridge");
-		return notifier_from_errno(-EBUSY);
-	}
-
-	return NOTIFY_DONE;
-}
-
-static int
-dsa_slave_prechangeupper_sanity_check(struct net_device *dev,
-				      struct netdev_notifier_changeupper_info *info)
-{
-	struct dsa_switch *ds;
-	struct dsa_port *dp;
-	int err;
-
-	if (!dsa_slave_dev_check(dev))
-		return dsa_prevent_bridging_8021q_upper(dev, info);
-
-	dp = dsa_slave_to_port(dev);
-	ds = dp->ds;
-
-	if (ds->ops->port_prechangeupper) {
-		err = ds->ops->port_prechangeupper(ds, dp->index, info);
-		if (err)
-			return notifier_from_errno(err);
-	}
-
-	if (is_vlan_dev(info->upper_dev))
-		return dsa_slave_check_8021q_upper(dev, info);
-
-	return NOTIFY_DONE;
-}
-
-/* To be eligible as a DSA master, a LAG must have all lower interfaces be
- * eligible DSA masters. Additionally, all LAG slaves must be DSA masters of
- * switches in the same switch tree.
- */
-static int dsa_lag_master_validate(struct net_device *lag_dev,
-				   struct netlink_ext_ack *extack)
-{
-	struct net_device *lower1, *lower2;
-	struct list_head *iter1, *iter2;
-
-	netdev_for_each_lower_dev(lag_dev, lower1, iter1) {
-		netdev_for_each_lower_dev(lag_dev, lower2, iter2) {
-			if (!netdev_uses_dsa(lower1) ||
-			    !netdev_uses_dsa(lower2)) {
-				NL_SET_ERR_MSG_MOD(extack,
-						   "All LAG ports must be eligible as DSA masters");
-				return notifier_from_errno(-EINVAL);
-			}
-
-			if (lower1 == lower2)
-				continue;
-
-			if (!dsa_port_tree_same(lower1->dsa_ptr,
-						lower2->dsa_ptr)) {
-				NL_SET_ERR_MSG_MOD(extack,
-						   "LAG contains DSA masters of disjoint switch trees");
-				return notifier_from_errno(-EINVAL);
-			}
-		}
-	}
-
-	return NOTIFY_DONE;
-}
-
-static int
-dsa_master_prechangeupper_sanity_check(struct net_device *master,
-				       struct netdev_notifier_changeupper_info *info)
-{
-	struct netlink_ext_ack *extack = netdev_notifier_info_to_extack(&info->info);
-
-	if (!netdev_uses_dsa(master))
-		return NOTIFY_DONE;
-
-	if (!info->linking)
-		return NOTIFY_DONE;
-
-	/* Allow DSA switch uppers */
-	if (dsa_slave_dev_check(info->upper_dev))
-		return NOTIFY_DONE;
-
-	/* Allow bridge uppers of DSA masters, subject to further
-	 * restrictions in dsa_bridge_prechangelower_sanity_check()
-	 */
-	if (netif_is_bridge_master(info->upper_dev))
-		return NOTIFY_DONE;
-
-	/* Allow LAG uppers, subject to further restrictions in
-	 * dsa_lag_master_prechangelower_sanity_check()
-	 */
-	if (netif_is_lag_master(info->upper_dev))
-		return dsa_lag_master_validate(info->upper_dev, extack);
-
-	NL_SET_ERR_MSG_MOD(extack,
-			   "DSA master cannot join unknown upper interfaces");
-	return notifier_from_errno(-EBUSY);
-}
-
-static int
-dsa_lag_master_prechangelower_sanity_check(struct net_device *dev,
-					   struct netdev_notifier_changeupper_info *info)
-{
-	struct netlink_ext_ack *extack = netdev_notifier_info_to_extack(&info->info);
-	struct net_device *lag_dev = info->upper_dev;
-	struct net_device *lower;
-	struct list_head *iter;
-
-	if (!netdev_uses_dsa(lag_dev) || !netif_is_lag_master(lag_dev))
-		return NOTIFY_DONE;
-
-	if (!info->linking)
-		return NOTIFY_DONE;
-
-	if (!netdev_uses_dsa(dev)) {
-		NL_SET_ERR_MSG(extack,
-			       "Only DSA masters can join a LAG DSA master");
-		return notifier_from_errno(-EINVAL);
-	}
-
-	netdev_for_each_lower_dev(lag_dev, lower, iter) {
-		if (!dsa_port_tree_same(dev->dsa_ptr, lower->dsa_ptr)) {
-			NL_SET_ERR_MSG(extack,
-				       "Interface is DSA master for a different switch tree than this LAG");
-			return notifier_from_errno(-EINVAL);
-		}
-
-		break;
-	}
-
-	return NOTIFY_DONE;
-}
-
-/* Don't allow bridging of DSA masters, since the bridge layer rx_handler
- * prevents the DSA fake ethertype handler to be invoked, so we don't get the
- * chance to strip off and parse the DSA switch tag protocol header (the bridge
- * layer just returns RX_HANDLER_CONSUMED, stopping RX processing for these
- * frames).
- * The only case where that would not be an issue is when bridging can already
- * be offloaded, such as when the DSA master is itself a DSA or plain switchdev
- * port, and is bridged only with other ports from the same hardware device.
- */
-static int
-dsa_bridge_prechangelower_sanity_check(struct net_device *new_lower,
-				       struct netdev_notifier_changeupper_info *info)
-{
-	struct net_device *br = info->upper_dev;
-	struct netlink_ext_ack *extack;
-	struct net_device *lower;
-	struct list_head *iter;
-
-	if (!netif_is_bridge_master(br))
-		return NOTIFY_DONE;
-
-	if (!info->linking)
-		return NOTIFY_DONE;
-
-	extack = netdev_notifier_info_to_extack(&info->info);
-
-	netdev_for_each_lower_dev(br, lower, iter) {
-		if (!netdev_uses_dsa(new_lower) && !netdev_uses_dsa(lower))
-			continue;
-
-		if (!netdev_port_same_parent_id(lower, new_lower)) {
-			NL_SET_ERR_MSG(extack,
-				       "Cannot do software bridging with a DSA master");
-			return notifier_from_errno(-EINVAL);
-		}
-	}
-
-	return NOTIFY_DONE;
-}
-
-static void dsa_tree_migrate_ports_from_lag_master(struct dsa_switch_tree *dst,
-						   struct net_device *lag_dev)
-{
-	struct net_device *new_master = dsa_tree_find_first_master(dst);
-	struct dsa_port *dp;
-	int err;
-
-	dsa_tree_for_each_user_port(dp, dst) {
-		if (dsa_port_to_master(dp) != lag_dev)
-			continue;
-
-		err = dsa_slave_change_master(dp->slave, new_master, NULL);
-		if (err) {
-			netdev_err(dp->slave,
-				   "failed to restore master to %s: %pe\n",
-				   new_master->name, ERR_PTR(err));
-		}
-	}
-}
-
-static int dsa_master_lag_join(struct net_device *master,
-			       struct net_device *lag_dev,
-			       struct netdev_lag_upper_info *uinfo,
-			       struct netlink_ext_ack *extack)
-{
-	struct dsa_port *cpu_dp = master->dsa_ptr;
-	struct dsa_switch_tree *dst = cpu_dp->dst;
-	struct dsa_port *dp;
-	int err;
-
-	err = dsa_master_lag_setup(lag_dev, cpu_dp, uinfo, extack);
-	if (err)
-		return err;
-
-	dsa_tree_for_each_user_port(dp, dst) {
-		if (dsa_port_to_master(dp) != master)
-			continue;
-
-		err = dsa_slave_change_master(dp->slave, lag_dev, extack);
-		if (err)
-			goto restore;
-	}
-
-	return 0;
-
-restore:
-	dsa_tree_for_each_user_port_continue_reverse(dp, dst) {
-		if (dsa_port_to_master(dp) != lag_dev)
-			continue;
-
-		err = dsa_slave_change_master(dp->slave, master, NULL);
-		if (err) {
-			netdev_err(dp->slave,
-				   "failed to restore master to %s: %pe\n",
-				   master->name, ERR_PTR(err));
-		}
-	}
-
-	dsa_master_lag_teardown(lag_dev, master->dsa_ptr);
-
-	return err;
-}
-
-static void dsa_master_lag_leave(struct net_device *master,
-				 struct net_device *lag_dev)
-{
-	struct dsa_port *dp, *cpu_dp = lag_dev->dsa_ptr;
-	struct dsa_switch_tree *dst = cpu_dp->dst;
-	struct dsa_port *new_cpu_dp = NULL;
-	struct net_device *lower;
-	struct list_head *iter;
-
-	netdev_for_each_lower_dev(lag_dev, lower, iter) {
-		if (netdev_uses_dsa(lower)) {
-			new_cpu_dp = lower->dsa_ptr;
-			break;
-		}
-	}
-
-	if (new_cpu_dp) {
-		/* Update the CPU port of the user ports still under the LAG
-		 * so that dsa_port_to_master() continues to work properly
-		 */
-		dsa_tree_for_each_user_port(dp, dst)
-			if (dsa_port_to_master(dp) == lag_dev)
-				dp->cpu_dp = new_cpu_dp;
-
-		/* Update the index of the virtual CPU port to match the lowest
-		 * physical CPU port
-		 */
-		lag_dev->dsa_ptr = new_cpu_dp;
-		wmb();
-	} else {
-		/* If the LAG DSA master has no ports left, migrate back all
-		 * user ports to the first physical CPU port
-		 */
-		dsa_tree_migrate_ports_from_lag_master(dst, lag_dev);
-	}
-
-	/* This DSA master has left its LAG in any case, so let
-	 * the CPU port leave the hardware LAG as well
-	 */
-	dsa_master_lag_teardown(lag_dev, master->dsa_ptr);
-}
-
-static int dsa_master_changeupper(struct net_device *dev,
-				  struct netdev_notifier_changeupper_info *info)
-{
-	struct netlink_ext_ack *extack;
-	int err = NOTIFY_DONE;
-
-	if (!netdev_uses_dsa(dev))
-		return err;
-
-	extack = netdev_notifier_info_to_extack(&info->info);
-
-	if (netif_is_lag_master(info->upper_dev)) {
-		if (info->linking) {
-			err = dsa_master_lag_join(dev, info->upper_dev,
-						  info->upper_info, extack);
-			err = notifier_from_errno(err);
-		} else {
-			dsa_master_lag_leave(dev, info->upper_dev);
-			err = NOTIFY_OK;
-		}
-	}
-
-	return err;
-}
-
-static int dsa_slave_netdevice_event(struct notifier_block *nb,
-				     unsigned long event, void *ptr)
-{
-	struct net_device *dev = netdev_notifier_info_to_dev(ptr);
-
-	switch (event) {
-	case NETDEV_PRECHANGEUPPER: {
-		struct netdev_notifier_changeupper_info *info = ptr;
-		int err;
-
-		err = dsa_slave_prechangeupper_sanity_check(dev, info);
-		if (notifier_to_errno(err))
-			return err;
-
-		err = dsa_master_prechangeupper_sanity_check(dev, info);
-		if (notifier_to_errno(err))
-			return err;
-
-		err = dsa_lag_master_prechangelower_sanity_check(dev, info);
-		if (notifier_to_errno(err))
-			return err;
-
-		err = dsa_bridge_prechangelower_sanity_check(dev, info);
-		if (notifier_to_errno(err))
-			return err;
-
-		err = dsa_slave_prechangeupper(dev, ptr);
-		if (notifier_to_errno(err))
-			return err;
-
-		err = dsa_slave_lag_prechangeupper(dev, ptr);
-		if (notifier_to_errno(err))
-			return err;
-
-		break;
-	}
-	case NETDEV_CHANGEUPPER: {
-		int err;
-
-		err = dsa_slave_changeupper(dev, ptr);
-		if (notifier_to_errno(err))
-			return err;
-
-		err = dsa_slave_lag_changeupper(dev, ptr);
-		if (notifier_to_errno(err))
-			return err;
-
-		err = dsa_master_changeupper(dev, ptr);
-		if (notifier_to_errno(err))
-			return err;
-
-		break;
-	}
-	case NETDEV_CHANGELOWERSTATE: {
-		struct netdev_notifier_changelowerstate_info *info = ptr;
-		struct dsa_port *dp;
-		int err = 0;
-
-		if (dsa_slave_dev_check(dev)) {
-			dp = dsa_slave_to_port(dev);
-
-			err = dsa_port_lag_change(dp, info->lower_state_info);
-		}
-
-		/* Mirror LAG port events on DSA masters that are in
-		 * a LAG towards their respective switch CPU ports
-		 */
-		if (netdev_uses_dsa(dev)) {
-			dp = dev->dsa_ptr;
-
-			err = dsa_port_lag_change(dp, info->lower_state_info);
-		}
-
-		return notifier_from_errno(err);
-	}
-	case NETDEV_CHANGE:
-	case NETDEV_UP: {
-		/* Track state of master port.
-		 * DSA driver may require the master port (and indirectly
-		 * the tagger) to be available for some special operation.
-		 */
-		if (netdev_uses_dsa(dev)) {
-			struct dsa_port *cpu_dp = dev->dsa_ptr;
-			struct dsa_switch_tree *dst = cpu_dp->ds->dst;
-
-			/* Track when the master port is UP */
-			dsa_tree_master_oper_state_change(dst, dev,
-							  netif_oper_up(dev));
-
-			/* Track when the master port is ready and can accept
-			 * packet.
-			 * NETDEV_UP event is not enough to flag a port as ready.
-			 * We also have to wait for linkwatch_do_dev to dev_activate
-			 * and emit a NETDEV_CHANGE event.
-			 * We check if a master port is ready by checking if the dev
-			 * have a qdisc assigned and is not noop.
-			 */
-			dsa_tree_master_admin_state_change(dst, dev,
-							   !qdisc_tx_is_noop(dev));
-
-			return NOTIFY_OK;
-		}
-
-		return NOTIFY_DONE;
-	}
-	case NETDEV_GOING_DOWN: {
-		struct dsa_port *dp, *cpu_dp;
-		struct dsa_switch_tree *dst;
-		LIST_HEAD(close_list);
-
-		if (!netdev_uses_dsa(dev))
-			return NOTIFY_DONE;
-
-		cpu_dp = dev->dsa_ptr;
-		dst = cpu_dp->ds->dst;
-
-		dsa_tree_master_admin_state_change(dst, dev, false);
-
-		list_for_each_entry(dp, &dst->ports, list) {
-			if (!dsa_port_is_user(dp))
-				continue;
-
-			if (dp->cpu_dp != cpu_dp)
-				continue;
-
-			list_add(&dp->slave->close_list, &close_list);
-		}
-
-		dev_close_many(&close_list, true);
-
-		return NOTIFY_OK;
-	}
-	default:
-		break;
-	}
-
-	return NOTIFY_DONE;
-}
-
-static void
-dsa_fdb_offload_notify(struct dsa_switchdev_event_work *switchdev_work)
-{
-	struct switchdev_notifier_fdb_info info = {};
-
-	info.addr = switchdev_work->addr;
-	info.vid = switchdev_work->vid;
-	info.offloaded = true;
-	call_switchdev_notifiers(SWITCHDEV_FDB_OFFLOADED,
-				 switchdev_work->orig_dev, &info.info, NULL);
-}
-
-static void dsa_slave_switchdev_event_work(struct work_struct *work)
-{
-	struct dsa_switchdev_event_work *switchdev_work =
-		container_of(work, struct dsa_switchdev_event_work, work);
-	const unsigned char *addr = switchdev_work->addr;
-	struct net_device *dev = switchdev_work->dev;
-	u16 vid = switchdev_work->vid;
-	struct dsa_switch *ds;
-	struct dsa_port *dp;
-	int err;
-
-	dp = dsa_slave_to_port(dev);
-	ds = dp->ds;
-
-	switch (switchdev_work->event) {
-	case SWITCHDEV_FDB_ADD_TO_DEVICE:
-		if (switchdev_work->host_addr)
-			err = dsa_port_bridge_host_fdb_add(dp, addr, vid);
-		else if (dp->lag)
-			err = dsa_port_lag_fdb_add(dp, addr, vid);
-		else
-			err = dsa_port_fdb_add(dp, addr, vid);
-		if (err) {
-			dev_err(ds->dev,
-				"port %d failed to add %pM vid %d to fdb: %d\n",
-				dp->index, addr, vid, err);
-			break;
-		}
-		dsa_fdb_offload_notify(switchdev_work);
-		break;
-
-	case SWITCHDEV_FDB_DEL_TO_DEVICE:
-		if (switchdev_work->host_addr)
-			err = dsa_port_bridge_host_fdb_del(dp, addr, vid);
-		else if (dp->lag)
-			err = dsa_port_lag_fdb_del(dp, addr, vid);
-		else
-			err = dsa_port_fdb_del(dp, addr, vid);
-		if (err) {
-			dev_err(ds->dev,
-				"port %d failed to delete %pM vid %d from fdb: %d\n",
-				dp->index, addr, vid, err);
-		}
-
-		break;
-	}
-
-	kfree(switchdev_work);
-}
-
-static bool dsa_foreign_dev_check(const struct net_device *dev,
-				  const struct net_device *foreign_dev)
-{
-	const struct dsa_port *dp = dsa_slave_to_port(dev);
-	struct dsa_switch_tree *dst = dp->ds->dst;
-
-	if (netif_is_bridge_master(foreign_dev))
-		return !dsa_tree_offloads_bridge_dev(dst, foreign_dev);
-
-	if (netif_is_bridge_port(foreign_dev))
-		return !dsa_tree_offloads_bridge_port(dst, foreign_dev);
-
-	/* Everything else is foreign */
-	return true;
-}
-
-static int dsa_slave_fdb_event(struct net_device *dev,
-			       struct net_device *orig_dev,
-			       unsigned long event, const void *ctx,
-			       const struct switchdev_notifier_fdb_info *fdb_info)
-{
-	struct dsa_switchdev_event_work *switchdev_work;
-	struct dsa_port *dp = dsa_slave_to_port(dev);
-	bool host_addr = fdb_info->is_local;
-	struct dsa_switch *ds = dp->ds;
-
-	if (ctx && ctx != dp)
-		return 0;
-
-	if (!dp->bridge)
-		return 0;
-
-	if (switchdev_fdb_is_dynamically_learned(fdb_info)) {
-		if (dsa_port_offloads_bridge_port(dp, orig_dev))
-			return 0;
-
-		/* FDB entries learned by the software bridge or by foreign
-		 * bridge ports should be installed as host addresses only if
-		 * the driver requests assisted learning.
-		 */
-		if (!ds->assisted_learning_on_cpu_port)
-			return 0;
-	}
-
-	/* Also treat FDB entries on foreign interfaces bridged with us as host
-	 * addresses.
-	 */
-	if (dsa_foreign_dev_check(dev, orig_dev))
-		host_addr = true;
-
-	/* Check early that we're not doing work in vain.
-	 * Host addresses on LAG ports still require regular FDB ops,
-	 * since the CPU port isn't in a LAG.
-	 */
-	if (dp->lag && !host_addr) {
-		if (!ds->ops->lag_fdb_add || !ds->ops->lag_fdb_del)
-			return -EOPNOTSUPP;
-	} else {
-		if (!ds->ops->port_fdb_add || !ds->ops->port_fdb_del)
-			return -EOPNOTSUPP;
-	}
-
-	switchdev_work = kzalloc(sizeof(*switchdev_work), GFP_ATOMIC);
-	if (!switchdev_work)
-		return -ENOMEM;
-
-	netdev_dbg(dev, "%s FDB entry towards %s, addr %pM vid %d%s\n",
-		   event == SWITCHDEV_FDB_ADD_TO_DEVICE ? "Adding" : "Deleting",
-		   orig_dev->name, fdb_info->addr, fdb_info->vid,
-		   host_addr ? " as host address" : "");
-
-	INIT_WORK(&switchdev_work->work, dsa_slave_switchdev_event_work);
-	switchdev_work->event = event;
-	switchdev_work->dev = dev;
-	switchdev_work->orig_dev = orig_dev;
-
-	ether_addr_copy(switchdev_work->addr, fdb_info->addr);
-	switchdev_work->vid = fdb_info->vid;
-	switchdev_work->host_addr = host_addr;
-
-	dsa_schedule_work(&switchdev_work->work);
-
-	return 0;
-}
-
-/* Called under rcu_read_lock() */
-static int dsa_slave_switchdev_event(struct notifier_block *unused,
-				     unsigned long event, void *ptr)
-{
-	struct net_device *dev = switchdev_notifier_info_to_dev(ptr);
-	int err;
-
-	switch (event) {
-	case SWITCHDEV_PORT_ATTR_SET:
-		err = switchdev_handle_port_attr_set(dev, ptr,
-						     dsa_slave_dev_check,
-						     dsa_slave_port_attr_set);
-		return notifier_from_errno(err);
-	case SWITCHDEV_FDB_ADD_TO_DEVICE:
-	case SWITCHDEV_FDB_DEL_TO_DEVICE:
-		err = switchdev_handle_fdb_event_to_device(dev, event, ptr,
-							   dsa_slave_dev_check,
-							   dsa_foreign_dev_check,
-							   dsa_slave_fdb_event);
-		return notifier_from_errno(err);
-	default:
-		return NOTIFY_DONE;
-	}
-
-	return NOTIFY_OK;
-}
-
-static int dsa_slave_switchdev_blocking_event(struct notifier_block *unused,
-					      unsigned long event, void *ptr)
-{
-	struct net_device *dev = switchdev_notifier_info_to_dev(ptr);
-	int err;
-
-	switch (event) {
-	case SWITCHDEV_PORT_OBJ_ADD:
-		err = switchdev_handle_port_obj_add_foreign(dev, ptr,
-							    dsa_slave_dev_check,
-							    dsa_foreign_dev_check,
-							    dsa_slave_port_obj_add);
-		return notifier_from_errno(err);
-	case SWITCHDEV_PORT_OBJ_DEL:
-		err = switchdev_handle_port_obj_del_foreign(dev, ptr,
-							    dsa_slave_dev_check,
-							    dsa_foreign_dev_check,
-							    dsa_slave_port_obj_del);
-		return notifier_from_errno(err);
-	case SWITCHDEV_PORT_ATTR_SET:
-		err = switchdev_handle_port_attr_set(dev, ptr,
-						     dsa_slave_dev_check,
-						     dsa_slave_port_attr_set);
-		return notifier_from_errno(err);
-	}
-
-	return NOTIFY_DONE;
-}
-
-static struct notifier_block dsa_slave_nb __read_mostly = {
-	.notifier_call  = dsa_slave_netdevice_event,
-};
-
-struct notifier_block dsa_slave_switchdev_notifier = {
-	.notifier_call = dsa_slave_switchdev_event,
-};
-
-struct notifier_block dsa_slave_switchdev_blocking_notifier = {
-	.notifier_call = dsa_slave_switchdev_blocking_event,
-};
-
-int dsa_slave_register_notifier(void)
-{
-	struct notifier_block *nb;
-	int err;
-
-	err = register_netdevice_notifier(&dsa_slave_nb);
-	if (err)
-		return err;
-
-	err = register_switchdev_notifier(&dsa_slave_switchdev_notifier);
-	if (err)
-		goto err_switchdev_nb;
-
-	nb = &dsa_slave_switchdev_blocking_notifier;
-	err = register_switchdev_blocking_notifier(nb);
-	if (err)
-		goto err_switchdev_blocking_nb;
-
-	return 0;
-
-err_switchdev_blocking_nb:
-	unregister_switchdev_notifier(&dsa_slave_switchdev_notifier);
-err_switchdev_nb:
-	unregister_netdevice_notifier(&dsa_slave_nb);
-	return err;
-}
-
-void dsa_slave_unregister_notifier(void)
-{
-	struct notifier_block *nb;
-	int err;
-
-	nb = &dsa_slave_switchdev_blocking_notifier;
-	err = unregister_switchdev_blocking_notifier(nb);
-	if (err)
-		pr_err("DSA: failed to unregister switchdev blocking notifier (%d)\n", err);
-
-	err = unregister_switchdev_notifier(&dsa_slave_switchdev_notifier);
-	if (err)
-		pr_err("DSA: failed to unregister switchdev notifier (%d)\n", err);
-
-	err = unregister_netdevice_notifier(&dsa_slave_nb);
-	if (err)
-		pr_err("DSA: failed to unregister slave notifier (%d)\n", err);
-}
diff --git a/net/dsa/slave.h b/net/dsa/slave.h
deleted file mode 100644
index d0abe609e00d..000000000000
--- a/net/dsa/slave.h
+++ /dev/null
@@ -1,69 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-or-later */
-
-#ifndef __DSA_SLAVE_H
-#define __DSA_SLAVE_H
-
-#include <linux/if_bridge.h>
-#include <linux/if_vlan.h>
-#include <linux/list.h>
-#include <linux/netpoll.h>
-#include <linux/types.h>
-#include <net/dsa.h>
-#include <net/gro_cells.h>
-
-struct net_device;
-struct netlink_ext_ack;
-
-extern struct notifier_block dsa_slave_switchdev_notifier;
-extern struct notifier_block dsa_slave_switchdev_blocking_notifier;
-
-struct dsa_slave_priv {
-	/* Copy of CPU port xmit for faster access in slave transmit hot path */
-	struct sk_buff *	(*xmit)(struct sk_buff *skb,
-					struct net_device *dev);
-
-	struct gro_cells	gcells;
-
-	/* DSA port data, such as switch, port index, etc. */
-	struct dsa_port		*dp;
-
-#ifdef CONFIG_NET_POLL_CONTROLLER
-	struct netpoll		*netpoll;
-#endif
-
-	/* TC context */
-	struct list_head	mall_tc_list;
-};
-
-void dsa_slave_mii_bus_init(struct dsa_switch *ds);
-int dsa_slave_create(struct dsa_port *dp);
-void dsa_slave_destroy(struct net_device *slave_dev);
-int dsa_slave_suspend(struct net_device *slave_dev);
-int dsa_slave_resume(struct net_device *slave_dev);
-int dsa_slave_register_notifier(void);
-void dsa_slave_unregister_notifier(void);
-void dsa_slave_sync_ha(struct net_device *dev);
-void dsa_slave_unsync_ha(struct net_device *dev);
-void dsa_slave_setup_tagger(struct net_device *slave);
-int dsa_slave_change_mtu(struct net_device *dev, int new_mtu);
-int dsa_slave_change_master(struct net_device *dev, struct net_device *master,
-			    struct netlink_ext_ack *extack);
-int dsa_slave_manage_vlan_filtering(struct net_device *dev,
-				    bool vlan_filtering);
-
-static inline struct dsa_port *dsa_slave_to_port(const struct net_device *dev)
-{
-	struct dsa_slave_priv *p = netdev_priv(dev);
-
-	return p->dp;
-}
-
-static inline struct net_device *
-dsa_slave_to_master(const struct net_device *dev)
-{
-	struct dsa_port *dp = dsa_slave_to_port(dev);
-
-	return dsa_port_to_master(dp);
-}
-
-#endif
diff --git a/net/dsa/switch.c b/net/dsa/switch.c
index 1a42f9317334..3d2feeea897b 100644
--- a/net/dsa/switch.c
+++ b/net/dsa/switch.c
@@ -15,10 +15,10 @@
 #include "dsa.h"
 #include "netlink.h"
 #include "port.h"
-#include "slave.h"
 #include "switch.h"
 #include "tag_8021q.h"
 #include "trace.h"
+#include "user.h"
 
 static unsigned int dsa_switch_fastest_ageing_time(struct dsa_switch *ds,
 						   unsigned int ageing_time)
@@ -894,12 +894,12 @@ static int dsa_switch_change_tag_proto(struct dsa_switch *ds,
 	 * bits that depend on the tagger, such as the MTU.
 	 */
 	dsa_switch_for_each_user_port(dp, ds) {
-		struct net_device *slave = dp->slave;
+		struct net_device *user = dp->user;
 
-		dsa_slave_setup_tagger(slave);
+		dsa_user_setup_tagger(user);
 
 		/* rtnl_mutex is held in dsa_tree_change_tag_proto */
-		dsa_slave_change_mtu(slave, slave->mtu);
+		dsa_user_change_mtu(user, user->mtu);
 	}
 
 	return 0;
@@ -960,13 +960,13 @@ dsa_switch_disconnect_tag_proto(struct dsa_switch *ds,
 }
 
 static int
-dsa_switch_master_state_change(struct dsa_switch *ds,
-			       struct dsa_notifier_master_state_info *info)
+dsa_switch_conduit_state_change(struct dsa_switch *ds,
+				struct dsa_notifier_conduit_state_info *info)
 {
-	if (!ds->ops->master_state_change)
+	if (!ds->ops->conduit_state_change)
 		return 0;
 
-	ds->ops->master_state_change(ds, info->master, info->operational);
+	ds->ops->conduit_state_change(ds, info->conduit, info->operational);
 
 	return 0;
 }
@@ -1056,8 +1056,8 @@ static int dsa_switch_event(struct notifier_block *nb,
 	case DSA_NOTIFIER_TAG_8021Q_VLAN_DEL:
 		err = dsa_switch_tag_8021q_vlan_del(ds, info);
 		break;
-	case DSA_NOTIFIER_MASTER_STATE_CHANGE:
-		err = dsa_switch_master_state_change(ds, info);
+	case DSA_NOTIFIER_CONDUIT_STATE_CHANGE:
+		err = dsa_switch_conduit_state_change(ds, info);
 		break;
 	default:
 		err = -EOPNOTSUPP;
diff --git a/net/dsa/switch.h b/net/dsa/switch.h
index ea034677da15..be0a2749cd97 100644
--- a/net/dsa/switch.h
+++ b/net/dsa/switch.h
@@ -34,7 +34,7 @@ enum {
 	DSA_NOTIFIER_TAG_PROTO_DISCONNECT,
 	DSA_NOTIFIER_TAG_8021Q_VLAN_ADD,
 	DSA_NOTIFIER_TAG_8021Q_VLAN_DEL,
-	DSA_NOTIFIER_MASTER_STATE_CHANGE,
+	DSA_NOTIFIER_CONDUIT_STATE_CHANGE,
 };
 
 /* DSA_NOTIFIER_AGEING_TIME */
@@ -105,9 +105,9 @@ struct dsa_notifier_tag_8021q_vlan_info {
 	u16 vid;
 };
 
-/* DSA_NOTIFIER_MASTER_STATE_CHANGE */
-struct dsa_notifier_master_state_info {
-	const struct net_device *master;
+/* DSA_NOTIFIER_CONDUIT_STATE_CHANGE */
+struct dsa_notifier_conduit_state_info {
+	const struct net_device *conduit;
 	bool operational;
 };
 
diff --git a/net/dsa/tag.c b/net/dsa/tag.c
index 5105a5ff58fa..6e402d49afd3 100644
--- a/net/dsa/tag.c
+++ b/net/dsa/tag.c
@@ -13,8 +13,8 @@
 #include <net/dsa.h>
 #include <net/dst_metadata.h>
 
-#include "slave.h"
 #include "tag.h"
+#include "user.h"
 
 static LIST_HEAD(dsa_tag_drivers_list);
 static DEFINE_MUTEX(dsa_tag_drivers_lock);
@@ -27,7 +27,7 @@ static DEFINE_MUTEX(dsa_tag_drivers_lock);
  * switch, the DSA driver owning the interface to which the packet is
  * delivered is never notified unless we do so here.
  */
-static bool dsa_skb_defer_rx_timestamp(struct dsa_slave_priv *p,
+static bool dsa_skb_defer_rx_timestamp(struct dsa_user_priv *p,
 				       struct sk_buff *skb)
 {
 	struct dsa_switch *ds = p->dp->ds;
@@ -57,7 +57,7 @@ static int dsa_switch_rcv(struct sk_buff *skb, struct net_device *dev,
 	struct metadata_dst *md_dst = skb_metadata_dst(skb);
 	struct dsa_port *cpu_dp = dev->dsa_ptr;
 	struct sk_buff *nskb = NULL;
-	struct dsa_slave_priv *p;
+	struct dsa_user_priv *p;
 
 	if (unlikely(!cpu_dp)) {
 		kfree_skb(skb);
@@ -75,7 +75,7 @@ static int dsa_switch_rcv(struct sk_buff *skb, struct net_device *dev,
 		if (!skb_has_extensions(skb))
 			skb->slow_gro = 0;
 
-		skb->dev = dsa_master_find_slave(dev, 0, port);
+		skb->dev = dsa_conduit_find_user(dev, 0, port);
 		if (likely(skb->dev)) {
 			dsa_default_offload_fwd_mark(skb);
 			nskb = skb;
@@ -94,7 +94,7 @@ static int dsa_switch_rcv(struct sk_buff *skb, struct net_device *dev,
 	skb->pkt_type = PACKET_HOST;
 	skb->protocol = eth_type_trans(skb, skb->dev);
 
-	if (unlikely(!dsa_slave_dev_check(skb->dev))) {
+	if (unlikely(!dsa_user_dev_check(skb->dev))) {
 		/* Packet is to be injected directly on an upper
 		 * device, e.g. a team/bond, so skip all DSA-port
 		 * specific actions.
diff --git a/net/dsa/tag.h b/net/dsa/tag.h
index 32d12f4a9d73..f6b9c73718df 100644
--- a/net/dsa/tag.h
+++ b/net/dsa/tag.h
@@ -9,7 +9,7 @@
 #include <net/dsa.h>
 
 #include "port.h"
-#include "slave.h"
+#include "user.h"
 
 struct dsa_tag_driver {
 	const struct dsa_device_ops *ops;
@@ -29,7 +29,7 @@ static inline int dsa_tag_protocol_overhead(const struct dsa_device_ops *ops)
 	return ops->needed_headroom + ops->needed_tailroom;
 }
 
-static inline struct net_device *dsa_master_find_slave(struct net_device *dev,
+static inline struct net_device *dsa_conduit_find_user(struct net_device *dev,
 						       int device, int port)
 {
 	struct dsa_port *cpu_dp = dev->dsa_ptr;
@@ -39,7 +39,7 @@ static inline struct net_device *dsa_master_find_slave(struct net_device *dev,
 	list_for_each_entry(dp, &dst->ports, list)
 		if (dp->ds->index == device && dp->index == port &&
 		    dp->type == DSA_PORT_TYPE_USER)
-			return dp->slave;
+			return dp->user;
 
 	return NULL;
 }
@@ -49,7 +49,7 @@ static inline struct net_device *dsa_master_find_slave(struct net_device *dev,
  */
 static inline struct sk_buff *dsa_untag_bridge_pvid(struct sk_buff *skb)
 {
-	struct dsa_port *dp = dsa_slave_to_port(skb->dev);
+	struct dsa_port *dp = dsa_user_to_port(skb->dev);
 	struct net_device *br = dsa_port_bridge_dev_get(dp);
 	struct net_device *dev = skb->dev;
 	struct net_device *upper_dev;
@@ -107,12 +107,12 @@ static inline struct sk_buff *dsa_untag_bridge_pvid(struct sk_buff *skb)
  * to support termination through the bridge.
  */
 static inline struct net_device *
-dsa_find_designated_bridge_port_by_vid(struct net_device *master, u16 vid)
+dsa_find_designated_bridge_port_by_vid(struct net_device *conduit, u16 vid)
 {
-	struct dsa_port *cpu_dp = master->dsa_ptr;
+	struct dsa_port *cpu_dp = conduit->dsa_ptr;
 	struct dsa_switch_tree *dst = cpu_dp->dst;
 	struct bridge_vlan_info vinfo;
-	struct net_device *slave;
+	struct net_device *user;
 	struct dsa_port *dp;
 	int err;
 
@@ -134,13 +134,13 @@ dsa_find_designated_bridge_port_by_vid(struct net_device *master, u16 vid)
 		if (dp->cpu_dp != cpu_dp)
 			continue;
 
-		slave = dp->slave;
+		user = dp->user;
 
-		err = br_vlan_get_info_rcu(slave, vid, &vinfo);
+		err = br_vlan_get_info_rcu(user, vid, &vinfo);
 		if (err)
 			continue;
 
-		return slave;
+		return user;
 	}
 
 	return NULL;
@@ -155,7 +155,7 @@ dsa_find_designated_bridge_port_by_vid(struct net_device *master, u16 vid)
  */
 static inline void dsa_default_offload_fwd_mark(struct sk_buff *skb)
 {
-	struct dsa_port *dp = dsa_slave_to_port(skb->dev);
+	struct dsa_port *dp = dsa_user_to_port(skb->dev);
 
 	skb->offload_fwd_mark = !!(dp->bridge);
 }
@@ -215,9 +215,9 @@ static inline void dsa_alloc_etype_header(struct sk_buff *skb, int len)
 	memmove(skb->data, skb->data + len, 2 * ETH_ALEN);
 }
 
-/* On RX, eth_type_trans() on the DSA master pulls ETH_HLEN bytes starting from
+/* On RX, eth_type_trans() on the DSA conduit pulls ETH_HLEN bytes starting from
  * skb_mac_header(skb), which leaves skb->data pointing at the first byte after
- * what the DSA master perceives as the EtherType (the beginning of the L3
+ * what the DSA conduit perceives as the EtherType (the beginning of the L3
  * protocol). Since DSA EtherType header taggers treat the EtherType as part of
  * the DSA tag itself, and the EtherType is 2 bytes in length, the DSA header
  * is located 2 bytes behind skb->data. Note that EtherType in this context
diff --git a/net/dsa/tag_8021q.c b/net/dsa/tag_8021q.c
index cbdfc392f7e0..71b26ae6db39 100644
--- a/net/dsa/tag_8021q.c
+++ b/net/dsa/tag_8021q.c
@@ -73,7 +73,7 @@ struct dsa_tag_8021q_vlan {
 struct dsa_8021q_context {
 	struct dsa_switch *ds;
 	struct list_head vlans;
-	/* EtherType of RX VID, used for filtering on master interface */
+	/* EtherType of RX VID, used for filtering on conduit interface */
 	__be16 proto;
 };
 
@@ -338,7 +338,7 @@ static int dsa_tag_8021q_port_setup(struct dsa_switch *ds, int port)
 	struct dsa_8021q_context *ctx = ds->tag_8021q_ctx;
 	struct dsa_port *dp = dsa_to_port(ds, port);
 	u16 vid = dsa_tag_8021q_standalone_vid(dp);
-	struct net_device *master;
+	struct net_device *conduit;
 	int err;
 
 	/* The CPU port is implicitly configured by
@@ -347,7 +347,7 @@ static int dsa_tag_8021q_port_setup(struct dsa_switch *ds, int port)
 	if (!dsa_port_is_user(dp))
 		return 0;
 
-	master = dsa_port_to_master(dp);
+	conduit = dsa_port_to_conduit(dp);
 
 	err = dsa_port_tag_8021q_vlan_add(dp, vid, false);
 	if (err) {
@@ -357,8 +357,8 @@ static int dsa_tag_8021q_port_setup(struct dsa_switch *ds, int port)
 		return err;
 	}
 
-	/* Add the VLAN to the master's RX filter. */
-	vlan_vid_add(master, ctx->proto, vid);
+	/* Add the VLAN to the conduit's RX filter. */
+	vlan_vid_add(conduit, ctx->proto, vid);
 
 	return err;
 }
@@ -368,7 +368,7 @@ static void dsa_tag_8021q_port_teardown(struct dsa_switch *ds, int port)
 	struct dsa_8021q_context *ctx = ds->tag_8021q_ctx;
 	struct dsa_port *dp = dsa_to_port(ds, port);
 	u16 vid = dsa_tag_8021q_standalone_vid(dp);
-	struct net_device *master;
+	struct net_device *conduit;
 
 	/* The CPU port is implicitly configured by
 	 * configuring the front-panel ports
@@ -376,11 +376,11 @@ static void dsa_tag_8021q_port_teardown(struct dsa_switch *ds, int port)
 	if (!dsa_port_is_user(dp))
 		return;
 
-	master = dsa_port_to_master(dp);
+	conduit = dsa_port_to_conduit(dp);
 
 	dsa_port_tag_8021q_vlan_del(dp, vid, false);
 
-	vlan_vid_del(master, ctx->proto, vid);
+	vlan_vid_del(conduit, ctx->proto, vid);
 }
 
 static int dsa_tag_8021q_setup(struct dsa_switch *ds)
@@ -468,10 +468,10 @@ struct sk_buff *dsa_8021q_xmit(struct sk_buff *skb, struct net_device *netdev,
 }
 EXPORT_SYMBOL_GPL(dsa_8021q_xmit);
 
-struct net_device *dsa_tag_8021q_find_port_by_vbid(struct net_device *master,
+struct net_device *dsa_tag_8021q_find_port_by_vbid(struct net_device *conduit,
 						   int vbid)
 {
-	struct dsa_port *cpu_dp = master->dsa_ptr;
+	struct dsa_port *cpu_dp = conduit->dsa_ptr;
 	struct dsa_switch_tree *dst = cpu_dp->dst;
 	struct dsa_port *dp;
 
@@ -490,7 +490,7 @@ struct net_device *dsa_tag_8021q_find_port_by_vbid(struct net_device *master,
 			continue;
 
 		if (dsa_port_bridge_num_get(dp) == vbid)
-			return dp->slave;
+			return dp->user;
 	}
 
 	return NULL;
diff --git a/net/dsa/tag_8021q.h b/net/dsa/tag_8021q.h
index b75cbaa028ef..41f7167ac520 100644
--- a/net/dsa/tag_8021q.h
+++ b/net/dsa/tag_8021q.h
@@ -16,7 +16,7 @@ struct sk_buff *dsa_8021q_xmit(struct sk_buff *skb, struct net_device *netdev,
 void dsa_8021q_rcv(struct sk_buff *skb, int *source_port, int *switch_id,
 		   int *vbid);
 
-struct net_device *dsa_tag_8021q_find_port_by_vbid(struct net_device *master,
+struct net_device *dsa_tag_8021q_find_port_by_vbid(struct net_device *conduit,
 						   int vbid);
 
 int dsa_switch_tag_8021q_vlan_add(struct dsa_switch *ds,
diff --git a/net/dsa/tag_ar9331.c b/net/dsa/tag_ar9331.c
index 7f3b7d730b85..92ce67b93a58 100644
--- a/net/dsa/tag_ar9331.c
+++ b/net/dsa/tag_ar9331.c
@@ -29,7 +29,7 @@
 static struct sk_buff *ar9331_tag_xmit(struct sk_buff *skb,
 				       struct net_device *dev)
 {
-	struct dsa_port *dp = dsa_slave_to_port(dev);
+	struct dsa_port *dp = dsa_user_to_port(dev);
 	__le16 *phdr;
 	u16 hdr;
 
@@ -74,7 +74,7 @@ static struct sk_buff *ar9331_tag_rcv(struct sk_buff *skb,
 	/* Get source port information */
 	port = FIELD_GET(AR9331_HDR_PORT_NUM_MASK, hdr);
 
-	skb->dev = dsa_master_find_slave(ndev, 0, port);
+	skb->dev = dsa_conduit_find_user(ndev, 0, port);
 	if (!skb->dev)
 		return NULL;
 
diff --git a/net/dsa/tag_brcm.c b/net/dsa/tag_brcm.c
index cacdafb41200..83d283a5d27e 100644
--- a/net/dsa/tag_brcm.c
+++ b/net/dsa/tag_brcm.c
@@ -85,7 +85,7 @@ static struct sk_buff *brcm_tag_xmit_ll(struct sk_buff *skb,
 					struct net_device *dev,
 					unsigned int offset)
 {
-	struct dsa_port *dp = dsa_slave_to_port(dev);
+	struct dsa_port *dp = dsa_user_to_port(dev);
 	u16 queue = skb_get_queue_mapping(skb);
 	u8 *brcm_tag;
 
@@ -96,7 +96,7 @@ static struct sk_buff *brcm_tag_xmit_ll(struct sk_buff *skb,
 	 * (including FCS and tag) because the length verification is done after
 	 * the Broadcom tag is stripped off the ingress packet.
 	 *
-	 * Let dsa_slave_xmit() free the SKB
+	 * Let dsa_user_xmit() free the SKB
 	 */
 	if (__skb_put_padto(skb, ETH_ZLEN + BRCM_TAG_LEN, false))
 		return NULL;
@@ -119,7 +119,7 @@ static struct sk_buff *brcm_tag_xmit_ll(struct sk_buff *skb,
 		brcm_tag[2] = BRCM_IG_DSTMAP2_MASK;
 	brcm_tag[3] = (1 << dp->index) & BRCM_IG_DSTMAP1_MASK;
 
-	/* Now tell the master network device about the desired output queue
+	/* Now tell the conduit network device about the desired output queue
 	 * as well
 	 */
 	skb_set_queue_mapping(skb, BRCM_TAG_SET_PORT_QUEUE(dp->index, queue));
@@ -164,7 +164,7 @@ static struct sk_buff *brcm_tag_rcv_ll(struct sk_buff *skb,
 	/* Locate which port this is coming from */
 	source_port = brcm_tag[3] & BRCM_EG_PID_MASK;
 
-	skb->dev = dsa_master_find_slave(dev, 0, source_port);
+	skb->dev = dsa_conduit_find_user(dev, 0, source_port);
 	if (!skb->dev)
 		return NULL;
 
@@ -216,7 +216,7 @@ MODULE_ALIAS_DSA_TAG_DRIVER(DSA_TAG_PROTO_BRCM, BRCM_NAME);
 static struct sk_buff *brcm_leg_tag_xmit(struct sk_buff *skb,
 					 struct net_device *dev)
 {
-	struct dsa_port *dp = dsa_slave_to_port(dev);
+	struct dsa_port *dp = dsa_user_to_port(dev);
 	u8 *brcm_tag;
 
 	/* The Ethernet switch we are interfaced with needs packets to be at
@@ -226,7 +226,7 @@ static struct sk_buff *brcm_leg_tag_xmit(struct sk_buff *skb,
 	 * (including FCS and tag) because the length verification is done after
 	 * the Broadcom tag is stripped off the ingress packet.
 	 *
-	 * Let dsa_slave_xmit() free the SKB
+	 * Let dsa_user_xmit() free the SKB
 	 */
 	if (__skb_put_padto(skb, ETH_ZLEN + BRCM_LEG_TAG_LEN, false))
 		return NULL;
@@ -264,7 +264,7 @@ static struct sk_buff *brcm_leg_tag_rcv(struct sk_buff *skb,
 
 	source_port = brcm_tag[5] & BRCM_LEG_PORT_ID;
 
-	skb->dev = dsa_master_find_slave(dev, 0, source_port);
+	skb->dev = dsa_conduit_find_user(dev, 0, source_port);
 	if (!skb->dev)
 		return NULL;
 
diff --git a/net/dsa/tag_dsa.c b/net/dsa/tag_dsa.c
index 1fd7fa26db64..8ed52dd663ab 100644
--- a/net/dsa/tag_dsa.c
+++ b/net/dsa/tag_dsa.c
@@ -129,7 +129,7 @@ enum dsa_code {
 static struct sk_buff *dsa_xmit_ll(struct sk_buff *skb, struct net_device *dev,
 				   u8 extra)
 {
-	struct dsa_port *dp = dsa_slave_to_port(dev);
+	struct dsa_port *dp = dsa_user_to_port(dev);
 	struct net_device *br_dev;
 	u8 tag_dev, tag_port;
 	enum dsa_cmd cmd;
@@ -267,14 +267,14 @@ static struct sk_buff *dsa_rcv_ll(struct sk_buff *skb, struct net_device *dev,
 		lag = dsa_lag_by_id(cpu_dp->dst, source_port + 1);
 		skb->dev = lag ? lag->dev : NULL;
 	} else {
-		skb->dev = dsa_master_find_slave(dev, source_device,
+		skb->dev = dsa_conduit_find_user(dev, source_device,
 						 source_port);
 	}
 
 	if (!skb->dev)
 		return NULL;
 
-	/* When using LAG offload, skb->dev is not a DSA slave interface,
+	/* When using LAG offload, skb->dev is not a DSA user interface,
 	 * so we cannot call dsa_default_offload_fwd_mark and we need to
 	 * special-case it.
 	 */
diff --git a/net/dsa/tag_gswip.c b/net/dsa/tag_gswip.c
index e279cd9057b0..3539141b5350 100644
--- a/net/dsa/tag_gswip.c
+++ b/net/dsa/tag_gswip.c
@@ -61,7 +61,7 @@
 static struct sk_buff *gswip_tag_xmit(struct sk_buff *skb,
 				      struct net_device *dev)
 {
-	struct dsa_port *dp = dsa_slave_to_port(dev);
+	struct dsa_port *dp = dsa_user_to_port(dev);
 	u8 *gswip_tag;
 
 	skb_push(skb, GSWIP_TX_HEADER_LEN);
@@ -89,7 +89,7 @@ static struct sk_buff *gswip_tag_rcv(struct sk_buff *skb,
 
 	/* Get source port information */
 	port = (gswip_tag[7] & GSWIP_RX_SPPID_MASK) >> GSWIP_RX_SPPID_SHIFT;
-	skb->dev = dsa_master_find_slave(dev, 0, port);
+	skb->dev = dsa_conduit_find_user(dev, 0, port);
 	if (!skb->dev)
 		return NULL;
 
diff --git a/net/dsa/tag_hellcreek.c b/net/dsa/tag_hellcreek.c
index 03a1fb9c87a9..6e233cd0aa38 100644
--- a/net/dsa/tag_hellcreek.c
+++ b/net/dsa/tag_hellcreek.c
@@ -20,7 +20,7 @@
 static struct sk_buff *hellcreek_xmit(struct sk_buff *skb,
 				      struct net_device *dev)
 {
-	struct dsa_port *dp = dsa_slave_to_port(dev);
+	struct dsa_port *dp = dsa_user_to_port(dev);
 	u8 *tag;
 
 	/* Calculate checksums (if required) before adding the trailer tag to
@@ -45,7 +45,7 @@ static struct sk_buff *hellcreek_rcv(struct sk_buff *skb,
 	u8 *tag = skb_tail_pointer(skb) - HELLCREEK_TAG_LEN;
 	unsigned int port = tag[0] & 0x03;
 
-	skb->dev = dsa_master_find_slave(dev, 0, port);
+	skb->dev = dsa_conduit_find_user(dev, 0, port);
 	if (!skb->dev) {
 		netdev_warn_once(dev, "Failed to get source port: %d\n", port);
 		return NULL;
diff --git a/net/dsa/tag_ksz.c b/net/dsa/tag_ksz.c
index 3632e47dea9e..9be341fa88f0 100644
--- a/net/dsa/tag_ksz.c
+++ b/net/dsa/tag_ksz.c
@@ -87,7 +87,7 @@ static struct sk_buff *ksz_common_rcv(struct sk_buff *skb,
 				      struct net_device *dev,
 				      unsigned int port, unsigned int len)
 {
-	skb->dev = dsa_master_find_slave(dev, 0, port);
+	skb->dev = dsa_conduit_find_user(dev, 0, port);
 	if (!skb->dev)
 		return NULL;
 
@@ -119,7 +119,7 @@ static struct sk_buff *ksz_common_rcv(struct sk_buff *skb,
 
 static struct sk_buff *ksz8795_xmit(struct sk_buff *skb, struct net_device *dev)
 {
-	struct dsa_port *dp = dsa_slave_to_port(dev);
+	struct dsa_port *dp = dsa_user_to_port(dev);
 	struct ethhdr *hdr;
 	u8 *tag;
 
@@ -256,7 +256,7 @@ static struct sk_buff *ksz_defer_xmit(struct dsa_port *dp, struct sk_buff *skb)
 		return NULL;
 
 	kthread_init_work(&xmit_work->work, xmit_work_fn);
-	/* Increase refcount so the kfree_skb in dsa_slave_xmit
+	/* Increase refcount so the kfree_skb in dsa_user_xmit
 	 * won't really free the packet.
 	 */
 	xmit_work->dp = dp;
@@ -272,7 +272,7 @@ static struct sk_buff *ksz9477_xmit(struct sk_buff *skb,
 {
 	u16 queue_mapping = skb_get_queue_mapping(skb);
 	u8 prio = netdev_txq_to_tc(dev, queue_mapping);
-	struct dsa_port *dp = dsa_slave_to_port(dev);
+	struct dsa_port *dp = dsa_user_to_port(dev);
 	struct ethhdr *hdr;
 	__be16 *tag;
 	u16 val;
@@ -344,7 +344,7 @@ static struct sk_buff *ksz9893_xmit(struct sk_buff *skb,
 {
 	u16 queue_mapping = skb_get_queue_mapping(skb);
 	u8 prio = netdev_txq_to_tc(dev, queue_mapping);
-	struct dsa_port *dp = dsa_slave_to_port(dev);
+	struct dsa_port *dp = dsa_user_to_port(dev);
 	struct ethhdr *hdr;
 	u8 *tag;
 
@@ -410,7 +410,7 @@ static struct sk_buff *lan937x_xmit(struct sk_buff *skb,
 {
 	u16 queue_mapping = skb_get_queue_mapping(skb);
 	u8 prio = netdev_txq_to_tc(dev, queue_mapping);
-	struct dsa_port *dp = dsa_slave_to_port(dev);
+	struct dsa_port *dp = dsa_user_to_port(dev);
 	const struct ethhdr *hdr = eth_hdr(skb);
 	__be16 *tag;
 	u16 val;
diff --git a/net/dsa/tag_lan9303.c b/net/dsa/tag_lan9303.c
index c25f5536706b..1ed8ee24855d 100644
--- a/net/dsa/tag_lan9303.c
+++ b/net/dsa/tag_lan9303.c
@@ -56,7 +56,7 @@ static int lan9303_xmit_use_arl(struct dsa_port *dp, u8 *dest_addr)
 
 static struct sk_buff *lan9303_xmit(struct sk_buff *skb, struct net_device *dev)
 {
-	struct dsa_port *dp = dsa_slave_to_port(dev);
+	struct dsa_port *dp = dsa_user_to_port(dev);
 	__be16 *lan9303_tag;
 	u16 tag;
 
@@ -99,7 +99,7 @@ static struct sk_buff *lan9303_rcv(struct sk_buff *skb, struct net_device *dev)
 
 	source_port = lan9303_tag1 & 0x3;
 
-	skb->dev = dsa_master_find_slave(dev, 0, source_port);
+	skb->dev = dsa_conduit_find_user(dev, 0, source_port);
 	if (!skb->dev) {
 		dev_warn_ratelimited(&dev->dev, "Dropping packet due to invalid source port\n");
 		return NULL;
diff --git a/net/dsa/tag_mtk.c b/net/dsa/tag_mtk.c
index 40af80452747..2483785f6ab1 100644
--- a/net/dsa/tag_mtk.c
+++ b/net/dsa/tag_mtk.c
@@ -23,7 +23,7 @@
 static struct sk_buff *mtk_tag_xmit(struct sk_buff *skb,
 				    struct net_device *dev)
 {
-	struct dsa_port *dp = dsa_slave_to_port(dev);
+	struct dsa_port *dp = dsa_user_to_port(dev);
 	u8 xmit_tpid;
 	u8 *mtk_tag;
 
@@ -85,7 +85,7 @@ static struct sk_buff *mtk_tag_rcv(struct sk_buff *skb, struct net_device *dev)
 	/* Get source port information */
 	port = (hdr & MTK_HDR_RECV_SOURCE_PORT_MASK);
 
-	skb->dev = dsa_master_find_slave(dev, 0, port);
+	skb->dev = dsa_conduit_find_user(dev, 0, port);
 	if (!skb->dev)
 		return NULL;
 
diff --git a/net/dsa/tag_none.c b/net/dsa/tag_none.c
index d2fd179c4227..9a473624db50 100644
--- a/net/dsa/tag_none.c
+++ b/net/dsa/tag_none.c
@@ -12,8 +12,8 @@
 
 #define NONE_NAME	"none"
 
-static struct sk_buff *dsa_slave_notag_xmit(struct sk_buff *skb,
-					    struct net_device *dev)
+static struct sk_buff *dsa_user_notag_xmit(struct sk_buff *skb,
+					   struct net_device *dev)
 {
 	/* Just return the original SKB */
 	return skb;
@@ -22,7 +22,7 @@ static struct sk_buff *dsa_slave_notag_xmit(struct sk_buff *skb,
 static const struct dsa_device_ops none_ops = {
 	.name	= NONE_NAME,
 	.proto	= DSA_TAG_PROTO_NONE,
-	.xmit	= dsa_slave_notag_xmit,
+	.xmit	= dsa_user_notag_xmit,
 };
 
 module_dsa_tag_driver(none_ops);
diff --git a/net/dsa/tag_ocelot.c b/net/dsa/tag_ocelot.c
index 20bf7074d5a6..ef2f8fffb2c7 100644
--- a/net/dsa/tag_ocelot.c
+++ b/net/dsa/tag_ocelot.c
@@ -45,7 +45,7 @@ static void ocelot_xmit_get_vlan_info(struct sk_buff *skb, struct dsa_port *dp,
 static void ocelot_xmit_common(struct sk_buff *skb, struct net_device *netdev,
 			       __be32 ifh_prefix, void **ifh)
 {
-	struct dsa_port *dp = dsa_slave_to_port(netdev);
+	struct dsa_port *dp = dsa_user_to_port(netdev);
 	struct dsa_switch *ds = dp->ds;
 	u64 vlan_tci, tag_type;
 	void *injection;
@@ -79,7 +79,7 @@ static void ocelot_xmit_common(struct sk_buff *skb, struct net_device *netdev,
 static struct sk_buff *ocelot_xmit(struct sk_buff *skb,
 				   struct net_device *netdev)
 {
-	struct dsa_port *dp = dsa_slave_to_port(netdev);
+	struct dsa_port *dp = dsa_user_to_port(netdev);
 	void *injection;
 
 	ocelot_xmit_common(skb, netdev, cpu_to_be32(0x8880000a), &injection);
@@ -91,7 +91,7 @@ static struct sk_buff *ocelot_xmit(struct sk_buff *skb,
 static struct sk_buff *seville_xmit(struct sk_buff *skb,
 				    struct net_device *netdev)
 {
-	struct dsa_port *dp = dsa_slave_to_port(netdev);
+	struct dsa_port *dp = dsa_user_to_port(netdev);
 	void *injection;
 
 	ocelot_xmit_common(skb, netdev, cpu_to_be32(0x88800005), &injection);
@@ -111,12 +111,12 @@ static struct sk_buff *ocelot_rcv(struct sk_buff *skb,
 	u16 vlan_tpid;
 	u64 rew_val;
 
-	/* Revert skb->data by the amount consumed by the DSA master,
+	/* Revert skb->data by the amount consumed by the DSA conduit,
 	 * so it points to the beginning of the frame.
 	 */
 	skb_push(skb, ETH_HLEN);
 	/* We don't care about the short prefix, it is just for easy entrance
-	 * into the DSA master's RX filter. Discard it now by moving it into
+	 * into the DSA conduit's RX filter. Discard it now by moving it into
 	 * the headroom.
 	 */
 	skb_pull(skb, OCELOT_SHORT_PREFIX_LEN);
@@ -141,12 +141,12 @@ static struct sk_buff *ocelot_rcv(struct sk_buff *skb,
 	ocelot_xfh_get_vlan_tci(extraction, &vlan_tci);
 	ocelot_xfh_get_rew_val(extraction, &rew_val);
 
-	skb->dev = dsa_master_find_slave(netdev, 0, src_port);
+	skb->dev = dsa_conduit_find_user(netdev, 0, src_port);
 	if (!skb->dev)
 		/* The switch will reflect back some frames sent through
-		 * sockets opened on the bare DSA master. These will come back
+		 * sockets opened on the bare DSA conduit. These will come back
 		 * with src_port equal to the index of the CPU port, for which
-		 * there is no slave registered. So don't print any error
+		 * there is no user registered. So don't print any error
 		 * message here (ignore and drop those frames).
 		 */
 		return NULL;
@@ -170,7 +170,7 @@ static struct sk_buff *ocelot_rcv(struct sk_buff *skb,
 	 * equal to the pvid of the ingress port and should not be used for
 	 * processing.
 	 */
-	dp = dsa_slave_to_port(skb->dev);
+	dp = dsa_user_to_port(skb->dev);
 	vlan_tpid = tag_type ? ETH_P_8021AD : ETH_P_8021Q;
 
 	if (dsa_port_is_vlan_filtering(dp) &&
@@ -192,7 +192,7 @@ static const struct dsa_device_ops ocelot_netdev_ops = {
 	.xmit			= ocelot_xmit,
 	.rcv			= ocelot_rcv,
 	.needed_headroom	= OCELOT_TOTAL_TAG_LEN,
-	.promisc_on_master	= true,
+	.promisc_on_conduit	= true,
 };
 
 DSA_TAG_DRIVER(ocelot_netdev_ops);
@@ -204,7 +204,7 @@ static const struct dsa_device_ops seville_netdev_ops = {
 	.xmit			= seville_xmit,
 	.rcv			= ocelot_rcv,
 	.needed_headroom	= OCELOT_TOTAL_TAG_LEN,
-	.promisc_on_master	= true,
+	.promisc_on_conduit	= true,
 };
 
 DSA_TAG_DRIVER(seville_netdev_ops);
diff --git a/net/dsa/tag_ocelot_8021q.c b/net/dsa/tag_ocelot_8021q.c
index 1f0b8c20eba5..210039320888 100644
--- a/net/dsa/tag_ocelot_8021q.c
+++ b/net/dsa/tag_ocelot_8021q.c
@@ -37,8 +37,8 @@ static struct sk_buff *ocelot_defer_xmit(struct dsa_port *dp,
 		return NULL;
 
 	/* PTP over IP packets need UDP checksumming. We may have inherited
-	 * NETIF_F_HW_CSUM from the DSA master, but these packets are not sent
-	 * through the DSA master, so calculate the checksum here.
+	 * NETIF_F_HW_CSUM from the DSA conduit, but these packets are not sent
+	 * through the DSA conduit, so calculate the checksum here.
 	 */
 	if (skb->ip_summed == CHECKSUM_PARTIAL && skb_checksum_help(skb))
 		return NULL;
@@ -49,7 +49,7 @@ static struct sk_buff *ocelot_defer_xmit(struct dsa_port *dp,
 
 	/* Calls felix_port_deferred_xmit in felix.c */
 	kthread_init_work(&xmit_work->work, xmit_work_fn);
-	/* Increase refcount so the kfree_skb in dsa_slave_xmit
+	/* Increase refcount so the kfree_skb in dsa_user_xmit
 	 * won't really free the packet.
 	 */
 	xmit_work->dp = dp;
@@ -63,7 +63,7 @@ static struct sk_buff *ocelot_defer_xmit(struct dsa_port *dp,
 static struct sk_buff *ocelot_xmit(struct sk_buff *skb,
 				   struct net_device *netdev)
 {
-	struct dsa_port *dp = dsa_slave_to_port(netdev);
+	struct dsa_port *dp = dsa_user_to_port(netdev);
 	u16 queue_mapping = skb_get_queue_mapping(skb);
 	u8 pcp = netdev_txq_to_tc(netdev, queue_mapping);
 	u16 tx_vid = dsa_tag_8021q_standalone_vid(dp);
@@ -83,7 +83,7 @@ static struct sk_buff *ocelot_rcv(struct sk_buff *skb,
 
 	dsa_8021q_rcv(skb, &src_port, &switch_id, NULL);
 
-	skb->dev = dsa_master_find_slave(netdev, switch_id, src_port);
+	skb->dev = dsa_conduit_find_user(netdev, switch_id, src_port);
 	if (!skb->dev)
 		return NULL;
 
@@ -130,7 +130,7 @@ static const struct dsa_device_ops ocelot_8021q_netdev_ops = {
 	.connect		= ocelot_connect,
 	.disconnect		= ocelot_disconnect,
 	.needed_headroom	= VLAN_HLEN,
-	.promisc_on_master	= true,
+	.promisc_on_conduit	= true,
 };
 
 MODULE_LICENSE("GPL v2");
diff --git a/net/dsa/tag_qca.c b/net/dsa/tag_qca.c
index e5ff7c34e577..6514aa7993ce 100644
--- a/net/dsa/tag_qca.c
+++ b/net/dsa/tag_qca.c
@@ -14,7 +14,7 @@
 
 static struct sk_buff *qca_tag_xmit(struct sk_buff *skb, struct net_device *dev)
 {
-	struct dsa_port *dp = dsa_slave_to_port(dev);
+	struct dsa_port *dp = dsa_user_to_port(dev);
 	__be16 *phdr;
 	u16 hdr;
 
@@ -78,7 +78,7 @@ static struct sk_buff *qca_tag_rcv(struct sk_buff *skb, struct net_device *dev)
 	/* Get source port information */
 	port = FIELD_GET(QCA_HDR_RECV_SOURCE_PORT, hdr);
 
-	skb->dev = dsa_master_find_slave(dev, 0, port);
+	skb->dev = dsa_conduit_find_user(dev, 0, port);
 	if (!skb->dev)
 		return NULL;
 
@@ -116,7 +116,7 @@ static const struct dsa_device_ops qca_netdev_ops = {
 	.xmit	= qca_tag_xmit,
 	.rcv	= qca_tag_rcv,
 	.needed_headroom = QCA_HDR_LEN,
-	.promisc_on_master = true,
+	.promisc_on_conduit = true,
 };
 
 MODULE_LICENSE("GPL");
diff --git a/net/dsa/tag_rtl4_a.c b/net/dsa/tag_rtl4_a.c
index c327314b95e3..4da5bad1a7aa 100644
--- a/net/dsa/tag_rtl4_a.c
+++ b/net/dsa/tag_rtl4_a.c
@@ -36,7 +36,7 @@
 static struct sk_buff *rtl4a_tag_xmit(struct sk_buff *skb,
 				      struct net_device *dev)
 {
-	struct dsa_port *dp = dsa_slave_to_port(dev);
+	struct dsa_port *dp = dsa_user_to_port(dev);
 	__be16 *p;
 	u8 *tag;
 	u16 out;
@@ -97,9 +97,9 @@ static struct sk_buff *rtl4a_tag_rcv(struct sk_buff *skb,
 	}
 	port = protport & 0xff;
 
-	skb->dev = dsa_master_find_slave(dev, 0, port);
+	skb->dev = dsa_conduit_find_user(dev, 0, port);
 	if (!skb->dev) {
-		netdev_dbg(dev, "could not find slave for port %d\n", port);
+		netdev_dbg(dev, "could not find user for port %d\n", port);
 		return NULL;
 	}
 
diff --git a/net/dsa/tag_rtl8_4.c b/net/dsa/tag_rtl8_4.c
index 4f67834fd121..07e857debabf 100644
--- a/net/dsa/tag_rtl8_4.c
+++ b/net/dsa/tag_rtl8_4.c
@@ -103,7 +103,7 @@
 static void rtl8_4_write_tag(struct sk_buff *skb, struct net_device *dev,
 			     void *tag)
 {
-	struct dsa_port *dp = dsa_slave_to_port(dev);
+	struct dsa_port *dp = dsa_user_to_port(dev);
 	__be16 tag16[RTL8_4_TAG_LEN / 2];
 
 	/* Set Realtek EtherType */
@@ -180,10 +180,10 @@ static int rtl8_4_read_tag(struct sk_buff *skb, struct net_device *dev,
 
 	/* Parse TX (switch->CPU) */
 	port = FIELD_GET(RTL8_4_TX, ntohs(tag16[3]));
-	skb->dev = dsa_master_find_slave(dev, 0, port);
+	skb->dev = dsa_conduit_find_user(dev, 0, port);
 	if (!skb->dev) {
 		dev_warn_ratelimited(&dev->dev,
-				     "could not find slave for port %d\n",
+				     "could not find user for port %d\n",
 				     port);
 		return -ENOENT;
 	}
diff --git a/net/dsa/tag_rzn1_a5psw.c b/net/dsa/tag_rzn1_a5psw.c
index 437a6820ac42..2ce866b45615 100644
--- a/net/dsa/tag_rzn1_a5psw.c
+++ b/net/dsa/tag_rzn1_a5psw.c
@@ -39,7 +39,7 @@ struct a5psw_tag {
 
 static struct sk_buff *a5psw_tag_xmit(struct sk_buff *skb, struct net_device *dev)
 {
-	struct dsa_port *dp = dsa_slave_to_port(dev);
+	struct dsa_port *dp = dsa_user_to_port(dev);
 	struct a5psw_tag *ptag;
 	u32 data2_val;
 
@@ -90,7 +90,7 @@ static struct sk_buff *a5psw_tag_rcv(struct sk_buff *skb,
 
 	port = FIELD_GET(A5PSW_CTRL_DATA_PORT, ntohs(tag->ctrl_data));
 
-	skb->dev = dsa_master_find_slave(dev, 0, port);
+	skb->dev = dsa_conduit_find_user(dev, 0, port);
 	if (!skb->dev)
 		return NULL;
 
diff --git a/net/dsa/tag_sja1105.c b/net/dsa/tag_sja1105.c
index ade3eeb2f3e6..1fffe8c2b589 100644
--- a/net/dsa/tag_sja1105.c
+++ b/net/dsa/tag_sja1105.c
@@ -157,7 +157,7 @@ static struct sk_buff *sja1105_defer_xmit(struct dsa_port *dp,
 		return NULL;
 
 	kthread_init_work(&xmit_work->work, xmit_work_fn);
-	/* Increase refcount so the kfree_skb in dsa_slave_xmit
+	/* Increase refcount so the kfree_skb in dsa_user_xmit
 	 * won't really free the packet.
 	 */
 	xmit_work->dp = dp;
@@ -210,7 +210,7 @@ static u16 sja1105_xmit_tpid(struct dsa_port *dp)
 static struct sk_buff *sja1105_imprecise_xmit(struct sk_buff *skb,
 					      struct net_device *netdev)
 {
-	struct dsa_port *dp = dsa_slave_to_port(netdev);
+	struct dsa_port *dp = dsa_user_to_port(netdev);
 	unsigned int bridge_num = dsa_port_bridge_num_get(dp);
 	struct net_device *br = dsa_port_bridge_dev_get(dp);
 	u16 tx_vid;
@@ -235,7 +235,7 @@ static struct sk_buff *sja1105_imprecise_xmit(struct sk_buff *skb,
 
 /* Transform untagged control packets into pvid-tagged control packets so that
  * all packets sent by this tagger are VLAN-tagged and we can configure the
- * switch to drop untagged packets coming from the DSA master.
+ * switch to drop untagged packets coming from the DSA conduit.
  */
 static struct sk_buff *sja1105_pvid_tag_control_pkt(struct dsa_port *dp,
 						    struct sk_buff *skb, u8 pcp)
@@ -266,7 +266,7 @@ static struct sk_buff *sja1105_pvid_tag_control_pkt(struct dsa_port *dp,
 static struct sk_buff *sja1105_xmit(struct sk_buff *skb,
 				    struct net_device *netdev)
 {
-	struct dsa_port *dp = dsa_slave_to_port(netdev);
+	struct dsa_port *dp = dsa_user_to_port(netdev);
 	u16 queue_mapping = skb_get_queue_mapping(skb);
 	u8 pcp = netdev_txq_to_tc(netdev, queue_mapping);
 	u16 tx_vid = dsa_tag_8021q_standalone_vid(dp);
@@ -294,7 +294,7 @@ static struct sk_buff *sja1110_xmit(struct sk_buff *skb,
 				    struct net_device *netdev)
 {
 	struct sk_buff *clone = SJA1105_SKB_CB(skb)->clone;
-	struct dsa_port *dp = dsa_slave_to_port(netdev);
+	struct dsa_port *dp = dsa_user_to_port(netdev);
 	u16 queue_mapping = skb_get_queue_mapping(skb);
 	u8 pcp = netdev_txq_to_tc(netdev, queue_mapping);
 	u16 tx_vid = dsa_tag_8021q_standalone_vid(dp);
@@ -383,7 +383,7 @@ static struct sk_buff
 	 * Buffer it until we get its meta frame.
 	 */
 	if (is_link_local) {
-		struct dsa_port *dp = dsa_slave_to_port(skb->dev);
+		struct dsa_port *dp = dsa_user_to_port(skb->dev);
 		struct sja1105_tagger_private *priv;
 		struct dsa_switch *ds = dp->ds;
 
@@ -396,7 +396,7 @@ static struct sk_buff
 		if (priv->stampable_skb) {
 			dev_err_ratelimited(ds->dev,
 					    "Expected meta frame, is %12llx "
-					    "in the DSA master multicast filter?\n",
+					    "in the DSA conduit multicast filter?\n",
 					    SJA1105_META_DMAC);
 			kfree_skb(priv->stampable_skb);
 		}
@@ -417,7 +417,7 @@ static struct sk_buff
 	 * frame, which serves no further purpose).
 	 */
 	} else if (is_meta) {
-		struct dsa_port *dp = dsa_slave_to_port(skb->dev);
+		struct dsa_port *dp = dsa_user_to_port(skb->dev);
 		struct sja1105_tagger_private *priv;
 		struct dsa_switch *ds = dp->ds;
 		struct sk_buff *stampable_skb;
@@ -550,7 +550,7 @@ static struct sk_buff *sja1105_rcv(struct sk_buff *skb,
 	}
 
 	if (source_port != -1 && switch_id != -1)
-		skb->dev = dsa_master_find_slave(netdev, switch_id, source_port);
+		skb->dev = dsa_conduit_find_user(netdev, switch_id, source_port);
 	else if (vbid >= 1)
 		skb->dev = dsa_tag_8021q_find_port_by_vbid(netdev, vbid);
 	else
@@ -573,16 +573,16 @@ static struct sk_buff *sja1110_rcv_meta(struct sk_buff *skb, u16 rx_header)
 	int switch_id = SJA1110_RX_HEADER_SWITCH_ID(rx_header);
 	int n_ts = SJA1110_RX_HEADER_N_TS(rx_header);
 	struct sja1105_tagger_data *tagger_data;
-	struct net_device *master = skb->dev;
+	struct net_device *conduit = skb->dev;
 	struct dsa_port *cpu_dp;
 	struct dsa_switch *ds;
 	int i;
 
-	cpu_dp = master->dsa_ptr;
+	cpu_dp = conduit->dsa_ptr;
 	ds = dsa_switch_find(cpu_dp->dst->index, switch_id);
 	if (!ds) {
 		net_err_ratelimited("%s: cannot find switch id %d\n",
-				    master->name, switch_id);
+				    conduit->name, switch_id);
 		return NULL;
 	}
 
@@ -649,7 +649,7 @@ static struct sk_buff *sja1110_rcv_inband_control_extension(struct sk_buff *skb,
 
 		/* skb->len counts from skb->data, while start_of_padding
 		 * counts from the destination MAC address. Right now skb->data
-		 * is still as set by the DSA master, so to trim away the
+		 * is still as set by the DSA conduit, so to trim away the
 		 * padding and trailer we need to account for the fact that
 		 * skb->data points to skb_mac_header(skb) + ETH_HLEN.
 		 */
@@ -698,7 +698,7 @@ static struct sk_buff *sja1110_rcv(struct sk_buff *skb,
 	else if (source_port == -1 || switch_id == -1)
 		skb->dev = dsa_find_designated_bridge_port_by_vid(netdev, vid);
 	else
-		skb->dev = dsa_master_find_slave(netdev, switch_id, source_port);
+		skb->dev = dsa_conduit_find_user(netdev, switch_id, source_port);
 	if (!skb->dev) {
 		netdev_warn(netdev, "Couldn't decode source port\n");
 		return NULL;
@@ -778,7 +778,7 @@ static const struct dsa_device_ops sja1105_netdev_ops = {
 	.disconnect = sja1105_disconnect,
 	.needed_headroom = VLAN_HLEN,
 	.flow_dissect = sja1105_flow_dissect,
-	.promisc_on_master = true,
+	.promisc_on_conduit = true,
 };
 
 DSA_TAG_DRIVER(sja1105_netdev_ops);
diff --git a/net/dsa/tag_trailer.c b/net/dsa/tag_trailer.c
index 7361b9106382..1ebb25a8b140 100644
--- a/net/dsa/tag_trailer.c
+++ b/net/dsa/tag_trailer.c
@@ -14,7 +14,7 @@
 
 static struct sk_buff *trailer_xmit(struct sk_buff *skb, struct net_device *dev)
 {
-	struct dsa_port *dp = dsa_slave_to_port(dev);
+	struct dsa_port *dp = dsa_user_to_port(dev);
 	u8 *trailer;
 
 	trailer = skb_put(skb, 4);
@@ -41,7 +41,7 @@ static struct sk_buff *trailer_rcv(struct sk_buff *skb, struct net_device *dev)
 
 	source_port = trailer[1] & 7;
 
-	skb->dev = dsa_master_find_slave(dev, 0, source_port);
+	skb->dev = dsa_conduit_find_user(dev, 0, source_port);
 	if (!skb->dev)
 		return NULL;
 
diff --git a/net/dsa/tag_xrs700x.c b/net/dsa/tag_xrs700x.c
index af19969f9bc4..c9c163598ef2 100644
--- a/net/dsa/tag_xrs700x.c
+++ b/net/dsa/tag_xrs700x.c
@@ -13,7 +13,7 @@
 
 static struct sk_buff *xrs700x_xmit(struct sk_buff *skb, struct net_device *dev)
 {
-	struct dsa_port *partner, *dp = dsa_slave_to_port(dev);
+	struct dsa_port *partner, *dp = dsa_user_to_port(dev);
 	u8 *trailer;
 
 	trailer = skb_put(skb, 1);
@@ -39,7 +39,7 @@ static struct sk_buff *xrs700x_rcv(struct sk_buff *skb, struct net_device *dev)
 	if (source_port < 0)
 		return NULL;
 
-	skb->dev = dsa_master_find_slave(dev, 0, source_port);
+	skb->dev = dsa_conduit_find_user(dev, 0, source_port);
 	if (!skb->dev)
 		return NULL;
 
diff --git a/net/dsa/user.c b/net/dsa/user.c
new file mode 100644
index 000000000000..d438884a4eb0
--- /dev/null
+++ b/net/dsa/user.c
@@ -0,0 +1,3727 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * net/dsa/user.c - user device handling
+ * Copyright (c) 2008-2009 Marvell Semiconductor
+ */
+
+#include <linux/list.h>
+#include <linux/etherdevice.h>
+#include <linux/netdevice.h>
+#include <linux/phy.h>
+#include <linux/phy_fixed.h>
+#include <linux/phylink.h>
+#include <linux/of_net.h>
+#include <linux/of_mdio.h>
+#include <linux/mdio.h>
+#include <net/rtnetlink.h>
+#include <net/pkt_cls.h>
+#include <net/selftests.h>
+#include <net/tc_act/tc_mirred.h>
+#include <linux/if_bridge.h>
+#include <linux/if_hsr.h>
+#include <net/dcbnl.h>
+#include <linux/netpoll.h>
+#include <linux/string.h>
+
+#include "conduit.h"
+#include "dsa.h"
+#include "netlink.h"
+#include "port.h"
+#include "switch.h"
+#include "tag.h"
+#include "user.h"
+
+struct dsa_switchdev_event_work {
+	struct net_device *dev;
+	struct net_device *orig_dev;
+	struct work_struct work;
+	unsigned long event;
+	/* Specific for SWITCHDEV_FDB_ADD_TO_DEVICE and
+	 * SWITCHDEV_FDB_DEL_TO_DEVICE
+	 */
+	unsigned char addr[ETH_ALEN];
+	u16 vid;
+	bool host_addr;
+};
+
+enum dsa_standalone_event {
+	DSA_UC_ADD,
+	DSA_UC_DEL,
+	DSA_MC_ADD,
+	DSA_MC_DEL,
+};
+
+struct dsa_standalone_event_work {
+	struct work_struct work;
+	struct net_device *dev;
+	enum dsa_standalone_event event;
+	unsigned char addr[ETH_ALEN];
+	u16 vid;
+};
+
+struct dsa_host_vlan_rx_filtering_ctx {
+	struct net_device *dev;
+	const unsigned char *addr;
+	enum dsa_standalone_event event;
+};
+
+static bool dsa_switch_supports_uc_filtering(struct dsa_switch *ds)
+{
+	return ds->ops->port_fdb_add && ds->ops->port_fdb_del &&
+	       ds->fdb_isolation && !ds->vlan_filtering_is_global &&
+	       !ds->needs_standalone_vlan_filtering;
+}
+
+static bool dsa_switch_supports_mc_filtering(struct dsa_switch *ds)
+{
+	return ds->ops->port_mdb_add && ds->ops->port_mdb_del &&
+	       ds->fdb_isolation && !ds->vlan_filtering_is_global &&
+	       !ds->needs_standalone_vlan_filtering;
+}
+
+static void dsa_user_standalone_event_work(struct work_struct *work)
+{
+	struct dsa_standalone_event_work *standalone_work =
+		container_of(work, struct dsa_standalone_event_work, work);
+	const unsigned char *addr = standalone_work->addr;
+	struct net_device *dev = standalone_work->dev;
+	struct dsa_port *dp = dsa_user_to_port(dev);
+	struct switchdev_obj_port_mdb mdb;
+	struct dsa_switch *ds = dp->ds;
+	u16 vid = standalone_work->vid;
+	int err;
+
+	switch (standalone_work->event) {
+	case DSA_UC_ADD:
+		err = dsa_port_standalone_host_fdb_add(dp, addr, vid);
+		if (err) {
+			dev_err(ds->dev,
+				"port %d failed to add %pM vid %d to fdb: %d\n",
+				dp->index, addr, vid, err);
+			break;
+		}
+		break;
+
+	case DSA_UC_DEL:
+		err = dsa_port_standalone_host_fdb_del(dp, addr, vid);
+		if (err) {
+			dev_err(ds->dev,
+				"port %d failed to delete %pM vid %d from fdb: %d\n",
+				dp->index, addr, vid, err);
+		}
+
+		break;
+	case DSA_MC_ADD:
+		ether_addr_copy(mdb.addr, addr);
+		mdb.vid = vid;
+
+		err = dsa_port_standalone_host_mdb_add(dp, &mdb);
+		if (err) {
+			dev_err(ds->dev,
+				"port %d failed to add %pM vid %d to mdb: %d\n",
+				dp->index, addr, vid, err);
+			break;
+		}
+		break;
+	case DSA_MC_DEL:
+		ether_addr_copy(mdb.addr, addr);
+		mdb.vid = vid;
+
+		err = dsa_port_standalone_host_mdb_del(dp, &mdb);
+		if (err) {
+			dev_err(ds->dev,
+				"port %d failed to delete %pM vid %d from mdb: %d\n",
+				dp->index, addr, vid, err);
+		}
+
+		break;
+	}
+
+	kfree(standalone_work);
+}
+
+static int dsa_user_schedule_standalone_work(struct net_device *dev,
+					     enum dsa_standalone_event event,
+					     const unsigned char *addr,
+					     u16 vid)
+{
+	struct dsa_standalone_event_work *standalone_work;
+
+	standalone_work = kzalloc(sizeof(*standalone_work), GFP_ATOMIC);
+	if (!standalone_work)
+		return -ENOMEM;
+
+	INIT_WORK(&standalone_work->work, dsa_user_standalone_event_work);
+	standalone_work->event = event;
+	standalone_work->dev = dev;
+
+	ether_addr_copy(standalone_work->addr, addr);
+	standalone_work->vid = vid;
+
+	dsa_schedule_work(&standalone_work->work);
+
+	return 0;
+}
+
+static int dsa_user_host_vlan_rx_filtering(void *arg, int vid)
+{
+	struct dsa_host_vlan_rx_filtering_ctx *ctx = arg;
+
+	return dsa_user_schedule_standalone_work(ctx->dev, ctx->event,
+						  ctx->addr, vid);
+}
+
+static int dsa_user_vlan_for_each(struct net_device *dev,
+				  int (*cb)(void *arg, int vid), void *arg)
+{
+	struct dsa_port *dp = dsa_user_to_port(dev);
+	struct dsa_vlan *v;
+	int err;
+
+	lockdep_assert_held(&dev->addr_list_lock);
+
+	err = cb(arg, 0);
+	if (err)
+		return err;
+
+	list_for_each_entry(v, &dp->user_vlans, list) {
+		err = cb(arg, v->vid);
+		if (err)
+			return err;
+	}
+
+	return 0;
+}
+
+static int dsa_user_sync_uc(struct net_device *dev,
+			    const unsigned char *addr)
+{
+	struct net_device *conduit = dsa_user_to_conduit(dev);
+	struct dsa_port *dp = dsa_user_to_port(dev);
+	struct dsa_host_vlan_rx_filtering_ctx ctx = {
+		.dev = dev,
+		.addr = addr,
+		.event = DSA_UC_ADD,
+	};
+
+	dev_uc_add(conduit, addr);
+
+	if (!dsa_switch_supports_uc_filtering(dp->ds))
+		return 0;
+
+	return dsa_user_vlan_for_each(dev, dsa_user_host_vlan_rx_filtering,
+				       &ctx);
+}
+
+static int dsa_user_unsync_uc(struct net_device *dev,
+			      const unsigned char *addr)
+{
+	struct net_device *conduit = dsa_user_to_conduit(dev);
+	struct dsa_port *dp = dsa_user_to_port(dev);
+	struct dsa_host_vlan_rx_filtering_ctx ctx = {
+		.dev = dev,
+		.addr = addr,
+		.event = DSA_UC_DEL,
+	};
+
+	dev_uc_del(conduit, addr);
+
+	if (!dsa_switch_supports_uc_filtering(dp->ds))
+		return 0;
+
+	return dsa_user_vlan_for_each(dev, dsa_user_host_vlan_rx_filtering,
+				       &ctx);
+}
+
+static int dsa_user_sync_mc(struct net_device *dev,
+			    const unsigned char *addr)
+{
+	struct net_device *conduit = dsa_user_to_conduit(dev);
+	struct dsa_port *dp = dsa_user_to_port(dev);
+	struct dsa_host_vlan_rx_filtering_ctx ctx = {
+		.dev = dev,
+		.addr = addr,
+		.event = DSA_MC_ADD,
+	};
+
+	dev_mc_add(conduit, addr);
+
+	if (!dsa_switch_supports_mc_filtering(dp->ds))
+		return 0;
+
+	return dsa_user_vlan_for_each(dev, dsa_user_host_vlan_rx_filtering,
+				       &ctx);
+}
+
+static int dsa_user_unsync_mc(struct net_device *dev,
+			      const unsigned char *addr)
+{
+	struct net_device *conduit = dsa_user_to_conduit(dev);
+	struct dsa_port *dp = dsa_user_to_port(dev);
+	struct dsa_host_vlan_rx_filtering_ctx ctx = {
+		.dev = dev,
+		.addr = addr,
+		.event = DSA_MC_DEL,
+	};
+
+	dev_mc_del(conduit, addr);
+
+	if (!dsa_switch_supports_mc_filtering(dp->ds))
+		return 0;
+
+	return dsa_user_vlan_for_each(dev, dsa_user_host_vlan_rx_filtering,
+				       &ctx);
+}
+
+void dsa_user_sync_ha(struct net_device *dev)
+{
+	struct dsa_port *dp = dsa_user_to_port(dev);
+	struct dsa_switch *ds = dp->ds;
+	struct netdev_hw_addr *ha;
+
+	netif_addr_lock_bh(dev);
+
+	netdev_for_each_synced_mc_addr(ha, dev)
+		dsa_user_sync_mc(dev, ha->addr);
+
+	netdev_for_each_synced_uc_addr(ha, dev)
+		dsa_user_sync_uc(dev, ha->addr);
+
+	netif_addr_unlock_bh(dev);
+
+	if (dsa_switch_supports_uc_filtering(ds) ||
+	    dsa_switch_supports_mc_filtering(ds))
+		dsa_flush_workqueue();
+}
+
+void dsa_user_unsync_ha(struct net_device *dev)
+{
+	struct dsa_port *dp = dsa_user_to_port(dev);
+	struct dsa_switch *ds = dp->ds;
+	struct netdev_hw_addr *ha;
+
+	netif_addr_lock_bh(dev);
+
+	netdev_for_each_synced_uc_addr(ha, dev)
+		dsa_user_unsync_uc(dev, ha->addr);
+
+	netdev_for_each_synced_mc_addr(ha, dev)
+		dsa_user_unsync_mc(dev, ha->addr);
+
+	netif_addr_unlock_bh(dev);
+
+	if (dsa_switch_supports_uc_filtering(ds) ||
+	    dsa_switch_supports_mc_filtering(ds))
+		dsa_flush_workqueue();
+}
+
+/* user mii_bus handling ***************************************************/
+static int dsa_user_phy_read(struct mii_bus *bus, int addr, int reg)
+{
+	struct dsa_switch *ds = bus->priv;
+
+	if (ds->phys_mii_mask & (1 << addr))
+		return ds->ops->phy_read(ds, addr, reg);
+
+	return 0xffff;
+}
+
+static int dsa_user_phy_write(struct mii_bus *bus, int addr, int reg, u16 val)
+{
+	struct dsa_switch *ds = bus->priv;
+
+	if (ds->phys_mii_mask & (1 << addr))
+		return ds->ops->phy_write(ds, addr, reg, val);
+
+	return 0;
+}
+
+void dsa_user_mii_bus_init(struct dsa_switch *ds)
+{
+	ds->user_mii_bus->priv = (void *)ds;
+	ds->user_mii_bus->name = "dsa user smi";
+	ds->user_mii_bus->read = dsa_user_phy_read;
+	ds->user_mii_bus->write = dsa_user_phy_write;
+	snprintf(ds->user_mii_bus->id, MII_BUS_ID_SIZE, "dsa-%d.%d",
+		 ds->dst->index, ds->index);
+	ds->user_mii_bus->parent = ds->dev;
+	ds->user_mii_bus->phy_mask = ~ds->phys_mii_mask;
+}
+
+
+/* user device handling ****************************************************/
+static int dsa_user_get_iflink(const struct net_device *dev)
+{
+	return dsa_user_to_conduit(dev)->ifindex;
+}
+
+static int dsa_user_open(struct net_device *dev)
+{
+	struct net_device *conduit = dsa_user_to_conduit(dev);
+	struct dsa_port *dp = dsa_user_to_port(dev);
+	struct dsa_switch *ds = dp->ds;
+	int err;
+
+	err = dev_open(conduit, NULL);
+	if (err < 0) {
+		netdev_err(dev, "failed to open conduit %s\n", conduit->name);
+		goto out;
+	}
+
+	if (dsa_switch_supports_uc_filtering(ds)) {
+		err = dsa_port_standalone_host_fdb_add(dp, dev->dev_addr, 0);
+		if (err)
+			goto out;
+	}
+
+	if (!ether_addr_equal(dev->dev_addr, conduit->dev_addr)) {
+		err = dev_uc_add(conduit, dev->dev_addr);
+		if (err < 0)
+			goto del_host_addr;
+	}
+
+	err = dsa_port_enable_rt(dp, dev->phydev);
+	if (err)
+		goto del_unicast;
+
+	return 0;
+
+del_unicast:
+	if (!ether_addr_equal(dev->dev_addr, conduit->dev_addr))
+		dev_uc_del(conduit, dev->dev_addr);
+del_host_addr:
+	if (dsa_switch_supports_uc_filtering(ds))
+		dsa_port_standalone_host_fdb_del(dp, dev->dev_addr, 0);
+out:
+	return err;
+}
+
+static int dsa_user_close(struct net_device *dev)
+{
+	struct net_device *conduit = dsa_user_to_conduit(dev);
+	struct dsa_port *dp = dsa_user_to_port(dev);
+	struct dsa_switch *ds = dp->ds;
+
+	dsa_port_disable_rt(dp);
+
+	if (!ether_addr_equal(dev->dev_addr, conduit->dev_addr))
+		dev_uc_del(conduit, dev->dev_addr);
+
+	if (dsa_switch_supports_uc_filtering(ds))
+		dsa_port_standalone_host_fdb_del(dp, dev->dev_addr, 0);
+
+	return 0;
+}
+
+static void dsa_user_manage_host_flood(struct net_device *dev)
+{
+	bool mc = dev->flags & (IFF_PROMISC | IFF_ALLMULTI);
+	struct dsa_port *dp = dsa_user_to_port(dev);
+	bool uc = dev->flags & IFF_PROMISC;
+
+	dsa_port_set_host_flood(dp, uc, mc);
+}
+
+static void dsa_user_change_rx_flags(struct net_device *dev, int change)
+{
+	struct net_device *conduit = dsa_user_to_conduit(dev);
+	struct dsa_port *dp = dsa_user_to_port(dev);
+	struct dsa_switch *ds = dp->ds;
+
+	if (change & IFF_ALLMULTI)
+		dev_set_allmulti(conduit,
+				 dev->flags & IFF_ALLMULTI ? 1 : -1);
+	if (change & IFF_PROMISC)
+		dev_set_promiscuity(conduit,
+				    dev->flags & IFF_PROMISC ? 1 : -1);
+
+	if (dsa_switch_supports_uc_filtering(ds) &&
+	    dsa_switch_supports_mc_filtering(ds))
+		dsa_user_manage_host_flood(dev);
+}
+
+static void dsa_user_set_rx_mode(struct net_device *dev)
+{
+	__dev_mc_sync(dev, dsa_user_sync_mc, dsa_user_unsync_mc);
+	__dev_uc_sync(dev, dsa_user_sync_uc, dsa_user_unsync_uc);
+}
+
+static int dsa_user_set_mac_address(struct net_device *dev, void *a)
+{
+	struct net_device *conduit = dsa_user_to_conduit(dev);
+	struct dsa_port *dp = dsa_user_to_port(dev);
+	struct dsa_switch *ds = dp->ds;
+	struct sockaddr *addr = a;
+	int err;
+
+	if (!is_valid_ether_addr(addr->sa_data))
+		return -EADDRNOTAVAIL;
+
+	if (ds->ops->port_set_mac_address) {
+		err = ds->ops->port_set_mac_address(ds, dp->index,
+						    addr->sa_data);
+		if (err)
+			return err;
+	}
+
+	/* If the port is down, the address isn't synced yet to hardware or
+	 * to the DSA conduit, so there is nothing to change.
+	 */
+	if (!(dev->flags & IFF_UP))
+		goto out_change_dev_addr;
+
+	if (dsa_switch_supports_uc_filtering(ds)) {
+		err = dsa_port_standalone_host_fdb_add(dp, addr->sa_data, 0);
+		if (err)
+			return err;
+	}
+
+	if (!ether_addr_equal(addr->sa_data, conduit->dev_addr)) {
+		err = dev_uc_add(conduit, addr->sa_data);
+		if (err < 0)
+			goto del_unicast;
+	}
+
+	if (!ether_addr_equal(dev->dev_addr, conduit->dev_addr))
+		dev_uc_del(conduit, dev->dev_addr);
+
+	if (dsa_switch_supports_uc_filtering(ds))
+		dsa_port_standalone_host_fdb_del(dp, dev->dev_addr, 0);
+
+out_change_dev_addr:
+	eth_hw_addr_set(dev, addr->sa_data);
+
+	return 0;
+
+del_unicast:
+	if (dsa_switch_supports_uc_filtering(ds))
+		dsa_port_standalone_host_fdb_del(dp, addr->sa_data, 0);
+
+	return err;
+}
+
+struct dsa_user_dump_ctx {
+	struct net_device *dev;
+	struct sk_buff *skb;
+	struct netlink_callback *cb;
+	int idx;
+};
+
+static int
+dsa_user_port_fdb_do_dump(const unsigned char *addr, u16 vid,
+			  bool is_static, void *data)
+{
+	struct dsa_user_dump_ctx *dump = data;
+	u32 portid = NETLINK_CB(dump->cb->skb).portid;
+	u32 seq = dump->cb->nlh->nlmsg_seq;
+	struct nlmsghdr *nlh;
+	struct ndmsg *ndm;
+
+	if (dump->idx < dump->cb->args[2])
+		goto skip;
+
+	nlh = nlmsg_put(dump->skb, portid, seq, RTM_NEWNEIGH,
+			sizeof(*ndm), NLM_F_MULTI);
+	if (!nlh)
+		return -EMSGSIZE;
+
+	ndm = nlmsg_data(nlh);
+	ndm->ndm_family  = AF_BRIDGE;
+	ndm->ndm_pad1    = 0;
+	ndm->ndm_pad2    = 0;
+	ndm->ndm_flags   = NTF_SELF;
+	ndm->ndm_type    = 0;
+	ndm->ndm_ifindex = dump->dev->ifindex;
+	ndm->ndm_state   = is_static ? NUD_NOARP : NUD_REACHABLE;
+
+	if (nla_put(dump->skb, NDA_LLADDR, ETH_ALEN, addr))
+		goto nla_put_failure;
+
+	if (vid && nla_put_u16(dump->skb, NDA_VLAN, vid))
+		goto nla_put_failure;
+
+	nlmsg_end(dump->skb, nlh);
+
+skip:
+	dump->idx++;
+	return 0;
+
+nla_put_failure:
+	nlmsg_cancel(dump->skb, nlh);
+	return -EMSGSIZE;
+}
+
+static int
+dsa_user_fdb_dump(struct sk_buff *skb, struct netlink_callback *cb,
+		  struct net_device *dev, struct net_device *filter_dev,
+		  int *idx)
+{
+	struct dsa_port *dp = dsa_user_to_port(dev);
+	struct dsa_user_dump_ctx dump = {
+		.dev = dev,
+		.skb = skb,
+		.cb = cb,
+		.idx = *idx,
+	};
+	int err;
+
+	err = dsa_port_fdb_dump(dp, dsa_user_port_fdb_do_dump, &dump);
+	*idx = dump.idx;
+
+	return err;
+}
+
+static int dsa_user_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd)
+{
+	struct dsa_user_priv *p = netdev_priv(dev);
+	struct dsa_switch *ds = p->dp->ds;
+	int port = p->dp->index;
+
+	/* Pass through to switch driver if it supports timestamping */
+	switch (cmd) {
+	case SIOCGHWTSTAMP:
+		if (ds->ops->port_hwtstamp_get)
+			return ds->ops->port_hwtstamp_get(ds, port, ifr);
+		break;
+	case SIOCSHWTSTAMP:
+		if (ds->ops->port_hwtstamp_set)
+			return ds->ops->port_hwtstamp_set(ds, port, ifr);
+		break;
+	}
+
+	return phylink_mii_ioctl(p->dp->pl, ifr, cmd);
+}
+
+static int dsa_user_port_attr_set(struct net_device *dev, const void *ctx,
+				  const struct switchdev_attr *attr,
+				  struct netlink_ext_ack *extack)
+{
+	struct dsa_port *dp = dsa_user_to_port(dev);
+	int ret;
+
+	if (ctx && ctx != dp)
+		return 0;
+
+	switch (attr->id) {
+	case SWITCHDEV_ATTR_ID_PORT_STP_STATE:
+		if (!dsa_port_offloads_bridge_port(dp, attr->orig_dev))
+			return -EOPNOTSUPP;
+
+		ret = dsa_port_set_state(dp, attr->u.stp_state, true);
+		break;
+	case SWITCHDEV_ATTR_ID_PORT_MST_STATE:
+		if (!dsa_port_offloads_bridge_port(dp, attr->orig_dev))
+			return -EOPNOTSUPP;
+
+		ret = dsa_port_set_mst_state(dp, &attr->u.mst_state, extack);
+		break;
+	case SWITCHDEV_ATTR_ID_BRIDGE_VLAN_FILTERING:
+		if (!dsa_port_offloads_bridge_dev(dp, attr->orig_dev))
+			return -EOPNOTSUPP;
+
+		ret = dsa_port_vlan_filtering(dp, attr->u.vlan_filtering,
+					      extack);
+		break;
+	case SWITCHDEV_ATTR_ID_BRIDGE_AGEING_TIME:
+		if (!dsa_port_offloads_bridge_dev(dp, attr->orig_dev))
+			return -EOPNOTSUPP;
+
+		ret = dsa_port_ageing_time(dp, attr->u.ageing_time);
+		break;
+	case SWITCHDEV_ATTR_ID_BRIDGE_MST:
+		if (!dsa_port_offloads_bridge_dev(dp, attr->orig_dev))
+			return -EOPNOTSUPP;
+
+		ret = dsa_port_mst_enable(dp, attr->u.mst, extack);
+		break;
+	case SWITCHDEV_ATTR_ID_PORT_PRE_BRIDGE_FLAGS:
+		if (!dsa_port_offloads_bridge_port(dp, attr->orig_dev))
+			return -EOPNOTSUPP;
+
+		ret = dsa_port_pre_bridge_flags(dp, attr->u.brport_flags,
+						extack);
+		break;
+	case SWITCHDEV_ATTR_ID_PORT_BRIDGE_FLAGS:
+		if (!dsa_port_offloads_bridge_port(dp, attr->orig_dev))
+			return -EOPNOTSUPP;
+
+		ret = dsa_port_bridge_flags(dp, attr->u.brport_flags, extack);
+		break;
+	case SWITCHDEV_ATTR_ID_VLAN_MSTI:
+		if (!dsa_port_offloads_bridge_dev(dp, attr->orig_dev))
+			return -EOPNOTSUPP;
+
+		ret = dsa_port_vlan_msti(dp, &attr->u.vlan_msti);
+		break;
+	default:
+		ret = -EOPNOTSUPP;
+		break;
+	}
+
+	return ret;
+}
+
+/* Must be called under rcu_read_lock() */
+static int
+dsa_user_vlan_check_for_8021q_uppers(struct net_device *user,
+				     const struct switchdev_obj_port_vlan *vlan)
+{
+	struct net_device *upper_dev;
+	struct list_head *iter;
+
+	netdev_for_each_upper_dev_rcu(user, upper_dev, iter) {
+		u16 vid;
+
+		if (!is_vlan_dev(upper_dev))
+			continue;
+
+		vid = vlan_dev_vlan_id(upper_dev);
+		if (vid == vlan->vid)
+			return -EBUSY;
+	}
+
+	return 0;
+}
+
+static int dsa_user_vlan_add(struct net_device *dev,
+			     const struct switchdev_obj *obj,
+			     struct netlink_ext_ack *extack)
+{
+	struct dsa_port *dp = dsa_user_to_port(dev);
+	struct switchdev_obj_port_vlan *vlan;
+	int err;
+
+	if (dsa_port_skip_vlan_configuration(dp)) {
+		NL_SET_ERR_MSG_MOD(extack, "skipping configuration of VLAN");
+		return 0;
+	}
+
+	vlan = SWITCHDEV_OBJ_PORT_VLAN(obj);
+
+	/* Deny adding a bridge VLAN when there is already an 802.1Q upper with
+	 * the same VID.
+	 */
+	if (br_vlan_enabled(dsa_port_bridge_dev_get(dp))) {
+		rcu_read_lock();
+		err = dsa_user_vlan_check_for_8021q_uppers(dev, vlan);
+		rcu_read_unlock();
+		if (err) {
+			NL_SET_ERR_MSG_MOD(extack,
+					   "Port already has a VLAN upper with this VID");
+			return err;
+		}
+	}
+
+	return dsa_port_vlan_add(dp, vlan, extack);
+}
+
+/* Offload a VLAN installed on the bridge or on a foreign interface by
+ * installing it as a VLAN towards the CPU port.
+ */
+static int dsa_user_host_vlan_add(struct net_device *dev,
+				  const struct switchdev_obj *obj,
+				  struct netlink_ext_ack *extack)
+{
+	struct dsa_port *dp = dsa_user_to_port(dev);
+	struct switchdev_obj_port_vlan vlan;
+
+	/* Do nothing if this is a software bridge */
+	if (!dp->bridge)
+		return -EOPNOTSUPP;
+
+	if (dsa_port_skip_vlan_configuration(dp)) {
+		NL_SET_ERR_MSG_MOD(extack, "skipping configuration of VLAN");
+		return 0;
+	}
+
+	vlan = *SWITCHDEV_OBJ_PORT_VLAN(obj);
+
+	/* Even though drivers often handle CPU membership in special ways,
+	 * it doesn't make sense to program a PVID, so clear this flag.
+	 */
+	vlan.flags &= ~BRIDGE_VLAN_INFO_PVID;
+
+	return dsa_port_host_vlan_add(dp, &vlan, extack);
+}
+
+static int dsa_user_port_obj_add(struct net_device *dev, const void *ctx,
+				 const struct switchdev_obj *obj,
+				 struct netlink_ext_ack *extack)
+{
+	struct dsa_port *dp = dsa_user_to_port(dev);
+	int err;
+
+	if (ctx && ctx != dp)
+		return 0;
+
+	switch (obj->id) {
+	case SWITCHDEV_OBJ_ID_PORT_MDB:
+		if (!dsa_port_offloads_bridge_port(dp, obj->orig_dev))
+			return -EOPNOTSUPP;
+
+		err = dsa_port_mdb_add(dp, SWITCHDEV_OBJ_PORT_MDB(obj));
+		break;
+	case SWITCHDEV_OBJ_ID_HOST_MDB:
+		if (!dsa_port_offloads_bridge_dev(dp, obj->orig_dev))
+			return -EOPNOTSUPP;
+
+		err = dsa_port_bridge_host_mdb_add(dp, SWITCHDEV_OBJ_PORT_MDB(obj));
+		break;
+	case SWITCHDEV_OBJ_ID_PORT_VLAN:
+		if (dsa_port_offloads_bridge_port(dp, obj->orig_dev))
+			err = dsa_user_vlan_add(dev, obj, extack);
+		else
+			err = dsa_user_host_vlan_add(dev, obj, extack);
+		break;
+	case SWITCHDEV_OBJ_ID_MRP:
+		if (!dsa_port_offloads_bridge_dev(dp, obj->orig_dev))
+			return -EOPNOTSUPP;
+
+		err = dsa_port_mrp_add(dp, SWITCHDEV_OBJ_MRP(obj));
+		break;
+	case SWITCHDEV_OBJ_ID_RING_ROLE_MRP:
+		if (!dsa_port_offloads_bridge_dev(dp, obj->orig_dev))
+			return -EOPNOTSUPP;
+
+		err = dsa_port_mrp_add_ring_role(dp,
+						 SWITCHDEV_OBJ_RING_ROLE_MRP(obj));
+		break;
+	default:
+		err = -EOPNOTSUPP;
+		break;
+	}
+
+	return err;
+}
+
+static int dsa_user_vlan_del(struct net_device *dev,
+			     const struct switchdev_obj *obj)
+{
+	struct dsa_port *dp = dsa_user_to_port(dev);
+	struct switchdev_obj_port_vlan *vlan;
+
+	if (dsa_port_skip_vlan_configuration(dp))
+		return 0;
+
+	vlan = SWITCHDEV_OBJ_PORT_VLAN(obj);
+
+	return dsa_port_vlan_del(dp, vlan);
+}
+
+static int dsa_user_host_vlan_del(struct net_device *dev,
+				  const struct switchdev_obj *obj)
+{
+	struct dsa_port *dp = dsa_user_to_port(dev);
+	struct switchdev_obj_port_vlan *vlan;
+
+	/* Do nothing if this is a software bridge */
+	if (!dp->bridge)
+		return -EOPNOTSUPP;
+
+	if (dsa_port_skip_vlan_configuration(dp))
+		return 0;
+
+	vlan = SWITCHDEV_OBJ_PORT_VLAN(obj);
+
+	return dsa_port_host_vlan_del(dp, vlan);
+}
+
+static int dsa_user_port_obj_del(struct net_device *dev, const void *ctx,
+				 const struct switchdev_obj *obj)
+{
+	struct dsa_port *dp = dsa_user_to_port(dev);
+	int err;
+
+	if (ctx && ctx != dp)
+		return 0;
+
+	switch (obj->id) {
+	case SWITCHDEV_OBJ_ID_PORT_MDB:
+		if (!dsa_port_offloads_bridge_port(dp, obj->orig_dev))
+			return -EOPNOTSUPP;
+
+		err = dsa_port_mdb_del(dp, SWITCHDEV_OBJ_PORT_MDB(obj));
+		break;
+	case SWITCHDEV_OBJ_ID_HOST_MDB:
+		if (!dsa_port_offloads_bridge_dev(dp, obj->orig_dev))
+			return -EOPNOTSUPP;
+
+		err = dsa_port_bridge_host_mdb_del(dp, SWITCHDEV_OBJ_PORT_MDB(obj));
+		break;
+	case SWITCHDEV_OBJ_ID_PORT_VLAN:
+		if (dsa_port_offloads_bridge_port(dp, obj->orig_dev))
+			err = dsa_user_vlan_del(dev, obj);
+		else
+			err = dsa_user_host_vlan_del(dev, obj);
+		break;
+	case SWITCHDEV_OBJ_ID_MRP:
+		if (!dsa_port_offloads_bridge_dev(dp, obj->orig_dev))
+			return -EOPNOTSUPP;
+
+		err = dsa_port_mrp_del(dp, SWITCHDEV_OBJ_MRP(obj));
+		break;
+	case SWITCHDEV_OBJ_ID_RING_ROLE_MRP:
+		if (!dsa_port_offloads_bridge_dev(dp, obj->orig_dev))
+			return -EOPNOTSUPP;
+
+		err = dsa_port_mrp_del_ring_role(dp,
+						 SWITCHDEV_OBJ_RING_ROLE_MRP(obj));
+		break;
+	default:
+		err = -EOPNOTSUPP;
+		break;
+	}
+
+	return err;
+}
+
+static inline netdev_tx_t dsa_user_netpoll_send_skb(struct net_device *dev,
+						    struct sk_buff *skb)
+{
+#ifdef CONFIG_NET_POLL_CONTROLLER
+	struct dsa_user_priv *p = netdev_priv(dev);
+
+	return netpoll_send_skb(p->netpoll, skb);
+#else
+	BUG();
+	return NETDEV_TX_OK;
+#endif
+}
+
+static void dsa_skb_tx_timestamp(struct dsa_user_priv *p,
+				 struct sk_buff *skb)
+{
+	struct dsa_switch *ds = p->dp->ds;
+
+	if (!(skb_shinfo(skb)->tx_flags & SKBTX_HW_TSTAMP))
+		return;
+
+	if (!ds->ops->port_txtstamp)
+		return;
+
+	ds->ops->port_txtstamp(ds, p->dp->index, skb);
+}
+
+netdev_tx_t dsa_enqueue_skb(struct sk_buff *skb, struct net_device *dev)
+{
+	/* SKB for netpoll still need to be mangled with the protocol-specific
+	 * tag to be successfully transmitted
+	 */
+	if (unlikely(netpoll_tx_running(dev)))
+		return dsa_user_netpoll_send_skb(dev, skb);
+
+	/* Queue the SKB for transmission on the parent interface, but
+	 * do not modify its EtherType
+	 */
+	skb->dev = dsa_user_to_conduit(dev);
+	dev_queue_xmit(skb);
+
+	return NETDEV_TX_OK;
+}
+EXPORT_SYMBOL_GPL(dsa_enqueue_skb);
+
+static int dsa_realloc_skb(struct sk_buff *skb, struct net_device *dev)
+{
+	int needed_headroom = dev->needed_headroom;
+	int needed_tailroom = dev->needed_tailroom;
+
+	/* For tail taggers, we need to pad short frames ourselves, to ensure
+	 * that the tail tag does not fail at its role of being at the end of
+	 * the packet, once the conduit interface pads the frame. Account for
+	 * that pad length here, and pad later.
+	 */
+	if (unlikely(needed_tailroom && skb->len < ETH_ZLEN))
+		needed_tailroom += ETH_ZLEN - skb->len;
+	/* skb_headroom() returns unsigned int... */
+	needed_headroom = max_t(int, needed_headroom - skb_headroom(skb), 0);
+	needed_tailroom = max_t(int, needed_tailroom - skb_tailroom(skb), 0);
+
+	if (likely(!needed_headroom && !needed_tailroom && !skb_cloned(skb)))
+		/* No reallocation needed, yay! */
+		return 0;
+
+	return pskb_expand_head(skb, needed_headroom, needed_tailroom,
+				GFP_ATOMIC);
+}
+
+static netdev_tx_t dsa_user_xmit(struct sk_buff *skb, struct net_device *dev)
+{
+	struct dsa_user_priv *p = netdev_priv(dev);
+	struct sk_buff *nskb;
+
+	dev_sw_netstats_tx_add(dev, 1, skb->len);
+
+	memset(skb->cb, 0, sizeof(skb->cb));
+
+	/* Handle tx timestamp if any */
+	dsa_skb_tx_timestamp(p, skb);
+
+	if (dsa_realloc_skb(skb, dev)) {
+		dev_kfree_skb_any(skb);
+		return NETDEV_TX_OK;
+	}
+
+	/* needed_tailroom should still be 'warm' in the cache line from
+	 * dsa_realloc_skb(), which has also ensured that padding is safe.
+	 */
+	if (dev->needed_tailroom)
+		eth_skb_pad(skb);
+
+	/* Transmit function may have to reallocate the original SKB,
+	 * in which case it must have freed it. Only free it here on error.
+	 */
+	nskb = p->xmit(skb, dev);
+	if (!nskb) {
+		kfree_skb(skb);
+		return NETDEV_TX_OK;
+	}
+
+	return dsa_enqueue_skb(nskb, dev);
+}
+
+/* ethtool operations *******************************************************/
+
+static void dsa_user_get_drvinfo(struct net_device *dev,
+				 struct ethtool_drvinfo *drvinfo)
+{
+	strscpy(drvinfo->driver, "dsa", sizeof(drvinfo->driver));
+	strscpy(drvinfo->fw_version, "N/A", sizeof(drvinfo->fw_version));
+	strscpy(drvinfo->bus_info, "platform", sizeof(drvinfo->bus_info));
+}
+
+static int dsa_user_get_regs_len(struct net_device *dev)
+{
+	struct dsa_port *dp = dsa_user_to_port(dev);
+	struct dsa_switch *ds = dp->ds;
+
+	if (ds->ops->get_regs_len)
+		return ds->ops->get_regs_len(ds, dp->index);
+
+	return -EOPNOTSUPP;
+}
+
+static void
+dsa_user_get_regs(struct net_device *dev, struct ethtool_regs *regs, void *_p)
+{
+	struct dsa_port *dp = dsa_user_to_port(dev);
+	struct dsa_switch *ds = dp->ds;
+
+	if (ds->ops->get_regs)
+		ds->ops->get_regs(ds, dp->index, regs, _p);
+}
+
+static int dsa_user_nway_reset(struct net_device *dev)
+{
+	struct dsa_port *dp = dsa_user_to_port(dev);
+
+	return phylink_ethtool_nway_reset(dp->pl);
+}
+
+static int dsa_user_get_eeprom_len(struct net_device *dev)
+{
+	struct dsa_port *dp = dsa_user_to_port(dev);
+	struct dsa_switch *ds = dp->ds;
+
+	if (ds->cd && ds->cd->eeprom_len)
+		return ds->cd->eeprom_len;
+
+	if (ds->ops->get_eeprom_len)
+		return ds->ops->get_eeprom_len(ds);
+
+	return 0;
+}
+
+static int dsa_user_get_eeprom(struct net_device *dev,
+			       struct ethtool_eeprom *eeprom, u8 *data)
+{
+	struct dsa_port *dp = dsa_user_to_port(dev);
+	struct dsa_switch *ds = dp->ds;
+
+	if (ds->ops->get_eeprom)
+		return ds->ops->get_eeprom(ds, eeprom, data);
+
+	return -EOPNOTSUPP;
+}
+
+static int dsa_user_set_eeprom(struct net_device *dev,
+			       struct ethtool_eeprom *eeprom, u8 *data)
+{
+	struct dsa_port *dp = dsa_user_to_port(dev);
+	struct dsa_switch *ds = dp->ds;
+
+	if (ds->ops->set_eeprom)
+		return ds->ops->set_eeprom(ds, eeprom, data);
+
+	return -EOPNOTSUPP;
+}
+
+static void dsa_user_get_strings(struct net_device *dev,
+				 uint32_t stringset, uint8_t *data)
+{
+	struct dsa_port *dp = dsa_user_to_port(dev);
+	struct dsa_switch *ds = dp->ds;
+
+	if (stringset == ETH_SS_STATS) {
+		int len = ETH_GSTRING_LEN;
+
+		strscpy_pad(data, "tx_packets", len);
+		strscpy_pad(data + len, "tx_bytes", len);
+		strscpy_pad(data + 2 * len, "rx_packets", len);
+		strscpy_pad(data + 3 * len, "rx_bytes", len);
+		if (ds->ops->get_strings)
+			ds->ops->get_strings(ds, dp->index, stringset,
+					     data + 4 * len);
+	} else if (stringset ==  ETH_SS_TEST) {
+		net_selftest_get_strings(data);
+	}
+
+}
+
+static void dsa_user_get_ethtool_stats(struct net_device *dev,
+				       struct ethtool_stats *stats,
+				       uint64_t *data)
+{
+	struct dsa_port *dp = dsa_user_to_port(dev);
+	struct dsa_switch *ds = dp->ds;
+	struct pcpu_sw_netstats *s;
+	unsigned int start;
+	int i;
+
+	for_each_possible_cpu(i) {
+		u64 tx_packets, tx_bytes, rx_packets, rx_bytes;
+
+		s = per_cpu_ptr(dev->tstats, i);
+		do {
+			start = u64_stats_fetch_begin(&s->syncp);
+			tx_packets = u64_stats_read(&s->tx_packets);
+			tx_bytes = u64_stats_read(&s->tx_bytes);
+			rx_packets = u64_stats_read(&s->rx_packets);
+			rx_bytes = u64_stats_read(&s->rx_bytes);
+		} while (u64_stats_fetch_retry(&s->syncp, start));
+		data[0] += tx_packets;
+		data[1] += tx_bytes;
+		data[2] += rx_packets;
+		data[3] += rx_bytes;
+	}
+	if (ds->ops->get_ethtool_stats)
+		ds->ops->get_ethtool_stats(ds, dp->index, data + 4);
+}
+
+static int dsa_user_get_sset_count(struct net_device *dev, int sset)
+{
+	struct dsa_port *dp = dsa_user_to_port(dev);
+	struct dsa_switch *ds = dp->ds;
+
+	if (sset == ETH_SS_STATS) {
+		int count = 0;
+
+		if (ds->ops->get_sset_count) {
+			count = ds->ops->get_sset_count(ds, dp->index, sset);
+			if (count < 0)
+				return count;
+		}
+
+		return count + 4;
+	} else if (sset ==  ETH_SS_TEST) {
+		return net_selftest_get_count();
+	}
+
+	return -EOPNOTSUPP;
+}
+
+static void dsa_user_get_eth_phy_stats(struct net_device *dev,
+				       struct ethtool_eth_phy_stats *phy_stats)
+{
+	struct dsa_port *dp = dsa_user_to_port(dev);
+	struct dsa_switch *ds = dp->ds;
+
+	if (ds->ops->get_eth_phy_stats)
+		ds->ops->get_eth_phy_stats(ds, dp->index, phy_stats);
+}
+
+static void dsa_user_get_eth_mac_stats(struct net_device *dev,
+				       struct ethtool_eth_mac_stats *mac_stats)
+{
+	struct dsa_port *dp = dsa_user_to_port(dev);
+	struct dsa_switch *ds = dp->ds;
+
+	if (ds->ops->get_eth_mac_stats)
+		ds->ops->get_eth_mac_stats(ds, dp->index, mac_stats);
+}
+
+static void
+dsa_user_get_eth_ctrl_stats(struct net_device *dev,
+			    struct ethtool_eth_ctrl_stats *ctrl_stats)
+{
+	struct dsa_port *dp = dsa_user_to_port(dev);
+	struct dsa_switch *ds = dp->ds;
+
+	if (ds->ops->get_eth_ctrl_stats)
+		ds->ops->get_eth_ctrl_stats(ds, dp->index, ctrl_stats);
+}
+
+static void
+dsa_user_get_rmon_stats(struct net_device *dev,
+			struct ethtool_rmon_stats *rmon_stats,
+			const struct ethtool_rmon_hist_range **ranges)
+{
+	struct dsa_port *dp = dsa_user_to_port(dev);
+	struct dsa_switch *ds = dp->ds;
+
+	if (ds->ops->get_rmon_stats)
+		ds->ops->get_rmon_stats(ds, dp->index, rmon_stats, ranges);
+}
+
+static void dsa_user_net_selftest(struct net_device *ndev,
+				  struct ethtool_test *etest, u64 *buf)
+{
+	struct dsa_port *dp = dsa_user_to_port(ndev);
+	struct dsa_switch *ds = dp->ds;
+
+	if (ds->ops->self_test) {
+		ds->ops->self_test(ds, dp->index, etest, buf);
+		return;
+	}
+
+	net_selftest(ndev, etest, buf);
+}
+
+static int dsa_user_get_mm(struct net_device *dev,
+			   struct ethtool_mm_state *state)
+{
+	struct dsa_port *dp = dsa_user_to_port(dev);
+	struct dsa_switch *ds = dp->ds;
+
+	if (!ds->ops->get_mm)
+		return -EOPNOTSUPP;
+
+	return ds->ops->get_mm(ds, dp->index, state);
+}
+
+static int dsa_user_set_mm(struct net_device *dev, struct ethtool_mm_cfg *cfg,
+			   struct netlink_ext_ack *extack)
+{
+	struct dsa_port *dp = dsa_user_to_port(dev);
+	struct dsa_switch *ds = dp->ds;
+
+	if (!ds->ops->set_mm)
+		return -EOPNOTSUPP;
+
+	return ds->ops->set_mm(ds, dp->index, cfg, extack);
+}
+
+static void dsa_user_get_mm_stats(struct net_device *dev,
+				  struct ethtool_mm_stats *stats)
+{
+	struct dsa_port *dp = dsa_user_to_port(dev);
+	struct dsa_switch *ds = dp->ds;
+
+	if (ds->ops->get_mm_stats)
+		ds->ops->get_mm_stats(ds, dp->index, stats);
+}
+
+static void dsa_user_get_wol(struct net_device *dev, struct ethtool_wolinfo *w)
+{
+	struct dsa_port *dp = dsa_user_to_port(dev);
+	struct dsa_switch *ds = dp->ds;
+
+	phylink_ethtool_get_wol(dp->pl, w);
+
+	if (ds->ops->get_wol)
+		ds->ops->get_wol(ds, dp->index, w);
+}
+
+static int dsa_user_set_wol(struct net_device *dev, struct ethtool_wolinfo *w)
+{
+	struct dsa_port *dp = dsa_user_to_port(dev);
+	struct dsa_switch *ds = dp->ds;
+	int ret = -EOPNOTSUPP;
+
+	phylink_ethtool_set_wol(dp->pl, w);
+
+	if (ds->ops->set_wol)
+		ret = ds->ops->set_wol(ds, dp->index, w);
+
+	return ret;
+}
+
+static int dsa_user_set_eee(struct net_device *dev, struct ethtool_eee *e)
+{
+	struct dsa_port *dp = dsa_user_to_port(dev);
+	struct dsa_switch *ds = dp->ds;
+	int ret;
+
+	/* Port's PHY and MAC both need to be EEE capable */
+	if (!dev->phydev || !dp->pl)
+		return -ENODEV;
+
+	if (!ds->ops->set_mac_eee)
+		return -EOPNOTSUPP;
+
+	ret = ds->ops->set_mac_eee(ds, dp->index, e);
+	if (ret)
+		return ret;
+
+	return phylink_ethtool_set_eee(dp->pl, e);
+}
+
+static int dsa_user_get_eee(struct net_device *dev, struct ethtool_eee *e)
+{
+	struct dsa_port *dp = dsa_user_to_port(dev);
+	struct dsa_switch *ds = dp->ds;
+	int ret;
+
+	/* Port's PHY and MAC both need to be EEE capable */
+	if (!dev->phydev || !dp->pl)
+		return -ENODEV;
+
+	if (!ds->ops->get_mac_eee)
+		return -EOPNOTSUPP;
+
+	ret = ds->ops->get_mac_eee(ds, dp->index, e);
+	if (ret)
+		return ret;
+
+	return phylink_ethtool_get_eee(dp->pl, e);
+}
+
+static int dsa_user_get_link_ksettings(struct net_device *dev,
+				       struct ethtool_link_ksettings *cmd)
+{
+	struct dsa_port *dp = dsa_user_to_port(dev);
+
+	return phylink_ethtool_ksettings_get(dp->pl, cmd);
+}
+
+static int dsa_user_set_link_ksettings(struct net_device *dev,
+				       const struct ethtool_link_ksettings *cmd)
+{
+	struct dsa_port *dp = dsa_user_to_port(dev);
+
+	return phylink_ethtool_ksettings_set(dp->pl, cmd);
+}
+
+static void dsa_user_get_pause_stats(struct net_device *dev,
+				     struct ethtool_pause_stats *pause_stats)
+{
+	struct dsa_port *dp = dsa_user_to_port(dev);
+	struct dsa_switch *ds = dp->ds;
+
+	if (ds->ops->get_pause_stats)
+		ds->ops->get_pause_stats(ds, dp->index, pause_stats);
+}
+
+static void dsa_user_get_pauseparam(struct net_device *dev,
+				    struct ethtool_pauseparam *pause)
+{
+	struct dsa_port *dp = dsa_user_to_port(dev);
+
+	phylink_ethtool_get_pauseparam(dp->pl, pause);
+}
+
+static int dsa_user_set_pauseparam(struct net_device *dev,
+				   struct ethtool_pauseparam *pause)
+{
+	struct dsa_port *dp = dsa_user_to_port(dev);
+
+	return phylink_ethtool_set_pauseparam(dp->pl, pause);
+}
+
+#ifdef CONFIG_NET_POLL_CONTROLLER
+static int dsa_user_netpoll_setup(struct net_device *dev,
+				  struct netpoll_info *ni)
+{
+	struct net_device *conduit = dsa_user_to_conduit(dev);
+	struct dsa_user_priv *p = netdev_priv(dev);
+	struct netpoll *netpoll;
+	int err = 0;
+
+	netpoll = kzalloc(sizeof(*netpoll), GFP_KERNEL);
+	if (!netpoll)
+		return -ENOMEM;
+
+	err = __netpoll_setup(netpoll, conduit);
+	if (err) {
+		kfree(netpoll);
+		goto out;
+	}
+
+	p->netpoll = netpoll;
+out:
+	return err;
+}
+
+static void dsa_user_netpoll_cleanup(struct net_device *dev)
+{
+	struct dsa_user_priv *p = netdev_priv(dev);
+	struct netpoll *netpoll = p->netpoll;
+
+	if (!netpoll)
+		return;
+
+	p->netpoll = NULL;
+
+	__netpoll_free(netpoll);
+}
+
+static void dsa_user_poll_controller(struct net_device *dev)
+{
+}
+#endif
+
+static struct dsa_mall_tc_entry *
+dsa_user_mall_tc_entry_find(struct net_device *dev, unsigned long cookie)
+{
+	struct dsa_user_priv *p = netdev_priv(dev);
+	struct dsa_mall_tc_entry *mall_tc_entry;
+
+	list_for_each_entry(mall_tc_entry, &p->mall_tc_list, list)
+		if (mall_tc_entry->cookie == cookie)
+			return mall_tc_entry;
+
+	return NULL;
+}
+
+static int
+dsa_user_add_cls_matchall_mirred(struct net_device *dev,
+				 struct tc_cls_matchall_offload *cls,
+				 bool ingress)
+{
+	struct netlink_ext_ack *extack = cls->common.extack;
+	struct dsa_port *dp = dsa_user_to_port(dev);
+	struct dsa_user_priv *p = netdev_priv(dev);
+	struct dsa_mall_mirror_tc_entry *mirror;
+	struct dsa_mall_tc_entry *mall_tc_entry;
+	struct dsa_switch *ds = dp->ds;
+	struct flow_action_entry *act;
+	struct dsa_port *to_dp;
+	int err;
+
+	if (!ds->ops->port_mirror_add)
+		return -EOPNOTSUPP;
+
+	if (!flow_action_basic_hw_stats_check(&cls->rule->action,
+					      cls->common.extack))
+		return -EOPNOTSUPP;
+
+	act = &cls->rule->action.entries[0];
+
+	if (!act->dev)
+		return -EINVAL;
+
+	if (!dsa_user_dev_check(act->dev))
+		return -EOPNOTSUPP;
+
+	mall_tc_entry = kzalloc(sizeof(*mall_tc_entry), GFP_KERNEL);
+	if (!mall_tc_entry)
+		return -ENOMEM;
+
+	mall_tc_entry->cookie = cls->cookie;
+	mall_tc_entry->type = DSA_PORT_MALL_MIRROR;
+	mirror = &mall_tc_entry->mirror;
+
+	to_dp = dsa_user_to_port(act->dev);
+
+	mirror->to_local_port = to_dp->index;
+	mirror->ingress = ingress;
+
+	err = ds->ops->port_mirror_add(ds, dp->index, mirror, ingress, extack);
+	if (err) {
+		kfree(mall_tc_entry);
+		return err;
+	}
+
+	list_add_tail(&mall_tc_entry->list, &p->mall_tc_list);
+
+	return err;
+}
+
+static int
+dsa_user_add_cls_matchall_police(struct net_device *dev,
+				 struct tc_cls_matchall_offload *cls,
+				 bool ingress)
+{
+	struct netlink_ext_ack *extack = cls->common.extack;
+	struct dsa_port *dp = dsa_user_to_port(dev);
+	struct dsa_user_priv *p = netdev_priv(dev);
+	struct dsa_mall_policer_tc_entry *policer;
+	struct dsa_mall_tc_entry *mall_tc_entry;
+	struct dsa_switch *ds = dp->ds;
+	struct flow_action_entry *act;
+	int err;
+
+	if (!ds->ops->port_policer_add) {
+		NL_SET_ERR_MSG_MOD(extack,
+				   "Policing offload not implemented");
+		return -EOPNOTSUPP;
+	}
+
+	if (!ingress) {
+		NL_SET_ERR_MSG_MOD(extack,
+				   "Only supported on ingress qdisc");
+		return -EOPNOTSUPP;
+	}
+
+	if (!flow_action_basic_hw_stats_check(&cls->rule->action,
+					      cls->common.extack))
+		return -EOPNOTSUPP;
+
+	list_for_each_entry(mall_tc_entry, &p->mall_tc_list, list) {
+		if (mall_tc_entry->type == DSA_PORT_MALL_POLICER) {
+			NL_SET_ERR_MSG_MOD(extack,
+					   "Only one port policer allowed");
+			return -EEXIST;
+		}
+	}
+
+	act = &cls->rule->action.entries[0];
+
+	mall_tc_entry = kzalloc(sizeof(*mall_tc_entry), GFP_KERNEL);
+	if (!mall_tc_entry)
+		return -ENOMEM;
+
+	mall_tc_entry->cookie = cls->cookie;
+	mall_tc_entry->type = DSA_PORT_MALL_POLICER;
+	policer = &mall_tc_entry->policer;
+	policer->rate_bytes_per_sec = act->police.rate_bytes_ps;
+	policer->burst = act->police.burst;
+
+	err = ds->ops->port_policer_add(ds, dp->index, policer);
+	if (err) {
+		kfree(mall_tc_entry);
+		return err;
+	}
+
+	list_add_tail(&mall_tc_entry->list, &p->mall_tc_list);
+
+	return err;
+}
+
+static int dsa_user_add_cls_matchall(struct net_device *dev,
+				     struct tc_cls_matchall_offload *cls,
+				     bool ingress)
+{
+	int err = -EOPNOTSUPP;
+
+	if (cls->common.protocol == htons(ETH_P_ALL) &&
+	    flow_offload_has_one_action(&cls->rule->action) &&
+	    cls->rule->action.entries[0].id == FLOW_ACTION_MIRRED)
+		err = dsa_user_add_cls_matchall_mirred(dev, cls, ingress);
+	else if (flow_offload_has_one_action(&cls->rule->action) &&
+		 cls->rule->action.entries[0].id == FLOW_ACTION_POLICE)
+		err = dsa_user_add_cls_matchall_police(dev, cls, ingress);
+
+	return err;
+}
+
+static void dsa_user_del_cls_matchall(struct net_device *dev,
+				      struct tc_cls_matchall_offload *cls)
+{
+	struct dsa_port *dp = dsa_user_to_port(dev);
+	struct dsa_mall_tc_entry *mall_tc_entry;
+	struct dsa_switch *ds = dp->ds;
+
+	mall_tc_entry = dsa_user_mall_tc_entry_find(dev, cls->cookie);
+	if (!mall_tc_entry)
+		return;
+
+	list_del(&mall_tc_entry->list);
+
+	switch (mall_tc_entry->type) {
+	case DSA_PORT_MALL_MIRROR:
+		if (ds->ops->port_mirror_del)
+			ds->ops->port_mirror_del(ds, dp->index,
+						 &mall_tc_entry->mirror);
+		break;
+	case DSA_PORT_MALL_POLICER:
+		if (ds->ops->port_policer_del)
+			ds->ops->port_policer_del(ds, dp->index);
+		break;
+	default:
+		WARN_ON(1);
+	}
+
+	kfree(mall_tc_entry);
+}
+
+static int dsa_user_setup_tc_cls_matchall(struct net_device *dev,
+					  struct tc_cls_matchall_offload *cls,
+					  bool ingress)
+{
+	if (cls->common.chain_index)
+		return -EOPNOTSUPP;
+
+	switch (cls->command) {
+	case TC_CLSMATCHALL_REPLACE:
+		return dsa_user_add_cls_matchall(dev, cls, ingress);
+	case TC_CLSMATCHALL_DESTROY:
+		dsa_user_del_cls_matchall(dev, cls);
+		return 0;
+	default:
+		return -EOPNOTSUPP;
+	}
+}
+
+static int dsa_user_add_cls_flower(struct net_device *dev,
+				   struct flow_cls_offload *cls,
+				   bool ingress)
+{
+	struct dsa_port *dp = dsa_user_to_port(dev);
+	struct dsa_switch *ds = dp->ds;
+	int port = dp->index;
+
+	if (!ds->ops->cls_flower_add)
+		return -EOPNOTSUPP;
+
+	return ds->ops->cls_flower_add(ds, port, cls, ingress);
+}
+
+static int dsa_user_del_cls_flower(struct net_device *dev,
+				   struct flow_cls_offload *cls,
+				   bool ingress)
+{
+	struct dsa_port *dp = dsa_user_to_port(dev);
+	struct dsa_switch *ds = dp->ds;
+	int port = dp->index;
+
+	if (!ds->ops->cls_flower_del)
+		return -EOPNOTSUPP;
+
+	return ds->ops->cls_flower_del(ds, port, cls, ingress);
+}
+
+static int dsa_user_stats_cls_flower(struct net_device *dev,
+				     struct flow_cls_offload *cls,
+				     bool ingress)
+{
+	struct dsa_port *dp = dsa_user_to_port(dev);
+	struct dsa_switch *ds = dp->ds;
+	int port = dp->index;
+
+	if (!ds->ops->cls_flower_stats)
+		return -EOPNOTSUPP;
+
+	return ds->ops->cls_flower_stats(ds, port, cls, ingress);
+}
+
+static int dsa_user_setup_tc_cls_flower(struct net_device *dev,
+					struct flow_cls_offload *cls,
+					bool ingress)
+{
+	switch (cls->command) {
+	case FLOW_CLS_REPLACE:
+		return dsa_user_add_cls_flower(dev, cls, ingress);
+	case FLOW_CLS_DESTROY:
+		return dsa_user_del_cls_flower(dev, cls, ingress);
+	case FLOW_CLS_STATS:
+		return dsa_user_stats_cls_flower(dev, cls, ingress);
+	default:
+		return -EOPNOTSUPP;
+	}
+}
+
+static int dsa_user_setup_tc_block_cb(enum tc_setup_type type, void *type_data,
+				      void *cb_priv, bool ingress)
+{
+	struct net_device *dev = cb_priv;
+
+	if (!tc_can_offload(dev))
+		return -EOPNOTSUPP;
+
+	switch (type) {
+	case TC_SETUP_CLSMATCHALL:
+		return dsa_user_setup_tc_cls_matchall(dev, type_data, ingress);
+	case TC_SETUP_CLSFLOWER:
+		return dsa_user_setup_tc_cls_flower(dev, type_data, ingress);
+	default:
+		return -EOPNOTSUPP;
+	}
+}
+
+static int dsa_user_setup_tc_block_cb_ig(enum tc_setup_type type,
+					 void *type_data, void *cb_priv)
+{
+	return dsa_user_setup_tc_block_cb(type, type_data, cb_priv, true);
+}
+
+static int dsa_user_setup_tc_block_cb_eg(enum tc_setup_type type,
+					 void *type_data, void *cb_priv)
+{
+	return dsa_user_setup_tc_block_cb(type, type_data, cb_priv, false);
+}
+
+static LIST_HEAD(dsa_user_block_cb_list);
+
+static int dsa_user_setup_tc_block(struct net_device *dev,
+				   struct flow_block_offload *f)
+{
+	struct flow_block_cb *block_cb;
+	flow_setup_cb_t *cb;
+
+	if (f->binder_type == FLOW_BLOCK_BINDER_TYPE_CLSACT_INGRESS)
+		cb = dsa_user_setup_tc_block_cb_ig;
+	else if (f->binder_type == FLOW_BLOCK_BINDER_TYPE_CLSACT_EGRESS)
+		cb = dsa_user_setup_tc_block_cb_eg;
+	else
+		return -EOPNOTSUPP;
+
+	f->driver_block_list = &dsa_user_block_cb_list;
+
+	switch (f->command) {
+	case FLOW_BLOCK_BIND:
+		if (flow_block_cb_is_busy(cb, dev, &dsa_user_block_cb_list))
+			return -EBUSY;
+
+		block_cb = flow_block_cb_alloc(cb, dev, dev, NULL);
+		if (IS_ERR(block_cb))
+			return PTR_ERR(block_cb);
+
+		flow_block_cb_add(block_cb, f);
+		list_add_tail(&block_cb->driver_list, &dsa_user_block_cb_list);
+		return 0;
+	case FLOW_BLOCK_UNBIND:
+		block_cb = flow_block_cb_lookup(f->block, cb, dev);
+		if (!block_cb)
+			return -ENOENT;
+
+		flow_block_cb_remove(block_cb, f);
+		list_del(&block_cb->driver_list);
+		return 0;
+	default:
+		return -EOPNOTSUPP;
+	}
+}
+
+static int dsa_user_setup_ft_block(struct dsa_switch *ds, int port,
+				   void *type_data)
+{
+	struct net_device *conduit = dsa_port_to_conduit(dsa_to_port(ds, port));
+
+	if (!conduit->netdev_ops->ndo_setup_tc)
+		return -EOPNOTSUPP;
+
+	return conduit->netdev_ops->ndo_setup_tc(conduit, TC_SETUP_FT, type_data);
+}
+
+static int dsa_user_setup_tc(struct net_device *dev, enum tc_setup_type type,
+			     void *type_data)
+{
+	struct dsa_port *dp = dsa_user_to_port(dev);
+	struct dsa_switch *ds = dp->ds;
+
+	switch (type) {
+	case TC_SETUP_BLOCK:
+		return dsa_user_setup_tc_block(dev, type_data);
+	case TC_SETUP_FT:
+		return dsa_user_setup_ft_block(ds, dp->index, type_data);
+	default:
+		break;
+	}
+
+	if (!ds->ops->port_setup_tc)
+		return -EOPNOTSUPP;
+
+	return ds->ops->port_setup_tc(ds, dp->index, type, type_data);
+}
+
+static int dsa_user_get_rxnfc(struct net_device *dev,
+			      struct ethtool_rxnfc *nfc, u32 *rule_locs)
+{
+	struct dsa_port *dp = dsa_user_to_port(dev);
+	struct dsa_switch *ds = dp->ds;
+
+	if (!ds->ops->get_rxnfc)
+		return -EOPNOTSUPP;
+
+	return ds->ops->get_rxnfc(ds, dp->index, nfc, rule_locs);
+}
+
+static int dsa_user_set_rxnfc(struct net_device *dev,
+			      struct ethtool_rxnfc *nfc)
+{
+	struct dsa_port *dp = dsa_user_to_port(dev);
+	struct dsa_switch *ds = dp->ds;
+
+	if (!ds->ops->set_rxnfc)
+		return -EOPNOTSUPP;
+
+	return ds->ops->set_rxnfc(ds, dp->index, nfc);
+}
+
+static int dsa_user_get_ts_info(struct net_device *dev,
+				struct ethtool_ts_info *ts)
+{
+	struct dsa_user_priv *p = netdev_priv(dev);
+	struct dsa_switch *ds = p->dp->ds;
+
+	if (!ds->ops->get_ts_info)
+		return -EOPNOTSUPP;
+
+	return ds->ops->get_ts_info(ds, p->dp->index, ts);
+}
+
+static int dsa_user_vlan_rx_add_vid(struct net_device *dev, __be16 proto,
+				    u16 vid)
+{
+	struct dsa_port *dp = dsa_user_to_port(dev);
+	struct switchdev_obj_port_vlan vlan = {
+		.obj.id = SWITCHDEV_OBJ_ID_PORT_VLAN,
+		.vid = vid,
+		/* This API only allows programming tagged, non-PVID VIDs */
+		.flags = 0,
+	};
+	struct netlink_ext_ack extack = {0};
+	struct dsa_switch *ds = dp->ds;
+	struct netdev_hw_addr *ha;
+	struct dsa_vlan *v;
+	int ret;
+
+	/* User port... */
+	ret = dsa_port_vlan_add(dp, &vlan, &extack);
+	if (ret) {
+		if (extack._msg)
+			netdev_err(dev, "%s\n", extack._msg);
+		return ret;
+	}
+
+	/* And CPU port... */
+	ret = dsa_port_host_vlan_add(dp, &vlan, &extack);
+	if (ret) {
+		if (extack._msg)
+			netdev_err(dev, "CPU port %d: %s\n", dp->cpu_dp->index,
+				   extack._msg);
+		return ret;
+	}
+
+	if (!dsa_switch_supports_uc_filtering(ds) &&
+	    !dsa_switch_supports_mc_filtering(ds))
+		return 0;
+
+	v = kzalloc(sizeof(*v), GFP_KERNEL);
+	if (!v) {
+		ret = -ENOMEM;
+		goto rollback;
+	}
+
+	netif_addr_lock_bh(dev);
+
+	v->vid = vid;
+	list_add_tail(&v->list, &dp->user_vlans);
+
+	if (dsa_switch_supports_mc_filtering(ds)) {
+		netdev_for_each_synced_mc_addr(ha, dev) {
+			dsa_user_schedule_standalone_work(dev, DSA_MC_ADD,
+							  ha->addr, vid);
+		}
+	}
+
+	if (dsa_switch_supports_uc_filtering(ds)) {
+		netdev_for_each_synced_uc_addr(ha, dev) {
+			dsa_user_schedule_standalone_work(dev, DSA_UC_ADD,
+							  ha->addr, vid);
+		}
+	}
+
+	netif_addr_unlock_bh(dev);
+
+	dsa_flush_workqueue();
+
+	return 0;
+
+rollback:
+	dsa_port_host_vlan_del(dp, &vlan);
+	dsa_port_vlan_del(dp, &vlan);
+
+	return ret;
+}
+
+static int dsa_user_vlan_rx_kill_vid(struct net_device *dev, __be16 proto,
+				     u16 vid)
+{
+	struct dsa_port *dp = dsa_user_to_port(dev);
+	struct switchdev_obj_port_vlan vlan = {
+		.vid = vid,
+		/* This API only allows programming tagged, non-PVID VIDs */
+		.flags = 0,
+	};
+	struct dsa_switch *ds = dp->ds;
+	struct netdev_hw_addr *ha;
+	struct dsa_vlan *v;
+	int err;
+
+	err = dsa_port_vlan_del(dp, &vlan);
+	if (err)
+		return err;
+
+	err = dsa_port_host_vlan_del(dp, &vlan);
+	if (err)
+		return err;
+
+	if (!dsa_switch_supports_uc_filtering(ds) &&
+	    !dsa_switch_supports_mc_filtering(ds))
+		return 0;
+
+	netif_addr_lock_bh(dev);
+
+	v = dsa_vlan_find(&dp->user_vlans, &vlan);
+	if (!v) {
+		netif_addr_unlock_bh(dev);
+		return -ENOENT;
+	}
+
+	list_del(&v->list);
+	kfree(v);
+
+	if (dsa_switch_supports_mc_filtering(ds)) {
+		netdev_for_each_synced_mc_addr(ha, dev) {
+			dsa_user_schedule_standalone_work(dev, DSA_MC_DEL,
+							  ha->addr, vid);
+		}
+	}
+
+	if (dsa_switch_supports_uc_filtering(ds)) {
+		netdev_for_each_synced_uc_addr(ha, dev) {
+			dsa_user_schedule_standalone_work(dev, DSA_UC_DEL,
+							  ha->addr, vid);
+		}
+	}
+
+	netif_addr_unlock_bh(dev);
+
+	dsa_flush_workqueue();
+
+	return 0;
+}
+
+static int dsa_user_restore_vlan(struct net_device *vdev, int vid, void *arg)
+{
+	__be16 proto = vdev ? vlan_dev_vlan_proto(vdev) : htons(ETH_P_8021Q);
+
+	return dsa_user_vlan_rx_add_vid(arg, proto, vid);
+}
+
+static int dsa_user_clear_vlan(struct net_device *vdev, int vid, void *arg)
+{
+	__be16 proto = vdev ? vlan_dev_vlan_proto(vdev) : htons(ETH_P_8021Q);
+
+	return dsa_user_vlan_rx_kill_vid(arg, proto, vid);
+}
+
+/* Keep the VLAN RX filtering list in sync with the hardware only if VLAN
+ * filtering is enabled. The baseline is that only ports that offload a
+ * VLAN-aware bridge are VLAN-aware, and standalone ports are VLAN-unaware,
+ * but there are exceptions for quirky hardware.
+ *
+ * If ds->vlan_filtering_is_global = true, then standalone ports which share
+ * the same switch with other ports that offload a VLAN-aware bridge are also
+ * inevitably VLAN-aware.
+ *
+ * To summarize, a DSA switch port offloads:
+ *
+ * - If standalone (this includes software bridge, software LAG):
+ *     - if ds->needs_standalone_vlan_filtering = true, OR if
+ *       (ds->vlan_filtering_is_global = true AND there are bridges spanning
+ *       this switch chip which have vlan_filtering=1)
+ *         - the 8021q upper VLANs
+ *     - else (standalone VLAN filtering is not needed, VLAN filtering is not
+ *       global, or it is, but no port is under a VLAN-aware bridge):
+ *         - no VLAN (any 8021q upper is a software VLAN)
+ *
+ * - If under a vlan_filtering=0 bridge which it offload:
+ *     - if ds->configure_vlan_while_not_filtering = true (default):
+ *         - the bridge VLANs. These VLANs are committed to hardware but inactive.
+ *     - else (deprecated):
+ *         - no VLAN. The bridge VLANs are not restored when VLAN awareness is
+ *           enabled, so this behavior is broken and discouraged.
+ *
+ * - If under a vlan_filtering=1 bridge which it offload:
+ *     - the bridge VLANs
+ *     - the 8021q upper VLANs
+ */
+int dsa_user_manage_vlan_filtering(struct net_device *user,
+				   bool vlan_filtering)
+{
+	int err;
+
+	if (vlan_filtering) {
+		user->features |= NETIF_F_HW_VLAN_CTAG_FILTER;
+
+		err = vlan_for_each(user, dsa_user_restore_vlan, user);
+		if (err) {
+			vlan_for_each(user, dsa_user_clear_vlan, user);
+			user->features &= ~NETIF_F_HW_VLAN_CTAG_FILTER;
+			return err;
+		}
+	} else {
+		err = vlan_for_each(user, dsa_user_clear_vlan, user);
+		if (err)
+			return err;
+
+		user->features &= ~NETIF_F_HW_VLAN_CTAG_FILTER;
+	}
+
+	return 0;
+}
+
+struct dsa_hw_port {
+	struct list_head list;
+	struct net_device *dev;
+	int old_mtu;
+};
+
+static int dsa_hw_port_list_set_mtu(struct list_head *hw_port_list, int mtu)
+{
+	const struct dsa_hw_port *p;
+	int err;
+
+	list_for_each_entry(p, hw_port_list, list) {
+		if (p->dev->mtu == mtu)
+			continue;
+
+		err = dev_set_mtu(p->dev, mtu);
+		if (err)
+			goto rollback;
+	}
+
+	return 0;
+
+rollback:
+	list_for_each_entry_continue_reverse(p, hw_port_list, list) {
+		if (p->dev->mtu == p->old_mtu)
+			continue;
+
+		if (dev_set_mtu(p->dev, p->old_mtu))
+			netdev_err(p->dev, "Failed to restore MTU\n");
+	}
+
+	return err;
+}
+
+static void dsa_hw_port_list_free(struct list_head *hw_port_list)
+{
+	struct dsa_hw_port *p, *n;
+
+	list_for_each_entry_safe(p, n, hw_port_list, list)
+		kfree(p);
+}
+
+/* Make the hardware datapath to/from @dev limited to a common MTU */
+static void dsa_bridge_mtu_normalization(struct dsa_port *dp)
+{
+	struct list_head hw_port_list;
+	struct dsa_switch_tree *dst;
+	int min_mtu = ETH_MAX_MTU;
+	struct dsa_port *other_dp;
+	int err;
+
+	if (!dp->ds->mtu_enforcement_ingress)
+		return;
+
+	if (!dp->bridge)
+		return;
+
+	INIT_LIST_HEAD(&hw_port_list);
+
+	/* Populate the list of ports that are part of the same bridge
+	 * as the newly added/modified port
+	 */
+	list_for_each_entry(dst, &dsa_tree_list, list) {
+		list_for_each_entry(other_dp, &dst->ports, list) {
+			struct dsa_hw_port *hw_port;
+			struct net_device *user;
+
+			if (other_dp->type != DSA_PORT_TYPE_USER)
+				continue;
+
+			if (!dsa_port_bridge_same(dp, other_dp))
+				continue;
+
+			if (!other_dp->ds->mtu_enforcement_ingress)
+				continue;
+
+			user = other_dp->user;
+
+			if (min_mtu > user->mtu)
+				min_mtu = user->mtu;
+
+			hw_port = kzalloc(sizeof(*hw_port), GFP_KERNEL);
+			if (!hw_port)
+				goto out;
+
+			hw_port->dev = user;
+			hw_port->old_mtu = user->mtu;
+
+			list_add(&hw_port->list, &hw_port_list);
+		}
+	}
+
+	/* Attempt to configure the entire hardware bridge to the newly added
+	 * interface's MTU first, regardless of whether the intention of the
+	 * user was to raise or lower it.
+	 */
+	err = dsa_hw_port_list_set_mtu(&hw_port_list, dp->user->mtu);
+	if (!err)
+		goto out;
+
+	/* Clearly that didn't work out so well, so just set the minimum MTU on
+	 * all hardware bridge ports now. If this fails too, then all ports will
+	 * still have their old MTU rolled back anyway.
+	 */
+	dsa_hw_port_list_set_mtu(&hw_port_list, min_mtu);
+
+out:
+	dsa_hw_port_list_free(&hw_port_list);
+}
+
+int dsa_user_change_mtu(struct net_device *dev, int new_mtu)
+{
+	struct net_device *conduit = dsa_user_to_conduit(dev);
+	struct dsa_port *dp = dsa_user_to_port(dev);
+	struct dsa_port *cpu_dp = dp->cpu_dp;
+	struct dsa_switch *ds = dp->ds;
+	struct dsa_port *other_dp;
+	int largest_mtu = 0;
+	int new_conduit_mtu;
+	int old_conduit_mtu;
+	int mtu_limit;
+	int overhead;
+	int cpu_mtu;
+	int err;
+
+	if (!ds->ops->port_change_mtu)
+		return -EOPNOTSUPP;
+
+	dsa_tree_for_each_user_port(other_dp, ds->dst) {
+		int user_mtu;
+
+		/* During probe, this function will be called for each user
+		 * device, while not all of them have been allocated. That's
+		 * ok, it doesn't change what the maximum is, so ignore it.
+		 */
+		if (!other_dp->user)
+			continue;
+
+		/* Pretend that we already applied the setting, which we
+		 * actually haven't (still haven't done all integrity checks)
+		 */
+		if (dp == other_dp)
+			user_mtu = new_mtu;
+		else
+			user_mtu = other_dp->user->mtu;
+
+		if (largest_mtu < user_mtu)
+			largest_mtu = user_mtu;
+	}
+
+	overhead = dsa_tag_protocol_overhead(cpu_dp->tag_ops);
+	mtu_limit = min_t(int, conduit->max_mtu, dev->max_mtu + overhead);
+	old_conduit_mtu = conduit->mtu;
+	new_conduit_mtu = largest_mtu + overhead;
+	if (new_conduit_mtu > mtu_limit)
+		return -ERANGE;
+
+	/* If the conduit MTU isn't over limit, there's no need to check the CPU
+	 * MTU, since that surely isn't either.
+	 */
+	cpu_mtu = largest_mtu;
+
+	/* Start applying stuff */
+	if (new_conduit_mtu != old_conduit_mtu) {
+		err = dev_set_mtu(conduit, new_conduit_mtu);
+		if (err < 0)
+			goto out_conduit_failed;
+
+		/* We only need to propagate the MTU of the CPU port to
+		 * upstream switches, so emit a notifier which updates them.
+		 */
+		err = dsa_port_mtu_change(cpu_dp, cpu_mtu);
+		if (err)
+			goto out_cpu_failed;
+	}
+
+	err = ds->ops->port_change_mtu(ds, dp->index, new_mtu);
+	if (err)
+		goto out_port_failed;
+
+	dev->mtu = new_mtu;
+
+	dsa_bridge_mtu_normalization(dp);
+
+	return 0;
+
+out_port_failed:
+	if (new_conduit_mtu != old_conduit_mtu)
+		dsa_port_mtu_change(cpu_dp, old_conduit_mtu - overhead);
+out_cpu_failed:
+	if (new_conduit_mtu != old_conduit_mtu)
+		dev_set_mtu(conduit, old_conduit_mtu);
+out_conduit_failed:
+	return err;
+}
+
+static int __maybe_unused
+dsa_user_dcbnl_set_default_prio(struct net_device *dev, struct dcb_app *app)
+{
+	struct dsa_port *dp = dsa_user_to_port(dev);
+	struct dsa_switch *ds = dp->ds;
+	unsigned long mask, new_prio;
+	int err, port = dp->index;
+
+	if (!ds->ops->port_set_default_prio)
+		return -EOPNOTSUPP;
+
+	err = dcb_ieee_setapp(dev, app);
+	if (err)
+		return err;
+
+	mask = dcb_ieee_getapp_mask(dev, app);
+	new_prio = __fls(mask);
+
+	err = ds->ops->port_set_default_prio(ds, port, new_prio);
+	if (err) {
+		dcb_ieee_delapp(dev, app);
+		return err;
+	}
+
+	return 0;
+}
+
+static int __maybe_unused
+dsa_user_dcbnl_add_dscp_prio(struct net_device *dev, struct dcb_app *app)
+{
+	struct dsa_port *dp = dsa_user_to_port(dev);
+	struct dsa_switch *ds = dp->ds;
+	unsigned long mask, new_prio;
+	int err, port = dp->index;
+	u8 dscp = app->protocol;
+
+	if (!ds->ops->port_add_dscp_prio)
+		return -EOPNOTSUPP;
+
+	if (dscp >= 64) {
+		netdev_err(dev, "DSCP APP entry with protocol value %u is invalid\n",
+			   dscp);
+		return -EINVAL;
+	}
+
+	err = dcb_ieee_setapp(dev, app);
+	if (err)
+		return err;
+
+	mask = dcb_ieee_getapp_mask(dev, app);
+	new_prio = __fls(mask);
+
+	err = ds->ops->port_add_dscp_prio(ds, port, dscp, new_prio);
+	if (err) {
+		dcb_ieee_delapp(dev, app);
+		return err;
+	}
+
+	return 0;
+}
+
+static int __maybe_unused dsa_user_dcbnl_ieee_setapp(struct net_device *dev,
+						     struct dcb_app *app)
+{
+	switch (app->selector) {
+	case IEEE_8021QAZ_APP_SEL_ETHERTYPE:
+		switch (app->protocol) {
+		case 0:
+			return dsa_user_dcbnl_set_default_prio(dev, app);
+		default:
+			return -EOPNOTSUPP;
+		}
+		break;
+	case IEEE_8021QAZ_APP_SEL_DSCP:
+		return dsa_user_dcbnl_add_dscp_prio(dev, app);
+	default:
+		return -EOPNOTSUPP;
+	}
+}
+
+static int __maybe_unused
+dsa_user_dcbnl_del_default_prio(struct net_device *dev, struct dcb_app *app)
+{
+	struct dsa_port *dp = dsa_user_to_port(dev);
+	struct dsa_switch *ds = dp->ds;
+	unsigned long mask, new_prio;
+	int err, port = dp->index;
+
+	if (!ds->ops->port_set_default_prio)
+		return -EOPNOTSUPP;
+
+	err = dcb_ieee_delapp(dev, app);
+	if (err)
+		return err;
+
+	mask = dcb_ieee_getapp_mask(dev, app);
+	new_prio = mask ? __fls(mask) : 0;
+
+	err = ds->ops->port_set_default_prio(ds, port, new_prio);
+	if (err) {
+		dcb_ieee_setapp(dev, app);
+		return err;
+	}
+
+	return 0;
+}
+
+static int __maybe_unused
+dsa_user_dcbnl_del_dscp_prio(struct net_device *dev, struct dcb_app *app)
+{
+	struct dsa_port *dp = dsa_user_to_port(dev);
+	struct dsa_switch *ds = dp->ds;
+	int err, port = dp->index;
+	u8 dscp = app->protocol;
+
+	if (!ds->ops->port_del_dscp_prio)
+		return -EOPNOTSUPP;
+
+	err = dcb_ieee_delapp(dev, app);
+	if (err)
+		return err;
+
+	err = ds->ops->port_del_dscp_prio(ds, port, dscp, app->priority);
+	if (err) {
+		dcb_ieee_setapp(dev, app);
+		return err;
+	}
+
+	return 0;
+}
+
+static int __maybe_unused dsa_user_dcbnl_ieee_delapp(struct net_device *dev,
+						     struct dcb_app *app)
+{
+	switch (app->selector) {
+	case IEEE_8021QAZ_APP_SEL_ETHERTYPE:
+		switch (app->protocol) {
+		case 0:
+			return dsa_user_dcbnl_del_default_prio(dev, app);
+		default:
+			return -EOPNOTSUPP;
+		}
+		break;
+	case IEEE_8021QAZ_APP_SEL_DSCP:
+		return dsa_user_dcbnl_del_dscp_prio(dev, app);
+	default:
+		return -EOPNOTSUPP;
+	}
+}
+
+/* Pre-populate the DCB application priority table with the priorities
+ * configured during switch setup, which we read from hardware here.
+ */
+static int dsa_user_dcbnl_init(struct net_device *dev)
+{
+	struct dsa_port *dp = dsa_user_to_port(dev);
+	struct dsa_switch *ds = dp->ds;
+	int port = dp->index;
+	int err;
+
+	if (ds->ops->port_get_default_prio) {
+		int prio = ds->ops->port_get_default_prio(ds, port);
+		struct dcb_app app = {
+			.selector = IEEE_8021QAZ_APP_SEL_ETHERTYPE,
+			.protocol = 0,
+			.priority = prio,
+		};
+
+		if (prio < 0)
+			return prio;
+
+		err = dcb_ieee_setapp(dev, &app);
+		if (err)
+			return err;
+	}
+
+	if (ds->ops->port_get_dscp_prio) {
+		int protocol;
+
+		for (protocol = 0; protocol < 64; protocol++) {
+			struct dcb_app app = {
+				.selector = IEEE_8021QAZ_APP_SEL_DSCP,
+				.protocol = protocol,
+			};
+			int prio;
+
+			prio = ds->ops->port_get_dscp_prio(ds, port, protocol);
+			if (prio == -EOPNOTSUPP)
+				continue;
+			if (prio < 0)
+				return prio;
+
+			app.priority = prio;
+
+			err = dcb_ieee_setapp(dev, &app);
+			if (err)
+				return err;
+		}
+	}
+
+	return 0;
+}
+
+static const struct ethtool_ops dsa_user_ethtool_ops = {
+	.get_drvinfo		= dsa_user_get_drvinfo,
+	.get_regs_len		= dsa_user_get_regs_len,
+	.get_regs		= dsa_user_get_regs,
+	.nway_reset		= dsa_user_nway_reset,
+	.get_link		= ethtool_op_get_link,
+	.get_eeprom_len		= dsa_user_get_eeprom_len,
+	.get_eeprom		= dsa_user_get_eeprom,
+	.set_eeprom		= dsa_user_set_eeprom,
+	.get_strings		= dsa_user_get_strings,
+	.get_ethtool_stats	= dsa_user_get_ethtool_stats,
+	.get_sset_count		= dsa_user_get_sset_count,
+	.get_eth_phy_stats	= dsa_user_get_eth_phy_stats,
+	.get_eth_mac_stats	= dsa_user_get_eth_mac_stats,
+	.get_eth_ctrl_stats	= dsa_user_get_eth_ctrl_stats,
+	.get_rmon_stats		= dsa_user_get_rmon_stats,
+	.set_wol		= dsa_user_set_wol,
+	.get_wol		= dsa_user_get_wol,
+	.set_eee		= dsa_user_set_eee,
+	.get_eee		= dsa_user_get_eee,
+	.get_link_ksettings	= dsa_user_get_link_ksettings,
+	.set_link_ksettings	= dsa_user_set_link_ksettings,
+	.get_pause_stats	= dsa_user_get_pause_stats,
+	.get_pauseparam		= dsa_user_get_pauseparam,
+	.set_pauseparam		= dsa_user_set_pauseparam,
+	.get_rxnfc		= dsa_user_get_rxnfc,
+	.set_rxnfc		= dsa_user_set_rxnfc,
+	.get_ts_info		= dsa_user_get_ts_info,
+	.self_test		= dsa_user_net_selftest,
+	.get_mm			= dsa_user_get_mm,
+	.set_mm			= dsa_user_set_mm,
+	.get_mm_stats		= dsa_user_get_mm_stats,
+};
+
+static const struct dcbnl_rtnl_ops __maybe_unused dsa_user_dcbnl_ops = {
+	.ieee_setapp		= dsa_user_dcbnl_ieee_setapp,
+	.ieee_delapp		= dsa_user_dcbnl_ieee_delapp,
+};
+
+static void dsa_user_get_stats64(struct net_device *dev,
+				 struct rtnl_link_stats64 *s)
+{
+	struct dsa_port *dp = dsa_user_to_port(dev);
+	struct dsa_switch *ds = dp->ds;
+
+	if (ds->ops->get_stats64)
+		ds->ops->get_stats64(ds, dp->index, s);
+	else
+		dev_get_tstats64(dev, s);
+}
+
+static int dsa_user_fill_forward_path(struct net_device_path_ctx *ctx,
+				      struct net_device_path *path)
+{
+	struct dsa_port *dp = dsa_user_to_port(ctx->dev);
+	struct net_device *conduit = dsa_port_to_conduit(dp);
+	struct dsa_port *cpu_dp = dp->cpu_dp;
+
+	path->dev = ctx->dev;
+	path->type = DEV_PATH_DSA;
+	path->dsa.proto = cpu_dp->tag_ops->proto;
+	path->dsa.port = dp->index;
+	ctx->dev = conduit;
+
+	return 0;
+}
+
+static const struct net_device_ops dsa_user_netdev_ops = {
+	.ndo_open		= dsa_user_open,
+	.ndo_stop		= dsa_user_close,
+	.ndo_start_xmit		= dsa_user_xmit,
+	.ndo_change_rx_flags	= dsa_user_change_rx_flags,
+	.ndo_set_rx_mode	= dsa_user_set_rx_mode,
+	.ndo_set_mac_address	= dsa_user_set_mac_address,
+	.ndo_fdb_dump		= dsa_user_fdb_dump,
+	.ndo_eth_ioctl		= dsa_user_ioctl,
+	.ndo_get_iflink		= dsa_user_get_iflink,
+#ifdef CONFIG_NET_POLL_CONTROLLER
+	.ndo_netpoll_setup	= dsa_user_netpoll_setup,
+	.ndo_netpoll_cleanup	= dsa_user_netpoll_cleanup,
+	.ndo_poll_controller	= dsa_user_poll_controller,
+#endif
+	.ndo_setup_tc		= dsa_user_setup_tc,
+	.ndo_get_stats64	= dsa_user_get_stats64,
+	.ndo_vlan_rx_add_vid	= dsa_user_vlan_rx_add_vid,
+	.ndo_vlan_rx_kill_vid	= dsa_user_vlan_rx_kill_vid,
+	.ndo_change_mtu		= dsa_user_change_mtu,
+	.ndo_fill_forward_path	= dsa_user_fill_forward_path,
+};
+
+static struct device_type dsa_type = {
+	.name	= "dsa",
+};
+
+void dsa_port_phylink_mac_change(struct dsa_switch *ds, int port, bool up)
+{
+	const struct dsa_port *dp = dsa_to_port(ds, port);
+
+	if (dp->pl)
+		phylink_mac_change(dp->pl, up);
+}
+EXPORT_SYMBOL_GPL(dsa_port_phylink_mac_change);
+
+static void dsa_user_phylink_fixed_state(struct phylink_config *config,
+					 struct phylink_link_state *state)
+{
+	struct dsa_port *dp = container_of(config, struct dsa_port, pl_config);
+	struct dsa_switch *ds = dp->ds;
+
+	/* No need to check that this operation is valid, the callback would
+	 * not be called if it was not.
+	 */
+	ds->ops->phylink_fixed_state(ds, dp->index, state);
+}
+
+/* user device setup *******************************************************/
+static int dsa_user_phy_connect(struct net_device *user_dev, int addr,
+				u32 flags)
+{
+	struct dsa_port *dp = dsa_user_to_port(user_dev);
+	struct dsa_switch *ds = dp->ds;
+
+	user_dev->phydev = mdiobus_get_phy(ds->user_mii_bus, addr);
+	if (!user_dev->phydev) {
+		netdev_err(user_dev, "no phy at %d\n", addr);
+		return -ENODEV;
+	}
+
+	user_dev->phydev->dev_flags |= flags;
+
+	return phylink_connect_phy(dp->pl, user_dev->phydev);
+}
+
+static int dsa_user_phy_setup(struct net_device *user_dev)
+{
+	struct dsa_port *dp = dsa_user_to_port(user_dev);
+	struct device_node *port_dn = dp->dn;
+	struct dsa_switch *ds = dp->ds;
+	u32 phy_flags = 0;
+	int ret;
+
+	dp->pl_config.dev = &user_dev->dev;
+	dp->pl_config.type = PHYLINK_NETDEV;
+
+	/* The get_fixed_state callback takes precedence over polling the
+	 * link GPIO in PHYLINK (see phylink_get_fixed_state).  Only set
+	 * this if the switch provides such a callback.
+	 */
+	if (ds->ops->phylink_fixed_state) {
+		dp->pl_config.get_fixed_state = dsa_user_phylink_fixed_state;
+		dp->pl_config.poll_fixed_state = true;
+	}
+
+	ret = dsa_port_phylink_create(dp);
+	if (ret)
+		return ret;
+
+	if (ds->ops->get_phy_flags)
+		phy_flags = ds->ops->get_phy_flags(ds, dp->index);
+
+	ret = phylink_of_phy_connect(dp->pl, port_dn, phy_flags);
+	if (ret == -ENODEV && ds->user_mii_bus) {
+		/* We could not connect to a designated PHY or SFP, so try to
+		 * use the switch internal MDIO bus instead
+		 */
+		ret = dsa_user_phy_connect(user_dev, dp->index, phy_flags);
+	}
+	if (ret) {
+		netdev_err(user_dev, "failed to connect to PHY: %pe\n",
+			   ERR_PTR(ret));
+		dsa_port_phylink_destroy(dp);
+	}
+
+	return ret;
+}
+
+void dsa_user_setup_tagger(struct net_device *user)
+{
+	struct dsa_port *dp = dsa_user_to_port(user);
+	struct net_device *conduit = dsa_port_to_conduit(dp);
+	struct dsa_user_priv *p = netdev_priv(user);
+	const struct dsa_port *cpu_dp = dp->cpu_dp;
+	const struct dsa_switch *ds = dp->ds;
+
+	user->needed_headroom = cpu_dp->tag_ops->needed_headroom;
+	user->needed_tailroom = cpu_dp->tag_ops->needed_tailroom;
+	/* Try to save one extra realloc later in the TX path (in the conduit)
+	 * by also inheriting the conduit's needed headroom and tailroom.
+	 * The 8021q driver also does this.
+	 */
+	user->needed_headroom += conduit->needed_headroom;
+	user->needed_tailroom += conduit->needed_tailroom;
+
+	p->xmit = cpu_dp->tag_ops->xmit;
+
+	user->features = conduit->vlan_features | NETIF_F_HW_TC;
+	user->hw_features |= NETIF_F_HW_TC;
+	user->features |= NETIF_F_LLTX;
+	if (user->needed_tailroom)
+		user->features &= ~(NETIF_F_SG | NETIF_F_FRAGLIST);
+	if (ds->needs_standalone_vlan_filtering)
+		user->features |= NETIF_F_HW_VLAN_CTAG_FILTER;
+}
+
+int dsa_user_suspend(struct net_device *user_dev)
+{
+	struct dsa_port *dp = dsa_user_to_port(user_dev);
+
+	if (!netif_running(user_dev))
+		return 0;
+
+	netif_device_detach(user_dev);
+
+	rtnl_lock();
+	phylink_stop(dp->pl);
+	rtnl_unlock();
+
+	return 0;
+}
+
+int dsa_user_resume(struct net_device *user_dev)
+{
+	struct dsa_port *dp = dsa_user_to_port(user_dev);
+
+	if (!netif_running(user_dev))
+		return 0;
+
+	netif_device_attach(user_dev);
+
+	rtnl_lock();
+	phylink_start(dp->pl);
+	rtnl_unlock();
+
+	return 0;
+}
+
+int dsa_user_create(struct dsa_port *port)
+{
+	struct net_device *conduit = dsa_port_to_conduit(port);
+	struct dsa_switch *ds = port->ds;
+	struct net_device *user_dev;
+	struct dsa_user_priv *p;
+	const char *name;
+	int assign_type;
+	int ret;
+
+	if (!ds->num_tx_queues)
+		ds->num_tx_queues = 1;
+
+	if (port->name) {
+		name = port->name;
+		assign_type = NET_NAME_PREDICTABLE;
+	} else {
+		name = "eth%d";
+		assign_type = NET_NAME_ENUM;
+	}
+
+	user_dev = alloc_netdev_mqs(sizeof(struct dsa_user_priv), name,
+				    assign_type, ether_setup,
+				    ds->num_tx_queues, 1);
+	if (user_dev == NULL)
+		return -ENOMEM;
+
+	user_dev->rtnl_link_ops = &dsa_link_ops;
+	user_dev->ethtool_ops = &dsa_user_ethtool_ops;
+#if IS_ENABLED(CONFIG_DCB)
+	user_dev->dcbnl_ops = &dsa_user_dcbnl_ops;
+#endif
+	if (!is_zero_ether_addr(port->mac))
+		eth_hw_addr_set(user_dev, port->mac);
+	else
+		eth_hw_addr_inherit(user_dev, conduit);
+	user_dev->priv_flags |= IFF_NO_QUEUE;
+	if (dsa_switch_supports_uc_filtering(ds))
+		user_dev->priv_flags |= IFF_UNICAST_FLT;
+	user_dev->netdev_ops = &dsa_user_netdev_ops;
+	if (ds->ops->port_max_mtu)
+		user_dev->max_mtu = ds->ops->port_max_mtu(ds, port->index);
+	SET_NETDEV_DEVTYPE(user_dev, &dsa_type);
+
+	SET_NETDEV_DEV(user_dev, port->ds->dev);
+	SET_NETDEV_DEVLINK_PORT(user_dev, &port->devlink_port);
+	user_dev->dev.of_node = port->dn;
+	user_dev->vlan_features = conduit->vlan_features;
+
+	p = netdev_priv(user_dev);
+	user_dev->tstats = netdev_alloc_pcpu_stats(struct pcpu_sw_netstats);
+	if (!user_dev->tstats) {
+		free_netdev(user_dev);
+		return -ENOMEM;
+	}
+
+	ret = gro_cells_init(&p->gcells, user_dev);
+	if (ret)
+		goto out_free;
+
+	p->dp = port;
+	INIT_LIST_HEAD(&p->mall_tc_list);
+	port->user = user_dev;
+	dsa_user_setup_tagger(user_dev);
+
+	netif_carrier_off(user_dev);
+
+	ret = dsa_user_phy_setup(user_dev);
+	if (ret) {
+		netdev_err(user_dev,
+			   "error %d setting up PHY for tree %d, switch %d, port %d\n",
+			   ret, ds->dst->index, ds->index, port->index);
+		goto out_gcells;
+	}
+
+	rtnl_lock();
+
+	ret = dsa_user_change_mtu(user_dev, ETH_DATA_LEN);
+	if (ret && ret != -EOPNOTSUPP)
+		dev_warn(ds->dev, "nonfatal error %d setting MTU to %d on port %d\n",
+			 ret, ETH_DATA_LEN, port->index);
+
+	ret = register_netdevice(user_dev);
+	if (ret) {
+		netdev_err(conduit, "error %d registering interface %s\n",
+			   ret, user_dev->name);
+		rtnl_unlock();
+		goto out_phy;
+	}
+
+	if (IS_ENABLED(CONFIG_DCB)) {
+		ret = dsa_user_dcbnl_init(user_dev);
+		if (ret) {
+			netdev_err(user_dev,
+				   "failed to initialize DCB: %pe\n",
+				   ERR_PTR(ret));
+			rtnl_unlock();
+			goto out_unregister;
+		}
+	}
+
+	ret = netdev_upper_dev_link(conduit, user_dev, NULL);
+
+	rtnl_unlock();
+
+	if (ret)
+		goto out_unregister;
+
+	return 0;
+
+out_unregister:
+	unregister_netdev(user_dev);
+out_phy:
+	rtnl_lock();
+	phylink_disconnect_phy(p->dp->pl);
+	rtnl_unlock();
+	dsa_port_phylink_destroy(p->dp);
+out_gcells:
+	gro_cells_destroy(&p->gcells);
+out_free:
+	free_percpu(user_dev->tstats);
+	free_netdev(user_dev);
+	port->user = NULL;
+	return ret;
+}
+
+void dsa_user_destroy(struct net_device *user_dev)
+{
+	struct net_device *conduit = dsa_user_to_conduit(user_dev);
+	struct dsa_port *dp = dsa_user_to_port(user_dev);
+	struct dsa_user_priv *p = netdev_priv(user_dev);
+
+	netif_carrier_off(user_dev);
+	rtnl_lock();
+	netdev_upper_dev_unlink(conduit, user_dev);
+	unregister_netdevice(user_dev);
+	phylink_disconnect_phy(dp->pl);
+	rtnl_unlock();
+
+	dsa_port_phylink_destroy(dp);
+	gro_cells_destroy(&p->gcells);
+	free_percpu(user_dev->tstats);
+	free_netdev(user_dev);
+}
+
+int dsa_user_change_conduit(struct net_device *dev, struct net_device *conduit,
+			    struct netlink_ext_ack *extack)
+{
+	struct net_device *old_conduit = dsa_user_to_conduit(dev);
+	struct dsa_port *dp = dsa_user_to_port(dev);
+	struct dsa_switch *ds = dp->ds;
+	struct net_device *upper;
+	struct list_head *iter;
+	int err;
+
+	if (conduit == old_conduit)
+		return 0;
+
+	if (!ds->ops->port_change_conduit) {
+		NL_SET_ERR_MSG_MOD(extack,
+				   "Driver does not support changing DSA conduit");
+		return -EOPNOTSUPP;
+	}
+
+	if (!netdev_uses_dsa(conduit)) {
+		NL_SET_ERR_MSG_MOD(extack,
+				   "Interface not eligible as DSA conduit");
+		return -EOPNOTSUPP;
+	}
+
+	netdev_for_each_upper_dev_rcu(conduit, upper, iter) {
+		if (dsa_user_dev_check(upper))
+			continue;
+		if (netif_is_bridge_master(upper))
+			continue;
+		NL_SET_ERR_MSG_MOD(extack, "Cannot join conduit with unknown uppers");
+		return -EOPNOTSUPP;
+	}
+
+	/* Since we allow live-changing the DSA conduit, plus we auto-open the
+	 * DSA conduit when the user port opens => we need to ensure that the
+	 * new DSA conduit is open too.
+	 */
+	if (dev->flags & IFF_UP) {
+		err = dev_open(conduit, extack);
+		if (err)
+			return err;
+	}
+
+	netdev_upper_dev_unlink(old_conduit, dev);
+
+	err = netdev_upper_dev_link(conduit, dev, extack);
+	if (err)
+		goto out_revert_old_conduit_unlink;
+
+	err = dsa_port_change_conduit(dp, conduit, extack);
+	if (err)
+		goto out_revert_conduit_link;
+
+	/* Update the MTU of the new CPU port through cross-chip notifiers */
+	err = dsa_user_change_mtu(dev, dev->mtu);
+	if (err && err != -EOPNOTSUPP) {
+		netdev_warn(dev,
+			    "nonfatal error updating MTU with new conduit: %pe\n",
+			    ERR_PTR(err));
+	}
+
+	/* If the port doesn't have its own MAC address and relies on the DSA
+	 * conduit's one, inherit it again from the new DSA conduit.
+	 */
+	if (is_zero_ether_addr(dp->mac))
+		eth_hw_addr_inherit(dev, conduit);
+
+	return 0;
+
+out_revert_conduit_link:
+	netdev_upper_dev_unlink(conduit, dev);
+out_revert_old_conduit_unlink:
+	netdev_upper_dev_link(old_conduit, dev, NULL);
+	return err;
+}
+
+bool dsa_user_dev_check(const struct net_device *dev)
+{
+	return dev->netdev_ops == &dsa_user_netdev_ops;
+}
+EXPORT_SYMBOL_GPL(dsa_user_dev_check);
+
+static int dsa_user_changeupper(struct net_device *dev,
+				struct netdev_notifier_changeupper_info *info)
+{
+	struct dsa_port *dp = dsa_user_to_port(dev);
+	struct netlink_ext_ack *extack;
+	int err = NOTIFY_DONE;
+
+	if (!dsa_user_dev_check(dev))
+		return err;
+
+	extack = netdev_notifier_info_to_extack(&info->info);
+
+	if (netif_is_bridge_master(info->upper_dev)) {
+		if (info->linking) {
+			err = dsa_port_bridge_join(dp, info->upper_dev, extack);
+			if (!err)
+				dsa_bridge_mtu_normalization(dp);
+			if (err == -EOPNOTSUPP) {
+				NL_SET_ERR_MSG_WEAK_MOD(extack,
+							"Offloading not supported");
+				err = 0;
+			}
+			err = notifier_from_errno(err);
+		} else {
+			dsa_port_bridge_leave(dp, info->upper_dev);
+			err = NOTIFY_OK;
+		}
+	} else if (netif_is_lag_master(info->upper_dev)) {
+		if (info->linking) {
+			err = dsa_port_lag_join(dp, info->upper_dev,
+						info->upper_info, extack);
+			if (err == -EOPNOTSUPP) {
+				NL_SET_ERR_MSG_WEAK_MOD(extack,
+							"Offloading not supported");
+				err = 0;
+			}
+			err = notifier_from_errno(err);
+		} else {
+			dsa_port_lag_leave(dp, info->upper_dev);
+			err = NOTIFY_OK;
+		}
+	} else if (is_hsr_master(info->upper_dev)) {
+		if (info->linking) {
+			err = dsa_port_hsr_join(dp, info->upper_dev, extack);
+			if (err == -EOPNOTSUPP) {
+				NL_SET_ERR_MSG_WEAK_MOD(extack,
+							"Offloading not supported");
+				err = 0;
+			}
+			err = notifier_from_errno(err);
+		} else {
+			dsa_port_hsr_leave(dp, info->upper_dev);
+			err = NOTIFY_OK;
+		}
+	}
+
+	return err;
+}
+
+static int dsa_user_prechangeupper(struct net_device *dev,
+				   struct netdev_notifier_changeupper_info *info)
+{
+	struct dsa_port *dp = dsa_user_to_port(dev);
+
+	if (!dsa_user_dev_check(dev))
+		return NOTIFY_DONE;
+
+	if (netif_is_bridge_master(info->upper_dev) && !info->linking)
+		dsa_port_pre_bridge_leave(dp, info->upper_dev);
+	else if (netif_is_lag_master(info->upper_dev) && !info->linking)
+		dsa_port_pre_lag_leave(dp, info->upper_dev);
+	/* dsa_port_pre_hsr_leave is not yet necessary since hsr devices cannot
+	 * meaningfully placed under a bridge yet
+	 */
+
+	return NOTIFY_DONE;
+}
+
+static int
+dsa_user_lag_changeupper(struct net_device *dev,
+			 struct netdev_notifier_changeupper_info *info)
+{
+	struct net_device *lower;
+	struct list_head *iter;
+	int err = NOTIFY_DONE;
+	struct dsa_port *dp;
+
+	if (!netif_is_lag_master(dev))
+		return err;
+
+	netdev_for_each_lower_dev(dev, lower, iter) {
+		if (!dsa_user_dev_check(lower))
+			continue;
+
+		dp = dsa_user_to_port(lower);
+		if (!dp->lag)
+			/* Software LAG */
+			continue;
+
+		err = dsa_user_changeupper(lower, info);
+		if (notifier_to_errno(err))
+			break;
+	}
+
+	return err;
+}
+
+/* Same as dsa_user_lag_changeupper() except that it calls
+ * dsa_user_prechangeupper()
+ */
+static int
+dsa_user_lag_prechangeupper(struct net_device *dev,
+			    struct netdev_notifier_changeupper_info *info)
+{
+	struct net_device *lower;
+	struct list_head *iter;
+	int err = NOTIFY_DONE;
+	struct dsa_port *dp;
+
+	if (!netif_is_lag_master(dev))
+		return err;
+
+	netdev_for_each_lower_dev(dev, lower, iter) {
+		if (!dsa_user_dev_check(lower))
+			continue;
+
+		dp = dsa_user_to_port(lower);
+		if (!dp->lag)
+			/* Software LAG */
+			continue;
+
+		err = dsa_user_prechangeupper(lower, info);
+		if (notifier_to_errno(err))
+			break;
+	}
+
+	return err;
+}
+
+static int
+dsa_prevent_bridging_8021q_upper(struct net_device *dev,
+				 struct netdev_notifier_changeupper_info *info)
+{
+	struct netlink_ext_ack *ext_ack;
+	struct net_device *user, *br;
+	struct dsa_port *dp;
+
+	ext_ack = netdev_notifier_info_to_extack(&info->info);
+
+	if (!is_vlan_dev(dev))
+		return NOTIFY_DONE;
+
+	user = vlan_dev_real_dev(dev);
+	if (!dsa_user_dev_check(user))
+		return NOTIFY_DONE;
+
+	dp = dsa_user_to_port(user);
+	br = dsa_port_bridge_dev_get(dp);
+	if (!br)
+		return NOTIFY_DONE;
+
+	/* Deny enslaving a VLAN device into a VLAN-aware bridge */
+	if (br_vlan_enabled(br) &&
+	    netif_is_bridge_master(info->upper_dev) && info->linking) {
+		NL_SET_ERR_MSG_MOD(ext_ack,
+				   "Cannot make VLAN device join VLAN-aware bridge");
+		return notifier_from_errno(-EINVAL);
+	}
+
+	return NOTIFY_DONE;
+}
+
+static int
+dsa_user_check_8021q_upper(struct net_device *dev,
+			   struct netdev_notifier_changeupper_info *info)
+{
+	struct dsa_port *dp = dsa_user_to_port(dev);
+	struct net_device *br = dsa_port_bridge_dev_get(dp);
+	struct bridge_vlan_info br_info;
+	struct netlink_ext_ack *extack;
+	int err = NOTIFY_DONE;
+	u16 vid;
+
+	if (!br || !br_vlan_enabled(br))
+		return NOTIFY_DONE;
+
+	extack = netdev_notifier_info_to_extack(&info->info);
+	vid = vlan_dev_vlan_id(info->upper_dev);
+
+	/* br_vlan_get_info() returns -EINVAL or -ENOENT if the
+	 * device, respectively the VID is not found, returning
+	 * 0 means success, which is a failure for us here.
+	 */
+	err = br_vlan_get_info(br, vid, &br_info);
+	if (err == 0) {
+		NL_SET_ERR_MSG_MOD(extack,
+				   "This VLAN is already configured by the bridge");
+		return notifier_from_errno(-EBUSY);
+	}
+
+	return NOTIFY_DONE;
+}
+
+static int
+dsa_user_prechangeupper_sanity_check(struct net_device *dev,
+				     struct netdev_notifier_changeupper_info *info)
+{
+	struct dsa_switch *ds;
+	struct dsa_port *dp;
+	int err;
+
+	if (!dsa_user_dev_check(dev))
+		return dsa_prevent_bridging_8021q_upper(dev, info);
+
+	dp = dsa_user_to_port(dev);
+	ds = dp->ds;
+
+	if (ds->ops->port_prechangeupper) {
+		err = ds->ops->port_prechangeupper(ds, dp->index, info);
+		if (err)
+			return notifier_from_errno(err);
+	}
+
+	if (is_vlan_dev(info->upper_dev))
+		return dsa_user_check_8021q_upper(dev, info);
+
+	return NOTIFY_DONE;
+}
+
+/* To be eligible as a DSA conduit, a LAG must have all lower interfaces be
+ * eligible DSA conduits. Additionally, all LAG slaves must be DSA conduits of
+ * switches in the same switch tree.
+ */
+static int dsa_lag_conduit_validate(struct net_device *lag_dev,
+				    struct netlink_ext_ack *extack)
+{
+	struct net_device *lower1, *lower2;
+	struct list_head *iter1, *iter2;
+
+	netdev_for_each_lower_dev(lag_dev, lower1, iter1) {
+		netdev_for_each_lower_dev(lag_dev, lower2, iter2) {
+			if (!netdev_uses_dsa(lower1) ||
+			    !netdev_uses_dsa(lower2)) {
+				NL_SET_ERR_MSG_MOD(extack,
+						   "All LAG ports must be eligible as DSA conduits");
+				return notifier_from_errno(-EINVAL);
+			}
+
+			if (lower1 == lower2)
+				continue;
+
+			if (!dsa_port_tree_same(lower1->dsa_ptr,
+						lower2->dsa_ptr)) {
+				NL_SET_ERR_MSG_MOD(extack,
+						   "LAG contains DSA conduits of disjoint switch trees");
+				return notifier_from_errno(-EINVAL);
+			}
+		}
+	}
+
+	return NOTIFY_DONE;
+}
+
+static int
+dsa_conduit_prechangeupper_sanity_check(struct net_device *conduit,
+					struct netdev_notifier_changeupper_info *info)
+{
+	struct netlink_ext_ack *extack = netdev_notifier_info_to_extack(&info->info);
+
+	if (!netdev_uses_dsa(conduit))
+		return NOTIFY_DONE;
+
+	if (!info->linking)
+		return NOTIFY_DONE;
+
+	/* Allow DSA switch uppers */
+	if (dsa_user_dev_check(info->upper_dev))
+		return NOTIFY_DONE;
+
+	/* Allow bridge uppers of DSA conduits, subject to further
+	 * restrictions in dsa_bridge_prechangelower_sanity_check()
+	 */
+	if (netif_is_bridge_master(info->upper_dev))
+		return NOTIFY_DONE;
+
+	/* Allow LAG uppers, subject to further restrictions in
+	 * dsa_lag_conduit_prechangelower_sanity_check()
+	 */
+	if (netif_is_lag_master(info->upper_dev))
+		return dsa_lag_conduit_validate(info->upper_dev, extack);
+
+	NL_SET_ERR_MSG_MOD(extack,
+			   "DSA conduit cannot join unknown upper interfaces");
+	return notifier_from_errno(-EBUSY);
+}
+
+static int
+dsa_lag_conduit_prechangelower_sanity_check(struct net_device *dev,
+					    struct netdev_notifier_changeupper_info *info)
+{
+	struct netlink_ext_ack *extack = netdev_notifier_info_to_extack(&info->info);
+	struct net_device *lag_dev = info->upper_dev;
+	struct net_device *lower;
+	struct list_head *iter;
+
+	if (!netdev_uses_dsa(lag_dev) || !netif_is_lag_master(lag_dev))
+		return NOTIFY_DONE;
+
+	if (!info->linking)
+		return NOTIFY_DONE;
+
+	if (!netdev_uses_dsa(dev)) {
+		NL_SET_ERR_MSG(extack,
+			       "Only DSA conduits can join a LAG DSA conduit");
+		return notifier_from_errno(-EINVAL);
+	}
+
+	netdev_for_each_lower_dev(lag_dev, lower, iter) {
+		if (!dsa_port_tree_same(dev->dsa_ptr, lower->dsa_ptr)) {
+			NL_SET_ERR_MSG(extack,
+				       "Interface is DSA conduit for a different switch tree than this LAG");
+			return notifier_from_errno(-EINVAL);
+		}
+
+		break;
+	}
+
+	return NOTIFY_DONE;
+}
+
+/* Don't allow bridging of DSA conduits, since the bridge layer rx_handler
+ * prevents the DSA fake ethertype handler to be invoked, so we don't get the
+ * chance to strip off and parse the DSA switch tag protocol header (the bridge
+ * layer just returns RX_HANDLER_CONSUMED, stopping RX processing for these
+ * frames).
+ * The only case where that would not be an issue is when bridging can already
+ * be offloaded, such as when the DSA conduit is itself a DSA or plain switchdev
+ * port, and is bridged only with other ports from the same hardware device.
+ */
+static int
+dsa_bridge_prechangelower_sanity_check(struct net_device *new_lower,
+				       struct netdev_notifier_changeupper_info *info)
+{
+	struct net_device *br = info->upper_dev;
+	struct netlink_ext_ack *extack;
+	struct net_device *lower;
+	struct list_head *iter;
+
+	if (!netif_is_bridge_master(br))
+		return NOTIFY_DONE;
+
+	if (!info->linking)
+		return NOTIFY_DONE;
+
+	extack = netdev_notifier_info_to_extack(&info->info);
+
+	netdev_for_each_lower_dev(br, lower, iter) {
+		if (!netdev_uses_dsa(new_lower) && !netdev_uses_dsa(lower))
+			continue;
+
+		if (!netdev_port_same_parent_id(lower, new_lower)) {
+			NL_SET_ERR_MSG(extack,
+				       "Cannot do software bridging with a DSA conduit");
+			return notifier_from_errno(-EINVAL);
+		}
+	}
+
+	return NOTIFY_DONE;
+}
+
+static void dsa_tree_migrate_ports_from_lag_conduit(struct dsa_switch_tree *dst,
+						    struct net_device *lag_dev)
+{
+	struct net_device *new_conduit = dsa_tree_find_first_conduit(dst);
+	struct dsa_port *dp;
+	int err;
+
+	dsa_tree_for_each_user_port(dp, dst) {
+		if (dsa_port_to_conduit(dp) != lag_dev)
+			continue;
+
+		err = dsa_user_change_conduit(dp->user, new_conduit, NULL);
+		if (err) {
+			netdev_err(dp->user,
+				   "failed to restore conduit to %s: %pe\n",
+				   new_conduit->name, ERR_PTR(err));
+		}
+	}
+}
+
+static int dsa_conduit_lag_join(struct net_device *conduit,
+				struct net_device *lag_dev,
+				struct netdev_lag_upper_info *uinfo,
+				struct netlink_ext_ack *extack)
+{
+	struct dsa_port *cpu_dp = conduit->dsa_ptr;
+	struct dsa_switch_tree *dst = cpu_dp->dst;
+	struct dsa_port *dp;
+	int err;
+
+	err = dsa_conduit_lag_setup(lag_dev, cpu_dp, uinfo, extack);
+	if (err)
+		return err;
+
+	dsa_tree_for_each_user_port(dp, dst) {
+		if (dsa_port_to_conduit(dp) != conduit)
+			continue;
+
+		err = dsa_user_change_conduit(dp->user, lag_dev, extack);
+		if (err)
+			goto restore;
+	}
+
+	return 0;
+
+restore:
+	dsa_tree_for_each_user_port_continue_reverse(dp, dst) {
+		if (dsa_port_to_conduit(dp) != lag_dev)
+			continue;
+
+		err = dsa_user_change_conduit(dp->user, conduit, NULL);
+		if (err) {
+			netdev_err(dp->user,
+				   "failed to restore conduit to %s: %pe\n",
+				   conduit->name, ERR_PTR(err));
+		}
+	}
+
+	dsa_conduit_lag_teardown(lag_dev, conduit->dsa_ptr);
+
+	return err;
+}
+
+static void dsa_conduit_lag_leave(struct net_device *conduit,
+				  struct net_device *lag_dev)
+{
+	struct dsa_port *dp, *cpu_dp = lag_dev->dsa_ptr;
+	struct dsa_switch_tree *dst = cpu_dp->dst;
+	struct dsa_port *new_cpu_dp = NULL;
+	struct net_device *lower;
+	struct list_head *iter;
+
+	netdev_for_each_lower_dev(lag_dev, lower, iter) {
+		if (netdev_uses_dsa(lower)) {
+			new_cpu_dp = lower->dsa_ptr;
+			break;
+		}
+	}
+
+	if (new_cpu_dp) {
+		/* Update the CPU port of the user ports still under the LAG
+		 * so that dsa_port_to_conduit() continues to work properly
+		 */
+		dsa_tree_for_each_user_port(dp, dst)
+			if (dsa_port_to_conduit(dp) == lag_dev)
+				dp->cpu_dp = new_cpu_dp;
+
+		/* Update the index of the virtual CPU port to match the lowest
+		 * physical CPU port
+		 */
+		lag_dev->dsa_ptr = new_cpu_dp;
+		wmb();
+	} else {
+		/* If the LAG DSA conduit has no ports left, migrate back all
+		 * user ports to the first physical CPU port
+		 */
+		dsa_tree_migrate_ports_from_lag_conduit(dst, lag_dev);
+	}
+
+	/* This DSA conduit has left its LAG in any case, so let
+	 * the CPU port leave the hardware LAG as well
+	 */
+	dsa_conduit_lag_teardown(lag_dev, conduit->dsa_ptr);
+}
+
+static int dsa_conduit_changeupper(struct net_device *dev,
+				   struct netdev_notifier_changeupper_info *info)
+{
+	struct netlink_ext_ack *extack;
+	int err = NOTIFY_DONE;
+
+	if (!netdev_uses_dsa(dev))
+		return err;
+
+	extack = netdev_notifier_info_to_extack(&info->info);
+
+	if (netif_is_lag_master(info->upper_dev)) {
+		if (info->linking) {
+			err = dsa_conduit_lag_join(dev, info->upper_dev,
+						   info->upper_info, extack);
+			err = notifier_from_errno(err);
+		} else {
+			dsa_conduit_lag_leave(dev, info->upper_dev);
+			err = NOTIFY_OK;
+		}
+	}
+
+	return err;
+}
+
+static int dsa_user_netdevice_event(struct notifier_block *nb,
+				    unsigned long event, void *ptr)
+{
+	struct net_device *dev = netdev_notifier_info_to_dev(ptr);
+
+	switch (event) {
+	case NETDEV_PRECHANGEUPPER: {
+		struct netdev_notifier_changeupper_info *info = ptr;
+		int err;
+
+		err = dsa_user_prechangeupper_sanity_check(dev, info);
+		if (notifier_to_errno(err))
+			return err;
+
+		err = dsa_conduit_prechangeupper_sanity_check(dev, info);
+		if (notifier_to_errno(err))
+			return err;
+
+		err = dsa_lag_conduit_prechangelower_sanity_check(dev, info);
+		if (notifier_to_errno(err))
+			return err;
+
+		err = dsa_bridge_prechangelower_sanity_check(dev, info);
+		if (notifier_to_errno(err))
+			return err;
+
+		err = dsa_user_prechangeupper(dev, ptr);
+		if (notifier_to_errno(err))
+			return err;
+
+		err = dsa_user_lag_prechangeupper(dev, ptr);
+		if (notifier_to_errno(err))
+			return err;
+
+		break;
+	}
+	case NETDEV_CHANGEUPPER: {
+		int err;
+
+		err = dsa_user_changeupper(dev, ptr);
+		if (notifier_to_errno(err))
+			return err;
+
+		err = dsa_user_lag_changeupper(dev, ptr);
+		if (notifier_to_errno(err))
+			return err;
+
+		err = dsa_conduit_changeupper(dev, ptr);
+		if (notifier_to_errno(err))
+			return err;
+
+		break;
+	}
+	case NETDEV_CHANGELOWERSTATE: {
+		struct netdev_notifier_changelowerstate_info *info = ptr;
+		struct dsa_port *dp;
+		int err = 0;
+
+		if (dsa_user_dev_check(dev)) {
+			dp = dsa_user_to_port(dev);
+
+			err = dsa_port_lag_change(dp, info->lower_state_info);
+		}
+
+		/* Mirror LAG port events on DSA conduits that are in
+		 * a LAG towards their respective switch CPU ports
+		 */
+		if (netdev_uses_dsa(dev)) {
+			dp = dev->dsa_ptr;
+
+			err = dsa_port_lag_change(dp, info->lower_state_info);
+		}
+
+		return notifier_from_errno(err);
+	}
+	case NETDEV_CHANGE:
+	case NETDEV_UP: {
+		/* Track state of conduit port.
+		 * DSA driver may require the conduit port (and indirectly
+		 * the tagger) to be available for some special operation.
+		 */
+		if (netdev_uses_dsa(dev)) {
+			struct dsa_port *cpu_dp = dev->dsa_ptr;
+			struct dsa_switch_tree *dst = cpu_dp->ds->dst;
+
+			/* Track when the conduit port is UP */
+			dsa_tree_conduit_oper_state_change(dst, dev,
+							   netif_oper_up(dev));
+
+			/* Track when the conduit port is ready and can accept
+			 * packet.
+			 * NETDEV_UP event is not enough to flag a port as ready.
+			 * We also have to wait for linkwatch_do_dev to dev_activate
+			 * and emit a NETDEV_CHANGE event.
+			 * We check if a conduit port is ready by checking if the dev
+			 * have a qdisc assigned and is not noop.
+			 */
+			dsa_tree_conduit_admin_state_change(dst, dev,
+							    !qdisc_tx_is_noop(dev));
+
+			return NOTIFY_OK;
+		}
+
+		return NOTIFY_DONE;
+	}
+	case NETDEV_GOING_DOWN: {
+		struct dsa_port *dp, *cpu_dp;
+		struct dsa_switch_tree *dst;
+		LIST_HEAD(close_list);
+
+		if (!netdev_uses_dsa(dev))
+			return NOTIFY_DONE;
+
+		cpu_dp = dev->dsa_ptr;
+		dst = cpu_dp->ds->dst;
+
+		dsa_tree_conduit_admin_state_change(dst, dev, false);
+
+		list_for_each_entry(dp, &dst->ports, list) {
+			if (!dsa_port_is_user(dp))
+				continue;
+
+			if (dp->cpu_dp != cpu_dp)
+				continue;
+
+			list_add(&dp->user->close_list, &close_list);
+		}
+
+		dev_close_many(&close_list, true);
+
+		return NOTIFY_OK;
+	}
+	default:
+		break;
+	}
+
+	return NOTIFY_DONE;
+}
+
+static void
+dsa_fdb_offload_notify(struct dsa_switchdev_event_work *switchdev_work)
+{
+	struct switchdev_notifier_fdb_info info = {};
+
+	info.addr = switchdev_work->addr;
+	info.vid = switchdev_work->vid;
+	info.offloaded = true;
+	call_switchdev_notifiers(SWITCHDEV_FDB_OFFLOADED,
+				 switchdev_work->orig_dev, &info.info, NULL);
+}
+
+static void dsa_user_switchdev_event_work(struct work_struct *work)
+{
+	struct dsa_switchdev_event_work *switchdev_work =
+		container_of(work, struct dsa_switchdev_event_work, work);
+	const unsigned char *addr = switchdev_work->addr;
+	struct net_device *dev = switchdev_work->dev;
+	u16 vid = switchdev_work->vid;
+	struct dsa_switch *ds;
+	struct dsa_port *dp;
+	int err;
+
+	dp = dsa_user_to_port(dev);
+	ds = dp->ds;
+
+	switch (switchdev_work->event) {
+	case SWITCHDEV_FDB_ADD_TO_DEVICE:
+		if (switchdev_work->host_addr)
+			err = dsa_port_bridge_host_fdb_add(dp, addr, vid);
+		else if (dp->lag)
+			err = dsa_port_lag_fdb_add(dp, addr, vid);
+		else
+			err = dsa_port_fdb_add(dp, addr, vid);
+		if (err) {
+			dev_err(ds->dev,
+				"port %d failed to add %pM vid %d to fdb: %d\n",
+				dp->index, addr, vid, err);
+			break;
+		}
+		dsa_fdb_offload_notify(switchdev_work);
+		break;
+
+	case SWITCHDEV_FDB_DEL_TO_DEVICE:
+		if (switchdev_work->host_addr)
+			err = dsa_port_bridge_host_fdb_del(dp, addr, vid);
+		else if (dp->lag)
+			err = dsa_port_lag_fdb_del(dp, addr, vid);
+		else
+			err = dsa_port_fdb_del(dp, addr, vid);
+		if (err) {
+			dev_err(ds->dev,
+				"port %d failed to delete %pM vid %d from fdb: %d\n",
+				dp->index, addr, vid, err);
+		}
+
+		break;
+	}
+
+	kfree(switchdev_work);
+}
+
+static bool dsa_foreign_dev_check(const struct net_device *dev,
+				  const struct net_device *foreign_dev)
+{
+	const struct dsa_port *dp = dsa_user_to_port(dev);
+	struct dsa_switch_tree *dst = dp->ds->dst;
+
+	if (netif_is_bridge_master(foreign_dev))
+		return !dsa_tree_offloads_bridge_dev(dst, foreign_dev);
+
+	if (netif_is_bridge_port(foreign_dev))
+		return !dsa_tree_offloads_bridge_port(dst, foreign_dev);
+
+	/* Everything else is foreign */
+	return true;
+}
+
+static int dsa_user_fdb_event(struct net_device *dev,
+			      struct net_device *orig_dev,
+			      unsigned long event, const void *ctx,
+			      const struct switchdev_notifier_fdb_info *fdb_info)
+{
+	struct dsa_switchdev_event_work *switchdev_work;
+	struct dsa_port *dp = dsa_user_to_port(dev);
+	bool host_addr = fdb_info->is_local;
+	struct dsa_switch *ds = dp->ds;
+
+	if (ctx && ctx != dp)
+		return 0;
+
+	if (!dp->bridge)
+		return 0;
+
+	if (switchdev_fdb_is_dynamically_learned(fdb_info)) {
+		if (dsa_port_offloads_bridge_port(dp, orig_dev))
+			return 0;
+
+		/* FDB entries learned by the software bridge or by foreign
+		 * bridge ports should be installed as host addresses only if
+		 * the driver requests assisted learning.
+		 */
+		if (!ds->assisted_learning_on_cpu_port)
+			return 0;
+	}
+
+	/* Also treat FDB entries on foreign interfaces bridged with us as host
+	 * addresses.
+	 */
+	if (dsa_foreign_dev_check(dev, orig_dev))
+		host_addr = true;
+
+	/* Check early that we're not doing work in vain.
+	 * Host addresses on LAG ports still require regular FDB ops,
+	 * since the CPU port isn't in a LAG.
+	 */
+	if (dp->lag && !host_addr) {
+		if (!ds->ops->lag_fdb_add || !ds->ops->lag_fdb_del)
+			return -EOPNOTSUPP;
+	} else {
+		if (!ds->ops->port_fdb_add || !ds->ops->port_fdb_del)
+			return -EOPNOTSUPP;
+	}
+
+	switchdev_work = kzalloc(sizeof(*switchdev_work), GFP_ATOMIC);
+	if (!switchdev_work)
+		return -ENOMEM;
+
+	netdev_dbg(dev, "%s FDB entry towards %s, addr %pM vid %d%s\n",
+		   event == SWITCHDEV_FDB_ADD_TO_DEVICE ? "Adding" : "Deleting",
+		   orig_dev->name, fdb_info->addr, fdb_info->vid,
+		   host_addr ? " as host address" : "");
+
+	INIT_WORK(&switchdev_work->work, dsa_user_switchdev_event_work);
+	switchdev_work->event = event;
+	switchdev_work->dev = dev;
+	switchdev_work->orig_dev = orig_dev;
+
+	ether_addr_copy(switchdev_work->addr, fdb_info->addr);
+	switchdev_work->vid = fdb_info->vid;
+	switchdev_work->host_addr = host_addr;
+
+	dsa_schedule_work(&switchdev_work->work);
+
+	return 0;
+}
+
+/* Called under rcu_read_lock() */
+static int dsa_user_switchdev_event(struct notifier_block *unused,
+				    unsigned long event, void *ptr)
+{
+	struct net_device *dev = switchdev_notifier_info_to_dev(ptr);
+	int err;
+
+	switch (event) {
+	case SWITCHDEV_PORT_ATTR_SET:
+		err = switchdev_handle_port_attr_set(dev, ptr,
+						     dsa_user_dev_check,
+						     dsa_user_port_attr_set);
+		return notifier_from_errno(err);
+	case SWITCHDEV_FDB_ADD_TO_DEVICE:
+	case SWITCHDEV_FDB_DEL_TO_DEVICE:
+		err = switchdev_handle_fdb_event_to_device(dev, event, ptr,
+							   dsa_user_dev_check,
+							   dsa_foreign_dev_check,
+							   dsa_user_fdb_event);
+		return notifier_from_errno(err);
+	default:
+		return NOTIFY_DONE;
+	}
+
+	return NOTIFY_OK;
+}
+
+static int dsa_user_switchdev_blocking_event(struct notifier_block *unused,
+					     unsigned long event, void *ptr)
+{
+	struct net_device *dev = switchdev_notifier_info_to_dev(ptr);
+	int err;
+
+	switch (event) {
+	case SWITCHDEV_PORT_OBJ_ADD:
+		err = switchdev_handle_port_obj_add_foreign(dev, ptr,
+							    dsa_user_dev_check,
+							    dsa_foreign_dev_check,
+							    dsa_user_port_obj_add);
+		return notifier_from_errno(err);
+	case SWITCHDEV_PORT_OBJ_DEL:
+		err = switchdev_handle_port_obj_del_foreign(dev, ptr,
+							    dsa_user_dev_check,
+							    dsa_foreign_dev_check,
+							    dsa_user_port_obj_del);
+		return notifier_from_errno(err);
+	case SWITCHDEV_PORT_ATTR_SET:
+		err = switchdev_handle_port_attr_set(dev, ptr,
+						     dsa_user_dev_check,
+						     dsa_user_port_attr_set);
+		return notifier_from_errno(err);
+	}
+
+	return NOTIFY_DONE;
+}
+
+static struct notifier_block dsa_user_nb __read_mostly = {
+	.notifier_call  = dsa_user_netdevice_event,
+};
+
+struct notifier_block dsa_user_switchdev_notifier = {
+	.notifier_call = dsa_user_switchdev_event,
+};
+
+struct notifier_block dsa_user_switchdev_blocking_notifier = {
+	.notifier_call = dsa_user_switchdev_blocking_event,
+};
+
+int dsa_user_register_notifier(void)
+{
+	struct notifier_block *nb;
+	int err;
+
+	err = register_netdevice_notifier(&dsa_user_nb);
+	if (err)
+		return err;
+
+	err = register_switchdev_notifier(&dsa_user_switchdev_notifier);
+	if (err)
+		goto err_switchdev_nb;
+
+	nb = &dsa_user_switchdev_blocking_notifier;
+	err = register_switchdev_blocking_notifier(nb);
+	if (err)
+		goto err_switchdev_blocking_nb;
+
+	return 0;
+
+err_switchdev_blocking_nb:
+	unregister_switchdev_notifier(&dsa_user_switchdev_notifier);
+err_switchdev_nb:
+	unregister_netdevice_notifier(&dsa_user_nb);
+	return err;
+}
+
+void dsa_user_unregister_notifier(void)
+{
+	struct notifier_block *nb;
+	int err;
+
+	nb = &dsa_user_switchdev_blocking_notifier;
+	err = unregister_switchdev_blocking_notifier(nb);
+	if (err)
+		pr_err("DSA: failed to unregister switchdev blocking notifier (%d)\n", err);
+
+	err = unregister_switchdev_notifier(&dsa_user_switchdev_notifier);
+	if (err)
+		pr_err("DSA: failed to unregister switchdev notifier (%d)\n", err);
+
+	err = unregister_netdevice_notifier(&dsa_user_nb);
+	if (err)
+		pr_err("DSA: failed to unregister user notifier (%d)\n", err);
+}
diff --git a/net/dsa/user.h b/net/dsa/user.h
new file mode 100644
index 000000000000..996069130bea
--- /dev/null
+++ b/net/dsa/user.h
@@ -0,0 +1,69 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+
+#ifndef __DSA_USER_H
+#define __DSA_USER_H
+
+#include <linux/if_bridge.h>
+#include <linux/if_vlan.h>
+#include <linux/list.h>
+#include <linux/netpoll.h>
+#include <linux/types.h>
+#include <net/dsa.h>
+#include <net/gro_cells.h>
+
+struct net_device;
+struct netlink_ext_ack;
+
+extern struct notifier_block dsa_user_switchdev_notifier;
+extern struct notifier_block dsa_user_switchdev_blocking_notifier;
+
+struct dsa_user_priv {
+	/* Copy of CPU port xmit for faster access in user transmit hot path */
+	struct sk_buff *	(*xmit)(struct sk_buff *skb,
+					struct net_device *dev);
+
+	struct gro_cells	gcells;
+
+	/* DSA port data, such as switch, port index, etc. */
+	struct dsa_port		*dp;
+
+#ifdef CONFIG_NET_POLL_CONTROLLER
+	struct netpoll		*netpoll;
+#endif
+
+	/* TC context */
+	struct list_head	mall_tc_list;
+};
+
+void dsa_user_mii_bus_init(struct dsa_switch *ds);
+int dsa_user_create(struct dsa_port *dp);
+void dsa_user_destroy(struct net_device *user_dev);
+int dsa_user_suspend(struct net_device *user_dev);
+int dsa_user_resume(struct net_device *user_dev);
+int dsa_user_register_notifier(void);
+void dsa_user_unregister_notifier(void);
+void dsa_user_sync_ha(struct net_device *dev);
+void dsa_user_unsync_ha(struct net_device *dev);
+void dsa_user_setup_tagger(struct net_device *user);
+int dsa_user_change_mtu(struct net_device *dev, int new_mtu);
+int dsa_user_change_conduit(struct net_device *dev, struct net_device *conduit,
+			    struct netlink_ext_ack *extack);
+int dsa_user_manage_vlan_filtering(struct net_device *dev,
+				   bool vlan_filtering);
+
+static inline struct dsa_port *dsa_user_to_port(const struct net_device *dev)
+{
+	struct dsa_user_priv *p = netdev_priv(dev);
+
+	return p->dp;
+}
+
+static inline struct net_device *
+dsa_user_to_conduit(const struct net_device *dev)
+{
+	struct dsa_port *dp = dsa_user_to_port(dev);
+
+	return dsa_port_to_conduit(dp);
+}
+
+#endif
-- 
cgit v1.2.3


From 15fcedd43a0810a5482ef759a18ab852d012cead Mon Sep 17 00:00:00 2001
From: Kees Cook <keescook@chromium.org>
Date: Fri, 22 Sep 2023 10:52:24 -0700
Subject: kexec: Annotate struct crash_mem with __counted_by

Prepare for the coming implementation by GCC and Clang of the __counted_by
attribute. Flexible array members annotated with __counted_by can have
their accesses bounds-checked at run-time checking via CONFIG_UBSAN_BOUNDS
(for array indexing) and CONFIG_FORTIFY_SOURCE (for strcpy/memcpy-family
functions).

As found with Coccinelle[1], add __counted_by for struct crash_mem.

[1] https://github.com/kees/kernel-tools/blob/trunk/coccinelle/examples/counted_by.cocci

Cc: Eric Biederman <ebiederm@xmission.com>
Cc: kexec@lists.infradead.org
Acked-by: Baoquan He <bhe@redhat.com>
Link: https://lore.kernel.org/r/20230922175224.work.712-kees@kernel.org
Signed-off-by: Kees Cook <keescook@chromium.org>
---
 include/linux/crash_core.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/crash_core.h b/include/linux/crash_core.h
index 0c06561bf5ff..08704c29fdb4 100644
--- a/include/linux/crash_core.h
+++ b/include/linux/crash_core.h
@@ -92,7 +92,7 @@ int parse_crashkernel_low(char *cmdline, unsigned long long system_ram,
 struct crash_mem {
 	unsigned int max_nr_ranges;
 	unsigned int nr_ranges;
-	struct range ranges[];
+	struct range ranges[] __counted_by(max_nr_ranges);
 };
 
 extern int crash_exclude_mem_range(struct crash_mem *mem,
-- 
cgit v1.2.3


From 8daaed76383aaae1d094d3d65d34890b2c6da24b Mon Sep 17 00:00:00 2001
From: Arnd Bergmann <arnd@arndb.de>
Date: Thu, 14 Sep 2023 17:55:22 +0200
Subject: sh: Remove superhyway bus support

The superhyway bus driver was only referenced on SH4-202, which is now gone,
so remove it all as well.

I could find no trace of anything ever calling superhyway_register_driver(),
not in the git history but also not on the web, so I assume this has never
served any purpose on mainline kernels.

Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Reviewed-by: John Paul Adrian Glaubitz <glaubitz@physik.fu-berlin.de>
Link: https://lore.kernel.org/r/20230914155523.3839811-3-arnd@kernel.org
Signed-off-by: John Paul Adrian Glaubitz <glaubitz@physik.fu-berlin.de>
---
 Documentation/arch/sh/index.rst          |   6 -
 arch/sh/Kconfig                          |   4 -
 arch/sh/drivers/Makefile                 |   1 -
 arch/sh/drivers/superhyway/Makefile      |   7 -
 arch/sh/drivers/superhyway/ops-sh4-202.c | 168 ----------------------
 drivers/sh/Makefile                      |   1 -
 drivers/sh/superhyway/Makefile           |   8 --
 drivers/sh/superhyway/superhyway-sysfs.c |  54 -------
 drivers/sh/superhyway/superhyway.c       | 234 -------------------------------
 include/linux/superhyway.h               | 107 --------------
 10 files changed, 590 deletions(-)
 delete mode 100644 arch/sh/drivers/superhyway/Makefile
 delete mode 100644 arch/sh/drivers/superhyway/ops-sh4-202.c
 delete mode 100644 drivers/sh/superhyway/Makefile
 delete mode 100644 drivers/sh/superhyway/superhyway-sysfs.c
 delete mode 100644 drivers/sh/superhyway/superhyway.c
 delete mode 100644 include/linux/superhyway.h

(limited to 'include/linux')

diff --git a/Documentation/arch/sh/index.rst b/Documentation/arch/sh/index.rst
index c64776738cf6..01fce7c131f1 100644
--- a/Documentation/arch/sh/index.rst
+++ b/Documentation/arch/sh/index.rst
@@ -43,12 +43,6 @@ mach-x3proto
 Busses
 ======
 
-SuperHyway
-----------
-
-.. kernel-doc:: drivers/sh/superhyway/superhyway.c
-   :export:
-
 Maple
 -----
 
diff --git a/arch/sh/Kconfig b/arch/sh/Kconfig
index d785329e5715..7500521b2b98 100644
--- a/arch/sh/Kconfig
+++ b/arch/sh/Kconfig
@@ -737,10 +737,6 @@ endmenu
 
 menu "Bus options"
 
-config SUPERHYWAY
-	tristate "SuperHyway Bus support"
-	depends on CPU_SUBTYPE_SH4_202
-
 config MAPLE
 	bool "Maple Bus support"
 	depends on SH_DREAMCAST
diff --git a/arch/sh/drivers/Makefile b/arch/sh/drivers/Makefile
index 56b0acace6e7..8bd10b904bf9 100644
--- a/arch/sh/drivers/Makefile
+++ b/arch/sh/drivers/Makefile
@@ -6,6 +6,5 @@
 obj-y		+= dma/ platform_early.o
 
 obj-$(CONFIG_PCI)		+= pci/
-obj-$(CONFIG_SUPERHYWAY)	+= superhyway/
 obj-$(CONFIG_PUSH_SWITCH)	+= push-switch.o
 obj-$(CONFIG_HEARTBEAT)		+= heartbeat.o
diff --git a/arch/sh/drivers/superhyway/Makefile b/arch/sh/drivers/superhyway/Makefile
deleted file mode 100644
index aa6e3267c055..000000000000
--- a/arch/sh/drivers/superhyway/Makefile
+++ /dev/null
@@ -1,7 +0,0 @@
-# SPDX-License-Identifier: GPL-2.0
-#
-# Makefile for the SuperHyway specific kernel interface routines under Linux.
-#
-
-obj-$(CONFIG_CPU_SUBTYPE_SH4_202)	+= ops-sh4-202.o
-
diff --git a/arch/sh/drivers/superhyway/ops-sh4-202.c b/arch/sh/drivers/superhyway/ops-sh4-202.c
deleted file mode 100644
index 490142274e3b..000000000000
--- a/arch/sh/drivers/superhyway/ops-sh4-202.c
+++ /dev/null
@@ -1,168 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * arch/sh/drivers/superhyway/ops-sh4-202.c
- *
- * SuperHyway bus support for SH4-202
- *
- * Copyright (C) 2005  Paul Mundt
- */
-#include <linux/kernel.h>
-#include <linux/init.h>
-#include <linux/superhyway.h>
-#include <linux/string.h>
-#include <asm/addrspace.h>
-#include <asm/io.h>
-
-#define PHYS_EMI_CBLOCK		P4SEGADDR(0x1ec00000)
-#define PHYS_EMI_DBLOCK		P4SEGADDR(0x08000000)
-#define PHYS_FEMI_CBLOCK	P4SEGADDR(0x1f800000)
-#define PHYS_FEMI_DBLOCK	P4SEGADDR(0x00000000)
-
-#define PHYS_EPBR_BLOCK		P4SEGADDR(0x1de00000)
-#define PHYS_DMAC_BLOCK		P4SEGADDR(0x1fa00000)
-#define PHYS_PBR_BLOCK		P4SEGADDR(0x1fc00000)
-
-static struct resource emi_resources[] = {
-	[0] = {
-		.start	= PHYS_EMI_CBLOCK,
-		.end	= PHYS_EMI_CBLOCK + 0x00300000 - 1,
-		.flags	= IORESOURCE_MEM,
-	},
-	[1] = {
-		.start	= PHYS_EMI_DBLOCK,
-		.end	= PHYS_EMI_DBLOCK + 0x08000000 - 1,
-		.flags	= IORESOURCE_MEM,
-	},
-};
-
-static struct superhyway_device emi_device = {
-	.name		= "emi",
-	.num_resources	= ARRAY_SIZE(emi_resources),
-	.resource	= emi_resources,
-};
-
-static struct resource femi_resources[] = {
-	[0] = {
-		.start	= PHYS_FEMI_CBLOCK,
-		.end	= PHYS_FEMI_CBLOCK + 0x00100000 - 1,
-		.flags	= IORESOURCE_MEM,
-	},
-	[1] = {
-		.start	= PHYS_FEMI_DBLOCK,
-		.end	= PHYS_FEMI_DBLOCK + 0x08000000 - 1,
-		.flags	= IORESOURCE_MEM,
-	},
-};
-
-static struct superhyway_device femi_device = {
-	.name		= "femi",
-	.num_resources	= ARRAY_SIZE(femi_resources),
-	.resource	= femi_resources,
-};
-
-static struct resource epbr_resources[] = {
-	[0] = {
-		.start	= P4SEGADDR(0x1e7ffff8),
-		.end	= P4SEGADDR(0x1e7ffff8 + (sizeof(u32) * 2) - 1),
-		.flags	= IORESOURCE_MEM,
-	},
-	[1] = {
-		.start	= PHYS_EPBR_BLOCK,
-		.end	= PHYS_EPBR_BLOCK + 0x00a00000 - 1,
-		.flags	= IORESOURCE_MEM,
-	},
-};
-
-static struct superhyway_device epbr_device = {
-	.name		= "epbr",
-	.num_resources	= ARRAY_SIZE(epbr_resources),
-	.resource	= epbr_resources,
-};
-
-static struct resource dmac_resource = {
-	.start	= PHYS_DMAC_BLOCK,
-	.end	= PHYS_DMAC_BLOCK + 0x00100000 - 1,
-	.flags	= IORESOURCE_MEM,
-};
-
-static struct superhyway_device dmac_device = {
-	.name		= "dmac",
-	.num_resources	= 1,
-	.resource	= &dmac_resource,
-};
-
-static struct resource pbr_resources[] = {
-	[0] = {
-		.start	= P4SEGADDR(0x1ffffff8),
-		.end	= P4SEGADDR(0x1ffffff8 + (sizeof(u32) * 2) - 1),
-		.flags	= IORESOURCE_MEM,
-	},
-	[1] = {
-		.start	= PHYS_PBR_BLOCK,
-		.end	= PHYS_PBR_BLOCK + 0x00400000 - (sizeof(u32) * 2) - 1,
-		.flags	= IORESOURCE_MEM,
-	},
-};
-
-static struct superhyway_device pbr_device = {
-	.name		= "pbr",
-	.num_resources	= ARRAY_SIZE(pbr_resources),
-	.resource	= pbr_resources,
-};
-
-static struct superhyway_device *sh4202_devices[] __initdata = {
-	&emi_device, &femi_device, &epbr_device, &dmac_device, &pbr_device,
-};
-
-static int sh4202_read_vcr(unsigned long base, struct superhyway_vcr_info *vcr)
-{
-	u32 vcrh, vcrl;
-	u64 tmp;
-
-	/*
-	 * XXX: Even though the SH4-202 Evaluation Device documentation
-	 * indicates that VCRL is mapped first with VCRH at a + 0x04
-	 * offset, the opposite seems to be true.
-	 *
-	 * Some modules (PBR and ePBR for instance) also appear to have
-	 * VCRL/VCRH flipped in the documentation, but on the SH4-202
-	 * itself it appears that these are all consistently mapped with
-	 * VCRH preceding VCRL.
-	 *
-	 * Do not trust the documentation, for it is evil.
-	 */
-	vcrh = __raw_readl(base);
-	vcrl = __raw_readl(base + sizeof(u32));
-
-	tmp = ((u64)vcrh << 32) | vcrl;
-	memcpy(vcr, &tmp, sizeof(u64));
-
-	return 0;
-}
-
-static int sh4202_write_vcr(unsigned long base, struct superhyway_vcr_info vcr)
-{
-	u64 tmp = *(u64 *)&vcr;
-
-	__raw_writel((tmp >> 32) & 0xffffffff, base);
-	__raw_writel(tmp & 0xffffffff, base + sizeof(u32));
-
-	return 0;
-}
-
-static struct superhyway_ops sh4202_superhyway_ops = {
-	.read_vcr	= sh4202_read_vcr,
-	.write_vcr	= sh4202_write_vcr,
-};
-
-struct superhyway_bus superhyway_channels[] = {
-	{ &sh4202_superhyway_ops, },
-	{ 0, },
-};
-
-int __init superhyway_scan_bus(struct superhyway_bus *bus)
-{
-	return superhyway_add_devices(bus, sh4202_devices,
-				      ARRAY_SIZE(sh4202_devices));
-}
-
diff --git a/drivers/sh/Makefile b/drivers/sh/Makefile
index c53262835e85..0aa152b1587a 100644
--- a/drivers/sh/Makefile
+++ b/drivers/sh/Makefile
@@ -7,6 +7,5 @@ ifneq ($(CONFIG_COMMON_CLK),y)
 obj-$(CONFIG_HAVE_CLK)			+= clk/
 endif
 obj-$(CONFIG_MAPLE)			+= maple/
-obj-$(CONFIG_SUPERHYWAY)		+= superhyway/
 
 obj-y					+= pm_runtime.o
diff --git a/drivers/sh/superhyway/Makefile b/drivers/sh/superhyway/Makefile
deleted file mode 100644
index 6dfa41f160af..000000000000
--- a/drivers/sh/superhyway/Makefile
+++ /dev/null
@@ -1,8 +0,0 @@
-# SPDX-License-Identifier: GPL-2.0-only
-#
-# Makefile for the SuperHyway bus drivers.
-#
-
-obj-$(CONFIG_SUPERHYWAY)	+= superhyway.o
-obj-$(CONFIG_SYSFS)		+= superhyway-sysfs.o
-
diff --git a/drivers/sh/superhyway/superhyway-sysfs.c b/drivers/sh/superhyway/superhyway-sysfs.c
deleted file mode 100644
index 774f31b564f8..000000000000
--- a/drivers/sh/superhyway/superhyway-sysfs.c
+++ /dev/null
@@ -1,54 +0,0 @@
-/*
- * drivers/sh/superhyway/superhyway-sysfs.c
- *
- * SuperHyway Bus sysfs interface
- *
- * Copyright (C) 2004, 2005  Paul Mundt <lethal@linux-sh.org>
- *
- * This file is subject to the terms and conditions of the GNU General Public
- * License.  See the file "COPYING" in the main directory of this archive
- * for more details.
- */
-#include <linux/kernel.h>
-#include <linux/device.h>
-#include <linux/types.h>
-#include <linux/superhyway.h>
-
-#define superhyway_ro_attr(name, fmt, field)				\
-static ssize_t name##_show(struct device *dev, struct device_attribute *attr, char *buf)		\
-{									\
-	struct superhyway_device *s = to_superhyway_device(dev);	\
-	return sprintf(buf, fmt, s->field);				\
-}									\
-static DEVICE_ATTR_RO(name);
-
-/* VCR flags */
-superhyway_ro_attr(perr_flags, "0x%02x\n", vcr.perr_flags);
-superhyway_ro_attr(merr_flags, "0x%02x\n", vcr.merr_flags);
-superhyway_ro_attr(mod_vers, "0x%04x\n", vcr.mod_vers);
-superhyway_ro_attr(mod_id, "0x%04x\n", vcr.mod_id);
-superhyway_ro_attr(bot_mb, "0x%02x\n", vcr.bot_mb);
-superhyway_ro_attr(top_mb, "0x%02x\n", vcr.top_mb);
-
-/* Misc */
-superhyway_ro_attr(resource, "0x%08lx\n", resource[0].start);
-
-static struct attribute *superhyway_dev_attrs[] = {
-	&dev_attr_perr_flags.attr,
-	&dev_attr_merr_flags.attr,
-	&dev_attr_mod_vers.attr,
-	&dev_attr_mod_id.attr,
-	&dev_attr_bot_mb.attr,
-	&dev_attr_top_mb.attr,
-	&dev_attr_resource.attr,
-	NULL,
-};
-
-static const struct attribute_group superhyway_dev_group = {
-	.attrs = superhyway_dev_attrs,
-};
-
-const struct attribute_group *superhyway_dev_groups[] = {
-	&superhyway_dev_group,
-	NULL,
-};
diff --git a/drivers/sh/superhyway/superhyway.c b/drivers/sh/superhyway/superhyway.c
deleted file mode 100644
index 44324abe21da..000000000000
--- a/drivers/sh/superhyway/superhyway.c
+++ /dev/null
@@ -1,234 +0,0 @@
-/*
- * drivers/sh/superhyway/superhyway.c
- *
- * SuperHyway Bus Driver
- *
- * Copyright (C) 2004, 2005  Paul Mundt <lethal@linux-sh.org>
- *
- * This file is subject to the terms and conditions of the GNU General Public
- * License.  See the file "COPYING" in the main directory of this archive
- * for more details.
- */
-#include <linux/kernel.h>
-#include <linux/device.h>
-#include <linux/init.h>
-#include <linux/module.h>
-#include <linux/types.h>
-#include <linux/list.h>
-#include <linux/superhyway.h>
-#include <linux/string.h>
-#include <linux/slab.h>
-
-static int superhyway_devices;
-
-static struct device superhyway_bus_device = {
-	.init_name = "superhyway",
-};
-
-static void superhyway_device_release(struct device *dev)
-{
-	struct superhyway_device *sdev = to_superhyway_device(dev);
-
-	kfree(sdev->resource);
-	kfree(sdev);
-}
-
-/**
- * superhyway_add_device - Add a SuperHyway module
- * @base: Physical address where module is mapped.
- * @sdev: SuperHyway device to add, or NULL to allocate a new one.
- * @bus: Bus where SuperHyway module resides.
- *
- * This is responsible for adding a new SuperHyway module. This sets up a new
- * struct superhyway_device for the module being added if @sdev == NULL.
- *
- * Devices are initially added in the order that they are scanned (from the
- * top-down of the memory map), and are assigned an ID based on the order that
- * they are added. Any manual addition of a module will thus get the ID after
- * the devices already discovered regardless of where it resides in memory.
- *
- * Further work can and should be done in superhyway_scan_bus(), to be sure
- * that any new modules are properly discovered and subsequently registered.
- */
-int superhyway_add_device(unsigned long base, struct superhyway_device *sdev,
-			  struct superhyway_bus *bus)
-{
-	struct superhyway_device *dev = sdev;
-
-	if (!dev) {
-		dev = kzalloc(sizeof(struct superhyway_device), GFP_KERNEL);
-		if (!dev)
-			return -ENOMEM;
-
-	}
-
-	dev->bus = bus;
-	superhyway_read_vcr(dev, base, &dev->vcr);
-
-	if (!dev->resource) {
-		dev->resource = kzalloc(sizeof(struct resource), GFP_KERNEL);
-		if (!dev->resource) {
-			kfree(dev);
-			return -ENOMEM;
-		}
-
-		dev->resource->name	= dev->name;
-		dev->resource->start	= base;
-		dev->resource->end	= dev->resource->start + 0x01000000;
-	}
-
-	dev->dev.parent		= &superhyway_bus_device;
-	dev->dev.bus		= &superhyway_bus_type;
-	dev->dev.release	= superhyway_device_release;
-	dev->id.id		= dev->vcr.mod_id;
-
-	sprintf(dev->name, "SuperHyway device %04x", dev->id.id);
-	dev_set_name(&dev->dev, "%02x", superhyway_devices);
-
-	superhyway_devices++;
-
-	return device_register(&dev->dev);
-}
-
-int superhyway_add_devices(struct superhyway_bus *bus,
-			   struct superhyway_device **devices,
-			   int nr_devices)
-{
-	int i, ret = 0;
-
-	for (i = 0; i < nr_devices; i++) {
-		struct superhyway_device *dev = devices[i];
-		ret |= superhyway_add_device(dev->resource[0].start, dev, bus);
-	}
-
-	return ret;
-}
-
-static int __init superhyway_init(void)
-{
-	struct superhyway_bus *bus;
-	int ret;
-
-	ret = device_register(&superhyway_bus_device);
-	if (unlikely(ret))
-		return ret;
-
-	for (bus = superhyway_channels; bus->ops; bus++)
-		ret |= superhyway_scan_bus(bus);
-
-	return ret;
-}
-postcore_initcall(superhyway_init);
-
-static const struct superhyway_device_id *
-superhyway_match_id(const struct superhyway_device_id *ids,
-		    struct superhyway_device *dev)
-{
-	while (ids->id) {
-		if (ids->id == dev->id.id)
-			return ids;
-
-		ids++;
-	}
-
-	return NULL;
-}
-
-static int superhyway_device_probe(struct device *dev)
-{
-	struct superhyway_device *shyway_dev = to_superhyway_device(dev);
-	struct superhyway_driver *shyway_drv = to_superhyway_driver(dev->driver);
-
-	if (shyway_drv && shyway_drv->probe) {
-		const struct superhyway_device_id *id;
-
-		id = superhyway_match_id(shyway_drv->id_table, shyway_dev);
-		if (id)
-			return shyway_drv->probe(shyway_dev, id);
-	}
-
-	return -ENODEV;
-}
-
-static void superhyway_device_remove(struct device *dev)
-{
-	struct superhyway_device *shyway_dev = to_superhyway_device(dev);
-	struct superhyway_driver *shyway_drv = to_superhyway_driver(dev->driver);
-
-	if (shyway_drv->remove)
-		shyway_drv->remove(shyway_dev);
-}
-
-/**
- * superhyway_register_driver - Register a new SuperHyway driver
- * @drv: SuperHyway driver to register.
- *
- * This registers the passed in @drv. Any devices matching the id table will
- * automatically be populated and handed off to the driver's specified probe
- * routine.
- */
-int superhyway_register_driver(struct superhyway_driver *drv)
-{
-	drv->drv.name	= drv->name;
-	drv->drv.bus	= &superhyway_bus_type;
-
-	return driver_register(&drv->drv);
-}
-
-/**
- * superhyway_unregister_driver - Unregister a SuperHyway driver
- * @drv: SuperHyway driver to unregister.
- *
- * This cleans up after superhyway_register_driver(), and should be invoked in
- * the exit path of any module drivers.
- */
-void superhyway_unregister_driver(struct superhyway_driver *drv)
-{
-	driver_unregister(&drv->drv);
-}
-
-static int superhyway_bus_match(struct device *dev, struct device_driver *drv)
-{
-	struct superhyway_device *shyway_dev = to_superhyway_device(dev);
-	struct superhyway_driver *shyway_drv = to_superhyway_driver(drv);
-	const struct superhyway_device_id *ids = shyway_drv->id_table;
-
-	if (!ids)
-		return -EINVAL;
-	if (superhyway_match_id(ids, shyway_dev))
-		return 1;
-
-	return -ENODEV;
-}
-
-struct bus_type superhyway_bus_type = {
-	.name		= "superhyway",
-	.match		= superhyway_bus_match,
-#ifdef CONFIG_SYSFS
-	.dev_groups	= superhyway_dev_groups,
-#endif
-	.probe		= superhyway_device_probe,
-	.remove		= superhyway_device_remove,
-};
-
-static int __init superhyway_bus_init(void)
-{
-	return bus_register(&superhyway_bus_type);
-}
-
-static void __exit superhyway_bus_exit(void)
-{
-	device_unregister(&superhyway_bus_device);
-	bus_unregister(&superhyway_bus_type);
-}
-
-core_initcall(superhyway_bus_init);
-module_exit(superhyway_bus_exit);
-
-EXPORT_SYMBOL(superhyway_bus_type);
-EXPORT_SYMBOL(superhyway_add_device);
-EXPORT_SYMBOL(superhyway_add_devices);
-EXPORT_SYMBOL(superhyway_register_driver);
-EXPORT_SYMBOL(superhyway_unregister_driver);
-
-MODULE_LICENSE("GPL");
diff --git a/include/linux/superhyway.h b/include/linux/superhyway.h
deleted file mode 100644
index 8d3376775813..000000000000
--- a/include/linux/superhyway.h
+++ /dev/null
@@ -1,107 +0,0 @@
-/*
- * include/linux/superhyway.h
- *
- * SuperHyway Bus definitions
- *
- * Copyright (C) 2004, 2005  Paul Mundt <lethal@linux-sh.org>
- *
- * This file is subject to the terms and conditions of the GNU General Public
- * License.  See the file "COPYING" in the main directory of this archive
- * for more details.
- */
-#ifndef __LINUX_SUPERHYWAY_H
-#define __LINUX_SUPERHYWAY_H
-
-#include <linux/device.h>
-
-/*
- * SuperHyway IDs
- */
-#define SUPERHYWAY_DEVICE_ID_SH5_DMAC	0x0183
-
-struct superhyway_vcr_info {
-	u8	perr_flags;	/* P-port Error flags */
-	u8	merr_flags;	/* Module Error flags */
-	u16	mod_vers;	/* Module Version */
-	u16	mod_id;		/* Module ID */
-	u8	bot_mb;		/* Bottom Memory block */
-	u8	top_mb;		/* Top Memory block */
-};
-
-struct superhyway_ops {
-	int (*read_vcr)(unsigned long base, struct superhyway_vcr_info *vcr);
-	int (*write_vcr)(unsigned long base, struct superhyway_vcr_info vcr);
-};
-
-struct superhyway_bus {
-	struct superhyway_ops *ops;
-};
-
-extern struct superhyway_bus superhyway_channels[];
-
-struct superhyway_device_id {
-	unsigned int id;
-	unsigned long driver_data;
-};
-
-struct superhyway_device;
-extern struct bus_type superhyway_bus_type;
-
-struct superhyway_driver {
-	char *name;
-
-	const struct superhyway_device_id *id_table;
-	struct device_driver drv;
-
-	int (*probe)(struct superhyway_device *dev, const struct superhyway_device_id *id);
-	void (*remove)(struct superhyway_device *dev);
-};
-
-#define to_superhyway_driver(d)	container_of((d), struct superhyway_driver, drv)
-
-struct superhyway_device {
-	char name[32];
-
-	struct device dev;
-
-	struct superhyway_device_id id;
-	struct superhyway_driver *drv;
-	struct superhyway_bus *bus;
-
-	int num_resources;
-	struct resource *resource;
-	struct superhyway_vcr_info vcr;
-};
-
-#define to_superhyway_device(d)	container_of((d), struct superhyway_device, dev)
-
-#define superhyway_get_drvdata(d)	dev_get_drvdata(&(d)->dev)
-#define superhyway_set_drvdata(d,p)	dev_set_drvdata(&(d)->dev, (p))
-
-static inline int
-superhyway_read_vcr(struct superhyway_device *dev, unsigned long base,
-		    struct superhyway_vcr_info *vcr)
-{
-	return dev->bus->ops->read_vcr(base, vcr);
-}
-
-static inline int
-superhyway_write_vcr(struct superhyway_device *dev, unsigned long base,
-		     struct superhyway_vcr_info vcr)
-{
-	return dev->bus->ops->write_vcr(base, vcr);
-}
-
-extern int superhyway_scan_bus(struct superhyway_bus *);
-
-/* drivers/sh/superhyway/superhyway.c */
-int superhyway_register_driver(struct superhyway_driver *);
-void superhyway_unregister_driver(struct superhyway_driver *);
-int superhyway_add_device(unsigned long base, struct superhyway_device *, struct superhyway_bus *);
-int superhyway_add_devices(struct superhyway_bus *bus, struct superhyway_device **devices, int nr_devices);
-
-/* drivers/sh/superhyway/superhyway-sysfs.c */
-extern const struct attribute_group *superhyway_dev_groups[];
-
-#endif /* __LINUX_SUPERHYWAY_H */
-
-- 
cgit v1.2.3


From df8b030d82dd1224881acb4e778fa06c4824f72b Mon Sep 17 00:00:00 2001
From: Thomas Weißschuh <linux@weissschuh.net>
Date: Thu, 12 Oct 2023 12:23:38 +0200
Subject: HID: core: remove #ifdef CONFIG_PM from hid_driver
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Allow HID drivers to pass ->suspend, ->resume and ->reset_resume via
pm_ptr().
Through the usage of pm_ptr() the CONFIG_PM-dependent code will always be
compiled, protecting against bitrot.
The linker will then garbage-collect the unused function avoiding any overhead.

The only overhead in the final kernel image and at runtime are a few
extra bytes in 'struct hid_driver'.

The same approach is chosen by 'struct usb_driver' and other subsystems.

Signed-off-by: Thomas Weißschuh <linux@weissschuh.net>
Link: https://lore.kernel.org/r/20231012-hid-pm_ptr-v1-1-0a71531ca93b@weissschuh.net
Signed-off-by: Benjamin Tissoires <bentiss@kernel.org>
---
 include/linux/hid.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/hid.h b/include/linux/hid.h
index 964ca1f15e3f..5a8387a4a712 100644
--- a/include/linux/hid.h
+++ b/include/linux/hid.h
@@ -833,11 +833,11 @@ struct hid_driver {
 	void (*feature_mapping)(struct hid_device *hdev,
 			struct hid_field *field,
 			struct hid_usage *usage);
-#ifdef CONFIG_PM
+
 	int (*suspend)(struct hid_device *hdev, pm_message_t message);
 	int (*resume)(struct hid_device *hdev);
 	int (*reset_resume)(struct hid_device *hdev);
-#endif
+
 /* private: */
 	struct device_driver driver;
 };
-- 
cgit v1.2.3


From 3de6047f1832010d24963f58206298fc75085816 Mon Sep 17 00:00:00 2001
From: "Matthew Wilcox (Oracle)" <willy@infradead.org>
Date: Thu, 21 Sep 2023 21:07:38 +0100
Subject: highmem: Add folio_release_kmap()

This is the folio equivalent of unmap_and_put_page(), which remains as
a wrapper for it.

Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Signed-off-by: Jan Kara <jack@suse.cz>
Message-Id: <20230921200746.3303942-1-willy@infradead.org>
---
 include/linux/highmem.h | 18 ++++++++++++++++--
 1 file changed, 16 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/highmem.h b/include/linux/highmem.h
index 99c474de800d..4cacc0e43b51 100644
--- a/include/linux/highmem.h
+++ b/include/linux/highmem.h
@@ -551,10 +551,24 @@ static inline void folio_zero_range(struct folio *folio,
 	zero_user_segments(&folio->page, start, start + length, 0, 0);
 }
 
-static inline void unmap_and_put_page(struct page *page, void *addr)
+/**
+ * folio_release_kmap - Unmap a folio and drop a refcount.
+ * @folio: The folio to release.
+ * @addr: The address previously returned by a call to kmap_local_folio().
+ *
+ * It is common, eg in directory handling to kmap a folio.  This function
+ * unmaps the folio and drops the refcount that was being held to keep the
+ * folio alive while we accessed it.
+ */
+static inline void folio_release_kmap(struct folio *folio, void *addr)
 {
 	kunmap_local(addr);
-	put_page(page);
+	folio_put(folio);
+}
+
+static inline void unmap_and_put_page(struct page *page, void *addr)
+{
+	folio_release_kmap(page_folio(page), addr);
 }
 
 #endif /* _LINUX_HIGHMEM_H */
-- 
cgit v1.2.3


From 61d4fb0b349ec1b33119913c3b0bd109de30142c Mon Sep 17 00:00:00 2001
From: Christian Brauner <brauner@kernel.org>
Date: Wed, 25 Oct 2023 12:14:37 +0200
Subject: file, i915: fix file reference for mmap_singleton()

Today we got a report at [1] for rcu stalls on the i915 testsuite in [2]
due to the conversion of files to SLAB_TYPSSAFE_BY_RCU. Afaict,
get_file_rcu() goes into an infinite loop trying to carefully verify
that i915->gem.mmap_singleton hasn't changed - see the splat below.

So I stared at this code to figure out what it actually does. It seems
that the i915->gem.mmap_singleton pointer itself never had rcu semantics.

The i915->gem.mmap_singleton is replaced in
file->f_op->release::singleton_release():

        static int singleton_release(struct inode *inode, struct file *file)
        {
                struct drm_i915_private *i915 = file->private_data;

                cmpxchg(&i915->gem.mmap_singleton, file, NULL);
                drm_dev_put(&i915->drm);

                return 0;
        }

The cmpxchg() is ordered against a concurrent update of
i915->gem.mmap_singleton from mmap_singleton(). IOW, when
mmap_singleton() fails to get a reference on i915->gem.mmap_singleton:

While mmap_singleton() does

        rcu_read_lock();
        file = get_file_rcu(&i915->gem.mmap_singleton);
        rcu_read_unlock();

it allocates a new file via anon_inode_getfile() and does

        smp_store_mb(i915->gem.mmap_singleton, file);

So, then what happens in the case of this bug is that at some point
fput() is called and drops the file->f_count to zero leaving the pointer
in i915->gem.mmap_singleton in tact.

Now, there might be delays until
file->f_op->release::singleton_release() is called and
i915->gem.mmap_singleton is set to NULL.

Say concurrently another task hits mmap_singleton() and does:

        rcu_read_lock();
        file = get_file_rcu(&i915->gem.mmap_singleton);
        rcu_read_unlock();

When get_file_rcu() fails to get a reference via atomic_inc_not_zero()
it will try the reload from i915->gem.mmap_singleton expecting it to be
NULL, assuming it has comparable semantics as we expect in
__fget_files_rcu().

But it hasn't so it reloads the same pointer again, trying the same
atomic_inc_not_zero() again and doing so until
file->f_op->release::singleton_release() of the old file has been
called.

So, in contrast to __fget_files_rcu() here we want to not retry when
atomic_inc_not_zero() has failed. We only want to retry in case we
managed to get a reference but the pointer did change on reload.

<3> [511.395679] rcu: INFO: rcu_preempt detected stalls on CPUs/tasks:
<3> [511.395716] rcu:   Tasks blocked on level-1 rcu_node (CPUs 0-9): P6238
<3> [511.395934] rcu:   (detected by 16, t=65002 jiffies, g=123977, q=439 ncpus=20)
<6> [511.395944] task:i915_selftest   state:R  running task     stack:10568 pid:6238  tgid:6238  ppid:1001   flags:0x00004002
<6> [511.395962] Call Trace:
<6> [511.395966]  <TASK>
<6> [511.395974]  ? __schedule+0x3a8/0xd70
<6> [511.395995]  ? asm_sysvec_apic_timer_interrupt+0x1a/0x20
<6> [511.396003]  ? lockdep_hardirqs_on+0xc3/0x140
<6> [511.396013]  ? asm_sysvec_apic_timer_interrupt+0x1a/0x20
<6> [511.396029]  ? get_file_rcu+0x10/0x30
<6> [511.396039]  ? get_file_rcu+0x10/0x30
<6> [511.396046]  ? i915_gem_object_mmap+0xbc/0x450 [i915]
<6> [511.396509]  ? i915_gem_mmap+0x272/0x480 [i915]
<6> [511.396903]  ? mmap_region+0x253/0xb60
<6> [511.396925]  ? do_mmap+0x334/0x5c0
<6> [511.396939]  ? vm_mmap_pgoff+0x9f/0x1c0
<6> [511.396949]  ? rcu_is_watching+0x11/0x50
<6> [511.396962]  ? igt_mmap_offset+0xfc/0x110 [i915]
<6> [511.397376]  ? __igt_mmap+0xb3/0x570 [i915]
<6> [511.397762]  ? igt_mmap+0x11e/0x150 [i915]
<6> [511.398139]  ? __trace_bprintk+0x76/0x90
<6> [511.398156]  ? __i915_subtests+0xbf/0x240 [i915]
<6> [511.398586]  ? __pfx___i915_live_setup+0x10/0x10 [i915]
<6> [511.399001]  ? __pfx___i915_live_teardown+0x10/0x10 [i915]
<6> [511.399433]  ? __run_selftests+0xbc/0x1a0 [i915]
<6> [511.399875]  ? i915_live_selftests+0x4b/0x90 [i915]
<6> [511.400308]  ? i915_pci_probe+0x106/0x200 [i915]
<6> [511.400692]  ? pci_device_probe+0x95/0x120
<6> [511.400704]  ? really_probe+0x164/0x3c0
<6> [511.400715]  ? __pfx___driver_attach+0x10/0x10
<6> [511.400722]  ? __driver_probe_device+0x73/0x160
<6> [511.400731]  ? driver_probe_device+0x19/0xa0
<6> [511.400741]  ? __driver_attach+0xb6/0x180
<6> [511.400749]  ? __pfx___driver_attach+0x10/0x10
<6> [511.400756]  ? bus_for_each_dev+0x77/0xd0
<6> [511.400770]  ? bus_add_driver+0x114/0x210
<6> [511.400781]  ? driver_register+0x5b/0x110
<6> [511.400791]  ? i915_init+0x23/0xc0 [i915]
<6> [511.401153]  ? __pfx_i915_init+0x10/0x10 [i915]
<6> [511.401503]  ? do_one_initcall+0x57/0x270
<6> [511.401515]  ? rcu_is_watching+0x11/0x50
<6> [511.401521]  ? kmalloc_trace+0xa3/0xb0
<6> [511.401532]  ? do_init_module+0x5f/0x210
<6> [511.401544]  ? load_module+0x1d00/0x1f60
<6> [511.401581]  ? init_module_from_file+0x86/0xd0
<6> [511.401590]  ? init_module_from_file+0x86/0xd0
<6> [511.401613]  ? idempotent_init_module+0x17c/0x230
<6> [511.401639]  ? __x64_sys_finit_module+0x56/0xb0
<6> [511.401650]  ? do_syscall_64+0x3c/0x90
<6> [511.401659]  ? entry_SYSCALL_64_after_hwframe+0x6e/0xd8
<6> [511.401684]  </TASK>

Link: [1]: https://lore.kernel.org/intel-gfx/SJ1PR11MB6129CB39EED831784C331BAFB9DEA@SJ1PR11MB6129.namprd11.prod.outlook.com
Link: [2]: https://intel-gfx-ci.01.org/tree/linux-next/next-20231013/bat-dg2-11/igt@i915_selftest@live@mman.html#dmesg-warnings10963
Cc: Jann Horn <jannh@google.com>,
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Link: https://lore.kernel.org/r/20231025-formfrage-watscheln-84526cd3bd7d@brauner
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 drivers/gpu/drm/i915/gem/i915_gem_mman.c |  4 +---
 fs/file.c                                | 25 +++++++++++++++++++++++++
 include/linux/fs.h                       |  1 +
 3 files changed, 27 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/gpu/drm/i915/gem/i915_gem_mman.c b/drivers/gpu/drm/i915/gem/i915_gem_mman.c
index 97bf10861cad..72b353737334 100644
--- a/drivers/gpu/drm/i915/gem/i915_gem_mman.c
+++ b/drivers/gpu/drm/i915/gem/i915_gem_mman.c
@@ -915,9 +915,7 @@ static struct file *mmap_singleton(struct drm_i915_private *i915)
 {
 	struct file *file;
 
-	rcu_read_lock();
-	file = get_file_rcu(&i915->gem.mmap_singleton);
-	rcu_read_unlock();
+	file = get_file_active(&i915->gem.mmap_singleton);
 	if (file)
 		return file;
 
diff --git a/fs/file.c b/fs/file.c
index 1a475d7d636e..5fb0b146e79e 100644
--- a/fs/file.c
+++ b/fs/file.c
@@ -927,6 +927,31 @@ struct file *get_file_rcu(struct file __rcu **f)
 }
 EXPORT_SYMBOL_GPL(get_file_rcu);
 
+/**
+ * get_file_active - try go get a reference to a file
+ * @f: the file to get a reference on
+ *
+ * In contast to get_file_rcu() the pointer itself isn't part of the
+ * reference counting.
+ *
+ * This function should rarely have to be used and only by users who
+ * understand the implications of SLAB_TYPESAFE_BY_RCU. Try to avoid it.
+ *
+ * Return: Returns @f with the reference count increased or NULL.
+ */
+struct file *get_file_active(struct file **f)
+{
+	struct file __rcu *file;
+
+	rcu_read_lock();
+	file = __get_file_rcu(f);
+	rcu_read_unlock();
+	if (IS_ERR(file))
+		file = NULL;
+	return file;
+}
+EXPORT_SYMBOL_GPL(get_file_active);
+
 static inline struct file *__fget_files_rcu(struct files_struct *files,
        unsigned int fd, fmode_t mask)
 {
diff --git a/include/linux/fs.h b/include/linux/fs.h
index cb8bfa1f8ecb..b35815277cc6 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1044,6 +1044,7 @@ static inline struct file *get_file(struct file *f)
 }
 
 struct file *get_file_rcu(struct file __rcu **f);
+struct file *get_file_active(struct file **f);
 
 #define file_count(x)	atomic_long_read(&(x)->f_count)
 
-- 
cgit v1.2.3


From 3decb8564eff88a2533f83b01cec2cf9259c3eaf Mon Sep 17 00:00:00 2001
From: "Matthew Wilcox (Oracle)" <willy@infradead.org>
Date: Mon, 16 Oct 2023 21:10:49 +0100
Subject: buffer: make folio_create_empty_buffers() return a buffer_head

Patch series "Finish the create_empty_buffers() transition", v2.

Pankaj recently added folio_create_empty_buffers() as the folio equivalent
to create_empty_buffers().  This patch set finishes the conversion by
first converting all remaining filesystems to call
folio_create_empty_buffers(), then renaming it back to
create_empty_buffers().  I took the opportunity to make a few
simplifications like making folio_create_empty_buffers() return the head
buffer and extracting get_nth_bh() from nilfs2.

A few of the patches in this series aren't directly related to
create_empty_buffers(), but I saw them while I was working on this and
thought they'd be easy enough to add to this series.  Compile-tested only,
other than ext4.


This patch (of 26):

Almost all callers want to know the first BH that was allocated for this
folio.  We already have that handy, so return it.

Link: https://lkml.kernel.org/r/20231016201114.1928083-1-willy@infradead.org
Link: https://lkml.kernel.org/r/20231016201114.1928083-3-willy@infradead.org
Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Reviewed-by: Pankaj Raghav <p.raghav@samsung.com>
Cc: Andreas Gruenbacher <agruenba@redhat.com>
Cc: Ryusuke Konishi <konishi.ryusuke@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 fs/buffer.c                 | 24 +++++++++++++-----------
 include/linux/buffer_head.h |  4 ++--
 2 files changed, 15 insertions(+), 13 deletions(-)

(limited to 'include/linux')

diff --git a/fs/buffer.c b/fs/buffer.c
index 86ddfa52c699..bfa7604d1891 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -1641,8 +1641,8 @@ EXPORT_SYMBOL(block_invalidate_folio);
  * block_dirty_folio() via private_lock.  try_to_free_buffers
  * is already excluded via the folio lock.
  */
-void folio_create_empty_buffers(struct folio *folio, unsigned long blocksize,
-				unsigned long b_state)
+struct buffer_head *folio_create_empty_buffers(struct folio *folio,
+		unsigned long blocksize, unsigned long b_state)
 {
 	struct buffer_head *bh, *head, *tail;
 	gfp_t gfp = GFP_NOFS | __GFP_ACCOUNT | __GFP_NOFAIL;
@@ -1669,6 +1669,8 @@ void folio_create_empty_buffers(struct folio *folio, unsigned long blocksize,
 	}
 	folio_attach_private(folio, head);
 	spin_unlock(&folio->mapping->private_lock);
+
+	return head;
 }
 EXPORT_SYMBOL(folio_create_empty_buffers);
 
@@ -1770,13 +1772,15 @@ static struct buffer_head *folio_create_buffers(struct folio *folio,
 						struct inode *inode,
 						unsigned int b_state)
 {
+	struct buffer_head *bh;
+
 	BUG_ON(!folio_test_locked(folio));
 
-	if (!folio_buffers(folio))
-		folio_create_empty_buffers(folio,
-					   1 << READ_ONCE(inode->i_blkbits),
-					   b_state);
-	return folio_buffers(folio);
+	bh = folio_buffers(folio);
+	if (!bh)
+		bh = folio_create_empty_buffers(folio,
+				1 << READ_ONCE(inode->i_blkbits), b_state);
+	return bh;
 }
 
 /*
@@ -2676,10 +2680,8 @@ int block_truncate_page(struct address_space *mapping,
 		return PTR_ERR(folio);
 
 	bh = folio_buffers(folio);
-	if (!bh) {
-		folio_create_empty_buffers(folio, blocksize, 0);
-		bh = folio_buffers(folio);
-	}
+	if (!bh)
+		bh = folio_create_empty_buffers(folio, blocksize, 0);
 
 	/* Find the buffer that contains "offset" */
 	offset = offset_in_folio(folio, from);
diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h
index 3dc4720e4773..1001244a8941 100644
--- a/include/linux/buffer_head.h
+++ b/include/linux/buffer_head.h
@@ -203,8 +203,8 @@ struct buffer_head *alloc_page_buffers(struct page *page, unsigned long size,
 		bool retry);
 void create_empty_buffers(struct page *, unsigned long,
 			unsigned long b_state);
-void folio_create_empty_buffers(struct folio *folio, unsigned long blocksize,
-				unsigned long b_state);
+struct buffer_head *folio_create_empty_buffers(struct folio *folio,
+		unsigned long blocksize, unsigned long b_state);
 void end_buffer_read_sync(struct buffer_head *bh, int uptodate);
 void end_buffer_write_sync(struct buffer_head *bh, int uptodate);
 void end_buffer_async_write(struct buffer_head *bh, int uptodate);
-- 
cgit v1.2.3


From 0217fbb0271a7c78e16629cf45375916ec2eb35f Mon Sep 17 00:00:00 2001
From: "Matthew Wilcox (Oracle)" <willy@infradead.org>
Date: Mon, 16 Oct 2023 21:10:52 +0100
Subject: buffer: add get_nth_bh()

Extract this useful helper from nilfs_page_get_nth_block()

Link: https://lkml.kernel.org/r/20231016201114.1928083-6-willy@infradead.org
Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Acked-by: Ryusuke Konishi <konishi.ryusuke@gmail.com>
Cc: Andreas Gruenbacher <agruenba@redhat.com>
Cc: Pankaj Raghav <p.raghav@samsung.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 fs/nilfs2/page.h            |  7 +------
 include/linux/buffer_head.h | 22 ++++++++++++++++++++++
 2 files changed, 23 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/fs/nilfs2/page.h b/fs/nilfs2/page.h
index 21ddcdd4d63e..344d71942d36 100644
--- a/fs/nilfs2/page.h
+++ b/fs/nilfs2/page.h
@@ -55,12 +55,7 @@ unsigned long nilfs_find_uncommitted_extent(struct inode *inode,
 static inline struct buffer_head *
 nilfs_page_get_nth_block(struct page *page, unsigned int count)
 {
-	struct buffer_head *bh = page_buffers(page);
-
-	while (count-- > 0)
-		bh = bh->b_this_page;
-	get_bh(bh);
-	return bh;
+	return get_nth_bh(page_buffers(page), count);
 }
 
 #endif /* _NILFS_PAGE_H */
diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h
index 1001244a8941..3d85a0cf0ca5 100644
--- a/include/linux/buffer_head.h
+++ b/include/linux/buffer_head.h
@@ -457,6 +457,28 @@ __bread(struct block_device *bdev, sector_t block, unsigned size)
 	return __bread_gfp(bdev, block, size, __GFP_MOVABLE);
 }
 
+/**
+ * get_nth_bh - Get a reference on the n'th buffer after this one.
+ * @bh: The buffer to start counting from.
+ * @count: How many buffers to skip.
+ *
+ * This is primarily useful for finding the nth buffer in a folio; in
+ * that case you pass the head buffer and the byte offset in the folio
+ * divided by the block size.  It can be used for other purposes, but
+ * it will wrap at the end of the folio rather than returning NULL or
+ * proceeding to the next folio for you.
+ *
+ * Return: The requested buffer with an elevated refcount.
+ */
+static inline __must_check
+struct buffer_head *get_nth_bh(struct buffer_head *bh, unsigned int count)
+{
+	while (count--)
+		bh = bh->b_this_page;
+	get_bh(bh);
+	return bh;
+}
+
 bool block_dirty_folio(struct address_space *mapping, struct folio *folio);
 
 #ifdef CONFIG_BUFFER_HEAD
-- 
cgit v1.2.3


From 0a88810d9b76e6ecfd234f5728e27344a39e3ff8 Mon Sep 17 00:00:00 2001
From: "Matthew Wilcox (Oracle)" <willy@infradead.org>
Date: Mon, 16 Oct 2023 21:11:14 +0100
Subject: buffer: remove folio_create_empty_buffers()

With all users converted, remove the old create_empty_buffers() and rename
folio_create_empty_buffers() to create_empty_buffers().

Link: https://lkml.kernel.org/r/20231016201114.1928083-28-willy@infradead.org
Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Andreas Gruenbacher <agruenba@redhat.com>
Cc: Pankaj Raghav <p.raghav@samsung.com>
Cc: Ryusuke Konishi <konishi.ryusuke@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 fs/buffer.c                 | 13 +++----------
 fs/ext4/inode.c             |  6 +++---
 fs/ext4/move_extent.c       |  4 ++--
 fs/gfs2/aops.c              |  2 +-
 fs/gfs2/bmap.c              |  2 +-
 fs/gfs2/meta_io.c           |  2 +-
 fs/gfs2/quota.c             |  2 +-
 fs/mpage.c                  |  2 +-
 fs/nilfs2/mdt.c             |  2 +-
 fs/nilfs2/page.c            |  4 ++--
 fs/nilfs2/segment.c         |  2 +-
 fs/ntfs/aops.c              |  4 ++--
 fs/ntfs/file.c              |  2 +-
 fs/ntfs3/file.c             |  2 +-
 fs/ocfs2/aops.c             |  2 +-
 fs/reiserfs/inode.c         |  2 +-
 fs/ufs/util.c               |  2 +-
 include/linux/buffer_head.h |  4 +---
 18 files changed, 25 insertions(+), 34 deletions(-)

(limited to 'include/linux')

diff --git a/fs/buffer.c b/fs/buffer.c
index bfa7604d1891..657a62bab73d 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -1641,7 +1641,7 @@ EXPORT_SYMBOL(block_invalidate_folio);
  * block_dirty_folio() via private_lock.  try_to_free_buffers
  * is already excluded via the folio lock.
  */
-struct buffer_head *folio_create_empty_buffers(struct folio *folio,
+struct buffer_head *create_empty_buffers(struct folio *folio,
 		unsigned long blocksize, unsigned long b_state)
 {
 	struct buffer_head *bh, *head, *tail;
@@ -1672,13 +1672,6 @@ struct buffer_head *folio_create_empty_buffers(struct folio *folio,
 
 	return head;
 }
-EXPORT_SYMBOL(folio_create_empty_buffers);
-
-void create_empty_buffers(struct page *page,
-			unsigned long blocksize, unsigned long b_state)
-{
-	folio_create_empty_buffers(page_folio(page), blocksize, b_state);
-}
 EXPORT_SYMBOL(create_empty_buffers);
 
 /**
@@ -1778,7 +1771,7 @@ static struct buffer_head *folio_create_buffers(struct folio *folio,
 
 	bh = folio_buffers(folio);
 	if (!bh)
-		bh = folio_create_empty_buffers(folio,
+		bh = create_empty_buffers(folio,
 				1 << READ_ONCE(inode->i_blkbits), b_state);
 	return bh;
 }
@@ -2681,7 +2674,7 @@ int block_truncate_page(struct address_space *mapping,
 
 	bh = folio_buffers(folio);
 	if (!bh)
-		bh = folio_create_empty_buffers(folio, blocksize, 0);
+		bh = create_empty_buffers(folio, blocksize, 0);
 
 	/* Find the buffer that contains "offset" */
 	offset = offset_in_folio(folio, from);
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 8e431ff2fd95..347fc8986e93 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -1021,7 +1021,7 @@ static int ext4_block_write_begin(struct folio *folio, loff_t pos, unsigned len,
 
 	head = folio_buffers(folio);
 	if (!head)
-		head = folio_create_empty_buffers(folio, blocksize, 0);
+		head = create_empty_buffers(folio, blocksize, 0);
 	bbits = ilog2(blocksize);
 	block = (sector_t)folio->index << (PAGE_SHIFT - bbits);
 
@@ -1151,7 +1151,7 @@ retry_grab:
 	 * starting the handle.
 	 */
 	if (!folio_buffers(folio))
-		folio_create_empty_buffers(folio, inode->i_sb->s_blocksize, 0);
+		create_empty_buffers(folio, inode->i_sb->s_blocksize, 0);
 
 	folio_unlock(folio);
 
@@ -3642,7 +3642,7 @@ static int __ext4_block_zero_page_range(handle_t *handle,
 
 	bh = folio_buffers(folio);
 	if (!bh)
-		bh = folio_create_empty_buffers(folio, blocksize, 0);
+		bh = create_empty_buffers(folio, blocksize, 0);
 
 	/* Find the buffer that contains "offset" */
 	pos = blocksize;
diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c
index 7fe448fb948b..3aa57376d9c2 100644
--- a/fs/ext4/move_extent.c
+++ b/fs/ext4/move_extent.c
@@ -184,7 +184,7 @@ mext_page_mkuptodate(struct folio *folio, unsigned from, unsigned to)
 	blocksize = i_blocksize(inode);
 	head = folio_buffers(folio);
 	if (!head)
-		head = folio_create_empty_buffers(folio, blocksize, 0);
+		head = create_empty_buffers(folio, blocksize, 0);
 
 	block = (sector_t)folio->index << (PAGE_SHIFT - inode->i_blkbits);
 	for (bh = head, block_start = 0; bh != head || !block_start;
@@ -380,7 +380,7 @@ data_copy:
 	 * but keeping in mind that i_size will not change */
 	bh = folio_buffers(folio[0]);
 	if (!bh)
-		bh = folio_create_empty_buffers(folio[0],
+		bh = create_empty_buffers(folio[0],
 				1 << orig_inode->i_blkbits, 0);
 	for (i = 0; i < data_offset_in_page; i++)
 		bh = bh->b_this_page;
diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c
index c26d48355cc2..6b060fc9e260 100644
--- a/fs/gfs2/aops.c
+++ b/fs/gfs2/aops.c
@@ -130,7 +130,7 @@ static int __gfs2_jdata_write_folio(struct folio *folio,
 	if (folio_test_checked(folio)) {
 		folio_clear_checked(folio);
 		if (!folio_buffers(folio)) {
-			folio_create_empty_buffers(folio,
+			create_empty_buffers(folio,
 					inode->i_sb->s_blocksize,
 					BIT(BH_Dirty)|BIT(BH_Uptodate));
 		}
diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c
index 247d2c16593c..f1eee3f4704b 100644
--- a/fs/gfs2/bmap.c
+++ b/fs/gfs2/bmap.c
@@ -71,7 +71,7 @@ static int gfs2_unstuffer_folio(struct gfs2_inode *ip, struct buffer_head *dibh,
 		struct buffer_head *bh = folio_buffers(folio);
 
 		if (!bh)
-			bh = folio_create_empty_buffers(folio,
+			bh = create_empty_buffers(folio,
 				BIT(inode->i_blkbits), BIT(BH_Uptodate));
 
 		if (!buffer_mapped(bh))
diff --git a/fs/gfs2/meta_io.c b/fs/gfs2/meta_io.c
index f6d40d51f5ed..25ceb0805df2 100644
--- a/fs/gfs2/meta_io.c
+++ b/fs/gfs2/meta_io.c
@@ -134,7 +134,7 @@ struct buffer_head *gfs2_getbuf(struct gfs2_glock *gl, u64 blkno, int create)
 				mapping_gfp_mask(mapping) | __GFP_NOFAIL);
 		bh = folio_buffers(folio);
 		if (!bh)
-			bh = folio_create_empty_buffers(folio,
+			bh = create_empty_buffers(folio,
 				sdp->sd_sb.sb_bsize, 0);
 	} else {
 		folio = __filemap_get_folio(mapping, index,
diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c
index 25354278cecb..2f1328af34f4 100644
--- a/fs/gfs2/quota.c
+++ b/fs/gfs2/quota.c
@@ -763,7 +763,7 @@ static int gfs2_write_buf_to_page(struct gfs2_sbd *sdp, unsigned long index,
 		return PTR_ERR(folio);
 	bh = folio_buffers(folio);
 	if (!bh)
-		bh = folio_create_empty_buffers(folio, bsize, 0);
+		bh = create_empty_buffers(folio, bsize, 0);
 
 	for (;;) {
 		/* Find the beginning block within the folio */
diff --git a/fs/mpage.c b/fs/mpage.c
index 964a6efe594d..ffb064ed9d04 100644
--- a/fs/mpage.c
+++ b/fs/mpage.c
@@ -119,7 +119,7 @@ static void map_buffer_to_folio(struct folio *folio, struct buffer_head *bh,
 			folio_mark_uptodate(folio);
 			return;
 		}
-		head = folio_create_empty_buffers(folio, i_blocksize(inode), 0);
+		head = create_empty_buffers(folio, i_blocksize(inode), 0);
 	}
 
 	page_bh = head;
diff --git a/fs/nilfs2/mdt.c b/fs/nilfs2/mdt.c
index 7b754e6494d7..c97c77a39668 100644
--- a/fs/nilfs2/mdt.c
+++ b/fs/nilfs2/mdt.c
@@ -568,7 +568,7 @@ int nilfs_mdt_freeze_buffer(struct inode *inode, struct buffer_head *bh)
 
 	bh_frozen = folio_buffers(folio);
 	if (!bh_frozen)
-		bh_frozen = folio_create_empty_buffers(folio, 1 << blkbits, 0);
+		bh_frozen = create_empty_buffers(folio, 1 << blkbits, 0);
 
 	bh_frozen = get_nth_bh(bh_frozen, bh_offset(bh) >> blkbits);
 
diff --git a/fs/nilfs2/page.c b/fs/nilfs2/page.c
index 696215d899bf..06b04758f289 100644
--- a/fs/nilfs2/page.c
+++ b/fs/nilfs2/page.c
@@ -34,7 +34,7 @@ static struct buffer_head *__nilfs_get_folio_block(struct folio *folio,
 	struct buffer_head *bh = folio_buffers(folio);
 
 	if (!bh)
-		bh = folio_create_empty_buffers(folio, 1 << blkbits, b_state);
+		bh = create_empty_buffers(folio, 1 << blkbits, b_state);
 
 	first_block = (unsigned long)index << (PAGE_SHIFT - blkbits);
 	bh = get_nth_bh(bh, block - first_block);
@@ -204,7 +204,7 @@ static void nilfs_copy_folio(struct folio *dst, struct folio *src,
 	sbh = folio_buffers(src);
 	dbh = folio_buffers(dst);
 	if (!dbh)
-		dbh = folio_create_empty_buffers(dst, sbh->b_size, 0);
+		dbh = create_empty_buffers(dst, sbh->b_size, 0);
 
 	if (copy_dirty)
 		mask |= BIT(BH_Dirty);
diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c
index 94388fe83cf8..55e31cc903d1 100644
--- a/fs/nilfs2/segment.c
+++ b/fs/nilfs2/segment.c
@@ -732,7 +732,7 @@ static size_t nilfs_lookup_dirty_data_buffers(struct inode *inode,
 		}
 		head = folio_buffers(folio);
 		if (!head)
-			head = folio_create_empty_buffers(folio,
+			head = create_empty_buffers(folio,
 					i_blocksize(inode), 0);
 		folio_unlock(folio);
 
diff --git a/fs/ntfs/aops.c b/fs/ntfs/aops.c
index c4426992a2ee..71e31e789b29 100644
--- a/fs/ntfs/aops.c
+++ b/fs/ntfs/aops.c
@@ -189,7 +189,7 @@ static int ntfs_read_block(struct folio *folio)
 
 	head = folio_buffers(folio);
 	if (!head)
-		head = folio_create_empty_buffers(folio, blocksize, 0);
+		head = create_empty_buffers(folio, blocksize, 0);
 	bh = head;
 
 	/*
@@ -555,7 +555,7 @@ static int ntfs_write_block(struct folio *folio, struct writeback_control *wbc)
 	head = folio_buffers(folio);
 	if (!head) {
 		BUG_ON(!folio_test_uptodate(folio));
-		head = folio_create_empty_buffers(folio, blocksize,
+		head = create_empty_buffers(folio, blocksize,
 				(1 << BH_Uptodate) | (1 << BH_Dirty));
 	}
 	bh = head;
diff --git a/fs/ntfs/file.c b/fs/ntfs/file.c
index 099141d20db6..297c0b9db621 100644
--- a/fs/ntfs/file.c
+++ b/fs/ntfs/file.c
@@ -625,7 +625,7 @@ do_next_folio:
 		 * create_empty_buffers() will create uptodate/dirty
 		 * buffers if the folio is uptodate/dirty.
 		 */
-		head = folio_create_empty_buffers(folio, blocksize, 0);
+		head = create_empty_buffers(folio, blocksize, 0);
 	bh = head;
 	do {
 		VCN cdelta;
diff --git a/fs/ntfs3/file.c b/fs/ntfs3/file.c
index a003a69091a2..66fd4ac28395 100644
--- a/fs/ntfs3/file.c
+++ b/fs/ntfs3/file.c
@@ -203,7 +203,7 @@ static int ntfs_zero_range(struct inode *inode, u64 vbo, u64 vbo_to)
 
 		head = folio_buffers(folio);
 		if (!head)
-			head = folio_create_empty_buffers(folio, blocksize, 0);
+			head = create_empty_buffers(folio, blocksize, 0);
 
 		bh = head;
 		bh_off = 0;
diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index 95d1e70b4401..a6405dd5df09 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -601,7 +601,7 @@ int ocfs2_map_page_blocks(struct page *page, u64 *p_blkno,
 
 	head = folio_buffers(folio);
 	if (!head)
-		head = folio_create_empty_buffers(folio, bsize, 0);
+		head = create_empty_buffers(folio, bsize, 0);
 
 	for (bh = head, block_start = 0; bh != head || !block_start;
 	     bh = bh->b_this_page, block_start += bsize) {
diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c
index d9737235b8e0..a9075c4843ed 100644
--- a/fs/reiserfs/inode.c
+++ b/fs/reiserfs/inode.c
@@ -2539,7 +2539,7 @@ static int reiserfs_write_full_folio(struct folio *folio,
 	 */
 	head = folio_buffers(folio);
 	if (!head)
-		head = folio_create_empty_buffers(folio, s->s_blocksize,
+		head = create_empty_buffers(folio, s->s_blocksize,
 				     (1 << BH_Dirty) | (1 << BH_Uptodate));
 
 	/*
diff --git a/fs/ufs/util.c b/fs/ufs/util.c
index d32de30009a0..13ba34e6d64f 100644
--- a/fs/ufs/util.c
+++ b/fs/ufs/util.c
@@ -264,6 +264,6 @@ struct folio *ufs_get_locked_folio(struct address_space *mapping,
 		}
 	}
 	if (!folio_buffers(folio))
-		folio_create_empty_buffers(folio, 1 << inode->i_blkbits, 0);
+		create_empty_buffers(folio, 1 << inode->i_blkbits, 0);
 	return folio;
 }
diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h
index 3d85a0cf0ca5..5f23ee599889 100644
--- a/include/linux/buffer_head.h
+++ b/include/linux/buffer_head.h
@@ -201,9 +201,7 @@ struct buffer_head *folio_alloc_buffers(struct folio *folio, unsigned long size,
 					gfp_t gfp);
 struct buffer_head *alloc_page_buffers(struct page *page, unsigned long size,
 		bool retry);
-void create_empty_buffers(struct page *, unsigned long,
-			unsigned long b_state);
-struct buffer_head *folio_create_empty_buffers(struct folio *folio,
+struct buffer_head *create_empty_buffers(struct folio *folio,
 		unsigned long blocksize, unsigned long b_state);
 void end_buffer_read_sync(struct buffer_head *bh, int uptodate);
 void end_buffer_write_sync(struct buffer_head *bh, int uptodate);
-- 
cgit v1.2.3


From ca71fe1ad9221a89c6a25f49159c600d9e598ae1 Mon Sep 17 00:00:00 2001
From: Huang Ying <ying.huang@intel.com>
Date: Mon, 16 Oct 2023 13:29:54 +0800
Subject: mm, pcp: avoid to drain PCP when process exit

Patch series "mm: PCP high auto-tuning", v3.

The page allocation performance requirements of different workloads are
often different.  So, we need to tune the PCP (Per-CPU Pageset) high on
each CPU automatically to optimize the page allocation performance.

The list of patches in series is as follows,

[1/9] mm, pcp: avoid to drain PCP when process exit
[2/9] cacheinfo: calculate per-CPU data cache size
[3/9] mm, pcp: reduce lock contention for draining high-order pages
[4/9] mm: restrict the pcp batch scale factor to avoid too long latency
[5/9] mm, page_alloc: scale the number of pages that are batch allocated
[6/9] mm: add framework for PCP high auto-tuning
[7/9] mm: tune PCP high automatically
[8/9] mm, pcp: decrease PCP high if free pages < high watermark
[9/9] mm, pcp: reduce detecting time of consecutive high order page freeing

Patch [1/9], [2/9], [3/9] optimize the PCP draining for consecutive
high-order pages freeing.

Patch [4/9], [5/9] optimize batch freeing and allocating.

Patch [6/9], [7/9], [8/9] implement and optimize a PCP high
auto-tuning method.

Patch [9/9] optimize the PCP draining for consecutive high order page
freeing based on PCP high auto-tuning.

The test results for patches with performance impact are as follows,

kbuild
======

On a 2-socket Intel server with 224 logical CPU, we run 8 kbuild instances
in parallel (each with `make -j 28`) in 8 cgroup.  This simulates the
kbuild server that is used by 0-Day kbuild service.

	build time   lock contend%	free_high	alloc_zone
	----------	----------	---------	----------
base	     100.0	      14.0          100.0            100.0
patch1	      99.5	      12.8	     19.5	      95.6
patch3	      99.4	      12.6	      7.1	      95.6
patch5	      98.6	      11.0	      8.1	      97.1
patch7	      95.1	       0.5	      2.8	      15.6
patch9	      95.0	       1.0	      8.8	      20.0

The PCP draining optimization (patch [1/9], [3/9]) and PCP batch
allocation optimization (patch [5/9]) reduces zone lock contention a
little.  The PCP high auto-tuning (patch [7/9], [9/9]) reduces build time
visibly.  Where the tuning target: the number of pages allocated from zone
reduces greatly.  So, the zone contention cycles% reduces greatly.

With PCP tuning patches (patch [7/9], [9/9]), the average used memory
during test increases up to 18.4% because more pages are cached in PCP.
But at the end of the test, the number of the used memory decreases to the
same level as that of the base patch.  That is, the pages cached in PCP
will be released to zone after not being used actively.

netperf SCTP_STREAM_MANY
========================

On a 2-socket Intel server with 128 logical CPU, we tested
SCTP_STREAM_MANY test case of netperf test suite with 64-pair processes.

	     score   lock contend%	free_high	alloc_zone  cache miss rate%
	     -----	----------	---------	----------  ----------------
base	     100.0	       2.1          100.0            100.0	         1.3
patch1	      99.4	       2.1	     99.4	      99.4		 1.3
patch3	     106.4	       1.3	     13.3	     106.3		 1.3
patch5	     106.0	       1.2	     13.2	     105.9		 1.3
patch7	     103.4	       1.9	      6.7	      90.3		 7.6
patch9	     108.6	       1.3	     13.7	     108.6		 1.3

The PCP draining optimization (patch [1/9]+[3/9]) improves performance.
The PCP high auto-tuning (patch [7/9]) reduces performance a little
because PCP draining cannot be triggered in time sometimes.  So, the cache
miss rate% increases.  The further PCP draining optimization (patch [9/9])
based on PCP tuning restore the performance.

lmbench3 UNIX (AF_UNIX)
=======================

On a 2-socket Intel server with 128 logical CPU, we tested UNIX
(AF_UNIX socket) test case of lmbench3 test suite with 16-pair
processes.

	     score   lock contend%	free_high	alloc_zone  cache miss rate%
	     -----	----------	---------	----------  ----------------
base	     100.0	      51.4          100.0            100.0	         0.2
patch1	     116.8	      46.1           69.5	     104.3	         0.2
patch3	     199.1	      21.3            7.0	     104.9	         0.2
patch5	     200.0	      20.8            7.1	     106.9	         0.3
patch7	     191.6	      19.9            6.8	     103.8	         2.8
patch9	     193.4	      21.7            7.0	     104.7	         2.1

The PCP draining optimization (patch [1/9], [3/9]) improves performance
much.  The PCP tuning (patch [7/9]) reduces performance a little because
PCP draining cannot be triggered in time sometimes.  The further PCP
draining optimization (patch [9/9]) based on PCP tuning restores the
performance partly.

The patchset adds several fields in struct per_cpu_pages.  The struct
layout before/after the patchset is as follows,

base
====

struct per_cpu_pages {
	spinlock_t                 lock;                 /*     0     4 */
	int                        count;                /*     4     4 */
	int                        high;                 /*     8     4 */
	int                        batch;                /*    12     4 */
	short int                  free_factor;          /*    16     2 */
	short int                  expire;               /*    18     2 */

	/* XXX 4 bytes hole, try to pack */

	struct list_head           lists[13];            /*    24   208 */

	/* size: 256, cachelines: 4, members: 7 */
	/* sum members: 228, holes: 1, sum holes: 4 */
	/* padding: 24 */
} __attribute__((__aligned__(64)));

patched
=======

struct per_cpu_pages {
	spinlock_t                 lock;                 /*     0     4 */
	int                        count;                /*     4     4 */
	int                        high;                 /*     8     4 */
	int                        high_min;             /*    12     4 */
	int                        high_max;             /*    16     4 */
	int                        batch;                /*    20     4 */
	u8                         flags;                /*    24     1 */
	u8                         alloc_factor;         /*    25     1 */
	u8                         expire;               /*    26     1 */

	/* XXX 1 byte hole, try to pack */

	short int                  free_count;           /*    28     2 */

	/* XXX 2 bytes hole, try to pack */

	struct list_head           lists[13];            /*    32   208 */

	/* size: 256, cachelines: 4, members: 11 */
	/* sum members: 237, holes: 2, sum holes: 3 */
	/* padding: 16 */
} __attribute__((__aligned__(64)));

The size of the struct doesn't changed with the patchset.


This patch (of 9):

In commit f26b3fa04611 ("mm/page_alloc: limit number of high-order pages
on PCP during bulk free"), the PCP (Per-CPU Pageset) will be drained when
PCP is mostly used for high-order pages freeing to improve the cache-hot
pages reusing between page allocation and freeing CPUs.

But, the PCP draining mechanism may be triggered unexpectedly when process
exits.  With some customized trace point, it was found that PCP draining
(free_high == true) was triggered with the order-1 page freeing with the
following call stack,

 => free_unref_page_commit
 => free_unref_page
 => __mmdrop
 => exit_mm
 => do_exit
 => do_group_exit
 => __x64_sys_exit_group
 => do_syscall_64

Checking the source code, this is the page table PGD freeing
(mm_free_pgd()).  It's a order-1 page freeing if
CONFIG_PAGE_TABLE_ISOLATION=y.  Which is a common configuration for
security.

Just before that, page freeing with the following call stack was found,

 => free_unref_page_commit
 => free_unref_page_list
 => release_pages
 => tlb_batch_pages_flush
 => tlb_finish_mmu
 => exit_mmap
 => __mmput
 => exit_mm
 => do_exit
 => do_group_exit
 => __x64_sys_exit_group
 => do_syscall_64

So, when a process exits,

- a large number of user pages of the process will be freed without
  page allocation, it's highly possible that pcp->free_factor becomes >
  0.  In fact, this is expected behavior to improve process exit
  performance.

- after freeing all user pages, the PGD will be freed, which is a
  order-1 page freeing, PCP will be drained.

All in all, when a process exits, it's high possible that the PCP will be
drained.  This is an unexpected behavior.

To avoid this, in the patch, the PCP draining will only be triggered for 2
consecutive high-order page freeing.

On a 2-socket Intel server with 224 logical CPU, we run 8 kbuild instances
in parallel (each with `make -j 28`) in 8 cgroup.  This simulates the
kbuild server that is used by 0-Day kbuild service.  With the patch, the
cycles% of the spinlock contention (mostly for zone lock) decreases from
14.0% to 12.8% (with PCP size == 367).  The number of PCP draining for
high order pages freeing (free_high) decreases 80.5%.

This helps network workload too for reduced zone lock contention.  On a
2-socket Intel server with 128 logical CPU, with the patch, the network
bandwidth of the UNIX (AF_UNIX) test case of lmbench test suite with
16-pair processes increase 16.8%.  The cycles% of the spinlock contention
(mostly for zone lock) decreases from 51.4% to 46.1%.  The number of PCP
draining for high order pages freeing (free_high) decreases 30.5%.  The
cache miss rate keeps 0.2%.

Link: https://lkml.kernel.org/r/20231016053002.756205-1-ying.huang@intel.com
Link: https://lkml.kernel.org/r/20231016053002.756205-2-ying.huang@intel.com
Signed-off-by: "Huang, Ying" <ying.huang@intel.com>
Acked-by: Mel Gorman <mgorman@techsingularity.net>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: David Hildenbrand <david@redhat.com>
Cc: Johannes Weiner <jweiner@redhat.com>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Pavel Tatashin <pasha.tatashin@soleen.com>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: Christoph Lameter <cl@linux.com>
Cc: Arjan van de Ven <arjan@linux.intel.com>
Cc: Sudeep Holla <sudeep.holla@arm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/mmzone.h | 12 +++++++++++-
 mm/page_alloc.c        | 11 ++++++++---
 2 files changed, 19 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 486587fcd27f..de313f1c15f9 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -674,12 +674,22 @@ enum zone_watermarks {
 #define high_wmark_pages(z) (z->_watermark[WMARK_HIGH] + z->watermark_boost)
 #define wmark_pages(z, i) (z->_watermark[i] + z->watermark_boost)
 
+/*
+ * Flags used in pcp->flags field.
+ *
+ * PCPF_PREV_FREE_HIGH_ORDER: a high-order page is freed in the
+ * previous page freeing.  To avoid to drain PCP for an accident
+ * high-order page freeing.
+ */
+#define	PCPF_PREV_FREE_HIGH_ORDER	BIT(0)
+
 struct per_cpu_pages {
 	spinlock_t lock;	/* Protects lists field */
 	int count;		/* number of pages in the list */
 	int high;		/* high watermark, emptying needed */
 	int batch;		/* chunk size for buddy add/remove */
-	short free_factor;	/* batch scaling factor during free */
+	u8 flags;		/* protected by pcp->lock */
+	u8 free_factor;		/* batch scaling factor during free */
 #ifdef CONFIG_NUMA
 	short expire;		/* When 0, remote pagesets are drained */
 #endif
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index f46e519618a0..de547ef9a9ad 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -2370,7 +2370,7 @@ static void free_unref_page_commit(struct zone *zone, struct per_cpu_pages *pcp,
 {
 	int high;
 	int pindex;
-	bool free_high;
+	bool free_high = false;
 
 	__count_vm_events(PGFREE, 1 << order);
 	pindex = order_to_pindex(migratetype, order);
@@ -2383,8 +2383,13 @@ static void free_unref_page_commit(struct zone *zone, struct per_cpu_pages *pcp,
 	 * freeing without allocation. The remainder after bulk freeing
 	 * stops will be drained from vmstat refresh context.
 	 */
-	free_high = (pcp->free_factor && order && order <= PAGE_ALLOC_COSTLY_ORDER);
-
+	if (order && order <= PAGE_ALLOC_COSTLY_ORDER) {
+		free_high = (pcp->free_factor &&
+			     (pcp->flags & PCPF_PREV_FREE_HIGH_ORDER));
+		pcp->flags |= PCPF_PREV_FREE_HIGH_ORDER;
+	} else if (pcp->flags & PCPF_PREV_FREE_HIGH_ORDER) {
+		pcp->flags &= ~PCPF_PREV_FREE_HIGH_ORDER;
+	}
 	high = nr_pcp_high(pcp, zone, free_high);
 	if (pcp->count >= high) {
 		free_pcppages_bulk(zone, nr_pcp_free(pcp, high, free_high), pcp, pindex);
-- 
cgit v1.2.3


From 94a3bfe4073cd88b05f7fb201ea7bf9dfa2cf5d5 Mon Sep 17 00:00:00 2001
From: Huang Ying <ying.huang@intel.com>
Date: Mon, 16 Oct 2023 13:29:55 +0800
Subject: cacheinfo: calculate size of per-CPU data cache slice

This can be used to estimate the size of the data cache slice that can be
used by one CPU under ideal circumstances.  Both DATA caches and UNIFIED
caches are used in calculation.  So, the users need to consider the impact
of the code cache usage.

Because the cache inclusive/non-inclusive information isn't available now,
we just use the size of the per-CPU slice of LLC to make the result more
predictable across architectures.  This may be improved when more cache
information is available in the future.

A brute-force algorithm to iterate all online CPUs is used to avoid to
allocate an extra cpumask, especially in offline callback.

Link: https://lkml.kernel.org/r/20231016053002.756205-3-ying.huang@intel.com
Signed-off-by: "Huang, Ying" <ying.huang@intel.com>
Acked-by: Mel Gorman <mgorman@techsingularity.net>
Cc: Sudeep Holla <sudeep.holla@arm.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: David Hildenbrand <david@redhat.com>
Cc: Johannes Weiner <jweiner@redhat.com>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Pavel Tatashin <pasha.tatashin@soleen.com>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: Christoph Lameter <cl@linux.com>
Cc: Arjan van de Ven <arjan@linux.intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 drivers/base/cacheinfo.c  | 49 ++++++++++++++++++++++++++++++++++++++++++++++-
 include/linux/cacheinfo.h |  1 +
 2 files changed, 49 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/drivers/base/cacheinfo.c b/drivers/base/cacheinfo.c
index cbae8be1fe52..585c66fce9d9 100644
--- a/drivers/base/cacheinfo.c
+++ b/drivers/base/cacheinfo.c
@@ -898,6 +898,48 @@ err:
 	return rc;
 }
 
+/*
+ * Calculate the size of the per-CPU data cache slice.  This can be
+ * used to estimate the size of the data cache slice that can be used
+ * by one CPU under ideal circumstances.  UNIFIED caches are counted
+ * in addition to DATA caches.  So, please consider code cache usage
+ * when use the result.
+ *
+ * Because the cache inclusive/non-inclusive information isn't
+ * available, we just use the size of the per-CPU slice of LLC to make
+ * the result more predictable across architectures.
+ */
+static void update_per_cpu_data_slice_size_cpu(unsigned int cpu)
+{
+	struct cpu_cacheinfo *ci;
+	struct cacheinfo *llc;
+	unsigned int nr_shared;
+
+	if (!last_level_cache_is_valid(cpu))
+		return;
+
+	ci = ci_cacheinfo(cpu);
+	llc = per_cpu_cacheinfo_idx(cpu, cache_leaves(cpu) - 1);
+
+	if (llc->type != CACHE_TYPE_DATA && llc->type != CACHE_TYPE_UNIFIED)
+		return;
+
+	nr_shared = cpumask_weight(&llc->shared_cpu_map);
+	if (nr_shared)
+		ci->per_cpu_data_slice_size = llc->size / nr_shared;
+}
+
+static void update_per_cpu_data_slice_size(bool cpu_online, unsigned int cpu)
+{
+	unsigned int icpu;
+
+	for_each_online_cpu(icpu) {
+		if (!cpu_online && icpu == cpu)
+			continue;
+		update_per_cpu_data_slice_size_cpu(icpu);
+	}
+}
+
 static int cacheinfo_cpu_online(unsigned int cpu)
 {
 	int rc = detect_cache_attributes(cpu);
@@ -906,7 +948,11 @@ static int cacheinfo_cpu_online(unsigned int cpu)
 		return rc;
 	rc = cache_add_dev(cpu);
 	if (rc)
-		free_cache_attributes(cpu);
+		goto err;
+	update_per_cpu_data_slice_size(true, cpu);
+	return 0;
+err:
+	free_cache_attributes(cpu);
 	return rc;
 }
 
@@ -916,6 +962,7 @@ static int cacheinfo_cpu_pre_down(unsigned int cpu)
 		cpu_cache_sysfs_exit(cpu);
 
 	free_cache_attributes(cpu);
+	update_per_cpu_data_slice_size(false, cpu);
 	return 0;
 }
 
diff --git a/include/linux/cacheinfo.h b/include/linux/cacheinfo.h
index a5cfd44fab45..d504eb4b49ab 100644
--- a/include/linux/cacheinfo.h
+++ b/include/linux/cacheinfo.h
@@ -73,6 +73,7 @@ struct cacheinfo {
 
 struct cpu_cacheinfo {
 	struct cacheinfo *info_list;
+	unsigned int per_cpu_data_slice_size;
 	unsigned int num_levels;
 	unsigned int num_leaves;
 	bool cpu_map_populated;
-- 
cgit v1.2.3


From 362d37a106dd3f6431b2fdd91d9208b0d023b50d Mon Sep 17 00:00:00 2001
From: Huang Ying <ying.huang@intel.com>
Date: Mon, 16 Oct 2023 13:29:56 +0800
Subject: mm, pcp: reduce lock contention for draining high-order pages

In commit f26b3fa04611 ("mm/page_alloc: limit number of high-order pages
on PCP during bulk free"), the PCP (Per-CPU Pageset) will be drained when
PCP is mostly used for high-order pages freeing to improve the cache-hot
pages reusing between page allocating and freeing CPUs.

On system with small per-CPU data cache slice, pages shouldn't be cached
before draining to guarantee cache-hot.  But on a system with large
per-CPU data cache slice, some pages can be cached before draining to
reduce zone lock contention.

So, in this patch, instead of draining without any caching, "pcp->batch"
pages will be cached in PCP before draining if the size of the per-CPU
data cache slice is more than "3 * batch".

In theory, if the size of per-CPU data cache slice is more than "2 *
batch", we can reuse cache-hot pages between CPUs.  But considering the
other usage of cache (code, other data accessing, etc.), "3 * batch" is
used.

Note: "3 * batch" is chosen to make sure the optimization works on recent
x86_64 server CPUs.  If you want to increase it, please check whether it
breaks the optimization.

On a 2-socket Intel server with 128 logical CPU, with the patch, the
network bandwidth of the UNIX (AF_UNIX) test case of lmbench test suite
with 16-pair processes increase 70.5%.  The cycles% of the spinlock
contention (mostly for zone lock) decreases from 46.1% to 21.3%.  The
number of PCP draining for high order pages freeing (free_high) decreases
89.9%.  The cache miss rate keeps 0.2%.

Link: https://lkml.kernel.org/r/20231016053002.756205-4-ying.huang@intel.com
Signed-off-by: "Huang, Ying" <ying.huang@intel.com>
Acked-by: Mel Gorman <mgorman@techsingularity.net>
Cc: Sudeep Holla <sudeep.holla@arm.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: David Hildenbrand <david@redhat.com>
Cc: Johannes Weiner <jweiner@redhat.com>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Pavel Tatashin <pasha.tatashin@soleen.com>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: Christoph Lameter <cl@linux.com>
Cc: Arjan van de Ven <arjan@linux.intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 drivers/base/cacheinfo.c |  2 ++
 include/linux/gfp.h      |  1 +
 include/linux/mmzone.h   |  6 ++++++
 mm/page_alloc.c          | 38 +++++++++++++++++++++++++++++++++++++-
 4 files changed, 46 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/drivers/base/cacheinfo.c b/drivers/base/cacheinfo.c
index 585c66fce9d9..f1e79263fe61 100644
--- a/drivers/base/cacheinfo.c
+++ b/drivers/base/cacheinfo.c
@@ -950,6 +950,7 @@ static int cacheinfo_cpu_online(unsigned int cpu)
 	if (rc)
 		goto err;
 	update_per_cpu_data_slice_size(true, cpu);
+	setup_pcp_cacheinfo();
 	return 0;
 err:
 	free_cache_attributes(cpu);
@@ -963,6 +964,7 @@ static int cacheinfo_cpu_pre_down(unsigned int cpu)
 
 	free_cache_attributes(cpu);
 	update_per_cpu_data_slice_size(false, cpu);
+	setup_pcp_cacheinfo();
 	return 0;
 }
 
diff --git a/include/linux/gfp.h b/include/linux/gfp.h
index 665f06675c83..665edc11fb9f 100644
--- a/include/linux/gfp.h
+++ b/include/linux/gfp.h
@@ -325,6 +325,7 @@ void drain_all_pages(struct zone *zone);
 void drain_local_pages(struct zone *zone);
 
 void page_alloc_init_late(void);
+void setup_pcp_cacheinfo(void);
 
 /*
  * gfp_allowed_mask is set to GFP_BOOT_MASK during early boot to restrict what
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index de313f1c15f9..efe72b3f7872 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -680,8 +680,14 @@ enum zone_watermarks {
  * PCPF_PREV_FREE_HIGH_ORDER: a high-order page is freed in the
  * previous page freeing.  To avoid to drain PCP for an accident
  * high-order page freeing.
+ *
+ * PCPF_FREE_HIGH_BATCH: preserve "pcp->batch" pages in PCP before
+ * draining PCP for consecutive high-order pages freeing without
+ * allocation if data cache slice of CPU is large enough.  To reduce
+ * zone lock contention and keep cache-hot pages reusing.
  */
 #define	PCPF_PREV_FREE_HIGH_ORDER	BIT(0)
+#define	PCPF_FREE_HIGH_BATCH		BIT(1)
 
 struct per_cpu_pages {
 	spinlock_t lock;	/* Protects lists field */
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index de547ef9a9ad..b76b1de48a30 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -52,6 +52,7 @@
 #include <linux/psi.h>
 #include <linux/khugepaged.h>
 #include <linux/delayacct.h>
+#include <linux/cacheinfo.h>
 #include <asm/div64.h>
 #include "internal.h"
 #include "shuffle.h"
@@ -2385,7 +2386,9 @@ static void free_unref_page_commit(struct zone *zone, struct per_cpu_pages *pcp,
 	 */
 	if (order && order <= PAGE_ALLOC_COSTLY_ORDER) {
 		free_high = (pcp->free_factor &&
-			     (pcp->flags & PCPF_PREV_FREE_HIGH_ORDER));
+			     (pcp->flags & PCPF_PREV_FREE_HIGH_ORDER) &&
+			     (!(pcp->flags & PCPF_FREE_HIGH_BATCH) ||
+			      pcp->count >= READ_ONCE(pcp->batch)));
 		pcp->flags |= PCPF_PREV_FREE_HIGH_ORDER;
 	} else if (pcp->flags & PCPF_PREV_FREE_HIGH_ORDER) {
 		pcp->flags &= ~PCPF_PREV_FREE_HIGH_ORDER;
@@ -5418,6 +5421,39 @@ static void zone_pcp_update(struct zone *zone, int cpu_online)
 	mutex_unlock(&pcp_batch_high_lock);
 }
 
+static void zone_pcp_update_cacheinfo(struct zone *zone)
+{
+	int cpu;
+	struct per_cpu_pages *pcp;
+	struct cpu_cacheinfo *cci;
+
+	for_each_online_cpu(cpu) {
+		pcp = per_cpu_ptr(zone->per_cpu_pageset, cpu);
+		cci = get_cpu_cacheinfo(cpu);
+		/*
+		 * If data cache slice of CPU is large enough, "pcp->batch"
+		 * pages can be preserved in PCP before draining PCP for
+		 * consecutive high-order pages freeing without allocation.
+		 * This can reduce zone lock contention without hurting
+		 * cache-hot pages sharing.
+		 */
+		spin_lock(&pcp->lock);
+		if ((cci->per_cpu_data_slice_size >> PAGE_SHIFT) > 3 * pcp->batch)
+			pcp->flags |= PCPF_FREE_HIGH_BATCH;
+		else
+			pcp->flags &= ~PCPF_FREE_HIGH_BATCH;
+		spin_unlock(&pcp->lock);
+	}
+}
+
+void setup_pcp_cacheinfo(void)
+{
+	struct zone *zone;
+
+	for_each_populated_zone(zone)
+		zone_pcp_update_cacheinfo(zone);
+}
+
 /*
  * Allocate per cpu pagesets and initialize them.
  * Before this call only boot pagesets were available.
-- 
cgit v1.2.3


From c0a242394cb980bd00e1f61dc8aacb453d2bbe6a Mon Sep 17 00:00:00 2001
From: Huang Ying <ying.huang@intel.com>
Date: Mon, 16 Oct 2023 13:29:58 +0800
Subject: mm, page_alloc: scale the number of pages that are batch allocated

When a task is allocating a large number of order-0 pages, it may acquire
the zone->lock multiple times allocating pages in batches.  This may
unnecessarily contend on the zone lock when allocating very large number
of pages.  This patch adapts the size of the batch based on the recent
pattern to scale the batch size for subsequent allocations.

On a 2-socket Intel server with 224 logical CPU, we run 8 kbuild instances
in parallel (each with `make -j 28`) in 8 cgroup.  This simulates the
kbuild server that is used by 0-Day kbuild service.  With the patch, the
cycles% of the spinlock contention (mostly for zone lock) decreases from
12.6% to 11.0% (with PCP size == 367).

Link: https://lkml.kernel.org/r/20231016053002.756205-6-ying.huang@intel.com
Signed-off-by: "Huang, Ying" <ying.huang@intel.com>
Suggested-by: Mel Gorman <mgorman@techsingularity.net>
Acked-by: Mel Gorman <mgorman@techsingularity.net>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: David Hildenbrand <david@redhat.com>
Cc: Johannes Weiner <jweiner@redhat.com>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Pavel Tatashin <pasha.tatashin@soleen.com>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: Christoph Lameter <cl@linux.com>
Cc: Arjan van de Ven <arjan@linux.intel.com>
Cc: Sudeep Holla <sudeep.holla@arm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/mmzone.h |  3 ++-
 mm/page_alloc.c        | 53 ++++++++++++++++++++++++++++++++++++++++----------
 2 files changed, 45 insertions(+), 11 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index efe72b3f7872..e6cc2238c65f 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -695,9 +695,10 @@ struct per_cpu_pages {
 	int high;		/* high watermark, emptying needed */
 	int batch;		/* chunk size for buddy add/remove */
 	u8 flags;		/* protected by pcp->lock */
+	u8 alloc_factor;	/* batch scaling factor during allocate */
 	u8 free_factor;		/* batch scaling factor during free */
 #ifdef CONFIG_NUMA
-	short expire;		/* When 0, remote pagesets are drained */
+	u8 expire;		/* When 0, remote pagesets are drained */
 #endif
 
 	/* Lists of pages, one per migrate type stored on the pcp-lists */
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 05a8d34827d2..6f6af835b05b 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -2373,6 +2373,12 @@ static void free_unref_page_commit(struct zone *zone, struct per_cpu_pages *pcp,
 	int pindex;
 	bool free_high = false;
 
+	/*
+	 * On freeing, reduce the number of pages that are batch allocated.
+	 * See nr_pcp_alloc() where alloc_factor is increased for subsequent
+	 * allocations.
+	 */
+	pcp->alloc_factor >>= 1;
 	__count_vm_events(PGFREE, 1 << order);
 	pindex = order_to_pindex(migratetype, order);
 	list_add(&page->pcp_list, &pcp->lists[pindex]);
@@ -2679,6 +2685,42 @@ struct page *rmqueue_buddy(struct zone *preferred_zone, struct zone *zone,
 	return page;
 }
 
+static int nr_pcp_alloc(struct per_cpu_pages *pcp, int order)
+{
+	int high, batch, max_nr_alloc;
+
+	high = READ_ONCE(pcp->high);
+	batch = READ_ONCE(pcp->batch);
+
+	/* Check for PCP disabled or boot pageset */
+	if (unlikely(high < batch))
+		return 1;
+
+	/*
+	 * Double the number of pages allocated each time there is subsequent
+	 * allocation of order-0 pages without any freeing.
+	 */
+	if (!order) {
+		max_nr_alloc = max(high - pcp->count - batch, batch);
+		batch <<= pcp->alloc_factor;
+		if (batch <= max_nr_alloc &&
+		    pcp->alloc_factor < CONFIG_PCP_BATCH_SCALE_MAX)
+			pcp->alloc_factor++;
+		batch = min(batch, max_nr_alloc);
+	}
+
+	/*
+	 * Scale batch relative to order if batch implies free pages
+	 * can be stored on the PCP. Batch can be 1 for small zones or
+	 * for boot pagesets which should never store free pages as
+	 * the pages may belong to arbitrary zones.
+	 */
+	if (batch > 1)
+		batch = max(batch >> order, 2);
+
+	return batch;
+}
+
 /* Remove page from the per-cpu list, caller must protect the list */
 static inline
 struct page *__rmqueue_pcplist(struct zone *zone, unsigned int order,
@@ -2691,18 +2733,9 @@ struct page *__rmqueue_pcplist(struct zone *zone, unsigned int order,
 
 	do {
 		if (list_empty(list)) {
-			int batch = READ_ONCE(pcp->batch);
+			int batch = nr_pcp_alloc(pcp, order);
 			int alloced;
 
-			/*
-			 * Scale batch relative to order if batch implies
-			 * free pages can be stored on the PCP. Batch can
-			 * be 1 for small zones or for boot pagesets which
-			 * should never store free pages as the pages may
-			 * belong to arbitrary zones.
-			 */
-			if (batch > 1)
-				batch = max(batch >> order, 2);
 			alloced = rmqueue_bulk(zone, order,
 					batch, list,
 					migratetype, alloc_flags);
-- 
cgit v1.2.3


From 90b41691b9881376fe784e13b5766ec3676fdb55 Mon Sep 17 00:00:00 2001
From: Huang Ying <ying.huang@intel.com>
Date: Mon, 16 Oct 2023 13:29:59 +0800
Subject: mm: add framework for PCP high auto-tuning

The page allocation performance requirements of different workloads are
usually different.  So, we need to tune PCP (per-CPU pageset) high to
optimize the workload page allocation performance.  Now, we have a system
wide sysctl knob (percpu_pagelist_high_fraction) to tune PCP high by hand.
But, it's hard to find out the best value by hand.  And one global
configuration may not work best for the different workloads that run on
the same system.  One solution to these issues is to tune PCP high of each
CPU automatically.

This patch adds the framework for PCP high auto-tuning.  With it,
pcp->high of each CPU will be changed automatically by tuning algorithm at
runtime.  The minimal high (pcp->high_min) is the original PCP high value
calculated based on the low watermark pages.  While the maximal high
(pcp->high_max) is the PCP high value when percpu_pagelist_high_fraction
sysctl knob is set to MIN_PERCPU_PAGELIST_HIGH_FRACTION.  That is, the
maximal pcp->high that can be set via sysctl knob by hand.

It's possible that PCP high auto-tuning doesn't work well for some
workloads.  So, when PCP high is tuned by hand via the sysctl knob, the
auto-tuning will be disabled.  The PCP high set by hand will be used
instead.

This patch only adds the framework, so pcp->high will be set to
pcp->high_min (original default) always.  We will add actual auto-tuning
algorithm in the following patches in the series.

Link: https://lkml.kernel.org/r/20231016053002.756205-7-ying.huang@intel.com
Signed-off-by: "Huang, Ying" <ying.huang@intel.com>
Acked-by: Mel Gorman <mgorman@techsingularity.net>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: David Hildenbrand <david@redhat.com>
Cc: Johannes Weiner <jweiner@redhat.com>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Pavel Tatashin <pasha.tatashin@soleen.com>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: Christoph Lameter <cl@linux.com>
Cc: Arjan van de Ven <arjan@linux.intel.com>
Cc: Sudeep Holla <sudeep.holla@arm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/mmzone.h |  5 +++-
 mm/page_alloc.c        | 71 ++++++++++++++++++++++++++++++++------------------
 2 files changed, 50 insertions(+), 26 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index e6cc2238c65f..775abc899e80 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -693,6 +693,8 @@ struct per_cpu_pages {
 	spinlock_t lock;	/* Protects lists field */
 	int count;		/* number of pages in the list */
 	int high;		/* high watermark, emptying needed */
+	int high_min;		/* min high watermark */
+	int high_max;		/* max high watermark */
 	int batch;		/* chunk size for buddy add/remove */
 	u8 flags;		/* protected by pcp->lock */
 	u8 alloc_factor;	/* batch scaling factor during allocate */
@@ -852,7 +854,8 @@ struct zone {
 	 * the high and batch values are copied to individual pagesets for
 	 * faster access
 	 */
-	int pageset_high;
+	int pageset_high_min;
+	int pageset_high_max;
 	int pageset_batch;
 
 #ifndef CONFIG_SPARSEMEM
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 6f6af835b05b..922836b6a01d 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -2350,7 +2350,7 @@ static int nr_pcp_free(struct per_cpu_pages *pcp, int high, bool free_high)
 static int nr_pcp_high(struct per_cpu_pages *pcp, struct zone *zone,
 		       bool free_high)
 {
-	int high = READ_ONCE(pcp->high);
+	int high = READ_ONCE(pcp->high_min);
 
 	if (unlikely(!high || free_high))
 		return 0;
@@ -2689,7 +2689,7 @@ static int nr_pcp_alloc(struct per_cpu_pages *pcp, int order)
 {
 	int high, batch, max_nr_alloc;
 
-	high = READ_ONCE(pcp->high);
+	high = READ_ONCE(pcp->high_min);
 	batch = READ_ONCE(pcp->batch);
 
 	/* Check for PCP disabled or boot pageset */
@@ -5296,14 +5296,15 @@ static int zone_batchsize(struct zone *zone)
 }
 
 static int percpu_pagelist_high_fraction;
-static int zone_highsize(struct zone *zone, int batch, int cpu_online)
+static int zone_highsize(struct zone *zone, int batch, int cpu_online,
+			 int high_fraction)
 {
 #ifdef CONFIG_MMU
 	int high;
 	int nr_split_cpus;
 	unsigned long total_pages;
 
-	if (!percpu_pagelist_high_fraction) {
+	if (!high_fraction) {
 		/*
 		 * By default, the high value of the pcp is based on the zone
 		 * low watermark so that if they are full then background
@@ -5316,15 +5317,15 @@ static int zone_highsize(struct zone *zone, int batch, int cpu_online)
 		 * value is based on a fraction of the managed pages in the
 		 * zone.
 		 */
-		total_pages = zone_managed_pages(zone) / percpu_pagelist_high_fraction;
+		total_pages = zone_managed_pages(zone) / high_fraction;
 	}
 
 	/*
 	 * Split the high value across all online CPUs local to the zone. Note
 	 * that early in boot that CPUs may not be online yet and that during
 	 * CPU hotplug that the cpumask is not yet updated when a CPU is being
-	 * onlined. For memory nodes that have no CPUs, split pcp->high across
-	 * all online CPUs to mitigate the risk that reclaim is triggered
+	 * onlined. For memory nodes that have no CPUs, split the high value
+	 * across all online CPUs to mitigate the risk that reclaim is triggered
 	 * prematurely due to pages stored on pcp lists.
 	 */
 	nr_split_cpus = cpumask_weight(cpumask_of_node(zone_to_nid(zone))) + cpu_online;
@@ -5352,19 +5353,21 @@ static int zone_highsize(struct zone *zone, int batch, int cpu_online)
  * However, guaranteeing these relations at all times would require e.g. write
  * barriers here but also careful usage of read barriers at the read side, and
  * thus be prone to error and bad for performance. Thus the update only prevents
- * store tearing. Any new users of pcp->batch and pcp->high should ensure they
- * can cope with those fields changing asynchronously, and fully trust only the
- * pcp->count field on the local CPU with interrupts disabled.
+ * store tearing. Any new users of pcp->batch, pcp->high_min and pcp->high_max
+ * should ensure they can cope with those fields changing asynchronously, and
+ * fully trust only the pcp->count field on the local CPU with interrupts
+ * disabled.
  *
  * mutex_is_locked(&pcp_batch_high_lock) required when calling this function
  * outside of boot time (or some other assurance that no concurrent updaters
  * exist).
  */
-static void pageset_update(struct per_cpu_pages *pcp, unsigned long high,
-		unsigned long batch)
+static void pageset_update(struct per_cpu_pages *pcp, unsigned long high_min,
+			   unsigned long high_max, unsigned long batch)
 {
 	WRITE_ONCE(pcp->batch, batch);
-	WRITE_ONCE(pcp->high, high);
+	WRITE_ONCE(pcp->high_min, high_min);
+	WRITE_ONCE(pcp->high_max, high_max);
 }
 
 static void per_cpu_pages_init(struct per_cpu_pages *pcp, struct per_cpu_zonestat *pzstats)
@@ -5384,20 +5387,21 @@ static void per_cpu_pages_init(struct per_cpu_pages *pcp, struct per_cpu_zonesta
 	 * need to be as careful as pageset_update() as nobody can access the
 	 * pageset yet.
 	 */
-	pcp->high = BOOT_PAGESET_HIGH;
+	pcp->high_min = BOOT_PAGESET_HIGH;
+	pcp->high_max = BOOT_PAGESET_HIGH;
 	pcp->batch = BOOT_PAGESET_BATCH;
 	pcp->free_factor = 0;
 }
 
-static void __zone_set_pageset_high_and_batch(struct zone *zone, unsigned long high,
-		unsigned long batch)
+static void __zone_set_pageset_high_and_batch(struct zone *zone, unsigned long high_min,
+					      unsigned long high_max, unsigned long batch)
 {
 	struct per_cpu_pages *pcp;
 	int cpu;
 
 	for_each_possible_cpu(cpu) {
 		pcp = per_cpu_ptr(zone->per_cpu_pageset, cpu);
-		pageset_update(pcp, high, batch);
+		pageset_update(pcp, high_min, high_max, batch);
 	}
 }
 
@@ -5407,19 +5411,34 @@ static void __zone_set_pageset_high_and_batch(struct zone *zone, unsigned long h
  */
 static void zone_set_pageset_high_and_batch(struct zone *zone, int cpu_online)
 {
-	int new_high, new_batch;
+	int new_high_min, new_high_max, new_batch;
 
 	new_batch = max(1, zone_batchsize(zone));
-	new_high = zone_highsize(zone, new_batch, cpu_online);
+	if (percpu_pagelist_high_fraction) {
+		new_high_min = zone_highsize(zone, new_batch, cpu_online,
+					     percpu_pagelist_high_fraction);
+		/*
+		 * PCP high is tuned manually, disable auto-tuning via
+		 * setting high_min and high_max to the manual value.
+		 */
+		new_high_max = new_high_min;
+	} else {
+		new_high_min = zone_highsize(zone, new_batch, cpu_online, 0);
+		new_high_max = zone_highsize(zone, new_batch, cpu_online,
+					     MIN_PERCPU_PAGELIST_HIGH_FRACTION);
+	}
 
-	if (zone->pageset_high == new_high &&
+	if (zone->pageset_high_min == new_high_min &&
+	    zone->pageset_high_max == new_high_max &&
 	    zone->pageset_batch == new_batch)
 		return;
 
-	zone->pageset_high = new_high;
+	zone->pageset_high_min = new_high_min;
+	zone->pageset_high_max = new_high_max;
 	zone->pageset_batch = new_batch;
 
-	__zone_set_pageset_high_and_batch(zone, new_high, new_batch);
+	__zone_set_pageset_high_and_batch(zone, new_high_min, new_high_max,
+					  new_batch);
 }
 
 void __meminit setup_zone_pageset(struct zone *zone)
@@ -5528,7 +5547,8 @@ __meminit void zone_pcp_init(struct zone *zone)
 	 */
 	zone->per_cpu_pageset = &boot_pageset;
 	zone->per_cpu_zonestats = &boot_zonestats;
-	zone->pageset_high = BOOT_PAGESET_HIGH;
+	zone->pageset_high_min = BOOT_PAGESET_HIGH;
+	zone->pageset_high_max = BOOT_PAGESET_HIGH;
 	zone->pageset_batch = BOOT_PAGESET_BATCH;
 
 	if (populated_zone(zone))
@@ -6430,13 +6450,14 @@ EXPORT_SYMBOL(free_contig_range);
 void zone_pcp_disable(struct zone *zone)
 {
 	mutex_lock(&pcp_batch_high_lock);
-	__zone_set_pageset_high_and_batch(zone, 0, 1);
+	__zone_set_pageset_high_and_batch(zone, 0, 0, 1);
 	__drain_all_pages(zone, true);
 }
 
 void zone_pcp_enable(struct zone *zone)
 {
-	__zone_set_pageset_high_and_batch(zone, zone->pageset_high, zone->pageset_batch);
+	__zone_set_pageset_high_and_batch(zone, zone->pageset_high_min,
+		zone->pageset_high_max, zone->pageset_batch);
 	mutex_unlock(&pcp_batch_high_lock);
 }
 
-- 
cgit v1.2.3


From 51a755c56dc05a8b31ed28d24f28354946dc7529 Mon Sep 17 00:00:00 2001
From: Huang Ying <ying.huang@intel.com>
Date: Mon, 16 Oct 2023 13:30:00 +0800
Subject: mm: tune PCP high automatically

The target to tune PCP high automatically is as follows,

- Minimize allocation/freeing from/to shared zone

- Minimize idle pages in PCP

- Minimize pages in PCP if the system free pages is too few

To reach these target, a tuning algorithm as follows is designed,

- When we refill PCP via allocating from the zone, increase PCP high.
  Because if we had larger PCP, we could avoid to allocate from the
  zone.

- In periodic vmstat updating kworker (via refresh_cpu_vm_stats()),
  decrease PCP high to try to free possible idle PCP pages.

- When page reclaiming is active for the zone, stop increasing PCP
  high in allocating path, decrease PCP high and free some pages in
  freeing path.

So, the PCP high can be tuned to the page allocating/freeing depth of
workloads eventually.

One issue of the algorithm is that if the number of pages allocated is
much more than that of pages freed on a CPU, the PCP high may become the
maximal value even if the allocating/freeing depth is small.  But this
isn't a severe issue, because there are no idle pages in this case.

One alternative choice is to increase PCP high when we drain PCP via
trying to free pages to the zone, but don't increase PCP high during PCP
refilling.  This can avoid the issue above.  But if the number of pages
allocated is much less than that of pages freed on a CPU, there will be
many idle pages in PCP and it is hard to free these idle pages.

1/8 (>> 3) of PCP high will be decreased periodically.  The value 1/8 is
kind of arbitrary.  Just to make sure that the idle PCP pages will be
freed eventually.

On a 2-socket Intel server with 224 logical CPU, we run 8 kbuild instances
in parallel (each with `make -j 28`) in 8 cgroup.  This simulates the
kbuild server that is used by 0-Day kbuild service.  With the patch, the
build time decreases 3.5%.  The cycles% of the spinlock contention (mostly
for zone lock) decreases from 11.0% to 0.5%.  The number of PCP draining
for high order pages freeing (free_high) decreases 65.6%.  The number of
pages allocated from zone (instead of from PCP) decreases 83.9%.

Link: https://lkml.kernel.org/r/20231016053002.756205-8-ying.huang@intel.com
Signed-off-by: "Huang, Ying" <ying.huang@intel.com>
Suggested-by: Mel Gorman <mgorman@techsingularity.net>
Suggested-by: Michal Hocko <mhocko@suse.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: David Hildenbrand <david@redhat.com>
Cc: Johannes Weiner <jweiner@redhat.com>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: Pavel Tatashin <pasha.tatashin@soleen.com>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: Christoph Lameter <cl@linux.com>
Cc: Arjan van de Ven <arjan@linux.intel.com>
Cc: Sudeep Holla <sudeep.holla@arm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/gfp.h |   1 +
 mm/page_alloc.c     | 119 +++++++++++++++++++++++++++++++++++++++++-----------
 mm/vmstat.c         |   8 ++--
 3 files changed, 99 insertions(+), 29 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/gfp.h b/include/linux/gfp.h
index 665edc11fb9f..5b917e5b9350 100644
--- a/include/linux/gfp.h
+++ b/include/linux/gfp.h
@@ -320,6 +320,7 @@ extern void page_frag_free(void *addr);
 #define free_page(addr) free_pages((addr), 0)
 
 void page_alloc_init_cpuhp(void);
+int decay_pcp_high(struct zone *zone, struct per_cpu_pages *pcp);
 void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp);
 void drain_all_pages(struct zone *zone);
 void drain_local_pages(struct zone *zone);
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 922836b6a01d..83af76a8cef9 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -2157,6 +2157,40 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order,
 	return i;
 }
 
+/*
+ * Called from the vmstat counter updater to decay the PCP high.
+ * Return whether there are addition works to do.
+ */
+int decay_pcp_high(struct zone *zone, struct per_cpu_pages *pcp)
+{
+	int high_min, to_drain, batch;
+	int todo = 0;
+
+	high_min = READ_ONCE(pcp->high_min);
+	batch = READ_ONCE(pcp->batch);
+	/*
+	 * Decrease pcp->high periodically to try to free possible
+	 * idle PCP pages.  And, avoid to free too many pages to
+	 * control latency.  This caps pcp->high decrement too.
+	 */
+	if (pcp->high > high_min) {
+		pcp->high = max3(pcp->count - (batch << CONFIG_PCP_BATCH_SCALE_MAX),
+				 pcp->high - (pcp->high >> 3), high_min);
+		if (pcp->high > high_min)
+			todo++;
+	}
+
+	to_drain = pcp->count - pcp->high;
+	if (to_drain > 0) {
+		spin_lock(&pcp->lock);
+		free_pcppages_bulk(zone, to_drain, pcp, 0);
+		spin_unlock(&pcp->lock);
+		todo++;
+	}
+
+	return todo;
+}
+
 #ifdef CONFIG_NUMA
 /*
  * Called from the vmstat counter updater to drain pagesets of this
@@ -2318,14 +2352,13 @@ static bool free_unref_page_prepare(struct page *page, unsigned long pfn,
 	return true;
 }
 
-static int nr_pcp_free(struct per_cpu_pages *pcp, int high, bool free_high)
+static int nr_pcp_free(struct per_cpu_pages *pcp, int batch, int high, bool free_high)
 {
 	int min_nr_free, max_nr_free;
-	int batch = READ_ONCE(pcp->batch);
 
-	/* Free everything if batch freeing high-order pages. */
+	/* Free as much as possible if batch freeing high-order pages. */
 	if (unlikely(free_high))
-		return pcp->count;
+		return min(pcp->count, batch << CONFIG_PCP_BATCH_SCALE_MAX);
 
 	/* Check for PCP disabled or boot pageset */
 	if (unlikely(high < batch))
@@ -2340,7 +2373,7 @@ static int nr_pcp_free(struct per_cpu_pages *pcp, int high, bool free_high)
 	 * freeing of pages without any allocation.
 	 */
 	batch <<= pcp->free_factor;
-	if (batch < max_nr_free && pcp->free_factor < CONFIG_PCP_BATCH_SCALE_MAX)
+	if (batch <= max_nr_free && pcp->free_factor < CONFIG_PCP_BATCH_SCALE_MAX)
 		pcp->free_factor++;
 	batch = clamp(batch, min_nr_free, max_nr_free);
 
@@ -2348,28 +2381,48 @@ static int nr_pcp_free(struct per_cpu_pages *pcp, int high, bool free_high)
 }
 
 static int nr_pcp_high(struct per_cpu_pages *pcp, struct zone *zone,
-		       bool free_high)
+		       int batch, bool free_high)
 {
-	int high = READ_ONCE(pcp->high_min);
+	int high, high_min, high_max;
+
+	high_min = READ_ONCE(pcp->high_min);
+	high_max = READ_ONCE(pcp->high_max);
+	high = pcp->high = clamp(pcp->high, high_min, high_max);
 
-	if (unlikely(!high || free_high))
+	if (unlikely(!high))
 		return 0;
 
-	if (!test_bit(ZONE_RECLAIM_ACTIVE, &zone->flags))
-		return high;
+	if (unlikely(free_high)) {
+		pcp->high = max(high - (batch << CONFIG_PCP_BATCH_SCALE_MAX),
+				high_min);
+		return 0;
+	}
 
 	/*
 	 * If reclaim is active, limit the number of pages that can be
 	 * stored on pcp lists
 	 */
-	return min(READ_ONCE(pcp->batch) << 2, high);
+	if (test_bit(ZONE_RECLAIM_ACTIVE, &zone->flags)) {
+		pcp->high = max(high - (batch << pcp->free_factor), high_min);
+		return min(batch << 2, pcp->high);
+	}
+
+	if (pcp->count >= high && high_min != high_max) {
+		int need_high = (batch << pcp->free_factor) + batch;
+
+		/* pcp->high should be large enough to hold batch freed pages */
+		if (pcp->high < need_high)
+			pcp->high = clamp(need_high, high_min, high_max);
+	}
+
+	return high;
 }
 
 static void free_unref_page_commit(struct zone *zone, struct per_cpu_pages *pcp,
 				   struct page *page, int migratetype,
 				   unsigned int order)
 {
-	int high;
+	int high, batch;
 	int pindex;
 	bool free_high = false;
 
@@ -2384,6 +2437,7 @@ static void free_unref_page_commit(struct zone *zone, struct per_cpu_pages *pcp,
 	list_add(&page->pcp_list, &pcp->lists[pindex]);
 	pcp->count += 1 << order;
 
+	batch = READ_ONCE(pcp->batch);
 	/*
 	 * As high-order pages other than THP's stored on PCP can contribute
 	 * to fragmentation, limit the number stored when PCP is heavily
@@ -2394,14 +2448,15 @@ static void free_unref_page_commit(struct zone *zone, struct per_cpu_pages *pcp,
 		free_high = (pcp->free_factor &&
 			     (pcp->flags & PCPF_PREV_FREE_HIGH_ORDER) &&
 			     (!(pcp->flags & PCPF_FREE_HIGH_BATCH) ||
-			      pcp->count >= READ_ONCE(pcp->batch)));
+			      pcp->count >= READ_ONCE(batch)));
 		pcp->flags |= PCPF_PREV_FREE_HIGH_ORDER;
 	} else if (pcp->flags & PCPF_PREV_FREE_HIGH_ORDER) {
 		pcp->flags &= ~PCPF_PREV_FREE_HIGH_ORDER;
 	}
-	high = nr_pcp_high(pcp, zone, free_high);
+	high = nr_pcp_high(pcp, zone, batch, free_high);
 	if (pcp->count >= high) {
-		free_pcppages_bulk(zone, nr_pcp_free(pcp, high, free_high), pcp, pindex);
+		free_pcppages_bulk(zone, nr_pcp_free(pcp, batch, high, free_high),
+				   pcp, pindex);
 	}
 }
 
@@ -2685,24 +2740,38 @@ struct page *rmqueue_buddy(struct zone *preferred_zone, struct zone *zone,
 	return page;
 }
 
-static int nr_pcp_alloc(struct per_cpu_pages *pcp, int order)
+static int nr_pcp_alloc(struct per_cpu_pages *pcp, struct zone *zone, int order)
 {
-	int high, batch, max_nr_alloc;
+	int high, base_batch, batch, max_nr_alloc;
+	int high_max, high_min;
 
-	high = READ_ONCE(pcp->high_min);
-	batch = READ_ONCE(pcp->batch);
+	base_batch = READ_ONCE(pcp->batch);
+	high_min = READ_ONCE(pcp->high_min);
+	high_max = READ_ONCE(pcp->high_max);
+	high = pcp->high = clamp(pcp->high, high_min, high_max);
 
 	/* Check for PCP disabled or boot pageset */
-	if (unlikely(high < batch))
+	if (unlikely(high < base_batch))
 		return 1;
 
+	if (order)
+		batch = base_batch;
+	else
+		batch = (base_batch << pcp->alloc_factor);
+
 	/*
-	 * Double the number of pages allocated each time there is subsequent
-	 * allocation of order-0 pages without any freeing.
+	 * If we had larger pcp->high, we could avoid to allocate from
+	 * zone.
 	 */
+	if (high_min != high_max && !test_bit(ZONE_RECLAIM_ACTIVE, &zone->flags))
+		high = pcp->high = min(high + batch, high_max);
+
 	if (!order) {
-		max_nr_alloc = max(high - pcp->count - batch, batch);
-		batch <<= pcp->alloc_factor;
+		max_nr_alloc = max(high - pcp->count - base_batch, base_batch);
+		/*
+		 * Double the number of pages allocated each time there is
+		 * subsequent allocation of order-0 pages without any freeing.
+		 */
 		if (batch <= max_nr_alloc &&
 		    pcp->alloc_factor < CONFIG_PCP_BATCH_SCALE_MAX)
 			pcp->alloc_factor++;
@@ -2733,7 +2802,7 @@ struct page *__rmqueue_pcplist(struct zone *zone, unsigned int order,
 
 	do {
 		if (list_empty(list)) {
-			int batch = nr_pcp_alloc(pcp, order);
+			int batch = nr_pcp_alloc(pcp, zone, order);
 			int alloced;
 
 			alloced = rmqueue_bulk(zone, order,
diff --git a/mm/vmstat.c b/mm/vmstat.c
index 88ea95d4221c..359460deb377 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -816,9 +816,7 @@ static int refresh_cpu_vm_stats(bool do_pagesets)
 
 	for_each_populated_zone(zone) {
 		struct per_cpu_zonestat __percpu *pzstats = zone->per_cpu_zonestats;
-#ifdef CONFIG_NUMA
 		struct per_cpu_pages __percpu *pcp = zone->per_cpu_pageset;
-#endif
 
 		for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++) {
 			int v;
@@ -834,10 +832,12 @@ static int refresh_cpu_vm_stats(bool do_pagesets)
 #endif
 			}
 		}
-#ifdef CONFIG_NUMA
 
 		if (do_pagesets) {
 			cond_resched();
+
+			changes += decay_pcp_high(zone, this_cpu_ptr(pcp));
+#ifdef CONFIG_NUMA
 			/*
 			 * Deal with draining the remote pageset of this
 			 * processor
@@ -866,8 +866,8 @@ static int refresh_cpu_vm_stats(bool do_pagesets)
 				drain_zone_pages(zone, this_cpu_ptr(pcp));
 				changes++;
 			}
-		}
 #endif
+		}
 	}
 
 	for_each_online_pgdat(pgdat) {
-- 
cgit v1.2.3


From 57c0419c5f0ea2ccab8700895c8fac20ba1eb21f Mon Sep 17 00:00:00 2001
From: Huang Ying <ying.huang@intel.com>
Date: Mon, 16 Oct 2023 13:30:01 +0800
Subject: mm, pcp: decrease PCP high if free pages < high watermark

One target of PCP is to minimize pages in PCP if the system free pages is
too few.  To reach that target, when page reclaiming is active for the
zone (ZONE_RECLAIM_ACTIVE), we will stop increasing PCP high in allocating
path, decrease PCP high and free some pages in freeing path.  But this may
be too late because the background page reclaiming may introduce latency
for some workloads.  So, in this patch, during page allocation we will
detect whether the number of free pages of the zone is below high
watermark.  If so, we will stop increasing PCP high in allocating path,
decrease PCP high and free some pages in freeing path.  With this, we can
reduce the possibility of the premature background page reclaiming caused
by too large PCP.

The high watermark checking is done in allocating path to reduce the
overhead in hotter freeing path.

Link: https://lkml.kernel.org/r/20231016053002.756205-9-ying.huang@intel.com
Signed-off-by: "Huang, Ying" <ying.huang@intel.com>
Cc: Mel Gorman <mgorman@techsingularity.net>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: David Hildenbrand <david@redhat.com>
Cc: Johannes Weiner <jweiner@redhat.com>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Pavel Tatashin <pasha.tatashin@soleen.com>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: Christoph Lameter <cl@linux.com>
Cc: Arjan van de Ven <arjan@linux.intel.com>
Cc: Sudeep Holla <sudeep.holla@arm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/mmzone.h |  1 +
 mm/page_alloc.c        | 33 +++++++++++++++++++++++++++++++--
 2 files changed, 32 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 775abc899e80..b92ab001e146 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -1016,6 +1016,7 @@ enum zone_flags {
 					 * Cleared when kswapd is woken.
 					 */
 	ZONE_RECLAIM_ACTIVE,		/* kswapd may be scanning the zone. */
+	ZONE_BELOW_HIGH,		/* zone is below high watermark. */
 };
 
 static inline unsigned long zone_managed_pages(struct zone *zone)
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 83af76a8cef9..58ab8389da05 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -2407,7 +2407,13 @@ static int nr_pcp_high(struct per_cpu_pages *pcp, struct zone *zone,
 		return min(batch << 2, pcp->high);
 	}
 
-	if (pcp->count >= high && high_min != high_max) {
+	if (high_min == high_max)
+		return high;
+
+	if (test_bit(ZONE_BELOW_HIGH, &zone->flags)) {
+		pcp->high = max(high - (batch << pcp->free_factor), high_min);
+		high = max(pcp->count, high_min);
+	} else if (pcp->count >= high) {
 		int need_high = (batch << pcp->free_factor) + batch;
 
 		/* pcp->high should be large enough to hold batch freed pages */
@@ -2457,6 +2463,10 @@ static void free_unref_page_commit(struct zone *zone, struct per_cpu_pages *pcp,
 	if (pcp->count >= high) {
 		free_pcppages_bulk(zone, nr_pcp_free(pcp, batch, high, free_high),
 				   pcp, pindex);
+		if (test_bit(ZONE_BELOW_HIGH, &zone->flags) &&
+		    zone_watermark_ok(zone, 0, high_wmark_pages(zone),
+				      ZONE_MOVABLE, 0))
+			clear_bit(ZONE_BELOW_HIGH, &zone->flags);
 	}
 }
 
@@ -2763,7 +2773,7 @@ static int nr_pcp_alloc(struct per_cpu_pages *pcp, struct zone *zone, int order)
 	 * If we had larger pcp->high, we could avoid to allocate from
 	 * zone.
 	 */
-	if (high_min != high_max && !test_bit(ZONE_RECLAIM_ACTIVE, &zone->flags))
+	if (high_min != high_max && !test_bit(ZONE_BELOW_HIGH, &zone->flags))
 		high = pcp->high = min(high + batch, high_max);
 
 	if (!order) {
@@ -3225,6 +3235,25 @@ retry:
 			}
 		}
 
+		/*
+		 * Detect whether the number of free pages is below high
+		 * watermark.  If so, we will decrease pcp->high and free
+		 * PCP pages in free path to reduce the possibility of
+		 * premature page reclaiming.  Detection is done here to
+		 * avoid to do that in hotter free path.
+		 */
+		if (test_bit(ZONE_BELOW_HIGH, &zone->flags))
+			goto check_alloc_wmark;
+
+		mark = high_wmark_pages(zone);
+		if (zone_watermark_fast(zone, order, mark,
+					ac->highest_zoneidx, alloc_flags,
+					gfp_mask))
+			goto try_this_zone;
+		else
+			set_bit(ZONE_BELOW_HIGH, &zone->flags);
+
+check_alloc_wmark:
 		mark = wmark_pages(zone, alloc_flags & ALLOC_WMARK_MASK);
 		if (!zone_watermark_fast(zone, order, mark,
 				       ac->highest_zoneidx, alloc_flags,
-- 
cgit v1.2.3


From 6ccdcb6d3a741c4e005ca6ffd4a62ddf8b5bead3 Mon Sep 17 00:00:00 2001
From: Huang Ying <ying.huang@intel.com>
Date: Mon, 16 Oct 2023 13:30:02 +0800
Subject: mm, pcp: reduce detecting time of consecutive high order page freeing

In current PCP auto-tuning design, if the number of pages allocated is
much more than that of pages freed on a CPU, the PCP high may become the
maximal value even if the allocating/freeing depth is small, for example,
in the sender of network workloads.  If a CPU was used as sender
originally, then it is used as receiver after context switching, we need
to fill the whole PCP with maximal high before triggering PCP draining for
consecutive high order freeing.  This will hurt the performance of some
network workloads.

To solve the issue, in this patch, we will track the consecutive page
freeing with a counter in stead of relying on PCP draining.  So, we can
detect consecutive page freeing much earlier.

On a 2-socket Intel server with 128 logical CPU, we tested
SCTP_STREAM_MANY test case of netperf test suite with 64-pair processes.
With the patch, the network bandwidth improves 5.0%.  This restores the
performance drop caused by PCP auto-tuning.

Link: https://lkml.kernel.org/r/20231016053002.756205-10-ying.huang@intel.com
Signed-off-by: "Huang, Ying" <ying.huang@intel.com>
Cc: Mel Gorman <mgorman@techsingularity.net>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: David Hildenbrand <david@redhat.com>
Cc: Johannes Weiner <jweiner@redhat.com>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Pavel Tatashin <pasha.tatashin@soleen.com>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: Christoph Lameter <cl@linux.com>
Cc: Arjan van de Ven <arjan@linux.intel.com>
Cc: Sudeep Holla <sudeep.holla@arm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/mmzone.h |  2 +-
 mm/page_alloc.c        | 27 +++++++++++++++------------
 2 files changed, 16 insertions(+), 13 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index b92ab001e146..3c25226beeed 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -698,10 +698,10 @@ struct per_cpu_pages {
 	int batch;		/* chunk size for buddy add/remove */
 	u8 flags;		/* protected by pcp->lock */
 	u8 alloc_factor;	/* batch scaling factor during allocate */
-	u8 free_factor;		/* batch scaling factor during free */
 #ifdef CONFIG_NUMA
 	u8 expire;		/* When 0, remote pagesets are drained */
 #endif
+	short free_count;	/* consecutive free count */
 
 	/* Lists of pages, one per migrate type stored on the pcp-lists */
 	struct list_head lists[NR_PCP_LISTS];
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 58ab8389da05..d52718284029 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -2369,13 +2369,10 @@ static int nr_pcp_free(struct per_cpu_pages *pcp, int batch, int high, bool free
 	max_nr_free = high - batch;
 
 	/*
-	 * Double the number of pages freed each time there is subsequent
-	 * freeing of pages without any allocation.
+	 * Increase the batch number to the number of the consecutive
+	 * freed pages to reduce zone lock contention.
 	 */
-	batch <<= pcp->free_factor;
-	if (batch <= max_nr_free && pcp->free_factor < CONFIG_PCP_BATCH_SCALE_MAX)
-		pcp->free_factor++;
-	batch = clamp(batch, min_nr_free, max_nr_free);
+	batch = clamp_t(int, pcp->free_count, min_nr_free, max_nr_free);
 
 	return batch;
 }
@@ -2403,7 +2400,9 @@ static int nr_pcp_high(struct per_cpu_pages *pcp, struct zone *zone,
 	 * stored on pcp lists
 	 */
 	if (test_bit(ZONE_RECLAIM_ACTIVE, &zone->flags)) {
-		pcp->high = max(high - (batch << pcp->free_factor), high_min);
+		int free_count = max_t(int, pcp->free_count, batch);
+
+		pcp->high = max(high - free_count, high_min);
 		return min(batch << 2, pcp->high);
 	}
 
@@ -2411,10 +2410,12 @@ static int nr_pcp_high(struct per_cpu_pages *pcp, struct zone *zone,
 		return high;
 
 	if (test_bit(ZONE_BELOW_HIGH, &zone->flags)) {
-		pcp->high = max(high - (batch << pcp->free_factor), high_min);
+		int free_count = max_t(int, pcp->free_count, batch);
+
+		pcp->high = max(high - free_count, high_min);
 		high = max(pcp->count, high_min);
 	} else if (pcp->count >= high) {
-		int need_high = (batch << pcp->free_factor) + batch;
+		int need_high = pcp->free_count + batch;
 
 		/* pcp->high should be large enough to hold batch freed pages */
 		if (pcp->high < need_high)
@@ -2451,7 +2452,7 @@ static void free_unref_page_commit(struct zone *zone, struct per_cpu_pages *pcp,
 	 * stops will be drained from vmstat refresh context.
 	 */
 	if (order && order <= PAGE_ALLOC_COSTLY_ORDER) {
-		free_high = (pcp->free_factor &&
+		free_high = (pcp->free_count >= batch &&
 			     (pcp->flags & PCPF_PREV_FREE_HIGH_ORDER) &&
 			     (!(pcp->flags & PCPF_FREE_HIGH_BATCH) ||
 			      pcp->count >= READ_ONCE(batch)));
@@ -2459,6 +2460,8 @@ static void free_unref_page_commit(struct zone *zone, struct per_cpu_pages *pcp,
 	} else if (pcp->flags & PCPF_PREV_FREE_HIGH_ORDER) {
 		pcp->flags &= ~PCPF_PREV_FREE_HIGH_ORDER;
 	}
+	if (pcp->free_count < (batch << CONFIG_PCP_BATCH_SCALE_MAX))
+		pcp->free_count += (1 << order);
 	high = nr_pcp_high(pcp, zone, batch, free_high);
 	if (pcp->count >= high) {
 		free_pcppages_bulk(zone, nr_pcp_free(pcp, batch, high, free_high),
@@ -2855,7 +2858,7 @@ static struct page *rmqueue_pcplist(struct zone *preferred_zone,
 	 * See nr_pcp_free() where free_factor is increased for subsequent
 	 * frees.
 	 */
-	pcp->free_factor >>= 1;
+	pcp->free_count >>= 1;
 	list = &pcp->lists[order_to_pindex(migratetype, order)];
 	page = __rmqueue_pcplist(zone, order, migratetype, alloc_flags, pcp, list);
 	pcp_spin_unlock(pcp);
@@ -5488,7 +5491,7 @@ static void per_cpu_pages_init(struct per_cpu_pages *pcp, struct per_cpu_zonesta
 	pcp->high_min = BOOT_PAGESET_HIGH;
 	pcp->high_max = BOOT_PAGESET_HIGH;
 	pcp->batch = BOOT_PAGESET_BATCH;
-	pcp->free_factor = 0;
+	pcp->free_count = 0;
 }
 
 static void __zone_set_pageset_high_and_batch(struct zone *zone, unsigned long high_min,
-- 
cgit v1.2.3


From 1aacbd354313f25c855e662e41c04e2abf71444a Mon Sep 17 00:00:00 2001
From: Roman Gushchin <roman.gushchin@linux.dev>
Date: Thu, 19 Oct 2023 15:53:42 -0700
Subject: mm: kmem: add direct objcg pointer to task_struct

To charge a freshly allocated kernel object to a memory cgroup, the kernel
needs to obtain an objcg pointer.  Currently it does it indirectly by
obtaining the memcg pointer first and then calling to
__get_obj_cgroup_from_memcg().

Usually tasks spend their entire life belonging to the same object cgroup.
So it makes sense to save the objcg pointer on task_struct directly, so
it can be obtained faster.  It requires some work on fork, exit and cgroup
migrate paths, but these paths are way colder.

To avoid any costly synchronization the following rules are applied:
1) A task sets it's objcg pointer itself.

2) If a task is being migrated to another cgroup, the least
   significant bit of the objcg pointer is set atomically.

3) On the allocation path the objcg pointer is obtained locklessly
   using the READ_ONCE() macro and the least significant bit is
   checked. If it's set, the following procedure is used to update
   it locklessly:
       - task->objcg is zeroed using cmpxcg
       - new objcg pointer is obtained
       - task->objcg is updated using try_cmpxchg
       - operation is repeated if try_cmpxcg fails
   It guarantees that no updates will be lost if task migration
   is racing against objcg pointer update. It also allows to keep
   both read and write paths fully lockless.

Because the task is keeping a reference to the objcg, it can't go away
while the task is alive.

This commit doesn't change the way the remote memcg charging works.

Link: https://lkml.kernel.org/r/20231019225346.1822282-3-roman.gushchin@linux.dev
Signed-off-by: Roman Gushchin (Cruise) <roman.gushchin@linux.dev>
Tested-by: Naresh Kamboju <naresh.kamboju@linaro.org>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Acked-by: Shakeel Butt <shakeelb@google.com>
Reviewed-by: Vlastimil Babka <vbabka@suse.cz>
Cc: David Rientjes <rientjes@google.com>
Cc: Dennis Zhou <dennis@kernel.org>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Muchun Song <muchun.song@linux.dev>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/sched.h |   4 ++
 mm/memcontrol.c       | 139 ++++++++++++++++++++++++++++++++++++++++++++++----
 2 files changed, 134 insertions(+), 9 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 77f01ac385f7..60de42715b56 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1443,6 +1443,10 @@ struct task_struct {
 	struct mem_cgroup		*active_memcg;
 #endif
 
+#ifdef CONFIG_MEMCG_KMEM
+	struct obj_cgroup		*objcg;
+#endif
+
 #ifdef CONFIG_BLK_CGROUP
 	struct gendisk			*throttle_disk;
 #endif
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 975ba766c16f..96f4c319f025 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -249,6 +249,9 @@ struct mem_cgroup *vmpressure_to_memcg(struct vmpressure *vmpr)
 	return container_of(vmpr, struct mem_cgroup, vmpressure);
 }
 
+#define CURRENT_OBJCG_UPDATE_BIT 0
+#define CURRENT_OBJCG_UPDATE_FLAG (1UL << CURRENT_OBJCG_UPDATE_BIT)
+
 #ifdef CONFIG_MEMCG_KMEM
 static DEFINE_SPINLOCK(objcg_lock);
 
@@ -3083,6 +3086,58 @@ static struct obj_cgroup *__get_obj_cgroup_from_memcg(struct mem_cgroup *memcg)
 	return objcg;
 }
 
+static struct obj_cgroup *current_objcg_update(void)
+{
+	struct mem_cgroup *memcg;
+	struct obj_cgroup *old, *objcg = NULL;
+
+	do {
+		/* Atomically drop the update bit. */
+		old = xchg(&current->objcg, NULL);
+		if (old) {
+			old = (struct obj_cgroup *)
+				((unsigned long)old & ~CURRENT_OBJCG_UPDATE_FLAG);
+			if (old)
+				obj_cgroup_put(old);
+
+			old = NULL;
+		}
+
+		/* If new objcg is NULL, no reason for the second atomic update. */
+		if (!current->mm || (current->flags & PF_KTHREAD))
+			return NULL;
+
+		/*
+		 * Release the objcg pointer from the previous iteration,
+		 * if try_cmpxcg() below fails.
+		 */
+		if (unlikely(objcg)) {
+			obj_cgroup_put(objcg);
+			objcg = NULL;
+		}
+
+		/*
+		 * Obtain the new objcg pointer. The current task can be
+		 * asynchronously moved to another memcg and the previous
+		 * memcg can be offlined. So let's get the memcg pointer
+		 * and try get a reference to objcg under a rcu read lock.
+		 */
+
+		rcu_read_lock();
+		memcg = mem_cgroup_from_task(current);
+		objcg = __get_obj_cgroup_from_memcg(memcg);
+		rcu_read_unlock();
+
+		/*
+		 * Try set up a new objcg pointer atomically. If it
+		 * fails, it means the update flag was set concurrently, so
+		 * the whole procedure should be repeated.
+		 */
+	} while (!try_cmpxchg(&current->objcg, &old, objcg));
+
+	return objcg;
+}
+
 __always_inline struct obj_cgroup *get_obj_cgroup_from_current(void)
 {
 	struct mem_cgroup *memcg;
@@ -3090,19 +3145,26 @@ __always_inline struct obj_cgroup *get_obj_cgroup_from_current(void)
 
 	if (in_task()) {
 		memcg = current->active_memcg;
+		if (unlikely(memcg))
+			goto from_memcg;
 
-		/* Memcg to charge can't be determined. */
-		if (likely(!memcg) && (!current->mm || (current->flags & PF_KTHREAD)))
-			return NULL;
+		objcg = READ_ONCE(current->objcg);
+		if (unlikely((unsigned long)objcg & CURRENT_OBJCG_UPDATE_FLAG))
+			objcg = current_objcg_update();
+
+		if (objcg) {
+			obj_cgroup_get(objcg);
+			return objcg;
+		}
 	} else {
 		memcg = this_cpu_read(int_active_memcg);
-		if (likely(!memcg))
-			return NULL;
+		if (unlikely(memcg))
+			goto from_memcg;
 	}
+	return NULL;
 
+from_memcg:
 	rcu_read_lock();
-	if (!memcg)
-		memcg = mem_cgroup_from_task(current);
 	objcg = __get_obj_cgroup_from_memcg(memcg);
 	rcu_read_unlock();
 	return objcg;
@@ -6440,6 +6502,7 @@ static void mem_cgroup_move_task(void)
 		mem_cgroup_clear_mc();
 	}
 }
+
 #else	/* !CONFIG_MMU */
 static int mem_cgroup_can_attach(struct cgroup_taskset *tset)
 {
@@ -6453,8 +6516,39 @@ static void mem_cgroup_move_task(void)
 }
 #endif
 
+#ifdef CONFIG_MEMCG_KMEM
+static void mem_cgroup_fork(struct task_struct *task)
+{
+	/*
+	 * Set the update flag to cause task->objcg to be initialized lazily
+	 * on the first allocation. It can be done without any synchronization
+	 * because it's always performed on the current task, so does
+	 * current_objcg_update().
+	 */
+	task->objcg = (struct obj_cgroup *)CURRENT_OBJCG_UPDATE_FLAG;
+}
+
+static void mem_cgroup_exit(struct task_struct *task)
+{
+	struct obj_cgroup *objcg = task->objcg;
+
+	objcg = (struct obj_cgroup *)
+		((unsigned long)objcg & ~CURRENT_OBJCG_UPDATE_FLAG);
+	if (objcg)
+		obj_cgroup_put(objcg);
+
+	/*
+	 * Some kernel allocations can happen after this point,
+	 * but let's ignore them. It can be done without any synchronization
+	 * because it's always performed on the current task, so does
+	 * current_objcg_update().
+	 */
+	task->objcg = NULL;
+}
+#endif
+
 #ifdef CONFIG_LRU_GEN
-static void mem_cgroup_attach(struct cgroup_taskset *tset)
+static void mem_cgroup_lru_gen_attach(struct cgroup_taskset *tset)
 {
 	struct task_struct *task;
 	struct cgroup_subsys_state *css;
@@ -6472,10 +6566,31 @@ static void mem_cgroup_attach(struct cgroup_taskset *tset)
 	task_unlock(task);
 }
 #else
+static void mem_cgroup_lru_gen_attach(struct cgroup_taskset *tset) {}
+#endif /* CONFIG_LRU_GEN */
+
+#ifdef CONFIG_MEMCG_KMEM
+static void mem_cgroup_kmem_attach(struct cgroup_taskset *tset)
+{
+	struct task_struct *task;
+	struct cgroup_subsys_state *css;
+
+	cgroup_taskset_for_each(task, css, tset) {
+		/* atomically set the update bit */
+		set_bit(CURRENT_OBJCG_UPDATE_BIT, (unsigned long *)&task->objcg);
+	}
+}
+#else
+static void mem_cgroup_kmem_attach(struct cgroup_taskset *tset) {}
+#endif /* CONFIG_MEMCG_KMEM */
+
+#if defined(CONFIG_LRU_GEN) || defined(CONFIG_MEMCG_KMEM)
 static void mem_cgroup_attach(struct cgroup_taskset *tset)
 {
+	mem_cgroup_lru_gen_attach(tset);
+	mem_cgroup_kmem_attach(tset);
 }
-#endif /* CONFIG_LRU_GEN */
+#endif
 
 static int seq_puts_memcg_tunable(struct seq_file *m, unsigned long value)
 {
@@ -6885,9 +7000,15 @@ struct cgroup_subsys memory_cgrp_subsys = {
 	.css_reset = mem_cgroup_css_reset,
 	.css_rstat_flush = mem_cgroup_css_rstat_flush,
 	.can_attach = mem_cgroup_can_attach,
+#if defined(CONFIG_LRU_GEN) || defined(CONFIG_MEMCG_KMEM)
 	.attach = mem_cgroup_attach,
+#endif
 	.cancel_attach = mem_cgroup_cancel_attach,
 	.post_attach = mem_cgroup_move_task,
+#ifdef CONFIG_MEMCG_KMEM
+	.fork = mem_cgroup_fork,
+	.exit = mem_cgroup_exit,
+#endif
 	.dfl_cftypes = memory_files,
 	.legacy_cftypes = mem_cgroup_legacy_files,
 	.early_init = 0,
-- 
cgit v1.2.3


From 675d6c9b59e313ca2573c93e8fd87011a99bb8ce Mon Sep 17 00:00:00 2001
From: Roman Gushchin <roman.gushchin@linux.dev>
Date: Thu, 19 Oct 2023 15:53:43 -0700
Subject: mm: kmem: make memcg keep a reference to the original objcg

Keep a reference to the original objcg object for the entire life of a
memcg structure.

This allows to simplify the synchronization on the kernel memory
allocation paths: pinning a (live) memcg will also pin the corresponding
objcg.

The memory overhead of this change is minimal because object cgroups
usually outlive their corresponding memory cgroups even without this
change, so it's only an additional pointer per memcg.

Link: https://lkml.kernel.org/r/20231019225346.1822282-4-roman.gushchin@linux.dev
Signed-off-by: Roman Gushchin (Cruise) <roman.gushchin@linux.dev>
Tested-by: Naresh Kamboju <naresh.kamboju@linaro.org>
Acked-by: Shakeel Butt <shakeelb@google.com>
Reviewed-by: Vlastimil Babka <vbabka@suse.cz>
Cc: David Rientjes <rientjes@google.com>
Cc: Dennis Zhou <dennis@kernel.org>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Muchun Song <muchun.song@linux.dev>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/memcontrol.h | 8 +++++++-
 mm/memcontrol.c            | 5 +++++
 2 files changed, 12 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 6674c12725d5..cc110cc8fdfc 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -289,7 +289,13 @@ struct mem_cgroup {
 
 #ifdef CONFIG_MEMCG_KMEM
 	int kmemcg_id;
-	struct obj_cgroup __rcu *objcg;
+	/*
+	 * memcg->objcg is wiped out as a part of the objcg repaprenting
+	 * process. memcg->orig_objcg preserves a pointer (and a reference)
+	 * to the original objcg until the end of live of memcg.
+	 */
+	struct obj_cgroup __rcu	*objcg;
+	struct obj_cgroup	*orig_objcg;
 	/* list of inherited objcgs, protected by objcg_lock */
 	struct list_head objcg_list;
 #endif
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 96f4c319f025..ff036d5d339d 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -3899,6 +3899,8 @@ static int memcg_online_kmem(struct mem_cgroup *memcg)
 
 	objcg->memcg = memcg;
 	rcu_assign_pointer(memcg->objcg, objcg);
+	obj_cgroup_get(objcg);
+	memcg->orig_objcg = objcg;
 
 	static_branch_enable(&memcg_kmem_online_key);
 
@@ -5406,6 +5408,9 @@ static void __mem_cgroup_free(struct mem_cgroup *memcg)
 {
 	int node;
 
+	if (memcg->orig_objcg)
+		obj_cgroup_put(memcg->orig_objcg);
+
 	for_each_node(node)
 		free_mem_cgroup_per_node_info(memcg, node);
 	kfree(memcg->vmstats);
-- 
cgit v1.2.3


From e86828e5446d95676835679837d995dec188d2be Mon Sep 17 00:00:00 2001
From: Roman Gushchin <roman.gushchin@linux.dev>
Date: Thu, 19 Oct 2023 15:53:44 -0700
Subject: mm: kmem: scoped objcg protection

Switch to a scope-based protection of the objcg pointer on slab/kmem
allocation paths.  Instead of using the get_() semantics in the
pre-allocation hook and put the reference afterwards, let's rely on the
fact that objcg is pinned by the scope.

It's possible because:
1) if the objcg is received from the current task struct, the task is
   keeping a reference to the objcg.
2) if the objcg is received from an active memcg (remote charging),
   the memcg is pinned by the scope and has a reference to the
   corresponding objcg.

Link: https://lkml.kernel.org/r/20231019225346.1822282-5-roman.gushchin@linux.dev
Signed-off-by: Roman Gushchin (Cruise) <roman.gushchin@linux.dev>
Tested-by: Naresh Kamboju <naresh.kamboju@linaro.org>
Acked-by: Shakeel Butt <shakeelb@google.com>
Reviewed-by: Vlastimil Babka <vbabka@suse.cz>
Cc: David Rientjes <rientjes@google.com>
Cc: Dennis Zhou <dennis@kernel.org>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Muchun Song <muchun.song@linux.dev>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/memcontrol.h |  9 +++++++++
 include/linux/sched/mm.h   |  4 ++++
 mm/memcontrol.c            | 47 ++++++++++++++++++++++++++++++++++++++++++++--
 mm/slab.h                  | 15 ++++++++-------
 4 files changed, 66 insertions(+), 9 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index cc110cc8fdfc..8006bc3bd7bf 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -1796,6 +1796,15 @@ bool mem_cgroup_kmem_disabled(void);
 int __memcg_kmem_charge_page(struct page *page, gfp_t gfp, int order);
 void __memcg_kmem_uncharge_page(struct page *page, int order);
 
+/*
+ * The returned objcg pointer is safe to use without additional
+ * protection within a scope. The scope is defined either by
+ * the current task (similar to the "current" global variable)
+ * or by set_active_memcg() pair.
+ * Please, use obj_cgroup_get() to get a reference if the pointer
+ * needs to be used outside of the local scope.
+ */
+struct obj_cgroup *current_obj_cgroup(void);
 struct obj_cgroup *get_obj_cgroup_from_current(void);
 struct obj_cgroup *get_obj_cgroup_from_folio(struct folio *folio);
 
diff --git a/include/linux/sched/mm.h b/include/linux/sched/mm.h
index 8d89c8c4fac1..9a19f1b42f64 100644
--- a/include/linux/sched/mm.h
+++ b/include/linux/sched/mm.h
@@ -403,6 +403,10 @@ DECLARE_PER_CPU(struct mem_cgroup *, int_active_memcg);
  * __GFP_ACCOUNT allocations till the end of the scope will be charged to the
  * given memcg.
  *
+ * Please, make sure that caller has a reference to the passed memcg structure,
+ * so its lifetime is guaranteed to exceed the scope between two
+ * set_active_memcg() calls.
+ *
  * NOTE: This function can nest. Users must save the return value and
  * reset the previous value after their own charging scope is over.
  */
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index ff036d5d339d..a6457c8b5e16 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -3170,6 +3170,49 @@ from_memcg:
 	return objcg;
 }
 
+__always_inline struct obj_cgroup *current_obj_cgroup(void)
+{
+	struct mem_cgroup *memcg;
+	struct obj_cgroup *objcg;
+
+	if (in_task()) {
+		memcg = current->active_memcg;
+		if (unlikely(memcg))
+			goto from_memcg;
+
+		objcg = READ_ONCE(current->objcg);
+		if (unlikely((unsigned long)objcg & CURRENT_OBJCG_UPDATE_FLAG))
+			objcg = current_objcg_update();
+		/*
+		 * Objcg reference is kept by the task, so it's safe
+		 * to use the objcg by the current task.
+		 */
+		return objcg;
+	}
+
+	memcg = this_cpu_read(int_active_memcg);
+	if (unlikely(memcg))
+		goto from_memcg;
+
+	return NULL;
+
+from_memcg:
+	for (; !mem_cgroup_is_root(memcg); memcg = parent_mem_cgroup(memcg)) {
+		/*
+		 * Memcg pointer is protected by scope (see set_active_memcg())
+		 * and is pinning the corresponding objcg, so objcg can't go
+		 * away and can be used within the scope without any additional
+		 * protection.
+		 */
+		objcg = rcu_dereference_check(memcg->objcg, 1);
+		if (likely(objcg))
+			break;
+		objcg = NULL;
+	}
+
+	return objcg;
+}
+
 struct obj_cgroup *get_obj_cgroup_from_folio(struct folio *folio)
 {
 	struct obj_cgroup *objcg;
@@ -3264,15 +3307,15 @@ int __memcg_kmem_charge_page(struct page *page, gfp_t gfp, int order)
 	struct obj_cgroup *objcg;
 	int ret = 0;
 
-	objcg = get_obj_cgroup_from_current();
+	objcg = current_obj_cgroup();
 	if (objcg) {
 		ret = obj_cgroup_charge_pages(objcg, gfp, 1 << order);
 		if (!ret) {
+			obj_cgroup_get(objcg);
 			page->memcg_data = (unsigned long)objcg |
 				MEMCG_DATA_KMEM;
 			return 0;
 		}
-		obj_cgroup_put(objcg);
 	}
 	return ret;
 }
diff --git a/mm/slab.h b/mm/slab.h
index 799a315695c6..3d07fb428393 100644
--- a/mm/slab.h
+++ b/mm/slab.h
@@ -484,7 +484,12 @@ static inline bool memcg_slab_pre_alloc_hook(struct kmem_cache *s,
 	if (!(flags & __GFP_ACCOUNT) && !(s->flags & SLAB_ACCOUNT))
 		return true;
 
-	objcg = get_obj_cgroup_from_current();
+	/*
+	 * The obtained objcg pointer is safe to use within the current scope,
+	 * defined by current task or set_active_memcg() pair.
+	 * obj_cgroup_get() is used to get a permanent reference.
+	 */
+	objcg = current_obj_cgroup();
 	if (!objcg)
 		return true;
 
@@ -497,17 +502,14 @@ static inline bool memcg_slab_pre_alloc_hook(struct kmem_cache *s,
 		css_put(&memcg->css);
 
 		if (ret)
-			goto out;
+			return false;
 	}
 
 	if (obj_cgroup_charge(objcg, flags, objects * obj_full_size(s)))
-		goto out;
+		return false;
 
 	*objcgp = objcg;
 	return true;
-out:
-	obj_cgroup_put(objcg);
-	return false;
 }
 
 static inline void memcg_slab_post_alloc_hook(struct kmem_cache *s,
@@ -542,7 +544,6 @@ static inline void memcg_slab_post_alloc_hook(struct kmem_cache *s,
 			obj_cgroup_uncharge(objcg, obj_full_size(s));
 		}
 	}
-	obj_cgroup_put(objcg);
 }
 
 static inline void memcg_slab_free_hook(struct kmem_cache *s, struct slab *slab,
-- 
cgit v1.2.3


From e56808fef8f71a192b2740c0b6ea8be7ab865d54 Mon Sep 17 00:00:00 2001
From: Roman Gushchin <roman.gushchin@linux.dev>
Date: Thu, 19 Oct 2023 15:53:46 -0700
Subject: mm: kmem: reimplement get_obj_cgroup_from_current()

Reimplement get_obj_cgroup_from_current() using current_obj_cgroup().
get_obj_cgroup_from_current() and current_obj_cgroup() share 80% of the
code, so the new implementation is almost trivial.

get_obj_cgroup_from_current() is a convenient function used by the
bpf subsystem, so there is no reason to get rid of it completely.

Link: https://lkml.kernel.org/r/20231019225346.1822282-7-roman.gushchin@linux.dev
Signed-off-by: Roman Gushchin (Cruise) <roman.gushchin@linux.dev>
Reviewed-by: Vlastimil Babka <vbabka@suse.cz>
Acked-by: Shakeel Butt <shakeelb@google.com>
Cc: David Rientjes <rientjes@google.com>
Cc: Dennis Zhou <dennis@kernel.org>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Muchun Song <muchun.song@linux.dev>
Cc: Naresh Kamboju <naresh.kamboju@linaro.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/memcontrol.h | 11 ++++++++++-
 mm/memcontrol.c            | 32 --------------------------------
 2 files changed, 10 insertions(+), 33 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 8006bc3bd7bf..b3d2e3e60eed 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -1805,9 +1805,18 @@ void __memcg_kmem_uncharge_page(struct page *page, int order);
  * needs to be used outside of the local scope.
  */
 struct obj_cgroup *current_obj_cgroup(void);
-struct obj_cgroup *get_obj_cgroup_from_current(void);
 struct obj_cgroup *get_obj_cgroup_from_folio(struct folio *folio);
 
+static inline struct obj_cgroup *get_obj_cgroup_from_current(void)
+{
+	struct obj_cgroup *objcg = current_obj_cgroup();
+
+	if (objcg)
+		obj_cgroup_get(objcg);
+
+	return objcg;
+}
+
 int obj_cgroup_charge(struct obj_cgroup *objcg, gfp_t gfp, size_t size);
 void obj_cgroup_uncharge(struct obj_cgroup *objcg, size_t size);
 
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index a6457c8b5e16..8b0859b8cc03 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -3138,38 +3138,6 @@ static struct obj_cgroup *current_objcg_update(void)
 	return objcg;
 }
 
-__always_inline struct obj_cgroup *get_obj_cgroup_from_current(void)
-{
-	struct mem_cgroup *memcg;
-	struct obj_cgroup *objcg;
-
-	if (in_task()) {
-		memcg = current->active_memcg;
-		if (unlikely(memcg))
-			goto from_memcg;
-
-		objcg = READ_ONCE(current->objcg);
-		if (unlikely((unsigned long)objcg & CURRENT_OBJCG_UPDATE_FLAG))
-			objcg = current_objcg_update();
-
-		if (objcg) {
-			obj_cgroup_get(objcg);
-			return objcg;
-		}
-	} else {
-		memcg = this_cpu_read(int_active_memcg);
-		if (unlikely(memcg))
-			goto from_memcg;
-	}
-	return NULL;
-
-from_memcg:
-	rcu_read_lock();
-	objcg = __get_obj_cgroup_from_memcg(memcg);
-	rcu_read_unlock();
-	return objcg;
-}
-
 __always_inline struct obj_cgroup *current_obj_cgroup(void)
 {
 	struct mem_cgroup *memcg;
-- 
cgit v1.2.3


From 1d44f2e6d178163a94980fd5f9a4b04b6b36535b Mon Sep 17 00:00:00 2001
From: Kefeng Wang <wangkefeng.wang@huawei.com>
Date: Wed, 18 Oct 2023 22:07:48 +0800
Subject: mm_types: add virtual and _last_cpupid into struct folio

Patch series "mm: convert page cpupid functions to folios", v3.

The cpupid(or access time) used by numa balancing is stored in flags or
_last_cpupid(if LAST_CPUPID_NOT_IN_PAGE_FLAGS) of page, this is to convert
page cpupid to folio cpupid, a new _last_cpupid is added into folio, which
make us to use folio->_last_cpupid directly, and the page cpupid functions
are converted to folio ones.

  page_cpupid_last()		-> folio_last_cpupid()
  xchg_page_access_time()	-> folio_xchg_access_time()
  page_cpupid_xchg_last()	-> folio_xchg_last_cpupid()


This patch (of 19):

If WANT_PAGE_VIRTUAL and LAST_CPUPID_NOT_IN_PAGE_FLAGS defined, the
'virtual' and '_last_cpupid' are in struct page, and since _last_cpupid is
used by numa balancing feature, it is better to move it before KMSAN
metadata from struct page, also add them into struct folio to make us to
access them from folio directly.

Link: https://lkml.kernel.org/r/20231018140806.2783514-1-wangkefeng.wang@huawei.com
Link: https://lkml.kernel.org/r/20231018140806.2783514-2-wangkefeng.wang@huawei.com
Signed-off-by: Kefeng Wang <wangkefeng.wang@huawei.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: Huang Ying <ying.huang@intel.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Juri Lelli <juri.lelli@redhat.com>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Vincent Guittot <vincent.guittot@linaro.org>
Cc: Zi Yan <ziy@nvidia.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/mm_types.h | 22 ++++++++++++++++++----
 1 file changed, 18 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 21eb56145f57..692e41213cd3 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -199,6 +199,10 @@ struct page {
 					   not kmapped, ie. highmem) */
 #endif /* WANT_PAGE_VIRTUAL */
 
+#ifdef LAST_CPUPID_NOT_IN_PAGE_FLAGS
+	int _last_cpupid;
+#endif
+
 #ifdef CONFIG_KMSAN
 	/*
 	 * KMSAN metadata for this page:
@@ -210,10 +214,6 @@ struct page {
 	struct page *kmsan_shadow;
 	struct page *kmsan_origin;
 #endif
-
-#ifdef LAST_CPUPID_NOT_IN_PAGE_FLAGS
-	int _last_cpupid;
-#endif
 } _struct_page_alignment;
 
 /*
@@ -272,6 +272,8 @@ typedef struct {
  * @_refcount: Do not access this member directly.  Use folio_ref_count()
  *    to find how many references there are to this folio.
  * @memcg_data: Memory Control Group data.
+ * @virtual: Virtual address in the kernel direct map.
+ * @_last_cpupid: IDs of last CPU and last process that accessed the folio.
  * @_entire_mapcount: Do not use directly, call folio_entire_mapcount().
  * @_nr_pages_mapped: Do not use directly, call folio_mapcount().
  * @_pincount: Do not use directly, call folio_maybe_dma_pinned().
@@ -317,6 +319,12 @@ struct folio {
 			atomic_t _refcount;
 #ifdef CONFIG_MEMCG
 			unsigned long memcg_data;
+#endif
+#if defined(WANT_PAGE_VIRTUAL)
+			void *virtual;
+#endif
+#ifdef LAST_CPUPID_NOT_IN_PAGE_FLAGS
+			int _last_cpupid;
 #endif
 	/* private: the union with struct page is transitional */
 		};
@@ -373,6 +381,12 @@ FOLIO_MATCH(_refcount, _refcount);
 #ifdef CONFIG_MEMCG
 FOLIO_MATCH(memcg_data, memcg_data);
 #endif
+#if defined(WANT_PAGE_VIRTUAL)
+FOLIO_MATCH(virtual, virtual);
+#endif
+#ifdef LAST_CPUPID_NOT_IN_PAGE_FLAGS
+FOLIO_MATCH(_last_cpupid, _last_cpupid);
+#endif
 #undef FOLIO_MATCH
 #define FOLIO_MATCH(pg, fl)						\
 	static_assert(offsetof(struct folio, fl) ==			\
-- 
cgit v1.2.3


From 155c98cfcf961327adedabd629edfc2301cf354b Mon Sep 17 00:00:00 2001
From: Kefeng Wang <wangkefeng.wang@huawei.com>
Date: Wed, 18 Oct 2023 22:07:49 +0800
Subject: mm: add folio_last_cpupid()

Add folio_last_cpupid() wrapper, which is required to convert
page_cpupid_last() to folio vertion later in the series.

Link: https://lkml.kernel.org/r/20231018140806.2783514-3-wangkefeng.wang@huawei.com
Signed-off-by: Kefeng Wang <wangkefeng.wang@huawei.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: Huang Ying <ying.huang@intel.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Juri Lelli <juri.lelli@redhat.com>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Vincent Guittot <vincent.guittot@linaro.org>
Cc: Zi Yan <ziy@nvidia.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/mm.h | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 86e040e886e4..0c4ff852930b 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1796,6 +1796,11 @@ static inline void vma_set_access_pid_bit(struct vm_area_struct *vma)
 }
 #endif /* CONFIG_NUMA_BALANCING */
 
+static inline int folio_last_cpupid(struct folio *folio)
+{
+	return page_cpupid_last(&folio->page);
+}
+
 #if defined(CONFIG_KASAN_SW_TAGS) || defined(CONFIG_KASAN_HW_TAGS)
 
 /*
-- 
cgit v1.2.3


From f39eac30a8f334f0765ef78fe4d13b3fd5bfa3fd Mon Sep 17 00:00:00 2001
From: Kefeng Wang <wangkefeng.wang@huawei.com>
Date: Wed, 18 Oct 2023 22:07:53 +0800
Subject: mm: remove page_cpupid_last()

Since all calls use folio_last_cpupid(), remove page_cpupid_last().

Link: https://lkml.kernel.org/r/20231018140806.2783514-7-wangkefeng.wang@huawei.com
Signed-off-by: Kefeng Wang <wangkefeng.wang@huawei.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: Huang Ying <ying.huang@intel.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Juri Lelli <juri.lelli@redhat.com>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Vincent Guittot <vincent.guittot@linaro.org>
Cc: Zi Yan <ziy@nvidia.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/mm.h | 17 ++++++-----------
 1 file changed, 6 insertions(+), 11 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 0c4ff852930b..f8b7a1825720 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1702,18 +1702,18 @@ static inline int page_cpupid_xchg_last(struct page *page, int cpupid)
 	return xchg(&page->_last_cpupid, cpupid & LAST_CPUPID_MASK);
 }
 
-static inline int page_cpupid_last(struct page *page)
+static inline int folio_last_cpupid(struct folio *folio)
 {
-	return page->_last_cpupid;
+	return folio->_last_cpupid;
 }
 static inline void page_cpupid_reset_last(struct page *page)
 {
 	page->_last_cpupid = -1 & LAST_CPUPID_MASK;
 }
 #else
-static inline int page_cpupid_last(struct page *page)
+static inline int folio_last_cpupid(struct folio *folio)
 {
-	return (page->flags >> LAST_CPUPID_PGSHIFT) & LAST_CPUPID_MASK;
+	return (folio->flags >> LAST_CPUPID_PGSHIFT) & LAST_CPUPID_MASK;
 }
 
 extern int page_cpupid_xchg_last(struct page *page, int cpupid);
@@ -1752,9 +1752,9 @@ static inline int xchg_page_access_time(struct page *page, int time)
 	return 0;
 }
 
-static inline int page_cpupid_last(struct page *page)
+static inline int folio_last_cpupid(struct folio *folio)
 {
-	return page_to_nid(page); /* XXX */
+	return folio_nid(folio); /* XXX */
 }
 
 static inline int cpupid_to_nid(int cpupid)
@@ -1796,11 +1796,6 @@ static inline void vma_set_access_pid_bit(struct vm_area_struct *vma)
 }
 #endif /* CONFIG_NUMA_BALANCING */
 
-static inline int folio_last_cpupid(struct folio *folio)
-{
-	return page_cpupid_last(&folio->page);
-}
-
 #if defined(CONFIG_KASAN_SW_TAGS) || defined(CONFIG_KASAN_HW_TAGS)
 
 /*
-- 
cgit v1.2.3


From 55c199385c4465e9abe1a3d6d1aba348d0356e03 Mon Sep 17 00:00:00 2001
From: Kefeng Wang <wangkefeng.wang@huawei.com>
Date: Wed, 18 Oct 2023 22:07:54 +0800
Subject: mm: add folio_xchg_access_time()

Add folio_xchg_access_time() wrapper, which is required to convert
xchg_page_access_time() to folio vertion later in the series.

Link: https://lkml.kernel.org/r/20231018140806.2783514-8-wangkefeng.wang@huawei.com
Signed-off-by: Kefeng Wang <wangkefeng.wang@huawei.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: Huang Ying <ying.huang@intel.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Juri Lelli <juri.lelli@redhat.com>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Vincent Guittot <vincent.guittot@linaro.org>
Cc: Zi Yan <ziy@nvidia.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/mm.h | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/mm.h b/include/linux/mm.h
index f8b7a1825720..dee63b820f0c 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1796,6 +1796,11 @@ static inline void vma_set_access_pid_bit(struct vm_area_struct *vma)
 }
 #endif /* CONFIG_NUMA_BALANCING */
 
+static inline int folio_xchg_access_time(struct folio *folio, int time)
+{
+	return xchg_page_access_time(&folio->page, time);
+}
+
 #if defined(CONFIG_KASAN_SW_TAGS) || defined(CONFIG_KASAN_HW_TAGS)
 
 /*
-- 
cgit v1.2.3


From f393084382fa3bbd5840b428d538dbcb33be0186 Mon Sep 17 00:00:00 2001
From: Kefeng Wang <wangkefeng.wang@huawei.com>
Date: Wed, 18 Oct 2023 22:07:58 +0800
Subject: mm: remove xchg_page_access_time()

Since all calls use folio_xchg_access_time(), remove
xchg_page_access_time().

Link: https://lkml.kernel.org/r/20231018140806.2783514-12-wangkefeng.wang@huawei.com
Signed-off-by: Kefeng Wang <wangkefeng.wang@huawei.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: Huang Ying <ying.huang@intel.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Juri Lelli <juri.lelli@redhat.com>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Vincent Guittot <vincent.guittot@linaro.org>
Cc: Zi Yan <ziy@nvidia.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/mm.h | 12 ++++--------
 1 file changed, 4 insertions(+), 8 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mm.h b/include/linux/mm.h
index dee63b820f0c..e003465238c4 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1724,11 +1724,12 @@ static inline void page_cpupid_reset_last(struct page *page)
 }
 #endif /* LAST_CPUPID_NOT_IN_PAGE_FLAGS */
 
-static inline int xchg_page_access_time(struct page *page, int time)
+static inline int folio_xchg_access_time(struct folio *folio, int time)
 {
 	int last_time;
 
-	last_time = page_cpupid_xchg_last(page, time >> PAGE_ACCESS_TIME_BUCKETS);
+	last_time = page_cpupid_xchg_last(&folio->page,
+					  time >> PAGE_ACCESS_TIME_BUCKETS);
 	return last_time << PAGE_ACCESS_TIME_BUCKETS;
 }
 
@@ -1747,7 +1748,7 @@ static inline int page_cpupid_xchg_last(struct page *page, int cpupid)
 	return page_to_nid(page); /* XXX */
 }
 
-static inline int xchg_page_access_time(struct page *page, int time)
+static inline int folio_xchg_access_time(struct folio *folio, int time)
 {
 	return 0;
 }
@@ -1796,11 +1797,6 @@ static inline void vma_set_access_pid_bit(struct vm_area_struct *vma)
 }
 #endif /* CONFIG_NUMA_BALANCING */
 
-static inline int folio_xchg_access_time(struct folio *folio, int time)
-{
-	return xchg_page_access_time(&folio->page, time);
-}
-
 #if defined(CONFIG_KASAN_SW_TAGS) || defined(CONFIG_KASAN_HW_TAGS)
 
 /*
-- 
cgit v1.2.3


From 136d0b47576f8701d68c2d504e7237d9fdc4ebbd Mon Sep 17 00:00:00 2001
From: Kefeng Wang <wangkefeng.wang@huawei.com>
Date: Wed, 18 Oct 2023 22:07:59 +0800
Subject: mm: add folio_xchg_last_cpupid()

Add folio_xchg_last_cpupid() wrapper, which is required to convert
page_cpupid_xchg_last() to folio vertion later in the series.

Link: https://lkml.kernel.org/r/20231018140806.2783514-13-wangkefeng.wang@huawei.com
Signed-off-by: Kefeng Wang <wangkefeng.wang@huawei.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: Huang Ying <ying.huang@intel.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Juri Lelli <juri.lelli@redhat.com>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Vincent Guittot <vincent.guittot@linaro.org>
Cc: Zi Yan <ziy@nvidia.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/mm.h | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/mm.h b/include/linux/mm.h
index e003465238c4..00e0874f7ce2 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1797,6 +1797,11 @@ static inline void vma_set_access_pid_bit(struct vm_area_struct *vma)
 }
 #endif /* CONFIG_NUMA_BALANCING */
 
+static inline int folio_xchg_last_cpupid(struct folio *folio, int cpupid)
+{
+	return page_cpupid_xchg_last(&folio->page, cpupid);
+}
+
 #if defined(CONFIG_KASAN_SW_TAGS) || defined(CONFIG_KASAN_HW_TAGS)
 
 /*
-- 
cgit v1.2.3


From c08b7e3830dbf24dd2552ddeea84f00393842f1b Mon Sep 17 00:00:00 2001
From: Kefeng Wang <wangkefeng.wang@huawei.com>
Date: Wed, 18 Oct 2023 22:08:03 +0800
Subject: mm: make finish_mkwrite_fault() static

Make finish_mkwrite_fault static since it is not used outside of
memory.c.

Link: https://lkml.kernel.org/r/20231018140806.2783514-17-wangkefeng.wang@huawei.com
Signed-off-by: Kefeng Wang <wangkefeng.wang@huawei.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: Huang Ying <ying.huang@intel.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Juri Lelli <juri.lelli@redhat.com>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Vincent Guittot <vincent.guittot@linaro.org>
Cc: Zi Yan <ziy@nvidia.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/mm.h | 1 -
 mm/memory.c        | 2 +-
 2 files changed, 1 insertion(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 00e0874f7ce2..5d6e1d7656c2 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1348,7 +1348,6 @@ void set_pte_range(struct vm_fault *vmf, struct folio *folio,
 		struct page *page, unsigned int nr, unsigned long addr);
 
 vm_fault_t finish_fault(struct vm_fault *vmf);
-vm_fault_t finish_mkwrite_fault(struct vm_fault *vmf);
 #endif
 
 /*
diff --git a/mm/memory.c b/mm/memory.c
index 0915eac772fe..2b6e2b239411 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -3269,7 +3269,7 @@ out:
  * Return: %0 on success, %VM_FAULT_NOPAGE when PTE got changed before
  * we acquired PTE lock.
  */
-vm_fault_t finish_mkwrite_fault(struct vm_fault *vmf)
+static vm_fault_t finish_mkwrite_fault(struct vm_fault *vmf)
 {
 	WARN_ON_ONCE(!(vmf->vma->vm_flags & VM_SHARED));
 	vmf->pte = pte_offset_map_lock(vmf->vma->vm_mm, vmf->pmd, vmf->address,
-- 
cgit v1.2.3


From 8f0f4788b1247c2f92ecacd8f86ce0b379b807b9 Mon Sep 17 00:00:00 2001
From: Kefeng Wang <wangkefeng.wang@huawei.com>
Date: Wed, 18 Oct 2023 22:08:06 +0800
Subject: mm: remove page_cpupid_xchg_last()

Since all calls use folio_xchg_last_cpupid(), remove
page_cpupid_xchg_last().

Link: https://lkml.kernel.org/r/20231018140806.2783514-20-wangkefeng.wang@huawei.com
Signed-off-by: Kefeng Wang <wangkefeng.wang@huawei.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: Huang Ying <ying.huang@intel.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Juri Lelli <juri.lelli@redhat.com>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Vincent Guittot <vincent.guittot@linaro.org>
Cc: Zi Yan <ziy@nvidia.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/mm.h | 19 +++++++------------
 mm/mmzone.c        |  6 +++---
 2 files changed, 10 insertions(+), 15 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 5d6e1d7656c2..0e496a8b8e50 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1696,9 +1696,9 @@ static inline bool __cpupid_match_pid(pid_t task_pid, int cpupid)
 
 #define cpupid_match_pid(task, cpupid) __cpupid_match_pid(task->pid, cpupid)
 #ifdef LAST_CPUPID_NOT_IN_PAGE_FLAGS
-static inline int page_cpupid_xchg_last(struct page *page, int cpupid)
+static inline int folio_xchg_last_cpupid(struct folio *folio, int cpupid)
 {
-	return xchg(&page->_last_cpupid, cpupid & LAST_CPUPID_MASK);
+	return xchg(&folio->_last_cpupid, cpupid & LAST_CPUPID_MASK);
 }
 
 static inline int folio_last_cpupid(struct folio *folio)
@@ -1715,7 +1715,7 @@ static inline int folio_last_cpupid(struct folio *folio)
 	return (folio->flags >> LAST_CPUPID_PGSHIFT) & LAST_CPUPID_MASK;
 }
 
-extern int page_cpupid_xchg_last(struct page *page, int cpupid);
+int folio_xchg_last_cpupid(struct folio *folio, int cpupid);
 
 static inline void page_cpupid_reset_last(struct page *page)
 {
@@ -1727,8 +1727,8 @@ static inline int folio_xchg_access_time(struct folio *folio, int time)
 {
 	int last_time;
 
-	last_time = page_cpupid_xchg_last(&folio->page,
-					  time >> PAGE_ACCESS_TIME_BUCKETS);
+	last_time = folio_xchg_last_cpupid(folio,
+					   time >> PAGE_ACCESS_TIME_BUCKETS);
 	return last_time << PAGE_ACCESS_TIME_BUCKETS;
 }
 
@@ -1742,9 +1742,9 @@ static inline void vma_set_access_pid_bit(struct vm_area_struct *vma)
 	}
 }
 #else /* !CONFIG_NUMA_BALANCING */
-static inline int page_cpupid_xchg_last(struct page *page, int cpupid)
+static inline int folio_xchg_last_cpupid(struct folio *folio, int cpupid)
 {
-	return page_to_nid(page); /* XXX */
+	return folio_nid(folio); /* XXX */
 }
 
 static inline int folio_xchg_access_time(struct folio *folio, int time)
@@ -1796,11 +1796,6 @@ static inline void vma_set_access_pid_bit(struct vm_area_struct *vma)
 }
 #endif /* CONFIG_NUMA_BALANCING */
 
-static inline int folio_xchg_last_cpupid(struct folio *folio, int cpupid)
-{
-	return page_cpupid_xchg_last(&folio->page, cpupid);
-}
-
 #if defined(CONFIG_KASAN_SW_TAGS) || defined(CONFIG_KASAN_HW_TAGS)
 
 /*
diff --git a/mm/mmzone.c b/mm/mmzone.c
index 68e1511be12d..b594d3f268fe 100644
--- a/mm/mmzone.c
+++ b/mm/mmzone.c
@@ -93,19 +93,19 @@ void lruvec_init(struct lruvec *lruvec)
 }
 
 #if defined(CONFIG_NUMA_BALANCING) && !defined(LAST_CPUPID_NOT_IN_PAGE_FLAGS)
-int page_cpupid_xchg_last(struct page *page, int cpupid)
+int folio_xchg_last_cpupid(struct folio *folio, int cpupid)
 {
 	unsigned long old_flags, flags;
 	int last_cpupid;
 
-	old_flags = READ_ONCE(page->flags);
+	old_flags = READ_ONCE(folio->flags);
 	do {
 		flags = old_flags;
 		last_cpupid = (flags >> LAST_CPUPID_PGSHIFT) & LAST_CPUPID_MASK;
 
 		flags &= ~(LAST_CPUPID_MASK << LAST_CPUPID_PGSHIFT);
 		flags |= (cpupid & LAST_CPUPID_MASK) << LAST_CPUPID_PGSHIFT;
-	} while (unlikely(!try_cmpxchg(&page->flags, &old_flags, flags)));
+	} while (unlikely(!try_cmpxchg(&folio->flags, &old_flags, flags)));
 
 	return last_cpupid;
 }
-- 
cgit v1.2.3


From 80203f1ca086835100843f1474bd6dd4a48cc73b Mon Sep 17 00:00:00 2001
From: Liu Shixin <liushixin2@huawei.com>
Date: Wed, 18 Oct 2023 18:29:47 +0800
Subject: bootmem: use kmemleak_free_part_phys in free_bootmem_page

Since kmemleak_alloc_phys() rather than kmemleak_alloc() was called from
memblock_alloc_range_nid(), kmemleak_free_part_phys() should be used to
delete kmemleak object in free_bootmem_page().  In debug mode, there are
following warning:

 kmemleak: Partially freeing unknown object at 0xffff97345aff7000 (size 4096)

Link: https://lkml.kernel.org/r/20231018102952.3339837-3-liushixin2@huawei.com
Fixes: 028725e73375 ("bootmem: remove the vmemmap pages from kmemleak in free_bootmem_page")
Signed-off-by: Liu Shixin <liushixin2@huawei.com>
Acked-by: Catalin Marinas <catalin.marinas@arm.com>
Cc: Kefeng Wang <wangkefeng.wang@huawei.com>
Cc: Patrick Wang <patrick.wang.shcn@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/bootmem_info.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/bootmem_info.h b/include/linux/bootmem_info.h
index e1a3c9c9754c..cffa38a73618 100644
--- a/include/linux/bootmem_info.h
+++ b/include/linux/bootmem_info.h
@@ -60,7 +60,7 @@ static inline void get_page_bootmem(unsigned long info, struct page *page,
 
 static inline void free_bootmem_page(struct page *page)
 {
-	kmemleak_free_part(page_to_virt(page), PAGE_SIZE);
+	kmemleak_free_part_phys(PFN_PHYS(page_to_pfn(page)), PAGE_SIZE);
 	free_reserved_page(page);
 }
 #endif
-- 
cgit v1.2.3


From b455f39d228935f88eebcd1f7c1a6981093c6a3b Mon Sep 17 00:00:00 2001
From: "Vishal Moola (Oracle)" <vishal.moola@gmail.com>
Date: Fri, 20 Oct 2023 11:33:30 -0700
Subject: mm/khugepaged: convert alloc_charge_hpage() to use folios

Also remove count_memcg_page_event now that its last caller no longer uses
it and reword hpage_collapse_alloc_page() to hpage_collapse_alloc_folio().

This removes 1 call to compound_head() and helps convert khugepaged to
use folios throughout.

Link: https://lkml.kernel.org/r/20231020183331.10770-5-vishal.moola@gmail.com
Signed-off-by: Vishal Moola (Oracle) <vishal.moola@gmail.com>
Reviewed-by: Rik van Riel <riel@surriel.com>
Reviewed-by: Yang Shi <shy828301@gmail.com>
Cc: Kefeng Wang <wangkefeng.wang@huawei.com>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/memcontrol.h | 14 --------------
 mm/khugepaged.c            | 17 ++++++++++-------
 2 files changed, 10 insertions(+), 21 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index b3d2e3e60eed..7bdcf3020d7a 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -1087,15 +1087,6 @@ static inline void count_memcg_events(struct mem_cgroup *memcg,
 	local_irq_restore(flags);
 }
 
-static inline void count_memcg_page_event(struct page *page,
-					  enum vm_event_item idx)
-{
-	struct mem_cgroup *memcg = page_memcg(page);
-
-	if (memcg)
-		count_memcg_events(memcg, idx, 1);
-}
-
 static inline void count_memcg_folio_events(struct folio *folio,
 		enum vm_event_item idx, unsigned long nr)
 {
@@ -1598,11 +1589,6 @@ static inline void __count_memcg_events(struct mem_cgroup *memcg,
 {
 }
 
-static inline void count_memcg_page_event(struct page *page,
-					  int idx)
-{
-}
-
 static inline void count_memcg_folio_events(struct folio *folio,
 		enum vm_event_item idx, unsigned long nr)
 {
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index 9efd8ff68f06..6a7184cd291b 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -888,16 +888,16 @@ static int hpage_collapse_find_target_node(struct collapse_control *cc)
 }
 #endif
 
-static bool hpage_collapse_alloc_page(struct page **hpage, gfp_t gfp, int node,
+static bool hpage_collapse_alloc_folio(struct folio **folio, gfp_t gfp, int node,
 				      nodemask_t *nmask)
 {
-	*hpage = __alloc_pages(gfp, HPAGE_PMD_ORDER, node, nmask);
-	if (unlikely(!*hpage)) {
+	*folio = __folio_alloc(gfp, HPAGE_PMD_ORDER, node, nmask);
+
+	if (unlikely(!*folio)) {
 		count_vm_event(THP_COLLAPSE_ALLOC_FAILED);
 		return false;
 	}
 
-	folio_prep_large_rmappable((struct folio *)*hpage);
 	count_vm_event(THP_COLLAPSE_ALLOC);
 	return true;
 }
@@ -1064,17 +1064,20 @@ static int alloc_charge_hpage(struct page **hpage, struct mm_struct *mm,
 	int node = hpage_collapse_find_target_node(cc);
 	struct folio *folio;
 
-	if (!hpage_collapse_alloc_page(hpage, gfp, node, &cc->alloc_nmask))
+	if (!hpage_collapse_alloc_folio(&folio, gfp, node, &cc->alloc_nmask)) {
+		*hpage = NULL;
 		return SCAN_ALLOC_HUGE_PAGE_FAIL;
+	}
 
-	folio = page_folio(*hpage);
 	if (unlikely(mem_cgroup_charge(folio, mm, gfp))) {
 		folio_put(folio);
 		*hpage = NULL;
 		return SCAN_CGROUP_CHARGE_FAIL;
 	}
-	count_memcg_page_event(*hpage, THP_COLLAPSE_ALLOC);
 
+	count_memcg_folio_events(folio, THP_COLLAPSE_ALLOC, 1);
+
+	*hpage = folio_page(folio, 0);
 	return SCAN_SUCCEED;
 }
 
-- 
cgit v1.2.3


From 35f5d94187a6a3a8df2cba54beccca1c2379edb8 Mon Sep 17 00:00:00 2001
From: SeongJae Park <sj@kernel.org>
Date: Thu, 19 Oct 2023 19:49:20 +0000
Subject: mm/damon: implement a function for max nr_accesses safe calculation

Patch series "avoid divide-by-zero due to max_nr_accesses overflow".

The maximum nr_accesses of given DAMON context can be calculated by
dividing the aggregation interval by the sampling interval.  Some logics
in DAMON uses the maximum nr_accesses as a divisor.  Hence, the value
shouldn't be zero.  Such case is avoided since DAMON avoids setting the
agregation interval as samller than the sampling interval.  However, since
nr_accesses is unsigned int while the intervals are unsigned long, the
maximum nr_accesses could be zero while casting.

Avoid the divide-by-zero by implementing a function that handles the
corner case (first patch), and replaces the vulnerable direct max
nr_accesses calculations (remaining patches).

Note that the patches for the replacements are divided for broken commits,
to make backporting on required tres easier.  Especially, the last patch
is for a patch that not yet merged into the mainline but in mm tree.


This patch (of 4):

The maximum nr_accesses of given DAMON context can be calculated by
dividing the aggregation interval by the sampling interval.  Some logics
in DAMON uses the maximum nr_accesses as a divisor.  Hence, the value
shouldn't be zero.  Such case is avoided since DAMON avoids setting the
agregation interval as samller than the sampling interval.  However, since
nr_accesses is unsigned int while the intervals are unsigned long, the
maximum nr_accesses could be zero while casting.  Implement a function
that handles the corner case.

Note that this commit is not fixing the real issue since this is only
introducing the safe function that will replaces the problematic
divisions.  The replacements will be made by followup commits, to make
backporting on stable series easier.

Link: https://lkml.kernel.org/r/20231019194924.100347-1-sj@kernel.org
Link: https://lkml.kernel.org/r/20231019194924.100347-2-sj@kernel.org
Fixes: 198f0f4c58b9 ("mm/damon/vaddr,paddr: support pageout prioritization")
Signed-off-by: SeongJae Park <sj@kernel.org>
Reported-by: Jakub Acs <acsjakub@amazon.de>
Cc: <stable@vger.kernel.org>	[5.16+]
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/damon.h | 7 +++++++
 1 file changed, 7 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/damon.h b/include/linux/damon.h
index 27b995c22497..ab2f17d9926b 100644
--- a/include/linux/damon.h
+++ b/include/linux/damon.h
@@ -681,6 +681,13 @@ static inline bool damon_target_has_pid(const struct damon_ctx *ctx)
 	return ctx->ops.id == DAMON_OPS_VADDR || ctx->ops.id == DAMON_OPS_FVADDR;
 }
 
+static inline unsigned int damon_max_nr_accesses(const struct damon_attrs *attrs)
+{
+	/* {aggr,sample}_interval are unsigned long, hence could overflow */
+	return min(attrs->aggr_interval / attrs->sample_interval,
+			(unsigned long)UINT_MAX);
+}
+
 
 int damon_start(struct damon_ctx **ctxs, int nr_ctxs, bool exclusive);
 int damon_stop(struct damon_ctx **ctxs, int nr_ctxs);
-- 
cgit v1.2.3


From 10969b5571387047cd461a9c701b8b9cd007f6c7 Mon Sep 17 00:00:00 2001
From: Hugh Dickins <hughd@google.com>
Date: Tue, 3 Oct 2023 02:15:09 -0700
Subject: hugetlbfs: drop shared NUMA mempolicy pretence

Patch series "mempolicy: cleanups leading to NUMA mpol without vma", v2.

Mostly cleanups in mm/mempolicy.c, but finally removing the pseudo-vma
from shmem folio allocation, and removing the mmap_lock around folio
migration for mbind and migrate_pages syscalls.


This patch (of 12):

hugetlbfs_fallocate() goes through the motions of pasting a shared NUMA
mempolicy onto its pseudo-vma, but how could there ever be a shared NUMA
mempolicy for this file?  hugetlb_vm_ops has never offered a set_policy
method, and hugetlbfs_parse_param() has never supported any mpol options
for a mount-wide default policy.

It's just an illusion: clean it away so as not to confuse others, giving
us more freedom to adjust shmem's set_policy/get_policy implementation.
But hugetlbfs_inode_info is still required, just to accommodate seals.

Yes, shared NUMA mempolicy support could be added to hugetlbfs, with a
set_policy method and/or mpol mount option (Andi's first posting did
include an admitted-unsatisfactory hugetlb_set_policy()); but it seems
that nobody has bothered to add that in the nineteen years since v2.6.7
made it possible, and there is at least one company that has invested
enough into hugetlbfs, that I guess they have learnt well enough how to
manage its NUMA, without needing shared mempolicy.

Remove linux/mempolicy.h from linux/hugetlb.h: include linux/pagemap.h in
its place, because hugetlb.h's recently added use of filemap_lock_folio()
requires that (although most .configs and .c's get it in some other way).

Link: https://lkml.kernel.org/r/ebc0987e-beff-8bfb-9283-234c2cbd17c5@google.com
Link: https://lkml.kernel.org/r/cae82d4b-904a-faaf-282a-34fcc188c81f@google.com
Signed-off-by: Hugh Dickins <hughd@google.com>
Reviewed-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Andi Kleen <ak@linux.intel.com>
Cc: Christoph Lameter <cl@linux.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: "Huang, Ying" <ying.huang@intel.com>
Cc: Kefeng Wang <wangkefeng.wang@huawei.com>
Cc: Mel Gorman <mgorman@techsingularity.net>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Mike Kravetz <mike.kravetz@oracle.com>
Cc: Sidhartha Kumar <sidhartha.kumar@oracle.com>
Cc: Suren Baghdasaryan <surenb@google.com>
Cc: Tejun heo <tj@kernel.org>
Cc: Vishal Moola (Oracle) <vishal.moola@gmail.com>
Cc: Yang Shi <shy828301@gmail.com>
Cc: Nhat Pham <nphamcs@gmail.com>
Cc: Yosry Ahmed <yosryahmed@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 fs/hugetlbfs/inode.c    | 41 +----------------------------------------
 include/linux/hugetlb.h |  3 +--
 2 files changed, 2 insertions(+), 42 deletions(-)

(limited to 'include/linux')

diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index 85cd418d2340..3ad5fc3cb8db 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -83,29 +83,6 @@ static const struct fs_parameter_spec hugetlb_fs_parameters[] = {
 	{}
 };
 
-#ifdef CONFIG_NUMA
-static inline void hugetlb_set_vma_policy(struct vm_area_struct *vma,
-					struct inode *inode, pgoff_t index)
-{
-	vma->vm_policy = mpol_shared_policy_lookup(&HUGETLBFS_I(inode)->policy,
-							index);
-}
-
-static inline void hugetlb_drop_vma_policy(struct vm_area_struct *vma)
-{
-	mpol_cond_put(vma->vm_policy);
-}
-#else
-static inline void hugetlb_set_vma_policy(struct vm_area_struct *vma,
-					struct inode *inode, pgoff_t index)
-{
-}
-
-static inline void hugetlb_drop_vma_policy(struct vm_area_struct *vma)
-{
-}
-#endif
-
 /*
  * Mask used when checking the page offset value passed in via system
  * calls.  This value will be converted to a loff_t which is signed.
@@ -853,8 +830,7 @@ static long hugetlbfs_fallocate(struct file *file, int mode, loff_t offset,
 
 	/*
 	 * Initialize a pseudo vma as this is required by the huge page
-	 * allocation routines.  If NUMA is configured, use page index
-	 * as input to create an allocation policy.
+	 * allocation routines.
 	 */
 	vma_init(&pseudo_vma, mm);
 	vm_flags_init(&pseudo_vma, VM_HUGETLB | VM_MAYSHARE | VM_SHARED);
@@ -902,9 +878,7 @@ static long hugetlbfs_fallocate(struct file *file, int mode, loff_t offset,
 		 * folios in these areas, we need to consume the reserves
 		 * to keep reservation accounting consistent.
 		 */
-		hugetlb_set_vma_policy(&pseudo_vma, inode, index);
 		folio = alloc_hugetlb_folio(&pseudo_vma, addr, 0);
-		hugetlb_drop_vma_policy(&pseudo_vma);
 		if (IS_ERR(folio)) {
 			mutex_unlock(&hugetlb_fault_mutex_table[hash]);
 			error = PTR_ERR(folio);
@@ -1283,18 +1257,6 @@ static struct inode *hugetlbfs_alloc_inode(struct super_block *sb)
 		hugetlbfs_inc_free_inodes(sbinfo);
 		return NULL;
 	}
-
-	/*
-	 * Any time after allocation, hugetlbfs_destroy_inode can be called
-	 * for the inode.  mpol_free_shared_policy is unconditionally called
-	 * as part of hugetlbfs_destroy_inode.  So, initialize policy here
-	 * in case of a quick call to destroy.
-	 *
-	 * Note that the policy is initialized even if we are creating a
-	 * private inode.  This simplifies hugetlbfs_destroy_inode.
-	 */
-	mpol_shared_policy_init(&p->policy, NULL);
-
 	return &p->vfs_inode;
 }
 
@@ -1306,7 +1268,6 @@ static void hugetlbfs_free_inode(struct inode *inode)
 static void hugetlbfs_destroy_inode(struct inode *inode)
 {
 	hugetlbfs_inc_free_inodes(HUGETLBFS_SB(inode->i_sb));
-	mpol_free_shared_policy(&HUGETLBFS_I(inode)->policy);
 }
 
 static const struct address_space_operations hugetlbfs_aops = {
diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index 205469aa0613..158ff156710b 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -30,7 +30,7 @@ void free_huge_folio(struct folio *folio);
 
 #ifdef CONFIG_HUGETLB_PAGE
 
-#include <linux/mempolicy.h>
+#include <linux/pagemap.h>
 #include <linux/shm.h>
 #include <asm/tlbflush.h>
 
@@ -545,7 +545,6 @@ static inline struct hugetlbfs_sb_info *HUGETLBFS_SB(struct super_block *sb)
 }
 
 struct hugetlbfs_inode_info {
-	struct shared_policy policy;
 	struct inode vfs_inode;
 	unsigned int seals;
 };
-- 
cgit v1.2.3


From c36f6e6dff4d32ec8b6da8f553933727a57a7a4a Mon Sep 17 00:00:00 2001
From: Hugh Dickins <hughd@google.com>
Date: Tue, 3 Oct 2023 02:20:14 -0700
Subject: mempolicy trivia: slightly more consistent naming

Before getting down to work, do a little cleanup, mainly of inconsistent
variable naming.  I gave up trying to rationalize mpol versus pol versus
policy, and node versus nid, but let's avoid p and nd.  Remove a few
superfluous blank lines, but add one; and here prefer vma->vm_policy to
vma_policy(vma) - the latter being appropriate in other sources, which
have to allow for !CONFIG_NUMA.  That intriguing line about KERNEL_DS?
should have gone in v2.6.15, when numa_policy_init() stopped using
set_mempolicy(2)'s system call handler.

Link: https://lkml.kernel.org/r/68287974-b6ae-7df-4ba-d19ddd69cbf@google.com
Signed-off-by: Hugh Dickins <hughd@google.com>
Reviewed-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Andi Kleen <ak@linux.intel.com>
Cc: Christoph Lameter <cl@linux.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: "Huang, Ying" <ying.huang@intel.com>
Cc: Kefeng Wang <wangkefeng.wang@huawei.com>
Cc: Mel Gorman <mgorman@techsingularity.net>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Mike Kravetz <mike.kravetz@oracle.com>
Cc: Nhat Pham <nphamcs@gmail.com>
Cc: Sidhartha Kumar <sidhartha.kumar@oracle.com>
Cc: Suren Baghdasaryan <surenb@google.com>
Cc: Tejun heo <tj@kernel.org>
Cc: Vishal Moola (Oracle) <vishal.moola@gmail.com>
Cc: Yang Shi <shy828301@gmail.com>
Cc: Yosry Ahmed <yosryahmed@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/mempolicy.h | 11 ++++---
 mm/mempolicy.c            | 73 +++++++++++++++++++++--------------------------
 2 files changed, 38 insertions(+), 46 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mempolicy.h b/include/linux/mempolicy.h
index 1d7f4ec2614c..a807976fe95d 100644
--- a/include/linux/mempolicy.h
+++ b/include/linux/mempolicy.h
@@ -124,10 +124,9 @@ struct shared_policy {
 
 int vma_dup_policy(struct vm_area_struct *src, struct vm_area_struct *dst);
 void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol);
-int mpol_set_shared_policy(struct shared_policy *info,
-				struct vm_area_struct *vma,
-				struct mempolicy *new);
-void mpol_free_shared_policy(struct shared_policy *p);
+int mpol_set_shared_policy(struct shared_policy *sp,
+			   struct vm_area_struct *vma, struct mempolicy *mpol);
+void mpol_free_shared_policy(struct shared_policy *sp);
 struct mempolicy *mpol_shared_policy_lookup(struct shared_policy *sp,
 					    unsigned long idx);
 
@@ -191,7 +190,7 @@ static inline bool mpol_equal(struct mempolicy *a, struct mempolicy *b)
 	return true;
 }
 
-static inline void mpol_put(struct mempolicy *p)
+static inline void mpol_put(struct mempolicy *pol)
 {
 }
 
@@ -210,7 +209,7 @@ static inline void mpol_shared_policy_init(struct shared_policy *sp,
 {
 }
 
-static inline void mpol_free_shared_policy(struct shared_policy *p)
+static inline void mpol_free_shared_policy(struct shared_policy *sp)
 {
 }
 
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 02a9b89c10aa..50a57f4c8013 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -25,7 +25,7 @@
  *                to the last. It would be better if bind would truly restrict
  *                the allocation to memory nodes instead
  *
- * preferred       Try a specific node first before normal fallback.
+ * preferred      Try a specific node first before normal fallback.
  *                As a special case NUMA_NO_NODE here means do the allocation
  *                on the local CPU. This is normally identical to default,
  *                but useful to set in a VMA when you have a non default
@@ -52,7 +52,7 @@
  * on systems with highmem kernel lowmem allocation don't get policied.
  * Same with GFP_DMA allocations.
  *
- * For shmfs/tmpfs/hugetlbfs shared memory the policy is shared between
+ * For shmem/tmpfs shared memory the policy is shared between
  * all users and remembered even when nobody has memory mapped.
  */
 
@@ -291,6 +291,7 @@ static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags,
 			return ERR_PTR(-EINVAL);
 	} else if (nodes_empty(*nodes))
 		return ERR_PTR(-EINVAL);
+
 	policy = kmem_cache_alloc(policy_cache, GFP_KERNEL);
 	if (!policy)
 		return ERR_PTR(-ENOMEM);
@@ -303,11 +304,11 @@ static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags,
 }
 
 /* Slow path of a mpol destructor. */
-void __mpol_put(struct mempolicy *p)
+void __mpol_put(struct mempolicy *pol)
 {
-	if (!atomic_dec_and_test(&p->refcnt))
+	if (!atomic_dec_and_test(&pol->refcnt))
 		return;
-	kmem_cache_free(policy_cache, p);
+	kmem_cache_free(policy_cache, pol);
 }
 
 static void mpol_rebind_default(struct mempolicy *pol, const nodemask_t *nodes)
@@ -364,7 +365,6 @@ static void mpol_rebind_policy(struct mempolicy *pol, const nodemask_t *newmask)
  *
  * Called with task's alloc_lock held.
  */
-
 void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new)
 {
 	mpol_rebind_policy(tsk->mempolicy, new);
@@ -375,7 +375,6 @@ void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new)
  *
  * Call holding a reference to mm.  Takes mm->mmap_lock during call.
  */
-
 void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new)
 {
 	struct vm_area_struct *vma;
@@ -757,7 +756,7 @@ queue_pages_range(struct mm_struct *mm, unsigned long start, unsigned long end,
  * This must be called with the mmap_lock held for writing.
  */
 static int vma_replace_policy(struct vm_area_struct *vma,
-						struct mempolicy *pol)
+				struct mempolicy *pol)
 {
 	int err;
 	struct mempolicy *old;
@@ -800,7 +799,7 @@ static int mbind_range(struct vma_iterator *vmi, struct vm_area_struct *vma,
 		vmstart = vma->vm_start;
 	}
 
-	if (mpol_equal(vma_policy(vma), new_pol)) {
+	if (mpol_equal(vma->vm_policy, new_pol)) {
 		*prev = vma;
 		return 0;
 	}
@@ -855,18 +854,18 @@ out:
  *
  * Called with task's alloc_lock held
  */
-static void get_policy_nodemask(struct mempolicy *p, nodemask_t *nodes)
+static void get_policy_nodemask(struct mempolicy *pol, nodemask_t *nodes)
 {
 	nodes_clear(*nodes);
-	if (p == &default_policy)
+	if (pol == &default_policy)
 		return;
 
-	switch (p->mode) {
+	switch (pol->mode) {
 	case MPOL_BIND:
 	case MPOL_INTERLEAVE:
 	case MPOL_PREFERRED:
 	case MPOL_PREFERRED_MANY:
-		*nodes = p->nodes;
+		*nodes = pol->nodes;
 		break;
 	case MPOL_LOCAL:
 		/* return empty node mask for local allocation */
@@ -1634,7 +1633,6 @@ out:
 out_put:
 	put_task_struct(task);
 	goto out;
-
 }
 
 SYSCALL_DEFINE4(migrate_pages, pid_t, pid, unsigned long, maxnode,
@@ -1644,7 +1642,6 @@ SYSCALL_DEFINE4(migrate_pages, pid_t, pid, unsigned long, maxnode,
 	return kernel_migrate_pages(pid, maxnode, old_nodes, new_nodes);
 }
 
-
 /* Retrieve NUMA policy */
 static int kernel_get_mempolicy(int __user *policy,
 				unsigned long __user *nmask,
@@ -1827,10 +1824,10 @@ nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *policy)
  * policy_node() is always coupled with policy_nodemask(), which
  * secures the nodemask limit for 'bind' and 'prefer-many' policy.
  */
-static int policy_node(gfp_t gfp, struct mempolicy *policy, int nd)
+static int policy_node(gfp_t gfp, struct mempolicy *policy, int nid)
 {
 	if (policy->mode == MPOL_PREFERRED) {
-		nd = first_node(policy->nodes);
+		nid = first_node(policy->nodes);
 	} else {
 		/*
 		 * __GFP_THISNODE shouldn't even be used with the bind policy
@@ -1845,19 +1842,18 @@ static int policy_node(gfp_t gfp, struct mempolicy *policy, int nd)
 	    policy->home_node != NUMA_NO_NODE)
 		return policy->home_node;
 
-	return nd;
+	return nid;
 }
 
 /* Do dynamic interleaving for a process */
-static unsigned interleave_nodes(struct mempolicy *policy)
+static unsigned int interleave_nodes(struct mempolicy *policy)
 {
-	unsigned next;
-	struct task_struct *me = current;
+	unsigned int nid;
 
-	next = next_node_in(me->il_prev, policy->nodes);
-	if (next < MAX_NUMNODES)
-		me->il_prev = next;
-	return next;
+	nid = next_node_in(current->il_prev, policy->nodes);
+	if (nid < MAX_NUMNODES)
+		current->il_prev = nid;
+	return nid;
 }
 
 /*
@@ -2347,7 +2343,7 @@ unsigned long alloc_pages_bulk_array_mempolicy(gfp_t gfp,
 
 int vma_dup_policy(struct vm_area_struct *src, struct vm_area_struct *dst)
 {
-	struct mempolicy *pol = mpol_dup(vma_policy(src));
+	struct mempolicy *pol = mpol_dup(src->vm_policy);
 
 	if (IS_ERR(pol))
 		return PTR_ERR(pol);
@@ -2771,40 +2767,40 @@ put_mpol:
 	}
 }
 
-int mpol_set_shared_policy(struct shared_policy *info,
-			struct vm_area_struct *vma, struct mempolicy *npol)
+int mpol_set_shared_policy(struct shared_policy *sp,
+			struct vm_area_struct *vma, struct mempolicy *pol)
 {
 	int err;
 	struct sp_node *new = NULL;
 	unsigned long sz = vma_pages(vma);
 
-	if (npol) {
-		new = sp_alloc(vma->vm_pgoff, vma->vm_pgoff + sz, npol);
+	if (pol) {
+		new = sp_alloc(vma->vm_pgoff, vma->vm_pgoff + sz, pol);
 		if (!new)
 			return -ENOMEM;
 	}
-	err = shared_policy_replace(info, vma->vm_pgoff, vma->vm_pgoff+sz, new);
+	err = shared_policy_replace(sp, vma->vm_pgoff, vma->vm_pgoff + sz, new);
 	if (err && new)
 		sp_free(new);
 	return err;
 }
 
 /* Free a backing policy store on inode delete. */
-void mpol_free_shared_policy(struct shared_policy *p)
+void mpol_free_shared_policy(struct shared_policy *sp)
 {
 	struct sp_node *n;
 	struct rb_node *next;
 
-	if (!p->root.rb_node)
+	if (!sp->root.rb_node)
 		return;
-	write_lock(&p->lock);
-	next = rb_first(&p->root);
+	write_lock(&sp->lock);
+	next = rb_first(&sp->root);
 	while (next) {
 		n = rb_entry(next, struct sp_node, nd);
 		next = rb_next(&n->nd);
-		sp_delete(p, n);
+		sp_delete(sp, n);
 	}
-	write_unlock(&p->lock);
+	write_unlock(&sp->lock);
 }
 
 #ifdef CONFIG_NUMA_BALANCING
@@ -2854,7 +2850,6 @@ static inline void __init check_numabalancing_enable(void)
 }
 #endif /* CONFIG_NUMA_BALANCING */
 
-/* assumes fs == KERNEL_DS */
 void __init numa_policy_init(void)
 {
 	nodemask_t interleave_nodes;
@@ -2917,7 +2912,6 @@ void numa_default_policy(void)
 /*
  * Parse and format mempolicy from/to strings
  */
-
 static const char * const policy_modes[] =
 {
 	[MPOL_DEFAULT]    = "default",
@@ -2928,7 +2922,6 @@ static const char * const policy_modes[] =
 	[MPOL_PREFERRED_MANY]  = "prefer (many)",
 };
 
-
 #ifdef CONFIG_TMPFS
 /**
  * mpol_parse_str - parse string to mempolicy, for tmpfs mpol mount option.
-- 
cgit v1.2.3


From 93397c3b7684555b7cec726cd13eef6742d191fe Mon Sep 17 00:00:00 2001
From: Hugh Dickins <hughd@google.com>
Date: Tue, 3 Oct 2023 02:21:34 -0700
Subject: mempolicy trivia: use pgoff_t in shared mempolicy tree

Prefer the more explicit "pgoff_t" to "unsigned long" when dealing with a
shared mempolicy tree.  Delete confusing comment about pseudo mm vmas.

Link: https://lkml.kernel.org/r/5451157-3818-4af5-fd2c-5d26a5d1dc53@google.com
Signed-off-by: Hugh Dickins <hughd@google.com>
Cc: Andi Kleen <ak@linux.intel.com>
Cc: Christoph Lameter <cl@linux.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: "Huang, Ying" <ying.huang@intel.com>
Cc: Kefeng Wang <wangkefeng.wang@huawei.com>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Mel Gorman <mgorman@techsingularity.net>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Mike Kravetz <mike.kravetz@oracle.com>
Cc: Nhat Pham <nphamcs@gmail.com>
Cc: Sidhartha Kumar <sidhartha.kumar@oracle.com>
Cc: Suren Baghdasaryan <surenb@google.com>
Cc: Tejun heo <tj@kernel.org>
Cc: Vishal Moola (Oracle) <vishal.moola@gmail.com>
Cc: Yang Shi <shy828301@gmail.com>
Cc: Yosry Ahmed <yosryahmed@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/mempolicy.h | 20 +++++++-------------
 mm/mempolicy.c            | 12 ++++++------
 2 files changed, 13 insertions(+), 19 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mempolicy.h b/include/linux/mempolicy.h
index a807976fe95d..acdb12fcb6cd 100644
--- a/include/linux/mempolicy.h
+++ b/include/linux/mempolicy.h
@@ -105,22 +105,16 @@ static inline bool mpol_equal(struct mempolicy *a, struct mempolicy *b)
 
 /*
  * Tree of shared policies for a shared memory region.
- * Maintain the policies in a pseudo mm that contains vmas. The vmas
- * carry the policy. As a special twist the pseudo mm is indexed in pages, not
- * bytes, so that we can work with shared memory segments bigger than
- * unsigned long.
  */
-
-struct sp_node {
-	struct rb_node nd;
-	unsigned long start, end;
-	struct mempolicy *policy;
-};
-
 struct shared_policy {
 	struct rb_root root;
 	rwlock_t lock;
 };
+struct sp_node {
+	struct rb_node nd;
+	pgoff_t start, end;
+	struct mempolicy *policy;
+};
 
 int vma_dup_policy(struct vm_area_struct *src, struct vm_area_struct *dst);
 void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol);
@@ -128,7 +122,7 @@ int mpol_set_shared_policy(struct shared_policy *sp,
 			   struct vm_area_struct *vma, struct mempolicy *mpol);
 void mpol_free_shared_policy(struct shared_policy *sp);
 struct mempolicy *mpol_shared_policy_lookup(struct shared_policy *sp,
-					    unsigned long idx);
+					    pgoff_t idx);
 
 struct mempolicy *get_task_policy(struct task_struct *p);
 struct mempolicy *__get_vma_policy(struct vm_area_struct *vma,
@@ -214,7 +208,7 @@ static inline void mpol_free_shared_policy(struct shared_policy *sp)
 }
 
 static inline struct mempolicy *
-mpol_shared_policy_lookup(struct shared_policy *sp, unsigned long idx)
+mpol_shared_policy_lookup(struct shared_policy *sp, pgoff_t idx)
 {
 	return NULL;
 }
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 50a57f4c8013..c18f0a1b97e3 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -2428,8 +2428,8 @@ bool __mpol_equal(struct mempolicy *a, struct mempolicy *b)
  * lookup first element intersecting start-end.  Caller holds sp->lock for
  * reading or for writing
  */
-static struct sp_node *
-sp_lookup(struct shared_policy *sp, unsigned long start, unsigned long end)
+static struct sp_node *sp_lookup(struct shared_policy *sp,
+					pgoff_t start, pgoff_t end)
 {
 	struct rb_node *n = sp->root.rb_node;
 
@@ -2483,8 +2483,8 @@ static void sp_insert(struct shared_policy *sp, struct sp_node *new)
 }
 
 /* Find shared policy intersecting idx */
-struct mempolicy *
-mpol_shared_policy_lookup(struct shared_policy *sp, unsigned long idx)
+struct mempolicy *mpol_shared_policy_lookup(struct shared_policy *sp,
+						pgoff_t idx)
 {
 	struct mempolicy *pol = NULL;
 	struct sp_node *sn;
@@ -2652,8 +2652,8 @@ static struct sp_node *sp_alloc(unsigned long start, unsigned long end,
 }
 
 /* Replace a policy range. */
-static int shared_policy_replace(struct shared_policy *sp, unsigned long start,
-				 unsigned long end, struct sp_node *new)
+static int shared_policy_replace(struct shared_policy *sp, pgoff_t start,
+				 pgoff_t end, struct sp_node *new)
 {
 	struct sp_node *n;
 	struct sp_node *n_new = NULL;
-- 
cgit v1.2.3


From ddc1a5cbc05dc62743a2f409b96faa5cf95ba064 Mon Sep 17 00:00:00 2001
From: Hugh Dickins <hughd@google.com>
Date: Thu, 19 Oct 2023 13:39:08 -0700
Subject: mempolicy: alloc_pages_mpol() for NUMA policy without vma

Shrink shmem's stack usage by eliminating the pseudo-vma from its folio
allocation.  alloc_pages_mpol(gfp, order, pol, ilx, nid) becomes the
principal actor for passing mempolicy choice down to __alloc_pages(),
rather than vma_alloc_folio(gfp, order, vma, addr, hugepage).

vma_alloc_folio() and alloc_pages() remain, but as wrappers around
alloc_pages_mpol().  alloc_pages_bulk_*() untouched, except to provide the
additional args to policy_nodemask(), which subsumes policy_node().
Cleanup throughout, cutting out some unhelpful "helpers".

It would all be much simpler without MPOL_INTERLEAVE, but that adds a
dynamic to the constant mpol: complicated by v3.6 commit 09c231cb8bfd
("tmpfs: distribute interleave better across nodes"), which added ino bias
to the interleave, hidden from mm/mempolicy.c until this commit.

Hence "ilx" throughout, the "interleave index".  Originally I thought it
could be done just with nid, but that's wrong: the nodemask may come from
the shared policy layer below a shmem vma, or it may come from the task
layer above a shmem vma; and without the final nodemask then nodeid cannot
be decided.  And how ilx is applied depends also on page order.

The interleave index is almost always irrelevant unless MPOL_INTERLEAVE:
with one exception in alloc_pages_mpol(), where the NO_INTERLEAVE_INDEX
passed down from vma-less alloc_pages() is also used as hint not to use
THP-style hugepage allocation - to avoid the overhead of a hugepage arg
(though I don't understand why we never just added a GFP bit for THP - if
it actually needs a different allocation strategy from other pages of the
same order).  vma_alloc_folio() still carries its hugepage arg here, but
it is not used, and should be removed when agreed.

get_vma_policy() no longer allows a NULL vma: over time I believe we've
eradicated all the places which used to need it e.g.  swapoff and madvise
used to pass NULL vma to read_swap_cache_async(), but now know the vma.

[hughd@google.com: handle NULL mpol being passed to __read_swap_cache_async()]
  Link: https://lkml.kernel.org/r/ea419956-4751-0102-21f7-9c93cb957892@google.com
Link: https://lkml.kernel.org/r/74e34633-6060-f5e3-aee-7040d43f2e93@google.com
Link: https://lkml.kernel.org/r/1738368e-bac0-fd11-ed7f-b87142a939fe@google.com
Signed-off-by: Hugh Dickins <hughd@google.com>
Cc: Andi Kleen <ak@linux.intel.com>
Cc: Christoph Lameter <cl@linux.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: Huang Ying <ying.huang@intel.com>
Cc: Kefeng Wang <wangkefeng.wang@huawei.com>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Mel Gorman <mgorman@techsingularity.net>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Mike Kravetz <mike.kravetz@oracle.com>
Cc: Nhat Pham <nphamcs@gmail.com>
Cc: Sidhartha Kumar <sidhartha.kumar@oracle.com>
Cc: Suren Baghdasaryan <surenb@google.com>
Cc: Tejun heo <tj@kernel.org>
Cc: Vishal Moola (Oracle) <vishal.moola@gmail.com>
Cc: Yang Shi <shy828301@gmail.com>
Cc: Yosry Ahmed <yosryahmed@google.com>
Cc: Domenico Cerasuolo <mimmocerasuolo@gmail.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 fs/proc/task_mmu.c        |   5 +-
 include/linux/gfp.h       |  10 +-
 include/linux/mempolicy.h |  20 ++-
 include/linux/mm.h        |   2 +-
 ipc/shm.c                 |  21 +--
 mm/mempolicy.c            | 379 +++++++++++++++++++---------------------------
 mm/shmem.c                |  92 ++++++-----
 mm/swap.h                 |   9 +-
 mm/swap_state.c           |  86 +++++++----
 mm/zswap.c                |   7 +-
 10 files changed, 308 insertions(+), 323 deletions(-)

(limited to 'include/linux')

diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 1d994505805b..66ae1c265da3 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -2673,8 +2673,9 @@ static int show_numa_map(struct seq_file *m, void *v)
 	struct numa_maps *md = &numa_priv->md;
 	struct file *file = vma->vm_file;
 	struct mm_struct *mm = vma->vm_mm;
-	struct mempolicy *pol;
 	char buffer[64];
+	struct mempolicy *pol;
+	pgoff_t ilx;
 	int nid;
 
 	if (!mm)
@@ -2683,7 +2684,7 @@ static int show_numa_map(struct seq_file *m, void *v)
 	/* Ensure we start with an empty set of numa_maps statistics. */
 	memset(md, 0, sizeof(*md));
 
-	pol = __get_vma_policy(vma, vma->vm_start);
+	pol = __get_vma_policy(vma, vma->vm_start, &ilx);
 	if (pol) {
 		mpol_to_str(buffer, sizeof(buffer), pol);
 		mpol_cond_put(pol);
diff --git a/include/linux/gfp.h b/include/linux/gfp.h
index 5b917e5b9350..de292a007138 100644
--- a/include/linux/gfp.h
+++ b/include/linux/gfp.h
@@ -8,6 +8,7 @@
 #include <linux/topology.h>
 
 struct vm_area_struct;
+struct mempolicy;
 
 /* Convert GFP flags to their corresponding migrate type */
 #define GFP_MOVABLE_MASK (__GFP_RECLAIMABLE|__GFP_MOVABLE)
@@ -262,7 +263,9 @@ static inline struct page *alloc_pages_node(int nid, gfp_t gfp_mask,
 
 #ifdef CONFIG_NUMA
 struct page *alloc_pages(gfp_t gfp, unsigned int order);
-struct folio *folio_alloc(gfp_t gfp, unsigned order);
+struct page *alloc_pages_mpol(gfp_t gfp, unsigned int order,
+		struct mempolicy *mpol, pgoff_t ilx, int nid);
+struct folio *folio_alloc(gfp_t gfp, unsigned int order);
 struct folio *vma_alloc_folio(gfp_t gfp, int order, struct vm_area_struct *vma,
 		unsigned long addr, bool hugepage);
 #else
@@ -270,6 +273,11 @@ static inline struct page *alloc_pages(gfp_t gfp_mask, unsigned int order)
 {
 	return alloc_pages_node(numa_node_id(), gfp_mask, order);
 }
+static inline struct page *alloc_pages_mpol(gfp_t gfp, unsigned int order,
+		struct mempolicy *mpol, pgoff_t ilx, int nid)
+{
+	return alloc_pages(gfp, order);
+}
 static inline struct folio *folio_alloc(gfp_t gfp, unsigned int order)
 {
 	return __folio_alloc_node(gfp, order, numa_node_id());
diff --git a/include/linux/mempolicy.h b/include/linux/mempolicy.h
index acdb12fcb6cd..931b118336f4 100644
--- a/include/linux/mempolicy.h
+++ b/include/linux/mempolicy.h
@@ -17,6 +17,8 @@
 
 struct mm_struct;
 
+#define NO_INTERLEAVE_INDEX (-1UL)	/* use task il_prev for interleaving */
+
 #ifdef CONFIG_NUMA
 
 /*
@@ -126,7 +128,9 @@ struct mempolicy *mpol_shared_policy_lookup(struct shared_policy *sp,
 
 struct mempolicy *get_task_policy(struct task_struct *p);
 struct mempolicy *__get_vma_policy(struct vm_area_struct *vma,
-		unsigned long addr);
+		unsigned long addr, pgoff_t *ilx);
+struct mempolicy *get_vma_policy(struct vm_area_struct *vma,
+		unsigned long addr, int order, pgoff_t *ilx);
 bool vma_policy_mof(struct vm_area_struct *vma);
 
 extern void numa_default_policy(void);
@@ -140,8 +144,6 @@ extern int huge_node(struct vm_area_struct *vma,
 extern bool init_nodemask_of_mempolicy(nodemask_t *mask);
 extern bool mempolicy_in_oom_domain(struct task_struct *tsk,
 				const nodemask_t *mask);
-extern nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *policy);
-
 extern unsigned int mempolicy_slab_node(void);
 
 extern enum zone_type policy_zone;
@@ -179,6 +181,11 @@ extern bool apply_policy_zone(struct mempolicy *policy, enum zone_type zone);
 
 struct mempolicy {};
 
+static inline struct mempolicy *get_task_policy(struct task_struct *p)
+{
+	return NULL;
+}
+
 static inline bool mpol_equal(struct mempolicy *a, struct mempolicy *b)
 {
 	return true;
@@ -213,6 +220,13 @@ mpol_shared_policy_lookup(struct shared_policy *sp, pgoff_t idx)
 	return NULL;
 }
 
+static inline struct mempolicy *get_vma_policy(struct vm_area_struct *vma,
+				unsigned long addr, int order, pgoff_t *ilx)
+{
+	*ilx = 0;
+	return NULL;
+}
+
 static inline int
 vma_dup_policy(struct vm_area_struct *src, struct vm_area_struct *dst)
 {
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 0e496a8b8e50..14d5aaff96d0 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -619,7 +619,7 @@ struct vm_operations_struct {
 	 * policy.
 	 */
 	struct mempolicy *(*get_policy)(struct vm_area_struct *vma,
-					unsigned long addr);
+					unsigned long addr, pgoff_t *ilx);
 #endif
 	/*
 	 * Called by vm_normal_page() for special PTEs to find the
diff --git a/ipc/shm.c b/ipc/shm.c
index 576a543b7cff..222aaf035afb 100644
--- a/ipc/shm.c
+++ b/ipc/shm.c
@@ -562,30 +562,25 @@ static unsigned long shm_pagesize(struct vm_area_struct *vma)
 }
 
 #ifdef CONFIG_NUMA
-static int shm_set_policy(struct vm_area_struct *vma, struct mempolicy *new)
+static int shm_set_policy(struct vm_area_struct *vma, struct mempolicy *mpol)
 {
-	struct file *file = vma->vm_file;
-	struct shm_file_data *sfd = shm_file_data(file);
+	struct shm_file_data *sfd = shm_file_data(vma->vm_file);
 	int err = 0;
 
 	if (sfd->vm_ops->set_policy)
-		err = sfd->vm_ops->set_policy(vma, new);
+		err = sfd->vm_ops->set_policy(vma, mpol);
 	return err;
 }
 
 static struct mempolicy *shm_get_policy(struct vm_area_struct *vma,
-					unsigned long addr)
+					unsigned long addr, pgoff_t *ilx)
 {
-	struct file *file = vma->vm_file;
-	struct shm_file_data *sfd = shm_file_data(file);
-	struct mempolicy *pol = NULL;
+	struct shm_file_data *sfd = shm_file_data(vma->vm_file);
+	struct mempolicy *mpol = vma->vm_policy;
 
 	if (sfd->vm_ops->get_policy)
-		pol = sfd->vm_ops->get_policy(vma, addr);
-	else if (vma->vm_policy)
-		pol = vma->vm_policy;
-
-	return pol;
+		mpol = sfd->vm_ops->get_policy(vma, addr, ilx);
+	return mpol;
 }
 #endif
 
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 596d580f92d1..ded5508f0972 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -898,6 +898,7 @@ static long do_get_mempolicy(int *policy, nodemask_t *nmask,
 	}
 
 	if (flags & MPOL_F_ADDR) {
+		pgoff_t ilx;		/* ignored here */
 		/*
 		 * Do NOT fall back to task policy if the
 		 * vma/shared policy at addr is NULL.  We
@@ -909,10 +910,7 @@ static long do_get_mempolicy(int *policy, nodemask_t *nmask,
 			mmap_read_unlock(mm);
 			return -EFAULT;
 		}
-		if (vma->vm_ops && vma->vm_ops->get_policy)
-			pol = vma->vm_ops->get_policy(vma, addr);
-		else
-			pol = vma->vm_policy;
+		pol = __get_vma_policy(vma, addr, &ilx);
 	} else if (addr)
 		return -EINVAL;
 
@@ -1170,6 +1168,15 @@ static struct folio *new_folio(struct folio *src, unsigned long start)
 			break;
 	}
 
+	/*
+	 * __get_vma_policy() now expects a genuine non-NULL vma. Return NULL
+	 * when the page can no longer be located in a vma: that is not ideal
+	 * (migrate_pages() will give up early, presuming ENOMEM), but good
+	 * enough to avoid a crash by syzkaller or concurrent holepunch.
+	 */
+	if (!vma)
+		return NULL;
+
 	if (folio_test_hugetlb(src)) {
 		return alloc_hugetlb_folio_vma(folio_hstate(src),
 				vma, address);
@@ -1178,9 +1185,6 @@ static struct folio *new_folio(struct folio *src, unsigned long start)
 	if (folio_test_large(src))
 		gfp = GFP_TRANSHUGE;
 
-	/*
-	 * if !vma, vma_alloc_folio() will use task or system default policy
-	 */
 	return vma_alloc_folio(gfp, folio_order(src), vma, address,
 			folio_test_large(src));
 }
@@ -1690,34 +1694,19 @@ bool vma_migratable(struct vm_area_struct *vma)
 }
 
 struct mempolicy *__get_vma_policy(struct vm_area_struct *vma,
-						unsigned long addr)
+				   unsigned long addr, pgoff_t *ilx)
 {
-	struct mempolicy *pol = NULL;
-
-	if (vma) {
-		if (vma->vm_ops && vma->vm_ops->get_policy) {
-			pol = vma->vm_ops->get_policy(vma, addr);
-		} else if (vma->vm_policy) {
-			pol = vma->vm_policy;
-
-			/*
-			 * shmem_alloc_page() passes MPOL_F_SHARED policy with
-			 * a pseudo vma whose vma->vm_ops=NULL. Take a reference
-			 * count on these policies which will be dropped by
-			 * mpol_cond_put() later
-			 */
-			if (mpol_needs_cond_ref(pol))
-				mpol_get(pol);
-		}
-	}
-
-	return pol;
+	*ilx = 0;
+	return (vma->vm_ops && vma->vm_ops->get_policy) ?
+		vma->vm_ops->get_policy(vma, addr, ilx) : vma->vm_policy;
 }
 
 /*
- * get_vma_policy(@vma, @addr)
+ * get_vma_policy(@vma, @addr, @order, @ilx)
  * @vma: virtual memory area whose policy is sought
  * @addr: address in @vma for shared policy lookup
+ * @order: 0, or appropriate huge_page_order for interleaving
+ * @ilx: interleave index (output), for use only when MPOL_INTERLEAVE
  *
  * Returns effective policy for a VMA at specified address.
  * Falls back to current->mempolicy or system default policy, as necessary.
@@ -1726,14 +1715,18 @@ struct mempolicy *__get_vma_policy(struct vm_area_struct *vma,
  * freeing by another task.  It is the caller's responsibility to free the
  * extra reference for shared policies.
  */
-static struct mempolicy *get_vma_policy(struct vm_area_struct *vma,
-						unsigned long addr)
+struct mempolicy *get_vma_policy(struct vm_area_struct *vma,
+				 unsigned long addr, int order, pgoff_t *ilx)
 {
-	struct mempolicy *pol = __get_vma_policy(vma, addr);
+	struct mempolicy *pol;
 
+	pol = __get_vma_policy(vma, addr, ilx);
 	if (!pol)
 		pol = get_task_policy(current);
-
+	if (pol->mode == MPOL_INTERLEAVE) {
+		*ilx += vma->vm_pgoff >> order;
+		*ilx += (addr - vma->vm_start) >> (PAGE_SHIFT + order);
+	}
 	return pol;
 }
 
@@ -1743,8 +1736,9 @@ bool vma_policy_mof(struct vm_area_struct *vma)
 
 	if (vma->vm_ops && vma->vm_ops->get_policy) {
 		bool ret = false;
+		pgoff_t ilx;		/* ignored here */
 
-		pol = vma->vm_ops->get_policy(vma, vma->vm_start);
+		pol = vma->vm_ops->get_policy(vma, vma->vm_start, &ilx);
 		if (pol && (pol->flags & MPOL_F_MOF))
 			ret = true;
 		mpol_cond_put(pol);
@@ -1779,54 +1773,6 @@ bool apply_policy_zone(struct mempolicy *policy, enum zone_type zone)
 	return zone >= dynamic_policy_zone;
 }
 
-/*
- * Return a nodemask representing a mempolicy for filtering nodes for
- * page allocation
- */
-nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *policy)
-{
-	int mode = policy->mode;
-
-	/* Lower zones don't get a nodemask applied for MPOL_BIND */
-	if (unlikely(mode == MPOL_BIND) &&
-		apply_policy_zone(policy, gfp_zone(gfp)) &&
-		cpuset_nodemask_valid_mems_allowed(&policy->nodes))
-		return &policy->nodes;
-
-	if (mode == MPOL_PREFERRED_MANY)
-		return &policy->nodes;
-
-	return NULL;
-}
-
-/*
- * Return the  preferred node id for 'prefer' mempolicy, and return
- * the given id for all other policies.
- *
- * policy_node() is always coupled with policy_nodemask(), which
- * secures the nodemask limit for 'bind' and 'prefer-many' policy.
- */
-static int policy_node(gfp_t gfp, struct mempolicy *policy, int nid)
-{
-	if (policy->mode == MPOL_PREFERRED) {
-		nid = first_node(policy->nodes);
-	} else {
-		/*
-		 * __GFP_THISNODE shouldn't even be used with the bind policy
-		 * because we might easily break the expectation to stay on the
-		 * requested node and not break the policy.
-		 */
-		WARN_ON_ONCE(policy->mode == MPOL_BIND && (gfp & __GFP_THISNODE));
-	}
-
-	if ((policy->mode == MPOL_BIND ||
-	     policy->mode == MPOL_PREFERRED_MANY) &&
-	    policy->home_node != NUMA_NO_NODE)
-		return policy->home_node;
-
-	return nid;
-}
-
 /* Do dynamic interleaving for a process */
 static unsigned int interleave_nodes(struct mempolicy *policy)
 {
@@ -1886,11 +1832,11 @@ unsigned int mempolicy_slab_node(void)
 }
 
 /*
- * Do static interleaving for a VMA with known offset @n.  Returns the n'th
- * node in pol->nodes (starting from n=0), wrapping around if n exceeds the
- * number of present nodes.
+ * Do static interleaving for interleave index @ilx.  Returns the ilx'th
+ * node in pol->nodes (starting from ilx=0), wrapping around if ilx
+ * exceeds the number of present nodes.
  */
-static unsigned offset_il_node(struct mempolicy *pol, unsigned long n)
+static unsigned int interleave_nid(struct mempolicy *pol, pgoff_t ilx)
 {
 	nodemask_t nodemask = pol->nodes;
 	unsigned int target, nnodes;
@@ -1908,33 +1854,54 @@ static unsigned offset_il_node(struct mempolicy *pol, unsigned long n)
 	nnodes = nodes_weight(nodemask);
 	if (!nnodes)
 		return numa_node_id();
-	target = (unsigned int)n % nnodes;
+	target = ilx % nnodes;
 	nid = first_node(nodemask);
 	for (i = 0; i < target; i++)
 		nid = next_node(nid, nodemask);
 	return nid;
 }
 
-/* Determine a node number for interleave */
-static inline unsigned interleave_nid(struct mempolicy *pol,
-		 struct vm_area_struct *vma, unsigned long addr, int shift)
+/*
+ * Return a nodemask representing a mempolicy for filtering nodes for
+ * page allocation, together with preferred node id (or the input node id).
+ */
+static nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *pol,
+				   pgoff_t ilx, int *nid)
 {
-	if (vma) {
-		unsigned long off;
+	nodemask_t *nodemask = NULL;
 
+	switch (pol->mode) {
+	case MPOL_PREFERRED:
+		/* Override input node id */
+		*nid = first_node(pol->nodes);
+		break;
+	case MPOL_PREFERRED_MANY:
+		nodemask = &pol->nodes;
+		if (pol->home_node != NUMA_NO_NODE)
+			*nid = pol->home_node;
+		break;
+	case MPOL_BIND:
+		/* Restrict to nodemask (but not on lower zones) */
+		if (apply_policy_zone(pol, gfp_zone(gfp)) &&
+		    cpuset_nodemask_valid_mems_allowed(&pol->nodes))
+			nodemask = &pol->nodes;
+		if (pol->home_node != NUMA_NO_NODE)
+			*nid = pol->home_node;
 		/*
-		 * for small pages, there is no difference between
-		 * shift and PAGE_SHIFT, so the bit-shift is safe.
-		 * for huge pages, since vm_pgoff is in units of small
-		 * pages, we need to shift off the always 0 bits to get
-		 * a useful offset.
+		 * __GFP_THISNODE shouldn't even be used with the bind policy
+		 * because we might easily break the expectation to stay on the
+		 * requested node and not break the policy.
 		 */
-		BUG_ON(shift < PAGE_SHIFT);
-		off = vma->vm_pgoff >> (shift - PAGE_SHIFT);
-		off += (addr - vma->vm_start) >> shift;
-		return offset_il_node(pol, off);
-	} else
-		return interleave_nodes(pol);
+		WARN_ON_ONCE(gfp & __GFP_THISNODE);
+		break;
+	case MPOL_INTERLEAVE:
+		/* Override input node id */
+		*nid = (ilx == NO_INTERLEAVE_INDEX) ?
+			interleave_nodes(pol) : interleave_nid(pol, ilx);
+		break;
+	}
+
+	return nodemask;
 }
 
 #ifdef CONFIG_HUGETLBFS
@@ -1950,27 +1917,16 @@ static inline unsigned interleave_nid(struct mempolicy *pol,
  * to the struct mempolicy for conditional unref after allocation.
  * If the effective policy is 'bind' or 'prefer-many', returns a pointer
  * to the mempolicy's @nodemask for filtering the zonelist.
- *
- * Must be protected by read_mems_allowed_begin()
  */
 int huge_node(struct vm_area_struct *vma, unsigned long addr, gfp_t gfp_flags,
-				struct mempolicy **mpol, nodemask_t **nodemask)
+		struct mempolicy **mpol, nodemask_t **nodemask)
 {
+	pgoff_t ilx;
 	int nid;
-	int mode;
 
-	*mpol = get_vma_policy(vma, addr);
-	*nodemask = NULL;
-	mode = (*mpol)->mode;
-
-	if (unlikely(mode == MPOL_INTERLEAVE)) {
-		nid = interleave_nid(*mpol, vma, addr,
-					huge_page_shift(hstate_vma(vma)));
-	} else {
-		nid = policy_node(gfp_flags, *mpol, numa_node_id());
-		if (mode == MPOL_BIND || mode == MPOL_PREFERRED_MANY)
-			*nodemask = &(*mpol)->nodes;
-	}
+	nid = numa_node_id();
+	*mpol = get_vma_policy(vma, addr, hstate_vma(vma)->order, &ilx);
+	*nodemask = policy_nodemask(gfp_flags, *mpol, ilx, &nid);
 	return nid;
 }
 
@@ -2048,27 +2004,8 @@ bool mempolicy_in_oom_domain(struct task_struct *tsk,
 	return ret;
 }
 
-/* Allocate a page in interleaved policy.
-   Own path because it needs to do special accounting. */
-static struct page *alloc_page_interleave(gfp_t gfp, unsigned order,
-					unsigned nid)
-{
-	struct page *page;
-
-	page = __alloc_pages(gfp, order, nid, NULL);
-	/* skip NUMA_INTERLEAVE_HIT counter update if numa stats is disabled */
-	if (!static_branch_likely(&vm_numa_stat_key))
-		return page;
-	if (page && page_to_nid(page) == nid) {
-		preempt_disable();
-		__count_numa_event(page_zone(page), NUMA_INTERLEAVE_HIT);
-		preempt_enable();
-	}
-	return page;
-}
-
 static struct page *alloc_pages_preferred_many(gfp_t gfp, unsigned int order,
-						int nid, struct mempolicy *pol)
+						int nid, nodemask_t *nodemask)
 {
 	struct page *page;
 	gfp_t preferred_gfp;
@@ -2081,7 +2018,7 @@ static struct page *alloc_pages_preferred_many(gfp_t gfp, unsigned int order,
 	 */
 	preferred_gfp = gfp | __GFP_NOWARN;
 	preferred_gfp &= ~(__GFP_DIRECT_RECLAIM | __GFP_NOFAIL);
-	page = __alloc_pages(preferred_gfp, order, nid, &pol->nodes);
+	page = __alloc_pages(preferred_gfp, order, nid, nodemask);
 	if (!page)
 		page = __alloc_pages(gfp, order, nid, NULL);
 
@@ -2089,55 +2026,29 @@ static struct page *alloc_pages_preferred_many(gfp_t gfp, unsigned int order,
 }
 
 /**
- * vma_alloc_folio - Allocate a folio for a VMA.
+ * alloc_pages_mpol - Allocate pages according to NUMA mempolicy.
  * @gfp: GFP flags.
- * @order: Order of the folio.
- * @vma: Pointer to VMA or NULL if not available.
- * @addr: Virtual address of the allocation.  Must be inside @vma.
- * @hugepage: For hugepages try only the preferred node if possible.
- *
- * Allocate a folio for a specific address in @vma, using the appropriate
- * NUMA policy.  When @vma is not NULL the caller must hold the mmap_lock
- * of the mm_struct of the VMA to prevent it from going away.  Should be
- * used for all allocations for folios that will be mapped into user space.
+ * @order: Order of the page allocation.
+ * @pol: Pointer to the NUMA mempolicy.
+ * @ilx: Index for interleave mempolicy (also distinguishes alloc_pages()).
+ * @nid: Preferred node (usually numa_node_id() but @mpol may override it).
  *
- * Return: The folio on success or NULL if allocation fails.
+ * Return: The page on success or NULL if allocation fails.
  */
-struct folio *vma_alloc_folio(gfp_t gfp, int order, struct vm_area_struct *vma,
-		unsigned long addr, bool hugepage)
+struct page *alloc_pages_mpol(gfp_t gfp, unsigned int order,
+		struct mempolicy *pol, pgoff_t ilx, int nid)
 {
-	struct mempolicy *pol;
-	int node = numa_node_id();
-	struct folio *folio;
-	int preferred_nid;
-	nodemask_t *nmask;
-
-	pol = get_vma_policy(vma, addr);
-
-	if (pol->mode == MPOL_INTERLEAVE) {
-		struct page *page;
-		unsigned nid;
-
-		nid = interleave_nid(pol, vma, addr, PAGE_SHIFT + order);
-		mpol_cond_put(pol);
-		gfp |= __GFP_COMP;
-		page = alloc_page_interleave(gfp, order, nid);
-		return page_rmappable_folio(page);
-	}
-
-	if (pol->mode == MPOL_PREFERRED_MANY) {
-		struct page *page;
+	nodemask_t *nodemask;
+	struct page *page;
 
-		node = policy_node(gfp, pol, node);
-		gfp |= __GFP_COMP;
-		page = alloc_pages_preferred_many(gfp, order, node, pol);
-		mpol_cond_put(pol);
-		return page_rmappable_folio(page);
-	}
+	nodemask = policy_nodemask(gfp, pol, ilx, &nid);
 
-	if (unlikely(IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) && hugepage)) {
-		int hpage_node = node;
+	if (pol->mode == MPOL_PREFERRED_MANY)
+		return alloc_pages_preferred_many(gfp, order, nid, nodemask);
 
+	if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) &&
+	    /* filter "hugepage" allocation, unless from alloc_pages() */
+	    order == HPAGE_PMD_ORDER && ilx != NO_INTERLEAVE_INDEX) {
 		/*
 		 * For hugepage allocation and non-interleave policy which
 		 * allows the current node (or other explicitly preferred
@@ -2148,39 +2059,68 @@ struct folio *vma_alloc_folio(gfp_t gfp, int order, struct vm_area_struct *vma,
 		 * If the policy is interleave or does not allow the current
 		 * node in its nodemask, we allocate the standard way.
 		 */
-		if (pol->mode == MPOL_PREFERRED)
-			hpage_node = first_node(pol->nodes);
-
-		nmask = policy_nodemask(gfp, pol);
-		if (!nmask || node_isset(hpage_node, *nmask)) {
-			mpol_cond_put(pol);
+		if (pol->mode != MPOL_INTERLEAVE &&
+		    (!nodemask || node_isset(nid, *nodemask))) {
 			/*
 			 * First, try to allocate THP only on local node, but
 			 * don't reclaim unnecessarily, just compact.
 			 */
-			folio = __folio_alloc_node(gfp | __GFP_THISNODE |
-					__GFP_NORETRY, order, hpage_node);
-
+			page = __alloc_pages_node(nid,
+				gfp | __GFP_THISNODE | __GFP_NORETRY, order);
+			if (page || !(gfp & __GFP_DIRECT_RECLAIM))
+				return page;
 			/*
 			 * If hugepage allocations are configured to always
 			 * synchronous compact or the vma has been madvised
 			 * to prefer hugepage backing, retry allowing remote
 			 * memory with both reclaim and compact as well.
 			 */
-			if (!folio && (gfp & __GFP_DIRECT_RECLAIM))
-				folio = __folio_alloc(gfp, order, hpage_node,
-						      nmask);
+		}
+	}
 
-			goto out;
+	page = __alloc_pages(gfp, order, nid, nodemask);
+
+	if (unlikely(pol->mode == MPOL_INTERLEAVE) && page) {
+		/* skip NUMA_INTERLEAVE_HIT update if numa stats is disabled */
+		if (static_branch_likely(&vm_numa_stat_key) &&
+		    page_to_nid(page) == nid) {
+			preempt_disable();
+			__count_numa_event(page_zone(page), NUMA_INTERLEAVE_HIT);
+			preempt_enable();
 		}
 	}
 
-	nmask = policy_nodemask(gfp, pol);
-	preferred_nid = policy_node(gfp, pol, node);
-	folio = __folio_alloc(gfp, order, preferred_nid, nmask);
+	return page;
+}
+
+/**
+ * vma_alloc_folio - Allocate a folio for a VMA.
+ * @gfp: GFP flags.
+ * @order: Order of the folio.
+ * @vma: Pointer to VMA.
+ * @addr: Virtual address of the allocation.  Must be inside @vma.
+ * @hugepage: Unused (was: For hugepages try only preferred node if possible).
+ *
+ * Allocate a folio for a specific address in @vma, using the appropriate
+ * NUMA policy.  The caller must hold the mmap_lock of the mm_struct of the
+ * VMA to prevent it from going away.  Should be used for all allocations
+ * for folios that will be mapped into user space, excepting hugetlbfs, and
+ * excepting where direct use of alloc_pages_mpol() is more appropriate.
+ *
+ * Return: The folio on success or NULL if allocation fails.
+ */
+struct folio *vma_alloc_folio(gfp_t gfp, int order, struct vm_area_struct *vma,
+		unsigned long addr, bool hugepage)
+{
+	struct mempolicy *pol;
+	pgoff_t ilx;
+	struct page *page;
+
+	pol = get_vma_policy(vma, addr, order, &ilx);
+	page = alloc_pages_mpol(gfp | __GFP_COMP, order,
+				pol, ilx, numa_node_id());
 	mpol_cond_put(pol);
-out:
-	return folio;
+	return page_rmappable_folio(page);
 }
 EXPORT_SYMBOL(vma_alloc_folio);
 
@@ -2198,33 +2138,23 @@ EXPORT_SYMBOL(vma_alloc_folio);
  * flags are used.
  * Return: The page on success or NULL if allocation fails.
  */
-struct page *alloc_pages(gfp_t gfp, unsigned order)
+struct page *alloc_pages(gfp_t gfp, unsigned int order)
 {
 	struct mempolicy *pol = &default_policy;
-	struct page *page;
-
-	if (!in_interrupt() && !(gfp & __GFP_THISNODE))
-		pol = get_task_policy(current);
 
 	/*
 	 * No reference counting needed for current->mempolicy
 	 * nor system default_policy
 	 */
-	if (pol->mode == MPOL_INTERLEAVE)
-		page = alloc_page_interleave(gfp, order, interleave_nodes(pol));
-	else if (pol->mode == MPOL_PREFERRED_MANY)
-		page = alloc_pages_preferred_many(gfp, order,
-				  policy_node(gfp, pol, numa_node_id()), pol);
-	else
-		page = __alloc_pages(gfp, order,
-				policy_node(gfp, pol, numa_node_id()),
-				policy_nodemask(gfp, pol));
+	if (!in_interrupt() && !(gfp & __GFP_THISNODE))
+		pol = get_task_policy(current);
 
-	return page;
+	return alloc_pages_mpol(gfp, order,
+				pol, NO_INTERLEAVE_INDEX, numa_node_id());
 }
 EXPORT_SYMBOL(alloc_pages);
 
-struct folio *folio_alloc(gfp_t gfp, unsigned order)
+struct folio *folio_alloc(gfp_t gfp, unsigned int order)
 {
 	return page_rmappable_folio(alloc_pages(gfp | __GFP_COMP, order));
 }
@@ -2295,6 +2225,8 @@ unsigned long alloc_pages_bulk_array_mempolicy(gfp_t gfp,
 		unsigned long nr_pages, struct page **page_array)
 {
 	struct mempolicy *pol = &default_policy;
+	nodemask_t *nodemask;
+	int nid;
 
 	if (!in_interrupt() && !(gfp & __GFP_THISNODE))
 		pol = get_task_policy(current);
@@ -2307,9 +2239,10 @@ unsigned long alloc_pages_bulk_array_mempolicy(gfp_t gfp,
 		return alloc_pages_bulk_array_preferred_many(gfp,
 				numa_node_id(), pol, nr_pages, page_array);
 
-	return __alloc_pages_bulk(gfp, policy_node(gfp, pol, numa_node_id()),
-				  policy_nodemask(gfp, pol), nr_pages, NULL,
-				  page_array);
+	nid = numa_node_id();
+	nodemask = policy_nodemask(gfp, pol, NO_INTERLEAVE_INDEX, &nid);
+	return __alloc_pages_bulk(gfp, nid, nodemask,
+				  nr_pages, NULL, page_array);
 }
 
 int vma_dup_policy(struct vm_area_struct *src, struct vm_area_struct *dst)
@@ -2496,23 +2429,21 @@ int mpol_misplaced(struct folio *folio, struct vm_area_struct *vma,
 		   unsigned long addr)
 {
 	struct mempolicy *pol;
+	pgoff_t ilx;
 	struct zoneref *z;
 	int curnid = folio_nid(folio);
-	unsigned long pgoff;
 	int thiscpu = raw_smp_processor_id();
 	int thisnid = cpu_to_node(thiscpu);
 	int polnid = NUMA_NO_NODE;
 	int ret = NUMA_NO_NODE;
 
-	pol = get_vma_policy(vma, addr);
+	pol = get_vma_policy(vma, addr, folio_order(folio), &ilx);
 	if (!(pol->flags & MPOL_F_MOF))
 		goto out;
 
 	switch (pol->mode) {
 	case MPOL_INTERLEAVE:
-		pgoff = vma->vm_pgoff;
-		pgoff += (addr - vma->vm_start) >> PAGE_SHIFT;
-		polnid = offset_il_node(pol, pgoff);
+		polnid = interleave_nid(pol, ilx);
 		break;
 
 	case MPOL_PREFERRED:
diff --git a/mm/shmem.c b/mm/shmem.c
index bcbe9dbb9f5f..a314a25aea8c 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -1544,38 +1544,20 @@ static inline struct mempolicy *shmem_get_sbmpol(struct shmem_sb_info *sbinfo)
 	return NULL;
 }
 #endif /* CONFIG_NUMA && CONFIG_TMPFS */
-#ifndef CONFIG_NUMA
-#define vm_policy vm_private_data
-#endif
-
-static void shmem_pseudo_vma_init(struct vm_area_struct *vma,
-		struct shmem_inode_info *info, pgoff_t index)
-{
-	/* Create a pseudo vma that just contains the policy */
-	vma_init(vma, NULL);
-	/* Bias interleave by inode number to distribute better across nodes */
-	vma->vm_pgoff = index + info->vfs_inode.i_ino;
-	vma->vm_policy = mpol_shared_policy_lookup(&info->policy, index);
-}
 
-static void shmem_pseudo_vma_destroy(struct vm_area_struct *vma)
-{
-	/* Drop reference taken by mpol_shared_policy_lookup() */
-	mpol_cond_put(vma->vm_policy);
-}
+static struct mempolicy *shmem_get_pgoff_policy(struct shmem_inode_info *info,
+			pgoff_t index, unsigned int order, pgoff_t *ilx);
 
-static struct folio *shmem_swapin(swp_entry_t swap, gfp_t gfp,
+static struct folio *shmem_swapin_cluster(swp_entry_t swap, gfp_t gfp,
 			struct shmem_inode_info *info, pgoff_t index)
 {
-	struct vm_area_struct pvma;
+	struct mempolicy *mpol;
+	pgoff_t ilx;
 	struct page *page;
-	struct vm_fault vmf = {
-		.vma = &pvma,
-	};
 
-	shmem_pseudo_vma_init(&pvma, info, index);
-	page = swap_cluster_readahead(swap, gfp, &vmf);
-	shmem_pseudo_vma_destroy(&pvma);
+	mpol = shmem_get_pgoff_policy(info, index, 0, &ilx);
+	page = swap_cluster_readahead(swap, gfp, mpol, ilx);
+	mpol_cond_put(mpol);
 
 	if (!page)
 		return NULL;
@@ -1609,27 +1591,29 @@ static gfp_t limit_gfp_mask(gfp_t huge_gfp, gfp_t limit_gfp)
 static struct folio *shmem_alloc_hugefolio(gfp_t gfp,
 		struct shmem_inode_info *info, pgoff_t index)
 {
-	struct vm_area_struct pvma;
-	struct folio *folio;
+	struct mempolicy *mpol;
+	pgoff_t ilx;
+	struct page *page;
 
-	shmem_pseudo_vma_init(&pvma, info, index);
-	folio = vma_alloc_folio(gfp, HPAGE_PMD_ORDER, &pvma, 0, true);
-	shmem_pseudo_vma_destroy(&pvma);
+	mpol = shmem_get_pgoff_policy(info, index, HPAGE_PMD_ORDER, &ilx);
+	page = alloc_pages_mpol(gfp, HPAGE_PMD_ORDER, mpol, ilx, numa_node_id());
+	mpol_cond_put(mpol);
 
-	return folio;
+	return page_rmappable_folio(page);
 }
 
 static struct folio *shmem_alloc_folio(gfp_t gfp,
 		struct shmem_inode_info *info, pgoff_t index)
 {
-	struct vm_area_struct pvma;
-	struct folio *folio;
+	struct mempolicy *mpol;
+	pgoff_t ilx;
+	struct page *page;
 
-	shmem_pseudo_vma_init(&pvma, info, index);
-	folio = vma_alloc_folio(gfp, 0, &pvma, 0, false);
-	shmem_pseudo_vma_destroy(&pvma);
+	mpol = shmem_get_pgoff_policy(info, index, 0, &ilx);
+	page = alloc_pages_mpol(gfp, 0, mpol, ilx, numa_node_id());
+	mpol_cond_put(mpol);
 
-	return folio;
+	return (struct folio *)page;
 }
 
 static struct folio *shmem_alloc_and_add_folio(gfp_t gfp,
@@ -1883,7 +1867,7 @@ static int shmem_swapin_folio(struct inode *inode, pgoff_t index,
 			count_memcg_event_mm(fault_mm, PGMAJFAULT);
 		}
 		/* Here we actually start the io */
-		folio = shmem_swapin(swap, gfp, info, index);
+		folio = shmem_swapin_cluster(swap, gfp, info, index);
 		if (!folio) {
 			error = -ENOMEM;
 			goto failed;
@@ -2334,15 +2318,41 @@ static int shmem_set_policy(struct vm_area_struct *vma, struct mempolicy *mpol)
 }
 
 static struct mempolicy *shmem_get_policy(struct vm_area_struct *vma,
-					  unsigned long addr)
+					  unsigned long addr, pgoff_t *ilx)
 {
 	struct inode *inode = file_inode(vma->vm_file);
 	pgoff_t index;
 
+	/*
+	 * Bias interleave by inode number to distribute better across nodes;
+	 * but this interface is independent of which page order is used, so
+	 * supplies only that bias, letting caller apply the offset (adjusted
+	 * by page order, as in shmem_get_pgoff_policy() and get_vma_policy()).
+	 */
+	*ilx = inode->i_ino;
 	index = ((addr - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
 	return mpol_shared_policy_lookup(&SHMEM_I(inode)->policy, index);
 }
-#endif
+
+static struct mempolicy *shmem_get_pgoff_policy(struct shmem_inode_info *info,
+			pgoff_t index, unsigned int order, pgoff_t *ilx)
+{
+	struct mempolicy *mpol;
+
+	/* Bias interleave by inode number to distribute better across nodes */
+	*ilx = info->vfs_inode.i_ino + (index >> order);
+
+	mpol = mpol_shared_policy_lookup(&info->policy, index);
+	return mpol ? mpol : get_task_policy(current);
+}
+#else
+static struct mempolicy *shmem_get_pgoff_policy(struct shmem_inode_info *info,
+			pgoff_t index, unsigned int order, pgoff_t *ilx)
+{
+	*ilx = 0;
+	return NULL;
+}
+#endif /* CONFIG_NUMA */
 
 int shmem_lock(struct file *file, int lock, struct ucounts *ucounts)
 {
diff --git a/mm/swap.h b/mm/swap.h
index 8a3c7a0ace4f..73c332ee4d91 100644
--- a/mm/swap.h
+++ b/mm/swap.h
@@ -2,6 +2,8 @@
 #ifndef _MM_SWAP_H
 #define _MM_SWAP_H
 
+struct mempolicy;
+
 #ifdef CONFIG_SWAP
 #include <linux/blk_types.h> /* for bio_end_io_t */
 
@@ -48,11 +50,10 @@ struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
 				   unsigned long addr,
 				   struct swap_iocb **plug);
 struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
-				     struct vm_area_struct *vma,
-				     unsigned long addr,
+				     struct mempolicy *mpol, pgoff_t ilx,
 				     bool *new_page_allocated);
 struct page *swap_cluster_readahead(swp_entry_t entry, gfp_t flag,
-				    struct vm_fault *vmf);
+				    struct mempolicy *mpol, pgoff_t ilx);
 struct page *swapin_readahead(swp_entry_t entry, gfp_t flag,
 			      struct vm_fault *vmf);
 
@@ -80,7 +81,7 @@ static inline void show_swap_cache_info(void)
 }
 
 static inline struct page *swap_cluster_readahead(swp_entry_t entry,
-				gfp_t gfp_mask, struct vm_fault *vmf)
+			gfp_t gfp_mask, struct mempolicy *mpol, pgoff_t ilx)
 {
 	return NULL;
 }
diff --git a/mm/swap_state.c b/mm/swap_state.c
index ab79ffb71736..85d9e5806a6a 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -10,6 +10,7 @@
 #include <linux/mm.h>
 #include <linux/gfp.h>
 #include <linux/kernel_stat.h>
+#include <linux/mempolicy.h>
 #include <linux/swap.h>
 #include <linux/swapops.h>
 #include <linux/init.h>
@@ -410,8 +411,8 @@ struct folio *filemap_get_incore_folio(struct address_space *mapping,
 }
 
 struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
-			struct vm_area_struct *vma, unsigned long addr,
-			bool *new_page_allocated)
+				     struct mempolicy *mpol, pgoff_t ilx,
+				     bool *new_page_allocated)
 {
 	struct swap_info_struct *si;
 	struct folio *folio;
@@ -453,7 +454,8 @@ struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
 		 * before marking swap_map SWAP_HAS_CACHE, when -EEXIST will
 		 * cause any racers to loop around until we add it to cache.
 		 */
-		folio = vma_alloc_folio(gfp_mask, 0, vma, addr, false);
+		folio = (struct folio *)alloc_pages_mpol(gfp_mask, 0,
+						mpol, ilx, numa_node_id());
 		if (!folio)
                         goto fail_put_swap;
 
@@ -528,14 +530,19 @@ struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
 				   struct vm_area_struct *vma,
 				   unsigned long addr, struct swap_iocb **plug)
 {
-	bool page_was_allocated;
-	struct page *retpage = __read_swap_cache_async(entry, gfp_mask,
-			vma, addr, &page_was_allocated);
+	bool page_allocated;
+	struct mempolicy *mpol;
+	pgoff_t ilx;
+	struct page *page;
 
-	if (page_was_allocated)
-		swap_readpage(retpage, false, plug);
+	mpol = get_vma_policy(vma, addr, 0, &ilx);
+	page = __read_swap_cache_async(entry, gfp_mask, mpol, ilx,
+					&page_allocated);
+	mpol_cond_put(mpol);
 
-	return retpage;
+	if (page_allocated)
+		swap_readpage(page, false, plug);
+	return page;
 }
 
 static unsigned int __swapin_nr_pages(unsigned long prev_offset,
@@ -603,7 +610,8 @@ static unsigned long swapin_nr_pages(unsigned long offset)
  * swap_cluster_readahead - swap in pages in hope we need them soon
  * @entry: swap entry of this memory
  * @gfp_mask: memory allocation flags
- * @vmf: fault information
+ * @mpol: NUMA memory allocation policy to be applied
+ * @ilx: NUMA interleave index, for use only when MPOL_INTERLEAVE
  *
  * Returns the struct page for entry and addr, after queueing swapin.
  *
@@ -612,13 +620,12 @@ static unsigned long swapin_nr_pages(unsigned long offset)
  * because it doesn't cost us any seek time.  We also make sure to queue
  * the 'original' request together with the readahead ones...
  *
- * This has been extended to use the NUMA policies from the mm triggering
- * the readahead.
- *
- * Caller must hold read mmap_lock if vmf->vma is not NULL.
+ * Note: it is intentional that the same NUMA policy and interleave index
+ * are used for every page of the readahead: neighbouring pages on swap
+ * are fairly likely to have been swapped out from the same node.
  */
 struct page *swap_cluster_readahead(swp_entry_t entry, gfp_t gfp_mask,
-				struct vm_fault *vmf)
+				    struct mempolicy *mpol, pgoff_t ilx)
 {
 	struct page *page;
 	unsigned long entry_offset = swp_offset(entry);
@@ -629,8 +636,6 @@ struct page *swap_cluster_readahead(swp_entry_t entry, gfp_t gfp_mask,
 	struct blk_plug plug;
 	struct swap_iocb *splug = NULL;
 	bool page_allocated;
-	struct vm_area_struct *vma = vmf->vma;
-	unsigned long addr = vmf->address;
 
 	mask = swapin_nr_pages(offset) - 1;
 	if (!mask)
@@ -648,8 +653,8 @@ struct page *swap_cluster_readahead(swp_entry_t entry, gfp_t gfp_mask,
 	for (offset = start_offset; offset <= end_offset ; offset++) {
 		/* Ok, do the async read-ahead now */
 		page = __read_swap_cache_async(
-			swp_entry(swp_type(entry), offset),
-			gfp_mask, vma, addr, &page_allocated);
+				swp_entry(swp_type(entry), offset),
+				gfp_mask, mpol, ilx, &page_allocated);
 		if (!page)
 			continue;
 		if (page_allocated) {
@@ -663,11 +668,14 @@ struct page *swap_cluster_readahead(swp_entry_t entry, gfp_t gfp_mask,
 	}
 	blk_finish_plug(&plug);
 	swap_read_unplug(splug);
-
 	lru_add_drain();	/* Push any new pages onto the LRU now */
 skip:
 	/* The page was likely read above, so no need for plugging here */
-	return read_swap_cache_async(entry, gfp_mask, vma, addr, NULL);
+	page = __read_swap_cache_async(entry, gfp_mask, mpol, ilx,
+					&page_allocated);
+	if (unlikely(page_allocated))
+		swap_readpage(page, false, NULL);
+	return page;
 }
 
 int init_swap_address_space(unsigned int type, unsigned long nr_pages)
@@ -765,8 +773,10 @@ static void swap_ra_info(struct vm_fault *vmf,
 
 /**
  * swap_vma_readahead - swap in pages in hope we need them soon
- * @fentry: swap entry of this memory
+ * @targ_entry: swap entry of the targeted memory
  * @gfp_mask: memory allocation flags
+ * @mpol: NUMA memory allocation policy to be applied
+ * @targ_ilx: NUMA interleave index, for use only when MPOL_INTERLEAVE
  * @vmf: fault information
  *
  * Returns the struct page for entry and addr, after queueing swapin.
@@ -777,16 +787,17 @@ static void swap_ra_info(struct vm_fault *vmf,
  * Caller must hold read mmap_lock if vmf->vma is not NULL.
  *
  */
-static struct page *swap_vma_readahead(swp_entry_t fentry, gfp_t gfp_mask,
+static struct page *swap_vma_readahead(swp_entry_t targ_entry, gfp_t gfp_mask,
+				       struct mempolicy *mpol, pgoff_t targ_ilx,
 				       struct vm_fault *vmf)
 {
 	struct blk_plug plug;
 	struct swap_iocb *splug = NULL;
-	struct vm_area_struct *vma = vmf->vma;
 	struct page *page;
 	pte_t *pte = NULL, pentry;
 	unsigned long addr;
 	swp_entry_t entry;
+	pgoff_t ilx;
 	unsigned int i;
 	bool page_allocated;
 	struct vma_swap_readahead ra_info = {
@@ -798,9 +809,10 @@ static struct page *swap_vma_readahead(swp_entry_t fentry, gfp_t gfp_mask,
 		goto skip;
 
 	addr = vmf->address - (ra_info.offset * PAGE_SIZE);
+	ilx = targ_ilx - ra_info.offset;
 
 	blk_start_plug(&plug);
-	for (i = 0; i < ra_info.nr_pte; i++, addr += PAGE_SIZE) {
+	for (i = 0; i < ra_info.nr_pte; i++, ilx++, addr += PAGE_SIZE) {
 		if (!pte++) {
 			pte = pte_offset_map(vmf->pmd, addr);
 			if (!pte)
@@ -814,8 +826,8 @@ static struct page *swap_vma_readahead(swp_entry_t fentry, gfp_t gfp_mask,
 			continue;
 		pte_unmap(pte);
 		pte = NULL;
-		page = __read_swap_cache_async(entry, gfp_mask, vma,
-					       addr, &page_allocated);
+		page = __read_swap_cache_async(entry, gfp_mask, mpol, ilx,
+						&page_allocated);
 		if (!page)
 			continue;
 		if (page_allocated) {
@@ -834,8 +846,11 @@ static struct page *swap_vma_readahead(swp_entry_t fentry, gfp_t gfp_mask,
 	lru_add_drain();
 skip:
 	/* The page was likely read above, so no need for plugging here */
-	return read_swap_cache_async(fentry, gfp_mask, vma, vmf->address,
-				     NULL);
+	page = __read_swap_cache_async(targ_entry, gfp_mask, mpol, targ_ilx,
+					&page_allocated);
+	if (unlikely(page_allocated))
+		swap_readpage(page, false, NULL);
+	return page;
 }
 
 /**
@@ -853,9 +868,16 @@ skip:
 struct page *swapin_readahead(swp_entry_t entry, gfp_t gfp_mask,
 				struct vm_fault *vmf)
 {
-	return swap_use_vma_readahead() ?
-			swap_vma_readahead(entry, gfp_mask, vmf) :
-			swap_cluster_readahead(entry, gfp_mask, vmf);
+	struct mempolicy *mpol;
+	pgoff_t ilx;
+	struct page *page;
+
+	mpol = get_vma_policy(vmf->vma, vmf->address, 0, &ilx);
+	page = swap_use_vma_readahead() ?
+		swap_vma_readahead(entry, gfp_mask, mpol, ilx, vmf) :
+		swap_cluster_readahead(entry, gfp_mask, mpol, ilx);
+	mpol_cond_put(mpol);
+	return page;
 }
 
 #ifdef CONFIG_SYSFS
diff --git a/mm/zswap.c b/mm/zswap.c
index 37d2b1cb2ecb..060857adca76 100644
--- a/mm/zswap.c
+++ b/mm/zswap.c
@@ -24,6 +24,7 @@
 #include <linux/swap.h>
 #include <linux/crypto.h>
 #include <linux/scatterlist.h>
+#include <linux/mempolicy.h>
 #include <linux/mempool.h>
 #include <linux/zpool.h>
 #include <crypto/acompress.h>
@@ -1057,6 +1058,7 @@ static int zswap_writeback_entry(struct zswap_entry *entry,
 {
 	swp_entry_t swpentry = entry->swpentry;
 	struct page *page;
+	struct mempolicy *mpol;
 	struct scatterlist input, output;
 	struct crypto_acomp_ctx *acomp_ctx;
 	struct zpool *pool = zswap_find_zpool(entry);
@@ -1075,8 +1077,9 @@ static int zswap_writeback_entry(struct zswap_entry *entry,
 	}
 
 	/* try to allocate swap cache page */
-	page = __read_swap_cache_async(swpentry, GFP_KERNEL, NULL, 0,
-				       &page_was_allocated);
+	mpol = get_task_policy(current);
+	page = __read_swap_cache_async(swpentry, GFP_KERNEL, mpol,
+				NO_INTERLEAVE_INDEX, &page_was_allocated);
 	if (!page) {
 		ret = -ENOMEM;
 		goto fail;
-- 
cgit v1.2.3


From 72e315f7a750281b4410ac30d8930f735459e72d Mon Sep 17 00:00:00 2001
From: Hugh Dickins <hughd@google.com>
Date: Tue, 3 Oct 2023 02:27:47 -0700
Subject: mempolicy: mmap_lock is not needed while migrating folios

mbind(2) holds down_write of current task's mmap_lock throughout
(exclusive because it needs to set the new mempolicy on the vmas);
migrate_pages(2) holds down_read of pid's mmap_lock throughout.

They both hold mmap_lock across the internal migrate_pages(), under which
all new page allocations (huge or small) are made.  I'm nervous about it;
and migrate_pages() certainly does not need mmap_lock itself.  It's done
this way for mbind(2), because its page allocator is vma_alloc_folio() or
alloc_hugetlb_folio_vma(), both of which depend on vma and address.

Now that we have alloc_pages_mpol(), depending on (refcounted) memory
policy and interleave index, mbind(2) can be modified to use that or
alloc_hugetlb_folio_nodemask(), and then not need mmap_lock across the
internal migrate_pages() at all: add alloc_migration_target_by_mpol() to
replace mbind's new_page().

(After that change, alloc_hugetlb_folio_vma() is used by nothing but a
userfaultfd function: move it out of hugetlb.h and into the #ifdef.)

migrate_pages(2) has chosen its target node before migrating, so can
continue to use the standard alloc_migration_target(); but let it take and
drop mmap_lock just around migrate_to_node()'s queue_pages_range():
neither the node-to-node calculations nor the page migrations need it.

It seems unlikely, but it is conceivable that some userspace depends on
the kernel's mmap_lock exclusion here, instead of doing its own locking:
more likely in a testsuite than in real life.  It is also possible, of
course, that some pages on the list will be munmapped by another thread
before they are migrated, or a newer memory policy applied to the range by
that time: but such races could happen before, as soon as mmap_lock was
dropped, so it does not appear to be a concern.

Link: https://lkml.kernel.org/r/21e564e8-269f-6a89-7ee2-fd612831c289@google.com
Signed-off-by: Hugh Dickins <hughd@google.com>
Cc: Andi Kleen <ak@linux.intel.com>
Cc: Christoph Lameter <cl@linux.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: "Huang, Ying" <ying.huang@intel.com>
Cc: Kefeng Wang <wangkefeng.wang@huawei.com>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Mel Gorman <mgorman@techsingularity.net>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Mike Kravetz <mike.kravetz@oracle.com>
Cc: Nhat Pham <nphamcs@gmail.com>
Cc: Sidhartha Kumar <sidhartha.kumar@oracle.com>
Cc: Suren Baghdasaryan <surenb@google.com>
Cc: Tejun heo <tj@kernel.org>
Cc: Vishal Moola (Oracle) <vishal.moola@gmail.com>
Cc: Yang Shi <shy828301@gmail.com>
Cc: Yosry Ahmed <yosryahmed@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/hugetlb.h |  9 ------
 mm/hugetlb.c            | 38 +++++++++++-----------
 mm/mempolicy.c          | 83 +++++++++++++++++++++++++------------------------
 3 files changed, 63 insertions(+), 67 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index 158ff156710b..d3acecc5db4b 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -748,8 +748,6 @@ struct folio *alloc_hugetlb_folio(struct vm_area_struct *vma,
 				unsigned long addr, int avoid_reserve);
 struct folio *alloc_hugetlb_folio_nodemask(struct hstate *h, int preferred_nid,
 				nodemask_t *nmask, gfp_t gfp_mask);
-struct folio *alloc_hugetlb_folio_vma(struct hstate *h, struct vm_area_struct *vma,
-				unsigned long address);
 int hugetlb_add_to_page_cache(struct folio *folio, struct address_space *mapping,
 			pgoff_t idx);
 void restore_reserve_on_error(struct hstate *h, struct vm_area_struct *vma,
@@ -1072,13 +1070,6 @@ alloc_hugetlb_folio_nodemask(struct hstate *h, int preferred_nid,
 	return NULL;
 }
 
-static inline struct folio *alloc_hugetlb_folio_vma(struct hstate *h,
-					       struct vm_area_struct *vma,
-					       unsigned long address)
-{
-	return NULL;
-}
-
 static inline int __alloc_bootmem_huge_page(struct hstate *h)
 {
 	return 0;
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index dd8065e36038..1169ef2f2176 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -2630,24 +2630,6 @@ struct folio *alloc_hugetlb_folio_nodemask(struct hstate *h, int preferred_nid,
 	return alloc_migrate_hugetlb_folio(h, gfp_mask, preferred_nid, nmask);
 }
 
-/* mempolicy aware migration callback */
-struct folio *alloc_hugetlb_folio_vma(struct hstate *h, struct vm_area_struct *vma,
-		unsigned long address)
-{
-	struct mempolicy *mpol;
-	nodemask_t *nodemask;
-	struct folio *folio;
-	gfp_t gfp_mask;
-	int node;
-
-	gfp_mask = htlb_alloc_mask(h);
-	node = huge_node(vma, address, gfp_mask, &mpol, &nodemask);
-	folio = alloc_hugetlb_folio_nodemask(h, node, nodemask, gfp_mask);
-	mpol_cond_put(mpol);
-
-	return folio;
-}
-
 /*
  * Increase the hugetlb pool such that it can accommodate a reservation
  * of size 'delta'.
@@ -6559,6 +6541,26 @@ out_mutex:
 }
 
 #ifdef CONFIG_USERFAULTFD
+/*
+ * Can probably be eliminated, but still used by hugetlb_mfill_atomic_pte().
+ */
+static struct folio *alloc_hugetlb_folio_vma(struct hstate *h,
+		struct vm_area_struct *vma, unsigned long address)
+{
+	struct mempolicy *mpol;
+	nodemask_t *nodemask;
+	struct folio *folio;
+	gfp_t gfp_mask;
+	int node;
+
+	gfp_mask = htlb_alloc_mask(h);
+	node = huge_node(vma, address, gfp_mask, &mpol, &nodemask);
+	folio = alloc_hugetlb_folio_nodemask(h, node, nodemask, gfp_mask);
+	mpol_cond_put(mpol);
+
+	return folio;
+}
+
 /*
  * Used by userfaultfd UFFDIO_* ioctls. Based on userfaultfd's mfill_atomic_pte
  * with modifications for hugetlb pages.
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index ded5508f0972..0594362f2470 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -415,6 +415,8 @@ static const struct mempolicy_operations mpol_ops[MPOL_MAX] = {
 
 static bool migrate_folio_add(struct folio *folio, struct list_head *foliolist,
 				unsigned long flags);
+static nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *pol,
+				pgoff_t ilx, int *nid);
 
 static bool strictly_unmovable(unsigned long flags)
 {
@@ -1021,6 +1023,8 @@ static long migrate_to_node(struct mm_struct *mm, int source, int dest,
 	node_set(source, nmask);
 
 	VM_BUG_ON(!(flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)));
+
+	mmap_read_lock(mm);
 	vma = find_vma(mm, 0);
 
 	/*
@@ -1031,6 +1035,7 @@ static long migrate_to_node(struct mm_struct *mm, int source, int dest,
 	 */
 	nr_failed = queue_pages_range(mm, vma->vm_start, mm->task_size, &nmask,
 				      flags | MPOL_MF_DISCONTIG_OK, &pagelist);
+	mmap_read_unlock(mm);
 
 	if (!list_empty(&pagelist)) {
 		err = migrate_pages(&pagelist, alloc_migration_target, NULL,
@@ -1059,8 +1064,6 @@ int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from,
 
 	lru_cache_disable();
 
-	mmap_read_lock(mm);
-
 	/*
 	 * Find a 'source' bit set in 'tmp' whose corresponding 'dest'
 	 * bit in 'to' is not also set in 'tmp'.  Clear the found 'source'
@@ -1140,7 +1143,6 @@ int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from,
 		if (err < 0)
 			break;
 	}
-	mmap_read_unlock(mm);
 
 	lru_cache_enable();
 	if (err < 0)
@@ -1149,44 +1151,38 @@ int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from,
 }
 
 /*
- * Allocate a new page for page migration based on vma policy.
- * Start by assuming the page is mapped by the same vma as contains @start.
- * Search forward from there, if not.  N.B., this assumes that the
- * list of pages handed to migrate_pages()--which is how we get here--
- * is in virtual address order.
+ * Allocate a new folio for page migration, according to NUMA mempolicy.
  */
-static struct folio *new_folio(struct folio *src, unsigned long start)
+static struct folio *alloc_migration_target_by_mpol(struct folio *src,
+						    unsigned long private)
 {
-	struct vm_area_struct *vma;
-	unsigned long address;
-	VMA_ITERATOR(vmi, current->mm, start);
-	gfp_t gfp = GFP_HIGHUSER_MOVABLE | __GFP_RETRY_MAYFAIL;
-
-	for_each_vma(vmi, vma) {
-		address = page_address_in_vma(&src->page, vma);
-		if (address != -EFAULT)
-			break;
-	}
+	struct mempolicy *pol = (struct mempolicy *)private;
+	pgoff_t ilx = 0;	/* improve on this later */
+	struct page *page;
+	unsigned int order;
+	int nid = numa_node_id();
+	gfp_t gfp;
 
-	/*
-	 * __get_vma_policy() now expects a genuine non-NULL vma. Return NULL
-	 * when the page can no longer be located in a vma: that is not ideal
-	 * (migrate_pages() will give up early, presuming ENOMEM), but good
-	 * enough to avoid a crash by syzkaller or concurrent holepunch.
-	 */
-	if (!vma)
-		return NULL;
+	order = folio_order(src);
+	ilx += src->index >> order;
 
 	if (folio_test_hugetlb(src)) {
-		return alloc_hugetlb_folio_vma(folio_hstate(src),
-				vma, address);
+		nodemask_t *nodemask;
+		struct hstate *h;
+
+		h = folio_hstate(src);
+		gfp = htlb_alloc_mask(h);
+		nodemask = policy_nodemask(gfp, pol, ilx, &nid);
+		return alloc_hugetlb_folio_nodemask(h, nid, nodemask, gfp);
 	}
 
 	if (folio_test_large(src))
 		gfp = GFP_TRANSHUGE;
+	else
+		gfp = GFP_HIGHUSER_MOVABLE | __GFP_RETRY_MAYFAIL | __GFP_COMP;
 
-	return vma_alloc_folio(gfp, folio_order(src), vma, address,
-			folio_test_large(src));
+	page = alloc_pages_mpol(gfp, order, pol, ilx, nid);
+	return page_rmappable_folio(page);
 }
 #else
 
@@ -1202,7 +1198,8 @@ int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from,
 	return -ENOSYS;
 }
 
-static struct folio *new_folio(struct folio *src, unsigned long start)
+static struct folio *alloc_migration_target_by_mpol(struct folio *src,
+						    unsigned long private)
 {
 	return NULL;
 }
@@ -1276,6 +1273,7 @@ static long do_mbind(unsigned long start, unsigned long len,
 
 	if (nr_failed < 0) {
 		err = nr_failed;
+		nr_failed = 0;
 	} else {
 		vma_iter_init(&vmi, mm, start);
 		prev = vma_prev(&vmi);
@@ -1286,19 +1284,24 @@ static long do_mbind(unsigned long start, unsigned long len,
 		}
 	}
 
-	if (!err) {
-		if (!list_empty(&pagelist)) {
-			nr_failed |= migrate_pages(&pagelist, new_folio, NULL,
-				start, MIGRATE_SYNC, MR_MEMPOLICY_MBIND, NULL);
+	mmap_write_unlock(mm);
+
+	if (!err && !list_empty(&pagelist)) {
+		/* Convert MPOL_DEFAULT's NULL to task or default policy */
+		if (!new) {
+			new = get_task_policy(current);
+			mpol_get(new);
 		}
-		if (nr_failed && (flags & MPOL_MF_STRICT))
-			err = -EIO;
+		nr_failed |= migrate_pages(&pagelist,
+				alloc_migration_target_by_mpol, NULL,
+				(unsigned long)new, MIGRATE_SYNC,
+				MR_MEMPOLICY_MBIND, NULL);
 	}
 
+	if (nr_failed && (flags & MPOL_MF_STRICT))
+		err = -EIO;
 	if (!list_empty(&pagelist))
 		putback_movable_pages(&pagelist);
-
-	mmap_write_unlock(mm);
 mpol_out:
 	mpol_put(new);
 	if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
-- 
cgit v1.2.3


From b56ebe7c896dc78b5865ec2c4b1dae3c93537517 Mon Sep 17 00:00:00 2001
From: Koichiro Den <den@valinux.co.jp>
Date: Thu, 26 Oct 2023 12:20:36 +0900
Subject: x86/apic/msi: Fix misconfigured non-maskable MSI quirk

commit ef8dd01538ea ("genirq/msi: Make interrupt allocation less
convoluted"), reworked the code so that the x86 specific quirk for affinity
setting of non-maskable PCI/MSI interrupts is not longer activated if
necessary.

This could be solved by restoring the original logic in the core MSI code,
but after a deeper analysis it turned out that the quirk flag is not
required at all.

The quirk is only required when the PCI/MSI device cannot mask the MSI
interrupts, which in turn also prevents reservation mode from being enabled
for the affected interrupt.

This allows ot remove the NOMASK quirk bit completely as msi_set_affinity()
can instead check whether reservation mode is enabled for the interrupt,
which gives exactly the same answer.

Even in the momentary non-existing case that the reservation mode would be
not set for a maskable MSI interrupt this would not cause any harm as it
just would cause msi_set_affinity() to go needlessly through the
functionaly equivalent slow path, which works perfectly fine with maskable
interrupts as well.

Rework msi_set_affinity() to query the reservation mode and remove all
NOMASK quirk logic from the core code.

[ tglx: Massaged changelog ]

Fixes: ef8dd01538ea ("genirq/msi: Make interrupt allocation less convoluted")
Suggested-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Koichiro Den <den@valinux.co.jp>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: stable@vger.kernel.org
Link: https://lore.kernel.org/r/20231026032036.2462428-1-den@valinux.co.jp
---
 arch/x86/kernel/apic/msi.c |  8 +++-----
 include/linux/irq.h        | 26 ++++----------------------
 include/linux/msi.h        |  6 ------
 kernel/irq/debugfs.c       |  1 -
 kernel/irq/msi.c           | 12 +-----------
 5 files changed, 8 insertions(+), 45 deletions(-)

(limited to 'include/linux')

diff --git a/arch/x86/kernel/apic/msi.c b/arch/x86/kernel/apic/msi.c
index 6b6b711678fe..d9651f15ae4f 100644
--- a/arch/x86/kernel/apic/msi.c
+++ b/arch/x86/kernel/apic/msi.c
@@ -55,14 +55,14 @@ msi_set_affinity(struct irq_data *irqd, const struct cpumask *mask, bool force)
 	 * caused by the non-atomic update of the address/data pair.
 	 *
 	 * Direct update is possible when:
-	 * - The MSI is maskable (remapped MSI does not use this code path)).
-	 *   The quirk bit is not set in this case.
+	 * - The MSI is maskable (remapped MSI does not use this code path).
+	 *   The reservation mode bit is set in this case.
 	 * - The new vector is the same as the old vector
 	 * - The old vector is MANAGED_IRQ_SHUTDOWN_VECTOR (interrupt starts up)
 	 * - The interrupt is not yet started up
 	 * - The new destination CPU is the same as the old destination CPU
 	 */
-	if (!irqd_msi_nomask_quirk(irqd) ||
+	if (!irqd_can_reserve(irqd) ||
 	    cfg->vector == old_cfg.vector ||
 	    old_cfg.vector == MANAGED_IRQ_SHUTDOWN_VECTOR ||
 	    !irqd_is_started(irqd) ||
@@ -215,8 +215,6 @@ static bool x86_init_dev_msi_info(struct device *dev, struct irq_domain *domain,
 		if (WARN_ON_ONCE(domain != real_parent))
 			return false;
 		info->chip->irq_set_affinity = msi_set_affinity;
-		/* See msi_set_affinity() for the gory details */
-		info->flags |= MSI_FLAG_NOMASK_QUIRK;
 		break;
 	case DOMAIN_BUS_DMAR:
 	case DOMAIN_BUS_AMDVI:
diff --git a/include/linux/irq.h b/include/linux/irq.h
index d8a6fdce9373..90081afa10ce 100644
--- a/include/linux/irq.h
+++ b/include/linux/irq.h
@@ -215,8 +215,6 @@ struct irq_data {
  * IRQD_SINGLE_TARGET		- IRQ allows only a single affinity target
  * IRQD_DEFAULT_TRIGGER_SET	- Expected trigger already been set
  * IRQD_CAN_RESERVE		- Can use reservation mode
- * IRQD_MSI_NOMASK_QUIRK	- Non-maskable MSI quirk for affinity change
- *				  required
  * IRQD_HANDLE_ENFORCE_IRQCTX	- Enforce that handle_irq_*() is only invoked
  *				  from actual interrupt context.
  * IRQD_AFFINITY_ON_ACTIVATE	- Affinity is set on activation. Don't call
@@ -247,11 +245,10 @@ enum {
 	IRQD_SINGLE_TARGET		= BIT(24),
 	IRQD_DEFAULT_TRIGGER_SET	= BIT(25),
 	IRQD_CAN_RESERVE		= BIT(26),
-	IRQD_MSI_NOMASK_QUIRK		= BIT(27),
-	IRQD_HANDLE_ENFORCE_IRQCTX	= BIT(28),
-	IRQD_AFFINITY_ON_ACTIVATE	= BIT(29),
-	IRQD_IRQ_ENABLED_ON_SUSPEND	= BIT(30),
-	IRQD_RESEND_WHEN_IN_PROGRESS    = BIT(31),
+	IRQD_HANDLE_ENFORCE_IRQCTX	= BIT(27),
+	IRQD_AFFINITY_ON_ACTIVATE	= BIT(28),
+	IRQD_IRQ_ENABLED_ON_SUSPEND	= BIT(29),
+	IRQD_RESEND_WHEN_IN_PROGRESS    = BIT(30),
 };
 
 #define __irqd_to_state(d) ACCESS_PRIVATE((d)->common, state_use_accessors)
@@ -426,21 +423,6 @@ static inline bool irqd_can_reserve(struct irq_data *d)
 	return __irqd_to_state(d) & IRQD_CAN_RESERVE;
 }
 
-static inline void irqd_set_msi_nomask_quirk(struct irq_data *d)
-{
-	__irqd_to_state(d) |= IRQD_MSI_NOMASK_QUIRK;
-}
-
-static inline void irqd_clr_msi_nomask_quirk(struct irq_data *d)
-{
-	__irqd_to_state(d) &= ~IRQD_MSI_NOMASK_QUIRK;
-}
-
-static inline bool irqd_msi_nomask_quirk(struct irq_data *d)
-{
-	return __irqd_to_state(d) & IRQD_MSI_NOMASK_QUIRK;
-}
-
 static inline void irqd_set_affinity_on_activate(struct irq_data *d)
 {
 	__irqd_to_state(d) |= IRQD_AFFINITY_ON_ACTIVATE;
diff --git a/include/linux/msi.h b/include/linux/msi.h
index a50ea79522f8..ddace8c34dcf 100644
--- a/include/linux/msi.h
+++ b/include/linux/msi.h
@@ -547,12 +547,6 @@ enum {
 	MSI_FLAG_ALLOC_SIMPLE_MSI_DESCS	= (1 << 5),
 	/* Free MSI descriptors */
 	MSI_FLAG_FREE_MSI_DESCS		= (1 << 6),
-	/*
-	 * Quirk to handle MSI implementations which do not provide
-	 * masking. Currently known to affect x86, but has to be partially
-	 * handled in the core MSI code.
-	 */
-	MSI_FLAG_NOMASK_QUIRK		= (1 << 7),
 
 	/* Mask for the generic functionality */
 	MSI_GENERIC_FLAGS_MASK		= GENMASK(15, 0),
diff --git a/kernel/irq/debugfs.c b/kernel/irq/debugfs.c
index 5971a66be034..aae0402507ed 100644
--- a/kernel/irq/debugfs.c
+++ b/kernel/irq/debugfs.c
@@ -121,7 +121,6 @@ static const struct irq_bit_descr irqdata_states[] = {
 	BIT_MASK_DESCR(IRQD_AFFINITY_ON_ACTIVATE),
 	BIT_MASK_DESCR(IRQD_MANAGED_SHUTDOWN),
 	BIT_MASK_DESCR(IRQD_CAN_RESERVE),
-	BIT_MASK_DESCR(IRQD_MSI_NOMASK_QUIRK),
 
 	BIT_MASK_DESCR(IRQD_FORWARDED_TO_VCPU),
 
diff --git a/kernel/irq/msi.c b/kernel/irq/msi.c
index b4c31a5c1147..79b4a58ba9c3 100644
--- a/kernel/irq/msi.c
+++ b/kernel/irq/msi.c
@@ -1204,7 +1204,6 @@ static int msi_handle_pci_fail(struct irq_domain *domain, struct msi_desc *desc,
 
 #define VIRQ_CAN_RESERVE	0x01
 #define VIRQ_ACTIVATE		0x02
-#define VIRQ_NOMASK_QUIRK	0x04
 
 static int msi_init_virq(struct irq_domain *domain, int virq, unsigned int vflags)
 {
@@ -1213,8 +1212,6 @@ static int msi_init_virq(struct irq_domain *domain, int virq, unsigned int vflag
 
 	if (!(vflags & VIRQ_CAN_RESERVE)) {
 		irqd_clr_can_reserve(irqd);
-		if (vflags & VIRQ_NOMASK_QUIRK)
-			irqd_set_msi_nomask_quirk(irqd);
 
 		/*
 		 * If the interrupt is managed but no CPU is available to
@@ -1275,15 +1272,8 @@ static int __msi_domain_alloc_irqs(struct device *dev, struct irq_domain *domain
 	 * Interrupt can use a reserved vector and will not occupy
 	 * a real device vector until the interrupt is requested.
 	 */
-	if (msi_check_reservation_mode(domain, info, dev)) {
+	if (msi_check_reservation_mode(domain, info, dev))
 		vflags |= VIRQ_CAN_RESERVE;
-		/*
-		 * MSI affinity setting requires a special quirk (X86) when
-		 * reservation mode is active.
-		 */
-		if (info->flags & MSI_FLAG_NOMASK_QUIRK)
-			vflags |= VIRQ_NOMASK_QUIRK;
-	}
 
 	xa_for_each_range(xa, idx, desc, ctrl->first, ctrl->last) {
 		if (!msi_desc_match(desc, MSI_DESC_NOTASSOCIATED))
-- 
cgit v1.2.3


From 54d606816b32401de5431f6776a78b1de135bfa2 Mon Sep 17 00:00:00 2001
From: Lu Baolu <baolu.lu@linux.intel.com>
Date: Wed, 25 Oct 2023 21:39:29 -0700
Subject: iommu: Add IOMMU_DOMAIN_NESTED

Introduce a new domain type for a user I/O page table, which is nested on
top of another user space address represented by a PAGING domain. This
new domain can be allocated by the domain_alloc_user op, and attached to
a device through the existing iommu_attach_device/group() interfaces.

The mappings of a nested domain are managed by user space software, so it
is not necessary to have map/unmap callbacks.

Link: https://lore.kernel.org/r/20231026043938.63898-2-yi.l.liu@intel.com
Signed-off-by: Lu Baolu <baolu.lu@linux.intel.com>
Signed-off-by: Nicolin Chen <nicolinc@nvidia.com>
Signed-off-by: Yi Liu <yi.l.liu@intel.com>
Reviewed-by: Kevin Tian <kevin.tian@intel.com>
Reviewed-by: Jason Gunthorpe <jgg@nvidia.com>
Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
---
 include/linux/iommu.h | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/iommu.h b/include/linux/iommu.h
index 1d42bdb37cbc..bc303cb2af37 100644
--- a/include/linux/iommu.h
+++ b/include/linux/iommu.h
@@ -67,6 +67,9 @@ struct iommu_domain_geometry {
 
 #define __IOMMU_DOMAIN_SVA	(1U << 4)  /* Shared process address space */
 
+#define __IOMMU_DOMAIN_NESTED	(1U << 6)  /* User-managed address space nested
+					      on a stage-2 translation        */
+
 #define IOMMU_DOMAIN_ALLOC_FLAGS ~__IOMMU_DOMAIN_DMA_FQ
 /*
  * This are the possible domain-types
@@ -93,6 +96,7 @@ struct iommu_domain_geometry {
 				 __IOMMU_DOMAIN_DMA_API |	\
 				 __IOMMU_DOMAIN_DMA_FQ)
 #define IOMMU_DOMAIN_SVA	(__IOMMU_DOMAIN_SVA)
+#define IOMMU_DOMAIN_NESTED	(__IOMMU_DOMAIN_NESTED)
 
 struct iommu_domain {
 	unsigned type;
-- 
cgit v1.2.3


From 2bdabb8e82f564d19eeeb7c83e6b2467af0707cb Mon Sep 17 00:00:00 2001
From: Yi Liu <yi.l.liu@intel.com>
Date: Wed, 25 Oct 2023 21:39:34 -0700
Subject: iommu: Pass in parent domain with user_data to domain_alloc_user op

domain_alloc_user op already accepts user flags for domain allocation, add
a parent domain pointer and a driver specific user data support as well.
The user data would be tagged with a type for iommu drivers to add their
own driver specific user data per hw_pagetable.

Add a struct iommu_user_data as a bundle of data_ptr/data_len/type from an
iommufd core uAPI structure. Make the user data opaque to the core, since
a userspace driver must match the kernel driver. In the future, if drivers
share some common parameter, there would be a generic parameter as well.

Link: https://lore.kernel.org/r/20231026043938.63898-7-yi.l.liu@intel.com
Signed-off-by: Lu Baolu <baolu.lu@linux.intel.com>
Co-developed-by: Nicolin Chen <nicolinc@nvidia.com>
Signed-off-by: Nicolin Chen <nicolinc@nvidia.com>
Signed-off-by: Yi Liu <yi.l.liu@intel.com>
Reviewed-by: Kevin Tian <kevin.tian@intel.com>
Reviewed-by: Jason Gunthorpe <jgg@nvidia.com>
Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
---
 drivers/iommu/amd/iommu.c            |  9 ++++++---
 drivers/iommu/intel/iommu.c          |  7 ++++++-
 drivers/iommu/iommufd/hw_pagetable.c |  3 ++-
 drivers/iommu/iommufd/selftest.c     |  7 ++++++-
 include/linux/iommu.h                | 27 ++++++++++++++++++++++++---
 5 files changed, 44 insertions(+), 9 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/iommu/amd/iommu.c b/drivers/iommu/amd/iommu.c
index caad10f9cee3..b399c5741378 100644
--- a/drivers/iommu/amd/iommu.c
+++ b/drivers/iommu/amd/iommu.c
@@ -2219,12 +2219,15 @@ static struct iommu_domain *amd_iommu_domain_alloc(unsigned int type)
 	return domain;
 }
 
-static struct iommu_domain *amd_iommu_domain_alloc_user(struct device *dev,
-							u32 flags)
+static struct iommu_domain *
+amd_iommu_domain_alloc_user(struct device *dev, u32 flags,
+			    struct iommu_domain *parent,
+			    const struct iommu_user_data *user_data)
+
 {
 	unsigned int type = IOMMU_DOMAIN_UNMANAGED;
 
-	if (flags & ~IOMMU_HWPT_ALLOC_DIRTY_TRACKING)
+	if ((flags & ~IOMMU_HWPT_ALLOC_DIRTY_TRACKING) || parent || user_data)
 		return ERR_PTR(-EOPNOTSUPP);
 
 	return do_iommu_domain_alloc(type, dev, flags);
diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c
index eb92a201cc0b..fe67f8d77b09 100644
--- a/drivers/iommu/intel/iommu.c
+++ b/drivers/iommu/intel/iommu.c
@@ -4076,7 +4076,9 @@ static struct iommu_domain *intel_iommu_domain_alloc(unsigned type)
 }
 
 static struct iommu_domain *
-intel_iommu_domain_alloc_user(struct device *dev, u32 flags)
+intel_iommu_domain_alloc_user(struct device *dev, u32 flags,
+			      struct iommu_domain *parent,
+			      const struct iommu_user_data *user_data)
 {
 	struct iommu_domain *domain;
 	struct intel_iommu *iommu;
@@ -4086,6 +4088,9 @@ intel_iommu_domain_alloc_user(struct device *dev, u32 flags)
 	    (~(IOMMU_HWPT_ALLOC_NEST_PARENT | IOMMU_HWPT_ALLOC_DIRTY_TRACKING)))
 		return ERR_PTR(-EOPNOTSUPP);
 
+	if (parent || user_data)
+		return ERR_PTR(-EOPNOTSUPP);
+
 	iommu = device_to_iommu(dev, NULL, NULL);
 	if (!iommu)
 		return ERR_PTR(-ENODEV);
diff --git a/drivers/iommu/iommufd/hw_pagetable.c b/drivers/iommu/iommufd/hw_pagetable.c
index 6bce9af0cb8d..198ecbd536f7 100644
--- a/drivers/iommu/iommufd/hw_pagetable.c
+++ b/drivers/iommu/iommufd/hw_pagetable.c
@@ -108,7 +108,8 @@ iommufd_hwpt_paging_alloc(struct iommufd_ctx *ictx, struct iommufd_ioas *ioas,
 	hwpt_paging->ioas = ioas;
 
 	if (ops->domain_alloc_user) {
-		hwpt->domain = ops->domain_alloc_user(idev->dev, flags);
+		hwpt->domain =
+			ops->domain_alloc_user(idev->dev, flags, NULL, NULL);
 		if (IS_ERR(hwpt->domain)) {
 			rc = PTR_ERR(hwpt->domain);
 			hwpt->domain = NULL;
diff --git a/drivers/iommu/iommufd/selftest.c b/drivers/iommu/iommufd/selftest.c
index 068928ba7950..d71007234896 100644
--- a/drivers/iommu/iommufd/selftest.c
+++ b/drivers/iommu/iommufd/selftest.c
@@ -241,7 +241,9 @@ static struct iommu_domain *mock_domain_alloc(unsigned int iommu_domain_type)
 }
 
 static struct iommu_domain *
-mock_domain_alloc_user(struct device *dev, u32 flags)
+mock_domain_alloc_user(struct device *dev, u32 flags,
+		       struct iommu_domain *parent,
+		       const struct iommu_user_data *user_data)
 {
 	struct mock_dev *mdev = container_of(dev, struct mock_dev, dev);
 	struct iommu_domain *domain;
@@ -250,6 +252,9 @@ mock_domain_alloc_user(struct device *dev, u32 flags)
 	    (~(IOMMU_HWPT_ALLOC_NEST_PARENT | IOMMU_HWPT_ALLOC_DIRTY_TRACKING)))
 		return ERR_PTR(-EOPNOTSUPP);
 
+	if (parent || user_data)
+		return ERR_PTR(-EOPNOTSUPP);
+
 	if ((flags & IOMMU_HWPT_ALLOC_DIRTY_TRACKING) &&
 	    (mdev->flags & MOCK_FLAGS_DEVICE_NO_DIRTY))
 		return ERR_PTR(-EOPNOTSUPP);
diff --git a/include/linux/iommu.h b/include/linux/iommu.h
index bc303cb2af37..2ddd99f55471 100644
--- a/include/linux/iommu.h
+++ b/include/linux/iommu.h
@@ -265,6 +265,21 @@ struct iommu_dirty_ops {
 				    struct iommu_dirty_bitmap *dirty);
 };
 
+/**
+ * struct iommu_user_data - iommu driver specific user space data info
+ * @type: The data type of the user buffer
+ * @uptr: Pointer to the user buffer for copy_from_user()
+ * @len: The length of the user buffer in bytes
+ *
+ * A user space data is an uAPI that is defined in include/uapi/linux/iommufd.h
+ * @type, @uptr and @len should be just copied from an iommufd core uAPI struct.
+ */
+struct iommu_user_data {
+	unsigned int type;
+	void __user *uptr;
+	size_t len;
+};
+
 /**
  * struct iommu_ops - iommu ops and capabilities
  * @capable: check capability
@@ -279,8 +294,12 @@ struct iommu_dirty_ops {
  *                     parameters as defined in include/uapi/linux/iommufd.h.
  *                     Unlike @domain_alloc, it is called only by IOMMUFD and
  *                     must fully initialize the new domain before return.
- *                     Upon success, a domain is returned. Upon failure,
- *                     ERR_PTR must be returned.
+ *                     Upon success, if the @user_data is valid and the @parent
+ *                     points to a kernel-managed domain, the new domain must be
+ *                     IOMMU_DOMAIN_NESTED type; otherwise, the @parent must be
+ *                     NULL while the @user_data can be optionally provided, the
+ *                     new domain must support __IOMMU_DOMAIN_PAGING.
+ *                     Upon failure, ERR_PTR must be returned.
  * @probe_device: Add device to iommu driver handling
  * @release_device: Remove device from iommu driver handling
  * @probe_finalize: Do final setup work after the device is added to an IOMMU
@@ -313,7 +332,9 @@ struct iommu_ops {
 
 	/* Domain allocation and freeing by the iommu driver */
 	struct iommu_domain *(*domain_alloc)(unsigned iommu_domain_type);
-	struct iommu_domain *(*domain_alloc_user)(struct device *dev, u32 flags);
+	struct iommu_domain *(*domain_alloc_user)(
+		struct device *dev, u32 flags, struct iommu_domain *parent,
+		const struct iommu_user_data *user_data);
 
 	struct iommu_device *(*probe_device)(struct device *dev);
 	void (*release_device)(struct device *dev);
-- 
cgit v1.2.3


From e9d36c07bb787840e4813fb09a929a17d522a69f Mon Sep 17 00:00:00 2001
From: Nicolin Chen <nicolinc@nvidia.com>
Date: Wed, 25 Oct 2023 21:39:36 -0700
Subject: iommu: Add iommu_copy_struct_from_user helper

Wrap up the data type/pointer/len sanity and a copy_struct_from_user call
for iommu drivers to copy driver specific data via struct iommu_user_data.
And expect it to be used in the domain_alloc_user op for example.

Link: https://lore.kernel.org/r/20231026043938.63898-9-yi.l.liu@intel.com
Signed-off-by: Nicolin Chen <nicolinc@nvidia.com>
Co-developed-by: Yi Liu <yi.l.liu@intel.com>
Signed-off-by: Yi Liu <yi.l.liu@intel.com>
Reviewed-by: Kevin Tian <kevin.tian@intel.com>
Reviewed-by: Jason Gunthorpe <jgg@nvidia.com>
Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
---
 include/linux/iommu.h | 40 ++++++++++++++++++++++++++++++++++++++++
 1 file changed, 40 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/iommu.h b/include/linux/iommu.h
index 2ddd99f55471..8fb1b41b4d15 100644
--- a/include/linux/iommu.h
+++ b/include/linux/iommu.h
@@ -280,6 +280,46 @@ struct iommu_user_data {
 	size_t len;
 };
 
+/**
+ * __iommu_copy_struct_from_user - Copy iommu driver specific user space data
+ * @dst_data: Pointer to an iommu driver specific user data that is defined in
+ *            include/uapi/linux/iommufd.h
+ * @src_data: Pointer to a struct iommu_user_data for user space data info
+ * @data_type: The data type of the @dst_data. Must match with @src_data.type
+ * @data_len: Length of current user data structure, i.e. sizeof(struct _dst)
+ * @min_len: Initial length of user data structure for backward compatibility.
+ *           This should be offsetofend using the last member in the user data
+ *           struct that was initially added to include/uapi/linux/iommufd.h
+ */
+static inline int __iommu_copy_struct_from_user(
+	void *dst_data, const struct iommu_user_data *src_data,
+	unsigned int data_type, size_t data_len, size_t min_len)
+{
+	if (src_data->type != data_type)
+		return -EINVAL;
+	if (WARN_ON(!dst_data || !src_data))
+		return -EINVAL;
+	if (src_data->len < min_len || data_len < src_data->len)
+		return -EINVAL;
+	return copy_struct_from_user(dst_data, data_len, src_data->uptr,
+				     src_data->len);
+}
+
+/**
+ * iommu_copy_struct_from_user - Copy iommu driver specific user space data
+ * @kdst: Pointer to an iommu driver specific user data that is defined in
+ *        include/uapi/linux/iommufd.h
+ * @user_data: Pointer to a struct iommu_user_data for user space data info
+ * @data_type: The data type of the @kdst. Must match with @user_data->type
+ * @min_last: The last memember of the data structure @kdst points in the
+ *            initial version.
+ * Return 0 for success, otherwise -error.
+ */
+#define iommu_copy_struct_from_user(kdst, user_data, data_type, min_last) \
+	__iommu_copy_struct_from_user(kdst, user_data, data_type,         \
+				      sizeof(*kdst),                      \
+				      offsetofend(typeof(*kdst), min_last))
+
 /**
  * struct iommu_ops - iommu ops and capabilities
  * @capable: check capability
-- 
cgit v1.2.3


From e5d8be7406ca7b6b251c09b786f08176337c0999 Mon Sep 17 00:00:00 2001
From: Jason Gunthorpe <jgg@nvidia.com>
Date: Wed, 27 Sep 2023 20:47:31 -0300
Subject: iommu: Move IOMMU_DOMAIN_BLOCKED global statics to
 ops->blocked_domain

Following the pattern of identity domains, just assign the BLOCKED domain
global statics to a value in ops. Update the core code to use the global
static directly.

Update powerpc to use the new scheme and remove its empty domain_alloc
callback.

Reviewed-by: Lu Baolu <baolu.lu@linux.intel.com>
Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
Reviewed-by: Kevin Tian <kevin.tian@intel.com>
Acked-by: Sven Peter <sven@svenpeter.dev>
Link: https://lore.kernel.org/r/1-v2-bff223cf6409+282-dart_paging_jgg@nvidia.com
Signed-off-by: Joerg Roedel <jroedel@suse.de>
---
 arch/powerpc/kernel/iommu.c | 9 +--------
 drivers/iommu/iommu.c       | 2 ++
 include/linux/iommu.h       | 3 +++
 3 files changed, 6 insertions(+), 8 deletions(-)

(limited to 'include/linux')

diff --git a/arch/powerpc/kernel/iommu.c b/arch/powerpc/kernel/iommu.c
index bf1993214751..ed7c97d9128e 100644
--- a/arch/powerpc/kernel/iommu.c
+++ b/arch/powerpc/kernel/iommu.c
@@ -1333,13 +1333,6 @@ static bool spapr_tce_iommu_capable(struct device *dev, enum iommu_cap cap)
 	return false;
 }
 
-static struct iommu_domain *spapr_tce_iommu_domain_alloc(unsigned int type)
-{
-	if (type != IOMMU_DOMAIN_BLOCKED)
-		return NULL;
-	return &spapr_tce_blocked_domain;
-}
-
 static struct iommu_device *spapr_tce_iommu_probe_device(struct device *dev)
 {
 	struct pci_dev *pdev;
@@ -1374,8 +1367,8 @@ static struct iommu_group *spapr_tce_iommu_device_group(struct device *dev)
 
 static const struct iommu_ops spapr_tce_iommu_ops = {
 	.default_domain = &spapr_tce_platform_domain,
+	.blocked_domain = &spapr_tce_blocked_domain,
 	.capable = spapr_tce_iommu_capable,
-	.domain_alloc = spapr_tce_iommu_domain_alloc,
 	.probe_device = spapr_tce_iommu_probe_device,
 	.release_device = spapr_tce_iommu_release_device,
 	.device_group = spapr_tce_iommu_device_group,
diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c
index f9f315d58a3a..79a0fdb33404 100644
--- a/drivers/iommu/iommu.c
+++ b/drivers/iommu/iommu.c
@@ -2086,6 +2086,8 @@ static struct iommu_domain *__iommu_domain_alloc(const struct iommu_ops *ops,
 
 	if (alloc_type == IOMMU_DOMAIN_IDENTITY && ops->identity_domain)
 		return ops->identity_domain;
+	else if (alloc_type == IOMMU_DOMAIN_BLOCKED && ops->blocked_domain)
+		return ops->blocked_domain;
 	else if (type & __IOMMU_DOMAIN_PAGING && ops->domain_alloc_paging)
 		domain = ops->domain_alloc_paging(dev);
 	else if (ops->domain_alloc)
diff --git a/include/linux/iommu.h b/include/linux/iommu.h
index c28178f3690a..73daa4fb168b 100644
--- a/include/linux/iommu.h
+++ b/include/linux/iommu.h
@@ -265,6 +265,8 @@ struct iommu_iotlb_gather {
  * @owner: Driver module providing these ops
  * @identity_domain: An always available, always attachable identity
  *                   translation.
+ * @blocked_domain: An always available, always attachable blocking
+ *                  translation.
  * @default_domain: If not NULL this will always be set as the default domain.
  *                  This should be an IDENTITY/BLOCKED/PLATFORM domain.
  *                  Do not use in new drivers.
@@ -303,6 +305,7 @@ struct iommu_ops {
 	unsigned long pgsize_bitmap;
 	struct module *owner;
 	struct iommu_domain *identity_domain;
+	struct iommu_domain *blocked_domain;
 	struct iommu_domain *default_domain;
 };
 
-- 
cgit v1.2.3


From b827ac419721a106ae2fccaa40576b0594edad92 Mon Sep 17 00:00:00 2001
From: Kent Overstreet <kent.overstreet@linux.dev>
Date: Thu, 26 Oct 2023 16:37:58 -0400
Subject: exportfs: Change bcachefs fid_type enum to avoid conflicts

Per Amir Goldstein, the fid types that bcachefs picked conflicted with
xfs and fuse, which previously were in use but not deviced in the master
enum.

Since bcachefs is still out of tree, we can move.

https://lore.kernel.org/linux-next/20231026203733.fx65mjyic4pka3e5@moria.home.lan/T/#ma59f65ba61f605b593e69f4690dbd317526d83ba

Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
---
 include/linux/exportfs.h | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/exportfs.h b/include/linux/exportfs.h
index f49a7d31167e..f75e0914d40d 100644
--- a/include/linux/exportfs.h
+++ b/include/linux/exportfs.h
@@ -98,18 +98,18 @@ enum fid_type {
 	 */
 	FILEID_FAT_WITH_PARENT = 0x72,
 
-	/*
-	 * 64 bit inode number, 32 bit subvolume, 32 bit generation number:
-	 */
-	FILEID_BCACHEFS_WITHOUT_PARENT = 0x80,
-	FILEID_BCACHEFS_WITH_PARENT = 0x81,
-
 	/*
 	 * 128 bit child FID (struct lu_fid)
 	 * 128 bit parent FID (struct lu_fid)
 	 */
 	FILEID_LUSTRE = 0x97,
 
+	/*
+	 * 64 bit inode number, 32 bit subvolume, 32 bit generation number:
+	 */
+	FILEID_BCACHEFS_WITHOUT_PARENT = 0xb1,
+	FILEID_BCACHEFS_WITH_PARENT = 0xb2,
+
 	/*
 	 * 64 bit unique kernfs id
 	 */
-- 
cgit v1.2.3


From c845f5f3590ef4669fe5464f8a42be6442cd174b Mon Sep 17 00:00:00 2001
From: Dmitry Safonov <dima@arista.com>
Date: Mon, 23 Oct 2023 20:21:54 +0100
Subject: net/tcp: Add TCP-AO config and structures

Introduce new kernel config option and common structures as well as
helpers to be used by TCP-AO code.

Co-developed-by: Francesco Ruggeri <fruggeri@arista.com>
Signed-off-by: Francesco Ruggeri <fruggeri@arista.com>
Co-developed-by: Salam Noureddine <noureddine@arista.com>
Signed-off-by: Salam Noureddine <noureddine@arista.com>
Signed-off-by: Dmitry Safonov <dima@arista.com>
Acked-by: David Ahern <dsahern@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/tcp.h      |  9 +++--
 include/net/tcp.h        |  8 ++---
 include/net/tcp_ao.h     | 90 ++++++++++++++++++++++++++++++++++++++++++++++++
 include/uapi/linux/tcp.h |  2 ++
 net/ipv4/Kconfig         | 13 +++++++
 5 files changed, 114 insertions(+), 8 deletions(-)
 create mode 100644 include/net/tcp_ao.h

(limited to 'include/linux')

diff --git a/include/linux/tcp.h b/include/linux/tcp.h
index 6df715b6e51d..64e7b560fa79 100644
--- a/include/linux/tcp.h
+++ b/include/linux/tcp.h
@@ -447,13 +447,18 @@ struct tcp_sock {
 	bool	syn_smc;	/* SYN includes SMC */
 #endif
 
-#ifdef CONFIG_TCP_MD5SIG
-/* TCP AF-Specific parts; only used by MD5 Signature support so far */
+#if defined(CONFIG_TCP_MD5SIG) || defined(CONFIG_TCP_AO)
+/* TCP AF-Specific parts; only used by TCP-AO/MD5 Signature support so far */
 	const struct tcp_sock_af_ops	*af_specific;
 
+#ifdef CONFIG_TCP_MD5SIG
 /* TCP MD5 Signature Option information */
 	struct tcp_md5sig_info	__rcu *md5sig_info;
 #endif
+#ifdef CONFIG_TCP_AO
+	struct tcp_ao_info	__rcu *ao_info;
+#endif
+#endif
 
 /* TCP fastopen related information */
 	struct tcp_fastopen_request *fastopen_req;
diff --git a/include/net/tcp.h b/include/net/tcp.h
index f6e2db5292b5..3153712faa3d 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -37,6 +37,7 @@
 #include <net/snmp.h>
 #include <net/ip.h>
 #include <net/tcp_states.h>
+#include <net/tcp_ao.h>
 #include <net/inet_ecn.h>
 #include <net/dst.h>
 #include <net/mptcp.h>
@@ -1688,12 +1689,7 @@ static inline void tcp_clear_all_retrans_hints(struct tcp_sock *tp)
 	tp->retransmit_skb_hint = NULL;
 }
 
-union tcp_md5_addr {
-	struct in_addr  a4;
-#if IS_ENABLED(CONFIG_IPV6)
-	struct in6_addr	a6;
-#endif
-};
+#define tcp_md5_addr tcp_ao_addr
 
 /* - key database */
 struct tcp_md5sig_key {
diff --git a/include/net/tcp_ao.h b/include/net/tcp_ao.h
new file mode 100644
index 000000000000..af76e1c47bea
--- /dev/null
+++ b/include/net/tcp_ao.h
@@ -0,0 +1,90 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+#ifndef _TCP_AO_H
+#define _TCP_AO_H
+
+#define TCP_AO_KEY_ALIGN	1
+#define __tcp_ao_key_align __aligned(TCP_AO_KEY_ALIGN)
+
+union tcp_ao_addr {
+	struct in_addr  a4;
+#if IS_ENABLED(CONFIG_IPV6)
+	struct in6_addr	a6;
+#endif
+};
+
+struct tcp_ao_hdr {
+	u8	kind;
+	u8	length;
+	u8	keyid;
+	u8	rnext_keyid;
+};
+
+struct tcp_ao_key {
+	struct hlist_node	node;
+	union tcp_ao_addr	addr;
+	u8			key[TCP_AO_MAXKEYLEN] __tcp_ao_key_align;
+	unsigned int		tcp_sigpool_id;
+	unsigned int		digest_size;
+	u8			prefixlen;
+	u8			family;
+	u8			keylen;
+	u8			keyflags;
+	u8			sndid;
+	u8			rcvid;
+	u8			maclen;
+	struct rcu_head		rcu;
+	u8			traffic_keys[];
+};
+
+static inline u8 *rcv_other_key(struct tcp_ao_key *key)
+{
+	return key->traffic_keys;
+}
+
+static inline u8 *snd_other_key(struct tcp_ao_key *key)
+{
+	return key->traffic_keys + key->digest_size;
+}
+
+static inline int tcp_ao_maclen(const struct tcp_ao_key *key)
+{
+	return key->maclen;
+}
+
+static inline int tcp_ao_len(const struct tcp_ao_key *key)
+{
+	return tcp_ao_maclen(key) + sizeof(struct tcp_ao_hdr);
+}
+
+static inline unsigned int tcp_ao_digest_size(struct tcp_ao_key *key)
+{
+	return key->digest_size;
+}
+
+static inline int tcp_ao_sizeof_key(const struct tcp_ao_key *key)
+{
+	return sizeof(struct tcp_ao_key) + (key->digest_size << 1);
+}
+
+struct tcp_ao_info {
+	/* List of tcp_ao_key's */
+	struct hlist_head	head;
+	/* current_key and rnext_key aren't maintained on listen sockets.
+	 * Their purpose is to cache keys on established connections,
+	 * saving needless lookups. Never dereference any of them from
+	 * listen sockets.
+	 * ::current_key may change in RX to the key that was requested by
+	 * the peer, please use READ_ONCE()/WRITE_ONCE() in order to avoid
+	 * load/store tearing.
+	 * Do the same for ::rnext_key, if you don't hold socket lock
+	 * (it's changed only by userspace request in setsockopt()).
+	 */
+	struct tcp_ao_key	*current_key;
+	struct tcp_ao_key	*rnext_key;
+	u32			flags;
+	__be32			lisn;
+	__be32			risn;
+	struct rcu_head		rcu;
+};
+
+#endif /* _TCP_AO_H */
diff --git a/include/uapi/linux/tcp.h b/include/uapi/linux/tcp.h
index 8aa3916e14f6..74a1267baac5 100644
--- a/include/uapi/linux/tcp.h
+++ b/include/uapi/linux/tcp.h
@@ -361,6 +361,8 @@ struct tcp_diag_md5sig {
 	__u8	tcpm_key[TCP_MD5SIG_MAXKEYLEN];
 };
 
+#define TCP_AO_MAXKEYLEN	80
+
 /* setsockopt(fd, IPPROTO_TCP, TCP_ZEROCOPY_RECEIVE, ...) */
 
 #define TCP_RECEIVE_ZEROCOPY_FLAG_TLB_CLEAN_HINT 0x1
diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig
index 89e2ab023272..8e94ed7c56a0 100644
--- a/net/ipv4/Kconfig
+++ b/net/ipv4/Kconfig
@@ -744,6 +744,19 @@ config DEFAULT_TCP_CONG
 config TCP_SIGPOOL
 	tristate
 
+config TCP_AO
+	bool "TCP: Authentication Option (RFC5925)"
+	select CRYPTO
+	select TCP_SIGPOOL
+	depends on 64BIT && IPV6 != m # seq-number extension needs WRITE_ONCE(u64)
+	help
+	  TCP-AO specifies the use of stronger Message Authentication Codes (MACs),
+	  protects against replays for long-lived TCP connections, and
+	  provides more details on the association of security with TCP
+	  connections than TCP MD5 (See RFC5925)
+
+	  If unsure, say N.
+
 config TCP_MD5SIG
 	bool "TCP: MD5 Signature Option support (RFC2385)"
 	select CRYPTO
-- 
cgit v1.2.3


From 4954f17ddefc51d218625dcdfaf422a253dad3fa Mon Sep 17 00:00:00 2001
From: Dmitry Safonov <dima@arista.com>
Date: Mon, 23 Oct 2023 20:21:55 +0100
Subject: net/tcp: Introduce TCP_AO setsockopt()s

Add 3 setsockopt()s:
1. TCP_AO_ADD_KEY to add a new Master Key Tuple (MKT) on a socket
2. TCP_AO_DEL_KEY to delete present MKT from a socket
3. TCP_AO_INFO to change flags, Current_key/RNext_key on a TCP-AO sk

Userspace has to introduce keys on every socket it wants to use TCP-AO
option on, similarly to TCP_MD5SIG/TCP_MD5SIG_EXT.
RFC5925 prohibits definition of MKTs that would match the same peer,
so do sanity checks on the data provided by userspace. Be as
conservative as possible, including refusal of defining MKT on
an established connection with no AO, removing the key in-use and etc.

(1) and (2) are to be used by userspace key manager to add/remove keys.
(3) main purpose is to set RNext_key, which (as prescribed by RFC5925)
is the KeyID that will be requested in TCP-AO header from the peer to
sign their segments with.

At this moment the life of ao_info ends in tcp_v4_destroy_sock().

Co-developed-by: Francesco Ruggeri <fruggeri@arista.com>
Signed-off-by: Francesco Ruggeri <fruggeri@arista.com>
Co-developed-by: Salam Noureddine <noureddine@arista.com>
Signed-off-by: Salam Noureddine <noureddine@arista.com>
Signed-off-by: Dmitry Safonov <dima@arista.com>
Acked-by: David Ahern <dsahern@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/sockptr.h  |  23 ++
 include/net/tcp.h        |   3 +
 include/net/tcp_ao.h     |  17 +-
 include/uapi/linux/tcp.h |  46 +++
 net/ipv4/Makefile        |   1 +
 net/ipv4/tcp.c           |  17 +
 net/ipv4/tcp_ao.c        | 794 +++++++++++++++++++++++++++++++++++++++++++++++
 net/ipv4/tcp_ipv4.c      |  10 +-
 net/ipv6/Makefile        |   1 +
 net/ipv6/tcp_ao.c        |  19 ++
 net/ipv6/tcp_ipv6.c      |  39 ++-
 11 files changed, 952 insertions(+), 18 deletions(-)
 create mode 100644 net/ipv4/tcp_ao.c
 create mode 100644 net/ipv6/tcp_ao.c

(limited to 'include/linux')

diff --git a/include/linux/sockptr.h b/include/linux/sockptr.h
index bae5e2369b4f..307961b41541 100644
--- a/include/linux/sockptr.h
+++ b/include/linux/sockptr.h
@@ -55,6 +55,29 @@ static inline int copy_from_sockptr(void *dst, sockptr_t src, size_t size)
 	return copy_from_sockptr_offset(dst, src, 0, size);
 }
 
+static inline int copy_struct_from_sockptr(void *dst, size_t ksize,
+		sockptr_t src, size_t usize)
+{
+	size_t size = min(ksize, usize);
+	size_t rest = max(ksize, usize) - size;
+
+	if (!sockptr_is_kernel(src))
+		return copy_struct_from_user(dst, ksize, src.user, size);
+
+	if (usize < ksize) {
+		memset(dst + size, 0, rest);
+	} else if (usize > ksize) {
+		char *p = src.kernel;
+
+		while (rest--) {
+			if (*p++)
+				return -E2BIG;
+		}
+	}
+	memcpy(dst, src.kernel, size);
+	return 0;
+}
+
 static inline int copy_to_sockptr_offset(sockptr_t dst, size_t offset,
 		const void *src, size_t size)
 {
diff --git a/include/net/tcp.h b/include/net/tcp.h
index 3153712faa3d..ff204471d451 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -2175,6 +2175,9 @@ struct tcp_sock_af_ops {
 				     sockptr_t optval,
 				     int optlen);
 #endif
+#ifdef CONFIG_TCP_AO
+	int (*ao_parse)(struct sock *sk, int optname, sockptr_t optval, int optlen);
+#endif
 };
 
 struct tcp_request_sock_ops {
diff --git a/include/net/tcp_ao.h b/include/net/tcp_ao.h
index af76e1c47bea..a81e40fd255a 100644
--- a/include/net/tcp_ao.h
+++ b/include/net/tcp_ao.h
@@ -81,10 +81,25 @@ struct tcp_ao_info {
 	 */
 	struct tcp_ao_key	*current_key;
 	struct tcp_ao_key	*rnext_key;
-	u32			flags;
+	u32			ao_required	:1,
+				__unused	:31;
 	__be32			lisn;
 	__be32			risn;
 	struct rcu_head		rcu;
 };
 
+#ifdef CONFIG_TCP_AO
+int tcp_parse_ao(struct sock *sk, int cmd, unsigned short int family,
+		 sockptr_t optval, int optlen);
+void tcp_ao_destroy_sock(struct sock *sk);
+/* ipv4 specific functions */
+int tcp_v4_parse_ao(struct sock *sk, int cmd, sockptr_t optval, int optlen);
+/* ipv6 specific functions */
+int tcp_v6_parse_ao(struct sock *sk, int cmd, sockptr_t optval, int optlen);
+#else
+static inline void tcp_ao_destroy_sock(struct sock *sk)
+{
+}
+#endif
+
 #endif /* _TCP_AO_H */
diff --git a/include/uapi/linux/tcp.h b/include/uapi/linux/tcp.h
index 74a1267baac5..fa49f03e62fe 100644
--- a/include/uapi/linux/tcp.h
+++ b/include/uapi/linux/tcp.h
@@ -129,6 +129,9 @@ enum {
 
 #define TCP_TX_DELAY		37	/* delay outgoing packets by XX usec */
 
+#define TCP_AO_ADD_KEY		38	/* Add/Set MKT */
+#define TCP_AO_DEL_KEY		39	/* Delete MKT */
+#define TCP_AO_INFO		40	/* Modify TCP-AO per-socket options */
 
 #define TCP_REPAIR_ON		1
 #define TCP_REPAIR_OFF		0
@@ -363,6 +366,49 @@ struct tcp_diag_md5sig {
 
 #define TCP_AO_MAXKEYLEN	80
 
+#define TCP_AO_KEYF_IFINDEX	(1 << 0)	/* L3 ifindex for VRF */
+
+struct tcp_ao_add { /* setsockopt(TCP_AO_ADD_KEY) */
+	struct __kernel_sockaddr_storage addr;	/* peer's address for the key */
+	char	alg_name[64];		/* crypto hash algorithm to use */
+	__s32	ifindex;		/* L3 dev index for VRF */
+	__u32   set_current	:1,	/* set key as Current_key at once */
+		set_rnext	:1,	/* request it from peer with RNext_key */
+		reserved	:30;	/* must be 0 */
+	__u16	reserved2;		/* padding, must be 0 */
+	__u8	prefix;			/* peer's address prefix */
+	__u8	sndid;			/* SendID for outgoing segments */
+	__u8	rcvid;			/* RecvID to match for incoming seg */
+	__u8	maclen;			/* length of authentication code (hash) */
+	__u8	keyflags;		/* see TCP_AO_KEYF_ */
+	__u8	keylen;			/* length of ::key */
+	__u8	key[TCP_AO_MAXKEYLEN];
+} __attribute__((aligned(8)));
+
+struct tcp_ao_del { /* setsockopt(TCP_AO_DEL_KEY) */
+	struct __kernel_sockaddr_storage addr;	/* peer's address for the key */
+	__s32	ifindex;		/* L3 dev index for VRF */
+	__u32   set_current	:1,	/* corresponding ::current_key */
+		set_rnext	:1,	/* corresponding ::rnext */
+		reserved	:30;	/* must be 0 */
+	__u16	reserved2;		/* padding, must be 0 */
+	__u8	prefix;			/* peer's address prefix */
+	__u8	sndid;			/* SendID for outgoing segments */
+	__u8	rcvid;			/* RecvID to match for incoming seg */
+	__u8	current_key;		/* KeyID to set as Current_key */
+	__u8	rnext;			/* KeyID to set as Rnext_key */
+	__u8	keyflags;		/* see TCP_AO_KEYF_ */
+} __attribute__((aligned(8)));
+
+struct tcp_ao_info_opt { /* setsockopt(TCP_AO_INFO) */
+	__u32   set_current	:1,	/* corresponding ::current_key */
+		set_rnext	:1,	/* corresponding ::rnext */
+		ao_required	:1,	/* don't accept non-AO connects */
+		reserved	:29;	/* must be 0 */
+	__u8	current_key;		/* KeyID to set as Current_key */
+	__u8	rnext;			/* KeyID to set as Rnext_key */
+} __attribute__((aligned(8)));
+
 /* setsockopt(fd, IPPROTO_TCP, TCP_ZEROCOPY_RECEIVE, ...) */
 
 #define TCP_RECEIVE_ZEROCOPY_FLAG_TLB_CLEAN_HINT 0x1
diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile
index cd760793cfcb..e144a02a6a61 100644
--- a/net/ipv4/Makefile
+++ b/net/ipv4/Makefile
@@ -69,6 +69,7 @@ obj-$(CONFIG_NETLABEL) += cipso_ipv4.o
 
 obj-$(CONFIG_XFRM) += xfrm4_policy.o xfrm4_state.o xfrm4_input.o \
 		      xfrm4_output.o xfrm4_protocol.o
+obj-$(CONFIG_TCP_AO) += tcp_ao.o
 
 ifeq ($(CONFIG_BPF_JIT),y)
 obj-$(CONFIG_BPF_SYSCALL) += bpf_tcp_ca.o
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index dca9ca2f1081..b6faee8a1e67 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -3593,6 +3593,23 @@ int do_tcp_setsockopt(struct sock *sk, int level, int optname,
 		__tcp_sock_set_quickack(sk, val);
 		break;
 
+#ifdef CONFIG_TCP_AO
+	case TCP_AO_ADD_KEY:
+	case TCP_AO_DEL_KEY:
+	case TCP_AO_INFO: {
+		/* If this is the first TCP-AO setsockopt() on the socket,
+		 * sk_state has to be LISTEN or CLOSE
+		 */
+		if (((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE)) ||
+		    rcu_dereference_protected(tcp_sk(sk)->ao_info,
+					      lockdep_sock_is_held(sk)))
+			err = tp->af_specific->ao_parse(sk, optname, optval,
+							optlen);
+		else
+			err = -EISCONN;
+		break;
+	}
+#endif
 #ifdef CONFIG_TCP_MD5SIG
 	case TCP_MD5SIG:
 	case TCP_MD5SIG_EXT:
diff --git a/net/ipv4/tcp_ao.c b/net/ipv4/tcp_ao.c
new file mode 100644
index 000000000000..3c2d005a37ce
--- /dev/null
+++ b/net/ipv4/tcp_ao.c
@@ -0,0 +1,794 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * INET		An implementation of the TCP Authentication Option (TCP-AO).
+ *		See RFC5925.
+ *
+ * Authors:	Dmitry Safonov <dima@arista.com>
+ *		Francesco Ruggeri <fruggeri@arista.com>
+ *		Salam Noureddine <noureddine@arista.com>
+ */
+#define pr_fmt(fmt) "TCP: " fmt
+
+#include <crypto/hash.h>
+#include <linux/inetdevice.h>
+#include <linux/tcp.h>
+
+#include <net/tcp.h>
+#include <net/ipv6.h>
+
+/* Optimized version of tcp_ao_do_lookup(): only for sockets for which
+ * it's known that the keys in ao_info are matching peer's
+ * family/address/VRF/etc.
+ */
+static struct tcp_ao_key *tcp_ao_established_key(struct tcp_ao_info *ao,
+						 int sndid, int rcvid)
+{
+	struct tcp_ao_key *key;
+
+	hlist_for_each_entry_rcu(key, &ao->head, node) {
+		if ((sndid >= 0 && key->sndid != sndid) ||
+		    (rcvid >= 0 && key->rcvid != rcvid))
+			continue;
+		return key;
+	}
+
+	return NULL;
+}
+
+static int ipv4_prefix_cmp(const struct in_addr *addr1,
+			   const struct in_addr *addr2,
+			   unsigned int prefixlen)
+{
+	__be32 mask = inet_make_mask(prefixlen);
+	__be32 a1 = addr1->s_addr & mask;
+	__be32 a2 = addr2->s_addr & mask;
+
+	if (a1 == a2)
+		return 0;
+	return memcmp(&a1, &a2, sizeof(a1));
+}
+
+static int __tcp_ao_key_cmp(const struct tcp_ao_key *key,
+			    const union tcp_ao_addr *addr, u8 prefixlen,
+			    int family, int sndid, int rcvid)
+{
+	if (sndid >= 0 && key->sndid != sndid)
+		return (key->sndid > sndid) ? 1 : -1;
+	if (rcvid >= 0 && key->rcvid != rcvid)
+		return (key->rcvid > rcvid) ? 1 : -1;
+
+	if (family == AF_UNSPEC)
+		return 0;
+	if (key->family != family)
+		return (key->family > family) ? 1 : -1;
+
+	if (family == AF_INET) {
+		if (ntohl(key->addr.a4.s_addr) == INADDR_ANY)
+			return 0;
+		if (ntohl(addr->a4.s_addr) == INADDR_ANY)
+			return 0;
+		return ipv4_prefix_cmp(&key->addr.a4, &addr->a4, prefixlen);
+#if IS_ENABLED(CONFIG_IPV6)
+	} else {
+		if (ipv6_addr_any(&key->addr.a6) || ipv6_addr_any(&addr->a6))
+			return 0;
+		if (ipv6_prefix_equal(&key->addr.a6, &addr->a6, prefixlen))
+			return 0;
+		return memcmp(&key->addr.a6, &addr->a6, sizeof(addr->a6));
+#endif
+	}
+	return -1;
+}
+
+static int tcp_ao_key_cmp(const struct tcp_ao_key *key,
+			  const union tcp_ao_addr *addr, u8 prefixlen,
+			  int family, int sndid, int rcvid)
+{
+#if IS_ENABLED(CONFIG_IPV6)
+	if (family == AF_INET6 && ipv6_addr_v4mapped(&addr->a6)) {
+		__be32 addr4 = addr->a6.s6_addr32[3];
+
+		return __tcp_ao_key_cmp(key, (union tcp_ao_addr *)&addr4,
+					prefixlen, AF_INET, sndid, rcvid);
+	}
+#endif
+	return __tcp_ao_key_cmp(key, addr, prefixlen, family, sndid, rcvid);
+}
+
+static struct tcp_ao_key *__tcp_ao_do_lookup(const struct sock *sk,
+		const union tcp_ao_addr *addr, int family, u8 prefix,
+		int sndid, int rcvid)
+{
+	struct tcp_ao_key *key;
+	struct tcp_ao_info *ao;
+
+	ao = rcu_dereference_check(tcp_sk(sk)->ao_info,
+				   lockdep_sock_is_held(sk));
+	if (!ao)
+		return NULL;
+
+	hlist_for_each_entry_rcu(key, &ao->head, node) {
+		u8 prefixlen = min(prefix, key->prefixlen);
+
+		if (!tcp_ao_key_cmp(key, addr, prefixlen, family, sndid, rcvid))
+			return key;
+	}
+	return NULL;
+}
+
+static struct tcp_ao_info *tcp_ao_alloc_info(gfp_t flags)
+{
+	struct tcp_ao_info *ao;
+
+	ao = kzalloc(sizeof(*ao), flags);
+	if (!ao)
+		return NULL;
+	INIT_HLIST_HEAD(&ao->head);
+
+	return ao;
+}
+
+static void tcp_ao_link_mkt(struct tcp_ao_info *ao, struct tcp_ao_key *mkt)
+{
+	hlist_add_head_rcu(&mkt->node, &ao->head);
+}
+
+static void tcp_ao_key_free_rcu(struct rcu_head *head)
+{
+	struct tcp_ao_key *key = container_of(head, struct tcp_ao_key, rcu);
+
+	tcp_sigpool_release(key->tcp_sigpool_id);
+	kfree_sensitive(key);
+}
+
+void tcp_ao_destroy_sock(struct sock *sk)
+{
+	struct tcp_ao_info *ao;
+	struct tcp_ao_key *key;
+	struct hlist_node *n;
+
+	ao = rcu_dereference_protected(tcp_sk(sk)->ao_info, 1);
+	tcp_sk(sk)->ao_info = NULL;
+
+	if (!ao)
+		return;
+
+	hlist_for_each_entry_safe(key, n, &ao->head, node) {
+		hlist_del_rcu(&key->node);
+		atomic_sub(tcp_ao_sizeof_key(key), &sk->sk_omem_alloc);
+		call_rcu(&key->rcu, tcp_ao_key_free_rcu);
+	}
+
+	kfree_rcu(ao, rcu);
+}
+
+static bool tcp_ao_can_set_current_rnext(struct sock *sk)
+{
+	/* There aren't current/rnext keys on TCP_LISTEN sockets */
+	if (sk->sk_state == TCP_LISTEN)
+		return false;
+	return true;
+}
+
+static int tcp_ao_verify_ipv4(struct sock *sk, struct tcp_ao_add *cmd,
+			      union tcp_ao_addr **addr)
+{
+	struct sockaddr_in *sin = (struct sockaddr_in *)&cmd->addr;
+	struct inet_sock *inet = inet_sk(sk);
+
+	if (sin->sin_family != AF_INET)
+		return -EINVAL;
+
+	/* Currently matching is not performed on port (or port ranges) */
+	if (sin->sin_port != 0)
+		return -EINVAL;
+
+	/* Check prefix and trailing 0's in addr */
+	if (cmd->prefix != 0) {
+		__be32 mask;
+
+		if (ntohl(sin->sin_addr.s_addr) == INADDR_ANY)
+			return -EINVAL;
+		if (cmd->prefix > 32)
+			return -EINVAL;
+
+		mask = inet_make_mask(cmd->prefix);
+		if (sin->sin_addr.s_addr & ~mask)
+			return -EINVAL;
+
+		/* Check that MKT address is consistent with socket */
+		if (ntohl(inet->inet_daddr) != INADDR_ANY &&
+		    (inet->inet_daddr & mask) != sin->sin_addr.s_addr)
+			return -EINVAL;
+	} else {
+		if (ntohl(sin->sin_addr.s_addr) != INADDR_ANY)
+			return -EINVAL;
+	}
+
+	*addr = (union tcp_ao_addr *)&sin->sin_addr;
+	return 0;
+}
+
+static int tcp_ao_parse_crypto(struct tcp_ao_add *cmd, struct tcp_ao_key *key)
+{
+	unsigned int syn_tcp_option_space;
+	bool is_kdf_aes_128_cmac = false;
+	struct crypto_ahash *tfm;
+	struct tcp_sigpool hp;
+	void *tmp_key = NULL;
+	int err;
+
+	/* RFC5926, 3.1.1.2. KDF_AES_128_CMAC */
+	if (!strcmp("cmac(aes128)", cmd->alg_name)) {
+		strscpy(cmd->alg_name, "cmac(aes)", sizeof(cmd->alg_name));
+		is_kdf_aes_128_cmac = (cmd->keylen != 16);
+		tmp_key = kmalloc(cmd->keylen, GFP_KERNEL);
+		if (!tmp_key)
+			return -ENOMEM;
+	}
+
+	key->maclen = cmd->maclen ?: 12; /* 12 is the default in RFC5925 */
+
+	/* Check: maclen + tcp-ao header <= (MAX_TCP_OPTION_SPACE - mss
+	 *					- tstamp - wscale - sackperm),
+	 * see tcp_syn_options(), tcp_synack_options(), commit 33ad798c924b.
+	 *
+	 * In order to allow D-SACK with TCP-AO, the header size should be:
+	 * (MAX_TCP_OPTION_SPACE - TCPOLEN_TSTAMP_ALIGNED
+	 *			- TCPOLEN_SACK_BASE_ALIGNED
+	 *			- 2 * TCPOLEN_SACK_PERBLOCK) = 8 (maclen = 4),
+	 * see tcp_established_options().
+	 *
+	 * RFC5925, 2.2:
+	 * Typical MACs are 96-128 bits (12-16 bytes), but any length
+	 * that fits in the header of the segment being authenticated
+	 * is allowed.
+	 *
+	 * RFC5925, 7.6:
+	 * TCP-AO continues to consume 16 bytes in non-SYN segments,
+	 * leaving a total of 24 bytes for other options, of which
+	 * the timestamp consumes 10.  This leaves 14 bytes, of which 10
+	 * are used for a single SACK block. When two SACK blocks are used,
+	 * such as to handle D-SACK, a smaller TCP-AO MAC would be required
+	 * to make room for the additional SACK block (i.e., to leave 18
+	 * bytes for the D-SACK variant of the SACK option) [RFC2883].
+	 * Note that D-SACK is not supportable in TCP MD5 in the presence
+	 * of timestamps, because TCP MD5’s MAC length is fixed and too
+	 * large to leave sufficient option space.
+	 */
+	syn_tcp_option_space = MAX_TCP_OPTION_SPACE;
+	syn_tcp_option_space -= TCPOLEN_TSTAMP_ALIGNED;
+	syn_tcp_option_space -= TCPOLEN_WSCALE_ALIGNED;
+	syn_tcp_option_space -= TCPOLEN_SACKPERM_ALIGNED;
+	if (tcp_ao_len(key) > syn_tcp_option_space) {
+		err = -EMSGSIZE;
+		goto err_kfree;
+	}
+
+	key->keylen = cmd->keylen;
+	memcpy(key->key, cmd->key, cmd->keylen);
+
+	err = tcp_sigpool_start(key->tcp_sigpool_id, &hp);
+	if (err)
+		goto err_kfree;
+
+	tfm = crypto_ahash_reqtfm(hp.req);
+	if (is_kdf_aes_128_cmac) {
+		void *scratch = hp.scratch;
+		struct scatterlist sg;
+
+		memcpy(tmp_key, cmd->key, cmd->keylen);
+		sg_init_one(&sg, tmp_key, cmd->keylen);
+
+		/* Using zero-key of 16 bytes as described in RFC5926 */
+		memset(scratch, 0, 16);
+		err = crypto_ahash_setkey(tfm, scratch, 16);
+		if (err)
+			goto err_pool_end;
+
+		err = crypto_ahash_init(hp.req);
+		if (err)
+			goto err_pool_end;
+
+		ahash_request_set_crypt(hp.req, &sg, key->key, cmd->keylen);
+		err = crypto_ahash_update(hp.req);
+		if (err)
+			goto err_pool_end;
+
+		err |= crypto_ahash_final(hp.req);
+		if (err)
+			goto err_pool_end;
+		key->keylen = 16;
+	}
+
+	err = crypto_ahash_setkey(tfm, key->key, key->keylen);
+	if (err)
+		goto err_pool_end;
+
+	tcp_sigpool_end(&hp);
+	kfree_sensitive(tmp_key);
+
+	if (tcp_ao_maclen(key) > key->digest_size)
+		return -EINVAL;
+
+	return 0;
+
+err_pool_end:
+	tcp_sigpool_end(&hp);
+err_kfree:
+	kfree_sensitive(tmp_key);
+	return err;
+}
+
+#if IS_ENABLED(CONFIG_IPV6)
+static int tcp_ao_verify_ipv6(struct sock *sk, struct tcp_ao_add *cmd,
+			      union tcp_ao_addr **paddr,
+			      unsigned short int *family)
+{
+	struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)&cmd->addr;
+	struct in6_addr *addr = &sin6->sin6_addr;
+	u8 prefix = cmd->prefix;
+
+	if (sin6->sin6_family != AF_INET6)
+		return -EINVAL;
+
+	/* Currently matching is not performed on port (or port ranges) */
+	if (sin6->sin6_port != 0)
+		return -EINVAL;
+
+	/* Check prefix and trailing 0's in addr */
+	if (cmd->prefix != 0 && ipv6_addr_v4mapped(addr)) {
+		__be32 addr4 = addr->s6_addr32[3];
+		__be32 mask;
+
+		if (prefix > 32 || ntohl(addr4) == INADDR_ANY)
+			return -EINVAL;
+
+		mask = inet_make_mask(prefix);
+		if (addr4 & ~mask)
+			return -EINVAL;
+
+		/* Check that MKT address is consistent with socket */
+		if (!ipv6_addr_any(&sk->sk_v6_daddr)) {
+			__be32 daddr4 = sk->sk_v6_daddr.s6_addr32[3];
+
+			if (!ipv6_addr_v4mapped(&sk->sk_v6_daddr))
+				return -EINVAL;
+			if ((daddr4 & mask) != addr4)
+				return -EINVAL;
+		}
+
+		*paddr = (union tcp_ao_addr *)&addr->s6_addr32[3];
+		*family = AF_INET;
+		return 0;
+	} else if (cmd->prefix != 0) {
+		struct in6_addr pfx;
+
+		if (ipv6_addr_any(addr) || prefix > 128)
+			return -EINVAL;
+
+		ipv6_addr_prefix(&pfx, addr, prefix);
+		if (ipv6_addr_cmp(&pfx, addr))
+			return -EINVAL;
+
+		/* Check that MKT address is consistent with socket */
+		if (!ipv6_addr_any(&sk->sk_v6_daddr) &&
+		    !ipv6_prefix_equal(&sk->sk_v6_daddr, addr, prefix))
+
+			return -EINVAL;
+	} else {
+		if (!ipv6_addr_any(addr))
+			return -EINVAL;
+	}
+
+	*paddr = (union tcp_ao_addr *)addr;
+	return 0;
+}
+#else
+static int tcp_ao_verify_ipv6(struct sock *sk, struct tcp_ao_add *cmd,
+			      union tcp_ao_addr **paddr,
+			      unsigned short int *family)
+{
+	return -EOPNOTSUPP;
+}
+#endif
+
+static struct tcp_ao_info *setsockopt_ao_info(struct sock *sk)
+{
+	if (sk_fullsock(sk)) {
+		return rcu_dereference_protected(tcp_sk(sk)->ao_info,
+						 lockdep_sock_is_held(sk));
+	}
+	return ERR_PTR(-ESOCKTNOSUPPORT);
+}
+
+#define TCP_AO_KEYF_ALL		(0)
+
+static struct tcp_ao_key *tcp_ao_key_alloc(struct sock *sk,
+					   struct tcp_ao_add *cmd)
+{
+	const char *algo = cmd->alg_name;
+	unsigned int digest_size;
+	struct crypto_ahash *tfm;
+	struct tcp_ao_key *key;
+	struct tcp_sigpool hp;
+	int err, pool_id;
+	size_t size;
+
+	/* Force null-termination of alg_name */
+	cmd->alg_name[ARRAY_SIZE(cmd->alg_name) - 1] = '\0';
+
+	/* RFC5926, 3.1.1.2. KDF_AES_128_CMAC */
+	if (!strcmp("cmac(aes128)", algo))
+		algo = "cmac(aes)";
+
+	/* Full TCP header (th->doff << 2) should fit into scratch area,
+	 * see tcp_ao_hash_header().
+	 */
+	pool_id = tcp_sigpool_alloc_ahash(algo, 60);
+	if (pool_id < 0)
+		return ERR_PTR(pool_id);
+
+	err = tcp_sigpool_start(pool_id, &hp);
+	if (err)
+		goto err_free_pool;
+
+	tfm = crypto_ahash_reqtfm(hp.req);
+	if (crypto_ahash_alignmask(tfm) > TCP_AO_KEY_ALIGN) {
+		err = -EOPNOTSUPP;
+		goto err_pool_end;
+	}
+	digest_size = crypto_ahash_digestsize(tfm);
+	tcp_sigpool_end(&hp);
+
+	size = sizeof(struct tcp_ao_key) + (digest_size << 1);
+	key = sock_kmalloc(sk, size, GFP_KERNEL);
+	if (!key) {
+		err = -ENOMEM;
+		goto err_free_pool;
+	}
+
+	key->tcp_sigpool_id = pool_id;
+	key->digest_size = digest_size;
+	return key;
+
+err_pool_end:
+	tcp_sigpool_end(&hp);
+err_free_pool:
+	tcp_sigpool_release(pool_id);
+	return ERR_PTR(err);
+}
+
+static int tcp_ao_add_cmd(struct sock *sk, unsigned short int family,
+			  sockptr_t optval, int optlen)
+{
+	struct tcp_ao_info *ao_info;
+	union tcp_ao_addr *addr;
+	struct tcp_ao_key *key;
+	struct tcp_ao_add cmd;
+	bool first = false;
+	int ret;
+
+	if (optlen < sizeof(cmd))
+		return -EINVAL;
+
+	ret = copy_struct_from_sockptr(&cmd, sizeof(cmd), optval, optlen);
+	if (ret)
+		return ret;
+
+	if (cmd.keylen > TCP_AO_MAXKEYLEN)
+		return -EINVAL;
+
+	if (cmd.reserved != 0 || cmd.reserved2 != 0)
+		return -EINVAL;
+
+	if (family == AF_INET)
+		ret = tcp_ao_verify_ipv4(sk, &cmd, &addr);
+	else
+		ret = tcp_ao_verify_ipv6(sk, &cmd, &addr, &family);
+	if (ret)
+		return ret;
+
+	if (cmd.keyflags & ~TCP_AO_KEYF_ALL)
+		return -EINVAL;
+
+	if (cmd.set_current || cmd.set_rnext) {
+		if (!tcp_ao_can_set_current_rnext(sk))
+			return -EINVAL;
+	}
+
+	ao_info = setsockopt_ao_info(sk);
+	if (IS_ERR(ao_info))
+		return PTR_ERR(ao_info);
+
+	if (!ao_info) {
+		ao_info = tcp_ao_alloc_info(GFP_KERNEL);
+		if (!ao_info)
+			return -ENOMEM;
+		first = true;
+	} else {
+		/* Check that neither RecvID nor SendID match any
+		 * existing key for the peer, RFC5925 3.1:
+		 * > The IDs of MKTs MUST NOT overlap where their
+		 * > TCP connection identifiers overlap.
+		 */
+		if (__tcp_ao_do_lookup(sk, addr, family,
+				       cmd.prefix, -1, cmd.rcvid))
+			return -EEXIST;
+		if (__tcp_ao_do_lookup(sk, addr, family,
+				       cmd.prefix, cmd.sndid, -1))
+			return -EEXIST;
+	}
+
+	key = tcp_ao_key_alloc(sk, &cmd);
+	if (IS_ERR(key)) {
+		ret = PTR_ERR(key);
+		goto err_free_ao;
+	}
+
+	INIT_HLIST_NODE(&key->node);
+	memcpy(&key->addr, addr, (family == AF_INET) ? sizeof(struct in_addr) :
+						       sizeof(struct in6_addr));
+	key->prefixlen	= cmd.prefix;
+	key->family	= family;
+	key->keyflags	= cmd.keyflags;
+	key->sndid	= cmd.sndid;
+	key->rcvid	= cmd.rcvid;
+
+	ret = tcp_ao_parse_crypto(&cmd, key);
+	if (ret < 0)
+		goto err_free_sock;
+
+	tcp_ao_link_mkt(ao_info, key);
+	if (first) {
+		sk_gso_disable(sk);
+		rcu_assign_pointer(tcp_sk(sk)->ao_info, ao_info);
+	}
+
+	if (cmd.set_current)
+		WRITE_ONCE(ao_info->current_key, key);
+	if (cmd.set_rnext)
+		WRITE_ONCE(ao_info->rnext_key, key);
+	return 0;
+
+err_free_sock:
+	atomic_sub(tcp_ao_sizeof_key(key), &sk->sk_omem_alloc);
+	tcp_sigpool_release(key->tcp_sigpool_id);
+	kfree_sensitive(key);
+err_free_ao:
+	if (first)
+		kfree(ao_info);
+	return ret;
+}
+
+static int tcp_ao_delete_key(struct sock *sk, struct tcp_ao_info *ao_info,
+			     struct tcp_ao_key *key,
+			     struct tcp_ao_key *new_current,
+			     struct tcp_ao_key *new_rnext)
+{
+	int err;
+
+	hlist_del_rcu(&key->node);
+
+	/* At this moment another CPU could have looked this key up
+	 * while it was unlinked from the list. Wait for RCU grace period,
+	 * after which the key is off-list and can't be looked up again;
+	 * the rx path [just before RCU came] might have used it and set it
+	 * as current_key (very unlikely).
+	 */
+	synchronize_rcu();
+	if (new_current)
+		WRITE_ONCE(ao_info->current_key, new_current);
+	if (new_rnext)
+		WRITE_ONCE(ao_info->rnext_key, new_rnext);
+
+	if (unlikely(READ_ONCE(ao_info->current_key) == key ||
+		     READ_ONCE(ao_info->rnext_key) == key)) {
+		err = -EBUSY;
+		goto add_key;
+	}
+
+	atomic_sub(tcp_ao_sizeof_key(key), &sk->sk_omem_alloc);
+	call_rcu(&key->rcu, tcp_ao_key_free_rcu);
+
+	return 0;
+add_key:
+	hlist_add_head_rcu(&key->node, &ao_info->head);
+	return err;
+}
+
+static int tcp_ao_del_cmd(struct sock *sk, unsigned short int family,
+			  sockptr_t optval, int optlen)
+{
+	struct tcp_ao_key *key, *new_current = NULL, *new_rnext = NULL;
+	struct tcp_ao_info *ao_info;
+	union tcp_ao_addr *addr;
+	struct tcp_ao_del cmd;
+	int addr_len;
+	__u8 prefix;
+	u16 port;
+	int err;
+
+	if (optlen < sizeof(cmd))
+		return -EINVAL;
+
+	err = copy_struct_from_sockptr(&cmd, sizeof(cmd), optval, optlen);
+	if (err)
+		return err;
+
+	if (cmd.reserved != 0 || cmd.reserved2 != 0)
+		return -EINVAL;
+
+	if (cmd.set_current || cmd.set_rnext) {
+		if (!tcp_ao_can_set_current_rnext(sk))
+			return -EINVAL;
+	}
+
+	ao_info = setsockopt_ao_info(sk);
+	if (IS_ERR(ao_info))
+		return PTR_ERR(ao_info);
+	if (!ao_info)
+		return -ENOENT;
+
+	/* For sockets in TCP_CLOSED it's possible set keys that aren't
+	 * matching the future peer (address/VRF/etc),
+	 * tcp_ao_connect_init() will choose a correct matching MKT
+	 * if there's any.
+	 */
+	if (cmd.set_current) {
+		new_current = tcp_ao_established_key(ao_info, cmd.current_key, -1);
+		if (!new_current)
+			return -ENOENT;
+	}
+	if (cmd.set_rnext) {
+		new_rnext = tcp_ao_established_key(ao_info, -1, cmd.rnext);
+		if (!new_rnext)
+			return -ENOENT;
+	}
+
+	if (family == AF_INET) {
+		struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.addr;
+
+		addr = (union tcp_ao_addr *)&sin->sin_addr;
+		addr_len = sizeof(struct in_addr);
+		port = ntohs(sin->sin_port);
+	} else {
+		struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)&cmd.addr;
+		struct in6_addr *addr6 = &sin6->sin6_addr;
+
+		if (ipv6_addr_v4mapped(addr6)) {
+			addr = (union tcp_ao_addr *)&addr6->s6_addr32[3];
+			addr_len = sizeof(struct in_addr);
+			family = AF_INET;
+		} else {
+			addr = (union tcp_ao_addr *)addr6;
+			addr_len = sizeof(struct in6_addr);
+		}
+		port = ntohs(sin6->sin6_port);
+	}
+	prefix = cmd.prefix;
+
+	/* Currently matching is not performed on port (or port ranges) */
+	if (port != 0)
+		return -EINVAL;
+
+	/* We could choose random present key here for current/rnext
+	 * but that's less predictable. Let's be strict and don't
+	 * allow removing a key that's in use. RFC5925 doesn't
+	 * specify how-to coordinate key removal, but says:
+	 * "It is presumed that an MKT affecting a particular
+	 * connection cannot be destroyed during an active connection"
+	 */
+	hlist_for_each_entry_rcu(key, &ao_info->head, node) {
+		if (cmd.sndid != key->sndid ||
+		    cmd.rcvid != key->rcvid)
+			continue;
+
+		if (family != key->family ||
+		    prefix != key->prefixlen ||
+		    memcmp(addr, &key->addr, addr_len))
+			continue;
+
+		if (key == new_current || key == new_rnext)
+			continue;
+
+		return tcp_ao_delete_key(sk, ao_info, key,
+					  new_current, new_rnext);
+	}
+	return -ENOENT;
+}
+
+static int tcp_ao_info_cmd(struct sock *sk, unsigned short int family,
+			   sockptr_t optval, int optlen)
+{
+	struct tcp_ao_key *new_current = NULL, *new_rnext = NULL;
+	struct tcp_ao_info *ao_info;
+	struct tcp_ao_info_opt cmd;
+	bool first = false;
+	int err;
+
+	if (optlen < sizeof(cmd))
+		return -EINVAL;
+
+	err = copy_struct_from_sockptr(&cmd, sizeof(cmd), optval, optlen);
+	if (err)
+		return err;
+
+	if (cmd.set_current || cmd.set_rnext) {
+		if (!tcp_ao_can_set_current_rnext(sk))
+			return -EINVAL;
+	}
+
+	if (cmd.reserved != 0)
+		return -EINVAL;
+
+	ao_info = setsockopt_ao_info(sk);
+	if (IS_ERR(ao_info))
+		return PTR_ERR(ao_info);
+	if (!ao_info) {
+		ao_info = tcp_ao_alloc_info(GFP_KERNEL);
+		if (!ao_info)
+			return -ENOMEM;
+		first = true;
+	}
+
+	/* For sockets in TCP_CLOSED it's possible set keys that aren't
+	 * matching the future peer (address/port/VRF/etc),
+	 * tcp_ao_connect_init() will choose a correct matching MKT
+	 * if there's any.
+	 */
+	if (cmd.set_current) {
+		new_current = tcp_ao_established_key(ao_info, cmd.current_key, -1);
+		if (!new_current) {
+			err = -ENOENT;
+			goto out;
+		}
+	}
+	if (cmd.set_rnext) {
+		new_rnext = tcp_ao_established_key(ao_info, -1, cmd.rnext);
+		if (!new_rnext) {
+			err = -ENOENT;
+			goto out;
+		}
+	}
+
+	ao_info->ao_required = cmd.ao_required;
+	if (new_current)
+		WRITE_ONCE(ao_info->current_key, new_current);
+	if (new_rnext)
+		WRITE_ONCE(ao_info->rnext_key, new_rnext);
+	if (first) {
+		sk_gso_disable(sk);
+		rcu_assign_pointer(tcp_sk(sk)->ao_info, ao_info);
+	}
+	return 0;
+out:
+	if (first)
+		kfree(ao_info);
+	return err;
+}
+
+int tcp_parse_ao(struct sock *sk, int cmd, unsigned short int family,
+		 sockptr_t optval, int optlen)
+{
+	if (WARN_ON_ONCE(family != AF_INET && family != AF_INET6))
+		return -EAFNOSUPPORT;
+
+	switch (cmd) {
+	case TCP_AO_ADD_KEY:
+		return tcp_ao_add_cmd(sk, family, optval, optlen);
+	case TCP_AO_DEL_KEY:
+		return tcp_ao_del_cmd(sk, family, optval, optlen);
+	case TCP_AO_INFO:
+		return tcp_ao_info_cmd(sk, family, optval, optlen);
+	default:
+		WARN_ON_ONCE(1);
+		return -EINVAL;
+	}
+}
+
+int tcp_v4_parse_ao(struct sock *sk, int cmd, sockptr_t optval, int optlen)
+{
+	return tcp_parse_ao(sk, cmd, AF_INET, optval, optlen);
+}
+
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 7d81e90b6f5c..a4746c27f38a 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -2271,11 +2271,16 @@ const struct inet_connection_sock_af_ops ipv4_specific = {
 };
 EXPORT_SYMBOL(ipv4_specific);
 
-#ifdef CONFIG_TCP_MD5SIG
+#if defined(CONFIG_TCP_MD5SIG) || defined(CONFIG_TCP_AO)
 static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
+#ifdef CONFIG_TCP_MD5SIG
 	.md5_lookup		= tcp_v4_md5_lookup,
 	.calc_md5_hash		= tcp_v4_md5_hash_skb,
 	.md5_parse		= tcp_v4_parse_md5_keys,
+#endif
+#ifdef CONFIG_TCP_AO
+	.ao_parse		= tcp_v4_parse_ao,
+#endif
 };
 #endif
 
@@ -2290,7 +2295,7 @@ static int tcp_v4_init_sock(struct sock *sk)
 
 	icsk->icsk_af_ops = &ipv4_specific;
 
-#ifdef CONFIG_TCP_MD5SIG
+#if defined(CONFIG_TCP_MD5SIG) || defined(CONFIG_TCP_AO)
 	tcp_sk(sk)->af_specific = &tcp_sock_ipv4_specific;
 #endif
 
@@ -2341,6 +2346,7 @@ void tcp_v4_destroy_sock(struct sock *sk)
 		rcu_assign_pointer(tp->md5sig_info, NULL);
 	}
 #endif
+	tcp_ao_destroy_sock(sk);
 
 	/* Clean up a referenced TCP bind bucket. */
 	if (inet_csk(sk)->icsk_bind_hash)
diff --git a/net/ipv6/Makefile b/net/ipv6/Makefile
index 3036a45e8a1e..d283c59df4c1 100644
--- a/net/ipv6/Makefile
+++ b/net/ipv6/Makefile
@@ -52,4 +52,5 @@ obj-$(subst m,y,$(CONFIG_IPV6)) += inet6_hashtables.o
 ifneq ($(CONFIG_IPV6),)
 obj-$(CONFIG_NET_UDP_TUNNEL) += ip6_udp_tunnel.o
 obj-y += mcast_snoop.o
+obj-$(CONFIG_TCP_AO) += tcp_ao.o
 endif
diff --git a/net/ipv6/tcp_ao.c b/net/ipv6/tcp_ao.c
new file mode 100644
index 000000000000..049ddbabe049
--- /dev/null
+++ b/net/ipv6/tcp_ao.c
@@ -0,0 +1,19 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * INET		An implementation of the TCP Authentication Option (TCP-AO).
+ *		See RFC5925.
+ *
+ * Authors:	Dmitry Safonov <dima@arista.com>
+ *		Francesco Ruggeri <fruggeri@arista.com>
+ *		Salam Noureddine <noureddine@arista.com>
+ */
+#include <linux/tcp.h>
+
+#include <net/tcp.h>
+#include <net/ipv6.h>
+
+int tcp_v6_parse_ao(struct sock *sk, int cmd,
+		    sockptr_t optval, int optlen)
+{
+	return tcp_parse_ao(sk, cmd, AF_INET6, optval, optlen);
+}
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index ee53dad20a59..30bd17d03239 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -76,16 +76,9 @@ INDIRECT_CALLABLE_SCOPE int tcp_v6_do_rcv(struct sock *sk, struct sk_buff *skb);
 
 static const struct inet_connection_sock_af_ops ipv6_mapped;
 const struct inet_connection_sock_af_ops ipv6_specific;
-#ifdef CONFIG_TCP_MD5SIG
+#if defined(CONFIG_TCP_MD5SIG) || defined(CONFIG_TCP_AO)
 static const struct tcp_sock_af_ops tcp_sock_ipv6_specific;
 static const struct tcp_sock_af_ops tcp_sock_ipv6_mapped_specific;
-#else
-static struct tcp_md5sig_key *tcp_v6_md5_do_lookup(const struct sock *sk,
-						   const struct in6_addr *addr,
-						   int l3index)
-{
-	return NULL;
-}
 #endif
 
 /* Helper returning the inet6 address from a given tcp socket.
@@ -239,7 +232,7 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr,
 		if (sk_is_mptcp(sk))
 			mptcpv6_handle_mapped(sk, true);
 		sk->sk_backlog_rcv = tcp_v4_do_rcv;
-#ifdef CONFIG_TCP_MD5SIG
+#if defined(CONFIG_TCP_MD5SIG) || defined(CONFIG_TCP_AO)
 		tp->af_specific = &tcp_sock_ipv6_mapped_specific;
 #endif
 
@@ -252,7 +245,7 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr,
 			if (sk_is_mptcp(sk))
 				mptcpv6_handle_mapped(sk, false);
 			sk->sk_backlog_rcv = tcp_v6_do_rcv;
-#ifdef CONFIG_TCP_MD5SIG
+#if defined(CONFIG_TCP_MD5SIG) || defined(CONFIG_TCP_AO)
 			tp->af_specific = &tcp_sock_ipv6_specific;
 #endif
 			goto failure;
@@ -769,7 +762,13 @@ clear_hash_nostart:
 	memset(md5_hash, 0, 16);
 	return 1;
 }
-
+#else /* CONFIG_TCP_MD5SIG */
+static struct tcp_md5sig_key *tcp_v6_md5_do_lookup(const struct sock *sk,
+						   const struct in6_addr *addr,
+						   int l3index)
+{
+	return NULL;
+}
 #endif
 
 static void tcp_v6_init_req(struct request_sock *req,
@@ -1228,7 +1227,7 @@ static struct sock *tcp_v6_syn_recv_sock(const struct sock *sk, struct sk_buff *
 		if (sk_is_mptcp(newsk))
 			mptcpv6_handle_mapped(newsk, true);
 		newsk->sk_backlog_rcv = tcp_v4_do_rcv;
-#ifdef CONFIG_TCP_MD5SIG
+#if defined(CONFIG_TCP_MD5SIG) || defined(CONFIG_TCP_AO)
 		newtp->af_specific = &tcp_sock_ipv6_mapped_specific;
 #endif
 
@@ -1896,11 +1895,16 @@ const struct inet_connection_sock_af_ops ipv6_specific = {
 	.mtu_reduced	   = tcp_v6_mtu_reduced,
 };
 
-#ifdef CONFIG_TCP_MD5SIG
+#if defined(CONFIG_TCP_MD5SIG) || defined(CONFIG_TCP_AO)
 static const struct tcp_sock_af_ops tcp_sock_ipv6_specific = {
+#ifdef CONFIG_TCP_MD5SIG
 	.md5_lookup	=	tcp_v6_md5_lookup,
 	.calc_md5_hash	=	tcp_v6_md5_hash_skb,
 	.md5_parse	=	tcp_v6_parse_md5_keys,
+#endif
+#ifdef CONFIG_TCP_AO
+	.ao_parse	=	tcp_v6_parse_ao,
+#endif
 };
 #endif
 
@@ -1922,11 +1926,16 @@ static const struct inet_connection_sock_af_ops ipv6_mapped = {
 	.mtu_reduced	   = tcp_v4_mtu_reduced,
 };
 
-#ifdef CONFIG_TCP_MD5SIG
+#if defined(CONFIG_TCP_MD5SIG) || defined(CONFIG_TCP_AO)
 static const struct tcp_sock_af_ops tcp_sock_ipv6_mapped_specific = {
+#ifdef CONFIG_TCP_MD5SIG
 	.md5_lookup	=	tcp_v4_md5_lookup,
 	.calc_md5_hash	=	tcp_v4_md5_hash_skb,
 	.md5_parse	=	tcp_v6_parse_md5_keys,
+#endif
+#ifdef CONFIG_TCP_AO
+	.ao_parse	=	tcp_v6_parse_ao,
+#endif
 };
 #endif
 
@@ -1941,7 +1950,7 @@ static int tcp_v6_init_sock(struct sock *sk)
 
 	icsk->icsk_af_ops = &ipv6_specific;
 
-#ifdef CONFIG_TCP_MD5SIG
+#if defined(CONFIG_TCP_MD5SIG) || defined(CONFIG_TCP_AO)
 	tcp_sk(sk)->af_specific = &tcp_sock_ipv6_specific;
 #endif
 
-- 
cgit v1.2.3


From decde2586b34b99684faff1eab41e5c496c27fb6 Mon Sep 17 00:00:00 2001
From: Dmitry Safonov <dima@arista.com>
Date: Mon, 23 Oct 2023 20:22:01 +0100
Subject: net/tcp: Add TCP-AO sign to twsk

Add support for sockets in time-wait state.
ao_info as well as all keys are inherited on transition to time-wait
socket. The lifetime of ao_info is now protected by ref counter, so
that tcp_ao_destroy_sock() will destruct it only when the last user is
gone.

Co-developed-by: Francesco Ruggeri <fruggeri@arista.com>
Signed-off-by: Francesco Ruggeri <fruggeri@arista.com>
Co-developed-by: Salam Noureddine <noureddine@arista.com>
Signed-off-by: Salam Noureddine <noureddine@arista.com>
Signed-off-by: Dmitry Safonov <dima@arista.com>
Acked-by: David Ahern <dsahern@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/tcp.h      |  3 ++
 include/net/tcp_ao.h     | 11 ++++--
 net/ipv4/tcp_ao.c        | 49 +++++++++++++++++++++-----
 net/ipv4/tcp_ipv4.c      | 92 ++++++++++++++++++++++++++++++++++++++----------
 net/ipv4/tcp_minisocks.c |  4 ++-
 net/ipv4/tcp_output.c    |  2 +-
 net/ipv6/tcp_ipv6.c      | 72 +++++++++++++++++++++++++++----------
 7 files changed, 183 insertions(+), 50 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/tcp.h b/include/linux/tcp.h
index 64e7b560fa79..eec6e7e5312e 100644
--- a/include/linux/tcp.h
+++ b/include/linux/tcp.h
@@ -514,6 +514,9 @@ struct tcp_timewait_sock {
 #ifdef CONFIG_TCP_MD5SIG
 	struct tcp_md5sig_key	  *tw_md5_key;
 #endif
+#ifdef CONFIG_TCP_AO
+	struct tcp_ao_info	__rcu *ao_info;
+#endif
 };
 
 static inline struct tcp_timewait_sock *tcp_twsk(const struct sock *sk)
diff --git a/include/net/tcp_ao.h b/include/net/tcp_ao.h
index 629ab0365b83..971d7edcda9c 100644
--- a/include/net/tcp_ao.h
+++ b/include/net/tcp_ao.h
@@ -85,6 +85,7 @@ struct tcp_ao_info {
 				__unused	:31;
 	__be32			lisn;
 	__be32			risn;
+	refcount_t		refcnt;		/* Protects twsk destruction */
 	struct rcu_head		rcu;
 };
 
@@ -124,7 +125,8 @@ struct tcp_ao_key *tcp_ao_established_key(struct tcp_ao_info *ao,
 					  int sndid, int rcvid);
 int tcp_ao_calc_traffic_key(struct tcp_ao_key *mkt, u8 *key, void *ctx,
 			    unsigned int len, struct tcp_sigpool *hp);
-void tcp_ao_destroy_sock(struct sock *sk);
+void tcp_ao_destroy_sock(struct sock *sk, bool twsk);
+void tcp_ao_time_wait(struct tcp_timewait_sock *tcptw, struct tcp_sock *tp);
 struct tcp_ao_key *tcp_ao_do_lookup(const struct sock *sk,
 				    const union tcp_ao_addr *addr,
 				    int family, int sndid, int rcvid);
@@ -182,7 +184,7 @@ static inline struct tcp_ao_key *tcp_ao_do_lookup(const struct sock *sk,
 	return NULL;
 }
 
-static inline void tcp_ao_destroy_sock(struct sock *sk)
+static inline void tcp_ao_destroy_sock(struct sock *sk, bool twsk)
 {
 }
 
@@ -194,6 +196,11 @@ static inline void tcp_ao_finish_connect(struct sock *sk, struct sk_buff *skb)
 {
 }
 
+static inline void tcp_ao_time_wait(struct tcp_timewait_sock *tcptw,
+				    struct tcp_sock *tp)
+{
+}
+
 static inline void tcp_ao_connect_init(struct sock *sk)
 {
 }
diff --git a/net/ipv4/tcp_ao.c b/net/ipv4/tcp_ao.c
index b8afe78ff057..7c4e2f42845a 100644
--- a/net/ipv4/tcp_ao.c
+++ b/net/ipv4/tcp_ao.c
@@ -159,6 +159,7 @@ static struct tcp_ao_info *tcp_ao_alloc_info(gfp_t flags)
 	if (!ao)
 		return NULL;
 	INIT_HLIST_HEAD(&ao->head);
+	refcount_set(&ao->refcnt, 1);
 
 	return ao;
 }
@@ -176,27 +177,54 @@ static void tcp_ao_key_free_rcu(struct rcu_head *head)
 	kfree_sensitive(key);
 }
 
-void tcp_ao_destroy_sock(struct sock *sk)
+void tcp_ao_destroy_sock(struct sock *sk, bool twsk)
 {
 	struct tcp_ao_info *ao;
 	struct tcp_ao_key *key;
 	struct hlist_node *n;
 
-	ao = rcu_dereference_protected(tcp_sk(sk)->ao_info, 1);
-	tcp_sk(sk)->ao_info = NULL;
+	if (twsk) {
+		ao = rcu_dereference_protected(tcp_twsk(sk)->ao_info, 1);
+		tcp_twsk(sk)->ao_info = NULL;
+	} else {
+		ao = rcu_dereference_protected(tcp_sk(sk)->ao_info, 1);
+		tcp_sk(sk)->ao_info = NULL;
+	}
 
-	if (!ao)
+	if (!ao || !refcount_dec_and_test(&ao->refcnt))
 		return;
 
 	hlist_for_each_entry_safe(key, n, &ao->head, node) {
 		hlist_del_rcu(&key->node);
-		atomic_sub(tcp_ao_sizeof_key(key), &sk->sk_omem_alloc);
+		if (!twsk)
+			atomic_sub(tcp_ao_sizeof_key(key), &sk->sk_omem_alloc);
 		call_rcu(&key->rcu, tcp_ao_key_free_rcu);
 	}
 
 	kfree_rcu(ao, rcu);
 }
 
+void tcp_ao_time_wait(struct tcp_timewait_sock *tcptw, struct tcp_sock *tp)
+{
+	struct tcp_ao_info *ao_info = rcu_dereference_protected(tp->ao_info, 1);
+
+	if (ao_info) {
+		struct tcp_ao_key *key;
+		struct hlist_node *n;
+		int omem = 0;
+
+		hlist_for_each_entry_safe(key, n, &ao_info->head, node) {
+			omem += tcp_ao_sizeof_key(key);
+		}
+
+		refcount_inc(&ao_info->refcnt);
+		atomic_sub(omem, &(((struct sock *)tp)->sk_omem_alloc));
+		rcu_assign_pointer(tcptw->ao_info, ao_info);
+	} else {
+		tcptw->ao_info = NULL;
+	}
+}
+
 /* 4 tuple and ISNs are expected in NBO */
 static int tcp_v4_ao_calc_key(struct tcp_ao_key *mkt, u8 *key,
 			      __be32 saddr, __be32 daddr,
@@ -514,11 +542,13 @@ int tcp_ao_prepare_reset(const struct sock *sk, struct sk_buff *skb,
 	if (!sk)
 		return -ENOTCONN;
 
-	if ((1 << sk->sk_state) &
-	    (TCPF_LISTEN | TCPF_NEW_SYN_RECV | TCPF_TIME_WAIT))
+	if ((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_NEW_SYN_RECV)) {
 		return -1;
 
-	ao_info = rcu_dereference(tcp_sk(sk)->ao_info);
+	if (sk->sk_state == TCP_TIME_WAIT)
+		ao_info = rcu_dereference(tcp_twsk(sk)->ao_info);
+	else
+		ao_info = rcu_dereference(tcp_sk(sk)->ao_info);
 	if (!ao_info)
 		return -ENOENT;
 
@@ -910,6 +940,9 @@ static struct tcp_ao_info *setsockopt_ao_info(struct sock *sk)
 	if (sk_fullsock(sk)) {
 		return rcu_dereference_protected(tcp_sk(sk)->ao_info,
 						 lockdep_sock_is_held(sk));
+	} else if (sk->sk_state == TCP_TIME_WAIT) {
+		return rcu_dereference_protected(tcp_twsk(sk)->ao_info,
+						 lockdep_sock_is_held(sk));
 	}
 	return ERR_PTR(-ESOCKTNOSUPPORT);
 }
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 71e1cbb0020b..a78112d78d06 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -911,17 +911,13 @@ out:
 static void tcp_v4_send_ack(const struct sock *sk,
 			    struct sk_buff *skb, u32 seq, u32 ack,
 			    u32 win, u32 tsval, u32 tsecr, int oif,
-			    struct tcp_md5sig_key *key,
+			    struct tcp_key *key,
 			    int reply_flags, u8 tos, u32 txhash)
 {
 	const struct tcphdr *th = tcp_hdr(skb);
 	struct {
 		struct tcphdr th;
-		__be32 opt[(TCPOLEN_TSTAMP_ALIGNED >> 2)
-#ifdef CONFIG_TCP_MD5SIG
-			   + (TCPOLEN_MD5SIG_ALIGNED >> 2)
-#endif
-			];
+		__be32 opt[(MAX_TCP_OPTION_SPACE  >> 2)];
 	} rep;
 	struct net *net = sock_net(sk);
 	struct ip_reply_arg arg;
@@ -952,7 +948,7 @@ static void tcp_v4_send_ack(const struct sock *sk,
 	rep.th.window  = htons(win);
 
 #ifdef CONFIG_TCP_MD5SIG
-	if (key) {
+	if (tcp_key_is_md5(key)) {
 		int offset = (tsecr) ? 3 : 0;
 
 		rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
@@ -963,9 +959,27 @@ static void tcp_v4_send_ack(const struct sock *sk,
 		rep.th.doff = arg.iov[0].iov_len/4;
 
 		tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset],
-				    key, ip_hdr(skb)->saddr,
+				    key->md5_key, ip_hdr(skb)->saddr,
 				    ip_hdr(skb)->daddr, &rep.th);
 	}
+#endif
+#ifdef CONFIG_TCP_AO
+	if (tcp_key_is_ao(key)) {
+		int offset = (tsecr) ? 3 : 0;
+
+		rep.opt[offset++] = htonl((TCPOPT_AO << 24) |
+					  (tcp_ao_len(key->ao_key) << 16) |
+					  (key->ao_key->sndid << 8) |
+					  key->rcv_next);
+		arg.iov[0].iov_len += round_up(tcp_ao_len(key->ao_key), 4);
+		rep.th.doff = arg.iov[0].iov_len / 4;
+
+		tcp_ao_hash_hdr(AF_INET, (char *)&rep.opt[offset],
+				key->ao_key, key->traffic_key,
+				(union tcp_ao_addr *)&ip_hdr(skb)->saddr,
+				(union tcp_ao_addr *)&ip_hdr(skb)->daddr,
+				&rep.th, key->sne);
+	}
 #endif
 	arg.flags = reply_flags;
 	arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
@@ -999,18 +1013,50 @@ static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
 {
 	struct inet_timewait_sock *tw = inet_twsk(sk);
 	struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
+	struct tcp_key key = {};
+#ifdef CONFIG_TCP_AO
+	struct tcp_ao_info *ao_info;
+
+	/* FIXME: the segment to-be-acked is not verified yet */
+	ao_info = rcu_dereference(tcptw->ao_info);
+	if (ao_info) {
+		const struct tcp_ao_hdr *aoh;
+
+		if (tcp_parse_auth_options(tcp_hdr(skb), NULL, &aoh)) {
+			inet_twsk_put(tw);
+			return;
+		}
+
+		if (aoh)
+			key.ao_key = tcp_ao_established_key(ao_info, aoh->rnext_keyid, -1);
+	}
+	if (key.ao_key) {
+		struct tcp_ao_key *rnext_key;
+
+		key.traffic_key = snd_other_key(key.ao_key);
+		rnext_key = READ_ONCE(ao_info->rnext_key);
+		key.rcv_next = rnext_key->rcvid;
+		key.type = TCP_KEY_AO;
+#else
+	if (0) {
+#endif
+#ifdef CONFIG_TCP_MD5SIG
+	} else if (static_branch_unlikely(&tcp_md5_needed.key)) {
+		key.md5_key = tcp_twsk_md5_key(tcptw);
+		if (key.md5_key)
+			key.type = TCP_KEY_MD5;
+#endif
+	}
 
 	tcp_v4_send_ack(sk, skb,
 			tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
 			tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
 			tcp_tw_tsval(tcptw),
 			tcptw->tw_ts_recent,
-			tw->tw_bound_dev_if,
-			tcp_twsk_md5_key(tcptw),
+			tw->tw_bound_dev_if, &key,
 			tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0,
 			tw->tw_tos,
-			tw->tw_txhash
-			);
+			tw->tw_txhash);
 
 	inet_twsk_put(tw);
 }
@@ -1018,8 +1064,7 @@ static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
 static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
 				  struct request_sock *req)
 {
-	const union tcp_md5_addr *addr;
-	int l3index;
+	struct tcp_key key = {};
 
 	/* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV
 	 * sk->sk_state == TCP_SYN_RECV -> for Fast Open.
@@ -1032,15 +1077,24 @@ static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
 	 * exception of <SYN> segments, MUST be right-shifted by
 	 * Rcv.Wind.Shift bits:
 	 */
-	addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
-	l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0;
+#ifdef CONFIG_TCP_MD5SIG
+	if (static_branch_unlikely(&tcp_md5_needed.key)) {
+		const union tcp_md5_addr *addr;
+		int l3index;
+
+		addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
+		l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0;
+		key.md5_key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
+		if (key.md5_key)
+			key.type = TCP_KEY_MD5;
+	}
+#endif
 	tcp_v4_send_ack(sk, skb, seq,
 			tcp_rsk(req)->rcv_nxt,
 			req->rsk_rcv_wnd >> inet_rsk(req)->rcv_wscale,
 			tcp_rsk_tsval(tcp_rsk(req)),
 			READ_ONCE(req->ts_recent),
-			0,
-			tcp_md5_do_lookup(sk, l3index, addr, AF_INET),
+			0, &key,
 			inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0,
 			ip_hdr(skb)->tos,
 			READ_ONCE(tcp_rsk(req)->txhash));
@@ -2404,7 +2458,7 @@ void tcp_v4_destroy_sock(struct sock *sk)
 		rcu_assign_pointer(tp->md5sig_info, NULL);
 	}
 #endif
-	tcp_ao_destroy_sock(sk);
+	tcp_ao_destroy_sock(sk, false);
 
 	/* Clean up a referenced TCP bind bucket. */
 	if (inet_csk(sk)->icsk_bind_hash)
diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
index 3dcb3fc36e64..6810cf65a322 100644
--- a/net/ipv4/tcp_minisocks.c
+++ b/net/ipv4/tcp_minisocks.c
@@ -279,7 +279,7 @@ out_free:
 void tcp_time_wait(struct sock *sk, int state, int timeo)
 {
 	const struct inet_connection_sock *icsk = inet_csk(sk);
-	const struct tcp_sock *tp = tcp_sk(sk);
+	struct tcp_sock *tp = tcp_sk(sk);
 	struct net *net = sock_net(sk);
 	struct inet_timewait_sock *tw;
 
@@ -316,6 +316,7 @@ void tcp_time_wait(struct sock *sk, int state, int timeo)
 #endif
 
 		tcp_time_wait_init(sk, tcptw);
+		tcp_ao_time_wait(tcptw, tp);
 
 		/* Get the TIME_WAIT timeout firing. */
 		if (timeo < rto)
@@ -370,6 +371,7 @@ void tcp_twsk_destructor(struct sock *sk)
 			call_rcu(&twsk->tw_md5_key->rcu, tcp_md5_twsk_free_rcu);
 	}
 #endif
+	tcp_ao_destroy_sock(sk, true);
 }
 EXPORT_SYMBOL_GPL(tcp_twsk_destructor);
 
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index fa3a630f3629..de546313a214 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -3997,7 +3997,7 @@ int tcp_connect(struct sock *sk)
 		 * then free up ao_info if allocated.
 		 */
 		if (needs_md5) {
-			tcp_ao_destroy_sock(sk);
+			tcp_ao_destroy_sock(sk, false);
 		} else if (needs_ao) {
 			tcp_clear_md5_list(sk);
 			kfree(rcu_replace_pointer(tp->md5sig_info, NULL,
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index 45b81e230438..e6c3617976f0 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -778,13 +778,6 @@ clear_hash_nostart:
 	memset(md5_hash, 0, 16);
 	return 1;
 }
-#else /* CONFIG_TCP_MD5SIG */
-static struct tcp_md5sig_key *tcp_v6_md5_do_lookup(const struct sock *sk,
-						   const struct in6_addr *addr,
-						   int l3index)
-{
-	return NULL;
-}
 #endif
 
 static void tcp_v6_init_req(struct request_sock *req,
@@ -1134,39 +1127,81 @@ out:
 
 static void tcp_v6_send_ack(const struct sock *sk, struct sk_buff *skb, u32 seq,
 			    u32 ack, u32 win, u32 tsval, u32 tsecr, int oif,
-			    struct tcp_md5sig_key *md5_key, u8 tclass,
+			    struct tcp_key *key, u8 tclass,
 			    __be32 label, u32 priority, u32 txhash)
 {
-	struct tcp_key key = {
-				.md5_key = md5_key,
-				.type = md5_key ? TCP_KEY_MD5 : TCP_KEY_NONE,
-	};
-
 	tcp_v6_send_response(sk, skb, seq, ack, win, tsval, tsecr, oif, 0,
-			     tclass, label, priority, txhash, &key);
+			     tclass, label, priority, txhash, key);
 }
 
 static void tcp_v6_timewait_ack(struct sock *sk, struct sk_buff *skb)
 {
 	struct inet_timewait_sock *tw = inet_twsk(sk);
 	struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
+	struct tcp_key key = {};
+#ifdef CONFIG_TCP_AO
+	struct tcp_ao_info *ao_info;
+
+	/* FIXME: the segment to-be-acked is not verified yet */
+	ao_info = rcu_dereference(tcptw->ao_info);
+	if (ao_info) {
+		const struct tcp_ao_hdr *aoh;
+
+		/* Invalid TCP option size or twice included auth */
+		if (tcp_parse_auth_options(tcp_hdr(skb), NULL, &aoh))
+			goto out;
+		if (aoh) {
+			key.ao_key = tcp_ao_established_key(ao_info,
+							    aoh->rnext_keyid, -1);
+		}
+	}
+	if (key.ao_key) {
+		struct tcp_ao_key *rnext_key;
+
+		key.traffic_key = snd_other_key(key.ao_key);
+		/* rcv_next switches to our rcv_next */
+		rnext_key = READ_ONCE(ao_info->rnext_key);
+		key.rcv_next = rnext_key->rcvid;
+		key.type = TCP_KEY_AO;
+#else
+	if (0) {
+#endif
+#ifdef CONFIG_TCP_MD5SIG
+	} else if (static_branch_unlikely(&tcp_md5_needed.key)) {
+		key.md5_key = tcp_twsk_md5_key(tcptw);
+		if (key.md5_key)
+			key.type = TCP_KEY_MD5;
+#endif
+	}
 
 	tcp_v6_send_ack(sk, skb, tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
 			tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
 			tcp_tw_tsval(tcptw),
-			tcptw->tw_ts_recent, tw->tw_bound_dev_if, tcp_twsk_md5_key(tcptw),
+			tcptw->tw_ts_recent, tw->tw_bound_dev_if, &key,
 			tw->tw_tclass, cpu_to_be32(tw->tw_flowlabel), tw->tw_priority,
 			tw->tw_txhash);
 
+#ifdef CONFIG_TCP_AO
+out:
+#endif
 	inet_twsk_put(tw);
 }
 
 static void tcp_v6_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
 				  struct request_sock *req)
 {
-	int l3index;
+	struct tcp_key key = {};
+
+#ifdef CONFIG_TCP_MD5SIG
+	if (static_branch_unlikely(&tcp_md5_needed.key)) {
+		int l3index = tcp_v6_sdif(skb) ? tcp_v6_iif_l3_slave(skb) : 0;
 
-	l3index = tcp_v6_sdif(skb) ? tcp_v6_iif_l3_slave(skb) : 0;
+		key.md5_key = tcp_v6_md5_do_lookup(sk, &ipv6_hdr(skb)->saddr,
+						   l3index);
+		if (key.md5_key)
+			key.type = TCP_KEY_MD5;
+	}
+#endif
 
 	/* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV
 	 * sk->sk_state == TCP_SYN_RECV -> for Fast Open.
@@ -1182,8 +1217,7 @@ static void tcp_v6_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
 			req->rsk_rcv_wnd >> inet_rsk(req)->rcv_wscale,
 			tcp_rsk_tsval(tcp_rsk(req)),
 			READ_ONCE(req->ts_recent), sk->sk_bound_dev_if,
-			tcp_v6_md5_do_lookup(sk, &ipv6_hdr(skb)->saddr, l3index),
-			ipv6_get_dsfield(ipv6_hdr(skb)), 0,
+			&key, ipv6_get_dsfield(ipv6_hdr(skb)), 0,
 			READ_ONCE(sk->sk_priority),
 			READ_ONCE(tcp_rsk(req)->txhash));
 }
-- 
cgit v1.2.3


From 06b22ef29591f625ef877ae00d82192938e29e60 Mon Sep 17 00:00:00 2001
From: Dmitry Safonov <dima@arista.com>
Date: Mon, 23 Oct 2023 20:22:02 +0100
Subject: net/tcp: Wire TCP-AO to request sockets

Now when the new request socket is created from the listening socket,
it's recorded what MKT was used by the peer. tcp_rsk_used_ao() is
a new helper for checking if TCP-AO option was used to create the
request socket.
tcp_ao_copy_all_matching() will copy all keys that match the peer on the
request socket, as well as preparing them for the usage (creating
traffic keys).

Co-developed-by: Francesco Ruggeri <fruggeri@arista.com>
Signed-off-by: Francesco Ruggeri <fruggeri@arista.com>
Co-developed-by: Salam Noureddine <noureddine@arista.com>
Signed-off-by: Salam Noureddine <noureddine@arista.com>
Signed-off-by: Dmitry Safonov <dima@arista.com>
Acked-by: David Ahern <dsahern@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/tcp.h      |  18 ++++
 include/net/tcp.h        |   6 ++
 include/net/tcp_ao.h     |  24 +++++
 net/ipv4/syncookies.c    |   2 +
 net/ipv4/tcp_ao.c        | 264 ++++++++++++++++++++++++++++++++++++++++++++---
 net/ipv4/tcp_input.c     |  15 +++
 net/ipv4/tcp_ipv4.c      |  66 ++++++++++--
 net/ipv4/tcp_minisocks.c |  10 ++
 net/ipv4/tcp_output.c    |  37 ++++---
 net/ipv6/syncookies.c    |   2 +
 net/ipv6/tcp_ao.c        |  38 ++++++-
 net/ipv6/tcp_ipv6.c      |  75 +++++++++++---
 12 files changed, 506 insertions(+), 51 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/tcp.h b/include/linux/tcp.h
index eec6e7e5312e..ec4e9367f5b0 100644
--- a/include/linux/tcp.h
+++ b/include/linux/tcp.h
@@ -166,6 +166,11 @@ struct tcp_request_sock {
 						  * after data-in-SYN.
 						  */
 	u8				syn_tos;
+#ifdef CONFIG_TCP_AO
+	u8				ao_keyid;
+	u8				ao_rcv_next;
+	u8				maclen;
+#endif
 };
 
 static inline struct tcp_request_sock *tcp_rsk(const struct request_sock *req)
@@ -173,6 +178,19 @@ static inline struct tcp_request_sock *tcp_rsk(const struct request_sock *req)
 	return (struct tcp_request_sock *)req;
 }
 
+static inline bool tcp_rsk_used_ao(const struct request_sock *req)
+{
+	/* The real length of MAC is saved in the request socket,
+	 * signing anything with zero-length makes no sense, so here is
+	 * a little hack..
+	 */
+#ifndef CONFIG_TCP_AO
+	return false;
+#else
+	return tcp_rsk(req)->maclen != 0;
+#endif
+}
+
 #define TCP_RMEM_TO_WIN_SCALE 8
 
 struct tcp_sock {
diff --git a/include/net/tcp.h b/include/net/tcp.h
index 17b3ecd2760f..d0bea102b523 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -2216,6 +2216,12 @@ struct tcp_request_sock_ops {
 					  const struct sock *sk,
 					  const struct sk_buff *skb);
 #endif
+#ifdef CONFIG_TCP_AO
+	struct tcp_ao_key *(*ao_lookup)(const struct sock *sk,
+					struct request_sock *req,
+					int sndid, int rcvid);
+	int (*ao_calc_key)(struct tcp_ao_key *mkt, u8 *key, struct request_sock *sk);
+#endif
 #ifdef CONFIG_SYN_COOKIES
 	__u32 (*cookie_init_seq)(const struct sk_buff *skb,
 				 __u16 *mss);
diff --git a/include/net/tcp_ao.h b/include/net/tcp_ao.h
index 971d7edcda9c..d2c1ee8bf7b0 100644
--- a/include/net/tcp_ao.h
+++ b/include/net/tcp_ao.h
@@ -123,6 +123,9 @@ int tcp_parse_ao(struct sock *sk, int cmd, unsigned short int family,
 		 sockptr_t optval, int optlen);
 struct tcp_ao_key *tcp_ao_established_key(struct tcp_ao_info *ao,
 					  int sndid, int rcvid);
+int tcp_ao_copy_all_matching(const struct sock *sk, struct sock *newsk,
+			     struct request_sock *req, struct sk_buff *skb,
+			     int family);
 int tcp_ao_calc_traffic_key(struct tcp_ao_key *mkt, u8 *key, void *ctx,
 			    unsigned int len, struct tcp_sigpool *hp);
 void tcp_ao_destroy_sock(struct sock *sk, bool twsk);
@@ -147,6 +150,11 @@ struct tcp_ao_key *tcp_v4_ao_lookup(const struct sock *sk, struct sock *addr_sk,
 int tcp_v4_ao_calc_key_sk(struct tcp_ao_key *mkt, u8 *key,
 			  const struct sock *sk,
 			  __be32 sisn, __be32 disn, bool send);
+int tcp_v4_ao_calc_key_rsk(struct tcp_ao_key *mkt, u8 *key,
+			   struct request_sock *req);
+struct tcp_ao_key *tcp_v4_ao_lookup_rsk(const struct sock *sk,
+					struct request_sock *req,
+					int sndid, int rcvid);
 int tcp_v4_ao_hash_skb(char *ao_hash, struct tcp_ao_key *key,
 		       const struct sock *sk, const struct sk_buff *skb,
 		       const u8 *tkey, int hash_offset, u32 sne);
@@ -154,11 +162,21 @@ int tcp_v4_ao_hash_skb(char *ao_hash, struct tcp_ao_key *key,
 int tcp_v6_ao_hash_pseudoheader(struct tcp_sigpool *hp,
 				const struct in6_addr *daddr,
 				const struct in6_addr *saddr, int nbytes);
+int tcp_v6_ao_calc_key_skb(struct tcp_ao_key *mkt, u8 *key,
+			   const struct sk_buff *skb, __be32 sisn, __be32 disn);
 int tcp_v6_ao_calc_key_sk(struct tcp_ao_key *mkt, u8 *key,
 			  const struct sock *sk, __be32 sisn,
 			  __be32 disn, bool send);
+int tcp_v6_ao_calc_key_rsk(struct tcp_ao_key *mkt, u8 *key,
+			   struct request_sock *req);
+struct tcp_ao_key *tcp_v6_ao_do_lookup(const struct sock *sk,
+				       const struct in6_addr *addr,
+				       int sndid, int rcvid);
 struct tcp_ao_key *tcp_v6_ao_lookup(const struct sock *sk,
 				    struct sock *addr_sk, int sndid, int rcvid);
+struct tcp_ao_key *tcp_v6_ao_lookup_rsk(const struct sock *sk,
+					struct request_sock *req,
+					int sndid, int rcvid);
 int tcp_v6_ao_hash_skb(char *ao_hash, struct tcp_ao_key *key,
 		       const struct sock *sk, const struct sk_buff *skb,
 		       const u8 *tkey, int hash_offset, u32 sne);
@@ -178,6 +196,12 @@ static inline int tcp_ao_transmit_skb(struct sock *sk, struct sk_buff *skb,
 	return 0;
 }
 
+static inline void tcp_ao_syncookie(struct sock *sk, const struct sk_buff *skb,
+				    struct tcp_request_sock *treq,
+				    unsigned short int family)
+{
+}
+
 static inline struct tcp_ao_key *tcp_ao_do_lookup(const struct sock *sk,
 		const union tcp_ao_addr *addr, int family, int sndid, int rcvid)
 {
diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c
index c64334363230..0681d3e82b11 100644
--- a/net/ipv4/syncookies.c
+++ b/net/ipv4/syncookies.c
@@ -400,6 +400,8 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb)
 	treq->snt_synack	= 0;
 	treq->tfo_listener	= false;
 
+	tcp_ao_syncookie(sk, skb, treq, AF_INET);
+
 	if (IS_ENABLED(CONFIG_SMC))
 		ireq->smc_ok = 0;
 
diff --git a/net/ipv4/tcp_ao.c b/net/ipv4/tcp_ao.c
index 7c4e2f42845a..68d81704e14e 100644
--- a/net/ipv4/tcp_ao.c
+++ b/net/ipv4/tcp_ao.c
@@ -169,6 +169,23 @@ static void tcp_ao_link_mkt(struct tcp_ao_info *ao, struct tcp_ao_key *mkt)
 	hlist_add_head_rcu(&mkt->node, &ao->head);
 }
 
+static struct tcp_ao_key *tcp_ao_copy_key(struct sock *sk,
+					  struct tcp_ao_key *key)
+{
+	struct tcp_ao_key *new_key;
+
+	new_key = sock_kmalloc(sk, tcp_ao_sizeof_key(key),
+			       GFP_ATOMIC);
+	if (!new_key)
+		return NULL;
+
+	*new_key = *key;
+	INIT_HLIST_NODE(&new_key->node);
+	tcp_sigpool_get(new_key->tcp_sigpool_id);
+
+	return new_key;
+}
+
 static void tcp_ao_key_free_rcu(struct rcu_head *head)
 {
 	struct tcp_ao_key *key = container_of(head, struct tcp_ao_key, rcu);
@@ -290,6 +307,42 @@ static int tcp_ao_calc_key_sk(struct tcp_ao_key *mkt, u8 *key,
 		return -EOPNOTSUPP;
 }
 
+int tcp_v4_ao_calc_key_rsk(struct tcp_ao_key *mkt, u8 *key,
+			   struct request_sock *req)
+{
+	struct inet_request_sock *ireq = inet_rsk(req);
+
+	return tcp_v4_ao_calc_key(mkt, key,
+				  ireq->ir_loc_addr, ireq->ir_rmt_addr,
+				  htons(ireq->ir_num), ireq->ir_rmt_port,
+				  htonl(tcp_rsk(req)->snt_isn),
+				  htonl(tcp_rsk(req)->rcv_isn));
+}
+
+static int tcp_v4_ao_calc_key_skb(struct tcp_ao_key *mkt, u8 *key,
+				  const struct sk_buff *skb,
+				  __be32 sisn, __be32 disn)
+{
+	const struct iphdr *iph = ip_hdr(skb);
+	const struct tcphdr *th = tcp_hdr(skb);
+
+	return tcp_v4_ao_calc_key(mkt, key, iph->saddr, iph->daddr,
+				  th->source, th->dest, sisn, disn);
+}
+
+static int tcp_ao_calc_key_skb(struct tcp_ao_key *mkt, u8 *key,
+			       const struct sk_buff *skb,
+			       __be32 sisn, __be32 disn, int family)
+{
+	if (family == AF_INET)
+		return tcp_v4_ao_calc_key_skb(mkt, key, skb, sisn, disn);
+#if IS_ENABLED(CONFIG_IPV6)
+	else if (family == AF_INET6)
+		return tcp_v6_ao_calc_key_skb(mkt, key, skb, sisn, disn);
+#endif
+	return -EAFNOSUPPORT;
+}
+
 static int tcp_v4_ao_hash_pseudoheader(struct tcp_sigpool *hp,
 				       __be32 daddr, __be32 saddr,
 				       int nbytes)
@@ -515,6 +568,16 @@ int tcp_v4_ao_hash_skb(char *ao_hash, struct tcp_ao_key *key,
 			       tkey, hash_offset, sne);
 }
 
+struct tcp_ao_key *tcp_v4_ao_lookup_rsk(const struct sock *sk,
+					struct request_sock *req,
+					int sndid, int rcvid)
+{
+	union tcp_ao_addr *addr =
+			(union tcp_ao_addr *)&inet_rsk(req)->ir_rmt_addr;
+
+	return tcp_ao_do_lookup(sk, addr, AF_INET, sndid, rcvid);
+}
+
 struct tcp_ao_key *tcp_v4_ao_lookup(const struct sock *sk, struct sock *addr_sk,
 				    int sndid, int rcvid)
 {
@@ -528,7 +591,7 @@ int tcp_ao_prepare_reset(const struct sock *sk, struct sk_buff *skb,
 			 struct tcp_ao_key **key, char **traffic_key,
 			 bool *allocated_traffic_key, u8 *keyid, u32 *sne)
 {
-	struct tcp_ao_key *rnext_key;
+	const struct tcphdr *th = tcp_hdr(skb);
 	struct tcp_ao_info *ao_info;
 
 	*allocated_traffic_key = false;
@@ -543,23 +606,62 @@ int tcp_ao_prepare_reset(const struct sock *sk, struct sk_buff *skb,
 		return -ENOTCONN;
 
 	if ((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_NEW_SYN_RECV)) {
-		return -1;
+		unsigned int family = READ_ONCE(sk->sk_family);
+		union tcp_ao_addr *addr;
+		__be32 disn, sisn;
 
-	if (sk->sk_state == TCP_TIME_WAIT)
-		ao_info = rcu_dereference(tcp_twsk(sk)->ao_info);
-	else
+		if (sk->sk_state == TCP_NEW_SYN_RECV) {
+			struct request_sock *req = inet_reqsk(sk);
+
+			sisn = htonl(tcp_rsk(req)->rcv_isn);
+			disn = htonl(tcp_rsk(req)->snt_isn);
+			*sne = 0;
+		} else {
+			sisn = th->seq;
+			disn = 0;
+		}
+		if (IS_ENABLED(CONFIG_IPV6) && family == AF_INET6)
+			addr = (union tcp_md5_addr *)&ipv6_hdr(skb)->saddr;
+		else
+			addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
+#if IS_ENABLED(CONFIG_IPV6)
+		if (family == AF_INET6 && ipv6_addr_v4mapped(&sk->sk_v6_daddr))
+			family = AF_INET;
+#endif
+
+		sk = sk_const_to_full_sk(sk);
 		ao_info = rcu_dereference(tcp_sk(sk)->ao_info);
-	if (!ao_info)
-		return -ENOENT;
+		if (!ao_info)
+			return -ENOENT;
+		*key = tcp_ao_do_lookup(sk, addr, family, -1, aoh->rnext_keyid);
+		if (!*key)
+			return -ENOENT;
+		*traffic_key = kmalloc(tcp_ao_digest_size(*key), GFP_ATOMIC);
+		if (!*traffic_key)
+			return -ENOMEM;
+		*allocated_traffic_key = true;
+		if (tcp_ao_calc_key_skb(*key, *traffic_key, skb,
+					sisn, disn, family))
+			return -1;
+		*keyid = (*key)->rcvid;
+	} else {
+		struct tcp_ao_key *rnext_key;
 
-	*key = tcp_ao_established_key(ao_info, aoh->rnext_keyid, -1);
-	if (!*key)
-		return -ENOENT;
-	*traffic_key = snd_other_key(*key);
-	rnext_key = READ_ONCE(ao_info->rnext_key);
-	*keyid = rnext_key->rcvid;
-	*sne = 0;
+		if (sk->sk_state == TCP_TIME_WAIT)
+			ao_info = rcu_dereference(tcp_twsk(sk)->ao_info);
+		else
+			ao_info = rcu_dereference(tcp_sk(sk)->ao_info);
+		if (!ao_info)
+			return -ENOENT;
 
+		*key = tcp_ao_established_key(ao_info, aoh->rnext_keyid, -1);
+		if (!*key)
+			return -ENOENT;
+		*traffic_key = snd_other_key(*key);
+		rnext_key = READ_ONCE(ao_info->rnext_key);
+		*keyid = rnext_key->rcvid;
+		*sne = 0;
+	}
 	return 0;
 }
 
@@ -597,6 +699,46 @@ int tcp_ao_transmit_skb(struct sock *sk, struct sk_buff *skb,
 	return 0;
 }
 
+static struct tcp_ao_key *tcp_ao_inbound_lookup(unsigned short int family,
+		const struct sock *sk, const struct sk_buff *skb,
+		int sndid, int rcvid)
+{
+	if (family == AF_INET) {
+		const struct iphdr *iph = ip_hdr(skb);
+
+		return tcp_ao_do_lookup(sk, (union tcp_ao_addr *)&iph->saddr,
+				AF_INET, sndid, rcvid);
+	} else {
+		const struct ipv6hdr *iph = ipv6_hdr(skb);
+
+		return tcp_ao_do_lookup(sk, (union tcp_ao_addr *)&iph->saddr,
+				AF_INET6, sndid, rcvid);
+	}
+}
+
+void tcp_ao_syncookie(struct sock *sk, const struct sk_buff *skb,
+		      struct tcp_request_sock *treq,
+		      unsigned short int family)
+{
+	const struct tcphdr *th = tcp_hdr(skb);
+	const struct tcp_ao_hdr *aoh;
+	struct tcp_ao_key *key;
+
+	treq->maclen = 0;
+
+	if (tcp_parse_auth_options(th, NULL, &aoh) || !aoh)
+		return;
+
+	key = tcp_ao_inbound_lookup(family, sk, skb, -1, aoh->keyid);
+	if (!key)
+		/* Key not found, continue without TCP-AO */
+		return;
+
+	treq->ao_rcv_next = aoh->keyid;
+	treq->ao_keyid = aoh->rnext_keyid;
+	treq->maclen = tcp_ao_maclen(key);
+}
+
 static int tcp_ao_cache_traffic_keys(const struct sock *sk,
 				     struct tcp_ao_info *ao,
 				     struct tcp_ao_key *ao_key)
@@ -704,6 +846,100 @@ void tcp_ao_finish_connect(struct sock *sk, struct sk_buff *skb)
 		tcp_ao_cache_traffic_keys(sk, ao, key);
 }
 
+int tcp_ao_copy_all_matching(const struct sock *sk, struct sock *newsk,
+			     struct request_sock *req, struct sk_buff *skb,
+			     int family)
+{
+	struct tcp_ao_key *key, *new_key, *first_key;
+	struct tcp_ao_info *new_ao, *ao;
+	struct hlist_node *key_head;
+	union tcp_ao_addr *addr;
+	bool match = false;
+	int ret = -ENOMEM;
+
+	ao = rcu_dereference(tcp_sk(sk)->ao_info);
+	if (!ao)
+		return 0;
+
+	/* New socket without TCP-AO on it */
+	if (!tcp_rsk_used_ao(req))
+		return 0;
+
+	new_ao = tcp_ao_alloc_info(GFP_ATOMIC);
+	if (!new_ao)
+		return -ENOMEM;
+	new_ao->lisn = htonl(tcp_rsk(req)->snt_isn);
+	new_ao->risn = htonl(tcp_rsk(req)->rcv_isn);
+	new_ao->ao_required = ao->ao_required;
+
+	if (family == AF_INET) {
+		addr = (union tcp_ao_addr *)&newsk->sk_daddr;
+#if IS_ENABLED(CONFIG_IPV6)
+	} else if (family == AF_INET6) {
+		addr = (union tcp_ao_addr *)&newsk->sk_v6_daddr;
+#endif
+	} else {
+		ret = -EAFNOSUPPORT;
+		goto free_ao;
+	}
+
+	hlist_for_each_entry_rcu(key, &ao->head, node) {
+		if (tcp_ao_key_cmp(key, addr, key->prefixlen, family, -1, -1))
+			continue;
+
+		new_key = tcp_ao_copy_key(newsk, key);
+		if (!new_key)
+			goto free_and_exit;
+
+		tcp_ao_cache_traffic_keys(newsk, new_ao, new_key);
+		tcp_ao_link_mkt(new_ao, new_key);
+		match = true;
+	}
+
+	if (!match) {
+		/* RFC5925 (7.4.1) specifies that the TCP-AO status
+		 * of a connection is determined on the initial SYN.
+		 * At this point the connection was TCP-AO enabled, so
+		 * it can't switch to being unsigned if peer's key
+		 * disappears on the listening socket.
+		 */
+		ret = -EKEYREJECTED;
+		goto free_and_exit;
+	}
+
+	key_head = rcu_dereference(hlist_first_rcu(&new_ao->head));
+	first_key = hlist_entry_safe(key_head, struct tcp_ao_key, node);
+
+	key = tcp_ao_established_key(new_ao, tcp_rsk(req)->ao_keyid, -1);
+	if (key)
+		new_ao->current_key = key;
+	else
+		new_ao->current_key = first_key;
+
+	/* set rnext_key */
+	key = tcp_ao_established_key(new_ao, -1, tcp_rsk(req)->ao_rcv_next);
+	if (key)
+		new_ao->rnext_key = key;
+	else
+		new_ao->rnext_key = first_key;
+
+	sk_gso_disable(newsk);
+	rcu_assign_pointer(tcp_sk(newsk)->ao_info, new_ao);
+
+	return 0;
+
+free_and_exit:
+	hlist_for_each_entry_safe(key, key_head, &new_ao->head, node) {
+		hlist_del(&key->node);
+		tcp_sigpool_release(key->tcp_sigpool_id);
+		atomic_sub(tcp_ao_sizeof_key(key), &newsk->sk_omem_alloc);
+		kfree_sensitive(key);
+	}
+free_ao:
+	kfree(new_ao);
+	return ret;
+}
+
 static bool tcp_ao_can_set_current_rnext(struct sock *sk)
 {
 	/* There aren't current/rnext keys on TCP_LISTEN sockets */
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index fc42b172abf6..de64c0c2fb69 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -7045,6 +7045,10 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops,
 	struct flowi fl;
 	u8 syncookies;
 
+#ifdef CONFIG_TCP_AO
+	const struct tcp_ao_hdr *aoh;
+#endif
+
 	syncookies = READ_ONCE(net->ipv4.sysctl_tcp_syncookies);
 
 	/* TW buckets are converted to open requests without
@@ -7131,6 +7135,17 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops,
 			inet_rsk(req)->ecn_ok = 0;
 	}
 
+#ifdef CONFIG_TCP_AO
+	if (tcp_parse_auth_options(tcp_hdr(skb), NULL, &aoh))
+		goto drop_and_release; /* Invalid TCP options */
+	if (aoh) {
+		tcp_rsk(req)->maclen = aoh->length - sizeof(struct tcp_ao_hdr);
+		tcp_rsk(req)->ao_rcv_next = aoh->keyid;
+		tcp_rsk(req)->ao_keyid = aoh->rnext_keyid;
+	} else {
+		tcp_rsk(req)->maclen = 0;
+	}
+#endif
 	tcp_rsk(req)->snt_isn = isn;
 	tcp_rsk(req)->txhash = net_tx_rndhash();
 	tcp_rsk(req)->syn_tos = TCP_SKB_CB(skb)->ip_dsfield;
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index a78112d78d06..2bd56b57f3c9 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -1072,13 +1072,47 @@ static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
 	u32 seq = (sk->sk_state == TCP_LISTEN) ? tcp_rsk(req)->snt_isn + 1 :
 					     tcp_sk(sk)->snd_nxt;
 
-	/* RFC 7323 2.3
-	 * The window field (SEG.WND) of every outgoing segment, with the
-	 * exception of <SYN> segments, MUST be right-shifted by
-	 * Rcv.Wind.Shift bits:
-	 */
+#ifdef CONFIG_TCP_AO
+	if (tcp_rsk_used_ao(req)) {
+		const union tcp_md5_addr *addr;
+		const struct tcp_ao_hdr *aoh;
+
+		/* Invalid TCP option size or twice included auth */
+		if (tcp_parse_auth_options(tcp_hdr(skb), NULL, &aoh))
+			return;
+		if (!aoh)
+			return;
+
+		addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
+		key.ao_key = tcp_ao_do_lookup(sk, addr, AF_INET,
+					      aoh->rnext_keyid, -1);
+		if (unlikely(!key.ao_key)) {
+			/* Send ACK with any matching MKT for the peer */
+			key.ao_key = tcp_ao_do_lookup(sk, addr, AF_INET, -1, -1);
+			/* Matching key disappeared (user removed the key?)
+			 * let the handshake timeout.
+			 */
+			if (!key.ao_key) {
+				net_info_ratelimited("TCP-AO key for (%pI4, %d)->(%pI4, %d) suddenly disappeared, won't ACK new connection\n",
+						     addr,
+						     ntohs(tcp_hdr(skb)->source),
+						     &ip_hdr(skb)->daddr,
+						     ntohs(tcp_hdr(skb)->dest));
+				return;
+			}
+		}
+		key.traffic_key = kmalloc(tcp_ao_digest_size(key.ao_key), GFP_ATOMIC);
+		if (!key.traffic_key)
+			return;
+
+		key.type = TCP_KEY_AO;
+		key.rcv_next = aoh->keyid;
+		tcp_v4_ao_calc_key_rsk(key.ao_key, key.traffic_key, req);
+#else
+	if (0) {
+#endif
 #ifdef CONFIG_TCP_MD5SIG
-	if (static_branch_unlikely(&tcp_md5_needed.key)) {
+	} else if (static_branch_unlikely(&tcp_md5_needed.key)) {
 		const union tcp_md5_addr *addr;
 		int l3index;
 
@@ -1087,8 +1121,14 @@ static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
 		key.md5_key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
 		if (key.md5_key)
 			key.type = TCP_KEY_MD5;
-	}
 #endif
+	}
+
+	/* RFC 7323 2.3
+	 * The window field (SEG.WND) of every outgoing segment, with the
+	 * exception of <SYN> segments, MUST be right-shifted by
+	 * Rcv.Wind.Shift bits:
+	 */
 	tcp_v4_send_ack(sk, skb, seq,
 			tcp_rsk(req)->rcv_nxt,
 			req->rsk_rcv_wnd >> inet_rsk(req)->rcv_wscale,
@@ -1098,6 +1138,8 @@ static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
 			inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0,
 			ip_hdr(skb)->tos,
 			READ_ONCE(tcp_rsk(req)->txhash));
+	if (tcp_key_is_ao(&key))
+		kfree(key.traffic_key);
 }
 
 /*
@@ -1636,6 +1678,10 @@ const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
 	.req_md5_lookup	=	tcp_v4_md5_lookup,
 	.calc_md5_hash	=	tcp_v4_md5_hash_skb,
 #endif
+#ifdef CONFIG_TCP_AO
+	.ao_lookup	=	tcp_v4_ao_lookup_rsk,
+	.ao_calc_key	=	tcp_v4_ao_calc_key_rsk,
+#endif
 #ifdef CONFIG_SYN_COOKIES
 	.cookie_init_seq =	cookie_v4_init_sequence,
 #endif
@@ -1737,12 +1783,16 @@ struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb,
 	/* Copy over the MD5 key from the original socket */
 	addr = (union tcp_md5_addr *)&newinet->inet_daddr;
 	key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
-	if (key) {
+	if (key && !tcp_rsk_used_ao(req)) {
 		if (tcp_md5_key_copy(newsk, addr, AF_INET, 32, l3index, key))
 			goto put_and_exit;
 		sk_gso_disable(newsk);
 	}
 #endif
+#ifdef CONFIG_TCP_AO
+	if (tcp_ao_copy_all_matching(sk, newsk, req, skb, AF_INET))
+		goto put_and_exit; /* OOM, release back memory */
+#endif
 
 	if (__inet_inherit_port(sk, newsk) < 0)
 		goto put_and_exit;
diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
index 6810cf65a322..8d941a6e066b 100644
--- a/net/ipv4/tcp_minisocks.c
+++ b/net/ipv4/tcp_minisocks.c
@@ -506,6 +506,9 @@ struct sock *tcp_create_openreq_child(const struct sock *sk,
 	const struct tcp_sock *oldtp;
 	struct tcp_sock *newtp;
 	u32 seq;
+#ifdef CONFIG_TCP_AO
+	struct tcp_ao_key *ao_key;
+#endif
 
 	if (!newsk)
 		return NULL;
@@ -594,6 +597,13 @@ struct sock *tcp_create_openreq_child(const struct sock *sk,
 #ifdef CONFIG_TCP_MD5SIG
 	newtp->md5sig_info = NULL;	/*XXX*/
 #endif
+#ifdef CONFIG_TCP_AO
+	newtp->ao_info = NULL;
+	ao_key = treq->af_specific->ao_lookup(sk, req,
+				tcp_rsk(req)->ao_keyid, -1);
+	if (ao_key)
+		newtp->tcp_header_len += tcp_ao_len(ao_key);
+ #endif
 	if (skb->len >= TCP_MSS_DEFAULT + newtp->tcp_header_len)
 		newicsk->icsk_ack.last_seg_size = skb->len - newtp->tcp_header_len;
 	newtp->rx_opt.mss_clamp = req->mss;
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index de546313a214..d44ef88025fb 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -615,6 +615,7 @@ static void bpf_skops_write_hdr_opt(struct sock *sk, struct sk_buff *skb,
  * (but it may well be that other scenarios fail similarly).
  */
 static void tcp_options_write(struct tcphdr *th, struct tcp_sock *tp,
+			      const struct tcp_request_sock *tcprsk,
 			      struct tcp_out_options *opts,
 			      struct tcp_key *key)
 {
@@ -629,20 +630,28 @@ static void tcp_options_write(struct tcphdr *th, struct tcp_sock *tp,
 		ptr += 4;
 	} else if (tcp_key_is_ao(key)) {
 #ifdef CONFIG_TCP_AO
-		struct tcp_ao_key *rnext_key;
-		struct tcp_ao_info *ao_info;
-		u8 maclen;
+		u8 maclen = tcp_ao_maclen(key->ao_key);
 
-		ao_info = rcu_dereference_check(tp->ao_info,
+		if (tcprsk) {
+			u8 aolen = maclen + sizeof(struct tcp_ao_hdr);
+
+			*ptr++ = htonl((TCPOPT_AO << 24) | (aolen << 16) |
+				       (tcprsk->ao_keyid << 8) |
+				       (tcprsk->ao_rcv_next));
+		} else {
+			struct tcp_ao_key *rnext_key;
+			struct tcp_ao_info *ao_info;
+
+			ao_info = rcu_dereference_check(tp->ao_info,
 				lockdep_sock_is_held(&tp->inet_conn.icsk_inet.sk));
-		rnext_key = READ_ONCE(ao_info->rnext_key);
-		if (WARN_ON_ONCE(!rnext_key))
-			goto out_ao;
-		maclen = tcp_ao_maclen(key->ao_key);
-		*ptr++ = htonl((TCPOPT_AO << 24) |
-				(tcp_ao_len(key->ao_key) << 16) |
-				(key->ao_key->sndid << 8) |
-				(rnext_key->rcvid));
+			rnext_key = READ_ONCE(ao_info->rnext_key);
+			if (WARN_ON_ONCE(!rnext_key))
+				goto out_ao;
+			*ptr++ = htonl((TCPOPT_AO << 24) |
+				       (tcp_ao_len(key->ao_key) << 16) |
+				       (key->ao_key->sndid << 8) |
+				       (rnext_key->rcvid));
+		}
 		opts->hash_location = (__u8 *)ptr;
 		ptr += maclen / sizeof(*ptr);
 		if (unlikely(maclen % sizeof(*ptr))) {
@@ -1386,7 +1395,7 @@ static int __tcp_transmit_skb(struct sock *sk, struct sk_buff *skb,
 		th->window	= htons(min(tp->rcv_wnd, 65535U));
 	}
 
-	tcp_options_write(th, tp, &opts, &key);
+	tcp_options_write(th, tp, NULL, &opts, &key);
 
 	if (tcp_key_is_md5(&key)) {
 #ifdef CONFIG_TCP_MD5SIG
@@ -3729,7 +3738,7 @@ struct sk_buff *tcp_make_synack(const struct sock *sk, struct dst_entry *dst,
 
 	/* RFC1323: The window in SYN & SYN/ACK segments is never scaled. */
 	th->window = htons(min(req->rsk_rcv_wnd, 65535U));
-	tcp_options_write(th, NULL, &opts, &key);
+	tcp_options_write(th, NULL, NULL, &opts, &key);
 	th->doff = (tcp_header_size >> 2);
 	TCP_INC_STATS(sock_net(sk), TCP_MIB_OUTSEGS);
 
diff --git a/net/ipv6/syncookies.c b/net/ipv6/syncookies.c
index 5014aa663452..ad7a8caa7b2a 100644
--- a/net/ipv6/syncookies.c
+++ b/net/ipv6/syncookies.c
@@ -214,6 +214,8 @@ struct sock *cookie_v6_check(struct sock *sk, struct sk_buff *skb)
 	treq->snt_isn = cookie;
 	treq->ts_off = 0;
 	treq->txhash = net_tx_rndhash();
+	tcp_ao_syncookie(sk, skb, treq, AF_INET6);
+
 	if (IS_ENABLED(CONFIG_SMC))
 		ireq->smc_ok = 0;
 
diff --git a/net/ipv6/tcp_ao.c b/net/ipv6/tcp_ao.c
index d08735b6f3c5..c9a6fa84f6ce 100644
--- a/net/ipv6/tcp_ao.c
+++ b/net/ipv6/tcp_ao.c
@@ -49,6 +49,17 @@ static int tcp_v6_ao_calc_key(struct tcp_ao_key *mkt, u8 *key,
 	return err;
 }
 
+int tcp_v6_ao_calc_key_skb(struct tcp_ao_key *mkt, u8 *key,
+			   const struct sk_buff *skb,
+			   __be32 sisn, __be32 disn)
+{
+       const struct ipv6hdr *iph = ipv6_hdr(skb);
+       const struct tcphdr *th = tcp_hdr(skb);
+
+       return tcp_v6_ao_calc_key(mkt, key, &iph->saddr, &iph->daddr,
+				 th->source, th->dest, sisn, disn);
+}
+
 int tcp_v6_ao_calc_key_sk(struct tcp_ao_key *mkt, u8 *key,
 			  const struct sock *sk, __be32 sisn,
 			  __be32 disn, bool send)
@@ -63,9 +74,21 @@ int tcp_v6_ao_calc_key_sk(struct tcp_ao_key *mkt, u8 *key,
 					  htons(sk->sk_num), disn, sisn);
 }
 
-static struct tcp_ao_key *tcp_v6_ao_do_lookup(const struct sock *sk,
-					      const struct in6_addr *addr,
-					      int sndid, int rcvid)
+int tcp_v6_ao_calc_key_rsk(struct tcp_ao_key *mkt, u8 *key,
+			   struct request_sock *req)
+{
+	struct inet_request_sock *ireq = inet_rsk(req);
+
+	return tcp_v6_ao_calc_key(mkt, key,
+			&ireq->ir_v6_loc_addr, &ireq->ir_v6_rmt_addr,
+			htons(ireq->ir_num), ireq->ir_rmt_port,
+			htonl(tcp_rsk(req)->snt_isn),
+			htonl(tcp_rsk(req)->rcv_isn));
+}
+
+struct tcp_ao_key *tcp_v6_ao_do_lookup(const struct sock *sk,
+				       const struct in6_addr *addr,
+				       int sndid, int rcvid)
 {
 	return tcp_ao_do_lookup(sk, (union tcp_ao_addr *)addr, AF_INET6,
 				sndid, rcvid);
@@ -80,6 +103,15 @@ struct tcp_ao_key *tcp_v6_ao_lookup(const struct sock *sk,
 	return tcp_v6_ao_do_lookup(sk, addr, sndid, rcvid);
 }
 
+struct tcp_ao_key *tcp_v6_ao_lookup_rsk(const struct sock *sk,
+					struct request_sock *req,
+					int sndid, int rcvid)
+{
+	struct in6_addr *addr = &inet_rsk(req)->ir_v6_rmt_addr;
+
+	return tcp_v6_ao_do_lookup(sk, addr, sndid, rcvid);
+}
+
 int tcp_v6_ao_hash_pseudoheader(struct tcp_sigpool *hp,
 				const struct in6_addr *daddr,
 				const struct in6_addr *saddr, int nbytes)
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index e6c3617976f0..b08b177847da 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -836,6 +836,10 @@ const struct tcp_request_sock_ops tcp_request_sock_ipv6_ops = {
 	.req_md5_lookup	=	tcp_v6_md5_lookup,
 	.calc_md5_hash	=	tcp_v6_md5_hash_skb,
 #endif
+#ifdef CONFIG_TCP_AO
+	.ao_lookup	=	tcp_v6_ao_lookup_rsk,
+	.ao_calc_key	=	tcp_v6_ao_calc_key_rsk,
+#endif
 #ifdef CONFIG_SYN_COOKIES
 	.cookie_init_seq =	cookie_v6_init_sequence,
 #endif
@@ -1192,16 +1196,54 @@ static void tcp_v6_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
 {
 	struct tcp_key key = {};
 
+#ifdef CONFIG_TCP_AO
+	if (tcp_rsk_used_ao(req)) {
+		const struct in6_addr *addr = &ipv6_hdr(skb)->saddr;
+		const struct tcp_ao_hdr *aoh;
+		int l3index;
+
+		l3index = tcp_v6_sdif(skb) ? tcp_v6_iif_l3_slave(skb) : 0;
+		/* Invalid TCP option size or twice included auth */
+		if (tcp_parse_auth_options(tcp_hdr(skb), NULL, &aoh))
+			return;
+		if (!aoh)
+			return;
+		key.ao_key = tcp_v6_ao_do_lookup(sk, addr, aoh->rnext_keyid, -1);
+		if (unlikely(!key.ao_key)) {
+			/* Send ACK with any matching MKT for the peer */
+			key.ao_key = tcp_v6_ao_do_lookup(sk, addr, -1, -1);
+			/* Matching key disappeared (user removed the key?)
+			 * let the handshake timeout.
+			 */
+			if (!key.ao_key) {
+				net_info_ratelimited("TCP-AO key for (%pI6, %d)->(%pI6, %d) suddenly disappeared, won't ACK new connection\n",
+						     addr,
+						     ntohs(tcp_hdr(skb)->source),
+						     &ipv6_hdr(skb)->daddr,
+						     ntohs(tcp_hdr(skb)->dest));
+				return;
+			}
+		}
+		key.traffic_key = kmalloc(tcp_ao_digest_size(key.ao_key), GFP_ATOMIC);
+		if (!key.traffic_key)
+			return;
+
+		key.type = TCP_KEY_AO;
+		key.rcv_next = aoh->keyid;
+		tcp_v6_ao_calc_key_rsk(key.ao_key, key.traffic_key, req);
+#else
+	if (0) {
+#endif
 #ifdef CONFIG_TCP_MD5SIG
-	if (static_branch_unlikely(&tcp_md5_needed.key)) {
+	} else if (static_branch_unlikely(&tcp_md5_needed.key)) {
 		int l3index = tcp_v6_sdif(skb) ? tcp_v6_iif_l3_slave(skb) : 0;
 
 		key.md5_key = tcp_v6_md5_do_lookup(sk, &ipv6_hdr(skb)->saddr,
 						   l3index);
 		if (key.md5_key)
 			key.type = TCP_KEY_MD5;
-	}
 #endif
+	}
 
 	/* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV
 	 * sk->sk_state == TCP_SYN_RECV -> for Fast Open.
@@ -1220,6 +1262,8 @@ static void tcp_v6_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
 			&key, ipv6_get_dsfield(ipv6_hdr(skb)), 0,
 			READ_ONCE(sk->sk_priority),
 			READ_ONCE(tcp_rsk(req)->txhash));
+	if (tcp_key_is_ao(&key))
+		kfree(key.traffic_key);
 }
 
 
@@ -1449,19 +1493,26 @@ static struct sock *tcp_v6_syn_recv_sock(const struct sock *sk, struct sk_buff *
 #ifdef CONFIG_TCP_MD5SIG
 	l3index = l3mdev_master_ifindex_by_index(sock_net(sk), ireq->ir_iif);
 
-	/* Copy over the MD5 key from the original socket */
-	key = tcp_v6_md5_do_lookup(sk, &newsk->sk_v6_daddr, l3index);
-	if (key) {
-		const union tcp_md5_addr *addr;
-
-		addr = (union tcp_md5_addr *)&newsk->sk_v6_daddr;
-		if (tcp_md5_key_copy(newsk, addr, AF_INET6, 128, l3index, key)) {
-			inet_csk_prepare_forced_close(newsk);
-			tcp_done(newsk);
-			goto out;
+	if (!tcp_rsk_used_ao(req)) {
+		/* Copy over the MD5 key from the original socket */
+		key = tcp_v6_md5_do_lookup(sk, &newsk->sk_v6_daddr, l3index);
+		if (key) {
+			const union tcp_md5_addr *addr;
+
+			addr = (union tcp_md5_addr *)&newsk->sk_v6_daddr;
+			if (tcp_md5_key_copy(newsk, addr, AF_INET6, 128, l3index, key)) {
+				inet_csk_prepare_forced_close(newsk);
+				tcp_done(newsk);
+				goto out;
+			}
 		}
 	}
 #endif
+#ifdef CONFIG_TCP_AO
+	/* Copy over tcp_ao_info if any */
+	if (tcp_ao_copy_all_matching(sk, newsk, req, skb, AF_INET6))
+		goto out; /* OOM */
+#endif
 
 	if (__inet_inherit_port(sk, newsk) < 0) {
 		inet_csk_prepare_forced_close(newsk);
-- 
cgit v1.2.3


From 62f47bf9e2c00eea457cad8fa43c24ed0282b37a Mon Sep 17 00:00:00 2001
From: Ido Schimmel <idosch@nvidia.com>
Date: Wed, 25 Oct 2023 15:30:15 +0300
Subject: net: Add MDB get device operation

Add MDB net device operation that will be invoked by rtnetlink code in
response to received RTM_GETMDB messages. Subsequent patches will
implement the operation in the bridge and VXLAN drivers.

Signed-off-by: Ido Schimmel <idosch@nvidia.com>
Acked-by: Nikolay Aleksandrov <razor@blackwall.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netdevice.h | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index b8bf669212cc..a16c9cc063fe 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -1609,6 +1609,10 @@ struct net_device_ops {
 	int			(*ndo_mdb_dump)(struct net_device *dev,
 						struct sk_buff *skb,
 						struct netlink_callback *cb);
+	int			(*ndo_mdb_get)(struct net_device *dev,
+					       struct nlattr *tb[], u32 portid,
+					       u32 seq,
+					       struct netlink_ext_ack *extack);
 	int			(*ndo_bridge_setlink)(struct net_device *dev,
 						      struct nlmsghdr *nlh,
 						      u16 flags,
-- 
cgit v1.2.3


From e8eed5f7366f1f5decb694168bd06fb59ef6b12c Mon Sep 17 00:00:00 2001
From: Damian Muszynski <damian.muszynski@intel.com>
Date: Fri, 20 Oct 2023 15:49:25 +0200
Subject: units: Add BYTES_PER_*BIT

There is going to be a new user of the BYTES_PER_[K/M/G]BIT definition
besides possibly existing ones. Add them to the header.

Signed-off-by: Damian Muszynski <damian.muszynski@intel.com>
Reviewed-by: Giovanni Cabiddu <giovanni.cabiddu@intel.com>
Reviewed-by: Tero Kristo <tero.kristo@linux.intel.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 include/linux/units.h | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/units.h b/include/linux/units.h
index 2793a41e73a2..ff1bd6b5f5b3 100644
--- a/include/linux/units.h
+++ b/include/linux/units.h
@@ -31,6 +31,10 @@
 #define MICROWATT_PER_MILLIWATT	1000UL
 #define MICROWATT_PER_WATT	1000000UL
 
+#define BYTES_PER_KBIT		(KILO / BITS_PER_BYTE)
+#define BYTES_PER_MBIT		(MEGA / BITS_PER_BYTE)
+#define BYTES_PER_GBIT		(GIGA / BITS_PER_BYTE)
+
 #define ABSOLUTE_ZERO_MILLICELSIUS -273150
 
 static inline long milli_kelvin_to_millicelsius(long t)
-- 
cgit v1.2.3


From c626910f3f1bbce6ad18bc613d895d2a089ed95e Mon Sep 17 00:00:00 2001
From: Eric Biggers <ebiggers@google.com>
Date: Sun, 22 Oct 2023 01:10:44 -0700
Subject: crypto: ahash - remove support for nonzero alignmask

Currently, the ahash API checks the alignment of all key and result
buffers against the algorithm's declared alignmask, and for any
unaligned buffers it falls back to manually aligned temporary buffers.

This is virtually useless, however.  First, since it does not apply to
the message, its effect is much more limited than e.g. is the case for
the alignmask for "skcipher".  Second, the key and result buffers are
given as virtual addresses and cannot (in general) be DMA'ed into, so
drivers end up having to copy to/from them in software anyway.  As a
result it's easy to use memcpy() or the unaligned access helpers.

The crypto_hash_walk_*() helper functions do use the alignmask to align
the message.  But with one exception those are only used for shash
algorithms being exposed via the ahash API, not for native ahashes, and
aligning the message is not required in this case, especially now that
alignmask support has been removed from shash.  The exception is the
n2_core driver, which doesn't set an alignmask.

In any case, no ahash algorithms actually set a nonzero alignmask
anymore.  Therefore, remove support for it from ahash.  The benefit is
that all the code to handle "misaligned" buffers in the ahash API goes
away, reducing the overhead of the ahash API.

This follows the same change that was made to shash.

Signed-off-by: Eric Biggers <ebiggers@google.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 Documentation/crypto/devel-algos.rst |   4 +-
 crypto/ahash.c                       | 117 +++--------------------------------
 crypto/shash.c                       |   8 +--
 include/crypto/internal/hash.h       |   4 +-
 include/linux/crypto.h               |  27 ++++----
 5 files changed, 28 insertions(+), 132 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/crypto/devel-algos.rst b/Documentation/crypto/devel-algos.rst
index 3506899ef83e..9b7782f4f6e0 100644
--- a/Documentation/crypto/devel-algos.rst
+++ b/Documentation/crypto/devel-algos.rst
@@ -235,6 +235,4 @@ Specifics Of Asynchronous HASH Transformation
 
 Some of the drivers will want to use the Generic ScatterWalk in case the
 implementation needs to be fed separate chunks of the scatterlist which
-contains the input data. The buffer containing the resulting hash will
-always be properly aligned to .cra_alignmask so there is no need to
-worry about this.
+contains the input data.
diff --git a/crypto/ahash.c b/crypto/ahash.c
index 213bb3e9f245..744fd3b8ea25 100644
--- a/crypto/ahash.c
+++ b/crypto/ahash.c
@@ -35,21 +35,12 @@ struct ahash_request_priv {
 
 static int hash_walk_next(struct crypto_hash_walk *walk)
 {
-	unsigned int alignmask = walk->alignmask;
 	unsigned int offset = walk->offset;
 	unsigned int nbytes = min(walk->entrylen,
 				  ((unsigned int)(PAGE_SIZE)) - offset);
 
 	walk->data = kmap_local_page(walk->pg);
 	walk->data += offset;
-
-	if (offset & alignmask) {
-		unsigned int unaligned = alignmask + 1 - (offset & alignmask);
-
-		if (nbytes > unaligned)
-			nbytes = unaligned;
-	}
-
 	walk->entrylen -= nbytes;
 	return nbytes;
 }
@@ -73,23 +64,8 @@ static int hash_walk_new_entry(struct crypto_hash_walk *walk)
 
 int crypto_hash_walk_done(struct crypto_hash_walk *walk, int err)
 {
-	unsigned int alignmask = walk->alignmask;
-
 	walk->data -= walk->offset;
 
-	if (walk->entrylen && (walk->offset & alignmask) && !err) {
-		unsigned int nbytes;
-
-		walk->offset = ALIGN(walk->offset, alignmask + 1);
-		nbytes = min(walk->entrylen,
-			     (unsigned int)(PAGE_SIZE - walk->offset));
-		if (nbytes) {
-			walk->entrylen -= nbytes;
-			walk->data += walk->offset;
-			return nbytes;
-		}
-	}
-
 	kunmap_local(walk->data);
 	crypto_yield(walk->flags);
 
@@ -121,7 +97,6 @@ int crypto_hash_walk_first(struct ahash_request *req,
 		return 0;
 	}
 
-	walk->alignmask = crypto_ahash_alignmask(crypto_ahash_reqtfm(req));
 	walk->sg = req->src;
 	walk->flags = req->base.flags;
 
@@ -129,26 +104,6 @@ int crypto_hash_walk_first(struct ahash_request *req,
 }
 EXPORT_SYMBOL_GPL(crypto_hash_walk_first);
 
-static int ahash_setkey_unaligned(struct crypto_ahash *tfm, const u8 *key,
-				unsigned int keylen)
-{
-	unsigned long alignmask = crypto_ahash_alignmask(tfm);
-	int ret;
-	u8 *buffer, *alignbuffer;
-	unsigned long absize;
-
-	absize = keylen + alignmask;
-	buffer = kmalloc(absize, GFP_KERNEL);
-	if (!buffer)
-		return -ENOMEM;
-
-	alignbuffer = (u8 *)ALIGN((unsigned long)buffer, alignmask + 1);
-	memcpy(alignbuffer, key, keylen);
-	ret = tfm->setkey(tfm, alignbuffer, keylen);
-	kfree_sensitive(buffer);
-	return ret;
-}
-
 static int ahash_nosetkey(struct crypto_ahash *tfm, const u8 *key,
 			  unsigned int keylen)
 {
@@ -167,13 +122,7 @@ static void ahash_set_needkey(struct crypto_ahash *tfm)
 int crypto_ahash_setkey(struct crypto_ahash *tfm, const u8 *key,
 			unsigned int keylen)
 {
-	unsigned long alignmask = crypto_ahash_alignmask(tfm);
-	int err;
-
-	if ((unsigned long)key & alignmask)
-		err = ahash_setkey_unaligned(tfm, key, keylen);
-	else
-		err = tfm->setkey(tfm, key, keylen);
+	int err = tfm->setkey(tfm, key, keylen);
 
 	if (unlikely(err)) {
 		ahash_set_needkey(tfm);
@@ -189,7 +138,6 @@ static int ahash_save_req(struct ahash_request *req, crypto_completion_t cplt,
 			  bool has_state)
 {
 	struct crypto_ahash *tfm = crypto_ahash_reqtfm(req);
-	unsigned long alignmask = crypto_ahash_alignmask(tfm);
 	unsigned int ds = crypto_ahash_digestsize(tfm);
 	struct ahash_request *subreq;
 	unsigned int subreq_size;
@@ -203,7 +151,6 @@ static int ahash_save_req(struct ahash_request *req, crypto_completion_t cplt,
 	reqsize = ALIGN(reqsize, crypto_tfm_ctx_alignment());
 	subreq_size += reqsize;
 	subreq_size += ds;
-	subreq_size += alignmask & ~(crypto_tfm_ctx_alignment() - 1);
 
 	flags = ahash_request_flags(req);
 	gfp = (flags & CRYPTO_TFM_REQ_MAY_SLEEP) ?  GFP_KERNEL : GFP_ATOMIC;
@@ -215,7 +162,6 @@ static int ahash_save_req(struct ahash_request *req, crypto_completion_t cplt,
 	ahash_request_set_callback(subreq, flags, cplt, req);
 
 	result = (u8 *)(subreq + 1) + reqsize;
-	result = PTR_ALIGN(result, alignmask + 1);
 
 	ahash_request_set_crypt(subreq, req->src, result, req->nbytes);
 
@@ -251,56 +197,6 @@ static void ahash_restore_req(struct ahash_request *req, int err)
 	kfree_sensitive(subreq);
 }
 
-static void ahash_op_unaligned_done(void *data, int err)
-{
-	struct ahash_request *areq = data;
-
-	if (err == -EINPROGRESS)
-		goto out;
-
-	/* First copy req->result into req->priv.result */
-	ahash_restore_req(areq, err);
-
-out:
-	/* Complete the ORIGINAL request. */
-	ahash_request_complete(areq, err);
-}
-
-static int ahash_op_unaligned(struct ahash_request *req,
-			      int (*op)(struct ahash_request *),
-			      bool has_state)
-{
-	int err;
-
-	err = ahash_save_req(req, ahash_op_unaligned_done, has_state);
-	if (err)
-		return err;
-
-	err = op(req->priv);
-	if (err == -EINPROGRESS || err == -EBUSY)
-		return err;
-
-	ahash_restore_req(req, err);
-
-	return err;
-}
-
-static int crypto_ahash_op(struct ahash_request *req,
-			   int (*op)(struct ahash_request *),
-			   bool has_state)
-{
-	struct crypto_ahash *tfm = crypto_ahash_reqtfm(req);
-	unsigned long alignmask = crypto_ahash_alignmask(tfm);
-	int err;
-
-	if ((unsigned long)req->result & alignmask)
-		err = ahash_op_unaligned(req, op, has_state);
-	else
-		err = op(req);
-
-	return crypto_hash_errstat(crypto_hash_alg_common(tfm), err);
-}
-
 int crypto_ahash_final(struct ahash_request *req)
 {
 	struct crypto_ahash *tfm = crypto_ahash_reqtfm(req);
@@ -309,7 +205,7 @@ int crypto_ahash_final(struct ahash_request *req)
 	if (IS_ENABLED(CONFIG_CRYPTO_STATS))
 		atomic64_inc(&hash_get_stat(alg)->hash_cnt);
 
-	return crypto_ahash_op(req, tfm->final, true);
+	return crypto_hash_errstat(alg, tfm->final(req));
 }
 EXPORT_SYMBOL_GPL(crypto_ahash_final);
 
@@ -325,7 +221,7 @@ int crypto_ahash_finup(struct ahash_request *req)
 		atomic64_add(req->nbytes, &istat->hash_tlen);
 	}
 
-	return crypto_ahash_op(req, tfm->finup, true);
+	return crypto_hash_errstat(alg, tfm->finup(req));
 }
 EXPORT_SYMBOL_GPL(crypto_ahash_finup);
 
@@ -333,6 +229,7 @@ int crypto_ahash_digest(struct ahash_request *req)
 {
 	struct crypto_ahash *tfm = crypto_ahash_reqtfm(req);
 	struct hash_alg_common *alg = crypto_hash_alg_common(tfm);
+	int err;
 
 	if (IS_ENABLED(CONFIG_CRYPTO_STATS)) {
 		struct crypto_istat_hash *istat = hash_get_stat(alg);
@@ -342,9 +239,11 @@ int crypto_ahash_digest(struct ahash_request *req)
 	}
 
 	if (crypto_ahash_get_flags(tfm) & CRYPTO_TFM_NEED_KEY)
-		return crypto_hash_errstat(alg, -ENOKEY);
+		err = -ENOKEY;
+	else
+		err = tfm->digest(req);
 
-	return crypto_ahash_op(req, tfm->digest, false);
+	return crypto_hash_errstat(alg, err);
 }
 EXPORT_SYMBOL_GPL(crypto_ahash_digest);
 
diff --git a/crypto/shash.c b/crypto/shash.c
index 409b33f9c97c..359702c2cd02 100644
--- a/crypto/shash.c
+++ b/crypto/shash.c
@@ -541,6 +541,10 @@ int hash_prepare_alg(struct hash_alg_common *alg)
 	if (alg->digestsize > HASH_MAX_DIGESTSIZE)
 		return -EINVAL;
 
+	/* alignmask is not useful for hashes, so it is not supported. */
+	if (base->cra_alignmask)
+		return -EINVAL;
+
 	base->cra_flags &= ~CRYPTO_ALG_TYPE_MASK;
 
 	if (IS_ENABLED(CONFIG_CRYPTO_STATS))
@@ -557,10 +561,6 @@ static int shash_prepare_alg(struct shash_alg *alg)
 	if (alg->descsize > HASH_MAX_DESCSIZE)
 		return -EINVAL;
 
-	/* alignmask is not useful for shash, so it is not supported. */
-	if (base->cra_alignmask)
-		return -EINVAL;
-
 	if ((alg->export && !alg->import) || (alg->import && !alg->export))
 		return -EINVAL;
 
diff --git a/include/crypto/internal/hash.h b/include/crypto/internal/hash.h
index 8d0cd0c591a0..59c707e4dea4 100644
--- a/include/crypto/internal/hash.h
+++ b/include/crypto/internal/hash.h
@@ -18,15 +18,13 @@ struct crypto_hash_walk {
 	char *data;
 
 	unsigned int offset;
-	unsigned int alignmask;
+	unsigned int flags;
 
 	struct page *pg;
 	unsigned int entrylen;
 
 	unsigned int total;
 	struct scatterlist *sg;
-
-	unsigned int flags;
 };
 
 struct ahash_instance {
diff --git a/include/linux/crypto.h b/include/linux/crypto.h
index f3c3a3b27fac..b164da5e129e 100644
--- a/include/linux/crypto.h
+++ b/include/linux/crypto.h
@@ -110,7 +110,6 @@
  *	  crypto_aead_walksize() (with the remainder going at the end), no chunk
  *	  can cross a page boundary or a scatterlist element boundary.
  *    ahash:
- *	- The result buffer must be aligned to the algorithm's alignmask.
  *	- crypto_ahash_finup() must not be used unless the algorithm implements
  *	  ->finup() natively.
  */
@@ -278,18 +277,20 @@ struct compress_alg {
  * @cra_ctxsize: Size of the operational context of the transformation. This
  *		 value informs the kernel crypto API about the memory size
  *		 needed to be allocated for the transformation context.
- * @cra_alignmask: Alignment mask for the input and output data buffer. The data
- *		   buffer containing the input data for the algorithm must be
- *		   aligned to this alignment mask. The data buffer for the
- *		   output data must be aligned to this alignment mask. Note that
- *		   the Crypto API will do the re-alignment in software, but
- *		   only under special conditions and there is a performance hit.
- *		   The re-alignment happens at these occasions for different
- *		   @cra_u types: cipher -- For both input data and output data
- *		   buffer; ahash -- For output hash destination buf; shash --
- *		   For output hash destination buf.
- *		   This is needed on hardware which is flawed by design and
- *		   cannot pick data from arbitrary addresses.
+ * @cra_alignmask: For cipher, skcipher, lskcipher, and aead algorithms this is
+ *		   1 less than the alignment, in bytes, that the algorithm
+ *		   implementation requires for input and output buffers.  When
+ *		   the crypto API is invoked with buffers that are not aligned
+ *		   to this alignment, the crypto API automatically utilizes
+ *		   appropriately aligned temporary buffers to comply with what
+ *		   the algorithm needs.  (For scatterlists this happens only if
+ *		   the algorithm uses the skcipher_walk helper functions.)  This
+ *		   misalignment handling carries a performance penalty, so it is
+ *		   preferred that algorithms do not set a nonzero alignmask.
+ *		   Also, crypto API users may wish to allocate buffers aligned
+ *		   to the alignmask of the algorithm being used, in order to
+ *		   avoid the API having to realign them.  Note: the alignmask is
+ *		   not supported for hash algorithms and is always 0 for them.
  * @cra_priority: Priority of this transformation implementation. In case
  *		  multiple transformations with same @cra_name are available to
  *		  the Crypto API, the kernel will use the one with highest
-- 
cgit v1.2.3


From 2ee7c1bcf3d1c91ede9d914c52fa2f56c449b75a Mon Sep 17 00:00:00 2001
From: Dimitri John Ledkov <dimitri.ledkov@canonical.com>
Date: Sun, 22 Oct 2023 19:22:03 +0100
Subject: x509: Add OIDs for FIPS 202 SHA-3 hash and signatures

Add OID for FIPS 202 SHA-3 family of hash functions, RSA & ECDSA
signatures using those. Limit to 256 or larger sizes, for
interoperability reasons. 224 is too weak for any practical uses.

Signed-off-by: Dimitri John Ledkov <dimitri.ledkov@canonical.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 include/linux/oid_registry.h | 11 +++++++++++
 1 file changed, 11 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/oid_registry.h b/include/linux/oid_registry.h
index 8b79e55cfcec..3921fbed0b28 100644
--- a/include/linux/oid_registry.h
+++ b/include/linux/oid_registry.h
@@ -129,6 +129,17 @@ enum OID {
 	OID_TPMImportableKey,		/* 2.23.133.10.1.4 */
 	OID_TPMSealedData,		/* 2.23.133.10.1.5 */
 
+	/* CSOR FIPS-202 SHA-3 */
+	OID_sha3_256,                           /* 2.16.840.1.101.3.4.2.8 */
+	OID_sha3_384,                           /* 2.16.840.1.101.3.4.2.9 */
+	OID_sha3_512,                           /* 2.16.840.1.101.3.4.2.10 */
+	OID_id_ecdsa_with_sha3_256,             /* 2.16.840.1.101.3.4.3.10 */
+	OID_id_ecdsa_with_sha3_384,             /* 2.16.840.1.101.3.4.3.11 */
+	OID_id_ecdsa_with_sha3_512,             /* 2.16.840.1.101.3.4.3.12 */
+	OID_id_rsassa_pkcs1_v1_5_with_sha3_256, /* 2.16.840.1.101.3.4.3.14 */
+	OID_id_rsassa_pkcs1_v1_5_with_sha3_384, /* 2.16.840.1.101.3.4.3.15 */
+	OID_id_rsassa_pkcs1_v1_5_with_sha3_512, /* 2.16.840.1.101.3.4.3.16 */
+
 	OID__NR
 };
 
-- 
cgit v1.2.3


From f4cf4e5db331a5ce69e3f0b21d322cac0f4e4b5d Mon Sep 17 00:00:00 2001
From: Rafał Miłecki <rafal@milecki.pl>
Date: Mon, 23 Oct 2023 12:27:59 +0200
Subject: Revert "nvmem: add new config option"
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This reverts commit 517f14d9cf3533d5ab4fded195ab6f80a92e378f.

Config option "no_of_node" is no longer needed since adding a more
explicit and targeted option "add_legacy_fixed_of_cells".

That "no_of_node" config option was needed *earlier* to help mtd's case.

DT nodes of MTD partitions (that are also NVMEM devices) may contain
subnodes. Those SHOULD NOT be treated as NVMEM fixed cells.

To prevent NVMEM core code from parsing subnodes a "no_of_node" option
was added (and set to true in mtd) to make for_each_child_of_node() in
NVMEM a no-op. That was a bit hacky because it was messing with
"of_node" pointer to achieve some side-effect.

With the introduction of "add_legacy_fixed_of_cells" config option
things got more explicit. MTD subsystem simply tells NVMEM when to look
for fixed cells and there is no need to hack "of_node" pointer anymore.

Signed-off-by: Rafał Miłecki <rafal@milecki.pl>
Reviewed-by: Miquel Raynal <miquel.raynal@bootlin.com>
Signed-off-by: Srinivas Kandagatla <srinivas.kandagatla@linaro.org>
Link: https://lore.kernel.org/r/20231023102759.31529-1-zajec5@gmail.com
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/mtd/mtdcore.c          | 1 -
 drivers/nvmem/core.c           | 2 +-
 include/linux/nvmem-provider.h | 2 --
 3 files changed, 1 insertion(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/mtd/mtdcore.c b/drivers/mtd/mtdcore.c
index fbf60d1364f0..74dd1b74008d 100644
--- a/drivers/mtd/mtdcore.c
+++ b/drivers/mtd/mtdcore.c
@@ -560,7 +560,6 @@ static int mtd_nvmem_add(struct mtd_info *mtd)
 	config.read_only = true;
 	config.root_only = true;
 	config.ignore_wp = true;
-	config.no_of_node = !of_device_is_compatible(node, "nvmem-cells");
 	config.priv = mtd;
 
 	mtd->nvmem = nvmem_register(&config);
diff --git a/drivers/nvmem/core.c b/drivers/nvmem/core.c
index 2710943f53c4..bf42b7e826db 100644
--- a/drivers/nvmem/core.c
+++ b/drivers/nvmem/core.c
@@ -935,7 +935,7 @@ struct nvmem_device *nvmem_register(const struct nvmem_config *config)
 	nvmem->nkeepout = config->nkeepout;
 	if (config->of_node)
 		nvmem->dev.of_node = config->of_node;
-	else if (!config->no_of_node)
+	else
 		nvmem->dev.of_node = config->dev->of_node;
 
 	switch (config->id) {
diff --git a/include/linux/nvmem-provider.h b/include/linux/nvmem-provider.h
index 1b81adebdb8b..e3930835235b 100644
--- a/include/linux/nvmem-provider.h
+++ b/include/linux/nvmem-provider.h
@@ -89,7 +89,6 @@ struct nvmem_cell_info {
  * @read_only:	Device is read-only.
  * @root_only:	Device is accessibly to root only.
  * @of_node:	If given, this will be used instead of the parent's of_node.
- * @no_of_node:	Device should not use the parent's of_node even if it's !NULL.
  * @reg_read:	Callback to read data.
  * @reg_write:	Callback to write data.
  * @size:	Device size.
@@ -122,7 +121,6 @@ struct nvmem_config {
 	bool			ignore_wp;
 	struct nvmem_layout	*layout;
 	struct device_node	*of_node;
-	bool			no_of_node;
 	nvmem_reg_read_t	reg_read;
 	nvmem_reg_write_t	reg_write;
 	int	size;
-- 
cgit v1.2.3


From 54b406e10f0334ee0245d5129c8def444a49cd9b Mon Sep 17 00:00:00 2001
From: Abhijit Gangurde <abhijit.gangurde@amd.com>
Date: Tue, 17 Oct 2023 21:34:59 +0530
Subject: cdx: Remove cdx controller list from cdx bus system

Remove xarray list of cdx controller. Instead, use platform bus
to locate the cdx controller using compat string used by cdx
controller platform driver.
Also, use ida to allocate a unique id for the controller.

Signed-off-by: Abhijit Gangurde <abhijit.gangurde@amd.com>
Link: https://lore.kernel.org/r/20231017160505.10640-2-abhijit.gangurde@amd.com
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/cdx/cdx.c           | 42 ++++++++++++++++++++++++++++--------------
 include/linux/cdx/cdx_bus.h |  2 ++
 2 files changed, 30 insertions(+), 14 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/cdx/cdx.c b/drivers/cdx/cdx.c
index d2cad4c670a0..0252fc92433d 100644
--- a/drivers/cdx/cdx.c
+++ b/drivers/cdx/cdx.c
@@ -60,7 +60,7 @@
 #include <linux/of_device.h>
 #include <linux/slab.h>
 #include <linux/mm.h>
-#include <linux/xarray.h>
+#include <linux/idr.h>
 #include <linux/cdx/cdx_bus.h>
 #include <linux/iommu.h>
 #include <linux/dma-map-ops.h>
@@ -70,8 +70,10 @@
 #define CDX_DEFAULT_DMA_MASK	(~0ULL)
 #define MAX_CDX_CONTROLLERS 16
 
-/* CDX controllers registered with the CDX bus */
-static DEFINE_XARRAY_ALLOC(cdx_controllers);
+/* IDA for CDX controllers registered with the CDX bus */
+static DEFINE_IDA(cdx_controller_ida);
+
+static char *compat_node_name = "xlnx,versal-net-cdx";
 
 /**
  * cdx_dev_reset - Reset a CDX device
@@ -384,7 +386,8 @@ static ssize_t rescan_store(const struct bus_type *bus,
 			    const char *buf, size_t count)
 {
 	struct cdx_controller *cdx;
-	unsigned long index;
+	struct platform_device *pd;
+	struct device_node *np;
 	bool val;
 
 	if (kstrtobool(buf, &val) < 0)
@@ -397,12 +400,19 @@ static ssize_t rescan_store(const struct bus_type *bus,
 	cdx_unregister_devices(&cdx_bus_type);
 
 	/* Rescan all the devices */
-	xa_for_each(&cdx_controllers, index, cdx) {
-		int ret;
+	for_each_compatible_node(np, NULL, compat_node_name) {
+		if (!np)
+			return -EINVAL;
 
-		ret = cdx->ops->scan(cdx);
-		if (ret)
-			dev_err(cdx->dev, "cdx bus scanning failed\n");
+		pd = of_find_device_by_node(np);
+		if (!pd)
+			return -EINVAL;
+
+		cdx = platform_get_drvdata(pd);
+		if (cdx && cdx->controller_registered && cdx->ops->scan)
+			cdx->ops->scan(cdx);
+
+		put_device(&pd->dev);
 	}
 
 	return count;
@@ -520,17 +530,20 @@ int cdx_register_controller(struct cdx_controller *cdx)
 {
 	int ret;
 
-	ret = xa_alloc(&cdx_controllers, &cdx->id, cdx,
-		       XA_LIMIT(0, MAX_CDX_CONTROLLERS - 1), GFP_KERNEL);
-	if (ret) {
+	ret = ida_alloc_range(&cdx_controller_ida, 0,  MAX_CDX_CONTROLLERS - 1, GFP_KERNEL);
+	if (ret < 0) {
 		dev_err(cdx->dev,
 			"No free index available. Maximum controllers already registered\n");
 		cdx->id = (u8)MAX_CDX_CONTROLLERS;
 		return ret;
 	}
 
+	cdx->id = ret;
+
 	/* Scan all the devices */
-	cdx->ops->scan(cdx);
+	if (cdx->ops->scan)
+		cdx->ops->scan(cdx);
+	cdx->controller_registered = true;
 
 	return 0;
 }
@@ -541,8 +554,9 @@ void cdx_unregister_controller(struct cdx_controller *cdx)
 	if (cdx->id >= MAX_CDX_CONTROLLERS)
 		return;
 
+	cdx->controller_registered = false;
 	device_for_each_child(cdx->dev, NULL, cdx_unregister_device);
-	xa_erase(&cdx_controllers, cdx->id);
+	ida_free(&cdx_controller_ida, cdx->id);
 }
 EXPORT_SYMBOL_GPL(cdx_unregister_controller);
 
diff --git a/include/linux/cdx/cdx_bus.h b/include/linux/cdx/cdx_bus.h
index bead71b7bc73..82c27b8c94e1 100644
--- a/include/linux/cdx/cdx_bus.h
+++ b/include/linux/cdx/cdx_bus.h
@@ -63,12 +63,14 @@ struct cdx_ops {
  * @dev: Linux device associated with the CDX controller.
  * @priv: private data
  * @id: Controller ID
+ * @controller_registered: controller registered with bus
  * @ops: CDX controller ops
  */
 struct cdx_controller {
 	struct device *dev;
 	void *priv;
 	u32 id;
+	bool controller_registered;
 	struct cdx_ops *ops;
 };
 
-- 
cgit v1.2.3


From ce558a391d80b9f47a462107977e8e81fe2f2962 Mon Sep 17 00:00:00 2001
From: Abhijit Gangurde <abhijit.gangurde@amd.com>
Date: Tue, 17 Oct 2023 21:35:02 +0530
Subject: cdx: Register cdx bus as a device on cdx subsystem

While scanning for CDX devices, register newly discovered bus as a
cdx device. CDX device attributes are visible based on device type.

Signed-off-by: Abhijit Gangurde <abhijit.gangurde@amd.com>
Link: https://lore.kernel.org/r/20231017160505.10640-5-abhijit.gangurde@amd.com
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/cdx/cdx.c                       | 77 ++++++++++++++++++++++++++++++---
 drivers/cdx/cdx.h                       | 14 +++++-
 drivers/cdx/controller/cdx_controller.c |  7 +++
 include/linux/cdx/cdx_bus.h             |  2 +
 4 files changed, 93 insertions(+), 7 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/cdx/cdx.c b/drivers/cdx/cdx.c
index ee0913f04758..cf5306580c21 100644
--- a/drivers/cdx/cdx.c
+++ b/drivers/cdx/cdx.c
@@ -125,8 +125,13 @@ static int cdx_unregister_device(struct device *dev,
 {
 	struct cdx_device *cdx_dev = to_cdx_device(dev);
 
-	kfree(cdx_dev->driver_override);
-	cdx_dev->driver_override = NULL;
+	if (cdx_dev->is_bus) {
+		device_for_each_child(dev, NULL, cdx_unregister_device);
+	} else {
+		kfree(cdx_dev->driver_override);
+		cdx_dev->driver_override = NULL;
+	}
+
 	/*
 	 * Do not free cdx_dev here as it would be freed in
 	 * cdx_device_release() called from within put_device().
@@ -201,6 +206,9 @@ static int cdx_bus_match(struct device *dev, struct device_driver *drv)
 	const struct cdx_device_id *found_id = NULL;
 	const struct cdx_device_id *ids;
 
+	if (cdx_dev->is_bus)
+		return false;
+
 	ids = cdx_drv->match_id_table;
 
 	/* When driver_override is set, only bind to the matching driver */
@@ -265,10 +273,11 @@ static int cdx_dma_configure(struct device *dev)
 {
 	struct cdx_driver *cdx_drv = to_cdx_driver(dev->driver);
 	struct cdx_device *cdx_dev = to_cdx_device(dev);
+	struct cdx_controller *cdx = cdx_dev->cdx;
 	u32 input_id = cdx_dev->req_id;
 	int ret;
 
-	ret = of_dma_configure_id(dev, dev->parent->of_node, 0, &input_id);
+	ret = of_dma_configure_id(dev, cdx->dev->of_node, 0, &input_id);
 	if (ret && ret != -EPROBE_DEFER) {
 		dev_err(dev, "of_dma_configure_id() failed\n");
 		return ret;
@@ -374,6 +383,18 @@ static ssize_t driver_override_show(struct device *dev,
 }
 static DEVICE_ATTR_RW(driver_override);
 
+static umode_t cdx_dev_attrs_are_visible(struct kobject *kobj, struct attribute *a, int n)
+{
+	struct device *dev = kobj_to_dev(kobj);
+	struct cdx_device *cdx_dev;
+
+	cdx_dev = to_cdx_device(dev);
+	if (!cdx_dev->is_bus)
+		return a->mode;
+
+	return 0;
+}
+
 static struct attribute *cdx_dev_attrs[] = {
 	&dev_attr_remove.attr,
 	&dev_attr_reset.attr,
@@ -382,7 +403,16 @@ static struct attribute *cdx_dev_attrs[] = {
 	&dev_attr_driver_override.attr,
 	NULL,
 };
-ATTRIBUTE_GROUPS(cdx_dev);
+
+static const struct attribute_group cdx_dev_group = {
+	.attrs = cdx_dev_attrs,
+	.is_visible = cdx_dev_attrs_are_visible,
+};
+
+static const struct attribute_group *cdx_dev_groups[] = {
+	&cdx_dev_group,
+	NULL,
+};
 
 static ssize_t rescan_store(const struct bus_type *bus,
 			    const char *buf, size_t count)
@@ -479,7 +509,6 @@ static void cdx_device_release(struct device *dev)
 int cdx_device_add(struct cdx_dev_params *dev_params)
 {
 	struct cdx_controller *cdx = dev_params->cdx;
-	struct device *parent = cdx->dev;
 	struct cdx_device *cdx_dev;
 	int ret;
 
@@ -503,7 +532,7 @@ int cdx_device_add(struct cdx_dev_params *dev_params)
 
 	/* Initialize generic device */
 	device_initialize(&cdx_dev->dev);
-	cdx_dev->dev.parent = parent;
+	cdx_dev->dev.parent = dev_params->parent;
 	cdx_dev->dev.bus = &cdx_bus_type;
 	cdx_dev->dev.dma_mask = &cdx_dev->dma_mask;
 	cdx_dev->dev.release = cdx_device_release;
@@ -532,6 +561,42 @@ fail:
 }
 EXPORT_SYMBOL_NS_GPL(cdx_device_add, CDX_BUS_CONTROLLER);
 
+struct device *cdx_bus_add(struct cdx_controller *cdx, u8 bus_num)
+{
+	struct cdx_device *cdx_dev;
+	int ret;
+
+	cdx_dev = kzalloc(sizeof(*cdx_dev), GFP_KERNEL);
+	if (!cdx_dev)
+		return NULL;
+
+	device_initialize(&cdx_dev->dev);
+	cdx_dev->cdx = cdx;
+
+	cdx_dev->dev.parent = cdx->dev;
+	cdx_dev->dev.bus = &cdx_bus_type;
+	cdx_dev->dev.release = cdx_device_release;
+	cdx_dev->is_bus = true;
+	cdx_dev->bus_num = bus_num;
+
+	dev_set_name(&cdx_dev->dev, "cdx-%02x",
+		     ((cdx->id << CDX_CONTROLLER_ID_SHIFT) | (bus_num & CDX_BUS_NUM_MASK)));
+
+	ret = device_add(&cdx_dev->dev);
+	if (ret) {
+		dev_err(&cdx_dev->dev, "cdx bus device add failed: %d\n", ret);
+		goto device_add_fail;
+	}
+
+	return &cdx_dev->dev;
+
+device_add_fail:
+	put_device(&cdx_dev->dev);
+
+	return NULL;
+}
+EXPORT_SYMBOL_NS_GPL(cdx_bus_add, CDX_BUS_CONTROLLER);
+
 int cdx_register_controller(struct cdx_controller *cdx)
 {
 	int ret;
diff --git a/drivers/cdx/cdx.h b/drivers/cdx/cdx.h
index c436ac7ac86f..1f593deb4c9e 100644
--- a/drivers/cdx/cdx.h
+++ b/drivers/cdx/cdx.h
@@ -13,7 +13,7 @@
 /**
  * struct cdx_dev_params - CDX device parameters
  * @cdx: CDX controller associated with the device
- * @parent: Associated CDX controller
+ * @parent: Associated CDX Bus device
  * @vendor: Vendor ID for CDX device
  * @device: Device ID for CDX device
  * @bus_num: Bus number for this CDX device
@@ -24,6 +24,7 @@
  */
 struct cdx_dev_params {
 	struct cdx_controller *cdx;
+	struct device *parent;
 	u16 vendor;
 	u16 device;
 	u8 bus_num;
@@ -59,4 +60,15 @@ void cdx_unregister_controller(struct cdx_controller *cdx);
  */
 int cdx_device_add(struct cdx_dev_params *dev_params);
 
+/**
+ * cdx_bus_add - Add a CDX bus. This function adds a bus on the CDX bus
+ *		subsystem. It creates a CDX device for the corresponding bus and
+ *		also registers an associated Linux generic device.
+ * @cdx: Associated CDX controller
+ * @us_num: Bus number
+ *
+ * Return: associated Linux generic device pointer on success or NULL on failure.
+ */
+struct device *cdx_bus_add(struct cdx_controller *cdx, u8 bus_num);
+
 #endif /* _CDX_H_ */
diff --git a/drivers/cdx/controller/cdx_controller.c b/drivers/cdx/controller/cdx_controller.c
index 3f86663fbacf..b4e0d6b40339 100644
--- a/drivers/cdx/controller/cdx_controller.c
+++ b/drivers/cdx/controller/cdx_controller.c
@@ -79,8 +79,14 @@ static int cdx_scan_devices(struct cdx_controller *cdx)
 	num_cdx_bus = (u8)ret;
 
 	for (bus_num = 0; bus_num < num_cdx_bus; bus_num++) {
+		struct device *bus_dev;
 		u8 num_cdx_dev;
 
+		/* Add the bus on cdx subsystem */
+		bus_dev = cdx_bus_add(cdx, bus_num);
+		if (!bus_dev)
+			continue;
+
 		/* MCDI FW Read: Fetch the number of devices present */
 		ret = cdx_mcdi_get_num_devs(cdx_mcdi, bus_num);
 		if (ret < 0) {
@@ -103,6 +109,7 @@ static int cdx_scan_devices(struct cdx_controller *cdx)
 				continue;
 			}
 			dev_params.cdx = cdx;
+			dev_params.parent = bus_dev;
 
 			/* Add the device to the cdx bus */
 			ret = cdx_device_add(&dev_params);
diff --git a/include/linux/cdx/cdx_bus.h b/include/linux/cdx/cdx_bus.h
index 82c27b8c94e1..b5e4b7e05666 100644
--- a/include/linux/cdx/cdx_bus.h
+++ b/include/linux/cdx/cdx_bus.h
@@ -88,6 +88,7 @@ struct cdx_controller {
  * @dma_mask: Default DMA mask
  * @flags: CDX device flags
  * @req_id: Requestor ID associated with CDX device
+ * @is_bus: Is this bus device
  * @driver_override: driver name to force a match; do not set directly,
  *                   because core frees it; use driver_set_override() to
  *                   set or clear it.
@@ -104,6 +105,7 @@ struct cdx_device {
 	u64 dma_mask;
 	u16 flags;
 	u32 req_id;
+	bool is_bus;
 	const char *driver_override;
 };
 
-- 
cgit v1.2.3


From e3cfd49cb9491ac1cc233a4b9e098688b4ab281d Mon Sep 17 00:00:00 2001
From: Abhijit Gangurde <abhijit.gangurde@amd.com>
Date: Tue, 17 Oct 2023 21:35:03 +0530
Subject: cdx: add support for bus enable and disable

CDX bus needs to be disabled before updating/writing devices
in the FPGA. Once the devices are written, the bus shall be
rescanned. This change provides sysfs entry to enable/disable the
CDX bus.

Co-developed-by: Nipun Gupta <nipun.gupta@amd.com>
Signed-off-by: Nipun Gupta <nipun.gupta@amd.com>
Signed-off-by: Abhijit Gangurde <abhijit.gangurde@amd.com>
Link: https://lore.kernel.org/r/20231017160505.10640-6-abhijit.gangurde@amd.com
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 Documentation/ABI/testing/sysfs-bus-cdx | 13 ++++++
 drivers/cdx/cdx.c                       | 72 +++++++++++++++++++++++++++++++++
 drivers/cdx/controller/cdx_controller.c | 12 ++++++
 drivers/cdx/controller/mc_cdx_pcol.h    | 54 +++++++++++++++++++++++++
 drivers/cdx/controller/mcdi_functions.c | 24 +++++++++++
 drivers/cdx/controller/mcdi_functions.h | 18 +++++++++
 include/linux/cdx/cdx_bus.h             | 10 +++++
 7 files changed, 203 insertions(+)

(limited to 'include/linux')

diff --git a/Documentation/ABI/testing/sysfs-bus-cdx b/Documentation/ABI/testing/sysfs-bus-cdx
index 7af477f49998..c12bdaa4152a 100644
--- a/Documentation/ABI/testing/sysfs-bus-cdx
+++ b/Documentation/ABI/testing/sysfs-bus-cdx
@@ -28,6 +28,19 @@ Description:
 		of a device manufacturer.
 		Combination of Vendor ID and Device ID identifies a device.
 
+What:		/sys/bus/cdx/devices/.../enable
+Date:		October 2023
+Contact:	abhijit.gangurde@amd.com
+Description:
+		CDX bus should be disabled before updating the devices in FPGA.
+		Writing n/0/off will attempt to disable the CDX bus and.
+		writing y/1/on will attempt to enable the CDX bus. Reading this file
+		gives the current state of the bus, 1 for enabled and 0 for disabled.
+
+		For example::
+
+		  # echo 1 > /sys/bus/cdx/.../enable
+
 What:		/sys/bus/cdx/devices/.../reset
 Date:		March 2023
 Contact:	nipun.gupta@amd.com
diff --git a/drivers/cdx/cdx.c b/drivers/cdx/cdx.c
index cf5306580c21..8eb484c37e97 100644
--- a/drivers/cdx/cdx.c
+++ b/drivers/cdx/cdx.c
@@ -124,9 +124,12 @@ static int cdx_unregister_device(struct device *dev,
 				 void *data)
 {
 	struct cdx_device *cdx_dev = to_cdx_device(dev);
+	struct cdx_controller *cdx = cdx_dev->cdx;
 
 	if (cdx_dev->is_bus) {
 		device_for_each_child(dev, NULL, cdx_unregister_device);
+		if (cdx_dev->enabled && cdx->ops->bus_disable)
+			cdx->ops->bus_disable(cdx, cdx_dev->bus_num);
 	} else {
 		kfree(cdx_dev->driver_override);
 		cdx_dev->driver_override = NULL;
@@ -383,6 +386,41 @@ static ssize_t driver_override_show(struct device *dev,
 }
 static DEVICE_ATTR_RW(driver_override);
 
+static ssize_t enable_store(struct device *dev, struct device_attribute *attr,
+			    const char *buf, size_t count)
+{
+	struct cdx_device *cdx_dev = to_cdx_device(dev);
+	struct cdx_controller *cdx = cdx_dev->cdx;
+	bool enable;
+	int ret;
+
+	if (kstrtobool(buf, &enable) < 0)
+		return -EINVAL;
+
+	if (enable == cdx_dev->enabled)
+		return count;
+
+	if (enable && cdx->ops->bus_enable)
+		ret = cdx->ops->bus_enable(cdx, cdx_dev->bus_num);
+	else if (!enable && cdx->ops->bus_disable)
+		ret = cdx->ops->bus_disable(cdx, cdx_dev->bus_num);
+	else
+		ret = -EOPNOTSUPP;
+
+	if (!ret)
+		cdx_dev->enabled = enable;
+
+	return ret < 0 ? ret : count;
+}
+
+static ssize_t enable_show(struct device *dev, struct device_attribute *attr, char *buf)
+{
+	struct cdx_device *cdx_dev = to_cdx_device(dev);
+
+	return sysfs_emit(buf, "%u\n", cdx_dev->enabled);
+}
+static DEVICE_ATTR_RW(enable);
+
 static umode_t cdx_dev_attrs_are_visible(struct kobject *kobj, struct attribute *a, int n)
 {
 	struct device *dev = kobj_to_dev(kobj);
@@ -395,6 +433,18 @@ static umode_t cdx_dev_attrs_are_visible(struct kobject *kobj, struct attribute
 	return 0;
 }
 
+static umode_t cdx_bus_attrs_are_visible(struct kobject *kobj, struct attribute *a, int n)
+{
+	struct device *dev = kobj_to_dev(kobj);
+	struct cdx_device *cdx_dev;
+
+	cdx_dev = to_cdx_device(dev);
+	if (cdx_dev->is_bus)
+		return a->mode;
+
+	return 0;
+}
+
 static struct attribute *cdx_dev_attrs[] = {
 	&dev_attr_remove.attr,
 	&dev_attr_reset.attr,
@@ -409,8 +459,19 @@ static const struct attribute_group cdx_dev_group = {
 	.is_visible = cdx_dev_attrs_are_visible,
 };
 
+static struct attribute *cdx_bus_dev_attrs[] = {
+	&dev_attr_enable.attr,
+	NULL,
+};
+
+static const struct attribute_group cdx_bus_dev_group = {
+	.attrs = cdx_bus_dev_attrs,
+	.is_visible = cdx_bus_attrs_are_visible,
+};
+
 static const struct attribute_group *cdx_dev_groups[] = {
 	&cdx_dev_group,
+	&cdx_bus_dev_group,
 	NULL,
 };
 
@@ -588,8 +649,19 @@ struct device *cdx_bus_add(struct cdx_controller *cdx, u8 bus_num)
 		goto device_add_fail;
 	}
 
+	if (cdx->ops->bus_enable) {
+		ret = cdx->ops->bus_enable(cdx, bus_num);
+		if (ret && ret != -EALREADY) {
+			dev_err(cdx->dev, "cdx bus enable failed: %d\n", ret);
+			goto bus_enable_fail;
+		}
+	}
+
+	cdx_dev->enabled = true;
 	return &cdx_dev->dev;
 
+bus_enable_fail:
+	device_del(&cdx_dev->dev);
 device_add_fail:
 	put_device(&cdx_dev->dev);
 
diff --git a/drivers/cdx/controller/cdx_controller.c b/drivers/cdx/controller/cdx_controller.c
index b4e0d6b40339..f2a691efd1f1 100644
--- a/drivers/cdx/controller/cdx_controller.c
+++ b/drivers/cdx/controller/cdx_controller.c
@@ -33,6 +33,16 @@ static const struct cdx_mcdi_ops mcdi_ops = {
 	.mcdi_request = cdx_mcdi_request,
 };
 
+static int cdx_bus_enable(struct cdx_controller *cdx, u8 bus_num)
+{
+	return cdx_mcdi_bus_enable(cdx->priv, bus_num);
+}
+
+static int cdx_bus_disable(struct cdx_controller *cdx, u8 bus_num)
+{
+	return cdx_mcdi_bus_disable(cdx->priv, bus_num);
+}
+
 void cdx_rpmsg_post_probe(struct cdx_controller *cdx)
 {
 	/* Register CDX controller with CDX bus driver */
@@ -128,6 +138,8 @@ static int cdx_scan_devices(struct cdx_controller *cdx)
 }
 
 static struct cdx_ops cdx_ops = {
+	.bus_enable		= cdx_bus_enable,
+	.bus_disable	= cdx_bus_disable,
 	.scan		= cdx_scan_devices,
 	.dev_configure	= cdx_configure_device,
 };
diff --git a/drivers/cdx/controller/mc_cdx_pcol.h b/drivers/cdx/controller/mc_cdx_pcol.h
index 4ccb7b52951b..2de019406b57 100644
--- a/drivers/cdx/controller/mc_cdx_pcol.h
+++ b/drivers/cdx/controller/mc_cdx_pcol.h
@@ -455,6 +455,60 @@
 #define MC_CMD_CDX_BUS_GET_DEVICE_CONFIG_OUT_REQUESTER_ID_OFST			84
 #define MC_CMD_CDX_BUS_GET_DEVICE_CONFIG_OUT_REQUESTER_ID_LEN			4
 
+/***********************************/
+/*
+ * MC_CMD_CDX_BUS_DOWN
+ * Asserting reset on the CDX bus causes all devices on the bus to be quiesced.
+ * DMA bus mastering is disabled and any pending DMA request are flushed. Once
+ * the response is returned, the devices are guaranteed to no longer issue DMA
+ * requests or raise MSI interrupts. Further device MMIO accesses may have
+ * undefined results. While the bus reset is asserted, any of the enumeration
+ * or device configuration MCDIs will fail with EAGAIN. It is only legal to
+ * reload the relevant PL region containing CDX devices if the corresponding CDX
+ * bus is in reset. Depending on the implementation, the firmware may or may
+ * not enforce this restriction and it is up to the caller to make sure this
+ * requirement is satisfied.
+ */
+#define MC_CMD_CDX_BUS_DOWN					0x4
+#define MC_CMD_CDX_BUS_DOWN_MSGSET			0x4
+
+/* MC_CMD_CDX_BUS_DOWN_IN msgrequest */
+#define MC_CMD_CDX_BUS_DOWN_IN_LEN			4
+/* Bus number to put in reset, in range 0 to BUS_COUNT-1 */
+#define MC_CMD_CDX_BUS_DOWN_IN_BUS_OFST		0
+#define MC_CMD_CDX_BUS_DOWN_IN_BUS_LEN		4
+
+/*
+ * MC_CMD_CDX_BUS_DOWN_OUT msgresponse: The bus is quiesced, no further
+ * upstream traffic for devices on this bus.
+ */
+#define MC_CMD_CDX_BUS_DOWN_OUT_LEN			0
+
+/***********************************/
+/*
+ * MC_CMD_CDX_BUS_UP
+ * After bus reset is de-asserted, devices are in a state which is functionally
+ * equivalent to each device having been reset with MC_CMD_CDX_DEVICE_RESET. In
+ * other words, device logic is reset in a hardware-specific way, MMIO accesses
+ * are forwarded to the device, DMA bus mastering is disabled and needs to be
+ * re-enabled with MC_CMD_CDX_DEVICE_DMA_ENABLE once the driver is ready to
+ * start servicing DMA. If the underlying number of devices or device resources
+ * changed (e.g. if PL was reloaded) while the bus was in reset, the bus driver
+ * is expected to re-enumerate the bus. Returns EALREADY if the bus was already
+ * up before the call.
+ */
+#define MC_CMD_CDX_BUS_UP					0x5
+#define MC_CMD_CDX_BUS_UP_MSGSET			0x5
+
+/* MC_CMD_CDX_BUS_UP_IN msgrequest */
+#define MC_CMD_CDX_BUS_UP_IN_LEN			4
+/* Bus number to take out of reset, in range 0 to BUS_COUNT-1 */
+#define MC_CMD_CDX_BUS_UP_IN_BUS_OFST		0
+#define MC_CMD_CDX_BUS_UP_IN_BUS_LEN		4
+
+/* MC_CMD_CDX_BUS_UP_OUT msgresponse: The bus can now be enumerated. */
+#define MC_CMD_CDX_BUS_UP_OUT_LEN			0
+
 /***********************************/
 /*
  * MC_CMD_CDX_DEVICE_RESET
diff --git a/drivers/cdx/controller/mcdi_functions.c b/drivers/cdx/controller/mcdi_functions.c
index 0158f26533dd..0e1e35d91242 100644
--- a/drivers/cdx/controller/mcdi_functions.c
+++ b/drivers/cdx/controller/mcdi_functions.c
@@ -124,6 +124,30 @@ int cdx_mcdi_get_dev_config(struct cdx_mcdi *cdx,
 	return 0;
 }
 
+int cdx_mcdi_bus_enable(struct cdx_mcdi *cdx, u8 bus_num)
+{
+	MCDI_DECLARE_BUF(inbuf, MC_CMD_CDX_BUS_UP_IN_LEN);
+	int ret;
+
+	MCDI_SET_DWORD(inbuf, CDX_BUS_UP_IN_BUS, bus_num);
+	ret = cdx_mcdi_rpc(cdx, MC_CMD_CDX_BUS_UP, inbuf, sizeof(inbuf),
+			   NULL, 0, NULL);
+
+	return ret;
+}
+
+int cdx_mcdi_bus_disable(struct cdx_mcdi *cdx, u8 bus_num)
+{
+	MCDI_DECLARE_BUF(inbuf, MC_CMD_CDX_BUS_DOWN_IN_LEN);
+	int ret;
+
+	MCDI_SET_DWORD(inbuf, CDX_BUS_DOWN_IN_BUS, bus_num);
+	ret = cdx_mcdi_rpc(cdx, MC_CMD_CDX_BUS_DOWN, inbuf, sizeof(inbuf),
+			   NULL, 0, NULL);
+
+	return ret;
+}
+
 int cdx_mcdi_reset_device(struct cdx_mcdi *cdx, u8 bus_num, u8 dev_num)
 {
 	MCDI_DECLARE_BUF(inbuf, MC_CMD_CDX_DEVICE_RESET_IN_LEN);
diff --git a/drivers/cdx/controller/mcdi_functions.h b/drivers/cdx/controller/mcdi_functions.h
index 7440ace5539a..28973d5ec3ab 100644
--- a/drivers/cdx/controller/mcdi_functions.h
+++ b/drivers/cdx/controller/mcdi_functions.h
@@ -47,6 +47,24 @@ int cdx_mcdi_get_dev_config(struct cdx_mcdi *cdx,
 			    u8 bus_num, u8 dev_num,
 			    struct cdx_dev_params *dev_params);
 
+/**
+ * cdx_mcdi_bus_enable - Enable CDX bus represented by bus_num
+ * @cdx: pointer to MCDI interface.
+ * @bus_num: Bus number.
+ *
+ * Return: 0 on success, <0 on failure
+ */
+int cdx_mcdi_bus_enable(struct cdx_mcdi *cdx, u8 bus_num);
+
+/**
+ * cdx_mcdi_bus_disable - Disable CDX bus represented by bus_num
+ * @cdx: pointer to MCDI interface.
+ * @bus_num: Bus number.
+ *
+ * Return: 0 on success, <0 on failure
+ */
+int cdx_mcdi_bus_disable(struct cdx_mcdi *cdx, u8 bus_num);
+
 /**
  * cdx_mcdi_reset_device - Reset cdx device represented by bus_num:dev_num
  * @cdx: pointer to MCDI interface.
diff --git a/include/linux/cdx/cdx_bus.h b/include/linux/cdx/cdx_bus.h
index b5e4b7e05666..18e95076d1d5 100644
--- a/include/linux/cdx/cdx_bus.h
+++ b/include/linux/cdx/cdx_bus.h
@@ -28,6 +28,10 @@ struct cdx_device_config {
 	u8 type;
 };
 
+typedef int (*cdx_bus_enable_cb)(struct cdx_controller *cdx, u8 bus_num);
+
+typedef int (*cdx_bus_disable_cb)(struct cdx_controller *cdx, u8 bus_num);
+
 typedef int (*cdx_scan_cb)(struct cdx_controller *cdx);
 
 typedef int (*cdx_dev_configure_cb)(struct cdx_controller *cdx,
@@ -49,11 +53,15 @@ typedef int (*cdx_dev_configure_cb)(struct cdx_controller *cdx,
 
 /**
  * struct cdx_ops - Callbacks supported by CDX controller.
+ * @bus_enable: enable bus on the controller
+ * @bus_disable: disable bus on the controller
  * @scan: scan the devices on the controller
  * @dev_configure: configuration like reset, master_enable,
  *		   msi_config etc for a CDX device
  */
 struct cdx_ops {
+	cdx_bus_enable_cb bus_enable;
+	cdx_bus_disable_cb bus_disable;
 	cdx_scan_cb scan;
 	cdx_dev_configure_cb dev_configure;
 };
@@ -89,6 +97,7 @@ struct cdx_controller {
  * @flags: CDX device flags
  * @req_id: Requestor ID associated with CDX device
  * @is_bus: Is this bus device
+ * @enabled: is this bus enabled
  * @driver_override: driver name to force a match; do not set directly,
  *                   because core frees it; use driver_set_override() to
  *                   set or clear it.
@@ -106,6 +115,7 @@ struct cdx_device {
 	u16 flags;
 	u32 req_id;
 	bool is_bus;
+	bool enabled;
 	const char *driver_override;
 };
 
-- 
cgit v1.2.3


From fa10f413091a43f801f82b3cf484f15d6fc9266f Mon Sep 17 00:00:00 2001
From: Abhijit Gangurde <abhijit.gangurde@amd.com>
Date: Tue, 17 Oct 2023 21:35:05 +0530
Subject: cdx: add sysfs for subsystem, class and revision

CDX controller provides subsystem vendor, subsystem device, class and
revision info of the device along with vendor and device ID in native
endian format. CDX Bus system uses this information to bind the cdx
device to the cdx device driver.

Co-developed-by: Puneet Gupta <puneet.gupta@amd.com>
Signed-off-by: Puneet Gupta <puneet.gupta@amd.com>
Co-developed-by: Nipun Gupta <nipun.gupta@amd.com>
Signed-off-by: Nipun Gupta <nipun.gupta@amd.com>
Signed-off-by: Abhijit Gangurde <abhijit.gangurde@amd.com>
Reviewed-by: Pieter Jansen van Vuuren <pieter.jansen-van-vuuren@amd.com>
Tested-by: Nikhil Agarwal <nikhil.agarwal@amd.com>
Link: https://lore.kernel.org/r/20231017160505.10640-8-abhijit.gangurde@amd.com
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 Documentation/ABI/testing/sysfs-bus-cdx | 45 +++++++++++++++++++++++++++++++++
 drivers/cdx/cdx.c                       | 29 ++++++++++++++++++++-
 drivers/cdx/cdx.h                       |  8 ++++++
 drivers/cdx/controller/mcdi_functions.c |  7 +++++
 include/linux/cdx/cdx_bus.h             | 27 ++++++++++++++++++--
 include/linux/mod_devicetable.h         | 10 ++++++++
 scripts/mod/devicetable-offsets.c       |  4 +++
 scripts/mod/file2alias.c                |  8 ++++++
 8 files changed, 135 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/ABI/testing/sysfs-bus-cdx b/Documentation/ABI/testing/sysfs-bus-cdx
index c2c54abffed5..8c067ff99e54 100644
--- a/Documentation/ABI/testing/sysfs-bus-cdx
+++ b/Documentation/ABI/testing/sysfs-bus-cdx
@@ -28,6 +28,36 @@ Description:
 		of a device manufacturer.
 		Combination of Vendor ID and Device ID identifies a device.
 
+What:		/sys/bus/cdx/devices/.../subsystem_vendor
+Date:		July 2023
+Contact:	puneet.gupta@amd.com
+Description:
+		Subsystem Vendor ID for this CDX device, in hexadecimal.
+		Subsystem Vendor ID is 16 bit identifier specific to the
+		card manufacturer.
+
+What:		/sys/bus/cdx/devices/.../subsystem_device
+Date:		July 2023
+Contact:	puneet.gupta@amd.com
+Description:
+		Subsystem Device ID for this CDX device, in hexadecimal
+		Subsystem Device ID is 16 bit identifier specific to the
+		card manufacturer.
+
+What:		/sys/bus/cdx/devices/.../class
+Date:		July 2023
+Contact:	puneet.gupta@amd.com
+Description:
+		This file contains the class of the CDX device, in hexadecimal.
+		Class is 24 bit identifier specifies the functionality of the device.
+
+What:		/sys/bus/cdx/devices/.../revision
+Date:		July 2023
+Contact:	puneet.gupta@amd.com
+Description:
+		This file contains the revision field of the CDX device, in hexadecimal.
+		Revision is 8 bit revision identifier of the device.
+
 What:		/sys/bus/cdx/devices/.../enable
 Date:		October 2023
 Contact:	abhijit.gangurde@amd.com
@@ -67,3 +97,18 @@ Description:
 		For example::
 
 		  # echo 1 > /sys/bus/cdx/devices/.../remove
+
+What:		/sys/bus/cdx/devices/.../modalias
+Date:		July 2023
+Contact:	nipun.gupta@amd.com
+Description:
+		This attribute indicates the CDX ID of the device.
+		That is in the format:
+		cdx:vXXXXdXXXXsvXXXXsdXXXXcXXXXXX,
+		where:
+
+		    - vXXXX contains the vendor ID;
+		    - dXXXX contains the device ID;
+		    - svXXXX contains the subsystem vendor ID;
+		    - sdXXXX contains the subsystem device ID;
+		    - cXXXXXX contains the device class.
diff --git a/drivers/cdx/cdx.c b/drivers/cdx/cdx.c
index e22a7138292e..3dcda7513818 100644
--- a/drivers/cdx/cdx.c
+++ b/drivers/cdx/cdx.c
@@ -179,7 +179,10 @@ cdx_match_one_device(const struct cdx_device_id *id,
 {
 	/* Use vendor ID and device ID for matching */
 	if ((id->vendor == CDX_ANY_ID || id->vendor == dev->vendor) &&
-	    (id->device == CDX_ANY_ID || id->device == dev->device))
+	    (id->device == CDX_ANY_ID || id->device == dev->device) &&
+	    (id->subvendor == CDX_ANY_ID || id->subvendor == dev->subsystem_vendor) &&
+	    (id->subdevice == CDX_ANY_ID || id->subdevice == dev->subsystem_device) &&
+	    !((id->class ^ dev->class) & id->class_mask))
 		return id;
 	return NULL;
 }
@@ -329,6 +332,10 @@ static DEVICE_ATTR_RO(field)
 
 cdx_config_attr(vendor, "0x%04x\n");
 cdx_config_attr(device, "0x%04x\n");
+cdx_config_attr(subsystem_vendor, "0x%04x\n");
+cdx_config_attr(subsystem_device, "0x%04x\n");
+cdx_config_attr(revision, "0x%02x\n");
+cdx_config_attr(class, "0x%06x\n");
 
 static ssize_t remove_store(struct device *dev,
 			    struct device_attribute *attr,
@@ -377,6 +384,17 @@ static ssize_t reset_store(struct device *dev, struct device_attribute *attr,
 }
 static DEVICE_ATTR_WO(reset);
 
+static ssize_t modalias_show(struct device *dev, struct device_attribute *attr,
+			     char *buf)
+{
+	struct cdx_device *cdx_dev = to_cdx_device(dev);
+
+	return sprintf(buf, "cdx:v%04Xd%04Xsv%04Xsd%04Xc%06X\n", cdx_dev->vendor,
+			cdx_dev->device, cdx_dev->subsystem_vendor, cdx_dev->subsystem_device,
+			cdx_dev->class);
+}
+static DEVICE_ATTR_RO(modalias);
+
 static ssize_t driver_override_store(struct device *dev,
 				     struct device_attribute *attr,
 				     const char *buf, size_t count)
@@ -467,6 +485,11 @@ static struct attribute *cdx_dev_attrs[] = {
 	&dev_attr_reset.attr,
 	&dev_attr_vendor.attr,
 	&dev_attr_device.attr,
+	&dev_attr_subsystem_vendor.attr,
+	&dev_attr_subsystem_device.attr,
+	&dev_attr_class.attr,
+	&dev_attr_revision.attr,
+	&dev_attr_modalias.attr,
 	&dev_attr_driver_override.attr,
 	NULL,
 };
@@ -604,6 +627,10 @@ int cdx_device_add(struct cdx_dev_params *dev_params)
 	cdx_dev->req_id = dev_params->req_id;
 	cdx_dev->vendor = dev_params->vendor;
 	cdx_dev->device = dev_params->device;
+	cdx_dev->subsystem_vendor = dev_params->subsys_vendor;
+	cdx_dev->subsystem_device = dev_params->subsys_device;
+	cdx_dev->class = dev_params->class;
+	cdx_dev->revision = dev_params->revision;
 	cdx_dev->bus_num = dev_params->bus_num;
 	cdx_dev->dev_num = dev_params->dev_num;
 	cdx_dev->cdx = dev_params->cdx;
diff --git a/drivers/cdx/cdx.h b/drivers/cdx/cdx.h
index 1f593deb4c9e..300ad8be7a34 100644
--- a/drivers/cdx/cdx.h
+++ b/drivers/cdx/cdx.h
@@ -16,22 +16,30 @@
  * @parent: Associated CDX Bus device
  * @vendor: Vendor ID for CDX device
  * @device: Device ID for CDX device
+ * @subsys_vendor: Sub vendor ID for CDX device
+ * @subsys_device: Sub device ID for CDX device
  * @bus_num: Bus number for this CDX device
  * @dev_num: Device number for this device
  * @res: array of MMIO region entries
  * @res_count: number of valid MMIO regions
  * @req_id: Requestor ID associated with CDX device
+ * @class: Class of the CDX Device
+ * @revision: Revision of the CDX device
  */
 struct cdx_dev_params {
 	struct cdx_controller *cdx;
 	struct device *parent;
 	u16 vendor;
 	u16 device;
+	u16 subsys_vendor;
+	u16 subsys_device;
 	u8 bus_num;
 	u8 dev_num;
 	struct resource res[MAX_CDX_DEV_RESOURCES];
 	u8 res_count;
 	u32 req_id;
+	u32 class;
+	u8 revision;
 };
 
 /**
diff --git a/drivers/cdx/controller/mcdi_functions.c b/drivers/cdx/controller/mcdi_functions.c
index 0e1e35d91242..65dca2aa1d3f 100644
--- a/drivers/cdx/controller/mcdi_functions.c
+++ b/drivers/cdx/controller/mcdi_functions.c
@@ -120,6 +120,13 @@ int cdx_mcdi_get_dev_config(struct cdx_mcdi *cdx,
 
 	dev_params->vendor = MCDI_WORD(outbuf, CDX_BUS_GET_DEVICE_CONFIG_OUT_VENDOR_ID);
 	dev_params->device = MCDI_WORD(outbuf, CDX_BUS_GET_DEVICE_CONFIG_OUT_DEVICE_ID);
+	dev_params->subsys_vendor = MCDI_WORD(outbuf,
+					      CDX_BUS_GET_DEVICE_CONFIG_OUT_SUBSYS_VENDOR_ID);
+	dev_params->subsys_device = MCDI_WORD(outbuf,
+					      CDX_BUS_GET_DEVICE_CONFIG_OUT_SUBSYS_DEVICE_ID);
+	dev_params->class = MCDI_DWORD(outbuf,
+				       CDX_BUS_GET_DEVICE_CONFIG_OUT_DEVICE_CLASS) & 0xFFFFFF;
+	dev_params->revision = MCDI_BYTE(outbuf, CDX_BUS_GET_DEVICE_CONFIG_OUT_DEVICE_REVISION);
 
 	return 0;
 }
diff --git a/include/linux/cdx/cdx_bus.h b/include/linux/cdx/cdx_bus.h
index 18e95076d1d5..cf9738e3a957 100644
--- a/include/linux/cdx/cdx_bus.h
+++ b/include/linux/cdx/cdx_bus.h
@@ -38,6 +38,19 @@ typedef int (*cdx_dev_configure_cb)(struct cdx_controller *cdx,
 				    u8 bus_num, u8 dev_num,
 				    struct cdx_device_config *dev_config);
 
+/**
+ * CDX_DEVICE - macro used to describe a specific CDX device
+ * @vend: the 16 bit CDX Vendor ID
+ * @dev: the 16 bit CDX Device ID
+ *
+ * This macro is used to create a struct cdx_device_id that matches a
+ * specific device. The subvendor and subdevice fields will be set to
+ * CDX_ANY_ID.
+ */
+#define CDX_DEVICE(vend, dev) \
+	.vendor = (vend), .device = (dev), \
+	.subvendor = CDX_ANY_ID, .subdevice = CDX_ANY_ID
+
 /**
  * CDX_DEVICE_DRIVER_OVERRIDE - macro used to describe a CDX device with
  *                              override_only flags.
@@ -46,10 +59,12 @@ typedef int (*cdx_dev_configure_cb)(struct cdx_controller *cdx,
  * @driver_override: the 32 bit CDX Device override_only
  *
  * This macro is used to create a struct cdx_device_id that matches only a
- * driver_override device.
+ * driver_override device. The subvendor and subdevice fields will be set to
+ * CDX_ANY_ID.
  */
 #define CDX_DEVICE_DRIVER_OVERRIDE(vend, dev, driver_override) \
-	.vendor = (vend), .device = (dev), .override_only = (driver_override)
+	.vendor = (vend), .device = (dev), .subvendor = CDX_ANY_ID,\
+	.subdevice = CDX_ANY_ID, .override_only = (driver_override)
 
 /**
  * struct cdx_ops - Callbacks supported by CDX controller.
@@ -88,6 +103,10 @@ struct cdx_controller {
  * @cdx: CDX controller associated with the device
  * @vendor: Vendor ID for CDX device
  * @device: Device ID for CDX device
+ * @subsystem_vendor: Subsystem Vendor ID for CDX device
+ * @subsystem_device: Subsystem Device ID for CDX device
+ * @class: Class for the CDX device
+ * @revision: Revision of the CDX device
  * @bus_num: Bus number for this CDX device
  * @dev_num: Device number for this device
  * @res: array of MMIO region entries
@@ -107,6 +126,10 @@ struct cdx_device {
 	struct cdx_controller *cdx;
 	u16 vendor;
 	u16 device;
+	u16 subsystem_vendor;
+	u16 subsystem_device;
+	u32 class;
+	u8 revision;
 	u8 bus_num;
 	u8 dev_num;
 	struct resource res[MAX_CDX_DEV_RESOURCES];
diff --git a/include/linux/mod_devicetable.h b/include/linux/mod_devicetable.h
index b0678b093cb2..aa3c28781248 100644
--- a/include/linux/mod_devicetable.h
+++ b/include/linux/mod_devicetable.h
@@ -935,6 +935,12 @@ enum {
  * struct cdx_device_id - CDX device identifier
  * @vendor: Vendor ID
  * @device: Device ID
+ * @subvendor: Subsystem vendor ID (or CDX_ANY_ID)
+ * @subdevice: Subsystem device ID (or CDX_ANY_ID)
+ * @class: Device class
+ *         Most drivers do not need to specify class/class_mask
+ *         as vendor/device is normally sufficient.
+ * @class_mask: Limit which sub-fields of the class field are compared.
  * @override_only: Match only when dev->driver_override is this driver.
  *
  * Type of entries in the "device Id" table for CDX devices supported by
@@ -943,6 +949,10 @@ enum {
 struct cdx_device_id {
 	__u16 vendor;
 	__u16 device;
+	__u16 subvendor;
+	__u16 subdevice;
+	__u32 class;
+	__u32 class_mask;
 	__u32 override_only;
 };
 
diff --git a/scripts/mod/devicetable-offsets.c b/scripts/mod/devicetable-offsets.c
index abe65f8968dd..7a659aa3114a 100644
--- a/scripts/mod/devicetable-offsets.c
+++ b/scripts/mod/devicetable-offsets.c
@@ -265,6 +265,10 @@ int main(void)
 	DEVID(cdx_device_id);
 	DEVID_FIELD(cdx_device_id, vendor);
 	DEVID_FIELD(cdx_device_id, device);
+	DEVID_FIELD(cdx_device_id, subvendor);
+	DEVID_FIELD(cdx_device_id, subdevice);
+	DEVID_FIELD(cdx_device_id, class);
+	DEVID_FIELD(cdx_device_id, class_mask);
 	DEVID_FIELD(cdx_device_id, override_only);
 
 	return 0;
diff --git a/scripts/mod/file2alias.c b/scripts/mod/file2alias.c
index 7056751c29b1..2b28faca9585 100644
--- a/scripts/mod/file2alias.c
+++ b/scripts/mod/file2alias.c
@@ -1458,6 +1458,10 @@ static int do_cdx_entry(const char *filename, void *symval,
 {
 	DEF_FIELD(symval, cdx_device_id, vendor);
 	DEF_FIELD(symval, cdx_device_id, device);
+	DEF_FIELD(symval, cdx_device_id, subvendor);
+	DEF_FIELD(symval, cdx_device_id, subdevice);
+	DEF_FIELD(symval, cdx_device_id, class);
+	DEF_FIELD(symval, cdx_device_id, class_mask);
 	DEF_FIELD(symval, cdx_device_id, override_only);
 
 	switch (override_only) {
@@ -1475,6 +1479,10 @@ static int do_cdx_entry(const char *filename, void *symval,
 
 	ADD(alias, "v", vendor != CDX_ANY_ID, vendor);
 	ADD(alias, "d", device != CDX_ANY_ID, device);
+	ADD(alias, "sv", subvendor != CDX_ANY_ID, subvendor);
+	ADD(alias, "sd", subdevice != CDX_ANY_ID, subdevice);
+	ADD(alias, "c", class_mask == 0xFFFFFF, class);
+
 	return 1;
 }
 
-- 
cgit v1.2.3


From 571d91dcadfa3cef499010b4eddb9b58b0da4d24 Mon Sep 17 00:00:00 2001
From: Kan Liang <kan.liang@linux.intel.com>
Date: Wed, 25 Oct 2023 13:16:19 -0700
Subject: perf: Add branch stack counters

Currently, the additional information of a branch entry is stored in a
u64 space. With more and more information added, the space is running
out. For example, the information of occurrences of events will be added
for each branch.

Two places were suggested to append the counters.
https://lore.kernel.org/lkml/20230802215814.GH231007@hirez.programming.kicks-ass.net/
One place is right after the flags of each branch entry. It changes the
existing struct perf_branch_entry. The later ARCH specific
implementation has to be really careful to consistently pick
the right struct.
The other place is right after the entire struct perf_branch_stack.
The disadvantage is that the pointer of the extra space has to be
recorded. The common interface perf_sample_save_brstack() has to be
updated.

The latter is much straightforward, and should be easily understood and
maintained. It is implemented in the patch.

Add a new branch sample type, PERF_SAMPLE_BRANCH_COUNTERS, to indicate
the event which is recorded in the branch info.

The "u64 counters" may store the occurrences of several events. The
information regarding the number of events/counters and the width of
each counter should be exposed via sysfs as a reference for the perf
tool. Define the branch_counter_nr and branch_counter_width ABI here.
The support will be implemented later in the Intel-specific patch.

Suggested-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Kan Liang <kan.liang@linux.intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lkml.kernel.org/r/20231025201626.3000228-1-kan.liang@linux.intel.com
---
 .../ABI/testing/sysfs-bus-event_source-devices-caps     |  6 ++++++
 arch/powerpc/perf/core-book3s.c                         |  2 +-
 arch/x86/events/amd/core.c                              |  2 +-
 arch/x86/events/core.c                                  |  2 +-
 arch/x86/events/intel/core.c                            |  2 +-
 arch/x86/events/intel/ds.c                              |  4 ++--
 include/linux/perf_event.h                              | 17 ++++++++++++++++-
 include/uapi/linux/perf_event.h                         | 10 ++++++++++
 kernel/events/core.c                                    |  8 ++++++++
 9 files changed, 46 insertions(+), 7 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/ABI/testing/sysfs-bus-event_source-devices-caps b/Documentation/ABI/testing/sysfs-bus-event_source-devices-caps
index 8757dcf41c08..a5f506f7d481 100644
--- a/Documentation/ABI/testing/sysfs-bus-event_source-devices-caps
+++ b/Documentation/ABI/testing/sysfs-bus-event_source-devices-caps
@@ -16,3 +16,9 @@ Description:
 		Example output in powerpc:
 		grep . /sys/bus/event_source/devices/cpu/caps/*
 		/sys/bus/event_source/devices/cpu/caps/pmu_name:POWER9
+
+		The "branch_counter_nr" in the supported platform exposes the
+		maximum number of counters which can be shown in the u64 counters
+		of PERF_SAMPLE_BRANCH_COUNTERS, while the "branch_counter_width"
+		exposes the width of each counter. Both of them can be used by
+		the perf tool to parse the logged counters in each branch.
diff --git a/arch/powerpc/perf/core-book3s.c b/arch/powerpc/perf/core-book3s.c
index 8c1f7def596e..3c14596bbfaf 100644
--- a/arch/powerpc/perf/core-book3s.c
+++ b/arch/powerpc/perf/core-book3s.c
@@ -2313,7 +2313,7 @@ static void record_and_restart(struct perf_event *event, unsigned long val,
 			struct cpu_hw_events *cpuhw;
 			cpuhw = this_cpu_ptr(&cpu_hw_events);
 			power_pmu_bhrb_read(event, cpuhw);
-			perf_sample_save_brstack(&data, event, &cpuhw->bhrb_stack);
+			perf_sample_save_brstack(&data, event, &cpuhw->bhrb_stack, NULL);
 		}
 
 		if (event->attr.sample_type & PERF_SAMPLE_DATA_SRC &&
diff --git a/arch/x86/events/amd/core.c b/arch/x86/events/amd/core.c
index e24976593a29..4ee6390b45c9 100644
--- a/arch/x86/events/amd/core.c
+++ b/arch/x86/events/amd/core.c
@@ -940,7 +940,7 @@ static int amd_pmu_v2_handle_irq(struct pt_regs *regs)
 			continue;
 
 		if (has_branch_stack(event))
-			perf_sample_save_brstack(&data, event, &cpuc->lbr_stack);
+			perf_sample_save_brstack(&data, event, &cpuc->lbr_stack, NULL);
 
 		if (perf_event_overflow(event, &data, regs))
 			x86_pmu_stop(event, 0);
diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c
index 40ad1425ffa2..40c9af124128 100644
--- a/arch/x86/events/core.c
+++ b/arch/x86/events/core.c
@@ -1702,7 +1702,7 @@ int x86_pmu_handle_irq(struct pt_regs *regs)
 		perf_sample_data_init(&data, 0, event->hw.last_period);
 
 		if (has_branch_stack(event))
-			perf_sample_save_brstack(&data, event, &cpuc->lbr_stack);
+			perf_sample_save_brstack(&data, event, &cpuc->lbr_stack, NULL);
 
 		if (perf_event_overflow(event, &data, regs))
 			x86_pmu_stop(event, 0);
diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c
index a08f794a0e79..41a164764a84 100644
--- a/arch/x86/events/intel/core.c
+++ b/arch/x86/events/intel/core.c
@@ -3047,7 +3047,7 @@ static int handle_pmi_common(struct pt_regs *regs, u64 status)
 		perf_sample_data_init(&data, 0, event->hw.last_period);
 
 		if (has_branch_stack(event))
-			perf_sample_save_brstack(&data, event, &cpuc->lbr_stack);
+			perf_sample_save_brstack(&data, event, &cpuc->lbr_stack, NULL);
 
 		if (perf_event_overflow(event, &data, regs))
 			x86_pmu_stop(event, 0);
diff --git a/arch/x86/events/intel/ds.c b/arch/x86/events/intel/ds.c
index bf97ab904d40..cb3f329f8fa4 100644
--- a/arch/x86/events/intel/ds.c
+++ b/arch/x86/events/intel/ds.c
@@ -1755,7 +1755,7 @@ static void setup_pebs_fixed_sample_data(struct perf_event *event,
 		setup_pebs_time(event, data, pebs->tsc);
 
 	if (has_branch_stack(event))
-		perf_sample_save_brstack(data, event, &cpuc->lbr_stack);
+		perf_sample_save_brstack(data, event, &cpuc->lbr_stack, NULL);
 }
 
 static void adaptive_pebs_save_regs(struct pt_regs *regs,
@@ -1912,7 +1912,7 @@ static void setup_pebs_adaptive_sample_data(struct perf_event *event,
 
 		if (has_branch_stack(event)) {
 			intel_pmu_store_pebs_lbrs(lbr);
-			perf_sample_save_brstack(data, event, &cpuc->lbr_stack);
+			perf_sample_save_brstack(data, event, &cpuc->lbr_stack, NULL);
 		}
 	}
 
diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 0367d748fae0..7897ef066027 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -1139,6 +1139,10 @@ static inline bool branch_sample_priv(const struct perf_event *event)
 	return event->attr.branch_sample_type & PERF_SAMPLE_BRANCH_PRIV_SAVE;
 }
 
+static inline bool branch_sample_counters(const struct perf_event *event)
+{
+	return event->attr.branch_sample_type & PERF_SAMPLE_BRANCH_COUNTERS;
+}
 
 struct perf_sample_data {
 	/*
@@ -1173,6 +1177,7 @@ struct perf_sample_data {
 	struct perf_callchain_entry	*callchain;
 	struct perf_raw_record		*raw;
 	struct perf_branch_stack	*br_stack;
+	u64				*br_stack_cntr;
 	union perf_sample_weight	weight;
 	union  perf_mem_data_src	data_src;
 	u64				txn;
@@ -1250,7 +1255,8 @@ static inline void perf_sample_save_raw_data(struct perf_sample_data *data,
 
 static inline void perf_sample_save_brstack(struct perf_sample_data *data,
 					    struct perf_event *event,
-					    struct perf_branch_stack *brs)
+					    struct perf_branch_stack *brs,
+					    u64 *brs_cntr)
 {
 	int size = sizeof(u64); /* nr */
 
@@ -1258,7 +1264,16 @@ static inline void perf_sample_save_brstack(struct perf_sample_data *data,
 		size += sizeof(u64);
 	size += brs->nr * sizeof(struct perf_branch_entry);
 
+	/*
+	 * The extension space for counters is appended after the
+	 * struct perf_branch_stack. It is used to store the occurrences
+	 * of events of each branch.
+	 */
+	if (brs_cntr)
+		size += brs->nr * sizeof(u64);
+
 	data->br_stack = brs;
+	data->br_stack_cntr = brs_cntr;
 	data->dyn_size += size;
 	data->sample_flags |= PERF_SAMPLE_BRANCH_STACK;
 }
diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h
index 39c6a250dd1b..4461f380425b 100644
--- a/include/uapi/linux/perf_event.h
+++ b/include/uapi/linux/perf_event.h
@@ -204,6 +204,8 @@ enum perf_branch_sample_type_shift {
 
 	PERF_SAMPLE_BRANCH_PRIV_SAVE_SHIFT	= 18, /* save privilege mode */
 
+	PERF_SAMPLE_BRANCH_COUNTERS_SHIFT	= 19, /* save occurrences of events on a branch */
+
 	PERF_SAMPLE_BRANCH_MAX_SHIFT		/* non-ABI */
 };
 
@@ -235,6 +237,8 @@ enum perf_branch_sample_type {
 
 	PERF_SAMPLE_BRANCH_PRIV_SAVE	= 1U << PERF_SAMPLE_BRANCH_PRIV_SAVE_SHIFT,
 
+	PERF_SAMPLE_BRANCH_COUNTERS	= 1U << PERF_SAMPLE_BRANCH_COUNTERS_SHIFT,
+
 	PERF_SAMPLE_BRANCH_MAX		= 1U << PERF_SAMPLE_BRANCH_MAX_SHIFT,
 };
 
@@ -982,6 +986,12 @@ enum perf_event_type {
 	 *	{ u64                   nr;
 	 *	  { u64	hw_idx; } && PERF_SAMPLE_BRANCH_HW_INDEX
 	 *        { u64 from, to, flags } lbr[nr];
+	 *        #
+	 *        # The format of the counters is decided by the
+	 *        # "branch_counter_nr" and "branch_counter_width",
+	 *        # which are defined in the ABI.
+	 *        #
+	 *        { u64 counters; } cntr[nr] && PERF_SAMPLE_BRANCH_COUNTERS
 	 *      } && PERF_SAMPLE_BRANCH_STACK
 	 *
 	 * 	{ u64			abi; # enum perf_sample_regs_abi
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 3eb26c2c6e65..d27ffd80ed67 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -7341,6 +7341,14 @@ void perf_output_sample(struct perf_output_handle *handle,
 			if (branch_sample_hw_index(event))
 				perf_output_put(handle, data->br_stack->hw_idx);
 			perf_output_copy(handle, data->br_stack->entries, size);
+			/*
+			 * Add the extension space which is appended
+			 * right after the struct perf_branch_stack.
+			 */
+			if (data->br_stack_cntr) {
+				size = data->br_stack->nr * sizeof(u64);
+				perf_output_copy(handle, data->br_stack_cntr, size);
+			}
 		} else {
 			/*
 			 * we always store at least the value of nr
-- 
cgit v1.2.3


From 1f2376cd03dd3b965d130ed46a7c92769d614ba1 Mon Sep 17 00:00:00 2001
From: Kan Liang <kan.liang@linux.intel.com>
Date: Wed, 25 Oct 2023 13:16:21 -0700
Subject: perf: Add branch_sample_call_stack

Add a helper function to check call stack sample type.

The later patch will invoke the function in several places.

Signed-off-by: Kan Liang <kan.liang@linux.intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lkml.kernel.org/r/20231025201626.3000228-3-kan.liang@linux.intel.com
---
 arch/x86/events/core.c     | 2 +-
 include/linux/perf_event.h | 5 +++++
 2 files changed, 6 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c
index 40c9af124128..09050641ce5d 100644
--- a/arch/x86/events/core.c
+++ b/arch/x86/events/core.c
@@ -601,7 +601,7 @@ int x86_pmu_hw_config(struct perf_event *event)
 		}
 	}
 
-	if (event->attr.branch_sample_type & PERF_SAMPLE_BRANCH_CALL_STACK)
+	if (branch_sample_call_stack(event))
 		event->attach_state |= PERF_ATTACH_TASK_DATA;
 
 	/*
diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 7897ef066027..ac1a59c1f252 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -1144,6 +1144,11 @@ static inline bool branch_sample_counters(const struct perf_event *event)
 	return event->attr.branch_sample_type & PERF_SAMPLE_BRANCH_COUNTERS;
 }
 
+static inline bool branch_sample_call_stack(const struct perf_event *event)
+{
+	return event->attr.branch_sample_type & PERF_SAMPLE_BRANCH_CALL_STACK;
+}
+
 struct perf_sample_data {
 	/*
 	 * Fields set by perf_sample_data_init() unconditionally,
-- 
cgit v1.2.3


From 6777877eb7a3290cf0a8a6b621e46f72f9d94b6b Mon Sep 17 00:00:00 2001
From: Terry Bowman <terry.bowman@amd.com>
Date: Wed, 18 Oct 2023 19:17:05 +0200
Subject: PCI/AER: Refactor cper_print_aer() for use by CXL driver module

The CXL driver plans to use cper_print_aer() for logging restricted CXL
host (RCH) AER errors. cper_print_aer() is not currently exported and
therefore not usable by the CXL drivers built as loadable modules. Export
the cper_print_aer() function. Use the EXPORT_SYMBOL_NS_GPL() variant
to restrict the export to CXL drivers.

The CONFIG_ACPI_APEI_PCIEAER kernel config is currently used to enable
cper_print_aer(). cper_print_aer() logs the AER registers and is
useful in PCIE AER logging outside of APEI. Remove the
CONFIG_ACPI_APEI_PCIEAER dependency to enable cper_print_aer().

The cper_print_aer() function name implies CPER specific use but is useful
in non-CPER cases as well. Rename cper_print_aer() to pci_print_aer().

Also, update cxl_core to import CXL namespace imports.

Co-developed-by: Robert Richter <rrichter@amd.com>
Signed-off-by: Terry Bowman <terry.bowman@amd.com>
Signed-off-by: Robert Richter <rrichter@amd.com>
Cc: Mahesh J Salgaonkar <mahesh@linux.ibm.com>
Cc: Oliver O'Halloran <oohall@gmail.com>
Cc: Bjorn Helgaas <bhelgaas@google.com>
Cc: linux-pci@vger.kernel.org
Reviewed-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
Acked-by: Bjorn Helgaas <bhelgaas@google.com>
Reviewed-by: Dave Jiang <dave.jiang@intel.com>
Link: https://lore.kernel.org/r/20231018171713.1883517-13-rrichter@amd.com
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 drivers/cxl/core/port.c | 1 +
 drivers/pci/pcie/aer.c  | 9 +++++----
 include/linux/aer.h     | 2 +-
 3 files changed, 7 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/cxl/core/port.c b/drivers/cxl/core/port.c
index 8cb2ed0d6bbb..002c820cfd83 100644
--- a/drivers/cxl/core/port.c
+++ b/drivers/cxl/core/port.c
@@ -2100,3 +2100,4 @@ static void cxl_core_exit(void)
 subsys_initcall(cxl_core_init);
 module_exit(cxl_core_exit);
 MODULE_LICENSE("GPL v2");
+MODULE_IMPORT_NS(CXL);
diff --git a/drivers/pci/pcie/aer.c b/drivers/pci/pcie/aer.c
index 9c8fd69ae5ad..6593fe3fc555 100644
--- a/drivers/pci/pcie/aer.c
+++ b/drivers/pci/pcie/aer.c
@@ -759,9 +759,10 @@ int cper_severity_to_aer(int cper_severity)
 	}
 }
 EXPORT_SYMBOL_GPL(cper_severity_to_aer);
+#endif
 
-void cper_print_aer(struct pci_dev *dev, int aer_severity,
-		    struct aer_capability_regs *aer)
+void pci_print_aer(struct pci_dev *dev, int aer_severity,
+		   struct aer_capability_regs *aer)
 {
 	int layer, agent, tlp_header_valid = 0;
 	u32 status, mask;
@@ -800,7 +801,7 @@ void cper_print_aer(struct pci_dev *dev, int aer_severity,
 	trace_aer_event(dev_name(&dev->dev), (status & ~mask),
 			aer_severity, tlp_header_valid, &aer->header_log);
 }
-#endif
+EXPORT_SYMBOL_NS_GPL(pci_print_aer, CXL);
 
 /**
  * add_error_device - list device to be handled
@@ -996,7 +997,7 @@ static void aer_recover_work_func(struct work_struct *work)
 			       PCI_SLOT(entry.devfn), PCI_FUNC(entry.devfn));
 			continue;
 		}
-		cper_print_aer(pdev, entry.severity, entry.regs);
+		pci_print_aer(pdev, entry.severity, entry.regs);
 		if (entry.severity == AER_NONFATAL)
 			pcie_do_recovery(pdev, pci_channel_io_normal,
 					 aer_root_reset);
diff --git a/include/linux/aer.h b/include/linux/aer.h
index 29cc10220952..f6ea2f57d808 100644
--- a/include/linux/aer.h
+++ b/include/linux/aer.h
@@ -51,7 +51,7 @@ static inline int pci_aer_clear_nonfatal_status(struct pci_dev *dev)
 static inline int pcie_aer_is_native(struct pci_dev *dev) { return 0; }
 #endif
 
-void cper_print_aer(struct pci_dev *dev, int aer_severity,
+void pci_print_aer(struct pci_dev *dev, int aer_severity,
 		    struct aer_capability_regs *aer);
 int cper_severity_to_aer(int cper_severity);
 void aer_recover_queue(int domain, unsigned int bus, unsigned int devfn,
-- 
cgit v1.2.3


From a103f46633fdcddc2aaca506420f177e8803a2bd Mon Sep 17 00:00:00 2001
From: Dave Jiang <dave.jiang@intel.com>
Date: Thu, 12 Oct 2023 11:53:54 -0700
Subject: acpi: Move common tables helper functions to common lib

Some of the routines in ACPI driver/acpi/tables.c can be shared with
parsing CDAT. CDAT is a device-provided data structure that is formatted
similar to a platform provided ACPI table. CDAT is used by CXL and can
exist on platforms that do not use ACPI. Split out the common routine
from ACPI to accommodate platforms that do not support ACPI and move that
to /lib. The common routines can be built outside of ACPI if
FIRMWARE_TABLES is selected.

Link: https://lore.kernel.org/linux-cxl/CAJZ5v0jipbtTNnsA0-o5ozOk8ZgWnOg34m34a9pPenTyRLj=6A@mail.gmail.com/
Suggested-by: "Rafael J. Wysocki" <rafael@kernel.org>
Reviewed-by: Hanjun Guo <guohanjun@huawei.com>
Reviewed-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
Signed-off-by: Dave Jiang <dave.jiang@intel.com>
Acked-by: "Rafael J. Wysocki" <rafael.j.wysocki@intel.com>
Link: https://lore.kernel.org/r/169713683430.2205276.17899451119920103445.stgit@djiang5-mobl3
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 MAINTAINERS              |   2 +
 drivers/acpi/Kconfig     |   1 +
 drivers/acpi/tables.c    | 173 -------------------------------------------
 include/linux/acpi.h     |  42 +++--------
 include/linux/fw_table.h |  43 +++++++++++
 lib/Kconfig              |   3 +
 lib/Makefile             |   2 +
 lib/fw_table.c           | 189 +++++++++++++++++++++++++++++++++++++++++++++++
 8 files changed, 251 insertions(+), 204 deletions(-)
 create mode 100644 include/linux/fw_table.h
 create mode 100644 lib/fw_table.c

(limited to 'include/linux')

diff --git a/MAINTAINERS b/MAINTAINERS
index 35977b269d5e..c6b08ec50109 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -294,6 +294,8 @@ F:	drivers/pnp/pnpacpi/
 F:	include/acpi/
 F:	include/linux/acpi.h
 F:	include/linux/fwnode.h
+F:	include/linux/fw_table.h
+F:	lib/fw_table.c
 F:	tools/power/acpi/
 
 ACPI APEI
diff --git a/drivers/acpi/Kconfig b/drivers/acpi/Kconfig
index cee82b473dc5..c0a2d43f8451 100644
--- a/drivers/acpi/Kconfig
+++ b/drivers/acpi/Kconfig
@@ -12,6 +12,7 @@ menuconfig ACPI
 	select PNP
 	select NLS
 	select CRC32
+	select FIRMWARE_TABLE
 	default y if X86
 	help
 	  Advanced Configuration and Power Interface (ACPI) support for 
diff --git a/drivers/acpi/tables.c b/drivers/acpi/tables.c
index 8ab0a82b4da4..c1516337f668 100644
--- a/drivers/acpi/tables.c
+++ b/drivers/acpi/tables.c
@@ -37,18 +37,6 @@ static struct acpi_table_desc initial_tables[ACPI_MAX_TABLES] __initdata;
 
 static int acpi_apic_instance __initdata_or_acpilib;
 
-enum acpi_subtable_type {
-	ACPI_SUBTABLE_COMMON,
-	ACPI_SUBTABLE_HMAT,
-	ACPI_SUBTABLE_PRMT,
-	ACPI_SUBTABLE_CEDT,
-};
-
-struct acpi_subtable_entry {
-	union acpi_subtable_headers *hdr;
-	enum acpi_subtable_type type;
-};
-
 /*
  * Disable table checksum verification for the early stage due to the size
  * limitation of the current x86 early mapping implementation.
@@ -237,167 +225,6 @@ void acpi_table_print_madt_entry(struct acpi_subtable_header *header)
 	}
 }
 
-static unsigned long __init_or_acpilib
-acpi_get_entry_type(struct acpi_subtable_entry *entry)
-{
-	switch (entry->type) {
-	case ACPI_SUBTABLE_COMMON:
-		return entry->hdr->common.type;
-	case ACPI_SUBTABLE_HMAT:
-		return entry->hdr->hmat.type;
-	case ACPI_SUBTABLE_PRMT:
-		return 0;
-	case ACPI_SUBTABLE_CEDT:
-		return entry->hdr->cedt.type;
-	}
-	return 0;
-}
-
-static unsigned long __init_or_acpilib
-acpi_get_entry_length(struct acpi_subtable_entry *entry)
-{
-	switch (entry->type) {
-	case ACPI_SUBTABLE_COMMON:
-		return entry->hdr->common.length;
-	case ACPI_SUBTABLE_HMAT:
-		return entry->hdr->hmat.length;
-	case ACPI_SUBTABLE_PRMT:
-		return entry->hdr->prmt.length;
-	case ACPI_SUBTABLE_CEDT:
-		return entry->hdr->cedt.length;
-	}
-	return 0;
-}
-
-static unsigned long __init_or_acpilib
-acpi_get_subtable_header_length(struct acpi_subtable_entry *entry)
-{
-	switch (entry->type) {
-	case ACPI_SUBTABLE_COMMON:
-		return sizeof(entry->hdr->common);
-	case ACPI_SUBTABLE_HMAT:
-		return sizeof(entry->hdr->hmat);
-	case ACPI_SUBTABLE_PRMT:
-		return sizeof(entry->hdr->prmt);
-	case ACPI_SUBTABLE_CEDT:
-		return sizeof(entry->hdr->cedt);
-	}
-	return 0;
-}
-
-static enum acpi_subtable_type __init_or_acpilib
-acpi_get_subtable_type(char *id)
-{
-	if (strncmp(id, ACPI_SIG_HMAT, 4) == 0)
-		return ACPI_SUBTABLE_HMAT;
-	if (strncmp(id, ACPI_SIG_PRMT, 4) == 0)
-		return ACPI_SUBTABLE_PRMT;
-	if (strncmp(id, ACPI_SIG_CEDT, 4) == 0)
-		return ACPI_SUBTABLE_CEDT;
-	return ACPI_SUBTABLE_COMMON;
-}
-
-static __init_or_acpilib bool has_handler(struct acpi_subtable_proc *proc)
-{
-	return proc->handler || proc->handler_arg;
-}
-
-static __init_or_acpilib int call_handler(struct acpi_subtable_proc *proc,
-					  union acpi_subtable_headers *hdr,
-					  unsigned long end)
-{
-	if (proc->handler)
-		return proc->handler(hdr, end);
-	if (proc->handler_arg)
-		return proc->handler_arg(hdr, proc->arg, end);
-	return -EINVAL;
-}
-
-/**
- * acpi_parse_entries_array - for each proc_num find a suitable subtable
- *
- * @id: table id (for debugging purposes)
- * @table_size: size of the root table
- * @table_header: where does the table start?
- * @proc: array of acpi_subtable_proc struct containing entry id
- *        and associated handler with it
- * @proc_num: how big proc is?
- * @max_entries: how many entries can we process?
- *
- * For each proc_num find a subtable with proc->id and run proc->handler
- * on it. Assumption is that there's only single handler for particular
- * entry id.
- *
- * The table_size is not the size of the complete ACPI table (the length
- * field in the header struct), but only the size of the root table; i.e.,
- * the offset from the very first byte of the complete ACPI table, to the
- * first byte of the very first subtable.
- *
- * On success returns sum of all matching entries for all proc handlers.
- * Otherwise, -ENODEV or -EINVAL is returned.
- */
-static int __init_or_acpilib acpi_parse_entries_array(
-	char *id, unsigned long table_size,
-	struct acpi_table_header *table_header, struct acpi_subtable_proc *proc,
-	int proc_num, unsigned int max_entries)
-{
-	struct acpi_subtable_entry entry;
-	unsigned long table_end, subtable_len, entry_len;
-	int count = 0;
-	int errs = 0;
-	int i;
-
-	table_end = (unsigned long)table_header + table_header->length;
-
-	/* Parse all entries looking for a match. */
-
-	entry.type = acpi_get_subtable_type(id);
-	entry.hdr = (union acpi_subtable_headers *)
-	    ((unsigned long)table_header + table_size);
-	subtable_len = acpi_get_subtable_header_length(&entry);
-
-	while (((unsigned long)entry.hdr) + subtable_len  < table_end) {
-		if (max_entries && count >= max_entries)
-			break;
-
-		for (i = 0; i < proc_num; i++) {
-			if (acpi_get_entry_type(&entry) != proc[i].id)
-				continue;
-			if (!has_handler(&proc[i]) ||
-			    (!errs &&
-			     call_handler(&proc[i], entry.hdr, table_end))) {
-				errs++;
-				continue;
-			}
-
-			proc[i].count++;
-			break;
-		}
-		if (i != proc_num)
-			count++;
-
-		/*
-		 * If entry->length is 0, break from this loop to avoid
-		 * infinite loop.
-		 */
-		entry_len = acpi_get_entry_length(&entry);
-		if (entry_len == 0) {
-			pr_err("[%4.4s:0x%02x] Invalid zero length\n", id, proc->id);
-			return -EINVAL;
-		}
-
-		entry.hdr = (union acpi_subtable_headers *)
-		    ((unsigned long)entry.hdr + entry_len);
-	}
-
-	if (max_entries && count > max_entries) {
-		pr_warn("[%4.4s:0x%02x] found the maximum %i entries\n",
-			id, proc->id, count);
-	}
-
-	return errs ? -EINVAL : count;
-}
-
 int __init_or_acpilib acpi_table_parse_entries_array(
 	char *id, unsigned long table_size, struct acpi_subtable_proc *proc,
 	int proc_num, unsigned int max_entries)
diff --git a/include/linux/acpi.h b/include/linux/acpi.h
index a73246c3c35e..0d334975c0c9 100644
--- a/include/linux/acpi.h
+++ b/include/linux/acpi.h
@@ -15,6 +15,7 @@
 #include <linux/mod_devicetable.h>
 #include <linux/property.h>
 #include <linux/uuid.h>
+#include <linux/fw_table.h>
 
 struct irq_domain;
 struct irq_domain_ops;
@@ -24,6 +25,16 @@ struct irq_domain_ops;
 #endif
 #include <acpi/acpi.h>
 
+#ifdef CONFIG_ACPI_TABLE_LIB
+#define EXPORT_SYMBOL_ACPI_LIB(x) EXPORT_SYMBOL_NS_GPL(x, ACPI)
+#define __init_or_acpilib
+#define __initdata_or_acpilib
+#else
+#define EXPORT_SYMBOL_ACPI_LIB(x)
+#define __init_or_acpilib __init
+#define __initdata_or_acpilib __initdata
+#endif
+
 #ifdef	CONFIG_ACPI
 
 #include <linux/list.h>
@@ -119,21 +130,8 @@ enum acpi_address_range_id {
 
 
 /* Table Handlers */
-union acpi_subtable_headers {
-	struct acpi_subtable_header common;
-	struct acpi_hmat_structure hmat;
-	struct acpi_prmt_module_header prmt;
-	struct acpi_cedt_header cedt;
-};
-
 typedef int (*acpi_tbl_table_handler)(struct acpi_table_header *table);
 
-typedef int (*acpi_tbl_entry_handler)(union acpi_subtable_headers *header,
-				      const unsigned long end);
-
-typedef int (*acpi_tbl_entry_handler_arg)(union acpi_subtable_headers *header,
-					  void *arg, const unsigned long end);
-
 /* Debugger support */
 
 struct acpi_debugger_ops {
@@ -207,14 +205,6 @@ static inline int acpi_debugger_notify_command_complete(void)
 		(!entry) || (unsigned long)entry + sizeof(*entry) > end ||  \
 		((struct acpi_subtable_header *)entry)->length < sizeof(*entry))
 
-struct acpi_subtable_proc {
-	int id;
-	acpi_tbl_entry_handler handler;
-	acpi_tbl_entry_handler_arg handler_arg;
-	void *arg;
-	int count;
-};
-
 void __iomem *__acpi_map_table(unsigned long phys, unsigned long size);
 void __acpi_unmap_table(void __iomem *map, unsigned long size);
 int early_acpi_boot_init(void);
@@ -229,16 +219,6 @@ void acpi_reserve_initial_tables (void);
 void acpi_table_init_complete (void);
 int acpi_table_init (void);
 
-#ifdef CONFIG_ACPI_TABLE_LIB
-#define EXPORT_SYMBOL_ACPI_LIB(x) EXPORT_SYMBOL_NS_GPL(x, ACPI)
-#define __init_or_acpilib
-#define __initdata_or_acpilib
-#else
-#define EXPORT_SYMBOL_ACPI_LIB(x)
-#define __init_or_acpilib __init
-#define __initdata_or_acpilib __initdata
-#endif
-
 int acpi_table_parse(char *id, acpi_tbl_table_handler handler);
 int __init_or_acpilib acpi_table_parse_entries(char *id,
 		unsigned long table_size, int entry_id,
diff --git a/include/linux/fw_table.h b/include/linux/fw_table.h
new file mode 100644
index 000000000000..ff8fa58d5818
--- /dev/null
+++ b/include/linux/fw_table.h
@@ -0,0 +1,43 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ *  fw_tables.h - Parsing support for ACPI and ACPI-like tables provided by
+ *                platform or device firmware
+ *
+ *  Copyright (C) 2001 Paul Diefenbaugh <paul.s.diefenbaugh@intel.com>
+ *  Copyright (C) 2023 Intel Corp.
+ */
+#ifndef _FW_TABLE_H_
+#define _FW_TABLE_H_
+
+union acpi_subtable_headers;
+
+typedef int (*acpi_tbl_entry_handler)(union acpi_subtable_headers *header,
+				      const unsigned long end);
+
+typedef int (*acpi_tbl_entry_handler_arg)(union acpi_subtable_headers *header,
+					  void *arg, const unsigned long end);
+
+struct acpi_subtable_proc {
+	int id;
+	acpi_tbl_entry_handler handler;
+	acpi_tbl_entry_handler_arg handler_arg;
+	void *arg;
+	int count;
+};
+
+#include <linux/acpi.h>
+#include <acpi/acpi.h>
+
+union acpi_subtable_headers {
+	struct acpi_subtable_header common;
+	struct acpi_hmat_structure hmat;
+	struct acpi_prmt_module_header prmt;
+	struct acpi_cedt_header cedt;
+};
+
+int acpi_parse_entries_array(char *id, unsigned long table_size,
+			     struct acpi_table_header *table_header,
+			     struct acpi_subtable_proc *proc,
+			     int proc_num, unsigned int max_entries);
+
+#endif
diff --git a/lib/Kconfig b/lib/Kconfig
index c686f4adc124..d0474e251da5 100644
--- a/lib/Kconfig
+++ b/lib/Kconfig
@@ -764,3 +764,6 @@ config ASN1_ENCODER
 
 config POLYNOMIAL
        tristate
+
+config FIRMWARE_TABLE
+	bool
diff --git a/lib/Makefile b/lib/Makefile
index 740109b6e2c8..f113b44bc4e4 100644
--- a/lib/Makefile
+++ b/lib/Makefile
@@ -405,6 +405,8 @@ obj-$(CONFIG_SIPHASH_KUNIT_TEST) += siphash_kunit.o
 
 obj-$(CONFIG_GENERIC_LIB_DEVMEM_IS_ALLOWED) += devmem_is_allowed.o
 
+obj-$(CONFIG_FIRMWARE_TABLE) += fw_table.o
+
 # FORTIFY_SOURCE compile-time behavior tests
 TEST_FORTIFY_SRCS = $(wildcard $(srctree)/$(src)/test_fortify/*-*.c)
 TEST_FORTIFY_LOGS = $(patsubst $(srctree)/$(src)/%.c, %.log, $(TEST_FORTIFY_SRCS))
diff --git a/lib/fw_table.c b/lib/fw_table.c
new file mode 100644
index 000000000000..e84bd0866e10
--- /dev/null
+++ b/lib/fw_table.c
@@ -0,0 +1,189 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ *  fw_tables.c - Parsing support for ACPI and ACPI-like tables provided by
+ *                platform or device firmware
+ *
+ *  Copyright (C) 2001 Paul Diefenbaugh <paul.s.diefenbaugh@intel.com>
+ *  Copyright (C) 2023 Intel Corp.
+ */
+#include <linux/errno.h>
+#include <linux/fw_table.h>
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/string.h>
+#include <linux/types.h>
+
+enum acpi_subtable_type {
+	ACPI_SUBTABLE_COMMON,
+	ACPI_SUBTABLE_HMAT,
+	ACPI_SUBTABLE_PRMT,
+	ACPI_SUBTABLE_CEDT,
+};
+
+struct acpi_subtable_entry {
+	union acpi_subtable_headers *hdr;
+	enum acpi_subtable_type type;
+};
+
+static unsigned long __init_or_acpilib
+acpi_get_entry_type(struct acpi_subtable_entry *entry)
+{
+	switch (entry->type) {
+	case ACPI_SUBTABLE_COMMON:
+		return entry->hdr->common.type;
+	case ACPI_SUBTABLE_HMAT:
+		return entry->hdr->hmat.type;
+	case ACPI_SUBTABLE_PRMT:
+		return 0;
+	case ACPI_SUBTABLE_CEDT:
+		return entry->hdr->cedt.type;
+	}
+	return 0;
+}
+
+static unsigned long __init_or_acpilib
+acpi_get_entry_length(struct acpi_subtable_entry *entry)
+{
+	switch (entry->type) {
+	case ACPI_SUBTABLE_COMMON:
+		return entry->hdr->common.length;
+	case ACPI_SUBTABLE_HMAT:
+		return entry->hdr->hmat.length;
+	case ACPI_SUBTABLE_PRMT:
+		return entry->hdr->prmt.length;
+	case ACPI_SUBTABLE_CEDT:
+		return entry->hdr->cedt.length;
+	}
+	return 0;
+}
+
+static unsigned long __init_or_acpilib
+acpi_get_subtable_header_length(struct acpi_subtable_entry *entry)
+{
+	switch (entry->type) {
+	case ACPI_SUBTABLE_COMMON:
+		return sizeof(entry->hdr->common);
+	case ACPI_SUBTABLE_HMAT:
+		return sizeof(entry->hdr->hmat);
+	case ACPI_SUBTABLE_PRMT:
+		return sizeof(entry->hdr->prmt);
+	case ACPI_SUBTABLE_CEDT:
+		return sizeof(entry->hdr->cedt);
+	}
+	return 0;
+}
+
+static enum acpi_subtable_type __init_or_acpilib
+acpi_get_subtable_type(char *id)
+{
+	if (strncmp(id, ACPI_SIG_HMAT, 4) == 0)
+		return ACPI_SUBTABLE_HMAT;
+	if (strncmp(id, ACPI_SIG_PRMT, 4) == 0)
+		return ACPI_SUBTABLE_PRMT;
+	if (strncmp(id, ACPI_SIG_CEDT, 4) == 0)
+		return ACPI_SUBTABLE_CEDT;
+	return ACPI_SUBTABLE_COMMON;
+}
+
+static __init_or_acpilib bool has_handler(struct acpi_subtable_proc *proc)
+{
+	return proc->handler || proc->handler_arg;
+}
+
+static __init_or_acpilib int call_handler(struct acpi_subtable_proc *proc,
+					  union acpi_subtable_headers *hdr,
+					  unsigned long end)
+{
+	if (proc->handler)
+		return proc->handler(hdr, end);
+	if (proc->handler_arg)
+		return proc->handler_arg(hdr, proc->arg, end);
+	return -EINVAL;
+}
+
+/**
+ * acpi_parse_entries_array - for each proc_num find a suitable subtable
+ *
+ * @id: table id (for debugging purposes)
+ * @table_size: size of the root table
+ * @table_header: where does the table start?
+ * @proc: array of acpi_subtable_proc struct containing entry id
+ *        and associated handler with it
+ * @proc_num: how big proc is?
+ * @max_entries: how many entries can we process?
+ *
+ * For each proc_num find a subtable with proc->id and run proc->handler
+ * on it. Assumption is that there's only single handler for particular
+ * entry id.
+ *
+ * The table_size is not the size of the complete ACPI table (the length
+ * field in the header struct), but only the size of the root table; i.e.,
+ * the offset from the very first byte of the complete ACPI table, to the
+ * first byte of the very first subtable.
+ *
+ * On success returns sum of all matching entries for all proc handlers.
+ * Otherwise, -ENODEV or -EINVAL is returned.
+ */
+int __init_or_acpilib
+acpi_parse_entries_array(char *id, unsigned long table_size,
+			 struct acpi_table_header *table_header,
+			 struct acpi_subtable_proc *proc,
+			 int proc_num, unsigned int max_entries)
+{
+	unsigned long table_end, subtable_len, entry_len;
+	struct acpi_subtable_entry entry;
+	int count = 0;
+	int errs = 0;
+	int i;
+
+	table_end = (unsigned long)table_header + table_header->length;
+
+	/* Parse all entries looking for a match. */
+
+	entry.type = acpi_get_subtable_type(id);
+	entry.hdr = (union acpi_subtable_headers *)
+	    ((unsigned long)table_header + table_size);
+	subtable_len = acpi_get_subtable_header_length(&entry);
+
+	while (((unsigned long)entry.hdr) + subtable_len  < table_end) {
+		if (max_entries && count >= max_entries)
+			break;
+
+		for (i = 0; i < proc_num; i++) {
+			if (acpi_get_entry_type(&entry) != proc[i].id)
+				continue;
+			if (!has_handler(&proc[i]) ||
+			    (!errs &&
+			     call_handler(&proc[i], entry.hdr, table_end))) {
+				errs++;
+				continue;
+			}
+
+			proc[i].count++;
+			break;
+		}
+		if (i != proc_num)
+			count++;
+
+		/*
+		 * If entry->length is 0, break from this loop to avoid
+		 * infinite loop.
+		 */
+		entry_len = acpi_get_entry_length(&entry);
+		if (entry_len == 0) {
+			pr_err("[%4.4s:0x%02x] Invalid zero length\n", id, proc->id);
+			return -EINVAL;
+		}
+
+		entry.hdr = (union acpi_subtable_headers *)
+		    ((unsigned long)entry.hdr + entry_len);
+	}
+
+	if (max_entries && count > max_entries) {
+		pr_warn("[%4.4s:0x%02x] found the maximum %i entries\n",
+			id, proc->id, count);
+	}
+
+	return errs ? -EINVAL : count;
+}
+EXPORT_SYMBOL_GPL(acpi_parse_entries_array);
-- 
cgit v1.2.3


From e719b4d156749f02eafed31a3c515f2aa9dcc72a Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Wed, 27 Sep 2023 11:34:07 +0200
Subject: block: Provide bdev_open_* functions

Create struct bdev_handle that contains all parameters that need to be
passed to blkdev_put() and provide bdev_open_* functions that return
this structure instead of plain bdev pointer. This will eventually allow
us to pass one more argument to blkdev_put() (renamed to bdev_release())
without too much hassle.

Acked-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Christian Brauner <brauner@kernel.org>
Signed-off-by: Jan Kara <jack@suse.cz>
Link: https://lore.kernel.org/r/20230927093442.25915-1-jack@suse.cz
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 block/bdev.c           | 48 ++++++++++++++++++++++++++++++++++++++++++++++++
 include/linux/blkdev.h | 10 ++++++++++
 2 files changed, 58 insertions(+)

(limited to 'include/linux')

diff --git a/block/bdev.c b/block/bdev.c
index f3b13aa1b7d4..bdc7d739882b 100644
--- a/block/bdev.c
+++ b/block/bdev.c
@@ -829,6 +829,25 @@ put_blkdev:
 }
 EXPORT_SYMBOL(blkdev_get_by_dev);
 
+struct bdev_handle *bdev_open_by_dev(dev_t dev, blk_mode_t mode, void *holder,
+				     const struct blk_holder_ops *hops)
+{
+	struct bdev_handle *handle = kmalloc(sizeof(*handle), GFP_KERNEL);
+	struct block_device *bdev;
+
+	if (!handle)
+		return ERR_PTR(-ENOMEM);
+	bdev = blkdev_get_by_dev(dev, mode, holder, hops);
+	if (IS_ERR(bdev)) {
+		kfree(handle);
+		return ERR_CAST(bdev);
+	}
+	handle->bdev = bdev;
+	handle->holder = holder;
+	return handle;
+}
+EXPORT_SYMBOL(bdev_open_by_dev);
+
 /**
  * blkdev_get_by_path - open a block device by name
  * @path: path to the block device to open
@@ -867,6 +886,28 @@ struct block_device *blkdev_get_by_path(const char *path, blk_mode_t mode,
 }
 EXPORT_SYMBOL(blkdev_get_by_path);
 
+struct bdev_handle *bdev_open_by_path(const char *path, blk_mode_t mode,
+		void *holder, const struct blk_holder_ops *hops)
+{
+	struct bdev_handle *handle;
+	dev_t dev;
+	int error;
+
+	error = lookup_bdev(path, &dev);
+	if (error)
+		return ERR_PTR(error);
+
+	handle = bdev_open_by_dev(dev, mode, holder, hops);
+	if (!IS_ERR(handle) && (mode & BLK_OPEN_WRITE) &&
+	    bdev_read_only(handle->bdev)) {
+		bdev_release(handle);
+		return ERR_PTR(-EACCES);
+	}
+
+	return handle;
+}
+EXPORT_SYMBOL(bdev_open_by_path);
+
 void blkdev_put(struct block_device *bdev, void *holder)
 {
 	struct gendisk *disk = bdev->bd_disk;
@@ -903,6 +944,13 @@ void blkdev_put(struct block_device *bdev, void *holder)
 }
 EXPORT_SYMBOL(blkdev_put);
 
+void bdev_release(struct bdev_handle *handle)
+{
+	blkdev_put(handle->bdev, handle->holder);
+	kfree(handle);
+}
+EXPORT_SYMBOL(bdev_release);
+
 /**
  * lookup_bdev() - Look up a struct block_device by name.
  * @pathname: Name of the block device in the filesystem.
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index eef450f25982..03d3adc3ff34 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -1479,14 +1479,24 @@ extern const struct blk_holder_ops fs_holder_ops;
 #define sb_open_mode(flags) \
 	(BLK_OPEN_READ | (((flags) & SB_RDONLY) ? 0 : BLK_OPEN_WRITE))
 
+struct bdev_handle {
+	struct block_device *bdev;
+	void *holder;
+};
+
 struct block_device *blkdev_get_by_dev(dev_t dev, blk_mode_t mode, void *holder,
 		const struct blk_holder_ops *hops);
 struct block_device *blkdev_get_by_path(const char *path, blk_mode_t mode,
 		void *holder, const struct blk_holder_ops *hops);
+struct bdev_handle *bdev_open_by_dev(dev_t dev, blk_mode_t mode, void *holder,
+		const struct blk_holder_ops *hops);
+struct bdev_handle *bdev_open_by_path(const char *path, blk_mode_t mode,
+		void *holder, const struct blk_holder_ops *hops);
 int bd_prepare_to_claim(struct block_device *bdev, void *holder,
 		const struct blk_holder_ops *hops);
 void bd_abort_claiming(struct block_device *bdev, void *holder);
 void blkdev_put(struct block_device *bdev, void *holder);
+void bdev_release(struct bdev_handle *handle);
 
 /* just for blk-cgroup, don't use elsewhere */
 struct block_device *blkdev_get_no_open(dev_t dev);
-- 
cgit v1.2.3


From 841dd789b8625eb9288aaa2be9f10872e6622033 Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Wed, 27 Sep 2023 11:34:08 +0200
Subject: block: Use bdev_open_by_dev() in blkdev_open()

Convert blkdev_open() to use bdev_open_by_dev(). To be able to propagate
handle from blkdev_open() to blkdev_release() we need to stop using
existence of file->private_data to determine exclusive block device
opens. Use bdev_handle->mode for this purpose since file->f_flags
isn't usable for this (O_EXCL is cleared from the flags during open).

Acked-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Christian Brauner <brauner@kernel.org>
Signed-off-by: Jan Kara <jack@suse.cz>
Link: https://lore.kernel.org/r/20230927093442.25915-2-jack@suse.cz
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 block/bdev.c           |  3 +++
 block/fops.c           | 44 ++++++++++++++++++++++++++++----------------
 include/linux/blkdev.h |  1 +
 3 files changed, 32 insertions(+), 16 deletions(-)

(limited to 'include/linux')

diff --git a/block/bdev.c b/block/bdev.c
index bdc7d739882b..4628dcb1da8a 100644
--- a/block/bdev.c
+++ b/block/bdev.c
@@ -844,6 +844,9 @@ struct bdev_handle *bdev_open_by_dev(dev_t dev, blk_mode_t mode, void *holder,
 	}
 	handle->bdev = bdev;
 	handle->holder = holder;
+	if (holder)
+		mode |= BLK_OPEN_EXCL;
+	handle->mode = mode;
 	return handle;
 }
 EXPORT_SYMBOL(bdev_open_by_dev);
diff --git a/block/fops.c b/block/fops.c
index 73e42742543f..0abaac705daf 100644
--- a/block/fops.c
+++ b/block/fops.c
@@ -542,15 +542,31 @@ static int blkdev_fsync(struct file *filp, loff_t start, loff_t end,
 	return error;
 }
 
+/**
+ * file_to_blk_mode - get block open flags from file flags
+ * @file: file whose open flags should be converted
+ *
+ * Look at file open flags and generate corresponding block open flags from
+ * them. The function works both for file just being open (e.g. during ->open
+ * callback) and for file that is already open. This is actually non-trivial
+ * (see comment in the function).
+ */
 blk_mode_t file_to_blk_mode(struct file *file)
 {
 	blk_mode_t mode = 0;
+	struct bdev_handle *handle = file->private_data;
 
 	if (file->f_mode & FMODE_READ)
 		mode |= BLK_OPEN_READ;
 	if (file->f_mode & FMODE_WRITE)
 		mode |= BLK_OPEN_WRITE;
-	if (file->private_data)
+	/*
+	 * do_dentry_open() clears O_EXCL from f_flags, use handle->mode to
+	 * determine whether the open was exclusive for already open files.
+	 */
+	if (handle)
+		mode |= handle->mode & BLK_OPEN_EXCL;
+	else if (file->f_flags & O_EXCL)
 		mode |= BLK_OPEN_EXCL;
 	if (file->f_flags & O_NDELAY)
 		mode |= BLK_OPEN_NDELAY;
@@ -568,7 +584,8 @@ blk_mode_t file_to_blk_mode(struct file *file)
 
 static int blkdev_open(struct inode *inode, struct file *filp)
 {
-	struct block_device *bdev;
+	struct bdev_handle *handle;
+	blk_mode_t mode;
 
 	/*
 	 * Preserve backwards compatibility and allow large file access
@@ -579,29 +596,24 @@ static int blkdev_open(struct inode *inode, struct file *filp)
 	filp->f_flags |= O_LARGEFILE;
 	filp->f_mode |= FMODE_BUF_RASYNC | FMODE_CAN_ODIRECT;
 
-	/*
-	 * Use the file private data to store the holder for exclusive openes.
-	 * file_to_blk_mode relies on it being present to set BLK_OPEN_EXCL.
-	 */
-	if (filp->f_flags & O_EXCL)
-		filp->private_data = filp;
-
-	bdev = blkdev_get_by_dev(inode->i_rdev, file_to_blk_mode(filp),
-				 filp->private_data, NULL);
-	if (IS_ERR(bdev))
-		return PTR_ERR(bdev);
+	mode = file_to_blk_mode(filp);
+	handle = bdev_open_by_dev(inode->i_rdev, mode,
+			mode & BLK_OPEN_EXCL ? filp : NULL, NULL);
+	if (IS_ERR(handle))
+		return PTR_ERR(handle);
 
-	if (bdev_nowait(bdev))
+	if (bdev_nowait(handle->bdev))
 		filp->f_mode |= FMODE_NOWAIT;
 
-	filp->f_mapping = bdev->bd_inode->i_mapping;
+	filp->f_mapping = handle->bdev->bd_inode->i_mapping;
 	filp->f_wb_err = filemap_sample_wb_err(filp->f_mapping);
+	filp->private_data = handle;
 	return 0;
 }
 
 static int blkdev_release(struct inode *inode, struct file *filp)
 {
-	blkdev_put(I_BDEV(filp->f_mapping->host), filp->private_data);
+	bdev_release(filp->private_data);
 	return 0;
 }
 
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 03d3adc3ff34..51fa7ffdee83 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -1482,6 +1482,7 @@ extern const struct blk_holder_ops fs_holder_ops;
 struct bdev_handle {
 	struct block_device *bdev;
 	void *holder;
+	blk_mode_t mode;
 };
 
 struct block_device *blkdev_get_by_dev(dev_t dev, blk_mode_t mode, void *holder,
-- 
cgit v1.2.3


From 7ac86df899f0025cfd554c057efb30e5a4ef95b0 Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Wed, 27 Sep 2023 11:34:11 +0200
Subject: pktcdvd: Convert to bdev_open_by_dev()

Convert pktcdvd to use bdev_open_by_dev().

Acked-by: Christoph Hellwig <hch@lst.de>
Acked-by: Christian Brauner <brauner@kernel.org>
Signed-off-by: Jan Kara <jack@suse.cz>
Link: https://lore.kernel.org/r/20230927093442.25915-5-jack@suse.cz
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 drivers/block/pktcdvd.c | 76 ++++++++++++++++++++++++++-----------------------
 include/linux/pktcdvd.h |  4 ++-
 2 files changed, 44 insertions(+), 36 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/block/pktcdvd.c b/drivers/block/pktcdvd.c
index a1428538bda5..d56d972aadb3 100644
--- a/drivers/block/pktcdvd.c
+++ b/drivers/block/pktcdvd.c
@@ -340,8 +340,8 @@ static ssize_t device_map_show(const struct class *c, const struct class_attribu
 		n += sysfs_emit_at(data, n, "%s %u:%u %u:%u\n",
 			pd->disk->disk_name,
 			MAJOR(pd->pkt_dev), MINOR(pd->pkt_dev),
-			MAJOR(pd->bdev->bd_dev),
-			MINOR(pd->bdev->bd_dev));
+			MAJOR(pd->bdev_handle->bdev->bd_dev),
+			MINOR(pd->bdev_handle->bdev->bd_dev));
 	}
 	mutex_unlock(&ctl_mutex);
 	return n;
@@ -437,7 +437,8 @@ static int pkt_seq_show(struct seq_file *m, void *p)
 	char *msg;
 	int states[PACKET_NUM_STATES];
 
-	seq_printf(m, "Writer %s mapped to %pg:\n", pd->disk->disk_name, pd->bdev);
+	seq_printf(m, "Writer %s mapped to %pg:\n", pd->disk->disk_name,
+		   pd->bdev_handle->bdev);
 
 	seq_printf(m, "\nSettings:\n");
 	seq_printf(m, "\tpacket size:\t\t%dkB\n", pd->settings.size / 2);
@@ -714,7 +715,7 @@ static void pkt_rbtree_insert(struct pktcdvd_device *pd, struct pkt_rb_node *nod
  */
 static int pkt_generic_packet(struct pktcdvd_device *pd, struct packet_command *cgc)
 {
-	struct request_queue *q = bdev_get_queue(pd->bdev);
+	struct request_queue *q = bdev_get_queue(pd->bdev_handle->bdev);
 	struct scsi_cmnd *scmd;
 	struct request *rq;
 	int ret = 0;
@@ -1047,7 +1048,8 @@ static void pkt_gather_data(struct pktcdvd_device *pd, struct packet_data *pkt)
 			continue;
 
 		bio = pkt->r_bios[f];
-		bio_init(bio, pd->bdev, bio->bi_inline_vecs, 1, REQ_OP_READ);
+		bio_init(bio, pd->bdev_handle->bdev, bio->bi_inline_vecs, 1,
+			 REQ_OP_READ);
 		bio->bi_iter.bi_sector = pkt->sector + f * (CD_FRAMESIZE >> 9);
 		bio->bi_end_io = pkt_end_io_read;
 		bio->bi_private = pkt;
@@ -1262,8 +1264,8 @@ static void pkt_start_write(struct pktcdvd_device *pd, struct packet_data *pkt)
 	struct device *ddev = disk_to_dev(pd->disk);
 	int f;
 
-	bio_init(pkt->w_bio, pd->bdev, pkt->w_bio->bi_inline_vecs, pkt->frames,
-		 REQ_OP_WRITE);
+	bio_init(pkt->w_bio, pd->bdev_handle->bdev, pkt->w_bio->bi_inline_vecs,
+		 pkt->frames, REQ_OP_WRITE);
 	pkt->w_bio->bi_iter.bi_sector = pkt->sector;
 	pkt->w_bio->bi_end_io = pkt_end_io_packet_write;
 	pkt->w_bio->bi_private = pkt;
@@ -2160,18 +2162,20 @@ static int pkt_open_dev(struct pktcdvd_device *pd, bool write)
 	int ret;
 	long lba;
 	struct request_queue *q;
-	struct block_device *bdev;
+	struct bdev_handle *bdev_handle;
 
 	/*
 	 * We need to re-open the cdrom device without O_NONBLOCK to be able
 	 * to read/write from/to it. It is already opened in O_NONBLOCK mode
 	 * so open should not fail.
 	 */
-	bdev = blkdev_get_by_dev(pd->bdev->bd_dev, BLK_OPEN_READ, pd, NULL);
-	if (IS_ERR(bdev)) {
-		ret = PTR_ERR(bdev);
+	bdev_handle = bdev_open_by_dev(pd->bdev_handle->bdev->bd_dev,
+				       BLK_OPEN_READ, pd, NULL);
+	if (IS_ERR(bdev_handle)) {
+		ret = PTR_ERR(bdev_handle);
 		goto out;
 	}
+	pd->open_bdev_handle = bdev_handle;
 
 	ret = pkt_get_last_written(pd, &lba);
 	if (ret) {
@@ -2180,9 +2184,9 @@ static int pkt_open_dev(struct pktcdvd_device *pd, bool write)
 	}
 
 	set_capacity(pd->disk, lba << 2);
-	set_capacity_and_notify(pd->bdev->bd_disk, lba << 2);
+	set_capacity_and_notify(pd->bdev_handle->bdev->bd_disk, lba << 2);
 
-	q = bdev_get_queue(pd->bdev);
+	q = bdev_get_queue(pd->bdev_handle->bdev);
 	if (write) {
 		ret = pkt_open_write(pd);
 		if (ret)
@@ -2214,7 +2218,7 @@ static int pkt_open_dev(struct pktcdvd_device *pd, bool write)
 	return 0;
 
 out_putdev:
-	blkdev_put(bdev, pd);
+	bdev_release(bdev_handle);
 out:
 	return ret;
 }
@@ -2233,7 +2237,8 @@ static void pkt_release_dev(struct pktcdvd_device *pd, int flush)
 	pkt_lock_door(pd, 0);
 
 	pkt_set_speed(pd, MAX_SPEED, MAX_SPEED);
-	blkdev_put(pd->bdev, pd);
+	bdev_release(pd->open_bdev_handle);
+	pd->open_bdev_handle = NULL;
 
 	pkt_shrink_pktlist(pd);
 }
@@ -2321,8 +2326,8 @@ static void pkt_end_io_read_cloned(struct bio *bio)
 
 static void pkt_make_request_read(struct pktcdvd_device *pd, struct bio *bio)
 {
-	struct bio *cloned_bio =
-		bio_alloc_clone(pd->bdev, bio, GFP_NOIO, &pkt_bio_set);
+	struct bio *cloned_bio = bio_alloc_clone(pd->bdev_handle->bdev, bio,
+		GFP_NOIO, &pkt_bio_set);
 	struct packet_stacked_data *psd = mempool_alloc(&psd_pool, GFP_NOIO);
 
 	psd->pd = pd;
@@ -2492,7 +2497,7 @@ static int pkt_new_dev(struct pktcdvd_device *pd, dev_t dev)
 {
 	struct device *ddev = disk_to_dev(pd->disk);
 	int i;
-	struct block_device *bdev;
+	struct bdev_handle *bdev_handle;
 	struct scsi_device *sdev;
 
 	if (pd->pkt_dev == dev) {
@@ -2503,8 +2508,9 @@ static int pkt_new_dev(struct pktcdvd_device *pd, dev_t dev)
 		struct pktcdvd_device *pd2 = pkt_devs[i];
 		if (!pd2)
 			continue;
-		if (pd2->bdev->bd_dev == dev) {
-			dev_err(ddev, "%pg already setup\n", pd2->bdev);
+		if (pd2->bdev_handle->bdev->bd_dev == dev) {
+			dev_err(ddev, "%pg already setup\n",
+				pd2->bdev_handle->bdev);
 			return -EBUSY;
 		}
 		if (pd2->pkt_dev == dev) {
@@ -2513,13 +2519,13 @@ static int pkt_new_dev(struct pktcdvd_device *pd, dev_t dev)
 		}
 	}
 
-	bdev = blkdev_get_by_dev(dev, BLK_OPEN_READ | BLK_OPEN_NDELAY, NULL,
-				 NULL);
-	if (IS_ERR(bdev))
-		return PTR_ERR(bdev);
-	sdev = scsi_device_from_queue(bdev->bd_disk->queue);
+	bdev_handle = bdev_open_by_dev(dev, BLK_OPEN_READ | BLK_OPEN_NDELAY,
+				       NULL, NULL);
+	if (IS_ERR(bdev_handle))
+		return PTR_ERR(bdev_handle);
+	sdev = scsi_device_from_queue(bdev_handle->bdev->bd_disk->queue);
 	if (!sdev) {
-		blkdev_put(bdev, NULL);
+		bdev_release(bdev_handle);
 		return -EINVAL;
 	}
 	put_device(&sdev->sdev_gendev);
@@ -2527,8 +2533,8 @@ static int pkt_new_dev(struct pktcdvd_device *pd, dev_t dev)
 	/* This is safe, since we have a reference from open(). */
 	__module_get(THIS_MODULE);
 
-	pd->bdev = bdev;
-	set_blocksize(bdev, CD_FRAMESIZE);
+	pd->bdev_handle = bdev_handle;
+	set_blocksize(bdev_handle->bdev, CD_FRAMESIZE);
 
 	pkt_init_queue(pd);
 
@@ -2540,11 +2546,11 @@ static int pkt_new_dev(struct pktcdvd_device *pd, dev_t dev)
 	}
 
 	proc_create_single_data(pd->disk->disk_name, 0, pkt_proc, pkt_seq_show, pd);
-	dev_notice(ddev, "writer mapped to %pg\n", bdev);
+	dev_notice(ddev, "writer mapped to %pg\n", bdev_handle->bdev);
 	return 0;
 
 out_mem:
-	blkdev_put(bdev, NULL);
+	bdev_release(bdev_handle);
 	/* This is safe: open() is still holding a reference. */
 	module_put(THIS_MODULE);
 	return -ENOMEM;
@@ -2599,9 +2605,9 @@ static unsigned int pkt_check_events(struct gendisk *disk,
 
 	if (!pd)
 		return 0;
-	if (!pd->bdev)
+	if (!pd->bdev_handle)
 		return 0;
-	attached_disk = pd->bdev->bd_disk;
+	attached_disk = pd->bdev_handle->bdev->bd_disk;
 	if (!attached_disk || !attached_disk->fops->check_events)
 		return 0;
 	return attached_disk->fops->check_events(attached_disk, clearing);
@@ -2686,7 +2692,7 @@ static int pkt_setup_dev(dev_t dev, dev_t* pkt_dev)
 		goto out_mem2;
 
 	/* inherit events of the host device */
-	disk->events = pd->bdev->bd_disk->events;
+	disk->events = pd->bdev_handle->bdev->bd_disk->events;
 
 	ret = add_disk(disk);
 	if (ret)
@@ -2751,7 +2757,7 @@ static int pkt_remove_dev(dev_t pkt_dev)
 	pkt_debugfs_dev_remove(pd);
 	pkt_sysfs_dev_remove(pd);
 
-	blkdev_put(pd->bdev, NULL);
+	bdev_release(pd->bdev_handle);
 
 	remove_proc_entry(pd->disk->disk_name, pkt_proc);
 	dev_notice(ddev, "writer unmapped\n");
@@ -2778,7 +2784,7 @@ static void pkt_get_status(struct pkt_ctrl_command *ctrl_cmd)
 
 	pd = pkt_find_dev_from_minor(ctrl_cmd->dev_index);
 	if (pd) {
-		ctrl_cmd->dev = new_encode_dev(pd->bdev->bd_dev);
+		ctrl_cmd->dev = new_encode_dev(pd->bdev_handle->bdev->bd_dev);
 		ctrl_cmd->pkt_dev = new_encode_dev(pd->pkt_dev);
 	} else {
 		ctrl_cmd->dev = 0;
diff --git a/include/linux/pktcdvd.h b/include/linux/pktcdvd.h
index 80cb00db42a4..79594aeb160d 100644
--- a/include/linux/pktcdvd.h
+++ b/include/linux/pktcdvd.h
@@ -154,7 +154,9 @@ struct packet_stacked_data
 
 struct pktcdvd_device
 {
-	struct block_device	*bdev;		/* dev attached */
+	struct bdev_handle	*bdev_handle;	/* dev attached */
+	/* handle acquired for bdev during pkt_open_dev() */
+	struct bdev_handle	*open_bdev_handle;
 	dev_t			pkt_dev;	/* our dev */
 	struct packet_settings	settings;
 	struct packet_stats	stats;
-- 
cgit v1.2.3


From c2fce61fb22e3235d29512558c4e93e184e2f68b Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Wed, 27 Sep 2023 11:34:16 +0200
Subject: dm: Convert to bdev_open_by_dev()

Convert device mapper to use bdev_open_by_dev() and pass the handle
around.

CC: Alasdair Kergon <agk@redhat.com>
CC: Mike Snitzer <snitzer@kernel.org>
CC: dm-devel@redhat.com
Acked-by: Christoph Hellwig <hch@lst.de>
Acked-by: Christian Brauner <brauner@kernel.org>
Signed-off-by: Jan Kara <jack@suse.cz>
Link: https://lore.kernel.org/r/20230927093442.25915-10-jack@suse.cz
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 drivers/md/dm.c               | 20 +++++++++++---------
 include/linux/device-mapper.h |  1 +
 2 files changed, 12 insertions(+), 9 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index 64a1f306c96c..f7212e8fc27f 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -724,7 +724,7 @@ static struct table_device *open_table_device(struct mapped_device *md,
 		dev_t dev, blk_mode_t mode)
 {
 	struct table_device *td;
-	struct block_device *bdev;
+	struct bdev_handle *bdev_handle;
 	u64 part_off;
 	int r;
 
@@ -733,9 +733,9 @@ static struct table_device *open_table_device(struct mapped_device *md,
 		return ERR_PTR(-ENOMEM);
 	refcount_set(&td->count, 1);
 
-	bdev = blkdev_get_by_dev(dev, mode, _dm_claim_ptr, NULL);
-	if (IS_ERR(bdev)) {
-		r = PTR_ERR(bdev);
+	bdev_handle = bdev_open_by_dev(dev, mode, _dm_claim_ptr, NULL);
+	if (IS_ERR(bdev_handle)) {
+		r = PTR_ERR(bdev_handle);
 		goto out_free_td;
 	}
 
@@ -745,20 +745,22 @@ static struct table_device *open_table_device(struct mapped_device *md,
 	 * called.
 	 */
 	if (md->disk->slave_dir) {
-		r = bd_link_disk_holder(bdev, md->disk);
+		r = bd_link_disk_holder(bdev_handle->bdev, md->disk);
 		if (r)
 			goto out_blkdev_put;
 	}
 
 	td->dm_dev.mode = mode;
-	td->dm_dev.bdev = bdev;
-	td->dm_dev.dax_dev = fs_dax_get_by_bdev(bdev, &part_off, NULL, NULL);
+	td->dm_dev.bdev = bdev_handle->bdev;
+	td->dm_dev.bdev_handle = bdev_handle;
+	td->dm_dev.dax_dev = fs_dax_get_by_bdev(bdev_handle->bdev, &part_off,
+						NULL, NULL);
 	format_dev_t(td->dm_dev.name, dev);
 	list_add(&td->list, &md->table_devices);
 	return td;
 
 out_blkdev_put:
-	blkdev_put(bdev, _dm_claim_ptr);
+	bdev_release(bdev_handle);
 out_free_td:
 	kfree(td);
 	return ERR_PTR(r);
@@ -771,7 +773,7 @@ static void close_table_device(struct table_device *td, struct mapped_device *md
 {
 	if (md->disk->slave_dir)
 		bd_unlink_disk_holder(td->dm_dev.bdev, md->disk);
-	blkdev_put(td->dm_dev.bdev, _dm_claim_ptr);
+	bdev_release(td->dm_dev.bdev_handle);
 	put_dax(td->dm_dev.dax_dev);
 	list_del(&td->list);
 	kfree(td);
diff --git a/include/linux/device-mapper.h b/include/linux/device-mapper.h
index 69d0435c7ebb..772ab4d74d94 100644
--- a/include/linux/device-mapper.h
+++ b/include/linux/device-mapper.h
@@ -165,6 +165,7 @@ void dm_error(const char *message);
 
 struct dm_dev {
 	struct block_device *bdev;
+	struct bdev_handle *bdev_handle;
 	struct dax_device *dax_dev;
 	blk_mode_t mode;
 	char name[16];
-- 
cgit v1.2.3


From 4c6bca43c547fe9bc1d7d1519b1d6430fee2cae2 Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Wed, 27 Sep 2023 11:34:24 +0200
Subject: mm/swap: Convert to use bdev_open_by_dev()

Convert swapping code to use bdev_open_by_dev() and pass the handle
around.

CC: linux-mm@kvack.org
CC: Andrew Morton <akpm@linux-foundation.org>
Acked-by: Christoph Hellwig <hch@lst.de>
Acked-by: Christian Brauner <brauner@kernel.org>
Signed-off-by: Jan Kara <jack@suse.cz>
Link: https://lore.kernel.org/r/20230927093442.25915-18-jack@suse.cz
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 include/linux/swap.h |  1 +
 mm/swapfile.c        | 23 ++++++++++++-----------
 2 files changed, 13 insertions(+), 11 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/swap.h b/include/linux/swap.h
index 493487ed7c38..f6dd6575b905 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -298,6 +298,7 @@ struct swap_info_struct {
 	unsigned int __percpu *cluster_next_cpu; /*percpu index for next allocation */
 	struct percpu_cluster __percpu *percpu_cluster; /* per cpu's swap location */
 	struct rb_root swap_extent_root;/* root of the swap extent rbtree */
+	struct bdev_handle *bdev_handle;/* open handle of the bdev */
 	struct block_device *bdev;	/* swap device or bdev of swap file */
 	struct file *swap_file;		/* seldom referenced */
 	unsigned int old_block_size;	/* seldom referenced */
diff --git a/mm/swapfile.c b/mm/swapfile.c
index e52f486834eb..4bc70f459164 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -2530,11 +2530,10 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
 	exit_swap_address_space(p->type);
 
 	inode = mapping->host;
-	if (S_ISBLK(inode->i_mode)) {
-		struct block_device *bdev = I_BDEV(inode);
-
-		set_blocksize(bdev, old_block_size);
-		blkdev_put(bdev, p);
+	if (p->bdev_handle) {
+		set_blocksize(p->bdev, old_block_size);
+		bdev_release(p->bdev_handle);
+		p->bdev_handle = NULL;
 	}
 
 	inode_lock(inode);
@@ -2764,13 +2763,14 @@ static int claim_swapfile(struct swap_info_struct *p, struct inode *inode)
 	int error;
 
 	if (S_ISBLK(inode->i_mode)) {
-		p->bdev = blkdev_get_by_dev(inode->i_rdev,
+		p->bdev_handle = bdev_open_by_dev(inode->i_rdev,
 				BLK_OPEN_READ | BLK_OPEN_WRITE, p, NULL);
-		if (IS_ERR(p->bdev)) {
-			error = PTR_ERR(p->bdev);
-			p->bdev = NULL;
+		if (IS_ERR(p->bdev_handle)) {
+			error = PTR_ERR(p->bdev_handle);
+			p->bdev_handle = NULL;
 			return error;
 		}
+		p->bdev = p->bdev_handle->bdev;
 		p->old_block_size = block_size(p->bdev);
 		error = set_blocksize(p->bdev, PAGE_SIZE);
 		if (error < 0)
@@ -3206,9 +3206,10 @@ bad_swap:
 	p->percpu_cluster = NULL;
 	free_percpu(p->cluster_next_cpu);
 	p->cluster_next_cpu = NULL;
-	if (inode && S_ISBLK(inode->i_mode) && p->bdev) {
+	if (p->bdev_handle) {
 		set_blocksize(p->bdev, p->old_block_size);
-		blkdev_put(p->bdev, p);
+		bdev_release(p->bdev_handle);
+		p->bdev_handle = NULL;
 	}
 	inode = NULL;
 	destroy_swap_extents(p);
-- 
cgit v1.2.3


From f4a48bc36cdfae7c603e8e3f2a51e2a283f3f365 Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Wed, 27 Sep 2023 11:34:25 +0200
Subject: fs: Convert to bdev_open_by_dev()

Convert mount code to use bdev_open_by_dev() and propagate the handle
around to bdev_release().

Acked-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Christian Brauner <brauner@kernel.org>
Signed-off-by: Jan Kara <jack@suse.cz>
Link: https://lore.kernel.org/r/20230927093442.25915-19-jack@suse.cz
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 fs/cramfs/inode.c  |  2 +-
 fs/romfs/super.c   |  2 +-
 fs/super.c         | 15 +++++++++------
 include/linux/fs.h |  1 +
 4 files changed, 12 insertions(+), 8 deletions(-)

(limited to 'include/linux')

diff --git a/fs/cramfs/inode.c b/fs/cramfs/inode.c
index 5ee7d7bbb361..2fbf97077ce9 100644
--- a/fs/cramfs/inode.c
+++ b/fs/cramfs/inode.c
@@ -495,7 +495,7 @@ static void cramfs_kill_sb(struct super_block *sb)
 		sb->s_mtd = NULL;
 	} else if (IS_ENABLED(CONFIG_CRAMFS_BLOCKDEV) && sb->s_bdev) {
 		sync_blockdev(sb->s_bdev);
-		blkdev_put(sb->s_bdev, sb);
+		bdev_release(sb->s_bdev_handle);
 	}
 	kfree(sbi);
 }
diff --git a/fs/romfs/super.c b/fs/romfs/super.c
index 5c35f6c76037..b1bdfbc211c3 100644
--- a/fs/romfs/super.c
+++ b/fs/romfs/super.c
@@ -593,7 +593,7 @@ static void romfs_kill_sb(struct super_block *sb)
 #ifdef CONFIG_ROMFS_ON_BLOCK
 	if (sb->s_bdev) {
 		sync_blockdev(sb->s_bdev);
-		blkdev_put(sb->s_bdev, sb);
+		bdev_release(sb->s_bdev_handle);
 	}
 #endif
 }
diff --git a/fs/super.c b/fs/super.c
index 2d762ce67f6e..26b96191e9b3 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -1479,14 +1479,16 @@ int setup_bdev_super(struct super_block *sb, int sb_flags,
 		struct fs_context *fc)
 {
 	blk_mode_t mode = sb_open_mode(sb_flags);
+	struct bdev_handle *bdev_handle;
 	struct block_device *bdev;
 
-	bdev = blkdev_get_by_dev(sb->s_dev, mode, sb, &fs_holder_ops);
-	if (IS_ERR(bdev)) {
+	bdev_handle = bdev_open_by_dev(sb->s_dev, mode, sb, &fs_holder_ops);
+	if (IS_ERR(bdev_handle)) {
 		if (fc)
 			errorf(fc, "%s: Can't open blockdev", fc->source);
-		return PTR_ERR(bdev);
+		return PTR_ERR(bdev_handle);
 	}
+	bdev = bdev_handle->bdev;
 
 	/*
 	 * This really should be in blkdev_get_by_dev, but right now can't due
@@ -1494,7 +1496,7 @@ int setup_bdev_super(struct super_block *sb, int sb_flags,
 	 * writable from userspace even for a read-only block device.
 	 */
 	if ((mode & BLK_OPEN_WRITE) && bdev_read_only(bdev)) {
-		blkdev_put(bdev, sb);
+		bdev_release(bdev_handle);
 		return -EACCES;
 	}
 
@@ -1510,10 +1512,11 @@ int setup_bdev_super(struct super_block *sb, int sb_flags,
 		mutex_unlock(&bdev->bd_fsfreeze_mutex);
 		if (fc)
 			warnf(fc, "%pg: Can't mount, blockdev is frozen", bdev);
-		blkdev_put(bdev, sb);
+		bdev_release(bdev_handle);
 		return -EBUSY;
 	}
 	spin_lock(&sb_lock);
+	sb->s_bdev_handle = bdev_handle;
 	sb->s_bdev = bdev;
 	sb->s_bdi = bdi_get(bdev->bd_disk->bdi);
 	if (bdev_stable_writes(bdev))
@@ -1646,7 +1649,7 @@ void kill_block_super(struct super_block *sb)
 	generic_shutdown_super(sb);
 	if (bdev) {
 		sync_blockdev(bdev);
-		blkdev_put(bdev, sb);
+		bdev_release(sb->s_bdev_handle);
 	}
 }
 
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 4a40823c3c67..fb5accebdcdf 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1221,6 +1221,7 @@ struct super_block {
 	struct hlist_bl_head	s_roots;	/* alternate root dentries for NFS */
 	struct list_head	s_mounts;	/* list of mounts; _not_ for fs use */
 	struct block_device	*s_bdev;
+	struct bdev_handle	*s_bdev_handle;
 	struct backing_dev_info *s_bdi;
 	struct mtd_info		*s_mtd;
 	struct hlist_node	s_instances;
-- 
cgit v1.2.3


From 6a4e59eeedc3018cb57722eecfcbb49431aeb05f Mon Sep 17 00:00:00 2001
From: Masahiro Yamada <masahiroy@kernel.org>
Date: Mon, 23 Oct 2023 02:06:05 +0900
Subject: linux/init: remove __memexit* annotations

We have never used __memexit, __memexitdata, or __memexitconst.

These were unneeded.

Signed-off-by: Masahiro Yamada <masahiroy@kernel.org>
Acked-by: Arnd Bergmann <arnd@arndb.de>
---
 include/asm-generic/vmlinux.lds.h |  6 ------
 include/linux/init.h              |  3 ---
 scripts/mod/modpost.c             | 15 +++------------
 3 files changed, 3 insertions(+), 21 deletions(-)

(limited to 'include/linux')

diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h
index 67d8dd2f1bde..bae0fe4d499b 100644
--- a/include/asm-generic/vmlinux.lds.h
+++ b/include/asm-generic/vmlinux.lds.h
@@ -356,7 +356,6 @@
 	*(.ref.data)							\
 	*(.data..shared_aligned) /* percpu related */			\
 	MEM_KEEP(init.data*)						\
-	MEM_KEEP(exit.data*)						\
 	*(.data.unlikely)						\
 	__start_once = .;						\
 	*(.data.once)							\
@@ -521,7 +520,6 @@
 	__init_rodata : AT(ADDR(__init_rodata) - LOAD_OFFSET) {		\
 		*(.ref.rodata)						\
 		MEM_KEEP(init.rodata)					\
-		MEM_KEEP(exit.rodata)					\
 	}								\
 									\
 	/* Built-in module parameters. */				\
@@ -574,7 +572,6 @@
 		*(.ref.text)						\
 		*(.text.asan.* .text.tsan.*)				\
 	MEM_KEEP(init.text*)						\
-	MEM_KEEP(exit.text*)						\
 
 
 /* sched.text is aling to function alignment to secure we have same
@@ -714,13 +711,10 @@
 	*(.exit.data .exit.data.*)					\
 	*(.fini_array .fini_array.*)					\
 	*(.dtors .dtors.*)						\
-	MEM_DISCARD(exit.data*)						\
-	MEM_DISCARD(exit.rodata*)
 
 #define EXIT_TEXT							\
 	*(.exit.text)							\
 	*(.text.exit)							\
-	MEM_DISCARD(exit.text)
 
 #define EXIT_CALL							\
 	*(.exitcall.exit)
diff --git a/include/linux/init.h b/include/linux/init.h
index 266c3e1640d4..01b52c9c7526 100644
--- a/include/linux/init.h
+++ b/include/linux/init.h
@@ -89,9 +89,6 @@
 						  __latent_entropy
 #define __meminitdata    __section(".meminit.data")
 #define __meminitconst   __section(".meminit.rodata")
-#define __memexit        __section(".memexit.text") __exitused __cold notrace
-#define __memexitdata    __section(".memexit.data")
-#define __memexitconst   __section(".memexit.rodata")
 
 /* For assembly routines */
 #define __HEAD		.section	".head.text","ax"
diff --git a/scripts/mod/modpost.c b/scripts/mod/modpost.c
index d936fa5fbbb1..bcc334b28a2c 100644
--- a/scripts/mod/modpost.c
+++ b/scripts/mod/modpost.c
@@ -798,7 +798,7 @@ static void check_section(const char *modname, struct elf_info *elf,
 #define ALL_INIT_TEXT_SECTIONS \
 	".init.text", ".meminit.text"
 #define ALL_EXIT_TEXT_SECTIONS \
-	".exit.text", ".memexit.text"
+	".exit.text"
 
 #define ALL_PCI_INIT_SECTIONS	\
 	".pci_fixup_early", ".pci_fixup_header", ".pci_fixup_final", \
@@ -806,10 +806,9 @@ static void check_section(const char *modname, struct elf_info *elf,
 	".pci_fixup_resume_early", ".pci_fixup_suspend"
 
 #define ALL_XXXINIT_SECTIONS MEM_INIT_SECTIONS
-#define ALL_XXXEXIT_SECTIONS MEM_EXIT_SECTIONS
 
 #define ALL_INIT_SECTIONS INIT_SECTIONS, ALL_XXXINIT_SECTIONS
-#define ALL_EXIT_SECTIONS EXIT_SECTIONS, ALL_XXXEXIT_SECTIONS
+#define ALL_EXIT_SECTIONS EXIT_SECTIONS
 
 #define DATA_SECTIONS ".data", ".data.rel"
 #define TEXT_SECTIONS ".text", ".text.*", ".sched.text", \
@@ -822,7 +821,6 @@ static void check_section(const char *modname, struct elf_info *elf,
 #define MEM_INIT_SECTIONS  ".meminit.*"
 
 #define EXIT_SECTIONS      ".exit.*"
-#define MEM_EXIT_SECTIONS  ".memexit.*"
 
 #define ALL_TEXT_SECTIONS  ALL_INIT_TEXT_SECTIONS, ALL_EXIT_TEXT_SECTIONS, \
 		TEXT_SECTIONS, OTHER_TEXT_SECTIONS
@@ -832,7 +830,6 @@ enum mismatch {
 	DATA_TO_ANY_INIT,
 	TEXTDATA_TO_ANY_EXIT,
 	XXXINIT_TO_SOME_INIT,
-	XXXEXIT_TO_SOME_EXIT,
 	ANY_INIT_TO_ANY_EXIT,
 	ANY_EXIT_TO_ANY_INIT,
 	EXTABLE_TO_NON_TEXT,
@@ -883,12 +880,6 @@ static const struct sectioncheck sectioncheck[] = {
 	.bad_tosec = { INIT_SECTIONS, NULL },
 	.mismatch = XXXINIT_TO_SOME_INIT,
 },
-/* Do not reference exit code/data from memexit code/data */
-{
-	.fromsec = { ALL_XXXEXIT_SECTIONS, NULL },
-	.bad_tosec = { EXIT_SECTIONS, NULL },
-	.mismatch = XXXEXIT_TO_SOME_EXIT,
-},
 /* Do not use exit code/data from init code */
 {
 	.fromsec = { ALL_INIT_SECTIONS, NULL },
@@ -1017,7 +1008,7 @@ static int secref_whitelist(const char *fromsec, const char *fromsym,
 
 	/* symbols in data sections that may refer to meminit sections */
 	if (match(fromsec, PATTERNS(DATA_SECTIONS)) &&
-	    match(tosec, PATTERNS(ALL_XXXINIT_SECTIONS, ALL_XXXEXIT_SECTIONS)) &&
+	    match(tosec, PATTERNS(ALL_XXXINIT_SECTIONS)) &&
 	    match(fromsym, PATTERNS("*driver")))
 		return 0;
 
-- 
cgit v1.2.3


From 29166faac5486cbc34228431cd049e1c1be451ac Mon Sep 17 00:00:00 2001
From: Heiner Kallweit <hkallweit1@gmail.com>
Date: Sun, 15 Oct 2023 23:34:25 +0200
Subject: ACPI: Add helper acpi_use_parent_companion

In several drivers devices use the ACPI companion of the parent.
Add a helper for this use case to avoid code duplication.

Signed-off-by: Heiner Kallweit <hkallweit1@gmail.com>
Acked-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Reviewed-by: Andi Shyti <andi.shyti@kernel.org>
Signed-off-by: Wolfram Sang <wsa@kernel.org>
---
 include/linux/acpi.h | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/acpi.h b/include/linux/acpi.h
index a73246c3c35e..d17a280ce7c7 100644
--- a/include/linux/acpi.h
+++ b/include/linux/acpi.h
@@ -1539,4 +1539,9 @@ static inline void acpi_device_notify(struct device *dev) { }
 static inline void acpi_device_notify_remove(struct device *dev) { }
 #endif
 
+static inline void acpi_use_parent_companion(struct device *dev)
+{
+	ACPI_COMPANION_SET(dev, ACPI_COMPANION(dev->parent));
+}
+
 #endif	/*_LINUX_ACPI_H*/
-- 
cgit v1.2.3


From e21fc2038c1b978b89bbc3d45a4c5c6ef598e178 Mon Sep 17 00:00:00 2001
From: Amir Goldstein <amir73il@gmail.com>
Date: Mon, 23 Oct 2023 21:07:59 +0300
Subject: exportfs: make ->encode_fh() a mandatory method for NFS export

Rename the default helper for encoding FILEID_INO32_GEN* file handles to
generic_encode_ino32_fh() and convert the filesystems that used the
default implementation to use the generic helper explicitly.

After this change, exportfs_encode_inode_fh() no longer has a default
implementation to encode FILEID_INO32_GEN* file handles.

This is a step towards allowing filesystems to encode non-decodeable
file handles for fanotify without having to implement any
export_operations.

Reviewed-by: Jan Kara <jack@suse.cz>
Reviewed-by: Jeff Layton <jlayton@kernel.org>
Acked-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: Amir Goldstein <amir73il@gmail.com>
Link: https://lore.kernel.org/r/20231023180801.2953446-3-amir73il@gmail.com
Acked-by: Dave Kleikamp <dave.kleikamp@oracle.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 Documentation/filesystems/nfs/exporting.rst |  7 ++-----
 Documentation/filesystems/porting.rst       |  9 +++++++++
 fs/affs/namei.c                             |  1 +
 fs/befs/linuxvfs.c                          |  1 +
 fs/efs/super.c                              |  1 +
 fs/erofs/super.c                            |  1 +
 fs/exportfs/expfs.c                         | 16 +++++++++-------
 fs/ext2/super.c                             |  1 +
 fs/ext4/super.c                             |  1 +
 fs/f2fs/super.c                             |  1 +
 fs/fat/nfs.c                                |  1 +
 fs/jffs2/super.c                            |  1 +
 fs/jfs/super.c                              |  1 +
 fs/ntfs/namei.c                             |  1 +
 fs/ntfs3/super.c                            |  1 +
 fs/smb/client/export.c                      | 11 +++++------
 fs/squashfs/export.c                        |  1 +
 fs/ufs/super.c                              |  1 +
 include/linux/exportfs.h                    |  9 ++++++++-
 19 files changed, 47 insertions(+), 19 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/filesystems/nfs/exporting.rst b/Documentation/filesystems/nfs/exporting.rst
index 4b30daee399a..de64d2d002a2 100644
--- a/Documentation/filesystems/nfs/exporting.rst
+++ b/Documentation/filesystems/nfs/exporting.rst
@@ -122,12 +122,9 @@ are exportable by setting the s_export_op field in the struct
 super_block.  This field must point to a "struct export_operations"
 struct which has the following members:
 
-  encode_fh (optional)
+  encode_fh (mandatory)
     Takes a dentry and creates a filehandle fragment which may later be used
-    to find or create a dentry for the same object.  The default
-    implementation creates a filehandle fragment that encodes a 32bit inode
-    and generation number for the inode encoded, and if necessary the
-    same information for the parent.
+    to find or create a dentry for the same object.
 
   fh_to_dentry (mandatory)
     Given a filehandle fragment, this should find the implied object and
diff --git a/Documentation/filesystems/porting.rst b/Documentation/filesystems/porting.rst
index 4d05b9862451..9cc6cb27c4d5 100644
--- a/Documentation/filesystems/porting.rst
+++ b/Documentation/filesystems/porting.rst
@@ -1045,3 +1045,12 @@ filesystem type is now moved to a later point when the devices are closed:
 As this is a VFS level change it has no practical consequences for filesystems
 other than that all of them must use one of the provided kill_litter_super(),
 kill_anon_super(), or kill_block_super() helpers.
+
+---
+
+**mandatory**
+
+export_operations ->encode_fh() no longer has a default implementation to
+encode FILEID_INO32_GEN* file handles.
+Filesystems that used the default implementation may use the generic helper
+generic_encode_ino32_fh() explicitly.
diff --git a/fs/affs/namei.c b/fs/affs/namei.c
index 2fe4a5832fcf..d6b9758ee23d 100644
--- a/fs/affs/namei.c
+++ b/fs/affs/namei.c
@@ -568,6 +568,7 @@ static struct dentry *affs_fh_to_parent(struct super_block *sb, struct fid *fid,
 }
 
 const struct export_operations affs_export_ops = {
+	.encode_fh = generic_encode_ino32_fh,
 	.fh_to_dentry = affs_fh_to_dentry,
 	.fh_to_parent = affs_fh_to_parent,
 	.get_parent = affs_get_parent,
diff --git a/fs/befs/linuxvfs.c b/fs/befs/linuxvfs.c
index 9a16a51fbb88..410dcaffd5ab 100644
--- a/fs/befs/linuxvfs.c
+++ b/fs/befs/linuxvfs.c
@@ -96,6 +96,7 @@ static const struct address_space_operations befs_symlink_aops = {
 };
 
 static const struct export_operations befs_export_operations = {
+	.encode_fh	= generic_encode_ino32_fh,
 	.fh_to_dentry	= befs_fh_to_dentry,
 	.fh_to_parent	= befs_fh_to_parent,
 	.get_parent	= befs_get_parent,
diff --git a/fs/efs/super.c b/fs/efs/super.c
index b287f47c165b..f17fdac76b2e 100644
--- a/fs/efs/super.c
+++ b/fs/efs/super.c
@@ -123,6 +123,7 @@ static const struct super_operations efs_superblock_operations = {
 };
 
 static const struct export_operations efs_export_ops = {
+	.encode_fh	= generic_encode_ino32_fh,
 	.fh_to_dentry	= efs_fh_to_dentry,
 	.fh_to_parent	= efs_fh_to_parent,
 	.get_parent	= efs_get_parent,
diff --git a/fs/erofs/super.c b/fs/erofs/super.c
index 3700af9ee173..edbe07a24156 100644
--- a/fs/erofs/super.c
+++ b/fs/erofs/super.c
@@ -626,6 +626,7 @@ static struct dentry *erofs_get_parent(struct dentry *child)
 }
 
 static const struct export_operations erofs_export_ops = {
+	.encode_fh = generic_encode_ino32_fh,
 	.fh_to_dentry = erofs_fh_to_dentry,
 	.fh_to_parent = erofs_fh_to_parent,
 	.get_parent = erofs_get_parent,
diff --git a/fs/exportfs/expfs.c b/fs/exportfs/expfs.c
index 9ee205df8fa7..8f883c4758f5 100644
--- a/fs/exportfs/expfs.c
+++ b/fs/exportfs/expfs.c
@@ -343,20 +343,21 @@ out:
 }
 
 /**
- * export_encode_fh - default export_operations->encode_fh function
+ * generic_encode_ino32_fh - generic export_operations->encode_fh function
  * @inode:   the object to encode
- * @fid:     where to store the file handle fragment
- * @max_len: maximum length to store there
+ * @fh:      where to store the file handle fragment
+ * @max_len: maximum length to store there (in 4 byte units)
  * @parent:  parent directory inode, if wanted
  *
- * This default encode_fh function assumes that the 32 inode number
+ * This generic encode_fh function assumes that the 32 inode number
  * is suitable for locating an inode, and that the generation number
  * can be used to check that it is still valid.  It places them in the
  * filehandle fragment where export_decode_fh expects to find them.
  */
-static int export_encode_fh(struct inode *inode, struct fid *fid,
-		int *max_len, struct inode *parent)
+int generic_encode_ino32_fh(struct inode *inode, __u32 *fh, int *max_len,
+			    struct inode *parent)
 {
+	struct fid *fid = (void *)fh;
 	int len = *max_len;
 	int type = FILEID_INO32_GEN;
 
@@ -380,6 +381,7 @@ static int export_encode_fh(struct inode *inode, struct fid *fid,
 	*max_len = len;
 	return type;
 }
+EXPORT_SYMBOL_GPL(generic_encode_ino32_fh);
 
 /**
  * exportfs_encode_inode_fh - encode a file handle from inode
@@ -402,7 +404,7 @@ int exportfs_encode_inode_fh(struct inode *inode, struct fid *fid,
 	if (nop && nop->encode_fh)
 		return nop->encode_fh(inode, fid->raw, max_len, parent);
 
-	return export_encode_fh(inode, fid, max_len, parent);
+	return -EOPNOTSUPP;
 }
 EXPORT_SYMBOL_GPL(exportfs_encode_inode_fh);
 
diff --git a/fs/ext2/super.c b/fs/ext2/super.c
index aaf3e3e88cb2..b9f158a34997 100644
--- a/fs/ext2/super.c
+++ b/fs/ext2/super.c
@@ -397,6 +397,7 @@ static struct dentry *ext2_fh_to_parent(struct super_block *sb, struct fid *fid,
 }
 
 static const struct export_operations ext2_export_ops = {
+	.encode_fh = generic_encode_ino32_fh,
 	.fh_to_dentry = ext2_fh_to_dentry,
 	.fh_to_parent = ext2_fh_to_parent,
 	.get_parent = ext2_get_parent,
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index dbebd8b3127e..c44db1915437 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -1646,6 +1646,7 @@ static const struct super_operations ext4_sops = {
 };
 
 static const struct export_operations ext4_export_ops = {
+	.encode_fh = generic_encode_ino32_fh,
 	.fh_to_dentry = ext4_fh_to_dentry,
 	.fh_to_parent = ext4_fh_to_parent,
 	.get_parent = ext4_get_parent,
diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c
index a8c8232852bb..60cfa11f65bf 100644
--- a/fs/f2fs/super.c
+++ b/fs/f2fs/super.c
@@ -3282,6 +3282,7 @@ static struct dentry *f2fs_fh_to_parent(struct super_block *sb, struct fid *fid,
 }
 
 static const struct export_operations f2fs_export_ops = {
+	.encode_fh = generic_encode_ino32_fh,
 	.fh_to_dentry = f2fs_fh_to_dentry,
 	.fh_to_parent = f2fs_fh_to_parent,
 	.get_parent = f2fs_get_parent,
diff --git a/fs/fat/nfs.c b/fs/fat/nfs.c
index 3626eb585a98..c52e63e10d35 100644
--- a/fs/fat/nfs.c
+++ b/fs/fat/nfs.c
@@ -279,6 +279,7 @@ static struct dentry *fat_get_parent(struct dentry *child_dir)
 }
 
 const struct export_operations fat_export_ops = {
+	.encode_fh	= generic_encode_ino32_fh,
 	.fh_to_dentry   = fat_fh_to_dentry,
 	.fh_to_parent   = fat_fh_to_parent,
 	.get_parent     = fat_get_parent,
diff --git a/fs/jffs2/super.c b/fs/jffs2/super.c
index 7ea37f49f1e1..f99591a634b4 100644
--- a/fs/jffs2/super.c
+++ b/fs/jffs2/super.c
@@ -150,6 +150,7 @@ static struct dentry *jffs2_get_parent(struct dentry *child)
 }
 
 static const struct export_operations jffs2_export_ops = {
+	.encode_fh = generic_encode_ino32_fh,
 	.get_parent = jffs2_get_parent,
 	.fh_to_dentry = jffs2_fh_to_dentry,
 	.fh_to_parent = jffs2_fh_to_parent,
diff --git a/fs/jfs/super.c b/fs/jfs/super.c
index 2e2f7f6d36a0..2cc2632f3c47 100644
--- a/fs/jfs/super.c
+++ b/fs/jfs/super.c
@@ -896,6 +896,7 @@ static const struct super_operations jfs_super_operations = {
 };
 
 static const struct export_operations jfs_export_operations = {
+	.encode_fh	= generic_encode_ino32_fh,
 	.fh_to_dentry	= jfs_fh_to_dentry,
 	.fh_to_parent	= jfs_fh_to_parent,
 	.get_parent	= jfs_get_parent,
diff --git a/fs/ntfs/namei.c b/fs/ntfs/namei.c
index ab44f2db533b..d7498ddc4a72 100644
--- a/fs/ntfs/namei.c
+++ b/fs/ntfs/namei.c
@@ -384,6 +384,7 @@ static struct dentry *ntfs_fh_to_parent(struct super_block *sb, struct fid *fid,
  * and due to using iget() whereas NTFS needs ntfs_iget().
  */
 const struct export_operations ntfs_export_ops = {
+	.encode_fh	= generic_encode_ino32_fh,
 	.get_parent	= ntfs_get_parent,	/* Find the parent of a given
 						   directory. */
 	.fh_to_dentry	= ntfs_fh_to_dentry,
diff --git a/fs/ntfs3/super.c b/fs/ntfs3/super.c
index f763e3256ccc..9153dffde950 100644
--- a/fs/ntfs3/super.c
+++ b/fs/ntfs3/super.c
@@ -811,6 +811,7 @@ static int ntfs_nfs_commit_metadata(struct inode *inode)
 }
 
 static const struct export_operations ntfs_export_ops = {
+	.encode_fh = generic_encode_ino32_fh,
 	.fh_to_dentry = ntfs_fh_to_dentry,
 	.fh_to_parent = ntfs_fh_to_parent,
 	.get_parent = ntfs3_get_parent,
diff --git a/fs/smb/client/export.c b/fs/smb/client/export.c
index 37c28415df1e..d606e8cbcb7d 100644
--- a/fs/smb/client/export.c
+++ b/fs/smb/client/export.c
@@ -41,13 +41,12 @@ static struct dentry *cifs_get_parent(struct dentry *dentry)
 }
 
 const struct export_operations cifs_export_ops = {
+	.encode_fh = generic_encode_ino32_fh,
 	.get_parent = cifs_get_parent,
-/*	Following five export operations are unneeded so far and can default:
-	.get_dentry =
-	.get_name =
-	.find_exported_dentry =
-	.decode_fh =
-	.encode_fs =  */
+/*
+ * Following export operations are mandatory for NFS export support:
+ *	.fh_to_dentry =
+ */
 };
 
 #endif /* CONFIG_CIFS_NFSD_EXPORT */
diff --git a/fs/squashfs/export.c b/fs/squashfs/export.c
index 723763746238..62972f0ff868 100644
--- a/fs/squashfs/export.c
+++ b/fs/squashfs/export.c
@@ -173,6 +173,7 @@ __le64 *squashfs_read_inode_lookup_table(struct super_block *sb,
 
 
 const struct export_operations squashfs_export_ops = {
+	.encode_fh = generic_encode_ino32_fh,
 	.fh_to_dentry = squashfs_fh_to_dentry,
 	.fh_to_parent = squashfs_fh_to_parent,
 	.get_parent = squashfs_get_parent
diff --git a/fs/ufs/super.c b/fs/ufs/super.c
index 23377c1baed9..a480810cd4e3 100644
--- a/fs/ufs/super.c
+++ b/fs/ufs/super.c
@@ -137,6 +137,7 @@ static struct dentry *ufs_get_parent(struct dentry *child)
 }
 
 static const struct export_operations ufs_export_ops = {
+	.encode_fh = generic_encode_ino32_fh,
 	.fh_to_dentry	= ufs_fh_to_dentry,
 	.fh_to_parent	= ufs_fh_to_parent,
 	.get_parent	= ufs_get_parent,
diff --git a/include/linux/exportfs.h b/include/linux/exportfs.h
index 5b3c9f30b422..85bd027494e5 100644
--- a/include/linux/exportfs.h
+++ b/include/linux/exportfs.h
@@ -235,7 +235,7 @@ extern int exportfs_encode_fh(struct dentry *dentry, struct fid *fid,
 
 static inline bool exportfs_can_encode_fid(const struct export_operations *nop)
 {
-	return nop;
+	return nop && nop->encode_fh;
 }
 
 static inline bool exportfs_can_decode_fh(const struct export_operations *nop)
@@ -279,6 +279,13 @@ extern struct dentry *exportfs_decode_fh(struct vfsmount *mnt, struct fid *fid,
 /*
  * Generic helpers for filesystems.
  */
+#ifdef CONFIG_EXPORTFS
+int generic_encode_ino32_fh(struct inode *inode, __u32 *fh, int *max_len,
+			    struct inode *parent);
+#else
+#define generic_encode_ino32_fh NULL
+#endif
+
 extern struct dentry *generic_fh_to_dentry(struct super_block *sb,
 	struct fid *fid, int fh_len, int fh_type,
 	struct inode *(*get_inode) (struct super_block *sb, u64 ino, u32 gen));
-- 
cgit v1.2.3


From 41d1ddd2717c758b8606a66d57d2cc63b41373c0 Mon Sep 17 00:00:00 2001
From: Amir Goldstein <amir73il@gmail.com>
Date: Mon, 23 Oct 2023 21:08:00 +0300
Subject: exportfs: define FILEID_INO64_GEN* file handle types

Similar to the common FILEID_INO32* file handle types, define common
FILEID_INO64* file handle types.

The type values of FILEID_INO64_GEN and FILEID_INO64_GEN_PARENT are the
values returned by fuse and xfs for 64bit ino encoded file handle types.

Note that these type value are filesystem specific and they do not define
a universal file handle format, for example:
fuse encodes FILEID_INO64_GEN as [ino-hi32,ino-lo32,gen] and xfs encodes
FILEID_INO64_GEN as [hostr-order-ino64,gen] (a.k.a xfs_fid64).

The FILEID_INO64_GEN fhandle type is going to be used for file ids for
fanotify from filesystems that do not support NFS export.

Reviewed-by: Jan Kara <jack@suse.cz>
Reviewed-by: Jeff Layton <jlayton@kernel.org>
Signed-off-by: Amir Goldstein <amir73il@gmail.com>
Link: https://lore.kernel.org/r/20231023180801.2953446-4-amir73il@gmail.com
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 fs/fuse/inode.c          |  7 ++++---
 include/linux/exportfs.h | 11 +++++++++++
 2 files changed, 15 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index 2e4eb7cf26fb..e63f966698a5 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -1002,7 +1002,7 @@ static int fuse_encode_fh(struct inode *inode, u32 *fh, int *max_len,
 	}
 
 	*max_len = len;
-	return parent ? 0x82 : 0x81;
+	return parent ? FILEID_INO64_GEN_PARENT : FILEID_INO64_GEN;
 }
 
 static struct dentry *fuse_fh_to_dentry(struct super_block *sb,
@@ -1010,7 +1010,8 @@ static struct dentry *fuse_fh_to_dentry(struct super_block *sb,
 {
 	struct fuse_inode_handle handle;
 
-	if ((fh_type != 0x81 && fh_type != 0x82) || fh_len < 3)
+	if ((fh_type != FILEID_INO64_GEN &&
+	     fh_type != FILEID_INO64_GEN_PARENT) || fh_len < 3)
 		return NULL;
 
 	handle.nodeid = (u64) fid->raw[0] << 32;
@@ -1024,7 +1025,7 @@ static struct dentry *fuse_fh_to_parent(struct super_block *sb,
 {
 	struct fuse_inode_handle parent;
 
-	if (fh_type != 0x82 || fh_len < 6)
+	if (fh_type != FILEID_INO64_GEN_PARENT || fh_len < 6)
 		return NULL;
 
 	parent.nodeid = (u64) fid->raw[3] << 32;
diff --git a/include/linux/exportfs.h b/include/linux/exportfs.h
index 85bd027494e5..4119d3ee72eb 100644
--- a/include/linux/exportfs.h
+++ b/include/linux/exportfs.h
@@ -98,6 +98,17 @@ enum fid_type {
 	 */
 	FILEID_FAT_WITH_PARENT = 0x72,
 
+	/*
+	 * 64 bit inode number, 32 bit generation number.
+	 */
+	FILEID_INO64_GEN = 0x81,
+
+	/*
+	 * 64 bit inode number, 32 bit generation number,
+	 * 64 bit parent inode number, 32 bit parent generation.
+	 */
+	FILEID_INO64_GEN_PARENT = 0x82,
+
 	/*
 	 * 128 bit child FID (struct lu_fid)
 	 * 128 bit parent FID (struct lu_fid)
-- 
cgit v1.2.3


From 64343119d7b80b4ee9ba7703390681608a17f2c5 Mon Sep 17 00:00:00 2001
From: Amir Goldstein <amir73il@gmail.com>
Date: Mon, 23 Oct 2023 21:08:01 +0300
Subject: exportfs: support encoding non-decodeable file handles by default

AT_HANDLE_FID was added as an API for name_to_handle_at() that request
the encoding of a file id, which is not intended to be decoded.

This file id is used by fanotify to describe objects in events.

So far, overlayfs is the only filesystem that supports encoding
non-decodeable file ids, by providing export_operations with an
->encode_fh() method and without a ->decode_fh() method.

Add support for encoding non-decodeable file ids to all the filesystems
that do not provide export_operations, by encoding a file id of type
FILEID_INO64_GEN from { i_ino, i_generation }.

A filesystem may that does not support NFS export, can opt-out of
encoding non-decodeable file ids for fanotify by defining an empty
export_operations struct (i.e. with a NULL ->encode_fh() method).

This allows the use of fanotify events with file ids on filesystems
like 9p which do not support NFS export to bring fanotify in feature
parity with inotify on those filesystems.

Note that fanotify also requires that the filesystems report a non-null
fsid.  Currently, many simple filesystems that have support for inotify
(e.g. debugfs, tracefs, sysfs) report a null fsid, so can still not be
used with fanotify in file id reporting mode.

Reviewed-by: Jan Kara <jack@suse.cz>
Reviewed-by: Jeff Layton <jlayton@kernel.org>
Signed-off-by: Amir Goldstein <amir73il@gmail.com>
Link: https://lore.kernel.org/r/20231023180801.2953446-5-amir73il@gmail.com
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 fs/exportfs/expfs.c      | 32 +++++++++++++++++++++++++++++---
 include/linux/exportfs.h | 10 +++++++---
 2 files changed, 36 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/fs/exportfs/expfs.c b/fs/exportfs/expfs.c
index 8f883c4758f5..7d9fdcc187b7 100644
--- a/fs/exportfs/expfs.c
+++ b/fs/exportfs/expfs.c
@@ -383,6 +383,32 @@ int generic_encode_ino32_fh(struct inode *inode, __u32 *fh, int *max_len,
 }
 EXPORT_SYMBOL_GPL(generic_encode_ino32_fh);
 
+#define FILEID_INO64_GEN_LEN 3
+
+/**
+ * exportfs_encode_ino64_fid - encode non-decodeable 64bit ino file id
+ * @inode:   the object to encode
+ * @fid:     where to store the file handle fragment
+ * @max_len: maximum length to store there (in 4 byte units)
+ *
+ * This generic function is used to encode a non-decodeable file id for
+ * fanotify for filesystems that do not support NFS export.
+ */
+static int exportfs_encode_ino64_fid(struct inode *inode, struct fid *fid,
+				     int *max_len)
+{
+	if (*max_len < FILEID_INO64_GEN_LEN) {
+		*max_len = FILEID_INO64_GEN_LEN;
+		return FILEID_INVALID;
+	}
+
+	fid->i64.ino = inode->i_ino;
+	fid->i64.gen = inode->i_generation;
+	*max_len = FILEID_INO64_GEN_LEN;
+
+	return FILEID_INO64_GEN;
+}
+
 /**
  * exportfs_encode_inode_fh - encode a file handle from inode
  * @inode:   the object to encode
@@ -401,10 +427,10 @@ int exportfs_encode_inode_fh(struct inode *inode, struct fid *fid,
 	if (!exportfs_can_encode_fh(nop, flags))
 		return -EOPNOTSUPP;
 
-	if (nop && nop->encode_fh)
-		return nop->encode_fh(inode, fid->raw, max_len, parent);
+	if (!nop && (flags & EXPORT_FH_FID))
+		return exportfs_encode_ino64_fid(inode, fid, max_len);
 
-	return -EOPNOTSUPP;
+	return nop->encode_fh(inode, fid->raw, max_len, parent);
 }
 EXPORT_SYMBOL_GPL(exportfs_encode_inode_fh);
 
diff --git a/include/linux/exportfs.h b/include/linux/exportfs.h
index 4119d3ee72eb..21bae8bfeef1 100644
--- a/include/linux/exportfs.h
+++ b/include/linux/exportfs.h
@@ -134,7 +134,11 @@ struct fid {
 			u32 parent_ino;
 			u32 parent_gen;
 		} i32;
- 		struct {
+		struct {
+			u64 ino;
+			u32 gen;
+		} __packed i64;
+		struct {
  			u32 block;
  			u16 partref;
  			u16 parent_partref;
@@ -246,7 +250,7 @@ extern int exportfs_encode_fh(struct dentry *dentry, struct fid *fid,
 
 static inline bool exportfs_can_encode_fid(const struct export_operations *nop)
 {
-	return nop && nop->encode_fh;
+	return !nop || nop->encode_fh;
 }
 
 static inline bool exportfs_can_decode_fh(const struct export_operations *nop)
@@ -259,7 +263,7 @@ static inline bool exportfs_can_encode_fh(const struct export_operations *nop,
 {
 	/*
 	 * If a non-decodeable file handle was requested, we only need to make
-	 * sure that filesystem can encode file handles.
+	 * sure that filesystem did not opt-out of encoding fid.
 	 */
 	if (fh_flags & EXPORT_FH_FID)
 		return exportfs_can_encode_fid(nop);
-- 
cgit v1.2.3


From d9e5d9221d7f82a2736f091bbc5b54ab8d6ef701 Mon Sep 17 00:00:00 2001
From: Amir Goldstein <amir73il@gmail.com>
Date: Thu, 26 Oct 2023 23:45:40 +0300
Subject: fs: fix build error with CONFIG_EXPORTFS=m or not defined

Many of the filesystems that call the generic exportfs helpers do not
select the EXPORTFS config.

Move generic_encode_ino32_fh() to libfs.c, same as generic_fh_to_*()
to avoid having to fix all those config dependencies.

Reported-by: kernel test robot <lkp@intel.com>
Closes: https://lore.kernel.org/oe-kbuild-all/202310262151.renqMvme-lkp@intel.com/
Fixes: dfaf653dc415 ("exportfs: make ->encode_fh() a mandatory method for NFS export")
Suggested-by: Arnd Bergmann <arnd@arndb.de>
Signed-off-by: Amir Goldstein <amir73il@gmail.com>
Link: https://lore.kernel.org/r/20231026204540.143217-1-amir73il@gmail.com
Tested-by: Arnd Bergmann <arnd@arndb.de>
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 fs/exportfs/expfs.c      | 41 -----------------------------------------
 fs/libfs.c               | 41 +++++++++++++++++++++++++++++++++++++++++
 include/linux/exportfs.h |  9 ++-------
 3 files changed, 43 insertions(+), 48 deletions(-)

(limited to 'include/linux')

diff --git a/fs/exportfs/expfs.c b/fs/exportfs/expfs.c
index 7d9fdcc187b7..3ae0154c5680 100644
--- a/fs/exportfs/expfs.c
+++ b/fs/exportfs/expfs.c
@@ -342,47 +342,6 @@ out:
 	return error;
 }
 
-/**
- * generic_encode_ino32_fh - generic export_operations->encode_fh function
- * @inode:   the object to encode
- * @fh:      where to store the file handle fragment
- * @max_len: maximum length to store there (in 4 byte units)
- * @parent:  parent directory inode, if wanted
- *
- * This generic encode_fh function assumes that the 32 inode number
- * is suitable for locating an inode, and that the generation number
- * can be used to check that it is still valid.  It places them in the
- * filehandle fragment where export_decode_fh expects to find them.
- */
-int generic_encode_ino32_fh(struct inode *inode, __u32 *fh, int *max_len,
-			    struct inode *parent)
-{
-	struct fid *fid = (void *)fh;
-	int len = *max_len;
-	int type = FILEID_INO32_GEN;
-
-	if (parent && (len < 4)) {
-		*max_len = 4;
-		return FILEID_INVALID;
-	} else if (len < 2) {
-		*max_len = 2;
-		return FILEID_INVALID;
-	}
-
-	len = 2;
-	fid->i32.ino = inode->i_ino;
-	fid->i32.gen = inode->i_generation;
-	if (parent) {
-		fid->i32.parent_ino = parent->i_ino;
-		fid->i32.parent_gen = parent->i_generation;
-		len = 4;
-		type = FILEID_INO32_GEN_PARENT;
-	}
-	*max_len = len;
-	return type;
-}
-EXPORT_SYMBOL_GPL(generic_encode_ino32_fh);
-
 #define FILEID_INO64_GEN_LEN 3
 
 /**
diff --git a/fs/libfs.c b/fs/libfs.c
index 8117b24b929d..38950cce135b 100644
--- a/fs/libfs.c
+++ b/fs/libfs.c
@@ -1310,6 +1310,47 @@ ssize_t simple_attr_write_signed(struct file *file, const char __user *buf,
 }
 EXPORT_SYMBOL_GPL(simple_attr_write_signed);
 
+/**
+ * generic_encode_ino32_fh - generic export_operations->encode_fh function
+ * @inode:   the object to encode
+ * @fh:      where to store the file handle fragment
+ * @max_len: maximum length to store there (in 4 byte units)
+ * @parent:  parent directory inode, if wanted
+ *
+ * This generic encode_fh function assumes that the 32 inode number
+ * is suitable for locating an inode, and that the generation number
+ * can be used to check that it is still valid.  It places them in the
+ * filehandle fragment where export_decode_fh expects to find them.
+ */
+int generic_encode_ino32_fh(struct inode *inode, __u32 *fh, int *max_len,
+			    struct inode *parent)
+{
+	struct fid *fid = (void *)fh;
+	int len = *max_len;
+	int type = FILEID_INO32_GEN;
+
+	if (parent && (len < 4)) {
+		*max_len = 4;
+		return FILEID_INVALID;
+	} else if (len < 2) {
+		*max_len = 2;
+		return FILEID_INVALID;
+	}
+
+	len = 2;
+	fid->i32.ino = inode->i_ino;
+	fid->i32.gen = inode->i_generation;
+	if (parent) {
+		fid->i32.parent_ino = parent->i_ino;
+		fid->i32.parent_gen = parent->i_generation;
+		len = 4;
+		type = FILEID_INO32_GEN_PARENT;
+	}
+	*max_len = len;
+	return type;
+}
+EXPORT_SYMBOL_GPL(generic_encode_ino32_fh);
+
 /**
  * generic_fh_to_dentry - generic helper for the fh_to_dentry export operation
  * @sb:		filesystem to do the file handle conversion on
diff --git a/include/linux/exportfs.h b/include/linux/exportfs.h
index 21bae8bfeef1..e0e69dafaa43 100644
--- a/include/linux/exportfs.h
+++ b/include/linux/exportfs.h
@@ -294,17 +294,12 @@ extern struct dentry *exportfs_decode_fh(struct vfsmount *mnt, struct fid *fid,
 /*
  * Generic helpers for filesystems.
  */
-#ifdef CONFIG_EXPORTFS
 int generic_encode_ino32_fh(struct inode *inode, __u32 *fh, int *max_len,
 			    struct inode *parent);
-#else
-#define generic_encode_ino32_fh NULL
-#endif
-
-extern struct dentry *generic_fh_to_dentry(struct super_block *sb,
+struct dentry *generic_fh_to_dentry(struct super_block *sb,
 	struct fid *fid, int fh_len, int fh_type,
 	struct inode *(*get_inode) (struct super_block *sb, u64 ino, u32 gen));
-extern struct dentry *generic_fh_to_parent(struct super_block *sb,
+struct dentry *generic_fh_to_parent(struct super_block *sb,
 	struct fid *fid, int fh_len, int fh_type,
 	struct inode *(*get_inode) (struct super_block *sb, u64 ino, u32 gen));
 
-- 
cgit v1.2.3


From dcc4e5728eeaeda84878ca0018758cff1abfca21 Mon Sep 17 00:00:00 2001
From: Kees Cook <keescook@chromium.org>
Date: Fri, 27 Oct 2023 08:56:38 -0700
Subject: seq_buf: Introduce DECLARE_SEQ_BUF and seq_buf_str()

Solve two ergonomic issues with struct seq_buf;

1) Too much boilerplate is required to initialize:

	struct seq_buf s;
	char buf[32];

	seq_buf_init(s, buf, sizeof(buf));

Instead, we can build this directly on the stack. Provide
DECLARE_SEQ_BUF() macro to do this:

	DECLARE_SEQ_BUF(s, 32);

2) %NUL termination is fragile and requires 2 steps to get a valid
   C String (and is a layering violation exposing the "internals" of
   seq_buf):

	seq_buf_terminate(s);
	do_something(s->buffer);

Instead, we can just return s->buffer directly after terminating it in
the refactored seq_buf_terminate(), now known as seq_buf_str():

	do_something(seq_buf_str(s));

Link: https://lore.kernel.org/linux-trace-kernel/20231027155634.make.260-kees@kernel.org
Link: https://lore.kernel.org/linux-trace-kernel/20231026194033.it.702-kees@kernel.org/

Cc: Yosry Ahmed <yosryahmed@google.com>
Cc: "Matthew Wilcox (Oracle)" <willy@infradead.org>
Cc: Christoph Hellwig <hch@lst.de>
Cc: Justin Stitt <justinstitt@google.com>
Cc: Kent Overstreet <kent.overstreet@linux.dev>
Cc: Petr Mladek <pmladek@suse.com>
Cc: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Cc: Rasmus Villemoes <linux@rasmusvillemoes.dk>
Cc: Sergey Senozhatsky <senozhatsky@chromium.org>
Cc: Masami Hiramatsu <mhiramat@kernel.org>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Yun Zhou <yun.zhou@windriver.com>
Cc: Jacob Keller <jacob.e.keller@intel.com>
Cc: Zhen Lei <thunder.leizhen@huawei.com>
Signed-off-by: Kees Cook <keescook@chromium.org>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
 include/linux/seq_buf.h | 21 +++++++++++++++++----
 kernel/trace/trace.c    | 11 +----------
 lib/seq_buf.c           |  4 +---
 3 files changed, 19 insertions(+), 17 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/seq_buf.h b/include/linux/seq_buf.h
index 8483e4b2d0d2..5fb1f12c33f9 100644
--- a/include/linux/seq_buf.h
+++ b/include/linux/seq_buf.h
@@ -21,9 +21,18 @@ struct seq_buf {
 	size_t			len;
 };
 
+#define DECLARE_SEQ_BUF(NAME, SIZE)			\
+	char __ ## NAME ## _buffer[SIZE] = "";		\
+	struct seq_buf NAME = {				\
+		.buffer = &__ ## NAME ## _buffer,	\
+		.size = SIZE,				\
+	}
+
 static inline void seq_buf_clear(struct seq_buf *s)
 {
 	s->len = 0;
+	if (s->size)
+		s->buffer[0] = '\0';
 }
 
 static inline void
@@ -69,8 +78,8 @@ static inline unsigned int seq_buf_used(struct seq_buf *s)
 }
 
 /**
- * seq_buf_terminate - Make sure buffer is nul terminated
- * @s: the seq_buf descriptor to terminate.
+ * seq_buf_str - get %NUL-terminated C string from seq_buf
+ * @s: the seq_buf handle
  *
  * This makes sure that the buffer in @s is nul terminated and
  * safe to read as a string.
@@ -81,16 +90,20 @@ static inline unsigned int seq_buf_used(struct seq_buf *s)
  *
  * After this function is called, s->buffer is safe to use
  * in string operations.
+ *
+ * Returns @s->buf after making sure it is terminated.
  */
-static inline void seq_buf_terminate(struct seq_buf *s)
+static inline const char *seq_buf_str(struct seq_buf *s)
 {
 	if (WARN_ON(s->size == 0))
-		return;
+		return "";
 
 	if (seq_buf_buffer_left(s))
 		s->buffer[s->len] = 0;
 	else
 		s->buffer[s->size - 1] = 0;
+
+	return s->buffer;
 }
 
 /**
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index d629065c2383..2539cfc20a97 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -3828,15 +3828,6 @@ static bool trace_safe_str(struct trace_iterator *iter, const char *str,
 	return false;
 }
 
-static const char *show_buffer(struct trace_seq *s)
-{
-	struct seq_buf *seq = &s->seq;
-
-	seq_buf_terminate(seq);
-
-	return seq->buffer;
-}
-
 static DEFINE_STATIC_KEY_FALSE(trace_no_verify);
 
 static int test_can_verify_check(const char *fmt, ...)
@@ -3976,7 +3967,7 @@ void trace_check_vprintf(struct trace_iterator *iter, const char *fmt,
 		 */
 		if (WARN_ONCE(!trace_safe_str(iter, str, star, len),
 			      "fmt: '%s' current_buffer: '%s'",
-			      fmt, show_buffer(&iter->seq))) {
+			      fmt, seq_buf_str(&iter->seq.seq))) {
 			int ret;
 
 			/* Try to safely read the string */
diff --git a/lib/seq_buf.c b/lib/seq_buf.c
index b7477aefff53..23518f77ea9c 100644
--- a/lib/seq_buf.c
+++ b/lib/seq_buf.c
@@ -109,9 +109,7 @@ void seq_buf_do_printk(struct seq_buf *s, const char *lvl)
 	if (s->size == 0 || s->len == 0)
 		return;
 
-	seq_buf_terminate(s);
-
-	start = s->buffer;
+	start = seq_buf_str(s);
 	while ((lf = strchr(start, '\n'))) {
 		int len = lf - start + 1;
 
-- 
cgit v1.2.3


From 2bce6368c46b835a133f7f4946eea9c4513828dd Mon Sep 17 00:00:00 2001
From: Kent Overstreet <kent.overstreet@linux.dev>
Date: Tue, 24 Oct 2023 14:46:58 -0400
Subject: closures: Better memory barriers

atomic_(dec|sub)_return_release() are a thing now - use them.

Also, delete the useless barrier in set_closure_fn(): it's redundant
with the memory barrier in closure_put(0.

Since closure_put() would now otherwise just have a release barrier, we
also need a new barrier when the ref hits 0 -
smp_acquire__after_ctrl_dep().

Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
---
 include/linux/closure.h | 2 --
 lib/closure.c           | 6 ++++--
 2 files changed, 4 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/closure.h b/include/linux/closure.h
index 722a586bb224..bdab17050bc8 100644
--- a/include/linux/closure.h
+++ b/include/linux/closure.h
@@ -233,8 +233,6 @@ static inline void set_closure_fn(struct closure *cl, closure_fn *fn,
 	closure_set_ip(cl);
 	cl->fn = fn;
 	cl->wq = wq;
-	/* between atomic_dec() in closure_put() */
-	smp_mb__before_atomic();
 }
 
 static inline void closure_queue(struct closure *cl)
diff --git a/lib/closure.c b/lib/closure.c
index 0855e698ced1..501dfa277b59 100644
--- a/lib/closure.c
+++ b/lib/closure.c
@@ -21,6 +21,8 @@ static inline void closure_put_after_sub(struct closure *cl, int flags)
 	BUG_ON(!r && (flags & ~CLOSURE_DESTRUCTOR));
 
 	if (!r) {
+		smp_acquire__after_ctrl_dep();
+
 		if (cl->fn && !(flags & CLOSURE_DESTRUCTOR)) {
 			atomic_set(&cl->remaining,
 				   CLOSURE_REMAINING_INITIALIZER);
@@ -43,7 +45,7 @@ static inline void closure_put_after_sub(struct closure *cl, int flags)
 /* For clearing flags with the same atomic op as a put */
 void closure_sub(struct closure *cl, int v)
 {
-	closure_put_after_sub(cl, atomic_sub_return(v, &cl->remaining));
+	closure_put_after_sub(cl, atomic_sub_return_release(v, &cl->remaining));
 }
 EXPORT_SYMBOL(closure_sub);
 
@@ -52,7 +54,7 @@ EXPORT_SYMBOL(closure_sub);
  */
 void closure_put(struct closure *cl)
 {
-	closure_put_after_sub(cl, atomic_dec_return(&cl->remaining));
+	closure_put_after_sub(cl, atomic_dec_return_release(&cl->remaining));
 }
 EXPORT_SYMBOL(closure_put);
 
-- 
cgit v1.2.3


From ee526b88caaa4b4182144bf2576af2c3b1e9c759 Mon Sep 17 00:00:00 2001
From: Kent Overstreet <kent.overstreet@linux.dev>
Date: Tue, 24 Oct 2023 14:46:58 -0400
Subject: closures: Fix race in closure_sync()

As pointed out by Linus, closure_sync() was racy; we could skip blocking
immediately after a get() and a put(), but then that would skip any
barrier corresponding to the other thread's put() barrier.

To fix this, always do the full __closure_sync() sequence whenever any
get() has happened and the closure might have been used by other
threads.

Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
---
 fs/bcachefs/fs-io-direct.c |  1 +
 include/linux/closure.h    | 10 +++++++++-
 lib/closure.c              |  3 +++
 3 files changed, 13 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/fs/bcachefs/fs-io-direct.c b/fs/bcachefs/fs-io-direct.c
index 6a9557e7ecab..5b42a76c4796 100644
--- a/fs/bcachefs/fs-io-direct.c
+++ b/fs/bcachefs/fs-io-direct.c
@@ -113,6 +113,7 @@ static int bch2_direct_IO_read(struct kiocb *req, struct iov_iter *iter)
 	} else {
 		atomic_set(&dio->cl.remaining,
 			   CLOSURE_REMAINING_INITIALIZER + 1);
+		dio->cl.closure_get_happened = true;
 	}
 
 	dio->req	= req;
diff --git a/include/linux/closure.h b/include/linux/closure.h
index bdab17050bc8..de7bb47d8a46 100644
--- a/include/linux/closure.h
+++ b/include/linux/closure.h
@@ -154,6 +154,7 @@ struct closure {
 	struct closure		*parent;
 
 	atomic_t		remaining;
+	bool			closure_get_happened;
 
 #ifdef CONFIG_DEBUG_CLOSURES
 #define CLOSURE_MAGIC_DEAD	0xc054dead
@@ -185,7 +186,11 @@ static inline unsigned closure_nr_remaining(struct closure *cl)
  */
 static inline void closure_sync(struct closure *cl)
 {
-	if (closure_nr_remaining(cl) != 1)
+#ifdef CONFIG_DEBUG_CLOSURES
+	BUG_ON(closure_nr_remaining(cl) != 1 && !cl->closure_get_happened);
+#endif
+
+	if (cl->closure_get_happened)
 		__closure_sync(cl);
 }
 
@@ -257,6 +262,8 @@ static inline void closure_queue(struct closure *cl)
  */
 static inline void closure_get(struct closure *cl)
 {
+	cl->closure_get_happened = true;
+
 #ifdef CONFIG_DEBUG_CLOSURES
 	BUG_ON((atomic_inc_return(&cl->remaining) &
 		CLOSURE_REMAINING_MASK) <= 1);
@@ -279,6 +286,7 @@ static inline void closure_init(struct closure *cl, struct closure *parent)
 		closure_get(parent);
 
 	atomic_set(&cl->remaining, CLOSURE_REMAINING_INITIALIZER);
+	cl->closure_get_happened = false;
 
 	closure_debug_create(cl);
 	closure_set_ip(cl);
diff --git a/lib/closure.c b/lib/closure.c
index 501dfa277b59..f86c9eeafb35 100644
--- a/lib/closure.c
+++ b/lib/closure.c
@@ -23,6 +23,8 @@ static inline void closure_put_after_sub(struct closure *cl, int flags)
 	if (!r) {
 		smp_acquire__after_ctrl_dep();
 
+		cl->closure_get_happened = false;
+
 		if (cl->fn && !(flags & CLOSURE_DESTRUCTOR)) {
 			atomic_set(&cl->remaining,
 				   CLOSURE_REMAINING_INITIALIZER);
@@ -92,6 +94,7 @@ bool closure_wait(struct closure_waitlist *waitlist, struct closure *cl)
 	if (atomic_read(&cl->remaining) & CLOSURE_WAITING)
 		return false;
 
+	cl->closure_get_happened = true;
 	closure_set_waiting(cl, _RET_IP_);
 	atomic_add(CLOSURE_WAITING + 1, &cl->remaining);
 	llist_add(&cl->list, &waitlist->list);
-- 
cgit v1.2.3


From 0da611a8702101814257a7c03f6caf0574c83b98 Mon Sep 17 00:00:00 2001
From: Christian König <christian.koenig@amd.com>
Date: Fri, 8 Sep 2023 10:27:23 +0200
Subject: dma-buf: add dma_fence_timestamp helper
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

When a fence signals there is a very small race window where the timestamp
isn't updated yet. sync_file solves this by busy waiting for the
timestamp to appear, but on other ocassions didn't handled this
correctly.

Provide a dma_fence_timestamp() helper function for this and use it in
all appropriate cases.

Another alternative would be to grab the spinlock when that happens.

v2 by teddy: add a wait parameter to wait for the timestamp to show up, in case
   the accurate timestamp is needed and/or the timestamp is not based on
   ktime (e.g. hw timestamp)
v3 chk: drop the parameter again for unified handling

Signed-off-by: Yunxiang Li <Yunxiang.Li@amd.com>
Signed-off-by: Christian König <christian.koenig@amd.com>
Fixes: 1774baa64f93 ("drm/scheduler: Change scheduled fence track v2")
Reviewed-by: Alex Deucher <alexander.deucher@amd.com>
CC: stable@vger.kernel.org
Link: https://patchwork.freedesktop.org/patch/msgid/20230929104725.2358-1-christian.koenig@amd.com
---
 drivers/dma-buf/dma-fence-unwrap.c     | 13 ++++---------
 drivers/dma-buf/sync_file.c            |  9 +++------
 drivers/gpu/drm/scheduler/sched_main.c |  2 +-
 include/linux/dma-fence.h              | 19 +++++++++++++++++++
 4 files changed, 27 insertions(+), 16 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/dma-buf/dma-fence-unwrap.c b/drivers/dma-buf/dma-fence-unwrap.c
index c625bb2b5d56..628af51c81af 100644
--- a/drivers/dma-buf/dma-fence-unwrap.c
+++ b/drivers/dma-buf/dma-fence-unwrap.c
@@ -76,16 +76,11 @@ struct dma_fence *__dma_fence_unwrap_merge(unsigned int num_fences,
 		dma_fence_unwrap_for_each(tmp, &iter[i], fences[i]) {
 			if (!dma_fence_is_signaled(tmp)) {
 				++count;
-			} else if (test_bit(DMA_FENCE_FLAG_TIMESTAMP_BIT,
-					    &tmp->flags)) {
-				if (ktime_after(tmp->timestamp, timestamp))
-					timestamp = tmp->timestamp;
 			} else {
-				/*
-				 * Use the current time if the fence is
-				 * currently signaling.
-				 */
-				timestamp = ktime_get();
+				ktime_t t = dma_fence_timestamp(tmp);
+
+				if (ktime_after(t, timestamp))
+					timestamp = t;
 			}
 		}
 	}
diff --git a/drivers/dma-buf/sync_file.c b/drivers/dma-buf/sync_file.c
index af57799c86ce..2e9a316c596a 100644
--- a/drivers/dma-buf/sync_file.c
+++ b/drivers/dma-buf/sync_file.c
@@ -268,13 +268,10 @@ static int sync_fill_fence_info(struct dma_fence *fence,
 		sizeof(info->driver_name));
 
 	info->status = dma_fence_get_status(fence);
-	while (test_bit(DMA_FENCE_FLAG_SIGNALED_BIT, &fence->flags) &&
-	       !test_bit(DMA_FENCE_FLAG_TIMESTAMP_BIT, &fence->flags))
-		cpu_relax();
 	info->timestamp_ns =
-		test_bit(DMA_FENCE_FLAG_TIMESTAMP_BIT, &fence->flags) ?
-		ktime_to_ns(fence->timestamp) :
-		ktime_set(0, 0);
+		dma_fence_is_signaled(fence) ?
+			ktime_to_ns(dma_fence_timestamp(fence)) :
+			ktime_set(0, 0);
 
 	return info->status;
 }
diff --git a/drivers/gpu/drm/scheduler/sched_main.c b/drivers/gpu/drm/scheduler/sched_main.c
index fd755e953487..99797a8c836a 100644
--- a/drivers/gpu/drm/scheduler/sched_main.c
+++ b/drivers/gpu/drm/scheduler/sched_main.c
@@ -935,7 +935,7 @@ drm_sched_get_cleanup_job(struct drm_gpu_scheduler *sched)
 
 		if (next) {
 			next->s_fence->scheduled.timestamp =
-				job->s_fence->finished.timestamp;
+				dma_fence_timestamp(&job->s_fence->finished);
 			/* start TO timer for next job */
 			drm_sched_start_timeout(sched);
 		}
diff --git a/include/linux/dma-fence.h b/include/linux/dma-fence.h
index 0d678e9a7b24..ebe78bd3d121 100644
--- a/include/linux/dma-fence.h
+++ b/include/linux/dma-fence.h
@@ -568,6 +568,25 @@ static inline void dma_fence_set_error(struct dma_fence *fence,
 	fence->error = error;
 }
 
+/**
+ * dma_fence_timestamp - helper to get the completion timestamp of a fence
+ * @fence: fence to get the timestamp from.
+ *
+ * After a fence is signaled the timestamp is updated with the signaling time,
+ * but setting the timestamp can race with tasks waiting for the signaling. This
+ * helper busy waits for the correct timestamp to appear.
+ */
+static inline ktime_t dma_fence_timestamp(struct dma_fence *fence)
+{
+	if (WARN_ON(!test_bit(DMA_FENCE_FLAG_SIGNALED_BIT, &fence->flags)))
+		return ktime_get();
+
+	while (!test_bit(DMA_FENCE_FLAG_TIMESTAMP_BIT, &fence->flags))
+		cpu_relax();
+
+	return fence->timestamp;
+}
+
 signed long dma_fence_wait_timeout(struct dma_fence *,
 				   bool intr, signed long timeout);
 signed long dma_fence_wait_any_timeout(struct dma_fence **fences,
-- 
cgit v1.2.3


From 3c70342f1f0045dc827bb2f02d814ce31e0e0d05 Mon Sep 17 00:00:00 2001
From: Michał Mirosław <mirq-linux@rere.qmqm.pl>
Date: Mon, 28 Aug 2023 22:16:11 +0200
Subject: mfd: core: Un-constify mfd_cell.of_reg
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Enable dynamically filling in the whole mfd_cell structure. All other
fields already allow that.

Fixes: 466a62d7642f ("mfd: core: Make a best effort attempt to match devices with the correct of_nodes")
Signed-off-by: Michał Mirosław <mirq-linux@rere.qmqm.pl>
Link: https://lore.kernel.org/r/b73fe4bc4bd6ba1af90940a640ed65fe254c0408.1693253717.git.mirq-linux@rere.qmqm.pl
Signed-off-by: Lee Jones <lee@kernel.org>
---
 include/linux/mfd/core.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/mfd/core.h b/include/linux/mfd/core.h
index 47e7a3a61ce6..e8bcad641d8c 100644
--- a/include/linux/mfd/core.h
+++ b/include/linux/mfd/core.h
@@ -92,7 +92,7 @@ struct mfd_cell {
 	 * (above) when matching OF nodes with devices that have identical
 	 * compatible strings
 	 */
-	const u64 of_reg;
+	u64 of_reg;
 
 	/* Set to 'true' to use 'of_reg' (above) - allows for of_reg=0 */
 	bool use_of_reg;
-- 
cgit v1.2.3


From 89b00c328f5623c428e6aa8563c3a648f8190252 Mon Sep 17 00:00:00 2001
From: Ying Sun <sunying@nj.iscas.ac.cn>
Date: Wed, 13 Sep 2023 16:45:59 +0800
Subject: mfd: ab8500: Remove non-existent configuration "#ifdef
 CONFIG_AB8500_DEBUG"

The CONFIG_AB8500_DEBUG has been deleted in:

  3d4d1266597c0 ("mfd: ab8500: Drop debugfs module")

The condition "#ifdef CONFIG_AB8500_DEBUG" in:

   include/linux/mfd/abx500/ab8500.h:502

...cannot be valid. It is recommended to delete redundant code.

Suggested-by: Yanjie Ren <renyanjie01@gmail.com>
Signed-off-by: Ying Sun <sunying@nj.iscas.ac.cn>
Link: https://lore.kernel.org/r/20230913084559.18141-1-sunying@nj.iscas.ac.cn
Signed-off-by: Lee Jones <lee@kernel.org>
---
 include/linux/mfd/abx500/ab8500.h | 6 ------
 1 file changed, 6 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mfd/abx500/ab8500.h b/include/linux/mfd/abx500/ab8500.h
index 09fb3c56e7d7..76d326ea8eba 100644
--- a/include/linux/mfd/abx500/ab8500.h
+++ b/include/linux/mfd/abx500/ab8500.h
@@ -499,13 +499,7 @@ static inline int is_ab9540_2p0_or_earlier(struct ab8500 *ab)
 
 void ab8500_override_turn_on_stat(u8 mask, u8 set);
 
-#ifdef CONFIG_AB8500_DEBUG
-extern int prcmu_abb_read(u8 slave, u8 reg, u8 *value, u8 size);
-void ab8500_dump_all_banks(struct device *dev);
-void ab8500_debug_register_interrupt(int line);
-#else
 static inline void ab8500_dump_all_banks(struct device *dev) {}
 static inline void ab8500_debug_register_interrupt(int line) {}
-#endif
 
 #endif /* MFD_AB8500_H */
-- 
cgit v1.2.3


From 1c7ea43fc42b09080657c83683846c4be672bd91 Mon Sep 17 00:00:00 2001
From: Fabrice Gasnier <fabrice.gasnier@foss.st.com>
Date: Tue, 29 Aug 2023 15:40:25 +0200
Subject: mfd: stm32-timers: Add support for interrupts

There are two types of STM32 timers that may have:

- a global interrupt line
- 4 dedicated interrupt lines.

Those interrupts are optional as defined in the dt-bindings. Enforce checks
on either one, four or no interrupts are provided with their names.
Optionally get them here, to be used by child devices.

Signed-off-by: Fabrice Gasnier <fabrice.gasnier@foss.st.com>
Link: https://lore.kernel.org/r/20230829134029.2402868-5-fabrice.gasnier@foss.st.com
Signed-off-by: Lee Jones <lee@kernel.org>
---
 drivers/mfd/stm32-timers.c       | 46 ++++++++++++++++++++++++++++++++++++++++
 include/linux/mfd/stm32-timers.h | 11 ++++++++++
 2 files changed, 57 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/mfd/stm32-timers.c b/drivers/mfd/stm32-timers.c
index 732a28db80fa..a656a1c186a8 100644
--- a/drivers/mfd/stm32-timers.c
+++ b/drivers/mfd/stm32-timers.c
@@ -215,6 +215,48 @@ static void stm32_timers_dma_remove(struct device *dev,
 			dma_release_channel(ddata->dma.chans[i]);
 }
 
+static const char * const stm32_timers_irq_name[STM32_TIMERS_MAX_IRQS] = {
+	"brk", "up", "trg-com", "cc"
+};
+
+static int stm32_timers_irq_probe(struct platform_device *pdev,
+				  struct stm32_timers *ddata)
+{
+	int i, ret;
+
+	/*
+	 * STM32 Timer may have either:
+	 * - a unique global interrupt line
+	 * - four dedicated interrupt lines that may be handled separately.
+	 * Optionally get them here, to be used by child devices.
+	 */
+	ret = platform_get_irq_byname_optional(pdev, "global");
+	if (ret < 0 && ret != -ENXIO) {
+		return ret;
+	} else if (ret != -ENXIO) {
+		ddata->irq[STM32_TIMERS_IRQ_GLOBAL_BRK] = ret;
+		ddata->nr_irqs = 1;
+		return 0;
+	}
+
+	for (i = 0; i < STM32_TIMERS_MAX_IRQS; i++) {
+		ret = platform_get_irq_byname_optional(pdev, stm32_timers_irq_name[i]);
+		if (ret < 0 && ret != -ENXIO) {
+			return ret;
+		} else if (ret != -ENXIO) {
+			ddata->irq[i] = ret;
+			ddata->nr_irqs++;
+		}
+	}
+
+	if (ddata->nr_irqs && ddata->nr_irqs != STM32_TIMERS_MAX_IRQS) {
+		dev_err(&pdev->dev, "Invalid number of IRQs %d\n", ddata->nr_irqs);
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
 static int stm32_timers_probe(struct platform_device *pdev)
 {
 	struct device *dev = &pdev->dev;
@@ -245,6 +287,10 @@ static int stm32_timers_probe(struct platform_device *pdev)
 
 	stm32_timers_get_arr_size(ddata);
 
+	ret = stm32_timers_irq_probe(pdev, ddata);
+	if (ret)
+		return ret;
+
 	ret = stm32_timers_dma_probe(dev, ddata);
 	if (ret) {
 		stm32_timers_dma_remove(dev, ddata);
diff --git a/include/linux/mfd/stm32-timers.h b/include/linux/mfd/stm32-timers.h
index 1b94325febb3..ca35af30745f 100644
--- a/include/linux/mfd/stm32-timers.h
+++ b/include/linux/mfd/stm32-timers.h
@@ -102,6 +102,15 @@ enum stm32_timers_dmas {
 	STM32_TIMERS_MAX_DMAS,
 };
 
+/* STM32 Timer may have either a unique global interrupt or 4 interrupt lines */
+enum stm32_timers_irqs {
+	STM32_TIMERS_IRQ_GLOBAL_BRK, /* global or brk IRQ */
+	STM32_TIMERS_IRQ_UP,
+	STM32_TIMERS_IRQ_TRG_COM,
+	STM32_TIMERS_IRQ_CC,
+	STM32_TIMERS_MAX_IRQS,
+};
+
 /**
  * struct stm32_timers_dma - STM32 timer DMA handling.
  * @completion:		end of DMA transfer completion
@@ -123,6 +132,8 @@ struct stm32_timers {
 	struct regmap *regmap;
 	u32 max_arr;
 	struct stm32_timers_dma dma; /* Only to be used by the parent */
+	unsigned int nr_irqs;
+	int irq[STM32_TIMERS_MAX_IRQS];
 };
 
 #if IS_REACHABLE(CONFIG_MFD_STM32_TIMERS)
-- 
cgit v1.2.3


From 16c4c1bb7e01d5de797c43d7a2396434f2078b0a Mon Sep 17 00:00:00 2001
From: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Date: Tue, 26 Sep 2023 22:08:32 +0300
Subject: mfd: lpc_ich: Convert gpio_version to be enum

We have an anonymous enum for the GPIO versions. Make it named
and use this type for the gpio_version member of struct lpc_ich_info.

Signed-off-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Link: https://lore.kernel.org/r/20230926190834.932233-2-andriy.shevchenko@linux.intel.com
Signed-off-by: Lee Jones <lee@kernel.org>
---
 include/linux/mfd/lpc_ich.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mfd/lpc_ich.h b/include/linux/mfd/lpc_ich.h
index ea4a4b1b246a..83621e0ccf33 100644
--- a/include/linux/mfd/lpc_ich.h
+++ b/include/linux/mfd/lpc_ich.h
@@ -15,7 +15,7 @@
 #define ICH_RES_GPE0	1
 
 /* GPIO compatibility */
-enum {
+enum lpc_gpio_versions {
 	ICH_I3100_GPIO,
 	ICH_V5_GPIO,
 	ICH_V6_GPIO,
@@ -29,7 +29,7 @@ enum {
 struct lpc_ich_info {
 	char name[32];
 	unsigned int iTCO_version;
-	unsigned int gpio_version;
+	enum lpc_gpio_versions gpio_version;
 	enum intel_spi_type spi_type;
 	u8 use_gpio;
 };
-- 
cgit v1.2.3


From 1f84f88dc15036d63fa7c84833a440370f22a74b Mon Sep 17 00:00:00 2001
From: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Date: Tue, 26 Sep 2023 22:08:33 +0300
Subject: mfd: lpc_ich: Move APL GPIO resources to a custom structure

We are expecting more platforms that want to instantiate
the GPIO device via P2SB. For them prepare the custom structure
and move Apollo Lake data there. Refactor the code accordingly.

Signed-off-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Link: https://lore.kernel.org/r/20230926190834.932233-3-andriy.shevchenko@linux.intel.com
Signed-off-by: Lee Jones <lee@kernel.org>
---
 drivers/mfd/lpc_ich.c       | 52 +++++++++++++++++++++++++++++++++++----------
 include/linux/mfd/lpc_ich.h |  3 +++
 2 files changed, 44 insertions(+), 11 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/mfd/lpc_ich.c b/drivers/mfd/lpc_ich.c
index 58da6c95c462..8b21d0f629c8 100644
--- a/drivers/mfd/lpc_ich.c
+++ b/drivers/mfd/lpc_ich.c
@@ -131,22 +131,33 @@ static struct mfd_cell lpc_ich_gpio_cell = {
 	.ignore_resource_conflicts = true,
 };
 
+#define INTEL_GPIO_RESOURCE_SIZE	0x1000
+
+struct lpc_ich_gpio_info {
+	const char *hid;
+	const struct mfd_cell *devices;
+	size_t nr_devices;
+	struct resource **resources;
+	size_t nr_resources;
+	resource_size_t *offsets;
+};
+
 #define APL_GPIO_NORTH		0
 #define APL_GPIO_NORTHWEST	1
 #define APL_GPIO_WEST		2
 #define APL_GPIO_SOUTHWEST	3
+
 #define APL_GPIO_NR_DEVICES	4
+#define APL_GPIO_NR_RESOURCES	4
 
 /* Offset data for Apollo Lake GPIO controllers */
-static resource_size_t apl_gpio_offsets[APL_GPIO_NR_DEVICES] = {
+static resource_size_t apl_gpio_offsets[APL_GPIO_NR_RESOURCES] = {
 	[APL_GPIO_NORTH]	= 0xc50000,
 	[APL_GPIO_NORTHWEST]	= 0xc40000,
 	[APL_GPIO_WEST]		= 0xc70000,
 	[APL_GPIO_SOUTHWEST]	= 0xc00000,
 };
 
-#define APL_GPIO_RESOURCE_SIZE		0x1000
-
 #define APL_GPIO_IRQ			14
 
 static struct resource apl_gpio_resources[APL_GPIO_NR_DEVICES][2] = {
@@ -168,6 +179,13 @@ static struct resource apl_gpio_resources[APL_GPIO_NR_DEVICES][2] = {
 	},
 };
 
+static struct resource *apl_gpio_mem_resources[APL_GPIO_NR_RESOURCES] = {
+	[APL_GPIO_NORTH] = &apl_gpio_resources[APL_GPIO_NORTH][0],
+	[APL_GPIO_NORTHWEST] = &apl_gpio_resources[APL_GPIO_NORTHWEST][0],
+	[APL_GPIO_WEST] = &apl_gpio_resources[APL_GPIO_WEST][0],
+	[APL_GPIO_SOUTHWEST] = &apl_gpio_resources[APL_GPIO_SOUTHWEST][0],
+};
+
 static const struct mfd_cell apl_gpio_devices[APL_GPIO_NR_DEVICES] = {
 	[APL_GPIO_NORTH] = {
 		.name = "apollolake-pinctrl",
@@ -199,6 +217,15 @@ static const struct mfd_cell apl_gpio_devices[APL_GPIO_NR_DEVICES] = {
 	},
 };
 
+static const struct lpc_ich_gpio_info apl_gpio_info = {
+	.hid = "INT3452",
+	.devices = apl_gpio_devices,
+	.nr_devices = ARRAY_SIZE(apl_gpio_devices),
+	.resources = apl_gpio_mem_resources,
+	.nr_resources = ARRAY_SIZE(apl_gpio_mem_resources),
+	.offsets = apl_gpio_offsets,
+};
+
 static struct mfd_cell lpc_ich_spi_cell = {
 	.name = "intel-spi",
 	.num_resources = ARRAY_SIZE(intel_spi_res),
@@ -618,6 +645,7 @@ static struct lpc_ich_info lpc_chipset_info[] = {
 	[LPC_APL] = {
 		.name = "Apollo Lake SoC",
 		.iTCO_version = 5,
+		.gpio_info = &apl_gpio_info,
 		.spi_type = INTEL_SPI_BXT,
 	},
 	[LPC_GLK] = {
@@ -1156,30 +1184,32 @@ wdt_done:
 
 static int lpc_ich_init_pinctrl(struct pci_dev *dev)
 {
+	struct lpc_ich_priv *priv = pci_get_drvdata(dev);
+	const struct lpc_ich_gpio_info *info = lpc_chipset_info[priv->chipset].gpio_info;
 	struct resource base;
 	unsigned int i;
 	int ret;
 
 	/* Check, if GPIO has been exported as an ACPI device */
-	if (acpi_dev_present("INT3452", NULL, -1))
+	if (acpi_dev_present(info->hid, NULL, -1))
 		return -EEXIST;
 
 	ret = p2sb_bar(dev->bus, 0, &base);
 	if (ret)
 		return ret;
 
-	for (i = 0; i < ARRAY_SIZE(apl_gpio_devices); i++) {
-		struct resource *mem = &apl_gpio_resources[i][0];
-		resource_size_t offset = apl_gpio_offsets[i];
+	for (i = 0; i < info->nr_resources; i++) {
+		struct resource *mem = info->resources[i];
+		resource_size_t offset = info->offsets[i];
 
 		/* Fill MEM resource */
 		mem->start = base.start + offset;
-		mem->end = base.start + offset + APL_GPIO_RESOURCE_SIZE - 1;
+		mem->end = base.start + offset + INTEL_GPIO_RESOURCE_SIZE - 1;
 		mem->flags = base.flags;
 	}
 
-	return mfd_add_devices(&dev->dev, 0, apl_gpio_devices,
-			       ARRAY_SIZE(apl_gpio_devices), NULL, 0, NULL);
+	return mfd_add_devices(&dev->dev, 0, info->devices, info->nr_devices,
+			       NULL, 0, NULL);
 }
 
 static bool lpc_ich_byt_set_writeable(void __iomem *base, void *data)
@@ -1332,7 +1362,7 @@ static int lpc_ich_probe(struct pci_dev *dev,
 			cell_added = true;
 	}
 
-	if (priv->chipset == LPC_APL) {
+	if (lpc_chipset_info[priv->chipset].gpio_info) {
 		ret = lpc_ich_init_pinctrl(dev);
 		if (!ret)
 			cell_added = true;
diff --git a/include/linux/mfd/lpc_ich.h b/include/linux/mfd/lpc_ich.h
index 83621e0ccf33..1fbda1f8967d 100644
--- a/include/linux/mfd/lpc_ich.h
+++ b/include/linux/mfd/lpc_ich.h
@@ -26,11 +26,14 @@ enum lpc_gpio_versions {
 	AVOTON_GPIO,
 };
 
+struct lpc_ich_gpio_info;
+
 struct lpc_ich_info {
 	char name[32];
 	unsigned int iTCO_version;
 	enum lpc_gpio_versions gpio_version;
 	enum intel_spi_type spi_type;
+	const struct lpc_ich_gpio_info *gpio_info;
 	u8 use_gpio;
 };
 
-- 
cgit v1.2.3


From 272f99edab36974b58347e97ee105885e385fa88 Mon Sep 17 00:00:00 2001
From: Johan Hovold <johan+linaro@kernel.org>
Date: Tue, 3 Oct 2023 17:29:27 +0200
Subject: spmi: rename spmi device lookup helper

Rename the SPMI device helper which is used to lookup a device from its
OF node as spmi_find_device_by_of_node() so that it reflects the
implementation and matches how other helpers like this are named.

This will specifically make it more clear that this is a lookup function
which returns a reference counted structure.

Signed-off-by: Johan Hovold <johan+linaro@kernel.org>
Acked-by: Stephen Boyd <sboyd@kernel.org>
Link: https://lore.kernel.org/r/20231003152927.15000-6-johan+linaro@kernel.org
Signed-off-by: Lee Jones <lee@kernel.org>
---
 drivers/mfd/qcom-spmi-pmic.c | 2 +-
 drivers/spmi/spmi.c          | 6 +++---
 include/linux/spmi.h         | 2 +-
 3 files changed, 5 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/mfd/qcom-spmi-pmic.c b/drivers/mfd/qcom-spmi-pmic.c
index 203b1e27a1ae..4549fa9f7d4b 100644
--- a/drivers/mfd/qcom-spmi-pmic.c
+++ b/drivers/mfd/qcom-spmi-pmic.c
@@ -116,7 +116,7 @@ static struct spmi_device *qcom_pmic_get_base_usid(struct spmi_device *sdev, str
 		}
 
 		if (pmic_addr == function_parent_usid - (ctx->num_usids - 1)) {
-			sdev = spmi_device_from_of(child);
+			sdev = spmi_find_device_by_of_node(child);
 			if (!sdev) {
 				/*
 				 * If the base USID for this PMIC hasn't been
diff --git a/drivers/spmi/spmi.c b/drivers/spmi/spmi.c
index ca2fd4d72fa6..93cd4a34debc 100644
--- a/drivers/spmi/spmi.c
+++ b/drivers/spmi/spmi.c
@@ -388,7 +388,7 @@ static struct bus_type spmi_bus_type = {
 };
 
 /**
- * spmi_device_from_of() - get the associated SPMI device from a device node
+ * spmi_find_device_by_of_node() - look up an SPMI device from a device node
  *
  * @np:		device node
  *
@@ -397,7 +397,7 @@ static struct bus_type spmi_bus_type = {
  *
  * Returns the struct spmi_device associated with a device node or NULL.
  */
-struct spmi_device *spmi_device_from_of(struct device_node *np)
+struct spmi_device *spmi_find_device_by_of_node(struct device_node *np)
 {
 	struct device *dev = bus_find_device_by_of_node(&spmi_bus_type, np);
 
@@ -405,7 +405,7 @@ struct spmi_device *spmi_device_from_of(struct device_node *np)
 		return to_spmi_device(dev);
 	return NULL;
 }
-EXPORT_SYMBOL_GPL(spmi_device_from_of);
+EXPORT_SYMBOL_GPL(spmi_find_device_by_of_node);
 
 /**
  * spmi_device_alloc() - Allocate a new SPMI device
diff --git a/include/linux/spmi.h b/include/linux/spmi.h
index eac1956a8727..2a4ce4144f9f 100644
--- a/include/linux/spmi.h
+++ b/include/linux/spmi.h
@@ -166,7 +166,7 @@ static inline void spmi_driver_unregister(struct spmi_driver *sdrv)
 
 struct device_node;
 
-struct spmi_device *spmi_device_from_of(struct device_node *np);
+struct spmi_device *spmi_find_device_by_of_node(struct device_node *np);
 int spmi_register_read(struct spmi_device *sdev, u8 addr, u8 *buf);
 int spmi_ext_register_read(struct spmi_device *sdev, u8 addr, u8 *buf,
 			   size_t len);
-- 
cgit v1.2.3


From a72cac6067fdfde280ece0ec5c055659a8de6508 Mon Sep 17 00:00:00 2001
From: Si-Wei Liu <si-wei.liu@oracle.com>
Date: Wed, 18 Oct 2023 20:14:41 +0300
Subject: vdpa: introduce dedicated descriptor group for virtqueue
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

In some cases, the access to the virtqueue's descriptor area, device
and driver areas (precluding indirect descriptor table in guest memory)
may have to be confined to a different address space than where its
buffers reside. Without loss of simplicity and generality with already
established terminology, let's fold up these 3 areas and call them
as a whole as descriptor table group, or descriptor group for short.
Specifically, in case of split virtqueues, descriptor group consists of
regions for Descriptor Table, Available Ring and Used Ring; for packed
virtqueues layout, descriptor group contains Descriptor Ring, Driver
and Device Event Suppression structures.

The group ID for a dedicated descriptor group can be obtained through a
new .get_vq_desc_group() op. If driver implements this op, it means that
the descriptor, device and driver areas of the virtqueue may reside
in a dedicated group than where its buffers reside, a.k.a the default
virtqueue group through the .get_vq_group() op.

In principle, the descriptor group may or may not have same group ID
as the default group. Even if the descriptor group has a different ID,
meaning the vq's descriptor group areas can optionally move to a
separate address space than where guest memory resides, the descriptor
group may still start from a default address space, same as where its
buffers reside. To move the descriptor group to a different address
space, .set_group_asid() has to be called to change the ASID binding
for the group, which is no different than what needs to be done on any
other virtqueue group. On the other hand, the .reset() semantics also
applies on descriptor table group, meaning the device reset will clear
all ASID bindings and move all virtqueue groups including descriptor
group back to the default address space, i.e. in ASID 0.

QEMU's shadow virtqueue is going to utilize dedicated descriptor group
to speed up map and unmap operations, yielding tremendous downtime
reduction by avoiding the full and slow remap cycle in SVQ switching.

Signed-off-by: Si-Wei Liu <si-wei.liu@oracle.com>
Acked-by: Eugenio Pérez <eperezma@redhat.com>
Acked-by: Jason Wang <jasowang@redhat.com>
Message-Id: <20231018171456.1624030-4-dtatulea@nvidia.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
Reviewed-by: Si-Wei Liu <si-wei.liu@oracle.com>
Tested-by: Si-Wei Liu <si-wei.liu@oracle.com>
Tested-by: Lei Yang <leiyang@redhat.com>
---
 include/linux/vdpa.h | 11 +++++++++++
 1 file changed, 11 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/vdpa.h b/include/linux/vdpa.h
index 0e652026b776..d376309b99cf 100644
--- a/include/linux/vdpa.h
+++ b/include/linux/vdpa.h
@@ -204,6 +204,16 @@ struct vdpa_map_file {
  *				@vdev: vdpa device
  *				@idx: virtqueue index
  *				Returns u32: group id for this virtqueue
+ * @get_vq_desc_group:		Get the group id for the descriptor table of
+ *				a specific virtqueue (optional)
+ *				@vdev: vdpa device
+ *				@idx: virtqueue index
+ *				Returns u32: group id for the descriptor table
+ *				portion of this virtqueue. Could be different
+ *				than the one from @get_vq_group, in which case
+ *				the access to the descriptor table can be
+ *				confined to a separate asid, isolating from
+ *				the virtqueue's buffer address access.
  * @get_device_features:	Get virtio features supported by the device
  *				@vdev: vdpa device
  *				Returns the virtio features support by the
@@ -360,6 +370,7 @@ struct vdpa_config_ops {
 	/* Device ops */
 	u32 (*get_vq_align)(struct vdpa_device *vdev);
 	u32 (*get_vq_group)(struct vdpa_device *vdev, u16 idx);
+	u32 (*get_vq_desc_group)(struct vdpa_device *vdev, u16 idx);
 	u64 (*get_device_features)(struct vdpa_device *vdev);
 	u64 (*get_backend_features)(const struct vdpa_device *vdev);
 	int (*set_driver_features)(struct vdpa_device *vdev, u64 features);
-- 
cgit v1.2.3


From 03dd63c8fae4599439a30e0dde91061789260dbe Mon Sep 17 00:00:00 2001
From: Dragos Tatulea <dtatulea@nvidia.com>
Date: Wed, 18 Oct 2023 20:14:53 +0300
Subject: vdpa/mlx5: Enable hw support for vq descriptor mapping
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Vq descriptor mappings are supported in hardware by filling in an
additional mkey which contains the descriptor mappings to the hw vq.

A previous patch in this series added support for hw mkey (mr) creation
for ASID 1.

This patch fills in both the vq data and vq descriptor mkeys based on
group ASID mapping.

The feature is signaled to the vdpa core through the presence of the
.get_vq_desc_group op.

Acked-by: Jason Wang <jasowang@redhat.com>
Acked-by: Eugenio Pérez <eperezma@redhat.com>
Signed-off-by: Dragos Tatulea <dtatulea@nvidia.com>
Message-Id: <20231018171456.1624030-16-dtatulea@nvidia.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
Reviewed-by: Si-Wei Liu <si-wei.liu@oracle.com>
Tested-by: Si-Wei Liu <si-wei.liu@oracle.com>
Tested-by: Lei Yang <leiyang@redhat.com>
---
 drivers/vdpa/mlx5/net/mlx5_vnet.c  | 24 +++++++++++++++++++++++-
 include/linux/mlx5/mlx5_ifc_vdpa.h |  7 ++++++-
 2 files changed, 29 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/vdpa/mlx5/net/mlx5_vnet.c b/drivers/vdpa/mlx5/net/mlx5_vnet.c
index 804711140890..63aa6ec4ef88 100644
--- a/drivers/vdpa/mlx5/net/mlx5_vnet.c
+++ b/drivers/vdpa/mlx5/net/mlx5_vnet.c
@@ -863,6 +863,7 @@ static int create_virtqueue(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtque
 	u32 out[MLX5_ST_SZ_DW(create_virtio_net_q_out)] = {};
 	struct mlx5_vdpa_dev *mvdev = &ndev->mvdev;
 	struct mlx5_vdpa_mr *vq_mr;
+	struct mlx5_vdpa_mr *vq_desc_mr;
 	void *obj_context;
 	u16 mlx_features;
 	void *cmd_hdr;
@@ -918,6 +919,11 @@ static int create_virtqueue(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtque
 	vq_mr = mvdev->mr[mvdev->group2asid[MLX5_VDPA_DATAVQ_GROUP]];
 	if (vq_mr)
 		MLX5_SET(virtio_q, vq_ctx, virtio_q_mkey, vq_mr->mkey);
+
+	vq_desc_mr = mvdev->mr[mvdev->group2asid[MLX5_VDPA_DATAVQ_DESC_GROUP]];
+	if (vq_desc_mr && MLX5_CAP_DEV_VDPA_EMULATION(mvdev->mdev, desc_group_mkey_supported))
+		MLX5_SET(virtio_q, vq_ctx, desc_group_mkey, vq_desc_mr->mkey);
+
 	MLX5_SET(virtio_q, vq_ctx, umem_1_id, mvq->umem1.id);
 	MLX5_SET(virtio_q, vq_ctx, umem_1_size, mvq->umem1.size);
 	MLX5_SET(virtio_q, vq_ctx, umem_2_id, mvq->umem2.id);
@@ -2305,6 +2311,16 @@ static u32 mlx5_vdpa_get_vq_group(struct vdpa_device *vdev, u16 idx)
 	return MLX5_VDPA_DATAVQ_GROUP;
 }
 
+static u32 mlx5_vdpa_get_vq_desc_group(struct vdpa_device *vdev, u16 idx)
+{
+	struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev);
+
+	if (is_ctrl_vq_idx(mvdev, idx))
+		return MLX5_VDPA_CVQ_GROUP;
+
+	return MLX5_VDPA_DATAVQ_DESC_GROUP;
+}
+
 static u64 mlx_to_vritio_features(u16 dev_features)
 {
 	u64 result = 0;
@@ -3209,6 +3225,7 @@ static const struct vdpa_config_ops mlx5_vdpa_ops = {
 	.get_vq_irq = mlx5_get_vq_irq,
 	.get_vq_align = mlx5_vdpa_get_vq_align,
 	.get_vq_group = mlx5_vdpa_get_vq_group,
+	.get_vq_desc_group = mlx5_vdpa_get_vq_desc_group, /* Op disabled if not supported. */
 	.get_device_features = mlx5_vdpa_get_device_features,
 	.set_driver_features = mlx5_vdpa_set_driver_features,
 	.get_driver_features = mlx5_vdpa_get_driver_features,
@@ -3307,6 +3324,7 @@ struct mlx5_vdpa_mgmtdev {
 	struct vdpa_mgmt_dev mgtdev;
 	struct mlx5_adev *madev;
 	struct mlx5_vdpa_net *ndev;
+	struct vdpa_config_ops vdpa_ops;
 };
 
 static int config_func_mtu(struct mlx5_core_dev *mdev, u16 mtu)
@@ -3420,7 +3438,7 @@ static int mlx5_vdpa_dev_add(struct vdpa_mgmt_dev *v_mdev, const char *name,
 		max_vqs = 2;
 	}
 
-	ndev = vdpa_alloc_device(struct mlx5_vdpa_net, mvdev.vdev, mdev->device, &mlx5_vdpa_ops,
+	ndev = vdpa_alloc_device(struct mlx5_vdpa_net, mvdev.vdev, mdev->device, &mgtdev->vdpa_ops,
 				 MLX5_VDPA_NUMVQ_GROUPS, MLX5_VDPA_NUM_AS, name, false);
 	if (IS_ERR(ndev))
 		return PTR_ERR(ndev);
@@ -3593,6 +3611,10 @@ static int mlx5v_probe(struct auxiliary_device *adev,
 		MLX5_CAP_DEV_VDPA_EMULATION(mdev, max_num_virtio_queues) + 1;
 	mgtdev->mgtdev.supported_features = get_supported_features(mdev);
 	mgtdev->madev = madev;
+	mgtdev->vdpa_ops = mlx5_vdpa_ops;
+
+	if (!MLX5_CAP_DEV_VDPA_EMULATION(mdev, desc_group_mkey_supported))
+		mgtdev->vdpa_ops.get_vq_desc_group = NULL;
 
 	err = vdpa_mgmtdev_register(&mgtdev->mgtdev);
 	if (err)
diff --git a/include/linux/mlx5/mlx5_ifc_vdpa.h b/include/linux/mlx5/mlx5_ifc_vdpa.h
index 9becdc3fa503..b86d51a855f6 100644
--- a/include/linux/mlx5/mlx5_ifc_vdpa.h
+++ b/include/linux/mlx5/mlx5_ifc_vdpa.h
@@ -74,7 +74,11 @@ struct mlx5_ifc_virtio_q_bits {
 	u8    reserved_at_320[0x8];
 	u8    pd[0x18];
 
-	u8    reserved_at_340[0xc0];
+	u8    reserved_at_340[0x20];
+
+	u8    desc_group_mkey[0x20];
+
+	u8    reserved_at_380[0x80];
 };
 
 struct mlx5_ifc_virtio_net_q_object_bits {
@@ -141,6 +145,7 @@ enum {
 	MLX5_VIRTQ_MODIFY_MASK_STATE                    = (u64)1 << 0,
 	MLX5_VIRTQ_MODIFY_MASK_DIRTY_BITMAP_PARAMS      = (u64)1 << 3,
 	MLX5_VIRTQ_MODIFY_MASK_DIRTY_BITMAP_DUMP_ENABLE = (u64)1 << 4,
+	MLX5_VIRTQ_MODIFY_MASK_DESC_GROUP_MKEY          = (u64)1 << 14,
 };
 
 enum {
-- 
cgit v1.2.3


From a5d7df87843dd9025015e4038dd927e893cc8b8b Mon Sep 17 00:00:00 2001
From: Shannon Nelson <shannon.nelson@amd.com>
Date: Mon, 11 Sep 2023 14:31:04 -0700
Subject: virtio: kdoc for struct virtio_pci_modern_device
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Finally following up to Simon's suggestion for some kdoc attention
on struct virtio_pci_modern_device.

Link: https://lore.kernel.org/netdev/ZE%2FQS0lnUvxFacjf@corigine.com/
Cc: Simon Horman <simon.horman@corigine.com>
Signed-off-by: Shannon Nelson <shannon.nelson@amd.com>
Acked-by: Eugenio Pérez <eperezma@redhat.com>
Message-Id: <20230911213104.14391-1-shannon.nelson@amd.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
Acked-by: Jason Wang <jasowang@redhat.com>
---
 include/linux/virtio_pci_modern.h | 34 ++++++++++++++++++++++------------
 1 file changed, 22 insertions(+), 12 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/virtio_pci_modern.h b/include/linux/virtio_pci_modern.h
index 067ac1d789bc..a38c729d1973 100644
--- a/include/linux/virtio_pci_modern.h
+++ b/include/linux/virtio_pci_modern.h
@@ -12,37 +12,47 @@ struct virtio_pci_modern_common_cfg {
 	__le16 queue_reset;		/* read-write */
 };
 
+/**
+ * struct virtio_pci_modern_device - info for modern PCI virtio
+ * @pci_dev:	    Ptr to the PCI device struct
+ * @common:	    Position of the common capability in the PCI config
+ * @device:	    Device-specific data (non-legacy mode)
+ * @notify_base:    Base of vq notifications (non-legacy mode)
+ * @notify_pa:	    Physical base of vq notifications
+ * @isr:	    Where to read and clear interrupt
+ * @notify_len:	    So we can sanity-check accesses
+ * @device_len:	    So we can sanity-check accesses
+ * @notify_map_cap: Capability for when we need to map notifications per-vq
+ * @notify_offset_multiplier: Multiply queue_notify_off by this value
+ *                            (non-legacy mode).
+ * @modern_bars:    Bitmask of BARs
+ * @id:		    Device and vendor id
+ * @device_id_check: Callback defined before vp_modern_probe() to be used to
+ *		    verify the PCI device is a vendor's expected device rather
+ *		    than the standard virtio PCI device
+ *		    Returns the found device id or ERRNO
+ * @dma_mask:	    Optional mask instead of the traditional DMA_BIT_MASK(64),
+ *		    for vendor devices with DMA space address limitations
+ */
 struct virtio_pci_modern_device {
 	struct pci_dev *pci_dev;
 
 	struct virtio_pci_common_cfg __iomem *common;
-	/* Device-specific data (non-legacy mode)  */
 	void __iomem *device;
-	/* Base of vq notifications (non-legacy mode). */
 	void __iomem *notify_base;
-	/* Physical base of vq notifications */
 	resource_size_t notify_pa;
-	/* Where to read and clear interrupt */
 	u8 __iomem *isr;
 
-	/* So we can sanity-check accesses. */
 	size_t notify_len;
 	size_t device_len;
 
-	/* Capability for when we need to map notifications per-vq. */
 	int notify_map_cap;
 
-	/* Multiply queue_notify_off by this value. (non-legacy mode). */
 	u32 notify_offset_multiplier;
-
 	int modern_bars;
-
 	struct virtio_device_id id;
 
-	/* optional check for vendor virtio device, returns dev_id or -ERRNO */
 	int (*device_id_check)(struct pci_dev *pdev);
-
-	/* optional mask for devices with limited DMA space */
 	u64 dma_mask;
 };
 
-- 
cgit v1.2.3


From e0592acd1ef2497b861ef7ed6eda14b092b1e667 Mon Sep 17 00:00:00 2001
From: Xuan Zhuo <xuanzhuo@linux.alibaba.com>
Date: Thu, 19 Oct 2023 11:49:02 +0800
Subject: virtio_pci: add check for common cfg size

Some buggy devices, the common cfg size may not match the features.

This patch checks the common cfg size for the
features(VIRTIO_F_NOTIF_CONFIG_DATA, VIRTIO_F_RING_RESET). When the
common cfg size does not match the corresponding feature, we fail the
probe and print error message.

Signed-off-by: Xuan Zhuo <xuanzhuo@linux.alibaba.com>
Acked-by: Jason Wang <jasowang@redhat.com>
Message-Id: <20231019034902.7346-1-xuanzhuo@linux.alibaba.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
---
 drivers/virtio/virtio_pci_modern.c     | 36 ++++++++++++++++++++++++++++++++++
 drivers/virtio/virtio_pci_modern_dev.c |  2 +-
 include/linux/virtio_pci_modern.h      |  1 +
 3 files changed, 38 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/drivers/virtio/virtio_pci_modern.c b/drivers/virtio/virtio_pci_modern.c
index d6bb68ba84e5..ee6a386d250b 100644
--- a/drivers/virtio/virtio_pci_modern.c
+++ b/drivers/virtio/virtio_pci_modern.c
@@ -39,6 +39,39 @@ static void vp_transport_features(struct virtio_device *vdev, u64 features)
 		__virtio_set_bit(vdev, VIRTIO_F_RING_RESET);
 }
 
+static int __vp_check_common_size_one_feature(struct virtio_device *vdev, u32 fbit,
+					    u32 offset, const char *fname)
+{
+	struct virtio_pci_device *vp_dev = to_vp_device(vdev);
+
+	if (!__virtio_test_bit(vdev, fbit))
+		return 0;
+
+	if (likely(vp_dev->mdev.common_len >= offset))
+		return 0;
+
+	dev_err(&vdev->dev,
+		"virtio: common cfg size(%zu) does not match the feature %s\n",
+		vp_dev->mdev.common_len, fname);
+
+	return -EINVAL;
+}
+
+#define vp_check_common_size_one_feature(vdev, fbit, field) \
+	__vp_check_common_size_one_feature(vdev, fbit, \
+		offsetofend(struct virtio_pci_modern_common_cfg, field), #fbit)
+
+static int vp_check_common_size(struct virtio_device *vdev)
+{
+	if (vp_check_common_size_one_feature(vdev, VIRTIO_F_NOTIF_CONFIG_DATA, queue_notify_data))
+		return -EINVAL;
+
+	if (vp_check_common_size_one_feature(vdev, VIRTIO_F_RING_RESET, queue_reset))
+		return -EINVAL;
+
+	return 0;
+}
+
 /* virtio config->finalize_features() implementation */
 static int vp_finalize_features(struct virtio_device *vdev)
 {
@@ -57,6 +90,9 @@ static int vp_finalize_features(struct virtio_device *vdev)
 		return -EINVAL;
 	}
 
+	if (vp_check_common_size(vdev))
+		return -EINVAL;
+
 	vp_modern_set_features(&vp_dev->mdev, vdev->features);
 
 	return 0;
diff --git a/drivers/virtio/virtio_pci_modern_dev.c b/drivers/virtio/virtio_pci_modern_dev.c
index d312443d6569..e2a1fe7bb66c 100644
--- a/drivers/virtio/virtio_pci_modern_dev.c
+++ b/drivers/virtio/virtio_pci_modern_dev.c
@@ -296,7 +296,7 @@ int vp_modern_probe(struct virtio_pci_modern_device *mdev)
 	mdev->common = vp_modern_map_capability(mdev, common,
 				      sizeof(struct virtio_pci_common_cfg), 4,
 				      0, sizeof(struct virtio_pci_modern_common_cfg),
-				      NULL, NULL);
+				      &mdev->common_len, NULL);
 	if (!mdev->common)
 		goto err_map_common;
 	mdev->isr = vp_modern_map_capability(mdev, isr, sizeof(u8), 1,
diff --git a/include/linux/virtio_pci_modern.h b/include/linux/virtio_pci_modern.h
index a38c729d1973..d0f2797420f7 100644
--- a/include/linux/virtio_pci_modern.h
+++ b/include/linux/virtio_pci_modern.h
@@ -45,6 +45,7 @@ struct virtio_pci_modern_device {
 
 	size_t notify_len;
 	size_t device_len;
+	size_t common_len;
 
 	int notify_map_cap;
 
-- 
cgit v1.2.3


From d2cf1b6e3b85dcb2bb3e8cd7924beede34fbbf0e Mon Sep 17 00:00:00 2001
From: Si-Wei Liu <si-wei.liu@oracle.com>
Date: Sat, 21 Oct 2023 02:25:13 -0700
Subject: vdpa: introduce .reset_map operation callback
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Some device specific IOMMU parent drivers have long standing bogus
behavior that mistakenly clean up the maps during .reset. By definition,
this is violation to the on-chip IOMMU ops (i.e. .set_map, or .dma_map &
.dma_unmap) in those offending drivers, as the removal of internal maps
is completely agnostic to the upper layer, causing inconsistent view
between the userspace and the kernel. Some userspace app like QEMU gets
around of this brokenness by proactively removing and adding back all
the maps around vdpa device reset, but such workaround actually penalize
other well-behaved driver setup, where vdpa reset always comes with the
associated mapping cost, especially for kernel vDPA devices
(use_va=false) that have high cost on pinning. It's imperative to
rectify this behavior and remove the problematic code from all those
non-compliant parent drivers.

The reason why a separate .reset_map op is introduced is because this
allows a simple on-chip IOMMU model without exposing too much device
implementation detail to the upper vdpa layer. The .dma_map/unmap or
.set_map driver API is meant to be used to manipulate the IOTLB
mappings, and has been abstracted in a way similar to how a real IOMMU
device maps or unmaps pages for certain memory ranges. However, apart
from this there also exists other mapping needs, in which case 1:1
passthrough mapping has to be used by other users (read virtio-vdpa). To
ease parent/vendor driver implementation and to avoid abusing DMA ops in
an unexpacted way, these on-chip IOMMU devices can start with 1:1
passthrough mapping mode initially at the time of creation. Then the
.reset_map op can be used to switch iotlb back to this initial state
without having to expose a complex two-dimensional IOMMU device model.

The .reset_map is not a MUST for every parent that implements the
.dma_map or .set_map API, because device may work with DMA ops directly
by implement their own to manipulate system memory mappings, so don't
have to use .reset_map to achieve a simple IOMMU device model for 1:1
passthrough mapping.

Signed-off-by: Si-Wei Liu <si-wei.liu@oracle.com>
Acked-by: Eugenio Pérez <eperezma@redhat.com>
Acked-by: Jason Wang <jasowang@redhat.com>
Message-Id: <1697880319-4937-2-git-send-email-si-wei.liu@oracle.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
Tested-by: Lei Yang <leiyang@redhat.com>
---
 include/linux/vdpa.h | 10 ++++++++++
 1 file changed, 10 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/vdpa.h b/include/linux/vdpa.h
index d376309b99cf..26ae6ae1eac3 100644
--- a/include/linux/vdpa.h
+++ b/include/linux/vdpa.h
@@ -327,6 +327,15 @@ struct vdpa_map_file {
  *				@iova: iova to be unmapped
  *				@size: size of the area
  *				Returns integer: success (0) or error (< 0)
+ * @reset_map:			Reset device memory mapping to the default
+ *				state (optional)
+ *				Needed for devices that are using device
+ *				specific DMA translation and prefer mapping
+ *				to be decoupled from the virtio life cycle,
+ *				i.e. device .reset op does not reset mapping
+ *				@vdev: vdpa device
+ *				@asid: address space identifier
+ *				Returns integer: success (0) or error (< 0)
  * @get_vq_dma_dev:		Get the dma device for a specific
  *				virtqueue (optional)
  *				@vdev: vdpa device
@@ -405,6 +414,7 @@ struct vdpa_config_ops {
 		       u64 iova, u64 size, u64 pa, u32 perm, void *opaque);
 	int (*dma_unmap)(struct vdpa_device *vdev, unsigned int asid,
 			 u64 iova, u64 size);
+	int (*reset_map)(struct vdpa_device *vdev, unsigned int asid);
 	int (*set_group_asid)(struct vdpa_device *vdev, unsigned int group,
 			      unsigned int asid);
 	struct device *(*get_vq_dma_dev)(struct vdpa_device *vdev, u16 idx);
-- 
cgit v1.2.3


From a26f2e4e68ee3130e5d5acb4f58807041aaea905 Mon Sep 17 00:00:00 2001
From: Si-Wei Liu <si-wei.liu@oracle.com>
Date: Sat, 21 Oct 2023 02:25:16 -0700
Subject: vdpa: introduce .compat_reset operation callback

Some device specific IOMMU parent drivers have long standing bogus
behaviour that mistakenly clean up the maps during .reset. By
definition, this is violation to the on-chip IOMMU ops (i.e. .set_map,
or .dma_map & .dma_unmap) in those offending drivers, as the removal of
internal maps is completely agnostic to the upper layer, causing
inconsistent view between the userspace and the kernel. Some userspace
app like QEMU gets around of this brokenness by proactively removing and
adding back all the maps around vdpa device reset, but such workaround
actually penaltize other well-behaved driver setup, where vdpa reset
always comes with the associated mapping cost, especially for kernel
vDPA devices (use_va=false) that have high cost on pinning. It's
imperative to rectify this behaviour and remove the problematic code
from all those non-compliant parent drivers.

However, we cannot unconditionally remove the bogus map-cleaning code
from the buggy .reset implementation, as there might exist userspace
apps that already rely on the behaviour on some setup. Introduce a
.compat_reset driver op to keep compatibility with older userspace. New
and well behaved parent driver should not bother to implement such op,
but only those drivers that are doing or used to do non-compliant
map-cleaning reset will have to.

Signed-off-by: Si-Wei Liu <si-wei.liu@oracle.com>
Message-Id: <1697880319-4937-5-git-send-email-si-wei.liu@oracle.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
Tested-by: Lei Yang <leiyang@redhat.com>
---
 include/linux/vdpa.h | 13 +++++++++++++
 1 file changed, 13 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/vdpa.h b/include/linux/vdpa.h
index 26ae6ae1eac3..6b8cbf75712d 100644
--- a/include/linux/vdpa.h
+++ b/include/linux/vdpa.h
@@ -252,6 +252,17 @@ struct vdpa_map_file {
  * @reset:			Reset device
  *				@vdev: vdpa device
  *				Returns integer: success (0) or error (< 0)
+ * @compat_reset:		Reset device with compatibility quirks to
+ *				accommodate older userspace. Only needed by
+ *				parent driver which used to have bogus reset
+ *				behaviour, and has to maintain such behaviour
+ *				for compatibility with older userspace.
+ *				Historically compliant driver only has to
+ *				implement .reset, Historically non-compliant
+ *				driver should implement both.
+ *				@vdev: vdpa device
+ *				@flags: compatibility quirks for reset
+ *				Returns integer: success (0) or error (< 0)
  * @suspend:			Suspend the device (optional)
  *				@vdev: vdpa device
  *				Returns integer: success (0) or error (< 0)
@@ -393,6 +404,8 @@ struct vdpa_config_ops {
 	u8 (*get_status)(struct vdpa_device *vdev);
 	void (*set_status)(struct vdpa_device *vdev, u8 status);
 	int (*reset)(struct vdpa_device *vdev);
+	int (*compat_reset)(struct vdpa_device *vdev, u32 flags);
+#define VDPA_RESET_F_CLEAN_MAP 1
 	int (*suspend)(struct vdpa_device *vdev);
 	int (*resume)(struct vdpa_device *vdev);
 	size_t (*get_config_size)(struct vdpa_device *vdev);
-- 
cgit v1.2.3


From bc91df5c70ac720eca18bd1f4a288f2582713d3e Mon Sep 17 00:00:00 2001
From: Si-Wei Liu <si-wei.liu@oracle.com>
Date: Sat, 21 Oct 2023 02:25:17 -0700
Subject: vhost-vdpa: clean iotlb map during reset for older userspace

Using .compat_reset op from the previous patch, the buggy .reset
behaviour can be kept as-is on older userspace apps, which don't ack the
IOTLB_PERSIST backend feature. As this compatibility quirk is limited to
those drivers that used to be buggy in the past, it won't affect change
the behaviour or affect ABI on the setups with API compliant driver.

The separation of .compat_reset from the regular .reset allows
vhost-vdpa able to know which driver had broken behaviour before, so it
can apply the corresponding compatibility quirk to the individual driver
whenever needed.  Compared to overloading the existing .reset with
flags, .compat_reset won't cause any extra burden to the implementation
of every compliant driver.

[mst: squashed in two fixup commits]

Message-Id: <1697880319-4937-6-git-send-email-si-wei.liu@oracle.com>
Message-Id: <1698102863-21122-1-git-send-email-si-wei.liu@oracle.com>
Reported-by: Dragos Tatulea <dtatulea@nvidia.com>
Tested-by: Dragos Tatulea <dtatulea@nvidia.com>
Message-Id: <1698275594-19204-1-git-send-email-si-wei.liu@oracle.com>
Reported-by: Lei Yang <leiyang@redhat.com>
Signed-off-by: Si-Wei Liu <si-wei.liu@oracle.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
Tested-by: Lei Yang <leiyang@redhat.com>
---
 drivers/vhost/vdpa.c         | 20 ++++++++++++++++----
 drivers/virtio/virtio_vdpa.c |  2 +-
 include/linux/vdpa.h         |  7 +++++--
 3 files changed, 22 insertions(+), 7 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/vhost/vdpa.c b/drivers/vhost/vdpa.c
index acc7c74ba7d6..30df5c58db73 100644
--- a/drivers/vhost/vdpa.c
+++ b/drivers/vhost/vdpa.c
@@ -227,13 +227,24 @@ static void vhost_vdpa_unsetup_vq_irq(struct vhost_vdpa *v, u16 qid)
 	irq_bypass_unregister_producer(&vq->call_ctx.producer);
 }
 
-static int vhost_vdpa_reset(struct vhost_vdpa *v)
+static int _compat_vdpa_reset(struct vhost_vdpa *v)
 {
 	struct vdpa_device *vdpa = v->vdpa;
+	u32 flags = 0;
 
-	v->in_batch = 0;
+	if (v->vdev.vqs) {
+		flags |= !vhost_backend_has_feature(v->vdev.vqs[0],
+						    VHOST_BACKEND_F_IOTLB_PERSIST) ?
+			 VDPA_RESET_F_CLEAN_MAP : 0;
+	}
+
+	return vdpa_reset(vdpa, flags);
+}
 
-	return vdpa_reset(vdpa);
+static int vhost_vdpa_reset(struct vhost_vdpa *v)
+{
+	v->in_batch = 0;
+	return _compat_vdpa_reset(v);
 }
 
 static long vhost_vdpa_bind_mm(struct vhost_vdpa *v)
@@ -312,7 +323,7 @@ static long vhost_vdpa_set_status(struct vhost_vdpa *v, u8 __user *statusp)
 			vhost_vdpa_unsetup_vq_irq(v, i);
 
 	if (status == 0) {
-		ret = vdpa_reset(vdpa);
+		ret = _compat_vdpa_reset(v);
 		if (ret)
 			return ret;
 	} else
@@ -1344,6 +1355,7 @@ static void vhost_vdpa_cleanup(struct vhost_vdpa *v)
 	vhost_vdpa_free_domain(v);
 	vhost_dev_cleanup(&v->vdev);
 	kfree(v->vdev.vqs);
+	v->vdev.vqs = NULL;
 }
 
 static int vhost_vdpa_open(struct inode *inode, struct file *filep)
diff --git a/drivers/virtio/virtio_vdpa.c b/drivers/virtio/virtio_vdpa.c
index 06ce6d8c2e00..8d63e5923d24 100644
--- a/drivers/virtio/virtio_vdpa.c
+++ b/drivers/virtio/virtio_vdpa.c
@@ -100,7 +100,7 @@ static void virtio_vdpa_reset(struct virtio_device *vdev)
 {
 	struct vdpa_device *vdpa = vd_get_vdpa(vdev);
 
-	vdpa_reset(vdpa);
+	vdpa_reset(vdpa, 0);
 }
 
 static bool virtio_vdpa_notify(struct virtqueue *vq)
diff --git a/include/linux/vdpa.h b/include/linux/vdpa.h
index 6b8cbf75712d..db15ac07f8a6 100644
--- a/include/linux/vdpa.h
+++ b/include/linux/vdpa.h
@@ -519,14 +519,17 @@ static inline struct device *vdpa_get_dma_dev(struct vdpa_device *vdev)
 	return vdev->dma_dev;
 }
 
-static inline int vdpa_reset(struct vdpa_device *vdev)
+static inline int vdpa_reset(struct vdpa_device *vdev, u32 flags)
 {
 	const struct vdpa_config_ops *ops = vdev->config;
 	int ret;
 
 	down_write(&vdev->cf_lock);
 	vdev->features_valid = false;
-	ret = ops->reset(vdev);
+	if (ops->compat_reset && flags)
+		ret = ops->compat_reset(vdev, flags);
+	else
+		ret = ops->reset(vdev);
 	up_write(&vdev->cf_lock);
 	return ret;
 }
-- 
cgit v1.2.3


From 3503895788d402d6a3814085ed582c364ec3e903 Mon Sep 17 00:00:00 2001
From: "Michael S. Tsirkin" <mst@redhat.com>
Date: Tue, 31 Oct 2023 12:02:06 -0400
Subject: virtio_pci: move structure to a header

These are guest/host interfaces, so they belong in the header where e.g.
qemu will know to find them.

Note: we added a new structure as opposed to extending existing one
because someone might be relying on the size of the existing structure
staying unchanged.  Add a warning to avoid using sizeof.

Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
Reviewed-by: Xuan Zhuo <xuanzhuo@linux.alibaba.com>
---
 drivers/virtio/virtio_pci_modern_dev.c |  7 ++++---
 include/linux/virtio_pci_modern.h      |  7 -------
 include/uapi/linux/virtio_pci.h        | 11 +++++++++++
 3 files changed, 15 insertions(+), 10 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/virtio/virtio_pci_modern_dev.c b/drivers/virtio/virtio_pci_modern_dev.c
index e2a1fe7bb66c..7de8b1ebabac 100644
--- a/drivers/virtio/virtio_pci_modern_dev.c
+++ b/drivers/virtio/virtio_pci_modern_dev.c
@@ -294,9 +294,10 @@ int vp_modern_probe(struct virtio_pci_modern_device *mdev)
 
 	err = -EINVAL;
 	mdev->common = vp_modern_map_capability(mdev, common,
-				      sizeof(struct virtio_pci_common_cfg), 4,
-				      0, sizeof(struct virtio_pci_modern_common_cfg),
-				      &mdev->common_len, NULL);
+			      sizeof(struct virtio_pci_common_cfg), 4, 0,
+			      offsetofend(struct virtio_pci_modern_common_cfg,
+					  queue_reset),
+			      &mdev->common_len, NULL);
 	if (!mdev->common)
 		goto err_map_common;
 	mdev->isr = vp_modern_map_capability(mdev, isr, sizeof(u8), 1,
diff --git a/include/linux/virtio_pci_modern.h b/include/linux/virtio_pci_modern.h
index d0f2797420f7..a09e13a577a9 100644
--- a/include/linux/virtio_pci_modern.h
+++ b/include/linux/virtio_pci_modern.h
@@ -5,13 +5,6 @@
 #include <linux/pci.h>
 #include <linux/virtio_pci.h>
 
-struct virtio_pci_modern_common_cfg {
-	struct virtio_pci_common_cfg cfg;
-
-	__le16 queue_notify_data;	/* read-write */
-	__le16 queue_reset;		/* read-write */
-};
-
 /**
  * struct virtio_pci_modern_device - info for modern PCI virtio
  * @pci_dev:	    Ptr to the PCI device struct
diff --git a/include/uapi/linux/virtio_pci.h b/include/uapi/linux/virtio_pci.h
index f703afc7ad31..44f4dd2add18 100644
--- a/include/uapi/linux/virtio_pci.h
+++ b/include/uapi/linux/virtio_pci.h
@@ -166,6 +166,17 @@ struct virtio_pci_common_cfg {
 	__le32 queue_used_hi;		/* read-write */
 };
 
+/*
+ * Warning: do not use sizeof on this: use offsetofend for
+ * specific fields you need.
+ */
+struct virtio_pci_modern_common_cfg {
+	struct virtio_pci_common_cfg cfg;
+
+	__le16 queue_notify_data;	/* read-write */
+	__le16 queue_reset;		/* read-write */
+};
+
 /* Fields in VIRTIO_PCI_CAP_PCI_CFG: */
 struct virtio_pci_cfg_cap {
 	struct virtio_pci_cap cap;
-- 
cgit v1.2.3


From 8001f49394e353f035306a45bcf504f06fca6355 Mon Sep 17 00:00:00 2001
From: Krister Johansen <kjlx@templeofstupid.com>
Date: Fri, 27 Oct 2023 14:46:40 -0700
Subject: proc: sysctl: prevent aliased sysctls from getting passed to init

The code that checks for unknown boot options is unaware of the sysctl
alias facility, which maps bootparams to sysctl values.  If a user sets
an old value that has a valid alias, a message about an invalid
parameter will be printed during boot, and the parameter will get passed
to init.  Fix by checking for the existence of aliased parameters in the
unknown boot parameter code.  If an alias exists, don't return an error
or pass the value to init.

Signed-off-by: Krister Johansen <kjlx@templeofstupid.com>
Cc: stable@vger.kernel.org
Fixes: 0a477e1ae21b ("kernel/sysctl: support handling command line aliases")
Signed-off-by: Luis Chamberlain <mcgrof@kernel.org>
---
 fs/proc/proc_sysctl.c  | 7 +++++++
 include/linux/sysctl.h | 6 ++++++
 init/main.c            | 4 ++++
 3 files changed, 17 insertions(+)

(limited to 'include/linux')

diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c
index c88854df0b62..1c9635dddb70 100644
--- a/fs/proc/proc_sysctl.c
+++ b/fs/proc/proc_sysctl.c
@@ -1592,6 +1592,13 @@ static const char *sysctl_find_alias(char *param)
 	return NULL;
 }
 
+bool sysctl_is_alias(char *param)
+{
+	const char *alias = sysctl_find_alias(param);
+
+	return alias != NULL;
+}
+
 /* Set sysctl value passed on kernel command line. */
 static int process_sysctl_arg(char *param, char *val,
 			       const char *unused, void *arg)
diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h
index 09d7429d67c0..61b40ea81f4d 100644
--- a/include/linux/sysctl.h
+++ b/include/linux/sysctl.h
@@ -242,6 +242,7 @@ extern void __register_sysctl_init(const char *path, struct ctl_table *table,
 extern struct ctl_table_header *register_sysctl_mount_point(const char *path);
 
 void do_sysctl_args(void);
+bool sysctl_is_alias(char *param);
 int do_proc_douintvec(struct ctl_table *table, int write,
 		      void *buffer, size_t *lenp, loff_t *ppos,
 		      int (*conv)(unsigned long *lvalp,
@@ -287,6 +288,11 @@ static inline void setup_sysctl_set(struct ctl_table_set *p,
 static inline void do_sysctl_args(void)
 {
 }
+
+static inline bool sysctl_is_alias(char *param)
+{
+	return false;
+}
 #endif /* CONFIG_SYSCTL */
 
 int sysctl_max_threads(struct ctl_table *table, int write, void *buffer,
diff --git a/init/main.c b/init/main.c
index 436d73261810..e24b0780fdff 100644
--- a/init/main.c
+++ b/init/main.c
@@ -530,6 +530,10 @@ static int __init unknown_bootoption(char *param, char *val,
 {
 	size_t len = strlen(param);
 
+	/* Handle params aliased to sysctls */
+	if (sysctl_is_alias(param))
+		return 0;
+
 	repair_env_string(param, val);
 
 	/* Handle obsolete-style parameters */
-- 
cgit v1.2.3


From a9b8d90f87263f985fa14250fc8caf7bfcde8803 Mon Sep 17 00:00:00 2001
From: Benjamin Coddington <bcodding@redhat.com>
Date: Thu, 24 Aug 2023 14:52:19 -0400
Subject: NFSv4: fairly test all delegations on a SEQ4_ revocation

When the client is required to use TEST_STATEID to discover which
delegation(s) have been revoked, it may continually test delegations at the
head of the list if the server continues to be unsatisfied and send
SEQ4_STATUS_RECALLABLE_STATE_REVOKED.  For a large number of delegations
this behavior is prone to live-lock because the client may never be able to
test and free revoked state at the end of the list since the
SEQ4_STATUS_RECALLABLE_STATE_REVOKED will cause us to flag delegations at
the head of the list to be tested.  This problem is further exacerbated by
the state manager's willingness to be scheduled out on a busy system while
testing the list of delegations.

Keep a generation counter for each attempt to test all delegations, and
skip delegations that have already been tested in the current pass.

Signed-off-by: Benjamin Coddington <bcodding@redhat.com>
Tested-by: Torkil Svensgaard <torkil@drcmr.dk>
Tested-by: Ruben Vestergaard <rubenv@drcmr.dk>
Signed-off-by: Trond Myklebust <trond.myklebust@hammerspace.com>
---
 fs/nfs/delegation.c       | 7 ++++++-
 fs/nfs/delegation.h       | 1 +
 include/linux/nfs_fs_sb.h | 1 +
 3 files changed, 8 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/fs/nfs/delegation.c b/fs/nfs/delegation.c
index cf7365581031..fa1a14def45c 100644
--- a/fs/nfs/delegation.c
+++ b/fs/nfs/delegation.c
@@ -448,6 +448,7 @@ int nfs_inode_set_delegation(struct inode *inode, const struct cred *cred,
 	delegation->cred = get_cred(cred);
 	delegation->inode = inode;
 	delegation->flags = 1<<NFS_DELEGATION_REFERENCED;
+	delegation->test_gen = 0;
 	spin_lock_init(&delegation->lock);
 
 	spin_lock(&clp->cl_lock);
@@ -1294,6 +1295,8 @@ static int nfs_server_reap_expired_delegations(struct nfs_server *server,
 	struct inode *inode;
 	const struct cred *cred;
 	nfs4_stateid stateid;
+	unsigned long gen = ++server->delegation_gen;
+
 restart:
 	rcu_read_lock();
 restart_locked:
@@ -1303,7 +1306,8 @@ restart_locked:
 		    test_bit(NFS_DELEGATION_RETURNING,
 					&delegation->flags) ||
 		    test_bit(NFS_DELEGATION_TEST_EXPIRED,
-					&delegation->flags) == 0)
+					&delegation->flags) == 0 ||
+			delegation->test_gen == gen)
 			continue;
 		inode = nfs_delegation_grab_inode(delegation);
 		if (inode == NULL)
@@ -1312,6 +1316,7 @@ restart_locked:
 		cred = get_cred_rcu(delegation->cred);
 		nfs4_stateid_copy(&stateid, &delegation->stateid);
 		spin_unlock(&delegation->lock);
+		delegation->test_gen = gen;
 		clear_bit(NFS_DELEGATION_TEST_EXPIRED, &delegation->flags);
 		rcu_read_unlock();
 		nfs_delegation_test_free_expired(inode, &stateid, cred);
diff --git a/fs/nfs/delegation.h b/fs/nfs/delegation.h
index 1c378992b7c0..a6f495d012cf 100644
--- a/fs/nfs/delegation.h
+++ b/fs/nfs/delegation.h
@@ -21,6 +21,7 @@ struct nfs_delegation {
 	fmode_t type;
 	unsigned long pagemod_limit;
 	__u64 change_attr;
+	unsigned long test_gen;
 	unsigned long flags;
 	refcount_t refcount;
 	spinlock_t lock;
diff --git a/include/linux/nfs_fs_sb.h b/include/linux/nfs_fs_sb.h
index cd628c4b011e..cd797e00fe35 100644
--- a/include/linux/nfs_fs_sb.h
+++ b/include/linux/nfs_fs_sb.h
@@ -239,6 +239,7 @@ struct nfs_server {
 	struct list_head	delegations;
 	struct list_head	ss_copies;
 
+	unsigned long		delegation_gen;
 	unsigned long		mig_gen;
 	unsigned long		mig_status;
 #define NFS_MIG_IN_TRANSITION		(1)
-- 
cgit v1.2.3


From bfca5fb4e97c46503ddfc582335917b0cc228264 Mon Sep 17 00:00:00 2001
From: felix <fuzhen5@huawei.com>
Date: Mon, 23 Oct 2023 09:40:19 +0800
Subject: SUNRPC: Fix RPC client cleaned up the freed pipefs dentries

RPC client pipefs dentries cleanup is in separated rpc_remove_pipedir()
workqueue,which takes care about pipefs superblock locking.
In some special scenarios, when kernel frees the pipefs sb of the
current client and immediately alloctes a new pipefs sb,
rpc_remove_pipedir function would misjudge the existence of pipefs
sb which is not the one it used to hold. As a result,
the rpc_remove_pipedir would clean the released freed pipefs dentries.

To fix this issue, rpc_remove_pipedir should check whether the
current pipefs sb is consistent with the original pipefs sb.

This error can be catched by KASAN:
=========================================================
[  250.497700] BUG: KASAN: slab-use-after-free in dget_parent+0x195/0x200
[  250.498315] Read of size 4 at addr ffff88800a2ab804 by task kworker/0:18/106503
[  250.500549] Workqueue: events rpc_free_client_work
[  250.501001] Call Trace:
[  250.502880]  kasan_report+0xb6/0xf0
[  250.503209]  ? dget_parent+0x195/0x200
[  250.503561]  dget_parent+0x195/0x200
[  250.503897]  ? __pfx_rpc_clntdir_depopulate+0x10/0x10
[  250.504384]  rpc_rmdir_depopulate+0x1b/0x90
[  250.504781]  rpc_remove_client_dir+0xf5/0x150
[  250.505195]  rpc_free_client_work+0xe4/0x230
[  250.505598]  process_one_work+0x8ee/0x13b0
...
[   22.039056] Allocated by task 244:
[   22.039390]  kasan_save_stack+0x22/0x50
[   22.039758]  kasan_set_track+0x25/0x30
[   22.040109]  __kasan_slab_alloc+0x59/0x70
[   22.040487]  kmem_cache_alloc_lru+0xf0/0x240
[   22.040889]  __d_alloc+0x31/0x8e0
[   22.041207]  d_alloc+0x44/0x1f0
[   22.041514]  __rpc_lookup_create_exclusive+0x11c/0x140
[   22.041987]  rpc_mkdir_populate.constprop.0+0x5f/0x110
[   22.042459]  rpc_create_client_dir+0x34/0x150
[   22.042874]  rpc_setup_pipedir_sb+0x102/0x1c0
[   22.043284]  rpc_client_register+0x136/0x4e0
[   22.043689]  rpc_new_client+0x911/0x1020
[   22.044057]  rpc_create_xprt+0xcb/0x370
[   22.044417]  rpc_create+0x36b/0x6c0
...
[   22.049524] Freed by task 0:
[   22.049803]  kasan_save_stack+0x22/0x50
[   22.050165]  kasan_set_track+0x25/0x30
[   22.050520]  kasan_save_free_info+0x2b/0x50
[   22.050921]  __kasan_slab_free+0x10e/0x1a0
[   22.051306]  kmem_cache_free+0xa5/0x390
[   22.051667]  rcu_core+0x62c/0x1930
[   22.051995]  __do_softirq+0x165/0x52a
[   22.052347]
[   22.052503] Last potentially related work creation:
[   22.052952]  kasan_save_stack+0x22/0x50
[   22.053313]  __kasan_record_aux_stack+0x8e/0xa0
[   22.053739]  __call_rcu_common.constprop.0+0x6b/0x8b0
[   22.054209]  dentry_free+0xb2/0x140
[   22.054540]  __dentry_kill+0x3be/0x540
[   22.054900]  shrink_dentry_list+0x199/0x510
[   22.055293]  shrink_dcache_parent+0x190/0x240
[   22.055703]  do_one_tree+0x11/0x40
[   22.056028]  shrink_dcache_for_umount+0x61/0x140
[   22.056461]  generic_shutdown_super+0x70/0x590
[   22.056879]  kill_anon_super+0x3a/0x60
[   22.057234]  rpc_kill_sb+0x121/0x200

Fixes: 0157d021d23a ("SUNRPC: handle RPC client pipefs dentries by network namespace aware routines")
Signed-off-by: felix <fuzhen5@huawei.com>
Signed-off-by: Trond Myklebust <trond.myklebust@hammerspace.com>
---
 include/linux/sunrpc/clnt.h | 1 +
 net/sunrpc/clnt.c           | 5 ++++-
 2 files changed, 5 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/sunrpc/clnt.h b/include/linux/sunrpc/clnt.h
index af7358277f1c..e9d4377d03c6 100644
--- a/include/linux/sunrpc/clnt.h
+++ b/include/linux/sunrpc/clnt.h
@@ -92,6 +92,7 @@ struct rpc_clnt {
 	};
 	const struct cred	*cl_cred;
 	unsigned int		cl_max_connect; /* max number of transports not to the same IP */
+	struct super_block *pipefs_sb;
 };
 
 /*
diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c
index e49c11ef7c50..daa9582ec861 100644
--- a/net/sunrpc/clnt.c
+++ b/net/sunrpc/clnt.c
@@ -111,7 +111,8 @@ static void rpc_clnt_remove_pipedir(struct rpc_clnt *clnt)
 
 	pipefs_sb = rpc_get_sb_net(net);
 	if (pipefs_sb) {
-		__rpc_clnt_remove_pipedir(clnt);
+		if (pipefs_sb == clnt->pipefs_sb)
+			__rpc_clnt_remove_pipedir(clnt);
 		rpc_put_sb_net(net);
 	}
 }
@@ -151,6 +152,8 @@ rpc_setup_pipedir(struct super_block *pipefs_sb, struct rpc_clnt *clnt)
 {
 	struct dentry *dentry;
 
+	clnt->pipefs_sb = pipefs_sb;
+
 	if (clnt->cl_program->pipe_dir_name != NULL) {
 		dentry = rpc_setup_pipedir_sb(pipefs_sb, clnt);
 		if (IS_ERR(dentry))
-- 
cgit v1.2.3


From f003a717ae9086b1e8a4663124a96862df7282e7 Mon Sep 17 00:00:00 2001
From: "Matthew Wilcox (Oracle)" <willy@infradead.org>
Date: Fri, 15 Sep 2023 18:33:33 +0100
Subject: nfs: Convert nfs_symlink() to use a folio

Use the folio APIs, saving about four calls to compound_head().
Convert back to a page in each of the individual protocol implementations.

Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Signed-off-by: Trond Myklebust <trond.myklebust@hammerspace.com>
---
 fs/nfs/dir.c            | 29 ++++++++++++-----------------
 fs/nfs/nfs3proc.c       |  3 ++-
 fs/nfs/nfs4proc.c       |  7 ++++---
 fs/nfs/proc.c           |  3 ++-
 include/linux/nfs_xdr.h |  2 +-
 5 files changed, 21 insertions(+), 23 deletions(-)

(limited to 'include/linux')

diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index e6a51fd94fea..13dffe4201e6 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -2532,7 +2532,7 @@ EXPORT_SYMBOL_GPL(nfs_unlink);
 int nfs_symlink(struct mnt_idmap *idmap, struct inode *dir,
 		struct dentry *dentry, const char *symname)
 {
-	struct page *page;
+	struct folio *folio;
 	char *kaddr;
 	struct iattr attr;
 	unsigned int pathlen = strlen(symname);
@@ -2547,24 +2547,24 @@ int nfs_symlink(struct mnt_idmap *idmap, struct inode *dir,
 	attr.ia_mode = S_IFLNK | S_IRWXUGO;
 	attr.ia_valid = ATTR_MODE;
 
-	page = alloc_page(GFP_USER);
-	if (!page)
+	folio = folio_alloc(GFP_USER, 0);
+	if (!folio)
 		return -ENOMEM;
 
-	kaddr = page_address(page);
+	kaddr = folio_address(folio);
 	memcpy(kaddr, symname, pathlen);
 	if (pathlen < PAGE_SIZE)
 		memset(kaddr + pathlen, 0, PAGE_SIZE - pathlen);
 
 	trace_nfs_symlink_enter(dir, dentry);
-	error = NFS_PROTO(dir)->symlink(dir, dentry, page, pathlen, &attr);
+	error = NFS_PROTO(dir)->symlink(dir, dentry, folio, pathlen, &attr);
 	trace_nfs_symlink_exit(dir, dentry, error);
 	if (error != 0) {
 		dfprintk(VFS, "NFS: symlink(%s/%lu, %pd, %s) error %d\n",
 			dir->i_sb->s_id, dir->i_ino,
 			dentry, symname, error);
 		d_drop(dentry);
-		__free_page(page);
+		folio_put(folio);
 		return error;
 	}
 
@@ -2574,18 +2574,13 @@ int nfs_symlink(struct mnt_idmap *idmap, struct inode *dir,
 	 * No big deal if we can't add this page to the page cache here.
 	 * READLINK will get the missing page from the server if needed.
 	 */
-	if (!add_to_page_cache_lru(page, d_inode(dentry)->i_mapping, 0,
-							GFP_KERNEL)) {
-		SetPageUptodate(page);
-		unlock_page(page);
-		/*
-		 * add_to_page_cache_lru() grabs an extra page refcount.
-		 * Drop it here to avoid leaking this page later.
-		 */
-		put_page(page);
-	} else
-		__free_page(page);
+	if (filemap_add_folio(d_inode(dentry)->i_mapping, folio, 0,
+							GFP_KERNEL) == 0) {
+		folio_mark_uptodate(folio);
+		folio_unlock(folio);
+	}
 
+	folio_put(folio);
 	return 0;
 }
 EXPORT_SYMBOL_GPL(nfs_symlink);
diff --git a/fs/nfs/nfs3proc.c b/fs/nfs/nfs3proc.c
index 4bf208a0a8e9..2de66e4e8280 100644
--- a/fs/nfs/nfs3proc.c
+++ b/fs/nfs/nfs3proc.c
@@ -543,9 +543,10 @@ out:
 }
 
 static int
-nfs3_proc_symlink(struct inode *dir, struct dentry *dentry, struct page *page,
+nfs3_proc_symlink(struct inode *dir, struct dentry *dentry, struct folio *folio,
 		  unsigned int len, struct iattr *sattr)
 {
+	struct page *page = &folio->page;
 	struct nfs3_createdata *data;
 	struct dentry *d_alias;
 	int status = -ENOMEM;
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index b36193edc3ba..e7e80c67d82b 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -5036,9 +5036,10 @@ static void nfs4_free_createdata(struct nfs4_createdata *data)
 }
 
 static int _nfs4_proc_symlink(struct inode *dir, struct dentry *dentry,
-		struct page *page, unsigned int len, struct iattr *sattr,
+		struct folio *folio, unsigned int len, struct iattr *sattr,
 		struct nfs4_label *label)
 {
+	struct page *page = &folio->page;
 	struct nfs4_createdata *data;
 	int status = -ENAMETOOLONG;
 
@@ -5063,7 +5064,7 @@ out:
 }
 
 static int nfs4_proc_symlink(struct inode *dir, struct dentry *dentry,
-		struct page *page, unsigned int len, struct iattr *sattr)
+		struct folio *folio, unsigned int len, struct iattr *sattr)
 {
 	struct nfs4_exception exception = {
 		.interruptible = true,
@@ -5074,7 +5075,7 @@ static int nfs4_proc_symlink(struct inode *dir, struct dentry *dentry,
 	label = nfs4_label_init_security(dir, dentry, sattr, &l);
 
 	do {
-		err = _nfs4_proc_symlink(dir, dentry, page, len, sattr, label);
+		err = _nfs4_proc_symlink(dir, dentry, folio, len, sattr, label);
 		trace_nfs4_symlink(dir, &dentry->d_name, err);
 		err = nfs4_handle_exception(NFS_SERVER(dir), err,
 				&exception);
diff --git a/fs/nfs/proc.c b/fs/nfs/proc.c
index e3570c656b0f..ad3a321ae997 100644
--- a/fs/nfs/proc.c
+++ b/fs/nfs/proc.c
@@ -396,9 +396,10 @@ nfs_proc_link(struct inode *inode, struct inode *dir, const struct qstr *name)
 }
 
 static int
-nfs_proc_symlink(struct inode *dir, struct dentry *dentry, struct page *page,
+nfs_proc_symlink(struct inode *dir, struct dentry *dentry, struct folio *folio,
 		 unsigned int len, struct iattr *sattr)
 {
+	struct page *page = &folio->page;
 	struct nfs_fh *fh;
 	struct nfs_fattr *fattr;
 	struct nfs_symlinkargs	arg = {
diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h
index 12bbb5c63664..539b57fbf3ce 100644
--- a/include/linux/nfs_xdr.h
+++ b/include/linux/nfs_xdr.h
@@ -1772,7 +1772,7 @@ struct nfs_rpc_ops {
 	void	(*rename_rpc_prepare)(struct rpc_task *task, struct nfs_renamedata *);
 	int	(*rename_done) (struct rpc_task *task, struct inode *old_dir, struct inode *new_dir);
 	int	(*link)    (struct inode *, struct inode *, const struct qstr *);
-	int	(*symlink) (struct inode *, struct dentry *, struct page *,
+	int	(*symlink) (struct inode *, struct dentry *, struct folio *,
 			    unsigned int, struct iattr *);
 	int	(*mkdir)   (struct inode *, struct dentry *, struct iattr *);
 	int	(*rmdir)   (struct inode *, const struct qstr *);
-- 
cgit v1.2.3


From 2c7ccb3c362bc86aeb3e52d6fbf15f7f480ca961 Mon Sep 17 00:00:00 2001
From: Kees Cook <keescook@chromium.org>
Date: Wed, 13 Sep 2023 16:54:14 -0700
Subject: module: Clarify documentation of module_param_call()

Commit 9bbb9e5a3310 ("param: use ops in struct kernel_param, rather than
get and set fns directly") added the comment that module_param_call()
was deprecated, during a large scale refactoring to bring sanity to type
casting back then. In 2017 following more cleanups, it became useful
again as it wraps a common pattern of creating an ops struct for a
given get/set pair:

  b2f270e87473 ("module: Prepare to convert all module_param_call() prototypes")
  ece1996a21ee ("module: Do not paper over type mismatches in module_param_call()")

        static const struct kernel_param_ops __param_ops_##name = \
                { .flags = 0, .set = _set, .get = _get }; \
        __module_param_call(MODULE_PARAM_PREFIX, \
                            name, &__param_ops_##name, arg, perm, -1, 0)

        __module_param_call(MODULE_PARAM_PREFIX, name, ops, arg, perm, -1, 0)

Many users of module_param_cb() appear to be almost universally
open-coding the same thing that module_param_call() does now. Don't
discourage[1] people from using module_param_call(): clarify the comment
to show that module_param_cb() is useful if you repeatedly use the same
pair of get/set functions.

[1] https://lore.kernel.org/lkml/202308301546.5C789E5EC@keescook/

Cc: Luis Chamberlain <mcgrof@kernel.org>
Cc: Johan Hovold <johan@kernel.org>
Cc: Jessica Yu <jeyu@kernel.org>
Cc: Sagi Grimberg <sagi@grimberg.me>
Cc: Nick Desaulniers <ndesaulniers@gooogle.com>
Cc: Miguel Ojeda <ojeda@kernel.org>
Cc: Joe Perches <joe@perches.com>
Cc: linux-modules@vger.kernel.org
Reviewed-by: Miguel Ojeda <ojeda@kernel.org>
Signed-off-by: Kees Cook <keescook@chromium.org>
Signed-off-by: Luis Chamberlain <mcgrof@kernel.org>
---
 include/linux/moduleparam.h | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/moduleparam.h b/include/linux/moduleparam.h
index 962cd41a2cb5..d4452f93d060 100644
--- a/include/linux/moduleparam.h
+++ b/include/linux/moduleparam.h
@@ -293,7 +293,11 @@ struct kparam_array
 	= { __param_str_##name, THIS_MODULE, ops,			\
 	    VERIFY_OCTAL_PERMISSIONS(perm), level, flags, { arg } }
 
-/* Obsolete - use module_param_cb() */
+/*
+ * Useful for describing a set/get pair used only once (i.e. for this
+ * parameter). For repeated set/get pairs (i.e. the same struct
+ * kernel_param_ops), use module_param_cb() instead.
+ */
 #define module_param_call(name, _set, _get, arg, perm)			\
 	static const struct kernel_param_ops __param_ops_##name =	\
 		{ .flags = 0, .set = _set, .get = _get };		\
-- 
cgit v1.2.3


From 60da3640b07ce03706a8c77a3740ebad8b9af063 Mon Sep 17 00:00:00 2001
From: Tiezhu Yang <yangtiezhu@loongson.cn>
Date: Fri, 16 Jun 2023 09:51:32 +0800
Subject: module: Make is_mapping_symbol() return bool

The return value of is_mapping_symbol() is true or false,
so change its type to reflect that.

Suggested-by: Xi Zhang <zhangxi@kylinos.cn>
Signed-off-by: Tiezhu Yang <yangtiezhu@loongson.cn>
Signed-off-by: Luis Chamberlain <mcgrof@kernel.org>
---
 include/linux/module_symbol.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/module_symbol.h b/include/linux/module_symbol.h
index 1269543d0634..77c9895b9ddb 100644
--- a/include/linux/module_symbol.h
+++ b/include/linux/module_symbol.h
@@ -3,7 +3,7 @@
 #define _LINUX_MODULE_SYMBOL_H
 
 /* This ignores the intensely annoying "mapping symbols" found in ELF files. */
-static inline int is_mapping_symbol(const char *str)
+static inline bool is_mapping_symbol(const char *str)
 {
 	if (str[0] == '.' && str[1] == 'L')
 		return true;
-- 
cgit v1.2.3


From bb32500fb9b78215e4ef6ee8b4345c5f5d7eafb4 Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (Google)" <rostedt@goodmis.org>
Date: Tue, 31 Oct 2023 12:24:53 -0400
Subject: tracing: Have trace_event_file have ref counters

The following can crash the kernel:

 # cd /sys/kernel/tracing
 # echo 'p:sched schedule' > kprobe_events
 # exec 5>>events/kprobes/sched/enable
 # > kprobe_events
 # exec 5>&-

The above commands:

 1. Change directory to the tracefs directory
 2. Create a kprobe event (doesn't matter what one)
 3. Open bash file descriptor 5 on the enable file of the kprobe event
 4. Delete the kprobe event (removes the files too)
 5. Close the bash file descriptor 5

The above causes a crash!

 BUG: kernel NULL pointer dereference, address: 0000000000000028
 #PF: supervisor read access in kernel mode
 #PF: error_code(0x0000) - not-present page
 PGD 0 P4D 0
 Oops: 0000 [#1] PREEMPT SMP PTI
 CPU: 6 PID: 877 Comm: bash Not tainted 6.5.0-rc4-test-00008-g2c6b6b1029d4-dirty #186
 Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.16.2-debian-1.16.2-1 04/01/2014
 RIP: 0010:tracing_release_file_tr+0xc/0x50

What happens here is that the kprobe event creates a trace_event_file
"file" descriptor that represents the file in tracefs to the event. It
maintains state of the event (is it enabled for the given instance?).
Opening the "enable" file gets a reference to the event "file" descriptor
via the open file descriptor. When the kprobe event is deleted, the file is
also deleted from the tracefs system which also frees the event "file"
descriptor.

But as the tracefs file is still opened by user space, it will not be
totally removed until the final dput() is called on it. But this is not
true with the event "file" descriptor that is already freed. If the user
does a write to or simply closes the file descriptor it will reference the
event "file" descriptor that was just freed, causing a use-after-free bug.

To solve this, add a ref count to the event "file" descriptor as well as a
new flag called "FREED". The "file" will not be freed until the last
reference is released. But the FREE flag will be set when the event is
removed to prevent any more modifications to that event from happening,
even if there's still a reference to the event "file" descriptor.

Link: https://lore.kernel.org/linux-trace-kernel/20231031000031.1e705592@gandalf.local.home/
Link: https://lore.kernel.org/linux-trace-kernel/20231031122453.7a48b923@gandalf.local.home

Cc: stable@vger.kernel.org
Cc: Mark Rutland <mark.rutland@arm.com>
Fixes: f5ca233e2e66d ("tracing: Increase trace array ref count on enable and filter files")
Reported-by: Beau Belgrave <beaub@linux.microsoft.com>
Tested-by: Beau Belgrave <beaub@linux.microsoft.com>
Reviewed-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
 include/linux/trace_events.h       |  4 ++++
 kernel/trace/trace.c               | 15 +++++++++++++++
 kernel/trace/trace.h               |  3 +++
 kernel/trace/trace_events.c        | 31 +++++++++++++++++++++++++++----
 kernel/trace/trace_events_filter.c |  3 +++
 5 files changed, 52 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h
index 12207dc6722d..696f8dc4aa53 100644
--- a/include/linux/trace_events.h
+++ b/include/linux/trace_events.h
@@ -492,6 +492,7 @@ enum {
 	EVENT_FILE_FL_TRIGGER_COND_BIT,
 	EVENT_FILE_FL_PID_FILTER_BIT,
 	EVENT_FILE_FL_WAS_ENABLED_BIT,
+	EVENT_FILE_FL_FREED_BIT,
 };
 
 extern struct trace_event_file *trace_get_event_file(const char *instance,
@@ -630,6 +631,7 @@ extern int __kprobe_event_add_fields(struct dynevent_cmd *cmd, ...);
  *  TRIGGER_COND  - When set, one or more triggers has an associated filter
  *  PID_FILTER    - When set, the event is filtered based on pid
  *  WAS_ENABLED   - Set when enabled to know to clear trace on module removal
+ *  FREED         - File descriptor is freed, all fields should be considered invalid
  */
 enum {
 	EVENT_FILE_FL_ENABLED		= (1 << EVENT_FILE_FL_ENABLED_BIT),
@@ -643,6 +645,7 @@ enum {
 	EVENT_FILE_FL_TRIGGER_COND	= (1 << EVENT_FILE_FL_TRIGGER_COND_BIT),
 	EVENT_FILE_FL_PID_FILTER	= (1 << EVENT_FILE_FL_PID_FILTER_BIT),
 	EVENT_FILE_FL_WAS_ENABLED	= (1 << EVENT_FILE_FL_WAS_ENABLED_BIT),
+	EVENT_FILE_FL_FREED		= (1 << EVENT_FILE_FL_FREED_BIT),
 };
 
 struct trace_event_file {
@@ -671,6 +674,7 @@ struct trace_event_file {
 	 * caching and such. Which is mostly OK ;-)
 	 */
 	unsigned long		flags;
+	atomic_t		ref;	/* ref count for opened files */
 	atomic_t		sm_ref;	/* soft-mode reference counter */
 	atomic_t		tm_ref;	/* trigger-mode reference counter */
 };
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 2539cfc20a97..9aebf904ff97 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -4978,6 +4978,20 @@ int tracing_open_file_tr(struct inode *inode, struct file *filp)
 	if (ret)
 		return ret;
 
+	mutex_lock(&event_mutex);
+
+	/* Fail if the file is marked for removal */
+	if (file->flags & EVENT_FILE_FL_FREED) {
+		trace_array_put(file->tr);
+		ret = -ENODEV;
+	} else {
+		event_file_get(file);
+	}
+
+	mutex_unlock(&event_mutex);
+	if (ret)
+		return ret;
+
 	filp->private_data = inode->i_private;
 
 	return 0;
@@ -4988,6 +5002,7 @@ int tracing_release_file_tr(struct inode *inode, struct file *filp)
 	struct trace_event_file *file = inode->i_private;
 
 	trace_array_put(file->tr);
+	event_file_put(file);
 
 	return 0;
 }
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 0e1405abf4f7..b7f4ea25a194 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -1669,6 +1669,9 @@ extern void event_trigger_unregister(struct event_command *cmd_ops,
 				     char *glob,
 				     struct event_trigger_data *trigger_data);
 
+extern void event_file_get(struct trace_event_file *file);
+extern void event_file_put(struct trace_event_file *file);
+
 /**
  * struct event_trigger_ops - callbacks for trace event triggers
  *
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index f9e3e24d8796..f29e815ca5b2 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -990,13 +990,35 @@ static void remove_subsystem(struct trace_subsystem_dir *dir)
 	}
 }
 
+void event_file_get(struct trace_event_file *file)
+{
+	atomic_inc(&file->ref);
+}
+
+void event_file_put(struct trace_event_file *file)
+{
+	if (WARN_ON_ONCE(!atomic_read(&file->ref))) {
+		if (file->flags & EVENT_FILE_FL_FREED)
+			kmem_cache_free(file_cachep, file);
+		return;
+	}
+
+	if (atomic_dec_and_test(&file->ref)) {
+		/* Count should only go to zero when it is freed */
+		if (WARN_ON_ONCE(!(file->flags & EVENT_FILE_FL_FREED)))
+			return;
+		kmem_cache_free(file_cachep, file);
+	}
+}
+
 static void remove_event_file_dir(struct trace_event_file *file)
 {
 	eventfs_remove_dir(file->ei);
 	list_del(&file->list);
 	remove_subsystem(file->system);
 	free_event_filter(file->filter);
-	kmem_cache_free(file_cachep, file);
+	file->flags |= EVENT_FILE_FL_FREED;
+	event_file_put(file);
 }
 
 /*
@@ -1369,7 +1391,7 @@ event_enable_read(struct file *filp, char __user *ubuf, size_t cnt,
 		flags = file->flags;
 	mutex_unlock(&event_mutex);
 
-	if (!file)
+	if (!file || flags & EVENT_FILE_FL_FREED)
 		return -ENODEV;
 
 	if (flags & EVENT_FILE_FL_ENABLED &&
@@ -1403,7 +1425,7 @@ event_enable_write(struct file *filp, const char __user *ubuf, size_t cnt,
 		ret = -ENODEV;
 		mutex_lock(&event_mutex);
 		file = event_file_data(filp);
-		if (likely(file)) {
+		if (likely(file && !(file->flags & EVENT_FILE_FL_FREED))) {
 			ret = tracing_update_buffers(file->tr);
 			if (ret < 0) {
 				mutex_unlock(&event_mutex);
@@ -1683,7 +1705,7 @@ event_filter_read(struct file *filp, char __user *ubuf, size_t cnt,
 
 	mutex_lock(&event_mutex);
 	file = event_file_data(filp);
-	if (file)
+	if (file && !(file->flags & EVENT_FILE_FL_FREED))
 		print_event_filter(file, s);
 	mutex_unlock(&event_mutex);
 
@@ -2902,6 +2924,7 @@ trace_create_new_event(struct trace_event_call *call,
 	atomic_set(&file->tm_ref, 0);
 	INIT_LIST_HEAD(&file->triggers);
 	list_add(&file->list, &tr->events);
+	event_file_get(file);
 
 	return file;
 }
diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c
index 33264e510d16..0c611b281a5b 100644
--- a/kernel/trace/trace_events_filter.c
+++ b/kernel/trace/trace_events_filter.c
@@ -2349,6 +2349,9 @@ int apply_event_filter(struct trace_event_file *file, char *filter_string)
 	struct event_filter *filter = NULL;
 	int err;
 
+	if (file->flags & EVENT_FILE_FL_FREED)
+		return -ENODEV;
+
 	if (!strcmp(strstrip(filter_string), "0")) {
 		filter_disable(file);
 		filter = event_filter(file);
-- 
cgit v1.2.3


From 44365329f8219fc379097c2c9a75ff53f123764f Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (Google)" <rostedt@goodmis.org>
Date: Wed, 1 Nov 2023 13:25:46 -0400
Subject: eventfs: Hold eventfs_mutex when calling callback functions

The callback function that is used to create inodes and dentries is not
protected by anything and the data that is passed to it could become
stale. After eventfs_remove_dir() is called by the tracing system, it is
free to remove the events that are associated to that directory.
Unfortunately, that means the callbacks must not be called after that.

     CPU0				CPU1
     ----				----
 eventfs_root_lookup() {
				 eventfs_remove_dir() {
				      mutex_lock(&event_mutex);
				      ei->is_freed = set;
				      mutex_unlock(&event_mutex);
				 }
				 kfree(event_call);

    for (...) {
      entry = &ei->entries[i];
      r = entry->callback() {
          call = data;		// call == event_call above
          if (call->flags ...)

 [ USE AFTER FREE BUG ]

The safest way to protect this is to wrap the callback with:

 mutex_lock(&eventfs_mutex);
 if (!ei->is_freed)
     r = entry->callback();
 else
     r = -1;
 mutex_unlock(&eventfs_mutex);

This will make sure that the callback will not be called after it is
freed. But now it needs to be known that the callback is called while
holding internal eventfs locks, and that it must not call back into the
eventfs / tracefs system. There's no reason it should anyway, but document
that as well.

Link: https://lore.kernel.org/all/CA+G9fYu9GOEbD=rR5eMR-=HJ8H6rMsbzDC2ZY5=Y50WpWAE7_Q@mail.gmail.com/
Link: https://lkml.kernel.org/r/20231101172649.906696613@goodmis.org

Cc: Ajay Kaher <akaher@vmware.com>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Fixes: 5790b1fb3d672 ("eventfs: Remove eventfs_file and just use eventfs_inode")
Reported-by: Linux Kernel Functional Testing <lkft@linaro.org>
Reported-by: Naresh Kamboju <naresh.kamboju@linaro.org>
Tested-by: Linux Kernel Functional Testing <lkft@linaro.org>
Tested-by: Naresh Kamboju <naresh.kamboju@linaro.org>
Reviewed-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
 fs/tracefs/event_inode.c | 22 ++++++++++++++++++++--
 include/linux/tracefs.h  | 43 +++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 63 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/fs/tracefs/event_inode.c b/fs/tracefs/event_inode.c
index 93d08e552483..8ac9abf7a3d5 100644
--- a/fs/tracefs/event_inode.c
+++ b/fs/tracefs/event_inode.c
@@ -615,7 +615,13 @@ static struct dentry *eventfs_root_lookup(struct inode *dir,
 		entry = &ei->entries[i];
 		if (strcmp(name, entry->name) == 0) {
 			void *cdata = data;
-			r = entry->callback(name, &mode, &cdata, &fops);
+			mutex_lock(&eventfs_mutex);
+			/* If ei->is_freed, then the event itself may be too */
+			if (!ei->is_freed)
+				r = entry->callback(name, &mode, &cdata, &fops);
+			else
+				r = -1;
+			mutex_unlock(&eventfs_mutex);
 			if (r <= 0)
 				continue;
 			ret = simple_lookup(dir, dentry, flags);
@@ -749,7 +755,13 @@ static int dcache_dir_open_wrapper(struct inode *inode, struct file *file)
 		void *cdata = data;
 		entry = &ei->entries[i];
 		name = entry->name;
-		r = entry->callback(name, &mode, &cdata, &fops);
+		mutex_lock(&eventfs_mutex);
+		/* If ei->is_freed, then the event itself may be too */
+		if (!ei->is_freed)
+			r = entry->callback(name, &mode, &cdata, &fops);
+		else
+			r = -1;
+		mutex_unlock(&eventfs_mutex);
 		if (r <= 0)
 			continue;
 		d = create_file_dentry(ei, i, parent, name, mode, cdata, fops, false);
@@ -819,6 +831,10 @@ static int dcache_readdir_wrapper(struct file *file, struct dir_context *ctx)
  *   data = A pointer to @data, and the callback may replace it, which will
  *         cause the file created to pass the new data to the open() call.
  *   fops = the fops to use for the created file.
+ *
+ * NB. @callback is called while holding internal locks of the eventfs
+ *     system. The callback must not call any code that might also call into
+ *     the tracefs or eventfs system or it will risk creating a deadlock.
  */
 struct eventfs_inode *eventfs_create_dir(const char *name, struct eventfs_inode *parent,
 					 const struct eventfs_entry *entries,
@@ -878,6 +894,8 @@ struct eventfs_inode *eventfs_create_dir(const char *name, struct eventfs_inode
  * @data: The default data to pass to the files (an entry may override it).
  *
  * This function creates the top of the trace event directory.
+ *
+ * See eventfs_create_dir() for use of @entries.
  */
 struct eventfs_inode *eventfs_create_events_dir(const char *name, struct dentry *parent,
 						const struct eventfs_entry *entries,
diff --git a/include/linux/tracefs.h b/include/linux/tracefs.h
index 13359b1a35d1..7a5fe17b6bf9 100644
--- a/include/linux/tracefs.h
+++ b/include/linux/tracefs.h
@@ -23,9 +23,52 @@ struct file_operations;
 
 struct eventfs_file;
 
+/**
+ * eventfs_callback - A callback function to create dynamic files in eventfs
+ * @name: The name of the file that is to be created
+ * @mode: return the file mode for the file (RW access, etc)
+ * @data: data to pass to the created file ops
+ * @fops: the file operations of the created file
+ *
+ * The evetnfs files are dynamically created. The struct eventfs_entry array
+ * is passed to eventfs_create_dir() or eventfs_create_events_dir() that will
+ * be used to create the files within those directories. When a lookup
+ * or access to a file within the directory is made, the struct eventfs_entry
+ * array is used to find a callback() with the matching name that is being
+ * referenced (for lookups, the entire array is iterated and each callback
+ * will be called).
+ *
+ * The callback will be called with @name for the name of the file to create.
+ * The callback can return less than 1 to indicate  that no file should be
+ * created.
+ *
+ * If a file is to be created, then @mode should be populated with the file
+ * mode (permissions) for which the file is created for. This would be
+ * used to set the created inode i_mode field.
+ *
+ * The @data should be set to the data passed to the other file operations
+ * (read, write, etc). Note, @data will also point to the data passed in
+ * to eventfs_create_dir() or eventfs_create_events_dir(), but the callback
+ * can replace the data if it chooses to. Otherwise, the original data
+ * will be used for the file operation functions.
+ *
+ * The @fops should be set to the file operations that will be used to create
+ * the inode.
+ *
+ * NB. This callback is called while holding internal locks of the eventfs
+ *     system. The callback must not call any code that might also call into
+ *     the tracefs or eventfs system or it will risk creating a deadlock.
+ */
 typedef int (*eventfs_callback)(const char *name, umode_t *mode, void **data,
 				const struct file_operations **fops);
 
+/**
+ * struct eventfs_entry - dynamically created eventfs file call back handler
+ * @name:	Then name of the dynamic file in an eventfs directory
+ * @callback:	The callback to get the fops of the file when it is created
+ *
+ * See evenfs_callback() typedef for how to set up @callback.
+ */
 struct eventfs_entry {
 	const char			*name;
 	eventfs_callback		callback;
-- 
cgit v1.2.3


From 391145ba2accc48b596f3d438af1a6255b62a555 Mon Sep 17 00:00:00 2001
From: Dave Marchevsky <davemarchevsky@fb.com>
Date: Tue, 31 Oct 2023 14:56:24 -0700
Subject: bpf: Add __bpf_kfunc_{start,end}_defs macros

BPF kfuncs are meant to be called from BPF programs. Accordingly, most
kfuncs are not called from anywhere in the kernel, which the
-Wmissing-prototypes warning is unhappy about. We've peppered
__diag_ignore_all("-Wmissing-prototypes", ... everywhere kfuncs are
defined in the codebase to suppress this warning.

This patch adds two macros meant to bound one or many kfunc definitions.
All existing kfunc definitions which use these __diag calls to suppress
-Wmissing-prototypes are migrated to use the newly-introduced macros.
A new __diag_ignore_all - for "-Wmissing-declarations" - is added to the
__bpf_kfunc_start_defs macro based on feedback from Andrii on an earlier
version of this patch [0] and another recent mailing list thread [1].

In the future we might need to ignore different warnings or do other
kfunc-specific things. This change will make it easier to make such
modifications for all kfunc defs.

  [0]: https://lore.kernel.org/bpf/CAEf4BzaE5dRWtK6RPLnjTW-MW9sx9K3Fn6uwqCTChK2Dcb1Xig@mail.gmail.com/
  [1]: https://lore.kernel.org/bpf/ZT+2qCc%2FaXep0%2FLf@krava/

Signed-off-by: Dave Marchevsky <davemarchevsky@fb.com>
Suggested-by: Andrii Nakryiko <andrii@kernel.org>
Acked-by: Andrii Nakryiko <andrii@kernel.org>
Cc: Jiri Olsa <olsajiri@gmail.com>
Acked-by: Jiri Olsa <jolsa@kernel.org>
Acked-by: David Vernet <void@manifault.com>
Acked-by: Yafang Shao <laoar.shao@gmail.com>
Link: https://lore.kernel.org/r/20231031215625.2343848-1-davemarchevsky@fb.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 Documentation/bpf/kfuncs.rst     |  6 ++----
 include/linux/btf.h              |  9 +++++++++
 kernel/bpf/bpf_iter.c            |  6 ++----
 kernel/bpf/cgroup_iter.c         |  6 ++----
 kernel/bpf/cpumask.c             |  6 ++----
 kernel/bpf/helpers.c             |  6 ++----
 kernel/bpf/map_iter.c            |  6 ++----
 kernel/bpf/task_iter.c           | 18 ++++++------------
 kernel/trace/bpf_trace.c         |  6 ++----
 net/bpf/test_run.c               |  7 +++----
 net/core/filter.c                | 13 ++++---------
 net/core/xdp.c                   |  6 ++----
 net/ipv4/fou_bpf.c               |  6 ++----
 net/netfilter/nf_conntrack_bpf.c |  6 ++----
 net/netfilter/nf_nat_bpf.c       |  6 ++----
 net/xfrm/xfrm_interface_bpf.c    |  6 ++----
 16 files changed, 46 insertions(+), 73 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/bpf/kfuncs.rst b/Documentation/bpf/kfuncs.rst
index 0d2647fb358d..723408e399ab 100644
--- a/Documentation/bpf/kfuncs.rst
+++ b/Documentation/bpf/kfuncs.rst
@@ -37,16 +37,14 @@ prototype in a header for the wrapper kfunc.
 An example is given below::
 
         /* Disables missing prototype warnings */
-        __diag_push();
-        __diag_ignore_all("-Wmissing-prototypes",
-                          "Global kfuncs as their definitions will be in BTF");
+        __bpf_kfunc_start_defs();
 
         __bpf_kfunc struct task_struct *bpf_find_get_task_by_vpid(pid_t nr)
         {
                 return find_get_task_by_vpid(nr);
         }
 
-        __diag_pop();
+        __bpf_kfunc_end_defs();
 
 A wrapper kfunc is often needed when we need to annotate parameters of the
 kfunc. Otherwise one may directly make the kfunc visible to the BPF program by
diff --git a/include/linux/btf.h b/include/linux/btf.h
index c2231c64d60b..dc5ce962f600 100644
--- a/include/linux/btf.h
+++ b/include/linux/btf.h
@@ -84,6 +84,15 @@
  */
 #define __bpf_kfunc __used noinline
 
+#define __bpf_kfunc_start_defs()					       \
+	__diag_push();							       \
+	__diag_ignore_all("-Wmissing-declarations",			       \
+			  "Global kfuncs as their definitions will be in BTF");\
+	__diag_ignore_all("-Wmissing-prototypes",			       \
+			  "Global kfuncs as their definitions will be in BTF")
+
+#define __bpf_kfunc_end_defs() __diag_pop()
+
 /*
  * Return the name of the passed struct, if exists, or halt the build if for
  * example the structure gets renamed. In this way, developers have to revisit
diff --git a/kernel/bpf/bpf_iter.c b/kernel/bpf/bpf_iter.c
index 833faa04461b..0fae79164187 100644
--- a/kernel/bpf/bpf_iter.c
+++ b/kernel/bpf/bpf_iter.c
@@ -782,9 +782,7 @@ struct bpf_iter_num_kern {
 	int end; /* final value, exclusive */
 } __aligned(8);
 
-__diag_push();
-__diag_ignore_all("-Wmissing-prototypes",
-		  "Global functions as their definitions will be in vmlinux BTF");
+__bpf_kfunc_start_defs();
 
 __bpf_kfunc int bpf_iter_num_new(struct bpf_iter_num *it, int start, int end)
 {
@@ -843,4 +841,4 @@ __bpf_kfunc void bpf_iter_num_destroy(struct bpf_iter_num *it)
 	s->cur = s->end = 0;
 }
 
-__diag_pop();
+__bpf_kfunc_end_defs();
diff --git a/kernel/bpf/cgroup_iter.c b/kernel/bpf/cgroup_iter.c
index 209e5135f9fb..d1b5c5618dd7 100644
--- a/kernel/bpf/cgroup_iter.c
+++ b/kernel/bpf/cgroup_iter.c
@@ -305,9 +305,7 @@ struct bpf_iter_css_kern {
 	unsigned int flags;
 } __attribute__((aligned(8)));
 
-__diag_push();
-__diag_ignore_all("-Wmissing-prototypes",
-		"Global functions as their definitions will be in vmlinux BTF");
+__bpf_kfunc_start_defs();
 
 __bpf_kfunc int bpf_iter_css_new(struct bpf_iter_css *it,
 		struct cgroup_subsys_state *start, unsigned int flags)
@@ -358,4 +356,4 @@ __bpf_kfunc void bpf_iter_css_destroy(struct bpf_iter_css *it)
 {
 }
 
-__diag_pop();
\ No newline at end of file
+__bpf_kfunc_end_defs();
diff --git a/kernel/bpf/cpumask.c b/kernel/bpf/cpumask.c
index 6983af8e093c..e01c741e54e7 100644
--- a/kernel/bpf/cpumask.c
+++ b/kernel/bpf/cpumask.c
@@ -34,9 +34,7 @@ static bool cpu_valid(u32 cpu)
 	return cpu < nr_cpu_ids;
 }
 
-__diag_push();
-__diag_ignore_all("-Wmissing-prototypes",
-		  "Global kfuncs as their definitions will be in BTF");
+__bpf_kfunc_start_defs();
 
 /**
  * bpf_cpumask_create() - Create a mutable BPF cpumask.
@@ -407,7 +405,7 @@ __bpf_kfunc u32 bpf_cpumask_any_and_distribute(const struct cpumask *src1,
 	return cpumask_any_and_distribute(src1, src2);
 }
 
-__diag_pop();
+__bpf_kfunc_end_defs();
 
 BTF_SET8_START(cpumask_kfunc_btf_ids)
 BTF_ID_FLAGS(func, bpf_cpumask_create, KF_ACQUIRE | KF_RET_NULL)
diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
index 95449ea7cc1b..abe82105e33e 100644
--- a/kernel/bpf/helpers.c
+++ b/kernel/bpf/helpers.c
@@ -1886,9 +1886,7 @@ void bpf_rb_root_free(const struct btf_field *field, void *rb_root,
 	}
 }
 
-__diag_push();
-__diag_ignore_all("-Wmissing-prototypes",
-		  "Global functions as their definitions will be in vmlinux BTF");
+__bpf_kfunc_start_defs();
 
 __bpf_kfunc void *bpf_obj_new_impl(u64 local_type_id__k, void *meta__ign)
 {
@@ -2505,7 +2503,7 @@ __bpf_kfunc void bpf_throw(u64 cookie)
 	WARN(1, "A call to BPF exception callback should never return\n");
 }
 
-__diag_pop();
+__bpf_kfunc_end_defs();
 
 BTF_SET8_START(generic_btf_ids)
 #ifdef CONFIG_KEXEC_CORE
diff --git a/kernel/bpf/map_iter.c b/kernel/bpf/map_iter.c
index 6fc9dae9edc8..6abd7c5df4b3 100644
--- a/kernel/bpf/map_iter.c
+++ b/kernel/bpf/map_iter.c
@@ -193,9 +193,7 @@ static int __init bpf_map_iter_init(void)
 
 late_initcall(bpf_map_iter_init);
 
-__diag_push();
-__diag_ignore_all("-Wmissing-prototypes",
-		  "Global functions as their definitions will be in vmlinux BTF");
+__bpf_kfunc_start_defs();
 
 __bpf_kfunc s64 bpf_map_sum_elem_count(const struct bpf_map *map)
 {
@@ -213,7 +211,7 @@ __bpf_kfunc s64 bpf_map_sum_elem_count(const struct bpf_map *map)
 	return ret;
 }
 
-__diag_pop();
+__bpf_kfunc_end_defs();
 
 BTF_SET8_START(bpf_map_iter_kfunc_ids)
 BTF_ID_FLAGS(func, bpf_map_sum_elem_count, KF_TRUSTED_ARGS)
diff --git a/kernel/bpf/task_iter.c b/kernel/bpf/task_iter.c
index 8967d1ac7551..4e156dca48de 100644
--- a/kernel/bpf/task_iter.c
+++ b/kernel/bpf/task_iter.c
@@ -822,9 +822,7 @@ struct bpf_iter_task_vma_kern {
 	struct bpf_iter_task_vma_kern_data *data;
 } __attribute__((aligned(8)));
 
-__diag_push();
-__diag_ignore_all("-Wmissing-prototypes",
-		  "Global functions as their definitions will be in vmlinux BTF");
+__bpf_kfunc_start_defs();
 
 __bpf_kfunc int bpf_iter_task_vma_new(struct bpf_iter_task_vma *it,
 				      struct task_struct *task, u64 addr)
@@ -890,7 +888,7 @@ __bpf_kfunc void bpf_iter_task_vma_destroy(struct bpf_iter_task_vma *it)
 	}
 }
 
-__diag_pop();
+__bpf_kfunc_end_defs();
 
 #ifdef CONFIG_CGROUPS
 
@@ -902,9 +900,7 @@ struct bpf_iter_css_task_kern {
 	struct css_task_iter *css_it;
 } __attribute__((aligned(8)));
 
-__diag_push();
-__diag_ignore_all("-Wmissing-prototypes",
-		  "Global functions as their definitions will be in vmlinux BTF");
+__bpf_kfunc_start_defs();
 
 __bpf_kfunc int bpf_iter_css_task_new(struct bpf_iter_css_task *it,
 		struct cgroup_subsys_state *css, unsigned int flags)
@@ -950,7 +946,7 @@ __bpf_kfunc void bpf_iter_css_task_destroy(struct bpf_iter_css_task *it)
 	bpf_mem_free(&bpf_global_ma, kit->css_it);
 }
 
-__diag_pop();
+__bpf_kfunc_end_defs();
 
 #endif /* CONFIG_CGROUPS */
 
@@ -973,9 +969,7 @@ enum {
 	BPF_TASK_ITER_PROC_THREADS
 };
 
-__diag_push();
-__diag_ignore_all("-Wmissing-prototypes",
-		  "Global functions as their definitions will be in vmlinux BTF");
+__bpf_kfunc_start_defs();
 
 __bpf_kfunc int bpf_iter_task_new(struct bpf_iter_task *it,
 		struct task_struct *task__nullable, unsigned int flags)
@@ -1045,7 +1039,7 @@ __bpf_kfunc void bpf_iter_task_destroy(struct bpf_iter_task *it)
 {
 }
 
-__diag_pop();
+__bpf_kfunc_end_defs();
 
 DEFINE_PER_CPU(struct mmap_unlock_irq_work, mmap_unlock_work);
 
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index df697c74d519..84e8a0f6e4e0 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -1252,9 +1252,7 @@ static const struct bpf_func_proto bpf_get_func_arg_cnt_proto = {
 };
 
 #ifdef CONFIG_KEYS
-__diag_push();
-__diag_ignore_all("-Wmissing-prototypes",
-		  "kfuncs which will be used in BPF programs");
+__bpf_kfunc_start_defs();
 
 /**
  * bpf_lookup_user_key - lookup a key by its serial
@@ -1404,7 +1402,7 @@ __bpf_kfunc int bpf_verify_pkcs7_signature(struct bpf_dynptr_kern *data_ptr,
 }
 #endif /* CONFIG_SYSTEM_DATA_VERIFICATION */
 
-__diag_pop();
+__bpf_kfunc_end_defs();
 
 BTF_SET8_START(key_sig_kfunc_set)
 BTF_ID_FLAGS(func, bpf_lookup_user_key, KF_ACQUIRE | KF_RET_NULL | KF_SLEEPABLE)
diff --git a/net/bpf/test_run.c b/net/bpf/test_run.c
index 0841f8d82419..c9fdcc5cdce1 100644
--- a/net/bpf/test_run.c
+++ b/net/bpf/test_run.c
@@ -503,9 +503,8 @@ out:
  * architecture dependent calling conventions. 7+ can be supported in the
  * future.
  */
-__diag_push();
-__diag_ignore_all("-Wmissing-prototypes",
-		  "Global functions as their definitions will be in vmlinux BTF");
+__bpf_kfunc_start_defs();
+
 __bpf_kfunc int bpf_fentry_test1(int a)
 {
 	return a + 1;
@@ -605,7 +604,7 @@ __bpf_kfunc void bpf_kfunc_call_memb_release(struct prog_test_member *p)
 {
 }
 
-__diag_pop();
+__bpf_kfunc_end_defs();
 
 BTF_SET8_START(bpf_test_modify_return_ids)
 BTF_ID_FLAGS(func, bpf_modify_return_test)
diff --git a/net/core/filter.c b/net/core/filter.c
index 21d75108c2e9..383f96b0a1c7 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -11767,9 +11767,7 @@ bpf_sk_base_func_proto(enum bpf_func_id func_id)
 	return func;
 }
 
-__diag_push();
-__diag_ignore_all("-Wmissing-prototypes",
-		  "Global functions as their definitions will be in vmlinux BTF");
+__bpf_kfunc_start_defs();
 __bpf_kfunc int bpf_dynptr_from_skb(struct sk_buff *skb, u64 flags,
 				    struct bpf_dynptr_kern *ptr__uninit)
 {
@@ -11816,7 +11814,7 @@ __bpf_kfunc int bpf_sock_addr_set_sun_path(struct bpf_sock_addr_kern *sa_kern,
 
 	return 0;
 }
-__diag_pop();
+__bpf_kfunc_end_defs();
 
 int bpf_dynptr_from_skb_rdonly(struct sk_buff *skb, u64 flags,
 			       struct bpf_dynptr_kern *ptr__uninit)
@@ -11879,10 +11877,7 @@ static int __init bpf_kfunc_init(void)
 }
 late_initcall(bpf_kfunc_init);
 
-/* Disables missing prototype warnings */
-__diag_push();
-__diag_ignore_all("-Wmissing-prototypes",
-		  "Global functions as their definitions will be in vmlinux BTF");
+__bpf_kfunc_start_defs();
 
 /* bpf_sock_destroy: Destroy the given socket with ECONNABORTED error code.
  *
@@ -11916,7 +11911,7 @@ __bpf_kfunc int bpf_sock_destroy(struct sock_common *sock)
 	return sk->sk_prot->diag_destroy(sk, ECONNABORTED);
 }
 
-__diag_pop()
+__bpf_kfunc_end_defs();
 
 BTF_SET8_START(bpf_sk_iter_kfunc_ids)
 BTF_ID_FLAGS(func, bpf_sock_destroy, KF_TRUSTED_ARGS)
diff --git a/net/core/xdp.c b/net/core/xdp.c
index df4789ab512d..b6f1d6dab3f2 100644
--- a/net/core/xdp.c
+++ b/net/core/xdp.c
@@ -696,9 +696,7 @@ struct xdp_frame *xdpf_clone(struct xdp_frame *xdpf)
 	return nxdpf;
 }
 
-__diag_push();
-__diag_ignore_all("-Wmissing-prototypes",
-		  "Global functions as their definitions will be in vmlinux BTF");
+__bpf_kfunc_start_defs();
 
 /**
  * bpf_xdp_metadata_rx_timestamp - Read XDP frame RX timestamp.
@@ -738,7 +736,7 @@ __bpf_kfunc int bpf_xdp_metadata_rx_hash(const struct xdp_md *ctx, u32 *hash,
 	return -EOPNOTSUPP;
 }
 
-__diag_pop();
+__bpf_kfunc_end_defs();
 
 BTF_SET8_START(xdp_metadata_kfunc_ids)
 #define XDP_METADATA_KFUNC(_, __, name, ___) BTF_ID_FLAGS(func, name, KF_TRUSTED_ARGS)
diff --git a/net/ipv4/fou_bpf.c b/net/ipv4/fou_bpf.c
index 3760a14b6b57..4da03bf45c9b 100644
--- a/net/ipv4/fou_bpf.c
+++ b/net/ipv4/fou_bpf.c
@@ -22,9 +22,7 @@ enum bpf_fou_encap_type {
 	FOU_BPF_ENCAP_GUE,
 };
 
-__diag_push();
-__diag_ignore_all("-Wmissing-prototypes",
-		  "Global functions as their definitions will be in BTF");
+__bpf_kfunc_start_defs();
 
 /* bpf_skb_set_fou_encap - Set FOU encap parameters
  *
@@ -100,7 +98,7 @@ __bpf_kfunc int bpf_skb_get_fou_encap(struct __sk_buff *skb_ctx,
 	return 0;
 }
 
-__diag_pop()
+__bpf_kfunc_end_defs();
 
 BTF_SET8_START(fou_kfunc_set)
 BTF_ID_FLAGS(func, bpf_skb_set_fou_encap)
diff --git a/net/netfilter/nf_conntrack_bpf.c b/net/netfilter/nf_conntrack_bpf.c
index b21799d468d2..475358ec8212 100644
--- a/net/netfilter/nf_conntrack_bpf.c
+++ b/net/netfilter/nf_conntrack_bpf.c
@@ -230,9 +230,7 @@ static int _nf_conntrack_btf_struct_access(struct bpf_verifier_log *log,
 	return 0;
 }
 
-__diag_push();
-__diag_ignore_all("-Wmissing-prototypes",
-		  "Global functions as their definitions will be in nf_conntrack BTF");
+__bpf_kfunc_start_defs();
 
 /* bpf_xdp_ct_alloc - Allocate a new CT entry
  *
@@ -467,7 +465,7 @@ __bpf_kfunc int bpf_ct_change_status(struct nf_conn *nfct, u32 status)
 	return nf_ct_change_status_common(nfct, status);
 }
 
-__diag_pop()
+__bpf_kfunc_end_defs();
 
 BTF_SET8_START(nf_ct_kfunc_set)
 BTF_ID_FLAGS(func, bpf_xdp_ct_alloc, KF_ACQUIRE | KF_RET_NULL)
diff --git a/net/netfilter/nf_nat_bpf.c b/net/netfilter/nf_nat_bpf.c
index 141ee7783223..6e3b2f58855f 100644
--- a/net/netfilter/nf_nat_bpf.c
+++ b/net/netfilter/nf_nat_bpf.c
@@ -12,9 +12,7 @@
 #include <net/netfilter/nf_conntrack_core.h>
 #include <net/netfilter/nf_nat.h>
 
-__diag_push();
-__diag_ignore_all("-Wmissing-prototypes",
-		  "Global functions as their definitions will be in nf_nat BTF");
+__bpf_kfunc_start_defs();
 
 /* bpf_ct_set_nat_info - Set source or destination nat address
  *
@@ -54,7 +52,7 @@ __bpf_kfunc int bpf_ct_set_nat_info(struct nf_conn___init *nfct,
 	return nf_nat_setup_info(ct, &range, manip) == NF_DROP ? -ENOMEM : 0;
 }
 
-__diag_pop()
+__bpf_kfunc_end_defs();
 
 BTF_SET8_START(nf_nat_kfunc_set)
 BTF_ID_FLAGS(func, bpf_ct_set_nat_info, KF_TRUSTED_ARGS)
diff --git a/net/xfrm/xfrm_interface_bpf.c b/net/xfrm/xfrm_interface_bpf.c
index d74f3fd20f2b..7d5e920141e9 100644
--- a/net/xfrm/xfrm_interface_bpf.c
+++ b/net/xfrm/xfrm_interface_bpf.c
@@ -27,9 +27,7 @@ struct bpf_xfrm_info {
 	int link;
 };
 
-__diag_push();
-__diag_ignore_all("-Wmissing-prototypes",
-		  "Global functions as their definitions will be in xfrm_interface BTF");
+__bpf_kfunc_start_defs();
 
 /* bpf_skb_get_xfrm_info - Get XFRM metadata
  *
@@ -93,7 +91,7 @@ __bpf_kfunc int bpf_skb_set_xfrm_info(struct __sk_buff *skb_ctx, const struct bp
 	return 0;
 }
 
-__diag_pop()
+__bpf_kfunc_end_defs();
 
 BTF_SET8_START(xfrm_ifc_kfunc_set)
 BTF_ID_FLAGS(func, bpf_skb_get_xfrm_info)
-- 
cgit v1.2.3


From 15fb6f2b6c4c3c129adc2412ae12ec15e60a6adb Mon Sep 17 00:00:00 2001
From: Dave Marchevsky <davemarchevsky@fb.com>
Date: Tue, 31 Oct 2023 14:56:25 -0700
Subject: bpf: Add __bpf_hook_{start,end} macros

Not all uses of __diag_ignore_all(...) in BPF-related code in order to
suppress warnings are wrapping kfunc definitions. Some "hook point"
definitions - small functions meant to be used as attach points for
fentry and similar BPF progs - need to suppress -Wmissing-declarations.

We could use __bpf_kfunc_{start,end}_defs added in the previous patch in
such cases, but this might be confusing to someone unfamiliar with BPF
internals. Instead, this patch adds __bpf_hook_{start,end} macros,
currently having the same effect as __bpf_kfunc_{start,end}_defs, then
uses them to suppress warnings for two hook points in the kernel itself
and some bpf_testmod hook points as well.

Signed-off-by: Dave Marchevsky <davemarchevsky@fb.com>
Cc: Yafang Shao <laoar.shao@gmail.com>
Acked-by: Jiri Olsa <jolsa@kernel.org>
Acked-by: Yafang Shao <laoar.shao@gmail.com>
Link: https://lore.kernel.org/r/20231031215625.2343848-2-davemarchevsky@fb.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/btf.h                                   | 2 ++
 kernel/cgroup/rstat.c                                 | 9 +++------
 net/socket.c                                          | 8 ++------
 tools/testing/selftests/bpf/bpf_testmod/bpf_testmod.c | 6 ++----
 4 files changed, 9 insertions(+), 16 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/btf.h b/include/linux/btf.h
index dc5ce962f600..59d404e22814 100644
--- a/include/linux/btf.h
+++ b/include/linux/btf.h
@@ -92,6 +92,8 @@
 			  "Global kfuncs as their definitions will be in BTF")
 
 #define __bpf_kfunc_end_defs() __diag_pop()
+#define __bpf_hook_start() __bpf_kfunc_start_defs()
+#define __bpf_hook_end() __bpf_kfunc_end_defs()
 
 /*
  * Return the name of the passed struct, if exists, or halt the build if for
diff --git a/kernel/cgroup/rstat.c b/kernel/cgroup/rstat.c
index d80d7a608141..c0adb7254b45 100644
--- a/kernel/cgroup/rstat.c
+++ b/kernel/cgroup/rstat.c
@@ -156,19 +156,16 @@ static struct cgroup *cgroup_rstat_cpu_pop_updated(struct cgroup *pos,
  * optimize away the callsite. Therefore, __weak is needed to ensure that the
  * call is still emitted, by telling the compiler that we don't know what the
  * function might eventually be.
- *
- * __diag_* below are needed to dismiss the missing prototype warning.
  */
-__diag_push();
-__diag_ignore_all("-Wmissing-prototypes",
-		  "kfuncs which will be used in BPF programs");
+
+__bpf_hook_start();
 
 __weak noinline void bpf_rstat_flush(struct cgroup *cgrp,
 				     struct cgroup *parent, int cpu)
 {
 }
 
-__diag_pop();
+__bpf_hook_end();
 
 /* see cgroup_rstat_flush() */
 static void cgroup_rstat_flush_locked(struct cgroup *cgrp)
diff --git a/net/socket.c b/net/socket.c
index 0d1c4e78fc7f..3379c64217a4 100644
--- a/net/socket.c
+++ b/net/socket.c
@@ -1685,20 +1685,16 @@ struct file *__sys_socket_file(int family, int type, int protocol)
  *	Therefore, __weak is needed to ensure that the call is still
  *	emitted, by telling the compiler that we don't know what the
  *	function might eventually be.
- *
- *	__diag_* below are needed to dismiss the missing prototype warning.
  */
 
-__diag_push();
-__diag_ignore_all("-Wmissing-prototypes",
-		  "A fmod_ret entry point for BPF programs");
+__bpf_hook_start();
 
 __weak noinline int update_socket_protocol(int family, int type, int protocol)
 {
 	return protocol;
 }
 
-__diag_pop();
+__bpf_hook_end();
 
 int __sys_socket(int family, int type, int protocol)
 {
diff --git a/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod.c b/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod.c
index a5e246f7b202..91907b321f91 100644
--- a/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod.c
+++ b/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod.c
@@ -39,9 +39,7 @@ struct bpf_testmod_struct_arg_4 {
 	int b;
 };
 
-__diag_push();
-__diag_ignore_all("-Wmissing-prototypes",
-		  "Global functions as their definitions will be in bpf_testmod.ko BTF");
+__bpf_hook_start();
 
 noinline int
 bpf_testmod_test_struct_arg_1(struct bpf_testmod_struct_arg_2 a, int b, int c) {
@@ -335,7 +333,7 @@ noinline int bpf_fentry_shadow_test(int a)
 }
 EXPORT_SYMBOL_GPL(bpf_fentry_shadow_test);
 
-__diag_pop();
+__bpf_hook_end();
 
 static struct bin_attribute bin_attr_bpf_testmod_file __ro_after_init = {
 	.attr = { .name = "bpf_testmod", .mode = 0666, },
-- 
cgit v1.2.3


From f55d8e60f10909dbc5524e261041e1d28d7d20d8 Mon Sep 17 00:00:00 2001
From: Andrew Lunn <andrew@lunn.ch>
Date: Sat, 28 Oct 2023 21:25:11 +0200
Subject: net: ethtool: Fix documentation of ethtool_sprintf()

This function takes a pointer to a pointer, unlike sprintf() which is
passed a plain pointer. Fix up the documentation to make this clear.

Fixes: 7888fe53b706 ("ethtool: Add common function for filling out strings")
Cc: Alexander Duyck <alexanderduyck@fb.com>
Cc: Justin Stitt <justinstitt@google.com>
Cc: stable@vger.kernel.org
Signed-off-by: Andrew Lunn <andrew@lunn.ch>
Reviewed-by: Justin Stitt <justinstitt@google.com>
Link: https://lore.kernel.org/r/20231028192511.100001-1-andrew@lunn.ch
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
 include/linux/ethtool.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/ethtool.h b/include/linux/ethtool.h
index 226a36ed5aa1..689028257fcc 100644
--- a/include/linux/ethtool.h
+++ b/include/linux/ethtool.h
@@ -1045,10 +1045,10 @@ static inline int ethtool_mm_frag_size_min_to_add(u32 val_min, u32 *val_add,
 
 /**
  * ethtool_sprintf - Write formatted string to ethtool string data
- * @data: Pointer to start of string to update
+ * @data: Pointer to a pointer to the start of string to update
  * @fmt: Format of string to write
  *
- * Write formatted string to data. Update data to point at start of
+ * Write formatted string to *data. Update *data to point at start of
  * next string.
  */
 extern __printf(2, 3) void ethtool_sprintf(u8 **data, const char *fmt, ...);
-- 
cgit v1.2.3


From e8ae8ad479e2d037daa33756e5e72850a7bd37a9 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Tue, 24 Oct 2023 09:53:33 +1100
Subject: Fix termination state for idr_for_each_entry_ul()

The comment for idr_for_each_entry_ul() states

  after normal termination @entry is left with the value NULL

This is not correct in the case where UINT_MAX has an entry in the idr.
In that case @entry will be non-NULL after termination.
No current code depends on the documentation being correct, but to
save future code we should fix it.

Also fix idr_for_each_entry_continue_ul().  While this is not documented
as leaving @entry as NULL, the mellanox driver appears to depend on
it doing so.  So make that explicit in the documentation as well as in
the code.

Fixes: e33d2b74d805 ("idr: fix overflow case for idr_for_each_entry_ul()")
Cc: Matthew Wilcox <willy@infradead.org>
Cc: Chris Mi <chrism@mellanox.com>
Cc: Cong Wang <xiyou.wangcong@gmail.com>
Signed-off-by: NeilBrown <neilb@suse.de>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/idr.h | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/idr.h b/include/linux/idr.h
index a0dce14090a9..da5f5fa4a3a6 100644
--- a/include/linux/idr.h
+++ b/include/linux/idr.h
@@ -200,7 +200,7 @@ static inline void idr_preload_end(void)
  */
 #define idr_for_each_entry_ul(idr, entry, tmp, id)			\
 	for (tmp = 0, id = 0;						\
-	     tmp <= id && ((entry) = idr_get_next_ul(idr, &(id))) != NULL; \
+	     ((entry) = tmp <= id ? idr_get_next_ul(idr, &(id)) : NULL) != NULL; \
 	     tmp = id, ++id)
 
 /**
@@ -224,10 +224,12 @@ static inline void idr_preload_end(void)
  * @id: Entry ID.
  *
  * Continue to iterate over entries, continuing after the current position.
+ * After normal termination @entry is left with the value NULL.  This
+ * is convenient for a "not found" value.
  */
 #define idr_for_each_entry_continue_ul(idr, entry, tmp, id)		\
 	for (tmp = id;							\
-	     tmp <= id && ((entry) = idr_get_next_ul(idr, &(id))) != NULL; \
+	     ((entry) = tmp <= id ? idr_get_next_ul(idr, &(id)) : NULL) != NULL; \
 	     tmp = id, ++id)
 
 /*
-- 
cgit v1.2.3


From cdbab6236605dc11780779d9af689aea7d58cab1 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Tue, 31 Oct 2023 06:19:45 +0000
Subject: tcp: fix fastopen code vs usec TS

After blamed commit, TFO client-ack-dropped-then-recovery-ms-timestamps
packetdrill test failed.

David Morley and Neal Cardwell started investigating and Neal pointed
that we had :

tcp_conn_request()
  tcp_try_fastopen()
   -> tcp_fastopen_create_child
     -> child = inet_csk(sk)->icsk_af_ops->syn_recv_sock()
       -> tcp_create_openreq_child()
          -> copy req_usec_ts from req:
          newtp->tcp_usec_ts = treq->req_usec_ts;
          // now the new TFO server socket always does usec TS, no matter
          // what the route options are...
  send_synack()
    -> tcp_make_synack()
        // disable tcp_rsk(req)->req_usec_ts if route option is not present:
        if (tcp_rsk(req)->req_usec_ts < 0)
                tcp_rsk(req)->req_usec_ts = dst_tcp_usec_ts(dst);

tcp_conn_request() has the initial dst, we can initialize
tcp_rsk(req)->req_usec_ts there instead of later in send_synack();

This means tcp_rsk(req)->req_usec_ts can be a boolean.

Many thanks to David an Neal for their help.

Fixes: 614e8316aa4c ("tcp: add support for usec resolution in TCP TS values")
Reported-by: kernel test robot <oliver.sang@intel.com>
Closes: https://lore.kernel.org/oe-lkp/202310302216.f79d78bc-oliver.sang@intel.com
Suggested-by: Neal Cardwell <ncardwell@google.com>
Signed-off-by: Eric Dumazet <edumazet@google.com>
Cc: David Morley <morleyd@google.com>
Acked-by: Neal Cardwell <ncardwell@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/tcp.h   | 2 +-
 net/ipv4/syncookies.c | 2 +-
 net/ipv4/tcp_input.c  | 7 ++++---
 net/ipv4/tcp_output.c | 2 --
 4 files changed, 6 insertions(+), 7 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/tcp.h b/include/linux/tcp.h
index ec4e9367f5b0..68f3d315d2e1 100644
--- a/include/linux/tcp.h
+++ b/include/linux/tcp.h
@@ -152,7 +152,7 @@ struct tcp_request_sock {
 	u64				snt_synack; /* first SYNACK sent time */
 	bool				tfo_listener;
 	bool				is_mptcp;
-	s8				req_usec_ts;
+	bool				req_usec_ts;
 #if IS_ENABLED(CONFIG_MPTCP)
 	bool				drop_req;
 #endif
diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c
index 98b25e5d147b..d37282c06e3d 100644
--- a/net/ipv4/syncookies.c
+++ b/net/ipv4/syncookies.c
@@ -306,7 +306,7 @@ struct request_sock *cookie_tcp_reqsk_alloc(const struct request_sock_ops *ops,
 	treq->af_specific = af_ops;
 
 	treq->syn_tos = TCP_SKB_CB(skb)->ip_dsfield;
-	treq->req_usec_ts = -1;
+	treq->req_usec_ts = false;
 
 #if IS_ENABLED(CONFIG_MPTCP)
 	treq->is_mptcp = sk_is_mptcp(sk);
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 50aaa1527150..bcb55d98004c 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -7115,7 +7115,7 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops,
 	req->syncookie = want_cookie;
 	tcp_rsk(req)->af_specific = af_ops;
 	tcp_rsk(req)->ts_off = 0;
-	tcp_rsk(req)->req_usec_ts = -1;
+	tcp_rsk(req)->req_usec_ts = false;
 #if IS_ENABLED(CONFIG_MPTCP)
 	tcp_rsk(req)->is_mptcp = 0;
 #endif
@@ -7143,9 +7143,10 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops,
 	if (!dst)
 		goto drop_and_free;
 
-	if (tmp_opt.tstamp_ok)
+	if (tmp_opt.tstamp_ok) {
+		tcp_rsk(req)->req_usec_ts = dst_tcp_usec_ts(dst);
 		tcp_rsk(req)->ts_off = af_ops->init_ts_off(net, skb);
-
+	}
 	if (!want_cookie && !isn) {
 		int max_syn_backlog = READ_ONCE(net->ipv4.sysctl_max_syn_backlog);
 
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index f558c054cf6e..0d8dd5b7e2e5 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -3693,8 +3693,6 @@ struct sk_buff *tcp_make_synack(const struct sock *sk, struct dst_entry *dst,
 	mss = tcp_mss_clamp(tp, dst_metric_advmss(dst));
 
 	memset(&opts, 0, sizeof(opts));
-	if (tcp_rsk(req)->req_usec_ts < 0)
-		tcp_rsk(req)->req_usec_ts = dst_tcp_usec_ts(dst);
 	now = tcp_clock_ns();
 #ifdef CONFIG_SYN_COOKIES
 	if (unlikely(synack_type == TCP_SYNACK_COOKIE && ireq->tstamp_ok))
-- 
cgit v1.2.3


From ed9009ad300c0f15a3ecfe9613547b1962bde02c Mon Sep 17 00:00:00 2001
From: Bean Huo <beanhuo@micron.com>
Date: Mon, 30 Oct 2023 23:48:09 +0100
Subject: mmc: Add quirk MMC_QUIRK_BROKEN_CACHE_FLUSH for Micron eMMC Q2J54A

Micron MTFC4GACAJCN eMMC supports cache but requires that flush cache
operation be allowed only after a write has occurred. Otherwise, the
cache flush command or subsequent commands will time out.

Signed-off-by: Bean Huo <beanhuo@micron.com>
Signed-off-by: Rafael Beims <rafael.beims@toradex.com>
Cc: stable@vger.kernel.org
Link: https://lore.kernel.org/r/20231030224809.59245-1-beanhuo@iokpp.de
Signed-off-by: Ulf Hansson <ulf.hansson@linaro.org>
---
 drivers/mmc/core/block.c  | 4 +++-
 drivers/mmc/core/card.h   | 4 ++++
 drivers/mmc/core/mmc.c    | 8 ++++++--
 drivers/mmc/core/quirks.h | 7 ++++---
 include/linux/mmc/card.h  | 2 ++
 5 files changed, 19 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/mmc/core/block.c b/drivers/mmc/core/block.c
index 3a8f27c3e310..152dfe593c43 100644
--- a/drivers/mmc/core/block.c
+++ b/drivers/mmc/core/block.c
@@ -2381,8 +2381,10 @@ enum mmc_issued mmc_blk_mq_issue_rq(struct mmc_queue *mq, struct request *req)
 			}
 			ret = mmc_blk_cqe_issue_flush(mq, req);
 			break;
-		case REQ_OP_READ:
 		case REQ_OP_WRITE:
+			card->written_flag = true;
+			fallthrough;
+		case REQ_OP_READ:
 			if (host->cqe_enabled)
 				ret = mmc_blk_cqe_issue_rw_rq(mq, req);
 			else
diff --git a/drivers/mmc/core/card.h b/drivers/mmc/core/card.h
index 4edf9057fa79..b7754a1b8d97 100644
--- a/drivers/mmc/core/card.h
+++ b/drivers/mmc/core/card.h
@@ -280,4 +280,8 @@ static inline int mmc_card_broken_sd_cache(const struct mmc_card *c)
 	return c->quirks & MMC_QUIRK_BROKEN_SD_CACHE;
 }
 
+static inline int mmc_card_broken_cache_flush(const struct mmc_card *c)
+{
+	return c->quirks & MMC_QUIRK_BROKEN_CACHE_FLUSH;
+}
 #endif
diff --git a/drivers/mmc/core/mmc.c b/drivers/mmc/core/mmc.c
index 8180983bd402..8ec19f9f1fd2 100644
--- a/drivers/mmc/core/mmc.c
+++ b/drivers/mmc/core/mmc.c
@@ -2086,13 +2086,17 @@ static int _mmc_flush_cache(struct mmc_host *host)
 {
 	int err = 0;
 
+	if (mmc_card_broken_cache_flush(host->card) && !host->card->written_flag)
+		return 0;
+
 	if (_mmc_cache_enabled(host)) {
 		err = mmc_switch(host->card, EXT_CSD_CMD_SET_NORMAL,
 				 EXT_CSD_FLUSH_CACHE, 1,
 				 CACHE_FLUSH_TIMEOUT_MS);
 		if (err)
-			pr_err("%s: cache flush error %d\n",
-			       mmc_hostname(host), err);
+			pr_err("%s: cache flush error %d\n", mmc_hostname(host), err);
+		else
+			host->card->written_flag = false;
 	}
 
 	return err;
diff --git a/drivers/mmc/core/quirks.h b/drivers/mmc/core/quirks.h
index 32b64b564fb1..cca71867bc4a 100644
--- a/drivers/mmc/core/quirks.h
+++ b/drivers/mmc/core/quirks.h
@@ -110,11 +110,12 @@ static const struct mmc_fixup __maybe_unused mmc_blk_fixups[] = {
 		  MMC_QUIRK_TRIM_BROKEN),
 
 	/*
-	 * Micron MTFC4GACAJCN-1M advertises TRIM but it does not seems to
-	 * support being used to offload WRITE_ZEROES.
+	 * Micron MTFC4GACAJCN-1M supports TRIM but does not appear to support
+	 * WRITE_ZEROES offloading. It also supports caching, but the cache can
+	 * only be flushed after a write has occurred.
 	 */
 	MMC_FIXUP("Q2J54A", CID_MANFID_MICRON, 0x014e, add_quirk_mmc,
-		  MMC_QUIRK_TRIM_BROKEN),
+		  MMC_QUIRK_TRIM_BROKEN | MMC_QUIRK_BROKEN_CACHE_FLUSH),
 
 	/*
 	 * Kingston EMMC04G-M627 advertises TRIM but it does not seems to
diff --git a/include/linux/mmc/card.h b/include/linux/mmc/card.h
index daa2f40d9ce6..7b12eebc5586 100644
--- a/include/linux/mmc/card.h
+++ b/include/linux/mmc/card.h
@@ -295,7 +295,9 @@ struct mmc_card {
 #define MMC_QUIRK_BROKEN_HPI	(1<<13)		/* Disable broken HPI support */
 #define MMC_QUIRK_BROKEN_SD_DISCARD	(1<<14)	/* Disable broken SD discard support */
 #define MMC_QUIRK_BROKEN_SD_CACHE	(1<<15)	/* Disable broken SD cache support */
+#define MMC_QUIRK_BROKEN_CACHE_FLUSH	(1<<16)	/* Don't flush cache until the write has occurred */
 
+	bool			written_flag;	/* Indicates eMMC has been written since power on */
 	bool			reenable_cmdq;	/* Re-enable Command Queue */
 
 	unsigned int		erase_size;	/* erase size in sectors */
-- 
cgit v1.2.3


From 5c5f0d2b5f92c47baf82b9b211e27edd7d195158 Mon Sep 17 00:00:00 2001
From: Xiubo Li <xiubli@redhat.com>
Date: Fri, 9 Jun 2023 13:04:14 +0800
Subject: libceph: add doutc and *_client debug macros support

This will help print the fsid and client's global_id in debug logs,
and also print the function names.

[ idryomov: %lld -> %llu, leading space for doutc(), don't include
  __func__ in pr_*() variants ]

Link: https://tracker.ceph.com/issues/61590
Signed-off-by: Xiubo Li <xiubli@redhat.com>
Reviewed-by: Patrick Donnelly <pdonnell@redhat.com>
Reviewed-by: Milind Changire <mchangir@redhat.com>
Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
---
 include/linux/ceph/ceph_debug.h | 38 ++++++++++++++++++++++++++++++++++++++
 1 file changed, 38 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/ceph/ceph_debug.h b/include/linux/ceph/ceph_debug.h
index d5a5da838caf..11a92a946016 100644
--- a/include/linux/ceph/ceph_debug.h
+++ b/include/linux/ceph/ceph_debug.h
@@ -19,12 +19,25 @@
 	pr_debug("%.*s %12.12s:%-4d : " fmt,				\
 		 8 - (int)sizeof(KBUILD_MODNAME), "    ",		\
 		 kbasename(__FILE__), __LINE__, ##__VA_ARGS__)
+#  define doutc(client, fmt, ...)					\
+	pr_debug("%.*s %12.12s:%-4d : [%pU %llu] " fmt,			\
+		 8 - (int)sizeof(KBUILD_MODNAME), "    ",		\
+		 kbasename(__FILE__), __LINE__,				\
+		 &client->fsid, client->monc.auth->global_id,		\
+		 ##__VA_ARGS__)
 # else
 /* faux printk call just to see any compiler warnings. */
 #  define dout(fmt, ...)	do {				\
 		if (0)						\
 			printk(KERN_DEBUG fmt, ##__VA_ARGS__);	\
 	} while (0)
+#  define doutc(client, fmt, ...)	do {			\
+		if (0)						\
+			printk(KERN_DEBUG "[%pU %llu] " fmt,	\
+			&client->fsid,				\
+			client->monc.auth->global_id,		\
+			##__VA_ARGS__);				\
+		} while (0)
 # endif
 
 #else
@@ -33,7 +46,32 @@
  * or, just wrap pr_debug
  */
 # define dout(fmt, ...)	pr_debug(" " fmt, ##__VA_ARGS__)
+# define doutc(client, fmt, ...)					\
+	pr_debug(" [%pU %llu] %s: " fmt, &client->fsid,			\
+		 client->monc.auth->global_id, __func__, ##__VA_ARGS__)
 
 #endif
 
+#define pr_notice_client(client, fmt, ...)				\
+	pr_notice("[%pU %llu]: " fmt, &client->fsid,			\
+		  client->monc.auth->global_id, ##__VA_ARGS__)
+#define pr_info_client(client, fmt, ...)				\
+	pr_info("[%pU %llu]: " fmt, &client->fsid,			\
+		client->monc.auth->global_id, ##__VA_ARGS__)
+#define pr_warn_client(client, fmt, ...)				\
+	pr_warn("[%pU %llu]: " fmt, &client->fsid,			\
+		client->monc.auth->global_id, ##__VA_ARGS__)
+#define pr_warn_once_client(client, fmt, ...)				\
+	pr_warn_once("[%pU %llu]: " fmt, &client->fsid,			\
+		     client->monc.auth->global_id, ##__VA_ARGS__)
+#define pr_err_client(client, fmt, ...)					\
+	pr_err("[%pU %llu]: " fmt, &client->fsid,			\
+	       client->monc.auth->global_id, ##__VA_ARGS__)
+#define pr_warn_ratelimited_client(client, fmt, ...)			\
+	pr_warn_ratelimited("[%pU %llu]: " fmt, &client->fsid,		\
+			    client->monc.auth->global_id, ##__VA_ARGS__)
+#define pr_err_ratelimited_client(client, fmt, ...)			\
+	pr_err_ratelimited("[%pU %llu]: " fmt, &client->fsid,		\
+			   client->monc.auth->global_id, ##__VA_ARGS__)
+
 #endif
-- 
cgit v1.2.3


From 197b7d792d6aead2e30d4b2c054ffabae2ed73dc Mon Sep 17 00:00:00 2001
From: Xiubo Li <xiubli@redhat.com>
Date: Fri, 9 Jun 2023 15:15:47 +0800
Subject: ceph: pass the mdsc to several helpers

We will use the 'mdsc' to get the global_id in the following commits.

Link: https://tracker.ceph.com/issues/61590
Signed-off-by: Xiubo Li <xiubli@redhat.com>
Reviewed-by: Patrick Donnelly <pdonnell@redhat.com>
Reviewed-by: Milind Changire <mchangir@redhat.com>
Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
---
 fs/ceph/caps.c              | 15 +++++++++------
 fs/ceph/debugfs.c           |  4 ++--
 fs/ceph/dir.c               |  2 +-
 fs/ceph/file.c              |  2 +-
 fs/ceph/mds_client.c        | 39 ++++++++++++++++++++++-----------------
 fs/ceph/mds_client.h        |  3 ++-
 fs/ceph/mdsmap.c            |  3 ++-
 fs/ceph/snap.c              | 16 ++++++++++------
 fs/ceph/super.h             |  3 ++-
 include/linux/ceph/mdsmap.h |  5 ++++-
 10 files changed, 55 insertions(+), 37 deletions(-)

(limited to 'include/linux')

diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
index 14215ec646f7..c00f15b773f0 100644
--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c
@@ -1178,7 +1178,8 @@ void __ceph_remove_cap(struct ceph_cap *cap, bool queue_release)
 	}
 }
 
-void ceph_remove_cap(struct ceph_cap *cap, bool queue_release)
+void ceph_remove_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap,
+		     bool queue_release)
 {
 	struct ceph_inode_info *ci = cap->ci;
 	struct ceph_fs_client *fsc;
@@ -1342,6 +1343,8 @@ static void encode_cap_msg(struct ceph_msg *msg, struct cap_msg_args *arg)
  */
 void __ceph_remove_caps(struct ceph_inode_info *ci)
 {
+	struct inode *inode = &ci->netfs.inode;
+	struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc;
 	struct rb_node *p;
 
 	/* lock i_ceph_lock, because ceph_d_revalidate(..., LOOKUP_RCU)
@@ -1351,7 +1354,7 @@ void __ceph_remove_caps(struct ceph_inode_info *ci)
 	while (p) {
 		struct ceph_cap *cap = rb_entry(p, struct ceph_cap, ci_node);
 		p = rb_next(p);
-		ceph_remove_cap(cap, true);
+		ceph_remove_cap(mdsc, cap, true);
 	}
 	spin_unlock(&ci->i_ceph_lock);
 }
@@ -3999,7 +4002,7 @@ retry:
 		goto out_unlock;
 
 	if (target < 0) {
-		ceph_remove_cap(cap, false);
+		ceph_remove_cap(mdsc, cap, false);
 		goto out_unlock;
 	}
 
@@ -4034,7 +4037,7 @@ retry:
 				change_auth_cap_ses(ci, tcap->session);
 			}
 		}
-		ceph_remove_cap(cap, false);
+		ceph_remove_cap(mdsc, cap, false);
 		goto out_unlock;
 	} else if (tsession) {
 		/* add placeholder for the export tagert */
@@ -4051,7 +4054,7 @@ retry:
 			spin_unlock(&mdsc->cap_dirty_lock);
 		}
 
-		ceph_remove_cap(cap, false);
+		ceph_remove_cap(mdsc, cap, false);
 		goto out_unlock;
 	}
 
@@ -4164,7 +4167,7 @@ retry:
 					ocap->mseq, mds, le32_to_cpu(ph->seq),
 					le32_to_cpu(ph->mseq));
 		}
-		ceph_remove_cap(ocap, (ph->flags & CEPH_CAP_FLAG_RELEASE));
+		ceph_remove_cap(mdsc, ocap, (ph->flags & CEPH_CAP_FLAG_RELEASE));
 	}
 
 	*old_issued = issued;
diff --git a/fs/ceph/debugfs.c b/fs/ceph/debugfs.c
index 3904333fa6c3..2f1e7498cd74 100644
--- a/fs/ceph/debugfs.c
+++ b/fs/ceph/debugfs.c
@@ -81,7 +81,7 @@ static int mdsc_show(struct seq_file *s, void *p)
 		if (req->r_inode) {
 			seq_printf(s, " #%llx", ceph_ino(req->r_inode));
 		} else if (req->r_dentry) {
-			path = ceph_mdsc_build_path(req->r_dentry, &pathlen,
+			path = ceph_mdsc_build_path(mdsc, req->r_dentry, &pathlen,
 						    &pathbase, 0);
 			if (IS_ERR(path))
 				path = NULL;
@@ -100,7 +100,7 @@ static int mdsc_show(struct seq_file *s, void *p)
 		}
 
 		if (req->r_old_dentry) {
-			path = ceph_mdsc_build_path(req->r_old_dentry, &pathlen,
+			path = ceph_mdsc_build_path(mdsc, req->r_old_dentry, &pathlen,
 						    &pathbase, 0);
 			if (IS_ERR(path))
 				path = NULL;
diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c
index 854cbdd66661..fff5cb2df9a8 100644
--- a/fs/ceph/dir.c
+++ b/fs/ceph/dir.c
@@ -1226,7 +1226,7 @@ static void ceph_async_unlink_cb(struct ceph_mds_client *mdsc,
 	if (result) {
 		int pathlen = 0;
 		u64 base = 0;
-		char *path = ceph_mdsc_build_path(dentry, &pathlen,
+		char *path = ceph_mdsc_build_path(mdsc, dentry, &pathlen,
 						  &base, 0);
 
 		/* mark error on parent + clear complete */
diff --git a/fs/ceph/file.c b/fs/ceph/file.c
index b5f8038065d7..7c4d79a23506 100644
--- a/fs/ceph/file.c
+++ b/fs/ceph/file.c
@@ -574,7 +574,7 @@ static void ceph_async_create_cb(struct ceph_mds_client *mdsc,
 	if (result) {
 		int pathlen = 0;
 		u64 base = 0;
-		char *path = ceph_mdsc_build_path(req->r_dentry, &pathlen,
+		char *path = ceph_mdsc_build_path(mdsc, req->r_dentry, &pathlen,
 						  &base, 0);
 
 		pr_warn("async create failure path=(%llx)%s result=%d!\n",
diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index 293b93182955..284ba087c507 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -2126,6 +2126,7 @@ out:
  */
 static int trim_caps_cb(struct inode *inode, int mds, void *arg)
 {
+	struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(inode->i_sb);
 	int *remaining = arg;
 	struct ceph_inode_info *ci = ceph_inode(inode);
 	int used, wanted, oissued, mine;
@@ -2173,7 +2174,7 @@ static int trim_caps_cb(struct inode *inode, int mds, void *arg)
 
 	if (oissued) {
 		/* we aren't the only cap.. just remove us */
-		ceph_remove_cap(cap, true);
+		ceph_remove_cap(mdsc, cap, true);
 		(*remaining)--;
 	} else {
 		struct dentry *dentry;
@@ -2588,6 +2589,7 @@ static u8 *get_fscrypt_altname(const struct ceph_mds_request *req, u32 *plen)
 
 /**
  * ceph_mdsc_build_path - build a path string to a given dentry
+ * @mdsc: mds client
  * @dentry: dentry to which path should be built
  * @plen: returned length of string
  * @pbase: returned base inode number
@@ -2607,8 +2609,8 @@ static u8 *get_fscrypt_altname(const struct ceph_mds_request *req, u32 *plen)
  * Encode hidden .snap dirs as a double /, i.e.
  *   foo/.snap/bar -> foo//bar
  */
-char *ceph_mdsc_build_path(struct dentry *dentry, int *plen, u64 *pbase,
-			   int for_wire)
+char *ceph_mdsc_build_path(struct ceph_mds_client *mdsc, struct dentry *dentry,
+			   int *plen, u64 *pbase, int for_wire)
 {
 	struct dentry *cur;
 	struct inode *inode;
@@ -2726,9 +2728,9 @@ retry:
 	return path + pos;
 }
 
-static int build_dentry_path(struct dentry *dentry, struct inode *dir,
-			     const char **ppath, int *ppathlen, u64 *pino,
-			     bool *pfreepath, bool parent_locked)
+static int build_dentry_path(struct ceph_mds_client *mdsc, struct dentry *dentry,
+			     struct inode *dir, const char **ppath, int *ppathlen,
+			     u64 *pino, bool *pfreepath, bool parent_locked)
 {
 	char *path;
 
@@ -2744,7 +2746,7 @@ static int build_dentry_path(struct dentry *dentry, struct inode *dir,
 		return 0;
 	}
 	rcu_read_unlock();
-	path = ceph_mdsc_build_path(dentry, ppathlen, pino, 1);
+	path = ceph_mdsc_build_path(mdsc, dentry, ppathlen, pino, 1);
 	if (IS_ERR(path))
 		return PTR_ERR(path);
 	*ppath = path;
@@ -2756,6 +2758,7 @@ static int build_inode_path(struct inode *inode,
 			    const char **ppath, int *ppathlen, u64 *pino,
 			    bool *pfreepath)
 {
+	struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(inode->i_sb);
 	struct dentry *dentry;
 	char *path;
 
@@ -2765,7 +2768,7 @@ static int build_inode_path(struct inode *inode,
 		return 0;
 	}
 	dentry = d_find_alias(inode);
-	path = ceph_mdsc_build_path(dentry, ppathlen, pino, 1);
+	path = ceph_mdsc_build_path(mdsc, dentry, ppathlen, pino, 1);
 	dput(dentry);
 	if (IS_ERR(path))
 		return PTR_ERR(path);
@@ -2778,10 +2781,11 @@ static int build_inode_path(struct inode *inode,
  * request arguments may be specified via an inode *, a dentry *, or
  * an explicit ino+path.
  */
-static int set_request_path_attr(struct inode *rinode, struct dentry *rdentry,
-				  struct inode *rdiri, const char *rpath,
-				  u64 rino, const char **ppath, int *pathlen,
-				  u64 *ino, bool *freepath, bool parent_locked)
+static int set_request_path_attr(struct ceph_mds_client *mdsc, struct inode *rinode,
+				 struct dentry *rdentry, struct inode *rdiri,
+				 const char *rpath, u64 rino, const char **ppath,
+				 int *pathlen, u64 *ino, bool *freepath,
+				 bool parent_locked)
 {
 	int r = 0;
 
@@ -2790,7 +2794,7 @@ static int set_request_path_attr(struct inode *rinode, struct dentry *rdentry,
 		dout(" inode %p %llx.%llx\n", rinode, ceph_ino(rinode),
 		     ceph_snap(rinode));
 	} else if (rdentry) {
-		r = build_dentry_path(rdentry, rdiri, ppath, pathlen, ino,
+		r = build_dentry_path(mdsc, rdentry, rdiri, ppath, pathlen, ino,
 					freepath, parent_locked);
 		dout(" dentry %p %llx/%.*s\n", rdentry, *ino, *pathlen,
 		     *ppath);
@@ -2877,7 +2881,7 @@ static struct ceph_msg *create_request_message(struct ceph_mds_session *session,
 	bool old_version = !test_bit(CEPHFS_FEATURE_32BITS_RETRY_FWD,
 				     &session->s_features);
 
-	ret = set_request_path_attr(req->r_inode, req->r_dentry,
+	ret = set_request_path_attr(mdsc, req->r_inode, req->r_dentry,
 			      req->r_parent, req->r_path1, req->r_ino1.ino,
 			      &path1, &pathlen1, &ino1, &freepath1,
 			      test_bit(CEPH_MDS_R_PARENT_LOCKED,
@@ -2891,7 +2895,7 @@ static struct ceph_msg *create_request_message(struct ceph_mds_session *session,
 	if (req->r_old_dentry &&
 	    !(req->r_old_dentry->d_flags & DCACHE_DISCONNECTED))
 		old_dentry = req->r_old_dentry;
-	ret = set_request_path_attr(NULL, old_dentry,
+	ret = set_request_path_attr(mdsc, NULL, old_dentry,
 			      req->r_old_dentry_dir,
 			      req->r_path2, req->r_ino2.ino,
 			      &path2, &pathlen2, &ino2, &freepath2, true);
@@ -4290,6 +4294,7 @@ out_unlock:
  */
 static int reconnect_caps_cb(struct inode *inode, int mds, void *arg)
 {
+	struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(inode->i_sb);
 	union {
 		struct ceph_mds_cap_reconnect v2;
 		struct ceph_mds_cap_reconnect_v1 v1;
@@ -4307,7 +4312,7 @@ static int reconnect_caps_cb(struct inode *inode, int mds, void *arg)
 	dentry = d_find_primary(inode);
 	if (dentry) {
 		/* set pathbase to parent dir when msg_version >= 2 */
-		path = ceph_mdsc_build_path(dentry, &pathlen, &pathbase,
+		path = ceph_mdsc_build_path(mdsc, dentry, &pathlen, &pathbase,
 					    recon_state->msg_version >= 2);
 		dput(dentry);
 		if (IS_ERR(path)) {
@@ -5662,7 +5667,7 @@ void ceph_mdsc_handle_mdsmap(struct ceph_mds_client *mdsc, struct ceph_msg *msg)
 		return;
 	}
 
-	newmap = ceph_mdsmap_decode(&p, end, ceph_msgr2(mdsc->fsc->client));
+	newmap = ceph_mdsmap_decode(mdsc, &p, end, ceph_msgr2(mdsc->fsc->client));
 	if (IS_ERR(newmap)) {
 		err = PTR_ERR(newmap);
 		goto bad_unlock;
diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h
index 5a3714bdd64a..d930eb79dc38 100644
--- a/fs/ceph/mds_client.h
+++ b/fs/ceph/mds_client.h
@@ -581,7 +581,8 @@ static inline void ceph_mdsc_free_path(char *path, int len)
 		__putname(path - (PATH_MAX - 1 - len));
 }
 
-extern char *ceph_mdsc_build_path(struct dentry *dentry, int *plen, u64 *base,
+extern char *ceph_mdsc_build_path(struct ceph_mds_client *mdsc,
+				  struct dentry *dentry, int *plen, u64 *base,
 				  int for_wire);
 
 extern void __ceph_mdsc_drop_dentry_lease(struct dentry *dentry);
diff --git a/fs/ceph/mdsmap.c b/fs/ceph/mdsmap.c
index 7dac21ee6ce7..6cbec7aed5a0 100644
--- a/fs/ceph/mdsmap.c
+++ b/fs/ceph/mdsmap.c
@@ -114,7 +114,8 @@ bad:
  * Ignore any fields we don't care about (there are quite a few of
  * them).
  */
-struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end, bool msgr2)
+struct ceph_mdsmap *ceph_mdsmap_decode(struct ceph_mds_client *mdsc, void **p,
+				       void *end, bool msgr2)
 {
 	struct ceph_mdsmap *m;
 	const void *start = *p;
diff --git a/fs/ceph/snap.c b/fs/ceph/snap.c
index 813f21add992..55090e6c9967 100644
--- a/fs/ceph/snap.c
+++ b/fs/ceph/snap.c
@@ -329,7 +329,8 @@ static int cmpu64_rev(const void *a, const void *b)
 /*
  * build the snap context for a given realm.
  */
-static int build_snap_context(struct ceph_snap_realm *realm,
+static int build_snap_context(struct ceph_mds_client *mdsc,
+			      struct ceph_snap_realm *realm,
 			      struct list_head *realm_queue,
 			      struct list_head *dirty_realms)
 {
@@ -425,7 +426,8 @@ fail:
 /*
  * rebuild snap context for the given realm and all of its children.
  */
-static void rebuild_snap_realms(struct ceph_snap_realm *realm,
+static void rebuild_snap_realms(struct ceph_mds_client *mdsc,
+				struct ceph_snap_realm *realm,
 				struct list_head *dirty_realms)
 {
 	LIST_HEAD(realm_queue);
@@ -451,7 +453,8 @@ static void rebuild_snap_realms(struct ceph_snap_realm *realm,
 			continue;
 		}
 
-		last = build_snap_context(_realm, &realm_queue, dirty_realms);
+		last = build_snap_context(mdsc, _realm, &realm_queue,
+					  dirty_realms);
 		dout("%s %llx %p, %s\n", __func__, _realm->ino, _realm,
 		     last > 0 ? "is deferred" : !last ? "succeeded" : "failed");
 
@@ -708,7 +711,8 @@ int __ceph_finish_cap_snap(struct ceph_inode_info *ci,
  * Queue cap_snaps for snap writeback for this realm and its children.
  * Called under snap_rwsem, so realm topology won't change.
  */
-static void queue_realm_cap_snaps(struct ceph_snap_realm *realm)
+static void queue_realm_cap_snaps(struct ceph_mds_client *mdsc,
+				  struct ceph_snap_realm *realm)
 {
 	struct ceph_inode_info *ci;
 	struct inode *lastinode = NULL;
@@ -855,7 +859,7 @@ more:
 
 	/* rebuild_snapcs when we reach the _end_ (root) of the trace */
 	if (realm_to_rebuild && p >= e)
-		rebuild_snap_realms(realm_to_rebuild, &dirty_realms);
+		rebuild_snap_realms(mdsc, realm_to_rebuild, &dirty_realms);
 
 	if (!first_realm)
 		first_realm = realm;
@@ -873,7 +877,7 @@ more:
 		realm = list_first_entry(&dirty_realms, struct ceph_snap_realm,
 					 dirty_item);
 		list_del_init(&realm->dirty_item);
-		queue_realm_cap_snaps(realm);
+		queue_realm_cap_snaps(mdsc, realm);
 	}
 
 	if (realm_ret)
diff --git a/fs/ceph/super.h b/fs/ceph/super.h
index 51c7f2b14f6f..09c262dd5bd3 100644
--- a/fs/ceph/super.h
+++ b/fs/ceph/super.h
@@ -1223,7 +1223,8 @@ extern void ceph_add_cap(struct inode *inode,
 			 unsigned cap, unsigned seq, u64 realmino, int flags,
 			 struct ceph_cap **new_cap);
 extern void __ceph_remove_cap(struct ceph_cap *cap, bool queue_release);
-extern void ceph_remove_cap(struct ceph_cap *cap, bool queue_release);
+extern void ceph_remove_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap,
+			    bool queue_release);
 extern void __ceph_remove_caps(struct ceph_inode_info *ci);
 extern void ceph_put_cap(struct ceph_mds_client *mdsc,
 			 struct ceph_cap *cap);
diff --git a/include/linux/ceph/mdsmap.h b/include/linux/ceph/mdsmap.h
index 4c3e0648dc27..89f1931f1ba6 100644
--- a/include/linux/ceph/mdsmap.h
+++ b/include/linux/ceph/mdsmap.h
@@ -5,6 +5,8 @@
 #include <linux/bug.h>
 #include <linux/ceph/types.h>
 
+struct ceph_mds_client;
+
 /*
  * mds map - describe servers in the mds cluster.
  *
@@ -65,7 +67,8 @@ static inline bool ceph_mdsmap_is_laggy(struct ceph_mdsmap *m, int w)
 }
 
 extern int ceph_mdsmap_get_random_mds(struct ceph_mdsmap *m);
-struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end, bool msgr2);
+struct ceph_mdsmap *ceph_mdsmap_decode(struct ceph_mds_client *mdsc, void **p,
+				       void *end, bool msgr2);
 extern void ceph_mdsmap_destroy(struct ceph_mdsmap *m);
 extern bool ceph_mdsmap_is_cluster_available(struct ceph_mdsmap *m);
 
-- 
cgit v1.2.3


From 522dc5108f07ef30e2c7399e59b9547d382308ff Mon Sep 17 00:00:00 2001
From: Xiubo Li <xiubli@redhat.com>
Date: Mon, 12 Jun 2023 15:41:10 +0800
Subject: libceph, ceph: move mdsmap.h to fs/ceph

The mdsmap.h is only used by CephFS, so move it to fs/ceph.

Signed-off-by: Xiubo Li <xiubli@redhat.com>
Reviewed-by: Ilya Dryomov <idryomov@gmail.com>
Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
---
 fs/ceph/mds_client.h        |  2 +-
 fs/ceph/mdsmap.c            |  2 +-
 fs/ceph/mdsmap.h            | 75 +++++++++++++++++++++++++++++++++++++++++++++
 include/linux/ceph/mdsmap.h | 75 ---------------------------------------------
 4 files changed, 77 insertions(+), 77 deletions(-)
 create mode 100644 fs/ceph/mdsmap.h
 delete mode 100644 include/linux/ceph/mdsmap.h

(limited to 'include/linux')

diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h
index d930eb79dc38..df9e3d668969 100644
--- a/fs/ceph/mds_client.h
+++ b/fs/ceph/mds_client.h
@@ -14,9 +14,9 @@
 
 #include <linux/ceph/types.h>
 #include <linux/ceph/messenger.h>
-#include <linux/ceph/mdsmap.h>
 #include <linux/ceph/auth.h>
 
+#include "mdsmap.h"
 #include "metric.h"
 #include "super.h"
 
diff --git a/fs/ceph/mdsmap.c b/fs/ceph/mdsmap.c
index fa080183ac1f..fae97c25ce58 100644
--- a/fs/ceph/mdsmap.c
+++ b/fs/ceph/mdsmap.c
@@ -7,10 +7,10 @@
 #include <linux/slab.h>
 #include <linux/types.h>
 
-#include <linux/ceph/mdsmap.h>
 #include <linux/ceph/messenger.h>
 #include <linux/ceph/decode.h>
 
+#include "mdsmap.h"
 #include "mds_client.h"
 #include "super.h"
 
diff --git a/fs/ceph/mdsmap.h b/fs/ceph/mdsmap.h
new file mode 100644
index 000000000000..89f1931f1ba6
--- /dev/null
+++ b/fs/ceph/mdsmap.h
@@ -0,0 +1,75 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _FS_CEPH_MDSMAP_H
+#define _FS_CEPH_MDSMAP_H
+
+#include <linux/bug.h>
+#include <linux/ceph/types.h>
+
+struct ceph_mds_client;
+
+/*
+ * mds map - describe servers in the mds cluster.
+ *
+ * we limit fields to those the client actually xcares about
+ */
+struct ceph_mds_info {
+	u64 global_id;
+	struct ceph_entity_addr addr;
+	s32 state;
+	int num_export_targets;
+	bool laggy;
+	u32 *export_targets;
+};
+
+struct ceph_mdsmap {
+	u32 m_epoch, m_client_epoch, m_last_failure;
+	u32 m_root;
+	u32 m_session_timeout;          /* seconds */
+	u32 m_session_autoclose;        /* seconds */
+	u64 m_max_file_size;
+	u64 m_max_xattr_size;		/* maximum size for xattrs blob */
+	u32 m_max_mds;			/* expected up:active mds number */
+	u32 m_num_active_mds;		/* actual up:active mds number */
+	u32 possible_max_rank;		/* possible max rank index */
+	struct ceph_mds_info *m_info;
+
+	/* which object pools file data can be stored in */
+	int m_num_data_pg_pools;
+	u64 *m_data_pg_pools;
+	u64 m_cas_pg_pool;
+
+	bool m_enabled;
+	bool m_damaged;
+	int m_num_laggy;
+};
+
+static inline struct ceph_entity_addr *
+ceph_mdsmap_get_addr(struct ceph_mdsmap *m, int w)
+{
+	if (w >= m->possible_max_rank)
+		return NULL;
+	return &m->m_info[w].addr;
+}
+
+static inline int ceph_mdsmap_get_state(struct ceph_mdsmap *m, int w)
+{
+	BUG_ON(w < 0);
+	if (w >= m->possible_max_rank)
+		return CEPH_MDS_STATE_DNE;
+	return m->m_info[w].state;
+}
+
+static inline bool ceph_mdsmap_is_laggy(struct ceph_mdsmap *m, int w)
+{
+	if (w >= 0 && w < m->possible_max_rank)
+		return m->m_info[w].laggy;
+	return false;
+}
+
+extern int ceph_mdsmap_get_random_mds(struct ceph_mdsmap *m);
+struct ceph_mdsmap *ceph_mdsmap_decode(struct ceph_mds_client *mdsc, void **p,
+				       void *end, bool msgr2);
+extern void ceph_mdsmap_destroy(struct ceph_mdsmap *m);
+extern bool ceph_mdsmap_is_cluster_available(struct ceph_mdsmap *m);
+
+#endif
diff --git a/include/linux/ceph/mdsmap.h b/include/linux/ceph/mdsmap.h
deleted file mode 100644
index 89f1931f1ba6..000000000000
--- a/include/linux/ceph/mdsmap.h
+++ /dev/null
@@ -1,75 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _FS_CEPH_MDSMAP_H
-#define _FS_CEPH_MDSMAP_H
-
-#include <linux/bug.h>
-#include <linux/ceph/types.h>
-
-struct ceph_mds_client;
-
-/*
- * mds map - describe servers in the mds cluster.
- *
- * we limit fields to those the client actually xcares about
- */
-struct ceph_mds_info {
-	u64 global_id;
-	struct ceph_entity_addr addr;
-	s32 state;
-	int num_export_targets;
-	bool laggy;
-	u32 *export_targets;
-};
-
-struct ceph_mdsmap {
-	u32 m_epoch, m_client_epoch, m_last_failure;
-	u32 m_root;
-	u32 m_session_timeout;          /* seconds */
-	u32 m_session_autoclose;        /* seconds */
-	u64 m_max_file_size;
-	u64 m_max_xattr_size;		/* maximum size for xattrs blob */
-	u32 m_max_mds;			/* expected up:active mds number */
-	u32 m_num_active_mds;		/* actual up:active mds number */
-	u32 possible_max_rank;		/* possible max rank index */
-	struct ceph_mds_info *m_info;
-
-	/* which object pools file data can be stored in */
-	int m_num_data_pg_pools;
-	u64 *m_data_pg_pools;
-	u64 m_cas_pg_pool;
-
-	bool m_enabled;
-	bool m_damaged;
-	int m_num_laggy;
-};
-
-static inline struct ceph_entity_addr *
-ceph_mdsmap_get_addr(struct ceph_mdsmap *m, int w)
-{
-	if (w >= m->possible_max_rank)
-		return NULL;
-	return &m->m_info[w].addr;
-}
-
-static inline int ceph_mdsmap_get_state(struct ceph_mdsmap *m, int w)
-{
-	BUG_ON(w < 0);
-	if (w >= m->possible_max_rank)
-		return CEPH_MDS_STATE_DNE;
-	return m->m_info[w].state;
-}
-
-static inline bool ceph_mdsmap_is_laggy(struct ceph_mdsmap *m, int w)
-{
-	if (w >= 0 && w < m->possible_max_rank)
-		return m->m_info[w].laggy;
-	return false;
-}
-
-extern int ceph_mdsmap_get_random_mds(struct ceph_mdsmap *m);
-struct ceph_mdsmap *ceph_mdsmap_decode(struct ceph_mds_client *mdsc, void **p,
-				       void *end, bool msgr2);
-extern void ceph_mdsmap_destroy(struct ceph_mdsmap *m);
-extern bool ceph_mdsmap_is_cluster_available(struct ceph_mdsmap *m);
-
-#endif
-- 
cgit v1.2.3


From 1b90344614cc5949666328b37f03edec1d4e2873 Mon Sep 17 00:00:00 2001
From: Alexander Mikhalitsyn <aleksandr.mikhalitsyn@canonical.com>
Date: Mon, 7 Aug 2023 15:26:15 +0200
Subject: fs: export mnt_idmap_get/mnt_idmap_put

These helpers are required to support idmapped mounts in CephFS.

Cc: linux-fsdevel@vger.kernel.org
Signed-off-by: Alexander Mikhalitsyn <aleksandr.mikhalitsyn@canonical.com>
Reviewed-by: Christian Brauner <brauner@kernel.org>
Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
---
 fs/mnt_idmapping.c            | 2 ++
 include/linux/mnt_idmapping.h | 3 +++
 2 files changed, 5 insertions(+)

(limited to 'include/linux')

diff --git a/fs/mnt_idmapping.c b/fs/mnt_idmapping.c
index 4905665c47d0..57d1dedf3f8f 100644
--- a/fs/mnt_idmapping.c
+++ b/fs/mnt_idmapping.c
@@ -256,6 +256,7 @@ struct mnt_idmap *mnt_idmap_get(struct mnt_idmap *idmap)
 
 	return idmap;
 }
+EXPORT_SYMBOL_GPL(mnt_idmap_get);
 
 /**
  * mnt_idmap_put - put a reference to an idmapping
@@ -271,3 +272,4 @@ void mnt_idmap_put(struct mnt_idmap *idmap)
 		kfree(idmap);
 	}
 }
+EXPORT_SYMBOL_GPL(mnt_idmap_put);
diff --git a/include/linux/mnt_idmapping.h b/include/linux/mnt_idmapping.h
index 057c89867aa2..b8da2db4ecd2 100644
--- a/include/linux/mnt_idmapping.h
+++ b/include/linux/mnt_idmapping.h
@@ -115,6 +115,9 @@ static inline bool vfsgid_eq_kgid(vfsgid_t vfsgid, kgid_t kgid)
 
 int vfsgid_in_group_p(vfsgid_t vfsgid);
 
+struct mnt_idmap *mnt_idmap_get(struct mnt_idmap *idmap);
+void mnt_idmap_put(struct mnt_idmap *idmap);
+
 vfsuid_t make_vfsuid(struct mnt_idmap *idmap,
 		     struct user_namespace *fs_userns, kuid_t kuid);
 
-- 
cgit v1.2.3


From 5ccd8530dd7ba97531a50ffa11eabe258d65a7af Mon Sep 17 00:00:00 2001
From: Christian Brauner <brauner@kernel.org>
Date: Mon, 7 Aug 2023 15:26:17 +0200
Subject: ceph: handle idmapped mounts in create_request_message()

Inode operations that create a new filesystem object such as ->mknod,
->create, ->mkdir() and others don't take a {g,u}id argument explicitly.
Instead the caller's fs{g,u}id is used for the {g,u}id of the new
filesystem object.

In order to ensure that the correct {g,u}id is used map the caller's
fs{g,u}id for creation requests. This doesn't require complex changes.
It suffices to pass in the relevant idmapping recorded in the request
message. If this request message was triggered from an inode operation
that creates filesystem objects it will have passed down the relevant
idmaping. If this is a request message that was triggered from an inode
operation that doens't need to take idmappings into account the initial
idmapping is passed down which is an identity mapping.

This change uses a new cephfs protocol extension CEPHFS_FEATURE_HAS_OWNER_UIDGID
which adds two new fields (owner_{u,g}id) to the request head structure.
So, we need to ensure that MDS supports it otherwise we need to fail
any IO that comes through an idmapped mount because we can't process it
in a proper way. MDS server without such an extension will use caller_{u,g}id
fields to set a new inode owner UID/GID which is incorrect because caller_{u,g}id
values are unmapped. At the same time we can't map these fields with an
idmapping as it can break UID/GID-based permission checks logic on the
MDS side. This problem was described with a lot of details at [1], [2].

[1] https://lore.kernel.org/lkml/CAEivzxfw1fHO2TFA4dx3u23ZKK6Q+EThfzuibrhA3RKM=ZOYLg@mail.gmail.com/
[2] https://lore.kernel.org/all/20220104140414.155198-3-brauner@kernel.org/

Link: https://github.com/ceph/ceph/pull/52575
Link: https://tracker.ceph.com/issues/62217
Co-Developed-by: Alexander Mikhalitsyn <aleksandr.mikhalitsyn@canonical.com>
Signed-off-by: Christian Brauner <brauner@kernel.org>
Signed-off-by: Alexander Mikhalitsyn <aleksandr.mikhalitsyn@canonical.com>
Reviewed-by: Xiubo Li <xiubli@redhat.com>
Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
---
 fs/ceph/mds_client.c         | 56 ++++++++++++++++++++++++++++++++++++++++----
 fs/ceph/mds_client.h         |  5 +++-
 include/linux/ceph/ceph_fs.h | 10 +++++++-
 3 files changed, 65 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index ff5a05346f52..b97a16df7580 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -2881,6 +2881,17 @@ static void encode_mclientrequest_tail(void **p,
 	}
 }
 
+static inline u16 mds_supported_head_version(struct ceph_mds_session *session)
+{
+	if (!test_bit(CEPHFS_FEATURE_32BITS_RETRY_FWD, &session->s_features))
+		return 1;
+
+	if (!test_bit(CEPHFS_FEATURE_HAS_OWNER_UIDGID, &session->s_features))
+		return 2;
+
+	return CEPH_MDS_REQUEST_HEAD_VERSION;
+}
+
 static struct ceph_mds_request_head_legacy *
 find_legacy_request_head(void *p, u64 features)
 {
@@ -2902,6 +2913,7 @@ static struct ceph_msg *create_request_message(struct ceph_mds_session *session,
 {
 	int mds = session->s_mds;
 	struct ceph_mds_client *mdsc = session->s_mdsc;
+	struct ceph_client *cl = mdsc->fsc->client;
 	struct ceph_msg *msg;
 	struct ceph_mds_request_head_legacy *lhead;
 	const char *path1 = NULL;
@@ -2915,8 +2927,7 @@ static struct ceph_msg *create_request_message(struct ceph_mds_session *session,
 	void *p, *end;
 	int ret;
 	bool legacy = !(session->s_con.peer_features & CEPH_FEATURE_FS_BTIME);
-	bool old_version = !test_bit(CEPHFS_FEATURE_32BITS_RETRY_FWD,
-				     &session->s_features);
+	u16 request_head_version = mds_supported_head_version(session);
 
 	ret = set_request_path_attr(mdsc, req->r_inode, req->r_dentry,
 			      req->r_parent, req->r_path1, req->r_ino1.ino,
@@ -2957,8 +2968,10 @@ static struct ceph_msg *create_request_message(struct ceph_mds_session *session,
 	 */
 	if (legacy)
 		len = sizeof(struct ceph_mds_request_head_legacy);
-	else if (old_version)
+	else if (request_head_version == 1)
 		len = sizeof(struct ceph_mds_request_head_old);
+	else if (request_head_version == 2)
+		len = offsetofend(struct ceph_mds_request_head, ext_num_fwd);
 	else
 		len = sizeof(struct ceph_mds_request_head);
 
@@ -3008,6 +3021,18 @@ static struct ceph_msg *create_request_message(struct ceph_mds_session *session,
 	lhead = find_legacy_request_head(msg->front.iov_base,
 					 session->s_con.peer_features);
 
+	if ((req->r_mnt_idmap != &nop_mnt_idmap) &&
+	    !test_bit(CEPHFS_FEATURE_HAS_OWNER_UIDGID, &session->s_features)) {
+		WARN_ON_ONCE(!IS_CEPH_MDS_OP_NEWINODE(req->r_op));
+
+		pr_err_ratelimited_client(cl,
+			"idmapped mount is used and CEPHFS_FEATURE_HAS_OWNER_UIDGID"
+			" is not supported by MDS. Fail request with -EIO.\n");
+
+		ret = -EIO;
+		goto out_err;
+	}
+
 	/*
 	 * The ceph_mds_request_head_legacy didn't contain a version field, and
 	 * one was added when we moved the message version from 3->4.
@@ -3015,17 +3040,40 @@ static struct ceph_msg *create_request_message(struct ceph_mds_session *session,
 	if (legacy) {
 		msg->hdr.version = cpu_to_le16(3);
 		p = msg->front.iov_base + sizeof(*lhead);
-	} else if (old_version) {
+	} else if (request_head_version == 1) {
 		struct ceph_mds_request_head_old *ohead = msg->front.iov_base;
 
 		msg->hdr.version = cpu_to_le16(4);
 		ohead->version = cpu_to_le16(1);
 		p = msg->front.iov_base + sizeof(*ohead);
+	} else if (request_head_version == 2) {
+		struct ceph_mds_request_head *nhead = msg->front.iov_base;
+
+		msg->hdr.version = cpu_to_le16(6);
+		nhead->version = cpu_to_le16(2);
+
+		p = msg->front.iov_base + offsetofend(struct ceph_mds_request_head, ext_num_fwd);
 	} else {
 		struct ceph_mds_request_head *nhead = msg->front.iov_base;
+		kuid_t owner_fsuid;
+		kgid_t owner_fsgid;
 
 		msg->hdr.version = cpu_to_le16(6);
 		nhead->version = cpu_to_le16(CEPH_MDS_REQUEST_HEAD_VERSION);
+		nhead->struct_len = cpu_to_le32(sizeof(struct ceph_mds_request_head));
+
+		if (IS_CEPH_MDS_OP_NEWINODE(req->r_op)) {
+			owner_fsuid = from_vfsuid(req->r_mnt_idmap, &init_user_ns,
+						VFSUIDT_INIT(req->r_cred->fsuid));
+			owner_fsgid = from_vfsgid(req->r_mnt_idmap, &init_user_ns,
+						VFSGIDT_INIT(req->r_cred->fsgid));
+			nhead->owner_uid = cpu_to_le32(from_kuid(&init_user_ns, owner_fsuid));
+			nhead->owner_gid = cpu_to_le32(from_kgid(&init_user_ns, owner_fsgid));
+		} else {
+			nhead->owner_uid = cpu_to_le32(-1);
+			nhead->owner_gid = cpu_to_le32(-1);
+		}
+
 		p = msg->front.iov_base + sizeof(*nhead);
 	}
 
diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h
index eb5e76446046..7c942294e96b 100644
--- a/fs/ceph/mds_client.h
+++ b/fs/ceph/mds_client.h
@@ -33,8 +33,10 @@ enum ceph_feature_type {
 	CEPHFS_FEATURE_NOTIFY_SESSION_STATE,
 	CEPHFS_FEATURE_OP_GETVXATTR,
 	CEPHFS_FEATURE_32BITS_RETRY_FWD,
+	CEPHFS_FEATURE_NEW_SNAPREALM_INFO,
+	CEPHFS_FEATURE_HAS_OWNER_UIDGID,
 
-	CEPHFS_FEATURE_MAX = CEPHFS_FEATURE_32BITS_RETRY_FWD,
+	CEPHFS_FEATURE_MAX = CEPHFS_FEATURE_HAS_OWNER_UIDGID,
 };
 
 #define CEPHFS_FEATURES_CLIENT_SUPPORTED {	\
@@ -49,6 +51,7 @@ enum ceph_feature_type {
 	CEPHFS_FEATURE_NOTIFY_SESSION_STATE,	\
 	CEPHFS_FEATURE_OP_GETVXATTR,		\
 	CEPHFS_FEATURE_32BITS_RETRY_FWD,	\
+	CEPHFS_FEATURE_HAS_OWNER_UIDGID,	\
 }
 
 /*
diff --git a/include/linux/ceph/ceph_fs.h b/include/linux/ceph/ceph_fs.h
index f3b3593254b9..ee1d0e5f9789 100644
--- a/include/linux/ceph/ceph_fs.h
+++ b/include/linux/ceph/ceph_fs.h
@@ -357,6 +357,11 @@ enum {
 	CEPH_MDS_OP_RENAMESNAP = 0x01403,
 };
 
+#define IS_CEPH_MDS_OP_NEWINODE(op) (op == CEPH_MDS_OP_CREATE     || \
+				     op == CEPH_MDS_OP_MKNOD      || \
+				     op == CEPH_MDS_OP_MKDIR      || \
+				     op == CEPH_MDS_OP_SYMLINK)
+
 extern const char *ceph_mds_op_name(int op);
 
 #define CEPH_SETATTR_MODE              (1 << 0)
@@ -497,7 +502,7 @@ struct ceph_mds_request_head_legacy {
 	union ceph_mds_request_args args;
 } __attribute__ ((packed));
 
-#define CEPH_MDS_REQUEST_HEAD_VERSION  2
+#define CEPH_MDS_REQUEST_HEAD_VERSION  3
 
 struct ceph_mds_request_head_old {
 	__le16 version;                /* struct version */
@@ -528,6 +533,9 @@ struct ceph_mds_request_head {
 
 	__le32 ext_num_retry;          /* new count retry attempts */
 	__le32 ext_num_fwd;            /* new count fwd attempts */
+
+	__le32 struct_len;             /* to store size of struct ceph_mds_request_head */
+	__le32 owner_uid, owner_gid;   /* used for OPs which create inodes */
 } __attribute__ ((packed));
 
 /* cap/lease release record */
-- 
cgit v1.2.3


From 9fd00df05e81a2e1080ce6e9abc35533dca99d74 Mon Sep 17 00:00:00 2001
From: Zbigniew Lukwinski <zbigniew.lukwinski@linux.intel.com>
Date: Mon, 16 Oct 2023 00:23:34 +0200
Subject: i3c: master: handle IBIs in order they came

IBI shall be handled in order they appear on the bus. Otherwise could hit
case when order of handling them in device driver will be different. It may
lead to invalid assembling fragmented packets or events order broken.

Added separate workqueue with option WQ_MEM_RECLAIM for each device driver.
This ensures IBI handling order and improves IBI handling performance: IBI
handlers for device B are not blocked by IBI handlers for device A.

Original solution (single workqueue in main driver) was able to handle also
general IBI (not related to specific device) like HJ or MR. So leaving this
for such purposes.

Signed-off-by: Zbigniew Lukwinski <zbigniew.lukwinski@linux.intel.com>
Link: https://lore.kernel.org/r/20231015222334.1652401-2-zbigniew.lukwinski@linux.intel.com
Signed-off-by: Alexandre Belloni <alexandre.belloni@bootlin.com>
---
 drivers/i3c/master.c       | 14 +++++++++++++-
 include/linux/i3c/master.h |  4 +++-
 2 files changed, 16 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/i3c/master.c b/drivers/i3c/master.c
index e66a1f84cefb..0cdc94e4cb77 100644
--- a/drivers/i3c/master.c
+++ b/drivers/i3c/master.c
@@ -2374,7 +2374,7 @@ static void i3c_master_unregister_i3c_devs(struct i3c_master_controller *master)
 void i3c_master_queue_ibi(struct i3c_dev_desc *dev, struct i3c_ibi_slot *slot)
 {
 	atomic_inc(&dev->ibi->pending_ibis);
-	queue_work(dev->common.master->wq, &slot->work);
+	queue_work(dev->ibi->wq, &slot->work);
 }
 EXPORT_SYMBOL_GPL(i3c_master_queue_ibi);
 
@@ -2819,6 +2819,12 @@ int i3c_dev_request_ibi_locked(struct i3c_dev_desc *dev,
 	if (!ibi)
 		return -ENOMEM;
 
+	ibi->wq = alloc_ordered_workqueue(dev_name(i3cdev_to_dev(dev->dev)), WQ_MEM_RECLAIM);
+	if (!ibi->wq) {
+		kfree(ibi);
+		return -ENOMEM;
+	}
+
 	atomic_set(&ibi->pending_ibis, 0);
 	init_completion(&ibi->all_ibis_handled);
 	ibi->handler = req->handler;
@@ -2846,6 +2852,12 @@ void i3c_dev_free_ibi_locked(struct i3c_dev_desc *dev)
 		WARN_ON(i3c_dev_disable_ibi_locked(dev));
 
 	master->ops->free_ibi(dev);
+
+	if (dev->ibi->wq) {
+		destroy_workqueue(dev->ibi->wq);
+		dev->ibi->wq = NULL;
+	}
+
 	kfree(dev->ibi);
 	dev->ibi = NULL;
 }
diff --git a/include/linux/i3c/master.h b/include/linux/i3c/master.h
index 4fd6a777150f..349d7d0dda73 100644
--- a/include/linux/i3c/master.h
+++ b/include/linux/i3c/master.h
@@ -129,6 +129,7 @@ struct i3c_ibi_slot {
  *		     rejected by the master
  * @num_slots: number of IBI slots reserved for this device
  * @enabled: reflect the IBI status
+ * @wq: workqueue used to execute IBI handlers.
  * @handler: IBI handler specified at i3c_device_request_ibi() call time. This
  *	     handler will be called from the controller workqueue, and as such
  *	     is allowed to sleep (though it is recommended to process the IBI
@@ -151,6 +152,7 @@ struct i3c_device_ibi_info {
 	unsigned int max_payload_len;
 	unsigned int num_slots;
 	unsigned int enabled;
+	struct workqueue_struct *wq;
 	void (*handler)(struct i3c_device *dev,
 			const struct i3c_ibi_payload *payload);
 };
@@ -469,7 +471,7 @@ struct i3c_master_controller_ops {
  * @boardinfo.i2c: list of I2C boardinfo objects
  * @boardinfo: board-level information attached to devices connected on the bus
  * @bus: I3C bus exposed by this master
- * @wq: workqueue used to execute IBI handlers. Can also be used by master
+ * @wq: workqueue which can be used by master
  *	drivers if they need to postpone operations that need to take place
  *	in a thread context. Typical examples are Hot Join processing which
  *	requires taking the bus lock in maintenance, which in turn, can only
-- 
cgit v1.2.3


From 2654521d774f9461234bcf3d089185404abd110b Mon Sep 17 00:00:00 2001
From: Bartosz Golaszewski <bartosz.golaszewski@linaro.org>
Date: Tue, 5 Sep 2023 20:53:09 +0200
Subject: gpiolib: remove gpiochip_find()

With all users of gpiochip_find() converted to using gpio_device_find(),
we can now remove this function from the kernel.

Signed-off-by: Bartosz Golaszewski <bartosz.golaszewski@linaro.org>
Reviewed-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Reviewed-by: Linus Walleij <linus.walleij@linaro.org>
---
 drivers/gpio/gpiolib.c      | 22 ----------------------
 include/linux/gpio/driver.h |  3 ---
 2 files changed, 25 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/gpio/gpiolib.c b/drivers/gpio/gpiolib.c
index cbafcd95243e..1c3c284fa44e 100644
--- a/drivers/gpio/gpiolib.c
+++ b/drivers/gpio/gpiolib.c
@@ -1092,28 +1092,6 @@ void gpiochip_remove(struct gpio_chip *gc)
 }
 EXPORT_SYMBOL_GPL(gpiochip_remove);
 
-/*
- * FIXME: This will be removed soon.
- *
- * This function is depracated, don't use.
- */
-struct gpio_chip *gpiochip_find(void *data,
-				int (*match)(struct gpio_chip *gc,
-					     void *data))
-{
-	struct gpio_device *gdev;
-	struct gpio_chip *gc = NULL;
-
-	gdev = gpio_device_find(data, match);
-	if (gdev) {
-		gc = gdev->chip;
-		gpio_device_put(gdev);
-	}
-
-	return gc;
-}
-EXPORT_SYMBOL_GPL(gpiochip_find);
-
 /**
  * gpio_device_find() - find a specific GPIO device
  * @data: data to pass to match function
diff --git a/include/linux/gpio/driver.h b/include/linux/gpio/driver.h
index 1d454dc944b3..0aed62f0c633 100644
--- a/include/linux/gpio/driver.h
+++ b/include/linux/gpio/driver.h
@@ -605,9 +605,6 @@ int devm_gpiochip_add_data_with_key(struct device *dev, struct gpio_chip *gc,
 				    void *data, struct lock_class_key *lock_key,
 				    struct lock_class_key *request_key);
 
-struct gpio_chip *gpiochip_find(void *data,
-				int (*match)(struct gpio_chip *gc, void *data));
-
 struct gpio_device *gpio_device_find(void *data,
 				int (*match)(struct gpio_chip *gc, void *data));
 struct gpio_device *gpio_device_find_by_label(const char *label);
-- 
cgit v1.2.3


From 9596ebf87c4f3f439d658af4d7cc2a302bea3750 Mon Sep 17 00:00:00 2001
From: Bartosz Golaszewski <bartosz.golaszewski@linaro.org>
Date: Mon, 2 Oct 2023 20:53:47 +0200
Subject: pinctrl: remove unneeded extern specifiers from consumer.h

The 'extern' specifiers are not needed for function declarations. Remove
all of them from the pinctrl/consumer.h header.

Signed-off-by: Bartosz Golaszewski <bartosz.golaszewski@linaro.org>
Acked-by: Linus Walleij <linus.walleij@linaro.org>
Reviewed-by: Linus Walleij <linus.walleij@linaro.org>
Reviewed-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
---
 include/linux/pinctrl/consumer.h | 38 +++++++++++++++++++-------------------
 1 file changed, 19 insertions(+), 19 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/pinctrl/consumer.h b/include/linux/pinctrl/consumer.h
index 4729d54e8995..22eef0a513ce 100644
--- a/include/linux/pinctrl/consumer.h
+++ b/include/linux/pinctrl/consumer.h
@@ -25,27 +25,27 @@ struct pinctrl_state;
 #ifdef CONFIG_PINCTRL
 
 /* External interface to pin control */
-extern bool pinctrl_gpio_can_use_line(unsigned gpio);
-extern int pinctrl_gpio_request(unsigned gpio);
-extern void pinctrl_gpio_free(unsigned gpio);
-extern int pinctrl_gpio_direction_input(unsigned gpio);
-extern int pinctrl_gpio_direction_output(unsigned gpio);
-extern int pinctrl_gpio_set_config(unsigned gpio, unsigned long config);
-
-extern struct pinctrl * __must_check pinctrl_get(struct device *dev);
-extern void pinctrl_put(struct pinctrl *p);
-extern struct pinctrl_state * __must_check pinctrl_lookup_state(struct pinctrl *p,
-								const char *name);
-extern int pinctrl_select_state(struct pinctrl *p, struct pinctrl_state *s);
-
-extern struct pinctrl * __must_check devm_pinctrl_get(struct device *dev);
-extern void devm_pinctrl_put(struct pinctrl *p);
-extern int pinctrl_select_default_state(struct device *dev);
+bool pinctrl_gpio_can_use_line(unsigned gpio);
+int pinctrl_gpio_request(unsigned gpio);
+void pinctrl_gpio_free(unsigned gpio);
+int pinctrl_gpio_direction_input(unsigned gpio);
+int pinctrl_gpio_direction_output(unsigned gpio);
+int pinctrl_gpio_set_config(unsigned gpio, unsigned long config);
+
+struct pinctrl * __must_check pinctrl_get(struct device *dev);
+void pinctrl_put(struct pinctrl *p);
+struct pinctrl_state * __must_check pinctrl_lookup_state(struct pinctrl *p,
+							 const char *name);
+int pinctrl_select_state(struct pinctrl *p, struct pinctrl_state *s);
+
+struct pinctrl * __must_check devm_pinctrl_get(struct device *dev);
+void devm_pinctrl_put(struct pinctrl *p);
+int pinctrl_select_default_state(struct device *dev);
 
 #ifdef CONFIG_PM
-extern int pinctrl_pm_select_default_state(struct device *dev);
-extern int pinctrl_pm_select_sleep_state(struct device *dev);
-extern int pinctrl_pm_select_idle_state(struct device *dev);
+int pinctrl_pm_select_default_state(struct device *dev);
+int pinctrl_pm_select_sleep_state(struct device *dev);
+int pinctrl_pm_select_idle_state(struct device *dev);
 #else
 static inline int pinctrl_pm_select_default_state(struct device *dev)
 {
-- 
cgit v1.2.3


From ec963d04ca8651cc3738bc8e013b3ab7fdb95e3d Mon Sep 17 00:00:00 2001
From: Bartosz Golaszewski <bartosz.golaszewski@linaro.org>
Date: Tue, 3 Oct 2023 09:12:13 +0200
Subject: pinctrl: provide new GPIO-to-pinctrl glue helpers

Currently the pinctrl GPIO helpers all take a number from the global
GPIO numberspace - of which we're trying to get rid of as argument.

These helpers are almost universally called from GPIOLIB driver
callbacks which take a pointer to the backing gpio_chip and the
controller-relative offset as arguments.

Let's provide improved variants of these functions that match the
GPIOLIB signatures as the first step in removing the older flavor.

Signed-off-by: Bartosz Golaszewski <bartosz.golaszewski@linaro.org>
Acked-by: Linus Walleij <linus.walleij@linaro.org>
---
 drivers/pinctrl/core.c           | 108 ++++++++++++++++++++++++++++-----------
 include/linux/pinctrl/consumer.h |  46 +++++++++++++++++
 2 files changed, 125 insertions(+), 29 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/pinctrl/core.c b/drivers/pinctrl/core.c
index 71fc9f95584e..901cae45d468 100644
--- a/drivers/pinctrl/core.c
+++ b/drivers/pinctrl/core.c
@@ -23,6 +23,8 @@
 #include <linux/seq_file.h>
 #include <linux/slab.h>
 
+#include <linux/gpio/driver.h>
+
 #include <linux/pinctrl/consumer.h>
 #include <linux/pinctrl/devinfo.h>
 #include <linux/pinctrl/machine.h>
@@ -781,14 +783,13 @@ bool pinctrl_gpio_can_use_line(unsigned gpio)
 }
 EXPORT_SYMBOL_GPL(pinctrl_gpio_can_use_line);
 
-/**
- * pinctrl_gpio_request() - request a single pin to be used as GPIO
- * @gpio: the GPIO pin number from the GPIO subsystem number space
- *
- * This function should *ONLY* be used from gpiolib-based GPIO drivers,
- * as part of their gpio_request() semantics, platforms and individual drivers
- * shall *NOT* request GPIO pins to be muxed in.
- */
+bool pinctrl_gpio_can_use_line_new(struct gpio_chip *gc, unsigned int offset)
+{
+	return pinctrl_gpio_can_use_line(gc->base + offset);
+}
+EXPORT_SYMBOL_GPL(pinctrl_gpio_can_use_line_new);
+
+/* This function is deprecated and will be removed. Don't use. */
 int pinctrl_gpio_request(unsigned gpio)
 {
 	struct pinctrl_dev *pctldev;
@@ -817,13 +818,21 @@ int pinctrl_gpio_request(unsigned gpio)
 EXPORT_SYMBOL_GPL(pinctrl_gpio_request);
 
 /**
- * pinctrl_gpio_free() - free control on a single pin, currently used as GPIO
- * @gpio: the GPIO pin number from the GPIO subsystem number space
+ * pinctrl_gpio_request_new() - request a single pin to be used as GPIO
+ * @gc: GPIO chip structure from the GPIO subsystem
+ * @offset: hardware offset of the GPIO relative to the controller
  *
  * This function should *ONLY* be used from gpiolib-based GPIO drivers,
- * as part of their gpio_free() semantics, platforms and individual drivers
- * shall *NOT* request GPIO pins to be muxed out.
+ * as part of their gpio_request() semantics, platforms and individual drivers
+ * shall *NOT* request GPIO pins to be muxed in.
  */
+int pinctrl_gpio_request_new(struct gpio_chip *gc, unsigned int offset)
+{
+	return pinctrl_gpio_request(gc->base + offset);
+}
+EXPORT_SYMBOL_GPL(pinctrl_gpio_request_new);
+
+/* This function is deprecated and will be removed. Don't use. */
 void pinctrl_gpio_free(unsigned gpio)
 {
 	struct pinctrl_dev *pctldev;
@@ -846,6 +855,21 @@ void pinctrl_gpio_free(unsigned gpio)
 }
 EXPORT_SYMBOL_GPL(pinctrl_gpio_free);
 
+/**
+ * pinctrl_gpio_free_new() - free control on a single pin, currently used as GPIO
+ * @gc: GPIO chip structure from the GPIO subsystem
+ * @offset: hardware offset of the GPIO relative to the controller
+ *
+ * This function should *ONLY* be used from gpiolib-based GPIO drivers,
+ * as part of their gpio_request() semantics, platforms and individual drivers
+ * shall *NOT* request GPIO pins to be muxed in.
+ */
+void pinctrl_gpio_free_new(struct gpio_chip *gc, unsigned int offset)
+{
+	return pinctrl_gpio_free(gc->base + offset);
+}
+EXPORT_SYMBOL_GPL(pinctrl_gpio_free_new);
+
 static int pinctrl_gpio_direction(unsigned gpio, bool input)
 {
 	struct pinctrl_dev *pctldev;
@@ -869,14 +893,7 @@ static int pinctrl_gpio_direction(unsigned gpio, bool input)
 	return ret;
 }
 
-/**
- * pinctrl_gpio_direction_input() - request a GPIO pin to go into input mode
- * @gpio: the GPIO pin number from the GPIO subsystem number space
- *
- * This function should *ONLY* be used from gpiolib-based GPIO drivers,
- * as part of their gpio_direction_input() semantics, platforms and individual
- * drivers shall *NOT* touch pin control GPIO calls.
- */
+/* This function is deprecated and will be removed. Don't use. */
 int pinctrl_gpio_direction_input(unsigned gpio)
 {
 	return pinctrl_gpio_direction(gpio, true);
@@ -884,13 +901,21 @@ int pinctrl_gpio_direction_input(unsigned gpio)
 EXPORT_SYMBOL_GPL(pinctrl_gpio_direction_input);
 
 /**
- * pinctrl_gpio_direction_output() - request a GPIO pin to go into output mode
- * @gpio: the GPIO pin number from the GPIO subsystem number space
+ * pinctrl_gpio_direction_input_new() - request a GPIO pin to go into input mode
+ * @gc: GPIO chip structure from the GPIO subsystem
+ * @offset: hardware offset of the GPIO relative to the controller
  *
  * This function should *ONLY* be used from gpiolib-based GPIO drivers,
- * as part of their gpio_direction_output() semantics, platforms and individual
+ * as part of their gpio_direction_input() semantics, platforms and individual
  * drivers shall *NOT* touch pin control GPIO calls.
  */
+int pinctrl_gpio_direction_input_new(struct gpio_chip *gc, unsigned int offset)
+{
+	return pinctrl_gpio_direction_input(gc->base + offset);
+}
+EXPORT_SYMBOL_GPL(pinctrl_gpio_direction_input_new);
+
+/* This function is deprecated and will be removed. Don't use. */
 int pinctrl_gpio_direction_output(unsigned gpio)
 {
 	return pinctrl_gpio_direction(gpio, false);
@@ -898,14 +923,22 @@ int pinctrl_gpio_direction_output(unsigned gpio)
 EXPORT_SYMBOL_GPL(pinctrl_gpio_direction_output);
 
 /**
- * pinctrl_gpio_set_config() - Apply config to given GPIO pin
- * @gpio: the GPIO pin number from the GPIO subsystem number space
- * @config: the configuration to apply to the GPIO
+ * pinctrl_gpio_direction_output_new() - request a GPIO pin to go into output
+ *                                       mode
+ * @gc: GPIO chip structure from the GPIO subsystem
+ * @offset: hardware offset of the GPIO relative to the controller
  *
- * This function should *ONLY* be used from gpiolib-based GPIO drivers, if
- * they need to call the underlying pin controller to change GPIO config
- * (for example set debounce time).
+ * This function should *ONLY* be used from gpiolib-based GPIO drivers,
+ * as part of their gpio_direction_output() semantics, platforms and individual
+ * drivers shall *NOT* touch pin control GPIO calls.
  */
+int pinctrl_gpio_direction_output_new(struct gpio_chip *gc, unsigned int offset)
+{
+	return pinctrl_gpio_direction_output(gc->base + offset);
+}
+EXPORT_SYMBOL_GPL(pinctrl_gpio_direction_output_new);
+
+/* This function is deprecated and will be removed. Don't use. */
 int pinctrl_gpio_set_config(unsigned gpio, unsigned long config)
 {
 	unsigned long configs[] = { config };
@@ -926,6 +959,23 @@ int pinctrl_gpio_set_config(unsigned gpio, unsigned long config)
 }
 EXPORT_SYMBOL_GPL(pinctrl_gpio_set_config);
 
+/**
+ * pinctrl_gpio_set_config_new() - Apply config to given GPIO pin
+ * @gc: GPIO chip structure from the GPIO subsystem
+ * @offset: hardware offset of the GPIO relative to the controller
+ * @config: the configuration to apply to the GPIO
+ *
+ * This function should *ONLY* be used from gpiolib-based GPIO drivers, if
+ * they need to call the underlying pin controller to change GPIO config
+ * (for example set debounce time).
+ */
+int pinctrl_gpio_set_config_new(struct gpio_chip *gc, unsigned int offset,
+				unsigned long config)
+{
+	return pinctrl_gpio_set_config(gc->base + offset, config);
+}
+EXPORT_SYMBOL_GPL(pinctrl_gpio_set_config_new);
+
 static struct pinctrl_state *find_state(struct pinctrl *p,
 					const char *name)
 {
diff --git a/include/linux/pinctrl/consumer.h b/include/linux/pinctrl/consumer.h
index 22eef0a513ce..c95c13983376 100644
--- a/include/linux/pinctrl/consumer.h
+++ b/include/linux/pinctrl/consumer.h
@@ -17,6 +17,7 @@
 #include <linux/pinctrl/pinctrl-state.h>
 
 struct device;
+struct gpio_chip;
 
 /* This struct is private to the core and should be regarded as a cookie */
 struct pinctrl;
@@ -26,11 +27,20 @@ struct pinctrl_state;
 
 /* External interface to pin control */
 bool pinctrl_gpio_can_use_line(unsigned gpio);
+bool pinctrl_gpio_can_use_line_new(struct gpio_chip *gc, unsigned int offset);
 int pinctrl_gpio_request(unsigned gpio);
+int pinctrl_gpio_request_new(struct gpio_chip *gc, unsigned int offset);
 void pinctrl_gpio_free(unsigned gpio);
+void pinctrl_gpio_free_new(struct gpio_chip *gc, unsigned int offset);
 int pinctrl_gpio_direction_input(unsigned gpio);
+int pinctrl_gpio_direction_input_new(struct gpio_chip *gc,
+				     unsigned int offset);
 int pinctrl_gpio_direction_output(unsigned gpio);
+int pinctrl_gpio_direction_output_new(struct gpio_chip *gc,
+				      unsigned int offset);
 int pinctrl_gpio_set_config(unsigned gpio, unsigned long config);
+int pinctrl_gpio_set_config_new(struct gpio_chip *gc, unsigned int offset,
+				unsigned long config);
 
 struct pinctrl * __must_check pinctrl_get(struct device *dev);
 void pinctrl_put(struct pinctrl *p);
@@ -68,30 +78,66 @@ static inline bool pinctrl_gpio_can_use_line(unsigned gpio)
 	return true;
 }
 
+static inline bool
+pinctrl_gpio_can_use_line_new(struct gpio_chip *gc, unsigned int offset)
+{
+	return true;
+}
+
 static inline int pinctrl_gpio_request(unsigned gpio)
 {
 	return 0;
 }
 
+static inline int
+pinctrl_gpio_request_new(struct gpio_chip *gc, unsigned int offset)
+{
+	return 0;
+}
+
 static inline void pinctrl_gpio_free(unsigned gpio)
 {
 }
 
+static inline void
+pinctrl_gpio_free_new(struct gpio_chip *gc, unsigned int offset)
+{
+}
+
 static inline int pinctrl_gpio_direction_input(unsigned gpio)
 {
 	return 0;
 }
 
+static inline int
+pinctrl_gpio_direction_input_new(struct gpio_chip *gc, unsigned int offset)
+{
+	return 0;
+}
+
 static inline int pinctrl_gpio_direction_output(unsigned gpio)
 {
 	return 0;
 }
 
+static inline int
+pinctrl_gpio_direction_output_new(struct gpio_chip *gc, unsigned int offset)
+{
+	return 0;
+}
+
 static inline int pinctrl_gpio_set_config(unsigned gpio, unsigned long config)
 {
 	return 0;
 }
 
+static inline int
+pinctrl_gpio_set_config_new(struct gpio_chip *gc, unsigned int offset,
+			    unsigned long config)
+{
+	return 0;
+}
+
 static inline struct pinctrl * __must_check pinctrl_get(struct device *dev)
 {
 	return NULL;
-- 
cgit v1.2.3


From a063e57adf7baa91eee52eb4131557c3a43f4c3a Mon Sep 17 00:00:00 2001
From: Bartosz Golaszewski <bartosz.golaszewski@linaro.org>
Date: Tue, 10 Oct 2023 14:59:36 +0200
Subject: pinctrl: remove pinctrl_gpio_can_use_line()

There are no more users of pinctrl_gpio_can_use_line() so remove it.

Signed-off-by: Bartosz Golaszewski <bartosz.golaszewski@linaro.org>
Acked-by: Linus Walleij <linus.walleij@linaro.org>
---
 drivers/pinctrl/core.c           | 12 +++---------
 include/linux/pinctrl/consumer.h |  6 ------
 2 files changed, 3 insertions(+), 15 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/pinctrl/core.c b/drivers/pinctrl/core.c
index 901cae45d468..a4ea3eb1ccbe 100644
--- a/drivers/pinctrl/core.c
+++ b/drivers/pinctrl/core.c
@@ -755,7 +755,7 @@ int pinctrl_get_group_selector(struct pinctrl_dev *pctldev,
 	return -EINVAL;
 }
 
-bool pinctrl_gpio_can_use_line(unsigned gpio)
+bool pinctrl_gpio_can_use_line_new(struct gpio_chip *gc, unsigned int offset)
 {
 	struct pinctrl_dev *pctldev;
 	struct pinctrl_gpio_range *range;
@@ -767,13 +767,13 @@ bool pinctrl_gpio_can_use_line(unsigned gpio)
 	 * we're probably dealing with GPIO driver
 	 * without a backing pin controller - bail out.
 	 */
-	if (pinctrl_get_device_gpio_range(gpio, &pctldev, &range))
+	if (pinctrl_get_device_gpio_range(gc->base + offset, &pctldev, &range))
 		return true;
 
 	mutex_lock(&pctldev->mutex);
 
 	/* Convert to the pin controllers number space */
-	pin = gpio_to_pin(range, gpio);
+	pin = gpio_to_pin(range, gc->base + offset);
 
 	result = pinmux_can_be_used_for_gpio(pctldev, pin);
 
@@ -781,12 +781,6 @@ bool pinctrl_gpio_can_use_line(unsigned gpio)
 
 	return result;
 }
-EXPORT_SYMBOL_GPL(pinctrl_gpio_can_use_line);
-
-bool pinctrl_gpio_can_use_line_new(struct gpio_chip *gc, unsigned int offset)
-{
-	return pinctrl_gpio_can_use_line(gc->base + offset);
-}
 EXPORT_SYMBOL_GPL(pinctrl_gpio_can_use_line_new);
 
 /* This function is deprecated and will be removed. Don't use. */
diff --git a/include/linux/pinctrl/consumer.h b/include/linux/pinctrl/consumer.h
index c95c13983376..3991271cc39f 100644
--- a/include/linux/pinctrl/consumer.h
+++ b/include/linux/pinctrl/consumer.h
@@ -26,7 +26,6 @@ struct pinctrl_state;
 #ifdef CONFIG_PINCTRL
 
 /* External interface to pin control */
-bool pinctrl_gpio_can_use_line(unsigned gpio);
 bool pinctrl_gpio_can_use_line_new(struct gpio_chip *gc, unsigned int offset);
 int pinctrl_gpio_request(unsigned gpio);
 int pinctrl_gpio_request_new(struct gpio_chip *gc, unsigned int offset);
@@ -73,11 +72,6 @@ static inline int pinctrl_pm_select_idle_state(struct device *dev)
 
 #else /* !CONFIG_PINCTRL */
 
-static inline bool pinctrl_gpio_can_use_line(unsigned gpio)
-{
-	return true;
-}
-
 static inline bool
 pinctrl_gpio_can_use_line_new(struct gpio_chip *gc, unsigned int offset)
 {
-- 
cgit v1.2.3


From 699f0784631eee4f482291fdcc23c87a055ab5f7 Mon Sep 17 00:00:00 2001
From: Bartosz Golaszewski <bartosz.golaszewski@linaro.org>
Date: Tue, 10 Oct 2023 15:04:24 +0200
Subject: pinctrl: remove pinctrl_gpio_request()

There are no more users of pinctrl_gpio_request() so remove it.

Signed-off-by: Bartosz Golaszewski <bartosz.golaszewski@linaro.org>
Acked-by: Linus Walleij <linus.walleij@linaro.org>
---
 drivers/pinctrl/core.c           | 41 +++++++++++++++++-----------------------
 include/linux/pinctrl/consumer.h |  6 ------
 2 files changed, 17 insertions(+), 30 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/pinctrl/core.c b/drivers/pinctrl/core.c
index a4ea3eb1ccbe..bee26db31ee9 100644
--- a/drivers/pinctrl/core.c
+++ b/drivers/pinctrl/core.c
@@ -783,17 +783,25 @@ bool pinctrl_gpio_can_use_line_new(struct gpio_chip *gc, unsigned int offset)
 }
 EXPORT_SYMBOL_GPL(pinctrl_gpio_can_use_line_new);
 
-/* This function is deprecated and will be removed. Don't use. */
-int pinctrl_gpio_request(unsigned gpio)
+/**
+ * pinctrl_gpio_request_new() - request a single pin to be used as GPIO
+ * @gc: GPIO chip structure from the GPIO subsystem
+ * @offset: hardware offset of the GPIO relative to the controller
+ *
+ * This function should *ONLY* be used from gpiolib-based GPIO drivers,
+ * as part of their gpio_request() semantics, platforms and individual drivers
+ * shall *NOT* request GPIO pins to be muxed in.
+ */
+int pinctrl_gpio_request_new(struct gpio_chip *gc, unsigned int offset)
 {
-	struct pinctrl_dev *pctldev;
 	struct pinctrl_gpio_range *range;
-	int ret;
-	int pin;
+	struct pinctrl_dev *pctldev;
+	int ret, pin;
 
-	ret = pinctrl_get_device_gpio_range(gpio, &pctldev, &range);
+	ret = pinctrl_get_device_gpio_range(gc->base + offset, &pctldev,
+					    &range);
 	if (ret) {
-		if (pinctrl_ready_for_gpio_range(gpio))
+		if (pinctrl_ready_for_gpio_range(gc->base + offset))
 			ret = 0;
 		return ret;
 	}
@@ -801,29 +809,14 @@ int pinctrl_gpio_request(unsigned gpio)
 	mutex_lock(&pctldev->mutex);
 
 	/* Convert to the pin controllers number space */
-	pin = gpio_to_pin(range, gpio);
+	pin = gpio_to_pin(range, gc->base + offset);
 
-	ret = pinmux_request_gpio(pctldev, range, pin, gpio);
+	ret = pinmux_request_gpio(pctldev, range, pin, gc->base + offset);
 
 	mutex_unlock(&pctldev->mutex);
 
 	return ret;
 }
-EXPORT_SYMBOL_GPL(pinctrl_gpio_request);
-
-/**
- * pinctrl_gpio_request_new() - request a single pin to be used as GPIO
- * @gc: GPIO chip structure from the GPIO subsystem
- * @offset: hardware offset of the GPIO relative to the controller
- *
- * This function should *ONLY* be used from gpiolib-based GPIO drivers,
- * as part of their gpio_request() semantics, platforms and individual drivers
- * shall *NOT* request GPIO pins to be muxed in.
- */
-int pinctrl_gpio_request_new(struct gpio_chip *gc, unsigned int offset)
-{
-	return pinctrl_gpio_request(gc->base + offset);
-}
 EXPORT_SYMBOL_GPL(pinctrl_gpio_request_new);
 
 /* This function is deprecated and will be removed. Don't use. */
diff --git a/include/linux/pinctrl/consumer.h b/include/linux/pinctrl/consumer.h
index 3991271cc39f..83a414a347cf 100644
--- a/include/linux/pinctrl/consumer.h
+++ b/include/linux/pinctrl/consumer.h
@@ -27,7 +27,6 @@ struct pinctrl_state;
 
 /* External interface to pin control */
 bool pinctrl_gpio_can_use_line_new(struct gpio_chip *gc, unsigned int offset);
-int pinctrl_gpio_request(unsigned gpio);
 int pinctrl_gpio_request_new(struct gpio_chip *gc, unsigned int offset);
 void pinctrl_gpio_free(unsigned gpio);
 void pinctrl_gpio_free_new(struct gpio_chip *gc, unsigned int offset);
@@ -78,11 +77,6 @@ pinctrl_gpio_can_use_line_new(struct gpio_chip *gc, unsigned int offset)
 	return true;
 }
 
-static inline int pinctrl_gpio_request(unsigned gpio)
-{
-	return 0;
-}
-
 static inline int
 pinctrl_gpio_request_new(struct gpio_chip *gc, unsigned int offset)
 {
-- 
cgit v1.2.3


From 1d2c88450f77d9d62883a4a2ecc5e840cc7c74ee Mon Sep 17 00:00:00 2001
From: Bartosz Golaszewski <bartosz.golaszewski@linaro.org>
Date: Tue, 10 Oct 2023 15:11:28 +0200
Subject: pinctrl: remove pinctrl_gpio_free()

There are no more users of pinctrl_gpio_free() so remove it.

Signed-off-by: Bartosz Golaszewski <bartosz.golaszewski@linaro.org>
Acked-by: Linus Walleij <linus.walleij@linaro.org>
---
 drivers/pinctrl/core.c           | 41 +++++++++++++++++-----------------------
 include/linux/pinctrl/consumer.h |  5 -----
 2 files changed, 17 insertions(+), 29 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/pinctrl/core.c b/drivers/pinctrl/core.c
index bee26db31ee9..c00d7db2d6f1 100644
--- a/drivers/pinctrl/core.c
+++ b/drivers/pinctrl/core.c
@@ -819,29 +819,6 @@ int pinctrl_gpio_request_new(struct gpio_chip *gc, unsigned int offset)
 }
 EXPORT_SYMBOL_GPL(pinctrl_gpio_request_new);
 
-/* This function is deprecated and will be removed. Don't use. */
-void pinctrl_gpio_free(unsigned gpio)
-{
-	struct pinctrl_dev *pctldev;
-	struct pinctrl_gpio_range *range;
-	int ret;
-	int pin;
-
-	ret = pinctrl_get_device_gpio_range(gpio, &pctldev, &range);
-	if (ret) {
-		return;
-	}
-	mutex_lock(&pctldev->mutex);
-
-	/* Convert to the pin controllers number space */
-	pin = gpio_to_pin(range, gpio);
-
-	pinmux_free_gpio(pctldev, pin, range);
-
-	mutex_unlock(&pctldev->mutex);
-}
-EXPORT_SYMBOL_GPL(pinctrl_gpio_free);
-
 /**
  * pinctrl_gpio_free_new() - free control on a single pin, currently used as GPIO
  * @gc: GPIO chip structure from the GPIO subsystem
@@ -853,7 +830,23 @@ EXPORT_SYMBOL_GPL(pinctrl_gpio_free);
  */
 void pinctrl_gpio_free_new(struct gpio_chip *gc, unsigned int offset)
 {
-	return pinctrl_gpio_free(gc->base + offset);
+	struct pinctrl_gpio_range *range;
+	struct pinctrl_dev *pctldev;
+	int ret, pin;
+
+	ret = pinctrl_get_device_gpio_range(gc->base + offset, &pctldev,
+					    &range);
+	if (ret)
+		return;
+
+	mutex_lock(&pctldev->mutex);
+
+	/* Convert to the pin controllers number space */
+	pin = gpio_to_pin(range, gc->base + offset);
+
+	pinmux_free_gpio(pctldev, pin, range);
+
+	mutex_unlock(&pctldev->mutex);
 }
 EXPORT_SYMBOL_GPL(pinctrl_gpio_free_new);
 
diff --git a/include/linux/pinctrl/consumer.h b/include/linux/pinctrl/consumer.h
index 83a414a347cf..90b2f0b78c4b 100644
--- a/include/linux/pinctrl/consumer.h
+++ b/include/linux/pinctrl/consumer.h
@@ -28,7 +28,6 @@ struct pinctrl_state;
 /* External interface to pin control */
 bool pinctrl_gpio_can_use_line_new(struct gpio_chip *gc, unsigned int offset);
 int pinctrl_gpio_request_new(struct gpio_chip *gc, unsigned int offset);
-void pinctrl_gpio_free(unsigned gpio);
 void pinctrl_gpio_free_new(struct gpio_chip *gc, unsigned int offset);
 int pinctrl_gpio_direction_input(unsigned gpio);
 int pinctrl_gpio_direction_input_new(struct gpio_chip *gc,
@@ -83,10 +82,6 @@ pinctrl_gpio_request_new(struct gpio_chip *gc, unsigned int offset)
 	return 0;
 }
 
-static inline void pinctrl_gpio_free(unsigned gpio)
-{
-}
-
 static inline void
 pinctrl_gpio_free_new(struct gpio_chip *gc, unsigned int offset)
 {
-- 
cgit v1.2.3


From aec96797f9efcaf444400a9d92900de3acc13e51 Mon Sep 17 00:00:00 2001
From: Bartosz Golaszewski <bartosz.golaszewski@linaro.org>
Date: Tue, 10 Oct 2023 15:13:38 +0200
Subject: pinctrl: remove pinctrl_gpio_direction_input()

There are no more users of pinctrl_gpio_direction_input() so remove it.

Signed-off-by: Bartosz Golaszewski <bartosz.golaszewski@linaro.org>
Acked-by: Linus Walleij <linus.walleij@linaro.org>
---
 drivers/pinctrl/core.c           | 9 +--------
 include/linux/pinctrl/consumer.h | 6 ------
 2 files changed, 1 insertion(+), 14 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/pinctrl/core.c b/drivers/pinctrl/core.c
index c00d7db2d6f1..84de9cce7f86 100644
--- a/drivers/pinctrl/core.c
+++ b/drivers/pinctrl/core.c
@@ -873,13 +873,6 @@ static int pinctrl_gpio_direction(unsigned gpio, bool input)
 	return ret;
 }
 
-/* This function is deprecated and will be removed. Don't use. */
-int pinctrl_gpio_direction_input(unsigned gpio)
-{
-	return pinctrl_gpio_direction(gpio, true);
-}
-EXPORT_SYMBOL_GPL(pinctrl_gpio_direction_input);
-
 /**
  * pinctrl_gpio_direction_input_new() - request a GPIO pin to go into input mode
  * @gc: GPIO chip structure from the GPIO subsystem
@@ -891,7 +884,7 @@ EXPORT_SYMBOL_GPL(pinctrl_gpio_direction_input);
  */
 int pinctrl_gpio_direction_input_new(struct gpio_chip *gc, unsigned int offset)
 {
-	return pinctrl_gpio_direction_input(gc->base + offset);
+	return pinctrl_gpio_direction(gc->base + offset, true);
 }
 EXPORT_SYMBOL_GPL(pinctrl_gpio_direction_input_new);
 
diff --git a/include/linux/pinctrl/consumer.h b/include/linux/pinctrl/consumer.h
index 90b2f0b78c4b..249bb28a52d6 100644
--- a/include/linux/pinctrl/consumer.h
+++ b/include/linux/pinctrl/consumer.h
@@ -29,7 +29,6 @@ struct pinctrl_state;
 bool pinctrl_gpio_can_use_line_new(struct gpio_chip *gc, unsigned int offset);
 int pinctrl_gpio_request_new(struct gpio_chip *gc, unsigned int offset);
 void pinctrl_gpio_free_new(struct gpio_chip *gc, unsigned int offset);
-int pinctrl_gpio_direction_input(unsigned gpio);
 int pinctrl_gpio_direction_input_new(struct gpio_chip *gc,
 				     unsigned int offset);
 int pinctrl_gpio_direction_output(unsigned gpio);
@@ -87,11 +86,6 @@ pinctrl_gpio_free_new(struct gpio_chip *gc, unsigned int offset)
 {
 }
 
-static inline int pinctrl_gpio_direction_input(unsigned gpio)
-{
-	return 0;
-}
-
 static inline int
 pinctrl_gpio_direction_input_new(struct gpio_chip *gc, unsigned int offset)
 {
-- 
cgit v1.2.3


From 45d2055b0067739253883dc541f37c86aad45c92 Mon Sep 17 00:00:00 2001
From: Bartosz Golaszewski <bartosz.golaszewski@linaro.org>
Date: Tue, 10 Oct 2023 15:14:33 +0200
Subject: pinctrl: remove pinctrl_gpio_direction_output()

There are no more users of pinctrl_gpio_direction_output() so remove it.

Signed-off-by: Bartosz Golaszewski <bartosz.golaszewski@linaro.org>
Acked-by: Linus Walleij <linus.walleij@linaro.org>
---
 drivers/pinctrl/core.c           | 9 +--------
 include/linux/pinctrl/consumer.h | 6 ------
 2 files changed, 1 insertion(+), 14 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/pinctrl/core.c b/drivers/pinctrl/core.c
index 84de9cce7f86..b000ecc1bab3 100644
--- a/drivers/pinctrl/core.c
+++ b/drivers/pinctrl/core.c
@@ -888,13 +888,6 @@ int pinctrl_gpio_direction_input_new(struct gpio_chip *gc, unsigned int offset)
 }
 EXPORT_SYMBOL_GPL(pinctrl_gpio_direction_input_new);
 
-/* This function is deprecated and will be removed. Don't use. */
-int pinctrl_gpio_direction_output(unsigned gpio)
-{
-	return pinctrl_gpio_direction(gpio, false);
-}
-EXPORT_SYMBOL_GPL(pinctrl_gpio_direction_output);
-
 /**
  * pinctrl_gpio_direction_output_new() - request a GPIO pin to go into output
  *                                       mode
@@ -907,7 +900,7 @@ EXPORT_SYMBOL_GPL(pinctrl_gpio_direction_output);
  */
 int pinctrl_gpio_direction_output_new(struct gpio_chip *gc, unsigned int offset)
 {
-	return pinctrl_gpio_direction_output(gc->base + offset);
+	return pinctrl_gpio_direction(gc->base + offset, false);
 }
 EXPORT_SYMBOL_GPL(pinctrl_gpio_direction_output_new);
 
diff --git a/include/linux/pinctrl/consumer.h b/include/linux/pinctrl/consumer.h
index 249bb28a52d6..e25e9b0599ac 100644
--- a/include/linux/pinctrl/consumer.h
+++ b/include/linux/pinctrl/consumer.h
@@ -31,7 +31,6 @@ int pinctrl_gpio_request_new(struct gpio_chip *gc, unsigned int offset);
 void pinctrl_gpio_free_new(struct gpio_chip *gc, unsigned int offset);
 int pinctrl_gpio_direction_input_new(struct gpio_chip *gc,
 				     unsigned int offset);
-int pinctrl_gpio_direction_output(unsigned gpio);
 int pinctrl_gpio_direction_output_new(struct gpio_chip *gc,
 				      unsigned int offset);
 int pinctrl_gpio_set_config(unsigned gpio, unsigned long config);
@@ -92,11 +91,6 @@ pinctrl_gpio_direction_input_new(struct gpio_chip *gc, unsigned int offset)
 	return 0;
 }
 
-static inline int pinctrl_gpio_direction_output(unsigned gpio)
-{
-	return 0;
-}
-
 static inline int
 pinctrl_gpio_direction_output_new(struct gpio_chip *gc, unsigned int offset)
 {
-- 
cgit v1.2.3


From ab56e2bfceecae12e3b656b1641ecb961de6f92c Mon Sep 17 00:00:00 2001
From: Bartosz Golaszewski <bartosz.golaszewski@linaro.org>
Date: Tue, 10 Oct 2023 15:16:54 +0200
Subject: pinctrl: remove pinctrl_gpio_set_config()

There are no more users of pinctrl_gpio_set_config() so remove it.

Signed-off-by: Bartosz Golaszewski <bartosz.golaszewski@linaro.org>
Acked-by: Linus Walleij <linus.walleij@linaro.org>
---
 drivers/pinctrl/core.c           | 36 +++++++++++++++---------------------
 include/linux/pinctrl/consumer.h |  6 ------
 2 files changed, 15 insertions(+), 27 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/pinctrl/core.c b/drivers/pinctrl/core.c
index b000ecc1bab3..3c67528240da 100644
--- a/drivers/pinctrl/core.c
+++ b/drivers/pinctrl/core.c
@@ -904,42 +904,36 @@ int pinctrl_gpio_direction_output_new(struct gpio_chip *gc, unsigned int offset)
 }
 EXPORT_SYMBOL_GPL(pinctrl_gpio_direction_output_new);
 
-/* This function is deprecated and will be removed. Don't use. */
-int pinctrl_gpio_set_config(unsigned gpio, unsigned long config)
+/**
+ * pinctrl_gpio_set_config_new() - Apply config to given GPIO pin
+ * @gc: GPIO chip structure from the GPIO subsystem
+ * @offset: hardware offset of the GPIO relative to the controller
+ * @config: the configuration to apply to the GPIO
+ *
+ * This function should *ONLY* be used from gpiolib-based GPIO drivers, if
+ * they need to call the underlying pin controller to change GPIO config
+ * (for example set debounce time).
+ */
+int pinctrl_gpio_set_config_new(struct gpio_chip *gc, unsigned int offset,
+				unsigned long config)
 {
 	unsigned long configs[] = { config };
 	struct pinctrl_gpio_range *range;
 	struct pinctrl_dev *pctldev;
 	int ret, pin;
 
-	ret = pinctrl_get_device_gpio_range(gpio, &pctldev, &range);
+	ret = pinctrl_get_device_gpio_range(gc->base + offset, &pctldev,
+					    &range);
 	if (ret)
 		return ret;
 
 	mutex_lock(&pctldev->mutex);
-	pin = gpio_to_pin(range, gpio);
+	pin = gpio_to_pin(range, gc->base + offset);
 	ret = pinconf_set_config(pctldev, pin, configs, ARRAY_SIZE(configs));
 	mutex_unlock(&pctldev->mutex);
 
 	return ret;
 }
-EXPORT_SYMBOL_GPL(pinctrl_gpio_set_config);
-
-/**
- * pinctrl_gpio_set_config_new() - Apply config to given GPIO pin
- * @gc: GPIO chip structure from the GPIO subsystem
- * @offset: hardware offset of the GPIO relative to the controller
- * @config: the configuration to apply to the GPIO
- *
- * This function should *ONLY* be used from gpiolib-based GPIO drivers, if
- * they need to call the underlying pin controller to change GPIO config
- * (for example set debounce time).
- */
-int pinctrl_gpio_set_config_new(struct gpio_chip *gc, unsigned int offset,
-				unsigned long config)
-{
-	return pinctrl_gpio_set_config(gc->base + offset, config);
-}
 EXPORT_SYMBOL_GPL(pinctrl_gpio_set_config_new);
 
 static struct pinctrl_state *find_state(struct pinctrl *p,
diff --git a/include/linux/pinctrl/consumer.h b/include/linux/pinctrl/consumer.h
index e25e9b0599ac..aa3a7a7ca52d 100644
--- a/include/linux/pinctrl/consumer.h
+++ b/include/linux/pinctrl/consumer.h
@@ -33,7 +33,6 @@ int pinctrl_gpio_direction_input_new(struct gpio_chip *gc,
 				     unsigned int offset);
 int pinctrl_gpio_direction_output_new(struct gpio_chip *gc,
 				      unsigned int offset);
-int pinctrl_gpio_set_config(unsigned gpio, unsigned long config);
 int pinctrl_gpio_set_config_new(struct gpio_chip *gc, unsigned int offset,
 				unsigned long config);
 
@@ -97,11 +96,6 @@ pinctrl_gpio_direction_output_new(struct gpio_chip *gc, unsigned int offset)
 	return 0;
 }
 
-static inline int pinctrl_gpio_set_config(unsigned gpio, unsigned long config)
-{
-	return 0;
-}
-
 static inline int
 pinctrl_gpio_set_config_new(struct gpio_chip *gc, unsigned int offset,
 			    unsigned long config)
-- 
cgit v1.2.3


From 00762e416cd2001270a59fab417fbb1c30e2b7e5 Mon Sep 17 00:00:00 2001
From: Bartosz Golaszewski <bartosz.golaszewski@linaro.org>
Date: Tue, 3 Oct 2023 11:54:45 +0200
Subject: treewide: rename pinctrl_gpio_can_use_line_new()

Now that pinctrl_gpio_can_use_line() is no longer used, let's drop the
'_new' suffix from its improved variant.

Signed-off-by: Bartosz Golaszewski <bartosz.golaszewski@linaro.org>
Acked-by: Linus Walleij <linus.walleij@linaro.org>
---
 drivers/gpio/gpiolib-cdev.c      | 2 +-
 drivers/pinctrl/core.c           | 4 ++--
 include/linux/pinctrl/consumer.h | 4 ++--
 3 files changed, 5 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/gpio/gpiolib-cdev.c b/drivers/gpio/gpiolib-cdev.c
index 54ee075410db..02ffda6c1e51 100644
--- a/drivers/gpio/gpiolib-cdev.c
+++ b/drivers/gpio/gpiolib-cdev.c
@@ -2287,7 +2287,7 @@ static void gpio_desc_to_lineinfo(struct gpio_desc *desc,
 	 * FIXME: find a non-racy way to retrieve this information. Maybe a
 	 * lock common to both frameworks?
 	 */
-	ok_for_pinctrl = pinctrl_gpio_can_use_line_new(gc, info->offset);
+	ok_for_pinctrl = pinctrl_gpio_can_use_line(gc, info->offset);
 
 	spin_lock_irqsave(&gpio_lock, flags);
 
diff --git a/drivers/pinctrl/core.c b/drivers/pinctrl/core.c
index 3c67528240da..4d78358ef745 100644
--- a/drivers/pinctrl/core.c
+++ b/drivers/pinctrl/core.c
@@ -755,7 +755,7 @@ int pinctrl_get_group_selector(struct pinctrl_dev *pctldev,
 	return -EINVAL;
 }
 
-bool pinctrl_gpio_can_use_line_new(struct gpio_chip *gc, unsigned int offset)
+bool pinctrl_gpio_can_use_line(struct gpio_chip *gc, unsigned int offset)
 {
 	struct pinctrl_dev *pctldev;
 	struct pinctrl_gpio_range *range;
@@ -781,7 +781,7 @@ bool pinctrl_gpio_can_use_line_new(struct gpio_chip *gc, unsigned int offset)
 
 	return result;
 }
-EXPORT_SYMBOL_GPL(pinctrl_gpio_can_use_line_new);
+EXPORT_SYMBOL_GPL(pinctrl_gpio_can_use_line);
 
 /**
  * pinctrl_gpio_request_new() - request a single pin to be used as GPIO
diff --git a/include/linux/pinctrl/consumer.h b/include/linux/pinctrl/consumer.h
index aa3a7a7ca52d..4f9ead7f65d2 100644
--- a/include/linux/pinctrl/consumer.h
+++ b/include/linux/pinctrl/consumer.h
@@ -26,7 +26,7 @@ struct pinctrl_state;
 #ifdef CONFIG_PINCTRL
 
 /* External interface to pin control */
-bool pinctrl_gpio_can_use_line_new(struct gpio_chip *gc, unsigned int offset);
+bool pinctrl_gpio_can_use_line(struct gpio_chip *gc, unsigned int offset);
 int pinctrl_gpio_request_new(struct gpio_chip *gc, unsigned int offset);
 void pinctrl_gpio_free_new(struct gpio_chip *gc, unsigned int offset);
 int pinctrl_gpio_direction_input_new(struct gpio_chip *gc,
@@ -68,7 +68,7 @@ static inline int pinctrl_pm_select_idle_state(struct device *dev)
 #else /* !CONFIG_PINCTRL */
 
 static inline bool
-pinctrl_gpio_can_use_line_new(struct gpio_chip *gc, unsigned int offset)
+pinctrl_gpio_can_use_line(struct gpio_chip *gc, unsigned int offset)
 {
 	return true;
 }
-- 
cgit v1.2.3


From acb38be654f9af05fe626b37ea83e119cd310c3c Mon Sep 17 00:00:00 2001
From: Bartosz Golaszewski <bartosz.golaszewski@linaro.org>
Date: Tue, 3 Oct 2023 11:56:03 +0200
Subject: treewide: rename pinctrl_gpio_request_new()

Now that pinctrl_gpio_request() is no longer used, let's drop the '_new'
suffix from its improved variant.

Signed-off-by: Bartosz Golaszewski <bartosz.golaszewski@linaro.org>
Acked-by: Linus Walleij <linus.walleij@linaro.org>
---
 drivers/gpio/gpio-aspeed.c                         | 2 +-
 drivers/gpio/gpio-em.c                             | 2 +-
 drivers/gpio/gpio-rcar.c                           | 2 +-
 drivers/gpio/gpio-tegra.c                          | 2 +-
 drivers/gpio/gpiolib.c                             | 2 +-
 drivers/pinctrl/bcm/pinctrl-iproc-gpio.c           | 2 +-
 drivers/pinctrl/core.c                             | 6 +++---
 drivers/pinctrl/nuvoton/pinctrl-npcm7xx.c          | 2 +-
 drivers/pinctrl/nuvoton/pinctrl-npcm8xx.c          | 2 +-
 drivers/pinctrl/renesas/gpio.c                     | 2 +-
 drivers/pinctrl/renesas/pinctrl-rzg2l.c            | 2 +-
 drivers/pinctrl/renesas/pinctrl-rzv2m.c            | 2 +-
 drivers/pinctrl/spear/pinctrl-plgpio.c             | 2 +-
 drivers/pinctrl/starfive/pinctrl-starfive-jh7100.c | 2 +-
 drivers/pinctrl/starfive/pinctrl-starfive-jh7110.c | 2 +-
 drivers/pinctrl/stm32/pinctrl-stm32.c              | 2 +-
 include/linux/pinctrl/consumer.h                   | 4 ++--
 17 files changed, 20 insertions(+), 20 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/gpio/gpio-aspeed.c b/drivers/gpio/gpio-aspeed.c
index d3aa1cfd4ace..af851c89cacc 100644
--- a/drivers/gpio/gpio-aspeed.c
+++ b/drivers/gpio/gpio-aspeed.c
@@ -750,7 +750,7 @@ static int aspeed_gpio_request(struct gpio_chip *chip, unsigned int offset)
 	if (!have_gpio(gpiochip_get_data(chip), offset))
 		return -ENODEV;
 
-	return pinctrl_gpio_request_new(chip, offset);
+	return pinctrl_gpio_request(chip, offset);
 }
 
 static void aspeed_gpio_free(struct gpio_chip *chip, unsigned int offset)
diff --git a/drivers/gpio/gpio-em.c b/drivers/gpio/gpio-em.c
index 35c65ff43f71..08c5427deb71 100644
--- a/drivers/gpio/gpio-em.c
+++ b/drivers/gpio/gpio-em.c
@@ -229,7 +229,7 @@ static int em_gio_to_irq(struct gpio_chip *chip, unsigned offset)
 
 static int em_gio_request(struct gpio_chip *chip, unsigned offset)
 {
-	return pinctrl_gpio_request_new(chip, offset);
+	return pinctrl_gpio_request(chip, offset);
 }
 
 static void em_gio_free(struct gpio_chip *chip, unsigned offset)
diff --git a/drivers/gpio/gpio-rcar.c b/drivers/gpio/gpio-rcar.c
index 47121e1a6d4e..8c98730a7ad2 100644
--- a/drivers/gpio/gpio-rcar.c
+++ b/drivers/gpio/gpio-rcar.c
@@ -275,7 +275,7 @@ static int gpio_rcar_request(struct gpio_chip *chip, unsigned offset)
 		return error;
 	}
 
-	error = pinctrl_gpio_request_new(chip, offset);
+	error = pinctrl_gpio_request(chip, offset);
 	if (error)
 		pm_runtime_put(p->dev);
 
diff --git a/drivers/gpio/gpio-tegra.c b/drivers/gpio/gpio-tegra.c
index 08951ddcd294..c974ba368d18 100644
--- a/drivers/gpio/gpio-tegra.c
+++ b/drivers/gpio/gpio-tegra.c
@@ -139,7 +139,7 @@ static void tegra_gpio_disable(struct tegra_gpio_info *tgi, unsigned int gpio)
 
 static int tegra_gpio_request(struct gpio_chip *chip, unsigned int offset)
 {
-	return pinctrl_gpio_request_new(chip, offset);
+	return pinctrl_gpio_request(chip, offset);
 }
 
 static void tegra_gpio_free(struct gpio_chip *chip, unsigned int offset)
diff --git a/drivers/gpio/gpiolib.c b/drivers/gpio/gpiolib.c
index 35023129c806..fd4bda29c586 100644
--- a/drivers/gpio/gpiolib.c
+++ b/drivers/gpio/gpiolib.c
@@ -2014,7 +2014,7 @@ int gpiochip_generic_request(struct gpio_chip *gc, unsigned int offset)
 		return 0;
 #endif
 
-	return pinctrl_gpio_request_new(gc, offset);
+	return pinctrl_gpio_request(gc, offset);
 }
 EXPORT_SYMBOL_GPL(gpiochip_generic_request);
 
diff --git a/drivers/pinctrl/bcm/pinctrl-iproc-gpio.c b/drivers/pinctrl/bcm/pinctrl-iproc-gpio.c
index fb98c3e14743..8162dd840933 100644
--- a/drivers/pinctrl/bcm/pinctrl-iproc-gpio.c
+++ b/drivers/pinctrl/bcm/pinctrl-iproc-gpio.c
@@ -335,7 +335,7 @@ static int iproc_gpio_request(struct gpio_chip *gc, unsigned offset)
 	if (!chip->pinmux_is_supported)
 		return 0;
 
-	return pinctrl_gpio_request_new(gc, offset);
+	return pinctrl_gpio_request(gc, offset);
 }
 
 static void iproc_gpio_free(struct gpio_chip *gc, unsigned offset)
diff --git a/drivers/pinctrl/core.c b/drivers/pinctrl/core.c
index 4d78358ef745..934d9e4e02d0 100644
--- a/drivers/pinctrl/core.c
+++ b/drivers/pinctrl/core.c
@@ -784,7 +784,7 @@ bool pinctrl_gpio_can_use_line(struct gpio_chip *gc, unsigned int offset)
 EXPORT_SYMBOL_GPL(pinctrl_gpio_can_use_line);
 
 /**
- * pinctrl_gpio_request_new() - request a single pin to be used as GPIO
+ * pinctrl_gpio_request() - request a single pin to be used as GPIO
  * @gc: GPIO chip structure from the GPIO subsystem
  * @offset: hardware offset of the GPIO relative to the controller
  *
@@ -792,7 +792,7 @@ EXPORT_SYMBOL_GPL(pinctrl_gpio_can_use_line);
  * as part of their gpio_request() semantics, platforms and individual drivers
  * shall *NOT* request GPIO pins to be muxed in.
  */
-int pinctrl_gpio_request_new(struct gpio_chip *gc, unsigned int offset)
+int pinctrl_gpio_request(struct gpio_chip *gc, unsigned int offset)
 {
 	struct pinctrl_gpio_range *range;
 	struct pinctrl_dev *pctldev;
@@ -817,7 +817,7 @@ int pinctrl_gpio_request_new(struct gpio_chip *gc, unsigned int offset)
 
 	return ret;
 }
-EXPORT_SYMBOL_GPL(pinctrl_gpio_request_new);
+EXPORT_SYMBOL_GPL(pinctrl_gpio_request);
 
 /**
  * pinctrl_gpio_free_new() - free control on a single pin, currently used as GPIO
diff --git a/drivers/pinctrl/nuvoton/pinctrl-npcm7xx.c b/drivers/pinctrl/nuvoton/pinctrl-npcm7xx.c
index 62134a7f0b4e..ad39b8c060b7 100644
--- a/drivers/pinctrl/nuvoton/pinctrl-npcm7xx.c
+++ b/drivers/pinctrl/nuvoton/pinctrl-npcm7xx.c
@@ -201,7 +201,7 @@ static int npcmgpio_gpio_request(struct gpio_chip *chip, unsigned int offset)
 	int ret;
 
 	dev_dbg(chip->parent, "gpio_request: offset%d\n", offset);
-	ret = pinctrl_gpio_request_new(chip, offset);
+	ret = pinctrl_gpio_request(chip, offset);
 	if (ret)
 		return ret;
 
diff --git a/drivers/pinctrl/nuvoton/pinctrl-npcm8xx.c b/drivers/pinctrl/nuvoton/pinctrl-npcm8xx.c
index 0b6d6964087e..dd4532ae62bf 100644
--- a/drivers/pinctrl/nuvoton/pinctrl-npcm8xx.c
+++ b/drivers/pinctrl/nuvoton/pinctrl-npcm8xx.c
@@ -198,7 +198,7 @@ static int npcmgpio_gpio_request(struct gpio_chip *chip, unsigned int offset)
 	struct npcm8xx_gpio *bank = gpiochip_get_data(chip);
 	int ret;
 
-	ret = pinctrl_gpio_request_new(chip, offset);
+	ret = pinctrl_gpio_request(chip, offset);
 	if (ret)
 		return ret;
 
diff --git a/drivers/pinctrl/renesas/gpio.c b/drivers/pinctrl/renesas/gpio.c
index e7771a57e6d1..0c4f4d8d0891 100644
--- a/drivers/pinctrl/renesas/gpio.c
+++ b/drivers/pinctrl/renesas/gpio.c
@@ -135,7 +135,7 @@ static int gpio_pin_request(struct gpio_chip *gc, unsigned offset)
 	if (idx < 0 || pfc->info->pins[idx].enum_id == 0)
 		return -EINVAL;
 
-	return pinctrl_gpio_request_new(gc, offset);
+	return pinctrl_gpio_request(gc, offset);
 }
 
 static void gpio_pin_free(struct gpio_chip *gc, unsigned offset)
diff --git a/drivers/pinctrl/renesas/pinctrl-rzg2l.c b/drivers/pinctrl/renesas/pinctrl-rzg2l.c
index 22d4ab7d228c..c5d247d24374 100644
--- a/drivers/pinctrl/renesas/pinctrl-rzg2l.c
+++ b/drivers/pinctrl/renesas/pinctrl-rzg2l.c
@@ -1056,7 +1056,7 @@ static int rzg2l_gpio_request(struct gpio_chip *chip, unsigned int offset)
 	if (ret)
 		return ret;
 
-	ret = pinctrl_gpio_request_new(chip, offset);
+	ret = pinctrl_gpio_request(chip, offset);
 	if (ret)
 		return ret;
 
diff --git a/drivers/pinctrl/renesas/pinctrl-rzv2m.c b/drivers/pinctrl/renesas/pinctrl-rzv2m.c
index 3d29c77a5aad..1ec0922f735c 100644
--- a/drivers/pinctrl/renesas/pinctrl-rzv2m.c
+++ b/drivers/pinctrl/renesas/pinctrl-rzv2m.c
@@ -754,7 +754,7 @@ static int rzv2m_gpio_request(struct gpio_chip *chip, unsigned int offset)
 	u8 bit = RZV2M_PIN_ID_TO_PIN(offset);
 	int ret;
 
-	ret = pinctrl_gpio_request_new(chip, offset);
+	ret = pinctrl_gpio_request(chip, offset);
 	if (ret)
 		return ret;
 
diff --git a/drivers/pinctrl/spear/pinctrl-plgpio.c b/drivers/pinctrl/spear/pinctrl-plgpio.c
index 7488f6394318..6d3b7ce1c35a 100644
--- a/drivers/pinctrl/spear/pinctrl-plgpio.c
+++ b/drivers/pinctrl/spear/pinctrl-plgpio.c
@@ -210,7 +210,7 @@ static int plgpio_request(struct gpio_chip *chip, unsigned offset)
 	if (offset >= chip->ngpio)
 		return -EINVAL;
 
-	ret = pinctrl_gpio_request_new(chip, offset);
+	ret = pinctrl_gpio_request(chip, offset);
 	if (ret)
 		return ret;
 
diff --git a/drivers/pinctrl/starfive/pinctrl-starfive-jh7100.c b/drivers/pinctrl/starfive/pinctrl-starfive-jh7100.c
index 22a2db8fa315..88e39f68ef25 100644
--- a/drivers/pinctrl/starfive/pinctrl-starfive-jh7100.c
+++ b/drivers/pinctrl/starfive/pinctrl-starfive-jh7100.c
@@ -918,7 +918,7 @@ static struct pinctrl_desc starfive_desc = {
 
 static int starfive_gpio_request(struct gpio_chip *gc, unsigned int gpio)
 {
-	return pinctrl_gpio_request_new(gc, gpio);
+	return pinctrl_gpio_request(gc, gpio);
 }
 
 static void starfive_gpio_free(struct gpio_chip *gc, unsigned int gpio)
diff --git a/drivers/pinctrl/starfive/pinctrl-starfive-jh7110.c b/drivers/pinctrl/starfive/pinctrl-starfive-jh7110.c
index 5557ef3fbeb2..c458c6fb992f 100644
--- a/drivers/pinctrl/starfive/pinctrl-starfive-jh7110.c
+++ b/drivers/pinctrl/starfive/pinctrl-starfive-jh7110.c
@@ -547,7 +547,7 @@ static const struct pinconf_ops jh7110_pinconf_ops = {
 
 static int jh7110_gpio_request(struct gpio_chip *gc, unsigned int gpio)
 {
-	return pinctrl_gpio_request_new(gc, gpio);
+	return pinctrl_gpio_request(gc, gpio);
 }
 
 static void jh7110_gpio_free(struct gpio_chip *gc, unsigned int gpio)
diff --git a/drivers/pinctrl/stm32/pinctrl-stm32.c b/drivers/pinctrl/stm32/pinctrl-stm32.c
index 11d9f07934bd..dddbdc1a64c5 100644
--- a/drivers/pinctrl/stm32/pinctrl-stm32.c
+++ b/drivers/pinctrl/stm32/pinctrl-stm32.c
@@ -217,7 +217,7 @@ static int stm32_gpio_request(struct gpio_chip *chip, unsigned offset)
 		return -EINVAL;
 	}
 
-	return pinctrl_gpio_request_new(chip, offset);
+	return pinctrl_gpio_request(chip, offset);
 }
 
 static void stm32_gpio_free(struct gpio_chip *chip, unsigned offset)
diff --git a/include/linux/pinctrl/consumer.h b/include/linux/pinctrl/consumer.h
index 4f9ead7f65d2..05b2dae05f24 100644
--- a/include/linux/pinctrl/consumer.h
+++ b/include/linux/pinctrl/consumer.h
@@ -27,7 +27,7 @@ struct pinctrl_state;
 
 /* External interface to pin control */
 bool pinctrl_gpio_can_use_line(struct gpio_chip *gc, unsigned int offset);
-int pinctrl_gpio_request_new(struct gpio_chip *gc, unsigned int offset);
+int pinctrl_gpio_request(struct gpio_chip *gc, unsigned int offset);
 void pinctrl_gpio_free_new(struct gpio_chip *gc, unsigned int offset);
 int pinctrl_gpio_direction_input_new(struct gpio_chip *gc,
 				     unsigned int offset);
@@ -74,7 +74,7 @@ pinctrl_gpio_can_use_line(struct gpio_chip *gc, unsigned int offset)
 }
 
 static inline int
-pinctrl_gpio_request_new(struct gpio_chip *gc, unsigned int offset)
+pinctrl_gpio_request(struct gpio_chip *gc, unsigned int offset)
 {
 	return 0;
 }
-- 
cgit v1.2.3


From 4fccb263f3a0dc07b306213930e0c25d1c9002b0 Mon Sep 17 00:00:00 2001
From: Bartosz Golaszewski <bartosz.golaszewski@linaro.org>
Date: Tue, 3 Oct 2023 11:57:17 +0200
Subject: treewide: rename pinctrl_gpio_free_new()

Now that pinctrl_gpio_free()() is no longer used, let's drop the '_new'
suffix from its improved variant.

Signed-off-by: Bartosz Golaszewski <bartosz.golaszewski@linaro.org>
Acked-by: Linus Walleij <linus.walleij@linaro.org>
---
 drivers/gpio/gpio-aspeed.c                         | 2 +-
 drivers/gpio/gpio-em.c                             | 2 +-
 drivers/gpio/gpio-rcar.c                           | 2 +-
 drivers/gpio/gpio-tegra.c                          | 2 +-
 drivers/gpio/gpiolib.c                             | 2 +-
 drivers/pinctrl/bcm/pinctrl-iproc-gpio.c           | 2 +-
 drivers/pinctrl/core.c                             | 6 +++---
 drivers/pinctrl/nuvoton/pinctrl-npcm7xx.c          | 2 +-
 drivers/pinctrl/nuvoton/pinctrl-npcm8xx.c          | 2 +-
 drivers/pinctrl/renesas/gpio.c                     | 2 +-
 drivers/pinctrl/renesas/pinctrl-rzg2l.c            | 2 +-
 drivers/pinctrl/renesas/pinctrl-rzv2m.c            | 2 +-
 drivers/pinctrl/spear/pinctrl-plgpio.c             | 4 ++--
 drivers/pinctrl/starfive/pinctrl-starfive-jh7100.c | 2 +-
 drivers/pinctrl/starfive/pinctrl-starfive-jh7110.c | 2 +-
 drivers/pinctrl/stm32/pinctrl-stm32.c              | 2 +-
 include/linux/pinctrl/consumer.h                   | 4 ++--
 17 files changed, 21 insertions(+), 21 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/gpio/gpio-aspeed.c b/drivers/gpio/gpio-aspeed.c
index af851c89cacc..4c417c8486dd 100644
--- a/drivers/gpio/gpio-aspeed.c
+++ b/drivers/gpio/gpio-aspeed.c
@@ -755,7 +755,7 @@ static int aspeed_gpio_request(struct gpio_chip *chip, unsigned int offset)
 
 static void aspeed_gpio_free(struct gpio_chip *chip, unsigned int offset)
 {
-	pinctrl_gpio_free_new(chip, offset);
+	pinctrl_gpio_free(chip, offset);
 }
 
 static int usecs_to_cycles(struct aspeed_gpio *gpio, unsigned long usecs,
diff --git a/drivers/gpio/gpio-em.c b/drivers/gpio/gpio-em.c
index 08c5427deb71..aa31c189d0ef 100644
--- a/drivers/gpio/gpio-em.c
+++ b/drivers/gpio/gpio-em.c
@@ -234,7 +234,7 @@ static int em_gio_request(struct gpio_chip *chip, unsigned offset)
 
 static void em_gio_free(struct gpio_chip *chip, unsigned offset)
 {
-	pinctrl_gpio_free_new(chip, offset);
+	pinctrl_gpio_free(chip, offset);
 
 	/* Set the GPIO as an input to ensure that the next GPIO request won't
 	* drive the GPIO pin as an output.
diff --git a/drivers/gpio/gpio-rcar.c b/drivers/gpio/gpio-rcar.c
index 8c98730a7ad2..6159fda38d5d 100644
--- a/drivers/gpio/gpio-rcar.c
+++ b/drivers/gpio/gpio-rcar.c
@@ -286,7 +286,7 @@ static void gpio_rcar_free(struct gpio_chip *chip, unsigned offset)
 {
 	struct gpio_rcar_priv *p = gpiochip_get_data(chip);
 
-	pinctrl_gpio_free_new(chip, offset);
+	pinctrl_gpio_free(chip, offset);
 
 	/*
 	 * Set the GPIO as an input to ensure that the next GPIO request won't
diff --git a/drivers/gpio/gpio-tegra.c b/drivers/gpio/gpio-tegra.c
index c974ba368d18..fcb3b474d1df 100644
--- a/drivers/gpio/gpio-tegra.c
+++ b/drivers/gpio/gpio-tegra.c
@@ -146,7 +146,7 @@ static void tegra_gpio_free(struct gpio_chip *chip, unsigned int offset)
 {
 	struct tegra_gpio_info *tgi = gpiochip_get_data(chip);
 
-	pinctrl_gpio_free_new(chip, offset);
+	pinctrl_gpio_free(chip, offset);
 	tegra_gpio_disable(tgi, offset);
 }
 
diff --git a/drivers/gpio/gpiolib.c b/drivers/gpio/gpiolib.c
index fd4bda29c586..449e00a47296 100644
--- a/drivers/gpio/gpiolib.c
+++ b/drivers/gpio/gpiolib.c
@@ -2030,7 +2030,7 @@ void gpiochip_generic_free(struct gpio_chip *gc, unsigned int offset)
 		return;
 #endif
 
-	pinctrl_gpio_free_new(gc, offset);
+	pinctrl_gpio_free(gc, offset);
 }
 EXPORT_SYMBOL_GPL(gpiochip_generic_free);
 
diff --git a/drivers/pinctrl/bcm/pinctrl-iproc-gpio.c b/drivers/pinctrl/bcm/pinctrl-iproc-gpio.c
index 8162dd840933..fd5ce52d05b1 100644
--- a/drivers/pinctrl/bcm/pinctrl-iproc-gpio.c
+++ b/drivers/pinctrl/bcm/pinctrl-iproc-gpio.c
@@ -345,7 +345,7 @@ static void iproc_gpio_free(struct gpio_chip *gc, unsigned offset)
 	if (!chip->pinmux_is_supported)
 		return;
 
-	pinctrl_gpio_free_new(gc, offset);
+	pinctrl_gpio_free(gc, offset);
 }
 
 static int iproc_gpio_direction_input(struct gpio_chip *gc, unsigned gpio)
diff --git a/drivers/pinctrl/core.c b/drivers/pinctrl/core.c
index 934d9e4e02d0..21d54f82838c 100644
--- a/drivers/pinctrl/core.c
+++ b/drivers/pinctrl/core.c
@@ -820,7 +820,7 @@ int pinctrl_gpio_request(struct gpio_chip *gc, unsigned int offset)
 EXPORT_SYMBOL_GPL(pinctrl_gpio_request);
 
 /**
- * pinctrl_gpio_free_new() - free control on a single pin, currently used as GPIO
+ * pinctrl_gpio_free() - free control on a single pin, currently used as GPIO
  * @gc: GPIO chip structure from the GPIO subsystem
  * @offset: hardware offset of the GPIO relative to the controller
  *
@@ -828,7 +828,7 @@ EXPORT_SYMBOL_GPL(pinctrl_gpio_request);
  * as part of their gpio_request() semantics, platforms and individual drivers
  * shall *NOT* request GPIO pins to be muxed in.
  */
-void pinctrl_gpio_free_new(struct gpio_chip *gc, unsigned int offset)
+void pinctrl_gpio_free(struct gpio_chip *gc, unsigned int offset)
 {
 	struct pinctrl_gpio_range *range;
 	struct pinctrl_dev *pctldev;
@@ -848,7 +848,7 @@ void pinctrl_gpio_free_new(struct gpio_chip *gc, unsigned int offset)
 
 	mutex_unlock(&pctldev->mutex);
 }
-EXPORT_SYMBOL_GPL(pinctrl_gpio_free_new);
+EXPORT_SYMBOL_GPL(pinctrl_gpio_free);
 
 static int pinctrl_gpio_direction(unsigned gpio, bool input)
 {
diff --git a/drivers/pinctrl/nuvoton/pinctrl-npcm7xx.c b/drivers/pinctrl/nuvoton/pinctrl-npcm7xx.c
index ad39b8c060b7..ab7cfc1e66f0 100644
--- a/drivers/pinctrl/nuvoton/pinctrl-npcm7xx.c
+++ b/drivers/pinctrl/nuvoton/pinctrl-npcm7xx.c
@@ -211,7 +211,7 @@ static int npcmgpio_gpio_request(struct gpio_chip *chip, unsigned int offset)
 static void npcmgpio_gpio_free(struct gpio_chip *chip, unsigned int offset)
 {
 	dev_dbg(chip->parent, "gpio_free: offset%d\n", offset);
-	pinctrl_gpio_free_new(chip, offset);
+	pinctrl_gpio_free(chip, offset);
 }
 
 static void npcmgpio_irq_handler(struct irq_desc *desc)
diff --git a/drivers/pinctrl/nuvoton/pinctrl-npcm8xx.c b/drivers/pinctrl/nuvoton/pinctrl-npcm8xx.c
index dd4532ae62bf..78dbeef6af86 100644
--- a/drivers/pinctrl/nuvoton/pinctrl-npcm8xx.c
+++ b/drivers/pinctrl/nuvoton/pinctrl-npcm8xx.c
@@ -207,7 +207,7 @@ static int npcmgpio_gpio_request(struct gpio_chip *chip, unsigned int offset)
 
 static void npcmgpio_gpio_free(struct gpio_chip *chip, unsigned int offset)
 {
-	pinctrl_gpio_free_new(chip, offset);
+	pinctrl_gpio_free(chip, offset);
 }
 
 static void npcmgpio_irq_handler(struct irq_desc *desc)
diff --git a/drivers/pinctrl/renesas/gpio.c b/drivers/pinctrl/renesas/gpio.c
index 0c4f4d8d0891..b39125a632b0 100644
--- a/drivers/pinctrl/renesas/gpio.c
+++ b/drivers/pinctrl/renesas/gpio.c
@@ -140,7 +140,7 @@ static int gpio_pin_request(struct gpio_chip *gc, unsigned offset)
 
 static void gpio_pin_free(struct gpio_chip *gc, unsigned offset)
 {
-	return pinctrl_gpio_free_new(gc, offset);
+	return pinctrl_gpio_free(gc, offset);
 }
 
 static void gpio_pin_set_value(struct sh_pfc_chip *chip, unsigned offset,
diff --git a/drivers/pinctrl/renesas/pinctrl-rzg2l.c b/drivers/pinctrl/renesas/pinctrl-rzg2l.c
index c5d247d24374..9de350ad7e7d 100644
--- a/drivers/pinctrl/renesas/pinctrl-rzg2l.c
+++ b/drivers/pinctrl/renesas/pinctrl-rzg2l.c
@@ -1181,7 +1181,7 @@ static void rzg2l_gpio_free(struct gpio_chip *chip, unsigned int offset)
 {
 	unsigned int virq;
 
-	pinctrl_gpio_free_new(chip, offset);
+	pinctrl_gpio_free(chip, offset);
 
 	virq = irq_find_mapping(chip->irq.domain, offset);
 	if (virq)
diff --git a/drivers/pinctrl/renesas/pinctrl-rzv2m.c b/drivers/pinctrl/renesas/pinctrl-rzv2m.c
index 1ec0922f735c..21d7d5ac8c4a 100644
--- a/drivers/pinctrl/renesas/pinctrl-rzv2m.c
+++ b/drivers/pinctrl/renesas/pinctrl-rzv2m.c
@@ -832,7 +832,7 @@ static int rzv2m_gpio_get(struct gpio_chip *chip, unsigned int offset)
 
 static void rzv2m_gpio_free(struct gpio_chip *chip, unsigned int offset)
 {
-	pinctrl_gpio_free_new(chip, offset);
+	pinctrl_gpio_free(chip, offset);
 
 	/*
 	 * Set the GPIO as an input to ensure that the next GPIO request won't
diff --git a/drivers/pinctrl/spear/pinctrl-plgpio.c b/drivers/pinctrl/spear/pinctrl-plgpio.c
index 6d3b7ce1c35a..a05570c7d833 100644
--- a/drivers/pinctrl/spear/pinctrl-plgpio.c
+++ b/drivers/pinctrl/spear/pinctrl-plgpio.c
@@ -248,7 +248,7 @@ err1:
 	if (!IS_ERR(plgpio->clk))
 		clk_disable(plgpio->clk);
 err0:
-	pinctrl_gpio_free_new(chip, offset);
+	pinctrl_gpio_free(chip, offset);
 	return ret;
 }
 
@@ -278,7 +278,7 @@ disable_clk:
 	if (!IS_ERR(plgpio->clk))
 		clk_disable(plgpio->clk);
 
-	pinctrl_gpio_free_new(chip, offset);
+	pinctrl_gpio_free(chip, offset);
 }
 
 /* PLGPIO IRQ */
diff --git a/drivers/pinctrl/starfive/pinctrl-starfive-jh7100.c b/drivers/pinctrl/starfive/pinctrl-starfive-jh7100.c
index 88e39f68ef25..7beed57d665a 100644
--- a/drivers/pinctrl/starfive/pinctrl-starfive-jh7100.c
+++ b/drivers/pinctrl/starfive/pinctrl-starfive-jh7100.c
@@ -923,7 +923,7 @@ static int starfive_gpio_request(struct gpio_chip *gc, unsigned int gpio)
 
 static void starfive_gpio_free(struct gpio_chip *gc, unsigned int gpio)
 {
-	pinctrl_gpio_free_new(gc, gpio);
+	pinctrl_gpio_free(gc, gpio);
 }
 
 static int starfive_gpio_get_direction(struct gpio_chip *gc, unsigned int gpio)
diff --git a/drivers/pinctrl/starfive/pinctrl-starfive-jh7110.c b/drivers/pinctrl/starfive/pinctrl-starfive-jh7110.c
index c458c6fb992f..695c8c0a6e8b 100644
--- a/drivers/pinctrl/starfive/pinctrl-starfive-jh7110.c
+++ b/drivers/pinctrl/starfive/pinctrl-starfive-jh7110.c
@@ -552,7 +552,7 @@ static int jh7110_gpio_request(struct gpio_chip *gc, unsigned int gpio)
 
 static void jh7110_gpio_free(struct gpio_chip *gc, unsigned int gpio)
 {
-	pinctrl_gpio_free_new(gc, gpio);
+	pinctrl_gpio_free(gc, gpio);
 }
 
 static int jh7110_gpio_get_direction(struct gpio_chip *gc,
diff --git a/drivers/pinctrl/stm32/pinctrl-stm32.c b/drivers/pinctrl/stm32/pinctrl-stm32.c
index dddbdc1a64c5..e4453fa58f9e 100644
--- a/drivers/pinctrl/stm32/pinctrl-stm32.c
+++ b/drivers/pinctrl/stm32/pinctrl-stm32.c
@@ -222,7 +222,7 @@ static int stm32_gpio_request(struct gpio_chip *chip, unsigned offset)
 
 static void stm32_gpio_free(struct gpio_chip *chip, unsigned offset)
 {
-	pinctrl_gpio_free_new(chip, offset);
+	pinctrl_gpio_free(chip, offset);
 }
 
 static int stm32_gpio_get(struct gpio_chip *chip, unsigned offset)
diff --git a/include/linux/pinctrl/consumer.h b/include/linux/pinctrl/consumer.h
index 05b2dae05f24..55b465edaff1 100644
--- a/include/linux/pinctrl/consumer.h
+++ b/include/linux/pinctrl/consumer.h
@@ -28,7 +28,7 @@ struct pinctrl_state;
 /* External interface to pin control */
 bool pinctrl_gpio_can_use_line(struct gpio_chip *gc, unsigned int offset);
 int pinctrl_gpio_request(struct gpio_chip *gc, unsigned int offset);
-void pinctrl_gpio_free_new(struct gpio_chip *gc, unsigned int offset);
+void pinctrl_gpio_free(struct gpio_chip *gc, unsigned int offset);
 int pinctrl_gpio_direction_input_new(struct gpio_chip *gc,
 				     unsigned int offset);
 int pinctrl_gpio_direction_output_new(struct gpio_chip *gc,
@@ -80,7 +80,7 @@ pinctrl_gpio_request(struct gpio_chip *gc, unsigned int offset)
 }
 
 static inline void
-pinctrl_gpio_free_new(struct gpio_chip *gc, unsigned int offset)
+pinctrl_gpio_free(struct gpio_chip *gc, unsigned int offset)
 {
 }
 
-- 
cgit v1.2.3


From 315c46f9b696be82972290d50349c7824276b844 Mon Sep 17 00:00:00 2001
From: Bartosz Golaszewski <bartosz.golaszewski@linaro.org>
Date: Tue, 3 Oct 2023 11:58:30 +0200
Subject: treewide: rename pinctrl_gpio_direction_input_new()

Now that pinctrl_gpio_direction_input() is no longer used, let's drop the
'_new' suffix from its improved variant.

Signed-off-by: Bartosz Golaszewski <bartosz.golaszewski@linaro.org>
Acked-by: Linus Walleij <linus.walleij@linaro.org>
---
 drivers/gpio/gpio-mvebu.c                     | 2 +-
 drivers/gpio/gpio-pxa.c                       | 2 +-
 drivers/gpio/gpio-rockchip.c                  | 2 +-
 drivers/gpio/gpio-tegra.c                     | 2 +-
 drivers/gpio/gpio-vf610.c                     | 2 +-
 drivers/pinctrl/cirrus/pinctrl-cs42l43.c      | 2 +-
 drivers/pinctrl/core.c                        | 6 +++---
 drivers/pinctrl/intel/pinctrl-cherryview.c    | 2 +-
 drivers/pinctrl/intel/pinctrl-intel.c         | 2 +-
 drivers/pinctrl/intel/pinctrl-lynxpoint.c     | 2 +-
 drivers/pinctrl/mediatek/pinctrl-moore.c      | 2 +-
 drivers/pinctrl/mediatek/pinctrl-mtk-common.c | 2 +-
 drivers/pinctrl/mediatek/pinctrl-paris.c      | 2 +-
 drivers/pinctrl/nuvoton/pinctrl-npcm7xx.c     | 2 +-
 drivers/pinctrl/nuvoton/pinctrl-npcm8xx.c     | 2 +-
 drivers/pinctrl/pinctrl-as3722.c              | 2 +-
 drivers/pinctrl/pinctrl-axp209.c              | 2 +-
 drivers/pinctrl/pinctrl-cy8c95x0.c            | 2 +-
 drivers/pinctrl/pinctrl-ingenic.c             | 2 +-
 drivers/pinctrl/pinctrl-ocelot.c              | 2 +-
 drivers/pinctrl/pinctrl-rk805.c               | 2 +-
 drivers/pinctrl/pinctrl-st.c                  | 2 +-
 drivers/pinctrl/renesas/gpio.c                | 2 +-
 drivers/pinctrl/stm32/pinctrl-stm32.c         | 2 +-
 drivers/pinctrl/vt8500/pinctrl-wmt.c          | 2 +-
 include/linux/pinctrl/consumer.h              | 6 +++---
 26 files changed, 30 insertions(+), 30 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/gpio/gpio-mvebu.c b/drivers/gpio/gpio-mvebu.c
index d7eae6f0430d..3d1f941191e7 100644
--- a/drivers/gpio/gpio-mvebu.c
+++ b/drivers/gpio/gpio-mvebu.c
@@ -346,7 +346,7 @@ static int mvebu_gpio_direction_input(struct gpio_chip *chip, unsigned int pin)
 	 * Check with the pinctrl driver whether this pin is usable as
 	 * an input GPIO
 	 */
-	ret = pinctrl_gpio_direction_input_new(chip, pin);
+	ret = pinctrl_gpio_direction_input(chip, pin);
 	if (ret)
 		return ret;
 
diff --git a/drivers/gpio/gpio-pxa.c b/drivers/gpio/gpio-pxa.c
index 2fd797b91270..c60197d5fde0 100644
--- a/drivers/gpio/gpio-pxa.c
+++ b/drivers/gpio/gpio-pxa.c
@@ -260,7 +260,7 @@ static int pxa_gpio_direction_input(struct gpio_chip *chip, unsigned offset)
 	int ret;
 
 	if (pxa_gpio_has_pinctrl()) {
-		ret = pinctrl_gpio_direction_input_new(chip, offset);
+		ret = pinctrl_gpio_direction_input(chip, offset);
 		if (ret)
 			return ret;
 	}
diff --git a/drivers/gpio/gpio-rockchip.c b/drivers/gpio/gpio-rockchip.c
index 3576793f0bd9..05afb67fd677 100644
--- a/drivers/gpio/gpio-rockchip.c
+++ b/drivers/gpio/gpio-rockchip.c
@@ -159,7 +159,7 @@ static int rockchip_gpio_set_direction(struct gpio_chip *chip,
 
 
 	if (input)
-		pinctrl_gpio_direction_input_new(chip, offset);
+		pinctrl_gpio_direction_input(chip, offset);
 	else
 		pinctrl_gpio_direction_output_new(chip, offset);
 
diff --git a/drivers/gpio/gpio-tegra.c b/drivers/gpio/gpio-tegra.c
index fcb3b474d1df..a9a00b28cc55 100644
--- a/drivers/gpio/gpio-tegra.c
+++ b/drivers/gpio/gpio-tegra.c
@@ -179,7 +179,7 @@ static int tegra_gpio_direction_input(struct gpio_chip *chip,
 	tegra_gpio_mask_write(tgi, GPIO_MSK_OE(tgi, offset), offset, 0);
 	tegra_gpio_enable(tgi, offset);
 
-	ret = pinctrl_gpio_direction_input_new(chip, offset);
+	ret = pinctrl_gpio_direction_input(chip, offset);
 	if (ret < 0)
 		dev_err(tgi->dev,
 			"Failed to set pinctrl input direction of GPIO %d: %d",
diff --git a/drivers/gpio/gpio-vf610.c b/drivers/gpio/gpio-vf610.c
index 4dfd2331b997..4c7849ebb067 100644
--- a/drivers/gpio/gpio-vf610.c
+++ b/drivers/gpio/gpio-vf610.c
@@ -130,7 +130,7 @@ static int vf610_gpio_direction_input(struct gpio_chip *chip, unsigned gpio)
 		vf610_gpio_writel(val, port->gpio_base + GPIO_PDDR);
 	}
 
-	return pinctrl_gpio_direction_input_new(chip, gpio);
+	return pinctrl_gpio_direction_input(chip, gpio);
 }
 
 static int vf610_gpio_direction_output(struct gpio_chip *chip, unsigned gpio,
diff --git a/drivers/pinctrl/cirrus/pinctrl-cs42l43.c b/drivers/pinctrl/cirrus/pinctrl-cs42l43.c
index 097e4dcdda36..e35964359381 100644
--- a/drivers/pinctrl/cirrus/pinctrl-cs42l43.c
+++ b/drivers/pinctrl/cirrus/pinctrl-cs42l43.c
@@ -508,7 +508,7 @@ static void cs42l43_gpio_set(struct gpio_chip *chip, unsigned int offset, int va
 
 static int cs42l43_gpio_direction_in(struct gpio_chip *chip, unsigned int offset)
 {
-	return pinctrl_gpio_direction_input_new(chip, offset);
+	return pinctrl_gpio_direction_input(chip, offset);
 }
 
 static int cs42l43_gpio_direction_out(struct gpio_chip *chip,
diff --git a/drivers/pinctrl/core.c b/drivers/pinctrl/core.c
index 21d54f82838c..c6a17ae8ab34 100644
--- a/drivers/pinctrl/core.c
+++ b/drivers/pinctrl/core.c
@@ -874,7 +874,7 @@ static int pinctrl_gpio_direction(unsigned gpio, bool input)
 }
 
 /**
- * pinctrl_gpio_direction_input_new() - request a GPIO pin to go into input mode
+ * pinctrl_gpio_direction_input() - request a GPIO pin to go into input mode
  * @gc: GPIO chip structure from the GPIO subsystem
  * @offset: hardware offset of the GPIO relative to the controller
  *
@@ -882,11 +882,11 @@ static int pinctrl_gpio_direction(unsigned gpio, bool input)
  * as part of their gpio_direction_input() semantics, platforms and individual
  * drivers shall *NOT* touch pin control GPIO calls.
  */
-int pinctrl_gpio_direction_input_new(struct gpio_chip *gc, unsigned int offset)
+int pinctrl_gpio_direction_input(struct gpio_chip *gc, unsigned int offset)
 {
 	return pinctrl_gpio_direction(gc->base + offset, true);
 }
-EXPORT_SYMBOL_GPL(pinctrl_gpio_direction_input_new);
+EXPORT_SYMBOL_GPL(pinctrl_gpio_direction_input);
 
 /**
  * pinctrl_gpio_direction_output_new() - request a GPIO pin to go into output
diff --git a/drivers/pinctrl/intel/pinctrl-cherryview.c b/drivers/pinctrl/intel/pinctrl-cherryview.c
index f806b7ce086b..baf32a8156c4 100644
--- a/drivers/pinctrl/intel/pinctrl-cherryview.c
+++ b/drivers/pinctrl/intel/pinctrl-cherryview.c
@@ -1148,7 +1148,7 @@ static int chv_gpio_get_direction(struct gpio_chip *chip, unsigned int offset)
 
 static int chv_gpio_direction_input(struct gpio_chip *chip, unsigned int offset)
 {
-	return pinctrl_gpio_direction_input_new(chip, offset);
+	return pinctrl_gpio_direction_input(chip, offset);
 }
 
 static int chv_gpio_direction_output(struct gpio_chip *chip, unsigned int offset,
diff --git a/drivers/pinctrl/intel/pinctrl-intel.c b/drivers/pinctrl/intel/pinctrl-intel.c
index d38b879d4f79..20f6a80e3eb7 100644
--- a/drivers/pinctrl/intel/pinctrl-intel.c
+++ b/drivers/pinctrl/intel/pinctrl-intel.c
@@ -992,7 +992,7 @@ static int intel_gpio_get_direction(struct gpio_chip *chip, unsigned int offset)
 
 static int intel_gpio_direction_input(struct gpio_chip *chip, unsigned int offset)
 {
-	return pinctrl_gpio_direction_input_new(chip, offset);
+	return pinctrl_gpio_direction_input(chip, offset);
 }
 
 static int intel_gpio_direction_output(struct gpio_chip *chip, unsigned int offset,
diff --git a/drivers/pinctrl/intel/pinctrl-lynxpoint.c b/drivers/pinctrl/intel/pinctrl-lynxpoint.c
index 91d228fead45..b4174829e1f9 100644
--- a/drivers/pinctrl/intel/pinctrl-lynxpoint.c
+++ b/drivers/pinctrl/intel/pinctrl-lynxpoint.c
@@ -541,7 +541,7 @@ static void lp_gpio_set(struct gpio_chip *chip, unsigned int offset, int value)
 
 static int lp_gpio_direction_input(struct gpio_chip *chip, unsigned int offset)
 {
-	return pinctrl_gpio_direction_input_new(chip, offset);
+	return pinctrl_gpio_direction_input(chip, offset);
 }
 
 static int lp_gpio_direction_output(struct gpio_chip *chip, unsigned int offset,
diff --git a/drivers/pinctrl/mediatek/pinctrl-moore.c b/drivers/pinctrl/mediatek/pinctrl-moore.c
index dc41ffa31357..5a5ec00e32ea 100644
--- a/drivers/pinctrl/mediatek/pinctrl-moore.c
+++ b/drivers/pinctrl/mediatek/pinctrl-moore.c
@@ -512,7 +512,7 @@ static void mtk_gpio_set(struct gpio_chip *chip, unsigned int gpio, int value)
 
 static int mtk_gpio_direction_input(struct gpio_chip *chip, unsigned int gpio)
 {
-	return pinctrl_gpio_direction_input_new(chip, gpio);
+	return pinctrl_gpio_direction_input(chip, gpio);
 }
 
 static int mtk_gpio_direction_output(struct gpio_chip *chip, unsigned int gpio,
diff --git a/drivers/pinctrl/mediatek/pinctrl-mtk-common.c b/drivers/pinctrl/mediatek/pinctrl-mtk-common.c
index fc046415b567..b2dc3072050a 100644
--- a/drivers/pinctrl/mediatek/pinctrl-mtk-common.c
+++ b/drivers/pinctrl/mediatek/pinctrl-mtk-common.c
@@ -811,7 +811,7 @@ static const struct pinmux_ops mtk_pmx_ops = {
 static int mtk_gpio_direction_input(struct gpio_chip *chip,
 					unsigned offset)
 {
-	return pinctrl_gpio_direction_input_new(chip, offset);
+	return pinctrl_gpio_direction_input(chip, offset);
 }
 
 static int mtk_gpio_direction_output(struct gpio_chip *chip,
diff --git a/drivers/pinctrl/mediatek/pinctrl-paris.c b/drivers/pinctrl/mediatek/pinctrl-paris.c
index 14b151aec2f9..d8c969e506ce 100644
--- a/drivers/pinctrl/mediatek/pinctrl-paris.c
+++ b/drivers/pinctrl/mediatek/pinctrl-paris.c
@@ -916,7 +916,7 @@ static int mtk_gpio_direction_input(struct gpio_chip *chip, unsigned int gpio)
 	if (gpio >= hw->soc->npins)
 		return -EINVAL;
 
-	return pinctrl_gpio_direction_input_new(chip, gpio);
+	return pinctrl_gpio_direction_input(chip, gpio);
 }
 
 static int mtk_gpio_direction_output(struct gpio_chip *chip, unsigned int gpio,
diff --git a/drivers/pinctrl/nuvoton/pinctrl-npcm7xx.c b/drivers/pinctrl/nuvoton/pinctrl-npcm7xx.c
index ab7cfc1e66f0..d7ce67242248 100644
--- a/drivers/pinctrl/nuvoton/pinctrl-npcm7xx.c
+++ b/drivers/pinctrl/nuvoton/pinctrl-npcm7xx.c
@@ -171,7 +171,7 @@ static int npcmgpio_direction_input(struct gpio_chip *chip, unsigned int offset)
 	struct npcm7xx_gpio *bank = gpiochip_get_data(chip);
 	int ret;
 
-	ret = pinctrl_gpio_direction_input_new(chip, offset);
+	ret = pinctrl_gpio_direction_input(chip, offset);
 	if (ret)
 		return ret;
 
diff --git a/drivers/pinctrl/nuvoton/pinctrl-npcm8xx.c b/drivers/pinctrl/nuvoton/pinctrl-npcm8xx.c
index 78dbeef6af86..c22513e30068 100644
--- a/drivers/pinctrl/nuvoton/pinctrl-npcm8xx.c
+++ b/drivers/pinctrl/nuvoton/pinctrl-npcm8xx.c
@@ -173,7 +173,7 @@ static int npcmgpio_direction_input(struct gpio_chip *chip, unsigned int offset)
 	struct npcm8xx_gpio *bank = gpiochip_get_data(chip);
 	int ret;
 
-	ret = pinctrl_gpio_direction_input_new(chip, offset);
+	ret = pinctrl_gpio_direction_input(chip, offset);
 	if (ret)
 		return ret;
 
diff --git a/drivers/pinctrl/pinctrl-as3722.c b/drivers/pinctrl/pinctrl-as3722.c
index 205937f7f9a1..e77b180155f4 100644
--- a/drivers/pinctrl/pinctrl-as3722.c
+++ b/drivers/pinctrl/pinctrl-as3722.c
@@ -502,7 +502,7 @@ static void as3722_gpio_set(struct gpio_chip *chip, unsigned offset,
 
 static int as3722_gpio_direction_input(struct gpio_chip *chip, unsigned offset)
 {
-	return pinctrl_gpio_direction_input_new(chip, offset);
+	return pinctrl_gpio_direction_input(chip, offset);
 }
 
 static int as3722_gpio_direction_output(struct gpio_chip *chip,
diff --git a/drivers/pinctrl/pinctrl-axp209.c b/drivers/pinctrl/pinctrl-axp209.c
index d7f081f1d1f6..a6eb3b866a23 100644
--- a/drivers/pinctrl/pinctrl-axp209.c
+++ b/drivers/pinctrl/pinctrl-axp209.c
@@ -126,7 +126,7 @@ static int axp20x_gpio_get_reg(unsigned int offset)
 
 static int axp20x_gpio_input(struct gpio_chip *chip, unsigned int offset)
 {
-	return pinctrl_gpio_direction_input_new(chip, offset);
+	return pinctrl_gpio_direction_input(chip, offset);
 }
 
 static int axp20x_gpio_get(struct gpio_chip *chip, unsigned int offset)
diff --git a/drivers/pinctrl/pinctrl-cy8c95x0.c b/drivers/pinctrl/pinctrl-cy8c95x0.c
index 8c6006353515..729c13a249ef 100644
--- a/drivers/pinctrl/pinctrl-cy8c95x0.c
+++ b/drivers/pinctrl/pinctrl-cy8c95x0.c
@@ -554,7 +554,7 @@ out:
 
 static int cy8c95x0_gpio_direction_input(struct gpio_chip *gc, unsigned int off)
 {
-	return pinctrl_gpio_direction_input_new(gc, off);
+	return pinctrl_gpio_direction_input(gc, off);
 }
 
 static int cy8c95x0_gpio_direction_output(struct gpio_chip *gc,
diff --git a/drivers/pinctrl/pinctrl-ingenic.c b/drivers/pinctrl/pinctrl-ingenic.c
index 5072fb907193..df36141edbf3 100644
--- a/drivers/pinctrl/pinctrl-ingenic.c
+++ b/drivers/pinctrl/pinctrl-ingenic.c
@@ -3563,7 +3563,7 @@ static int ingenic_gpio_get(struct gpio_chip *gc, unsigned int offset)
 static int ingenic_gpio_direction_input(struct gpio_chip *gc,
 		unsigned int offset)
 {
-	return pinctrl_gpio_direction_input_new(gc, offset);
+	return pinctrl_gpio_direction_input(gc, offset);
 }
 
 static int ingenic_gpio_direction_output(struct gpio_chip *gc,
diff --git a/drivers/pinctrl/pinctrl-ocelot.c b/drivers/pinctrl/pinctrl-ocelot.c
index a6022d44ad6c..d02d16084f33 100644
--- a/drivers/pinctrl/pinctrl-ocelot.c
+++ b/drivers/pinctrl/pinctrl-ocelot.c
@@ -1779,7 +1779,7 @@ static int ocelot_gpio_get_direction(struct gpio_chip *chip,
 static int ocelot_gpio_direction_input(struct gpio_chip *chip,
 				       unsigned int offset)
 {
-	return pinctrl_gpio_direction_input_new(chip, offset);
+	return pinctrl_gpio_direction_input(chip, offset);
 }
 
 static int ocelot_gpio_direction_output(struct gpio_chip *chip,
diff --git a/drivers/pinctrl/pinctrl-rk805.c b/drivers/pinctrl/pinctrl-rk805.c
index 58e6dd932bc7..f0feb3a8e630 100644
--- a/drivers/pinctrl/pinctrl-rk805.c
+++ b/drivers/pinctrl/pinctrl-rk805.c
@@ -289,7 +289,7 @@ static void rk805_gpio_set(struct gpio_chip *chip,
 static int rk805_gpio_direction_input(struct gpio_chip *chip,
 				      unsigned int offset)
 {
-	return pinctrl_gpio_direction_input_new(chip, offset);
+	return pinctrl_gpio_direction_input(chip, offset);
 }
 
 static int rk805_gpio_direction_output(struct gpio_chip *chip,
diff --git a/drivers/pinctrl/pinctrl-st.c b/drivers/pinctrl/pinctrl-st.c
index ec763572ab3e..3f6631716779 100644
--- a/drivers/pinctrl/pinctrl-st.c
+++ b/drivers/pinctrl/pinctrl-st.c
@@ -719,7 +719,7 @@ static void st_gpio_set(struct gpio_chip *chip, unsigned offset, int value)
 
 static int st_gpio_direction_input(struct gpio_chip *chip, unsigned offset)
 {
-	pinctrl_gpio_direction_input_new(chip, offset);
+	pinctrl_gpio_direction_input(chip, offset);
 
 	return 0;
 }
diff --git a/drivers/pinctrl/renesas/gpio.c b/drivers/pinctrl/renesas/gpio.c
index b39125a632b0..54346b7fcdef 100644
--- a/drivers/pinctrl/renesas/gpio.c
+++ b/drivers/pinctrl/renesas/gpio.c
@@ -164,7 +164,7 @@ static void gpio_pin_set_value(struct sh_pfc_chip *chip, unsigned offset,
 
 static int gpio_pin_direction_input(struct gpio_chip *gc, unsigned offset)
 {
-	return pinctrl_gpio_direction_input_new(gc, offset);
+	return pinctrl_gpio_direction_input(gc, offset);
 }
 
 static int gpio_pin_direction_output(struct gpio_chip *gc, unsigned offset,
diff --git a/drivers/pinctrl/stm32/pinctrl-stm32.c b/drivers/pinctrl/stm32/pinctrl-stm32.c
index e4453fa58f9e..cbe306f47c0a 100644
--- a/drivers/pinctrl/stm32/pinctrl-stm32.c
+++ b/drivers/pinctrl/stm32/pinctrl-stm32.c
@@ -241,7 +241,7 @@ static void stm32_gpio_set(struct gpio_chip *chip, unsigned offset, int value)
 
 static int stm32_gpio_direction_input(struct gpio_chip *chip, unsigned offset)
 {
-	return pinctrl_gpio_direction_input_new(chip, offset);
+	return pinctrl_gpio_direction_input(chip, offset);
 }
 
 static int stm32_gpio_direction_output(struct gpio_chip *chip,
diff --git a/drivers/pinctrl/vt8500/pinctrl-wmt.c b/drivers/pinctrl/vt8500/pinctrl-wmt.c
index 1a313295611e..54cd556a16b8 100644
--- a/drivers/pinctrl/vt8500/pinctrl-wmt.c
+++ b/drivers/pinctrl/vt8500/pinctrl-wmt.c
@@ -528,7 +528,7 @@ static void wmt_gpio_set_value(struct gpio_chip *chip, unsigned offset,
 
 static int wmt_gpio_direction_input(struct gpio_chip *chip, unsigned offset)
 {
-	return pinctrl_gpio_direction_input_new(chip, offset);
+	return pinctrl_gpio_direction_input(chip, offset);
 }
 
 static int wmt_gpio_direction_output(struct gpio_chip *chip, unsigned offset,
diff --git a/include/linux/pinctrl/consumer.h b/include/linux/pinctrl/consumer.h
index 55b465edaff1..9531595d1abd 100644
--- a/include/linux/pinctrl/consumer.h
+++ b/include/linux/pinctrl/consumer.h
@@ -29,8 +29,8 @@ struct pinctrl_state;
 bool pinctrl_gpio_can_use_line(struct gpio_chip *gc, unsigned int offset);
 int pinctrl_gpio_request(struct gpio_chip *gc, unsigned int offset);
 void pinctrl_gpio_free(struct gpio_chip *gc, unsigned int offset);
-int pinctrl_gpio_direction_input_new(struct gpio_chip *gc,
-				     unsigned int offset);
+int pinctrl_gpio_direction_input(struct gpio_chip *gc,
+				 unsigned int offset);
 int pinctrl_gpio_direction_output_new(struct gpio_chip *gc,
 				      unsigned int offset);
 int pinctrl_gpio_set_config_new(struct gpio_chip *gc, unsigned int offset,
@@ -85,7 +85,7 @@ pinctrl_gpio_free(struct gpio_chip *gc, unsigned int offset)
 }
 
 static inline int
-pinctrl_gpio_direction_input_new(struct gpio_chip *gc, unsigned int offset)
+pinctrl_gpio_direction_input(struct gpio_chip *gc, unsigned int offset)
 {
 	return 0;
 }
-- 
cgit v1.2.3


From b679d6c06b2b29187374f9f4da7eea1961c2eeaa Mon Sep 17 00:00:00 2001
From: Bartosz Golaszewski <bartosz.golaszewski@linaro.org>
Date: Tue, 3 Oct 2023 11:59:51 +0200
Subject: treewide: rename pinctrl_gpio_direction_output_new()

Now that pinctrl_gpio_direction_output() is no longer used, let's drop
the '_new' suffix from its improved variant.

Signed-off-by: Bartosz Golaszewski <bartosz.golaszewski@linaro.org>
Acked-by: Linus Walleij <linus.walleij@linaro.org>
---
 drivers/gpio/gpio-mvebu.c                     | 2 +-
 drivers/gpio/gpio-pxa.c                       | 2 +-
 drivers/gpio/gpio-rockchip.c                  | 2 +-
 drivers/gpio/gpio-tegra.c                     | 2 +-
 drivers/gpio/gpio-vf610.c                     | 2 +-
 drivers/pinctrl/cirrus/pinctrl-cs42l43.c      | 2 +-
 drivers/pinctrl/cirrus/pinctrl-lochnagar.c    | 2 +-
 drivers/pinctrl/core.c                        | 7 +++----
 drivers/pinctrl/intel/pinctrl-cherryview.c    | 2 +-
 drivers/pinctrl/intel/pinctrl-intel.c         | 2 +-
 drivers/pinctrl/intel/pinctrl-lynxpoint.c     | 2 +-
 drivers/pinctrl/mediatek/pinctrl-moore.c      | 2 +-
 drivers/pinctrl/mediatek/pinctrl-mtk-common.c | 2 +-
 drivers/pinctrl/mediatek/pinctrl-paris.c      | 2 +-
 drivers/pinctrl/nuvoton/pinctrl-npcm7xx.c     | 2 +-
 drivers/pinctrl/nuvoton/pinctrl-npcm8xx.c     | 2 +-
 drivers/pinctrl/pinctrl-as3722.c              | 2 +-
 drivers/pinctrl/pinctrl-cy8c95x0.c            | 2 +-
 drivers/pinctrl/pinctrl-ingenic.c             | 4 ++--
 drivers/pinctrl/pinctrl-ocelot.c              | 2 +-
 drivers/pinctrl/pinctrl-rk805.c               | 2 +-
 drivers/pinctrl/pinctrl-st.c                  | 2 +-
 drivers/pinctrl/renesas/gpio.c                | 2 +-
 drivers/pinctrl/stm32/pinctrl-stm32.c         | 2 +-
 drivers/pinctrl/vt8500/pinctrl-wmt.c          | 2 +-
 include/linux/pinctrl/consumer.h              | 6 +++---
 26 files changed, 31 insertions(+), 32 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/gpio/gpio-mvebu.c b/drivers/gpio/gpio-mvebu.c
index 3d1f941191e7..8342bfc2d3f9 100644
--- a/drivers/gpio/gpio-mvebu.c
+++ b/drivers/gpio/gpio-mvebu.c
@@ -366,7 +366,7 @@ static int mvebu_gpio_direction_output(struct gpio_chip *chip, unsigned int pin,
 	 * Check with the pinctrl driver whether this pin is usable as
 	 * an output GPIO
 	 */
-	ret = pinctrl_gpio_direction_output_new(chip, pin);
+	ret = pinctrl_gpio_direction_output(chip, pin);
 	if (ret)
 		return ret;
 
diff --git a/drivers/gpio/gpio-pxa.c b/drivers/gpio/gpio-pxa.c
index c60197d5fde0..91cea97255fa 100644
--- a/drivers/gpio/gpio-pxa.c
+++ b/drivers/gpio/gpio-pxa.c
@@ -289,7 +289,7 @@ static int pxa_gpio_direction_output(struct gpio_chip *chip,
 	writel_relaxed(mask, base + (value ? GPSR_OFFSET : GPCR_OFFSET));
 
 	if (pxa_gpio_has_pinctrl()) {
-		ret = pinctrl_gpio_direction_output_new(chip, offset);
+		ret = pinctrl_gpio_direction_output(chip, offset);
 		if (ret)
 			return ret;
 	}
diff --git a/drivers/gpio/gpio-rockchip.c b/drivers/gpio/gpio-rockchip.c
index 05afb67fd677..0bd339813110 100644
--- a/drivers/gpio/gpio-rockchip.c
+++ b/drivers/gpio/gpio-rockchip.c
@@ -161,7 +161,7 @@ static int rockchip_gpio_set_direction(struct gpio_chip *chip,
 	if (input)
 		pinctrl_gpio_direction_input(chip, offset);
 	else
-		pinctrl_gpio_direction_output_new(chip, offset);
+		pinctrl_gpio_direction_output(chip, offset);
 
 	raw_spin_lock_irqsave(&bank->slock, flags);
 	rockchip_gpio_writel_bit(bank, offset, data, bank->gpio_regs->port_ddr);
diff --git a/drivers/gpio/gpio-tegra.c b/drivers/gpio/gpio-tegra.c
index a9a00b28cc55..adfbed62eff1 100644
--- a/drivers/gpio/gpio-tegra.c
+++ b/drivers/gpio/gpio-tegra.c
@@ -199,7 +199,7 @@ static int tegra_gpio_direction_output(struct gpio_chip *chip,
 	tegra_gpio_mask_write(tgi, GPIO_MSK_OE(tgi, offset), offset, 1);
 	tegra_gpio_enable(tgi, offset);
 
-	ret = pinctrl_gpio_direction_output_new(chip, offset);
+	ret = pinctrl_gpio_direction_output(chip, offset);
 	if (ret < 0)
 		dev_err(tgi->dev,
 			"Failed to set pinctrl output direction of GPIO %d: %d",
diff --git a/drivers/gpio/gpio-vf610.c b/drivers/gpio/gpio-vf610.c
index 4c7849ebb067..07e5e6323e86 100644
--- a/drivers/gpio/gpio-vf610.c
+++ b/drivers/gpio/gpio-vf610.c
@@ -148,7 +148,7 @@ static int vf610_gpio_direction_output(struct gpio_chip *chip, unsigned gpio,
 		vf610_gpio_writel(val, port->gpio_base + GPIO_PDDR);
 	}
 
-	return pinctrl_gpio_direction_output_new(chip, gpio);
+	return pinctrl_gpio_direction_output(chip, gpio);
 }
 
 static void vf610_gpio_irq_handler(struct irq_desc *desc)
diff --git a/drivers/pinctrl/cirrus/pinctrl-cs42l43.c b/drivers/pinctrl/cirrus/pinctrl-cs42l43.c
index e35964359381..1ba89cf279fb 100644
--- a/drivers/pinctrl/cirrus/pinctrl-cs42l43.c
+++ b/drivers/pinctrl/cirrus/pinctrl-cs42l43.c
@@ -516,7 +516,7 @@ static int cs42l43_gpio_direction_out(struct gpio_chip *chip,
 {
 	cs42l43_gpio_set(chip, offset, value);
 
-	return pinctrl_gpio_direction_output_new(chip, offset);
+	return pinctrl_gpio_direction_output(chip, offset);
 }
 
 static int cs42l43_gpio_add_pin_ranges(struct gpio_chip *chip)
diff --git a/drivers/pinctrl/cirrus/pinctrl-lochnagar.c b/drivers/pinctrl/cirrus/pinctrl-lochnagar.c
index f3c8a8c14e14..014297a3fbd2 100644
--- a/drivers/pinctrl/cirrus/pinctrl-lochnagar.c
+++ b/drivers/pinctrl/cirrus/pinctrl-lochnagar.c
@@ -1098,7 +1098,7 @@ static int lochnagar_gpio_direction_out(struct gpio_chip *chip,
 {
 	lochnagar_gpio_set(chip, offset, value);
 
-	return pinctrl_gpio_direction_output_new(chip, offset);
+	return pinctrl_gpio_direction_output(chip, offset);
 }
 
 static int lochnagar_fill_func_groups(struct lochnagar_pin_priv *priv)
diff --git a/drivers/pinctrl/core.c b/drivers/pinctrl/core.c
index c6a17ae8ab34..fa49bb47dc7b 100644
--- a/drivers/pinctrl/core.c
+++ b/drivers/pinctrl/core.c
@@ -889,8 +889,7 @@ int pinctrl_gpio_direction_input(struct gpio_chip *gc, unsigned int offset)
 EXPORT_SYMBOL_GPL(pinctrl_gpio_direction_input);
 
 /**
- * pinctrl_gpio_direction_output_new() - request a GPIO pin to go into output
- *                                       mode
+ * pinctrl_gpio_direction_output() - request a GPIO pin to go into output mode
  * @gc: GPIO chip structure from the GPIO subsystem
  * @offset: hardware offset of the GPIO relative to the controller
  *
@@ -898,11 +897,11 @@ EXPORT_SYMBOL_GPL(pinctrl_gpio_direction_input);
  * as part of their gpio_direction_output() semantics, platforms and individual
  * drivers shall *NOT* touch pin control GPIO calls.
  */
-int pinctrl_gpio_direction_output_new(struct gpio_chip *gc, unsigned int offset)
+int pinctrl_gpio_direction_output(struct gpio_chip *gc, unsigned int offset)
 {
 	return pinctrl_gpio_direction(gc->base + offset, false);
 }
-EXPORT_SYMBOL_GPL(pinctrl_gpio_direction_output_new);
+EXPORT_SYMBOL_GPL(pinctrl_gpio_direction_output);
 
 /**
  * pinctrl_gpio_set_config_new() - Apply config to given GPIO pin
diff --git a/drivers/pinctrl/intel/pinctrl-cherryview.c b/drivers/pinctrl/intel/pinctrl-cherryview.c
index baf32a8156c4..067b0d344f0e 100644
--- a/drivers/pinctrl/intel/pinctrl-cherryview.c
+++ b/drivers/pinctrl/intel/pinctrl-cherryview.c
@@ -1155,7 +1155,7 @@ static int chv_gpio_direction_output(struct gpio_chip *chip, unsigned int offset
 				     int value)
 {
 	chv_gpio_set(chip, offset, value);
-	return pinctrl_gpio_direction_output_new(chip, offset);
+	return pinctrl_gpio_direction_output(chip, offset);
 }
 
 static const struct gpio_chip chv_gpio_chip = {
diff --git a/drivers/pinctrl/intel/pinctrl-intel.c b/drivers/pinctrl/intel/pinctrl-intel.c
index 20f6a80e3eb7..652ba451f885 100644
--- a/drivers/pinctrl/intel/pinctrl-intel.c
+++ b/drivers/pinctrl/intel/pinctrl-intel.c
@@ -999,7 +999,7 @@ static int intel_gpio_direction_output(struct gpio_chip *chip, unsigned int offs
 				       int value)
 {
 	intel_gpio_set(chip, offset, value);
-	return pinctrl_gpio_direction_output_new(chip, offset);
+	return pinctrl_gpio_direction_output(chip, offset);
 }
 
 static const struct gpio_chip intel_gpio_chip = {
diff --git a/drivers/pinctrl/intel/pinctrl-lynxpoint.c b/drivers/pinctrl/intel/pinctrl-lynxpoint.c
index b4174829e1f9..e6878e4cf20c 100644
--- a/drivers/pinctrl/intel/pinctrl-lynxpoint.c
+++ b/drivers/pinctrl/intel/pinctrl-lynxpoint.c
@@ -549,7 +549,7 @@ static int lp_gpio_direction_output(struct gpio_chip *chip, unsigned int offset,
 {
 	lp_gpio_set(chip, offset, value);
 
-	return pinctrl_gpio_direction_output_new(chip,  offset);
+	return pinctrl_gpio_direction_output(chip,  offset);
 }
 
 static int lp_gpio_get_direction(struct gpio_chip *chip, unsigned int offset)
diff --git a/drivers/pinctrl/mediatek/pinctrl-moore.c b/drivers/pinctrl/mediatek/pinctrl-moore.c
index 5a5ec00e32ea..c3f33f96f920 100644
--- a/drivers/pinctrl/mediatek/pinctrl-moore.c
+++ b/drivers/pinctrl/mediatek/pinctrl-moore.c
@@ -520,7 +520,7 @@ static int mtk_gpio_direction_output(struct gpio_chip *chip, unsigned int gpio,
 {
 	mtk_gpio_set(chip, gpio, value);
 
-	return pinctrl_gpio_direction_output_new(chip, gpio);
+	return pinctrl_gpio_direction_output(chip, gpio);
 }
 
 static int mtk_gpio_to_irq(struct gpio_chip *chip, unsigned int offset)
diff --git a/drivers/pinctrl/mediatek/pinctrl-mtk-common.c b/drivers/pinctrl/mediatek/pinctrl-mtk-common.c
index b2dc3072050a..41e9847e3085 100644
--- a/drivers/pinctrl/mediatek/pinctrl-mtk-common.c
+++ b/drivers/pinctrl/mediatek/pinctrl-mtk-common.c
@@ -818,7 +818,7 @@ static int mtk_gpio_direction_output(struct gpio_chip *chip,
 					unsigned offset, int value)
 {
 	mtk_gpio_set(chip, offset, value);
-	return pinctrl_gpio_direction_output_new(chip, offset);
+	return pinctrl_gpio_direction_output(chip, offset);
 }
 
 static int mtk_gpio_get_direction(struct gpio_chip *chip, unsigned offset)
diff --git a/drivers/pinctrl/mediatek/pinctrl-paris.c b/drivers/pinctrl/mediatek/pinctrl-paris.c
index d8c969e506ce..6392f1e05d02 100644
--- a/drivers/pinctrl/mediatek/pinctrl-paris.c
+++ b/drivers/pinctrl/mediatek/pinctrl-paris.c
@@ -929,7 +929,7 @@ static int mtk_gpio_direction_output(struct gpio_chip *chip, unsigned int gpio,
 
 	mtk_gpio_set(chip, gpio, value);
 
-	return pinctrl_gpio_direction_output_new(chip, gpio);
+	return pinctrl_gpio_direction_output(chip, gpio);
 }
 
 static int mtk_gpio_to_irq(struct gpio_chip *chip, unsigned int offset)
diff --git a/drivers/pinctrl/nuvoton/pinctrl-npcm7xx.c b/drivers/pinctrl/nuvoton/pinctrl-npcm7xx.c
index d7ce67242248..51aeac7ecc79 100644
--- a/drivers/pinctrl/nuvoton/pinctrl-npcm7xx.c
+++ b/drivers/pinctrl/nuvoton/pinctrl-npcm7xx.c
@@ -188,7 +188,7 @@ static int npcmgpio_direction_output(struct gpio_chip *chip,
 	dev_dbg(chip->parent, "gpio_direction_output: offset%d = %x\n", offset,
 		value);
 
-	ret = pinctrl_gpio_direction_output_new(chip, offset);
+	ret = pinctrl_gpio_direction_output(chip, offset);
 	if (ret)
 		return ret;
 
diff --git a/drivers/pinctrl/nuvoton/pinctrl-npcm8xx.c b/drivers/pinctrl/nuvoton/pinctrl-npcm8xx.c
index c22513e30068..140fdbbe8458 100644
--- a/drivers/pinctrl/nuvoton/pinctrl-npcm8xx.c
+++ b/drivers/pinctrl/nuvoton/pinctrl-npcm8xx.c
@@ -186,7 +186,7 @@ static int npcmgpio_direction_output(struct gpio_chip *chip,
 	struct npcm8xx_gpio *bank = gpiochip_get_data(chip);
 	int ret;
 
-	ret = pinctrl_gpio_direction_output_new(chip, offset);
+	ret = pinctrl_gpio_direction_output(chip, offset);
 	if (ret)
 		return ret;
 
diff --git a/drivers/pinctrl/pinctrl-as3722.c b/drivers/pinctrl/pinctrl-as3722.c
index e77b180155f4..5ebd1939663f 100644
--- a/drivers/pinctrl/pinctrl-as3722.c
+++ b/drivers/pinctrl/pinctrl-as3722.c
@@ -509,7 +509,7 @@ static int as3722_gpio_direction_output(struct gpio_chip *chip,
 		unsigned offset, int value)
 {
 	as3722_gpio_set(chip, offset, value);
-	return pinctrl_gpio_direction_output_new(chip, offset);
+	return pinctrl_gpio_direction_output(chip, offset);
 }
 
 static int as3722_gpio_to_irq(struct gpio_chip *chip, unsigned offset)
diff --git a/drivers/pinctrl/pinctrl-cy8c95x0.c b/drivers/pinctrl/pinctrl-cy8c95x0.c
index 729c13a249ef..04285c930e94 100644
--- a/drivers/pinctrl/pinctrl-cy8c95x0.c
+++ b/drivers/pinctrl/pinctrl-cy8c95x0.c
@@ -571,7 +571,7 @@ static int cy8c95x0_gpio_direction_output(struct gpio_chip *gc,
 	if (ret)
 		return ret;
 
-	return pinctrl_gpio_direction_output_new(gc, off);
+	return pinctrl_gpio_direction_output(gc, off);
 }
 
 static int cy8c95x0_gpio_get_value(struct gpio_chip *gc, unsigned int off)
diff --git a/drivers/pinctrl/pinctrl-ingenic.c b/drivers/pinctrl/pinctrl-ingenic.c
index df36141edbf3..ca58c9db5c2c 100644
--- a/drivers/pinctrl/pinctrl-ingenic.c
+++ b/drivers/pinctrl/pinctrl-ingenic.c
@@ -3570,7 +3570,7 @@ static int ingenic_gpio_direction_output(struct gpio_chip *gc,
 		unsigned int offset, int value)
 {
 	ingenic_gpio_set(gc, offset, value);
-	return pinctrl_gpio_direction_output_new(gc, offset);
+	return pinctrl_gpio_direction_output(gc, offset);
 }
 
 static inline void ingenic_config_pin(struct ingenic_pinctrl *jzpc,
@@ -4054,7 +4054,7 @@ static int ingenic_pinconf_set(struct pinctrl_dev *pctldev, unsigned int pin,
 			break;
 
 		case PIN_CONFIG_OUTPUT:
-			ret = pinctrl_gpio_direction_output_new(jzpc->gc,
+			ret = pinctrl_gpio_direction_output(jzpc->gc,
 							pin - jzpc->gc->base);
 			if (ret)
 				return ret;
diff --git a/drivers/pinctrl/pinctrl-ocelot.c b/drivers/pinctrl/pinctrl-ocelot.c
index d02d16084f33..96b4e9fceb78 100644
--- a/drivers/pinctrl/pinctrl-ocelot.c
+++ b/drivers/pinctrl/pinctrl-ocelot.c
@@ -1795,7 +1795,7 @@ static int ocelot_gpio_direction_output(struct gpio_chip *chip,
 		regmap_write(info->map, REG(OCELOT_GPIO_OUT_CLR, info, offset),
 			     pin);
 
-	return pinctrl_gpio_direction_output_new(chip, offset);
+	return pinctrl_gpio_direction_output(chip, offset);
 }
 
 static const struct gpio_chip ocelot_gpiolib_chip = {
diff --git a/drivers/pinctrl/pinctrl-rk805.c b/drivers/pinctrl/pinctrl-rk805.c
index f0feb3a8e630..968f066eb9da 100644
--- a/drivers/pinctrl/pinctrl-rk805.c
+++ b/drivers/pinctrl/pinctrl-rk805.c
@@ -296,7 +296,7 @@ static int rk805_gpio_direction_output(struct gpio_chip *chip,
 				       unsigned int offset, int value)
 {
 	rk805_gpio_set(chip, offset, value);
-	return pinctrl_gpio_direction_output_new(chip, offset);
+	return pinctrl_gpio_direction_output(chip, offset);
 }
 
 static int rk805_gpio_get_direction(struct gpio_chip *chip, unsigned int offset)
diff --git a/drivers/pinctrl/pinctrl-st.c b/drivers/pinctrl/pinctrl-st.c
index 3f6631716779..db511dbd50e5 100644
--- a/drivers/pinctrl/pinctrl-st.c
+++ b/drivers/pinctrl/pinctrl-st.c
@@ -730,7 +730,7 @@ static int st_gpio_direction_output(struct gpio_chip *chip,
 	struct st_gpio_bank *bank = gpiochip_get_data(chip);
 
 	__st_gpio_set(bank, offset, value);
-	pinctrl_gpio_direction_output_new(chip, offset);
+	pinctrl_gpio_direction_output(chip, offset);
 
 	return 0;
 }
diff --git a/drivers/pinctrl/renesas/gpio.c b/drivers/pinctrl/renesas/gpio.c
index 54346b7fcdef..a5136dacaaf2 100644
--- a/drivers/pinctrl/renesas/gpio.c
+++ b/drivers/pinctrl/renesas/gpio.c
@@ -172,7 +172,7 @@ static int gpio_pin_direction_output(struct gpio_chip *gc, unsigned offset,
 {
 	gpio_pin_set_value(gpiochip_get_data(gc), offset, value);
 
-	return pinctrl_gpio_direction_output_new(gc, offset);
+	return pinctrl_gpio_direction_output(gc, offset);
 }
 
 static int gpio_pin_get(struct gpio_chip *gc, unsigned offset)
diff --git a/drivers/pinctrl/stm32/pinctrl-stm32.c b/drivers/pinctrl/stm32/pinctrl-stm32.c
index cbe306f47c0a..917b87acbf0d 100644
--- a/drivers/pinctrl/stm32/pinctrl-stm32.c
+++ b/drivers/pinctrl/stm32/pinctrl-stm32.c
@@ -250,7 +250,7 @@ static int stm32_gpio_direction_output(struct gpio_chip *chip,
 	struct stm32_gpio_bank *bank = gpiochip_get_data(chip);
 
 	__stm32_gpio_set(bank, offset, value);
-	pinctrl_gpio_direction_output_new(chip, offset);
+	pinctrl_gpio_direction_output(chip, offset);
 
 	return 0;
 }
diff --git a/drivers/pinctrl/vt8500/pinctrl-wmt.c b/drivers/pinctrl/vt8500/pinctrl-wmt.c
index 54cd556a16b8..639446bba373 100644
--- a/drivers/pinctrl/vt8500/pinctrl-wmt.c
+++ b/drivers/pinctrl/vt8500/pinctrl-wmt.c
@@ -535,7 +535,7 @@ static int wmt_gpio_direction_output(struct gpio_chip *chip, unsigned offset,
 				     int value)
 {
 	wmt_gpio_set_value(chip, offset, value);
-	return pinctrl_gpio_direction_output_new(chip, offset);
+	return pinctrl_gpio_direction_output(chip, offset);
 }
 
 static const struct gpio_chip wmt_gpio_chip = {
diff --git a/include/linux/pinctrl/consumer.h b/include/linux/pinctrl/consumer.h
index 9531595d1abd..e7d660e98e86 100644
--- a/include/linux/pinctrl/consumer.h
+++ b/include/linux/pinctrl/consumer.h
@@ -31,8 +31,8 @@ int pinctrl_gpio_request(struct gpio_chip *gc, unsigned int offset);
 void pinctrl_gpio_free(struct gpio_chip *gc, unsigned int offset);
 int pinctrl_gpio_direction_input(struct gpio_chip *gc,
 				 unsigned int offset);
-int pinctrl_gpio_direction_output_new(struct gpio_chip *gc,
-				      unsigned int offset);
+int pinctrl_gpio_direction_output(struct gpio_chip *gc,
+				  unsigned int offset);
 int pinctrl_gpio_set_config_new(struct gpio_chip *gc, unsigned int offset,
 				unsigned long config);
 
@@ -91,7 +91,7 @@ pinctrl_gpio_direction_input(struct gpio_chip *gc, unsigned int offset)
 }
 
 static inline int
-pinctrl_gpio_direction_output_new(struct gpio_chip *gc, unsigned int offset)
+pinctrl_gpio_direction_output(struct gpio_chip *gc, unsigned int offset)
 {
 	return 0;
 }
-- 
cgit v1.2.3


From acf2981b84c344428baa10dbc9234749c68dedae Mon Sep 17 00:00:00 2001
From: Bartosz Golaszewski <bartosz.golaszewski@linaro.org>
Date: Tue, 3 Oct 2023 12:01:01 +0200
Subject: treewide: rename pinctrl_gpio_set_config_new()

Now that pinctrl_gpio_set_config() is no longer used, let's drop the
'_new' suffix from its improved variant.

Signed-off-by: Bartosz Golaszewski <bartosz.golaszewski@linaro.org>
Acked-by: Linus Walleij <linus.walleij@linaro.org>
---
 drivers/gpio/gpio-aspeed.c       | 2 +-
 drivers/gpio/gpiolib.c           | 2 +-
 drivers/pinctrl/core.c           | 6 +++---
 include/linux/pinctrl/consumer.h | 4 ++--
 4 files changed, 7 insertions(+), 7 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/gpio/gpio-aspeed.c b/drivers/gpio/gpio-aspeed.c
index 4c417c8486dd..04c03402db6d 100644
--- a/drivers/gpio/gpio-aspeed.c
+++ b/drivers/gpio/gpio-aspeed.c
@@ -973,7 +973,7 @@ static int aspeed_gpio_set_config(struct gpio_chip *chip, unsigned int offset,
 	else if (param == PIN_CONFIG_BIAS_DISABLE ||
 			param == PIN_CONFIG_BIAS_PULL_DOWN ||
 			param == PIN_CONFIG_DRIVE_STRENGTH)
-		return pinctrl_gpio_set_config_new(chip, offset, config);
+		return pinctrl_gpio_set_config(chip, offset, config);
 	else if (param == PIN_CONFIG_DRIVE_OPEN_DRAIN ||
 			param == PIN_CONFIG_DRIVE_OPEN_SOURCE)
 		/* Return -ENOTSUPP to trigger emulation, as per datasheet */
diff --git a/drivers/gpio/gpiolib.c b/drivers/gpio/gpiolib.c
index 449e00a47296..95d2a7b2ea3e 100644
--- a/drivers/gpio/gpiolib.c
+++ b/drivers/gpio/gpiolib.c
@@ -2043,7 +2043,7 @@ EXPORT_SYMBOL_GPL(gpiochip_generic_free);
 int gpiochip_generic_config(struct gpio_chip *gc, unsigned int offset,
 			    unsigned long config)
 {
-	return pinctrl_gpio_set_config_new(gc, offset, config);
+	return pinctrl_gpio_set_config(gc, offset, config);
 }
 EXPORT_SYMBOL_GPL(gpiochip_generic_config);
 
diff --git a/drivers/pinctrl/core.c b/drivers/pinctrl/core.c
index fa49bb47dc7b..e26f94436f26 100644
--- a/drivers/pinctrl/core.c
+++ b/drivers/pinctrl/core.c
@@ -904,7 +904,7 @@ int pinctrl_gpio_direction_output(struct gpio_chip *gc, unsigned int offset)
 EXPORT_SYMBOL_GPL(pinctrl_gpio_direction_output);
 
 /**
- * pinctrl_gpio_set_config_new() - Apply config to given GPIO pin
+ * pinctrl_gpio_set_config() - Apply config to given GPIO pin
  * @gc: GPIO chip structure from the GPIO subsystem
  * @offset: hardware offset of the GPIO relative to the controller
  * @config: the configuration to apply to the GPIO
@@ -913,7 +913,7 @@ EXPORT_SYMBOL_GPL(pinctrl_gpio_direction_output);
  * they need to call the underlying pin controller to change GPIO config
  * (for example set debounce time).
  */
-int pinctrl_gpio_set_config_new(struct gpio_chip *gc, unsigned int offset,
+int pinctrl_gpio_set_config(struct gpio_chip *gc, unsigned int offset,
 				unsigned long config)
 {
 	unsigned long configs[] = { config };
@@ -933,7 +933,7 @@ int pinctrl_gpio_set_config_new(struct gpio_chip *gc, unsigned int offset,
 
 	return ret;
 }
-EXPORT_SYMBOL_GPL(pinctrl_gpio_set_config_new);
+EXPORT_SYMBOL_GPL(pinctrl_gpio_set_config);
 
 static struct pinctrl_state *find_state(struct pinctrl *p,
 					const char *name)
diff --git a/include/linux/pinctrl/consumer.h b/include/linux/pinctrl/consumer.h
index e7d660e98e86..73de70362b98 100644
--- a/include/linux/pinctrl/consumer.h
+++ b/include/linux/pinctrl/consumer.h
@@ -33,7 +33,7 @@ int pinctrl_gpio_direction_input(struct gpio_chip *gc,
 				 unsigned int offset);
 int pinctrl_gpio_direction_output(struct gpio_chip *gc,
 				  unsigned int offset);
-int pinctrl_gpio_set_config_new(struct gpio_chip *gc, unsigned int offset,
+int pinctrl_gpio_set_config(struct gpio_chip *gc, unsigned int offset,
 				unsigned long config);
 
 struct pinctrl * __must_check pinctrl_get(struct device *dev);
@@ -97,7 +97,7 @@ pinctrl_gpio_direction_output(struct gpio_chip *gc, unsigned int offset)
 }
 
 static inline int
-pinctrl_gpio_set_config_new(struct gpio_chip *gc, unsigned int offset,
+pinctrl_gpio_set_config(struct gpio_chip *gc, unsigned int offset,
 			    unsigned long config)
 {
 	return 0;
-- 
cgit v1.2.3


From 8ae0e970319ac0b516d285650a744bab4ed3dd37 Mon Sep 17 00:00:00 2001
From: Jia He <justin.he@arm.com>
Date: Sat, 28 Oct 2023 10:20:58 +0000
Subject: dma-mapping: move dma_addressing_limited() out of line

This patch moves dma_addressing_limited() out of line, serving as a
preliminary step to prevent the introduction of a new publicly accessible
low-level helper when validating whether all system RAM is mapped within
the DMA mapping range.

Suggested-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jia He <justin.he@arm.com>
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 include/linux/dma-mapping.h | 19 +++++--------------
 kernel/dma/mapping.c        | 15 +++++++++++++++
 2 files changed, 20 insertions(+), 14 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/dma-mapping.h b/include/linux/dma-mapping.h
index f0ccca16a0ac..4a658de44ee9 100644
--- a/include/linux/dma-mapping.h
+++ b/include/linux/dma-mapping.h
@@ -144,6 +144,7 @@ bool dma_pci_p2pdma_supported(struct device *dev);
 int dma_set_mask(struct device *dev, u64 mask);
 int dma_set_coherent_mask(struct device *dev, u64 mask);
 u64 dma_get_required_mask(struct device *dev);
+bool dma_addressing_limited(struct device *dev);
 size_t dma_max_mapping_size(struct device *dev);
 size_t dma_opt_mapping_size(struct device *dev);
 bool dma_need_sync(struct device *dev, dma_addr_t dma_addr);
@@ -264,6 +265,10 @@ static inline u64 dma_get_required_mask(struct device *dev)
 {
 	return 0;
 }
+static inline bool dma_addressing_limited(struct device *dev)
+{
+	return false;
+}
 static inline size_t dma_max_mapping_size(struct device *dev)
 {
 	return 0;
@@ -465,20 +470,6 @@ static inline int dma_coerce_mask_and_coherent(struct device *dev, u64 mask)
 	return dma_set_mask_and_coherent(dev, mask);
 }
 
-/**
- * dma_addressing_limited - return if the device is addressing limited
- * @dev:	device to check
- *
- * Return %true if the devices DMA mask is too small to address all memory in
- * the system, else %false.  Lack of addressing bits is the prime reason for
- * bounce buffering, but might not be the only one.
- */
-static inline bool dma_addressing_limited(struct device *dev)
-{
-	return min_not_zero(dma_get_mask(dev), dev->bus_dma_limit) <
-			    dma_get_required_mask(dev);
-}
-
 static inline unsigned int dma_get_max_seg_size(struct device *dev)
 {
 	if (dev->dma_parms && dev->dma_parms->max_segment_size)
diff --git a/kernel/dma/mapping.c b/kernel/dma/mapping.c
index e323ca48f7f2..7789c86f7ba3 100644
--- a/kernel/dma/mapping.c
+++ b/kernel/dma/mapping.c
@@ -793,6 +793,21 @@ int dma_set_coherent_mask(struct device *dev, u64 mask)
 }
 EXPORT_SYMBOL(dma_set_coherent_mask);
 
+/**
+ * dma_addressing_limited - return if the device is addressing limited
+ * @dev:	device to check
+ *
+ * Return %true if the devices DMA mask is too small to address all memory in
+ * the system, else %false.  Lack of addressing bits is the prime reason for
+ * bounce buffering, but might not be the only one.
+ */
+bool dma_addressing_limited(struct device *dev)
+{
+	return min_not_zero(dma_get_mask(dev), dev->bus_dma_limit) <
+			    dma_get_required_mask(dev);
+}
+EXPORT_SYMBOL_GPL(dma_addressing_limited);
+
 size_t dma_max_mapping_size(struct device *dev)
 {
 	const struct dma_map_ops *ops = get_dma_ops(dev);
-- 
cgit v1.2.3


From 1a229d8690a0f8951fc4aa8b76a7efab0d8de342 Mon Sep 17 00:00:00 2001
From: Johan Hovold <johan+linaro@kernel.org>
Date: Mon, 6 Nov 2023 12:06:54 +0100
Subject: Revert "usb: phy: add usb phy notify port status API"

This reverts commit a08799cf17c22375752abfad3b4a2b34b3acb287.

The recently added Realtek PHY drivers depend on the new port status
notification mechanism which was built on the deprecated USB PHY
implementation and devicetree binding.

Specifically, using these PHYs would require describing the very same
PHY using both the generic "phy" property and the deprecated "usb-phy"
property which is clearly wrong.

We should not be building new functionality on top of the legacy USB PHY
implementation even if it is currently stuck in some kind of
transitional limbo.

Revert the new notification interface which is broken by design.

Fixes: a08799cf17c2 ("usb: phy: add usb phy notify port status API")
Cc: stable@vger.kernel.org      # 6.6
Cc: Stanley Chang <stanley_chang@realtek.com>
Signed-off-by: Johan Hovold <johan+linaro@kernel.org>
Link: https://lore.kernel.org/r/20231106110654.31090-4-johan+linaro@kernel.org
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/usb/core/hub.c  | 23 -----------------------
 include/linux/usb/phy.h | 13 -------------
 2 files changed, 36 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/usb/core/hub.c b/drivers/usb/core/hub.c
index b4584a0cd484..87480a6e6d93 100644
--- a/drivers/usb/core/hub.c
+++ b/drivers/usb/core/hub.c
@@ -622,29 +622,6 @@ static int hub_ext_port_status(struct usb_hub *hub, int port1, int type,
 		ret = 0;
 	}
 	mutex_unlock(&hub->status_mutex);
-
-	/*
-	 * There is no need to lock status_mutex here, because status_mutex
-	 * protects hub->status, and the phy driver only checks the port
-	 * status without changing the status.
-	 */
-	if (!ret) {
-		struct usb_device *hdev = hub->hdev;
-
-		/*
-		 * Only roothub will be notified of port state changes,
-		 * since the USB PHY only cares about changes at the next
-		 * level.
-		 */
-		if (is_root_hub(hdev)) {
-			struct usb_hcd *hcd = bus_to_hcd(hdev->bus);
-
-			if (hcd->usb_phy)
-				usb_phy_notify_port_status(hcd->usb_phy,
-							   port1 - 1, *status, *change);
-		}
-	}
-
 	return ret;
 }
 
diff --git a/include/linux/usb/phy.h b/include/linux/usb/phy.h
index b513749582d7..e4de6bc1f69b 100644
--- a/include/linux/usb/phy.h
+++ b/include/linux/usb/phy.h
@@ -144,10 +144,6 @@ struct usb_phy {
 	 */
 	int	(*set_wakeup)(struct usb_phy *x, bool enabled);
 
-	/* notify phy port status change */
-	int	(*notify_port_status)(struct usb_phy *x, int port,
-				      u16 portstatus, u16 portchange);
-
 	/* notify phy connect status change */
 	int	(*notify_connect)(struct usb_phy *x,
 			enum usb_device_speed speed);
@@ -320,15 +316,6 @@ usb_phy_set_wakeup(struct usb_phy *x, bool enabled)
 		return 0;
 }
 
-static inline int
-usb_phy_notify_port_status(struct usb_phy *x, int port, u16 portstatus, u16 portchange)
-{
-	if (x && x->notify_port_status)
-		return x->notify_port_status(x, port, portstatus, portchange);
-	else
-		return 0;
-}
-
 static inline int
 usb_phy_notify_connect(struct usb_phy *x, enum usb_device_speed speed)
 {
-- 
cgit v1.2.3


From 75276847f4e262a52ccaf1a1c6b929280ddf77f6 Mon Sep 17 00:00:00 2001
From: Mark O'Donovan <shiftee@posteo.net>
Date: Wed, 25 Oct 2023 10:51:23 +0000
Subject: nvme-auth: auth success1 msg always includes resp

In cases where RVALID is false, the response is still transmitted,
but is cleared to zero.

Relevant extract from the spec:
Response R2, if valid (i.e., if the RVALID field is set to 01h),
cleared to 0h otherwise

Signed-off-by: Mark O'Donovan <shiftee@posteo.net>
Reviewed-by: Hannes Reinecke <hare@suse.de>
Reviewed-by: Sagi Grimberg <sagi@grimberg.me>
Reviewed-by: Christoph Hellwig <hch@lst.de>
---
 drivers/nvme/host/auth.c | 5 +----
 include/linux/nvme.h     | 2 +-
 2 files changed, 2 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/nvme/host/auth.c b/drivers/nvme/host/auth.c
index eaefebb2a799..5885bb0d5a9a 100644
--- a/drivers/nvme/host/auth.c
+++ b/drivers/nvme/host/auth.c
@@ -339,10 +339,7 @@ static int nvme_auth_process_dhchap_success1(struct nvme_ctrl *ctrl,
 		struct nvme_dhchap_queue_context *chap)
 {
 	struct nvmf_auth_dhchap_success1_data *data = chap->buf;
-	size_t size = sizeof(*data);
-
-	if (chap->s2)
-		size += chap->hash_len;
+	size_t size = sizeof(*data) + chap->hash_len;
 
 	if (size > CHAP_BUF_SIZE) {
 		chap->status = NVME_AUTH_DHCHAP_FAILURE_INCORRECT_PAYLOAD;
diff --git a/include/linux/nvme.h b/include/linux/nvme.h
index a7ba74babad7..44325c068b6a 100644
--- a/include/linux/nvme.h
+++ b/include/linux/nvme.h
@@ -1732,7 +1732,7 @@ struct nvmf_auth_dhchap_success1_data {
 	__u8		rsvd2;
 	__u8		rvalid;
 	__u8		rsvd3[7];
-	/* 'hl' bytes of response value if 'rvalid' is set */
+	/* 'hl' bytes of response value */
 	__u8		rval[];
 };
 
-- 
cgit v1.2.3


From 6affe08aea5f3b630565676e227b41d55a6f009c Mon Sep 17 00:00:00 2001
From: Arnd Bergmann <arnd@arndb.de>
Date: Thu, 26 Oct 2023 15:08:03 +0200
Subject: nvme: common: make keyring and auth separate modules

When only the keyring module is included but auth is not, modpost
complains about the lack of a module license tag:

ERROR: modpost: missing MODULE_LICENSE() in drivers/nvme/common/nvme-common.o

Address this by making both modules buildable standalone,
removing the now unnecessary CONFIG_NVME_COMMON symbol
in the process.

Also, now that NVME_KEYRING config symbol can be either a module or
built-in, the stubs need to check for '#if IS_ENABLED' rather than a
simple '#ifdef'.

Fixes: 9d77eb5277849 ("nvme-keyring: register '.nvme' keyring")
Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Signed-off-by: Hannes Reinecke <hare@suse.de>
Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Keith Busch <kbusch@kernel.org>
---
 drivers/nvme/Makefile         | 2 +-
 drivers/nvme/common/Kconfig   | 7 ++-----
 drivers/nvme/common/Makefile  | 7 ++++---
 drivers/nvme/common/keyring.c | 2 ++
 drivers/nvme/host/Kconfig     | 2 --
 drivers/nvme/target/Kconfig   | 2 --
 include/linux/nvme-keyring.h  | 2 +-
 7 files changed, 10 insertions(+), 14 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/nvme/Makefile b/drivers/nvme/Makefile
index eedca8c72098..74f59ceed3d5 100644
--- a/drivers/nvme/Makefile
+++ b/drivers/nvme/Makefile
@@ -1,5 +1,5 @@
 # SPDX-License-Identifier: GPL-2.0-only
 
-obj-$(CONFIG_NVME_COMMON)		+= common/
+obj-y		+= common/
 obj-y		+= host/
 obj-y		+= target/
diff --git a/drivers/nvme/common/Kconfig b/drivers/nvme/common/Kconfig
index 06c8df00d1e2..244432e0b73d 100644
--- a/drivers/nvme/common/Kconfig
+++ b/drivers/nvme/common/Kconfig
@@ -1,14 +1,11 @@
 # SPDX-License-Identifier: GPL-2.0-only
 
-config NVME_COMMON
-       tristate
-
 config NVME_KEYRING
-       bool
+       tristate
        select KEYS
 
 config NVME_AUTH
-	bool
+	tristate
 	select CRYPTO
 	select CRYPTO_HMAC
 	select CRYPTO_SHA256
diff --git a/drivers/nvme/common/Makefile b/drivers/nvme/common/Makefile
index 0cbd0b0b8d49..681514cf2e2f 100644
--- a/drivers/nvme/common/Makefile
+++ b/drivers/nvme/common/Makefile
@@ -2,7 +2,8 @@
 
 ccflags-y			+= -I$(src)
 
-obj-$(CONFIG_NVME_COMMON)	+= nvme-common.o
+obj-$(CONFIG_NVME_AUTH)		+= nvme-auth.o
+obj-$(CONFIG_NVME_KEYRING)	+= nvme-keyring.o
 
-nvme-common-$(CONFIG_NVME_AUTH)	+= auth.o
-nvme-common-$(CONFIG_NVME_KEYRING) += keyring.o
+nvme-auth-y			+= auth.o
+nvme-keyring-y			+= keyring.o
diff --git a/drivers/nvme/common/keyring.c b/drivers/nvme/common/keyring.c
index f8d9a208397b..46d7a537dbc2 100644
--- a/drivers/nvme/common/keyring.c
+++ b/drivers/nvme/common/keyring.c
@@ -180,3 +180,5 @@ void nvme_keyring_exit(void)
 	key_put(nvme_keyring);
 }
 EXPORT_SYMBOL_GPL(nvme_keyring_exit);
+
+MODULE_LICENSE("GPL v2");
diff --git a/drivers/nvme/host/Kconfig b/drivers/nvme/host/Kconfig
index 48f7d72de5e9..8fe2dd619e80 100644
--- a/drivers/nvme/host/Kconfig
+++ b/drivers/nvme/host/Kconfig
@@ -95,7 +95,6 @@ config NVME_TCP
 config NVME_TCP_TLS
 	bool "NVMe over Fabrics TCP TLS encryption support"
 	depends on NVME_TCP
-	select NVME_COMMON
 	select NVME_KEYRING
 	select NET_HANDSHAKE
 	select KEYS
@@ -110,7 +109,6 @@ config NVME_TCP_TLS
 config NVME_HOST_AUTH
 	bool "NVM Express over Fabrics In-Band Authentication"
 	depends on NVME_CORE
-	select NVME_COMMON
 	select NVME_AUTH
 	help
 	  This provides support for NVMe over Fabrics In-Band Authentication.
diff --git a/drivers/nvme/target/Kconfig b/drivers/nvme/target/Kconfig
index fa479c9f5c3d..31633da9427c 100644
--- a/drivers/nvme/target/Kconfig
+++ b/drivers/nvme/target/Kconfig
@@ -87,7 +87,6 @@ config NVME_TARGET_TCP
 config NVME_TARGET_TCP_TLS
 	bool "NVMe over Fabrics TCP target TLS encryption support"
 	depends on NVME_TARGET_TCP
-	select NVME_COMMON
 	select NVME_KEYRING
 	select NET_HANDSHAKE
 	select KEYS
@@ -102,7 +101,6 @@ config NVME_TARGET_TCP_TLS
 config NVME_TARGET_AUTH
 	bool "NVMe over Fabrics In-band Authentication support"
 	depends on NVME_TARGET
-	select NVME_COMMON
 	select NVME_AUTH
 	help
 	  This enables support for NVMe over Fabrics In-band Authentication
diff --git a/include/linux/nvme-keyring.h b/include/linux/nvme-keyring.h
index 4efea9dd967c..6cc0696625f3 100644
--- a/include/linux/nvme-keyring.h
+++ b/include/linux/nvme-keyring.h
@@ -6,7 +6,7 @@
 #ifndef _NVME_KEYRING_H
 #define _NVME_KEYRING_H
 
-#ifdef CONFIG_NVME_KEYRING
+#if IS_ENABLED(CONFIG_NVME_KEYRING)
 
 key_serial_t nvme_tls_psk_default(struct key *keyring,
 		const char *hostnqn, const char *subnqn);
-- 
cgit v1.2.3


From 706add13676da7ad213b65e92b94af5efc8c4131 Mon Sep 17 00:00:00 2001
From: Hannes Reinecke <hare@suse.de>
Date: Thu, 26 Oct 2023 15:08:04 +0200
Subject: nvme: keyring: fix conditional compilation

The keyring and auth functions can be called from both the host and
the target side and are controlled by Kconfig options for each of the
combinations, but the declarations are controlled by #ifdef checks
on the shared Kconfig symbols.

This leads to link failures in combinations where one of the frontends
is built-in and the other one is a module, and the keyring code
ends up in a module that is not reachable from the builtin code:

ld: drivers/nvme/host/core.o: in function `nvme_core_exit':
core.c:(.exit.text+0x4): undefined reference to `nvme_keyring_exit'
ld: drivers/nvme/host/core.o: in function `nvme_core_init':
core.c:(.init.text+0x94): undefined reference to `nvme_keyring_init

ld: drivers/nvme/host/tcp.o: in function `nvme_tcp_setup_ctrl':
tcp.c:(.text+0x4c18): undefined reference to `nvme_tls_psk_default'

Address this by moving nvme_keyring_init()/nvme_keyring_exit() into
module init/exit functions for the keyring module.

Fixes: be8e82caa6859 ("nvme-tcp: enable TLS handshake upcall")
Signed-off-by: Hannes Reinecke <hare@suse.de>
Cc: Arnd Bergmann <arnd@arndb.de>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Keith Busch <kbusch@kernel.org>
---
 drivers/nvme/common/keyring.c | 9 +++++----
 drivers/nvme/host/core.c      | 9 +--------
 include/linux/nvme-keyring.h  | 8 --------
 3 files changed, 6 insertions(+), 20 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/nvme/common/keyring.c b/drivers/nvme/common/keyring.c
index 46d7a537dbc2..ee341b83eeba 100644
--- a/drivers/nvme/common/keyring.c
+++ b/drivers/nvme/common/keyring.c
@@ -151,7 +151,7 @@ key_serial_t nvme_tls_psk_default(struct key *keyring,
 }
 EXPORT_SYMBOL_GPL(nvme_tls_psk_default);
 
-int nvme_keyring_init(void)
+static int __init nvme_keyring_init(void)
 {
 	int err;
 
@@ -171,14 +171,15 @@ int nvme_keyring_init(void)
 	}
 	return 0;
 }
-EXPORT_SYMBOL_GPL(nvme_keyring_init);
 
-void nvme_keyring_exit(void)
+static void __exit nvme_keyring_exit(void)
 {
 	unregister_key_type(&nvme_tls_psk_key_type);
 	key_revoke(nvme_keyring);
 	key_put(nvme_keyring);
 }
-EXPORT_SYMBOL_GPL(nvme_keyring_exit);
 
 MODULE_LICENSE("GPL v2");
+MODULE_AUTHOR("Hannes Reinecke <hare@suse.de>");
+module_init(nvme_keyring_init);
+module_exit(nvme_keyring_exit);
diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index 75a1b58a7a43..88b54cdcbd68 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -25,7 +25,6 @@
 #include "nvme.h"
 #include "fabrics.h"
 #include <linux/nvme-auth.h>
-#include <linux/nvme-keyring.h>
 
 #define CREATE_TRACE_POINTS
 #include "trace.h"
@@ -4737,16 +4736,11 @@ static int __init nvme_core_init(void)
 		result = PTR_ERR(nvme_ns_chr_class);
 		goto unregister_generic_ns;
 	}
-	result = nvme_keyring_init();
-	if (result)
-		goto destroy_ns_chr;
 	result = nvme_init_auth();
 	if (result)
-		goto keyring_exit;
+		goto destroy_ns_chr;
 	return 0;
 
-keyring_exit:
-	nvme_keyring_exit();
 destroy_ns_chr:
 	class_destroy(nvme_ns_chr_class);
 unregister_generic_ns:
@@ -4770,7 +4764,6 @@ out:
 static void __exit nvme_core_exit(void)
 {
 	nvme_exit_auth();
-	nvme_keyring_exit();
 	class_destroy(nvme_ns_chr_class);
 	class_destroy(nvme_subsys_class);
 	class_destroy(nvme_class);
diff --git a/include/linux/nvme-keyring.h b/include/linux/nvme-keyring.h
index 6cc0696625f3..e10333d78dbb 100644
--- a/include/linux/nvme-keyring.h
+++ b/include/linux/nvme-keyring.h
@@ -12,8 +12,6 @@ key_serial_t nvme_tls_psk_default(struct key *keyring,
 		const char *hostnqn, const char *subnqn);
 
 key_serial_t nvme_keyring_id(void);
-int nvme_keyring_init(void);
-void nvme_keyring_exit(void);
 
 #else
 
@@ -26,11 +24,5 @@ static inline key_serial_t nvme_keyring_id(void)
 {
 	return 0;
 }
-static inline int nvme_keyring_init(void)
-{
-	return 0;
-}
-static inline void nvme_keyring_exit(void) {}
-
 #endif /* !CONFIG_NVME_KEYRING */
 #endif /* _NVME_KEYRING_H */
-- 
cgit v1.2.3


From 65120498aaf8d7320647a8b6d6de7db42e74ea52 Mon Sep 17 00:00:00 2001
From: Arnd Bergmann <arnd@arndb.de>
Date: Wed, 8 Nov 2023 13:58:27 +0100
Subject: stackleak: add declarations for global functions

With -Wmissing-prototypes enabled, the stackleak code produces a couple of
warnings that have no declarations because they are only called from assembler:

stackleak.c:127:25: error: no previous prototype for 'stackleak_erase' [-Werror=missing-prototypes]
stackleak.c:139:25: error: no previous prototype for 'stackleak_erase_on_task_stack' [-Werror=missing-prototypes]
stackleak.c:151:25: error: no previous prototype for 'stackleak_erase_off_task_stack' [-Werror=missing-prototypes]
stackleak.c:159:49: error: no previous prototype for 'stackleak_track_stack' [-Werror=missing-prototypes]

Add declarations to the stackleak header to shut up the warnings.

Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Link: https://lore.kernel.org/r/20231108125843.3806765-7-arnd@kernel.org
Signed-off-by: Kees Cook <keescook@chromium.org>
---
 include/linux/stackleak.h | 6 ++++++
 1 file changed, 6 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/stackleak.h b/include/linux/stackleak.h
index c36e7a3b45e7..3be2cb564710 100644
--- a/include/linux/stackleak.h
+++ b/include/linux/stackleak.h
@@ -14,6 +14,7 @@
 
 #ifdef CONFIG_GCC_PLUGIN_STACKLEAK
 #include <asm/stacktrace.h>
+#include <linux/linkage.h>
 
 /*
  * The lowest address on tsk's stack which we can plausibly erase.
@@ -76,6 +77,11 @@ static inline void stackleak_task_init(struct task_struct *t)
 # endif
 }
 
+asmlinkage void noinstr stackleak_erase(void);
+asmlinkage void noinstr stackleak_erase_on_task_stack(void);
+asmlinkage void noinstr stackleak_erase_off_task_stack(void);
+void __no_caller_saved_registers noinstr stackleak_track_stack(void);
+
 #else /* !CONFIG_GCC_PLUGIN_STACKLEAK */
 static inline void stackleak_task_init(struct task_struct *t) { }
 #endif
-- 
cgit v1.2.3


From 866d648059d5faf53f1cd960b43fe8365ad93ea7 Mon Sep 17 00:00:00 2001
From: Ondrej Mosnacek <omosnace@redhat.com>
Date: Tue, 31 Oct 2023 13:32:06 +0100
Subject: lsm: fix default return value for vm_enough_memory

1 is the return value that implements a "no-op" hook, not 0.

Cc: stable@vger.kernel.org
Fixes: 98e828a0650f ("security: Refactor declaration of LSM hooks")
Signed-off-by: Ondrej Mosnacek <omosnace@redhat.com>
Signed-off-by: Paul Moore <paul@paul-moore.com>
---
 include/linux/lsm_hook_defs.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/lsm_hook_defs.h b/include/linux/lsm_hook_defs.h
index 99b8176c3738..4dd55fdfec26 100644
--- a/include/linux/lsm_hook_defs.h
+++ b/include/linux/lsm_hook_defs.h
@@ -48,7 +48,7 @@ LSM_HOOK(int, 0, quota_on, struct dentry *dentry)
 LSM_HOOK(int, 0, syslog, int type)
 LSM_HOOK(int, 0, settime, const struct timespec64 *ts,
 	 const struct timezone *tz)
-LSM_HOOK(int, 0, vm_enough_memory, struct mm_struct *mm, long pages)
+LSM_HOOK(int, 1, vm_enough_memory, struct mm_struct *mm, long pages)
 LSM_HOOK(int, 0, bprm_creds_for_exec, struct linux_binprm *bprm)
 LSM_HOOK(int, 0, bprm_creds_from_file, struct linux_binprm *bprm, const struct file *file)
 LSM_HOOK(int, 0, bprm_check_security, struct linux_binprm *bprm)
-- 
cgit v1.2.3


From b36995b8609a5a8fe5cf259a1ee768fcaed919f8 Mon Sep 17 00:00:00 2001
From: Ondrej Mosnacek <omosnace@redhat.com>
Date: Tue, 31 Oct 2023 13:32:07 +0100
Subject: lsm: fix default return value for inode_getsecctx

-EOPNOTSUPP is the return value that implements a "no-op" hook, not 0.

Without this fix having only the BPF LSM enabled (with no programs
attached) can cause uninitialized variable reads in
nfsd4_encode_fattr(), because the BPF hook returns 0 without touching
the 'ctxlen' variable and the corresponding 'contextlen' variable in
nfsd4_encode_fattr() remains uninitialized, yet being treated as valid
based on the 0 return value.

Cc: stable@vger.kernel.org
Fixes: 98e828a0650f ("security: Refactor declaration of LSM hooks")
Reported-by: Benjamin Coddington <bcodding@redhat.com>
Signed-off-by: Ondrej Mosnacek <omosnace@redhat.com>
Signed-off-by: Paul Moore <paul@paul-moore.com>
---
 include/linux/lsm_hook_defs.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/lsm_hook_defs.h b/include/linux/lsm_hook_defs.h
index 4dd55fdfec26..ff217a5ce552 100644
--- a/include/linux/lsm_hook_defs.h
+++ b/include/linux/lsm_hook_defs.h
@@ -273,7 +273,7 @@ LSM_HOOK(void, LSM_RET_VOID, release_secctx, char *secdata, u32 seclen)
 LSM_HOOK(void, LSM_RET_VOID, inode_invalidate_secctx, struct inode *inode)
 LSM_HOOK(int, 0, inode_notifysecctx, struct inode *inode, void *ctx, u32 ctxlen)
 LSM_HOOK(int, 0, inode_setsecctx, struct dentry *dentry, void *ctx, u32 ctxlen)
-LSM_HOOK(int, 0, inode_getsecctx, struct inode *inode, void **ctx,
+LSM_HOOK(int, -EOPNOTSUPP, inode_getsecctx, struct inode *inode, void **ctx,
 	 u32 *ctxlen)
 
 #if defined(CONFIG_SECURITY) && defined(CONFIG_WATCH_QUEUE)
-- 
cgit v1.2.3


From d23b5c577715892c87533b13923306acc6243f93 Mon Sep 17 00:00:00 2001
From: Yafang Shao <laoar.shao@gmail.com>
Date: Sun, 29 Oct 2023 06:14:29 +0000
Subject: cgroup: Make operations on the cgroup root_list RCU safe

At present, when we perform operations on the cgroup root_list, we must
hold the cgroup_mutex, which is a relatively heavyweight lock. In reality,
we can make operations on this list RCU-safe, eliminating the need to hold
the cgroup_mutex during traversal. Modifications to the list only occur in
the cgroup root setup and destroy paths, which should be infrequent in a
production environment. In contrast, traversal may occur frequently.
Therefore, making it RCU-safe would be beneficial.

Signed-off-by: Yafang Shao <laoar.shao@gmail.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
---
 include/linux/cgroup-defs.h     |  1 +
 kernel/cgroup/cgroup-internal.h |  3 ++-
 kernel/cgroup/cgroup.c          | 23 ++++++++++++++++-------
 3 files changed, 19 insertions(+), 8 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h
index 4a6b6b77ccb6..4caab0c6b361 100644
--- a/include/linux/cgroup-defs.h
+++ b/include/linux/cgroup-defs.h
@@ -563,6 +563,7 @@ struct cgroup_root {
 
 	/* A list running through the active hierarchies */
 	struct list_head root_list;
+	struct rcu_head rcu;
 
 	/* Hierarchy-specific flags */
 	unsigned int flags;
diff --git a/kernel/cgroup/cgroup-internal.h b/kernel/cgroup/cgroup-internal.h
index c56071f150f2..5e17f01ced9f 100644
--- a/kernel/cgroup/cgroup-internal.h
+++ b/kernel/cgroup/cgroup-internal.h
@@ -170,7 +170,8 @@ extern struct list_head cgroup_roots;
 
 /* iterate across the hierarchies */
 #define for_each_root(root)						\
-	list_for_each_entry((root), &cgroup_roots, root_list)
+	list_for_each_entry_rcu((root), &cgroup_roots, root_list,	\
+				lockdep_is_held(&cgroup_mutex))
 
 /**
  * for_each_subsys - iterate all enabled cgroup subsystems
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index 3a436e4f0da1..19784d44d615 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -1315,7 +1315,7 @@ static void cgroup_exit_root_id(struct cgroup_root *root)
 
 void cgroup_free_root(struct cgroup_root *root)
 {
-	kfree(root);
+	kfree_rcu(root, rcu);
 }
 
 static void cgroup_destroy_root(struct cgroup_root *root)
@@ -1348,7 +1348,7 @@ static void cgroup_destroy_root(struct cgroup_root *root)
 	spin_unlock_irq(&css_set_lock);
 
 	WARN_ON_ONCE(list_empty(&root->root_list));
-	list_del(&root->root_list);
+	list_del_rcu(&root->root_list);
 	cgroup_root_count--;
 
 	if (!have_favordynmods)
@@ -1389,7 +1389,15 @@ static inline struct cgroup *__cset_cgroup_from_root(struct css_set *cset,
 		}
 	}
 
-	BUG_ON(!res_cgroup);
+	/*
+	 * If cgroup_mutex is not held, the cgrp_cset_link will be freed
+	 * before we remove the cgroup root from the root_list. Consequently,
+	 * when accessing a cgroup root, the cset_link may have already been
+	 * freed, resulting in a NULL res_cgroup. However, by holding the
+	 * cgroup_mutex, we ensure that res_cgroup can't be NULL.
+	 * If we don't hold cgroup_mutex in the caller, we must do the NULL
+	 * check.
+	 */
 	return res_cgroup;
 }
 
@@ -1448,7 +1456,6 @@ static struct cgroup *current_cgns_cgroup_dfl(void)
 static struct cgroup *cset_cgroup_from_root(struct css_set *cset,
 					    struct cgroup_root *root)
 {
-	lockdep_assert_held(&cgroup_mutex);
 	lockdep_assert_held(&css_set_lock);
 
 	return __cset_cgroup_from_root(cset, root);
@@ -1456,7 +1463,9 @@ static struct cgroup *cset_cgroup_from_root(struct css_set *cset,
 
 /*
  * Return the cgroup for "task" from the given hierarchy. Must be
- * called with cgroup_mutex and css_set_lock held.
+ * called with css_set_lock held to prevent task's groups from being modified.
+ * Must be called with either cgroup_mutex or rcu read lock to prevent the
+ * cgroup root from being destroyed.
  */
 struct cgroup *task_cgroup_from_root(struct task_struct *task,
 				     struct cgroup_root *root)
@@ -2031,7 +2040,7 @@ void init_cgroup_root(struct cgroup_fs_context *ctx)
 	struct cgroup_root *root = ctx->root;
 	struct cgroup *cgrp = &root->cgrp;
 
-	INIT_LIST_HEAD(&root->root_list);
+	INIT_LIST_HEAD_RCU(&root->root_list);
 	atomic_set(&root->nr_cgrps, 1);
 	cgrp->root = root;
 	init_cgroup_housekeeping(cgrp);
@@ -2114,7 +2123,7 @@ int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask)
 	 * care of subsystems' refcounts, which are explicitly dropped in
 	 * the failure exit path.
 	 */
-	list_add(&root->root_list, &cgroup_roots);
+	list_add_rcu(&root->root_list, &cgroup_roots);
 	cgroup_root_count++;
 
 	/*
-- 
cgit v1.2.3


From aecd408b7e50742868b3305c24325a89024e2a30 Mon Sep 17 00:00:00 2001
From: Yafang Shao <laoar.shao@gmail.com>
Date: Sun, 29 Oct 2023 06:14:32 +0000
Subject: cgroup: Add a new helper for cgroup1 hierarchy

A new helper is added for cgroup1 hierarchy:

- task_get_cgroup1
  Acquires the associated cgroup of a task within a specific cgroup1
  hierarchy. The cgroup1 hierarchy is identified by its hierarchy ID.

This helper function is added to facilitate the tracing of tasks within
a particular container or cgroup dir in BPF programs. It's important to
note that this helper is designed specifically for cgroup1 only.

tj: Use irsqsave/restore as suggested by Hou Tao <houtao@huaweicloud.com>.

Suggested-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Yafang Shao <laoar.shao@gmail.com>
Cc: Hou Tao <houtao@huaweicloud.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
---
 include/linux/cgroup.h          |  4 +++-
 kernel/cgroup/cgroup-internal.h |  1 -
 kernel/cgroup/cgroup-v1.c       | 34 ++++++++++++++++++++++++++++++++++
 3 files changed, 37 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index 0ef0af66080e..34aaf0e87def 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -69,6 +69,7 @@ struct css_task_iter {
 extern struct file_system_type cgroup_fs_type;
 extern struct cgroup_root cgrp_dfl_root;
 extern struct css_set init_css_set;
+extern spinlock_t css_set_lock;
 
 #define SUBSYS(_x) extern struct cgroup_subsys _x ## _cgrp_subsys;
 #include <linux/cgroup_subsys.h>
@@ -386,7 +387,6 @@ static inline void cgroup_unlock(void)
  * as locks used during the cgroup_subsys::attach() methods.
  */
 #ifdef CONFIG_PROVE_RCU
-extern spinlock_t css_set_lock;
 #define task_css_set_check(task, __c)					\
 	rcu_dereference_check((task)->cgroups,				\
 		rcu_read_lock_sched_held() ||				\
@@ -853,4 +853,6 @@ static inline void cgroup_bpf_put(struct cgroup *cgrp) {}
 
 #endif /* CONFIG_CGROUP_BPF */
 
+struct cgroup *task_get_cgroup1(struct task_struct *tsk, int hierarchy_id);
+
 #endif /* _LINUX_CGROUP_H */
diff --git a/kernel/cgroup/cgroup-internal.h b/kernel/cgroup/cgroup-internal.h
index 5e17f01ced9f..520b90dd97ec 100644
--- a/kernel/cgroup/cgroup-internal.h
+++ b/kernel/cgroup/cgroup-internal.h
@@ -164,7 +164,6 @@ struct cgroup_mgctx {
 #define DEFINE_CGROUP_MGCTX(name)						\
 	struct cgroup_mgctx name = CGROUP_MGCTX_INIT(name)
 
-extern spinlock_t css_set_lock;
 extern struct cgroup_subsys *cgroup_subsys[];
 extern struct list_head cgroup_roots;
 
diff --git a/kernel/cgroup/cgroup-v1.c b/kernel/cgroup/cgroup-v1.c
index 76db6c67e39a..04d11a7dd95f 100644
--- a/kernel/cgroup/cgroup-v1.c
+++ b/kernel/cgroup/cgroup-v1.c
@@ -1262,6 +1262,40 @@ int cgroup1_get_tree(struct fs_context *fc)
 	return ret;
 }
 
+/**
+ * task_get_cgroup1 - Acquires the associated cgroup of a task within a
+ * specific cgroup1 hierarchy. The cgroup1 hierarchy is identified by its
+ * hierarchy ID.
+ * @tsk: The target task
+ * @hierarchy_id: The ID of a cgroup1 hierarchy
+ *
+ * On success, the cgroup is returned. On failure, ERR_PTR is returned.
+ * We limit it to cgroup1 only.
+ */
+struct cgroup *task_get_cgroup1(struct task_struct *tsk, int hierarchy_id)
+{
+	struct cgroup *cgrp = ERR_PTR(-ENOENT);
+	struct cgroup_root *root;
+	unsigned long flags;
+
+	rcu_read_lock();
+	for_each_root(root) {
+		/* cgroup1 only*/
+		if (root == &cgrp_dfl_root)
+			continue;
+		if (root->hierarchy_id != hierarchy_id)
+			continue;
+		spin_lock_irqsave(&css_set_lock, flags);
+		cgrp = task_cgroup_from_root(tsk, root);
+		if (!cgrp || !cgroup_tryget(cgrp))
+			cgrp = ERR_PTR(-ENOENT);
+		spin_unlock_irqrestore(&css_set_lock, flags);
+		break;
+	}
+	rcu_read_unlock();
+	return cgrp;
+}
+
 static int __init cgroup1_wq_init(void)
 {
 	/*
-- 
cgit v1.2.3


From 74523c06ae20b83c5508a98af62393ac34913362 Mon Sep 17 00:00:00 2001
From: Song Liu <song@kernel.org>
Date: Mon, 6 Nov 2023 20:57:23 -0800
Subject: bpf: Add __bpf_dynptr_data* for in kernel use

Different types of bpf dynptr have different internal data storage.
Specifically, SKB and XDP type of dynptr may have non-continuous data.
Therefore, it is not always safe to directly access dynptr->data.

Add __bpf_dynptr_data and __bpf_dynptr_data_rw to replace direct access to
dynptr->data.

Update bpf_verify_pkcs7_signature to use __bpf_dynptr_data instead of
dynptr->data.

Signed-off-by: Song Liu <song@kernel.org>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Acked-by: Vadim Fedorenko <vadim.fedorenko@linux.dev>
Link: https://lore.kernel.org/bpf/20231107045725.2278852-2-song@kernel.org
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/bpf.h      |  2 ++
 kernel/bpf/helpers.c     | 19 +++++++++++++++++++
 kernel/trace/bpf_trace.c | 12 ++++++++----
 3 files changed, 29 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index b4825d3cdb29..eb84caf133df 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -1222,6 +1222,8 @@ enum bpf_dynptr_type {
 
 int bpf_dynptr_check_size(u32 size);
 u32 __bpf_dynptr_size(const struct bpf_dynptr_kern *ptr);
+const void *__bpf_dynptr_data(const struct bpf_dynptr_kern *ptr, u32 len);
+void *__bpf_dynptr_data_rw(const struct bpf_dynptr_kern *ptr, u32 len);
 
 #ifdef CONFIG_BPF_JIT
 int bpf_trampoline_link_prog(struct bpf_tramp_link *link, struct bpf_trampoline *tr);
diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
index 56b0c1f678ee..174f02a9e703 100644
--- a/kernel/bpf/helpers.c
+++ b/kernel/bpf/helpers.c
@@ -2618,3 +2618,22 @@ static int __init kfunc_init(void)
 }
 
 late_initcall(kfunc_init);
+
+/* Get a pointer to dynptr data up to len bytes for read only access. If
+ * the dynptr doesn't have continuous data up to len bytes, return NULL.
+ */
+const void *__bpf_dynptr_data(const struct bpf_dynptr_kern *ptr, u32 len)
+{
+	return bpf_dynptr_slice(ptr, 0, NULL, len);
+}
+
+/* Get a pointer to dynptr data up to len bytes for read write access. If
+ * the dynptr doesn't have continuous data up to len bytes, or the dynptr
+ * is read only, return NULL.
+ */
+void *__bpf_dynptr_data_rw(const struct bpf_dynptr_kern *ptr, u32 len)
+{
+	if (__bpf_dynptr_is_rdonly(ptr))
+		return NULL;
+	return (void *)__bpf_dynptr_data(ptr, len);
+}
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index 84e8a0f6e4e0..f0b8b7c29126 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -1376,6 +1376,8 @@ __bpf_kfunc int bpf_verify_pkcs7_signature(struct bpf_dynptr_kern *data_ptr,
 			       struct bpf_dynptr_kern *sig_ptr,
 			       struct bpf_key *trusted_keyring)
 {
+	const void *data, *sig;
+	u32 data_len, sig_len;
 	int ret;
 
 	if (trusted_keyring->has_ref) {
@@ -1392,10 +1394,12 @@ __bpf_kfunc int bpf_verify_pkcs7_signature(struct bpf_dynptr_kern *data_ptr,
 			return ret;
 	}
 
-	return verify_pkcs7_signature(data_ptr->data,
-				      __bpf_dynptr_size(data_ptr),
-				      sig_ptr->data,
-				      __bpf_dynptr_size(sig_ptr),
+	data_len = __bpf_dynptr_size(data_ptr);
+	data = __bpf_dynptr_data(data_ptr, data_len);
+	sig_len = __bpf_dynptr_size(sig_ptr);
+	sig = __bpf_dynptr_data(sig_ptr, sig_len);
+
+	return verify_pkcs7_signature(data, data_len, sig, sig_len,
 				      trusted_keyring->key,
 				      VERIFYING_UNSPECIFIED_SIGNATURE, NULL,
 				      NULL);
-- 
cgit v1.2.3


From 790ce3cfefb1b768dccd4eee324ddef0f0ce3db4 Mon Sep 17 00:00:00 2001
From: Dave Marchevsky <davemarchevsky@fb.com>
Date: Tue, 7 Nov 2023 00:56:37 -0800
Subject: bpf: Move GRAPH_{ROOT,NODE}_MASK macros into btf_field_type enum

This refactoring patch removes the unused BPF_GRAPH_NODE_OR_ROOT
btf_field_type and moves BPF_GRAPH_{NODE,ROOT} macros into the
btf_field_type enum. Further patches in the series will use
BPF_GRAPH_NODE, so let's move this useful definition out of btf.c.

Signed-off-by: Dave Marchevsky <davemarchevsky@fb.com>
Link: https://lore.kernel.org/r/20231107085639.3016113-5-davemarchevsky@fb.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/bpf.h |  4 ++--
 kernel/bpf/btf.c    | 11 ++++-------
 2 files changed, 6 insertions(+), 9 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index eb84caf133df..4001d11be151 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -186,8 +186,8 @@ enum btf_field_type {
 	BPF_LIST_NODE  = (1 << 6),
 	BPF_RB_ROOT    = (1 << 7),
 	BPF_RB_NODE    = (1 << 8),
-	BPF_GRAPH_NODE_OR_ROOT = BPF_LIST_NODE | BPF_LIST_HEAD |
-				 BPF_RB_NODE | BPF_RB_ROOT,
+	BPF_GRAPH_NODE = BPF_RB_NODE | BPF_LIST_NODE,
+	BPF_GRAPH_ROOT = BPF_RB_ROOT | BPF_LIST_HEAD,
 	BPF_REFCOUNT   = (1 << 9),
 };
 
diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
index 15d71d2986d3..63cf4128fc05 100644
--- a/kernel/bpf/btf.c
+++ b/kernel/bpf/btf.c
@@ -3840,9 +3840,6 @@ end:
 	return ERR_PTR(ret);
 }
 
-#define GRAPH_ROOT_MASK (BPF_LIST_HEAD | BPF_RB_ROOT)
-#define GRAPH_NODE_MASK (BPF_LIST_NODE | BPF_RB_NODE)
-
 int btf_check_and_fixup_fields(const struct btf *btf, struct btf_record *rec)
 {
 	int i;
@@ -3855,13 +3852,13 @@ int btf_check_and_fixup_fields(const struct btf *btf, struct btf_record *rec)
 	 * Hence we only need to ensure that bpf_{list_head,rb_root} ownership
 	 * does not form cycles.
 	 */
-	if (IS_ERR_OR_NULL(rec) || !(rec->field_mask & GRAPH_ROOT_MASK))
+	if (IS_ERR_OR_NULL(rec) || !(rec->field_mask & BPF_GRAPH_ROOT))
 		return 0;
 	for (i = 0; i < rec->cnt; i++) {
 		struct btf_struct_meta *meta;
 		u32 btf_id;
 
-		if (!(rec->fields[i].type & GRAPH_ROOT_MASK))
+		if (!(rec->fields[i].type & BPF_GRAPH_ROOT))
 			continue;
 		btf_id = rec->fields[i].graph_root.value_btf_id;
 		meta = btf_find_struct_meta(btf, btf_id);
@@ -3873,7 +3870,7 @@ int btf_check_and_fixup_fields(const struct btf *btf, struct btf_record *rec)
 		 * to check ownership cycle for a type unless it's also a
 		 * node type.
 		 */
-		if (!(rec->field_mask & GRAPH_NODE_MASK))
+		if (!(rec->field_mask & BPF_GRAPH_NODE))
 			continue;
 
 		/* We need to ensure ownership acyclicity among all types. The
@@ -3909,7 +3906,7 @@ int btf_check_and_fixup_fields(const struct btf *btf, struct btf_record *rec)
 		 * - A is both an root and node.
 		 * - B is only an node.
 		 */
-		if (meta->record->field_mask & GRAPH_ROOT_MASK)
+		if (meta->record->field_mask & BPF_GRAPH_ROOT)
 			return -ELOOP;
 	}
 	return 0;
-- 
cgit v1.2.3


From 3feb263bb516ee7e1da0acd22b15afbb9a7daa19 Mon Sep 17 00:00:00 2001
From: Andrii Nakryiko <andrii@kernel.org>
Date: Thu, 9 Nov 2023 16:26:36 -0800
Subject: bpf: handle ldimm64 properly in check_cfg()

ldimm64 instructions are 16-byte long, and so have to be handled
appropriately in check_cfg(), just like the rest of BPF verifier does.

This has implications in three places:
  - when determining next instruction for non-jump instructions;
  - when determining next instruction for callback address ldimm64
    instructions (in visit_func_call_insn());
  - when checking for unreachable instructions, where second half of
    ldimm64 is expected to be unreachable;

We take this also as an opportunity to report jump into the middle of
ldimm64. And adjust few test_verifier tests accordingly.

Acked-by: Eduard Zingerman <eddyz87@gmail.com>
Reported-by: Hao Sun <sunhao.th@gmail.com>
Fixes: 475fb78fbf48 ("bpf: verifier (add branch/goto checks)")
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/r/20231110002638.4168352-2-andrii@kernel.org
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/bpf.h                             |  8 ++++++--
 kernel/bpf/verifier.c                           | 27 ++++++++++++++++++-------
 tools/testing/selftests/bpf/verifier/ld_imm64.c |  8 ++++----
 3 files changed, 30 insertions(+), 13 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index b4825d3cdb29..35bff17396c0 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -909,10 +909,14 @@ bpf_ctx_record_field_size(struct bpf_insn_access_aux *aux, u32 size)
 	aux->ctx_field_size = size;
 }
 
+static bool bpf_is_ldimm64(const struct bpf_insn *insn)
+{
+	return insn->code == (BPF_LD | BPF_IMM | BPF_DW);
+}
+
 static inline bool bpf_pseudo_func(const struct bpf_insn *insn)
 {
-	return insn->code == (BPF_LD | BPF_IMM | BPF_DW) &&
-	       insn->src_reg == BPF_PSEUDO_FUNC;
+	return bpf_is_ldimm64(insn) && insn->src_reg == BPF_PSEUDO_FUNC;
 }
 
 struct bpf_prog_ops {
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index bd1c42eb540f..b87715b364fd 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -15439,15 +15439,16 @@ static int visit_func_call_insn(int t, struct bpf_insn *insns,
 				struct bpf_verifier_env *env,
 				bool visit_callee)
 {
-	int ret;
+	int ret, insn_sz;
 
-	ret = push_insn(t, t + 1, FALLTHROUGH, env, false);
+	insn_sz = bpf_is_ldimm64(&insns[t]) ? 2 : 1;
+	ret = push_insn(t, t + insn_sz, FALLTHROUGH, env, false);
 	if (ret)
 		return ret;
 
-	mark_prune_point(env, t + 1);
+	mark_prune_point(env, t + insn_sz);
 	/* when we exit from subprog, we need to record non-linear history */
-	mark_jmp_point(env, t + 1);
+	mark_jmp_point(env, t + insn_sz);
 
 	if (visit_callee) {
 		mark_prune_point(env, t);
@@ -15469,15 +15470,17 @@ static int visit_func_call_insn(int t, struct bpf_insn *insns,
 static int visit_insn(int t, struct bpf_verifier_env *env)
 {
 	struct bpf_insn *insns = env->prog->insnsi, *insn = &insns[t];
-	int ret, off;
+	int ret, off, insn_sz;
 
 	if (bpf_pseudo_func(insn))
 		return visit_func_call_insn(t, insns, env, true);
 
 	/* All non-branch instructions have a single fall-through edge. */
 	if (BPF_CLASS(insn->code) != BPF_JMP &&
-	    BPF_CLASS(insn->code) != BPF_JMP32)
-		return push_insn(t, t + 1, FALLTHROUGH, env, false);
+	    BPF_CLASS(insn->code) != BPF_JMP32) {
+		insn_sz = bpf_is_ldimm64(insn) ? 2 : 1;
+		return push_insn(t, t + insn_sz, FALLTHROUGH, env, false);
+	}
 
 	switch (BPF_OP(insn->code)) {
 	case BPF_EXIT:
@@ -15607,11 +15610,21 @@ walk_cfg:
 	}
 
 	for (i = 0; i < insn_cnt; i++) {
+		struct bpf_insn *insn = &env->prog->insnsi[i];
+
 		if (insn_state[i] != EXPLORED) {
 			verbose(env, "unreachable insn %d\n", i);
 			ret = -EINVAL;
 			goto err_free;
 		}
+		if (bpf_is_ldimm64(insn)) {
+			if (insn_state[i + 1] != 0) {
+				verbose(env, "jump into the middle of ldimm64 insn %d\n", i);
+				ret = -EINVAL;
+				goto err_free;
+			}
+			i++; /* skip second half of ldimm64 */
+		}
 	}
 	ret = 0; /* cfg looks good */
 
diff --git a/tools/testing/selftests/bpf/verifier/ld_imm64.c b/tools/testing/selftests/bpf/verifier/ld_imm64.c
index f9297900cea6..78f19c255f20 100644
--- a/tools/testing/selftests/bpf/verifier/ld_imm64.c
+++ b/tools/testing/selftests/bpf/verifier/ld_imm64.c
@@ -9,8 +9,8 @@
 	BPF_MOV64_IMM(BPF_REG_0, 2),
 	BPF_EXIT_INSN(),
 	},
-	.errstr = "invalid BPF_LD_IMM insn",
-	.errstr_unpriv = "R1 pointer comparison",
+	.errstr = "jump into the middle of ldimm64 insn 1",
+	.errstr_unpriv = "jump into the middle of ldimm64 insn 1",
 	.result = REJECT,
 },
 {
@@ -23,8 +23,8 @@
 	BPF_LD_IMM64(BPF_REG_0, 1),
 	BPF_EXIT_INSN(),
 	},
-	.errstr = "invalid BPF_LD_IMM insn",
-	.errstr_unpriv = "R1 pointer comparison",
+	.errstr = "jump into the middle of ldimm64 insn 1",
+	.errstr_unpriv = "jump into the middle of ldimm64 insn 1",
 	.result = REJECT,
 },
 {
-- 
cgit v1.2.3


From d27abbfd4888d79dd24baf50e774631046ac4732 Mon Sep 17 00:00:00 2001
From: Dan Carpenter <dan.carpenter@linaro.org>
Date: Wed, 25 Oct 2023 14:58:18 +0300
Subject: pwm: Fix double shift bug
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

These enums are passed to set/test_bit().  The set/test_bit() functions
take a bit number instead of a shifted value.  Passing a shifted value
is a double shift bug like doing BIT(BIT(1)).  The double shift bug
doesn't cause a problem here because we are only checking 0 and 1 but
if the value was 5 or above then it can lead to a buffer overflow.

Signed-off-by: Dan Carpenter <dan.carpenter@linaro.org>
Reviewed-by: Uwe Kleine-König <u.kleine-koenig@pengutronix.de>
Reviewed-by: Sam Protsenko <semen.protsenko@linaro.org>
Signed-off-by: Thierry Reding <thierry.reding@gmail.com>
---
 include/linux/pwm.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/pwm.h b/include/linux/pwm.h
index e3b437587b32..cda3597b84f2 100644
--- a/include/linux/pwm.h
+++ b/include/linux/pwm.h
@@ -41,8 +41,8 @@ struct pwm_args {
 };
 
 enum {
-	PWMF_REQUESTED = 1 << 0,
-	PWMF_EXPORTED = 1 << 1,
+	PWMF_REQUESTED = 0,
+	PWMF_EXPORTED = 1,
 };
 
 /*
-- 
cgit v1.2.3


From abc28463c81853e4fdf8d009f71b2a3ce62a6f40 Mon Sep 17 00:00:00 2001
From: Arnd Bergmann <arnd@arndb.de>
Date: Fri, 10 Nov 2023 19:59:05 +0900
Subject: kprobes: unify kprobes_exceptions_nofify() prototypes

Most architectures that support kprobes declare this function in their
own asm/kprobes.h header and provide an override, but some are missing
the prototype, which causes a warning for the __weak stub implementation:

kernel/kprobes.c:1865:12: error: no previous prototype for 'kprobe_exceptions_notify' [-Werror=missing-prototypes]
 1865 | int __weak kprobe_exceptions_notify(struct notifier_block *self,

Move the prototype into linux/kprobes.h so it is visible to all
the definitions.

Link: https://lore.kernel.org/all/20231108125843.3806765-4-arnd@kernel.org/

Acked-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>
Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Signed-off-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>
---
 arch/arc/include/asm/kprobes.h     | 3 ---
 arch/arm/include/asm/kprobes.h     | 2 --
 arch/arm64/include/asm/kprobes.h   | 2 --
 arch/mips/include/asm/kprobes.h    | 2 --
 arch/powerpc/include/asm/kprobes.h | 2 --
 arch/s390/include/asm/kprobes.h    | 2 --
 arch/sh/include/asm/kprobes.h      | 2 --
 arch/sparc/include/asm/kprobes.h   | 2 --
 arch/x86/include/asm/kprobes.h     | 2 --
 include/linux/kprobes.h            | 4 ++++
 10 files changed, 4 insertions(+), 19 deletions(-)

(limited to 'include/linux')

diff --git a/arch/arc/include/asm/kprobes.h b/arch/arc/include/asm/kprobes.h
index de1566e32cb8..68e8301c0df2 100644
--- a/arch/arc/include/asm/kprobes.h
+++ b/arch/arc/include/asm/kprobes.h
@@ -32,9 +32,6 @@ struct kprobe;
 
 void arch_remove_kprobe(struct kprobe *p);
 
-int kprobe_exceptions_notify(struct notifier_block *self,
-			     unsigned long val, void *data);
-
 struct prev_kprobe {
 	struct kprobe *kp;
 	unsigned long status;
diff --git a/arch/arm/include/asm/kprobes.h b/arch/arm/include/asm/kprobes.h
index e26a278d301a..5b8dbf1b0be4 100644
--- a/arch/arm/include/asm/kprobes.h
+++ b/arch/arm/include/asm/kprobes.h
@@ -40,8 +40,6 @@ struct kprobe_ctlblk {
 
 void arch_remove_kprobe(struct kprobe *);
 int kprobe_fault_handler(struct pt_regs *regs, unsigned int fsr);
-int kprobe_exceptions_notify(struct notifier_block *self,
-			     unsigned long val, void *data);
 
 /* optinsn template addresses */
 extern __visible kprobe_opcode_t optprobe_template_entry[];
diff --git a/arch/arm64/include/asm/kprobes.h b/arch/arm64/include/asm/kprobes.h
index 05cd82eeca13..be7a3680dadf 100644
--- a/arch/arm64/include/asm/kprobes.h
+++ b/arch/arm64/include/asm/kprobes.h
@@ -37,8 +37,6 @@ struct kprobe_ctlblk {
 
 void arch_remove_kprobe(struct kprobe *);
 int kprobe_fault_handler(struct pt_regs *regs, unsigned int fsr);
-int kprobe_exceptions_notify(struct notifier_block *self,
-			     unsigned long val, void *data);
 void __kretprobe_trampoline(void);
 void __kprobes *trampoline_probe_handler(struct pt_regs *regs);
 
diff --git a/arch/mips/include/asm/kprobes.h b/arch/mips/include/asm/kprobes.h
index 68b1e5d458cf..bc27d99c9436 100644
--- a/arch/mips/include/asm/kprobes.h
+++ b/arch/mips/include/asm/kprobes.h
@@ -71,8 +71,6 @@ struct kprobe_ctlblk {
 	struct prev_kprobe prev_kprobe;
 };
 
-extern int kprobe_exceptions_notify(struct notifier_block *self,
-				    unsigned long val, void *data);
 
 #endif /* CONFIG_KPROBES */
 #endif /* _ASM_KPROBES_H */
diff --git a/arch/powerpc/include/asm/kprobes.h b/arch/powerpc/include/asm/kprobes.h
index c8e4b4fd4e33..4525a9c68260 100644
--- a/arch/powerpc/include/asm/kprobes.h
+++ b/arch/powerpc/include/asm/kprobes.h
@@ -84,8 +84,6 @@ struct arch_optimized_insn {
 	kprobe_opcode_t *insn;
 };
 
-extern int kprobe_exceptions_notify(struct notifier_block *self,
-					unsigned long val, void *data);
 extern int kprobe_fault_handler(struct pt_regs *regs, int trapnr);
 extern int kprobe_handler(struct pt_regs *regs);
 extern int kprobe_post_handler(struct pt_regs *regs);
diff --git a/arch/s390/include/asm/kprobes.h b/arch/s390/include/asm/kprobes.h
index 21b9e5290c04..01f1682a73b7 100644
--- a/arch/s390/include/asm/kprobes.h
+++ b/arch/s390/include/asm/kprobes.h
@@ -73,8 +73,6 @@ struct kprobe_ctlblk {
 void arch_remove_kprobe(struct kprobe *p);
 
 int kprobe_fault_handler(struct pt_regs *regs, int trapnr);
-int kprobe_exceptions_notify(struct notifier_block *self,
-	unsigned long val, void *data);
 
 #define flush_insn_slot(p)	do { } while (0)
 
diff --git a/arch/sh/include/asm/kprobes.h b/arch/sh/include/asm/kprobes.h
index eeba83e0a7d2..65d4c3316a5b 100644
--- a/arch/sh/include/asm/kprobes.h
+++ b/arch/sh/include/asm/kprobes.h
@@ -46,8 +46,6 @@ struct kprobe_ctlblk {
 };
 
 extern int kprobe_fault_handler(struct pt_regs *regs, int trapnr);
-extern int kprobe_exceptions_notify(struct notifier_block *self,
-				    unsigned long val, void *data);
 extern int kprobe_handle_illslot(unsigned long pc);
 #else
 
diff --git a/arch/sparc/include/asm/kprobes.h b/arch/sparc/include/asm/kprobes.h
index 06c2bc767ef7..aec742cd898f 100644
--- a/arch/sparc/include/asm/kprobes.h
+++ b/arch/sparc/include/asm/kprobes.h
@@ -47,8 +47,6 @@ struct kprobe_ctlblk {
 	struct prev_kprobe prev_kprobe;
 };
 
-int kprobe_exceptions_notify(struct notifier_block *self,
-			     unsigned long val, void *data);
 int kprobe_fault_handler(struct pt_regs *regs, int trapnr);
 asmlinkage void __kprobes kprobe_trap(unsigned long trap_level,
 				      struct pt_regs *regs);
diff --git a/arch/x86/include/asm/kprobes.h b/arch/x86/include/asm/kprobes.h
index a2e9317aad49..5939694dfb28 100644
--- a/arch/x86/include/asm/kprobes.h
+++ b/arch/x86/include/asm/kprobes.h
@@ -113,8 +113,6 @@ struct kprobe_ctlblk {
 };
 
 extern int kprobe_fault_handler(struct pt_regs *regs, int trapnr);
-extern int kprobe_exceptions_notify(struct notifier_block *self,
-				    unsigned long val, void *data);
 extern int kprobe_int3_handler(struct pt_regs *regs);
 
 #else
diff --git a/include/linux/kprobes.h b/include/linux/kprobes.h
index 365eb092e9c4..ab1da3142b06 100644
--- a/include/linux/kprobes.h
+++ b/include/linux/kprobes.h
@@ -445,6 +445,10 @@ int kprobe_get_kallsym(unsigned int symnum, unsigned long *value, char *type,
 
 int arch_kprobe_get_kallsym(unsigned int *symnum, unsigned long *value,
 			    char *type, char *sym);
+
+int kprobe_exceptions_notify(struct notifier_block *self,
+			     unsigned long val, void *data);
+
 #else /* !CONFIG_KPROBES: */
 
 static inline int kprobe_fault_handler(struct pt_regs *regs, int trapnr)
-- 
cgit v1.2.3


From bef4a48f4ef798c4feddf045d49e53c8a97d5e37 Mon Sep 17 00:00:00 2001
From: Mark Hasemeyer <markhas@chromium.org>
Date: Tue, 7 Nov 2023 14:47:43 -0700
Subject: spi: Fix null dereference on suspend

A race condition exists where a synchronous (noqueue) transfer can be
active during a system suspend. This can cause a null pointer
dereference exception to occur when the system resumes.

Example order of events leading to the exception:
1. spi_sync() calls __spi_transfer_message_noqueue() which sets
   ctlr->cur_msg
2. Spi transfer begins via spi_transfer_one_message()
3. System is suspended interrupting the transfer context
4. System is resumed
6. spi_controller_resume() calls spi_start_queue() which resets cur_msg
   to NULL
7. Spi transfer context resumes and spi_finalize_current_message() is
   called which dereferences cur_msg (which is now NULL)

Wait for synchronous transfers to complete before suspending by
acquiring the bus mutex and setting/checking a suspend flag.

Signed-off-by: Mark Hasemeyer <markhas@chromium.org>
Link: https://lore.kernel.org/r/20231107144743.v1.1.I7987f05f61901f567f7661763646cb7d7919b528@changeid
Signed-off-by: Mark Brown <broonie@kernel.org>
Cc: stable@kernel.org
---
 drivers/spi/spi.c       | 56 ++++++++++++++++++++++++++++++++++---------------
 include/linux/spi/spi.h |  1 +
 2 files changed, 40 insertions(+), 17 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/spi/spi.c b/drivers/spi/spi.c
index 791df0e69105..8ead7acb99f3 100644
--- a/drivers/spi/spi.c
+++ b/drivers/spi/spi.c
@@ -3317,33 +3317,52 @@ void spi_unregister_controller(struct spi_controller *ctlr)
 }
 EXPORT_SYMBOL_GPL(spi_unregister_controller);
 
+static inline int __spi_check_suspended(const struct spi_controller *ctlr)
+{
+	return ctlr->flags & SPI_CONTROLLER_SUSPENDED ? -ESHUTDOWN : 0;
+}
+
+static inline void __spi_mark_suspended(struct spi_controller *ctlr)
+{
+	mutex_lock(&ctlr->bus_lock_mutex);
+	ctlr->flags |= SPI_CONTROLLER_SUSPENDED;
+	mutex_unlock(&ctlr->bus_lock_mutex);
+}
+
+static inline void __spi_mark_resumed(struct spi_controller *ctlr)
+{
+	mutex_lock(&ctlr->bus_lock_mutex);
+	ctlr->flags &= ~SPI_CONTROLLER_SUSPENDED;
+	mutex_unlock(&ctlr->bus_lock_mutex);
+}
+
 int spi_controller_suspend(struct spi_controller *ctlr)
 {
-	int ret;
+	int ret = 0;
 
 	/* Basically no-ops for non-queued controllers */
-	if (!ctlr->queued)
-		return 0;
-
-	ret = spi_stop_queue(ctlr);
-	if (ret)
-		dev_err(&ctlr->dev, "queue stop failed\n");
+	if (ctlr->queued) {
+		ret = spi_stop_queue(ctlr);
+		if (ret)
+			dev_err(&ctlr->dev, "queue stop failed\n");
+	}
 
+	__spi_mark_suspended(ctlr);
 	return ret;
 }
 EXPORT_SYMBOL_GPL(spi_controller_suspend);
 
 int spi_controller_resume(struct spi_controller *ctlr)
 {
-	int ret;
-
-	if (!ctlr->queued)
-		return 0;
+	int ret = 0;
 
-	ret = spi_start_queue(ctlr);
-	if (ret)
-		dev_err(&ctlr->dev, "queue restart failed\n");
+	__spi_mark_resumed(ctlr);
 
+	if (ctlr->queued) {
+		ret = spi_start_queue(ctlr);
+		if (ret)
+			dev_err(&ctlr->dev, "queue restart failed\n");
+	}
 	return ret;
 }
 EXPORT_SYMBOL_GPL(spi_controller_resume);
@@ -4147,8 +4166,7 @@ static void __spi_transfer_message_noqueue(struct spi_controller *ctlr, struct s
 	ctlr->cur_msg = msg;
 	ret = __spi_pump_transfer_message(ctlr, msg, was_busy);
 	if (ret)
-		goto out;
-
+		dev_err(&ctlr->dev, "noqueue transfer failed\n");
 	ctlr->cur_msg = NULL;
 	ctlr->fallback = false;
 
@@ -4164,7 +4182,6 @@ static void __spi_transfer_message_noqueue(struct spi_controller *ctlr, struct s
 		spi_idle_runtime_pm(ctlr);
 	}
 
-out:
 	mutex_unlock(&ctlr->io_mutex);
 }
 
@@ -4187,6 +4204,11 @@ static int __spi_sync(struct spi_device *spi, struct spi_message *message)
 	int status;
 	struct spi_controller *ctlr = spi->controller;
 
+	if (__spi_check_suspended(ctlr)) {
+		dev_warn_once(&spi->dev, "Attempted to sync while suspend\n");
+		return -ESHUTDOWN;
+	}
+
 	status = __spi_validate(spi, message);
 	if (status != 0)
 		return status;
diff --git a/include/linux/spi/spi.h b/include/linux/spi/spi.h
index 86825c88b576..255a0562aea5 100644
--- a/include/linux/spi/spi.h
+++ b/include/linux/spi/spi.h
@@ -566,6 +566,7 @@ struct spi_controller {
 #define SPI_CONTROLLER_MUST_RX		BIT(3)	/* Requires rx */
 #define SPI_CONTROLLER_MUST_TX		BIT(4)	/* Requires tx */
 #define SPI_CONTROLLER_GPIO_SS		BIT(5)	/* GPIO CS must select slave */
+#define SPI_CONTROLLER_SUSPENDED	BIT(6)	/* Currently suspended */
 
 	/* Flag indicating if the allocation of this struct is devres-managed */
 	bool			devm_allocated;
-- 
cgit v1.2.3


From 689b097a06bafb461ec162fc3b3ecc9765cea67b Mon Sep 17 00:00:00 2001
From: Yafang Shao <laoar.shao@gmail.com>
Date: Mon, 6 Nov 2023 03:18:02 +0000
Subject: compiler-gcc: Suppress -Wmissing-prototypes warning for all supported
 GCC

The kernel supports a minimum GCC version of 5.1.0 for building. However,
the "__diag_ignore_all" directive only suppresses the
"-Wmissing-prototypes" warning for GCC versions >= 8.0.0. As a result, when
building the kernel with older GCC versions, warnings may be triggered. The
example below illustrates the warnings reported by the kernel test robot
using GCC 7.5.0:

  compiler: gcc-7 (Ubuntu 7.5.0-6ubuntu2) 7.5.0
  All warnings (new ones prefixed by >>):

   kernel/bpf/helpers.c:1893:19: warning: no previous prototype for 'bpf_obj_new_impl' [-Wmissing-prototypes]
    __bpf_kfunc void *bpf_obj_new_impl(u64 local_type_id__k, void *meta__ign)
                      ^~~~~~~~~~~~~~~~
   kernel/bpf/helpers.c:1907:19: warning: no previous prototype for 'bpf_percpu_obj_new_impl' [-Wmissing-prototypes]
    __bpf_kfunc void *bpf_percpu_obj_new_impl(u64 local_type_id__k, void *meta__ign)
   [...]

To address this, we should also suppress the "-Wmissing-prototypes" warning
for older GCC versions. "#pragma GCC diagnostic push" is supported as
of GCC 4.6, and both "-Wmissing-prototypes" and "-Wmissing-declarations"
are supported for all the GCC versions that we currently support.
Therefore, it is reasonable to suppress these warnings for all supported
GCC versions.

With this adjustment, it's important to note that after implementing
"__diag_ignore_all", it will effectively suppress warnings for all the
supported GCC versions.

In the future, if you wish to suppress warnings that are only supported on
higher GCC versions, it is advisable to explicitly use "__diag_ignore" to
specify the GCC version you are targeting.

Reported-by: kernel test robot <lkp@intel.com>
Closes: https://lore.kernel.org/oe-kbuild-all/202311031651.A7crZEur-lkp@intel.com/
Suggested-by: Arnd Bergmann <arnd@arndb.de>
Signed-off-by: Yafang Shao <laoar.shao@gmail.com>
Cc: Kumar Kartikeya Dwivedi <memxor@gmail.com>
Cc: Arnd Bergmann <arnd@arndb.de>
Acked-by: Arnd Bergmann <arnd@arndb.de>
Link: https://lore.kernel.org/r/20231106031802.4188-1-laoar.shao@gmail.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/compiler-gcc.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/compiler-gcc.h b/include/linux/compiler-gcc.h
index 2ceba3fe4ec1..aebb65bf95a7 100644
--- a/include/linux/compiler-gcc.h
+++ b/include/linux/compiler-gcc.h
@@ -136,7 +136,7 @@
 #endif
 
 #define __diag_ignore_all(option, comment) \
-	__diag_GCC(8, ignore, option)
+	__diag(__diag_GCC_ignore option)
 
 /*
  * Prior to 9.1, -Wno-alloc-size-larger-than (and therefore the "alloc_size"
-- 
cgit v1.2.3


From 5c0930ccaad5a74d74e8b18b648c5eb21ed2fe94 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Tue, 7 Nov 2023 15:57:13 +0100
Subject: hrtimers: Push pending hrtimers away from outgoing CPU earlier

2b8272ff4a70 ("cpu/hotplug: Prevent self deadlock on CPU hot-unplug")
solved the straight forward CPU hotplug deadlock vs. the scheduler
bandwidth timer. Yu discovered a more involved variant where a task which
has a bandwidth timer started on the outgoing CPU holds a lock and then
gets throttled. If the lock required by one of the CPU hotplug callbacks
the hotplug operation deadlocks because the unthrottling timer event is not
handled on the dying CPU and can only be recovered once the control CPU
reaches the hotplug state which pulls the pending hrtimers from the dead
CPU.

Solve this by pushing the hrtimers away from the dying CPU in the dying
callbacks. Nothing can queue a hrtimer on the dying CPU at that point because
all other CPUs spin in stop_machine() with interrupts disabled and once the
operation is finished the CPU is marked offline.

Reported-by: Yu Liao <liaoyu15@huawei.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Tested-by: Liu Tie <liutie4@huawei.com>
Link: https://lore.kernel.org/r/87a5rphara.ffs@tglx
---
 include/linux/cpuhotplug.h |  1 +
 include/linux/hrtimer.h    |  4 ++--
 kernel/cpu.c               |  8 +++++++-
 kernel/time/hrtimer.c      | 33 ++++++++++++---------------------
 4 files changed, 22 insertions(+), 24 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/cpuhotplug.h b/include/linux/cpuhotplug.h
index 068f7738be22..448f5f995adc 100644
--- a/include/linux/cpuhotplug.h
+++ b/include/linux/cpuhotplug.h
@@ -193,6 +193,7 @@ enum cpuhp_state {
 	CPUHP_AP_ARM_CORESIGHT_CTI_STARTING,
 	CPUHP_AP_ARM64_ISNDEP_STARTING,
 	CPUHP_AP_SMPCFD_DYING,
+	CPUHP_AP_HRTIMERS_DYING,
 	CPUHP_AP_X86_TBOOT_DYING,
 	CPUHP_AP_ARM_CACHE_B15_RAC_DYING,
 	CPUHP_AP_ONLINE,
diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h
index 0ee140176f10..f2044d5a652b 100644
--- a/include/linux/hrtimer.h
+++ b/include/linux/hrtimer.h
@@ -531,9 +531,9 @@ extern void sysrq_timer_list_show(void);
 
 int hrtimers_prepare_cpu(unsigned int cpu);
 #ifdef CONFIG_HOTPLUG_CPU
-int hrtimers_dead_cpu(unsigned int cpu);
+int hrtimers_cpu_dying(unsigned int cpu);
 #else
-#define hrtimers_dead_cpu	NULL
+#define hrtimers_cpu_dying	NULL
 #endif
 
 #endif
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 6de7c6bb74ee..2e69a1deaa31 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -2098,7 +2098,7 @@ static struct cpuhp_step cpuhp_hp_states[] = {
 	[CPUHP_HRTIMERS_PREPARE] = {
 		.name			= "hrtimers:prepare",
 		.startup.single		= hrtimers_prepare_cpu,
-		.teardown.single	= hrtimers_dead_cpu,
+		.teardown.single	= NULL,
 	},
 	[CPUHP_SMPCFD_PREPARE] = {
 		.name			= "smpcfd:prepare",
@@ -2190,6 +2190,12 @@ static struct cpuhp_step cpuhp_hp_states[] = {
 		.startup.single		= NULL,
 		.teardown.single	= smpcfd_dying_cpu,
 	},
+	[CPUHP_AP_HRTIMERS_DYING] = {
+		.name			= "hrtimers:dying",
+		.startup.single		= NULL,
+		.teardown.single	= hrtimers_cpu_dying,
+	},
+
 	/* Entry state on starting. Interrupts enabled from here on. Transient
 	 * state for synchronsization */
 	[CPUHP_AP_ONLINE] = {
diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c
index 238262e4aba7..760793998cdd 100644
--- a/kernel/time/hrtimer.c
+++ b/kernel/time/hrtimer.c
@@ -2219,29 +2219,22 @@ static void migrate_hrtimer_list(struct hrtimer_clock_base *old_base,
 	}
 }
 
-int hrtimers_dead_cpu(unsigned int scpu)
+int hrtimers_cpu_dying(unsigned int dying_cpu)
 {
 	struct hrtimer_cpu_base *old_base, *new_base;
-	int i;
+	int i, ncpu = cpumask_first(cpu_active_mask);
 
-	BUG_ON(cpu_online(scpu));
-	tick_cancel_sched_timer(scpu);
+	tick_cancel_sched_timer(dying_cpu);
+
+	old_base = this_cpu_ptr(&hrtimer_bases);
+	new_base = &per_cpu(hrtimer_bases, ncpu);
 
-	/*
-	 * this BH disable ensures that raise_softirq_irqoff() does
-	 * not wakeup ksoftirqd (and acquire the pi-lock) while
-	 * holding the cpu_base lock
-	 */
-	local_bh_disable();
-	local_irq_disable();
-	old_base = &per_cpu(hrtimer_bases, scpu);
-	new_base = this_cpu_ptr(&hrtimer_bases);
 	/*
 	 * The caller is globally serialized and nobody else
 	 * takes two locks at once, deadlock is not possible.
 	 */
-	raw_spin_lock(&new_base->lock);
-	raw_spin_lock_nested(&old_base->lock, SINGLE_DEPTH_NESTING);
+	raw_spin_lock(&old_base->lock);
+	raw_spin_lock_nested(&new_base->lock, SINGLE_DEPTH_NESTING);
 
 	for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) {
 		migrate_hrtimer_list(&old_base->clock_base[i],
@@ -2252,15 +2245,13 @@ int hrtimers_dead_cpu(unsigned int scpu)
 	 * The migration might have changed the first expiring softirq
 	 * timer on this CPU. Update it.
 	 */
-	hrtimer_update_softirq_timer(new_base, false);
+	__hrtimer_get_next_event(new_base, HRTIMER_ACTIVE_SOFT);
+	/* Tell the other CPU to retrigger the next event */
+	smp_call_function_single(ncpu, retrigger_next_event, NULL, 0);
 
-	raw_spin_unlock(&old_base->lock);
 	raw_spin_unlock(&new_base->lock);
+	raw_spin_unlock(&old_base->lock);
 
-	/* Check, if we got expired work to do */
-	__hrtimer_peek_ahead_timers();
-	local_irq_enable();
-	local_bh_enable();
 	return 0;
 }
 
-- 
cgit v1.2.3


From fe28f631fa941fba583d1c4f25895284b90af671 Mon Sep 17 00:00:00 2001
From: Waiman Long <longman@redhat.com>
Date: Wed, 25 Oct 2023 14:25:52 -0400
Subject: workqueue: Add workqueue_unbound_exclude_cpumask() to exclude CPUs
 from wq_unbound_cpumask

When the "isolcpus" boot command line option is used to add a set
of isolated CPUs, those CPUs will be excluded automatically from
wq_unbound_cpumask to avoid running work functions from unbound
workqueues.

Recently cpuset has been extended to allow the creation of partitions
of isolated CPUs dynamically. To make it closer to the "isolcpus"
in functionality, the CPUs in those isolated cpuset partitions should be
excluded from wq_unbound_cpumask as well. This can be done currently by
explicitly writing to the workqueue's cpumask sysfs file after creating
the isolated partitions. However, this process can be error prone.

Ideally, the cpuset code should be allowed to request the workqueue code
to exclude those isolated CPUs from wq_unbound_cpumask so that this
operation can be done automatically and the isolated CPUs will be returned
back to wq_unbound_cpumask after the destructions of the isolated
cpuset partitions.

This patch adds a new workqueue_unbound_exclude_cpumask() function to
enable that. This new function will exclude the specified isolated
CPUs from wq_unbound_cpumask. To be able to restore those isolated
CPUs back after the destruction of isolated cpuset partitions, a new
wq_requested_unbound_cpumask is added to store the user provided unbound
cpumask either from the boot command line options or from writing to
the cpumask sysfs file. This new cpumask provides the basis for CPU
exclusion.

To enable users to understand how the wq_unbound_cpumask is being
modified internally, this patch also exposes the newly introduced
wq_requested_unbound_cpumask as well as a wq_isolated_cpumask to
store the cpumask to be excluded from wq_unbound_cpumask as read-only
sysfs files.

Signed-off-by: Waiman Long <longman@redhat.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
---
 include/linux/workqueue.h |  2 +-
 kernel/workqueue.c        | 91 ++++++++++++++++++++++++++++++++++++++++++-----
 2 files changed, 84 insertions(+), 9 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h
index 24b1e5070f4d..b0b9604b76b8 100644
--- a/include/linux/workqueue.h
+++ b/include/linux/workqueue.h
@@ -491,7 +491,7 @@ struct workqueue_attrs *alloc_workqueue_attrs(void);
 void free_workqueue_attrs(struct workqueue_attrs *attrs);
 int apply_workqueue_attrs(struct workqueue_struct *wq,
 			  const struct workqueue_attrs *attrs);
-int workqueue_set_unbound_cpumask(cpumask_var_t cpumask);
+extern int workqueue_unbound_exclude_cpumask(cpumask_var_t cpumask);
 
 extern bool queue_work_on(int cpu, struct workqueue_struct *wq,
 			struct work_struct *work);
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 6e578f576a6f..bd9d34eacd78 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -381,6 +381,12 @@ static bool workqueue_freezing;		/* PL: have wqs started freezing? */
 /* PL&A: allowable cpus for unbound wqs and work items */
 static cpumask_var_t wq_unbound_cpumask;
 
+/* PL: user requested unbound cpumask via sysfs */
+static cpumask_var_t wq_requested_unbound_cpumask;
+
+/* PL: isolated cpumask to be excluded from unbound cpumask */
+static cpumask_var_t wq_isolated_cpumask;
+
 /* for further constrain wq_unbound_cpumask by cmdline parameter*/
 static struct cpumask wq_cmdline_cpumask __initdata;
 
@@ -5839,7 +5845,7 @@ static int workqueue_apply_unbound_cpumask(const cpumask_var_t unbound_cpumask)
  *  		-EINVAL	- Invalid @cpumask
  *  		-ENOMEM	- Failed to allocate memory for attrs or pwqs.
  */
-int workqueue_set_unbound_cpumask(cpumask_var_t cpumask)
+static int workqueue_set_unbound_cpumask(cpumask_var_t cpumask)
 {
 	int ret = -EINVAL;
 
@@ -5850,6 +5856,7 @@ int workqueue_set_unbound_cpumask(cpumask_var_t cpumask)
 	cpumask_and(cpumask, cpumask, cpu_possible_mask);
 	if (!cpumask_empty(cpumask)) {
 		apply_wqattrs_lock();
+		cpumask_copy(wq_requested_unbound_cpumask, cpumask);
 		if (cpumask_equal(cpumask, wq_unbound_cpumask)) {
 			ret = 0;
 			goto out_unlock;
@@ -5864,6 +5871,44 @@ out_unlock:
 	return ret;
 }
 
+/**
+ * workqueue_unbound_exclude_cpumask - Exclude given CPUs from unbound cpumask
+ * @exclude_cpumask: the cpumask to be excluded from wq_unbound_cpumask
+ *
+ * This function can be called from cpuset code to provide a set of isolated
+ * CPUs that should be excluded from wq_unbound_cpumask. The caller must hold
+ * either cpus_read_lock or cpus_write_lock.
+ */
+int workqueue_unbound_exclude_cpumask(cpumask_var_t exclude_cpumask)
+{
+	cpumask_var_t cpumask;
+	int ret = 0;
+
+	if (!zalloc_cpumask_var(&cpumask, GFP_KERNEL))
+		return -ENOMEM;
+
+	lockdep_assert_cpus_held();
+	mutex_lock(&wq_pool_mutex);
+
+	/* Save the current isolated cpumask & export it via sysfs */
+	cpumask_copy(wq_isolated_cpumask, exclude_cpumask);
+
+	/*
+	 * If the operation fails, it will fall back to
+	 * wq_requested_unbound_cpumask which is initially set to
+	 * (HK_TYPE_WQ ∩ HK_TYPE_DOMAIN) house keeping mask and rewritten
+	 * by any subsequent write to workqueue/cpumask sysfs file.
+	 */
+	if (!cpumask_andnot(cpumask, wq_requested_unbound_cpumask, exclude_cpumask))
+		cpumask_copy(cpumask, wq_requested_unbound_cpumask);
+	if (!cpumask_equal(cpumask, wq_unbound_cpumask))
+		ret = workqueue_apply_unbound_cpumask(cpumask);
+
+	mutex_unlock(&wq_pool_mutex);
+	free_cpumask_var(cpumask);
+	return ret;
+}
+
 static int parse_affn_scope(const char *val)
 {
 	int i;
@@ -6158,19 +6203,36 @@ static struct bus_type wq_subsys = {
 	.dev_groups			= wq_sysfs_groups,
 };
 
-static ssize_t wq_unbound_cpumask_show(struct device *dev,
-		struct device_attribute *attr, char *buf)
+static ssize_t __wq_cpumask_show(struct device *dev,
+		struct device_attribute *attr, char *buf, cpumask_var_t mask)
 {
 	int written;
 
 	mutex_lock(&wq_pool_mutex);
-	written = scnprintf(buf, PAGE_SIZE, "%*pb\n",
-			    cpumask_pr_args(wq_unbound_cpumask));
+	written = scnprintf(buf, PAGE_SIZE, "%*pb\n", cpumask_pr_args(mask));
 	mutex_unlock(&wq_pool_mutex);
 
 	return written;
 }
 
+static ssize_t wq_unbound_cpumask_show(struct device *dev,
+		struct device_attribute *attr, char *buf)
+{
+	return __wq_cpumask_show(dev, attr, buf, wq_unbound_cpumask);
+}
+
+static ssize_t wq_requested_cpumask_show(struct device *dev,
+		struct device_attribute *attr, char *buf)
+{
+	return __wq_cpumask_show(dev, attr, buf, wq_requested_unbound_cpumask);
+}
+
+static ssize_t wq_isolated_cpumask_show(struct device *dev,
+		struct device_attribute *attr, char *buf)
+{
+	return __wq_cpumask_show(dev, attr, buf, wq_isolated_cpumask);
+}
+
 static ssize_t wq_unbound_cpumask_store(struct device *dev,
 		struct device_attribute *attr, const char *buf, size_t count)
 {
@@ -6188,9 +6250,13 @@ static ssize_t wq_unbound_cpumask_store(struct device *dev,
 	return ret ? ret : count;
 }
 
-static struct device_attribute wq_sysfs_cpumask_attr =
+static struct device_attribute wq_sysfs_cpumask_attrs[] = {
 	__ATTR(cpumask, 0644, wq_unbound_cpumask_show,
-	       wq_unbound_cpumask_store);
+	       wq_unbound_cpumask_store),
+	__ATTR(cpumask_requested, 0444, wq_requested_cpumask_show, NULL),
+	__ATTR(cpumask_isolated, 0444, wq_isolated_cpumask_show, NULL),
+	__ATTR_NULL,
+};
 
 static int __init wq_sysfs_init(void)
 {
@@ -6203,7 +6269,13 @@ static int __init wq_sysfs_init(void)
 
 	dev_root = bus_get_dev_root(&wq_subsys);
 	if (dev_root) {
-		err = device_create_file(dev_root, &wq_sysfs_cpumask_attr);
+		struct device_attribute *attr;
+
+		for (attr = wq_sysfs_cpumask_attrs; attr->attr.name; attr++) {
+			err = device_create_file(dev_root, attr);
+			if (err)
+				break;
+		}
 		put_device(dev_root);
 	}
 	return err;
@@ -6534,11 +6606,14 @@ void __init workqueue_init_early(void)
 	BUILD_BUG_ON(__alignof__(struct pool_workqueue) < __alignof__(long long));
 
 	BUG_ON(!alloc_cpumask_var(&wq_unbound_cpumask, GFP_KERNEL));
+	BUG_ON(!alloc_cpumask_var(&wq_requested_unbound_cpumask, GFP_KERNEL));
+	BUG_ON(!zalloc_cpumask_var(&wq_isolated_cpumask, GFP_KERNEL));
 	cpumask_copy(wq_unbound_cpumask, housekeeping_cpumask(HK_TYPE_WQ));
 	cpumask_and(wq_unbound_cpumask, wq_unbound_cpumask, housekeeping_cpumask(HK_TYPE_DOMAIN));
 
 	if (!cpumask_empty(&wq_cmdline_cpumask))
 		cpumask_and(wq_unbound_cpumask, wq_unbound_cpumask, &wq_cmdline_cpumask);
+	cpumask_copy(wq_requested_unbound_cpumask, wq_unbound_cpumask);
 
 	pwq_cache = KMEM_CACHE(pool_workqueue, SLAB_PANIC);
 
-- 
cgit v1.2.3


From e76d28bdf9ba5388b8c4835a5199dc427b603188 Mon Sep 17 00:00:00 2001
From: Waiman Long <longman@redhat.com>
Date: Fri, 3 Nov 2023 23:13:01 -0400
Subject: cgroup/rstat: Reduce cpu_lock hold time in
 cgroup_rstat_flush_locked()

When cgroup_rstat_updated() isn't being called concurrently with
cgroup_rstat_flush_locked(), its run time is pretty short. When
both are called concurrently, the cgroup_rstat_updated() run time
can spike to a pretty high value due to high cpu_lock hold time in
cgroup_rstat_flush_locked(). This can be problematic if the task calling
cgroup_rstat_updated() is a realtime task running on an isolated CPU
with a strict latency requirement. The cgroup_rstat_updated() call can
happen when there is a page fault even though the task is running in
user space most of the time.

The percpu cpu_lock is used to protect the update tree -
updated_next and updated_children. This protection is only needed when
cgroup_rstat_cpu_pop_updated() is being called. The subsequent flushing
operation which can take a much longer time does not need that protection
as it is already protected by cgroup_rstat_lock.

To reduce the cpu_lock hold time, we need to perform all the
cgroup_rstat_cpu_pop_updated() calls up front with the lock
released afterward before doing any flushing. This patch adds a new
cgroup_rstat_updated_list() function to return a singly linked list of
cgroups to be flushed.

Some instrumentation code are added to measure the cpu_lock hold time
right after lock acquisition to after releasing the lock. Parallel
kernel build on a 2-socket x86-64 server is used as the benchmarking
tool for measuring the lock hold time.

The maximum cpu_lock hold time before and after the patch are 100us and
29us respectively. So the worst case time is reduced to about 30% of
the original. However, there may be some OS or hardware noises like NMI
or SMI in the test system that can worsen the worst case value. Those
noises are usually tuned out in a real production environment to get
a better result.

OTOH, the lock hold time frequency distribution should give a better
idea of the performance benefit of the patch.  Below were the frequency
distribution before and after the patch:

     Hold time        Before patch       After patch
     ---------        ------------       -----------
       0-01 us           804,139         13,738,708
      01-05 us         9,772,767          1,177,194
      05-10 us         4,595,028              4,984
      10-15 us           303,481              3,562
      15-20 us            78,971              1,314
      20-25 us            24,583                 18
      25-30 us             6,908                 12
      30-40 us             8,015
      40-50 us             2,192
      50-60 us               316
      60-70 us                43
      70-80 us                 7
      80-90 us                 2
        >90 us                 3

Signed-off-by: Waiman Long <longman@redhat.com>
Reviewed-by: Yosry Ahmed <yosryahmed@google.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
---
 include/linux/cgroup-defs.h |  7 +++++++
 kernel/cgroup/rstat.c       | 43 ++++++++++++++++++++++++++++---------------
 2 files changed, 35 insertions(+), 15 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h
index 4caab0c6b361..37518436cfe7 100644
--- a/include/linux/cgroup-defs.h
+++ b/include/linux/cgroup-defs.h
@@ -496,6 +496,13 @@ struct cgroup {
 	struct cgroup_rstat_cpu __percpu *rstat_cpu;
 	struct list_head rstat_css_list;
 
+	/*
+	 * A singly-linked list of cgroup structures to be rstat flushed.
+	 * This is a scratch field to be used exclusively by
+	 * cgroup_rstat_flush_locked() and protected by cgroup_rstat_lock.
+	 */
+	struct cgroup	*rstat_flush_next;
+
 	/* cgroup basic resource statistics */
 	struct cgroup_base_stat last_bstat;
 	struct cgroup_base_stat bstat;
diff --git a/kernel/cgroup/rstat.c b/kernel/cgroup/rstat.c
index d80d7a608141..1f300bf4dc40 100644
--- a/kernel/cgroup/rstat.c
+++ b/kernel/cgroup/rstat.c
@@ -145,6 +145,32 @@ static struct cgroup *cgroup_rstat_cpu_pop_updated(struct cgroup *pos,
 	return pos;
 }
 
+/* Return a list of updated cgroups to be flushed */
+static struct cgroup *cgroup_rstat_updated_list(struct cgroup *root, int cpu)
+{
+	raw_spinlock_t *cpu_lock = per_cpu_ptr(&cgroup_rstat_cpu_lock, cpu);
+	struct cgroup *head, *tail, *next;
+	unsigned long flags;
+
+	/*
+	 * The _irqsave() is needed because cgroup_rstat_lock is
+	 * spinlock_t which is a sleeping lock on PREEMPT_RT. Acquiring
+	 * this lock with the _irq() suffix only disables interrupts on
+	 * a non-PREEMPT_RT kernel. The raw_spinlock_t below disables
+	 * interrupts on both configurations. The _irqsave() ensures
+	 * that interrupts are always disabled and later restored.
+	 */
+	raw_spin_lock_irqsave(cpu_lock, flags);
+	head = tail = cgroup_rstat_cpu_pop_updated(NULL, root, cpu);
+	while (tail) {
+		next = cgroup_rstat_cpu_pop_updated(tail, root, cpu);
+		tail->rstat_flush_next = next;
+		tail = next;
+	}
+	raw_spin_unlock_irqrestore(cpu_lock, flags);
+	return head;
+}
+
 /*
  * A hook for bpf stat collectors to attach to and flush their stats.
  * Together with providing bpf kfuncs for cgroup_rstat_updated() and
@@ -179,21 +205,9 @@ static void cgroup_rstat_flush_locked(struct cgroup *cgrp)
 	lockdep_assert_held(&cgroup_rstat_lock);
 
 	for_each_possible_cpu(cpu) {
-		raw_spinlock_t *cpu_lock = per_cpu_ptr(&cgroup_rstat_cpu_lock,
-						       cpu);
-		struct cgroup *pos = NULL;
-		unsigned long flags;
+		struct cgroup *pos = cgroup_rstat_updated_list(cgrp, cpu);
 
-		/*
-		 * The _irqsave() is needed because cgroup_rstat_lock is
-		 * spinlock_t which is a sleeping lock on PREEMPT_RT. Acquiring
-		 * this lock with the _irq() suffix only disables interrupts on
-		 * a non-PREEMPT_RT kernel. The raw_spinlock_t below disables
-		 * interrupts on both configurations. The _irqsave() ensures
-		 * that interrupts are always disabled and later restored.
-		 */
-		raw_spin_lock_irqsave(cpu_lock, flags);
-		while ((pos = cgroup_rstat_cpu_pop_updated(pos, cgrp, cpu))) {
+		for (; pos; pos = pos->rstat_flush_next) {
 			struct cgroup_subsys_state *css;
 
 			cgroup_base_stat_flush(pos, cpu);
@@ -205,7 +219,6 @@ static void cgroup_rstat_flush_locked(struct cgroup *cgrp)
 				css->ss->css_rstat_flush(css, cpu);
 			rcu_read_unlock();
 		}
-		raw_spin_unlock_irqrestore(cpu_lock, flags);
 
 		/* play nice and yield if necessary */
 		if (need_resched() || spin_needbreak(&cgroup_rstat_lock)) {
-- 
cgit v1.2.3


From 8156c7dd47b92fc4a70c9ea58e7a9e88c8bc32be Mon Sep 17 00:00:00 2001
From: Oleksij Rempel <o.rempel@pengutronix.de>
Date: Thu, 26 Oct 2023 16:48:21 +0200
Subject: regulator: Introduce handling for system-critical under-voltage
 events

Handle under-voltage events for crucial regulators to maintain system
stability and avoid issues during power drops.

Signed-off-by: Oleksij Rempel <o.rempel@pengutronix.de>
Link: https://lore.kernel.org/r/20231026144824.4065145-3-o.rempel@pengutronix.de
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 drivers/regulator/core.c          | 38 ++++++++++++++++++++++++++++++++++++++
 drivers/regulator/of_regulator.c  |  2 ++
 include/linux/regulator/machine.h | 10 ++++++++++
 3 files changed, 50 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/regulator/core.c b/drivers/regulator/core.c
index 3137e40fcd3e..a072f721f288 100644
--- a/drivers/regulator/core.c
+++ b/drivers/regulator/core.c
@@ -19,6 +19,7 @@
 #include <linux/delay.h>
 #include <linux/gpio/consumer.h>
 #include <linux/of.h>
+#include <linux/reboot.h>
 #include <linux/regmap.h>
 #include <linux/regulator/of_regulator.h>
 #include <linux/regulator/consumer.h>
@@ -5061,6 +5062,41 @@ void regulator_bulk_free(int num_consumers,
 }
 EXPORT_SYMBOL_GPL(regulator_bulk_free);
 
+/**
+ * regulator_handle_critical - Handle events for system-critical regulators.
+ * @rdev: The regulator device.
+ * @event: The event being handled.
+ *
+ * This function handles critical events such as under-voltage, over-current,
+ * and unknown errors for regulators deemed system-critical. On detecting such
+ * events, it triggers a hardware protection shutdown with a defined timeout.
+ */
+static void regulator_handle_critical(struct regulator_dev *rdev,
+				      unsigned long event)
+{
+	const char *reason = NULL;
+
+	if (!rdev->constraints->system_critical)
+		return;
+
+	switch (event) {
+	case REGULATOR_EVENT_UNDER_VOLTAGE:
+		reason = "System critical regulator: voltage drop detected";
+		break;
+	case REGULATOR_EVENT_OVER_CURRENT:
+		reason = "System critical regulator: over-current detected";
+		break;
+	case REGULATOR_EVENT_FAIL:
+		reason = "System critical regulator: unknown error";
+	}
+
+	if (!reason)
+		return;
+
+	hw_protection_shutdown(reason,
+			       REGULATOR_DEF_UV_LESS_CRITICAL_WINDOW_MS);
+}
+
 /**
  * regulator_notifier_call_chain - call regulator event notifier
  * @rdev: regulator source
@@ -5073,6 +5109,8 @@ EXPORT_SYMBOL_GPL(regulator_bulk_free);
 int regulator_notifier_call_chain(struct regulator_dev *rdev,
 				  unsigned long event, void *data)
 {
+	regulator_handle_critical(rdev, event);
+
 	_notifier_call_chain(rdev, event, data);
 	return NOTIFY_DONE;
 
diff --git a/drivers/regulator/of_regulator.c b/drivers/regulator/of_regulator.c
index 1b65e5e4e40f..3bdd6f1919a4 100644
--- a/drivers/regulator/of_regulator.c
+++ b/drivers/regulator/of_regulator.c
@@ -131,6 +131,8 @@ static int of_get_regulation_constraints(struct device *dev,
 		constraints->valid_ops_mask |= REGULATOR_CHANGE_STATUS;
 
 	constraints->pull_down = of_property_read_bool(np, "regulator-pull-down");
+	constraints->system_critical = of_property_read_bool(np,
+						"system-critical-regulator");
 
 	if (of_property_read_bool(np, "regulator-allow-bypass"))
 		constraints->valid_ops_mask |= REGULATOR_CHANGE_BYPASS;
diff --git a/include/linux/regulator/machine.h b/include/linux/regulator/machine.h
index 621b7f4a3639..e0ddfb5593c9 100644
--- a/include/linux/regulator/machine.h
+++ b/include/linux/regulator/machine.h
@@ -49,6 +49,13 @@ struct regulator;
 #define DISABLE_IN_SUSPEND	1
 #define ENABLE_IN_SUSPEND	2
 
+/*
+ * Default time window (in milliseconds) following a critical under-voltage
+ * event during which less critical actions can be safely carried out by the
+ * system.
+ */
+#define REGULATOR_DEF_UV_LESS_CRITICAL_WINDOW_MS	10
+
 /* Regulator active discharge flags */
 enum regulator_active_discharge {
 	REGULATOR_ACTIVE_DISCHARGE_DEFAULT,
@@ -127,6 +134,8 @@ struct notification_limit {
  * @ramp_disable: Disable ramp delay when initialising or when setting voltage.
  * @soft_start: Enable soft start so that voltage ramps slowly.
  * @pull_down: Enable pull down when regulator is disabled.
+ * @system_critical: Set if the regulator is critical to system stability or
+ *                   functionality.
  * @over_current_protection: Auto disable on over current event.
  *
  * @over_current_detection: Configure over current limits.
@@ -214,6 +223,7 @@ struct regulation_constraints {
 	unsigned ramp_disable:1; /* disable ramp delay */
 	unsigned soft_start:1;	/* ramp voltage slowly */
 	unsigned pull_down:1;	/* pull down resistor when regulator off */
+	unsigned system_critical:1;	/* critical to system stability */
 	unsigned over_current_protection:1; /* auto disable on over current */
 	unsigned over_current_detection:1; /* notify on over current */
 	unsigned over_voltage_detection:1; /* notify on over voltage */
-- 
cgit v1.2.3


From 1e22152aa59d793743fc53051dd7a042f362aecb Mon Sep 17 00:00:00 2001
From: Oleksij Rempel <o.rempel@pengutronix.de>
Date: Thu, 26 Oct 2023 16:48:24 +0200
Subject: regulator: Implement uv_survival_time for handling under-voltage
 events

Add 'uv_survival_time' field to regulation_constraints for specifying
survival time post critical under-voltage event. Update the regulator
notifier call chain and Device Tree property parsing to use this new
field, allowing a configurable timeout before emergency shutdown.

Signed-off-by: Oleksij Rempel <o.rempel@pengutronix.de>
Link: https://lore.kernel.org/r/20231026144824.4065145-6-o.rempel@pengutronix.de
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 drivers/regulator/core.c          | 2 +-
 drivers/regulator/of_regulator.c  | 7 +++++++
 include/linux/regulator/machine.h | 8 ++++++++
 3 files changed, 16 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/drivers/regulator/core.c b/drivers/regulator/core.c
index a072f721f288..a6cb84af989e 100644
--- a/drivers/regulator/core.c
+++ b/drivers/regulator/core.c
@@ -5094,7 +5094,7 @@ static void regulator_handle_critical(struct regulator_dev *rdev,
 		return;
 
 	hw_protection_shutdown(reason,
-			       REGULATOR_DEF_UV_LESS_CRITICAL_WINDOW_MS);
+			       rdev->constraints->uv_less_critical_window_ms);
 }
 
 /**
diff --git a/drivers/regulator/of_regulator.c b/drivers/regulator/of_regulator.c
index 3bdd6f1919a4..03afc160fc72 100644
--- a/drivers/regulator/of_regulator.c
+++ b/drivers/regulator/of_regulator.c
@@ -175,6 +175,13 @@ static int of_get_regulation_constraints(struct device *dev,
 	if (!ret)
 		constraints->enable_time = pval;
 
+	ret = of_property_read_u32(np, "regulator-uv-survival-time-ms", &pval);
+	if (!ret)
+		constraints->uv_less_critical_window_ms = pval;
+	else
+		constraints->uv_less_critical_window_ms =
+				REGULATOR_DEF_UV_LESS_CRITICAL_WINDOW_MS;
+
 	constraints->soft_start = of_property_read_bool(np,
 					"regulator-soft-start");
 	ret = of_property_read_u32(np, "regulator-active-discharge", &pval);
diff --git a/include/linux/regulator/machine.h b/include/linux/regulator/machine.h
index e0ddfb5593c9..0cd76d264727 100644
--- a/include/linux/regulator/machine.h
+++ b/include/linux/regulator/machine.h
@@ -162,6 +162,13 @@ struct notification_limit {
  *		      regulator_active_discharge values are used for
  *		      initialisation.
  * @enable_time: Turn-on time of the rails (unit: microseconds)
+ * @uv_less_critical_window_ms: Specifies the time window (in milliseconds)
+ *                              following a critical under-voltage (UV) event
+ *                              during which less critical actions can be
+ *                              safely carried out by the system (for example
+ *                              logging). After this time window more critical
+ *                              actions should be done (for example prevent
+ *                              HW damage).
  */
 struct regulation_constraints {
 
@@ -213,6 +220,7 @@ struct regulation_constraints {
 	unsigned int settling_time_up;
 	unsigned int settling_time_down;
 	unsigned int enable_time;
+	unsigned int uv_less_critical_window_ms;
 
 	unsigned int active_discharge;
 
-- 
cgit v1.2.3


From f3b8788cde61b02f1e6c202f8fac4360e6adbafc Mon Sep 17 00:00:00 2001
From: Casey Schaufler <casey@schaufler-ca.com>
Date: Tue, 12 Sep 2023 13:56:46 -0700
Subject: LSM: Identify modules by more than name

Create a struct lsm_id to contain identifying information about Linux
Security Modules (LSMs). At inception this contains the name of the
module and an identifier associated with the security module.  Change
the security_add_hooks() interface to use this structure.  Change the
individual modules to maintain their own struct lsm_id and pass it to
security_add_hooks().

The values are for LSM identifiers are defined in a new UAPI
header file linux/lsm.h. Each existing LSM has been updated to
include it's LSMID in the lsm_id.

The LSM ID values are sequential, with the oldest module
LSM_ID_CAPABILITY being the lowest value and the existing modules
numbered in the order they were included in the main line kernel.
This is an arbitrary convention for assigning the values, but
none better presents itself. The value 0 is defined as being invalid.
The values 1-99 are reserved for any special case uses which may
arise in the future. This may include attributes of the LSM
infrastructure itself, possibly related to namespacing or network
attribute management. A special range is identified for such attributes
to help reduce confusion for developers unfamiliar with LSMs.

LSM attribute values are defined for the attributes presented by
modules that are available today. As with the LSM IDs, The value 0
is defined as being invalid. The values 1-99 are reserved for any
special case uses which may arise in the future.

Cc: linux-security-module <linux-security-module@vger.kernel.org>
Signed-off-by: Casey Schaufler <casey@schaufler-ca.com>
Reviewed-by: Kees Cook <keescook@chromium.org>
Reviewed-by: Serge Hallyn <serge@hallyn.com>
Reviewed-by: Mickael Salaun <mic@digikod.net>
Reviewed-by: John Johansen <john.johansen@canonical.com>
Signed-off-by: Kees Cook <keescook@chromium.org>
Nacked-by: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
[PM: forward ported beyond v6.6 due merge window changes]
Signed-off-by: Paul Moore <paul@paul-moore.com>
---
 Documentation/userspace-api/index.rst |  1 +
 MAINTAINERS                           |  1 +
 include/linux/lsm_hooks.h             | 16 +++++++++--
 include/uapi/linux/lsm.h              | 54 +++++++++++++++++++++++++++++++++++
 security/apparmor/lsm.c               |  8 +++++-
 security/bpf/hooks.c                  |  9 +++++-
 security/commoncap.c                  |  8 +++++-
 security/landlock/cred.c              |  2 +-
 security/landlock/fs.c                |  2 +-
 security/landlock/net.c               |  2 +-
 security/landlock/ptrace.c            |  2 +-
 security/landlock/setup.c             |  6 ++++
 security/landlock/setup.h             |  1 +
 security/loadpin/loadpin.c            |  9 +++++-
 security/lockdown/lockdown.c          |  8 +++++-
 security/safesetid/lsm.c              |  9 +++++-
 security/security.c                   | 12 ++++----
 security/selinux/hooks.c              |  9 +++++-
 security/smack/smack_lsm.c            |  8 +++++-
 security/tomoyo/tomoyo.c              |  9 +++++-
 security/yama/yama_lsm.c              |  8 +++++-
 21 files changed, 162 insertions(+), 22 deletions(-)
 create mode 100644 include/uapi/linux/lsm.h

(limited to 'include/linux')

diff --git a/Documentation/userspace-api/index.rst b/Documentation/userspace-api/index.rst
index 031df47a7c19..8be8b1979194 100644
--- a/Documentation/userspace-api/index.rst
+++ b/Documentation/userspace-api/index.rst
@@ -33,6 +33,7 @@ place where this information is gathered.
    sysfs-platform_profile
    vduse
    futex2
+   lsm
 
 .. only::  subproject and html
 
diff --git a/MAINTAINERS b/MAINTAINERS
index 97f51d5ec1cf..f1d41fd9159a 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -19511,6 +19511,7 @@ L:	linux-security-module@vger.kernel.org (suggested Cc:)
 S:	Supported
 W:	http://kernsec.org/
 T:	git git://git.kernel.org/pub/scm/linux/kernel/git/pcmoore/lsm.git
+F:	include/uapi/linux/lsm.h
 F:	security/
 X:	security/selinux/
 
diff --git a/include/linux/lsm_hooks.h b/include/linux/lsm_hooks.h
index dcb5e5b5eb13..7f0adb33caaa 100644
--- a/include/linux/lsm_hooks.h
+++ b/include/linux/lsm_hooks.h
@@ -42,6 +42,18 @@ struct security_hook_heads {
 	#undef LSM_HOOK
 } __randomize_layout;
 
+/**
+ * struct lsm_id - Identify a Linux Security Module.
+ * @lsm: name of the LSM, must be approved by the LSM maintainers
+ * @id: LSM ID number from uapi/linux/lsm.h
+ *
+ * Contains the information that identifies the LSM.
+ */
+struct lsm_id {
+	const char	*name;
+	u64		id;
+};
+
 /*
  * Security module hook list structure.
  * For use with generic list macros for common operations.
@@ -50,7 +62,7 @@ struct security_hook_list {
 	struct hlist_node		list;
 	struct hlist_head		*head;
 	union security_list_options	hook;
-	const char			*lsm;
+	const struct lsm_id		*lsmid;
 } __randomize_layout;
 
 /*
@@ -104,7 +116,7 @@ extern struct security_hook_heads security_hook_heads;
 extern char *lsm_names;
 
 extern void security_add_hooks(struct security_hook_list *hooks, int count,
-				const char *lsm);
+			       const struct lsm_id *lsmid);
 
 #define LSM_FLAG_LEGACY_MAJOR	BIT(0)
 #define LSM_FLAG_EXCLUSIVE	BIT(1)
diff --git a/include/uapi/linux/lsm.h b/include/uapi/linux/lsm.h
new file mode 100644
index 000000000000..f27c9a9cc376
--- /dev/null
+++ b/include/uapi/linux/lsm.h
@@ -0,0 +1,54 @@
+/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
+/*
+ * Linux Security Modules (LSM) - User space API
+ *
+ * Copyright (C) 2022 Casey Schaufler <casey@schaufler-ca.com>
+ * Copyright (C) 2022 Intel Corporation
+ */
+
+#ifndef _UAPI_LINUX_LSM_H
+#define _UAPI_LINUX_LSM_H
+
+/*
+ * ID tokens to identify Linux Security Modules (LSMs)
+ *
+ * These token values are used to uniquely identify specific LSMs
+ * in the kernel as well as in the kernel's LSM userspace API.
+ *
+ * A value of zero/0 is considered undefined and should not be used
+ * outside the kernel. Values 1-99 are reserved for potential
+ * future use.
+ */
+#define LSM_ID_UNDEF		0
+#define LSM_ID_CAPABILITY	100
+#define LSM_ID_SELINUX		101
+#define LSM_ID_SMACK		102
+#define LSM_ID_TOMOYO		103
+#define LSM_ID_IMA		104
+#define LSM_ID_APPARMOR		105
+#define LSM_ID_YAMA		106
+#define LSM_ID_LOADPIN		107
+#define LSM_ID_SAFESETID	108
+#define LSM_ID_LOCKDOWN		109
+#define LSM_ID_BPF		110
+#define LSM_ID_LANDLOCK		111
+
+/*
+ * LSM_ATTR_XXX definitions identify different LSM attributes
+ * which are used in the kernel's LSM userspace API. Support
+ * for these attributes vary across the different LSMs. None
+ * are required.
+ *
+ * A value of zero/0 is considered undefined and should not be used
+ * outside the kernel. Values 1-99 are reserved for potential
+ * future use.
+ */
+#define LSM_ATTR_UNDEF		0
+#define LSM_ATTR_CURRENT	100
+#define LSM_ATTR_EXEC		101
+#define LSM_ATTR_FSCREATE	102
+#define LSM_ATTR_KEYCREATE	103
+#define LSM_ATTR_PREV		104
+#define LSM_ATTR_SOCKCREATE	105
+
+#endif /* _UAPI_LINUX_LSM_H */
diff --git a/security/apparmor/lsm.c b/security/apparmor/lsm.c
index 4981bdf02993..093da0a9dbd8 100644
--- a/security/apparmor/lsm.c
+++ b/security/apparmor/lsm.c
@@ -24,6 +24,7 @@
 #include <linux/zstd.h>
 #include <net/sock.h>
 #include <uapi/linux/mount.h>
+#include <uapi/linux/lsm.h>
 
 #include "include/apparmor.h"
 #include "include/apparmorfs.h"
@@ -1385,6 +1386,11 @@ struct lsm_blob_sizes apparmor_blob_sizes __ro_after_init = {
 	.lbs_task = sizeof(struct aa_task_ctx),
 };
 
+const struct lsm_id apparmor_lsmid = {
+	.name = "apparmor",
+	.id = LSM_ID_APPARMOR,
+};
+
 static struct security_hook_list apparmor_hooks[] __ro_after_init = {
 	LSM_HOOK_INIT(ptrace_access_check, apparmor_ptrace_access_check),
 	LSM_HOOK_INIT(ptrace_traceme, apparmor_ptrace_traceme),
@@ -2202,7 +2208,7 @@ static int __init apparmor_init(void)
 		goto buffers_out;
 	}
 	security_add_hooks(apparmor_hooks, ARRAY_SIZE(apparmor_hooks),
-				"apparmor");
+				&apparmor_lsmid);
 
 	/* Report that AppArmor successfully initialized */
 	apparmor_initialized = 1;
diff --git a/security/bpf/hooks.c b/security/bpf/hooks.c
index cfaf1d0e6a5f..91011e0c361a 100644
--- a/security/bpf/hooks.c
+++ b/security/bpf/hooks.c
@@ -5,6 +5,7 @@
  */
 #include <linux/lsm_hooks.h>
 #include <linux/bpf_lsm.h>
+#include <uapi/linux/lsm.h>
 
 static struct security_hook_list bpf_lsm_hooks[] __ro_after_init = {
 	#define LSM_HOOK(RET, DEFAULT, NAME, ...) \
@@ -15,9 +16,15 @@ static struct security_hook_list bpf_lsm_hooks[] __ro_after_init = {
 	LSM_HOOK_INIT(task_free, bpf_task_storage_free),
 };
 
+const struct lsm_id bpf_lsmid = {
+	.name = "bpf",
+	.id = LSM_ID_BPF,
+};
+
 static int __init bpf_lsm_init(void)
 {
-	security_add_hooks(bpf_lsm_hooks, ARRAY_SIZE(bpf_lsm_hooks), "bpf");
+	security_add_hooks(bpf_lsm_hooks, ARRAY_SIZE(bpf_lsm_hooks),
+			   &bpf_lsmid);
 	pr_info("LSM support for eBPF active\n");
 	return 0;
 }
diff --git a/security/commoncap.c b/security/commoncap.c
index 8e8c630ce204..a64c0c8592bb 100644
--- a/security/commoncap.c
+++ b/security/commoncap.c
@@ -25,6 +25,7 @@
 #include <linux/binfmts.h>
 #include <linux/personality.h>
 #include <linux/mnt_idmapping.h>
+#include <uapi/linux/lsm.h>
 
 /*
  * If a non-root user executes a setuid-root binary in
@@ -1440,6 +1441,11 @@ int cap_mmap_file(struct file *file, unsigned long reqprot,
 
 #ifdef CONFIG_SECURITY
 
+const struct lsm_id capability_lsmid = {
+	.name = "capability",
+	.id = LSM_ID_CAPABILITY,
+};
+
 static struct security_hook_list capability_hooks[] __ro_after_init = {
 	LSM_HOOK_INIT(capable, cap_capable),
 	LSM_HOOK_INIT(settime, cap_settime),
@@ -1464,7 +1470,7 @@ static struct security_hook_list capability_hooks[] __ro_after_init = {
 static int __init capability_init(void)
 {
 	security_add_hooks(capability_hooks, ARRAY_SIZE(capability_hooks),
-				"capability");
+			   &capability_lsmid);
 	return 0;
 }
 
diff --git a/security/landlock/cred.c b/security/landlock/cred.c
index 13dff2a31545..786af18c4a1c 100644
--- a/security/landlock/cred.c
+++ b/security/landlock/cred.c
@@ -42,5 +42,5 @@ static struct security_hook_list landlock_hooks[] __ro_after_init = {
 __init void landlock_add_cred_hooks(void)
 {
 	security_add_hooks(landlock_hooks, ARRAY_SIZE(landlock_hooks),
-			   LANDLOCK_NAME);
+			   &landlock_lsmid);
 }
diff --git a/security/landlock/fs.c b/security/landlock/fs.c
index bc7c126deea2..490655d09b43 100644
--- a/security/landlock/fs.c
+++ b/security/landlock/fs.c
@@ -1223,5 +1223,5 @@ static struct security_hook_list landlock_hooks[] __ro_after_init = {
 __init void landlock_add_fs_hooks(void)
 {
 	security_add_hooks(landlock_hooks, ARRAY_SIZE(landlock_hooks),
-			   LANDLOCK_NAME);
+			   &landlock_lsmid);
 }
diff --git a/security/landlock/net.c b/security/landlock/net.c
index aaa92c2b1f08..efa1b644a4af 100644
--- a/security/landlock/net.c
+++ b/security/landlock/net.c
@@ -196,5 +196,5 @@ static struct security_hook_list landlock_hooks[] __ro_after_init = {
 __init void landlock_add_net_hooks(void)
 {
 	security_add_hooks(landlock_hooks, ARRAY_SIZE(landlock_hooks),
-			   LANDLOCK_NAME);
+			   &landlock_lsmid);
 }
diff --git a/security/landlock/ptrace.c b/security/landlock/ptrace.c
index 8a06d6c492bf..2bfc533d36e4 100644
--- a/security/landlock/ptrace.c
+++ b/security/landlock/ptrace.c
@@ -116,5 +116,5 @@ static struct security_hook_list landlock_hooks[] __ro_after_init = {
 __init void landlock_add_ptrace_hooks(void)
 {
 	security_add_hooks(landlock_hooks, ARRAY_SIZE(landlock_hooks),
-			   LANDLOCK_NAME);
+			   &landlock_lsmid);
 }
diff --git a/security/landlock/setup.c b/security/landlock/setup.c
index 3e11d303542f..f6dd33143b7f 100644
--- a/security/landlock/setup.c
+++ b/security/landlock/setup.c
@@ -8,6 +8,7 @@
 
 #include <linux/init.h>
 #include <linux/lsm_hooks.h>
+#include <uapi/linux/lsm.h>
 
 #include "common.h"
 #include "cred.h"
@@ -25,6 +26,11 @@ struct lsm_blob_sizes landlock_blob_sizes __ro_after_init = {
 	.lbs_superblock = sizeof(struct landlock_superblock_security),
 };
 
+const struct lsm_id landlock_lsmid = {
+	.name = LANDLOCK_NAME,
+	.id = LSM_ID_LANDLOCK,
+};
+
 static int __init landlock_init(void)
 {
 	landlock_add_cred_hooks();
diff --git a/security/landlock/setup.h b/security/landlock/setup.h
index 1daffab1ab4b..c4252d46d49d 100644
--- a/security/landlock/setup.h
+++ b/security/landlock/setup.h
@@ -14,5 +14,6 @@
 extern bool landlock_initialized;
 
 extern struct lsm_blob_sizes landlock_blob_sizes;
+extern const struct lsm_id landlock_lsmid;
 
 #endif /* _SECURITY_LANDLOCK_SETUP_H */
diff --git a/security/loadpin/loadpin.c b/security/loadpin/loadpin.c
index a9d40456a064..d682a851de58 100644
--- a/security/loadpin/loadpin.c
+++ b/security/loadpin/loadpin.c
@@ -20,6 +20,7 @@
 #include <linux/string_helpers.h>
 #include <linux/dm-verity-loadpin.h>
 #include <uapi/linux/loadpin.h>
+#include <uapi/linux/lsm.h>
 
 #define VERITY_DIGEST_FILE_HEADER "# LOADPIN_TRUSTED_VERITY_ROOT_DIGESTS"
 
@@ -208,6 +209,11 @@ static int loadpin_load_data(enum kernel_load_data_id id, bool contents)
 	return loadpin_check(NULL, (enum kernel_read_file_id) id);
 }
 
+const struct lsm_id loadpin_lsmid = {
+	.name = "loadpin",
+	.id = LSM_ID_LOADPIN,
+};
+
 static struct security_hook_list loadpin_hooks[] __ro_after_init = {
 	LSM_HOOK_INIT(sb_free_security, loadpin_sb_free_security),
 	LSM_HOOK_INIT(kernel_read_file, loadpin_read_file),
@@ -259,7 +265,8 @@ static int __init loadpin_init(void)
 	if (!register_sysctl("kernel/loadpin", loadpin_sysctl_table))
 		pr_notice("sysctl registration failed!\n");
 #endif
-	security_add_hooks(loadpin_hooks, ARRAY_SIZE(loadpin_hooks), "loadpin");
+	security_add_hooks(loadpin_hooks, ARRAY_SIZE(loadpin_hooks),
+			   &loadpin_lsmid);
 
 	return 0;
 }
diff --git a/security/lockdown/lockdown.c b/security/lockdown/lockdown.c
index 68d19632aeb7..cd84d8ea1dfb 100644
--- a/security/lockdown/lockdown.c
+++ b/security/lockdown/lockdown.c
@@ -13,6 +13,7 @@
 #include <linux/security.h>
 #include <linux/export.h>
 #include <linux/lsm_hooks.h>
+#include <uapi/linux/lsm.h>
 
 static enum lockdown_reason kernel_locked_down;
 
@@ -75,6 +76,11 @@ static struct security_hook_list lockdown_hooks[] __ro_after_init = {
 	LSM_HOOK_INIT(locked_down, lockdown_is_locked_down),
 };
 
+const struct lsm_id lockdown_lsmid = {
+	.name = "lockdown",
+	.id = LSM_ID_LOCKDOWN,
+};
+
 static int __init lockdown_lsm_init(void)
 {
 #if defined(CONFIG_LOCK_DOWN_KERNEL_FORCE_INTEGRITY)
@@ -83,7 +89,7 @@ static int __init lockdown_lsm_init(void)
 	lock_kernel_down("Kernel configuration", LOCKDOWN_CONFIDENTIALITY_MAX);
 #endif
 	security_add_hooks(lockdown_hooks, ARRAY_SIZE(lockdown_hooks),
-			   "lockdown");
+			   &lockdown_lsmid);
 	return 0;
 }
 
diff --git a/security/safesetid/lsm.c b/security/safesetid/lsm.c
index 5be5894aa0ea..f42d5af5ffb0 100644
--- a/security/safesetid/lsm.c
+++ b/security/safesetid/lsm.c
@@ -19,6 +19,7 @@
 #include <linux/ptrace.h>
 #include <linux/sched/task_stack.h>
 #include <linux/security.h>
+#include <uapi/linux/lsm.h>
 #include "lsm.h"
 
 /* Flag indicating whether initialization completed */
@@ -261,6 +262,11 @@ static int safesetid_task_fix_setgroups(struct cred *new, const struct cred *old
 	return 0;
 }
 
+const struct lsm_id safesetid_lsmid = {
+	.name = "safesetid",
+	.id = LSM_ID_SAFESETID,
+};
+
 static struct security_hook_list safesetid_security_hooks[] = {
 	LSM_HOOK_INIT(task_fix_setuid, safesetid_task_fix_setuid),
 	LSM_HOOK_INIT(task_fix_setgid, safesetid_task_fix_setgid),
@@ -271,7 +277,8 @@ static struct security_hook_list safesetid_security_hooks[] = {
 static int __init safesetid_security_init(void)
 {
 	security_add_hooks(safesetid_security_hooks,
-			   ARRAY_SIZE(safesetid_security_hooks), "safesetid");
+			   ARRAY_SIZE(safesetid_security_hooks),
+			   &safesetid_lsmid);
 
 	/* Report that SafeSetID successfully initialized */
 	safesetid_initialized = 1;
diff --git a/security/security.c b/security/security.c
index dcb3e7014f9b..08b1bd9457a9 100644
--- a/security/security.c
+++ b/security/security.c
@@ -513,17 +513,17 @@ static int lsm_append(const char *new, char **result)
  * security_add_hooks - Add a modules hooks to the hook lists.
  * @hooks: the hooks to add
  * @count: the number of hooks to add
- * @lsm: the name of the security module
+ * @lsmid: the identification information for the security module
  *
  * Each LSM has to register its hooks with the infrastructure.
  */
 void __init security_add_hooks(struct security_hook_list *hooks, int count,
-			       const char *lsm)
+			       const struct lsm_id *lsmid)
 {
 	int i;
 
 	for (i = 0; i < count; i++) {
-		hooks[i].lsm = lsm;
+		hooks[i].lsmid = lsmid;
 		hlist_add_tail_rcu(&hooks[i].list, hooks[i].head);
 	}
 
@@ -532,7 +532,7 @@ void __init security_add_hooks(struct security_hook_list *hooks, int count,
 	 * and fix this up afterwards.
 	 */
 	if (slab_is_available()) {
-		if (lsm_append(lsm, &lsm_names) < 0)
+		if (lsm_append(lsmid->name, &lsm_names) < 0)
 			panic("%s - Cannot get early memory.\n", __func__);
 	}
 }
@@ -3817,7 +3817,7 @@ int security_getprocattr(struct task_struct *p, const char *lsm,
 	struct security_hook_list *hp;
 
 	hlist_for_each_entry(hp, &security_hook_heads.getprocattr, list) {
-		if (lsm != NULL && strcmp(lsm, hp->lsm))
+		if (lsm != NULL && strcmp(lsm, hp->lsmid->name))
 			continue;
 		return hp->hook.getprocattr(p, name, value);
 	}
@@ -3842,7 +3842,7 @@ int security_setprocattr(const char *lsm, const char *name, void *value,
 	struct security_hook_list *hp;
 
 	hlist_for_each_entry(hp, &security_hook_heads.setprocattr, list) {
-		if (lsm != NULL && strcmp(lsm, hp->lsm))
+		if (lsm != NULL && strcmp(lsm, hp->lsmid->name))
 			continue;
 		return hp->hook.setprocattr(name, value, size);
 	}
diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c
index feda711c6b7b..f2423dfd19cd 100644
--- a/security/selinux/hooks.c
+++ b/security/selinux/hooks.c
@@ -92,6 +92,7 @@
 #include <linux/fsnotify.h>
 #include <linux/fanotify.h>
 #include <linux/io_uring.h>
+#include <uapi/linux/lsm.h>
 
 #include "avc.h"
 #include "objsec.h"
@@ -6950,6 +6951,11 @@ static int selinux_uring_cmd(struct io_uring_cmd *ioucmd)
 }
 #endif /* CONFIG_IO_URING */
 
+const struct lsm_id selinux_lsmid = {
+	.name = "selinux",
+	.id = LSM_ID_SELINUX,
+};
+
 /*
  * IMPORTANT NOTE: When adding new hooks, please be careful to keep this order:
  * 1. any hooks that don't belong to (2.) or (3.) below,
@@ -7270,7 +7276,8 @@ static __init int selinux_init(void)
 
 	hashtab_cache_init();
 
-	security_add_hooks(selinux_hooks, ARRAY_SIZE(selinux_hooks), "selinux");
+	security_add_hooks(selinux_hooks, ARRAY_SIZE(selinux_hooks),
+			   &selinux_lsmid);
 
 	if (avc_add_callback(selinux_netcache_avc_callback, AVC_CALLBACK_RESET))
 		panic("SELinux: Unable to register AVC netcache callback\n");
diff --git a/security/smack/smack_lsm.c b/security/smack/smack_lsm.c
index 65130a791f57..f73f9a2834eb 100644
--- a/security/smack/smack_lsm.c
+++ b/security/smack/smack_lsm.c
@@ -43,6 +43,7 @@
 #include <linux/fs_parser.h>
 #include <linux/watch_queue.h>
 #include <linux/io_uring.h>
+#include <uapi/linux/lsm.h>
 #include "smack.h"
 
 #define TRANS_TRUE	"TRUE"
@@ -4933,6 +4934,11 @@ struct lsm_blob_sizes smack_blob_sizes __ro_after_init = {
 	.lbs_xattr_count = SMACK_INODE_INIT_XATTRS,
 };
 
+const struct lsm_id smack_lsmid = {
+	.name = "smack",
+	.id = LSM_ID_SMACK,
+};
+
 static struct security_hook_list smack_hooks[] __ro_after_init = {
 	LSM_HOOK_INIT(ptrace_access_check, smack_ptrace_access_check),
 	LSM_HOOK_INIT(ptrace_traceme, smack_ptrace_traceme),
@@ -5140,7 +5146,7 @@ static __init int smack_init(void)
 	/*
 	 * Register with LSM
 	 */
-	security_add_hooks(smack_hooks, ARRAY_SIZE(smack_hooks), "smack");
+	security_add_hooks(smack_hooks, ARRAY_SIZE(smack_hooks), &smack_lsmid);
 	smack_enabled = 1;
 
 	pr_info("Smack:  Initializing.\n");
diff --git a/security/tomoyo/tomoyo.c b/security/tomoyo/tomoyo.c
index 255f1b470295..722205433105 100644
--- a/security/tomoyo/tomoyo.c
+++ b/security/tomoyo/tomoyo.c
@@ -6,6 +6,7 @@
  */
 
 #include <linux/lsm_hooks.h>
+#include <uapi/linux/lsm.h>
 #include "common.h"
 
 /**
@@ -542,6 +543,11 @@ static void tomoyo_task_free(struct task_struct *task)
 	}
 }
 
+const struct lsm_id tomoyo_lsmid = {
+	.name = "tomoyo",
+	.id = LSM_ID_TOMOYO,
+};
+
 /*
  * tomoyo_security_ops is a "struct security_operations" which is used for
  * registering TOMOYO.
@@ -595,7 +601,8 @@ static int __init tomoyo_init(void)
 	struct tomoyo_task *s = tomoyo_task(current);
 
 	/* register ourselves with the security framework */
-	security_add_hooks(tomoyo_hooks, ARRAY_SIZE(tomoyo_hooks), "tomoyo");
+	security_add_hooks(tomoyo_hooks, ARRAY_SIZE(tomoyo_hooks),
+			   &tomoyo_lsmid);
 	pr_info("TOMOYO Linux initialized\n");
 	s->domain_info = &tomoyo_kernel_domain;
 	atomic_inc(&tomoyo_kernel_domain.users);
diff --git a/security/yama/yama_lsm.c b/security/yama/yama_lsm.c
index 2503cf153d4a..5cdff292fcae 100644
--- a/security/yama/yama_lsm.c
+++ b/security/yama/yama_lsm.c
@@ -18,6 +18,7 @@
 #include <linux/task_work.h>
 #include <linux/sched.h>
 #include <linux/spinlock.h>
+#include <uapi/linux/lsm.h>
 
 #define YAMA_SCOPE_DISABLED	0
 #define YAMA_SCOPE_RELATIONAL	1
@@ -421,6 +422,11 @@ static int yama_ptrace_traceme(struct task_struct *parent)
 	return rc;
 }
 
+const struct lsm_id yama_lsmid = {
+	.name = "yama",
+	.id = LSM_ID_YAMA,
+};
+
 static struct security_hook_list yama_hooks[] __ro_after_init = {
 	LSM_HOOK_INIT(ptrace_access_check, yama_ptrace_access_check),
 	LSM_HOOK_INIT(ptrace_traceme, yama_ptrace_traceme),
@@ -471,7 +477,7 @@ static inline void yama_init_sysctl(void) { }
 static int __init yama_init(void)
 {
 	pr_info("Yama: becoming mindful.\n");
-	security_add_hooks(yama_hooks, ARRAY_SIZE(yama_hooks), "yama");
+	security_add_hooks(yama_hooks, ARRAY_SIZE(yama_hooks), &yama_lsmid);
 	yama_init_sysctl();
 	return 0;
 }
-- 
cgit v1.2.3


From 9285c5ad9d00abfe0f4e2ce4039c8127e7a09738 Mon Sep 17 00:00:00 2001
From: Casey Schaufler <casey@schaufler-ca.com>
Date: Tue, 12 Sep 2023 13:56:47 -0700
Subject: LSM: Maintain a table of LSM attribute data

As LSMs are registered add their lsm_id pointers to a table.
This will be used later for attribute reporting.

Determine the number of possible security modules based on
their respective CONFIG options. This allows the number to be
known at build time. This allows data structures and tables
to use the constant.

Signed-off-by: Casey Schaufler <casey@schaufler-ca.com>
Reviewed-by: Kees Cook <keescook@chromium.org>
Reviewed-by: Serge Hallyn <serge@hallyn.com>
Reviewed-by: Mickael Salaun <mic@digikod.net>
Reviewed-by: John Johansen <john.johansen@canonical.com>
Signed-off-by: Paul Moore <paul@paul-moore.com>
---
 include/linux/security.h |  2 ++
 security/security.c      | 37 +++++++++++++++++++++++++++++++++++++
 2 files changed, 39 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/security.h b/include/linux/security.h
index 1d1df326c881..50c178019a58 100644
--- a/include/linux/security.h
+++ b/include/linux/security.h
@@ -138,6 +138,8 @@ enum lockdown_reason {
 };
 
 extern const char *const lockdown_reasons[LOCKDOWN_CONFIDENTIALITY_MAX+1];
+extern u32 lsm_active_cnt;
+extern const struct lsm_id *lsm_idlist[];
 
 /* These functions are in security/commoncap.c */
 extern int cap_capable(const struct cred *cred, struct user_namespace *ns,
diff --git a/security/security.c b/security/security.c
index 08b1bd9457a9..0952d6bff4da 100644
--- a/security/security.c
+++ b/security/security.c
@@ -34,6 +34,25 @@
 /* How many LSMs were built into the kernel? */
 #define LSM_COUNT (__end_lsm_info - __start_lsm_info)
 
+/*
+ * How many LSMs are built into the kernel as determined at
+ * build time. Used to determine fixed array sizes.
+ * The capability module is accounted for by CONFIG_SECURITY
+ */
+#define LSM_CONFIG_COUNT ( \
+	(IS_ENABLED(CONFIG_SECURITY) ? 1 : 0) + \
+	(IS_ENABLED(CONFIG_SECURITY_SELINUX) ? 1 : 0) + \
+	(IS_ENABLED(CONFIG_SECURITY_SMACK) ? 1 : 0) + \
+	(IS_ENABLED(CONFIG_SECURITY_TOMOYO) ? 1 : 0) + \
+	(IS_ENABLED(CONFIG_IMA) ? 1 : 0) + \
+	(IS_ENABLED(CONFIG_SECURITY_APPARMOR) ? 1 : 0) + \
+	(IS_ENABLED(CONFIG_SECURITY_YAMA) ? 1 : 0) + \
+	(IS_ENABLED(CONFIG_SECURITY_LOADPIN) ? 1 : 0) + \
+	(IS_ENABLED(CONFIG_SECURITY_SAFESETID) ? 1 : 0) + \
+	(IS_ENABLED(CONFIG_SECURITY_LOCKDOWN_LSM) ? 1 : 0) + \
+	(IS_ENABLED(CONFIG_BPF_LSM) ? 1 : 0) + \
+	(IS_ENABLED(CONFIG_SECURITY_LANDLOCK) ? 1 : 0))
+
 /*
  * These are descriptions of the reasons that can be passed to the
  * security_locked_down() LSM hook. Placing this array here allows
@@ -245,6 +264,12 @@ static void __init initialize_lsm(struct lsm_info *lsm)
 	}
 }
 
+/*
+ * Current index to use while initializing the lsm id list.
+ */
+u32 lsm_active_cnt __ro_after_init;
+const struct lsm_id *lsm_idlist[LSM_CONFIG_COUNT];
+
 /* Populate ordered LSMs list from comma-separated LSM name list. */
 static void __init ordered_lsm_parse(const char *order, const char *origin)
 {
@@ -522,6 +547,18 @@ void __init security_add_hooks(struct security_hook_list *hooks, int count,
 {
 	int i;
 
+	/*
+	 * A security module may call security_add_hooks() more
+	 * than once during initialization, and LSM initialization
+	 * is serialized. Landlock is one such case.
+	 * Look at the previous entry, if there is one, for duplication.
+	 */
+	if (lsm_active_cnt == 0 || lsm_idlist[lsm_active_cnt - 1] != lsmid) {
+		if (lsm_active_cnt >= LSM_CONFIG_COUNT)
+			panic("%s Too many LSMs registered.\n", __func__);
+		lsm_idlist[lsm_active_cnt++] = lsmid;
+	}
+
 	for (i = 0; i < count; i++) {
 		hooks[i].lsmid = lsmid;
 		hlist_add_tail_rcu(&hooks[i].list, hooks[i].head);
-- 
cgit v1.2.3


From 267c068e5f8b81b68cc4247c94dbba90a21a634e Mon Sep 17 00:00:00 2001
From: Casey Schaufler <casey@schaufler-ca.com>
Date: Tue, 12 Sep 2023 13:56:48 -0700
Subject: proc: Use lsmids instead of lsm names for attrs

Use the LSM ID number instead of the LSM name to identify which
security module's attibute data should be shown in /proc/self/attr.
The security_[gs]etprocattr() functions have been changed to expect
the LSM ID. The change from a string comparison to an integer comparison
in these functions will provide a minor performance improvement.

Cc: linux-fsdevel@vger.kernel.org
Signed-off-by: Casey Schaufler <casey@schaufler-ca.com>
Reviewed-by: Kees Cook <keescook@chromium.org>
Reviewed-by: Serge Hallyn <serge@hallyn.com>
Reviewed-by: Mickael Salaun <mic@digikod.net>
Reviewed-by: John Johansen <john.johansen@canonical.com>
Signed-off-by: Paul Moore <paul@paul-moore.com>
---
 fs/proc/base.c           | 29 +++++++++++++++--------------
 fs/proc/internal.h       |  2 +-
 include/linux/security.h | 11 +++++------
 security/security.c      | 15 +++++++--------
 4 files changed, 28 insertions(+), 29 deletions(-)

(limited to 'include/linux')

diff --git a/fs/proc/base.c b/fs/proc/base.c
index dd31e3b6bf77..98a031ac2648 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -97,6 +97,7 @@
 #include <linux/resctrl.h>
 #include <linux/cn_proc.h>
 #include <linux/ksm.h>
+#include <uapi/linux/lsm.h>
 #include <trace/events/oom.h>
 #include "internal.h"
 #include "fd.h"
@@ -146,10 +147,10 @@ struct pid_entry {
 	NOD(NAME, (S_IFREG|(MODE)),			\
 		NULL, &proc_single_file_operations,	\
 		{ .proc_show = show } )
-#define ATTR(LSM, NAME, MODE)				\
+#define ATTR(LSMID, NAME, MODE)				\
 	NOD(NAME, (S_IFREG|(MODE)),			\
 		NULL, &proc_pid_attr_operations,	\
-		{ .lsm = LSM })
+		{ .lsmid = LSMID })
 
 /*
  * Count the number of hardlinks for the pid_entry table, excluding the .
@@ -2726,7 +2727,7 @@ static ssize_t proc_pid_attr_read(struct file * file, char __user * buf,
 	if (!task)
 		return -ESRCH;
 
-	length = security_getprocattr(task, PROC_I(inode)->op.lsm,
+	length = security_getprocattr(task, PROC_I(inode)->op.lsmid,
 				      file->f_path.dentry->d_name.name,
 				      &p);
 	put_task_struct(task);
@@ -2784,7 +2785,7 @@ static ssize_t proc_pid_attr_write(struct file * file, const char __user * buf,
 	if (rv < 0)
 		goto out_free;
 
-	rv = security_setprocattr(PROC_I(inode)->op.lsm,
+	rv = security_setprocattr(PROC_I(inode)->op.lsmid,
 				  file->f_path.dentry->d_name.name, page,
 				  count);
 	mutex_unlock(&current->signal->cred_guard_mutex);
@@ -2833,27 +2834,27 @@ static const struct inode_operations proc_##LSM##_attr_dir_inode_ops = { \
 
 #ifdef CONFIG_SECURITY_SMACK
 static const struct pid_entry smack_attr_dir_stuff[] = {
-	ATTR("smack", "current",	0666),
+	ATTR(LSM_ID_SMACK, "current",	0666),
 };
 LSM_DIR_OPS(smack);
 #endif
 
 #ifdef CONFIG_SECURITY_APPARMOR
 static const struct pid_entry apparmor_attr_dir_stuff[] = {
-	ATTR("apparmor", "current",	0666),
-	ATTR("apparmor", "prev",	0444),
-	ATTR("apparmor", "exec",	0666),
+	ATTR(LSM_ID_APPARMOR, "current",	0666),
+	ATTR(LSM_ID_APPARMOR, "prev",		0444),
+	ATTR(LSM_ID_APPARMOR, "exec",		0666),
 };
 LSM_DIR_OPS(apparmor);
 #endif
 
 static const struct pid_entry attr_dir_stuff[] = {
-	ATTR(NULL, "current",		0666),
-	ATTR(NULL, "prev",		0444),
-	ATTR(NULL, "exec",		0666),
-	ATTR(NULL, "fscreate",		0666),
-	ATTR(NULL, "keycreate",		0666),
-	ATTR(NULL, "sockcreate",	0666),
+	ATTR(LSM_ID_UNDEF, "current",	0666),
+	ATTR(LSM_ID_UNDEF, "prev",		0444),
+	ATTR(LSM_ID_UNDEF, "exec",		0666),
+	ATTR(LSM_ID_UNDEF, "fscreate",	0666),
+	ATTR(LSM_ID_UNDEF, "keycreate",	0666),
+	ATTR(LSM_ID_UNDEF, "sockcreate",	0666),
 #ifdef CONFIG_SECURITY_SMACK
 	DIR("smack",			0555,
 	    proc_smack_attr_dir_inode_ops, proc_smack_attr_dir_ops),
diff --git a/fs/proc/internal.h b/fs/proc/internal.h
index 9a8f32f21ff5..a71ac5379584 100644
--- a/fs/proc/internal.h
+++ b/fs/proc/internal.h
@@ -92,7 +92,7 @@ union proc_op {
 	int (*proc_show)(struct seq_file *m,
 		struct pid_namespace *ns, struct pid *pid,
 		struct task_struct *task);
-	const char *lsm;
+	int lsmid;
 };
 
 struct proc_inode {
diff --git a/include/linux/security.h b/include/linux/security.h
index 50c178019a58..c81bca77f4f2 100644
--- a/include/linux/security.h
+++ b/include/linux/security.h
@@ -472,10 +472,9 @@ int security_sem_semctl(struct kern_ipc_perm *sma, int cmd);
 int security_sem_semop(struct kern_ipc_perm *sma, struct sembuf *sops,
 			unsigned nsops, int alter);
 void security_d_instantiate(struct dentry *dentry, struct inode *inode);
-int security_getprocattr(struct task_struct *p, const char *lsm, const char *name,
+int security_getprocattr(struct task_struct *p, int lsmid, const char *name,
 			 char **value);
-int security_setprocattr(const char *lsm, const char *name, void *value,
-			 size_t size);
+int security_setprocattr(int lsmid, const char *name, void *value, size_t size);
 int security_netlink_send(struct sock *sk, struct sk_buff *skb);
 int security_ismaclabel(const char *name);
 int security_secid_to_secctx(u32 secid, char **secdata, u32 *seclen);
@@ -1339,14 +1338,14 @@ static inline void security_d_instantiate(struct dentry *dentry,
 					  struct inode *inode)
 { }
 
-static inline int security_getprocattr(struct task_struct *p, const char *lsm,
+static inline int security_getprocattr(struct task_struct *p, int lsmid,
 				       const char *name, char **value)
 {
 	return -EINVAL;
 }
 
-static inline int security_setprocattr(const char *lsm, char *name,
-				       void *value, size_t size)
+static inline int security_setprocattr(int lsmid, char *name, void *value,
+				       size_t size)
 {
 	return -EINVAL;
 }
diff --git a/security/security.c b/security/security.c
index 0952d6bff4da..c66f9faefa40 100644
--- a/security/security.c
+++ b/security/security.c
@@ -3840,7 +3840,7 @@ EXPORT_SYMBOL(security_d_instantiate);
 /**
  * security_getprocattr() - Read an attribute for a task
  * @p: the task
- * @lsm: LSM name
+ * @lsmid: LSM identification
  * @name: attribute name
  * @value: attribute value
  *
@@ -3848,13 +3848,13 @@ EXPORT_SYMBOL(security_d_instantiate);
  *
  * Return: Returns the length of @value on success, a negative value otherwise.
  */
-int security_getprocattr(struct task_struct *p, const char *lsm,
-			 const char *name, char **value)
+int security_getprocattr(struct task_struct *p, int lsmid, const char *name,
+			 char **value)
 {
 	struct security_hook_list *hp;
 
 	hlist_for_each_entry(hp, &security_hook_heads.getprocattr, list) {
-		if (lsm != NULL && strcmp(lsm, hp->lsmid->name))
+		if (lsmid != 0 && lsmid != hp->lsmid->id)
 			continue;
 		return hp->hook.getprocattr(p, name, value);
 	}
@@ -3863,7 +3863,7 @@ int security_getprocattr(struct task_struct *p, const char *lsm,
 
 /**
  * security_setprocattr() - Set an attribute for a task
- * @lsm: LSM name
+ * @lsmid: LSM identification
  * @name: attribute name
  * @value: attribute value
  * @size: attribute value size
@@ -3873,13 +3873,12 @@ int security_getprocattr(struct task_struct *p, const char *lsm,
  *
  * Return: Returns bytes written on success, a negative value otherwise.
  */
-int security_setprocattr(const char *lsm, const char *name, void *value,
-			 size_t size)
+int security_setprocattr(int lsmid, const char *name, void *value, size_t size)
 {
 	struct security_hook_list *hp;
 
 	hlist_for_each_entry(hp, &security_hook_heads.setprocattr, list) {
-		if (lsm != NULL && strcmp(lsm, hp->lsmid->name))
+		if (lsmid != 0 && lsmid != hp->lsmid->id)
 			continue;
 		return hp->hook.setprocattr(name, value, size);
 	}
-- 
cgit v1.2.3


From a04a1198088a1378d0389c250cc684f649bcc91e Mon Sep 17 00:00:00 2001
From: Casey Schaufler <casey@schaufler-ca.com>
Date: Tue, 12 Sep 2023 13:56:49 -0700
Subject: LSM: syscalls for current process attributes

Create a system call lsm_get_self_attr() to provide the security
module maintained attributes of the current process.
Create a system call lsm_set_self_attr() to set a security
module maintained attribute of the current process.
Historically these attributes have been exposed to user space via
entries in procfs under /proc/self/attr.

The attribute value is provided in a lsm_ctx structure. The structure
identifies the size of the attribute, and the attribute value. The format
of the attribute value is defined by the security module. A flags field
is included for LSM specific information. It is currently unused and must
be 0. The total size of the data, including the lsm_ctx structure and any
padding, is maintained as well.

struct lsm_ctx {
        __u64 id;
        __u64 flags;
        __u64 len;
        __u64 ctx_len;
        __u8 ctx[];
};

Two new LSM hooks are used to interface with the LSMs.
security_getselfattr() collects the lsm_ctx values from the
LSMs that support the hook, accounting for space requirements.
security_setselfattr() identifies which LSM the attribute is
intended for and passes it along.

Signed-off-by: Casey Schaufler <casey@schaufler-ca.com>
Reviewed-by: Kees Cook <keescook@chromium.org>
Reviewed-by: Serge Hallyn <serge@hallyn.com>
Reviewed-by: John Johansen <john.johansen@canonical.com>
Signed-off-by: Paul Moore <paul@paul-moore.com>
---
 Documentation/userspace-api/lsm.rst |  70 +++++++++++++++++
 include/linux/lsm_hook_defs.h       |   4 +
 include/linux/lsm_hooks.h           |   1 +
 include/linux/security.h            |  19 +++++
 include/linux/syscalls.h            |   5 ++
 include/uapi/linux/lsm.h            |  36 +++++++++
 kernel/sys_ni.c                     |   2 +
 security/Makefile                   |   1 +
 security/lsm_syscalls.c             |  57 ++++++++++++++
 security/security.c                 | 152 ++++++++++++++++++++++++++++++++++++
 10 files changed, 347 insertions(+)
 create mode 100644 Documentation/userspace-api/lsm.rst
 create mode 100644 security/lsm_syscalls.c

(limited to 'include/linux')

diff --git a/Documentation/userspace-api/lsm.rst b/Documentation/userspace-api/lsm.rst
new file mode 100644
index 000000000000..f8499f3e2826
--- /dev/null
+++ b/Documentation/userspace-api/lsm.rst
@@ -0,0 +1,70 @@
+.. SPDX-License-Identifier: GPL-2.0
+.. Copyright (C) 2022 Casey Schaufler <casey@schaufler-ca.com>
+.. Copyright (C) 2022 Intel Corporation
+
+=====================================
+Linux Security Modules
+=====================================
+
+:Author: Casey Schaufler
+:Date: July 2023
+
+Linux security modules (LSM) provide a mechanism to implement
+additional access controls to the Linux security policies.
+
+The various security modules may support any of these attributes:
+
+``LSM_ATTR_CURRENT`` is the current, active security context of the
+process.
+The proc filesystem provides this value in ``/proc/self/attr/current``.
+This is supported by the SELinux, Smack and AppArmor security modules.
+Smack also provides this value in ``/proc/self/attr/smack/current``.
+AppArmor also provides this value in ``/proc/self/attr/apparmor/current``.
+
+``LSM_ATTR_EXEC`` is the security context of the process at the time the
+current image was executed.
+The proc filesystem provides this value in ``/proc/self/attr/exec``.
+This is supported by the SELinux and AppArmor security modules.
+AppArmor also provides this value in ``/proc/self/attr/apparmor/exec``.
+
+``LSM_ATTR_FSCREATE`` is the security context of the process used when
+creating file system objects.
+The proc filesystem provides this value in ``/proc/self/attr/fscreate``.
+This is supported by the SELinux security module.
+
+``LSM_ATTR_KEYCREATE`` is the security context of the process used when
+creating key objects.
+The proc filesystem provides this value in ``/proc/self/attr/keycreate``.
+This is supported by the SELinux security module.
+
+``LSM_ATTR_PREV`` is the security context of the process at the time the
+current security context was set.
+The proc filesystem provides this value in ``/proc/self/attr/prev``.
+This is supported by the SELinux and AppArmor security modules.
+AppArmor also provides this value in ``/proc/self/attr/apparmor/prev``.
+
+``LSM_ATTR_SOCKCREATE`` is the security context of the process used when
+creating socket objects.
+The proc filesystem provides this value in ``/proc/self/attr/sockcreate``.
+This is supported by the SELinux security module.
+
+Kernel interface
+================
+
+Set a security attribute of the current process
+-----------------------------------------------
+
+.. kernel-doc:: security/lsm_syscalls.c
+    :identifiers: sys_lsm_set_self_attr
+
+Get the specified security attributes of the current process
+------------------------------------------------------------
+
+.. kernel-doc:: security/lsm_syscalls.c
+    :identifiers: sys_lsm_get_self_attr
+
+Additional documentation
+========================
+
+* Documentation/security/lsm.rst
+* Documentation/security/lsm-development.rst
diff --git a/include/linux/lsm_hook_defs.h b/include/linux/lsm_hook_defs.h
index ff217a5ce552..c925a0d26edf 100644
--- a/include/linux/lsm_hook_defs.h
+++ b/include/linux/lsm_hook_defs.h
@@ -262,6 +262,10 @@ LSM_HOOK(int, 0, sem_semop, struct kern_ipc_perm *perm, struct sembuf *sops,
 LSM_HOOK(int, 0, netlink_send, struct sock *sk, struct sk_buff *skb)
 LSM_HOOK(void, LSM_RET_VOID, d_instantiate, struct dentry *dentry,
 	 struct inode *inode)
+LSM_HOOK(int, -EOPNOTSUPP, getselfattr, unsigned int attr,
+	 struct lsm_ctx __user *ctx, size_t *size, u32 flags)
+LSM_HOOK(int, -EOPNOTSUPP, setselfattr, unsigned int attr,
+	 struct lsm_ctx *ctx, size_t size, u32 flags)
 LSM_HOOK(int, -EINVAL, getprocattr, struct task_struct *p, const char *name,
 	 char **value)
 LSM_HOOK(int, -EINVAL, setprocattr, const char *name, void *value, size_t size)
diff --git a/include/linux/lsm_hooks.h b/include/linux/lsm_hooks.h
index 7f0adb33caaa..a2ade0ffe9e7 100644
--- a/include/linux/lsm_hooks.h
+++ b/include/linux/lsm_hooks.h
@@ -25,6 +25,7 @@
 #ifndef __LINUX_LSM_HOOKS_H
 #define __LINUX_LSM_HOOKS_H
 
+#include <uapi/linux/lsm.h>
 #include <linux/security.h>
 #include <linux/init.h>
 #include <linux/rculist.h>
diff --git a/include/linux/security.h b/include/linux/security.h
index c81bca77f4f2..dd1fe487385d 100644
--- a/include/linux/security.h
+++ b/include/linux/security.h
@@ -60,6 +60,7 @@ struct fs_parameter;
 enum fs_value_type;
 struct watch;
 struct watch_notification;
+struct lsm_ctx;
 
 /* Default (no) options for the capable function */
 #define CAP_OPT_NONE 0x0
@@ -472,6 +473,10 @@ int security_sem_semctl(struct kern_ipc_perm *sma, int cmd);
 int security_sem_semop(struct kern_ipc_perm *sma, struct sembuf *sops,
 			unsigned nsops, int alter);
 void security_d_instantiate(struct dentry *dentry, struct inode *inode);
+int security_getselfattr(unsigned int attr, struct lsm_ctx __user *ctx,
+			 size_t __user *size, u32 flags);
+int security_setselfattr(unsigned int attr, struct lsm_ctx __user *ctx,
+			 size_t size, u32 flags);
 int security_getprocattr(struct task_struct *p, int lsmid, const char *name,
 			 char **value);
 int security_setprocattr(int lsmid, const char *name, void *value, size_t size);
@@ -1338,6 +1343,20 @@ static inline void security_d_instantiate(struct dentry *dentry,
 					  struct inode *inode)
 { }
 
+static inline int security_getselfattr(unsigned int attr,
+				       struct lsm_ctx __user *ctx,
+				       size_t __user *size, u32 flags)
+{
+	return -EOPNOTSUPP;
+}
+
+static inline int security_setselfattr(unsigned int attr,
+				       struct lsm_ctx __user *ctx,
+				       size_t size, u32 flags)
+{
+	return -EOPNOTSUPP;
+}
+
 static inline int security_getprocattr(struct task_struct *p, int lsmid,
 				       const char *name, char **value)
 {
diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index fd9d12de7e92..4e1e56a24f1e 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -71,6 +71,7 @@ struct clone_args;
 struct open_how;
 struct mount_attr;
 struct landlock_ruleset_attr;
+struct lsm_ctx;
 enum landlock_rule_type;
 struct cachestat_range;
 struct cachestat;
@@ -949,6 +950,10 @@ asmlinkage long sys_cachestat(unsigned int fd,
 		struct cachestat_range __user *cstat_range,
 		struct cachestat __user *cstat, unsigned int flags);
 asmlinkage long sys_map_shadow_stack(unsigned long addr, unsigned long size, unsigned int flags);
+asmlinkage long sys_lsm_get_self_attr(unsigned int attr, struct lsm_ctx *ctx,
+				      size_t *size, __u32 flags);
+asmlinkage long sys_lsm_set_self_attr(unsigned int attr, struct lsm_ctx *ctx,
+				      size_t size, __u32 flags);
 
 /*
  * Architecture-specific system calls
diff --git a/include/uapi/linux/lsm.h b/include/uapi/linux/lsm.h
index f27c9a9cc376..eeda59a77c02 100644
--- a/include/uapi/linux/lsm.h
+++ b/include/uapi/linux/lsm.h
@@ -9,6 +9,36 @@
 #ifndef _UAPI_LINUX_LSM_H
 #define _UAPI_LINUX_LSM_H
 
+#include <linux/types.h>
+#include <linux/unistd.h>
+
+/**
+ * struct lsm_ctx - LSM context information
+ * @id: the LSM id number, see LSM_ID_XXX
+ * @flags: LSM specific flags
+ * @len: length of the lsm_ctx struct, @ctx and any other data or padding
+ * @ctx_len: the size of @ctx
+ * @ctx: the LSM context value
+ *
+ * The @len field MUST be equal to the size of the lsm_ctx struct
+ * plus any additional padding and/or data placed after @ctx.
+ *
+ * In all cases @ctx_len MUST be equal to the length of @ctx.
+ * If @ctx is a string value it should be nul terminated with
+ * @ctx_len equal to `strlen(@ctx) + 1`.  Binary values are
+ * supported.
+ *
+ * The @flags and @ctx fields SHOULD only be interpreted by the
+ * LSM specified by @id; they MUST be set to zero/0 when not used.
+ */
+struct lsm_ctx {
+	__u64 id;
+	__u64 flags;
+	__u64 len;
+	__u64 ctx_len;
+	__u8 ctx[];
+};
+
 /*
  * ID tokens to identify Linux Security Modules (LSMs)
  *
@@ -51,4 +81,10 @@
 #define LSM_ATTR_PREV		104
 #define LSM_ATTR_SOCKCREATE	105
 
+/*
+ * LSM_FLAG_XXX definitions identify special handling instructions
+ * for the API.
+ */
+#define LSM_FLAG_SINGLE	0x0001
+
 #endif /* _UAPI_LINUX_LSM_H */
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index e1a6e3c675c0..1f61b8452a6e 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -171,6 +171,8 @@ COND_SYSCALL(landlock_add_rule);
 COND_SYSCALL(landlock_restrict_self);
 COND_SYSCALL(fadvise64_64);
 COND_SYSCALL_COMPAT(fadvise64_64);
+COND_SYSCALL(lsm_get_self_attr);
+COND_SYSCALL(lsm_set_self_attr);
 
 /* CONFIG_MMU only */
 COND_SYSCALL(swapon);
diff --git a/security/Makefile b/security/Makefile
index 18121f8f85cd..59f238490665 100644
--- a/security/Makefile
+++ b/security/Makefile
@@ -7,6 +7,7 @@ obj-$(CONFIG_KEYS)			+= keys/
 
 # always enable default capabilities
 obj-y					+= commoncap.o
+obj-$(CONFIG_SECURITY) 			+= lsm_syscalls.o
 obj-$(CONFIG_MMU)			+= min_addr.o
 
 # Object file lists
diff --git a/security/lsm_syscalls.c b/security/lsm_syscalls.c
new file mode 100644
index 000000000000..226ae80d9683
--- /dev/null
+++ b/security/lsm_syscalls.c
@@ -0,0 +1,57 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * System calls implementing the Linux Security Module API.
+ *
+ *  Copyright (C) 2022 Casey Schaufler <casey@schaufler-ca.com>
+ *  Copyright (C) 2022 Intel Corporation
+ */
+
+#include <asm/current.h>
+#include <linux/compiler_types.h>
+#include <linux/err.h>
+#include <linux/errno.h>
+#include <linux/security.h>
+#include <linux/stddef.h>
+#include <linux/syscalls.h>
+#include <linux/types.h>
+#include <linux/lsm_hooks.h>
+#include <uapi/linux/lsm.h>
+
+/**
+ * sys_lsm_set_self_attr - Set current task's security module attribute
+ * @attr: which attribute to set
+ * @ctx: the LSM contexts
+ * @size: size of @ctx
+ * @flags: reserved for future use
+ *
+ * Sets the calling task's LSM context. On success this function
+ * returns 0. If the attribute specified cannot be set a negative
+ * value indicating the reason for the error is returned.
+ */
+SYSCALL_DEFINE4(lsm_set_self_attr, unsigned int, attr, struct lsm_ctx __user *,
+		ctx, size_t, size, u32, flags)
+{
+	return security_setselfattr(attr, ctx, size, flags);
+}
+
+/**
+ * sys_lsm_get_self_attr - Return current task's security module attributes
+ * @attr: which attribute to return
+ * @ctx: the user-space destination for the information, or NULL
+ * @size: pointer to the size of space available to receive the data
+ * @flags: special handling options. LSM_FLAG_SINGLE indicates that only
+ * attributes associated with the LSM identified in the passed @ctx be
+ * reported.
+ *
+ * Returns the calling task's LSM contexts. On success this
+ * function returns the number of @ctx array elements. This value
+ * may be zero if there are no LSM contexts assigned. If @size is
+ * insufficient to contain the return data -E2BIG is returned and
+ * @size is set to the minimum required size. In all other cases
+ * a negative value indicating the error is returned.
+ */
+SYSCALL_DEFINE4(lsm_get_self_attr, unsigned int, attr, struct lsm_ctx __user *,
+		ctx, size_t __user *, size, u32, flags)
+{
+	return security_getselfattr(attr, ctx, size, flags);
+}
diff --git a/security/security.c b/security/security.c
index c66f9faefa40..9757d009113f 100644
--- a/security/security.c
+++ b/security/security.c
@@ -3837,6 +3837,158 @@ void security_d_instantiate(struct dentry *dentry, struct inode *inode)
 }
 EXPORT_SYMBOL(security_d_instantiate);
 
+/*
+ * Please keep this in sync with it's counterpart in security/lsm_syscalls.c
+ */
+
+/**
+ * security_getselfattr - Read an LSM attribute of the current process.
+ * @attr: which attribute to return
+ * @uctx: the user-space destination for the information, or NULL
+ * @size: pointer to the size of space available to receive the data
+ * @flags: special handling options. LSM_FLAG_SINGLE indicates that only
+ * attributes associated with the LSM identified in the passed @ctx be
+ * reported.
+ *
+ * A NULL value for @uctx can be used to get both the number of attributes
+ * and the size of the data.
+ *
+ * Returns the number of attributes found on success, negative value
+ * on error. @size is reset to the total size of the data.
+ * If @size is insufficient to contain the data -E2BIG is returned.
+ */
+int security_getselfattr(unsigned int attr, struct lsm_ctx __user *uctx,
+			 size_t __user *size, u32 flags)
+{
+	struct security_hook_list *hp;
+	struct lsm_ctx lctx = { .id = LSM_ID_UNDEF, };
+	u8 __user *base = (u8 __user *)uctx;
+	size_t total = 0;
+	size_t entrysize;
+	size_t left;
+	bool toobig = false;
+	bool single = false;
+	int count = 0;
+	int rc;
+
+	if (attr == LSM_ATTR_UNDEF)
+		return -EINVAL;
+	if (size == NULL)
+		return -EINVAL;
+	if (get_user(left, size))
+		return -EFAULT;
+
+	if (flags) {
+		/*
+		 * Only flag supported is LSM_FLAG_SINGLE
+		 */
+		if (flags != LSM_FLAG_SINGLE)
+			return -EINVAL;
+		if (uctx && copy_from_user(&lctx, uctx, sizeof(lctx)))
+			return -EFAULT;
+		/*
+		 * If the LSM ID isn't specified it is an error.
+		 */
+		if (lctx.id == LSM_ID_UNDEF)
+			return -EINVAL;
+		single = true;
+	}
+
+	/*
+	 * In the usual case gather all the data from the LSMs.
+	 * In the single case only get the data from the LSM specified.
+	 */
+	hlist_for_each_entry(hp, &security_hook_heads.getselfattr, list) {
+		if (single && lctx.id != hp->lsmid->id)
+			continue;
+		entrysize = left;
+		if (base)
+			uctx = (struct lsm_ctx __user *)(base + total);
+		rc = hp->hook.getselfattr(attr, uctx, &entrysize, flags);
+		if (rc == -EOPNOTSUPP) {
+			rc = 0;
+			continue;
+		}
+		if (rc == -E2BIG) {
+			toobig = true;
+			left = 0;
+		} else if (rc < 0)
+			return rc;
+		else
+			left -= entrysize;
+
+		total += entrysize;
+		count += rc;
+		if (single)
+			break;
+	}
+	if (put_user(total, size))
+		return -EFAULT;
+	if (toobig)
+		return -E2BIG;
+	if (count == 0)
+		return LSM_RET_DEFAULT(getselfattr);
+	return count;
+}
+
+/*
+ * Please keep this in sync with it's counterpart in security/lsm_syscalls.c
+ */
+
+/**
+ * security_setselfattr - Set an LSM attribute on the current process.
+ * @attr: which attribute to set
+ * @uctx: the user-space source for the information
+ * @size: the size of the data
+ * @flags: reserved for future use, must be 0
+ *
+ * Set an LSM attribute for the current process. The LSM, attribute
+ * and new value are included in @uctx.
+ *
+ * Returns 0 on success, -EINVAL if the input is inconsistent, -EFAULT
+ * if the user buffer is inaccessible, E2BIG if size is too big, or an
+ * LSM specific failure.
+ */
+int security_setselfattr(unsigned int attr, struct lsm_ctx __user *uctx,
+			 size_t size, u32 flags)
+{
+	struct security_hook_list *hp;
+	struct lsm_ctx *lctx;
+	int rc = LSM_RET_DEFAULT(setselfattr);
+
+	if (flags)
+		return -EINVAL;
+	if (size < sizeof(*lctx))
+		return -EINVAL;
+	if (size > PAGE_SIZE)
+		return -E2BIG;
+
+	lctx = kmalloc(size, GFP_KERNEL);
+	if (lctx == NULL)
+		return -ENOMEM;
+
+	if (copy_from_user(lctx, uctx, size)) {
+		rc = -EFAULT;
+		goto free_out;
+	}
+
+	if (size < lctx->len || size < lctx->ctx_len + sizeof(*lctx) ||
+	    lctx->len < lctx->ctx_len + sizeof(*lctx)) {
+		rc = -EINVAL;
+		goto free_out;
+	}
+
+	hlist_for_each_entry(hp, &security_hook_heads.setselfattr, list)
+		if ((hp->lsmid->id) == lctx->id) {
+			rc = hp->hook.setselfattr(attr, lctx, size, flags);
+			break;
+		}
+
+free_out:
+	kfree(lctx);
+	return rc;
+}
+
 /**
  * security_getprocattr() - Read an attribute for a task
  * @p: the task
-- 
cgit v1.2.3


From ad4aff9ec25f400608283c10d634cc4eeda83a02 Mon Sep 17 00:00:00 2001
From: Casey Schaufler <casey@schaufler-ca.com>
Date: Tue, 12 Sep 2023 13:56:50 -0700
Subject: LSM: Create lsm_list_modules system call
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Create a system call to report the list of Linux Security Modules
that are active on the system. The list is provided as an array
of LSM ID numbers.

The calling application can use this list determine what LSM
specific actions it might take. That might include choosing an
output format, determining required privilege or bypassing
security module specific behavior.

Signed-off-by: Casey Schaufler <casey@schaufler-ca.com>
Reviewed-by: Kees Cook <keescook@chromium.org>
Reviewed-by: Serge Hallyn <serge@hallyn.com>
Reviewed-by: John Johansen <john.johansen@canonical.com>
Reviewed-by: Mickaël Salaün <mic@digikod.net>
Signed-off-by: Paul Moore <paul@paul-moore.com>
---
 Documentation/userspace-api/lsm.rst |  3 +++
 include/linux/syscalls.h            |  1 +
 kernel/sys_ni.c                     |  1 +
 security/lsm_syscalls.c             | 39 +++++++++++++++++++++++++++++++++++++
 4 files changed, 44 insertions(+)

(limited to 'include/linux')

diff --git a/Documentation/userspace-api/lsm.rst b/Documentation/userspace-api/lsm.rst
index f8499f3e2826..a76da373841b 100644
--- a/Documentation/userspace-api/lsm.rst
+++ b/Documentation/userspace-api/lsm.rst
@@ -63,6 +63,9 @@ Get the specified security attributes of the current process
 .. kernel-doc:: security/lsm_syscalls.c
     :identifiers: sys_lsm_get_self_attr
 
+.. kernel-doc:: security/lsm_syscalls.c
+    :identifiers: sys_lsm_list_modules
+
 Additional documentation
 ========================
 
diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index 4e1e56a24f1e..feec5719750b 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -954,6 +954,7 @@ asmlinkage long sys_lsm_get_self_attr(unsigned int attr, struct lsm_ctx *ctx,
 				      size_t *size, __u32 flags);
 asmlinkage long sys_lsm_set_self_attr(unsigned int attr, struct lsm_ctx *ctx,
 				      size_t size, __u32 flags);
+asmlinkage long sys_lsm_list_modules(u64 *ids, size_t *size, u32 flags);
 
 /*
  * Architecture-specific system calls
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index 1f61b8452a6e..9fa5989bf2ce 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -173,6 +173,7 @@ COND_SYSCALL(fadvise64_64);
 COND_SYSCALL_COMPAT(fadvise64_64);
 COND_SYSCALL(lsm_get_self_attr);
 COND_SYSCALL(lsm_set_self_attr);
+COND_SYSCALL(lsm_list_modules);
 
 /* CONFIG_MMU only */
 COND_SYSCALL(swapon);
diff --git a/security/lsm_syscalls.c b/security/lsm_syscalls.c
index 226ae80d9683..329aaca5efc0 100644
--- a/security/lsm_syscalls.c
+++ b/security/lsm_syscalls.c
@@ -55,3 +55,42 @@ SYSCALL_DEFINE4(lsm_get_self_attr, unsigned int, attr, struct lsm_ctx __user *,
 {
 	return security_getselfattr(attr, ctx, size, flags);
 }
+
+/**
+ * sys_lsm_list_modules - Return a list of the active security modules
+ * @ids: the LSM module ids
+ * @size: pointer to size of @ids, updated on return
+ * @flags: reserved for future use, must be zero
+ *
+ * Returns a list of the active LSM ids. On success this function
+ * returns the number of @ids array elements. This value may be zero
+ * if there are no LSMs active. If @size is insufficient to contain
+ * the return data -E2BIG is returned and @size is set to the minimum
+ * required size. In all other cases a negative value indicating the
+ * error is returned.
+ */
+SYSCALL_DEFINE3(lsm_list_modules, u64 __user *, ids, size_t __user *, size,
+		u32, flags)
+{
+	size_t total_size = lsm_active_cnt * sizeof(*ids);
+	size_t usize;
+	int i;
+
+	if (flags)
+		return -EINVAL;
+
+	if (get_user(usize, size))
+		return -EFAULT;
+
+	if (put_user(total_size, size) != 0)
+		return -EFAULT;
+
+	if (usize < total_size)
+		return -E2BIG;
+
+	for (i = 0; i < lsm_active_cnt; i++)
+		if (put_user(lsm_idlist[i]->id, ids++))
+			return -EFAULT;
+
+	return lsm_active_cnt;
+}
-- 
cgit v1.2.3


From e1ca7129db2c3b3c4d261702905a752e6b2710b4 Mon Sep 17 00:00:00 2001
From: Casey Schaufler <casey@schaufler-ca.com>
Date: Tue, 12 Sep 2023 13:56:52 -0700
Subject: LSM: Helpers for attribute names and filling lsm_ctx
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Add lsm_name_to_attr(), which translates a text string to a
LSM_ATTR value if one is available.

Add lsm_fill_user_ctx(), which fills a struct lsm_ctx, including
the trailing attribute value.

Both are used in module specific components of LSM system calls.

Signed-off-by: Casey Schaufler <casey@schaufler-ca.com>
Reviewed-by: John Johansen <john.johansen@canonical.com>
Reviewed-by: Serge Hallyn <serge@hallyn.com>
Reviewed-by: Mickaël Salaün <mic@digikod.net>
Signed-off-by: Paul Moore <paul@paul-moore.com>
---
 include/linux/security.h | 14 ++++++++++++++
 security/lsm_syscalls.c  | 24 ++++++++++++++++++++++++
 security/security.c      | 41 +++++++++++++++++++++++++++++++++++++++++
 3 files changed, 79 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/security.h b/include/linux/security.h
index dd1fe487385d..334f75aa7289 100644
--- a/include/linux/security.h
+++ b/include/linux/security.h
@@ -32,6 +32,7 @@
 #include <linux/string.h>
 #include <linux/mm.h>
 #include <linux/sockptr.h>
+#include <uapi/linux/lsm.h>
 
 struct linux_binprm;
 struct cred;
@@ -264,6 +265,7 @@ int unregister_blocking_lsm_notifier(struct notifier_block *nb);
 /* prototypes */
 extern int security_init(void);
 extern int early_security_init(void);
+extern u64 lsm_name_to_attr(const char *name);
 
 /* Security operations */
 int security_binder_set_context_mgr(const struct cred *mgr);
@@ -490,6 +492,8 @@ int security_inode_notifysecctx(struct inode *inode, void *ctx, u32 ctxlen);
 int security_inode_setsecctx(struct dentry *dentry, void *ctx, u32 ctxlen);
 int security_inode_getsecctx(struct inode *inode, void **ctx, u32 *ctxlen);
 int security_locked_down(enum lockdown_reason what);
+int lsm_fill_user_ctx(struct lsm_ctx __user *ctx, void *context,
+		      size_t context_size, u64 id, u64 flags);
 #else /* CONFIG_SECURITY */
 
 static inline int call_blocking_lsm_notifier(enum lsm_event event, void *data)
@@ -507,6 +511,11 @@ static inline  int unregister_blocking_lsm_notifier(struct notifier_block *nb)
 	return 0;
 }
 
+static inline u64 lsm_name_to_attr(const char *name)
+{
+	return LSM_ATTR_UNDEF;
+}
+
 static inline void security_free_mnt_opts(void **mnt_opts)
 {
 }
@@ -1415,6 +1424,11 @@ static inline int security_locked_down(enum lockdown_reason what)
 {
 	return 0;
 }
+static inline int lsm_fill_user_ctx(struct lsm_ctx __user *ctx, void *context,
+				    size_t context_size, u64 id, u64 flags)
+{
+	return -EOPNOTSUPP;
+}
 #endif	/* CONFIG_SECURITY */
 
 #if defined(CONFIG_SECURITY) && defined(CONFIG_WATCH_QUEUE)
diff --git a/security/lsm_syscalls.c b/security/lsm_syscalls.c
index 329aaca5efc0..5d391b1f7e69 100644
--- a/security/lsm_syscalls.c
+++ b/security/lsm_syscalls.c
@@ -17,6 +17,30 @@
 #include <linux/lsm_hooks.h>
 #include <uapi/linux/lsm.h>
 
+/**
+ * lsm_name_to_attr - map an LSM attribute name to its ID
+ * @name: name of the attribute
+ *
+ * Returns the LSM attribute value associated with @name, or 0 if
+ * there is no mapping.
+ */
+u64 lsm_name_to_attr(const char *name)
+{
+	if (!strcmp(name, "current"))
+		return LSM_ATTR_CURRENT;
+	if (!strcmp(name, "exec"))
+		return LSM_ATTR_EXEC;
+	if (!strcmp(name, "fscreate"))
+		return LSM_ATTR_FSCREATE;
+	if (!strcmp(name, "keycreate"))
+		return LSM_ATTR_KEYCREATE;
+	if (!strcmp(name, "prev"))
+		return LSM_ATTR_PREV;
+	if (!strcmp(name, "sockcreate"))
+		return LSM_ATTR_SOCKCREATE;
+	return LSM_ATTR_UNDEF;
+}
+
 /**
  * sys_lsm_set_self_attr - Set current task's security module attribute
  * @attr: which attribute to set
diff --git a/security/security.c b/security/security.c
index 9757d009113f..988483fcf153 100644
--- a/security/security.c
+++ b/security/security.c
@@ -771,6 +771,47 @@ static int lsm_superblock_alloc(struct super_block *sb)
 	return 0;
 }
 
+/**
+ * lsm_fill_user_ctx - Fill a user space lsm_ctx structure
+ * @ctx: an LSM context to be filled
+ * @context: the new context value
+ * @context_size: the size of the new context value
+ * @id: LSM id
+ * @flags: LSM defined flags
+ *
+ * Fill all of the fields in a user space lsm_ctx structure.
+ * Caller is assumed to have verified that @ctx has enough space
+ * for @context.
+ *
+ * Returns 0 on success, -EFAULT on a copyout error, -ENOMEM
+ * if memory can't be allocated.
+ */
+int lsm_fill_user_ctx(struct lsm_ctx __user *ctx, void *context,
+		      size_t context_size, u64 id, u64 flags)
+{
+	struct lsm_ctx *lctx;
+	size_t locallen = struct_size(lctx, ctx, context_size);
+	int rc = 0;
+
+	lctx = kzalloc(locallen, GFP_KERNEL);
+	if (lctx == NULL)
+		return -ENOMEM;
+
+	lctx->id = id;
+	lctx->flags = flags;
+	lctx->ctx_len = context_size;
+	lctx->len = locallen;
+
+	memcpy(lctx->ctx, context, context_size);
+
+	if (copy_to_user(ctx, lctx, locallen))
+		rc = -EFAULT;
+
+	kfree(lctx);
+
+	return rc;
+}
+
 /*
  * The default value of the LSM hook is defined in linux/lsm_hook_defs.h and
  * can be accessed with:
-- 
cgit v1.2.3


From d7cf3412a9f6c547e5ee443fa7644e08898aa3e2 Mon Sep 17 00:00:00 2001
From: Paul Moore <paul@paul-moore.com>
Date: Tue, 24 Oct 2023 14:44:00 -0400
Subject: lsm: consolidate buffer size handling into lsm_fill_user_ctx()

While we have a lsm_fill_user_ctx() helper function designed to make
life easier for LSMs which return lsm_ctx structs to userspace, we
didn't include all of the buffer length safety checks and buffer
padding adjustments in the helper.  This led to code duplication
across the different LSMs and the possibility for mistakes across the
different LSM subsystems.  In order to reduce code duplication and
decrease the chances of silly mistakes, we're consolidating all of
this code into the lsm_fill_user_ctx() helper.

The buffer padding is also modified from a fixed 8-byte alignment to
an alignment that matches the word length of the machine
(BITS_PER_LONG / 8).

Signed-off-by: Paul Moore <paul@paul-moore.com>
---
 include/linux/security.h   |  9 ++++----
 security/apparmor/lsm.c    | 15 +++----------
 security/security.c        | 55 ++++++++++++++++++++++++++--------------------
 security/selinux/hooks.c   | 44 +++++++++++++++++++------------------
 security/smack/smack_lsm.c | 23 +++++--------------
 5 files changed, 68 insertions(+), 78 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/security.h b/include/linux/security.h
index 334f75aa7289..750130a7b9dd 100644
--- a/include/linux/security.h
+++ b/include/linux/security.h
@@ -492,8 +492,8 @@ int security_inode_notifysecctx(struct inode *inode, void *ctx, u32 ctxlen);
 int security_inode_setsecctx(struct dentry *dentry, void *ctx, u32 ctxlen);
 int security_inode_getsecctx(struct inode *inode, void **ctx, u32 *ctxlen);
 int security_locked_down(enum lockdown_reason what);
-int lsm_fill_user_ctx(struct lsm_ctx __user *ctx, void *context,
-		      size_t context_size, u64 id, u64 flags);
+int lsm_fill_user_ctx(struct lsm_ctx __user *uctx, size_t *uctx_len,
+		      void *val, size_t val_len, u64 id, u64 flags);
 #else /* CONFIG_SECURITY */
 
 static inline int call_blocking_lsm_notifier(enum lsm_event event, void *data)
@@ -1424,8 +1424,9 @@ static inline int security_locked_down(enum lockdown_reason what)
 {
 	return 0;
 }
-static inline int lsm_fill_user_ctx(struct lsm_ctx __user *ctx, void *context,
-				    size_t context_size, u64 id, u64 flags)
+static inline int lsm_fill_user_ctx(struct lsm_ctx __user *uctx,
+				    size_t *uctx_len, void *val, size_t val_len,
+				    u64 id, u64 flags)
 {
 	return -EOPNOTSUPP;
 }
diff --git a/security/apparmor/lsm.c b/security/apparmor/lsm.c
index 8165f80c10ff..332198e0a017 100644
--- a/security/apparmor/lsm.c
+++ b/security/apparmor/lsm.c
@@ -782,7 +782,6 @@ static int apparmor_getselfattr(unsigned int attr, struct lsm_ctx __user *lx,
 	int error = -ENOENT;
 	struct aa_task_ctx *ctx = task_ctx(current);
 	struct aa_label *label = NULL;
-	size_t total_len = 0;
 	char *value;
 
 	switch (attr) {
@@ -804,22 +803,14 @@ static int apparmor_getselfattr(unsigned int attr, struct lsm_ctx __user *lx,
 
 	if (label) {
 		error = aa_getprocattr(label, &value, false);
-		if (error > 0) {
-			total_len = ALIGN(struct_size(lx, ctx, error), 8);
-			if (total_len > *size)
-				error = -E2BIG;
-			else if (lx)
-				error = lsm_fill_user_ctx(lx, value, error,
-							  LSM_ID_APPARMOR, 0);
-			else
-				error = 1;
-		}
+		if (error > 0)
+			error = lsm_fill_user_ctx(lx, size, value, error,
+						  LSM_ID_APPARMOR, 0);
 		kfree(value);
 	}
 
 	aa_put_label(label);
 
-	*size = total_len;
 	if (error < 0)
 		return error;
 	return 1;
diff --git a/security/security.c b/security/security.c
index 78e7ffcc9f6c..86f7a1995991 100644
--- a/security/security.c
+++ b/security/security.c
@@ -772,42 +772,49 @@ static int lsm_superblock_alloc(struct super_block *sb)
 
 /**
  * lsm_fill_user_ctx - Fill a user space lsm_ctx structure
- * @ctx: an LSM context to be filled
- * @context: the new context value
- * @context_size: the size of the new context value
+ * @uctx: a userspace LSM context to be filled
+ * @uctx_len: available uctx size (input), used uctx size (output)
+ * @val: the new LSM context value
+ * @val_len: the size of the new LSM context value
  * @id: LSM id
  * @flags: LSM defined flags
  *
- * Fill all of the fields in a user space lsm_ctx structure.
- * Caller is assumed to have verified that @ctx has enough space
- * for @context.
+ * Fill all of the fields in a userspace lsm_ctx structure.
  *
- * Returns 0 on success, -EFAULT on a copyout error, -ENOMEM
- * if memory can't be allocated.
+ * Returns 0 on success, -E2BIG if userspace buffer is not large enough,
+ * -EFAULT on a copyout error, -ENOMEM if memory can't be allocated.
  */
-int lsm_fill_user_ctx(struct lsm_ctx __user *ctx, void *context,
-		      size_t context_size, u64 id, u64 flags)
+int lsm_fill_user_ctx(struct lsm_ctx __user *uctx, size_t *uctx_len,
+		      void *val, size_t val_len,
+		      u64 id, u64 flags)
 {
-	struct lsm_ctx *lctx;
-	size_t locallen = struct_size(lctx, ctx, context_size);
+	struct lsm_ctx *nctx = NULL;
+	size_t nctx_len;
 	int rc = 0;
 
-	lctx = kzalloc(locallen, GFP_KERNEL);
-	if (lctx == NULL)
-		return -ENOMEM;
-
-	lctx->id = id;
-	lctx->flags = flags;
-	lctx->ctx_len = context_size;
-	lctx->len = locallen;
+	nctx_len = ALIGN(struct_size(nctx, ctx, val_len), BITS_PER_LONG / 8);
+	if (nctx_len > *uctx_len) {
+		rc = -E2BIG;
+		goto out;
+	}
 
-	memcpy(lctx->ctx, context, context_size);
+	nctx = kzalloc(nctx_len, GFP_KERNEL);
+	if (nctx == NULL) {
+		rc = -ENOMEM;
+		goto out;
+	}
+	nctx->id = id;
+	nctx->flags = flags;
+	nctx->len = nctx_len;
+	nctx->ctx_len = val_len;
+	memcpy(nctx->ctx, val, val_len);
 
-	if (copy_to_user(ctx, lctx, locallen))
+	if (copy_to_user(uctx, nctx, nctx_len))
 		rc = -EFAULT;
 
-	kfree(lctx);
-
+out:
+	kfree(nctx);
+	*uctx_len = nctx_len;
 	return rc;
 }
 
diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c
index b6c7930a3ab2..942f2b8c4ebb 100644
--- a/security/selinux/hooks.c
+++ b/security/selinux/hooks.c
@@ -6486,30 +6486,32 @@ abort_change:
 	return error;
 }
 
+/**
+ * selinux_getselfattr - Get SELinux current task attributes
+ * @attr: the requested attribute
+ * @ctx: buffer to receive the result
+ * @size: buffer size (input), buffer size used (output)
+ * @flags: unused
+ *
+ * Fill the passed user space @ctx with the details of the requested
+ * attribute.
+ *
+ * Returns the number of attributes on success, an error code otherwise.
+ * There will only ever be one attribute.
+ */
 static int selinux_getselfattr(unsigned int attr, struct lsm_ctx __user *ctx,
 			       size_t *size, u32 flags)
 {
-	char *value;
-	size_t total_len;
-	int len;
-	int rc = 0;
-
-	len = selinux_lsm_getattr(attr, current, &value);
-	if (len < 0)
-		return len;
-
-	total_len = ALIGN(struct_size(ctx, ctx, len), 8);
-
-	if (total_len > *size)
-		rc = -E2BIG;
-	else if (ctx)
-		rc = lsm_fill_user_ctx(ctx, value, len, LSM_ID_SELINUX, 0);
-
-	kfree(value);
-	*size = total_len;
-	if (rc < 0)
-		return rc;
-	return 1;
+	int rc;
+	char *val;
+	int val_len;
+
+	val_len = selinux_lsm_getattr(attr, current, &val);
+	if (val_len < 0)
+		return val_len;
+	rc = lsm_fill_user_ctx(ctx, size, val, val_len, LSM_ID_SELINUX, 0);
+	kfree(val);
+	return (!rc ? 1 : rc);
 }
 
 static int selinux_setselfattr(unsigned int attr, struct lsm_ctx *ctx,
diff --git a/security/smack/smack_lsm.c b/security/smack/smack_lsm.c
index 12160d060cc1..99664c8cf867 100644
--- a/security/smack/smack_lsm.c
+++ b/security/smack/smack_lsm.c
@@ -3642,28 +3642,17 @@ static void smack_d_instantiate(struct dentry *opt_dentry, struct inode *inode)
 static int smack_getselfattr(unsigned int attr, struct lsm_ctx __user *ctx,
 			     size_t *size, u32 flags)
 {
-	struct smack_known *skp = smk_of_current();
-	int total;
-	int slen;
 	int rc;
+	struct smack_known *skp;
 
 	if (attr != LSM_ATTR_CURRENT)
 		return -EOPNOTSUPP;
 
-	slen = strlen(skp->smk_known) + 1;
-	total = ALIGN(slen + sizeof(*ctx), 8);
-	if (total > *size)
-		rc = -E2BIG;
-	else if (ctx)
-		rc = lsm_fill_user_ctx(ctx, skp->smk_known, slen, LSM_ID_SMACK,
-				       0);
-	else
-		rc = 1;
-
-	*size = total;
-	if (rc >= 0)
-		return 1;
-	return rc;
+	skp = smk_of_current();
+	rc = lsm_fill_user_ctx(ctx, size,
+			       skp->smk_known, strlen(skp->smk_known) + 1,
+			       LSM_ID_SMACK, 0);
+	return (!rc ? 1 : rc);
 }
 
 /**
-- 
cgit v1.2.3


From 8569992d64b8f750e34b7858eac5d7daaf0f80fd Mon Sep 17 00:00:00 2001
From: Chao Peng <chao.p.peng@linux.intel.com>
Date: Fri, 27 Oct 2023 11:21:45 -0700
Subject: KVM: Use gfn instead of hva for mmu_notifier_retry

Currently in mmu_notifier invalidate path, hva range is recorded and then
checked against by mmu_invalidate_retry_hva() in the page fault handling
path. However, for the soon-to-be-introduced private memory, a page fault
may not have a hva associated, checking gfn(gpa) makes more sense.

For existing hva based shared memory, gfn is expected to also work. The
only downside is when aliasing multiple gfns to a single hva, the
current algorithm of checking multiple ranges could result in a much
larger range being rejected. Such aliasing should be uncommon, so the
impact is expected small.

Suggested-by: Sean Christopherson <seanjc@google.com>
Cc: Xu Yilun <yilun.xu@intel.com>
Signed-off-by: Chao Peng <chao.p.peng@linux.intel.com>
Reviewed-by: Fuad Tabba <tabba@google.com>
Tested-by: Fuad Tabba <tabba@google.com>
[sean: convert vmx_set_apic_access_page_addr() to gfn-based API]
Signed-off-by: Sean Christopherson <seanjc@google.com>
Reviewed-by: Paolo Bonzini <pbonzini@redhat.com>
Reviewed-by: Xu Yilun <yilun.xu@linux.intel.com>
Message-Id: <20231027182217.3615211-4-seanjc@google.com>
Reviewed-by: Kai Huang <kai.huang@intel.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/kvm/mmu/mmu.c   | 10 ++++++----
 arch/x86/kvm/vmx/vmx.c   | 11 +++++------
 include/linux/kvm_host.h | 33 +++++++++++++++++++++------------
 virt/kvm/kvm_main.c      | 43 ++++++++++++++++++++++++++++++++++---------
 4 files changed, 66 insertions(+), 31 deletions(-)

(limited to 'include/linux')

diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index b0f01d605617..b2d916f786ca 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -3056,7 +3056,7 @@ static void direct_pte_prefetch(struct kvm_vcpu *vcpu, u64 *sptep)
  *
  * There are several ways to safely use this helper:
  *
- * - Check mmu_invalidate_retry_hva() after grabbing the mapping level, before
+ * - Check mmu_invalidate_retry_gfn() after grabbing the mapping level, before
  *   consuming it.  In this case, mmu_lock doesn't need to be held during the
  *   lookup, but it does need to be held while checking the MMU notifier.
  *
@@ -4366,7 +4366,7 @@ static bool is_page_fault_stale(struct kvm_vcpu *vcpu,
 		return true;
 
 	return fault->slot &&
-	       mmu_invalidate_retry_hva(vcpu->kvm, fault->mmu_seq, fault->hva);
+	       mmu_invalidate_retry_gfn(vcpu->kvm, fault->mmu_seq, fault->gfn);
 }
 
 static int direct_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
@@ -6260,7 +6260,9 @@ void kvm_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end)
 
 	write_lock(&kvm->mmu_lock);
 
-	kvm_mmu_invalidate_begin(kvm, 0, -1ul);
+	kvm_mmu_invalidate_begin(kvm);
+
+	kvm_mmu_invalidate_range_add(kvm, gfn_start, gfn_end);
 
 	flush = kvm_rmap_zap_gfn_range(kvm, gfn_start, gfn_end);
 
@@ -6270,7 +6272,7 @@ void kvm_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end)
 	if (flush)
 		kvm_flush_remote_tlbs_range(kvm, gfn_start, gfn_end - gfn_start);
 
-	kvm_mmu_invalidate_end(kvm, 0, -1ul);
+	kvm_mmu_invalidate_end(kvm);
 
 	write_unlock(&kvm->mmu_lock);
 }
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index be20a60047b1..40e3780d73ae 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -6757,10 +6757,10 @@ static void vmx_set_apic_access_page_addr(struct kvm_vcpu *vcpu)
 		return;
 
 	/*
-	 * Grab the memslot so that the hva lookup for the mmu_notifier retry
-	 * is guaranteed to use the same memslot as the pfn lookup, i.e. rely
-	 * on the pfn lookup's validation of the memslot to ensure a valid hva
-	 * is used for the retry check.
+	 * Explicitly grab the memslot using KVM's internal slot ID to ensure
+	 * KVM doesn't unintentionally grab a userspace memslot.  It _should_
+	 * be impossible for userspace to create a memslot for the APIC when
+	 * APICv is enabled, but paranoia won't hurt in this case.
 	 */
 	slot = id_to_memslot(slots, APIC_ACCESS_PAGE_PRIVATE_MEMSLOT);
 	if (!slot || slot->flags & KVM_MEMSLOT_INVALID)
@@ -6785,8 +6785,7 @@ static void vmx_set_apic_access_page_addr(struct kvm_vcpu *vcpu)
 		return;
 
 	read_lock(&vcpu->kvm->mmu_lock);
-	if (mmu_invalidate_retry_hva(kvm, mmu_seq,
-				     gfn_to_hva_memslot(slot, gfn))) {
+	if (mmu_invalidate_retry_gfn(kvm, mmu_seq, gfn)) {
 		kvm_make_request(KVM_REQ_APIC_PAGE_RELOAD, vcpu);
 		read_unlock(&vcpu->kvm->mmu_lock);
 		goto out;
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index fb6c6109fdca..11d091688346 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -787,8 +787,8 @@ struct kvm {
 	struct mmu_notifier mmu_notifier;
 	unsigned long mmu_invalidate_seq;
 	long mmu_invalidate_in_progress;
-	unsigned long mmu_invalidate_range_start;
-	unsigned long mmu_invalidate_range_end;
+	gfn_t mmu_invalidate_range_start;
+	gfn_t mmu_invalidate_range_end;
 #endif
 	struct list_head devices;
 	u64 manual_dirty_log_protect;
@@ -1392,10 +1392,9 @@ void kvm_mmu_free_memory_cache(struct kvm_mmu_memory_cache *mc);
 void *kvm_mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc);
 #endif
 
-void kvm_mmu_invalidate_begin(struct kvm *kvm, unsigned long start,
-			      unsigned long end);
-void kvm_mmu_invalidate_end(struct kvm *kvm, unsigned long start,
-			    unsigned long end);
+void kvm_mmu_invalidate_begin(struct kvm *kvm);
+void kvm_mmu_invalidate_range_add(struct kvm *kvm, gfn_t start, gfn_t end);
+void kvm_mmu_invalidate_end(struct kvm *kvm);
 
 long kvm_arch_dev_ioctl(struct file *filp,
 			unsigned int ioctl, unsigned long arg);
@@ -1970,9 +1969,9 @@ static inline int mmu_invalidate_retry(struct kvm *kvm, unsigned long mmu_seq)
 	return 0;
 }
 
-static inline int mmu_invalidate_retry_hva(struct kvm *kvm,
+static inline int mmu_invalidate_retry_gfn(struct kvm *kvm,
 					   unsigned long mmu_seq,
-					   unsigned long hva)
+					   gfn_t gfn)
 {
 	lockdep_assert_held(&kvm->mmu_lock);
 	/*
@@ -1981,10 +1980,20 @@ static inline int mmu_invalidate_retry_hva(struct kvm *kvm,
 	 * that might be being invalidated. Note that it may include some false
 	 * positives, due to shortcuts when handing concurrent invalidations.
 	 */
-	if (unlikely(kvm->mmu_invalidate_in_progress) &&
-	    hva >= kvm->mmu_invalidate_range_start &&
-	    hva < kvm->mmu_invalidate_range_end)
-		return 1;
+	if (unlikely(kvm->mmu_invalidate_in_progress)) {
+		/*
+		 * Dropping mmu_lock after bumping mmu_invalidate_in_progress
+		 * but before updating the range is a KVM bug.
+		 */
+		if (WARN_ON_ONCE(kvm->mmu_invalidate_range_start == INVALID_GPA ||
+				 kvm->mmu_invalidate_range_end == INVALID_GPA))
+			return 1;
+
+		if (gfn >= kvm->mmu_invalidate_range_start &&
+		    gfn < kvm->mmu_invalidate_range_end)
+			return 1;
+	}
+
 	if (kvm->mmu_invalidate_seq != mmu_seq)
 		return 1;
 	return 0;
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 5a97e6c7d9c2..9cc57b23ec81 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -543,9 +543,7 @@ static inline struct kvm *mmu_notifier_to_kvm(struct mmu_notifier *mn)
 
 typedef bool (*gfn_handler_t)(struct kvm *kvm, struct kvm_gfn_range *range);
 
-typedef void (*on_lock_fn_t)(struct kvm *kvm, unsigned long start,
-			     unsigned long end);
-
+typedef void (*on_lock_fn_t)(struct kvm *kvm);
 typedef void (*on_unlock_fn_t)(struct kvm *kvm);
 
 struct kvm_mmu_notifier_range {
@@ -637,7 +635,8 @@ static __always_inline int __kvm_handle_hva_range(struct kvm *kvm,
 				locked = true;
 				KVM_MMU_LOCK(kvm);
 				if (!IS_KVM_NULL_FN(range->on_lock))
-					range->on_lock(kvm, range->start, range->end);
+					range->on_lock(kvm);
+
 				if (IS_KVM_NULL_FN(range->handler))
 					break;
 			}
@@ -742,16 +741,29 @@ static void kvm_mmu_notifier_change_pte(struct mmu_notifier *mn,
 	kvm_handle_hva_range(mn, address, address + 1, arg, kvm_change_spte_gfn);
 }
 
-void kvm_mmu_invalidate_begin(struct kvm *kvm, unsigned long start,
-			      unsigned long end)
+void kvm_mmu_invalidate_begin(struct kvm *kvm)
 {
+	lockdep_assert_held_write(&kvm->mmu_lock);
 	/*
 	 * The count increase must become visible at unlock time as no
 	 * spte can be established without taking the mmu_lock and
 	 * count is also read inside the mmu_lock critical section.
 	 */
 	kvm->mmu_invalidate_in_progress++;
+
 	if (likely(kvm->mmu_invalidate_in_progress == 1)) {
+		kvm->mmu_invalidate_range_start = INVALID_GPA;
+		kvm->mmu_invalidate_range_end = INVALID_GPA;
+	}
+}
+
+void kvm_mmu_invalidate_range_add(struct kvm *kvm, gfn_t start, gfn_t end)
+{
+	lockdep_assert_held_write(&kvm->mmu_lock);
+
+	WARN_ON_ONCE(!kvm->mmu_invalidate_in_progress);
+
+	if (likely(kvm->mmu_invalidate_range_start == INVALID_GPA)) {
 		kvm->mmu_invalidate_range_start = start;
 		kvm->mmu_invalidate_range_end = end;
 	} else {
@@ -771,6 +783,12 @@ void kvm_mmu_invalidate_begin(struct kvm *kvm, unsigned long start,
 	}
 }
 
+static bool kvm_mmu_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range)
+{
+	kvm_mmu_invalidate_range_add(kvm, range->start, range->end);
+	return kvm_unmap_gfn_range(kvm, range);
+}
+
 static int kvm_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn,
 					const struct mmu_notifier_range *range)
 {
@@ -778,7 +796,7 @@ static int kvm_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn,
 	const struct kvm_mmu_notifier_range hva_range = {
 		.start		= range->start,
 		.end		= range->end,
-		.handler	= kvm_unmap_gfn_range,
+		.handler	= kvm_mmu_unmap_gfn_range,
 		.on_lock	= kvm_mmu_invalidate_begin,
 		.on_unlock	= kvm_arch_guest_memory_reclaimed,
 		.flush_on_ret	= true,
@@ -817,9 +835,10 @@ static int kvm_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn,
 	return 0;
 }
 
-void kvm_mmu_invalidate_end(struct kvm *kvm, unsigned long start,
-			    unsigned long end)
+void kvm_mmu_invalidate_end(struct kvm *kvm)
 {
+	lockdep_assert_held_write(&kvm->mmu_lock);
+
 	/*
 	 * This sequence increase will notify the kvm page fault that
 	 * the page that is going to be mapped in the spte could have
@@ -834,6 +853,12 @@ void kvm_mmu_invalidate_end(struct kvm *kvm, unsigned long start,
 	 */
 	kvm->mmu_invalidate_in_progress--;
 	KVM_BUG_ON(kvm->mmu_invalidate_in_progress < 0, kvm);
+
+	/*
+	 * Assert that at least one range was added between start() and end().
+	 * Not adding a range isn't fatal, but it is a KVM bug.
+	 */
+	WARN_ON_ONCE(kvm->mmu_invalidate_range_start == INVALID_GPA);
 }
 
 static void kvm_mmu_notifier_invalidate_range_end(struct mmu_notifier *mn,
-- 
cgit v1.2.3


From f128cf8cfbecccf95e891ae90d9c917df5117c7a Mon Sep 17 00:00:00 2001
From: Sean Christopherson <seanjc@google.com>
Date: Fri, 27 Oct 2023 11:21:49 -0700
Subject: KVM: Convert KVM_ARCH_WANT_MMU_NOTIFIER to
 CONFIG_KVM_GENERIC_MMU_NOTIFIER

Convert KVM_ARCH_WANT_MMU_NOTIFIER into a Kconfig and select it where
appropriate to effectively maintain existing behavior.  Using a proper
Kconfig will simplify building more functionality on top of KVM's
mmu_notifier infrastructure.

Add a forward declaration of kvm_gfn_range to kvm_types.h so that
including arch/powerpc/include/asm/kvm_ppc.h's with CONFIG_KVM=n doesn't
generate warnings due to kvm_gfn_range being undeclared.  PPC defines
hooks for PR vs. HV without guarding them via #ifdeffery, e.g.

  bool (*unmap_gfn_range)(struct kvm *kvm, struct kvm_gfn_range *range);
  bool (*age_gfn)(struct kvm *kvm, struct kvm_gfn_range *range);
  bool (*test_age_gfn)(struct kvm *kvm, struct kvm_gfn_range *range);
  bool (*set_spte_gfn)(struct kvm *kvm, struct kvm_gfn_range *range);

Alternatively, PPC could forward declare kvm_gfn_range, but there's no
good reason not to define it in common KVM.

Acked-by: Anup Patel <anup@brainfault.org>
Signed-off-by: Sean Christopherson <seanjc@google.com>
Reviewed-by: Paolo Bonzini <pbonzini@redhat.com>
Reviewed-by: Fuad Tabba <tabba@google.com>
Tested-by: Fuad Tabba <tabba@google.com>
Message-Id: <20231027182217.3615211-8-seanjc@google.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/arm64/include/asm/kvm_host.h     |  2 --
 arch/arm64/kvm/Kconfig                |  2 +-
 arch/loongarch/include/asm/kvm_host.h |  1 -
 arch/loongarch/kvm/Kconfig            |  2 +-
 arch/mips/include/asm/kvm_host.h      |  2 --
 arch/mips/kvm/Kconfig                 |  2 +-
 arch/powerpc/include/asm/kvm_host.h   |  2 --
 arch/powerpc/kvm/Kconfig              |  8 ++++----
 arch/powerpc/kvm/powerpc.c            |  4 +---
 arch/riscv/include/asm/kvm_host.h     |  2 --
 arch/riscv/kvm/Kconfig                |  2 +-
 arch/x86/include/asm/kvm_host.h       |  2 --
 arch/x86/kvm/Kconfig                  |  2 +-
 include/linux/kvm_host.h              |  6 +++---
 include/linux/kvm_types.h             |  1 +
 virt/kvm/Kconfig                      |  4 ++++
 virt/kvm/kvm_main.c                   | 10 +++++-----
 17 files changed, 23 insertions(+), 31 deletions(-)

(limited to 'include/linux')

diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h
index 5653d3553e3e..9029fe09f3f6 100644
--- a/arch/arm64/include/asm/kvm_host.h
+++ b/arch/arm64/include/asm/kvm_host.h
@@ -954,8 +954,6 @@ int __kvm_arm_vcpu_get_events(struct kvm_vcpu *vcpu,
 int __kvm_arm_vcpu_set_events(struct kvm_vcpu *vcpu,
 			      struct kvm_vcpu_events *events);
 
-#define KVM_ARCH_WANT_MMU_NOTIFIER
-
 void kvm_arm_halt_guest(struct kvm *kvm);
 void kvm_arm_resume_guest(struct kvm *kvm);
 
diff --git a/arch/arm64/kvm/Kconfig b/arch/arm64/kvm/Kconfig
index 83c1e09be42e..1a777715199f 100644
--- a/arch/arm64/kvm/Kconfig
+++ b/arch/arm64/kvm/Kconfig
@@ -22,7 +22,7 @@ menuconfig KVM
 	bool "Kernel-based Virtual Machine (KVM) support"
 	depends on HAVE_KVM
 	select KVM_GENERIC_HARDWARE_ENABLING
-	select MMU_NOTIFIER
+	select KVM_GENERIC_MMU_NOTIFIER
 	select PREEMPT_NOTIFIERS
 	select HAVE_KVM_CPU_RELAX_INTERCEPT
 	select KVM_MMIO
diff --git a/arch/loongarch/include/asm/kvm_host.h b/arch/loongarch/include/asm/kvm_host.h
index 11328700d4fa..b108301c2e5a 100644
--- a/arch/loongarch/include/asm/kvm_host.h
+++ b/arch/loongarch/include/asm/kvm_host.h
@@ -183,7 +183,6 @@ void kvm_flush_tlb_all(void);
 void kvm_flush_tlb_gpa(struct kvm_vcpu *vcpu, unsigned long gpa);
 int kvm_handle_mm_fault(struct kvm_vcpu *vcpu, unsigned long badv, bool write);
 
-#define KVM_ARCH_WANT_MMU_NOTIFIER
 void kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte);
 int kvm_unmap_hva_range(struct kvm *kvm, unsigned long start, unsigned long end, bool blockable);
 int kvm_age_hva(struct kvm *kvm, unsigned long start, unsigned long end);
diff --git a/arch/loongarch/kvm/Kconfig b/arch/loongarch/kvm/Kconfig
index fda425babfb2..f22bae89b07d 100644
--- a/arch/loongarch/kvm/Kconfig
+++ b/arch/loongarch/kvm/Kconfig
@@ -26,9 +26,9 @@ config KVM
 	select HAVE_KVM_VCPU_ASYNC_IOCTL
 	select KVM_GENERIC_DIRTYLOG_READ_PROTECT
 	select KVM_GENERIC_HARDWARE_ENABLING
+	select KVM_GENERIC_MMU_NOTIFIER
 	select KVM_MMIO
 	select KVM_XFER_TO_GUEST_WORK
-	select MMU_NOTIFIER
 	select PREEMPT_NOTIFIERS
 	help
 	  Support hosting virtualized guest machines using
diff --git a/arch/mips/include/asm/kvm_host.h b/arch/mips/include/asm/kvm_host.h
index 54a85f1d4f2c..179f320cc231 100644
--- a/arch/mips/include/asm/kvm_host.h
+++ b/arch/mips/include/asm/kvm_host.h
@@ -810,8 +810,6 @@ int kvm_mips_mkclean_gpa_pt(struct kvm *kvm, gfn_t start_gfn, gfn_t end_gfn);
 pgd_t *kvm_pgd_alloc(void);
 void kvm_mmu_free_memory_caches(struct kvm_vcpu *vcpu);
 
-#define KVM_ARCH_WANT_MMU_NOTIFIER
-
 /* Emulation */
 enum emulation_result update_pc(struct kvm_vcpu *vcpu, u32 cause);
 int kvm_get_badinstr(u32 *opc, struct kvm_vcpu *vcpu, u32 *out);
diff --git a/arch/mips/kvm/Kconfig b/arch/mips/kvm/Kconfig
index a8cdba75f98d..c04987d2ed2e 100644
--- a/arch/mips/kvm/Kconfig
+++ b/arch/mips/kvm/Kconfig
@@ -25,7 +25,7 @@ config KVM
 	select HAVE_KVM_EVENTFD
 	select HAVE_KVM_VCPU_ASYNC_IOCTL
 	select KVM_MMIO
-	select MMU_NOTIFIER
+	select KVM_GENERIC_MMU_NOTIFIER
 	select INTERVAL_TREE
 	select KVM_GENERIC_HARDWARE_ENABLING
 	help
diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h
index 14ee0dece853..4b5c3f2acf78 100644
--- a/arch/powerpc/include/asm/kvm_host.h
+++ b/arch/powerpc/include/asm/kvm_host.h
@@ -62,8 +62,6 @@
 
 #include <linux/mmu_notifier.h>
 
-#define KVM_ARCH_WANT_MMU_NOTIFIER
-
 #define HPTEG_CACHE_NUM			(1 << 15)
 #define HPTEG_HASH_BITS_PTE		13
 #define HPTEG_HASH_BITS_PTE_LONG	12
diff --git a/arch/powerpc/kvm/Kconfig b/arch/powerpc/kvm/Kconfig
index 902611954200..b33358ee6424 100644
--- a/arch/powerpc/kvm/Kconfig
+++ b/arch/powerpc/kvm/Kconfig
@@ -42,7 +42,7 @@ config KVM_BOOK3S_64_HANDLER
 config KVM_BOOK3S_PR_POSSIBLE
 	bool
 	select KVM_MMIO
-	select MMU_NOTIFIER
+	select KVM_GENERIC_MMU_NOTIFIER
 
 config KVM_BOOK3S_HV_POSSIBLE
 	bool
@@ -85,7 +85,7 @@ config KVM_BOOK3S_64_HV
 	tristate "KVM for POWER7 and later using hypervisor mode in host"
 	depends on KVM_BOOK3S_64 && PPC_POWERNV
 	select KVM_BOOK3S_HV_POSSIBLE
-	select MMU_NOTIFIER
+	select KVM_GENERIC_MMU_NOTIFIER
 	select CMA
 	help
 	  Support running unmodified book3s_64 guest kernels in
@@ -194,7 +194,7 @@ config KVM_E500V2
 	depends on !CONTEXT_TRACKING_USER
 	select KVM
 	select KVM_MMIO
-	select MMU_NOTIFIER
+	select KVM_GENERIC_MMU_NOTIFIER
 	help
 	  Support running unmodified E500 guest kernels in virtual machines on
 	  E500v2 host processors.
@@ -211,7 +211,7 @@ config KVM_E500MC
 	select KVM
 	select KVM_MMIO
 	select KVM_BOOKE_HV
-	select MMU_NOTIFIER
+	select KVM_GENERIC_MMU_NOTIFIER
 	help
 	  Support running unmodified E500MC/E5500/E6500 guest kernels in
 	  virtual machines on E500MC/E5500/E6500 host processors.
diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c
index 8d3ec483bc2b..aac75c98a956 100644
--- a/arch/powerpc/kvm/powerpc.c
+++ b/arch/powerpc/kvm/powerpc.c
@@ -632,9 +632,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
 		break;
 #endif
 	case KVM_CAP_SYNC_MMU:
-#if !defined(CONFIG_MMU_NOTIFIER) || !defined(KVM_ARCH_WANT_MMU_NOTIFIER)
-		BUILD_BUG();
-#endif
+		BUILD_BUG_ON(!IS_ENABLED(CONFIG_KVM_GENERIC_MMU_NOTIFIER));
 		r = 1;
 		break;
 #ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
diff --git a/arch/riscv/include/asm/kvm_host.h b/arch/riscv/include/asm/kvm_host.h
index 0eefd9c991ae..6964dd235e97 100644
--- a/arch/riscv/include/asm/kvm_host.h
+++ b/arch/riscv/include/asm/kvm_host.h
@@ -267,8 +267,6 @@ struct kvm_vcpu_arch {
 static inline void kvm_arch_sync_events(struct kvm *kvm) {}
 static inline void kvm_arch_sched_in(struct kvm_vcpu *vcpu, int cpu) {}
 
-#define KVM_ARCH_WANT_MMU_NOTIFIER
-
 #define KVM_RISCV_GSTAGE_TLB_MIN_ORDER		12
 
 void kvm_riscv_local_hfence_gvma_vmid_gpa(unsigned long vmid,
diff --git a/arch/riscv/kvm/Kconfig b/arch/riscv/kvm/Kconfig
index dfc237d7875b..ae2e05f050ec 100644
--- a/arch/riscv/kvm/Kconfig
+++ b/arch/riscv/kvm/Kconfig
@@ -30,7 +30,7 @@ config KVM
 	select KVM_GENERIC_HARDWARE_ENABLING
 	select KVM_MMIO
 	select KVM_XFER_TO_GUEST_WORK
-	select MMU_NOTIFIER
+	select KVM_GENERIC_MMU_NOTIFIER
 	select PREEMPT_NOTIFIERS
 	help
 	  Support hosting virtualized guest machines.
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index d7036982332e..6f559fb75e6d 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -2141,8 +2141,6 @@ enum {
 # define kvm_memslots_for_spte_role(kvm, role) __kvm_memslots(kvm, 0)
 #endif
 
-#define KVM_ARCH_WANT_MMU_NOTIFIER
-
 int kvm_cpu_has_injectable_intr(struct kvm_vcpu *v);
 int kvm_cpu_has_interrupt(struct kvm_vcpu *vcpu);
 int kvm_cpu_has_extint(struct kvm_vcpu *v);
diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig
index 950c12868d30..e61383674c75 100644
--- a/arch/x86/kvm/Kconfig
+++ b/arch/x86/kvm/Kconfig
@@ -24,7 +24,7 @@ config KVM
 	depends on HIGH_RES_TIMERS
 	depends on X86_LOCAL_APIC
 	select PREEMPT_NOTIFIERS
-	select MMU_NOTIFIER
+	select KVM_GENERIC_MMU_NOTIFIER
 	select HAVE_KVM_IRQCHIP
 	select HAVE_KVM_PFNCACHE
 	select HAVE_KVM_IRQFD
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 11d091688346..5faba69403ac 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -253,7 +253,7 @@ bool kvm_setup_async_pf(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
 int kvm_async_pf_wakeup_all(struct kvm_vcpu *vcpu);
 #endif
 
-#ifdef KVM_ARCH_WANT_MMU_NOTIFIER
+#ifdef CONFIG_KVM_GENERIC_MMU_NOTIFIER
 union kvm_mmu_notifier_arg {
 	pte_t pte;
 };
@@ -783,7 +783,7 @@ struct kvm {
 	struct hlist_head irq_ack_notifier_list;
 #endif
 
-#if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER)
+#ifdef CONFIG_KVM_GENERIC_MMU_NOTIFIER
 	struct mmu_notifier mmu_notifier;
 	unsigned long mmu_invalidate_seq;
 	long mmu_invalidate_in_progress;
@@ -1946,7 +1946,7 @@ extern const struct _kvm_stats_desc kvm_vm_stats_desc[];
 extern const struct kvm_stats_header kvm_vcpu_stats_header;
 extern const struct _kvm_stats_desc kvm_vcpu_stats_desc[];
 
-#if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER)
+#ifdef CONFIG_KVM_GENERIC_MMU_NOTIFIER
 static inline int mmu_invalidate_retry(struct kvm *kvm, unsigned long mmu_seq)
 {
 	if (unlikely(kvm->mmu_invalidate_in_progress))
diff --git a/include/linux/kvm_types.h b/include/linux/kvm_types.h
index 6f4737d5046a..9d1f7835d8c1 100644
--- a/include/linux/kvm_types.h
+++ b/include/linux/kvm_types.h
@@ -6,6 +6,7 @@
 struct kvm;
 struct kvm_async_pf;
 struct kvm_device_ops;
+struct kvm_gfn_range;
 struct kvm_interrupt;
 struct kvm_irq_routing_table;
 struct kvm_memory_slot;
diff --git a/virt/kvm/Kconfig b/virt/kvm/Kconfig
index 484d0873061c..ecae2914c97e 100644
--- a/virt/kvm/Kconfig
+++ b/virt/kvm/Kconfig
@@ -92,3 +92,7 @@ config HAVE_KVM_PM_NOTIFIER
 
 config KVM_GENERIC_HARDWARE_ENABLING
        bool
+
+config KVM_GENERIC_MMU_NOTIFIER
+       select MMU_NOTIFIER
+       bool
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 5422ce20dcba..dc81279ea385 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -535,7 +535,7 @@ void kvm_destroy_vcpus(struct kvm *kvm)
 }
 EXPORT_SYMBOL_GPL(kvm_destroy_vcpus);
 
-#if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER)
+#ifdef CONFIG_KVM_GENERIC_MMU_NOTIFIER
 static inline struct kvm *mmu_notifier_to_kvm(struct mmu_notifier *mn)
 {
 	return container_of(mn, struct kvm, mmu_notifier);
@@ -962,14 +962,14 @@ static int kvm_init_mmu_notifier(struct kvm *kvm)
 	return mmu_notifier_register(&kvm->mmu_notifier, current->mm);
 }
 
-#else  /* !(CONFIG_MMU_NOTIFIER && KVM_ARCH_WANT_MMU_NOTIFIER) */
+#else  /* !CONFIG_KVM_GENERIC_MMU_NOTIFIER */
 
 static int kvm_init_mmu_notifier(struct kvm *kvm)
 {
 	return 0;
 }
 
-#endif /* CONFIG_MMU_NOTIFIER && KVM_ARCH_WANT_MMU_NOTIFIER */
+#endif /* CONFIG_KVM_GENERIC_MMU_NOTIFIER */
 
 #ifdef CONFIG_HAVE_KVM_PM_NOTIFIER
 static int kvm_pm_notifier_call(struct notifier_block *bl,
@@ -1289,7 +1289,7 @@ out_err:
 out_err_no_debugfs:
 	kvm_coalesced_mmio_free(kvm);
 out_no_coalesced_mmio:
-#if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER)
+#ifdef CONFIG_KVM_GENERIC_MMU_NOTIFIER
 	if (kvm->mmu_notifier.ops)
 		mmu_notifier_unregister(&kvm->mmu_notifier, current->mm);
 #endif
@@ -1349,7 +1349,7 @@ static void kvm_destroy_vm(struct kvm *kvm)
 		kvm->buses[i] = NULL;
 	}
 	kvm_coalesced_mmio_free(kvm);
-#if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER)
+#ifdef CONFIG_KVM_GENERIC_MMU_NOTIFIER
 	mmu_notifier_unregister(&kvm->mmu_notifier, kvm->mm);
 	/*
 	 * At this point, pending calls to invalidate_range_start()
-- 
cgit v1.2.3


From bb58b90b1a8f753b582055adaf448214a8e22c31 Mon Sep 17 00:00:00 2001
From: Sean Christopherson <seanjc@google.com>
Date: Fri, 27 Oct 2023 11:21:50 -0700
Subject: KVM: Introduce KVM_SET_USER_MEMORY_REGION2

Introduce a "version 2" of KVM_SET_USER_MEMORY_REGION so that additional
information can be supplied without setting userspace up to fail.  The
padding in the new kvm_userspace_memory_region2 structure will be used to
pass a file descriptor in addition to the userspace_addr, i.e. allow
userspace to point at a file descriptor and map memory into a guest that
is NOT mapped into host userspace.

Alternatively, KVM could simply add "struct kvm_userspace_memory_region2"
without a new ioctl(), but as Paolo pointed out, adding a new ioctl()
makes detection of bad flags a bit more robust, e.g. if the new fd field
is guarded only by a flag and not a new ioctl(), then a userspace bug
(setting a "bad" flag) would generate out-of-bounds access instead of an
-EINVAL error.

Cc: Jarkko Sakkinen <jarkko@kernel.org>
Reviewed-by: Paolo Bonzini <pbonzini@redhat.com>
Reviewed-by: Xiaoyao Li <xiaoyao.li@intel.com>
Signed-off-by: Sean Christopherson <seanjc@google.com>
Reviewed-by: Fuad Tabba <tabba@google.com>
Tested-by: Fuad Tabba <tabba@google.com>
Message-Id: <20231027182217.3615211-9-seanjc@google.com>
Acked-by: Kai Huang <kai.huang@intel.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 Documentation/virt/kvm/api.rst | 22 ++++++++++++++++
 arch/x86/kvm/x86.c             |  2 +-
 include/linux/kvm_host.h       |  4 +--
 include/uapi/linux/kvm.h       | 13 ++++++++++
 virt/kvm/kvm_main.c            | 57 ++++++++++++++++++++++++++++++++++++------
 5 files changed, 87 insertions(+), 11 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst
index 7025b3751027..9edd9e436bab 100644
--- a/Documentation/virt/kvm/api.rst
+++ b/Documentation/virt/kvm/api.rst
@@ -6192,6 +6192,28 @@ to know what fields can be changed for the system register described by
 ``op0, op1, crn, crm, op2``. KVM rejects ID register values that describe a
 superset of the features supported by the system.
 
+4.140 KVM_SET_USER_MEMORY_REGION2
+---------------------------------
+
+:Capability: KVM_CAP_USER_MEMORY2
+:Architectures: all
+:Type: vm ioctl
+:Parameters: struct kvm_userspace_memory_region2 (in)
+:Returns: 0 on success, -1 on error
+
+::
+
+  struct kvm_userspace_memory_region2 {
+	__u32 slot;
+	__u32 flags;
+	__u64 guest_phys_addr;
+	__u64 memory_size; /* bytes */
+	__u64 userspace_addr; /* start of the userspace allocated memory */
+	__u64 pad[16];
+  };
+
+See KVM_SET_USER_MEMORY_REGION.
+
 5. The kvm_run structure
 ========================
 
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 2c924075f6f1..7b389f27dffc 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -12576,7 +12576,7 @@ void __user * __x86_set_memory_region(struct kvm *kvm, int id, gpa_t gpa,
 	}
 
 	for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) {
-		struct kvm_userspace_memory_region m;
+		struct kvm_userspace_memory_region2 m;
 
 		m.slot = id | (i << 16);
 		m.flags = 0;
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 5faba69403ac..4e741ff27af3 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -1146,9 +1146,9 @@ enum kvm_mr_change {
 };
 
 int kvm_set_memory_region(struct kvm *kvm,
-			  const struct kvm_userspace_memory_region *mem);
+			  const struct kvm_userspace_memory_region2 *mem);
 int __kvm_set_memory_region(struct kvm *kvm,
-			    const struct kvm_userspace_memory_region *mem);
+			    const struct kvm_userspace_memory_region2 *mem);
 void kvm_arch_free_memslot(struct kvm *kvm, struct kvm_memory_slot *slot);
 void kvm_arch_memslots_updated(struct kvm *kvm, u64 gen);
 int kvm_arch_prepare_memory_region(struct kvm *kvm,
diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
index 211b86de35ac..308cc70bd6ab 100644
--- a/include/uapi/linux/kvm.h
+++ b/include/uapi/linux/kvm.h
@@ -95,6 +95,16 @@ struct kvm_userspace_memory_region {
 	__u64 userspace_addr; /* start of the userspace allocated memory */
 };
 
+/* for KVM_SET_USER_MEMORY_REGION2 */
+struct kvm_userspace_memory_region2 {
+	__u32 slot;
+	__u32 flags;
+	__u64 guest_phys_addr;
+	__u64 memory_size;
+	__u64 userspace_addr;
+	__u64 pad[16];
+};
+
 /*
  * The bit 0 ~ bit 15 of kvm_userspace_memory_region::flags are visible for
  * userspace, other bits are reserved for kvm internal use which are defined
@@ -1201,6 +1211,7 @@ struct kvm_ppc_resize_hpt {
 #define KVM_CAP_ARM_EAGER_SPLIT_CHUNK_SIZE 228
 #define KVM_CAP_ARM_SUPPORTED_BLOCK_SIZES 229
 #define KVM_CAP_ARM_SUPPORTED_REG_MASK_RANGES 230
+#define KVM_CAP_USER_MEMORY2 231
 
 #ifdef KVM_CAP_IRQ_ROUTING
 
@@ -1483,6 +1494,8 @@ struct kvm_vfio_spapr_tce {
 					struct kvm_userspace_memory_region)
 #define KVM_SET_TSS_ADDR          _IO(KVMIO,   0x47)
 #define KVM_SET_IDENTITY_MAP_ADDR _IOW(KVMIO,  0x48, __u64)
+#define KVM_SET_USER_MEMORY_REGION2 _IOW(KVMIO, 0x49, \
+					 struct kvm_userspace_memory_region2)
 
 /* enable ucontrol for s390 */
 struct kvm_s390_ucas_mapping {
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index dc81279ea385..756b94ecd511 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -1580,7 +1580,15 @@ static void kvm_replace_memslot(struct kvm *kvm,
 	}
 }
 
-static int check_memory_region_flags(const struct kvm_userspace_memory_region *mem)
+/*
+ * Flags that do not access any of the extra space of struct
+ * kvm_userspace_memory_region2.  KVM_SET_USER_MEMORY_REGION_V1_FLAGS
+ * only allows these.
+ */
+#define KVM_SET_USER_MEMORY_REGION_V1_FLAGS \
+	(KVM_MEM_LOG_DIRTY_PAGES | KVM_MEM_READONLY)
+
+static int check_memory_region_flags(const struct kvm_userspace_memory_region2 *mem)
 {
 	u32 valid_flags = KVM_MEM_LOG_DIRTY_PAGES;
 
@@ -1982,7 +1990,7 @@ static bool kvm_check_memslot_overlap(struct kvm_memslots *slots, int id,
  * Must be called holding kvm->slots_lock for write.
  */
 int __kvm_set_memory_region(struct kvm *kvm,
-			    const struct kvm_userspace_memory_region *mem)
+			    const struct kvm_userspace_memory_region2 *mem)
 {
 	struct kvm_memory_slot *old, *new;
 	struct kvm_memslots *slots;
@@ -2086,7 +2094,7 @@ int __kvm_set_memory_region(struct kvm *kvm,
 EXPORT_SYMBOL_GPL(__kvm_set_memory_region);
 
 int kvm_set_memory_region(struct kvm *kvm,
-			  const struct kvm_userspace_memory_region *mem)
+			  const struct kvm_userspace_memory_region2 *mem)
 {
 	int r;
 
@@ -2098,7 +2106,7 @@ int kvm_set_memory_region(struct kvm *kvm,
 EXPORT_SYMBOL_GPL(kvm_set_memory_region);
 
 static int kvm_vm_ioctl_set_memory_region(struct kvm *kvm,
-					  struct kvm_userspace_memory_region *mem)
+					  struct kvm_userspace_memory_region2 *mem)
 {
 	if ((u16)mem->slot >= KVM_USER_MEM_SLOTS)
 		return -EINVAL;
@@ -4568,6 +4576,7 @@ static int kvm_vm_ioctl_check_extension_generic(struct kvm *kvm, long arg)
 {
 	switch (arg) {
 	case KVM_CAP_USER_MEMORY:
+	case KVM_CAP_USER_MEMORY2:
 	case KVM_CAP_DESTROY_MEMORY_REGION_WORKS:
 	case KVM_CAP_JOIN_MEMORY_REGIONS_WORKS:
 	case KVM_CAP_INTERNAL_ERROR_DATA:
@@ -4823,6 +4832,14 @@ static int kvm_vm_ioctl_get_stats_fd(struct kvm *kvm)
 	return fd;
 }
 
+#define SANITY_CHECK_MEM_REGION_FIELD(field)					\
+do {										\
+	BUILD_BUG_ON(offsetof(struct kvm_userspace_memory_region, field) !=		\
+		     offsetof(struct kvm_userspace_memory_region2, field));	\
+	BUILD_BUG_ON(sizeof_field(struct kvm_userspace_memory_region, field) !=		\
+		     sizeof_field(struct kvm_userspace_memory_region2, field));	\
+} while (0)
+
 static long kvm_vm_ioctl(struct file *filp,
 			   unsigned int ioctl, unsigned long arg)
 {
@@ -4845,15 +4862,39 @@ static long kvm_vm_ioctl(struct file *filp,
 		r = kvm_vm_ioctl_enable_cap_generic(kvm, &cap);
 		break;
 	}
+	case KVM_SET_USER_MEMORY_REGION2:
 	case KVM_SET_USER_MEMORY_REGION: {
-		struct kvm_userspace_memory_region kvm_userspace_mem;
+		struct kvm_userspace_memory_region2 mem;
+		unsigned long size;
+
+		if (ioctl == KVM_SET_USER_MEMORY_REGION) {
+			/*
+			 * Fields beyond struct kvm_userspace_memory_region shouldn't be
+			 * accessed, but avoid leaking kernel memory in case of a bug.
+			 */
+			memset(&mem, 0, sizeof(mem));
+			size = sizeof(struct kvm_userspace_memory_region);
+		} else {
+			size = sizeof(struct kvm_userspace_memory_region2);
+		}
+
+		/* Ensure the common parts of the two structs are identical. */
+		SANITY_CHECK_MEM_REGION_FIELD(slot);
+		SANITY_CHECK_MEM_REGION_FIELD(flags);
+		SANITY_CHECK_MEM_REGION_FIELD(guest_phys_addr);
+		SANITY_CHECK_MEM_REGION_FIELD(memory_size);
+		SANITY_CHECK_MEM_REGION_FIELD(userspace_addr);
 
 		r = -EFAULT;
-		if (copy_from_user(&kvm_userspace_mem, argp,
-						sizeof(kvm_userspace_mem)))
+		if (copy_from_user(&mem, argp, size))
+			goto out;
+
+		r = -EINVAL;
+		if (ioctl == KVM_SET_USER_MEMORY_REGION &&
+		    (mem.flags & ~KVM_SET_USER_MEMORY_REGION_V1_FLAGS))
 			goto out;
 
-		r = kvm_vm_ioctl_set_memory_region(kvm, &kvm_userspace_mem);
+		r = kvm_vm_ioctl_set_memory_region(kvm, &mem);
 		break;
 	}
 	case KVM_GET_DIRTY_LOG: {
-- 
cgit v1.2.3


From 16f95f3b95caded251a0440051e44a2fbe9e5f55 Mon Sep 17 00:00:00 2001
From: Chao Peng <chao.p.peng@linux.intel.com>
Date: Fri, 27 Oct 2023 11:21:51 -0700
Subject: KVM: Add KVM_EXIT_MEMORY_FAULT exit to report faults to userspace

Add a new KVM exit type to allow userspace to handle memory faults that
KVM cannot resolve, but that userspace *may* be able to handle (without
terminating the guest).

KVM will initially use KVM_EXIT_MEMORY_FAULT to report implicit
conversions between private and shared memory.  With guest private memory,
there will be two kind of memory conversions:

  - explicit conversion: happens when the guest explicitly calls into KVM
    to map a range (as private or shared)

  - implicit conversion: happens when the guest attempts to access a gfn
    that is configured in the "wrong" state (private vs. shared)

On x86 (first architecture to support guest private memory), explicit
conversions will be reported via KVM_EXIT_HYPERCALL+KVM_HC_MAP_GPA_RANGE,
but reporting KVM_EXIT_HYPERCALL for implicit conversions is undesriable
as there is (obviously) no hypercall, and there is no guarantee that the
guest actually intends to convert between private and shared, i.e. what
KVM thinks is an implicit conversion "request" could actually be the
result of a guest code bug.

KVM_EXIT_MEMORY_FAULT will be used to report memory faults that appear to
be implicit conversions.

Note!  To allow for future possibilities where KVM reports
KVM_EXIT_MEMORY_FAULT and fills run->memory_fault on _any_ unresolved
fault, KVM returns "-EFAULT" (-1 with errno == EFAULT from userspace's
perspective), not '0'!  Due to historical baggage within KVM, exiting to
userspace with '0' from deep callstacks, e.g. in emulation paths, is
infeasible as doing so would require a near-complete overhaul of KVM,
whereas KVM already propagates -errno return codes to userspace even when
the -errno originated in a low level helper.

Report the gpa+size instead of a single gfn even though the initial usage
is expected to always report single pages.  It's entirely possible, likely
even, that KVM will someday support sub-page granularity faults, e.g.
Intel's sub-page protection feature allows for additional protections at
128-byte granularity.

Link: https://lore.kernel.org/all/20230908222905.1321305-5-amoorthy@google.com
Link: https://lore.kernel.org/all/ZQ3AmLO2SYv3DszH@google.com
Cc: Anish Moorthy <amoorthy@google.com>
Cc: David Matlack <dmatlack@google.com>
Suggested-by: Sean Christopherson <seanjc@google.com>
Co-developed-by: Yu Zhang <yu.c.zhang@linux.intel.com>
Signed-off-by: Yu Zhang <yu.c.zhang@linux.intel.com>
Signed-off-by: Chao Peng <chao.p.peng@linux.intel.com>
Co-developed-by: Sean Christopherson <seanjc@google.com>
Signed-off-by: Sean Christopherson <seanjc@google.com>
Reviewed-by: Paolo Bonzini <pbonzini@redhat.com>
Message-Id: <20231027182217.3615211-10-seanjc@google.com>
Reviewed-by: Fuad Tabba <tabba@google.com>
Tested-by: Fuad Tabba <tabba@google.com>
Reviewed-by: Xiaoyao Li <xiaoyao.li@intel.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 Documentation/virt/kvm/api.rst | 41 +++++++++++++++++++++++++++++++++++++++++
 arch/x86/kvm/x86.c             |  1 +
 include/linux/kvm_host.h       | 11 +++++++++++
 include/uapi/linux/kvm.h       |  8 ++++++++
 4 files changed, 61 insertions(+)

(limited to 'include/linux')

diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst
index 9edd9e436bab..27d945d5b4e4 100644
--- a/Documentation/virt/kvm/api.rst
+++ b/Documentation/virt/kvm/api.rst
@@ -6846,6 +6846,26 @@ array field represents return values. The userspace should update the return
 values of SBI call before resuming the VCPU. For more details on RISC-V SBI
 spec refer, https://github.com/riscv/riscv-sbi-doc.
 
+::
+
+		/* KVM_EXIT_MEMORY_FAULT */
+		struct {
+			__u64 flags;
+			__u64 gpa;
+			__u64 size;
+		} memory_fault;
+
+KVM_EXIT_MEMORY_FAULT indicates the vCPU has encountered a memory fault that
+could not be resolved by KVM.  The 'gpa' and 'size' (in bytes) describe the
+guest physical address range [gpa, gpa + size) of the fault.  The 'flags' field
+describes properties of the faulting access that are likely pertinent.
+Currently, no flags are defined.
+
+Note!  KVM_EXIT_MEMORY_FAULT is unique among all KVM exit reasons in that it
+accompanies a return code of '-1', not '0'!  errno will always be set to EFAULT
+or EHWPOISON when KVM exits with KVM_EXIT_MEMORY_FAULT, userspace should assume
+kvm_run.exit_reason is stale/undefined for all other error numbers.
+
 ::
 
     /* KVM_EXIT_NOTIFY */
@@ -7880,6 +7900,27 @@ This capability is aimed to mitigate the threat that malicious VMs can
 cause CPU stuck (due to event windows don't open up) and make the CPU
 unavailable to host or other VMs.
 
+7.34 KVM_CAP_MEMORY_FAULT_INFO
+------------------------------
+
+:Architectures: x86
+:Returns: Informational only, -EINVAL on direct KVM_ENABLE_CAP.
+
+The presence of this capability indicates that KVM_RUN will fill
+kvm_run.memory_fault if KVM cannot resolve a guest page fault VM-Exit, e.g. if
+there is a valid memslot but no backing VMA for the corresponding host virtual
+address.
+
+The information in kvm_run.memory_fault is valid if and only if KVM_RUN returns
+an error with errno=EFAULT or errno=EHWPOISON *and* kvm_run.exit_reason is set
+to KVM_EXIT_MEMORY_FAULT.
+
+Note: Userspaces which attempt to resolve memory faults so that they can retry
+KVM_RUN are encouraged to guard against repeatedly receiving the same
+error/annotated fault.
+
+See KVM_EXIT_MEMORY_FAULT for more information.
+
 8. Other capabilities.
 ======================
 
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 7b389f27dffc..8f9d8939b63b 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -4625,6 +4625,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
 	case KVM_CAP_ENABLE_CAP:
 	case KVM_CAP_VM_DISABLE_NX_HUGE_PAGES:
 	case KVM_CAP_IRQFD_RESAMPLE:
+	case KVM_CAP_MEMORY_FAULT_INFO:
 		r = 1;
 		break;
 	case KVM_CAP_EXIT_HYPERCALL:
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 4e741ff27af3..96aa930536b1 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -2327,4 +2327,15 @@ static inline void kvm_account_pgtable_pages(void *virt, int nr)
 /* Max number of entries allowed for each kvm dirty ring */
 #define  KVM_DIRTY_RING_MAX_ENTRIES  65536
 
+static inline void kvm_prepare_memory_fault_exit(struct kvm_vcpu *vcpu,
+						 gpa_t gpa, gpa_t size)
+{
+	vcpu->run->exit_reason = KVM_EXIT_MEMORY_FAULT;
+	vcpu->run->memory_fault.gpa = gpa;
+	vcpu->run->memory_fault.size = size;
+
+	/* Flags are not (yet) defined or communicated to userspace. */
+	vcpu->run->memory_fault.flags = 0;
+}
+
 #endif
diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
index 308cc70bd6ab..59010a685007 100644
--- a/include/uapi/linux/kvm.h
+++ b/include/uapi/linux/kvm.h
@@ -275,6 +275,7 @@ struct kvm_xen_exit {
 #define KVM_EXIT_RISCV_CSR        36
 #define KVM_EXIT_NOTIFY           37
 #define KVM_EXIT_LOONGARCH_IOCSR  38
+#define KVM_EXIT_MEMORY_FAULT     39
 
 /* For KVM_EXIT_INTERNAL_ERROR */
 /* Emulate instruction failed. */
@@ -528,6 +529,12 @@ struct kvm_run {
 #define KVM_NOTIFY_CONTEXT_INVALID	(1 << 0)
 			__u32 flags;
 		} notify;
+		/* KVM_EXIT_MEMORY_FAULT */
+		struct {
+			__u64 flags;
+			__u64 gpa;
+			__u64 size;
+		} memory_fault;
 		/* Fix the size of the union. */
 		char padding[256];
 	};
@@ -1212,6 +1219,7 @@ struct kvm_ppc_resize_hpt {
 #define KVM_CAP_ARM_SUPPORTED_BLOCK_SIZES 229
 #define KVM_CAP_ARM_SUPPORTED_REG_MASK_RANGES 230
 #define KVM_CAP_USER_MEMORY2 231
+#define KVM_CAP_MEMORY_FAULT_INFO 232
 
 #ifdef KVM_CAP_IRQ_ROUTING
 
-- 
cgit v1.2.3


From 5a475554db1e476a14216e742ea2bdb77362d5d5 Mon Sep 17 00:00:00 2001
From: Chao Peng <chao.p.peng@linux.intel.com>
Date: Fri, 27 Oct 2023 11:21:55 -0700
Subject: KVM: Introduce per-page memory attributes
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

In confidential computing usages, whether a page is private or shared is
necessary information for KVM to perform operations like page fault
handling, page zapping etc. There are other potential use cases for
per-page memory attributes, e.g. to make memory read-only (or no-exec,
or exec-only, etc.) without having to modify memslots.

Introduce the KVM_SET_MEMORY_ATTRIBUTES ioctl, advertised by
KVM_CAP_MEMORY_ATTRIBUTES, to allow userspace to set the per-page memory
attributes to a guest memory range.

Use an xarray to store the per-page attributes internally, with a naive,
not fully optimized implementation, i.e. prioritize correctness over
performance for the initial implementation.

Use bit 3 for the PRIVATE attribute so that KVM can use bits 0-2 for RWX
attributes/protections in the future, e.g. to give userspace fine-grained
control over read, write, and execute protections for guest memory.

Provide arch hooks for handling attribute changes before and after common
code sets the new attributes, e.g. x86 will use the "pre" hook to zap all
relevant mappings, and the "post" hook to track whether or not hugepages
can be used to map the range.

To simplify the implementation wrap the entire sequence with
kvm_mmu_invalidate_{begin,end}() even though the operation isn't strictly
guaranteed to be an invalidation.  For the initial use case, x86 *will*
always invalidate memory, and preventing arch code from creating new
mappings while the attributes are in flux makes it much easier to reason
about the correctness of consuming attributes.

It's possible that future usages may not require an invalidation, e.g.
if KVM ends up supporting RWX protections and userspace grants _more_
protections, but again opt for simplicity and punt optimizations to
if/when they are needed.

Suggested-by: Sean Christopherson <seanjc@google.com>
Link: https://lore.kernel.org/all/Y2WB48kD0J4VGynX@google.com
Cc: Fuad Tabba <tabba@google.com>
Cc: Xu Yilun <yilun.xu@intel.com>
Cc: Mickaël Salaün <mic@digikod.net>
Signed-off-by: Chao Peng <chao.p.peng@linux.intel.com>
Co-developed-by: Sean Christopherson <seanjc@google.com>
Signed-off-by: Sean Christopherson <seanjc@google.com>
Message-Id: <20231027182217.3615211-14-seanjc@google.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 Documentation/virt/kvm/api.rst |  36 +++++++
 include/linux/kvm_host.h       |  19 ++++
 include/uapi/linux/kvm.h       |  13 +++
 virt/kvm/Kconfig               |   4 +
 virt/kvm/kvm_main.c            | 216 +++++++++++++++++++++++++++++++++++++++++
 5 files changed, 288 insertions(+)

(limited to 'include/linux')

diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst
index 27d945d5b4e4..081ef09d3148 100644
--- a/Documentation/virt/kvm/api.rst
+++ b/Documentation/virt/kvm/api.rst
@@ -6214,6 +6214,42 @@ superset of the features supported by the system.
 
 See KVM_SET_USER_MEMORY_REGION.
 
+4.141 KVM_SET_MEMORY_ATTRIBUTES
+-------------------------------
+
+:Capability: KVM_CAP_MEMORY_ATTRIBUTES
+:Architectures: x86
+:Type: vm ioctl
+:Parameters: struct kvm_memory_attributes (in)
+:Returns: 0 on success, <0 on error
+
+KVM_SET_MEMORY_ATTRIBUTES allows userspace to set memory attributes for a range
+of guest physical memory.
+
+::
+
+  struct kvm_memory_attributes {
+	__u64 address;
+	__u64 size;
+	__u64 attributes;
+	__u64 flags;
+  };
+
+  #define KVM_MEMORY_ATTRIBUTE_PRIVATE           (1ULL << 3)
+
+The address and size must be page aligned.  The supported attributes can be
+retrieved via ioctl(KVM_CHECK_EXTENSION) on KVM_CAP_MEMORY_ATTRIBUTES.  If
+executed on a VM, KVM_CAP_MEMORY_ATTRIBUTES precisely returns the attributes
+supported by that VM.  If executed at system scope, KVM_CAP_MEMORY_ATTRIBUTES
+returns all attributes supported by KVM.  The only attribute defined at this
+time is KVM_MEMORY_ATTRIBUTE_PRIVATE, which marks the associated gfn as being
+guest private memory.
+
+Note, there is no "get" API.  Userspace is responsible for explicitly tracking
+the state of a gfn/page as needed.
+
+The "flags" field is reserved for future extensions and must be '0'.
+
 5. The kvm_run structure
 ========================
 
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 96aa930536b1..68a144cb7dbc 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -256,6 +256,7 @@ int kvm_async_pf_wakeup_all(struct kvm_vcpu *vcpu);
 #ifdef CONFIG_KVM_GENERIC_MMU_NOTIFIER
 union kvm_mmu_notifier_arg {
 	pte_t pte;
+	unsigned long attributes;
 };
 
 struct kvm_gfn_range {
@@ -806,6 +807,10 @@ struct kvm {
 
 #ifdef CONFIG_HAVE_KVM_PM_NOTIFIER
 	struct notifier_block pm_notifier;
+#endif
+#ifdef CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES
+	/* Protected by slots_locks (for writes) and RCU (for reads) */
+	struct xarray mem_attr_array;
 #endif
 	char stats_id[KVM_STATS_NAME_SIZE];
 };
@@ -2338,4 +2343,18 @@ static inline void kvm_prepare_memory_fault_exit(struct kvm_vcpu *vcpu,
 	vcpu->run->memory_fault.flags = 0;
 }
 
+#ifdef CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES
+static inline unsigned long kvm_get_memory_attributes(struct kvm *kvm, gfn_t gfn)
+{
+	return xa_to_value(xa_load(&kvm->mem_attr_array, gfn));
+}
+
+bool kvm_range_has_memory_attributes(struct kvm *kvm, gfn_t start, gfn_t end,
+				     unsigned long attrs);
+bool kvm_arch_pre_set_memory_attributes(struct kvm *kvm,
+					struct kvm_gfn_range *range);
+bool kvm_arch_post_set_memory_attributes(struct kvm *kvm,
+					 struct kvm_gfn_range *range);
+#endif /* CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES */
+
 #endif
diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
index 59010a685007..e8d167e54980 100644
--- a/include/uapi/linux/kvm.h
+++ b/include/uapi/linux/kvm.h
@@ -1220,6 +1220,7 @@ struct kvm_ppc_resize_hpt {
 #define KVM_CAP_ARM_SUPPORTED_REG_MASK_RANGES 230
 #define KVM_CAP_USER_MEMORY2 231
 #define KVM_CAP_MEMORY_FAULT_INFO 232
+#define KVM_CAP_MEMORY_ATTRIBUTES 233
 
 #ifdef KVM_CAP_IRQ_ROUTING
 
@@ -2288,4 +2289,16 @@ struct kvm_s390_zpci_op {
 /* flags for kvm_s390_zpci_op->u.reg_aen.flags */
 #define KVM_S390_ZPCIOP_REGAEN_HOST    (1 << 0)
 
+/* Available with KVM_CAP_MEMORY_ATTRIBUTES */
+#define KVM_SET_MEMORY_ATTRIBUTES              _IOW(KVMIO,  0xd2, struct kvm_memory_attributes)
+
+struct kvm_memory_attributes {
+	__u64 address;
+	__u64 size;
+	__u64 attributes;
+	__u64 flags;
+};
+
+#define KVM_MEMORY_ATTRIBUTE_PRIVATE           (1ULL << 3)
+
 #endif /* __LINUX_KVM_H */
diff --git a/virt/kvm/Kconfig b/virt/kvm/Kconfig
index ecae2914c97e..5bd7fcaf9089 100644
--- a/virt/kvm/Kconfig
+++ b/virt/kvm/Kconfig
@@ -96,3 +96,7 @@ config KVM_GENERIC_HARDWARE_ENABLING
 config KVM_GENERIC_MMU_NOTIFIER
        select MMU_NOTIFIER
        bool
+
+config KVM_GENERIC_MEMORY_ATTRIBUTES
+       select KVM_GENERIC_MMU_NOTIFIER
+       bool
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 7f3291dec7a6..f1a575d39b3b 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -1211,6 +1211,9 @@ static struct kvm *kvm_create_vm(unsigned long type, const char *fdname)
 	spin_lock_init(&kvm->mn_invalidate_lock);
 	rcuwait_init(&kvm->mn_memslots_update_rcuwait);
 	xa_init(&kvm->vcpu_array);
+#ifdef CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES
+	xa_init(&kvm->mem_attr_array);
+#endif
 
 	INIT_LIST_HEAD(&kvm->gpc_list);
 	spin_lock_init(&kvm->gpc_lock);
@@ -1391,6 +1394,9 @@ static void kvm_destroy_vm(struct kvm *kvm)
 	}
 	cleanup_srcu_struct(&kvm->irq_srcu);
 	cleanup_srcu_struct(&kvm->srcu);
+#ifdef CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES
+	xa_destroy(&kvm->mem_attr_array);
+#endif
 	kvm_arch_free_vm(kvm);
 	preempt_notifier_dec();
 	hardware_disable_all();
@@ -2397,6 +2403,200 @@ static int kvm_vm_ioctl_clear_dirty_log(struct kvm *kvm,
 }
 #endif /* CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT */
 
+#ifdef CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES
+/*
+ * Returns true if _all_ gfns in the range [@start, @end) have attributes
+ * matching @attrs.
+ */
+bool kvm_range_has_memory_attributes(struct kvm *kvm, gfn_t start, gfn_t end,
+				     unsigned long attrs)
+{
+	XA_STATE(xas, &kvm->mem_attr_array, start);
+	unsigned long index;
+	bool has_attrs;
+	void *entry;
+
+	rcu_read_lock();
+
+	if (!attrs) {
+		has_attrs = !xas_find(&xas, end - 1);
+		goto out;
+	}
+
+	has_attrs = true;
+	for (index = start; index < end; index++) {
+		do {
+			entry = xas_next(&xas);
+		} while (xas_retry(&xas, entry));
+
+		if (xas.xa_index != index || xa_to_value(entry) != attrs) {
+			has_attrs = false;
+			break;
+		}
+	}
+
+out:
+	rcu_read_unlock();
+	return has_attrs;
+}
+
+static u64 kvm_supported_mem_attributes(struct kvm *kvm)
+{
+	if (!kvm)
+		return KVM_MEMORY_ATTRIBUTE_PRIVATE;
+
+	return 0;
+}
+
+static __always_inline void kvm_handle_gfn_range(struct kvm *kvm,
+						 struct kvm_mmu_notifier_range *range)
+{
+	struct kvm_gfn_range gfn_range;
+	struct kvm_memory_slot *slot;
+	struct kvm_memslots *slots;
+	struct kvm_memslot_iter iter;
+	bool found_memslot = false;
+	bool ret = false;
+	int i;
+
+	gfn_range.arg = range->arg;
+	gfn_range.may_block = range->may_block;
+
+	for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) {
+		slots = __kvm_memslots(kvm, i);
+
+		kvm_for_each_memslot_in_gfn_range(&iter, slots, range->start, range->end) {
+			slot = iter.slot;
+			gfn_range.slot = slot;
+
+			gfn_range.start = max(range->start, slot->base_gfn);
+			gfn_range.end = min(range->end, slot->base_gfn + slot->npages);
+			if (gfn_range.start >= gfn_range.end)
+				continue;
+
+			if (!found_memslot) {
+				found_memslot = true;
+				KVM_MMU_LOCK(kvm);
+				if (!IS_KVM_NULL_FN(range->on_lock))
+					range->on_lock(kvm);
+			}
+
+			ret |= range->handler(kvm, &gfn_range);
+		}
+	}
+
+	if (range->flush_on_ret && ret)
+		kvm_flush_remote_tlbs(kvm);
+
+	if (found_memslot)
+		KVM_MMU_UNLOCK(kvm);
+}
+
+static bool kvm_pre_set_memory_attributes(struct kvm *kvm,
+					  struct kvm_gfn_range *range)
+{
+	/*
+	 * Unconditionally add the range to the invalidation set, regardless of
+	 * whether or not the arch callback actually needs to zap SPTEs.  E.g.
+	 * if KVM supports RWX attributes in the future and the attributes are
+	 * going from R=>RW, zapping isn't strictly necessary.  Unconditionally
+	 * adding the range allows KVM to require that MMU invalidations add at
+	 * least one range between begin() and end(), e.g. allows KVM to detect
+	 * bugs where the add() is missed.  Relaxing the rule *might* be safe,
+	 * but it's not obvious that allowing new mappings while the attributes
+	 * are in flux is desirable or worth the complexity.
+	 */
+	kvm_mmu_invalidate_range_add(kvm, range->start, range->end);
+
+	return kvm_arch_pre_set_memory_attributes(kvm, range);
+}
+
+/* Set @attributes for the gfn range [@start, @end). */
+static int kvm_vm_set_mem_attributes(struct kvm *kvm, gfn_t start, gfn_t end,
+				     unsigned long attributes)
+{
+	struct kvm_mmu_notifier_range pre_set_range = {
+		.start = start,
+		.end = end,
+		.handler = kvm_pre_set_memory_attributes,
+		.on_lock = kvm_mmu_invalidate_begin,
+		.flush_on_ret = true,
+		.may_block = true,
+	};
+	struct kvm_mmu_notifier_range post_set_range = {
+		.start = start,
+		.end = end,
+		.arg.attributes = attributes,
+		.handler = kvm_arch_post_set_memory_attributes,
+		.on_lock = kvm_mmu_invalidate_end,
+		.may_block = true,
+	};
+	unsigned long i;
+	void *entry;
+	int r = 0;
+
+	entry = attributes ? xa_mk_value(attributes) : NULL;
+
+	mutex_lock(&kvm->slots_lock);
+
+	/* Nothing to do if the entire range as the desired attributes. */
+	if (kvm_range_has_memory_attributes(kvm, start, end, attributes))
+		goto out_unlock;
+
+	/*
+	 * Reserve memory ahead of time to avoid having to deal with failures
+	 * partway through setting the new attributes.
+	 */
+	for (i = start; i < end; i++) {
+		r = xa_reserve(&kvm->mem_attr_array, i, GFP_KERNEL_ACCOUNT);
+		if (r)
+			goto out_unlock;
+	}
+
+	kvm_handle_gfn_range(kvm, &pre_set_range);
+
+	for (i = start; i < end; i++) {
+		r = xa_err(xa_store(&kvm->mem_attr_array, i, entry,
+				    GFP_KERNEL_ACCOUNT));
+		KVM_BUG_ON(r, kvm);
+	}
+
+	kvm_handle_gfn_range(kvm, &post_set_range);
+
+out_unlock:
+	mutex_unlock(&kvm->slots_lock);
+
+	return r;
+}
+static int kvm_vm_ioctl_set_mem_attributes(struct kvm *kvm,
+					   struct kvm_memory_attributes *attrs)
+{
+	gfn_t start, end;
+
+	/* flags is currently not used. */
+	if (attrs->flags)
+		return -EINVAL;
+	if (attrs->attributes & ~kvm_supported_mem_attributes(kvm))
+		return -EINVAL;
+	if (attrs->size == 0 || attrs->address + attrs->size < attrs->address)
+		return -EINVAL;
+	if (!PAGE_ALIGNED(attrs->address) || !PAGE_ALIGNED(attrs->size))
+		return -EINVAL;
+
+	start = attrs->address >> PAGE_SHIFT;
+	end = (attrs->address + attrs->size) >> PAGE_SHIFT;
+
+	/*
+	 * xarray tracks data using "unsigned long", and as a result so does
+	 * KVM.  For simplicity, supports generic attributes only on 64-bit
+	 * architectures.
+	 */
+	BUILD_BUG_ON(sizeof(attrs->attributes) != sizeof(unsigned long));
+
+	return kvm_vm_set_mem_attributes(kvm, start, end, attrs->attributes);
+}
+#endif /* CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES */
+
 struct kvm_memory_slot *gfn_to_memslot(struct kvm *kvm, gfn_t gfn)
 {
 	return __gfn_to_memslot(kvm_memslots(kvm), gfn);
@@ -4641,6 +4841,10 @@ static int kvm_vm_ioctl_check_extension_generic(struct kvm *kvm, long arg)
 	case KVM_CAP_BINARY_STATS_FD:
 	case KVM_CAP_SYSTEM_EVENT_DATA:
 		return 1;
+#ifdef CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES
+	case KVM_CAP_MEMORY_ATTRIBUTES:
+		return kvm_supported_mem_attributes(kvm);
+#endif
 	default:
 		break;
 	}
@@ -5034,6 +5238,18 @@ static long kvm_vm_ioctl(struct file *filp,
 		break;
 	}
 #endif /* CONFIG_HAVE_KVM_IRQ_ROUTING */
+#ifdef CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES
+	case KVM_SET_MEMORY_ATTRIBUTES: {
+		struct kvm_memory_attributes attrs;
+
+		r = -EFAULT;
+		if (copy_from_user(&attrs, argp, sizeof(attrs)))
+			goto out;
+
+		r = kvm_vm_ioctl_set_mem_attributes(kvm, &attrs);
+		break;
+	}
+#endif /* CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES */
 	case KVM_CREATE_DEVICE: {
 		struct kvm_create_device cd;
 
-- 
cgit v1.2.3


From 0003e2a414687fff6a75250d381e4abf345d663f Mon Sep 17 00:00:00 2001
From: Sean Christopherson <seanjc@google.com>
Date: Fri, 27 Oct 2023 11:21:56 -0700
Subject: mm: Add AS_UNMOVABLE to mark mapping as completely unmovable

Add an "unmovable" flag for mappings that cannot be migrated under any
circumstance.  KVM will use the flag for its upcoming GUEST_MEMFD support,
which will not support compaction/migration, at least not in the
foreseeable future.

Test AS_UNMOVABLE under folio lock as already done for the async
compaction/dirty folio case, as the mapping can be removed by truncation
while compaction is running.  To avoid having to lock every folio with a
mapping, assume/require that unmovable mappings are also unevictable, and
have mapping_set_unmovable() also set AS_UNEVICTABLE.

Cc: Matthew Wilcox <willy@infradead.org>
Co-developed-by: Vlastimil Babka <vbabka@suse.cz>
Signed-off-by: Vlastimil Babka <vbabka@suse.cz>
Signed-off-by: Sean Christopherson <seanjc@google.com>
Message-Id: <20231027182217.3615211-15-seanjc@google.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 include/linux/pagemap.h | 19 ++++++++++++++++++-
 mm/compaction.c         | 43 +++++++++++++++++++++++++++++++------------
 mm/migrate.c            |  2 ++
 3 files changed, 51 insertions(+), 13 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
index 351c3b7f93a1..82c9bf506b79 100644
--- a/include/linux/pagemap.h
+++ b/include/linux/pagemap.h
@@ -203,7 +203,8 @@ enum mapping_flags {
 	/* writeback related tags are not used */
 	AS_NO_WRITEBACK_TAGS = 5,
 	AS_LARGE_FOLIO_SUPPORT = 6,
-	AS_RELEASE_ALWAYS,	/* Call ->release_folio(), even if no private data */
+	AS_RELEASE_ALWAYS = 7,	/* Call ->release_folio(), even if no private data */
+	AS_UNMOVABLE	= 8,	/* The mapping cannot be moved, ever */
 };
 
 /**
@@ -289,6 +290,22 @@ static inline void mapping_clear_release_always(struct address_space *mapping)
 	clear_bit(AS_RELEASE_ALWAYS, &mapping->flags);
 }
 
+static inline void mapping_set_unmovable(struct address_space *mapping)
+{
+	/*
+	 * It's expected unmovable mappings are also unevictable. Compaction
+	 * migrate scanner (isolate_migratepages_block()) relies on this to
+	 * reduce page locking.
+	 */
+	set_bit(AS_UNEVICTABLE, &mapping->flags);
+	set_bit(AS_UNMOVABLE, &mapping->flags);
+}
+
+static inline bool mapping_unmovable(struct address_space *mapping)
+{
+	return test_bit(AS_UNMOVABLE, &mapping->flags);
+}
+
 static inline gfp_t mapping_gfp_mask(struct address_space * mapping)
 {
 	return mapping->gfp_mask;
diff --git a/mm/compaction.c b/mm/compaction.c
index 38c8d216c6a3..12b828aed7c8 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -883,6 +883,7 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
 
 	/* Time to isolate some pages for migration */
 	for (; low_pfn < end_pfn; low_pfn++) {
+		bool is_dirty, is_unevictable;
 
 		if (skip_on_failure && low_pfn >= next_skip_pfn) {
 			/*
@@ -1080,8 +1081,10 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
 		if (!folio_test_lru(folio))
 			goto isolate_fail_put;
 
+		is_unevictable = folio_test_unevictable(folio);
+
 		/* Compaction might skip unevictable pages but CMA takes them */
-		if (!(mode & ISOLATE_UNEVICTABLE) && folio_test_unevictable(folio))
+		if (!(mode & ISOLATE_UNEVICTABLE) && is_unevictable)
 			goto isolate_fail_put;
 
 		/*
@@ -1093,26 +1096,42 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
 		if ((mode & ISOLATE_ASYNC_MIGRATE) && folio_test_writeback(folio))
 			goto isolate_fail_put;
 
-		if ((mode & ISOLATE_ASYNC_MIGRATE) && folio_test_dirty(folio)) {
-			bool migrate_dirty;
+		is_dirty = folio_test_dirty(folio);
+
+		if (((mode & ISOLATE_ASYNC_MIGRATE) && is_dirty) ||
+		    (mapping && is_unevictable)) {
+			bool migrate_dirty = true;
+			bool is_unmovable;
 
 			/*
 			 * Only folios without mappings or that have
-			 * a ->migrate_folio callback are possible to
-			 * migrate without blocking.  However, we may
-			 * be racing with truncation, which can free
-			 * the mapping.  Truncation holds the folio lock
-			 * until after the folio is removed from the page
-			 * cache so holding it ourselves is sufficient.
+			 * a ->migrate_folio callback are possible to migrate
+			 * without blocking.
+			 *
+			 * Folios from unmovable mappings are not migratable.
+			 *
+			 * However, we can be racing with truncation, which can
+			 * free the mapping that we need to check. Truncation
+			 * holds the folio lock until after the folio is removed
+			 * from the page so holding it ourselves is sufficient.
+			 *
+			 * To avoid locking the folio just to check unmovable,
+			 * assume every unmovable folio is also unevictable,
+			 * which is a cheaper test.  If our assumption goes
+			 * wrong, it's not a correctness bug, just potentially
+			 * wasted cycles.
 			 */
 			if (!folio_trylock(folio))
 				goto isolate_fail_put;
 
 			mapping = folio_mapping(folio);
-			migrate_dirty = !mapping ||
-					mapping->a_ops->migrate_folio;
+			if ((mode & ISOLATE_ASYNC_MIGRATE) && is_dirty) {
+				migrate_dirty = !mapping ||
+						mapping->a_ops->migrate_folio;
+			}
+			is_unmovable = mapping && mapping_unmovable(mapping);
 			folio_unlock(folio);
-			if (!migrate_dirty)
+			if (!migrate_dirty || is_unmovable)
 				goto isolate_fail_put;
 		}
 
diff --git a/mm/migrate.c b/mm/migrate.c
index 06086dc9da28..60f2ff6b36aa 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -956,6 +956,8 @@ static int move_to_new_folio(struct folio *dst, struct folio *src,
 
 		if (!mapping)
 			rc = migrate_folio(mapping, dst, src, mode);
+		else if (mapping_unmovable(mapping))
+			rc = -EOPNOTSUPP;
 		else if (mapping->a_ops->migrate_folio)
 			/*
 			 * Most folios have a mapping and most filesystems
-- 
cgit v1.2.3


From 438cbcdf105d84449fceb39a2d0e16d0ec20708f Mon Sep 17 00:00:00 2001
From: Marek Behún <kabel@kernel.org>
Date: Fri, 10 Nov 2023 13:05:46 +0100
Subject: net: mdio: fix typo in header
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The quotes symbol in
  "EEE "link partner ability 1
should be at the end of the register name
  "EEE link partner ability 1"

Signed-off-by: Marek Behún <kabel@kernel.org>
Reviewed-by: Andrew Lunn <andrew@lunn.ch>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/mdio.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/mdio.h b/include/linux/mdio.h
index 8fa23bdcedbf..007fd9c3e4b6 100644
--- a/include/linux/mdio.h
+++ b/include/linux/mdio.h
@@ -420,7 +420,7 @@ static inline u32 linkmode_adv_to_mii_t1_adv_m_t(unsigned long *advertising)
  * A function that translates value of following registers to the linkmode:
  * IEEE 802.3-2018 45.2.3.10 "EEE control and capability 1" register (3.20)
  * IEEE 802.3-2018 45.2.7.13 "EEE advertisement 1" register (7.60)
- * IEEE 802.3-2018 45.2.7.14 "EEE "link partner ability 1 register (7.61)
+ * IEEE 802.3-2018 45.2.7.14 "EEE link partner ability 1" register (7.61)
  */
 static inline void mii_eee_cap1_mod_linkmode_t(unsigned long *adv, u32 val)
 {
-- 
cgit v1.2.3


From 3fad96e9b21bed214c1593d7d7fb3e40d1fbf6f4 Mon Sep 17 00:00:00 2001
From: Sudeep Holla <sudeep.holla@arm.com>
Date: Tue, 24 Oct 2023 11:57:15 +0100
Subject: firmware: arm_ffa: Declare ffa_bus_type structure in the header

smatch reports:
drivers/firmware/arm_ffa/bus.c:108:17: warning:
	symbol 'ffa_bus_type' was not declared. Should it be static?

ffa_bus_type is exported to be useful in the FF-A driver. So this
warning is not correct. However, declaring the ffa_bus_type structure
in the header like many other bus_types do already removes this warning.
So let us just do the same and get rid of the warning.

Link: https://lore.kernel.org/r/20231024105715.2369638-1-sudeep.holla@arm.com
Signed-off-by: Sudeep Holla <sudeep.holla@arm.com>
---
 include/linux/arm_ffa.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/arm_ffa.h b/include/linux/arm_ffa.h
index 1abedb5b2e48..3d0fde57ba90 100644
--- a/include/linux/arm_ffa.h
+++ b/include/linux/arm_ffa.h
@@ -209,6 +209,8 @@ bool ffa_device_is_valid(struct ffa_device *ffa_dev) { return false; }
 #define module_ffa_driver(__ffa_driver)	\
 	module_driver(__ffa_driver, ffa_register, ffa_unregister)
 
+extern struct bus_type ffa_bus_type;
+
 /* FFA transport related */
 struct ffa_partition_info {
 	u16 id;
-- 
cgit v1.2.3


From 4f0b9194bc119a9850a99e5e824808e2f468c348 Mon Sep 17 00:00:00 2001
From: Paolo Bonzini <pbonzini@redhat.com>
Date: Fri, 3 Nov 2023 06:47:51 -0400
Subject: fs: Rename anon_inode_getfile_secure() and anon_inode_getfd_secure()

The call to the inode_init_security_anon() LSM hook is not the sole
reason to use anon_inode_getfile_secure() or anon_inode_getfd_secure().
For example, the functions also allow one to create a file with non-zero
size, without needing a full-blown filesystem.  In this case, you don't
need a "secure" version, just unique inodes; the current name of the
functions is confusing and does not explain well the difference with
the more "standard" anon_inode_getfile() and anon_inode_getfd().

Of course, there is another side of the coin; neither io_uring nor
userfaultfd strictly speaking need distinct inodes, and it is not
that clear anymore that anon_inode_create_get{file,fd}() allow the LSM
to intercept and block the inode's creation.  If one was so inclined,
anon_inode_getfile_secure() and anon_inode_getfd_secure() could be kept,
using the shared inode or a new one depending on CONFIG_SECURITY.
However, this is probably overkill, and potentially a cause of bugs in
different configurations.  Therefore, just add a comment to io_uring
and userfaultfd explaining the choice of the function.

While at it, remove the export for what is now anon_inode_create_getfd().
There is no in-tree module that uses it, and the old name is gone anyway.
If anybody actually needs the symbol, they can ask or they can just use
anon_inode_create_getfile(), which will be exported very soon for use
in KVM.

Suggested-by: Christian Brauner <brauner@kernel.org>
Reviewed-by: Christian Brauner <brauner@kernel.org>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 fs/anon_inodes.c            | 50 ++++++++++++++++++++++++++++++---------------
 fs/userfaultfd.c            |  5 +++--
 include/linux/anon_inodes.h |  4 ++--
 io_uring/io_uring.c         |  3 ++-
 4 files changed, 40 insertions(+), 22 deletions(-)

(limited to 'include/linux')

diff --git a/fs/anon_inodes.c b/fs/anon_inodes.c
index 24192a7667ed..42b02dc36474 100644
--- a/fs/anon_inodes.c
+++ b/fs/anon_inodes.c
@@ -79,7 +79,7 @@ static struct file *__anon_inode_getfile(const char *name,
 					 const struct file_operations *fops,
 					 void *priv, int flags,
 					 const struct inode *context_inode,
-					 bool secure)
+					 bool make_inode)
 {
 	struct inode *inode;
 	struct file *file;
@@ -87,7 +87,7 @@ static struct file *__anon_inode_getfile(const char *name,
 	if (fops->owner && !try_module_get(fops->owner))
 		return ERR_PTR(-ENOENT);
 
-	if (secure) {
+	if (make_inode) {
 		inode =	anon_inode_make_secure_inode(name, context_inode);
 		if (IS_ERR(inode)) {
 			file = ERR_CAST(inode);
@@ -149,13 +149,10 @@ struct file *anon_inode_getfile(const char *name,
 EXPORT_SYMBOL_GPL(anon_inode_getfile);
 
 /**
- * anon_inode_getfile_secure - Like anon_inode_getfile(), but creates a new
+ * anon_inode_create_getfile - Like anon_inode_getfile(), but creates a new
  *                             !S_PRIVATE anon inode rather than reuse the
  *                             singleton anon inode and calls the
- *                             inode_init_security_anon() LSM hook.  This
- *                             allows for both the inode to have its own
- *                             security context and for the LSM to enforce
- *                             policy on the inode's creation.
+ *                             inode_init_security_anon() LSM hook.
  *
  * @name:    [in]    name of the "class" of the new file
  * @fops:    [in]    file operations for the new file
@@ -164,11 +161,21 @@ EXPORT_SYMBOL_GPL(anon_inode_getfile);
  * @context_inode:
  *           [in]    the logical relationship with the new inode (optional)
  *
+ * Create a new anonymous inode and file pair.  This can be done for two
+ * reasons:
+ *
+ * - for the inode to have its own security context, so that LSMs can enforce
+ *   policy on the inode's creation;
+ *
+ * - if the caller needs a unique inode, for example in order to customize
+ *   the size returned by fstat()
+ *
  * The LSM may use @context_inode in inode_init_security_anon(), but a
- * reference to it is not held.  Returns the newly created file* or an error
- * pointer.  See the anon_inode_getfile() documentation for more information.
+ * reference to it is not held.
+ *
+ * Returns the newly created file* or an error pointer.
  */
-struct file *anon_inode_getfile_secure(const char *name,
+struct file *anon_inode_create_getfile(const char *name,
 				       const struct file_operations *fops,
 				       void *priv, int flags,
 				       const struct inode *context_inode)
@@ -181,7 +188,7 @@ static int __anon_inode_getfd(const char *name,
 			      const struct file_operations *fops,
 			      void *priv, int flags,
 			      const struct inode *context_inode,
-			      bool secure)
+			      bool make_inode)
 {
 	int error, fd;
 	struct file *file;
@@ -192,7 +199,7 @@ static int __anon_inode_getfd(const char *name,
 	fd = error;
 
 	file = __anon_inode_getfile(name, fops, priv, flags, context_inode,
-				    secure);
+				    make_inode);
 	if (IS_ERR(file)) {
 		error = PTR_ERR(file);
 		goto err_put_unused_fd;
@@ -231,10 +238,9 @@ int anon_inode_getfd(const char *name, const struct file_operations *fops,
 EXPORT_SYMBOL_GPL(anon_inode_getfd);
 
 /**
- * anon_inode_getfd_secure - Like anon_inode_getfd(), but creates a new
+ * anon_inode_create_getfd - Like anon_inode_getfd(), but creates a new
  * !S_PRIVATE anon inode rather than reuse the singleton anon inode, and calls
- * the inode_init_security_anon() LSM hook. This allows the inode to have its
- * own security context and for a LSM to reject creation of the inode.
+ * the inode_init_security_anon() LSM hook.
  *
  * @name:    [in]    name of the "class" of the new file
  * @fops:    [in]    file operations for the new file
@@ -243,16 +249,26 @@ EXPORT_SYMBOL_GPL(anon_inode_getfd);
  * @context_inode:
  *           [in]    the logical relationship with the new inode (optional)
  *
+ * Create a new anonymous inode and file pair.  This can be done for two
+ * reasons:
+ *
+ * - for the inode to have its own security context, so that LSMs can enforce
+ *   policy on the inode's creation;
+ *
+ * - if the caller needs a unique inode, for example in order to customize
+ *   the size returned by fstat()
+ *
  * The LSM may use @context_inode in inode_init_security_anon(), but a
  * reference to it is not held.
+ *
+ * Returns a newly created file descriptor or an error code.
  */
-int anon_inode_getfd_secure(const char *name, const struct file_operations *fops,
+int anon_inode_create_getfd(const char *name, const struct file_operations *fops,
 			    void *priv, int flags,
 			    const struct inode *context_inode)
 {
 	return __anon_inode_getfd(name, fops, priv, flags, context_inode, true);
 }
-EXPORT_SYMBOL_GPL(anon_inode_getfd_secure);
 
 static int __init anon_inode_init(void)
 {
diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c
index 56eaae9dac1a..7a1cf8bab5eb 100644
--- a/fs/userfaultfd.c
+++ b/fs/userfaultfd.c
@@ -1033,7 +1033,7 @@ static int resolve_userfault_fork(struct userfaultfd_ctx *new,
 {
 	int fd;
 
-	fd = anon_inode_getfd_secure("[userfaultfd]", &userfaultfd_fops, new,
+	fd = anon_inode_create_getfd("[userfaultfd]", &userfaultfd_fops, new,
 			O_RDONLY | (new->flags & UFFD_SHARED_FCNTL_FLAGS), inode);
 	if (fd < 0)
 		return fd;
@@ -2205,7 +2205,8 @@ static int new_userfaultfd(int flags)
 	/* prevent the mm struct to be freed */
 	mmgrab(ctx->mm);
 
-	fd = anon_inode_getfd_secure("[userfaultfd]", &userfaultfd_fops, ctx,
+	/* Create a new inode so that the LSM can block the creation.  */
+	fd = anon_inode_create_getfd("[userfaultfd]", &userfaultfd_fops, ctx,
 			O_RDONLY | (flags & UFFD_SHARED_FCNTL_FLAGS), NULL);
 	if (fd < 0) {
 		mmdrop(ctx->mm);
diff --git a/include/linux/anon_inodes.h b/include/linux/anon_inodes.h
index 5deaddbd7927..93a5f16d03f3 100644
--- a/include/linux/anon_inodes.h
+++ b/include/linux/anon_inodes.h
@@ -15,13 +15,13 @@ struct inode;
 struct file *anon_inode_getfile(const char *name,
 				const struct file_operations *fops,
 				void *priv, int flags);
-struct file *anon_inode_getfile_secure(const char *name,
+struct file *anon_inode_create_getfile(const char *name,
 				       const struct file_operations *fops,
 				       void *priv, int flags,
 				       const struct inode *context_inode);
 int anon_inode_getfd(const char *name, const struct file_operations *fops,
 		     void *priv, int flags);
-int anon_inode_getfd_secure(const char *name,
+int anon_inode_create_getfd(const char *name,
 			    const struct file_operations *fops,
 			    void *priv, int flags,
 			    const struct inode *context_inode);
diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index 8d1bc6cdfe71..22b98f47bb28 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -3835,7 +3835,8 @@ static struct file *io_uring_get_file(struct io_ring_ctx *ctx)
 		return ERR_PTR(ret);
 #endif
 
-	file = anon_inode_getfile_secure("[io_uring]", &io_uring_fops, ctx,
+	/* Create a new inode so that the LSM can block the creation.  */
+	file = anon_inode_create_getfile("[io_uring]", &io_uring_fops, ctx,
 					 O_RDWR | O_CLOEXEC, NULL);
 #if defined(CONFIG_UNIX)
 	if (IS_ERR(file)) {
-- 
cgit v1.2.3


From a7800aa80ea4d5356b8474c2302812e9d4926fa6 Mon Sep 17 00:00:00 2001
From: Sean Christopherson <seanjc@google.com>
Date: Mon, 13 Nov 2023 05:42:34 -0500
Subject: KVM: Add KVM_CREATE_GUEST_MEMFD ioctl() for guest-specific backing
 memory
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Introduce an ioctl(), KVM_CREATE_GUEST_MEMFD, to allow creating file-based
memory that is tied to a specific KVM virtual machine and whose primary
purpose is to serve guest memory.

A guest-first memory subsystem allows for optimizations and enhancements
that are kludgy or outright infeasible to implement/support in a generic
memory subsystem.  With guest_memfd, guest protections and mapping sizes
are fully decoupled from host userspace mappings.   E.g. KVM currently
doesn't support mapping memory as writable in the guest without it also
being writable in host userspace, as KVM's ABI uses VMA protections to
define the allow guest protection.  Userspace can fudge this by
establishing two mappings, a writable mapping for the guest and readable
one for itself, but that’s suboptimal on multiple fronts.

Similarly, KVM currently requires the guest mapping size to be a strict
subset of the host userspace mapping size, e.g. KVM doesn’t support
creating a 1GiB guest mapping unless userspace also has a 1GiB guest
mapping.  Decoupling the mappings sizes would allow userspace to precisely
map only what is needed without impacting guest performance, e.g. to
harden against unintentional accesses to guest memory.

Decoupling guest and userspace mappings may also allow for a cleaner
alternative to high-granularity mappings for HugeTLB, which has reached a
bit of an impasse and is unlikely to ever be merged.

A guest-first memory subsystem also provides clearer line of sight to
things like a dedicated memory pool (for slice-of-hardware VMs) and
elimination of "struct page" (for offload setups where userspace _never_
needs to mmap() guest memory).

More immediately, being able to map memory into KVM guests without mapping
said memory into the host is critical for Confidential VMs (CoCo VMs), the
initial use case for guest_memfd.  While AMD's SEV and Intel's TDX prevent
untrusted software from reading guest private data by encrypting guest
memory with a key that isn't usable by the untrusted host, projects such
as Protected KVM (pKVM) provide confidentiality and integrity *without*
relying on memory encryption.  And with SEV-SNP and TDX, accessing guest
private memory can be fatal to the host, i.e. KVM must be prevent host
userspace from accessing guest memory irrespective of hardware behavior.

Attempt #1 to support CoCo VMs was to add a VMA flag to mark memory as
being mappable only by KVM (or a similarly enlightened kernel subsystem).
That approach was abandoned largely due to it needing to play games with
PROT_NONE to prevent userspace from accessing guest memory.

Attempt #2 to was to usurp PG_hwpoison to prevent the host from mapping
guest private memory into userspace, but that approach failed to meet
several requirements for software-based CoCo VMs, e.g. pKVM, as the kernel
wouldn't easily be able to enforce a 1:1 page:guest association, let alone
a 1:1 pfn:gfn mapping.  And using PG_hwpoison does not work for memory
that isn't backed by 'struct page', e.g. if devices gain support for
exposing encrypted memory regions to guests.

Attempt #3 was to extend the memfd() syscall and wrap shmem to provide
dedicated file-based guest memory.  That approach made it as far as v10
before feedback from Hugh Dickins and Christian Brauner (and others) led
to it demise.

Hugh's objection was that piggybacking shmem made no sense for KVM's use
case as KVM didn't actually *want* the features provided by shmem.  I.e.
KVM was using memfd() and shmem to avoid having to manage memory directly,
not because memfd() and shmem were the optimal solution, e.g. things like
read/write/mmap in shmem were dead weight.

Christian pointed out flaws with implementing a partial overlay (wrapping
only _some_ of shmem), e.g. poking at inode_operations or super_operations
would show shmem stuff, but address_space_operations and file_operations
would show KVM's overlay.  Paraphrashing heavily, Christian suggested KVM
stop being lazy and create a proper API.

Link: https://lore.kernel.org/all/20201020061859.18385-1-kirill.shutemov@linux.intel.com
Link: https://lore.kernel.org/all/20210416154106.23721-1-kirill.shutemov@linux.intel.com
Link: https://lore.kernel.org/all/20210824005248.200037-1-seanjc@google.com
Link: https://lore.kernel.org/all/20211111141352.26311-1-chao.p.peng@linux.intel.com
Link: https://lore.kernel.org/all/20221202061347.1070246-1-chao.p.peng@linux.intel.com
Link: https://lore.kernel.org/all/ff5c5b97-acdf-9745-ebe5-c6609dd6322e@google.com
Link: https://lore.kernel.org/all/20230418-anfallen-irdisch-6993a61be10b@brauner
Link: https://lore.kernel.org/all/ZEM5Zq8oo+xnApW9@google.com
Link: https://lore.kernel.org/linux-mm/20230306191944.GA15773@monkey
Link: https://lore.kernel.org/linux-mm/ZII1p8ZHlHaQ3dDl@casper.infradead.org
Cc: Fuad Tabba <tabba@google.com>
Cc: Vishal Annapurve <vannapurve@google.com>
Cc: Ackerley Tng <ackerleytng@google.com>
Cc: Jarkko Sakkinen <jarkko@kernel.org>
Cc: Maciej Szmigiero <mail@maciej.szmigiero.name>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: David Hildenbrand <david@redhat.com>
Cc: Quentin Perret <qperret@google.com>
Cc: Michael Roth <michael.roth@amd.com>
Cc: Wang <wei.w.wang@intel.com>
Cc: Liam Merwick <liam.merwick@oracle.com>
Cc: Isaku Yamahata <isaku.yamahata@gmail.com>
Co-developed-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Co-developed-by: Yu Zhang <yu.c.zhang@linux.intel.com>
Signed-off-by: Yu Zhang <yu.c.zhang@linux.intel.com>
Co-developed-by: Chao Peng <chao.p.peng@linux.intel.com>
Signed-off-by: Chao Peng <chao.p.peng@linux.intel.com>
Co-developed-by: Ackerley Tng <ackerleytng@google.com>
Signed-off-by: Ackerley Tng <ackerleytng@google.com>
Co-developed-by: Isaku Yamahata <isaku.yamahata@intel.com>
Signed-off-by: Isaku Yamahata <isaku.yamahata@intel.com>
Co-developed-by: Paolo Bonzini <pbonzini@redhat.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Co-developed-by: Michael Roth <michael.roth@amd.com>
Signed-off-by: Michael Roth <michael.roth@amd.com>
Signed-off-by: Sean Christopherson <seanjc@google.com>
Message-Id: <20231027182217.3615211-17-seanjc@google.com>
Reviewed-by: Fuad Tabba <tabba@google.com>
Tested-by: Fuad Tabba <tabba@google.com>
Reviewed-by: Xiaoyao Li <xiaoyao.li@intel.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 Documentation/virt/kvm/api.rst |  70 +++++-
 fs/anon_inodes.c               |   1 +
 include/linux/kvm_host.h       |  48 ++++
 include/uapi/linux/kvm.h       |  15 +-
 virt/kvm/Kconfig               |   4 +
 virt/kvm/Makefile.kvm          |   1 +
 virt/kvm/guest_memfd.c         | 538 +++++++++++++++++++++++++++++++++++++++++
 virt/kvm/kvm_main.c            |  59 ++++-
 virt/kvm/kvm_mm.h              |  26 ++
 9 files changed, 754 insertions(+), 8 deletions(-)
 create mode 100644 virt/kvm/guest_memfd.c

(limited to 'include/linux')

diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst
index 081ef09d3148..1e61faf02b2a 100644
--- a/Documentation/virt/kvm/api.rst
+++ b/Documentation/virt/kvm/api.rst
@@ -6201,6 +6201,15 @@ superset of the features supported by the system.
 :Parameters: struct kvm_userspace_memory_region2 (in)
 :Returns: 0 on success, -1 on error
 
+KVM_SET_USER_MEMORY_REGION2 is an extension to KVM_SET_USER_MEMORY_REGION that
+allows mapping guest_memfd memory into a guest.  All fields shared with
+KVM_SET_USER_MEMORY_REGION identically.  Userspace can set KVM_MEM_GUEST_MEMFD
+in flags to have KVM bind the memory region to a given guest_memfd range of
+[guest_memfd_offset, guest_memfd_offset + memory_size].  The target guest_memfd
+must point at a file created via KVM_CREATE_GUEST_MEMFD on the current VM, and
+the target range must not be bound to any other memory region.  All standard
+bounds checks apply (use common sense).
+
 ::
 
   struct kvm_userspace_memory_region2 {
@@ -6209,10 +6218,24 @@ superset of the features supported by the system.
 	__u64 guest_phys_addr;
 	__u64 memory_size; /* bytes */
 	__u64 userspace_addr; /* start of the userspace allocated memory */
-	__u64 pad[16];
+	__u64 guest_memfd_offset;
+	__u32 guest_memfd;
+	__u32 pad1;
+	__u64 pad2[14];
   };
 
-See KVM_SET_USER_MEMORY_REGION.
+A KVM_MEM_GUEST_MEMFD region _must_ have a valid guest_memfd (private memory) and
+userspace_addr (shared memory).  However, "valid" for userspace_addr simply
+means that the address itself must be a legal userspace address.  The backing
+mapping for userspace_addr is not required to be valid/populated at the time of
+KVM_SET_USER_MEMORY_REGION2, e.g. shared memory can be lazily mapped/allocated
+on-demand.
+
+When mapping a gfn into the guest, KVM selects shared vs. private, i.e consumes
+userspace_addr vs. guest_memfd, based on the gfn's KVM_MEMORY_ATTRIBUTE_PRIVATE
+state.  At VM creation time, all memory is shared, i.e. the PRIVATE attribute
+is '0' for all gfns.  Userspace can control whether memory is shared/private by
+toggling KVM_MEMORY_ATTRIBUTE_PRIVATE via KVM_SET_MEMORY_ATTRIBUTES as needed.
 
 4.141 KVM_SET_MEMORY_ATTRIBUTES
 -------------------------------
@@ -6250,6 +6273,49 @@ the state of a gfn/page as needed.
 
 The "flags" field is reserved for future extensions and must be '0'.
 
+4.142 KVM_CREATE_GUEST_MEMFD
+----------------------------
+
+:Capability: KVM_CAP_GUEST_MEMFD
+:Architectures: none
+:Type: vm ioctl
+:Parameters: struct kvm_create_guest_memfd(in)
+:Returns: 0 on success, <0 on error
+
+KVM_CREATE_GUEST_MEMFD creates an anonymous file and returns a file descriptor
+that refers to it.  guest_memfd files are roughly analogous to files created
+via memfd_create(), e.g. guest_memfd files live in RAM, have volatile storage,
+and are automatically released when the last reference is dropped.  Unlike
+"regular" memfd_create() files, guest_memfd files are bound to their owning
+virtual machine (see below), cannot be mapped, read, or written by userspace,
+and cannot be resized  (guest_memfd files do however support PUNCH_HOLE).
+
+::
+
+  struct kvm_create_guest_memfd {
+	__u64 size;
+	__u64 flags;
+	__u64 reserved[6];
+  };
+
+Conceptually, the inode backing a guest_memfd file represents physical memory,
+i.e. is coupled to the virtual machine as a thing, not to a "struct kvm".  The
+file itself, which is bound to a "struct kvm", is that instance's view of the
+underlying memory, e.g. effectively provides the translation of guest addresses
+to host memory.  This allows for use cases where multiple KVM structures are
+used to manage a single virtual machine, e.g. when performing intrahost
+migration of a virtual machine.
+
+KVM currently only supports mapping guest_memfd via KVM_SET_USER_MEMORY_REGION2,
+and more specifically via the guest_memfd and guest_memfd_offset fields in
+"struct kvm_userspace_memory_region2", where guest_memfd_offset is the offset
+into the guest_memfd instance.  For a given guest_memfd file, there can be at
+most one mapping per page, i.e. binding multiple memory regions to a single
+guest_memfd range is not allowed (any number of memory regions can be bound to
+a single guest_memfd file, but the bound ranges must not overlap).
+
+See KVM_SET_USER_MEMORY_REGION2 for additional details.
+
 5. The kvm_run structure
 ========================
 
diff --git a/fs/anon_inodes.c b/fs/anon_inodes.c
index 42b02dc36474..8dd436ee985b 100644
--- a/fs/anon_inodes.c
+++ b/fs/anon_inodes.c
@@ -183,6 +183,7 @@ struct file *anon_inode_create_getfile(const char *name,
 	return __anon_inode_getfile(name, fops, priv, flags,
 				    context_inode, true);
 }
+EXPORT_SYMBOL_GPL(anon_inode_create_getfile);
 
 static int __anon_inode_getfd(const char *name,
 			      const struct file_operations *fops,
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 68a144cb7dbc..a6de526c0426 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -589,8 +589,20 @@ struct kvm_memory_slot {
 	u32 flags;
 	short id;
 	u16 as_id;
+
+#ifdef CONFIG_KVM_PRIVATE_MEM
+	struct {
+		struct file __rcu *file;
+		pgoff_t pgoff;
+	} gmem;
+#endif
 };
 
+static inline bool kvm_slot_can_be_private(const struct kvm_memory_slot *slot)
+{
+	return slot && (slot->flags & KVM_MEM_GUEST_MEMFD);
+}
+
 static inline bool kvm_slot_dirty_track_enabled(const struct kvm_memory_slot *slot)
 {
 	return slot->flags & KVM_MEM_LOG_DIRTY_PAGES;
@@ -685,6 +697,17 @@ static inline int kvm_arch_vcpu_memslots_id(struct kvm_vcpu *vcpu)
 }
 #endif
 
+/*
+ * Arch code must define kvm_arch_has_private_mem if support for private memory
+ * is enabled.
+ */
+#if !defined(kvm_arch_has_private_mem) && !IS_ENABLED(CONFIG_KVM_PRIVATE_MEM)
+static inline bool kvm_arch_has_private_mem(struct kvm *kvm)
+{
+	return false;
+}
+#endif
+
 struct kvm_memslots {
 	u64 generation;
 	atomic_long_t last_used_slot;
@@ -1400,6 +1423,7 @@ void *kvm_mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc);
 void kvm_mmu_invalidate_begin(struct kvm *kvm);
 void kvm_mmu_invalidate_range_add(struct kvm *kvm, gfn_t start, gfn_t end);
 void kvm_mmu_invalidate_end(struct kvm *kvm);
+bool kvm_mmu_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range);
 
 long kvm_arch_dev_ioctl(struct file *filp,
 			unsigned int ioctl, unsigned long arg);
@@ -2355,6 +2379,30 @@ bool kvm_arch_pre_set_memory_attributes(struct kvm *kvm,
 					struct kvm_gfn_range *range);
 bool kvm_arch_post_set_memory_attributes(struct kvm *kvm,
 					 struct kvm_gfn_range *range);
+
+static inline bool kvm_mem_is_private(struct kvm *kvm, gfn_t gfn)
+{
+	return IS_ENABLED(CONFIG_KVM_PRIVATE_MEM) &&
+	       kvm_get_memory_attributes(kvm, gfn) & KVM_MEMORY_ATTRIBUTE_PRIVATE;
+}
+#else
+static inline bool kvm_mem_is_private(struct kvm *kvm, gfn_t gfn)
+{
+	return false;
+}
 #endif /* CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES */
 
+#ifdef CONFIG_KVM_PRIVATE_MEM
+int kvm_gmem_get_pfn(struct kvm *kvm, struct kvm_memory_slot *slot,
+		     gfn_t gfn, kvm_pfn_t *pfn, int *max_order);
+#else
+static inline int kvm_gmem_get_pfn(struct kvm *kvm,
+				   struct kvm_memory_slot *slot, gfn_t gfn,
+				   kvm_pfn_t *pfn, int *max_order)
+{
+	KVM_BUG_ON(1, kvm);
+	return -EIO;
+}
+#endif /* CONFIG_KVM_PRIVATE_MEM */
+
 #endif
diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
index e8d167e54980..2802d10aa88c 100644
--- a/include/uapi/linux/kvm.h
+++ b/include/uapi/linux/kvm.h
@@ -102,7 +102,10 @@ struct kvm_userspace_memory_region2 {
 	__u64 guest_phys_addr;
 	__u64 memory_size;
 	__u64 userspace_addr;
-	__u64 pad[16];
+	__u64 guest_memfd_offset;
+	__u32 guest_memfd;
+	__u32 pad1;
+	__u64 pad2[14];
 };
 
 /*
@@ -112,6 +115,7 @@ struct kvm_userspace_memory_region2 {
  */
 #define KVM_MEM_LOG_DIRTY_PAGES	(1UL << 0)
 #define KVM_MEM_READONLY	(1UL << 1)
+#define KVM_MEM_GUEST_MEMFD	(1UL << 2)
 
 /* for KVM_IRQ_LINE */
 struct kvm_irq_level {
@@ -1221,6 +1225,7 @@ struct kvm_ppc_resize_hpt {
 #define KVM_CAP_USER_MEMORY2 231
 #define KVM_CAP_MEMORY_FAULT_INFO 232
 #define KVM_CAP_MEMORY_ATTRIBUTES 233
+#define KVM_CAP_GUEST_MEMFD 234
 
 #ifdef KVM_CAP_IRQ_ROUTING
 
@@ -2301,4 +2306,12 @@ struct kvm_memory_attributes {
 
 #define KVM_MEMORY_ATTRIBUTE_PRIVATE           (1ULL << 3)
 
+#define KVM_CREATE_GUEST_MEMFD	_IOWR(KVMIO,  0xd4, struct kvm_create_guest_memfd)
+
+struct kvm_create_guest_memfd {
+	__u64 size;
+	__u64 flags;
+	__u64 reserved[6];
+};
+
 #endif /* __LINUX_KVM_H */
diff --git a/virt/kvm/Kconfig b/virt/kvm/Kconfig
index 5bd7fcaf9089..08afef022db9 100644
--- a/virt/kvm/Kconfig
+++ b/virt/kvm/Kconfig
@@ -100,3 +100,7 @@ config KVM_GENERIC_MMU_NOTIFIER
 config KVM_GENERIC_MEMORY_ATTRIBUTES
        select KVM_GENERIC_MMU_NOTIFIER
        bool
+
+config KVM_PRIVATE_MEM
+       select XARRAY_MULTI
+       bool
diff --git a/virt/kvm/Makefile.kvm b/virt/kvm/Makefile.kvm
index 2c27d5d0c367..724c89af78af 100644
--- a/virt/kvm/Makefile.kvm
+++ b/virt/kvm/Makefile.kvm
@@ -12,3 +12,4 @@ kvm-$(CONFIG_KVM_ASYNC_PF) += $(KVM)/async_pf.o
 kvm-$(CONFIG_HAVE_KVM_IRQ_ROUTING) += $(KVM)/irqchip.o
 kvm-$(CONFIG_HAVE_KVM_DIRTY_RING) += $(KVM)/dirty_ring.o
 kvm-$(CONFIG_HAVE_KVM_PFNCACHE) += $(KVM)/pfncache.o
+kvm-$(CONFIG_KVM_PRIVATE_MEM) += $(KVM)/guest_memfd.o
diff --git a/virt/kvm/guest_memfd.c b/virt/kvm/guest_memfd.c
new file mode 100644
index 000000000000..e65f4170425c
--- /dev/null
+++ b/virt/kvm/guest_memfd.c
@@ -0,0 +1,538 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/backing-dev.h>
+#include <linux/falloc.h>
+#include <linux/kvm_host.h>
+#include <linux/pagemap.h>
+#include <linux/anon_inodes.h>
+
+#include "kvm_mm.h"
+
+struct kvm_gmem {
+	struct kvm *kvm;
+	struct xarray bindings;
+	struct list_head entry;
+};
+
+static struct folio *kvm_gmem_get_folio(struct inode *inode, pgoff_t index)
+{
+	struct folio *folio;
+
+	/* TODO: Support huge pages. */
+	folio = filemap_grab_folio(inode->i_mapping, index);
+	if (IS_ERR_OR_NULL(folio))
+		return NULL;
+
+	/*
+	 * Use the up-to-date flag to track whether or not the memory has been
+	 * zeroed before being handed off to the guest.  There is no backing
+	 * storage for the memory, so the folio will remain up-to-date until
+	 * it's removed.
+	 *
+	 * TODO: Skip clearing pages when trusted firmware will do it when
+	 * assigning memory to the guest.
+	 */
+	if (!folio_test_uptodate(folio)) {
+		unsigned long nr_pages = folio_nr_pages(folio);
+		unsigned long i;
+
+		for (i = 0; i < nr_pages; i++)
+			clear_highpage(folio_page(folio, i));
+
+		folio_mark_uptodate(folio);
+	}
+
+	/*
+	 * Ignore accessed, referenced, and dirty flags.  The memory is
+	 * unevictable and there is no storage to write back to.
+	 */
+	return folio;
+}
+
+static void kvm_gmem_invalidate_begin(struct kvm_gmem *gmem, pgoff_t start,
+				      pgoff_t end)
+{
+	bool flush = false, found_memslot = false;
+	struct kvm_memory_slot *slot;
+	struct kvm *kvm = gmem->kvm;
+	unsigned long index;
+
+	xa_for_each_range(&gmem->bindings, index, slot, start, end - 1) {
+		pgoff_t pgoff = slot->gmem.pgoff;
+
+		struct kvm_gfn_range gfn_range = {
+			.start = slot->base_gfn + max(pgoff, start) - pgoff,
+			.end = slot->base_gfn + min(pgoff + slot->npages, end) - pgoff,
+			.slot = slot,
+			.may_block = true,
+		};
+
+		if (!found_memslot) {
+			found_memslot = true;
+
+			KVM_MMU_LOCK(kvm);
+			kvm_mmu_invalidate_begin(kvm);
+		}
+
+		flush |= kvm_mmu_unmap_gfn_range(kvm, &gfn_range);
+	}
+
+	if (flush)
+		kvm_flush_remote_tlbs(kvm);
+
+	if (found_memslot)
+		KVM_MMU_UNLOCK(kvm);
+}
+
+static void kvm_gmem_invalidate_end(struct kvm_gmem *gmem, pgoff_t start,
+				    pgoff_t end)
+{
+	struct kvm *kvm = gmem->kvm;
+
+	if (xa_find(&gmem->bindings, &start, end - 1, XA_PRESENT)) {
+		KVM_MMU_LOCK(kvm);
+		kvm_mmu_invalidate_end(kvm);
+		KVM_MMU_UNLOCK(kvm);
+	}
+}
+
+static long kvm_gmem_punch_hole(struct inode *inode, loff_t offset, loff_t len)
+{
+	struct list_head *gmem_list = &inode->i_mapping->private_list;
+	pgoff_t start = offset >> PAGE_SHIFT;
+	pgoff_t end = (offset + len) >> PAGE_SHIFT;
+	struct kvm_gmem *gmem;
+
+	/*
+	 * Bindings must be stable across invalidation to ensure the start+end
+	 * are balanced.
+	 */
+	filemap_invalidate_lock(inode->i_mapping);
+
+	list_for_each_entry(gmem, gmem_list, entry)
+		kvm_gmem_invalidate_begin(gmem, start, end);
+
+	truncate_inode_pages_range(inode->i_mapping, offset, offset + len - 1);
+
+	list_for_each_entry(gmem, gmem_list, entry)
+		kvm_gmem_invalidate_end(gmem, start, end);
+
+	filemap_invalidate_unlock(inode->i_mapping);
+
+	return 0;
+}
+
+static long kvm_gmem_allocate(struct inode *inode, loff_t offset, loff_t len)
+{
+	struct address_space *mapping = inode->i_mapping;
+	pgoff_t start, index, end;
+	int r;
+
+	/* Dedicated guest is immutable by default. */
+	if (offset + len > i_size_read(inode))
+		return -EINVAL;
+
+	filemap_invalidate_lock_shared(mapping);
+
+	start = offset >> PAGE_SHIFT;
+	end = (offset + len) >> PAGE_SHIFT;
+
+	r = 0;
+	for (index = start; index < end; ) {
+		struct folio *folio;
+
+		if (signal_pending(current)) {
+			r = -EINTR;
+			break;
+		}
+
+		folio = kvm_gmem_get_folio(inode, index);
+		if (!folio) {
+			r = -ENOMEM;
+			break;
+		}
+
+		index = folio_next_index(folio);
+
+		folio_unlock(folio);
+		folio_put(folio);
+
+		/* 64-bit only, wrapping the index should be impossible. */
+		if (WARN_ON_ONCE(!index))
+			break;
+
+		cond_resched();
+	}
+
+	filemap_invalidate_unlock_shared(mapping);
+
+	return r;
+}
+
+static long kvm_gmem_fallocate(struct file *file, int mode, loff_t offset,
+			       loff_t len)
+{
+	int ret;
+
+	if (!(mode & FALLOC_FL_KEEP_SIZE))
+		return -EOPNOTSUPP;
+
+	if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE))
+		return -EOPNOTSUPP;
+
+	if (!PAGE_ALIGNED(offset) || !PAGE_ALIGNED(len))
+		return -EINVAL;
+
+	if (mode & FALLOC_FL_PUNCH_HOLE)
+		ret = kvm_gmem_punch_hole(file_inode(file), offset, len);
+	else
+		ret = kvm_gmem_allocate(file_inode(file), offset, len);
+
+	if (!ret)
+		file_modified(file);
+	return ret;
+}
+
+static int kvm_gmem_release(struct inode *inode, struct file *file)
+{
+	struct kvm_gmem *gmem = file->private_data;
+	struct kvm_memory_slot *slot;
+	struct kvm *kvm = gmem->kvm;
+	unsigned long index;
+
+	/*
+	 * Prevent concurrent attempts to *unbind* a memslot.  This is the last
+	 * reference to the file and thus no new bindings can be created, but
+	 * dereferencing the slot for existing bindings needs to be protected
+	 * against memslot updates, specifically so that unbind doesn't race
+	 * and free the memslot (kvm_gmem_get_file() will return NULL).
+	 */
+	mutex_lock(&kvm->slots_lock);
+
+	filemap_invalidate_lock(inode->i_mapping);
+
+	xa_for_each(&gmem->bindings, index, slot)
+		rcu_assign_pointer(slot->gmem.file, NULL);
+
+	synchronize_rcu();
+
+	/*
+	 * All in-flight operations are gone and new bindings can be created.
+	 * Zap all SPTEs pointed at by this file.  Do not free the backing
+	 * memory, as its lifetime is associated with the inode, not the file.
+	 */
+	kvm_gmem_invalidate_begin(gmem, 0, -1ul);
+	kvm_gmem_invalidate_end(gmem, 0, -1ul);
+
+	list_del(&gmem->entry);
+
+	filemap_invalidate_unlock(inode->i_mapping);
+
+	mutex_unlock(&kvm->slots_lock);
+
+	xa_destroy(&gmem->bindings);
+	kfree(gmem);
+
+	kvm_put_kvm(kvm);
+
+	return 0;
+}
+
+static struct file *kvm_gmem_get_file(struct kvm_memory_slot *slot)
+{
+	struct file *file;
+
+	rcu_read_lock();
+
+	file = rcu_dereference(slot->gmem.file);
+	if (file && !get_file_rcu(file))
+		file = NULL;
+
+	rcu_read_unlock();
+
+	return file;
+}
+
+static struct file_operations kvm_gmem_fops = {
+	.open		= generic_file_open,
+	.release	= kvm_gmem_release,
+	.fallocate	= kvm_gmem_fallocate,
+};
+
+void kvm_gmem_init(struct module *module)
+{
+	kvm_gmem_fops.owner = module;
+}
+
+static int kvm_gmem_migrate_folio(struct address_space *mapping,
+				  struct folio *dst, struct folio *src,
+				  enum migrate_mode mode)
+{
+	WARN_ON_ONCE(1);
+	return -EINVAL;
+}
+
+static int kvm_gmem_error_page(struct address_space *mapping, struct page *page)
+{
+	struct list_head *gmem_list = &mapping->private_list;
+	struct kvm_gmem *gmem;
+	pgoff_t start, end;
+
+	filemap_invalidate_lock_shared(mapping);
+
+	start = page->index;
+	end = start + thp_nr_pages(page);
+
+	list_for_each_entry(gmem, gmem_list, entry)
+		kvm_gmem_invalidate_begin(gmem, start, end);
+
+	/*
+	 * Do not truncate the range, what action is taken in response to the
+	 * error is userspace's decision (assuming the architecture supports
+	 * gracefully handling memory errors).  If/when the guest attempts to
+	 * access a poisoned page, kvm_gmem_get_pfn() will return -EHWPOISON,
+	 * at which point KVM can either terminate the VM or propagate the
+	 * error to userspace.
+	 */
+
+	list_for_each_entry(gmem, gmem_list, entry)
+		kvm_gmem_invalidate_end(gmem, start, end);
+
+	filemap_invalidate_unlock_shared(mapping);
+
+	return MF_DELAYED;
+}
+
+static const struct address_space_operations kvm_gmem_aops = {
+	.dirty_folio = noop_dirty_folio,
+#ifdef CONFIG_MIGRATION
+	.migrate_folio	= kvm_gmem_migrate_folio,
+#endif
+	.error_remove_page = kvm_gmem_error_page,
+};
+
+static int kvm_gmem_getattr(struct mnt_idmap *idmap, const struct path *path,
+			    struct kstat *stat, u32 request_mask,
+			    unsigned int query_flags)
+{
+	struct inode *inode = path->dentry->d_inode;
+
+	generic_fillattr(idmap, request_mask, inode, stat);
+	return 0;
+}
+
+static int kvm_gmem_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
+			    struct iattr *attr)
+{
+	return -EINVAL;
+}
+static const struct inode_operations kvm_gmem_iops = {
+	.getattr	= kvm_gmem_getattr,
+	.setattr	= kvm_gmem_setattr,
+};
+
+static int __kvm_gmem_create(struct kvm *kvm, loff_t size, u64 flags)
+{
+	const char *anon_name = "[kvm-gmem]";
+	struct kvm_gmem *gmem;
+	struct inode *inode;
+	struct file *file;
+	int fd, err;
+
+	fd = get_unused_fd_flags(0);
+	if (fd < 0)
+		return fd;
+
+	gmem = kzalloc(sizeof(*gmem), GFP_KERNEL);
+	if (!gmem) {
+		err = -ENOMEM;
+		goto err_fd;
+	}
+
+	file = anon_inode_create_getfile(anon_name, &kvm_gmem_fops, gmem,
+					 O_RDWR, NULL);
+	if (IS_ERR(file)) {
+		err = PTR_ERR(file);
+		goto err_gmem;
+	}
+
+	file->f_flags |= O_LARGEFILE;
+
+	inode = file->f_inode;
+	WARN_ON(file->f_mapping != inode->i_mapping);
+
+	inode->i_private = (void *)(unsigned long)flags;
+	inode->i_op = &kvm_gmem_iops;
+	inode->i_mapping->a_ops = &kvm_gmem_aops;
+	inode->i_mode |= S_IFREG;
+	inode->i_size = size;
+	mapping_set_gfp_mask(inode->i_mapping, GFP_HIGHUSER);
+	mapping_set_unmovable(inode->i_mapping);
+	/* Unmovable mappings are supposed to be marked unevictable as well. */
+	WARN_ON_ONCE(!mapping_unevictable(inode->i_mapping));
+
+	kvm_get_kvm(kvm);
+	gmem->kvm = kvm;
+	xa_init(&gmem->bindings);
+	list_add(&gmem->entry, &inode->i_mapping->private_list);
+
+	fd_install(fd, file);
+	return fd;
+
+err_gmem:
+	kfree(gmem);
+err_fd:
+	put_unused_fd(fd);
+	return err;
+}
+
+int kvm_gmem_create(struct kvm *kvm, struct kvm_create_guest_memfd *args)
+{
+	loff_t size = args->size;
+	u64 flags = args->flags;
+	u64 valid_flags = 0;
+
+	if (flags & ~valid_flags)
+		return -EINVAL;
+
+	if (size <= 0 || !PAGE_ALIGNED(size))
+		return -EINVAL;
+
+	return __kvm_gmem_create(kvm, size, flags);
+}
+
+int kvm_gmem_bind(struct kvm *kvm, struct kvm_memory_slot *slot,
+		  unsigned int fd, loff_t offset)
+{
+	loff_t size = slot->npages << PAGE_SHIFT;
+	unsigned long start, end;
+	struct kvm_gmem *gmem;
+	struct inode *inode;
+	struct file *file;
+	int r = -EINVAL;
+
+	BUILD_BUG_ON(sizeof(gfn_t) != sizeof(slot->gmem.pgoff));
+
+	file = fget(fd);
+	if (!file)
+		return -EBADF;
+
+	if (file->f_op != &kvm_gmem_fops)
+		goto err;
+
+	gmem = file->private_data;
+	if (gmem->kvm != kvm)
+		goto err;
+
+	inode = file_inode(file);
+
+	if (offset < 0 || !PAGE_ALIGNED(offset) ||
+	    offset + size > i_size_read(inode))
+		goto err;
+
+	filemap_invalidate_lock(inode->i_mapping);
+
+	start = offset >> PAGE_SHIFT;
+	end = start + slot->npages;
+
+	if (!xa_empty(&gmem->bindings) &&
+	    xa_find(&gmem->bindings, &start, end - 1, XA_PRESENT)) {
+		filemap_invalidate_unlock(inode->i_mapping);
+		goto err;
+	}
+
+	/*
+	 * No synchronize_rcu() needed, any in-flight readers are guaranteed to
+	 * be see either a NULL file or this new file, no need for them to go
+	 * away.
+	 */
+	rcu_assign_pointer(slot->gmem.file, file);
+	slot->gmem.pgoff = start;
+
+	xa_store_range(&gmem->bindings, start, end - 1, slot, GFP_KERNEL);
+	filemap_invalidate_unlock(inode->i_mapping);
+
+	/*
+	 * Drop the reference to the file, even on success.  The file pins KVM,
+	 * not the other way 'round.  Active bindings are invalidated if the
+	 * file is closed before memslots are destroyed.
+	 */
+	r = 0;
+err:
+	fput(file);
+	return r;
+}
+
+void kvm_gmem_unbind(struct kvm_memory_slot *slot)
+{
+	unsigned long start = slot->gmem.pgoff;
+	unsigned long end = start + slot->npages;
+	struct kvm_gmem *gmem;
+	struct file *file;
+
+	/*
+	 * Nothing to do if the underlying file was already closed (or is being
+	 * closed right now), kvm_gmem_release() invalidates all bindings.
+	 */
+	file = kvm_gmem_get_file(slot);
+	if (!file)
+		return;
+
+	gmem = file->private_data;
+
+	filemap_invalidate_lock(file->f_mapping);
+	xa_store_range(&gmem->bindings, start, end - 1, NULL, GFP_KERNEL);
+	rcu_assign_pointer(slot->gmem.file, NULL);
+	synchronize_rcu();
+	filemap_invalidate_unlock(file->f_mapping);
+
+	fput(file);
+}
+
+int kvm_gmem_get_pfn(struct kvm *kvm, struct kvm_memory_slot *slot,
+		     gfn_t gfn, kvm_pfn_t *pfn, int *max_order)
+{
+	pgoff_t index = gfn - slot->base_gfn + slot->gmem.pgoff;
+	struct kvm_gmem *gmem;
+	struct folio *folio;
+	struct page *page;
+	struct file *file;
+	int r;
+
+	file = kvm_gmem_get_file(slot);
+	if (!file)
+		return -EFAULT;
+
+	gmem = file->private_data;
+
+	if (WARN_ON_ONCE(xa_load(&gmem->bindings, index) != slot)) {
+		r = -EIO;
+		goto out_fput;
+	}
+
+	folio = kvm_gmem_get_folio(file_inode(file), index);
+	if (!folio) {
+		r = -ENOMEM;
+		goto out_fput;
+	}
+
+	if (folio_test_hwpoison(folio)) {
+		r = -EHWPOISON;
+		goto out_unlock;
+	}
+
+	page = folio_file_page(folio, index);
+
+	*pfn = page_to_pfn(page);
+	if (max_order)
+		*max_order = 0;
+
+	r = 0;
+
+out_unlock:
+	folio_unlock(folio);
+out_fput:
+	fput(file);
+
+	return r;
+}
+EXPORT_SYMBOL_GPL(kvm_gmem_get_pfn);
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index f1a575d39b3b..8f46d757a2c5 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -791,7 +791,7 @@ void kvm_mmu_invalidate_range_add(struct kvm *kvm, gfn_t start, gfn_t end)
 	}
 }
 
-static bool kvm_mmu_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range)
+bool kvm_mmu_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range)
 {
 	kvm_mmu_invalidate_range_add(kvm, range->start, range->end);
 	return kvm_unmap_gfn_range(kvm, range);
@@ -1027,6 +1027,9 @@ static void kvm_destroy_dirty_bitmap(struct kvm_memory_slot *memslot)
 /* This does not remove the slot from struct kvm_memslots data structures */
 static void kvm_free_memslot(struct kvm *kvm, struct kvm_memory_slot *slot)
 {
+	if (slot->flags & KVM_MEM_GUEST_MEMFD)
+		kvm_gmem_unbind(slot);
+
 	kvm_destroy_dirty_bitmap(slot);
 
 	kvm_arch_free_memslot(kvm, slot);
@@ -1606,10 +1609,18 @@ static void kvm_replace_memslot(struct kvm *kvm,
 #define KVM_SET_USER_MEMORY_REGION_V1_FLAGS \
 	(KVM_MEM_LOG_DIRTY_PAGES | KVM_MEM_READONLY)
 
-static int check_memory_region_flags(const struct kvm_userspace_memory_region2 *mem)
+static int check_memory_region_flags(struct kvm *kvm,
+				     const struct kvm_userspace_memory_region2 *mem)
 {
 	u32 valid_flags = KVM_MEM_LOG_DIRTY_PAGES;
 
+	if (kvm_arch_has_private_mem(kvm))
+		valid_flags |= KVM_MEM_GUEST_MEMFD;
+
+	/* Dirty logging private memory is not currently supported. */
+	if (mem->flags & KVM_MEM_GUEST_MEMFD)
+		valid_flags &= ~KVM_MEM_LOG_DIRTY_PAGES;
+
 #ifdef __KVM_HAVE_READONLY_MEM
 	valid_flags |= KVM_MEM_READONLY;
 #endif
@@ -2018,7 +2029,7 @@ int __kvm_set_memory_region(struct kvm *kvm,
 	int as_id, id;
 	int r;
 
-	r = check_memory_region_flags(mem);
+	r = check_memory_region_flags(kvm, mem);
 	if (r)
 		return r;
 
@@ -2037,6 +2048,10 @@ int __kvm_set_memory_region(struct kvm *kvm,
 	     !access_ok((void __user *)(unsigned long)mem->userspace_addr,
 			mem->memory_size))
 		return -EINVAL;
+	if (mem->flags & KVM_MEM_GUEST_MEMFD &&
+	    (mem->guest_memfd_offset & (PAGE_SIZE - 1) ||
+	     mem->guest_memfd_offset + mem->memory_size < mem->guest_memfd_offset))
+		return -EINVAL;
 	if (as_id >= KVM_ADDRESS_SPACE_NUM || id >= KVM_MEM_SLOTS_NUM)
 		return -EINVAL;
 	if (mem->guest_phys_addr + mem->memory_size < mem->guest_phys_addr)
@@ -2075,6 +2090,9 @@ int __kvm_set_memory_region(struct kvm *kvm,
 		if ((kvm->nr_memslot_pages + npages) < kvm->nr_memslot_pages)
 			return -EINVAL;
 	} else { /* Modify an existing slot. */
+		/* Private memslots are immutable, they can only be deleted. */
+		if (mem->flags & KVM_MEM_GUEST_MEMFD)
+			return -EINVAL;
 		if ((mem->userspace_addr != old->userspace_addr) ||
 		    (npages != old->npages) ||
 		    ((mem->flags ^ old->flags) & KVM_MEM_READONLY))
@@ -2103,10 +2121,23 @@ int __kvm_set_memory_region(struct kvm *kvm,
 	new->npages = npages;
 	new->flags = mem->flags;
 	new->userspace_addr = mem->userspace_addr;
+	if (mem->flags & KVM_MEM_GUEST_MEMFD) {
+		r = kvm_gmem_bind(kvm, new, mem->guest_memfd, mem->guest_memfd_offset);
+		if (r)
+			goto out;
+	}
 
 	r = kvm_set_memslot(kvm, old, new, change);
 	if (r)
-		kfree(new);
+		goto out_unbind;
+
+	return 0;
+
+out_unbind:
+	if (mem->flags & KVM_MEM_GUEST_MEMFD)
+		kvm_gmem_unbind(new);
+out:
+	kfree(new);
 	return r;
 }
 EXPORT_SYMBOL_GPL(__kvm_set_memory_region);
@@ -2442,7 +2473,7 @@ out:
 
 static u64 kvm_supported_mem_attributes(struct kvm *kvm)
 {
-	if (!kvm)
+	if (!kvm || kvm_arch_has_private_mem(kvm))
 		return KVM_MEMORY_ATTRIBUTE_PRIVATE;
 
 	return 0;
@@ -4844,6 +4875,10 @@ static int kvm_vm_ioctl_check_extension_generic(struct kvm *kvm, long arg)
 #ifdef CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES
 	case KVM_CAP_MEMORY_ATTRIBUTES:
 		return kvm_supported_mem_attributes(kvm);
+#endif
+#ifdef CONFIG_KVM_PRIVATE_MEM
+	case KVM_CAP_GUEST_MEMFD:
+		return !kvm || kvm_arch_has_private_mem(kvm);
 #endif
 	default:
 		break;
@@ -5277,6 +5312,18 @@ static long kvm_vm_ioctl(struct file *filp,
 	case KVM_GET_STATS_FD:
 		r = kvm_vm_ioctl_get_stats_fd(kvm);
 		break;
+#ifdef CONFIG_KVM_PRIVATE_MEM
+	case KVM_CREATE_GUEST_MEMFD: {
+		struct kvm_create_guest_memfd guest_memfd;
+
+		r = -EFAULT;
+		if (copy_from_user(&guest_memfd, argp, sizeof(guest_memfd)))
+			goto out;
+
+		r = kvm_gmem_create(kvm, &guest_memfd);
+		break;
+	}
+#endif
 	default:
 		r = kvm_arch_vm_ioctl(filp, ioctl, arg);
 	}
@@ -6409,6 +6456,8 @@ int kvm_init(unsigned vcpu_size, unsigned vcpu_align, struct module *module)
 	if (WARN_ON_ONCE(r))
 		goto err_vfio;
 
+	kvm_gmem_init(module);
+
 	/*
 	 * Registration _must_ be the very last thing done, as this exposes
 	 * /dev/kvm to userspace, i.e. all infrastructure must be setup!
diff --git a/virt/kvm/kvm_mm.h b/virt/kvm/kvm_mm.h
index 180f1a09e6ba..ecefc7ec51af 100644
--- a/virt/kvm/kvm_mm.h
+++ b/virt/kvm/kvm_mm.h
@@ -37,4 +37,30 @@ static inline void gfn_to_pfn_cache_invalidate_start(struct kvm *kvm,
 }
 #endif /* HAVE_KVM_PFNCACHE */
 
+#ifdef CONFIG_KVM_PRIVATE_MEM
+void kvm_gmem_init(struct module *module);
+int kvm_gmem_create(struct kvm *kvm, struct kvm_create_guest_memfd *args);
+int kvm_gmem_bind(struct kvm *kvm, struct kvm_memory_slot *slot,
+		  unsigned int fd, loff_t offset);
+void kvm_gmem_unbind(struct kvm_memory_slot *slot);
+#else
+static inline void kvm_gmem_init(struct module *module)
+{
+
+}
+
+static inline int kvm_gmem_bind(struct kvm *kvm,
+					 struct kvm_memory_slot *slot,
+					 unsigned int fd, loff_t offset)
+{
+	WARN_ON_ONCE(1);
+	return -EIO;
+}
+
+static inline void kvm_gmem_unbind(struct kvm_memory_slot *slot)
+{
+	WARN_ON_ONCE(1);
+}
+#endif /* CONFIG_KVM_PRIVATE_MEM */
+
 #endif /* __KVM_MM_H__ */
-- 
cgit v1.2.3


From 8dd2eee9d526c30fccfe75da7ec5365c6476e510 Mon Sep 17 00:00:00 2001
From: Chao Peng <chao.p.peng@linux.intel.com>
Date: Fri, 27 Oct 2023 11:22:02 -0700
Subject: KVM: x86/mmu: Handle page fault for private memory

Add support for resolving page faults on guest private memory for VMs
that differentiate between "shared" and "private" memory.  For such VMs,
KVM_MEM_GUEST_MEMFD memslots can include both fd-based private memory and
hva-based shared memory, and KVM needs to map in the "correct" variant,
i.e. KVM needs to map the gfn shared/private as appropriate based on the
current state of the gfn's KVM_MEMORY_ATTRIBUTE_PRIVATE flag.

For AMD's SEV-SNP and Intel's TDX, the guest effectively gets to request
shared vs. private via a bit in the guest page tables, i.e. what the guest
wants may conflict with the current memory attributes.  To support such
"implicit" conversion requests, exit to user with KVM_EXIT_MEMORY_FAULT
to forward the request to userspace.  Add a new flag for memory faults,
KVM_MEMORY_EXIT_FLAG_PRIVATE, to communicate whether the guest wants to
map memory as shared vs. private.

Like KVM_MEMORY_ATTRIBUTE_PRIVATE, use bit 3 for flagging private memory
so that KVM can use bits 0-2 for capturing RWX behavior if/when userspace
needs such information, e.g. a likely user of KVM_EXIT_MEMORY_FAULT is to
exit on missing mappings when handling guest page fault VM-Exits.  In
that case, userspace will want to know RWX information in order to
correctly/precisely resolve the fault.

Note, private memory *must* be backed by guest_memfd, i.e. shared mappings
always come from the host userspace page tables, and private mappings
always come from a guest_memfd instance.

Co-developed-by: Yu Zhang <yu.c.zhang@linux.intel.com>
Signed-off-by: Yu Zhang <yu.c.zhang@linux.intel.com>
Signed-off-by: Chao Peng <chao.p.peng@linux.intel.com>
Co-developed-by: Sean Christopherson <seanjc@google.com>
Signed-off-by: Sean Christopherson <seanjc@google.com>
Reviewed-by: Fuad Tabba <tabba@google.com>
Tested-by: Fuad Tabba <tabba@google.com>
Message-Id: <20231027182217.3615211-21-seanjc@google.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 Documentation/virt/kvm/api.rst  |   8 +++-
 arch/x86/kvm/mmu/mmu.c          | 101 ++++++++++++++++++++++++++++++++++++++--
 arch/x86/kvm/mmu/mmu_internal.h |   1 +
 include/linux/kvm_host.h        |   8 +++-
 include/uapi/linux/kvm.h        |   1 +
 5 files changed, 110 insertions(+), 9 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst
index 1e61faf02b2a..726c87c35d57 100644
--- a/Documentation/virt/kvm/api.rst
+++ b/Documentation/virt/kvm/api.rst
@@ -6952,6 +6952,7 @@ spec refer, https://github.com/riscv/riscv-sbi-doc.
 
 		/* KVM_EXIT_MEMORY_FAULT */
 		struct {
+  #define KVM_MEMORY_EXIT_FLAG_PRIVATE	(1ULL << 3)
 			__u64 flags;
 			__u64 gpa;
 			__u64 size;
@@ -6960,8 +6961,11 @@ spec refer, https://github.com/riscv/riscv-sbi-doc.
 KVM_EXIT_MEMORY_FAULT indicates the vCPU has encountered a memory fault that
 could not be resolved by KVM.  The 'gpa' and 'size' (in bytes) describe the
 guest physical address range [gpa, gpa + size) of the fault.  The 'flags' field
-describes properties of the faulting access that are likely pertinent.
-Currently, no flags are defined.
+describes properties of the faulting access that are likely pertinent:
+
+ - KVM_MEMORY_EXIT_FLAG_PRIVATE - When set, indicates the memory fault occurred
+   on a private memory access.  When clear, indicates the fault occurred on a
+   shared access.
 
 Note!  KVM_EXIT_MEMORY_FAULT is unique among all KVM exit reasons in that it
 accompanies a return code of '-1', not '0'!  errno will always be set to EFAULT
diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index f5c6b0643645..754a5aaebee5 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -3147,9 +3147,9 @@ out:
 	return level;
 }
 
-int kvm_mmu_max_mapping_level(struct kvm *kvm,
-			      const struct kvm_memory_slot *slot, gfn_t gfn,
-			      int max_level)
+static int __kvm_mmu_max_mapping_level(struct kvm *kvm,
+				       const struct kvm_memory_slot *slot,
+				       gfn_t gfn, int max_level, bool is_private)
 {
 	struct kvm_lpage_info *linfo;
 	int host_level;
@@ -3161,6 +3161,9 @@ int kvm_mmu_max_mapping_level(struct kvm *kvm,
 			break;
 	}
 
+	if (is_private)
+		return max_level;
+
 	if (max_level == PG_LEVEL_4K)
 		return PG_LEVEL_4K;
 
@@ -3168,6 +3171,16 @@ int kvm_mmu_max_mapping_level(struct kvm *kvm,
 	return min(host_level, max_level);
 }
 
+int kvm_mmu_max_mapping_level(struct kvm *kvm,
+			      const struct kvm_memory_slot *slot, gfn_t gfn,
+			      int max_level)
+{
+	bool is_private = kvm_slot_can_be_private(slot) &&
+			  kvm_mem_is_private(kvm, gfn);
+
+	return __kvm_mmu_max_mapping_level(kvm, slot, gfn, max_level, is_private);
+}
+
 void kvm_mmu_hugepage_adjust(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
 {
 	struct kvm_memory_slot *slot = fault->slot;
@@ -3188,8 +3201,9 @@ void kvm_mmu_hugepage_adjust(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault
 	 * Enforce the iTLB multihit workaround after capturing the requested
 	 * level, which will be used to do precise, accurate accounting.
 	 */
-	fault->req_level = kvm_mmu_max_mapping_level(vcpu->kvm, slot,
-						     fault->gfn, fault->max_level);
+	fault->req_level = __kvm_mmu_max_mapping_level(vcpu->kvm, slot,
+						       fault->gfn, fault->max_level,
+						       fault->is_private);
 	if (fault->req_level == PG_LEVEL_4K || fault->huge_page_disallowed)
 		return;
 
@@ -4269,6 +4283,55 @@ void kvm_arch_async_page_ready(struct kvm_vcpu *vcpu, struct kvm_async_pf *work)
 	kvm_mmu_do_page_fault(vcpu, work->cr2_or_gpa, 0, true, NULL);
 }
 
+static inline u8 kvm_max_level_for_order(int order)
+{
+	BUILD_BUG_ON(KVM_MAX_HUGEPAGE_LEVEL > PG_LEVEL_1G);
+
+	KVM_MMU_WARN_ON(order != KVM_HPAGE_GFN_SHIFT(PG_LEVEL_1G) &&
+			order != KVM_HPAGE_GFN_SHIFT(PG_LEVEL_2M) &&
+			order != KVM_HPAGE_GFN_SHIFT(PG_LEVEL_4K));
+
+	if (order >= KVM_HPAGE_GFN_SHIFT(PG_LEVEL_1G))
+		return PG_LEVEL_1G;
+
+	if (order >= KVM_HPAGE_GFN_SHIFT(PG_LEVEL_2M))
+		return PG_LEVEL_2M;
+
+	return PG_LEVEL_4K;
+}
+
+static void kvm_mmu_prepare_memory_fault_exit(struct kvm_vcpu *vcpu,
+					      struct kvm_page_fault *fault)
+{
+	kvm_prepare_memory_fault_exit(vcpu, fault->gfn << PAGE_SHIFT,
+				      PAGE_SIZE, fault->write, fault->exec,
+				      fault->is_private);
+}
+
+static int kvm_faultin_pfn_private(struct kvm_vcpu *vcpu,
+				   struct kvm_page_fault *fault)
+{
+	int max_order, r;
+
+	if (!kvm_slot_can_be_private(fault->slot)) {
+		kvm_mmu_prepare_memory_fault_exit(vcpu, fault);
+		return -EFAULT;
+	}
+
+	r = kvm_gmem_get_pfn(vcpu->kvm, fault->slot, fault->gfn, &fault->pfn,
+			     &max_order);
+	if (r) {
+		kvm_mmu_prepare_memory_fault_exit(vcpu, fault);
+		return r;
+	}
+
+	fault->max_level = min(kvm_max_level_for_order(max_order),
+			       fault->max_level);
+	fault->map_writable = !(fault->slot->flags & KVM_MEM_READONLY);
+
+	return RET_PF_CONTINUE;
+}
+
 static int __kvm_faultin_pfn(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
 {
 	struct kvm_memory_slot *slot = fault->slot;
@@ -4301,6 +4364,14 @@ static int __kvm_faultin_pfn(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault
 			return RET_PF_EMULATE;
 	}
 
+	if (fault->is_private != kvm_mem_is_private(vcpu->kvm, fault->gfn)) {
+		kvm_mmu_prepare_memory_fault_exit(vcpu, fault);
+		return -EFAULT;
+	}
+
+	if (fault->is_private)
+		return kvm_faultin_pfn_private(vcpu, fault);
+
 	async = false;
 	fault->pfn = __gfn_to_pfn_memslot(slot, fault->gfn, false, false, &async,
 					  fault->write, &fault->map_writable,
@@ -7188,6 +7259,26 @@ void kvm_mmu_pre_destroy_vm(struct kvm *kvm)
 }
 
 #ifdef CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES
+bool kvm_arch_pre_set_memory_attributes(struct kvm *kvm,
+					struct kvm_gfn_range *range)
+{
+	/*
+	 * Zap SPTEs even if the slot can't be mapped PRIVATE.  KVM x86 only
+	 * supports KVM_MEMORY_ATTRIBUTE_PRIVATE, and so it *seems* like KVM
+	 * can simply ignore such slots.  But if userspace is making memory
+	 * PRIVATE, then KVM must prevent the guest from accessing the memory
+	 * as shared.  And if userspace is making memory SHARED and this point
+	 * is reached, then at least one page within the range was previously
+	 * PRIVATE, i.e. the slot's possible hugepage ranges are changing.
+	 * Zapping SPTEs in this case ensures KVM will reassess whether or not
+	 * a hugepage can be used for affected ranges.
+	 */
+	if (WARN_ON_ONCE(!kvm_arch_has_private_mem(kvm)))
+		return false;
+
+	return kvm_unmap_gfn_range(kvm, range);
+}
+
 static bool hugepage_test_mixed(struct kvm_memory_slot *slot, gfn_t gfn,
 				int level)
 {
diff --git a/arch/x86/kvm/mmu/mmu_internal.h b/arch/x86/kvm/mmu/mmu_internal.h
index decc1f153669..86c7cb692786 100644
--- a/arch/x86/kvm/mmu/mmu_internal.h
+++ b/arch/x86/kvm/mmu/mmu_internal.h
@@ -201,6 +201,7 @@ struct kvm_page_fault {
 
 	/* Derived from mmu and global state.  */
 	const bool is_tdp;
+	const bool is_private;
 	const bool nx_huge_page_workaround_enabled;
 
 	/*
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index a6de526c0426..67dfd4d79529 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -2357,14 +2357,18 @@ static inline void kvm_account_pgtable_pages(void *virt, int nr)
 #define  KVM_DIRTY_RING_MAX_ENTRIES  65536
 
 static inline void kvm_prepare_memory_fault_exit(struct kvm_vcpu *vcpu,
-						 gpa_t gpa, gpa_t size)
+						 gpa_t gpa, gpa_t size,
+						 bool is_write, bool is_exec,
+						 bool is_private)
 {
 	vcpu->run->exit_reason = KVM_EXIT_MEMORY_FAULT;
 	vcpu->run->memory_fault.gpa = gpa;
 	vcpu->run->memory_fault.size = size;
 
-	/* Flags are not (yet) defined or communicated to userspace. */
+	/* RWX flags are not (yet) defined or communicated to userspace. */
 	vcpu->run->memory_fault.flags = 0;
+	if (is_private)
+		vcpu->run->memory_fault.flags |= KVM_MEMORY_EXIT_FLAG_PRIVATE;
 }
 
 #ifdef CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES
diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
index 2802d10aa88c..8eb10f560c69 100644
--- a/include/uapi/linux/kvm.h
+++ b/include/uapi/linux/kvm.h
@@ -535,6 +535,7 @@ struct kvm_run {
 		} notify;
 		/* KVM_EXIT_MEMORY_FAULT */
 		struct {
+#define KVM_MEMORY_EXIT_FLAG_PRIVATE	(1ULL << 3)
 			__u64 flags;
 			__u64 gpa;
 			__u64 size;
-- 
cgit v1.2.3


From 2333afa17af0f4b6651214ee17cfd5ae5f47787a Mon Sep 17 00:00:00 2001
From: Sean Christopherson <seanjc@google.com>
Date: Fri, 27 Oct 2023 11:22:03 -0700
Subject: KVM: Drop superfluous __KVM_VCPU_MULTIPLE_ADDRESS_SPACE macro

Drop __KVM_VCPU_MULTIPLE_ADDRESS_SPACE and instead check the value of
KVM_ADDRESS_SPACE_NUM.

No functional change intended.

Reviewed-by: Paolo Bonzini <pbonzini@redhat.com>
Signed-off-by: Sean Christopherson <seanjc@google.com>
Reviewed-by: Fuad Tabba <tabba@google.com>
Tested-by: Fuad Tabba <tabba@google.com>
Message-Id: <20231027182217.3615211-22-seanjc@google.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/include/asm/kvm_host.h | 1 -
 include/linux/kvm_host.h        | 2 +-
 2 files changed, 1 insertion(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index fa0d42202405..061eec231299 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -2136,7 +2136,6 @@ enum {
 #define HF_SMM_MASK		(1 << 1)
 #define HF_SMM_INSIDE_NMI_MASK	(1 << 2)
 
-# define __KVM_VCPU_MULTIPLE_ADDRESS_SPACE
 # define KVM_ADDRESS_SPACE_NUM 2
 # define kvm_arch_vcpu_memslots_id(vcpu) ((vcpu)->arch.hflags & HF_SMM_MASK ? 1 : 0)
 # define kvm_memslots_for_spte_role(kvm, role) __kvm_memslots(kvm, (role).smm)
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 67dfd4d79529..db423ea9e3a4 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -690,7 +690,7 @@ bool kvm_arch_irqchip_in_kernel(struct kvm *kvm);
 #define KVM_MEM_SLOTS_NUM SHRT_MAX
 #define KVM_USER_MEM_SLOTS (KVM_MEM_SLOTS_NUM - KVM_INTERNAL_MEM_SLOTS)
 
-#ifndef __KVM_VCPU_MULTIPLE_ADDRESS_SPACE
+#if KVM_ADDRESS_SPACE_NUM == 1
 static inline int kvm_arch_vcpu_memslots_id(struct kvm_vcpu *vcpu)
 {
 	return 0;
-- 
cgit v1.2.3


From eed52e434bc33603ddb0af62b6c4ef818948489d Mon Sep 17 00:00:00 2001
From: Sean Christopherson <seanjc@google.com>
Date: Fri, 27 Oct 2023 11:22:04 -0700
Subject: KVM: Allow arch code to track number of memslot address spaces per VM

Let x86 track the number of address spaces on a per-VM basis so that KVM
can disallow SMM memslots for confidential VMs.  Confidentials VMs are
fundamentally incompatible with emulating SMM, which as the name suggests
requires being able to read and write guest memory and register state.

Disallowing SMM will simplify support for guest private memory, as KVM
will not need to worry about tracking memory attributes for multiple
address spaces (SMM is the only "non-default" address space across all
architectures).

Signed-off-by: Sean Christopherson <seanjc@google.com>
Reviewed-by: Paolo Bonzini <pbonzini@redhat.com>
Reviewed-by: Fuad Tabba <tabba@google.com>
Tested-by: Fuad Tabba <tabba@google.com>
Message-Id: <20231027182217.3615211-23-seanjc@google.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/powerpc/kvm/book3s_hv.c    |  2 +-
 arch/x86/include/asm/kvm_host.h |  8 +++++++-
 arch/x86/kvm/debugfs.c          |  2 +-
 arch/x86/kvm/mmu/mmu.c          |  6 +++---
 arch/x86/kvm/x86.c              |  2 +-
 include/linux/kvm_host.h        | 17 +++++++++++------
 virt/kvm/dirty_ring.c           |  2 +-
 virt/kvm/kvm_main.c             | 26 ++++++++++++++------------
 8 files changed, 39 insertions(+), 26 deletions(-)

(limited to 'include/linux')

diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
index 130bafdb1430..9b0eaa17275a 100644
--- a/arch/powerpc/kvm/book3s_hv.c
+++ b/arch/powerpc/kvm/book3s_hv.c
@@ -6084,7 +6084,7 @@ static int kvmhv_svm_off(struct kvm *kvm)
 	}
 
 	srcu_idx = srcu_read_lock(&kvm->srcu);
-	for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) {
+	for (i = 0; i < kvm_arch_nr_memslot_as_ids(kvm); i++) {
 		struct kvm_memory_slot *memslot;
 		struct kvm_memslots *slots = __kvm_memslots(kvm, i);
 		int bkt;
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 061eec231299..75ab0da06e64 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -2136,9 +2136,15 @@ enum {
 #define HF_SMM_MASK		(1 << 1)
 #define HF_SMM_INSIDE_NMI_MASK	(1 << 2)
 
-# define KVM_ADDRESS_SPACE_NUM 2
+# define KVM_MAX_NR_ADDRESS_SPACES	2
 # define kvm_arch_vcpu_memslots_id(vcpu) ((vcpu)->arch.hflags & HF_SMM_MASK ? 1 : 0)
 # define kvm_memslots_for_spte_role(kvm, role) __kvm_memslots(kvm, (role).smm)
+
+static inline int kvm_arch_nr_memslot_as_ids(struct kvm *kvm)
+{
+	return KVM_MAX_NR_ADDRESS_SPACES;
+}
+
 #else
 # define kvm_memslots_for_spte_role(kvm, role) __kvm_memslots(kvm, 0)
 #endif
diff --git a/arch/x86/kvm/debugfs.c b/arch/x86/kvm/debugfs.c
index ee8c4c3496ed..42026b3f3ff3 100644
--- a/arch/x86/kvm/debugfs.c
+++ b/arch/x86/kvm/debugfs.c
@@ -111,7 +111,7 @@ static int kvm_mmu_rmaps_stat_show(struct seq_file *m, void *v)
 	mutex_lock(&kvm->slots_lock);
 	write_lock(&kvm->mmu_lock);
 
-	for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) {
+	for (i = 0; i < kvm_arch_nr_memslot_as_ids(kvm); i++) {
 		int bkt;
 
 		slots = __kvm_memslots(kvm, i);
diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index 754a5aaebee5..4de7670d5976 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -3763,7 +3763,7 @@ static int mmu_first_shadow_root_alloc(struct kvm *kvm)
 	    kvm_page_track_write_tracking_enabled(kvm))
 		goto out_success;
 
-	for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) {
+	for (i = 0; i < kvm_arch_nr_memslot_as_ids(kvm); i++) {
 		slots = __kvm_memslots(kvm, i);
 		kvm_for_each_memslot(slot, bkt, slots) {
 			/*
@@ -6309,7 +6309,7 @@ static bool kvm_rmap_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_e
 	if (!kvm_memslots_have_rmaps(kvm))
 		return flush;
 
-	for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) {
+	for (i = 0; i < kvm_arch_nr_memslot_as_ids(kvm); i++) {
 		slots = __kvm_memslots(kvm, i);
 
 		kvm_for_each_memslot_in_gfn_range(&iter, slots, gfn_start, gfn_end) {
@@ -6806,7 +6806,7 @@ void kvm_mmu_invalidate_mmio_sptes(struct kvm *kvm, u64 gen)
 	 * modifier prior to checking for a wrap of the MMIO generation so
 	 * that a wrap in any address space is detected.
 	 */
-	gen &= ~((u64)KVM_ADDRESS_SPACE_NUM - 1);
+	gen &= ~((u64)kvm_arch_nr_memslot_as_ids(kvm) - 1);
 
 	/*
 	 * The very rare case: if the MMIO generation number has wrapped,
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index e1aad0c81f6f..f521c97f5c64 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -12577,7 +12577,7 @@ void __user * __x86_set_memory_region(struct kvm *kvm, int id, gpa_t gpa,
 		hva = slot->userspace_addr;
 	}
 
-	for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) {
+	for (i = 0; i < kvm_arch_nr_memslot_as_ids(kvm); i++) {
 		struct kvm_userspace_memory_region2 m;
 
 		m.slot = id | (i << 16);
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index db423ea9e3a4..3ebc6912c54a 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -80,8 +80,8 @@
 /* Two fragments for cross MMIO pages. */
 #define KVM_MAX_MMIO_FRAGMENTS	2
 
-#ifndef KVM_ADDRESS_SPACE_NUM
-#define KVM_ADDRESS_SPACE_NUM	1
+#ifndef KVM_MAX_NR_ADDRESS_SPACES
+#define KVM_MAX_NR_ADDRESS_SPACES	1
 #endif
 
 /*
@@ -690,7 +690,12 @@ bool kvm_arch_irqchip_in_kernel(struct kvm *kvm);
 #define KVM_MEM_SLOTS_NUM SHRT_MAX
 #define KVM_USER_MEM_SLOTS (KVM_MEM_SLOTS_NUM - KVM_INTERNAL_MEM_SLOTS)
 
-#if KVM_ADDRESS_SPACE_NUM == 1
+#if KVM_MAX_NR_ADDRESS_SPACES == 1
+static inline int kvm_arch_nr_memslot_as_ids(struct kvm *kvm)
+{
+	return KVM_MAX_NR_ADDRESS_SPACES;
+}
+
 static inline int kvm_arch_vcpu_memslots_id(struct kvm_vcpu *vcpu)
 {
 	return 0;
@@ -745,9 +750,9 @@ struct kvm {
 	struct mm_struct *mm; /* userspace tied to this vm */
 	unsigned long nr_memslot_pages;
 	/* The two memslot sets - active and inactive (per address space) */
-	struct kvm_memslots __memslots[KVM_ADDRESS_SPACE_NUM][2];
+	struct kvm_memslots __memslots[KVM_MAX_NR_ADDRESS_SPACES][2];
 	/* The current active memslot set for each address space */
-	struct kvm_memslots __rcu *memslots[KVM_ADDRESS_SPACE_NUM];
+	struct kvm_memslots __rcu *memslots[KVM_MAX_NR_ADDRESS_SPACES];
 	struct xarray vcpu_array;
 	/*
 	 * Protected by slots_lock, but can be read outside if an
@@ -1017,7 +1022,7 @@ void kvm_put_kvm_no_destroy(struct kvm *kvm);
 
 static inline struct kvm_memslots *__kvm_memslots(struct kvm *kvm, int as_id)
 {
-	as_id = array_index_nospec(as_id, KVM_ADDRESS_SPACE_NUM);
+	as_id = array_index_nospec(as_id, KVM_MAX_NR_ADDRESS_SPACES);
 	return srcu_dereference_check(kvm->memslots[as_id], &kvm->srcu,
 			lockdep_is_held(&kvm->slots_lock) ||
 			!refcount_read(&kvm->users_count));
diff --git a/virt/kvm/dirty_ring.c b/virt/kvm/dirty_ring.c
index c1cd7dfe4a90..86d267db87bb 100644
--- a/virt/kvm/dirty_ring.c
+++ b/virt/kvm/dirty_ring.c
@@ -58,7 +58,7 @@ static void kvm_reset_dirty_gfn(struct kvm *kvm, u32 slot, u64 offset, u64 mask)
 	as_id = slot >> 16;
 	id = (u16)slot;
 
-	if (as_id >= KVM_ADDRESS_SPACE_NUM || id >= KVM_USER_MEM_SLOTS)
+	if (as_id >= kvm_arch_nr_memslot_as_ids(kvm) || id >= KVM_USER_MEM_SLOTS)
 		return;
 
 	memslot = id_to_memslot(__kvm_memslots(kvm, as_id), id);
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 8f46d757a2c5..8758cb799e18 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -615,7 +615,7 @@ static __always_inline kvm_mn_ret_t __kvm_handle_hva_range(struct kvm *kvm,
 
 	idx = srcu_read_lock(&kvm->srcu);
 
-	for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) {
+	for (i = 0; i < kvm_arch_nr_memslot_as_ids(kvm); i++) {
 		struct interval_tree_node *node;
 
 		slots = __kvm_memslots(kvm, i);
@@ -1241,7 +1241,7 @@ static struct kvm *kvm_create_vm(unsigned long type, const char *fdname)
 		goto out_err_no_irq_srcu;
 
 	refcount_set(&kvm->users_count, 1);
-	for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) {
+	for (i = 0; i < kvm_arch_nr_memslot_as_ids(kvm); i++) {
 		for (j = 0; j < 2; j++) {
 			slots = &kvm->__memslots[i][j];
 
@@ -1391,7 +1391,7 @@ static void kvm_destroy_vm(struct kvm *kvm)
 #endif
 	kvm_arch_destroy_vm(kvm);
 	kvm_destroy_devices(kvm);
-	for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) {
+	for (i = 0; i < kvm_arch_nr_memslot_as_ids(kvm); i++) {
 		kvm_free_memslots(kvm, &kvm->__memslots[i][0]);
 		kvm_free_memslots(kvm, &kvm->__memslots[i][1]);
 	}
@@ -1682,7 +1682,7 @@ static void kvm_swap_active_memslots(struct kvm *kvm, int as_id)
 	 * space 0 will use generations 0, 2, 4, ... while address space 1 will
 	 * use generations 1, 3, 5, ...
 	 */
-	gen += KVM_ADDRESS_SPACE_NUM;
+	gen += kvm_arch_nr_memslot_as_ids(kvm);
 
 	kvm_arch_memslots_updated(kvm, gen);
 
@@ -2052,7 +2052,7 @@ int __kvm_set_memory_region(struct kvm *kvm,
 	    (mem->guest_memfd_offset & (PAGE_SIZE - 1) ||
 	     mem->guest_memfd_offset + mem->memory_size < mem->guest_memfd_offset))
 		return -EINVAL;
-	if (as_id >= KVM_ADDRESS_SPACE_NUM || id >= KVM_MEM_SLOTS_NUM)
+	if (as_id >= kvm_arch_nr_memslot_as_ids(kvm) || id >= KVM_MEM_SLOTS_NUM)
 		return -EINVAL;
 	if (mem->guest_phys_addr + mem->memory_size < mem->guest_phys_addr)
 		return -EINVAL;
@@ -2188,7 +2188,7 @@ int kvm_get_dirty_log(struct kvm *kvm, struct kvm_dirty_log *log,
 
 	as_id = log->slot >> 16;
 	id = (u16)log->slot;
-	if (as_id >= KVM_ADDRESS_SPACE_NUM || id >= KVM_USER_MEM_SLOTS)
+	if (as_id >= kvm_arch_nr_memslot_as_ids(kvm) || id >= KVM_USER_MEM_SLOTS)
 		return -EINVAL;
 
 	slots = __kvm_memslots(kvm, as_id);
@@ -2250,7 +2250,7 @@ static int kvm_get_dirty_log_protect(struct kvm *kvm, struct kvm_dirty_log *log)
 
 	as_id = log->slot >> 16;
 	id = (u16)log->slot;
-	if (as_id >= KVM_ADDRESS_SPACE_NUM || id >= KVM_USER_MEM_SLOTS)
+	if (as_id >= kvm_arch_nr_memslot_as_ids(kvm) || id >= KVM_USER_MEM_SLOTS)
 		return -EINVAL;
 
 	slots = __kvm_memslots(kvm, as_id);
@@ -2362,7 +2362,7 @@ static int kvm_clear_dirty_log_protect(struct kvm *kvm,
 
 	as_id = log->slot >> 16;
 	id = (u16)log->slot;
-	if (as_id >= KVM_ADDRESS_SPACE_NUM || id >= KVM_USER_MEM_SLOTS)
+	if (as_id >= kvm_arch_nr_memslot_as_ids(kvm) || id >= KVM_USER_MEM_SLOTS)
 		return -EINVAL;
 
 	if (log->first_page & 63)
@@ -2493,7 +2493,7 @@ static __always_inline void kvm_handle_gfn_range(struct kvm *kvm,
 	gfn_range.arg = range->arg;
 	gfn_range.may_block = range->may_block;
 
-	for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) {
+	for (i = 0; i < kvm_arch_nr_memslot_as_ids(kvm); i++) {
 		slots = __kvm_memslots(kvm, i);
 
 		kvm_for_each_memslot_in_gfn_range(&iter, slots, range->start, range->end) {
@@ -4848,9 +4848,11 @@ static int kvm_vm_ioctl_check_extension_generic(struct kvm *kvm, long arg)
 	case KVM_CAP_IRQ_ROUTING:
 		return KVM_MAX_IRQ_ROUTES;
 #endif
-#if KVM_ADDRESS_SPACE_NUM > 1
+#if KVM_MAX_NR_ADDRESS_SPACES > 1
 	case KVM_CAP_MULTI_ADDRESS_SPACE:
-		return KVM_ADDRESS_SPACE_NUM;
+		if (kvm)
+			return kvm_arch_nr_memslot_as_ids(kvm);
+		return KVM_MAX_NR_ADDRESS_SPACES;
 #endif
 	case KVM_CAP_NR_MEMSLOTS:
 		return KVM_USER_MEM_SLOTS;
@@ -4958,7 +4960,7 @@ bool kvm_are_all_memslots_empty(struct kvm *kvm)
 
 	lockdep_assert_held(&kvm->slots_lock);
 
-	for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) {
+	for (i = 0; i < kvm_arch_nr_memslot_as_ids(kvm); i++) {
 		if (!kvm_memslots_empty(__kvm_memslots(kvm, i)))
 			return false;
 	}
-- 
cgit v1.2.3


From 2e122362d25e1b977aa5c5c88c03956be4228e02 Mon Sep 17 00:00:00 2001
From: Michał Winiarski <michal.winiarski@intel.com>
Date: Tue, 24 Oct 2023 13:07:10 +0200
Subject: iosys-map: Rename locals used inside macros
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Widely used variable names can be used by macro users, potentially
leading to name collisions.
Suffix locals used inside the macros with an underscore, to reduce the
collision potential.

Suggested-by: Lucas De Marchi <lucas.demarchi@intel.com>
Signed-off-by: Michał Winiarski <michal.winiarski@intel.com>
Reviewed-by: Lucas De Marchi <lucas.demarchi@intel.com>
Signed-off-by: Lucas De Marchi <lucas.demarchi@intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20231024110710.3039807-1-michal.winiarski@intel.com
---
 include/linux/iosys-map.h | 44 ++++++++++++++++++++++----------------------
 1 file changed, 22 insertions(+), 22 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/iosys-map.h b/include/linux/iosys-map.h
index 1b06d074ade0..e3649a6563dd 100644
--- a/include/linux/iosys-map.h
+++ b/include/linux/iosys-map.h
@@ -168,9 +168,9 @@ struct iosys_map {
  * about the use of uninitialized variable.
  */
 #define IOSYS_MAP_INIT_OFFSET(map_, offset_) ({				\
-	struct iosys_map copy = *map_;					\
-	iosys_map_incr(&copy, offset_);					\
-	copy;								\
+	struct iosys_map copy_ = *map_;					\
+	iosys_map_incr(&copy_, offset_);				\
+	copy_;								\
 })
 
 /**
@@ -391,14 +391,14 @@ static inline void iosys_map_memset(struct iosys_map *dst, size_t offset,
  * Returns:
  * The value read from the mapping.
  */
-#define iosys_map_rd(map__, offset__, type__) ({				\
-	type__ val;								\
-	if ((map__)->is_iomem) {						\
-		__iosys_map_rd_io(val, (map__)->vaddr_iomem + (offset__), type__);\
-	} else {								\
-		__iosys_map_rd_sys(val, (map__)->vaddr + (offset__), type__);	\
-	}									\
-	val;									\
+#define iosys_map_rd(map__, offset__, type__) ({					\
+	type__ val_;									\
+	if ((map__)->is_iomem) {							\
+		__iosys_map_rd_io(val_, (map__)->vaddr_iomem + (offset__), type__);	\
+	} else {									\
+		__iosys_map_rd_sys(val_, (map__)->vaddr + (offset__), type__);		\
+	}										\
+	val_;										\
 })
 
 /**
@@ -413,13 +413,13 @@ static inline void iosys_map_memset(struct iosys_map *dst, size_t offset,
  * or if pointer may be unaligned (and problematic for the architecture
  * supported), use iosys_map_memcpy_to()
  */
-#define iosys_map_wr(map__, offset__, type__, val__) ({				\
-	type__ val = (val__);							\
-	if ((map__)->is_iomem) {						\
-		__iosys_map_wr_io(val, (map__)->vaddr_iomem + (offset__), type__);\
-	} else {								\
-		__iosys_map_wr_sys(val, (map__)->vaddr + (offset__), type__);	\
-	}									\
+#define iosys_map_wr(map__, offset__, type__, val__) ({					\
+	type__ val_ = (val__);								\
+	if ((map__)->is_iomem) {							\
+		__iosys_map_wr_io(val_, (map__)->vaddr_iomem + (offset__), type__);	\
+	} else {									\
+		__iosys_map_wr_sys(val_, (map__)->vaddr + (offset__), type__);		\
+	}										\
 })
 
 /**
@@ -485,9 +485,9 @@ static inline void iosys_map_memset(struct iosys_map *dst, size_t offset,
  * The value read from the mapping.
  */
 #define iosys_map_rd_field(map__, struct_offset__, struct_type__, field__) ({	\
-	struct_type__ *s;							\
+	struct_type__ *s_;							\
 	iosys_map_rd(map__, struct_offset__ + offsetof(struct_type__, field__),	\
-		     typeof(s->field__));					\
+		     typeof(s_->field__));					\
 })
 
 /**
@@ -508,9 +508,9 @@ static inline void iosys_map_memset(struct iosys_map *dst, size_t offset,
  * usage and memory layout.
  */
 #define iosys_map_wr_field(map__, struct_offset__, struct_type__, field__, val__) ({	\
-	struct_type__ *s;								\
+	struct_type__ *s_;								\
 	iosys_map_wr(map__, struct_offset__ + offsetof(struct_type__, field__),		\
-		     typeof(s->field__), val__);					\
+		     typeof(s_->field__), val__);					\
 })
 
 #endif /* __IOSYS_MAP_H__ */
-- 
cgit v1.2.3


From 889c58b3155ff4c8e8671c95daef63d6fabbb6b1 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Fri, 9 Jun 2023 12:34:46 +0200
Subject: perf/core: Fix cpuctx refcounting

Audit of the refcounting turned up that perf_pmu_migrate_context()
fails to migrate the ctx refcount.

Fixes: bd2756811766 ("perf: Rewrite core context handling")
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Link: https://lkml.kernel.org/r/20230612093539.085862001@infradead.org
Cc: <stable@vger.kernel.org>
---
 include/linux/perf_event.h | 13 ++++++++-----
 kernel/events/core.c       | 17 +++++++++++++++++
 2 files changed, 25 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index afb028c54f33..5547ba68e6e4 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -843,11 +843,11 @@ struct perf_event {
 };
 
 /*
- *           ,-----------------------[1:n]----------------------.
- *           V                                                  V
- * perf_event_context <-[1:n]-> perf_event_pmu_context <--- perf_event
- *           ^                      ^     |                     |
- *           `--------[1:n]---------'     `-[n:1]-> pmu <-[1:n]-'
+ *           ,-----------------------[1:n]------------------------.
+ *           V                                                    V
+ * perf_event_context <-[1:n]-> perf_event_pmu_context <-[1:n]- perf_event
+ *                                        |                       |
+ *                                        `--[n:1]-> pmu <-[1:n]--'
  *
  *
  * struct perf_event_pmu_context  lifetime is refcount based and RCU freed
@@ -865,6 +865,9 @@ struct perf_event {
  * ctx->mutex pinning the configuration. Since we hold a reference on
  * group_leader (through the filedesc) it can't go away, therefore it's
  * associated pmu_ctx must exist and cannot change due to ctx->mutex.
+ *
+ * perf_event holds a refcount on perf_event_context
+ * perf_event holds a refcount on perf_event_pmu_context
  */
 struct perf_event_pmu_context {
 	struct pmu			*pmu;
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 683dc086ef10..b704d83a28b2 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -4828,6 +4828,11 @@ find_get_pmu_context(struct pmu *pmu, struct perf_event_context *ctx,
 	void *task_ctx_data = NULL;
 
 	if (!ctx->task) {
+		/*
+		 * perf_pmu_migrate_context() / __perf_pmu_install_event()
+		 * relies on the fact that find_get_pmu_context() cannot fail
+		 * for CPU contexts.
+		 */
 		struct perf_cpu_pmu_context *cpc;
 
 		cpc = per_cpu_ptr(pmu->cpu_pmu_context, event->cpu);
@@ -12889,6 +12894,9 @@ static void __perf_pmu_install_event(struct pmu *pmu,
 				     int cpu, struct perf_event *event)
 {
 	struct perf_event_pmu_context *epc;
+	struct perf_event_context *old_ctx = event->ctx;
+
+	get_ctx(ctx); /* normally find_get_context() */
 
 	event->cpu = cpu;
 	epc = find_get_pmu_context(pmu, ctx, event);
@@ -12897,6 +12905,11 @@ static void __perf_pmu_install_event(struct pmu *pmu,
 	if (event->state >= PERF_EVENT_STATE_OFF)
 		event->state = PERF_EVENT_STATE_INACTIVE;
 	perf_install_in_context(ctx, event, cpu);
+
+	/*
+	 * Now that event->ctx is updated and visible, put the old ctx.
+	 */
+	put_ctx(old_ctx);
 }
 
 static void __perf_pmu_install(struct perf_event_context *ctx,
@@ -12935,6 +12948,10 @@ void perf_pmu_migrate_context(struct pmu *pmu, int src_cpu, int dst_cpu)
 	struct perf_event_context *src_ctx, *dst_ctx;
 	LIST_HEAD(events);
 
+	/*
+	 * Since per-cpu context is persistent, no need to grab an extra
+	 * reference.
+	 */
 	src_ctx = &per_cpu_ptr(&perf_cpu_context, src_cpu)->ctx;
 	dst_ctx = &per_cpu_ptr(&perf_cpu_context, dst_cpu)->ctx;
 
-- 
cgit v1.2.3


From 84db47ca7146d7bd00eb5cf2b93989a971c84650 Mon Sep 17 00:00:00 2001
From: Raghavendra K T <raghavendra.kt@amd.com>
Date: Fri, 20 Oct 2023 21:27:46 +0530
Subject: sched/numa: Fix mm numa_scan_seq based unconditional scan

Since commit fc137c0ddab2 ("sched/numa: enhance vma scanning logic")

NUMA Balancing allows updating PTEs to trap NUMA hinting faults if the
task had previously accessed VMA. However unconditional scan of VMAs are
allowed during initial phase of VMA creation until process's
mm numa_scan_seq reaches 2 even though current task had not accessed VMA.

Rationale:
 - Without initial scan subsequent PTE update may never happen.
 - Give fair opportunity to all the VMAs to be scanned and subsequently
understand the access pattern of all the VMAs.

But it has a corner case where, if a VMA is created after some time,
process's mm numa_scan_seq could be already greater than 2.

For e.g., values of mm numa_scan_seq when VMAs are created by running
mmtest autonuma benchmark briefly looks like:
start_seq=0 : 459
start_seq=2 : 138
start_seq=3 : 144
start_seq=4 : 8
start_seq=8 : 1
start_seq=9 : 1
This results in no unconditional PTE updates for those VMAs created after
some time.

Fix:
 - Note down the initial value of mm numa_scan_seq in per VMA start_seq.
 - Allow unconditional scan till start_seq + 2.

Result:
SUT: AMD EPYC Milan with 2 NUMA nodes 256 cpus.
base kernel: upstream 6.6-rc6 with Mels patches [1] applied.

kernbench
==========		base                  patched %gain
Amean    elsp-128      165.09 ( 0.00%)      164.78 *   0.19%*

Duration User       41404.28    41375.08
Duration System      9862.22     9768.48
Duration Elapsed      519.87      518.72

Ops NUMA PTE updates           1041416.00      831536.00
Ops NUMA hint faults            263296.00      220966.00
Ops NUMA pages migrated         258021.00      212769.00
Ops AutoNUMA cost                 1328.67        1114.69

autonumabench

NUMA01_THREADLOCAL
==================
Amean  elsp-NUMA01_THREADLOCAL   81.79 (0.00%)  67.74 *  17.18%*

Duration User       54832.73    47379.67
Duration System        75.00      185.75
Duration Elapsed      576.72      476.09

Ops NUMA PTE updates                  394429.00    11121044.00
Ops NUMA hint faults                    1001.00     8906404.00
Ops NUMA pages migrated                  288.00     2998694.00
Ops AutoNUMA cost                          7.77       44666.84

Signed-off-by: Raghavendra K T <raghavendra.kt@amd.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Acked-by: Mel Gorman <mgorman@suse.de>
Link: https://lore.kernel.org/r/2ea7cbce80ac7c62e90cbfb9653a7972f902439f.1697816692.git.raghavendra.kt@amd.com
---
 include/linux/mm_types.h | 3 +++
 kernel/sched/fair.c      | 4 +++-
 2 files changed, 6 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 957ce38768b2..950df415d7de 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -600,6 +600,9 @@ struct vma_numab_state {
 	 */
 	unsigned long pids_active[2];
 
+	/* MM scan sequence ID when scan first started after VMA creation */
+	int start_scan_seq;
+
 	/*
 	 * MM scan sequence ID when the VMA was last completely scanned.
 	 * A VMA is not eligible for scanning if prev_scan_seq == numa_scan_seq
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index d7a3c63a2171..44b5262b6657 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -3164,7 +3164,7 @@ static bool vma_is_accessed(struct mm_struct *mm, struct vm_area_struct *vma)
 	 * This is also done to avoid any side effect of task scanning
 	 * amplifying the unfairness of disjoint set of VMAs' access.
 	 */
-	if (READ_ONCE(current->mm->numa_scan_seq) < 2)
+	if ((READ_ONCE(current->mm->numa_scan_seq) - vma->numab_state->start_scan_seq) < 2)
 		return true;
 
 	pids = vma->numab_state->pids_active[0] | vma->numab_state->pids_active[1];
@@ -3307,6 +3307,8 @@ retry_pids:
 			if (!vma->numab_state)
 				continue;
 
+			vma->numab_state->start_scan_seq = mm->numa_scan_seq;
+
 			vma->numab_state->next_scan = now +
 				msecs_to_jiffies(sysctl_numa_balancing_scan_delay);
 
-- 
cgit v1.2.3


From 2227a957e1d5b1941be4e4207879ec74f4bb37f8 Mon Sep 17 00:00:00 2001
From: Abel Wu <wuyun.abel@bytedance.com>
Date: Wed, 15 Nov 2023 11:36:45 +0800
Subject: sched/eevdf: Sort the rbtree by virtual deadline

Sort the task timeline by virtual deadline and keep the min_vruntime
in the augmented tree, so we can avoid doubling the worst case cost
and make full use of the cached leftmost node to enable O(1) fastpath
picking in next patch.

Signed-off-by: Abel Wu <wuyun.abel@bytedance.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lkml.kernel.org/r/20231115033647.80785-3-wuyun.abel@bytedance.com
---
 include/linux/sched.h |   2 +-
 kernel/sched/debug.c  |  11 +++-
 kernel/sched/fair.c   | 168 ++++++++++++++++++++------------------------------
 kernel/sched/sched.h  |   1 +
 4 files changed, 77 insertions(+), 105 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 292c31697248..cd56d4018527 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -553,7 +553,7 @@ struct sched_entity {
 	struct load_weight		load;
 	struct rb_node			run_node;
 	u64				deadline;
-	u64				min_deadline;
+	u64				min_vruntime;
 
 	struct list_head		group_node;
 	unsigned int			on_rq;
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index 4580a450700e..168eecc209b4 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -628,8 +628,8 @@ static void print_rq(struct seq_file *m, struct rq *rq, int rq_cpu)
 
 void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
 {
-	s64 left_vruntime = -1, min_vruntime, right_vruntime = -1, spread;
-	struct sched_entity *last, *first;
+	s64 left_vruntime = -1, min_vruntime, right_vruntime = -1, left_deadline = -1, spread;
+	struct sched_entity *last, *first, *root;
 	struct rq *rq = cpu_rq(cpu);
 	unsigned long flags;
 
@@ -644,15 +644,20 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
 			SPLIT_NS(cfs_rq->exec_clock));
 
 	raw_spin_rq_lock_irqsave(rq, flags);
+	root = __pick_root_entity(cfs_rq);
+	if (root)
+		left_vruntime = root->min_vruntime;
 	first = __pick_first_entity(cfs_rq);
 	if (first)
-		left_vruntime = first->vruntime;
+		left_deadline = first->deadline;
 	last = __pick_last_entity(cfs_rq);
 	if (last)
 		right_vruntime = last->vruntime;
 	min_vruntime = cfs_rq->min_vruntime;
 	raw_spin_rq_unlock_irqrestore(rq, flags);
 
+	SEQ_printf(m, "  .%-30s: %Ld.%06ld\n", "left_deadline",
+			SPLIT_NS(left_deadline));
 	SEQ_printf(m, "  .%-30s: %Ld.%06ld\n", "left_vruntime",
 			SPLIT_NS(left_vruntime));
 	SEQ_printf(m, "  .%-30s: %Ld.%06ld\n", "min_vruntime",
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 44b5262b6657..31bca05c3612 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -551,7 +551,11 @@ static inline u64 min_vruntime(u64 min_vruntime, u64 vruntime)
 static inline bool entity_before(const struct sched_entity *a,
 				 const struct sched_entity *b)
 {
-	return (s64)(a->vruntime - b->vruntime) < 0;
+	/*
+	 * Tiebreak on vruntime seems unnecessary since it can
+	 * hardly happen.
+	 */
+	return (s64)(a->deadline - b->deadline) < 0;
 }
 
 static inline s64 entity_key(struct cfs_rq *cfs_rq, struct sched_entity *se)
@@ -720,7 +724,7 @@ static void update_entity_lag(struct cfs_rq *cfs_rq, struct sched_entity *se)
  * Note: using 'avg_vruntime() > se->vruntime' is inacurate due
  *       to the loss in precision caused by the division.
  */
-int entity_eligible(struct cfs_rq *cfs_rq, struct sched_entity *se)
+static int vruntime_eligible(struct cfs_rq *cfs_rq, u64 vruntime)
 {
 	struct sched_entity *curr = cfs_rq->curr;
 	s64 avg = cfs_rq->avg_vruntime;
@@ -733,7 +737,12 @@ int entity_eligible(struct cfs_rq *cfs_rq, struct sched_entity *se)
 		load += weight;
 	}
 
-	return avg >= entity_key(cfs_rq, se) * load;
+	return avg >= (s64)(vruntime - cfs_rq->min_vruntime) * load;
+}
+
+int entity_eligible(struct cfs_rq *cfs_rq, struct sched_entity *se)
+{
+	return vruntime_eligible(cfs_rq, se->vruntime);
 }
 
 static u64 __update_min_vruntime(struct cfs_rq *cfs_rq, u64 vruntime)
@@ -752,9 +761,8 @@ static u64 __update_min_vruntime(struct cfs_rq *cfs_rq, u64 vruntime)
 
 static void update_min_vruntime(struct cfs_rq *cfs_rq)
 {
-	struct sched_entity *se = __pick_first_entity(cfs_rq);
+	struct sched_entity *se = __pick_root_entity(cfs_rq);
 	struct sched_entity *curr = cfs_rq->curr;
-
 	u64 vruntime = cfs_rq->min_vruntime;
 
 	if (curr) {
@@ -766,9 +774,9 @@ static void update_min_vruntime(struct cfs_rq *cfs_rq)
 
 	if (se) {
 		if (!curr)
-			vruntime = se->vruntime;
+			vruntime = se->min_vruntime;
 		else
-			vruntime = min_vruntime(vruntime, se->vruntime);
+			vruntime = min_vruntime(vruntime, se->min_vruntime);
 	}
 
 	/* ensure we never gain time by being placed backwards. */
@@ -781,34 +789,34 @@ static inline bool __entity_less(struct rb_node *a, const struct rb_node *b)
 	return entity_before(__node_2_se(a), __node_2_se(b));
 }
 
-#define deadline_gt(field, lse, rse) ({ (s64)((lse)->field - (rse)->field) > 0; })
+#define vruntime_gt(field, lse, rse) ({ (s64)((lse)->field - (rse)->field) > 0; })
 
-static inline void __update_min_deadline(struct sched_entity *se, struct rb_node *node)
+static inline void __min_vruntime_update(struct sched_entity *se, struct rb_node *node)
 {
 	if (node) {
 		struct sched_entity *rse = __node_2_se(node);
-		if (deadline_gt(min_deadline, se, rse))
-			se->min_deadline = rse->min_deadline;
+		if (vruntime_gt(min_vruntime, se, rse))
+			se->min_vruntime = rse->min_vruntime;
 	}
 }
 
 /*
- * se->min_deadline = min(se->deadline, left->min_deadline, right->min_deadline)
+ * se->min_vruntime = min(se->vruntime, {left,right}->min_vruntime)
  */
-static inline bool min_deadline_update(struct sched_entity *se, bool exit)
+static inline bool min_vruntime_update(struct sched_entity *se, bool exit)
 {
-	u64 old_min_deadline = se->min_deadline;
+	u64 old_min_vruntime = se->min_vruntime;
 	struct rb_node *node = &se->run_node;
 
-	se->min_deadline = se->deadline;
-	__update_min_deadline(se, node->rb_right);
-	__update_min_deadline(se, node->rb_left);
+	se->min_vruntime = se->vruntime;
+	__min_vruntime_update(se, node->rb_right);
+	__min_vruntime_update(se, node->rb_left);
 
-	return se->min_deadline == old_min_deadline;
+	return se->min_vruntime == old_min_vruntime;
 }
 
-RB_DECLARE_CALLBACKS(static, min_deadline_cb, struct sched_entity,
-		     run_node, min_deadline, min_deadline_update);
+RB_DECLARE_CALLBACKS(static, min_vruntime_cb, struct sched_entity,
+		     run_node, min_vruntime, min_vruntime_update);
 
 /*
  * Enqueue an entity into the rb-tree:
@@ -816,18 +824,28 @@ RB_DECLARE_CALLBACKS(static, min_deadline_cb, struct sched_entity,
 static void __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
 {
 	avg_vruntime_add(cfs_rq, se);
-	se->min_deadline = se->deadline;
+	se->min_vruntime = se->vruntime;
 	rb_add_augmented_cached(&se->run_node, &cfs_rq->tasks_timeline,
-				__entity_less, &min_deadline_cb);
+				__entity_less, &min_vruntime_cb);
 }
 
 static void __dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
 {
 	rb_erase_augmented_cached(&se->run_node, &cfs_rq->tasks_timeline,
-				  &min_deadline_cb);
+				  &min_vruntime_cb);
 	avg_vruntime_sub(cfs_rq, se);
 }
 
+struct sched_entity *__pick_root_entity(struct cfs_rq *cfs_rq)
+{
+	struct rb_node *root = cfs_rq->tasks_timeline.rb_root.rb_node;
+
+	if (!root)
+		return NULL;
+
+	return __node_2_se(root);
+}
+
 struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq)
 {
 	struct rb_node *left = rb_first_cached(&cfs_rq->tasks_timeline);
@@ -850,23 +868,28 @@ struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq)
  *     with the earliest virtual deadline.
  *
  * We can do this in O(log n) time due to an augmented RB-tree. The
- * tree keeps the entries sorted on service, but also functions as a
- * heap based on the deadline by keeping:
+ * tree keeps the entries sorted on deadline, but also functions as a
+ * heap based on the vruntime by keeping:
  *
- *  se->min_deadline = min(se->deadline, se->{left,right}->min_deadline)
+ *  se->min_vruntime = min(se->vruntime, se->{left,right}->min_vruntime)
  *
- * Which allows an EDF like search on (sub)trees.
+ * Which allows tree pruning through eligibility.
  */
-static struct sched_entity *__pick_eevdf(struct cfs_rq *cfs_rq)
+static struct sched_entity *pick_eevdf(struct cfs_rq *cfs_rq)
 {
 	struct rb_node *node = cfs_rq->tasks_timeline.rb_root.rb_node;
 	struct sched_entity *curr = cfs_rq->curr;
 	struct sched_entity *best = NULL;
-	struct sched_entity *best_left = NULL;
+
+	/*
+	 * We can safely skip eligibility check if there is only one entity
+	 * in this cfs_rq, saving some cycles.
+	 */
+	if (cfs_rq->nr_running == 1)
+		return curr && curr->on_rq ? curr : __node_2_se(node);
 
 	if (curr && (!curr->on_rq || !entity_eligible(cfs_rq, curr)))
 		curr = NULL;
-	best = curr;
 
 	/*
 	 * Once selected, run a task until it either becomes non-eligible or
@@ -875,95 +898,38 @@ static struct sched_entity *__pick_eevdf(struct cfs_rq *cfs_rq)
 	if (sched_feat(RUN_TO_PARITY) && curr && curr->vlag == curr->deadline)
 		return curr;
 
+	/* Heap search for the EEVD entity */
 	while (node) {
 		struct sched_entity *se = __node_2_se(node);
+		struct rb_node *left = node->rb_left;
 
 		/*
-		 * If this entity is not eligible, try the left subtree.
+		 * Eligible entities in left subtree are always better
+		 * choices, since they have earlier deadlines.
 		 */
-		if (!entity_eligible(cfs_rq, se)) {
-			node = node->rb_left;
+		if (left && vruntime_eligible(cfs_rq,
+					__node_2_se(left)->min_vruntime)) {
+			node = left;
 			continue;
 		}
 
 		/*
-		 * Now we heap search eligible trees for the best (min_)deadline
+		 * The left subtree either is empty or has no eligible
+		 * entity, so check the current node since it is the one
+		 * with earliest deadline that might be eligible.
 		 */
-		if (!best || deadline_gt(deadline, best, se))
+		if (entity_eligible(cfs_rq, se)) {
 			best = se;
-
-		/*
-		 * Every se in a left branch is eligible, keep track of the
-		 * branch with the best min_deadline
-		 */
-		if (node->rb_left) {
-			struct sched_entity *left = __node_2_se(node->rb_left);
-
-			if (!best_left || deadline_gt(min_deadline, best_left, left))
-				best_left = left;
-
-			/*
-			 * min_deadline is in the left branch. rb_left and all
-			 * descendants are eligible, so immediately switch to the second
-			 * loop.
-			 */
-			if (left->min_deadline == se->min_deadline)
-				break;
-		}
-
-		/* min_deadline is at this node, no need to look right */
-		if (se->deadline == se->min_deadline)
 			break;
-
-		/* else min_deadline is in the right branch. */
-		node = node->rb_right;
-	}
-
-	/*
-	 * We ran into an eligible node which is itself the best.
-	 * (Or nr_running == 0 and both are NULL)
-	 */
-	if (!best_left || (s64)(best_left->min_deadline - best->deadline) > 0)
-		return best;
-
-	/*
-	 * Now best_left and all of its children are eligible, and we are just
-	 * looking for deadline == min_deadline
-	 */
-	node = &best_left->run_node;
-	while (node) {
-		struct sched_entity *se = __node_2_se(node);
-
-		/* min_deadline is the current node */
-		if (se->deadline == se->min_deadline)
-			return se;
-
-		/* min_deadline is in the left branch */
-		if (node->rb_left &&
-		    __node_2_se(node->rb_left)->min_deadline == se->min_deadline) {
-			node = node->rb_left;
-			continue;
 		}
 
-		/* else min_deadline is in the right branch */
 		node = node->rb_right;
 	}
-	return NULL;
-}
 
-static struct sched_entity *pick_eevdf(struct cfs_rq *cfs_rq)
-{
-	struct sched_entity *se = __pick_eevdf(cfs_rq);
-
-	if (!se) {
-		struct sched_entity *left = __pick_first_entity(cfs_rq);
-		if (left) {
-			pr_err("EEVDF scheduling fail, picking leftmost\n");
-			return left;
-		}
-	}
+	if (!best || (curr && entity_before(curr, best)))
+		best = curr;
 
-	return se;
+	return best;
 }
 
 #ifdef CONFIG_SCHED_DEBUG
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 2e5a95486a42..539c7e763f15 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -2822,6 +2822,7 @@ DEFINE_LOCK_GUARD_2(double_rq_lock, struct rq,
 		    double_rq_lock(_T->lock, _T->lock2),
 		    double_rq_unlock(_T->lock, _T->lock2))
 
+extern struct sched_entity *__pick_root_entity(struct cfs_rq *cfs_rq);
 extern struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq);
 extern struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq);
 
-- 
cgit v1.2.3


From 5d69eca542ee17c618f9a55da52191d5e28b435f Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Sat, 4 Nov 2023 11:59:18 +0100
Subject: sched: Unify runtime accounting across classes

All classes use sched_entity::exec_start to track runtime and have
copies of the exact same code around to compute runtime.

Collapse all that.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Daniel Bristot de Oliveira <bristot@kernel.org>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Phil Auld <pauld@redhat.com>
Reviewed-by: Valentin Schneider <vschneid@redhat.com>
Reviewed-by: Steven Rostedt (Google) <rostedt@goodmis.org>
Link: https://lkml.kernel.org/r/54d148a144f26d9559698c4dd82d8859038a7380.1699095159.git.bristot@kernel.org
---
 include/linux/sched.h    |  2 +-
 kernel/sched/deadline.c  | 15 +++----------
 kernel/sched/fair.c      | 57 ++++++++++++++++++++++++++++++++++++------------
 kernel/sched/rt.c        | 15 +++----------
 kernel/sched/sched.h     | 12 ++--------
 kernel/sched/stop_task.c | 13 +----------
 6 files changed, 53 insertions(+), 61 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index cd56d4018527..44b46d9743bf 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -523,7 +523,7 @@ struct sched_statistics {
 	u64				block_max;
 	s64				sum_block_runtime;
 
-	u64				exec_max;
+	s64				exec_max;
 	u64				slice_max;
 
 	u64				nr_migrations_cold;
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index b28114478b82..de79719c63c0 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -1275,9 +1275,8 @@ static void update_curr_dl(struct rq *rq)
 {
 	struct task_struct *curr = rq->curr;
 	struct sched_dl_entity *dl_se = &curr->dl;
-	u64 delta_exec, scaled_delta_exec;
+	s64 delta_exec, scaled_delta_exec;
 	int cpu = cpu_of(rq);
-	u64 now;
 
 	if (!dl_task(curr) || !on_dl_rq(dl_se))
 		return;
@@ -1290,21 +1289,13 @@ static void update_curr_dl(struct rq *rq)
 	 * natural solution, but the full ramifications of this
 	 * approach need further study.
 	 */
-	now = rq_clock_task(rq);
-	delta_exec = now - curr->se.exec_start;
-	if (unlikely((s64)delta_exec <= 0)) {
+	delta_exec = update_curr_common(rq);
+	if (unlikely(delta_exec <= 0)) {
 		if (unlikely(dl_se->dl_yielded))
 			goto throttle;
 		return;
 	}
 
-	schedstat_set(curr->stats.exec_max,
-		      max(curr->stats.exec_max, delta_exec));
-
-	trace_sched_stat_runtime(curr, delta_exec, 0);
-
-	update_current_exec_runtime(curr, now, delta_exec);
-
 	if (dl_entity_is_special(dl_se))
 		return;
 
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index d3e045d80cab..11073cf00134 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -1103,23 +1103,17 @@ static void update_tg_load_avg(struct cfs_rq *cfs_rq)
 }
 #endif /* CONFIG_SMP */
 
-/*
- * Update the current task's runtime statistics.
- */
-static void update_curr(struct cfs_rq *cfs_rq)
+static s64 update_curr_se(struct rq *rq, struct sched_entity *curr)
 {
-	struct sched_entity *curr = cfs_rq->curr;
-	u64 now = rq_clock_task(rq_of(cfs_rq));
-	u64 delta_exec;
-
-	if (unlikely(!curr))
-		return;
+	u64 now = rq_clock_task(rq);
+	s64 delta_exec;
 
 	delta_exec = now - curr->exec_start;
-	if (unlikely((s64)delta_exec <= 0))
-		return;
+	if (unlikely(delta_exec <= 0))
+		return delta_exec;
 
 	curr->exec_start = now;
+	curr->sum_exec_runtime += delta_exec;
 
 	if (schedstat_enabled()) {
 		struct sched_statistics *stats;
@@ -1129,8 +1123,43 @@ static void update_curr(struct cfs_rq *cfs_rq)
 				max(delta_exec, stats->exec_max));
 	}
 
-	curr->sum_exec_runtime += delta_exec;
-	schedstat_add(cfs_rq->exec_clock, delta_exec);
+	return delta_exec;
+}
+
+/*
+ * Used by other classes to account runtime.
+ */
+s64 update_curr_common(struct rq *rq)
+{
+	struct task_struct *curr = rq->curr;
+	s64 delta_exec;
+
+	delta_exec = update_curr_se(rq, &curr->se);
+	if (unlikely(delta_exec <= 0))
+		return delta_exec;
+
+	trace_sched_stat_runtime(curr, delta_exec, 0);
+
+	account_group_exec_runtime(curr, delta_exec);
+	cgroup_account_cputime(curr, delta_exec);
+
+	return delta_exec;
+}
+
+/*
+ * Update the current task's runtime statistics.
+ */
+static void update_curr(struct cfs_rq *cfs_rq)
+{
+	struct sched_entity *curr = cfs_rq->curr;
+	s64 delta_exec;
+
+	if (unlikely(!curr))
+		return;
+
+	delta_exec = update_curr_se(rq_of(cfs_rq), curr);
+	if (unlikely(delta_exec <= 0))
+		return;
 
 	curr->vruntime += calc_delta_fair(delta_exec, curr);
 	update_deadline(cfs_rq, curr);
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index 6aaf0a3d6081..3261b067b67e 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -1002,24 +1002,15 @@ static void update_curr_rt(struct rq *rq)
 {
 	struct task_struct *curr = rq->curr;
 	struct sched_rt_entity *rt_se = &curr->rt;
-	u64 delta_exec;
-	u64 now;
+	s64 delta_exec;
 
 	if (curr->sched_class != &rt_sched_class)
 		return;
 
-	now = rq_clock_task(rq);
-	delta_exec = now - curr->se.exec_start;
-	if (unlikely((s64)delta_exec <= 0))
+	delta_exec = update_curr_common(rq);
+	if (unlikely(delta_exec <= 0))
 		return;
 
-	schedstat_set(curr->stats.exec_max,
-		      max(curr->stats.exec_max, delta_exec));
-
-	trace_sched_stat_runtime(curr, delta_exec, 0);
-
-	update_current_exec_runtime(curr, now, delta_exec);
-
 	if (!rt_bandwidth_enabled())
 		return;
 
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 539c7e763f15..6703e9e81b1d 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -2212,6 +2212,8 @@ struct affinity_context {
 	unsigned int flags;
 };
 
+extern s64 update_curr_common(struct rq *rq);
+
 struct sched_class {
 
 #ifdef CONFIG_UCLAMP_TASK
@@ -3262,16 +3264,6 @@ extern int sched_dynamic_mode(const char *str);
 extern void sched_dynamic_update(int mode);
 #endif
 
-static inline void update_current_exec_runtime(struct task_struct *curr,
-						u64 now, u64 delta_exec)
-{
-	curr->se.sum_exec_runtime += delta_exec;
-	account_group_exec_runtime(curr, delta_exec);
-
-	curr->se.exec_start = now;
-	cgroup_account_cputime(curr, delta_exec);
-}
-
 #ifdef CONFIG_SCHED_MM_CID
 
 #define SCHED_MM_CID_PERIOD_NS	(100ULL * 1000000)	/* 100ms */
diff --git a/kernel/sched/stop_task.c b/kernel/sched/stop_task.c
index 6cf7304e6449..b1b8fe61c532 100644
--- a/kernel/sched/stop_task.c
+++ b/kernel/sched/stop_task.c
@@ -70,18 +70,7 @@ static void yield_task_stop(struct rq *rq)
 
 static void put_prev_task_stop(struct rq *rq, struct task_struct *prev)
 {
-	struct task_struct *curr = rq->curr;
-	u64 now, delta_exec;
-
-	now = rq_clock_task(rq);
-	delta_exec = now - curr->se.exec_start;
-	if (unlikely((s64)delta_exec < 0))
-		delta_exec = 0;
-
-	schedstat_set(curr->stats.exec_max,
-		      max(curr->stats.exec_max, delta_exec));
-
-	update_current_exec_runtime(curr, now, delta_exec);
+	update_curr_common(rq);
 }
 
 /*
-- 
cgit v1.2.3


From 63ba8422f876e32ee564ea95da9a7313b13ff0a1 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Sat, 4 Nov 2023 11:59:21 +0100
Subject: sched/deadline: Introduce deadline servers

Low priority tasks (e.g., SCHED_OTHER) can suffer starvation if tasks
with higher priority (e.g., SCHED_FIFO) monopolize CPU(s).

RT Throttling has been introduced a while ago as a (mostly debug)
countermeasure one can utilize to reserve some CPU time for low priority
tasks (usually background type of work, e.g. workqueues, timers, etc.).
It however has its own problems (see documentation) and the undesired
effect of unconditionally throttling FIFO tasks even when no lower
priority activity needs to run (there are mechanisms to fix this issue
as well, but, again, with their own problems).

Introduce deadline servers to service low priority tasks needs under
starvation conditions. Deadline servers are built extending SCHED_DEADLINE
implementation to allow 2-level scheduling (a sched_deadline entity
becomes a container for lower priority scheduling entities).

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Daniel Bristot de Oliveira <bristot@kernel.org>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lkml.kernel.org/r/4968601859d920335cf85822eb573a5f179f04b8.1699095159.git.bristot@kernel.org
---
 include/linux/sched.h   |  22 +++-
 kernel/sched/core.c     |  17 +++
 kernel/sched/deadline.c | 332 ++++++++++++++++++++++++++++++++----------------
 kernel/sched/fair.c     |   2 +
 kernel/sched/sched.h    |  27 ++++
 5 files changed, 292 insertions(+), 108 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 44b46d9743bf..8d258162deb0 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -63,11 +63,13 @@ struct robust_list_head;
 struct root_domain;
 struct rq;
 struct sched_attr;
+struct sched_dl_entity;
 struct seq_file;
 struct sighand_struct;
 struct signal_struct;
 struct task_delay_info;
 struct task_group;
+struct task_struct;
 struct user_event_mm;
 
 /*
@@ -607,6 +609,9 @@ struct sched_rt_entity {
 #endif
 } __randomize_layout;
 
+typedef bool (*dl_server_has_tasks_f)(struct sched_dl_entity *);
+typedef struct task_struct *(*dl_server_pick_f)(struct sched_dl_entity *);
+
 struct sched_dl_entity {
 	struct rb_node			rb_node;
 
@@ -654,6 +659,7 @@ struct sched_dl_entity {
 	unsigned int			dl_yielded        : 1;
 	unsigned int			dl_non_contending : 1;
 	unsigned int			dl_overrun	  : 1;
+	unsigned int			dl_server         : 1;
 
 	/*
 	 * Bandwidth enforcement timer. Each -deadline task has its
@@ -668,7 +674,20 @@ struct sched_dl_entity {
 	 * timer is needed to decrease the active utilization at the correct
 	 * time.
 	 */
-	struct hrtimer inactive_timer;
+	struct hrtimer			inactive_timer;
+
+	/*
+	 * Bits for DL-server functionality. Also see the comment near
+	 * dl_server_update().
+	 *
+	 * @rq the runqueue this server is for
+	 *
+	 * @server_has_tasks() returns true if @server_pick return a
+	 * runnable task.
+	 */
+	struct rq			*rq;
+	dl_server_has_tasks_f		server_has_tasks;
+	dl_server_pick_f		server_pick;
 
 #ifdef CONFIG_RT_MUTEXES
 	/*
@@ -795,6 +814,7 @@ struct task_struct {
 	struct sched_entity		se;
 	struct sched_rt_entity		rt;
 	struct sched_dl_entity		dl;
+	struct sched_dl_entity		*dl_server;
 	const struct sched_class	*sched_class;
 
 #ifdef CONFIG_SCHED_CORE
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 966631f05d71..f5f4495d1768 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -3797,6 +3797,8 @@ ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags,
 		rq->idle_stamp = 0;
 	}
 #endif
+
+	p->dl_server = NULL;
 }
 
 /*
@@ -6003,12 +6005,27 @@ __pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
 			p = pick_next_task_idle(rq);
 		}
 
+		/*
+		 * This is the fast path; it cannot be a DL server pick;
+		 * therefore even if @p == @prev, ->dl_server must be NULL.
+		 */
+		if (p->dl_server)
+			p->dl_server = NULL;
+
 		return p;
 	}
 
 restart:
 	put_prev_task_balance(rq, prev, rf);
 
+	/*
+	 * We've updated @prev and no longer need the server link, clear it.
+	 * Must be done before ->pick_next_task() because that can (re)set
+	 * ->dl_server.
+	 */
+	if (prev->dl_server)
+		prev->dl_server = NULL;
+
 	for_each_class(class) {
 		p = class->pick_next_task(rq);
 		if (p)
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index 81810f67df7a..a04a436af8cc 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -54,8 +54,14 @@ static int __init sched_dl_sysctl_init(void)
 late_initcall(sched_dl_sysctl_init);
 #endif
 
+static bool dl_server(struct sched_dl_entity *dl_se)
+{
+	return dl_se->dl_server;
+}
+
 static inline struct task_struct *dl_task_of(struct sched_dl_entity *dl_se)
 {
+	BUG_ON(dl_server(dl_se));
 	return container_of(dl_se, struct task_struct, dl);
 }
 
@@ -64,12 +70,19 @@ static inline struct rq *rq_of_dl_rq(struct dl_rq *dl_rq)
 	return container_of(dl_rq, struct rq, dl);
 }
 
-static inline struct dl_rq *dl_rq_of_se(struct sched_dl_entity *dl_se)
+static inline struct rq *rq_of_dl_se(struct sched_dl_entity *dl_se)
 {
-	struct task_struct *p = dl_task_of(dl_se);
-	struct rq *rq = task_rq(p);
+	struct rq *rq = dl_se->rq;
+
+	if (!dl_server(dl_se))
+		rq = task_rq(dl_task_of(dl_se));
 
-	return &rq->dl;
+	return rq;
+}
+
+static inline struct dl_rq *dl_rq_of_se(struct sched_dl_entity *dl_se)
+{
+	return &rq_of_dl_se(dl_se)->dl;
 }
 
 static inline int on_dl_rq(struct sched_dl_entity *dl_se)
@@ -394,9 +407,8 @@ static void __dl_clear_params(struct sched_dl_entity *dl_se);
 static void task_non_contending(struct sched_dl_entity *dl_se)
 {
 	struct hrtimer *timer = &dl_se->inactive_timer;
-	struct dl_rq *dl_rq = dl_rq_of_se(dl_se);
-	struct rq *rq = rq_of_dl_rq(dl_rq);
-	struct task_struct *p = dl_task_of(dl_se);
+	struct rq *rq = rq_of_dl_se(dl_se);
+	struct dl_rq *dl_rq = &rq->dl;
 	s64 zerolag_time;
 
 	/*
@@ -426,25 +438,33 @@ static void task_non_contending(struct sched_dl_entity *dl_se)
 	 * utilization now, instead of starting a timer
 	 */
 	if ((zerolag_time < 0) || hrtimer_active(&dl_se->inactive_timer)) {
-		if (dl_task(p))
+		if (dl_server(dl_se)) {
 			sub_running_bw(dl_se, dl_rq);
+		} else {
+			struct task_struct *p = dl_task_of(dl_se);
 
-		if (!dl_task(p) || READ_ONCE(p->__state) == TASK_DEAD) {
-			struct dl_bw *dl_b = dl_bw_of(task_cpu(p));
+			if (dl_task(p))
+				sub_running_bw(dl_se, dl_rq);
 
-			if (READ_ONCE(p->__state) == TASK_DEAD)
-				sub_rq_bw(dl_se, &rq->dl);
-			raw_spin_lock(&dl_b->lock);
-			__dl_sub(dl_b, dl_se->dl_bw, dl_bw_cpus(task_cpu(p)));
-			raw_spin_unlock(&dl_b->lock);
-			__dl_clear_params(dl_se);
+			if (!dl_task(p) || READ_ONCE(p->__state) == TASK_DEAD) {
+				struct dl_bw *dl_b = dl_bw_of(task_cpu(p));
+
+				if (READ_ONCE(p->__state) == TASK_DEAD)
+					sub_rq_bw(dl_se, &rq->dl);
+				raw_spin_lock(&dl_b->lock);
+				__dl_sub(dl_b, dl_se->dl_bw, dl_bw_cpus(task_cpu(p)));
+				raw_spin_unlock(&dl_b->lock);
+				__dl_clear_params(dl_se);
+			}
 		}
 
 		return;
 	}
 
 	dl_se->dl_non_contending = 1;
-	get_task_struct(p);
+	if (!dl_server(dl_se))
+		get_task_struct(dl_task_of(dl_se));
+
 	hrtimer_start(timer, ns_to_ktime(zerolag_time), HRTIMER_MODE_REL_HARD);
 }
 
@@ -471,8 +491,10 @@ static void task_contending(struct sched_dl_entity *dl_se, int flags)
 		 * will not touch the rq's active utilization,
 		 * so we are still safe.
 		 */
-		if (hrtimer_try_to_cancel(&dl_se->inactive_timer) == 1)
-			put_task_struct(dl_task_of(dl_se));
+		if (hrtimer_try_to_cancel(&dl_se->inactive_timer) == 1) {
+			if (!dl_server(dl_se))
+				put_task_struct(dl_task_of(dl_se));
+		}
 	} else {
 		/*
 		 * Since "dl_non_contending" is not set, the
@@ -485,10 +507,8 @@ static void task_contending(struct sched_dl_entity *dl_se, int flags)
 	}
 }
 
-static inline int is_leftmost(struct task_struct *p, struct dl_rq *dl_rq)
+static inline int is_leftmost(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq)
 {
-	struct sched_dl_entity *dl_se = &p->dl;
-
 	return rb_first_cached(&dl_rq->root) == &dl_se->rb_node;
 }
 
@@ -740,8 +760,10 @@ static inline void deadline_queue_pull_task(struct rq *rq)
 }
 #endif /* CONFIG_SMP */
 
+static void
+enqueue_dl_entity(struct sched_dl_entity *dl_se, int flags);
 static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags);
-static void __dequeue_task_dl(struct rq *rq, struct task_struct *p, int flags);
+static void dequeue_dl_entity(struct sched_dl_entity *dl_se, int flags);
 static void wakeup_preempt_dl(struct rq *rq, struct task_struct *p, int flags);
 
 static inline void replenish_dl_new_period(struct sched_dl_entity *dl_se,
@@ -989,8 +1011,7 @@ static inline bool dl_is_implicit(struct sched_dl_entity *dl_se)
  */
 static void update_dl_entity(struct sched_dl_entity *dl_se)
 {
-	struct dl_rq *dl_rq = dl_rq_of_se(dl_se);
-	struct rq *rq = rq_of_dl_rq(dl_rq);
+	struct rq *rq = rq_of_dl_se(dl_se);
 
 	if (dl_time_before(dl_se->deadline, rq_clock(rq)) ||
 	    dl_entity_overflow(dl_se, rq_clock(rq))) {
@@ -1021,11 +1042,11 @@ static inline u64 dl_next_period(struct sched_dl_entity *dl_se)
  * actually started or not (i.e., the replenishment instant is in
  * the future or in the past).
  */
-static int start_dl_timer(struct task_struct *p)
+static int start_dl_timer(struct sched_dl_entity *dl_se)
 {
-	struct sched_dl_entity *dl_se = &p->dl;
 	struct hrtimer *timer = &dl_se->dl_timer;
-	struct rq *rq = task_rq(p);
+	struct dl_rq *dl_rq = dl_rq_of_se(dl_se);
+	struct rq *rq = rq_of_dl_rq(dl_rq);
 	ktime_t now, act;
 	s64 delta;
 
@@ -1059,13 +1080,33 @@ static int start_dl_timer(struct task_struct *p)
 	 * and observe our state.
 	 */
 	if (!hrtimer_is_queued(timer)) {
-		get_task_struct(p);
+		if (!dl_server(dl_se))
+			get_task_struct(dl_task_of(dl_se));
 		hrtimer_start(timer, act, HRTIMER_MODE_ABS_HARD);
 	}
 
 	return 1;
 }
 
+static void __push_dl_task(struct rq *rq, struct rq_flags *rf)
+{
+#ifdef CONFIG_SMP
+	/*
+	 * Queueing this task back might have overloaded rq, check if we need
+	 * to kick someone away.
+	 */
+	if (has_pushable_dl_tasks(rq)) {
+		/*
+		 * Nothing relies on rq->lock after this, so its safe to drop
+		 * rq->lock.
+		 */
+		rq_unpin_lock(rq, rf);
+		push_dl_task(rq);
+		rq_repin_lock(rq, rf);
+	}
+#endif
+}
+
 /*
  * This is the bandwidth enforcement timer callback. If here, we know
  * a task is not on its dl_rq, since the fact that the timer was running
@@ -1084,10 +1125,34 @@ static enum hrtimer_restart dl_task_timer(struct hrtimer *timer)
 	struct sched_dl_entity *dl_se = container_of(timer,
 						     struct sched_dl_entity,
 						     dl_timer);
-	struct task_struct *p = dl_task_of(dl_se);
+	struct task_struct *p;
 	struct rq_flags rf;
 	struct rq *rq;
 
+	if (dl_server(dl_se)) {
+		struct rq *rq = rq_of_dl_se(dl_se);
+		struct rq_flags rf;
+
+		rq_lock(rq, &rf);
+		if (dl_se->dl_throttled) {
+			sched_clock_tick();
+			update_rq_clock(rq);
+
+			if (dl_se->server_has_tasks(dl_se)) {
+				enqueue_dl_entity(dl_se, ENQUEUE_REPLENISH);
+				resched_curr(rq);
+				__push_dl_task(rq, &rf);
+			} else {
+				replenish_dl_entity(dl_se);
+			}
+
+		}
+		rq_unlock(rq, &rf);
+
+		return HRTIMER_NORESTART;
+	}
+
+	p = dl_task_of(dl_se);
 	rq = task_rq_lock(p, &rf);
 
 	/*
@@ -1158,21 +1223,7 @@ static enum hrtimer_restart dl_task_timer(struct hrtimer *timer)
 	else
 		resched_curr(rq);
 
-#ifdef CONFIG_SMP
-	/*
-	 * Queueing this task back might have overloaded rq, check if we need
-	 * to kick someone away.
-	 */
-	if (has_pushable_dl_tasks(rq)) {
-		/*
-		 * Nothing relies on rq->lock after this, so its safe to drop
-		 * rq->lock.
-		 */
-		rq_unpin_lock(rq, &rf);
-		push_dl_task(rq);
-		rq_repin_lock(rq, &rf);
-	}
-#endif
+	__push_dl_task(rq, &rf);
 
 unlock:
 	task_rq_unlock(rq, p, &rf);
@@ -1214,12 +1265,11 @@ static void init_dl_task_timer(struct sched_dl_entity *dl_se)
  */
 static inline void dl_check_constrained_dl(struct sched_dl_entity *dl_se)
 {
-	struct task_struct *p = dl_task_of(dl_se);
-	struct rq *rq = rq_of_dl_rq(dl_rq_of_se(dl_se));
+	struct rq *rq = rq_of_dl_se(dl_se);
 
 	if (dl_time_before(dl_se->deadline, rq_clock(rq)) &&
 	    dl_time_before(rq_clock(rq), dl_next_period(dl_se))) {
-		if (unlikely(is_dl_boosted(dl_se) || !start_dl_timer(p)))
+		if (unlikely(is_dl_boosted(dl_se) || !start_dl_timer(dl_se)))
 			return;
 		dl_se->dl_throttled = 1;
 		if (dl_se->runtime > 0)
@@ -1270,29 +1320,13 @@ static u64 grub_reclaim(u64 delta, struct rq *rq, struct sched_dl_entity *dl_se)
 	return (delta * u_act) >> BW_SHIFT;
 }
 
-/*
- * Update the current task's runtime statistics (provided it is still
- * a -deadline task and has not been removed from the dl_rq).
- */
-static void update_curr_dl(struct rq *rq)
+static inline void
+update_stats_dequeue_dl(struct dl_rq *dl_rq, struct sched_dl_entity *dl_se,
+                        int flags);
+static void update_curr_dl_se(struct rq *rq, struct sched_dl_entity *dl_se, s64 delta_exec)
 {
-	struct task_struct *curr = rq->curr;
-	struct sched_dl_entity *dl_se = &curr->dl;
-	s64 delta_exec, scaled_delta_exec;
-	int cpu = cpu_of(rq);
-
-	if (!dl_task(curr) || !on_dl_rq(dl_se))
-		return;
+	s64 scaled_delta_exec;
 
-	/*
-	 * Consumed budget is computed considering the time as
-	 * observed by schedulable tasks (excluding time spent
-	 * in hardirq context, etc.). Deadlines are instead
-	 * computed using hard walltime. This seems to be the more
-	 * natural solution, but the full ramifications of this
-	 * approach need further study.
-	 */
-	delta_exec = update_curr_common(rq);
 	if (unlikely(delta_exec <= 0)) {
 		if (unlikely(dl_se->dl_yielded))
 			goto throttle;
@@ -1310,10 +1344,9 @@ static void update_curr_dl(struct rq *rq)
 	 * according to current frequency and CPU maximum capacity.
 	 */
 	if (unlikely(dl_se->flags & SCHED_FLAG_RECLAIM)) {
-		scaled_delta_exec = grub_reclaim(delta_exec,
-						 rq,
-						 &curr->dl);
+		scaled_delta_exec = grub_reclaim(delta_exec, rq, dl_se);
 	} else {
+		int cpu = cpu_of(rq);
 		unsigned long scale_freq = arch_scale_freq_capacity(cpu);
 		unsigned long scale_cpu = arch_scale_cpu_capacity(cpu);
 
@@ -1332,11 +1365,20 @@ throttle:
 		    (dl_se->flags & SCHED_FLAG_DL_OVERRUN))
 			dl_se->dl_overrun = 1;
 
-		__dequeue_task_dl(rq, curr, 0);
-		if (unlikely(is_dl_boosted(dl_se) || !start_dl_timer(curr)))
-			enqueue_task_dl(rq, curr, ENQUEUE_REPLENISH);
+		dequeue_dl_entity(dl_se, 0);
+		if (!dl_server(dl_se)) {
+			update_stats_dequeue_dl(&rq->dl, dl_se, 0);
+			dequeue_pushable_dl_task(rq, dl_task_of(dl_se));
+		}
 
-		if (!is_leftmost(curr, &rq->dl))
+		if (unlikely(is_dl_boosted(dl_se) || !start_dl_timer(dl_se))) {
+			if (dl_server(dl_se))
+				enqueue_dl_entity(dl_se, ENQUEUE_REPLENISH);
+			else
+				enqueue_task_dl(rq, dl_task_of(dl_se), ENQUEUE_REPLENISH);
+		}
+
+		if (!is_leftmost(dl_se, &rq->dl))
 			resched_curr(rq);
 	}
 
@@ -1366,20 +1408,82 @@ throttle:
 	}
 }
 
+void dl_server_update(struct sched_dl_entity *dl_se, s64 delta_exec)
+{
+	update_curr_dl_se(dl_se->rq, dl_se, delta_exec);
+}
+
+void dl_server_start(struct sched_dl_entity *dl_se)
+{
+	if (!dl_server(dl_se)) {
+		dl_se->dl_server = 1;
+		setup_new_dl_entity(dl_se);
+	}
+	enqueue_dl_entity(dl_se, ENQUEUE_WAKEUP);
+}
+
+void dl_server_stop(struct sched_dl_entity *dl_se)
+{
+	dequeue_dl_entity(dl_se, DEQUEUE_SLEEP);
+}
+
+void dl_server_init(struct sched_dl_entity *dl_se, struct rq *rq,
+		    dl_server_has_tasks_f has_tasks,
+		    dl_server_pick_f pick)
+{
+	dl_se->rq = rq;
+	dl_se->server_has_tasks = has_tasks;
+	dl_se->server_pick = pick;
+}
+
+/*
+ * Update the current task's runtime statistics (provided it is still
+ * a -deadline task and has not been removed from the dl_rq).
+ */
+static void update_curr_dl(struct rq *rq)
+{
+	struct task_struct *curr = rq->curr;
+	struct sched_dl_entity *dl_se = &curr->dl;
+	s64 delta_exec;
+
+	if (!dl_task(curr) || !on_dl_rq(dl_se))
+		return;
+
+	/*
+	 * Consumed budget is computed considering the time as
+	 * observed by schedulable tasks (excluding time spent
+	 * in hardirq context, etc.). Deadlines are instead
+	 * computed using hard walltime. This seems to be the more
+	 * natural solution, but the full ramifications of this
+	 * approach need further study.
+	 */
+	delta_exec = update_curr_common(rq);
+	update_curr_dl_se(rq, dl_se, delta_exec);
+}
+
 static enum hrtimer_restart inactive_task_timer(struct hrtimer *timer)
 {
 	struct sched_dl_entity *dl_se = container_of(timer,
 						     struct sched_dl_entity,
 						     inactive_timer);
-	struct task_struct *p = dl_task_of(dl_se);
+	struct task_struct *p = NULL;
 	struct rq_flags rf;
 	struct rq *rq;
 
-	rq = task_rq_lock(p, &rf);
+	if (!dl_server(dl_se)) {
+		p = dl_task_of(dl_se);
+		rq = task_rq_lock(p, &rf);
+	} else {
+		rq = dl_se->rq;
+		rq_lock(rq, &rf);
+	}
 
 	sched_clock_tick();
 	update_rq_clock(rq);
 
+	if (dl_server(dl_se))
+		goto no_task;
+
 	if (!dl_task(p) || READ_ONCE(p->__state) == TASK_DEAD) {
 		struct dl_bw *dl_b = dl_bw_of(task_cpu(p));
 
@@ -1396,14 +1500,21 @@ static enum hrtimer_restart inactive_task_timer(struct hrtimer *timer)
 
 		goto unlock;
 	}
+
+no_task:
 	if (dl_se->dl_non_contending == 0)
 		goto unlock;
 
 	sub_running_bw(dl_se, &rq->dl);
 	dl_se->dl_non_contending = 0;
 unlock:
-	task_rq_unlock(rq, p, &rf);
-	put_task_struct(p);
+
+	if (!dl_server(dl_se)) {
+		task_rq_unlock(rq, p, &rf);
+		put_task_struct(p);
+	} else {
+		rq_unlock(rq, &rf);
+	}
 
 	return HRTIMER_NORESTART;
 }
@@ -1466,10 +1577,8 @@ static inline void dec_dl_deadline(struct dl_rq *dl_rq, u64 deadline) {}
 static inline
 void inc_dl_tasks(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq)
 {
-	int prio = dl_task_of(dl_se)->prio;
 	u64 deadline = dl_se->deadline;
 
-	WARN_ON(!dl_prio(prio));
 	dl_rq->dl_nr_running++;
 	add_nr_running(rq_of_dl_rq(dl_rq), 1);
 
@@ -1479,9 +1588,6 @@ void inc_dl_tasks(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq)
 static inline
 void dec_dl_tasks(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq)
 {
-	int prio = dl_task_of(dl_se)->prio;
-
-	WARN_ON(!dl_prio(prio));
 	WARN_ON(!dl_rq->dl_nr_running);
 	dl_rq->dl_nr_running--;
 	sub_nr_running(rq_of_dl_rq(dl_rq), 1);
@@ -1648,8 +1754,7 @@ enqueue_dl_entity(struct sched_dl_entity *dl_se, int flags)
 	} else if (flags & ENQUEUE_REPLENISH) {
 		replenish_dl_entity(dl_se);
 	} else if ((flags & ENQUEUE_RESTORE) &&
-		  dl_time_before(dl_se->deadline,
-				 rq_clock(rq_of_dl_rq(dl_rq_of_se(dl_se))))) {
+		   dl_time_before(dl_se->deadline, rq_clock(rq_of_dl_se(dl_se)))) {
 		setup_new_dl_entity(dl_se);
 	}
 
@@ -1730,19 +1835,13 @@ static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags)
 
 	enqueue_dl_entity(&p->dl, flags);
 
+	if (dl_server(&p->dl))
+		return;
+
 	if (!task_current(rq, p) && !p->dl.dl_throttled && p->nr_cpus_allowed > 1)
 		enqueue_pushable_dl_task(rq, p);
 }
 
-static void __dequeue_task_dl(struct rq *rq, struct task_struct *p, int flags)
-{
-	update_stats_dequeue_dl(&rq->dl, &p->dl, flags);
-	dequeue_dl_entity(&p->dl, flags);
-
-	if (!p->dl.dl_throttled)
-		dequeue_pushable_dl_task(rq, p);
-}
-
 static void dequeue_task_dl(struct rq *rq, struct task_struct *p, int flags)
 {
 	update_curr_dl(rq);
@@ -1750,7 +1849,9 @@ static void dequeue_task_dl(struct rq *rq, struct task_struct *p, int flags)
 	if (p->on_rq == TASK_ON_RQ_MIGRATING)
 		flags |= DEQUEUE_MIGRATING;
 
-	__dequeue_task_dl(rq, p, flags);
+	dequeue_dl_entity(&p->dl, flags);
+	if (!p->dl.dl_throttled && !dl_server(&p->dl))
+		dequeue_pushable_dl_task(rq, p);
 }
 
 /*
@@ -1940,12 +2041,12 @@ static void wakeup_preempt_dl(struct rq *rq, struct task_struct *p,
 }
 
 #ifdef CONFIG_SCHED_HRTICK
-static void start_hrtick_dl(struct rq *rq, struct task_struct *p)
+static void start_hrtick_dl(struct rq *rq, struct sched_dl_entity *dl_se)
 {
-	hrtick_start(rq, p->dl.runtime);
+	hrtick_start(rq, dl_se->runtime);
 }
 #else /* !CONFIG_SCHED_HRTICK */
-static void start_hrtick_dl(struct rq *rq, struct task_struct *p)
+static void start_hrtick_dl(struct rq *rq, struct sched_dl_entity *dl_se)
 {
 }
 #endif
@@ -1965,9 +2066,6 @@ static void set_next_task_dl(struct rq *rq, struct task_struct *p, bool first)
 	if (!first)
 		return;
 
-	if (hrtick_enabled_dl(rq))
-		start_hrtick_dl(rq, p);
-
 	if (rq->curr->sched_class != &dl_sched_class)
 		update_dl_rq_load_avg(rq_clock_pelt(rq), rq, 0);
 
@@ -1990,12 +2088,25 @@ static struct task_struct *pick_task_dl(struct rq *rq)
 	struct dl_rq *dl_rq = &rq->dl;
 	struct task_struct *p;
 
+again:
 	if (!sched_dl_runnable(rq))
 		return NULL;
 
 	dl_se = pick_next_dl_entity(dl_rq);
 	WARN_ON_ONCE(!dl_se);
-	p = dl_task_of(dl_se);
+
+	if (dl_server(dl_se)) {
+		p = dl_se->server_pick(dl_se);
+		if (!p) {
+			WARN_ON_ONCE(1);
+			dl_se->dl_yielded = 1;
+			update_curr_dl_se(rq, dl_se, 0);
+			goto again;
+		}
+		p->dl_server = dl_se;
+	} else {
+		p = dl_task_of(dl_se);
+	}
 
 	return p;
 }
@@ -2005,9 +2116,15 @@ static struct task_struct *pick_next_task_dl(struct rq *rq)
 	struct task_struct *p;
 
 	p = pick_task_dl(rq);
-	if (p)
+	if (!p)
+		return p;
+
+	if (!p->dl_server)
 		set_next_task_dl(rq, p, true);
 
+	if (hrtick_enabled(rq))
+		start_hrtick_dl(rq, &p->dl);
+
 	return p;
 }
 
@@ -2045,8 +2162,8 @@ static void task_tick_dl(struct rq *rq, struct task_struct *p, int queued)
 	 * be set and schedule() will start a new hrtick for the next task.
 	 */
 	if (hrtick_enabled_dl(rq) && queued && p->dl.runtime > 0 &&
-	    is_leftmost(p, &rq->dl))
-		start_hrtick_dl(rq, p);
+	    is_leftmost(&p->dl, &rq->dl))
+		start_hrtick_dl(rq, &p->dl);
 }
 
 static void task_fork_dl(struct task_struct *p)
@@ -2986,6 +3103,7 @@ static void __dl_clear_params(struct sched_dl_entity *dl_se)
 	dl_se->dl_yielded		= 0;
 	dl_se->dl_non_contending	= 0;
 	dl_se->dl_overrun		= 0;
+	dl_se->dl_server		= 0;
 
 #ifdef CONFIG_RT_MUTEXES
 	dl_se->pi_se			= dl_se;
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 1cd92b11b289..07f555857698 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -1131,6 +1131,8 @@ static inline void update_curr_task(struct task_struct *p, s64 delta_exec)
 	trace_sched_stat_runtime(p, delta_exec);
 	account_group_exec_runtime(p, delta_exec);
 	cgroup_account_cputime(p, delta_exec);
+	if (p->dl_server)
+		dl_server_update(p->dl_server, delta_exec);
 }
 
 /*
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 1cda787172f0..8a70d51ffa33 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -313,6 +313,33 @@ extern bool dl_param_changed(struct task_struct *p, const struct sched_attr *att
 extern int  dl_cpuset_cpumask_can_shrink(const struct cpumask *cur, const struct cpumask *trial);
 extern int  dl_bw_check_overflow(int cpu);
 
+/*
+ * SCHED_DEADLINE supports servers (nested scheduling) with the following
+ * interface:
+ *
+ *   dl_se::rq -- runqueue we belong to.
+ *
+ *   dl_se::server_has_tasks() -- used on bandwidth enforcement; we 'stop' the
+ *                                server when it runs out of tasks to run.
+ *
+ *   dl_se::server_pick() -- nested pick_next_task(); we yield the period if this
+ *                           returns NULL.
+ *
+ *   dl_server_update() -- called from update_curr_common(), propagates runtime
+ *                         to the server.
+ *
+ *   dl_server_start()
+ *   dl_server_stop()  -- start/stop the server when it has (no) tasks.
+ *
+ *   dl_server_init() -- initializes the server.
+ */
+extern void dl_server_update(struct sched_dl_entity *dl_se, s64 delta_exec);
+extern void dl_server_start(struct sched_dl_entity *dl_se);
+extern void dl_server_stop(struct sched_dl_entity *dl_se);
+extern void dl_server_init(struct sched_dl_entity *dl_se, struct rq *rq,
+		    dl_server_has_tasks_f has_tasks,
+		    dl_server_pick_f pick);
+
 #ifdef CONFIG_CGROUP_SCHED
 
 struct cfs_rq;
-- 
cgit v1.2.3


From e4ab322fbaaaf84b23d6cb0e3317a7f68baf36dc Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Sun, 17 Sep 2023 13:22:17 +0200
Subject: cleanup: Add conditional guard support

Adds:

 - DEFINE_GUARD_COND() / DEFINE_LOCK_GUARD_1_COND() to extend existing
   guards with conditional lock primitives, eg. mutex_trylock(),
   mutex_lock_interruptible().

   nb. both primitives allow NULL 'locks', which cause the lock to
       fail (obviously).

 - extends scoped_guard() to not take the body when the the
   conditional guard 'fails'. eg.

     scoped_guard (mutex_intr, &task->signal_cred_guard_mutex) {
	...
     }

   will only execute the body when the mutex is held.

 - provides scoped_cond_guard(name, fail, args...); which extends
   scoped_guard() to do fail when the lock-acquire fails.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lkml.kernel.org/r/20231102110706.460851167%40infradead.org
---
 include/linux/cleanup.h  | 52 +++++++++++++++++++++++++++++++++++++++++++++---
 include/linux/mutex.h    |  3 ++-
 include/linux/rwsem.h    |  8 ++++----
 include/linux/spinlock.h | 15 ++++++++++++++
 4 files changed, 70 insertions(+), 8 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/cleanup.h b/include/linux/cleanup.h
index 9f1a9c455b68..c2d09bc4f976 100644
--- a/include/linux/cleanup.h
+++ b/include/linux/cleanup.h
@@ -125,25 +125,55 @@ static inline class_##_name##_t class_##_name##ext##_constructor(_init_args) \
  *	trivial wrapper around DEFINE_CLASS() above specifically
  *	for locks.
  *
+ * DEFINE_GUARD_COND(name, ext, condlock)
+ *	wrapper around EXTEND_CLASS above to add conditional lock
+ *	variants to a base class, eg. mutex_trylock() or
+ *	mutex_lock_interruptible().
+ *
  * guard(name):
- *	an anonymous instance of the (guard) class
+ *	an anonymous instance of the (guard) class, not recommended for
+ *	conditional locks.
  *
  * scoped_guard (name, args...) { }:
  *	similar to CLASS(name, scope)(args), except the variable (with the
  *	explicit name 'scope') is declard in a for-loop such that its scope is
  *	bound to the next (compound) statement.
  *
+ *	for conditional locks the loop body is skipped when the lock is not
+ *	acquired.
+ *
+ * scoped_cond_guard (name, fail, args...) { }:
+ *      similar to scoped_guard(), except it does fail when the lock
+ *      acquire fails.
+ *
  */
 
 #define DEFINE_GUARD(_name, _type, _lock, _unlock) \
-	DEFINE_CLASS(_name, _type, _unlock, ({ _lock; _T; }), _type _T)
+	DEFINE_CLASS(_name, _type, if (_T) { _unlock; }, ({ _lock; _T; }), _type _T); \
+	static inline void * class_##_name##_lock_ptr(class_##_name##_t *_T) \
+	{ return *_T; }
+
+#define DEFINE_GUARD_COND(_name, _ext, _condlock) \
+	EXTEND_CLASS(_name, _ext, \
+		     ({ void *_t = _T; if (_T && !(_condlock)) _t = NULL; _t; }), \
+		     class_##_name##_t _T) \
+	static inline void * class_##_name##_ext##_lock_ptr(class_##_name##_t *_T) \
+	{ return class_##_name##_lock_ptr(_T); }
 
 #define guard(_name) \
 	CLASS(_name, __UNIQUE_ID(guard))
 
+#define __guard_ptr(_name) class_##_name##_lock_ptr
+
 #define scoped_guard(_name, args...)					\
 	for (CLASS(_name, scope)(args),					\
-	     *done = NULL; !done; done = (void *)1)
+	     *done = NULL; __guard_ptr(_name)(&scope) && !done; done = (void *)1)
+
+#define scoped_cond_guard(_name, _fail, args...) \
+	for (CLASS(_name, scope)(args), \
+	     *done = NULL; !done; done = (void *)1) \
+		if (!__guard_ptr(_name)(&scope)) _fail; \
+		else
 
 /*
  * Additional helper macros for generating lock guards with types, either for
@@ -152,6 +182,7 @@ static inline class_##_name##_t class_##_name##ext##_constructor(_init_args) \
  *
  * DEFINE_LOCK_GUARD_0(name, lock, unlock, ...)
  * DEFINE_LOCK_GUARD_1(name, type, lock, unlock, ...)
+ * DEFINE_LOCK_GUARD_1_COND(name, ext, condlock)
  *
  * will result in the following type:
  *
@@ -173,6 +204,11 @@ typedef struct {							\
 static inline void class_##_name##_destructor(class_##_name##_t *_T)	\
 {									\
 	if (_T->lock) { _unlock; }					\
+}									\
+									\
+static inline void *class_##_name##_lock_ptr(class_##_name##_t *_T)	\
+{									\
+	return _T->lock;						\
 }
 
 
@@ -201,4 +237,14 @@ __DEFINE_LOCK_GUARD_1(_name, _type, _lock)
 __DEFINE_UNLOCK_GUARD(_name, void, _unlock, __VA_ARGS__)		\
 __DEFINE_LOCK_GUARD_0(_name, _lock)
 
+#define DEFINE_LOCK_GUARD_1_COND(_name, _ext, _condlock)		\
+	EXTEND_CLASS(_name, _ext,					\
+		     ({ class_##_name##_t _t = { .lock = l }, *_T = &_t;\
+		        if (_T->lock && !(_condlock)) _T->lock = NULL;	\
+			_t; }),						\
+		     typeof_member(class_##_name##_t, lock) l)		\
+	static inline void * class_##_name##_ext##_lock_ptr(class_##_name##_t *_T) \
+	{ return class_##_name##_lock_ptr(_T); }
+
+
 #endif /* __LINUX_GUARDS_H */
diff --git a/include/linux/mutex.h b/include/linux/mutex.h
index a33aa9eb9fc3..95d11308f995 100644
--- a/include/linux/mutex.h
+++ b/include/linux/mutex.h
@@ -221,6 +221,7 @@ extern void mutex_unlock(struct mutex *lock);
 extern int atomic_dec_and_mutex_lock(atomic_t *cnt, struct mutex *lock);
 
 DEFINE_GUARD(mutex, struct mutex *, mutex_lock(_T), mutex_unlock(_T))
-DEFINE_FREE(mutex, struct mutex *, if (_T) mutex_unlock(_T))
+DEFINE_GUARD_COND(mutex, _try, mutex_trylock(_T))
+DEFINE_GUARD_COND(mutex, _intr, mutex_lock_interruptible(_T) == 0)
 
 #endif /* __LINUX_MUTEX_H */
diff --git a/include/linux/rwsem.h b/include/linux/rwsem.h
index 1dd530ce8b45..9c29689ff505 100644
--- a/include/linux/rwsem.h
+++ b/include/linux/rwsem.h
@@ -203,11 +203,11 @@ extern void up_read(struct rw_semaphore *sem);
 extern void up_write(struct rw_semaphore *sem);
 
 DEFINE_GUARD(rwsem_read, struct rw_semaphore *, down_read(_T), up_read(_T))
-DEFINE_GUARD(rwsem_write, struct rw_semaphore *, down_write(_T), up_write(_T))
-
-DEFINE_FREE(up_read, struct rw_semaphore *, if (_T) up_read(_T))
-DEFINE_FREE(up_write, struct rw_semaphore *, if (_T) up_write(_T))
+DEFINE_GUARD_COND(rwsem_read, _try, down_read_trylock(_T))
+DEFINE_GUARD_COND(rwsem_read, _intr, down_read_interruptible(_T) == 0)
 
+DEFINE_GUARD(rwsem_write, struct rw_semaphore *, down_write(_T), up_write(_T))
+DEFINE_GUARD_COND(rwsem_write, _try, down_write_trylock(_T))
 
 /*
  * downgrade write lock to read lock
diff --git a/include/linux/spinlock.h b/include/linux/spinlock.h
index 31d3d747a9db..ceb56b39c70f 100644
--- a/include/linux/spinlock.h
+++ b/include/linux/spinlock.h
@@ -507,6 +507,8 @@ DEFINE_LOCK_GUARD_1(raw_spinlock, raw_spinlock_t,
 		    raw_spin_lock(_T->lock),
 		    raw_spin_unlock(_T->lock))
 
+DEFINE_LOCK_GUARD_1_COND(raw_spinlock, _try, raw_spin_trylock(_T->lock))
+
 DEFINE_LOCK_GUARD_1(raw_spinlock_nested, raw_spinlock_t,
 		    raw_spin_lock_nested(_T->lock, SINGLE_DEPTH_NESTING),
 		    raw_spin_unlock(_T->lock))
@@ -515,23 +517,36 @@ DEFINE_LOCK_GUARD_1(raw_spinlock_irq, raw_spinlock_t,
 		    raw_spin_lock_irq(_T->lock),
 		    raw_spin_unlock_irq(_T->lock))
 
+DEFINE_LOCK_GUARD_1_COND(raw_spinlock_irq, _try, raw_spin_trylock_irq(_T->lock))
+
 DEFINE_LOCK_GUARD_1(raw_spinlock_irqsave, raw_spinlock_t,
 		    raw_spin_lock_irqsave(_T->lock, _T->flags),
 		    raw_spin_unlock_irqrestore(_T->lock, _T->flags),
 		    unsigned long flags)
 
+DEFINE_LOCK_GUARD_1_COND(raw_spinlock_irqsave, _try,
+			 raw_spin_trylock_irqsave(_T->lock, _T->flags))
+
 DEFINE_LOCK_GUARD_1(spinlock, spinlock_t,
 		    spin_lock(_T->lock),
 		    spin_unlock(_T->lock))
 
+DEFINE_LOCK_GUARD_1_COND(spinlock, _try, spin_trylock(_T->lock))
+
 DEFINE_LOCK_GUARD_1(spinlock_irq, spinlock_t,
 		    spin_lock_irq(_T->lock),
 		    spin_unlock_irq(_T->lock))
 
+DEFINE_LOCK_GUARD_1_COND(spinlock_irq, _try,
+			 spin_trylock_irq(_T->lock))
+
 DEFINE_LOCK_GUARD_1(spinlock_irqsave, spinlock_t,
 		    spin_lock_irqsave(_T->lock, _T->flags),
 		    spin_unlock_irqrestore(_T->lock, _T->flags),
 		    unsigned long flags)
 
+DEFINE_LOCK_GUARD_1_COND(spinlock_irqsave, _try,
+			 spin_trylock_irqsave(_T->lock, _T->flags))
+
 #undef __LINUX_INSIDE_SPINLOCK_H
 #endif /* __LINUX_SPINLOCK_H */
-- 
cgit v1.2.3


From 1fda5bb66ad8fb24ecb3858e61a13a6548428898 Mon Sep 17 00:00:00 2001
From: Yonghong Song <yonghong.song@linux.dev>
Date: Fri, 10 Nov 2023 17:39:28 -0800
Subject: bpf: Do not allocate percpu memory at init stage

Kirill Shutemov reported significant percpu memory consumption increase after
booting in 288-cpu VM ([1]) due to commit 41a5db8d8161 ("bpf: Add support for
non-fix-size percpu mem allocation"). The percpu memory consumption is
increased from 111MB to 969MB. The number is from /proc/meminfo.

I tried to reproduce the issue with my local VM which at most supports upto
255 cpus. With 252 cpus, without the above commit, the percpu memory
consumption immediately after boot is 57MB while with the above commit the
percpu memory consumption is 231MB.

This is not good since so far percpu memory from bpf memory allocator is not
widely used yet. Let us change pre-allocation in init stage to on-demand
allocation when verifier detects there is a need of percpu memory for bpf
program. With this change, percpu memory consumption after boot can be reduced
signicantly.

  [1] https://lore.kernel.org/lkml/20231109154934.4saimljtqx625l3v@box.shutemov.name/

Fixes: 41a5db8d8161 ("bpf: Add support for non-fix-size percpu mem allocation")
Reported-and-tested-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Signed-off-by: Yonghong Song <yonghong.song@linux.dev>
Acked-by: Hou Tao <houtao1@huawei.com>
Link: https://lore.kernel.org/r/20231111013928.948838-1-yonghong.song@linux.dev
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/bpf.h   |  2 +-
 kernel/bpf/core.c     |  8 +++-----
 kernel/bpf/verifier.c | 20 ++++++++++++++++++--
 3 files changed, 22 insertions(+), 8 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 35bff17396c0..6762dac3ef76 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -56,7 +56,7 @@ extern struct idr btf_idr;
 extern spinlock_t btf_idr_lock;
 extern struct kobject *btf_kobj;
 extern struct bpf_mem_alloc bpf_global_ma, bpf_global_percpu_ma;
-extern bool bpf_global_ma_set, bpf_global_percpu_ma_set;
+extern bool bpf_global_ma_set;
 
 typedef u64 (*bpf_callback_t)(u64, u64, u64, u64, u64);
 typedef int (*bpf_iter_init_seq_priv_t)(void *private_data,
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index 08626b519ce2..cd3afe57ece3 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -64,8 +64,8 @@
 #define OFF	insn->off
 #define IMM	insn->imm
 
-struct bpf_mem_alloc bpf_global_ma, bpf_global_percpu_ma;
-bool bpf_global_ma_set, bpf_global_percpu_ma_set;
+struct bpf_mem_alloc bpf_global_ma;
+bool bpf_global_ma_set;
 
 /* No hurry in this branch
  *
@@ -2934,9 +2934,7 @@ static int __init bpf_global_ma_init(void)
 
 	ret = bpf_mem_alloc_init(&bpf_global_ma, 0, false);
 	bpf_global_ma_set = !ret;
-	ret = bpf_mem_alloc_init(&bpf_global_percpu_ma, 0, true);
-	bpf_global_percpu_ma_set = !ret;
-	return !bpf_global_ma_set || !bpf_global_percpu_ma_set;
+	return ret;
 }
 late_initcall(bpf_global_ma_init);
 #endif
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index a2267d5ed14e..6da370a047fe 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -26,6 +26,7 @@
 #include <linux/poison.h>
 #include <linux/module.h>
 #include <linux/cpumask.h>
+#include <linux/bpf_mem_alloc.h>
 #include <net/xdp.h>
 
 #include "disasm.h"
@@ -41,6 +42,9 @@ static const struct bpf_verifier_ops * const bpf_verifier_ops[] = {
 #undef BPF_LINK_TYPE
 };
 
+struct bpf_mem_alloc bpf_global_percpu_ma;
+static bool bpf_global_percpu_ma_set;
+
 /* bpf_check() is a static code analyzer that walks eBPF program
  * instruction by instruction and updates register/stack state.
  * All paths of conditional branches are analyzed until 'bpf_exit' insn.
@@ -336,6 +340,7 @@ struct bpf_kfunc_call_arg_meta {
 struct btf *btf_vmlinux;
 
 static DEFINE_MUTEX(bpf_verifier_lock);
+static DEFINE_MUTEX(bpf_percpu_ma_lock);
 
 static const struct bpf_line_info *
 find_linfo(const struct bpf_verifier_env *env, u32 insn_off)
@@ -12091,8 +12096,19 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
 				if (meta.func_id == special_kfunc_list[KF_bpf_obj_new_impl] && !bpf_global_ma_set)
 					return -ENOMEM;
 
-				if (meta.func_id == special_kfunc_list[KF_bpf_percpu_obj_new_impl] && !bpf_global_percpu_ma_set)
-					return -ENOMEM;
+				if (meta.func_id == special_kfunc_list[KF_bpf_percpu_obj_new_impl]) {
+					if (!bpf_global_percpu_ma_set) {
+						mutex_lock(&bpf_percpu_ma_lock);
+						if (!bpf_global_percpu_ma_set) {
+							err = bpf_mem_alloc_init(&bpf_global_percpu_ma, 0, true);
+							if (!err)
+								bpf_global_percpu_ma_set = true;
+						}
+						mutex_unlock(&bpf_percpu_ma_lock);
+						if (err)
+							return err;
+					}
+				}
 
 				if (((u64)(u32)meta.arg_constant.value) != meta.arg_constant.value) {
 					verbose(env, "local type ID argument must be in range [0, U32_MAX]\n");
-- 
cgit v1.2.3


From 4aea6a6d61cd6e3df9ed98345638abad1b1e5276 Mon Sep 17 00:00:00 2001
From: Rahul Rameshbabu <rrameshbabu@nvidia.com>
Date: Thu, 30 Mar 2023 15:05:38 -0700
Subject: net/mlx5: Query maximum frequency adjustment of the PTP hardware
 clock

Some mlx5 devices do not support the default advertised maximum frequency
adjustment value for the PTP hardware clock that is set by the driver.
These devices need to be queried when initializing the clock functionality
in order to get the maximum supported frequency adjustment value. This
value can be greater than the minimum supported frequency adjustment across
mlx5 devices (50 million ppb).

Signed-off-by: Rahul Rameshbabu <rrameshbabu@nvidia.com>
Reviewed-by: Tariq Toukan <tariqt@nvidia.com>
Signed-off-by: Saeed Mahameed <saeedm@nvidia.com>
---
 .../net/ethernet/mellanox/mlx5/core/lib/clock.c    | 22 ++++++++++++++++++++++
 include/linux/mlx5/mlx5_ifc.h                      |  5 ++++-
 2 files changed, 26 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lib/clock.c b/drivers/net/ethernet/mellanox/mlx5/core/lib/clock.c
index 1daa4b019513..cac60a841e1d 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/lib/clock.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/lib/clock.c
@@ -1000,6 +1000,25 @@ static void mlx5_init_clock_info(struct mlx5_core_dev *mdev)
 	info->frac = timer->tc.frac;
 }
 
+static void mlx5_init_timer_max_freq_adjustment(struct mlx5_core_dev *mdev)
+{
+	struct mlx5_clock *clock = &mdev->clock;
+	u32 out[MLX5_ST_SZ_DW(mtutc_reg)] = {};
+	u32 in[MLX5_ST_SZ_DW(mtutc_reg)] = {};
+	u8 log_max_freq_adjustment = 0;
+	int err;
+
+	err = mlx5_core_access_reg(mdev, in, sizeof(in), out, sizeof(out),
+				   MLX5_REG_MTUTC, 0, 0);
+	if (!err)
+		log_max_freq_adjustment =
+			MLX5_GET(mtutc_reg, out, log_max_freq_adjustment);
+
+	if (log_max_freq_adjustment)
+		clock->ptp_info.max_adj =
+			min(S32_MAX, 1 << log_max_freq_adjustment);
+}
+
 static void mlx5_init_timer_clock(struct mlx5_core_dev *mdev)
 {
 	struct mlx5_clock *clock = &mdev->clock;
@@ -1007,6 +1026,9 @@ static void mlx5_init_timer_clock(struct mlx5_core_dev *mdev)
 	/* Configure the PHC */
 	clock->ptp_info = mlx5_ptp_clock_info;
 
+	if (MLX5_CAP_MCAM_REG(mdev, mtutc))
+		mlx5_init_timer_max_freq_adjustment(mdev);
+
 	mlx5_timecounter_init(mdev);
 	mlx5_init_clock_info(mdev);
 	mlx5_init_overflow_period(clock);
diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h
index 6f3631425f38..ce2e71cd6d2a 100644
--- a/include/linux/mlx5/mlx5_ifc.h
+++ b/include/linux/mlx5/mlx5_ifc.h
@@ -10103,7 +10103,10 @@ enum {
 struct mlx5_ifc_mtutc_reg_bits {
 	u8         reserved_at_0[0x5];
 	u8         freq_adj_units[0x3];
-	u8         reserved_at_8[0x14];
+	u8         reserved_at_8[0x3];
+	u8         log_max_freq_adjustment[0x5];
+
+	u8         reserved_at_10[0xc];
 	u8         operation[0x4];
 
 	u8         freq_adjustment[0x20];
-- 
cgit v1.2.3


From 67420501e8681ae18f9f0ea0a69cd2f432100e70 Mon Sep 17 00:00:00 2001
From: Andrii Nakryiko <andrii@kernel.org>
Date: Sat, 11 Nov 2023 17:05:57 -0800
Subject: bpf: generalize reg_set_min_max() to handle non-const register
 comparisons

Generalize bounds adjustment logic of reg_set_min_max() to handle not
just register vs constant case, but in general any register vs any
register cases. For most of the operations it's trivial extension based
on range vs range comparison logic, we just need to properly pick
min/max of a range to compare against min/max of the other range.

For BPF_JSET we keep the original capabilities, just make sure JSET is
integrated in the common framework. This is manifested in the
internal-only BPF_JSET + BPF_X "opcode" to allow for simpler and more
uniform rev_opcode() handling. See the code for details. This allows to
reuse the same code exactly both for TRUE and FALSE branches without
explicitly handling both conditions with custom code.

Note also that now we don't need a special handling of BPF_JEQ/BPF_JNE
case none of the registers are constants. This is now just a normal
generic case handled by reg_set_min_max().

To make tnum handling cleaner, tnum_with_subreg() helper is added, as
that's a common operator when dealing with 32-bit subregister bounds.
This keeps the overall logic much less noisy when it comes to tnums.

Acked-by: Eduard Zingerman <eddyz87@gmail.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Acked-by: Shung-Hsi Yu <shung-hsi.yu@suse.com>
Link: https://lore.kernel.org/r/20231112010609.848406-2-andrii@kernel.org
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/tnum.h  |   4 +
 kernel/bpf/tnum.c     |   7 +-
 kernel/bpf/verifier.c | 314 ++++++++++++++++++++++----------------------------
 3 files changed, 146 insertions(+), 179 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/tnum.h b/include/linux/tnum.h
index 1c3948a1d6ad..3c13240077b8 100644
--- a/include/linux/tnum.h
+++ b/include/linux/tnum.h
@@ -106,6 +106,10 @@ int tnum_sbin(char *str, size_t size, struct tnum a);
 struct tnum tnum_subreg(struct tnum a);
 /* Returns the tnum with the lower 32-bit subreg cleared */
 struct tnum tnum_clear_subreg(struct tnum a);
+/* Returns the tnum with the lower 32-bit subreg in *reg* set to the lower
+ * 32-bit subreg in *subreg*
+ */
+struct tnum tnum_with_subreg(struct tnum reg, struct tnum subreg);
 /* Returns the tnum with the lower 32-bit subreg set to value */
 struct tnum tnum_const_subreg(struct tnum a, u32 value);
 /* Returns true if 32-bit subreg @a is a known constant*/
diff --git a/kernel/bpf/tnum.c b/kernel/bpf/tnum.c
index 3d7127f439a1..f4c91c9b27d7 100644
--- a/kernel/bpf/tnum.c
+++ b/kernel/bpf/tnum.c
@@ -208,7 +208,12 @@ struct tnum tnum_clear_subreg(struct tnum a)
 	return tnum_lshift(tnum_rshift(a, 32), 32);
 }
 
+struct tnum tnum_with_subreg(struct tnum reg, struct tnum subreg)
+{
+	return tnum_or(tnum_clear_subreg(reg), tnum_subreg(subreg));
+}
+
 struct tnum tnum_const_subreg(struct tnum a, u32 value)
 {
-	return tnum_or(tnum_clear_subreg(a), tnum_const(value));
+	return tnum_with_subreg(a, tnum_const(value));
 }
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 9ae6eae13471..39ce141c55d3 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -14453,218 +14453,186 @@ static int is_branch_taken(struct bpf_reg_state *reg1, struct bpf_reg_state *reg
 	return is_scalar_branch_taken(reg1, reg2, opcode, is_jmp32);
 }
 
-/* Adjusts the register min/max values in the case that the dst_reg and
- * src_reg are both SCALAR_VALUE registers (or we are simply doing a BPF_K
- * check, in which case we havea fake SCALAR_VALUE representing insn->imm).
- * Technically we can do similar adjustments for pointers to the same object,
- * but we don't support that right now.
+/* Opcode that corresponds to a *false* branch condition.
+ * E.g., if r1 < r2, then reverse (false) condition is r1 >= r2
  */
-static void reg_set_min_max(struct bpf_reg_state *true_reg1,
-			    struct bpf_reg_state *true_reg2,
-			    struct bpf_reg_state *false_reg1,
-			    struct bpf_reg_state *false_reg2,
-			    u8 opcode, bool is_jmp32)
+static u8 rev_opcode(u8 opcode)
 {
-	struct tnum false_32off, false_64off;
-	struct tnum true_32off, true_64off;
-	u64 uval;
-	u32 uval32;
-	s64 sval;
-	s32 sval32;
-
-	/* If either register is a pointer, we can't learn anything about its
-	 * variable offset from the compare (unless they were a pointer into
-	 * the same object, but we don't bother with that).
+	switch (opcode) {
+	case BPF_JEQ:		return BPF_JNE;
+	case BPF_JNE:		return BPF_JEQ;
+	/* JSET doesn't have it's reverse opcode in BPF, so add
+	 * BPF_X flag to denote the reverse of that operation
 	 */
-	if (false_reg1->type != SCALAR_VALUE || false_reg2->type != SCALAR_VALUE)
-		return;
-
-	/* we expect right-hand registers (src ones) to be constants, for now */
-	if (!is_reg_const(false_reg2, is_jmp32)) {
-		opcode = flip_opcode(opcode);
-		swap(true_reg1, true_reg2);
-		swap(false_reg1, false_reg2);
+	case BPF_JSET:		return BPF_JSET | BPF_X;
+	case BPF_JSET | BPF_X:	return BPF_JSET;
+	case BPF_JGE:		return BPF_JLT;
+	case BPF_JGT:		return BPF_JLE;
+	case BPF_JLE:		return BPF_JGT;
+	case BPF_JLT:		return BPF_JGE;
+	case BPF_JSGE:		return BPF_JSLT;
+	case BPF_JSGT:		return BPF_JSLE;
+	case BPF_JSLE:		return BPF_JSGT;
+	case BPF_JSLT:		return BPF_JSGE;
+	default:		return 0;
 	}
-	if (!is_reg_const(false_reg2, is_jmp32))
-		return;
+}
 
-	false_32off = tnum_subreg(false_reg1->var_off);
-	false_64off = false_reg1->var_off;
-	true_32off = tnum_subreg(true_reg1->var_off);
-	true_64off = true_reg1->var_off;
-	uval = false_reg2->var_off.value;
-	uval32 = (u32)tnum_subreg(false_reg2->var_off).value;
-	sval = (s64)uval;
-	sval32 = (s32)uval32;
+/* Refine range knowledge for <reg1> <op> <reg>2 conditional operation. */
+static void regs_refine_cond_op(struct bpf_reg_state *reg1, struct bpf_reg_state *reg2,
+				u8 opcode, bool is_jmp32)
+{
+	struct tnum t;
+	u64 val;
 
+again:
 	switch (opcode) {
-	/* JEQ/JNE comparison doesn't change the register equivalence.
-	 *
-	 * r1 = r2;
-	 * if (r1 == 42) goto label;
-	 * ...
-	 * label: // here both r1 and r2 are known to be 42.
-	 *
-	 * Hence when marking register as known preserve it's ID.
-	 */
 	case BPF_JEQ:
 		if (is_jmp32) {
-			__mark_reg32_known(true_reg1, uval32);
-			true_32off = tnum_subreg(true_reg1->var_off);
+			reg1->u32_min_value = max(reg1->u32_min_value, reg2->u32_min_value);
+			reg1->u32_max_value = min(reg1->u32_max_value, reg2->u32_max_value);
+			reg1->s32_min_value = max(reg1->s32_min_value, reg2->s32_min_value);
+			reg1->s32_max_value = min(reg1->s32_max_value, reg2->s32_max_value);
+			reg2->u32_min_value = reg1->u32_min_value;
+			reg2->u32_max_value = reg1->u32_max_value;
+			reg2->s32_min_value = reg1->s32_min_value;
+			reg2->s32_max_value = reg1->s32_max_value;
+
+			t = tnum_intersect(tnum_subreg(reg1->var_off), tnum_subreg(reg2->var_off));
+			reg1->var_off = tnum_with_subreg(reg1->var_off, t);
+			reg2->var_off = tnum_with_subreg(reg2->var_off, t);
 		} else {
-			___mark_reg_known(true_reg1, uval);
-			true_64off = true_reg1->var_off;
+			reg1->umin_value = max(reg1->umin_value, reg2->umin_value);
+			reg1->umax_value = min(reg1->umax_value, reg2->umax_value);
+			reg1->smin_value = max(reg1->smin_value, reg2->smin_value);
+			reg1->smax_value = min(reg1->smax_value, reg2->smax_value);
+			reg2->umin_value = reg1->umin_value;
+			reg2->umax_value = reg1->umax_value;
+			reg2->smin_value = reg1->smin_value;
+			reg2->smax_value = reg1->smax_value;
+
+			reg1->var_off = tnum_intersect(reg1->var_off, reg2->var_off);
+			reg2->var_off = reg1->var_off;
 		}
 		break;
 	case BPF_JNE:
-		if (is_jmp32) {
-			__mark_reg32_known(false_reg1, uval32);
-			false_32off = tnum_subreg(false_reg1->var_off);
-		} else {
-			___mark_reg_known(false_reg1, uval);
-			false_64off = false_reg1->var_off;
-		}
+		/* we don't derive any new information for inequality yet */
 		break;
 	case BPF_JSET:
+		if (!is_reg_const(reg2, is_jmp32))
+			swap(reg1, reg2);
+		if (!is_reg_const(reg2, is_jmp32))
+			break;
+		val = reg_const_value(reg2, is_jmp32);
+		/* BPF_JSET (i.e., TRUE branch, *not* BPF_JSET | BPF_X)
+		 * requires single bit to learn something useful. E.g., if we
+		 * know that `r1 & 0x3` is true, then which bits (0, 1, or both)
+		 * are actually set? We can learn something definite only if
+		 * it's a single-bit value to begin with.
+		 *
+		 * BPF_JSET | BPF_X (i.e., negation of BPF_JSET) doesn't have
+		 * this restriction. I.e., !(r1 & 0x3) means neither bit 0 nor
+		 * bit 1 is set, which we can readily use in adjustments.
+		 */
+		if (!is_power_of_2(val))
+			break;
 		if (is_jmp32) {
-			false_32off = tnum_and(false_32off, tnum_const(~uval32));
-			if (is_power_of_2(uval32))
-				true_32off = tnum_or(true_32off,
-						     tnum_const(uval32));
+			t = tnum_or(tnum_subreg(reg1->var_off), tnum_const(val));
+			reg1->var_off = tnum_with_subreg(reg1->var_off, t);
 		} else {
-			false_64off = tnum_and(false_64off, tnum_const(~uval));
-			if (is_power_of_2(uval))
-				true_64off = tnum_or(true_64off,
-						     tnum_const(uval));
+			reg1->var_off = tnum_or(reg1->var_off, tnum_const(val));
 		}
 		break;
-	case BPF_JGE:
-	case BPF_JGT:
-	{
+	case BPF_JSET | BPF_X: /* reverse of BPF_JSET, see rev_opcode() */
+		if (!is_reg_const(reg2, is_jmp32))
+			swap(reg1, reg2);
+		if (!is_reg_const(reg2, is_jmp32))
+			break;
+		val = reg_const_value(reg2, is_jmp32);
 		if (is_jmp32) {
-			u32 false_umax = opcode == BPF_JGT ? uval32  : uval32 - 1;
-			u32 true_umin = opcode == BPF_JGT ? uval32 + 1 : uval32;
-
-			false_reg1->u32_max_value = min(false_reg1->u32_max_value,
-						       false_umax);
-			true_reg1->u32_min_value = max(true_reg1->u32_min_value,
-						      true_umin);
+			t = tnum_and(tnum_subreg(reg1->var_off), tnum_const(~val));
+			reg1->var_off = tnum_with_subreg(reg1->var_off, t);
 		} else {
-			u64 false_umax = opcode == BPF_JGT ? uval    : uval - 1;
-			u64 true_umin = opcode == BPF_JGT ? uval + 1 : uval;
-
-			false_reg1->umax_value = min(false_reg1->umax_value, false_umax);
-			true_reg1->umin_value = max(true_reg1->umin_value, true_umin);
+			reg1->var_off = tnum_and(reg1->var_off, tnum_const(~val));
 		}
 		break;
-	}
-	case BPF_JSGE:
-	case BPF_JSGT:
-	{
+	case BPF_JLE:
 		if (is_jmp32) {
-			s32 false_smax = opcode == BPF_JSGT ? sval32    : sval32 - 1;
-			s32 true_smin = opcode == BPF_JSGT ? sval32 + 1 : sval32;
-
-			false_reg1->s32_max_value = min(false_reg1->s32_max_value, false_smax);
-			true_reg1->s32_min_value = max(true_reg1->s32_min_value, true_smin);
+			reg1->u32_max_value = min(reg1->u32_max_value, reg2->u32_max_value);
+			reg2->u32_min_value = max(reg1->u32_min_value, reg2->u32_min_value);
 		} else {
-			s64 false_smax = opcode == BPF_JSGT ? sval    : sval - 1;
-			s64 true_smin = opcode == BPF_JSGT ? sval + 1 : sval;
-
-			false_reg1->smax_value = min(false_reg1->smax_value, false_smax);
-			true_reg1->smin_value = max(true_reg1->smin_value, true_smin);
+			reg1->umax_value = min(reg1->umax_value, reg2->umax_value);
+			reg2->umin_value = max(reg1->umin_value, reg2->umin_value);
 		}
 		break;
-	}
-	case BPF_JLE:
 	case BPF_JLT:
-	{
 		if (is_jmp32) {
-			u32 false_umin = opcode == BPF_JLT ? uval32  : uval32 + 1;
-			u32 true_umax = opcode == BPF_JLT ? uval32 - 1 : uval32;
-
-			false_reg1->u32_min_value = max(false_reg1->u32_min_value,
-						       false_umin);
-			true_reg1->u32_max_value = min(true_reg1->u32_max_value,
-						      true_umax);
+			reg1->u32_max_value = min(reg1->u32_max_value, reg2->u32_max_value - 1);
+			reg2->u32_min_value = max(reg1->u32_min_value + 1, reg2->u32_min_value);
 		} else {
-			u64 false_umin = opcode == BPF_JLT ? uval    : uval + 1;
-			u64 true_umax = opcode == BPF_JLT ? uval - 1 : uval;
-
-			false_reg1->umin_value = max(false_reg1->umin_value, false_umin);
-			true_reg1->umax_value = min(true_reg1->umax_value, true_umax);
+			reg1->umax_value = min(reg1->umax_value, reg2->umax_value - 1);
+			reg2->umin_value = max(reg1->umin_value + 1, reg2->umin_value);
 		}
 		break;
-	}
 	case BPF_JSLE:
+		if (is_jmp32) {
+			reg1->s32_max_value = min(reg1->s32_max_value, reg2->s32_max_value);
+			reg2->s32_min_value = max(reg1->s32_min_value, reg2->s32_min_value);
+		} else {
+			reg1->smax_value = min(reg1->smax_value, reg2->smax_value);
+			reg2->smin_value = max(reg1->smin_value, reg2->smin_value);
+		}
+		break;
 	case BPF_JSLT:
-	{
 		if (is_jmp32) {
-			s32 false_smin = opcode == BPF_JSLT ? sval32    : sval32 + 1;
-			s32 true_smax = opcode == BPF_JSLT ? sval32 - 1 : sval32;
-
-			false_reg1->s32_min_value = max(false_reg1->s32_min_value, false_smin);
-			true_reg1->s32_max_value = min(true_reg1->s32_max_value, true_smax);
+			reg1->s32_max_value = min(reg1->s32_max_value, reg2->s32_max_value - 1);
+			reg2->s32_min_value = max(reg1->s32_min_value + 1, reg2->s32_min_value);
 		} else {
-			s64 false_smin = opcode == BPF_JSLT ? sval    : sval + 1;
-			s64 true_smax = opcode == BPF_JSLT ? sval - 1 : sval;
-
-			false_reg1->smin_value = max(false_reg1->smin_value, false_smin);
-			true_reg1->smax_value = min(true_reg1->smax_value, true_smax);
+			reg1->smax_value = min(reg1->smax_value, reg2->smax_value - 1);
+			reg2->smin_value = max(reg1->smin_value + 1, reg2->smin_value);
 		}
 		break;
-	}
+	case BPF_JGE:
+	case BPF_JGT:
+	case BPF_JSGE:
+	case BPF_JSGT:
+		/* just reuse LE/LT logic above */
+		opcode = flip_opcode(opcode);
+		swap(reg1, reg2);
+		goto again;
 	default:
 		return;
 	}
-
-	if (is_jmp32) {
-		false_reg1->var_off = tnum_or(tnum_clear_subreg(false_64off),
-					     tnum_subreg(false_32off));
-		true_reg1->var_off = tnum_or(tnum_clear_subreg(true_64off),
-					    tnum_subreg(true_32off));
-		reg_bounds_sync(false_reg1);
-		reg_bounds_sync(true_reg1);
-	} else {
-		false_reg1->var_off = false_64off;
-		true_reg1->var_off = true_64off;
-		reg_bounds_sync(false_reg1);
-		reg_bounds_sync(true_reg1);
-	}
-}
-
-/* Regs are known to be equal, so intersect their min/max/var_off */
-static void __reg_combine_min_max(struct bpf_reg_state *src_reg,
-				  struct bpf_reg_state *dst_reg)
-{
-	src_reg->umin_value = dst_reg->umin_value = max(src_reg->umin_value,
-							dst_reg->umin_value);
-	src_reg->umax_value = dst_reg->umax_value = min(src_reg->umax_value,
-							dst_reg->umax_value);
-	src_reg->smin_value = dst_reg->smin_value = max(src_reg->smin_value,
-							dst_reg->smin_value);
-	src_reg->smax_value = dst_reg->smax_value = min(src_reg->smax_value,
-							dst_reg->smax_value);
-	src_reg->var_off = dst_reg->var_off = tnum_intersect(src_reg->var_off,
-							     dst_reg->var_off);
-	reg_bounds_sync(src_reg);
-	reg_bounds_sync(dst_reg);
 }
 
-static void reg_combine_min_max(struct bpf_reg_state *true_src,
-				struct bpf_reg_state *true_dst,
-				struct bpf_reg_state *false_src,
-				struct bpf_reg_state *false_dst,
-				u8 opcode)
+/* Adjusts the register min/max values in the case that the dst_reg and
+ * src_reg are both SCALAR_VALUE registers (or we are simply doing a BPF_K
+ * check, in which case we havea fake SCALAR_VALUE representing insn->imm).
+ * Technically we can do similar adjustments for pointers to the same object,
+ * but we don't support that right now.
+ */
+static void reg_set_min_max(struct bpf_reg_state *true_reg1,
+			    struct bpf_reg_state *true_reg2,
+			    struct bpf_reg_state *false_reg1,
+			    struct bpf_reg_state *false_reg2,
+			    u8 opcode, bool is_jmp32)
 {
-	switch (opcode) {
-	case BPF_JEQ:
-		__reg_combine_min_max(true_src, true_dst);
-		break;
-	case BPF_JNE:
-		__reg_combine_min_max(false_src, false_dst);
-		break;
-	}
+	/* If either register is a pointer, we can't learn anything about its
+	 * variable offset from the compare (unless they were a pointer into
+	 * the same object, but we don't bother with that).
+	 */
+	if (false_reg1->type != SCALAR_VALUE || false_reg2->type != SCALAR_VALUE)
+		return;
+
+	/* fallthrough (FALSE) branch */
+	regs_refine_cond_op(false_reg1, false_reg2, rev_opcode(opcode), is_jmp32);
+	reg_bounds_sync(false_reg1);
+	reg_bounds_sync(false_reg2);
+
+	/* jump (TRUE) branch */
+	regs_refine_cond_op(true_reg1, true_reg2, opcode, is_jmp32);
+	reg_bounds_sync(true_reg1);
+	reg_bounds_sync(true_reg2);
 }
 
 static void mark_ptr_or_null_reg(struct bpf_func_state *state,
@@ -14961,22 +14929,12 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env,
 		reg_set_min_max(&other_branch_regs[insn->dst_reg],
 				&other_branch_regs[insn->src_reg],
 				dst_reg, src_reg, opcode, is_jmp32);
-
-		if (dst_reg->type == SCALAR_VALUE &&
-		    src_reg->type == SCALAR_VALUE &&
-		    !is_jmp32 && (opcode == BPF_JEQ || opcode == BPF_JNE)) {
-			/* Comparing for equality, we can combine knowledge */
-			reg_combine_min_max(&other_branch_regs[insn->src_reg],
-					    &other_branch_regs[insn->dst_reg],
-					    src_reg, dst_reg, opcode);
-		}
 	} else /* BPF_SRC(insn->code) == BPF_K */ {
 		reg_set_min_max(&other_branch_regs[insn->dst_reg],
 				src_reg /* fake one */,
 				dst_reg, src_reg /* same fake one */,
 				opcode, is_jmp32);
 	}
-
 	if (BPF_SRC(insn->code) == BPF_X &&
 	    src_reg->type == SCALAR_VALUE && src_reg->id &&
 	    !WARN_ON_ONCE(src_reg->id != other_branch_regs[insn->src_reg].id)) {
-- 
cgit v1.2.3


From 5f99f312bd3bedb3b266b0d26376a8c500cdc97f Mon Sep 17 00:00:00 2001
From: Andrii Nakryiko <andrii@kernel.org>
Date: Sat, 11 Nov 2023 17:06:00 -0800
Subject: bpf: add register bounds sanity checks and sanitization

Add simple sanity checks that validate well-formed ranges (min <= max)
across u64, s64, u32, and s32 ranges. Also for cases when the value is
constant (either 64-bit or 32-bit), we validate that ranges and tnums
are in agreement.

These bounds checks are performed at the end of BPF_ALU/BPF_ALU64
operations, on conditional jumps, and for LDX instructions (where subreg
zero/sign extension is probably the most important to check). This
covers most of the interesting cases.

Also, we validate the sanity of the return register when manually
adjusting it for some special helpers.

By default, sanity violation will trigger a warning in verifier log and
resetting register bounds to "unbounded" ones. But to aid development
and debugging, BPF_F_TEST_SANITY_STRICT flag is added, which will
trigger hard failure of verification with -EFAULT on register bounds
violations. This allows selftests to catch such issues. veristat will
also gain a CLI option to enable this behavior.

Acked-by: Eduard Zingerman <eddyz87@gmail.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Acked-by: Shung-Hsi Yu <shung-hsi.yu@suse.com>
Link: https://lore.kernel.org/r/20231112010609.848406-5-andrii@kernel.org
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/bpf_verifier.h   |   1 +
 include/uapi/linux/bpf.h       |   3 ++
 kernel/bpf/syscall.c           |   3 +-
 kernel/bpf/verifier.c          | 117 ++++++++++++++++++++++++++++++++---------
 tools/include/uapi/linux/bpf.h |   3 ++
 5 files changed, 101 insertions(+), 26 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h
index 24213a99cc79..402b6bc44a1b 100644
--- a/include/linux/bpf_verifier.h
+++ b/include/linux/bpf_verifier.h
@@ -602,6 +602,7 @@ struct bpf_verifier_env {
 	int stack_size;			/* number of states to be processed */
 	bool strict_alignment;		/* perform strict pointer alignment checks */
 	bool test_state_freq;		/* test verifier with different pruning frequency */
+	bool test_sanity_strict;	/* fail verification on sanity violations */
 	struct bpf_verifier_state *cur_state; /* current verifier state */
 	struct bpf_verifier_state_list **explored_states; /* search pruning optimization */
 	struct bpf_verifier_state_list *free_list;
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 7cf8bcf9f6a2..8a5855fcee69 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -1200,6 +1200,9 @@ enum bpf_perf_event_type {
  */
 #define BPF_F_XDP_DEV_BOUND_ONLY	(1U << 6)
 
+/* The verifier internal test flag. Behavior is undefined */
+#define BPF_F_TEST_SANITY_STRICT	(1U << 7)
+
 /* link_create.kprobe_multi.flags used in LINK_CREATE command for
  * BPF_TRACE_KPROBE_MULTI attach type to create return probe.
  */
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 0ed286b8a0f0..f266e03ba342 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -2573,7 +2573,8 @@ static int bpf_prog_load(union bpf_attr *attr, bpfptr_t uattr, u32 uattr_size)
 				 BPF_F_SLEEPABLE |
 				 BPF_F_TEST_RND_HI32 |
 				 BPF_F_XDP_HAS_FRAGS |
-				 BPF_F_XDP_DEV_BOUND_ONLY))
+				 BPF_F_XDP_DEV_BOUND_ONLY |
+				 BPF_F_TEST_SANITY_STRICT))
 		return -EINVAL;
 
 	if (!IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) &&
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 65570eedfe88..e7edacf86e0f 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -2615,6 +2615,56 @@ static void reg_bounds_sync(struct bpf_reg_state *reg)
 	__update_reg_bounds(reg);
 }
 
+static int reg_bounds_sanity_check(struct bpf_verifier_env *env,
+				   struct bpf_reg_state *reg, const char *ctx)
+{
+	const char *msg;
+
+	if (reg->umin_value > reg->umax_value ||
+	    reg->smin_value > reg->smax_value ||
+	    reg->u32_min_value > reg->u32_max_value ||
+	    reg->s32_min_value > reg->s32_max_value) {
+		    msg = "range bounds violation";
+		    goto out;
+	}
+
+	if (tnum_is_const(reg->var_off)) {
+		u64 uval = reg->var_off.value;
+		s64 sval = (s64)uval;
+
+		if (reg->umin_value != uval || reg->umax_value != uval ||
+		    reg->smin_value != sval || reg->smax_value != sval) {
+			msg = "const tnum out of sync with range bounds";
+			goto out;
+		}
+	}
+
+	if (tnum_subreg_is_const(reg->var_off)) {
+		u32 uval32 = tnum_subreg(reg->var_off).value;
+		s32 sval32 = (s32)uval32;
+
+		if (reg->u32_min_value != uval32 || reg->u32_max_value != uval32 ||
+		    reg->s32_min_value != sval32 || reg->s32_max_value != sval32) {
+			msg = "const subreg tnum out of sync with range bounds";
+			goto out;
+		}
+	}
+
+	return 0;
+out:
+	verbose(env, "REG SANITY VIOLATION (%s): %s u64=[%#llx, %#llx] "
+		"s64=[%#llx, %#llx] u32=[%#x, %#x] s32=[%#x, %#x] var_off=(%#llx, %#llx)\n",
+		ctx, msg, reg->umin_value, reg->umax_value,
+		reg->smin_value, reg->smax_value,
+		reg->u32_min_value, reg->u32_max_value,
+		reg->s32_min_value, reg->s32_max_value,
+		reg->var_off.value, reg->var_off.mask);
+	if (env->test_sanity_strict)
+		return -EFAULT;
+	__mark_reg_unbounded(reg);
+	return 0;
+}
+
 static bool __reg32_bound_s64(s32 a)
 {
 	return a >= 0 && a <= S32_MAX;
@@ -9982,14 +10032,15 @@ static int prepare_func_exit(struct bpf_verifier_env *env, int *insn_idx)
 	return 0;
 }
 
-static void do_refine_retval_range(struct bpf_reg_state *regs, int ret_type,
-				   int func_id,
-				   struct bpf_call_arg_meta *meta)
+static int do_refine_retval_range(struct bpf_verifier_env *env,
+				  struct bpf_reg_state *regs, int ret_type,
+				  int func_id,
+				  struct bpf_call_arg_meta *meta)
 {
 	struct bpf_reg_state *ret_reg = &regs[BPF_REG_0];
 
 	if (ret_type != RET_INTEGER)
-		return;
+		return 0;
 
 	switch (func_id) {
 	case BPF_FUNC_get_stack:
@@ -10015,6 +10066,8 @@ static void do_refine_retval_range(struct bpf_reg_state *regs, int ret_type,
 		reg_bounds_sync(ret_reg);
 		break;
 	}
+
+	return reg_bounds_sanity_check(env, ret_reg, "retval");
 }
 
 static int
@@ -10666,7 +10719,9 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn
 		regs[BPF_REG_0].ref_obj_id = id;
 	}
 
-	do_refine_retval_range(regs, fn->ret_type, func_id, &meta);
+	err = do_refine_retval_range(env, regs, fn->ret_type, func_id, &meta);
+	if (err)
+		return err;
 
 	err = check_map_func_compatibility(env, meta.map_ptr, func_id);
 	if (err)
@@ -14166,13 +14221,12 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn)
 
 		/* check dest operand */
 		err = check_reg_arg(env, insn->dst_reg, DST_OP_NO_MARK);
+		err = err ?: adjust_reg_min_max_vals(env, insn);
 		if (err)
 			return err;
-
-		return adjust_reg_min_max_vals(env, insn);
 	}
 
-	return 0;
+	return reg_bounds_sanity_check(env, &regs[insn->dst_reg], "alu");
 }
 
 static void find_good_pkt_pointers(struct bpf_verifier_state *vstate,
@@ -14653,18 +14707,21 @@ again:
  * Technically we can do similar adjustments for pointers to the same object,
  * but we don't support that right now.
  */
-static void reg_set_min_max(struct bpf_reg_state *true_reg1,
-			    struct bpf_reg_state *true_reg2,
-			    struct bpf_reg_state *false_reg1,
-			    struct bpf_reg_state *false_reg2,
-			    u8 opcode, bool is_jmp32)
+static int reg_set_min_max(struct bpf_verifier_env *env,
+			   struct bpf_reg_state *true_reg1,
+			   struct bpf_reg_state *true_reg2,
+			   struct bpf_reg_state *false_reg1,
+			   struct bpf_reg_state *false_reg2,
+			   u8 opcode, bool is_jmp32)
 {
+	int err;
+
 	/* If either register is a pointer, we can't learn anything about its
 	 * variable offset from the compare (unless they were a pointer into
 	 * the same object, but we don't bother with that).
 	 */
 	if (false_reg1->type != SCALAR_VALUE || false_reg2->type != SCALAR_VALUE)
-		return;
+		return 0;
 
 	/* fallthrough (FALSE) branch */
 	regs_refine_cond_op(false_reg1, false_reg2, rev_opcode(opcode), is_jmp32);
@@ -14675,6 +14732,12 @@ static void reg_set_min_max(struct bpf_reg_state *true_reg1,
 	regs_refine_cond_op(true_reg1, true_reg2, opcode, is_jmp32);
 	reg_bounds_sync(true_reg1);
 	reg_bounds_sync(true_reg2);
+
+	err = reg_bounds_sanity_check(env, true_reg1, "true_reg1");
+	err = err ?: reg_bounds_sanity_check(env, true_reg2, "true_reg2");
+	err = err ?: reg_bounds_sanity_check(env, false_reg1, "false_reg1");
+	err = err ?: reg_bounds_sanity_check(env, false_reg2, "false_reg2");
+	return err;
 }
 
 static void mark_ptr_or_null_reg(struct bpf_func_state *state,
@@ -14968,15 +15031,20 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env,
 	other_branch_regs = other_branch->frame[other_branch->curframe]->regs;
 
 	if (BPF_SRC(insn->code) == BPF_X) {
-		reg_set_min_max(&other_branch_regs[insn->dst_reg],
-				&other_branch_regs[insn->src_reg],
-				dst_reg, src_reg, opcode, is_jmp32);
+		err = reg_set_min_max(env,
+				      &other_branch_regs[insn->dst_reg],
+				      &other_branch_regs[insn->src_reg],
+				      dst_reg, src_reg, opcode, is_jmp32);
 	} else /* BPF_SRC(insn->code) == BPF_K */ {
-		reg_set_min_max(&other_branch_regs[insn->dst_reg],
-				src_reg /* fake one */,
-				dst_reg, src_reg /* same fake one */,
-				opcode, is_jmp32);
+		err = reg_set_min_max(env,
+				      &other_branch_regs[insn->dst_reg],
+				      src_reg /* fake one */,
+				      dst_reg, src_reg /* same fake one */,
+				      opcode, is_jmp32);
 	}
+	if (err)
+		return err;
+
 	if (BPF_SRC(insn->code) == BPF_X &&
 	    src_reg->type == SCALAR_VALUE && src_reg->id &&
 	    !WARN_ON_ONCE(src_reg->id != other_branch_regs[insn->src_reg].id)) {
@@ -17479,10 +17547,8 @@ static int do_check(struct bpf_verifier_env *env)
 					       insn->off, BPF_SIZE(insn->code),
 					       BPF_READ, insn->dst_reg, false,
 					       BPF_MODE(insn->code) == BPF_MEMSX);
-			if (err)
-				return err;
-
-			err = save_aux_ptr_type(env, src_reg_type, true);
+			err = err ?: save_aux_ptr_type(env, src_reg_type, true);
+			err = err ?: reg_bounds_sanity_check(env, &regs[insn->dst_reg], "ldx");
 			if (err)
 				return err;
 		} else if (class == BPF_STX) {
@@ -20769,6 +20835,7 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, bpfptr_t uattr, __u3
 
 	if (is_priv)
 		env->test_state_freq = attr->prog_flags & BPF_F_TEST_STATE_FREQ;
+	env->test_sanity_strict = attr->prog_flags & BPF_F_TEST_SANITY_STRICT;
 
 	env->explored_states = kvcalloc(state_htab_size(env),
 				       sizeof(struct bpf_verifier_state_list *),
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index 7cf8bcf9f6a2..8a5855fcee69 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -1200,6 +1200,9 @@ enum bpf_perf_event_type {
  */
 #define BPF_F_XDP_DEV_BOUND_ONLY	(1U << 6)
 
+/* The verifier internal test flag. Behavior is undefined */
+#define BPF_F_TEST_SANITY_STRICT	(1U << 7)
+
 /* link_create.kprobe_multi.flags used in LINK_CREATE command for
  * BPF_TRACE_KPROBE_MULTI attach type to create return probe.
  */
-- 
cgit v1.2.3


From dfcb264a01a9199e8338a548731baf5bbe77ef19 Mon Sep 17 00:00:00 2001
From: Marek Vasut <marex@denx.de>
Date: Sat, 4 Nov 2023 16:49:06 +0100
Subject: power: supply: bq27xxx: Stop and start delayed work in suspend and
 resume

This driver uses delayed work to perform periodic battery state read out.
This delayed work is not stopped across suspend and resume cycle. The
read out can occur early in the resume cycle. In case of an I2C variant
of this hardware, that read out triggers I2C transfer. That I2C transfer
may happen while the I2C controller is still suspended, which produces a
WARNING in the kernel log.

Fix this by introducing trivial PM ops, which stop the delayed work before
the system enters suspend, and schedule the delayed work right after the
system resumes.

Signed-off-by: Marek Vasut <marex@denx.de>
Reviewed-by: Hans de Goede <hdegoede@redhat.com>
Link: https://lore.kernel.org/r/20231104154920.68585-1-marex@denx.de
Signed-off-by: Sebastian Reichel <sebastian.reichel@collabora.com>
---
 drivers/power/supply/bq27xxx_battery.c     | 22 ++++++++++++++++++++++
 drivers/power/supply/bq27xxx_battery_i2c.c |  1 +
 include/linux/power/bq27xxx_battery.h      |  1 +
 3 files changed, 24 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/power/supply/bq27xxx_battery.c b/drivers/power/supply/bq27xxx_battery.c
index 4296600e8912..1c4a9d137744 100644
--- a/drivers/power/supply/bq27xxx_battery.c
+++ b/drivers/power/supply/bq27xxx_battery.c
@@ -2162,6 +2162,28 @@ void bq27xxx_battery_teardown(struct bq27xxx_device_info *di)
 }
 EXPORT_SYMBOL_GPL(bq27xxx_battery_teardown);
 
+#ifdef CONFIG_PM_SLEEP
+static int bq27xxx_battery_suspend(struct device *dev)
+{
+	struct bq27xxx_device_info *di = dev_get_drvdata(dev);
+
+	cancel_delayed_work(&di->work);
+	return 0;
+}
+
+static int bq27xxx_battery_resume(struct device *dev)
+{
+	struct bq27xxx_device_info *di = dev_get_drvdata(dev);
+
+	schedule_delayed_work(&di->work, 0);
+	return 0;
+}
+#endif /* CONFIG_PM_SLEEP */
+
+SIMPLE_DEV_PM_OPS(bq27xxx_battery_battery_pm_ops,
+		  bq27xxx_battery_suspend, bq27xxx_battery_resume);
+EXPORT_SYMBOL_GPL(bq27xxx_battery_battery_pm_ops);
+
 MODULE_AUTHOR("Rodolfo Giometti <giometti@linux.it>");
 MODULE_DESCRIPTION("BQ27xxx battery monitor driver");
 MODULE_LICENSE("GPL");
diff --git a/drivers/power/supply/bq27xxx_battery_i2c.c b/drivers/power/supply/bq27xxx_battery_i2c.c
index 9b5475590518..3a1798b0c1a7 100644
--- a/drivers/power/supply/bq27xxx_battery_i2c.c
+++ b/drivers/power/supply/bq27xxx_battery_i2c.c
@@ -295,6 +295,7 @@ static struct i2c_driver bq27xxx_battery_i2c_driver = {
 	.driver = {
 		.name = "bq27xxx-battery",
 		.of_match_table = of_match_ptr(bq27xxx_battery_i2c_of_match_table),
+		.pm = &bq27xxx_battery_battery_pm_ops,
 	},
 	.probe = bq27xxx_battery_i2c_probe,
 	.remove = bq27xxx_battery_i2c_remove,
diff --git a/include/linux/power/bq27xxx_battery.h b/include/linux/power/bq27xxx_battery.h
index 7c8d65414a70..7d8025fb74b7 100644
--- a/include/linux/power/bq27xxx_battery.h
+++ b/include/linux/power/bq27xxx_battery.h
@@ -83,5 +83,6 @@ struct bq27xxx_device_info {
 void bq27xxx_battery_update(struct bq27xxx_device_info *di);
 int bq27xxx_battery_setup(struct bq27xxx_device_info *di);
 void bq27xxx_battery_teardown(struct bq27xxx_device_info *di);
+extern const struct dev_pm_ops bq27xxx_battery_battery_pm_ops;
 
 #endif
-- 
cgit v1.2.3


From 74d016ecc1a7974664e98d1afbf649cd4e0e0423 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Wed, 15 Nov 2023 22:41:27 -0500
Subject: new helper: user_path_locked_at()

Equivalent of kern_path_locked() taking dfd/userland name.
User introduced in the next commit.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/namei.c            | 16 +++++++++++++---
 include/linux/namei.h |  1 +
 2 files changed, 14 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/fs/namei.c b/fs/namei.c
index 71c13b2990b4..3ffbe268d52c 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -2573,13 +2573,13 @@ static int filename_parentat(int dfd, struct filename *name,
 }
 
 /* does lookup, returns the object with parent locked */
-static struct dentry *__kern_path_locked(struct filename *name, struct path *path)
+static struct dentry *__kern_path_locked(int dfd, struct filename *name, struct path *path)
 {
 	struct dentry *d;
 	struct qstr last;
 	int type, error;
 
-	error = filename_parentat(AT_FDCWD, name, 0, path, &last, &type);
+	error = filename_parentat(dfd, name, 0, path, &last, &type);
 	if (error)
 		return ERR_PTR(error);
 	if (unlikely(type != LAST_NORM)) {
@@ -2598,12 +2598,22 @@ static struct dentry *__kern_path_locked(struct filename *name, struct path *pat
 struct dentry *kern_path_locked(const char *name, struct path *path)
 {
 	struct filename *filename = getname_kernel(name);
-	struct dentry *res = __kern_path_locked(filename, path);
+	struct dentry *res = __kern_path_locked(AT_FDCWD, filename, path);
 
 	putname(filename);
 	return res;
 }
 
+struct dentry *user_path_locked_at(int dfd, const char __user *name, struct path *path)
+{
+	struct filename *filename = getname(name);
+	struct dentry *res = __kern_path_locked(dfd, filename, path);
+
+	putname(filename);
+	return res;
+}
+EXPORT_SYMBOL(user_path_locked_at);
+
 int kern_path(const char *name, unsigned int flags, struct path *path)
 {
 	struct filename *filename = getname_kernel(name);
diff --git a/include/linux/namei.h b/include/linux/namei.h
index 3100371b5e32..74e0cc14ebf8 100644
--- a/include/linux/namei.h
+++ b/include/linux/namei.h
@@ -66,6 +66,7 @@ extern struct dentry *kern_path_create(int, const char *, struct path *, unsigne
 extern struct dentry *user_path_create(int, const char __user *, struct path *, unsigned int);
 extern void done_path_create(struct path *, struct dentry *);
 extern struct dentry *kern_path_locked(const char *, struct path *);
+extern struct dentry *user_path_locked_at(int , const char __user *, struct path *);
 int vfs_path_parent_lookup(struct filename *filename, unsigned int flags,
 			   struct path *parent, struct qstr *last, int *type,
 			   const struct path *root);
-- 
cgit v1.2.3


From 3185d57cfcd34fadbe28f4ed57a6cb5122277ece Mon Sep 17 00:00:00 2001
From: Tobias Klauser <tklauser@distanz.ch>
Date: Tue, 14 Nov 2023 11:42:02 +0100
Subject: indirect_call_wrapper: Fix typo in INDIRECT_CALL_$NR kerneldoc

Fix a small typo in the kerneldoc comment of the INDIRECT_CALL_$NR
macro.

Signed-off-by: Tobias Klauser <tklauser@distanz.ch>
Reviewed-by: Simon Horman <horms@kernel.org>
Link: https://lore.kernel.org/r/20231114104202.4680-1-tklauser@distanz.ch
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
 include/linux/indirect_call_wrapper.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/indirect_call_wrapper.h b/include/linux/indirect_call_wrapper.h
index c1c76a70a6ce..adb83a42a6b9 100644
--- a/include/linux/indirect_call_wrapper.h
+++ b/include/linux/indirect_call_wrapper.h
@@ -11,7 +11,7 @@
  *  @__VA_ARGS__: arguments for @f
  *
  * Avoid retpoline overhead for known builtin, checking @f vs each of them and
- * eventually invoking directly the builtin function. The functions are check
+ * eventually invoking directly the builtin function. The functions are checked
  * in the given order. Fallback to the indirect call.
  */
 #define INDIRECT_CALL_1(f, f1, ...)					\
-- 
cgit v1.2.3


From f7f965c982f7954b46db910146a7ffe0fe1eb5e1 Mon Sep 17 00:00:00 2001
From: Tao Zhang <quic_taozha@quicinc.com>
Date: Thu, 28 Sep 2023 14:29:36 +0800
Subject: coresight-tpdm: Introduce TPDM subtype to TPDM driver

Introduce the new subtype of "CORESIGHT_DEV_SUBTYPE_SOURCE_TPDM"
for TPDM components in driver.

Signed-off-by: Tao Zhang <quic_taozha@quicinc.com>
Signed-off-by: Suzuki K Poulose <suzuki.poulose@arm.com>
Link: https://lore.kernel.org/r/1695882586-10306-4-git-send-email-quic_taozha@quicinc.com
---
 drivers/hwtracing/coresight/coresight-core.c | 3 +++
 drivers/hwtracing/coresight/coresight-tpdm.c | 2 +-
 include/linux/coresight.h                    | 1 +
 3 files changed, 5 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/drivers/hwtracing/coresight/coresight-core.c b/drivers/hwtracing/coresight/coresight-core.c
index 9fabe00a40d6..d7f0e231feb9 100644
--- a/drivers/hwtracing/coresight/coresight-core.c
+++ b/drivers/hwtracing/coresight/coresight-core.c
@@ -1093,6 +1093,7 @@ static int coresight_validate_source(struct coresight_device *csdev,
 
 	if (subtype != CORESIGHT_DEV_SUBTYPE_SOURCE_PROC &&
 	    subtype != CORESIGHT_DEV_SUBTYPE_SOURCE_SOFTWARE &&
+	    subtype != CORESIGHT_DEV_SUBTYPE_SOURCE_TPDM &&
 	    subtype != CORESIGHT_DEV_SUBTYPE_SOURCE_OTHERS) {
 		dev_err(&csdev->dev, "wrong device subtype in %s\n", function);
 		return -EINVAL;
@@ -1162,6 +1163,7 @@ int coresight_enable(struct coresight_device *csdev)
 		per_cpu(tracer_path, cpu) = path;
 		break;
 	case CORESIGHT_DEV_SUBTYPE_SOURCE_SOFTWARE:
+	case CORESIGHT_DEV_SUBTYPE_SOURCE_TPDM:
 	case CORESIGHT_DEV_SUBTYPE_SOURCE_OTHERS:
 		/*
 		 * Use the hash of source's device name as ID
@@ -1212,6 +1214,7 @@ void coresight_disable(struct coresight_device *csdev)
 		per_cpu(tracer_path, cpu) = NULL;
 		break;
 	case CORESIGHT_DEV_SUBTYPE_SOURCE_SOFTWARE:
+	case CORESIGHT_DEV_SUBTYPE_SOURCE_TPDM:
 	case CORESIGHT_DEV_SUBTYPE_SOURCE_OTHERS:
 		hash = hashlen_hash(hashlen_string(NULL, dev_name(&csdev->dev)));
 		/* Find the path by the hash. */
diff --git a/drivers/hwtracing/coresight/coresight-tpdm.c b/drivers/hwtracing/coresight/coresight-tpdm.c
index b6456120b76a..abaff0b934db 100644
--- a/drivers/hwtracing/coresight/coresight-tpdm.c
+++ b/drivers/hwtracing/coresight/coresight-tpdm.c
@@ -203,7 +203,7 @@ static int tpdm_probe(struct amba_device *adev, const struct amba_id *id)
 	if (!desc.name)
 		return -ENOMEM;
 	desc.type = CORESIGHT_DEV_TYPE_SOURCE;
-	desc.subtype.source_subtype = CORESIGHT_DEV_SUBTYPE_SOURCE_OTHERS;
+	desc.subtype.source_subtype = CORESIGHT_DEV_SUBTYPE_SOURCE_TPDM;
 	desc.ops = &tpdm_cs_ops;
 	desc.pdata = adev->dev.platform_data;
 	desc.dev = &adev->dev;
diff --git a/include/linux/coresight.h b/include/linux/coresight.h
index a269fffaf991..a4cb7dd6ca23 100644
--- a/include/linux/coresight.h
+++ b/include/linux/coresight.h
@@ -64,6 +64,7 @@ enum coresight_dev_subtype_source {
 	CORESIGHT_DEV_SUBTYPE_SOURCE_PROC,
 	CORESIGHT_DEV_SUBTYPE_SOURCE_BUS,
 	CORESIGHT_DEV_SUBTYPE_SOURCE_SOFTWARE,
+	CORESIGHT_DEV_SUBTYPE_SOURCE_TPDM,
 	CORESIGHT_DEV_SUBTYPE_SOURCE_OTHERS,
 };
 
-- 
cgit v1.2.3


From 3fc6350fc8470d42f5e700ecd1c3d90f9dd9fd2d Mon Sep 17 00:00:00 2001
From: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Date: Mon, 13 Nov 2023 13:12:49 +0200
Subject: treewide, spi: Get rid of SPI_MASTER_HALF_DUPLEX

The SPI_MASTER_HALF_DUPLEX is the legacy name of a definition
for a half duplex flag. Since all others had been replaced with
the respective SPI_CONTROLLER prefix get rid of the last one
as well. There is no functional change intended.

Signed-off-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Acked-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Acked-by: Ulf Hansson <ulf.hansson@linaro.org> # For MMC
Acked-by: Dmitry Torokhov <dmitry.torokhov@gmail.com> # for input
Acked-by: Paolo Abeni <pabeni@redhat.com>
Link: https://lore.kernel.org/r/20231113111249.3982461-1-andriy.shevchenko@linux.intel.com
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 drivers/input/rmi4/rmi_spi.c             | 2 +-
 drivers/mmc/host/mmc_spi.c               | 2 +-
 drivers/net/ethernet/micrel/ks8851_spi.c | 4 ++--
 drivers/usb/gadget/udc/max3420_udc.c     | 2 +-
 include/linux/spi/spi.h                  | 2 --
 5 files changed, 5 insertions(+), 7 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/input/rmi4/rmi_spi.c b/drivers/input/rmi4/rmi_spi.c
index 852aeb0b2c07..07c866f42296 100644
--- a/drivers/input/rmi4/rmi_spi.c
+++ b/drivers/input/rmi4/rmi_spi.c
@@ -375,7 +375,7 @@ static int rmi_spi_probe(struct spi_device *spi)
 	struct rmi_device_platform_data *spi_pdata = spi->dev.platform_data;
 	int error;
 
-	if (spi->master->flags & SPI_MASTER_HALF_DUPLEX)
+	if (spi->master->flags & SPI_CONTROLLER_HALF_DUPLEX)
 		return -EINVAL;
 
 	rmi_spi = devm_kzalloc(&spi->dev, sizeof(struct rmi_spi_xport),
diff --git a/drivers/mmc/host/mmc_spi.c b/drivers/mmc/host/mmc_spi.c
index cc333ad67cac..b0cccef4cfbf 100644
--- a/drivers/mmc/host/mmc_spi.c
+++ b/drivers/mmc/host/mmc_spi.c
@@ -1322,7 +1322,7 @@ static int mmc_spi_probe(struct spi_device *spi)
 	/* We rely on full duplex transfers, mostly to reduce
 	 * per-transfer overheads (by making fewer transfers).
 	 */
-	if (spi->master->flags & SPI_MASTER_HALF_DUPLEX)
+	if (spi->master->flags & SPI_CONTROLLER_HALF_DUPLEX)
 		return -EINVAL;
 
 	/* MMC and SD specs only seem to care that sampling is on the
diff --git a/drivers/net/ethernet/micrel/ks8851_spi.c b/drivers/net/ethernet/micrel/ks8851_spi.c
index 70bc7253454f..7c41623dac90 100644
--- a/drivers/net/ethernet/micrel/ks8851_spi.c
+++ b/drivers/net/ethernet/micrel/ks8851_spi.c
@@ -156,7 +156,7 @@ static void ks8851_rdreg(struct ks8851_net *ks, unsigned int op,
 
 	txb[0] = cpu_to_le16(op | KS_SPIOP_RD);
 
-	if (kss->spidev->master->flags & SPI_MASTER_HALF_DUPLEX) {
+	if (kss->spidev->master->flags & SPI_CONTROLLER_HALF_DUPLEX) {
 		msg = &kss->spi_msg2;
 		xfer = kss->spi_xfer2;
 
@@ -180,7 +180,7 @@ static void ks8851_rdreg(struct ks8851_net *ks, unsigned int op,
 	ret = spi_sync(kss->spidev, msg);
 	if (ret < 0)
 		netdev_err(ks->netdev, "read: spi_sync() failed\n");
-	else if (kss->spidev->master->flags & SPI_MASTER_HALF_DUPLEX)
+	else if (kss->spidev->master->flags & SPI_CONTROLLER_HALF_DUPLEX)
 		memcpy(rxb, trx, rxl);
 	else
 		memcpy(rxb, trx + 2, rxl);
diff --git a/drivers/usb/gadget/udc/max3420_udc.c b/drivers/usb/gadget/udc/max3420_udc.c
index 2d57786d3db7..89e8cf2a2a7d 100644
--- a/drivers/usb/gadget/udc/max3420_udc.c
+++ b/drivers/usb/gadget/udc/max3420_udc.c
@@ -1201,7 +1201,7 @@ static int max3420_probe(struct spi_device *spi)
 	int err, irq;
 	u8 reg[8];
 
-	if (spi->master->flags & SPI_MASTER_HALF_DUPLEX) {
+	if (spi->master->flags & SPI_CONTROLLER_HALF_DUPLEX) {
 		dev_err(&spi->dev, "UDC needs full duplex to work\n");
 		return -EINVAL;
 	}
diff --git a/include/linux/spi/spi.h b/include/linux/spi/spi.h
index 255a0562aea5..7b4baff63c5c 100644
--- a/include/linux/spi/spi.h
+++ b/include/linux/spi/spi.h
@@ -1638,8 +1638,6 @@ spi_transfer_is_last(struct spi_controller *ctlr, struct spi_transfer *xfer)
 /* Compatibility layer */
 #define spi_master			spi_controller
 
-#define SPI_MASTER_HALF_DUPLEX		SPI_CONTROLLER_HALF_DUPLEX
-
 #define spi_master_get_devdata(_ctlr)	spi_controller_get_devdata(_ctlr)
 #define spi_master_set_devdata(_ctlr, _data)	\
 	spi_controller_set_devdata(_ctlr, _data)
-- 
cgit v1.2.3


From fac4a535758851215d23d7d92879aeee5035f51d Mon Sep 17 00:00:00 2001
From: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Date: Tue, 8 Aug 2023 19:27:56 +0300
Subject: device property: Add fwnode_property_match_property_string()

Sometimes the users want to match the single value string property
against an array of predefined strings. Create a helper for them.

Signed-off-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Acked-by: Rafael J. Wysocki <rafael@kernel.org>
Link: https://lore.kernel.org/r/20230808162800.61651-3-andriy.shevchenko@linux.intel.com
Signed-off-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
---
 drivers/base/property.c  | 35 +++++++++++++++++++++++++++++++++++
 include/linux/property.h | 12 ++++++++++++
 2 files changed, 47 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/base/property.c b/drivers/base/property.c
index 3bb9505f1631..8f8e2a6816bc 100644
--- a/drivers/base/property.c
+++ b/drivers/base/property.c
@@ -498,6 +498,41 @@ out_free:
 }
 EXPORT_SYMBOL_GPL(fwnode_property_match_string);
 
+/**
+ * fwnode_property_match_property_string - find a property string value in an array and return index
+ * @fwnode: Firmware node to get the property of
+ * @propname: Name of the property holding the string value
+ * @array: String array to search in
+ * @n: Size of the @array
+ *
+ * Find a property string value in a given @array and if it is found return
+ * the index back.
+ *
+ * Return: index, starting from %0, if the string value was found in the @array (success),
+ *	   %-ENOENT when the string value was not found in the @array,
+ *	   %-EINVAL if given arguments are not valid,
+ *	   %-ENODATA if the property does not have a value,
+ *	   %-EPROTO or %-EILSEQ if the property is not a string,
+ *	   %-ENXIO if no suitable firmware interface is present.
+ */
+int fwnode_property_match_property_string(const struct fwnode_handle *fwnode,
+	const char *propname, const char * const *array, size_t n)
+{
+	const char *string;
+	int ret;
+
+	ret = fwnode_property_read_string(fwnode, propname, &string);
+	if (ret)
+		return ret;
+
+	ret = match_string(array, n, string);
+	if (ret < 0)
+		ret = -ENOENT;
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(fwnode_property_match_property_string);
+
 /**
  * fwnode_property_get_reference_args() - Find a reference with arguments
  * @fwnode:	Firmware node where to look for the reference
diff --git a/include/linux/property.h b/include/linux/property.h
index 9f2585d705a8..2b8f07fc68a9 100644
--- a/include/linux/property.h
+++ b/include/linux/property.h
@@ -98,6 +98,18 @@ static inline bool device_is_compatible(const struct device *dev, const char *co
 	return fwnode_device_is_compatible(dev_fwnode(dev), compat);
 }
 
+int fwnode_property_match_property_string(const struct fwnode_handle *fwnode,
+					  const char *propname,
+					  const char * const *array, size_t n);
+
+static inline
+int device_property_match_property_string(const struct device *dev,
+					  const char *propname,
+					  const char * const *array, size_t n)
+{
+	return fwnode_property_match_property_string(dev_fwnode(dev), propname, array, n);
+}
+
 int fwnode_property_get_reference_args(const struct fwnode_handle *fwnode,
 				       const char *prop, const char *nargs_prop,
 				       unsigned int nargs, unsigned int index,
-- 
cgit v1.2.3


From 96fa96e198f9707285003075fbbce7db6a485112 Mon Sep 17 00:00:00 2001
From: "Russell King (Oracle)" <rmk+kernel@armlinux.org.uk>
Date: Wed, 15 Nov 2023 11:39:18 +0000
Subject: net: linkmode: add linkmode_fill() helper

Add a linkmode_fill() helper, which will allow us to convert phylink's
open coded bitmap_fill() operations.

Signed-off-by: Russell King (Oracle) <rmk+kernel@armlinux.org.uk>
Reviewed-by: Andrew Lunn <andrew@lunn.ch>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/linkmode.h | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/linkmode.h b/include/linux/linkmode.h
index 7303b4bc2ce0..287f590ed56b 100644
--- a/include/linux/linkmode.h
+++ b/include/linux/linkmode.h
@@ -10,6 +10,11 @@ static inline void linkmode_zero(unsigned long *dst)
 	bitmap_zero(dst, __ETHTOOL_LINK_MODE_MASK_NBITS);
 }
 
+static inline void linkmode_fill(unsigned long *dst)
+{
+	bitmap_fill(dst, __ETHTOOL_LINK_MODE_MASK_NBITS);
+}
+
 static inline void linkmode_copy(unsigned long *dst, const unsigned long *src)
 {
 	bitmap_copy(dst, src, __ETHTOOL_LINK_MODE_MASK_NBITS);
-- 
cgit v1.2.3


From 9e0be3f50c0e8517d0238b62409c20bcb8cd8785 Mon Sep 17 00:00:00 2001
From: Lukas Bulwahn <lukas.bulwahn@gmail.com>
Date: Fri, 10 Nov 2023 13:07:22 +0100
Subject: linux/export: clean up the IA-64 KSYM_FUNC macro

With commit cf8e8658100d ("arch: Remove Itanium (IA-64) architecture"),
there is no need to keep the IA-64 definition of the KSYM_FUNC macro.

Clean up the IA-64 definition of the KSYM_FUNC macro.

Signed-off-by: Lukas Bulwahn <lukas.bulwahn@gmail.com>
Reviewed-by: Nathan Chancellor <nathan@kernel.org>
Signed-off-by: Masahiro Yamada <masahiroy@kernel.org>
---
 include/linux/export-internal.h | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/export-internal.h b/include/linux/export-internal.h
index 45fca09b2319..69501e0ec239 100644
--- a/include/linux/export-internal.h
+++ b/include/linux/export-internal.h
@@ -50,9 +50,7 @@
 	    "	.previous"						"\n"	\
 	)
 
-#ifdef CONFIG_IA64
-#define KSYM_FUNC(name)		@fptr(name)
-#elif defined(CONFIG_PARISC) && defined(CONFIG_64BIT)
+#if defined(CONFIG_PARISC) && defined(CONFIG_64BIT)
 #define KSYM_FUNC(name)		P%name
 #else
 #define KSYM_FUNC(name)		name
-- 
cgit v1.2.3


From a9214a8883ceb82df55aa90d1c49ddb85fc1e3d5 Mon Sep 17 00:00:00 2001
From: Etienne Carriere <etienne.carriere@foss.st.com>
Date: Mon, 30 Oct 2023 09:48:10 +0100
Subject: tee: system session

Adds kernel client API function tee_client_system_session() for a client
to request a system service entry in TEE context.

This feature is needed to prevent a system deadlock when several TEE
client applications invoke TEE, consuming all TEE thread contexts
available in the secure world. The deadlock can happen in the OP-TEE
driver for example if all these TEE threads issue an RPC call from TEE
to Linux OS to access an eMMC RPMB partition (TEE secure storage) which
device clock or regulator controller is accessed through an OP-TEE SCMI
services. In that case, Linux SCMI driver must reach OP-TEE SCMI service
without waiting until one of the consumed TEE threads is freed.

Reviewed-by: Sumit Garg <sumit.garg@linaro.org>
Co-developed-by: Jens Wiklander <jens.wiklander@linaro.org>
Signed-off-by: Etienne Carriere <etienne.carriere@foss.st.com>
Signed-off-by: Jens Wiklander <jens.wiklander@linaro.org>
---
 drivers/tee/tee_core.c  |  8 ++++++++
 include/linux/tee_drv.h | 16 ++++++++++++++++
 2 files changed, 24 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/tee/tee_core.c b/drivers/tee/tee_core.c
index 5ddfd5d9ac7f..792d6fae4354 100644
--- a/drivers/tee/tee_core.c
+++ b/drivers/tee/tee_core.c
@@ -1173,6 +1173,14 @@ int tee_client_close_session(struct tee_context *ctx, u32 session)
 }
 EXPORT_SYMBOL_GPL(tee_client_close_session);
 
+int tee_client_system_session(struct tee_context *ctx, u32 session)
+{
+	if (!ctx->teedev->desc->ops->system_session)
+		return -EINVAL;
+	return ctx->teedev->desc->ops->system_session(ctx, session);
+}
+EXPORT_SYMBOL_GPL(tee_client_system_session);
+
 int tee_client_invoke_func(struct tee_context *ctx,
 			   struct tee_ioctl_invoke_arg *arg,
 			   struct tee_param *param)
diff --git a/include/linux/tee_drv.h b/include/linux/tee_drv.h
index 17eb1c5205d3..911ddf92dcee 100644
--- a/include/linux/tee_drv.h
+++ b/include/linux/tee_drv.h
@@ -84,6 +84,7 @@ struct tee_param {
  * @release:		release this open file
  * @open_session:	open a new session
  * @close_session:	close a session
+ * @system_session:	declare session as a system session
  * @invoke_func:	invoke a trusted function
  * @cancel_req:		request cancel of an ongoing invoke or open
  * @supp_recv:		called for supplicant to get a command
@@ -100,6 +101,7 @@ struct tee_driver_ops {
 			    struct tee_ioctl_open_session_arg *arg,
 			    struct tee_param *param);
 	int (*close_session)(struct tee_context *ctx, u32 session);
+	int (*system_session)(struct tee_context *ctx, u32 session);
 	int (*invoke_func)(struct tee_context *ctx,
 			   struct tee_ioctl_invoke_arg *arg,
 			   struct tee_param *param);
@@ -429,6 +431,20 @@ int tee_client_open_session(struct tee_context *ctx,
  */
 int tee_client_close_session(struct tee_context *ctx, u32 session);
 
+/**
+ * tee_client_system_session() - Declare session as a system session
+ * @ctx:	TEE Context
+ * @session:	Session id
+ *
+ * This function requests TEE to provision an entry context ready to use for
+ * that session only. The provisioned entry context is used for command
+ * invocation and session closure, not for command cancelling requests.
+ * TEE releases the provisioned context upon session closure.
+ *
+ * Return < 0 on error else 0 if an entry context has been provisioned.
+ */
+int tee_client_system_session(struct tee_context *ctx, u32 session);
+
 /**
  * tee_client_invoke_func() - Invoke a function in a Trusted Application
  * @ctx:	TEE Context
-- 
cgit v1.2.3


From 5e4166461cf66a26f925011d90017da74e410747 Mon Sep 17 00:00:00 2001
From: Yang Hubin <yanghb2019@lzu.edu.cn>
Date: Sat, 4 Nov 2023 00:45:01 -0700
Subject: f2fs: the name of a struct is wrong in a comment.

The macro SUMMARY_SIZE represents the size of the struct f2fs_summary,

instead of the size of the struct summary.

Signed-off-by: Yang Hubin <yanghb2019@lzu.edu.cn>
Signed-off-by: Qian Haolai <qianhl2023@lzu.edu.cn>
Reviewed-by: Chao Yu <chao@kernel.org>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
---
 include/linux/f2fs_fs.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/f2fs_fs.h b/include/linux/f2fs_fs.h
index 039fe0ce8d83..053137a0fe45 100644
--- a/include/linux/f2fs_fs.h
+++ b/include/linux/f2fs_fs.h
@@ -441,7 +441,7 @@ struct f2fs_sit_block {
  * ex) data_blkaddr = (block_t)(nodepage_start_address + ofs_in_node)
  */
 #define ENTRIES_IN_SUM		(F2FS_BLKSIZE / 8)
-#define	SUMMARY_SIZE		(7)	/* sizeof(struct summary) */
+#define	SUMMARY_SIZE		(7)	/* sizeof(struct f2fs_summary) */
 #define	SUM_FOOTER_SIZE		(5)	/* sizeof(struct summary_footer) */
 #define SUM_ENTRY_SIZE		(SUMMARY_SIZE * ENTRIES_IN_SUM)
 
-- 
cgit v1.2.3


From ff8867af01daa7ea770bebf5f91199b7434b74e5 Mon Sep 17 00:00:00 2001
From: Andrii Nakryiko <andrii@kernel.org>
Date: Fri, 17 Nov 2023 09:14:04 -0800
Subject: bpf: rename BPF_F_TEST_SANITY_STRICT to BPF_F_TEST_REG_INVARIANTS

Rename verifier internal flag BPF_F_TEST_SANITY_STRICT to more neutral
BPF_F_TEST_REG_INVARIANTS. This is a follow up to [0].

A few selftests and veristat need to be adjusted in the same patch as
well.

  [0] https://patchwork.kernel.org/project/netdevbpf/patch/20231112010609.848406-5-andrii@kernel.org/

Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/r/20231117171404.225508-1-andrii@kernel.org
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/bpf_verifier.h                             |  2 +-
 include/uapi/linux/bpf.h                                 |  2 +-
 kernel/bpf/syscall.c                                     |  2 +-
 kernel/bpf/verifier.c                                    |  6 +++---
 tools/include/uapi/linux/bpf.h                           |  2 +-
 tools/testing/selftests/bpf/prog_tests/bpf_verif_scale.c |  2 +-
 tools/testing/selftests/bpf/prog_tests/reg_bounds.c      |  2 +-
 tools/testing/selftests/bpf/progs/verifier_bounds.c      |  4 ++--
 tools/testing/selftests/bpf/test_loader.c                |  6 +++---
 tools/testing/selftests/bpf/test_sock_addr.c             |  3 +--
 tools/testing/selftests/bpf/test_verifier.c              |  2 +-
 tools/testing/selftests/bpf/testing_helpers.c            |  4 ++--
 tools/testing/selftests/bpf/veristat.c                   | 12 ++++++------
 13 files changed, 24 insertions(+), 25 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h
index 402b6bc44a1b..52a4012b8255 100644
--- a/include/linux/bpf_verifier.h
+++ b/include/linux/bpf_verifier.h
@@ -602,7 +602,7 @@ struct bpf_verifier_env {
 	int stack_size;			/* number of states to be processed */
 	bool strict_alignment;		/* perform strict pointer alignment checks */
 	bool test_state_freq;		/* test verifier with different pruning frequency */
-	bool test_sanity_strict;	/* fail verification on sanity violations */
+	bool test_reg_invariants;	/* fail verification on register invariants violations */
 	struct bpf_verifier_state *cur_state; /* current verifier state */
 	struct bpf_verifier_state_list **explored_states; /* search pruning optimization */
 	struct bpf_verifier_state_list *free_list;
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 8a5855fcee69..7a5498242eaa 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -1201,7 +1201,7 @@ enum bpf_perf_event_type {
 #define BPF_F_XDP_DEV_BOUND_ONLY	(1U << 6)
 
 /* The verifier internal test flag. Behavior is undefined */
-#define BPF_F_TEST_SANITY_STRICT	(1U << 7)
+#define BPF_F_TEST_REG_INVARIANTS	(1U << 7)
 
 /* link_create.kprobe_multi.flags used in LINK_CREATE command for
  * BPF_TRACE_KPROBE_MULTI attach type to create return probe.
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index f266e03ba342..5e43ddd1b83f 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -2574,7 +2574,7 @@ static int bpf_prog_load(union bpf_attr *attr, bpfptr_t uattr, u32 uattr_size)
 				 BPF_F_TEST_RND_HI32 |
 				 BPF_F_XDP_HAS_FRAGS |
 				 BPF_F_XDP_DEV_BOUND_ONLY |
-				 BPF_F_TEST_SANITY_STRICT))
+				 BPF_F_TEST_REG_INVARIANTS))
 		return -EINVAL;
 
 	if (!IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) &&
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 59505881e7a7..7c3461b89513 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -2608,14 +2608,14 @@ static int reg_bounds_sanity_check(struct bpf_verifier_env *env,
 
 	return 0;
 out:
-	verbose(env, "REG SANITY VIOLATION (%s): %s u64=[%#llx, %#llx] "
+	verbose(env, "REG INVARIANTS VIOLATION (%s): %s u64=[%#llx, %#llx] "
 		"s64=[%#llx, %#llx] u32=[%#x, %#x] s32=[%#x, %#x] var_off=(%#llx, %#llx)\n",
 		ctx, msg, reg->umin_value, reg->umax_value,
 		reg->smin_value, reg->smax_value,
 		reg->u32_min_value, reg->u32_max_value,
 		reg->s32_min_value, reg->s32_max_value,
 		reg->var_off.value, reg->var_off.mask);
-	if (env->test_sanity_strict)
+	if (env->test_reg_invariants)
 		return -EFAULT;
 	__mark_reg_unbounded(reg);
 	return 0;
@@ -20791,7 +20791,7 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, bpfptr_t uattr, __u3
 
 	if (is_priv)
 		env->test_state_freq = attr->prog_flags & BPF_F_TEST_STATE_FREQ;
-	env->test_sanity_strict = attr->prog_flags & BPF_F_TEST_SANITY_STRICT;
+	env->test_reg_invariants = attr->prog_flags & BPF_F_TEST_REG_INVARIANTS;
 
 	env->explored_states = kvcalloc(state_htab_size(env),
 				       sizeof(struct bpf_verifier_state_list *),
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index 8a5855fcee69..7a5498242eaa 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -1201,7 +1201,7 @@ enum bpf_perf_event_type {
 #define BPF_F_XDP_DEV_BOUND_ONLY	(1U << 6)
 
 /* The verifier internal test flag. Behavior is undefined */
-#define BPF_F_TEST_SANITY_STRICT	(1U << 7)
+#define BPF_F_TEST_REG_INVARIANTS	(1U << 7)
 
 /* link_create.kprobe_multi.flags used in LINK_CREATE command for
  * BPF_TRACE_KPROBE_MULTI attach type to create return probe.
diff --git a/tools/testing/selftests/bpf/prog_tests/bpf_verif_scale.c b/tools/testing/selftests/bpf/prog_tests/bpf_verif_scale.c
index 3f2d70831873..e770912fc1d2 100644
--- a/tools/testing/selftests/bpf/prog_tests/bpf_verif_scale.c
+++ b/tools/testing/selftests/bpf/prog_tests/bpf_verif_scale.c
@@ -35,7 +35,7 @@ static int check_load(const char *file, enum bpf_prog_type type)
 	}
 
 	bpf_program__set_type(prog, type);
-	bpf_program__set_flags(prog, BPF_F_TEST_RND_HI32 | BPF_F_TEST_SANITY_STRICT);
+	bpf_program__set_flags(prog, BPF_F_TEST_RND_HI32 | BPF_F_TEST_REG_INVARIANTS);
 	bpf_program__set_log_level(prog, 4 | extra_prog_load_log_flags);
 
 	err = bpf_object__load(obj);
diff --git a/tools/testing/selftests/bpf/prog_tests/reg_bounds.c b/tools/testing/selftests/bpf/prog_tests/reg_bounds.c
index fe0cb906644b..7a8b0bf0a7f8 100644
--- a/tools/testing/selftests/bpf/prog_tests/reg_bounds.c
+++ b/tools/testing/selftests/bpf/prog_tests/reg_bounds.c
@@ -838,7 +838,7 @@ static int load_range_cmp_prog(struct range x, struct range y, enum op op,
 		.log_level = 2,
 		.log_buf = log_buf,
 		.log_size = log_sz,
-		.prog_flags = BPF_F_TEST_SANITY_STRICT,
+		.prog_flags = BPF_F_TEST_REG_INVARIANTS,
 	);
 
 	/* ; skip exit block below
diff --git a/tools/testing/selftests/bpf/progs/verifier_bounds.c b/tools/testing/selftests/bpf/progs/verifier_bounds.c
index 0c1460936373..ec430b71730b 100644
--- a/tools/testing/selftests/bpf/progs/verifier_bounds.c
+++ b/tools/testing/selftests/bpf/progs/verifier_bounds.c
@@ -965,7 +965,7 @@ l0_%=:	r0 = 0;						\
 SEC("xdp")
 __description("bound check with JMP_JSLT for crossing 64-bit signed boundary")
 __success __retval(0)
-__flag(!BPF_F_TEST_SANITY_STRICT) /* known sanity violation */
+__flag(!BPF_F_TEST_REG_INVARIANTS) /* known invariants violation */
 __naked void crossing_64_bit_signed_boundary_2(void)
 {
 	asm volatile ("					\
@@ -1047,7 +1047,7 @@ l0_%=:	r0 = 0;						\
 SEC("xdp")
 __description("bound check with JMP32_JSLT for crossing 32-bit signed boundary")
 __success __retval(0)
-__flag(!BPF_F_TEST_SANITY_STRICT) /* known sanity violation */
+__flag(!BPF_F_TEST_REG_INVARIANTS) /* known invariants violation */
 __naked void crossing_32_bit_signed_boundary_2(void)
 {
 	asm volatile ("					\
diff --git a/tools/testing/selftests/bpf/test_loader.c b/tools/testing/selftests/bpf/test_loader.c
index 57e27b1a73a6..a350ecdfba4a 100644
--- a/tools/testing/selftests/bpf/test_loader.c
+++ b/tools/testing/selftests/bpf/test_loader.c
@@ -179,7 +179,7 @@ static int parse_test_spec(struct test_loader *tester,
 	memset(spec, 0, sizeof(*spec));
 
 	spec->prog_name = bpf_program__name(prog);
-	spec->prog_flags = BPF_F_TEST_SANITY_STRICT; /* by default be strict */
+	spec->prog_flags = BPF_F_TEST_REG_INVARIANTS; /* by default be strict */
 
 	btf = bpf_object__btf(obj);
 	if (!btf) {
@@ -280,8 +280,8 @@ static int parse_test_spec(struct test_loader *tester,
 				update_flags(&spec->prog_flags, BPF_F_SLEEPABLE, clear);
 			} else if (strcmp(val, "BPF_F_XDP_HAS_FRAGS") == 0) {
 				update_flags(&spec->prog_flags, BPF_F_XDP_HAS_FRAGS, clear);
-			} else if (strcmp(val, "BPF_F_TEST_SANITY_STRICT") == 0) {
-				update_flags(&spec->prog_flags, BPF_F_TEST_SANITY_STRICT, clear);
+			} else if (strcmp(val, "BPF_F_TEST_REG_INVARIANTS") == 0) {
+				update_flags(&spec->prog_flags, BPF_F_TEST_REG_INVARIANTS, clear);
 			} else /* assume numeric value */ {
 				err = parse_int(val, &flags, "test prog flags");
 				if (err)
diff --git a/tools/testing/selftests/bpf/test_sock_addr.c b/tools/testing/selftests/bpf/test_sock_addr.c
index 878c077e0fa7..b0068a9d2cfe 100644
--- a/tools/testing/selftests/bpf/test_sock_addr.c
+++ b/tools/testing/selftests/bpf/test_sock_addr.c
@@ -679,8 +679,7 @@ static int load_path(const struct sock_addr_test *test, const char *path)
 
 	bpf_program__set_type(prog, BPF_PROG_TYPE_CGROUP_SOCK_ADDR);
 	bpf_program__set_expected_attach_type(prog, test->expected_attach_type);
-	bpf_program__set_flags(prog, BPF_F_TEST_RND_HI32);
-	bpf_program__set_flags(prog, BPF_F_TEST_SANITY_STRICT);
+	bpf_program__set_flags(prog, BPF_F_TEST_RND_HI32 | BPF_F_TEST_REG_INVARIANTS);
 
 	err = bpf_object__load(obj);
 	if (err) {
diff --git a/tools/testing/selftests/bpf/test_verifier.c b/tools/testing/selftests/bpf/test_verifier.c
index 4992022f3137..f36e41435be7 100644
--- a/tools/testing/selftests/bpf/test_verifier.c
+++ b/tools/testing/selftests/bpf/test_verifier.c
@@ -1588,7 +1588,7 @@ static void do_test_single(struct bpf_test *test, bool unpriv,
 	if (fixup_skips != skips)
 		return;
 
-	pflags = BPF_F_TEST_RND_HI32 | BPF_F_TEST_SANITY_STRICT;
+	pflags = BPF_F_TEST_RND_HI32 | BPF_F_TEST_REG_INVARIANTS;
 	if (test->flags & F_LOAD_WITH_STRICT_ALIGNMENT)
 		pflags |= BPF_F_STRICT_ALIGNMENT;
 	if (test->flags & F_NEEDS_EFFICIENT_UNALIGNED_ACCESS)
diff --git a/tools/testing/selftests/bpf/testing_helpers.c b/tools/testing/selftests/bpf/testing_helpers.c
index 9786a94a666c..d2458c1b1671 100644
--- a/tools/testing/selftests/bpf/testing_helpers.c
+++ b/tools/testing/selftests/bpf/testing_helpers.c
@@ -276,7 +276,7 @@ int bpf_prog_test_load(const char *file, enum bpf_prog_type type,
 	if (type != BPF_PROG_TYPE_UNSPEC && bpf_program__type(prog) != type)
 		bpf_program__set_type(prog, type);
 
-	flags = bpf_program__flags(prog) | BPF_F_TEST_RND_HI32 | BPF_F_TEST_SANITY_STRICT;
+	flags = bpf_program__flags(prog) | BPF_F_TEST_RND_HI32 | BPF_F_TEST_REG_INVARIANTS;
 	bpf_program__set_flags(prog, flags);
 
 	err = bpf_object__load(obj);
@@ -299,7 +299,7 @@ int bpf_test_load_program(enum bpf_prog_type type, const struct bpf_insn *insns,
 {
 	LIBBPF_OPTS(bpf_prog_load_opts, opts,
 		.kern_version = kern_version,
-		.prog_flags = BPF_F_TEST_RND_HI32 | BPF_F_TEST_SANITY_STRICT,
+		.prog_flags = BPF_F_TEST_RND_HI32 | BPF_F_TEST_REG_INVARIANTS,
 		.log_level = extra_prog_load_log_flags,
 		.log_buf = log_buf,
 		.log_size = log_buf_sz,
diff --git a/tools/testing/selftests/bpf/veristat.c b/tools/testing/selftests/bpf/veristat.c
index 609fd9753af0..1d418d66e375 100644
--- a/tools/testing/selftests/bpf/veristat.c
+++ b/tools/testing/selftests/bpf/veristat.c
@@ -145,7 +145,7 @@ static struct env {
 	bool debug;
 	bool quiet;
 	bool force_checkpoints;
-	bool strict_range_sanity;
+	bool force_reg_invariants;
 	enum resfmt out_fmt;
 	bool show_version;
 	bool comparison_mode;
@@ -225,8 +225,8 @@ static const struct argp_option opts[] = {
 	{ "filter", 'f', "FILTER", 0, "Filter expressions (or @filename for file with expressions)." },
 	{ "test-states", 't', NULL, 0,
 	  "Force frequent BPF verifier state checkpointing (set BPF_F_TEST_STATE_FREQ program flag)" },
-	{ "test-sanity", 'r', NULL, 0,
-	  "Force strict BPF verifier register sanity behavior (BPF_F_TEST_SANITY_STRICT program flag)" },
+	{ "test-reg-invariants", 'r', NULL, 0,
+	  "Force BPF verifier failure on register invariant violation (BPF_F_TEST_REG_INVARIANTS program flag)" },
 	{},
 };
 
@@ -299,7 +299,7 @@ static error_t parse_arg(int key, char *arg, struct argp_state *state)
 		env.force_checkpoints = true;
 		break;
 	case 'r':
-		env.strict_range_sanity = true;
+		env.force_reg_invariants = true;
 		break;
 	case 'n':
 		errno = 0;
@@ -1028,8 +1028,8 @@ static int process_prog(const char *filename, struct bpf_object *obj, struct bpf
 
 	if (env.force_checkpoints)
 		bpf_program__set_flags(prog, bpf_program__flags(prog) | BPF_F_TEST_STATE_FREQ);
-	if (env.strict_range_sanity)
-		bpf_program__set_flags(prog, bpf_program__flags(prog) | BPF_F_TEST_SANITY_STRICT);
+	if (env.force_reg_invariants)
+		bpf_program__set_flags(prog, bpf_program__flags(prog) | BPF_F_TEST_REG_INVARIANTS);
 
 	err = bpf_object__load(obj);
 	env.progs_processed++;
-- 
cgit v1.2.3


From 2eea9ce4310d8c0f8ef1dbe7b0e7d9219ff02b97 Mon Sep 17 00:00:00 2001
From: Miklos Szeredi <mszeredi@redhat.com>
Date: Wed, 25 Oct 2023 16:02:00 +0200
Subject: mounts: keep list of mounts in an rbtree

When adding a mount to a namespace insert it into an rbtree rooted in the
mnt_namespace instead of a linear list.

The mnt.mnt_list is still used to set up the mount tree and for
propagation, but not after the mount has been added to a namespace.  Hence
mnt_list can live in union with rb_node.  Use MNT_ONRB mount flag to
validate that the mount is on the correct list.

This allows removing the cursor used for reading /proc/$PID/mountinfo.  The
mnt_id_unique of the next mount can be used as an index into the seq file.

Tested by inserting 100k bind mounts, unsharing the mount namespace, and
unmounting.  No performance regressions have been observed.

For the last mount in the 100k list the statmount() call was more than 100x
faster due to the mount ID lookup not having to do a linear search.  This
patch makes the overhead of mount ID lookup non-observable in this range.

Signed-off-by: Miklos Szeredi <mszeredi@redhat.com>
Link: https://lore.kernel.org/r/20231025140205.3586473-3-mszeredi@redhat.com
Reviewed-by: Ian Kent <raven@themaw.net>
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 fs/mount.h            |  24 ++++---
 fs/namespace.c        | 190 +++++++++++++++++++++++---------------------------
 fs/pnode.c            |   2 +-
 fs/proc_namespace.c   |   3 -
 include/linux/mount.h |   5 +-
 5 files changed, 106 insertions(+), 118 deletions(-)

(limited to 'include/linux')

diff --git a/fs/mount.h b/fs/mount.h
index a14f762b3f29..4a42fc68f4cc 100644
--- a/fs/mount.h
+++ b/fs/mount.h
@@ -8,19 +8,13 @@
 struct mnt_namespace {
 	struct ns_common	ns;
 	struct mount *	root;
-	/*
-	 * Traversal and modification of .list is protected by either
-	 * - taking namespace_sem for write, OR
-	 * - taking namespace_sem for read AND taking .ns_lock.
-	 */
-	struct list_head	list;
-	spinlock_t		ns_lock;
+	struct rb_root		mounts; /* Protected by namespace_sem */
 	struct user_namespace	*user_ns;
 	struct ucounts		*ucounts;
 	u64			seq;	/* Sequence number to prevent loops */
 	wait_queue_head_t poll;
 	u64 event;
-	unsigned int		mounts; /* # of mounts in the namespace */
+	unsigned int		nr_mounts; /* # of mounts in the namespace */
 	unsigned int		pending_mounts;
 } __randomize_layout;
 
@@ -55,7 +49,10 @@ struct mount {
 	struct list_head mnt_child;	/* and going through their mnt_child */
 	struct list_head mnt_instance;	/* mount instance on sb->s_mounts */
 	const char *mnt_devname;	/* Name of device e.g. /dev/dsk/hda1 */
-	struct list_head mnt_list;
+	union {
+		struct rb_node mnt_node;	/* Under ns->mounts */
+		struct list_head mnt_list;
+	};
 	struct list_head mnt_expire;	/* link in fs-specific expiry list */
 	struct list_head mnt_share;	/* circular list of shared mounts */
 	struct list_head mnt_slave_list;/* list of slave mounts */
@@ -128,7 +125,6 @@ struct proc_mounts {
 	struct mnt_namespace *ns;
 	struct path root;
 	int (*show)(struct seq_file *, struct vfsmount *);
-	struct mount cursor;
 };
 
 extern const struct seq_operations mounts_op;
@@ -147,4 +143,12 @@ static inline bool is_anon_ns(struct mnt_namespace *ns)
 	return ns->seq == 0;
 }
 
+static inline void move_from_ns(struct mount *mnt, struct list_head *dt_list)
+{
+	WARN_ON(!(mnt->mnt.mnt_flags & MNT_ONRB));
+	mnt->mnt.mnt_flags &= ~MNT_ONRB;
+	rb_erase(&mnt->mnt_node, &mnt->mnt_ns->mounts);
+	list_add_tail(&mnt->mnt_list, dt_list);
+}
+
 extern void mnt_cursor_del(struct mnt_namespace *ns, struct mount *cursor);
diff --git a/fs/namespace.c b/fs/namespace.c
index 0bcba81402b5..bbe94096e262 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -734,21 +734,6 @@ struct vfsmount *lookup_mnt(const struct path *path)
 	return m;
 }
 
-static inline void lock_ns_list(struct mnt_namespace *ns)
-{
-	spin_lock(&ns->ns_lock);
-}
-
-static inline void unlock_ns_list(struct mnt_namespace *ns)
-{
-	spin_unlock(&ns->ns_lock);
-}
-
-static inline bool mnt_is_cursor(struct mount *mnt)
-{
-	return mnt->mnt.mnt_flags & MNT_CURSOR;
-}
-
 /*
  * __is_local_mountpoint - Test to see if dentry is a mountpoint in the
  *                         current mount namespace.
@@ -767,19 +752,15 @@ static inline bool mnt_is_cursor(struct mount *mnt)
 bool __is_local_mountpoint(struct dentry *dentry)
 {
 	struct mnt_namespace *ns = current->nsproxy->mnt_ns;
-	struct mount *mnt;
+	struct mount *mnt, *n;
 	bool is_covered = false;
 
 	down_read(&namespace_sem);
-	lock_ns_list(ns);
-	list_for_each_entry(mnt, &ns->list, mnt_list) {
-		if (mnt_is_cursor(mnt))
-			continue;
+	rbtree_postorder_for_each_entry_safe(mnt, n, &ns->mounts, mnt_node) {
 		is_covered = (mnt->mnt_mountpoint == dentry);
 		if (is_covered)
 			break;
 	}
-	unlock_ns_list(ns);
 	up_read(&namespace_sem);
 
 	return is_covered;
@@ -1026,6 +1007,30 @@ void mnt_change_mountpoint(struct mount *parent, struct mountpoint *mp, struct m
 	mnt_add_count(old_parent, -1);
 }
 
+static inline struct mount *node_to_mount(struct rb_node *node)
+{
+	return rb_entry(node, struct mount, mnt_node);
+}
+
+static void mnt_add_to_ns(struct mnt_namespace *ns, struct mount *mnt)
+{
+	struct rb_node **link = &ns->mounts.rb_node;
+	struct rb_node *parent = NULL;
+
+	WARN_ON(mnt->mnt.mnt_flags & MNT_ONRB);
+	mnt->mnt_ns = ns;
+	while (*link) {
+		parent = *link;
+		if (mnt->mnt_id_unique < node_to_mount(parent)->mnt_id_unique)
+			link = &parent->rb_left;
+		else
+			link = &parent->rb_right;
+	}
+	rb_link_node(&mnt->mnt_node, parent, link);
+	rb_insert_color(&mnt->mnt_node, &ns->mounts);
+	mnt->mnt.mnt_flags |= MNT_ONRB;
+}
+
 /*
  * vfsmount lock must be held for write
  */
@@ -1039,12 +1044,13 @@ static void commit_tree(struct mount *mnt)
 	BUG_ON(parent == mnt);
 
 	list_add_tail(&head, &mnt->mnt_list);
-	list_for_each_entry(m, &head, mnt_list)
-		m->mnt_ns = n;
+	while (!list_empty(&head)) {
+		m = list_first_entry(&head, typeof(*m), mnt_list);
+		list_del(&m->mnt_list);
 
-	list_splice(&head, n->list.prev);
-
-	n->mounts += n->pending_mounts;
+		mnt_add_to_ns(n, m);
+	}
+	n->nr_mounts += n->pending_mounts;
 	n->pending_mounts = 0;
 
 	__attach_mnt(mnt, parent);
@@ -1192,7 +1198,7 @@ static struct mount *clone_mnt(struct mount *old, struct dentry *root,
 	}
 
 	mnt->mnt.mnt_flags = old->mnt.mnt_flags;
-	mnt->mnt.mnt_flags &= ~(MNT_WRITE_HOLD|MNT_MARKED|MNT_INTERNAL);
+	mnt->mnt.mnt_flags &= ~(MNT_WRITE_HOLD|MNT_MARKED|MNT_INTERNAL|MNT_ONRB);
 
 	atomic_inc(&sb->s_active);
 	mnt->mnt.mnt_idmap = mnt_idmap_get(mnt_idmap(&old->mnt));
@@ -1417,65 +1423,57 @@ struct vfsmount *mnt_clone_internal(const struct path *path)
 	return &p->mnt;
 }
 
-#ifdef CONFIG_PROC_FS
-static struct mount *mnt_list_next(struct mnt_namespace *ns,
-				   struct list_head *p)
+/*
+ * Returns the mount which either has the specified mnt_id, or has the next
+ * smallest id afer the specified one.
+ */
+static struct mount *mnt_find_id_at(struct mnt_namespace *ns, u64 mnt_id)
 {
-	struct mount *mnt, *ret = NULL;
+	struct rb_node *node = ns->mounts.rb_node;
+	struct mount *ret = NULL;
 
-	lock_ns_list(ns);
-	list_for_each_continue(p, &ns->list) {
-		mnt = list_entry(p, typeof(*mnt), mnt_list);
-		if (!mnt_is_cursor(mnt)) {
-			ret = mnt;
-			break;
+	while (node) {
+		struct mount *m = node_to_mount(node);
+
+		if (mnt_id <= m->mnt_id_unique) {
+			ret = node_to_mount(node);
+			if (mnt_id == m->mnt_id_unique)
+				break;
+			node = node->rb_left;
+		} else {
+			node = node->rb_right;
 		}
 	}
-	unlock_ns_list(ns);
-
 	return ret;
 }
 
+#ifdef CONFIG_PROC_FS
+
 /* iterator; we want it to have access to namespace_sem, thus here... */
 static void *m_start(struct seq_file *m, loff_t *pos)
 {
 	struct proc_mounts *p = m->private;
-	struct list_head *prev;
 
 	down_read(&namespace_sem);
-	if (!*pos) {
-		prev = &p->ns->list;
-	} else {
-		prev = &p->cursor.mnt_list;
 
-		/* Read after we'd reached the end? */
-		if (list_empty(prev))
-			return NULL;
-	}
-
-	return mnt_list_next(p->ns, prev);
+	return mnt_find_id_at(p->ns, *pos);
 }
 
 static void *m_next(struct seq_file *m, void *v, loff_t *pos)
 {
-	struct proc_mounts *p = m->private;
-	struct mount *mnt = v;
+	struct mount *next = NULL, *mnt = v;
+	struct rb_node *node = rb_next(&mnt->mnt_node);
 
 	++*pos;
-	return mnt_list_next(p->ns, &mnt->mnt_list);
+	if (node) {
+		next = node_to_mount(node);
+		*pos = next->mnt_id_unique;
+	}
+	return next;
 }
 
 static void m_stop(struct seq_file *m, void *v)
 {
-	struct proc_mounts *p = m->private;
-	struct mount *mnt = v;
-
-	lock_ns_list(p->ns);
-	if (mnt)
-		list_move_tail(&p->cursor.mnt_list, &mnt->mnt_list);
-	else
-		list_del_init(&p->cursor.mnt_list);
-	unlock_ns_list(p->ns);
 	up_read(&namespace_sem);
 }
 
@@ -1493,14 +1491,6 @@ const struct seq_operations mounts_op = {
 	.show	= m_show,
 };
 
-void mnt_cursor_del(struct mnt_namespace *ns, struct mount *cursor)
-{
-	down_read(&namespace_sem);
-	lock_ns_list(ns);
-	list_del(&cursor->mnt_list);
-	unlock_ns_list(ns);
-	up_read(&namespace_sem);
-}
 #endif  /* CONFIG_PROC_FS */
 
 /**
@@ -1642,7 +1632,10 @@ static void umount_tree(struct mount *mnt, enum umount_tree_flags how)
 	/* Gather the mounts to umount */
 	for (p = mnt; p; p = next_mnt(p, mnt)) {
 		p->mnt.mnt_flags |= MNT_UMOUNT;
-		list_move(&p->mnt_list, &tmp_list);
+		if (p->mnt.mnt_flags & MNT_ONRB)
+			move_from_ns(p, &tmp_list);
+		else
+			list_move(&p->mnt_list, &tmp_list);
 	}
 
 	/* Hide the mounts from mnt_mounts */
@@ -1662,7 +1655,7 @@ static void umount_tree(struct mount *mnt, enum umount_tree_flags how)
 		list_del_init(&p->mnt_list);
 		ns = p->mnt_ns;
 		if (ns) {
-			ns->mounts--;
+			ns->nr_mounts--;
 			__touch_mnt_namespace(ns);
 		}
 		p->mnt_ns = NULL;
@@ -1788,14 +1781,16 @@ static int do_umount(struct mount *mnt, int flags)
 
 	event++;
 	if (flags & MNT_DETACH) {
-		if (!list_empty(&mnt->mnt_list))
+		if (mnt->mnt.mnt_flags & MNT_ONRB ||
+		    !list_empty(&mnt->mnt_list))
 			umount_tree(mnt, UMOUNT_PROPAGATE);
 		retval = 0;
 	} else {
 		shrink_submounts(mnt);
 		retval = -EBUSY;
 		if (!propagate_mount_busy(mnt, 2)) {
-			if (!list_empty(&mnt->mnt_list))
+			if (mnt->mnt.mnt_flags & MNT_ONRB ||
+			    !list_empty(&mnt->mnt_list))
 				umount_tree(mnt, UMOUNT_PROPAGATE|UMOUNT_SYNC);
 			retval = 0;
 		}
@@ -2213,9 +2208,9 @@ int count_mounts(struct mnt_namespace *ns, struct mount *mnt)
 	unsigned int mounts = 0;
 	struct mount *p;
 
-	if (ns->mounts >= max)
+	if (ns->nr_mounts >= max)
 		return -ENOSPC;
-	max -= ns->mounts;
+	max -= ns->nr_mounts;
 	if (ns->pending_mounts >= max)
 		return -ENOSPC;
 	max -= ns->pending_mounts;
@@ -2359,8 +2354,12 @@ static int attach_recursive_mnt(struct mount *source_mnt,
 		touch_mnt_namespace(source_mnt->mnt_ns);
 	} else {
 		if (source_mnt->mnt_ns) {
+			LIST_HEAD(head);
+
 			/* move from anon - the caller will destroy */
-			list_del_init(&source_mnt->mnt_ns->list);
+			for (p = source_mnt; p; p = next_mnt(p, source_mnt))
+				move_from_ns(p, &head);
+			list_del_init(&head);
 		}
 		if (beneath)
 			mnt_set_mountpoint_beneath(source_mnt, top_mnt, smp);
@@ -2671,11 +2670,10 @@ static struct file *open_detached_copy(struct path *path, bool recursive)
 
 	lock_mount_hash();
 	for (p = mnt; p; p = next_mnt(p, mnt)) {
-		p->mnt_ns = ns;
-		ns->mounts++;
+		mnt_add_to_ns(ns, p);
+		ns->nr_mounts++;
 	}
 	ns->root = mnt;
-	list_add_tail(&ns->list, &mnt->mnt_list);
 	mntget(&mnt->mnt);
 	unlock_mount_hash();
 	namespace_unlock();
@@ -3738,9 +3736,8 @@ static struct mnt_namespace *alloc_mnt_ns(struct user_namespace *user_ns, bool a
 	if (!anon)
 		new_ns->seq = atomic64_add_return(1, &mnt_ns_seq);
 	refcount_set(&new_ns->ns.count, 1);
-	INIT_LIST_HEAD(&new_ns->list);
+	new_ns->mounts = RB_ROOT;
 	init_waitqueue_head(&new_ns->poll);
-	spin_lock_init(&new_ns->ns_lock);
 	new_ns->user_ns = get_user_ns(user_ns);
 	new_ns->ucounts = ucounts;
 	return new_ns;
@@ -3787,7 +3784,6 @@ struct mnt_namespace *copy_mnt_ns(unsigned long flags, struct mnt_namespace *ns,
 		unlock_mount_hash();
 	}
 	new_ns->root = new;
-	list_add_tail(&new_ns->list, &new->mnt_list);
 
 	/*
 	 * Second pass: switch the tsk->fs->* elements and mark new vfsmounts
@@ -3797,8 +3793,8 @@ struct mnt_namespace *copy_mnt_ns(unsigned long flags, struct mnt_namespace *ns,
 	p = old;
 	q = new;
 	while (p) {
-		q->mnt_ns = new_ns;
-		new_ns->mounts++;
+		mnt_add_to_ns(new_ns, q);
+		new_ns->nr_mounts++;
 		if (new_fs) {
 			if (&p->mnt == new_fs->root.mnt) {
 				new_fs->root.mnt = mntget(&q->mnt);
@@ -3840,10 +3836,9 @@ struct dentry *mount_subtree(struct vfsmount *m, const char *name)
 		mntput(m);
 		return ERR_CAST(ns);
 	}
-	mnt->mnt_ns = ns;
 	ns->root = mnt;
-	ns->mounts++;
-	list_add(&mnt->mnt_list, &ns->list);
+	ns->nr_mounts++;
+	mnt_add_to_ns(ns, mnt);
 
 	err = vfs_path_lookup(m->mnt_root, m,
 			name, LOOKUP_FOLLOW|LOOKUP_AUTOMOUNT, &path);
@@ -4021,10 +4016,9 @@ SYSCALL_DEFINE3(fsmount, int, fs_fd, unsigned int, flags,
 		goto err_path;
 	}
 	mnt = real_mount(newmount.mnt);
-	mnt->mnt_ns = ns;
 	ns->root = mnt;
-	ns->mounts = 1;
-	list_add(&mnt->mnt_list, &ns->list);
+	ns->nr_mounts = 1;
+	mnt_add_to_ns(ns, mnt);
 	mntget(newmount.mnt);
 
 	/* Attach to an apparent O_PATH fd with a note that we need to unmount
@@ -4695,10 +4689,9 @@ static void __init init_mount_tree(void)
 	if (IS_ERR(ns))
 		panic("Can't allocate initial namespace");
 	m = real_mount(mnt);
-	m->mnt_ns = ns;
 	ns->root = m;
-	ns->mounts = 1;
-	list_add(&m->mnt_list, &ns->list);
+	ns->nr_mounts = 1;
+	mnt_add_to_ns(ns, m);
 	init_task.nsproxy->mnt_ns = ns;
 	get_mnt_ns(ns);
 
@@ -4825,18 +4818,14 @@ static bool mnt_already_visible(struct mnt_namespace *ns,
 				int *new_mnt_flags)
 {
 	int new_flags = *new_mnt_flags;
-	struct mount *mnt;
+	struct mount *mnt, *n;
 	bool visible = false;
 
 	down_read(&namespace_sem);
-	lock_ns_list(ns);
-	list_for_each_entry(mnt, &ns->list, mnt_list) {
+	rbtree_postorder_for_each_entry_safe(mnt, n, &ns->mounts, mnt_node) {
 		struct mount *child;
 		int mnt_flags;
 
-		if (mnt_is_cursor(mnt))
-			continue;
-
 		if (mnt->mnt.mnt_sb->s_type != sb->s_type)
 			continue;
 
@@ -4884,7 +4873,6 @@ static bool mnt_already_visible(struct mnt_namespace *ns,
 	next:	;
 	}
 found:
-	unlock_ns_list(ns);
 	up_read(&namespace_sem);
 	return visible;
 }
diff --git a/fs/pnode.c b/fs/pnode.c
index e4d0340393d5..a799e0315cc9 100644
--- a/fs/pnode.c
+++ b/fs/pnode.c
@@ -468,7 +468,7 @@ static void umount_one(struct mount *mnt, struct list_head *to_umount)
 	mnt->mnt.mnt_flags |= MNT_UMOUNT;
 	list_del_init(&mnt->mnt_child);
 	list_del_init(&mnt->mnt_umounting);
-	list_move_tail(&mnt->mnt_list, to_umount);
+	move_from_ns(mnt, to_umount);
 }
 
 /*
diff --git a/fs/proc_namespace.c b/fs/proc_namespace.c
index 250eb5bf7b52..73d2274d5f59 100644
--- a/fs/proc_namespace.c
+++ b/fs/proc_namespace.c
@@ -283,8 +283,6 @@ static int mounts_open_common(struct inode *inode, struct file *file,
 	p->ns = ns;
 	p->root = root;
 	p->show = show;
-	INIT_LIST_HEAD(&p->cursor.mnt_list);
-	p->cursor.mnt.mnt_flags = MNT_CURSOR;
 
 	return 0;
 
@@ -301,7 +299,6 @@ static int mounts_release(struct inode *inode, struct file *file)
 	struct seq_file *m = file->private_data;
 	struct proc_mounts *p = m->private;
 	path_put(&p->root);
-	mnt_cursor_del(p->ns, &p->cursor);
 	put_mnt_ns(p->ns);
 	return seq_release_private(inode, file);
 }
diff --git a/include/linux/mount.h b/include/linux/mount.h
index ac3dd2876197..c34c18b4e8f3 100644
--- a/include/linux/mount.h
+++ b/include/linux/mount.h
@@ -50,8 +50,7 @@ struct path;
 #define MNT_ATIME_MASK (MNT_NOATIME | MNT_NODIRATIME | MNT_RELATIME )
 
 #define MNT_INTERNAL_FLAGS (MNT_SHARED | MNT_WRITE_HOLD | MNT_INTERNAL | \
-			    MNT_DOOMED | MNT_SYNC_UMOUNT | MNT_MARKED | \
-			    MNT_CURSOR)
+			    MNT_DOOMED | MNT_SYNC_UMOUNT | MNT_MARKED | MNT_ONRB)
 
 #define MNT_INTERNAL	0x4000
 
@@ -65,7 +64,7 @@ struct path;
 #define MNT_SYNC_UMOUNT		0x2000000
 #define MNT_MARKED		0x4000000
 #define MNT_UMOUNT		0x8000000
-#define MNT_CURSOR		0x10000000
+#define MNT_ONRB		0x10000000
 
 struct vfsmount {
 	struct dentry *mnt_root;	/* root of the mounted tree */
-- 
cgit v1.2.3


From 982c3b3058433f20aba9fb032599cee5dfc17328 Mon Sep 17 00:00:00 2001
From: Christian Brauner <brauner@kernel.org>
Date: Tue, 24 Oct 2023 15:01:08 +0200
Subject: bdev: rename freeze and thaw helpers

We have bdev_mark_dead() etc and we're going to move block device
freezing to holder ops in the next patch. Make the naming consistent:

* freeze_bdev() -> bdev_freeze()
* thaw_bdev()   -> bdev_thaw()

Also document the return code.

Link: https://lore.kernel.org/r/20231024-vfs-super-freeze-v2-2-599c19f4faac@kernel.org
Reviewed-by: Darrick J. Wong <djwong@kernel.org>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Jan Kara <jack@suse.cz>
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 block/bdev.c           | 22 +++++++++++++---------
 drivers/md/dm.c        |  4 ++--
 fs/bcachefs/fs-ioctl.c |  4 ++--
 fs/ext4/ioctl.c        |  4 ++--
 fs/f2fs/file.c         |  4 ++--
 fs/super.c             |  4 ++--
 fs/xfs/xfs_fsops.c     |  4 ++--
 include/linux/blkdev.h |  4 ++--
 8 files changed, 27 insertions(+), 23 deletions(-)

(limited to 'include/linux')

diff --git a/block/bdev.c b/block/bdev.c
index e4cfb7adb645..a890da1b3adf 100644
--- a/block/bdev.c
+++ b/block/bdev.c
@@ -207,18 +207,20 @@ int sync_blockdev_range(struct block_device *bdev, loff_t lstart, loff_t lend)
 EXPORT_SYMBOL(sync_blockdev_range);
 
 /**
- * freeze_bdev - lock a filesystem and force it into a consistent state
+ * bdev_freeze - lock a filesystem and force it into a consistent state
  * @bdev:	blockdevice to lock
  *
  * If a superblock is found on this device, we take the s_umount semaphore
  * on it to make sure nobody unmounts until the snapshot creation is done.
  * The reference counter (bd_fsfreeze_count) guarantees that only the last
  * unfreeze process can unfreeze the frozen filesystem actually when multiple
- * freeze requests arrive simultaneously. It counts up in freeze_bdev() and
- * count down in thaw_bdev(). When it becomes 0, thaw_bdev() will unfreeze
+ * freeze requests arrive simultaneously. It counts up in bdev_freeze() and
+ * count down in bdev_thaw(). When it becomes 0, thaw_bdev() will unfreeze
  * actually.
+ *
+ * Return: On success zero is returned, negative error code on failure.
  */
-int freeze_bdev(struct block_device *bdev)
+int bdev_freeze(struct block_device *bdev)
 {
 	struct super_block *sb;
 	int error = 0;
@@ -248,15 +250,17 @@ done:
 	mutex_unlock(&bdev->bd_fsfreeze_mutex);
 	return error;
 }
-EXPORT_SYMBOL(freeze_bdev);
+EXPORT_SYMBOL(bdev_freeze);
 
 /**
- * thaw_bdev - unlock filesystem
+ * bdev_thaw - unlock filesystem
  * @bdev:	blockdevice to unlock
  *
- * Unlocks the filesystem and marks it writeable again after freeze_bdev().
+ * Unlocks the filesystem and marks it writeable again after bdev_freeze().
+ *
+ * Return: On success zero is returned, negative error code on failure.
  */
-int thaw_bdev(struct block_device *bdev)
+int bdev_thaw(struct block_device *bdev)
 {
 	struct super_block *sb;
 	int error = -EINVAL;
@@ -285,7 +289,7 @@ out:
 	mutex_unlock(&bdev->bd_fsfreeze_mutex);
 	return error;
 }
-EXPORT_SYMBOL(thaw_bdev);
+EXPORT_SYMBOL(bdev_thaw);
 
 /*
  * pseudo-fs
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index 23c32cd1f1d8..8dcabf84d866 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -2675,7 +2675,7 @@ static int lock_fs(struct mapped_device *md)
 
 	WARN_ON(test_bit(DMF_FROZEN, &md->flags));
 
-	r = freeze_bdev(md->disk->part0);
+	r = bdev_freeze(md->disk->part0);
 	if (!r)
 		set_bit(DMF_FROZEN, &md->flags);
 	return r;
@@ -2685,7 +2685,7 @@ static void unlock_fs(struct mapped_device *md)
 {
 	if (!test_bit(DMF_FROZEN, &md->flags))
 		return;
-	thaw_bdev(md->disk->part0);
+	bdev_thaw(md->disk->part0);
 	clear_bit(DMF_FROZEN, &md->flags);
 }
 
diff --git a/fs/bcachefs/fs-ioctl.c b/fs/bcachefs/fs-ioctl.c
index 5a39bcb597a3..45e663342e38 100644
--- a/fs/bcachefs/fs-ioctl.c
+++ b/fs/bcachefs/fs-ioctl.c
@@ -291,14 +291,14 @@ static int bch2_ioc_goingdown(struct bch_fs *c, u32 __user *arg)
 
 	switch (flags) {
 	case FSOP_GOING_FLAGS_DEFAULT:
-		ret = freeze_bdev(c->vfs_sb->s_bdev);
+		ret = bdev_freeze(c->vfs_sb->s_bdev);
 		if (ret)
 			goto err;
 
 		bch2_journal_flush(&c->journal);
 		c->vfs_sb->s_flags |= SB_RDONLY;
 		bch2_fs_emergency_read_only(c);
-		thaw_bdev(c->vfs_sb->s_bdev);
+		bdev_thaw(c->vfs_sb->s_bdev);
 		break;
 
 	case FSOP_GOING_FLAGS_LOGFLUSH:
diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c
index 4f931f80cb34..aa6be510eb8f 100644
--- a/fs/ext4/ioctl.c
+++ b/fs/ext4/ioctl.c
@@ -819,11 +819,11 @@ int ext4_force_shutdown(struct super_block *sb, u32 flags)
 
 	switch (flags) {
 	case EXT4_GOING_FLAGS_DEFAULT:
-		ret = freeze_bdev(sb->s_bdev);
+		ret = bdev_freeze(sb->s_bdev);
 		if (ret)
 			return ret;
 		set_bit(EXT4_FLAGS_SHUTDOWN, &sbi->s_ext4_flags);
-		thaw_bdev(sb->s_bdev);
+		bdev_thaw(sb->s_bdev);
 		break;
 	case EXT4_GOING_FLAGS_LOGFLUSH:
 		set_bit(EXT4_FLAGS_SHUTDOWN, &sbi->s_ext4_flags);
diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c
index e50363583f01..4580dfefd5e9 100644
--- a/fs/f2fs/file.c
+++ b/fs/f2fs/file.c
@@ -2239,11 +2239,11 @@ static int f2fs_ioc_shutdown(struct file *filp, unsigned long arg)
 
 	switch (in) {
 	case F2FS_GOING_DOWN_FULLSYNC:
-		ret = freeze_bdev(sb->s_bdev);
+		ret = bdev_freeze(sb->s_bdev);
 		if (ret)
 			goto out;
 		f2fs_stop_checkpoint(sbi, false, STOP_CP_REASON_SHUTDOWN);
-		thaw_bdev(sb->s_bdev);
+		bdev_thaw(sb->s_bdev);
 		break;
 	case F2FS_GOING_DOWN_METASYNC:
 		/* do checkpoint only */
diff --git a/fs/super.c b/fs/super.c
index aa43090ea52d..8333aaa878cf 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -1224,7 +1224,7 @@ static void do_thaw_all_callback(struct super_block *sb)
 
 	if (locked && sb->s_root) {
 		if (IS_ENABLED(CONFIG_BLOCK))
-			while (sb->s_bdev && !thaw_bdev(sb->s_bdev))
+			while (sb->s_bdev && !bdev_thaw(sb->s_bdev))
 				pr_warn("Emergency Thaw on %pg\n", sb->s_bdev);
 		thaw_super_locked(sb, FREEZE_HOLDER_USERSPACE);
 		return;
@@ -1532,7 +1532,7 @@ int setup_bdev_super(struct super_block *sb, int sb_flags,
 	/*
 	 * Until SB_BORN flag is set, there can be no active superblock
 	 * references and thus no filesystem freezing. get_active_super() will
-	 * just loop waiting for SB_BORN so even freeze_bdev() cannot proceed.
+	 * just loop waiting for SB_BORN so even bdev_freeze() cannot proceed.
 	 *
 	 * It is enough to check bdev was not frozen before we set s_bdev.
 	 */
diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c
index 7cb75cb6b8e9..57076a25f17d 100644
--- a/fs/xfs/xfs_fsops.c
+++ b/fs/xfs/xfs_fsops.c
@@ -482,9 +482,9 @@ xfs_fs_goingdown(
 {
 	switch (inflags) {
 	case XFS_FSOP_GOING_FLAGS_DEFAULT: {
-		if (!freeze_bdev(mp->m_super->s_bdev)) {
+		if (!bdev_freeze(mp->m_super->s_bdev)) {
 			xfs_force_shutdown(mp, SHUTDOWN_FORCE_UMOUNT);
-			thaw_bdev(mp->m_super->s_bdev);
+			bdev_thaw(mp->m_super->s_bdev);
 		}
 		break;
 	}
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 51fa7ffdee83..7a3da7f44afb 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -1541,8 +1541,8 @@ static inline int early_lookup_bdev(const char *pathname, dev_t *dev)
 }
 #endif /* CONFIG_BLOCK */
 
-int freeze_bdev(struct block_device *bdev);
-int thaw_bdev(struct block_device *bdev);
+int bdev_freeze(struct block_device *bdev);
+int bdev_thaw(struct block_device *bdev);
 
 struct io_comp_batch {
 	struct request *req_list;
-- 
cgit v1.2.3


From a30561a9be69d446d8d542a4f9735fe5ca9573df Mon Sep 17 00:00:00 2001
From: Christian Brauner <brauner@kernel.org>
Date: Tue, 24 Oct 2023 15:01:10 +0200
Subject: bdev: add freeze and thaw holder operations

Add block device freeze and thaw holder operations. Follow-up patches
will implement block device freeze and thaw based on stuct
blk_holder_ops.

Link: https://lore.kernel.org/r/20231024-vfs-super-freeze-v2-4-599c19f4faac@kernel.org
Reviewed-by: Darrick J. Wong <djwong@kernel.org>
Reviewed-by: Jan Kara <jack@suse.cz>
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 include/linux/blkdev.h | 10 ++++++++++
 1 file changed, 10 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 7a3da7f44afb..1bc776335ff8 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -1468,6 +1468,16 @@ struct blk_holder_ops {
 	 * Sync the file system mounted on the block device.
 	 */
 	void (*sync)(struct block_device *bdev);
+
+	/*
+	 * Freeze the file system mounted on the block device.
+	 */
+	int (*freeze)(struct block_device *bdev);
+
+	/*
+	 * Thaw the file system mounted on the block device.
+	 */
+	int (*thaw)(struct block_device *bdev);
 };
 
 extern const struct blk_holder_ops fs_holder_ops;
-- 
cgit v1.2.3


From 49ef8832fb1a9e0da0020eb17480fd286433bc13 Mon Sep 17 00:00:00 2001
From: Christian Brauner <brauner@kernel.org>
Date: Wed, 27 Sep 2023 15:21:16 +0200
Subject: bdev: implement freeze and thaw holder operations

The old method of implementing block device freeze and thaw operations
required us to rely on get_active_super() to walk the list of all
superblocks on the system to find any superblock that might use the
block device. This is wasteful and not very pleasant overall.

Now that we can finally go straight from block device to owning
superblock things become way simpler.

Link: https://lore.kernel.org/r/20231024-vfs-super-freeze-v2-5-599c19f4faac@kernel.org
Reviewed-by: Darrick J. Wong <djwong@kernel.org>
Reviewed-by: Jan Kara <jack@suse.cz>
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 block/bdev.c              | 65 +++++++++++++++----------------
 fs/super.c                | 99 +++++++++++++++++++++++++++++++++++++----------
 include/linux/blk_types.h |  2 +-
 3 files changed, 112 insertions(+), 54 deletions(-)

(limited to 'include/linux')

diff --git a/block/bdev.c b/block/bdev.c
index f27f1a7db2bd..53a2c653a259 100644
--- a/block/bdev.c
+++ b/block/bdev.c
@@ -222,31 +222,27 @@ EXPORT_SYMBOL(sync_blockdev_range);
  */
 int bdev_freeze(struct block_device *bdev)
 {
-	struct super_block *sb;
 	int error = 0;
 
 	mutex_lock(&bdev->bd_fsfreeze_mutex);
-	if (++bdev->bd_fsfreeze_count > 1)
-		goto done;
-
-	sb = get_active_super(bdev);
-	if (!sb)
-		goto sync;
-	if (sb->s_op->freeze_super)
-		error = sb->s_op->freeze_super(sb, FREEZE_HOLDER_USERSPACE);
-	else
-		error = freeze_super(sb, FREEZE_HOLDER_USERSPACE);
-	deactivate_super(sb);
 
-	if (error) {
-		bdev->bd_fsfreeze_count--;
-		goto done;
+	if (atomic_inc_return(&bdev->bd_fsfreeze_count) > 1) {
+		mutex_unlock(&bdev->bd_fsfreeze_mutex);
+		return 0;
+	}
+
+	mutex_lock(&bdev->bd_holder_lock);
+	if (bdev->bd_holder_ops && bdev->bd_holder_ops->freeze) {
+		error = bdev->bd_holder_ops->freeze(bdev);
+		lockdep_assert_not_held(&bdev->bd_holder_lock);
+	} else {
+		mutex_unlock(&bdev->bd_holder_lock);
+		error = sync_blockdev(bdev);
 	}
-	bdev->bd_fsfreeze_sb = sb;
 
-sync:
-	error = sync_blockdev(bdev);
-done:
+	if (error)
+		atomic_dec(&bdev->bd_fsfreeze_count);
+
 	mutex_unlock(&bdev->bd_fsfreeze_mutex);
 	return error;
 }
@@ -262,29 +258,32 @@ EXPORT_SYMBOL(bdev_freeze);
  */
 int bdev_thaw(struct block_device *bdev)
 {
-	struct super_block *sb;
-	int error = -EINVAL;
+	int error = -EINVAL, nr_freeze;
 
 	mutex_lock(&bdev->bd_fsfreeze_mutex);
-	if (!bdev->bd_fsfreeze_count)
+
+	/*
+	 * If this returns < 0 it means that @bd_fsfreeze_count was
+	 * already 0 and no decrement was performed.
+	 */
+	nr_freeze = atomic_dec_if_positive(&bdev->bd_fsfreeze_count);
+	if (nr_freeze < 0)
 		goto out;
 
 	error = 0;
-	if (--bdev->bd_fsfreeze_count > 0)
+	if (nr_freeze > 0)
 		goto out;
 
-	sb = bdev->bd_fsfreeze_sb;
-	if (!sb)
-		goto out;
+	mutex_lock(&bdev->bd_holder_lock);
+	if (bdev->bd_holder_ops && bdev->bd_holder_ops->thaw) {
+		error = bdev->bd_holder_ops->thaw(bdev);
+		lockdep_assert_not_held(&bdev->bd_holder_lock);
+	} else {
+		mutex_unlock(&bdev->bd_holder_lock);
+	}
 
-	if (sb->s_op->thaw_super)
-		error = sb->s_op->thaw_super(sb, FREEZE_HOLDER_USERSPACE);
-	else
-		error = thaw_super(sb, FREEZE_HOLDER_USERSPACE);
 	if (error)
-		bdev->bd_fsfreeze_count++;
-	else
-		bdev->bd_fsfreeze_sb = NULL;
+		atomic_inc(&bdev->bd_fsfreeze_count);
 out:
 	mutex_unlock(&bdev->bd_fsfreeze_mutex);
 	return error;
diff --git a/fs/super.c b/fs/super.c
index 8333aaa878cf..8c943ab1f424 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -1440,7 +1440,7 @@ EXPORT_SYMBOL(sget_dev);
  *
  * The function must be called with bdev->bd_holder_lock and releases it.
  */
-static struct super_block *bdev_super_lock_shared(struct block_device *bdev)
+static struct super_block *bdev_super_lock(struct block_device *bdev, bool excl)
 	__releases(&bdev->bd_holder_lock)
 {
 	struct super_block *sb = bdev->bd_holder;
@@ -1454,18 +1454,25 @@ static struct super_block *bdev_super_lock_shared(struct block_device *bdev)
 	spin_lock(&sb_lock);
 	sb->s_count++;
 	spin_unlock(&sb_lock);
+
 	mutex_unlock(&bdev->bd_holder_lock);
 
-	locked = super_lock_shared(sb);
-	if (!locked || !sb->s_root || !(sb->s_flags & SB_ACTIVE)) {
-		put_super(sb);
-		return NULL;
-	}
+	locked = super_lock(sb, excl);
+
 	/*
-	 * The superblock is active and we hold s_umount, we can drop our
-	 * temporary reference now.
-	 */
+	 * If the superblock wasn't already SB_DYING then we hold
+	 * s_umount and can safely drop our temporary reference.
+         */
 	put_super(sb);
+
+	if (!locked)
+		return NULL;
+
+	if (!sb->s_root || !(sb->s_flags & SB_ACTIVE)) {
+		super_unlock(sb, excl);
+		return NULL;
+	}
+
 	return sb;
 }
 
@@ -1473,7 +1480,7 @@ static void fs_bdev_mark_dead(struct block_device *bdev, bool surprise)
 {
 	struct super_block *sb;
 
-	sb = bdev_super_lock_shared(bdev);
+	sb = bdev_super_lock(bdev, false);
 	if (!sb)
 		return;
 
@@ -1491,16 +1498,74 @@ static void fs_bdev_sync(struct block_device *bdev)
 {
 	struct super_block *sb;
 
-	sb = bdev_super_lock_shared(bdev);
+	sb = bdev_super_lock(bdev, false);
 	if (!sb)
 		return;
+
 	sync_filesystem(sb);
 	super_unlock_shared(sb);
 }
 
+static struct super_block *get_bdev_super(struct block_device *bdev)
+{
+	bool active = false;
+	struct super_block *sb;
+
+	sb = bdev_super_lock(bdev, true);
+	if (sb) {
+		active = atomic_inc_not_zero(&sb->s_active);
+		super_unlock_excl(sb);
+	}
+	if (!active)
+		return NULL;
+	return sb;
+}
+
+static int fs_bdev_freeze(struct block_device *bdev)
+{
+	struct super_block *sb;
+	int error = 0;
+
+	lockdep_assert_held(&bdev->bd_fsfreeze_mutex);
+
+	sb = get_bdev_super(bdev);
+	if (!sb)
+		return -EINVAL;
+
+	if (sb->s_op->freeze_super)
+		error = sb->s_op->freeze_super(sb, FREEZE_HOLDER_USERSPACE);
+	else
+		error = freeze_super(sb, FREEZE_HOLDER_USERSPACE);
+	if (!error)
+		error = sync_blockdev(bdev);
+	deactivate_super(sb);
+	return error;
+}
+
+static int fs_bdev_thaw(struct block_device *bdev)
+{
+	struct super_block *sb;
+	int error;
+
+	lockdep_assert_held(&bdev->bd_fsfreeze_mutex);
+
+	sb = get_bdev_super(bdev);
+	if (WARN_ON_ONCE(!sb))
+		return -EINVAL;
+
+	if (sb->s_op->thaw_super)
+		error = sb->s_op->thaw_super(sb, FREEZE_HOLDER_USERSPACE);
+	else
+		error = thaw_super(sb, FREEZE_HOLDER_USERSPACE);
+	deactivate_super(sb);
+	return error;
+}
+
 const struct blk_holder_ops fs_holder_ops = {
 	.mark_dead		= fs_bdev_mark_dead,
 	.sync			= fs_bdev_sync,
+	.freeze			= fs_bdev_freeze,
+	.thaw			= fs_bdev_thaw,
 };
 EXPORT_SYMBOL_GPL(fs_holder_ops);
 
@@ -1530,15 +1595,10 @@ int setup_bdev_super(struct super_block *sb, int sb_flags,
 	}
 
 	/*
-	 * Until SB_BORN flag is set, there can be no active superblock
-	 * references and thus no filesystem freezing. get_active_super() will
-	 * just loop waiting for SB_BORN so even bdev_freeze() cannot proceed.
-	 *
-	 * It is enough to check bdev was not frozen before we set s_bdev.
+	 * It is enough to check bdev was not frozen before we set
+	 * s_bdev as freezing will wait until SB_BORN is set.
 	 */
-	mutex_lock(&bdev->bd_fsfreeze_mutex);
-	if (bdev->bd_fsfreeze_count > 0) {
-		mutex_unlock(&bdev->bd_fsfreeze_mutex);
+	if (atomic_read(&bdev->bd_fsfreeze_count) > 0) {
 		if (fc)
 			warnf(fc, "%pg: Can't mount, blockdev is frozen", bdev);
 		bdev_release(bdev_handle);
@@ -1551,7 +1611,6 @@ int setup_bdev_super(struct super_block *sb, int sb_flags,
 	if (bdev_stable_writes(bdev))
 		sb->s_iflags |= SB_I_STABLE_WRITES;
 	spin_unlock(&sb_lock);
-	mutex_unlock(&bdev->bd_fsfreeze_mutex);
 
 	snprintf(sb->s_id, sizeof(sb->s_id), "%pg", bdev);
 	shrinker_debugfs_rename(sb->s_shrink, "sb-%s:%s", sb->s_type->name,
diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
index d5c5e59ddbd2..88e1848b0869 100644
--- a/include/linux/blk_types.h
+++ b/include/linux/blk_types.h
@@ -57,7 +57,7 @@ struct block_device {
 	const struct blk_holder_ops *bd_holder_ops;
 	struct mutex		bd_holder_lock;
 	/* The counter of freeze processes */
-	int			bd_fsfreeze_count;
+	atomic_t		bd_fsfreeze_count;
 	int			bd_holders;
 	struct kobject		*bd_holder_dir;
 
-- 
cgit v1.2.3


From 434f8d8299f2a0c97578f77ab23a70cd0ae56544 Mon Sep 17 00:00:00 2001
From: Christian Brauner <brauner@kernel.org>
Date: Tue, 24 Oct 2023 15:01:12 +0200
Subject: fs: remove get_active_super()

This function is now unused so remove it. One less function that uses
the global superblock list.

Link: https://lore.kernel.org/r/20231024-vfs-super-freeze-v2-6-599c19f4faac@kernel.org
Reviewed-by: Darrick J. Wong <djwong@kernel.org>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Jan Kara <jack@suse.cz>
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 fs/super.c         | 28 ----------------------------
 include/linux/fs.h |  1 -
 2 files changed, 29 deletions(-)

(limited to 'include/linux')

diff --git a/fs/super.c b/fs/super.c
index 8c943ab1f424..b49a49d70450 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -1027,34 +1027,6 @@ void iterate_supers_type(struct file_system_type *type,
 
 EXPORT_SYMBOL(iterate_supers_type);
 
-/**
- * get_active_super - get an active reference to the superblock of a device
- * @bdev: device to get the superblock for
- *
- * Scans the superblock list and finds the superblock of the file system
- * mounted on the device given.  Returns the superblock with an active
- * reference or %NULL if none was found.
- */
-struct super_block *get_active_super(struct block_device *bdev)
-{
-	struct super_block *sb;
-
-	if (!bdev)
-		return NULL;
-
-	spin_lock(&sb_lock);
-	list_for_each_entry(sb, &super_blocks, s_list) {
-		if (sb->s_bdev == bdev) {
-			if (!grab_super(sb))
-				return NULL;
-			super_unlock_excl(sb);
-			return sb;
-		}
-	}
-	spin_unlock(&sb_lock);
-	return NULL;
-}
-
 struct super_block *user_get_super(dev_t dev, bool excl)
 {
 	struct super_block *sb;
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 98b7a7a8c42e..7dc6c1bf5f55 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -3121,7 +3121,6 @@ extern int vfs_readlink(struct dentry *, char __user *, int);
 extern struct file_system_type *get_filesystem(struct file_system_type *fs);
 extern void put_filesystem(struct file_system_type *fs);
 extern struct file_system_type *get_fs_type(const char *name);
-extern struct super_block *get_active_super(struct block_device *bdev);
 extern void drop_super(struct super_block *sb);
 extern void drop_super_exclusive(struct super_block *sb);
 extern void iterate_supers(void (*)(struct super_block *, void *), void *);
-- 
cgit v1.2.3


From 90f95dc415de23267b888f8238c4a19fa0f66b89 Mon Sep 17 00:00:00 2001
From: Christian Brauner <brauner@kernel.org>
Date: Tue, 24 Oct 2023 15:01:13 +0200
Subject: super: remove bd_fsfreeze_sb

Remove bd_fsfreeze_sb as it's now unused and can be removed. Also move
bd_fsfreeze_count down to not have it weirdly placed in the middle of
the holder fields.

Link: https://lore.kernel.org/r/20231024-vfs-super-freeze-v2-7-599c19f4faac@kernel.org
Reviewed-by: Darrick J. Wong <djwong@kernel.org>
Reviewed-by: Jan Kara <jack@suse.cz>
Suggested-by: Jan Kara <jack@suse.cz>
Suggested-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 include/linux/blk_types.h | 7 ++-----
 1 file changed, 2 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
index 88e1848b0869..749203277fee 100644
--- a/include/linux/blk_types.h
+++ b/include/linux/blk_types.h
@@ -56,14 +56,11 @@ struct block_device {
 	void *			bd_holder;
 	const struct blk_holder_ops *bd_holder_ops;
 	struct mutex		bd_holder_lock;
-	/* The counter of freeze processes */
-	atomic_t		bd_fsfreeze_count;
 	int			bd_holders;
 	struct kobject		*bd_holder_dir;
 
-	/* Mutex for freeze */
-	struct mutex		bd_fsfreeze_mutex;
-	struct super_block	*bd_fsfreeze_sb;
+	atomic_t		bd_fsfreeze_count; /* number of freeze requests */
+	struct mutex		bd_fsfreeze_mutex; /* serialize freeze/thaw */
 
 	struct partition_meta_info *bd_meta_info;
 #ifdef CONFIG_FAIL_MAKE_REQUEST
-- 
cgit v1.2.3


From e419cf3ebaee694a826ddcfb350f1b1ebaf1e599 Mon Sep 17 00:00:00 2001
From: Christian Brauner <brauner@kernel.org>
Date: Tue, 24 Oct 2023 15:01:16 +0200
Subject: blkdev: comment fs_holder_ops

Add a comment to @fs_holder_ops that @holder must point to a superblock.

Link: https://lore.kernel.org/r/20231024-vfs-super-freeze-v2-10-599c19f4faac@kernel.org
Reviewed-by: Darrick J. Wong <djwong@kernel.org>
Reviewed-by: Jan Kara <jack@suse.cz>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 include/linux/blkdev.h | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 1bc776335ff8..abf71cce785c 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -1480,6 +1480,11 @@ struct blk_holder_ops {
 	int (*thaw)(struct block_device *bdev);
 };
 
+/*
+ * For filesystems using @fs_holder_ops, the @holder argument passed to
+ * helpers used to open and claim block devices via
+ * bd_prepare_to_claim() must point to a superblock.
+ */
 extern const struct blk_holder_ops fs_holder_ops;
 
 /*
-- 
cgit v1.2.3


From 7366f8b6fc6aa21c4199cb5d337b023df69745b0 Mon Sep 17 00:00:00 2001
From: Christian Brauner <brauner@kernel.org>
Date: Sat, 4 Nov 2023 15:00:13 +0100
Subject: fs: handle freezing from multiple devices

Before [1] freezing a filesystems through the block layer only worked
for the main block device as the owning superblock of additional block
devices could not be found. Any filesystem that made use of multiple
block devices would only be freezable via it's main block device.

For example, consider xfs over device mapper with /dev/dm-0 as main
block device and /dev/dm-1 as external log device. Two freeze requests
before [1]:

(1) dmsetup suspend /dev/dm-0 on the main block device

    bdev_freeze(dm-0)
    -> dm-0->bd_fsfreeze_count++
    -> freeze_super(xfs-sb)

    The owning superblock is found and the filesystem gets frozen.
    Returns 0.

(2) dmsetup suspend /dev/dm-1 on the log device

    bdev_freeze(dm-1)
    -> dm-1->bd_fsfreeze_count++

    The owning superblock isn't found and only the block device freeze
    count is incremented. Returns 0.

Two freeze requests after [1]:

(1') dmsetup suspend /dev/dm-0 on the main block device

    bdev_freeze(dm-0)
    -> dm-0->bd_fsfreeze_count++
    -> freeze_super(xfs-sb)

    The owning superblock is found and the filesystem gets frozen.
    Returns 0.

(2') dmsetup suspend /dev/dm-1 on the log device

    bdev_freeze(dm-0)
    -> dm-0->bd_fsfreeze_count++
    -> freeze_super(xfs-sb)

    The owning superblock is found and the filesystem gets frozen.
    Returns -EBUSY.

When (2') is called we initiate a freeze from another block device of
the same superblock. So we increment the bd_fsfreeze_count for that
additional block device. But we now also find the owning superblock for
additional block devices and call freeze_super() again which reports
-EBUSY.

This can be reproduced through xfstests via:

    mkfs.xfs -f -m crc=1,reflink=1,rmapbt=1, -i sparse=1 -lsize=1g,logdev=/dev/nvme1n1p4 /dev/nvme1n1p3
    mkfs.xfs -f -m crc=1,reflink=1,rmapbt=1, -i sparse=1 -lsize=1g,logdev=/dev/nvme1n1p6 /dev/nvme1n1p5

    FSTYP=xfs
    export TEST_DEV=/dev/nvme1n1p3
    export TEST_DIR=/mnt/test
    export TEST_LOGDEV=/dev/nvme1n1p4
    export SCRATCH_DEV=/dev/nvme1n1p5
    export SCRATCH_MNT=/mnt/scratch
    export SCRATCH_LOGDEV=/dev/nvme1n1p6
    export USE_EXTERNAL=yes

    sudo ./check generic/311

Current semantics allow two concurrent freezers: one initiated from
userspace via FREEZE_HOLDER_USERSPACE and one initiated from the kernel
via FREEZE_HOLDER_KERNEL. If there are multiple concurrent freeze
requests from either FREEZE_HOLDER_USERSPACE or FREEZE_HOLDER_KERNEL
-EBUSY is returned.

We need to preserve these semantics because as they are uapi via
FIFREEZE and FITHAW ioctl()s. IOW, freezes don't nest for FIFREEZE and
FITHAW. Other kernels consumers rely on non-nesting freezes as well.

With freezes initiated from the block layer freezes need to nest if the
same superblock is frozen via multiple devices. So we need to start
counting the number of freeze requests.

If FREEZE_MAY_NEST is passed alongside FREEZE_HOLDER_KERNEL or
FREEZE_HOLDER_USERSPACE we allow the caller to nest freeze calls.

To accommodate the old semantics we split the freeze counter into two
counting kernel initiated and userspace initiated freezes separately. We
can then also stop recording FREEZE_HOLDER_* in struct sb_writers.

We also simplify freezing by making all concurrent freezers share a
single active superblock reference count instead of having separate
references for kernel and userspace. I don't see why we would need two
active reference counts. Neither FREEZE_HOLDER_KERNEL nor
FREEZE_HOLDER_USERSPACE can put the active reference as long as they are
concurrent freezers anwyay. That was already true before we allowed
nesting freezes.

Survives various fstests runs with different options including the
reproducer, online scrub, and online repair, fsfreze, and so on. Also
survives blktests.

Link: https://lore.kernel.org/linux-block/87bkccnwxc.fsf@debian-BULLSEYE-live-builder-AMD64
Link: https://lore.kernel.org/r/20231104-vfs-multi-device-freeze-v2-2-5b5b69626eac@kernel.org
Fixes: 288d8706abfc ("bdev: implement freeze and thaw holder operations") [1] # no backport needed
Tested-by: Chandan Babu R <chandanbabu@kernel.org>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Jan Kara <jack@suse.cz>
Reported-by: Chandan Babu R <chandanbabu@kernel.org>
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 fs/super.c         | 144 +++++++++++++++++++++++++++++++++++++++++------------
 include/linux/fs.h |  18 ++++++-
 2 files changed, 129 insertions(+), 33 deletions(-)

(limited to 'include/linux')

diff --git a/fs/super.c b/fs/super.c
index d2b026566dea..aa4e5c11ee32 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -1467,6 +1467,21 @@ static struct super_block *get_bdev_super(struct block_device *bdev)
 	return sb;
 }
 
+/**
+ * fs_bdev_freeze - freeze owning filesystem of block device
+ * @bdev: block device
+ *
+ * Freeze the filesystem that owns this block device if it is still
+ * active.
+ *
+ * A filesystem that owns multiple block devices may be frozen from each
+ * block device and won't be unfrozen until all block devices are
+ * unfrozen. Each block device can only freeze the filesystem once as we
+ * nest freezes for block devices in the block layer.
+ *
+ * Return: If the freeze was successful zero is returned. If the freeze
+ *         failed a negative error code is returned.
+ */
 static int fs_bdev_freeze(struct block_device *bdev)
 {
 	struct super_block *sb;
@@ -1479,15 +1494,34 @@ static int fs_bdev_freeze(struct block_device *bdev)
 		return -EINVAL;
 
 	if (sb->s_op->freeze_super)
-		error = sb->s_op->freeze_super(sb, FREEZE_HOLDER_USERSPACE);
+		error = sb->s_op->freeze_super(sb,
+				FREEZE_MAY_NEST | FREEZE_HOLDER_USERSPACE);
 	else
-		error = freeze_super(sb, FREEZE_HOLDER_USERSPACE);
+		error = freeze_super(sb,
+				FREEZE_MAY_NEST | FREEZE_HOLDER_USERSPACE);
 	if (!error)
 		error = sync_blockdev(bdev);
 	deactivate_super(sb);
 	return error;
 }
 
+/**
+ * fs_bdev_thaw - thaw owning filesystem of block device
+ * @bdev: block device
+ *
+ * Thaw the filesystem that owns this block device.
+ *
+ * A filesystem that owns multiple block devices may be frozen from each
+ * block device and won't be unfrozen until all block devices are
+ * unfrozen. Each block device can only freeze the filesystem once as we
+ * nest freezes for block devices in the block layer.
+ *
+ * Return: If the thaw was successful zero is returned. If the thaw
+ *         failed a negative error code is returned. If this function
+ *         returns zero it doesn't mean that the filesystem is unfrozen
+ *         as it may have been frozen multiple times (kernel may hold a
+ *         freeze or might be frozen from other block devices).
+ */
 static int fs_bdev_thaw(struct block_device *bdev)
 {
 	struct super_block *sb;
@@ -1500,9 +1534,11 @@ static int fs_bdev_thaw(struct block_device *bdev)
 		return -EINVAL;
 
 	if (sb->s_op->thaw_super)
-		error = sb->s_op->thaw_super(sb, FREEZE_HOLDER_USERSPACE);
+		error = sb->s_op->thaw_super(sb,
+				FREEZE_MAY_NEST | FREEZE_HOLDER_USERSPACE);
 	else
-		error = thaw_super(sb, FREEZE_HOLDER_USERSPACE);
+		error = thaw_super(sb,
+				FREEZE_MAY_NEST | FREEZE_HOLDER_USERSPACE);
 	deactivate_super(sb);
 	return error;
 }
@@ -1914,6 +1950,47 @@ static int wait_for_partially_frozen(struct super_block *sb)
 	return ret;
 }
 
+#define FREEZE_HOLDERS (FREEZE_HOLDER_KERNEL | FREEZE_HOLDER_USERSPACE)
+#define FREEZE_FLAGS (FREEZE_HOLDERS | FREEZE_MAY_NEST)
+
+static inline int freeze_inc(struct super_block *sb, enum freeze_holder who)
+{
+	WARN_ON_ONCE((who & ~FREEZE_FLAGS));
+	WARN_ON_ONCE(hweight32(who & FREEZE_HOLDERS) > 1);
+
+	if (who & FREEZE_HOLDER_KERNEL)
+		++sb->s_writers.freeze_kcount;
+	if (who & FREEZE_HOLDER_USERSPACE)
+		++sb->s_writers.freeze_ucount;
+	return sb->s_writers.freeze_kcount + sb->s_writers.freeze_ucount;
+}
+
+static inline int freeze_dec(struct super_block *sb, enum freeze_holder who)
+{
+	WARN_ON_ONCE((who & ~FREEZE_FLAGS));
+	WARN_ON_ONCE(hweight32(who & FREEZE_HOLDERS) > 1);
+
+	if ((who & FREEZE_HOLDER_KERNEL) && sb->s_writers.freeze_kcount)
+		--sb->s_writers.freeze_kcount;
+	if ((who & FREEZE_HOLDER_USERSPACE) && sb->s_writers.freeze_ucount)
+		--sb->s_writers.freeze_ucount;
+	return sb->s_writers.freeze_kcount + sb->s_writers.freeze_ucount;
+}
+
+static inline bool may_freeze(struct super_block *sb, enum freeze_holder who)
+{
+	WARN_ON_ONCE((who & ~FREEZE_FLAGS));
+	WARN_ON_ONCE(hweight32(who & FREEZE_HOLDERS) > 1);
+
+	if (who & FREEZE_HOLDER_KERNEL)
+		return (who & FREEZE_MAY_NEST) ||
+		       sb->s_writers.freeze_kcount == 0;
+	if (who & FREEZE_HOLDER_USERSPACE)
+		return (who & FREEZE_MAY_NEST) ||
+		       sb->s_writers.freeze_ucount == 0;
+	return false;
+}
+
 /**
  * freeze_super - lock the filesystem and force it into a consistent state
  * @sb: the super to lock
@@ -1926,6 +2003,7 @@ static int wait_for_partially_frozen(struct super_block *sb)
  * @who should be:
  * * %FREEZE_HOLDER_USERSPACE if userspace wants to freeze the fs;
  * * %FREEZE_HOLDER_KERNEL if the kernel wants to freeze the fs.
+ * * %FREEZE_MAY_NEST whether nesting freeze and thaw requests is allowed.
  *
  * The @who argument distinguishes between the kernel and userspace trying to
  * freeze the filesystem.  Although there cannot be multiple kernel freezes or
@@ -1933,6 +2011,13 @@ static int wait_for_partially_frozen(struct super_block *sb)
  * userspace can both hold a filesystem frozen.  The filesystem remains frozen
  * until there are no kernel or userspace freezes in effect.
  *
+ * A filesystem may hold multiple devices and thus a filesystems may be
+ * frozen through the block layer via multiple block devices. In this
+ * case the request is marked as being allowed to nest by passing
+ * FREEZE_MAY_NEST. The filesystem remains frozen until all block
+ * devices are unfrozen. If multiple freezes are attempted without
+ * FREEZE_MAY_NEST -EBUSY will be returned.
+ *
  * During this function, sb->s_writers.frozen goes through these values:
  *
  * SB_UNFROZEN: File system is normal, all writes progress as usual.
@@ -1957,6 +2042,9 @@ static int wait_for_partially_frozen(struct super_block *sb)
  * mostly auxiliary for filesystems to verify they do not modify frozen fs.
  *
  * sb->s_writers.frozen is protected by sb->s_umount.
+ *
+ * Return: If the freeze was successful zero is returned. If the freeze
+ *         failed a negative error code is returned.
  */
 int freeze_super(struct super_block *sb, enum freeze_holder who)
 {
@@ -1970,20 +2058,13 @@ int freeze_super(struct super_block *sb, enum freeze_holder who)
 
 retry:
 	if (sb->s_writers.frozen == SB_FREEZE_COMPLETE) {
-		if (sb->s_writers.freeze_holders & who) {
-			deactivate_locked_super(sb);
-			return -EBUSY;
-		}
-
-		WARN_ON(sb->s_writers.freeze_holders == 0);
-
-		/*
-		 * Someone else already holds this type of freeze; share the
-		 * freeze and assign the active ref to the freeze.
-		 */
-		sb->s_writers.freeze_holders |= who;
-		super_unlock_excl(sb);
-		return 0;
+		if (may_freeze(sb, who))
+			ret = !!WARN_ON_ONCE(freeze_inc(sb, who) == 1);
+		else
+			ret = -EBUSY;
+		/* All freezers share a single active reference. */
+		deactivate_locked_super(sb);
+		return ret;
 	}
 
 	if (sb->s_writers.frozen != SB_UNFROZEN) {
@@ -1998,7 +2079,7 @@ retry:
 
 	if (sb_rdonly(sb)) {
 		/* Nothing to do really... */
-		sb->s_writers.freeze_holders |= who;
+		WARN_ON_ONCE(freeze_inc(sb, who) > 1);
 		sb->s_writers.frozen = SB_FREEZE_COMPLETE;
 		wake_up_var(&sb->s_writers.frozen);
 		super_unlock_excl(sb);
@@ -2048,7 +2129,7 @@ retry:
 	 * For debugging purposes so that fs can warn if it sees write activity
 	 * when frozen is set to SB_FREEZE_COMPLETE, and for thaw_super().
 	 */
-	sb->s_writers.freeze_holders |= who;
+	WARN_ON_ONCE(freeze_inc(sb, who) > 1);
 	sb->s_writers.frozen = SB_FREEZE_COMPLETE;
 	wake_up_var(&sb->s_writers.frozen);
 	lockdep_sb_freeze_release(sb);
@@ -2069,21 +2150,15 @@ static int thaw_super_locked(struct super_block *sb, enum freeze_holder who)
 
 	if (sb->s_writers.frozen != SB_FREEZE_COMPLETE)
 		goto out_unlock;
-	if (!(sb->s_writers.freeze_holders & who))
-		goto out_unlock;
 
 	/*
-	 * Freeze is shared with someone else.  Release our hold and drop the
-	 * active ref that freeze_super assigned to the freezer.
+	 * All freezers share a single active reference.
+	 * So just unlock in case there are any left.
 	 */
-	error = 0;
-	if (sb->s_writers.freeze_holders & ~who) {
-		sb->s_writers.freeze_holders &= ~who;
-		goto out_deactivate;
-	}
+	if (freeze_dec(sb, who))
+		goto out_unlock;
 
 	if (sb_rdonly(sb)) {
-		sb->s_writers.freeze_holders &= ~who;
 		sb->s_writers.frozen = SB_UNFROZEN;
 		wake_up_var(&sb->s_writers.frozen);
 		goto out_deactivate;
@@ -2094,13 +2169,13 @@ static int thaw_super_locked(struct super_block *sb, enum freeze_holder who)
 	if (sb->s_op->unfreeze_fs) {
 		error = sb->s_op->unfreeze_fs(sb);
 		if (error) {
-			printk(KERN_ERR "VFS:Filesystem thaw failed\n");
+			pr_err("VFS: Filesystem thaw failed\n");
+			freeze_inc(sb, who);
 			lockdep_sb_freeze_release(sb);
 			goto out_unlock;
 		}
 	}
 
-	sb->s_writers.freeze_holders &= ~who;
 	sb->s_writers.frozen = SB_UNFROZEN;
 	wake_up_var(&sb->s_writers.frozen);
 	sb_freeze_unlock(sb, SB_FREEZE_FS);
@@ -2124,6 +2199,11 @@ out_unlock:
  * @who should be:
  * * %FREEZE_HOLDER_USERSPACE if userspace wants to thaw the fs;
  * * %FREEZE_HOLDER_KERNEL if the kernel wants to thaw the fs.
+ * * %FREEZE_MAY_NEST whether nesting freeze and thaw requests is allowed
+ *
+ * A filesystem may hold multiple devices and thus a filesystems may
+ * have been frozen through the block layer via multiple block devices.
+ * The filesystem remains frozen until all block devices are unfrozen.
  */
 int thaw_super(struct super_block *sb, enum freeze_holder who)
 {
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 7dc6c1bf5f55..b2a3f1c61c19 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1185,7 +1185,8 @@ enum {
 
 struct sb_writers {
 	unsigned short			frozen;		/* Is sb frozen? */
-	unsigned short			freeze_holders;	/* Who froze fs? */
+	int				freeze_kcount;	/* How many kernel freeze requests? */
+	int				freeze_ucount;	/* How many userspace freeze requests? */
 	struct percpu_rw_semaphore	rw_sem[SB_FREEZE_LEVELS];
 };
 
@@ -2051,9 +2052,24 @@ extern loff_t vfs_dedupe_file_range_one(struct file *src_file, loff_t src_pos,
 					struct file *dst_file, loff_t dst_pos,
 					loff_t len, unsigned int remap_flags);
 
+/**
+ * enum freeze_holder - holder of the freeze
+ * @FREEZE_HOLDER_KERNEL: kernel wants to freeze or thaw filesystem
+ * @FREEZE_HOLDER_USERSPACE: userspace wants to freeze or thaw filesystem
+ * @FREEZE_MAY_NEST: whether nesting freeze and thaw requests is allowed
+ *
+ * Indicate who the owner of the freeze or thaw request is and whether
+ * the freeze needs to be exclusive or can nest.
+ * Without @FREEZE_MAY_NEST, multiple freeze and thaw requests from the
+ * same holder aren't allowed. It is however allowed to hold a single
+ * @FREEZE_HOLDER_USERSPACE and a single @FREEZE_HOLDER_KERNEL freeze at
+ * the same time. This is relied upon by some filesystems during online
+ * repair or similar.
+ */
 enum freeze_holder {
 	FREEZE_HOLDER_KERNEL	= (1U << 0),
 	FREEZE_HOLDER_USERSPACE	= (1U << 1),
+	FREEZE_MAY_NEST		= (1U << 2),
 };
 
 struct super_operations {
-- 
cgit v1.2.3


From cd34758c5238ae6976b10fe15bba7031b409c969 Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Wed, 1 Nov 2023 18:43:07 +0100
Subject: block: Remove blkdev_get_by_*() functions

blkdev_get_by_*() and blkdev_put() functions are now unused. Remove
them.

Acked-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jan Kara <jack@suse.cz>
Link: https://lore.kernel.org/r/20231101174325.10596-2-jack@suse.cz
Reviewed-by: Christian Brauner <brauner@kernel.org>
Reviewed-by: Jens Axboe <axboe@kernel.dk>
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 block/bdev.c           | 94 ++++++++++++++++----------------------------------
 include/linux/blkdev.h |  5 ---
 2 files changed, 30 insertions(+), 69 deletions(-)

(limited to 'include/linux')

diff --git a/block/bdev.c b/block/bdev.c
index 53a2c653a259..0daf14e50dd0 100644
--- a/block/bdev.c
+++ b/block/bdev.c
@@ -732,7 +732,7 @@ void blkdev_put_no_open(struct block_device *bdev)
 }
 	
 /**
- * blkdev_get_by_dev - open a block device by device number
+ * bdev_open_by_dev - open a block device by device number
  * @dev: device number of block device to open
  * @mode: open mode (BLK_OPEN_*)
  * @holder: exclusive holder identifier
@@ -744,32 +744,40 @@ void blkdev_put_no_open(struct block_device *bdev)
  *
  * Use this interface ONLY if you really do not have anything better - i.e. when
  * you are behind a truly sucky interface and all you are given is a device
- * number.  Everything else should use blkdev_get_by_path().
+ * number.  Everything else should use bdev_open_by_path().
  *
  * CONTEXT:
  * Might sleep.
  *
  * RETURNS:
- * Reference to the block_device on success, ERR_PTR(-errno) on failure.
+ * Handle with a reference to the block_device on success, ERR_PTR(-errno) on
+ * failure.
  */
-struct block_device *blkdev_get_by_dev(dev_t dev, blk_mode_t mode, void *holder,
-		const struct blk_holder_ops *hops)
+struct bdev_handle *bdev_open_by_dev(dev_t dev, blk_mode_t mode, void *holder,
+				     const struct blk_holder_ops *hops)
 {
-	bool unblock_events = true;
+	struct bdev_handle *handle = kmalloc(sizeof(struct bdev_handle),
+					     GFP_KERNEL);
 	struct block_device *bdev;
+	bool unblock_events = true;
 	struct gendisk *disk;
 	int ret;
 
+	if (!handle)
+		return ERR_PTR(-ENOMEM);
+
 	ret = devcgroup_check_permission(DEVCG_DEV_BLOCK,
 			MAJOR(dev), MINOR(dev),
 			((mode & BLK_OPEN_READ) ? DEVCG_ACC_READ : 0) |
 			((mode & BLK_OPEN_WRITE) ? DEVCG_ACC_WRITE : 0));
 	if (ret)
-		return ERR_PTR(ret);
+		goto free_handle;
 
 	bdev = blkdev_get_no_open(dev);
-	if (!bdev)
-		return ERR_PTR(-ENXIO);
+	if (!bdev) {
+		ret = -ENXIO;
+		goto free_handle;
+	}
 	disk = bdev->bd_disk;
 
 	if (holder) {
@@ -818,7 +826,10 @@ struct block_device *blkdev_get_by_dev(dev_t dev, blk_mode_t mode, void *holder,
 
 	if (unblock_events)
 		disk_unblock_events(disk);
-	return bdev;
+	handle->bdev = bdev;
+	handle->holder = holder;
+	handle->mode = mode;
+	return handle;
 put_module:
 	module_put(disk->fops->owner);
 abort_claiming:
@@ -828,34 +839,14 @@ abort_claiming:
 	disk_unblock_events(disk);
 put_blkdev:
 	blkdev_put_no_open(bdev);
+free_handle:
+	kfree(handle);
 	return ERR_PTR(ret);
 }
-EXPORT_SYMBOL(blkdev_get_by_dev);
-
-struct bdev_handle *bdev_open_by_dev(dev_t dev, blk_mode_t mode, void *holder,
-				     const struct blk_holder_ops *hops)
-{
-	struct bdev_handle *handle = kmalloc(sizeof(*handle), GFP_KERNEL);
-	struct block_device *bdev;
-
-	if (!handle)
-		return ERR_PTR(-ENOMEM);
-	bdev = blkdev_get_by_dev(dev, mode, holder, hops);
-	if (IS_ERR(bdev)) {
-		kfree(handle);
-		return ERR_CAST(bdev);
-	}
-	handle->bdev = bdev;
-	handle->holder = holder;
-	if (holder)
-		mode |= BLK_OPEN_EXCL;
-	handle->mode = mode;
-	return handle;
-}
 EXPORT_SYMBOL(bdev_open_by_dev);
 
 /**
- * blkdev_get_by_path - open a block device by name
+ * bdev_open_by_path - open a block device by name
  * @path: path to the block device to open
  * @mode: open mode (BLK_OPEN_*)
  * @holder: exclusive holder identifier
@@ -869,29 +860,9 @@ EXPORT_SYMBOL(bdev_open_by_dev);
  * Might sleep.
  *
  * RETURNS:
- * Reference to the block_device on success, ERR_PTR(-errno) on failure.
+ * Handle with a reference to the block_device on success, ERR_PTR(-errno) on
+ * failure.
  */
-struct block_device *blkdev_get_by_path(const char *path, blk_mode_t mode,
-		void *holder, const struct blk_holder_ops *hops)
-{
-	struct block_device *bdev;
-	dev_t dev;
-	int error;
-
-	error = lookup_bdev(path, &dev);
-	if (error)
-		return ERR_PTR(error);
-
-	bdev = blkdev_get_by_dev(dev, mode, holder, hops);
-	if (!IS_ERR(bdev) && (mode & BLK_OPEN_WRITE) && bdev_read_only(bdev)) {
-		blkdev_put(bdev, holder);
-		return ERR_PTR(-EACCES);
-	}
-
-	return bdev;
-}
-EXPORT_SYMBOL(blkdev_get_by_path);
-
 struct bdev_handle *bdev_open_by_path(const char *path, blk_mode_t mode,
 		void *holder, const struct blk_holder_ops *hops)
 {
@@ -914,8 +885,9 @@ struct bdev_handle *bdev_open_by_path(const char *path, blk_mode_t mode,
 }
 EXPORT_SYMBOL(bdev_open_by_path);
 
-void blkdev_put(struct block_device *bdev, void *holder)
+void bdev_release(struct bdev_handle *handle)
 {
+	struct block_device *bdev = handle->bdev;
 	struct gendisk *disk = bdev->bd_disk;
 
 	/*
@@ -929,8 +901,8 @@ void blkdev_put(struct block_device *bdev, void *holder)
 		sync_blockdev(bdev);
 
 	mutex_lock(&disk->open_mutex);
-	if (holder)
-		bd_end_claim(bdev, holder);
+	if (handle->holder)
+		bd_end_claim(bdev, handle->holder);
 
 	/*
 	 * Trigger event checking and tell drivers to flush MEDIA_CHANGE
@@ -947,12 +919,6 @@ void blkdev_put(struct block_device *bdev, void *holder)
 
 	module_put(disk->fops->owner);
 	blkdev_put_no_open(bdev);
-}
-EXPORT_SYMBOL(blkdev_put);
-
-void bdev_release(struct bdev_handle *handle)
-{
-	blkdev_put(handle->bdev, handle->holder);
 	kfree(handle);
 }
 EXPORT_SYMBOL(bdev_release);
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index abf71cce785c..7afc10315dd5 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -1500,10 +1500,6 @@ struct bdev_handle {
 	blk_mode_t mode;
 };
 
-struct block_device *blkdev_get_by_dev(dev_t dev, blk_mode_t mode, void *holder,
-		const struct blk_holder_ops *hops);
-struct block_device *blkdev_get_by_path(const char *path, blk_mode_t mode,
-		void *holder, const struct blk_holder_ops *hops);
 struct bdev_handle *bdev_open_by_dev(dev_t dev, blk_mode_t mode, void *holder,
 		const struct blk_holder_ops *hops);
 struct bdev_handle *bdev_open_by_path(const char *path, blk_mode_t mode,
@@ -1511,7 +1507,6 @@ struct bdev_handle *bdev_open_by_path(const char *path, blk_mode_t mode,
 int bd_prepare_to_claim(struct block_device *bdev, void *holder,
 		const struct blk_holder_ops *hops);
 void bd_abort_claiming(struct block_device *bdev, void *holder);
-void blkdev_put(struct block_device *bdev, void *holder);
 void bdev_release(struct bdev_handle *handle);
 
 /* just for blk-cgroup, don't use elsewhere */
-- 
cgit v1.2.3


From ed5cc702d311c14b653323d76062b0294effa66e Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Wed, 1 Nov 2023 18:43:08 +0100
Subject: block: Add config option to not allow writing to mounted devices

Writing to mounted devices is dangerous and can lead to filesystem
corruption as well as crashes. Furthermore syzbot comes with more and
more involved examples how to corrupt block device under a mounted
filesystem leading to kernel crashes and reports we can do nothing
about. Add tracking of writers to each block device and a kernel cmdline
argument which controls whether other writeable opens to block devices
open with BLK_OPEN_RESTRICT_WRITES flag are allowed. We will make
filesystems use this flag for used devices.

Note that this effectively only prevents modification of the particular
block device's page cache by other writers. The actual device content
can still be modified by other means - e.g. by issuing direct scsi
commands, by doing writes through devices lower in the storage stack
(e.g. in case loop devices, DM, or MD are involved) etc. But blocking
direct modifications of the block device page cache is enough to give
filesystems a chance to perform data validation when loading data from
the underlying storage and thus prevent kernel crashes.

Syzbot can use this cmdline argument option to avoid uninteresting
crashes. Also users whose userspace setup does not need writing to
mounted block devices can set this option for hardening.

Link: https://lore.kernel.org/all/60788e5d-5c7c-1142-e554-c21d709acfd9@linaro.org
Signed-off-by: Jan Kara <jack@suse.cz>
Link: https://lore.kernel.org/r/20231101174325.10596-3-jack@suse.cz
Reviewed-by: Jens Axboe <axboe@kernel.dk>
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 block/Kconfig             | 20 +++++++++++++
 block/bdev.c              | 75 ++++++++++++++++++++++++++++++++++++++++++++++-
 include/linux/blk_types.h |  1 +
 include/linux/blkdev.h    |  2 ++
 4 files changed, 97 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/block/Kconfig b/block/Kconfig
index 55ae2286a4de..1de4682d48cc 100644
--- a/block/Kconfig
+++ b/block/Kconfig
@@ -78,6 +78,26 @@ config BLK_DEV_INTEGRITY_T10
 	select CRC_T10DIF
 	select CRC64_ROCKSOFT
 
+config BLK_DEV_WRITE_MOUNTED
+	bool "Allow writing to mounted block devices"
+	default y
+	help
+	When a block device is mounted, writing to its buffer cache is very
+	likely going to cause filesystem corruption. It is also rather easy to
+	crash the kernel in this way since the filesystem has no practical way
+	of detecting these writes to buffer cache and verifying its metadata
+	integrity. However there are some setups that need this capability
+	like running fsck on read-only mounted root device, modifying some
+	features on mounted ext4 filesystem, and similar. If you say N, the
+	kernel will prevent processes from writing to block devices that are
+	mounted by filesystems which provides some more protection from runaway
+	privileged processes and generally makes it much harder to crash
+	filesystem drivers. Note however that this does not prevent
+	underlying device(s) from being modified by other means, e.g. by
+	directly submitting SCSI commands or through access to lower layers of
+	storage stack. If in doubt, say Y. The configuration can be overridden
+	with the bdev_allow_write_mounted boot option.
+
 config BLK_DEV_ZONED
 	bool "Zoned block device support"
 	select MQ_IOSCHED_DEADLINE
diff --git a/block/bdev.c b/block/bdev.c
index 0daf14e50dd0..fc8d28d77495 100644
--- a/block/bdev.c
+++ b/block/bdev.c
@@ -30,6 +30,9 @@
 #include "../fs/internal.h"
 #include "blk.h"
 
+/* Should we allow writing to mounted block devices? */
+static bool bdev_allow_write_mounted = IS_ENABLED(CONFIG_BLK_DEV_WRITE_MOUNTED);
+
 struct bdev_inode {
 	struct block_device bdev;
 	struct inode vfs_inode;
@@ -730,7 +733,58 @@ void blkdev_put_no_open(struct block_device *bdev)
 {
 	put_device(&bdev->bd_device);
 }
-	
+
+static bool bdev_writes_blocked(struct block_device *bdev)
+{
+	return bdev->bd_writers == -1;
+}
+
+static void bdev_block_writes(struct block_device *bdev)
+{
+	bdev->bd_writers = -1;
+}
+
+static void bdev_unblock_writes(struct block_device *bdev)
+{
+	bdev->bd_writers = 0;
+}
+
+static bool bdev_may_open(struct block_device *bdev, blk_mode_t mode)
+{
+	if (bdev_allow_write_mounted)
+		return true;
+	/* Writes blocked? */
+	if (mode & BLK_OPEN_WRITE && bdev_writes_blocked(bdev))
+		return false;
+	if (mode & BLK_OPEN_RESTRICT_WRITES && bdev->bd_writers > 0)
+		return false;
+	return true;
+}
+
+static void bdev_claim_write_access(struct block_device *bdev, blk_mode_t mode)
+{
+	if (bdev_allow_write_mounted)
+		return;
+
+	/* Claim exclusive or shared write access. */
+	if (mode & BLK_OPEN_RESTRICT_WRITES)
+		bdev_block_writes(bdev);
+	else if (mode & BLK_OPEN_WRITE)
+		bdev->bd_writers++;
+}
+
+static void bdev_yield_write_access(struct block_device *bdev, blk_mode_t mode)
+{
+	if (bdev_allow_write_mounted)
+		return;
+
+	/* Yield exclusive or shared write access. */
+	if (mode & BLK_OPEN_RESTRICT_WRITES)
+		bdev_unblock_writes(bdev);
+	else if (mode & BLK_OPEN_WRITE)
+		bdev->bd_writers--;
+}
+
 /**
  * bdev_open_by_dev - open a block device by device number
  * @dev: device number of block device to open
@@ -773,6 +827,10 @@ struct bdev_handle *bdev_open_by_dev(dev_t dev, blk_mode_t mode, void *holder,
 	if (ret)
 		goto free_handle;
 
+	/* Blocking writes requires exclusive opener */
+	if (mode & BLK_OPEN_RESTRICT_WRITES && !holder)
+		return ERR_PTR(-EINVAL);
+
 	bdev = blkdev_get_no_open(dev);
 	if (!bdev) {
 		ret = -ENXIO;
@@ -800,12 +858,16 @@ struct bdev_handle *bdev_open_by_dev(dev_t dev, blk_mode_t mode, void *holder,
 		goto abort_claiming;
 	if (!try_module_get(disk->fops->owner))
 		goto abort_claiming;
+	ret = -EBUSY;
+	if (!bdev_may_open(bdev, mode))
+		goto abort_claiming;
 	if (bdev_is_partition(bdev))
 		ret = blkdev_get_part(bdev, mode);
 	else
 		ret = blkdev_get_whole(bdev, mode);
 	if (ret)
 		goto put_module;
+	bdev_claim_write_access(bdev, mode);
 	if (holder) {
 		bd_finish_claiming(bdev, holder, hops);
 
@@ -901,6 +963,8 @@ void bdev_release(struct bdev_handle *handle)
 		sync_blockdev(bdev);
 
 	mutex_lock(&disk->open_mutex);
+	bdev_yield_write_access(bdev, handle->mode);
+
 	if (handle->holder)
 		bd_end_claim(bdev, handle->holder);
 
@@ -1069,3 +1133,12 @@ void bdev_statx_dioalign(struct inode *inode, struct kstat *stat)
 
 	blkdev_put_no_open(bdev);
 }
+
+static int __init setup_bdev_allow_write_mounted(char *str)
+{
+	if (kstrtobool(str, &bdev_allow_write_mounted))
+		pr_warn("Invalid option string for bdev_allow_write_mounted:"
+			" '%s'\n", str);
+	return 1;
+}
+__setup("bdev_allow_write_mounted=", setup_bdev_allow_write_mounted);
diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
index 749203277fee..52e264d5a830 100644
--- a/include/linux/blk_types.h
+++ b/include/linux/blk_types.h
@@ -66,6 +66,7 @@ struct block_device {
 #ifdef CONFIG_FAIL_MAKE_REQUEST
 	bool			bd_make_it_fail;
 #endif
+	int			bd_writers;
 	/*
 	 * keep this out-of-line as it's both big and not needed in the fast
 	 * path
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 7afc10315dd5..0e0c0186aa32 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -124,6 +124,8 @@ typedef unsigned int __bitwise blk_mode_t;
 #define BLK_OPEN_NDELAY		((__force blk_mode_t)(1 << 3))
 /* open for "writes" only for ioctls (specialy hack for floppy.c) */
 #define BLK_OPEN_WRITE_IOCTL	((__force blk_mode_t)(1 << 4))
+/* open is exclusive wrt all other BLK_OPEN_WRITE opens to the device */
+#define BLK_OPEN_RESTRICT_WRITES	((__force blk_mode_t)(1 << 5))
 
 struct gendisk {
 	/*
-- 
cgit v1.2.3


From 6f861765464f43a71462d52026fbddfc858239a5 Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Wed, 1 Nov 2023 18:43:10 +0100
Subject: fs: Block writes to mounted block devices

Ask block layer to block writes to block devices mounted by filesystems.

Signed-off-by: Jan Kara <jack@suse.cz>
Link: https://lore.kernel.org/r/20231101174325.10596-5-jack@suse.cz
Reviewed-by: Christian Brauner <brauner@kernel.org>
Reviewed-by: Jens Axboe <axboe@kernel.dk>
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 include/linux/blkdev.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 0e0c0186aa32..9f6c3373f9fc 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -1494,7 +1494,8 @@ extern const struct blk_holder_ops fs_holder_ops;
  * as stored in sb->s_flags.
  */
 #define sb_open_mode(flags) \
-	(BLK_OPEN_READ | (((flags) & SB_RDONLY) ? 0 : BLK_OPEN_WRITE))
+	(BLK_OPEN_READ | BLK_OPEN_RESTRICT_WRITES | \
+	 (((flags) & SB_RDONLY) ? 0 : BLK_OPEN_WRITE))
 
 struct bdev_handle {
 	struct block_device *bdev;
-- 
cgit v1.2.3


From db3db63b1d17c98f69e894edaa2b0b364ecde7a9 Mon Sep 17 00:00:00 2001
From: Mateusz Guzik <mjguzik@gmail.com>
Date: Sat, 4 Nov 2023 23:11:17 +0100
Subject: vfs: remove a redundant might_sleep in wait_on_inode

wait_on_bit already does it.

Signed-off-by: Mateusz Guzik <mjguzik@gmail.com>
Link: https://lore.kernel.org/r/20231104221117.2584708-1-mjguzik@gmail.com
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 include/linux/writeback.h | 1 -
 1 file changed, 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/writeback.h b/include/linux/writeback.h
index 083387c00f0c..6d0a14f7019d 100644
--- a/include/linux/writeback.h
+++ b/include/linux/writeback.h
@@ -193,7 +193,6 @@ void inode_io_list_del(struct inode *inode);
 /* writeback.h requires fs.h; it, too, is not included from here. */
 static inline void wait_on_inode(struct inode *inode)
 {
-	might_sleep();
 	wait_on_bit(&inode->i_state, __I_NEW, TASK_UNINTERRUPTIBLE);
 }
 
-- 
cgit v1.2.3


From 446e2305827b76e8081057ce56bbd2703b4da8a9 Mon Sep 17 00:00:00 2001
From: Kory Maincent <kory.maincent@bootlin.com>
Date: Tue, 14 Nov 2023 12:28:29 +0100
Subject: net: Convert PHYs hwtstamp callback to use kernel_hwtstamp_config

The PHYs hwtstamp callback are still getting the timestamp config from
ifreq and using copy_from/to_user.
Get rid of these functions by using timestamp configuration in parameter.
This also allow to move on to kernel_hwtstamp_config and be similar to
net devices using the new ndo_hwstamp_get/set.

This adds the possibility to manipulate the timestamp configuration
from the kernel which was not possible with the copy_from/to_user.

Signed-off-by: Kory Maincent <kory.maincent@bootlin.com>
Reviewed-by: Florian Fainelli <florian.fainelli@broadcom.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/phy/bcm-phy-ptp.c     | 15 ++++++---------
 drivers/net/phy/dp83640.c         | 24 +++++++++++-------------
 drivers/net/phy/micrel.c          | 38 +++++++++++++++++---------------------
 drivers/net/phy/mscc/mscc_ptp.c   | 18 ++++++++----------
 drivers/net/phy/nxp-c45-tja11xx.c | 17 +++++++----------
 drivers/net/phy/phy.c             | 21 +++++++++++++++++++--
 drivers/ptp/ptp_ines.c            | 16 +++++++---------
 include/linux/mii_timestamper.h   |  4 +++-
 include/linux/phy.h               |  6 ++++--
 9 files changed, 82 insertions(+), 77 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/phy/bcm-phy-ptp.c b/drivers/net/phy/bcm-phy-ptp.c
index cb4b91af5e17..617d384d4551 100644
--- a/drivers/net/phy/bcm-phy-ptp.c
+++ b/drivers/net/phy/bcm-phy-ptp.c
@@ -782,16 +782,13 @@ out:
 }
 
 static int bcm_ptp_hwtstamp(struct mii_timestamper *mii_ts,
-			    struct ifreq *ifr)
+			    struct kernel_hwtstamp_config *cfg,
+			    struct netlink_ext_ack *extack)
 {
 	struct bcm_ptp_private *priv = mii2priv(mii_ts);
-	struct hwtstamp_config cfg;
 	u16 mode, ctrl;
 
-	if (copy_from_user(&cfg, ifr->ifr_data, sizeof(cfg)))
-		return -EFAULT;
-
-	switch (cfg.rx_filter) {
+	switch (cfg->rx_filter) {
 	case HWTSTAMP_FILTER_NONE:
 		priv->hwts_rx = false;
 		break;
@@ -804,14 +801,14 @@ static int bcm_ptp_hwtstamp(struct mii_timestamper *mii_ts,
 	case HWTSTAMP_FILTER_PTP_V2_EVENT:
 	case HWTSTAMP_FILTER_PTP_V2_SYNC:
 	case HWTSTAMP_FILTER_PTP_V2_DELAY_REQ:
-		cfg.rx_filter = HWTSTAMP_FILTER_PTP_V2_EVENT;
+		cfg->rx_filter = HWTSTAMP_FILTER_PTP_V2_EVENT;
 		priv->hwts_rx = true;
 		break;
 	default:
 		return -ERANGE;
 	}
 
-	priv->tx_type = cfg.tx_type;
+	priv->tx_type = cfg->tx_type;
 
 	ctrl  = priv->hwts_rx ? SLICE_RX_EN : 0;
 	ctrl |= priv->tx_type != HWTSTAMP_TX_OFF ? SLICE_TX_EN : 0;
@@ -840,7 +837,7 @@ static int bcm_ptp_hwtstamp(struct mii_timestamper *mii_ts,
 	/* purge existing data */
 	skb_queue_purge(&priv->tx_queue);
 
-	return copy_to_user(ifr->ifr_data, &cfg, sizeof(cfg)) ? -EFAULT : 0;
+	return 0;
 }
 
 static int bcm_ptp_ts_info(struct mii_timestamper *mii_ts,
diff --git a/drivers/net/phy/dp83640.c b/drivers/net/phy/dp83640.c
index 2657be7cc049..5c42c47dc564 100644
--- a/drivers/net/phy/dp83640.c
+++ b/drivers/net/phy/dp83640.c
@@ -1207,22 +1207,20 @@ static irqreturn_t dp83640_handle_interrupt(struct phy_device *phydev)
 	return IRQ_HANDLED;
 }
 
-static int dp83640_hwtstamp(struct mii_timestamper *mii_ts, struct ifreq *ifr)
+static int dp83640_hwtstamp(struct mii_timestamper *mii_ts,
+			    struct kernel_hwtstamp_config *cfg,
+			    struct netlink_ext_ack *extack)
 {
 	struct dp83640_private *dp83640 =
 		container_of(mii_ts, struct dp83640_private, mii_ts);
-	struct hwtstamp_config cfg;
 	u16 txcfg0, rxcfg0;
 
-	if (copy_from_user(&cfg, ifr->ifr_data, sizeof(cfg)))
-		return -EFAULT;
-
-	if (cfg.tx_type < 0 || cfg.tx_type > HWTSTAMP_TX_ONESTEP_SYNC)
+	if (cfg->tx_type < 0 || cfg->tx_type > HWTSTAMP_TX_ONESTEP_SYNC)
 		return -ERANGE;
 
-	dp83640->hwts_tx_en = cfg.tx_type;
+	dp83640->hwts_tx_en = cfg->tx_type;
 
-	switch (cfg.rx_filter) {
+	switch (cfg->rx_filter) {
 	case HWTSTAMP_FILTER_NONE:
 		dp83640->hwts_rx_en = 0;
 		dp83640->layer = 0;
@@ -1234,7 +1232,7 @@ static int dp83640_hwtstamp(struct mii_timestamper *mii_ts, struct ifreq *ifr)
 		dp83640->hwts_rx_en = 1;
 		dp83640->layer = PTP_CLASS_L4;
 		dp83640->version = PTP_CLASS_V1;
-		cfg.rx_filter = HWTSTAMP_FILTER_PTP_V1_L4_EVENT;
+		cfg->rx_filter = HWTSTAMP_FILTER_PTP_V1_L4_EVENT;
 		break;
 	case HWTSTAMP_FILTER_PTP_V2_L4_EVENT:
 	case HWTSTAMP_FILTER_PTP_V2_L4_SYNC:
@@ -1242,7 +1240,7 @@ static int dp83640_hwtstamp(struct mii_timestamper *mii_ts, struct ifreq *ifr)
 		dp83640->hwts_rx_en = 1;
 		dp83640->layer = PTP_CLASS_L4;
 		dp83640->version = PTP_CLASS_V2;
-		cfg.rx_filter = HWTSTAMP_FILTER_PTP_V2_L4_EVENT;
+		cfg->rx_filter = HWTSTAMP_FILTER_PTP_V2_L4_EVENT;
 		break;
 	case HWTSTAMP_FILTER_PTP_V2_L2_EVENT:
 	case HWTSTAMP_FILTER_PTP_V2_L2_SYNC:
@@ -1250,7 +1248,7 @@ static int dp83640_hwtstamp(struct mii_timestamper *mii_ts, struct ifreq *ifr)
 		dp83640->hwts_rx_en = 1;
 		dp83640->layer = PTP_CLASS_L2;
 		dp83640->version = PTP_CLASS_V2;
-		cfg.rx_filter = HWTSTAMP_FILTER_PTP_V2_L2_EVENT;
+		cfg->rx_filter = HWTSTAMP_FILTER_PTP_V2_L2_EVENT;
 		break;
 	case HWTSTAMP_FILTER_PTP_V2_EVENT:
 	case HWTSTAMP_FILTER_PTP_V2_SYNC:
@@ -1258,7 +1256,7 @@ static int dp83640_hwtstamp(struct mii_timestamper *mii_ts, struct ifreq *ifr)
 		dp83640->hwts_rx_en = 1;
 		dp83640->layer = PTP_CLASS_L4 | PTP_CLASS_L2;
 		dp83640->version = PTP_CLASS_V2;
-		cfg.rx_filter = HWTSTAMP_FILTER_PTP_V2_EVENT;
+		cfg->rx_filter = HWTSTAMP_FILTER_PTP_V2_EVENT;
 		break;
 	default:
 		return -ERANGE;
@@ -1292,7 +1290,7 @@ static int dp83640_hwtstamp(struct mii_timestamper *mii_ts, struct ifreq *ifr)
 
 	mutex_unlock(&dp83640->clock->extreg_lock);
 
-	return copy_to_user(ifr->ifr_data, &cfg, sizeof(cfg)) ? -EFAULT : 0;
+	return 0;
 }
 
 static void rx_timestamp_work(struct work_struct *work)
diff --git a/drivers/net/phy/micrel.c b/drivers/net/phy/micrel.c
index 08e3915001c3..99af1e500c6c 100644
--- a/drivers/net/phy/micrel.c
+++ b/drivers/net/phy/micrel.c
@@ -2395,24 +2395,22 @@ static void lan8814_flush_fifo(struct phy_device *phydev, bool egress)
 	lanphy_read_page_reg(phydev, 5, PTP_TSU_INT_STS);
 }
 
-static int lan8814_hwtstamp(struct mii_timestamper *mii_ts, struct ifreq *ifr)
+static int lan8814_hwtstamp(struct mii_timestamper *mii_ts,
+			    struct kernel_hwtstamp_config *config,
+			    struct netlink_ext_ack *extack)
 {
 	struct kszphy_ptp_priv *ptp_priv =
 			  container_of(mii_ts, struct kszphy_ptp_priv, mii_ts);
 	struct phy_device *phydev = ptp_priv->phydev;
 	struct lan8814_shared_priv *shared = phydev->shared->priv;
 	struct lan8814_ptp_rx_ts *rx_ts, *tmp;
-	struct hwtstamp_config config;
 	int txcfg = 0, rxcfg = 0;
 	int pkt_ts_enable;
 
-	if (copy_from_user(&config, ifr->ifr_data, sizeof(config)))
-		return -EFAULT;
+	ptp_priv->hwts_tx_type = config->tx_type;
+	ptp_priv->rx_filter = config->rx_filter;
 
-	ptp_priv->hwts_tx_type = config.tx_type;
-	ptp_priv->rx_filter = config.rx_filter;
-
-	switch (config.rx_filter) {
+	switch (config->rx_filter) {
 	case HWTSTAMP_FILTER_NONE:
 		ptp_priv->layer = 0;
 		ptp_priv->version = 0;
@@ -2458,13 +2456,13 @@ static int lan8814_hwtstamp(struct mii_timestamper *mii_ts, struct ifreq *ifr)
 		lanphy_write_page_reg(ptp_priv->phydev, 5, PTP_TX_MOD,
 				      PTP_TX_MOD_TX_PTP_SYNC_TS_INSERT_);
 
-	if (config.rx_filter != HWTSTAMP_FILTER_NONE)
+	if (config->rx_filter != HWTSTAMP_FILTER_NONE)
 		lan8814_config_ts_intr(ptp_priv->phydev, true);
 	else
 		lan8814_config_ts_intr(ptp_priv->phydev, false);
 
 	mutex_lock(&shared->shared_lock);
-	if (config.rx_filter != HWTSTAMP_FILTER_NONE)
+	if (config->rx_filter != HWTSTAMP_FILTER_NONE)
 		shared->ref++;
 	else
 		shared->ref--;
@@ -2488,7 +2486,7 @@ static int lan8814_hwtstamp(struct mii_timestamper *mii_ts, struct ifreq *ifr)
 	lan8814_flush_fifo(ptp_priv->phydev, false);
 	lan8814_flush_fifo(ptp_priv->phydev, true);
 
-	return copy_to_user(ifr->ifr_data, &config, sizeof(config)) ? -EFAULT : 0;
+	return 0;
 }
 
 static void lan8814_txtstamp(struct mii_timestamper *mii_ts,
@@ -3703,21 +3701,19 @@ static void lan8841_ptp_enable_processing(struct kszphy_ptp_priv *ptp_priv,
 #define LAN8841_PTP_TX_TIMESTAMP_EN		443
 #define LAN8841_PTP_TX_MOD			445
 
-static int lan8841_hwtstamp(struct mii_timestamper *mii_ts, struct ifreq *ifr)
+static int lan8841_hwtstamp(struct mii_timestamper *mii_ts,
+			    struct kernel_hwtstamp_config *config,
+			    struct netlink_ext_ack *extack)
 {
 	struct kszphy_ptp_priv *ptp_priv = container_of(mii_ts, struct kszphy_ptp_priv, mii_ts);
 	struct phy_device *phydev = ptp_priv->phydev;
-	struct hwtstamp_config config;
 	int txcfg = 0, rxcfg = 0;
 	int pkt_ts_enable;
 
-	if (copy_from_user(&config, ifr->ifr_data, sizeof(config)))
-		return -EFAULT;
+	ptp_priv->hwts_tx_type = config->tx_type;
+	ptp_priv->rx_filter = config->rx_filter;
 
-	ptp_priv->hwts_tx_type = config.tx_type;
-	ptp_priv->rx_filter = config.rx_filter;
-
-	switch (config.rx_filter) {
+	switch (config->rx_filter) {
 	case HWTSTAMP_FILTER_NONE:
 		ptp_priv->layer = 0;
 		ptp_priv->version = 0;
@@ -3771,13 +3767,13 @@ static int lan8841_hwtstamp(struct mii_timestamper *mii_ts, struct ifreq *ifr)
 
 	/* Now enable/disable the timestamping */
 	lan8841_ptp_enable_processing(ptp_priv,
-				      config.rx_filter != HWTSTAMP_FILTER_NONE);
+				      config->rx_filter != HWTSTAMP_FILTER_NONE);
 
 	skb_queue_purge(&ptp_priv->tx_queue);
 
 	lan8841_ptp_flush_fifo(ptp_priv);
 
-	return copy_to_user(ifr->ifr_data, &config, sizeof(config)) ? -EFAULT : 0;
+	return 0;
 }
 
 static bool lan8841_rxtstamp(struct mii_timestamper *mii_ts,
diff --git a/drivers/net/phy/mscc/mscc_ptp.c b/drivers/net/phy/mscc/mscc_ptp.c
index cf728bfd83e2..eb0b032cb613 100644
--- a/drivers/net/phy/mscc/mscc_ptp.c
+++ b/drivers/net/phy/mscc/mscc_ptp.c
@@ -1045,19 +1045,17 @@ static void vsc85xx_ts_reset_fifo(struct phy_device *phydev)
 			     val);
 }
 
-static int vsc85xx_hwtstamp(struct mii_timestamper *mii_ts, struct ifreq *ifr)
+static int vsc85xx_hwtstamp(struct mii_timestamper *mii_ts,
+			    struct kernel_hwtstamp_config *cfg,
+			    struct netlink_ext_ack *extack)
 {
 	struct vsc8531_private *vsc8531 =
 		container_of(mii_ts, struct vsc8531_private, mii_ts);
 	struct phy_device *phydev = vsc8531->ptp->phydev;
-	struct hwtstamp_config cfg;
 	bool one_step = false;
 	u32 val;
 
-	if (copy_from_user(&cfg, ifr->ifr_data, sizeof(cfg)))
-		return -EFAULT;
-
-	switch (cfg.tx_type) {
+	switch (cfg->tx_type) {
 	case HWTSTAMP_TX_ONESTEP_SYNC:
 		one_step = true;
 		break;
@@ -1069,9 +1067,9 @@ static int vsc85xx_hwtstamp(struct mii_timestamper *mii_ts, struct ifreq *ifr)
 		return -ERANGE;
 	}
 
-	vsc8531->ptp->tx_type = cfg.tx_type;
+	vsc8531->ptp->tx_type = cfg->tx_type;
 
-	switch (cfg.rx_filter) {
+	switch (cfg->rx_filter) {
 	case HWTSTAMP_FILTER_NONE:
 		break;
 	case HWTSTAMP_FILTER_PTP_V2_L4_EVENT:
@@ -1084,7 +1082,7 @@ static int vsc85xx_hwtstamp(struct mii_timestamper *mii_ts, struct ifreq *ifr)
 		return -ERANGE;
 	}
 
-	vsc8531->ptp->rx_filter = cfg.rx_filter;
+	vsc8531->ptp->rx_filter = cfg->rx_filter;
 
 	mutex_lock(&vsc8531->ts_lock);
 
@@ -1132,7 +1130,7 @@ static int vsc85xx_hwtstamp(struct mii_timestamper *mii_ts, struct ifreq *ifr)
 	vsc8531->ptp->configured = 1;
 	mutex_unlock(&vsc8531->ts_lock);
 
-	return copy_to_user(ifr->ifr_data, &cfg, sizeof(cfg)) ? -EFAULT : 0;
+	return 0;
 }
 
 static int vsc85xx_ts_info(struct mii_timestamper *mii_ts,
diff --git a/drivers/net/phy/nxp-c45-tja11xx.c b/drivers/net/phy/nxp-c45-tja11xx.c
index 7ab080ff02df..780ad353cf55 100644
--- a/drivers/net/phy/nxp-c45-tja11xx.c
+++ b/drivers/net/phy/nxp-c45-tja11xx.c
@@ -1022,24 +1022,21 @@ static bool nxp_c45_rxtstamp(struct mii_timestamper *mii_ts,
 }
 
 static int nxp_c45_hwtstamp(struct mii_timestamper *mii_ts,
-			    struct ifreq *ifreq)
+			    struct kernel_hwtstamp_config *cfg,
+			    struct netlink_ext_ack *extack)
 {
 	struct nxp_c45_phy *priv = container_of(mii_ts, struct nxp_c45_phy,
 						mii_ts);
 	struct phy_device *phydev = priv->phydev;
 	const struct nxp_c45_phy_data *data;
-	struct hwtstamp_config cfg;
 
-	if (copy_from_user(&cfg, ifreq->ifr_data, sizeof(cfg)))
-		return -EFAULT;
-
-	if (cfg.tx_type < 0 || cfg.tx_type > HWTSTAMP_TX_ON)
+	if (cfg->tx_type < 0 || cfg->tx_type > HWTSTAMP_TX_ON)
 		return -ERANGE;
 
 	data = nxp_c45_get_data(phydev);
-	priv->hwts_tx = cfg.tx_type;
+	priv->hwts_tx = cfg->tx_type;
 
-	switch (cfg.rx_filter) {
+	switch (cfg->rx_filter) {
 	case HWTSTAMP_FILTER_NONE:
 		priv->hwts_rx = 0;
 		break;
@@ -1047,7 +1044,7 @@ static int nxp_c45_hwtstamp(struct mii_timestamper *mii_ts,
 	case HWTSTAMP_FILTER_PTP_V2_L2_SYNC:
 	case HWTSTAMP_FILTER_PTP_V2_L2_DELAY_REQ:
 		priv->hwts_rx = 1;
-		cfg.rx_filter = HWTSTAMP_FILTER_PTP_V2_L2_EVENT;
+		cfg->rx_filter = HWTSTAMP_FILTER_PTP_V2_L2_EVENT;
 		break;
 	default:
 		return -ERANGE;
@@ -1074,7 +1071,7 @@ static int nxp_c45_hwtstamp(struct mii_timestamper *mii_ts,
 		nxp_c45_clear_reg_field(phydev, &data->regmap->irq_egr_ts_en);
 
 nxp_c45_no_ptp_irq:
-	return copy_to_user(ifreq->ifr_data, &cfg, sizeof(cfg)) ? -EFAULT : 0;
+	return 0;
 }
 
 static int nxp_c45_ts_info(struct mii_timestamper *mii_ts,
diff --git a/drivers/net/phy/phy.c b/drivers/net/phy/phy.c
index a5fa077650e8..d058316666ba 100644
--- a/drivers/net/phy/phy.c
+++ b/drivers/net/phy/phy.c
@@ -325,9 +325,13 @@ EXPORT_SYMBOL(phy_ethtool_ksettings_get);
 int phy_mii_ioctl(struct phy_device *phydev, struct ifreq *ifr, int cmd)
 {
 	struct mii_ioctl_data *mii_data = if_mii(ifr);
+	struct kernel_hwtstamp_config kernel_cfg;
+	struct netlink_ext_ack extack = {};
 	u16 val = mii_data->val_in;
 	bool change_autoneg = false;
+	struct hwtstamp_config cfg;
 	int prtad, devad;
+	int ret;
 
 	switch (cmd) {
 	case SIOCGMIIPHY:
@@ -411,8 +415,21 @@ int phy_mii_ioctl(struct phy_device *phydev, struct ifreq *ifr, int cmd)
 		return 0;
 
 	case SIOCSHWTSTAMP:
-		if (phydev->mii_ts && phydev->mii_ts->hwtstamp)
-			return phydev->mii_ts->hwtstamp(phydev->mii_ts, ifr);
+		if (phydev->mii_ts && phydev->mii_ts->hwtstamp) {
+			if (copy_from_user(&cfg, ifr->ifr_data, sizeof(cfg)))
+				return -EFAULT;
+
+			hwtstamp_config_to_kernel(&kernel_cfg, &cfg);
+			ret = phydev->mii_ts->hwtstamp(phydev->mii_ts, &kernel_cfg, &extack);
+			if (ret)
+				return ret;
+
+			hwtstamp_config_from_kernel(&cfg, &kernel_cfg);
+			if (copy_to_user(ifr->ifr_data, &cfg, sizeof(cfg)))
+				return -EFAULT;
+
+			return 0;
+		}
 		fallthrough;
 
 	default:
diff --git a/drivers/ptp/ptp_ines.c b/drivers/ptp/ptp_ines.c
index ed215b458183..1d2940a78455 100644
--- a/drivers/ptp/ptp_ines.c
+++ b/drivers/ptp/ptp_ines.c
@@ -328,17 +328,15 @@ static u64 ines_find_txts(struct ines_port *port, struct sk_buff *skb)
 	return ns;
 }
 
-static int ines_hwtstamp(struct mii_timestamper *mii_ts, struct ifreq *ifr)
+static int ines_hwtstamp(struct mii_timestamper *mii_ts,
+			 struct kernel_hwtstamp_config *cfg,
+			 struct netlink_ext_ack *extack)
 {
 	struct ines_port *port = container_of(mii_ts, struct ines_port, mii_ts);
 	u32 cm_one_step = 0, port_conf, ts_stat_rx, ts_stat_tx;
-	struct hwtstamp_config cfg;
 	unsigned long flags;
 
-	if (copy_from_user(&cfg, ifr->ifr_data, sizeof(cfg)))
-		return -EFAULT;
-
-	switch (cfg.tx_type) {
+	switch (cfg->tx_type) {
 	case HWTSTAMP_TX_OFF:
 		ts_stat_tx = 0;
 		break;
@@ -353,7 +351,7 @@ static int ines_hwtstamp(struct mii_timestamper *mii_ts, struct ifreq *ifr)
 		return -ERANGE;
 	}
 
-	switch (cfg.rx_filter) {
+	switch (cfg->rx_filter) {
 	case HWTSTAMP_FILTER_NONE:
 		ts_stat_rx = 0;
 		break;
@@ -372,7 +370,7 @@ static int ines_hwtstamp(struct mii_timestamper *mii_ts, struct ifreq *ifr)
 	case HWTSTAMP_FILTER_PTP_V2_SYNC:
 	case HWTSTAMP_FILTER_PTP_V2_DELAY_REQ:
 		ts_stat_rx = TS_ENABLE;
-		cfg.rx_filter = HWTSTAMP_FILTER_PTP_V2_EVENT;
+		cfg->rx_filter = HWTSTAMP_FILTER_PTP_V2_EVENT;
 		break;
 	default:
 		return -ERANGE;
@@ -393,7 +391,7 @@ static int ines_hwtstamp(struct mii_timestamper *mii_ts, struct ifreq *ifr)
 
 	spin_unlock_irqrestore(&port->lock, flags);
 
-	return copy_to_user(ifr->ifr_data, &cfg, sizeof(cfg)) ? -EFAULT : 0;
+	return 0;
 }
 
 static void ines_link_state(struct mii_timestamper *mii_ts,
diff --git a/include/linux/mii_timestamper.h b/include/linux/mii_timestamper.h
index fa940bbaf8ae..26b04f73f214 100644
--- a/include/linux/mii_timestamper.h
+++ b/include/linux/mii_timestamper.h
@@ -9,6 +9,7 @@
 #include <linux/device.h>
 #include <linux/ethtool.h>
 #include <linux/skbuff.h>
+#include <linux/net_tstamp.h>
 
 struct phy_device;
 
@@ -51,7 +52,8 @@ struct mii_timestamper {
 			 struct sk_buff *skb, int type);
 
 	int  (*hwtstamp)(struct mii_timestamper *mii_ts,
-			 struct ifreq *ifreq);
+			 struct kernel_hwtstamp_config *kernel_config,
+			 struct netlink_ext_ack *extack);
 
 	void (*link_state)(struct mii_timestamper *mii_ts,
 			   struct phy_device *phydev);
diff --git a/include/linux/phy.h b/include/linux/phy.h
index 3cc52826f18e..e5f1f41e399c 100644
--- a/include/linux/phy.h
+++ b/include/linux/phy.h
@@ -1560,9 +1560,11 @@ static inline bool phy_has_txtstamp(struct phy_device *phydev)
 	return phydev && phydev->mii_ts && phydev->mii_ts->txtstamp;
 }
 
-static inline int phy_hwtstamp(struct phy_device *phydev, struct ifreq *ifr)
+static inline int phy_hwtstamp(struct phy_device *phydev,
+			       struct kernel_hwtstamp_config *cfg,
+			       struct netlink_ext_ack *extack)
 {
-	return phydev->mii_ts->hwtstamp(phydev->mii_ts, ifr);
+	return phydev->mii_ts->hwtstamp(phydev->mii_ts, cfg, extack);
 }
 
 static inline bool phy_rxtstamp(struct phy_device *phydev, struct sk_buff *skb,
-- 
cgit v1.2.3


From b8768dc4077712915f045ba1b198f521493c7914 Mon Sep 17 00:00:00 2001
From: Richard Cochran <richardcochran@gmail.com>
Date: Tue, 14 Nov 2023 12:28:31 +0100
Subject: net: ethtool: Refactor identical get_ts_info implementations.

The vlan, macvlan and the bonding drivers call their "real" device driver
in order to report the time stamping capabilities.  Provide a core
ethtool helper function to avoid copy/paste in the stack.

Signed-off-by: Richard Cochran <richardcochran@gmail.com>
Signed-off-by: Kory Maincent <kory.maincent@bootlin.com>
Reviewed-by: Florian Fainelli <florian.fainelli@broadcom.com>
Reviewed-by: Jay Vosburgh <jay.vosburgh@canonical.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/bonding/bond_main.c | 29 ++---------------------------
 drivers/net/macvlan.c           | 14 +-------------
 include/linux/ethtool.h         |  8 ++++++++
 net/8021q/vlan_dev.c            | 15 +--------------
 net/ethtool/common.c            |  6 ++++++
 5 files changed, 18 insertions(+), 54 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c
index 8e6cc0e133b7..4e0600c7b050 100644
--- a/drivers/net/bonding/bond_main.c
+++ b/drivers/net/bonding/bond_main.c
@@ -5755,10 +5755,8 @@ static int bond_ethtool_get_ts_info(struct net_device *bond_dev,
 {
 	struct bonding *bond = netdev_priv(bond_dev);
 	struct ethtool_ts_info ts_info;
-	const struct ethtool_ops *ops;
 	struct net_device *real_dev;
 	bool sw_tx_support = false;
-	struct phy_device *phydev;
 	struct list_head *iter;
 	struct slave *slave;
 	int ret = 0;
@@ -5769,29 +5767,12 @@ static int bond_ethtool_get_ts_info(struct net_device *bond_dev,
 	rcu_read_unlock();
 
 	if (real_dev) {
-		ops = real_dev->ethtool_ops;
-		phydev = real_dev->phydev;
-
-		if (phy_has_tsinfo(phydev)) {
-			ret = phy_ts_info(phydev, info);
-			goto out;
-		} else if (ops->get_ts_info) {
-			ret = ops->get_ts_info(real_dev, info);
-			goto out;
-		}
+		ret = ethtool_get_ts_info_by_layer(real_dev, info);
 	} else {
 		/* Check if all slaves support software tx timestamping */
 		rcu_read_lock();
 		bond_for_each_slave_rcu(bond, slave, iter) {
-			ret = -1;
-			ops = slave->dev->ethtool_ops;
-			phydev = slave->dev->phydev;
-
-			if (phy_has_tsinfo(phydev))
-				ret = phy_ts_info(phydev, &ts_info);
-			else if (ops->get_ts_info)
-				ret = ops->get_ts_info(slave->dev, &ts_info);
-
+			ret = ethtool_get_ts_info_by_layer(slave->dev, &ts_info);
 			if (!ret && (ts_info.so_timestamping & SOF_TIMESTAMPING_TX_SOFTWARE)) {
 				sw_tx_support = true;
 				continue;
@@ -5803,15 +5784,9 @@ static int bond_ethtool_get_ts_info(struct net_device *bond_dev,
 		rcu_read_unlock();
 	}
 
-	ret = 0;
-	info->so_timestamping = SOF_TIMESTAMPING_RX_SOFTWARE |
-				SOF_TIMESTAMPING_SOFTWARE;
 	if (sw_tx_support)
 		info->so_timestamping |= SOF_TIMESTAMPING_TX_SOFTWARE;
 
-	info->phc_index = -1;
-
-out:
 	dev_put(real_dev);
 	return ret;
 }
diff --git a/drivers/net/macvlan.c b/drivers/net/macvlan.c
index c8da94af4161..88dd8a2245b0 100644
--- a/drivers/net/macvlan.c
+++ b/drivers/net/macvlan.c
@@ -1086,20 +1086,8 @@ static int macvlan_ethtool_get_ts_info(struct net_device *dev,
 				       struct ethtool_ts_info *info)
 {
 	struct net_device *real_dev = macvlan_dev_real_dev(dev);
-	const struct ethtool_ops *ops = real_dev->ethtool_ops;
-	struct phy_device *phydev = real_dev->phydev;
 
-	if (phy_has_tsinfo(phydev)) {
-		return phy_ts_info(phydev, info);
-	} else if (ops->get_ts_info) {
-		return ops->get_ts_info(real_dev, info);
-	} else {
-		info->so_timestamping = SOF_TIMESTAMPING_RX_SOFTWARE |
-			SOF_TIMESTAMPING_SOFTWARE;
-		info->phc_index = -1;
-	}
-
-	return 0;
+	return ethtool_get_ts_info_by_layer(real_dev, info);
 }
 
 static netdev_features_t macvlan_fix_features(struct net_device *dev,
diff --git a/include/linux/ethtool.h b/include/linux/ethtool.h
index 689028257fcc..c2bb74143eda 100644
--- a/include/linux/ethtool.h
+++ b/include/linux/ethtool.h
@@ -1043,6 +1043,14 @@ static inline int ethtool_mm_frag_size_min_to_add(u32 val_min, u32 *val_add,
 	return -EINVAL;
 }
 
+/**
+ * ethtool_get_ts_info_by_layer - Obtains time stamping capabilities from the MAC or PHY layer.
+ * @dev: pointer to net_device structure
+ * @info: buffer to hold the result
+ * Returns zero on success, non-zero otherwise.
+ */
+int ethtool_get_ts_info_by_layer(struct net_device *dev, struct ethtool_ts_info *info);
+
 /**
  * ethtool_sprintf - Write formatted string to ethtool string data
  * @data: Pointer to a pointer to the start of string to update
diff --git a/net/8021q/vlan_dev.c b/net/8021q/vlan_dev.c
index 2a7f1b15714a..407b2335f091 100644
--- a/net/8021q/vlan_dev.c
+++ b/net/8021q/vlan_dev.c
@@ -702,20 +702,7 @@ static int vlan_ethtool_get_ts_info(struct net_device *dev,
 				    struct ethtool_ts_info *info)
 {
 	const struct vlan_dev_priv *vlan = vlan_dev_priv(dev);
-	const struct ethtool_ops *ops = vlan->real_dev->ethtool_ops;
-	struct phy_device *phydev = vlan->real_dev->phydev;
-
-	if (phy_has_tsinfo(phydev)) {
-		return phy_ts_info(phydev, info);
-	} else if (ops->get_ts_info) {
-		return ops->get_ts_info(vlan->real_dev, info);
-	} else {
-		info->so_timestamping = SOF_TIMESTAMPING_RX_SOFTWARE |
-			SOF_TIMESTAMPING_SOFTWARE;
-		info->phc_index = -1;
-	}
-
-	return 0;
+	return ethtool_get_ts_info_by_layer(vlan->real_dev, info);
 }
 
 static void vlan_dev_get_stats64(struct net_device *dev,
diff --git a/net/ethtool/common.c b/net/ethtool/common.c
index b4419fb6df6a..11d8797f63f6 100644
--- a/net/ethtool/common.c
+++ b/net/ethtool/common.c
@@ -661,6 +661,12 @@ int ethtool_get_phc_vclocks(struct net_device *dev, int **vclock_index)
 }
 EXPORT_SYMBOL(ethtool_get_phc_vclocks);
 
+int ethtool_get_ts_info_by_layer(struct net_device *dev, struct ethtool_ts_info *info)
+{
+	return __ethtool_get_ts_info(dev, info);
+}
+EXPORT_SYMBOL(ethtool_get_ts_info_by_layer);
+
 const struct ethtool_phy_ops *ethtool_phy_ops;
 
 void ethtool_set_ethtool_phy_ops(const struct ethtool_phy_ops *ops)
-- 
cgit v1.2.3


From 011dd3b3f83f9c89605c640424e05845b84f2dad Mon Sep 17 00:00:00 2001
From: Kory Maincent <kory.maincent@bootlin.com>
Date: Tue, 14 Nov 2023 12:28:33 +0100
Subject: net: Make dev_set_hwtstamp_phylib accessible

Make the dev_set_hwtstamp_phylib function accessible in prevision to use
it from ethtool to reset the tstamp current configuration.

Reviewed-by: Florian Fainelli <florian.fainelli@broadcom.com>
Signed-off-by: Kory Maincent <kory.maincent@bootlin.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netdevice.h | 3 +++
 net/core/dev_ioctl.c      | 7 ++++---
 2 files changed, 7 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index a16c9cc063fe..2d840d7056f2 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -3942,6 +3942,9 @@ int generic_hwtstamp_get_lower(struct net_device *dev,
 int generic_hwtstamp_set_lower(struct net_device *dev,
 			       struct kernel_hwtstamp_config *kernel_cfg,
 			       struct netlink_ext_ack *extack);
+int dev_set_hwtstamp_phylib(struct net_device *dev,
+			    struct kernel_hwtstamp_config *cfg,
+			    struct netlink_ext_ack *extack);
 int dev_ethtool(struct net *net, struct ifreq *ifr, void __user *userdata);
 unsigned int dev_get_flags(const struct net_device *);
 int __dev_change_flags(struct net_device *dev, unsigned int flags,
diff --git a/net/core/dev_ioctl.c b/net/core/dev_ioctl.c
index feeddf95f450..9a66cf5015f2 100644
--- a/net/core/dev_ioctl.c
+++ b/net/core/dev_ioctl.c
@@ -322,9 +322,9 @@ static int dev_get_hwtstamp(struct net_device *dev, struct ifreq *ifr)
  * frames and not forward them), it must set IFF_SEE_ALL_HWTSTAMP_REQUESTS in
  * dev->priv_flags.
  */
-static int dev_set_hwtstamp_phylib(struct net_device *dev,
-				   struct kernel_hwtstamp_config *cfg,
-				   struct netlink_ext_ack *extack)
+int dev_set_hwtstamp_phylib(struct net_device *dev,
+			    struct kernel_hwtstamp_config *cfg,
+			    struct netlink_ext_ack *extack)
 {
 	const struct net_device_ops *ops = dev->netdev_ops;
 	bool phy_ts = phy_has_hwtstamp(dev->phydev);
@@ -363,6 +363,7 @@ static int dev_set_hwtstamp_phylib(struct net_device *dev,
 
 	return 0;
 }
+EXPORT_SYMBOL_GPL(dev_set_hwtstamp_phylib);
 
 static int dev_set_hwtstamp(struct net_device *dev, struct ifreq *ifr)
 {
-- 
cgit v1.2.3


From 51bdf3165f012827644c474a6d905baa3de3f1ea Mon Sep 17 00:00:00 2001
From: Kory Maincent <kory.maincent@bootlin.com>
Date: Tue, 14 Nov 2023 12:28:40 +0100
Subject: net: Replace hwtstamp_source by timestamping layer

Replace hwtstamp_source which is only used by the kernel_hwtstamp_config
structure by the more widely use timestamp_layer structure. This is done
to prepare the support of selectable timestamping source.

Signed-off-by: Kory Maincent <kory.maincent@bootlin.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/microchip/lan966x/lan966x_main.c |  6 +++---
 include/linux/net_tstamp.h                            | 11 +++--------
 net/core/dev_ioctl.c                                  |  2 +-
 3 files changed, 7 insertions(+), 12 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/ethernet/microchip/lan966x/lan966x_main.c b/drivers/net/ethernet/microchip/lan966x/lan966x_main.c
index 2635ef8958c8..fbe56b1bb386 100644
--- a/drivers/net/ethernet/microchip/lan966x/lan966x_main.c
+++ b/drivers/net/ethernet/microchip/lan966x/lan966x_main.c
@@ -470,15 +470,15 @@ static int lan966x_port_hwtstamp_set(struct net_device *dev,
 	struct lan966x_port *port = netdev_priv(dev);
 	int err;
 
-	if (cfg->source != HWTSTAMP_SOURCE_NETDEV &&
-	    cfg->source != HWTSTAMP_SOURCE_PHYLIB)
+	if (cfg->source != MAC_TIMESTAMPING &&
+	    cfg->source != PHY_TIMESTAMPING)
 		return -EOPNOTSUPP;
 
 	err = lan966x_ptp_setup_traps(port, cfg);
 	if (err)
 		return err;
 
-	if (cfg->source == HWTSTAMP_SOURCE_NETDEV) {
+	if (cfg->source == MAC_TIMESTAMPING) {
 		if (!port->lan966x->ptp)
 			return -EOPNOTSUPP;
 
diff --git a/include/linux/net_tstamp.h b/include/linux/net_tstamp.h
index eb01c37e71e0..bb289c2ad376 100644
--- a/include/linux/net_tstamp.h
+++ b/include/linux/net_tstamp.h
@@ -5,11 +5,6 @@
 
 #include <uapi/linux/net_tstamp.h>
 
-enum hwtstamp_source {
-	HWTSTAMP_SOURCE_NETDEV,
-	HWTSTAMP_SOURCE_PHYLIB,
-};
-
 /**
  * struct kernel_hwtstamp_config - Kernel copy of struct hwtstamp_config
  *
@@ -20,8 +15,8 @@ enum hwtstamp_source {
  *	a legacy implementation of a lower driver
  * @copied_to_user: request was passed to a legacy implementation which already
  *	copied the ioctl request back to user space
- * @source: indication whether timestamps should come from the netdev or from
- *	an attached phylib PHY
+ * @source: indication whether timestamps should come from software, the netdev
+ *	or from an attached phylib PHY
  *
  * Prefer using this structure for in-kernel processing of hardware
  * timestamping configuration, over the inextensible struct hwtstamp_config
@@ -33,7 +28,7 @@ struct kernel_hwtstamp_config {
 	int rx_filter;
 	struct ifreq *ifr;
 	bool copied_to_user;
-	enum hwtstamp_source source;
+	enum timestamping_layer source;
 };
 
 static inline void hwtstamp_config_to_kernel(struct kernel_hwtstamp_config *kernel_cfg,
diff --git a/net/core/dev_ioctl.c b/net/core/dev_ioctl.c
index 9a66cf5015f2..267cd00269d0 100644
--- a/net/core/dev_ioctl.c
+++ b/net/core/dev_ioctl.c
@@ -332,7 +332,7 @@ int dev_set_hwtstamp_phylib(struct net_device *dev,
 	bool changed = false;
 	int err;
 
-	cfg->source = phy_ts ? HWTSTAMP_SOURCE_PHYLIB : HWTSTAMP_SOURCE_NETDEV;
+	cfg->source = phy_ts ? PHY_TIMESTAMPING : MAC_TIMESTAMPING;
 
 	if (phy_ts && (dev->priv_flags & IFF_SEE_ALL_HWTSTAMP_REQUESTS)) {
 		err = ops->ndo_hwtstamp_get(dev, &old_cfg);
-- 
cgit v1.2.3


From 0f7f463d4821a4f52fa5c0a961389e651d50c384 Mon Sep 17 00:00:00 2001
From: Kory Maincent <kory.maincent@bootlin.com>
Date: Tue, 14 Nov 2023 12:28:41 +0100
Subject: net: Change the API of PHY default timestamp to MAC

Change the API to select MAC default time stamping instead of the PHY.
Indeed the PHY is closer to the wire therefore theoretically it has less
delay than the MAC timestamping but the reality is different. Due to lower
time stamping clock frequency, latency in the MDIO bus and no PHC hardware
synchronization between different PHY, the PHY PTP is often less precise
than the MAC. The exception is for PHY designed specially for PTP case but
these devices are not very widespread. For not breaking the compatibility I
introduce a default_timestamp flag in phy_device that is set by the phy
driver to know we are using the old API behavior.

The phy_set_timestamp function is called at each call of phy_attach_direct.
In case of MAC driver using phylink this function is called when the
interface is turned up. Then if the interface goes down and up again the
last choice of timestamp will be overwritten by the default choice.
A solution could be to cache the timestamp status but it can bring other
issues. In case of SFP, if we change the module, it doesn't make sense to
blindly re-set the timestamp back to PHY, if the new module has a PHY with
mediocre timestamping capabilities.

Signed-off-by: Kory Maincent <kory.maincent@bootlin.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/phy/bcm-phy-ptp.c     |  3 +++
 drivers/net/phy/dp83640.c         |  3 +++
 drivers/net/phy/micrel.c          |  6 ++++++
 drivers/net/phy/mscc/mscc_ptp.c   |  2 ++
 drivers/net/phy/nxp-c45-tja11xx.c |  3 +++
 drivers/net/phy/phy_device.c      | 37 +++++++++++++++++++++++++++++++++++++
 include/linux/netdevice.h         |  5 +++++
 include/linux/phy.h               |  4 ++++
 net/core/dev.c                    |  3 +++
 net/core/dev_ioctl.c              | 36 ++++++++++++++++++++++--------------
 net/core/timestamping.c           | 10 ++++++++++
 net/ethtool/common.c              | 19 +++++++++++++++++--
 12 files changed, 115 insertions(+), 16 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/phy/bcm-phy-ptp.c b/drivers/net/phy/bcm-phy-ptp.c
index 617d384d4551..d3e825c951ee 100644
--- a/drivers/net/phy/bcm-phy-ptp.c
+++ b/drivers/net/phy/bcm-phy-ptp.c
@@ -931,6 +931,9 @@ struct bcm_ptp_private *bcm_ptp_probe(struct phy_device *phydev)
 		return ERR_CAST(clock);
 	priv->ptp_clock = clock;
 
+	/* Timestamp selected by default to keep legacy API */
+	phydev->default_timestamp = true;
+
 	priv->phydev = phydev;
 	bcm_ptp_init(priv);
 
diff --git a/drivers/net/phy/dp83640.c b/drivers/net/phy/dp83640.c
index 5c42c47dc564..64fd1a109c0f 100644
--- a/drivers/net/phy/dp83640.c
+++ b/drivers/net/phy/dp83640.c
@@ -1450,6 +1450,9 @@ static int dp83640_probe(struct phy_device *phydev)
 	phydev->mii_ts = &dp83640->mii_ts;
 	phydev->priv = dp83640;
 
+	/* Timestamp selected by default to keep legacy API */
+	phydev->default_timestamp = true;
+
 	spin_lock_init(&dp83640->rx_lock);
 	skb_queue_head_init(&dp83640->rx_queue);
 	skb_queue_head_init(&dp83640->tx_queue);
diff --git a/drivers/net/phy/micrel.c b/drivers/net/phy/micrel.c
index bd4cd082662f..2b8dd0131926 100644
--- a/drivers/net/phy/micrel.c
+++ b/drivers/net/phy/micrel.c
@@ -3158,6 +3158,9 @@ static void lan8814_ptp_init(struct phy_device *phydev)
 	ptp_priv->mii_ts.ts_info  = lan8814_ts_info;
 
 	phydev->mii_ts = &ptp_priv->mii_ts;
+
+	/* Timestamp selected by default to keep legacy API */
+	phydev->default_timestamp = true;
 }
 
 static int lan8814_ptp_probe_once(struct phy_device *phydev)
@@ -4586,6 +4589,9 @@ static int lan8841_probe(struct phy_device *phydev)
 
 	phydev->mii_ts = &ptp_priv->mii_ts;
 
+	/* Timestamp selected by default to keep legacy API */
+	phydev->default_timestamp = true;
+
 	return 0;
 }
 
diff --git a/drivers/net/phy/mscc/mscc_ptp.c b/drivers/net/phy/mscc/mscc_ptp.c
index eb0b032cb613..fd174eb06d4a 100644
--- a/drivers/net/phy/mscc/mscc_ptp.c
+++ b/drivers/net/phy/mscc/mscc_ptp.c
@@ -1570,6 +1570,8 @@ int vsc8584_ptp_probe(struct phy_device *phydev)
 		return PTR_ERR(vsc8531->load_save);
 	}
 
+	/* Timestamp selected by default to keep legacy API */
+	phydev->default_timestamp = true;
 	vsc8531->ptp->phydev = phydev;
 
 	return 0;
diff --git a/drivers/net/phy/nxp-c45-tja11xx.c b/drivers/net/phy/nxp-c45-tja11xx.c
index 780ad353cf55..0515c7b979db 100644
--- a/drivers/net/phy/nxp-c45-tja11xx.c
+++ b/drivers/net/phy/nxp-c45-tja11xx.c
@@ -1658,6 +1658,9 @@ static int nxp_c45_probe(struct phy_device *phydev)
 		priv->mii_ts.ts_info = nxp_c45_ts_info;
 		phydev->mii_ts = &priv->mii_ts;
 		ret = nxp_c45_init_ptp_clock(priv);
+
+		/* Timestamp selected by default to keep legacy API */
+		phydev->default_timestamp = true;
 	} else {
 		phydev_dbg(phydev, "PTP support not enabled even if the phy supports it");
 	}
diff --git a/drivers/net/phy/phy_device.c b/drivers/net/phy/phy_device.c
index 2ce74593d6e4..8c4794631daa 100644
--- a/drivers/net/phy/phy_device.c
+++ b/drivers/net/phy/phy_device.c
@@ -1411,6 +1411,26 @@ int phy_sfp_probe(struct phy_device *phydev,
 }
 EXPORT_SYMBOL(phy_sfp_probe);
 
+/**
+ * phy_set_timestamp - set the default selected timestamping device
+ * @dev: Pointer to net_device
+ * @phydev: Pointer to phy_device
+ *
+ * This is used to set default timestamping device taking into account
+ * the new API choice, which is selecting the timestamping from MAC by
+ * default if the phydev does not have default_timestamp flag enabled.
+ */
+static void phy_set_timestamp(struct net_device *dev, struct phy_device *phydev)
+{
+	const struct ethtool_ops *ops = dev->ethtool_ops;
+
+	if (!phy_has_tsinfo(phydev))
+		return;
+
+	if (!ops->get_ts_info || phydev->default_timestamp)
+		dev->ts_layer = PHY_TIMESTAMPING;
+}
+
 /**
  * phy_attach_direct - attach a network device to a given PHY device pointer
  * @dev: network device to attach
@@ -1484,6 +1504,7 @@ int phy_attach_direct(struct net_device *dev, struct phy_device *phydev,
 
 	phydev->phy_link_change = phy_link_change;
 	if (dev) {
+		phy_set_timestamp(dev, phydev);
 		phydev->attached_dev = dev;
 		dev->phydev = phydev;
 
@@ -1812,6 +1833,22 @@ void phy_detach(struct phy_device *phydev)
 
 	phy_suspend(phydev);
 	if (dev) {
+		const struct ethtool_ops *ops = dev->ethtool_ops;
+		struct ethtool_ts_info ts_info = {0};
+
+		if (ops->get_ts_info) {
+			ops->get_ts_info(dev, &ts_info);
+			if ((ts_info.so_timestamping &
+			    SOF_TIMESTAMPING_HARDWARE_MASK) ==
+			    SOF_TIMESTAMPING_HARDWARE_MASK)
+				dev->ts_layer = MAC_TIMESTAMPING;
+			else if ((ts_info.so_timestamping &
+				 SOF_TIMESTAMPING_SOFTWARE_MASK) ==
+				 SOF_TIMESTAMPING_SOFTWARE_MASK)
+				dev->ts_layer = SOFTWARE_TIMESTAMPING;
+		} else {
+			dev->ts_layer = NO_TIMESTAMPING;
+		}
 		phydev->attached_dev->phydev = NULL;
 		phydev->attached_dev = NULL;
 	}
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 2d840d7056f2..f020d2790c12 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -47,6 +47,7 @@
 #include <uapi/linux/if_bonding.h>
 #include <uapi/linux/pkt_cls.h>
 #include <uapi/linux/netdev.h>
+#include <uapi/linux/net_tstamp.h>
 #include <linux/hashtable.h>
 #include <linux/rbtree.h>
 #include <net/net_trackers.h>
@@ -2074,6 +2075,8 @@ enum netdev_ml_priv_type {
  *
  *	@dpll_pin: Pointer to the SyncE source pin of a DPLL subsystem,
  *		   where the clock is recovered.
+ *	@ts_layer:	Tracks which network device
+ *			performs packet	time stamping.
  *
  *	FIXME: cleanup struct net_device such that network protocol info
  *	moves out.
@@ -2435,6 +2438,8 @@ struct net_device {
 #if IS_ENABLED(CONFIG_DPLL)
 	struct dpll_pin		*dpll_pin;
 #endif
+
+	enum timestamping_layer	ts_layer;
 };
 #define to_net_dev(d) container_of(d, struct net_device, dev)
 
diff --git a/include/linux/phy.h b/include/linux/phy.h
index e5f1f41e399c..317def2a7843 100644
--- a/include/linux/phy.h
+++ b/include/linux/phy.h
@@ -604,6 +604,8 @@ struct macsec_ops;
  *                 handling shall be postponed until PHY has resumed
  * @irq_rerun: Flag indicating interrupts occurred while PHY was suspended,
  *             requiring a rerun of the interrupt handler after resume
+ * @default_timestamp: Flag indicating whether we are using the phy
+ *		       timestamp as the default one
  * @interface: enum phy_interface_t value
  * @skb: Netlink message for cable diagnostics
  * @nest: Netlink nest used for cable diagnostics
@@ -667,6 +669,8 @@ struct phy_device {
 	unsigned irq_suspended:1;
 	unsigned irq_rerun:1;
 
+	unsigned default_timestamp:1;
+
 	int rate_matching;
 
 	enum phy_state state;
diff --git a/net/core/dev.c b/net/core/dev.c
index af53f6d838ce..05ce00632892 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -10212,6 +10212,9 @@ int register_netdevice(struct net_device *dev)
 	    dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
 		rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U, GFP_KERNEL, 0, NULL);
 
+	if (dev->ethtool_ops->get_ts_info)
+		dev->ts_layer = MAC_TIMESTAMPING;
+
 out:
 	return ret;
 
diff --git a/net/core/dev_ioctl.c b/net/core/dev_ioctl.c
index 267cd00269d0..bc8be9749376 100644
--- a/net/core/dev_ioctl.c
+++ b/net/core/dev_ioctl.c
@@ -259,9 +259,7 @@ static int dev_eth_ioctl(struct net_device *dev,
  * @dev: Network device
  * @cfg: Timestamping configuration structure
  *
- * Helper for enforcing a common policy that phylib timestamping, if available,
- * should take precedence in front of hardware timestamping provided by the
- * netdev.
+ * Helper for calling the selected hardware provider timestamping.
  *
  * Note: phy_mii_ioctl() only handles SIOCSHWTSTAMP (not SIOCGHWTSTAMP), and
  * there only exists a phydev->mii_ts->hwtstamp() method. So this will return
@@ -271,10 +269,14 @@ static int dev_eth_ioctl(struct net_device *dev,
 static int dev_get_hwtstamp_phylib(struct net_device *dev,
 				   struct kernel_hwtstamp_config *cfg)
 {
-	if (phy_has_hwtstamp(dev->phydev))
+	enum timestamping_layer ts_layer = dev->ts_layer;
+
+	if (ts_layer == PHY_TIMESTAMPING)
 		return phy_hwtstamp_get(dev->phydev, cfg);
+	else if (ts_layer == MAC_TIMESTAMPING)
+		return dev->netdev_ops->ndo_hwtstamp_get(dev, cfg);
 
-	return dev->netdev_ops->ndo_hwtstamp_get(dev, cfg);
+	return -EOPNOTSUPP;
 }
 
 static int dev_get_hwtstamp(struct net_device *dev, struct ifreq *ifr)
@@ -315,9 +317,8 @@ static int dev_get_hwtstamp(struct net_device *dev, struct ifreq *ifr)
  * @cfg: Timestamping configuration structure
  * @extack: Netlink extended ack message structure, for error reporting
  *
- * Helper for enforcing a common policy that phylib timestamping, if available,
- * should take precedence in front of hardware timestamping provided by the
- * netdev. If the netdev driver needs to perform specific actions even for PHY
+ * Helper for calling the selected hardware provider timestamping.
+ * If the netdev driver needs to perform specific actions even for PHY
  * timestamping to work properly (a switch port must trap the timestamped
  * frames and not forward them), it must set IFF_SEE_ALL_HWTSTAMP_REQUESTS in
  * dev->priv_flags.
@@ -327,20 +328,26 @@ int dev_set_hwtstamp_phylib(struct net_device *dev,
 			    struct netlink_ext_ack *extack)
 {
 	const struct net_device_ops *ops = dev->netdev_ops;
-	bool phy_ts = phy_has_hwtstamp(dev->phydev);
+	enum timestamping_layer ts_layer = dev->ts_layer;
 	struct kernel_hwtstamp_config old_cfg = {};
 	bool changed = false;
 	int err;
 
-	cfg->source = phy_ts ? PHY_TIMESTAMPING : MAC_TIMESTAMPING;
+	cfg->source = ts_layer;
+
+	if (ts_layer != PHY_TIMESTAMPING &&
+	    ts_layer != MAC_TIMESTAMPING)
+		return -EOPNOTSUPP;
 
-	if (phy_ts && (dev->priv_flags & IFF_SEE_ALL_HWTSTAMP_REQUESTS)) {
+	if (ts_layer == PHY_TIMESTAMPING &&
+	    dev->priv_flags & IFF_SEE_ALL_HWTSTAMP_REQUESTS) {
 		err = ops->ndo_hwtstamp_get(dev, &old_cfg);
 		if (err)
 			return err;
 	}
 
-	if (!phy_ts || (dev->priv_flags & IFF_SEE_ALL_HWTSTAMP_REQUESTS)) {
+	if (ts_layer == MAC_TIMESTAMPING ||
+	    dev->priv_flags & IFF_SEE_ALL_HWTSTAMP_REQUESTS) {
 		err = ops->ndo_hwtstamp_set(dev, cfg, extack);
 		if (err) {
 			if (extack->_msg)
@@ -349,10 +356,11 @@ int dev_set_hwtstamp_phylib(struct net_device *dev,
 		}
 	}
 
-	if (phy_ts && (dev->priv_flags & IFF_SEE_ALL_HWTSTAMP_REQUESTS))
+	if (ts_layer == PHY_TIMESTAMPING &&
+	    dev->priv_flags & IFF_SEE_ALL_HWTSTAMP_REQUESTS)
 		changed = kernel_hwtstamp_config_changed(&old_cfg, cfg);
 
-	if (phy_ts) {
+	if (ts_layer == PHY_TIMESTAMPING) {
 		err = phy_hwtstamp_set(dev->phydev, cfg, extack);
 		if (err) {
 			if (changed)
diff --git a/net/core/timestamping.c b/net/core/timestamping.c
index 04840697fe79..5cf51a523fb3 100644
--- a/net/core/timestamping.c
+++ b/net/core/timestamping.c
@@ -21,6 +21,7 @@ static unsigned int classify(const struct sk_buff *skb)
 
 void skb_clone_tx_timestamp(struct sk_buff *skb)
 {
+	enum timestamping_layer ts_layer;
 	struct mii_timestamper *mii_ts;
 	struct sk_buff *clone;
 	unsigned int type;
@@ -28,6 +29,10 @@ void skb_clone_tx_timestamp(struct sk_buff *skb)
 	if (!skb->sk)
 		return;
 
+	ts_layer = skb->dev->ts_layer;
+	if (ts_layer != PHY_TIMESTAMPING)
+		return;
+
 	type = classify(skb);
 	if (type == PTP_CLASS_NONE)
 		return;
@@ -44,12 +49,17 @@ EXPORT_SYMBOL_GPL(skb_clone_tx_timestamp);
 
 bool skb_defer_rx_timestamp(struct sk_buff *skb)
 {
+	enum timestamping_layer ts_layer;
 	struct mii_timestamper *mii_ts;
 	unsigned int type;
 
 	if (!skb->dev || !skb->dev->phydev || !skb->dev->phydev->mii_ts)
 		return false;
 
+	ts_layer = skb->dev->ts_layer;
+	if (ts_layer != PHY_TIMESTAMPING)
+		return false;
+
 	if (skb_headroom(skb) < ETH_HLEN)
 		return false;
 
diff --git a/net/ethtool/common.c b/net/ethtool/common.c
index 11d8797f63f6..9f6e3b2c74e2 100644
--- a/net/ethtool/common.c
+++ b/net/ethtool/common.c
@@ -633,13 +633,28 @@ int __ethtool_get_ts_info(struct net_device *dev, struct ethtool_ts_info *info)
 {
 	const struct ethtool_ops *ops = dev->ethtool_ops;
 	struct phy_device *phydev = dev->phydev;
+	enum timestamping_layer ts_layer;
+	int ret;
 
 	memset(info, 0, sizeof(*info));
 	info->cmd = ETHTOOL_GET_TS_INFO;
 
-	if (phy_has_tsinfo(phydev))
+	ts_layer = dev->ts_layer;
+	if (ts_layer == SOFTWARE_TIMESTAMPING) {
+		ret = ops->get_ts_info(dev, info);
+		if (ret)
+			return ret;
+		info->so_timestamping &= ~SOF_TIMESTAMPING_HARDWARE_MASK;
+		info->phc_index = -1;
+		info->rx_filters = 0;
+		info->tx_types = 0;
+		return 0;
+	}
+
+	if (ts_layer == PHY_TIMESTAMPING)
 		return phy_ts_info(phydev, info);
-	if (ops->get_ts_info)
+
+	if (ts_layer == MAC_TIMESTAMPING)
 		return ops->get_ts_info(dev, info);
 
 	info->so_timestamping = SOF_TIMESTAMPING_RX_SOFTWARE |
-- 
cgit v1.2.3


From db840d389bad60ce6f3aadc1079da13e7e993a16 Mon Sep 17 00:00:00 2001
From: Andrii Nakryiko <andrii@kernel.org>
Date: Fri, 17 Nov 2023 19:46:16 -0800
Subject: bpf: move verbose_linfo() into kernel/bpf/log.c

verifier.c is huge. Let's try to move out parts that are logging-related
into log.c, as we previously did with bpf_log() and other related stuff.
This patch moves line info verbose output routines: it's pretty
self-contained and isolated code, so there is no problem with this.

Acked-by: Eduard Zingerman <eddyz87@gmail.com>
Acked-by: Stanislav Fomichev <sdf@google.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/r/20231118034623.3320920-2-andrii@kernel.org
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/bpf_verifier.h |  4 +++
 kernel/bpf/log.c             | 59 ++++++++++++++++++++++++++++++++++++++++++++
 kernel/bpf/verifier.c        | 57 ------------------------------------------
 3 files changed, 63 insertions(+), 57 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h
index 52a4012b8255..d896f3db6a22 100644
--- a/include/linux/bpf_verifier.h
+++ b/include/linux/bpf_verifier.h
@@ -680,6 +680,10 @@ int bpf_vlog_init(struct bpf_verifier_log *log, u32 log_level,
 void bpf_vlog_reset(struct bpf_verifier_log *log, u64 new_pos);
 int bpf_vlog_finalize(struct bpf_verifier_log *log, u32 *log_size_actual);
 
+__printf(3, 4) void verbose_linfo(struct bpf_verifier_env *env,
+				  u32 insn_off,
+				  const char *prefix_fmt, ...);
+
 static inline struct bpf_func_state *cur_func(struct bpf_verifier_env *env)
 {
 	struct bpf_verifier_state *cur = env->cur_state;
diff --git a/kernel/bpf/log.c b/kernel/bpf/log.c
index 850494423530..f20e1449c882 100644
--- a/kernel/bpf/log.c
+++ b/kernel/bpf/log.c
@@ -10,6 +10,8 @@
 #include <linux/bpf_verifier.h>
 #include <linux/math64.h>
 
+#define verbose(env, fmt, args...) bpf_verifier_log_write(env, fmt, ##args)
+
 static bool bpf_verifier_log_attr_valid(const struct bpf_verifier_log *log)
 {
 	/* ubuf and len_total should both be specified (or not) together */
@@ -325,3 +327,60 @@ __printf(2, 3) void bpf_log(struct bpf_verifier_log *log,
 	va_end(args);
 }
 EXPORT_SYMBOL_GPL(bpf_log);
+
+static const struct bpf_line_info *
+find_linfo(const struct bpf_verifier_env *env, u32 insn_off)
+{
+	const struct bpf_line_info *linfo;
+	const struct bpf_prog *prog;
+	u32 i, nr_linfo;
+
+	prog = env->prog;
+	nr_linfo = prog->aux->nr_linfo;
+
+	if (!nr_linfo || insn_off >= prog->len)
+		return NULL;
+
+	linfo = prog->aux->linfo;
+	for (i = 1; i < nr_linfo; i++)
+		if (insn_off < linfo[i].insn_off)
+			break;
+
+	return &linfo[i - 1];
+}
+
+static const char *ltrim(const char *s)
+{
+	while (isspace(*s))
+		s++;
+
+	return s;
+}
+
+__printf(3, 4) void verbose_linfo(struct bpf_verifier_env *env,
+				  u32 insn_off,
+				  const char *prefix_fmt, ...)
+{
+	const struct bpf_line_info *linfo;
+
+	if (!bpf_verifier_log_needed(&env->log))
+		return;
+
+	linfo = find_linfo(env, insn_off);
+	if (!linfo || linfo == env->prev_linfo)
+		return;
+
+	if (prefix_fmt) {
+		va_list args;
+
+		va_start(args, prefix_fmt);
+		bpf_verifier_vlog(&env->log, prefix_fmt, args);
+		va_end(args);
+	}
+
+	verbose(env, "%s\n",
+		ltrim(btf_name_by_offset(env->prog->aux->btf,
+					 linfo->line_off)));
+
+	env->prev_linfo = linfo;
+}
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 7c3461b89513..683fdda25c13 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -337,27 +337,6 @@ struct btf *btf_vmlinux;
 
 static DEFINE_MUTEX(bpf_verifier_lock);
 
-static const struct bpf_line_info *
-find_linfo(const struct bpf_verifier_env *env, u32 insn_off)
-{
-	const struct bpf_line_info *linfo;
-	const struct bpf_prog *prog;
-	u32 i, nr_linfo;
-
-	prog = env->prog;
-	nr_linfo = prog->aux->nr_linfo;
-
-	if (!nr_linfo || insn_off >= prog->len)
-		return NULL;
-
-	linfo = prog->aux->linfo;
-	for (i = 1; i < nr_linfo; i++)
-		if (insn_off < linfo[i].insn_off)
-			break;
-
-	return &linfo[i - 1];
-}
-
 __printf(2, 3) static void verbose(void *private_data, const char *fmt, ...)
 {
 	struct bpf_verifier_env *env = private_data;
@@ -371,42 +350,6 @@ __printf(2, 3) static void verbose(void *private_data, const char *fmt, ...)
 	va_end(args);
 }
 
-static const char *ltrim(const char *s)
-{
-	while (isspace(*s))
-		s++;
-
-	return s;
-}
-
-__printf(3, 4) static void verbose_linfo(struct bpf_verifier_env *env,
-					 u32 insn_off,
-					 const char *prefix_fmt, ...)
-{
-	const struct bpf_line_info *linfo;
-
-	if (!bpf_verifier_log_needed(&env->log))
-		return;
-
-	linfo = find_linfo(env, insn_off);
-	if (!linfo || linfo == env->prev_linfo)
-		return;
-
-	if (prefix_fmt) {
-		va_list args;
-
-		va_start(args, prefix_fmt);
-		bpf_verifier_vlog(&env->log, prefix_fmt, args);
-		va_end(args);
-	}
-
-	verbose(env, "%s\n",
-		ltrim(btf_name_by_offset(env->prog->aux->btf,
-					 linfo->line_off)));
-
-	env->prev_linfo = linfo;
-}
-
 static void verbose_invalid_scalar(struct bpf_verifier_env *env,
 				   struct bpf_reg_state *reg,
 				   struct tnum *range, const char *ctx,
-- 
cgit v1.2.3


From 42feb6620accded89cad5f455665e21281813d79 Mon Sep 17 00:00:00 2001
From: Andrii Nakryiko <andrii@kernel.org>
Date: Fri, 17 Nov 2023 19:46:17 -0800
Subject: bpf: move verifier state printing code to kernel/bpf/log.c

Move a good chunk of code from verifier.c to log.c: verifier state
verbose printing logic. This is an important and very much
logging/debugging oriented code. It fits the overlall log.c's focus on
verifier logging, and moving it allows to keep growing it without
unnecessarily adding to verifier.c code that otherwise contains a core
verification logic.

There are not many shared dependencies between this code and the rest of
verifier.c code, except a few single-line helpers for various register
type checks and a bit of state "scratching" helpers. We move all such
trivial helpers into include/bpf/bpf_verifier.h as static inlines.

No functional changes in this patch.

Acked-by: Eduard Zingerman <eddyz87@gmail.com>
Acked-by: Stanislav Fomichev <sdf@google.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/r/20231118034623.3320920-3-andrii@kernel.org
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/bpf_verifier.h |  72 ++++++++
 kernel/bpf/log.c             | 342 ++++++++++++++++++++++++++++++++++++
 kernel/bpf/verifier.c        | 403 -------------------------------------------
 3 files changed, 414 insertions(+), 403 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h
index d896f3db6a22..39edc76f436e 100644
--- a/include/linux/bpf_verifier.h
+++ b/include/linux/bpf_verifier.h
@@ -783,4 +783,76 @@ static inline bool bpf_type_has_unsafe_modifiers(u32 type)
 	return type_flag(type) & ~BPF_REG_TRUSTED_MODIFIERS;
 }
 
+static inline bool type_is_ptr_alloc_obj(u32 type)
+{
+	return base_type(type) == PTR_TO_BTF_ID && type_flag(type) & MEM_ALLOC;
+}
+
+static inline bool type_is_non_owning_ref(u32 type)
+{
+	return type_is_ptr_alloc_obj(type) && type_flag(type) & NON_OWN_REF;
+}
+
+static inline bool type_is_pkt_pointer(enum bpf_reg_type type)
+{
+	type = base_type(type);
+	return type == PTR_TO_PACKET ||
+	       type == PTR_TO_PACKET_META;
+}
+
+static inline bool type_is_sk_pointer(enum bpf_reg_type type)
+{
+	return type == PTR_TO_SOCKET ||
+		type == PTR_TO_SOCK_COMMON ||
+		type == PTR_TO_TCP_SOCK ||
+		type == PTR_TO_XDP_SOCK;
+}
+
+static inline void mark_reg_scratched(struct bpf_verifier_env *env, u32 regno)
+{
+	env->scratched_regs |= 1U << regno;
+}
+
+static inline void mark_stack_slot_scratched(struct bpf_verifier_env *env, u32 spi)
+{
+	env->scratched_stack_slots |= 1ULL << spi;
+}
+
+static inline bool reg_scratched(const struct bpf_verifier_env *env, u32 regno)
+{
+	return (env->scratched_regs >> regno) & 1;
+}
+
+static inline bool stack_slot_scratched(const struct bpf_verifier_env *env, u64 regno)
+{
+	return (env->scratched_stack_slots >> regno) & 1;
+}
+
+static inline bool verifier_state_scratched(const struct bpf_verifier_env *env)
+{
+	return env->scratched_regs || env->scratched_stack_slots;
+}
+
+static inline void mark_verifier_state_clean(struct bpf_verifier_env *env)
+{
+	env->scratched_regs = 0U;
+	env->scratched_stack_slots = 0ULL;
+}
+
+/* Used for printing the entire verifier state. */
+static inline void mark_verifier_state_scratched(struct bpf_verifier_env *env)
+{
+	env->scratched_regs = ~0U;
+	env->scratched_stack_slots = ~0ULL;
+}
+
+const char *reg_type_str(struct bpf_verifier_env *env, enum bpf_reg_type type);
+const char *dynptr_type_str(enum bpf_dynptr_type type);
+const char *iter_type_str(const struct btf *btf, u32 btf_id);
+const char *iter_state_str(enum bpf_iter_state state);
+
+void print_verifier_state(struct bpf_verifier_env *env,
+			  const struct bpf_func_state *state, bool print_all);
+void print_insn_state(struct bpf_verifier_env *env, const struct bpf_func_state *state);
+
 #endif /* _LINUX_BPF_VERIFIER_H */
diff --git a/kernel/bpf/log.c b/kernel/bpf/log.c
index f20e1449c882..c1b257eac21b 100644
--- a/kernel/bpf/log.c
+++ b/kernel/bpf/log.c
@@ -384,3 +384,345 @@ __printf(3, 4) void verbose_linfo(struct bpf_verifier_env *env,
 
 	env->prev_linfo = linfo;
 }
+
+static const char *btf_type_name(const struct btf *btf, u32 id)
+{
+	return btf_name_by_offset(btf, btf_type_by_id(btf, id)->name_off);
+}
+
+/* string representation of 'enum bpf_reg_type'
+ *
+ * Note that reg_type_str() can not appear more than once in a single verbose()
+ * statement.
+ */
+const char *reg_type_str(struct bpf_verifier_env *env, enum bpf_reg_type type)
+{
+	char postfix[16] = {0}, prefix[64] = {0};
+	static const char * const str[] = {
+		[NOT_INIT]		= "?",
+		[SCALAR_VALUE]		= "scalar",
+		[PTR_TO_CTX]		= "ctx",
+		[CONST_PTR_TO_MAP]	= "map_ptr",
+		[PTR_TO_MAP_VALUE]	= "map_value",
+		[PTR_TO_STACK]		= "fp",
+		[PTR_TO_PACKET]		= "pkt",
+		[PTR_TO_PACKET_META]	= "pkt_meta",
+		[PTR_TO_PACKET_END]	= "pkt_end",
+		[PTR_TO_FLOW_KEYS]	= "flow_keys",
+		[PTR_TO_SOCKET]		= "sock",
+		[PTR_TO_SOCK_COMMON]	= "sock_common",
+		[PTR_TO_TCP_SOCK]	= "tcp_sock",
+		[PTR_TO_TP_BUFFER]	= "tp_buffer",
+		[PTR_TO_XDP_SOCK]	= "xdp_sock",
+		[PTR_TO_BTF_ID]		= "ptr_",
+		[PTR_TO_MEM]		= "mem",
+		[PTR_TO_BUF]		= "buf",
+		[PTR_TO_FUNC]		= "func",
+		[PTR_TO_MAP_KEY]	= "map_key",
+		[CONST_PTR_TO_DYNPTR]	= "dynptr_ptr",
+	};
+
+	if (type & PTR_MAYBE_NULL) {
+		if (base_type(type) == PTR_TO_BTF_ID)
+			strncpy(postfix, "or_null_", 16);
+		else
+			strncpy(postfix, "_or_null", 16);
+	}
+
+	snprintf(prefix, sizeof(prefix), "%s%s%s%s%s%s%s",
+		 type & MEM_RDONLY ? "rdonly_" : "",
+		 type & MEM_RINGBUF ? "ringbuf_" : "",
+		 type & MEM_USER ? "user_" : "",
+		 type & MEM_PERCPU ? "percpu_" : "",
+		 type & MEM_RCU ? "rcu_" : "",
+		 type & PTR_UNTRUSTED ? "untrusted_" : "",
+		 type & PTR_TRUSTED ? "trusted_" : ""
+	);
+
+	snprintf(env->tmp_str_buf, TMP_STR_BUF_LEN, "%s%s%s",
+		 prefix, str[base_type(type)], postfix);
+	return env->tmp_str_buf;
+}
+
+const char *dynptr_type_str(enum bpf_dynptr_type type)
+{
+	switch (type) {
+	case BPF_DYNPTR_TYPE_LOCAL:
+		return "local";
+	case BPF_DYNPTR_TYPE_RINGBUF:
+		return "ringbuf";
+	case BPF_DYNPTR_TYPE_SKB:
+		return "skb";
+	case BPF_DYNPTR_TYPE_XDP:
+		return "xdp";
+	case BPF_DYNPTR_TYPE_INVALID:
+		return "<invalid>";
+	default:
+		WARN_ONCE(1, "unknown dynptr type %d\n", type);
+		return "<unknown>";
+	}
+}
+
+const char *iter_type_str(const struct btf *btf, u32 btf_id)
+{
+	if (!btf || btf_id == 0)
+		return "<invalid>";
+
+	/* we already validated that type is valid and has conforming name */
+	return btf_type_name(btf, btf_id) + sizeof(ITER_PREFIX) - 1;
+}
+
+const char *iter_state_str(enum bpf_iter_state state)
+{
+	switch (state) {
+	case BPF_ITER_STATE_ACTIVE:
+		return "active";
+	case BPF_ITER_STATE_DRAINED:
+		return "drained";
+	case BPF_ITER_STATE_INVALID:
+		return "<invalid>";
+	default:
+		WARN_ONCE(1, "unknown iter state %d\n", state);
+		return "<unknown>";
+	}
+}
+
+static char slot_type_char[] = {
+	[STACK_INVALID]	= '?',
+	[STACK_SPILL]	= 'r',
+	[STACK_MISC]	= 'm',
+	[STACK_ZERO]	= '0',
+	[STACK_DYNPTR]	= 'd',
+	[STACK_ITER]	= 'i',
+};
+
+static void print_liveness(struct bpf_verifier_env *env,
+			   enum bpf_reg_liveness live)
+{
+	if (live & (REG_LIVE_READ | REG_LIVE_WRITTEN | REG_LIVE_DONE))
+	    verbose(env, "_");
+	if (live & REG_LIVE_READ)
+		verbose(env, "r");
+	if (live & REG_LIVE_WRITTEN)
+		verbose(env, "w");
+	if (live & REG_LIVE_DONE)
+		verbose(env, "D");
+}
+
+static void print_scalar_ranges(struct bpf_verifier_env *env,
+				const struct bpf_reg_state *reg,
+				const char **sep)
+{
+	struct {
+		const char *name;
+		u64 val;
+		bool omit;
+	} minmaxs[] = {
+		{"smin",   reg->smin_value,         reg->smin_value == S64_MIN},
+		{"smax",   reg->smax_value,         reg->smax_value == S64_MAX},
+		{"umin",   reg->umin_value,         reg->umin_value == 0},
+		{"umax",   reg->umax_value,         reg->umax_value == U64_MAX},
+		{"smin32", (s64)reg->s32_min_value, reg->s32_min_value == S32_MIN},
+		{"smax32", (s64)reg->s32_max_value, reg->s32_max_value == S32_MAX},
+		{"umin32", reg->u32_min_value,      reg->u32_min_value == 0},
+		{"umax32", reg->u32_max_value,      reg->u32_max_value == U32_MAX},
+	}, *m1, *m2, *mend = &minmaxs[ARRAY_SIZE(minmaxs)];
+	bool neg1, neg2;
+
+	for (m1 = &minmaxs[0]; m1 < mend; m1++) {
+		if (m1->omit)
+			continue;
+
+		neg1 = m1->name[0] == 's' && (s64)m1->val < 0;
+
+		verbose(env, "%s%s=", *sep, m1->name);
+		*sep = ",";
+
+		for (m2 = m1 + 2; m2 < mend; m2 += 2) {
+			if (m2->omit || m2->val != m1->val)
+				continue;
+			/* don't mix negatives with positives */
+			neg2 = m2->name[0] == 's' && (s64)m2->val < 0;
+			if (neg2 != neg1)
+				continue;
+			m2->omit = true;
+			verbose(env, "%s=", m2->name);
+		}
+
+		verbose(env, m1->name[0] == 's' ? "%lld" : "%llu", m1->val);
+	}
+}
+
+void print_verifier_state(struct bpf_verifier_env *env, const struct bpf_func_state *state,
+			  bool print_all)
+{
+	const struct bpf_reg_state *reg;
+	enum bpf_reg_type t;
+	int i;
+
+	if (state->frameno)
+		verbose(env, " frame%d:", state->frameno);
+	for (i = 0; i < MAX_BPF_REG; i++) {
+		reg = &state->regs[i];
+		t = reg->type;
+		if (t == NOT_INIT)
+			continue;
+		if (!print_all && !reg_scratched(env, i))
+			continue;
+		verbose(env, " R%d", i);
+		print_liveness(env, reg->live);
+		verbose(env, "=");
+		if (t == SCALAR_VALUE && reg->precise)
+			verbose(env, "P");
+		if ((t == SCALAR_VALUE || t == PTR_TO_STACK) &&
+		    tnum_is_const(reg->var_off)) {
+			/* reg->off should be 0 for SCALAR_VALUE */
+			verbose(env, "%s", t == SCALAR_VALUE ? "" : reg_type_str(env, t));
+			verbose(env, "%lld", reg->var_off.value + reg->off);
+		} else {
+			const char *sep = "";
+
+			verbose(env, "%s", reg_type_str(env, t));
+			if (base_type(t) == PTR_TO_BTF_ID)
+				verbose(env, "%s", btf_type_name(reg->btf, reg->btf_id));
+			verbose(env, "(");
+/*
+ * _a stands for append, was shortened to avoid multiline statements below.
+ * This macro is used to output a comma separated list of attributes.
+ */
+#define verbose_a(fmt, ...) ({ verbose(env, "%s" fmt, sep, __VA_ARGS__); sep = ","; })
+
+			if (reg->id)
+				verbose_a("id=%d", reg->id);
+			if (reg->ref_obj_id)
+				verbose_a("ref_obj_id=%d", reg->ref_obj_id);
+			if (type_is_non_owning_ref(reg->type))
+				verbose_a("%s", "non_own_ref");
+			if (t != SCALAR_VALUE)
+				verbose_a("off=%d", reg->off);
+			if (type_is_pkt_pointer(t))
+				verbose_a("r=%d", reg->range);
+			else if (base_type(t) == CONST_PTR_TO_MAP ||
+				 base_type(t) == PTR_TO_MAP_KEY ||
+				 base_type(t) == PTR_TO_MAP_VALUE)
+				verbose_a("ks=%d,vs=%d",
+					  reg->map_ptr->key_size,
+					  reg->map_ptr->value_size);
+			if (tnum_is_const(reg->var_off)) {
+				/* Typically an immediate SCALAR_VALUE, but
+				 * could be a pointer whose offset is too big
+				 * for reg->off
+				 */
+				verbose_a("imm=%llx", reg->var_off.value);
+			} else {
+				print_scalar_ranges(env, reg, &sep);
+				if (!tnum_is_unknown(reg->var_off)) {
+					char tn_buf[48];
+
+					tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off);
+					verbose_a("var_off=%s", tn_buf);
+				}
+			}
+#undef verbose_a
+
+			verbose(env, ")");
+		}
+	}
+	for (i = 0; i < state->allocated_stack / BPF_REG_SIZE; i++) {
+		char types_buf[BPF_REG_SIZE + 1];
+		bool valid = false;
+		int j;
+
+		for (j = 0; j < BPF_REG_SIZE; j++) {
+			if (state->stack[i].slot_type[j] != STACK_INVALID)
+				valid = true;
+			types_buf[j] = slot_type_char[state->stack[i].slot_type[j]];
+		}
+		types_buf[BPF_REG_SIZE] = 0;
+		if (!valid)
+			continue;
+		if (!print_all && !stack_slot_scratched(env, i))
+			continue;
+		switch (state->stack[i].slot_type[BPF_REG_SIZE - 1]) {
+		case STACK_SPILL:
+			reg = &state->stack[i].spilled_ptr;
+			t = reg->type;
+
+			verbose(env, " fp%d", (-i - 1) * BPF_REG_SIZE);
+			print_liveness(env, reg->live);
+			verbose(env, "=%s", t == SCALAR_VALUE ? "" : reg_type_str(env, t));
+			if (t == SCALAR_VALUE && reg->precise)
+				verbose(env, "P");
+			if (t == SCALAR_VALUE && tnum_is_const(reg->var_off))
+				verbose(env, "%lld", reg->var_off.value + reg->off);
+			break;
+		case STACK_DYNPTR:
+			i += BPF_DYNPTR_NR_SLOTS - 1;
+			reg = &state->stack[i].spilled_ptr;
+
+			verbose(env, " fp%d", (-i - 1) * BPF_REG_SIZE);
+			print_liveness(env, reg->live);
+			verbose(env, "=dynptr_%s", dynptr_type_str(reg->dynptr.type));
+			if (reg->ref_obj_id)
+				verbose(env, "(ref_id=%d)", reg->ref_obj_id);
+			break;
+		case STACK_ITER:
+			/* only main slot has ref_obj_id set; skip others */
+			reg = &state->stack[i].spilled_ptr;
+			if (!reg->ref_obj_id)
+				continue;
+
+			verbose(env, " fp%d", (-i - 1) * BPF_REG_SIZE);
+			print_liveness(env, reg->live);
+			verbose(env, "=iter_%s(ref_id=%d,state=%s,depth=%u)",
+				iter_type_str(reg->iter.btf, reg->iter.btf_id),
+				reg->ref_obj_id, iter_state_str(reg->iter.state),
+				reg->iter.depth);
+			break;
+		case STACK_MISC:
+		case STACK_ZERO:
+		default:
+			reg = &state->stack[i].spilled_ptr;
+
+			for (j = 0; j < BPF_REG_SIZE; j++)
+				types_buf[j] = slot_type_char[state->stack[i].slot_type[j]];
+			types_buf[BPF_REG_SIZE] = 0;
+
+			verbose(env, " fp%d", (-i - 1) * BPF_REG_SIZE);
+			print_liveness(env, reg->live);
+			verbose(env, "=%s", types_buf);
+			break;
+		}
+	}
+	if (state->acquired_refs && state->refs[0].id) {
+		verbose(env, " refs=%d", state->refs[0].id);
+		for (i = 1; i < state->acquired_refs; i++)
+			if (state->refs[i].id)
+				verbose(env, ",%d", state->refs[i].id);
+	}
+	if (state->in_callback_fn)
+		verbose(env, " cb");
+	if (state->in_async_callback_fn)
+		verbose(env, " async_cb");
+	verbose(env, "\n");
+	if (!print_all)
+		mark_verifier_state_clean(env);
+}
+
+static inline u32 vlog_alignment(u32 pos)
+{
+	return round_up(max(pos + BPF_LOG_MIN_ALIGNMENT / 2, BPF_LOG_ALIGNMENT),
+			BPF_LOG_MIN_ALIGNMENT) - pos - 1;
+}
+
+void print_insn_state(struct bpf_verifier_env *env, const struct bpf_func_state *state)
+{
+	if (env->prev_log_pos && env->prev_log_pos == env->log.end_pos) {
+		/* remove new line character */
+		bpf_vlog_reset(&env->log, env->prev_log_pos - 1);
+		verbose(env, "%*c;", vlog_alignment(env->prev_insn_print_pos), ' ');
+	} else {
+		verbose(env, "%d:", env->insn_idx);
+	}
+	print_verifier_state(env, state, false);
+}
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 683fdda25c13..8c2d31aa3d31 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -368,21 +368,6 @@ static void verbose_invalid_scalar(struct bpf_verifier_env *env,
 	verbose(env, " should have been in %s\n", tn_buf);
 }
 
-static bool type_is_pkt_pointer(enum bpf_reg_type type)
-{
-	type = base_type(type);
-	return type == PTR_TO_PACKET ||
-	       type == PTR_TO_PACKET_META;
-}
-
-static bool type_is_sk_pointer(enum bpf_reg_type type)
-{
-	return type == PTR_TO_SOCKET ||
-		type == PTR_TO_SOCK_COMMON ||
-		type == PTR_TO_TCP_SOCK ||
-		type == PTR_TO_XDP_SOCK;
-}
-
 static bool type_may_be_null(u32 type)
 {
 	return type & PTR_MAYBE_NULL;
@@ -406,16 +391,6 @@ static bool reg_not_null(const struct bpf_reg_state *reg)
 		type == PTR_TO_MEM;
 }
 
-static bool type_is_ptr_alloc_obj(u32 type)
-{
-	return base_type(type) == PTR_TO_BTF_ID && type_flag(type) & MEM_ALLOC;
-}
-
-static bool type_is_non_owning_ref(u32 type)
-{
-	return type_is_ptr_alloc_obj(type) && type_flag(type) & NON_OWN_REF;
-}
-
 static struct btf_record *reg_btf_record(const struct bpf_reg_state *reg)
 {
 	struct btf_record *rec = NULL;
@@ -532,83 +507,6 @@ static bool is_cmpxchg_insn(const struct bpf_insn *insn)
 	       insn->imm == BPF_CMPXCHG;
 }
 
-/* string representation of 'enum bpf_reg_type'
- *
- * Note that reg_type_str() can not appear more than once in a single verbose()
- * statement.
- */
-static const char *reg_type_str(struct bpf_verifier_env *env,
-				enum bpf_reg_type type)
-{
-	char postfix[16] = {0}, prefix[64] = {0};
-	static const char * const str[] = {
-		[NOT_INIT]		= "?",
-		[SCALAR_VALUE]		= "scalar",
-		[PTR_TO_CTX]		= "ctx",
-		[CONST_PTR_TO_MAP]	= "map_ptr",
-		[PTR_TO_MAP_VALUE]	= "map_value",
-		[PTR_TO_STACK]		= "fp",
-		[PTR_TO_PACKET]		= "pkt",
-		[PTR_TO_PACKET_META]	= "pkt_meta",
-		[PTR_TO_PACKET_END]	= "pkt_end",
-		[PTR_TO_FLOW_KEYS]	= "flow_keys",
-		[PTR_TO_SOCKET]		= "sock",
-		[PTR_TO_SOCK_COMMON]	= "sock_common",
-		[PTR_TO_TCP_SOCK]	= "tcp_sock",
-		[PTR_TO_TP_BUFFER]	= "tp_buffer",
-		[PTR_TO_XDP_SOCK]	= "xdp_sock",
-		[PTR_TO_BTF_ID]		= "ptr_",
-		[PTR_TO_MEM]		= "mem",
-		[PTR_TO_BUF]		= "buf",
-		[PTR_TO_FUNC]		= "func",
-		[PTR_TO_MAP_KEY]	= "map_key",
-		[CONST_PTR_TO_DYNPTR]	= "dynptr_ptr",
-	};
-
-	if (type & PTR_MAYBE_NULL) {
-		if (base_type(type) == PTR_TO_BTF_ID)
-			strncpy(postfix, "or_null_", 16);
-		else
-			strncpy(postfix, "_or_null", 16);
-	}
-
-	snprintf(prefix, sizeof(prefix), "%s%s%s%s%s%s%s",
-		 type & MEM_RDONLY ? "rdonly_" : "",
-		 type & MEM_RINGBUF ? "ringbuf_" : "",
-		 type & MEM_USER ? "user_" : "",
-		 type & MEM_PERCPU ? "percpu_" : "",
-		 type & MEM_RCU ? "rcu_" : "",
-		 type & PTR_UNTRUSTED ? "untrusted_" : "",
-		 type & PTR_TRUSTED ? "trusted_" : ""
-	);
-
-	snprintf(env->tmp_str_buf, TMP_STR_BUF_LEN, "%s%s%s",
-		 prefix, str[base_type(type)], postfix);
-	return env->tmp_str_buf;
-}
-
-static char slot_type_char[] = {
-	[STACK_INVALID]	= '?',
-	[STACK_SPILL]	= 'r',
-	[STACK_MISC]	= 'm',
-	[STACK_ZERO]	= '0',
-	[STACK_DYNPTR]	= 'd',
-	[STACK_ITER]	= 'i',
-};
-
-static void print_liveness(struct bpf_verifier_env *env,
-			   enum bpf_reg_liveness live)
-{
-	if (live & (REG_LIVE_READ | REG_LIVE_WRITTEN | REG_LIVE_DONE))
-	    verbose(env, "_");
-	if (live & REG_LIVE_READ)
-		verbose(env, "r");
-	if (live & REG_LIVE_WRITTEN)
-		verbose(env, "w");
-	if (live & REG_LIVE_DONE)
-		verbose(env, "D");
-}
-
 static int __get_spi(s32 off)
 {
 	return (-off - 1) / BPF_REG_SIZE;
@@ -678,87 +576,6 @@ static const char *btf_type_name(const struct btf *btf, u32 id)
 	return btf_name_by_offset(btf, btf_type_by_id(btf, id)->name_off);
 }
 
-static const char *dynptr_type_str(enum bpf_dynptr_type type)
-{
-	switch (type) {
-	case BPF_DYNPTR_TYPE_LOCAL:
-		return "local";
-	case BPF_DYNPTR_TYPE_RINGBUF:
-		return "ringbuf";
-	case BPF_DYNPTR_TYPE_SKB:
-		return "skb";
-	case BPF_DYNPTR_TYPE_XDP:
-		return "xdp";
-	case BPF_DYNPTR_TYPE_INVALID:
-		return "<invalid>";
-	default:
-		WARN_ONCE(1, "unknown dynptr type %d\n", type);
-		return "<unknown>";
-	}
-}
-
-static const char *iter_type_str(const struct btf *btf, u32 btf_id)
-{
-	if (!btf || btf_id == 0)
-		return "<invalid>";
-
-	/* we already validated that type is valid and has conforming name */
-	return btf_type_name(btf, btf_id) + sizeof(ITER_PREFIX) - 1;
-}
-
-static const char *iter_state_str(enum bpf_iter_state state)
-{
-	switch (state) {
-	case BPF_ITER_STATE_ACTIVE:
-		return "active";
-	case BPF_ITER_STATE_DRAINED:
-		return "drained";
-	case BPF_ITER_STATE_INVALID:
-		return "<invalid>";
-	default:
-		WARN_ONCE(1, "unknown iter state %d\n", state);
-		return "<unknown>";
-	}
-}
-
-static void mark_reg_scratched(struct bpf_verifier_env *env, u32 regno)
-{
-	env->scratched_regs |= 1U << regno;
-}
-
-static void mark_stack_slot_scratched(struct bpf_verifier_env *env, u32 spi)
-{
-	env->scratched_stack_slots |= 1ULL << spi;
-}
-
-static bool reg_scratched(const struct bpf_verifier_env *env, u32 regno)
-{
-	return (env->scratched_regs >> regno) & 1;
-}
-
-static bool stack_slot_scratched(const struct bpf_verifier_env *env, u64 regno)
-{
-	return (env->scratched_stack_slots >> regno) & 1;
-}
-
-static bool verifier_state_scratched(const struct bpf_verifier_env *env)
-{
-	return env->scratched_regs || env->scratched_stack_slots;
-}
-
-static void mark_verifier_state_clean(struct bpf_verifier_env *env)
-{
-	env->scratched_regs = 0U;
-	env->scratched_stack_slots = 0ULL;
-}
-
-/* Used for printing the entire verifier state. */
-static void mark_verifier_state_scratched(struct bpf_verifier_env *env)
-{
-	env->scratched_regs = ~0U;
-	env->scratched_stack_slots = ~0ULL;
-}
-
 static enum bpf_dynptr_type arg_to_dynptr_type(enum bpf_arg_type arg_type)
 {
 	switch (arg_type & DYNPTR_TYPE_FLAG_MASK) {
@@ -1298,226 +1115,6 @@ static void scrub_spilled_slot(u8 *stype)
 		*stype = STACK_MISC;
 }
 
-static void print_scalar_ranges(struct bpf_verifier_env *env,
-				const struct bpf_reg_state *reg,
-				const char **sep)
-{
-	struct {
-		const char *name;
-		u64 val;
-		bool omit;
-	} minmaxs[] = {
-		{"smin",   reg->smin_value,         reg->smin_value == S64_MIN},
-		{"smax",   reg->smax_value,         reg->smax_value == S64_MAX},
-		{"umin",   reg->umin_value,         reg->umin_value == 0},
-		{"umax",   reg->umax_value,         reg->umax_value == U64_MAX},
-		{"smin32", (s64)reg->s32_min_value, reg->s32_min_value == S32_MIN},
-		{"smax32", (s64)reg->s32_max_value, reg->s32_max_value == S32_MAX},
-		{"umin32", reg->u32_min_value,      reg->u32_min_value == 0},
-		{"umax32", reg->u32_max_value,      reg->u32_max_value == U32_MAX},
-	}, *m1, *m2, *mend = &minmaxs[ARRAY_SIZE(minmaxs)];
-	bool neg1, neg2;
-
-	for (m1 = &minmaxs[0]; m1 < mend; m1++) {
-		if (m1->omit)
-			continue;
-
-		neg1 = m1->name[0] == 's' && (s64)m1->val < 0;
-
-		verbose(env, "%s%s=", *sep, m1->name);
-		*sep = ",";
-
-		for (m2 = m1 + 2; m2 < mend; m2 += 2) {
-			if (m2->omit || m2->val != m1->val)
-				continue;
-			/* don't mix negatives with positives */
-			neg2 = m2->name[0] == 's' && (s64)m2->val < 0;
-			if (neg2 != neg1)
-				continue;
-			m2->omit = true;
-			verbose(env, "%s=", m2->name);
-		}
-
-		verbose(env, m1->name[0] == 's' ? "%lld" : "%llu", m1->val);
-	}
-}
-
-static void print_verifier_state(struct bpf_verifier_env *env,
-				 const struct bpf_func_state *state,
-				 bool print_all)
-{
-	const struct bpf_reg_state *reg;
-	enum bpf_reg_type t;
-	int i;
-
-	if (state->frameno)
-		verbose(env, " frame%d:", state->frameno);
-	for (i = 0; i < MAX_BPF_REG; i++) {
-		reg = &state->regs[i];
-		t = reg->type;
-		if (t == NOT_INIT)
-			continue;
-		if (!print_all && !reg_scratched(env, i))
-			continue;
-		verbose(env, " R%d", i);
-		print_liveness(env, reg->live);
-		verbose(env, "=");
-		if (t == SCALAR_VALUE && reg->precise)
-			verbose(env, "P");
-		if ((t == SCALAR_VALUE || t == PTR_TO_STACK) &&
-		    tnum_is_const(reg->var_off)) {
-			/* reg->off should be 0 for SCALAR_VALUE */
-			verbose(env, "%s", t == SCALAR_VALUE ? "" : reg_type_str(env, t));
-			verbose(env, "%lld", reg->var_off.value + reg->off);
-		} else {
-			const char *sep = "";
-
-			verbose(env, "%s", reg_type_str(env, t));
-			if (base_type(t) == PTR_TO_BTF_ID)
-				verbose(env, "%s", btf_type_name(reg->btf, reg->btf_id));
-			verbose(env, "(");
-/*
- * _a stands for append, was shortened to avoid multiline statements below.
- * This macro is used to output a comma separated list of attributes.
- */
-#define verbose_a(fmt, ...) ({ verbose(env, "%s" fmt, sep, __VA_ARGS__); sep = ","; })
-
-			if (reg->id)
-				verbose_a("id=%d", reg->id);
-			if (reg->ref_obj_id)
-				verbose_a("ref_obj_id=%d", reg->ref_obj_id);
-			if (type_is_non_owning_ref(reg->type))
-				verbose_a("%s", "non_own_ref");
-			if (t != SCALAR_VALUE)
-				verbose_a("off=%d", reg->off);
-			if (type_is_pkt_pointer(t))
-				verbose_a("r=%d", reg->range);
-			else if (base_type(t) == CONST_PTR_TO_MAP ||
-				 base_type(t) == PTR_TO_MAP_KEY ||
-				 base_type(t) == PTR_TO_MAP_VALUE)
-				verbose_a("ks=%d,vs=%d",
-					  reg->map_ptr->key_size,
-					  reg->map_ptr->value_size);
-			if (tnum_is_const(reg->var_off)) {
-				/* Typically an immediate SCALAR_VALUE, but
-				 * could be a pointer whose offset is too big
-				 * for reg->off
-				 */
-				verbose_a("imm=%llx", reg->var_off.value);
-			} else {
-				print_scalar_ranges(env, reg, &sep);
-				if (!tnum_is_unknown(reg->var_off)) {
-					char tn_buf[48];
-
-					tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off);
-					verbose_a("var_off=%s", tn_buf);
-				}
-			}
-#undef verbose_a
-
-			verbose(env, ")");
-		}
-	}
-	for (i = 0; i < state->allocated_stack / BPF_REG_SIZE; i++) {
-		char types_buf[BPF_REG_SIZE + 1];
-		bool valid = false;
-		int j;
-
-		for (j = 0; j < BPF_REG_SIZE; j++) {
-			if (state->stack[i].slot_type[j] != STACK_INVALID)
-				valid = true;
-			types_buf[j] = slot_type_char[state->stack[i].slot_type[j]];
-		}
-		types_buf[BPF_REG_SIZE] = 0;
-		if (!valid)
-			continue;
-		if (!print_all && !stack_slot_scratched(env, i))
-			continue;
-		switch (state->stack[i].slot_type[BPF_REG_SIZE - 1]) {
-		case STACK_SPILL:
-			reg = &state->stack[i].spilled_ptr;
-			t = reg->type;
-
-			verbose(env, " fp%d", (-i - 1) * BPF_REG_SIZE);
-			print_liveness(env, reg->live);
-			verbose(env, "=%s", t == SCALAR_VALUE ? "" : reg_type_str(env, t));
-			if (t == SCALAR_VALUE && reg->precise)
-				verbose(env, "P");
-			if (t == SCALAR_VALUE && tnum_is_const(reg->var_off))
-				verbose(env, "%lld", reg->var_off.value + reg->off);
-			break;
-		case STACK_DYNPTR:
-			i += BPF_DYNPTR_NR_SLOTS - 1;
-			reg = &state->stack[i].spilled_ptr;
-
-			verbose(env, " fp%d", (-i - 1) * BPF_REG_SIZE);
-			print_liveness(env, reg->live);
-			verbose(env, "=dynptr_%s", dynptr_type_str(reg->dynptr.type));
-			if (reg->ref_obj_id)
-				verbose(env, "(ref_id=%d)", reg->ref_obj_id);
-			break;
-		case STACK_ITER:
-			/* only main slot has ref_obj_id set; skip others */
-			reg = &state->stack[i].spilled_ptr;
-			if (!reg->ref_obj_id)
-				continue;
-
-			verbose(env, " fp%d", (-i - 1) * BPF_REG_SIZE);
-			print_liveness(env, reg->live);
-			verbose(env, "=iter_%s(ref_id=%d,state=%s,depth=%u)",
-				iter_type_str(reg->iter.btf, reg->iter.btf_id),
-				reg->ref_obj_id, iter_state_str(reg->iter.state),
-				reg->iter.depth);
-			break;
-		case STACK_MISC:
-		case STACK_ZERO:
-		default:
-			reg = &state->stack[i].spilled_ptr;
-
-			for (j = 0; j < BPF_REG_SIZE; j++)
-				types_buf[j] = slot_type_char[state->stack[i].slot_type[j]];
-			types_buf[BPF_REG_SIZE] = 0;
-
-			verbose(env, " fp%d", (-i - 1) * BPF_REG_SIZE);
-			print_liveness(env, reg->live);
-			verbose(env, "=%s", types_buf);
-			break;
-		}
-	}
-	if (state->acquired_refs && state->refs[0].id) {
-		verbose(env, " refs=%d", state->refs[0].id);
-		for (i = 1; i < state->acquired_refs; i++)
-			if (state->refs[i].id)
-				verbose(env, ",%d", state->refs[i].id);
-	}
-	if (state->in_callback_fn)
-		verbose(env, " cb");
-	if (state->in_async_callback_fn)
-		verbose(env, " async_cb");
-	verbose(env, "\n");
-	if (!print_all)
-		mark_verifier_state_clean(env);
-}
-
-static inline u32 vlog_alignment(u32 pos)
-{
-	return round_up(max(pos + BPF_LOG_MIN_ALIGNMENT / 2, BPF_LOG_ALIGNMENT),
-			BPF_LOG_MIN_ALIGNMENT) - pos - 1;
-}
-
-static void print_insn_state(struct bpf_verifier_env *env,
-			     const struct bpf_func_state *state)
-{
-	if (env->prev_log_pos && env->prev_log_pos == env->log.end_pos) {
-		/* remove new line character */
-		bpf_vlog_reset(&env->log, env->prev_log_pos - 1);
-		verbose(env, "%*c;", vlog_alignment(env->prev_insn_print_pos), ' ');
-	} else {
-		verbose(env, "%d:", env->insn_idx);
-	}
-	print_verifier_state(env, state, false);
-}
-
 /* copy array src of length n * size bytes to dst. dst is reallocated if it's too
  * small to hold src. This is different from krealloc since we don't want to preserve
  * the contents of dst.
-- 
cgit v1.2.3


From acfde6e8abee6b23e53b08606f861d9124288030 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Sat, 4 Nov 2023 00:10:18 -0400
Subject: struct dentry: get rid of randomize_layout idiocy

This is beyond ridiculous.  There is a reason why that thing is
cacheline-aligned...

Reviewed-by: Christian Brauner <brauner@kernel.org>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 include/linux/dcache.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/dcache.h b/include/linux/dcache.h
index 3da2f0545d5d..1d9f7f132055 100644
--- a/include/linux/dcache.h
+++ b/include/linux/dcache.h
@@ -111,7 +111,7 @@ struct dentry {
 		struct hlist_bl_node d_in_lookup_hash;	/* only for in-lookup ones */
 	 	struct rcu_head d_rcu;
 	} d_u;
-} __randomize_layout;
+};
 
 /*
  * dentry->d_lock spinlock nesting subclasses:
-- 
cgit v1.2.3


From 641c3ef5cb68a1426d42e6d3aba16db9bdfbe94f Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Fri, 10 Nov 2023 12:46:30 -0500
Subject: DCACHE_... ->d_flags bits: switch to BIT()

For bits 20..22 (inode type cached in ->d_flags) turn the definitions into
expressions like (5 << 20); everything else turns into straight use of
BIT()

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 include/linux/dcache.h | 76 +++++++++++++++++++++++++-------------------------
 1 file changed, 38 insertions(+), 38 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/dcache.h b/include/linux/dcache.h
index 1d9f7f132055..d9c314cc93b8 100644
--- a/include/linux/dcache.h
+++ b/include/linux/dcache.h
@@ -151,13 +151,13 @@ struct dentry_operations {
  */
 
 /* d_flags entries */
-#define DCACHE_OP_HASH			0x00000001
-#define DCACHE_OP_COMPARE		0x00000002
-#define DCACHE_OP_REVALIDATE		0x00000004
-#define DCACHE_OP_DELETE		0x00000008
-#define DCACHE_OP_PRUNE			0x00000010
+#define DCACHE_OP_HASH			BIT(0)
+#define DCACHE_OP_COMPARE		BIT(1)
+#define DCACHE_OP_REVALIDATE		BIT(2)
+#define DCACHE_OP_DELETE		BIT(3)
+#define DCACHE_OP_PRUNE			BIT(4)
 
-#define	DCACHE_DISCONNECTED		0x00000020
+#define	DCACHE_DISCONNECTED		BIT(5)
      /* This dentry is possibly not currently connected to the dcache tree, in
       * which case its parent will either be itself, or will have this flag as
       * well.  nfsd will not use a dentry with this bit set, but will first
@@ -168,50 +168,50 @@ struct dentry_operations {
       * dentry into place and return that dentry rather than the passed one,
       * typically using d_splice_alias. */
 
-#define DCACHE_REFERENCED		0x00000040 /* Recently used, don't discard. */
+#define DCACHE_REFERENCED		BIT(6) /* Recently used, don't discard. */
 
-#define DCACHE_DONTCACHE		0x00000080 /* Purge from memory on final dput() */
+#define DCACHE_DONTCACHE		BIT(7) /* Purge from memory on final dput() */
 
-#define DCACHE_CANT_MOUNT		0x00000100
-#define DCACHE_GENOCIDE			0x00000200
-#define DCACHE_SHRINK_LIST		0x00000400
+#define DCACHE_CANT_MOUNT		BIT(8)
+#define DCACHE_GENOCIDE			BIT(9)
+#define DCACHE_SHRINK_LIST		BIT(10)
 
-#define DCACHE_OP_WEAK_REVALIDATE	0x00000800
+#define DCACHE_OP_WEAK_REVALIDATE	BIT(11)
 
-#define DCACHE_NFSFS_RENAMED		0x00001000
+#define DCACHE_NFSFS_RENAMED		BIT(12)
      /* this dentry has been "silly renamed" and has to be deleted on the last
       * dput() */
-#define DCACHE_COOKIE			0x00002000 /* For use by dcookie subsystem */
-#define DCACHE_FSNOTIFY_PARENT_WATCHED	0x00004000
+#define DCACHE_COOKIE			BIT(13) /* For use by dcookie subsystem */
+#define DCACHE_FSNOTIFY_PARENT_WATCHED	BIT(14)
      /* Parent inode is watched by some fsnotify listener */
 
-#define DCACHE_DENTRY_KILLED		0x00008000
+#define DCACHE_DENTRY_KILLED		BIT(15)
 
-#define DCACHE_MOUNTED			0x00010000 /* is a mountpoint */
-#define DCACHE_NEED_AUTOMOUNT		0x00020000 /* handle automount on this dir */
-#define DCACHE_MANAGE_TRANSIT		0x00040000 /* manage transit from this dirent */
+#define DCACHE_MOUNTED			BIT(16) /* is a mountpoint */
+#define DCACHE_NEED_AUTOMOUNT		BIT(17) /* handle automount on this dir */
+#define DCACHE_MANAGE_TRANSIT		BIT(18) /* manage transit from this dirent */
 #define DCACHE_MANAGED_DENTRY \
 	(DCACHE_MOUNTED|DCACHE_NEED_AUTOMOUNT|DCACHE_MANAGE_TRANSIT)
 
-#define DCACHE_LRU_LIST			0x00080000
-
-#define DCACHE_ENTRY_TYPE		0x00700000
-#define DCACHE_MISS_TYPE		0x00000000 /* Negative dentry (maybe fallthru to nowhere) */
-#define DCACHE_WHITEOUT_TYPE		0x00100000 /* Whiteout dentry (stop pathwalk) */
-#define DCACHE_DIRECTORY_TYPE		0x00200000 /* Normal directory */
-#define DCACHE_AUTODIR_TYPE		0x00300000 /* Lookupless directory (presumed automount) */
-#define DCACHE_REGULAR_TYPE		0x00400000 /* Regular file type (or fallthru to such) */
-#define DCACHE_SPECIAL_TYPE		0x00500000 /* Other file type (or fallthru to such) */
-#define DCACHE_SYMLINK_TYPE		0x00600000 /* Symlink (or fallthru to such) */
-
-#define DCACHE_MAY_FREE			0x00800000
-#define DCACHE_FALLTHRU			0x01000000 /* Fall through to lower layer */
-#define DCACHE_NOKEY_NAME		0x02000000 /* Encrypted name encoded without key */
-#define DCACHE_OP_REAL			0x04000000
-
-#define DCACHE_PAR_LOOKUP		0x10000000 /* being looked up (with parent locked shared) */
-#define DCACHE_DENTRY_CURSOR		0x20000000
-#define DCACHE_NORCU			0x40000000 /* No RCU delay for freeing */
+#define DCACHE_LRU_LIST			BIT(19)
+
+#define DCACHE_ENTRY_TYPE		(7 << 20) /* bits 20..22 are for storing type: */
+#define DCACHE_MISS_TYPE		(0 << 20) /* Negative dentry (maybe fallthru to nowhere) */
+#define DCACHE_WHITEOUT_TYPE		(1 << 20) /* Whiteout dentry (stop pathwalk) */
+#define DCACHE_DIRECTORY_TYPE		(2 << 20) /* Normal directory */
+#define DCACHE_AUTODIR_TYPE		(3 << 20) /* Lookupless directory (presumed automount) */
+#define DCACHE_REGULAR_TYPE		(4 << 20) /* Regular file type (or fallthru to such) */
+#define DCACHE_SPECIAL_TYPE		(5 << 20) /* Other file type (or fallthru to such) */
+#define DCACHE_SYMLINK_TYPE		(6 << 20) /* Symlink (or fallthru to such) */
+
+#define DCACHE_MAY_FREE			BIT(23)
+#define DCACHE_FALLTHRU			BIT(24) /* Fall through to lower layer */
+#define DCACHE_NOKEY_NAME		BIT(25) /* Encrypted name encoded without key */
+#define DCACHE_OP_REAL			BIT(26)
+
+#define DCACHE_PAR_LOOKUP		BIT(28) /* being looked up (with parent locked shared) */
+#define DCACHE_DENTRY_CURSOR		BIT(29)
+#define DCACHE_NORCU			BIT(30) /* No RCU delay for freeing */
 
 extern seqlock_t rename_lock;
 
-- 
cgit v1.2.3


From 0bec65a80f1b1ebcda05286e539a204713b70353 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Fri, 10 Nov 2023 12:50:29 -0500
Subject: DCACHE_COOKIE: RIP

the last user gone in 2021...

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 include/linux/dcache.h | 1 -
 1 file changed, 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/dcache.h b/include/linux/dcache.h
index d9c314cc93b8..92c0b2a1ae2e 100644
--- a/include/linux/dcache.h
+++ b/include/linux/dcache.h
@@ -181,7 +181,6 @@ struct dentry_operations {
 #define DCACHE_NFSFS_RENAMED		BIT(12)
      /* this dentry has been "silly renamed" and has to be deleted on the last
       * dput() */
-#define DCACHE_COOKIE			BIT(13) /* For use by dcookie subsystem */
 #define DCACHE_FSNOTIFY_PARENT_WATCHED	BIT(14)
      /* Parent inode is watched by some fsnotify listener */
 
-- 
cgit v1.2.3


From 8219cb58feddcf28909072015f4e17e29f68c41a Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Fri, 10 Nov 2023 14:32:05 -0500
Subject: kill d_{is,set}_fallthru()

Introduced in 2015 and never had any in-tree users...

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/dcache.c            | 20 ++------------------
 include/linux/dcache.h | 17 ++++-------------
 2 files changed, 6 insertions(+), 31 deletions(-)

(limited to 'include/linux')

diff --git a/fs/dcache.c b/fs/dcache.c
index b8f1b54a1492..9f5b2b5c1e6d 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -344,7 +344,7 @@ static inline void __d_set_inode_and_type(struct dentry *dentry,
 
 	dentry->d_inode = inode;
 	flags = READ_ONCE(dentry->d_flags);
-	flags &= ~(DCACHE_ENTRY_TYPE | DCACHE_FALLTHRU);
+	flags &= ~DCACHE_ENTRY_TYPE;
 	flags |= type_flags;
 	smp_store_release(&dentry->d_flags, flags);
 }
@@ -353,7 +353,7 @@ static inline void __d_clear_type_and_inode(struct dentry *dentry)
 {
 	unsigned flags = READ_ONCE(dentry->d_flags);
 
-	flags &= ~(DCACHE_ENTRY_TYPE | DCACHE_FALLTHRU);
+	flags &= ~DCACHE_ENTRY_TYPE;
 	WRITE_ONCE(dentry->d_flags, flags);
 	dentry->d_inode = NULL;
 	if (dentry->d_flags & DCACHE_LRU_LIST)
@@ -1936,22 +1936,6 @@ void d_set_d_op(struct dentry *dentry, const struct dentry_operations *op)
 }
 EXPORT_SYMBOL(d_set_d_op);
 
-
-/*
- * d_set_fallthru - Mark a dentry as falling through to a lower layer
- * @dentry - The dentry to mark
- *
- * Mark a dentry as falling through to the lower layer (as set with
- * d_pin_lower()).  This flag may be recorded on the medium.
- */
-void d_set_fallthru(struct dentry *dentry)
-{
-	spin_lock(&dentry->d_lock);
-	dentry->d_flags |= DCACHE_FALLTHRU;
-	spin_unlock(&dentry->d_lock);
-}
-EXPORT_SYMBOL(d_set_fallthru);
-
 static unsigned d_flags_for_inode(struct inode *inode)
 {
 	unsigned add_flags = DCACHE_REGULAR_TYPE;
diff --git a/include/linux/dcache.h b/include/linux/dcache.h
index 92c0b2a1ae2e..8cd937bb2292 100644
--- a/include/linux/dcache.h
+++ b/include/linux/dcache.h
@@ -195,16 +195,15 @@ struct dentry_operations {
 #define DCACHE_LRU_LIST			BIT(19)
 
 #define DCACHE_ENTRY_TYPE		(7 << 20) /* bits 20..22 are for storing type: */
-#define DCACHE_MISS_TYPE		(0 << 20) /* Negative dentry (maybe fallthru to nowhere) */
+#define DCACHE_MISS_TYPE		(0 << 20) /* Negative dentry */
 #define DCACHE_WHITEOUT_TYPE		(1 << 20) /* Whiteout dentry (stop pathwalk) */
 #define DCACHE_DIRECTORY_TYPE		(2 << 20) /* Normal directory */
 #define DCACHE_AUTODIR_TYPE		(3 << 20) /* Lookupless directory (presumed automount) */
-#define DCACHE_REGULAR_TYPE		(4 << 20) /* Regular file type (or fallthru to such) */
-#define DCACHE_SPECIAL_TYPE		(5 << 20) /* Other file type (or fallthru to such) */
-#define DCACHE_SYMLINK_TYPE		(6 << 20) /* Symlink (or fallthru to such) */
+#define DCACHE_REGULAR_TYPE		(4 << 20) /* Regular file type */
+#define DCACHE_SPECIAL_TYPE		(5 << 20) /* Other file type */
+#define DCACHE_SYMLINK_TYPE		(6 << 20) /* Symlink */
 
 #define DCACHE_MAY_FREE			BIT(23)
-#define DCACHE_FALLTHRU			BIT(24) /* Fall through to lower layer */
 #define DCACHE_NOKEY_NAME		BIT(25) /* Encrypted name encoded without key */
 #define DCACHE_OP_REAL			BIT(26)
 
@@ -489,14 +488,6 @@ static inline int simple_positive(const struct dentry *dentry)
 	return d_really_is_positive(dentry) && !d_unhashed(dentry);
 }
 
-extern void d_set_fallthru(struct dentry *dentry);
-
-static inline bool d_is_fallthru(const struct dentry *dentry)
-{
-	return dentry->d_flags & DCACHE_FALLTHRU;
-}
-
-
 extern int sysctl_vfs_cache_pressure;
 
 static inline unsigned long vfs_pressure_ratio(unsigned long val)
-- 
cgit v1.2.3


From 0d486510f86eb8162022ed61e6dc424a10909a10 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Fri, 10 Nov 2023 15:22:40 -0500
Subject: dentry.h: trim externs

d_instantiate_unique() had been gone for 7 years; __d_lookup...()
and shrink_dcache_for_umount() are fs/internal.h fodder.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/internal.h          | 4 ++++
 include/linux/dcache.h | 5 -----
 2 files changed, 4 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/fs/internal.h b/fs/internal.h
index 58e43341aebf..9e9fc629f935 100644
--- a/fs/internal.h
+++ b/fs/internal.h
@@ -215,6 +215,10 @@ extern struct dentry * d_alloc_pseudo(struct super_block *, const struct qstr *)
 extern char *simple_dname(struct dentry *, char *, int);
 extern void dput_to_list(struct dentry *, struct list_head *);
 extern void shrink_dentry_list(struct list_head *);
+extern void shrink_dcache_for_umount(struct super_block *);
+extern struct dentry *__d_lookup(const struct dentry *, const struct qstr *);
+extern struct dentry *__d_lookup_rcu(const struct dentry *parent,
+				const struct qstr *name, unsigned *seq);
 
 /*
  * pipe.c
diff --git a/include/linux/dcache.h b/include/linux/dcache.h
index 8cd937bb2292..9706bf1dc5de 100644
--- a/include/linux/dcache.h
+++ b/include/linux/dcache.h
@@ -218,7 +218,6 @@ extern seqlock_t rename_lock;
  */
 extern void d_instantiate(struct dentry *, struct inode *);
 extern void d_instantiate_new(struct dentry *, struct inode *);
-extern struct dentry * d_instantiate_unique(struct dentry *, struct inode *);
 extern struct dentry * d_instantiate_anon(struct dentry *, struct inode *);
 extern void __d_drop(struct dentry *dentry);
 extern void d_drop(struct dentry *dentry);
@@ -240,7 +239,6 @@ extern struct dentry * d_obtain_alias(struct inode *);
 extern struct dentry * d_obtain_root(struct inode *);
 extern void shrink_dcache_sb(struct super_block *);
 extern void shrink_dcache_parent(struct dentry *);
-extern void shrink_dcache_for_umount(struct super_block *);
 extern void d_invalidate(struct dentry *);
 
 /* only used at mount-time */
@@ -275,9 +273,6 @@ extern struct dentry *d_ancestor(struct dentry *, struct dentry *);
 /* appendix may either be NULL or be used for transname suffixes */
 extern struct dentry *d_lookup(const struct dentry *, const struct qstr *);
 extern struct dentry *d_hash_and_lookup(struct dentry *, struct qstr *);
-extern struct dentry *__d_lookup(const struct dentry *, const struct qstr *);
-extern struct dentry *__d_lookup_rcu(const struct dentry *parent,
-				const struct qstr *name, unsigned *seq);
 
 static inline unsigned d_count(const struct dentry *dentry)
 {
-- 
cgit v1.2.3


From 2fcd38f4de7256e2b5cb23ad22a6e3ebfea7dd18 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Fri, 10 Nov 2023 15:24:45 -0500
Subject: [software coproarchaeology] dentry.h: kill a mysterious comment

there's a strange comment in front of d_lookup() declaration:

/* appendix may either be NULL or be used for transname suffixes */

Looks like nobody had been curious enough to track its history;
it predates git, it predates bitkeeper and if you look through
the pre-BK trees, you finally arrive at this in 2.1.44-for-davem:
  /* appendix may either be NULL or be used for transname suffixes */
 -extern struct dentry * d_lookup(struct inode * dir, struct qstr * name,
 -                               struct qstr * appendix);
 +extern struct dentry * d_lookup(struct dentry * dir, struct qstr * name);
In other words, it refers to the third argument d_lookup() used to have
back then.  It had been introduced in 2.1.43-pre, on June 12 1997,
along with d_lookup(), only to be removed by July 4 1997, presumably
when the Cthulhu-awful thing it used to be used for (look for
CONFIG_TRANS_NAMES in 2.1.43-pre, and keep a heavy-duty barfbag
ready) had been, er, noticed and recognized for what it had been.

Despite the appendectomy, the comment remained.  Some things really
need to be put out of their misery...

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 include/linux/dcache.h | 1 -
 1 file changed, 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/dcache.h b/include/linux/dcache.h
index 9706bf1dc5de..a5e5e274eee0 100644
--- a/include/linux/dcache.h
+++ b/include/linux/dcache.h
@@ -270,7 +270,6 @@ extern void d_move(struct dentry *, struct dentry *);
 extern void d_exchange(struct dentry *, struct dentry *);
 extern struct dentry *d_ancestor(struct dentry *, struct dentry *);
 
-/* appendix may either be NULL or be used for transname suffixes */
 extern struct dentry *d_lookup(const struct dentry *, const struct qstr *);
 extern struct dentry *d_hash_and_lookup(struct dentry *, struct qstr *);
 
-- 
cgit v1.2.3


From 698f1e2b71736977b04f951e2e2ef1c9a80696ff Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Fri, 10 Nov 2023 16:19:59 -0500
Subject: kill d_backing_dentry()

no users left

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 include/linux/dcache.h | 15 ---------------
 1 file changed, 15 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/dcache.h b/include/linux/dcache.h
index a5e5e274eee0..fa0414cff85c 100644
--- a/include/linux/dcache.h
+++ b/include/linux/dcache.h
@@ -530,21 +530,6 @@ static inline struct inode *d_backing_inode(const struct dentry *upper)
 	return inode;
 }
 
-/**
- * d_backing_dentry - Get upper or lower dentry we should be using
- * @upper: The upper layer
- *
- * This is the helper that should be used to get the dentry of the inode that
- * will be used if this dentry were opened as a file.  It may be the upper
- * dentry or it may be a lower dentry pinned by the upper.
- *
- * Normal filesystems should not use this to access their own dentries.
- */
-static inline struct dentry *d_backing_dentry(struct dentry *upper)
-{
-	return upper;
-}
-
 /**
  * d_real - Return the real dentry
  * @dentry: the dentry to query
-- 
cgit v1.2.3


From 289354f21b2c3fac93e956efd45f256a88a4d997 Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <kuba@kernel.org>
Date: Sat, 18 Nov 2023 18:38:05 -0800
Subject: net: partial revert of the "Make timestamping selectable: series

Revert following commits:

commit acec05fb78ab ("net_tstamp: Add TIMESTAMPING SOFTWARE and HARDWARE mask")
commit 11d55be06df0 ("net: ethtool: Add a command to expose current time stamping layer")
commit bb8645b00ced ("netlink: specs: Introduce new netlink command to get current timestamp")
commit d905f9c75329 ("net: ethtool: Add a command to list available time stamping layers")
commit aed5004ee7a0 ("netlink: specs: Introduce new netlink command to list available time stamping layers")
commit 51bdf3165f01 ("net: Replace hwtstamp_source by timestamping layer")
commit 0f7f463d4821 ("net: Change the API of PHY default timestamp to MAC")
commit 091fab122869 ("net: ethtool: ts: Update GET_TS to reply the current selected timestamp")
commit 152c75e1d002 ("net: ethtool: ts: Let the active time stamping layer be selectable")
commit ee60ea6be0d3 ("netlink: specs: Introduce time stamping set command")

They need more time for reviews.

Link: https://lore.kernel.org/all/20231118183529.6e67100c@kernel.org/
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 Documentation/netlink/specs/ethtool.yaml           |  57 -----
 Documentation/networking/ethtool-netlink.rst       |  63 ------
 .../net/ethernet/microchip/lan966x/lan966x_main.c  |   6 +-
 drivers/net/phy/bcm-phy-ptp.c                      |   3 -
 drivers/net/phy/dp83640.c                          |   3 -
 drivers/net/phy/micrel.c                           |   6 -
 drivers/net/phy/mscc/mscc_ptp.c                    |   2 -
 drivers/net/phy/nxp-c45-tja11xx.c                  |   3 -
 drivers/net/phy/phy_device.c                       |  37 ----
 include/linux/net_tstamp.h                         |  11 +-
 include/linux/netdevice.h                          |   5 -
 include/linux/phy.h                                |   4 -
 include/uapi/linux/ethtool_netlink.h               |  29 ---
 include/uapi/linux/net_tstamp.h                    |  18 --
 net/core/dev.c                                     |   3 -
 net/core/dev_ioctl.c                               |  36 ++-
 net/core/timestamping.c                            |  10 -
 net/ethtool/Makefile                               |   2 +-
 net/ethtool/common.c                               |  19 +-
 net/ethtool/common.h                               |   1 -
 net/ethtool/netlink.c                              |  28 ---
 net/ethtool/netlink.h                              |   4 -
 net/ethtool/ts.c                                   | 244 ---------------------
 23 files changed, 28 insertions(+), 566 deletions(-)
 delete mode 100644 net/ethtool/ts.c

(limited to 'include/linux')

diff --git a/Documentation/netlink/specs/ethtool.yaml b/Documentation/netlink/specs/ethtool.yaml
index 06d9120543d3..5c7a65b009b4 100644
--- a/Documentation/netlink/specs/ethtool.yaml
+++ b/Documentation/netlink/specs/ethtool.yaml
@@ -939,26 +939,6 @@ attribute-sets:
       -
         name: burst-tmr
         type: u32
-  -
-    name: ts
-    attributes:
-      -
-        name: header
-        type: nest
-        nested-attributes: header
-      -
-        name: ts-layer
-        type: u32
-  -
-    name: ts-list
-    attributes:
-      -
-        name: header
-        type: nest
-        nested-attributes: header
-      -
-        name: ts-list-layer
-        type: binary
 
 operations:
   enum-model: directional
@@ -1709,40 +1689,3 @@ operations:
       name: mm-ntf
       doc: Notification for change in MAC Merge configuration.
       notify: mm-get
-    -
-      name: ts-get
-      doc: Get current timestamp
-
-      attribute-set: ts
-
-      do:
-        request:
-          attributes:
-            - header
-        reply:
-          attributes: &ts
-            - header
-            - ts-layer
-    -
-      name: ts-list-get
-      doc: Get list of timestamp devices available on an interface
-
-      attribute-set: ts-list
-
-      do:
-        request:
-          attributes:
-            - header
-        reply:
-          attributes:
-            - header
-            - ts-list-layer
-    -
-      name: ts-set
-      doc: Set the timestamp device
-
-      attribute-set: ts
-
-      do:
-        request:
-          attributes: *ts
diff --git a/Documentation/networking/ethtool-netlink.rst b/Documentation/networking/ethtool-netlink.rst
index 530c1775e5f4..2540c70952ff 100644
--- a/Documentation/networking/ethtool-netlink.rst
+++ b/Documentation/networking/ethtool-netlink.rst
@@ -225,9 +225,6 @@ Userspace to kernel:
   ``ETHTOOL_MSG_RSS_GET``               get RSS settings
   ``ETHTOOL_MSG_MM_GET``                get MAC merge layer state
   ``ETHTOOL_MSG_MM_SET``                set MAC merge layer parameters
-  ``ETHTOOL_MSG_TS_GET``                get current timestamping
-  ``ETHTOOL_MSG_TS_LIST_GET``           list available timestampings
-  ``ETHTOOL_MSG_TS_SET``                set current timestamping
   ===================================== =================================
 
 Kernel to userspace:
@@ -271,8 +268,6 @@ Kernel to userspace:
   ``ETHTOOL_MSG_PSE_GET_REPLY``            PSE parameters
   ``ETHTOOL_MSG_RSS_GET_REPLY``            RSS settings
   ``ETHTOOL_MSG_MM_GET_REPLY``             MAC merge layer status
-  ``ETHTOOL_MSG_TS_GET_REPLY``             current timestamping
-  ``ETHTOOL_MSG_TS_LIST_GET_REPLY``        available timestampings
   ======================================== =================================
 
 ``GET`` requests are sent by userspace applications to retrieve device
@@ -1999,61 +1994,6 @@ The attributes are propagated to the driver through the following structure:
 .. kernel-doc:: include/linux/ethtool.h
     :identifiers: ethtool_mm_cfg
 
-TS_GET
-======
-
-Gets current timestamping.
-
-Request contents:
-
-  =================================  ======  ====================
-  ``ETHTOOL_A_TS_HEADER``            nested  request header
-  =================================  ======  ====================
-
-Kernel response contents:
-
-  =======================  ======  ==============================
-  ``ETHTOOL_A_TS_HEADER``  nested  reply header
-  ``ETHTOOL_A_TS_LAYER``   u32     current timestamping
-  =======================  ======  ==============================
-
-This command get the current timestamp layer.
-
-TS_LIST_GET
-===========
-
-Get the list of available timestampings.
-
-Request contents:
-
-  =================================  ======  ====================
-  ``ETHTOOL_A_TS_HEADER``            nested  request header
-  =================================  ======  ====================
-
-Kernel response contents:
-
-  ===========================  ======  ==============================
-  ``ETHTOOL_A_TS_HEADER``      nested  reply header
-  ``ETHTOOL_A_TS_LIST_LAYER``  binary  available timestampings
-  ===========================  ======  ==============================
-
-This command lists all the possible timestamp layer available.
-
-TS_SET
-======
-
-Modify the selected timestamping.
-
-Request contents:
-
-  =======================  ======  ===================
-  ``ETHTOOL_A_TS_HEADER``  nested  reply header
-  ``ETHTOOL_A_TS_LAYER``   u32     timestamping
-  =======================  ======  ===================
-
-This command set the timestamping with one that should be listed by the
-TSLIST_GET command.
-
 Request translation
 ===================
 
@@ -2160,7 +2100,4 @@ are netlink only.
   n/a                                 ``ETHTOOL_MSG_PLCA_GET_STATUS``
   n/a                                 ``ETHTOOL_MSG_MM_GET``
   n/a                                 ``ETHTOOL_MSG_MM_SET``
-  n/a                                 ``ETHTOOL_MSG_TS_GET``
-  n/a                                 ``ETHTOOL_MSG_TS_LIST_GET``
-  n/a                                 ``ETHTOOL_MSG_TS_SET``
   =================================== =====================================
diff --git a/drivers/net/ethernet/microchip/lan966x/lan966x_main.c b/drivers/net/ethernet/microchip/lan966x/lan966x_main.c
index fbe56b1bb386..2635ef8958c8 100644
--- a/drivers/net/ethernet/microchip/lan966x/lan966x_main.c
+++ b/drivers/net/ethernet/microchip/lan966x/lan966x_main.c
@@ -470,15 +470,15 @@ static int lan966x_port_hwtstamp_set(struct net_device *dev,
 	struct lan966x_port *port = netdev_priv(dev);
 	int err;
 
-	if (cfg->source != MAC_TIMESTAMPING &&
-	    cfg->source != PHY_TIMESTAMPING)
+	if (cfg->source != HWTSTAMP_SOURCE_NETDEV &&
+	    cfg->source != HWTSTAMP_SOURCE_PHYLIB)
 		return -EOPNOTSUPP;
 
 	err = lan966x_ptp_setup_traps(port, cfg);
 	if (err)
 		return err;
 
-	if (cfg->source == MAC_TIMESTAMPING) {
+	if (cfg->source == HWTSTAMP_SOURCE_NETDEV) {
 		if (!port->lan966x->ptp)
 			return -EOPNOTSUPP;
 
diff --git a/drivers/net/phy/bcm-phy-ptp.c b/drivers/net/phy/bcm-phy-ptp.c
index d3e825c951ee..617d384d4551 100644
--- a/drivers/net/phy/bcm-phy-ptp.c
+++ b/drivers/net/phy/bcm-phy-ptp.c
@@ -931,9 +931,6 @@ struct bcm_ptp_private *bcm_ptp_probe(struct phy_device *phydev)
 		return ERR_CAST(clock);
 	priv->ptp_clock = clock;
 
-	/* Timestamp selected by default to keep legacy API */
-	phydev->default_timestamp = true;
-
 	priv->phydev = phydev;
 	bcm_ptp_init(priv);
 
diff --git a/drivers/net/phy/dp83640.c b/drivers/net/phy/dp83640.c
index 64fd1a109c0f..5c42c47dc564 100644
--- a/drivers/net/phy/dp83640.c
+++ b/drivers/net/phy/dp83640.c
@@ -1450,9 +1450,6 @@ static int dp83640_probe(struct phy_device *phydev)
 	phydev->mii_ts = &dp83640->mii_ts;
 	phydev->priv = dp83640;
 
-	/* Timestamp selected by default to keep legacy API */
-	phydev->default_timestamp = true;
-
 	spin_lock_init(&dp83640->rx_lock);
 	skb_queue_head_init(&dp83640->rx_queue);
 	skb_queue_head_init(&dp83640->tx_queue);
diff --git a/drivers/net/phy/micrel.c b/drivers/net/phy/micrel.c
index 2b8dd0131926..bd4cd082662f 100644
--- a/drivers/net/phy/micrel.c
+++ b/drivers/net/phy/micrel.c
@@ -3158,9 +3158,6 @@ static void lan8814_ptp_init(struct phy_device *phydev)
 	ptp_priv->mii_ts.ts_info  = lan8814_ts_info;
 
 	phydev->mii_ts = &ptp_priv->mii_ts;
-
-	/* Timestamp selected by default to keep legacy API */
-	phydev->default_timestamp = true;
 }
 
 static int lan8814_ptp_probe_once(struct phy_device *phydev)
@@ -4589,9 +4586,6 @@ static int lan8841_probe(struct phy_device *phydev)
 
 	phydev->mii_ts = &ptp_priv->mii_ts;
 
-	/* Timestamp selected by default to keep legacy API */
-	phydev->default_timestamp = true;
-
 	return 0;
 }
 
diff --git a/drivers/net/phy/mscc/mscc_ptp.c b/drivers/net/phy/mscc/mscc_ptp.c
index fd174eb06d4a..eb0b032cb613 100644
--- a/drivers/net/phy/mscc/mscc_ptp.c
+++ b/drivers/net/phy/mscc/mscc_ptp.c
@@ -1570,8 +1570,6 @@ int vsc8584_ptp_probe(struct phy_device *phydev)
 		return PTR_ERR(vsc8531->load_save);
 	}
 
-	/* Timestamp selected by default to keep legacy API */
-	phydev->default_timestamp = true;
 	vsc8531->ptp->phydev = phydev;
 
 	return 0;
diff --git a/drivers/net/phy/nxp-c45-tja11xx.c b/drivers/net/phy/nxp-c45-tja11xx.c
index 0515c7b979db..780ad353cf55 100644
--- a/drivers/net/phy/nxp-c45-tja11xx.c
+++ b/drivers/net/phy/nxp-c45-tja11xx.c
@@ -1658,9 +1658,6 @@ static int nxp_c45_probe(struct phy_device *phydev)
 		priv->mii_ts.ts_info = nxp_c45_ts_info;
 		phydev->mii_ts = &priv->mii_ts;
 		ret = nxp_c45_init_ptp_clock(priv);
-
-		/* Timestamp selected by default to keep legacy API */
-		phydev->default_timestamp = true;
 	} else {
 		phydev_dbg(phydev, "PTP support not enabled even if the phy supports it");
 	}
diff --git a/drivers/net/phy/phy_device.c b/drivers/net/phy/phy_device.c
index 8c4794631daa..2ce74593d6e4 100644
--- a/drivers/net/phy/phy_device.c
+++ b/drivers/net/phy/phy_device.c
@@ -1411,26 +1411,6 @@ int phy_sfp_probe(struct phy_device *phydev,
 }
 EXPORT_SYMBOL(phy_sfp_probe);
 
-/**
- * phy_set_timestamp - set the default selected timestamping device
- * @dev: Pointer to net_device
- * @phydev: Pointer to phy_device
- *
- * This is used to set default timestamping device taking into account
- * the new API choice, which is selecting the timestamping from MAC by
- * default if the phydev does not have default_timestamp flag enabled.
- */
-static void phy_set_timestamp(struct net_device *dev, struct phy_device *phydev)
-{
-	const struct ethtool_ops *ops = dev->ethtool_ops;
-
-	if (!phy_has_tsinfo(phydev))
-		return;
-
-	if (!ops->get_ts_info || phydev->default_timestamp)
-		dev->ts_layer = PHY_TIMESTAMPING;
-}
-
 /**
  * phy_attach_direct - attach a network device to a given PHY device pointer
  * @dev: network device to attach
@@ -1504,7 +1484,6 @@ int phy_attach_direct(struct net_device *dev, struct phy_device *phydev,
 
 	phydev->phy_link_change = phy_link_change;
 	if (dev) {
-		phy_set_timestamp(dev, phydev);
 		phydev->attached_dev = dev;
 		dev->phydev = phydev;
 
@@ -1833,22 +1812,6 @@ void phy_detach(struct phy_device *phydev)
 
 	phy_suspend(phydev);
 	if (dev) {
-		const struct ethtool_ops *ops = dev->ethtool_ops;
-		struct ethtool_ts_info ts_info = {0};
-
-		if (ops->get_ts_info) {
-			ops->get_ts_info(dev, &ts_info);
-			if ((ts_info.so_timestamping &
-			    SOF_TIMESTAMPING_HARDWARE_MASK) ==
-			    SOF_TIMESTAMPING_HARDWARE_MASK)
-				dev->ts_layer = MAC_TIMESTAMPING;
-			else if ((ts_info.so_timestamping &
-				 SOF_TIMESTAMPING_SOFTWARE_MASK) ==
-				 SOF_TIMESTAMPING_SOFTWARE_MASK)
-				dev->ts_layer = SOFTWARE_TIMESTAMPING;
-		} else {
-			dev->ts_layer = NO_TIMESTAMPING;
-		}
 		phydev->attached_dev->phydev = NULL;
 		phydev->attached_dev = NULL;
 	}
diff --git a/include/linux/net_tstamp.h b/include/linux/net_tstamp.h
index bb289c2ad376..eb01c37e71e0 100644
--- a/include/linux/net_tstamp.h
+++ b/include/linux/net_tstamp.h
@@ -5,6 +5,11 @@
 
 #include <uapi/linux/net_tstamp.h>
 
+enum hwtstamp_source {
+	HWTSTAMP_SOURCE_NETDEV,
+	HWTSTAMP_SOURCE_PHYLIB,
+};
+
 /**
  * struct kernel_hwtstamp_config - Kernel copy of struct hwtstamp_config
  *
@@ -15,8 +20,8 @@
  *	a legacy implementation of a lower driver
  * @copied_to_user: request was passed to a legacy implementation which already
  *	copied the ioctl request back to user space
- * @source: indication whether timestamps should come from software, the netdev
- *	or from an attached phylib PHY
+ * @source: indication whether timestamps should come from the netdev or from
+ *	an attached phylib PHY
  *
  * Prefer using this structure for in-kernel processing of hardware
  * timestamping configuration, over the inextensible struct hwtstamp_config
@@ -28,7 +33,7 @@ struct kernel_hwtstamp_config {
 	int rx_filter;
 	struct ifreq *ifr;
 	bool copied_to_user;
-	enum timestamping_layer source;
+	enum hwtstamp_source source;
 };
 
 static inline void hwtstamp_config_to_kernel(struct kernel_hwtstamp_config *kernel_cfg,
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index f020d2790c12..2d840d7056f2 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -47,7 +47,6 @@
 #include <uapi/linux/if_bonding.h>
 #include <uapi/linux/pkt_cls.h>
 #include <uapi/linux/netdev.h>
-#include <uapi/linux/net_tstamp.h>
 #include <linux/hashtable.h>
 #include <linux/rbtree.h>
 #include <net/net_trackers.h>
@@ -2075,8 +2074,6 @@ enum netdev_ml_priv_type {
  *
  *	@dpll_pin: Pointer to the SyncE source pin of a DPLL subsystem,
  *		   where the clock is recovered.
- *	@ts_layer:	Tracks which network device
- *			performs packet	time stamping.
  *
  *	FIXME: cleanup struct net_device such that network protocol info
  *	moves out.
@@ -2438,8 +2435,6 @@ struct net_device {
 #if IS_ENABLED(CONFIG_DPLL)
 	struct dpll_pin		*dpll_pin;
 #endif
-
-	enum timestamping_layer	ts_layer;
 };
 #define to_net_dev(d) container_of(d, struct net_device, dev)
 
diff --git a/include/linux/phy.h b/include/linux/phy.h
index 317def2a7843..e5f1f41e399c 100644
--- a/include/linux/phy.h
+++ b/include/linux/phy.h
@@ -604,8 +604,6 @@ struct macsec_ops;
  *                 handling shall be postponed until PHY has resumed
  * @irq_rerun: Flag indicating interrupts occurred while PHY was suspended,
  *             requiring a rerun of the interrupt handler after resume
- * @default_timestamp: Flag indicating whether we are using the phy
- *		       timestamp as the default one
  * @interface: enum phy_interface_t value
  * @skb: Netlink message for cable diagnostics
  * @nest: Netlink nest used for cable diagnostics
@@ -669,8 +667,6 @@ struct phy_device {
 	unsigned irq_suspended:1;
 	unsigned irq_rerun:1;
 
-	unsigned default_timestamp:1;
-
 	int rate_matching;
 
 	enum phy_state state;
diff --git a/include/uapi/linux/ethtool_netlink.h b/include/uapi/linux/ethtool_netlink.h
index df6c4fcc62c1..73e2c10dc2cc 100644
--- a/include/uapi/linux/ethtool_netlink.h
+++ b/include/uapi/linux/ethtool_netlink.h
@@ -57,9 +57,6 @@ enum {
 	ETHTOOL_MSG_PLCA_GET_STATUS,
 	ETHTOOL_MSG_MM_GET,
 	ETHTOOL_MSG_MM_SET,
-	ETHTOOL_MSG_TS_GET,
-	ETHTOOL_MSG_TS_LIST_GET,
-	ETHTOOL_MSG_TS_SET,
 
 	/* add new constants above here */
 	__ETHTOOL_MSG_USER_CNT,
@@ -112,8 +109,6 @@ enum {
 	ETHTOOL_MSG_PLCA_NTF,
 	ETHTOOL_MSG_MM_GET_REPLY,
 	ETHTOOL_MSG_MM_NTF,
-	ETHTOOL_MSG_TS_GET_REPLY,
-	ETHTOOL_MSG_TS_LIST_GET_REPLY,
 
 	/* add new constants above here */
 	__ETHTOOL_MSG_KERNEL_CNT,
@@ -980,30 +975,6 @@ enum {
 	ETHTOOL_A_MM_MAX = (__ETHTOOL_A_MM_CNT - 1)
 };
 
-/* TS LAYER */
-
-enum {
-	ETHTOOL_A_TS_UNSPEC,
-	ETHTOOL_A_TS_HEADER,			/* nest - _A_HEADER_* */
-	ETHTOOL_A_TS_LAYER,			/* u32 */
-
-	/* add new constants above here */
-	__ETHTOOL_A_TS_CNT,
-	ETHTOOL_A_TS_MAX = (__ETHTOOL_A_TS_CNT - 1)
-};
-
-/* TS LIST LAYER */
-
-enum {
-	ETHTOOL_A_TS_LIST_UNSPEC,
-	ETHTOOL_A_TS_LIST_HEADER,			/* nest - _A_HEADER_* */
-	ETHTOOL_A_TS_LIST_LAYER,		/* array, u32 */
-
-	/* add new constants above here */
-	__ETHTOOL_A_TS_LIST_CNT,
-	ETHTOOL_A_TS_LIST_MAX = (__ETHTOOL_A_TS_LIST_CNT - 1)
-};
-
 /* generic netlink info */
 #define ETHTOOL_GENL_NAME "ethtool"
 #define ETHTOOL_GENL_VERSION 1
diff --git a/include/uapi/linux/net_tstamp.h b/include/uapi/linux/net_tstamp.h
index 4551fb3d7720..a2c66b3d7f0f 100644
--- a/include/uapi/linux/net_tstamp.h
+++ b/include/uapi/linux/net_tstamp.h
@@ -13,16 +13,6 @@
 #include <linux/types.h>
 #include <linux/socket.h>   /* for SO_TIMESTAMPING */
 
-/* Layer of the TIMESTAMPING provider */
-enum timestamping_layer {
-	NO_TIMESTAMPING,
-	SOFTWARE_TIMESTAMPING,
-	MAC_TIMESTAMPING,
-	PHY_TIMESTAMPING,
-
-	__TIMESTAMPING_COUNT,
-};
-
 /* SO_TIMESTAMPING flags */
 enum {
 	SOF_TIMESTAMPING_TX_HARDWARE = (1<<0),
@@ -58,14 +48,6 @@ enum {
 					 SOF_TIMESTAMPING_TX_SCHED | \
 					 SOF_TIMESTAMPING_TX_ACK)
 
-#define SOF_TIMESTAMPING_SOFTWARE_MASK	(SOF_TIMESTAMPING_RX_SOFTWARE | \
-					 SOF_TIMESTAMPING_TX_SOFTWARE | \
-					 SOF_TIMESTAMPING_SOFTWARE)
-
-#define SOF_TIMESTAMPING_HARDWARE_MASK	(SOF_TIMESTAMPING_RX_HARDWARE | \
-					 SOF_TIMESTAMPING_TX_HARDWARE | \
-					 SOF_TIMESTAMPING_RAW_HARDWARE)
-
 /**
  * struct so_timestamping - SO_TIMESTAMPING parameter
  *
diff --git a/net/core/dev.c b/net/core/dev.c
index 05ce00632892..af53f6d838ce 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -10212,9 +10212,6 @@ int register_netdevice(struct net_device *dev)
 	    dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
 		rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U, GFP_KERNEL, 0, NULL);
 
-	if (dev->ethtool_ops->get_ts_info)
-		dev->ts_layer = MAC_TIMESTAMPING;
-
 out:
 	return ret;
 
diff --git a/net/core/dev_ioctl.c b/net/core/dev_ioctl.c
index bc8be9749376..9a66cf5015f2 100644
--- a/net/core/dev_ioctl.c
+++ b/net/core/dev_ioctl.c
@@ -259,7 +259,9 @@ static int dev_eth_ioctl(struct net_device *dev,
  * @dev: Network device
  * @cfg: Timestamping configuration structure
  *
- * Helper for calling the selected hardware provider timestamping.
+ * Helper for enforcing a common policy that phylib timestamping, if available,
+ * should take precedence in front of hardware timestamping provided by the
+ * netdev.
  *
  * Note: phy_mii_ioctl() only handles SIOCSHWTSTAMP (not SIOCGHWTSTAMP), and
  * there only exists a phydev->mii_ts->hwtstamp() method. So this will return
@@ -269,14 +271,10 @@ static int dev_eth_ioctl(struct net_device *dev,
 static int dev_get_hwtstamp_phylib(struct net_device *dev,
 				   struct kernel_hwtstamp_config *cfg)
 {
-	enum timestamping_layer ts_layer = dev->ts_layer;
-
-	if (ts_layer == PHY_TIMESTAMPING)
+	if (phy_has_hwtstamp(dev->phydev))
 		return phy_hwtstamp_get(dev->phydev, cfg);
-	else if (ts_layer == MAC_TIMESTAMPING)
-		return dev->netdev_ops->ndo_hwtstamp_get(dev, cfg);
 
-	return -EOPNOTSUPP;
+	return dev->netdev_ops->ndo_hwtstamp_get(dev, cfg);
 }
 
 static int dev_get_hwtstamp(struct net_device *dev, struct ifreq *ifr)
@@ -317,8 +315,9 @@ static int dev_get_hwtstamp(struct net_device *dev, struct ifreq *ifr)
  * @cfg: Timestamping configuration structure
  * @extack: Netlink extended ack message structure, for error reporting
  *
- * Helper for calling the selected hardware provider timestamping.
- * If the netdev driver needs to perform specific actions even for PHY
+ * Helper for enforcing a common policy that phylib timestamping, if available,
+ * should take precedence in front of hardware timestamping provided by the
+ * netdev. If the netdev driver needs to perform specific actions even for PHY
  * timestamping to work properly (a switch port must trap the timestamped
  * frames and not forward them), it must set IFF_SEE_ALL_HWTSTAMP_REQUESTS in
  * dev->priv_flags.
@@ -328,26 +327,20 @@ int dev_set_hwtstamp_phylib(struct net_device *dev,
 			    struct netlink_ext_ack *extack)
 {
 	const struct net_device_ops *ops = dev->netdev_ops;
-	enum timestamping_layer ts_layer = dev->ts_layer;
+	bool phy_ts = phy_has_hwtstamp(dev->phydev);
 	struct kernel_hwtstamp_config old_cfg = {};
 	bool changed = false;
 	int err;
 
-	cfg->source = ts_layer;
-
-	if (ts_layer != PHY_TIMESTAMPING &&
-	    ts_layer != MAC_TIMESTAMPING)
-		return -EOPNOTSUPP;
+	cfg->source = phy_ts ? HWTSTAMP_SOURCE_PHYLIB : HWTSTAMP_SOURCE_NETDEV;
 
-	if (ts_layer == PHY_TIMESTAMPING &&
-	    dev->priv_flags & IFF_SEE_ALL_HWTSTAMP_REQUESTS) {
+	if (phy_ts && (dev->priv_flags & IFF_SEE_ALL_HWTSTAMP_REQUESTS)) {
 		err = ops->ndo_hwtstamp_get(dev, &old_cfg);
 		if (err)
 			return err;
 	}
 
-	if (ts_layer == MAC_TIMESTAMPING ||
-	    dev->priv_flags & IFF_SEE_ALL_HWTSTAMP_REQUESTS) {
+	if (!phy_ts || (dev->priv_flags & IFF_SEE_ALL_HWTSTAMP_REQUESTS)) {
 		err = ops->ndo_hwtstamp_set(dev, cfg, extack);
 		if (err) {
 			if (extack->_msg)
@@ -356,11 +349,10 @@ int dev_set_hwtstamp_phylib(struct net_device *dev,
 		}
 	}
 
-	if (ts_layer == PHY_TIMESTAMPING &&
-	    dev->priv_flags & IFF_SEE_ALL_HWTSTAMP_REQUESTS)
+	if (phy_ts && (dev->priv_flags & IFF_SEE_ALL_HWTSTAMP_REQUESTS))
 		changed = kernel_hwtstamp_config_changed(&old_cfg, cfg);
 
-	if (ts_layer == PHY_TIMESTAMPING) {
+	if (phy_ts) {
 		err = phy_hwtstamp_set(dev->phydev, cfg, extack);
 		if (err) {
 			if (changed)
diff --git a/net/core/timestamping.c b/net/core/timestamping.c
index 5cf51a523fb3..04840697fe79 100644
--- a/net/core/timestamping.c
+++ b/net/core/timestamping.c
@@ -21,7 +21,6 @@ static unsigned int classify(const struct sk_buff *skb)
 
 void skb_clone_tx_timestamp(struct sk_buff *skb)
 {
-	enum timestamping_layer ts_layer;
 	struct mii_timestamper *mii_ts;
 	struct sk_buff *clone;
 	unsigned int type;
@@ -29,10 +28,6 @@ void skb_clone_tx_timestamp(struct sk_buff *skb)
 	if (!skb->sk)
 		return;
 
-	ts_layer = skb->dev->ts_layer;
-	if (ts_layer != PHY_TIMESTAMPING)
-		return;
-
 	type = classify(skb);
 	if (type == PTP_CLASS_NONE)
 		return;
@@ -49,17 +44,12 @@ EXPORT_SYMBOL_GPL(skb_clone_tx_timestamp);
 
 bool skb_defer_rx_timestamp(struct sk_buff *skb)
 {
-	enum timestamping_layer ts_layer;
 	struct mii_timestamper *mii_ts;
 	unsigned int type;
 
 	if (!skb->dev || !skb->dev->phydev || !skb->dev->phydev->mii_ts)
 		return false;
 
-	ts_layer = skb->dev->ts_layer;
-	if (ts_layer != PHY_TIMESTAMPING)
-		return false;
-
 	if (skb_headroom(skb) < ETH_HLEN)
 		return false;
 
diff --git a/net/ethtool/Makefile b/net/ethtool/Makefile
index 4ea64c080639..504f954a1b28 100644
--- a/net/ethtool/Makefile
+++ b/net/ethtool/Makefile
@@ -8,4 +8,4 @@ ethtool_nl-y	:= netlink.o bitset.o strset.o linkinfo.o linkmodes.o rss.o \
 		   linkstate.o debug.o wol.o features.o privflags.o rings.o \
 		   channels.o coalesce.o pause.o eee.o tsinfo.o cabletest.o \
 		   tunnels.o fec.o eeprom.o stats.o phc_vclocks.o mm.o \
-		   module.o pse-pd.o plca.o mm.o ts.o
+		   module.o pse-pd.o plca.o mm.o
diff --git a/net/ethtool/common.c b/net/ethtool/common.c
index 9f6e3b2c74e2..11d8797f63f6 100644
--- a/net/ethtool/common.c
+++ b/net/ethtool/common.c
@@ -633,28 +633,13 @@ int __ethtool_get_ts_info(struct net_device *dev, struct ethtool_ts_info *info)
 {
 	const struct ethtool_ops *ops = dev->ethtool_ops;
 	struct phy_device *phydev = dev->phydev;
-	enum timestamping_layer ts_layer;
-	int ret;
 
 	memset(info, 0, sizeof(*info));
 	info->cmd = ETHTOOL_GET_TS_INFO;
 
-	ts_layer = dev->ts_layer;
-	if (ts_layer == SOFTWARE_TIMESTAMPING) {
-		ret = ops->get_ts_info(dev, info);
-		if (ret)
-			return ret;
-		info->so_timestamping &= ~SOF_TIMESTAMPING_HARDWARE_MASK;
-		info->phc_index = -1;
-		info->rx_filters = 0;
-		info->tx_types = 0;
-		return 0;
-	}
-
-	if (ts_layer == PHY_TIMESTAMPING)
+	if (phy_has_tsinfo(phydev))
 		return phy_ts_info(phydev, info);
-
-	if (ts_layer == MAC_TIMESTAMPING)
+	if (ops->get_ts_info)
 		return ops->get_ts_info(dev, info);
 
 	info->so_timestamping = SOF_TIMESTAMPING_RX_SOFTWARE |
diff --git a/net/ethtool/common.h b/net/ethtool/common.h
index a264b635f7d3..28b8aaaf9bcb 100644
--- a/net/ethtool/common.h
+++ b/net/ethtool/common.h
@@ -35,7 +35,6 @@ extern const char wol_mode_names[][ETH_GSTRING_LEN];
 extern const char sof_timestamping_names[][ETH_GSTRING_LEN];
 extern const char ts_tx_type_names[][ETH_GSTRING_LEN];
 extern const char ts_rx_filter_names[][ETH_GSTRING_LEN];
-extern const char ts_layer_names[][ETH_GSTRING_LEN];
 extern const char udp_tunnel_type_names[][ETH_GSTRING_LEN];
 
 int __ethtool_get_link(struct net_device *dev);
diff --git a/net/ethtool/netlink.c b/net/ethtool/netlink.c
index 8322bf71f80d..3bbd5afb7b31 100644
--- a/net/ethtool/netlink.c
+++ b/net/ethtool/netlink.c
@@ -306,9 +306,6 @@ ethnl_default_requests[__ETHTOOL_MSG_USER_CNT] = {
 	[ETHTOOL_MSG_PLCA_GET_STATUS]	= &ethnl_plca_status_request_ops,
 	[ETHTOOL_MSG_MM_GET]		= &ethnl_mm_request_ops,
 	[ETHTOOL_MSG_MM_SET]		= &ethnl_mm_request_ops,
-	[ETHTOOL_MSG_TS_GET]		= &ethnl_ts_request_ops,
-	[ETHTOOL_MSG_TS_LIST_GET]	= &ethnl_ts_list_request_ops,
-	[ETHTOOL_MSG_TS_SET]		= &ethnl_ts_request_ops,
 };
 
 static struct ethnl_dump_ctx *ethnl_dump_context(struct netlink_callback *cb)
@@ -1131,31 +1128,6 @@ static const struct genl_ops ethtool_genl_ops[] = {
 		.policy = ethnl_mm_set_policy,
 		.maxattr = ARRAY_SIZE(ethnl_mm_set_policy) - 1,
 	},
-	{
-		.cmd	= ETHTOOL_MSG_TS_GET,
-		.doit	= ethnl_default_doit,
-		.start	= ethnl_default_start,
-		.dumpit	= ethnl_default_dumpit,
-		.done	= ethnl_default_done,
-		.policy = ethnl_ts_get_policy,
-		.maxattr = ARRAY_SIZE(ethnl_ts_get_policy) - 1,
-	},
-	{
-		.cmd	= ETHTOOL_MSG_TS_LIST_GET,
-		.doit	= ethnl_default_doit,
-		.start	= ethnl_default_start,
-		.dumpit	= ethnl_default_dumpit,
-		.done	= ethnl_default_done,
-		.policy = ethnl_ts_get_policy,
-		.maxattr = ARRAY_SIZE(ethnl_ts_get_policy) - 1,
-	},
-	{
-		.cmd	= ETHTOOL_MSG_TS_SET,
-		.flags	= GENL_UNS_ADMIN_PERM,
-		.doit	= ethnl_default_set_doit,
-		.policy = ethnl_ts_set_policy,
-		.maxattr = ARRAY_SIZE(ethnl_ts_set_policy) - 1,
-	},
 };
 
 static const struct genl_multicast_group ethtool_nl_mcgrps[] = {
diff --git a/net/ethtool/netlink.h b/net/ethtool/netlink.h
index 8fedf234b824..9a333a8d04c1 100644
--- a/net/ethtool/netlink.h
+++ b/net/ethtool/netlink.h
@@ -395,8 +395,6 @@ extern const struct ethnl_request_ops ethnl_rss_request_ops;
 extern const struct ethnl_request_ops ethnl_plca_cfg_request_ops;
 extern const struct ethnl_request_ops ethnl_plca_status_request_ops;
 extern const struct ethnl_request_ops ethnl_mm_request_ops;
-extern const struct ethnl_request_ops ethnl_ts_request_ops;
-extern const struct ethnl_request_ops ethnl_ts_list_request_ops;
 
 extern const struct nla_policy ethnl_header_policy[ETHTOOL_A_HEADER_FLAGS + 1];
 extern const struct nla_policy ethnl_header_policy_stats[ETHTOOL_A_HEADER_FLAGS + 1];
@@ -443,8 +441,6 @@ extern const struct nla_policy ethnl_plca_set_cfg_policy[ETHTOOL_A_PLCA_MAX + 1]
 extern const struct nla_policy ethnl_plca_get_status_policy[ETHTOOL_A_PLCA_HEADER + 1];
 extern const struct nla_policy ethnl_mm_get_policy[ETHTOOL_A_MM_HEADER + 1];
 extern const struct nla_policy ethnl_mm_set_policy[ETHTOOL_A_MM_MAX + 1];
-extern const struct nla_policy ethnl_ts_get_policy[ETHTOOL_A_TS_HEADER + 1];
-extern const struct nla_policy ethnl_ts_set_policy[ETHTOOL_A_TS_MAX + 1];
 
 int ethnl_set_features(struct sk_buff *skb, struct genl_info *info);
 int ethnl_act_cable_test(struct sk_buff *skb, struct genl_info *info);
diff --git a/net/ethtool/ts.c b/net/ethtool/ts.c
deleted file mode 100644
index 357265e74e08..000000000000
--- a/net/ethtool/ts.c
+++ /dev/null
@@ -1,244 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-
-#include <linux/net_tstamp.h>
-#include <linux/phy.h>
-
-#include "netlink.h"
-#include "common.h"
-#include "bitset.h"
-
-struct ts_req_info {
-	struct ethnl_req_info		base;
-};
-
-struct ts_reply_data {
-	struct ethnl_reply_data		base;
-	enum timestamping_layer		ts_layer;
-};
-
-#define TS_REPDATA(__reply_base) \
-	container_of(__reply_base, struct ts_reply_data, base)
-
-/* TS_GET */
-const struct nla_policy ethnl_ts_get_policy[] = {
-	[ETHTOOL_A_TS_HEADER]		=
-		NLA_POLICY_NESTED(ethnl_header_policy),
-};
-
-static int ts_prepare_data(const struct ethnl_req_info *req_base,
-			   struct ethnl_reply_data *reply_base,
-			   const struct genl_info *info)
-{
-	struct ts_reply_data *data = TS_REPDATA(reply_base);
-	struct net_device *dev = reply_base->dev;
-	int ret;
-
-	ret = ethnl_ops_begin(dev);
-	if (ret < 0)
-		return ret;
-
-	data->ts_layer = dev->ts_layer;
-
-	ethnl_ops_complete(dev);
-
-	return ret;
-}
-
-static int ts_reply_size(const struct ethnl_req_info *req_base,
-			 const struct ethnl_reply_data *reply_base)
-{
-	return nla_total_size(sizeof(u32));
-}
-
-static int ts_fill_reply(struct sk_buff *skb,
-			 const struct ethnl_req_info *req_base,
-			 const struct ethnl_reply_data *reply_base)
-{
-	struct ts_reply_data *data = TS_REPDATA(reply_base);
-
-	return nla_put_u32(skb, ETHTOOL_A_TS_LAYER, data->ts_layer);
-}
-
-/* TS_SET */
-const struct nla_policy ethnl_ts_set_policy[] = {
-	[ETHTOOL_A_TS_HEADER]	= NLA_POLICY_NESTED(ethnl_header_policy),
-	[ETHTOOL_A_TS_LAYER]	= NLA_POLICY_RANGE(NLA_U32, 0,
-						   __TIMESTAMPING_COUNT - 1)
-};
-
-static int ethnl_set_ts_validate(struct ethnl_req_info *req_info,
-				 struct genl_info *info)
-{
-	struct nlattr **tb = info->attrs;
-	const struct net_device_ops *ops = req_info->dev->netdev_ops;
-
-	if (!ops->ndo_hwtstamp_set)
-		return -EOPNOTSUPP;
-
-	if (!tb[ETHTOOL_A_TS_LAYER])
-		return 0;
-
-	return 1;
-}
-
-static int ethnl_set_ts(struct ethnl_req_info *req_info, struct genl_info *info)
-{
-	struct net_device *dev = req_info->dev;
-	const struct ethtool_ops *ops = dev->ethtool_ops;
-	struct kernel_hwtstamp_config config = {0};
-	struct nlattr **tb = info->attrs;
-	enum timestamping_layer ts_layer;
-	bool mod = false;
-	int ret;
-
-	ts_layer = dev->ts_layer;
-	ethnl_update_u32(&ts_layer, tb[ETHTOOL_A_TS_LAYER], &mod);
-
-	if (!mod)
-		return 0;
-
-	if (ts_layer == SOFTWARE_TIMESTAMPING) {
-		struct ethtool_ts_info ts_info = {0};
-
-		if (!ops->get_ts_info) {
-			NL_SET_ERR_MSG_ATTR(info->extack,
-					    tb[ETHTOOL_A_TS_LAYER],
-					    "this net device cannot support timestamping");
-			return -EINVAL;
-		}
-
-		ops->get_ts_info(dev, &ts_info);
-		if ((ts_info.so_timestamping &
-		    SOF_TIMESTAMPING_SOFTWARE_MASK) !=
-		    SOF_TIMESTAMPING_SOFTWARE_MASK) {
-			NL_SET_ERR_MSG_ATTR(info->extack,
-					    tb[ETHTOOL_A_TS_LAYER],
-					    "this net device cannot support software timestamping");
-			return -EINVAL;
-		}
-	} else if (ts_layer == MAC_TIMESTAMPING) {
-		struct ethtool_ts_info ts_info = {0};
-
-		if (!ops->get_ts_info) {
-			NL_SET_ERR_MSG_ATTR(info->extack,
-					    tb[ETHTOOL_A_TS_LAYER],
-					    "this net device cannot support timestamping");
-			return -EINVAL;
-		}
-
-		ops->get_ts_info(dev, &ts_info);
-		if ((ts_info.so_timestamping &
-		    SOF_TIMESTAMPING_HARDWARE_MASK) !=
-		    SOF_TIMESTAMPING_HARDWARE_MASK) {
-			NL_SET_ERR_MSG_ATTR(info->extack,
-					    tb[ETHTOOL_A_TS_LAYER],
-					    "this net device cannot support hardware timestamping");
-			return -EINVAL;
-		}
-	} else if (ts_layer == PHY_TIMESTAMPING && !phy_has_tsinfo(dev->phydev)) {
-		NL_SET_ERR_MSG_ATTR(info->extack, tb[ETHTOOL_A_TS_LAYER],
-				    "this phy device cannot support timestamping");
-		return -EINVAL;
-	}
-
-	/* Disable time stamping in the current layer. */
-	if (netif_device_present(dev) &&
-	    (dev->ts_layer == PHY_TIMESTAMPING ||
-	    dev->ts_layer == MAC_TIMESTAMPING)) {
-		ret = dev_set_hwtstamp_phylib(dev, &config, info->extack);
-		if (ret < 0)
-			return ret;
-	}
-
-	dev->ts_layer = ts_layer;
-
-	return 1;
-}
-
-const struct ethnl_request_ops ethnl_ts_request_ops = {
-	.request_cmd		= ETHTOOL_MSG_TS_GET,
-	.reply_cmd		= ETHTOOL_MSG_TS_GET_REPLY,
-	.hdr_attr		= ETHTOOL_A_TS_HEADER,
-	.req_info_size		= sizeof(struct ts_req_info),
-	.reply_data_size	= sizeof(struct ts_reply_data),
-
-	.prepare_data		= ts_prepare_data,
-	.reply_size		= ts_reply_size,
-	.fill_reply		= ts_fill_reply,
-
-	.set_validate		= ethnl_set_ts_validate,
-	.set			= ethnl_set_ts,
-};
-
-/* TS_LIST_GET */
-struct ts_list_reply_data {
-	struct ethnl_reply_data		base;
-	enum timestamping_layer		ts_layer[__TIMESTAMPING_COUNT];
-	u8				num_ts;
-};
-
-#define TS_LIST_REPDATA(__reply_base) \
-	container_of(__reply_base, struct ts_list_reply_data, base)
-
-static int ts_list_prepare_data(const struct ethnl_req_info *req_base,
-				struct ethnl_reply_data *reply_base,
-				const struct genl_info *info)
-{
-	struct ts_list_reply_data *data = TS_LIST_REPDATA(reply_base);
-	struct net_device *dev = reply_base->dev;
-	const struct ethtool_ops *ops = dev->ethtool_ops;
-	int ret, i = 0;
-
-	ret = ethnl_ops_begin(dev);
-	if (ret < 0)
-		return ret;
-
-	if (phy_has_tsinfo(dev->phydev))
-		data->ts_layer[i++] = PHY_TIMESTAMPING;
-	if (ops->get_ts_info) {
-		struct ethtool_ts_info ts_info = {0};
-
-		ops->get_ts_info(dev, &ts_info);
-		if (ts_info.so_timestamping &
-		    SOF_TIMESTAMPING_HARDWARE_MASK)
-			data->ts_layer[i++] = MAC_TIMESTAMPING;
-
-		if (ts_info.so_timestamping &
-		    SOF_TIMESTAMPING_SOFTWARE_MASK)
-			data->ts_layer[i++] = SOFTWARE_TIMESTAMPING;
-	}
-
-	data->num_ts = i;
-	ethnl_ops_complete(dev);
-
-	return ret;
-}
-
-static int ts_list_reply_size(const struct ethnl_req_info *req_base,
-			      const struct ethnl_reply_data *reply_base)
-{
-	struct ts_list_reply_data *data = TS_LIST_REPDATA(reply_base);
-
-	return nla_total_size(sizeof(u32)) * data->num_ts;
-}
-
-static int ts_list_fill_reply(struct sk_buff *skb,
-			      const struct ethnl_req_info *req_base,
-			 const struct ethnl_reply_data *reply_base)
-{
-	struct ts_list_reply_data *data = TS_LIST_REPDATA(reply_base);
-
-	return nla_put(skb, ETHTOOL_A_TS_LIST_LAYER, sizeof(u32) * data->num_ts, data->ts_layer);
-}
-
-const struct ethnl_request_ops ethnl_ts_list_request_ops = {
-	.request_cmd		= ETHTOOL_MSG_TS_LIST_GET,
-	.reply_cmd		= ETHTOOL_MSG_TS_LIST_GET_REPLY,
-	.hdr_attr		= ETHTOOL_A_TS_HEADER,
-	.req_info_size		= sizeof(struct ts_req_info),
-	.reply_data_size	= sizeof(struct ts_list_reply_data),
-
-	.prepare_data		= ts_list_prepare_data,
-	.reply_size		= ts_list_reply_size,
-	.fill_reply		= ts_list_fill_reply,
-};
-- 
cgit v1.2.3


From ac40916a3f7243efbe6e129ebf495b5c33a3adfe Mon Sep 17 00:00:00 2001
From: Li RongQing <lirongqing@baidu.com>
Date: Wed, 15 Nov 2023 20:01:08 +0800
Subject: rtnetlink: introduce nlmsg_new_large and use it in rtnl_getlink

if a PF has 256 or more VFs, ip link command will allocate an order 3
memory or more, and maybe trigger OOM due to memory fragment,
the VFs needed memory size is computed in rtnl_vfinfo_size.

so introduce nlmsg_new_large which calls netlink_alloc_large_skb in
which vmalloc is used for large memory, to avoid the failure of
allocating memory

    ip invoked oom-killer: gfp_mask=0xc2cc0(GFP_KERNEL|__GFP_NOWARN|\
	__GFP_COMP|__GFP_NOMEMALLOC), order=3, oom_score_adj=0
    CPU: 74 PID: 204414 Comm: ip Kdump: loaded Tainted: P           OE
    Call Trace:
    dump_stack+0x57/0x6a
    dump_header+0x4a/0x210
    oom_kill_process+0xe4/0x140
    out_of_memory+0x3e8/0x790
    __alloc_pages_slowpath.constprop.116+0x953/0xc50
    __alloc_pages_nodemask+0x2af/0x310
    kmalloc_large_node+0x38/0xf0
    __kmalloc_node_track_caller+0x417/0x4d0
    __kmalloc_reserve.isra.61+0x2e/0x80
    __alloc_skb+0x82/0x1c0
    rtnl_getlink+0x24f/0x370
    rtnetlink_rcv_msg+0x12c/0x350
    netlink_rcv_skb+0x50/0x100
    netlink_unicast+0x1b2/0x280
    netlink_sendmsg+0x355/0x4a0
    sock_sendmsg+0x5b/0x60
    ____sys_sendmsg+0x1ea/0x250
    ___sys_sendmsg+0x88/0xd0
    __sys_sendmsg+0x5e/0xa0
    do_syscall_64+0x33/0x40
    entry_SYSCALL_64_after_hwframe+0x44/0xa9
    RIP: 0033:0x7f95a65a5b70

Cc: Yunsheng Lin <linyunsheng@huawei.com>
Signed-off-by: Li RongQing <lirongqing@baidu.com>
Link: https://lore.kernel.org/r/20231115120108.3711-1-lirongqing@baidu.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/linux/netlink.h  |  1 +
 include/net/netlink.h    | 14 ++++++++++++++
 net/core/rtnetlink.c     |  2 +-
 net/netlink/af_netlink.c |  3 +--
 4 files changed, 17 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/netlink.h b/include/linux/netlink.h
index 75d7de34c908..abe91ed6b9aa 100644
--- a/include/linux/netlink.h
+++ b/include/linux/netlink.h
@@ -351,5 +351,6 @@ bool netlink_ns_capable(const struct sk_buff *skb,
 			struct user_namespace *ns, int cap);
 bool netlink_capable(const struct sk_buff *skb, int cap);
 bool netlink_net_capable(const struct sk_buff *skb, int cap);
+struct sk_buff *netlink_alloc_large_skb(unsigned int size, int broadcast);
 
 #endif	/* __LINUX_NETLINK_H */
diff --git a/include/net/netlink.h b/include/net/netlink.h
index 83bdf787aeee..167b91348e57 100644
--- a/include/net/netlink.h
+++ b/include/net/netlink.h
@@ -1010,6 +1010,20 @@ static inline struct sk_buff *nlmsg_new(size_t payload, gfp_t flags)
 	return alloc_skb(nlmsg_total_size(payload), flags);
 }
 
+/**
+ * nlmsg_new_large - Allocate a new netlink message with non-contiguous
+ * physical memory
+ * @payload: size of the message payload
+ *
+ * The allocated skb is unable to have frag page for shinfo->frags*,
+ * as the NULL setting for skb->head in netlink_skb_destructor() will
+ * bypass most of the handling in skb_release_data()
+ */
+static inline struct sk_buff *nlmsg_new_large(size_t payload)
+{
+	return netlink_alloc_large_skb(nlmsg_total_size(payload), 0);
+}
+
 /**
  * nlmsg_end - Finalize a netlink message
  * @skb: socket buffer the message is stored in
diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index e8431c6c8490..592164c2a540 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -3849,7 +3849,7 @@ static int rtnl_getlink(struct sk_buff *skb, struct nlmsghdr *nlh,
 		goto out;
 
 	err = -ENOBUFS;
-	nskb = nlmsg_new(if_nlmsg_size(dev, ext_filter_mask), GFP_KERNEL);
+	nskb = nlmsg_new_large(if_nlmsg_size(dev, ext_filter_mask));
 	if (nskb == NULL)
 		goto out;
 
diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c
index eb086b06d60d..177126fb0484 100644
--- a/net/netlink/af_netlink.c
+++ b/net/netlink/af_netlink.c
@@ -1204,8 +1204,7 @@ struct sock *netlink_getsockbyfilp(struct file *filp)
 	return sock;
 }
 
-static struct sk_buff *netlink_alloc_large_skb(unsigned int size,
-					       int broadcast)
+struct sk_buff *netlink_alloc_large_skb(unsigned int size, int broadcast)
 {
 	struct sk_buff *skb;
 	void *data;
-- 
cgit v1.2.3


From f25d34646bd01505a0989ca67bc9a37390cae755 Mon Sep 17 00:00:00 2001
From: Armin Wolf <W_Armin@gmx.de>
Date: Fri, 3 Nov 2023 19:25:23 +0100
Subject: platform/x86: wmi: Add wmidev_block_set()
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Currently, WMI drivers have to use the deprecated GUID-based
interface when setting data blocks. This prevents those
drivers from fully moving away from this interface.

Provide wmidev_block_set() so drivers using wmi_set_block() can
fully migrate to the modern bus-based interface.

Tested with a custom SSDT from the Intel Slim Bootloader project.

Signed-off-by: Armin Wolf <W_Armin@gmx.de>
Link: https://lore.kernel.org/r/20231103182526.3524-1-W_Armin@gmx.de
Reviewed-by: Ilpo Järvinen <ilpo.jarvinen@linux.intel.com>
Reviewed-by: Hans de Goede <hdegoede@redhat.com>
Signed-off-by: Hans de Goede <hdegoede@redhat.com>
---
 drivers/platform/x86/wmi.c | 64 ++++++++++++++++++++++++----------------------
 include/linux/wmi.h        |  2 ++
 2 files changed, 36 insertions(+), 30 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/platform/x86/wmi.c b/drivers/platform/x86/wmi.c
index 5c27b4aa9690..9d9a050e7086 100644
--- a/drivers/platform/x86/wmi.c
+++ b/drivers/platform/x86/wmi.c
@@ -536,41 +536,50 @@ EXPORT_SYMBOL_GPL(wmidev_block_query);
  *
  * Return: acpi_status signaling success or error.
  */
-acpi_status wmi_set_block(const char *guid_string, u8 instance,
-			  const struct acpi_buffer *in)
+acpi_status wmi_set_block(const char *guid_string, u8 instance, const struct acpi_buffer *in)
 {
-	struct wmi_block *wblock;
-	struct guid_block *block;
 	struct wmi_device *wdev;
-	acpi_handle handle;
-	struct acpi_object_list input;
-	union acpi_object params[2];
-	char method[WMI_ACPI_METHOD_NAME_SIZE];
 	acpi_status status;
 
-	if (!in)
-		return AE_BAD_DATA;
-
 	wdev = wmi_find_device_by_guid(guid_string);
 	if (IS_ERR(wdev))
 		return AE_ERROR;
 
-	wblock = container_of(wdev, struct wmi_block, dev);
-	block = &wblock->gblock;
-	handle = wblock->acpi_device->handle;
+	status =  wmidev_block_set(wdev, instance, in);
+	wmi_device_put(wdev);
 
-	if (block->instance_count <= instance) {
-		status = AE_BAD_PARAMETER;
+	return status;
+}
+EXPORT_SYMBOL_GPL(wmi_set_block);
 
-		goto err_wdev_put;
-	}
+/**
+ * wmidev_block_set - Write to a WMI block
+ * @wdev: A wmi bus device from a driver
+ * @instance: Instance index
+ * @in: Buffer containing new values for the data block
+ *
+ * Write contents of the input buffer to an ACPI-WMI data block.
+ *
+ * Return: acpi_status signaling success or error.
+ */
+acpi_status wmidev_block_set(struct wmi_device *wdev, u8 instance, const struct acpi_buffer *in)
+{
+	struct wmi_block *wblock = container_of(wdev, struct wmi_block, dev);
+	acpi_handle handle = wblock->acpi_device->handle;
+	struct guid_block *block = &wblock->gblock;
+	char method[WMI_ACPI_METHOD_NAME_SIZE];
+	struct acpi_object_list input;
+	union acpi_object params[2];
 
-	/* Check GUID is a data block */
-	if (block->flags & (ACPI_WMI_EVENT | ACPI_WMI_METHOD)) {
-		status = AE_ERROR;
+	if (!in)
+		return AE_BAD_DATA;
 
-		goto err_wdev_put;
-	}
+	if (block->instance_count <= instance)
+		return AE_BAD_PARAMETER;
+
+	/* Check GUID is a data block */
+	if (block->flags & (ACPI_WMI_EVENT | ACPI_WMI_METHOD))
+		return AE_ERROR;
 
 	input.count = 2;
 	input.pointer = params;
@@ -582,14 +591,9 @@ acpi_status wmi_set_block(const char *guid_string, u8 instance,
 
 	get_acpi_method_name(wblock, 'S', method);
 
-	status = acpi_evaluate_object(handle, method, &input, NULL);
-
-err_wdev_put:
-	wmi_device_put(wdev);
-
-	return status;
+	return acpi_evaluate_object(handle, method, &input, NULL);
 }
-EXPORT_SYMBOL_GPL(wmi_set_block);
+EXPORT_SYMBOL_GPL(wmidev_block_set);
 
 static void wmi_dump_wdg(const struct guid_block *g)
 {
diff --git a/include/linux/wmi.h b/include/linux/wmi.h
index 763bd382cf2d..207544968268 100644
--- a/include/linux/wmi.h
+++ b/include/linux/wmi.h
@@ -35,6 +35,8 @@ extern acpi_status wmidev_evaluate_method(struct wmi_device *wdev,
 extern union acpi_object *wmidev_block_query(struct wmi_device *wdev,
 					     u8 instance);
 
+acpi_status wmidev_block_set(struct wmi_device *wdev, u8 instance, const struct acpi_buffer *in);
+
 u8 wmidev_instance_count(struct wmi_device *wdev);
 
 extern int set_required_buffer_size(struct wmi_device *wdev, u64 length);
-- 
cgit v1.2.3


From 7275bf3e09578e1761157e7683f2e898c5c235a6 Mon Sep 17 00:00:00 2001
From: Armin Wolf <W_Armin@gmx.de>
Date: Fri, 3 Nov 2023 19:25:24 +0100
Subject: platform/x86: wmi: Add to_wmi_device() helper macro
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Add a helper macro for WMI drivers to cast a device to
the corresponding WMI device. This should replace some
boilerplate code.

Signed-off-by: Armin Wolf <W_Armin@gmx.de>
Link: https://lore.kernel.org/r/20231103182526.3524-2-W_Armin@gmx.de
Reviewed-by: Ilpo Järvinen <ilpo.jarvinen@linux.intel.com>
Reviewed-by: Hans de Goede <hdegoede@redhat.com>
Signed-off-by: Hans de Goede <hdegoede@redhat.com>
---
 include/linux/wmi.h | 8 ++++++++
 1 file changed, 8 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/wmi.h b/include/linux/wmi.h
index 207544968268..8a643c39fcce 100644
--- a/include/linux/wmi.h
+++ b/include/linux/wmi.h
@@ -27,6 +27,14 @@ struct wmi_device {
 	bool setable;
 };
 
+/**
+ * to_wmi_device() - Helper macro to cast a device to a wmi_device
+ * @device: device struct
+ *
+ * Cast a struct device to a struct wmi_device.
+ */
+#define to_wmi_device(device)	container_of(device, struct wmi_device, dev)
+
 extern acpi_status wmidev_evaluate_method(struct wmi_device *wdev,
 					  u8 instance, u32 method_id,
 					  const struct acpi_buffer *in,
-- 
cgit v1.2.3


From 762321dab9a72760bf9aec48362f932717c9424d Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Wed, 25 Oct 2023 16:10:17 +0200
Subject: filemap: add a per-mapping stable writes flag

folio_wait_stable waits for writeback to finish before modifying the
contents of a folio again, e.g. to support check summing of the data
in the block integrity code.

Currently this behavior is controlled by the SB_I_STABLE_WRITES flag
on the super_block, which means it is uniform for the entire file system.
This is wrong for the block device pseudofs which is shared by all
block devices, or file systems that can use multiple devices like XFS
witht the RT subvolume or btrfs (although btrfs currently reimplements
folio_wait_stable anyway).

Add a per-address_space AS_STABLE_WRITES flag to control the behavior
in a more fine grained way.  The existing SB_I_STABLE_WRITES is kept
to initialize AS_STABLE_WRITES to the existing default which covers
most cases.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Link: https://lore.kernel.org/r/20231025141020.192413-2-hch@lst.de
Tested-by: Ilya Dryomov <idryomov@gmail.com>
Reviewed-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Reviewed-by: Darrick J. Wong <djwong@kernel.org>
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 fs/inode.c              |  2 ++
 include/linux/pagemap.h | 17 +++++++++++++++++
 mm/page-writeback.c     |  2 +-
 3 files changed, 20 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/fs/inode.c b/fs/inode.c
index edcd8a61975f..f238d987dec9 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -215,6 +215,8 @@ int inode_init_always(struct super_block *sb, struct inode *inode)
 	lockdep_set_class_and_name(&mapping->invalidate_lock,
 				   &sb->s_type->invalidate_lock_key,
 				   "mapping.invalidate_lock");
+	if (sb->s_iflags & SB_I_STABLE_WRITES)
+		mapping_set_stable_writes(mapping);
 	inode->i_private = NULL;
 	inode->i_mapping = mapping;
 	INIT_HLIST_HEAD(&inode->i_dentry);	/* buggered by rcu freeing */
diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
index bcc1ea44b4e8..06142ff7f9ce 100644
--- a/include/linux/pagemap.h
+++ b/include/linux/pagemap.h
@@ -204,6 +204,8 @@ enum mapping_flags {
 	AS_NO_WRITEBACK_TAGS = 5,
 	AS_LARGE_FOLIO_SUPPORT = 6,
 	AS_RELEASE_ALWAYS,	/* Call ->release_folio(), even if no private data */
+	AS_STABLE_WRITES,	/* must wait for writeback before modifying
+				   folio contents */
 };
 
 /**
@@ -289,6 +291,21 @@ static inline void mapping_clear_release_always(struct address_space *mapping)
 	clear_bit(AS_RELEASE_ALWAYS, &mapping->flags);
 }
 
+static inline bool mapping_stable_writes(const struct address_space *mapping)
+{
+	return test_bit(AS_STABLE_WRITES, &mapping->flags);
+}
+
+static inline void mapping_set_stable_writes(struct address_space *mapping)
+{
+	set_bit(AS_STABLE_WRITES, &mapping->flags);
+}
+
+static inline void mapping_clear_stable_writes(struct address_space *mapping)
+{
+	clear_bit(AS_STABLE_WRITES, &mapping->flags);
+}
+
 static inline gfp_t mapping_gfp_mask(struct address_space * mapping)
 {
 	return mapping->gfp_mask;
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 46f2f5d3d183..ee2fd6a6af40 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -3107,7 +3107,7 @@ EXPORT_SYMBOL_GPL(folio_wait_writeback_killable);
  */
 void folio_wait_stable(struct folio *folio)
 {
-	if (folio_inode(folio)->i_sb->s_iflags & SB_I_STABLE_WRITES)
+	if (mapping_stable_writes(folio_mapping(folio)))
 		folio_wait_writeback(folio);
 }
 EXPORT_SYMBOL_GPL(folio_wait_stable);
-- 
cgit v1.2.3


From 48c9996f1dfe92bd7318472651c9ad538d6d53b5 Mon Sep 17 00:00:00 2001
From: Sakari Ailus <sakari.ailus@linux.intel.com>
Date: Mon, 6 Nov 2023 17:16:37 +0100
Subject: device property: Add SOFTWARE_NODE() macro for defining software
 nodes

Add SOFTWARE_NODE() macro in order to make defining software nodes look
nicer. This is analogous to different PROPERTY_ENTRY_*() macros for
defining properties.

Signed-off-by: Sakari Ailus <sakari.ailus@linux.intel.com>
Reviewed-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Reviewed-by: Heikki Krogerus <heikki.krogerus@linux.intel.com>
Tested-by: Sakari Ailus <sakari.ailus@linux.intel.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 include/linux/property.h | 7 +++++++
 1 file changed, 7 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/property.h b/include/linux/property.h
index 9f2585d705a8..97f901c0914e 100644
--- a/include/linux/property.h
+++ b/include/linux/property.h
@@ -489,6 +489,13 @@ struct software_node {
 	const struct property_entry *properties;
 };
 
+#define SOFTWARE_NODE(_name_, _properties_, _parent_)	\
+	(struct software_node) {			\
+		.name = _name_,				\
+		.properties = _properties_,		\
+		.parent = _parent_,			\
+	}
+
 bool is_software_node(const struct fwnode_handle *fwnode);
 const struct software_node *
 to_software_node(const struct fwnode_handle *fwnode);
-- 
cgit v1.2.3


From 44844db91397d3d94589f3c0c855be02daeebdb3 Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rafael.j.wysocki@intel.com>
Date: Thu, 9 Nov 2023 16:01:48 +0100
Subject: thermal: core: Add trip thresholds for trip crossing detection

The trip crossing detection in handle_thermal_trip() does not work
correctly in the cases when a trip point is crossed on the way up and
then the zone temperature stays above its low temperature (that is, its
temperature decreased by its hysteresis).  The trip temperature may
be passed by the zone temperature subsequently in that case, even
multiple times, but that does not count as the trip crossing as long as
the zone temperature does not fall below the trip's low temperature or,
in other words, until the trip is crossed on the way down.

|-----------low--------high------------|
             |<--------->|
             |    hyst   |
             |           |
             |          -|--> crossed on the way up
             |
         <---|-- crossed on the way down

However, handle_thermal_trip() will invoke thermal_notify_tz_trip_up()
every time the trip temperature is passed by the zone temperature on
the way up regardless of whether or not the trip has been crossed on
the way down yet.  Moreover, it will not call thermal_notify_tz_trip_down()
if the last zone temperature was between the trip's temperature and its
low temperature, so some "trip crossed on the way down" events may not
be reported.

To address this issue, introduce trip thresholds equal to either the
temperature of the given trip, or its low temperature, such that if
the trip's threshold is passed by the zone temperature on the way up,
its value will be set to the trip's low temperature and
thermal_notify_tz_trip_up() will be called, and if the trip's threshold
is passed by the zone temperature on the way down, its value will be set
to the trip's temperature (high) and thermal_notify_tz_trip_down() will
be called.  Accordingly, if the threshold is passed on the way up, it
cannot be passed on the way up again until its passed on the way down
and if it is passed on the way down, it cannot be passed on the way down
again until it is passed on the way up which guarantees correct
triggering of trip crossing notifications.

If the last temperature of the zone is invalid, the trip's threshold
will be set depending of the zone's current temperature: If that
temperature is above the trip's temperature, its threshold will be
set to its low temperature or otherwise its threshold will be set to
its (high) temperature.  Because the zone temperature is initially
set to invalid and tz->last_temperature is only updated by
update_temperature(), this is sufficient to set the correct initial
threshold values for all trips.

Link: https://lore.kernel.org/all/20220718145038.1114379-4-daniel.lezcano@linaro.org
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/thermal/thermal_core.c | 43 +++++++++++++++++++++++++++++++++++-------
 include/linux/thermal.h        |  2 ++
 2 files changed, 38 insertions(+), 7 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/thermal/thermal_core.c b/drivers/thermal/thermal_core.c
index 9c17d35ccbbd..625ba07cbe2f 100644
--- a/drivers/thermal/thermal_core.c
+++ b/drivers/thermal/thermal_core.c
@@ -345,22 +345,51 @@ static void handle_critical_trips(struct thermal_zone_device *tz,
 }
 
 static void handle_thermal_trip(struct thermal_zone_device *tz,
-				const struct thermal_trip *trip)
+				struct thermal_trip *trip)
 {
 	if (trip->temperature == THERMAL_TEMP_INVALID)
 		return;
 
-	if (tz->last_temperature != THERMAL_TEMP_INVALID) {
-		if (tz->last_temperature < trip->temperature &&
-		    tz->temperature >= trip->temperature)
+	if (tz->last_temperature == THERMAL_TEMP_INVALID) {
+		/* Initialization. */
+		trip->threshold = trip->temperature;
+		if (tz->temperature >= trip->threshold)
+			trip->threshold -= trip->hysteresis;
+	} else if (tz->last_temperature < trip->threshold) {
+		/*
+		 * The trip threshold is equal to the trip temperature, unless
+		 * the latter has changed in the meantime.  In either case,
+		 * the trip is crossed if the current zone temperature is at
+		 * least equal to its temperature, but otherwise ensure that
+		 * the threshold and the trip temperature will be equal.
+		 */
+		if (tz->temperature >= trip->temperature) {
 			thermal_notify_tz_trip_up(tz->id,
 						  thermal_zone_trip_id(tz, trip),
 						  tz->temperature);
-		if (tz->last_temperature >= trip->temperature &&
-		    tz->temperature < trip->temperature - trip->hysteresis)
+			trip->threshold = trip->temperature - trip->hysteresis;
+		} else {
+			trip->threshold = trip->temperature;
+		}
+	} else {
+		/*
+		 * The previous zone temperature was above or equal to the trip
+		 * threshold, which would be equal to the "low temperature" of
+		 * the trip (its temperature minus its hysteresis), unless the
+		 * trip temperature or hysteresis had changed.  In either case,
+		 * the trip is crossed if the current zone temperature is below
+		 * the low temperature of the trip, but otherwise ensure that
+		 * the trip threshold will be equal to the low temperature of
+		 * the trip.
+		 */
+		if (tz->temperature < trip->temperature - trip->hysteresis) {
 			thermal_notify_tz_trip_down(tz->id,
 						    thermal_zone_trip_id(tz, trip),
 						    tz->temperature);
+			trip->threshold = trip->temperature;
+		} else {
+			trip->threshold = trip->temperature - trip->hysteresis;
+		}
 	}
 
 	if (trip->type == THERMAL_TRIP_CRITICAL || trip->type == THERMAL_TRIP_HOT)
@@ -403,7 +432,7 @@ static void thermal_zone_device_init(struct thermal_zone_device *tz)
 void __thermal_zone_device_update(struct thermal_zone_device *tz,
 				  enum thermal_notify_event event)
 {
-	const struct thermal_trip *trip;
+	struct thermal_trip *trip;
 
 	if (atomic_read(&in_suspend))
 		return;
diff --git a/include/linux/thermal.h b/include/linux/thermal.h
index cee814d5d1ac..1f9ee869f9f9 100644
--- a/include/linux/thermal.h
+++ b/include/linux/thermal.h
@@ -57,12 +57,14 @@ enum thermal_notify_event {
  * struct thermal_trip - representation of a point in temperature domain
  * @temperature: temperature value in miliCelsius
  * @hysteresis: relative hysteresis in miliCelsius
+ * @threshold: trip crossing notification threshold miliCelsius
  * @type: trip point type
  * @priv: pointer to driver data associated with this trip
  */
 struct thermal_trip {
 	int temperature;
 	int hysteresis;
+	int threshold;
 	enum thermal_trip_type type;
 	void *priv;
 };
-- 
cgit v1.2.3


From c96b8175522a2c52e297ee0a49827a668f95e1e8 Mon Sep 17 00:00:00 2001
From: Damien Le Moal <dlemoal@kernel.org>
Date: Mon, 20 Nov 2023 16:06:11 +0900
Subject: block: Remove blk_set_runtime_active()

The function blk_set_runtime_active() is called only from
blk_post_runtime_resume(), so there is no need for that function to be
exported. Open-code this function directly in blk_post_runtime_resume()
and remove it.

Signed-off-by: Damien Le Moal <dlemoal@kernel.org>
Reviewed-by: Bart Van Assche <bvanassche@acm.org>
Reviewed-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
Link: https://lore.kernel.org/r/20231120070611.33951-1-dlemoal@kernel.org
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/blk-pm.c         | 33 +++++----------------------------
 include/linux/blk-pm.h |  1 -
 2 files changed, 5 insertions(+), 29 deletions(-)

(limited to 'include/linux')

diff --git a/block/blk-pm.c b/block/blk-pm.c
index 6b72b2e03fc8..42e842074715 100644
--- a/block/blk-pm.c
+++ b/block/blk-pm.c
@@ -163,38 +163,15 @@ EXPORT_SYMBOL(blk_pre_runtime_resume);
  * @q: the queue of the device
  *
  * Description:
- *    For historical reasons, this routine merely calls blk_set_runtime_active()
- *    to do the real work of restarting the queue.  It does this regardless of
- *    whether the device's runtime-resume succeeded; even if it failed the
+ *    Restart the queue of a runtime suspended device. It does this regardless
+ *    of whether the device's runtime-resume succeeded; even if it failed the
  *    driver or error handler will need to communicate with the device.
  *
  *    This function should be called near the end of the device's
- *    runtime_resume callback.
+ *    runtime_resume callback to correct queue runtime PM status and re-enable
+ *    peeking requests from the queue.
  */
 void blk_post_runtime_resume(struct request_queue *q)
-{
-	blk_set_runtime_active(q);
-}
-EXPORT_SYMBOL(blk_post_runtime_resume);
-
-/**
- * blk_set_runtime_active - Force runtime status of the queue to be active
- * @q: the queue of the device
- *
- * If the device is left runtime suspended during system suspend the resume
- * hook typically resumes the device and corrects runtime status
- * accordingly. However, that does not affect the queue runtime PM status
- * which is still "suspended". This prevents processing requests from the
- * queue.
- *
- * This function can be used in driver's resume hook to correct queue
- * runtime PM status and re-enable peeking requests from the queue. It
- * should be called before first request is added to the queue.
- *
- * This function is also called by blk_post_runtime_resume() for
- * runtime resumes.  It does everything necessary to restart the queue.
- */
-void blk_set_runtime_active(struct request_queue *q)
 {
 	int old_status;
 
@@ -211,4 +188,4 @@ void blk_set_runtime_active(struct request_queue *q)
 	if (old_status != RPM_ACTIVE)
 		blk_clear_pm_only(q);
 }
-EXPORT_SYMBOL(blk_set_runtime_active);
+EXPORT_SYMBOL(blk_post_runtime_resume);
diff --git a/include/linux/blk-pm.h b/include/linux/blk-pm.h
index 2580e05a8ab6..004b38a538ff 100644
--- a/include/linux/blk-pm.h
+++ b/include/linux/blk-pm.h
@@ -15,7 +15,6 @@ extern int blk_pre_runtime_suspend(struct request_queue *q);
 extern void blk_post_runtime_suspend(struct request_queue *q, int err);
 extern void blk_pre_runtime_resume(struct request_queue *q);
 extern void blk_post_runtime_resume(struct request_queue *q);
-extern void blk_set_runtime_active(struct request_queue *q);
 #else
 static inline void blk_pm_runtime_init(struct request_queue *q,
 				       struct device *dev) {}
-- 
cgit v1.2.3


From 79e0c5be8c73a674c92bd4ba77b75f4f8c91d32e Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Tue, 14 Nov 2023 01:42:13 +0100
Subject: net, vrf: Move dstats structure to core

Just move struct pcpu_dstats out of the vrf into the core, and streamline
the field names slightly, so they better align with the {t,l}stats ones.

No functional change otherwise. A conversion of the u64s to u64_stats_t
could be done at a separate point in future. This move is needed as we are
moving the {t,l,d}stats allocation/freeing to the core.

Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Reviewed-by: Nikolay Aleksandrov <razor@blackwall.org>
Cc: Jakub Kicinski <kuba@kernel.org>
Cc: David Ahern <dsahern@kernel.org>
Link: https://lore.kernel.org/r/20231114004220.6495-2-daniel@iogearbox.net
Signed-off-by: Martin KaFai Lau <martin.lau@kernel.org>
---
 drivers/net/vrf.c         | 24 +++++++-----------------
 include/linux/netdevice.h | 10 ++++++++++
 2 files changed, 17 insertions(+), 17 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/vrf.c b/drivers/net/vrf.c
index db766941b78f..3e6e0fdc3ba7 100644
--- a/drivers/net/vrf.c
+++ b/drivers/net/vrf.c
@@ -121,22 +121,12 @@ struct net_vrf {
 	int			ifindex;
 };
 
-struct pcpu_dstats {
-	u64			tx_pkts;
-	u64			tx_bytes;
-	u64			tx_drps;
-	u64			rx_pkts;
-	u64			rx_bytes;
-	u64			rx_drps;
-	struct u64_stats_sync	syncp;
-};
-
 static void vrf_rx_stats(struct net_device *dev, int len)
 {
 	struct pcpu_dstats *dstats = this_cpu_ptr(dev->dstats);
 
 	u64_stats_update_begin(&dstats->syncp);
-	dstats->rx_pkts++;
+	dstats->rx_packets++;
 	dstats->rx_bytes += len;
 	u64_stats_update_end(&dstats->syncp);
 }
@@ -161,10 +151,10 @@ static void vrf_get_stats64(struct net_device *dev,
 		do {
 			start = u64_stats_fetch_begin(&dstats->syncp);
 			tbytes = dstats->tx_bytes;
-			tpkts = dstats->tx_pkts;
-			tdrops = dstats->tx_drps;
+			tpkts = dstats->tx_packets;
+			tdrops = dstats->tx_drops;
 			rbytes = dstats->rx_bytes;
-			rpkts = dstats->rx_pkts;
+			rpkts = dstats->rx_packets;
 		} while (u64_stats_fetch_retry(&dstats->syncp, start));
 		stats->tx_bytes += tbytes;
 		stats->tx_packets += tpkts;
@@ -421,7 +411,7 @@ static int vrf_local_xmit(struct sk_buff *skb, struct net_device *dev,
 	if (likely(__netif_rx(skb) == NET_RX_SUCCESS))
 		vrf_rx_stats(dev, len);
 	else
-		this_cpu_inc(dev->dstats->rx_drps);
+		this_cpu_inc(dev->dstats->rx_drops);
 
 	return NETDEV_TX_OK;
 }
@@ -616,11 +606,11 @@ static netdev_tx_t vrf_xmit(struct sk_buff *skb, struct net_device *dev)
 		struct pcpu_dstats *dstats = this_cpu_ptr(dev->dstats);
 
 		u64_stats_update_begin(&dstats->syncp);
-		dstats->tx_pkts++;
+		dstats->tx_packets++;
 		dstats->tx_bytes += len;
 		u64_stats_update_end(&dstats->syncp);
 	} else {
-		this_cpu_inc(dev->dstats->tx_drps);
+		this_cpu_inc(dev->dstats->tx_drops);
 	}
 
 	return ret;
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index a16c9cc063fe..98082113156e 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -2755,6 +2755,16 @@ struct pcpu_sw_netstats {
 	struct u64_stats_sync   syncp;
 } __aligned(4 * sizeof(u64));
 
+struct pcpu_dstats {
+	u64			rx_packets;
+	u64			rx_bytes;
+	u64			rx_drops;
+	u64			tx_packets;
+	u64			tx_bytes;
+	u64			tx_drops;
+	struct u64_stats_sync	syncp;
+} __aligned(8 * sizeof(u64));
+
 struct pcpu_lstats {
 	u64_stats_t packets;
 	u64_stats_t bytes;
-- 
cgit v1.2.3


From 34d21de99cea9cb17967874313e5b0262527833c Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Tue, 14 Nov 2023 01:42:14 +0100
Subject: net: Move {l,t,d}stats allocation to core and convert veth & vrf

Move {l,t,d}stats allocation to the core and let netdevs pick the stats
type they need. That way the driver doesn't have to bother with error
handling (allocation failure checking, making sure free happens in the
right spot, etc) - all happening in the core.

Co-developed-by: Jakub Kicinski <kuba@kernel.org>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Reviewed-by: Nikolay Aleksandrov <razor@blackwall.org>
Cc: David Ahern <dsahern@kernel.org>
Link: https://lore.kernel.org/r/20231114004220.6495-3-daniel@iogearbox.net
Signed-off-by: Martin KaFai Lau <martin.lau@kernel.org>
---
 drivers/net/veth.c        | 16 ++--------------
 drivers/net/vrf.c         | 14 +++-----------
 include/linux/netdevice.h | 20 +++++++++++++++----
 net/core/dev.c            | 49 ++++++++++++++++++++++++++++++++++++++++++++++-
 4 files changed, 69 insertions(+), 30 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/veth.c b/drivers/net/veth.c
index 9980517ed8b0..ac030c241d1a 100644
--- a/drivers/net/veth.c
+++ b/drivers/net/veth.c
@@ -1506,25 +1506,12 @@ static void veth_free_queues(struct net_device *dev)
 
 static int veth_dev_init(struct net_device *dev)
 {
-	int err;
-
-	dev->lstats = netdev_alloc_pcpu_stats(struct pcpu_lstats);
-	if (!dev->lstats)
-		return -ENOMEM;
-
-	err = veth_alloc_queues(dev);
-	if (err) {
-		free_percpu(dev->lstats);
-		return err;
-	}
-
-	return 0;
+	return veth_alloc_queues(dev);
 }
 
 static void veth_dev_free(struct net_device *dev)
 {
 	veth_free_queues(dev);
-	free_percpu(dev->lstats);
 }
 
 #ifdef CONFIG_NET_POLL_CONTROLLER
@@ -1796,6 +1783,7 @@ static void veth_setup(struct net_device *dev)
 			       NETIF_F_HW_VLAN_STAG_RX);
 	dev->needs_free_netdev = true;
 	dev->priv_destructor = veth_dev_free;
+	dev->pcpu_stat_type = NETDEV_PCPU_STAT_LSTATS;
 	dev->max_mtu = ETH_MAX_MTU;
 
 	dev->hw_features = VETH_FEATURES;
diff --git a/drivers/net/vrf.c b/drivers/net/vrf.c
index 3e6e0fdc3ba7..bb95ce43cd97 100644
--- a/drivers/net/vrf.c
+++ b/drivers/net/vrf.c
@@ -1164,22 +1164,15 @@ static void vrf_dev_uninit(struct net_device *dev)
 
 	vrf_rtable_release(dev, vrf);
 	vrf_rt6_release(dev, vrf);
-
-	free_percpu(dev->dstats);
-	dev->dstats = NULL;
 }
 
 static int vrf_dev_init(struct net_device *dev)
 {
 	struct net_vrf *vrf = netdev_priv(dev);
 
-	dev->dstats = netdev_alloc_pcpu_stats(struct pcpu_dstats);
-	if (!dev->dstats)
-		goto out_nomem;
-
 	/* create the default dst which points back to us */
 	if (vrf_rtable_create(dev) != 0)
-		goto out_stats;
+		goto out_nomem;
 
 	if (vrf_rt6_create(dev) != 0)
 		goto out_rth;
@@ -1193,9 +1186,6 @@ static int vrf_dev_init(struct net_device *dev)
 
 out_rth:
 	vrf_rtable_release(dev, vrf);
-out_stats:
-	free_percpu(dev->dstats);
-	dev->dstats = NULL;
 out_nomem:
 	return -ENOMEM;
 }
@@ -1694,6 +1684,8 @@ static void vrf_setup(struct net_device *dev)
 	dev->min_mtu = IPV6_MIN_MTU;
 	dev->max_mtu = IP6_MAX_MTU;
 	dev->mtu = dev->max_mtu;
+
+	dev->pcpu_stat_type = NETDEV_PCPU_STAT_DSTATS;
 }
 
 static int vrf_validate(struct nlattr *tb[], struct nlattr *data[],
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 98082113156e..2564e209465e 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -1797,6 +1797,13 @@ enum netdev_ml_priv_type {
 	ML_PRIV_CAN,
 };
 
+enum netdev_stat_type {
+	NETDEV_PCPU_STAT_NONE,
+	NETDEV_PCPU_STAT_LSTATS, /* struct pcpu_lstats */
+	NETDEV_PCPU_STAT_TSTATS, /* struct pcpu_sw_netstats */
+	NETDEV_PCPU_STAT_DSTATS, /* struct pcpu_dstats */
+};
+
 /**
  *	struct net_device - The DEVICE structure.
  *
@@ -1991,10 +1998,14 @@ enum netdev_ml_priv_type {
  *
  * 	@ml_priv:	Mid-layer private
  *	@ml_priv_type:  Mid-layer private type
- * 	@lstats:	Loopback statistics
- * 	@tstats:	Tunnel statistics
- * 	@dstats:	Dummy statistics
- * 	@vstats:	Virtual ethernet statistics
+ *
+ *	@pcpu_stat_type:	Type of device statistics which the core should
+ *				allocate/free: none, lstats, tstats, dstats. none
+ *				means the driver is handling statistics allocation/
+ *				freeing internally.
+ *	@lstats:		Loopback statistics: packets, bytes
+ *	@tstats:		Tunnel statistics: RX/TX packets, RX/TX bytes
+ *	@dstats:		Dummy statistics: RX/TX/drop packets, RX/TX bytes
  *
  *	@garp_port:	GARP
  *	@mrp_port:	MRP
@@ -2354,6 +2365,7 @@ struct net_device {
 	void				*ml_priv;
 	enum netdev_ml_priv_type	ml_priv_type;
 
+	enum netdev_stat_type		pcpu_stat_type:8;
 	union {
 		struct pcpu_lstats __percpu		*lstats;
 		struct pcpu_sw_netstats __percpu	*tstats;
diff --git a/net/core/dev.c b/net/core/dev.c
index af53f6d838ce..0cc6e283edba 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -10051,6 +10051,46 @@ void netif_tx_stop_all_queues(struct net_device *dev)
 }
 EXPORT_SYMBOL(netif_tx_stop_all_queues);
 
+static int netdev_do_alloc_pcpu_stats(struct net_device *dev)
+{
+	void __percpu *v;
+
+	switch (dev->pcpu_stat_type) {
+	case NETDEV_PCPU_STAT_NONE:
+		return 0;
+	case NETDEV_PCPU_STAT_LSTATS:
+		v = dev->lstats = netdev_alloc_pcpu_stats(struct pcpu_lstats);
+		break;
+	case NETDEV_PCPU_STAT_TSTATS:
+		v = dev->tstats = netdev_alloc_pcpu_stats(struct pcpu_sw_netstats);
+		break;
+	case NETDEV_PCPU_STAT_DSTATS:
+		v = dev->dstats = netdev_alloc_pcpu_stats(struct pcpu_dstats);
+		break;
+	default:
+		return -EINVAL;
+	}
+
+	return v ? 0 : -ENOMEM;
+}
+
+static void netdev_do_free_pcpu_stats(struct net_device *dev)
+{
+	switch (dev->pcpu_stat_type) {
+	case NETDEV_PCPU_STAT_NONE:
+		return;
+	case NETDEV_PCPU_STAT_LSTATS:
+		free_percpu(dev->lstats);
+		break;
+	case NETDEV_PCPU_STAT_TSTATS:
+		free_percpu(dev->tstats);
+		break;
+	case NETDEV_PCPU_STAT_DSTATS:
+		free_percpu(dev->dstats);
+		break;
+	}
+}
+
 /**
  * register_netdevice() - register a network device
  * @dev: device to register
@@ -10111,9 +10151,13 @@ int register_netdevice(struct net_device *dev)
 		goto err_uninit;
 	}
 
+	ret = netdev_do_alloc_pcpu_stats(dev);
+	if (ret)
+		goto err_uninit;
+
 	ret = dev_index_reserve(net, dev->ifindex);
 	if (ret < 0)
-		goto err_uninit;
+		goto err_free_pcpu;
 	dev->ifindex = ret;
 
 	/* Transfer changeable features to wanted_features and enable
@@ -10219,6 +10263,8 @@ err_uninit_notify:
 	call_netdevice_notifiers(NETDEV_PRE_UNINIT, dev);
 err_ifindex_release:
 	dev_index_release(net, dev->ifindex);
+err_free_pcpu:
+	netdev_do_free_pcpu_stats(dev);
 err_uninit:
 	if (dev->netdev_ops->ndo_uninit)
 		dev->netdev_ops->ndo_uninit(dev);
@@ -10471,6 +10517,7 @@ void netdev_run_todo(void)
 		WARN_ON(rcu_access_pointer(dev->ip_ptr));
 		WARN_ON(rcu_access_pointer(dev->ip6_ptr));
 
+		netdev_do_free_pcpu_stats(dev);
 		if (dev->priv_destructor)
 			dev->priv_destructor(dev);
 		if (dev->needs_free_netdev)
-- 
cgit v1.2.3


From d9dcdb4531fe39ce48919ef8c2c9369ee49f3ad2 Mon Sep 17 00:00:00 2001
From: Uwe Kleine-König <u.kleine-koenig@pengutronix.de>
Date: Fri, 20 Oct 2023 11:21:07 +0200
Subject: PCI: host-generic: Convert to platform remove callback returning void
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The .remove() callback for a platform driver returns an int which makes
many driver authors wrongly assume it's possible to do error handling by
returning an error code.  However the value returned is (mostly) ignored
and this typically results in resource leaks. To improve here there is a
quest to make the remove callback return void. In the first step of this
quest all drivers are converted to .remove_new() which already returns
void.

pci_host_common_remove() returned zero unconditionally. With that converted
to return void instead, the generic pci host driver can be switched to
.remove_new() trivially.

Link: https://lore.kernel.org/r/20231020092107.2148311-1-u.kleine-koenig@pengutronix.de
Signed-off-by: Uwe Kleine-König <u.kleine-koenig@pengutronix.de>
Signed-off-by: Bjorn Helgaas <bhelgaas@google.com>
Acked-by: Will Deacon <will@kernel.org>
---
 drivers/pci/controller/pci-host-common.c  | 4 +---
 drivers/pci/controller/pci-host-generic.c | 2 +-
 include/linux/pci-ecam.h                  | 2 +-
 3 files changed, 3 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/pci/controller/pci-host-common.c b/drivers/pci/controller/pci-host-common.c
index 6be3266cd7b5..45b71806182d 100644
--- a/drivers/pci/controller/pci-host-common.c
+++ b/drivers/pci/controller/pci-host-common.c
@@ -85,7 +85,7 @@ int pci_host_common_probe(struct platform_device *pdev)
 }
 EXPORT_SYMBOL_GPL(pci_host_common_probe);
 
-int pci_host_common_remove(struct platform_device *pdev)
+void pci_host_common_remove(struct platform_device *pdev)
 {
 	struct pci_host_bridge *bridge = platform_get_drvdata(pdev);
 
@@ -93,8 +93,6 @@ int pci_host_common_remove(struct platform_device *pdev)
 	pci_stop_root_bus(bridge->bus);
 	pci_remove_root_bus(bridge->bus);
 	pci_unlock_rescan_remove();
-
-	return 0;
 }
 EXPORT_SYMBOL_GPL(pci_host_common_remove);
 
diff --git a/drivers/pci/controller/pci-host-generic.c b/drivers/pci/controller/pci-host-generic.c
index 63865aeb636b..41cb6a057f6e 100644
--- a/drivers/pci/controller/pci-host-generic.c
+++ b/drivers/pci/controller/pci-host-generic.c
@@ -82,7 +82,7 @@ static struct platform_driver gen_pci_driver = {
 		.of_match_table = gen_pci_of_match,
 	},
 	.probe = pci_host_common_probe,
-	.remove = pci_host_common_remove,
+	.remove_new = pci_host_common_remove,
 };
 module_platform_driver(gen_pci_driver);
 
diff --git a/include/linux/pci-ecam.h b/include/linux/pci-ecam.h
index 6b1301e2498e..3a4860bd2758 100644
--- a/include/linux/pci-ecam.h
+++ b/include/linux/pci-ecam.h
@@ -93,6 +93,6 @@ extern const struct pci_ecam_ops loongson_pci_ecam_ops; /* Loongson PCIe */
 #if IS_ENABLED(CONFIG_PCI_HOST_COMMON)
 /* for DT-based PCI controllers that support ECAM */
 int pci_host_common_probe(struct platform_device *pdev);
-int pci_host_common_remove(struct platform_device *pdev);
+void pci_host_common_remove(struct platform_device *pdev);
 #endif
 #endif
-- 
cgit v1.2.3


From 3171e46d677a668eed3086da78671f1e4f5b8405 Mon Sep 17 00:00:00 2001
From: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Date: Mon, 30 Oct 2023 13:42:18 +0200
Subject: PCI: Avoid potential out-of-bounds read in
 pci_dev_for_each_resource()

Coverity complains that pointer in the pci_dev_for_each_resource() may be
wrong, i.e., might be used for the out-of-bounds read.

There is no actual issue right now because we have another check afterwards
and the out-of-bounds read is not being performed. In any case it's better
code with this fixed, hence the proposed change.

As Jonas pointed out "It probably makes the code slightly less performant
as res will now be checked for being not NULL (which will always be true),
but I doubt it will be significant (or in any hot paths)."

Fixes: 09cc90063240 ("PCI: Introduce pci_dev_for_each_resource()")
Reported-by: Bjorn Helgaas <bhelgaas@google.com>
Closes: https://lore.kernel.org/r/20230509182122.GA1259567@bhelgaas
Suggested-by: Jonas Gorski <jonas.gorski@gmail.com>
Link: https://lore.kernel.org/r/20231030114218.2752236-1-andriy.shevchenko@linux.intel.com
Signed-off-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Signed-off-by: Bjorn Helgaas <bhelgaas@google.com>
---
 include/linux/pci.h | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/pci.h b/include/linux/pci.h
index 60ca768bc867..4ebecc7896ef 100644
--- a/include/linux/pci.h
+++ b/include/linux/pci.h
@@ -2127,14 +2127,14 @@ int pci_iobar_pfn(struct pci_dev *pdev, int bar, struct vm_area_struct *vma);
 	(pci_resource_end((dev), (bar)) ? 				\
 	 resource_size(pci_resource_n((dev), (bar))) : 0)
 
-#define __pci_dev_for_each_res0(dev, res, ...)				\
-	for (unsigned int __b = 0;					\
-	     res = pci_resource_n(dev, __b), __b < PCI_NUM_RESOURCES;	\
+#define __pci_dev_for_each_res0(dev, res, ...)				  \
+	for (unsigned int __b = 0;					  \
+	     __b < PCI_NUM_RESOURCES && (res = pci_resource_n(dev, __b)); \
 	     __b++)
 
-#define __pci_dev_for_each_res1(dev, res, __b)				\
-	for (__b = 0;							\
-	     res = pci_resource_n(dev, __b), __b < PCI_NUM_RESOURCES;	\
+#define __pci_dev_for_each_res1(dev, res, __b)				  \
+	for (__b = 0;							  \
+	     __b < PCI_NUM_RESOURCES && (res = pci_resource_n(dev, __b)); \
 	     __b++)
 
 #define pci_dev_for_each_resource(dev, res, ...)			\
-- 
cgit v1.2.3


From ab5cfac139ab8576fb54630d4cca23c3e690ee90 Mon Sep 17 00:00:00 2001
From: Eduard Zingerman <eddyz87@gmail.com>
Date: Tue, 21 Nov 2023 04:06:56 +0200
Subject: bpf: verify callbacks as if they are called unknown number of times

Prior to this patch callbacks were handled as regular function calls,
execution of callback body was modeled exactly once.
This patch updates callbacks handling logic as follows:
- introduces a function push_callback_call() that schedules callback
  body verification in env->head stack;
- updates prepare_func_exit() to reschedule callback body verification
  upon BPF_EXIT;
- as calls to bpf_*_iter_next(), calls to callback invoking functions
  are marked as checkpoints;
- is_state_visited() is updated to stop callback based iteration when
  some identical parent state is found.

Paths with callback function invoked zero times are now verified first,
which leads to necessity to modify some selftests:
- the following negative tests required adding release/unlock/drop
  calls to avoid previously masked unrelated error reports:
  - cb_refs.c:underflow_prog
  - exceptions_fail.c:reject_rbtree_add_throw
  - exceptions_fail.c:reject_with_cp_reference
- the following precision tracking selftests needed change in expected
  log trace:
  - verifier_subprog_precision.c:callback_result_precise
    (note: r0 precision is no longer propagated inside callback and
           I think this is a correct behavior)
  - verifier_subprog_precision.c:parent_callee_saved_reg_precise_with_callback
  - verifier_subprog_precision.c:parent_stack_slot_precise_with_callback

Reported-by: Andrew Werner <awerner32@gmail.com>
Closes: https://lore.kernel.org/bpf/CA+vRuzPChFNXmouzGG+wsy=6eMcfr1mFG0F3g7rbg-sedGKW3w@mail.gmail.com/
Acked-by: Andrii Nakryiko <andrii@kernel.org>
Signed-off-by: Eduard Zingerman <eddyz87@gmail.com>
Link: https://lore.kernel.org/r/20231121020701.26440-7-eddyz87@gmail.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/bpf_verifier.h                       |   5 +
 kernel/bpf/verifier.c                              | 274 +++++++++++++--------
 tools/testing/selftests/bpf/progs/cb_refs.c        |   1 +
 .../testing/selftests/bpf/progs/exceptions_fail.c  |   2 +
 .../bpf/progs/verifier_subprog_precision.c         |  71 ++++--
 5 files changed, 240 insertions(+), 113 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h
index 24213a99cc79..dd326936dd6f 100644
--- a/include/linux/bpf_verifier.h
+++ b/include/linux/bpf_verifier.h
@@ -400,6 +400,7 @@ struct bpf_verifier_state {
 	struct bpf_idx_pair *jmp_history;
 	u32 jmp_history_cnt;
 	u32 dfs_depth;
+	u32 callback_unroll_depth;
 };
 
 #define bpf_get_spilled_reg(slot, frame, mask)				\
@@ -511,6 +512,10 @@ struct bpf_insn_aux_data {
 	 * this instruction, regardless of any heuristics
 	 */
 	bool force_checkpoint;
+	/* true if instruction is a call to a helper function that
+	 * accepts callback function as a parameter.
+	 */
+	bool calls_callback;
 };
 
 #define MAX_USED_MAPS 64 /* max number of maps accessed by one eBPF program */
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 68ee4803d3a2..a60dfa56ebb3 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -547,13 +547,12 @@ static bool is_dynptr_ref_function(enum bpf_func_id func_id)
 	return func_id == BPF_FUNC_dynptr_data;
 }
 
-static bool is_callback_calling_kfunc(u32 btf_id);
+static bool is_sync_callback_calling_kfunc(u32 btf_id);
 static bool is_bpf_throw_kfunc(struct bpf_insn *insn);
 
-static bool is_callback_calling_function(enum bpf_func_id func_id)
+static bool is_sync_callback_calling_function(enum bpf_func_id func_id)
 {
 	return func_id == BPF_FUNC_for_each_map_elem ||
-	       func_id == BPF_FUNC_timer_set_callback ||
 	       func_id == BPF_FUNC_find_vma ||
 	       func_id == BPF_FUNC_loop ||
 	       func_id == BPF_FUNC_user_ringbuf_drain;
@@ -564,6 +563,18 @@ static bool is_async_callback_calling_function(enum bpf_func_id func_id)
 	return func_id == BPF_FUNC_timer_set_callback;
 }
 
+static bool is_callback_calling_function(enum bpf_func_id func_id)
+{
+	return is_sync_callback_calling_function(func_id) ||
+	       is_async_callback_calling_function(func_id);
+}
+
+static bool is_sync_callback_calling_insn(struct bpf_insn *insn)
+{
+	return (bpf_helper_call(insn) && is_sync_callback_calling_function(insn->imm)) ||
+	       (bpf_pseudo_kfunc_call(insn) && is_sync_callback_calling_kfunc(insn->imm));
+}
+
 static bool is_storage_get_function(enum bpf_func_id func_id)
 {
 	return func_id == BPF_FUNC_sk_storage_get ||
@@ -1808,6 +1819,7 @@ static int copy_verifier_state(struct bpf_verifier_state *dst_state,
 	dst_state->first_insn_idx = src->first_insn_idx;
 	dst_state->last_insn_idx = src->last_insn_idx;
 	dst_state->dfs_depth = src->dfs_depth;
+	dst_state->callback_unroll_depth = src->callback_unroll_depth;
 	dst_state->used_as_loop_entry = src->used_as_loop_entry;
 	for (i = 0; i <= src->curframe; i++) {
 		dst = dst_state->frame[i];
@@ -3731,6 +3743,8 @@ static void fmt_stack_mask(char *buf, ssize_t buf_sz, u64 stack_mask)
 	}
 }
 
+static bool calls_callback(struct bpf_verifier_env *env, int insn_idx);
+
 /* For given verifier state backtrack_insn() is called from the last insn to
  * the first insn. Its purpose is to compute a bitmask of registers and
  * stack slots that needs precision in the parent verifier state.
@@ -3906,16 +3920,13 @@ static int backtrack_insn(struct bpf_verifier_env *env, int idx, int subseq_idx,
 					return -EFAULT;
 				return 0;
 			}
-		} else if ((bpf_helper_call(insn) &&
-			    is_callback_calling_function(insn->imm) &&
-			    !is_async_callback_calling_function(insn->imm)) ||
-			   (bpf_pseudo_kfunc_call(insn) && is_callback_calling_kfunc(insn->imm))) {
-			/* callback-calling helper or kfunc call, which means
-			 * we are exiting from subprog, but unlike the subprog
-			 * call handling above, we shouldn't propagate
-			 * precision of r1-r5 (if any requested), as they are
-			 * not actually arguments passed directly to callback
-			 * subprogs
+		} else if (is_sync_callback_calling_insn(insn) && idx != subseq_idx - 1) {
+			/* exit from callback subprog to callback-calling helper or
+			 * kfunc call. Use idx/subseq_idx check to discern it from
+			 * straight line code backtracking.
+			 * Unlike the subprog call handling above, we shouldn't
+			 * propagate precision of r1-r5 (if any requested), as they are
+			 * not actually arguments passed directly to callback subprogs
 			 */
 			if (bt_reg_mask(bt) & ~BPF_REGMASK_ARGS) {
 				verbose(env, "BUG regs %x\n", bt_reg_mask(bt));
@@ -3950,10 +3961,18 @@ static int backtrack_insn(struct bpf_verifier_env *env, int idx, int subseq_idx,
 		} else if (opcode == BPF_EXIT) {
 			bool r0_precise;
 
+			/* Backtracking to a nested function call, 'idx' is a part of
+			 * the inner frame 'subseq_idx' is a part of the outer frame.
+			 * In case of a regular function call, instructions giving
+			 * precision to registers R1-R5 should have been found already.
+			 * In case of a callback, it is ok to have R1-R5 marked for
+			 * backtracking, as these registers are set by the function
+			 * invoking callback.
+			 */
+			if (subseq_idx >= 0 && calls_callback(env, subseq_idx))
+				for (i = BPF_REG_1; i <= BPF_REG_5; i++)
+					bt_clear_reg(bt, i);
 			if (bt_reg_mask(bt) & BPF_REGMASK_ARGS) {
-				/* if backtracing was looking for registers R1-R5
-				 * they should have been found already.
-				 */
 				verbose(env, "BUG regs %x\n", bt_reg_mask(bt));
 				WARN_ONCE(1, "verifier backtracking bug");
 				return -EFAULT;
@@ -9421,11 +9440,11 @@ err_out:
 	return err;
 }
 
-static int __check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
-			     int *insn_idx, int subprog,
-			     set_callee_state_fn set_callee_state_cb)
+static int push_callback_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
+			      int insn_idx, int subprog,
+			      set_callee_state_fn set_callee_state_cb)
 {
-	struct bpf_verifier_state *state = env->cur_state;
+	struct bpf_verifier_state *state = env->cur_state, *callback_state;
 	struct bpf_func_state *caller, *callee;
 	int err;
 
@@ -9433,44 +9452,22 @@ static int __check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn
 	err = btf_check_subprog_call(env, subprog, caller->regs);
 	if (err == -EFAULT)
 		return err;
-	if (subprog_is_global(env, subprog)) {
-		if (err) {
-			verbose(env, "Caller passes invalid args into func#%d\n",
-				subprog);
-			return err;
-		} else {
-			if (env->log.level & BPF_LOG_LEVEL)
-				verbose(env,
-					"Func#%d is global and valid. Skipping.\n",
-					subprog);
-			clear_caller_saved_regs(env, caller->regs);
-
-			/* All global functions return a 64-bit SCALAR_VALUE */
-			mark_reg_unknown(env, caller->regs, BPF_REG_0);
-			caller->regs[BPF_REG_0].subreg_def = DEF_NOT_SUBREG;
-
-			/* continue with next insn after call */
-			return 0;
-		}
-	}
 
 	/* set_callee_state is used for direct subprog calls, but we are
 	 * interested in validating only BPF helpers that can call subprogs as
 	 * callbacks
 	 */
-	if (set_callee_state_cb != set_callee_state) {
-		env->subprog_info[subprog].is_cb = true;
-		if (bpf_pseudo_kfunc_call(insn) &&
-		    !is_callback_calling_kfunc(insn->imm)) {
-			verbose(env, "verifier bug: kfunc %s#%d not marked as callback-calling\n",
-				func_id_name(insn->imm), insn->imm);
-			return -EFAULT;
-		} else if (!bpf_pseudo_kfunc_call(insn) &&
-			   !is_callback_calling_function(insn->imm)) { /* helper */
-			verbose(env, "verifier bug: helper %s#%d not marked as callback-calling\n",
-				func_id_name(insn->imm), insn->imm);
-			return -EFAULT;
-		}
+	env->subprog_info[subprog].is_cb = true;
+	if (bpf_pseudo_kfunc_call(insn) &&
+	    !is_sync_callback_calling_kfunc(insn->imm)) {
+		verbose(env, "verifier bug: kfunc %s#%d not marked as callback-calling\n",
+			func_id_name(insn->imm), insn->imm);
+		return -EFAULT;
+	} else if (!bpf_pseudo_kfunc_call(insn) &&
+		   !is_callback_calling_function(insn->imm)) { /* helper */
+		verbose(env, "verifier bug: helper %s#%d not marked as callback-calling\n",
+			func_id_name(insn->imm), insn->imm);
+		return -EFAULT;
 	}
 
 	if (insn->code == (BPF_JMP | BPF_CALL) &&
@@ -9481,25 +9478,76 @@ static int __check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn
 		/* there is no real recursion here. timer callbacks are async */
 		env->subprog_info[subprog].is_async_cb = true;
 		async_cb = push_async_cb(env, env->subprog_info[subprog].start,
-					 *insn_idx, subprog);
+					 insn_idx, subprog);
 		if (!async_cb)
 			return -EFAULT;
 		callee = async_cb->frame[0];
 		callee->async_entry_cnt = caller->async_entry_cnt + 1;
 
 		/* Convert bpf_timer_set_callback() args into timer callback args */
-		err = set_callee_state_cb(env, caller, callee, *insn_idx);
+		err = set_callee_state_cb(env, caller, callee, insn_idx);
 		if (err)
 			return err;
 
+		return 0;
+	}
+
+	/* for callback functions enqueue entry to callback and
+	 * proceed with next instruction within current frame.
+	 */
+	callback_state = push_stack(env, env->subprog_info[subprog].start, insn_idx, false);
+	if (!callback_state)
+		return -ENOMEM;
+
+	err = setup_func_entry(env, subprog, insn_idx, set_callee_state_cb,
+			       callback_state);
+	if (err)
+		return err;
+
+	callback_state->callback_unroll_depth++;
+	return 0;
+}
+
+static int check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
+			   int *insn_idx)
+{
+	struct bpf_verifier_state *state = env->cur_state;
+	struct bpf_func_state *caller;
+	int err, subprog, target_insn;
+
+	target_insn = *insn_idx + insn->imm + 1;
+	subprog = find_subprog(env, target_insn);
+	if (subprog < 0) {
+		verbose(env, "verifier bug. No program starts at insn %d\n", target_insn);
+		return -EFAULT;
+	}
+
+	caller = state->frame[state->curframe];
+	err = btf_check_subprog_call(env, subprog, caller->regs);
+	if (err == -EFAULT)
+		return err;
+	if (subprog_is_global(env, subprog)) {
+		if (err) {
+			verbose(env, "Caller passes invalid args into func#%d\n", subprog);
+			return err;
+		}
+
+		if (env->log.level & BPF_LOG_LEVEL)
+			verbose(env, "Func#%d is global and valid. Skipping.\n", subprog);
 		clear_caller_saved_regs(env, caller->regs);
+
+		/* All global functions return a 64-bit SCALAR_VALUE */
 		mark_reg_unknown(env, caller->regs, BPF_REG_0);
 		caller->regs[BPF_REG_0].subreg_def = DEF_NOT_SUBREG;
+
 		/* continue with next insn after call */
 		return 0;
 	}
 
-	err = setup_func_entry(env, subprog, *insn_idx, set_callee_state_cb, state);
+	/* for regular function entry setup new frame and continue
+	 * from that frame.
+	 */
+	err = setup_func_entry(env, subprog, *insn_idx, set_callee_state, state);
 	if (err)
 		return err;
 
@@ -9559,22 +9607,6 @@ static int set_callee_state(struct bpf_verifier_env *env,
 	return 0;
 }
 
-static int check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
-			   int *insn_idx)
-{
-	int subprog, target_insn;
-
-	target_insn = *insn_idx + insn->imm + 1;
-	subprog = find_subprog(env, target_insn);
-	if (subprog < 0) {
-		verbose(env, "verifier bug. No program starts at insn %d\n",
-			target_insn);
-		return -EFAULT;
-	}
-
-	return __check_func_call(env, insn, insn_idx, subprog, set_callee_state);
-}
-
 static int set_map_elem_callback_state(struct bpf_verifier_env *env,
 				       struct bpf_func_state *caller,
 				       struct bpf_func_state *callee,
@@ -9798,6 +9830,11 @@ static int prepare_func_exit(struct bpf_verifier_env *env, int *insn_idx)
 			verbose_invalid_scalar(env, r0, &range, "callback return", "R0");
 			return -EINVAL;
 		}
+		if (!calls_callback(env, callee->callsite)) {
+			verbose(env, "BUG: in callback at %d, callsite %d !calls_callback\n",
+				*insn_idx, callee->callsite);
+			return -EFAULT;
+		}
 	} else {
 		/* return to the caller whatever r0 had in the callee */
 		caller->regs[BPF_REG_0] = *r0;
@@ -9815,7 +9852,15 @@ static int prepare_func_exit(struct bpf_verifier_env *env, int *insn_idx)
 			return err;
 	}
 
-	*insn_idx = callee->callsite + 1;
+	/* for callbacks like bpf_loop or bpf_for_each_map_elem go back to callsite,
+	 * there function call logic would reschedule callback visit. If iteration
+	 * converges is_state_visited() would prune that visit eventually.
+	 */
+	if (callee->in_callback_fn)
+		*insn_idx = callee->callsite;
+	else
+		*insn_idx = callee->callsite + 1;
+
 	if (env->log.level & BPF_LOG_LEVEL) {
 		verbose(env, "returning from callee:\n");
 		print_verifier_state(env, callee, true);
@@ -10228,24 +10273,24 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn
 		}
 		break;
 	case BPF_FUNC_for_each_map_elem:
-		err = __check_func_call(env, insn, insn_idx_p, meta.subprogno,
-					set_map_elem_callback_state);
+		err = push_callback_call(env, insn, insn_idx, meta.subprogno,
+					 set_map_elem_callback_state);
 		break;
 	case BPF_FUNC_timer_set_callback:
-		err = __check_func_call(env, insn, insn_idx_p, meta.subprogno,
-					set_timer_callback_state);
+		err = push_callback_call(env, insn, insn_idx, meta.subprogno,
+					 set_timer_callback_state);
 		break;
 	case BPF_FUNC_find_vma:
-		err = __check_func_call(env, insn, insn_idx_p, meta.subprogno,
-					set_find_vma_callback_state);
+		err = push_callback_call(env, insn, insn_idx, meta.subprogno,
+					 set_find_vma_callback_state);
 		break;
 	case BPF_FUNC_snprintf:
 		err = check_bpf_snprintf_call(env, regs);
 		break;
 	case BPF_FUNC_loop:
 		update_loop_inline_state(env, meta.subprogno);
-		err = __check_func_call(env, insn, insn_idx_p, meta.subprogno,
-					set_loop_callback_state);
+		err = push_callback_call(env, insn, insn_idx, meta.subprogno,
+					 set_loop_callback_state);
 		break;
 	case BPF_FUNC_dynptr_from_mem:
 		if (regs[BPF_REG_1].type != PTR_TO_MAP_VALUE) {
@@ -10341,8 +10386,8 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn
 		break;
 	}
 	case BPF_FUNC_user_ringbuf_drain:
-		err = __check_func_call(env, insn, insn_idx_p, meta.subprogno,
-					set_user_ringbuf_callback_state);
+		err = push_callback_call(env, insn, insn_idx, meta.subprogno,
+					 set_user_ringbuf_callback_state);
 		break;
 	}
 
@@ -11230,7 +11275,7 @@ static bool is_bpf_graph_api_kfunc(u32 btf_id)
 	       btf_id == special_kfunc_list[KF_bpf_refcount_acquire_impl];
 }
 
-static bool is_callback_calling_kfunc(u32 btf_id)
+static bool is_sync_callback_calling_kfunc(u32 btf_id)
 {
 	return btf_id == special_kfunc_list[KF_bpf_rbtree_add_impl];
 }
@@ -11982,6 +12027,21 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
 		return -EACCES;
 	}
 
+	/* Check the arguments */
+	err = check_kfunc_args(env, &meta, insn_idx);
+	if (err < 0)
+		return err;
+
+	if (meta.func_id == special_kfunc_list[KF_bpf_rbtree_add_impl]) {
+		err = push_callback_call(env, insn, insn_idx, meta.subprogno,
+					 set_rbtree_add_callback_state);
+		if (err) {
+			verbose(env, "kfunc %s#%d failed callback verification\n",
+				func_name, meta.func_id);
+			return err;
+		}
+	}
+
 	rcu_lock = is_kfunc_bpf_rcu_read_lock(&meta);
 	rcu_unlock = is_kfunc_bpf_rcu_read_unlock(&meta);
 
@@ -12017,10 +12077,6 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
 		return -EINVAL;
 	}
 
-	/* Check the arguments */
-	err = check_kfunc_args(env, &meta, insn_idx);
-	if (err < 0)
-		return err;
 	/* In case of release function, we get register number of refcounted
 	 * PTR_TO_BTF_ID in bpf_kfunc_arg_meta, do the release now.
 	 */
@@ -12054,16 +12110,6 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
 		}
 	}
 
-	if (meta.func_id == special_kfunc_list[KF_bpf_rbtree_add_impl]) {
-		err = __check_func_call(env, insn, insn_idx_p, meta.subprogno,
-					set_rbtree_add_callback_state);
-		if (err) {
-			verbose(env, "kfunc %s#%d failed callback verification\n",
-				func_name, meta.func_id);
-			return err;
-		}
-	}
-
 	if (meta.func_id == special_kfunc_list[KF_bpf_throw]) {
 		if (!bpf_jit_supports_exceptions()) {
 			verbose(env, "JIT does not support calling kfunc %s#%d\n",
@@ -15427,6 +15473,15 @@ static bool is_force_checkpoint(struct bpf_verifier_env *env, int insn_idx)
 	return env->insn_aux_data[insn_idx].force_checkpoint;
 }
 
+static void mark_calls_callback(struct bpf_verifier_env *env, int idx)
+{
+	env->insn_aux_data[idx].calls_callback = true;
+}
+
+static bool calls_callback(struct bpf_verifier_env *env, int insn_idx)
+{
+	return env->insn_aux_data[insn_idx].calls_callback;
+}
 
 enum {
 	DONE_EXPLORING = 0,
@@ -15540,6 +15595,21 @@ static int visit_insn(int t, struct bpf_verifier_env *env)
 			 * async state will be pushed for further exploration.
 			 */
 			mark_prune_point(env, t);
+		/* For functions that invoke callbacks it is not known how many times
+		 * callback would be called. Verifier models callback calling functions
+		 * by repeatedly visiting callback bodies and returning to origin call
+		 * instruction.
+		 * In order to stop such iteration verifier needs to identify when a
+		 * state identical some state from a previous iteration is reached.
+		 * Check below forces creation of checkpoint before callback calling
+		 * instruction to allow search for such identical states.
+		 */
+		if (is_sync_callback_calling_insn(insn)) {
+			mark_calls_callback(env, t);
+			mark_force_checkpoint(env, t);
+			mark_prune_point(env, t);
+			mark_jmp_point(env, t);
+		}
 		if (insn->src_reg == BPF_PSEUDO_KFUNC_CALL) {
 			struct bpf_kfunc_call_arg_meta meta;
 
@@ -17009,10 +17079,16 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx)
 				}
 				goto skip_inf_loop_check;
 			}
+			if (calls_callback(env, insn_idx)) {
+				if (states_equal(env, &sl->state, cur, true))
+					goto hit;
+				goto skip_inf_loop_check;
+			}
 			/* attempt to detect infinite loop to avoid unnecessary doomed work */
 			if (states_maybe_looping(&sl->state, cur) &&
 			    states_equal(env, &sl->state, cur, false) &&
-			    !iter_active_depths_differ(&sl->state, cur)) {
+			    !iter_active_depths_differ(&sl->state, cur) &&
+			    sl->state.callback_unroll_depth == cur->callback_unroll_depth) {
 				verbose_linfo(env, insn_idx, "; ");
 				verbose(env, "infinite loop detected at insn %d\n", insn_idx);
 				verbose(env, "cur state:");
diff --git a/tools/testing/selftests/bpf/progs/cb_refs.c b/tools/testing/selftests/bpf/progs/cb_refs.c
index 76d661b20e87..56c764df8196 100644
--- a/tools/testing/selftests/bpf/progs/cb_refs.c
+++ b/tools/testing/selftests/bpf/progs/cb_refs.c
@@ -33,6 +33,7 @@ int underflow_prog(void *ctx)
 	if (!p)
 		return 0;
 	bpf_for_each_map_elem(&array_map, cb1, &p, 0);
+	bpf_kfunc_call_test_release(p);
 	return 0;
 }
 
diff --git a/tools/testing/selftests/bpf/progs/exceptions_fail.c b/tools/testing/selftests/bpf/progs/exceptions_fail.c
index 4c39e920dac2..8c0ef2742208 100644
--- a/tools/testing/selftests/bpf/progs/exceptions_fail.c
+++ b/tools/testing/selftests/bpf/progs/exceptions_fail.c
@@ -171,6 +171,7 @@ int reject_with_rbtree_add_throw(void *ctx)
 		return 0;
 	bpf_spin_lock(&lock);
 	bpf_rbtree_add(&rbtree, &f->node, rbless);
+	bpf_spin_unlock(&lock);
 	return 0;
 }
 
@@ -214,6 +215,7 @@ int reject_with_cb_reference(void *ctx)
 	if (!f)
 		return 0;
 	bpf_loop(5, subprog_cb_ref, NULL, 0);
+	bpf_obj_drop(f);
 	return 0;
 }
 
diff --git a/tools/testing/selftests/bpf/progs/verifier_subprog_precision.c b/tools/testing/selftests/bpf/progs/verifier_subprog_precision.c
index db6b3143338b..da803cffb5ef 100644
--- a/tools/testing/selftests/bpf/progs/verifier_subprog_precision.c
+++ b/tools/testing/selftests/bpf/progs/verifier_subprog_precision.c
@@ -119,15 +119,26 @@ __naked int global_subprog_result_precise(void)
 
 SEC("?raw_tp")
 __success __log_level(2)
+/* First simulated path does not include callback body */
 __msg("14: (0f) r1 += r6")
-__msg("mark_precise: frame0: last_idx 14 first_idx 10")
+__msg("mark_precise: frame0: last_idx 14 first_idx 9")
 __msg("mark_precise: frame0: regs=r6 stack= before 13: (bf) r1 = r7")
 __msg("mark_precise: frame0: regs=r6 stack= before 12: (27) r6 *= 4")
 __msg("mark_precise: frame0: regs=r6 stack= before 11: (25) if r6 > 0x3 goto pc+4")
 __msg("mark_precise: frame0: regs=r6 stack= before 10: (bf) r6 = r0")
-__msg("mark_precise: frame0: parent state regs=r0 stack=:")
-__msg("mark_precise: frame0: last_idx 18 first_idx 0")
-__msg("mark_precise: frame0: regs=r0 stack= before 18: (95) exit")
+__msg("mark_precise: frame0: regs=r0 stack= before 9: (85) call bpf_loop")
+/* State entering callback body popped from states stack */
+__msg("from 9 to 17: frame1:")
+__msg("17: frame1: R1=scalar() R2=0 R10=fp0 cb")
+__msg("17: (b7) r0 = 0")
+__msg("18: (95) exit")
+__msg("returning from callee:")
+__msg("to caller at 9:")
+/* r4 (flags) is always precise for bpf_loop() */
+__msg("frame 0: propagating r4")
+__msg("mark_precise: frame0: last_idx 9 first_idx 9 subseq_idx -1")
+__msg("mark_precise: frame0: regs=r4 stack= before 18: (95) exit")
+__msg("from 18 to 9: safe")
 __naked int callback_result_precise(void)
 {
 	asm volatile (
@@ -233,20 +244,36 @@ __naked int parent_callee_saved_reg_precise_global(void)
 
 SEC("?raw_tp")
 __success __log_level(2)
+/* First simulated path does not include callback body */
 __msg("12: (0f) r1 += r6")
-__msg("mark_precise: frame0: last_idx 12 first_idx 10")
+__msg("mark_precise: frame0: last_idx 12 first_idx 9")
 __msg("mark_precise: frame0: regs=r6 stack= before 11: (bf) r1 = r7")
 __msg("mark_precise: frame0: regs=r6 stack= before 10: (27) r6 *= 4")
+__msg("mark_precise: frame0: regs=r6 stack= before 9: (85) call bpf_loop")
 __msg("mark_precise: frame0: parent state regs=r6 stack=:")
-__msg("mark_precise: frame0: last_idx 16 first_idx 0")
-__msg("mark_precise: frame0: regs=r6 stack= before 16: (95) exit")
-__msg("mark_precise: frame1: regs= stack= before 15: (b7) r0 = 0")
-__msg("mark_precise: frame1: regs= stack= before 9: (85) call bpf_loop#181")
+__msg("mark_precise: frame0: last_idx 8 first_idx 0 subseq_idx 9")
 __msg("mark_precise: frame0: regs=r6 stack= before 8: (b7) r4 = 0")
 __msg("mark_precise: frame0: regs=r6 stack= before 7: (b7) r3 = 0")
 __msg("mark_precise: frame0: regs=r6 stack= before 6: (bf) r2 = r8")
 __msg("mark_precise: frame0: regs=r6 stack= before 5: (b7) r1 = 1")
 __msg("mark_precise: frame0: regs=r6 stack= before 4: (b7) r6 = 3")
+/* State entering callback body popped from states stack */
+__msg("from 9 to 15: frame1:")
+__msg("15: frame1: R1=scalar() R2=0 R10=fp0 cb")
+__msg("15: (b7) r0 = 0")
+__msg("16: (95) exit")
+__msg("returning from callee:")
+__msg("to caller at 9:")
+/* r4 (flags) is always precise for bpf_loop(),
+ * r6 was marked before backtracking to callback body.
+ */
+__msg("frame 0: propagating r4,r6")
+__msg("mark_precise: frame0: last_idx 9 first_idx 9 subseq_idx -1")
+__msg("mark_precise: frame0: regs=r4,r6 stack= before 16: (95) exit")
+__msg("mark_precise: frame1: regs= stack= before 15: (b7) r0 = 0")
+__msg("mark_precise: frame1: regs= stack= before 9: (85) call bpf_loop")
+__msg("mark_precise: frame0: parent state regs= stack=:")
+__msg("from 16 to 9: safe")
 __naked int parent_callee_saved_reg_precise_with_callback(void)
 {
 	asm volatile (
@@ -373,22 +400,38 @@ __naked int parent_stack_slot_precise_global(void)
 
 SEC("?raw_tp")
 __success __log_level(2)
+/* First simulated path does not include callback body */
 __msg("14: (0f) r1 += r6")
-__msg("mark_precise: frame0: last_idx 14 first_idx 11")
+__msg("mark_precise: frame0: last_idx 14 first_idx 10")
 __msg("mark_precise: frame0: regs=r6 stack= before 13: (bf) r1 = r7")
 __msg("mark_precise: frame0: regs=r6 stack= before 12: (27) r6 *= 4")
 __msg("mark_precise: frame0: regs=r6 stack= before 11: (79) r6 = *(u64 *)(r10 -8)")
+__msg("mark_precise: frame0: regs= stack=-8 before 10: (85) call bpf_loop")
 __msg("mark_precise: frame0: parent state regs= stack=-8:")
-__msg("mark_precise: frame0: last_idx 18 first_idx 0")
-__msg("mark_precise: frame0: regs= stack=-8 before 18: (95) exit")
-__msg("mark_precise: frame1: regs= stack= before 17: (b7) r0 = 0")
-__msg("mark_precise: frame1: regs= stack= before 10: (85) call bpf_loop#181")
+__msg("mark_precise: frame0: last_idx 9 first_idx 0 subseq_idx 10")
 __msg("mark_precise: frame0: regs= stack=-8 before 9: (b7) r4 = 0")
 __msg("mark_precise: frame0: regs= stack=-8 before 8: (b7) r3 = 0")
 __msg("mark_precise: frame0: regs= stack=-8 before 7: (bf) r2 = r8")
 __msg("mark_precise: frame0: regs= stack=-8 before 6: (bf) r1 = r6")
 __msg("mark_precise: frame0: regs= stack=-8 before 5: (7b) *(u64 *)(r10 -8) = r6")
 __msg("mark_precise: frame0: regs=r6 stack= before 4: (b7) r6 = 3")
+/* State entering callback body popped from states stack */
+__msg("from 10 to 17: frame1:")
+__msg("17: frame1: R1=scalar() R2=0 R10=fp0 cb")
+__msg("17: (b7) r0 = 0")
+__msg("18: (95) exit")
+__msg("returning from callee:")
+__msg("to caller at 10:")
+/* r4 (flags) is always precise for bpf_loop(),
+ * fp-8 was marked before backtracking to callback body.
+ */
+__msg("frame 0: propagating r4,fp-8")
+__msg("mark_precise: frame0: last_idx 10 first_idx 10 subseq_idx -1")
+__msg("mark_precise: frame0: regs=r4 stack=-8 before 18: (95) exit")
+__msg("mark_precise: frame1: regs= stack= before 17: (b7) r0 = 0")
+__msg("mark_precise: frame1: regs= stack= before 10: (85) call bpf_loop#181")
+__msg("mark_precise: frame0: parent state regs= stack=:")
+__msg("from 18 to 10: safe")
 __naked int parent_stack_slot_precise_with_callback(void)
 {
 	asm volatile (
-- 
cgit v1.2.3


From bb124da69c47dd98d69361ec13244ece50bec63e Mon Sep 17 00:00:00 2001
From: Eduard Zingerman <eddyz87@gmail.com>
Date: Tue, 21 Nov 2023 04:07:00 +0200
Subject: bpf: keep track of max number of bpf_loop callback iterations

In some cases verifier can't infer convergence of the bpf_loop()
iteration. E.g. for the following program:

    static int cb(__u32 idx, struct num_context* ctx)
    {
        ctx->i++;
        return 0;
    }

    SEC("?raw_tp")
    int prog(void *_)
    {
        struct num_context ctx = { .i = 0 };
        __u8 choice_arr[2] = { 0, 1 };

        bpf_loop(2, cb, &ctx, 0);
        return choice_arr[ctx.i];
    }

Each 'cb' simulation would eventually return to 'prog' and reach
'return choice_arr[ctx.i]' statement. At which point ctx.i would be
marked precise, thus forcing verifier to track multitude of separate
states with {.i=0}, {.i=1}, ... at bpf_loop() callback entry.

This commit allows "brute force" handling for such cases by limiting
number of callback body simulations using 'umax' value of the first
bpf_loop() parameter.

For this, extend bpf_func_state with 'callback_depth' field.
Increment this field when callback visiting state is pushed to states
traversal stack. For frame #N it's 'callback_depth' field counts how
many times callback with frame depth N+1 had been executed.
Use bpf_func_state specifically to allow independent tracking of
callback depths when multiple nested bpf_loop() calls are present.

Signed-off-by: Eduard Zingerman <eddyz87@gmail.com>
Link: https://lore.kernel.org/r/20231121020701.26440-11-eddyz87@gmail.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/bpf_verifier.h                       | 11 +++++++
 kernel/bpf/verifier.c                              | 19 ++++++++++--
 .../bpf/progs/verifier_subprog_precision.c         | 35 +++++++++++++++-------
 3 files changed, 53 insertions(+), 12 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h
index dd326936dd6f..aa4d19d0bc94 100644
--- a/include/linux/bpf_verifier.h
+++ b/include/linux/bpf_verifier.h
@@ -301,6 +301,17 @@ struct bpf_func_state {
 	struct tnum callback_ret_range;
 	bool in_async_callback_fn;
 	bool in_exception_callback_fn;
+	/* For callback calling functions that limit number of possible
+	 * callback executions (e.g. bpf_loop) keeps track of current
+	 * simulated iteration number.
+	 * Value in frame N refers to number of times callback with frame
+	 * N+1 was simulated, e.g. for the following call:
+	 *
+	 *   bpf_loop(..., fn, ...); | suppose current frame is N
+	 *                           | fn would be simulated in frame N+1
+	 *                           | number of simulations is tracked in frame N
+	 */
+	u32 callback_depth;
 
 	/* The following fields should be last. See copy_func_state() */
 	int acquired_refs;
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 2f03e6b11bb9..af2819d5c8ee 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -9505,6 +9505,8 @@ static int push_callback_call(struct bpf_verifier_env *env, struct bpf_insn *ins
 		return err;
 
 	callback_state->callback_unroll_depth++;
+	callback_state->frame[callback_state->curframe - 1]->callback_depth++;
+	caller->callback_depth = 0;
 	return 0;
 }
 
@@ -10309,8 +10311,21 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn
 		break;
 	case BPF_FUNC_loop:
 		update_loop_inline_state(env, meta.subprogno);
-		err = push_callback_call(env, insn, insn_idx, meta.subprogno,
-					 set_loop_callback_state);
+		/* Verifier relies on R1 value to determine if bpf_loop() iteration
+		 * is finished, thus mark it precise.
+		 */
+		err = mark_chain_precision(env, BPF_REG_1);
+		if (err)
+			return err;
+		if (cur_func(env)->callback_depth < regs[BPF_REG_1].umax_value) {
+			err = push_callback_call(env, insn, insn_idx, meta.subprogno,
+						 set_loop_callback_state);
+		} else {
+			cur_func(env)->callback_depth = 0;
+			if (env->log.level & BPF_LOG_LEVEL2)
+				verbose(env, "frame%d bpf_loop iteration limit reached\n",
+					env->cur_state->curframe);
+		}
 		break;
 	case BPF_FUNC_dynptr_from_mem:
 		if (regs[BPF_REG_1].type != PTR_TO_MAP_VALUE) {
diff --git a/tools/testing/selftests/bpf/progs/verifier_subprog_precision.c b/tools/testing/selftests/bpf/progs/verifier_subprog_precision.c
index da803cffb5ef..f61d623b1ce8 100644
--- a/tools/testing/selftests/bpf/progs/verifier_subprog_precision.c
+++ b/tools/testing/selftests/bpf/progs/verifier_subprog_precision.c
@@ -119,7 +119,23 @@ __naked int global_subprog_result_precise(void)
 
 SEC("?raw_tp")
 __success __log_level(2)
-/* First simulated path does not include callback body */
+/* First simulated path does not include callback body,
+ * r1 and r4 are always precise for bpf_loop() calls.
+ */
+__msg("9: (85) call bpf_loop#181")
+__msg("mark_precise: frame0: last_idx 9 first_idx 9 subseq_idx -1")
+__msg("mark_precise: frame0: parent state regs=r4 stack=:")
+__msg("mark_precise: frame0: last_idx 8 first_idx 0 subseq_idx 9")
+__msg("mark_precise: frame0: regs=r4 stack= before 8: (b7) r4 = 0")
+__msg("mark_precise: frame0: last_idx 9 first_idx 9 subseq_idx -1")
+__msg("mark_precise: frame0: parent state regs=r1 stack=:")
+__msg("mark_precise: frame0: last_idx 8 first_idx 0 subseq_idx 9")
+__msg("mark_precise: frame0: regs=r1 stack= before 8: (b7) r4 = 0")
+__msg("mark_precise: frame0: regs=r1 stack= before 7: (b7) r3 = 0")
+__msg("mark_precise: frame0: regs=r1 stack= before 6: (bf) r2 = r8")
+__msg("mark_precise: frame0: regs=r1 stack= before 5: (bf) r1 = r6")
+__msg("mark_precise: frame0: regs=r6 stack= before 4: (b7) r6 = 3")
+/* r6 precision propagation */
 __msg("14: (0f) r1 += r6")
 __msg("mark_precise: frame0: last_idx 14 first_idx 9")
 __msg("mark_precise: frame0: regs=r6 stack= before 13: (bf) r1 = r7")
@@ -134,10 +150,9 @@ __msg("17: (b7) r0 = 0")
 __msg("18: (95) exit")
 __msg("returning from callee:")
 __msg("to caller at 9:")
-/* r4 (flags) is always precise for bpf_loop() */
-__msg("frame 0: propagating r4")
+__msg("frame 0: propagating r1,r4")
 __msg("mark_precise: frame0: last_idx 9 first_idx 9 subseq_idx -1")
-__msg("mark_precise: frame0: regs=r4 stack= before 18: (95) exit")
+__msg("mark_precise: frame0: regs=r1,r4 stack= before 18: (95) exit")
 __msg("from 18 to 9: safe")
 __naked int callback_result_precise(void)
 {
@@ -264,12 +279,12 @@ __msg("15: (b7) r0 = 0")
 __msg("16: (95) exit")
 __msg("returning from callee:")
 __msg("to caller at 9:")
-/* r4 (flags) is always precise for bpf_loop(),
+/* r1, r4 are always precise for bpf_loop(),
  * r6 was marked before backtracking to callback body.
  */
-__msg("frame 0: propagating r4,r6")
+__msg("frame 0: propagating r1,r4,r6")
 __msg("mark_precise: frame0: last_idx 9 first_idx 9 subseq_idx -1")
-__msg("mark_precise: frame0: regs=r4,r6 stack= before 16: (95) exit")
+__msg("mark_precise: frame0: regs=r1,r4,r6 stack= before 16: (95) exit")
 __msg("mark_precise: frame1: regs= stack= before 15: (b7) r0 = 0")
 __msg("mark_precise: frame1: regs= stack= before 9: (85) call bpf_loop")
 __msg("mark_precise: frame0: parent state regs= stack=:")
@@ -422,12 +437,12 @@ __msg("17: (b7) r0 = 0")
 __msg("18: (95) exit")
 __msg("returning from callee:")
 __msg("to caller at 10:")
-/* r4 (flags) is always precise for bpf_loop(),
+/* r1, r4 are always precise for bpf_loop(),
  * fp-8 was marked before backtracking to callback body.
  */
-__msg("frame 0: propagating r4,fp-8")
+__msg("frame 0: propagating r1,r4,fp-8")
 __msg("mark_precise: frame0: last_idx 10 first_idx 10 subseq_idx -1")
-__msg("mark_precise: frame0: regs=r4 stack=-8 before 18: (95) exit")
+__msg("mark_precise: frame0: regs=r1,r4 stack=-8 before 18: (95) exit")
 __msg("mark_precise: frame1: regs= stack= before 17: (b7) r0 = 0")
 __msg("mark_precise: frame1: regs= stack= before 10: (85) call bpf_loop#181")
 __msg("mark_precise: frame0: parent state regs= stack=:")
-- 
cgit v1.2.3


From fc43e9c857b7aa55efba9398419b14d9e35dcc7d Mon Sep 17 00:00:00 2001
From: Charles Yi <be286@163.com>
Date: Tue, 31 Oct 2023 12:32:39 +0800
Subject: HID: fix HID device resource race between HID core and debugging
 support

hid_debug_events_release releases resources bound to the HID device instance.
hid_device_release releases the underlying HID device instance potentially
before hid_debug_events_release has completed releasing debug resources bound
to the same HID device instance.

Reference count to prevent the HID device instance from being torn down
preemptively when HID debugging support is used. When count reaches zero,
release core resources of HID device instance using hiddev_free.

The crash:

[  120.728477][ T4396] kernel BUG at lib/list_debug.c:53!
[  120.728505][ T4396] Internal error: Oops - BUG: 0 [#1] PREEMPT SMP
[  120.739806][ T4396] Modules linked in: bcmdhd dhd_static_buf 8822cu pcie_mhi r8168
[  120.747386][ T4396] CPU: 1 PID: 4396 Comm: hidt_bridge Not tainted 5.10.110 #257
[  120.754771][ T4396] Hardware name: Rockchip RK3588 EVB4 LP4 V10 Board (DT)
[  120.761643][ T4396] pstate: 60400089 (nZCv daIf +PAN -UAO -TCO BTYPE=--)
[  120.768338][ T4396] pc : __list_del_entry_valid+0x98/0xac
[  120.773730][ T4396] lr : __list_del_entry_valid+0x98/0xac
[  120.779120][ T4396] sp : ffffffc01e62bb60
[  120.783126][ T4396] x29: ffffffc01e62bb60 x28: ffffff818ce3a200
[  120.789126][ T4396] x27: 0000000000000009 x26: 0000000000980000
[  120.795126][ T4396] x25: ffffffc012431000 x24: ffffff802c6d4e00
[  120.801125][ T4396] x23: ffffff8005c66f00 x22: ffffffc01183b5b8
[  120.807125][ T4396] x21: ffffff819df2f100 x20: 0000000000000000
[  120.813124][ T4396] x19: ffffff802c3f0700 x18: ffffffc01d2cd058
[  120.819124][ T4396] x17: 0000000000000000 x16: 0000000000000000
[  120.825124][ T4396] x15: 0000000000000004 x14: 0000000000003fff
[  120.831123][ T4396] x13: ffffffc012085588 x12: 0000000000000003
[  120.837123][ T4396] x11: 00000000ffffbfff x10: 0000000000000003
[  120.843123][ T4396] x9 : 455103d46b329300 x8 : 455103d46b329300
[  120.849124][ T4396] x7 : 74707572726f6320 x6 : ffffffc0124b8cb5
[  120.855124][ T4396] x5 : ffffffffffffffff x4 : 0000000000000000
[  120.861123][ T4396] x3 : ffffffc011cf4f90 x2 : ffffff81fee7b948
[  120.867122][ T4396] x1 : ffffffc011cf4f90 x0 : 0000000000000054
[  120.873122][ T4396] Call trace:
[  120.876259][ T4396]  __list_del_entry_valid+0x98/0xac
[  120.881304][ T4396]  hid_debug_events_release+0x48/0x12c
[  120.886617][ T4396]  full_proxy_release+0x50/0xbc
[  120.891323][ T4396]  __fput+0xdc/0x238
[  120.895075][ T4396]  ____fput+0x14/0x24
[  120.898911][ T4396]  task_work_run+0x90/0x148
[  120.903268][ T4396]  do_exit+0x1bc/0x8a4
[  120.907193][ T4396]  do_group_exit+0x8c/0xa4
[  120.911458][ T4396]  get_signal+0x468/0x744
[  120.915643][ T4396]  do_signal+0x84/0x280
[  120.919650][ T4396]  do_notify_resume+0xd0/0x218
[  120.924262][ T4396]  work_pending+0xc/0x3f0

[ Rahul Rameshbabu <sergeantsagara@protonmail.com>: rework changelog ]
Fixes: cd667ce24796 ("HID: use debugfs for events/reports dumping")
Signed-off-by: Charles Yi <be286@163.com>
Signed-off-by: Jiri Kosina <jkosina@suse.cz>
---
 drivers/hid/hid-core.c  | 12 ++++++++++--
 drivers/hid/hid-debug.c |  3 +++
 include/linux/hid.h     |  3 +++
 3 files changed, 16 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/hid/hid-core.c b/drivers/hid/hid-core.c
index 8992e3c1e769..e0181218ad85 100644
--- a/drivers/hid/hid-core.c
+++ b/drivers/hid/hid-core.c
@@ -702,15 +702,22 @@ static void hid_close_report(struct hid_device *device)
  * Free a device structure, all reports, and all fields.
  */
 
-static void hid_device_release(struct device *dev)
+void hiddev_free(struct kref *ref)
 {
-	struct hid_device *hid = to_hid_device(dev);
+	struct hid_device *hid = container_of(ref, struct hid_device, ref);
 
 	hid_close_report(hid);
 	kfree(hid->dev_rdesc);
 	kfree(hid);
 }
 
+static void hid_device_release(struct device *dev)
+{
+	struct hid_device *hid = to_hid_device(dev);
+
+	kref_put(&hid->ref, hiddev_free);
+}
+
 /*
  * Fetch a report description item from the data stream. We support long
  * items, though they are not used yet.
@@ -2846,6 +2853,7 @@ struct hid_device *hid_allocate_device(void)
 	spin_lock_init(&hdev->debug_list_lock);
 	sema_init(&hdev->driver_input_lock, 1);
 	mutex_init(&hdev->ll_open_lock);
+	kref_init(&hdev->ref);
 
 	hid_bpf_device_init(hdev);
 
diff --git a/drivers/hid/hid-debug.c b/drivers/hid/hid-debug.c
index e7ef1ea107c9..7dd83ec74f8a 100644
--- a/drivers/hid/hid-debug.c
+++ b/drivers/hid/hid-debug.c
@@ -1135,6 +1135,7 @@ static int hid_debug_events_open(struct inode *inode, struct file *file)
 		goto out;
 	}
 	list->hdev = (struct hid_device *) inode->i_private;
+	kref_get(&list->hdev->ref);
 	file->private_data = list;
 	mutex_init(&list->read_mutex);
 
@@ -1227,6 +1228,8 @@ static int hid_debug_events_release(struct inode *inode, struct file *file)
 	list_del(&list->node);
 	spin_unlock_irqrestore(&list->hdev->debug_list_lock, flags);
 	kfifo_free(&list->hid_debug_fifo);
+
+	kref_put(&list->hdev->ref, hiddev_free);
 	kfree(list);
 
 	return 0;
diff --git a/include/linux/hid.h b/include/linux/hid.h
index 5a8387a4a712..bf43f3ff6664 100644
--- a/include/linux/hid.h
+++ b/include/linux/hid.h
@@ -679,6 +679,7 @@ struct hid_device {							/* device report descriptor */
 	struct list_head debug_list;
 	spinlock_t  debug_list_lock;
 	wait_queue_head_t debug_wait;
+	struct kref			ref;
 
 	unsigned int id;						/* system unique id */
 
@@ -687,6 +688,8 @@ struct hid_device {							/* device report descriptor */
 #endif /* CONFIG_BPF */
 };
 
+void hiddev_free(struct kref *ref);
+
 #define to_hid_device(pdev) \
 	container_of(pdev, struct hid_device, dev)
 
-- 
cgit v1.2.3


From 600f111ef51dc2cbdb330b09d09f1856efa64912 Mon Sep 17 00:00:00 2001
From: "Matthew Wilcox (Oracle)" <willy@infradead.org>
Date: Fri, 17 Nov 2023 21:58:23 +0000
Subject: fs: Rename mapping private members

It is hard to find where mapping->private_lock, mapping->private_list and
mapping->private_data are used, due to private_XXX being a relatively
common name for variables and structure members in the kernel.  To fit
with other members of struct address_space, rename them all to have an
i_ prefix.  Tested with an allmodconfig build.

Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Link: https://lore.kernel.org/r/20231117215823.2821906-1-willy@infradead.org
Acked-by: Darrick J. Wong <djwong@kernel.org>
Reviewed-by: Josef Bacik <josef@toxicpanda.com>
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 fs/aio.c             |  16 ++++----
 fs/btrfs/extent_io.c |  52 ++++++++++++-------------
 fs/btrfs/subpage.c   |   4 +-
 fs/buffer.c          | 108 +++++++++++++++++++++++++--------------------------
 fs/ext4/inode.c      |   4 +-
 fs/gfs2/glock.c      |   2 +-
 fs/gfs2/ops_fstype.c |   2 +-
 fs/hugetlbfs/inode.c |   4 +-
 fs/inode.c           |   8 ++--
 fs/nfs/write.c       |  12 +++---
 fs/nilfs2/inode.c    |   4 +-
 fs/ntfs/aops.c       |  10 ++---
 include/linux/fs.h   |  12 +++---
 mm/hugetlb.c         |   2 +-
 mm/migrate.c         |   6 +--
 15 files changed, 123 insertions(+), 123 deletions(-)

(limited to 'include/linux')

diff --git a/fs/aio.c b/fs/aio.c
index f8589caef9c1..d02842156b35 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -266,7 +266,7 @@ static struct file *aio_private_file(struct kioctx *ctx, loff_t nr_pages)
 		return ERR_CAST(inode);
 
 	inode->i_mapping->a_ops = &aio_ctx_aops;
-	inode->i_mapping->private_data = ctx;
+	inode->i_mapping->i_private_data = ctx;
 	inode->i_size = PAGE_SIZE * nr_pages;
 
 	file = alloc_file_pseudo(inode, aio_mnt, "[aio]",
@@ -316,10 +316,10 @@ static void put_aio_ring_file(struct kioctx *ctx)
 
 		/* Prevent further access to the kioctx from migratepages */
 		i_mapping = aio_ring_file->f_mapping;
-		spin_lock(&i_mapping->private_lock);
-		i_mapping->private_data = NULL;
+		spin_lock(&i_mapping->i_private_lock);
+		i_mapping->i_private_data = NULL;
 		ctx->aio_ring_file = NULL;
-		spin_unlock(&i_mapping->private_lock);
+		spin_unlock(&i_mapping->i_private_lock);
 
 		fput(aio_ring_file);
 	}
@@ -422,9 +422,9 @@ static int aio_migrate_folio(struct address_space *mapping, struct folio *dst,
 
 	rc = 0;
 
-	/* mapping->private_lock here protects against the kioctx teardown.  */
-	spin_lock(&mapping->private_lock);
-	ctx = mapping->private_data;
+	/* mapping->i_private_lock here protects against the kioctx teardown.  */
+	spin_lock(&mapping->i_private_lock);
+	ctx = mapping->i_private_data;
 	if (!ctx) {
 		rc = -EINVAL;
 		goto out;
@@ -476,7 +476,7 @@ static int aio_migrate_folio(struct address_space *mapping, struct folio *dst,
 out_unlock:
 	mutex_unlock(&ctx->ring_lock);
 out:
-	spin_unlock(&mapping->private_lock);
+	spin_unlock(&mapping->i_private_lock);
 	return rc;
 }
 #else
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 03cef28d9e37..3431a53bf3fd 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -870,7 +870,7 @@ static int attach_extent_buffer_page(struct extent_buffer *eb,
 	 * will not race with any other ebs.
 	 */
 	if (page->mapping)
-		lockdep_assert_held(&page->mapping->private_lock);
+		lockdep_assert_held(&page->mapping->i_private_lock);
 
 	if (fs_info->nodesize >= PAGE_SIZE) {
 		if (!PagePrivate(page))
@@ -1736,16 +1736,16 @@ static int submit_eb_subpage(struct page *page, struct writeback_control *wbc)
 		 * Take private lock to ensure the subpage won't be detached
 		 * in the meantime.
 		 */
-		spin_lock(&page->mapping->private_lock);
+		spin_lock(&page->mapping->i_private_lock);
 		if (!PagePrivate(page)) {
-			spin_unlock(&page->mapping->private_lock);
+			spin_unlock(&page->mapping->i_private_lock);
 			break;
 		}
 		spin_lock_irqsave(&subpage->lock, flags);
 		if (!test_bit(bit_start + fs_info->subpage_info->dirty_offset,
 			      subpage->bitmaps)) {
 			spin_unlock_irqrestore(&subpage->lock, flags);
-			spin_unlock(&page->mapping->private_lock);
+			spin_unlock(&page->mapping->i_private_lock);
 			bit_start++;
 			continue;
 		}
@@ -1759,7 +1759,7 @@ static int submit_eb_subpage(struct page *page, struct writeback_control *wbc)
 		 */
 		eb = find_extent_buffer_nolock(fs_info, start);
 		spin_unlock_irqrestore(&subpage->lock, flags);
-		spin_unlock(&page->mapping->private_lock);
+		spin_unlock(&page->mapping->i_private_lock);
 
 		/*
 		 * The eb has already reached 0 refs thus find_extent_buffer()
@@ -1811,9 +1811,9 @@ static int submit_eb_page(struct page *page, struct btrfs_eb_write_context *ctx)
 	if (btrfs_sb(page->mapping->host->i_sb)->nodesize < PAGE_SIZE)
 		return submit_eb_subpage(page, wbc);
 
-	spin_lock(&mapping->private_lock);
+	spin_lock(&mapping->i_private_lock);
 	if (!PagePrivate(page)) {
-		spin_unlock(&mapping->private_lock);
+		spin_unlock(&mapping->i_private_lock);
 		return 0;
 	}
 
@@ -1824,16 +1824,16 @@ static int submit_eb_page(struct page *page, struct btrfs_eb_write_context *ctx)
 	 * crashing the machine for something we can survive anyway.
 	 */
 	if (WARN_ON(!eb)) {
-		spin_unlock(&mapping->private_lock);
+		spin_unlock(&mapping->i_private_lock);
 		return 0;
 	}
 
 	if (eb == ctx->eb) {
-		spin_unlock(&mapping->private_lock);
+		spin_unlock(&mapping->i_private_lock);
 		return 0;
 	}
 	ret = atomic_inc_not_zero(&eb->refs);
-	spin_unlock(&mapping->private_lock);
+	spin_unlock(&mapping->i_private_lock);
 	if (!ret)
 		return 0;
 
@@ -3056,7 +3056,7 @@ static bool page_range_has_eb(struct btrfs_fs_info *fs_info, struct page *page)
 {
 	struct btrfs_subpage *subpage;
 
-	lockdep_assert_held(&page->mapping->private_lock);
+	lockdep_assert_held(&page->mapping->i_private_lock);
 
 	if (PagePrivate(page)) {
 		subpage = (struct btrfs_subpage *)page->private;
@@ -3079,14 +3079,14 @@ static void detach_extent_buffer_page(struct extent_buffer *eb, struct page *pag
 
 	/*
 	 * For mapped eb, we're going to change the page private, which should
-	 * be done under the private_lock.
+	 * be done under the i_private_lock.
 	 */
 	if (mapped)
-		spin_lock(&page->mapping->private_lock);
+		spin_lock(&page->mapping->i_private_lock);
 
 	if (!PagePrivate(page)) {
 		if (mapped)
-			spin_unlock(&page->mapping->private_lock);
+			spin_unlock(&page->mapping->i_private_lock);
 		return;
 	}
 
@@ -3110,7 +3110,7 @@ static void detach_extent_buffer_page(struct extent_buffer *eb, struct page *pag
 			detach_page_private(page);
 		}
 		if (mapped)
-			spin_unlock(&page->mapping->private_lock);
+			spin_unlock(&page->mapping->i_private_lock);
 		return;
 	}
 
@@ -3133,7 +3133,7 @@ static void detach_extent_buffer_page(struct extent_buffer *eb, struct page *pag
 	if (!page_range_has_eb(fs_info, page))
 		btrfs_detach_subpage(fs_info, page);
 
-	spin_unlock(&page->mapping->private_lock);
+	spin_unlock(&page->mapping->i_private_lock);
 }
 
 /* Release all pages attached to the extent buffer */
@@ -3514,7 +3514,7 @@ struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info,
 
 	/*
 	 * Preallocate page->private for subpage case, so that we won't
-	 * allocate memory with private_lock nor page lock hold.
+	 * allocate memory with i_private_lock nor page lock hold.
 	 *
 	 * The memory will be freed by attach_extent_buffer_page() or freed
 	 * manually if we exit earlier.
@@ -3535,10 +3535,10 @@ struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info,
 			goto free_eb;
 		}
 
-		spin_lock(&mapping->private_lock);
+		spin_lock(&mapping->i_private_lock);
 		exists = grab_extent_buffer(fs_info, p);
 		if (exists) {
-			spin_unlock(&mapping->private_lock);
+			spin_unlock(&mapping->i_private_lock);
 			unlock_page(p);
 			put_page(p);
 			mark_extent_buffer_accessed(exists, p);
@@ -3558,7 +3558,7 @@ struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info,
 		 * Thus needs no special handling in error path.
 		 */
 		btrfs_page_inc_eb_refs(fs_info, p);
-		spin_unlock(&mapping->private_lock);
+		spin_unlock(&mapping->i_private_lock);
 
 		WARN_ON(btrfs_page_test_dirty(fs_info, p, eb->start, eb->len));
 		eb->pages[i] = p;
@@ -4563,12 +4563,12 @@ static int try_release_subpage_extent_buffer(struct page *page)
 	 * Finally to check if we have cleared page private, as if we have
 	 * released all ebs in the page, the page private should be cleared now.
 	 */
-	spin_lock(&page->mapping->private_lock);
+	spin_lock(&page->mapping->i_private_lock);
 	if (!PagePrivate(page))
 		ret = 1;
 	else
 		ret = 0;
-	spin_unlock(&page->mapping->private_lock);
+	spin_unlock(&page->mapping->i_private_lock);
 	return ret;
 
 }
@@ -4584,9 +4584,9 @@ int try_release_extent_buffer(struct page *page)
 	 * We need to make sure nobody is changing page->private, as we rely on
 	 * page->private as the pointer to extent buffer.
 	 */
-	spin_lock(&page->mapping->private_lock);
+	spin_lock(&page->mapping->i_private_lock);
 	if (!PagePrivate(page)) {
-		spin_unlock(&page->mapping->private_lock);
+		spin_unlock(&page->mapping->i_private_lock);
 		return 1;
 	}
 
@@ -4601,10 +4601,10 @@ int try_release_extent_buffer(struct page *page)
 	spin_lock(&eb->refs_lock);
 	if (atomic_read(&eb->refs) != 1 || extent_buffer_under_io(eb)) {
 		spin_unlock(&eb->refs_lock);
-		spin_unlock(&page->mapping->private_lock);
+		spin_unlock(&page->mapping->i_private_lock);
 		return 0;
 	}
-	spin_unlock(&page->mapping->private_lock);
+	spin_unlock(&page->mapping->i_private_lock);
 
 	/*
 	 * If tree ref isn't set then we know the ref on this eb is a real ref,
diff --git a/fs/btrfs/subpage.c b/fs/btrfs/subpage.c
index 1b999c6e4193..2347cf15278b 100644
--- a/fs/btrfs/subpage.c
+++ b/fs/btrfs/subpage.c
@@ -200,7 +200,7 @@ void btrfs_page_inc_eb_refs(const struct btrfs_fs_info *fs_info,
 		return;
 
 	ASSERT(PagePrivate(page) && page->mapping);
-	lockdep_assert_held(&page->mapping->private_lock);
+	lockdep_assert_held(&page->mapping->i_private_lock);
 
 	subpage = (struct btrfs_subpage *)page->private;
 	atomic_inc(&subpage->eb_refs);
@@ -215,7 +215,7 @@ void btrfs_page_dec_eb_refs(const struct btrfs_fs_info *fs_info,
 		return;
 
 	ASSERT(PagePrivate(page) && page->mapping);
-	lockdep_assert_held(&page->mapping->private_lock);
+	lockdep_assert_held(&page->mapping->i_private_lock);
 
 	subpage = (struct btrfs_subpage *)page->private;
 	ASSERT(atomic_read(&subpage->eb_refs));
diff --git a/fs/buffer.c b/fs/buffer.c
index 967f34b70aa8..5ffc44ab4854 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -180,11 +180,11 @@ EXPORT_SYMBOL(end_buffer_write_sync);
  * Various filesystems appear to want __find_get_block to be non-blocking.
  * But it's the page lock which protects the buffers.  To get around this,
  * we get exclusion from try_to_free_buffers with the blockdev mapping's
- * private_lock.
+ * i_private_lock.
  *
- * Hack idea: for the blockdev mapping, private_lock contention
+ * Hack idea: for the blockdev mapping, i_private_lock contention
  * may be quite high.  This code could TryLock the page, and if that
- * succeeds, there is no need to take private_lock.
+ * succeeds, there is no need to take i_private_lock.
  */
 static struct buffer_head *
 __find_get_block_slow(struct block_device *bdev, sector_t block)
@@ -204,7 +204,7 @@ __find_get_block_slow(struct block_device *bdev, sector_t block)
 	if (IS_ERR(folio))
 		goto out;
 
-	spin_lock(&bd_mapping->private_lock);
+	spin_lock(&bd_mapping->i_private_lock);
 	head = folio_buffers(folio);
 	if (!head)
 		goto out_unlock;
@@ -236,7 +236,7 @@ __find_get_block_slow(struct block_device *bdev, sector_t block)
 		       1 << bd_inode->i_blkbits);
 	}
 out_unlock:
-	spin_unlock(&bd_mapping->private_lock);
+	spin_unlock(&bd_mapping->i_private_lock);
 	folio_put(folio);
 out:
 	return ret;
@@ -467,25 +467,25 @@ EXPORT_SYMBOL(mark_buffer_async_write);
  *
  * The functions mark_buffer_inode_dirty(), fsync_inode_buffers(),
  * inode_has_buffers() and invalidate_inode_buffers() are provided for the
- * management of a list of dependent buffers at ->i_mapping->private_list.
+ * management of a list of dependent buffers at ->i_mapping->i_private_list.
  *
  * Locking is a little subtle: try_to_free_buffers() will remove buffers
  * from their controlling inode's queue when they are being freed.  But
  * try_to_free_buffers() will be operating against the *blockdev* mapping
  * at the time, not against the S_ISREG file which depends on those buffers.
- * So the locking for private_list is via the private_lock in the address_space
+ * So the locking for i_private_list is via the i_private_lock in the address_space
  * which backs the buffers.  Which is different from the address_space 
  * against which the buffers are listed.  So for a particular address_space,
- * mapping->private_lock does *not* protect mapping->private_list!  In fact,
- * mapping->private_list will always be protected by the backing blockdev's
- * ->private_lock.
+ * mapping->i_private_lock does *not* protect mapping->i_private_list!  In fact,
+ * mapping->i_private_list will always be protected by the backing blockdev's
+ * ->i_private_lock.
  *
  * Which introduces a requirement: all buffers on an address_space's
- * ->private_list must be from the same address_space: the blockdev's.
+ * ->i_private_list must be from the same address_space: the blockdev's.
  *
- * address_spaces which do not place buffers at ->private_list via these
- * utility functions are free to use private_lock and private_list for
- * whatever they want.  The only requirement is that list_empty(private_list)
+ * address_spaces which do not place buffers at ->i_private_list via these
+ * utility functions are free to use i_private_lock and i_private_list for
+ * whatever they want.  The only requirement is that list_empty(i_private_list)
  * be true at clear_inode() time.
  *
  * FIXME: clear_inode should not call invalidate_inode_buffers().  The
@@ -508,7 +508,7 @@ EXPORT_SYMBOL(mark_buffer_async_write);
  */
 
 /*
- * The buffer's backing address_space's private_lock must be held
+ * The buffer's backing address_space's i_private_lock must be held
  */
 static void __remove_assoc_queue(struct buffer_head *bh)
 {
@@ -519,7 +519,7 @@ static void __remove_assoc_queue(struct buffer_head *bh)
 
 int inode_has_buffers(struct inode *inode)
 {
-	return !list_empty(&inode->i_data.private_list);
+	return !list_empty(&inode->i_data.i_private_list);
 }
 
 /*
@@ -561,7 +561,7 @@ repeat:
  * sync_mapping_buffers - write out & wait upon a mapping's "associated" buffers
  * @mapping: the mapping which wants those buffers written
  *
- * Starts I/O against the buffers at mapping->private_list, and waits upon
+ * Starts I/O against the buffers at mapping->i_private_list, and waits upon
  * that I/O.
  *
  * Basically, this is a convenience function for fsync().
@@ -570,13 +570,13 @@ repeat:
  */
 int sync_mapping_buffers(struct address_space *mapping)
 {
-	struct address_space *buffer_mapping = mapping->private_data;
+	struct address_space *buffer_mapping = mapping->i_private_data;
 
-	if (buffer_mapping == NULL || list_empty(&mapping->private_list))
+	if (buffer_mapping == NULL || list_empty(&mapping->i_private_list))
 		return 0;
 
-	return fsync_buffers_list(&buffer_mapping->private_lock,
-					&mapping->private_list);
+	return fsync_buffers_list(&buffer_mapping->i_private_lock,
+					&mapping->i_private_list);
 }
 EXPORT_SYMBOL(sync_mapping_buffers);
 
@@ -673,17 +673,17 @@ void mark_buffer_dirty_inode(struct buffer_head *bh, struct inode *inode)
 	struct address_space *buffer_mapping = bh->b_folio->mapping;
 
 	mark_buffer_dirty(bh);
-	if (!mapping->private_data) {
-		mapping->private_data = buffer_mapping;
+	if (!mapping->i_private_data) {
+		mapping->i_private_data = buffer_mapping;
 	} else {
-		BUG_ON(mapping->private_data != buffer_mapping);
+		BUG_ON(mapping->i_private_data != buffer_mapping);
 	}
 	if (!bh->b_assoc_map) {
-		spin_lock(&buffer_mapping->private_lock);
+		spin_lock(&buffer_mapping->i_private_lock);
 		list_move_tail(&bh->b_assoc_buffers,
-				&mapping->private_list);
+				&mapping->i_private_list);
 		bh->b_assoc_map = mapping;
-		spin_unlock(&buffer_mapping->private_lock);
+		spin_unlock(&buffer_mapping->i_private_lock);
 	}
 }
 EXPORT_SYMBOL(mark_buffer_dirty_inode);
@@ -706,7 +706,7 @@ EXPORT_SYMBOL(mark_buffer_dirty_inode);
  * bit, see a bunch of clean buffers and we'd end up with dirty buffers/clean
  * page on the dirty page list.
  *
- * We use private_lock to lock against try_to_free_buffers while using the
+ * We use i_private_lock to lock against try_to_free_buffers while using the
  * page's buffer list.  Also use this to protect against clean buffers being
  * added to the page after it was set dirty.
  *
@@ -718,7 +718,7 @@ bool block_dirty_folio(struct address_space *mapping, struct folio *folio)
 	struct buffer_head *head;
 	bool newly_dirty;
 
-	spin_lock(&mapping->private_lock);
+	spin_lock(&mapping->i_private_lock);
 	head = folio_buffers(folio);
 	if (head) {
 		struct buffer_head *bh = head;
@@ -734,7 +734,7 @@ bool block_dirty_folio(struct address_space *mapping, struct folio *folio)
 	 */
 	folio_memcg_lock(folio);
 	newly_dirty = !folio_test_set_dirty(folio);
-	spin_unlock(&mapping->private_lock);
+	spin_unlock(&mapping->i_private_lock);
 
 	if (newly_dirty)
 		__folio_mark_dirty(folio, mapping, 1);
@@ -827,7 +827,7 @@ static int fsync_buffers_list(spinlock_t *lock, struct list_head *list)
 		smp_mb();
 		if (buffer_dirty(bh)) {
 			list_add(&bh->b_assoc_buffers,
-				 &mapping->private_list);
+				 &mapping->i_private_list);
 			bh->b_assoc_map = mapping;
 		}
 		spin_unlock(lock);
@@ -851,7 +851,7 @@ static int fsync_buffers_list(spinlock_t *lock, struct list_head *list)
  * probably unmounting the fs, but that doesn't mean we have already
  * done a sync().  Just drop the buffers from the inode list.
  *
- * NOTE: we take the inode's blockdev's mapping's private_lock.  Which
+ * NOTE: we take the inode's blockdev's mapping's i_private_lock.  Which
  * assumes that all the buffers are against the blockdev.  Not true
  * for reiserfs.
  */
@@ -859,13 +859,13 @@ void invalidate_inode_buffers(struct inode *inode)
 {
 	if (inode_has_buffers(inode)) {
 		struct address_space *mapping = &inode->i_data;
-		struct list_head *list = &mapping->private_list;
-		struct address_space *buffer_mapping = mapping->private_data;
+		struct list_head *list = &mapping->i_private_list;
+		struct address_space *buffer_mapping = mapping->i_private_data;
 
-		spin_lock(&buffer_mapping->private_lock);
+		spin_lock(&buffer_mapping->i_private_lock);
 		while (!list_empty(list))
 			__remove_assoc_queue(BH_ENTRY(list->next));
-		spin_unlock(&buffer_mapping->private_lock);
+		spin_unlock(&buffer_mapping->i_private_lock);
 	}
 }
 EXPORT_SYMBOL(invalidate_inode_buffers);
@@ -882,10 +882,10 @@ int remove_inode_buffers(struct inode *inode)
 
 	if (inode_has_buffers(inode)) {
 		struct address_space *mapping = &inode->i_data;
-		struct list_head *list = &mapping->private_list;
-		struct address_space *buffer_mapping = mapping->private_data;
+		struct list_head *list = &mapping->i_private_list;
+		struct address_space *buffer_mapping = mapping->i_private_data;
 
-		spin_lock(&buffer_mapping->private_lock);
+		spin_lock(&buffer_mapping->i_private_lock);
 		while (!list_empty(list)) {
 			struct buffer_head *bh = BH_ENTRY(list->next);
 			if (buffer_dirty(bh)) {
@@ -894,7 +894,7 @@ int remove_inode_buffers(struct inode *inode)
 			}
 			__remove_assoc_queue(bh);
 		}
-		spin_unlock(&buffer_mapping->private_lock);
+		spin_unlock(&buffer_mapping->i_private_lock);
 	}
 	return ret;
 }
@@ -1064,11 +1064,11 @@ grow_dev_page(struct block_device *bdev, sector_t block,
 	 * lock to be atomic wrt __find_get_block(), which does not
 	 * run under the folio lock.
 	 */
-	spin_lock(&inode->i_mapping->private_lock);
+	spin_lock(&inode->i_mapping->i_private_lock);
 	link_dev_buffers(folio, bh);
 	end_block = folio_init_buffers(folio, bdev,
 			(sector_t)index << sizebits, size);
-	spin_unlock(&inode->i_mapping->private_lock);
+	spin_unlock(&inode->i_mapping->i_private_lock);
 done:
 	ret = (block < end_block) ? 1 : -ENXIO;
 failed:
@@ -1168,7 +1168,7 @@ __getblk_slow(struct block_device *bdev, sector_t block,
  * and then attach the address_space's inode to its superblock's dirty
  * inode list.
  *
- * mark_buffer_dirty() is atomic.  It takes bh->b_folio->mapping->private_lock,
+ * mark_buffer_dirty() is atomic.  It takes bh->b_folio->mapping->i_private_lock,
  * i_pages lock and mapping->host->i_lock.
  */
 void mark_buffer_dirty(struct buffer_head *bh)
@@ -1246,10 +1246,10 @@ void __bforget(struct buffer_head *bh)
 	if (bh->b_assoc_map) {
 		struct address_space *buffer_mapping = bh->b_folio->mapping;
 
-		spin_lock(&buffer_mapping->private_lock);
+		spin_lock(&buffer_mapping->i_private_lock);
 		list_del_init(&bh->b_assoc_buffers);
 		bh->b_assoc_map = NULL;
-		spin_unlock(&buffer_mapping->private_lock);
+		spin_unlock(&buffer_mapping->i_private_lock);
 	}
 	__brelse(bh);
 }
@@ -1638,7 +1638,7 @@ EXPORT_SYMBOL(block_invalidate_folio);
 
 /*
  * We attach and possibly dirty the buffers atomically wrt
- * block_dirty_folio() via private_lock.  try_to_free_buffers
+ * block_dirty_folio() via i_private_lock.  try_to_free_buffers
  * is already excluded via the folio lock.
  */
 struct buffer_head *create_empty_buffers(struct folio *folio,
@@ -1656,7 +1656,7 @@ struct buffer_head *create_empty_buffers(struct folio *folio,
 	} while (bh);
 	tail->b_this_page = head;
 
-	spin_lock(&folio->mapping->private_lock);
+	spin_lock(&folio->mapping->i_private_lock);
 	if (folio_test_uptodate(folio) || folio_test_dirty(folio)) {
 		bh = head;
 		do {
@@ -1668,7 +1668,7 @@ struct buffer_head *create_empty_buffers(struct folio *folio,
 		} while (bh != head);
 	}
 	folio_attach_private(folio, head);
-	spin_unlock(&folio->mapping->private_lock);
+	spin_unlock(&folio->mapping->i_private_lock);
 
 	return head;
 }
@@ -1715,7 +1715,7 @@ void clean_bdev_aliases(struct block_device *bdev, sector_t block, sector_t len)
 			if (!folio_buffers(folio))
 				continue;
 			/*
-			 * We use folio lock instead of bd_mapping->private_lock
+			 * We use folio lock instead of bd_mapping->i_private_lock
 			 * to pin buffers here since we can afford to sleep and
 			 * it scales better than a global spinlock lock.
 			 */
@@ -2883,7 +2883,7 @@ EXPORT_SYMBOL(sync_dirty_buffer);
  * are unused, and releases them if so.
  *
  * Exclusion against try_to_free_buffers may be obtained by either
- * locking the folio or by holding its mapping's private_lock.
+ * locking the folio or by holding its mapping's i_private_lock.
  *
  * If the folio is dirty but all the buffers are clean then we need to
  * be sure to mark the folio clean as well.  This is because the folio
@@ -2894,7 +2894,7 @@ EXPORT_SYMBOL(sync_dirty_buffer);
  * The same applies to regular filesystem folios: if all the buffers are
  * clean then we set the folio clean and proceed.  To do that, we require
  * total exclusion from block_dirty_folio().  That is obtained with
- * private_lock.
+ * i_private_lock.
  *
  * try_to_free_buffers() is non-blocking.
  */
@@ -2946,7 +2946,7 @@ bool try_to_free_buffers(struct folio *folio)
 		goto out;
 	}
 
-	spin_lock(&mapping->private_lock);
+	spin_lock(&mapping->i_private_lock);
 	ret = drop_buffers(folio, &buffers_to_free);
 
 	/*
@@ -2959,13 +2959,13 @@ bool try_to_free_buffers(struct folio *folio)
 	 * the folio's buffers clean.  We discover that here and clean
 	 * the folio also.
 	 *
-	 * private_lock must be held over this entire operation in order
+	 * i_private_lock must be held over this entire operation in order
 	 * to synchronise against block_dirty_folio and prevent the
 	 * dirty bit from being lost.
 	 */
 	if (ret)
 		folio_cancel_dirty(folio);
-	spin_unlock(&mapping->private_lock);
+	spin_unlock(&mapping->i_private_lock);
 out:
 	if (buffers_to_free) {
 		struct buffer_head *bh = buffers_to_free;
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 61277f7f8722..0558c8c986d4 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -1261,7 +1261,7 @@ static int write_end_fn(handle_t *handle, struct inode *inode,
  * We need to pick up the new inode size which generic_commit_write gave us
  * `file' can be NULL - eg, when called from page_symlink().
  *
- * ext4 never places buffers on inode->i_mapping->private_list.  metadata
+ * ext4 never places buffers on inode->i_mapping->i_private_list.  metadata
  * buffers are managed internally.
  */
 static int ext4_write_end(struct file *file,
@@ -3213,7 +3213,7 @@ static bool ext4_inode_datasync_dirty(struct inode *inode)
 	}
 
 	/* Any metadata buffers to write? */
-	if (!list_empty(&inode->i_mapping->private_list))
+	if (!list_empty(&inode->i_mapping->i_private_list))
 		return true;
 	return inode->i_state & I_DIRTY_DATASYNC;
 }
diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c
index d6bf1f8c25dc..d8b619ed2f1e 100644
--- a/fs/gfs2/glock.c
+++ b/fs/gfs2/glock.c
@@ -1213,7 +1213,7 @@ int gfs2_glock_get(struct gfs2_sbd *sdp, u64 number,
 		mapping->host = s->s_bdev->bd_inode;
 		mapping->flags = 0;
 		mapping_set_gfp_mask(mapping, GFP_NOFS);
-		mapping->private_data = NULL;
+		mapping->i_private_data = NULL;
 		mapping->writeback_index = 0;
 	}
 
diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c
index b108c5d26839..00ce89bdf32c 100644
--- a/fs/gfs2/ops_fstype.c
+++ b/fs/gfs2/ops_fstype.c
@@ -117,7 +117,7 @@ static struct gfs2_sbd *init_sbd(struct super_block *sb)
 	mapping->host = sb->s_bdev->bd_inode;
 	mapping->flags = 0;
 	mapping_set_gfp_mask(mapping, GFP_NOFS);
-	mapping->private_data = NULL;
+	mapping->i_private_data = NULL;
 	mapping->writeback_index = 0;
 
 	spin_lock_init(&sdp->sd_log_lock);
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index f757d4f7ad98..05609ab15cbc 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -686,7 +686,7 @@ static void hugetlbfs_evict_inode(struct inode *inode)
 	 * at inode creation time.  If this is a device special inode,
 	 * i_mapping may not point to the original address space.
 	 */
-	resv_map = (struct resv_map *)(&inode->i_data)->private_data;
+	resv_map = (struct resv_map *)(&inode->i_data)->i_private_data;
 	/* Only regular and link inodes have associated reserve maps */
 	if (resv_map)
 		resv_map_release(&resv_map->refs);
@@ -1000,7 +1000,7 @@ static struct inode *hugetlbfs_get_inode(struct super_block *sb,
 				&hugetlbfs_i_mmap_rwsem_key);
 		inode->i_mapping->a_ops = &hugetlbfs_aops;
 		simple_inode_init_ts(inode);
-		inode->i_mapping->private_data = resv_map;
+		inode->i_mapping->i_private_data = resv_map;
 		info->seals = F_SEAL_SEAL;
 		switch (mode & S_IFMT) {
 		default:
diff --git a/fs/inode.c b/fs/inode.c
index edcd8a61975f..788aa0aa542b 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -209,7 +209,7 @@ int inode_init_always(struct super_block *sb, struct inode *inode)
 	atomic_set(&mapping->nr_thps, 0);
 #endif
 	mapping_set_gfp_mask(mapping, GFP_HIGHUSER_MOVABLE);
-	mapping->private_data = NULL;
+	mapping->i_private_data = NULL;
 	mapping->writeback_index = 0;
 	init_rwsem(&mapping->invalidate_lock);
 	lockdep_set_class_and_name(&mapping->invalidate_lock,
@@ -396,8 +396,8 @@ static void __address_space_init_once(struct address_space *mapping)
 {
 	xa_init_flags(&mapping->i_pages, XA_FLAGS_LOCK_IRQ | XA_FLAGS_ACCOUNT);
 	init_rwsem(&mapping->i_mmap_rwsem);
-	INIT_LIST_HEAD(&mapping->private_list);
-	spin_lock_init(&mapping->private_lock);
+	INIT_LIST_HEAD(&mapping->i_private_list);
+	spin_lock_init(&mapping->i_private_lock);
 	mapping->i_mmap = RB_ROOT_CACHED;
 }
 
@@ -618,7 +618,7 @@ void clear_inode(struct inode *inode)
 	 * nor even WARN_ON(!mapping_empty).
 	 */
 	xa_unlock_irq(&inode->i_data.i_pages);
-	BUG_ON(!list_empty(&inode->i_data.private_list));
+	BUG_ON(!list_empty(&inode->i_data.i_private_list));
 	BUG_ON(!(inode->i_state & I_FREEING));
 	BUG_ON(inode->i_state & I_CLEAR);
 	BUG_ON(!list_empty(&inode->i_wb_list));
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index b664caea8b4e..7248705faef4 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -192,13 +192,13 @@ static struct nfs_page *nfs_folio_find_private_request(struct folio *folio)
 
 	if (!folio_test_private(folio))
 		return NULL;
-	spin_lock(&mapping->private_lock);
+	spin_lock(&mapping->i_private_lock);
 	req = nfs_folio_private_request(folio);
 	if (req) {
 		WARN_ON_ONCE(req->wb_head != req);
 		kref_get(&req->wb_kref);
 	}
-	spin_unlock(&mapping->private_lock);
+	spin_unlock(&mapping->i_private_lock);
 	return req;
 }
 
@@ -769,13 +769,13 @@ static void nfs_inode_add_request(struct nfs_page *req)
 	 * Swap-space should not get truncated. Hence no need to plug the race
 	 * with invalidate/truncate.
 	 */
-	spin_lock(&mapping->private_lock);
+	spin_lock(&mapping->i_private_lock);
 	if (likely(!folio_test_swapcache(folio))) {
 		set_bit(PG_MAPPED, &req->wb_flags);
 		folio_set_private(folio);
 		folio->private = req;
 	}
-	spin_unlock(&mapping->private_lock);
+	spin_unlock(&mapping->i_private_lock);
 	atomic_long_inc(&nfsi->nrequests);
 	/* this a head request for a page group - mark it as having an
 	 * extra reference so sub groups can follow suit.
@@ -796,13 +796,13 @@ static void nfs_inode_remove_request(struct nfs_page *req)
 		struct folio *folio = nfs_page_to_folio(req->wb_head);
 		struct address_space *mapping = folio_file_mapping(folio);
 
-		spin_lock(&mapping->private_lock);
+		spin_lock(&mapping->i_private_lock);
 		if (likely(folio && !folio_test_swapcache(folio))) {
 			folio->private = NULL;
 			folio_clear_private(folio);
 			clear_bit(PG_MAPPED, &req->wb_head->wb_flags);
 		}
-		spin_unlock(&mapping->private_lock);
+		spin_unlock(&mapping->i_private_lock);
 	}
 
 	if (test_and_clear_bit(PG_INODE_REF, &req->wb_flags)) {
diff --git a/fs/nilfs2/inode.c b/fs/nilfs2/inode.c
index f861f3a0bf5c..2ead36dfa2a3 100644
--- a/fs/nilfs2/inode.c
+++ b/fs/nilfs2/inode.c
@@ -214,7 +214,7 @@ static bool nilfs_dirty_folio(struct address_space *mapping,
 	/*
 	 * The page may not be locked, eg if called from try_to_unmap_one()
 	 */
-	spin_lock(&mapping->private_lock);
+	spin_lock(&mapping->i_private_lock);
 	head = folio_buffers(folio);
 	if (head) {
 		struct buffer_head *bh = head;
@@ -230,7 +230,7 @@ static bool nilfs_dirty_folio(struct address_space *mapping,
 	} else if (ret) {
 		nr_dirty = 1 << (folio_shift(folio) - inode->i_blkbits);
 	}
-	spin_unlock(&mapping->private_lock);
+	spin_unlock(&mapping->i_private_lock);
 
 	if (nr_dirty)
 		nilfs_set_file_dirty(inode, nr_dirty);
diff --git a/fs/ntfs/aops.c b/fs/ntfs/aops.c
index 71e31e789b29..548f3b51aa5f 100644
--- a/fs/ntfs/aops.c
+++ b/fs/ntfs/aops.c
@@ -1690,7 +1690,7 @@ const struct address_space_operations ntfs_mst_aops = {
  *
  * If the page does not have buffers, we create them and set them uptodate.
  * The page may not be locked which is why we need to handle the buffers under
- * the mapping->private_lock.  Once the buffers are marked dirty we no longer
+ * the mapping->i_private_lock.  Once the buffers are marked dirty we no longer
  * need the lock since try_to_free_buffers() does not free dirty buffers.
  */
 void mark_ntfs_record_dirty(struct page *page, const unsigned int ofs) {
@@ -1702,11 +1702,11 @@ void mark_ntfs_record_dirty(struct page *page, const unsigned int ofs) {
 	BUG_ON(!PageUptodate(page));
 	end = ofs + ni->itype.index.block_size;
 	bh_size = VFS_I(ni)->i_sb->s_blocksize;
-	spin_lock(&mapping->private_lock);
+	spin_lock(&mapping->i_private_lock);
 	if (unlikely(!page_has_buffers(page))) {
-		spin_unlock(&mapping->private_lock);
+		spin_unlock(&mapping->i_private_lock);
 		bh = head = alloc_page_buffers(page, bh_size, true);
-		spin_lock(&mapping->private_lock);
+		spin_lock(&mapping->i_private_lock);
 		if (likely(!page_has_buffers(page))) {
 			struct buffer_head *tail;
 
@@ -1730,7 +1730,7 @@ void mark_ntfs_record_dirty(struct page *page, const unsigned int ofs) {
 			break;
 		set_buffer_dirty(bh);
 	} while ((bh = bh->b_this_page) != head);
-	spin_unlock(&mapping->private_lock);
+	spin_unlock(&mapping->i_private_lock);
 	filemap_dirty_folio(mapping, page_folio(page));
 	if (unlikely(buffers_to_free)) {
 		do {
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 98b7a7a8c42e..f171505940ff 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -463,9 +463,9 @@ extern const struct address_space_operations empty_aops;
  * @a_ops: Methods.
  * @flags: Error bits and flags (AS_*).
  * @wb_err: The most recent error which has occurred.
- * @private_lock: For use by the owner of the address_space.
- * @private_list: For use by the owner of the address_space.
- * @private_data: For use by the owner of the address_space.
+ * @i_private_lock: For use by the owner of the address_space.
+ * @i_private_list: For use by the owner of the address_space.
+ * @i_private_data: For use by the owner of the address_space.
  */
 struct address_space {
 	struct inode		*host;
@@ -484,9 +484,9 @@ struct address_space {
 	unsigned long		flags;
 	struct rw_semaphore	i_mmap_rwsem;
 	errseq_t		wb_err;
-	spinlock_t		private_lock;
-	struct list_head	private_list;
-	void			*private_data;
+	spinlock_t		i_private_lock;
+	struct list_head	i_private_list;
+	void *			i_private_data;
 } __attribute__((aligned(sizeof(long)))) __randomize_layout;
 	/*
 	 * On most architectures that alignment is already the case; but
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 1169ef2f2176..38c4477fda6a 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -1141,7 +1141,7 @@ static inline struct resv_map *inode_resv_map(struct inode *inode)
 	 * The VERY common case is inode->mapping == &inode->i_data but,
 	 * this may not be true for device special inodes.
 	 */
-	return (struct resv_map *)(&inode->i_data)->private_data;
+	return (struct resv_map *)(&inode->i_data)->i_private_data;
 }
 
 static struct resv_map *vma_resv_map(struct vm_area_struct *vma)
diff --git a/mm/migrate.c b/mm/migrate.c
index 35a88334bb3c..377f55ebf7f4 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -746,7 +746,7 @@ static int __buffer_migrate_folio(struct address_space *mapping,
 
 recheck_buffers:
 		busy = false;
-		spin_lock(&mapping->private_lock);
+		spin_lock(&mapping->i_private_lock);
 		bh = head;
 		do {
 			if (atomic_read(&bh->b_count)) {
@@ -760,7 +760,7 @@ recheck_buffers:
 				rc = -EAGAIN;
 				goto unlock_buffers;
 			}
-			spin_unlock(&mapping->private_lock);
+			spin_unlock(&mapping->i_private_lock);
 			invalidate_bh_lrus();
 			invalidated = true;
 			goto recheck_buffers;
@@ -787,7 +787,7 @@ recheck_buffers:
 	rc = MIGRATEPAGE_SUCCESS;
 unlock_buffers:
 	if (check_refs)
-		spin_unlock(&mapping->private_lock);
+		spin_unlock(&mapping->i_private_lock);
 	bh = head;
 	do {
 		unlock_buffer(bh);
-- 
cgit v1.2.3


From f47507988145185aef5d0e7a0e28dbf6e7776f29 Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rafael.j.wysocki@intel.com>
Date: Tue, 17 Oct 2023 22:05:23 +0200
Subject: thermal: ACPI: Move the ACPI thermal library to drivers/acpi/

The ACPI thermal library contains functions that can be used to
retrieve trip point temperature values through the platform firmware
for various types of trip points.  Each of these functions basically
evaluates a specific ACPI object, checks if the value produced by it
is reasonable and returns it (or THERMAL_TEMP_INVALID if anything
fails).

It made sense to hold it in drivers/thermal/ so long as it was only used
by the code in that directory, but since it is also going to be used by
the ACPI thermal driver located in drivers/acpi/, move it to the latter
in order to keep the code related to evaluating ACPI objects defined in
the specification proper together.

No intentional functional impact.

Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/acpi/Kconfig                          |   5 ++
 drivers/acpi/Makefile                         |   1 +
 drivers/acpi/thermal_lib.c                    | 116 ++++++++++++++++++++++++++
 drivers/thermal/Kconfig                       |   4 -
 drivers/thermal/Makefile                      |   1 -
 drivers/thermal/intel/Kconfig                 |   2 +-
 drivers/thermal/intel/int340x_thermal/Kconfig |   2 +-
 drivers/thermal/thermal_acpi.c                | 116 --------------------------
 include/linux/acpi.h                          |   7 ++
 include/linux/thermal.h                       |   7 --
 10 files changed, 131 insertions(+), 130 deletions(-)
 create mode 100644 drivers/acpi/thermal_lib.c
 delete mode 100644 drivers/thermal/thermal_acpi.c

(limited to 'include/linux')

diff --git a/drivers/acpi/Kconfig b/drivers/acpi/Kconfig
index f819e760ff19..6f2bfcf7645c 100644
--- a/drivers/acpi/Kconfig
+++ b/drivers/acpi/Kconfig
@@ -61,6 +61,10 @@ config ACPI_CCA_REQUIRED
 config ACPI_TABLE_LIB
 	bool
 
+config ACPI_THERMAL_LIB
+       depends on THERMAL
+       bool
+
 config ACPI_DEBUGGER
 	bool "AML debugger interface"
 	select ACPI_DEBUG
@@ -327,6 +331,7 @@ config ACPI_THERMAL
 	tristate "Thermal Zone"
 	depends on ACPI_PROCESSOR
 	select THERMAL
+	select ACPI_THERMAL_LIB
 	default y
 	help
 	  This driver supports ACPI thermal zones.  Most mobile and
diff --git a/drivers/acpi/Makefile b/drivers/acpi/Makefile
index eaa09bf52f17..2a52083704e6 100644
--- a/drivers/acpi/Makefile
+++ b/drivers/acpi/Makefile
@@ -89,6 +89,7 @@ obj-$(CONFIG_ACPI_TAD)		+= acpi_tad.o
 obj-$(CONFIG_ACPI_PCI_SLOT)	+= pci_slot.o
 obj-$(CONFIG_ACPI_PROCESSOR)	+= processor.o
 obj-$(CONFIG_ACPI)		+= container.o
+obj-$(CONFIG_ACPI_THERMAL_LIB)	+= thermal_lib.o
 obj-$(CONFIG_ACPI_THERMAL)	+= thermal.o
 obj-$(CONFIG_ACPI_PLATFORM_PROFILE) 	+= platform_profile.o
 obj-$(CONFIG_ACPI_NFIT)		+= nfit/
diff --git a/drivers/acpi/thermal_lib.c b/drivers/acpi/thermal_lib.c
new file mode 100644
index 000000000000..43eaf0f2ff49
--- /dev/null
+++ b/drivers/acpi/thermal_lib.c
@@ -0,0 +1,116 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright 2023 Linaro Limited
+ * Copyright 2023 Intel Corporation
+ *
+ * Library routines for populating a generic thermal trip point structure
+ * with data obtained by evaluating a specific object in the ACPI Namespace.
+ */
+#include <linux/acpi.h>
+#include <linux/units.h>
+#include <linux/thermal.h>
+
+/*
+ * Minimum temperature for full military grade is 218°K (-55°C) and
+ * max temperature is 448°K (175°C). We can consider those values as
+ * the boundaries for the [trips] temperature returned by the
+ * firmware. Any values out of these boundaries may be considered
+ * bogus and we can assume the firmware has no data to provide.
+ */
+#define TEMP_MIN_DECIK	2180
+#define TEMP_MAX_DECIK	4480
+
+static int thermal_acpi_trip_temp(struct acpi_device *adev, char *obj_name,
+				  int *ret_temp)
+{
+	unsigned long long temp;
+	acpi_status status;
+
+	status = acpi_evaluate_integer(adev->handle, obj_name, NULL, &temp);
+	if (ACPI_FAILURE(status)) {
+		acpi_handle_debug(adev->handle, "%s evaluation failed\n", obj_name);
+		return -ENODATA;
+	}
+
+	if (temp >= TEMP_MIN_DECIK && temp <= TEMP_MAX_DECIK) {
+		*ret_temp = deci_kelvin_to_millicelsius(temp);
+	} else {
+		acpi_handle_debug(adev->handle, "%s result %llu out of range\n",
+				  obj_name, temp);
+		*ret_temp = THERMAL_TEMP_INVALID;
+	}
+
+	return 0;
+}
+
+/**
+ * thermal_acpi_active_trip_temp - Retrieve active trip point temperature
+ * @adev: Target thermal zone ACPI device object.
+ * @id: Active cooling level (0 - 9).
+ * @ret_temp: Address to store the retrieved temperature value on success.
+ *
+ * Evaluate the _ACx object for the thermal zone represented by @adev to obtain
+ * the temperature of the active cooling trip point corresponding to the active
+ * cooling level given by @id.
+ *
+ * Return 0 on success or a negative error value on failure.
+ */
+int thermal_acpi_active_trip_temp(struct acpi_device *adev, int id, int *ret_temp)
+{
+	char obj_name[] = {'_', 'A', 'C', '0' + id, '\0'};
+
+	if (id < 0 || id > 9)
+		return -EINVAL;
+
+	return thermal_acpi_trip_temp(adev, obj_name, ret_temp);
+}
+EXPORT_SYMBOL_GPL(thermal_acpi_active_trip_temp);
+
+/**
+ * thermal_acpi_passive_trip_temp - Retrieve passive trip point temperature
+ * @adev: Target thermal zone ACPI device object.
+ * @ret_temp: Address to store the retrieved temperature value on success.
+ *
+ * Evaluate the _PSV object for the thermal zone represented by @adev to obtain
+ * the temperature of the passive cooling trip point.
+ *
+ * Return 0 on success or -ENODATA on failure.
+ */
+int thermal_acpi_passive_trip_temp(struct acpi_device *adev, int *ret_temp)
+{
+	return thermal_acpi_trip_temp(adev, "_PSV", ret_temp);
+}
+EXPORT_SYMBOL_GPL(thermal_acpi_passive_trip_temp);
+
+/**
+ * thermal_acpi_hot_trip_temp - Retrieve hot trip point temperature
+ * @adev: Target thermal zone ACPI device object.
+ * @ret_temp: Address to store the retrieved temperature value on success.
+ *
+ * Evaluate the _HOT object for the thermal zone represented by @adev to obtain
+ * the temperature of the trip point at which the system is expected to be put
+ * into the S4 sleep state.
+ *
+ * Return 0 on success or -ENODATA on failure.
+ */
+int thermal_acpi_hot_trip_temp(struct acpi_device *adev, int *ret_temp)
+{
+	return thermal_acpi_trip_temp(adev, "_HOT", ret_temp);
+}
+EXPORT_SYMBOL_GPL(thermal_acpi_hot_trip_temp);
+
+/**
+ * thermal_acpi_critical_trip_temp - Retrieve critical trip point temperature
+ * @adev: Target thermal zone ACPI device object.
+ * @ret_temp: Address to store the retrieved temperature value on success.
+ *
+ * Evaluate the _CRT object for the thermal zone represented by @adev to obtain
+ * the temperature of the critical cooling trip point.
+ *
+ * Return 0 on success or -ENODATA on failure.
+ */
+int thermal_acpi_critical_trip_temp(struct acpi_device *adev, int *ret_temp)
+{
+	return thermal_acpi_trip_temp(adev, "_CRT", ret_temp);
+}
+EXPORT_SYMBOL_GPL(thermal_acpi_critical_trip_temp);
diff --git a/drivers/thermal/Kconfig b/drivers/thermal/Kconfig
index c81a00fbca7d..59883502eff4 100644
--- a/drivers/thermal/Kconfig
+++ b/drivers/thermal/Kconfig
@@ -76,10 +76,6 @@ config THERMAL_OF
 	  Say 'Y' here if you need to build thermal infrastructure
 	  based on device tree.
 
-config THERMAL_ACPI
-       depends on ACPI
-       bool
-
 config THERMAL_WRITABLE_TRIPS
 	bool "Enable writable trip points"
 	help
diff --git a/drivers/thermal/Makefile b/drivers/thermal/Makefile
index c934cab309ae..a8318d671036 100644
--- a/drivers/thermal/Makefile
+++ b/drivers/thermal/Makefile
@@ -13,7 +13,6 @@ thermal_sys-$(CONFIG_THERMAL_NETLINK)		+= thermal_netlink.o
 # interface to/from other layers providing sensors
 thermal_sys-$(CONFIG_THERMAL_HWMON)		+= thermal_hwmon.o
 thermal_sys-$(CONFIG_THERMAL_OF)		+= thermal_of.o
-thermal_sys-$(CONFIG_THERMAL_ACPI)		+= thermal_acpi.o
 
 # governors
 CFLAGS_gov_power_allocator.o			:= -I$(src)
diff --git a/drivers/thermal/intel/Kconfig b/drivers/thermal/intel/Kconfig
index ecd7e07eece0..b43953b5539f 100644
--- a/drivers/thermal/intel/Kconfig
+++ b/drivers/thermal/intel/Kconfig
@@ -85,7 +85,7 @@ config INTEL_BXT_PMIC_THERMAL
 config INTEL_PCH_THERMAL
 	tristate "Intel PCH Thermal Reporting Driver"
 	depends on X86 && PCI
-	select THERMAL_ACPI if ACPI
+	select ACPI_THERMAL_LIB if ACPI
 	help
 	  Enable this to support thermal reporting on certain intel PCHs.
 	  Thermal reporting device will provide temperature reading,
diff --git a/drivers/thermal/intel/int340x_thermal/Kconfig b/drivers/thermal/intel/int340x_thermal/Kconfig
index 300ea53e9b33..e76b13e44d03 100644
--- a/drivers/thermal/intel/int340x_thermal/Kconfig
+++ b/drivers/thermal/intel/int340x_thermal/Kconfig
@@ -9,7 +9,7 @@ config INT340X_THERMAL
 	select THERMAL_GOV_USER_SPACE
 	select ACPI_THERMAL_REL
 	select ACPI_FAN
-	select THERMAL_ACPI
+	select ACPI_THERMAL_LIB
 	select INTEL_SOC_DTS_IOSF_CORE
 	select INTEL_TCC
 	select PROC_THERMAL_MMIO_RAPL if POWERCAP
diff --git a/drivers/thermal/thermal_acpi.c b/drivers/thermal/thermal_acpi.c
deleted file mode 100644
index 43eaf0f2ff49..000000000000
--- a/drivers/thermal/thermal_acpi.c
+++ /dev/null
@@ -1,116 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * Copyright 2023 Linaro Limited
- * Copyright 2023 Intel Corporation
- *
- * Library routines for populating a generic thermal trip point structure
- * with data obtained by evaluating a specific object in the ACPI Namespace.
- */
-#include <linux/acpi.h>
-#include <linux/units.h>
-#include <linux/thermal.h>
-
-/*
- * Minimum temperature for full military grade is 218°K (-55°C) and
- * max temperature is 448°K (175°C). We can consider those values as
- * the boundaries for the [trips] temperature returned by the
- * firmware. Any values out of these boundaries may be considered
- * bogus and we can assume the firmware has no data to provide.
- */
-#define TEMP_MIN_DECIK	2180
-#define TEMP_MAX_DECIK	4480
-
-static int thermal_acpi_trip_temp(struct acpi_device *adev, char *obj_name,
-				  int *ret_temp)
-{
-	unsigned long long temp;
-	acpi_status status;
-
-	status = acpi_evaluate_integer(adev->handle, obj_name, NULL, &temp);
-	if (ACPI_FAILURE(status)) {
-		acpi_handle_debug(adev->handle, "%s evaluation failed\n", obj_name);
-		return -ENODATA;
-	}
-
-	if (temp >= TEMP_MIN_DECIK && temp <= TEMP_MAX_DECIK) {
-		*ret_temp = deci_kelvin_to_millicelsius(temp);
-	} else {
-		acpi_handle_debug(adev->handle, "%s result %llu out of range\n",
-				  obj_name, temp);
-		*ret_temp = THERMAL_TEMP_INVALID;
-	}
-
-	return 0;
-}
-
-/**
- * thermal_acpi_active_trip_temp - Retrieve active trip point temperature
- * @adev: Target thermal zone ACPI device object.
- * @id: Active cooling level (0 - 9).
- * @ret_temp: Address to store the retrieved temperature value on success.
- *
- * Evaluate the _ACx object for the thermal zone represented by @adev to obtain
- * the temperature of the active cooling trip point corresponding to the active
- * cooling level given by @id.
- *
- * Return 0 on success or a negative error value on failure.
- */
-int thermal_acpi_active_trip_temp(struct acpi_device *adev, int id, int *ret_temp)
-{
-	char obj_name[] = {'_', 'A', 'C', '0' + id, '\0'};
-
-	if (id < 0 || id > 9)
-		return -EINVAL;
-
-	return thermal_acpi_trip_temp(adev, obj_name, ret_temp);
-}
-EXPORT_SYMBOL_GPL(thermal_acpi_active_trip_temp);
-
-/**
- * thermal_acpi_passive_trip_temp - Retrieve passive trip point temperature
- * @adev: Target thermal zone ACPI device object.
- * @ret_temp: Address to store the retrieved temperature value on success.
- *
- * Evaluate the _PSV object for the thermal zone represented by @adev to obtain
- * the temperature of the passive cooling trip point.
- *
- * Return 0 on success or -ENODATA on failure.
- */
-int thermal_acpi_passive_trip_temp(struct acpi_device *adev, int *ret_temp)
-{
-	return thermal_acpi_trip_temp(adev, "_PSV", ret_temp);
-}
-EXPORT_SYMBOL_GPL(thermal_acpi_passive_trip_temp);
-
-/**
- * thermal_acpi_hot_trip_temp - Retrieve hot trip point temperature
- * @adev: Target thermal zone ACPI device object.
- * @ret_temp: Address to store the retrieved temperature value on success.
- *
- * Evaluate the _HOT object for the thermal zone represented by @adev to obtain
- * the temperature of the trip point at which the system is expected to be put
- * into the S4 sleep state.
- *
- * Return 0 on success or -ENODATA on failure.
- */
-int thermal_acpi_hot_trip_temp(struct acpi_device *adev, int *ret_temp)
-{
-	return thermal_acpi_trip_temp(adev, "_HOT", ret_temp);
-}
-EXPORT_SYMBOL_GPL(thermal_acpi_hot_trip_temp);
-
-/**
- * thermal_acpi_critical_trip_temp - Retrieve critical trip point temperature
- * @adev: Target thermal zone ACPI device object.
- * @ret_temp: Address to store the retrieved temperature value on success.
- *
- * Evaluate the _CRT object for the thermal zone represented by @adev to obtain
- * the temperature of the critical cooling trip point.
- *
- * Return 0 on success or -ENODATA on failure.
- */
-int thermal_acpi_critical_trip_temp(struct acpi_device *adev, int *ret_temp)
-{
-	return thermal_acpi_trip_temp(adev, "_CRT", ret_temp);
-}
-EXPORT_SYMBOL_GPL(thermal_acpi_critical_trip_temp);
diff --git a/include/linux/acpi.h b/include/linux/acpi.h
index 54189e0e5f41..b63d7811c728 100644
--- a/include/linux/acpi.h
+++ b/include/linux/acpi.h
@@ -424,6 +424,13 @@ extern int acpi_blacklisted(void);
 extern void acpi_osi_setup(char *str);
 extern bool acpi_osi_is_win8(void);
 
+#ifdef CONFIG_ACPI_THERMAL_LIB
+int thermal_acpi_active_trip_temp(struct acpi_device *adev, int id, int *ret_temp);
+int thermal_acpi_passive_trip_temp(struct acpi_device *adev, int *ret_temp);
+int thermal_acpi_hot_trip_temp(struct acpi_device *adev, int *ret_temp);
+int thermal_acpi_critical_trip_temp(struct acpi_device *adev, int *ret_temp);
+#endif
+
 #ifdef CONFIG_ACPI_NUMA
 int acpi_map_pxm_to_node(int pxm);
 int acpi_get_node(acpi_handle handle);
diff --git a/include/linux/thermal.h b/include/linux/thermal.h
index cee814d5d1ac..35f620059456 100644
--- a/include/linux/thermal.h
+++ b/include/linux/thermal.h
@@ -294,13 +294,6 @@ int thermal_zone_get_num_trips(struct thermal_zone_device *tz);
 
 int thermal_zone_get_crit_temp(struct thermal_zone_device *tz, int *temp);
 
-#ifdef CONFIG_THERMAL_ACPI
-int thermal_acpi_active_trip_temp(struct acpi_device *adev, int id, int *ret_temp);
-int thermal_acpi_passive_trip_temp(struct acpi_device *adev, int *ret_temp);
-int thermal_acpi_hot_trip_temp(struct acpi_device *adev, int *ret_temp);
-int thermal_acpi_critical_trip_temp(struct acpi_device *adev, int *ret_temp);
-#endif
-
 #ifdef CONFIG_THERMAL
 struct thermal_zone_device *thermal_zone_device_register_with_trips(
 					const char *type,
-- 
cgit v1.2.3


From 35732699f5d2922ff674e711e566cf44a4bd86d2 Mon Sep 17 00:00:00 2001
From: Dave Jiang <dave.jiang@intel.com>
Date: Wed, 22 Nov 2023 08:33:53 -0700
Subject: ACPI: Fix ARM32 platforms compile issue introduced by fw_table
 changes

Linus reported that:
After commit a103f46633fd the kernel stopped compiling for
several ARM32 platforms that I am building with a bare metal
compiler. Bare metal compilers (arm-none-eabi-) don't
define __linux__.

This is because the header <acpi/platform/acenv.h> is now
in the include path for <linux/irq.h>:

  CC      arch/arm/kernel/irq.o
  CC      kernel/sysctl.o
  CC      crypto/api.o
In file included from ../include/acpi/acpi.h:22,
                 from ../include/linux/fw_table.h:29,
                 from ../include/linux/acpi.h:18,
                 from ../include/linux/irqchip.h:14,
                 from ../arch/arm/kernel/irq.c:25:
../include/acpi/platform/acenv.h:218:2: error: #error Unknown target environment
  218 | #error Unknown target environment
      |  ^~~~~

The issue is caused by the introducing of splitting out the ACPI code to
support the new generic fw_table code.

Rafael suggested [1] moving the fw_table.h include in linux/acpi.h to below
the linux/mutex.h. Remove the two includes in fw_table.h. Replace
linux/fw_table.h include in fw_table.c with linux/acpi.h.

Link: https://lore.kernel.org/linux-acpi/CAJZ5v0idWdJq3JSqQWLG5q+b+b=zkEdWR55rGYEoxh7R6N8kFQ@mail.gmail.com/
Fixes: a103f46633fd ("acpi: Move common tables helper functions to common lib")
Closes: https://lore.kernel.org/linux-acpi/20231114-arm-build-bug-v1-1-458745fe32a4@linaro.org/
Reported-by: Linus Walleij <linus.walleij@linaro.org>
Suggested-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Tested-by: Linus Walleij <linus.walleij@linaro.org>
Signed-off-by: Dave Jiang <dave.jiang@intel.com>
Acked-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 include/linux/acpi.h     | 22 +++++++++++-----------
 include/linux/fw_table.h |  3 ---
 lib/fw_table.c           |  2 +-
 3 files changed, 12 insertions(+), 15 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/acpi.h b/include/linux/acpi.h
index 54189e0e5f41..4db54e928b36 100644
--- a/include/linux/acpi.h
+++ b/include/linux/acpi.h
@@ -15,7 +15,6 @@
 #include <linux/mod_devicetable.h>
 #include <linux/property.h>
 #include <linux/uuid.h>
-#include <linux/fw_table.h>
 
 struct irq_domain;
 struct irq_domain_ops;
@@ -25,22 +24,13 @@ struct irq_domain_ops;
 #endif
 #include <acpi/acpi.h>
 
-#ifdef CONFIG_ACPI_TABLE_LIB
-#define EXPORT_SYMBOL_ACPI_LIB(x) EXPORT_SYMBOL_NS_GPL(x, ACPI)
-#define __init_or_acpilib
-#define __initdata_or_acpilib
-#else
-#define EXPORT_SYMBOL_ACPI_LIB(x)
-#define __init_or_acpilib __init
-#define __initdata_or_acpilib __initdata
-#endif
-
 #ifdef	CONFIG_ACPI
 
 #include <linux/list.h>
 #include <linux/dynamic_debug.h>
 #include <linux/module.h>
 #include <linux/mutex.h>
+#include <linux/fw_table.h>
 
 #include <acpi/acpi_bus.h>
 #include <acpi/acpi_drivers.h>
@@ -48,6 +38,16 @@ struct irq_domain_ops;
 #include <acpi/acpi_io.h>
 #include <asm/acpi.h>
 
+#ifdef CONFIG_ACPI_TABLE_LIB
+#define EXPORT_SYMBOL_ACPI_LIB(x) EXPORT_SYMBOL_NS_GPL(x, ACPI)
+#define __init_or_acpilib
+#define __initdata_or_acpilib
+#else
+#define EXPORT_SYMBOL_ACPI_LIB(x)
+#define __init_or_acpilib __init
+#define __initdata_or_acpilib __initdata
+#endif
+
 static inline acpi_handle acpi_device_handle(struct acpi_device *adev)
 {
 	return adev ? adev->handle : NULL;
diff --git a/include/linux/fw_table.h b/include/linux/fw_table.h
index ff8fa58d5818..ca49947f0a77 100644
--- a/include/linux/fw_table.h
+++ b/include/linux/fw_table.h
@@ -25,9 +25,6 @@ struct acpi_subtable_proc {
 	int count;
 };
 
-#include <linux/acpi.h>
-#include <acpi/acpi.h>
-
 union acpi_subtable_headers {
 	struct acpi_subtable_header common;
 	struct acpi_hmat_structure hmat;
diff --git a/lib/fw_table.c b/lib/fw_table.c
index b51f30a28e47..294df54e33b6 100644
--- a/lib/fw_table.c
+++ b/lib/fw_table.c
@@ -7,7 +7,7 @@
  *  Copyright (C) 2023 Intel Corp.
  */
 #include <linux/errno.h>
-#include <linux/fw_table.h>
+#include <linux/acpi.h>
 #include <linux/init.h>
 #include <linux/kernel.h>
 #include <linux/string.h>
-- 
cgit v1.2.3


From 21f4c443731fdb064c0dd31a743aafd0b075156c Mon Sep 17 00:00:00 2001
From: Krzysztof Kozlowski <krzysztof.kozlowski@linaro.org>
Date: Mon, 20 Nov 2023 18:47:20 +0100
Subject: soundwire: stream: constify sdw_port_config when adding devices

sdw_stream_add_master() and sdw_stream_add_slave() do not modify
contents of passed sdw_port_config, so it can be made const for code
safety and as documentation of expected usage.

Signed-off-by: Krzysztof Kozlowski <krzysztof.kozlowski@linaro.org>
Reviewed-by: Pierre-Louis Bossart <pierre-louis.bossart@linux.intel.com>
Link: https://lore.kernel.org/r/20231120174720.239610-1-krzysztof.kozlowski@linaro.org
Signed-off-by: Vinod Koul <vkoul@kernel.org>
---
 drivers/soundwire/stream.c    | 10 +++++-----
 include/linux/soundwire/sdw.h |  6 +++---
 2 files changed, 8 insertions(+), 8 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/soundwire/stream.c b/drivers/soundwire/stream.c
index 69719b335bcb..9dc6399f206a 100644
--- a/drivers/soundwire/stream.c
+++ b/drivers/soundwire/stream.c
@@ -897,7 +897,7 @@ static struct sdw_port_runtime *sdw_port_alloc(struct list_head *port_list)
 }
 
 static int sdw_port_config(struct sdw_port_runtime *p_rt,
-			   struct sdw_port_config *port_config,
+			   const struct sdw_port_config *port_config,
 			   int port_index)
 {
 	p_rt->ch_mask = port_config[port_index].ch_mask;
@@ -970,7 +970,7 @@ static int sdw_slave_port_is_valid_range(struct device *dev, int num)
 
 static int sdw_slave_port_config(struct sdw_slave *slave,
 				 struct sdw_slave_runtime *s_rt,
-				 struct sdw_port_config *port_config)
+				 const struct sdw_port_config *port_config)
 {
 	struct sdw_port_runtime *p_rt;
 	int ret;
@@ -1026,7 +1026,7 @@ static int sdw_master_port_alloc(struct sdw_master_runtime *m_rt,
 }
 
 static int sdw_master_port_config(struct sdw_master_runtime *m_rt,
-				  struct sdw_port_config *port_config)
+				  const struct sdw_port_config *port_config)
 {
 	struct sdw_port_runtime *p_rt;
 	int ret;
@@ -1861,7 +1861,7 @@ EXPORT_SYMBOL(sdw_release_stream);
  */
 int sdw_stream_add_master(struct sdw_bus *bus,
 			  struct sdw_stream_config *stream_config,
-			  struct sdw_port_config *port_config,
+			  const struct sdw_port_config *port_config,
 			  unsigned int num_ports,
 			  struct sdw_stream_runtime *stream)
 {
@@ -1981,7 +1981,7 @@ EXPORT_SYMBOL(sdw_stream_remove_master);
  */
 int sdw_stream_add_slave(struct sdw_slave *slave,
 			 struct sdw_stream_config *stream_config,
-			 struct sdw_port_config *port_config,
+			 const struct sdw_port_config *port_config,
 			 unsigned int num_ports,
 			 struct sdw_stream_runtime *stream)
 {
diff --git a/include/linux/soundwire/sdw.h b/include/linux/soundwire/sdw.h
index 4f3d14bb1538..904004d8b562 100644
--- a/include/linux/soundwire/sdw.h
+++ b/include/linux/soundwire/sdw.h
@@ -1040,7 +1040,7 @@ int sdw_compute_params(struct sdw_bus *bus);
 
 int sdw_stream_add_master(struct sdw_bus *bus,
 		struct sdw_stream_config *stream_config,
-		struct sdw_port_config *port_config,
+		const struct sdw_port_config *port_config,
 		unsigned int num_ports,
 		struct sdw_stream_runtime *stream);
 int sdw_stream_remove_master(struct sdw_bus *bus,
@@ -1062,7 +1062,7 @@ void sdw_extract_slave_id(struct sdw_bus *bus, u64 addr, struct sdw_slave_id *id
 
 int sdw_stream_add_slave(struct sdw_slave *slave,
 			 struct sdw_stream_config *stream_config,
-			 struct sdw_port_config *port_config,
+			 const struct sdw_port_config *port_config,
 			 unsigned int num_ports,
 			 struct sdw_stream_runtime *stream);
 int sdw_stream_remove_slave(struct sdw_slave *slave,
@@ -1084,7 +1084,7 @@ int sdw_update_no_pm(struct sdw_slave *slave, u32 addr, u8 mask, u8 val);
 
 static inline int sdw_stream_add_slave(struct sdw_slave *slave,
 				       struct sdw_stream_config *stream_config,
-				       struct sdw_port_config *port_config,
+				       const struct sdw_port_config *port_config,
 				       unsigned int num_ports,
 				       struct sdw_stream_runtime *stream)
 {
-- 
cgit v1.2.3


From 66b73e9a402d822723b3af5263eb12d735628eac Mon Sep 17 00:00:00 2001
From: Matt Coster <matt.coster@imgtec.com>
Date: Wed, 22 Nov 2023 16:34:22 +0000
Subject: sizes.h: Add entries between SZ_32G and SZ_64T

sizes.h has a gap in defines between SZ_32G and SZ_64T. Add the missing
defines so they can be used in drivers.

Signed-off-by: Matt Coster <matt.coster@imgtec.com>
Signed-off-by: Sarah Walker <sarah.walker@imgtec.com>
Signed-off-by: Donald Robson <donald.robson@imgtec.com>
Reviewed-by: Linus Walleij <linus.walleij@linaro.org>
Link: https://lore.kernel.org/r/58b227d96f27859b453caf0ceaaac81a6616304b.1700668843.git.donald.robson@imgtec.com
Signed-off-by: Maxime Ripard <mripard@kernel.org>
---
 include/linux/sizes.h | 9 +++++++++
 1 file changed, 9 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/sizes.h b/include/linux/sizes.h
index 84aa448d8bb3..c3a00b967d18 100644
--- a/include/linux/sizes.h
+++ b/include/linux/sizes.h
@@ -47,8 +47,17 @@
 #define SZ_8G				_AC(0x200000000, ULL)
 #define SZ_16G				_AC(0x400000000, ULL)
 #define SZ_32G				_AC(0x800000000, ULL)
+#define SZ_64G				_AC(0x1000000000, ULL)
+#define SZ_128G				_AC(0x2000000000, ULL)
+#define SZ_256G				_AC(0x4000000000, ULL)
+#define SZ_512G				_AC(0x8000000000, ULL)
 
 #define SZ_1T				_AC(0x10000000000, ULL)
+#define SZ_2T				_AC(0x20000000000, ULL)
+#define SZ_4T				_AC(0x40000000000, ULL)
+#define SZ_8T				_AC(0x80000000000, ULL)
+#define SZ_16T				_AC(0x100000000000, ULL)
+#define SZ_32T				_AC(0x200000000000, ULL)
 #define SZ_64T				_AC(0x400000000000, ULL)
 
 #endif /* __LINUX_SIZES_H__ */
-- 
cgit v1.2.3


From 57b79ac9f43dc71fc8b55af51d1c9f469cb7a0de Mon Sep 17 00:00:00 2001
From: Hector Martin <marcan@marcan.st>
Date: Thu, 5 May 2022 01:25:16 +0900
Subject: soc: apple: rtkit: Get rid of apple_rtkit_send_message_wait

It is fundamentally broken and has no users. Just remove it.

Acked-by: Eric Curtin <ecurtin@redhat.com>
Acked-by: Neal Gompa <neal@gompa.dev>
Acked-by: Alyssa Rosenzweig <alyssa@rosenzweig.io>
Signed-off-by: Hector Martin <marcan@marcan.st>
---
 drivers/soc/apple/rtkit.c       | 32 --------------------------------
 include/linux/soc/apple/rtkit.h | 18 ------------------
 2 files changed, 50 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/soc/apple/rtkit.c b/drivers/soc/apple/rtkit.c
index d9f19dc99da5..7c9b9f25bbc1 100644
--- a/drivers/soc/apple/rtkit.c
+++ b/drivers/soc/apple/rtkit.c
@@ -641,38 +641,6 @@ int apple_rtkit_send_message(struct apple_rtkit *rtk, u8 ep, u64 message,
 }
 EXPORT_SYMBOL_GPL(apple_rtkit_send_message);
 
-int apple_rtkit_send_message_wait(struct apple_rtkit *rtk, u8 ep, u64 message,
-				  unsigned long timeout, bool atomic)
-{
-	DECLARE_COMPLETION_ONSTACK(completion);
-	int ret;
-	long t;
-
-	ret = apple_rtkit_send_message(rtk, ep, message, &completion, atomic);
-	if (ret < 0)
-		return ret;
-
-	if (atomic) {
-		ret = mbox_flush(rtk->mbox_chan, timeout);
-		if (ret < 0)
-			return ret;
-
-		if (try_wait_for_completion(&completion))
-			return 0;
-
-		return -ETIME;
-	} else {
-		t = wait_for_completion_interruptible_timeout(
-			&completion, msecs_to_jiffies(timeout));
-		if (t < 0)
-			return t;
-		else if (t == 0)
-			return -ETIME;
-		return 0;
-	}
-}
-EXPORT_SYMBOL_GPL(apple_rtkit_send_message_wait);
-
 int apple_rtkit_poll(struct apple_rtkit *rtk)
 {
 	return mbox_client_peek_data(rtk->mbox_chan);
diff --git a/include/linux/soc/apple/rtkit.h b/include/linux/soc/apple/rtkit.h
index fc456f75c131..8c9ca857ccf6 100644
--- a/include/linux/soc/apple/rtkit.h
+++ b/include/linux/soc/apple/rtkit.h
@@ -160,24 +160,6 @@ int apple_rtkit_start_ep(struct apple_rtkit *rtk, u8 endpoint);
 int apple_rtkit_send_message(struct apple_rtkit *rtk, u8 ep, u64 message,
 			     struct completion *completion, bool atomic);
 
-/*
- * Send a message to the given endpoint and wait until it has been submitted
- * to the hardware FIFO.
- * Will return zero on success and a negative error code on failure
- * (e.g. -ETIME when the message couldn't be written within the given
- * timeout)
- *
- * @rtk:            RTKit reference
- * @ep:             target endpoint
- * @message:        message to be sent
- * @timeout:        timeout in milliseconds to allow the message transmission
- *                  to be completed
- * @atomic:         if set to true this function can be called from atomic
- *                  context.
- */
-int apple_rtkit_send_message_wait(struct apple_rtkit *rtk, u8 ep, u64 message,
-				  unsigned long timeout, bool atomic);
-
 /*
  * Process incoming messages in atomic context.
  * This only guarantees that messages arrive as far as the recv_message_early
-- 
cgit v1.2.3


From 143897c4fa976d02bfafe5ae32b9ffc60dc6145a Mon Sep 17 00:00:00 2001
From: Hector Martin <marcan@marcan.st>
Date: Tue, 14 Mar 2023 19:47:32 +0900
Subject: mailbox: apple: Delete driver

This driver is now orphaned and superseded by
drivers/soc/apple/mailbox.c.

Acked-by: Eric Curtin <ecurtin@redhat.com>
Acked-by: Neal Gompa <neal@gompa.dev>
Acked-by: Alyssa Rosenzweig <alyssa@rosenzweig.io>
Signed-off-by: Hector Martin <marcan@marcan.st>
---
 MAINTAINERS                     |   2 -
 drivers/mailbox/Kconfig         |  12 --
 drivers/mailbox/Makefile        |   2 -
 drivers/mailbox/apple-mailbox.c | 441 ----------------------------------------
 include/linux/apple-mailbox.h   |  19 --
 5 files changed, 476 deletions(-)
 delete mode 100644 drivers/mailbox/apple-mailbox.c
 delete mode 100644 include/linux/apple-mailbox.h

(limited to 'include/linux')

diff --git a/MAINTAINERS b/MAINTAINERS
index 97f51d5ec1cf..c57c011dbd8c 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -1943,7 +1943,6 @@ F:	drivers/i2c/busses/i2c-pasemi-platform.c
 F:	drivers/iommu/apple-dart.c
 F:	drivers/iommu/io-pgtable-dart.c
 F:	drivers/irqchip/irq-apple-aic.c
-F:	drivers/mailbox/apple-mailbox.c
 F:	drivers/nvme/host/apple.c
 F:	drivers/nvmem/apple-efuses.c
 F:	drivers/pinctrl/pinctrl-apple-gpio.c
@@ -1952,7 +1951,6 @@ F:	drivers/soc/apple/*
 F:	drivers/watchdog/apple_wdt.c
 F:	include/dt-bindings/interrupt-controller/apple-aic.h
 F:	include/dt-bindings/pinctrl/apple.h
-F:	include/linux/apple-mailbox.h
 F:	include/linux/soc/apple/*
 
 ARM/ARTPEC MACHINE SUPPORT
diff --git a/drivers/mailbox/Kconfig b/drivers/mailbox/Kconfig
index bc2e265cb02d..42940108a187 100644
--- a/drivers/mailbox/Kconfig
+++ b/drivers/mailbox/Kconfig
@@ -8,18 +8,6 @@ menuconfig MAILBOX
 
 if MAILBOX
 
-config APPLE_MAILBOX
-	tristate "Apple Mailbox driver"
-	depends on ARCH_APPLE || (ARM64 && COMPILE_TEST)
-	default ARCH_APPLE
-	help
-	  Apple SoCs have various co-processors required for certain
-	  peripherals to work (NVMe, display controller, etc.). This
-	  driver adds support for the mailbox controller used to
-	  communicate with those.
-
-	  Say Y here if you have a Apple SoC.
-
 config ARM_MHU
 	tristate "ARM MHU Mailbox"
 	depends on ARM_AMBA
diff --git a/drivers/mailbox/Makefile b/drivers/mailbox/Makefile
index fc9376117111..18793e6caa2f 100644
--- a/drivers/mailbox/Makefile
+++ b/drivers/mailbox/Makefile
@@ -60,5 +60,3 @@ obj-$(CONFIG_SUN6I_MSGBOX)	+= sun6i-msgbox.o
 obj-$(CONFIG_SPRD_MBOX)		+= sprd-mailbox.o
 
 obj-$(CONFIG_QCOM_IPCC)		+= qcom-ipcc.o
-
-obj-$(CONFIG_APPLE_MAILBOX)	+= apple-mailbox.o
diff --git a/drivers/mailbox/apple-mailbox.c b/drivers/mailbox/apple-mailbox.c
deleted file mode 100644
index 2a3e8d8ff8b5..000000000000
--- a/drivers/mailbox/apple-mailbox.c
+++ /dev/null
@@ -1,441 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only OR MIT
-/*
- * Apple mailbox driver
- *
- * Copyright (C) 2021 The Asahi Linux Contributors
- *
- * This driver adds support for two mailbox variants (called ASC and M3 by
- * Apple) found in Apple SoCs such as the M1. It consists of two FIFOs used to
- * exchange 64+32 bit messages between the main CPU and a co-processor.
- * Various coprocessors implement different IPC protocols based on these simple
- * messages and shared memory buffers.
- *
- * Both the main CPU and the co-processor see the same set of registers but
- * the first FIFO (A2I) is always used to transfer messages from the application
- * processor (us) to the I/O processor and the second one (I2A) for the
- * other direction.
- */
-
-#include <linux/apple-mailbox.h>
-#include <linux/delay.h>
-#include <linux/device.h>
-#include <linux/gfp.h>
-#include <linux/interrupt.h>
-#include <linux/io.h>
-#include <linux/mailbox_controller.h>
-#include <linux/module.h>
-#include <linux/of.h>
-#include <linux/platform_device.h>
-#include <linux/spinlock.h>
-#include <linux/types.h>
-
-#define APPLE_ASC_MBOX_CONTROL_FULL  BIT(16)
-#define APPLE_ASC_MBOX_CONTROL_EMPTY BIT(17)
-
-#define APPLE_ASC_MBOX_A2I_CONTROL 0x110
-#define APPLE_ASC_MBOX_A2I_SEND0   0x800
-#define APPLE_ASC_MBOX_A2I_SEND1   0x808
-#define APPLE_ASC_MBOX_A2I_RECV0   0x810
-#define APPLE_ASC_MBOX_A2I_RECV1   0x818
-
-#define APPLE_ASC_MBOX_I2A_CONTROL 0x114
-#define APPLE_ASC_MBOX_I2A_SEND0   0x820
-#define APPLE_ASC_MBOX_I2A_SEND1   0x828
-#define APPLE_ASC_MBOX_I2A_RECV0   0x830
-#define APPLE_ASC_MBOX_I2A_RECV1   0x838
-
-#define APPLE_M3_MBOX_CONTROL_FULL  BIT(16)
-#define APPLE_M3_MBOX_CONTROL_EMPTY BIT(17)
-
-#define APPLE_M3_MBOX_A2I_CONTROL 0x50
-#define APPLE_M3_MBOX_A2I_SEND0	  0x60
-#define APPLE_M3_MBOX_A2I_SEND1	  0x68
-#define APPLE_M3_MBOX_A2I_RECV0	  0x70
-#define APPLE_M3_MBOX_A2I_RECV1	  0x78
-
-#define APPLE_M3_MBOX_I2A_CONTROL 0x80
-#define APPLE_M3_MBOX_I2A_SEND0	  0x90
-#define APPLE_M3_MBOX_I2A_SEND1	  0x98
-#define APPLE_M3_MBOX_I2A_RECV0	  0xa0
-#define APPLE_M3_MBOX_I2A_RECV1	  0xa8
-
-#define APPLE_M3_MBOX_IRQ_ENABLE	0x48
-#define APPLE_M3_MBOX_IRQ_ACK		0x4c
-#define APPLE_M3_MBOX_IRQ_A2I_EMPTY	BIT(0)
-#define APPLE_M3_MBOX_IRQ_A2I_NOT_EMPTY BIT(1)
-#define APPLE_M3_MBOX_IRQ_I2A_EMPTY	BIT(2)
-#define APPLE_M3_MBOX_IRQ_I2A_NOT_EMPTY BIT(3)
-
-#define APPLE_MBOX_MSG1_OUTCNT GENMASK(56, 52)
-#define APPLE_MBOX_MSG1_INCNT  GENMASK(51, 48)
-#define APPLE_MBOX_MSG1_OUTPTR GENMASK(47, 44)
-#define APPLE_MBOX_MSG1_INPTR  GENMASK(43, 40)
-#define APPLE_MBOX_MSG1_MSG    GENMASK(31, 0)
-
-struct apple_mbox_hw {
-	unsigned int control_full;
-	unsigned int control_empty;
-
-	unsigned int a2i_control;
-	unsigned int a2i_send0;
-	unsigned int a2i_send1;
-
-	unsigned int i2a_control;
-	unsigned int i2a_recv0;
-	unsigned int i2a_recv1;
-
-	bool has_irq_controls;
-	unsigned int irq_enable;
-	unsigned int irq_ack;
-	unsigned int irq_bit_recv_not_empty;
-	unsigned int irq_bit_send_empty;
-};
-
-struct apple_mbox {
-	void __iomem *regs;
-	const struct apple_mbox_hw *hw;
-
-	int irq_recv_not_empty;
-	int irq_send_empty;
-
-	struct mbox_chan chan;
-
-	struct device *dev;
-	struct mbox_controller controller;
-	spinlock_t rx_lock;
-};
-
-static const struct of_device_id apple_mbox_of_match[];
-
-static bool apple_mbox_hw_can_send(struct apple_mbox *apple_mbox)
-{
-	u32 mbox_ctrl =
-		readl_relaxed(apple_mbox->regs + apple_mbox->hw->a2i_control);
-
-	return !(mbox_ctrl & apple_mbox->hw->control_full);
-}
-
-static bool apple_mbox_hw_send_empty(struct apple_mbox *apple_mbox)
-{
-	u32 mbox_ctrl =
-		readl_relaxed(apple_mbox->regs + apple_mbox->hw->a2i_control);
-
-	return mbox_ctrl & apple_mbox->hw->control_empty;
-}
-
-static int apple_mbox_hw_send(struct apple_mbox *apple_mbox,
-			      struct apple_mbox_msg *msg)
-{
-	if (!apple_mbox_hw_can_send(apple_mbox))
-		return -EBUSY;
-
-	dev_dbg(apple_mbox->dev, "> TX %016llx %08x\n", msg->msg0, msg->msg1);
-
-	writeq_relaxed(msg->msg0, apple_mbox->regs + apple_mbox->hw->a2i_send0);
-	writeq_relaxed(FIELD_PREP(APPLE_MBOX_MSG1_MSG, msg->msg1),
-		       apple_mbox->regs + apple_mbox->hw->a2i_send1);
-
-	return 0;
-}
-
-static bool apple_mbox_hw_can_recv(struct apple_mbox *apple_mbox)
-{
-	u32 mbox_ctrl =
-		readl_relaxed(apple_mbox->regs + apple_mbox->hw->i2a_control);
-
-	return !(mbox_ctrl & apple_mbox->hw->control_empty);
-}
-
-static int apple_mbox_hw_recv(struct apple_mbox *apple_mbox,
-			      struct apple_mbox_msg *msg)
-{
-	if (!apple_mbox_hw_can_recv(apple_mbox))
-		return -ENOMSG;
-
-	msg->msg0 = readq_relaxed(apple_mbox->regs + apple_mbox->hw->i2a_recv0);
-	msg->msg1 = FIELD_GET(
-		APPLE_MBOX_MSG1_MSG,
-		readq_relaxed(apple_mbox->regs + apple_mbox->hw->i2a_recv1));
-
-	dev_dbg(apple_mbox->dev, "< RX %016llx %08x\n", msg->msg0, msg->msg1);
-
-	return 0;
-}
-
-static int apple_mbox_chan_send_data(struct mbox_chan *chan, void *data)
-{
-	struct apple_mbox *apple_mbox = chan->con_priv;
-	struct apple_mbox_msg *msg = data;
-	int ret;
-
-	ret = apple_mbox_hw_send(apple_mbox, msg);
-	if (ret)
-		return ret;
-
-	/*
-	 * The interrupt is level triggered and will keep firing as long as the
-	 * FIFO is empty. It will also keep firing if the FIFO was empty
-	 * at any point in the past until it has been acknowledged at the
-	 * mailbox level. By acknowledging it here we can ensure that we will
-	 * only get the interrupt once the FIFO has been cleared again.
-	 * If the FIFO is already empty before the ack it will fire again
-	 * immediately after the ack.
-	 */
-	if (apple_mbox->hw->has_irq_controls) {
-		writel_relaxed(apple_mbox->hw->irq_bit_send_empty,
-			       apple_mbox->regs + apple_mbox->hw->irq_ack);
-	}
-	enable_irq(apple_mbox->irq_send_empty);
-
-	return 0;
-}
-
-static irqreturn_t apple_mbox_send_empty_irq(int irq, void *data)
-{
-	struct apple_mbox *apple_mbox = data;
-
-	/*
-	 * We don't need to acknowledge the interrupt at the mailbox level
-	 * here even if supported by the hardware. It will keep firing but that
-	 * doesn't matter since it's disabled at the main interrupt controller.
-	 * apple_mbox_chan_send_data will acknowledge it before enabling
-	 * it at the main controller again.
-	 */
-	disable_irq_nosync(apple_mbox->irq_send_empty);
-	mbox_chan_txdone(&apple_mbox->chan, 0);
-	return IRQ_HANDLED;
-}
-
-static int apple_mbox_poll(struct apple_mbox *apple_mbox)
-{
-	struct apple_mbox_msg msg;
-	int ret = 0;
-
-	while (apple_mbox_hw_recv(apple_mbox, &msg) == 0) {
-		mbox_chan_received_data(&apple_mbox->chan, (void *)&msg);
-		ret++;
-	}
-
-	/*
-	 * The interrupt will keep firing even if there are no more messages
-	 * unless we also acknowledge it at the mailbox level here.
-	 * There's no race if a message comes in between the check in the while
-	 * loop above and the ack below: If a new messages arrives inbetween
-	 * those two the interrupt will just fire again immediately after the
-	 * ack since it's level triggered.
-	 */
-	if (apple_mbox->hw->has_irq_controls) {
-		writel_relaxed(apple_mbox->hw->irq_bit_recv_not_empty,
-			       apple_mbox->regs + apple_mbox->hw->irq_ack);
-	}
-
-	return ret;
-}
-
-static irqreturn_t apple_mbox_recv_irq(int irq, void *data)
-{
-	struct apple_mbox *apple_mbox = data;
-
-	spin_lock(&apple_mbox->rx_lock);
-	apple_mbox_poll(apple_mbox);
-	spin_unlock(&apple_mbox->rx_lock);
-
-	return IRQ_HANDLED;
-}
-
-static bool apple_mbox_chan_peek_data(struct mbox_chan *chan)
-{
-	struct apple_mbox *apple_mbox = chan->con_priv;
-	unsigned long flags;
-	int ret;
-
-	spin_lock_irqsave(&apple_mbox->rx_lock, flags);
-	ret = apple_mbox_poll(apple_mbox);
-	spin_unlock_irqrestore(&apple_mbox->rx_lock, flags);
-
-	return ret > 0;
-}
-
-static int apple_mbox_chan_flush(struct mbox_chan *chan, unsigned long timeout)
-{
-	struct apple_mbox *apple_mbox = chan->con_priv;
-	unsigned long deadline = jiffies + msecs_to_jiffies(timeout);
-
-	while (time_before(jiffies, deadline)) {
-		if (apple_mbox_hw_send_empty(apple_mbox)) {
-			mbox_chan_txdone(&apple_mbox->chan, 0);
-			return 0;
-		}
-
-		udelay(1);
-	}
-
-	return -ETIME;
-}
-
-static int apple_mbox_chan_startup(struct mbox_chan *chan)
-{
-	struct apple_mbox *apple_mbox = chan->con_priv;
-
-	/*
-	 * Only some variants of this mailbox HW provide interrupt control
-	 * at the mailbox level. We therefore need to handle enabling/disabling
-	 * interrupts at the main interrupt controller anyway for hardware that
-	 * doesn't. Just always keep the interrupts we care about enabled at
-	 * the mailbox level so that both hardware revisions behave almost
-	 * the same.
-	 */
-	if (apple_mbox->hw->has_irq_controls) {
-		writel_relaxed(apple_mbox->hw->irq_bit_recv_not_empty |
-				       apple_mbox->hw->irq_bit_send_empty,
-			       apple_mbox->regs + apple_mbox->hw->irq_enable);
-	}
-
-	enable_irq(apple_mbox->irq_recv_not_empty);
-	return 0;
-}
-
-static void apple_mbox_chan_shutdown(struct mbox_chan *chan)
-{
-	struct apple_mbox *apple_mbox = chan->con_priv;
-
-	disable_irq(apple_mbox->irq_recv_not_empty);
-}
-
-static const struct mbox_chan_ops apple_mbox_ops = {
-	.send_data = apple_mbox_chan_send_data,
-	.peek_data = apple_mbox_chan_peek_data,
-	.flush = apple_mbox_chan_flush,
-	.startup = apple_mbox_chan_startup,
-	.shutdown = apple_mbox_chan_shutdown,
-};
-
-static struct mbox_chan *apple_mbox_of_xlate(struct mbox_controller *mbox,
-					     const struct of_phandle_args *args)
-{
-	if (args->args_count != 0)
-		return ERR_PTR(-EINVAL);
-
-	return &mbox->chans[0];
-}
-
-static int apple_mbox_probe(struct platform_device *pdev)
-{
-	int ret;
-	const struct of_device_id *match;
-	char *irqname;
-	struct apple_mbox *mbox;
-	struct device *dev = &pdev->dev;
-
-	match = of_match_node(apple_mbox_of_match, pdev->dev.of_node);
-	if (!match)
-		return -EINVAL;
-	if (!match->data)
-		return -EINVAL;
-
-	mbox = devm_kzalloc(dev, sizeof(*mbox), GFP_KERNEL);
-	if (!mbox)
-		return -ENOMEM;
-	platform_set_drvdata(pdev, mbox);
-
-	mbox->dev = dev;
-	mbox->regs = devm_platform_ioremap_resource(pdev, 0);
-	if (IS_ERR(mbox->regs))
-		return PTR_ERR(mbox->regs);
-
-	mbox->hw = match->data;
-	mbox->irq_recv_not_empty =
-		platform_get_irq_byname(pdev, "recv-not-empty");
-	if (mbox->irq_recv_not_empty < 0)
-		return -ENODEV;
-
-	mbox->irq_send_empty = platform_get_irq_byname(pdev, "send-empty");
-	if (mbox->irq_send_empty < 0)
-		return -ENODEV;
-
-	mbox->controller.dev = mbox->dev;
-	mbox->controller.num_chans = 1;
-	mbox->controller.chans = &mbox->chan;
-	mbox->controller.ops = &apple_mbox_ops;
-	mbox->controller.txdone_irq = true;
-	mbox->controller.of_xlate = apple_mbox_of_xlate;
-	mbox->chan.con_priv = mbox;
-	spin_lock_init(&mbox->rx_lock);
-
-	irqname = devm_kasprintf(dev, GFP_KERNEL, "%s-recv", dev_name(dev));
-	if (!irqname)
-		return -ENOMEM;
-
-	ret = devm_request_threaded_irq(dev, mbox->irq_recv_not_empty, NULL,
-					apple_mbox_recv_irq,
-					IRQF_NO_AUTOEN | IRQF_ONESHOT, irqname,
-					mbox);
-	if (ret)
-		return ret;
-
-	irqname = devm_kasprintf(dev, GFP_KERNEL, "%s-send", dev_name(dev));
-	if (!irqname)
-		return -ENOMEM;
-
-	ret = devm_request_irq(dev, mbox->irq_send_empty,
-			       apple_mbox_send_empty_irq, IRQF_NO_AUTOEN,
-			       irqname, mbox);
-	if (ret)
-		return ret;
-
-	return devm_mbox_controller_register(dev, &mbox->controller);
-}
-
-static const struct apple_mbox_hw apple_mbox_asc_hw = {
-	.control_full = APPLE_ASC_MBOX_CONTROL_FULL,
-	.control_empty = APPLE_ASC_MBOX_CONTROL_EMPTY,
-
-	.a2i_control = APPLE_ASC_MBOX_A2I_CONTROL,
-	.a2i_send0 = APPLE_ASC_MBOX_A2I_SEND0,
-	.a2i_send1 = APPLE_ASC_MBOX_A2I_SEND1,
-
-	.i2a_control = APPLE_ASC_MBOX_I2A_CONTROL,
-	.i2a_recv0 = APPLE_ASC_MBOX_I2A_RECV0,
-	.i2a_recv1 = APPLE_ASC_MBOX_I2A_RECV1,
-
-	.has_irq_controls = false,
-};
-
-static const struct apple_mbox_hw apple_mbox_m3_hw = {
-	.control_full = APPLE_M3_MBOX_CONTROL_FULL,
-	.control_empty = APPLE_M3_MBOX_CONTROL_EMPTY,
-
-	.a2i_control = APPLE_M3_MBOX_A2I_CONTROL,
-	.a2i_send0 = APPLE_M3_MBOX_A2I_SEND0,
-	.a2i_send1 = APPLE_M3_MBOX_A2I_SEND1,
-
-	.i2a_control = APPLE_M3_MBOX_I2A_CONTROL,
-	.i2a_recv0 = APPLE_M3_MBOX_I2A_RECV0,
-	.i2a_recv1 = APPLE_M3_MBOX_I2A_RECV1,
-
-	.has_irq_controls = true,
-	.irq_enable = APPLE_M3_MBOX_IRQ_ENABLE,
-	.irq_ack = APPLE_M3_MBOX_IRQ_ACK,
-	.irq_bit_recv_not_empty = APPLE_M3_MBOX_IRQ_I2A_NOT_EMPTY,
-	.irq_bit_send_empty = APPLE_M3_MBOX_IRQ_A2I_EMPTY,
-};
-
-static const struct of_device_id apple_mbox_of_match[] = {
-	{ .compatible = "apple,asc-mailbox-v4", .data = &apple_mbox_asc_hw },
-	{ .compatible = "apple,m3-mailbox-v2", .data = &apple_mbox_m3_hw },
-	{}
-};
-MODULE_DEVICE_TABLE(of, apple_mbox_of_match);
-
-static struct platform_driver apple_mbox_driver = {
-	.driver = {
-		.name = "apple-mailbox",
-		.of_match_table = apple_mbox_of_match,
-	},
-	.probe = apple_mbox_probe,
-};
-module_platform_driver(apple_mbox_driver);
-
-MODULE_LICENSE("Dual MIT/GPL");
-MODULE_AUTHOR("Sven Peter <sven@svenpeter.dev>");
-MODULE_DESCRIPTION("Apple Mailbox driver");
diff --git a/include/linux/apple-mailbox.h b/include/linux/apple-mailbox.h
deleted file mode 100644
index 720fbb70294a..000000000000
--- a/include/linux/apple-mailbox.h
+++ /dev/null
@@ -1,19 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-only OR MIT */
-/*
- * Apple mailbox message format
- *
- * Copyright (C) 2021 The Asahi Linux Contributors
- */
-
-#ifndef _LINUX_APPLE_MAILBOX_H_
-#define _LINUX_APPLE_MAILBOX_H_
-
-#include <linux/types.h>
-
-/* encodes a single 96bit message sent over the single channel */
-struct apple_mbox_msg {
-	u64 msg0;
-	u32 msg1;
-};
-
-#endif
-- 
cgit v1.2.3


From 9c0b4bb7f6303c9c4e2e34984c46f5a86478f84d Mon Sep 17 00:00:00 2001
From: Vincent Guittot <vincent.guittot@linaro.org>
Date: Wed, 22 Nov 2023 14:39:03 +0100
Subject: sched/cpufreq: Rework schedutil governor performance estimation

The current method to take into account uclamp hints when estimating the
target frequency can end in a situation where the selected target
frequency is finally higher than uclamp hints, whereas there are no real
needs. Such cases mainly happen because we are currently mixing the
traditional scheduler utilization signal with the uclamp performance
hints. By adding these 2 metrics, we loose an important information when
it comes to select the target frequency, and we have to make some
assumptions which can't fit all cases.

Rework the interface between the scheduler and schedutil governor in order
to propagate all information down to the cpufreq governor.

effective_cpu_util() interface changes and now returns the actual
utilization of the CPU with 2 optional inputs:

- The minimum performance for this CPU; typically the capacity to handle
  the deadline task and the interrupt pressure. But also uclamp_min
  request when available.

- The maximum targeting performance for this CPU which reflects the
  maximum level that we would like to not exceed. By default it will be
  the CPU capacity but can be reduced because of some performance hints
  set with uclamp. The value can be lower than actual utilization and/or
  min performance level.

A new sugov_effective_cpu_perf() interface is also available to compute
the final performance level that is targeted for the CPU, after applying
some cpufreq headroom and taking into account all inputs.

With these 2 functions, schedutil is now able to decide when it must go
above uclamp hints. It now also has a generic way to get the min
performance level.

The dependency between energy model and cpufreq governor and its headroom
policy doesn't exist anymore.

eenv_pd_max_util() asks schedutil for the targeted performance after
applying the impact of the waking task.

[ mingo: Refined the changelog & C comments. ]

Signed-off-by: Vincent Guittot <vincent.guittot@linaro.org>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Acked-by: Rafael J. Wysocki <rafael@kernel.org>
Link: https://lore.kernel.org/r/20231122133904.446032-2-vincent.guittot@linaro.org
---
 include/linux/energy_model.h     |  1 -
 kernel/sched/core.c              | 90 +++++++++++++++++-----------------------
 kernel/sched/cpufreq_schedutil.c | 35 +++++++++++-----
 kernel/sched/fair.c              | 22 ++++++++--
 kernel/sched/sched.h             | 24 ++++-------
 5 files changed, 89 insertions(+), 83 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/energy_model.h b/include/linux/energy_model.h
index b9caa01dfac4..adec808b371a 100644
--- a/include/linux/energy_model.h
+++ b/include/linux/energy_model.h
@@ -243,7 +243,6 @@ static inline unsigned long em_cpu_energy(struct em_perf_domain *pd,
 	scale_cpu = arch_scale_cpu_capacity(cpu);
 	ps = &pd->table[pd->nr_perf_states - 1];
 
-	max_util = map_util_perf(max_util);
 	max_util = min(max_util, allowed_cpu_cap);
 	freq = map_util_freq(max_util, ps->frequency, scale_cpu);
 
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 2de77a6d5ef8..db4be4921e7f 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -7467,18 +7467,13 @@ int sched_core_idle_cpu(int cpu)
  * required to meet deadlines.
  */
 unsigned long effective_cpu_util(int cpu, unsigned long util_cfs,
-				 enum cpu_util_type type,
-				 struct task_struct *p)
+				 unsigned long *min,
+				 unsigned long *max)
 {
-	unsigned long dl_util, util, irq, max;
+	unsigned long util, irq, scale;
 	struct rq *rq = cpu_rq(cpu);
 
-	max = arch_scale_cpu_capacity(cpu);
-
-	if (!uclamp_is_used() &&
-	    type == FREQUENCY_UTIL && rt_rq_is_runnable(&rq->rt)) {
-		return max;
-	}
+	scale = arch_scale_cpu_capacity(cpu);
 
 	/*
 	 * Early check to see if IRQ/steal time saturates the CPU, can be
@@ -7486,45 +7481,49 @@ unsigned long effective_cpu_util(int cpu, unsigned long util_cfs,
 	 * update_irq_load_avg().
 	 */
 	irq = cpu_util_irq(rq);
-	if (unlikely(irq >= max))
-		return max;
+	if (unlikely(irq >= scale)) {
+		if (min)
+			*min = scale;
+		if (max)
+			*max = scale;
+		return scale;
+	}
+
+	if (min) {
+		/*
+		 * The minimum utilization returns the highest level between:
+		 * - the computed DL bandwidth needed with the IRQ pressure which
+		 *   steals time to the deadline task.
+		 * - The minimum performance requirement for CFS and/or RT.
+		 */
+		*min = max(irq + cpu_bw_dl(rq), uclamp_rq_get(rq, UCLAMP_MIN));
+
+		/*
+		 * When an RT task is runnable and uclamp is not used, we must
+		 * ensure that the task will run at maximum compute capacity.
+		 */
+		if (!uclamp_is_used() && rt_rq_is_runnable(&rq->rt))
+			*min = max(*min, scale);
+	}
 
 	/*
 	 * Because the time spend on RT/DL tasks is visible as 'lost' time to
 	 * CFS tasks and we use the same metric to track the effective
 	 * utilization (PELT windows are synchronized) we can directly add them
 	 * to obtain the CPU's actual utilization.
-	 *
-	 * CFS and RT utilization can be boosted or capped, depending on
-	 * utilization clamp constraints requested by currently RUNNABLE
-	 * tasks.
-	 * When there are no CFS RUNNABLE tasks, clamps are released and
-	 * frequency will be gracefully reduced with the utilization decay.
 	 */
 	util = util_cfs + cpu_util_rt(rq);
-	if (type == FREQUENCY_UTIL)
-		util = uclamp_rq_util_with(rq, util, p);
-
-	dl_util = cpu_util_dl(rq);
+	util += cpu_util_dl(rq);
 
 	/*
-	 * For frequency selection we do not make cpu_util_dl() a permanent part
-	 * of this sum because we want to use cpu_bw_dl() later on, but we need
-	 * to check if the CFS+RT+DL sum is saturated (ie. no idle time) such
-	 * that we select f_max when there is no idle time.
-	 *
-	 * NOTE: numerical errors or stop class might cause us to not quite hit
-	 * saturation when we should -- something for later.
+	 * The maximum hint is a soft bandwidth requirement, which can be lower
+	 * than the actual utilization because of uclamp_max requirements.
 	 */
-	if (util + dl_util >= max)
-		return max;
+	if (max)
+		*max = min(scale, uclamp_rq_get(rq, UCLAMP_MAX));
 
-	/*
-	 * OTOH, for energy computation we need the estimated running time, so
-	 * include util_dl and ignore dl_bw.
-	 */
-	if (type == ENERGY_UTIL)
-		util += dl_util;
+	if (util >= scale)
+		return scale;
 
 	/*
 	 * There is still idle time; further improve the number by using the
@@ -7535,28 +7534,15 @@ unsigned long effective_cpu_util(int cpu, unsigned long util_cfs,
 	 *   U' = irq + --------- * U
 	 *                 max
 	 */
-	util = scale_irq_capacity(util, irq, max);
+	util = scale_irq_capacity(util, irq, scale);
 	util += irq;
 
-	/*
-	 * Bandwidth required by DEADLINE must always be granted while, for
-	 * FAIR and RT, we use blocked utilization of IDLE CPUs as a mechanism
-	 * to gracefully reduce the frequency when no tasks show up for longer
-	 * periods of time.
-	 *
-	 * Ideally we would like to set bw_dl as min/guaranteed freq and util +
-	 * bw_dl as requested freq. However, cpufreq is not yet ready for such
-	 * an interface. So, we only do the latter for now.
-	 */
-	if (type == FREQUENCY_UTIL)
-		util += cpu_bw_dl(rq);
-
-	return min(max, util);
+	return min(scale, util);
 }
 
 unsigned long sched_cpu_util(int cpu)
 {
-	return effective_cpu_util(cpu, cpu_util_cfs(cpu), ENERGY_UTIL, NULL);
+	return effective_cpu_util(cpu, cpu_util_cfs(cpu), NULL, NULL);
 }
 #endif /* CONFIG_SMP */
 
diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c
index 5888176354e2..f3acf2cf26ed 100644
--- a/kernel/sched/cpufreq_schedutil.c
+++ b/kernel/sched/cpufreq_schedutil.c
@@ -47,7 +47,7 @@ struct sugov_cpu {
 	u64			last_update;
 
 	unsigned long		util;
-	unsigned long		bw_dl;
+	unsigned long		bw_min;
 
 	/* The field below is for single-CPU policies only: */
 #ifdef CONFIG_NO_HZ_COMMON
@@ -143,7 +143,6 @@ static unsigned int get_next_freq(struct sugov_policy *sg_policy,
 	unsigned int freq = arch_scale_freq_invariant() ?
 				policy->cpuinfo.max_freq : policy->cur;
 
-	util = map_util_perf(util);
 	freq = map_util_freq(util, freq, max);
 
 	if (freq == sg_policy->cached_raw_freq && !sg_policy->need_freq_update)
@@ -153,14 +152,30 @@ static unsigned int get_next_freq(struct sugov_policy *sg_policy,
 	return cpufreq_driver_resolve_freq(policy, freq);
 }
 
+unsigned long sugov_effective_cpu_perf(int cpu, unsigned long actual,
+				 unsigned long min,
+				 unsigned long max)
+{
+	/* Add dvfs headroom to actual utilization */
+	actual = map_util_perf(actual);
+	/* Actually we don't need to target the max performance */
+	if (actual < max)
+		max = actual;
+
+	/*
+	 * Ensure at least minimum performance while providing more compute
+	 * capacity when possible.
+	 */
+	return max(min, max);
+}
+
 static void sugov_get_util(struct sugov_cpu *sg_cpu)
 {
-	unsigned long util = cpu_util_cfs_boost(sg_cpu->cpu);
-	struct rq *rq = cpu_rq(sg_cpu->cpu);
+	unsigned long min, max, util = cpu_util_cfs_boost(sg_cpu->cpu);
 
-	sg_cpu->bw_dl = cpu_bw_dl(rq);
-	sg_cpu->util = effective_cpu_util(sg_cpu->cpu, util,
-					  FREQUENCY_UTIL, NULL);
+	util = effective_cpu_util(sg_cpu->cpu, util, &min, &max);
+	sg_cpu->bw_min = min;
+	sg_cpu->util = sugov_effective_cpu_perf(sg_cpu->cpu, util, min, max);
 }
 
 /**
@@ -306,7 +321,7 @@ static inline bool sugov_cpu_is_busy(struct sugov_cpu *sg_cpu) { return false; }
  */
 static inline void ignore_dl_rate_limit(struct sugov_cpu *sg_cpu)
 {
-	if (cpu_bw_dl(cpu_rq(sg_cpu->cpu)) > sg_cpu->bw_dl)
+	if (cpu_bw_dl(cpu_rq(sg_cpu->cpu)) > sg_cpu->bw_min)
 		sg_cpu->sg_policy->limits_changed = true;
 }
 
@@ -407,8 +422,8 @@ static void sugov_update_single_perf(struct update_util_data *hook, u64 time,
 	    sugov_cpu_is_busy(sg_cpu) && sg_cpu->util < prev_util)
 		sg_cpu->util = prev_util;
 
-	cpufreq_driver_adjust_perf(sg_cpu->cpu, map_util_perf(sg_cpu->bw_dl),
-				   map_util_perf(sg_cpu->util), max_cap);
+	cpufreq_driver_adjust_perf(sg_cpu->cpu, sg_cpu->bw_min,
+				   sg_cpu->util, max_cap);
 
 	sg_cpu->sg_policy->last_freq_update_time = time;
 }
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 53dea95ad8c9..34fe6e9490c2 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -7793,7 +7793,7 @@ static inline void eenv_pd_busy_time(struct energy_env *eenv,
 	for_each_cpu(cpu, pd_cpus) {
 		unsigned long util = cpu_util(cpu, p, -1, 0);
 
-		busy_time += effective_cpu_util(cpu, util, ENERGY_UTIL, NULL);
+		busy_time += effective_cpu_util(cpu, util, NULL, NULL);
 	}
 
 	eenv->pd_busy_time = min(eenv->pd_cap, busy_time);
@@ -7816,7 +7816,7 @@ eenv_pd_max_util(struct energy_env *eenv, struct cpumask *pd_cpus,
 	for_each_cpu(cpu, pd_cpus) {
 		struct task_struct *tsk = (cpu == dst_cpu) ? p : NULL;
 		unsigned long util = cpu_util(cpu, p, dst_cpu, 1);
-		unsigned long eff_util;
+		unsigned long eff_util, min, max;
 
 		/*
 		 * Performance domain frequency: utilization clamping
@@ -7825,7 +7825,23 @@ eenv_pd_max_util(struct energy_env *eenv, struct cpumask *pd_cpus,
 		 * NOTE: in case RT tasks are running, by default the
 		 * FREQUENCY_UTIL's utilization can be max OPP.
 		 */
-		eff_util = effective_cpu_util(cpu, util, FREQUENCY_UTIL, tsk);
+		eff_util = effective_cpu_util(cpu, util, &min, &max);
+
+		/* Task's uclamp can modify min and max value */
+		if (tsk && uclamp_is_used()) {
+			min = max(min, uclamp_eff_value(p, UCLAMP_MIN));
+
+			/*
+			 * If there is no active max uclamp constraint,
+			 * directly use task's one, otherwise keep max.
+			 */
+			if (uclamp_rq_is_idle(cpu_rq(cpu)))
+				max = uclamp_eff_value(p, UCLAMP_MAX);
+			else
+				max = max(max, uclamp_eff_value(p, UCLAMP_MAX));
+		}
+
+		eff_util = sugov_effective_cpu_perf(cpu, eff_util, min, max);
 		max_util = max(max_util, eff_util);
 	}
 
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 8a70d51ffa33..c1574cd388e7 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -2994,24 +2994,14 @@ static inline void cpufreq_update_util(struct rq *rq, unsigned int flags) {}
 #endif
 
 #ifdef CONFIG_SMP
-/**
- * enum cpu_util_type - CPU utilization type
- * @FREQUENCY_UTIL:	Utilization used to select frequency
- * @ENERGY_UTIL:	Utilization used during energy calculation
- *
- * The utilization signals of all scheduling classes (CFS/RT/DL) and IRQ time
- * need to be aggregated differently depending on the usage made of them. This
- * enum is used within effective_cpu_util() to differentiate the types of
- * utilization expected by the callers, and adjust the aggregation accordingly.
- */
-enum cpu_util_type {
-	FREQUENCY_UTIL,
-	ENERGY_UTIL,
-};
-
 unsigned long effective_cpu_util(int cpu, unsigned long util_cfs,
-				 enum cpu_util_type type,
-				 struct task_struct *p);
+				 unsigned long *min,
+				 unsigned long *max);
+
+unsigned long sugov_effective_cpu_perf(int cpu, unsigned long actual,
+				 unsigned long min,
+				 unsigned long max);
+
 
 /*
  * Verify the fitness of task @p to run on @cpu taking into account the
-- 
cgit v1.2.3


From 64bac5ea17d527872121adddfee869c7a0618f8f Mon Sep 17 00:00:00 2001
From: Arnd Bergmann <arnd@arndb.de>
Date: Wed, 8 Nov 2023 13:58:29 +0100
Subject: arch: consolidate arch_irq_work_raise prototypes

The prototype was hidden in an #ifdef on x86, which causes a warning:

kernel/irq_work.c:72:13: error: no previous prototype for 'arch_irq_work_raise' [-Werror=missing-prototypes]

Some architectures have a working prototype, while others don't.
Fix this by providing it in only one place that is always visible.

Reviewed-by: Alexander Gordeev <agordeev@linux.ibm.com>
Acked-by: Catalin Marinas <catalin.marinas@arm.com>
Acked-by: Palmer Dabbelt <palmer@rivosinc.com>
Acked-by: Guo Ren <guoren@kernel.org>
Signed-off-by: Arnd Bergmann <arnd@arndb.de>
---
 arch/arm/include/asm/irq_work.h     | 2 --
 arch/arm64/include/asm/irq_work.h   | 2 --
 arch/csky/include/asm/irq_work.h    | 2 +-
 arch/powerpc/include/asm/irq_work.h | 1 -
 arch/riscv/include/asm/irq_work.h   | 2 +-
 arch/s390/include/asm/irq_work.h    | 2 --
 arch/x86/include/asm/irq_work.h     | 1 -
 include/linux/irq_work.h            | 3 +++
 8 files changed, 5 insertions(+), 10 deletions(-)

(limited to 'include/linux')

diff --git a/arch/arm/include/asm/irq_work.h b/arch/arm/include/asm/irq_work.h
index 3149e4dc1b54..8895999834cc 100644
--- a/arch/arm/include/asm/irq_work.h
+++ b/arch/arm/include/asm/irq_work.h
@@ -9,6 +9,4 @@ static inline bool arch_irq_work_has_interrupt(void)
 	return is_smp();
 }
 
-extern void arch_irq_work_raise(void);
-
 #endif /* _ASM_ARM_IRQ_WORK_H */
diff --git a/arch/arm64/include/asm/irq_work.h b/arch/arm64/include/asm/irq_work.h
index 81bbfa3a035b..a1020285ea75 100644
--- a/arch/arm64/include/asm/irq_work.h
+++ b/arch/arm64/include/asm/irq_work.h
@@ -2,8 +2,6 @@
 #ifndef __ASM_IRQ_WORK_H
 #define __ASM_IRQ_WORK_H
 
-extern void arch_irq_work_raise(void);
-
 static inline bool arch_irq_work_has_interrupt(void)
 {
 	return true;
diff --git a/arch/csky/include/asm/irq_work.h b/arch/csky/include/asm/irq_work.h
index 33aaf39d6f94..d39fcc1f5395 100644
--- a/arch/csky/include/asm/irq_work.h
+++ b/arch/csky/include/asm/irq_work.h
@@ -7,5 +7,5 @@ static inline bool arch_irq_work_has_interrupt(void)
 {
 	return true;
 }
-extern void arch_irq_work_raise(void);
+
 #endif /* __ASM_CSKY_IRQ_WORK_H */
diff --git a/arch/powerpc/include/asm/irq_work.h b/arch/powerpc/include/asm/irq_work.h
index b8b0be8f1a07..c6d3078bd8c3 100644
--- a/arch/powerpc/include/asm/irq_work.h
+++ b/arch/powerpc/include/asm/irq_work.h
@@ -6,6 +6,5 @@ static inline bool arch_irq_work_has_interrupt(void)
 {
 	return true;
 }
-extern void arch_irq_work_raise(void);
 
 #endif /* _ASM_POWERPC_IRQ_WORK_H */
diff --git a/arch/riscv/include/asm/irq_work.h b/arch/riscv/include/asm/irq_work.h
index b53891964ae0..b27a4d64fc6a 100644
--- a/arch/riscv/include/asm/irq_work.h
+++ b/arch/riscv/include/asm/irq_work.h
@@ -6,5 +6,5 @@ static inline bool arch_irq_work_has_interrupt(void)
 {
 	return IS_ENABLED(CONFIG_SMP);
 }
-extern void arch_irq_work_raise(void);
+
 #endif /* _ASM_RISCV_IRQ_WORK_H */
diff --git a/arch/s390/include/asm/irq_work.h b/arch/s390/include/asm/irq_work.h
index 603783766d0a..f00c9f610d5a 100644
--- a/arch/s390/include/asm/irq_work.h
+++ b/arch/s390/include/asm/irq_work.h
@@ -7,6 +7,4 @@ static inline bool arch_irq_work_has_interrupt(void)
 	return true;
 }
 
-void arch_irq_work_raise(void);
-
 #endif /* _ASM_S390_IRQ_WORK_H */
diff --git a/arch/x86/include/asm/irq_work.h b/arch/x86/include/asm/irq_work.h
index 800ffce0db29..6b4d36c95165 100644
--- a/arch/x86/include/asm/irq_work.h
+++ b/arch/x86/include/asm/irq_work.h
@@ -9,7 +9,6 @@ static inline bool arch_irq_work_has_interrupt(void)
 {
 	return boot_cpu_has(X86_FEATURE_APIC);
 }
-extern void arch_irq_work_raise(void);
 #else
 static inline bool arch_irq_work_has_interrupt(void)
 {
diff --git a/include/linux/irq_work.h b/include/linux/irq_work.h
index 8cd11a223260..136f2980cba3 100644
--- a/include/linux/irq_work.h
+++ b/include/linux/irq_work.h
@@ -66,6 +66,9 @@ void irq_work_sync(struct irq_work *work);
 void irq_work_run(void);
 bool irq_work_needs_cpu(void);
 void irq_work_single(void *arg);
+
+void arch_irq_work_raise(void);
+
 #else
 static inline bool irq_work_needs_cpu(void) { return false; }
 static inline void irq_work_run(void) { }
-- 
cgit v1.2.3


From a769154c7cac037914ba375ae88aae55b2c853e0 Mon Sep 17 00:00:00 2001
From: Hardik Gajjar <hgajjar@de.adit-jv.com>
Date: Fri, 27 Oct 2023 17:20:28 +0200
Subject: usb: xhci: Add timeout argument in address_device USB HCD callback

- The HCD address_device callback now accepts a user-defined timeout value
  in milliseconds, providing better control over command execution times.
- The default timeout value for the address_device command has been set
  to 5000 ms, aligning with the USB 3.2 specification. However, this
  timeout can be adjusted as needed.
- The xhci_setup_device function has been updated to accept the timeout
  value, allowing it to specify the maximum wait time for the command
  operation to complete.
- The hub driver has also been updated to accommodate the newly added
  timeout parameter during the SET_ADDRESS request.

Signed-off-by: Hardik Gajjar <hgajjar@de.adit-jv.com>
Reviewed-by: Mathias Nyman <mathias.nyman@linux.intel.com>
Link: https://lore.kernel.org/r/20231027152029.104363-1-hgajjar@de.adit-jv.com
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/usb/core/hub.c       |  2 +-
 drivers/usb/host/xhci-mem.c  |  2 ++
 drivers/usb/host/xhci-ring.c | 11 ++++++-----
 drivers/usb/host/xhci.c      | 23 ++++++++++++++++-------
 drivers/usb/host/xhci.h      |  9 +++++++--
 include/linux/usb/hcd.h      |  5 +++--
 6 files changed, 35 insertions(+), 17 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/usb/core/hub.c b/drivers/usb/core/hub.c
index b4584a0cd484..049058264367 100644
--- a/drivers/usb/core/hub.c
+++ b/drivers/usb/core/hub.c
@@ -4662,7 +4662,7 @@ static int hub_set_address(struct usb_device *udev, int devnum)
 	if (udev->state != USB_STATE_DEFAULT)
 		return -EINVAL;
 	if (hcd->driver->address_device)
-		retval = hcd->driver->address_device(hcd, udev);
+		retval = hcd->driver->address_device(hcd, udev, USB_CTRL_SET_TIMEOUT);
 	else
 		retval = usb_control_msg(udev, usb_sndaddr0pipe(),
 				USB_REQ_SET_ADDRESS, 0, devnum, 0,
diff --git a/drivers/usb/host/xhci-mem.c b/drivers/usb/host/xhci-mem.c
index 62116586848b..6faa854152ef 100644
--- a/drivers/usb/host/xhci-mem.c
+++ b/drivers/usb/host/xhci-mem.c
@@ -1739,6 +1739,8 @@ struct xhci_command *xhci_alloc_command(struct xhci_hcd *xhci,
 	}
 
 	command->status = 0;
+	/* set default timeout to 5000 ms */
+	command->timeout_ms = XHCI_CMD_DEFAULT_TIMEOUT;
 	INIT_LIST_HEAD(&command->cmd_list);
 	return command;
 }
diff --git a/drivers/usb/host/xhci-ring.c b/drivers/usb/host/xhci-ring.c
index f3b5e6345858..2c1d614b3b0f 100644
--- a/drivers/usb/host/xhci-ring.c
+++ b/drivers/usb/host/xhci-ring.c
@@ -366,9 +366,10 @@ void xhci_ring_cmd_db(struct xhci_hcd *xhci)
 	readl(&xhci->dba->doorbell[0]);
 }
 
-static bool xhci_mod_cmd_timer(struct xhci_hcd *xhci, unsigned long delay)
+static bool xhci_mod_cmd_timer(struct xhci_hcd *xhci)
 {
-	return mod_delayed_work(system_wq, &xhci->cmd_timer, delay);
+	return mod_delayed_work(system_wq, &xhci->cmd_timer,
+			msecs_to_jiffies(xhci->current_cmd->timeout_ms));
 }
 
 static struct xhci_command *xhci_next_queued_cmd(struct xhci_hcd *xhci)
@@ -412,7 +413,7 @@ static void xhci_handle_stopped_cmd_ring(struct xhci_hcd *xhci,
 	if ((xhci->cmd_ring->dequeue != xhci->cmd_ring->enqueue) &&
 	    !(xhci->xhc_state & XHCI_STATE_DYING)) {
 		xhci->current_cmd = cur_cmd;
-		xhci_mod_cmd_timer(xhci, XHCI_CMD_DEFAULT_TIMEOUT);
+		xhci_mod_cmd_timer(xhci);
 		xhci_ring_cmd_db(xhci);
 	}
 }
@@ -1787,7 +1788,7 @@ static void handle_cmd_completion(struct xhci_hcd *xhci,
 	if (!list_is_singular(&xhci->cmd_list)) {
 		xhci->current_cmd = list_first_entry(&cmd->cmd_list,
 						struct xhci_command, cmd_list);
-		xhci_mod_cmd_timer(xhci, XHCI_CMD_DEFAULT_TIMEOUT);
+		xhci_mod_cmd_timer(xhci);
 	} else if (xhci->current_cmd == cmd) {
 		xhci->current_cmd = NULL;
 	}
@@ -4287,7 +4288,7 @@ static int queue_command(struct xhci_hcd *xhci, struct xhci_command *cmd,
 	/* if there are no other commands queued we start the timeout timer */
 	if (list_empty(&xhci->cmd_list)) {
 		xhci->current_cmd = cmd;
-		xhci_mod_cmd_timer(xhci, XHCI_CMD_DEFAULT_TIMEOUT);
+		xhci_mod_cmd_timer(xhci);
 	}
 
 	list_add_tail(&cmd->cmd_list, &xhci->cmd_list);
diff --git a/drivers/usb/host/xhci.c b/drivers/usb/host/xhci.c
index 884b0898d9c9..cb885ddc2032 100644
--- a/drivers/usb/host/xhci.c
+++ b/drivers/usb/host/xhci.c
@@ -4029,12 +4029,18 @@ disable_slot:
 	return 0;
 }
 
-/*
- * Issue an Address Device command and optionally send a corresponding
- * SetAddress request to the device.
+/**
+ * xhci_setup_device - issues an Address Device command to assign a unique
+ *			USB bus address.
+ * @hcd: USB host controller data structure.
+ * @udev: USB dev structure representing the connected device.
+ * @setup: Enum specifying setup mode: address only or with context.
+ * @timeout_ms: Max wait time (ms) for the command operation to complete.
+ *
+ * Return: 0 if successful; otherwise, negative error code.
  */
 static int xhci_setup_device(struct usb_hcd *hcd, struct usb_device *udev,
-			     enum xhci_setup_dev setup)
+			     enum xhci_setup_dev setup, unsigned int timeout_ms)
 {
 	const char *act = setup == SETUP_CONTEXT_ONLY ? "context" : "address";
 	unsigned long flags;
@@ -4091,6 +4097,7 @@ static int xhci_setup_device(struct usb_hcd *hcd, struct usb_device *udev,
 	}
 
 	command->in_ctx = virt_dev->in_ctx;
+	command->timeout_ms = timeout_ms;
 
 	slot_ctx = xhci_get_slot_ctx(xhci, virt_dev->in_ctx);
 	ctrl_ctx = xhci_get_input_control_ctx(virt_dev->in_ctx);
@@ -4217,14 +4224,16 @@ out:
 	return ret;
 }
 
-static int xhci_address_device(struct usb_hcd *hcd, struct usb_device *udev)
+static int xhci_address_device(struct usb_hcd *hcd, struct usb_device *udev,
+			       unsigned int timeout_ms)
 {
-	return xhci_setup_device(hcd, udev, SETUP_CONTEXT_ADDRESS);
+	return xhci_setup_device(hcd, udev, SETUP_CONTEXT_ADDRESS, timeout_ms);
 }
 
 static int xhci_enable_device(struct usb_hcd *hcd, struct usb_device *udev)
 {
-	return xhci_setup_device(hcd, udev, SETUP_CONTEXT_ONLY);
+	return xhci_setup_device(hcd, udev, SETUP_CONTEXT_ONLY,
+				 XHCI_CMD_DEFAULT_TIMEOUT);
 }
 
 /*
diff --git a/drivers/usb/host/xhci.h b/drivers/usb/host/xhci.h
index 3ea5c092bba7..62b610f00754 100644
--- a/drivers/usb/host/xhci.h
+++ b/drivers/usb/host/xhci.h
@@ -791,6 +791,8 @@ struct xhci_command {
 	struct completion		*completion;
 	union xhci_trb			*command_trb;
 	struct list_head		cmd_list;
+	/* xHCI command response timeout in milliseconds */
+	unsigned int			timeout_ms;
 };
 
 /* drop context bitmasks */
@@ -1550,8 +1552,11 @@ struct xhci_td {
 	unsigned int		num_trbs;
 };
 
-/* xHCI command default timeout value */
-#define XHCI_CMD_DEFAULT_TIMEOUT	(5 * HZ)
+/*
+ * xHCI command default timeout value in milliseconds.
+ * USB 3.2 spec, section 9.2.6.1
+ */
+#define XHCI_CMD_DEFAULT_TIMEOUT	5000
 
 /* command descriptor */
 struct xhci_cd {
diff --git a/include/linux/usb/hcd.h b/include/linux/usb/hcd.h
index 00724b4f6e12..cd77fc6095a1 100644
--- a/include/linux/usb/hcd.h
+++ b/include/linux/usb/hcd.h
@@ -372,8 +372,9 @@ struct hc_driver {
 		 * or bandwidth constraints.
 		 */
 	void	(*reset_bandwidth)(struct usb_hcd *, struct usb_device *);
-		/* Returns the hardware-chosen device address */
-	int	(*address_device)(struct usb_hcd *, struct usb_device *udev);
+		/* Set the hardware-chosen device address */
+	int	(*address_device)(struct usb_hcd *, struct usb_device *udev,
+				  unsigned int timeout_ms);
 		/* prepares the hardware to send commands to the device */
 	int	(*enable_device)(struct usb_hcd *, struct usb_device *udev);
 		/* Notifies the HCD after a hub descriptor is fetched.
-- 
cgit v1.2.3


From 5a1ccf0c72cf917ff3ccc131d1bb8d19338ffe52 Mon Sep 17 00:00:00 2001
From: Hardik Gajjar <hgajjar@de.adit-jv.com>
Date: Fri, 27 Oct 2023 17:20:29 +0200
Subject: usb: new quirk to reduce the SET_ADDRESS request timeout

This patch introduces a new USB quirk,
USB_QUIRK_SHORT_SET_ADDRESS_REQ_TIMEOUT, which modifies the timeout value
for the SET_ADDRESS request. The standard timeout for USB request/command
is 5000 ms, as recommended in the USB 3.2 specification (section 9.2.6.1).

However, certain scenarios, such as connecting devices through an APTIV
hub, can lead to timeout errors when the device enumerates as full speed
initially and later switches to high speed during chirp negotiation.

In such cases, USB analyzer logs reveal that the bus suspends for
5 seconds due to incorrect chirp parsing and resumes only after two
consecutive timeout errors trigger a hub driver reset.

Packet(54) Dir(?) Full Speed J(997.100 us) Idle(  2.850 us)
_______| Time Stamp(28 . 105 910 682)
_______|_____________________________________________________________Ch0
Packet(55) Dir(?) Full Speed J(997.118 us) Idle(  2.850 us)
_______| Time Stamp(28 . 106 910 632)
_______|_____________________________________________________________Ch0
Packet(56) Dir(?) Full Speed J(399.650 us) Idle(222.582 us)
_______| Time Stamp(28 . 107 910 600)
_______|_____________________________________________________________Ch0
Packet(57) Dir Chirp J( 23.955 ms) Idle(115.169 ms)
_______| Time Stamp(28 . 108 532 832)
_______|_____________________________________________________________Ch0
Packet(58) Dir(?) Full Speed J (Suspend)( 5.347 sec) Idle(  5.366 us)
_______| Time Stamp(28 . 247 657 600)
_______|_____________________________________________________________Ch0

This 5-second delay in device enumeration is undesirable, particularly
in automotive applications where quick enumeration is crucial
(ideally within 3 seconds).

The newly introduced quirks provide the flexibility to align with a
3-second time limit, as required in specific contexts like automotive
applications.

By reducing the SET_ADDRESS request timeout to 500 ms, the
system can respond more swiftly to errors, initiate rapid recovery, and
ensure efficient device enumeration. This change is vital for scenarios
where rapid smartphone enumeration and screen projection are essential.

To use the quirk, please write "vendor_id:product_id:p" to
/sys/bus/usb/drivers/hub/module/parameter/quirks

For example,
echo "0x2c48:0x0132:p" > /sys/bus/usb/drivers/hub/module/parameters/quirks"

Signed-off-by: Hardik Gajjar <hgajjar@de.adit-jv.com>
Reviewed-by: Alan Stern <stern@rowland.harvard.edu>
Link: https://lore.kernel.org/r/20231027152029.104363-2-hgajjar@de.adit-jv.com
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 Documentation/admin-guide/kernel-parameters.txt |  3 +++
 drivers/usb/core/hub.c                          | 15 +++++++++++++--
 drivers/usb/core/quirks.c                       |  7 +++++++
 include/linux/usb/quirks.h                      |  3 +++
 4 files changed, 26 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
index 65731b060e3f..0a6a4b7f7a3b 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -6908,6 +6908,9 @@
 					pause after every control message);
 				o = USB_QUIRK_HUB_SLOW_RESET (Hub needs extra
 					delay after resetting its port);
+				p = USB_QUIRK_SHORT_SET_ADDRESS_REQ_TIMEOUT
+					(Reduce timeout of the SET_ADDRESS
+					request from 5000 ms to 500 ms);
 			Example: quirks=0781:5580:bk,0a5c:5834:gij
 
 	usbhid.mousepoll=
diff --git a/drivers/usb/core/hub.c b/drivers/usb/core/hub.c
index 049058264367..756fbd7cac43 100644
--- a/drivers/usb/core/hub.c
+++ b/drivers/usb/core/hub.c
@@ -54,6 +54,12 @@
 #define USB_TP_TRANSMISSION_DELAY_MAX	65535	/* ns */
 #define USB_PING_RESPONSE_TIME		400	/* ns */
 
+/*
+ * The SET_ADDRESS request timeout will be 500 ms when
+ * USB_QUIRK_SHORT_SET_ADDRESS_REQ_TIMEOUT quirk flag is set.
+ */
+#define USB_SHORT_SET_ADDRESS_REQ_TIMEOUT	500  /* ms */
+
 /* Protect struct usb_device->state and ->children members
  * Note: Both are also protected by ->dev.sem, except that ->state can
  * change to USB_STATE_NOTATTACHED even when the semaphore isn't held. */
@@ -4649,7 +4655,12 @@ EXPORT_SYMBOL_GPL(usb_ep0_reinit);
 static int hub_set_address(struct usb_device *udev, int devnum)
 {
 	int retval;
+	unsigned int timeout_ms = USB_CTRL_SET_TIMEOUT;
 	struct usb_hcd *hcd = bus_to_hcd(udev->bus);
+	struct usb_hub *hub = usb_hub_to_struct_hub(udev->parent);
+
+	if (hub->hdev->quirks & USB_QUIRK_SHORT_SET_ADDRESS_REQ_TIMEOUT)
+		timeout_ms = USB_SHORT_SET_ADDRESS_REQ_TIMEOUT;
 
 	/*
 	 * The host controller will choose the device address,
@@ -4662,11 +4673,11 @@ static int hub_set_address(struct usb_device *udev, int devnum)
 	if (udev->state != USB_STATE_DEFAULT)
 		return -EINVAL;
 	if (hcd->driver->address_device)
-		retval = hcd->driver->address_device(hcd, udev, USB_CTRL_SET_TIMEOUT);
+		retval = hcd->driver->address_device(hcd, udev, timeout_ms);
 	else
 		retval = usb_control_msg(udev, usb_sndaddr0pipe(),
 				USB_REQ_SET_ADDRESS, 0, devnum, 0,
-				NULL, 0, USB_CTRL_SET_TIMEOUT);
+				NULL, 0, timeout_ms);
 	if (retval == 0) {
 		update_devnum(udev, devnum);
 		/* Device now using proper address. */
diff --git a/drivers/usb/core/quirks.c b/drivers/usb/core/quirks.c
index 15e9bd180a1d..b4783574b8e6 100644
--- a/drivers/usb/core/quirks.c
+++ b/drivers/usb/core/quirks.c
@@ -138,6 +138,9 @@ static int quirks_param_set(const char *value, const struct kernel_param *kp)
 			case 'o':
 				flags |= USB_QUIRK_HUB_SLOW_RESET;
 				break;
+			case 'p':
+				flags |= USB_QUIRK_SHORT_SET_ADDRESS_REQ_TIMEOUT;
+				break;
 			/* Ignore unrecognized flag characters */
 			}
 		}
@@ -527,6 +530,10 @@ static const struct usb_device_id usb_quirk_list[] = {
 
 	{ USB_DEVICE(0x2386, 0x350e), .driver_info = USB_QUIRK_NO_LPM },
 
+	/* APTIV AUTOMOTIVE HUB */
+	{ USB_DEVICE(0x2c48, 0x0132), .driver_info =
+			USB_QUIRK_SHORT_SET_ADDRESS_REQ_TIMEOUT },
+
 	/* DJI CineSSD */
 	{ USB_DEVICE(0x2ca3, 0x0031), .driver_info = USB_QUIRK_NO_LPM },
 
diff --git a/include/linux/usb/quirks.h b/include/linux/usb/quirks.h
index eeb7c2157c72..59409c1fc3de 100644
--- a/include/linux/usb/quirks.h
+++ b/include/linux/usb/quirks.h
@@ -72,4 +72,7 @@
 /* device has endpoints that should be ignored */
 #define USB_QUIRK_ENDPOINT_IGNORE		BIT(15)
 
+/* short SET_ADDRESS request timeout */
+#define USB_QUIRK_SHORT_SET_ADDRESS_REQ_TIMEOUT	BIT(16)
+
 #endif /* __LINUX_USB_QUIRKS_H */
-- 
cgit v1.2.3


From d4e3b928ab487a8aecd1f6a140b40ac365116cfb Mon Sep 17 00:00:00 2001
From: Kent Overstreet <kent.overstreet@linux.dev>
Date: Fri, 17 Nov 2023 19:13:27 -0500
Subject: closures: CLOSURE_CALLBACK() to fix type punning

Control flow integrity is now checking that type signatures match on
indirect function calls. That breaks closures, which embed a work_struct
in a closure in such a way that a closure_fn may also be used as a
workqueue fn by the underlying closure code.

So we have to change closure fns to take a work_struct as their
argument - but that results in a loss of clarity, as closure fns have
different semantics from normal workqueue functions (they run owning a
ref on the closure, which must be released with continue_at() or
closure_return()).

Thus, this patc introduces CLOSURE_CALLBACK() and closure_type() macros
as suggested by Kees, to smooth things over a bit.

Suggested-by: Kees Cook <keescook@chromium.org>
Cc: Coly Li <colyli@suse.de>
Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
---
 drivers/md/bcache/btree.c           | 14 +++----
 drivers/md/bcache/journal.c         | 20 +++++-----
 drivers/md/bcache/movinggc.c        | 16 ++++----
 drivers/md/bcache/request.c         | 74 ++++++++++++++++++-------------------
 drivers/md/bcache/request.h         |  2 +-
 drivers/md/bcache/super.c           | 40 ++++++++++----------
 drivers/md/bcache/writeback.c       | 16 ++++----
 fs/bcachefs/btree_io.c              |  7 ++--
 fs/bcachefs/btree_update_interior.c |  4 +-
 fs/bcachefs/fs-io-direct.c          |  8 ++--
 fs/bcachefs/io_write.c              | 14 +++----
 fs/bcachefs/io_write.h              |  3 +-
 fs/bcachefs/journal_io.c            | 17 ++++-----
 fs/bcachefs/journal_io.h            |  2 +-
 include/linux/closure.h             |  9 ++++-
 lib/closure.c                       |  5 ++-
 16 files changed, 127 insertions(+), 124 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c
index ae5cbb55861f..9441eac3d546 100644
--- a/drivers/md/bcache/btree.c
+++ b/drivers/md/bcache/btree.c
@@ -293,16 +293,16 @@ static void btree_complete_write(struct btree *b, struct btree_write *w)
 	w->journal	= NULL;
 }
 
-static void btree_node_write_unlock(struct closure *cl)
+static CLOSURE_CALLBACK(btree_node_write_unlock)
 {
-	struct btree *b = container_of(cl, struct btree, io);
+	closure_type(b, struct btree, io);
 
 	up(&b->io_mutex);
 }
 
-static void __btree_node_write_done(struct closure *cl)
+static CLOSURE_CALLBACK(__btree_node_write_done)
 {
-	struct btree *b = container_of(cl, struct btree, io);
+	closure_type(b, struct btree, io);
 	struct btree_write *w = btree_prev_write(b);
 
 	bch_bbio_free(b->bio, b->c);
@@ -315,12 +315,12 @@ static void __btree_node_write_done(struct closure *cl)
 	closure_return_with_destructor(cl, btree_node_write_unlock);
 }
 
-static void btree_node_write_done(struct closure *cl)
+static CLOSURE_CALLBACK(btree_node_write_done)
 {
-	struct btree *b = container_of(cl, struct btree, io);
+	closure_type(b, struct btree, io);
 
 	bio_free_pages(b->bio);
-	__btree_node_write_done(cl);
+	__btree_node_write_done(&cl->work);
 }
 
 static void btree_node_write_endio(struct bio *bio)
diff --git a/drivers/md/bcache/journal.c b/drivers/md/bcache/journal.c
index c182c21de2e8..7ff14bd2feb8 100644
--- a/drivers/md/bcache/journal.c
+++ b/drivers/md/bcache/journal.c
@@ -723,11 +723,11 @@ static void journal_write_endio(struct bio *bio)
 	closure_put(&w->c->journal.io);
 }
 
-static void journal_write(struct closure *cl);
+static CLOSURE_CALLBACK(journal_write);
 
-static void journal_write_done(struct closure *cl)
+static CLOSURE_CALLBACK(journal_write_done)
 {
-	struct journal *j = container_of(cl, struct journal, io);
+	closure_type(j, struct journal, io);
 	struct journal_write *w = (j->cur == j->w)
 		? &j->w[1]
 		: &j->w[0];
@@ -736,19 +736,19 @@ static void journal_write_done(struct closure *cl)
 	continue_at_nobarrier(cl, journal_write, bch_journal_wq);
 }
 
-static void journal_write_unlock(struct closure *cl)
+static CLOSURE_CALLBACK(journal_write_unlock)
 	__releases(&c->journal.lock)
 {
-	struct cache_set *c = container_of(cl, struct cache_set, journal.io);
+	closure_type(c, struct cache_set, journal.io);
 
 	c->journal.io_in_flight = 0;
 	spin_unlock(&c->journal.lock);
 }
 
-static void journal_write_unlocked(struct closure *cl)
+static CLOSURE_CALLBACK(journal_write_unlocked)
 	__releases(c->journal.lock)
 {
-	struct cache_set *c = container_of(cl, struct cache_set, journal.io);
+	closure_type(c, struct cache_set, journal.io);
 	struct cache *ca = c->cache;
 	struct journal_write *w = c->journal.cur;
 	struct bkey *k = &c->journal.key;
@@ -823,12 +823,12 @@ static void journal_write_unlocked(struct closure *cl)
 	continue_at(cl, journal_write_done, NULL);
 }
 
-static void journal_write(struct closure *cl)
+static CLOSURE_CALLBACK(journal_write)
 {
-	struct cache_set *c = container_of(cl, struct cache_set, journal.io);
+	closure_type(c, struct cache_set, journal.io);
 
 	spin_lock(&c->journal.lock);
-	journal_write_unlocked(cl);
+	journal_write_unlocked(&cl->work);
 }
 
 static void journal_try_write(struct cache_set *c)
diff --git a/drivers/md/bcache/movinggc.c b/drivers/md/bcache/movinggc.c
index 9f32901fdad1..ebd500bdf0b2 100644
--- a/drivers/md/bcache/movinggc.c
+++ b/drivers/md/bcache/movinggc.c
@@ -35,16 +35,16 @@ static bool moving_pred(struct keybuf *buf, struct bkey *k)
 
 /* Moving GC - IO loop */
 
-static void moving_io_destructor(struct closure *cl)
+static CLOSURE_CALLBACK(moving_io_destructor)
 {
-	struct moving_io *io = container_of(cl, struct moving_io, cl);
+	closure_type(io, struct moving_io, cl);
 
 	kfree(io);
 }
 
-static void write_moving_finish(struct closure *cl)
+static CLOSURE_CALLBACK(write_moving_finish)
 {
-	struct moving_io *io = container_of(cl, struct moving_io, cl);
+	closure_type(io, struct moving_io, cl);
 	struct bio *bio = &io->bio.bio;
 
 	bio_free_pages(bio);
@@ -89,9 +89,9 @@ static void moving_init(struct moving_io *io)
 	bch_bio_map(bio, NULL);
 }
 
-static void write_moving(struct closure *cl)
+static CLOSURE_CALLBACK(write_moving)
 {
-	struct moving_io *io = container_of(cl, struct moving_io, cl);
+	closure_type(io, struct moving_io, cl);
 	struct data_insert_op *op = &io->op;
 
 	if (!op->status) {
@@ -113,9 +113,9 @@ static void write_moving(struct closure *cl)
 	continue_at(cl, write_moving_finish, op->wq);
 }
 
-static void read_moving_submit(struct closure *cl)
+static CLOSURE_CALLBACK(read_moving_submit)
 {
-	struct moving_io *io = container_of(cl, struct moving_io, cl);
+	closure_type(io, struct moving_io, cl);
 	struct bio *bio = &io->bio.bio;
 
 	bch_submit_bbio(bio, io->op.c, &io->w->key, 0);
diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c
index a9b1f3896249..83d112bd2b1c 100644
--- a/drivers/md/bcache/request.c
+++ b/drivers/md/bcache/request.c
@@ -25,7 +25,7 @@
 
 struct kmem_cache *bch_search_cache;
 
-static void bch_data_insert_start(struct closure *cl);
+static CLOSURE_CALLBACK(bch_data_insert_start);
 
 static unsigned int cache_mode(struct cached_dev *dc)
 {
@@ -55,9 +55,9 @@ static void bio_csum(struct bio *bio, struct bkey *k)
 
 /* Insert data into cache */
 
-static void bch_data_insert_keys(struct closure *cl)
+static CLOSURE_CALLBACK(bch_data_insert_keys)
 {
-	struct data_insert_op *op = container_of(cl, struct data_insert_op, cl);
+	closure_type(op, struct data_insert_op, cl);
 	atomic_t *journal_ref = NULL;
 	struct bkey *replace_key = op->replace ? &op->replace_key : NULL;
 	int ret;
@@ -136,9 +136,9 @@ out:
 	continue_at(cl, bch_data_insert_keys, op->wq);
 }
 
-static void bch_data_insert_error(struct closure *cl)
+static CLOSURE_CALLBACK(bch_data_insert_error)
 {
-	struct data_insert_op *op = container_of(cl, struct data_insert_op, cl);
+	closure_type(op, struct data_insert_op, cl);
 
 	/*
 	 * Our data write just errored, which means we've got a bunch of keys to
@@ -163,7 +163,7 @@ static void bch_data_insert_error(struct closure *cl)
 
 	op->insert_keys.top = dst;
 
-	bch_data_insert_keys(cl);
+	bch_data_insert_keys(&cl->work);
 }
 
 static void bch_data_insert_endio(struct bio *bio)
@@ -184,9 +184,9 @@ static void bch_data_insert_endio(struct bio *bio)
 	bch_bbio_endio(op->c, bio, bio->bi_status, "writing data to cache");
 }
 
-static void bch_data_insert_start(struct closure *cl)
+static CLOSURE_CALLBACK(bch_data_insert_start)
 {
-	struct data_insert_op *op = container_of(cl, struct data_insert_op, cl);
+	closure_type(op, struct data_insert_op, cl);
 	struct bio *bio = op->bio, *n;
 
 	if (op->bypass)
@@ -305,16 +305,16 @@ err:
  * If op->bypass is true, instead of inserting the data it invalidates the
  * region of the cache represented by op->bio and op->inode.
  */
-void bch_data_insert(struct closure *cl)
+CLOSURE_CALLBACK(bch_data_insert)
 {
-	struct data_insert_op *op = container_of(cl, struct data_insert_op, cl);
+	closure_type(op, struct data_insert_op, cl);
 
 	trace_bcache_write(op->c, op->inode, op->bio,
 			   op->writeback, op->bypass);
 
 	bch_keylist_init(&op->insert_keys);
 	bio_get(op->bio);
-	bch_data_insert_start(cl);
+	bch_data_insert_start(&cl->work);
 }
 
 /*
@@ -575,9 +575,9 @@ static int cache_lookup_fn(struct btree_op *op, struct btree *b, struct bkey *k)
 	return n == bio ? MAP_DONE : MAP_CONTINUE;
 }
 
-static void cache_lookup(struct closure *cl)
+static CLOSURE_CALLBACK(cache_lookup)
 {
-	struct search *s = container_of(cl, struct search, iop.cl);
+	closure_type(s, struct search, iop.cl);
 	struct bio *bio = &s->bio.bio;
 	struct cached_dev *dc;
 	int ret;
@@ -698,9 +698,9 @@ static void do_bio_hook(struct search *s,
 	bio_cnt_set(bio, 3);
 }
 
-static void search_free(struct closure *cl)
+static CLOSURE_CALLBACK(search_free)
 {
-	struct search *s = container_of(cl, struct search, cl);
+	closure_type(s, struct search, cl);
 
 	atomic_dec(&s->iop.c->search_inflight);
 
@@ -749,20 +749,20 @@ static inline struct search *search_alloc(struct bio *bio,
 
 /* Cached devices */
 
-static void cached_dev_bio_complete(struct closure *cl)
+static CLOSURE_CALLBACK(cached_dev_bio_complete)
 {
-	struct search *s = container_of(cl, struct search, cl);
+	closure_type(s, struct search, cl);
 	struct cached_dev *dc = container_of(s->d, struct cached_dev, disk);
 
 	cached_dev_put(dc);
-	search_free(cl);
+	search_free(&cl->work);
 }
 
 /* Process reads */
 
-static void cached_dev_read_error_done(struct closure *cl)
+static CLOSURE_CALLBACK(cached_dev_read_error_done)
 {
-	struct search *s = container_of(cl, struct search, cl);
+	closure_type(s, struct search, cl);
 
 	if (s->iop.replace_collision)
 		bch_mark_cache_miss_collision(s->iop.c, s->d);
@@ -770,12 +770,12 @@ static void cached_dev_read_error_done(struct closure *cl)
 	if (s->iop.bio)
 		bio_free_pages(s->iop.bio);
 
-	cached_dev_bio_complete(cl);
+	cached_dev_bio_complete(&cl->work);
 }
 
-static void cached_dev_read_error(struct closure *cl)
+static CLOSURE_CALLBACK(cached_dev_read_error)
 {
-	struct search *s = container_of(cl, struct search, cl);
+	closure_type(s, struct search, cl);
 	struct bio *bio = &s->bio.bio;
 
 	/*
@@ -801,9 +801,9 @@ static void cached_dev_read_error(struct closure *cl)
 	continue_at(cl, cached_dev_read_error_done, NULL);
 }
 
-static void cached_dev_cache_miss_done(struct closure *cl)
+static CLOSURE_CALLBACK(cached_dev_cache_miss_done)
 {
-	struct search *s = container_of(cl, struct search, cl);
+	closure_type(s, struct search, cl);
 	struct bcache_device *d = s->d;
 
 	if (s->iop.replace_collision)
@@ -812,13 +812,13 @@ static void cached_dev_cache_miss_done(struct closure *cl)
 	if (s->iop.bio)
 		bio_free_pages(s->iop.bio);
 
-	cached_dev_bio_complete(cl);
+	cached_dev_bio_complete(&cl->work);
 	closure_put(&d->cl);
 }
 
-static void cached_dev_read_done(struct closure *cl)
+static CLOSURE_CALLBACK(cached_dev_read_done)
 {
-	struct search *s = container_of(cl, struct search, cl);
+	closure_type(s, struct search, cl);
 	struct cached_dev *dc = container_of(s->d, struct cached_dev, disk);
 
 	/*
@@ -858,9 +858,9 @@ static void cached_dev_read_done(struct closure *cl)
 	continue_at(cl, cached_dev_cache_miss_done, NULL);
 }
 
-static void cached_dev_read_done_bh(struct closure *cl)
+static CLOSURE_CALLBACK(cached_dev_read_done_bh)
 {
-	struct search *s = container_of(cl, struct search, cl);
+	closure_type(s, struct search, cl);
 	struct cached_dev *dc = container_of(s->d, struct cached_dev, disk);
 
 	bch_mark_cache_accounting(s->iop.c, s->d,
@@ -955,13 +955,13 @@ static void cached_dev_read(struct cached_dev *dc, struct search *s)
 
 /* Process writes */
 
-static void cached_dev_write_complete(struct closure *cl)
+static CLOSURE_CALLBACK(cached_dev_write_complete)
 {
-	struct search *s = container_of(cl, struct search, cl);
+	closure_type(s, struct search, cl);
 	struct cached_dev *dc = container_of(s->d, struct cached_dev, disk);
 
 	up_read_non_owner(&dc->writeback_lock);
-	cached_dev_bio_complete(cl);
+	cached_dev_bio_complete(&cl->work);
 }
 
 static void cached_dev_write(struct cached_dev *dc, struct search *s)
@@ -1048,9 +1048,9 @@ insert_data:
 	continue_at(cl, cached_dev_write_complete, NULL);
 }
 
-static void cached_dev_nodata(struct closure *cl)
+static CLOSURE_CALLBACK(cached_dev_nodata)
 {
-	struct search *s = container_of(cl, struct search, cl);
+	closure_type(s, struct search, cl);
 	struct bio *bio = &s->bio.bio;
 
 	if (s->iop.flush_journal)
@@ -1265,9 +1265,9 @@ static int flash_dev_cache_miss(struct btree *b, struct search *s,
 	return MAP_CONTINUE;
 }
 
-static void flash_dev_nodata(struct closure *cl)
+static CLOSURE_CALLBACK(flash_dev_nodata)
 {
-	struct search *s = container_of(cl, struct search, cl);
+	closure_type(s, struct search, cl);
 
 	if (s->iop.flush_journal)
 		bch_journal_meta(s->iop.c, cl);
diff --git a/drivers/md/bcache/request.h b/drivers/md/bcache/request.h
index 38ab4856eaab..46bbef00aebb 100644
--- a/drivers/md/bcache/request.h
+++ b/drivers/md/bcache/request.h
@@ -34,7 +34,7 @@ struct data_insert_op {
 };
 
 unsigned int bch_get_congested(const struct cache_set *c);
-void bch_data_insert(struct closure *cl);
+CLOSURE_CALLBACK(bch_data_insert);
 
 void bch_cached_dev_request_init(struct cached_dev *dc);
 void cached_dev_submit_bio(struct bio *bio);
diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c
index 8bd899766372..e0db905c1ca0 100644
--- a/drivers/md/bcache/super.c
+++ b/drivers/md/bcache/super.c
@@ -327,9 +327,9 @@ static void __write_super(struct cache_sb *sb, struct cache_sb_disk *out,
 	submit_bio(bio);
 }
 
-static void bch_write_bdev_super_unlock(struct closure *cl)
+static CLOSURE_CALLBACK(bch_write_bdev_super_unlock)
 {
-	struct cached_dev *dc = container_of(cl, struct cached_dev, sb_write);
+	closure_type(dc, struct cached_dev, sb_write);
 
 	up(&dc->sb_write_mutex);
 }
@@ -363,9 +363,9 @@ static void write_super_endio(struct bio *bio)
 	closure_put(&ca->set->sb_write);
 }
 
-static void bcache_write_super_unlock(struct closure *cl)
+static CLOSURE_CALLBACK(bcache_write_super_unlock)
 {
-	struct cache_set *c = container_of(cl, struct cache_set, sb_write);
+	closure_type(c, struct cache_set, sb_write);
 
 	up(&c->sb_write_mutex);
 }
@@ -407,9 +407,9 @@ static void uuid_endio(struct bio *bio)
 	closure_put(cl);
 }
 
-static void uuid_io_unlock(struct closure *cl)
+static CLOSURE_CALLBACK(uuid_io_unlock)
 {
-	struct cache_set *c = container_of(cl, struct cache_set, uuid_write);
+	closure_type(c, struct cache_set, uuid_write);
 
 	up(&c->uuid_write_mutex);
 }
@@ -1342,9 +1342,9 @@ void bch_cached_dev_release(struct kobject *kobj)
 	module_put(THIS_MODULE);
 }
 
-static void cached_dev_free(struct closure *cl)
+static CLOSURE_CALLBACK(cached_dev_free)
 {
-	struct cached_dev *dc = container_of(cl, struct cached_dev, disk.cl);
+	closure_type(dc, struct cached_dev, disk.cl);
 
 	if (test_and_clear_bit(BCACHE_DEV_WB_RUNNING, &dc->disk.flags))
 		cancel_writeback_rate_update_dwork(dc);
@@ -1376,9 +1376,9 @@ static void cached_dev_free(struct closure *cl)
 	kobject_put(&dc->disk.kobj);
 }
 
-static void cached_dev_flush(struct closure *cl)
+static CLOSURE_CALLBACK(cached_dev_flush)
 {
-	struct cached_dev *dc = container_of(cl, struct cached_dev, disk.cl);
+	closure_type(dc, struct cached_dev, disk.cl);
 	struct bcache_device *d = &dc->disk;
 
 	mutex_lock(&bch_register_lock);
@@ -1497,9 +1497,9 @@ void bch_flash_dev_release(struct kobject *kobj)
 	kfree(d);
 }
 
-static void flash_dev_free(struct closure *cl)
+static CLOSURE_CALLBACK(flash_dev_free)
 {
-	struct bcache_device *d = container_of(cl, struct bcache_device, cl);
+	closure_type(d, struct bcache_device, cl);
 
 	mutex_lock(&bch_register_lock);
 	atomic_long_sub(bcache_dev_sectors_dirty(d),
@@ -1510,9 +1510,9 @@ static void flash_dev_free(struct closure *cl)
 	kobject_put(&d->kobj);
 }
 
-static void flash_dev_flush(struct closure *cl)
+static CLOSURE_CALLBACK(flash_dev_flush)
 {
-	struct bcache_device *d = container_of(cl, struct bcache_device, cl);
+	closure_type(d, struct bcache_device, cl);
 
 	mutex_lock(&bch_register_lock);
 	bcache_device_unlink(d);
@@ -1668,9 +1668,9 @@ void bch_cache_set_release(struct kobject *kobj)
 	module_put(THIS_MODULE);
 }
 
-static void cache_set_free(struct closure *cl)
+static CLOSURE_CALLBACK(cache_set_free)
 {
-	struct cache_set *c = container_of(cl, struct cache_set, cl);
+	closure_type(c, struct cache_set, cl);
 	struct cache *ca;
 
 	debugfs_remove(c->debug);
@@ -1709,9 +1709,9 @@ static void cache_set_free(struct closure *cl)
 	kobject_put(&c->kobj);
 }
 
-static void cache_set_flush(struct closure *cl)
+static CLOSURE_CALLBACK(cache_set_flush)
 {
-	struct cache_set *c = container_of(cl, struct cache_set, caching);
+	closure_type(c, struct cache_set, caching);
 	struct cache *ca = c->cache;
 	struct btree *b;
 
@@ -1806,9 +1806,9 @@ static void conditional_stop_bcache_device(struct cache_set *c,
 	}
 }
 
-static void __cache_set_unregister(struct closure *cl)
+static CLOSURE_CALLBACK(__cache_set_unregister)
 {
-	struct cache_set *c = container_of(cl, struct cache_set, caching);
+	closure_type(c, struct cache_set, caching);
 	struct cached_dev *dc;
 	struct bcache_device *d;
 	size_t i;
diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c
index 24c049067f61..77427e355613 100644
--- a/drivers/md/bcache/writeback.c
+++ b/drivers/md/bcache/writeback.c
@@ -341,16 +341,16 @@ static void dirty_init(struct keybuf_key *w)
 	bch_bio_map(bio, NULL);
 }
 
-static void dirty_io_destructor(struct closure *cl)
+static CLOSURE_CALLBACK(dirty_io_destructor)
 {
-	struct dirty_io *io = container_of(cl, struct dirty_io, cl);
+	closure_type(io, struct dirty_io, cl);
 
 	kfree(io);
 }
 
-static void write_dirty_finish(struct closure *cl)
+static CLOSURE_CALLBACK(write_dirty_finish)
 {
-	struct dirty_io *io = container_of(cl, struct dirty_io, cl);
+	closure_type(io, struct dirty_io, cl);
 	struct keybuf_key *w = io->bio.bi_private;
 	struct cached_dev *dc = io->dc;
 
@@ -400,9 +400,9 @@ static void dirty_endio(struct bio *bio)
 	closure_put(&io->cl);
 }
 
-static void write_dirty(struct closure *cl)
+static CLOSURE_CALLBACK(write_dirty)
 {
-	struct dirty_io *io = container_of(cl, struct dirty_io, cl);
+	closure_type(io, struct dirty_io, cl);
 	struct keybuf_key *w = io->bio.bi_private;
 	struct cached_dev *dc = io->dc;
 
@@ -462,9 +462,9 @@ static void read_dirty_endio(struct bio *bio)
 	dirty_endio(bio);
 }
 
-static void read_dirty_submit(struct closure *cl)
+static CLOSURE_CALLBACK(read_dirty_submit)
 {
-	struct dirty_io *io = container_of(cl, struct dirty_io, cl);
+	closure_type(io, struct dirty_io, cl);
 
 	closure_bio_submit(io->dc->disk.c, &io->bio, cl);
 
diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c
index 37d896edb06e..57c20390e10e 100644
--- a/fs/bcachefs/btree_io.c
+++ b/fs/bcachefs/btree_io.c
@@ -1358,10 +1358,9 @@ static bool btree_node_has_extra_bsets(struct bch_fs *c, unsigned offset, void *
 	return offset;
 }
 
-static void btree_node_read_all_replicas_done(struct closure *cl)
+static CLOSURE_CALLBACK(btree_node_read_all_replicas_done)
 {
-	struct btree_node_read_all *ra =
-		container_of(cl, struct btree_node_read_all, cl);
+	closure_type(ra, struct btree_node_read_all, cl);
 	struct bch_fs *c = ra->c;
 	struct btree *b = ra->b;
 	struct printbuf buf = PRINTBUF;
@@ -1567,7 +1566,7 @@ static int btree_node_read_all_replicas(struct bch_fs *c, struct btree *b, bool
 
 	if (sync) {
 		closure_sync(&ra->cl);
-		btree_node_read_all_replicas_done(&ra->cl);
+		btree_node_read_all_replicas_done(&ra->cl.work);
 	} else {
 		continue_at(&ra->cl, btree_node_read_all_replicas_done,
 			    c->io_complete_wq);
diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c
index 76f27bc9fa24..d08efd6d958e 100644
--- a/fs/bcachefs/btree_update_interior.c
+++ b/fs/bcachefs/btree_update_interior.c
@@ -778,9 +778,9 @@ static void btree_interior_update_work(struct work_struct *work)
 	}
 }
 
-static void btree_update_set_nodes_written(struct closure *cl)
+static CLOSURE_CALLBACK(btree_update_set_nodes_written)
 {
-	struct btree_update *as = container_of(cl, struct btree_update, cl);
+	closure_type(as, struct btree_update, cl);
 	struct bch_fs *c = as->c;
 
 	mutex_lock(&c->btree_interior_update_lock);
diff --git a/fs/bcachefs/fs-io-direct.c b/fs/bcachefs/fs-io-direct.c
index 5b42a76c4796..9a479e4de6b3 100644
--- a/fs/bcachefs/fs-io-direct.c
+++ b/fs/bcachefs/fs-io-direct.c
@@ -35,9 +35,9 @@ static void bio_check_or_release(struct bio *bio, bool check_dirty)
 	}
 }
 
-static void bch2_dio_read_complete(struct closure *cl)
+static CLOSURE_CALLBACK(bch2_dio_read_complete)
 {
-	struct dio_read *dio = container_of(cl, struct dio_read, cl);
+	closure_type(dio, struct dio_read, cl);
 
 	dio->req->ki_complete(dio->req, dio->ret);
 	bio_check_or_release(&dio->rbio.bio, dio->should_dirty);
@@ -325,9 +325,9 @@ static noinline int bch2_dio_write_copy_iov(struct dio_write *dio)
 	return 0;
 }
 
-static void bch2_dio_write_flush_done(struct closure *cl)
+static CLOSURE_CALLBACK(bch2_dio_write_flush_done)
 {
-	struct dio_write *dio = container_of(cl, struct dio_write, op.cl);
+	closure_type(dio, struct dio_write, op.cl);
 	struct bch_fs *c = dio->op.c;
 
 	closure_debug_destroy(cl);
diff --git a/fs/bcachefs/io_write.c b/fs/bcachefs/io_write.c
index d704a8f829c8..8ede46b1e354 100644
--- a/fs/bcachefs/io_write.c
+++ b/fs/bcachefs/io_write.c
@@ -580,9 +580,9 @@ static inline void wp_update_state(struct write_point *wp, bool running)
 	__wp_update_state(wp, state);
 }
 
-static void bch2_write_index(struct closure *cl)
+static CLOSURE_CALLBACK(bch2_write_index)
 {
-	struct bch_write_op *op = container_of(cl, struct bch_write_op, cl);
+	closure_type(op, struct bch_write_op, cl);
 	struct write_point *wp = op->wp;
 	struct workqueue_struct *wq = index_update_wq(op);
 	unsigned long flags;
@@ -1208,9 +1208,9 @@ static void __bch2_nocow_write_done(struct bch_write_op *op)
 		bch2_nocow_write_convert_unwritten(op);
 }
 
-static void bch2_nocow_write_done(struct closure *cl)
+static CLOSURE_CALLBACK(bch2_nocow_write_done)
 {
-	struct bch_write_op *op = container_of(cl, struct bch_write_op, cl);
+	closure_type(op, struct bch_write_op, cl);
 
 	__bch2_nocow_write_done(op);
 	bch2_write_done(cl);
@@ -1363,7 +1363,7 @@ err:
 		op->insert_keys.top = op->insert_keys.keys;
 	} else if (op->flags & BCH_WRITE_SYNC) {
 		closure_sync(&op->cl);
-		bch2_nocow_write_done(&op->cl);
+		bch2_nocow_write_done(&op->cl.work);
 	} else {
 		/*
 		 * XXX
@@ -1566,9 +1566,9 @@ err:
  * If op->discard is true, instead of inserting the data it invalidates the
  * region of the cache represented by op->bio and op->inode.
  */
-void bch2_write(struct closure *cl)
+CLOSURE_CALLBACK(bch2_write)
 {
-	struct bch_write_op *op = container_of(cl, struct bch_write_op, cl);
+	closure_type(op, struct bch_write_op, cl);
 	struct bio *bio = &op->wbio.bio;
 	struct bch_fs *c = op->c;
 	unsigned data_len;
diff --git a/fs/bcachefs/io_write.h b/fs/bcachefs/io_write.h
index 9323167229ee..6c276a48f95d 100644
--- a/fs/bcachefs/io_write.h
+++ b/fs/bcachefs/io_write.h
@@ -90,8 +90,7 @@ static inline void bch2_write_op_init(struct bch_write_op *op, struct bch_fs *c,
 	op->devs_need_flush	= NULL;
 }
 
-void bch2_write(struct closure *);
-
+CLOSURE_CALLBACK(bch2_write);
 void bch2_write_point_do_index_updates(struct work_struct *);
 
 static inline struct bch_write_bio *wbio_init(struct bio *bio)
diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c
index 786a09285509..02e6484f9953 100644
--- a/fs/bcachefs/journal_io.c
+++ b/fs/bcachefs/journal_io.c
@@ -1025,10 +1025,9 @@ next_block:
 	return 0;
 }
 
-static void bch2_journal_read_device(struct closure *cl)
+static CLOSURE_CALLBACK(bch2_journal_read_device)
 {
-	struct journal_device *ja =
-		container_of(cl, struct journal_device, read);
+	closure_type(ja, struct journal_device, read);
 	struct bch_dev *ca = container_of(ja, struct bch_dev, journal);
 	struct bch_fs *c = ca->fs;
 	struct journal_list *jlist =
@@ -1520,9 +1519,9 @@ static inline struct journal_buf *journal_last_unwritten_buf(struct journal *j)
 	return j->buf + (journal_last_unwritten_seq(j) & JOURNAL_BUF_MASK);
 }
 
-static void journal_write_done(struct closure *cl)
+static CLOSURE_CALLBACK(journal_write_done)
 {
-	struct journal *j = container_of(cl, struct journal, io);
+	closure_type(j, struct journal, io);
 	struct bch_fs *c = container_of(j, struct bch_fs, journal);
 	struct journal_buf *w = journal_last_unwritten_buf(j);
 	struct bch_replicas_padded replicas;
@@ -1638,9 +1637,9 @@ static void journal_write_endio(struct bio *bio)
 	percpu_ref_put(&ca->io_ref);
 }
 
-static void do_journal_write(struct closure *cl)
+static CLOSURE_CALLBACK(do_journal_write)
 {
-	struct journal *j = container_of(cl, struct journal, io);
+	closure_type(j, struct journal, io);
 	struct bch_fs *c = container_of(j, struct bch_fs, journal);
 	struct bch_dev *ca;
 	struct journal_buf *w = journal_last_unwritten_buf(j);
@@ -1850,9 +1849,9 @@ static int bch2_journal_write_pick_flush(struct journal *j, struct journal_buf *
 	return 0;
 }
 
-void bch2_journal_write(struct closure *cl)
+CLOSURE_CALLBACK(bch2_journal_write)
 {
-	struct journal *j = container_of(cl, struct journal, io);
+	closure_type(j, struct journal, io);
 	struct bch_fs *c = container_of(j, struct bch_fs, journal);
 	struct bch_dev *ca;
 	struct journal_buf *w = journal_last_unwritten_buf(j);
diff --git a/fs/bcachefs/journal_io.h b/fs/bcachefs/journal_io.h
index a88d097b13f1..c035e7c108e1 100644
--- a/fs/bcachefs/journal_io.h
+++ b/fs/bcachefs/journal_io.h
@@ -60,6 +60,6 @@ void bch2_journal_ptrs_to_text(struct printbuf *, struct bch_fs *,
 
 int bch2_journal_read(struct bch_fs *, u64 *, u64 *, u64 *);
 
-void bch2_journal_write(struct closure *);
+CLOSURE_CALLBACK(bch2_journal_write);
 
 #endif /* _BCACHEFS_JOURNAL_IO_H */
diff --git a/include/linux/closure.h b/include/linux/closure.h
index de7bb47d8a46..c554c6a08768 100644
--- a/include/linux/closure.h
+++ b/include/linux/closure.h
@@ -104,7 +104,7 @@
 
 struct closure;
 struct closure_syncer;
-typedef void (closure_fn) (struct closure *);
+typedef void (closure_fn) (struct work_struct *);
 extern struct dentry *bcache_debug;
 
 struct closure_waitlist {
@@ -254,7 +254,7 @@ static inline void closure_queue(struct closure *cl)
 		INIT_WORK(&cl->work, cl->work.func);
 		BUG_ON(!queue_work(wq, &cl->work));
 	} else
-		cl->fn(cl);
+		cl->fn(&cl->work);
 }
 
 /**
@@ -309,6 +309,11 @@ static inline void closure_wake_up(struct closure_waitlist *list)
 	__closure_wake_up(list);
 }
 
+#define CLOSURE_CALLBACK(name)	void name(struct work_struct *ws)
+#define closure_type(name, type, member)				\
+	struct closure *cl = container_of(ws, struct closure, work);	\
+	type *name = container_of(cl, type, member)
+
 /**
  * continue_at - jump to another function with barrier
  *
diff --git a/lib/closure.c b/lib/closure.c
index f86c9eeafb35..c16540552d61 100644
--- a/lib/closure.c
+++ b/lib/closure.c
@@ -36,7 +36,7 @@ static inline void closure_put_after_sub(struct closure *cl, int flags)
 			closure_debug_destroy(cl);
 
 			if (destructor)
-				destructor(cl);
+				destructor(&cl->work);
 
 			if (parent)
 				closure_put(parent);
@@ -108,8 +108,9 @@ struct closure_syncer {
 	int			done;
 };
 
-static void closure_sync_fn(struct closure *cl)
+static CLOSURE_CALLBACK(closure_sync_fn)
 {
+	struct closure *cl = container_of(ws, struct closure, work);
 	struct closure_syncer *s = cl->s;
 	struct task_struct *p;
 
-- 
cgit v1.2.3


From 6543ac13c623f906200dfd3f1c407d8d333b6995 Mon Sep 17 00:00:00 2001
From: Pierre-Louis Bossart <pierre-louis.bossart@linux.intel.com>
Date: Tue, 17 Oct 2023 11:09:32 -0500
Subject: soundwire: bus: introduce controller_id

The existing SoundWire support misses a clear Controller/Manager
hiearchical definition to deal with all variants across SOC vendors.

a) Intel platforms have one controller with 4 or more Managers.
b) AMD platforms have two controllers with one Manager each, but due
to BIOS issues use two different link_id values within the scope of a
single controller.
c) QCOM platforms have one or more controller with one Manager each.

This patch adds a 'controller_id' which can be set by higher
levels. If assigned to -1, the controller_id will be set to the
system-unique IDA-assigned bus->id.

The main change is that the bus->id is no longer used for any device
name, which makes the definition completely predictable and not
dependent on any enumeration order. The bus->id is only used to insert
the Managers in the stream rt context.

Reviewed-by: Bard Liao <yung-chuan.liao@linux.intel.com>
Reviewed-by: Vijendar Mukunda <Vijendar.Mukunda@amd.com>
Signed-off-by: Pierre-Louis Bossart <pierre-louis.bossart@linux.intel.com>
Reviewed-by: Krzysztof Kozlowski <krzysztof.kozlowski@linaro.org>
Tested-by: Krzysztof Kozlowski <krzysztof.kozlowski@linaro.org>
Signed-off-by: Srinivas Kandagatla <srinivas.kandagatla@linaro.org>
Link: https://lore.kernel.org/stable/20231017160933.12624-2-pierre-louis.bossart%40linux.intel.com
Tested-by: Srinivas Kandagatla <srinivas.kandagatla@linaro.org>
Link: https://lore.kernel.org/r/20231017160933.12624-2-pierre-louis.bossart@linux.intel.com
Signed-off-by: Vinod Koul <vkoul@kernel.org>
---
 drivers/soundwire/amd_manager.c     | 8 ++++++++
 drivers/soundwire/bus.c             | 4 ++++
 drivers/soundwire/debugfs.c         | 2 +-
 drivers/soundwire/intel_auxdevice.c | 3 +++
 drivers/soundwire/master.c          | 2 +-
 drivers/soundwire/qcom.c            | 3 +++
 include/linux/soundwire/sdw.h       | 4 +++-
 7 files changed, 23 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/soundwire/amd_manager.c b/drivers/soundwire/amd_manager.c
index 3a99f6dcdfaf..a3b1f4e6f0f9 100644
--- a/drivers/soundwire/amd_manager.c
+++ b/drivers/soundwire/amd_manager.c
@@ -927,6 +927,14 @@ static int amd_sdw_manager_probe(struct platform_device *pdev)
 	amd_manager->bus.clk_stop_timeout = 200;
 	amd_manager->bus.link_id = amd_manager->instance;
 
+	/*
+	 * Due to BIOS compatibility, the two links are exposed within
+	 * the scope of a single controller. If this changes, the
+	 * controller_id will have to be updated with drv_data
+	 * information.
+	 */
+	amd_manager->bus.controller_id = 0;
+
 	switch (amd_manager->instance) {
 	case ACP_SDW0:
 		amd_manager->num_dout_ports = AMD_SDW0_MAX_TX_PORTS;
diff --git a/drivers/soundwire/bus.c b/drivers/soundwire/bus.c
index 41b0d9adf68e..f3fec15c3112 100644
--- a/drivers/soundwire/bus.c
+++ b/drivers/soundwire/bus.c
@@ -22,6 +22,10 @@ static int sdw_get_id(struct sdw_bus *bus)
 		return rc;
 
 	bus->id = rc;
+
+	if (bus->controller_id == -1)
+		bus->controller_id = rc;
+
 	return 0;
 }
 
diff --git a/drivers/soundwire/debugfs.c b/drivers/soundwire/debugfs.c
index d1553cb77187..67abd7e52f09 100644
--- a/drivers/soundwire/debugfs.c
+++ b/drivers/soundwire/debugfs.c
@@ -20,7 +20,7 @@ void sdw_bus_debugfs_init(struct sdw_bus *bus)
 		return;
 
 	/* create the debugfs master-N */
-	snprintf(name, sizeof(name), "master-%d-%d", bus->id, bus->link_id);
+	snprintf(name, sizeof(name), "master-%d-%d", bus->controller_id, bus->link_id);
 	bus->debugfs = debugfs_create_dir(name, sdw_debugfs_root);
 }
 
diff --git a/drivers/soundwire/intel_auxdevice.c b/drivers/soundwire/intel_auxdevice.c
index 7f15e3549e53..93698532deac 100644
--- a/drivers/soundwire/intel_auxdevice.c
+++ b/drivers/soundwire/intel_auxdevice.c
@@ -234,6 +234,9 @@ static int intel_link_probe(struct auxiliary_device *auxdev,
 	cdns->instance = sdw->instance;
 	cdns->msg_count = 0;
 
+	/* single controller for all SoundWire links */
+	bus->controller_id = 0;
+
 	bus->link_id = auxdev->id;
 	bus->clk_stop_timeout = 1;
 
diff --git a/drivers/soundwire/master.c b/drivers/soundwire/master.c
index 9b05c9e25ebe..51abedbbaa66 100644
--- a/drivers/soundwire/master.c
+++ b/drivers/soundwire/master.c
@@ -145,7 +145,7 @@ int sdw_master_device_add(struct sdw_bus *bus, struct device *parent,
 	md->dev.fwnode = fwnode;
 	md->dev.dma_mask = parent->dma_mask;
 
-	dev_set_name(&md->dev, "sdw-master-%d", bus->id);
+	dev_set_name(&md->dev, "sdw-master-%d-%d", bus->controller_id, bus->link_id);
 
 	ret = device_register(&md->dev);
 	if (ret) {
diff --git a/drivers/soundwire/qcom.c b/drivers/soundwire/qcom.c
index e9a52c1bd359..7f8549aab42f 100644
--- a/drivers/soundwire/qcom.c
+++ b/drivers/soundwire/qcom.c
@@ -1620,6 +1620,9 @@ static int qcom_swrm_probe(struct platform_device *pdev)
 		}
 	}
 
+	/* FIXME: is there a DT-defined value to use ? */
+	ctrl->bus.controller_id = -1;
+
 	ret = sdw_bus_master_add(&ctrl->bus, dev, dev->fwnode);
 	if (ret) {
 		dev_err(dev, "Failed to register Soundwire controller (%d)\n",
diff --git a/include/linux/soundwire/sdw.h b/include/linux/soundwire/sdw.h
index 904004d8b562..66f814b63a43 100644
--- a/include/linux/soundwire/sdw.h
+++ b/include/linux/soundwire/sdw.h
@@ -886,7 +886,8 @@ struct sdw_master_ops {
  * struct sdw_bus - SoundWire bus
  * @dev: Shortcut to &bus->md->dev to avoid changing the entire code.
  * @md: Master device
- * @link_id: Link id number, can be 0 to N, unique for each Master
+ * @controller_id: system-unique controller ID. If set to -1, the bus @id will be used.
+ * @link_id: Link id number, can be 0 to N, unique for each Controller
  * @id: bus system-wide unique id
  * @slaves: list of Slaves on this bus
  * @assigned: Bitmap for Slave device numbers.
@@ -918,6 +919,7 @@ struct sdw_master_ops {
 struct sdw_bus {
 	struct device *dev;
 	struct sdw_master_device *md;
+	int controller_id;
 	unsigned int link_id;
 	int id;
 	struct list_head slaves;
-- 
cgit v1.2.3


From 8802e580ee643e3f63c6b39ff64e7c7baa4a55ba Mon Sep 17 00:00:00 2001
From: Amir Goldstein <amir73il@gmail.com>
Date: Wed, 22 Nov 2023 14:27:13 +0200
Subject: fs: create __sb_write_started() helper

Similar to sb_write_started() for use by other sb freeze levels.

Unlike the boolean sb_write_started(), this helper returns a tristate
to distiguish the cases of lockdep disabled or unknown lock state.

This is needed for fanotify "pre content" events.

Signed-off-by: Amir Goldstein <amir73il@gmail.com>
Link: https://lore.kernel.org/r/20231122122715.2561213-15-amir73il@gmail.com
Reviewed-by: Josef Bacik <josef@toxicpanda.com>
Reviewed-by: Jan Kara <jack@suse.cz>
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 include/linux/fs.h | 16 +++++++++++++++-
 1 file changed, 15 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/fs.h b/include/linux/fs.h
index 98b7a7a8c42e..ac8b5a9b467b 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1645,9 +1645,23 @@ static inline bool __sb_start_write_trylock(struct super_block *sb, int level)
 #define __sb_writers_release(sb, lev)	\
 	percpu_rwsem_release(&(sb)->s_writers.rw_sem[(lev)-1], 1, _THIS_IP_)
 
+/**
+ * __sb_write_started - check if sb freeze level is held
+ * @sb: the super we write to
+ * @level: the freeze level
+ *
+ * > 0 sb freeze level is held
+ *   0 sb freeze level is not held
+ * < 0 !CONFIG_LOCKDEP/LOCK_STATE_UNKNOWN
+ */
+static inline int __sb_write_started(const struct super_block *sb, int level)
+{
+	return lockdep_is_held_type(sb->s_writers.rw_sem + level - 1, 1);
+}
+
 static inline bool sb_write_started(const struct super_block *sb)
 {
-	return lockdep_is_held_type(sb->s_writers.rw_sem + SB_FREEZE_WRITE - 1, 1);
+	return __sb_write_started(sb, SB_FREEZE_WRITE);
 }
 
 /**
-- 
cgit v1.2.3


From 3d5cd4911e04683df8f4439fddd788e00a2510a8 Mon Sep 17 00:00:00 2001
From: Amir Goldstein <amir73il@gmail.com>
Date: Wed, 22 Nov 2023 14:27:14 +0200
Subject: fs: create file_write_started() helper

Convenience wrapper for sb_write_started(file_inode(inode)->i_sb)), which
has a single occurrence in the code right now.

Document the false negatives of those helpers, which makes them unusable
to assert that sb_start_write() is not held.

Signed-off-by: Amir Goldstein <amir73il@gmail.com>
Link: https://lore.kernel.org/r/20231122122715.2561213-16-amir73il@gmail.com
Reviewed-by: Josef Bacik <josef@toxicpanda.com>
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 fs/read_write.c    |  2 +-
 include/linux/fs.h | 21 +++++++++++++++++++++
 2 files changed, 22 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/fs/read_write.c b/fs/read_write.c
index 92c68ab4f221..f791555fa246 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -1423,7 +1423,7 @@ ssize_t generic_copy_file_range(struct file *file_in, loff_t pos_in,
 				struct file *file_out, loff_t pos_out,
 				size_t len, unsigned int flags)
 {
-	lockdep_assert(sb_write_started(file_inode(file_out)->i_sb));
+	lockdep_assert(file_write_started(file_out));
 
 	return do_splice_direct(file_in, &pos_in, file_out, &pos_out,
 				len > MAX_RW_COUNT ? MAX_RW_COUNT : len, 0);
diff --git a/include/linux/fs.h b/include/linux/fs.h
index ac8b5a9b467b..75a10b632edd 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1659,11 +1659,32 @@ static inline int __sb_write_started(const struct super_block *sb, int level)
 	return lockdep_is_held_type(sb->s_writers.rw_sem + level - 1, 1);
 }
 
+/**
+ * sb_write_started - check if SB_FREEZE_WRITE is held
+ * @sb: the super we write to
+ *
+ * May be false positive with !CONFIG_LOCKDEP/LOCK_STATE_UNKNOWN.
+ */
 static inline bool sb_write_started(const struct super_block *sb)
 {
 	return __sb_write_started(sb, SB_FREEZE_WRITE);
 }
 
+/**
+ * file_write_started - check if SB_FREEZE_WRITE is held
+ * @file: the file we write to
+ *
+ * May be false positive with !CONFIG_LOCKDEP/LOCK_STATE_UNKNOWN.
+ * May be false positive with !S_ISREG, because file_start_write() has
+ * no effect on !S_ISREG.
+ */
+static inline bool file_write_started(const struct file *file)
+{
+	if (!S_ISREG(file_inode(file)->i_mode))
+		return true;
+	return sb_write_started(file_inode(file)->i_sb);
+}
+
 /**
  * sb_end_write - drop write access to a superblock
  * @sb: the super we wrote to
-- 
cgit v1.2.3


From 21b32e6a0ab5b174fa1ca2fb4c212577cf405d83 Mon Sep 17 00:00:00 2001
From: Amir Goldstein <amir73il@gmail.com>
Date: Wed, 22 Nov 2023 14:27:15 +0200
Subject: fs: create {sb,file}_write_not_started() helpers

Create new helpers {sb,file}_write_not_started() that can be used
to assert that sb_start_write() is not held.

This is needed for fanotify "pre content" events.

Signed-off-by: Amir Goldstein <amir73il@gmail.com>
Link: https://lore.kernel.org/r/20231122122715.2561213-17-amir73il@gmail.com
Reviewed-by: Josef Bacik <josef@toxicpanda.com>
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 include/linux/fs.h | 26 ++++++++++++++++++++++++++
 1 file changed, 26 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/fs.h b/include/linux/fs.h
index 75a10b632edd..ae0e2fb7bcea 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1670,6 +1670,17 @@ static inline bool sb_write_started(const struct super_block *sb)
 	return __sb_write_started(sb, SB_FREEZE_WRITE);
 }
 
+/**
+ * sb_write_not_started - check if SB_FREEZE_WRITE is not held
+ * @sb: the super we write to
+ *
+ * May be false positive with !CONFIG_LOCKDEP/LOCK_STATE_UNKNOWN.
+ */
+static inline bool sb_write_not_started(const struct super_block *sb)
+{
+	return __sb_write_started(sb, SB_FREEZE_WRITE) <= 0;
+}
+
 /**
  * file_write_started - check if SB_FREEZE_WRITE is held
  * @file: the file we write to
@@ -1685,6 +1696,21 @@ static inline bool file_write_started(const struct file *file)
 	return sb_write_started(file_inode(file)->i_sb);
 }
 
+/**
+ * file_write_not_started - check if SB_FREEZE_WRITE is not held
+ * @file: the file we write to
+ *
+ * May be false positive with !CONFIG_LOCKDEP/LOCK_STATE_UNKNOWN.
+ * May be false positive with !S_ISREG, because file_start_write() has
+ * no effect on !S_ISREG.
+ */
+static inline bool file_write_not_started(const struct file *file)
+{
+	if (!S_ISREG(file_inode(file)->i_mode))
+		return true;
+	return sb_write_not_started(file_inode(file)->i_sb);
+}
+
 /**
  * sb_end_write - drop write access to a superblock
  * @sb: the super we wrote to
-- 
cgit v1.2.3


From 2afae08c9dcb8ac648414277cec70c2fe6a34d9e Mon Sep 17 00:00:00 2001
From: Andrii Nakryiko <andrii@kernel.org>
Date: Thu, 23 Nov 2023 19:59:36 -0800
Subject: bpf: Validate global subprogs lazily

Slightly change BPF verifier logic around eagerness and order of global
subprog validation. Instead of going over every global subprog eagerly
and validating it before main (entry) BPF program is verified, turn it
around. Validate main program first, mark subprogs that were called from
main program for later verification, but otherwise assume it is valid.
Afterwards, go over marked global subprogs and validate those,
potentially marking some more global functions as being called. Continue
this process until all (transitively) callable global subprogs are
validated. It's a BFS traversal at its heart and will always converge.

This is an important change because it allows to feature-gate some
subprograms that might not be verifiable on some older kernel, depending
on supported set of features.

E.g., at some point, global functions were allowed to accept a pointer
to memory, which size is identified by user-provided type.
Unfortunately, older kernels don't support this feature. With BPF CO-RE
approach, the natural way would be to still compile BPF object file once
and guard calls to this global subprog with some CO-RE check or using
.rodata variables. That's what people do to guard usage of new helpers
or kfuncs, and any other new BPF-side feature that might be missing on
old kernels.

That's currently impossible to do with global subprogs, unfortunately,
because they are eagerly and unconditionally validated. This patch set
aims to change this, so that in the future when global funcs gain new
features, those can be guarded using BPF CO-RE techniques in the same
fashion as any other new kernel feature.

Two selftests had to be adjusted in sync with these changes.

test_global_func12 relied on eager global subprog validation failing
before main program failure is detected (unknown return value). Fix by
making sure that main program is always valid.

verifier_subprog_precision's parent_stack_slot_precise subtest relied on
verifier checkpointing heuristic to do a checkpoint at instruction #5,
but that's no longer true because we don't have enough jumps validated
before reaching insn #5 due to global subprogs being validated later.

Other than that, no changes, as one would expect.

Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Eduard Zingerman <eddyz87@gmail.com>
Acked-by: Daniel Borkmann <daniel@iogearbox.net>
Link: https://lore.kernel.org/bpf/20231124035937.403208-3-andrii@kernel.org
---
 include/linux/bpf.h                                |  2 +
 kernel/bpf/verifier.c                              | 48 +++++++++++++++++++---
 .../selftests/bpf/progs/test_global_func12.c       |  4 +-
 .../bpf/progs/verifier_subprog_precision.c         |  4 +-
 4 files changed, 48 insertions(+), 10 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 258ba232e302..eb447b0a9423 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -1347,6 +1347,8 @@ static inline bool bpf_prog_has_trampoline(const struct bpf_prog *prog)
 struct bpf_func_info_aux {
 	u16 linkage;
 	bool unreliable;
+	bool called : 1;
+	bool verified : 1;
 };
 
 enum bpf_jit_poke_reason {
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index a2939ebf2638..8e7b6072e3f4 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -434,6 +434,11 @@ static const char *subprog_name(const struct bpf_verifier_env *env, int subprog)
 	return btf_type_name(env->prog->aux->btf, info->type_id);
 }
 
+static struct bpf_func_info_aux *subprog_aux(const struct bpf_verifier_env *env, int subprog)
+{
+	return &env->prog->aux->func_info_aux[subprog];
+}
+
 static bool reg_may_point_to_spin_lock(const struct bpf_reg_state *reg)
 {
 	return btf_record_has_field(reg_btf_record(reg), BPF_SPIN_LOCK);
@@ -9290,6 +9295,8 @@ static int check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
 
 		verbose(env, "Func#%d ('%s') is global and assumed valid.\n",
 			subprog, sub_name);
+		/* mark global subprog for verifying after main prog */
+		subprog_aux(env, subprog)->called = true;
 		clear_caller_saved_regs(env, caller->regs);
 
 		/* All global functions return a 64-bit SCALAR_VALUE */
@@ -19873,8 +19880,11 @@ out:
 	return ret;
 }
 
-/* Verify all global functions in a BPF program one by one based on their BTF.
- * All global functions must pass verification. Otherwise the whole program is rejected.
+/* Lazily verify all global functions based on their BTF, if they are called
+ * from main BPF program or any of subprograms transitively.
+ * BPF global subprogs called from dead code are not validated.
+ * All callable global functions must pass verification.
+ * Otherwise the whole program is rejected.
  * Consider:
  * int bar(int);
  * int foo(int f)
@@ -19893,14 +19903,26 @@ out:
 static int do_check_subprogs(struct bpf_verifier_env *env)
 {
 	struct bpf_prog_aux *aux = env->prog->aux;
-	int i, ret;
+	struct bpf_func_info_aux *sub_aux;
+	int i, ret, new_cnt;
 
 	if (!aux->func_info)
 		return 0;
 
+	/* exception callback is presumed to be always called */
+	if (env->exception_callback_subprog)
+		subprog_aux(env, env->exception_callback_subprog)->called = true;
+
+again:
+	new_cnt = 0;
 	for (i = 1; i < env->subprog_cnt; i++) {
-		if (aux->func_info_aux[i].linkage != BTF_FUNC_GLOBAL)
+		if (!subprog_is_global(env, i))
+			continue;
+
+		sub_aux = subprog_aux(env, i);
+		if (!sub_aux->called || sub_aux->verified)
 			continue;
+
 		env->insn_idx = env->subprog_info[i].start;
 		WARN_ON_ONCE(env->insn_idx == 0);
 		ret = do_check_common(env, i, env->exception_callback_subprog == i);
@@ -19910,7 +19932,21 @@ static int do_check_subprogs(struct bpf_verifier_env *env)
 			verbose(env, "Func#%d ('%s') is safe for any args that match its prototype\n",
 				i, subprog_name(env, i));
 		}
+
+		/* We verified new global subprog, it might have called some
+		 * more global subprogs that we haven't verified yet, so we
+		 * need to do another pass over subprogs to verify those.
+		 */
+		sub_aux->verified = true;
+		new_cnt++;
 	}
+
+	/* We can't loop forever as we verify at least one global subprog on
+	 * each pass.
+	 */
+	if (new_cnt)
+		goto again;
+
 	return 0;
 }
 
@@ -20556,8 +20592,8 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, bpfptr_t uattr, __u3
 	if (ret < 0)
 		goto skip_full_check;
 
-	ret = do_check_subprogs(env);
-	ret = ret ?: do_check_main(env);
+	ret = do_check_main(env);
+	ret = ret ?: do_check_subprogs(env);
 
 	if (ret == 0 && bpf_prog_is_offloaded(env->prog->aux))
 		ret = bpf_prog_offload_finalize(env);
diff --git a/tools/testing/selftests/bpf/progs/test_global_func12.c b/tools/testing/selftests/bpf/progs/test_global_func12.c
index 7f159d83c6f6..6e03d42519a6 100644
--- a/tools/testing/selftests/bpf/progs/test_global_func12.c
+++ b/tools/testing/selftests/bpf/progs/test_global_func12.c
@@ -19,5 +19,7 @@ int global_func12(struct __sk_buff *skb)
 {
 	const struct S s = {.x = skb->len };
 
-	return foo(&s);
+	foo(&s);
+
+	return 1;
 }
diff --git a/tools/testing/selftests/bpf/progs/verifier_subprog_precision.c b/tools/testing/selftests/bpf/progs/verifier_subprog_precision.c
index f61d623b1ce8..b5efcaeaa1ae 100644
--- a/tools/testing/selftests/bpf/progs/verifier_subprog_precision.c
+++ b/tools/testing/selftests/bpf/progs/verifier_subprog_precision.c
@@ -370,12 +370,10 @@ __naked int parent_stack_slot_precise(void)
 SEC("?raw_tp")
 __success __log_level(2)
 __msg("9: (0f) r1 += r6")
-__msg("mark_precise: frame0: last_idx 9 first_idx 6")
+__msg("mark_precise: frame0: last_idx 9 first_idx 0")
 __msg("mark_precise: frame0: regs=r6 stack= before 8: (bf) r1 = r7")
 __msg("mark_precise: frame0: regs=r6 stack= before 7: (27) r6 *= 4")
 __msg("mark_precise: frame0: regs=r6 stack= before 6: (79) r6 = *(u64 *)(r10 -8)")
-__msg("mark_precise: frame0: parent state regs= stack=-8:")
-__msg("mark_precise: frame0: last_idx 5 first_idx 0")
 __msg("mark_precise: frame0: regs= stack=-8 before 5: (85) call pc+6")
 __msg("mark_precise: frame0: regs= stack=-8 before 4: (b7) r1 = 0")
 __msg("mark_precise: frame0: regs= stack=-8 before 3: (7b) *(u64 *)(r10 -8) = r6")
-- 
cgit v1.2.3


From ef5828805842204dd0259ecfc132b5916c8a77ae Mon Sep 17 00:00:00 2001
From: Michael-CY Lee <michael-cy.lee@mediatek.com>
Date: Wed, 22 Nov 2023 11:02:37 +0800
Subject: wifi: avoid offset calculation on NULL pointer

ieee80211_he_6ghz_oper() can be passed a NULL pointer
and checks for that, but already did the calculation
to inside of it before. Move it after the check.

Signed-off-by: Michael-CY Lee <michael-cy.lee@mediatek.com>
Link: https://lore.kernel.org/r/20231122030237.31276-1-michael-cy.lee@mediatek.com
[rewrite commit message]
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/linux/ieee80211.h | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h
index 958771bac9c0..c2ac9e9e7ee9 100644
--- a/include/linux/ieee80211.h
+++ b/include/linux/ieee80211.h
@@ -2830,12 +2830,14 @@ ieee80211_he_oper_size(const u8 *he_oper_ie)
 static inline const struct ieee80211_he_6ghz_oper *
 ieee80211_he_6ghz_oper(const struct ieee80211_he_operation *he_oper)
 {
-	const u8 *ret = (const void *)&he_oper->optional;
+	const u8 *ret;
 	u32 he_oper_params;
 
 	if (!he_oper)
 		return NULL;
 
+	ret = (const void *)&he_oper->optional;
+
 	he_oper_params = le32_to_cpu(he_oper->he_oper_params);
 
 	if (!(he_oper_params & IEEE80211_HE_OPERATION_6GHZ_OP_INFO))
-- 
cgit v1.2.3


From d3ca4ab4f16eb81dc3e7721251adcba49b229d54 Mon Sep 17 00:00:00 2001
From: Liam Kearney <liam.kearney@morsemicro.com>
Date: Wed, 25 Oct 2023 11:27:55 +1100
Subject: wifi: ieee80211: fix PV1 frame control field name

Update PV1 frame control field TODS to FROMDS to match 802.11 standard

Signed-off-by: Liam Kearney <liam.kearney@morsemicro.com>
Reviewed-by: Jeff Johnson <quic_jjohnson@quicinc.com>
Link: https://lore.kernel.org/r/20231025002755.1752983-1-liam.kearney@morsemicro.com
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/linux/ieee80211.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h
index 958771bac9c0..5e5ea216f341 100644
--- a/include/linux/ieee80211.h
+++ b/include/linux/ieee80211.h
@@ -172,11 +172,11 @@
 #define IEEE80211_SN_MODULO		(IEEE80211_MAX_SN + 1)
 
 
-/* PV1 Layout 11ah 9.8.3.1 */
+/* PV1 Layout IEEE 802.11-2020 9.8.3.1 */
 #define IEEE80211_PV1_FCTL_VERS		0x0003
 #define IEEE80211_PV1_FCTL_FTYPE	0x001c
 #define IEEE80211_PV1_FCTL_STYPE	0x00e0
-#define IEEE80211_PV1_FCTL_TODS		0x0100
+#define IEEE80211_PV1_FCTL_FROMDS		0x0100
 #define IEEE80211_PV1_FCTL_MOREFRAGS	0x0200
 #define IEEE80211_PV1_FCTL_PM		0x0400
 #define IEEE80211_PV1_FCTL_MOREDATA	0x0800
-- 
cgit v1.2.3


From d1f7728259ef02ac20b7afb6e7eb5a9eb1696c25 Mon Sep 17 00:00:00 2001
From: Bartosz Golaszewski <bartosz.golaszewski@linaro.org>
Date: Wed, 15 Nov 2023 17:49:59 +0100
Subject: gpiolib: provide gpio_device_get_label()

Provide a getter for the GPIO device label string so that users don't
have to dereference struct gpio_chip directly.

Signed-off-by: Bartosz Golaszewski <bartosz.golaszewski@linaro.org>
---
 drivers/gpio/gpiolib.c      | 14 ++++++++++++++
 include/linux/gpio/driver.h |  1 +
 2 files changed, 15 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/gpio/gpiolib.c b/drivers/gpio/gpiolib.c
index 95d2a7b2ea3e..a5faaea6915d 100644
--- a/drivers/gpio/gpiolib.c
+++ b/drivers/gpio/gpiolib.c
@@ -254,6 +254,20 @@ int gpio_device_get_base(struct gpio_device *gdev)
 }
 EXPORT_SYMBOL_GPL(gpio_device_get_base);
 
+/**
+ * gpio_device_get_label() - Get the label of this GPIO device
+ * @gdev: GPIO device
+ *
+ * Returns:
+ * Pointer to the string containing the GPIO device label. The string's
+ * lifetime is tied to that of the underlying GPIO device.
+ */
+const char *gpio_device_get_label(struct gpio_device *gdev)
+{
+	return gdev->label;
+}
+EXPORT_SYMBOL(gpio_device_get_label);
+
 /**
  * gpio_device_get_chip() - Get the gpio_chip implementation of this GPIO device
  * @gdev: GPIO device
diff --git a/include/linux/gpio/driver.h b/include/linux/gpio/driver.h
index 0aed62f0c633..100c329dc986 100644
--- a/include/linux/gpio/driver.h
+++ b/include/linux/gpio/driver.h
@@ -786,6 +786,7 @@ struct gpio_device *gpiod_to_gpio_device(struct gpio_desc *desc);
 
 /* struct gpio_device getters */
 int gpio_device_get_base(struct gpio_device *gdev);
+const char *gpio_device_get_label(struct gpio_device *gdev);
 
 #else /* CONFIG_GPIOLIB */
 
-- 
cgit v1.2.3


From a066f906ba396ab00d4af19fc5fad42b2605582a Mon Sep 17 00:00:00 2001
From: Kory Maincent <kory.maincent@bootlin.com>
Date: Wed, 22 Nov 2023 14:52:43 +0100
Subject: firmware_loader: Expand Firmware upload error codes with firmware
 invalid error

No error code are available to signal an invalid firmware content.
Drivers that can check the firmware content validity can not return this
specific failure to the user-space

Expand the firmware error code with an additional code:
- "firmware invalid" code which can be used when the provided firmware
  is invalid

Sync lib/test_firmware.c file accordingly.

Acked-by: Luis Chamberlain <mcgrof@kernel.org>
Acked-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Signed-off-by: Kory Maincent <kory.maincent@bootlin.com>
Reviewed-by: Simon Horman <horms@kernel.org>
Link: https://lore.kernel.org/r/20231122-feature_firmware_error_code-v3-1-04ec753afb71@bootlin.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 drivers/base/firmware_loader/sysfs_upload.c | 1 +
 include/linux/firmware.h                    | 2 ++
 lib/test_firmware.c                         | 1 +
 3 files changed, 4 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/base/firmware_loader/sysfs_upload.c b/drivers/base/firmware_loader/sysfs_upload.c
index a0af8f5f13d8..829270067d16 100644
--- a/drivers/base/firmware_loader/sysfs_upload.c
+++ b/drivers/base/firmware_loader/sysfs_upload.c
@@ -27,6 +27,7 @@ static const char * const fw_upload_err_str[] = {
 	[FW_UPLOAD_ERR_INVALID_SIZE] = "invalid-file-size",
 	[FW_UPLOAD_ERR_RW_ERROR]     = "read-write-error",
 	[FW_UPLOAD_ERR_WEAROUT]	     = "flash-wearout",
+	[FW_UPLOAD_ERR_FW_INVALID]   = "firmware-invalid",
 };
 
 static const char *fw_upload_progress(struct device *dev,
diff --git a/include/linux/firmware.h b/include/linux/firmware.h
index de7fea3bca51..0311858b46ce 100644
--- a/include/linux/firmware.h
+++ b/include/linux/firmware.h
@@ -27,6 +27,7 @@ struct firmware {
  * @FW_UPLOAD_ERR_INVALID_SIZE: invalid firmware image size
  * @FW_UPLOAD_ERR_RW_ERROR: read or write to HW failed, see kernel log
  * @FW_UPLOAD_ERR_WEAROUT: FLASH device is approaching wear-out, wait & retry
+ * @FW_UPLOAD_ERR_FW_INVALID: invalid firmware file
  * @FW_UPLOAD_ERR_MAX: Maximum error code marker
  */
 enum fw_upload_err {
@@ -38,6 +39,7 @@ enum fw_upload_err {
 	FW_UPLOAD_ERR_INVALID_SIZE,
 	FW_UPLOAD_ERR_RW_ERROR,
 	FW_UPLOAD_ERR_WEAROUT,
+	FW_UPLOAD_ERR_FW_INVALID,
 	FW_UPLOAD_ERR_MAX
 };
 
diff --git a/lib/test_firmware.c b/lib/test_firmware.c
index add4699fc6cd..9cfdcd6d21db 100644
--- a/lib/test_firmware.c
+++ b/lib/test_firmware.c
@@ -1132,6 +1132,7 @@ static const char * const fw_upload_err_str[] = {
 	[FW_UPLOAD_ERR_INVALID_SIZE] = "invalid-file-size",
 	[FW_UPLOAD_ERR_RW_ERROR]     = "read-write-error",
 	[FW_UPLOAD_ERR_WEAROUT]	     = "flash-wearout",
+	[FW_UPLOAD_ERR_FW_INVALID]   = "firmware-invalid",
 };
 
 static void upload_err_inject_error(struct test_firmware_upload *tst,
-- 
cgit v1.2.3


From b286f4e87e325b76789f30337c98ba72e00532e2 Mon Sep 17 00:00:00 2001
From: Tony Lindgren <tony@atomide.com>
Date: Mon, 13 Nov 2023 10:07:52 +0200
Subject: serial: core: Move tty and serdev to be children of serial core port
 device

Let's move tty and serdev controller to be children of the serial core port
device. This way the runtime PM usage count of a child device propagates
to the serial hardware device.

The tty and serdev devices are associated with a specific serial port of
a serial hardware controller device, and we now have serial core hierarchy
of controllers and ports.

The tty device moves happily with just a change of the parent device and
update of device_find_child() handling. The serdev device init needs some
changes to separate the serial hardware controller device from the parent
device.

With this change the tty devices move under sysfs similar to this x86_64
qemu example of a diff of "find /sys -name ttyS*":

 /sys/class/tty/ttyS0
 /sys/class/tty/ttyS3
 /sys/class/tty/ttyS1
-/sys/devices/pnp0/00:04/tty/ttyS0
-/sys/devices/platform/serial8250/tty/ttyS2
-/sys/devices/platform/serial8250/tty/ttyS3
-/sys/devices/platform/serial8250/tty/ttyS1
+/sys/devices/pnp0/00:04/00:04:0/00:04:0.0/tty/ttyS0
+/sys/devices/platform/serial8250/serial8250:0/serial8250:0.3/tty/ttyS3
+/sys/devices/platform/serial8250/serial8250:0/serial8250:0.1/tty/ttyS1
+/sys/devices/platform/serial8250/serial8250:0/serial8250:0.2/tty/ttyS2

If a serdev device is used instead of a tty, it moves in a similar way.

Suggested-by: Johan Hovold <johan@kernel.org>
Cc: Maximilian Luz <luzmaximilian@gmail.com>
Cc: Rob Herring <robh@kernel.org>
Signed-off-by: Tony Lindgren <tony@atomide.com>
Link: https://lore.kernel.org/r/20231113080758.30346-1-tony@atomide.com
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/tty/serdev/core.c           | 15 +++++++++------
 drivers/tty/serdev/serdev-ttyport.c |  3 ++-
 drivers/tty/serial/serial_core.c    |  7 ++++---
 drivers/tty/tty_port.c              | 16 +++++++++-------
 include/linux/serdev.h              |  8 +++++++-
 include/linux/tty_port.h            |  4 ++--
 6 files changed, 33 insertions(+), 20 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/tty/serdev/core.c b/drivers/tty/serdev/core.c
index a5fdaf5e148e..3090e3454c44 100644
--- a/drivers/tty/serdev/core.c
+++ b/drivers/tty/serdev/core.c
@@ -468,6 +468,7 @@ EXPORT_SYMBOL_GPL(serdev_device_alloc);
 
 /**
  * serdev_controller_alloc() - Allocate a new serdev controller
+ * @host:	serial port hardware controller device
  * @parent:	parent device
  * @size:	size of private data
  *
@@ -476,8 +477,9 @@ EXPORT_SYMBOL_GPL(serdev_device_alloc);
  * The allocated private data region may be accessed via
  * serdev_controller_get_drvdata()
  */
-struct serdev_controller *serdev_controller_alloc(struct device *parent,
-					      size_t size)
+struct serdev_controller *serdev_controller_alloc(struct device *host,
+						  struct device *parent,
+						  size_t size)
 {
 	struct serdev_controller *ctrl;
 	int id;
@@ -502,7 +504,8 @@ struct serdev_controller *serdev_controller_alloc(struct device *parent,
 	ctrl->dev.type = &serdev_ctrl_type;
 	ctrl->dev.bus = &serdev_bus_type;
 	ctrl->dev.parent = parent;
-	device_set_node(&ctrl->dev, dev_fwnode(parent));
+	ctrl->host = host;
+	device_set_node(&ctrl->dev, dev_fwnode(host));
 	serdev_controller_set_drvdata(ctrl, &ctrl[1]);
 
 	dev_set_name(&ctrl->dev, "serial%d", id);
@@ -665,7 +668,7 @@ static int acpi_serdev_check_resources(struct serdev_controller *ctrl,
 		acpi_get_parent(adev->handle, &lookup.controller_handle);
 
 	/* Make sure controller and ResourceSource handle match */
-	if (!device_match_acpi_handle(ctrl->dev.parent, lookup.controller_handle))
+	if (!device_match_acpi_handle(ctrl->host, lookup.controller_handle))
 		return -ENODEV;
 
 	return 0;
@@ -730,7 +733,7 @@ static int acpi_serdev_register_devices(struct serdev_controller *ctrl)
 	bool skip;
 	int ret;
 
-	if (!has_acpi_companion(ctrl->dev.parent))
+	if (!has_acpi_companion(ctrl->host))
 		return -ENODEV;
 
 	/*
@@ -739,7 +742,7 @@ static int acpi_serdev_register_devices(struct serdev_controller *ctrl)
 	 * succeed in this case, so that the proper serdev devices can be
 	 * added "manually" later.
 	 */
-	ret = acpi_quirk_skip_serdev_enumeration(ctrl->dev.parent, &skip);
+	ret = acpi_quirk_skip_serdev_enumeration(ctrl->host, &skip);
 	if (ret)
 		return ret;
 	if (skip)
diff --git a/drivers/tty/serdev/serdev-ttyport.c b/drivers/tty/serdev/serdev-ttyport.c
index e3856814ce77..c5381fe15e1c 100644
--- a/drivers/tty/serdev/serdev-ttyport.c
+++ b/drivers/tty/serdev/serdev-ttyport.c
@@ -274,6 +274,7 @@ static const struct serdev_controller_ops ctrl_ops = {
 };
 
 struct device *serdev_tty_port_register(struct tty_port *port,
+					struct device *host,
 					struct device *parent,
 					struct tty_driver *drv, int idx)
 {
@@ -284,7 +285,7 @@ struct device *serdev_tty_port_register(struct tty_port *port,
 	if (!port || !drv || !parent)
 		return ERR_PTR(-ENODEV);
 
-	ctrl = serdev_controller_alloc(parent, sizeof(struct serport));
+	ctrl = serdev_controller_alloc(host, parent, sizeof(struct serport));
 	if (!ctrl)
 		return ERR_PTR(-ENOMEM);
 	serport = serdev_controller_get_drvdata(ctrl);
diff --git a/drivers/tty/serial/serial_core.c b/drivers/tty/serial/serial_core.c
index f1348a509552..f36ea953e2c9 100644
--- a/drivers/tty/serial/serial_core.c
+++ b/drivers/tty/serial/serial_core.c
@@ -2342,7 +2342,7 @@ int uart_suspend_port(struct uart_driver *drv, struct uart_port *uport)
 
 	mutex_lock(&port->mutex);
 
-	tty_dev = device_find_child(uport->dev, &match, serial_match_port);
+	tty_dev = device_find_child(&uport->port_dev->dev, &match, serial_match_port);
 	if (tty_dev && device_may_wakeup(tty_dev)) {
 		enable_irq_wake(uport->irq);
 		put_device(tty_dev);
@@ -2423,7 +2423,7 @@ int uart_resume_port(struct uart_driver *drv, struct uart_port *uport)
 
 	mutex_lock(&port->mutex);
 
-	tty_dev = device_find_child(uport->dev, &match, serial_match_port);
+	tty_dev = device_find_child(&uport->port_dev->dev, &match, serial_match_port);
 	if (!uport->suspended && device_may_wakeup(tty_dev)) {
 		if (irqd_is_wakeup_set(irq_get_irq_data((uport->irq))))
 			disable_irq_wake(uport->irq);
@@ -3153,7 +3153,8 @@ static int serial_core_add_one_port(struct uart_driver *drv, struct uart_port *u
 	 * setserial to be used to alter this port's parameters.
 	 */
 	tty_dev = tty_port_register_device_attr_serdev(port, drv->tty_driver,
-			uport->line, uport->dev, port, uport->tty_groups);
+			uport->line, uport->dev, &uport->port_dev->dev, port,
+			uport->tty_groups);
 	if (!IS_ERR(tty_dev)) {
 		device_set_wakeup_capable(tty_dev, 1);
 	} else {
diff --git a/drivers/tty/tty_port.c b/drivers/tty/tty_port.c
index 63c125250961..ef72d2e4b928 100644
--- a/drivers/tty/tty_port.c
+++ b/drivers/tty/tty_port.c
@@ -171,7 +171,8 @@ EXPORT_SYMBOL_GPL(tty_port_register_device_attr);
  * @port: tty_port of the device
  * @driver: tty_driver for this device
  * @index: index of the tty
- * @device: parent if exists, otherwise NULL
+ * @host: serial port hardware device
+ * @parent: parent if exists, otherwise NULL
  * @drvdata: driver data for the device
  * @attr_grp: attribute group for the device
  *
@@ -180,20 +181,20 @@ EXPORT_SYMBOL_GPL(tty_port_register_device_attr);
  */
 struct device *tty_port_register_device_attr_serdev(struct tty_port *port,
 		struct tty_driver *driver, unsigned index,
-		struct device *device, void *drvdata,
+		struct device *host, struct device *parent, void *drvdata,
 		const struct attribute_group **attr_grp)
 {
 	struct device *dev;
 
 	tty_port_link_device(port, driver, index);
 
-	dev = serdev_tty_port_register(port, device, driver, index);
+	dev = serdev_tty_port_register(port, host, parent, driver, index);
 	if (PTR_ERR(dev) != -ENODEV) {
 		/* Skip creating cdev if we registered a serdev device */
 		return dev;
 	}
 
-	return tty_register_device_attr(driver, index, device, drvdata,
+	return tty_register_device_attr(driver, index, parent, drvdata,
 			attr_grp);
 }
 EXPORT_SYMBOL_GPL(tty_port_register_device_attr_serdev);
@@ -203,17 +204,18 @@ EXPORT_SYMBOL_GPL(tty_port_register_device_attr_serdev);
  * @port: tty_port of the device
  * @driver: tty_driver for this device
  * @index: index of the tty
- * @device: parent if exists, otherwise NULL
+ * @host: serial port hardware controller device
+ * @parent: parent if exists, otherwise NULL
  *
  * Register a serdev or tty device depending on if the parent device has any
  * defined serdev clients or not.
  */
 struct device *tty_port_register_device_serdev(struct tty_port *port,
 		struct tty_driver *driver, unsigned index,
-		struct device *device)
+		struct device *host, struct device *parent)
 {
 	return tty_port_register_device_attr_serdev(port, driver, index,
-			device, NULL, NULL);
+			host, parent, NULL, NULL);
 }
 EXPORT_SYMBOL_GPL(tty_port_register_device_serdev);
 
diff --git a/include/linux/serdev.h b/include/linux/serdev.h
index f5f97fa25e8a..0ebf53bb254f 100644
--- a/include/linux/serdev.h
+++ b/include/linux/serdev.h
@@ -99,12 +99,14 @@ struct serdev_controller_ops {
 /**
  * struct serdev_controller - interface to the serdev controller
  * @dev:	Driver model representation of the device.
+ * @host:	Serial port hardware controller device
  * @nr:		number identifier for this controller/bus.
  * @serdev:	Pointer to slave device for this controller.
  * @ops:	Controller operations.
  */
 struct serdev_controller {
 	struct device		dev;
+	struct device		*host;
 	unsigned int		nr;
 	struct serdev_device	*serdev;
 	const struct serdev_controller_ops *ops;
@@ -167,7 +169,9 @@ struct serdev_device *serdev_device_alloc(struct serdev_controller *);
 int serdev_device_add(struct serdev_device *);
 void serdev_device_remove(struct serdev_device *);
 
-struct serdev_controller *serdev_controller_alloc(struct device *, size_t);
+struct serdev_controller *serdev_controller_alloc(struct device *host,
+						  struct device *parent,
+						  size_t size);
 int serdev_controller_add(struct serdev_controller *);
 void serdev_controller_remove(struct serdev_controller *);
 
@@ -311,11 +315,13 @@ struct tty_driver;
 
 #ifdef CONFIG_SERIAL_DEV_CTRL_TTYPORT
 struct device *serdev_tty_port_register(struct tty_port *port,
+					struct device *host,
 					struct device *parent,
 					struct tty_driver *drv, int idx);
 int serdev_tty_port_unregister(struct tty_port *port);
 #else
 static inline struct device *serdev_tty_port_register(struct tty_port *port,
+					   struct device *host,
 					   struct device *parent,
 					   struct tty_driver *drv, int idx)
 {
diff --git a/include/linux/tty_port.h b/include/linux/tty_port.h
index 6b367eb17979..3276311a7f38 100644
--- a/include/linux/tty_port.h
+++ b/include/linux/tty_port.h
@@ -149,10 +149,10 @@ struct device *tty_port_register_device_attr(struct tty_port *port,
 		const struct attribute_group **attr_grp);
 struct device *tty_port_register_device_serdev(struct tty_port *port,
 		struct tty_driver *driver, unsigned index,
-		struct device *device);
+		struct device *host, struct device *parent);
 struct device *tty_port_register_device_attr_serdev(struct tty_port *port,
 		struct tty_driver *driver, unsigned index,
-		struct device *device, void *drvdata,
+		struct device *host, struct device *parent, void *drvdata,
 		const struct attribute_group **attr_grp);
 void tty_port_unregister_device(struct tty_port *port,
 		struct tty_driver *driver, unsigned index);
-- 
cgit v1.2.3


From 358779dd18c1e8531bd6d78c19ed802958d7c677 Mon Sep 17 00:00:00 2001
From: "Jiri Slaby (SUSE)" <jirislaby@kernel.org>
Date: Tue, 21 Nov 2023 10:22:44 +0100
Subject: tty: fix tty_operations types in documentation

Commits 95713967ba52 ("tty: make tty_operations::write()'s count
size_t") and dcaafbe6ee3b ("tty: propagate u8 data to
tty_operations::put_char()") changed types of characters to u8, but
omitted to fix the documentation.

Fix the latter now.

Signed-off-by: Jiri Slaby (SUSE) <jirislaby@kernel.org>
Link: https://lore.kernel.org/r/20231121092258.9334-4-jirislaby@kernel.org
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 include/linux/tty_driver.h | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/tty_driver.h b/include/linux/tty_driver.h
index 18beff0cec1a..f428c1b784a2 100644
--- a/include/linux/tty_driver.h
+++ b/include/linux/tty_driver.h
@@ -72,8 +72,7 @@ struct serial_struct;
  *	is closed for the last time freeing up the resources. This is
  *	actually the second part of shutdown for routines that might sleep.
  *
- * @write: ``ssize_t ()(struct tty_struct *tty, const unsigned char *buf,
- *		    size_t count)``
+ * @write: ``ssize_t ()(struct tty_struct *tty, const u8 *buf, size_t count)``
  *
  *	This routine is called by the kernel to write a series (@count) of
  *	characters (@buf) to the @tty device. The characters may come from
@@ -85,7 +84,7 @@ struct serial_struct;
  *
  *	Optional: Required for writable devices. May not sleep.
  *
- * @put_char: ``int ()(struct tty_struct *tty, unsigned char ch)``
+ * @put_char: ``int ()(struct tty_struct *tty, u8 ch)``
  *
  *	This routine is called by the kernel to write a single character @ch to
  *	the @tty device. If the kernel uses this routine, it must call the
-- 
cgit v1.2.3


From 239123e7e8ec4d35c8591c48f5de44925a88391d Mon Sep 17 00:00:00 2001
From: "Jiri Slaby (SUSE)" <jirislaby@kernel.org>
Date: Tue, 21 Nov 2023 10:22:45 +0100
Subject: tty: move locking docs out of Returns for functions in tty.h
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Both tty_kref_get() and tty_get_baud_rate() note about locking in their
Return kernel-doc clause. Extract this info into a separate "Locking"
paragraph -- the same as we do for other tty functions.

Signed-off-by: "Jiri Slaby (SUSE)" <jirislaby@kernel.org>
Suggested-by: Ilpo Järvinen <ilpo.jarvinen@linux.intel.com>
Reviewed-by: Ilpo Järvinen <ilpo.jarvinen@linux.intel.com>
Link: https://lore.kernel.org/r/20231121092258.9334-5-jirislaby@kernel.org
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 include/linux/tty.h | 12 +++++++-----
 1 file changed, 7 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/tty.h b/include/linux/tty.h
index 4b6340ac2af2..7625fc98fef3 100644
--- a/include/linux/tty.h
+++ b/include/linux/tty.h
@@ -393,8 +393,10 @@ extern const struct class tty_class;
  * tty_kref_get - get a tty reference
  * @tty: tty device
  *
- * Returns: a new reference to a tty object. The caller must hold sufficient
- * locks/counts to ensure that their existing reference cannot go away
+ * Returns: a new reference to a tty object
+ *
+ * Locking: The caller must hold sufficient locks/counts to ensure that their
+ * existing reference cannot go away.
  */
 static inline struct tty_struct *tty_kref_get(struct tty_struct *tty)
 {
@@ -436,10 +438,10 @@ void tty_encode_baud_rate(struct tty_struct *tty, speed_t ibaud,
  * tty_get_baud_rate - get tty bit rates
  * @tty: tty to query
  *
- * Returns: the baud rate as an integer for this terminal. The termios lock
- * must be held by the caller and the terminal bit flags may be updated.
+ * Returns: the baud rate as an integer for this terminal
  *
- * Locking: none
+ * Locking: The termios lock must be held by the caller and the terminal bit
+ * flags may be updated.
  */
 static inline speed_t tty_get_baud_rate(struct tty_struct *tty)
 {
-- 
cgit v1.2.3


From da549bdd15c295c24b2ee7ffe7ad0f3877fa8a87 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Tue, 7 Nov 2023 02:00:39 -0500
Subject: dentry: switch the lists of children to hlist

Saves a pointer per struct dentry and actually makes the things less
clumsy.  Cleaned the d_walk() and dcache_readdir() a bit by use
of hlist_for_... iterators.

A couple of new helpers - d_first_child() and d_next_sibling(),
to make the expressions less awful.

Reviewed-by: Christian Brauner <brauner@kernel.org>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 Documentation/filesystems/porting.rst     |  9 ++++
 arch/powerpc/platforms/cell/spufs/inode.c |  5 +-
 fs/afs/dynroot.c                          |  5 +-
 fs/autofs/expire.c                        |  7 +--
 fs/ceph/dir.c                             |  2 +-
 fs/ceph/mds_client.c                      |  2 +-
 fs/coda/cache.c                           |  2 +-
 fs/dcache.c                               | 76 +++++++++++++++----------------
 fs/libfs.c                                | 45 +++++++++---------
 fs/notify/fsnotify.c                      |  2 +-
 fs/tracefs/inode.c                        | 34 ++++++--------
 include/linux/dcache.h                    | 20 ++++++--
 12 files changed, 108 insertions(+), 101 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/filesystems/porting.rst b/Documentation/filesystems/porting.rst
index 878e72b2f8b7..331405f4b29f 100644
--- a/Documentation/filesystems/porting.rst
+++ b/Documentation/filesystems/porting.rst
@@ -1061,3 +1061,12 @@ export_operations ->encode_fh() no longer has a default implementation to
 encode FILEID_INO32_GEN* file handles.
 Filesystems that used the default implementation may use the generic helper
 generic_encode_ino32_fh() explicitly.
+
+---
+
+**mandatory**
+
+The list of children anchored in parent dentry got turned into hlist now.
+Field names got changed (->d_children/->d_sib instead of ->d_subdirs/->d_child
+for anchor/entries resp.), so any affected places will be immediately caught
+by compiler.
diff --git a/arch/powerpc/platforms/cell/spufs/inode.c b/arch/powerpc/platforms/cell/spufs/inode.c
index 10c1320adfd0..030de2b8c145 100644
--- a/arch/powerpc/platforms/cell/spufs/inode.c
+++ b/arch/powerpc/platforms/cell/spufs/inode.c
@@ -145,10 +145,11 @@ spufs_evict_inode(struct inode *inode)
 
 static void spufs_prune_dir(struct dentry *dir)
 {
-	struct dentry *dentry, *tmp;
+	struct dentry *dentry;
+	struct hlist_node *n;
 
 	inode_lock(d_inode(dir));
-	list_for_each_entry_safe(dentry, tmp, &dir->d_subdirs, d_child) {
+	hlist_for_each_entry_safe(dentry, n, &dir->d_children, d_sib) {
 		spin_lock(&dentry->d_lock);
 		if (simple_positive(dentry)) {
 			dget_dlock(dentry);
diff --git a/fs/afs/dynroot.c b/fs/afs/dynroot.c
index 4d04ef2d3ae7..fe45462834cc 100644
--- a/fs/afs/dynroot.c
+++ b/fs/afs/dynroot.c
@@ -370,7 +370,7 @@ error:
 void afs_dynroot_depopulate(struct super_block *sb)
 {
 	struct afs_net *net = afs_sb2net(sb);
-	struct dentry *root = sb->s_root, *subdir, *tmp;
+	struct dentry *root = sb->s_root, *subdir;
 
 	/* Prevent more subdirs from being created */
 	mutex_lock(&net->proc_cells_lock);
@@ -379,10 +379,11 @@ void afs_dynroot_depopulate(struct super_block *sb)
 	mutex_unlock(&net->proc_cells_lock);
 
 	if (root) {
+		struct hlist_node *n;
 		inode_lock(root->d_inode);
 
 		/* Remove all the pins for dirs created for manually added cells */
-		list_for_each_entry_safe(subdir, tmp, &root->d_subdirs, d_child) {
+		hlist_for_each_entry_safe(subdir, n, &root->d_children, d_sib) {
 			if (subdir->d_fsdata) {
 				subdir->d_fsdata = NULL;
 				dput(subdir);
diff --git a/fs/autofs/expire.c b/fs/autofs/expire.c
index 038b3d2d9f57..39d8c84c16f4 100644
--- a/fs/autofs/expire.c
+++ b/fs/autofs/expire.c
@@ -73,12 +73,9 @@ done:
 /* p->d_lock held */
 static struct dentry *positive_after(struct dentry *p, struct dentry *child)
 {
-	if (child)
-		child = list_next_entry(child, d_child);
-	else
-		child = list_first_entry(&p->d_subdirs, struct dentry, d_child);
+	child = child ? d_next_sibling(child) : d_first_child(p);
 
-	list_for_each_entry_from(child, &p->d_subdirs, d_child) {
+	hlist_for_each_entry_from(child, d_sib) {
 		spin_lock_nested(&child->d_lock, DENTRY_D_LOCK_NESTED);
 		if (simple_positive(child)) {
 			dget_dlock(child);
diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c
index 91709934c8b1..678596684596 100644
--- a/fs/ceph/dir.c
+++ b/fs/ceph/dir.c
@@ -174,7 +174,7 @@ __dcache_find_get_entry(struct dentry *parent, u64 idx,
 /*
  * When possible, we try to satisfy a readdir by peeking at the
  * dcache.  We make this work by carefully ordering dentries on
- * d_child when we initially get results back from the MDS, and
+ * d_children when we initially get results back from the MDS, and
  * falling back to a "normal" sync readdir if any dentries in the dir
  * are dropped.
  *
diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index d95eb525519a..02ebfabfc8ee 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -2128,7 +2128,7 @@ static bool drop_negative_children(struct dentry *dentry)
 		goto out;
 
 	spin_lock(&dentry->d_lock);
-	list_for_each_entry(child, &dentry->d_subdirs, d_child) {
+	hlist_for_each_entry(child, &dentry->d_children, d_sib) {
 		if (d_really_is_positive(child)) {
 			all_negative = false;
 			break;
diff --git a/fs/coda/cache.c b/fs/coda/cache.c
index d6254cf9f397..970f0022ec52 100644
--- a/fs/coda/cache.c
+++ b/fs/coda/cache.c
@@ -93,7 +93,7 @@ static void coda_flag_children(struct dentry *parent, int flag)
 	struct dentry *de;
 
 	spin_lock(&parent->d_lock);
-	list_for_each_entry(de, &parent->d_subdirs, d_child) {
+	hlist_for_each_entry(de, &parent->d_children, d_sib) {
 		struct inode *inode = d_inode_rcu(de);
 		/* don't know what to do with negative dentries */
 		if (inode)
diff --git a/fs/dcache.c b/fs/dcache.c
index c82ae731df9a..59f76c9a15d1 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -51,8 +51,8 @@
  *   - d_lru
  *   - d_count
  *   - d_unhashed()
- *   - d_parent and d_subdirs
- *   - childrens' d_child and d_parent
+ *   - d_parent and d_chilren
+ *   - childrens' d_sib and d_parent
  *   - d_u.d_alias, d_inode
  *
  * Ordering:
@@ -537,7 +537,7 @@ void d_drop(struct dentry *dentry)
 }
 EXPORT_SYMBOL(d_drop);
 
-static inline void dentry_unlist(struct dentry *dentry, struct dentry *parent)
+static inline void dentry_unlist(struct dentry *dentry)
 {
 	struct dentry *next;
 	/*
@@ -545,12 +545,12 @@ static inline void dentry_unlist(struct dentry *dentry, struct dentry *parent)
 	 * attached to the dentry tree
 	 */
 	dentry->d_flags |= DCACHE_DENTRY_KILLED;
-	if (unlikely(list_empty(&dentry->d_child)))
+	if (unlikely(hlist_unhashed(&dentry->d_sib)))
 		return;
-	__list_del_entry(&dentry->d_child);
+	__hlist_del(&dentry->d_sib);
 	/*
 	 * Cursors can move around the list of children.  While we'd been
-	 * a normal list member, it didn't matter - ->d_child.next would've
+	 * a normal list member, it didn't matter - ->d_sib.next would've
 	 * been updated.  However, from now on it won't be and for the
 	 * things like d_walk() it might end up with a nasty surprise.
 	 * Normally d_walk() doesn't care about cursors moving around -
@@ -558,20 +558,20 @@ static inline void dentry_unlist(struct dentry *dentry, struct dentry *parent)
 	 * of its own, we get through it without ever unlocking the parent.
 	 * There is one exception, though - if we ascend from a child that
 	 * gets killed as soon as we unlock it, the next sibling is found
-	 * using the value left in its ->d_child.next.  And if _that_
+	 * using the value left in its ->d_sib.next.  And if _that_
 	 * pointed to a cursor, and cursor got moved (e.g. by lseek())
 	 * before d_walk() regains parent->d_lock, we'll end up skipping
 	 * everything the cursor had been moved past.
 	 *
-	 * Solution: make sure that the pointer left behind in ->d_child.next
+	 * Solution: make sure that the pointer left behind in ->d_sib.next
 	 * points to something that won't be moving around.  I.e. skip the
 	 * cursors.
 	 */
-	while (dentry->d_child.next != &parent->d_subdirs) {
-		next = list_entry(dentry->d_child.next, struct dentry, d_child);
+	while (dentry->d_sib.next) {
+		next = hlist_entry(dentry->d_sib.next, struct dentry, d_sib);
 		if (likely(!(next->d_flags & DCACHE_DENTRY_CURSOR)))
 			break;
-		dentry->d_child.next = next->d_child.next;
+		dentry->d_sib.next = next->d_sib.next;
 	}
 }
 
@@ -600,7 +600,7 @@ static void __dentry_kill(struct dentry *dentry)
 	}
 	/* if it was on the hash then remove it */
 	__d_drop(dentry);
-	dentry_unlist(dentry, parent);
+	dentry_unlist(dentry);
 	if (parent)
 		spin_unlock(&parent->d_lock);
 	if (dentry->d_inode)
@@ -1348,8 +1348,7 @@ enum d_walk_ret {
 static void d_walk(struct dentry *parent, void *data,
 		   enum d_walk_ret (*enter)(void *, struct dentry *))
 {
-	struct dentry *this_parent;
-	struct list_head *next;
+	struct dentry *this_parent, *dentry;
 	unsigned seq = 0;
 	enum d_walk_ret ret;
 	bool retry = true;
@@ -1371,13 +1370,9 @@ again:
 		break;
 	}
 repeat:
-	next = this_parent->d_subdirs.next;
+	dentry = d_first_child(this_parent);
 resume:
-	while (next != &this_parent->d_subdirs) {
-		struct list_head *tmp = next;
-		struct dentry *dentry = list_entry(tmp, struct dentry, d_child);
-		next = tmp->next;
-
+	hlist_for_each_entry_from(dentry, d_sib) {
 		if (unlikely(dentry->d_flags & DCACHE_DENTRY_CURSOR))
 			continue;
 
@@ -1398,7 +1393,7 @@ resume:
 			continue;
 		}
 
-		if (!list_empty(&dentry->d_subdirs)) {
+		if (!hlist_empty(&dentry->d_children)) {
 			spin_unlock(&this_parent->d_lock);
 			spin_release(&dentry->d_lock.dep_map, _RET_IP_);
 			this_parent = dentry;
@@ -1413,24 +1408,23 @@ resume:
 	rcu_read_lock();
 ascend:
 	if (this_parent != parent) {
-		struct dentry *child = this_parent;
-		this_parent = child->d_parent;
+		dentry = this_parent;
+		this_parent = dentry->d_parent;
 
-		spin_unlock(&child->d_lock);
+		spin_unlock(&dentry->d_lock);
 		spin_lock(&this_parent->d_lock);
 
 		/* might go back up the wrong parent if we have had a rename. */
 		if (need_seqretry(&rename_lock, seq))
 			goto rename_retry;
 		/* go into the first sibling still alive */
-		do {
-			next = child->d_child.next;
-			if (next == &this_parent->d_subdirs)
-				goto ascend;
-			child = list_entry(next, struct dentry, d_child);
-		} while (unlikely(child->d_flags & DCACHE_DENTRY_KILLED));
-		rcu_read_unlock();
-		goto resume;
+		hlist_for_each_entry_continue(dentry, d_sib) {
+			if (likely(!(dentry->d_flags & DCACHE_DENTRY_KILLED))) {
+				rcu_read_unlock();
+				goto resume;
+			}
+		}
+		goto ascend;
 	}
 	if (need_seqretry(&rename_lock, seq))
 		goto rename_retry;
@@ -1530,7 +1524,7 @@ out:
  * Search the dentry child list of the specified parent,
  * and move any unused dentries to the end of the unused
  * list for prune_dcache(). We descend to the next level
- * whenever the d_subdirs list is non-empty and continue
+ * whenever the d_children list is non-empty and continue
  * searching.
  *
  * It returns zero iff there are no unused children,
@@ -1657,7 +1651,7 @@ EXPORT_SYMBOL(shrink_dcache_parent);
 static enum d_walk_ret umount_check(void *_data, struct dentry *dentry)
 {
 	/* it has busy descendents; complain about those instead */
-	if (!list_empty(&dentry->d_subdirs))
+	if (!hlist_empty(&dentry->d_children))
 		return D_WALK_CONTINUE;
 
 	/* root with refcount 1 is fine */
@@ -1814,9 +1808,9 @@ static struct dentry *__d_alloc(struct super_block *sb, const struct qstr *name)
 	dentry->d_fsdata = NULL;
 	INIT_HLIST_BL_NODE(&dentry->d_hash);
 	INIT_LIST_HEAD(&dentry->d_lru);
-	INIT_LIST_HEAD(&dentry->d_subdirs);
+	INIT_HLIST_HEAD(&dentry->d_children);
 	INIT_HLIST_NODE(&dentry->d_u.d_alias);
-	INIT_LIST_HEAD(&dentry->d_child);
+	INIT_HLIST_NODE(&dentry->d_sib);
 	d_set_d_op(dentry, dentry->d_sb->s_d_op);
 
 	if (dentry->d_op && dentry->d_op->d_init) {
@@ -1855,7 +1849,7 @@ struct dentry *d_alloc(struct dentry * parent, const struct qstr *name)
 	 */
 	__dget_dlock(parent);
 	dentry->d_parent = parent;
-	list_add(&dentry->d_child, &parent->d_subdirs);
+	hlist_add_head(&dentry->d_sib, &parent->d_children);
 	spin_unlock(&parent->d_lock);
 
 	return dentry;
@@ -2993,11 +2987,15 @@ static void __d_move(struct dentry *dentry, struct dentry *target,
 	} else {
 		target->d_parent = old_parent;
 		swap_names(dentry, target);
-		list_move(&target->d_child, &target->d_parent->d_subdirs);
+		if (!hlist_unhashed(&target->d_sib))
+			__hlist_del(&target->d_sib);
+		hlist_add_head(&target->d_sib, &target->d_parent->d_children);
 		__d_rehash(target);
 		fsnotify_update_flags(target);
 	}
-	list_move(&dentry->d_child, &dentry->d_parent->d_subdirs);
+	if (!hlist_unhashed(&dentry->d_sib))
+		__hlist_del(&dentry->d_sib);
+	hlist_add_head(&dentry->d_sib, &dentry->d_parent->d_children);
 	__d_rehash(dentry);
 	fsnotify_update_flags(dentry);
 	fscrypt_handle_d_move(dentry);
diff --git a/fs/libfs.c b/fs/libfs.c
index e9440d55073c..46c9177769c1 100644
--- a/fs/libfs.c
+++ b/fs/libfs.c
@@ -104,15 +104,16 @@ EXPORT_SYMBOL(dcache_dir_close);
  * If no such element exists, NULL is returned.
  */
 static struct dentry *scan_positives(struct dentry *cursor,
-					struct list_head *p,
+					struct hlist_node **p,
 					loff_t count,
 					struct dentry *last)
 {
 	struct dentry *dentry = cursor->d_parent, *found = NULL;
 
 	spin_lock(&dentry->d_lock);
-	while ((p = p->next) != &dentry->d_subdirs) {
-		struct dentry *d = list_entry(p, struct dentry, d_child);
+	while (*p) {
+		struct dentry *d = hlist_entry(*p, struct dentry, d_sib);
+		p = &d->d_sib.next;
 		// we must at least skip cursors, to avoid livelocks
 		if (d->d_flags & DCACHE_DENTRY_CURSOR)
 			continue;
@@ -126,8 +127,10 @@ static struct dentry *scan_positives(struct dentry *cursor,
 			count = 1;
 		}
 		if (need_resched()) {
-			list_move(&cursor->d_child, p);
-			p = &cursor->d_child;
+			if (!hlist_unhashed(&cursor->d_sib))
+				__hlist_del(&cursor->d_sib);
+			hlist_add_behind(&cursor->d_sib, &d->d_sib);
+			p = &cursor->d_sib.next;
 			spin_unlock(&dentry->d_lock);
 			cond_resched();
 			spin_lock(&dentry->d_lock);
@@ -159,13 +162,12 @@ loff_t dcache_dir_lseek(struct file *file, loff_t offset, int whence)
 		inode_lock_shared(dentry->d_inode);
 
 		if (offset > 2)
-			to = scan_positives(cursor, &dentry->d_subdirs,
+			to = scan_positives(cursor, &dentry->d_children.first,
 					    offset - 2, NULL);
 		spin_lock(&dentry->d_lock);
+		hlist_del_init(&cursor->d_sib);
 		if (to)
-			list_move(&cursor->d_child, &to->d_child);
-		else
-			list_del_init(&cursor->d_child);
+			hlist_add_behind(&cursor->d_sib, &to->d_sib);
 		spin_unlock(&dentry->d_lock);
 		dput(to);
 
@@ -187,19 +189,16 @@ int dcache_readdir(struct file *file, struct dir_context *ctx)
 {
 	struct dentry *dentry = file->f_path.dentry;
 	struct dentry *cursor = file->private_data;
-	struct list_head *anchor = &dentry->d_subdirs;
 	struct dentry *next = NULL;
-	struct list_head *p;
+	struct hlist_node **p;
 
 	if (!dir_emit_dots(file, ctx))
 		return 0;
 
 	if (ctx->pos == 2)
-		p = anchor;
-	else if (!list_empty(&cursor->d_child))
-		p = &cursor->d_child;
+		p = &dentry->d_children.first;
 	else
-		return 0;
+		p = &cursor->d_sib.next;
 
 	while ((next = scan_positives(cursor, p, 1, next)) != NULL) {
 		if (!dir_emit(ctx, next->d_name.name, next->d_name.len,
@@ -207,13 +206,12 @@ int dcache_readdir(struct file *file, struct dir_context *ctx)
 			      fs_umode_to_dtype(d_inode(next)->i_mode)))
 			break;
 		ctx->pos++;
-		p = &next->d_child;
+		p = &next->d_sib.next;
 	}
 	spin_lock(&dentry->d_lock);
+	hlist_del_init(&cursor->d_sib);
 	if (next)
-		list_move_tail(&cursor->d_child, &next->d_child);
-	else
-		list_del_init(&cursor->d_child);
+		hlist_add_before(&cursor->d_sib, &next->d_sib);
 	spin_unlock(&dentry->d_lock);
 	dput(next);
 
@@ -492,12 +490,11 @@ const struct file_operations simple_offset_dir_operations = {
 
 static struct dentry *find_next_child(struct dentry *parent, struct dentry *prev)
 {
-	struct dentry *child = NULL;
-	struct list_head *p = prev ? &prev->d_child : &parent->d_subdirs;
+	struct dentry *child = NULL, *d;
 
 	spin_lock(&parent->d_lock);
-	while ((p = p->next) != &parent->d_subdirs) {
-		struct dentry *d = container_of(p, struct dentry, d_child);
+	d = prev ? d_next_sibling(prev) : d_first_child(parent);
+	hlist_for_each_entry_from(d, d_sib) {
 		if (simple_positive(d)) {
 			spin_lock_nested(&d->d_lock, DENTRY_D_LOCK_NESTED);
 			if (simple_positive(d))
@@ -658,7 +655,7 @@ int simple_empty(struct dentry *dentry)
 	int ret = 0;
 
 	spin_lock(&dentry->d_lock);
-	list_for_each_entry(child, &dentry->d_subdirs, d_child) {
+	hlist_for_each_entry(child, &dentry->d_children, d_sib) {
 		spin_lock_nested(&child->d_lock, DENTRY_D_LOCK_NESTED);
 		if (simple_positive(child)) {
 			spin_unlock(&child->d_lock);
diff --git a/fs/notify/fsnotify.c b/fs/notify/fsnotify.c
index 7974e91ffe13..8bfd690e9f10 100644
--- a/fs/notify/fsnotify.c
+++ b/fs/notify/fsnotify.c
@@ -124,7 +124,7 @@ void __fsnotify_update_child_dentry_flags(struct inode *inode)
 		 * d_flags to indicate parental interest (their parent is the
 		 * original inode) */
 		spin_lock(&alias->d_lock);
-		list_for_each_entry(child, &alias->d_subdirs, d_child) {
+		hlist_for_each_entry(child, &alias->d_children, d_sib) {
 			if (!child->d_inode)
 				continue;
 
diff --git a/fs/tracefs/inode.c b/fs/tracefs/inode.c
index 5b54948514fe..61ca5fcf10f9 100644
--- a/fs/tracefs/inode.c
+++ b/fs/tracefs/inode.c
@@ -199,26 +199,21 @@ static void change_gid(struct dentry *dentry, kgid_t gid)
  */
 static void set_gid(struct dentry *parent, kgid_t gid)
 {
-	struct dentry *this_parent;
-	struct list_head *next;
+	struct dentry *this_parent, *dentry;
 
 	this_parent = parent;
 	spin_lock(&this_parent->d_lock);
 
 	change_gid(this_parent, gid);
 repeat:
-	next = this_parent->d_subdirs.next;
+	dentry = d_first_child(this_parent);
 resume:
-	while (next != &this_parent->d_subdirs) {
-		struct list_head *tmp = next;
-		struct dentry *dentry = list_entry(tmp, struct dentry, d_child);
-		next = tmp->next;
-
+	hlist_for_each_entry_from(dentry, d_sib) {
 		spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED);
 
 		change_gid(dentry, gid);
 
-		if (!list_empty(&dentry->d_subdirs)) {
+		if (!hlist_empty(&dentry->d_children)) {
 			spin_unlock(&this_parent->d_lock);
 			spin_release(&dentry->d_lock.dep_map, _RET_IP_);
 			this_parent = dentry;
@@ -233,21 +228,20 @@ resume:
 	rcu_read_lock();
 ascend:
 	if (this_parent != parent) {
-		struct dentry *child = this_parent;
-		this_parent = child->d_parent;
+		dentry = this_parent;
+		this_parent = dentry->d_parent;
 
-		spin_unlock(&child->d_lock);
+		spin_unlock(&dentry->d_lock);
 		spin_lock(&this_parent->d_lock);
 
 		/* go into the first sibling still alive */
-		do {
-			next = child->d_child.next;
-			if (next == &this_parent->d_subdirs)
-				goto ascend;
-			child = list_entry(next, struct dentry, d_child);
-		} while (unlikely(child->d_flags & DCACHE_DENTRY_KILLED));
-		rcu_read_unlock();
-		goto resume;
+		hlist_for_each_entry_continue(dentry, d_sib) {
+			if (likely(!(dentry->d_flags & DCACHE_DENTRY_KILLED))) {
+				rcu_read_unlock();
+				goto resume;
+			}
+		}
+		goto ascend;
 	}
 	rcu_read_unlock();
 	spin_unlock(&this_parent->d_lock);
diff --git a/include/linux/dcache.h b/include/linux/dcache.h
index 3da2f0545d5d..0e397a0c519c 100644
--- a/include/linux/dcache.h
+++ b/include/linux/dcache.h
@@ -68,12 +68,12 @@ extern const struct qstr dotdot_name;
  * large memory footprint increase).
  */
 #ifdef CONFIG_64BIT
-# define DNAME_INLINE_LEN 32 /* 192 bytes */
+# define DNAME_INLINE_LEN 40 /* 192 bytes */
 #else
 # ifdef CONFIG_SMP
-#  define DNAME_INLINE_LEN 36 /* 128 bytes */
-# else
 #  define DNAME_INLINE_LEN 40 /* 128 bytes */
+# else
+#  define DNAME_INLINE_LEN 44 /* 128 bytes */
 # endif
 #endif
 
@@ -101,8 +101,8 @@ struct dentry {
 		struct list_head d_lru;		/* LRU list */
 		wait_queue_head_t *d_wait;	/* in-lookup ones only */
 	};
-	struct list_head d_child;	/* child of parent list */
-	struct list_head d_subdirs;	/* our children */
+	struct hlist_node d_sib;	/* child of parent list */
+	struct hlist_head d_children;	/* our children */
 	/*
 	 * d_alias and d_rcu can share memory
 	 */
@@ -600,4 +600,14 @@ struct name_snapshot {
 void take_dentry_name_snapshot(struct name_snapshot *, struct dentry *);
 void release_dentry_name_snapshot(struct name_snapshot *);
 
+static inline struct dentry *d_first_child(const struct dentry *dentry)
+{
+	return hlist_entry_safe(dentry->d_children.first, struct dentry, d_sib);
+}
+
+static inline struct dentry *d_next_sibling(const struct dentry *dentry)
+{
+	return hlist_entry_safe(dentry->d_sib.next, struct dentry, d_sib);
+}
+
 #endif	/* __LINUX_DCACHE_H */
-- 
cgit v1.2.3


From f2824db1b49f947ba6e208ddf02edf4b1391480a Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Sat, 18 Nov 2023 16:42:43 -0500
Subject: kill d_instantate_anon(), fold __d_instantiate_anon() into remaining
 caller

now that the only user of d_instantiate_anon() is gone...
[braino fix folded - kudos to Dan Carpenter]

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/dcache.c            | 90 ++++++++++++++++++++------------------------------
 include/linux/dcache.h |  1 -
 2 files changed, 35 insertions(+), 56 deletions(-)

(limited to 'include/linux')

diff --git a/fs/dcache.c b/fs/dcache.c
index 9f5b2b5c1e6d..6c520eda131f 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -2054,75 +2054,55 @@ struct dentry *d_make_root(struct inode *root_inode)
 }
 EXPORT_SYMBOL(d_make_root);
 
-static struct dentry *__d_instantiate_anon(struct dentry *dentry,
-					   struct inode *inode,
-					   bool disconnected)
-{
-	struct dentry *res;
-	unsigned add_flags;
-
-	security_d_instantiate(dentry, inode);
-	spin_lock(&inode->i_lock);
-	res = __d_find_any_alias(inode);
-	if (res) {
-		spin_unlock(&inode->i_lock);
-		dput(dentry);
-		goto out_iput;
-	}
-
-	/* attach a disconnected dentry */
-	add_flags = d_flags_for_inode(inode);
-
-	if (disconnected)
-		add_flags |= DCACHE_DISCONNECTED;
-
-	spin_lock(&dentry->d_lock);
-	__d_set_inode_and_type(dentry, inode, add_flags);
-	hlist_add_head(&dentry->d_u.d_alias, &inode->i_dentry);
-	if (!disconnected) {
-		hlist_bl_lock(&dentry->d_sb->s_roots);
-		hlist_bl_add_head(&dentry->d_hash, &dentry->d_sb->s_roots);
-		hlist_bl_unlock(&dentry->d_sb->s_roots);
-	}
-	spin_unlock(&dentry->d_lock);
-	spin_unlock(&inode->i_lock);
-
-	return dentry;
-
- out_iput:
-	iput(inode);
-	return res;
-}
-
-struct dentry *d_instantiate_anon(struct dentry *dentry, struct inode *inode)
-{
-	return __d_instantiate_anon(dentry, inode, true);
-}
-EXPORT_SYMBOL(d_instantiate_anon);
-
 static struct dentry *__d_obtain_alias(struct inode *inode, bool disconnected)
 {
-	struct dentry *tmp;
-	struct dentry *res;
+	struct super_block *sb;
+	struct dentry *new, *res;
 
 	if (!inode)
 		return ERR_PTR(-ESTALE);
 	if (IS_ERR(inode))
 		return ERR_CAST(inode);
 
-	res = d_find_any_alias(inode);
+	sb = inode->i_sb;
+
+	res = d_find_any_alias(inode); /* existing alias? */
 	if (res)
-		goto out_iput;
+		goto out;
 
-	tmp = d_alloc_anon(inode->i_sb);
-	if (!tmp) {
+	new = d_alloc_anon(sb);
+	if (!new) {
 		res = ERR_PTR(-ENOMEM);
-		goto out_iput;
+		goto out;
 	}
 
-	return __d_instantiate_anon(tmp, inode, disconnected);
+	security_d_instantiate(new, inode);
+	spin_lock(&inode->i_lock);
+	res = __d_find_any_alias(inode); /* recheck under lock */
+	if (likely(!res)) { /* still no alias, attach a disconnected dentry */
+		unsigned add_flags = d_flags_for_inode(inode);
+
+		if (disconnected)
+			add_flags |= DCACHE_DISCONNECTED;
+
+		spin_lock(&new->d_lock);
+		__d_set_inode_and_type(new, inode, add_flags);
+		hlist_add_head(&new->d_u.d_alias, &inode->i_dentry);
+		if (!disconnected) {
+			hlist_bl_lock(&sb->s_roots);
+			hlist_bl_add_head(&new->d_hash, &sb->s_roots);
+			hlist_bl_unlock(&sb->s_roots);
+		}
+		spin_unlock(&new->d_lock);
+		spin_unlock(&inode->i_lock);
+		inode = NULL; /* consumed by new->d_inode */
+		res = new;
+	} else {
+		spin_unlock(&inode->i_lock);
+		dput(new);
+	}
 
-out_iput:
+ out:
 	iput(inode);
 	return res;
 }
diff --git a/include/linux/dcache.h b/include/linux/dcache.h
index fa0414cff85c..8c5e3bdf1147 100644
--- a/include/linux/dcache.h
+++ b/include/linux/dcache.h
@@ -218,7 +218,6 @@ extern seqlock_t rename_lock;
  */
 extern void d_instantiate(struct dentry *, struct inode *);
 extern void d_instantiate_new(struct dentry *, struct inode *);
-extern struct dentry * d_instantiate_anon(struct dentry *, struct inode *);
 extern void __d_drop(struct dentry *dentry);
 extern void d_drop(struct dentry *dentry);
 extern void d_delete(struct dentry *);
-- 
cgit v1.2.3


From 8a54b38f3e5ced6cc4b246b8e54bd0f50deceaa8 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Sat, 11 Nov 2023 16:01:27 -0500
Subject: d_genocide(): move the extern into fs/internal.h

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/internal.h          | 1 +
 include/linux/dcache.h | 3 ---
 2 files changed, 1 insertion(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/fs/internal.h b/fs/internal.h
index 9e9fc629f935..d9a920e2636e 100644
--- a/fs/internal.h
+++ b/fs/internal.h
@@ -219,6 +219,7 @@ extern void shrink_dcache_for_umount(struct super_block *);
 extern struct dentry *__d_lookup(const struct dentry *, const struct qstr *);
 extern struct dentry *__d_lookup_rcu(const struct dentry *parent,
 				const struct qstr *name, unsigned *seq);
+extern void d_genocide(struct dentry *);
 
 /*
  * pipe.c
diff --git a/include/linux/dcache.h b/include/linux/dcache.h
index 8c5e3bdf1147..b4324d47f249 100644
--- a/include/linux/dcache.h
+++ b/include/linux/dcache.h
@@ -243,9 +243,6 @@ extern void d_invalidate(struct dentry *);
 /* only used at mount-time */
 extern struct dentry * d_make_root(struct inode *);
 
-/* <clickety>-<click> the ramfs-type tree */
-extern void d_genocide(struct dentry *);
-
 extern void d_mark_tmpfile(struct file *, struct inode *);
 extern void d_tmpfile(struct file *, struct inode *);
 
-- 
cgit v1.2.3


From 57851607326a2beef21e67f83f4f53a90df8445a Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Sun, 12 Nov 2023 21:38:48 -0500
Subject: get rid of DCACHE_GENOCIDE

... now that we never call d_genocide() other than from kill_litter_super()

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/dcache.c            | 5 +----
 include/linux/dcache.h | 1 -
 2 files changed, 1 insertion(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/fs/dcache.c b/fs/dcache.c
index 5947556b6e90..8473c8f0ce22 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -3198,10 +3198,7 @@ static enum d_walk_ret d_genocide_kill(void *data, struct dentry *dentry)
 		if (d_unhashed(dentry) || !dentry->d_inode)
 			return D_WALK_SKIP;
 
-		if (!(dentry->d_flags & DCACHE_GENOCIDE)) {
-			dentry->d_flags |= DCACHE_GENOCIDE;
-			dentry->d_lockref.count--;
-		}
+		dentry->d_lockref.count--;
 	}
 	return D_WALK_CONTINUE;
 }
diff --git a/include/linux/dcache.h b/include/linux/dcache.h
index b4324d47f249..981f529c6cb5 100644
--- a/include/linux/dcache.h
+++ b/include/linux/dcache.h
@@ -173,7 +173,6 @@ struct dentry_operations {
 #define DCACHE_DONTCACHE		BIT(7) /* Purge from memory on final dput() */
 
 #define DCACHE_CANT_MOUNT		BIT(8)
-#define DCACHE_GENOCIDE			BIT(9)
 #define DCACHE_SHRINK_LIST		BIT(10)
 
 #define DCACHE_OP_WEAK_REVALIDATE	BIT(11)
-- 
cgit v1.2.3


From 1b327b5ac57cf83e3d015de45d0142852f475375 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Fri, 10 Nov 2023 14:07:43 -0500
Subject: kill DCACHE_MAY_FREE

With the new ordering in __dentry_kill() it has become redundant -
it's set if and only if both DCACHE_DENTRY_KILLED and DCACHE_SHRINK_LIST
are set.

We set it in __dentry_kill(), after having set DCACHE_DENTRY_KILLED
with the only condition being that DCACHE_SHRINK_LIST is there;
all of that is done without dropping ->d_lock and the only place
that checks that flag (shrink_dentry_list()) does so under ->d_lock,
after having found the victim on its shrink list.  Since DCACHE_SHRINK_LIST
is set only when placing dentry into shrink list and removed only by
shrink_dentry_list() itself, a check for DCACHE_DENTRY_KILLED in
there would be equivalent to check for DCACHE_MAY_FREE.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/dcache.c            | 6 ++----
 include/linux/dcache.h | 1 -
 2 files changed, 2 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/fs/dcache.c b/fs/dcache.c
index 475ef1edba03..523cbf0780f0 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -614,10 +614,8 @@ static struct dentry *__dentry_kill(struct dentry *dentry)
 	}
 	spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED);
 	dentry_unlist(dentry);
-	if (dentry->d_flags & DCACHE_SHRINK_LIST) {
-		dentry->d_flags |= DCACHE_MAY_FREE;
+	if (dentry->d_flags & DCACHE_SHRINK_LIST)
 		can_free = false;
-	}
 	spin_unlock(&dentry->d_lock);
 	if (likely(can_free))
 		dentry_free(dentry);
@@ -1072,7 +1070,7 @@ void shrink_dentry_list(struct list_head *list)
 			bool can_free;
 			rcu_read_unlock();
 			d_shrink_del(dentry);
-			can_free = dentry->d_flags & DCACHE_MAY_FREE;
+			can_free = dentry->d_flags & DCACHE_DENTRY_KILLED;
 			spin_unlock(&dentry->d_lock);
 			if (can_free)
 				dentry_free(dentry);
diff --git a/include/linux/dcache.h b/include/linux/dcache.h
index b4449a1a47ff..48b393545ec2 100644
--- a/include/linux/dcache.h
+++ b/include/linux/dcache.h
@@ -202,7 +202,6 @@ struct dentry_operations {
 #define DCACHE_SPECIAL_TYPE		(5 << 20) /* Other file type */
 #define DCACHE_SYMLINK_TYPE		(6 << 20) /* Symlink */
 
-#define DCACHE_MAY_FREE			BIT(23)
 #define DCACHE_NOKEY_NAME		BIT(25) /* Encrypted name encoded without key */
 #define DCACHE_OP_REAL			BIT(26)
 
-- 
cgit v1.2.3


From 1b6ae9f6e6c3e3c35aad0f11b116a81780b8aa03 Mon Sep 17 00:00:00 2001
From: Vegard Nossum <vegard.nossum@oracle.com>
Date: Mon, 6 Nov 2023 14:44:17 +0100
Subject: dcache: remove unnecessary NULL check in dget_dlock()

dget_dlock() requires dentry->d_lock to be held when called, yet
contains a NULL check for dentry.

An audit of all calls to dget_dlock() shows that it is never called
with a NULL pointer (as spin_lock()/spin_unlock() would crash in these
cases):

  $ git grep -W '\<dget_dlock\>'

  arch/powerpc/platforms/cell/spufs/inode.c-              spin_lock(&dentry->d_lock);
  arch/powerpc/platforms/cell/spufs/inode.c-              if (simple_positive(dentry)) {
  arch/powerpc/platforms/cell/spufs/inode.c:                      dget_dlock(dentry);

  fs/autofs/expire.c-             spin_lock_nested(&child->d_lock, DENTRY_D_LOCK_NESTED);
  fs/autofs/expire.c-             if (simple_positive(child)) {
  fs/autofs/expire.c:                     dget_dlock(child);

  fs/autofs/root.c:                       dget_dlock(active);
  fs/autofs/root.c-                       spin_unlock(&active->d_lock);

  fs/autofs/root.c:                       dget_dlock(expiring);
  fs/autofs/root.c-                       spin_unlock(&expiring->d_lock);

  fs/ceph/dir.c-          if (!spin_trylock(&dentry->d_lock))
  fs/ceph/dir.c-                  continue;
  [...]
  fs/ceph/dir.c:                          dget_dlock(dentry);

  fs/ceph/mds_client.c-           spin_lock(&alias->d_lock);
  [...]
  fs/ceph/mds_client.c:                   dn = dget_dlock(alias);

  fs/configfs/inode.c-            spin_lock(&dentry->d_lock);
  fs/configfs/inode.c-            if (simple_positive(dentry)) {
  fs/configfs/inode.c:                    dget_dlock(dentry);

  fs/libfs.c:                             found = dget_dlock(d);
  fs/libfs.c-                     spin_unlock(&d->d_lock);

  fs/libfs.c:             found = dget_dlock(child);
  fs/libfs.c-     spin_unlock(&child->d_lock);

  fs/libfs.c:                             child = dget_dlock(d);
  fs/libfs.c-                     spin_unlock(&d->d_lock);

  fs/ocfs2/dcache.c:                      dget_dlock(dentry);
  fs/ocfs2/dcache.c-                      spin_unlock(&dentry->d_lock);

  include/linux/dcache.h:static inline struct dentry *dget_dlock(struct dentry *dentry)

After taking out the NULL check, dget_dlock() becomes almost identical
to __dget_dlock(); the only difference is that dget_dlock() returns the
dentry that was passed in. These are static inline helpers, so we can
rely on the compiler to discard unused return values. We can therefore
also remove __dget_dlock() and replace calls to it by dget_dlock().

Also fix up and improve the kerneldoc comments while we're at it.

Al Viro pointed out that we can also clean up some of the callers to
make use of the returned value and provided a bit more info for the
kerneldoc.

While preparing v2 I also noticed that the tabs used in the kerneldoc
comments were causing the kerneldoc to get parsed incorrectly so I also
fixed this up (including for d_unhashed, which is otherwise unrelated).

Testing: x86 defconfig build + boot; make htmldocs for the kerneldoc
warning. objdump shows there are code generation changes.

Link: https://lore.kernel.org/all/20231022164520.915013-1-vegard.nossum@oracle.com/
Cc: Alexander Viro <viro@zeniv.linux.org.uk>
Cc: Christian Brauner <brauner@kernel.org>
Cc: linux-fsdevel@vger.kernel.org
Cc: Nick Piggin <npiggin@kernel.dk>
Cc: Waiman Long <Waiman.Long@hp.com>
Cc: linux-doc@vger.kernel.org
Signed-off-by: Vegard Nossum <vegard.nossum@oracle.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/dcache.c            | 16 ++++------------
 include/linux/dcache.h | 41 ++++++++++++++++++++++++++++++-----------
 2 files changed, 34 insertions(+), 23 deletions(-)

(limited to 'include/linux')

diff --git a/fs/dcache.c b/fs/dcache.c
index 523cbf0780f0..bb18ea91d717 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -877,12 +877,6 @@ void dput_to_list(struct dentry *dentry, struct list_head *list)
 	spin_unlock(&dentry->d_lock);
 }
 
-/* This must be called with d_lock held */
-static inline void __dget_dlock(struct dentry *dentry)
-{
-	dentry->d_lockref.count++;
-}
-
 struct dentry *dget_parent(struct dentry *dentry)
 {
 	int gotref;
@@ -964,7 +958,7 @@ static struct dentry *__d_find_alias(struct inode *inode)
 	hlist_for_each_entry(alias, &inode->i_dentry, d_u.d_alias) {
 		spin_lock(&alias->d_lock);
  		if (!d_unhashed(alias)) {
-			__dget_dlock(alias);
+			dget_dlock(alias);
 			spin_unlock(&alias->d_lock);
 			return alias;
 		}
@@ -1569,8 +1563,7 @@ static enum d_walk_ret find_submount(void *_data, struct dentry *dentry)
 {
 	struct dentry **victim = _data;
 	if (d_mountpoint(dentry)) {
-		__dget_dlock(dentry);
-		*victim = dentry;
+		*victim = dget_dlock(dentry);
 		return D_WALK_QUIT;
 	}
 	return D_WALK_CONTINUE;
@@ -1715,8 +1708,7 @@ struct dentry *d_alloc(struct dentry * parent, const struct qstr *name)
 	 * don't need child lock because it is not subject
 	 * to concurrency here
 	 */
-	__dget_dlock(parent);
-	dentry->d_parent = parent;
+	dentry->d_parent = dget_dlock(parent);
 	hlist_add_head(&dentry->d_sib, &parent->d_children);
 	spin_unlock(&parent->d_lock);
 
@@ -2683,7 +2675,7 @@ struct dentry *d_exact_alias(struct dentry *entry, struct inode *inode)
 			spin_unlock(&alias->d_lock);
 			alias = NULL;
 		} else {
-			__dget_dlock(alias);
+			dget_dlock(alias);
 			__d_rehash(alias);
 			spin_unlock(&alias->d_lock);
 		}
diff --git a/include/linux/dcache.h b/include/linux/dcache.h
index 48b393545ec2..1666c387861f 100644
--- a/include/linux/dcache.h
+++ b/include/linux/dcache.h
@@ -287,20 +287,40 @@ extern char *dentry_path(const struct dentry *, char *, int);
 /* Allocation counts.. */
 
 /**
- *	dget, dget_dlock -	get a reference to a dentry
- *	@dentry: dentry to get a reference to
+ * dget_dlock -	get a reference to a dentry
+ * @dentry: dentry to get a reference to
  *
- *	Given a dentry or %NULL pointer increment the reference count
- *	if appropriate and return the dentry. A dentry will not be 
- *	destroyed when it has references.
+ * Given a live dentry, increment the reference count and return the dentry.
+ * Caller must hold @dentry->d_lock.  Making sure that dentry is alive is
+ * caller's resonsibility.  There are many conditions sufficient to guarantee
+ * that; e.g. anything with non-negative refcount is alive, so's anything
+ * hashed, anything positive, anyone's parent, etc.
  */
 static inline struct dentry *dget_dlock(struct dentry *dentry)
 {
-	if (dentry)
-		dentry->d_lockref.count++;
+	dentry->d_lockref.count++;
 	return dentry;
 }
 
+
+/**
+ * dget - get a reference to a dentry
+ * @dentry: dentry to get a reference to
+ *
+ * Given a dentry or %NULL pointer increment the reference count
+ * if appropriate and return the dentry.  A dentry will not be
+ * destroyed when it has references.  Conversely, a dentry with
+ * no references can disappear for any number of reasons, starting
+ * with memory pressure.  In other words, that primitive is
+ * used to clone an existing reference; using it on something with
+ * zero refcount is a bug.
+ *
+ * NOTE: it will spin if @dentry->d_lock is held.  From the deadlock
+ * avoidance point of view it is equivalent to spin_lock()/increment
+ * refcount/spin_unlock(), so calling it under @dentry->d_lock is
+ * always a bug; so's calling it under ->d_lock on any of its descendents.
+ *
+ */
 static inline struct dentry *dget(struct dentry *dentry)
 {
 	if (dentry)
@@ -311,12 +331,11 @@ static inline struct dentry *dget(struct dentry *dentry)
 extern struct dentry *dget_parent(struct dentry *dentry);
 
 /**
- *	d_unhashed -	is dentry hashed
- *	@dentry: entry to check
+ * d_unhashed - is dentry hashed
+ * @dentry: entry to check
  *
- *	Returns true if the dentry passed is not currently hashed.
+ * Returns true if the dentry passed is not currently hashed.
  */
- 
 static inline int d_unhashed(const struct dentry *dentry)
 {
 	return hlist_bl_unhashed(&dentry->d_hash);
-- 
cgit v1.2.3


From 18caaedaf4c3712ab6821f292598a8f86e6d7972 Mon Sep 17 00:00:00 2001
From: Christophe JAILLET <christophe.jaillet@wanadoo.fr>
Date: Sun, 26 Nov 2023 09:56:29 +0100
Subject: locking/lockdep: Slightly reorder 'struct lock_class' to save some
 memory

Based on pahole, 2 holes can be combined in the 'struct lock_class'. This
saves 8 bytes in the structure on my x86_64.

On a x86_64 configured with allmodconfig, this saves ~64kb of memory in
'kernel/locking/lockdep.o':

                text         data           bss           dec     filename
  Before:    102,501    1,912,490    11,531,636    13,546,627     kernel/locking/lockdep.o
  After:     102,181    1,912,490    11,466,100    13,480,771     kernel/locking/lockdep.o

because of:

  struct lock_class lock_classes[MAX_LOCKDEP_KEYS];

After the reorder, pahole gives:

  struct lock_class {
          struct hlist_node          hash_entry;           /*     0    16 */
          struct list_head           lock_entry;           /*    16    16 */
          struct list_head           locks_after;          /*    32    16 */
          struct list_head           locks_before;         /*    48    16 */
          /* --- cacheline 1 boundary (64 bytes) --- */
          const struct lockdep_subclass_key  * key;        /*    64     8 */
          lock_cmp_fn                cmp_fn;               /*    72     8 */
          lock_print_fn              print_fn;             /*    80     8 */
          unsigned int               subclass;             /*    88     4 */
          unsigned int               dep_gen_id;           /*    92     4 */
          long unsigned int          usage_mask;           /*    96     8 */
          const struct lock_trace  * usage_traces[10];     /*   104    80 */
          /* --- cacheline 2 boundary (128 bytes) was 56 bytes ago --- */
          const char  *              name;                 /*   184     8 */
          /* --- cacheline 3 boundary (192 bytes) --- */
          int                        name_version;         /*   192     4 */
          u8                         wait_type_inner;      /*   196     1 */
          u8                         wait_type_outer;      /*   197     1 */
          u8                         lock_type;            /*   198     1 */

          /* XXX 1 byte hole, try to pack */

          long unsigned int          contention_point[4];  /*   200    32 */
          long unsigned int          contending_point[4];  /*   232    32 */

          /* size: 264, cachelines: 5, members: 18 */
          /* sum members: 263, holes: 1, sum holes: 1 */
          /* last cacheline: 8 bytes */
  };

Signed-off-by: Christophe JAILLET <christophe.jaillet@wanadoo.fr>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Acked-by: Waiman Long <longman@redhat.com>
Link: https://lore.kernel.org/r/801258371fc4101f96495a5aaecef638d6cbd8d3.1700988869.git.christophe.jaillet@wanadoo.fr
---
 include/linux/lockdep_types.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/lockdep_types.h b/include/linux/lockdep_types.h
index 2ebc323d345a..857d785e89e6 100644
--- a/include/linux/lockdep_types.h
+++ b/include/linux/lockdep_types.h
@@ -127,12 +127,12 @@ struct lock_class {
 	unsigned long			usage_mask;
 	const struct lock_trace		*usage_traces[LOCK_TRACE_STATES];
 
+	const char			*name;
 	/*
 	 * Generation counter, when doing certain classes of graph walking,
 	 * to ensure that we check one node only once:
 	 */
 	int				name_version;
-	const char			*name;
 
 	u8				wait_type_inner;
 	u8				wait_type_outer;
-- 
cgit v1.2.3


From a2e7e59a94269484a83386972ca07c22fd188854 Mon Sep 17 00:00:00 2001
From: Robin Murphy <robin.murphy@arm.com>
Date: Wed, 15 Nov 2023 18:25:44 +0000
Subject: iommu: Avoid more races around device probe
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

It turns out there are more subtle races beyond just the main part of
__iommu_probe_device() itself running in parallel - the dev_iommu_free()
on the way out of an unsuccessful probe can still manage to trip up
concurrent accesses to a device's fwspec. Thus, extend the scope of
iommu_probe_device_lock() to also serialise fwspec creation and initial
retrieval.

Reported-by: Zhenhua Huang <quic_zhenhuah@quicinc.com>
Link: https://lore.kernel.org/linux-iommu/e2e20e1c-6450-4ac5-9804-b0000acdf7de@quicinc.com/
Fixes: 01657bc14a39 ("iommu: Avoid races around device probe")
Signed-off-by: Robin Murphy <robin.murphy@arm.com>
Acked-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Reviewed-by: André Draszik <andre.draszik@linaro.org>
Tested-by: André Draszik <andre.draszik@linaro.org>
Link: https://lore.kernel.org/r/16f433658661d7cadfea51e7c65da95826112a2b.1700071477.git.robin.murphy@arm.com
Cc: stable@vger.kernel.org
Signed-off-by: Joerg Roedel <jroedel@suse.de>
---
 drivers/acpi/scan.c      |  7 ++++++-
 drivers/iommu/iommu.c    | 20 ++++++++++----------
 drivers/iommu/of_iommu.c | 12 +++++++++---
 include/linux/iommu.h    |  1 +
 4 files changed, 26 insertions(+), 14 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/acpi/scan.c b/drivers/acpi/scan.c
index fa5dd71a80fa..02bb2cce423f 100644
--- a/drivers/acpi/scan.c
+++ b/drivers/acpi/scan.c
@@ -1568,17 +1568,22 @@ static const struct iommu_ops *acpi_iommu_configure_id(struct device *dev,
 	int err;
 	const struct iommu_ops *ops;
 
+	/* Serialise to make dev->iommu stable under our potential fwspec */
+	mutex_lock(&iommu_probe_device_lock);
 	/*
 	 * If we already translated the fwspec there is nothing left to do,
 	 * return the iommu_ops.
 	 */
 	ops = acpi_iommu_fwspec_ops(dev);
-	if (ops)
+	if (ops) {
+		mutex_unlock(&iommu_probe_device_lock);
 		return ops;
+	}
 
 	err = iort_iommu_configure_id(dev, id_in);
 	if (err && err != -EPROBE_DEFER)
 		err = viot_iommu_configure(dev);
+	mutex_unlock(&iommu_probe_device_lock);
 
 	/*
 	 * If we have reason to believe the IOMMU driver missed the initial
diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c
index c9a05bb49bfa..33e2a9b5d339 100644
--- a/drivers/iommu/iommu.c
+++ b/drivers/iommu/iommu.c
@@ -485,11 +485,12 @@ static void iommu_deinit_device(struct device *dev)
 	dev_iommu_free(dev);
 }
 
+DEFINE_MUTEX(iommu_probe_device_lock);
+
 static int __iommu_probe_device(struct device *dev, struct list_head *group_list)
 {
 	const struct iommu_ops *ops = dev->bus->iommu_ops;
 	struct iommu_group *group;
-	static DEFINE_MUTEX(iommu_probe_device_lock);
 	struct group_device *gdev;
 	int ret;
 
@@ -502,17 +503,15 @@ static int __iommu_probe_device(struct device *dev, struct list_head *group_list
 	 * probably be able to use device_lock() here to minimise the scope,
 	 * but for now enforcing a simple global ordering is fine.
 	 */
-	mutex_lock(&iommu_probe_device_lock);
+	lockdep_assert_held(&iommu_probe_device_lock);
 
 	/* Device is probed already if in a group */
-	if (dev->iommu_group) {
-		ret = 0;
-		goto out_unlock;
-	}
+	if (dev->iommu_group)
+		return 0;
 
 	ret = iommu_init_device(dev, ops);
 	if (ret)
-		goto out_unlock;
+		return ret;
 
 	group = dev->iommu_group;
 	gdev = iommu_group_alloc_device(group, dev);
@@ -548,7 +547,6 @@ static int __iommu_probe_device(struct device *dev, struct list_head *group_list
 			list_add_tail(&group->entry, group_list);
 	}
 	mutex_unlock(&group->mutex);
-	mutex_unlock(&iommu_probe_device_lock);
 
 	if (dev_is_pci(dev))
 		iommu_dma_set_pci_32bit_workaround(dev);
@@ -562,8 +560,6 @@ err_put_group:
 	iommu_deinit_device(dev);
 	mutex_unlock(&group->mutex);
 	iommu_group_put(group);
-out_unlock:
-	mutex_unlock(&iommu_probe_device_lock);
 
 	return ret;
 }
@@ -573,7 +569,9 @@ int iommu_probe_device(struct device *dev)
 	const struct iommu_ops *ops;
 	int ret;
 
+	mutex_lock(&iommu_probe_device_lock);
 	ret = __iommu_probe_device(dev, NULL);
+	mutex_unlock(&iommu_probe_device_lock);
 	if (ret)
 		return ret;
 
@@ -1822,7 +1820,9 @@ static int probe_iommu_group(struct device *dev, void *data)
 	struct list_head *group_list = data;
 	int ret;
 
+	mutex_lock(&iommu_probe_device_lock);
 	ret = __iommu_probe_device(dev, group_list);
+	mutex_unlock(&iommu_probe_device_lock);
 	if (ret == -ENODEV)
 		ret = 0;
 
diff --git a/drivers/iommu/of_iommu.c b/drivers/iommu/of_iommu.c
index 157b286e36bf..c25b4ae6aeee 100644
--- a/drivers/iommu/of_iommu.c
+++ b/drivers/iommu/of_iommu.c
@@ -112,16 +112,20 @@ const struct iommu_ops *of_iommu_configure(struct device *dev,
 					   const u32 *id)
 {
 	const struct iommu_ops *ops = NULL;
-	struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(dev);
+	struct iommu_fwspec *fwspec;
 	int err = NO_IOMMU;
 
 	if (!master_np)
 		return NULL;
 
+	/* Serialise to make dev->iommu stable under our potential fwspec */
+	mutex_lock(&iommu_probe_device_lock);
+	fwspec = dev_iommu_fwspec_get(dev);
 	if (fwspec) {
-		if (fwspec->ops)
+		if (fwspec->ops) {
+			mutex_unlock(&iommu_probe_device_lock);
 			return fwspec->ops;
-
+		}
 		/* In the deferred case, start again from scratch */
 		iommu_fwspec_free(dev);
 	}
@@ -155,6 +159,8 @@ const struct iommu_ops *of_iommu_configure(struct device *dev,
 		fwspec = dev_iommu_fwspec_get(dev);
 		ops    = fwspec->ops;
 	}
+	mutex_unlock(&iommu_probe_device_lock);
+
 	/*
 	 * If we have reason to believe the IOMMU driver missed the initial
 	 * probe for dev, replay it to get things in order.
diff --git a/include/linux/iommu.h b/include/linux/iommu.h
index ec289c1016f5..6291aa7b079b 100644
--- a/include/linux/iommu.h
+++ b/include/linux/iommu.h
@@ -845,6 +845,7 @@ static inline void dev_iommu_priv_set(struct device *dev, void *priv)
 	dev->iommu->priv = priv;
 }
 
+extern struct mutex iommu_probe_device_lock;
 int iommu_probe_device(struct device *dev);
 
 int iommu_dev_enable_feature(struct device *dev, enum iommu_dev_features f);
-- 
cgit v1.2.3


From a9c362db39207c4934c9125e56ed730c5297c37c Mon Sep 17 00:00:00 2001
From: Robin Murphy <robin.murphy@arm.com>
Date: Tue, 21 Nov 2023 18:03:59 +0000
Subject: iommu: Validate that devices match domains

Before we can allow drivers to coexist, we need to make sure that one
driver's domain ops can't misinterpret another driver's dev_iommu_priv
data. To that end, add a token to the domain so we can remember how it
was allocated - for now this may as well be the device ops, since they
still correlate 1:1 with drivers. We can trust ourselves for internal
default domain attachment, so add checks to cover all the public attach
interfaces.

Reviewed-by: Lu Baolu <baolu.lu@linux.intel.com>
Reviewed-by: Jason Gunthorpe <jgg@nvidia.com>
Reviewed-by: Jerry Snitselaar <jsnitsel@redhat.com>
Signed-off-by: Robin Murphy <robin.murphy@arm.com>
Link: https://lore.kernel.org/r/097c6f30480e4efe12195d00ba0e84ea4837fb4c.1700589539.git.robin.murphy@arm.com
Signed-off-by: Joerg Roedel <jroedel@suse.de>
---
 drivers/iommu/iommu.c                | 10 ++++++++++
 drivers/iommu/iommufd/hw_pagetable.c |  2 ++
 include/linux/iommu.h                |  2 +-
 3 files changed, 13 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c
index 7fafd073c33e..8e4436c606d3 100644
--- a/drivers/iommu/iommu.c
+++ b/drivers/iommu/iommu.c
@@ -2117,6 +2117,7 @@ static struct iommu_domain *__iommu_domain_alloc(const struct iommu_ops *ops,
 		return NULL;
 
 	domain->type = type;
+	domain->owner = ops;
 	/*
 	 * If not already set, assume all sizes by default; the driver
 	 * may override this later
@@ -2282,10 +2283,16 @@ struct iommu_domain *iommu_get_dma_domain(struct device *dev)
 static int __iommu_attach_group(struct iommu_domain *domain,
 				struct iommu_group *group)
 {
+	struct device *dev;
+
 	if (group->domain && group->domain != group->default_domain &&
 	    group->domain != group->blocking_domain)
 		return -EBUSY;
 
+	dev = iommu_group_first_dev(group);
+	if (!dev_has_iommu(dev) || dev_iommu_ops(dev) != domain->owner)
+		return -EINVAL;
+
 	return __iommu_group_set_domain(group, domain);
 }
 
@@ -3477,6 +3484,9 @@ int iommu_attach_device_pasid(struct iommu_domain *domain,
 	if (!group)
 		return -ENODEV;
 
+	if (!dev_has_iommu(dev) || dev_iommu_ops(dev) != domain->owner)
+		return -EINVAL;
+
 	mutex_lock(&group->mutex);
 	curr = xa_cmpxchg(&group->pasid_array, pasid, NULL, domain, GFP_KERNEL);
 	if (curr) {
diff --git a/drivers/iommu/iommufd/hw_pagetable.c b/drivers/iommu/iommufd/hw_pagetable.c
index 2abbeafdbd22..5be7f513b622 100644
--- a/drivers/iommu/iommufd/hw_pagetable.c
+++ b/drivers/iommu/iommufd/hw_pagetable.c
@@ -135,6 +135,7 @@ iommufd_hwpt_paging_alloc(struct iommufd_ctx *ictx, struct iommufd_ioas *ioas,
 			hwpt->domain = NULL;
 			goto out_abort;
 		}
+		hwpt->domain->owner = ops;
 	} else {
 		hwpt->domain = iommu_domain_alloc(idev->dev->bus);
 		if (!hwpt->domain) {
@@ -233,6 +234,7 @@ iommufd_hwpt_nested_alloc(struct iommufd_ctx *ictx,
 		hwpt->domain = NULL;
 		goto out_abort;
 	}
+	hwpt->domain->owner = ops;
 
 	if (WARN_ON_ONCE(hwpt->domain->type != IOMMU_DOMAIN_NESTED)) {
 		rc = -EINVAL;
diff --git a/include/linux/iommu.h b/include/linux/iommu.h
index ec289c1016f5..077bf8cae2f7 100644
--- a/include/linux/iommu.h
+++ b/include/linux/iommu.h
@@ -106,7 +106,7 @@ struct iommu_domain {
 	unsigned type;
 	const struct iommu_domain_ops *ops;
 	const struct iommu_dirty_ops *dirty_ops;
-
+	const struct iommu_ops *owner; /* Whose domain_alloc we came from */
 	unsigned long pgsize_bitmap;	/* Bitmap of page sizes in use */
 	struct iommu_domain_geometry geometry;
 	struct iommu_dma_cookie *iova_cookie;
-- 
cgit v1.2.3


From 17de3f5fdd35676b0e3d41c7c9bf4e3032eb3673 Mon Sep 17 00:00:00 2001
From: Robin Murphy <robin.murphy@arm.com>
Date: Tue, 21 Nov 2023 18:04:02 +0000
Subject: iommu: Retire bus ops

With the rest of the API internals converted, it's time to finally
tackle probe_device and how we bootstrap the per-device ops association
to begin with. This ends up being disappointingly straightforward, since
fwspec users are already doing it in order to find their of_xlate
callback, and it works out that we can easily do the equivalent for
other drivers too. Then shuffle the remaining awareness of iommu_ops
into the couple of core headers that still need it, and breathe a sigh
of relief.

Ding dong the bus ops are gone!

CC: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Acked-by: Christoph Hellwig <hch@lst.de>
Acked-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Reviewed-by: Lu Baolu <baolu.lu@linux.intel.com>
Reviewed-by: Jason Gunthorpe <jgg@nvidia.com>
Reviewed-by: Jerry Snitselaar <jsnitsel@redhat.com>
Signed-off-by: Robin Murphy <robin.murphy@arm.com>
Link: https://lore.kernel.org/r/a59011ef65b4b6657cb0b7a388d786b779b61305.1700589539.git.robin.murphy@arm.com
Signed-off-by: Joerg Roedel <jroedel@suse.de>
---
 drivers/iommu/iommu.c       | 31 ++++++++++++++++++-------------
 include/acpi/acpi_bus.h     |  2 ++
 include/linux/device.h      |  1 -
 include/linux/device/bus.h  |  5 -----
 include/linux/dma-map-ops.h |  1 +
 5 files changed, 21 insertions(+), 19 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c
index 88aeae0acd9b..254f42b4220b 100644
--- a/drivers/iommu/iommu.c
+++ b/drivers/iommu/iommu.c
@@ -148,7 +148,7 @@ struct iommu_group_attribute iommu_group_attr_##_name =		\
 static LIST_HEAD(iommu_device_list);
 static DEFINE_SPINLOCK(iommu_device_lock);
 
-static struct bus_type * const iommu_buses[] = {
+static const struct bus_type * const iommu_buses[] = {
 	&platform_bus_type,
 #ifdef CONFIG_PCI
 	&pci_bus_type,
@@ -257,13 +257,6 @@ int iommu_device_register(struct iommu_device *iommu,
 	/* We need to be able to take module references appropriately */
 	if (WARN_ON(is_module_address((unsigned long)ops) && !ops->owner))
 		return -EINVAL;
-	/*
-	 * Temporarily enforce global restriction to a single driver. This was
-	 * already the de-facto behaviour, since any possible combination of
-	 * existing drivers would compete for at least the PCI or platform bus.
-	 */
-	if (iommu_buses[0]->iommu_ops && iommu_buses[0]->iommu_ops != ops)
-		return -EBUSY;
 
 	iommu->ops = ops;
 	if (hwdev)
@@ -273,10 +266,8 @@ int iommu_device_register(struct iommu_device *iommu,
 	list_add_tail(&iommu->list, &iommu_device_list);
 	spin_unlock(&iommu_device_lock);
 
-	for (int i = 0; i < ARRAY_SIZE(iommu_buses) && !err; i++) {
-		iommu_buses[i]->iommu_ops = ops;
+	for (int i = 0; i < ARRAY_SIZE(iommu_buses) && !err; i++)
 		err = bus_iommu_probe(iommu_buses[i]);
-	}
 	if (err)
 		iommu_device_unregister(iommu);
 	return err;
@@ -329,7 +320,6 @@ int iommu_device_register_bus(struct iommu_device *iommu,
 	list_add_tail(&iommu->list, &iommu_device_list);
 	spin_unlock(&iommu_device_lock);
 
-	bus->iommu_ops = ops;
 	err = bus_iommu_probe(bus);
 	if (err) {
 		iommu_device_unregister_bus(iommu, bus, nb);
@@ -496,12 +486,27 @@ static void iommu_deinit_device(struct device *dev)
 
 static int __iommu_probe_device(struct device *dev, struct list_head *group_list)
 {
-	const struct iommu_ops *ops = dev->bus->iommu_ops;
+	const struct iommu_ops *ops;
+	struct iommu_fwspec *fwspec;
 	struct iommu_group *group;
 	static DEFINE_MUTEX(iommu_probe_device_lock);
 	struct group_device *gdev;
 	int ret;
 
+	/*
+	 * For FDT-based systems and ACPI IORT/VIOT, drivers register IOMMU
+	 * instances with non-NULL fwnodes, and client devices should have been
+	 * identified with a fwspec by this point. Otherwise, we can currently
+	 * assume that only one of Intel, AMD, s390, PAMU or legacy SMMUv2 can
+	 * be present, and that any of their registered instances has suitable
+	 * ops for probing, and thus cheekily co-opt the same mechanism.
+	 */
+	fwspec = dev_iommu_fwspec_get(dev);
+	if (fwspec && fwspec->ops)
+		ops = fwspec->ops;
+	else
+		ops = iommu_ops_from_fwnode(NULL);
+
 	if (!ops)
 		return -ENODEV;
 	/*
diff --git a/include/acpi/acpi_bus.h b/include/acpi/acpi_bus.h
index 1216d72c650f..89079787905d 100644
--- a/include/acpi/acpi_bus.h
+++ b/include/acpi/acpi_bus.h
@@ -627,6 +627,8 @@ struct acpi_pci_root {
 
 /* helper */
 
+struct iommu_ops;
+
 bool acpi_dma_supported(const struct acpi_device *adev);
 enum dev_dma_attr acpi_get_dma_attr(struct acpi_device *adev);
 int acpi_iommu_fwspec_init(struct device *dev, u32 id,
diff --git a/include/linux/device.h b/include/linux/device.h
index d7a72a8749ea..0314dbbdb534 100644
--- a/include/linux/device.h
+++ b/include/linux/device.h
@@ -42,7 +42,6 @@ struct class;
 struct subsys_private;
 struct device_node;
 struct fwnode_handle;
-struct iommu_ops;
 struct iommu_group;
 struct dev_pin_info;
 struct dev_iommu;
diff --git a/include/linux/device/bus.h b/include/linux/device/bus.h
index ae10c4322754..e25aab08f873 100644
--- a/include/linux/device/bus.h
+++ b/include/linux/device/bus.h
@@ -62,9 +62,6 @@ struct fwnode_handle;
  *			this bus.
  * @pm:		Power management operations of this bus, callback the specific
  *		device driver's pm-ops.
- * @iommu_ops:  IOMMU specific operations for this bus, used to attach IOMMU
- *              driver implementations to a bus and allow the driver to do
- *              bus-specific setup
  * @need_parent_lock:	When probing or removing a device on this bus, the
  *			device core should lock the device's parent.
  *
@@ -104,8 +101,6 @@ struct bus_type {
 
 	const struct dev_pm_ops *pm;
 
-	const struct iommu_ops *iommu_ops;
-
 	bool need_parent_lock;
 };
 
diff --git a/include/linux/dma-map-ops.h b/include/linux/dma-map-ops.h
index f2fc203fb8a1..a52e508d1869 100644
--- a/include/linux/dma-map-ops.h
+++ b/include/linux/dma-map-ops.h
@@ -11,6 +11,7 @@
 #include <linux/slab.h>
 
 struct cma;
+struct iommu_ops;
 
 /*
  * Values for struct dma_map_ops.flags:
-- 
cgit v1.2.3


From 17b226dcf80ce79d02f4f0b08813d8848885b986 Mon Sep 17 00:00:00 2001
From: Boris Brezillon <boris.brezillon@collabora.com>
Date: Fri, 24 Nov 2023 15:24:33 +0100
Subject: iommu: Allow passing custom allocators to pgtable drivers

This will be useful for GPU drivers who want to keep page tables in a
pool so they can:

- keep freed page tables in a free pool and speed-up upcoming page
  table allocations
- batch page table allocation instead of allocating one page at a time
- pre-reserve pages for page tables needed for map/unmap operations,
  to ensure map/unmap operations don't try to allocate memory in paths
  they're allowed to block or fail

It might also be valuable for other aspects of GPU and similar
use-cases, like fine-grained memory accounting and resource limiting.

We will extend the Arm LPAE format to support custom allocators in a
separate commit.

Signed-off-by: Boris Brezillon <boris.brezillon@collabora.com>
Reviewed-by: Steven Price <steven.price@arm.com>
Reviewed-by: Robin Murphy <robin.murphy@arm.com>
Link: https://lore.kernel.org/r/20231124142434.1577550-2-boris.brezillon@collabora.com
Signed-off-by: Joerg Roedel <jroedel@suse.de>
---
 drivers/iommu/io-pgtable.c | 23 +++++++++++++++++++++++
 include/linux/io-pgtable.h | 34 ++++++++++++++++++++++++++++++++++
 2 files changed, 57 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/iommu/io-pgtable.c b/drivers/iommu/io-pgtable.c
index b843fcd365d2..8841c1487f00 100644
--- a/drivers/iommu/io-pgtable.c
+++ b/drivers/iommu/io-pgtable.c
@@ -34,6 +34,26 @@ io_pgtable_init_table[IO_PGTABLE_NUM_FMTS] = {
 #endif
 };
 
+static int check_custom_allocator(enum io_pgtable_fmt fmt,
+				  struct io_pgtable_cfg *cfg)
+{
+	/* No custom allocator, no need to check the format. */
+	if (!cfg->alloc && !cfg->free)
+		return 0;
+
+	/* When passing a custom allocator, both the alloc and free
+	 * functions should be provided.
+	 */
+	if (!cfg->alloc || !cfg->free)
+		return -EINVAL;
+
+	/* Make sure the format supports custom allocators. */
+	if (io_pgtable_init_table[fmt]->caps & IO_PGTABLE_CAP_CUSTOM_ALLOCATOR)
+		return 0;
+
+	return -EINVAL;
+}
+
 struct io_pgtable_ops *alloc_io_pgtable_ops(enum io_pgtable_fmt fmt,
 					    struct io_pgtable_cfg *cfg,
 					    void *cookie)
@@ -44,6 +64,9 @@ struct io_pgtable_ops *alloc_io_pgtable_ops(enum io_pgtable_fmt fmt,
 	if (fmt >= IO_PGTABLE_NUM_FMTS)
 		return NULL;
 
+	if (check_custom_allocator(fmt, cfg))
+		return NULL;
+
 	fns = io_pgtable_init_table[fmt];
 	if (!fns)
 		return NULL;
diff --git a/include/linux/io-pgtable.h b/include/linux/io-pgtable.h
index 25142a0e2fc2..86cf1f7ae389 100644
--- a/include/linux/io-pgtable.h
+++ b/include/linux/io-pgtable.h
@@ -100,6 +100,30 @@ struct io_pgtable_cfg {
 	const struct iommu_flush_ops	*tlb;
 	struct device			*iommu_dev;
 
+	/**
+	 * @alloc: Custom page allocator.
+	 *
+	 * Optional hook used to allocate page tables. If this function is NULL,
+	 * @free must be NULL too.
+	 *
+	 * Memory returned should be zeroed and suitable for dma_map_single() and
+	 * virt_to_phys().
+	 *
+	 * Not all formats support custom page allocators. Before considering
+	 * passing a non-NULL value, make sure the chosen page format supports
+	 * this feature.
+	 */
+	void *(*alloc)(void *cookie, size_t size, gfp_t gfp);
+
+	/**
+	 * @free: Custom page de-allocator.
+	 *
+	 * Optional hook used to free page tables allocated with the @alloc
+	 * hook. Must be non-NULL if @alloc is not NULL, must be NULL
+	 * otherwise.
+	 */
+	void (*free)(void *cookie, void *pages, size_t size);
+
 	/* Low-level data specific to the table format */
 	union {
 		struct {
@@ -241,16 +265,26 @@ io_pgtable_tlb_add_page(struct io_pgtable *iop,
 		iop->cfg.tlb->tlb_add_page(gather, iova, granule, iop->cookie);
 }
 
+/**
+ * enum io_pgtable_caps - IO page table backend capabilities.
+ */
+enum io_pgtable_caps {
+	/** @IO_PGTABLE_CAP_CUSTOM_ALLOCATOR: Backend accepts custom page table allocators. */
+	IO_PGTABLE_CAP_CUSTOM_ALLOCATOR = BIT(0),
+};
+
 /**
  * struct io_pgtable_init_fns - Alloc/free a set of page tables for a
  *                              particular format.
  *
  * @alloc: Allocate a set of page tables described by cfg.
  * @free:  Free the page tables associated with iop.
+ * @caps:  Combination of @io_pgtable_caps flags encoding the backend capabilities.
  */
 struct io_pgtable_init_fns {
 	struct io_pgtable *(*alloc)(struct io_pgtable_cfg *cfg, void *cookie);
 	void (*free)(struct io_pgtable *iop);
+	u32 caps;
 };
 
 extern struct io_pgtable_init_fns io_pgtable_arm_32_lpae_s1_init_fns;
-- 
cgit v1.2.3


From 8c88a474357ead632b07c70bf7f119ace8c3b39e Mon Sep 17 00:00:00 2001
From: Johannes Berg <johannes.berg@intel.com>
Date: Fri, 24 Nov 2023 17:25:26 +0100
Subject: debugfs: add API to allow debugfs operations cancellation

In some cases there might be longer-running hardware accesses
in debugfs files, or attempts to acquire locks, and we want
to still be able to quickly remove the files.

Introduce a cancellations API to use inside the debugfs handler
functions to be able to cancel such operations on a per-file
basis.

Acked-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 fs/debugfs/file.c       | 82 +++++++++++++++++++++++++++++++++++++++++++++++++
 fs/debugfs/inode.c      | 32 ++++++++++++++++++-
 fs/debugfs/internal.h   |  5 +++
 include/linux/debugfs.h | 19 ++++++++++++
 4 files changed, 137 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/fs/debugfs/file.c b/fs/debugfs/file.c
index 3eff92450fd5..5568cdea3490 100644
--- a/fs/debugfs/file.c
+++ b/fs/debugfs/file.c
@@ -114,6 +114,8 @@ int debugfs_file_get(struct dentry *dentry)
 		lockdep_init_map(&fsd->lockdep_map, fsd->lock_name ?: "debugfs",
 				 &fsd->key, 0);
 #endif
+		INIT_LIST_HEAD(&fsd->cancellations);
+		mutex_init(&fsd->cancellations_mtx);
 	}
 
 	/*
@@ -156,6 +158,86 @@ void debugfs_file_put(struct dentry *dentry)
 }
 EXPORT_SYMBOL_GPL(debugfs_file_put);
 
+/**
+ * debugfs_enter_cancellation - enter a debugfs cancellation
+ * @file: the file being accessed
+ * @cancellation: the cancellation object, the cancel callback
+ *	inside of it must be initialized
+ *
+ * When a debugfs file is removed it needs to wait for all active
+ * operations to complete. However, the operation itself may need
+ * to wait for hardware or completion of some asynchronous process
+ * or similar. As such, it may need to be cancelled to avoid long
+ * waits or even deadlocks.
+ *
+ * This function can be used inside a debugfs handler that may
+ * need to be cancelled. As soon as this function is called, the
+ * cancellation's 'cancel' callback may be called, at which point
+ * the caller should proceed to call debugfs_leave_cancellation()
+ * and leave the debugfs handler function as soon as possible.
+ * Note that the 'cancel' callback is only ever called in the
+ * context of some kind of debugfs_remove().
+ *
+ * This function must be paired with debugfs_leave_cancellation().
+ */
+void debugfs_enter_cancellation(struct file *file,
+				struct debugfs_cancellation *cancellation)
+{
+	struct debugfs_fsdata *fsd;
+	struct dentry *dentry = F_DENTRY(file);
+
+	INIT_LIST_HEAD(&cancellation->list);
+
+	if (WARN_ON(!d_is_reg(dentry)))
+		return;
+
+	if (WARN_ON(!cancellation->cancel))
+		return;
+
+	fsd = READ_ONCE(dentry->d_fsdata);
+	if (WARN_ON(!fsd ||
+		    ((unsigned long)fsd & DEBUGFS_FSDATA_IS_REAL_FOPS_BIT)))
+		return;
+
+	mutex_lock(&fsd->cancellations_mtx);
+	list_add(&cancellation->list, &fsd->cancellations);
+	mutex_unlock(&fsd->cancellations_mtx);
+
+	/* if we're already removing wake it up to cancel */
+	if (d_unlinked(dentry))
+		complete(&fsd->active_users_drained);
+}
+EXPORT_SYMBOL_GPL(debugfs_enter_cancellation);
+
+/**
+ * debugfs_leave_cancellation - leave cancellation section
+ * @file: the file being accessed
+ * @cancellation: the cancellation previously registered with
+ *	debugfs_enter_cancellation()
+ *
+ * See the documentation of debugfs_enter_cancellation().
+ */
+void debugfs_leave_cancellation(struct file *file,
+				struct debugfs_cancellation *cancellation)
+{
+	struct debugfs_fsdata *fsd;
+	struct dentry *dentry = F_DENTRY(file);
+
+	if (WARN_ON(!d_is_reg(dentry)))
+		return;
+
+	fsd = READ_ONCE(dentry->d_fsdata);
+	if (WARN_ON(!fsd ||
+		    ((unsigned long)fsd & DEBUGFS_FSDATA_IS_REAL_FOPS_BIT)))
+		return;
+
+	mutex_lock(&fsd->cancellations_mtx);
+	if (!list_empty(&cancellation->list))
+		list_del(&cancellation->list);
+	mutex_unlock(&fsd->cancellations_mtx);
+}
+EXPORT_SYMBOL_GPL(debugfs_leave_cancellation);
+
 /*
  * Only permit access to world-readable files when the kernel is locked down.
  * We also need to exclude any file that has ways to write or alter it as root
diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c
index 80f4f000dcc1..d53c2860b03c 100644
--- a/fs/debugfs/inode.c
+++ b/fs/debugfs/inode.c
@@ -247,6 +247,8 @@ static void debugfs_release_dentry(struct dentry *dentry)
 		lockdep_unregister_key(&fsd->key);
 		kfree(fsd->lock_name);
 #endif
+		WARN_ON(!list_empty(&fsd->cancellations));
+		mutex_destroy(&fsd->cancellations_mtx);
 	}
 
 	kfree(fsd);
@@ -756,8 +758,36 @@ static void __debugfs_file_removed(struct dentry *dentry)
 	lock_map_acquire(&fsd->lockdep_map);
 	lock_map_release(&fsd->lockdep_map);
 
-	if (!refcount_dec_and_test(&fsd->active_users))
+	/* if we hit zero, just wait for all to finish */
+	if (!refcount_dec_and_test(&fsd->active_users)) {
 		wait_for_completion(&fsd->active_users_drained);
+		return;
+	}
+
+	/* if we didn't hit zero, try to cancel any we can */
+	while (refcount_read(&fsd->active_users)) {
+		struct debugfs_cancellation *c;
+
+		/*
+		 * Lock the cancellations. Note that the cancellations
+		 * structs are meant to be on the stack, so we need to
+		 * ensure we either use them here or don't touch them,
+		 * and debugfs_leave_cancellation() will wait for this
+		 * to be finished processing before exiting one. It may
+		 * of course win and remove the cancellation, but then
+		 * chances are we never even got into this bit, we only
+		 * do if the refcount isn't zero already.
+		 */
+		mutex_lock(&fsd->cancellations_mtx);
+		while ((c = list_first_entry_or_null(&fsd->cancellations,
+						     typeof(*c), list))) {
+			list_del_init(&c->list);
+			c->cancel(dentry, c->cancel_data);
+		}
+		mutex_unlock(&fsd->cancellations_mtx);
+
+		wait_for_completion(&fsd->active_users_drained);
+	}
 }
 
 static void remove_one(struct dentry *victim)
diff --git a/fs/debugfs/internal.h b/fs/debugfs/internal.h
index c7d61cfc97d2..0c4c68cf161f 100644
--- a/fs/debugfs/internal.h
+++ b/fs/debugfs/internal.h
@@ -8,6 +8,7 @@
 #ifndef _DEBUGFS_INTERNAL_H_
 #define _DEBUGFS_INTERNAL_H_
 #include <linux/lockdep.h>
+#include <linux/list.h>
 
 struct file_operations;
 
@@ -29,6 +30,10 @@ struct debugfs_fsdata {
 			struct lock_class_key key;
 			char *lock_name;
 #endif
+
+			/* protect cancellations */
+			struct mutex cancellations_mtx;
+			struct list_head cancellations;
 		};
 	};
 };
diff --git a/include/linux/debugfs.h b/include/linux/debugfs.h
index ea2d919fd9c7..c9c65b132c0f 100644
--- a/include/linux/debugfs.h
+++ b/include/linux/debugfs.h
@@ -171,6 +171,25 @@ ssize_t debugfs_write_file_bool(struct file *file, const char __user *user_buf,
 ssize_t debugfs_read_file_str(struct file *file, char __user *user_buf,
 			      size_t count, loff_t *ppos);
 
+/**
+ * struct debugfs_cancellation - cancellation data
+ * @list: internal, for keeping track
+ * @cancel: callback to call
+ * @cancel_data: extra data for the callback to call
+ */
+struct debugfs_cancellation {
+	struct list_head list;
+	void (*cancel)(struct dentry *, void *);
+	void *cancel_data;
+};
+
+void __acquires(cancellation)
+debugfs_enter_cancellation(struct file *file,
+			   struct debugfs_cancellation *cancellation);
+void __releases(cancellation)
+debugfs_leave_cancellation(struct file *file,
+			   struct debugfs_cancellation *cancellation);
+
 #else
 
 #include <linux/err.h>
-- 
cgit v1.2.3


From 668bfeeabb5e402e3b36992f7859c284cc6e594d Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Mon, 27 Nov 2023 08:20:02 +0100
Subject: block: move a few definitions out of CONFIG_BLK_DEV_ZONED

Allow using a few symbols with IS_ENABLED instead of #idef by moving
the declarations out of #idef CONFIG_BLK_DEV_ZONED, and move
bdev_nr_zones into the remaining  #idef CONFIG_BLK_DEV_ZONED, #else
block below.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Link: https://lore.kernel.org/r/20231127072002.1332685-1-hch@lst.de
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/blkdev.h | 24 +++++++++++-------------
 1 file changed, 11 insertions(+), 13 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 51fa7ffdee83..17c0a7d0d319 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -331,22 +331,13 @@ typedef int (*report_zones_cb)(struct blk_zone *zone, unsigned int idx,
 
 void disk_set_zoned(struct gendisk *disk, enum blk_zoned_model model);
 
-#ifdef CONFIG_BLK_DEV_ZONED
 #define BLK_ALL_ZONES  ((unsigned int)-1)
 int blkdev_report_zones(struct block_device *bdev, sector_t sector,
-			unsigned int nr_zones, report_zones_cb cb, void *data);
-unsigned int bdev_nr_zones(struct block_device *bdev);
-extern int blkdev_zone_mgmt(struct block_device *bdev, enum req_op op,
-			    sector_t sectors, sector_t nr_sectors,
-			    gfp_t gfp_mask);
+		unsigned int nr_zones, report_zones_cb cb, void *data);
+int blkdev_zone_mgmt(struct block_device *bdev, enum req_op op,
+		sector_t sectors, sector_t nr_sectors, gfp_t gfp_mask);
 int blk_revalidate_disk_zones(struct gendisk *disk,
-			      void (*update_driver_data)(struct gendisk *disk));
-#else /* CONFIG_BLK_DEV_ZONED */
-static inline unsigned int bdev_nr_zones(struct block_device *bdev)
-{
-	return 0;
-}
-#endif /* CONFIG_BLK_DEV_ZONED */
+		void (*update_driver_data)(struct gendisk *disk));
 
 /*
  * Independent access ranges: struct blk_independent_access_range describes
@@ -643,6 +634,8 @@ static inline bool blk_queue_is_zoned(struct request_queue *q)
 }
 
 #ifdef CONFIG_BLK_DEV_ZONED
+unsigned int bdev_nr_zones(struct block_device *bdev);
+
 static inline unsigned int disk_nr_zones(struct gendisk *disk)
 {
 	return blk_queue_is_zoned(disk->queue) ? disk->nr_zones : 0;
@@ -687,6 +680,11 @@ static inline unsigned int bdev_max_active_zones(struct block_device *bdev)
 }
 
 #else /* CONFIG_BLK_DEV_ZONED */
+static inline unsigned int bdev_nr_zones(struct block_device *bdev)
+{
+	return 0;
+}
+
 static inline unsigned int disk_nr_zones(struct gendisk *disk)
 {
 	return 0;
-- 
cgit v1.2.3


From 95ba893c9f4feb836ddce627efd0bb6af6667031 Mon Sep 17 00:00:00 2001
From: Christian König <christian.koenig@amd.com>
Date: Tue, 14 Nov 2023 13:37:09 +0100
Subject: dma-buf: fix check in dma_resv_add_fence
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

It's valid to add the same fence multiple times to a dma-resv object and
we shouldn't need one extra slot for each.

Signed-off-by: Christian König <christian.koenig@amd.com>
Reviewed-by: Thomas Hellström <thomas.hellstrom@linux.intel.com>
Fixes: a3f7c10a269d5 ("dma-buf/dma-resv: check if the new fence is really later")
Cc: stable@vger.kernel.org # v5.19+
Link: https://patchwork.freedesktop.org/patch/msgid/20231115093035.1889-1-christian.koenig@amd.com
---
 drivers/dma-buf/dma-resv.c |  2 +-
 include/linux/dma-fence.h  | 15 +++++++++++++++
 2 files changed, 16 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/drivers/dma-buf/dma-resv.c b/drivers/dma-buf/dma-resv.c
index 38b4110378de..eb8b733065b2 100644
--- a/drivers/dma-buf/dma-resv.c
+++ b/drivers/dma-buf/dma-resv.c
@@ -301,7 +301,7 @@ void dma_resv_add_fence(struct dma_resv *obj, struct dma_fence *fence,
 
 		dma_resv_list_entry(fobj, i, obj, &old, &old_usage);
 		if ((old->context == fence->context && old_usage >= usage &&
-		     dma_fence_is_later(fence, old)) ||
+		     dma_fence_is_later_or_same(fence, old)) ||
 		    dma_fence_is_signaled(old)) {
 			dma_resv_list_set(fobj, i, fence, usage);
 			dma_fence_put(old);
diff --git a/include/linux/dma-fence.h b/include/linux/dma-fence.h
index ebe78bd3d121..b3772edca2e6 100644
--- a/include/linux/dma-fence.h
+++ b/include/linux/dma-fence.h
@@ -498,6 +498,21 @@ static inline bool dma_fence_is_later(struct dma_fence *f1,
 	return __dma_fence_is_later(f1->seqno, f2->seqno, f1->ops);
 }
 
+/**
+ * dma_fence_is_later_or_same - return true if f1 is later or same as f2
+ * @f1: the first fence from the same context
+ * @f2: the second fence from the same context
+ *
+ * Returns true if f1 is chronologically later than f2 or the same fence. Both
+ * fences must be from the same context, since a seqno is not re-used across
+ * contexts.
+ */
+static inline bool dma_fence_is_later_or_same(struct dma_fence *f1,
+					      struct dma_fence *f2)
+{
+	return f1 == f2 || dma_fence_is_later(f1, f2);
+}
+
 /**
  * dma_fence_later - return the chronologically later fence
  * @f1:	the first fence from the same context
-- 
cgit v1.2.3


From 243ad8df7a1bd24c2e01bd99d9f0bb88844dae91 Mon Sep 17 00:00:00 2001
From: "Russell King (Oracle)" <rmk+kernel@armlinux.org.uk>
Date: Fri, 24 Nov 2023 12:27:52 +0000
Subject: net: phy: add possible interfaces

Add a possible_interfaces member to struct phy_device to indicate which
interfaces a clause 45 PHY may switch between depending on the media.
This must be populated by the PHY driver by the time the .config_init()
method completes according to the PHYs host-side configuration.

For example, the Marvell 88x3310 PHY can switch between 10GBASE-R,
5GBASE-R, 2500BASE-X, and SGMII on the host side depending on the media
side speed, so all these interface modes are set in the
possible_interfaces member.

This allows phylib users (such as phylink) to know in advance which
interface modes to expect, which allows them to appropriately restrict
the advertised link modes according to the capabilities of other parts
of the link.

Tested-by: Luo Jie <quic_luoj@quicinc.com>
Signed-off-by: Russell King (Oracle) <rmk+kernel@armlinux.org.uk>
Reviewed-by: Andrew Lunn <andrew@lunn.ch>
Link: https://lore.kernel.org/r/E1r6VHk-00DDLN-I7@rmk-PC.armlinux.org.uk
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 drivers/net/phy/phy_device.c | 2 ++
 include/linux/phy.h          | 3 +++
 2 files changed, 5 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/net/phy/phy_device.c b/drivers/net/phy/phy_device.c
index 478126f6b5bc..400fb09d9cd6 100644
--- a/drivers/net/phy/phy_device.c
+++ b/drivers/net/phy/phy_device.c
@@ -1246,6 +1246,8 @@ int phy_init_hw(struct phy_device *phydev)
 	if (ret < 0)
 		return ret;
 
+	phy_interface_zero(phydev->possible_interfaces);
+
 	if (phydev->drv->config_init) {
 		ret = phydev->drv->config_init(phydev);
 		if (ret < 0)
diff --git a/include/linux/phy.h b/include/linux/phy.h
index e5f1f41e399c..6e7ebcc50b85 100644
--- a/include/linux/phy.h
+++ b/include/linux/phy.h
@@ -605,6 +605,8 @@ struct macsec_ops;
  * @irq_rerun: Flag indicating interrupts occurred while PHY was suspended,
  *             requiring a rerun of the interrupt handler after resume
  * @interface: enum phy_interface_t value
+ * @possible_interfaces: bitmap if interface modes that the attached PHY
+ *			 will switch between depending on media speed.
  * @skb: Netlink message for cable diagnostics
  * @nest: Netlink nest used for cable diagnostics
  * @ehdr: nNtlink header for cable diagnostics
@@ -674,6 +676,7 @@ struct phy_device {
 	u32 dev_flags;
 
 	phy_interface_t interface;
+	DECLARE_PHY_INTERFACE_MASK(possible_interfaces);
 
 	/*
 	 * forced speed & duplex (no autoneg)
-- 
cgit v1.2.3


From 073d3d2ca7d462afc8159ca0175675b9b7b4f162 Mon Sep 17 00:00:00 2001
From: Viresh Kumar <viresh.kumar@linaro.org>
Date: Fri, 27 Oct 2023 12:40:04 +0530
Subject: OPP: Level zero is valid

The level zero can be used by some OPPs to drop performance state vote
for the device. It is perfectly fine to allow the same.

_set_opp_level() considers it as an invalid value currently and returns
early.

In order to support this properly, initialize the level field with
U32_MAX, which denotes unused level field.

Reported-by: Stephan Gerhold <stephan.gerhold@kernkonzept.com>
Reviewed-by: Ulf Hansson <ulf.hansson@linaro.org>
Tested-by: Stephan Gerhold <stephan.gerhold@kernkonzept.com>
Signed-off-by: Viresh Kumar <viresh.kumar@linaro.org>
---
 drivers/opp/core.c     | 24 ++++++++++++++++++++----
 drivers/opp/of.c       |  8 +++++++-
 include/linux/pm_opp.h |  5 ++++-
 3 files changed, 31 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/opp/core.c b/drivers/opp/core.c
index 84f345c69ea5..f2e2aa07b431 100644
--- a/drivers/opp/core.c
+++ b/drivers/opp/core.c
@@ -201,7 +201,7 @@ EXPORT_SYMBOL_GPL(dev_pm_opp_get_freq_indexed);
  * @opp:	opp for which level value has to be returned for
  *
  * Return: level read from device tree corresponding to the opp, else
- * return 0.
+ * return U32_MAX.
  */
 unsigned int dev_pm_opp_get_level(struct dev_pm_opp *opp)
 {
@@ -221,7 +221,7 @@ EXPORT_SYMBOL_GPL(dev_pm_opp_get_level);
  * @index:	index of the required opp
  *
  * Return: performance state read from device tree corresponding to the
- * required opp, else return 0.
+ * required opp, else return U32_MAX.
  */
 unsigned int dev_pm_opp_get_required_pstate(struct dev_pm_opp *opp,
 					    unsigned int index)
@@ -808,6 +808,14 @@ struct dev_pm_opp *dev_pm_opp_find_level_ceil(struct device *dev,
 	struct dev_pm_opp *opp;
 
 	opp = _find_key_ceil(dev, &temp, 0, true, _read_level, NULL);
+
+	/* False match */
+	if (temp == OPP_LEVEL_UNSET) {
+		dev_err(dev, "%s: OPP levels aren't available\n", __func__);
+		dev_pm_opp_put(opp);
+		return ERR_PTR(-ENODEV);
+	}
+
 	*level = temp;
 	return opp;
 }
@@ -1049,12 +1057,18 @@ static int _set_opp_bw(const struct opp_table *opp_table,
 static int _set_performance_state(struct device *dev, struct device *pd_dev,
 				  struct dev_pm_opp *opp, int i)
 {
-	unsigned int pstate = likely(opp) ? opp->required_opps[i]->level: 0;
+	unsigned int pstate = 0;
 	int ret;
 
 	if (!pd_dev)
 		return 0;
 
+	if (likely(opp)) {
+		pstate = opp->required_opps[i]->level;
+		if (pstate == OPP_LEVEL_UNSET)
+			return 0;
+	}
+
 	ret = dev_pm_domain_set_performance_state(pd_dev, pstate);
 	if (ret) {
 		dev_err(dev, "Failed to set performance state of %s: %d (%d)\n",
@@ -1135,7 +1149,7 @@ static int _set_opp_level(struct device *dev, struct opp_table *opp_table,
 	int ret = 0;
 
 	if (opp) {
-		if (!opp->level)
+		if (opp->level == OPP_LEVEL_UNSET)
 			return 0;
 
 		level = opp->level;
@@ -1867,6 +1881,8 @@ struct dev_pm_opp *_opp_allocate(struct opp_table *opp_table)
 
 	INIT_LIST_HEAD(&opp->node);
 
+	opp->level = OPP_LEVEL_UNSET;
+
 	return opp;
 }
 
diff --git a/drivers/opp/of.c b/drivers/opp/of.c
index 81fa27599d58..85fad7ca0007 100644
--- a/drivers/opp/of.c
+++ b/drivers/opp/of.c
@@ -1393,8 +1393,14 @@ int of_get_required_opp_performance_state(struct device_node *np, int index)
 
 	opp = _find_opp_of_np(opp_table, required_np);
 	if (opp) {
-		pstate = opp->level;
+		if (opp->level == OPP_LEVEL_UNSET) {
+			pr_err("%s: OPP levels aren't available for %pOF\n",
+			       __func__, np);
+		} else {
+			pstate = opp->level;
+		}
 		dev_pm_opp_put(opp);
+
 	}
 
 	dev_pm_opp_put_opp_table(opp_table);
diff --git a/include/linux/pm_opp.h b/include/linux/pm_opp.h
index ccd97bcef269..af53101a1383 100644
--- a/include/linux/pm_opp.h
+++ b/include/linux/pm_opp.h
@@ -92,9 +92,12 @@ struct dev_pm_opp_config {
 	struct device ***virt_devs;
 };
 
+#define OPP_LEVEL_UNSET			U32_MAX
+
 /**
  * struct dev_pm_opp_data - The data to use to initialize an OPP.
- * @level: The performance level for the OPP.
+ * @level: The performance level for the OPP. Set level to OPP_LEVEL_UNSET if
+ * level field isn't used.
  * @freq: The clock rate in Hz for the OPP.
  * @u_volt: The voltage in uV for the OPP.
  */
-- 
cgit v1.2.3


From e37440e7e2c2760475d60c5556b59c8880a7fd63 Mon Sep 17 00:00:00 2001
From: Viresh Kumar <viresh.kumar@linaro.org>
Date: Fri, 27 Oct 2023 14:17:48 +0530
Subject: OPP: Call dev_pm_opp_set_opp() for required OPPs

Configuring the required OPP was never properly implemented, we just
took an exception for genpds and configured them directly, while leaving
out all other required OPP types.

Now that a standard call to dev_pm_opp_set_opp() takes care of
configuring the opp->level too, the special handling for genpds can be
avoided by simply calling dev_pm_opp_set_opp() for the required OPPs,
which shall eventually configure the corresponding level for genpds.

This also makes it possible for us to configure other type of required
OPPs (no concrete users yet though), via the same path. This is how
other frameworks take care of parent nodes, like clock, regulators, etc,
where we recursively call the same helper.

In order to call dev_pm_opp_set_opp() for the virtual genpd devices,
they must share the OPP table of the genpd. Call _add_opp_dev() for them
to get that done.

This commit also extends the struct dev_pm_opp_config to pass required
devices, for non-genpd cases, which can be used to call
dev_pm_opp_set_opp() for the non-genpd required devices.

Reviewed-by: Ulf Hansson <ulf.hansson@linaro.org>
Tested-by: Stephan Gerhold <stephan.gerhold@kernkonzept.com>
Signed-off-by: Viresh Kumar <viresh.kumar@linaro.org>
---
 drivers/opp/core.c     | 168 ++++++++++++++++++++++++-------------------------
 drivers/opp/of.c       |  17 ++---
 drivers/opp/opp.h      |   8 +--
 include/linux/pm_opp.h |   7 ++-
 4 files changed, 100 insertions(+), 100 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/opp/core.c b/drivers/opp/core.c
index aeb216f7e978..e08375ed50aa 100644
--- a/drivers/opp/core.c
+++ b/drivers/opp/core.c
@@ -1054,48 +1054,22 @@ static int _set_opp_bw(const struct opp_table *opp_table,
 	return 0;
 }
 
-static int _set_performance_state(struct device *dev, struct device *pd_dev,
-				  struct dev_pm_opp *opp, int i)
-{
-	unsigned int pstate = 0;
-	int ret;
-
-	if (!pd_dev)
-		return 0;
-
-	if (likely(opp)) {
-		pstate = opp->required_opps[i]->level;
-		if (pstate == OPP_LEVEL_UNSET)
-			return 0;
-	}
-
-	ret = dev_pm_domain_set_performance_state(pd_dev, pstate);
-	if (ret) {
-		dev_err(dev, "Failed to set performance state of %s: %d (%d)\n",
-			dev_name(pd_dev), pstate, ret);
-	}
-
-	return ret;
-}
-
-static int _opp_set_required_opps_generic(struct device *dev,
-	struct opp_table *opp_table, struct dev_pm_opp *opp, bool scaling_down)
-{
-	dev_err(dev, "setting required-opps isn't supported for non-genpd devices\n");
-	return -ENOENT;
-}
-
-static int _opp_set_required_opps_genpd(struct device *dev,
-	struct opp_table *opp_table, struct dev_pm_opp *opp, bool scaling_down)
+/* This is only called for PM domain for now */
+static int _set_required_opps(struct device *dev, struct opp_table *opp_table,
+			      struct dev_pm_opp *opp, bool up)
 {
-	struct device **genpd_virt_devs = opp_table->genpd_virt_devs;
+	struct device **devs = opp_table->required_devs;
 	int index, target, delta, ret;
 
-	if (!genpd_virt_devs)
+	if (!devs)
 		return 0;
 
+	/* required-opps not fully initialized yet */
+	if (lazy_linking_pending(opp_table))
+		return -EBUSY;
+
 	/* Scaling up? Set required OPPs in normal order, else reverse */
-	if (!scaling_down) {
+	if (up) {
 		index = 0;
 		target = opp_table->required_opp_count;
 		delta = 1;
@@ -1106,9 +1080,11 @@ static int _opp_set_required_opps_genpd(struct device *dev,
 	}
 
 	while (index != target) {
-		ret = _set_performance_state(dev, genpd_virt_devs[index], opp, index);
-		if (ret)
-			return ret;
+		if (devs[index]) {
+			ret = dev_pm_opp_set_opp(devs[index], opp->required_opps[index]);
+			if (ret)
+				return ret;
+		}
 
 		index += delta;
 	}
@@ -1116,34 +1092,6 @@ static int _opp_set_required_opps_genpd(struct device *dev,
 	return 0;
 }
 
-/* This is only called for PM domain for now */
-static int _set_required_opps(struct device *dev, struct opp_table *opp_table,
-			      struct dev_pm_opp *opp, bool up)
-{
-	/* required-opps not fully initialized yet */
-	if (lazy_linking_pending(opp_table))
-		return -EBUSY;
-
-	if (opp_table->set_required_opps)
-		return opp_table->set_required_opps(dev, opp_table, opp, up);
-
-	return 0;
-}
-
-/* Update set_required_opps handler */
-void _update_set_required_opps(struct opp_table *opp_table)
-{
-	/* Already set */
-	if (opp_table->set_required_opps)
-		return;
-
-	/* All required OPPs will belong to genpd or none */
-	if (opp_table->required_opp_tables[0]->is_genpd)
-		opp_table->set_required_opps = _opp_set_required_opps_genpd;
-	else
-		opp_table->set_required_opps = _opp_set_required_opps_generic;
-}
-
 static int _set_opp_level(struct device *dev, struct opp_table *opp_table,
 			  struct dev_pm_opp *opp)
 {
@@ -2406,19 +2354,13 @@ static void _opp_detach_genpd(struct opp_table *opp_table)
 {
 	int index;
 
-	if (!opp_table->genpd_virt_devs)
-		return;
-
 	for (index = 0; index < opp_table->required_opp_count; index++) {
-		if (!opp_table->genpd_virt_devs[index])
+		if (!opp_table->required_devs[index])
 			continue;
 
-		dev_pm_domain_detach(opp_table->genpd_virt_devs[index], false);
-		opp_table->genpd_virt_devs[index] = NULL;
+		dev_pm_domain_detach(opp_table->required_devs[index], false);
+		opp_table->required_devs[index] = NULL;
 	}
-
-	kfree(opp_table->genpd_virt_devs);
-	opp_table->genpd_virt_devs = NULL;
 }
 
 /*
@@ -2445,14 +2387,14 @@ static int _opp_attach_genpd(struct opp_table *opp_table, struct device *dev,
 	int index = 0, ret = -EINVAL;
 	const char * const *name = names;
 
-	if (opp_table->genpd_virt_devs)
-		return 0;
+	if (!opp_table->required_devs) {
+		dev_err(dev, "Required OPPs not available, can't attach genpd\n");
+		return -EINVAL;
+	}
 
-	opp_table->genpd_virt_devs = kcalloc(opp_table->required_opp_count,
-					     sizeof(*opp_table->genpd_virt_devs),
-					     GFP_KERNEL);
-	if (!opp_table->genpd_virt_devs)
-		return -ENOMEM;
+	/* Checking only the first one is enough ? */
+	if (opp_table->required_devs[0])
+		return 0;
 
 	while (*name) {
 		if (index >= opp_table->required_opp_count) {
@@ -2468,13 +2410,25 @@ static int _opp_attach_genpd(struct opp_table *opp_table, struct device *dev,
 			goto err;
 		}
 
-		opp_table->genpd_virt_devs[index] = virt_dev;
+		/*
+		 * Add the virtual genpd device as a user of the OPP table, so
+		 * we can call dev_pm_opp_set_opp() on it directly.
+		 *
+		 * This will be automatically removed when the OPP table is
+		 * removed, don't need to handle that here.
+		 */
+		if (!_add_opp_dev(virt_dev, opp_table->required_opp_tables[index])) {
+			ret = -ENOMEM;
+			goto err;
+		}
+
+		opp_table->required_devs[index] = virt_dev;
 		index++;
 		name++;
 	}
 
 	if (virt_devs)
-		*virt_devs = opp_table->genpd_virt_devs;
+		*virt_devs = opp_table->required_devs;
 
 	return 0;
 
@@ -2484,10 +2438,42 @@ err:
 
 }
 
+static int _opp_set_required_devs(struct opp_table *opp_table,
+				  struct device *dev,
+				  struct device **required_devs)
+{
+	int i;
+
+	if (!opp_table->required_devs) {
+		dev_err(dev, "Required OPPs not available, can't set required devs\n");
+		return -EINVAL;
+	}
+
+	/* Another device that shares the OPP table has set the required devs ? */
+	if (opp_table->required_devs[0])
+		return 0;
+
+	for (i = 0; i < opp_table->required_opp_count; i++)
+		opp_table->required_devs[i] = required_devs[i];
+
+	return 0;
+}
+
+static void _opp_put_required_devs(struct opp_table *opp_table)
+{
+	int i;
+
+	for (i = 0; i < opp_table->required_opp_count; i++)
+		opp_table->required_devs[i] = NULL;
+}
+
 static void _opp_clear_config(struct opp_config_data *data)
 {
-	if (data->flags & OPP_CONFIG_GENPD)
+	if (data->flags & OPP_CONFIG_REQUIRED_DEVS)
+		_opp_put_required_devs(data->opp_table);
+	else if (data->flags & OPP_CONFIG_GENPD)
 		_opp_detach_genpd(data->opp_table);
+
 	if (data->flags & OPP_CONFIG_REGULATOR)
 		_opp_put_regulators(data->opp_table);
 	if (data->flags & OPP_CONFIG_SUPPORTED_HW)
@@ -2601,12 +2587,22 @@ int dev_pm_opp_set_config(struct device *dev, struct dev_pm_opp_config *config)
 
 	/* Attach genpds */
 	if (config->genpd_names) {
+		if (config->required_devs)
+			goto err;
+
 		ret = _opp_attach_genpd(opp_table, dev, config->genpd_names,
 					config->virt_devs);
 		if (ret)
 			goto err;
 
 		data->flags |= OPP_CONFIG_GENPD;
+	} else if (config->required_devs) {
+		ret = _opp_set_required_devs(opp_table, dev,
+					     config->required_devs);
+		if (ret)
+			goto err;
+
+		data->flags |= OPP_CONFIG_REQUIRED_DEVS;
 	}
 
 	ret = xa_alloc(&opp_configs, &id, data, XA_LIMIT(1, INT_MAX),
diff --git a/drivers/opp/of.c b/drivers/opp/of.c
index 4cdeeab5ceee..5a7e294e56b7 100644
--- a/drivers/opp/of.c
+++ b/drivers/opp/of.c
@@ -165,7 +165,7 @@ static void _opp_table_alloc_required_tables(struct opp_table *opp_table,
 	struct opp_table **required_opp_tables;
 	struct device_node *required_np, *np;
 	bool lazy = false;
-	int count, i;
+	int count, i, size;
 
 	/* Traversing the first OPP node is all we need */
 	np = of_get_next_available_child(opp_np, NULL);
@@ -179,12 +179,13 @@ static void _opp_table_alloc_required_tables(struct opp_table *opp_table,
 	if (count <= 0)
 		goto put_np;
 
-	required_opp_tables = kcalloc(count, sizeof(*required_opp_tables),
-				      GFP_KERNEL);
+	size = sizeof(*required_opp_tables) + sizeof(*opp_table->required_devs);
+	required_opp_tables = kcalloc(count, size, GFP_KERNEL);
 	if (!required_opp_tables)
 		goto put_np;
 
 	opp_table->required_opp_tables = required_opp_tables;
+	opp_table->required_devs = (void *)(required_opp_tables + count);
 	opp_table->required_opp_count = count;
 
 	for (i = 0; i < count; i++) {
@@ -208,8 +209,6 @@ static void _opp_table_alloc_required_tables(struct opp_table *opp_table,
 		mutex_lock(&opp_table_lock);
 		list_add(&opp_table->lazy, &lazy_opp_tables);
 		mutex_unlock(&opp_table_lock);
-	} else {
-		_update_set_required_opps(opp_table);
 	}
 
 	goto put_np;
@@ -332,9 +331,14 @@ static int _link_required_opps(struct dev_pm_opp *opp, struct opp_table *opp_tab
 	 *
 	 * Just update the `level` with the right value, which
 	 * dev_pm_opp_set_opp() will take care of in the normal path itself.
+	 *
+	 * There is another case though, where a genpd's OPP table has
+	 * required-opps set to a parent genpd. The OPP core expects the user to
+	 * set the respective required `struct device` pointer via
+	 * dev_pm_opp_set_config().
 	 */
 	if (required_table->is_genpd && opp_table->required_opp_count == 1 &&
-	    !opp_table->genpd_virt_devs) {
+	    !opp_table->required_devs[0]) {
 		if (!WARN_ON(opp->level != OPP_LEVEL_UNSET))
 			opp->level = opp->required_opps[0]->level;
 	}
@@ -447,7 +451,6 @@ static void lazy_link_required_opp_table(struct opp_table *new_table)
 
 		/* All required opp-tables found, remove from lazy list */
 		if (!lazy) {
-			_update_set_required_opps(opp_table);
 			list_del_init(&opp_table->lazy);
 
 			list_for_each_entry(opp, &opp_table->opp_list, node)
diff --git a/drivers/opp/opp.h b/drivers/opp/opp.h
index 08366f90f16b..23dcb2fbf8c3 100644
--- a/drivers/opp/opp.h
+++ b/drivers/opp/opp.h
@@ -35,6 +35,7 @@ extern struct list_head opp_tables;
 #define OPP_CONFIG_PROP_NAME		BIT(3)
 #define OPP_CONFIG_SUPPORTED_HW		BIT(4)
 #define OPP_CONFIG_GENPD		BIT(5)
+#define OPP_CONFIG_REQUIRED_DEVS	BIT(6)
 
 /**
  * struct opp_config_data - data for set config operations
@@ -160,9 +161,9 @@ enum opp_table_access {
  * @rate_clk_single: Currently configured frequency for single clk.
  * @current_opp: Currently configured OPP for the table.
  * @suspend_opp: Pointer to OPP to be used during device suspend.
- * @genpd_virt_devs: List of virtual devices for multiple genpd support.
  * @required_opp_tables: List of device OPP tables that are required by OPPs in
  *		this table.
+ * @required_devs: List of devices for required OPP tables.
  * @required_opp_count: Number of required devices.
  * @supported_hw: Array of version number to support.
  * @supported_hw_count: Number of elements in supported_hw array.
@@ -180,7 +181,6 @@ enum opp_table_access {
  * @path_count: Number of interconnect paths
  * @enabled: Set to true if the device's resources are enabled/configured.
  * @is_genpd: Marks if the OPP table belongs to a genpd.
- * @set_required_opps: Helper responsible to set required OPPs.
  * @dentry:	debugfs dentry pointer of the real device directory (not links).
  * @dentry_name: Name of the real dentry.
  *
@@ -211,8 +211,8 @@ struct opp_table {
 	struct dev_pm_opp *current_opp;
 	struct dev_pm_opp *suspend_opp;
 
-	struct device **genpd_virt_devs;
 	struct opp_table **required_opp_tables;
+	struct device **required_devs;
 	unsigned int required_opp_count;
 
 	unsigned int *supported_hw;
@@ -229,8 +229,6 @@ struct opp_table {
 	unsigned int path_count;
 	bool enabled;
 	bool is_genpd;
-	int (*set_required_opps)(struct device *dev,
-		struct opp_table *opp_table, struct dev_pm_opp *opp, bool scaling_down);
 
 #ifdef CONFIG_DEBUG_FS
 	struct dentry *dentry;
diff --git a/include/linux/pm_opp.h b/include/linux/pm_opp.h
index af53101a1383..81dff7facdc9 100644
--- a/include/linux/pm_opp.h
+++ b/include/linux/pm_opp.h
@@ -74,8 +74,10 @@ typedef int (*config_clks_t)(struct device *dev, struct opp_table *opp_table,
  * @supported_hw_count: Number of elements in the array.
  * @regulator_names: Array of pointers to the names of the regulator, NULL terminated.
  * @genpd_names: Null terminated array of pointers containing names of genpd to
- *		 attach.
- * @virt_devs: Pointer to return the array of virtual devices.
+ *		attach. Mutually exclusive with required_devs.
+ * @virt_devs: Pointer to return the array of genpd virtual devices. Mutually
+ *		exclusive with required_devs.
+ * @required_devs: Required OPP devices. Mutually exclusive with genpd_names/virt_devs.
  *
  * This structure contains platform specific OPP configurations for the device.
  */
@@ -90,6 +92,7 @@ struct dev_pm_opp_config {
 	const char * const *regulator_names;
 	const char * const *genpd_names;
 	struct device ***virt_devs;
+	struct device **required_devs;
 };
 
 #define OPP_LEVEL_UNSET			U32_MAX
-- 
cgit v1.2.3


From 3652117f854819a148ff0fbe4492587d3520b5e5 Mon Sep 17 00:00:00 2001
From: Christian Brauner <brauner@kernel.org>
Date: Wed, 22 Nov 2023 13:48:23 +0100
Subject: eventfd: simplify eventfd_signal()

Ever since the eventfd type was introduced back in 2007 in commit
e1ad7468c77d ("signal/timer/event: eventfd core") the eventfd_signal()
function only ever passed 1 as a value for @n. There's no point in
keeping that additional argument.

Link: https://lore.kernel.org/r/20231122-vfs-eventfd-signal-v2-2-bd549b14ce0c@kernel.org
Acked-by: Xu Yilun <yilun.xu@intel.com>
Acked-by: Andrew Donnellan <ajd@linux.ibm.com> # ocxl
Acked-by: Eric Farman <farman@linux.ibm.com>  # s390
Reviewed-by: Jan Kara <jack@suse.cz>
Reviewed-by: Jens Axboe <axboe@kernel.dk>
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 arch/x86/kvm/hyperv.c                     |  2 +-
 arch/x86/kvm/xen.c                        |  2 +-
 drivers/accel/habanalabs/common/device.c  |  2 +-
 drivers/fpga/dfl.c                        |  2 +-
 drivers/gpu/drm/drm_syncobj.c             |  6 +++---
 drivers/gpu/drm/i915/gvt/interrupt.c      |  2 +-
 drivers/infiniband/hw/mlx5/devx.c         |  2 +-
 drivers/misc/ocxl/file.c                  |  2 +-
 drivers/s390/cio/vfio_ccw_chp.c           |  2 +-
 drivers/s390/cio/vfio_ccw_drv.c           |  4 ++--
 drivers/s390/cio/vfio_ccw_ops.c           |  6 +++---
 drivers/s390/crypto/vfio_ap_ops.c         |  2 +-
 drivers/usb/gadget/function/f_fs.c        |  4 ++--
 drivers/vdpa/vdpa_user/vduse_dev.c        |  6 +++---
 drivers/vfio/fsl-mc/vfio_fsl_mc_intr.c    |  2 +-
 drivers/vfio/pci/vfio_pci_core.c          |  6 +++---
 drivers/vfio/pci/vfio_pci_intrs.c         | 12 ++++++------
 drivers/vfio/platform/vfio_platform_irq.c |  4 ++--
 drivers/vhost/vdpa.c                      |  4 ++--
 drivers/vhost/vhost.c                     | 10 +++++-----
 drivers/vhost/vhost.h                     |  2 +-
 drivers/virt/acrn/ioeventfd.c             |  2 +-
 drivers/xen/privcmd.c                     |  2 +-
 fs/aio.c                                  |  2 +-
 fs/eventfd.c                              | 11 ++++-------
 include/linux/eventfd.h                   |  4 ++--
 mm/memcontrol.c                           | 10 +++++-----
 mm/vmpressure.c                           |  2 +-
 samples/vfio-mdev/mtty.c                  |  4 ++--
 virt/kvm/eventfd.c                        |  4 ++--
 30 files changed, 61 insertions(+), 64 deletions(-)

(limited to 'include/linux')

diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c
index 238afd7335e4..4943f6b2bbee 100644
--- a/arch/x86/kvm/hyperv.c
+++ b/arch/x86/kvm/hyperv.c
@@ -2388,7 +2388,7 @@ static u16 kvm_hvcall_signal_event(struct kvm_vcpu *vcpu, struct kvm_hv_hcall *h
 	if (!eventfd)
 		return HV_STATUS_INVALID_PORT_ID;
 
-	eventfd_signal(eventfd, 1);
+	eventfd_signal(eventfd);
 	return HV_STATUS_SUCCESS;
 }
 
diff --git a/arch/x86/kvm/xen.c b/arch/x86/kvm/xen.c
index e53fad915a62..523bb6df5ac9 100644
--- a/arch/x86/kvm/xen.c
+++ b/arch/x86/kvm/xen.c
@@ -2088,7 +2088,7 @@ static bool kvm_xen_hcall_evtchn_send(struct kvm_vcpu *vcpu, u64 param, u64 *r)
 		if (ret < 0 && ret != -ENOTCONN)
 			return false;
 	} else {
-		eventfd_signal(evtchnfd->deliver.eventfd.ctx, 1);
+		eventfd_signal(evtchnfd->deliver.eventfd.ctx);
 	}
 
 	*r = 0;
diff --git a/drivers/accel/habanalabs/common/device.c b/drivers/accel/habanalabs/common/device.c
index 9711e8fc979d..3a89644f087c 100644
--- a/drivers/accel/habanalabs/common/device.c
+++ b/drivers/accel/habanalabs/common/device.c
@@ -2044,7 +2044,7 @@ static void hl_notifier_event_send(struct hl_notifier_event *notifier_event, u64
 	notifier_event->events_mask |= event_mask;
 
 	if (notifier_event->eventfd)
-		eventfd_signal(notifier_event->eventfd, 1);
+		eventfd_signal(notifier_event->eventfd);
 
 	mutex_unlock(&notifier_event->lock);
 }
diff --git a/drivers/fpga/dfl.c b/drivers/fpga/dfl.c
index dd7a783d53b5..e73f88050f08 100644
--- a/drivers/fpga/dfl.c
+++ b/drivers/fpga/dfl.c
@@ -1872,7 +1872,7 @@ static irqreturn_t dfl_irq_handler(int irq, void *arg)
 {
 	struct eventfd_ctx *trigger = arg;
 
-	eventfd_signal(trigger, 1);
+	eventfd_signal(trigger);
 	return IRQ_HANDLED;
 }
 
diff --git a/drivers/gpu/drm/drm_syncobj.c b/drivers/gpu/drm/drm_syncobj.c
index 01da6789d044..b9cc62982196 100644
--- a/drivers/gpu/drm/drm_syncobj.c
+++ b/drivers/gpu/drm/drm_syncobj.c
@@ -1365,7 +1365,7 @@ static void syncobj_eventfd_entry_fence_func(struct dma_fence *fence,
 	struct syncobj_eventfd_entry *entry =
 		container_of(cb, struct syncobj_eventfd_entry, fence_cb);
 
-	eventfd_signal(entry->ev_fd_ctx, 1);
+	eventfd_signal(entry->ev_fd_ctx);
 	syncobj_eventfd_entry_free(entry);
 }
 
@@ -1388,13 +1388,13 @@ syncobj_eventfd_entry_func(struct drm_syncobj *syncobj,
 	entry->fence = fence;
 
 	if (entry->flags & DRM_SYNCOBJ_WAIT_FLAGS_WAIT_AVAILABLE) {
-		eventfd_signal(entry->ev_fd_ctx, 1);
+		eventfd_signal(entry->ev_fd_ctx);
 		syncobj_eventfd_entry_free(entry);
 	} else {
 		ret = dma_fence_add_callback(fence, &entry->fence_cb,
 					     syncobj_eventfd_entry_fence_func);
 		if (ret == -ENOENT) {
-			eventfd_signal(entry->ev_fd_ctx, 1);
+			eventfd_signal(entry->ev_fd_ctx);
 			syncobj_eventfd_entry_free(entry);
 		}
 	}
diff --git a/drivers/gpu/drm/i915/gvt/interrupt.c b/drivers/gpu/drm/i915/gvt/interrupt.c
index b32ba5f2b240..c8e7dfc9f791 100644
--- a/drivers/gpu/drm/i915/gvt/interrupt.c
+++ b/drivers/gpu/drm/i915/gvt/interrupt.c
@@ -453,7 +453,7 @@ static void inject_virtual_interrupt(struct intel_vgpu *vgpu)
 	if (!test_bit(INTEL_VGPU_STATUS_ATTACHED, vgpu->status))
 		return;
 	if (vgpu->msi_trigger)
-		eventfd_signal(vgpu->msi_trigger, 1);
+		eventfd_signal(vgpu->msi_trigger);
 }
 
 static void propagate_event(struct intel_gvt_irq *irq,
diff --git a/drivers/infiniband/hw/mlx5/devx.c b/drivers/infiniband/hw/mlx5/devx.c
index 8ba53edf2311..869369cb5b5f 100644
--- a/drivers/infiniband/hw/mlx5/devx.c
+++ b/drivers/infiniband/hw/mlx5/devx.c
@@ -2498,7 +2498,7 @@ static void dispatch_event_fd(struct list_head *fd_list,
 
 	list_for_each_entry_rcu(item, fd_list, xa_list) {
 		if (item->eventfd)
-			eventfd_signal(item->eventfd, 1);
+			eventfd_signal(item->eventfd);
 		else
 			deliver_event(item, data);
 	}
diff --git a/drivers/misc/ocxl/file.c b/drivers/misc/ocxl/file.c
index ac69b7f361f5..7eb74711ac96 100644
--- a/drivers/misc/ocxl/file.c
+++ b/drivers/misc/ocxl/file.c
@@ -184,7 +184,7 @@ static irqreturn_t irq_handler(void *private)
 {
 	struct eventfd_ctx *ev_ctx = private;
 
-	eventfd_signal(ev_ctx, 1);
+	eventfd_signal(ev_ctx);
 	return IRQ_HANDLED;
 }
 
diff --git a/drivers/s390/cio/vfio_ccw_chp.c b/drivers/s390/cio/vfio_ccw_chp.c
index d3f3a611f95b..38c176cf6295 100644
--- a/drivers/s390/cio/vfio_ccw_chp.c
+++ b/drivers/s390/cio/vfio_ccw_chp.c
@@ -115,7 +115,7 @@ static ssize_t vfio_ccw_crw_region_read(struct vfio_ccw_private *private,
 
 	/* Notify the guest if more CRWs are on our queue */
 	if (!list_empty(&private->crw) && private->crw_trigger)
-		eventfd_signal(private->crw_trigger, 1);
+		eventfd_signal(private->crw_trigger);
 
 	return ret;
 }
diff --git a/drivers/s390/cio/vfio_ccw_drv.c b/drivers/s390/cio/vfio_ccw_drv.c
index 43601816ea4e..bfb35cfce1ef 100644
--- a/drivers/s390/cio/vfio_ccw_drv.c
+++ b/drivers/s390/cio/vfio_ccw_drv.c
@@ -112,7 +112,7 @@ void vfio_ccw_sch_io_todo(struct work_struct *work)
 		private->state = VFIO_CCW_STATE_IDLE;
 
 	if (private->io_trigger)
-		eventfd_signal(private->io_trigger, 1);
+		eventfd_signal(private->io_trigger);
 }
 
 void vfio_ccw_crw_todo(struct work_struct *work)
@@ -122,7 +122,7 @@ void vfio_ccw_crw_todo(struct work_struct *work)
 	private = container_of(work, struct vfio_ccw_private, crw_work);
 
 	if (!list_empty(&private->crw) && private->crw_trigger)
-		eventfd_signal(private->crw_trigger, 1);
+		eventfd_signal(private->crw_trigger);
 }
 
 /*
diff --git a/drivers/s390/cio/vfio_ccw_ops.c b/drivers/s390/cio/vfio_ccw_ops.c
index cba4971618ff..ea532a8a4a0c 100644
--- a/drivers/s390/cio/vfio_ccw_ops.c
+++ b/drivers/s390/cio/vfio_ccw_ops.c
@@ -421,7 +421,7 @@ static int vfio_ccw_mdev_set_irqs(struct vfio_ccw_private *private,
 	case VFIO_IRQ_SET_DATA_NONE:
 	{
 		if (*ctx)
-			eventfd_signal(*ctx, 1);
+			eventfd_signal(*ctx);
 		return 0;
 	}
 	case VFIO_IRQ_SET_DATA_BOOL:
@@ -432,7 +432,7 @@ static int vfio_ccw_mdev_set_irqs(struct vfio_ccw_private *private,
 			return -EFAULT;
 
 		if (trigger && *ctx)
-			eventfd_signal(*ctx, 1);
+			eventfd_signal(*ctx);
 		return 0;
 	}
 	case VFIO_IRQ_SET_DATA_EVENTFD:
@@ -612,7 +612,7 @@ static void vfio_ccw_mdev_request(struct vfio_device *vdev, unsigned int count)
 					       "Relaying device request to user (#%u)\n",
 					       count);
 
-		eventfd_signal(private->req_trigger, 1);
+		eventfd_signal(private->req_trigger);
 	} else if (count == 0) {
 		dev_notice(dev,
 			   "No device request channel registered, blocked until released by user\n");
diff --git a/drivers/s390/crypto/vfio_ap_ops.c b/drivers/s390/crypto/vfio_ap_ops.c
index 4db538a55192..542b5be73a6a 100644
--- a/drivers/s390/crypto/vfio_ap_ops.c
+++ b/drivers/s390/crypto/vfio_ap_ops.c
@@ -1794,7 +1794,7 @@ static void vfio_ap_mdev_request(struct vfio_device *vdev, unsigned int count)
 					       "Relaying device request to user (#%u)\n",
 					       count);
 
-		eventfd_signal(matrix_mdev->req_trigger, 1);
+		eventfd_signal(matrix_mdev->req_trigger);
 	} else if (count == 0) {
 		dev_notice(dev,
 			   "No device request registered, blocked until released by user\n");
diff --git a/drivers/usb/gadget/function/f_fs.c b/drivers/usb/gadget/function/f_fs.c
index efe3e3b85769..fdd0fc7b8f25 100644
--- a/drivers/usb/gadget/function/f_fs.c
+++ b/drivers/usb/gadget/function/f_fs.c
@@ -831,7 +831,7 @@ static void ffs_user_copy_worker(struct work_struct *work)
 	io_data->kiocb->ki_complete(io_data->kiocb, ret);
 
 	if (io_data->ffs->ffs_eventfd && !kiocb_has_eventfd)
-		eventfd_signal(io_data->ffs->ffs_eventfd, 1);
+		eventfd_signal(io_data->ffs->ffs_eventfd);
 
 	if (io_data->read)
 		kfree(io_data->to_free);
@@ -2738,7 +2738,7 @@ static void __ffs_event_add(struct ffs_data *ffs,
 	ffs->ev.types[ffs->ev.count++] = type;
 	wake_up_locked(&ffs->ev.waitq);
 	if (ffs->ffs_eventfd)
-		eventfd_signal(ffs->ffs_eventfd, 1);
+		eventfd_signal(ffs->ffs_eventfd);
 }
 
 static void ffs_event_add(struct ffs_data *ffs,
diff --git a/drivers/vdpa/vdpa_user/vduse_dev.c b/drivers/vdpa/vdpa_user/vduse_dev.c
index 0ddd4b8abecb..6cb5ce4a8b9a 100644
--- a/drivers/vdpa/vdpa_user/vduse_dev.c
+++ b/drivers/vdpa/vdpa_user/vduse_dev.c
@@ -493,7 +493,7 @@ static void vduse_vq_kick(struct vduse_virtqueue *vq)
 		goto unlock;
 
 	if (vq->kickfd)
-		eventfd_signal(vq->kickfd, 1);
+		eventfd_signal(vq->kickfd);
 	else
 		vq->kicked = true;
 unlock:
@@ -911,7 +911,7 @@ static int vduse_kickfd_setup(struct vduse_dev *dev,
 		eventfd_ctx_put(vq->kickfd);
 	vq->kickfd = ctx;
 	if (vq->ready && vq->kicked && vq->kickfd) {
-		eventfd_signal(vq->kickfd, 1);
+		eventfd_signal(vq->kickfd);
 		vq->kicked = false;
 	}
 	spin_unlock(&vq->kick_lock);
@@ -960,7 +960,7 @@ static bool vduse_vq_signal_irqfd(struct vduse_virtqueue *vq)
 
 	spin_lock_irq(&vq->irq_lock);
 	if (vq->ready && vq->cb.trigger) {
-		eventfd_signal(vq->cb.trigger, 1);
+		eventfd_signal(vq->cb.trigger);
 		signal = true;
 	}
 	spin_unlock_irq(&vq->irq_lock);
diff --git a/drivers/vfio/fsl-mc/vfio_fsl_mc_intr.c b/drivers/vfio/fsl-mc/vfio_fsl_mc_intr.c
index c51229fccbd6..d62fbfff20b8 100644
--- a/drivers/vfio/fsl-mc/vfio_fsl_mc_intr.c
+++ b/drivers/vfio/fsl-mc/vfio_fsl_mc_intr.c
@@ -54,7 +54,7 @@ static irqreturn_t vfio_fsl_mc_irq_handler(int irq_num, void *arg)
 {
 	struct vfio_fsl_mc_irq *mc_irq = (struct vfio_fsl_mc_irq *)arg;
 
-	eventfd_signal(mc_irq->trigger, 1);
+	eventfd_signal(mc_irq->trigger);
 	return IRQ_HANDLED;
 }
 
diff --git a/drivers/vfio/pci/vfio_pci_core.c b/drivers/vfio/pci/vfio_pci_core.c
index 1929103ee59a..1cbc990d42e0 100644
--- a/drivers/vfio/pci/vfio_pci_core.c
+++ b/drivers/vfio/pci/vfio_pci_core.c
@@ -443,7 +443,7 @@ static int vfio_pci_core_runtime_resume(struct device *dev)
 	 */
 	down_write(&vdev->memory_lock);
 	if (vdev->pm_wake_eventfd_ctx) {
-		eventfd_signal(vdev->pm_wake_eventfd_ctx, 1);
+		eventfd_signal(vdev->pm_wake_eventfd_ctx);
 		__vfio_pci_runtime_pm_exit(vdev);
 	}
 	up_write(&vdev->memory_lock);
@@ -1883,7 +1883,7 @@ void vfio_pci_core_request(struct vfio_device *core_vdev, unsigned int count)
 			pci_notice_ratelimited(pdev,
 				"Relaying device request to user (#%u)\n",
 				count);
-		eventfd_signal(vdev->req_trigger, 1);
+		eventfd_signal(vdev->req_trigger);
 	} else if (count == 0) {
 		pci_warn(pdev,
 			"No device request channel registered, blocked until released by user\n");
@@ -2302,7 +2302,7 @@ pci_ers_result_t vfio_pci_core_aer_err_detected(struct pci_dev *pdev,
 	mutex_lock(&vdev->igate);
 
 	if (vdev->err_trigger)
-		eventfd_signal(vdev->err_trigger, 1);
+		eventfd_signal(vdev->err_trigger);
 
 	mutex_unlock(&vdev->igate);
 
diff --git a/drivers/vfio/pci/vfio_pci_intrs.c b/drivers/vfio/pci/vfio_pci_intrs.c
index cbb4bcbfbf83..237beac83809 100644
--- a/drivers/vfio/pci/vfio_pci_intrs.c
+++ b/drivers/vfio/pci/vfio_pci_intrs.c
@@ -94,7 +94,7 @@ static void vfio_send_intx_eventfd(void *opaque, void *unused)
 		ctx = vfio_irq_ctx_get(vdev, 0);
 		if (WARN_ON_ONCE(!ctx))
 			return;
-		eventfd_signal(ctx->trigger, 1);
+		eventfd_signal(ctx->trigger);
 	}
 }
 
@@ -342,7 +342,7 @@ static irqreturn_t vfio_msihandler(int irq, void *arg)
 {
 	struct eventfd_ctx *trigger = arg;
 
-	eventfd_signal(trigger, 1);
+	eventfd_signal(trigger);
 	return IRQ_HANDLED;
 }
 
@@ -689,11 +689,11 @@ static int vfio_pci_set_msi_trigger(struct vfio_pci_core_device *vdev,
 		if (!ctx)
 			continue;
 		if (flags & VFIO_IRQ_SET_DATA_NONE) {
-			eventfd_signal(ctx->trigger, 1);
+			eventfd_signal(ctx->trigger);
 		} else if (flags & VFIO_IRQ_SET_DATA_BOOL) {
 			uint8_t *bools = data;
 			if (bools[i - start])
-				eventfd_signal(ctx->trigger, 1);
+				eventfd_signal(ctx->trigger);
 		}
 	}
 	return 0;
@@ -707,7 +707,7 @@ static int vfio_pci_set_ctx_trigger_single(struct eventfd_ctx **ctx,
 	if (flags & VFIO_IRQ_SET_DATA_NONE) {
 		if (*ctx) {
 			if (count) {
-				eventfd_signal(*ctx, 1);
+				eventfd_signal(*ctx);
 			} else {
 				eventfd_ctx_put(*ctx);
 				*ctx = NULL;
@@ -722,7 +722,7 @@ static int vfio_pci_set_ctx_trigger_single(struct eventfd_ctx **ctx,
 
 		trigger = *(uint8_t *)data;
 		if (trigger && *ctx)
-			eventfd_signal(*ctx, 1);
+			eventfd_signal(*ctx);
 
 		return 0;
 	} else if (flags & VFIO_IRQ_SET_DATA_EVENTFD) {
diff --git a/drivers/vfio/platform/vfio_platform_irq.c b/drivers/vfio/platform/vfio_platform_irq.c
index 665197caed89..61a1bfb68ac7 100644
--- a/drivers/vfio/platform/vfio_platform_irq.c
+++ b/drivers/vfio/platform/vfio_platform_irq.c
@@ -155,7 +155,7 @@ static irqreturn_t vfio_automasked_irq_handler(int irq, void *dev_id)
 	spin_unlock_irqrestore(&irq_ctx->lock, flags);
 
 	if (ret == IRQ_HANDLED)
-		eventfd_signal(irq_ctx->trigger, 1);
+		eventfd_signal(irq_ctx->trigger);
 
 	return ret;
 }
@@ -164,7 +164,7 @@ static irqreturn_t vfio_irq_handler(int irq, void *dev_id)
 {
 	struct vfio_platform_irq *irq_ctx = dev_id;
 
-	eventfd_signal(irq_ctx->trigger, 1);
+	eventfd_signal(irq_ctx->trigger);
 
 	return IRQ_HANDLED;
 }
diff --git a/drivers/vhost/vdpa.c b/drivers/vhost/vdpa.c
index 30df5c58db73..8d9f958946d6 100644
--- a/drivers/vhost/vdpa.c
+++ b/drivers/vhost/vdpa.c
@@ -178,7 +178,7 @@ static irqreturn_t vhost_vdpa_virtqueue_cb(void *private)
 	struct eventfd_ctx *call_ctx = vq->call_ctx.ctx;
 
 	if (call_ctx)
-		eventfd_signal(call_ctx, 1);
+		eventfd_signal(call_ctx);
 
 	return IRQ_HANDLED;
 }
@@ -189,7 +189,7 @@ static irqreturn_t vhost_vdpa_config_cb(void *private)
 	struct eventfd_ctx *config_ctx = v->config_ctx;
 
 	if (config_ctx)
-		eventfd_signal(config_ctx, 1);
+		eventfd_signal(config_ctx);
 
 	return IRQ_HANDLED;
 }
diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
index e0c181ad17e3..045f666b4f12 100644
--- a/drivers/vhost/vhost.c
+++ b/drivers/vhost/vhost.c
@@ -2248,7 +2248,7 @@ int vhost_log_write(struct vhost_virtqueue *vq, struct vhost_log *log,
 		len -= l;
 		if (!len) {
 			if (vq->log_ctx)
-				eventfd_signal(vq->log_ctx, 1);
+				eventfd_signal(vq->log_ctx);
 			return 0;
 		}
 	}
@@ -2271,7 +2271,7 @@ static int vhost_update_used_flags(struct vhost_virtqueue *vq)
 		log_used(vq, (used - (void __user *)vq->used),
 			 sizeof vq->used->flags);
 		if (vq->log_ctx)
-			eventfd_signal(vq->log_ctx, 1);
+			eventfd_signal(vq->log_ctx);
 	}
 	return 0;
 }
@@ -2289,7 +2289,7 @@ static int vhost_update_avail_event(struct vhost_virtqueue *vq)
 		log_used(vq, (used - (void __user *)vq->used),
 			 sizeof *vhost_avail_event(vq));
 		if (vq->log_ctx)
-			eventfd_signal(vq->log_ctx, 1);
+			eventfd_signal(vq->log_ctx);
 	}
 	return 0;
 }
@@ -2715,7 +2715,7 @@ int vhost_add_used_n(struct vhost_virtqueue *vq, struct vring_used_elem *heads,
 		log_used(vq, offsetof(struct vring_used, idx),
 			 sizeof vq->used->idx);
 		if (vq->log_ctx)
-			eventfd_signal(vq->log_ctx, 1);
+			eventfd_signal(vq->log_ctx);
 	}
 	return r;
 }
@@ -2763,7 +2763,7 @@ void vhost_signal(struct vhost_dev *dev, struct vhost_virtqueue *vq)
 {
 	/* Signal the Guest tell them we used something up. */
 	if (vq->call_ctx.ctx && vhost_notify(dev, vq))
-		eventfd_signal(vq->call_ctx.ctx, 1);
+		eventfd_signal(vq->call_ctx.ctx);
 }
 EXPORT_SYMBOL_GPL(vhost_signal);
 
diff --git a/drivers/vhost/vhost.h b/drivers/vhost/vhost.h
index f60d5f7bef94..9e942fcda5c3 100644
--- a/drivers/vhost/vhost.h
+++ b/drivers/vhost/vhost.h
@@ -249,7 +249,7 @@ void vhost_iotlb_map_free(struct vhost_iotlb *iotlb,
 #define vq_err(vq, fmt, ...) do {                                  \
 		pr_debug(pr_fmt(fmt), ##__VA_ARGS__);       \
 		if ((vq)->error_ctx)                               \
-				eventfd_signal((vq)->error_ctx, 1);\
+				eventfd_signal((vq)->error_ctx);\
 	} while (0)
 
 enum {
diff --git a/drivers/virt/acrn/ioeventfd.c b/drivers/virt/acrn/ioeventfd.c
index ac4037e9f947..4e845c6ca0b5 100644
--- a/drivers/virt/acrn/ioeventfd.c
+++ b/drivers/virt/acrn/ioeventfd.c
@@ -223,7 +223,7 @@ static int acrn_ioeventfd_handler(struct acrn_ioreq_client *client,
 	mutex_lock(&client->vm->ioeventfds_lock);
 	p = hsm_ioeventfd_match(client->vm, addr, val, size, req->type);
 	if (p)
-		eventfd_signal(p->eventfd, 1);
+		eventfd_signal(p->eventfd);
 	mutex_unlock(&client->vm->ioeventfds_lock);
 
 	return 0;
diff --git a/drivers/xen/privcmd.c b/drivers/xen/privcmd.c
index 1ce7f3c7a950..7efc0c62e984 100644
--- a/drivers/xen/privcmd.c
+++ b/drivers/xen/privcmd.c
@@ -1147,7 +1147,7 @@ static irqreturn_t ioeventfd_interrupt(int irq, void *dev_id)
 		if (ioreq->addr == kioeventfd->addr + VIRTIO_MMIO_QUEUE_NOTIFY &&
 		    ioreq->size == kioeventfd->addr_len &&
 		    (ioreq->data & QUEUE_NOTIFY_VQ_MASK) == kioeventfd->vq) {
-			eventfd_signal(kioeventfd->eventfd, 1);
+			eventfd_signal(kioeventfd->eventfd);
 			state = STATE_IORESP_READY;
 			break;
 		}
diff --git a/fs/aio.c b/fs/aio.c
index d02842156b35..7c691cf84cc7 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -1166,7 +1166,7 @@ static void aio_complete(struct aio_kiocb *iocb)
 	 * from IRQ context.
 	 */
 	if (iocb->ki_eventfd)
-		eventfd_signal(iocb->ki_eventfd, 1);
+		eventfd_signal(iocb->ki_eventfd);
 
 	/*
 	 * We have to order our ring_info tail store above and test
diff --git a/fs/eventfd.c b/fs/eventfd.c
index 33a918f9566c..d2f7d2d8a351 100644
--- a/fs/eventfd.c
+++ b/fs/eventfd.c
@@ -72,22 +72,19 @@ __u64 eventfd_signal_mask(struct eventfd_ctx *ctx, __u64 n, __poll_t mask)
 }
 
 /**
- * eventfd_signal - Adds @n to the eventfd counter.
+ * eventfd_signal - Increment the event counter
  * @ctx: [in] Pointer to the eventfd context.
- * @n: [in] Value of the counter to be added to the eventfd internal counter.
- *          The value cannot be negative.
  *
  * This function is supposed to be called by the kernel in paths that do not
  * allow sleeping. In this function we allow the counter to reach the ULLONG_MAX
  * value, and we signal this as overflow condition by returning a EPOLLERR
  * to poll(2).
  *
- * Returns the amount by which the counter was incremented.  This will be less
- * than @n if the counter has overflowed.
+ * Returns the amount by which the counter was incremented.
  */
-__u64 eventfd_signal(struct eventfd_ctx *ctx, __u64 n)
+__u64 eventfd_signal(struct eventfd_ctx *ctx)
 {
-	return eventfd_signal_mask(ctx, n, 0);
+	return eventfd_signal_mask(ctx, 1, 0);
 }
 EXPORT_SYMBOL_GPL(eventfd_signal);
 
diff --git a/include/linux/eventfd.h b/include/linux/eventfd.h
index b9d83652c097..562089431551 100644
--- a/include/linux/eventfd.h
+++ b/include/linux/eventfd.h
@@ -35,7 +35,7 @@ void eventfd_ctx_put(struct eventfd_ctx *ctx);
 struct file *eventfd_fget(int fd);
 struct eventfd_ctx *eventfd_ctx_fdget(int fd);
 struct eventfd_ctx *eventfd_ctx_fileget(struct file *file);
-__u64 eventfd_signal(struct eventfd_ctx *ctx, __u64 n);
+__u64 eventfd_signal(struct eventfd_ctx *ctx);
 __u64 eventfd_signal_mask(struct eventfd_ctx *ctx, __u64 n, __poll_t mask);
 int eventfd_ctx_remove_wait_queue(struct eventfd_ctx *ctx, wait_queue_entry_t *wait,
 				  __u64 *cnt);
@@ -58,7 +58,7 @@ static inline struct eventfd_ctx *eventfd_ctx_fdget(int fd)
 	return ERR_PTR(-ENOSYS);
 }
 
-static inline int eventfd_signal(struct eventfd_ctx *ctx, __u64 n)
+static inline int eventfd_signal(struct eventfd_ctx *ctx)
 {
 	return -ENOSYS;
 }
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 774bd6e21e27..dfbb1d3b77b7 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -4378,7 +4378,7 @@ static void __mem_cgroup_threshold(struct mem_cgroup *memcg, bool swap)
 	 * only one element of the array here.
 	 */
 	for (; i >= 0 && unlikely(t->entries[i].threshold > usage); i--)
-		eventfd_signal(t->entries[i].eventfd, 1);
+		eventfd_signal(t->entries[i].eventfd);
 
 	/* i = current_threshold + 1 */
 	i++;
@@ -4390,7 +4390,7 @@ static void __mem_cgroup_threshold(struct mem_cgroup *memcg, bool swap)
 	 * only one element of the array here.
 	 */
 	for (; i < t->size && unlikely(t->entries[i].threshold <= usage); i++)
-		eventfd_signal(t->entries[i].eventfd, 1);
+		eventfd_signal(t->entries[i].eventfd);
 
 	/* Update current_threshold */
 	t->current_threshold = i - 1;
@@ -4430,7 +4430,7 @@ static int mem_cgroup_oom_notify_cb(struct mem_cgroup *memcg)
 	spin_lock(&memcg_oom_lock);
 
 	list_for_each_entry(ev, &memcg->oom_notify, list)
-		eventfd_signal(ev->eventfd, 1);
+		eventfd_signal(ev->eventfd);
 
 	spin_unlock(&memcg_oom_lock);
 	return 0;
@@ -4649,7 +4649,7 @@ static int mem_cgroup_oom_register_event(struct mem_cgroup *memcg,
 
 	/* already in OOM ? */
 	if (memcg->under_oom)
-		eventfd_signal(eventfd, 1);
+		eventfd_signal(eventfd);
 	spin_unlock(&memcg_oom_lock);
 
 	return 0;
@@ -4941,7 +4941,7 @@ static void memcg_event_remove(struct work_struct *work)
 	event->unregister_event(memcg, event->eventfd);
 
 	/* Notify userspace the event is going away. */
-	eventfd_signal(event->eventfd, 1);
+	eventfd_signal(event->eventfd);
 
 	eventfd_ctx_put(event->eventfd);
 	kfree(event);
diff --git a/mm/vmpressure.c b/mm/vmpressure.c
index 22c6689d9302..bd5183dfd879 100644
--- a/mm/vmpressure.c
+++ b/mm/vmpressure.c
@@ -169,7 +169,7 @@ static bool vmpressure_event(struct vmpressure *vmpr,
 			continue;
 		if (level < ev->level)
 			continue;
-		eventfd_signal(ev->efd, 1);
+		eventfd_signal(ev->efd);
 		ret = true;
 	}
 	mutex_unlock(&vmpr->events_lock);
diff --git a/samples/vfio-mdev/mtty.c b/samples/vfio-mdev/mtty.c
index 69ba0281f9e0..2284b3751240 100644
--- a/samples/vfio-mdev/mtty.c
+++ b/samples/vfio-mdev/mtty.c
@@ -234,10 +234,10 @@ static void mtty_trigger_interrupt(struct mdev_state *mdev_state)
 
 	if (is_msi(mdev_state)) {
 		if (mdev_state->msi_evtfd)
-			eventfd_signal(mdev_state->msi_evtfd, 1);
+			eventfd_signal(mdev_state->msi_evtfd);
 	} else if (is_intx(mdev_state)) {
 		if (mdev_state->intx_evtfd && !mdev_state->intx_mask) {
-			eventfd_signal(mdev_state->intx_evtfd, 1);
+			eventfd_signal(mdev_state->intx_evtfd);
 			mdev_state->intx_mask = true;
 		}
 	}
diff --git a/virt/kvm/eventfd.c b/virt/kvm/eventfd.c
index 89912a17f5d5..c0e230f4c3e9 100644
--- a/virt/kvm/eventfd.c
+++ b/virt/kvm/eventfd.c
@@ -61,7 +61,7 @@ static void irqfd_resampler_notify(struct kvm_kernel_irqfd_resampler *resampler)
 
 	list_for_each_entry_srcu(irqfd, &resampler->list, resampler_link,
 				 srcu_read_lock_held(&resampler->kvm->irq_srcu))
-		eventfd_signal(irqfd->resamplefd, 1);
+		eventfd_signal(irqfd->resamplefd);
 }
 
 /*
@@ -786,7 +786,7 @@ ioeventfd_write(struct kvm_vcpu *vcpu, struct kvm_io_device *this, gpa_t addr,
 	if (!ioeventfd_in_range(p, addr, len, val))
 		return -EOPNOTSUPP;
 
-	eventfd_signal(p->eventfd, 1);
+	eventfd_signal(p->eventfd);
 	return 0;
 }
 
-- 
cgit v1.2.3


From 120ae58593630819209a011a3f9c89f73bcc9894 Mon Sep 17 00:00:00 2001
From: Christian Brauner <brauner@kernel.org>
Date: Wed, 22 Nov 2023 13:48:24 +0100
Subject: eventfd: simplify eventfd_signal_mask()

The eventfd_signal_mask() helper was introduced for io_uring and similar
to eventfd_signal() it always passed 1 for @n. So don't bother with that
argument at all.

Link: https://lore.kernel.org/r/20231122-vfs-eventfd-signal-v2-3-bd549b14ce0c@kernel.org
Reviewed-by: Jan Kara <jack@suse.cz>
Reviewed-by: Jens Axboe <axboe@kernel.dk>
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 fs/eventfd.c            | 7 ++++---
 include/linux/eventfd.h | 5 ++---
 io_uring/io_uring.c     | 4 ++--
 3 files changed, 8 insertions(+), 8 deletions(-)

(limited to 'include/linux')

diff --git a/fs/eventfd.c b/fs/eventfd.c
index d2f7d2d8a351..41109ba6bbe0 100644
--- a/fs/eventfd.c
+++ b/fs/eventfd.c
@@ -43,9 +43,10 @@ struct eventfd_ctx {
 	int id;
 };
 
-__u64 eventfd_signal_mask(struct eventfd_ctx *ctx, __u64 n, __poll_t mask)
+__u64 eventfd_signal_mask(struct eventfd_ctx *ctx, __poll_t mask)
 {
 	unsigned long flags;
+	__u64 n = 1;
 
 	/*
 	 * Deadlock or stack overflow issues can happen if we recurse here
@@ -68,7 +69,7 @@ __u64 eventfd_signal_mask(struct eventfd_ctx *ctx, __u64 n, __poll_t mask)
 	current->in_eventfd = 0;
 	spin_unlock_irqrestore(&ctx->wqh.lock, flags);
 
-	return n;
+	return n == 1;
 }
 
 /**
@@ -84,7 +85,7 @@ __u64 eventfd_signal_mask(struct eventfd_ctx *ctx, __u64 n, __poll_t mask)
  */
 __u64 eventfd_signal(struct eventfd_ctx *ctx)
 {
-	return eventfd_signal_mask(ctx, 1, 0);
+	return eventfd_signal_mask(ctx, 0);
 }
 EXPORT_SYMBOL_GPL(eventfd_signal);
 
diff --git a/include/linux/eventfd.h b/include/linux/eventfd.h
index 562089431551..971943ecb2a6 100644
--- a/include/linux/eventfd.h
+++ b/include/linux/eventfd.h
@@ -36,7 +36,7 @@ struct file *eventfd_fget(int fd);
 struct eventfd_ctx *eventfd_ctx_fdget(int fd);
 struct eventfd_ctx *eventfd_ctx_fileget(struct file *file);
 __u64 eventfd_signal(struct eventfd_ctx *ctx);
-__u64 eventfd_signal_mask(struct eventfd_ctx *ctx, __u64 n, __poll_t mask);
+__u64 eventfd_signal_mask(struct eventfd_ctx *ctx, __poll_t mask);
 int eventfd_ctx_remove_wait_queue(struct eventfd_ctx *ctx, wait_queue_entry_t *wait,
 				  __u64 *cnt);
 void eventfd_ctx_do_read(struct eventfd_ctx *ctx, __u64 *cnt);
@@ -63,8 +63,7 @@ static inline int eventfd_signal(struct eventfd_ctx *ctx)
 	return -ENOSYS;
 }
 
-static inline int eventfd_signal_mask(struct eventfd_ctx *ctx, __u64 n,
-				      unsigned mask)
+static inline int eventfd_signal_mask(struct eventfd_ctx *ctx, __poll_t mask)
 {
 	return -ENOSYS;
 }
diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index ed254076c723..70170a41eac4 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -558,7 +558,7 @@ static void io_eventfd_ops(struct rcu_head *rcu)
 	int ops = atomic_xchg(&ev_fd->ops, 0);
 
 	if (ops & BIT(IO_EVENTFD_OP_SIGNAL_BIT))
-		eventfd_signal_mask(ev_fd->cq_ev_fd, 1, EPOLL_URING_WAKE);
+		eventfd_signal_mask(ev_fd->cq_ev_fd, EPOLL_URING_WAKE);
 
 	/* IO_EVENTFD_OP_FREE_BIT may not be set here depending on callback
 	 * ordering in a race but if references are 0 we know we have to free
@@ -594,7 +594,7 @@ static void io_eventfd_signal(struct io_ring_ctx *ctx)
 		goto out;
 
 	if (likely(eventfd_signal_allowed())) {
-		eventfd_signal_mask(ev_fd->cq_ev_fd, 1, EPOLL_URING_WAKE);
+		eventfd_signal_mask(ev_fd->cq_ev_fd, EPOLL_URING_WAKE);
 	} else {
 		atomic_inc(&ev_fd->refs);
 		if (!atomic_fetch_or(BIT(IO_EVENTFD_OP_SIGNAL_BIT), &ev_fd->ops))
-- 
cgit v1.2.3


From b7638ad0c7802ea854599ce753d0e6d20690f7e2 Mon Sep 17 00:00:00 2001
From: Christian Brauner <brauner@kernel.org>
Date: Wed, 22 Nov 2023 13:48:25 +0100
Subject: eventfd: make eventfd_signal{_mask}() void

No caller care about the return value.

Link: https://lore.kernel.org/r/20231122-vfs-eventfd-signal-v2-4-bd549b14ce0c@kernel.org
Reviewed-by: Jan Kara <jack@suse.cz>
Reviewed-by: Jens Axboe <axboe@kernel.dk>
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 fs/eventfd.c            | 40 +++++++++++++++-------------------------
 include/linux/eventfd.h | 16 +++++++---------
 2 files changed, 22 insertions(+), 34 deletions(-)

(limited to 'include/linux')

diff --git a/fs/eventfd.c b/fs/eventfd.c
index 41109ba6bbe0..16bea05a7c78 100644
--- a/fs/eventfd.c
+++ b/fs/eventfd.c
@@ -43,10 +43,19 @@ struct eventfd_ctx {
 	int id;
 };
 
-__u64 eventfd_signal_mask(struct eventfd_ctx *ctx, __poll_t mask)
+/**
+ * eventfd_signal_mask - Increment the event counter
+ * @ctx: [in] Pointer to the eventfd context.
+ * @mask: [in] poll mask
+ *
+ * This function is supposed to be called by the kernel in paths that do not
+ * allow sleeping. In this function we allow the counter to reach the ULLONG_MAX
+ * value, and we signal this as overflow condition by returning a EPOLLERR
+ * to poll(2).
+ */
+void eventfd_signal_mask(struct eventfd_ctx *ctx, __poll_t mask)
 {
 	unsigned long flags;
-	__u64 n = 1;
 
 	/*
 	 * Deadlock or stack overflow issues can happen if we recurse here
@@ -57,37 +66,18 @@ __u64 eventfd_signal_mask(struct eventfd_ctx *ctx, __poll_t mask)
 	 * safe context.
 	 */
 	if (WARN_ON_ONCE(current->in_eventfd))
-		return 0;
+		return;
 
 	spin_lock_irqsave(&ctx->wqh.lock, flags);
 	current->in_eventfd = 1;
-	if (ULLONG_MAX - ctx->count < n)
-		n = ULLONG_MAX - ctx->count;
-	ctx->count += n;
+	if (ctx->count < ULLONG_MAX)
+		ctx->count++;
 	if (waitqueue_active(&ctx->wqh))
 		wake_up_locked_poll(&ctx->wqh, EPOLLIN | mask);
 	current->in_eventfd = 0;
 	spin_unlock_irqrestore(&ctx->wqh.lock, flags);
-
-	return n == 1;
-}
-
-/**
- * eventfd_signal - Increment the event counter
- * @ctx: [in] Pointer to the eventfd context.
- *
- * This function is supposed to be called by the kernel in paths that do not
- * allow sleeping. In this function we allow the counter to reach the ULLONG_MAX
- * value, and we signal this as overflow condition by returning a EPOLLERR
- * to poll(2).
- *
- * Returns the amount by which the counter was incremented.
- */
-__u64 eventfd_signal(struct eventfd_ctx *ctx)
-{
-	return eventfd_signal_mask(ctx, 0);
 }
-EXPORT_SYMBOL_GPL(eventfd_signal);
+EXPORT_SYMBOL_GPL(eventfd_signal_mask);
 
 static void eventfd_free_ctx(struct eventfd_ctx *ctx)
 {
diff --git a/include/linux/eventfd.h b/include/linux/eventfd.h
index 971943ecb2a6..e32bee4345fb 100644
--- a/include/linux/eventfd.h
+++ b/include/linux/eventfd.h
@@ -35,8 +35,7 @@ void eventfd_ctx_put(struct eventfd_ctx *ctx);
 struct file *eventfd_fget(int fd);
 struct eventfd_ctx *eventfd_ctx_fdget(int fd);
 struct eventfd_ctx *eventfd_ctx_fileget(struct file *file);
-__u64 eventfd_signal(struct eventfd_ctx *ctx);
-__u64 eventfd_signal_mask(struct eventfd_ctx *ctx, __poll_t mask);
+void eventfd_signal_mask(struct eventfd_ctx *ctx, __poll_t mask);
 int eventfd_ctx_remove_wait_queue(struct eventfd_ctx *ctx, wait_queue_entry_t *wait,
 				  __u64 *cnt);
 void eventfd_ctx_do_read(struct eventfd_ctx *ctx, __u64 *cnt);
@@ -58,14 +57,8 @@ static inline struct eventfd_ctx *eventfd_ctx_fdget(int fd)
 	return ERR_PTR(-ENOSYS);
 }
 
-static inline int eventfd_signal(struct eventfd_ctx *ctx)
+static inline void eventfd_signal_mask(struct eventfd_ctx *ctx, __poll_t mask)
 {
-	return -ENOSYS;
-}
-
-static inline int eventfd_signal_mask(struct eventfd_ctx *ctx, __poll_t mask)
-{
-	return -ENOSYS;
 }
 
 static inline void eventfd_ctx_put(struct eventfd_ctx *ctx)
@@ -91,5 +84,10 @@ static inline void eventfd_ctx_do_read(struct eventfd_ctx *ctx, __u64 *cnt)
 
 #endif
 
+static inline void eventfd_signal(struct eventfd_ctx *ctx)
+{
+	eventfd_signal_mask(ctx, 0);
+}
+
 #endif /* _LINUX_EVENTFD_H */
 
-- 
cgit v1.2.3


From e65a29f0235a438ece414d2d99bbf0d31aa97d04 Mon Sep 17 00:00:00 2001
From: Christian Brauner <brauner@kernel.org>
Date: Wed, 22 Nov 2023 13:44:37 +0100
Subject: mnt_idmapping: remove check_fsmapping()

The helper is a bit pointless. Just open-code the check.

Link: https://lore.kernel.org/r/20231122-vfs-mnt_idmap-v1-1-dae4abdde5bd@kernel.org
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 fs/mnt_idmapping.c            | 17 ++---------------
 fs/namespace.c                |  2 +-
 include/linux/mnt_idmapping.h |  3 ---
 3 files changed, 3 insertions(+), 19 deletions(-)

(limited to 'include/linux')

diff --git a/fs/mnt_idmapping.c b/fs/mnt_idmapping.c
index 57d1dedf3f8f..2674942311c3 100644
--- a/fs/mnt_idmapping.c
+++ b/fs/mnt_idmapping.c
@@ -25,19 +25,6 @@ struct mnt_idmap nop_mnt_idmap = {
 };
 EXPORT_SYMBOL_GPL(nop_mnt_idmap);
 
-/**
- * check_fsmapping - check whether an mount idmapping is allowed
- * @idmap: idmap of the relevent mount
- * @sb:    super block of the filesystem
- *
- * Return: true if @idmap is allowed, false if not.
- */
-bool check_fsmapping(const struct mnt_idmap *idmap,
-		     const struct super_block *sb)
-{
-	return idmap->owner != sb->s_user_ns;
-}
-
 /**
  * initial_idmapping - check whether this is the initial mapping
  * @ns: idmapping to check
@@ -94,8 +81,8 @@ static inline bool no_idmapping(const struct user_namespace *mnt_userns,
  */
 
 vfsuid_t make_vfsuid(struct mnt_idmap *idmap,
-				   struct user_namespace *fs_userns,
-				   kuid_t kuid)
+		     struct user_namespace *fs_userns,
+		     kuid_t kuid)
 {
 	uid_t uid;
 	struct user_namespace *mnt_userns = idmap->owner;
diff --git a/fs/namespace.c b/fs/namespace.c
index b899cbbe24d9..78366f114515 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -4289,7 +4289,7 @@ static int can_idmap_mount(const struct mount_kattr *kattr, struct mount *mnt)
 	 * Creating an idmapped mount with the filesystem wide idmapping
 	 * doesn't make sense so block that. We don't allow mushy semantics.
 	 */
-	if (!check_fsmapping(kattr->mnt_idmap, m->mnt_sb))
+	if (kattr->mnt_userns == m->mnt_sb->s_user_ns)
 		return -EINVAL;
 
 	/*
diff --git a/include/linux/mnt_idmapping.h b/include/linux/mnt_idmapping.h
index b8da2db4ecd2..cd4d5c8781f5 100644
--- a/include/linux/mnt_idmapping.h
+++ b/include/linux/mnt_idmapping.h
@@ -244,7 +244,4 @@ static inline kgid_t mapped_fsgid(struct mnt_idmap *idmap,
 	return from_vfsgid(idmap, fs_userns, VFSGIDT_INIT(current_fsgid()));
 }
 
-bool check_fsmapping(const struct mnt_idmap *idmap,
-		     const struct super_block *sb);
-
 #endif /* _LINUX_MNT_IDMAPPING_H */
-- 
cgit v1.2.3


From 783822e44594639848b78d4bb61dde26fba04e05 Mon Sep 17 00:00:00 2001
From: Christian Brauner <brauner@kernel.org>
Date: Wed, 22 Nov 2023 13:44:39 +0100
Subject: mnt_idmapping: decouple from namespaces

There's no reason we need to couple mnt idmapping to namespaces in the
way we currently do. Copy the idmapping when an idmapped mount is
created and don't take any reference on the namespace at all.

We also can't easily refcount struct uid_gid_map because it needs to
stay the size of a cacheline otherwise we risk performance regressions
(Ignoring for a second that right now struct uid_gid_map isn't actually
 64 byte but 72 but that's a fix for another patch series.).

Link: https://lore.kernel.org/r/20231122-vfs-mnt_idmap-v1-3-dae4abdde5bd@kernel.org
Reviewed-by: Josef Bacik <josef@toxicpanda.com>
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 fs/mnt_idmapping.c      | 106 +++++++++++++++++++++++++++++++++++++++++-------
 include/linux/uidgid.h  |  13 ++++++
 kernel/user_namespace.c |   4 +-
 3 files changed, 106 insertions(+), 17 deletions(-)

(limited to 'include/linux')

diff --git a/fs/mnt_idmapping.c b/fs/mnt_idmapping.c
index 35d78cb3c38a..64c5205e2b5e 100644
--- a/fs/mnt_idmapping.c
+++ b/fs/mnt_idmapping.c
@@ -9,8 +9,16 @@
 
 #include "internal.h"
 
+/*
+ * Outside of this file vfs{g,u}id_t are always created from k{g,u}id_t,
+ * never from raw values. These are just internal helpers.
+ */
+#define VFSUIDT_INIT_RAW(val) (vfsuid_t){ val }
+#define VFSGIDT_INIT_RAW(val) (vfsgid_t){ val }
+
 struct mnt_idmap {
-	struct user_namespace *owner;
+	struct uid_gid_map uid_map;
+	struct uid_gid_map gid_map;
 	refcount_t count;
 };
 
@@ -20,7 +28,6 @@ struct mnt_idmap {
  * mapped to {g,u}id 1, [...], {g,u}id 1000 to {g,u}id 1000, [...].
  */
 struct mnt_idmap nop_mnt_idmap = {
-	.owner	= &init_user_ns,
 	.count	= REFCOUNT_INIT(1),
 };
 EXPORT_SYMBOL_GPL(nop_mnt_idmap);
@@ -65,7 +72,6 @@ vfsuid_t make_vfsuid(struct mnt_idmap *idmap,
 		     kuid_t kuid)
 {
 	uid_t uid;
-	struct user_namespace *mnt_userns = idmap->owner;
 
 	if (idmap == &nop_mnt_idmap)
 		return VFSUIDT_INIT(kuid);
@@ -75,7 +81,7 @@ vfsuid_t make_vfsuid(struct mnt_idmap *idmap,
 		uid = from_kuid(fs_userns, kuid);
 	if (uid == (uid_t)-1)
 		return INVALID_VFSUID;
-	return VFSUIDT_INIT(make_kuid(mnt_userns, uid));
+	return VFSUIDT_INIT_RAW(map_id_down(&idmap->uid_map, uid));
 }
 EXPORT_SYMBOL_GPL(make_vfsuid);
 
@@ -103,7 +109,6 @@ vfsgid_t make_vfsgid(struct mnt_idmap *idmap,
 		     struct user_namespace *fs_userns, kgid_t kgid)
 {
 	gid_t gid;
-	struct user_namespace *mnt_userns = idmap->owner;
 
 	if (idmap == &nop_mnt_idmap)
 		return VFSGIDT_INIT(kgid);
@@ -113,7 +118,7 @@ vfsgid_t make_vfsgid(struct mnt_idmap *idmap,
 		gid = from_kgid(fs_userns, kgid);
 	if (gid == (gid_t)-1)
 		return INVALID_VFSGID;
-	return VFSGIDT_INIT(make_kgid(mnt_userns, gid));
+	return VFSGIDT_INIT_RAW(map_id_down(&idmap->gid_map, gid));
 }
 EXPORT_SYMBOL_GPL(make_vfsgid);
 
@@ -132,11 +137,10 @@ kuid_t from_vfsuid(struct mnt_idmap *idmap,
 		   struct user_namespace *fs_userns, vfsuid_t vfsuid)
 {
 	uid_t uid;
-	struct user_namespace *mnt_userns = idmap->owner;
 
 	if (idmap == &nop_mnt_idmap)
 		return AS_KUIDT(vfsuid);
-	uid = from_kuid(mnt_userns, AS_KUIDT(vfsuid));
+	uid = map_id_up(&idmap->uid_map, __vfsuid_val(vfsuid));
 	if (uid == (uid_t)-1)
 		return INVALID_UID;
 	if (initial_idmapping(fs_userns))
@@ -160,11 +164,10 @@ kgid_t from_vfsgid(struct mnt_idmap *idmap,
 		   struct user_namespace *fs_userns, vfsgid_t vfsgid)
 {
 	gid_t gid;
-	struct user_namespace *mnt_userns = idmap->owner;
 
 	if (idmap == &nop_mnt_idmap)
 		return AS_KGIDT(vfsgid);
-	gid = from_kgid(mnt_userns, AS_KGIDT(vfsgid));
+	gid = map_id_up(&idmap->gid_map, __vfsgid_val(vfsgid));
 	if (gid == (gid_t)-1)
 		return INVALID_GID;
 	if (initial_idmapping(fs_userns))
@@ -195,16 +198,91 @@ int vfsgid_in_group_p(vfsgid_t vfsgid)
 #endif
 EXPORT_SYMBOL_GPL(vfsgid_in_group_p);
 
+static int copy_mnt_idmap(struct uid_gid_map *map_from,
+			  struct uid_gid_map *map_to)
+{
+	struct uid_gid_extent *forward, *reverse;
+	u32 nr_extents = READ_ONCE(map_from->nr_extents);
+	/* Pairs with smp_wmb() when writing the idmapping. */
+	smp_rmb();
+
+	/*
+	 * Don't blindly copy @map_to into @map_from if nr_extents is
+	 * smaller or equal to UID_GID_MAP_MAX_BASE_EXTENTS. Since we
+	 * read @nr_extents someone could have written an idmapping and
+	 * then we might end up with inconsistent data. So just don't do
+	 * anything at all.
+	 */
+	if (nr_extents == 0)
+		return 0;
+
+	/*
+	 * Here we know that nr_extents is greater than zero which means
+	 * a map has been written. Since idmappings can't be changed
+	 * once they have been written we know that we can safely copy
+	 * from @map_to into @map_from.
+	 */
+
+	if (nr_extents <= UID_GID_MAP_MAX_BASE_EXTENTS) {
+		*map_to = *map_from;
+		return 0;
+	}
+
+	forward = kmemdup(map_from->forward,
+			  nr_extents * sizeof(struct uid_gid_extent),
+			  GFP_KERNEL_ACCOUNT);
+	if (!forward)
+		return -ENOMEM;
+
+	reverse = kmemdup(map_from->reverse,
+			  nr_extents * sizeof(struct uid_gid_extent),
+			  GFP_KERNEL_ACCOUNT);
+	if (!reverse) {
+		kfree(forward);
+		return -ENOMEM;
+	}
+
+	/*
+	 * The idmapping isn't exposed anywhere so we don't need to care
+	 * about ordering between extent pointers and @nr_extents
+	 * initialization.
+	 */
+	map_to->forward = forward;
+	map_to->reverse = reverse;
+	map_to->nr_extents = nr_extents;
+	return 0;
+}
+
+static void free_mnt_idmap(struct mnt_idmap *idmap)
+{
+	if (idmap->uid_map.nr_extents > UID_GID_MAP_MAX_BASE_EXTENTS) {
+		kfree(idmap->uid_map.forward);
+		kfree(idmap->uid_map.reverse);
+	}
+	if (idmap->gid_map.nr_extents > UID_GID_MAP_MAX_BASE_EXTENTS) {
+		kfree(idmap->gid_map.forward);
+		kfree(idmap->gid_map.reverse);
+	}
+	kfree(idmap);
+}
+
 struct mnt_idmap *alloc_mnt_idmap(struct user_namespace *mnt_userns)
 {
 	struct mnt_idmap *idmap;
+	int ret;
 
 	idmap = kzalloc(sizeof(struct mnt_idmap), GFP_KERNEL_ACCOUNT);
 	if (!idmap)
 		return ERR_PTR(-ENOMEM);
 
-	idmap->owner = get_user_ns(mnt_userns);
 	refcount_set(&idmap->count, 1);
+	ret = copy_mnt_idmap(&mnt_userns->uid_map, &idmap->uid_map);
+	if (!ret)
+		ret = copy_mnt_idmap(&mnt_userns->gid_map, &idmap->gid_map);
+	if (ret) {
+		free_mnt_idmap(idmap);
+		idmap = ERR_PTR(ret);
+	}
 	return idmap;
 }
 
@@ -234,9 +312,7 @@ EXPORT_SYMBOL_GPL(mnt_idmap_get);
  */
 void mnt_idmap_put(struct mnt_idmap *idmap)
 {
-	if (idmap != &nop_mnt_idmap && refcount_dec_and_test(&idmap->count)) {
-		put_user_ns(idmap->owner);
-		kfree(idmap);
-	}
+	if (idmap != &nop_mnt_idmap && refcount_dec_and_test(&idmap->count))
+		free_mnt_idmap(idmap);
 }
 EXPORT_SYMBOL_GPL(mnt_idmap_put);
diff --git a/include/linux/uidgid.h b/include/linux/uidgid.h
index b0542cd11aeb..415a7ca2b882 100644
--- a/include/linux/uidgid.h
+++ b/include/linux/uidgid.h
@@ -17,6 +17,7 @@
 
 struct user_namespace;
 extern struct user_namespace init_user_ns;
+struct uid_gid_map;
 
 typedef struct {
 	uid_t val;
@@ -138,6 +139,9 @@ static inline bool kgid_has_mapping(struct user_namespace *ns, kgid_t gid)
 	return from_kgid(ns, gid) != (gid_t) -1;
 }
 
+u32 map_id_down(struct uid_gid_map *map, u32 id);
+u32 map_id_up(struct uid_gid_map *map, u32 id);
+
 #else
 
 static inline kuid_t make_kuid(struct user_namespace *from, uid_t uid)
@@ -186,6 +190,15 @@ static inline bool kgid_has_mapping(struct user_namespace *ns, kgid_t gid)
 	return gid_valid(gid);
 }
 
+static inline u32 map_id_down(struct uid_gid_map *map, u32 id)
+{
+	return id;
+}
+
+static inline u32 map_id_up(struct uid_gid_map *map, u32 id)
+{
+	return id;
+}
 #endif /* CONFIG_USER_NS */
 
 #endif /* _LINUX_UIDGID_H */
diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c
index 625101249e4d..ce4d99df5f0e 100644
--- a/kernel/user_namespace.c
+++ b/kernel/user_namespace.c
@@ -332,7 +332,7 @@ static u32 map_id_range_down(struct uid_gid_map *map, u32 id, u32 count)
 	return id;
 }
 
-static u32 map_id_down(struct uid_gid_map *map, u32 id)
+u32 map_id_down(struct uid_gid_map *map, u32 id)
 {
 	return map_id_range_down(map, id, 1);
 }
@@ -375,7 +375,7 @@ map_id_up_max(unsigned extents, struct uid_gid_map *map, u32 id)
 		       sizeof(struct uid_gid_extent), cmp_map_id);
 }
 
-static u32 map_id_up(struct uid_gid_map *map, u32 id)
+u32 map_id_up(struct uid_gid_map *map, u32 id)
 {
 	struct uid_gid_extent *extent;
 	unsigned extents = map->nr_extents;
-- 
cgit v1.2.3


From e0894ff038d86f30614ec16ec26dacb88c8d2bd4 Mon Sep 17 00:00:00 2001
From: "Luke D. Jones" <luke@ljones.dev>
Date: Mon, 27 Nov 2023 12:05:21 +1300
Subject: platform/x86: asus-wmi: disable USB0 hub on ROG Ally before suspend
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

ASUS have worked around an issue in XInput where it doesn't support USB
selective suspend, which causes suspend issues in Windows. They worked
around this by adjusting the MCU firmware to disable the USB0 hub when
the screen is switched off during the Microsoft DSM suspend path in ACPI.

The issue we have with this however is one of timing - the call the tells
the MCU to this isn't able to complete before suspend is done so we call
this in a prepare() and add a small msleep() to ensure it is done. This
must be done before the screen is switched off to prevent a variety of
possible races.

Further to this the MCU powersave option must also be disabled as it can
cause a number of issues such as:
- unreliable resume connection of N-Key
- complete loss of N-Key if the power is plugged in while suspended
Disabling the powersave option prevents this.

Without this the MCU is unable to initialise itself correctly on resume.

Signed-off-by: "Luke D. Jones" <luke@ljones.dev>
Tested-by: Philip Mueller <philm@manjaro.org>
Reviewed-by: Hans de Goede <hdegoede@redhat.com>
Link: https://lore.kernel.org/r/20231126230521.125708-2-luke@ljones.dev
Signed-off-by: Ilpo Järvinen <ilpo.jarvinen@linux.intel.com>
---
 drivers/platform/x86/asus-wmi.c            | 50 ++++++++++++++++++++++++++++++
 include/linux/platform_data/x86/asus-wmi.h |  3 ++
 2 files changed, 53 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/platform/x86/asus-wmi.c b/drivers/platform/x86/asus-wmi.c
index ca668cf04020..9f7e23c5c6b4 100644
--- a/drivers/platform/x86/asus-wmi.c
+++ b/drivers/platform/x86/asus-wmi.c
@@ -16,6 +16,7 @@
 #include <linux/acpi.h>
 #include <linux/backlight.h>
 #include <linux/debugfs.h>
+#include <linux/delay.h>
 #include <linux/dmi.h>
 #include <linux/fb.h>
 #include <linux/hwmon.h>
@@ -132,6 +133,11 @@ module_param(fnlock_default, bool, 0444);
 #define ASUS_SCREENPAD_BRIGHT_MAX 255
 #define ASUS_SCREENPAD_BRIGHT_DEFAULT 60
 
+/* Controls the power state of the USB0 hub on ROG Ally which input is on */
+#define ASUS_USB0_PWR_EC0_CSEE "\\_SB.PCI0.SBRG.EC0.CSEE"
+/* 300ms so far seems to produce a reliable result on AC and battery */
+#define ASUS_USB0_PWR_EC0_CSEE_WAIT 300
+
 static const char * const ashs_ids[] = { "ATK4001", "ATK4002", NULL };
 
 static int throttle_thermal_policy_write(struct asus_wmi *);
@@ -300,6 +306,9 @@ struct asus_wmi {
 
 	bool fnlock_locked;
 
+	/* The ROG Ally device requires the MCU USB device be disconnected before suspend */
+	bool ally_mcu_usb_switch;
+
 	struct asus_wmi_debug debug;
 
 	struct asus_wmi_driver *driver;
@@ -4488,6 +4497,8 @@ static int asus_wmi_add(struct platform_device *pdev)
 	asus->nv_temp_tgt_available = asus_wmi_dev_is_present(asus, ASUS_WMI_DEVID_NV_THERM_TARGET);
 	asus->panel_overdrive_available = asus_wmi_dev_is_present(asus, ASUS_WMI_DEVID_PANEL_OD);
 	asus->mini_led_mode_available = asus_wmi_dev_is_present(asus, ASUS_WMI_DEVID_MINI_LED_MODE);
+	asus->ally_mcu_usb_switch = acpi_has_method(NULL, ASUS_USB0_PWR_EC0_CSEE)
+						&& dmi_match(DMI_BOARD_NAME, "RC71L");
 
 	err = fan_boost_mode_check_present(asus);
 	if (err)
@@ -4662,6 +4673,43 @@ static int asus_hotk_resume(struct device *device)
 		asus_wmi_fnlock_update(asus);
 
 	asus_wmi_tablet_mode_get_state(asus);
+
+	return 0;
+}
+
+static int asus_hotk_resume_early(struct device *device)
+{
+	struct asus_wmi *asus = dev_get_drvdata(device);
+
+	if (asus->ally_mcu_usb_switch) {
+		if (ACPI_FAILURE(acpi_execute_simple_method(NULL, ASUS_USB0_PWR_EC0_CSEE, 0xB8)))
+			dev_err(device, "ROG Ally MCU failed to connect USB dev\n");
+		else
+			msleep(ASUS_USB0_PWR_EC0_CSEE_WAIT);
+	}
+	return 0;
+}
+
+static int asus_hotk_prepare(struct device *device)
+{
+	struct asus_wmi *asus = dev_get_drvdata(device);
+	int result, err;
+
+	if (asus->ally_mcu_usb_switch) {
+		/* When powersave is enabled it causes many issues with resume of USB hub */
+		result = asus_wmi_get_devstate_simple(asus, ASUS_WMI_DEVID_MCU_POWERSAVE);
+		if (result == 1) {
+			dev_warn(device, "MCU powersave enabled, disabling to prevent resume issues");
+			err = asus_wmi_set_devstate(ASUS_WMI_DEVID_MCU_POWERSAVE, 0, &result);
+			if (err || result != 1)
+				dev_err(device, "Failed to set MCU powersave mode: %d\n", err);
+		}
+		/* sleep required to ensure USB0 is disabled before sleep continues */
+		if (ACPI_FAILURE(acpi_execute_simple_method(NULL, ASUS_USB0_PWR_EC0_CSEE, 0xB7)))
+			dev_err(device, "ROG Ally MCU failed to disconnect USB dev\n");
+		else
+			msleep(ASUS_USB0_PWR_EC0_CSEE_WAIT);
+	}
 	return 0;
 }
 
@@ -4709,6 +4757,8 @@ static const struct dev_pm_ops asus_pm_ops = {
 	.thaw = asus_hotk_thaw,
 	.restore = asus_hotk_restore,
 	.resume = asus_hotk_resume,
+	.resume_early = asus_hotk_resume_early,
+	.prepare = asus_hotk_prepare,
 };
 
 /* Registration ***************************************************************/
diff --git a/include/linux/platform_data/x86/asus-wmi.h b/include/linux/platform_data/x86/asus-wmi.h
index 63e630276499..ab1c7deff118 100644
--- a/include/linux/platform_data/x86/asus-wmi.h
+++ b/include/linux/platform_data/x86/asus-wmi.h
@@ -114,6 +114,9 @@
 /* Charging mode - 1=Barrel, 2=USB */
 #define ASUS_WMI_DEVID_CHARGE_MODE	0x0012006C
 
+/* MCU powersave mode */
+#define ASUS_WMI_DEVID_MCU_POWERSAVE   0x001200E2
+
 /* epu is connected? 1 == true */
 #define ASUS_WMI_DEVID_EGPU_CONNECTED	0x00090018
 /* egpu on/off */
-- 
cgit v1.2.3


From 083772c9f972dcc248913b52a0dec1025baa1e16 Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <kuba@kernel.org>
Date: Sun, 26 Nov 2023 15:07:30 -0800
Subject: net: page_pool: record pools per netdev

Link the page pools with netdevs. This needs to be netns compatible
so we have two options. Either we record the pools per netns and
have to worry about moving them as the netdev gets moved.
Or we record them directly on the netdev so they move with the netdev
without any extra work.

Implement the latter option. Since pools may outlast netdev we need
a place to store orphans. In time honored tradition use loopback
for this purpose.

Reviewed-by: Mina Almasry <almasrymina@google.com>
Reviewed-by: Eric Dumazet <edumazet@google.com>
Acked-by: Jesper Dangaard Brouer <hawk@kernel.org>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
 include/linux/list.h          | 20 +++++++++++
 include/linux/netdevice.h     |  4 +++
 include/linux/poison.h        |  2 ++
 include/net/page_pool/types.h |  4 +++
 net/core/page_pool_user.c     | 84 +++++++++++++++++++++++++++++++++++++++++++
 5 files changed, 114 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/list.h b/include/linux/list.h
index 1837caedf723..059aa1fff41e 100644
--- a/include/linux/list.h
+++ b/include/linux/list.h
@@ -1119,6 +1119,26 @@ static inline void hlist_move_list(struct hlist_head *old,
 	old->first = NULL;
 }
 
+/**
+ * hlist_splice_init() - move all entries from one list to another
+ * @from: hlist_head from which entries will be moved
+ * @last: last entry on the @from list
+ * @to:   hlist_head to which entries will be moved
+ *
+ * @to can be empty, @from must contain at least @last.
+ */
+static inline void hlist_splice_init(struct hlist_head *from,
+				     struct hlist_node *last,
+				     struct hlist_head *to)
+{
+	if (to->first)
+		to->first->pprev = &last->next;
+	last->next = to->first;
+	to->first = from->first;
+	from->first->pprev = &to->first;
+	from->first = NULL;
+}
+
 #define hlist_entry(ptr, type, member) container_of(ptr,type,member)
 
 #define hlist_for_each(pos, head) \
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index e87caa81f70c..998c7aaa98b8 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -2447,6 +2447,10 @@ struct net_device {
 #if IS_ENABLED(CONFIG_DPLL)
 	struct dpll_pin		*dpll_pin;
 #endif
+#if IS_ENABLED(CONFIG_PAGE_POOL)
+	/** @page_pools: page pools created for this netdevice */
+	struct hlist_head	page_pools;
+#endif
 };
 #define to_net_dev(d) container_of(d, struct net_device, dev)
 
diff --git a/include/linux/poison.h b/include/linux/poison.h
index 851a855d3868..27a7dad17eef 100644
--- a/include/linux/poison.h
+++ b/include/linux/poison.h
@@ -83,6 +83,8 @@
 
 /********** net/core/skbuff.c **********/
 #define SKB_LIST_POISON_NEXT	((void *)(0x800 + POISON_POINTER_DELTA))
+/********** net/ **********/
+#define NET_PTR_POISON		((void *)(0x801 + POISON_POINTER_DELTA))
 
 /********** kernel/bpf/ **********/
 #define BPF_PTR_POISON ((void *)(0xeB9FUL + POISON_POINTER_DELTA))
diff --git a/include/net/page_pool/types.h b/include/net/page_pool/types.h
index c19f0df3bf0b..b258a571201e 100644
--- a/include/net/page_pool/types.h
+++ b/include/net/page_pool/types.h
@@ -5,6 +5,7 @@
 
 #include <linux/dma-direction.h>
 #include <linux/ptr_ring.h>
+#include <linux/types.h>
 
 #define PP_FLAG_DMA_MAP		BIT(0) /* Should page_pool do the DMA
 					* map/unmap
@@ -48,6 +49,7 @@ struct pp_alloc_cache {
  * @pool_size:	size of the ptr_ring
  * @nid:	NUMA node id to allocate from pages from
  * @dev:	device, for DMA pre-mapping purposes
+ * @netdev:	netdev this pool will serve (leave as NULL if none or multiple)
  * @napi:	NAPI which is the sole consumer of pages, otherwise NULL
  * @dma_dir:	DMA mapping direction
  * @max_len:	max DMA sync memory size for PP_FLAG_DMA_SYNC_DEV
@@ -66,6 +68,7 @@ struct page_pool_params {
 		unsigned int	offset;
 	);
 	struct_group_tagged(page_pool_params_slow, slow,
+		struct net_device *netdev;
 /* private: used by test code only */
 		void (*init_callback)(struct page *page, void *arg);
 		void *init_arg;
@@ -189,6 +192,7 @@ struct page_pool {
 	struct page_pool_params_slow slow;
 	/* User-facing fields, protected by page_pools_lock */
 	struct {
+		struct hlist_node list;
 		u32 id;
 	} user;
 };
diff --git a/net/core/page_pool_user.c b/net/core/page_pool_user.c
index 630d1eeecf2a..e5c7f078fbd4 100644
--- a/net/core/page_pool_user.c
+++ b/net/core/page_pool_user.c
@@ -1,14 +1,31 @@
 // SPDX-License-Identifier: GPL-2.0
 
 #include <linux/mutex.h>
+#include <linux/netdevice.h>
 #include <linux/xarray.h>
+#include <net/net_debug.h>
 #include <net/page_pool/types.h>
 
 #include "page_pool_priv.h"
 
 static DEFINE_XARRAY_FLAGS(page_pools, XA_FLAGS_ALLOC1);
+/* Protects: page_pools, netdevice->page_pools, pool->slow.netdev, pool->user.
+ * Ordering: inside rtnl_lock
+ */
 static DEFINE_MUTEX(page_pools_lock);
 
+/* Page pools are only reachable from user space (via netlink) if they are
+ * linked to a netdev at creation time. Following page pool "visibility"
+ * states are possible:
+ *  - normal
+ *    - user.list: linked to real netdev, netdev: real netdev
+ *  - orphaned - real netdev has disappeared
+ *    - user.list: linked to lo, netdev: lo
+ *  - invisible - either (a) created without netdev linking, (b) unlisted due
+ *      to error, or (c) the entire namespace which owned this pool disappeared
+ *    - user.list: unhashed, netdev: unknown
+ */
+
 int page_pool_list(struct page_pool *pool)
 {
 	static u32 id_alloc_next;
@@ -20,6 +37,10 @@ int page_pool_list(struct page_pool *pool)
 	if (err < 0)
 		goto err_unlock;
 
+	if (pool->slow.netdev)
+		hlist_add_head(&pool->user.list,
+			       &pool->slow.netdev->page_pools);
+
 	mutex_unlock(&page_pools_lock);
 	return 0;
 
@@ -32,5 +53,68 @@ void page_pool_unlist(struct page_pool *pool)
 {
 	mutex_lock(&page_pools_lock);
 	xa_erase(&page_pools, pool->user.id);
+	hlist_del(&pool->user.list);
+	mutex_unlock(&page_pools_lock);
+}
+
+static void page_pool_unreg_netdev_wipe(struct net_device *netdev)
+{
+	struct page_pool *pool;
+	struct hlist_node *n;
+
+	mutex_lock(&page_pools_lock);
+	hlist_for_each_entry_safe(pool, n, &netdev->page_pools, user.list) {
+		hlist_del_init(&pool->user.list);
+		pool->slow.netdev = NET_PTR_POISON;
+	}
+	mutex_unlock(&page_pools_lock);
+}
+
+static void page_pool_unreg_netdev(struct net_device *netdev)
+{
+	struct page_pool *pool, *last;
+	struct net_device *lo;
+
+	lo = dev_net(netdev)->loopback_dev;
+
+	mutex_lock(&page_pools_lock);
+	last = NULL;
+	hlist_for_each_entry(pool, &netdev->page_pools, user.list) {
+		pool->slow.netdev = lo;
+		last = pool;
+	}
+	if (last)
+		hlist_splice_init(&netdev->page_pools, &last->user.list,
+				  &lo->page_pools);
 	mutex_unlock(&page_pools_lock);
 }
+
+static int
+page_pool_netdevice_event(struct notifier_block *nb,
+			  unsigned long event, void *ptr)
+{
+	struct net_device *netdev = netdev_notifier_info_to_dev(ptr);
+
+	if (event != NETDEV_UNREGISTER)
+		return NOTIFY_DONE;
+
+	if (hlist_empty(&netdev->page_pools))
+		return NOTIFY_OK;
+
+	if (netdev->ifindex != LOOPBACK_IFINDEX)
+		page_pool_unreg_netdev(netdev);
+	else
+		page_pool_unreg_netdev_wipe(netdev);
+	return NOTIFY_OK;
+}
+
+static struct notifier_block page_pool_netdevice_nb = {
+	.notifier_call = page_pool_netdevice_event,
+};
+
+static int __init page_pool_user_init(void)
+{
+	return register_netdevice_notifier(&page_pool_netdevice_nb);
+}
+
+subsys_initcall(page_pool_user_init);
-- 
cgit v1.2.3


From c392cbecd8eca4c53f2bf508731257d9d0a21c2d Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Mon, 27 Nov 2023 16:47:04 -0700
Subject: io_uring/kbuf: defer release of mapped buffer rings

If a provided buffer ring is setup with IOU_PBUF_RING_MMAP, then the
kernel allocates the memory for it and the application is expected to
mmap(2) this memory. However, io_uring uses remap_pfn_range() for this
operation, so we cannot rely on normal munmap/release on freeing them
for us.

Stash an io_buf_free entry away for each of these, if any, and provide
a helper to free them post ->release().

Cc: stable@vger.kernel.org
Fixes: c56e022c0a27 ("io_uring: add support for user mapped provided buffer ring")
Reported-by: Jann Horn <jannh@google.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/io_uring_types.h |  3 +++
 io_uring/io_uring.c            |  2 ++
 io_uring/kbuf.c                | 44 +++++++++++++++++++++++++++++++++++++-----
 io_uring/kbuf.h                |  2 ++
 4 files changed, 46 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h
index d3009d56af0b..805bb635cdf5 100644
--- a/include/linux/io_uring_types.h
+++ b/include/linux/io_uring_types.h
@@ -340,6 +340,9 @@ struct io_ring_ctx {
 
 	struct list_head	io_buffers_cache;
 
+	/* deferred free list, protected by ->uring_lock */
+	struct hlist_head	io_buf_list;
+
 	/* Keep this last, we don't need it for the fast path */
 	struct wait_queue_head		poll_wq;
 	struct io_restriction		restrictions;
diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index e40b11438210..3a216f0744dd 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -325,6 +325,7 @@ static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
 	INIT_LIST_HEAD(&ctx->sqd_list);
 	INIT_LIST_HEAD(&ctx->cq_overflow_list);
 	INIT_LIST_HEAD(&ctx->io_buffers_cache);
+	INIT_HLIST_HEAD(&ctx->io_buf_list);
 	io_alloc_cache_init(&ctx->rsrc_node_cache, IO_NODE_ALLOC_CACHE_MAX,
 			    sizeof(struct io_rsrc_node));
 	io_alloc_cache_init(&ctx->apoll_cache, IO_ALLOC_CACHE_MAX,
@@ -2950,6 +2951,7 @@ static __cold void io_ring_ctx_free(struct io_ring_ctx *ctx)
 		ctx->mm_account = NULL;
 	}
 	io_rings_free(ctx);
+	io_kbuf_mmap_list_free(ctx);
 
 	percpu_ref_exit(&ctx->refs);
 	free_uid(ctx->user);
diff --git a/io_uring/kbuf.c b/io_uring/kbuf.c
index a1e4239c7d75..85e680fc74ce 100644
--- a/io_uring/kbuf.c
+++ b/io_uring/kbuf.c
@@ -33,6 +33,11 @@ struct io_provide_buf {
 	__u16				bid;
 };
 
+struct io_buf_free {
+	struct hlist_node		list;
+	void				*mem;
+};
+
 static inline struct io_buffer_list *io_buffer_get_list(struct io_ring_ctx *ctx,
 							unsigned int bgid)
 {
@@ -223,7 +228,10 @@ static int __io_remove_buffers(struct io_ring_ctx *ctx,
 	if (bl->is_mapped) {
 		i = bl->buf_ring->tail - bl->head;
 		if (bl->is_mmap) {
-			folio_put(virt_to_folio(bl->buf_ring));
+			/*
+			 * io_kbuf_list_free() will free the page(s) at
+			 * ->release() time.
+			 */
 			bl->buf_ring = NULL;
 			bl->is_mmap = 0;
 		} else if (bl->buf_nr_pages) {
@@ -531,18 +539,28 @@ error_unpin:
 	return -EINVAL;
 }
 
-static int io_alloc_pbuf_ring(struct io_uring_buf_reg *reg,
+static int io_alloc_pbuf_ring(struct io_ring_ctx *ctx,
+			      struct io_uring_buf_reg *reg,
 			      struct io_buffer_list *bl)
 {
-	gfp_t gfp = GFP_KERNEL_ACCOUNT | __GFP_ZERO | __GFP_NOWARN | __GFP_COMP;
+	struct io_buf_free *ibf;
 	size_t ring_size;
 	void *ptr;
 
 	ring_size = reg->ring_entries * sizeof(struct io_uring_buf_ring);
-	ptr = (void *) __get_free_pages(gfp, get_order(ring_size));
+	ptr = io_mem_alloc(ring_size);
 	if (!ptr)
 		return -ENOMEM;
 
+	/* Allocate and store deferred free entry */
+	ibf = kmalloc(sizeof(*ibf), GFP_KERNEL_ACCOUNT);
+	if (!ibf) {
+		io_mem_free(ptr);
+		return -ENOMEM;
+	}
+	ibf->mem = ptr;
+	hlist_add_head(&ibf->list, &ctx->io_buf_list);
+
 	bl->buf_ring = ptr;
 	bl->is_mapped = 1;
 	bl->is_mmap = 1;
@@ -599,7 +617,7 @@ int io_register_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg)
 	if (!(reg.flags & IOU_PBUF_RING_MMAP))
 		ret = io_pin_pbuf_ring(&reg, bl);
 	else
-		ret = io_alloc_pbuf_ring(&reg, bl);
+		ret = io_alloc_pbuf_ring(ctx, &reg, bl);
 
 	if (!ret) {
 		bl->nr_entries = reg.ring_entries;
@@ -649,3 +667,19 @@ void *io_pbuf_get_address(struct io_ring_ctx *ctx, unsigned long bgid)
 
 	return bl->buf_ring;
 }
+
+/*
+ * Called at or after ->release(), free the mmap'ed buffers that we used
+ * for memory mapped provided buffer rings.
+ */
+void io_kbuf_mmap_list_free(struct io_ring_ctx *ctx)
+{
+	struct io_buf_free *ibf;
+	struct hlist_node *tmp;
+
+	hlist_for_each_entry_safe(ibf, tmp, &ctx->io_buf_list, list) {
+		hlist_del(&ibf->list);
+		io_mem_free(ibf->mem);
+		kfree(ibf);
+	}
+}
diff --git a/io_uring/kbuf.h b/io_uring/kbuf.h
index f2d615236b2c..6c7646e6057c 100644
--- a/io_uring/kbuf.h
+++ b/io_uring/kbuf.h
@@ -51,6 +51,8 @@ int io_provide_buffers(struct io_kiocb *req, unsigned int issue_flags);
 int io_register_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg);
 int io_unregister_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg);
 
+void io_kbuf_mmap_list_free(struct io_ring_ctx *ctx);
+
 unsigned int __io_put_kbuf(struct io_kiocb *req, unsigned issue_flags);
 
 bool io_kbuf_recycle_legacy(struct io_kiocb *req, unsigned issue_flags);
-- 
cgit v1.2.3


From 9a5f580c1c71b6aedba696c4898a7a7184cef8ad Mon Sep 17 00:00:00 2001
From: Muralidhara M K <muralidhara.mk@amd.com>
Date: Thu, 2 Nov 2023 11:42:24 +0000
Subject: EDAC/mc: Add support for HBM3 memory type

AMD MI300A models use HBM3 (High Bandwidth Memory Gen 3) memory. HBM is
a high-speed computer memory interface for 3D-stacked synchronous
dynamic random-access memory (SDRAM).

Signed-off-by: Muralidhara M K <muralidhara.mk@amd.com>
Signed-off-by: Borislav Petkov (AMD) <bp@alien8.de>
Link: https://lore.kernel.org/r/20231102114225.2006878-4-muralimk@amd.com
---
 drivers/edac/edac_mc.c | 1 +
 include/linux/edac.h   | 3 +++
 2 files changed, 4 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/edac/edac_mc.c b/drivers/edac/edac_mc.c
index 6faeb2ab3960..d6eed727b0cd 100644
--- a/drivers/edac/edac_mc.c
+++ b/drivers/edac/edac_mc.c
@@ -166,6 +166,7 @@ const char * const edac_mem_types[] = {
 	[MEM_NVDIMM]	= "Non-volatile-RAM",
 	[MEM_WIO2]	= "Wide-IO-2",
 	[MEM_HBM2]	= "High-bandwidth-memory-Gen2",
+	[MEM_HBM3]	= "High-bandwidth-memory-Gen3",
 };
 EXPORT_SYMBOL_GPL(edac_mem_types);
 
diff --git a/include/linux/edac.h b/include/linux/edac.h
index fa4bda2a70f6..1174beb94ab6 100644
--- a/include/linux/edac.h
+++ b/include/linux/edac.h
@@ -187,6 +187,7 @@ static inline char *mc_event_error_type(const unsigned int err_type)
  * @MEM_NVDIMM:		Non-volatile RAM
  * @MEM_WIO2:		Wide I/O 2.
  * @MEM_HBM2:		High bandwidth Memory Gen 2.
+ * @MEM_HBM3:		High bandwidth Memory Gen 3.
  */
 enum mem_type {
 	MEM_EMPTY = 0,
@@ -218,6 +219,7 @@ enum mem_type {
 	MEM_NVDIMM,
 	MEM_WIO2,
 	MEM_HBM2,
+	MEM_HBM3,
 };
 
 #define MEM_FLAG_EMPTY		BIT(MEM_EMPTY)
@@ -248,6 +250,7 @@ enum mem_type {
 #define MEM_FLAG_NVDIMM		BIT(MEM_NVDIMM)
 #define MEM_FLAG_WIO2		BIT(MEM_WIO2)
 #define MEM_FLAG_HBM2		BIT(MEM_HBM2)
+#define MEM_FLAG_HBM3		BIT(MEM_HBM3)
 
 /**
  * enum edac_type - Error Detection and Correction capabilities and mode
-- 
cgit v1.2.3


From fad907cffd4bde7384812cf32fcf69becab805cc Mon Sep 17 00:00:00 2001
From: Ming Lei <ming.lei@redhat.com>
Date: Tue, 28 Nov 2023 20:30:26 +0800
Subject: block: move .bd_inode into 1st cacheline of block_device

The .bd_inode field of block_device is used in IO fast path of
blkdev_write_iter() and blkdev_llseek(), so it is more efficient to keep
it into the 1st cacheline.

.bd_openers is only touched in open()/close(), and .bd_size_lock is only
for updating bdev capacity, which is in slow path too.

So swap .bd_inode layout with .bd_openers & .bd_size_lock to move
.bd_inode into the 1st cache line.

Cc: Yu Kuai <yukuai3@huawei.com>
Signed-off-by: Ming Lei <ming.lei@redhat.com>
Signed-off-by: Yu Kuai <yukuai3@huawei.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Link: https://lore.kernel.org/r/20231128123027.971610-2-yukuai1@huaweicloud.com
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/blk_types.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
index d5c5e59ddbd2..f7d40692dd94 100644
--- a/include/linux/blk_types.h
+++ b/include/linux/blk_types.h
@@ -49,9 +49,10 @@ struct block_device {
 	bool			bd_write_holder;
 	bool			bd_has_submit_bio;
 	dev_t			bd_dev;
+	struct inode		*bd_inode;	/* will die */
+
 	atomic_t		bd_openers;
 	spinlock_t		bd_size_lock; /* for bd_inode->i_size updates */
-	struct inode *		bd_inode;	/* will die */
 	void *			bd_claiming;
 	void *			bd_holder;
 	const struct blk_holder_ops *bd_holder_ops;
-- 
cgit v1.2.3


From 67d995e069535c32829f5d368d919063492cec6e Mon Sep 17 00:00:00 2001
From: Yu Kuai <yukuai3@huawei.com>
Date: Tue, 28 Nov 2023 20:30:27 +0800
Subject: block: warn once for each partition in bio_check_ro()

Commit 1b0a151c10a6 ("blk-core: use pr_warn_ratelimited() in
bio_check_ro()") fix message storm by limit the rate, however, there
will still be lots of message in the long term. Fix it better by warn
once for each partition.

Signed-off-by: Yu Kuai <yukuai3@huawei.com>
Signed-off-by: Christoph Hellwig <hch@lst.de>
Link: https://lore.kernel.org/r/20231128123027.971610-3-yukuai1@huaweicloud.com
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/blk-core.c          | 14 +++++++++++---
 include/linux/blk_types.h |  1 +
 2 files changed, 12 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/block/blk-core.c b/block/blk-core.c
index fdf25b8d6e78..2eca76ccf4ee 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -501,9 +501,17 @@ static inline void bio_check_ro(struct bio *bio)
 	if (op_is_write(bio_op(bio)) && bdev_read_only(bio->bi_bdev)) {
 		if (op_is_flush(bio->bi_opf) && !bio_sectors(bio))
 			return;
-		pr_warn_ratelimited("Trying to write to read-only block-device %pg\n",
-				    bio->bi_bdev);
-		/* Older lvm-tools actually trigger this */
+
+		if (bio->bi_bdev->bd_ro_warned)
+			return;
+
+		bio->bi_bdev->bd_ro_warned = true;
+		/*
+		 * Use ioctl to set underlying disk of raid/dm to read-only
+		 * will trigger this.
+		 */
+		pr_warn("Trying to write to read-only block-device %pg\n",
+			bio->bi_bdev);
 	}
 }
 
diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
index f7d40692dd94..b29ebd53417d 100644
--- a/include/linux/blk_types.h
+++ b/include/linux/blk_types.h
@@ -70,6 +70,7 @@ struct block_device {
 #ifdef CONFIG_FAIL_MAKE_REQUEST
 	bool			bd_make_it_fail;
 #endif
+	bool			bd_ro_warned;
 	/*
 	 * keep this out-of-line as it's both big and not needed in the fast
 	 * path
-- 
cgit v1.2.3


From a3db64c575ca201c9783f100c70b82d52bd78a93 Mon Sep 17 00:00:00 2001
From: "Jiri Slaby (SUSE)" <jirislaby@kernel.org>
Date: Mon, 27 Nov 2023 13:37:09 +0100
Subject: tty: make tty const in tty_get_baud_rate()
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

After commit 87888fb9ac0c ("tty: Remove baudrate dead code & make
ktermios params const"), the 'tty' parameter is only read in
tty_get_baud_rate(). Therefore, we can make 'tty' accepted in the
function 'const' for clarity.

The "the terminal bit flags may be updated." part of the
tty_get_baud_rate()'s kernel-doc is dropped as it is no longer true.
Because of the same commit above. And it was misplaced anyway.

Signed-off-by: "Jiri Slaby (SUSE)" <jirislaby@kernel.org>
Cc: Ilpo Järvinen <ilpo.jarvinen@linux.intel.com>
Reviewed-by: Ilpo Järvinen <ilpo.jarvinen@linux.intel.com>
Link: https://lore.kernel.org/r/20231127123713.14504-1-jirislaby@kernel.org
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 include/linux/tty.h | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/tty.h b/include/linux/tty.h
index 7625fc98fef3..e96c85f4f91e 100644
--- a/include/linux/tty.h
+++ b/include/linux/tty.h
@@ -440,10 +440,9 @@ void tty_encode_baud_rate(struct tty_struct *tty, speed_t ibaud,
  *
  * Returns: the baud rate as an integer for this terminal
  *
- * Locking: The termios lock must be held by the caller and the terminal bit
- * flags may be updated.
+ * Locking: The termios lock must be held by the caller.
  */
-static inline speed_t tty_get_baud_rate(struct tty_struct *tty)
+static inline speed_t tty_get_baud_rate(const struct tty_struct *tty)
 {
 	return tty_termios_baud_rate(&tty->termios);
 }
-- 
cgit v1.2.3


From eec4954b81c3d9a38b99e78afb553c359db40093 Mon Sep 17 00:00:00 2001
From: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Date: Tue, 28 Nov 2023 10:28:15 +0000
Subject: driver core: make device_is_dependent() static

The function device_is_dependent() is only called by the driver core
internally and should not, at this time, be called by anyone else
outside of it, so mark it as static so as not to give driver authors the
wrong idea.

Cc: Saravana Kannan <saravanak@google.com>
Acked-by: "Rafael J. Wysocki" <rafael@kernel.org>
Link: https://lore.kernel.org/r/2023112815-faculty-thud-add8@gregkh
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/base/core.c    | 2 +-
 include/linux/device.h | 1 -
 2 files changed, 1 insertion(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/base/core.c b/drivers/base/core.c
index 67ba592afc77..e57467194104 100644
--- a/drivers/base/core.c
+++ b/drivers/base/core.c
@@ -298,7 +298,7 @@ static inline bool device_link_flag_is_sync_state_only(u32 flags)
  * Check if @target depends on @dev or any device dependent on it (its child or
  * its consumer etc).  Return 1 if that is the case or 0 otherwise.
  */
-int device_is_dependent(struct device *dev, void *target)
+static int device_is_dependent(struct device *dev, void *target)
 {
 	struct device_link *link;
 	int ret;
diff --git a/include/linux/device.h b/include/linux/device.h
index d7a72a8749ea..4aa34c8d1361 100644
--- a/include/linux/device.h
+++ b/include/linux/device.h
@@ -1071,7 +1071,6 @@ int device_rename(struct device *dev, const char *new_name);
 int device_move(struct device *dev, struct device *new_parent,
 		enum dpm_order dpm_order);
 int device_change_owner(struct device *dev, kuid_t kuid, kgid_t kgid);
-int device_is_dependent(struct device *dev, void *target);
 
 static inline bool device_supports_offline(struct device *dev)
 {
-- 
cgit v1.2.3


From 33253d9e01d40542f07aaf718a655893cd2949e5 Mon Sep 17 00:00:00 2001
From: Thomas Zimmermann <tzimmermann@suse.de>
Date: Mon, 27 Nov 2023 14:15:59 +0100
Subject: fbdev: Move default fb_mmap code into helper function

Move the default fb_mmap code for I/O address spaces into the helper
function fb_io_mmap(). The helper can either be called via struct
fb_ops.fb_mmap or as the default if no fb_mmap has been set. Also
set the new helper in __FB_DEFAULT_IOMEM_OPS_MMAP.

In the mid-term, fb_io_mmap() is supposed to become optional. Fbdev
drivers will initialize their struct fb_ops.fb_mmap to the helper
and select a corresponding Kconfig token. The helper can then be made
optional at compile time.

Signed-off-by: Thomas Zimmermann <tzimmermann@suse.de>
Reviewed-by: Javier Martinez Canillas <javierm@redhat.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20231127131655.4020-31-tzimmermann@suse.de
---
 drivers/video/fbdev/core/fb_chrdev.c  | 36 ++++++-----------------------------
 drivers/video/fbdev/core/fb_io_fops.c | 27 ++++++++++++++++++++++++++
 include/linux/fb.h                    |  3 ++-
 3 files changed, 35 insertions(+), 31 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/video/fbdev/core/fb_chrdev.c b/drivers/video/fbdev/core/fb_chrdev.c
index b73a122950a9..089441c9d810 100644
--- a/drivers/video/fbdev/core/fb_chrdev.c
+++ b/drivers/video/fbdev/core/fb_chrdev.c
@@ -314,20 +314,16 @@ static long fb_compat_ioctl(struct file *file, unsigned int cmd,
 static int fb_mmap(struct file *file, struct vm_area_struct *vma)
 {
 	struct fb_info *info = file_fb_info(file);
-	unsigned long mmio_pgoff;
-	unsigned long start;
-	u32 len;
+	int res;
 
 	if (!info)
 		return -ENODEV;
+
 	mutex_lock(&info->mm_lock);
 
 	if (info->fbops->fb_mmap) {
-		int res;
 
 		res = info->fbops->fb_mmap(info, vma);
-		mutex_unlock(&info->mm_lock);
-		return res;
 #if IS_ENABLED(CONFIG_FB_DEFERRED_IO)
 	} else if (info->fbdefio) {
 		/*
@@ -335,35 +331,15 @@ static int fb_mmap(struct file *file, struct vm_area_struct *vma)
 		 * minimum, point struct fb_ops.fb_mmap to fb_deferred_io_mmap().
 		 */
 		dev_warn_once(info->dev, "fbdev mmap not set up for deferred I/O.\n");
-		mutex_unlock(&info->mm_lock);
-		return -ENODEV;
+		res = -ENODEV;
 #endif
+	} else {
+		res = fb_io_mmap(info, vma);
 	}
 
-	/*
-	 * Ugh. This can be either the frame buffer mapping, or
-	 * if pgoff points past it, the mmio mapping.
-	 */
-	start = info->fix.smem_start;
-	len = info->fix.smem_len;
-	mmio_pgoff = PAGE_ALIGN((start & ~PAGE_MASK) + len) >> PAGE_SHIFT;
-	if (vma->vm_pgoff >= mmio_pgoff) {
-		if (info->var.accel_flags) {
-			mutex_unlock(&info->mm_lock);
-			return -EINVAL;
-		}
-
-		vma->vm_pgoff -= mmio_pgoff;
-		start = info->fix.mmio_start;
-		len = info->fix.mmio_len;
-	}
 	mutex_unlock(&info->mm_lock);
 
-	vma->vm_page_prot = vm_get_page_prot(vma->vm_flags);
-	vma->vm_page_prot = pgprot_framebuffer(vma->vm_page_prot, vma->vm_start,
-					       vma->vm_end, start);
-
-	return vm_iomap_memory(vma, start, len);
+	return res;
 }
 
 static int fb_open(struct inode *inode, struct file *file)
diff --git a/drivers/video/fbdev/core/fb_io_fops.c b/drivers/video/fbdev/core/fb_io_fops.c
index 871b829521af..60805e43914e 100644
--- a/drivers/video/fbdev/core/fb_io_fops.c
+++ b/drivers/video/fbdev/core/fb_io_fops.c
@@ -132,5 +132,32 @@ ssize_t fb_io_write(struct fb_info *info, const char __user *buf, size_t count,
 }
 EXPORT_SYMBOL(fb_io_write);
 
+int fb_io_mmap(struct fb_info *info, struct vm_area_struct *vma)
+{
+	unsigned long start = info->fix.smem_start;
+	u32 len = info->fix.smem_len;
+	unsigned long mmio_pgoff = PAGE_ALIGN((start & ~PAGE_MASK) + len) >> PAGE_SHIFT;
+
+	/*
+	 * This can be either the framebuffer mapping, or if pgoff points
+	 * past it, the mmio mapping.
+	 */
+	if (vma->vm_pgoff >= mmio_pgoff) {
+		if (info->var.accel_flags)
+			return -EINVAL;
+
+		vma->vm_pgoff -= mmio_pgoff;
+		start = info->fix.mmio_start;
+		len = info->fix.mmio_len;
+	}
+
+	vma->vm_page_prot = vm_get_page_prot(vma->vm_flags);
+	vma->vm_page_prot = pgprot_framebuffer(vma->vm_page_prot, vma->vm_start,
+					       vma->vm_end, start);
+
+	return vm_iomap_memory(vma, start, len);
+}
+EXPORT_SYMBOL(fb_io_mmap);
+
 MODULE_DESCRIPTION("Fbdev helpers for framebuffers in I/O memory");
 MODULE_LICENSE("GPL");
diff --git a/include/linux/fb.h b/include/linux/fb.h
index 94e2c44c6569..a36d05b576b0 100644
--- a/include/linux/fb.h
+++ b/include/linux/fb.h
@@ -536,6 +536,7 @@ extern ssize_t fb_io_read(struct fb_info *info, char __user *buf,
 			  size_t count, loff_t *ppos);
 extern ssize_t fb_io_write(struct fb_info *info, const char __user *buf,
 			   size_t count, loff_t *ppos);
+int fb_io_mmap(struct fb_info *info, struct vm_area_struct *vma);
 
 #define __FB_DEFAULT_IOMEM_OPS_RDWR \
 	.fb_read	= fb_io_read, \
@@ -547,7 +548,7 @@ extern ssize_t fb_io_write(struct fb_info *info, const char __user *buf,
 	.fb_imageblit	= cfb_imageblit
 
 #define __FB_DEFAULT_IOMEM_OPS_MMAP \
-	.fb_mmap	= NULL /* default implementation */
+	.fb_mmap	= fb_io_mmap
 
 #define FB_DEFAULT_IOMEM_OPS \
 	__FB_DEFAULT_IOMEM_OPS_RDWR, \
-- 
cgit v1.2.3


From b3e8813773c568fd2d65e9752abfda27442e502e Mon Sep 17 00:00:00 2001
From: Thomas Zimmermann <tzimmermann@suse.de>
Date: Mon, 27 Nov 2023 14:16:00 +0100
Subject: fbdev: Warn on incorrect framebuffer access

Test in framebuffer read, write and drawing helpers if FBINFO_VIRTFB
has been set correctly. Framebuffers in I/O memory should only be
accessed with the architecture's respective helpers. Framebuffers
in system memory should be accessed with the regular load and
store operations. Presumably not all drivers get this right, so we
now warn about it.

Signed-off-by: Thomas Zimmermann <tzimmermann@suse.de>
Reviewed-by: Javier Martinez Canillas <javierm@redhat.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20231127131655.4020-32-tzimmermann@suse.de
---
 drivers/video/fbdev/core/cfbcopyarea.c | 3 +++
 drivers/video/fbdev/core/cfbfillrect.c | 3 +++
 drivers/video/fbdev/core/cfbimgblt.c   | 3 +++
 drivers/video/fbdev/core/fb_io_fops.c  | 9 +++++++++
 drivers/video/fbdev/core/fb_sys_fops.c | 6 ++++++
 drivers/video/fbdev/core/syscopyarea.c | 3 +++
 drivers/video/fbdev/core/sysfillrect.c | 3 +++
 drivers/video/fbdev/core/sysimgblt.c   | 3 +++
 include/linux/fb.h                     | 8 +++++++-
 9 files changed, 40 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/drivers/video/fbdev/core/cfbcopyarea.c b/drivers/video/fbdev/core/cfbcopyarea.c
index 5b80bf3dae50..a271f57d9c6c 100644
--- a/drivers/video/fbdev/core/cfbcopyarea.c
+++ b/drivers/video/fbdev/core/cfbcopyarea.c
@@ -391,6 +391,9 @@ void cfb_copyarea(struct fb_info *p, const struct fb_copyarea *area)
 	if (p->state != FBINFO_STATE_RUNNING)
 		return;
 
+	if (p->flags & FBINFO_VIRTFB)
+		fb_warn_once(p, "Framebuffer is not in I/O address space.");
+
 	/* if the beginning of the target area might overlap with the end of
 	the source area, be have to copy the area reverse. */
 	if ((dy == sy && dx > sx) || (dy > sy)) {
diff --git a/drivers/video/fbdev/core/cfbfillrect.c b/drivers/video/fbdev/core/cfbfillrect.c
index ba9f58b2a5e8..cbaa4c9e2355 100644
--- a/drivers/video/fbdev/core/cfbfillrect.c
+++ b/drivers/video/fbdev/core/cfbfillrect.c
@@ -287,6 +287,9 @@ void cfb_fillrect(struct fb_info *p, const struct fb_fillrect *rect)
 	if (p->state != FBINFO_STATE_RUNNING)
 		return;
 
+	if (p->flags & FBINFO_VIRTFB)
+		fb_warn_once(p, "Framebuffer is not in I/O address space.");
+
 	if (p->fix.visual == FB_VISUAL_TRUECOLOR ||
 	    p->fix.visual == FB_VISUAL_DIRECTCOLOR )
 		fg = ((u32 *) (p->pseudo_palette))[rect->color];
diff --git a/drivers/video/fbdev/core/cfbimgblt.c b/drivers/video/fbdev/core/cfbimgblt.c
index 9ebda4e0dc7a..7d1d2f1a627d 100644
--- a/drivers/video/fbdev/core/cfbimgblt.c
+++ b/drivers/video/fbdev/core/cfbimgblt.c
@@ -326,6 +326,9 @@ void cfb_imageblit(struct fb_info *p, const struct fb_image *image)
 	if (p->state != FBINFO_STATE_RUNNING)
 		return;
 
+	if (p->flags & FBINFO_VIRTFB)
+		fb_warn_once(p, "Framebuffer is not in I/O address space.");
+
 	bitstart = (dy * p->fix.line_length * 8) + (dx * bpp);
 	start_index = bitstart & (32 - 1);
 	pitch_index = (p->fix.line_length & (bpl - 1)) * 8;
diff --git a/drivers/video/fbdev/core/fb_io_fops.c b/drivers/video/fbdev/core/fb_io_fops.c
index 60805e43914e..3408ff1b2b7a 100644
--- a/drivers/video/fbdev/core/fb_io_fops.c
+++ b/drivers/video/fbdev/core/fb_io_fops.c
@@ -12,6 +12,9 @@ ssize_t fb_io_read(struct fb_info *info, char __user *buf, size_t count, loff_t
 	int c, cnt = 0, err = 0;
 	unsigned long total_size, trailing;
 
+	if (info->flags & FBINFO_VIRTFB)
+		fb_warn_once(info, "Framebuffer is not in I/O address space.");
+
 	if (!info->screen_base)
 		return -ENODEV;
 
@@ -73,6 +76,9 @@ ssize_t fb_io_write(struct fb_info *info, const char __user *buf, size_t count,
 	int c, cnt = 0, err = 0;
 	unsigned long total_size, trailing;
 
+	if (info->flags & FBINFO_VIRTFB)
+		fb_warn_once(info, "Framebuffer is not in I/O address space.");
+
 	if (!info->screen_base)
 		return -ENODEV;
 
@@ -138,6 +144,9 @@ int fb_io_mmap(struct fb_info *info, struct vm_area_struct *vma)
 	u32 len = info->fix.smem_len;
 	unsigned long mmio_pgoff = PAGE_ALIGN((start & ~PAGE_MASK) + len) >> PAGE_SHIFT;
 
+	if (info->flags & FBINFO_VIRTFB)
+		fb_warn_once(info, "Framebuffer is not in I/O address space.");
+
 	/*
 	 * This can be either the framebuffer mapping, or if pgoff points
 	 * past it, the mmio mapping.
diff --git a/drivers/video/fbdev/core/fb_sys_fops.c b/drivers/video/fbdev/core/fb_sys_fops.c
index 0cb0989abda6..a9aa6519a5b3 100644
--- a/drivers/video/fbdev/core/fb_sys_fops.c
+++ b/drivers/video/fbdev/core/fb_sys_fops.c
@@ -22,6 +22,9 @@ ssize_t fb_sys_read(struct fb_info *info, char __user *buf, size_t count,
 	unsigned long total_size, c;
 	ssize_t ret;
 
+	if (!(info->flags & FBINFO_VIRTFB))
+		fb_warn_once(info, "Framebuffer is not in virtual address space.");
+
 	if (!info->screen_buffer)
 		return -ENODEV;
 
@@ -64,6 +67,9 @@ ssize_t fb_sys_write(struct fb_info *info, const char __user *buf,
 	unsigned long total_size, c;
 	size_t ret;
 
+	if (!(info->flags & FBINFO_VIRTFB))
+		fb_warn_once(info, "Framebuffer is not in virtual address space.");
+
 	if (!info->screen_buffer)
 		return -ENODEV;
 
diff --git a/drivers/video/fbdev/core/syscopyarea.c b/drivers/video/fbdev/core/syscopyarea.c
index 7b8bd3a2bedc..75e7001e8450 100644
--- a/drivers/video/fbdev/core/syscopyarea.c
+++ b/drivers/video/fbdev/core/syscopyarea.c
@@ -324,6 +324,9 @@ void sys_copyarea(struct fb_info *p, const struct fb_copyarea *area)
 	if (p->state != FBINFO_STATE_RUNNING)
 		return;
 
+	if (!(p->flags & FBINFO_VIRTFB))
+		fb_warn_once(p, "Framebuffer is not in virtual address space.");
+
 	/* if the beginning of the target area might overlap with the end of
 	the source area, be have to copy the area reverse. */
 	if ((dy == sy && dx > sx) || (dy > sy)) {
diff --git a/drivers/video/fbdev/core/sysfillrect.c b/drivers/video/fbdev/core/sysfillrect.c
index bcdcaeae6538..e49221a88ccc 100644
--- a/drivers/video/fbdev/core/sysfillrect.c
+++ b/drivers/video/fbdev/core/sysfillrect.c
@@ -242,6 +242,9 @@ void sys_fillrect(struct fb_info *p, const struct fb_fillrect *rect)
 	if (p->state != FBINFO_STATE_RUNNING)
 		return;
 
+	if (!(p->flags & FBINFO_VIRTFB))
+		fb_warn_once(p, "Framebuffer is not in virtual address space.");
+
 	if (p->fix.visual == FB_VISUAL_TRUECOLOR ||
 	    p->fix.visual == FB_VISUAL_DIRECTCOLOR )
 		fg = ((u32 *) (p->pseudo_palette))[rect->color];
diff --git a/drivers/video/fbdev/core/sysimgblt.c b/drivers/video/fbdev/core/sysimgblt.c
index 665ef7a0a249..6949bbd51d92 100644
--- a/drivers/video/fbdev/core/sysimgblt.c
+++ b/drivers/video/fbdev/core/sysimgblt.c
@@ -296,6 +296,9 @@ void sys_imageblit(struct fb_info *p, const struct fb_image *image)
 	if (p->state != FBINFO_STATE_RUNNING)
 		return;
 
+	if (!(p->flags & FBINFO_VIRTFB))
+		fb_warn_once(p, "Framebuffer is not in virtual address space.");
+
 	bitstart = (dy * p->fix.line_length * 8) + (dx * bpp);
 	start_index = bitstart & (32 - 1);
 	pitch_index = (p->fix.line_length & (bpl - 1)) * 8;
diff --git a/include/linux/fb.h b/include/linux/fb.h
index a36d05b576b0..24f0ec366235 100644
--- a/include/linux/fb.h
+++ b/include/linux/fb.h
@@ -849,7 +849,10 @@ static inline bool fb_modesetting_disabled(const char *drvname)
 }
 #endif
 
-/* Convenience logging macros */
+/*
+ * Convenience logging macros
+ */
+
 #define fb_err(fb_info, fmt, ...)					\
 	pr_err("fb%d: " fmt, (fb_info)->node, ##__VA_ARGS__)
 #define fb_notice(info, fmt, ...)					\
@@ -861,4 +864,7 @@ static inline bool fb_modesetting_disabled(const char *drvname)
 #define fb_dbg(fb_info, fmt, ...)					\
 	pr_debug("fb%d: " fmt, (fb_info)->node, ##__VA_ARGS__)
 
+#define fb_warn_once(fb_info, fmt, ...)					\
+	pr_warn_once("fb%d: " fmt, (fb_info)->node, ##__VA_ARGS__)
+
 #endif /* _LINUX_FB_H */
-- 
cgit v1.2.3


From 8813e86f6d82a7931446c3cbc5d596f77d0f1ba6 Mon Sep 17 00:00:00 2001
From: Thomas Zimmermann <tzimmermann@suse.de>
Date: Mon, 27 Nov 2023 14:16:01 +0100
Subject: fbdev: Remove default file-I/O implementations

Drop the default implementations for file read, write and mmap
operations. Each fbdev driver must now provide an implementation
and select any necessary helpers. If no implementation has been
set, fbdev returns an errno code to user space. The code is the
same as if the operation had not been set in the file_operations
struct.

This change makes the fbdev helpers for I/O memory optional. Most
systems only use system-memory framebuffers via DRM's fbdev emulation.

v2:
	* warn once if I/O callbacks are missing (Javier)

Signed-off-by: Thomas Zimmermann <tzimmermann@suse.de>
Reviewed-by: Javier Martinez Canillas <javierm@redhat.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20231127131655.4020-33-tzimmermann@suse.de
---
 drivers/video/fbdev/core/Kconfig     |  1 -
 drivers/video/fbdev/core/fb_chrdev.c | 37 ++++++++++++------------------------
 include/linux/fb.h                   |  5 +++++
 3 files changed, 17 insertions(+), 26 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/video/fbdev/core/Kconfig b/drivers/video/fbdev/core/Kconfig
index faab5d50cac3..21053bf00dc5 100644
--- a/drivers/video/fbdev/core/Kconfig
+++ b/drivers/video/fbdev/core/Kconfig
@@ -4,7 +4,6 @@
 #
 
 config FB_CORE
-	select FB_IOMEM_FOPS
 	select VIDEO_CMDLINE
 	tristate
 
diff --git a/drivers/video/fbdev/core/fb_chrdev.c b/drivers/video/fbdev/core/fb_chrdev.c
index 089441c9d810..4ebd16b7e3b8 100644
--- a/drivers/video/fbdev/core/fb_chrdev.c
+++ b/drivers/video/fbdev/core/fb_chrdev.c
@@ -34,13 +34,13 @@ static ssize_t fb_read(struct file *file, char __user *buf, size_t count, loff_t
 	if (!info)
 		return -ENODEV;
 
+	if (fb_WARN_ON_ONCE(info, !info->fbops->fb_read))
+		return -EINVAL;
+
 	if (info->state != FBINFO_STATE_RUNNING)
 		return -EPERM;
 
-	if (info->fbops->fb_read)
-		return info->fbops->fb_read(info, buf, count, ppos);
-
-	return fb_io_read(info, buf, count, ppos);
+	return info->fbops->fb_read(info, buf, count, ppos);
 }
 
 static ssize_t fb_write(struct file *file, const char __user *buf, size_t count, loff_t *ppos)
@@ -50,13 +50,13 @@ static ssize_t fb_write(struct file *file, const char __user *buf, size_t count,
 	if (!info)
 		return -ENODEV;
 
+	if (fb_WARN_ON_ONCE(info, !info->fbops->fb_write))
+		return -EINVAL;
+
 	if (info->state != FBINFO_STATE_RUNNING)
 		return -EPERM;
 
-	if (info->fbops->fb_write)
-		return info->fbops->fb_write(info, buf, count, ppos);
-
-	return fb_io_write(info, buf, count, ppos);
+	return info->fbops->fb_write(info, buf, count, ppos);
 }
 
 static long do_fb_ioctl(struct fb_info *info, unsigned int cmd,
@@ -319,24 +319,11 @@ static int fb_mmap(struct file *file, struct vm_area_struct *vma)
 	if (!info)
 		return -ENODEV;
 
-	mutex_lock(&info->mm_lock);
-
-	if (info->fbops->fb_mmap) {
-
-		res = info->fbops->fb_mmap(info, vma);
-#if IS_ENABLED(CONFIG_FB_DEFERRED_IO)
-	} else if (info->fbdefio) {
-		/*
-		 * FB deferred I/O wants you to handle mmap in your drivers. At a
-		 * minimum, point struct fb_ops.fb_mmap to fb_deferred_io_mmap().
-		 */
-		dev_warn_once(info->dev, "fbdev mmap not set up for deferred I/O.\n");
-		res = -ENODEV;
-#endif
-	} else {
-		res = fb_io_mmap(info, vma);
-	}
+	if (fb_WARN_ON_ONCE(info, !info->fbops->fb_mmap))
+		return -ENODEV;
 
+	mutex_lock(&info->mm_lock);
+	res = info->fbops->fb_mmap(info, vma);
 	mutex_unlock(&info->mm_lock);
 
 	return res;
diff --git a/include/linux/fb.h b/include/linux/fb.h
index 24f0ec366235..05dc9624897d 100644
--- a/include/linux/fb.h
+++ b/include/linux/fb.h
@@ -867,4 +867,9 @@ static inline bool fb_modesetting_disabled(const char *drvname)
 #define fb_warn_once(fb_info, fmt, ...)					\
 	pr_warn_once("fb%d: " fmt, (fb_info)->node, ##__VA_ARGS__)
 
+#define fb_WARN_ONCE(fb_info, condition, fmt, ...) \
+	WARN_ONCE(condition, "fb%d: " fmt, (fb_info)->node, ##__VA_ARGS__)
+#define fb_WARN_ON_ONCE(fb_info, x) \
+	fb_WARN_ONCE(fb_info, (x), "%s", "fb_WARN_ON_ONCE(" __stringify(x) ")")
+
 #endif /* _LINUX_FB_H */
-- 
cgit v1.2.3


From 5431fdd2c181dd2eac218e45b44deb2925fa48f0 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Sun, 17 Sep 2023 13:24:21 +0200
Subject: ptrace: Convert ptrace_attach() to use lock guards

Created as testing for the conditional guard infrastructure.
Specifically this makes use of the following form:

  scoped_cond_guard (mutex_intr, return -ERESTARTNOINTR,
		     &task->signal->cred_guard_mutex) {
    ...
  }
  ...
  return 0;

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Oleg Nesterov <oleg@redhat.com>
Link: https://lkml.kernel.org/r/20231102110706.568467727%40infradead.org
---
 include/linux/sched/task.h |   2 +
 include/linux/spinlock.h   |  26 +++++++++
 kernel/ptrace.c            | 128 +++++++++++++++++++++------------------------
 3 files changed, 89 insertions(+), 67 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sched/task.h b/include/linux/sched/task.h
index a23af225c898..4f3dca353556 100644
--- a/include/linux/sched/task.h
+++ b/include/linux/sched/task.h
@@ -226,4 +226,6 @@ static inline void task_unlock(struct task_struct *p)
 	spin_unlock(&p->alloc_lock);
 }
 
+DEFINE_GUARD(task_lock, struct task_struct *, task_lock(_T), task_unlock(_T))
+
 #endif /* _LINUX_SCHED_TASK_H */
diff --git a/include/linux/spinlock.h b/include/linux/spinlock.h
index ceb56b39c70f..90bc853cafb6 100644
--- a/include/linux/spinlock.h
+++ b/include/linux/spinlock.h
@@ -548,5 +548,31 @@ DEFINE_LOCK_GUARD_1(spinlock_irqsave, spinlock_t,
 DEFINE_LOCK_GUARD_1_COND(spinlock_irqsave, _try,
 			 spin_trylock_irqsave(_T->lock, _T->flags))
 
+DEFINE_LOCK_GUARD_1(read_lock, rwlock_t,
+		    read_lock(_T->lock),
+		    read_unlock(_T->lock))
+
+DEFINE_LOCK_GUARD_1(read_lock_irq, rwlock_t,
+		    read_lock_irq(_T->lock),
+		    read_unlock_irq(_T->lock))
+
+DEFINE_LOCK_GUARD_1(read_lock_irqsave, rwlock_t,
+		    read_lock_irqsave(_T->lock, _T->flags),
+		    read_unlock_irqrestore(_T->lock, _T->flags),
+		    unsigned long flags)
+
+DEFINE_LOCK_GUARD_1(write_lock, rwlock_t,
+		    write_lock(_T->lock),
+		    write_unlock(_T->lock))
+
+DEFINE_LOCK_GUARD_1(write_lock_irq, rwlock_t,
+		    write_lock_irq(_T->lock),
+		    write_unlock_irq(_T->lock))
+
+DEFINE_LOCK_GUARD_1(write_lock_irqsave, rwlock_t,
+		    write_lock_irqsave(_T->lock, _T->flags),
+		    write_unlock_irqrestore(_T->lock, _T->flags),
+		    unsigned long flags)
+
 #undef __LINUX_INSIDE_SPINLOCK_H
 #endif /* __LINUX_SPINLOCK_H */
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index d8b5e13a2229..5c579fb9a5e3 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -386,6 +386,34 @@ static int check_ptrace_options(unsigned long data)
 	return 0;
 }
 
+static inline void ptrace_set_stopped(struct task_struct *task)
+{
+	guard(spinlock)(&task->sighand->siglock);
+
+	/*
+	 * If the task is already STOPPED, set JOBCTL_TRAP_STOP and
+	 * TRAPPING, and kick it so that it transits to TRACED.  TRAPPING
+	 * will be cleared if the child completes the transition or any
+	 * event which clears the group stop states happens.  We'll wait
+	 * for the transition to complete before returning from this
+	 * function.
+	 *
+	 * This hides STOPPED -> RUNNING -> TRACED transition from the
+	 * attaching thread but a different thread in the same group can
+	 * still observe the transient RUNNING state.  IOW, if another
+	 * thread's WNOHANG wait(2) on the stopped tracee races against
+	 * ATTACH, the wait(2) may fail due to the transient RUNNING.
+	 *
+	 * The following task_is_stopped() test is safe as both transitions
+	 * in and out of STOPPED are protected by siglock.
+	 */
+	if (task_is_stopped(task) &&
+	    task_set_jobctl_pending(task, JOBCTL_TRAP_STOP | JOBCTL_TRAPPING)) {
+		task->jobctl &= ~JOBCTL_STOPPED;
+		signal_wake_up_state(task, __TASK_STOPPED);
+	}
+}
+
 static int ptrace_attach(struct task_struct *task, long request,
 			 unsigned long addr,
 			 unsigned long flags)
@@ -393,17 +421,17 @@ static int ptrace_attach(struct task_struct *task, long request,
 	bool seize = (request == PTRACE_SEIZE);
 	int retval;
 
-	retval = -EIO;
 	if (seize) {
 		if (addr != 0)
-			goto out;
+			return -EIO;
 		/*
 		 * This duplicates the check in check_ptrace_options() because
 		 * ptrace_attach() and ptrace_setoptions() have historically
 		 * used different error codes for unknown ptrace options.
 		 */
 		if (flags & ~(unsigned long)PTRACE_O_MASK)
-			goto out;
+			return -EIO;
+
 		retval = check_ptrace_options(flags);
 		if (retval)
 			return retval;
@@ -414,88 +442,54 @@ static int ptrace_attach(struct task_struct *task, long request,
 
 	audit_ptrace(task);
 
-	retval = -EPERM;
 	if (unlikely(task->flags & PF_KTHREAD))
-		goto out;
+		return -EPERM;
 	if (same_thread_group(task, current))
-		goto out;
+		return -EPERM;
 
 	/*
 	 * Protect exec's credential calculations against our interference;
 	 * SUID, SGID and LSM creds get determined differently
 	 * under ptrace.
 	 */
-	retval = -ERESTARTNOINTR;
-	if (mutex_lock_interruptible(&task->signal->cred_guard_mutex))
-		goto out;
+	scoped_cond_guard (mutex_intr, return -ERESTARTNOINTR,
+			   &task->signal->cred_guard_mutex) {
 
-	task_lock(task);
-	retval = __ptrace_may_access(task, PTRACE_MODE_ATTACH_REALCREDS);
-	task_unlock(task);
-	if (retval)
-		goto unlock_creds;
+		scoped_guard (task_lock, task) {
+			retval = __ptrace_may_access(task, PTRACE_MODE_ATTACH_REALCREDS);
+			if (retval)
+				return retval;
+		}
 
-	write_lock_irq(&tasklist_lock);
-	retval = -EPERM;
-	if (unlikely(task->exit_state))
-		goto unlock_tasklist;
-	if (task->ptrace)
-		goto unlock_tasklist;
+		scoped_guard (write_lock_irq, &tasklist_lock) {
+			if (unlikely(task->exit_state))
+				return -EPERM;
+			if (task->ptrace)
+				return -EPERM;
 
-	task->ptrace = flags;
+			task->ptrace = flags;
 
-	ptrace_link(task, current);
+			ptrace_link(task, current);
 
-	/* SEIZE doesn't trap tracee on attach */
-	if (!seize)
-		send_sig_info(SIGSTOP, SEND_SIG_PRIV, task);
+			/* SEIZE doesn't trap tracee on attach */
+			if (!seize)
+				send_sig_info(SIGSTOP, SEND_SIG_PRIV, task);
 
-	spin_lock(&task->sighand->siglock);
+			ptrace_set_stopped(task);
+		}
+	}
 
 	/*
-	 * If the task is already STOPPED, set JOBCTL_TRAP_STOP and
-	 * TRAPPING, and kick it so that it transits to TRACED.  TRAPPING
-	 * will be cleared if the child completes the transition or any
-	 * event which clears the group stop states happens.  We'll wait
-	 * for the transition to complete before returning from this
-	 * function.
-	 *
-	 * This hides STOPPED -> RUNNING -> TRACED transition from the
-	 * attaching thread but a different thread in the same group can
-	 * still observe the transient RUNNING state.  IOW, if another
-	 * thread's WNOHANG wait(2) on the stopped tracee races against
-	 * ATTACH, the wait(2) may fail due to the transient RUNNING.
-	 *
-	 * The following task_is_stopped() test is safe as both transitions
-	 * in and out of STOPPED are protected by siglock.
+	 * We do not bother to change retval or clear JOBCTL_TRAPPING
+	 * if wait_on_bit() was interrupted by SIGKILL. The tracer will
+	 * not return to user-mode, it will exit and clear this bit in
+	 * __ptrace_unlink() if it wasn't already cleared by the tracee;
+	 * and until then nobody can ptrace this task.
 	 */
-	if (task_is_stopped(task) &&
-	    task_set_jobctl_pending(task, JOBCTL_TRAP_STOP | JOBCTL_TRAPPING)) {
-		task->jobctl &= ~JOBCTL_STOPPED;
-		signal_wake_up_state(task, __TASK_STOPPED);
-	}
-
-	spin_unlock(&task->sighand->siglock);
-
-	retval = 0;
-unlock_tasklist:
-	write_unlock_irq(&tasklist_lock);
-unlock_creds:
-	mutex_unlock(&task->signal->cred_guard_mutex);
-out:
-	if (!retval) {
-		/*
-		 * We do not bother to change retval or clear JOBCTL_TRAPPING
-		 * if wait_on_bit() was interrupted by SIGKILL. The tracer will
-		 * not return to user-mode, it will exit and clear this bit in
-		 * __ptrace_unlink() if it wasn't already cleared by the tracee;
-		 * and until then nobody can ptrace this task.
-		 */
-		wait_on_bit(&task->jobctl, JOBCTL_TRAPPING_BIT, TASK_KILLABLE);
-		proc_ptrace_connector(task, PTRACE_ATTACH);
-	}
+	wait_on_bit(&task->jobctl, JOBCTL_TRAPPING_BIT, TASK_KILLABLE);
+	proc_ptrace_connector(task, PTRACE_ATTACH);
 
-	return retval;
+	return 0;
 }
 
 /**
-- 
cgit v1.2.3


From febab20caebac959fdc3d7520bc52de8b1184455 Mon Sep 17 00:00:00 2001
From: Wyes Karny <wyes.karny@amd.com>
Date: Fri, 17 Nov 2023 06:38:39 +0000
Subject: cpufreq/amd-pstate: Fix scaling_min_freq and scaling_max_freq update

When amd_pstate is running, writing to scaling_min_freq and
scaling_max_freq has no effect. These values are only passed to the
policy level, but not to the platform level. This means that the
platform does not know about the frequency limits set by the user.

To fix this, update the min_perf and max_perf values at the platform
level whenever the user changes the scaling_min_freq and scaling_max_freq
values.

Fixes: ffa5096a7c33 ("cpufreq: amd-pstate: implement Pstate EPP support for the AMD processors")
Acked-by: Huang Rui <ray.huang@amd.com>
Signed-off-by: Wyes Karny <wyes.karny@amd.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/cpufreq/amd-pstate.c | 60 ++++++++++++++++++++++++++++++++++----------
 include/linux/amd-pstate.h   |  4 +++
 2 files changed, 51 insertions(+), 13 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/cpufreq/amd-pstate.c b/drivers/cpufreq/amd-pstate.c
index 300f81d36291..3313d1d2c6dd 100644
--- a/drivers/cpufreq/amd-pstate.c
+++ b/drivers/cpufreq/amd-pstate.c
@@ -307,11 +307,11 @@ static int pstate_init_perf(struct amd_cpudata *cpudata)
 		highest_perf = AMD_CPPC_HIGHEST_PERF(cap1);
 
 	WRITE_ONCE(cpudata->highest_perf, highest_perf);
-
+	WRITE_ONCE(cpudata->max_limit_perf, highest_perf);
 	WRITE_ONCE(cpudata->nominal_perf, AMD_CPPC_NOMINAL_PERF(cap1));
 	WRITE_ONCE(cpudata->lowest_nonlinear_perf, AMD_CPPC_LOWNONLIN_PERF(cap1));
 	WRITE_ONCE(cpudata->lowest_perf, AMD_CPPC_LOWEST_PERF(cap1));
-
+	WRITE_ONCE(cpudata->min_limit_perf, AMD_CPPC_LOWEST_PERF(cap1));
 	return 0;
 }
 
@@ -329,11 +329,12 @@ static int cppc_init_perf(struct amd_cpudata *cpudata)
 		highest_perf = cppc_perf.highest_perf;
 
 	WRITE_ONCE(cpudata->highest_perf, highest_perf);
-
+	WRITE_ONCE(cpudata->max_limit_perf, highest_perf);
 	WRITE_ONCE(cpudata->nominal_perf, cppc_perf.nominal_perf);
 	WRITE_ONCE(cpudata->lowest_nonlinear_perf,
 		   cppc_perf.lowest_nonlinear_perf);
 	WRITE_ONCE(cpudata->lowest_perf, cppc_perf.lowest_perf);
+	WRITE_ONCE(cpudata->min_limit_perf, cppc_perf.lowest_perf);
 
 	if (cppc_state == AMD_PSTATE_ACTIVE)
 		return 0;
@@ -432,6 +433,10 @@ static void amd_pstate_update(struct amd_cpudata *cpudata, u32 min_perf,
 	u64 prev = READ_ONCE(cpudata->cppc_req_cached);
 	u64 value = prev;
 
+	min_perf = clamp_t(unsigned long, min_perf, cpudata->min_limit_perf,
+			cpudata->max_limit_perf);
+	max_perf = clamp_t(unsigned long, max_perf, cpudata->min_limit_perf,
+			cpudata->max_limit_perf);
 	des_perf = clamp_t(unsigned long, des_perf, min_perf, max_perf);
 
 	if ((cppc_state == AMD_PSTATE_GUIDED) && (gov_flags & CPUFREQ_GOV_DYNAMIC_SWITCHING)) {
@@ -470,6 +475,22 @@ static int amd_pstate_verify(struct cpufreq_policy_data *policy)
 	return 0;
 }
 
+static int amd_pstate_update_min_max_limit(struct cpufreq_policy *policy)
+{
+	u32 max_limit_perf, min_limit_perf;
+	struct amd_cpudata *cpudata = policy->driver_data;
+
+	max_limit_perf = div_u64(policy->max * cpudata->highest_perf, cpudata->max_freq);
+	min_limit_perf = div_u64(policy->min * cpudata->highest_perf, cpudata->max_freq);
+
+	WRITE_ONCE(cpudata->max_limit_perf, max_limit_perf);
+	WRITE_ONCE(cpudata->min_limit_perf, min_limit_perf);
+	WRITE_ONCE(cpudata->max_limit_freq, policy->max);
+	WRITE_ONCE(cpudata->min_limit_freq, policy->min);
+
+	return 0;
+}
+
 static int amd_pstate_update_freq(struct cpufreq_policy *policy,
 				  unsigned int target_freq, bool fast_switch)
 {
@@ -480,6 +501,9 @@ static int amd_pstate_update_freq(struct cpufreq_policy *policy,
 	if (!cpudata->max_freq)
 		return -ENODEV;
 
+	if (policy->min != cpudata->min_limit_freq || policy->max != cpudata->max_limit_freq)
+		amd_pstate_update_min_max_limit(policy);
+
 	cap_perf = READ_ONCE(cpudata->highest_perf);
 	min_perf = READ_ONCE(cpudata->lowest_perf);
 	max_perf = cap_perf;
@@ -534,6 +558,10 @@ static void amd_pstate_adjust_perf(unsigned int cpu,
 	struct amd_cpudata *cpudata = policy->driver_data;
 	unsigned int target_freq;
 
+	if (policy->min != cpudata->min_limit_freq || policy->max != cpudata->max_limit_freq)
+		amd_pstate_update_min_max_limit(policy);
+
+
 	cap_perf = READ_ONCE(cpudata->highest_perf);
 	lowest_nonlinear_perf = READ_ONCE(cpudata->lowest_nonlinear_perf);
 	max_freq = READ_ONCE(cpudata->max_freq);
@@ -747,6 +775,8 @@ static int amd_pstate_cpu_init(struct cpufreq_policy *policy)
 	/* Initial processor data capability frequencies */
 	cpudata->max_freq = max_freq;
 	cpudata->min_freq = min_freq;
+	cpudata->max_limit_freq = max_freq;
+	cpudata->min_limit_freq = min_freq;
 	cpudata->nominal_freq = nominal_freq;
 	cpudata->lowest_nonlinear_freq = lowest_nonlinear_freq;
 
@@ -1185,16 +1215,25 @@ static int amd_pstate_epp_cpu_exit(struct cpufreq_policy *policy)
 	return 0;
 }
 
-static void amd_pstate_epp_init(unsigned int cpu)
+static void amd_pstate_epp_update_limit(struct cpufreq_policy *policy)
 {
-	struct cpufreq_policy *policy = cpufreq_cpu_get(cpu);
 	struct amd_cpudata *cpudata = policy->driver_data;
-	u32 max_perf, min_perf;
+	u32 max_perf, min_perf, min_limit_perf, max_limit_perf;
 	u64 value;
 	s16 epp;
 
 	max_perf = READ_ONCE(cpudata->highest_perf);
 	min_perf = READ_ONCE(cpudata->lowest_perf);
+	max_limit_perf = div_u64(policy->max * cpudata->highest_perf, cpudata->max_freq);
+	min_limit_perf = div_u64(policy->min * cpudata->highest_perf, cpudata->max_freq);
+
+	max_perf = clamp_t(unsigned long, max_perf, cpudata->min_limit_perf,
+			cpudata->max_limit_perf);
+	min_perf = clamp_t(unsigned long, min_perf, cpudata->min_limit_perf,
+			cpudata->max_limit_perf);
+
+	WRITE_ONCE(cpudata->max_limit_perf, max_limit_perf);
+	WRITE_ONCE(cpudata->min_limit_perf, min_limit_perf);
 
 	value = READ_ONCE(cpudata->cppc_req_cached);
 
@@ -1212,9 +1251,6 @@ static void amd_pstate_epp_init(unsigned int cpu)
 	value &= ~AMD_CPPC_DES_PERF(~0L);
 	value |= AMD_CPPC_DES_PERF(0);
 
-	if (cpudata->epp_policy == cpudata->policy)
-		goto skip_epp;
-
 	cpudata->epp_policy = cpudata->policy;
 
 	/* Get BIOS pre-defined epp value */
@@ -1224,7 +1260,7 @@ static void amd_pstate_epp_init(unsigned int cpu)
 		 * This return value can only be negative for shared_memory
 		 * systems where EPP register read/write not supported.
 		 */
-		goto skip_epp;
+		return;
 	}
 
 	if (cpudata->policy == CPUFREQ_POLICY_PERFORMANCE)
@@ -1238,8 +1274,6 @@ static void amd_pstate_epp_init(unsigned int cpu)
 
 	WRITE_ONCE(cpudata->cppc_req_cached, value);
 	amd_pstate_set_epp(cpudata, epp);
-skip_epp:
-	cpufreq_cpu_put(policy);
 }
 
 static int amd_pstate_epp_set_policy(struct cpufreq_policy *policy)
@@ -1254,7 +1288,7 @@ static int amd_pstate_epp_set_policy(struct cpufreq_policy *policy)
 
 	cpudata->policy = policy->policy;
 
-	amd_pstate_epp_init(policy->cpu);
+	amd_pstate_epp_update_limit(policy);
 
 	return 0;
 }
diff --git a/include/linux/amd-pstate.h b/include/linux/amd-pstate.h
index 446394f84606..6ad02ad9c7b4 100644
--- a/include/linux/amd-pstate.h
+++ b/include/linux/amd-pstate.h
@@ -70,6 +70,10 @@ struct amd_cpudata {
 	u32	nominal_perf;
 	u32	lowest_nonlinear_perf;
 	u32	lowest_perf;
+	u32     min_limit_perf;
+	u32     max_limit_perf;
+	u32     min_limit_freq;
+	u32     max_limit_freq;
 
 	u32	max_freq;
 	u32	min_freq;
-- 
cgit v1.2.3


From 48eb03dd26304c24f03bdbb9382e89c8564e71df Mon Sep 17 00:00:00 2001
From: Stanislav Fomichev <sdf@google.com>
Date: Mon, 27 Nov 2023 11:03:08 -0800
Subject: xsk: Add TX timestamp and TX checksum offload support

This change actually defines the (initial) metadata layout
that should be used by AF_XDP userspace (xsk_tx_metadata).
The first field is flags which requests appropriate offloads,
followed by the offload-specific fields. The supported per-device
offloads are exported via netlink (new xsk-flags).

The offloads themselves are still implemented in a bit of a
framework-y fashion that's left from my initial kfunc attempt.
I'm introducing new xsk_tx_metadata_ops which drivers are
supposed to implement. The drivers are also supposed
to call xsk_tx_metadata_request/xsk_tx_metadata_complete in
the right places. Since xsk_tx_metadata_{request,_complete}
are static inline, we don't incur any extra overhead doing
indirect calls.

The benefit of this scheme is as follows:
- keeps all metadata layout parsing away from driver code
- makes it easy to grep and see which drivers implement what
- don't need any extra flags to maintain to keep track of what
  offloads are implemented; if the callback is implemented - the offload
  is supported (used by netlink reporting code)

Two offloads are defined right now:
1. XDP_TXMD_FLAGS_CHECKSUM: skb-style csum_start+csum_offset
2. XDP_TXMD_FLAGS_TIMESTAMP: writes TX timestamp back into metadata
   area upon completion (tx_timestamp field)

XDP_TXMD_FLAGS_TIMESTAMP is also implemented for XDP_COPY mode: it writes
SW timestamp from the skb destructor (note I'm reusing hwtstamps to pass
metadata pointer).

The struct is forward-compatible and can be extended in the future
by appending more fields.

Reviewed-by: Song Yoong Siang <yoong.siang.song@intel.com>
Signed-off-by: Stanislav Fomichev <sdf@google.com>
Acked-by: Jakub Kicinski <kuba@kernel.org>
Link: https://lore.kernel.org/r/20231127190319.1190813-3-sdf@google.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 Documentation/netlink/specs/netdev.yaml |  19 +++++-
 include/linux/netdevice.h               |   2 +
 include/linux/skbuff.h                  |  14 +++-
 include/net/xdp_sock.h                  | 110 ++++++++++++++++++++++++++++++++
 include/net/xdp_sock_drv.h              |  13 ++++
 include/net/xsk_buff_pool.h             |   6 ++
 include/uapi/linux/if_xdp.h             |  38 +++++++++++
 include/uapi/linux/netdev.h             |  16 +++++
 net/core/netdev-genl.c                  |  13 +++-
 net/xdp/xsk.c                           |  34 ++++++++++
 net/xdp/xsk_queue.h                     |   2 +-
 tools/include/uapi/linux/if_xdp.h       |  52 +++++++++++++--
 tools/include/uapi/linux/netdev.h       |  16 +++++
 tools/net/ynl/generated/netdev-user.c   |  19 ++++++
 tools/net/ynl/generated/netdev-user.h   |   3 +
 15 files changed, 348 insertions(+), 9 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/netlink/specs/netdev.yaml b/Documentation/netlink/specs/netdev.yaml
index 14511b13f305..00439bcbd2e3 100644
--- a/Documentation/netlink/specs/netdev.yaml
+++ b/Documentation/netlink/specs/netdev.yaml
@@ -45,7 +45,6 @@ definitions:
   -
     type: flags
     name: xdp-rx-metadata
-    render-max: true
     entries:
       -
         name: timestamp
@@ -55,6 +54,18 @@ definitions:
         name: hash
         doc:
           Device is capable of exposing receive packet hash via bpf_xdp_metadata_rx_hash().
+  -
+    type: flags
+    name: xsk-flags
+    entries:
+      -
+        name: tx-timestamp
+        doc:
+          HW timestamping egress packets is supported by the driver.
+      -
+        name: tx-checksum
+        doc:
+          L3 checksum HW offload is supported by the driver.
 
 attribute-sets:
   -
@@ -86,6 +97,11 @@ attribute-sets:
              See Documentation/networking/xdp-rx-metadata.rst for more details.
         type: u64
         enum: xdp-rx-metadata
+      -
+        name: xsk-features
+        doc: Bitmask of enabled AF_XDP features.
+        type: u64
+        enum: xsk-flags
 
 operations:
   list:
@@ -103,6 +119,7 @@ operations:
             - xdp-features
             - xdp-zc-max-segs
             - xdp-rx-metadata-features
+            - xsk-features
       dump:
         reply: *dev-all
     -
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index e87caa81f70c..08da8b28c816 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -1865,6 +1865,7 @@ enum netdev_stat_type {
  *	@netdev_ops:	Includes several pointers to callbacks,
  *			if one wants to override the ndo_*() functions
  *	@xdp_metadata_ops:	Includes pointers to XDP metadata callbacks.
+ *	@xsk_tx_metadata_ops:	Includes pointers to AF_XDP TX metadata callbacks.
  *	@ethtool_ops:	Management operations
  *	@l3mdev_ops:	Layer 3 master device operations
  *	@ndisc_ops:	Includes callbacks for different IPv6 neighbour
@@ -2128,6 +2129,7 @@ struct net_device {
 	unsigned long long	priv_flags;
 	const struct net_device_ops *netdev_ops;
 	const struct xdp_metadata_ops *xdp_metadata_ops;
+	const struct xsk_tx_metadata_ops *xsk_tx_metadata_ops;
 	int			ifindex;
 	unsigned short		gflags;
 	unsigned short		hard_header_len;
diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 27998f73183e..b370eb8d70f7 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -566,6 +566,15 @@ struct ubuf_info_msgzc {
 int mm_account_pinned_pages(struct mmpin *mmp, size_t size);
 void mm_unaccount_pinned_pages(struct mmpin *mmp);
 
+/* Preserve some data across TX submission and completion.
+ *
+ * Note, this state is stored in the driver. Extending the layout
+ * might need some special care.
+ */
+struct xsk_tx_metadata_compl {
+	__u64 *tx_timestamp;
+};
+
 /* This data is invariant across clones and lives at
  * the end of the header data, ie. at skb->end.
  */
@@ -578,7 +587,10 @@ struct skb_shared_info {
 	/* Warning: this field is not always filled in (UFO)! */
 	unsigned short	gso_segs;
 	struct sk_buff	*frag_list;
-	struct skb_shared_hwtstamps hwtstamps;
+	union {
+		struct skb_shared_hwtstamps hwtstamps;
+		struct xsk_tx_metadata_compl xsk_meta;
+	};
 	unsigned int	gso_type;
 	u32		tskey;
 
diff --git a/include/net/xdp_sock.h b/include/net/xdp_sock.h
index bcf765124f72..3cb4dc9bd70e 100644
--- a/include/net/xdp_sock.h
+++ b/include/net/xdp_sock.h
@@ -93,12 +93,105 @@ struct xdp_sock {
 	struct xsk_queue *cq_tmp; /* Only as tmp storage before bind */
 };
 
+/*
+ * AF_XDP TX metadata hooks for network devices.
+ * The following hooks can be defined; unless noted otherwise, they are
+ * optional and can be filled with a null pointer.
+ *
+ * void (*tmo_request_timestamp)(void *priv)
+ *     Called when AF_XDP frame requested egress timestamp.
+ *
+ * u64 (*tmo_fill_timestamp)(void *priv)
+ *     Called when AF_XDP frame, that had requested egress timestamp,
+ *     received a completion. The hook needs to return the actual HW timestamp.
+ *
+ * void (*tmo_request_checksum)(u16 csum_start, u16 csum_offset, void *priv)
+ *     Called when AF_XDP frame requested HW checksum offload. csum_start
+ *     indicates position where checksumming should start.
+ *     csum_offset indicates position where checksum should be stored.
+ *
+ */
+struct xsk_tx_metadata_ops {
+	void	(*tmo_request_timestamp)(void *priv);
+	u64	(*tmo_fill_timestamp)(void *priv);
+	void	(*tmo_request_checksum)(u16 csum_start, u16 csum_offset, void *priv);
+};
+
 #ifdef CONFIG_XDP_SOCKETS
 
 int xsk_generic_rcv(struct xdp_sock *xs, struct xdp_buff *xdp);
 int __xsk_map_redirect(struct xdp_sock *xs, struct xdp_buff *xdp);
 void __xsk_map_flush(void);
 
+/**
+ *  xsk_tx_metadata_to_compl - Save enough relevant metadata information
+ *  to perform tx completion in the future.
+ *  @meta: pointer to AF_XDP metadata area
+ *  @compl: pointer to output struct xsk_tx_metadata_to_compl
+ *
+ *  This function should be called by the networking device when
+ *  it prepares AF_XDP egress packet. The value of @compl should be stored
+ *  and passed to xsk_tx_metadata_complete upon TX completion.
+ */
+static inline void xsk_tx_metadata_to_compl(struct xsk_tx_metadata *meta,
+					    struct xsk_tx_metadata_compl *compl)
+{
+	if (!meta)
+		return;
+
+	if (meta->flags & XDP_TXMD_FLAGS_TIMESTAMP)
+		compl->tx_timestamp = &meta->completion.tx_timestamp;
+	else
+		compl->tx_timestamp = NULL;
+}
+
+/**
+ *  xsk_tx_metadata_request - Evaluate AF_XDP TX metadata at submission
+ *  and call appropriate xsk_tx_metadata_ops operation.
+ *  @meta: pointer to AF_XDP metadata area
+ *  @ops: pointer to struct xsk_tx_metadata_ops
+ *  @priv: pointer to driver-private aread
+ *
+ *  This function should be called by the networking device when
+ *  it prepares AF_XDP egress packet.
+ */
+static inline void xsk_tx_metadata_request(const struct xsk_tx_metadata *meta,
+					   const struct xsk_tx_metadata_ops *ops,
+					   void *priv)
+{
+	if (!meta)
+		return;
+
+	if (ops->tmo_request_timestamp)
+		if (meta->flags & XDP_TXMD_FLAGS_TIMESTAMP)
+			ops->tmo_request_timestamp(priv);
+
+	if (ops->tmo_request_checksum)
+		if (meta->flags & XDP_TXMD_FLAGS_CHECKSUM)
+			ops->tmo_request_checksum(meta->request.csum_start,
+						  meta->request.csum_offset, priv);
+}
+
+/**
+ *  xsk_tx_metadata_complete - Evaluate AF_XDP TX metadata at completion
+ *  and call appropriate xsk_tx_metadata_ops operation.
+ *  @compl: pointer to completion metadata produced from xsk_tx_metadata_to_compl
+ *  @ops: pointer to struct xsk_tx_metadata_ops
+ *  @priv: pointer to driver-private aread
+ *
+ *  This function should be called by the networking device upon
+ *  AF_XDP egress completion.
+ */
+static inline void xsk_tx_metadata_complete(struct xsk_tx_metadata_compl *compl,
+					    const struct xsk_tx_metadata_ops *ops,
+					    void *priv)
+{
+	if (!compl)
+		return;
+
+	*compl->tx_timestamp = ops->tmo_fill_timestamp(priv);
+}
+
 #else
 
 static inline int xsk_generic_rcv(struct xdp_sock *xs, struct xdp_buff *xdp)
@@ -115,6 +208,23 @@ static inline void __xsk_map_flush(void)
 {
 }
 
+static inline void xsk_tx_metadata_to_compl(struct xsk_tx_metadata *meta,
+					    struct xsk_tx_metadata_compl *compl)
+{
+}
+
+static inline void xsk_tx_metadata_request(struct xsk_tx_metadata *meta,
+					   const struct xsk_tx_metadata_ops *ops,
+					   void *priv)
+{
+}
+
+static inline void xsk_tx_metadata_complete(struct xsk_tx_metadata_compl *compl,
+					    const struct xsk_tx_metadata_ops *ops,
+					    void *priv)
+{
+}
+
 #endif /* CONFIG_XDP_SOCKETS */
 
 #if defined(CONFIG_XDP_SOCKETS) && defined(CONFIG_DEBUG_NET)
diff --git a/include/net/xdp_sock_drv.h b/include/net/xdp_sock_drv.h
index 1f6fc8c7a84c..e2558ac3e195 100644
--- a/include/net/xdp_sock_drv.h
+++ b/include/net/xdp_sock_drv.h
@@ -165,6 +165,14 @@ static inline void *xsk_buff_raw_get_data(struct xsk_buff_pool *pool, u64 addr)
 	return xp_raw_get_data(pool, addr);
 }
 
+static inline struct xsk_tx_metadata *xsk_buff_get_metadata(struct xsk_buff_pool *pool, u64 addr)
+{
+	if (!pool->tx_metadata_len)
+		return NULL;
+
+	return xp_raw_get_data(pool, addr) - pool->tx_metadata_len;
+}
+
 static inline void xsk_buff_dma_sync_for_cpu(struct xdp_buff *xdp, struct xsk_buff_pool *pool)
 {
 	struct xdp_buff_xsk *xskb = container_of(xdp, struct xdp_buff_xsk, xdp);
@@ -324,6 +332,11 @@ static inline void *xsk_buff_raw_get_data(struct xsk_buff_pool *pool, u64 addr)
 	return NULL;
 }
 
+static inline struct xsk_tx_metadata *xsk_buff_get_metadata(struct xsk_buff_pool *pool, u64 addr)
+{
+	return NULL;
+}
+
 static inline void xsk_buff_dma_sync_for_cpu(struct xdp_buff *xdp, struct xsk_buff_pool *pool)
 {
 }
diff --git a/include/net/xsk_buff_pool.h b/include/net/xsk_buff_pool.h
index 1985ffaf9b0c..97f5cc10d79e 100644
--- a/include/net/xsk_buff_pool.h
+++ b/include/net/xsk_buff_pool.h
@@ -33,6 +33,7 @@ struct xdp_buff_xsk {
 };
 
 #define XSK_CHECK_PRIV_TYPE(t) BUILD_BUG_ON(sizeof(t) > offsetofend(struct xdp_buff_xsk, cb))
+#define XSK_TX_COMPL_FITS(t) BUILD_BUG_ON(sizeof(struct xsk_tx_metadata_compl) > sizeof(t))
 
 struct xsk_dma_map {
 	dma_addr_t *dma_pages;
@@ -234,4 +235,9 @@ static inline u64 xp_get_handle(struct xdp_buff_xsk *xskb)
 	return xskb->orig_addr + (offset << XSK_UNALIGNED_BUF_OFFSET_SHIFT);
 }
 
+static inline bool xp_tx_metadata_enabled(const struct xsk_buff_pool *pool)
+{
+	return pool->tx_metadata_len > 0;
+}
+
 #endif /* XSK_BUFF_POOL_H_ */
diff --git a/include/uapi/linux/if_xdp.h b/include/uapi/linux/if_xdp.h
index 2ecf79282c26..95de66d5a26c 100644
--- a/include/uapi/linux/if_xdp.h
+++ b/include/uapi/linux/if_xdp.h
@@ -106,6 +106,41 @@ struct xdp_options {
 #define XSK_UNALIGNED_BUF_ADDR_MASK \
 	((1ULL << XSK_UNALIGNED_BUF_OFFSET_SHIFT) - 1)
 
+/* Request transmit timestamp. Upon completion, put it into tx_timestamp
+ * field of struct xsk_tx_metadata.
+ */
+#define XDP_TXMD_FLAGS_TIMESTAMP		(1 << 0)
+
+/* Request transmit checksum offload. Checksum start position and offset
+ * are communicated via csum_start and csum_offset fields of struct
+ * xsk_tx_metadata.
+ */
+#define XDP_TXMD_FLAGS_CHECKSUM			(1 << 1)
+
+/* AF_XDP offloads request. 'request' union member is consumed by the driver
+ * when the packet is being transmitted. 'completion' union member is
+ * filled by the driver when the transmit completion arrives.
+ */
+struct xsk_tx_metadata {
+	__u64 flags;
+
+	union {
+		struct {
+			/* XDP_TXMD_FLAGS_CHECKSUM */
+
+			/* Offset from desc->addr where checksumming should start. */
+			__u16 csum_start;
+			/* Offset from csum_start where checksum should be stored. */
+			__u16 csum_offset;
+		} request;
+
+		struct {
+			/* XDP_TXMD_FLAGS_TIMESTAMP */
+			__u64 tx_timestamp;
+		} completion;
+	};
+};
+
 /* Rx/Tx descriptor */
 struct xdp_desc {
 	__u64 addr;
@@ -122,4 +157,7 @@ struct xdp_desc {
  */
 #define XDP_PKT_CONTD (1 << 0)
 
+/* TX packet carries valid metadata. */
+#define XDP_TX_METADATA (1 << 1)
+
 #endif /* _LINUX_IF_XDP_H */
diff --git a/include/uapi/linux/netdev.h b/include/uapi/linux/netdev.h
index 2943a151d4f1..48d5477a668c 100644
--- a/include/uapi/linux/netdev.h
+++ b/include/uapi/linux/netdev.h
@@ -53,12 +53,28 @@ enum netdev_xdp_rx_metadata {
 	NETDEV_XDP_RX_METADATA_MASK = 3,
 };
 
+/**
+ * enum netdev_xsk_flags
+ * @NETDEV_XSK_FLAGS_TX_TIMESTAMP: HW timestamping egress packets is supported
+ *   by the driver.
+ * @NETDEV_XSK_FLAGS_TX_CHECKSUM: L3 checksum HW offload is supported by the
+ *   driver.
+ */
+enum netdev_xsk_flags {
+	NETDEV_XSK_FLAGS_TX_TIMESTAMP = 1,
+	NETDEV_XSK_FLAGS_TX_CHECKSUM = 2,
+
+	/* private: */
+	NETDEV_XSK_FLAGS_MASK = 3,
+};
+
 enum {
 	NETDEV_A_DEV_IFINDEX = 1,
 	NETDEV_A_DEV_PAD,
 	NETDEV_A_DEV_XDP_FEATURES,
 	NETDEV_A_DEV_XDP_ZC_MAX_SEGS,
 	NETDEV_A_DEV_XDP_RX_METADATA_FEATURES,
+	NETDEV_A_DEV_XSK_FEATURES,
 
 	__NETDEV_A_DEV_MAX,
 	NETDEV_A_DEV_MAX = (__NETDEV_A_DEV_MAX - 1)
diff --git a/net/core/netdev-genl.c b/net/core/netdev-genl.c
index fe61f85bcf33..10f2124e9e23 100644
--- a/net/core/netdev-genl.c
+++ b/net/core/netdev-genl.c
@@ -6,6 +6,7 @@
 #include <net/net_namespace.h>
 #include <net/sock.h>
 #include <net/xdp.h>
+#include <net/xdp_sock.h>
 
 #include "netdev-genl-gen.h"
 
@@ -13,6 +14,7 @@ static int
 netdev_nl_dev_fill(struct net_device *netdev, struct sk_buff *rsp,
 		   const struct genl_info *info)
 {
+	u64 xsk_features = 0;
 	u64 xdp_rx_meta = 0;
 	void *hdr;
 
@@ -26,11 +28,20 @@ netdev_nl_dev_fill(struct net_device *netdev, struct sk_buff *rsp,
 XDP_METADATA_KFUNC_xxx
 #undef XDP_METADATA_KFUNC
 
+	if (netdev->xsk_tx_metadata_ops) {
+		if (netdev->xsk_tx_metadata_ops->tmo_fill_timestamp)
+			xsk_features |= NETDEV_XSK_FLAGS_TX_TIMESTAMP;
+		if (netdev->xsk_tx_metadata_ops->tmo_request_checksum)
+			xsk_features |= NETDEV_XSK_FLAGS_TX_CHECKSUM;
+	}
+
 	if (nla_put_u32(rsp, NETDEV_A_DEV_IFINDEX, netdev->ifindex) ||
 	    nla_put_u64_64bit(rsp, NETDEV_A_DEV_XDP_FEATURES,
 			      netdev->xdp_features, NETDEV_A_DEV_PAD) ||
 	    nla_put_u64_64bit(rsp, NETDEV_A_DEV_XDP_RX_METADATA_FEATURES,
-			      xdp_rx_meta, NETDEV_A_DEV_PAD)) {
+			      xdp_rx_meta, NETDEV_A_DEV_PAD) ||
+	    nla_put_u64_64bit(rsp, NETDEV_A_DEV_XSK_FEATURES,
+			      xsk_features, NETDEV_A_DEV_PAD)) {
 		genlmsg_cancel(rsp, hdr);
 		return -EINVAL;
 	}
diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c
index c904356e2800..e83ade32f1fd 100644
--- a/net/xdp/xsk.c
+++ b/net/xdp/xsk.c
@@ -571,6 +571,13 @@ static u32 xsk_get_num_desc(struct sk_buff *skb)
 
 static void xsk_destruct_skb(struct sk_buff *skb)
 {
+	struct xsk_tx_metadata_compl *compl = &skb_shinfo(skb)->xsk_meta;
+
+	if (compl->tx_timestamp) {
+		/* sw completion timestamp, not a real one */
+		*compl->tx_timestamp = ktime_get_tai_fast_ns();
+	}
+
 	xsk_cq_submit_locked(xdp_sk(skb->sk), xsk_get_num_desc(skb));
 	sock_wfree(skb);
 }
@@ -655,8 +662,10 @@ static struct sk_buff *xsk_build_skb_zerocopy(struct xdp_sock *xs,
 static struct sk_buff *xsk_build_skb(struct xdp_sock *xs,
 				     struct xdp_desc *desc)
 {
+	struct xsk_tx_metadata *meta = NULL;
 	struct net_device *dev = xs->dev;
 	struct sk_buff *skb = xs->skb;
+	bool first_frag = false;
 	int err;
 
 	if (dev->priv_flags & IFF_TX_SKB_NO_LINEAR) {
@@ -687,6 +696,8 @@ static struct sk_buff *xsk_build_skb(struct xdp_sock *xs,
 				kfree_skb(skb);
 				goto free_err;
 			}
+
+			first_frag = true;
 		} else {
 			int nr_frags = skb_shinfo(skb)->nr_frags;
 			struct page *page;
@@ -709,12 +720,35 @@ static struct sk_buff *xsk_build_skb(struct xdp_sock *xs,
 
 			skb_add_rx_frag(skb, nr_frags, page, 0, len, 0);
 		}
+
+		if (first_frag && desc->options & XDP_TX_METADATA) {
+			if (unlikely(xs->pool->tx_metadata_len == 0)) {
+				err = -EINVAL;
+				goto free_err;
+			}
+
+			meta = buffer - xs->pool->tx_metadata_len;
+
+			if (meta->flags & XDP_TXMD_FLAGS_CHECKSUM) {
+				if (unlikely(meta->request.csum_start +
+					     meta->request.csum_offset +
+					     sizeof(__sum16) > len)) {
+					err = -EINVAL;
+					goto free_err;
+				}
+
+				skb->csum_start = hr + meta->request.csum_start;
+				skb->csum_offset = meta->request.csum_offset;
+				skb->ip_summed = CHECKSUM_PARTIAL;
+			}
+		}
 	}
 
 	skb->dev = dev;
 	skb->priority = READ_ONCE(xs->sk.sk_priority);
 	skb->mark = READ_ONCE(xs->sk.sk_mark);
 	skb->destructor = xsk_destruct_skb;
+	xsk_tx_metadata_to_compl(meta, &skb_shinfo(skb)->xsk_meta);
 	xsk_set_destructor_arg(skb);
 
 	return skb;
diff --git a/net/xdp/xsk_queue.h b/net/xdp/xsk_queue.h
index c74a1372bcb9..6f2d1621c992 100644
--- a/net/xdp/xsk_queue.h
+++ b/net/xdp/xsk_queue.h
@@ -137,7 +137,7 @@ static inline bool xskq_cons_read_addr_unchecked(struct xsk_queue *q, u64 *addr)
 
 static inline bool xp_unused_options_set(u32 options)
 {
-	return options & ~XDP_PKT_CONTD;
+	return options & ~(XDP_PKT_CONTD | XDP_TX_METADATA);
 }
 
 static inline bool xp_aligned_validate_desc(struct xsk_buff_pool *pool,
diff --git a/tools/include/uapi/linux/if_xdp.h b/tools/include/uapi/linux/if_xdp.h
index 34411a2e5b6c..d0882edc1642 100644
--- a/tools/include/uapi/linux/if_xdp.h
+++ b/tools/include/uapi/linux/if_xdp.h
@@ -26,11 +26,11 @@
  */
 #define XDP_USE_NEED_WAKEUP (1 << 3)
 /* By setting this option, userspace application indicates that it can
- * handle multiple descriptors per packet thus enabling xsk core to split
+ * handle multiple descriptors per packet thus enabling AF_XDP to split
  * multi-buffer XDP frames into multiple Rx descriptors. Without this set
- * such frames will be dropped by xsk.
+ * such frames will be dropped.
  */
-#define XDP_USE_SG     (1 << 4)
+#define XDP_USE_SG	(1 << 4)
 
 /* Flags for xsk_umem_config flags */
 #define XDP_UMEM_UNALIGNED_CHUNK_FLAG (1 << 0)
@@ -106,6 +106,41 @@ struct xdp_options {
 #define XSK_UNALIGNED_BUF_ADDR_MASK \
 	((1ULL << XSK_UNALIGNED_BUF_OFFSET_SHIFT) - 1)
 
+/* Request transmit timestamp. Upon completion, put it into tx_timestamp
+ * field of union xsk_tx_metadata.
+ */
+#define XDP_TXMD_FLAGS_TIMESTAMP		(1 << 0)
+
+/* Request transmit checksum offload. Checksum start position and offset
+ * are communicated via csum_start and csum_offset fields of union
+ * xsk_tx_metadata.
+ */
+#define XDP_TXMD_FLAGS_CHECKSUM			(1 << 1)
+
+/* AF_XDP offloads request. 'request' union member is consumed by the driver
+ * when the packet is being transmitted. 'completion' union member is
+ * filled by the driver when the transmit completion arrives.
+ */
+struct xsk_tx_metadata {
+	__u64 flags;
+
+	union {
+		struct {
+			/* XDP_TXMD_FLAGS_CHECKSUM */
+
+			/* Offset from desc->addr where checksumming should start. */
+			__u16 csum_start;
+			/* Offset from csum_start where checksum should be stored. */
+			__u16 csum_offset;
+		} request;
+
+		struct {
+			/* XDP_TXMD_FLAGS_TIMESTAMP */
+			__u64 tx_timestamp;
+		} completion;
+	};
+};
+
 /* Rx/Tx descriptor */
 struct xdp_desc {
 	__u64 addr;
@@ -113,9 +148,16 @@ struct xdp_desc {
 	__u32 options;
 };
 
-/* Flag indicating packet constitutes of multiple buffers*/
+/* UMEM descriptor is __u64 */
+
+/* Flag indicating that the packet continues with the buffer pointed out by the
+ * next frame in the ring. The end of the packet is signalled by setting this
+ * bit to zero. For single buffer packets, every descriptor has 'options' set
+ * to 0 and this maintains backward compatibility.
+ */
 #define XDP_PKT_CONTD (1 << 0)
 
-/* UMEM descriptor is __u64 */
+/* TX packet carries valid metadata. */
+#define XDP_TX_METADATA (1 << 1)
 
 #endif /* _LINUX_IF_XDP_H */
diff --git a/tools/include/uapi/linux/netdev.h b/tools/include/uapi/linux/netdev.h
index 2943a151d4f1..48d5477a668c 100644
--- a/tools/include/uapi/linux/netdev.h
+++ b/tools/include/uapi/linux/netdev.h
@@ -53,12 +53,28 @@ enum netdev_xdp_rx_metadata {
 	NETDEV_XDP_RX_METADATA_MASK = 3,
 };
 
+/**
+ * enum netdev_xsk_flags
+ * @NETDEV_XSK_FLAGS_TX_TIMESTAMP: HW timestamping egress packets is supported
+ *   by the driver.
+ * @NETDEV_XSK_FLAGS_TX_CHECKSUM: L3 checksum HW offload is supported by the
+ *   driver.
+ */
+enum netdev_xsk_flags {
+	NETDEV_XSK_FLAGS_TX_TIMESTAMP = 1,
+	NETDEV_XSK_FLAGS_TX_CHECKSUM = 2,
+
+	/* private: */
+	NETDEV_XSK_FLAGS_MASK = 3,
+};
+
 enum {
 	NETDEV_A_DEV_IFINDEX = 1,
 	NETDEV_A_DEV_PAD,
 	NETDEV_A_DEV_XDP_FEATURES,
 	NETDEV_A_DEV_XDP_ZC_MAX_SEGS,
 	NETDEV_A_DEV_XDP_RX_METADATA_FEATURES,
+	NETDEV_A_DEV_XSK_FEATURES,
 
 	__NETDEV_A_DEV_MAX,
 	NETDEV_A_DEV_MAX = (__NETDEV_A_DEV_MAX - 1)
diff --git a/tools/net/ynl/generated/netdev-user.c b/tools/net/ynl/generated/netdev-user.c
index b5ffe8cd1144..6283d87dad37 100644
--- a/tools/net/ynl/generated/netdev-user.c
+++ b/tools/net/ynl/generated/netdev-user.c
@@ -58,6 +58,19 @@ const char *netdev_xdp_rx_metadata_str(enum netdev_xdp_rx_metadata value)
 	return netdev_xdp_rx_metadata_strmap[value];
 }
 
+static const char * const netdev_xsk_flags_strmap[] = {
+	[0] = "tx-timestamp",
+	[1] = "tx-checksum",
+};
+
+const char *netdev_xsk_flags_str(enum netdev_xsk_flags value)
+{
+	value = ffs(value) - 1;
+	if (value < 0 || value >= (int)MNL_ARRAY_SIZE(netdev_xsk_flags_strmap))
+		return NULL;
+	return netdev_xsk_flags_strmap[value];
+}
+
 /* Policies */
 struct ynl_policy_attr netdev_dev_policy[NETDEV_A_DEV_MAX + 1] = {
 	[NETDEV_A_DEV_IFINDEX] = { .name = "ifindex", .type = YNL_PT_U32, },
@@ -65,6 +78,7 @@ struct ynl_policy_attr netdev_dev_policy[NETDEV_A_DEV_MAX + 1] = {
 	[NETDEV_A_DEV_XDP_FEATURES] = { .name = "xdp-features", .type = YNL_PT_U64, },
 	[NETDEV_A_DEV_XDP_ZC_MAX_SEGS] = { .name = "xdp-zc-max-segs", .type = YNL_PT_U32, },
 	[NETDEV_A_DEV_XDP_RX_METADATA_FEATURES] = { .name = "xdp-rx-metadata-features", .type = YNL_PT_U64, },
+	[NETDEV_A_DEV_XSK_FEATURES] = { .name = "xsk-features", .type = YNL_PT_U64, },
 };
 
 struct ynl_policy_nest netdev_dev_nest = {
@@ -116,6 +130,11 @@ int netdev_dev_get_rsp_parse(const struct nlmsghdr *nlh, void *data)
 				return MNL_CB_ERROR;
 			dst->_present.xdp_rx_metadata_features = 1;
 			dst->xdp_rx_metadata_features = mnl_attr_get_u64(attr);
+		} else if (type == NETDEV_A_DEV_XSK_FEATURES) {
+			if (ynl_attr_validate(yarg, attr))
+				return MNL_CB_ERROR;
+			dst->_present.xsk_features = 1;
+			dst->xsk_features = mnl_attr_get_u64(attr);
 		}
 	}
 
diff --git a/tools/net/ynl/generated/netdev-user.h b/tools/net/ynl/generated/netdev-user.h
index 4fafac879df3..39af1908444b 100644
--- a/tools/net/ynl/generated/netdev-user.h
+++ b/tools/net/ynl/generated/netdev-user.h
@@ -19,6 +19,7 @@ extern const struct ynl_family ynl_netdev_family;
 const char *netdev_op_str(int op);
 const char *netdev_xdp_act_str(enum netdev_xdp_act value);
 const char *netdev_xdp_rx_metadata_str(enum netdev_xdp_rx_metadata value);
+const char *netdev_xsk_flags_str(enum netdev_xsk_flags value);
 
 /* Common nested types */
 /* ============== NETDEV_CMD_DEV_GET ============== */
@@ -50,12 +51,14 @@ struct netdev_dev_get_rsp {
 		__u32 xdp_features:1;
 		__u32 xdp_zc_max_segs:1;
 		__u32 xdp_rx_metadata_features:1;
+		__u32 xsk_features:1;
 	} _present;
 
 	__u32 ifindex;
 	__u64 xdp_features;
 	__u32 xdp_zc_max_segs;
 	__u64 xdp_rx_metadata_features;
+	__u64 xsk_features;
 };
 
 void netdev_dev_get_rsp_free(struct netdev_dev_get_rsp *rsp);
-- 
cgit v1.2.3


From 8866730aed5100f06d3d965c22f1c61f74942541 Mon Sep 17 00:00:00 2001
From: John Fastabend <john.fastabend@gmail.com>
Date: Tue, 28 Nov 2023 17:25:56 -0800
Subject: bpf, sockmap: af_unix stream sockets need to hold ref for pair sock

AF_UNIX stream sockets are a paired socket. So sending on one of the pairs
will lookup the paired socket as part of the send operation. It is possible
however to put just one of the pairs in a BPF map. This currently increments
the refcnt on the sock in the sockmap to ensure it is not free'd by the
stack before sockmap cleans up its state and stops any skbs being sent/recv'd
to that socket.

But we missed a case. If the peer socket is closed it will be free'd by the
stack. However, the paired socket can still be referenced from BPF sockmap
side because we hold a reference there. Then if we are sending traffic through
BPF sockmap to that socket it will try to dereference the free'd pair in its
send logic creating a use after free. And following splat:

   [59.900375] BUG: KASAN: slab-use-after-free in sk_wake_async+0x31/0x1b0
   [59.901211] Read of size 8 at addr ffff88811acbf060 by task kworker/1:2/954
   [...]
   [59.905468] Call Trace:
   [59.905787]  <TASK>
   [59.906066]  dump_stack_lvl+0x130/0x1d0
   [59.908877]  print_report+0x16f/0x740
   [59.910629]  kasan_report+0x118/0x160
   [59.912576]  sk_wake_async+0x31/0x1b0
   [59.913554]  sock_def_readable+0x156/0x2a0
   [59.914060]  unix_stream_sendmsg+0x3f9/0x12a0
   [59.916398]  sock_sendmsg+0x20e/0x250
   [59.916854]  skb_send_sock+0x236/0xac0
   [59.920527]  sk_psock_backlog+0x287/0xaa0

To fix let BPF sockmap hold a refcnt on both the socket in the sockmap and its
paired socket. It wasn't obvious how to contain the fix to bpf_unix logic. The
primarily problem with keeping this logic in bpf_unix was: In the sock close()
we could handle the deref by having a close handler. But, when we are destroying
the psock through a map delete operation we wouldn't have gotten any signal
thorugh the proto struct other than it being replaced. If we do the deref from
the proto replace its too early because we need to deref the sk_pair after the
backlog worker has been stopped.

Given all this it seems best to just cache it at the end of the psock and eat 8B
for the af_unix and vsock users. Notice dgram sockets are OK because they handle
locking already.

Fixes: 94531cfcbe79 ("af_unix: Add unix_stream_proto for sockmap")
Signed-off-by: John Fastabend <john.fastabend@gmail.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Reviewed-by: Jakub Sitnicki <jakub@cloudflare.com>
Link: https://lore.kernel.org/bpf/20231129012557.95371-2-john.fastabend@gmail.com
---
 include/linux/skmsg.h | 1 +
 include/net/af_unix.h | 1 +
 net/core/skmsg.c      | 2 ++
 net/unix/af_unix.c    | 2 --
 net/unix/unix_bpf.c   | 5 +++++
 5 files changed, 9 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/skmsg.h b/include/linux/skmsg.h
index c1637515a8a4..c953b8c0d2f4 100644
--- a/include/linux/skmsg.h
+++ b/include/linux/skmsg.h
@@ -106,6 +106,7 @@ struct sk_psock {
 	struct mutex			work_mutex;
 	struct sk_psock_work_state	work_state;
 	struct delayed_work		work;
+	struct sock			*sk_pair;
 	struct rcu_work			rwork;
 };
 
diff --git a/include/net/af_unix.h b/include/net/af_unix.h
index 824c258143a3..49c4640027d8 100644
--- a/include/net/af_unix.h
+++ b/include/net/af_unix.h
@@ -75,6 +75,7 @@ struct unix_sock {
 };
 
 #define unix_sk(ptr) container_of_const(ptr, struct unix_sock, sk)
+#define unix_peer(sk) (unix_sk(sk)->peer)
 
 #define peer_wait peer_wq.wait
 
diff --git a/net/core/skmsg.c b/net/core/skmsg.c
index 6c31eefbd777..93ecfceac1bc 100644
--- a/net/core/skmsg.c
+++ b/net/core/skmsg.c
@@ -826,6 +826,8 @@ static void sk_psock_destroy(struct work_struct *work)
 
 	if (psock->sk_redir)
 		sock_put(psock->sk_redir);
+	if (psock->sk_pair)
+		sock_put(psock->sk_pair);
 	sock_put(psock->sk);
 	kfree(psock);
 }
diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c
index a357dc5f2404..ac1f2bc18fc9 100644
--- a/net/unix/af_unix.c
+++ b/net/unix/af_unix.c
@@ -213,8 +213,6 @@ static inline bool unix_secdata_eq(struct scm_cookie *scm, struct sk_buff *skb)
 }
 #endif /* CONFIG_SECURITY_NETWORK */
 
-#define unix_peer(sk) (unix_sk(sk)->peer)
-
 static inline int unix_our_peer(struct sock *sk, struct sock *osk)
 {
 	return unix_peer(osk) == sk;
diff --git a/net/unix/unix_bpf.c b/net/unix/unix_bpf.c
index 2f9d8271c6ec..7ea7c3a0d0d0 100644
--- a/net/unix/unix_bpf.c
+++ b/net/unix/unix_bpf.c
@@ -159,12 +159,17 @@ int unix_dgram_bpf_update_proto(struct sock *sk, struct sk_psock *psock, bool re
 
 int unix_stream_bpf_update_proto(struct sock *sk, struct sk_psock *psock, bool restore)
 {
+	struct sock *sk_pair;
+
 	if (restore) {
 		sk->sk_write_space = psock->saved_write_space;
 		sock_replace_proto(sk, psock->sk_proto);
 		return 0;
 	}
 
+	sk_pair = unix_peer(sk);
+	sock_hold(sk_pair);
+	psock->sk_pair = sk_pair;
 	unix_stream_bpf_check_needs_rebuild(psock->sk_proto);
 	sock_replace_proto(sk, &unix_stream_bpf_prot);
 	return 0;
-- 
cgit v1.2.3


From 7577bc8249c3fc86096ef1b1c9a8f4b6232231e7 Mon Sep 17 00:00:00 2001
From: Kuniyuki Iwashima <kuniyu@amazon.com>
Date: Tue, 28 Nov 2023 18:29:20 -0800
Subject: tcp: Don't pass cookie to __cookie_v[46]_check().

tcp_hdr(skb) and SYN Cookie are passed to __cookie_v[46]_check(), but
none of the callers passes cookie other than ntohl(th->ack_seq) - 1.

Let's fetch it in __cookie_v[46]_check() instead of passing the cookie
over and over.

Signed-off-by: Kuniyuki Iwashima <kuniyu@amazon.com>
Reviewed-by: Simon Horman <horms@kernel.org>
Reviewed-by: Eric Dumazet <edumazet@google.com>
Link: https://lore.kernel.org/r/20231129022924.96156-5-kuniyu@amazon.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/linux/netfilter_ipv6.h   |  8 ++++----
 include/net/tcp.h                |  6 ++----
 net/core/filter.c                | 15 ++++-----------
 net/ipv4/syncookies.c            | 15 ++++++++-------
 net/ipv6/syncookies.c            | 15 ++++++++-------
 net/netfilter/nf_synproxy_core.c |  4 ++--
 6 files changed, 28 insertions(+), 35 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/netfilter_ipv6.h b/include/linux/netfilter_ipv6.h
index 7834c0be2831..61aa48f46dd7 100644
--- a/include/linux/netfilter_ipv6.h
+++ b/include/linux/netfilter_ipv6.h
@@ -51,7 +51,7 @@ struct nf_ipv6_ops {
 	u32 (*cookie_init_sequence)(const struct ipv6hdr *iph,
 				    const struct tcphdr *th, u16 *mssp);
 	int (*cookie_v6_check)(const struct ipv6hdr *iph,
-			       const struct tcphdr *th, __u32 cookie);
+			       const struct tcphdr *th);
 #endif
 	void (*route_input)(struct sk_buff *skb);
 	int (*fragment)(struct net *net, struct sock *sk, struct sk_buff *skb,
@@ -179,16 +179,16 @@ static inline u32 nf_ipv6_cookie_init_sequence(const struct ipv6hdr *iph,
 }
 
 static inline int nf_cookie_v6_check(const struct ipv6hdr *iph,
-				     const struct tcphdr *th, __u32 cookie)
+				     const struct tcphdr *th)
 {
 #if IS_ENABLED(CONFIG_SYN_COOKIES)
 #if IS_MODULE(CONFIG_IPV6)
 	const struct nf_ipv6_ops *v6_ops = nf_get_ipv6_ops();
 
 	if (v6_ops)
-		return v6_ops->cookie_v6_check(iph, th, cookie);
+		return v6_ops->cookie_v6_check(iph, th);
 #elif IS_BUILTIN(CONFIG_IPV6)
-	return __cookie_v6_check(iph, th, cookie);
+	return __cookie_v6_check(iph, th);
 #endif
 #endif
 	return 0;
diff --git a/include/net/tcp.h b/include/net/tcp.h
index d2f0736b76b8..2b2c79c7bbcd 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -491,8 +491,7 @@ void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb);
 struct sock *tcp_get_cookie_sock(struct sock *sk, struct sk_buff *skb,
 				 struct request_sock *req,
 				 struct dst_entry *dst, u32 tsoff);
-int __cookie_v4_check(const struct iphdr *iph, const struct tcphdr *th,
-		      u32 cookie);
+int __cookie_v4_check(const struct iphdr *iph, const struct tcphdr *th);
 struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb);
 struct request_sock *cookie_tcp_reqsk_alloc(const struct request_sock_ops *ops,
 					    const struct tcp_request_sock_ops *af_ops,
@@ -586,8 +585,7 @@ bool cookie_ecn_ok(const struct tcp_options_received *opt,
 		   const struct net *net, const struct dst_entry *dst);
 
 /* From net/ipv6/syncookies.c */
-int __cookie_v6_check(const struct ipv6hdr *iph, const struct tcphdr *th,
-		      u32 cookie);
+int __cookie_v6_check(const struct ipv6hdr *iph, const struct tcphdr *th);
 struct sock *cookie_v6_check(struct sock *sk, struct sk_buff *skb);
 
 u32 __cookie_v6_init_sequence(const struct ipv6hdr *iph,
diff --git a/net/core/filter.c b/net/core/filter.c
index 7e4d7c3bcc84..0adaa4afa35f 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -7238,7 +7238,6 @@ BPF_CALL_5(bpf_tcp_check_syncookie, struct sock *, sk, void *, iph, u32, iph_len
 	   struct tcphdr *, th, u32, th_len)
 {
 #ifdef CONFIG_SYN_COOKIES
-	u32 cookie;
 	int ret;
 
 	if (unlikely(!sk || th_len < sizeof(*th)))
@@ -7260,8 +7259,6 @@ BPF_CALL_5(bpf_tcp_check_syncookie, struct sock *, sk, void *, iph, u32, iph_len
 	if (tcp_synq_no_recent_overflow(sk))
 		return -ENOENT;
 
-	cookie = ntohl(th->ack_seq) - 1;
-
 	/* Both struct iphdr and struct ipv6hdr have the version field at the
 	 * same offset so we can cast to the shorter header (struct iphdr).
 	 */
@@ -7270,7 +7267,7 @@ BPF_CALL_5(bpf_tcp_check_syncookie, struct sock *, sk, void *, iph, u32, iph_len
 		if (sk->sk_family == AF_INET6 && ipv6_only_sock(sk))
 			return -EINVAL;
 
-		ret = __cookie_v4_check((struct iphdr *)iph, th, cookie);
+		ret = __cookie_v4_check((struct iphdr *)iph, th);
 		break;
 
 #if IS_BUILTIN(CONFIG_IPV6)
@@ -7281,7 +7278,7 @@ BPF_CALL_5(bpf_tcp_check_syncookie, struct sock *, sk, void *, iph, u32, iph_len
 		if (sk->sk_family != AF_INET6)
 			return -EINVAL;
 
-		ret = __cookie_v6_check((struct ipv6hdr *)iph, th, cookie);
+		ret = __cookie_v6_check((struct ipv6hdr *)iph, th);
 		break;
 #endif /* CONFIG_IPV6 */
 
@@ -7734,9 +7731,7 @@ static const struct bpf_func_proto bpf_tcp_raw_gen_syncookie_ipv6_proto = {
 BPF_CALL_2(bpf_tcp_raw_check_syncookie_ipv4, struct iphdr *, iph,
 	   struct tcphdr *, th)
 {
-	u32 cookie = ntohl(th->ack_seq) - 1;
-
-	if (__cookie_v4_check(iph, th, cookie) > 0)
+	if (__cookie_v4_check(iph, th) > 0)
 		return 0;
 
 	return -EACCES;
@@ -7757,9 +7752,7 @@ BPF_CALL_2(bpf_tcp_raw_check_syncookie_ipv6, struct ipv6hdr *, iph,
 	   struct tcphdr *, th)
 {
 #if IS_BUILTIN(CONFIG_IPV6)
-	u32 cookie = ntohl(th->ack_seq) - 1;
-
-	if (__cookie_v6_check(iph, th, cookie) > 0)
+	if (__cookie_v6_check(iph, th) > 0)
 		return 0;
 
 	return -EACCES;
diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c
index 8b7d7d7788af..c08428d63d11 100644
--- a/net/ipv4/syncookies.c
+++ b/net/ipv4/syncookies.c
@@ -189,12 +189,14 @@ __u32 cookie_v4_init_sequence(const struct sk_buff *skb, __u16 *mssp)
  * Check if a ack sequence number is a valid syncookie.
  * Return the decoded mss if it is, or 0 if not.
  */
-int __cookie_v4_check(const struct iphdr *iph, const struct tcphdr *th,
-		      u32 cookie)
+int __cookie_v4_check(const struct iphdr *iph, const struct tcphdr *th)
 {
+	__u32 cookie = ntohl(th->ack_seq) - 1;
 	__u32 seq = ntohl(th->seq) - 1;
-	__u32 mssind = check_tcp_syn_cookie(cookie, iph->saddr, iph->daddr,
-					    th->source, th->dest, seq);
+	__u32 mssind;
+
+	mssind = check_tcp_syn_cookie(cookie, iph->saddr, iph->daddr,
+				      th->source, th->dest, seq);
 
 	return mssind < ARRAY_SIZE(msstab) ? msstab[mssind] : 0;
 }
@@ -332,7 +334,6 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb)
 {
 	struct ip_options *opt = &TCP_SKB_CB(skb)->header.h4.opt;
 	const struct tcphdr *th = tcp_hdr(skb);
-	__u32 cookie = ntohl(th->ack_seq) - 1;
 	struct tcp_options_received tcp_opt;
 	struct tcp_sock *tp = tcp_sk(sk);
 	struct inet_request_sock *ireq;
@@ -354,7 +355,7 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb)
 	if (tcp_synq_no_recent_overflow(sk))
 		goto out;
 
-	mss = __cookie_v4_check(ip_hdr(skb), th, cookie);
+	mss = __cookie_v4_check(ip_hdr(skb), th);
 	if (mss == 0) {
 		__NET_INC_STATS(net, LINUX_MIB_SYNCOOKIESFAILED);
 		goto out;
@@ -384,7 +385,7 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb)
 	ireq = inet_rsk(req);
 	treq = tcp_rsk(req);
 	treq->rcv_isn		= ntohl(th->seq) - 1;
-	treq->snt_isn		= cookie;
+	treq->snt_isn		= ntohl(th->ack_seq) - 1;
 	treq->ts_off		= 0;
 	treq->txhash		= net_tx_rndhash();
 	req->mss		= mss;
diff --git a/net/ipv6/syncookies.c b/net/ipv6/syncookies.c
index 106376cbc9de..4cd26c481168 100644
--- a/net/ipv6/syncookies.c
+++ b/net/ipv6/syncookies.c
@@ -114,12 +114,14 @@ __u32 cookie_v6_init_sequence(const struct sk_buff *skb, __u16 *mssp)
 	return __cookie_v6_init_sequence(iph, th, mssp);
 }
 
-int __cookie_v6_check(const struct ipv6hdr *iph, const struct tcphdr *th,
-		      __u32 cookie)
+int __cookie_v6_check(const struct ipv6hdr *iph, const struct tcphdr *th)
 {
+	__u32 cookie = ntohl(th->ack_seq) - 1;
 	__u32 seq = ntohl(th->seq) - 1;
-	__u32 mssind = check_tcp_syn_cookie(cookie, &iph->saddr, &iph->daddr,
-					    th->source, th->dest, seq);
+	__u32 mssind;
+
+	mssind = check_tcp_syn_cookie(cookie, &iph->saddr, &iph->daddr,
+				      th->source, th->dest, seq);
 
 	return mssind < ARRAY_SIZE(msstab) ? msstab[mssind] : 0;
 }
@@ -128,7 +130,6 @@ EXPORT_SYMBOL_GPL(__cookie_v6_check);
 struct sock *cookie_v6_check(struct sock *sk, struct sk_buff *skb)
 {
 	const struct tcphdr *th = tcp_hdr(skb);
-	__u32 cookie = ntohl(th->ack_seq) - 1;
 	struct ipv6_pinfo *np = inet6_sk(sk);
 	struct tcp_options_received tcp_opt;
 	struct tcp_sock *tp = tcp_sk(sk);
@@ -150,7 +151,7 @@ struct sock *cookie_v6_check(struct sock *sk, struct sk_buff *skb)
 	if (tcp_synq_no_recent_overflow(sk))
 		goto out;
 
-	mss = __cookie_v6_check(ipv6_hdr(skb), th, cookie);
+	mss = __cookie_v6_check(ipv6_hdr(skb), th);
 	if (mss == 0) {
 		__NET_INC_STATS(net, LINUX_MIB_SYNCOOKIESFAILED);
 		goto out;
@@ -213,7 +214,7 @@ struct sock *cookie_v6_check(struct sock *sk, struct sk_buff *skb)
 	req->ts_recent		= tcp_opt.saw_tstamp ? tcp_opt.rcv_tsval : 0;
 	treq->snt_synack	= 0;
 	treq->rcv_isn = ntohl(th->seq) - 1;
-	treq->snt_isn = cookie;
+	treq->snt_isn = ntohl(th->ack_seq) - 1;
 	treq->ts_off = 0;
 	treq->txhash = net_tx_rndhash();
 
diff --git a/net/netfilter/nf_synproxy_core.c b/net/netfilter/nf_synproxy_core.c
index 467671f2d42f..fbbc4fd37349 100644
--- a/net/netfilter/nf_synproxy_core.c
+++ b/net/netfilter/nf_synproxy_core.c
@@ -617,7 +617,7 @@ synproxy_recv_client_ack(struct net *net,
 	struct synproxy_net *snet = synproxy_pernet(net);
 	int mss;
 
-	mss = __cookie_v4_check(ip_hdr(skb), th, ntohl(th->ack_seq) - 1);
+	mss = __cookie_v4_check(ip_hdr(skb), th);
 	if (mss == 0) {
 		this_cpu_inc(snet->stats->cookie_invalid);
 		return false;
@@ -1034,7 +1034,7 @@ synproxy_recv_client_ack_ipv6(struct net *net,
 	struct synproxy_net *snet = synproxy_pernet(net);
 	int mss;
 
-	mss = nf_cookie_v6_check(ipv6_hdr(skb), th, ntohl(th->ack_seq) - 1);
+	mss = nf_cookie_v6_check(ipv6_hdr(skb), th);
 	if (mss == 0) {
 		this_cpu_inc(snet->stats->cookie_invalid);
 		return false;
-- 
cgit v1.2.3


From 39cefd85098d12439586824c39f8e1948fac186d Mon Sep 17 00:00:00 2001
From: Nam Cao <namcao@linutronix.de>
Date: Wed, 29 Nov 2023 17:31:55 +0100
Subject: spi: introduce SPI_TRANS_FAIL_IO for error reporting

The default message transfer implementation - spi_transfer_one_message -
invokes the specific device driver's transfer_one(), then waits for
completion. However, there is no mechanism for the device driver to
report failure in the middle of the transfer.

Introduce SPI_TRANS_FAIL_IO for drivers to report transfer failure.

Signed-off-by: Nam Cao <namcao@linutronix.de>
Acked-by: Linus Walleij <linus.walleij@linaro.org>
Link: https://lore.kernel.org/r/4b420dac528e60f122adde16851da88e4798c1ea.1701274975.git.namcao@linutronix.de
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 drivers/spi/spi.c       |  3 +++
 include/linux/spi/spi.h | 12 ++++++++----
 2 files changed, 11 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/spi/spi.c b/drivers/spi/spi.c
index 8ead7acb99f3..a4b8c07c5951 100644
--- a/drivers/spi/spi.c
+++ b/drivers/spi/spi.c
@@ -1361,6 +1361,9 @@ static int spi_transfer_wait(struct spi_controller *ctlr,
 				"SPI transfer timed out\n");
 			return -ETIMEDOUT;
 		}
+
+		if (xfer->error & SPI_TRANS_FAIL_IO)
+			return -EIO;
 	}
 
 	return 0;
diff --git a/include/linux/spi/spi.h b/include/linux/spi/spi.h
index 255a0562aea5..aa25ae04c5c3 100644
--- a/include/linux/spi/spi.h
+++ b/include/linux/spi/spi.h
@@ -461,10 +461,13 @@ extern struct spi_device *spi_new_ancillary_device(struct spi_device *spi, u8 ch
  *                  - return 1 if the transfer is still in progress. When
  *                    the driver is finished with this transfer it must
  *                    call spi_finalize_current_transfer() so the subsystem
- *                    can issue the next transfer. Note: transfer_one and
- *                    transfer_one_message are mutually exclusive; when both
- *                    are set, the generic subsystem does not call your
- *                    transfer_one callback.
+ *                    can issue the next transfer. If the transfer fails, the
+ *                    driver must set the flag SPI_TRANS_FAIL_IO to
+ *                    spi_transfer->error first, before calling
+ *                    spi_finalize_current_transfer().
+ *                    Note: transfer_one and transfer_one_message are mutually
+ *                    exclusive; when both are set, the generic subsystem does
+ *                    not call your transfer_one callback.
  * @handle_err: the subsystem calls the driver to handle an error that occurs
  *		in the generic implementation of transfer_one_message().
  * @mem_ops: optimized/dedicated operations for interactions with SPI memory.
@@ -1040,6 +1043,7 @@ struct spi_transfer {
 	unsigned	len;
 
 #define SPI_TRANS_FAIL_NO_START	BIT(0)
+#define SPI_TRANS_FAIL_IO	BIT(1)
 	u16		error;
 
 	dma_addr_t	tx_dma;
-- 
cgit v1.2.3


From cff49d58f57e5667c10a0db85d7461790bb85cf8 Mon Sep 17 00:00:00 2001
From: "Chia-Lin Kao (AceLan)" <acelan.kao@canonical.com>
Date: Wed, 29 Nov 2023 14:43:10 +0800
Subject: spi: Unify error codes by replacing -ENOTSUPP with -EOPNOTSUPP

This commit updates the SPI subsystem, particularly affecting "SPI MEM"
drivers and core parts, by replacing the -ENOTSUPP error code with
-EOPNOTSUPP.

The key motivations for this change are as follows:
1. The spi-nor driver currently uses EOPNOTSUPP, whereas calls to spi-mem
might return ENOTSUPP. This update aims to unify the error reporting
within the SPI subsystem for clarity and consistency.

2. The use of ENOTSUPP has been flagged by checkpatch as inappropriate,
mainly being reserved for NFS-related errors. To align with kernel coding
standards and recommendations, this change is being made.

3. By using EOPNOTSUPP, we provide more specific context to the error,
indicating that a particular operation is not supported. This helps
differentiate from the more generic ENOTSUPP error, allowing drivers to
better handle and respond to different error scenarios.

Risks and Considerations:
While this change is primarily intended as a code cleanup and error code
unification, there is a minor risk of breaking user-space applications
that rely on specific return codes for unsupported operations. However,
this risk is considered low, as such use-cases are unlikely to be common
or critical. Nevertheless, developers and users should be aware of this
change, especially if they have scripts or tools that specifically handle
SPI error codes.

This commit does not introduce any functional changes to the SPI subsystem
or the affected drivers.

Signed-off-by: "Chia-Lin Kao (AceLan)" <acelan.kao@canonical.com>
Acked-by: Tudor Ambarus <tudor.ambarus@linaro.org>
Reviewed-by: Mika Westerberg <mika.westerberg@linux.intel.com>
Acked-by: Miquel Raynal <miquel.raynal@bootlin.com>
Acked-by: Michael Walle <michael@walle.cc>
Link: https://lore.kernel.org/r/20231129064311.272422-1-acelan.kao@canonical.com
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 drivers/mtd/nand/spi/core.c | 2 +-
 drivers/mtd/spi-nor/core.c  | 2 +-
 drivers/spi/atmel-quadspi.c | 2 +-
 drivers/spi/spi-ath79.c     | 2 +-
 drivers/spi/spi-bcm-qspi.c  | 2 +-
 drivers/spi/spi-mem.c       | 6 +++---
 drivers/spi/spi-npcm-fiu.c  | 2 +-
 drivers/spi/spi-ti-qspi.c   | 4 ++--
 drivers/spi/spi-wpcm-fiu.c  | 2 +-
 include/linux/spi/spi-mem.h | 2 ++
 10 files changed, 14 insertions(+), 12 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/mtd/nand/spi/core.c b/drivers/mtd/nand/spi/core.c
index 849ccfedbc72..e0b6715e5dfe 100644
--- a/drivers/mtd/nand/spi/core.c
+++ b/drivers/mtd/nand/spi/core.c
@@ -974,7 +974,7 @@ static int spinand_manufacturer_match(struct spinand_device *spinand,
 		spinand->manufacturer = manufacturer;
 		return 0;
 	}
-	return -ENOTSUPP;
+	return -EOPNOTSUPP;
 }
 
 static int spinand_id_detect(struct spinand_device *spinand)
diff --git a/drivers/mtd/spi-nor/core.c b/drivers/mtd/spi-nor/core.c
index 1c443fe568cf..87cb2047df80 100644
--- a/drivers/mtd/spi-nor/core.c
+++ b/drivers/mtd/spi-nor/core.c
@@ -3146,7 +3146,7 @@ int spi_nor_set_4byte_addr_mode(struct spi_nor *nor, bool enable)
 	int ret;
 
 	ret = params->set_4byte_addr_mode(nor, enable);
-	if (ret && ret != -ENOTSUPP)
+	if (ret && ret != -EOPNOTSUPP)
 		return ret;
 
 	if (enable) {
diff --git a/drivers/spi/atmel-quadspi.c b/drivers/spi/atmel-quadspi.c
index 3d1252566134..370c4d1572ed 100644
--- a/drivers/spi/atmel-quadspi.c
+++ b/drivers/spi/atmel-quadspi.c
@@ -272,7 +272,7 @@ static int atmel_qspi_find_mode(const struct spi_mem_op *op)
 		if (atmel_qspi_is_compatible(op, &atmel_qspi_modes[i]))
 			return i;
 
-	return -ENOTSUPP;
+	return -EOPNOTSUPP;
 }
 
 static bool atmel_qspi_supports_op(struct spi_mem *mem,
diff --git a/drivers/spi/spi-ath79.c b/drivers/spi/spi-ath79.c
index c9f1d1e1dcf7..b7ada981464a 100644
--- a/drivers/spi/spi-ath79.c
+++ b/drivers/spi/spi-ath79.c
@@ -146,7 +146,7 @@ static int ath79_exec_mem_op(struct spi_mem *mem,
 	/* Only use for fast-read op. */
 	if (op->cmd.opcode != 0x0b || op->data.dir != SPI_MEM_DATA_IN ||
 	    op->addr.nbytes != 3 || op->dummy.nbytes != 1)
-		return -ENOTSUPP;
+		return -EOPNOTSUPP;
 
 	/* disable GPIO mode */
 	ath79_spi_wr(sp, AR71XX_SPI_REG_FS, 0);
diff --git a/drivers/spi/spi-bcm-qspi.c b/drivers/spi/spi-bcm-qspi.c
index ef08fcac2f6d..d96222e6d7d2 100644
--- a/drivers/spi/spi-bcm-qspi.c
+++ b/drivers/spi/spi-bcm-qspi.c
@@ -1199,7 +1199,7 @@ static int bcm_qspi_exec_mem_op(struct spi_mem *mem,
 
 	if (!op->data.nbytes || !op->addr.nbytes || op->addr.nbytes > 4 ||
 	    op->data.dir != SPI_MEM_DATA_IN)
-		return -ENOTSUPP;
+		return -EOPNOTSUPP;
 
 	buf = op->data.buf.in;
 	addr = op->addr.val;
diff --git a/drivers/spi/spi-mem.c b/drivers/spi/spi-mem.c
index edd7430d4c05..2dc8ceb85374 100644
--- a/drivers/spi/spi-mem.c
+++ b/drivers/spi/spi-mem.c
@@ -323,7 +323,7 @@ int spi_mem_exec_op(struct spi_mem *mem, const struct spi_mem_op *op)
 		return ret;
 
 	if (!spi_mem_internal_supports_op(mem, op))
-		return -ENOTSUPP;
+		return -EOPNOTSUPP;
 
 	if (ctlr->mem_ops && ctlr->mem_ops->exec_op && !spi_get_csgpiod(mem->spi, 0)) {
 		ret = spi_mem_access_start(mem);
@@ -339,7 +339,7 @@ int spi_mem_exec_op(struct spi_mem *mem, const struct spi_mem_op *op)
 		 * read path) and expect the core to use the regular SPI
 		 * interface in other cases.
 		 */
-		if (!ret || ret != -ENOTSUPP)
+		if (!ret || ret != -ENOTSUPP || ret != -EOPNOTSUPP)
 			return ret;
 	}
 
@@ -559,7 +559,7 @@ spi_mem_dirmap_create(struct spi_mem *mem,
 	if (ret) {
 		desc->nodirmap = true;
 		if (!spi_mem_supports_op(desc->mem, &desc->info.op_tmpl))
-			ret = -ENOTSUPP;
+			ret = -EOPNOTSUPP;
 		else
 			ret = 0;
 	}
diff --git a/drivers/spi/spi-npcm-fiu.c b/drivers/spi/spi-npcm-fiu.c
index 03db9f016a11..f3bb8bbc192f 100644
--- a/drivers/spi/spi-npcm-fiu.c
+++ b/drivers/spi/spi-npcm-fiu.c
@@ -556,7 +556,7 @@ static int npcm_fiu_exec_op(struct spi_mem *mem, const struct spi_mem_op *op)
 		op->data.nbytes);
 
 	if (fiu->spix_mode || op->addr.nbytes > 4)
-		return -ENOTSUPP;
+		return -EOPNOTSUPP;
 
 	if (fiu->clkrate != chip->clkrate) {
 		ret = clk_set_rate(fiu->clk, chip->clkrate);
diff --git a/drivers/spi/spi-ti-qspi.c b/drivers/spi/spi-ti-qspi.c
index fdc092a05284..a6a89c59c418 100644
--- a/drivers/spi/spi-ti-qspi.c
+++ b/drivers/spi/spi-ti-qspi.c
@@ -613,12 +613,12 @@ static int ti_qspi_exec_mem_op(struct spi_mem *mem,
 	/* Only optimize read path. */
 	if (!op->data.nbytes || op->data.dir != SPI_MEM_DATA_IN ||
 	    !op->addr.nbytes || op->addr.nbytes > 4)
-		return -ENOTSUPP;
+		return -EOPNOTSUPP;
 
 	/* Address exceeds MMIO window size, fall back to regular mode. */
 	from = op->addr.val;
 	if (from + op->data.nbytes > qspi->mmap_size)
-		return -ENOTSUPP;
+		return -EOPNOTSUPP;
 
 	mutex_lock(&qspi->list_lock);
 
diff --git a/drivers/spi/spi-wpcm-fiu.c b/drivers/spi/spi-wpcm-fiu.c
index 852ffe013d32..d76f7b5a9b97 100644
--- a/drivers/spi/spi-wpcm-fiu.c
+++ b/drivers/spi/spi-wpcm-fiu.c
@@ -361,7 +361,7 @@ static int wpcm_fiu_exec_op(struct spi_mem *mem, const struct spi_mem_op *op)
 
 	wpcm_fiu_stall_host(fiu, false);
 
-	return -ENOTSUPP;
+	return -EOPNOTSUPP;
 }
 
 static int wpcm_fiu_adjust_op_size(struct spi_mem *mem, struct spi_mem_op *op)
diff --git a/include/linux/spi/spi-mem.h b/include/linux/spi/spi-mem.h
index 6b0a7dc48a4b..f866d5c8ed32 100644
--- a/include/linux/spi/spi-mem.h
+++ b/include/linux/spi/spi-mem.h
@@ -233,6 +233,8 @@ static inline void *spi_mem_get_drvdata(struct spi_mem *mem)
  *		    limitations)
  * @supports_op: check if an operation is supported by the controller
  * @exec_op: execute a SPI memory operation
+ *           not all driver provides supports_op(), so it can return -EOPNOTSUPP
+ *           if the op is not supported by the driver/controller
  * @get_name: get a custom name for the SPI mem device from the controller.
  *	      This might be needed if the controller driver has been ported
  *	      to use the SPI mem layer and a custom name is used to keep
-- 
cgit v1.2.3


From 3d1ff9dfdc168722f570144aba0ce29d28d7f483 Mon Sep 17 00:00:00 2001
From: Ramesh Errabolu <Ramesh.Errabolu@amd.com>
Date: Wed, 22 Nov 2023 10:05:56 -0600
Subject: dma-buf: Correct the documentation of name and exp_name symbols
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Fix the documentation of struct dma_buf members name and exp_name
as to how these members are to be used and accessed.

Signed-off-by: Ramesh Errabolu <Ramesh.Errabolu@amd.com>
Reviewed-by: Christian König <christian.koenig@amd.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20231122160556.24948-1-Ramesh.Errabolu@amd.com
Signed-off-by: Christian König <christian.koenig@amd.com>
---
 include/linux/dma-buf.h | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/dma-buf.h b/include/linux/dma-buf.h
index 3f31baa3293f..8ff4add71f88 100644
--- a/include/linux/dma-buf.h
+++ b/include/linux/dma-buf.h
@@ -343,16 +343,19 @@ struct dma_buf {
 	/**
 	 * @exp_name:
 	 *
-	 * Name of the exporter; useful for debugging. See the
-	 * DMA_BUF_SET_NAME IOCTL.
+	 * Name of the exporter; useful for debugging. Must not be NULL
 	 */
 	const char *exp_name;
 
 	/**
 	 * @name:
 	 *
-	 * Userspace-provided name; useful for accounting and debugging,
-	 * protected by dma_resv_lock() on @resv and @name_lock for read access.
+	 * Userspace-provided name. Default value is NULL. If not NULL,
+	 * length cannot be longer than DMA_BUF_NAME_LEN, including NIL
+	 * char. Useful for accounting and debugging. Read/Write accesses
+	 * are protected by @name_lock
+	 *
+	 * See the IOCTLs DMA_BUF_SET_NAME or DMA_BUF_SET_NAME_A/B
 	 */
 	const char *name;
 
-- 
cgit v1.2.3


From 4ea95c04fa6b9043a1a301240996aeebe3cb28ec Mon Sep 17 00:00:00 2001
From: Sean Christopherson <seanjc@google.com>
Date: Wed, 29 Nov 2023 16:10:00 -0800
Subject: vfio: Drop vfio_file_iommu_group() stub to fudge around a KVM wart

Drop the vfio_file_iommu_group() stub and instead unconditionally declare
the function to fudge around a KVM wart where KVM tries to do symbol_get()
on vfio_file_iommu_group() (and other VFIO symbols) even if CONFIG_VFIO=n.

Ensuring the symbol is always declared fixes a PPC build error when
modules are also disabled, in which case symbol_get() simply points at the
address of the symbol (with some attributes shenanigans).  Because KVM
does symbol_get() instead of directly depending on VFIO, the lack of a
fully defined symbol is not problematic (ugly, but "fine").

   arch/powerpc/kvm/../../../virt/kvm/vfio.c:89:7:
   error: attribute declaration must precede definition [-Werror,-Wignored-attributes]
           fn = symbol_get(vfio_file_iommu_group);
                ^
   include/linux/module.h:805:60: note: expanded from macro 'symbol_get'
   #define symbol_get(x) ({ extern typeof(x) x __attribute__((weak,visibility("hidden"))); &(x); })
                                                              ^
   include/linux/vfio.h:294:35: note: previous definition is here
   static inline struct iommu_group *vfio_file_iommu_group(struct file *file)
                                     ^
   arch/powerpc/kvm/../../../virt/kvm/vfio.c:89:7:
   error: attribute declaration must precede definition [-Werror,-Wignored-attributes]
           fn = symbol_get(vfio_file_iommu_group);
                ^
   include/linux/module.h:805:65: note: expanded from macro 'symbol_get'
   #define symbol_get(x) ({ extern typeof(x) x __attribute__((weak,visibility("hidden"))); &(x); })
                                                                   ^
   include/linux/vfio.h:294:35: note: previous definition is here
   static inline struct iommu_group *vfio_file_iommu_group(struct file *file)
                                     ^
   2 errors generated.

Although KVM is firmly in the wrong (there is zero reason for KVM to build
virt/kvm/vfio.c when VFIO is disabled), fudge around the error in VFIO as
the stub is unnecessary and doesn't serve its intended purpose (KVM is the
only external user of vfio_file_iommu_group()), and there is an in-flight
series to clean up the entire KVM<->VFIO interaction, i.e. fixing this in
KVM would result in more churn in the long run, and the stub needs to go
away regardless.

Reported-by: kernel test robot <lkp@intel.com>
Closes: https://lore.kernel.org/oe-kbuild-all/202308251949.5IiaV0sz-lkp@intel.com
Closes: https://lore.kernel.org/oe-kbuild-all/202309030741.82aLACDG-lkp@intel.com
Closes: https://lore.kernel.org/oe-kbuild-all/202309110914.QLH0LU6L-lkp@intel.com
Link: https://lore.kernel.org/all/0-v1-08396538817d+13c5-vfio_kvm_kconfig_jgg@nvidia.com
Link: https://lore.kernel.org/all/20230916003118.2540661-1-seanjc@google.com
Cc: Nick Desaulniers <ndesaulniers@google.com>
Cc: Jason Gunthorpe <jgg@nvidia.com>
Tested-by: Michael Ellerman <mpe@ellerman.id.au>
Fixes: c1cce6d079b8 ("vfio: Compile vfio_group infrastructure optionally")
Signed-off-by: Sean Christopherson <seanjc@google.com>
Reviewed-by: Jason Gunthorpe <jgg@nvidia.com>
Link: https://lore.kernel.org/r/20231130001000.543240-1-seanjc@google.com
Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
---
 include/linux/vfio.h | 8 ++------
 1 file changed, 2 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/vfio.h b/include/linux/vfio.h
index 454e9295970c..a65b2513f8cd 100644
--- a/include/linux/vfio.h
+++ b/include/linux/vfio.h
@@ -289,16 +289,12 @@ void vfio_combine_iova_ranges(struct rb_root_cached *root, u32 cur_nodes,
 /*
  * External user API
  */
-#if IS_ENABLED(CONFIG_VFIO_GROUP)
 struct iommu_group *vfio_file_iommu_group(struct file *file);
+
+#if IS_ENABLED(CONFIG_VFIO_GROUP)
 bool vfio_file_is_group(struct file *file);
 bool vfio_file_has_dev(struct file *file, struct vfio_device *device);
 #else
-static inline struct iommu_group *vfio_file_iommu_group(struct file *file)
-{
-	return NULL;
-}
-
 static inline bool vfio_file_is_group(struct file *file)
 {
 	return false;
-- 
cgit v1.2.3


From 3717194f249227a3dfd8433bd9374cc7e0cf823d Mon Sep 17 00:00:00 2001
From: Tony Lindgren <tony@atomide.com>
Date: Wed, 29 Nov 2023 13:06:15 +0200
Subject: Input: gpio-keys - add system suspend support for dedicated wakeirqs

Some SoCs have a separate dedicated wake-up interrupt controller that can
be used to wake up the system from deeper idle states. We already support
configuring a separate interrupt for a gpio-keys button to be used with a
gpio line. However, we are lacking support system suspend for cases where
a separate interrupt needs to be used in deeper sleep modes.

Because of it's nature, gpio-keys does not know about the runtime PM state
of the button gpios, and may have several gpio buttons configured for each
gpio-keys device instance. Implementing runtime PM support for gpio-keys
does not help, and we cannot use drivers/base/power/wakeirq.c support. We
need to implement custom wakeirq support for gpio-keys.

For handling a dedicated wakeirq for system suspend, we enable and disable
it with gpio_keys_enable_wakeup() and gpio_keys_disable_wakeup() that we
already use based on device_may_wakeup().

Some systems may have a dedicated wakeirq that can also be used as the
main interrupt, this is already working for gpio-keys. Let's add some
wakeirq related comments while at it as the usage with a gpio line and
separate interrupt line may not be obvious.

Tested-by: Dhruva Gole <d-gole@ti.com>
Signed-off-by: Tony Lindgren <tony@atomide.com>
Link: https://lore.kernel.org/r/20231129110618.27551-2-tony@atomide.com
Signed-off-by: Dmitry Torokhov <dmitry.torokhov@gmail.com>
---
 drivers/input/keyboard/gpio_keys.c | 69 +++++++++++++++++++++++++++++++++++---
 include/linux/gpio_keys.h          |  2 ++
 2 files changed, 67 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/input/keyboard/gpio_keys.c b/drivers/input/keyboard/gpio_keys.c
index 2e7c2c046e67..06c09b7addf8 100644
--- a/drivers/input/keyboard/gpio_keys.c
+++ b/drivers/input/keyboard/gpio_keys.c
@@ -45,7 +45,9 @@ struct gpio_button_data {
 	unsigned int software_debounce;	/* in msecs, for GPIO-driven buttons */
 
 	unsigned int irq;
+	unsigned int wakeirq;
 	unsigned int wakeup_trigger_type;
+
 	spinlock_t lock;
 	bool disabled;
 	bool key_pressed;
@@ -511,6 +513,7 @@ static int gpio_keys_setup_key(struct platform_device *pdev,
 	struct gpio_button_data *bdata = &ddata->data[idx];
 	irq_handler_t isr;
 	unsigned long irqflags;
+	const char *wakedesc;
 	int irq;
 	int error;
 
@@ -575,6 +578,14 @@ static int gpio_keys_setup_key(struct platform_device *pdev,
 					!gpiod_cansleep(bdata->gpiod);
 		}
 
+		/*
+		 * If an interrupt was specified, use it instead of the gpio
+		 * interrupt and use the gpio for reading the state. A separate
+		 * interrupt may be used as the main button interrupt for
+		 * runtime PM to detect events also in deeper idle states. If a
+		 * dedicated wakeirq is used for system suspend only, see below
+		 * for bdata->wakeirq setup.
+		 */
 		if (button->irq) {
 			bdata->irq = button->irq;
 		} else {
@@ -672,6 +683,36 @@ static int gpio_keys_setup_key(struct platform_device *pdev,
 		return error;
 	}
 
+	if (!button->wakeirq)
+		return 0;
+
+	/* Use :wakeup suffix like drivers/base/power/wakeirq.c does */
+	wakedesc = devm_kasprintf(dev, GFP_KERNEL, "%s:wakeup", desc);
+	if (!wakedesc)
+		return -ENOMEM;
+
+	bdata->wakeirq = button->wakeirq;
+	irqflags |= IRQF_NO_SUSPEND;
+
+	/*
+	 * Wakeirq shares the handler with the main interrupt, it's only
+	 * active during system suspend. See gpio_keys_button_enable_wakeup()
+	 * and gpio_keys_button_disable_wakeup().
+	 */
+	error = devm_request_any_context_irq(dev, bdata->wakeirq, isr,
+					     irqflags, wakedesc, bdata);
+	if (error < 0) {
+		dev_err(dev, "Unable to claim wakeirq %d; error %d\n",
+			bdata->irq, error);
+		return error;
+	}
+
+	/*
+	 * Disable wakeirq until suspend. IRQF_NO_AUTOEN won't work if
+	 * IRQF_SHARED was set based on !button->can_disable.
+	 */
+	disable_irq(bdata->wakeirq);
+
 	return 0;
 }
 
@@ -728,7 +769,7 @@ gpio_keys_get_devtree_pdata(struct device *dev)
 	struct gpio_keys_platform_data *pdata;
 	struct gpio_keys_button *button;
 	struct fwnode_handle *child;
-	int nbuttons;
+	int nbuttons, irq;
 
 	nbuttons = device_get_child_node_count(dev);
 	if (nbuttons == 0)
@@ -750,9 +791,19 @@ gpio_keys_get_devtree_pdata(struct device *dev)
 	device_property_read_string(dev, "label", &pdata->name);
 
 	device_for_each_child_node(dev, child) {
-		if (is_of_node(child))
-			button->irq =
-				irq_of_parse_and_map(to_of_node(child), 0);
+		if (is_of_node(child)) {
+			irq = of_irq_get_byname(to_of_node(child), "irq");
+			if (irq > 0)
+				button->irq = irq;
+
+			irq = of_irq_get_byname(to_of_node(child), "wakeup");
+			if (irq > 0)
+				button->wakeirq = irq;
+
+			if (!button->irq && !button->wakeirq)
+				button->irq =
+					irq_of_parse_and_map(to_of_node(child), 0);
+		}
 
 		if (fwnode_property_read_u32(child, "linux,code",
 					     &button->code)) {
@@ -921,6 +972,11 @@ gpio_keys_button_enable_wakeup(struct gpio_button_data *bdata)
 		}
 	}
 
+	if (bdata->wakeirq) {
+		enable_irq(bdata->wakeirq);
+		disable_irq(bdata->irq);
+	}
+
 	return 0;
 }
 
@@ -929,6 +985,11 @@ gpio_keys_button_disable_wakeup(struct gpio_button_data *bdata)
 {
 	int error;
 
+	if (bdata->wakeirq) {
+		enable_irq(bdata->irq);
+		disable_irq(bdata->wakeirq);
+	}
+
 	/*
 	 * The trigger type is always both edges for gpio-based keys and we do
 	 * not support changing wakeup trigger for interrupt-based keys.
diff --git a/include/linux/gpio_keys.h b/include/linux/gpio_keys.h
index 3f84aeb81e48..80fa930b04c6 100644
--- a/include/linux/gpio_keys.h
+++ b/include/linux/gpio_keys.h
@@ -21,6 +21,7 @@ struct device;
  *			disable button via sysfs
  * @value:		axis value for %EV_ABS
  * @irq:		Irq number in case of interrupt keys
+ * @wakeirq:		Optional dedicated wake-up interrupt
  */
 struct gpio_keys_button {
 	unsigned int code;
@@ -34,6 +35,7 @@ struct gpio_keys_button {
 	bool can_disable;
 	int value;
 	unsigned int irq;
+	unsigned int wakeirq;
 };
 
 /**
-- 
cgit v1.2.3


From da2e08d4630ab04ee5b61515fe423c582b5c3be2 Mon Sep 17 00:00:00 2001
From: Kees Cook <keescook@chromium.org>
Date: Tue, 3 Oct 2023 16:18:38 -0700
Subject: i40e: Annotate struct i40e_qvlist_info with __counted_by

Prepare for the coming implementation by GCC and Clang of the __counted_by
attribute. Flexible array members annotated with __counted_by can have
their accesses bounds-checked at run-time via CONFIG_UBSAN_BOUNDS (for
array indexing) and CONFIG_FORTIFY_SOURCE (for strcpy/memcpy-family
functions).

As found with Coccinelle[1], add __counted_by for struct i40e_qvlist_info.

Cc: Tony Nguyen <anthony.l.nguyen@intel.com>
Cc: Shiraz Saleem <shiraz.saleem@intel.com>
Cc: Jakub Kicinski <kuba@kernel.org>
Cc: Jesse Brandeburg <jesse.brandeburg@intel.com>
Cc: Gurucharan G <gurucharanx.g@intel.com>
Cc: "Gustavo A. R. Silva" <gustavoars@kernel.org>
Link: https://github.com/kees/kernel-tools/blob/trunk/coccinelle/examples/counted_by.cocci [1]
Reviewed-by: "Gustavo A. R. Silva" <gustavoars@kernel.org>
Link: https://lore.kernel.org/r/20231003231838.work.510-kees@kernel.org
Signed-off-by: Kees Cook <keescook@chromium.org>
---
 include/linux/net/intel/i40e_client.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/net/intel/i40e_client.h b/include/linux/net/intel/i40e_client.h
index ed42bd5f639f..0aa4411528fc 100644
--- a/include/linux/net/intel/i40e_client.h
+++ b/include/linux/net/intel/i40e_client.h
@@ -45,7 +45,7 @@ struct i40e_qv_info {
 
 struct i40e_qvlist_info {
 	u32 num_vectors;
-	struct i40e_qv_info qv_info[];
+	struct i40e_qv_info qv_info[] __counted_by(num_vectors);
 };
 
 
-- 
cgit v1.2.3


From 6a3afb6ac6dfab158ebdd4b87941178f58c8939f Mon Sep 17 00:00:00 2001
From: Zhang Yi <yi.zhang@huawei.com>
Date: Wed, 29 Nov 2023 19:47:40 +0800
Subject: jbd2: increase the journal IO's priority

Current jbd2 only add REQ_SYNC for descriptor block, metadata log
buffer, commit buffer and superblock buffer, the submitted IO could be
throttled by writeback throttle in block layer, that could lead to
priority inversion in some cases. The log IO looks like a kind of high
priority metadata IO, so it should not be throttled by WBT like QOS
policies in block layer, let's add REQ_SYNC | REQ_IDLE to exempt from
writeback throttle, and also add REQ_META together indicates it's a
metadata IO.

Signed-off-by: Zhang Yi <yi.zhang@huawei.com>
Reviewed-by: Jan Kara <jack@suse.cz>
Link: https://lore.kernel.org/r/20231129114740.2686201-2-yi.zhang@huaweicloud.com
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
---
 fs/jbd2/commit.c     |  9 +++++----
 fs/jbd2/journal.c    | 20 +++++++++++---------
 include/linux/jbd2.h |  3 +++
 3 files changed, 19 insertions(+), 13 deletions(-)

(limited to 'include/linux')

diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c
index 8d6f934c3d95..9bdb377a348f 100644
--- a/fs/jbd2/commit.c
+++ b/fs/jbd2/commit.c
@@ -119,7 +119,7 @@ static int journal_submit_commit_record(journal_t *journal,
 	struct commit_header *tmp;
 	struct buffer_head *bh;
 	struct timespec64 now;
-	blk_opf_t write_flags = REQ_OP_WRITE | REQ_SYNC;
+	blk_opf_t write_flags = REQ_OP_WRITE | JBD2_JOURNAL_REQ_FLAGS;
 
 	*cbh = NULL;
 
@@ -395,8 +395,7 @@ void jbd2_journal_commit_transaction(journal_t *journal)
 		 */
 		jbd2_journal_update_sb_log_tail(journal,
 						journal->j_tail_sequence,
-						journal->j_tail,
-						REQ_SYNC);
+						journal->j_tail, 0);
 		mutex_unlock(&journal->j_checkpoint_mutex);
 	} else {
 		jbd2_debug(3, "superblock not updated\n");
@@ -715,6 +714,7 @@ start_journal_io:
 
 			for (i = 0; i < bufs; i++) {
 				struct buffer_head *bh = wbuf[i];
+
 				/*
 				 * Compute checksum.
 				 */
@@ -727,7 +727,8 @@ start_journal_io:
 				clear_buffer_dirty(bh);
 				set_buffer_uptodate(bh);
 				bh->b_end_io = journal_end_buffer_io_sync;
-				submit_bh(REQ_OP_WRITE | REQ_SYNC, bh);
+				submit_bh(REQ_OP_WRITE | JBD2_JOURNAL_REQ_FLAGS,
+					  bh);
 			}
 			cond_resched();
 
diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
index 71b30f6a662d..206cb53ef2b0 100644
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -1100,8 +1100,7 @@ int __jbd2_update_log_tail(journal_t *journal, tid_t tid, unsigned long block)
 	 * space and if we lose sb update during power failure we'd replay
 	 * old transaction with possibly newly overwritten data.
 	 */
-	ret = jbd2_journal_update_sb_log_tail(journal, tid, block,
-					      REQ_SYNC | REQ_FUA);
+	ret = jbd2_journal_update_sb_log_tail(journal, tid, block, REQ_FUA);
 	if (ret)
 		goto out;
 
@@ -1775,8 +1774,7 @@ static int journal_reset(journal_t *journal)
 		 */
 		jbd2_journal_update_sb_log_tail(journal,
 						journal->j_tail_sequence,
-						journal->j_tail,
-						REQ_SYNC | REQ_FUA);
+						journal->j_tail, REQ_FUA);
 		mutex_unlock(&journal->j_checkpoint_mutex);
 	}
 	return jbd2_journal_start_thread(journal);
@@ -1798,6 +1796,11 @@ static int jbd2_write_superblock(journal_t *journal, blk_opf_t write_flags)
 		return -EIO;
 	}
 
+	/*
+	 * Always set high priority flags to exempt from block layer's
+	 * QOS policies, e.g. writeback throttle.
+	 */
+	write_flags |= JBD2_JOURNAL_REQ_FLAGS;
 	if (!(journal->j_flags & JBD2_BARRIER))
 		write_flags &= ~(REQ_FUA | REQ_PREFLUSH);
 
@@ -2052,7 +2055,7 @@ void jbd2_journal_update_sb_errno(journal_t *journal)
 	jbd2_debug(1, "JBD2: updating superblock error (errno %d)\n", errcode);
 	sb->s_errno    = cpu_to_be32(errcode);
 
-	jbd2_write_superblock(journal, REQ_SYNC | REQ_FUA);
+	jbd2_write_superblock(journal, REQ_FUA);
 }
 EXPORT_SYMBOL(jbd2_journal_update_sb_errno);
 
@@ -2173,8 +2176,7 @@ int jbd2_journal_destroy(journal_t *journal)
 				++journal->j_transaction_sequence;
 			write_unlock(&journal->j_state_lock);
 
-			jbd2_mark_journal_empty(journal,
-					REQ_SYNC | REQ_PREFLUSH | REQ_FUA);
+			jbd2_mark_journal_empty(journal, REQ_PREFLUSH | REQ_FUA);
 			mutex_unlock(&journal->j_checkpoint_mutex);
 		} else
 			err = -EIO;
@@ -2475,7 +2477,7 @@ int jbd2_journal_flush(journal_t *journal, unsigned int flags)
 	 * the magic code for a fully-recovered superblock.  Any future
 	 * commits of data to the journal will restore the current
 	 * s_start value. */
-	jbd2_mark_journal_empty(journal, REQ_SYNC | REQ_FUA);
+	jbd2_mark_journal_empty(journal, REQ_FUA);
 
 	if (flags)
 		err = __jbd2_journal_erase(journal, flags);
@@ -2521,7 +2523,7 @@ int jbd2_journal_wipe(journal_t *journal, int write)
 	if (write) {
 		/* Lock to make assertions happy... */
 		mutex_lock_io(&journal->j_checkpoint_mutex);
-		jbd2_mark_journal_empty(journal, REQ_SYNC | REQ_FUA);
+		jbd2_mark_journal_empty(journal, REQ_FUA);
 		mutex_unlock(&journal->j_checkpoint_mutex);
 	}
 
diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h
index 6dcbb4eb80fb..beb30719ee16 100644
--- a/include/linux/jbd2.h
+++ b/include/linux/jbd2.h
@@ -1374,6 +1374,9 @@ JBD2_FEATURE_INCOMPAT_FUNCS(csum2,		CSUM_V2)
 JBD2_FEATURE_INCOMPAT_FUNCS(csum3,		CSUM_V3)
 JBD2_FEATURE_INCOMPAT_FUNCS(fast_commit,	FAST_COMMIT)
 
+/* Journal high priority write IO operation flags */
+#define JBD2_JOURNAL_REQ_FLAGS		(REQ_META | REQ_SYNC | REQ_IDLE)
+
 /*
  * Journal flag definitions
  */
-- 
cgit v1.2.3


From d839a656d0f3caca9f96e9bf912fd394ac6a11bc Mon Sep 17 00:00:00 2001
From: JP Kobryn <inwardvessel@gmail.com>
Date: Fri, 1 Dec 2023 14:53:55 +0900
Subject: kprobes: consistent rcu api usage for kretprobe holder

It seems that the pointer-to-kretprobe "rp" within the kretprobe_holder is
RCU-managed, based on the (non-rethook) implementation of get_kretprobe().
The thought behind this patch is to make use of the RCU API where possible
when accessing this pointer so that the needed barriers are always in place
and to self-document the code.

The __rcu annotation to "rp" allows for sparse RCU checking. Plain writes
done to the "rp" pointer are changed to make use of the RCU macro for
assignment. For the single read, the implementation of get_kretprobe()
is simplified by making use of an RCU macro which accomplishes the same,
but note that the log warning text will be more generic.

I did find that there is a difference in assembly generated between the
usage of the RCU macros vs without. For example, on arm64, when using
rcu_assign_pointer(), the corresponding store instruction is a
store-release (STLR) which has an implicit barrier. When normal assignment
is done, a regular store (STR) is found. In the macro case, this seems to
be a result of rcu_assign_pointer() using smp_store_release() when the
value to write is not NULL.

Link: https://lore.kernel.org/all/20231122132058.3359-1-inwardvessel@gmail.com/

Fixes: d741bf41d7c7 ("kprobes: Remove kretprobe hash")
Cc: stable@vger.kernel.org
Signed-off-by: JP Kobryn <inwardvessel@gmail.com>
Acked-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>
Signed-off-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>
---
 include/linux/kprobes.h | 7 ++-----
 kernel/kprobes.c        | 4 ++--
 2 files changed, 4 insertions(+), 7 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/kprobes.h b/include/linux/kprobes.h
index ab1da3142b06..64672bace560 100644
--- a/include/linux/kprobes.h
+++ b/include/linux/kprobes.h
@@ -139,7 +139,7 @@ static inline bool kprobe_ftrace(struct kprobe *p)
  *
  */
 struct kretprobe_holder {
-	struct kretprobe	*rp;
+	struct kretprobe __rcu *rp;
 	struct objpool_head	pool;
 };
 
@@ -245,10 +245,7 @@ unsigned long kretprobe_trampoline_handler(struct pt_regs *regs,
 
 static nokprobe_inline struct kretprobe *get_kretprobe(struct kretprobe_instance *ri)
 {
-	RCU_LOCKDEP_WARN(!rcu_read_lock_any_held(),
-		"Kretprobe is accessed from instance under preemptive context");
-
-	return READ_ONCE(ri->rph->rp);
+	return rcu_dereference_check(ri->rph->rp, rcu_read_lock_any_held());
 }
 
 static nokprobe_inline unsigned long get_kretprobe_retaddr(struct kretprobe_instance *ri)
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index 075a632e6c7c..d5a0ee40bf66 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -2252,7 +2252,7 @@ int register_kretprobe(struct kretprobe *rp)
 		rp->rph = NULL;
 		return -ENOMEM;
 	}
-	rp->rph->rp = rp;
+	rcu_assign_pointer(rp->rph->rp, rp);
 	rp->nmissed = 0;
 	/* Establish function entry probe point */
 	ret = register_kprobe(&rp->kp);
@@ -2300,7 +2300,7 @@ void unregister_kretprobes(struct kretprobe **rps, int num)
 #ifdef CONFIG_KRETPROBE_ON_RETHOOK
 		rethook_free(rps[i]->rh);
 #else
-		rps[i]->rph->rp = NULL;
+		rcu_assign_pointer(rps[i]->rph->rp, NULL);
 #endif
 	}
 	mutex_unlock(&kprobe_mutex);
-- 
cgit v1.2.3


From a1461f1fd6cfdc4b8917c9d4a91e92605d1f28dc Mon Sep 17 00:00:00 2001
From: "Masami Hiramatsu (Google)" <mhiramat@kernel.org>
Date: Fri, 1 Dec 2023 14:53:56 +0900
Subject: rethook: Use __rcu pointer for rethook::handler

Since the rethook::handler is an RCU-maganged pointer so that it will
notice readers the rethook is stopped (unregistered) or not, it should
be an __rcu pointer and use appropriate functions to be accessed. This
will use appropriate memory barrier when accessing it. OTOH,
rethook::data is never changed, so we don't need to check it in
get_kretprobe().

NOTE: To avoid sparse warning, rethook::handler is defined by a raw
function pointer type with __rcu instead of rethook_handler_t.

Link: https://lore.kernel.org/all/170126066201.398836.837498688669005979.stgit@devnote2/

Fixes: 54ecbe6f1ed5 ("rethook: Add a generic return hook")
Cc: stable@vger.kernel.org
Reported-by: kernel test robot <lkp@intel.com>
Closes: https://lore.kernel.org/oe-kbuild-all/202311241808.rv9ceuAh-lkp@intel.com/
Tested-by: JP Kobryn <inwardvessel@gmail.com>
Signed-off-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>
---
 include/linux/kprobes.h |  6 ++----
 include/linux/rethook.h |  7 ++++++-
 kernel/trace/rethook.c  | 23 ++++++++++++++---------
 3 files changed, 22 insertions(+), 14 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/kprobes.h b/include/linux/kprobes.h
index 64672bace560..0ff44d6633e3 100644
--- a/include/linux/kprobes.h
+++ b/include/linux/kprobes.h
@@ -197,10 +197,8 @@ extern int arch_trampoline_kprobe(struct kprobe *p);
 #ifdef CONFIG_KRETPROBE_ON_RETHOOK
 static nokprobe_inline struct kretprobe *get_kretprobe(struct kretprobe_instance *ri)
 {
-	RCU_LOCKDEP_WARN(!rcu_read_lock_any_held(),
-		"Kretprobe is accessed from instance under preemptive context");
-
-	return (struct kretprobe *)READ_ONCE(ri->node.rethook->data);
+	/* rethook::data is non-changed field, so that you can access it freely. */
+	return (struct kretprobe *)ri->node.rethook->data;
 }
 static nokprobe_inline unsigned long get_kretprobe_retaddr(struct kretprobe_instance *ri)
 {
diff --git a/include/linux/rethook.h b/include/linux/rethook.h
index ce69b2b7bc35..ba60962805f6 100644
--- a/include/linux/rethook.h
+++ b/include/linux/rethook.h
@@ -28,7 +28,12 @@ typedef void (*rethook_handler_t) (struct rethook_node *, void *, unsigned long,
  */
 struct rethook {
 	void			*data;
-	rethook_handler_t	handler;
+	/*
+	 * To avoid sparse warnings, this uses a raw function pointer with
+	 * __rcu, instead of rethook_handler_t. But this must be same as
+	 * rethook_handler_t.
+	 */
+	void (__rcu *handler) (struct rethook_node *, void *, unsigned long, struct pt_regs *);
 	struct objpool_head	pool;
 	struct rcu_head		rcu;
 };
diff --git a/kernel/trace/rethook.c b/kernel/trace/rethook.c
index 6fd7d4ecbbc6..fa03094e9e69 100644
--- a/kernel/trace/rethook.c
+++ b/kernel/trace/rethook.c
@@ -48,7 +48,7 @@ static void rethook_free_rcu(struct rcu_head *head)
  */
 void rethook_stop(struct rethook *rh)
 {
-	WRITE_ONCE(rh->handler, NULL);
+	rcu_assign_pointer(rh->handler, NULL);
 }
 
 /**
@@ -63,7 +63,7 @@ void rethook_stop(struct rethook *rh)
  */
 void rethook_free(struct rethook *rh)
 {
-	WRITE_ONCE(rh->handler, NULL);
+	rethook_stop(rh);
 
 	call_rcu(&rh->rcu, rethook_free_rcu);
 }
@@ -82,6 +82,12 @@ static int rethook_fini_pool(struct objpool_head *head, void *context)
 	return 0;
 }
 
+static inline rethook_handler_t rethook_get_handler(struct rethook *rh)
+{
+	return (rethook_handler_t)rcu_dereference_check(rh->handler,
+							rcu_read_lock_any_held());
+}
+
 /**
  * rethook_alloc() - Allocate struct rethook.
  * @data: a data to pass the @handler when hooking the return.
@@ -107,7 +113,7 @@ struct rethook *rethook_alloc(void *data, rethook_handler_t handler,
 		return ERR_PTR(-ENOMEM);
 
 	rh->data = data;
-	rh->handler = handler;
+	rcu_assign_pointer(rh->handler, handler);
 
 	/* initialize the objpool for rethook nodes */
 	if (objpool_init(&rh->pool, num, size, GFP_KERNEL, rh,
@@ -135,9 +141,10 @@ static void free_rethook_node_rcu(struct rcu_head *head)
  */
 void rethook_recycle(struct rethook_node *node)
 {
-	lockdep_assert_preemption_disabled();
+	rethook_handler_t handler;
 
-	if (likely(READ_ONCE(node->rethook->handler)))
+	handler = rethook_get_handler(node->rethook);
+	if (likely(handler))
 		objpool_push(node, &node->rethook->pool);
 	else
 		call_rcu(&node->rcu, free_rethook_node_rcu);
@@ -153,9 +160,7 @@ NOKPROBE_SYMBOL(rethook_recycle);
  */
 struct rethook_node *rethook_try_get(struct rethook *rh)
 {
-	rethook_handler_t handler = READ_ONCE(rh->handler);
-
-	lockdep_assert_preemption_disabled();
+	rethook_handler_t handler = rethook_get_handler(rh);
 
 	/* Check whether @rh is going to be freed. */
 	if (unlikely(!handler))
@@ -300,7 +305,7 @@ unsigned long rethook_trampoline_handler(struct pt_regs *regs,
 		rhn = container_of(first, struct rethook_node, llist);
 		if (WARN_ON_ONCE(rhn->frame != frame))
 			break;
-		handler = READ_ONCE(rhn->rethook->handler);
+		handler = rethook_get_handler(rhn->rethook);
 		if (handler)
 			handler(rhn, rhn->rethook->data,
 				correct_ret_addr, regs);
-- 
cgit v1.2.3


From df16c1c51d8166958f533c0c886766f7ee9dd50f Mon Sep 17 00:00:00 2001
From: Andrew Halaney <ahalaney@redhat.com>
Date: Mon, 27 Nov 2023 15:41:10 -0600
Subject: net: phy: mdio_device: Reset device only when necessary

Currently the phy reset sequence is as shown below for a
devicetree described mdio phy on boot:

1. Assert the phy_device's reset as part of registering
2. Deassert the phy_device's reset as part of registering
3. Deassert the phy_device's reset as part of phy_probe
4. Deassert the phy_device's reset as part of phy_hw_init

The extra two deasserts include waiting the deassert delay afterwards,
which is adding unnecessary delay.

This applies to both possible types of resets (reset controller
reference and a reset gpio) that can be used.

Here's some snipped tracing output using the following command line
params "trace_event=gpio:* trace_options=stacktrace" illustrating
the reset handling and where its coming from:

    /* Assert */
       systemd-udevd-283     [002] .....     6.780434: gpio_value: 544 set 0
       systemd-udevd-283     [002] .....     6.783849: <stack trace>
     => gpiod_set_raw_value_commit
     => gpiod_set_value_nocheck
     => gpiod_set_value_cansleep
     => mdio_device_reset
     => mdiobus_register_device
     => phy_device_register
     => fwnode_mdiobus_phy_device_register
     => fwnode_mdiobus_register_phy
     => __of_mdiobus_register
     => stmmac_mdio_register
     => stmmac_dvr_probe
     => stmmac_pltfr_probe
     => devm_stmmac_pltfr_probe
     => qcom_ethqos_probe
     => platform_probe

    /* Deassert */
       systemd-udevd-283     [002] .....     6.802480: gpio_value: 544 set 1
       systemd-udevd-283     [002] .....     6.805886: <stack trace>
     => gpiod_set_raw_value_commit
     => gpiod_set_value_nocheck
     => gpiod_set_value_cansleep
     => mdio_device_reset
     => phy_device_register
     => fwnode_mdiobus_phy_device_register
     => fwnode_mdiobus_register_phy
     => __of_mdiobus_register
     => stmmac_mdio_register
     => stmmac_dvr_probe
     => stmmac_pltfr_probe
     => devm_stmmac_pltfr_probe
     => qcom_ethqos_probe
     => platform_probe

    /* Deassert */
       systemd-udevd-283     [002] .....     6.882601: gpio_value: 544 set 1
       systemd-udevd-283     [002] .....     6.886014: <stack trace>
     => gpiod_set_raw_value_commit
     => gpiod_set_value_nocheck
     => gpiod_set_value_cansleep
     => mdio_device_reset
     => phy_probe
     => really_probe
     => __driver_probe_device
     => driver_probe_device
     => __device_attach_driver
     => bus_for_each_drv
     => __device_attach
     => device_initial_probe
     => bus_probe_device
     => device_add
     => phy_device_register
     => fwnode_mdiobus_phy_device_register
     => fwnode_mdiobus_register_phy
     => __of_mdiobus_register
     => stmmac_mdio_register
     => stmmac_dvr_probe
     => stmmac_pltfr_probe
     => devm_stmmac_pltfr_probe
     => qcom_ethqos_probe
     => platform_probe

    /* Deassert */
      NetworkManager-477     [000] .....     7.023144: gpio_value: 544 set 1
      NetworkManager-477     [000] .....     7.026596: <stack trace>
     => gpiod_set_raw_value_commit
     => gpiod_set_value_nocheck
     => gpiod_set_value_cansleep
     => mdio_device_reset
     => phy_init_hw
     => phy_attach_direct
     => phylink_fwnode_phy_connect
     => __stmmac_open
     => stmmac_open

There's a lot of paths where the device is getting its reset
asserted and deasserted. Let's track the state and only actually
do the assert/deassert when it changes.

Reported-by: Sagar Cheluvegowda <quic_scheluve@quicinc.com>
Signed-off-by: Andrew Halaney <ahalaney@redhat.com>
Reviewed-by: Andrew Lunn <andrew@lunn.ch>
Link: https://lore.kernel.org/r/20231127-net-phy-reset-once-v2-1-448e8658779e@redhat.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 drivers/net/phy/mdio_device.c | 6 ++++++
 drivers/net/phy/phy_device.c  | 1 +
 include/linux/mdio.h          | 1 +
 3 files changed, 8 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/net/phy/mdio_device.c b/drivers/net/phy/mdio_device.c
index 044828d081d2..73f6539b9e50 100644
--- a/drivers/net/phy/mdio_device.c
+++ b/drivers/net/phy/mdio_device.c
@@ -62,6 +62,7 @@ struct mdio_device *mdio_device_create(struct mii_bus *bus, int addr)
 	mdiodev->device_remove = mdio_device_remove;
 	mdiodev->bus = bus;
 	mdiodev->addr = addr;
+	mdiodev->reset_state = -1;
 
 	dev_set_name(&mdiodev->dev, PHY_ID_FMT, bus->id, addr);
 
@@ -122,6 +123,9 @@ void mdio_device_reset(struct mdio_device *mdiodev, int value)
 	if (!mdiodev->reset_gpio && !mdiodev->reset_ctrl)
 		return;
 
+	if (mdiodev->reset_state == value)
+		return;
+
 	if (mdiodev->reset_gpio)
 		gpiod_set_value_cansleep(mdiodev->reset_gpio, value);
 
@@ -135,6 +139,8 @@ void mdio_device_reset(struct mdio_device *mdiodev, int value)
 	d = value ? mdiodev->reset_assert_delay : mdiodev->reset_deassert_delay;
 	if (d)
 		fsleep(d);
+
+	mdiodev->reset_state = value;
 }
 EXPORT_SYMBOL(mdio_device_reset);
 
diff --git a/drivers/net/phy/phy_device.c b/drivers/net/phy/phy_device.c
index 400fb09d9cd6..d8e9335d415c 100644
--- a/drivers/net/phy/phy_device.c
+++ b/drivers/net/phy/phy_device.c
@@ -654,6 +654,7 @@ struct phy_device *phy_device_create(struct mii_bus *bus, int addr, u32 phy_id,
 	mdiodev->flags = MDIO_DEVICE_FLAG_PHY;
 	mdiodev->device_free = phy_mdio_device_free;
 	mdiodev->device_remove = phy_mdio_device_remove;
+	mdiodev->reset_state = -1;
 
 	dev->speed = SPEED_UNKNOWN;
 	dev->duplex = DUPLEX_UNKNOWN;
diff --git a/include/linux/mdio.h b/include/linux/mdio.h
index 007fd9c3e4b6..79ceee3c8673 100644
--- a/include/linux/mdio.h
+++ b/include/linux/mdio.h
@@ -38,6 +38,7 @@ struct mdio_device {
 	/* Bus address of the MDIO device (0-31) */
 	int addr;
 	int flags;
+	int reset_state;
 	struct gpio_desc *reset_gpio;
 	struct reset_control *reset_ctrl;
 	unsigned int reset_assert_delay;
-- 
cgit v1.2.3


From 7232522e6cafdf466ed7649c14546fd07ccc1978 Mon Sep 17 00:00:00 2001
From: Amir Goldstein <amir73il@gmail.com>
Date: Thu, 30 Nov 2023 18:56:18 +0200
Subject: fanotify: store fsid in mark instead of in connector

Some filesystems like fuse and nfs have zero or non-unique fsid.
We would like to avoid reporting ambiguous fsid in events, so we need
to avoid marking objects with same fsid and different sb.

To make this easier to enforce, store the fsid in the marks of the group
instead of in the shared conenctor.

Signed-off-by: Amir Goldstein <amir73il@gmail.com>
Signed-off-by: Jan Kara <jack@suse.cz>
Message-Id: <20231130165619.3386452-2-amir73il@gmail.com>
---
 fs/notify/fanotify/fanotify.c      | 19 ++++----------
 fs/notify/fanotify/fanotify.h      | 10 ++++++++
 fs/notify/fanotify/fanotify_user.c | 18 ++++++++++---
 fs/notify/mark.c                   | 52 ++++++--------------------------------
 include/linux/fsnotify_backend.h   | 13 ++++------
 5 files changed, 42 insertions(+), 70 deletions(-)

(limited to 'include/linux')

diff --git a/fs/notify/fanotify/fanotify.c b/fs/notify/fanotify/fanotify.c
index 9dac7f6e72d2..aff1ab3c32aa 100644
--- a/fs/notify/fanotify/fanotify.c
+++ b/fs/notify/fanotify/fanotify.c
@@ -838,9 +838,8 @@ out:
 }
 
 /*
- * Get cached fsid of the filesystem containing the object from any connector.
- * All connectors are supposed to have the same fsid, but we do not verify that
- * here.
+ * Get cached fsid of the filesystem containing the object from any mark.
+ * All marks are supposed to have the same fsid, but we do not verify that here.
  */
 static __kernel_fsid_t fanotify_get_fsid(struct fsnotify_iter_info *iter_info)
 {
@@ -849,17 +848,9 @@ static __kernel_fsid_t fanotify_get_fsid(struct fsnotify_iter_info *iter_info)
 	__kernel_fsid_t fsid = {};
 
 	fsnotify_foreach_iter_mark_type(iter_info, mark, type) {
-		struct fsnotify_mark_connector *conn;
-
-		conn = READ_ONCE(mark->connector);
-		/* Mark is just getting destroyed or created? */
-		if (!conn)
-			continue;
-		if (!(conn->flags & FSNOTIFY_CONN_FLAG_HAS_FSID))
+		if (!(mark->flags & FSNOTIFY_MARK_FLAG_HAS_FSID))
 			continue;
-		/* Pairs with smp_wmb() in fsnotify_add_mark_list() */
-		smp_rmb();
-		fsid = conn->fsid;
+		fsid = FANOTIFY_MARK(mark)->fsid;
 		if (WARN_ON_ONCE(!fsid.val[0] && !fsid.val[1]))
 			continue;
 		return fsid;
@@ -1068,7 +1059,7 @@ static void fanotify_freeing_mark(struct fsnotify_mark *mark,
 
 static void fanotify_free_mark(struct fsnotify_mark *fsn_mark)
 {
-	kmem_cache_free(fanotify_mark_cache, fsn_mark);
+	kmem_cache_free(fanotify_mark_cache, FANOTIFY_MARK(fsn_mark));
 }
 
 const struct fsnotify_ops fanotify_fsnotify_ops = {
diff --git a/fs/notify/fanotify/fanotify.h b/fs/notify/fanotify/fanotify.h
index 6936671e148d..f3b9ef60f0c0 100644
--- a/fs/notify/fanotify/fanotify.h
+++ b/fs/notify/fanotify/fanotify.h
@@ -489,6 +489,16 @@ static inline unsigned int fanotify_event_hash_bucket(
 	return event->hash & FANOTIFY_HTABLE_MASK;
 }
 
+struct fanotify_mark {
+	struct fsnotify_mark fsn_mark;
+	__kernel_fsid_t fsid;
+};
+
+static inline struct fanotify_mark *FANOTIFY_MARK(struct fsnotify_mark *mark)
+{
+	return container_of(mark, struct fanotify_mark, fsn_mark);
+}
+
 static inline unsigned int fanotify_mark_user_flags(struct fsnotify_mark *mark)
 {
 	unsigned int mflags = 0;
diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c
index 4d765c72496f..e3d836d4d156 100644
--- a/fs/notify/fanotify/fanotify_user.c
+++ b/fs/notify/fanotify/fanotify_user.c
@@ -1199,6 +1199,7 @@ static struct fsnotify_mark *fanotify_add_new_mark(struct fsnotify_group *group,
 						   __kernel_fsid_t *fsid)
 {
 	struct ucounts *ucounts = group->fanotify_data.ucounts;
+	struct fanotify_mark *fan_mark;
 	struct fsnotify_mark *mark;
 	int ret;
 
@@ -1211,17 +1212,26 @@ static struct fsnotify_mark *fanotify_add_new_mark(struct fsnotify_group *group,
 	    !inc_ucount(ucounts->ns, ucounts->uid, UCOUNT_FANOTIFY_MARKS))
 		return ERR_PTR(-ENOSPC);
 
-	mark = kmem_cache_alloc(fanotify_mark_cache, GFP_KERNEL);
-	if (!mark) {
+	fan_mark = kmem_cache_alloc(fanotify_mark_cache, GFP_KERNEL);
+	if (!fan_mark) {
 		ret = -ENOMEM;
 		goto out_dec_ucounts;
 	}
 
+	mark = &fan_mark->fsn_mark;
 	fsnotify_init_mark(mark, group);
 	if (fan_flags & FAN_MARK_EVICTABLE)
 		mark->flags |= FSNOTIFY_MARK_FLAG_NO_IREF;
 
-	ret = fsnotify_add_mark_locked(mark, connp, obj_type, 0, fsid);
+	/* Cache fsid of filesystem containing the marked object */
+	if (fsid) {
+		fan_mark->fsid = *fsid;
+		mark->flags |= FSNOTIFY_MARK_FLAG_HAS_FSID;
+	} else {
+		fan_mark->fsid.val[0] = fan_mark->fsid.val[1] = 0;
+	}
+
+	ret = fsnotify_add_mark_locked(mark, connp, obj_type, 0);
 	if (ret) {
 		fsnotify_put_mark(mark);
 		goto out_dec_ucounts;
@@ -1935,7 +1945,7 @@ static int __init fanotify_user_setup(void)
 	BUILD_BUG_ON(HWEIGHT32(FANOTIFY_INIT_FLAGS) != 12);
 	BUILD_BUG_ON(HWEIGHT32(FANOTIFY_MARK_FLAGS) != 11);
 
-	fanotify_mark_cache = KMEM_CACHE(fsnotify_mark,
+	fanotify_mark_cache = KMEM_CACHE(fanotify_mark,
 					 SLAB_PANIC|SLAB_ACCOUNT);
 	fanotify_fid_event_cachep = KMEM_CACHE(fanotify_fid_event,
 					       SLAB_PANIC);
diff --git a/fs/notify/mark.c b/fs/notify/mark.c
index c74ef947447d..d6944ff86ffa 100644
--- a/fs/notify/mark.c
+++ b/fs/notify/mark.c
@@ -537,8 +537,7 @@ int fsnotify_compare_groups(struct fsnotify_group *a, struct fsnotify_group *b)
 }
 
 static int fsnotify_attach_connector_to_object(fsnotify_connp_t *connp,
-					       unsigned int obj_type,
-					       __kernel_fsid_t *fsid)
+					       unsigned int obj_type)
 {
 	struct fsnotify_mark_connector *conn;
 
@@ -550,14 +549,7 @@ static int fsnotify_attach_connector_to_object(fsnotify_connp_t *connp,
 	conn->flags = 0;
 	conn->type = obj_type;
 	conn->obj = connp;
-	/* Cache fsid of filesystem containing the object */
-	if (fsid) {
-		conn->fsid = *fsid;
-		conn->flags = FSNOTIFY_CONN_FLAG_HAS_FSID;
-	} else {
-		conn->fsid.val[0] = conn->fsid.val[1] = 0;
-		conn->flags = 0;
-	}
+	conn->flags = 0;
 	fsnotify_get_sb_connectors(conn);
 
 	/*
@@ -608,8 +600,7 @@ out:
  */
 static int fsnotify_add_mark_list(struct fsnotify_mark *mark,
 				  fsnotify_connp_t *connp,
-				  unsigned int obj_type,
-				  int add_flags, __kernel_fsid_t *fsid)
+				  unsigned int obj_type, int add_flags)
 {
 	struct fsnotify_mark *lmark, *last = NULL;
 	struct fsnotify_mark_connector *conn;
@@ -619,41 +610,15 @@ static int fsnotify_add_mark_list(struct fsnotify_mark *mark,
 	if (WARN_ON(!fsnotify_valid_obj_type(obj_type)))
 		return -EINVAL;
 
-	/* Backend is expected to check for zero fsid (e.g. tmpfs) */
-	if (fsid && WARN_ON_ONCE(!fsid->val[0] && !fsid->val[1]))
-		return -ENODEV;
-
 restart:
 	spin_lock(&mark->lock);
 	conn = fsnotify_grab_connector(connp);
 	if (!conn) {
 		spin_unlock(&mark->lock);
-		err = fsnotify_attach_connector_to_object(connp, obj_type,
-							  fsid);
+		err = fsnotify_attach_connector_to_object(connp, obj_type);
 		if (err)
 			return err;
 		goto restart;
-	} else if (fsid && !(conn->flags & FSNOTIFY_CONN_FLAG_HAS_FSID)) {
-		conn->fsid = *fsid;
-		/* Pairs with smp_rmb() in fanotify_get_fsid() */
-		smp_wmb();
-		conn->flags |= FSNOTIFY_CONN_FLAG_HAS_FSID;
-	} else if (fsid && (conn->flags & FSNOTIFY_CONN_FLAG_HAS_FSID) &&
-		   (fsid->val[0] != conn->fsid.val[0] ||
-		    fsid->val[1] != conn->fsid.val[1])) {
-		/*
-		 * Backend is expected to check for non uniform fsid
-		 * (e.g. btrfs), but maybe we missed something?
-		 * Only allow setting conn->fsid once to non zero fsid.
-		 * inotify and non-fid fanotify groups do not set nor test
-		 * conn->fsid.
-		 */
-		pr_warn_ratelimited("%s: fsid mismatch on object of type %u: "
-				    "%x.%x != %x.%x\n", __func__, conn->type,
-				    fsid->val[0], fsid->val[1],
-				    conn->fsid.val[0], conn->fsid.val[1]);
-		err = -EXDEV;
-		goto out_err;
 	}
 
 	/* is mark the first mark? */
@@ -703,7 +668,7 @@ out_err:
  */
 int fsnotify_add_mark_locked(struct fsnotify_mark *mark,
 			     fsnotify_connp_t *connp, unsigned int obj_type,
-			     int add_flags, __kernel_fsid_t *fsid)
+			     int add_flags)
 {
 	struct fsnotify_group *group = mark->group;
 	int ret = 0;
@@ -723,7 +688,7 @@ int fsnotify_add_mark_locked(struct fsnotify_mark *mark,
 	fsnotify_get_mark(mark); /* for g_list */
 	spin_unlock(&mark->lock);
 
-	ret = fsnotify_add_mark_list(mark, connp, obj_type, add_flags, fsid);
+	ret = fsnotify_add_mark_list(mark, connp, obj_type, add_flags);
 	if (ret)
 		goto err;
 
@@ -742,14 +707,13 @@ err:
 }
 
 int fsnotify_add_mark(struct fsnotify_mark *mark, fsnotify_connp_t *connp,
-		      unsigned int obj_type, int add_flags,
-		      __kernel_fsid_t *fsid)
+		      unsigned int obj_type, int add_flags)
 {
 	int ret;
 	struct fsnotify_group *group = mark->group;
 
 	fsnotify_group_lock(group);
-	ret = fsnotify_add_mark_locked(mark, connp, obj_type, add_flags, fsid);
+	ret = fsnotify_add_mark_locked(mark, connp, obj_type, add_flags);
 	fsnotify_group_unlock(group);
 	return ret;
 }
diff --git a/include/linux/fsnotify_backend.h b/include/linux/fsnotify_backend.h
index c0892d75ce33..a80b525ca653 100644
--- a/include/linux/fsnotify_backend.h
+++ b/include/linux/fsnotify_backend.h
@@ -472,10 +472,8 @@ typedef struct fsnotify_mark_connector __rcu *fsnotify_connp_t;
 struct fsnotify_mark_connector {
 	spinlock_t lock;
 	unsigned short type;	/* Type of object [lock] */
-#define FSNOTIFY_CONN_FLAG_HAS_FSID	0x01
 #define FSNOTIFY_CONN_FLAG_HAS_IREF	0x02
 	unsigned short flags;	/* flags [lock] */
-	__kernel_fsid_t fsid;	/* fsid of filesystem containing object */
 	union {
 		/* Object pointer [lock] */
 		fsnotify_connp_t *obj;
@@ -530,6 +528,7 @@ struct fsnotify_mark {
 #define FSNOTIFY_MARK_FLAG_IGNORED_SURV_MODIFY	0x0100
 #define FSNOTIFY_MARK_FLAG_NO_IREF		0x0200
 #define FSNOTIFY_MARK_FLAG_HAS_IGNORE_FLAGS	0x0400
+#define FSNOTIFY_MARK_FLAG_HAS_FSID		0x0800
 	unsigned int flags;		/* flags [mark->lock] */
 };
 
@@ -763,11 +762,10 @@ extern struct fsnotify_mark *fsnotify_find_mark(fsnotify_connp_t *connp,
 /* attach the mark to the object */
 extern int fsnotify_add_mark(struct fsnotify_mark *mark,
 			     fsnotify_connp_t *connp, unsigned int obj_type,
-			     int add_flags, __kernel_fsid_t *fsid);
+			     int add_flags);
 extern int fsnotify_add_mark_locked(struct fsnotify_mark *mark,
 				    fsnotify_connp_t *connp,
-				    unsigned int obj_type, int add_flags,
-				    __kernel_fsid_t *fsid);
+				    unsigned int obj_type, int add_flags);
 
 /* attach the mark to the inode */
 static inline int fsnotify_add_inode_mark(struct fsnotify_mark *mark,
@@ -775,15 +773,14 @@ static inline int fsnotify_add_inode_mark(struct fsnotify_mark *mark,
 					  int add_flags)
 {
 	return fsnotify_add_mark(mark, &inode->i_fsnotify_marks,
-				 FSNOTIFY_OBJ_TYPE_INODE, add_flags, NULL);
+				 FSNOTIFY_OBJ_TYPE_INODE, add_flags);
 }
 static inline int fsnotify_add_inode_mark_locked(struct fsnotify_mark *mark,
 						 struct inode *inode,
 						 int add_flags)
 {
 	return fsnotify_add_mark_locked(mark, &inode->i_fsnotify_marks,
-					FSNOTIFY_OBJ_TYPE_INODE, add_flags,
-					NULL);
+					FSNOTIFY_OBJ_TYPE_INODE, add_flags);
 }
 
 /* given a group and a mark, flag mark to be freed when all references are dropped */
-- 
cgit v1.2.3


From 30ad1938326bf9303ca38090339d948975a626f5 Mon Sep 17 00:00:00 2001
From: Amir Goldstein <amir73il@gmail.com>
Date: Thu, 30 Nov 2023 18:56:19 +0200
Subject: fanotify: allow "weak" fsid when watching a single filesystem

So far, fanotify returns -ENODEV or -EXDEV when trying to set a mark
on a filesystem with a "weak" fsid, namely, zero fsid (e.g. fuse), or
non-uniform fsid (e.g. btrfs non-root subvol).

When group is watching inodes all from the same filesystem (or subvol),
allow adding inode marks with "weak" fsid, because there is no ambiguity
regarding which filesystem reports the event.

The first mark added to a group determines if this group is single or
multi filesystem, depending on the fsid at the path of the added mark.

If the first mark added has a "strong" fsid, marks with "weak" fsid
cannot be added and vice versa.

If the first mark added has a "weak" fsid, following marks must have
the same "weak" fsid and the same sb as the first mark.

Signed-off-by: Amir Goldstein <amir73il@gmail.com>
Signed-off-by: Jan Kara <jack@suse.cz>
Message-Id: <20231130165619.3386452-3-amir73il@gmail.com>
---
 fs/notify/fanotify/fanotify.c      |  15 +----
 fs/notify/fanotify/fanotify.h      |   6 ++
 fs/notify/fanotify/fanotify_user.c | 112 ++++++++++++++++++++++++++++++-------
 include/linux/fsnotify_backend.h   |   1 +
 4 files changed, 101 insertions(+), 33 deletions(-)

(limited to 'include/linux')

diff --git a/fs/notify/fanotify/fanotify.c b/fs/notify/fanotify/fanotify.c
index aff1ab3c32aa..1e4def21811e 100644
--- a/fs/notify/fanotify/fanotify.c
+++ b/fs/notify/fanotify/fanotify.c
@@ -29,12 +29,6 @@ static unsigned int fanotify_hash_path(const struct path *path)
 		hash_ptr(path->mnt, FANOTIFY_EVENT_HASH_BITS);
 }
 
-static inline bool fanotify_fsid_equal(__kernel_fsid_t *fsid1,
-				       __kernel_fsid_t *fsid2)
-{
-	return fsid1->val[0] == fsid2->val[0] && fsid1->val[1] == fsid2->val[1];
-}
-
 static unsigned int fanotify_hash_fsid(__kernel_fsid_t *fsid)
 {
 	return hash_32(fsid->val[0], FANOTIFY_EVENT_HASH_BITS) ^
@@ -851,7 +845,8 @@ static __kernel_fsid_t fanotify_get_fsid(struct fsnotify_iter_info *iter_info)
 		if (!(mark->flags & FSNOTIFY_MARK_FLAG_HAS_FSID))
 			continue;
 		fsid = FANOTIFY_MARK(mark)->fsid;
-		if (WARN_ON_ONCE(!fsid.val[0] && !fsid.val[1]))
+		if (!(mark->flags & FSNOTIFY_MARK_FLAG_WEAK_FSID) &&
+		    WARN_ON_ONCE(!fsid.val[0] && !fsid.val[1]))
 			continue;
 		return fsid;
 	}
@@ -933,12 +928,8 @@ static int fanotify_handle_event(struct fsnotify_group *group, u32 mask,
 			return 0;
 	}
 
-	if (FAN_GROUP_FLAG(group, FANOTIFY_FID_BITS)) {
+	if (FAN_GROUP_FLAG(group, FANOTIFY_FID_BITS))
 		fsid = fanotify_get_fsid(iter_info);
-		/* Racing with mark destruction or creation? */
-		if (!fsid.val[0] && !fsid.val[1])
-			return 0;
-	}
 
 	event = fanotify_alloc_event(group, mask, data, data_type, dir,
 				     file_name, &fsid, match_mask);
diff --git a/fs/notify/fanotify/fanotify.h b/fs/notify/fanotify/fanotify.h
index f3b9ef60f0c0..e5ab33cae6a7 100644
--- a/fs/notify/fanotify/fanotify.h
+++ b/fs/notify/fanotify/fanotify.h
@@ -499,6 +499,12 @@ static inline struct fanotify_mark *FANOTIFY_MARK(struct fsnotify_mark *mark)
 	return container_of(mark, struct fanotify_mark, fsn_mark);
 }
 
+static inline bool fanotify_fsid_equal(__kernel_fsid_t *fsid1,
+				       __kernel_fsid_t *fsid2)
+{
+	return fsid1->val[0] == fsid2->val[0] && fsid1->val[1] == fsid2->val[1];
+}
+
 static inline unsigned int fanotify_mark_user_flags(struct fsnotify_mark *mark)
 {
 	unsigned int mflags = 0;
diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c
index e3d836d4d156..f83e7cc5ccf2 100644
--- a/fs/notify/fanotify/fanotify_user.c
+++ b/fs/notify/fanotify/fanotify_user.c
@@ -23,7 +23,7 @@
 
 #include <asm/ioctls.h>
 
-#include "../../mount.h"
+#include "../fsnotify.h"
 #include "../fdinfo.h"
 #include "fanotify.h"
 
@@ -1192,11 +1192,68 @@ static bool fanotify_mark_add_to_mask(struct fsnotify_mark *fsn_mark,
 	return recalc;
 }
 
+struct fan_fsid {
+	struct super_block *sb;
+	__kernel_fsid_t id;
+	bool weak;
+};
+
+static int fanotify_set_mark_fsid(struct fsnotify_group *group,
+				  struct fsnotify_mark *mark,
+				  struct fan_fsid *fsid)
+{
+	struct fsnotify_mark_connector *conn;
+	struct fsnotify_mark *old;
+	struct super_block *old_sb = NULL;
+
+	FANOTIFY_MARK(mark)->fsid = fsid->id;
+	mark->flags |= FSNOTIFY_MARK_FLAG_HAS_FSID;
+	if (fsid->weak)
+		mark->flags |= FSNOTIFY_MARK_FLAG_WEAK_FSID;
+
+	/* First mark added will determine if group is single or multi fsid */
+	if (list_empty(&group->marks_list))
+		return 0;
+
+	/* Find sb of an existing mark */
+	list_for_each_entry(old, &group->marks_list, g_list) {
+		conn = READ_ONCE(old->connector);
+		if (!conn)
+			continue;
+		old_sb = fsnotify_connector_sb(conn);
+		if (old_sb)
+			break;
+	}
+
+	/* Only detached marks left? */
+	if (!old_sb)
+		return 0;
+
+	/* Do not allow mixing of marks with weak and strong fsid */
+	if ((mark->flags ^ old->flags) & FSNOTIFY_MARK_FLAG_WEAK_FSID)
+		return -EXDEV;
+
+	/* Allow mixing of marks with strong fsid from different fs */
+	if (!fsid->weak)
+		return 0;
+
+	/* Do not allow mixing marks with weak fsid from different fs */
+	if (old_sb != fsid->sb)
+		return -EXDEV;
+
+	/* Do not allow mixing marks from different btrfs sub-volumes */
+	if (!fanotify_fsid_equal(&FANOTIFY_MARK(old)->fsid,
+				 &FANOTIFY_MARK(mark)->fsid))
+		return -EXDEV;
+
+	return 0;
+}
+
 static struct fsnotify_mark *fanotify_add_new_mark(struct fsnotify_group *group,
 						   fsnotify_connp_t *connp,
 						   unsigned int obj_type,
 						   unsigned int fan_flags,
-						   __kernel_fsid_t *fsid)
+						   struct fan_fsid *fsid)
 {
 	struct ucounts *ucounts = group->fanotify_data.ucounts;
 	struct fanotify_mark *fan_mark;
@@ -1225,20 +1282,21 @@ static struct fsnotify_mark *fanotify_add_new_mark(struct fsnotify_group *group,
 
 	/* Cache fsid of filesystem containing the marked object */
 	if (fsid) {
-		fan_mark->fsid = *fsid;
-		mark->flags |= FSNOTIFY_MARK_FLAG_HAS_FSID;
+		ret = fanotify_set_mark_fsid(group, mark, fsid);
+		if (ret)
+			goto out_put_mark;
 	} else {
 		fan_mark->fsid.val[0] = fan_mark->fsid.val[1] = 0;
 	}
 
 	ret = fsnotify_add_mark_locked(mark, connp, obj_type, 0);
-	if (ret) {
-		fsnotify_put_mark(mark);
-		goto out_dec_ucounts;
-	}
+	if (ret)
+		goto out_put_mark;
 
 	return mark;
 
+out_put_mark:
+	fsnotify_put_mark(mark);
 out_dec_ucounts:
 	if (!FAN_GROUP_FLAG(group, FAN_UNLIMITED_MARKS))
 		dec_ucount(ucounts, UCOUNT_FANOTIFY_MARKS);
@@ -1289,7 +1347,7 @@ static int fanotify_may_update_existing_mark(struct fsnotify_mark *fsn_mark,
 static int fanotify_add_mark(struct fsnotify_group *group,
 			     fsnotify_connp_t *connp, unsigned int obj_type,
 			     __u32 mask, unsigned int fan_flags,
-			     __kernel_fsid_t *fsid)
+			     struct fan_fsid *fsid)
 {
 	struct fsnotify_mark *fsn_mark;
 	bool recalc;
@@ -1337,7 +1395,7 @@ out:
 
 static int fanotify_add_vfsmount_mark(struct fsnotify_group *group,
 				      struct vfsmount *mnt, __u32 mask,
-				      unsigned int flags, __kernel_fsid_t *fsid)
+				      unsigned int flags, struct fan_fsid *fsid)
 {
 	return fanotify_add_mark(group, &real_mount(mnt)->mnt_fsnotify_marks,
 				 FSNOTIFY_OBJ_TYPE_VFSMOUNT, mask, flags, fsid);
@@ -1345,7 +1403,7 @@ static int fanotify_add_vfsmount_mark(struct fsnotify_group *group,
 
 static int fanotify_add_sb_mark(struct fsnotify_group *group,
 				struct super_block *sb, __u32 mask,
-				unsigned int flags, __kernel_fsid_t *fsid)
+				unsigned int flags, struct fan_fsid *fsid)
 {
 	return fanotify_add_mark(group, &sb->s_fsnotify_marks,
 				 FSNOTIFY_OBJ_TYPE_SB, mask, flags, fsid);
@@ -1353,7 +1411,7 @@ static int fanotify_add_sb_mark(struct fsnotify_group *group,
 
 static int fanotify_add_inode_mark(struct fsnotify_group *group,
 				   struct inode *inode, __u32 mask,
-				   unsigned int flags, __kernel_fsid_t *fsid)
+				   unsigned int flags, struct fan_fsid *fsid)
 {
 	pr_debug("%s: group=%p inode=%p\n", __func__, group, inode);
 
@@ -1564,20 +1622,25 @@ out_destroy_group:
 	return fd;
 }
 
-static int fanotify_test_fsid(struct dentry *dentry, __kernel_fsid_t *fsid)
+static int fanotify_test_fsid(struct dentry *dentry, unsigned int flags,
+			      struct fan_fsid *fsid)
 {
+	unsigned int mark_type = flags & FANOTIFY_MARK_TYPE_BITS;
 	__kernel_fsid_t root_fsid;
 	int err;
 
 	/*
 	 * Make sure dentry is not of a filesystem with zero fsid (e.g. fuse).
 	 */
-	err = vfs_get_fsid(dentry, fsid);
+	err = vfs_get_fsid(dentry, &fsid->id);
 	if (err)
 		return err;
 
-	if (!fsid->val[0] && !fsid->val[1])
-		return -ENODEV;
+	fsid->sb = dentry->d_sb;
+	if (!fsid->id.val[0] && !fsid->id.val[1]) {
+		err = -ENODEV;
+		goto weak;
+	}
 
 	/*
 	 * Make sure dentry is not of a filesystem subvolume (e.g. btrfs)
@@ -1587,11 +1650,18 @@ static int fanotify_test_fsid(struct dentry *dentry, __kernel_fsid_t *fsid)
 	if (err)
 		return err;
 
-	if (root_fsid.val[0] != fsid->val[0] ||
-	    root_fsid.val[1] != fsid->val[1])
-		return -EXDEV;
+	if (!fanotify_fsid_equal(&root_fsid, &fsid->id)) {
+		err = -EXDEV;
+		goto weak;
+	}
 
+	fsid->weak = false;
 	return 0;
+
+weak:
+	/* Allow weak fsid when marking inodes */
+	fsid->weak = true;
+	return (mark_type == FAN_MARK_INODE) ? 0 : err;
 }
 
 /* Check if filesystem can encode a unique fid */
@@ -1675,7 +1745,7 @@ static int do_fanotify_mark(int fanotify_fd, unsigned int flags, __u64 mask,
 	struct fsnotify_group *group;
 	struct fd f;
 	struct path path;
-	__kernel_fsid_t __fsid, *fsid = NULL;
+	struct fan_fsid __fsid, *fsid = NULL;
 	u32 valid_mask = FANOTIFY_EVENTS | FANOTIFY_EVENT_FLAGS;
 	unsigned int mark_type = flags & FANOTIFY_MARK_TYPE_BITS;
 	unsigned int mark_cmd = flags & FANOTIFY_MARK_CMD_BITS;
@@ -1827,7 +1897,7 @@ static int do_fanotify_mark(int fanotify_fd, unsigned int flags, __u64 mask,
 	}
 
 	if (fid_mode) {
-		ret = fanotify_test_fsid(path.dentry, &__fsid);
+		ret = fanotify_test_fsid(path.dentry, flags, &__fsid);
 		if (ret)
 			goto path_put_and_out;
 
diff --git a/include/linux/fsnotify_backend.h b/include/linux/fsnotify_backend.h
index a80b525ca653..7f63be5ca0f1 100644
--- a/include/linux/fsnotify_backend.h
+++ b/include/linux/fsnotify_backend.h
@@ -529,6 +529,7 @@ struct fsnotify_mark {
 #define FSNOTIFY_MARK_FLAG_NO_IREF		0x0200
 #define FSNOTIFY_MARK_FLAG_HAS_IGNORE_FLAGS	0x0400
 #define FSNOTIFY_MARK_FLAG_HAS_FSID		0x0800
+#define FSNOTIFY_MARK_FLAG_WEAK_FSID		0x1000
 	unsigned int flags;		/* flags [mark->lock] */
 };
 
-- 
cgit v1.2.3


From c66272a4c9932d6c585eef99039747617d48d662 Mon Sep 17 00:00:00 2001
From: Weili Qian <qianweili@huawei.com>
Date: Sat, 25 Nov 2023 19:50:10 +0800
Subject: crypto: hisilicon/qm - simplify the status of qm

The 'QM_INIT' and 'QM_CLOSE' status of qm and 'QP_INIT'
and 'QP_CLOSE' status of queue are not actually used. Currently,
driver only needs to switch status when the device or queue
is enabled or stopped, Therefore, remove unneeded status to
simplify driver. In addition, rename'QM_START to'QM_WORK' for
ease to understand.

Signed-off-by: Weili Qian <qianweili@huawei.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 Documentation/ABI/testing/debugfs-hisi-hpre |   2 +-
 Documentation/ABI/testing/debugfs-hisi-sec  |   2 +-
 Documentation/ABI/testing/debugfs-hisi-zip  |   2 +-
 drivers/crypto/hisilicon/debugfs.c          |   4 +
 drivers/crypto/hisilicon/qm.c               | 130 ++++------------------------
 drivers/crypto/hisilicon/qm_common.h        |   4 -
 include/linux/hisi_acc_qm.h                 |   8 +-
 7 files changed, 25 insertions(+), 127 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/ABI/testing/debugfs-hisi-hpre b/Documentation/ABI/testing/debugfs-hisi-hpre
index 82abf92df429..8e8de49c5cc6 100644
--- a/Documentation/ABI/testing/debugfs-hisi-hpre
+++ b/Documentation/ABI/testing/debugfs-hisi-hpre
@@ -101,7 +101,7 @@ What:		/sys/kernel/debug/hisi_hpre/<bdf>/qm/status
 Date:		Apr 2020
 Contact:	linux-crypto@vger.kernel.org
 Description:	Dump the status of the QM.
-		Four states: initiated, started, stopped and closed.
+		Two states: work, stop.
 		Available for both PF and VF, and take no other effect on HPRE.
 
 What:		/sys/kernel/debug/hisi_hpre/<bdf>/qm/diff_regs
diff --git a/Documentation/ABI/testing/debugfs-hisi-sec b/Documentation/ABI/testing/debugfs-hisi-sec
index 93c530d1bf0f..deeefe2c735e 100644
--- a/Documentation/ABI/testing/debugfs-hisi-sec
+++ b/Documentation/ABI/testing/debugfs-hisi-sec
@@ -81,7 +81,7 @@ What:		/sys/kernel/debug/hisi_sec2/<bdf>/qm/status
 Date:		Apr 2020
 Contact:	linux-crypto@vger.kernel.org
 Description:	Dump the status of the QM.
-		Four states: initiated, started, stopped and closed.
+		Two states: work, stop.
 		Available for both PF and VF, and take no other effect on SEC.
 
 What:		/sys/kernel/debug/hisi_sec2/<bdf>/qm/diff_regs
diff --git a/Documentation/ABI/testing/debugfs-hisi-zip b/Documentation/ABI/testing/debugfs-hisi-zip
index fd3f314cf8d1..593714afaed2 100644
--- a/Documentation/ABI/testing/debugfs-hisi-zip
+++ b/Documentation/ABI/testing/debugfs-hisi-zip
@@ -94,7 +94,7 @@ What:		/sys/kernel/debug/hisi_zip/<bdf>/qm/status
 Date:		Apr 2020
 Contact:	linux-crypto@vger.kernel.org
 Description:	Dump the status of the QM.
-		Four states: initiated, started, stopped and closed.
+		Two states: work, stop.
 		Available for both PF and VF, and take no other effect on ZIP.
 
 What:		/sys/kernel/debug/hisi_zip/<bdf>/qm/diff_regs
diff --git a/drivers/crypto/hisilicon/debugfs.c b/drivers/crypto/hisilicon/debugfs.c
index 415139b4abc1..80ed4b2d209c 100644
--- a/drivers/crypto/hisilicon/debugfs.c
+++ b/drivers/crypto/hisilicon/debugfs.c
@@ -31,6 +31,10 @@ static const char * const qm_debug_file_name[] = {
 	[CLEAR_ENABLE] = "clear_enable",
 };
 
+static const char * const qm_s[] = {
+	"work", "stop",
+};
+
 struct qm_dfx_item {
 	const char *name;
 	u32 offset;
diff --git a/drivers/crypto/hisilicon/qm.c b/drivers/crypto/hisilicon/qm.c
index 4d91a249be74..7d22412774ba 100644
--- a/drivers/crypto/hisilicon/qm.c
+++ b/drivers/crypto/hisilicon/qm.c
@@ -402,10 +402,6 @@ static const char * const qm_fifo_overflow[] = {
 	"cq", "eq", "aeq",
 };
 
-static const char * const qp_s[] = {
-	"none", "init", "start", "stop", "close",
-};
-
 struct qm_typical_qos_table {
 	u32 start;
 	u32 end;
@@ -433,85 +429,6 @@ static struct qm_typical_qos_table shaper_cbs_s[] = {
 
 static void qm_irqs_unregister(struct hisi_qm *qm);
 
-static bool qm_avail_state(struct hisi_qm *qm, enum qm_state new)
-{
-	enum qm_state curr = atomic_read(&qm->status.flags);
-	bool avail = false;
-
-	switch (curr) {
-	case QM_INIT:
-		if (new == QM_START || new == QM_CLOSE)
-			avail = true;
-		break;
-	case QM_START:
-		if (new == QM_STOP)
-			avail = true;
-		break;
-	case QM_STOP:
-		if (new == QM_CLOSE || new == QM_START)
-			avail = true;
-		break;
-	default:
-		break;
-	}
-
-	dev_dbg(&qm->pdev->dev, "change qm state from %s to %s\n",
-		qm_s[curr], qm_s[new]);
-
-	if (!avail)
-		dev_warn(&qm->pdev->dev, "Can not change qm state from %s to %s\n",
-			 qm_s[curr], qm_s[new]);
-
-	return avail;
-}
-
-static bool qm_qp_avail_state(struct hisi_qm *qm, struct hisi_qp *qp,
-			      enum qp_state new)
-{
-	enum qm_state qm_curr = atomic_read(&qm->status.flags);
-	enum qp_state qp_curr = 0;
-	bool avail = false;
-
-	if (qp)
-		qp_curr = atomic_read(&qp->qp_status.flags);
-
-	switch (new) {
-	case QP_INIT:
-		if (qm_curr == QM_START || qm_curr == QM_INIT)
-			avail = true;
-		break;
-	case QP_START:
-		if ((qm_curr == QM_START && qp_curr == QP_INIT) ||
-		    (qm_curr == QM_START && qp_curr == QP_STOP))
-			avail = true;
-		break;
-	case QP_STOP:
-		if ((qm_curr == QM_START && qp_curr == QP_START) ||
-		    (qp_curr == QP_INIT))
-			avail = true;
-		break;
-	case QP_CLOSE:
-		if ((qm_curr == QM_START && qp_curr == QP_INIT) ||
-		    (qm_curr == QM_START && qp_curr == QP_STOP) ||
-		    (qm_curr == QM_STOP && qp_curr == QP_STOP)  ||
-		    (qm_curr == QM_STOP && qp_curr == QP_INIT))
-			avail = true;
-		break;
-	default:
-		break;
-	}
-
-	dev_dbg(&qm->pdev->dev, "change qp state from %s to %s in QM %s\n",
-		qp_s[qp_curr], qp_s[new], qm_s[qm_curr]);
-
-	if (!avail)
-		dev_warn(&qm->pdev->dev,
-			 "Can not change qp state from %s to %s in QM %s\n",
-			 qp_s[qp_curr], qp_s[new], qm_s[qm_curr]);
-
-	return avail;
-}
-
 static u32 qm_get_hw_error_status(struct hisi_qm *qm)
 {
 	return readl(qm->io_base + QM_ABNORMAL_INT_STATUS);
@@ -1853,8 +1770,10 @@ static struct hisi_qp *qm_create_qp_nolock(struct hisi_qm *qm, u8 alg_type)
 	struct hisi_qp *qp;
 	int qp_id;
 
-	if (!qm_qp_avail_state(qm, NULL, QP_INIT))
+	if (atomic_read(&qm->status.flags) == QM_STOP) {
+		dev_info_ratelimited(dev, "failed to create qp as qm is stop!\n");
 		return ERR_PTR(-EPERM);
+	}
 
 	if (qm->qp_in_used == qm->qp_num) {
 		dev_info_ratelimited(dev, "All %u queues of QM are busy!\n",
@@ -1881,7 +1800,6 @@ static struct hisi_qp *qm_create_qp_nolock(struct hisi_qm *qm, u8 alg_type)
 	qp->alg_type = alg_type;
 	qp->is_in_kernel = true;
 	qm->qp_in_used++;
-	atomic_set(&qp->qp_status.flags, QP_INIT);
 
 	return qp;
 }
@@ -1924,11 +1842,6 @@ static void hisi_qm_release_qp(struct hisi_qp *qp)
 
 	down_write(&qm->qps_lock);
 
-	if (!qm_qp_avail_state(qm, qp, QP_CLOSE)) {
-		up_write(&qm->qps_lock);
-		return;
-	}
-
 	qm->qp_in_used--;
 	idr_remove(&qm->qp_idr, qp->qp_id);
 
@@ -2008,8 +1921,10 @@ static int qm_start_qp_nolock(struct hisi_qp *qp, unsigned long arg)
 	u32 pasid = arg;
 	int ret;
 
-	if (!qm_qp_avail_state(qm, qp, QP_START))
+	if (atomic_read(&qm->status.flags) == QM_STOP) {
+		dev_info_ratelimited(dev, "failed to start qp as qm is stop!\n");
 		return -EPERM;
+	}
 
 	ret = qm_qp_ctx_cfg(qp, qp_id, pasid);
 	if (ret)
@@ -2131,21 +2046,17 @@ static int qm_stop_qp_nolock(struct hisi_qp *qp)
 	 * is_resetting flag should be set negative so that this qp will not
 	 * be restarted after reset.
 	 */
-	if (atomic_read(&qp->qp_status.flags) == QP_STOP) {
+	if (atomic_read(&qp->qp_status.flags) != QP_START) {
 		qp->is_resetting = false;
 		return 0;
 	}
 
-	if (!qm_qp_avail_state(qp->qm, qp, QP_STOP))
-		return -EPERM;
-
 	atomic_set(&qp->qp_status.flags, QP_STOP);
 
 	ret = qm_drain_qp(qp);
 	if (ret)
 		dev_err(dev, "Failed to drain out data for stopping!\n");
 
-
 	flush_workqueue(qp->qm->wq);
 	if (unlikely(qp->is_resetting && atomic_read(&qp->qp_status.used)))
 		qp_stop_fail_cb(qp);
@@ -2865,13 +2776,8 @@ void hisi_qm_uninit(struct hisi_qm *qm)
 {
 	qm_cmd_uninit(qm);
 	hisi_qm_unint_work(qm);
-	down_write(&qm->qps_lock);
-
-	if (!qm_avail_state(qm, QM_CLOSE)) {
-		up_write(&qm->qps_lock);
-		return;
-	}
 
+	down_write(&qm->qps_lock);
 	hisi_qm_memory_uninit(qm);
 	hisi_qm_set_state(qm, QM_NOT_READY);
 	up_write(&qm->qps_lock);
@@ -3045,11 +2951,6 @@ int hisi_qm_start(struct hisi_qm *qm)
 
 	down_write(&qm->qps_lock);
 
-	if (!qm_avail_state(qm, QM_START)) {
-		up_write(&qm->qps_lock);
-		return -EPERM;
-	}
-
 	dev_dbg(dev, "qm start with %u queue pairs\n", qm->qp_num);
 
 	if (!qm->qp_num) {
@@ -3059,10 +2960,12 @@ int hisi_qm_start(struct hisi_qm *qm)
 	}
 
 	ret = __hisi_qm_start(qm);
-	if (!ret)
-		atomic_set(&qm->status.flags, QM_START);
+	if (ret)
+		goto err_unlock;
 
+	atomic_set(&qm->status.flags, QM_WORK);
 	hisi_qm_set_state(qm, QM_READY);
+
 err_unlock:
 	up_write(&qm->qps_lock);
 	return ret;
@@ -3159,10 +3062,11 @@ int hisi_qm_stop(struct hisi_qm *qm, enum qm_stop_reason r)
 	down_write(&qm->qps_lock);
 
 	qm->status.stop_reason = r;
-	if (!qm_avail_state(qm, QM_STOP)) {
-		ret = -EPERM;
+	if (atomic_read(&qm->status.flags) == QM_STOP)
 		goto err_unlock;
-	}
+
+	/* Stop all the request sending at first. */
+	atomic_set(&qm->status.flags, QM_STOP);
 
 	if (qm->status.stop_reason == QM_SOFT_RESET ||
 	    qm->status.stop_reason == QM_DOWN) {
@@ -3186,7 +3090,6 @@ int hisi_qm_stop(struct hisi_qm *qm, enum qm_stop_reason r)
 	}
 
 	qm_clear_queues(qm);
-	atomic_set(&qm->status.flags, QM_STOP);
 
 err_unlock:
 	up_write(&qm->qps_lock);
@@ -5350,7 +5253,6 @@ int hisi_qm_init(struct hisi_qm *qm)
 		goto err_free_qm_memory;
 
 	qm_cmd_init(qm);
-	atomic_set(&qm->status.flags, QM_INIT);
 
 	return 0;
 
diff --git a/drivers/crypto/hisilicon/qm_common.h b/drivers/crypto/hisilicon/qm_common.h
index 7b0b15c83ec1..0760bf55f13e 100644
--- a/drivers/crypto/hisilicon/qm_common.h
+++ b/drivers/crypto/hisilicon/qm_common.h
@@ -72,10 +72,6 @@ struct qm_aeqc {
 	__le32 dw6;
 };
 
-static const char * const qm_s[] = {
-	"init", "start", "close", "stop",
-};
-
 int qm_set_and_get_xqc(struct hisi_qm *qm, u8 cmd, void *xqc, u32 qp_id, bool op);
 void hisi_qm_show_last_dfx_regs(struct hisi_qm *qm);
 void hisi_qm_set_algqos_init(struct hisi_qm *qm);
diff --git a/include/linux/hisi_acc_qm.h b/include/linux/hisi_acc_qm.h
index ddc7ebb70523..e3c0a1297b2c 100644
--- a/include/linux/hisi_acc_qm.h
+++ b/include/linux/hisi_acc_qm.h
@@ -108,17 +108,13 @@ enum qm_stop_reason {
 };
 
 enum qm_state {
-	QM_INIT = 0,
-	QM_START,
-	QM_CLOSE,
+	QM_WORK = 0,
 	QM_STOP,
 };
 
 enum qp_state {
-	QP_INIT = 1,
-	QP_START,
+	QP_START = 1,
 	QP_STOP,
-	QP_CLOSE,
 };
 
 enum qm_hw_ver {
-- 
cgit v1.2.3


From 488e8f685207e0758398963d6834f81e5e61c162 Mon Sep 17 00:00:00 2001
From: Amir Goldstein <amir73il@gmail.com>
Date: Thu, 30 Nov 2023 16:16:22 +0200
Subject: fs: fork splice_file_range() from do_splice_direct()

In preparation of calling do_splice_direct() without file_start_write()
held, create a new helper splice_file_range(), to be called from context
of ->copy_file_range() methods instead of do_splice_direct().

Currently, the only difference is that splice_file_range() does not take
flags argument and that it asserts that file_start_write() is held, but
we factor out a common helper do_splice_direct_actor() that will be used
later.

Use the new helper from __ceph_copy_file_range(), that was incorrectly
passing to do_splice_direct() the copy flags argument as splice flags.
The value of copy flags in ceph is always 0, so it is a smenatic bug fix.

Move the declaration of both helpers to linux/splice.h.

Reviewed-by: Jan Kara <jack@suse.cz>
Signed-off-by: Amir Goldstein <amir73il@gmail.com>
Link: https://lore.kernel.org/r/20231130141624.3338942-2-amir73il@gmail.com
Acked-by: Jeff Layton <jlayton@kernel.org>
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 fs/ceph/file.c         |  9 ++++---
 fs/read_write.c        |  6 ++---
 fs/splice.c            | 71 ++++++++++++++++++++++++++++++++++++--------------
 include/linux/fs.h     |  2 --
 include/linux/splice.h | 13 +++++----
 5 files changed, 66 insertions(+), 35 deletions(-)

(limited to 'include/linux')

diff --git a/fs/ceph/file.c b/fs/ceph/file.c
index 3b5aae29e944..f11de6e1f1c1 100644
--- a/fs/ceph/file.c
+++ b/fs/ceph/file.c
@@ -12,6 +12,7 @@
 #include <linux/falloc.h>
 #include <linux/iversion.h>
 #include <linux/ktime.h>
+#include <linux/splice.h>
 
 #include "super.h"
 #include "mds_client.h"
@@ -3010,8 +3011,8 @@ static ssize_t __ceph_copy_file_range(struct file *src_file, loff_t src_off,
 		 * {read,write}_iter, which will get caps again.
 		 */
 		put_rd_wr_caps(src_ci, src_got, dst_ci, dst_got);
-		ret = do_splice_direct(src_file, &src_off, dst_file,
-				       &dst_off, src_objlen, flags);
+		ret = splice_file_range(src_file, &src_off, dst_file, &dst_off,
+					src_objlen);
 		/* Abort on short copies or on error */
 		if (ret < (long)src_objlen) {
 			doutc(cl, "Failed partial copy (%zd)\n", ret);
@@ -3065,8 +3066,8 @@ out_caps:
 	 */
 	if (len && (len < src_ci->i_layout.object_size)) {
 		doutc(cl, "Final partial copy of %zu bytes\n", len);
-		bytes = do_splice_direct(src_file, &src_off, dst_file,
-					 &dst_off, len, flags);
+		bytes = splice_file_range(src_file, &src_off, dst_file,
+					  &dst_off, len);
 		if (bytes > 0)
 			ret += bytes;
 		else
diff --git a/fs/read_write.c b/fs/read_write.c
index f791555fa246..642c7ce1ced1 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -1423,10 +1423,8 @@ ssize_t generic_copy_file_range(struct file *file_in, loff_t pos_in,
 				struct file *file_out, loff_t pos_out,
 				size_t len, unsigned int flags)
 {
-	lockdep_assert(file_write_started(file_out));
-
-	return do_splice_direct(file_in, &pos_in, file_out, &pos_out,
-				len > MAX_RW_COUNT ? MAX_RW_COUNT : len, 0);
+	return splice_file_range(file_in, &pos_in, file_out, &pos_out,
+				 min_t(size_t, len, MAX_RW_COUNT));
 }
 EXPORT_SYMBOL(generic_copy_file_range);
 
diff --git a/fs/splice.c b/fs/splice.c
index 3fce5f6072dd..9007b2c8baa8 100644
--- a/fs/splice.c
+++ b/fs/splice.c
@@ -1170,25 +1170,10 @@ static void direct_file_splice_eof(struct splice_desc *sd)
 		file->f_op->splice_eof(file);
 }
 
-/**
- * do_splice_direct - splices data directly between two files
- * @in:		file to splice from
- * @ppos:	input file offset
- * @out:	file to splice to
- * @opos:	output file offset
- * @len:	number of bytes to splice
- * @flags:	splice modifier flags
- *
- * Description:
- *    For use by do_sendfile(). splice can easily emulate sendfile, but
- *    doing it in the application would incur an extra system call
- *    (splice in + splice out, as compared to just sendfile()). So this helper
- *    can splice directly through a process-private pipe.
- *
- * Callers already called rw_verify_area() on the entire range.
- */
-long do_splice_direct(struct file *in, loff_t *ppos, struct file *out,
-		      loff_t *opos, size_t len, unsigned int flags)
+static long do_splice_direct_actor(struct file *in, loff_t *ppos,
+				   struct file *out, loff_t *opos,
+				   size_t len, unsigned int flags,
+				   splice_direct_actor *actor)
 {
 	struct splice_desc sd = {
 		.len		= len,
@@ -1207,14 +1192,60 @@ long do_splice_direct(struct file *in, loff_t *ppos, struct file *out,
 	if (unlikely(out->f_flags & O_APPEND))
 		return -EINVAL;
 
-	ret = splice_direct_to_actor(in, &sd, direct_splice_actor);
+	ret = splice_direct_to_actor(in, &sd, actor);
 	if (ret > 0)
 		*ppos = sd.pos;
 
 	return ret;
 }
+/**
+ * do_splice_direct - splices data directly between two files
+ * @in:		file to splice from
+ * @ppos:	input file offset
+ * @out:	file to splice to
+ * @opos:	output file offset
+ * @len:	number of bytes to splice
+ * @flags:	splice modifier flags
+ *
+ * Description:
+ *    For use by do_sendfile(). splice can easily emulate sendfile, but
+ *    doing it in the application would incur an extra system call
+ *    (splice in + splice out, as compared to just sendfile()). So this helper
+ *    can splice directly through a process-private pipe.
+ *
+ * Callers already called rw_verify_area() on the entire range.
+ */
+long do_splice_direct(struct file *in, loff_t *ppos, struct file *out,
+		      loff_t *opos, size_t len, unsigned int flags)
+{
+	return do_splice_direct_actor(in, ppos, out, opos, len, flags,
+				      direct_splice_actor);
+}
 EXPORT_SYMBOL(do_splice_direct);
 
+/**
+ * splice_file_range - splices data between two files for copy_file_range()
+ * @in:		file to splice from
+ * @ppos:	input file offset
+ * @out:	file to splice to
+ * @opos:	output file offset
+ * @len:	number of bytes to splice
+ *
+ * Description:
+ *    For use by generic_copy_file_range() and ->copy_file_range() methods.
+ *
+ * Callers already called rw_verify_area() on the entire range.
+ */
+long splice_file_range(struct file *in, loff_t *ppos, struct file *out,
+		       loff_t *opos, size_t len)
+{
+	lockdep_assert(file_write_started(out));
+
+	return do_splice_direct_actor(in, ppos, out, opos, len, 0,
+				      direct_splice_actor);
+}
+EXPORT_SYMBOL(splice_file_range);
+
 static int wait_for_space(struct pipe_inode_info *pipe, unsigned flags)
 {
 	for (;;) {
diff --git a/include/linux/fs.h b/include/linux/fs.h
index ae0e2fb7bcea..04422a0eccdd 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -3052,8 +3052,6 @@ ssize_t copy_splice_read(struct file *in, loff_t *ppos,
 			 size_t len, unsigned int flags);
 extern ssize_t iter_file_splice_write(struct pipe_inode_info *,
 		struct file *, loff_t *, size_t, unsigned int);
-extern long do_splice_direct(struct file *in, loff_t *ppos, struct file *out,
-		loff_t *opos, size_t len, unsigned int flags);
 
 
 extern void
diff --git a/include/linux/splice.h b/include/linux/splice.h
index 6c461573434d..49532d5dda52 100644
--- a/include/linux/splice.h
+++ b/include/linux/splice.h
@@ -80,11 +80,14 @@ extern ssize_t add_to_pipe(struct pipe_inode_info *,
 long vfs_splice_read(struct file *in, loff_t *ppos,
 		     struct pipe_inode_info *pipe, size_t len,
 		     unsigned int flags);
-extern ssize_t splice_direct_to_actor(struct file *, struct splice_desc *,
-				      splice_direct_actor *);
-extern long do_splice(struct file *in, loff_t *off_in,
-		      struct file *out, loff_t *off_out,
-		      size_t len, unsigned int flags);
+ssize_t splice_direct_to_actor(struct file *file, struct splice_desc *sd,
+			       splice_direct_actor *actor);
+long do_splice(struct file *in, loff_t *off_in, struct file *out,
+	       loff_t *off_out, size_t len, unsigned int flags);
+long do_splice_direct(struct file *in, loff_t *ppos, struct file *out,
+		      loff_t *opos, size_t len, unsigned int flags);
+long splice_file_range(struct file *in, loff_t *ppos, struct file *out,
+		       loff_t *opos, size_t len);
 
 extern long do_tee(struct file *in, struct file *out, size_t len,
 		   unsigned int flags);
-- 
cgit v1.2.3


From 77070eeb882124614a40616f01bfe60947be5778 Mon Sep 17 00:00:00 2001
From: Waiman Long <longman@redhat.com>
Date: Thu, 30 Nov 2023 15:43:27 -0500
Subject: cgroup: Avoid false cacheline sharing of read mostly rstat_cpu

The rstat_cpu and also rstat_css_list of the cgroup structure are read
mostly variables. However, they may share the same cacheline as the
subsequent rstat_flush_next and *bstat variables which can be updated
frequently.  That will slow down the cgroup_rstat_cpu() call which is
called pretty frequently in the rstat code. Add a CACHELINE_PADDING()
line in between them to avoid false cacheline sharing.

A parallel kernel build on a 2-socket x86-64 server is used as the
benchmarking tool for measuring the lock hold time. Below were the lock
hold time frequency distribution before and after the patch:

      Run time        Before patch       After patch
      --------        ------------       -----------
       0-01 us         9,928,562          9,820,428
      01-05 us           110,151             50,935
      05-10 us               270                 93
      10-15 us               273                146
      15-20 us               135                 76
      20-25 us                 0                  2
      25-30 us                 1                  0

It can be seen that the patch further pushes the lock hold time towards
the lower end.

Signed-off-by: Waiman Long <longman@redhat.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
---
 include/linux/cgroup-defs.h | 7 +++++++
 1 file changed, 7 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h
index 37518436cfe7..5a97ea95b564 100644
--- a/include/linux/cgroup-defs.h
+++ b/include/linux/cgroup-defs.h
@@ -496,6 +496,13 @@ struct cgroup {
 	struct cgroup_rstat_cpu __percpu *rstat_cpu;
 	struct list_head rstat_css_list;
 
+	/*
+	 * Add padding to separate the read mostly rstat_cpu and
+	 * rstat_css_list into a different cacheline from the following
+	 * rstat_flush_next and *bstat fields which can have frequent updates.
+	 */
+	CACHELINE_PADDING(_pad_);
+
 	/*
 	 * A singly-linked list of cgroup structures to be rstat flushed.
 	 * This is a scratch field to be used exclusively by
-- 
cgit v1.2.3


From aabf7c37dfbce3e5fe24f0c86a34bc8f2f63cee8 Mon Sep 17 00:00:00 2001
From: Stephen Boyd <swboyd@chromium.org>
Date: Wed, 29 Nov 2023 13:44:04 -0800
Subject: lkdtm: Add kfence read after free crash type

Add the ability to allocate memory from kfence and trigger a read after
free on that memory to validate that kfence is working properly. This is
used by ChromeOS integration tests to validate that kfence errors can be
collected on user devices and parsed properly.

Cc: Alexander Potapenko <glider@google.com>
Acked-by: Marco Elver <elver@google.com>
Cc: Dmitry Vyukov <dvyukov@google.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: kasan-dev@googlegroups.com
Signed-off-by: Stephen Boyd <swboyd@chromium.org>
Link: https://lore.kernel.org/r/20231129214413.3156334-1-swboyd@chromium.org
Signed-off-by: Kees Cook <keescook@chromium.org>
---
 drivers/misc/lkdtm/heap.c | 60 +++++++++++++++++++++++++++++++++++++++++++++++
 include/linux/kfence.h    |  2 ++
 2 files changed, 62 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/misc/lkdtm/heap.c b/drivers/misc/lkdtm/heap.c
index 0ce4cbf6abda..4f467d3972a6 100644
--- a/drivers/misc/lkdtm/heap.c
+++ b/drivers/misc/lkdtm/heap.c
@@ -4,6 +4,7 @@
  * page allocation and slab allocations.
  */
 #include "lkdtm.h"
+#include <linux/kfence.h>
 #include <linux/slab.h>
 #include <linux/vmalloc.h>
 #include <linux/sched.h>
@@ -132,6 +133,64 @@ static void lkdtm_READ_AFTER_FREE(void)
 	kfree(val);
 }
 
+static void lkdtm_KFENCE_READ_AFTER_FREE(void)
+{
+	int *base, val, saw;
+	unsigned long timeout, resched_after;
+	size_t len = 1024;
+	/*
+	 * The slub allocator will use the either the first word or
+	 * the middle of the allocation to store the free pointer,
+	 * depending on configurations. Store in the second word to
+	 * avoid running into the freelist.
+	 */
+	size_t offset = sizeof(*base);
+
+	/*
+	 * 100x the sample interval should be more than enough to ensure we get
+	 * a KFENCE allocation eventually.
+	 */
+	timeout = jiffies + msecs_to_jiffies(100 * kfence_sample_interval);
+	/*
+	 * Especially for non-preemption kernels, ensure the allocation-gate
+	 * timer can catch up: after @resched_after, every failed allocation
+	 * attempt yields, to ensure the allocation-gate timer is scheduled.
+	 */
+	resched_after = jiffies + msecs_to_jiffies(kfence_sample_interval);
+	do {
+		base = kmalloc(len, GFP_KERNEL);
+		if (!base) {
+			pr_err("FAIL: Unable to allocate kfence memory!\n");
+			return;
+		}
+
+		if (is_kfence_address(base)) {
+			val = 0x12345678;
+			base[offset] = val;
+			pr_info("Value in memory before free: %x\n", base[offset]);
+
+			kfree(base);
+
+			pr_info("Attempting bad read from freed memory\n");
+			saw = base[offset];
+			if (saw != val) {
+				/* Good! Poisoning happened, so declare a win. */
+				pr_info("Memory correctly poisoned (%x)\n", saw);
+			} else {
+				pr_err("FAIL: Memory was not poisoned!\n");
+				pr_expected_config_param(CONFIG_INIT_ON_FREE_DEFAULT_ON, "init_on_free");
+			}
+			return;
+		}
+
+		kfree(base);
+		if (time_after(jiffies, resched_after))
+			cond_resched();
+	} while (time_before(jiffies, timeout));
+
+	pr_err("FAIL: kfence memory never allocated!\n");
+}
+
 static void lkdtm_WRITE_BUDDY_AFTER_FREE(void)
 {
 	unsigned long p = __get_free_page(GFP_KERNEL);
@@ -327,6 +386,7 @@ static struct crashtype crashtypes[] = {
 	CRASHTYPE(VMALLOC_LINEAR_OVERFLOW),
 	CRASHTYPE(WRITE_AFTER_FREE),
 	CRASHTYPE(READ_AFTER_FREE),
+	CRASHTYPE(KFENCE_READ_AFTER_FREE),
 	CRASHTYPE(WRITE_BUDDY_AFTER_FREE),
 	CRASHTYPE(READ_BUDDY_AFTER_FREE),
 	CRASHTYPE(SLAB_INIT_ON_ALLOC),
diff --git a/include/linux/kfence.h b/include/linux/kfence.h
index 401af4757514..88100cc9caba 100644
--- a/include/linux/kfence.h
+++ b/include/linux/kfence.h
@@ -223,6 +223,8 @@ bool __kfence_obj_info(struct kmem_obj_info *kpp, void *object, struct slab *sla
 
 #else /* CONFIG_KFENCE */
 
+#define kfence_sample_interval	(0)
+
 static inline bool is_kfence_address(const void *addr) { return false; }
 static inline void kfence_alloc_pool_and_metadata(void) { }
 static inline void kfence_init(void) { }
-- 
cgit v1.2.3


From 12cd3cd8c797e07afcc47bc4afa760e4ec75e9d7 Mon Sep 17 00:00:00 2001
From: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Date: Mon, 20 Nov 2023 17:11:42 +0200
Subject: params: Introduce the param_unknown_fn type

Introduce a new type for the callback to parse an unknown argument.
This unifies function prototypes which takes that as a parameter.

Reviewed-by: Luis Chamberlain <mcgrof@kernel.org>
Reviewed-by: Kees Cook <keescook@chromium.org>
Signed-off-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Link: https://lore.kernel.org/r/20231120151419.1661807-2-andriy.shevchenko@linux.intel.com
Signed-off-by: Kees Cook <keescook@chromium.org>
---
 include/linux/moduleparam.h | 6 +++---
 kernel/params.c             | 8 ++------
 2 files changed, 5 insertions(+), 9 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/moduleparam.h b/include/linux/moduleparam.h
index 4fa9726bc328..bfb85fd13e1f 100644
--- a/include/linux/moduleparam.h
+++ b/include/linux/moduleparam.h
@@ -385,6 +385,8 @@ extern bool parameq(const char *name1, const char *name2);
  */
 extern bool parameqn(const char *name1, const char *name2, size_t n);
 
+typedef int (*parse_unknown_fn)(char *param, char *val, const char *doing, void *arg);
+
 /* Called on module insert or kernel boot */
 extern char *parse_args(const char *name,
 		      char *args,
@@ -392,9 +394,7 @@ extern char *parse_args(const char *name,
 		      unsigned num,
 		      s16 level_min,
 		      s16 level_max,
-		      void *arg,
-		      int (*unknown)(char *param, char *val,
-				     const char *doing, void *arg));
+		      void *arg, parse_unknown_fn unknown);
 
 /* Called by module remove. */
 #ifdef CONFIG_SYSFS
diff --git a/kernel/params.c b/kernel/params.c
index 2d4a0564697e..626fa8265932 100644
--- a/kernel/params.c
+++ b/kernel/params.c
@@ -120,9 +120,7 @@ static int parse_one(char *param,
 		     unsigned num_params,
 		     s16 min_level,
 		     s16 max_level,
-		     void *arg,
-		     int (*handle_unknown)(char *param, char *val,
-				     const char *doing, void *arg))
+		     void *arg, parse_unknown_fn handle_unknown)
 {
 	unsigned int i;
 	int err;
@@ -165,9 +163,7 @@ char *parse_args(const char *doing,
 		 unsigned num,
 		 s16 min_level,
 		 s16 max_level,
-		 void *arg,
-		 int (*unknown)(char *param, char *val,
-				const char *doing, void *arg))
+		 void *arg, parse_unknown_fn unknown)
 {
 	char *param, *val, *err = NULL;
 
-- 
cgit v1.2.3


From 492c5d455969fc2e829f26ed4c83487b068f0dd7 Mon Sep 17 00:00:00 2001
From: Keith Busch <kbusch@kernel.org>
Date: Thu, 30 Nov 2023 13:53:06 -0800
Subject: block: bio-integrity: directly map user buffers

Passthrough commands that utilize metadata currently need to bounce the
user space buffer through the kernel. Add support for mapping user space
directly so that we can avoid this costly overhead. This is similar to
how the normal bio data payload utilizes user addresses with
bio_map_user_iov().

If the user address can't directly be used for reason, like too many
segments or address unalignement, fallback to a copy of the user vec
while keeping the user address pinned for the IO duration so that it
can safely be copied on completion in any process context.

Signed-off-by: Keith Busch <kbusch@kernel.org>
Link: https://lore.kernel.org/r/20231130215309.2923568-2-kbusch@meta.com
[axboe: fold in fix from Kanchan Joshi]
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/bio-integrity.c | 218 +++++++++++++++++++++++++++++++++++++++++++++++++-
 include/linux/bio.h   |   9 +++
 2 files changed, 225 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/block/bio-integrity.c b/block/bio-integrity.c
index ec8ac8cf6e1b..feef615e2c9c 100644
--- a/block/bio-integrity.c
+++ b/block/bio-integrity.c
@@ -69,15 +69,15 @@ struct bio_integrity_payload *bio_integrity_alloc(struct bio *bio,
 
 	memset(bip, 0, sizeof(*bip));
 
+	/* always report as many vecs as asked explicitly, not inline vecs */
+	bip->bip_max_vcnt = nr_vecs;
 	if (nr_vecs > inline_vecs) {
-		bip->bip_max_vcnt = nr_vecs;
 		bip->bip_vec = bvec_alloc(&bs->bvec_integrity_pool,
 					  &bip->bip_max_vcnt, gfp_mask);
 		if (!bip->bip_vec)
 			goto err;
 	} else {
 		bip->bip_vec = bip->bip_inline_vecs;
-		bip->bip_max_vcnt = inline_vecs;
 	}
 
 	bip->bip_bio = bio;
@@ -91,6 +91,47 @@ err:
 }
 EXPORT_SYMBOL(bio_integrity_alloc);
 
+static void bio_integrity_unpin_bvec(struct bio_vec *bv, int nr_vecs,
+				     bool dirty)
+{
+	int i;
+
+	for (i = 0; i < nr_vecs; i++) {
+		if (dirty && !PageCompound(bv[i].bv_page))
+			set_page_dirty_lock(bv[i].bv_page);
+		unpin_user_page(bv[i].bv_page);
+	}
+}
+
+static void bio_integrity_uncopy_user(struct bio_integrity_payload *bip)
+{
+	unsigned short nr_vecs = bip->bip_max_vcnt - 1;
+	struct bio_vec *copy = &bip->bip_vec[1];
+	size_t bytes = bip->bip_iter.bi_size;
+	struct iov_iter iter;
+	int ret;
+
+	iov_iter_bvec(&iter, ITER_DEST, copy, nr_vecs, bytes);
+	ret = copy_to_iter(bvec_virt(bip->bip_vec), bytes, &iter);
+	WARN_ON_ONCE(ret != bytes);
+
+	bio_integrity_unpin_bvec(copy, nr_vecs, true);
+}
+
+static void bio_integrity_unmap_user(struct bio_integrity_payload *bip)
+{
+	bool dirty = bio_data_dir(bip->bip_bio) == READ;
+
+	if (bip->bip_flags & BIP_COPY_USER) {
+		if (dirty)
+			bio_integrity_uncopy_user(bip);
+		kfree(bvec_virt(bip->bip_vec));
+		return;
+	}
+
+	bio_integrity_unpin_bvec(bip->bip_vec, bip->bip_max_vcnt, dirty);
+}
+
 /**
  * bio_integrity_free - Free bio integrity payload
  * @bio:	bio containing bip to be freed
@@ -105,6 +146,8 @@ void bio_integrity_free(struct bio *bio)
 
 	if (bip->bip_flags & BIP_BLOCK_INTEGRITY)
 		kfree(bvec_virt(bip->bip_vec));
+	else if (bip->bip_flags & BIP_INTEGRITY_USER)
+		bio_integrity_unmap_user(bip);
 
 	__bio_integrity_free(bs, bip);
 	bio->bi_integrity = NULL;
@@ -160,6 +203,177 @@ int bio_integrity_add_page(struct bio *bio, struct page *page,
 }
 EXPORT_SYMBOL(bio_integrity_add_page);
 
+static int bio_integrity_copy_user(struct bio *bio, struct bio_vec *bvec,
+				   int nr_vecs, unsigned int len,
+				   unsigned int direction, u32 seed)
+{
+	bool write = direction == ITER_SOURCE;
+	struct bio_integrity_payload *bip;
+	struct iov_iter iter;
+	void *buf;
+	int ret;
+
+	buf = kmalloc(len, GFP_KERNEL);
+	if (!buf)
+		return -ENOMEM;
+
+	if (write) {
+		iov_iter_bvec(&iter, direction, bvec, nr_vecs, len);
+		if (!copy_from_iter_full(buf, len, &iter)) {
+			ret = -EFAULT;
+			goto free_buf;
+		}
+
+		bip = bio_integrity_alloc(bio, GFP_KERNEL, 1);
+	} else {
+		memset(buf, 0, len);
+
+		/*
+		 * We need to preserve the original bvec and the number of vecs
+		 * in it for completion handling
+		 */
+		bip = bio_integrity_alloc(bio, GFP_KERNEL, nr_vecs + 1);
+	}
+
+	if (IS_ERR(bip)) {
+		ret = PTR_ERR(bip);
+		goto free_buf;
+	}
+
+	if (write)
+		bio_integrity_unpin_bvec(bvec, nr_vecs, false);
+	else
+		memcpy(&bip->bip_vec[1], bvec, nr_vecs * sizeof(*bvec));
+
+	ret = bio_integrity_add_page(bio, virt_to_page(buf), len,
+				     offset_in_page(buf));
+	if (ret != len) {
+		ret = -ENOMEM;
+		goto free_bip;
+	}
+
+	bip->bip_flags |= BIP_INTEGRITY_USER | BIP_COPY_USER;
+	bip->bip_iter.bi_sector = seed;
+	return 0;
+free_bip:
+	bio_integrity_free(bio);
+free_buf:
+	kfree(buf);
+	return ret;
+}
+
+static int bio_integrity_init_user(struct bio *bio, struct bio_vec *bvec,
+				   int nr_vecs, unsigned int len, u32 seed)
+{
+	struct bio_integrity_payload *bip;
+
+	bip = bio_integrity_alloc(bio, GFP_KERNEL, nr_vecs);
+	if (IS_ERR(bip))
+		return PTR_ERR(bip);
+
+	memcpy(bip->bip_vec, bvec, nr_vecs * sizeof(*bvec));
+	bip->bip_flags |= BIP_INTEGRITY_USER;
+	bip->bip_iter.bi_sector = seed;
+	bip->bip_iter.bi_size = len;
+	return 0;
+}
+
+static unsigned int bvec_from_pages(struct bio_vec *bvec, struct page **pages,
+				    int nr_vecs, ssize_t bytes, ssize_t offset)
+{
+	unsigned int nr_bvecs = 0;
+	int i, j;
+
+	for (i = 0; i < nr_vecs; i = j) {
+		size_t size = min_t(size_t, bytes, PAGE_SIZE - offset);
+		struct folio *folio = page_folio(pages[i]);
+
+		bytes -= size;
+		for (j = i + 1; j < nr_vecs; j++) {
+			size_t next = min_t(size_t, PAGE_SIZE, bytes);
+
+			if (page_folio(pages[j]) != folio ||
+			    pages[j] != pages[j - 1] + 1)
+				break;
+			unpin_user_page(pages[j]);
+			size += next;
+			bytes -= next;
+		}
+
+		bvec_set_page(&bvec[nr_bvecs], pages[i], size, offset);
+		offset = 0;
+		nr_bvecs++;
+	}
+
+	return nr_bvecs;
+}
+
+int bio_integrity_map_user(struct bio *bio, void __user *ubuf, ssize_t bytes,
+			   u32 seed)
+{
+	struct request_queue *q = bdev_get_queue(bio->bi_bdev);
+	unsigned int align = q->dma_pad_mask | queue_dma_alignment(q);
+	struct page *stack_pages[UIO_FASTIOV], **pages = stack_pages;
+	struct bio_vec stack_vec[UIO_FASTIOV], *bvec = stack_vec;
+	unsigned int direction, nr_bvecs;
+	struct iov_iter iter;
+	int ret, nr_vecs;
+	size_t offset;
+	bool copy;
+
+	if (bio_integrity(bio))
+		return -EINVAL;
+	if (bytes >> SECTOR_SHIFT > queue_max_hw_sectors(q))
+		return -E2BIG;
+
+	if (bio_data_dir(bio) == READ)
+		direction = ITER_DEST;
+	else
+		direction = ITER_SOURCE;
+
+	iov_iter_ubuf(&iter, direction, ubuf, bytes);
+	nr_vecs = iov_iter_npages(&iter, BIO_MAX_VECS + 1);
+	if (nr_vecs > BIO_MAX_VECS)
+		return -E2BIG;
+	if (nr_vecs > UIO_FASTIOV) {
+		bvec = kcalloc(sizeof(*bvec), nr_vecs, GFP_KERNEL);
+		if (!bvec)
+			return -ENOMEM;
+		pages = NULL;
+	}
+
+	copy = !iov_iter_is_aligned(&iter, align, align);
+	ret = iov_iter_extract_pages(&iter, &pages, bytes, nr_vecs, 0, &offset);
+	if (unlikely(ret < 0))
+		goto free_bvec;
+
+	nr_bvecs = bvec_from_pages(bvec, pages, nr_vecs, bytes, offset);
+	if (pages != stack_pages)
+		kvfree(pages);
+	if (nr_bvecs > queue_max_integrity_segments(q))
+		copy = true;
+
+	if (copy)
+		ret = bio_integrity_copy_user(bio, bvec, nr_bvecs, bytes,
+					      direction, seed);
+	else
+		ret = bio_integrity_init_user(bio, bvec, nr_bvecs, bytes, seed);
+	if (ret)
+		goto release_pages;
+	if (bvec != stack_vec)
+		kfree(bvec);
+
+	return 0;
+
+release_pages:
+	bio_integrity_unpin_bvec(bvec, nr_bvecs, false);
+free_bvec:
+	if (bvec != stack_vec)
+		kfree(bvec);
+	return ret;
+}
+EXPORT_SYMBOL_GPL(bio_integrity_map_user);
+
 /**
  * bio_integrity_process - Process integrity metadata for a bio
  * @bio:	bio to generate/verify integrity metadata for
diff --git a/include/linux/bio.h b/include/linux/bio.h
index 41d417ee1349..ec4db73e5f4e 100644
--- a/include/linux/bio.h
+++ b/include/linux/bio.h
@@ -324,6 +324,8 @@ enum bip_flags {
 	BIP_CTRL_NOCHECK	= 1 << 2, /* disable HBA integrity checking */
 	BIP_DISK_NOCHECK	= 1 << 3, /* disable disk integrity checking */
 	BIP_IP_CHECKSUM		= 1 << 4, /* IP checksum */
+	BIP_INTEGRITY_USER	= 1 << 5, /* Integrity payload is user address */
+	BIP_COPY_USER		= 1 << 6, /* Kernel bounce buffer in use */
 };
 
 /*
@@ -718,6 +720,7 @@ static inline bool bioset_initialized(struct bio_set *bs)
 	for_each_bio(_bio)						\
 		bip_for_each_vec(_bvl, _bio->bi_integrity, _iter)
 
+int bio_integrity_map_user(struct bio *bio, void __user *ubuf, ssize_t len, u32 seed);
 extern struct bio_integrity_payload *bio_integrity_alloc(struct bio *, gfp_t, unsigned int);
 extern int bio_integrity_add_page(struct bio *, struct page *, unsigned int, unsigned int);
 extern bool bio_integrity_prep(struct bio *);
@@ -789,6 +792,12 @@ static inline int bio_integrity_add_page(struct bio *bio, struct page *page,
 	return 0;
 }
 
+static inline int bio_integrity_map_user(struct bio *bio, void __user *ubuf,
+					 ssize_t len, u32 seed)
+{
+	return -EINVAL;
+}
+
 #endif /* CONFIG_BLK_DEV_INTEGRITY */
 
 /*
-- 
cgit v1.2.3


From e5da71f1e373f36c7506ffa9a60ef7ec6e84674d Mon Sep 17 00:00:00 2001
From: Keith Busch <kbusch@kernel.org>
Date: Thu, 30 Nov 2023 13:53:08 -0800
Subject: iouring: remove IORING_URING_CMD_POLLED

No more users of this flag.

Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Martin K. Petersen <martin.petersen@oracle.com>
Signed-off-by: Keith Busch <kbusch@kernel.org>
Link: https://lore.kernel.org/r/20231130215309.2923568-4-kbusch@meta.com
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/io_uring.h | 1 -
 1 file changed, 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/io_uring.h b/include/linux/io_uring.h
index aefb73eeeebf..fe23bf88f86f 100644
--- a/include/linux/io_uring.h
+++ b/include/linux/io_uring.h
@@ -28,7 +28,6 @@ enum io_uring_cmd_flags {
 
 /* only top 8 bits of sqe->uring_cmd_flags for kernel internal use */
 #define IORING_URING_CMD_CANCELABLE	(1U << 30)
-#define IORING_URING_CMD_POLLED		(1U << 31)
 
 struct io_uring_cmd {
 	struct file	*file;
-- 
cgit v1.2.3


From 8fadb86d4ced8b8349a3b227d6d66736ff150819 Mon Sep 17 00:00:00 2001
From: Keith Busch <kbusch@kernel.org>
Date: Thu, 30 Nov 2023 13:53:09 -0800
Subject: io_uring: remove uring_cmd cookie

No more users of this field.

Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Martin K. Petersen <martin.petersen@oracle.com>
Signed-off-by: Keith Busch <kbusch@kernel.org>
Link: https://lore.kernel.org/r/20231130215309.2923568-5-kbusch@meta.com
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/io_uring.h | 8 ++------
 io_uring/uring_cmd.c     | 1 -
 2 files changed, 2 insertions(+), 7 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/io_uring.h b/include/linux/io_uring.h
index fe23bf88f86f..9e6ce6d4ab51 100644
--- a/include/linux/io_uring.h
+++ b/include/linux/io_uring.h
@@ -32,12 +32,8 @@ enum io_uring_cmd_flags {
 struct io_uring_cmd {
 	struct file	*file;
 	const struct io_uring_sqe *sqe;
-	union {
-		/* callback to defer completions to task context */
-		void (*task_work_cb)(struct io_uring_cmd *cmd, unsigned);
-		/* used for polled completion */
-		void *cookie;
-	};
+	/* callback to defer completions to task context */
+	void (*task_work_cb)(struct io_uring_cmd *cmd, unsigned);
 	u32		cmd_op;
 	u32		flags;
 	u8		pdu[32]; /* available inline for free use */
diff --git a/io_uring/uring_cmd.c b/io_uring/uring_cmd.c
index acbc2924ecd2..b39ec25c36bc 100644
--- a/io_uring/uring_cmd.c
+++ b/io_uring/uring_cmd.c
@@ -182,7 +182,6 @@ int io_uring_cmd(struct io_kiocb *req, unsigned int issue_flags)
 			return -EOPNOTSUPP;
 		issue_flags |= IO_URING_F_IOPOLL;
 		req->iopoll_completed = 0;
-		WRITE_ONCE(ioucmd->cookie, NULL);
 	}
 
 	ret = file->f_op->uring_cmd(ioucmd, issue_flags);
-- 
cgit v1.2.3


From 45b5623f2d721c25d1a2fdc8c4600fb4b7b61c75 Mon Sep 17 00:00:00 2001
From: Andrii Nakryiko <andrii@kernel.org>
Date: Sat, 2 Dec 2023 09:56:55 -0800
Subject: bpf: rearrange bpf_func_state fields to save a bit of memory

It's a trivial rearrangement saving 8 bytes. We have 4 bytes of padding
at the end which can be filled with another field without increasing
struct bpf_func_state.

copy_func_state() logic remains correct without any further changes.

BEFORE
======
struct bpf_func_state {
        struct bpf_reg_state       regs[11];             /*     0  1320 */
        /* --- cacheline 20 boundary (1280 bytes) was 40 bytes ago --- */
        int                        callsite;             /*  1320     4 */
        u32                        frameno;              /*  1324     4 */
        u32                        subprogno;            /*  1328     4 */
        u32                        async_entry_cnt;      /*  1332     4 */
        bool                       in_callback_fn;       /*  1336     1 */

        /* XXX 7 bytes hole, try to pack */

        /* --- cacheline 21 boundary (1344 bytes) --- */
        struct tnum                callback_ret_range;   /*  1344    16 */
        bool                       in_async_callback_fn; /*  1360     1 */
        bool                       in_exception_callback_fn; /*  1361     1 */

        /* XXX 2 bytes hole, try to pack */

        int                        acquired_refs;        /*  1364     4 */
        struct bpf_reference_state * refs;               /*  1368     8 */
        int                        allocated_stack;      /*  1376     4 */

        /* XXX 4 bytes hole, try to pack */

        struct bpf_stack_state *   stack;                /*  1384     8 */

        /* size: 1392, cachelines: 22, members: 13 */
        /* sum members: 1379, holes: 3, sum holes: 13 */
        /* last cacheline: 48 bytes */
};

AFTER
=====
struct bpf_func_state {
        struct bpf_reg_state       regs[11];             /*     0  1320 */
        /* --- cacheline 20 boundary (1280 bytes) was 40 bytes ago --- */
        int                        callsite;             /*  1320     4 */
        u32                        frameno;              /*  1324     4 */
        u32                        subprogno;            /*  1328     4 */
        u32                        async_entry_cnt;      /*  1332     4 */
        struct tnum                callback_ret_range;   /*  1336    16 */
        /* --- cacheline 21 boundary (1344 bytes) was 8 bytes ago --- */
        bool                       in_callback_fn;       /*  1352     1 */
        bool                       in_async_callback_fn; /*  1353     1 */
        bool                       in_exception_callback_fn; /*  1354     1 */

        /* XXX 1 byte hole, try to pack */

        int                        acquired_refs;        /*  1356     4 */
        struct bpf_reference_state * refs;               /*  1360     8 */
        struct bpf_stack_state *   stack;                /*  1368     8 */
        int                        allocated_stack;      /*  1376     4 */

        /* size: 1384, cachelines: 22, members: 13 */
        /* sum members: 1379, holes: 1, sum holes: 1 */
        /* padding: 4 */
        /* last cacheline: 40 bytes */
};

Acked-by: Eduard Zingerman <eddyz87@gmail.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/r/20231202175705.885270-2-andrii@kernel.org
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/bpf_verifier.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h
index d99a636d36a7..0c0e1bccad45 100644
--- a/include/linux/bpf_verifier.h
+++ b/include/linux/bpf_verifier.h
@@ -297,8 +297,8 @@ struct bpf_func_state {
 	 * void foo(void) { bpf_timer_set_callback(,foo); }
 	 */
 	u32 async_entry_cnt;
-	bool in_callback_fn;
 	struct tnum callback_ret_range;
+	bool in_callback_fn;
 	bool in_async_callback_fn;
 	bool in_exception_callback_fn;
 	/* For callback calling functions that limit number of possible
@@ -316,8 +316,8 @@ struct bpf_func_state {
 	/* The following fields should be last. See copy_func_state() */
 	int acquired_refs;
 	struct bpf_reference_state *refs;
-	int allocated_stack;
 	struct bpf_stack_state *stack;
+	int allocated_stack;
 };
 
 struct bpf_idx_pair {
-- 
cgit v1.2.3


From 8fa4ecd49b81ccd9d1d87f1c8b2260e218644878 Mon Sep 17 00:00:00 2001
From: Andrii Nakryiko <andrii@kernel.org>
Date: Sat, 2 Dec 2023 09:56:58 -0800
Subject: bpf: enforce exact retval range on subprog/callback exit

Instead of relying on potentially imprecise tnum representation of
expected return value range for callbacks and subprogs, validate that
smin/smax range satisfy exact expected range of return values.

E.g., if callback would need to return [0, 2] range, tnum can't
represent this precisely and instead will allow [0, 3] range. By
checking smin/smax range, we can make sure that subprog/callback indeed
returns only valid [0, 2] range.

Acked-by: Eduard Zingerman <eddyz87@gmail.com>
Acked-by: Shung-Hsi Yu <shung-hsi.yu@suse.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/r/20231202175705.885270-5-andrii@kernel.org
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/bpf_verifier.h |  7 ++++++-
 kernel/bpf/verifier.c        | 33 ++++++++++++++++++++++-----------
 2 files changed, 28 insertions(+), 12 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h
index 0c0e1bccad45..3378cc753061 100644
--- a/include/linux/bpf_verifier.h
+++ b/include/linux/bpf_verifier.h
@@ -275,6 +275,11 @@ struct bpf_reference_state {
 	int callback_ref;
 };
 
+struct bpf_retval_range {
+	s32 minval;
+	s32 maxval;
+};
+
 /* state of the program:
  * type of all registers and stack info
  */
@@ -297,7 +302,7 @@ struct bpf_func_state {
 	 * void foo(void) { bpf_timer_set_callback(,foo); }
 	 */
 	u32 async_entry_cnt;
-	struct tnum callback_ret_range;
+	struct bpf_retval_range callback_ret_range;
 	bool in_callback_fn;
 	bool in_async_callback_fn;
 	bool in_exception_callback_fn;
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 849fbf47b5f3..f3d9d7de68da 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -2305,6 +2305,11 @@ static void init_reg_state(struct bpf_verifier_env *env,
 	regs[BPF_REG_FP].frameno = state->frameno;
 }
 
+static struct bpf_retval_range retval_range(s32 minval, s32 maxval)
+{
+	return (struct bpf_retval_range){ minval, maxval };
+}
+
 #define BPF_MAIN_FUNC (-1)
 static void init_func_state(struct bpf_verifier_env *env,
 			    struct bpf_func_state *state,
@@ -2313,7 +2318,7 @@ static void init_func_state(struct bpf_verifier_env *env,
 	state->callsite = callsite;
 	state->frameno = frameno;
 	state->subprogno = subprogno;
-	state->callback_ret_range = tnum_range(0, 0);
+	state->callback_ret_range = retval_range(0, 0);
 	init_reg_state(env, state);
 	mark_verifier_state_scratched(env);
 }
@@ -9396,7 +9401,7 @@ static int set_map_elem_callback_state(struct bpf_verifier_env *env,
 		return err;
 
 	callee->in_callback_fn = true;
-	callee->callback_ret_range = tnum_range(0, 1);
+	callee->callback_ret_range = retval_range(0, 1);
 	return 0;
 }
 
@@ -9418,7 +9423,7 @@ static int set_loop_callback_state(struct bpf_verifier_env *env,
 	__mark_reg_not_init(env, &callee->regs[BPF_REG_5]);
 
 	callee->in_callback_fn = true;
-	callee->callback_ret_range = tnum_range(0, 1);
+	callee->callback_ret_range = retval_range(0, 1);
 	return 0;
 }
 
@@ -9448,7 +9453,7 @@ static int set_timer_callback_state(struct bpf_verifier_env *env,
 	__mark_reg_not_init(env, &callee->regs[BPF_REG_4]);
 	__mark_reg_not_init(env, &callee->regs[BPF_REG_5]);
 	callee->in_async_callback_fn = true;
-	callee->callback_ret_range = tnum_range(0, 1);
+	callee->callback_ret_range = retval_range(0, 1);
 	return 0;
 }
 
@@ -9476,7 +9481,7 @@ static int set_find_vma_callback_state(struct bpf_verifier_env *env,
 	__mark_reg_not_init(env, &callee->regs[BPF_REG_4]);
 	__mark_reg_not_init(env, &callee->regs[BPF_REG_5]);
 	callee->in_callback_fn = true;
-	callee->callback_ret_range = tnum_range(0, 1);
+	callee->callback_ret_range = retval_range(0, 1);
 	return 0;
 }
 
@@ -9499,7 +9504,7 @@ static int set_user_ringbuf_callback_state(struct bpf_verifier_env *env,
 	__mark_reg_not_init(env, &callee->regs[BPF_REG_5]);
 
 	callee->in_callback_fn = true;
-	callee->callback_ret_range = tnum_range(0, 1);
+	callee->callback_ret_range = retval_range(0, 1);
 	return 0;
 }
 
@@ -9531,7 +9536,7 @@ static int set_rbtree_add_callback_state(struct bpf_verifier_env *env,
 	__mark_reg_not_init(env, &callee->regs[BPF_REG_4]);
 	__mark_reg_not_init(env, &callee->regs[BPF_REG_5]);
 	callee->in_callback_fn = true;
-	callee->callback_ret_range = tnum_range(0, 1);
+	callee->callback_ret_range = retval_range(0, 1);
 	return 0;
 }
 
@@ -9560,6 +9565,11 @@ static bool in_rbtree_lock_required_cb(struct bpf_verifier_env *env)
 	return is_rbtree_lock_required_kfunc(kfunc_btf_id);
 }
 
+static bool retval_range_within(struct bpf_retval_range range, const struct bpf_reg_state *reg)
+{
+	return range.minval <= reg->smin_value && reg->smax_value <= range.maxval;
+}
+
 static int prepare_func_exit(struct bpf_verifier_env *env, int *insn_idx)
 {
 	struct bpf_verifier_state *state = env->cur_state, *prev_st;
@@ -9583,9 +9593,6 @@ static int prepare_func_exit(struct bpf_verifier_env *env, int *insn_idx)
 
 	caller = state->frame[state->curframe - 1];
 	if (callee->in_callback_fn) {
-		/* enforce R0 return value range [0, 1]. */
-		struct tnum range = callee->callback_ret_range;
-
 		if (r0->type != SCALAR_VALUE) {
 			verbose(env, "R0 not a scalar value\n");
 			return -EACCES;
@@ -9597,7 +9604,11 @@ static int prepare_func_exit(struct bpf_verifier_env *env, int *insn_idx)
 		if (err)
 			return err;
 
-		if (!tnum_in(range, r0->var_off)) {
+		/* enforce R0 return value range */
+		if (!retval_range_within(callee->callback_ret_range, r0)) {
+			struct tnum range = tnum_range(callee->callback_ret_range.minval,
+						       callee->callback_ret_range.maxval);
+
 			verbose_invalid_scalar(env, r0, &range, "callback return", "R0");
 			return -EINVAL;
 		}
-- 
cgit v1.2.3


From aeb9ce058d7c6193dc41e06b3a5b29d22c446b14 Mon Sep 17 00:00:00 2001
From: Coco Li <lixiaoyan@google.com>
Date: Wed, 29 Nov 2023 07:27:53 +0000
Subject: cache: enforce cache groups

Set up build time warnings to safeguard against future header changes of
organized structs.

Warning includes:

1) whether all variables are still in the same cache group
2) whether all the cache groups have the sum of the members size (in the
   maximum condition, including all members defined in configs)

The __cache_group* variables are ignored in kernel-doc check in the
various header files they appear in to enforce the cache groups.

Suggested-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: Coco Li <lixiaoyan@google.com>
Reviewed-by: Eric Dumazet <edumazet@google.com>
Reviewed-by: Shakeel Butt <shakeelb@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/cache.h | 25 +++++++++++++++++++++++++
 scripts/kernel-doc    |  5 +++++
 2 files changed, 30 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/cache.h b/include/linux/cache.h
index 9900d20b76c2..0ecb17bb6883 100644
--- a/include/linux/cache.h
+++ b/include/linux/cache.h
@@ -85,6 +85,31 @@
 #define cache_line_size()	L1_CACHE_BYTES
 #endif
 
+#ifndef __cacheline_group_begin
+#define __cacheline_group_begin(GROUP) \
+	__u8 __cacheline_group_begin__##GROUP[0]
+#endif
+
+#ifndef __cacheline_group_end
+#define __cacheline_group_end(GROUP) \
+	__u8 __cacheline_group_end__##GROUP[0]
+#endif
+
+#ifndef CACHELINE_ASSERT_GROUP_MEMBER
+#define CACHELINE_ASSERT_GROUP_MEMBER(TYPE, GROUP, MEMBER) \
+	BUILD_BUG_ON(!(offsetof(TYPE, MEMBER) >= \
+		       offsetofend(TYPE, __cacheline_group_begin__##GROUP) && \
+		       offsetofend(TYPE, MEMBER) <= \
+		       offsetof(TYPE, __cacheline_group_end__##GROUP)))
+#endif
+
+#ifndef CACHELINE_ASSERT_GROUP_SIZE
+#define CACHELINE_ASSERT_GROUP_SIZE(TYPE, GROUP, SIZE) \
+	BUILD_BUG_ON(offsetof(TYPE, __cacheline_group_end__##GROUP) - \
+		     offsetofend(TYPE, __cacheline_group_begin__##GROUP) > \
+		     SIZE)
+#endif
+
 /*
  * Helper to add padding within a struct to ensure data fall into separate
  * cachelines.
diff --git a/scripts/kernel-doc b/scripts/kernel-doc
index 08a3e603db19..0a890fe4d22b 100755
--- a/scripts/kernel-doc
+++ b/scripts/kernel-doc
@@ -1592,6 +1592,11 @@ sub push_parameter($$$$$) {
 		$parameterdescs{$param} = "anonymous\n";
 		$anon_struct_union = 1;
 	}
+	elsif ($param =~ "__cacheline_group" )
+	# handle cache group enforcing variables: they do not need be described in header files
+	{
+		return; # ignore __cacheline_group_begin and __cacheline_group_end
+	}
 
 	# warn if parameter has no description
 	# (but ignore ones starting with # as these are not parameters
-- 
cgit v1.2.3


From a9c8c738066b7ba9e208cfc3200a6f60593982b4 Mon Sep 17 00:00:00 2001
From: Sakari Ailus <sakari.ailus@linux.intel.com>
Date: Mon, 6 Nov 2023 09:31:41 +0200
Subject: device property: Add fwnode_name_eq()

Add fwnode_name_eq() to implement the functionality of of_node_name_eq()
on fwnode property API. The same convention of ending the comparison at
'@' (besides NUL) is applied on also both ACPI and swnode. The function
is intended for comparing unit address-less node names on DT and firmware
or swnodes compliant with DT bindings.

Reviewed-by: Laurent Pinchart <laurent.pinchart@ideasonboard.com>
Tested-by: Laurent Pinchart <laurent.pinchart@ideasonboard.com>
Acked-by: Rafael J. Wysocki <rafael@kernel.org>
Acked-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Signed-off-by: Sakari Ailus <sakari.ailus@linux.intel.com>
Signed-off-by: Hans Verkuil <hverkuil-cisco@xs4all.nl>
---
 drivers/base/property.c  | 28 ++++++++++++++++++++++++++++
 include/linux/property.h |  1 +
 2 files changed, 29 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/base/property.c b/drivers/base/property.c
index 8c40abed7852..b79608ee0b46 100644
--- a/drivers/base/property.c
+++ b/drivers/base/property.c
@@ -594,6 +594,34 @@ const char *fwnode_get_name_prefix(const struct fwnode_handle *fwnode)
 	return fwnode_call_ptr_op(fwnode, get_name_prefix);
 }
 
+/**
+ * fwnode_name_eq - Return true if node name is equal
+ * @fwnode: The firmware node
+ * @name: The name to which to compare the node name
+ *
+ * Compare the name provided as an argument to the name of the node, stopping
+ * the comparison at either NUL or '@' character, whichever comes first. This
+ * function is generally used for comparing node names while ignoring the
+ * possible unit address of the node.
+ *
+ * Return: true if the node name matches with the name provided in the @name
+ * argument, false otherwise.
+ */
+bool fwnode_name_eq(const struct fwnode_handle *fwnode, const char *name)
+{
+	const char *node_name;
+	ptrdiff_t len;
+
+	node_name = fwnode_get_name(fwnode);
+	if (!node_name)
+		return false;
+
+	len = strchrnul(node_name, '@') - node_name;
+
+	return str_has_prefix(node_name, name) == len;
+}
+EXPORT_SYMBOL_GPL(fwnode_name_eq);
+
 /**
  * fwnode_get_parent - Return parent firwmare node
  * @fwnode: Firmware whose parent is retrieved
diff --git a/include/linux/property.h b/include/linux/property.h
index 9f2585d705a8..4ebbb169df20 100644
--- a/include/linux/property.h
+++ b/include/linux/property.h
@@ -109,6 +109,7 @@ struct fwnode_handle *fwnode_find_reference(const struct fwnode_handle *fwnode,
 
 const char *fwnode_get_name(const struct fwnode_handle *fwnode);
 const char *fwnode_get_name_prefix(const struct fwnode_handle *fwnode);
+bool fwnode_name_eq(const struct fwnode_handle *fwnode, const char *name);
 
 struct fwnode_handle *fwnode_get_parent(const struct fwnode_handle *fwnode);
 struct fwnode_handle *fwnode_get_next_parent(struct fwnode_handle *fwnode);
-- 
cgit v1.2.3


From 578dc962ff2000ba4bf52d50717aea0819615634 Mon Sep 17 00:00:00 2001
From: Boris Brezillon <bbrezillon@kernel.org>
Date: Fri, 24 Nov 2023 17:24:35 -0800
Subject: mtd: rawnand: Add destructive operation

Erase and program operations need the write protect (wp) pin to be
de-asserted to take effect. Add the concept of destructive
operation and pass the information to exec_op() so controllers know
when they should de-assert this pin without having to decode
the command opcode.

Signed-off-by: Boris Brezillon <bbrezillon@kernel.org>
Signed-off-by: David Regan <dregan@broadcom.com>
Reviewed-by: Florian Fainelli <florian.fainelli@broadcom.com>
Signed-off-by: Miquel Raynal <miquel.raynal@bootlin.com>
Link: https://lore.kernel.org/linux-mtd/20231125012438.15191-1-dregan@broadcom.com
---
 drivers/mtd/nand/raw/nand_base.c |  6 ++++--
 include/linux/mtd/rawnand.h      | 11 +++++++++++
 2 files changed, 15 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/mtd/nand/raw/nand_base.c b/drivers/mtd/nand/raw/nand_base.c
index 9e24bedffd89..7dd9be5d58c4 100644
--- a/drivers/mtd/nand/raw/nand_base.c
+++ b/drivers/mtd/nand/raw/nand_base.c
@@ -1493,7 +1493,8 @@ static int nand_exec_prog_page_op(struct nand_chip *chip, unsigned int page,
 			    NAND_COMMON_TIMING_NS(conf, tWB_max)),
 		NAND_OP_WAIT_RDY(NAND_COMMON_TIMING_MS(conf, tPROG_max), 0),
 	};
-	struct nand_operation op = NAND_OPERATION(chip->cur_cs, instrs);
+	struct nand_operation op = NAND_DESTRUCTIVE_OPERATION(chip->cur_cs,
+							      instrs);
 	int naddrs = nand_fill_column_cycles(chip, addrs, offset_in_page);
 
 	if (naddrs < 0)
@@ -1916,7 +1917,8 @@ int nand_erase_op(struct nand_chip *chip, unsigned int eraseblock)
 			NAND_OP_WAIT_RDY(NAND_COMMON_TIMING_MS(conf, tBERS_max),
 					 0),
 		};
-		struct nand_operation op = NAND_OPERATION(chip->cur_cs, instrs);
+		struct nand_operation op = NAND_DESTRUCTIVE_OPERATION(chip->cur_cs,
+								      instrs);
 
 		if (chip->options & NAND_ROW_ADDR_3)
 			instrs[1].ctx.addr.naddrs++;
diff --git a/include/linux/mtd/rawnand.h b/include/linux/mtd/rawnand.h
index c29ace15a053..bd02aba5e6e3 100644
--- a/include/linux/mtd/rawnand.h
+++ b/include/linux/mtd/rawnand.h
@@ -1003,6 +1003,8 @@ struct nand_op_parser {
 /**
  * struct nand_operation - NAND operation descriptor
  * @cs: the CS line to select for this NAND operation
+ * @deassert_wp: set to true when the operation requires the WP pin to be
+ *		 de-asserted (ERASE, PROG, ...)
  * @instrs: array of instructions to execute
  * @ninstrs: length of the @instrs array
  *
@@ -1010,6 +1012,7 @@ struct nand_op_parser {
  */
 struct nand_operation {
 	unsigned int cs;
+	bool deassert_wp;
 	const struct nand_op_instr *instrs;
 	unsigned int ninstrs;
 };
@@ -1021,6 +1024,14 @@ struct nand_operation {
 		.ninstrs = ARRAY_SIZE(_instrs),			\
 	}
 
+#define NAND_DESTRUCTIVE_OPERATION(_cs, _instrs)		\
+	{							\
+		.cs = _cs,					\
+		.deassert_wp = true,				\
+		.instrs = _instrs,				\
+		.ninstrs = ARRAY_SIZE(_instrs),			\
+	}
+
 int nand_op_parser_exec_op(struct nand_chip *chip,
 			   const struct nand_op_parser *parser,
 			   const struct nand_operation *op, bool check_only);
-- 
cgit v1.2.3


From 68cce21e3cc5fea8d955a62394454149270c98bc Mon Sep 17 00:00:00 2001
From: David Regan <dregan@broadcom.com>
Date: Fri, 24 Nov 2023 17:24:36 -0800
Subject: mtd: rawnand: NAND controller write protect

Allow NAND controller to be responsible for write protect pin
handling during fast path and exec_op destructive operation
when controller_wp flag is set.

Signed-off-by: David Regan <dregan@broadcom.com>
Reviewed-by: Florian Fainelli <florian.fainelli@broadcom.com>
Signed-off-by: Miquel Raynal <miquel.raynal@bootlin.com>
Link: https://lore.kernel.org/linux-mtd/20231125012438.15191-2-dregan@broadcom.com
---
 drivers/mtd/nand/raw/nand_base.c | 4 ++++
 include/linux/mtd/rawnand.h      | 2 ++
 2 files changed, 6 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/mtd/nand/raw/nand_base.c b/drivers/mtd/nand/raw/nand_base.c
index 7dd9be5d58c4..0f342cd691a3 100644
--- a/drivers/mtd/nand/raw/nand_base.c
+++ b/drivers/mtd/nand/raw/nand_base.c
@@ -366,6 +366,10 @@ static int nand_check_wp(struct nand_chip *chip)
 	if (chip->options & NAND_BROKEN_XD)
 		return 0;
 
+	/* controller responsible for NAND write protect */
+	if (chip->controller->controller_wp)
+		return 0;
+
 	/* Check the WP bit */
 	ret = nand_status_op(chip, &status);
 	if (ret)
diff --git a/include/linux/mtd/rawnand.h b/include/linux/mtd/rawnand.h
index bd02aba5e6e3..a17f795070d8 100644
--- a/include/linux/mtd/rawnand.h
+++ b/include/linux/mtd/rawnand.h
@@ -1115,6 +1115,7 @@ struct nand_controller_ops {
  *			the bus without restarting an entire read operation nor
  *			changing the column.
  * @supported_op.cont_read: The controller supports sequential cache reads.
+ * @controller_wp:	the controller is in charge of handling the WP pin.
  */
 struct nand_controller {
 	struct mutex lock;
@@ -1123,6 +1124,7 @@ struct nand_controller {
 		unsigned int data_only_read: 1;
 		unsigned int cont_read: 1;
 	} supported_op;
+	bool controller_wp;
 };
 
 static inline void nand_controller_init(struct nand_controller *nfc)
-- 
cgit v1.2.3


From a87b8e3be926af0fc3b9b1af42b1127bd1ff077c Mon Sep 17 00:00:00 2001
From: Douglas Anderson <dianders@chromium.org>
Date: Fri, 1 Dec 2023 10:29:51 -0800
Subject: usb: core: Allow subclassed USB drivers to override
 usb_choose_configuration()

For some USB devices we might want to do something different for
usb_choose_configuration(). One example here is the r8152 driver where
we want to end up using the vendor driver with the preferred
interface.

The r8152 driver tried to make things work by implementing a USB
generic_subclass driver and then overriding the normal config
selection after it happened. This is less than ideal and also caused
breakage if someone deauthorized and re-authorized the USB device
because the USB core ended up going back to it's default logic for
choosing the best config. I made an attempt to fix this [1] but it was
a bit ugly.

Let's do this better and allow USB generic_subclass drivers to
override usb_choose_configuration().

[1] https://lore.kernel.org/r/20231130154337.1.Ie00e07f07f87149c9ce0b27ae4e26991d307e14b@changeid

Suggested-by: Alan Stern <stern@rowland.harvard.edu>
Signed-off-by: Douglas Anderson <dianders@chromium.org>
Reviewed-by: Alan Stern <stern@rowland.harvard.edu>
Link: https://lore.kernel.org/r/20231201102946.v2.2.Iade5fa31997f1a0ca3e1dec0591633b02471df12@changeid
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/usb/core/generic.c | 7 +++++++
 include/linux/usb.h        | 6 ++++++
 2 files changed, 13 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/usb/core/generic.c b/drivers/usb/core/generic.c
index 740342a2812a..dcb897158228 100644
--- a/drivers/usb/core/generic.c
+++ b/drivers/usb/core/generic.c
@@ -59,10 +59,17 @@ int usb_choose_configuration(struct usb_device *udev)
 	int num_configs;
 	int insufficient_power = 0;
 	struct usb_host_config *c, *best;
+	struct usb_device_driver *udriver = to_usb_device_driver(udev->dev.driver);
 
 	if (usb_device_is_owned(udev))
 		return 0;
 
+	if (udriver->choose_configuration) {
+		i = udriver->choose_configuration(udev);
+		if (i >= 0)
+			return i;
+	}
+
 	best = NULL;
 	c = udev->config;
 	num_configs = udev->descriptor.bNumConfigurations;
diff --git a/include/linux/usb.h b/include/linux/usb.h
index 8c61643acd49..618e5a0b1a22 100644
--- a/include/linux/usb.h
+++ b/include/linux/usb.h
@@ -1264,6 +1264,9 @@ struct usb_driver {
  *	module is being unloaded.
  * @suspend: Called when the device is going to be suspended by the system.
  * @resume: Called when the device is being resumed by the system.
+ * @choose_configuration: If non-NULL, called instead of the default
+ *	usb_choose_configuration(). If this returns an error then we'll go
+ *	on to call the normal usb_choose_configuration().
  * @dev_groups: Attributes attached to the device that will be created once it
  *	is bound to the driver.
  * @drvwrap: Driver-model core structure wrapper.
@@ -1287,6 +1290,9 @@ struct usb_device_driver {
 
 	int (*suspend) (struct usb_device *udev, pm_message_t message);
 	int (*resume) (struct usb_device *udev, pm_message_t message);
+
+	int (*choose_configuration) (struct usb_device *udev);
+
 	const struct attribute_group **dev_groups;
 	struct usbdrv_wrap drvwrap;
 	const struct usb_device_id *id_table;
-- 
cgit v1.2.3


From db9e54709895241dda23f9347f619afb15291353 Mon Sep 17 00:00:00 2001
From: RD Babiera <rdbabiera@google.com>
Date: Tue, 21 Nov 2023 20:38:47 +0000
Subject: usb: typec: tcpm: add tcpm_port_error_recovery symbol

Add tcpm_port_error_recovery symbol and corresponding event that runs in
tcpm_pd_event handler to set the port to the ERROR_RECOVERY state. tcpci
drivers can use the symbol to reset the port when tcpc faults affect port
functionality.

Signed-off-by: RD Babiera <rdbabiera@google.com>
Reviewed-by: Heikki Krogerus <heikki.krogerus@linux.intel.com>
Reviewed-by: Guenter Roeck <linux@roeck-us.net>
Link: https://lore.kernel.org/r/20231121203845.170234-5-rdbabiera@google.com
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/usb/typec/tcpm/tcpm.c | 14 ++++++++++++++
 include/linux/usb/tcpm.h      |  1 +
 2 files changed, 15 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/usb/typec/tcpm/tcpm.c b/drivers/usb/typec/tcpm/tcpm.c
index 50cbc52386b3..ff67553b6932 100644
--- a/drivers/usb/typec/tcpm/tcpm.c
+++ b/drivers/usb/typec/tcpm/tcpm.c
@@ -251,6 +251,7 @@ enum frs_typec_current {
 #define TCPM_FRS_EVENT		BIT(3)
 #define TCPM_SOURCING_VBUS	BIT(4)
 #define TCPM_PORT_CLEAN		BIT(5)
+#define TCPM_PORT_ERROR		BIT(6)
 
 #define LOG_BUFFER_ENTRIES	1024
 #define LOG_BUFFER_ENTRY_SIZE	128
@@ -5487,6 +5488,10 @@ static void tcpm_pd_event_handler(struct kthread_work *work)
 					tcpm_set_state(port, tcpm_default_state(port), 0);
 			}
 		}
+		if (events & TCPM_PORT_ERROR) {
+			tcpm_log(port, "port triggering error recovery");
+			tcpm_set_state(port, ERROR_RECOVERY, 0);
+		}
 
 		spin_lock(&port->pd_event_lock);
 	}
@@ -5554,6 +5559,15 @@ bool tcpm_port_is_toggling(struct tcpm_port *port)
 }
 EXPORT_SYMBOL_GPL(tcpm_port_is_toggling);
 
+void tcpm_port_error_recovery(struct tcpm_port *port)
+{
+	spin_lock(&port->pd_event_lock);
+	port->pd_events |= TCPM_PORT_ERROR;
+	spin_unlock(&port->pd_event_lock);
+	kthread_queue_work(port->wq, &port->event_work);
+}
+EXPORT_SYMBOL_GPL(tcpm_port_error_recovery);
+
 static void tcpm_enable_frs_work(struct kthread_work *work)
 {
 	struct tcpm_port *port = container_of(work, struct tcpm_port, enable_frs);
diff --git a/include/linux/usb/tcpm.h b/include/linux/usb/tcpm.h
index ab7ca872950b..65fac5e1f317 100644
--- a/include/linux/usb/tcpm.h
+++ b/include/linux/usb/tcpm.h
@@ -173,5 +173,6 @@ void tcpm_pd_hard_reset(struct tcpm_port *port);
 void tcpm_tcpc_reset(struct tcpm_port *port);
 void tcpm_port_clean(struct tcpm_port *port);
 bool tcpm_port_is_toggling(struct tcpm_port *port);
+void tcpm_port_error_recovery(struct tcpm_port *port);
 
 #endif /* __LINUX_USB_TCPM_H */
-- 
cgit v1.2.3


From 5e4c8814a431d21bfaf20b464134f40f2f81e152 Mon Sep 17 00:00:00 2001
From: RD Babiera <rdbabiera@google.com>
Date: Tue, 21 Nov 2023 20:38:48 +0000
Subject: usb: typec: tcpci: add vconn over current fault handling to
 maxim_core

Add TCPC_FAULT_STATUS_VCONN_OC constant and corresponding mask definition.
Maxim TCPC is capable of detecting VConn over current faults, so add
fault to alert mask. When a Vconn over current fault is triggered, put the
port in an error recovery state via tcpm_port_error_recovery.

Signed-off-by: RD Babiera <rdbabiera@google.com>
Reviewed-by: Guenter Roeck <linux@roeck-us.net>
Reviewed-by: Heikki Krogerus <heikki.krogerus@linux.intel.com>
Link: https://lore.kernel.org/r/20231121203845.170234-6-rdbabiera@google.com
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/usb/typec/tcpm/tcpci_maxim_core.c | 20 +++++++++++++++++++-
 include/linux/usb/tcpci.h                 |  5 ++++-
 2 files changed, 23 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/usb/typec/tcpm/tcpci_maxim_core.c b/drivers/usb/typec/tcpm/tcpci_maxim_core.c
index 9454b12a073c..7fb966fd639b 100644
--- a/drivers/usb/typec/tcpm/tcpci_maxim_core.c
+++ b/drivers/usb/typec/tcpm/tcpci_maxim_core.c
@@ -92,11 +92,16 @@ static void max_tcpci_init_regs(struct max_tcpci_chip *chip)
 		return;
 	}
 
+	/* Vconn Over Current Protection */
+	ret = max_tcpci_write8(chip, TCPC_FAULT_STATUS_MASK, TCPC_FAULT_STATUS_MASK_VCONN_OC);
+	if (ret < 0)
+		return;
+
 	alert_mask = TCPC_ALERT_TX_SUCCESS | TCPC_ALERT_TX_DISCARDED | TCPC_ALERT_TX_FAILED |
 		TCPC_ALERT_RX_HARD_RST | TCPC_ALERT_RX_STATUS | TCPC_ALERT_CC_STATUS |
 		TCPC_ALERT_VBUS_DISCNCT | TCPC_ALERT_RX_BUF_OVF | TCPC_ALERT_POWER_STATUS |
 		/* Enable Extended alert for detecting Fast Role Swap Signal */
-		TCPC_ALERT_EXTND | TCPC_ALERT_EXTENDED_STATUS;
+		TCPC_ALERT_EXTND | TCPC_ALERT_EXTENDED_STATUS | TCPC_ALERT_FAULT;
 
 	ret = max_tcpci_write16(chip, TCPC_ALERT_MASK, alert_mask);
 	if (ret < 0) {
@@ -295,6 +300,19 @@ static irqreturn_t _max_tcpci_irq(struct max_tcpci_chip *chip, u16 status)
 		}
 	}
 
+	if (status & TCPC_ALERT_FAULT) {
+		ret = max_tcpci_read8(chip, TCPC_FAULT_STATUS, &reg_status);
+		if (ret < 0)
+			return ret;
+
+		ret = max_tcpci_write8(chip, TCPC_FAULT_STATUS, reg_status);
+		if (ret < 0)
+			return ret;
+
+		if (reg_status & TCPC_FAULT_STATUS_VCONN_OC)
+			tcpm_port_error_recovery(chip->port);
+	}
+
 	if (status & TCPC_ALERT_EXTND) {
 		ret = max_tcpci_read8(chip, TCPC_ALERT_EXTENDED, &reg_status);
 		if (ret < 0)
diff --git a/include/linux/usb/tcpci.h b/include/linux/usb/tcpci.h
index 83376473ac76..467e8045e9f8 100644
--- a/include/linux/usb/tcpci.h
+++ b/include/linux/usb/tcpci.h
@@ -36,7 +36,9 @@
 
 #define TCPC_ALERT_MASK			0x12
 #define TCPC_POWER_STATUS_MASK		0x14
-#define TCPC_FAULT_STATUS_MASK		0x15
+
+#define TCPC_FAULT_STATUS_MASK			0x15
+#define TCPC_FAULT_STATUS_MASK_VCONN_OC		BIT(1)
 
 #define TCPC_EXTENDED_STATUS_MASK		0x16
 #define TCPC_EXTENDED_STATUS_MASK_VSAFE0V	BIT(0)
@@ -104,6 +106,7 @@
 
 #define TCPC_FAULT_STATUS		0x1f
 #define TCPC_FAULT_STATUS_ALL_REG_RST_TO_DEFAULT BIT(7)
+#define TCPC_FAULT_STATUS_VCONN_OC	BIT(1)
 
 #define TCPC_ALERT_EXTENDED		0x21
 
-- 
cgit v1.2.3


From 7cc4e6b0e4ddf610477fcec8e3d2a9caae7e8a6c Mon Sep 17 00:00:00 2001
From: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Date: Wed, 29 Nov 2023 18:06:46 +0200
Subject: pinctrl: Convert unsigned to unsigned int

Simple type conversion with no functional change implied.
While at it, adjust indentation where it makes sense.

Signed-off-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Link: https://lore.kernel.org/r/20231129161459.1002323-24-andriy.shevchenko@linux.intel.com
Signed-off-by: Linus Walleij <linus.walleij@linaro.org>
---
 drivers/pinctrl/core.c                  | 28 ++++++++++++-------------
 drivers/pinctrl/core.h                  | 14 ++++++-------
 drivers/pinctrl/devicetree.c            |  8 ++++----
 drivers/pinctrl/pinconf-generic.c       | 16 +++++++--------
 drivers/pinctrl/pinconf.c               | 14 ++++++-------
 drivers/pinctrl/pinconf.h               | 10 ++++-----
 drivers/pinctrl/pinctrl-utils.c         | 26 ++++++++++++------------
 drivers/pinctrl/pinctrl-utils.h         | 18 ++++++++---------
 drivers/pinctrl/pinmux.c                | 36 ++++++++++++++++-----------------
 drivers/pinctrl/pinmux.h                | 20 +++++++++---------
 include/linux/pinctrl/machine.h         |  6 +++---
 include/linux/pinctrl/pinconf-generic.h | 10 ++++-----
 include/linux/pinctrl/pinconf.h         | 16 +++++++--------
 include/linux/pinctrl/pinctrl.h         | 24 +++++++++++-----------
 include/linux/pinctrl/pinmux.h          | 22 ++++++++++----------
 15 files changed, 134 insertions(+), 134 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/pinctrl/core.c b/drivers/pinctrl/core.c
index e55959e1af5a..6d1e44efb5f3 100644
--- a/drivers/pinctrl/core.c
+++ b/drivers/pinctrl/core.c
@@ -143,7 +143,7 @@ struct pinctrl_dev *get_pinctrl_dev_from_of_node(struct device_node *np)
  */
 int pin_get_from_name(struct pinctrl_dev *pctldev, const char *name)
 {
-	unsigned i, pin;
+	unsigned int i, pin;
 
 	/* The pin number can be retrived from the pin controller descriptor */
 	for (i = 0; i < pctldev->desc->npins; i++) {
@@ -164,7 +164,7 @@ int pin_get_from_name(struct pinctrl_dev *pctldev, const char *name)
  * @pctldev: the pin control device to lookup the pin on
  * @pin: pin number/id to look up
  */
-const char *pin_get_name(struct pinctrl_dev *pctldev, const unsigned pin)
+const char *pin_get_name(struct pinctrl_dev *pctldev, const unsigned int pin)
 {
 	const struct pin_desc *desc;
 
@@ -182,7 +182,7 @@ EXPORT_SYMBOL_GPL(pin_get_name);
 /* Deletes a range of pin descriptors */
 static void pinctrl_free_pindescs(struct pinctrl_dev *pctldev,
 				  const struct pinctrl_pin_desc *pins,
-				  unsigned num_pins)
+				  unsigned int num_pins)
 {
 	int i;
 
@@ -250,9 +250,9 @@ failed:
 
 static int pinctrl_register_pins(struct pinctrl_dev *pctldev,
 				 const struct pinctrl_pin_desc *pins,
-				 unsigned num_descs)
+				 unsigned int num_descs)
 {
-	unsigned i;
+	unsigned int i;
 	int ret = 0;
 
 	for (i = 0; i < num_descs; i++) {
@@ -426,7 +426,7 @@ EXPORT_SYMBOL_GPL(pinctrl_add_gpio_range);
 
 void pinctrl_add_gpio_ranges(struct pinctrl_dev *pctldev,
 			     struct pinctrl_gpio_range *ranges,
-			     unsigned nranges)
+			     unsigned int nranges)
 {
 	int i;
 
@@ -457,7 +457,7 @@ struct pinctrl_dev *pinctrl_find_and_add_gpio_range(const char *devname,
 EXPORT_SYMBOL_GPL(pinctrl_find_and_add_gpio_range);
 
 int pinctrl_get_group_pins(struct pinctrl_dev *pctldev, const char *pin_group,
-				const unsigned **pins, unsigned *num_pins)
+			   const unsigned int **pins, unsigned int *num_pins)
 {
 	const struct pinctrl_ops *pctlops = pctldev->desc->pctlops;
 	int gs;
@@ -732,8 +732,8 @@ int pinctrl_get_group_selector(struct pinctrl_dev *pctldev,
 			       const char *pin_group)
 {
 	const struct pinctrl_ops *pctlops = pctldev->desc->pctlops;
-	unsigned ngroups = pctlops->get_groups_count(pctldev);
-	unsigned group_selector = 0;
+	unsigned int ngroups = pctlops->get_groups_count(pctldev);
+	unsigned int group_selector = 0;
 
 	while (group_selector < ngroups) {
 		const char *gname = pctlops->get_group_name(pctldev,
@@ -1430,7 +1430,7 @@ EXPORT_SYMBOL_GPL(devm_pinctrl_put);
  * @num_maps: the number of maps in the mapping table
  */
 int pinctrl_register_mappings(const struct pinctrl_map *maps,
-			      unsigned num_maps)
+			      unsigned int num_maps)
 {
 	int i, ret;
 	struct pinctrl_maps *maps_node;
@@ -1645,7 +1645,7 @@ static int pinctrl_pins_show(struct seq_file *s, void *what)
 {
 	struct pinctrl_dev *pctldev = s->private;
 	const struct pinctrl_ops *ops = pctldev->desc->pctlops;
-	unsigned i, pin;
+	unsigned int i, pin;
 #ifdef CONFIG_GPIOLIB
 	struct gpio_device *gdev __free(gpio_device_put) = NULL;
 	struct pinctrl_gpio_range *range;
@@ -1709,7 +1709,7 @@ static int pinctrl_groups_show(struct seq_file *s, void *what)
 {
 	struct pinctrl_dev *pctldev = s->private;
 	const struct pinctrl_ops *ops = pctldev->desc->pctlops;
-	unsigned ngroups, selector = 0;
+	unsigned int ngroups, selector = 0;
 
 	mutex_lock(&pctldev->mutex);
 
@@ -1717,8 +1717,8 @@ static int pinctrl_groups_show(struct seq_file *s, void *what)
 
 	seq_puts(s, "registered pin groups:\n");
 	while (selector < ngroups) {
-		const unsigned *pins = NULL;
-		unsigned num_pins = 0;
+		const unsigned int *pins = NULL;
+		unsigned int num_pins = 0;
 		const char *gname = ops->get_group_name(pctldev, selector);
 		const char *pname;
 		int ret = 0;
diff --git a/drivers/pinctrl/core.h b/drivers/pinctrl/core.h
index 530370443c19..8b59dd72e4ff 100644
--- a/drivers/pinctrl/core.h
+++ b/drivers/pinctrl/core.h
@@ -111,8 +111,8 @@ struct pinctrl_state {
  * @func: the function selector to program
  */
 struct pinctrl_setting_mux {
-	unsigned group;
-	unsigned func;
+	unsigned int group;
+	unsigned int func;
 };
 
 /**
@@ -124,9 +124,9 @@ struct pinctrl_setting_mux {
  * @num_configs: the number of entries in array @configs
  */
 struct pinctrl_setting_configs {
-	unsigned group_or_pin;
+	unsigned int group_or_pin;
 	unsigned long *configs;
-	unsigned num_configs;
+	unsigned int num_configs;
 };
 
 /**
@@ -173,7 +173,7 @@ struct pin_desc {
 	void *drv_data;
 	/* These fields only added when supporting pinmux drivers */
 #ifdef CONFIG_PINMUX
-	unsigned mux_usecount;
+	unsigned int mux_usecount;
 	const char *mux_owner;
 	const struct pinctrl_setting_mux *mux_setting;
 	const char *gpio_owner;
@@ -189,7 +189,7 @@ struct pin_desc {
 struct pinctrl_maps {
 	struct list_head node;
 	const struct pinctrl_map *maps;
-	unsigned num_maps;
+	unsigned int num_maps;
 };
 
 #ifdef CONFIG_GENERIC_PINCTRL_GROUPS
@@ -232,7 +232,7 @@ int pinctrl_generic_remove_group(struct pinctrl_dev *pctldev,
 struct pinctrl_dev *get_pinctrl_dev_from_devname(const char *dev_name);
 struct pinctrl_dev *get_pinctrl_dev_from_of_node(struct device_node *np);
 int pin_get_from_name(struct pinctrl_dev *pctldev, const char *name);
-const char *pin_get_name(struct pinctrl_dev *pctldev, const unsigned pin);
+const char *pin_get_name(struct pinctrl_dev *pctldev, const unsigned int pin);
 int pinctrl_get_group_selector(struct pinctrl_dev *pctldev,
 			       const char *pin_group);
 
diff --git a/drivers/pinctrl/devicetree.c b/drivers/pinctrl/devicetree.c
index 6e0a40962f38..df1efc2e5202 100644
--- a/drivers/pinctrl/devicetree.c
+++ b/drivers/pinctrl/devicetree.c
@@ -24,11 +24,11 @@ struct pinctrl_dt_map {
 	struct list_head node;
 	struct pinctrl_dev *pctldev;
 	struct pinctrl_map *map;
-	unsigned num_maps;
+	unsigned int num_maps;
 };
 
 static void dt_free_map(struct pinctrl_dev *pctldev,
-		     struct pinctrl_map *map, unsigned num_maps)
+			struct pinctrl_map *map, unsigned int num_maps)
 {
 	int i;
 
@@ -64,7 +64,7 @@ void pinctrl_dt_free_maps(struct pinctrl *p)
 
 static int dt_remember_or_free_map(struct pinctrl *p, const char *statename,
 				   struct pinctrl_dev *pctldev,
-				   struct pinctrl_map *map, unsigned num_maps)
+				   struct pinctrl_map *map, unsigned int num_maps)
 {
 	int i;
 	struct pinctrl_dt_map *dt_map;
@@ -116,7 +116,7 @@ static int dt_to_map_one_config(struct pinctrl *p,
 	const struct pinctrl_ops *ops;
 	int ret;
 	struct pinctrl_map *map;
-	unsigned num_maps;
+	unsigned int num_maps;
 	bool allow_default = false;
 
 	/* Find the pin controller containing np_config */
diff --git a/drivers/pinctrl/pinconf-generic.c b/drivers/pinctrl/pinconf-generic.c
index 8313cb5f3b3c..cada5d18ffae 100644
--- a/drivers/pinctrl/pinconf-generic.c
+++ b/drivers/pinctrl/pinconf-generic.c
@@ -57,7 +57,7 @@ static const struct pin_config_item conf_items[] = {
 
 static void pinconf_generic_dump_one(struct pinctrl_dev *pctldev,
 				     struct seq_file *s, const char *gname,
-				     unsigned pin,
+				     unsigned int pin,
 				     const struct pin_config_item *items,
 				     int nitems, int *print_sep)
 {
@@ -110,7 +110,7 @@ static void pinconf_generic_dump_one(struct pinctrl_dev *pctldev,
  * to be specified the other can be NULL/0.
  */
 void pinconf_generic_dump_pins(struct pinctrl_dev *pctldev, struct seq_file *s,
-			       const char *gname, unsigned pin)
+			       const char *gname, unsigned int pin)
 {
 	const struct pinconf_ops *ops = pctldev->desc->confops;
 	int print_sep = 0;
@@ -295,15 +295,15 @@ EXPORT_SYMBOL_GPL(pinconf_generic_parse_dt_config);
 
 int pinconf_generic_dt_subnode_to_map(struct pinctrl_dev *pctldev,
 		struct device_node *np, struct pinctrl_map **map,
-		unsigned *reserved_maps, unsigned *num_maps,
+		unsigned int *reserved_maps, unsigned int *num_maps,
 		enum pinctrl_map_type type)
 {
 	int ret;
 	const char *function;
 	struct device *dev = pctldev->dev;
 	unsigned long *configs = NULL;
-	unsigned num_configs = 0;
-	unsigned reserve, strings_count;
+	unsigned int num_configs = 0;
+	unsigned int reserve, strings_count;
 	struct property *prop;
 	const char *group;
 	const char *subnode_target_type = "pins";
@@ -379,9 +379,9 @@ EXPORT_SYMBOL_GPL(pinconf_generic_dt_subnode_to_map);
 
 int pinconf_generic_dt_node_to_map(struct pinctrl_dev *pctldev,
 		struct device_node *np_config, struct pinctrl_map **map,
-		unsigned *num_maps, enum pinctrl_map_type type)
+		unsigned int *num_maps, enum pinctrl_map_type type)
 {
-	unsigned reserved_maps;
+	unsigned int reserved_maps;
 	struct device_node *np;
 	int ret;
 
@@ -412,7 +412,7 @@ EXPORT_SYMBOL_GPL(pinconf_generic_dt_node_to_map);
 
 void pinconf_generic_dt_free_map(struct pinctrl_dev *pctldev,
 				 struct pinctrl_map *map,
-				 unsigned num_maps)
+				 unsigned int num_maps)
 {
 	pinctrl_utils_free_map(pctldev, map, num_maps);
 }
diff --git a/drivers/pinctrl/pinconf.c b/drivers/pinctrl/pinconf.c
index 96d853a8f339..dca963633b5d 100644
--- a/drivers/pinctrl/pinconf.c
+++ b/drivers/pinctrl/pinconf.c
@@ -55,7 +55,7 @@ int pinconf_validate_map(const struct pinctrl_map *map, int i)
 	return 0;
 }
 
-int pin_config_get_for_pin(struct pinctrl_dev *pctldev, unsigned pin,
+int pin_config_get_for_pin(struct pinctrl_dev *pctldev, unsigned int pin,
 			   unsigned long *config)
 {
 	const struct pinconf_ops *ops = pctldev->desc->confops;
@@ -199,7 +199,7 @@ int pinconf_apply_setting(const struct pinctrl_setting *setting)
 	return 0;
 }
 
-int pinconf_set_config(struct pinctrl_dev *pctldev, unsigned pin,
+int pinconf_set_config(struct pinctrl_dev *pctldev, unsigned int pin,
 		       unsigned long *configs, size_t nconfigs)
 {
 	const struct pinconf_ops *ops;
@@ -214,7 +214,7 @@ int pinconf_set_config(struct pinctrl_dev *pctldev, unsigned pin,
 #ifdef CONFIG_DEBUG_FS
 
 static void pinconf_show_config(struct seq_file *s, struct pinctrl_dev *pctldev,
-		      unsigned long *configs, unsigned num_configs)
+				unsigned long *configs, unsigned int num_configs)
 {
 	const struct pinconf_ops *confops;
 	int i;
@@ -304,7 +304,7 @@ static void pinconf_dump_pin(struct pinctrl_dev *pctldev,
 static int pinconf_pins_show(struct seq_file *s, void *what)
 {
 	struct pinctrl_dev *pctldev = s->private;
-	unsigned i, pin;
+	unsigned int i, pin;
 
 	seq_puts(s, "Pin config settings per pin\n");
 	seq_puts(s, "Format: pin (name): configs\n");
@@ -333,7 +333,7 @@ static int pinconf_pins_show(struct seq_file *s, void *what)
 }
 
 static void pinconf_dump_group(struct pinctrl_dev *pctldev,
-			       struct seq_file *s, unsigned selector,
+			       struct seq_file *s, unsigned int selector,
 			       const char *gname)
 {
 	const struct pinconf_ops *ops = pctldev->desc->confops;
@@ -348,8 +348,8 @@ static int pinconf_groups_show(struct seq_file *s, void *what)
 {
 	struct pinctrl_dev *pctldev = s->private;
 	const struct pinctrl_ops *pctlops = pctldev->desc->pctlops;
-	unsigned ngroups = pctlops->get_groups_count(pctldev);
-	unsigned selector = 0;
+	unsigned int ngroups = pctlops->get_groups_count(pctldev);
+	unsigned int selector = 0;
 
 	seq_puts(s, "Pin config settings per pin group\n");
 	seq_puts(s, "Format: group (name): configs\n");
diff --git a/drivers/pinctrl/pinconf.h b/drivers/pinctrl/pinconf.h
index 694bfc9961fa..a14c950bc700 100644
--- a/drivers/pinctrl/pinconf.h
+++ b/drivers/pinctrl/pinconf.h
@@ -29,14 +29,14 @@ int pinconf_map_to_setting(const struct pinctrl_map *map,
 void pinconf_free_setting(const struct pinctrl_setting *setting);
 int pinconf_apply_setting(const struct pinctrl_setting *setting);
 
-int pinconf_set_config(struct pinctrl_dev *pctldev, unsigned pin,
+int pinconf_set_config(struct pinctrl_dev *pctldev, unsigned int pin,
 		       unsigned long *configs, size_t nconfigs);
 
 /*
  * You will only be interested in these if you're using PINCONF
  * so don't supply any stubs for these.
  */
-int pin_config_get_for_pin(struct pinctrl_dev *pctldev, unsigned pin,
+int pin_config_get_for_pin(struct pinctrl_dev *pctldev, unsigned int pin,
 			   unsigned long *config);
 int pin_config_group_get(const char *dev_name, const char *pin_group,
 			 unsigned long *config);
@@ -68,7 +68,7 @@ static inline int pinconf_apply_setting(const struct pinctrl_setting *setting)
 	return 0;
 }
 
-static inline int pinconf_set_config(struct pinctrl_dev *pctldev, unsigned pin,
+static inline int pinconf_set_config(struct pinctrl_dev *pctldev, unsigned int pin,
 				     unsigned long *configs, size_t nconfigs)
 {
 	return -ENOTSUPP;
@@ -112,7 +112,7 @@ static inline void pinconf_init_device_debugfs(struct dentry *devroot,
 
 void pinconf_generic_dump_pins(struct pinctrl_dev *pctldev,
 			       struct seq_file *s, const char *gname,
-			       unsigned pin);
+			       unsigned int pin);
 
 void pinconf_generic_dump_config(struct pinctrl_dev *pctldev,
 				 struct seq_file *s, unsigned long config);
@@ -120,7 +120,7 @@ void pinconf_generic_dump_config(struct pinctrl_dev *pctldev,
 
 static inline void pinconf_generic_dump_pins(struct pinctrl_dev *pctldev,
 					     struct seq_file *s,
-					     const char *gname, unsigned pin)
+					     const char *gname, unsigned int pin)
 {
 	return;
 }
diff --git a/drivers/pinctrl/pinctrl-utils.c b/drivers/pinctrl/pinctrl-utils.c
index 40862f7bd6ca..d81d7b46116c 100644
--- a/drivers/pinctrl/pinctrl-utils.c
+++ b/drivers/pinctrl/pinctrl-utils.c
@@ -18,11 +18,11 @@
 #include "pinctrl-utils.h"
 
 int pinctrl_utils_reserve_map(struct pinctrl_dev *pctldev,
-		struct pinctrl_map **map, unsigned *reserved_maps,
-		unsigned *num_maps, unsigned reserve)
+		struct pinctrl_map **map, unsigned int *reserved_maps,
+		unsigned int *num_maps, unsigned int reserve)
 {
-	unsigned old_num = *reserved_maps;
-	unsigned new_num = *num_maps + reserve;
+	unsigned int old_num = *reserved_maps;
+	unsigned int new_num = *num_maps + reserve;
 	struct pinctrl_map *new_map;
 
 	if (old_num >= new_num)
@@ -43,8 +43,8 @@ int pinctrl_utils_reserve_map(struct pinctrl_dev *pctldev,
 EXPORT_SYMBOL_GPL(pinctrl_utils_reserve_map);
 
 int pinctrl_utils_add_map_mux(struct pinctrl_dev *pctldev,
-		struct pinctrl_map **map, unsigned *reserved_maps,
-		unsigned *num_maps, const char *group,
+		struct pinctrl_map **map, unsigned int *reserved_maps,
+		unsigned int *num_maps, const char *group,
 		const char *function)
 {
 	if (WARN_ON(*num_maps == *reserved_maps))
@@ -60,9 +60,9 @@ int pinctrl_utils_add_map_mux(struct pinctrl_dev *pctldev,
 EXPORT_SYMBOL_GPL(pinctrl_utils_add_map_mux);
 
 int pinctrl_utils_add_map_configs(struct pinctrl_dev *pctldev,
-		struct pinctrl_map **map, unsigned *reserved_maps,
-		unsigned *num_maps, const char *group,
-		unsigned long *configs, unsigned num_configs,
+		struct pinctrl_map **map, unsigned int *reserved_maps,
+		unsigned int *num_maps, const char *group,
+		unsigned long *configs, unsigned int num_configs,
 		enum pinctrl_map_type type)
 {
 	unsigned long *dup_configs;
@@ -86,11 +86,11 @@ int pinctrl_utils_add_map_configs(struct pinctrl_dev *pctldev,
 EXPORT_SYMBOL_GPL(pinctrl_utils_add_map_configs);
 
 int pinctrl_utils_add_config(struct pinctrl_dev *pctldev,
-		unsigned long **configs, unsigned *num_configs,
+		unsigned long **configs, unsigned int *num_configs,
 		unsigned long config)
 {
-	unsigned old_num = *num_configs;
-	unsigned new_num = old_num + 1;
+	unsigned int old_num = *num_configs;
+	unsigned int new_num = old_num + 1;
 	unsigned long *new_configs;
 
 	new_configs = krealloc(*configs, sizeof(*new_configs) * new_num,
@@ -110,7 +110,7 @@ int pinctrl_utils_add_config(struct pinctrl_dev *pctldev,
 EXPORT_SYMBOL_GPL(pinctrl_utils_add_config);
 
 void pinctrl_utils_free_map(struct pinctrl_dev *pctldev,
-	      struct pinctrl_map *map, unsigned num_maps)
+	      struct pinctrl_map *map, unsigned int num_maps)
 {
 	int i;
 
diff --git a/drivers/pinctrl/pinctrl-utils.h b/drivers/pinctrl/pinctrl-utils.h
index 4108ee2dd6d0..203fba257d71 100644
--- a/drivers/pinctrl/pinctrl-utils.h
+++ b/drivers/pinctrl/pinctrl-utils.h
@@ -15,21 +15,21 @@ struct pinctrl_dev;
 struct pinctrl_map;
 
 int pinctrl_utils_reserve_map(struct pinctrl_dev *pctldev,
-		struct pinctrl_map **map, unsigned *reserved_maps,
-		unsigned *num_maps, unsigned reserve);
+		struct pinctrl_map **map, unsigned int *reserved_maps,
+		unsigned int *num_maps, unsigned int reserve);
 int pinctrl_utils_add_map_mux(struct pinctrl_dev *pctldev,
-		struct pinctrl_map **map, unsigned *reserved_maps,
-		unsigned *num_maps, const char *group,
+		struct pinctrl_map **map, unsigned int *reserved_maps,
+		unsigned int *num_maps, const char *group,
 		const char *function);
 int pinctrl_utils_add_map_configs(struct pinctrl_dev *pctldev,
-		struct pinctrl_map **map, unsigned *reserved_maps,
-		unsigned *num_maps, const char *group,
-		unsigned long *configs, unsigned num_configs,
+		struct pinctrl_map **map, unsigned int *reserved_maps,
+		unsigned int *num_maps, const char *group,
+		unsigned long *configs, unsigned int num_configs,
 		enum pinctrl_map_type type);
 int pinctrl_utils_add_config(struct pinctrl_dev *pctldev,
-		unsigned long **configs, unsigned *num_configs,
+		unsigned long **configs, unsigned int *num_configs,
 		unsigned long config);
 void pinctrl_utils_free_map(struct pinctrl_dev *pctldev,
-		struct pinctrl_map *map, unsigned num_maps);
+		struct pinctrl_map *map, unsigned int num_maps);
 
 #endif /* __PINCTRL_UTILS_H__ */
diff --git a/drivers/pinctrl/pinmux.c b/drivers/pinctrl/pinmux.c
index 23d2da0b99b9..abbb044d6ace 100644
--- a/drivers/pinctrl/pinmux.c
+++ b/drivers/pinctrl/pinmux.c
@@ -35,8 +35,8 @@
 int pinmux_check_ops(struct pinctrl_dev *pctldev)
 {
 	const struct pinmux_ops *ops = pctldev->desc->pmxops;
-	unsigned nfuncs;
-	unsigned selector = 0;
+	unsigned int nfuncs;
+	unsigned int selector = 0;
 
 	/* Check that we implement required operations */
 	if (!ops ||
@@ -84,7 +84,7 @@ int pinmux_validate_map(const struct pinctrl_map *map, int i)
  * Controllers not defined as strict will always return true,
  * menaning that the gpio can be used.
  */
-bool pinmux_can_be_used_for_gpio(struct pinctrl_dev *pctldev, unsigned pin)
+bool pinmux_can_be_used_for_gpio(struct pinctrl_dev *pctldev, unsigned int pin)
 {
 	struct pin_desc *desc = pin_desc_get(pctldev, pin);
 	const struct pinmux_ops *ops = pctldev->desc->pmxops;
@@ -262,7 +262,7 @@ static const char *pin_free(struct pinctrl_dev *pctldev, int pin,
  */
 int pinmux_request_gpio(struct pinctrl_dev *pctldev,
 			struct pinctrl_gpio_range *range,
-			unsigned pin, unsigned gpio)
+			unsigned int pin, unsigned int gpio)
 {
 	const char *owner;
 	int ret;
@@ -285,7 +285,7 @@ int pinmux_request_gpio(struct pinctrl_dev *pctldev,
  * @pin: the affected currently GPIO-muxed in pin
  * @range: applicable GPIO range
  */
-void pinmux_free_gpio(struct pinctrl_dev *pctldev, unsigned pin,
+void pinmux_free_gpio(struct pinctrl_dev *pctldev, unsigned int pin,
 		      struct pinctrl_gpio_range *range)
 {
 	const char *owner;
@@ -303,7 +303,7 @@ void pinmux_free_gpio(struct pinctrl_dev *pctldev, unsigned pin,
  */
 int pinmux_gpio_direction(struct pinctrl_dev *pctldev,
 			  struct pinctrl_gpio_range *range,
-			  unsigned pin, bool input)
+			  unsigned int pin, bool input)
 {
 	const struct pinmux_ops *ops;
 	int ret;
@@ -322,8 +322,8 @@ static int pinmux_func_name_to_selector(struct pinctrl_dev *pctldev,
 					const char *function)
 {
 	const struct pinmux_ops *ops = pctldev->desc->pmxops;
-	unsigned nfuncs = ops->get_functions_count(pctldev);
-	unsigned selector = 0;
+	unsigned int nfuncs = ops->get_functions_count(pctldev);
+	unsigned int selector = 0;
 
 	/* See if this pctldev has this function */
 	while (selector < nfuncs) {
@@ -344,7 +344,7 @@ int pinmux_map_to_setting(const struct pinctrl_map *map,
 	struct pinctrl_dev *pctldev = setting->pctldev;
 	const struct pinmux_ops *pmxops = pctldev->desc->pmxops;
 	char const * const *groups;
-	unsigned num_groups;
+	unsigned int num_groups;
 	int ret;
 	const char *group;
 
@@ -409,8 +409,8 @@ int pinmux_enable_setting(const struct pinctrl_setting *setting)
 	const struct pinctrl_ops *pctlops = pctldev->desc->pctlops;
 	const struct pinmux_ops *ops = pctldev->desc->pmxops;
 	int ret = 0;
-	const unsigned *pins = NULL;
-	unsigned num_pins = 0;
+	const unsigned int *pins = NULL;
+	unsigned int num_pins = 0;
 	int i;
 	struct pin_desc *desc;
 
@@ -489,8 +489,8 @@ void pinmux_disable_setting(const struct pinctrl_setting *setting)
 	struct pinctrl_dev *pctldev = setting->pctldev;
 	const struct pinctrl_ops *pctlops = pctldev->desc->pctlops;
 	int ret = 0;
-	const unsigned *pins = NULL;
-	unsigned num_pins = 0;
+	const unsigned int *pins = NULL;
+	unsigned int num_pins = 0;
 	int i;
 	struct pin_desc *desc;
 
@@ -541,8 +541,8 @@ static int pinmux_functions_show(struct seq_file *s, void *what)
 {
 	struct pinctrl_dev *pctldev = s->private;
 	const struct pinmux_ops *pmxops = pctldev->desc->pmxops;
-	unsigned nfuncs;
-	unsigned func_selector = 0;
+	unsigned int nfuncs;
+	unsigned int func_selector = 0;
 
 	if (!pmxops)
 		return 0;
@@ -553,7 +553,7 @@ static int pinmux_functions_show(struct seq_file *s, void *what)
 		const char *func = pmxops->get_function_name(pctldev,
 							  func_selector);
 		const char * const *groups;
-		unsigned num_groups;
+		unsigned int num_groups;
 		int ret;
 		int i;
 
@@ -584,7 +584,7 @@ static int pinmux_pins_show(struct seq_file *s, void *what)
 	struct pinctrl_dev *pctldev = s->private;
 	const struct pinctrl_ops *pctlops = pctldev->desc->pctlops;
 	const struct pinmux_ops *pmxops = pctldev->desc->pmxops;
-	unsigned i, pin;
+	unsigned int i, pin;
 
 	if (!pmxops)
 		return 0;
@@ -818,7 +818,7 @@ EXPORT_SYMBOL_GPL(pinmux_generic_get_function_name);
 int pinmux_generic_get_function_groups(struct pinctrl_dev *pctldev,
 				       unsigned int selector,
 				       const char * const **groups,
-				       unsigned * const num_groups)
+				       unsigned int * const num_groups)
 {
 	struct function_desc *function;
 
diff --git a/drivers/pinctrl/pinmux.h b/drivers/pinctrl/pinmux.h
index ea6f99c24aa5..7c8aa25ccc80 100644
--- a/drivers/pinctrl/pinmux.h
+++ b/drivers/pinctrl/pinmux.h
@@ -26,16 +26,16 @@ int pinmux_check_ops(struct pinctrl_dev *pctldev);
 
 int pinmux_validate_map(const struct pinctrl_map *map, int i);
 
-bool pinmux_can_be_used_for_gpio(struct pinctrl_dev *pctldev, unsigned pin);
+bool pinmux_can_be_used_for_gpio(struct pinctrl_dev *pctldev, unsigned int pin);
 
 int pinmux_request_gpio(struct pinctrl_dev *pctldev,
 			struct pinctrl_gpio_range *range,
-			unsigned pin, unsigned gpio);
-void pinmux_free_gpio(struct pinctrl_dev *pctldev, unsigned pin,
+			unsigned int pin, unsigned int gpio);
+void pinmux_free_gpio(struct pinctrl_dev *pctldev, unsigned int pin,
 		      struct pinctrl_gpio_range *range);
 int pinmux_gpio_direction(struct pinctrl_dev *pctldev,
 			  struct pinctrl_gpio_range *range,
-			  unsigned pin, bool input);
+			  unsigned int pin, bool input);
 
 int pinmux_map_to_setting(const struct pinctrl_map *map,
 			  struct pinctrl_setting *setting);
@@ -56,27 +56,27 @@ static inline int pinmux_validate_map(const struct pinctrl_map *map, int i)
 }
 
 static inline bool pinmux_can_be_used_for_gpio(struct pinctrl_dev *pctldev,
-					       unsigned pin)
+					       unsigned int pin)
 {
 	return true;
 }
 
 static inline int pinmux_request_gpio(struct pinctrl_dev *pctldev,
 			struct pinctrl_gpio_range *range,
-			unsigned pin, unsigned gpio)
+			unsigned int pin, unsigned int gpio)
 {
 	return 0;
 }
 
 static inline void pinmux_free_gpio(struct pinctrl_dev *pctldev,
-				    unsigned pin,
+				    unsigned int pin,
 				    struct pinctrl_gpio_range *range)
 {
 }
 
 static inline int pinmux_gpio_direction(struct pinctrl_dev *pctldev,
 					struct pinctrl_gpio_range *range,
-					unsigned pin, bool input)
+					unsigned int pin, bool input)
 {
 	return 0;
 }
@@ -154,7 +154,7 @@ pinmux_generic_get_function_name(struct pinctrl_dev *pctldev,
 int pinmux_generic_get_function_groups(struct pinctrl_dev *pctldev,
 				       unsigned int selector,
 				       const char * const **groups,
-				       unsigned * const num_groups);
+				       unsigned int * const num_groups);
 
 struct function_desc *pinmux_generic_get_function(struct pinctrl_dev *pctldev,
 						  unsigned int selector);
@@ -162,7 +162,7 @@ struct function_desc *pinmux_generic_get_function(struct pinctrl_dev *pctldev,
 int pinmux_generic_add_function(struct pinctrl_dev *pctldev,
 				const char *name,
 				const char * const *groups,
-				unsigned const num_groups,
+				unsigned int const num_groups,
 				void *data);
 
 int pinmux_generic_remove_function(struct pinctrl_dev *pctldev,
diff --git a/include/linux/pinctrl/machine.h b/include/linux/pinctrl/machine.h
index ee8803f6ad07..673e96df453b 100644
--- a/include/linux/pinctrl/machine.h
+++ b/include/linux/pinctrl/machine.h
@@ -47,7 +47,7 @@ struct pinctrl_map_mux {
 struct pinctrl_map_configs {
 	const char *group_or_pin;
 	unsigned long *configs;
-	unsigned num_configs;
+	unsigned int num_configs;
 };
 
 /**
@@ -154,13 +154,13 @@ struct pinctrl_map;
 #ifdef CONFIG_PINCTRL
 
 extern int pinctrl_register_mappings(const struct pinctrl_map *map,
-				     unsigned num_maps);
+				     unsigned int num_maps);
 extern void pinctrl_unregister_mappings(const struct pinctrl_map *map);
 extern void pinctrl_provide_dummies(void);
 #else
 
 static inline int pinctrl_register_mappings(const struct pinctrl_map *map,
-					    unsigned num_maps)
+					    unsigned int num_maps)
 {
 	return 0;
 }
diff --git a/include/linux/pinctrl/pinconf-generic.h b/include/linux/pinctrl/pinconf-generic.h
index d74b7a4ea154..a65d3d078e58 100644
--- a/include/linux/pinctrl/pinconf-generic.h
+++ b/include/linux/pinctrl/pinconf-generic.h
@@ -193,17 +193,17 @@ struct pinconf_generic_params {
 
 int pinconf_generic_dt_subnode_to_map(struct pinctrl_dev *pctldev,
 		struct device_node *np, struct pinctrl_map **map,
-		unsigned *reserved_maps, unsigned *num_maps,
+		unsigned int *reserved_maps, unsigned int *num_maps,
 		enum pinctrl_map_type type);
 int pinconf_generic_dt_node_to_map(struct pinctrl_dev *pctldev,
 		struct device_node *np_config, struct pinctrl_map **map,
-		unsigned *num_maps, enum pinctrl_map_type type);
+		unsigned int *num_maps, enum pinctrl_map_type type);
 void pinconf_generic_dt_free_map(struct pinctrl_dev *pctldev,
-		struct pinctrl_map *map, unsigned num_maps);
+		struct pinctrl_map *map, unsigned int num_maps);
 
 static inline int pinconf_generic_dt_node_to_map_group(struct pinctrl_dev *pctldev,
 		struct device_node *np_config, struct pinctrl_map **map,
-		unsigned *num_maps)
+		unsigned int *num_maps)
 {
 	return pinconf_generic_dt_node_to_map(pctldev, np_config, map, num_maps,
 			PIN_MAP_TYPE_CONFIGS_GROUP);
@@ -211,7 +211,7 @@ static inline int pinconf_generic_dt_node_to_map_group(struct pinctrl_dev *pctld
 
 static inline int pinconf_generic_dt_node_to_map_pin(struct pinctrl_dev *pctldev,
 		struct device_node *np_config, struct pinctrl_map **map,
-		unsigned *num_maps)
+		unsigned int *num_maps)
 {
 	return pinconf_generic_dt_node_to_map(pctldev, np_config, map, num_maps,
 			PIN_MAP_TYPE_CONFIGS_PIN);
diff --git a/include/linux/pinctrl/pinconf.h b/include/linux/pinctrl/pinconf.h
index f8a8215e9021..770ec2221156 100644
--- a/include/linux/pinctrl/pinconf.h
+++ b/include/linux/pinctrl/pinconf.h
@@ -40,25 +40,25 @@ struct pinconf_ops {
 	bool is_generic;
 #endif
 	int (*pin_config_get) (struct pinctrl_dev *pctldev,
-			       unsigned pin,
+			       unsigned int pin,
 			       unsigned long *config);
 	int (*pin_config_set) (struct pinctrl_dev *pctldev,
-			       unsigned pin,
+			       unsigned int pin,
 			       unsigned long *configs,
-			       unsigned num_configs);
+			       unsigned int num_configs);
 	int (*pin_config_group_get) (struct pinctrl_dev *pctldev,
-				     unsigned selector,
+				     unsigned int selector,
 				     unsigned long *config);
 	int (*pin_config_group_set) (struct pinctrl_dev *pctldev,
-				     unsigned selector,
+				     unsigned int selector,
 				     unsigned long *configs,
-				     unsigned num_configs);
+				     unsigned int num_configs);
 	void (*pin_config_dbg_show) (struct pinctrl_dev *pctldev,
 				     struct seq_file *s,
-				     unsigned offset);
+				     unsigned int offset);
 	void (*pin_config_group_dbg_show) (struct pinctrl_dev *pctldev,
 					   struct seq_file *s,
-					   unsigned selector);
+					   unsigned int selector);
 	void (*pin_config_config_dbg_show) (struct pinctrl_dev *pctldev,
 					    struct seq_file *s,
 					    unsigned long config);
diff --git a/include/linux/pinctrl/pinctrl.h b/include/linux/pinctrl/pinctrl.h
index 4d252ea00ed1..9a8189ffd0f2 100644
--- a/include/linux/pinctrl/pinctrl.h
+++ b/include/linux/pinctrl/pinctrl.h
@@ -54,7 +54,7 @@ struct pingroup {
  * @drv_data: driver-defined per-pin data. pinctrl core does not touch this
  */
 struct pinctrl_pin_desc {
-	unsigned number;
+	unsigned int number;
 	const char *name;
 	void *drv_data;
 };
@@ -82,7 +82,7 @@ struct pinctrl_gpio_range {
 	unsigned int base;
 	unsigned int pin_base;
 	unsigned int npins;
-	unsigned const *pins;
+	unsigned int const *pins;
 	struct gpio_chip *gc;
 };
 
@@ -108,18 +108,18 @@ struct pinctrl_gpio_range {
 struct pinctrl_ops {
 	int (*get_groups_count) (struct pinctrl_dev *pctldev);
 	const char *(*get_group_name) (struct pinctrl_dev *pctldev,
-				       unsigned selector);
+				       unsigned int selector);
 	int (*get_group_pins) (struct pinctrl_dev *pctldev,
-			       unsigned selector,
-			       const unsigned **pins,
-			       unsigned *num_pins);
+			       unsigned int selector,
+			       const unsigned int **pins,
+			       unsigned int *num_pins);
 	void (*pin_dbg_show) (struct pinctrl_dev *pctldev, struct seq_file *s,
-			  unsigned offset);
+			      unsigned int offset);
 	int (*dt_node_to_map) (struct pinctrl_dev *pctldev,
 			       struct device_node *np_config,
-			       struct pinctrl_map **map, unsigned *num_maps);
+			       struct pinctrl_map **map, unsigned int *num_maps);
 	void (*dt_free_map) (struct pinctrl_dev *pctldev,
-			     struct pinctrl_map *map, unsigned num_maps);
+			     struct pinctrl_map *map, unsigned int num_maps);
 };
 
 /**
@@ -193,7 +193,7 @@ extern void pinctrl_add_gpio_range(struct pinctrl_dev *pctldev,
 				struct pinctrl_gpio_range *range);
 extern void pinctrl_add_gpio_ranges(struct pinctrl_dev *pctldev,
 				struct pinctrl_gpio_range *ranges,
-				unsigned nranges);
+				unsigned int nranges);
 extern void pinctrl_remove_gpio_range(struct pinctrl_dev *pctldev,
 				struct pinctrl_gpio_range *range);
 
@@ -203,8 +203,8 @@ extern struct pinctrl_gpio_range *
 pinctrl_find_gpio_range_from_pin(struct pinctrl_dev *pctldev,
 				 unsigned int pin);
 extern int pinctrl_get_group_pins(struct pinctrl_dev *pctldev,
-				const char *pin_group, const unsigned **pins,
-				unsigned *num_pins);
+				  const char *pin_group, const unsigned int **pins,
+				  unsigned int *num_pins);
 
 /**
  * struct pinfunction - Description about a function
diff --git a/include/linux/pinctrl/pinmux.h b/include/linux/pinctrl/pinmux.h
index a7e370965c53..d6f7b58d6ad0 100644
--- a/include/linux/pinctrl/pinmux.h
+++ b/include/linux/pinctrl/pinmux.h
@@ -57,26 +57,26 @@ struct pinctrl_gpio_range;
  *	the pin request.
  */
 struct pinmux_ops {
-	int (*request) (struct pinctrl_dev *pctldev, unsigned offset);
-	int (*free) (struct pinctrl_dev *pctldev, unsigned offset);
+	int (*request) (struct pinctrl_dev *pctldev, unsigned int offset);
+	int (*free) (struct pinctrl_dev *pctldev, unsigned int offset);
 	int (*get_functions_count) (struct pinctrl_dev *pctldev);
 	const char *(*get_function_name) (struct pinctrl_dev *pctldev,
-					  unsigned selector);
+					  unsigned int selector);
 	int (*get_function_groups) (struct pinctrl_dev *pctldev,
-				  unsigned selector,
-				  const char * const **groups,
-				  unsigned *num_groups);
-	int (*set_mux) (struct pinctrl_dev *pctldev, unsigned func_selector,
-			unsigned group_selector);
+				    unsigned int selector,
+				    const char * const **groups,
+				    unsigned int *num_groups);
+	int (*set_mux) (struct pinctrl_dev *pctldev, unsigned int func_selector,
+			unsigned int group_selector);
 	int (*gpio_request_enable) (struct pinctrl_dev *pctldev,
 				    struct pinctrl_gpio_range *range,
-				    unsigned offset);
+				    unsigned int offset);
 	void (*gpio_disable_free) (struct pinctrl_dev *pctldev,
 				   struct pinctrl_gpio_range *range,
-				   unsigned offset);
+				   unsigned int offset);
 	int (*gpio_set_direction) (struct pinctrl_dev *pctldev,
 				   struct pinctrl_gpio_range *range,
-				   unsigned offset,
+				   unsigned int offset,
 				   bool input);
 	bool strict;
 };
-- 
cgit v1.2.3


From 2202844e4468c7539dba0c0b06577c93735af952 Mon Sep 17 00:00:00 2001
From: Longfang Liu <liulongfang@huawei.com>
Date: Mon, 6 Nov 2023 15:22:23 +0800
Subject: vfio/migration: Add debugfs to live migration driver
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

There are multiple devices, software and operational steps involved
in the process of live migration. An error occurred on any node may
cause the live migration operation to fail.
This complex process makes it very difficult to locate and analyze
the cause when the function fails.

In order to quickly locate the cause of the problem when the
live migration fails, I added a set of debugfs to the vfio
live migration driver.

    +-------------------------------------------+
    |                                           |
    |                                           |
    |                  QEMU                     |
    |                                           |
    |                                           |
    +---+----------------------------+----------+
        |      ^                     |      ^
        |      |                     |      |
        |      |                     |      |
        v      |                     v      |
     +---------+--+               +---------+--+
     |src vfio_dev|               |dst vfio_dev|
     +--+---------+               +--+---------+
        |      ^                     |      ^
        |      |                     |      |
        v      |                     |      |
   +-----------+----+           +-----------+----+
   |src dev debugfs |           |dst dev debugfs |
   +----------------+           +----------------+

The entire debugfs directory will be based on the definition of
the CONFIG_DEBUG_FS macro. If this macro is not enabled, the
interfaces in vfio.h will be empty definitions, and the creation
and initialization of the debugfs directory will not be executed.

   vfio
    |
    +---<dev_name1>
    |    +---migration
    |        +--state
    |
    +---<dev_name2>
         +---migration
             +--state

debugfs will create a public root directory "vfio" file.
then create a dev_name() file for each live migration device.
First, create a unified state acquisition file of "migration"
in this device directory.
Then, create a public live migration state lookup file "state".

Signed-off-by: Longfang Liu <liulongfang@huawei.com>
Reviewed-by: Cédric Le Goater <clg@redhat.com>
Link: https://lore.kernel.org/r/20231106072225.28577-2-liulongfang@huawei.com
Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
---
 drivers/vfio/Kconfig      | 10 ++++++
 drivers/vfio/Makefile     |  1 +
 drivers/vfio/debugfs.c    | 92 +++++++++++++++++++++++++++++++++++++++++++++++
 drivers/vfio/vfio.h       | 14 ++++++++
 drivers/vfio/vfio_main.c  |  4 +++
 include/linux/vfio.h      |  7 ++++
 include/uapi/linux/vfio.h |  1 +
 7 files changed, 129 insertions(+)
 create mode 100644 drivers/vfio/debugfs.c

(limited to 'include/linux')

diff --git a/drivers/vfio/Kconfig b/drivers/vfio/Kconfig
index 6bda6dbb4878..ceae52fd7586 100644
--- a/drivers/vfio/Kconfig
+++ b/drivers/vfio/Kconfig
@@ -80,6 +80,16 @@ config VFIO_VIRQFD
 	select EVENTFD
 	default n
 
+config VFIO_DEBUGFS
+	bool "Export VFIO internals in DebugFS"
+	depends on DEBUG_FS
+	help
+	  Allows exposure of VFIO device internals. This option enables
+	  the use of debugfs by VFIO drivers as required. The device can
+	  cause the VFIO code create a top-level debug/vfio directory
+	  during initialization, and then populate a subdirectory with
+	  entries as required.
+
 source "drivers/vfio/pci/Kconfig"
 source "drivers/vfio/platform/Kconfig"
 source "drivers/vfio/mdev/Kconfig"
diff --git a/drivers/vfio/Makefile b/drivers/vfio/Makefile
index 68c05705200f..b2fc9fb499d8 100644
--- a/drivers/vfio/Makefile
+++ b/drivers/vfio/Makefile
@@ -7,6 +7,7 @@ vfio-$(CONFIG_VFIO_GROUP) += group.o
 vfio-$(CONFIG_IOMMUFD) += iommufd.o
 vfio-$(CONFIG_VFIO_CONTAINER) += container.o
 vfio-$(CONFIG_VFIO_VIRQFD) += virqfd.o
+vfio-$(CONFIG_VFIO_DEBUGFS) += debugfs.o
 
 obj-$(CONFIG_VFIO_IOMMU_TYPE1) += vfio_iommu_type1.o
 obj-$(CONFIG_VFIO_IOMMU_SPAPR_TCE) += vfio_iommu_spapr_tce.o
diff --git a/drivers/vfio/debugfs.c b/drivers/vfio/debugfs.c
new file mode 100644
index 000000000000..298bd866f157
--- /dev/null
+++ b/drivers/vfio/debugfs.c
@@ -0,0 +1,92 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (c) 2023, HiSilicon Ltd.
+ */
+
+#include <linux/device.h>
+#include <linux/debugfs.h>
+#include <linux/seq_file.h>
+#include <linux/vfio.h>
+#include "vfio.h"
+
+static struct dentry *vfio_debugfs_root;
+
+static int vfio_device_state_read(struct seq_file *seq, void *data)
+{
+	struct device *vf_dev = seq->private;
+	struct vfio_device *vdev = container_of(vf_dev,
+						struct vfio_device, device);
+	enum vfio_device_mig_state state;
+	int ret;
+
+	BUILD_BUG_ON(VFIO_DEVICE_STATE_NR !=
+		     VFIO_DEVICE_STATE_PRE_COPY_P2P + 1);
+
+	ret = vdev->mig_ops->migration_get_state(vdev, &state);
+	if (ret)
+		return -EINVAL;
+
+	switch (state) {
+	case VFIO_DEVICE_STATE_ERROR:
+		seq_puts(seq, "ERROR\n");
+		break;
+	case VFIO_DEVICE_STATE_STOP:
+		seq_puts(seq, "STOP\n");
+		break;
+	case VFIO_DEVICE_STATE_RUNNING:
+		seq_puts(seq, "RUNNING\n");
+		break;
+	case VFIO_DEVICE_STATE_STOP_COPY:
+		seq_puts(seq, "STOP_COPY\n");
+		break;
+	case VFIO_DEVICE_STATE_RESUMING:
+		seq_puts(seq, "RESUMING\n");
+		break;
+	case VFIO_DEVICE_STATE_RUNNING_P2P:
+		seq_puts(seq, "RUNNING_P2P\n");
+		break;
+	case VFIO_DEVICE_STATE_PRE_COPY:
+		seq_puts(seq, "PRE_COPY\n");
+		break;
+	case VFIO_DEVICE_STATE_PRE_COPY_P2P:
+		seq_puts(seq, "PRE_COPY_P2P\n");
+		break;
+	default:
+		seq_puts(seq, "Invalid\n");
+	}
+
+	return 0;
+}
+
+void vfio_device_debugfs_init(struct vfio_device *vdev)
+{
+	struct device *dev = &vdev->device;
+
+	vdev->debug_root = debugfs_create_dir(dev_name(vdev->dev),
+					      vfio_debugfs_root);
+
+	if (vdev->mig_ops) {
+		struct dentry *vfio_dev_migration = NULL;
+
+		vfio_dev_migration = debugfs_create_dir("migration",
+							vdev->debug_root);
+		debugfs_create_devm_seqfile(dev, "state", vfio_dev_migration,
+					    vfio_device_state_read);
+	}
+}
+
+void vfio_device_debugfs_exit(struct vfio_device *vdev)
+{
+	debugfs_remove_recursive(vdev->debug_root);
+}
+
+void vfio_debugfs_create_root(void)
+{
+	vfio_debugfs_root = debugfs_create_dir("vfio", NULL);
+}
+
+void vfio_debugfs_remove_root(void)
+{
+	debugfs_remove_recursive(vfio_debugfs_root);
+	vfio_debugfs_root = NULL;
+}
diff --git a/drivers/vfio/vfio.h b/drivers/vfio/vfio.h
index 307e3f29b527..bde84ad344e5 100644
--- a/drivers/vfio/vfio.h
+++ b/drivers/vfio/vfio.h
@@ -448,4 +448,18 @@ static inline void vfio_device_put_kvm(struct vfio_device *device)
 }
 #endif
 
+#ifdef CONFIG_VFIO_DEBUGFS
+void vfio_debugfs_create_root(void);
+void vfio_debugfs_remove_root(void);
+
+void vfio_device_debugfs_init(struct vfio_device *vdev);
+void vfio_device_debugfs_exit(struct vfio_device *vdev);
+#else
+static inline void vfio_debugfs_create_root(void) { }
+static inline void vfio_debugfs_remove_root(void) { }
+
+static inline void vfio_device_debugfs_init(struct vfio_device *vdev) { }
+static inline void vfio_device_debugfs_exit(struct vfio_device *vdev) { }
+#endif /* CONFIG_VFIO_DEBUGFS */
+
 #endif
diff --git a/drivers/vfio/vfio_main.c b/drivers/vfio/vfio_main.c
index 8d4995ada74a..1cc93aac99a2 100644
--- a/drivers/vfio/vfio_main.c
+++ b/drivers/vfio/vfio_main.c
@@ -311,6 +311,7 @@ static int __vfio_register_dev(struct vfio_device *device,
 	refcount_set(&device->refcount, 1);
 
 	vfio_device_group_register(device);
+	vfio_device_debugfs_init(device);
 
 	return 0;
 err_out:
@@ -378,6 +379,7 @@ void vfio_unregister_group_dev(struct vfio_device *device)
 		}
 	}
 
+	vfio_device_debugfs_exit(device);
 	/* Balances vfio_device_set_group in register path */
 	vfio_device_remove_group(device);
 }
@@ -1676,6 +1678,7 @@ static int __init vfio_init(void)
 	if (ret)
 		goto err_alloc_dev_chrdev;
 
+	vfio_debugfs_create_root();
 	pr_info(DRIVER_DESC " version: " DRIVER_VERSION "\n");
 	return 0;
 
@@ -1691,6 +1694,7 @@ err_virqfd:
 
 static void __exit vfio_cleanup(void)
 {
+	vfio_debugfs_remove_root();
 	ida_destroy(&vfio.device_ida);
 	vfio_cdev_cleanup();
 	class_destroy(vfio.device_class);
diff --git a/include/linux/vfio.h b/include/linux/vfio.h
index a65b2513f8cd..89b265bc6ec3 100644
--- a/include/linux/vfio.h
+++ b/include/linux/vfio.h
@@ -69,6 +69,13 @@ struct vfio_device {
 	u8 iommufd_attached:1;
 #endif
 	u8 cdev_opened:1;
+#ifdef CONFIG_DEBUG_FS
+	/*
+	 * debug_root is a static property of the vfio_device
+	 * which must be set prior to registering the vfio_device.
+	 */
+	struct dentry *debug_root;
+#endif
 };
 
 /**
diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h
index 7f5fb010226d..2b68e6cdf190 100644
--- a/include/uapi/linux/vfio.h
+++ b/include/uapi/linux/vfio.h
@@ -1219,6 +1219,7 @@ enum vfio_device_mig_state {
 	VFIO_DEVICE_STATE_RUNNING_P2P = 5,
 	VFIO_DEVICE_STATE_PRE_COPY = 6,
 	VFIO_DEVICE_STATE_PRE_COPY_P2P = 7,
+	VFIO_DEVICE_STATE_NR,
 };
 
 /**
-- 
cgit v1.2.3


From 20c20bd11a0702ce4dc9300c3da58acf551d9725 Mon Sep 17 00:00:00 2001
From: Hou Tao <houtao1@huawei.com>
Date: Mon, 4 Dec 2023 22:04:20 +0800
Subject: bpf: Add map and need_defer parameters to .map_fd_put_ptr()

map is the pointer of outer map, and need_defer needs some explanation.
need_defer tells the implementation to defer the reference release of
the passed element and ensure that the element is still alive before
the bpf program, which may manipulate it, exits.

The following three cases will invoke map_fd_put_ptr() and different
need_defer values will be passed to these callers:

1) release the reference of the old element in the map during map update
   or map deletion. The release must be deferred, otherwise the bpf
   program may incur use-after-free problem, so need_defer needs to be
   true.
2) release the reference of the to-be-added element in the error path of
   map update. The to-be-added element is not visible to any bpf
   program, so it is OK to pass false for need_defer parameter.
3) release the references of all elements in the map during map release.
   Any bpf program which has access to the map must have been exited and
   released, so need_defer=false will be OK.

These two parameters will be used by the following patches to fix the
potential use-after-free problem for map-in-map.

Signed-off-by: Hou Tao <houtao1@huawei.com>
Link: https://lore.kernel.org/r/20231204140425.1480317-3-houtao@huaweicloud.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/bpf.h     |  6 +++++-
 kernel/bpf/arraymap.c   | 12 +++++++-----
 kernel/bpf/hashtab.c    |  6 +++---
 kernel/bpf/map_in_map.c |  2 +-
 kernel/bpf/map_in_map.h |  2 +-
 5 files changed, 17 insertions(+), 11 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index eb447b0a9423..d273348cfb2f 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -106,7 +106,11 @@ struct bpf_map_ops {
 	/* funcs called by prog_array and perf_event_array map */
 	void *(*map_fd_get_ptr)(struct bpf_map *map, struct file *map_file,
 				int fd);
-	void (*map_fd_put_ptr)(void *ptr);
+	/* If need_defer is true, the implementation should guarantee that
+	 * the to-be-put element is still alive before the bpf program, which
+	 * may manipulate it, exists.
+	 */
+	void (*map_fd_put_ptr)(struct bpf_map *map, void *ptr, bool need_defer);
 	int (*map_gen_lookup)(struct bpf_map *map, struct bpf_insn *insn_buf);
 	u32 (*map_fd_sys_lookup_elem)(void *ptr);
 	void (*map_seq_show_elem)(struct bpf_map *map, void *key,
diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c
index 2058e89b5ddd..f9aed5909d6e 100644
--- a/kernel/bpf/arraymap.c
+++ b/kernel/bpf/arraymap.c
@@ -867,7 +867,7 @@ int bpf_fd_array_map_update_elem(struct bpf_map *map, struct file *map_file,
 	}
 
 	if (old_ptr)
-		map->ops->map_fd_put_ptr(old_ptr);
+		map->ops->map_fd_put_ptr(map, old_ptr, true);
 	return 0;
 }
 
@@ -890,7 +890,7 @@ static long fd_array_map_delete_elem(struct bpf_map *map, void *key)
 	}
 
 	if (old_ptr) {
-		map->ops->map_fd_put_ptr(old_ptr);
+		map->ops->map_fd_put_ptr(map, old_ptr, true);
 		return 0;
 	} else {
 		return -ENOENT;
@@ -913,8 +913,9 @@ static void *prog_fd_array_get_ptr(struct bpf_map *map,
 	return prog;
 }
 
-static void prog_fd_array_put_ptr(void *ptr)
+static void prog_fd_array_put_ptr(struct bpf_map *map, void *ptr, bool need_defer)
 {
+	/* bpf_prog is freed after one RCU or tasks trace grace period */
 	bpf_prog_put(ptr);
 }
 
@@ -1239,8 +1240,9 @@ err_out:
 	return ee;
 }
 
-static void perf_event_fd_array_put_ptr(void *ptr)
+static void perf_event_fd_array_put_ptr(struct bpf_map *map, void *ptr, bool need_defer)
 {
+	/* bpf_perf_event is freed after one RCU grace period */
 	bpf_event_entry_free_rcu(ptr);
 }
 
@@ -1294,7 +1296,7 @@ static void *cgroup_fd_array_get_ptr(struct bpf_map *map,
 	return cgroup_get_from_fd(fd);
 }
 
-static void cgroup_fd_array_put_ptr(void *ptr)
+static void cgroup_fd_array_put_ptr(struct bpf_map *map, void *ptr, bool need_defer)
 {
 	/* cgroup_put free cgrp after a rcu grace period */
 	cgroup_put(ptr);
diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c
index fd8d4b0addfc..5b9146fa825f 100644
--- a/kernel/bpf/hashtab.c
+++ b/kernel/bpf/hashtab.c
@@ -897,7 +897,7 @@ static void htab_put_fd_value(struct bpf_htab *htab, struct htab_elem *l)
 
 	if (map->ops->map_fd_put_ptr) {
 		ptr = fd_htab_map_get_ptr(map, l);
-		map->ops->map_fd_put_ptr(ptr);
+		map->ops->map_fd_put_ptr(map, ptr, true);
 	}
 }
 
@@ -2484,7 +2484,7 @@ static void fd_htab_map_free(struct bpf_map *map)
 		hlist_nulls_for_each_entry_safe(l, n, head, hash_node) {
 			void *ptr = fd_htab_map_get_ptr(map, l);
 
-			map->ops->map_fd_put_ptr(ptr);
+			map->ops->map_fd_put_ptr(map, ptr, false);
 		}
 	}
 
@@ -2525,7 +2525,7 @@ int bpf_fd_htab_map_update_elem(struct bpf_map *map, struct file *map_file,
 
 	ret = htab_map_update_elem(map, key, &ptr, map_flags);
 	if (ret)
-		map->ops->map_fd_put_ptr(ptr);
+		map->ops->map_fd_put_ptr(map, ptr, false);
 
 	return ret;
 }
diff --git a/kernel/bpf/map_in_map.c b/kernel/bpf/map_in_map.c
index cd5eafaba97e..2dfeb5835e16 100644
--- a/kernel/bpf/map_in_map.c
+++ b/kernel/bpf/map_in_map.c
@@ -127,7 +127,7 @@ void *bpf_map_fd_get_ptr(struct bpf_map *map,
 	return inner_map;
 }
 
-void bpf_map_fd_put_ptr(void *ptr)
+void bpf_map_fd_put_ptr(struct bpf_map *map, void *ptr, bool need_defer)
 {
 	/* ptr->ops->map_free() has to go through one
 	 * rcu grace period by itself.
diff --git a/kernel/bpf/map_in_map.h b/kernel/bpf/map_in_map.h
index bcb7534afb3c..7d61602354de 100644
--- a/kernel/bpf/map_in_map.h
+++ b/kernel/bpf/map_in_map.h
@@ -13,7 +13,7 @@ struct bpf_map *bpf_map_meta_alloc(int inner_map_ufd);
 void bpf_map_meta_free(struct bpf_map *map_meta);
 void *bpf_map_fd_get_ptr(struct bpf_map *map, struct file *map_file,
 			 int ufd);
-void bpf_map_fd_put_ptr(void *ptr);
+void bpf_map_fd_put_ptr(struct bpf_map *map, void *ptr, bool need_defer);
 u32 bpf_map_fd_sys_lookup_elem(void *ptr);
 
 #endif
-- 
cgit v1.2.3


From 876673364161da50eed6b472d746ef88242b2368 Mon Sep 17 00:00:00 2001
From: Hou Tao <houtao1@huawei.com>
Date: Mon, 4 Dec 2023 22:04:22 +0800
Subject: bpf: Defer the free of inner map when necessary

When updating or deleting an inner map in map array or map htab, the map
may still be accessed by non-sleepable program or sleepable program.
However bpf_map_fd_put_ptr() decreases the ref-counter of the inner map
directly through bpf_map_put(), if the ref-counter is the last one
(which is true for most cases), the inner map will be freed by
ops->map_free() in a kworker. But for now, most .map_free() callbacks
don't use synchronize_rcu() or its variants to wait for the elapse of a
RCU grace period, so after the invocation of ops->map_free completes,
the bpf program which is accessing the inner map may incur
use-after-free problem.

Fix the free of inner map by invoking bpf_map_free_deferred() after both
one RCU grace period and one tasks trace RCU grace period if the inner
map has been removed from the outer map before. The deferment is
accomplished by using call_rcu() or call_rcu_tasks_trace() when
releasing the last ref-counter of bpf map. The newly-added rcu_head
field in bpf_map shares the same storage space with work field to
reduce the size of bpf_map.

Fixes: bba1dc0b55ac ("bpf: Remove redundant synchronize_rcu.")
Fixes: 638e4b825d52 ("bpf: Allows per-cpu maps and map-in-map in sleepable programs")
Signed-off-by: Hou Tao <houtao1@huawei.com>
Link: https://lore.kernel.org/r/20231204140425.1480317-5-houtao@huaweicloud.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/bpf.h     |  7 ++++++-
 kernel/bpf/map_in_map.c | 11 ++++++++---
 kernel/bpf/syscall.c    | 32 +++++++++++++++++++++++++++-----
 3 files changed, 41 insertions(+), 9 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index d273348cfb2f..de3bd03cbeea 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -276,7 +276,11 @@ struct bpf_map {
 	 */
 	atomic64_t refcnt ____cacheline_aligned;
 	atomic64_t usercnt;
-	struct work_struct work;
+	/* rcu is used before freeing and work is only used during freeing */
+	union {
+		struct work_struct work;
+		struct rcu_head rcu;
+	};
 	struct mutex freeze_mutex;
 	atomic64_t writecnt;
 	/* 'Ownership' of program-containing map is claimed by the first program
@@ -292,6 +296,7 @@ struct bpf_map {
 	} owner;
 	bool bypass_spec_v1;
 	bool frozen; /* write-once; write-protected by freeze_mutex */
+	bool free_after_mult_rcu_gp;
 	s64 __percpu *elem_count;
 };
 
diff --git a/kernel/bpf/map_in_map.c b/kernel/bpf/map_in_map.c
index 2dfeb5835e16..3248ff5d8161 100644
--- a/kernel/bpf/map_in_map.c
+++ b/kernel/bpf/map_in_map.c
@@ -129,10 +129,15 @@ void *bpf_map_fd_get_ptr(struct bpf_map *map,
 
 void bpf_map_fd_put_ptr(struct bpf_map *map, void *ptr, bool need_defer)
 {
-	/* ptr->ops->map_free() has to go through one
-	 * rcu grace period by itself.
+	struct bpf_map *inner_map = ptr;
+
+	/* The inner map may still be used by both non-sleepable and sleepable
+	 * bpf program, so free it after one RCU grace period and one tasks
+	 * trace RCU grace period.
 	 */
-	bpf_map_put(ptr);
+	if (need_defer)
+		WRITE_ONCE(inner_map->free_after_mult_rcu_gp, true);
+	bpf_map_put(inner_map);
 }
 
 u32 bpf_map_fd_sys_lookup_elem(void *ptr)
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 5e43ddd1b83f..dd515f6b9741 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -719,6 +719,28 @@ static void bpf_map_put_uref(struct bpf_map *map)
 	}
 }
 
+static void bpf_map_free_in_work(struct bpf_map *map)
+{
+	INIT_WORK(&map->work, bpf_map_free_deferred);
+	/* Avoid spawning kworkers, since they all might contend
+	 * for the same mutex like slab_mutex.
+	 */
+	queue_work(system_unbound_wq, &map->work);
+}
+
+static void bpf_map_free_rcu_gp(struct rcu_head *rcu)
+{
+	bpf_map_free_in_work(container_of(rcu, struct bpf_map, rcu));
+}
+
+static void bpf_map_free_mult_rcu_gp(struct rcu_head *rcu)
+{
+	if (rcu_trace_implies_rcu_gp())
+		bpf_map_free_rcu_gp(rcu);
+	else
+		call_rcu(rcu, bpf_map_free_rcu_gp);
+}
+
 /* decrement map refcnt and schedule it for freeing via workqueue
  * (underlying map implementation ops->map_free() might sleep)
  */
@@ -728,11 +750,11 @@ void bpf_map_put(struct bpf_map *map)
 		/* bpf_map_free_id() must be called first */
 		bpf_map_free_id(map);
 		btf_put(map->btf);
-		INIT_WORK(&map->work, bpf_map_free_deferred);
-		/* Avoid spawning kworkers, since they all might contend
-		 * for the same mutex like slab_mutex.
-		 */
-		queue_work(system_unbound_wq, &map->work);
+
+		if (READ_ONCE(map->free_after_mult_rcu_gp))
+			call_rcu_tasks_trace(&map->rcu, bpf_map_free_mult_rcu_gp);
+		else
+			bpf_map_free_in_work(map);
 	}
 }
 EXPORT_SYMBOL_GPL(bpf_map_put);
-- 
cgit v1.2.3


From af66bfd3c8538ed21cf72af18426fc4a408665cf Mon Sep 17 00:00:00 2001
From: Hou Tao <houtao1@huawei.com>
Date: Mon, 4 Dec 2023 22:04:23 +0800
Subject: bpf: Optimize the free of inner map

When removing the inner map from the outer map, the inner map will be
freed after one RCU grace period and one RCU tasks trace grace
period, so it is certain that the bpf program, which may access the
inner map, has exited before the inner map is freed.

However there is no need to wait for one RCU tasks trace grace period if
the outer map is only accessed by non-sleepable program. So adding
sleepable_refcnt in bpf_map and increasing sleepable_refcnt when adding
the outer map into env->used_maps for sleepable program. Although the
max number of bpf program is INT_MAX - 1, the number of bpf programs
which are being loaded may be greater than INT_MAX, so using atomic64_t
instead of atomic_t for sleepable_refcnt. When removing the inner map
from the outer map, using sleepable_refcnt to decide whether or not a
RCU tasks trace grace period is needed before freeing the inner map.

Signed-off-by: Hou Tao <houtao1@huawei.com>
Link: https://lore.kernel.org/r/20231204140425.1480317-6-houtao@huaweicloud.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/bpf.h     |  2 ++
 kernel/bpf/core.c       |  4 ++++
 kernel/bpf/map_in_map.c | 14 +++++++++-----
 kernel/bpf/syscall.c    |  8 ++++++++
 kernel/bpf/verifier.c   |  4 +++-
 5 files changed, 26 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index de3bd03cbeea..10e5e4d8a00f 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -297,6 +297,8 @@ struct bpf_map {
 	bool bypass_spec_v1;
 	bool frozen; /* write-once; write-protected by freeze_mutex */
 	bool free_after_mult_rcu_gp;
+	bool free_after_rcu_gp;
+	atomic64_t sleepable_refcnt;
 	s64 __percpu *elem_count;
 };
 
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index cd3afe57ece3..4b813da8d6c0 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -2664,12 +2664,16 @@ void __bpf_free_used_maps(struct bpf_prog_aux *aux,
 			  struct bpf_map **used_maps, u32 len)
 {
 	struct bpf_map *map;
+	bool sleepable;
 	u32 i;
 
+	sleepable = aux->sleepable;
 	for (i = 0; i < len; i++) {
 		map = used_maps[i];
 		if (map->ops->map_poke_untrack)
 			map->ops->map_poke_untrack(map, aux);
+		if (sleepable)
+			atomic64_dec(&map->sleepable_refcnt);
 		bpf_map_put(map);
 	}
 }
diff --git a/kernel/bpf/map_in_map.c b/kernel/bpf/map_in_map.c
index 3248ff5d8161..8ef269e66ba5 100644
--- a/kernel/bpf/map_in_map.c
+++ b/kernel/bpf/map_in_map.c
@@ -131,12 +131,16 @@ void bpf_map_fd_put_ptr(struct bpf_map *map, void *ptr, bool need_defer)
 {
 	struct bpf_map *inner_map = ptr;
 
-	/* The inner map may still be used by both non-sleepable and sleepable
-	 * bpf program, so free it after one RCU grace period and one tasks
-	 * trace RCU grace period.
+	/* Defer the freeing of inner map according to the sleepable attribute
+	 * of bpf program which owns the outer map, so unnecessary waiting for
+	 * RCU tasks trace grace period can be avoided.
 	 */
-	if (need_defer)
-		WRITE_ONCE(inner_map->free_after_mult_rcu_gp, true);
+	if (need_defer) {
+		if (atomic64_read(&map->sleepable_refcnt))
+			WRITE_ONCE(inner_map->free_after_mult_rcu_gp, true);
+		else
+			WRITE_ONCE(inner_map->free_after_rcu_gp, true);
+	}
 	bpf_map_put(inner_map);
 }
 
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index dd515f6b9741..ebaccf77d56e 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -751,8 +751,11 @@ void bpf_map_put(struct bpf_map *map)
 		bpf_map_free_id(map);
 		btf_put(map->btf);
 
+		WARN_ON_ONCE(atomic64_read(&map->sleepable_refcnt));
 		if (READ_ONCE(map->free_after_mult_rcu_gp))
 			call_rcu_tasks_trace(&map->rcu, bpf_map_free_mult_rcu_gp);
+		else if (READ_ONCE(map->free_after_rcu_gp))
+			call_rcu(&map->rcu, bpf_map_free_rcu_gp);
 		else
 			bpf_map_free_in_work(map);
 	}
@@ -5345,6 +5348,11 @@ static int bpf_prog_bind_map(union bpf_attr *attr)
 		goto out_unlock;
 	}
 
+	/* The bpf program will not access the bpf map, but for the sake of
+	 * simplicity, increase sleepable_refcnt for sleepable program as well.
+	 */
+	if (prog->aux->sleepable)
+		atomic64_inc(&map->sleepable_refcnt);
 	memcpy(used_maps_new, used_maps_old,
 	       sizeof(used_maps_old[0]) * prog->aux->used_map_cnt);
 	used_maps_new[prog->aux->used_map_cnt] = map;
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index cdb4f5f0ba79..1ed39665f802 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -17889,10 +17889,12 @@ static int resolve_pseudo_ldimm64(struct bpf_verifier_env *env)
 				return -E2BIG;
 			}
 
+			if (env->prog->aux->sleepable)
+				atomic64_inc(&map->sleepable_refcnt);
 			/* hold the map. If the program is rejected by verifier,
 			 * the map will be released by release_maps() or it
 			 * will be used by the valid program until it's unloaded
-			 * and all maps are released in free_used_maps()
+			 * and all maps are released in bpf_free_used_maps()
 			 */
 			bpf_map_inc(map);
 
-- 
cgit v1.2.3


From 2a502ff0c4e42a739b5aa550c901bf3852795532 Mon Sep 17 00:00:00 2001
From: Amritha Nambiar <amritha.nambiar@intel.com>
Date: Fri, 1 Dec 2023 15:28:34 -0800
Subject: net: Add queue and napi association

Add the napi pointer in netdev queue for tracking the napi
instance for each queue. This achieves the queue<->napi mapping.

Signed-off-by: Amritha Nambiar <amritha.nambiar@intel.com>
Reviewed-by: Sridhar Samudrala <sridhar.samudrala@intel.com>
Link: https://lore.kernel.org/r/170147331483.5260.15723438819994285695.stgit@anambiarhost.jf.intel.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/linux/netdevice.h     |  8 ++++++++
 include/net/netdev_rx_queue.h |  4 ++++
 net/core/dev.c                | 37 +++++++++++++++++++++++++++++++++++++
 3 files changed, 49 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index c2d74bc112dd..5ddff11cbe26 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -665,6 +665,10 @@ struct netdev_queue {
 #ifdef CONFIG_XDP_SOCKETS
 	struct xsk_buff_pool    *pool;
 #endif
+	/* NAPI instance for the queue
+	 * Readers and writers must hold RTNL
+	 */
+	struct napi_struct      *napi;
 /*
  * write-mostly part
  */
@@ -2657,6 +2661,10 @@ static inline void *netdev_priv(const struct net_device *dev)
  */
 #define SET_NETDEV_DEVTYPE(net, devtype)	((net)->dev.type = (devtype))
 
+void netif_queue_set_napi(struct net_device *dev, unsigned int queue_index,
+			  enum netdev_queue_type type,
+			  struct napi_struct *napi);
+
 /* Default NAPI poll() weight
  * Device drivers are strongly advised to not use bigger value
  */
diff --git a/include/net/netdev_rx_queue.h b/include/net/netdev_rx_queue.h
index cdcafb30d437..aa1716fb0e53 100644
--- a/include/net/netdev_rx_queue.h
+++ b/include/net/netdev_rx_queue.h
@@ -21,6 +21,10 @@ struct netdev_rx_queue {
 #ifdef CONFIG_XDP_SOCKETS
 	struct xsk_buff_pool            *pool;
 #endif
+	/* NAPI instance for the queue
+	 * Readers and writers must hold RTNL
+	 */
+	struct napi_struct		*napi;
 } ____cacheline_aligned_in_smp;
 
 /*
diff --git a/net/core/dev.c b/net/core/dev.c
index 3950ced396b5..0a2eecbeef38 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -6400,6 +6400,43 @@ int dev_set_threaded(struct net_device *dev, bool threaded)
 }
 EXPORT_SYMBOL(dev_set_threaded);
 
+/**
+ * netif_queue_set_napi - Associate queue with the napi
+ * @dev: device to which NAPI and queue belong
+ * @queue_index: Index of queue
+ * @type: queue type as RX or TX
+ * @napi: NAPI context, pass NULL to clear previously set NAPI
+ *
+ * Set queue with its corresponding napi context. This should be done after
+ * registering the NAPI handler for the queue-vector and the queues have been
+ * mapped to the corresponding interrupt vector.
+ */
+void netif_queue_set_napi(struct net_device *dev, unsigned int queue_index,
+			  enum netdev_queue_type type, struct napi_struct *napi)
+{
+	struct netdev_rx_queue *rxq;
+	struct netdev_queue *txq;
+
+	if (WARN_ON_ONCE(napi && !napi->dev))
+		return;
+	if (dev->reg_state >= NETREG_REGISTERED)
+		ASSERT_RTNL();
+
+	switch (type) {
+	case NETDEV_QUEUE_TYPE_RX:
+		rxq = __netif_get_rx_queue(dev, queue_index);
+		rxq->napi = napi;
+		return;
+	case NETDEV_QUEUE_TYPE_TX:
+		txq = netdev_get_tx_queue(dev, queue_index);
+		txq->napi = napi;
+		return;
+	default:
+		return;
+	}
+}
+EXPORT_SYMBOL(netif_queue_set_napi);
+
 void netif_napi_add_weight(struct net_device *dev, struct napi_struct *napi,
 			   int (*poll)(struct napi_struct *, int), int weight)
 {
-- 
cgit v1.2.3


From 26793bfb5d6072326d1465343e7cbf6156abca4f Mon Sep 17 00:00:00 2001
From: Amritha Nambiar <amritha.nambiar@intel.com>
Date: Fri, 1 Dec 2023 15:29:07 -0800
Subject: net: Add NAPI IRQ support

Add support to associate the interrupt vector number for a
NAPI instance.

Signed-off-by: Amritha Nambiar <amritha.nambiar@intel.com>
Reviewed-by: Sridhar Samudrala <sridhar.samudrala@intel.com>
Link: https://lore.kernel.org/r/170147334728.5260.13221803396905901904.stgit@anambiarhost.jf.intel.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 drivers/net/ethernet/intel/ice/ice_lib.c | 2 ++
 include/linux/netdevice.h                | 6 ++++++
 net/core/dev.c                           | 1 +
 net/core/netdev-genl.c                   | 4 ++++
 4 files changed, 13 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/net/ethernet/intel/ice/ice_lib.c b/drivers/net/ethernet/intel/ice/ice_lib.c
index 83f6977fad22..626577c7d5b2 100644
--- a/drivers/net/ethernet/intel/ice/ice_lib.c
+++ b/drivers/net/ethernet/intel/ice/ice_lib.c
@@ -2978,6 +2978,8 @@ void ice_q_vector_set_napi_queues(struct ice_q_vector *q_vector, bool locked)
 		ice_queue_set_napi(q_vector->vsi->netdev, tx_ring->q_index,
 				   NETDEV_QUEUE_TYPE_TX, &q_vector->napi,
 				   locked);
+	/* Also set the interrupt number for the NAPI */
+	netif_napi_set_irq(&q_vector->napi, q_vector->irq.virq);
 }
 
 /**
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 5ddff11cbe26..5551177e024e 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -382,6 +382,7 @@ struct napi_struct {
 	/* control-path-only fields follow */
 	struct list_head	dev_list;
 	struct hlist_node	napi_hash_node;
+	int			irq;
 };
 
 enum {
@@ -2665,6 +2666,11 @@ void netif_queue_set_napi(struct net_device *dev, unsigned int queue_index,
 			  enum netdev_queue_type type,
 			  struct napi_struct *napi);
 
+static inline void netif_napi_set_irq(struct napi_struct *napi, int irq)
+{
+	napi->irq = irq;
+}
+
 /* Default NAPI poll() weight
  * Device drivers are strongly advised to not use bigger value
  */
diff --git a/net/core/dev.c b/net/core/dev.c
index 13c9d1580e62..1d720fc59161 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -6471,6 +6471,7 @@ void netif_napi_add_weight(struct net_device *dev, struct napi_struct *napi,
 	 */
 	if (dev->threaded && napi_kthread_create(napi))
 		dev->threaded = 0;
+	netif_napi_set_irq(napi, -1);
 }
 EXPORT_SYMBOL(netif_napi_add_weight);
 
diff --git a/net/core/netdev-genl.c b/net/core/netdev-genl.c
index d55c92f9f26c..9753c19e36de 100644
--- a/net/core/netdev-genl.c
+++ b/net/core/netdev-genl.c
@@ -180,7 +180,11 @@ netdev_nl_napi_fill_one(struct sk_buff *rsp, struct napi_struct *napi,
 	if (nla_put_u32(rsp, NETDEV_A_NAPI_IFINDEX, napi->dev->ifindex))
 		goto nla_put_failure;
 
+	if (napi->irq >= 0 && nla_put_u32(rsp, NETDEV_A_NAPI_IRQ, napi->irq))
+		goto nla_put_failure;
+
 	genlmsg_end(rsp, hdr);
+
 	return 0;
 
 nla_put_failure:
-- 
cgit v1.2.3


From 37e4b8df27bc68340f3fc80dbb27e3549c7f881c Mon Sep 17 00:00:00 2001
From: Jianheng Zhang <Jianheng.Zhang@synopsys.com>
Date: Fri, 1 Dec 2023 03:22:03 +0000
Subject: net: stmmac: fix FPE events losing

The status bits of register MAC_FPE_CTRL_STS are clear on read. Using
32-bit read for MAC_FPE_CTRL_STS in dwmac5_fpe_configure() and
dwmac5_fpe_send_mpacket() clear the status bits. Then the stmmac interrupt
handler missing FPE event status and leads to FPE handshaking failure and
retries.
To avoid clear status bits of MAC_FPE_CTRL_STS in dwmac5_fpe_configure()
and dwmac5_fpe_send_mpacket(), add fpe_csr to stmmac_fpe_cfg structure to
cache the control bits of MAC_FPE_CTRL_STS and to avoid reading
MAC_FPE_CTRL_STS in those methods.

Fixes: 5a5586112b92 ("net: stmmac: support FPE link partner hand-shaking procedure")
Reviewed-by: Serge Semin <fancer.lancer@gmail.com>
Signed-off-by: Jianheng Zhang <Jianheng.Zhang@synopsys.com>
Link: https://lore.kernel.org/r/CY5PR12MB637225A7CF529D5BE0FBE59CBF81A@CY5PR12MB6372.namprd12.prod.outlook.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 drivers/net/ethernet/stmicro/stmmac/dwmac5.c       | 45 +++++++++-------------
 drivers/net/ethernet/stmicro/stmmac/dwmac5.h       |  4 +-
 .../net/ethernet/stmicro/stmmac/dwxgmac2_core.c    |  3 +-
 drivers/net/ethernet/stmicro/stmmac/hwif.h         |  4 +-
 drivers/net/ethernet/stmicro/stmmac/stmmac_main.c  |  8 +++-
 drivers/net/ethernet/stmicro/stmmac/stmmac_tc.c    |  1 +
 include/linux/stmmac.h                             |  1 +
 7 files changed, 36 insertions(+), 30 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac5.c b/drivers/net/ethernet/stmicro/stmmac/dwmac5.c
index e95d35f1e5a0..8fd167501fa0 100644
--- a/drivers/net/ethernet/stmicro/stmmac/dwmac5.c
+++ b/drivers/net/ethernet/stmicro/stmmac/dwmac5.c
@@ -710,28 +710,22 @@ void dwmac5_est_irq_status(void __iomem *ioaddr, struct net_device *dev,
 	}
 }
 
-void dwmac5_fpe_configure(void __iomem *ioaddr, u32 num_txq, u32 num_rxq,
+void dwmac5_fpe_configure(void __iomem *ioaddr, struct stmmac_fpe_cfg *cfg,
+			  u32 num_txq, u32 num_rxq,
 			  bool enable)
 {
 	u32 value;
 
-	if (!enable) {
-		value = readl(ioaddr + MAC_FPE_CTRL_STS);
-
-		value &= ~EFPE;
-
-		writel(value, ioaddr + MAC_FPE_CTRL_STS);
-		return;
+	if (enable) {
+		cfg->fpe_csr = EFPE;
+		value = readl(ioaddr + GMAC_RXQ_CTRL1);
+		value &= ~GMAC_RXQCTRL_FPRQ;
+		value |= (num_rxq - 1) << GMAC_RXQCTRL_FPRQ_SHIFT;
+		writel(value, ioaddr + GMAC_RXQ_CTRL1);
+	} else {
+		cfg->fpe_csr = 0;
 	}
-
-	value = readl(ioaddr + GMAC_RXQ_CTRL1);
-	value &= ~GMAC_RXQCTRL_FPRQ;
-	value |= (num_rxq - 1) << GMAC_RXQCTRL_FPRQ_SHIFT;
-	writel(value, ioaddr + GMAC_RXQ_CTRL1);
-
-	value = readl(ioaddr + MAC_FPE_CTRL_STS);
-	value |= EFPE;
-	writel(value, ioaddr + MAC_FPE_CTRL_STS);
+	writel(cfg->fpe_csr, ioaddr + MAC_FPE_CTRL_STS);
 }
 
 int dwmac5_fpe_irq_status(void __iomem *ioaddr, struct net_device *dev)
@@ -741,6 +735,9 @@ int dwmac5_fpe_irq_status(void __iomem *ioaddr, struct net_device *dev)
 
 	status = FPE_EVENT_UNKNOWN;
 
+	/* Reads from the MAC_FPE_CTRL_STS register should only be performed
+	 * here, since the status flags of MAC_FPE_CTRL_STS are "clear on read"
+	 */
 	value = readl(ioaddr + MAC_FPE_CTRL_STS);
 
 	if (value & TRSP) {
@@ -766,19 +763,15 @@ int dwmac5_fpe_irq_status(void __iomem *ioaddr, struct net_device *dev)
 	return status;
 }
 
-void dwmac5_fpe_send_mpacket(void __iomem *ioaddr, enum stmmac_mpacket_type type)
+void dwmac5_fpe_send_mpacket(void __iomem *ioaddr, struct stmmac_fpe_cfg *cfg,
+			     enum stmmac_mpacket_type type)
 {
-	u32 value;
+	u32 value = cfg->fpe_csr;
 
-	value = readl(ioaddr + MAC_FPE_CTRL_STS);
-
-	if (type == MPACKET_VERIFY) {
-		value &= ~SRSP;
+	if (type == MPACKET_VERIFY)
 		value |= SVER;
-	} else {
-		value &= ~SVER;
+	else if (type == MPACKET_RESPONSE)
 		value |= SRSP;
-	}
 
 	writel(value, ioaddr + MAC_FPE_CTRL_STS);
 }
diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac5.h b/drivers/net/ethernet/stmicro/stmmac/dwmac5.h
index 53c138d0ff48..34e620790eb3 100644
--- a/drivers/net/ethernet/stmicro/stmmac/dwmac5.h
+++ b/drivers/net/ethernet/stmicro/stmmac/dwmac5.h
@@ -153,9 +153,11 @@ int dwmac5_est_configure(void __iomem *ioaddr, struct stmmac_est *cfg,
 			 unsigned int ptp_rate);
 void dwmac5_est_irq_status(void __iomem *ioaddr, struct net_device *dev,
 			   struct stmmac_extra_stats *x, u32 txqcnt);
-void dwmac5_fpe_configure(void __iomem *ioaddr, u32 num_txq, u32 num_rxq,
+void dwmac5_fpe_configure(void __iomem *ioaddr, struct stmmac_fpe_cfg *cfg,
+			  u32 num_txq, u32 num_rxq,
 			  bool enable);
 void dwmac5_fpe_send_mpacket(void __iomem *ioaddr,
+			     struct stmmac_fpe_cfg *cfg,
 			     enum stmmac_mpacket_type type);
 int dwmac5_fpe_irq_status(void __iomem *ioaddr, struct net_device *dev);
 
diff --git a/drivers/net/ethernet/stmicro/stmmac/dwxgmac2_core.c b/drivers/net/ethernet/stmicro/stmmac/dwxgmac2_core.c
index 453e88b75be0..a74e71db79f9 100644
--- a/drivers/net/ethernet/stmicro/stmmac/dwxgmac2_core.c
+++ b/drivers/net/ethernet/stmicro/stmmac/dwxgmac2_core.c
@@ -1484,7 +1484,8 @@ static int dwxgmac3_est_configure(void __iomem *ioaddr, struct stmmac_est *cfg,
 	return 0;
 }
 
-static void dwxgmac3_fpe_configure(void __iomem *ioaddr, u32 num_txq,
+static void dwxgmac3_fpe_configure(void __iomem *ioaddr, struct stmmac_fpe_cfg *cfg,
+				   u32 num_txq,
 				   u32 num_rxq, bool enable)
 {
 	u32 value;
diff --git a/drivers/net/ethernet/stmicro/stmmac/hwif.h b/drivers/net/ethernet/stmicro/stmmac/hwif.h
index b95d3e137813..68aa2d5ca6e5 100644
--- a/drivers/net/ethernet/stmicro/stmmac/hwif.h
+++ b/drivers/net/ethernet/stmicro/stmmac/hwif.h
@@ -412,9 +412,11 @@ struct stmmac_ops {
 			     unsigned int ptp_rate);
 	void (*est_irq_status)(void __iomem *ioaddr, struct net_device *dev,
 			       struct stmmac_extra_stats *x, u32 txqcnt);
-	void (*fpe_configure)(void __iomem *ioaddr, u32 num_txq, u32 num_rxq,
+	void (*fpe_configure)(void __iomem *ioaddr, struct stmmac_fpe_cfg *cfg,
+			      u32 num_txq, u32 num_rxq,
 			      bool enable);
 	void (*fpe_send_mpacket)(void __iomem *ioaddr,
+				 struct stmmac_fpe_cfg *cfg,
 				 enum stmmac_mpacket_type type);
 	int (*fpe_irq_status)(void __iomem *ioaddr, struct net_device *dev);
 };
diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
index 2afb2bd25977..37e64283f910 100644
--- a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
+++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
@@ -964,7 +964,8 @@ static void stmmac_fpe_link_state_handle(struct stmmac_priv *priv, bool is_up)
 	bool *hs_enable = &fpe_cfg->hs_enable;
 
 	if (is_up && *hs_enable) {
-		stmmac_fpe_send_mpacket(priv, priv->ioaddr, MPACKET_VERIFY);
+		stmmac_fpe_send_mpacket(priv, priv->ioaddr, fpe_cfg,
+					MPACKET_VERIFY);
 	} else {
 		*lo_state = FPE_STATE_OFF;
 		*lp_state = FPE_STATE_OFF;
@@ -5839,6 +5840,7 @@ static void stmmac_fpe_event_status(struct stmmac_priv *priv, int status)
 		/* If user has requested FPE enable, quickly response */
 		if (*hs_enable)
 			stmmac_fpe_send_mpacket(priv, priv->ioaddr,
+						fpe_cfg,
 						MPACKET_RESPONSE);
 	}
 
@@ -7263,6 +7265,7 @@ static void stmmac_fpe_lp_task(struct work_struct *work)
 		if (*lo_state == FPE_STATE_ENTERING_ON &&
 		    *lp_state == FPE_STATE_ENTERING_ON) {
 			stmmac_fpe_configure(priv, priv->ioaddr,
+					     fpe_cfg,
 					     priv->plat->tx_queues_to_use,
 					     priv->plat->rx_queues_to_use,
 					     *enable);
@@ -7281,6 +7284,7 @@ static void stmmac_fpe_lp_task(struct work_struct *work)
 			netdev_info(priv->dev, SEND_VERIFY_MPAKCET_FMT,
 				    *lo_state, *lp_state);
 			stmmac_fpe_send_mpacket(priv, priv->ioaddr,
+						fpe_cfg,
 						MPACKET_VERIFY);
 		}
 		/* Sleep then retry */
@@ -7295,6 +7299,7 @@ void stmmac_fpe_handshake(struct stmmac_priv *priv, bool enable)
 	if (priv->plat->fpe_cfg->hs_enable != enable) {
 		if (enable) {
 			stmmac_fpe_send_mpacket(priv, priv->ioaddr,
+						priv->plat->fpe_cfg,
 						MPACKET_VERIFY);
 		} else {
 			priv->plat->fpe_cfg->lo_fpe_state = FPE_STATE_OFF;
@@ -7755,6 +7760,7 @@ int stmmac_suspend(struct device *dev)
 	if (priv->dma_cap.fpesel) {
 		/* Disable FPE */
 		stmmac_fpe_configure(priv, priv->ioaddr,
+				     priv->plat->fpe_cfg,
 				     priv->plat->tx_queues_to_use,
 				     priv->plat->rx_queues_to_use, false);
 
diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_tc.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_tc.c
index ac41ef4cbd2f..6ad3e0a11936 100644
--- a/drivers/net/ethernet/stmicro/stmmac/stmmac_tc.c
+++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_tc.c
@@ -1079,6 +1079,7 @@ disable:
 
 	priv->plat->fpe_cfg->enable = false;
 	stmmac_fpe_configure(priv, priv->ioaddr,
+			     priv->plat->fpe_cfg,
 			     priv->plat->tx_queues_to_use,
 			     priv->plat->rx_queues_to_use,
 			     false);
diff --git a/include/linux/stmmac.h b/include/linux/stmmac.h
index 0b4658a7eceb..dee5ad6e48c5 100644
--- a/include/linux/stmmac.h
+++ b/include/linux/stmmac.h
@@ -175,6 +175,7 @@ struct stmmac_fpe_cfg {
 	bool hs_enable;				/* FPE handshake enable */
 	enum stmmac_fpe_state lp_fpe_state;	/* Link Partner FPE state */
 	enum stmmac_fpe_state lo_fpe_state;	/* Local station FPE state */
+	u32 fpe_csr;				/* MAC_FPE_CTRL_STS reg cache */
 };
 
 struct stmmac_safety_feature_cfg {
-- 
cgit v1.2.3


From a5e400a985df8041ed4659ed1462aa9134318130 Mon Sep 17 00:00:00 2001
From: Leon Romanovsky <leonro@nvidia.com>
Date: Sun, 20 Aug 2023 20:58:56 +0300
Subject: net/mlx5e: Honor user choice of IPsec replay window size

Users can configure IPsec replay window size, but mlx5 driver didn't
honor their choice and set always 32bits. Fix assignment logic to
configure right size from the beginning.

Fixes: 7db21ef4566e ("net/mlx5e: Set IPsec replay sequence numbers")
Reviewed-by: Patrisious Haddad <phaddad@nvidia.com>
Signed-off-by: Leon Romanovsky <leonro@nvidia.com>
---
 .../ethernet/mellanox/mlx5/core/en_accel/ipsec.c    | 21 +++++++++++++++++++++
 .../mellanox/mlx5/core/en_accel/ipsec_offload.c     |  2 +-
 include/linux/mlx5/mlx5_ifc.h                       |  7 +++++++
 3 files changed, 29 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ipsec.c b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ipsec.c
index 655496598c68..4028932d93ce 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ipsec.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ipsec.c
@@ -335,6 +335,27 @@ void mlx5e_ipsec_build_accel_xfrm_attrs(struct mlx5e_ipsec_sa_entry *sa_entry,
 		attrs->replay_esn.esn = sa_entry->esn_state.esn;
 		attrs->replay_esn.esn_msb = sa_entry->esn_state.esn_msb;
 		attrs->replay_esn.overlap = sa_entry->esn_state.overlap;
+		switch (x->replay_esn->replay_window) {
+		case 32:
+			attrs->replay_esn.replay_window =
+				MLX5_IPSEC_ASO_REPLAY_WIN_32BIT;
+			break;
+		case 64:
+			attrs->replay_esn.replay_window =
+				MLX5_IPSEC_ASO_REPLAY_WIN_64BIT;
+			break;
+		case 128:
+			attrs->replay_esn.replay_window =
+				MLX5_IPSEC_ASO_REPLAY_WIN_128BIT;
+			break;
+		case 256:
+			attrs->replay_esn.replay_window =
+				MLX5_IPSEC_ASO_REPLAY_WIN_256BIT;
+			break;
+		default:
+			WARN_ON(true);
+			return;
+		}
 	}
 
 	attrs->dir = x->xso.dir;
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ipsec_offload.c b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ipsec_offload.c
index a91f772dc981..4e018fba2d5f 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ipsec_offload.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ipsec_offload.c
@@ -95,7 +95,7 @@ static void mlx5e_ipsec_packet_setup(void *obj, u32 pdn,
 
 		if (attrs->dir == XFRM_DEV_OFFLOAD_IN) {
 			MLX5_SET(ipsec_aso, aso_ctx, window_sz,
-				 attrs->replay_esn.replay_window / 64);
+				 attrs->replay_esn.replay_window);
 			MLX5_SET(ipsec_aso, aso_ctx, mode,
 				 MLX5_IPSEC_ASO_REPLAY_PROTECTION);
 		}
diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h
index 6f3631425f38..90ca63f4bf63 100644
--- a/include/linux/mlx5/mlx5_ifc.h
+++ b/include/linux/mlx5/mlx5_ifc.h
@@ -12001,6 +12001,13 @@ enum {
 	MLX5_IPSEC_ASO_INC_SN            = 0x2,
 };
 
+enum {
+	MLX5_IPSEC_ASO_REPLAY_WIN_32BIT  = 0x0,
+	MLX5_IPSEC_ASO_REPLAY_WIN_64BIT  = 0x1,
+	MLX5_IPSEC_ASO_REPLAY_WIN_128BIT = 0x2,
+	MLX5_IPSEC_ASO_REPLAY_WIN_256BIT = 0x3,
+};
+
 struct mlx5_ifc_ipsec_aso_bits {
 	u8         valid[0x1];
 	u8         reserved_at_201[0x1];
-- 
cgit v1.2.3


From c2bf84f1d1a1595dcc45fe867f0e02b331993fee Mon Sep 17 00:00:00 2001
From: Leon Romanovsky <leonro@nvidia.com>
Date: Sun, 12 Nov 2023 13:50:00 +0200
Subject: net/mlx5e: Tidy up IPsec NAT-T SA discovery

IPsec NAT-T packets are UDP encapsulated packets over ESP normal ones.
In case they arrive to RX, the SPI and ESP are located in inner header,
while the check was performed on outer header instead.

That wrong check caused to the situation where received rekeying request
was missed and caused to rekey timeout, which "compensated" this failure
by completing rekeying.

Fixes: d65954934937 ("net/mlx5e: Support IPsec NAT-T functionality")
Signed-off-by: Leon Romanovsky <leonro@nvidia.com>
---
 .../mellanox/mlx5/core/en_accel/ipsec_fs.c         | 22 ++++++++++++++++------
 include/linux/mlx5/mlx5_ifc.h                      |  2 +-
 2 files changed, 17 insertions(+), 7 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ipsec_fs.c b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ipsec_fs.c
index aeb399d8dae5..7a789061c998 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ipsec_fs.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ipsec_fs.c
@@ -1212,13 +1212,22 @@ static void setup_fte_esp(struct mlx5_flow_spec *spec)
 	MLX5_SET(fte_match_param, spec->match_value, outer_headers.ip_protocol, IPPROTO_ESP);
 }
 
-static void setup_fte_spi(struct mlx5_flow_spec *spec, u32 spi)
+static void setup_fte_spi(struct mlx5_flow_spec *spec, u32 spi, bool encap)
 {
 	/* SPI number */
 	spec->match_criteria_enable |= MLX5_MATCH_MISC_PARAMETERS;
 
-	MLX5_SET_TO_ONES(fte_match_param, spec->match_criteria, misc_parameters.outer_esp_spi);
-	MLX5_SET(fte_match_param, spec->match_value, misc_parameters.outer_esp_spi, spi);
+	if (encap) {
+		MLX5_SET_TO_ONES(fte_match_param, spec->match_criteria,
+				 misc_parameters.inner_esp_spi);
+		MLX5_SET(fte_match_param, spec->match_value,
+			 misc_parameters.inner_esp_spi, spi);
+	} else {
+		MLX5_SET_TO_ONES(fte_match_param, spec->match_criteria,
+				 misc_parameters.outer_esp_spi);
+		MLX5_SET(fte_match_param, spec->match_value,
+			 misc_parameters.outer_esp_spi, spi);
+	}
 }
 
 static void setup_fte_no_frags(struct mlx5_flow_spec *spec)
@@ -1596,8 +1605,9 @@ static int rx_add_rule(struct mlx5e_ipsec_sa_entry *sa_entry)
 	else
 		setup_fte_addr6(spec, attrs->saddr.a6, attrs->daddr.a6);
 
-	setup_fte_spi(spec, attrs->spi);
-	setup_fte_esp(spec);
+	setup_fte_spi(spec, attrs->spi, attrs->encap);
+	if (!attrs->encap)
+		setup_fte_esp(spec);
 	setup_fte_no_frags(spec);
 	setup_fte_upper_proto_match(spec, &attrs->upspec);
 
@@ -1719,7 +1729,7 @@ static int tx_add_rule(struct mlx5e_ipsec_sa_entry *sa_entry)
 
 	switch (attrs->type) {
 	case XFRM_DEV_OFFLOAD_CRYPTO:
-		setup_fte_spi(spec, attrs->spi);
+		setup_fte_spi(spec, attrs->spi, false);
 		setup_fte_esp(spec);
 		setup_fte_reg_a(spec);
 		break;
diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h
index 90ca63f4bf63..3f7b664d625b 100644
--- a/include/linux/mlx5/mlx5_ifc.h
+++ b/include/linux/mlx5/mlx5_ifc.h
@@ -621,7 +621,7 @@ struct mlx5_ifc_fte_match_set_misc_bits {
 
 	u8         reserved_at_140[0x8];
 	u8         bth_dst_qp[0x18];
-	u8	   reserved_at_160[0x20];
+	u8	   inner_esp_spi[0x20];
 	u8	   outer_esp_spi[0x20];
 	u8         reserved_at_1a0[0x60];
 };
-- 
cgit v1.2.3


From 70da1d01edf6da3fde1df98b2125a77083a0fb82 Mon Sep 17 00:00:00 2001
From: Vlastimil Babka <vbabka@suse.cz>
Date: Mon, 2 Oct 2023 16:36:55 +0200
Subject: cpu/hotplug: remove CPUHP_SLAB_PREPARE hooks

The CPUHP_SLAB_PREPARE hooks are only used by SLAB which is removed.
SLUB defines them as NULL, so we can remove those altogether.

Acked-by: Thomas Gleixner <tglx@linutronix.de>
Acked-by: David Rientjes <rientjes@google.com>
Tested-by: David Rientjes <rientjes@google.com>
Reviewed-by: Hyeonggon Yoo <42.hyeyoo@gmail.com>
Tested-by: Hyeonggon Yoo <42.hyeyoo@gmail.com>
Signed-off-by: Vlastimil Babka <vbabka@suse.cz>
---
 include/linux/cpuhotplug.h | 1 -
 include/linux/slab.h       | 8 --------
 kernel/cpu.c               | 5 -----
 3 files changed, 14 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/cpuhotplug.h b/include/linux/cpuhotplug.h
index d305db70674b..07cb8f7030b6 100644
--- a/include/linux/cpuhotplug.h
+++ b/include/linux/cpuhotplug.h
@@ -108,7 +108,6 @@ enum cpuhp_state {
 	CPUHP_X2APIC_PREPARE,
 	CPUHP_SMPCFD_PREPARE,
 	CPUHP_RELAY_PREPARE,
-	CPUHP_SLAB_PREPARE,
 	CPUHP_MD_RAID5_PREPARE,
 	CPUHP_RCUTREE_PREP,
 	CPUHP_CPUIDLE_COUPLED_PREPARE,
diff --git a/include/linux/slab.h b/include/linux/slab.h
index d6d6ffeeb9a2..34e43cddc520 100644
--- a/include/linux/slab.h
+++ b/include/linux/slab.h
@@ -788,12 +788,4 @@ size_t kmalloc_size_roundup(size_t size);
 
 void __init kmem_cache_init_late(void);
 
-#if defined(CONFIG_SMP) && defined(CONFIG_SLAB)
-int slab_prepare_cpu(unsigned int cpu);
-int slab_dead_cpu(unsigned int cpu);
-#else
-#define slab_prepare_cpu	NULL
-#define slab_dead_cpu		NULL
-#endif
-
 #endif	/* _LINUX_SLAB_H */
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 9e4c6780adde..530b026d95a1 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -2125,11 +2125,6 @@ static struct cpuhp_step cpuhp_hp_states[] = {
 		.startup.single		= relay_prepare_cpu,
 		.teardown.single	= NULL,
 	},
-	[CPUHP_SLAB_PREPARE] = {
-		.name			= "slab:prepare",
-		.startup.single		= slab_prepare_cpu,
-		.teardown.single	= slab_dead_cpu,
-	},
 	[CPUHP_RCUTREE_PREP] = {
 		.name			= "RCU/tree:prepare",
 		.startup.single		= rcutree_prepare_cpu,
-- 
cgit v1.2.3


From a9e0b9f27266d46ed6e73aac8d0844602cd0cb93 Mon Sep 17 00:00:00 2001
From: Vlastimil Babka <vbabka@suse.cz>
Date: Mon, 2 Oct 2023 17:43:38 +0200
Subject: mm/slab: remove CONFIG_SLAB code from slab common code

In slab_common.c and slab.h headers, we can now remove all code behind
CONFIG_SLAB and CONFIG_DEBUG_SLAB ifdefs, and remove all CONFIG_SLUB
ifdefs.

Reviewed-by: Kees Cook <keescook@chromium.org>
Acked-by: David Rientjes <rientjes@google.com>
Tested-by: David Rientjes <rientjes@google.com>
Reviewed-by: Hyeonggon Yoo <42.hyeyoo@gmail.com>
Tested-by: Hyeonggon Yoo <42.hyeyoo@gmail.com>
Signed-off-by: Vlastimil Babka <vbabka@suse.cz>
---
 include/linux/slab.h | 14 ++---------
 mm/slab.h            | 69 ++++------------------------------------------------
 mm/slab_common.c     | 22 ++---------------
 3 files changed, 9 insertions(+), 96 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/slab.h b/include/linux/slab.h
index 34e43cddc520..b2015d0e01ad 100644
--- a/include/linux/slab.h
+++ b/include/linux/slab.h
@@ -24,7 +24,7 @@
 
 /*
  * Flags to pass to kmem_cache_create().
- * The ones marked DEBUG are only valid if CONFIG_DEBUG_SLAB is set.
+ * The ones marked DEBUG need CONFIG_SLUB_DEBUG enabled, otherwise are no-op
  */
 /* DEBUG: Perform (expensive) checks on alloc/free */
 #define SLAB_CONSISTENCY_CHECKS	((slab_flags_t __force)0x00000100U)
@@ -302,25 +302,15 @@ static inline unsigned int arch_slab_minalign(void)
  * Kmalloc array related definitions
  */
 
-#ifdef CONFIG_SLAB
 /*
- * SLAB and SLUB directly allocates requests fitting in to an order-1 page
+ * SLUB directly allocates requests fitting in to an order-1 page
  * (PAGE_SIZE*2).  Larger requests are passed to the page allocator.
  */
 #define KMALLOC_SHIFT_HIGH	(PAGE_SHIFT + 1)
 #define KMALLOC_SHIFT_MAX	(MAX_ORDER + PAGE_SHIFT)
 #ifndef KMALLOC_SHIFT_LOW
-#define KMALLOC_SHIFT_LOW	5
-#endif
-#endif
-
-#ifdef CONFIG_SLUB
-#define KMALLOC_SHIFT_HIGH	(PAGE_SHIFT + 1)
-#define KMALLOC_SHIFT_MAX	(MAX_ORDER + PAGE_SHIFT)
-#ifndef KMALLOC_SHIFT_LOW
 #define KMALLOC_SHIFT_LOW	3
 #endif
-#endif
 
 /* Maximum allocatable size */
 #define KMALLOC_MAX_SIZE	(1UL << KMALLOC_SHIFT_MAX)
diff --git a/mm/slab.h b/mm/slab.h
index 3d07fb428393..014c36ea51fa 100644
--- a/mm/slab.h
+++ b/mm/slab.h
@@ -42,21 +42,6 @@ typedef union {
 struct slab {
 	unsigned long __page_flags;
 
-#if defined(CONFIG_SLAB)
-
-	struct kmem_cache *slab_cache;
-	union {
-		struct {
-			struct list_head slab_list;
-			void *freelist;	/* array of free object indexes */
-			void *s_mem;	/* first object */
-		};
-		struct rcu_head rcu_head;
-	};
-	unsigned int active;
-
-#elif defined(CONFIG_SLUB)
-
 	struct kmem_cache *slab_cache;
 	union {
 		struct {
@@ -91,10 +76,6 @@ struct slab {
 	};
 	unsigned int __unused;
 
-#else
-#error "Unexpected slab allocator configured"
-#endif
-
 	atomic_t __page_refcount;
 #ifdef CONFIG_MEMCG
 	unsigned long memcg_data;
@@ -111,7 +92,7 @@ SLAB_MATCH(memcg_data, memcg_data);
 #endif
 #undef SLAB_MATCH
 static_assert(sizeof(struct slab) <= sizeof(struct page));
-#if defined(system_has_freelist_aba) && defined(CONFIG_SLUB)
+#if defined(system_has_freelist_aba)
 static_assert(IS_ALIGNED(offsetof(struct slab, freelist), sizeof(freelist_aba_t)));
 #endif
 
@@ -228,13 +209,7 @@ static inline size_t slab_size(const struct slab *slab)
 	return PAGE_SIZE << slab_order(slab);
 }
 
-#ifdef CONFIG_SLAB
-#include <linux/slab_def.h>
-#endif
-
-#ifdef CONFIG_SLUB
 #include <linux/slub_def.h>
-#endif
 
 #include <linux/memcontrol.h>
 #include <linux/fault-inject.h>
@@ -320,26 +295,16 @@ static inline bool is_kmalloc_cache(struct kmem_cache *s)
 			 SLAB_CACHE_DMA32 | SLAB_PANIC | \
 			 SLAB_TYPESAFE_BY_RCU | SLAB_DEBUG_OBJECTS )
 
-#if defined(CONFIG_DEBUG_SLAB)
-#define SLAB_DEBUG_FLAGS (SLAB_RED_ZONE | SLAB_POISON | SLAB_STORE_USER)
-#elif defined(CONFIG_SLUB_DEBUG)
+#ifdef CONFIG_SLUB_DEBUG
 #define SLAB_DEBUG_FLAGS (SLAB_RED_ZONE | SLAB_POISON | SLAB_STORE_USER | \
 			  SLAB_TRACE | SLAB_CONSISTENCY_CHECKS)
 #else
 #define SLAB_DEBUG_FLAGS (0)
 #endif
 
-#if defined(CONFIG_SLAB)
-#define SLAB_CACHE_FLAGS (SLAB_MEM_SPREAD | SLAB_NOLEAKTRACE | \
-			  SLAB_RECLAIM_ACCOUNT | SLAB_TEMPORARY | \
-			  SLAB_ACCOUNT | SLAB_NO_MERGE)
-#elif defined(CONFIG_SLUB)
 #define SLAB_CACHE_FLAGS (SLAB_NOLEAKTRACE | SLAB_RECLAIM_ACCOUNT | \
 			  SLAB_TEMPORARY | SLAB_ACCOUNT | \
 			  SLAB_NO_USER_FLAGS | SLAB_KMALLOC | SLAB_NO_MERGE)
-#else
-#define SLAB_CACHE_FLAGS (SLAB_NOLEAKTRACE)
-#endif
 
 /* Common flags available with current configuration */
 #define CACHE_CREATE_MASK (SLAB_CORE_FLAGS | SLAB_DEBUG_FLAGS | SLAB_CACHE_FLAGS)
@@ -672,18 +637,14 @@ size_t __ksize(const void *objp);
 
 static inline size_t slab_ksize(const struct kmem_cache *s)
 {
-#ifndef CONFIG_SLUB
-	return s->object_size;
-
-#else /* CONFIG_SLUB */
-# ifdef CONFIG_SLUB_DEBUG
+#ifdef CONFIG_SLUB_DEBUG
 	/*
 	 * Debugging requires use of the padding between object
 	 * and whatever may come after it.
 	 */
 	if (s->flags & (SLAB_RED_ZONE | SLAB_POISON))
 		return s->object_size;
-# endif
+#endif
 	if (s->flags & SLAB_KASAN)
 		return s->object_size;
 	/*
@@ -697,7 +658,6 @@ static inline size_t slab_ksize(const struct kmem_cache *s)
 	 * Else we can use all the padding etc for the allocation
 	 */
 	return s->size;
-#endif
 }
 
 static inline struct kmem_cache *slab_pre_alloc_hook(struct kmem_cache *s,
@@ -775,23 +735,6 @@ static inline void slab_post_alloc_hook(struct kmem_cache *s,
  * The slab lists for all objects.
  */
 struct kmem_cache_node {
-#ifdef CONFIG_SLAB
-	raw_spinlock_t list_lock;
-	struct list_head slabs_partial;	/* partial list first, better asm code */
-	struct list_head slabs_full;
-	struct list_head slabs_free;
-	unsigned long total_slabs;	/* length of all slab lists */
-	unsigned long free_slabs;	/* length of free slab list only */
-	unsigned long free_objects;
-	unsigned int free_limit;
-	unsigned int colour_next;	/* Per-node cache coloring */
-	struct array_cache *shared;	/* shared per node */
-	struct alien_cache **alien;	/* on other nodes */
-	unsigned long next_reap;	/* updated without locking */
-	int free_touched;		/* updated without locking */
-#endif
-
-#ifdef CONFIG_SLUB
 	spinlock_t list_lock;
 	unsigned long nr_partial;
 	struct list_head partial;
@@ -800,8 +743,6 @@ struct kmem_cache_node {
 	atomic_long_t total_objects;
 	struct list_head full;
 #endif
-#endif
-
 };
 
 static inline struct kmem_cache_node *get_node(struct kmem_cache *s, int node)
@@ -818,7 +759,7 @@ static inline struct kmem_cache_node *get_node(struct kmem_cache *s, int node)
 		 if ((__n = get_node(__s, __node)))
 
 
-#if defined(CONFIG_SLAB) || defined(CONFIG_SLUB_DEBUG)
+#ifdef CONFIG_SLUB_DEBUG
 void dump_unreclaimable_slab(void);
 #else
 static inline void dump_unreclaimable_slab(void)
diff --git a/mm/slab_common.c b/mm/slab_common.c
index 8d431193c273..63b8411db7ce 100644
--- a/mm/slab_common.c
+++ b/mm/slab_common.c
@@ -71,10 +71,8 @@ static int __init setup_slab_merge(char *str)
 	return 1;
 }
 
-#ifdef CONFIG_SLUB
 __setup_param("slub_nomerge", slub_nomerge, setup_slab_nomerge, 0);
 __setup_param("slub_merge", slub_merge, setup_slab_merge, 0);
-#endif
 
 __setup("slab_nomerge", setup_slab_nomerge);
 __setup("slab_merge", setup_slab_merge);
@@ -197,10 +195,6 @@ struct kmem_cache *find_mergeable(unsigned int size, unsigned int align,
 		if (s->size - size >= sizeof(void *))
 			continue;
 
-		if (IS_ENABLED(CONFIG_SLAB) && align &&
-			(align > s->align || s->align % align))
-			continue;
-
 		return s;
 	}
 	return NULL;
@@ -1222,12 +1216,8 @@ void cache_random_seq_destroy(struct kmem_cache *cachep)
 }
 #endif /* CONFIG_SLAB_FREELIST_RANDOM */
 
-#if defined(CONFIG_SLAB) || defined(CONFIG_SLUB_DEBUG)
-#ifdef CONFIG_SLAB
-#define SLABINFO_RIGHTS (0600)
-#else
+#ifdef CONFIG_SLUB_DEBUG
 #define SLABINFO_RIGHTS (0400)
-#endif
 
 static void print_slabinfo_header(struct seq_file *m)
 {
@@ -1235,18 +1225,10 @@ static void print_slabinfo_header(struct seq_file *m)
 	 * Output format version, so at least we can change it
 	 * without _too_ many complaints.
 	 */
-#ifdef CONFIG_DEBUG_SLAB
-	seq_puts(m, "slabinfo - version: 2.1 (statistics)\n");
-#else
 	seq_puts(m, "slabinfo - version: 2.1\n");
-#endif
 	seq_puts(m, "# name            <active_objs> <num_objs> <objsize> <objperslab> <pagesperslab>");
 	seq_puts(m, " : tunables <limit> <batchcount> <sharedfactor>");
 	seq_puts(m, " : slabdata <active_slabs> <num_slabs> <sharedavail>");
-#ifdef CONFIG_DEBUG_SLAB
-	seq_puts(m, " : globalstat <listallocs> <maxobjs> <grown> <reaped> <error> <maxfreeable> <nodeallocs> <remotefrees> <alienoverflow>");
-	seq_puts(m, " : cpustat <allochit> <allocmiss> <freehit> <freemiss>");
-#endif
 	seq_putc(m, '\n');
 }
 
@@ -1370,7 +1352,7 @@ static int __init slab_proc_init(void)
 }
 module_init(slab_proc_init);
 
-#endif /* CONFIG_SLAB || CONFIG_SLUB_DEBUG */
+#endif /* CONFIG_SLUB_DEBUG */
 
 static __always_inline __realloc_size(2) void *
 __do_krealloc(const void *p, size_t new_size, gfp_t flags)
-- 
cgit v1.2.3


From 6ac805d13870925c787a28e3fe5cc73610cacd03 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Mon, 4 Dec 2023 10:47:49 -0700
Subject: iov_iter: remove unused 'iov' argument from import_single_range()

It is entirely unused, just get rid of it.

Signed-off-by: Jens Axboe <axboe@kernel.dk>
Link: https://lore.kernel.org/r/20231204174827.1258875-2-axboe@kernel.dk
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 drivers/block/ublk_drv.c         | 6 ++----
 drivers/char/random.c            | 6 ++----
 fs/aio.c                         | 2 +-
 include/linux/uio.h              | 2 +-
 kernel/trace/trace_events_user.c | 3 +--
 lib/iov_iter.c                   | 2 +-
 net/ipv4/tcp.c                   | 6 ++----
 net/socket.c                     | 6 ++----
 security/keys/keyctl.c           | 3 +--
 9 files changed, 13 insertions(+), 23 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c
index 83600b45e12a..5656b0a1233d 100644
--- a/drivers/block/ublk_drv.c
+++ b/drivers/block/ublk_drv.c
@@ -893,11 +893,10 @@ static int ublk_map_io(const struct ublk_queue *ubq, const struct request *req,
 	 */
 	if (ublk_need_map_req(req)) {
 		struct iov_iter iter;
-		struct iovec iov;
 		const int dir = ITER_DEST;
 
 		import_single_range(dir, u64_to_user_ptr(io->addr), rq_bytes,
-				&iov, &iter);
+					&iter);
 
 		return ublk_copy_user_pages(req, 0, &iter, dir);
 	}
@@ -915,13 +914,12 @@ static int ublk_unmap_io(const struct ublk_queue *ubq,
 
 	if (ublk_need_unmap_req(req)) {
 		struct iov_iter iter;
-		struct iovec iov;
 		const int dir = ITER_SOURCE;
 
 		WARN_ON_ONCE(io->res > rq_bytes);
 
 		import_single_range(dir, u64_to_user_ptr(io->addr), io->res,
-				&iov, &iter);
+					&iter);
 		return ublk_copy_user_pages(req, 0, &iter, dir);
 	}
 	return rq_bytes;
diff --git a/drivers/char/random.c b/drivers/char/random.c
index 4a9c79391dee..e79ae238b30d 100644
--- a/drivers/char/random.c
+++ b/drivers/char/random.c
@@ -1364,7 +1364,6 @@ static void __cold try_to_generate_entropy(void)
 SYSCALL_DEFINE3(getrandom, char __user *, ubuf, size_t, len, unsigned int, flags)
 {
 	struct iov_iter iter;
-	struct iovec iov;
 	int ret;
 
 	if (flags & ~(GRND_NONBLOCK | GRND_RANDOM | GRND_INSECURE))
@@ -1385,7 +1384,7 @@ SYSCALL_DEFINE3(getrandom, char __user *, ubuf, size_t, len, unsigned int, flags
 			return ret;
 	}
 
-	ret = import_single_range(ITER_DEST, ubuf, len, &iov, &iter);
+	ret = import_single_range(ITER_DEST, ubuf, len, &iter);
 	if (unlikely(ret))
 		return ret;
 	return get_random_bytes_user(&iter);
@@ -1491,7 +1490,6 @@ static long random_ioctl(struct file *f, unsigned int cmd, unsigned long arg)
 		return 0;
 	case RNDADDENTROPY: {
 		struct iov_iter iter;
-		struct iovec iov;
 		ssize_t ret;
 		int len;
 
@@ -1503,7 +1501,7 @@ static long random_ioctl(struct file *f, unsigned int cmd, unsigned long arg)
 			return -EINVAL;
 		if (get_user(len, p++))
 			return -EFAULT;
-		ret = import_single_range(ITER_SOURCE, p, len, &iov, &iter);
+		ret = import_single_range(ITER_SOURCE, p, len, &iter);
 		if (unlikely(ret))
 			return ret;
 		ret = write_pool_user(&iter);
diff --git a/fs/aio.c b/fs/aio.c
index f8589caef9c1..251eeaef7fbf 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -1498,7 +1498,7 @@ static ssize_t aio_setup_rw(int rw, const struct iocb *iocb,
 	size_t len = iocb->aio_nbytes;
 
 	if (!vectored) {
-		ssize_t ret = import_single_range(rw, buf, len, *iovec, iter);
+		ssize_t ret = import_single_range(rw, buf, len, iter);
 		*iovec = NULL;
 		return ret;
 	}
diff --git a/include/linux/uio.h b/include/linux/uio.h
index b6214cbf2a43..bfafd3542fa7 100644
--- a/include/linux/uio.h
+++ b/include/linux/uio.h
@@ -348,7 +348,7 @@ ssize_t __import_iovec(int type, const struct iovec __user *uvec,
 		 unsigned nr_segs, unsigned fast_segs, struct iovec **iovp,
 		 struct iov_iter *i, bool compat);
 int import_single_range(int type, void __user *buf, size_t len,
-		 struct iovec *iov, struct iov_iter *i);
+		 struct iov_iter *i);
 int import_ubuf(int type, void __user *buf, size_t len, struct iov_iter *i);
 
 static inline void iov_iter_ubuf(struct iov_iter *i, unsigned int direction,
diff --git a/kernel/trace/trace_events_user.c b/kernel/trace/trace_events_user.c
index 9365ce407426..4efc75d90a0d 100644
--- a/kernel/trace/trace_events_user.c
+++ b/kernel/trace/trace_events_user.c
@@ -2177,14 +2177,13 @@ static int user_events_open(struct inode *node, struct file *file)
 static ssize_t user_events_write(struct file *file, const char __user *ubuf,
 				 size_t count, loff_t *ppos)
 {
-	struct iovec iov;
 	struct iov_iter i;
 
 	if (unlikely(*ppos != 0))
 		return -EFAULT;
 
 	if (unlikely(import_single_range(ITER_SOURCE, (char __user *)ubuf,
-					 count, &iov, &i)))
+					 count, &i)))
 		return -EFAULT;
 
 	return user_events_write_core(file, &i);
diff --git a/lib/iov_iter.c b/lib/iov_iter.c
index de7d11cf4c63..d60c73354e1f 100644
--- a/lib/iov_iter.c
+++ b/lib/iov_iter.c
@@ -1370,7 +1370,7 @@ ssize_t import_iovec(int type, const struct iovec __user *uvec,
 EXPORT_SYMBOL(import_iovec);
 
 int import_single_range(int rw, void __user *buf, size_t len,
-		 struct iovec *iov, struct iov_iter *i)
+			struct iov_iter *i)
 {
 	if (len > MAX_RW_COUNT)
 		len = MAX_RW_COUNT;
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 53bcc17c91e4..57cf3adb191a 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -1849,7 +1849,6 @@ static int receive_fallback_to_copy(struct sock *sk,
 {
 	unsigned long copy_address = (unsigned long)zc->copybuf_address;
 	struct msghdr msg = {};
-	struct iovec iov;
 	int err;
 
 	zc->length = 0;
@@ -1859,7 +1858,7 @@ static int receive_fallback_to_copy(struct sock *sk,
 		return -EINVAL;
 
 	err = import_single_range(ITER_DEST, (void __user *)copy_address,
-				  inq, &iov, &msg.msg_iter);
+				  inq, &msg.msg_iter);
 	if (err)
 		return err;
 
@@ -1886,14 +1885,13 @@ static int tcp_copy_straggler_data(struct tcp_zerocopy_receive *zc,
 {
 	unsigned long copy_address = (unsigned long)zc->copybuf_address;
 	struct msghdr msg = {};
-	struct iovec iov;
 	int err;
 
 	if (copy_address != zc->copybuf_address)
 		return -EINVAL;
 
 	err = import_single_range(ITER_DEST, (void __user *)copy_address,
-				  copylen, &iov, &msg.msg_iter);
+				  copylen, &msg.msg_iter);
 	if (err)
 		return err;
 	err = skb_copy_datagram_msg(skb, *offset, &msg, copylen);
diff --git a/net/socket.c b/net/socket.c
index 3379c64217a4..1f0d0e8d0a50 100644
--- a/net/socket.c
+++ b/net/socket.c
@@ -2161,10 +2161,9 @@ int __sys_sendto(int fd, void __user *buff, size_t len, unsigned int flags,
 	struct sockaddr_storage address;
 	int err;
 	struct msghdr msg;
-	struct iovec iov;
 	int fput_needed;
 
-	err = import_single_range(ITER_SOURCE, buff, len, &iov, &msg.msg_iter);
+	err = import_single_range(ITER_SOURCE, buff, len, &msg.msg_iter);
 	if (unlikely(err))
 		return err;
 	sock = sockfd_lookup_light(fd, &err, &fput_needed);
@@ -2226,11 +2225,10 @@ int __sys_recvfrom(int fd, void __user *ubuf, size_t size, unsigned int flags,
 		.msg_name = addr ? (struct sockaddr *)&address : NULL,
 	};
 	struct socket *sock;
-	struct iovec iov;
 	int err, err2;
 	int fput_needed;
 
-	err = import_single_range(ITER_DEST, ubuf, size, &iov, &msg.msg_iter);
+	err = import_single_range(ITER_DEST, ubuf, size, &msg.msg_iter);
 	if (unlikely(err))
 		return err;
 	sock = sockfd_lookup_light(fd, &err, &fput_needed);
diff --git a/security/keys/keyctl.c b/security/keys/keyctl.c
index 19be69fa4d05..193df7ca3ca8 100644
--- a/security/keys/keyctl.c
+++ b/security/keys/keyctl.c
@@ -1252,12 +1252,11 @@ long keyctl_instantiate_key(key_serial_t id,
 			    key_serial_t ringid)
 {
 	if (_payload && plen) {
-		struct iovec iov;
 		struct iov_iter from;
 		int ret;
 
 		ret = import_single_range(ITER_SOURCE, (void __user *)_payload, plen,
-					  &iov, &from);
+					  &from);
 		if (unlikely(ret))
 			return ret;
 
-- 
cgit v1.2.3


From 9fd7874c0e5c89d7da0b4442271696ec0f8edcba Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Mon, 4 Dec 2023 10:47:50 -0700
Subject: iov_iter: replace import_single_range() with import_ubuf()

With the removal of the 'iov' argument to import_single_range(), the two
functions are now fully identical. Convert the import_single_range()
callers to import_ubuf(), and remove the former fully.

Signed-off-by: Jens Axboe <axboe@kernel.dk>
Link: https://lore.kernel.org/r/20231204174827.1258875-3-axboe@kernel.dk
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 drivers/block/ublk_drv.c         |  7 ++-----
 drivers/char/random.c            |  4 ++--
 fs/aio.c                         |  2 +-
 include/linux/uio.h              |  2 --
 kernel/trace/trace_events_user.c |  3 +--
 lib/iov_iter.c                   | 13 -------------
 net/ipv4/tcp.c                   |  8 ++++----
 net/socket.c                     |  4 ++--
 security/keys/keyctl.c           |  4 ++--
 9 files changed, 14 insertions(+), 33 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c
index 5656b0a1233d..3eaf02ebeebe 100644
--- a/drivers/block/ublk_drv.c
+++ b/drivers/block/ublk_drv.c
@@ -895,9 +895,7 @@ static int ublk_map_io(const struct ublk_queue *ubq, const struct request *req,
 		struct iov_iter iter;
 		const int dir = ITER_DEST;
 
-		import_single_range(dir, u64_to_user_ptr(io->addr), rq_bytes,
-					&iter);
-
+		import_ubuf(dir, u64_to_user_ptr(io->addr), rq_bytes, &iter);
 		return ublk_copy_user_pages(req, 0, &iter, dir);
 	}
 	return rq_bytes;
@@ -918,8 +916,7 @@ static int ublk_unmap_io(const struct ublk_queue *ubq,
 
 		WARN_ON_ONCE(io->res > rq_bytes);
 
-		import_single_range(dir, u64_to_user_ptr(io->addr), io->res,
-					&iter);
+		import_ubuf(dir, u64_to_user_ptr(io->addr), io->res, &iter);
 		return ublk_copy_user_pages(req, 0, &iter, dir);
 	}
 	return rq_bytes;
diff --git a/drivers/char/random.c b/drivers/char/random.c
index e79ae238b30d..456be28ba67c 100644
--- a/drivers/char/random.c
+++ b/drivers/char/random.c
@@ -1384,7 +1384,7 @@ SYSCALL_DEFINE3(getrandom, char __user *, ubuf, size_t, len, unsigned int, flags
 			return ret;
 	}
 
-	ret = import_single_range(ITER_DEST, ubuf, len, &iter);
+	ret = import_ubuf(ITER_DEST, ubuf, len, &iter);
 	if (unlikely(ret))
 		return ret;
 	return get_random_bytes_user(&iter);
@@ -1501,7 +1501,7 @@ static long random_ioctl(struct file *f, unsigned int cmd, unsigned long arg)
 			return -EINVAL;
 		if (get_user(len, p++))
 			return -EFAULT;
-		ret = import_single_range(ITER_SOURCE, p, len, &iter);
+		ret = import_ubuf(ITER_SOURCE, p, len, &iter);
 		if (unlikely(ret))
 			return ret;
 		ret = write_pool_user(&iter);
diff --git a/fs/aio.c b/fs/aio.c
index 251eeaef7fbf..4ea639509d41 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -1498,7 +1498,7 @@ static ssize_t aio_setup_rw(int rw, const struct iocb *iocb,
 	size_t len = iocb->aio_nbytes;
 
 	if (!vectored) {
-		ssize_t ret = import_single_range(rw, buf, len, iter);
+		ssize_t ret = import_ubuf(rw, buf, len, iter);
 		*iovec = NULL;
 		return ret;
 	}
diff --git a/include/linux/uio.h b/include/linux/uio.h
index bfafd3542fa7..bea9c89922d9 100644
--- a/include/linux/uio.h
+++ b/include/linux/uio.h
@@ -347,8 +347,6 @@ ssize_t import_iovec(int type, const struct iovec __user *uvec,
 ssize_t __import_iovec(int type, const struct iovec __user *uvec,
 		 unsigned nr_segs, unsigned fast_segs, struct iovec **iovp,
 		 struct iov_iter *i, bool compat);
-int import_single_range(int type, void __user *buf, size_t len,
-		 struct iov_iter *i);
 int import_ubuf(int type, void __user *buf, size_t len, struct iov_iter *i);
 
 static inline void iov_iter_ubuf(struct iov_iter *i, unsigned int direction,
diff --git a/kernel/trace/trace_events_user.c b/kernel/trace/trace_events_user.c
index 4efc75d90a0d..e76f5e1efdf2 100644
--- a/kernel/trace/trace_events_user.c
+++ b/kernel/trace/trace_events_user.c
@@ -2182,8 +2182,7 @@ static ssize_t user_events_write(struct file *file, const char __user *ubuf,
 	if (unlikely(*ppos != 0))
 		return -EFAULT;
 
-	if (unlikely(import_single_range(ITER_SOURCE, (char __user *)ubuf,
-					 count, &i)))
+	if (unlikely(import_ubuf(ITER_SOURCE, (char __user *)ubuf, count, &i)))
 		return -EFAULT;
 
 	return user_events_write_core(file, &i);
diff --git a/lib/iov_iter.c b/lib/iov_iter.c
index d60c73354e1f..009875bc95aa 100644
--- a/lib/iov_iter.c
+++ b/lib/iov_iter.c
@@ -1369,19 +1369,6 @@ ssize_t import_iovec(int type, const struct iovec __user *uvec,
 }
 EXPORT_SYMBOL(import_iovec);
 
-int import_single_range(int rw, void __user *buf, size_t len,
-			struct iov_iter *i)
-{
-	if (len > MAX_RW_COUNT)
-		len = MAX_RW_COUNT;
-	if (unlikely(!access_ok(buf, len)))
-		return -EFAULT;
-
-	iov_iter_ubuf(i, rw, buf, len);
-	return 0;
-}
-EXPORT_SYMBOL(import_single_range);
-
 int import_ubuf(int rw, void __user *buf, size_t len, struct iov_iter *i)
 {
 	if (len > MAX_RW_COUNT)
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 57cf3adb191a..54d3c762d400 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -1857,8 +1857,8 @@ static int receive_fallback_to_copy(struct sock *sk,
 	if (copy_address != zc->copybuf_address)
 		return -EINVAL;
 
-	err = import_single_range(ITER_DEST, (void __user *)copy_address,
-				  inq, &msg.msg_iter);
+	err = import_ubuf(ITER_DEST, (void __user *)copy_address, inq,
+			  &msg.msg_iter);
 	if (err)
 		return err;
 
@@ -1890,8 +1890,8 @@ static int tcp_copy_straggler_data(struct tcp_zerocopy_receive *zc,
 	if (copy_address != zc->copybuf_address)
 		return -EINVAL;
 
-	err = import_single_range(ITER_DEST, (void __user *)copy_address,
-				  copylen, &msg.msg_iter);
+	err = import_ubuf(ITER_DEST, (void __user *)copy_address, copylen,
+			  &msg.msg_iter);
 	if (err)
 		return err;
 	err = skb_copy_datagram_msg(skb, *offset, &msg, copylen);
diff --git a/net/socket.c b/net/socket.c
index 1f0d0e8d0a50..5d49ae0c1b79 100644
--- a/net/socket.c
+++ b/net/socket.c
@@ -2163,7 +2163,7 @@ int __sys_sendto(int fd, void __user *buff, size_t len, unsigned int flags,
 	struct msghdr msg;
 	int fput_needed;
 
-	err = import_single_range(ITER_SOURCE, buff, len, &msg.msg_iter);
+	err = import_ubuf(ITER_SOURCE, buff, len, &msg.msg_iter);
 	if (unlikely(err))
 		return err;
 	sock = sockfd_lookup_light(fd, &err, &fput_needed);
@@ -2228,7 +2228,7 @@ int __sys_recvfrom(int fd, void __user *ubuf, size_t size, unsigned int flags,
 	int err, err2;
 	int fput_needed;
 
-	err = import_single_range(ITER_DEST, ubuf, size, &msg.msg_iter);
+	err = import_ubuf(ITER_DEST, ubuf, size, &msg.msg_iter);
 	if (unlikely(err))
 		return err;
 	sock = sockfd_lookup_light(fd, &err, &fput_needed);
diff --git a/security/keys/keyctl.c b/security/keys/keyctl.c
index 193df7ca3ca8..10ba439968f7 100644
--- a/security/keys/keyctl.c
+++ b/security/keys/keyctl.c
@@ -1255,8 +1255,8 @@ long keyctl_instantiate_key(key_serial_t id,
 		struct iov_iter from;
 		int ret;
 
-		ret = import_single_range(ITER_SOURCE, (void __user *)_payload, plen,
-					  &from);
+		ret = import_ubuf(ITER_SOURCE, (void __user *)_payload, plen,
+				  &from);
 		if (unlikely(ret))
 			return ret;
 
-- 
cgit v1.2.3


From 118eb89b1e7f6807776c012cffc5c9b07fd26164 Mon Sep 17 00:00:00 2001
From: Anshuman Khandual <anshuman.khandual@arm.com>
Date: Wed, 15 Nov 2023 14:58:05 +0530
Subject: drivers: perf: arm_pmu: Drop 'pmu_lock' element from 'struct
 pmu_hw_events'

As 'pmu_lock' element is not being used in any ARM PMU implementation, just
drop this from 'struct pmu_hw_events'.

Cc: Will Deacon <will@kernel.org>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: linux-arm-kernel@lists.infradead.org
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Anshuman Khandual <anshuman.khandual@arm.com>
Acked-by: Mark Rutland <mark.rutland@arm.com>
Link: https://lore.kernel.org/r/20231115092805.737822-3-anshuman.khandual@arm.com
Signed-off-by: Will Deacon <will@kernel.org>
---
 drivers/perf/arm_pmu.c       | 1 -
 include/linux/perf/arm_pmu.h | 6 ------
 2 files changed, 7 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/perf/arm_pmu.c b/drivers/perf/arm_pmu.c
index d712a19e47ac..379479b50bdd 100644
--- a/drivers/perf/arm_pmu.c
+++ b/drivers/perf/arm_pmu.c
@@ -893,7 +893,6 @@ struct arm_pmu *armpmu_alloc(void)
 		struct pmu_hw_events *events;
 
 		events = per_cpu_ptr(pmu->hw_events, cpu);
-		raw_spin_lock_init(&events->pmu_lock);
 		events->percpu_pmu = pmu;
 	}
 
diff --git a/include/linux/perf/arm_pmu.h b/include/linux/perf/arm_pmu.h
index 143fbc10ecfe..e2503d48ddee 100644
--- a/include/linux/perf/arm_pmu.h
+++ b/include/linux/perf/arm_pmu.h
@@ -59,12 +59,6 @@ struct pmu_hw_events {
 	 */
 	DECLARE_BITMAP(used_mask, ARMPMU_MAX_HWEVENTS);
 
-	/*
-	 * Hardware lock to serialize accesses to PMU registers. Needed for the
-	 * read/modify/write sequences.
-	 */
-	raw_spinlock_t		pmu_lock;
-
 	/*
 	 * When using percpu IRQs, we need a percpu dev_id. Place it here as we
 	 * already have to allocate this struct per cpu.
-- 
cgit v1.2.3


From 41f6f64e6999a837048b1bd13a2f8742964eca6b Mon Sep 17 00:00:00 2001
From: Andrii Nakryiko <andrii@kernel.org>
Date: Tue, 5 Dec 2023 10:42:39 -0800
Subject: bpf: support non-r10 register spill/fill to/from stack in precision
 tracking

Use instruction (jump) history to record instructions that performed
register spill/fill to/from stack, regardless if this was done through
read-only r10 register, or any other register after copying r10 into it
*and* potentially adjusting offset.

To make this work reliably, we push extra per-instruction flags into
instruction history, encoding stack slot index (spi) and stack frame
number in extra 10 bit flags we take away from prev_idx in instruction
history. We don't touch idx field for maximum performance, as it's
checked most frequently during backtracking.

This change removes basically the last remaining practical limitation of
precision backtracking logic in BPF verifier. It fixes known
deficiencies, but also opens up new opportunities to reduce number of
verified states, explored in the subsequent patches.

There are only three differences in selftests' BPF object files
according to veristat, all in the positive direction (less states).

File                                    Program        Insns (A)  Insns (B)  Insns  (DIFF)  States (A)  States (B)  States (DIFF)
--------------------------------------  -------------  ---------  ---------  -------------  ----------  ----------  -------------
test_cls_redirect_dynptr.bpf.linked3.o  cls_redirect        2987       2864  -123 (-4.12%)         240         231    -9 (-3.75%)
xdp_synproxy_kern.bpf.linked3.o         syncookie_tc       82848      82661  -187 (-0.23%)        5107        5073   -34 (-0.67%)
xdp_synproxy_kern.bpf.linked3.o         syncookie_xdp      85116      84964  -152 (-0.18%)        5162        5130   -32 (-0.62%)

Note, I avoided renaming jmp_history to more generic insn_hist to
minimize number of lines changed and potential merge conflicts between
bpf and bpf-next trees.

Notice also cur_hist_entry pointer reset to NULL at the beginning of
instruction verification loop. This pointer avoids the problem of
relying on last jump history entry's insn_idx to determine whether we
already have entry for current instruction or not. It can happen that we
added jump history entry because current instruction is_jmp_point(), but
also we need to add instruction flags for stack access. In this case, we
don't want to entries, so we need to reuse last added entry, if it is
present.

Relying on insn_idx comparison has the same ambiguity problem as the one
that was fixed recently in [0], so we avoid that.

  [0] https://patchwork.kernel.org/project/netdevbpf/patch/20231110002638.4168352-3-andrii@kernel.org/

Acked-by: Eduard Zingerman <eddyz87@gmail.com>
Reported-by: Tao Lyu <tao.lyu@epfl.ch>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/r/20231205184248.1502704-2-andrii@kernel.org
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/bpf_verifier.h                       |  31 +++-
 kernel/bpf/verifier.c                              | 175 ++++++++++++---------
 .../bpf/progs/verifier_subprog_precision.c         |  23 ++-
 tools/testing/selftests/bpf/verifier/precise.c     |  38 +++--
 4 files changed, 169 insertions(+), 98 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h
index 3378cc753061..bada59812e00 100644
--- a/include/linux/bpf_verifier.h
+++ b/include/linux/bpf_verifier.h
@@ -325,12 +325,34 @@ struct bpf_func_state {
 	int allocated_stack;
 };
 
-struct bpf_idx_pair {
-	u32 prev_idx;
+#define MAX_CALL_FRAMES 8
+
+/* instruction history flags, used in bpf_jmp_history_entry.flags field */
+enum {
+	/* instruction references stack slot through PTR_TO_STACK register;
+	 * we also store stack's frame number in lower 3 bits (MAX_CALL_FRAMES is 8)
+	 * and accessed stack slot's index in next 6 bits (MAX_BPF_STACK is 512,
+	 * 8 bytes per slot, so slot index (spi) is [0, 63])
+	 */
+	INSN_F_FRAMENO_MASK = 0x7, /* 3 bits */
+
+	INSN_F_SPI_MASK = 0x3f, /* 6 bits */
+	INSN_F_SPI_SHIFT = 3, /* shifted 3 bits to the left */
+
+	INSN_F_STACK_ACCESS = BIT(9), /* we need 10 bits total */
+};
+
+static_assert(INSN_F_FRAMENO_MASK + 1 >= MAX_CALL_FRAMES);
+static_assert(INSN_F_SPI_MASK + 1 >= MAX_BPF_STACK / 8);
+
+struct bpf_jmp_history_entry {
 	u32 idx;
+	/* insn idx can't be bigger than 1 million */
+	u32 prev_idx : 22;
+	/* special flags, e.g., whether insn is doing register stack spill/load */
+	u32 flags : 10;
 };
 
-#define MAX_CALL_FRAMES 8
 /* Maximum number of register states that can exist at once */
 #define BPF_ID_MAP_SIZE ((MAX_BPF_REG + MAX_BPF_STACK / BPF_REG_SIZE) * MAX_CALL_FRAMES)
 struct bpf_verifier_state {
@@ -413,7 +435,7 @@ struct bpf_verifier_state {
 	 * For most states jmp_history_cnt is [0-3].
 	 * For loops can go up to ~40.
 	 */
-	struct bpf_idx_pair *jmp_history;
+	struct bpf_jmp_history_entry *jmp_history;
 	u32 jmp_history_cnt;
 	u32 dfs_depth;
 	u32 callback_unroll_depth;
@@ -656,6 +678,7 @@ struct bpf_verifier_env {
 		int cur_stack;
 	} cfg;
 	struct backtrack_state bt;
+	struct bpf_jmp_history_entry *cur_hist_ent;
 	u32 pass_cnt; /* number of times do_check() was called */
 	u32 subprog_cnt;
 	/* number of instructions analyzed by the verifier */
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 1ed39665f802..9bc16dc66465 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -1355,8 +1355,8 @@ static int copy_verifier_state(struct bpf_verifier_state *dst_state,
 	int i, err;
 
 	dst_state->jmp_history = copy_array(dst_state->jmp_history, src->jmp_history,
-					    src->jmp_history_cnt, sizeof(struct bpf_idx_pair),
-					    GFP_USER);
+					  src->jmp_history_cnt, sizeof(*dst_state->jmp_history),
+					  GFP_USER);
 	if (!dst_state->jmp_history)
 		return -ENOMEM;
 	dst_state->jmp_history_cnt = src->jmp_history_cnt;
@@ -3221,6 +3221,21 @@ static int check_reg_arg(struct bpf_verifier_env *env, u32 regno,
 	return __check_reg_arg(env, state->regs, regno, t);
 }
 
+static int insn_stack_access_flags(int frameno, int spi)
+{
+	return INSN_F_STACK_ACCESS | (spi << INSN_F_SPI_SHIFT) | frameno;
+}
+
+static int insn_stack_access_spi(int insn_flags)
+{
+	return (insn_flags >> INSN_F_SPI_SHIFT) & INSN_F_SPI_MASK;
+}
+
+static int insn_stack_access_frameno(int insn_flags)
+{
+	return insn_flags & INSN_F_FRAMENO_MASK;
+}
+
 static void mark_jmp_point(struct bpf_verifier_env *env, int idx)
 {
 	env->insn_aux_data[idx].jmp_point = true;
@@ -3232,28 +3247,51 @@ static bool is_jmp_point(struct bpf_verifier_env *env, int insn_idx)
 }
 
 /* for any branch, call, exit record the history of jmps in the given state */
-static int push_jmp_history(struct bpf_verifier_env *env,
-			    struct bpf_verifier_state *cur)
+static int push_jmp_history(struct bpf_verifier_env *env, struct bpf_verifier_state *cur,
+			    int insn_flags)
 {
 	u32 cnt = cur->jmp_history_cnt;
-	struct bpf_idx_pair *p;
+	struct bpf_jmp_history_entry *p;
 	size_t alloc_size;
 
-	if (!is_jmp_point(env, env->insn_idx))
+	/* combine instruction flags if we already recorded this instruction */
+	if (env->cur_hist_ent) {
+		/* atomic instructions push insn_flags twice, for READ and
+		 * WRITE sides, but they should agree on stack slot
+		 */
+		WARN_ONCE((env->cur_hist_ent->flags & insn_flags) &&
+			  (env->cur_hist_ent->flags & insn_flags) != insn_flags,
+			  "verifier insn history bug: insn_idx %d cur flags %x new flags %x\n",
+			  env->insn_idx, env->cur_hist_ent->flags, insn_flags);
+		env->cur_hist_ent->flags |= insn_flags;
 		return 0;
+	}
 
 	cnt++;
 	alloc_size = kmalloc_size_roundup(size_mul(cnt, sizeof(*p)));
 	p = krealloc(cur->jmp_history, alloc_size, GFP_USER);
 	if (!p)
 		return -ENOMEM;
-	p[cnt - 1].idx = env->insn_idx;
-	p[cnt - 1].prev_idx = env->prev_insn_idx;
 	cur->jmp_history = p;
+
+	p = &cur->jmp_history[cnt - 1];
+	p->idx = env->insn_idx;
+	p->prev_idx = env->prev_insn_idx;
+	p->flags = insn_flags;
 	cur->jmp_history_cnt = cnt;
+	env->cur_hist_ent = p;
+
 	return 0;
 }
 
+static struct bpf_jmp_history_entry *get_jmp_hist_entry(struct bpf_verifier_state *st,
+						        u32 hist_end, int insn_idx)
+{
+	if (hist_end > 0 && st->jmp_history[hist_end - 1].idx == insn_idx)
+		return &st->jmp_history[hist_end - 1];
+	return NULL;
+}
+
 /* Backtrack one insn at a time. If idx is not at the top of recorded
  * history then previous instruction came from straight line execution.
  * Return -ENOENT if we exhausted all instructions within given state.
@@ -3415,9 +3453,14 @@ static inline bool bt_is_reg_set(struct backtrack_state *bt, u32 reg)
 	return bt->reg_masks[bt->frame] & (1 << reg);
 }
 
+static inline bool bt_is_frame_slot_set(struct backtrack_state *bt, u32 frame, u32 slot)
+{
+	return bt->stack_masks[frame] & (1ull << slot);
+}
+
 static inline bool bt_is_slot_set(struct backtrack_state *bt, u32 slot)
 {
-	return bt->stack_masks[bt->frame] & (1ull << slot);
+	return bt_is_frame_slot_set(bt, bt->frame, slot);
 }
 
 /* format registers bitmask, e.g., "r0,r2,r4" for 0x15 mask */
@@ -3471,7 +3514,7 @@ static bool calls_callback(struct bpf_verifier_env *env, int insn_idx);
  *   - *was* processed previously during backtracking.
  */
 static int backtrack_insn(struct bpf_verifier_env *env, int idx, int subseq_idx,
-			  struct backtrack_state *bt)
+			  struct bpf_jmp_history_entry *hist, struct backtrack_state *bt)
 {
 	const struct bpf_insn_cbs cbs = {
 		.cb_call	= disasm_kfunc_name,
@@ -3484,7 +3527,7 @@ static int backtrack_insn(struct bpf_verifier_env *env, int idx, int subseq_idx,
 	u8 mode = BPF_MODE(insn->code);
 	u32 dreg = insn->dst_reg;
 	u32 sreg = insn->src_reg;
-	u32 spi, i;
+	u32 spi, i, fr;
 
 	if (insn->code == 0)
 		return 0;
@@ -3545,20 +3588,15 @@ static int backtrack_insn(struct bpf_verifier_env *env, int idx, int subseq_idx,
 		 * by 'precise' mark in corresponding register of this state.
 		 * No further tracking necessary.
 		 */
-		if (insn->src_reg != BPF_REG_FP)
+		if (!hist || !(hist->flags & INSN_F_STACK_ACCESS))
 			return 0;
-
 		/* dreg = *(u64 *)[fp - off] was a fill from the stack.
 		 * that [fp - off] slot contains scalar that needs to be
 		 * tracked with precision
 		 */
-		spi = (-insn->off - 1) / BPF_REG_SIZE;
-		if (spi >= 64) {
-			verbose(env, "BUG spi %d\n", spi);
-			WARN_ONCE(1, "verifier backtracking bug");
-			return -EFAULT;
-		}
-		bt_set_slot(bt, spi);
+		spi = insn_stack_access_spi(hist->flags);
+		fr = insn_stack_access_frameno(hist->flags);
+		bt_set_frame_slot(bt, fr, spi);
 	} else if (class == BPF_STX || class == BPF_ST) {
 		if (bt_is_reg_set(bt, dreg))
 			/* stx & st shouldn't be using _scalar_ dst_reg
@@ -3567,17 +3605,13 @@ static int backtrack_insn(struct bpf_verifier_env *env, int idx, int subseq_idx,
 			 */
 			return -ENOTSUPP;
 		/* scalars can only be spilled into stack */
-		if (insn->dst_reg != BPF_REG_FP)
+		if (!hist || !(hist->flags & INSN_F_STACK_ACCESS))
 			return 0;
-		spi = (-insn->off - 1) / BPF_REG_SIZE;
-		if (spi >= 64) {
-			verbose(env, "BUG spi %d\n", spi);
-			WARN_ONCE(1, "verifier backtracking bug");
-			return -EFAULT;
-		}
-		if (!bt_is_slot_set(bt, spi))
+		spi = insn_stack_access_spi(hist->flags);
+		fr = insn_stack_access_frameno(hist->flags);
+		if (!bt_is_frame_slot_set(bt, fr, spi))
 			return 0;
-		bt_clear_slot(bt, spi);
+		bt_clear_frame_slot(bt, fr, spi);
 		if (class == BPF_STX)
 			bt_set_reg(bt, sreg);
 	} else if (class == BPF_JMP || class == BPF_JMP32) {
@@ -3621,10 +3655,14 @@ static int backtrack_insn(struct bpf_verifier_env *env, int idx, int subseq_idx,
 					WARN_ONCE(1, "verifier backtracking bug");
 					return -EFAULT;
 				}
-				/* we don't track register spills perfectly,
-				 * so fallback to force-precise instead of failing */
-				if (bt_stack_mask(bt) != 0)
-					return -ENOTSUPP;
+				/* we are now tracking register spills correctly,
+				 * so any instance of leftover slots is a bug
+				 */
+				if (bt_stack_mask(bt) != 0) {
+					verbose(env, "BUG stack slots %llx\n", bt_stack_mask(bt));
+					WARN_ONCE(1, "verifier backtracking bug (subprog leftover stack slots)");
+					return -EFAULT;
+				}
 				/* propagate r1-r5 to the caller */
 				for (i = BPF_REG_1; i <= BPF_REG_5; i++) {
 					if (bt_is_reg_set(bt, i)) {
@@ -3649,8 +3687,11 @@ static int backtrack_insn(struct bpf_verifier_env *env, int idx, int subseq_idx,
 				WARN_ONCE(1, "verifier backtracking bug");
 				return -EFAULT;
 			}
-			if (bt_stack_mask(bt) != 0)
-				return -ENOTSUPP;
+			if (bt_stack_mask(bt) != 0) {
+				verbose(env, "BUG stack slots %llx\n", bt_stack_mask(bt));
+				WARN_ONCE(1, "verifier backtracking bug (callback leftover stack slots)");
+				return -EFAULT;
+			}
 			/* clear r1-r5 in callback subprog's mask */
 			for (i = BPF_REG_1; i <= BPF_REG_5; i++)
 				bt_clear_reg(bt, i);
@@ -4087,6 +4128,7 @@ static int __mark_chain_precision(struct bpf_verifier_env *env, int regno)
 	for (;;) {
 		DECLARE_BITMAP(mask, 64);
 		u32 history = st->jmp_history_cnt;
+		struct bpf_jmp_history_entry *hist;
 
 		if (env->log.level & BPF_LOG_LEVEL2) {
 			verbose(env, "mark_precise: frame%d: last_idx %d first_idx %d subseq_idx %d \n",
@@ -4150,7 +4192,8 @@ static int __mark_chain_precision(struct bpf_verifier_env *env, int regno)
 				err = 0;
 				skip_first = false;
 			} else {
-				err = backtrack_insn(env, i, subseq_idx, bt);
+				hist = get_jmp_hist_entry(st, history, i);
+				err = backtrack_insn(env, i, subseq_idx, hist, bt);
 			}
 			if (err == -ENOTSUPP) {
 				mark_all_scalars_precise(env, env->cur_state);
@@ -4203,22 +4246,10 @@ static int __mark_chain_precision(struct bpf_verifier_env *env, int regno)
 			bitmap_from_u64(mask, bt_frame_stack_mask(bt, fr));
 			for_each_set_bit(i, mask, 64) {
 				if (i >= func->allocated_stack / BPF_REG_SIZE) {
-					/* the sequence of instructions:
-					 * 2: (bf) r3 = r10
-					 * 3: (7b) *(u64 *)(r3 -8) = r0
-					 * 4: (79) r4 = *(u64 *)(r10 -8)
-					 * doesn't contain jmps. It's backtracked
-					 * as a single block.
-					 * During backtracking insn 3 is not recognized as
-					 * stack access, so at the end of backtracking
-					 * stack slot fp-8 is still marked in stack_mask.
-					 * However the parent state may not have accessed
-					 * fp-8 and it's "unallocated" stack space.
-					 * In such case fallback to conservative.
-					 */
-					mark_all_scalars_precise(env, env->cur_state);
-					bt_reset(bt);
-					return 0;
+					verbose(env, "BUG backtracking (stack slot %d, total slots %d)\n",
+						i, func->allocated_stack / BPF_REG_SIZE);
+					WARN_ONCE(1, "verifier backtracking bug (stack slot out of bounds)");
+					return -EFAULT;
 				}
 
 				if (!is_spilled_scalar_reg(&func->stack[i])) {
@@ -4391,7 +4422,7 @@ static int check_stack_write_fixed_off(struct bpf_verifier_env *env,
 	int i, slot = -off - 1, spi = slot / BPF_REG_SIZE, err;
 	struct bpf_insn *insn = &env->prog->insnsi[insn_idx];
 	struct bpf_reg_state *reg = NULL;
-	u32 dst_reg = insn->dst_reg;
+	int insn_flags = insn_stack_access_flags(state->frameno, spi);
 
 	err = grow_stack_state(state, round_up(slot + 1, BPF_REG_SIZE));
 	if (err)
@@ -4432,17 +4463,6 @@ static int check_stack_write_fixed_off(struct bpf_verifier_env *env,
 	mark_stack_slot_scratched(env, spi);
 	if (reg && !(off % BPF_REG_SIZE) && register_is_bounded(reg) &&
 	    !register_is_null(reg) && env->bpf_capable) {
-		if (dst_reg != BPF_REG_FP) {
-			/* The backtracking logic can only recognize explicit
-			 * stack slot address like [fp - 8]. Other spill of
-			 * scalar via different register has to be conservative.
-			 * Backtrack from here and mark all registers as precise
-			 * that contributed into 'reg' being a constant.
-			 */
-			err = mark_chain_precision(env, value_regno);
-			if (err)
-				return err;
-		}
 		save_register_state(state, spi, reg, size);
 		/* Break the relation on a narrowing spill. */
 		if (fls64(reg->umax_value) > BITS_PER_BYTE * size)
@@ -4454,6 +4474,7 @@ static int check_stack_write_fixed_off(struct bpf_verifier_env *env,
 		__mark_reg_known(&fake_reg, insn->imm);
 		fake_reg.type = SCALAR_VALUE;
 		save_register_state(state, spi, &fake_reg, size);
+		insn_flags = 0; /* not a register spill */
 	} else if (reg && is_spillable_regtype(reg->type)) {
 		/* register containing pointer is being spilled into stack */
 		if (size != BPF_REG_SIZE) {
@@ -4499,9 +4520,12 @@ static int check_stack_write_fixed_off(struct bpf_verifier_env *env,
 
 		/* Mark slots affected by this stack write. */
 		for (i = 0; i < size; i++)
-			state->stack[spi].slot_type[(slot - i) % BPF_REG_SIZE] =
-				type;
+			state->stack[spi].slot_type[(slot - i) % BPF_REG_SIZE] = type;
+		insn_flags = 0; /* not a register spill */
 	}
+
+	if (insn_flags)
+		return push_jmp_history(env, env->cur_state, insn_flags);
 	return 0;
 }
 
@@ -4694,6 +4718,7 @@ static int check_stack_read_fixed_off(struct bpf_verifier_env *env,
 	int i, slot = -off - 1, spi = slot / BPF_REG_SIZE;
 	struct bpf_reg_state *reg;
 	u8 *stype, type;
+	int insn_flags = insn_stack_access_flags(reg_state->frameno, spi);
 
 	stype = reg_state->stack[spi].slot_type;
 	reg = &reg_state->stack[spi].spilled_ptr;
@@ -4739,12 +4764,10 @@ static int check_stack_read_fixed_off(struct bpf_verifier_env *env,
 					return -EACCES;
 				}
 				mark_reg_unknown(env, state->regs, dst_regno);
+				insn_flags = 0; /* not restoring original register state */
 			}
 			state->regs[dst_regno].live |= REG_LIVE_WRITTEN;
-			return 0;
-		}
-
-		if (dst_regno >= 0) {
+		} else if (dst_regno >= 0) {
 			/* restore register state from stack */
 			copy_register_state(&state->regs[dst_regno], reg);
 			/* mark reg as written since spilled pointer state likely
@@ -4780,7 +4803,10 @@ static int check_stack_read_fixed_off(struct bpf_verifier_env *env,
 		mark_reg_read(env, reg, reg->parent, REG_LIVE_READ64);
 		if (dst_regno >= 0)
 			mark_reg_stack_read(env, reg_state, off, off + size, dst_regno);
+		insn_flags = 0; /* we are not restoring spilled register */
 	}
+	if (insn_flags)
+		return push_jmp_history(env, env->cur_state, insn_flags);
 	return 0;
 }
 
@@ -6940,7 +6966,6 @@ static int check_atomic(struct bpf_verifier_env *env, int insn_idx, struct bpf_i
 			       BPF_SIZE(insn->code), BPF_WRITE, -1, true, false);
 	if (err)
 		return err;
-
 	return 0;
 }
 
@@ -16910,7 +16935,8 @@ hit:
 			 * the precision needs to be propagated back in
 			 * the current state.
 			 */
-			err = err ? : push_jmp_history(env, cur);
+			if (is_jmp_point(env, env->insn_idx))
+				err = err ? : push_jmp_history(env, cur, 0);
 			err = err ? : propagate_precision(env, &sl->state);
 			if (err)
 				return err;
@@ -17135,6 +17161,9 @@ static int do_check(struct bpf_verifier_env *env)
 		u8 class;
 		int err;
 
+		/* reset current history entry on each new instruction */
+		env->cur_hist_ent = NULL;
+
 		env->prev_insn_idx = prev_insn_idx;
 		if (env->insn_idx >= insn_cnt) {
 			verbose(env, "invalid insn idx %d insn_cnt %d\n",
@@ -17174,7 +17203,7 @@ static int do_check(struct bpf_verifier_env *env)
 		}
 
 		if (is_jmp_point(env, env->insn_idx)) {
-			err = push_jmp_history(env, state);
+			err = push_jmp_history(env, state, 0);
 			if (err)
 				return err;
 		}
diff --git a/tools/testing/selftests/bpf/progs/verifier_subprog_precision.c b/tools/testing/selftests/bpf/progs/verifier_subprog_precision.c
index 0dfe3f8b69ac..eba98fab2f54 100644
--- a/tools/testing/selftests/bpf/progs/verifier_subprog_precision.c
+++ b/tools/testing/selftests/bpf/progs/verifier_subprog_precision.c
@@ -589,11 +589,24 @@ static __u64 subprog_spill_reg_precise(void)
 
 SEC("?raw_tp")
 __success __log_level(2)
-/* precision backtracking can't currently handle stack access not through r10,
- * so we won't be able to mark stack slot fp-8 as precise, and so will
- * fallback to forcing all as precise
- */
-__msg("mark_precise: frame0: falling back to forcing all scalars precise")
+__msg("10: (0f) r1 += r7")
+__msg("mark_precise: frame0: last_idx 10 first_idx 7 subseq_idx -1")
+__msg("mark_precise: frame0: regs=r7 stack= before 9: (bf) r1 = r8")
+__msg("mark_precise: frame0: regs=r7 stack= before 8: (27) r7 *= 4")
+__msg("mark_precise: frame0: regs=r7 stack= before 7: (79) r7 = *(u64 *)(r10 -8)")
+__msg("mark_precise: frame0: parent state regs= stack=-8:  R0_w=2 R6_w=1 R8_rw=map_value(map=.data.vals,ks=4,vs=16) R10=fp0 fp-8_rw=P1")
+__msg("mark_precise: frame0: last_idx 18 first_idx 0 subseq_idx 7")
+__msg("mark_precise: frame0: regs= stack=-8 before 18: (95) exit")
+__msg("mark_precise: frame1: regs= stack= before 17: (0f) r0 += r2")
+__msg("mark_precise: frame1: regs= stack= before 16: (79) r2 = *(u64 *)(r1 +0)")
+__msg("mark_precise: frame1: regs= stack= before 15: (79) r0 = *(u64 *)(r10 -16)")
+__msg("mark_precise: frame1: regs= stack= before 14: (7b) *(u64 *)(r10 -16) = r2")
+__msg("mark_precise: frame1: regs= stack= before 13: (7b) *(u64 *)(r1 +0) = r2")
+__msg("mark_precise: frame1: regs=r2 stack= before 6: (85) call pc+6")
+__msg("mark_precise: frame0: regs=r2 stack= before 5: (bf) r2 = r6")
+__msg("mark_precise: frame0: regs=r6 stack= before 4: (07) r1 += -8")
+__msg("mark_precise: frame0: regs=r6 stack= before 3: (bf) r1 = r10")
+__msg("mark_precise: frame0: regs=r6 stack= before 2: (b7) r6 = 1")
 __naked int subprog_spill_into_parent_stack_slot_precise(void)
 {
 	asm volatile (
diff --git a/tools/testing/selftests/bpf/verifier/precise.c b/tools/testing/selftests/bpf/verifier/precise.c
index 0d84dd1f38b6..8a2ff81d8350 100644
--- a/tools/testing/selftests/bpf/verifier/precise.c
+++ b/tools/testing/selftests/bpf/verifier/precise.c
@@ -140,10 +140,11 @@
 	.result = REJECT,
 },
 {
-	"precise: ST insn causing spi > allocated_stack",
+	"precise: ST zero to stack insn is supported",
 	.insns = {
 	BPF_MOV64_REG(BPF_REG_3, BPF_REG_10),
 	BPF_JMP_IMM(BPF_JNE, BPF_REG_3, 123, 0),
+	/* not a register spill, so we stop precision propagation for R4 here */
 	BPF_ST_MEM(BPF_DW, BPF_REG_3, -8, 0),
 	BPF_LDX_MEM(BPF_DW, BPF_REG_4, BPF_REG_10, -8),
 	BPF_MOV64_IMM(BPF_REG_0, -1),
@@ -157,11 +158,11 @@
 	mark_precise: frame0: last_idx 4 first_idx 2\
 	mark_precise: frame0: regs=r4 stack= before 4\
 	mark_precise: frame0: regs=r4 stack= before 3\
-	mark_precise: frame0: regs= stack=-8 before 2\
-	mark_precise: frame0: falling back to forcing all scalars precise\
-	force_precise: frame0: forcing r0 to be precise\
 	mark_precise: frame0: last_idx 5 first_idx 5\
-	mark_precise: frame0: parent state regs= stack=:",
+	mark_precise: frame0: parent state regs=r0 stack=:\
+	mark_precise: frame0: last_idx 4 first_idx 2\
+	mark_precise: frame0: regs=r0 stack= before 4\
+	5: R0=-1 R4=0",
 	.result = VERBOSE_ACCEPT,
 	.retval = -1,
 },
@@ -169,6 +170,8 @@
 	"precise: STX insn causing spi > allocated_stack",
 	.insns = {
 	BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_get_prandom_u32),
+	/* make later reg spill more interesting by having somewhat known scalar */
+	BPF_ALU64_IMM(BPF_AND, BPF_REG_0, 0xff),
 	BPF_MOV64_REG(BPF_REG_3, BPF_REG_10),
 	BPF_JMP_IMM(BPF_JNE, BPF_REG_3, 123, 0),
 	BPF_STX_MEM(BPF_DW, BPF_REG_3, BPF_REG_0, -8),
@@ -179,18 +182,21 @@
 	},
 	.prog_type = BPF_PROG_TYPE_XDP,
 	.flags = BPF_F_TEST_STATE_FREQ,
-	.errstr = "mark_precise: frame0: last_idx 6 first_idx 6\
+	.errstr = "mark_precise: frame0: last_idx 7 first_idx 7\
 	mark_precise: frame0: parent state regs=r4 stack=:\
-	mark_precise: frame0: last_idx 5 first_idx 3\
-	mark_precise: frame0: regs=r4 stack= before 5\
-	mark_precise: frame0: regs=r4 stack= before 4\
-	mark_precise: frame0: regs= stack=-8 before 3\
-	mark_precise: frame0: falling back to forcing all scalars precise\
-	force_precise: frame0: forcing r0 to be precise\
-	force_precise: frame0: forcing r0 to be precise\
-	force_precise: frame0: forcing r0 to be precise\
-	force_precise: frame0: forcing r0 to be precise\
-	mark_precise: frame0: last_idx 6 first_idx 6\
+	mark_precise: frame0: last_idx 6 first_idx 4\
+	mark_precise: frame0: regs=r4 stack= before 6: (b7) r0 = -1\
+	mark_precise: frame0: regs=r4 stack= before 5: (79) r4 = *(u64 *)(r10 -8)\
+	mark_precise: frame0: regs= stack=-8 before 4: (7b) *(u64 *)(r3 -8) = r0\
+	mark_precise: frame0: parent state regs=r0 stack=:\
+	mark_precise: frame0: last_idx 3 first_idx 3\
+	mark_precise: frame0: regs=r0 stack= before 3: (55) if r3 != 0x7b goto pc+0\
+	mark_precise: frame0: regs=r0 stack= before 2: (bf) r3 = r10\
+	mark_precise: frame0: regs=r0 stack= before 1: (57) r0 &= 255\
+	mark_precise: frame0: parent state regs=r0 stack=:\
+	mark_precise: frame0: last_idx 0 first_idx 0\
+	mark_precise: frame0: regs=r0 stack= before 0: (85) call bpf_get_prandom_u32#7\
+	mark_precise: frame0: last_idx 7 first_idx 7\
 	mark_precise: frame0: parent state regs= stack=:",
 	.result = VERBOSE_ACCEPT,
 	.retval = -1,
-- 
cgit v1.2.3


From 0949dd96dffec39683c6066cf8d0877cebc321ec Mon Sep 17 00:00:00 2001
From: James Morse <james.morse@arm.com>
Date: Tue, 21 Nov 2023 13:44:25 +0000
Subject: drivers: base: Allow parts of GENERIC_CPU_DEVICES to be overridden

Architectures often have extra per-cpu work that needs doing
before a CPU is registered, often to determine if a CPU is
hotpluggable.

To allow the ACPI architectures to use GENERIC_CPU_DEVICES, move
the cpu_register() call into arch_register_cpu(), which is made __weak
so architectures with extra work can override it.
This aligns with the way x86, ia64 and loongarch register hotplug CPUs
when they become present.

Signed-off-by: James Morse <james.morse@arm.com>
Reviewed-by: Shaoqin Huang <shahuang@redhat.com>
Reviewed-by: Gavin Shan <gshan@redhat.com>
Signed-off-by: "Russell King (Oracle)" <rmk+kernel@armlinux.org.uk>
Reviewed-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
Reviewed-by: Thomas Gleixner <tglx@linutronix.de>
Link: https://lore.kernel.org/r/E1r5R3B-00Csz6-Uh@rmk-PC.armlinux.org.uk
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/base/cpu.c  | 14 ++++++++++----
 include/linux/cpu.h |  4 ++++
 2 files changed, 14 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/base/cpu.c b/drivers/base/cpu.c
index 34b48f660b6b..579064fda97b 100644
--- a/drivers/base/cpu.c
+++ b/drivers/base/cpu.c
@@ -525,19 +525,25 @@ bool cpu_is_hotpluggable(unsigned int cpu)
 EXPORT_SYMBOL_GPL(cpu_is_hotpluggable);
 
 #ifdef CONFIG_GENERIC_CPU_DEVICES
-static DEFINE_PER_CPU(struct cpu, cpu_devices);
+DEFINE_PER_CPU(struct cpu, cpu_devices);
+
+int __weak arch_register_cpu(int cpu)
+{
+	return register_cpu(&per_cpu(cpu_devices, cpu), cpu);
+}
 #endif
 
 static void __init cpu_dev_register_generic(void)
 {
-#ifdef CONFIG_GENERIC_CPU_DEVICES
 	int i;
 
+	if (!IS_ENABLED(CONFIG_GENERIC_CPU_DEVICES))
+		return;
+
 	for_each_present_cpu(i) {
-		if (register_cpu(&per_cpu(cpu_devices, i), i))
+		if (arch_register_cpu(i))
 			panic("Failed to register CPU device");
 	}
-#endif
 }
 
 #ifdef CONFIG_GENERIC_CPU_VULNERABILITIES
diff --git a/include/linux/cpu.h b/include/linux/cpu.h
index fc8094419084..1e982d63eae8 100644
--- a/include/linux/cpu.h
+++ b/include/linux/cpu.h
@@ -88,6 +88,10 @@ extern ssize_t arch_cpu_probe(const char *, size_t);
 extern ssize_t arch_cpu_release(const char *, size_t);
 #endif
 
+#ifdef CONFIG_GENERIC_CPU_DEVICES
+DECLARE_PER_CPU(struct cpu, cpu_devices);
+#endif
+
 /*
  * These states are not related to the core CPU hotplug mechanism. They are
  * used by various (sub)architectures to track internal state
-- 
cgit v1.2.3


From bb5e44fb3be685ecb3feb120aca4269a92cc84cf Mon Sep 17 00:00:00 2001
From: "Russell King (Oracle)" <rmk+kernel@armlinux.org.uk>
Date: Tue, 21 Nov 2023 13:44:36 +0000
Subject: drivers: base: add arch_cpu_is_hotpluggable()

The differences between architecture specific implementations of
arch_register_cpu() are down to whether the CPU is hotpluggable or not.
Rather than overriding the weak version of arch_register_cpu(), provide
a function that can be used to provide this detail instead.

Reviewed-by: Shaoqin Huang <shahuang@redhat.com>
Signed-off-by: "Russell King (Oracle)" <rmk+kernel@armlinux.org.uk>
Reviewed-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
Reviewed-by: Gavin Shan <gshan@redhat.com>
Reviewed-by: Thomas Gleixner <tglx@linutronix.de>
Link: https://lore.kernel.org/r/E1r5R3M-00CszH-6r@rmk-PC.armlinux.org.uk
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/base/cpu.c  | 11 ++++++++++-
 include/linux/cpu.h |  1 +
 2 files changed, 11 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/drivers/base/cpu.c b/drivers/base/cpu.c
index 58bb86091b34..221ffbeb1c9b 100644
--- a/drivers/base/cpu.c
+++ b/drivers/base/cpu.c
@@ -527,9 +527,18 @@ EXPORT_SYMBOL_GPL(cpu_is_hotpluggable);
 #ifdef CONFIG_GENERIC_CPU_DEVICES
 DEFINE_PER_CPU(struct cpu, cpu_devices);
 
+bool __weak arch_cpu_is_hotpluggable(int cpu)
+{
+	return false;
+}
+
 int __weak arch_register_cpu(int cpu)
 {
-	return register_cpu(&per_cpu(cpu_devices, cpu), cpu);
+	struct cpu *c = &per_cpu(cpu_devices, cpu);
+
+	c->hotpluggable = arch_cpu_is_hotpluggable(cpu);
+
+	return register_cpu(c, cpu);
 }
 
 #ifdef CONFIG_HOTPLUG_CPU
diff --git a/include/linux/cpu.h b/include/linux/cpu.h
index 1e982d63eae8..dcb89c987164 100644
--- a/include/linux/cpu.h
+++ b/include/linux/cpu.h
@@ -80,6 +80,7 @@ extern __printf(4, 5)
 struct device *cpu_device_create(struct device *parent, void *drvdata,
 				 const struct attribute_group **groups,
 				 const char *fmt, ...);
+extern bool arch_cpu_is_hotpluggable(int cpu);
 extern int arch_register_cpu(int cpu);
 extern void arch_unregister_cpu(int cpu);
 #ifdef CONFIG_HOTPLUG_CPU
-- 
cgit v1.2.3


From 43a71cd66b9c0a4af3d15d8644359fde35bdbed0 Mon Sep 17 00:00:00 2001
From: Coco Li <lixiaoyan@google.com>
Date: Mon, 4 Dec 2023 20:12:30 +0000
Subject: net-device: reorganize net_device fast path variables

Reorganize fast path variables on tx-txrx-rx order
Fastpath variables end after npinfo.

Below data generated with pahole on x86 architecture.

Fast path variables span cache lines before change: 12
Fast path variables span cache lines after change: 4

Suggested-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: Coco Li <lixiaoyan@google.com>
Reviewed-by: Eric Dumazet <edumazet@google.com>
Reviewed-by: David Ahern <dsahern@kernel.org>
Link: https://lore.kernel.org/r/20231204201232.520025-2-lixiaoyan@google.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/linux/netdevice.h | 117 +++++++++++++++++++++++++---------------------
 net/core/dev.c            |  56 ++++++++++++++++++++++
 2 files changed, 120 insertions(+), 53 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 5551177e024e..cb96aad6a6ee 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -2097,6 +2097,70 @@ enum netdev_stat_type {
  */
 
 struct net_device {
+	/* Cacheline organization can be found documented in
+	 * Documentation/networking/net_cachelines/net_device.rst.
+	 * Please update the document when adding new fields.
+	 */
+
+	/* TX read-mostly hotpath */
+	__cacheline_group_begin(net_device_read_tx);
+	unsigned long long	priv_flags;
+	const struct net_device_ops *netdev_ops;
+	const struct header_ops *header_ops;
+	struct netdev_queue	*_tx;
+	unsigned int		real_num_tx_queues;
+	unsigned int		gso_max_size;
+	unsigned int		gso_ipv4_max_size;
+	u16			gso_max_segs;
+	s16			num_tc;
+	/* Note : dev->mtu is often read without holding a lock.
+	 * Writers usually hold RTNL.
+	 * It is recommended to use READ_ONCE() to annotate the reads,
+	 * and to use WRITE_ONCE() to annotate the writes.
+	 */
+	unsigned int		mtu;
+	unsigned short		needed_headroom;
+	struct netdev_tc_txq	tc_to_txq[TC_MAX_QUEUE];
+#ifdef CONFIG_XPS
+	struct xps_dev_maps __rcu *xps_maps[XPS_MAPS_MAX];
+#endif
+#ifdef CONFIG_NETFILTER_EGRESS
+	struct nf_hook_entries __rcu *nf_hooks_egress;
+#endif
+#ifdef CONFIG_NET_XGRESS
+	struct bpf_mprog_entry __rcu *tcx_egress;
+#endif
+	__cacheline_group_end(net_device_read_tx);
+
+	/* TXRX read-mostly hotpath */
+	__cacheline_group_begin(net_device_read_txrx);
+	unsigned int		flags;
+	unsigned short		hard_header_len;
+	netdev_features_t	features;
+	struct inet6_dev __rcu	*ip6_ptr;
+	__cacheline_group_end(net_device_read_txrx);
+
+	/* RX read-mostly hotpath */
+	__cacheline_group_begin(net_device_read_rx);
+	struct list_head	ptype_specific;
+	int			ifindex;
+	unsigned int		real_num_rx_queues;
+	struct netdev_rx_queue	*_rx;
+	unsigned long		gro_flush_timeout;
+	int			napi_defer_hard_irqs;
+	unsigned int		gro_max_size;
+	unsigned int		gro_ipv4_max_size;
+	rx_handler_func_t __rcu	*rx_handler;
+	void __rcu		*rx_handler_data;
+	possible_net_t			nd_net;
+#ifdef CONFIG_NETPOLL
+	struct netpoll_info __rcu	*npinfo;
+#endif
+#ifdef CONFIG_NET_XGRESS
+	struct bpf_mprog_entry __rcu *tcx_ingress;
+#endif
+	__cacheline_group_end(net_device_read_rx);
+
 	char			name[IFNAMSIZ];
 	struct netdev_name_node	*name_node;
 	struct dev_ifalias	__rcu *ifalias;
@@ -2121,7 +2185,6 @@ struct net_device {
 	struct list_head	unreg_list;
 	struct list_head	close_list;
 	struct list_head	ptype_all;
-	struct list_head	ptype_specific;
 
 	struct {
 		struct list_head upper;
@@ -2129,26 +2192,13 @@ struct net_device {
 	} adj_list;
 
 	/* Read-mostly cache-line for fast-path access */
-	unsigned int		flags;
 	xdp_features_t		xdp_features;
-	unsigned long long	priv_flags;
-	const struct net_device_ops *netdev_ops;
 	const struct xdp_metadata_ops *xdp_metadata_ops;
 	const struct xsk_tx_metadata_ops *xsk_tx_metadata_ops;
-	int			ifindex;
 	unsigned short		gflags;
-	unsigned short		hard_header_len;
 
-	/* Note : dev->mtu is often read without holding a lock.
-	 * Writers usually hold RTNL.
-	 * It is recommended to use READ_ONCE() to annotate the reads,
-	 * and to use WRITE_ONCE() to annotate the writes.
-	 */
-	unsigned int		mtu;
-	unsigned short		needed_headroom;
 	unsigned short		needed_tailroom;
 
-	netdev_features_t	features;
 	netdev_features_t	hw_features;
 	netdev_features_t	wanted_features;
 	netdev_features_t	vlan_features;
@@ -2192,8 +2242,6 @@ struct net_device {
 	const struct tlsdev_ops *tlsdev_ops;
 #endif
 
-	const struct header_ops *header_ops;
-
 	unsigned char		operstate;
 	unsigned char		link_mode;
 
@@ -2234,9 +2282,7 @@ struct net_device {
 
 
 	/* Protocol-specific pointers */
-
 	struct in_device __rcu	*ip_ptr;
-	struct inet6_dev __rcu	*ip6_ptr;
 #if IS_ENABLED(CONFIG_VLAN_8021Q)
 	struct vlan_info __rcu	*vlan_info;
 #endif
@@ -2271,26 +2317,14 @@ struct net_device {
 	/* Interface address info used in eth_type_trans() */
 	const unsigned char	*dev_addr;
 
-	struct netdev_rx_queue	*_rx;
 	unsigned int		num_rx_queues;
-	unsigned int		real_num_rx_queues;
-
 	struct bpf_prog __rcu	*xdp_prog;
-	unsigned long		gro_flush_timeout;
-	int			napi_defer_hard_irqs;
 #define GRO_LEGACY_MAX_SIZE	65536u
 /* TCP minimal MSS is 8 (TCP_MIN_GSO_SIZE),
  * and shinfo->gso_segs is a 16bit field.
  */
 #define GRO_MAX_SIZE		(8 * 65535u)
-	unsigned int		gro_max_size;
-	unsigned int		gro_ipv4_max_size;
 	unsigned int		xdp_zc_max_segs;
-	rx_handler_func_t __rcu	*rx_handler;
-	void __rcu		*rx_handler_data;
-#ifdef CONFIG_NET_XGRESS
-	struct bpf_mprog_entry __rcu *tcx_ingress;
-#endif
 	struct netdev_queue __rcu *ingress_queue;
 #ifdef CONFIG_NETFILTER_INGRESS
 	struct nf_hook_entries __rcu *nf_hooks_ingress;
@@ -2305,25 +2339,13 @@ struct net_device {
 /*
  * Cache lines mostly used on transmit path
  */
-	struct netdev_queue	*_tx ____cacheline_aligned_in_smp;
 	unsigned int		num_tx_queues;
-	unsigned int		real_num_tx_queues;
 	struct Qdisc __rcu	*qdisc;
 	unsigned int		tx_queue_len;
 	spinlock_t		tx_global_lock;
 
 	struct xdp_dev_bulk_queue __percpu *xdp_bulkq;
 
-#ifdef CONFIG_XPS
-	struct xps_dev_maps __rcu *xps_maps[XPS_MAPS_MAX];
-#endif
-#ifdef CONFIG_NET_XGRESS
-	struct bpf_mprog_entry __rcu *tcx_egress;
-#endif
-#ifdef CONFIG_NETFILTER_EGRESS
-	struct nf_hook_entries __rcu *nf_hooks_egress;
-#endif
-
 #ifdef CONFIG_NET_SCHED
 	DECLARE_HASHTABLE	(qdisc_hash, 4);
 #endif
@@ -2362,12 +2384,6 @@ struct net_device {
 	bool needs_free_netdev;
 	void (*priv_destructor)(struct net_device *dev);
 
-#ifdef CONFIG_NETPOLL
-	struct netpoll_info __rcu	*npinfo;
-#endif
-
-	possible_net_t			nd_net;
-
 	/* mid-layer private */
 	void				*ml_priv;
 	enum netdev_ml_priv_type	ml_priv_type;
@@ -2402,20 +2418,15 @@ struct net_device {
  */
 #define GSO_MAX_SIZE		(8 * GSO_MAX_SEGS)
 
-	unsigned int		gso_max_size;
 #define TSO_LEGACY_MAX_SIZE	65536
 #define TSO_MAX_SIZE		UINT_MAX
 	unsigned int		tso_max_size;
-	u16			gso_max_segs;
 #define TSO_MAX_SEGS		U16_MAX
 	u16			tso_max_segs;
-	unsigned int		gso_ipv4_max_size;
 
 #ifdef CONFIG_DCB
 	const struct dcbnl_rtnl_ops *dcbnl_ops;
 #endif
-	s16			num_tc;
-	struct netdev_tc_txq	tc_to_txq[TC_MAX_QUEUE];
 	u8			prio_tc_map[TC_BITMASK + 1];
 
 #if IS_ENABLED(CONFIG_FCOE)
diff --git a/net/core/dev.c b/net/core/dev.c
index 1d720fc59161..c5679dfbaa70 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -11609,6 +11609,60 @@ static struct pernet_operations __net_initdata default_device_ops = {
 	.exit_batch = default_device_exit_batch,
 };
 
+static void __init net_dev_struct_check(void)
+{
+	/* TX read-mostly hotpath */
+	CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, priv_flags);
+	CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, netdev_ops);
+	CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, header_ops);
+	CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, _tx);
+	CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, real_num_tx_queues);
+	CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, gso_max_size);
+	CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, gso_ipv4_max_size);
+	CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, gso_max_segs);
+	CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, num_tc);
+	CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, mtu);
+	CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, needed_headroom);
+	CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, tc_to_txq);
+#ifdef CONFIG_XPS
+	CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, xps_maps);
+#endif
+#ifdef CONFIG_NETFILTER_EGRESS
+	CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, nf_hooks_egress);
+#endif
+#ifdef CONFIG_NET_XGRESS
+	CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, tcx_egress);
+#endif
+	CACHELINE_ASSERT_GROUP_SIZE(struct net_device, net_device_read_tx, 152);
+
+	/* TXRX read-mostly hotpath */
+	CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_txrx, flags);
+	CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_txrx, hard_header_len);
+	CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_txrx, features);
+	CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_txrx, ip6_ptr);
+	CACHELINE_ASSERT_GROUP_SIZE(struct net_device, net_device_read_txrx, 30);
+
+	/* RX read-mostly hotpath */
+	CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, ptype_specific);
+	CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, ifindex);
+	CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, real_num_rx_queues);
+	CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, _rx);
+	CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, gro_flush_timeout);
+	CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, napi_defer_hard_irqs);
+	CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, gro_max_size);
+	CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, gro_ipv4_max_size);
+	CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, rx_handler);
+	CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, rx_handler_data);
+	CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, nd_net);
+#ifdef CONFIG_NETPOLL
+	CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, npinfo);
+#endif
+#ifdef CONFIG_NET_XGRESS
+	CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, tcx_ingress);
+#endif
+	CACHELINE_ASSERT_GROUP_SIZE(struct net_device, net_device_read_rx, 96);
+}
+
 /*
  *	Initialize the DEV module. At boot time this walks the device list and
  *	unhooks any devices that fail to initialise (normally hardware not
@@ -11626,6 +11680,8 @@ static int __init net_dev_init(void)
 
 	BUG_ON(!dev_boot_phase);
 
+	net_dev_struct_check();
+
 	if (dev_proc_init())
 		goto out;
 
-- 
cgit v1.2.3


From d5fed5addb2b6bc13035de4338b7ea2052a2e006 Mon Sep 17 00:00:00 2001
From: Coco Li <lixiaoyan@google.com>
Date: Mon, 4 Dec 2023 20:12:31 +0000
Subject: tcp: reorganize tcp_sock fast path variables

The variables are organized according in the following way:

- TX read-mostly hotpath cache lines
- TXRX read-mostly hotpath cache lines
- RX read-mostly hotpath cache lines
- TX read-write hotpath cache line
- TXRX read-write hotpath cache line
- RX read-write hotpath cache line

Fastpath cachelines end after rcvq_space.

Cache line boundaries are enforced only between read-mostly and
read-write. That is, if read-mostly tx cachelines bleed into
read-mostly txrx cachelines, we do not care. We care about the
boundaries between read and write cachelines because we want
to prevent false sharing.

Fast path variables span cache lines before change: 12
Fast path variables span cache lines after change: 8

Suggested-by: Eric Dumazet <edumazet@google.com>
Reviewed-by: Wei Wang <weiwan@google.com>
Signed-off-by: Coco Li <lixiaoyan@google.com>
Reviewed-by: Eric Dumazet <edumazet@google.com>
Reviewed-by: David Ahern <dsahern@kernel.org>
Link: https://lore.kernel.org/r/20231204201232.520025-3-lixiaoyan@google.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/linux/tcp.h | 248 ++++++++++++++++++++++++++++------------------------
 net/ipv4/tcp.c      |  93 ++++++++++++++++++++
 2 files changed, 227 insertions(+), 114 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/tcp.h b/include/linux/tcp.h
index 68f3d315d2e1..f55ec155f5b7 100644
--- a/include/linux/tcp.h
+++ b/include/linux/tcp.h
@@ -194,23 +194,121 @@ static inline bool tcp_rsk_used_ao(const struct request_sock *req)
 #define TCP_RMEM_TO_WIN_SCALE 8
 
 struct tcp_sock {
+	/* Cacheline organization can be found documented in
+	 * Documentation/networking/net_cachelines/tcp_sock.rst.
+	 * Please update the document when adding new fields.
+	 */
+
 	/* inet_connection_sock has to be the first member of tcp_sock */
 	struct inet_connection_sock	inet_conn;
-	u16	tcp_header_len;	/* Bytes of tcp header to send		*/
+
+	/* TX read-mostly hotpath cache lines */
+	__cacheline_group_begin(tcp_sock_read_tx);
+	/* timestamp of last sent data packet (for restart window) */
+	u32	max_window;	/* Maximal window ever seen from peer	*/
+	u32	rcv_ssthresh;	/* Current window clamp			*/
+	u32	reordering;	/* Packet reordering metric.		*/
+	u32	notsent_lowat;	/* TCP_NOTSENT_LOWAT */
 	u16	gso_segs;	/* Max number of segs per GSO packet	*/
+	/* from STCP, retrans queue hinting */
+	struct sk_buff *lost_skb_hint;
+	struct sk_buff *retransmit_skb_hint;
+	__cacheline_group_end(tcp_sock_read_tx);
+
+	/* TXRX read-mostly hotpath cache lines */
+	__cacheline_group_begin(tcp_sock_read_txrx);
+	u32	tsoffset;	/* timestamp offset */
+	u32	snd_wnd;	/* The window we expect to receive	*/
+	u32	mss_cache;	/* Cached effective mss, not including SACKS */
+	u32	snd_cwnd;	/* Sending congestion window		*/
+	u32	prr_out;	/* Total number of pkts sent during Recovery. */
+	u32	lost_out;	/* Lost packets			*/
+	u32	sacked_out;	/* SACK'd packets			*/
+	u16	tcp_header_len;	/* Bytes of tcp header to send		*/
+	u8	chrono_type : 2,	/* current chronograph type */
+		repair      : 1,
+		is_sack_reneg:1,    /* in recovery from loss with SACK reneg? */
+		is_cwnd_limited:1;/* forward progress limited by snd_cwnd? */
+	__cacheline_group_end(tcp_sock_read_txrx);
+
+	/* RX read-mostly hotpath cache lines */
+	__cacheline_group_begin(tcp_sock_read_rx);
+	u32	copied_seq;	/* Head of yet unread data */
+	u32	rcv_tstamp;	/* timestamp of last received ACK (for keepalives) */
+	u32	snd_wl1;	/* Sequence for window update		*/
+	u32	tlp_high_seq;	/* snd_nxt at the time of TLP */
+	u32	rttvar_us;	/* smoothed mdev_max			*/
+	u32	retrans_out;	/* Retransmitted packets out		*/
+	u16	advmss;		/* Advertised MSS			*/
+	u16	urg_data;	/* Saved octet of OOB data and control flags */
+	u32	lost;		/* Total data packets lost incl. rexmits */
+	struct  minmax rtt_min;
+	/* OOO segments go in this rbtree. Socket lock must be held. */
+	struct rb_root	out_of_order_queue;
+	u32	snd_ssthresh;	/* Slow start size threshold		*/
+	__cacheline_group_end(tcp_sock_read_rx);
 
+	/* TX read-write hotpath cache lines */
+	__cacheline_group_begin(tcp_sock_write_tx) ____cacheline_aligned;
+	u32	segs_out;	/* RFC4898 tcpEStatsPerfSegsOut
+				 * The total number of segments sent.
+				 */
+	u32	data_segs_out;	/* RFC4898 tcpEStatsPerfDataSegsOut
+				 * total number of data segments sent.
+				 */
+	u64	bytes_sent;	/* RFC4898 tcpEStatsPerfHCDataOctetsOut
+				 * total number of data bytes sent.
+				 */
+	u32	snd_sml;	/* Last byte of the most recently transmitted small packet */
+	u32	chrono_start;	/* Start time in jiffies of a TCP chrono */
+	u32	chrono_stat[3];	/* Time in jiffies for chrono_stat stats */
+	u32	write_seq;	/* Tail(+1) of data held in tcp send buffer */
+	u32	pushed_seq;	/* Last pushed seq, required to talk to windows */
+	u32	lsndtime;
+	u32	mdev_us;	/* medium deviation			*/
+	u64	tcp_wstamp_ns;	/* departure time for next sent data packet */
+	u64	tcp_clock_cache; /* cache last tcp_clock_ns() (see tcp_mstamp_refresh()) */
+	u64	tcp_mstamp;	/* most recent packet received/sent */
+	u32	rtt_seq;	/* sequence number to update rttvar	*/
+	struct list_head tsorted_sent_queue; /* time-sorted sent but un-SACKed skbs */
+	struct sk_buff *highest_sack;   /* skb just after the highest
+					 * skb with SACKed bit set
+					 * (validity guaranteed only if
+					 * sacked_out > 0)
+					 */
+	u8	ecn_flags;	/* ECN status bits.			*/
+	__cacheline_group_end(tcp_sock_write_tx);
+
+	/* TXRX read-write hotpath cache lines */
+	__cacheline_group_begin(tcp_sock_write_txrx);
 /*
  *	Header prediction flags
  *	0x5?10 << 16 + snd_wnd in net byte order
  */
 	__be32	pred_flags;
-
+	u32	rcv_nxt;	/* What we want to receive next		*/
+	u32	snd_nxt;	/* Next sequence we send		*/
+	u32	snd_una;	/* First byte we want an ack for	*/
+	u32	window_clamp;	/* Maximal window to advertise		*/
+	u32	srtt_us;	/* smoothed round trip time << 3 in usecs */
+	u32	packets_out;	/* Packets which are "in flight"	*/
+	u32	snd_up;		/* Urgent pointer		*/
+	u32	delivered;	/* Total data packets delivered incl. rexmits */
+	u32	delivered_ce;	/* Like the above but only ECE marked packets */
+	u32	app_limited;	/* limited until "delivered" reaches this val */
+	u32	rcv_wnd;	/* Current receiver window		*/
 /*
- *	RFC793 variables by their proper names. This means you can
- *	read the code and the spec side by side (and laugh ...)
- *	See RFC793 and RFC1122. The RFC writes these in capitals.
+ *      Options received (usually on last packet, some only on SYN packets).
  */
-	u64	bytes_received;	/* RFC4898 tcpEStatsAppHCThruOctetsReceived
+	struct tcp_options_received rx_opt;
+	u8	nonagle     : 4,/* Disable Nagle algorithm?             */
+		rate_app_limited:1;  /* rate_{delivered,interval_us} limited? */
+	__cacheline_group_end(tcp_sock_write_txrx);
+
+	/* RX read-write hotpath cache lines */
+	__cacheline_group_begin(tcp_sock_write_rx);
+	u64	bytes_received;
+				/* RFC4898 tcpEStatsAppHCThruOctetsReceived
 				 * sum(delta(rcv_nxt)), or how many bytes
 				 * were acked.
 				 */
@@ -220,45 +318,44 @@ struct tcp_sock {
 	u32	data_segs_in;	/* RFC4898 tcpEStatsPerfDataSegsIn
 				 * total number of data segments in.
 				 */
- 	u32	rcv_nxt;	/* What we want to receive next 	*/
-	u32	copied_seq;	/* Head of yet unread data		*/
 	u32	rcv_wup;	/* rcv_nxt on last window update sent	*/
- 	u32	snd_nxt;	/* Next sequence we send		*/
-	u32	segs_out;	/* RFC4898 tcpEStatsPerfSegsOut
-				 * The total number of segments sent.
-				 */
-	u32	data_segs_out;	/* RFC4898 tcpEStatsPerfDataSegsOut
-				 * total number of data segments sent.
-				 */
-	u64	bytes_sent;	/* RFC4898 tcpEStatsPerfHCDataOctetsOut
-				 * total number of data bytes sent.
-				 */
+	u32	max_packets_out;  /* max packets_out in last window */
+	u32	cwnd_usage_seq;  /* right edge of cwnd usage tracking flight */
+	u32	rate_delivered;    /* saved rate sample: packets delivered */
+	u32	rate_interval_us;  /* saved rate sample: time elapsed */
+	u32	rcv_rtt_last_tsecr;
+	u64	first_tx_mstamp;  /* start of window send phase */
+	u64	delivered_mstamp; /* time we reached "delivered" */
 	u64	bytes_acked;	/* RFC4898 tcpEStatsAppHCThruOctetsAcked
 				 * sum(delta(snd_una)), or how many bytes
 				 * were acked.
 				 */
+	struct {
+		u32	rtt_us;
+		u32	seq;
+		u64	time;
+	} rcv_rtt_est;
+/* Receiver queue space */
+	struct {
+		u32	space;
+		u32	seq;
+		u64	time;
+	} rcvq_space;
+	__cacheline_group_end(tcp_sock_write_rx);
+	/* End of Hot Path */
+
+/*
+ *	RFC793 variables by their proper names. This means you can
+ *	read the code and the spec side by side (and laugh ...)
+ *	See RFC793 and RFC1122. The RFC writes these in capitals.
+ */
 	u32	dsack_dups;	/* RFC4898 tcpEStatsStackDSACKDups
 				 * total number of DSACK blocks received
 				 */
- 	u32	snd_una;	/* First byte we want an ack for	*/
- 	u32	snd_sml;	/* Last byte of the most recently transmitted small packet */
-	u32	rcv_tstamp;	/* timestamp of last received ACK (for keepalives) */
-	u32	lsndtime;	/* timestamp of last sent data packet (for restart window) */
 	u32	last_oow_ack_time;  /* timestamp of last out-of-window ACK */
 	u32	compressed_ack_rcv_nxt;
-
-	u32	tsoffset;	/* timestamp offset */
-
 	struct list_head tsq_node; /* anchor in tsq_tasklet.head list */
-	struct list_head tsorted_sent_queue; /* time-sorted sent but un-SACKed skbs */
-
-	u32	snd_wl1;	/* Sequence for window update		*/
-	u32	snd_wnd;	/* The window we expect to receive	*/
-	u32	max_window;	/* Maximal window ever seen from peer	*/
-	u32	mss_cache;	/* Cached effective mss, not including SACKS */
 
-	u32	window_clamp;	/* Maximal window to advertise		*/
-	u32	rcv_ssthresh;	/* Current window clamp			*/
 	u8	scaling_ratio;	/* see tcp_win_from_space() */
 	/* Information of the most recently (s)acked skb */
 	struct tcp_rack {
@@ -272,24 +369,16 @@ struct tcp_sock {
 		   dsack_seen:1, /* Whether DSACK seen after last adj */
 		   advanced:1;	 /* mstamp advanced since last lost marking */
 	} rack;
-	u16	advmss;		/* Advertised MSS			*/
 	u8	compressed_ack;
 	u8	dup_ack_counter:2,
 		tlp_retrans:1,	/* TLP is a retransmission */
 		tcp_usec_ts:1, /* TSval values in usec */
 		unused:4;
-	u32	chrono_start;	/* Start time in jiffies of a TCP chrono */
-	u32	chrono_stat[3];	/* Time in jiffies for chrono_stat stats */
-	u8	chrono_type:2,	/* current chronograph type */
-		rate_app_limited:1,  /* rate_{delivered,interval_us} limited? */
+	u8	thin_lto    : 1,/* Use linear timeouts for thin streams */
+		recvmsg_inq : 1,/* Indicate # of bytes in queue upon recvmsg */
 		fastopen_connect:1, /* FASTOPEN_CONNECT sockopt */
 		fastopen_no_cookie:1, /* Allow send/recv SYN+data without a cookie */
-		is_sack_reneg:1,    /* in recovery from loss with SACK reneg? */
-		fastopen_client_fail:2; /* reason why fastopen failed */
-	u8	nonagle     : 4,/* Disable Nagle algorithm?             */
-		thin_lto    : 1,/* Use linear timeouts for thin streams */
-		recvmsg_inq : 1,/* Indicate # of bytes in queue upon recvmsg */
-		repair      : 1,
+		fastopen_client_fail:2, /* reason why fastopen failed */
 		frto        : 1;/* F-RTO (RFC5682) activated in CA_Loss */
 	u8	repair_queue;
 	u8	save_syn:2,	/* Save headers of SYN packet */
@@ -297,45 +386,19 @@ struct tcp_sock {
 		syn_fastopen:1,	/* SYN includes Fast Open option */
 		syn_fastopen_exp:1,/* SYN includes Fast Open exp. option */
 		syn_fastopen_ch:1, /* Active TFO re-enabling probe */
-		syn_data_acked:1,/* data in SYN is acked by SYN-ACK */
-		is_cwnd_limited:1;/* forward progress limited by snd_cwnd? */
-	u32	tlp_high_seq;	/* snd_nxt at the time of TLP */
+		syn_data_acked:1;/* data in SYN is acked by SYN-ACK */
 
 	u32	tcp_tx_delay;	/* delay (in usec) added to TX packets */
-	u64	tcp_wstamp_ns;	/* departure time for next sent data packet */
-	u64	tcp_clock_cache; /* cache last tcp_clock_ns() (see tcp_mstamp_refresh()) */
 
 /* RTT measurement */
-	u64	tcp_mstamp;	/* most recent packet received/sent */
-	u32	srtt_us;	/* smoothed round trip time << 3 in usecs */
-	u32	mdev_us;	/* medium deviation			*/
 	u32	mdev_max_us;	/* maximal mdev for the last rtt period	*/
-	u32	rttvar_us;	/* smoothed mdev_max			*/
-	u32	rtt_seq;	/* sequence number to update rttvar	*/
-	struct  minmax rtt_min;
 
-	u32	packets_out;	/* Packets which are "in flight"	*/
-	u32	retrans_out;	/* Retransmitted packets out		*/
-	u32	max_packets_out;  /* max packets_out in last window */
-	u32	cwnd_usage_seq;  /* right edge of cwnd usage tracking flight */
-
-	u16	urg_data;	/* Saved octet of OOB data and control flags */
-	u8	ecn_flags;	/* ECN status bits.			*/
 	u8	keepalive_probes; /* num of allowed keep alive probes	*/
-	u32	reordering;	/* Packet reordering metric.		*/
 	u32	reord_seen;	/* number of data packet reordering events */
-	u32	snd_up;		/* Urgent pointer		*/
-
-/*
- *      Options received (usually on last packet, some only on SYN packets).
- */
-	struct tcp_options_received rx_opt;
 
 /*
  *	Slow start and congestion control (see also Nagle, and Karn & Partridge)
  */
- 	u32	snd_ssthresh;	/* Slow start size threshold		*/
- 	u32	snd_cwnd;	/* Sending congestion window		*/
 	u32	snd_cwnd_cnt;	/* Linear increase counter		*/
 	u32	snd_cwnd_clamp; /* Do not allow snd_cwnd to grow above this */
 	u32	snd_cwnd_used;
@@ -343,32 +406,10 @@ struct tcp_sock {
 	u32	prior_cwnd;	/* cwnd right before starting loss recovery */
 	u32	prr_delivered;	/* Number of newly delivered packets to
 				 * receiver in Recovery. */
-	u32	prr_out;	/* Total number of pkts sent during Recovery. */
-	u32	delivered;	/* Total data packets delivered incl. rexmits */
-	u32	delivered_ce;	/* Like the above but only ECE marked packets */
-	u32	lost;		/* Total data packets lost incl. rexmits */
-	u32	app_limited;	/* limited until "delivered" reaches this val */
-	u64	first_tx_mstamp;  /* start of window send phase */
-	u64	delivered_mstamp; /* time we reached "delivered" */
-	u32	rate_delivered;    /* saved rate sample: packets delivered */
-	u32	rate_interval_us;  /* saved rate sample: time elapsed */
-
- 	u32	rcv_wnd;	/* Current receiver window		*/
-	u32	write_seq;	/* Tail(+1) of data held in tcp send buffer */
-	u32	notsent_lowat;	/* TCP_NOTSENT_LOWAT */
-	u32	pushed_seq;	/* Last pushed seq, required to talk to windows */
-	u32	lost_out;	/* Lost packets			*/
-	u32	sacked_out;	/* SACK'd packets			*/
 
 	struct hrtimer	pacing_timer;
 	struct hrtimer	compressed_ack_timer;
 
-	/* from STCP, retrans queue hinting */
-	struct sk_buff* lost_skb_hint;
-	struct sk_buff *retransmit_skb_hint;
-
-	/* OOO segments go in this rbtree. Socket lock must be held. */
-	struct rb_root	out_of_order_queue;
 	struct sk_buff	*ooo_last_skb; /* cache rb_last(out_of_order_queue) */
 
 	/* SACKs data, these 2 need to be together (see tcp_options_write) */
@@ -377,12 +418,6 @@ struct tcp_sock {
 
 	struct tcp_sack_block recv_sack_cache[4];
 
-	struct sk_buff *highest_sack;   /* skb just after the highest
-					 * skb with SACKed bit set
-					 * (validity guaranteed only if
-					 * sacked_out > 0)
-					 */
-
 	int     lost_cnt_hint;
 
 	u32	prior_ssthresh; /* ssthresh saved at recovery start	*/
@@ -433,21 +468,6 @@ struct tcp_sock {
 
 	u32 rcv_ooopack; /* Received out-of-order packets, for tcpinfo */
 
-/* Receiver side RTT estimation */
-	u32 rcv_rtt_last_tsecr;
-	struct {
-		u32	rtt_us;
-		u32	seq;
-		u64	time;
-	} rcv_rtt_est;
-
-/* Receiver queue space */
-	struct {
-		u32	space;
-		u32	seq;
-		u64	time;
-	} rcvq_space;
-
 /* TCP-specific MTU probe information. */
 	struct {
 		u32		  probe_seq_start;
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index a100df07d34a..70a1bafbefba 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -4564,6 +4564,97 @@ static void __init tcp_init_mem(void)
 	sysctl_tcp_mem[2] = sysctl_tcp_mem[0] * 2;	/* 9.37 % */
 }
 
+static void __init tcp_struct_check(void)
+{
+	/* TX read-mostly hotpath cache lines */
+	CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_tx, max_window);
+	CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_tx, rcv_ssthresh);
+	CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_tx, reordering);
+	CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_tx, notsent_lowat);
+	CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_tx, gso_segs);
+	CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_tx, lost_skb_hint);
+	CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_tx, retransmit_skb_hint);
+	CACHELINE_ASSERT_GROUP_SIZE(struct tcp_sock, tcp_sock_read_tx, 40);
+
+	/* TXRX read-mostly hotpath cache lines */
+	CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_txrx, tsoffset);
+	CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_txrx, snd_wnd);
+	CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_txrx, mss_cache);
+	CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_txrx, snd_cwnd);
+	CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_txrx, prr_out);
+	CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_txrx, lost_out);
+	CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_txrx, sacked_out);
+	CACHELINE_ASSERT_GROUP_SIZE(struct tcp_sock, tcp_sock_read_txrx, 31);
+
+	/* RX read-mostly hotpath cache lines */
+	CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_rx, copied_seq);
+	CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_rx, rcv_tstamp);
+	CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_rx, snd_wl1);
+	CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_rx, tlp_high_seq);
+	CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_rx, rttvar_us);
+	CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_rx, retrans_out);
+	CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_rx, advmss);
+	CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_rx, urg_data);
+	CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_rx, lost);
+	CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_rx, rtt_min);
+	CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_rx, out_of_order_queue);
+	CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_rx, snd_ssthresh);
+	CACHELINE_ASSERT_GROUP_SIZE(struct tcp_sock, tcp_sock_read_rx, 69);
+
+	/* TX read-write hotpath cache lines */
+	CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_tx, segs_out);
+	CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_tx, data_segs_out);
+	CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_tx, bytes_sent);
+	CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_tx, snd_sml);
+	CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_tx, chrono_start);
+	CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_tx, chrono_stat);
+	CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_tx, write_seq);
+	CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_tx, pushed_seq);
+	CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_tx, lsndtime);
+	CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_tx, mdev_us);
+	CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_tx, tcp_wstamp_ns);
+	CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_tx, tcp_clock_cache);
+	CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_tx, tcp_mstamp);
+	CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_tx, rtt_seq);
+	CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_tx, tsorted_sent_queue);
+	CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_tx, highest_sack);
+	CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_tx, ecn_flags);
+	CACHELINE_ASSERT_GROUP_SIZE(struct tcp_sock, tcp_sock_write_tx, 113);
+
+	/* TXRX read-write hotpath cache lines */
+	CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, pred_flags);
+	CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, rcv_nxt);
+	CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, snd_nxt);
+	CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, snd_una);
+	CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, window_clamp);
+	CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, srtt_us);
+	CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, packets_out);
+	CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, snd_up);
+	CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, delivered);
+	CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, delivered_ce);
+	CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, app_limited);
+	CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, rcv_wnd);
+	CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, rx_opt);
+	CACHELINE_ASSERT_GROUP_SIZE(struct tcp_sock, tcp_sock_write_txrx, 76);
+
+	/* RX read-write hotpath cache lines */
+	CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_rx, bytes_received);
+	CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_rx, segs_in);
+	CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_rx, data_segs_in);
+	CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_rx, rcv_wup);
+	CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_rx, max_packets_out);
+	CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_rx, cwnd_usage_seq);
+	CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_rx, rate_delivered);
+	CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_rx, rate_interval_us);
+	CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_rx, rcv_rtt_last_tsecr);
+	CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_rx, first_tx_mstamp);
+	CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_rx, delivered_mstamp);
+	CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_rx, bytes_acked);
+	CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_rx, rcv_rtt_est);
+	CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_rx, rcvq_space);
+	CACHELINE_ASSERT_GROUP_SIZE(struct tcp_sock, tcp_sock_write_rx, 99);
+}
+
 void __init tcp_init(void)
 {
 	int max_rshare, max_wshare, cnt;
@@ -4574,6 +4665,8 @@ void __init tcp_init(void)
 	BUILD_BUG_ON(sizeof(struct tcp_skb_cb) >
 		     sizeof_field(struct sk_buff, cb));
 
+	tcp_struct_check();
+
 	percpu_counter_init(&tcp_sockets_allocated, 0, GFP_KERNEL);
 
 	timer_setup(&tcp_orphan_timer, tcp_orphan_update, TIMER_DEFERRABLE);
-- 
cgit v1.2.3


From facd15dfd69122042502d99ab8c9f888b48ee994 Mon Sep 17 00:00:00 2001
From: Johannes Berg <johannes.berg@intel.com>
Date: Mon, 4 Dec 2023 21:47:07 +0100
Subject: net: core: synchronize link-watch when carrier is queried

There are multiple ways to query for the carrier state: through
rtnetlink, sysfs, and (possibly) ethtool. Synchronize linkwatch
work before these operations so that we don't have a situation
where userspace queries the carrier state between the driver's
carrier off->on transition and linkwatch running and expects it
to work, when really (at least) TX cannot work until linkwatch
has run.

I previously posted a longer explanation of how this applies to
wireless [1] but with this wireless can simply query the state
before sending data, to ensure the kernel is ready for it.

[1] https://lore.kernel.org/all/346b21d87c69f817ea3c37caceb34f1f56255884.camel@sipsolutions.net/

Signed-off-by: Johannes Berg <johannes.berg@intel.com>
Reviewed-by: Jiri Pirko <jiri@nvidia.com>
Link: https://lore.kernel.org/r/20231204214706.303c62768415.I1caedccae72ee5a45c9085c5eb49c145ce1c0dd5@changeid
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/linux/netdevice.h | 9 +++++++++
 net/core/dev.c            | 2 +-
 net/core/dev.h            | 1 -
 net/core/link_watch.c     | 2 +-
 net/core/net-sysfs.c      | 8 +++++++-
 net/core/rtnetlink.c      | 8 ++++++++
 net/ethtool/ioctl.c       | 3 +++
 7 files changed, 29 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index cb96aad6a6ee..1b935ee341b4 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -4229,6 +4229,15 @@ static inline void netdev_ref_replace(struct net_device *odev,
  */
 void linkwatch_fire_event(struct net_device *dev);
 
+/**
+ * linkwatch_sync_dev - sync linkwatch for the given device
+ * @dev: network device to sync linkwatch for
+ *
+ * Sync linkwatch for the given device, removing it from the
+ * pending work list (if queued).
+ */
+void linkwatch_sync_dev(struct net_device *dev);
+
 /**
  *	netif_carrier_ok - test if carrier present
  *	@dev: network device
diff --git a/net/core/dev.c b/net/core/dev.c
index c5679dfbaa70..0432b04cf9b0 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -10548,7 +10548,7 @@ void netdev_run_todo(void)
 		write_lock(&dev_base_lock);
 		dev->reg_state = NETREG_UNREGISTERED;
 		write_unlock(&dev_base_lock);
-		linkwatch_forget_dev(dev);
+		linkwatch_sync_dev(dev);
 	}
 
 	while (!list_empty(&list)) {
diff --git a/net/core/dev.h b/net/core/dev.h
index 7795b8ad841d..cf93e188785b 100644
--- a/net/core/dev.h
+++ b/net/core/dev.h
@@ -30,7 +30,6 @@ int __init dev_proc_init(void);
 #endif
 
 void linkwatch_init_dev(struct net_device *dev);
-void linkwatch_forget_dev(struct net_device *dev);
 void linkwatch_run_queue(void);
 
 void dev_addr_flush(struct net_device *dev);
diff --git a/net/core/link_watch.c b/net/core/link_watch.c
index c469d1c4db5d..a19f21403339 100644
--- a/net/core/link_watch.c
+++ b/net/core/link_watch.c
@@ -245,7 +245,7 @@ static void __linkwatch_run_queue(int urgent_only)
 	spin_unlock_irq(&lweventlist_lock);
 }
 
-void linkwatch_forget_dev(struct net_device *dev)
+void linkwatch_sync_dev(struct net_device *dev)
 {
 	unsigned long flags;
 	int clean = 0;
diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c
index fccaa5bac0ed..d9b33e923b18 100644
--- a/net/core/net-sysfs.c
+++ b/net/core/net-sysfs.c
@@ -194,8 +194,14 @@ static ssize_t carrier_show(struct device *dev,
 {
 	struct net_device *netdev = to_net_dev(dev);
 
-	if (netif_running(netdev))
+	if (netif_running(netdev)) {
+		/* Synchronize carrier state with link watch,
+		 * see also rtnl_getlink().
+		 */
+		linkwatch_sync_dev(netdev);
+
 		return sysfs_emit(buf, fmt_dec, !!netif_carrier_ok(netdev));
+	}
 
 	return -EINVAL;
 }
diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index 592164c2a540..5e0ab4c08f72 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -3853,6 +3853,14 @@ static int rtnl_getlink(struct sk_buff *skb, struct nlmsghdr *nlh,
 	if (nskb == NULL)
 		goto out;
 
+	/* Synchronize the carrier state so we don't report a state
+	 * that we're not actually going to honour immediately; if
+	 * the driver just did a carrier off->on transition, we can
+	 * only TX if link watch work has run, but without this we'd
+	 * already report carrier on, even if it doesn't work yet.
+	 */
+	linkwatch_sync_dev(dev);
+
 	err = rtnl_fill_ifinfo(nskb, dev, net,
 			       RTM_NEWLINK, NETLINK_CB(skb).portid,
 			       nlh->nlmsg_seq, 0, 0, ext_filter_mask,
diff --git a/net/ethtool/ioctl.c b/net/ethtool/ioctl.c
index 0b0ce4f81c01..a977f8903467 100644
--- a/net/ethtool/ioctl.c
+++ b/net/ethtool/ioctl.c
@@ -58,6 +58,9 @@ static struct devlink *netdev_to_devlink_get(struct net_device *dev)
 
 u32 ethtool_op_get_link(struct net_device *dev)
 {
+	/* Synchronize carrier state with link watch, see also rtnl_getlink() */
+	linkwatch_sync_dev(dev);
+
 	return netif_carrier_ok(dev) ? 1 : 0;
 }
 EXPORT_SYMBOL(ethtool_op_get_link);
-- 
cgit v1.2.3


From 7037d95a047cd89b1f680eed253c6ab586bef1ed Mon Sep 17 00:00:00 2001
From: Kelly Kane <kelly@hawknetworks.com>
Date: Sat, 2 Dec 2023 17:17:12 -0800
Subject: r8152: add vendor/device ID pair for ASUS USB-C2500

The ASUS USB-C2500 is an RTL8156 based 2.5G Ethernet controller.

Add the vendor and product ID values to the driver. This makes Ethernet
work with the adapter.

Signed-off-by: Kelly Kane <kelly@hawknetworks.com>
Link: https://lore.kernel.org/r/20231203011712.6314-1-kelly@hawknetworks.com
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
 drivers/net/usb/r8152.c   | 1 +
 include/linux/usb/r8152.h | 1 +
 2 files changed, 2 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/net/usb/r8152.c b/drivers/net/usb/r8152.c
index fcdc9ba0f826..9bf2140fd0a1 100644
--- a/drivers/net/usb/r8152.c
+++ b/drivers/net/usb/r8152.c
@@ -10049,6 +10049,7 @@ static const struct usb_device_id rtl8152_table[] = {
 	{ USB_DEVICE(VENDOR_ID_NVIDIA,  0x09ff) },
 	{ USB_DEVICE(VENDOR_ID_TPLINK,  0x0601) },
 	{ USB_DEVICE(VENDOR_ID_DLINK,   0xb301) },
+	{ USB_DEVICE(VENDOR_ID_ASUS,    0x1976) },
 	{}
 };
 
diff --git a/include/linux/usb/r8152.h b/include/linux/usb/r8152.h
index 287e9d83fb8b..33a4c146dc19 100644
--- a/include/linux/usb/r8152.h
+++ b/include/linux/usb/r8152.h
@@ -30,6 +30,7 @@
 #define VENDOR_ID_NVIDIA		0x0955
 #define VENDOR_ID_TPLINK		0x2357
 #define VENDOR_ID_DLINK			0x2001
+#define VENDOR_ID_ASUS			0x0b05
 
 #if IS_REACHABLE(CONFIG_USB_RTL8152)
 extern u8 rtl8152_get_version(struct usb_interface *intf);
-- 
cgit v1.2.3


From a3a44d2d3a5c5ff6e73c711db5b1911b5a676bb0 Mon Sep 17 00:00:00 2001
From: Even Xu <even.xu@intel.com>
Date: Tue, 5 Dec 2023 09:50:30 +0800
Subject: HID: Intel-ish-hid: Ishtp: Add helper functions for client connection

For every ishtp client driver during initialization state, the flow is:
1 - Allocate an ISHTP client instance
2 - Reserve a host id and link the client instance
3 - Search a firmware client using UUID and get related
    client information
4 - Bind firmware client id to the ISHTP client instance
5 - Set the state the ISHTP client instance to CONNECTING
6 - Send connect request to firmware
7 - Register event callback for messages from the firmware

During deinitizalization state, the flow is:
9 - Set the state the ISHTP client instance to ISHTP_CL_DISCONNECTING
10 - Issue disconnect request to firmware
11 - Unlike the client instance
12 - Flush message queue
13 - Free ISHTP client instance

Step 2-7 are identical to the steps of client driver initialization
and driver reset flow, but reallocation of the RX/TX ring buffers
can be avoided in reset flow.

Also for step 9-12, they are identical to the steps of client driver
failure handling after connect request, driver reset flow and
driver removing.

So, add two helper functions to simplify client driver code.
ishtp_cl_establish_connection()
ishtp_cl_destroy_connection()

No functional changes are expected.

Signed-off-by: Even Xu <even.xu@intel.com>
Acked-by: Srinivas Pandruvada <srinivas.pandruvada@linux.intel.com>
Signed-off-by: Jiri Kosina <jkosina@suse.com>
---
 drivers/hid/intel-ish-hid/ishtp/client.c | 185 +++++++++++++++++++++++++++++--
 include/linux/intel-ish-client-if.h      |   3 +
 2 files changed, 177 insertions(+), 11 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/hid/intel-ish-hid/ishtp/client.c b/drivers/hid/intel-ish-hid/ishtp/client.c
index 2d92fc129ce4..82c907f01bd3 100644
--- a/drivers/hid/intel-ish-hid/ishtp/client.c
+++ b/drivers/hid/intel-ish-hid/ishtp/client.c
@@ -339,16 +339,17 @@ static bool ishtp_cl_is_other_connecting(struct ishtp_cl *cl)
 }
 
 /**
- * ishtp_cl_connect() - Send connect request to firmware
+ * ishtp_cl_connect_to_fw() - Send connect request to firmware
  * @cl: client device instance
  *
- * Send a connect request for a client to firmware. If successful it will
- * RX and TX ring buffers
+ * Send a connect request to the firmware and wait for firmware response.
+ * If there is successful connection response from the firmware, change
+ * client state to ISHTP_CL_CONNECTED, and bind client to related
+ * firmware client_id.
  *
- * Return: 0 if successful connect response from the firmware and able
- * to bind and allocate ring buffers or error code on failure
+ * Return: 0 for success and error code on failure
  */
-int ishtp_cl_connect(struct ishtp_cl *cl)
+static int ishtp_cl_connect_to_fw(struct ishtp_cl *cl)
 {
 	struct ishtp_device *dev;
 	int rets;
@@ -358,8 +359,6 @@ int ishtp_cl_connect(struct ishtp_cl *cl)
 
 	dev = cl->dev;
 
-	dev->print_log(dev, "%s() current_state = %d\n", __func__, cl->state);
-
 	if (ishtp_cl_is_other_connecting(cl)) {
 		dev->print_log(dev, "%s() Busy\n", __func__);
 		return	-EBUSY;
@@ -405,6 +404,38 @@ int ishtp_cl_connect(struct ishtp_cl *cl)
 		return rets;
 	}
 
+	return rets;
+}
+
+/**
+ * ishtp_cl_connect() - Build connection with firmware
+ * @cl: client device instance
+ *
+ * Call ishtp_cl_connect_to_fw() to connect and bind to firmware. If successful,
+ * allocate RX and TX ring buffers, and start flow control with firmware to
+ * start communication.
+ *
+ * Return: 0 if there is successful connection to the firmware, allocate
+ * ring buffers.
+ */
+int ishtp_cl_connect(struct ishtp_cl *cl)
+{
+	struct ishtp_device *dev;
+	int rets;
+
+	if (!cl || !cl->dev)
+		return -ENODEV;
+
+	dev = cl->dev;
+
+	dev->print_log(dev, "%s() current_state = %d\n", __func__, cl->state);
+
+	rets = ishtp_cl_connect_to_fw(cl);
+	if (rets) {
+		dev->print_log(dev, "%s() Connect to fw failed\n", __func__);
+		return rets;
+	}
+
 	rets = ishtp_cl_alloc_rx_ring(cl);
 	if (rets) {
 		dev->print_log(dev, "%s() Alloc RX ring failed\n", __func__);
@@ -422,15 +453,147 @@ int ishtp_cl_connect(struct ishtp_cl *cl)
 		return rets;
 	}
 
-	/* Upon successful connection and allocation, emit flow-control */
+	/*
+	 * Upon successful connection and allocation, start flow-control.
+	 */
 	rets = ishtp_cl_read_start(cl);
 
-	dev->print_log(dev, "%s() successful\n", __func__);
-
 	return rets;
 }
 EXPORT_SYMBOL(ishtp_cl_connect);
 
+/**
+ * ishtp_cl_establish_connection() - Establish connection with the firmware
+ * @cl: client device instance
+ * @uuid: uuid of the client to search
+ * @tx_size: TX ring buffer size
+ * @rx_size: RX ring buffer size
+ * @reset: true if called for reset connection, otherwise for first connection
+ *
+ * This is a helper function for client driver to build connection with firmware.
+ * If it's first time connecting to the firmware, set reset to false, this
+ * function will link client to bus, find client id and send connect request to
+ * the firmware.
+ *
+ * If it's called for reset handler where client lost connection after
+ * firmware reset, set reset to true, this function will reinit client state and
+ * establish connection again. In this case, this function reuses current client
+ * structure and ring buffers to avoid allocation failure and memory fragments.
+ *
+ * Return: 0 for successful connection with the firmware,
+ * or error code on failure
+ */
+int ishtp_cl_establish_connection(struct ishtp_cl *cl, const guid_t *uuid,
+				  int tx_size, int rx_size, bool reset)
+{
+	struct ishtp_device *dev;
+	struct ishtp_fw_client *fw_client;
+	int rets;
+
+	if (!cl || !cl->dev)
+		return -ENODEV;
+
+	dev = cl->dev;
+
+	ishtp_set_connection_state(cl, ISHTP_CL_INITIALIZING);
+
+	/* reinit ishtp_cl structure if call for reset */
+	if (reset) {
+		cl->host_client_id = 0;
+		cl->fw_client_id = 0;
+		cl->ishtp_flow_ctrl_creds = 0;
+		cl->out_flow_ctrl_creds = 0;
+
+		cl->last_tx_path = CL_TX_PATH_IPC;
+		cl->last_dma_acked = 1;
+		cl->last_dma_addr = NULL;
+		cl->last_ipc_acked = 1;
+
+		cl->sending = 0;
+		cl->err_send_msg = 0;
+		cl->err_send_fc = 0;
+
+		cl->send_msg_cnt_ipc = 0;
+		cl->send_msg_cnt_dma = 0;
+		cl->recv_msg_cnt_ipc = 0;
+		cl->recv_msg_cnt_dma = 0;
+		cl->recv_msg_num_frags = 0;
+		cl->ishtp_flow_ctrl_cnt = 0;
+		cl->out_flow_ctrl_cnt = 0;
+	}
+
+	/* link to bus */
+	rets = ishtp_cl_link(cl);
+	if (rets) {
+		dev->print_log(dev, "%s() ishtp_cl_link failed\n", __func__);
+		return rets;
+	}
+
+	/* find firmware client */
+	fw_client = ishtp_fw_cl_get_client(dev, uuid);
+	if (!fw_client) {
+		dev->print_log(dev,
+			       "%s() ish client uuid not found\n", __func__);
+		return -ENOENT;
+	}
+
+	ishtp_set_tx_ring_size(cl, tx_size);
+	ishtp_set_rx_ring_size(cl, rx_size);
+
+	ishtp_cl_set_fw_client_id(cl, ishtp_get_fw_client_id(fw_client));
+
+	ishtp_set_connection_state(cl, ISHTP_CL_CONNECTING);
+
+	/*
+	 * For reset case, not allocate tx/rx ring buffer which are already
+	 * done in ishtp_cl_connect() during first connection.
+	 */
+	if (reset) {
+		rets = ishtp_cl_connect_to_fw(cl);
+		if (!rets)
+			rets = ishtp_cl_read_start(cl);
+		else
+			dev->print_log(dev,
+				"%s() connect to fw failed\n", __func__);
+	} else {
+		rets = ishtp_cl_connect(cl);
+	}
+
+	return rets;
+}
+EXPORT_SYMBOL(ishtp_cl_establish_connection);
+
+/**
+ * ishtp_cl_destroy_connection() - Disconnect with the firmware
+ * @cl: client device instance
+ * @reset: true if called for firmware reset, false for normal disconnection
+ *
+ * This is a helper function for client driver to disconnect with firmware,
+ * unlink to bus and flush message queue.
+ */
+void ishtp_cl_destroy_connection(struct ishtp_cl *cl, bool reset)
+{
+	if (!cl)
+		return;
+
+	if (reset) {
+		/*
+		 * For reset case, connection is already lost during fw reset.
+		 * Just set state to DISCONNECTED is enough.
+		 */
+		ishtp_set_connection_state(cl, ISHTP_CL_DISCONNECTED);
+	} else {
+		if (cl->state != ISHTP_CL_DISCONNECTED) {
+			ishtp_set_connection_state(cl, ISHTP_CL_DISCONNECTING);
+			ishtp_cl_disconnect(cl);
+		}
+	}
+
+	ishtp_cl_unlink(cl);
+	ishtp_cl_flush_queues(cl);
+}
+EXPORT_SYMBOL(ishtp_cl_destroy_connection);
+
 /**
  * ishtp_cl_read_start() - Prepare to read client message
  * @cl: client device instance
diff --git a/include/linux/intel-ish-client-if.h b/include/linux/intel-ish-client-if.h
index f45f13304add..771622650247 100644
--- a/include/linux/intel-ish-client-if.h
+++ b/include/linux/intel-ish-client-if.h
@@ -94,6 +94,9 @@ int ishtp_cl_link(struct ishtp_cl *cl);
 void ishtp_cl_unlink(struct ishtp_cl *cl);
 int ishtp_cl_disconnect(struct ishtp_cl *cl);
 int ishtp_cl_connect(struct ishtp_cl *cl);
+int ishtp_cl_establish_connection(struct ishtp_cl *cl, const guid_t *uuid,
+				  int tx_size, int rx_size, bool reset);
+void ishtp_cl_destroy_connection(struct ishtp_cl *cl, bool reset);
 int ishtp_cl_send(struct ishtp_cl *cl, uint8_t *buf, size_t length);
 int ishtp_cl_flush_queues(struct ishtp_cl *cl);
 int ishtp_cl_io_rb_recycle(struct ishtp_cl_rb *rb);
-- 
cgit v1.2.3


From 16a1d968358aa9e897ce995fa45cb15d55a0e83d Mon Sep 17 00:00:00 2001
From: Vlastimil Babka <vbabka@suse.cz>
Date: Mon, 2 Oct 2023 20:43:43 +0200
Subject: mm/slab: remove mm/slab.c and slab_def.h

Remove the SLAB implementation. Update CREDITS.
Also update and properly sort the SLOB entry there.

RIP SLAB allocator (1996 - 2024)

Reviewed-by: Kees Cook <keescook@chromium.org>
Acked-by: Christoph Lameter <cl@linux.com>
Acked-by: David Rientjes <rientjes@google.com>
Tested-by: David Rientjes <rientjes@google.com>
Acked-by: Hyeonggon Yoo <42.hyeyoo@gmail.com>
Tested-by: Hyeonggon Yoo <42.hyeyoo@gmail.com>
Signed-off-by: Vlastimil Babka <vbabka@suse.cz>
---
 CREDITS                  |   12 +-
 include/linux/slab_def.h |  124 --
 mm/slab.c                | 4005 ----------------------------------------------
 3 files changed, 8 insertions(+), 4133 deletions(-)
 delete mode 100644 include/linux/slab_def.h
 delete mode 100644 mm/slab.c

(limited to 'include/linux')

diff --git a/CREDITS b/CREDITS
index f33a33fd2371..e9a094a93287 100644
--- a/CREDITS
+++ b/CREDITS
@@ -9,10 +9,6 @@
 			Linus
 ----------
 
-N: Matt Mackal
-E: mpm@selenic.com
-D: SLOB slab allocator
-
 N: Matti Aarnio
 E: mea@nic.funet.fi
 D: Alpha systems hacking, IPv6 and other network related stuff
@@ -1572,6 +1568,10 @@ S: Ampferstr. 50 / 4
 S: 6020 Innsbruck
 S: Austria
 
+N: Mark Hemment
+E: markhe@nextd.demon.co.uk
+D: SLAB allocator implementation
+
 N: Richard Henderson
 E: rth@twiddle.net
 E: rth@cygnus.com
@@ -2437,6 +2437,10 @@ D: work on suspend-to-ram/disk, killing duplicates from ioctl32,
 D: Altera SoCFPGA and Nokia N900 support.
 S: Czech Republic
 
+N: Olivia Mackall
+E: olivia@selenic.com
+D: SLOB slab allocator
+
 N: Paul Mackerras
 E: paulus@samba.org
 D: PPP driver
diff --git a/include/linux/slab_def.h b/include/linux/slab_def.h
deleted file mode 100644
index a61e7d55d0d3..000000000000
--- a/include/linux/slab_def.h
+++ /dev/null
@@ -1,124 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _LINUX_SLAB_DEF_H
-#define	_LINUX_SLAB_DEF_H
-
-#include <linux/kfence.h>
-#include <linux/reciprocal_div.h>
-
-/*
- * Definitions unique to the original Linux SLAB allocator.
- */
-
-struct kmem_cache {
-	struct array_cache __percpu *cpu_cache;
-
-/* 1) Cache tunables. Protected by slab_mutex */
-	unsigned int batchcount;
-	unsigned int limit;
-	unsigned int shared;
-
-	unsigned int size;
-	struct reciprocal_value reciprocal_buffer_size;
-/* 2) touched by every alloc & free from the backend */
-
-	slab_flags_t flags;		/* constant flags */
-	unsigned int num;		/* # of objs per slab */
-
-/* 3) cache_grow/shrink */
-	/* order of pgs per slab (2^n) */
-	unsigned int gfporder;
-
-	/* force GFP flags, e.g. GFP_DMA */
-	gfp_t allocflags;
-
-	size_t colour;			/* cache colouring range */
-	unsigned int colour_off;	/* colour offset */
-	unsigned int freelist_size;
-
-	/* constructor func */
-	void (*ctor)(void *obj);
-
-/* 4) cache creation/removal */
-	const char *name;
-	struct list_head list;
-	int refcount;
-	int object_size;
-	int align;
-
-/* 5) statistics */
-#ifdef CONFIG_DEBUG_SLAB
-	unsigned long num_active;
-	unsigned long num_allocations;
-	unsigned long high_mark;
-	unsigned long grown;
-	unsigned long reaped;
-	unsigned long errors;
-	unsigned long max_freeable;
-	unsigned long node_allocs;
-	unsigned long node_frees;
-	unsigned long node_overflow;
-	atomic_t allochit;
-	atomic_t allocmiss;
-	atomic_t freehit;
-	atomic_t freemiss;
-
-	/*
-	 * If debugging is enabled, then the allocator can add additional
-	 * fields and/or padding to every object. 'size' contains the total
-	 * object size including these internal fields, while 'obj_offset'
-	 * and 'object_size' contain the offset to the user object and its
-	 * size.
-	 */
-	int obj_offset;
-#endif /* CONFIG_DEBUG_SLAB */
-
-#ifdef CONFIG_KASAN_GENERIC
-	struct kasan_cache kasan_info;
-#endif
-
-#ifdef CONFIG_SLAB_FREELIST_RANDOM
-	unsigned int *random_seq;
-#endif
-
-#ifdef CONFIG_HARDENED_USERCOPY
-	unsigned int useroffset;	/* Usercopy region offset */
-	unsigned int usersize;		/* Usercopy region size */
-#endif
-
-	struct kmem_cache_node *node[MAX_NUMNODES];
-};
-
-static inline void *nearest_obj(struct kmem_cache *cache, const struct slab *slab,
-				void *x)
-{
-	void *object = x - (x - slab->s_mem) % cache->size;
-	void *last_object = slab->s_mem + (cache->num - 1) * cache->size;
-
-	if (unlikely(object > last_object))
-		return last_object;
-	else
-		return object;
-}
-
-/*
- * We want to avoid an expensive divide : (offset / cache->size)
- *   Using the fact that size is a constant for a particular cache,
- *   we can replace (offset / cache->size) by
- *   reciprocal_divide(offset, cache->reciprocal_buffer_size)
- */
-static inline unsigned int obj_to_index(const struct kmem_cache *cache,
-					const struct slab *slab, void *obj)
-{
-	u32 offset = (obj - slab->s_mem);
-	return reciprocal_divide(offset, cache->reciprocal_buffer_size);
-}
-
-static inline int objs_per_slab(const struct kmem_cache *cache,
-				     const struct slab *slab)
-{
-	if (is_kfence_address(slab_address(slab)))
-		return 1;
-	return cache->num;
-}
-
-#endif	/* _LINUX_SLAB_DEF_H */
diff --git a/mm/slab.c b/mm/slab.c
deleted file mode 100644
index 37efe3241f9c..000000000000
--- a/mm/slab.c
+++ /dev/null
@@ -1,4005 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * linux/mm/slab.c
- * Written by Mark Hemment, 1996/97.
- * (markhe@nextd.demon.co.uk)
- *
- * kmem_cache_destroy() + some cleanup - 1999 Andrea Arcangeli
- *
- * Major cleanup, different bufctl logic, per-cpu arrays
- *	(c) 2000 Manfred Spraul
- *
- * Cleanup, make the head arrays unconditional, preparation for NUMA
- * 	(c) 2002 Manfred Spraul
- *
- * An implementation of the Slab Allocator as described in outline in;
- *	UNIX Internals: The New Frontiers by Uresh Vahalia
- *	Pub: Prentice Hall	ISBN 0-13-101908-2
- * or with a little more detail in;
- *	The Slab Allocator: An Object-Caching Kernel Memory Allocator
- *	Jeff Bonwick (Sun Microsystems).
- *	Presented at: USENIX Summer 1994 Technical Conference
- *
- * The memory is organized in caches, one cache for each object type.
- * (e.g. inode_cache, dentry_cache, buffer_head, vm_area_struct)
- * Each cache consists out of many slabs (they are small (usually one
- * page long) and always contiguous), and each slab contains multiple
- * initialized objects.
- *
- * This means, that your constructor is used only for newly allocated
- * slabs and you must pass objects with the same initializations to
- * kmem_cache_free.
- *
- * Each cache can only support one memory type (GFP_DMA, GFP_HIGHMEM,
- * normal). If you need a special memory type, then must create a new
- * cache for that memory type.
- *
- * In order to reduce fragmentation, the slabs are sorted in 3 groups:
- *   full slabs with 0 free objects
- *   partial slabs
- *   empty slabs with no allocated objects
- *
- * If partial slabs exist, then new allocations come from these slabs,
- * otherwise from empty slabs or new slabs are allocated.
- *
- * kmem_cache_destroy() CAN CRASH if you try to allocate from the cache
- * during kmem_cache_destroy(). The caller must prevent concurrent allocs.
- *
- * Each cache has a short per-cpu head array, most allocs
- * and frees go into that array, and if that array overflows, then 1/2
- * of the entries in the array are given back into the global cache.
- * The head array is strictly LIFO and should improve the cache hit rates.
- * On SMP, it additionally reduces the spinlock operations.
- *
- * The c_cpuarray may not be read with enabled local interrupts -
- * it's changed with a smp_call_function().
- *
- * SMP synchronization:
- *  constructors and destructors are called without any locking.
- *  Several members in struct kmem_cache and struct slab never change, they
- *	are accessed without any locking.
- *  The per-cpu arrays are never accessed from the wrong cpu, no locking,
- *  	and local interrupts are disabled so slab code is preempt-safe.
- *  The non-constant members are protected with a per-cache irq spinlock.
- *
- * Many thanks to Mark Hemment, who wrote another per-cpu slab patch
- * in 2000 - many ideas in the current implementation are derived from
- * his patch.
- *
- * Further notes from the original documentation:
- *
- * 11 April '97.  Started multi-threading - markhe
- *	The global cache-chain is protected by the mutex 'slab_mutex'.
- *	The sem is only needed when accessing/extending the cache-chain, which
- *	can never happen inside an interrupt (kmem_cache_create(),
- *	kmem_cache_shrink() and kmem_cache_reap()).
- *
- *	At present, each engine can be growing a cache.  This should be blocked.
- *
- * 15 March 2005. NUMA slab allocator.
- *	Shai Fultheim <shai@scalex86.org>.
- *	Shobhit Dayal <shobhit@calsoftinc.com>
- *	Alok N Kataria <alokk@calsoftinc.com>
- *	Christoph Lameter <christoph@lameter.com>
- *
- *	Modified the slab allocator to be node aware on NUMA systems.
- *	Each node has its own list of partial, free and full slabs.
- *	All object allocations for a node occur from node specific slab lists.
- */
-
-#include	<linux/slab.h>
-#include	<linux/mm.h>
-#include	<linux/poison.h>
-#include	<linux/swap.h>
-#include	<linux/cache.h>
-#include	<linux/interrupt.h>
-#include	<linux/init.h>
-#include	<linux/compiler.h>
-#include	<linux/cpuset.h>
-#include	<linux/proc_fs.h>
-#include	<linux/seq_file.h>
-#include	<linux/notifier.h>
-#include	<linux/kallsyms.h>
-#include	<linux/kfence.h>
-#include	<linux/cpu.h>
-#include	<linux/sysctl.h>
-#include	<linux/module.h>
-#include	<linux/rcupdate.h>
-#include	<linux/string.h>
-#include	<linux/uaccess.h>
-#include	<linux/nodemask.h>
-#include	<linux/kmemleak.h>
-#include	<linux/mempolicy.h>
-#include	<linux/mutex.h>
-#include	<linux/fault-inject.h>
-#include	<linux/rtmutex.h>
-#include	<linux/reciprocal_div.h>
-#include	<linux/debugobjects.h>
-#include	<linux/memory.h>
-#include	<linux/prefetch.h>
-#include	<linux/sched/task_stack.h>
-
-#include	<net/sock.h>
-
-#include	<asm/cacheflush.h>
-#include	<asm/tlbflush.h>
-#include	<asm/page.h>
-
-#include <trace/events/kmem.h>
-
-#include	"internal.h"
-
-#include	"slab.h"
-
-/*
- * DEBUG	- 1 for kmem_cache_create() to honour; SLAB_RED_ZONE & SLAB_POISON.
- *		  0 for faster, smaller code (especially in the critical paths).
- *
- * STATS	- 1 to collect stats for /proc/slabinfo.
- *		  0 for faster, smaller code (especially in the critical paths).
- *
- * FORCED_DEBUG	- 1 enables SLAB_RED_ZONE and SLAB_POISON (if possible)
- */
-
-#ifdef CONFIG_DEBUG_SLAB
-#define	DEBUG		1
-#define	STATS		1
-#define	FORCED_DEBUG	1
-#else
-#define	DEBUG		0
-#define	STATS		0
-#define	FORCED_DEBUG	0
-#endif
-
-/* Shouldn't this be in a header file somewhere? */
-#define	BYTES_PER_WORD		sizeof(void *)
-#define	REDZONE_ALIGN		max(BYTES_PER_WORD, __alignof__(unsigned long long))
-
-#ifndef ARCH_KMALLOC_FLAGS
-#define ARCH_KMALLOC_FLAGS SLAB_HWCACHE_ALIGN
-#endif
-
-#define FREELIST_BYTE_INDEX (((PAGE_SIZE >> BITS_PER_BYTE) \
-				<= SLAB_OBJ_MIN_SIZE) ? 1 : 0)
-
-#if FREELIST_BYTE_INDEX
-typedef unsigned char freelist_idx_t;
-#else
-typedef unsigned short freelist_idx_t;
-#endif
-
-#define SLAB_OBJ_MAX_NUM ((1 << sizeof(freelist_idx_t) * BITS_PER_BYTE) - 1)
-
-/*
- * struct array_cache
- *
- * Purpose:
- * - LIFO ordering, to hand out cache-warm objects from _alloc
- * - reduce the number of linked list operations
- * - reduce spinlock operations
- *
- * The limit is stored in the per-cpu structure to reduce the data cache
- * footprint.
- *
- */
-struct array_cache {
-	unsigned int avail;
-	unsigned int limit;
-	unsigned int batchcount;
-	unsigned int touched;
-	void *entry[];	/*
-			 * Must have this definition in here for the proper
-			 * alignment of array_cache. Also simplifies accessing
-			 * the entries.
-			 */
-};
-
-struct alien_cache {
-	spinlock_t lock;
-	struct array_cache ac;
-};
-
-/*
- * Need this for bootstrapping a per node allocator.
- */
-#define NUM_INIT_LISTS (2 * MAX_NUMNODES)
-static struct kmem_cache_node __initdata init_kmem_cache_node[NUM_INIT_LISTS];
-#define	CACHE_CACHE 0
-#define	SIZE_NODE (MAX_NUMNODES)
-
-static int drain_freelist(struct kmem_cache *cache,
-			struct kmem_cache_node *n, int tofree);
-static void free_block(struct kmem_cache *cachep, void **objpp, int len,
-			int node, struct list_head *list);
-static void slabs_destroy(struct kmem_cache *cachep, struct list_head *list);
-static int enable_cpucache(struct kmem_cache *cachep, gfp_t gfp);
-static void cache_reap(struct work_struct *unused);
-
-static inline void fixup_objfreelist_debug(struct kmem_cache *cachep,
-						void **list);
-static inline void fixup_slab_list(struct kmem_cache *cachep,
-				struct kmem_cache_node *n, struct slab *slab,
-				void **list);
-
-#define INDEX_NODE kmalloc_index(sizeof(struct kmem_cache_node))
-
-static void kmem_cache_node_init(struct kmem_cache_node *parent)
-{
-	INIT_LIST_HEAD(&parent->slabs_full);
-	INIT_LIST_HEAD(&parent->slabs_partial);
-	INIT_LIST_HEAD(&parent->slabs_free);
-	parent->total_slabs = 0;
-	parent->free_slabs = 0;
-	parent->shared = NULL;
-	parent->alien = NULL;
-	parent->colour_next = 0;
-	raw_spin_lock_init(&parent->list_lock);
-	parent->free_objects = 0;
-	parent->free_touched = 0;
-}
-
-#define MAKE_LIST(cachep, listp, slab, nodeid)				\
-	do {								\
-		INIT_LIST_HEAD(listp);					\
-		list_splice(&get_node(cachep, nodeid)->slab, listp);	\
-	} while (0)
-
-#define	MAKE_ALL_LISTS(cachep, ptr, nodeid)				\
-	do {								\
-	MAKE_LIST((cachep), (&(ptr)->slabs_full), slabs_full, nodeid);	\
-	MAKE_LIST((cachep), (&(ptr)->slabs_partial), slabs_partial, nodeid); \
-	MAKE_LIST((cachep), (&(ptr)->slabs_free), slabs_free, nodeid);	\
-	} while (0)
-
-#define CFLGS_OBJFREELIST_SLAB	((slab_flags_t __force)0x40000000U)
-#define CFLGS_OFF_SLAB		((slab_flags_t __force)0x80000000U)
-#define	OBJFREELIST_SLAB(x)	((x)->flags & CFLGS_OBJFREELIST_SLAB)
-#define	OFF_SLAB(x)	((x)->flags & CFLGS_OFF_SLAB)
-
-#define BATCHREFILL_LIMIT	16
-/*
- * Optimization question: fewer reaps means less probability for unnecessary
- * cpucache drain/refill cycles.
- *
- * OTOH the cpuarrays can contain lots of objects,
- * which could lock up otherwise freeable slabs.
- */
-#define REAPTIMEOUT_AC		(2*HZ)
-#define REAPTIMEOUT_NODE	(4*HZ)
-
-#if STATS
-#define	STATS_INC_ACTIVE(x)	((x)->num_active++)
-#define	STATS_DEC_ACTIVE(x)	((x)->num_active--)
-#define	STATS_INC_ALLOCED(x)	((x)->num_allocations++)
-#define	STATS_INC_GROWN(x)	((x)->grown++)
-#define	STATS_ADD_REAPED(x, y)	((x)->reaped += (y))
-#define	STATS_SET_HIGH(x)						\
-	do {								\
-		if ((x)->num_active > (x)->high_mark)			\
-			(x)->high_mark = (x)->num_active;		\
-	} while (0)
-#define	STATS_INC_ERR(x)	((x)->errors++)
-#define	STATS_INC_NODEALLOCS(x)	((x)->node_allocs++)
-#define	STATS_INC_NODEFREES(x)	((x)->node_frees++)
-#define STATS_INC_ACOVERFLOW(x)   ((x)->node_overflow++)
-#define	STATS_SET_FREEABLE(x, i)					\
-	do {								\
-		if ((x)->max_freeable < i)				\
-			(x)->max_freeable = i;				\
-	} while (0)
-#define STATS_INC_ALLOCHIT(x)	atomic_inc(&(x)->allochit)
-#define STATS_INC_ALLOCMISS(x)	atomic_inc(&(x)->allocmiss)
-#define STATS_INC_FREEHIT(x)	atomic_inc(&(x)->freehit)
-#define STATS_INC_FREEMISS(x)	atomic_inc(&(x)->freemiss)
-#else
-#define	STATS_INC_ACTIVE(x)	do { } while (0)
-#define	STATS_DEC_ACTIVE(x)	do { } while (0)
-#define	STATS_INC_ALLOCED(x)	do { } while (0)
-#define	STATS_INC_GROWN(x)	do { } while (0)
-#define	STATS_ADD_REAPED(x, y)	do { (void)(y); } while (0)
-#define	STATS_SET_HIGH(x)	do { } while (0)
-#define	STATS_INC_ERR(x)	do { } while (0)
-#define	STATS_INC_NODEALLOCS(x)	do { } while (0)
-#define	STATS_INC_NODEFREES(x)	do { } while (0)
-#define STATS_INC_ACOVERFLOW(x)   do { } while (0)
-#define	STATS_SET_FREEABLE(x, i) do { } while (0)
-#define STATS_INC_ALLOCHIT(x)	do { } while (0)
-#define STATS_INC_ALLOCMISS(x)	do { } while (0)
-#define STATS_INC_FREEHIT(x)	do { } while (0)
-#define STATS_INC_FREEMISS(x)	do { } while (0)
-#endif
-
-#if DEBUG
-
-/*
- * memory layout of objects:
- * 0		: objp
- * 0 .. cachep->obj_offset - BYTES_PER_WORD - 1: padding. This ensures that
- * 		the end of an object is aligned with the end of the real
- * 		allocation. Catches writes behind the end of the allocation.
- * cachep->obj_offset - BYTES_PER_WORD .. cachep->obj_offset - 1:
- * 		redzone word.
- * cachep->obj_offset: The real object.
- * cachep->size - 2* BYTES_PER_WORD: redzone word [BYTES_PER_WORD long]
- * cachep->size - 1* BYTES_PER_WORD: last caller address
- *					[BYTES_PER_WORD long]
- */
-static int obj_offset(struct kmem_cache *cachep)
-{
-	return cachep->obj_offset;
-}
-
-static unsigned long long *dbg_redzone1(struct kmem_cache *cachep, void *objp)
-{
-	BUG_ON(!(cachep->flags & SLAB_RED_ZONE));
-	return (unsigned long long *) (objp + obj_offset(cachep) -
-				      sizeof(unsigned long long));
-}
-
-static unsigned long long *dbg_redzone2(struct kmem_cache *cachep, void *objp)
-{
-	BUG_ON(!(cachep->flags & SLAB_RED_ZONE));
-	if (cachep->flags & SLAB_STORE_USER)
-		return (unsigned long long *)(objp + cachep->size -
-					      sizeof(unsigned long long) -
-					      REDZONE_ALIGN);
-	return (unsigned long long *) (objp + cachep->size -
-				       sizeof(unsigned long long));
-}
-
-static void **dbg_userword(struct kmem_cache *cachep, void *objp)
-{
-	BUG_ON(!(cachep->flags & SLAB_STORE_USER));
-	return (void **)(objp + cachep->size - BYTES_PER_WORD);
-}
-
-#else
-
-#define obj_offset(x)			0
-#define dbg_redzone1(cachep, objp)	({BUG(); (unsigned long long *)NULL;})
-#define dbg_redzone2(cachep, objp)	({BUG(); (unsigned long long *)NULL;})
-#define dbg_userword(cachep, objp)	({BUG(); (void **)NULL;})
-
-#endif
-
-/*
- * Do not go above this order unless 0 objects fit into the slab or
- * overridden on the command line.
- */
-#define	SLAB_MAX_ORDER_HI	1
-#define	SLAB_MAX_ORDER_LO	0
-static int slab_max_order = SLAB_MAX_ORDER_LO;
-static bool slab_max_order_set __initdata;
-
-static inline void *index_to_obj(struct kmem_cache *cache,
-				 const struct slab *slab, unsigned int idx)
-{
-	return slab->s_mem + cache->size * idx;
-}
-
-#define BOOT_CPUCACHE_ENTRIES	1
-/* internal cache of cache description objs */
-static struct kmem_cache kmem_cache_boot = {
-	.batchcount = 1,
-	.limit = BOOT_CPUCACHE_ENTRIES,
-	.shared = 1,
-	.size = sizeof(struct kmem_cache),
-	.name = "kmem_cache",
-};
-
-static DEFINE_PER_CPU(struct delayed_work, slab_reap_work);
-
-static inline struct array_cache *cpu_cache_get(struct kmem_cache *cachep)
-{
-	return this_cpu_ptr(cachep->cpu_cache);
-}
-
-/*
- * Calculate the number of objects and left-over bytes for a given buffer size.
- */
-static unsigned int cache_estimate(unsigned long gfporder, size_t buffer_size,
-		slab_flags_t flags, size_t *left_over)
-{
-	unsigned int num;
-	size_t slab_size = PAGE_SIZE << gfporder;
-
-	/*
-	 * The slab management structure can be either off the slab or
-	 * on it. For the latter case, the memory allocated for a
-	 * slab is used for:
-	 *
-	 * - @buffer_size bytes for each object
-	 * - One freelist_idx_t for each object
-	 *
-	 * We don't need to consider alignment of freelist because
-	 * freelist will be at the end of slab page. The objects will be
-	 * at the correct alignment.
-	 *
-	 * If the slab management structure is off the slab, then the
-	 * alignment will already be calculated into the size. Because
-	 * the slabs are all pages aligned, the objects will be at the
-	 * correct alignment when allocated.
-	 */
-	if (flags & (CFLGS_OBJFREELIST_SLAB | CFLGS_OFF_SLAB)) {
-		num = slab_size / buffer_size;
-		*left_over = slab_size % buffer_size;
-	} else {
-		num = slab_size / (buffer_size + sizeof(freelist_idx_t));
-		*left_over = slab_size %
-			(buffer_size + sizeof(freelist_idx_t));
-	}
-
-	return num;
-}
-
-#if DEBUG
-#define slab_error(cachep, msg) __slab_error(__func__, cachep, msg)
-
-static void __slab_error(const char *function, struct kmem_cache *cachep,
-			char *msg)
-{
-	pr_err("slab error in %s(): cache `%s': %s\n",
-	       function, cachep->name, msg);
-	dump_stack();
-	add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE);
-}
-#endif
-
-/*
- * By default on NUMA we use alien caches to stage the freeing of
- * objects allocated from other nodes. This causes massive memory
- * inefficiencies when using fake NUMA setup to split memory into a
- * large number of small nodes, so it can be disabled on the command
- * line
-  */
-
-static int use_alien_caches __read_mostly = 1;
-static int __init noaliencache_setup(char *s)
-{
-	use_alien_caches = 0;
-	return 1;
-}
-__setup("noaliencache", noaliencache_setup);
-
-static int __init slab_max_order_setup(char *str)
-{
-	get_option(&str, &slab_max_order);
-	slab_max_order = slab_max_order < 0 ? 0 :
-				min(slab_max_order, MAX_ORDER);
-	slab_max_order_set = true;
-
-	return 1;
-}
-__setup("slab_max_order=", slab_max_order_setup);
-
-#ifdef CONFIG_NUMA
-/*
- * Special reaping functions for NUMA systems called from cache_reap().
- * These take care of doing round robin flushing of alien caches (containing
- * objects freed on different nodes from which they were allocated) and the
- * flushing of remote pcps by calling drain_node_pages.
- */
-static DEFINE_PER_CPU(unsigned long, slab_reap_node);
-
-static void init_reap_node(int cpu)
-{
-	per_cpu(slab_reap_node, cpu) = next_node_in(cpu_to_mem(cpu),
-						    node_online_map);
-}
-
-static void next_reap_node(void)
-{
-	int node = __this_cpu_read(slab_reap_node);
-
-	node = next_node_in(node, node_online_map);
-	__this_cpu_write(slab_reap_node, node);
-}
-
-#else
-#define init_reap_node(cpu) do { } while (0)
-#define next_reap_node(void) do { } while (0)
-#endif
-
-/*
- * Initiate the reap timer running on the target CPU.  We run at around 1 to 2Hz
- * via the workqueue/eventd.
- * Add the CPU number into the expiration time to minimize the possibility of
- * the CPUs getting into lockstep and contending for the global cache chain
- * lock.
- */
-static void start_cpu_timer(int cpu)
-{
-	struct delayed_work *reap_work = &per_cpu(slab_reap_work, cpu);
-
-	if (reap_work->work.func == NULL) {
-		init_reap_node(cpu);
-		INIT_DEFERRABLE_WORK(reap_work, cache_reap);
-		schedule_delayed_work_on(cpu, reap_work,
-					__round_jiffies_relative(HZ, cpu));
-	}
-}
-
-static void init_arraycache(struct array_cache *ac, int limit, int batch)
-{
-	if (ac) {
-		ac->avail = 0;
-		ac->limit = limit;
-		ac->batchcount = batch;
-		ac->touched = 0;
-	}
-}
-
-static struct array_cache *alloc_arraycache(int node, int entries,
-					    int batchcount, gfp_t gfp)
-{
-	size_t memsize = sizeof(void *) * entries + sizeof(struct array_cache);
-	struct array_cache *ac = NULL;
-
-	ac = kmalloc_node(memsize, gfp, node);
-	/*
-	 * The array_cache structures contain pointers to free object.
-	 * However, when such objects are allocated or transferred to another
-	 * cache the pointers are not cleared and they could be counted as
-	 * valid references during a kmemleak scan. Therefore, kmemleak must
-	 * not scan such objects.
-	 */
-	kmemleak_no_scan(ac);
-	init_arraycache(ac, entries, batchcount);
-	return ac;
-}
-
-static noinline void cache_free_pfmemalloc(struct kmem_cache *cachep,
-					struct slab *slab, void *objp)
-{
-	struct kmem_cache_node *n;
-	int slab_node;
-	LIST_HEAD(list);
-
-	slab_node = slab_nid(slab);
-	n = get_node(cachep, slab_node);
-
-	raw_spin_lock(&n->list_lock);
-	free_block(cachep, &objp, 1, slab_node, &list);
-	raw_spin_unlock(&n->list_lock);
-
-	slabs_destroy(cachep, &list);
-}
-
-/*
- * Transfer objects in one arraycache to another.
- * Locking must be handled by the caller.
- *
- * Return the number of entries transferred.
- */
-static int transfer_objects(struct array_cache *to,
-		struct array_cache *from, unsigned int max)
-{
-	/* Figure out how many entries to transfer */
-	int nr = min3(from->avail, max, to->limit - to->avail);
-
-	if (!nr)
-		return 0;
-
-	memcpy(to->entry + to->avail, from->entry + from->avail - nr,
-			sizeof(void *) *nr);
-
-	from->avail -= nr;
-	to->avail += nr;
-	return nr;
-}
-
-/* &alien->lock must be held by alien callers. */
-static __always_inline void __free_one(struct array_cache *ac, void *objp)
-{
-	/* Avoid trivial double-free. */
-	if (IS_ENABLED(CONFIG_SLAB_FREELIST_HARDENED) &&
-	    WARN_ON_ONCE(ac->avail > 0 && ac->entry[ac->avail - 1] == objp))
-		return;
-	ac->entry[ac->avail++] = objp;
-}
-
-#ifndef CONFIG_NUMA
-
-#define drain_alien_cache(cachep, alien) do { } while (0)
-#define reap_alien(cachep, n) do { } while (0)
-
-static inline struct alien_cache **alloc_alien_cache(int node,
-						int limit, gfp_t gfp)
-{
-	return NULL;
-}
-
-static inline void free_alien_cache(struct alien_cache **ac_ptr)
-{
-}
-
-static inline int cache_free_alien(struct kmem_cache *cachep, void *objp)
-{
-	return 0;
-}
-
-static inline gfp_t gfp_exact_node(gfp_t flags)
-{
-	return flags & ~__GFP_NOFAIL;
-}
-
-#else	/* CONFIG_NUMA */
-
-static struct alien_cache *__alloc_alien_cache(int node, int entries,
-						int batch, gfp_t gfp)
-{
-	size_t memsize = sizeof(void *) * entries + sizeof(struct alien_cache);
-	struct alien_cache *alc = NULL;
-
-	alc = kmalloc_node(memsize, gfp, node);
-	if (alc) {
-		kmemleak_no_scan(alc);
-		init_arraycache(&alc->ac, entries, batch);
-		spin_lock_init(&alc->lock);
-	}
-	return alc;
-}
-
-static struct alien_cache **alloc_alien_cache(int node, int limit, gfp_t gfp)
-{
-	struct alien_cache **alc_ptr;
-	int i;
-
-	if (limit > 1)
-		limit = 12;
-	alc_ptr = kcalloc_node(nr_node_ids, sizeof(void *), gfp, node);
-	if (!alc_ptr)
-		return NULL;
-
-	for_each_node(i) {
-		if (i == node || !node_online(i))
-			continue;
-		alc_ptr[i] = __alloc_alien_cache(node, limit, 0xbaadf00d, gfp);
-		if (!alc_ptr[i]) {
-			for (i--; i >= 0; i--)
-				kfree(alc_ptr[i]);
-			kfree(alc_ptr);
-			return NULL;
-		}
-	}
-	return alc_ptr;
-}
-
-static void free_alien_cache(struct alien_cache **alc_ptr)
-{
-	int i;
-
-	if (!alc_ptr)
-		return;
-	for_each_node(i)
-	    kfree(alc_ptr[i]);
-	kfree(alc_ptr);
-}
-
-static void __drain_alien_cache(struct kmem_cache *cachep,
-				struct array_cache *ac, int node,
-				struct list_head *list)
-{
-	struct kmem_cache_node *n = get_node(cachep, node);
-
-	if (ac->avail) {
-		raw_spin_lock(&n->list_lock);
-		/*
-		 * Stuff objects into the remote nodes shared array first.
-		 * That way we could avoid the overhead of putting the objects
-		 * into the free lists and getting them back later.
-		 */
-		if (n->shared)
-			transfer_objects(n->shared, ac, ac->limit);
-
-		free_block(cachep, ac->entry, ac->avail, node, list);
-		ac->avail = 0;
-		raw_spin_unlock(&n->list_lock);
-	}
-}
-
-/*
- * Called from cache_reap() to regularly drain alien caches round robin.
- */
-static void reap_alien(struct kmem_cache *cachep, struct kmem_cache_node *n)
-{
-	int node = __this_cpu_read(slab_reap_node);
-
-	if (n->alien) {
-		struct alien_cache *alc = n->alien[node];
-		struct array_cache *ac;
-
-		if (alc) {
-			ac = &alc->ac;
-			if (ac->avail && spin_trylock_irq(&alc->lock)) {
-				LIST_HEAD(list);
-
-				__drain_alien_cache(cachep, ac, node, &list);
-				spin_unlock_irq(&alc->lock);
-				slabs_destroy(cachep, &list);
-			}
-		}
-	}
-}
-
-static void drain_alien_cache(struct kmem_cache *cachep,
-				struct alien_cache **alien)
-{
-	int i = 0;
-	struct alien_cache *alc;
-	struct array_cache *ac;
-	unsigned long flags;
-
-	for_each_online_node(i) {
-		alc = alien[i];
-		if (alc) {
-			LIST_HEAD(list);
-
-			ac = &alc->ac;
-			spin_lock_irqsave(&alc->lock, flags);
-			__drain_alien_cache(cachep, ac, i, &list);
-			spin_unlock_irqrestore(&alc->lock, flags);
-			slabs_destroy(cachep, &list);
-		}
-	}
-}
-
-static int __cache_free_alien(struct kmem_cache *cachep, void *objp,
-				int node, int slab_node)
-{
-	struct kmem_cache_node *n;
-	struct alien_cache *alien = NULL;
-	struct array_cache *ac;
-	LIST_HEAD(list);
-
-	n = get_node(cachep, node);
-	STATS_INC_NODEFREES(cachep);
-	if (n->alien && n->alien[slab_node]) {
-		alien = n->alien[slab_node];
-		ac = &alien->ac;
-		spin_lock(&alien->lock);
-		if (unlikely(ac->avail == ac->limit)) {
-			STATS_INC_ACOVERFLOW(cachep);
-			__drain_alien_cache(cachep, ac, slab_node, &list);
-		}
-		__free_one(ac, objp);
-		spin_unlock(&alien->lock);
-		slabs_destroy(cachep, &list);
-	} else {
-		n = get_node(cachep, slab_node);
-		raw_spin_lock(&n->list_lock);
-		free_block(cachep, &objp, 1, slab_node, &list);
-		raw_spin_unlock(&n->list_lock);
-		slabs_destroy(cachep, &list);
-	}
-	return 1;
-}
-
-static inline int cache_free_alien(struct kmem_cache *cachep, void *objp)
-{
-	int slab_node = slab_nid(virt_to_slab(objp));
-	int node = numa_mem_id();
-	/*
-	 * Make sure we are not freeing an object from another node to the array
-	 * cache on this cpu.
-	 */
-	if (likely(node == slab_node))
-		return 0;
-
-	return __cache_free_alien(cachep, objp, node, slab_node);
-}
-
-/*
- * Construct gfp mask to allocate from a specific node but do not reclaim or
- * warn about failures.
- */
-static inline gfp_t gfp_exact_node(gfp_t flags)
-{
-	return (flags | __GFP_THISNODE | __GFP_NOWARN) & ~(__GFP_RECLAIM|__GFP_NOFAIL);
-}
-#endif
-
-static int init_cache_node(struct kmem_cache *cachep, int node, gfp_t gfp)
-{
-	struct kmem_cache_node *n;
-
-	/*
-	 * Set up the kmem_cache_node for cpu before we can
-	 * begin anything. Make sure some other cpu on this
-	 * node has not already allocated this
-	 */
-	n = get_node(cachep, node);
-	if (n) {
-		raw_spin_lock_irq(&n->list_lock);
-		n->free_limit = (1 + nr_cpus_node(node)) * cachep->batchcount +
-				cachep->num;
-		raw_spin_unlock_irq(&n->list_lock);
-
-		return 0;
-	}
-
-	n = kmalloc_node(sizeof(struct kmem_cache_node), gfp, node);
-	if (!n)
-		return -ENOMEM;
-
-	kmem_cache_node_init(n);
-	n->next_reap = jiffies + REAPTIMEOUT_NODE +
-		    ((unsigned long)cachep) % REAPTIMEOUT_NODE;
-
-	n->free_limit =
-		(1 + nr_cpus_node(node)) * cachep->batchcount + cachep->num;
-
-	/*
-	 * The kmem_cache_nodes don't come and go as CPUs
-	 * come and go.  slab_mutex provides sufficient
-	 * protection here.
-	 */
-	cachep->node[node] = n;
-
-	return 0;
-}
-
-#if defined(CONFIG_NUMA) || defined(CONFIG_SMP)
-/*
- * Allocates and initializes node for a node on each slab cache, used for
- * either memory or cpu hotplug.  If memory is being hot-added, the kmem_cache_node
- * will be allocated off-node since memory is not yet online for the new node.
- * When hotplugging memory or a cpu, existing nodes are not replaced if
- * already in use.
- *
- * Must hold slab_mutex.
- */
-static int init_cache_node_node(int node)
-{
-	int ret;
-	struct kmem_cache *cachep;
-
-	list_for_each_entry(cachep, &slab_caches, list) {
-		ret = init_cache_node(cachep, node, GFP_KERNEL);
-		if (ret)
-			return ret;
-	}
-
-	return 0;
-}
-#endif
-
-static int setup_kmem_cache_node(struct kmem_cache *cachep,
-				int node, gfp_t gfp, bool force_change)
-{
-	int ret = -ENOMEM;
-	struct kmem_cache_node *n;
-	struct array_cache *old_shared = NULL;
-	struct array_cache *new_shared = NULL;
-	struct alien_cache **new_alien = NULL;
-	LIST_HEAD(list);
-
-	if (use_alien_caches) {
-		new_alien = alloc_alien_cache(node, cachep->limit, gfp);
-		if (!new_alien)
-			goto fail;
-	}
-
-	if (cachep->shared) {
-		new_shared = alloc_arraycache(node,
-			cachep->shared * cachep->batchcount, 0xbaadf00d, gfp);
-		if (!new_shared)
-			goto fail;
-	}
-
-	ret = init_cache_node(cachep, node, gfp);
-	if (ret)
-		goto fail;
-
-	n = get_node(cachep, node);
-	raw_spin_lock_irq(&n->list_lock);
-	if (n->shared && force_change) {
-		free_block(cachep, n->shared->entry,
-				n->shared->avail, node, &list);
-		n->shared->avail = 0;
-	}
-
-	if (!n->shared || force_change) {
-		old_shared = n->shared;
-		n->shared = new_shared;
-		new_shared = NULL;
-	}
-
-	if (!n->alien) {
-		n->alien = new_alien;
-		new_alien = NULL;
-	}
-
-	raw_spin_unlock_irq(&n->list_lock);
-	slabs_destroy(cachep, &list);
-
-	/*
-	 * To protect lockless access to n->shared during irq disabled context.
-	 * If n->shared isn't NULL in irq disabled context, accessing to it is
-	 * guaranteed to be valid until irq is re-enabled, because it will be
-	 * freed after synchronize_rcu().
-	 */
-	if (old_shared && force_change)
-		synchronize_rcu();
-
-fail:
-	kfree(old_shared);
-	kfree(new_shared);
-	free_alien_cache(new_alien);
-
-	return ret;
-}
-
-#ifdef CONFIG_SMP
-
-static void cpuup_canceled(long cpu)
-{
-	struct kmem_cache *cachep;
-	struct kmem_cache_node *n = NULL;
-	int node = cpu_to_mem(cpu);
-	const struct cpumask *mask = cpumask_of_node(node);
-
-	list_for_each_entry(cachep, &slab_caches, list) {
-		struct array_cache *nc;
-		struct array_cache *shared;
-		struct alien_cache **alien;
-		LIST_HEAD(list);
-
-		n = get_node(cachep, node);
-		if (!n)
-			continue;
-
-		raw_spin_lock_irq(&n->list_lock);
-
-		/* Free limit for this kmem_cache_node */
-		n->free_limit -= cachep->batchcount;
-
-		/* cpu is dead; no one can alloc from it. */
-		nc = per_cpu_ptr(cachep->cpu_cache, cpu);
-		free_block(cachep, nc->entry, nc->avail, node, &list);
-		nc->avail = 0;
-
-		if (!cpumask_empty(mask)) {
-			raw_spin_unlock_irq(&n->list_lock);
-			goto free_slab;
-		}
-
-		shared = n->shared;
-		if (shared) {
-			free_block(cachep, shared->entry,
-				   shared->avail, node, &list);
-			n->shared = NULL;
-		}
-
-		alien = n->alien;
-		n->alien = NULL;
-
-		raw_spin_unlock_irq(&n->list_lock);
-
-		kfree(shared);
-		if (alien) {
-			drain_alien_cache(cachep, alien);
-			free_alien_cache(alien);
-		}
-
-free_slab:
-		slabs_destroy(cachep, &list);
-	}
-	/*
-	 * In the previous loop, all the objects were freed to
-	 * the respective cache's slabs,  now we can go ahead and
-	 * shrink each nodelist to its limit.
-	 */
-	list_for_each_entry(cachep, &slab_caches, list) {
-		n = get_node(cachep, node);
-		if (!n)
-			continue;
-		drain_freelist(cachep, n, INT_MAX);
-	}
-}
-
-static int cpuup_prepare(long cpu)
-{
-	struct kmem_cache *cachep;
-	int node = cpu_to_mem(cpu);
-	int err;
-
-	/*
-	 * We need to do this right in the beginning since
-	 * alloc_arraycache's are going to use this list.
-	 * kmalloc_node allows us to add the slab to the right
-	 * kmem_cache_node and not this cpu's kmem_cache_node
-	 */
-	err = init_cache_node_node(node);
-	if (err < 0)
-		goto bad;
-
-	/*
-	 * Now we can go ahead with allocating the shared arrays and
-	 * array caches
-	 */
-	list_for_each_entry(cachep, &slab_caches, list) {
-		err = setup_kmem_cache_node(cachep, node, GFP_KERNEL, false);
-		if (err)
-			goto bad;
-	}
-
-	return 0;
-bad:
-	cpuup_canceled(cpu);
-	return -ENOMEM;
-}
-
-int slab_prepare_cpu(unsigned int cpu)
-{
-	int err;
-
-	mutex_lock(&slab_mutex);
-	err = cpuup_prepare(cpu);
-	mutex_unlock(&slab_mutex);
-	return err;
-}
-
-/*
- * This is called for a failed online attempt and for a successful
- * offline.
- *
- * Even if all the cpus of a node are down, we don't free the
- * kmem_cache_node of any cache. This is to avoid a race between cpu_down, and
- * a kmalloc allocation from another cpu for memory from the node of
- * the cpu going down.  The kmem_cache_node structure is usually allocated from
- * kmem_cache_create() and gets destroyed at kmem_cache_destroy().
- */
-int slab_dead_cpu(unsigned int cpu)
-{
-	mutex_lock(&slab_mutex);
-	cpuup_canceled(cpu);
-	mutex_unlock(&slab_mutex);
-	return 0;
-}
-#endif
-
-static int slab_online_cpu(unsigned int cpu)
-{
-	start_cpu_timer(cpu);
-	return 0;
-}
-
-static int slab_offline_cpu(unsigned int cpu)
-{
-	/*
-	 * Shutdown cache reaper. Note that the slab_mutex is held so
-	 * that if cache_reap() is invoked it cannot do anything
-	 * expensive but will only modify reap_work and reschedule the
-	 * timer.
-	 */
-	cancel_delayed_work_sync(&per_cpu(slab_reap_work, cpu));
-	/* Now the cache_reaper is guaranteed to be not running. */
-	per_cpu(slab_reap_work, cpu).work.func = NULL;
-	return 0;
-}
-
-#if defined(CONFIG_NUMA)
-/*
- * Drains freelist for a node on each slab cache, used for memory hot-remove.
- * Returns -EBUSY if all objects cannot be drained so that the node is not
- * removed.
- *
- * Must hold slab_mutex.
- */
-static int __meminit drain_cache_node_node(int node)
-{
-	struct kmem_cache *cachep;
-	int ret = 0;
-
-	list_for_each_entry(cachep, &slab_caches, list) {
-		struct kmem_cache_node *n;
-
-		n = get_node(cachep, node);
-		if (!n)
-			continue;
-
-		drain_freelist(cachep, n, INT_MAX);
-
-		if (!list_empty(&n->slabs_full) ||
-		    !list_empty(&n->slabs_partial)) {
-			ret = -EBUSY;
-			break;
-		}
-	}
-	return ret;
-}
-
-static int __meminit slab_memory_callback(struct notifier_block *self,
-					unsigned long action, void *arg)
-{
-	struct memory_notify *mnb = arg;
-	int ret = 0;
-	int nid;
-
-	nid = mnb->status_change_nid;
-	if (nid < 0)
-		goto out;
-
-	switch (action) {
-	case MEM_GOING_ONLINE:
-		mutex_lock(&slab_mutex);
-		ret = init_cache_node_node(nid);
-		mutex_unlock(&slab_mutex);
-		break;
-	case MEM_GOING_OFFLINE:
-		mutex_lock(&slab_mutex);
-		ret = drain_cache_node_node(nid);
-		mutex_unlock(&slab_mutex);
-		break;
-	case MEM_ONLINE:
-	case MEM_OFFLINE:
-	case MEM_CANCEL_ONLINE:
-	case MEM_CANCEL_OFFLINE:
-		break;
-	}
-out:
-	return notifier_from_errno(ret);
-}
-#endif /* CONFIG_NUMA */
-
-/*
- * swap the static kmem_cache_node with kmalloced memory
- */
-static void __init init_list(struct kmem_cache *cachep, struct kmem_cache_node *list,
-				int nodeid)
-{
-	struct kmem_cache_node *ptr;
-
-	ptr = kmalloc_node(sizeof(struct kmem_cache_node), GFP_NOWAIT, nodeid);
-	BUG_ON(!ptr);
-
-	memcpy(ptr, list, sizeof(struct kmem_cache_node));
-	/*
-	 * Do not assume that spinlocks can be initialized via memcpy:
-	 */
-	raw_spin_lock_init(&ptr->list_lock);
-
-	MAKE_ALL_LISTS(cachep, ptr, nodeid);
-	cachep->node[nodeid] = ptr;
-}
-
-/*
- * For setting up all the kmem_cache_node for cache whose buffer_size is same as
- * size of kmem_cache_node.
- */
-static void __init set_up_node(struct kmem_cache *cachep, int index)
-{
-	int node;
-
-	for_each_online_node(node) {
-		cachep->node[node] = &init_kmem_cache_node[index + node];
-		cachep->node[node]->next_reap = jiffies +
-		    REAPTIMEOUT_NODE +
-		    ((unsigned long)cachep) % REAPTIMEOUT_NODE;
-	}
-}
-
-/*
- * Initialisation.  Called after the page allocator have been initialised and
- * before smp_init().
- */
-void __init kmem_cache_init(void)
-{
-	int i;
-
-	kmem_cache = &kmem_cache_boot;
-
-	if (!IS_ENABLED(CONFIG_NUMA) || num_possible_nodes() == 1)
-		use_alien_caches = 0;
-
-	for (i = 0; i < NUM_INIT_LISTS; i++)
-		kmem_cache_node_init(&init_kmem_cache_node[i]);
-
-	/*
-	 * Fragmentation resistance on low memory - only use bigger
-	 * page orders on machines with more than 32MB of memory if
-	 * not overridden on the command line.
-	 */
-	if (!slab_max_order_set && totalram_pages() > (32 << 20) >> PAGE_SHIFT)
-		slab_max_order = SLAB_MAX_ORDER_HI;
-
-	/* Bootstrap is tricky, because several objects are allocated
-	 * from caches that do not exist yet:
-	 * 1) initialize the kmem_cache cache: it contains the struct
-	 *    kmem_cache structures of all caches, except kmem_cache itself:
-	 *    kmem_cache is statically allocated.
-	 *    Initially an __init data area is used for the head array and the
-	 *    kmem_cache_node structures, it's replaced with a kmalloc allocated
-	 *    array at the end of the bootstrap.
-	 * 2) Create the first kmalloc cache.
-	 *    The struct kmem_cache for the new cache is allocated normally.
-	 *    An __init data area is used for the head array.
-	 * 3) Create the remaining kmalloc caches, with minimally sized
-	 *    head arrays.
-	 * 4) Replace the __init data head arrays for kmem_cache and the first
-	 *    kmalloc cache with kmalloc allocated arrays.
-	 * 5) Replace the __init data for kmem_cache_node for kmem_cache and
-	 *    the other cache's with kmalloc allocated memory.
-	 * 6) Resize the head arrays of the kmalloc caches to their final sizes.
-	 */
-
-	/* 1) create the kmem_cache */
-
-	/*
-	 * struct kmem_cache size depends on nr_node_ids & nr_cpu_ids
-	 */
-	create_boot_cache(kmem_cache, "kmem_cache",
-		offsetof(struct kmem_cache, node) +
-				  nr_node_ids * sizeof(struct kmem_cache_node *),
-				  SLAB_HWCACHE_ALIGN, 0, 0);
-	list_add(&kmem_cache->list, &slab_caches);
-	slab_state = PARTIAL;
-
-	/*
-	 * Initialize the caches that provide memory for the  kmem_cache_node
-	 * structures first.  Without this, further allocations will bug.
-	 */
-	new_kmalloc_cache(INDEX_NODE, KMALLOC_NORMAL, ARCH_KMALLOC_FLAGS);
-	slab_state = PARTIAL_NODE;
-	setup_kmalloc_cache_index_table();
-
-	/* 5) Replace the bootstrap kmem_cache_node */
-	{
-		int nid;
-
-		for_each_online_node(nid) {
-			init_list(kmem_cache, &init_kmem_cache_node[CACHE_CACHE + nid], nid);
-
-			init_list(kmalloc_caches[KMALLOC_NORMAL][INDEX_NODE],
-					  &init_kmem_cache_node[SIZE_NODE + nid], nid);
-		}
-	}
-
-	create_kmalloc_caches(ARCH_KMALLOC_FLAGS);
-}
-
-void __init kmem_cache_init_late(void)
-{
-	struct kmem_cache *cachep;
-
-	/* 6) resize the head arrays to their final sizes */
-	mutex_lock(&slab_mutex);
-	list_for_each_entry(cachep, &slab_caches, list)
-		if (enable_cpucache(cachep, GFP_NOWAIT))
-			BUG();
-	mutex_unlock(&slab_mutex);
-
-	/* Done! */
-	slab_state = FULL;
-
-#ifdef CONFIG_NUMA
-	/*
-	 * Register a memory hotplug callback that initializes and frees
-	 * node.
-	 */
-	hotplug_memory_notifier(slab_memory_callback, SLAB_CALLBACK_PRI);
-#endif
-
-	/*
-	 * The reap timers are started later, with a module init call: That part
-	 * of the kernel is not yet operational.
-	 */
-}
-
-static int __init cpucache_init(void)
-{
-	int ret;
-
-	/*
-	 * Register the timers that return unneeded pages to the page allocator
-	 */
-	ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "SLAB online",
-				slab_online_cpu, slab_offline_cpu);
-	WARN_ON(ret < 0);
-
-	return 0;
-}
-__initcall(cpucache_init);
-
-static noinline void
-slab_out_of_memory(struct kmem_cache *cachep, gfp_t gfpflags, int nodeid)
-{
-#if DEBUG
-	struct kmem_cache_node *n;
-	unsigned long flags;
-	int node;
-	static DEFINE_RATELIMIT_STATE(slab_oom_rs, DEFAULT_RATELIMIT_INTERVAL,
-				      DEFAULT_RATELIMIT_BURST);
-
-	if ((gfpflags & __GFP_NOWARN) || !__ratelimit(&slab_oom_rs))
-		return;
-
-	pr_warn("SLAB: Unable to allocate memory on node %d, gfp=%#x(%pGg)\n",
-		nodeid, gfpflags, &gfpflags);
-	pr_warn("  cache: %s, object size: %d, order: %d\n",
-		cachep->name, cachep->size, cachep->gfporder);
-
-	for_each_kmem_cache_node(cachep, node, n) {
-		unsigned long total_slabs, free_slabs, free_objs;
-
-		raw_spin_lock_irqsave(&n->list_lock, flags);
-		total_slabs = n->total_slabs;
-		free_slabs = n->free_slabs;
-		free_objs = n->free_objects;
-		raw_spin_unlock_irqrestore(&n->list_lock, flags);
-
-		pr_warn("  node %d: slabs: %ld/%ld, objs: %ld/%ld\n",
-			node, total_slabs - free_slabs, total_slabs,
-			(total_slabs * cachep->num) - free_objs,
-			total_slabs * cachep->num);
-	}
-#endif
-}
-
-/*
- * Interface to system's page allocator. No need to hold the
- * kmem_cache_node ->list_lock.
- *
- * If we requested dmaable memory, we will get it. Even if we
- * did not request dmaable memory, we might get it, but that
- * would be relatively rare and ignorable.
- */
-static struct slab *kmem_getpages(struct kmem_cache *cachep, gfp_t flags,
-								int nodeid)
-{
-	struct folio *folio;
-	struct slab *slab;
-
-	flags |= cachep->allocflags;
-
-	folio = (struct folio *) __alloc_pages_node(nodeid, flags, cachep->gfporder);
-	if (!folio) {
-		slab_out_of_memory(cachep, flags, nodeid);
-		return NULL;
-	}
-
-	slab = folio_slab(folio);
-
-	account_slab(slab, cachep->gfporder, cachep, flags);
-	__folio_set_slab(folio);
-	/* Make the flag visible before any changes to folio->mapping */
-	smp_wmb();
-	/* Record if ALLOC_NO_WATERMARKS was set when allocating the slab */
-	if (sk_memalloc_socks() && folio_is_pfmemalloc(folio))
-		slab_set_pfmemalloc(slab);
-
-	return slab;
-}
-
-/*
- * Interface to system's page release.
- */
-static void kmem_freepages(struct kmem_cache *cachep, struct slab *slab)
-{
-	int order = cachep->gfporder;
-	struct folio *folio = slab_folio(slab);
-
-	BUG_ON(!folio_test_slab(folio));
-	__slab_clear_pfmemalloc(slab);
-	page_mapcount_reset(&folio->page);
-	folio->mapping = NULL;
-	/* Make the mapping reset visible before clearing the flag */
-	smp_wmb();
-	__folio_clear_slab(folio);
-
-	mm_account_reclaimed_pages(1 << order);
-	unaccount_slab(slab, order, cachep);
-	__free_pages(&folio->page, order);
-}
-
-static void kmem_rcu_free(struct rcu_head *head)
-{
-	struct kmem_cache *cachep;
-	struct slab *slab;
-
-	slab = container_of(head, struct slab, rcu_head);
-	cachep = slab->slab_cache;
-
-	kmem_freepages(cachep, slab);
-}
-
-#if DEBUG
-static inline bool is_debug_pagealloc_cache(struct kmem_cache *cachep)
-{
-	return debug_pagealloc_enabled_static() && OFF_SLAB(cachep) &&
-			((cachep->size % PAGE_SIZE) == 0);
-}
-
-#ifdef CONFIG_DEBUG_PAGEALLOC
-static void slab_kernel_map(struct kmem_cache *cachep, void *objp, int map)
-{
-	if (!is_debug_pagealloc_cache(cachep))
-		return;
-
-	__kernel_map_pages(virt_to_page(objp), cachep->size / PAGE_SIZE, map);
-}
-
-#else
-static inline void slab_kernel_map(struct kmem_cache *cachep, void *objp,
-				int map) {}
-
-#endif
-
-static void poison_obj(struct kmem_cache *cachep, void *addr, unsigned char val)
-{
-	int size = cachep->object_size;
-	addr = &((char *)addr)[obj_offset(cachep)];
-
-	memset(addr, val, size);
-	*(unsigned char *)(addr + size - 1) = POISON_END;
-}
-
-static void dump_line(char *data, int offset, int limit)
-{
-	int i;
-	unsigned char error = 0;
-	int bad_count = 0;
-
-	pr_err("%03x: ", offset);
-	for (i = 0; i < limit; i++) {
-		if (data[offset + i] != POISON_FREE) {
-			error = data[offset + i];
-			bad_count++;
-		}
-	}
-	print_hex_dump(KERN_CONT, "", 0, 16, 1,
-			&data[offset], limit, 1);
-
-	if (bad_count == 1) {
-		error ^= POISON_FREE;
-		if (!(error & (error - 1))) {
-			pr_err("Single bit error detected. Probably bad RAM.\n");
-#ifdef CONFIG_X86
-			pr_err("Run memtest86+ or a similar memory test tool.\n");
-#else
-			pr_err("Run a memory test tool.\n");
-#endif
-		}
-	}
-}
-#endif
-
-#if DEBUG
-
-static void print_objinfo(struct kmem_cache *cachep, void *objp, int lines)
-{
-	int i, size;
-	char *realobj;
-
-	if (cachep->flags & SLAB_RED_ZONE) {
-		pr_err("Redzone: 0x%llx/0x%llx\n",
-		       *dbg_redzone1(cachep, objp),
-		       *dbg_redzone2(cachep, objp));
-	}
-
-	if (cachep->flags & SLAB_STORE_USER)
-		pr_err("Last user: (%pSR)\n", *dbg_userword(cachep, objp));
-	realobj = (char *)objp + obj_offset(cachep);
-	size = cachep->object_size;
-	for (i = 0; i < size && lines; i += 16, lines--) {
-		int limit;
-		limit = 16;
-		if (i + limit > size)
-			limit = size - i;
-		dump_line(realobj, i, limit);
-	}
-}
-
-static void check_poison_obj(struct kmem_cache *cachep, void *objp)
-{
-	char *realobj;
-	int size, i;
-	int lines = 0;
-
-	if (is_debug_pagealloc_cache(cachep))
-		return;
-
-	realobj = (char *)objp + obj_offset(cachep);
-	size = cachep->object_size;
-
-	for (i = 0; i < size; i++) {
-		char exp = POISON_FREE;
-		if (i == size - 1)
-			exp = POISON_END;
-		if (realobj[i] != exp) {
-			int limit;
-			/* Mismatch ! */
-			/* Print header */
-			if (lines == 0) {
-				pr_err("Slab corruption (%s): %s start=%px, len=%d\n",
-				       print_tainted(), cachep->name,
-				       realobj, size);
-				print_objinfo(cachep, objp, 0);
-			}
-			/* Hexdump the affected line */
-			i = (i / 16) * 16;
-			limit = 16;
-			if (i + limit > size)
-				limit = size - i;
-			dump_line(realobj, i, limit);
-			i += 16;
-			lines++;
-			/* Limit to 5 lines */
-			if (lines > 5)
-				break;
-		}
-	}
-	if (lines != 0) {
-		/* Print some data about the neighboring objects, if they
-		 * exist:
-		 */
-		struct slab *slab = virt_to_slab(objp);
-		unsigned int objnr;
-
-		objnr = obj_to_index(cachep, slab, objp);
-		if (objnr) {
-			objp = index_to_obj(cachep, slab, objnr - 1);
-			realobj = (char *)objp + obj_offset(cachep);
-			pr_err("Prev obj: start=%px, len=%d\n", realobj, size);
-			print_objinfo(cachep, objp, 2);
-		}
-		if (objnr + 1 < cachep->num) {
-			objp = index_to_obj(cachep, slab, objnr + 1);
-			realobj = (char *)objp + obj_offset(cachep);
-			pr_err("Next obj: start=%px, len=%d\n", realobj, size);
-			print_objinfo(cachep, objp, 2);
-		}
-	}
-}
-#endif
-
-#if DEBUG
-static void slab_destroy_debugcheck(struct kmem_cache *cachep,
-						struct slab *slab)
-{
-	int i;
-
-	if (OBJFREELIST_SLAB(cachep) && cachep->flags & SLAB_POISON) {
-		poison_obj(cachep, slab->freelist - obj_offset(cachep),
-			POISON_FREE);
-	}
-
-	for (i = 0; i < cachep->num; i++) {
-		void *objp = index_to_obj(cachep, slab, i);
-
-		if (cachep->flags & SLAB_POISON) {
-			check_poison_obj(cachep, objp);
-			slab_kernel_map(cachep, objp, 1);
-		}
-		if (cachep->flags & SLAB_RED_ZONE) {
-			if (*dbg_redzone1(cachep, objp) != RED_INACTIVE)
-				slab_error(cachep, "start of a freed object was overwritten");
-			if (*dbg_redzone2(cachep, objp) != RED_INACTIVE)
-				slab_error(cachep, "end of a freed object was overwritten");
-		}
-	}
-}
-#else
-static void slab_destroy_debugcheck(struct kmem_cache *cachep,
-						struct slab *slab)
-{
-}
-#endif
-
-/**
- * slab_destroy - destroy and release all objects in a slab
- * @cachep: cache pointer being destroyed
- * @slab: slab being destroyed
- *
- * Destroy all the objs in a slab, and release the mem back to the system.
- * Before calling the slab must have been unlinked from the cache. The
- * kmem_cache_node ->list_lock is not held/needed.
- */
-static void slab_destroy(struct kmem_cache *cachep, struct slab *slab)
-{
-	void *freelist;
-
-	freelist = slab->freelist;
-	slab_destroy_debugcheck(cachep, slab);
-	if (unlikely(cachep->flags & SLAB_TYPESAFE_BY_RCU))
-		call_rcu(&slab->rcu_head, kmem_rcu_free);
-	else
-		kmem_freepages(cachep, slab);
-
-	/*
-	 * From now on, we don't use freelist
-	 * although actual page can be freed in rcu context
-	 */
-	if (OFF_SLAB(cachep))
-		kfree(freelist);
-}
-
-/*
- * Update the size of the caches before calling slabs_destroy as it may
- * recursively call kfree.
- */
-static void slabs_destroy(struct kmem_cache *cachep, struct list_head *list)
-{
-	struct slab *slab, *n;
-
-	list_for_each_entry_safe(slab, n, list, slab_list) {
-		list_del(&slab->slab_list);
-		slab_destroy(cachep, slab);
-	}
-}
-
-/**
- * calculate_slab_order - calculate size (page order) of slabs
- * @cachep: pointer to the cache that is being created
- * @size: size of objects to be created in this cache.
- * @flags: slab allocation flags
- *
- * Also calculates the number of objects per slab.
- *
- * This could be made much more intelligent.  For now, try to avoid using
- * high order pages for slabs.  When the gfp() functions are more friendly
- * towards high-order requests, this should be changed.
- *
- * Return: number of left-over bytes in a slab
- */
-static size_t calculate_slab_order(struct kmem_cache *cachep,
-				size_t size, slab_flags_t flags)
-{
-	size_t left_over = 0;
-	int gfporder;
-
-	for (gfporder = 0; gfporder <= KMALLOC_MAX_ORDER; gfporder++) {
-		unsigned int num;
-		size_t remainder;
-
-		num = cache_estimate(gfporder, size, flags, &remainder);
-		if (!num)
-			continue;
-
-		/* Can't handle number of objects more than SLAB_OBJ_MAX_NUM */
-		if (num > SLAB_OBJ_MAX_NUM)
-			break;
-
-		if (flags & CFLGS_OFF_SLAB) {
-			struct kmem_cache *freelist_cache;
-			size_t freelist_size;
-			size_t freelist_cache_size;
-
-			freelist_size = num * sizeof(freelist_idx_t);
-			if (freelist_size > KMALLOC_MAX_CACHE_SIZE) {
-				freelist_cache_size = PAGE_SIZE << get_order(freelist_size);
-			} else {
-				freelist_cache = kmalloc_slab(freelist_size, 0u, _RET_IP_);
-				if (!freelist_cache)
-					continue;
-				freelist_cache_size = freelist_cache->size;
-
-				/*
-				 * Needed to avoid possible looping condition
-				 * in cache_grow_begin()
-				 */
-				if (OFF_SLAB(freelist_cache))
-					continue;
-			}
-
-			/* check if off slab has enough benefit */
-			if (freelist_cache_size > cachep->size / 2)
-				continue;
-		}
-
-		/* Found something acceptable - save it away */
-		cachep->num = num;
-		cachep->gfporder = gfporder;
-		left_over = remainder;
-
-		/*
-		 * A VFS-reclaimable slab tends to have most allocations
-		 * as GFP_NOFS and we really don't want to have to be allocating
-		 * higher-order pages when we are unable to shrink dcache.
-		 */
-		if (flags & SLAB_RECLAIM_ACCOUNT)
-			break;
-
-		/*
-		 * Large number of objects is good, but very large slabs are
-		 * currently bad for the gfp()s.
-		 */
-		if (gfporder >= slab_max_order)
-			break;
-
-		/*
-		 * Acceptable internal fragmentation?
-		 */
-		if (left_over * 8 <= (PAGE_SIZE << gfporder))
-			break;
-	}
-	return left_over;
-}
-
-static struct array_cache __percpu *alloc_kmem_cache_cpus(
-		struct kmem_cache *cachep, int entries, int batchcount)
-{
-	int cpu;
-	size_t size;
-	struct array_cache __percpu *cpu_cache;
-
-	size = sizeof(void *) * entries + sizeof(struct array_cache);
-	cpu_cache = __alloc_percpu(size, sizeof(void *));
-
-	if (!cpu_cache)
-		return NULL;
-
-	for_each_possible_cpu(cpu) {
-		init_arraycache(per_cpu_ptr(cpu_cache, cpu),
-				entries, batchcount);
-	}
-
-	return cpu_cache;
-}
-
-static int __ref setup_cpu_cache(struct kmem_cache *cachep, gfp_t gfp)
-{
-	if (slab_state >= FULL)
-		return enable_cpucache(cachep, gfp);
-
-	cachep->cpu_cache = alloc_kmem_cache_cpus(cachep, 1, 1);
-	if (!cachep->cpu_cache)
-		return 1;
-
-	if (slab_state == DOWN) {
-		/* Creation of first cache (kmem_cache). */
-		set_up_node(kmem_cache, CACHE_CACHE);
-	} else if (slab_state == PARTIAL) {
-		/* For kmem_cache_node */
-		set_up_node(cachep, SIZE_NODE);
-	} else {
-		int node;
-
-		for_each_online_node(node) {
-			cachep->node[node] = kmalloc_node(
-				sizeof(struct kmem_cache_node), gfp, node);
-			BUG_ON(!cachep->node[node]);
-			kmem_cache_node_init(cachep->node[node]);
-		}
-	}
-
-	cachep->node[numa_mem_id()]->next_reap =
-			jiffies + REAPTIMEOUT_NODE +
-			((unsigned long)cachep) % REAPTIMEOUT_NODE;
-
-	cpu_cache_get(cachep)->avail = 0;
-	cpu_cache_get(cachep)->limit = BOOT_CPUCACHE_ENTRIES;
-	cpu_cache_get(cachep)->batchcount = 1;
-	cpu_cache_get(cachep)->touched = 0;
-	cachep->batchcount = 1;
-	cachep->limit = BOOT_CPUCACHE_ENTRIES;
-	return 0;
-}
-
-slab_flags_t kmem_cache_flags(unsigned int object_size,
-	slab_flags_t flags, const char *name)
-{
-	return flags;
-}
-
-struct kmem_cache *
-__kmem_cache_alias(const char *name, unsigned int size, unsigned int align,
-		   slab_flags_t flags, void (*ctor)(void *))
-{
-	struct kmem_cache *cachep;
-
-	cachep = find_mergeable(size, align, flags, name, ctor);
-	if (cachep) {
-		cachep->refcount++;
-
-		/*
-		 * Adjust the object sizes so that we clear
-		 * the complete object on kzalloc.
-		 */
-		cachep->object_size = max_t(int, cachep->object_size, size);
-	}
-	return cachep;
-}
-
-static bool set_objfreelist_slab_cache(struct kmem_cache *cachep,
-			size_t size, slab_flags_t flags)
-{
-	size_t left;
-
-	cachep->num = 0;
-
-	/*
-	 * If slab auto-initialization on free is enabled, store the freelist
-	 * off-slab, so that its contents don't end up in one of the allocated
-	 * objects.
-	 */
-	if (unlikely(slab_want_init_on_free(cachep)))
-		return false;
-
-	if (cachep->ctor || flags & SLAB_TYPESAFE_BY_RCU)
-		return false;
-
-	left = calculate_slab_order(cachep, size,
-			flags | CFLGS_OBJFREELIST_SLAB);
-	if (!cachep->num)
-		return false;
-
-	if (cachep->num * sizeof(freelist_idx_t) > cachep->object_size)
-		return false;
-
-	cachep->colour = left / cachep->colour_off;
-
-	return true;
-}
-
-static bool set_off_slab_cache(struct kmem_cache *cachep,
-			size_t size, slab_flags_t flags)
-{
-	size_t left;
-
-	cachep->num = 0;
-
-	/*
-	 * Always use on-slab management when SLAB_NOLEAKTRACE
-	 * to avoid recursive calls into kmemleak.
-	 */
-	if (flags & SLAB_NOLEAKTRACE)
-		return false;
-
-	/*
-	 * Size is large, assume best to place the slab management obj
-	 * off-slab (should allow better packing of objs).
-	 */
-	left = calculate_slab_order(cachep, size, flags | CFLGS_OFF_SLAB);
-	if (!cachep->num)
-		return false;
-
-	/*
-	 * If the slab has been placed off-slab, and we have enough space then
-	 * move it on-slab. This is at the expense of any extra colouring.
-	 */
-	if (left >= cachep->num * sizeof(freelist_idx_t))
-		return false;
-
-	cachep->colour = left / cachep->colour_off;
-
-	return true;
-}
-
-static bool set_on_slab_cache(struct kmem_cache *cachep,
-			size_t size, slab_flags_t flags)
-{
-	size_t left;
-
-	cachep->num = 0;
-
-	left = calculate_slab_order(cachep, size, flags);
-	if (!cachep->num)
-		return false;
-
-	cachep->colour = left / cachep->colour_off;
-
-	return true;
-}
-
-/*
- * __kmem_cache_create - Create a cache.
- * @cachep: cache management descriptor
- * @flags: SLAB flags
- *
- * Returns zero on success, nonzero on failure.
- *
- * The flags are
- *
- * %SLAB_POISON - Poison the slab with a known test pattern (a5a5a5a5)
- * to catch references to uninitialised memory.
- *
- * %SLAB_RED_ZONE - Insert `Red' zones around the allocated memory to check
- * for buffer overruns.
- *
- * %SLAB_HWCACHE_ALIGN - Align the objects in this cache to a hardware
- * cacheline.  This can be beneficial if you're counting cycles as closely
- * as davem.
- */
-int __kmem_cache_create(struct kmem_cache *cachep, slab_flags_t flags)
-{
-	size_t ralign = BYTES_PER_WORD;
-	gfp_t gfp;
-	int err;
-	unsigned int size = cachep->size;
-
-#if DEBUG
-#if FORCED_DEBUG
-	/*
-	 * Enable redzoning and last user accounting, except for caches with
-	 * large objects, if the increased size would increase the object size
-	 * above the next power of two: caches with object sizes just above a
-	 * power of two have a significant amount of internal fragmentation.
-	 */
-	if (size < 4096 || fls(size - 1) == fls(size-1 + REDZONE_ALIGN +
-						2 * sizeof(unsigned long long)))
-		flags |= SLAB_RED_ZONE | SLAB_STORE_USER;
-	if (!(flags & SLAB_TYPESAFE_BY_RCU))
-		flags |= SLAB_POISON;
-#endif
-#endif
-
-	/*
-	 * Check that size is in terms of words.  This is needed to avoid
-	 * unaligned accesses for some archs when redzoning is used, and makes
-	 * sure any on-slab bufctl's are also correctly aligned.
-	 */
-	size = ALIGN(size, BYTES_PER_WORD);
-
-	if (flags & SLAB_RED_ZONE) {
-		ralign = REDZONE_ALIGN;
-		/* If redzoning, ensure that the second redzone is suitably
-		 * aligned, by adjusting the object size accordingly. */
-		size = ALIGN(size, REDZONE_ALIGN);
-	}
-
-	/* 3) caller mandated alignment */
-	if (ralign < cachep->align) {
-		ralign = cachep->align;
-	}
-	/* disable debug if necessary */
-	if (ralign > __alignof__(unsigned long long))
-		flags &= ~(SLAB_RED_ZONE | SLAB_STORE_USER);
-	/*
-	 * 4) Store it.
-	 */
-	cachep->align = ralign;
-	cachep->colour_off = cache_line_size();
-	/* Offset must be a multiple of the alignment. */
-	if (cachep->colour_off < cachep->align)
-		cachep->colour_off = cachep->align;
-
-	if (slab_is_available())
-		gfp = GFP_KERNEL;
-	else
-		gfp = GFP_NOWAIT;
-
-#if DEBUG
-
-	/*
-	 * Both debugging options require word-alignment which is calculated
-	 * into align above.
-	 */
-	if (flags & SLAB_RED_ZONE) {
-		/* add space for red zone words */
-		cachep->obj_offset += sizeof(unsigned long long);
-		size += 2 * sizeof(unsigned long long);
-	}
-	if (flags & SLAB_STORE_USER) {
-		/* user store requires one word storage behind the end of
-		 * the real object. But if the second red zone needs to be
-		 * aligned to 64 bits, we must allow that much space.
-		 */
-		if (flags & SLAB_RED_ZONE)
-			size += REDZONE_ALIGN;
-		else
-			size += BYTES_PER_WORD;
-	}
-#endif
-
-	kasan_cache_create(cachep, &size, &flags);
-
-	size = ALIGN(size, cachep->align);
-	/*
-	 * We should restrict the number of objects in a slab to implement
-	 * byte sized index. Refer comment on SLAB_OBJ_MIN_SIZE definition.
-	 */
-	if (FREELIST_BYTE_INDEX && size < SLAB_OBJ_MIN_SIZE)
-		size = ALIGN(SLAB_OBJ_MIN_SIZE, cachep->align);
-
-#if DEBUG
-	/*
-	 * To activate debug pagealloc, off-slab management is necessary
-	 * requirement. In early phase of initialization, small sized slab
-	 * doesn't get initialized so it would not be possible. So, we need
-	 * to check size >= 256. It guarantees that all necessary small
-	 * sized slab is initialized in current slab initialization sequence.
-	 */
-	if (debug_pagealloc_enabled_static() && (flags & SLAB_POISON) &&
-		size >= 256 && cachep->object_size > cache_line_size()) {
-		if (size < PAGE_SIZE || size % PAGE_SIZE == 0) {
-			size_t tmp_size = ALIGN(size, PAGE_SIZE);
-
-			if (set_off_slab_cache(cachep, tmp_size, flags)) {
-				flags |= CFLGS_OFF_SLAB;
-				cachep->obj_offset += tmp_size - size;
-				size = tmp_size;
-				goto done;
-			}
-		}
-	}
-#endif
-
-	if (set_objfreelist_slab_cache(cachep, size, flags)) {
-		flags |= CFLGS_OBJFREELIST_SLAB;
-		goto done;
-	}
-
-	if (set_off_slab_cache(cachep, size, flags)) {
-		flags |= CFLGS_OFF_SLAB;
-		goto done;
-	}
-
-	if (set_on_slab_cache(cachep, size, flags))
-		goto done;
-
-	return -E2BIG;
-
-done:
-	cachep->freelist_size = cachep->num * sizeof(freelist_idx_t);
-	cachep->flags = flags;
-	cachep->allocflags = __GFP_COMP;
-	if (flags & SLAB_CACHE_DMA)
-		cachep->allocflags |= GFP_DMA;
-	if (flags & SLAB_CACHE_DMA32)
-		cachep->allocflags |= GFP_DMA32;
-	if (flags & SLAB_RECLAIM_ACCOUNT)
-		cachep->allocflags |= __GFP_RECLAIMABLE;
-	cachep->size = size;
-	cachep->reciprocal_buffer_size = reciprocal_value(size);
-
-#if DEBUG
-	/*
-	 * If we're going to use the generic kernel_map_pages()
-	 * poisoning, then it's going to smash the contents of
-	 * the redzone and userword anyhow, so switch them off.
-	 */
-	if (IS_ENABLED(CONFIG_PAGE_POISONING) &&
-		(cachep->flags & SLAB_POISON) &&
-		is_debug_pagealloc_cache(cachep))
-		cachep->flags &= ~(SLAB_RED_ZONE | SLAB_STORE_USER);
-#endif
-
-	err = setup_cpu_cache(cachep, gfp);
-	if (err) {
-		__kmem_cache_release(cachep);
-		return err;
-	}
-
-	return 0;
-}
-
-#if DEBUG
-static void check_irq_off(void)
-{
-	BUG_ON(!irqs_disabled());
-}
-
-static void check_irq_on(void)
-{
-	BUG_ON(irqs_disabled());
-}
-
-static void check_mutex_acquired(void)
-{
-	BUG_ON(!mutex_is_locked(&slab_mutex));
-}
-
-static void check_spinlock_acquired(struct kmem_cache *cachep)
-{
-#ifdef CONFIG_SMP
-	check_irq_off();
-	assert_raw_spin_locked(&get_node(cachep, numa_mem_id())->list_lock);
-#endif
-}
-
-static void check_spinlock_acquired_node(struct kmem_cache *cachep, int node)
-{
-#ifdef CONFIG_SMP
-	check_irq_off();
-	assert_raw_spin_locked(&get_node(cachep, node)->list_lock);
-#endif
-}
-
-#else
-#define check_irq_off()	do { } while(0)
-#define check_irq_on()	do { } while(0)
-#define check_mutex_acquired()	do { } while(0)
-#define check_spinlock_acquired(x) do { } while(0)
-#define check_spinlock_acquired_node(x, y) do { } while(0)
-#endif
-
-static void drain_array_locked(struct kmem_cache *cachep, struct array_cache *ac,
-				int node, bool free_all, struct list_head *list)
-{
-	int tofree;
-
-	if (!ac || !ac->avail)
-		return;
-
-	tofree = free_all ? ac->avail : (ac->limit + 4) / 5;
-	if (tofree > ac->avail)
-		tofree = (ac->avail + 1) / 2;
-
-	free_block(cachep, ac->entry, tofree, node, list);
-	ac->avail -= tofree;
-	memmove(ac->entry, &(ac->entry[tofree]), sizeof(void *) * ac->avail);
-}
-
-static void do_drain(void *arg)
-{
-	struct kmem_cache *cachep = arg;
-	struct array_cache *ac;
-	int node = numa_mem_id();
-	struct kmem_cache_node *n;
-	LIST_HEAD(list);
-
-	check_irq_off();
-	ac = cpu_cache_get(cachep);
-	n = get_node(cachep, node);
-	raw_spin_lock(&n->list_lock);
-	free_block(cachep, ac->entry, ac->avail, node, &list);
-	raw_spin_unlock(&n->list_lock);
-	ac->avail = 0;
-	slabs_destroy(cachep, &list);
-}
-
-static void drain_cpu_caches(struct kmem_cache *cachep)
-{
-	struct kmem_cache_node *n;
-	int node;
-	LIST_HEAD(list);
-
-	on_each_cpu(do_drain, cachep, 1);
-	check_irq_on();
-	for_each_kmem_cache_node(cachep, node, n)
-		if (n->alien)
-			drain_alien_cache(cachep, n->alien);
-
-	for_each_kmem_cache_node(cachep, node, n) {
-		raw_spin_lock_irq(&n->list_lock);
-		drain_array_locked(cachep, n->shared, node, true, &list);
-		raw_spin_unlock_irq(&n->list_lock);
-
-		slabs_destroy(cachep, &list);
-	}
-}
-
-/*
- * Remove slabs from the list of free slabs.
- * Specify the number of slabs to drain in tofree.
- *
- * Returns the actual number of slabs released.
- */
-static int drain_freelist(struct kmem_cache *cache,
-			struct kmem_cache_node *n, int tofree)
-{
-	struct list_head *p;
-	int nr_freed;
-	struct slab *slab;
-
-	nr_freed = 0;
-	while (nr_freed < tofree && !list_empty(&n->slabs_free)) {
-
-		raw_spin_lock_irq(&n->list_lock);
-		p = n->slabs_free.prev;
-		if (p == &n->slabs_free) {
-			raw_spin_unlock_irq(&n->list_lock);
-			goto out;
-		}
-
-		slab = list_entry(p, struct slab, slab_list);
-		list_del(&slab->slab_list);
-		n->free_slabs--;
-		n->total_slabs--;
-		/*
-		 * Safe to drop the lock. The slab is no longer linked
-		 * to the cache.
-		 */
-		n->free_objects -= cache->num;
-		raw_spin_unlock_irq(&n->list_lock);
-		slab_destroy(cache, slab);
-		nr_freed++;
-
-		cond_resched();
-	}
-out:
-	return nr_freed;
-}
-
-bool __kmem_cache_empty(struct kmem_cache *s)
-{
-	int node;
-	struct kmem_cache_node *n;
-
-	for_each_kmem_cache_node(s, node, n)
-		if (!list_empty(&n->slabs_full) ||
-		    !list_empty(&n->slabs_partial))
-			return false;
-	return true;
-}
-
-int __kmem_cache_shrink(struct kmem_cache *cachep)
-{
-	int ret = 0;
-	int node;
-	struct kmem_cache_node *n;
-
-	drain_cpu_caches(cachep);
-
-	check_irq_on();
-	for_each_kmem_cache_node(cachep, node, n) {
-		drain_freelist(cachep, n, INT_MAX);
-
-		ret += !list_empty(&n->slabs_full) ||
-			!list_empty(&n->slabs_partial);
-	}
-	return (ret ? 1 : 0);
-}
-
-int __kmem_cache_shutdown(struct kmem_cache *cachep)
-{
-	return __kmem_cache_shrink(cachep);
-}
-
-void __kmem_cache_release(struct kmem_cache *cachep)
-{
-	int i;
-	struct kmem_cache_node *n;
-
-	cache_random_seq_destroy(cachep);
-
-	free_percpu(cachep->cpu_cache);
-
-	/* NUMA: free the node structures */
-	for_each_kmem_cache_node(cachep, i, n) {
-		kfree(n->shared);
-		free_alien_cache(n->alien);
-		kfree(n);
-		cachep->node[i] = NULL;
-	}
-}
-
-/*
- * Get the memory for a slab management obj.
- *
- * For a slab cache when the slab descriptor is off-slab, the
- * slab descriptor can't come from the same cache which is being created,
- * Because if it is the case, that means we defer the creation of
- * the kmalloc_{dma,}_cache of size sizeof(slab descriptor) to this point.
- * And we eventually call down to __kmem_cache_create(), which
- * in turn looks up in the kmalloc_{dma,}_caches for the desired-size one.
- * This is a "chicken-and-egg" problem.
- *
- * So the off-slab slab descriptor shall come from the kmalloc_{dma,}_caches,
- * which are all initialized during kmem_cache_init().
- */
-static void *alloc_slabmgmt(struct kmem_cache *cachep,
-				   struct slab *slab, int colour_off,
-				   gfp_t local_flags, int nodeid)
-{
-	void *freelist;
-	void *addr = slab_address(slab);
-
-	slab->s_mem = addr + colour_off;
-	slab->active = 0;
-
-	if (OBJFREELIST_SLAB(cachep))
-		freelist = NULL;
-	else if (OFF_SLAB(cachep)) {
-		/* Slab management obj is off-slab. */
-		freelist = kmalloc_node(cachep->freelist_size,
-					      local_flags, nodeid);
-	} else {
-		/* We will use last bytes at the slab for freelist */
-		freelist = addr + (PAGE_SIZE << cachep->gfporder) -
-				cachep->freelist_size;
-	}
-
-	return freelist;
-}
-
-static inline freelist_idx_t get_free_obj(struct slab *slab, unsigned int idx)
-{
-	return ((freelist_idx_t *) slab->freelist)[idx];
-}
-
-static inline void set_free_obj(struct slab *slab,
-					unsigned int idx, freelist_idx_t val)
-{
-	((freelist_idx_t *)(slab->freelist))[idx] = val;
-}
-
-static void cache_init_objs_debug(struct kmem_cache *cachep, struct slab *slab)
-{
-#if DEBUG
-	int i;
-
-	for (i = 0; i < cachep->num; i++) {
-		void *objp = index_to_obj(cachep, slab, i);
-
-		if (cachep->flags & SLAB_STORE_USER)
-			*dbg_userword(cachep, objp) = NULL;
-
-		if (cachep->flags & SLAB_RED_ZONE) {
-			*dbg_redzone1(cachep, objp) = RED_INACTIVE;
-			*dbg_redzone2(cachep, objp) = RED_INACTIVE;
-		}
-		/*
-		 * Constructors are not allowed to allocate memory from the same
-		 * cache which they are a constructor for.  Otherwise, deadlock.
-		 * They must also be threaded.
-		 */
-		if (cachep->ctor && !(cachep->flags & SLAB_POISON)) {
-			kasan_unpoison_object_data(cachep,
-						   objp + obj_offset(cachep));
-			cachep->ctor(objp + obj_offset(cachep));
-			kasan_poison_object_data(
-				cachep, objp + obj_offset(cachep));
-		}
-
-		if (cachep->flags & SLAB_RED_ZONE) {
-			if (*dbg_redzone2(cachep, objp) != RED_INACTIVE)
-				slab_error(cachep, "constructor overwrote the end of an object");
-			if (*dbg_redzone1(cachep, objp) != RED_INACTIVE)
-				slab_error(cachep, "constructor overwrote the start of an object");
-		}
-		/* need to poison the objs? */
-		if (cachep->flags & SLAB_POISON) {
-			poison_obj(cachep, objp, POISON_FREE);
-			slab_kernel_map(cachep, objp, 0);
-		}
-	}
-#endif
-}
-
-#ifdef CONFIG_SLAB_FREELIST_RANDOM
-/* Hold information during a freelist initialization */
-struct freelist_init_state {
-	unsigned int pos;
-	unsigned int *list;
-	unsigned int count;
-};
-
-/*
- * Initialize the state based on the randomization method available.
- * return true if the pre-computed list is available, false otherwise.
- */
-static bool freelist_state_initialize(struct freelist_init_state *state,
-				struct kmem_cache *cachep,
-				unsigned int count)
-{
-	bool ret;
-	if (!cachep->random_seq) {
-		ret = false;
-	} else {
-		state->list = cachep->random_seq;
-		state->count = count;
-		state->pos = get_random_u32_below(count);
-		ret = true;
-	}
-	return ret;
-}
-
-/* Get the next entry on the list and randomize it using a random shift */
-static freelist_idx_t next_random_slot(struct freelist_init_state *state)
-{
-	if (state->pos >= state->count)
-		state->pos = 0;
-	return state->list[state->pos++];
-}
-
-/* Swap two freelist entries */
-static void swap_free_obj(struct slab *slab, unsigned int a, unsigned int b)
-{
-	swap(((freelist_idx_t *) slab->freelist)[a],
-		((freelist_idx_t *) slab->freelist)[b]);
-}
-
-/*
- * Shuffle the freelist initialization state based on pre-computed lists.
- * return true if the list was successfully shuffled, false otherwise.
- */
-static bool shuffle_freelist(struct kmem_cache *cachep, struct slab *slab)
-{
-	unsigned int objfreelist = 0, i, rand, count = cachep->num;
-	struct freelist_init_state state;
-	bool precomputed;
-
-	if (count < 2)
-		return false;
-
-	precomputed = freelist_state_initialize(&state, cachep, count);
-
-	/* Take a random entry as the objfreelist */
-	if (OBJFREELIST_SLAB(cachep)) {
-		if (!precomputed)
-			objfreelist = count - 1;
-		else
-			objfreelist = next_random_slot(&state);
-		slab->freelist = index_to_obj(cachep, slab, objfreelist) +
-						obj_offset(cachep);
-		count--;
-	}
-
-	/*
-	 * On early boot, generate the list dynamically.
-	 * Later use a pre-computed list for speed.
-	 */
-	if (!precomputed) {
-		for (i = 0; i < count; i++)
-			set_free_obj(slab, i, i);
-
-		/* Fisher-Yates shuffle */
-		for (i = count - 1; i > 0; i--) {
-			rand = get_random_u32_below(i + 1);
-			swap_free_obj(slab, i, rand);
-		}
-	} else {
-		for (i = 0; i < count; i++)
-			set_free_obj(slab, i, next_random_slot(&state));
-	}
-
-	if (OBJFREELIST_SLAB(cachep))
-		set_free_obj(slab, cachep->num - 1, objfreelist);
-
-	return true;
-}
-#else
-static inline bool shuffle_freelist(struct kmem_cache *cachep,
-				struct slab *slab)
-{
-	return false;
-}
-#endif /* CONFIG_SLAB_FREELIST_RANDOM */
-
-static void cache_init_objs(struct kmem_cache *cachep,
-			    struct slab *slab)
-{
-	int i;
-	void *objp;
-	bool shuffled;
-
-	cache_init_objs_debug(cachep, slab);
-
-	/* Try to randomize the freelist if enabled */
-	shuffled = shuffle_freelist(cachep, slab);
-
-	if (!shuffled && OBJFREELIST_SLAB(cachep)) {
-		slab->freelist = index_to_obj(cachep, slab, cachep->num - 1) +
-						obj_offset(cachep);
-	}
-
-	for (i = 0; i < cachep->num; i++) {
-		objp = index_to_obj(cachep, slab, i);
-		objp = kasan_init_slab_obj(cachep, objp);
-
-		/* constructor could break poison info */
-		if (DEBUG == 0 && cachep->ctor) {
-			kasan_unpoison_object_data(cachep, objp);
-			cachep->ctor(objp);
-			kasan_poison_object_data(cachep, objp);
-		}
-
-		if (!shuffled)
-			set_free_obj(slab, i, i);
-	}
-}
-
-static void *slab_get_obj(struct kmem_cache *cachep, struct slab *slab)
-{
-	void *objp;
-
-	objp = index_to_obj(cachep, slab, get_free_obj(slab, slab->active));
-	slab->active++;
-
-	return objp;
-}
-
-static void slab_put_obj(struct kmem_cache *cachep,
-			struct slab *slab, void *objp)
-{
-	unsigned int objnr = obj_to_index(cachep, slab, objp);
-#if DEBUG
-	unsigned int i;
-
-	/* Verify double free bug */
-	for (i = slab->active; i < cachep->num; i++) {
-		if (get_free_obj(slab, i) == objnr) {
-			pr_err("slab: double free detected in cache '%s', objp %px\n",
-			       cachep->name, objp);
-			BUG();
-		}
-	}
-#endif
-	slab->active--;
-	if (!slab->freelist)
-		slab->freelist = objp + obj_offset(cachep);
-
-	set_free_obj(slab, slab->active, objnr);
-}
-
-/*
- * Grow (by 1) the number of slabs within a cache.  This is called by
- * kmem_cache_alloc() when there are no active objs left in a cache.
- */
-static struct slab *cache_grow_begin(struct kmem_cache *cachep,
-				gfp_t flags, int nodeid)
-{
-	void *freelist;
-	size_t offset;
-	gfp_t local_flags;
-	int slab_node;
-	struct kmem_cache_node *n;
-	struct slab *slab;
-
-	/*
-	 * Be lazy and only check for valid flags here,  keeping it out of the
-	 * critical path in kmem_cache_alloc().
-	 */
-	if (unlikely(flags & GFP_SLAB_BUG_MASK))
-		flags = kmalloc_fix_flags(flags);
-
-	WARN_ON_ONCE(cachep->ctor && (flags & __GFP_ZERO));
-	local_flags = flags & (GFP_CONSTRAINT_MASK|GFP_RECLAIM_MASK);
-
-	check_irq_off();
-	if (gfpflags_allow_blocking(local_flags))
-		local_irq_enable();
-
-	/*
-	 * Get mem for the objs.  Attempt to allocate a physical page from
-	 * 'nodeid'.
-	 */
-	slab = kmem_getpages(cachep, local_flags, nodeid);
-	if (!slab)
-		goto failed;
-
-	slab_node = slab_nid(slab);
-	n = get_node(cachep, slab_node);
-
-	/* Get colour for the slab, and cal the next value. */
-	n->colour_next++;
-	if (n->colour_next >= cachep->colour)
-		n->colour_next = 0;
-
-	offset = n->colour_next;
-	if (offset >= cachep->colour)
-		offset = 0;
-
-	offset *= cachep->colour_off;
-
-	/*
-	 * Call kasan_poison_slab() before calling alloc_slabmgmt(), so
-	 * page_address() in the latter returns a non-tagged pointer,
-	 * as it should be for slab pages.
-	 */
-	kasan_poison_slab(slab);
-
-	/* Get slab management. */
-	freelist = alloc_slabmgmt(cachep, slab, offset,
-			local_flags & ~GFP_CONSTRAINT_MASK, slab_node);
-	if (OFF_SLAB(cachep) && !freelist)
-		goto opps1;
-
-	slab->slab_cache = cachep;
-	slab->freelist = freelist;
-
-	cache_init_objs(cachep, slab);
-
-	if (gfpflags_allow_blocking(local_flags))
-		local_irq_disable();
-
-	return slab;
-
-opps1:
-	kmem_freepages(cachep, slab);
-failed:
-	if (gfpflags_allow_blocking(local_flags))
-		local_irq_disable();
-	return NULL;
-}
-
-static void cache_grow_end(struct kmem_cache *cachep, struct slab *slab)
-{
-	struct kmem_cache_node *n;
-	void *list = NULL;
-
-	check_irq_off();
-
-	if (!slab)
-		return;
-
-	INIT_LIST_HEAD(&slab->slab_list);
-	n = get_node(cachep, slab_nid(slab));
-
-	raw_spin_lock(&n->list_lock);
-	n->total_slabs++;
-	if (!slab->active) {
-		list_add_tail(&slab->slab_list, &n->slabs_free);
-		n->free_slabs++;
-	} else
-		fixup_slab_list(cachep, n, slab, &list);
-
-	STATS_INC_GROWN(cachep);
-	n->free_objects += cachep->num - slab->active;
-	raw_spin_unlock(&n->list_lock);
-
-	fixup_objfreelist_debug(cachep, &list);
-}
-
-#if DEBUG
-
-/*
- * Perform extra freeing checks:
- * - detect bad pointers.
- * - POISON/RED_ZONE checking
- */
-static void kfree_debugcheck(const void *objp)
-{
-	if (!virt_addr_valid(objp)) {
-		pr_err("kfree_debugcheck: out of range ptr %lxh\n",
-		       (unsigned long)objp);
-		BUG();
-	}
-}
-
-static inline void verify_redzone_free(struct kmem_cache *cache, void *obj)
-{
-	unsigned long long redzone1, redzone2;
-
-	redzone1 = *dbg_redzone1(cache, obj);
-	redzone2 = *dbg_redzone2(cache, obj);
-
-	/*
-	 * Redzone is ok.
-	 */
-	if (redzone1 == RED_ACTIVE && redzone2 == RED_ACTIVE)
-		return;
-
-	if (redzone1 == RED_INACTIVE && redzone2 == RED_INACTIVE)
-		slab_error(cache, "double free detected");
-	else
-		slab_error(cache, "memory outside object was overwritten");
-
-	pr_err("%px: redzone 1:0x%llx, redzone 2:0x%llx\n",
-	       obj, redzone1, redzone2);
-}
-
-static void *cache_free_debugcheck(struct kmem_cache *cachep, void *objp,
-				   unsigned long caller)
-{
-	unsigned int objnr;
-	struct slab *slab;
-
-	BUG_ON(virt_to_cache(objp) != cachep);
-
-	objp -= obj_offset(cachep);
-	kfree_debugcheck(objp);
-	slab = virt_to_slab(objp);
-
-	if (cachep->flags & SLAB_RED_ZONE) {
-		verify_redzone_free(cachep, objp);
-		*dbg_redzone1(cachep, objp) = RED_INACTIVE;
-		*dbg_redzone2(cachep, objp) = RED_INACTIVE;
-	}
-	if (cachep->flags & SLAB_STORE_USER)
-		*dbg_userword(cachep, objp) = (void *)caller;
-
-	objnr = obj_to_index(cachep, slab, objp);
-
-	BUG_ON(objnr >= cachep->num);
-	BUG_ON(objp != index_to_obj(cachep, slab, objnr));
-
-	if (cachep->flags & SLAB_POISON) {
-		poison_obj(cachep, objp, POISON_FREE);
-		slab_kernel_map(cachep, objp, 0);
-	}
-	return objp;
-}
-
-#else
-#define kfree_debugcheck(x) do { } while(0)
-#define cache_free_debugcheck(x, objp, z) (objp)
-#endif
-
-static inline void fixup_objfreelist_debug(struct kmem_cache *cachep,
-						void **list)
-{
-#if DEBUG
-	void *next = *list;
-	void *objp;
-
-	while (next) {
-		objp = next - obj_offset(cachep);
-		next = *(void **)next;
-		poison_obj(cachep, objp, POISON_FREE);
-	}
-#endif
-}
-
-static inline void fixup_slab_list(struct kmem_cache *cachep,
-				struct kmem_cache_node *n, struct slab *slab,
-				void **list)
-{
-	/* move slabp to correct slabp list: */
-	list_del(&slab->slab_list);
-	if (slab->active == cachep->num) {
-		list_add(&slab->slab_list, &n->slabs_full);
-		if (OBJFREELIST_SLAB(cachep)) {
-#if DEBUG
-			/* Poisoning will be done without holding the lock */
-			if (cachep->flags & SLAB_POISON) {
-				void **objp = slab->freelist;
-
-				*objp = *list;
-				*list = objp;
-			}
-#endif
-			slab->freelist = NULL;
-		}
-	} else
-		list_add(&slab->slab_list, &n->slabs_partial);
-}
-
-/* Try to find non-pfmemalloc slab if needed */
-static noinline struct slab *get_valid_first_slab(struct kmem_cache_node *n,
-					struct slab *slab, bool pfmemalloc)
-{
-	if (!slab)
-		return NULL;
-
-	if (pfmemalloc)
-		return slab;
-
-	if (!slab_test_pfmemalloc(slab))
-		return slab;
-
-	/* No need to keep pfmemalloc slab if we have enough free objects */
-	if (n->free_objects > n->free_limit) {
-		slab_clear_pfmemalloc(slab);
-		return slab;
-	}
-
-	/* Move pfmemalloc slab to the end of list to speed up next search */
-	list_del(&slab->slab_list);
-	if (!slab->active) {
-		list_add_tail(&slab->slab_list, &n->slabs_free);
-		n->free_slabs++;
-	} else
-		list_add_tail(&slab->slab_list, &n->slabs_partial);
-
-	list_for_each_entry(slab, &n->slabs_partial, slab_list) {
-		if (!slab_test_pfmemalloc(slab))
-			return slab;
-	}
-
-	n->free_touched = 1;
-	list_for_each_entry(slab, &n->slabs_free, slab_list) {
-		if (!slab_test_pfmemalloc(slab)) {
-			n->free_slabs--;
-			return slab;
-		}
-	}
-
-	return NULL;
-}
-
-static struct slab *get_first_slab(struct kmem_cache_node *n, bool pfmemalloc)
-{
-	struct slab *slab;
-
-	assert_raw_spin_locked(&n->list_lock);
-	slab = list_first_entry_or_null(&n->slabs_partial, struct slab,
-					slab_list);
-	if (!slab) {
-		n->free_touched = 1;
-		slab = list_first_entry_or_null(&n->slabs_free, struct slab,
-						slab_list);
-		if (slab)
-			n->free_slabs--;
-	}
-
-	if (sk_memalloc_socks())
-		slab = get_valid_first_slab(n, slab, pfmemalloc);
-
-	return slab;
-}
-
-static noinline void *cache_alloc_pfmemalloc(struct kmem_cache *cachep,
-				struct kmem_cache_node *n, gfp_t flags)
-{
-	struct slab *slab;
-	void *obj;
-	void *list = NULL;
-
-	if (!gfp_pfmemalloc_allowed(flags))
-		return NULL;
-
-	raw_spin_lock(&n->list_lock);
-	slab = get_first_slab(n, true);
-	if (!slab) {
-		raw_spin_unlock(&n->list_lock);
-		return NULL;
-	}
-
-	obj = slab_get_obj(cachep, slab);
-	n->free_objects--;
-
-	fixup_slab_list(cachep, n, slab, &list);
-
-	raw_spin_unlock(&n->list_lock);
-	fixup_objfreelist_debug(cachep, &list);
-
-	return obj;
-}
-
-/*
- * Slab list should be fixed up by fixup_slab_list() for existing slab
- * or cache_grow_end() for new slab
- */
-static __always_inline int alloc_block(struct kmem_cache *cachep,
-		struct array_cache *ac, struct slab *slab, int batchcount)
-{
-	/*
-	 * There must be at least one object available for
-	 * allocation.
-	 */
-	BUG_ON(slab->active >= cachep->num);
-
-	while (slab->active < cachep->num && batchcount--) {
-		STATS_INC_ALLOCED(cachep);
-		STATS_INC_ACTIVE(cachep);
-		STATS_SET_HIGH(cachep);
-
-		ac->entry[ac->avail++] = slab_get_obj(cachep, slab);
-	}
-
-	return batchcount;
-}
-
-static void *cache_alloc_refill(struct kmem_cache *cachep, gfp_t flags)
-{
-	int batchcount;
-	struct kmem_cache_node *n;
-	struct array_cache *ac, *shared;
-	int node;
-	void *list = NULL;
-	struct slab *slab;
-
-	check_irq_off();
-	node = numa_mem_id();
-
-	ac = cpu_cache_get(cachep);
-	batchcount = ac->batchcount;
-	if (!ac->touched && batchcount > BATCHREFILL_LIMIT) {
-		/*
-		 * If there was little recent activity on this cache, then
-		 * perform only a partial refill.  Otherwise we could generate
-		 * refill bouncing.
-		 */
-		batchcount = BATCHREFILL_LIMIT;
-	}
-	n = get_node(cachep, node);
-
-	BUG_ON(ac->avail > 0 || !n);
-	shared = READ_ONCE(n->shared);
-	if (!n->free_objects && (!shared || !shared->avail))
-		goto direct_grow;
-
-	raw_spin_lock(&n->list_lock);
-	shared = READ_ONCE(n->shared);
-
-	/* See if we can refill from the shared array */
-	if (shared && transfer_objects(ac, shared, batchcount)) {
-		shared->touched = 1;
-		goto alloc_done;
-	}
-
-	while (batchcount > 0) {
-		/* Get slab alloc is to come from. */
-		slab = get_first_slab(n, false);
-		if (!slab)
-			goto must_grow;
-
-		check_spinlock_acquired(cachep);
-
-		batchcount = alloc_block(cachep, ac, slab, batchcount);
-		fixup_slab_list(cachep, n, slab, &list);
-	}
-
-must_grow:
-	n->free_objects -= ac->avail;
-alloc_done:
-	raw_spin_unlock(&n->list_lock);
-	fixup_objfreelist_debug(cachep, &list);
-
-direct_grow:
-	if (unlikely(!ac->avail)) {
-		/* Check if we can use obj in pfmemalloc slab */
-		if (sk_memalloc_socks()) {
-			void *obj = cache_alloc_pfmemalloc(cachep, n, flags);
-
-			if (obj)
-				return obj;
-		}
-
-		slab = cache_grow_begin(cachep, gfp_exact_node(flags), node);
-
-		/*
-		 * cache_grow_begin() can reenable interrupts,
-		 * then ac could change.
-		 */
-		ac = cpu_cache_get(cachep);
-		if (!ac->avail && slab)
-			alloc_block(cachep, ac, slab, batchcount);
-		cache_grow_end(cachep, slab);
-
-		if (!ac->avail)
-			return NULL;
-	}
-	ac->touched = 1;
-
-	return ac->entry[--ac->avail];
-}
-
-#if DEBUG
-static void *cache_alloc_debugcheck_after(struct kmem_cache *cachep,
-				gfp_t flags, void *objp, unsigned long caller)
-{
-	WARN_ON_ONCE(cachep->ctor && (flags & __GFP_ZERO));
-	if (!objp || is_kfence_address(objp))
-		return objp;
-	if (cachep->flags & SLAB_POISON) {
-		check_poison_obj(cachep, objp);
-		slab_kernel_map(cachep, objp, 1);
-		poison_obj(cachep, objp, POISON_INUSE);
-	}
-	if (cachep->flags & SLAB_STORE_USER)
-		*dbg_userword(cachep, objp) = (void *)caller;
-
-	if (cachep->flags & SLAB_RED_ZONE) {
-		if (*dbg_redzone1(cachep, objp) != RED_INACTIVE ||
-				*dbg_redzone2(cachep, objp) != RED_INACTIVE) {
-			slab_error(cachep, "double free, or memory outside object was overwritten");
-			pr_err("%px: redzone 1:0x%llx, redzone 2:0x%llx\n",
-			       objp, *dbg_redzone1(cachep, objp),
-			       *dbg_redzone2(cachep, objp));
-		}
-		*dbg_redzone1(cachep, objp) = RED_ACTIVE;
-		*dbg_redzone2(cachep, objp) = RED_ACTIVE;
-	}
-
-	objp += obj_offset(cachep);
-	if (cachep->ctor && cachep->flags & SLAB_POISON)
-		cachep->ctor(objp);
-	if ((unsigned long)objp & (arch_slab_minalign() - 1)) {
-		pr_err("0x%px: not aligned to arch_slab_minalign()=%u\n", objp,
-		       arch_slab_minalign());
-	}
-	return objp;
-}
-#else
-#define cache_alloc_debugcheck_after(a, b, objp, d) (objp)
-#endif
-
-static inline void *____cache_alloc(struct kmem_cache *cachep, gfp_t flags)
-{
-	void *objp;
-	struct array_cache *ac;
-
-	check_irq_off();
-
-	ac = cpu_cache_get(cachep);
-	if (likely(ac->avail)) {
-		ac->touched = 1;
-		objp = ac->entry[--ac->avail];
-
-		STATS_INC_ALLOCHIT(cachep);
-		goto out;
-	}
-
-	STATS_INC_ALLOCMISS(cachep);
-	objp = cache_alloc_refill(cachep, flags);
-	/*
-	 * the 'ac' may be updated by cache_alloc_refill(),
-	 * and kmemleak_erase() requires its correct value.
-	 */
-	ac = cpu_cache_get(cachep);
-
-out:
-	/*
-	 * To avoid a false negative, if an object that is in one of the
-	 * per-CPU caches is leaked, we need to make sure kmemleak doesn't
-	 * treat the array pointers as a reference to the object.
-	 */
-	if (objp)
-		kmemleak_erase(&ac->entry[ac->avail]);
-	return objp;
-}
-
-#ifdef CONFIG_NUMA
-static void *____cache_alloc_node(struct kmem_cache *, gfp_t, int);
-
-/*
- * Try allocating on another node if PFA_SPREAD_SLAB is a mempolicy is set.
- *
- * If we are in_interrupt, then process context, including cpusets and
- * mempolicy, may not apply and should not be used for allocation policy.
- */
-static void *alternate_node_alloc(struct kmem_cache *cachep, gfp_t flags)
-{
-	int nid_alloc, nid_here;
-
-	if (in_interrupt() || (flags & __GFP_THISNODE))
-		return NULL;
-	nid_alloc = nid_here = numa_mem_id();
-	if (cpuset_do_slab_mem_spread() && (cachep->flags & SLAB_MEM_SPREAD))
-		nid_alloc = cpuset_slab_spread_node();
-	else if (current->mempolicy)
-		nid_alloc = mempolicy_slab_node();
-	if (nid_alloc != nid_here)
-		return ____cache_alloc_node(cachep, flags, nid_alloc);
-	return NULL;
-}
-
-/*
- * Fallback function if there was no memory available and no objects on a
- * certain node and fall back is permitted. First we scan all the
- * available node for available objects. If that fails then we
- * perform an allocation without specifying a node. This allows the page
- * allocator to do its reclaim / fallback magic. We then insert the
- * slab into the proper nodelist and then allocate from it.
- */
-static void *fallback_alloc(struct kmem_cache *cache, gfp_t flags)
-{
-	struct zonelist *zonelist;
-	struct zoneref *z;
-	struct zone *zone;
-	enum zone_type highest_zoneidx = gfp_zone(flags);
-	void *obj = NULL;
-	struct slab *slab;
-	int nid;
-	unsigned int cpuset_mems_cookie;
-
-	if (flags & __GFP_THISNODE)
-		return NULL;
-
-retry_cpuset:
-	cpuset_mems_cookie = read_mems_allowed_begin();
-	zonelist = node_zonelist(mempolicy_slab_node(), flags);
-
-retry:
-	/*
-	 * Look through allowed nodes for objects available
-	 * from existing per node queues.
-	 */
-	for_each_zone_zonelist(zone, z, zonelist, highest_zoneidx) {
-		nid = zone_to_nid(zone);
-
-		if (cpuset_zone_allowed(zone, flags) &&
-			get_node(cache, nid) &&
-			get_node(cache, nid)->free_objects) {
-				obj = ____cache_alloc_node(cache,
-					gfp_exact_node(flags), nid);
-				if (obj)
-					break;
-		}
-	}
-
-	if (!obj) {
-		/*
-		 * This allocation will be performed within the constraints
-		 * of the current cpuset / memory policy requirements.
-		 * We may trigger various forms of reclaim on the allowed
-		 * set and go into memory reserves if necessary.
-		 */
-		slab = cache_grow_begin(cache, flags, numa_mem_id());
-		cache_grow_end(cache, slab);
-		if (slab) {
-			nid = slab_nid(slab);
-			obj = ____cache_alloc_node(cache,
-				gfp_exact_node(flags), nid);
-
-			/*
-			 * Another processor may allocate the objects in
-			 * the slab since we are not holding any locks.
-			 */
-			if (!obj)
-				goto retry;
-		}
-	}
-
-	if (unlikely(!obj && read_mems_allowed_retry(cpuset_mems_cookie)))
-		goto retry_cpuset;
-	return obj;
-}
-
-/*
- * An interface to enable slab creation on nodeid
- */
-static void *____cache_alloc_node(struct kmem_cache *cachep, gfp_t flags,
-				int nodeid)
-{
-	struct slab *slab;
-	struct kmem_cache_node *n;
-	void *obj = NULL;
-	void *list = NULL;
-
-	VM_BUG_ON(nodeid < 0 || nodeid >= MAX_NUMNODES);
-	n = get_node(cachep, nodeid);
-	BUG_ON(!n);
-
-	check_irq_off();
-	raw_spin_lock(&n->list_lock);
-	slab = get_first_slab(n, false);
-	if (!slab)
-		goto must_grow;
-
-	check_spinlock_acquired_node(cachep, nodeid);
-
-	STATS_INC_NODEALLOCS(cachep);
-	STATS_INC_ACTIVE(cachep);
-	STATS_SET_HIGH(cachep);
-
-	BUG_ON(slab->active == cachep->num);
-
-	obj = slab_get_obj(cachep, slab);
-	n->free_objects--;
-
-	fixup_slab_list(cachep, n, slab, &list);
-
-	raw_spin_unlock(&n->list_lock);
-	fixup_objfreelist_debug(cachep, &list);
-	return obj;
-
-must_grow:
-	raw_spin_unlock(&n->list_lock);
-	slab = cache_grow_begin(cachep, gfp_exact_node(flags), nodeid);
-	if (slab) {
-		/* This slab isn't counted yet so don't update free_objects */
-		obj = slab_get_obj(cachep, slab);
-	}
-	cache_grow_end(cachep, slab);
-
-	return obj ? obj : fallback_alloc(cachep, flags);
-}
-
-static __always_inline void *
-__do_cache_alloc(struct kmem_cache *cachep, gfp_t flags, int nodeid)
-{
-	void *objp = NULL;
-	int slab_node = numa_mem_id();
-
-	if (nodeid == NUMA_NO_NODE) {
-		if (current->mempolicy || cpuset_do_slab_mem_spread()) {
-			objp = alternate_node_alloc(cachep, flags);
-			if (objp)
-				goto out;
-		}
-		/*
-		 * Use the locally cached objects if possible.
-		 * However ____cache_alloc does not allow fallback
-		 * to other nodes. It may fail while we still have
-		 * objects on other nodes available.
-		 */
-		objp = ____cache_alloc(cachep, flags);
-		nodeid = slab_node;
-	} else if (nodeid == slab_node) {
-		objp = ____cache_alloc(cachep, flags);
-	} else if (!get_node(cachep, nodeid)) {
-		/* Node not bootstrapped yet */
-		objp = fallback_alloc(cachep, flags);
-		goto out;
-	}
-
-	/*
-	 * We may just have run out of memory on the local node.
-	 * ____cache_alloc_node() knows how to locate memory on other nodes
-	 */
-	if (!objp)
-		objp = ____cache_alloc_node(cachep, flags, nodeid);
-out:
-	return objp;
-}
-#else
-
-static __always_inline void *
-__do_cache_alloc(struct kmem_cache *cachep, gfp_t flags, int nodeid __maybe_unused)
-{
-	return ____cache_alloc(cachep, flags);
-}
-
-#endif /* CONFIG_NUMA */
-
-static __always_inline void *
-slab_alloc_node(struct kmem_cache *cachep, struct list_lru *lru, gfp_t flags,
-		int nodeid, size_t orig_size, unsigned long caller)
-{
-	unsigned long save_flags;
-	void *objp;
-	struct obj_cgroup *objcg = NULL;
-	bool init = false;
-
-	flags &= gfp_allowed_mask;
-	cachep = slab_pre_alloc_hook(cachep, lru, &objcg, 1, flags);
-	if (unlikely(!cachep))
-		return NULL;
-
-	objp = kfence_alloc(cachep, orig_size, flags);
-	if (unlikely(objp))
-		goto out;
-
-	local_irq_save(save_flags);
-	objp = __do_cache_alloc(cachep, flags, nodeid);
-	local_irq_restore(save_flags);
-	objp = cache_alloc_debugcheck_after(cachep, flags, objp, caller);
-	prefetchw(objp);
-	init = slab_want_init_on_alloc(flags, cachep);
-
-out:
-	slab_post_alloc_hook(cachep, objcg, flags, 1, &objp, init,
-				cachep->object_size);
-	return objp;
-}
-
-static __always_inline void *
-slab_alloc(struct kmem_cache *cachep, struct list_lru *lru, gfp_t flags,
-	   size_t orig_size, unsigned long caller)
-{
-	return slab_alloc_node(cachep, lru, flags, NUMA_NO_NODE, orig_size,
-			       caller);
-}
-
-/*
- * Caller needs to acquire correct kmem_cache_node's list_lock
- * @list: List of detached free slabs should be freed by caller
- */
-static void free_block(struct kmem_cache *cachep, void **objpp,
-			int nr_objects, int node, struct list_head *list)
-{
-	int i;
-	struct kmem_cache_node *n = get_node(cachep, node);
-	struct slab *slab;
-
-	n->free_objects += nr_objects;
-
-	for (i = 0; i < nr_objects; i++) {
-		void *objp;
-		struct slab *slab;
-
-		objp = objpp[i];
-
-		slab = virt_to_slab(objp);
-		list_del(&slab->slab_list);
-		check_spinlock_acquired_node(cachep, node);
-		slab_put_obj(cachep, slab, objp);
-		STATS_DEC_ACTIVE(cachep);
-
-		/* fixup slab chains */
-		if (slab->active == 0) {
-			list_add(&slab->slab_list, &n->slabs_free);
-			n->free_slabs++;
-		} else {
-			/* Unconditionally move a slab to the end of the
-			 * partial list on free - maximum time for the
-			 * other objects to be freed, too.
-			 */
-			list_add_tail(&slab->slab_list, &n->slabs_partial);
-		}
-	}
-
-	while (n->free_objects > n->free_limit && !list_empty(&n->slabs_free)) {
-		n->free_objects -= cachep->num;
-
-		slab = list_last_entry(&n->slabs_free, struct slab, slab_list);
-		list_move(&slab->slab_list, list);
-		n->free_slabs--;
-		n->total_slabs--;
-	}
-}
-
-static void cache_flusharray(struct kmem_cache *cachep, struct array_cache *ac)
-{
-	int batchcount;
-	struct kmem_cache_node *n;
-	int node = numa_mem_id();
-	LIST_HEAD(list);
-
-	batchcount = ac->batchcount;
-
-	check_irq_off();
-	n = get_node(cachep, node);
-	raw_spin_lock(&n->list_lock);
-	if (n->shared) {
-		struct array_cache *shared_array = n->shared;
-		int max = shared_array->limit - shared_array->avail;
-		if (max) {
-			if (batchcount > max)
-				batchcount = max;
-			memcpy(&(shared_array->entry[shared_array->avail]),
-			       ac->entry, sizeof(void *) * batchcount);
-			shared_array->avail += batchcount;
-			goto free_done;
-		}
-	}
-
-	free_block(cachep, ac->entry, batchcount, node, &list);
-free_done:
-#if STATS
-	{
-		int i = 0;
-		struct slab *slab;
-
-		list_for_each_entry(slab, &n->slabs_free, slab_list) {
-			BUG_ON(slab->active);
-
-			i++;
-		}
-		STATS_SET_FREEABLE(cachep, i);
-	}
-#endif
-	raw_spin_unlock(&n->list_lock);
-	ac->avail -= batchcount;
-	memmove(ac->entry, &(ac->entry[batchcount]), sizeof(void *)*ac->avail);
-	slabs_destroy(cachep, &list);
-}
-
-/*
- * Release an obj back to its cache. If the obj has a constructed state, it must
- * be in this state _before_ it is released.  Called with disabled ints.
- */
-static __always_inline void __cache_free(struct kmem_cache *cachep, void *objp,
-					 unsigned long caller)
-{
-	bool init;
-
-	memcg_slab_free_hook(cachep, virt_to_slab(objp), &objp, 1);
-
-	if (is_kfence_address(objp)) {
-		kmemleak_free_recursive(objp, cachep->flags);
-		__kfence_free(objp);
-		return;
-	}
-
-	/*
-	 * As memory initialization might be integrated into KASAN,
-	 * kasan_slab_free and initialization memset must be
-	 * kept together to avoid discrepancies in behavior.
-	 */
-	init = slab_want_init_on_free(cachep);
-	if (init && !kasan_has_integrated_init())
-		memset(objp, 0, cachep->object_size);
-	/* KASAN might put objp into memory quarantine, delaying its reuse. */
-	if (kasan_slab_free(cachep, objp, init))
-		return;
-
-	/* Use KCSAN to help debug racy use-after-free. */
-	if (!(cachep->flags & SLAB_TYPESAFE_BY_RCU))
-		__kcsan_check_access(objp, cachep->object_size,
-				     KCSAN_ACCESS_WRITE | KCSAN_ACCESS_ASSERT);
-
-	___cache_free(cachep, objp, caller);
-}
-
-void ___cache_free(struct kmem_cache *cachep, void *objp,
-		unsigned long caller)
-{
-	struct array_cache *ac = cpu_cache_get(cachep);
-
-	check_irq_off();
-	kmemleak_free_recursive(objp, cachep->flags);
-	objp = cache_free_debugcheck(cachep, objp, caller);
-
-	/*
-	 * Skip calling cache_free_alien() when the platform is not numa.
-	 * This will avoid cache misses that happen while accessing slabp (which
-	 * is per page memory  reference) to get nodeid. Instead use a global
-	 * variable to skip the call, which is mostly likely to be present in
-	 * the cache.
-	 */
-	if (nr_online_nodes > 1 && cache_free_alien(cachep, objp))
-		return;
-
-	if (ac->avail < ac->limit) {
-		STATS_INC_FREEHIT(cachep);
-	} else {
-		STATS_INC_FREEMISS(cachep);
-		cache_flusharray(cachep, ac);
-	}
-
-	if (sk_memalloc_socks()) {
-		struct slab *slab = virt_to_slab(objp);
-
-		if (unlikely(slab_test_pfmemalloc(slab))) {
-			cache_free_pfmemalloc(cachep, slab, objp);
-			return;
-		}
-	}
-
-	__free_one(ac, objp);
-}
-
-static __always_inline
-void *__kmem_cache_alloc_lru(struct kmem_cache *cachep, struct list_lru *lru,
-			     gfp_t flags)
-{
-	void *ret = slab_alloc(cachep, lru, flags, cachep->object_size, _RET_IP_);
-
-	trace_kmem_cache_alloc(_RET_IP_, ret, cachep, flags, NUMA_NO_NODE);
-
-	return ret;
-}
-
-void *kmem_cache_alloc(struct kmem_cache *cachep, gfp_t flags)
-{
-	return __kmem_cache_alloc_lru(cachep, NULL, flags);
-}
-EXPORT_SYMBOL(kmem_cache_alloc);
-
-void *kmem_cache_alloc_lru(struct kmem_cache *cachep, struct list_lru *lru,
-			   gfp_t flags)
-{
-	return __kmem_cache_alloc_lru(cachep, lru, flags);
-}
-EXPORT_SYMBOL(kmem_cache_alloc_lru);
-
-static __always_inline void
-cache_alloc_debugcheck_after_bulk(struct kmem_cache *s, gfp_t flags,
-				  size_t size, void **p, unsigned long caller)
-{
-	size_t i;
-
-	for (i = 0; i < size; i++)
-		p[i] = cache_alloc_debugcheck_after(s, flags, p[i], caller);
-}
-
-int kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size,
-			  void **p)
-{
-	struct obj_cgroup *objcg = NULL;
-	unsigned long irqflags;
-	size_t i;
-
-	s = slab_pre_alloc_hook(s, NULL, &objcg, size, flags);
-	if (!s)
-		return 0;
-
-	local_irq_save(irqflags);
-	for (i = 0; i < size; i++) {
-		void *objp = kfence_alloc(s, s->object_size, flags) ?:
-			     __do_cache_alloc(s, flags, NUMA_NO_NODE);
-
-		if (unlikely(!objp))
-			goto error;
-		p[i] = objp;
-	}
-	local_irq_restore(irqflags);
-
-	cache_alloc_debugcheck_after_bulk(s, flags, size, p, _RET_IP_);
-
-	/*
-	 * memcg and kmem_cache debug support and memory initialization.
-	 * Done outside of the IRQ disabled section.
-	 */
-	slab_post_alloc_hook(s, objcg, flags, size, p,
-			slab_want_init_on_alloc(flags, s), s->object_size);
-	/* FIXME: Trace call missing. Christoph would like a bulk variant */
-	return size;
-error:
-	local_irq_restore(irqflags);
-	cache_alloc_debugcheck_after_bulk(s, flags, i, p, _RET_IP_);
-	slab_post_alloc_hook(s, objcg, flags, i, p, false, s->object_size);
-	kmem_cache_free_bulk(s, i, p);
-	return 0;
-}
-EXPORT_SYMBOL(kmem_cache_alloc_bulk);
-
-void *kmem_cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid)
-{
-	void *ret = slab_alloc_node(cachep, NULL, flags, nodeid, cachep->object_size, _RET_IP_);
-
-	trace_kmem_cache_alloc(_RET_IP_, ret, cachep, flags, nodeid);
-
-	return ret;
-}
-EXPORT_SYMBOL(kmem_cache_alloc_node);
-
-void *__kmem_cache_alloc_node(struct kmem_cache *cachep, gfp_t flags,
-			     int nodeid, size_t orig_size,
-			     unsigned long caller)
-{
-	return slab_alloc_node(cachep, NULL, flags, nodeid,
-			       orig_size, caller);
-}
-
-#ifdef CONFIG_PRINTK
-void __kmem_obj_info(struct kmem_obj_info *kpp, void *object, struct slab *slab)
-{
-	struct kmem_cache *cachep;
-	unsigned int objnr;
-	void *objp;
-
-	kpp->kp_ptr = object;
-	kpp->kp_slab = slab;
-	cachep = slab->slab_cache;
-	kpp->kp_slab_cache = cachep;
-	objp = object - obj_offset(cachep);
-	kpp->kp_data_offset = obj_offset(cachep);
-	slab = virt_to_slab(objp);
-	objnr = obj_to_index(cachep, slab, objp);
-	objp = index_to_obj(cachep, slab, objnr);
-	kpp->kp_objp = objp;
-	if (DEBUG && cachep->flags & SLAB_STORE_USER)
-		kpp->kp_ret = *dbg_userword(cachep, objp);
-}
-#endif
-
-static __always_inline
-void __do_kmem_cache_free(struct kmem_cache *cachep, void *objp,
-			  unsigned long caller)
-{
-	unsigned long flags;
-
-	local_irq_save(flags);
-	debug_check_no_locks_freed(objp, cachep->object_size);
-	if (!(cachep->flags & SLAB_DEBUG_OBJECTS))
-		debug_check_no_obj_freed(objp, cachep->object_size);
-	__cache_free(cachep, objp, caller);
-	local_irq_restore(flags);
-}
-
-void __kmem_cache_free(struct kmem_cache *cachep, void *objp,
-		       unsigned long caller)
-{
-	__do_kmem_cache_free(cachep, objp, caller);
-}
-
-void kmem_cache_free(struct kmem_cache *cachep, void *objp)
-{
-	cachep = cache_from_obj(cachep, objp);
-	if (!cachep)
-		return;
-
-	trace_kmem_cache_free(_RET_IP_, objp, cachep);
-	__do_kmem_cache_free(cachep, objp, _RET_IP_);
-}
-EXPORT_SYMBOL(kmem_cache_free);
-
-void kmem_cache_free_bulk(struct kmem_cache *orig_s, size_t size, void **p)
-{
-	unsigned long flags;
-
-	local_irq_save(flags);
-	for (int i = 0; i < size; i++) {
-		void *objp = p[i];
-		struct kmem_cache *s;
-
-		if (!orig_s) {
-			struct folio *folio = virt_to_folio(objp);
-
-			/* called via kfree_bulk */
-			if (!folio_test_slab(folio)) {
-				local_irq_restore(flags);
-				free_large_kmalloc(folio, objp);
-				local_irq_save(flags);
-				continue;
-			}
-			s = folio_slab(folio)->slab_cache;
-		} else {
-			s = cache_from_obj(orig_s, objp);
-		}
-
-		if (!s)
-			continue;
-
-		debug_check_no_locks_freed(objp, s->object_size);
-		if (!(s->flags & SLAB_DEBUG_OBJECTS))
-			debug_check_no_obj_freed(objp, s->object_size);
-
-		__cache_free(s, objp, _RET_IP_);
-	}
-	local_irq_restore(flags);
-
-	/* FIXME: add tracing */
-}
-EXPORT_SYMBOL(kmem_cache_free_bulk);
-
-/*
- * This initializes kmem_cache_node or resizes various caches for all nodes.
- */
-static int setup_kmem_cache_nodes(struct kmem_cache *cachep, gfp_t gfp)
-{
-	int ret;
-	int node;
-	struct kmem_cache_node *n;
-
-	for_each_online_node(node) {
-		ret = setup_kmem_cache_node(cachep, node, gfp, true);
-		if (ret)
-			goto fail;
-
-	}
-
-	return 0;
-
-fail:
-	if (!cachep->list.next) {
-		/* Cache is not active yet. Roll back what we did */
-		node--;
-		while (node >= 0) {
-			n = get_node(cachep, node);
-			if (n) {
-				kfree(n->shared);
-				free_alien_cache(n->alien);
-				kfree(n);
-				cachep->node[node] = NULL;
-			}
-			node--;
-		}
-	}
-	return -ENOMEM;
-}
-
-/* Always called with the slab_mutex held */
-static int do_tune_cpucache(struct kmem_cache *cachep, int limit,
-			    int batchcount, int shared, gfp_t gfp)
-{
-	struct array_cache __percpu *cpu_cache, *prev;
-	int cpu;
-
-	cpu_cache = alloc_kmem_cache_cpus(cachep, limit, batchcount);
-	if (!cpu_cache)
-		return -ENOMEM;
-
-	prev = cachep->cpu_cache;
-	cachep->cpu_cache = cpu_cache;
-	/*
-	 * Without a previous cpu_cache there's no need to synchronize remote
-	 * cpus, so skip the IPIs.
-	 */
-	if (prev)
-		kick_all_cpus_sync();
-
-	check_irq_on();
-	cachep->batchcount = batchcount;
-	cachep->limit = limit;
-	cachep->shared = shared;
-
-	if (!prev)
-		goto setup_node;
-
-	for_each_online_cpu(cpu) {
-		LIST_HEAD(list);
-		int node;
-		struct kmem_cache_node *n;
-		struct array_cache *ac = per_cpu_ptr(prev, cpu);
-
-		node = cpu_to_mem(cpu);
-		n = get_node(cachep, node);
-		raw_spin_lock_irq(&n->list_lock);
-		free_block(cachep, ac->entry, ac->avail, node, &list);
-		raw_spin_unlock_irq(&n->list_lock);
-		slabs_destroy(cachep, &list);
-	}
-	free_percpu(prev);
-
-setup_node:
-	return setup_kmem_cache_nodes(cachep, gfp);
-}
-
-/* Called with slab_mutex held always */
-static int enable_cpucache(struct kmem_cache *cachep, gfp_t gfp)
-{
-	int err;
-	int limit = 0;
-	int shared = 0;
-	int batchcount = 0;
-
-	err = cache_random_seq_create(cachep, cachep->num, gfp);
-	if (err)
-		goto end;
-
-	/*
-	 * The head array serves three purposes:
-	 * - create a LIFO ordering, i.e. return objects that are cache-warm
-	 * - reduce the number of spinlock operations.
-	 * - reduce the number of linked list operations on the slab and
-	 *   bufctl chains: array operations are cheaper.
-	 * The numbers are guessed, we should auto-tune as described by
-	 * Bonwick.
-	 */
-	if (cachep->size > 131072)
-		limit = 1;
-	else if (cachep->size > PAGE_SIZE)
-		limit = 8;
-	else if (cachep->size > 1024)
-		limit = 24;
-	else if (cachep->size > 256)
-		limit = 54;
-	else
-		limit = 120;
-
-	/*
-	 * CPU bound tasks (e.g. network routing) can exhibit cpu bound
-	 * allocation behaviour: Most allocs on one cpu, most free operations
-	 * on another cpu. For these cases, an efficient object passing between
-	 * cpus is necessary. This is provided by a shared array. The array
-	 * replaces Bonwick's magazine layer.
-	 * On uniprocessor, it's functionally equivalent (but less efficient)
-	 * to a larger limit. Thus disabled by default.
-	 */
-	shared = 0;
-	if (cachep->size <= PAGE_SIZE && num_possible_cpus() > 1)
-		shared = 8;
-
-#if DEBUG
-	/*
-	 * With debugging enabled, large batchcount lead to excessively long
-	 * periods with disabled local interrupts. Limit the batchcount
-	 */
-	if (limit > 32)
-		limit = 32;
-#endif
-	batchcount = (limit + 1) / 2;
-	err = do_tune_cpucache(cachep, limit, batchcount, shared, gfp);
-end:
-	if (err)
-		pr_err("enable_cpucache failed for %s, error %d\n",
-		       cachep->name, -err);
-	return err;
-}
-
-/*
- * Drain an array if it contains any elements taking the node lock only if
- * necessary. Note that the node listlock also protects the array_cache
- * if drain_array() is used on the shared array.
- */
-static void drain_array(struct kmem_cache *cachep, struct kmem_cache_node *n,
-			 struct array_cache *ac, int node)
-{
-	LIST_HEAD(list);
-
-	/* ac from n->shared can be freed if we don't hold the slab_mutex. */
-	check_mutex_acquired();
-
-	if (!ac || !ac->avail)
-		return;
-
-	if (ac->touched) {
-		ac->touched = 0;
-		return;
-	}
-
-	raw_spin_lock_irq(&n->list_lock);
-	drain_array_locked(cachep, ac, node, false, &list);
-	raw_spin_unlock_irq(&n->list_lock);
-
-	slabs_destroy(cachep, &list);
-}
-
-/**
- * cache_reap - Reclaim memory from caches.
- * @w: work descriptor
- *
- * Called from workqueue/eventd every few seconds.
- * Purpose:
- * - clear the per-cpu caches for this CPU.
- * - return freeable pages to the main free memory pool.
- *
- * If we cannot acquire the cache chain mutex then just give up - we'll try
- * again on the next iteration.
- */
-static void cache_reap(struct work_struct *w)
-{
-	struct kmem_cache *searchp;
-	struct kmem_cache_node *n;
-	int node = numa_mem_id();
-	struct delayed_work *work = to_delayed_work(w);
-
-	if (!mutex_trylock(&slab_mutex))
-		/* Give up. Setup the next iteration. */
-		goto out;
-
-	list_for_each_entry(searchp, &slab_caches, list) {
-		check_irq_on();
-
-		/*
-		 * We only take the node lock if absolutely necessary and we
-		 * have established with reasonable certainty that
-		 * we can do some work if the lock was obtained.
-		 */
-		n = get_node(searchp, node);
-
-		reap_alien(searchp, n);
-
-		drain_array(searchp, n, cpu_cache_get(searchp), node);
-
-		/*
-		 * These are racy checks but it does not matter
-		 * if we skip one check or scan twice.
-		 */
-		if (time_after(n->next_reap, jiffies))
-			goto next;
-
-		n->next_reap = jiffies + REAPTIMEOUT_NODE;
-
-		drain_array(searchp, n, n->shared, node);
-
-		if (n->free_touched)
-			n->free_touched = 0;
-		else {
-			int freed;
-
-			freed = drain_freelist(searchp, n, (n->free_limit +
-				5 * searchp->num - 1) / (5 * searchp->num));
-			STATS_ADD_REAPED(searchp, freed);
-		}
-next:
-		cond_resched();
-	}
-	check_irq_on();
-	mutex_unlock(&slab_mutex);
-	next_reap_node();
-out:
-	/* Set up the next iteration */
-	schedule_delayed_work_on(smp_processor_id(), work,
-				round_jiffies_relative(REAPTIMEOUT_AC));
-}
-
-void get_slabinfo(struct kmem_cache *cachep, struct slabinfo *sinfo)
-{
-	unsigned long active_objs, num_objs, active_slabs;
-	unsigned long total_slabs = 0, free_objs = 0, shared_avail = 0;
-	unsigned long free_slabs = 0;
-	int node;
-	struct kmem_cache_node *n;
-
-	for_each_kmem_cache_node(cachep, node, n) {
-		check_irq_on();
-		raw_spin_lock_irq(&n->list_lock);
-
-		total_slabs += n->total_slabs;
-		free_slabs += n->free_slabs;
-		free_objs += n->free_objects;
-
-		if (n->shared)
-			shared_avail += n->shared->avail;
-
-		raw_spin_unlock_irq(&n->list_lock);
-	}
-	num_objs = total_slabs * cachep->num;
-	active_slabs = total_slabs - free_slabs;
-	active_objs = num_objs - free_objs;
-
-	sinfo->active_objs = active_objs;
-	sinfo->num_objs = num_objs;
-	sinfo->active_slabs = active_slabs;
-	sinfo->num_slabs = total_slabs;
-	sinfo->shared_avail = shared_avail;
-	sinfo->limit = cachep->limit;
-	sinfo->batchcount = cachep->batchcount;
-	sinfo->shared = cachep->shared;
-	sinfo->objects_per_slab = cachep->num;
-	sinfo->cache_order = cachep->gfporder;
-}
-
-void slabinfo_show_stats(struct seq_file *m, struct kmem_cache *cachep)
-{
-#if STATS
-	{			/* node stats */
-		unsigned long high = cachep->high_mark;
-		unsigned long allocs = cachep->num_allocations;
-		unsigned long grown = cachep->grown;
-		unsigned long reaped = cachep->reaped;
-		unsigned long errors = cachep->errors;
-		unsigned long max_freeable = cachep->max_freeable;
-		unsigned long node_allocs = cachep->node_allocs;
-		unsigned long node_frees = cachep->node_frees;
-		unsigned long overflows = cachep->node_overflow;
-
-		seq_printf(m, " : globalstat %7lu %6lu %5lu %4lu %4lu %4lu %4lu %4lu %4lu",
-			   allocs, high, grown,
-			   reaped, errors, max_freeable, node_allocs,
-			   node_frees, overflows);
-	}
-	/* cpu stats */
-	{
-		unsigned long allochit = atomic_read(&cachep->allochit);
-		unsigned long allocmiss = atomic_read(&cachep->allocmiss);
-		unsigned long freehit = atomic_read(&cachep->freehit);
-		unsigned long freemiss = atomic_read(&cachep->freemiss);
-
-		seq_printf(m, " : cpustat %6lu %6lu %6lu %6lu",
-			   allochit, allocmiss, freehit, freemiss);
-	}
-#endif
-}
-
-#define MAX_SLABINFO_WRITE 128
-/**
- * slabinfo_write - Tuning for the slab allocator
- * @file: unused
- * @buffer: user buffer
- * @count: data length
- * @ppos: unused
- *
- * Return: %0 on success, negative error code otherwise.
- */
-ssize_t slabinfo_write(struct file *file, const char __user *buffer,
-		       size_t count, loff_t *ppos)
-{
-	char kbuf[MAX_SLABINFO_WRITE + 1], *tmp;
-	int limit, batchcount, shared, res;
-	struct kmem_cache *cachep;
-
-	if (count > MAX_SLABINFO_WRITE)
-		return -EINVAL;
-	if (copy_from_user(&kbuf, buffer, count))
-		return -EFAULT;
-	kbuf[MAX_SLABINFO_WRITE] = '\0';
-
-	tmp = strchr(kbuf, ' ');
-	if (!tmp)
-		return -EINVAL;
-	*tmp = '\0';
-	tmp++;
-	if (sscanf(tmp, " %d %d %d", &limit, &batchcount, &shared) != 3)
-		return -EINVAL;
-
-	/* Find the cache in the chain of caches. */
-	mutex_lock(&slab_mutex);
-	res = -EINVAL;
-	list_for_each_entry(cachep, &slab_caches, list) {
-		if (!strcmp(cachep->name, kbuf)) {
-			if (limit < 1 || batchcount < 1 ||
-					batchcount > limit || shared < 0) {
-				res = 0;
-			} else {
-				res = do_tune_cpucache(cachep, limit,
-						       batchcount, shared,
-						       GFP_KERNEL);
-			}
-			break;
-		}
-	}
-	mutex_unlock(&slab_mutex);
-	if (res >= 0)
-		res = count;
-	return res;
-}
-
-#ifdef CONFIG_HARDENED_USERCOPY
-/*
- * Rejects incorrectly sized objects and objects that are to be copied
- * to/from userspace but do not fall entirely within the containing slab
- * cache's usercopy region.
- *
- * Returns NULL if check passes, otherwise const char * to name of cache
- * to indicate an error.
- */
-void __check_heap_object(const void *ptr, unsigned long n,
-			 const struct slab *slab, bool to_user)
-{
-	struct kmem_cache *cachep;
-	unsigned int objnr;
-	unsigned long offset;
-
-	ptr = kasan_reset_tag(ptr);
-
-	/* Find and validate object. */
-	cachep = slab->slab_cache;
-	objnr = obj_to_index(cachep, slab, (void *)ptr);
-	BUG_ON(objnr >= cachep->num);
-
-	/* Find offset within object. */
-	if (is_kfence_address(ptr))
-		offset = ptr - kfence_object_start(ptr);
-	else
-		offset = ptr - index_to_obj(cachep, slab, objnr) - obj_offset(cachep);
-
-	/* Allow address range falling entirely within usercopy region. */
-	if (offset >= cachep->useroffset &&
-	    offset - cachep->useroffset <= cachep->usersize &&
-	    n <= cachep->useroffset - offset + cachep->usersize)
-		return;
-
-	usercopy_abort("SLAB object", cachep->name, to_user, offset, n);
-}
-#endif /* CONFIG_HARDENED_USERCOPY */
-- 
cgit v1.2.3


From 7ef08ae8277c66657127844179912214c67fb4bc Mon Sep 17 00:00:00 2001
From: Vlastimil Babka <vbabka@suse.cz>
Date: Tue, 3 Oct 2023 09:54:15 +0200
Subject: mm/slab: move struct kmem_cache_cpu declaration to slub.c

Nothing outside SLUB itself accesses the struct kmem_cache_cpu fields so
it does not need to be declared in slub_def.h. This allows also to move
enum stat_item.

Reviewed-by: Kees Cook <keescook@chromium.org>
Acked-by: David Rientjes <rientjes@google.com>
Tested-by: David Rientjes <rientjes@google.com>
Reviewed-by: Hyeonggon Yoo <42.hyeyoo@gmail.com>
Tested-by: Hyeonggon Yoo <42.hyeyoo@gmail.com>
Signed-off-by: Vlastimil Babka <vbabka@suse.cz>
---
 include/linux/slub_def.h | 54 ------------------------------------------------
 mm/slub.c                | 54 ++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 54 insertions(+), 54 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/slub_def.h b/include/linux/slub_def.h
index deb90cf4bffb..a0229ea42977 100644
--- a/include/linux/slub_def.h
+++ b/include/linux/slub_def.h
@@ -12,60 +12,6 @@
 #include <linux/reciprocal_div.h>
 #include <linux/local_lock.h>
 
-enum stat_item {
-	ALLOC_FASTPATH,		/* Allocation from cpu slab */
-	ALLOC_SLOWPATH,		/* Allocation by getting a new cpu slab */
-	FREE_FASTPATH,		/* Free to cpu slab */
-	FREE_SLOWPATH,		/* Freeing not to cpu slab */
-	FREE_FROZEN,		/* Freeing to frozen slab */
-	FREE_ADD_PARTIAL,	/* Freeing moves slab to partial list */
-	FREE_REMOVE_PARTIAL,	/* Freeing removes last object */
-	ALLOC_FROM_PARTIAL,	/* Cpu slab acquired from node partial list */
-	ALLOC_SLAB,		/* Cpu slab acquired from page allocator */
-	ALLOC_REFILL,		/* Refill cpu slab from slab freelist */
-	ALLOC_NODE_MISMATCH,	/* Switching cpu slab */
-	FREE_SLAB,		/* Slab freed to the page allocator */
-	CPUSLAB_FLUSH,		/* Abandoning of the cpu slab */
-	DEACTIVATE_FULL,	/* Cpu slab was full when deactivated */
-	DEACTIVATE_EMPTY,	/* Cpu slab was empty when deactivated */
-	DEACTIVATE_TO_HEAD,	/* Cpu slab was moved to the head of partials */
-	DEACTIVATE_TO_TAIL,	/* Cpu slab was moved to the tail of partials */
-	DEACTIVATE_REMOTE_FREES,/* Slab contained remotely freed objects */
-	DEACTIVATE_BYPASS,	/* Implicit deactivation */
-	ORDER_FALLBACK,		/* Number of times fallback was necessary */
-	CMPXCHG_DOUBLE_CPU_FAIL,/* Failure of this_cpu_cmpxchg_double */
-	CMPXCHG_DOUBLE_FAIL,	/* Number of times that cmpxchg double did not match */
-	CPU_PARTIAL_ALLOC,	/* Used cpu partial on alloc */
-	CPU_PARTIAL_FREE,	/* Refill cpu partial on free */
-	CPU_PARTIAL_NODE,	/* Refill cpu partial from node partial */
-	CPU_PARTIAL_DRAIN,	/* Drain cpu partial to node partial */
-	NR_SLUB_STAT_ITEMS
-};
-
-#ifndef CONFIG_SLUB_TINY
-/*
- * When changing the layout, make sure freelist and tid are still compatible
- * with this_cpu_cmpxchg_double() alignment requirements.
- */
-struct kmem_cache_cpu {
-	union {
-		struct {
-			void **freelist;	/* Pointer to next available object */
-			unsigned long tid;	/* Globally unique transaction id */
-		};
-		freelist_aba_t freelist_tid;
-	};
-	struct slab *slab;	/* The slab from which we are allocating */
-#ifdef CONFIG_SLUB_CPU_PARTIAL
-	struct slab *partial;	/* Partially allocated frozen slabs */
-#endif
-	local_lock_t lock;	/* Protects the fields above */
-#ifdef CONFIG_SLUB_STATS
-	unsigned stat[NR_SLUB_STAT_ITEMS];
-#endif
-};
-#endif /* CONFIG_SLUB_TINY */
-
 #ifdef CONFIG_SLUB_CPU_PARTIAL
 #define slub_percpu_partial(c)		((c)->partial)
 
diff --git a/mm/slub.c b/mm/slub.c
index 3e01731783df..979932d046fd 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -330,6 +330,60 @@ static void debugfs_slab_add(struct kmem_cache *);
 static inline void debugfs_slab_add(struct kmem_cache *s) { }
 #endif
 
+enum stat_item {
+	ALLOC_FASTPATH,		/* Allocation from cpu slab */
+	ALLOC_SLOWPATH,		/* Allocation by getting a new cpu slab */
+	FREE_FASTPATH,		/* Free to cpu slab */
+	FREE_SLOWPATH,		/* Freeing not to cpu slab */
+	FREE_FROZEN,		/* Freeing to frozen slab */
+	FREE_ADD_PARTIAL,	/* Freeing moves slab to partial list */
+	FREE_REMOVE_PARTIAL,	/* Freeing removes last object */
+	ALLOC_FROM_PARTIAL,	/* Cpu slab acquired from node partial list */
+	ALLOC_SLAB,		/* Cpu slab acquired from page allocator */
+	ALLOC_REFILL,		/* Refill cpu slab from slab freelist */
+	ALLOC_NODE_MISMATCH,	/* Switching cpu slab */
+	FREE_SLAB,		/* Slab freed to the page allocator */
+	CPUSLAB_FLUSH,		/* Abandoning of the cpu slab */
+	DEACTIVATE_FULL,	/* Cpu slab was full when deactivated */
+	DEACTIVATE_EMPTY,	/* Cpu slab was empty when deactivated */
+	DEACTIVATE_TO_HEAD,	/* Cpu slab was moved to the head of partials */
+	DEACTIVATE_TO_TAIL,	/* Cpu slab was moved to the tail of partials */
+	DEACTIVATE_REMOTE_FREES,/* Slab contained remotely freed objects */
+	DEACTIVATE_BYPASS,	/* Implicit deactivation */
+	ORDER_FALLBACK,		/* Number of times fallback was necessary */
+	CMPXCHG_DOUBLE_CPU_FAIL,/* Failures of this_cpu_cmpxchg_double */
+	CMPXCHG_DOUBLE_FAIL,	/* Failures of slab freelist update */
+	CPU_PARTIAL_ALLOC,	/* Used cpu partial on alloc */
+	CPU_PARTIAL_FREE,	/* Refill cpu partial on free */
+	CPU_PARTIAL_NODE,	/* Refill cpu partial from node partial */
+	CPU_PARTIAL_DRAIN,	/* Drain cpu partial to node partial */
+	NR_SLUB_STAT_ITEMS
+};
+
+#ifndef CONFIG_SLUB_TINY
+/*
+ * When changing the layout, make sure freelist and tid are still compatible
+ * with this_cpu_cmpxchg_double() alignment requirements.
+ */
+struct kmem_cache_cpu {
+	union {
+		struct {
+			void **freelist;	/* Pointer to next available object */
+			unsigned long tid;	/* Globally unique transaction id */
+		};
+		freelist_aba_t freelist_tid;
+	};
+	struct slab *slab;	/* The slab from which we are allocating */
+#ifdef CONFIG_SLUB_CPU_PARTIAL
+	struct slab *partial;	/* Partially allocated frozen slabs */
+#endif
+	local_lock_t lock;	/* Protects the fields above */
+#ifdef CONFIG_SLUB_STATS
+	unsigned int stat[NR_SLUB_STAT_ITEMS];
+#endif
+};
+#endif /* CONFIG_SLUB_TINY */
+
 static inline void stat(const struct kmem_cache *s, enum stat_item si)
 {
 #ifdef CONFIG_SLUB_STATS
-- 
cgit v1.2.3


From 19975f83412fbb9b1458f3dfbf16ca043a57788a Mon Sep 17 00:00:00 2001
From: Vlastimil Babka <vbabka@suse.cz>
Date: Tue, 3 Oct 2023 09:59:48 +0200
Subject: mm/slab: move the rest of slub_def.h to mm/slab.h

mm/slab.h is the only place to include include/linux/slub_def.h which
has allowed switching between SLAB and SLUB. Now we can simply move the
contents over and remove slub_def.h.

Use this opportunity to fix up some whitespace (alignment) issues.

Reviewed-by: Kees Cook <keescook@chromium.org>
Acked-by: David Rientjes <rientjes@google.com>
Tested-by: David Rientjes <rientjes@google.com>
Reviewed-by: Hyeonggon Yoo <42.hyeyoo@gmail.com>
Tested-by: Hyeonggon Yoo <42.hyeyoo@gmail.com>
Signed-off-by: Vlastimil Babka <vbabka@suse.cz>
---
 include/linux/slub_def.h | 150 -----------------------------------------------
 mm/slab.h                | 138 ++++++++++++++++++++++++++++++++++++++++++-
 2 files changed, 137 insertions(+), 151 deletions(-)
 delete mode 100644 include/linux/slub_def.h

(limited to 'include/linux')

diff --git a/include/linux/slub_def.h b/include/linux/slub_def.h
deleted file mode 100644
index a0229ea42977..000000000000
--- a/include/linux/slub_def.h
+++ /dev/null
@@ -1,150 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _LINUX_SLUB_DEF_H
-#define _LINUX_SLUB_DEF_H
-
-/*
- * SLUB : A Slab allocator without object queues.
- *
- * (C) 2007 SGI, Christoph Lameter
- */
-#include <linux/kfence.h>
-#include <linux/kobject.h>
-#include <linux/reciprocal_div.h>
-#include <linux/local_lock.h>
-
-#ifdef CONFIG_SLUB_CPU_PARTIAL
-#define slub_percpu_partial(c)		((c)->partial)
-
-#define slub_set_percpu_partial(c, p)		\
-({						\
-	slub_percpu_partial(c) = (p)->next;	\
-})
-
-#define slub_percpu_partial_read_once(c)     READ_ONCE(slub_percpu_partial(c))
-#else
-#define slub_percpu_partial(c)			NULL
-
-#define slub_set_percpu_partial(c, p)
-
-#define slub_percpu_partial_read_once(c)	NULL
-#endif // CONFIG_SLUB_CPU_PARTIAL
-
-/*
- * Word size structure that can be atomically updated or read and that
- * contains both the order and the number of objects that a slab of the
- * given order would contain.
- */
-struct kmem_cache_order_objects {
-	unsigned int x;
-};
-
-/*
- * Slab cache management.
- */
-struct kmem_cache {
-#ifndef CONFIG_SLUB_TINY
-	struct kmem_cache_cpu __percpu *cpu_slab;
-#endif
-	/* Used for retrieving partial slabs, etc. */
-	slab_flags_t flags;
-	unsigned long min_partial;
-	unsigned int size;	/* The size of an object including metadata */
-	unsigned int object_size;/* The size of an object without metadata */
-	struct reciprocal_value reciprocal_size;
-	unsigned int offset;	/* Free pointer offset */
-#ifdef CONFIG_SLUB_CPU_PARTIAL
-	/* Number of per cpu partial objects to keep around */
-	unsigned int cpu_partial;
-	/* Number of per cpu partial slabs to keep around */
-	unsigned int cpu_partial_slabs;
-#endif
-	struct kmem_cache_order_objects oo;
-
-	/* Allocation and freeing of slabs */
-	struct kmem_cache_order_objects min;
-	gfp_t allocflags;	/* gfp flags to use on each alloc */
-	int refcount;		/* Refcount for slab cache destroy */
-	void (*ctor)(void *);
-	unsigned int inuse;		/* Offset to metadata */
-	unsigned int align;		/* Alignment */
-	unsigned int red_left_pad;	/* Left redzone padding size */
-	const char *name;	/* Name (only for display!) */
-	struct list_head list;	/* List of slab caches */
-#ifdef CONFIG_SYSFS
-	struct kobject kobj;	/* For sysfs */
-#endif
-#ifdef CONFIG_SLAB_FREELIST_HARDENED
-	unsigned long random;
-#endif
-
-#ifdef CONFIG_NUMA
-	/*
-	 * Defragmentation by allocating from a remote node.
-	 */
-	unsigned int remote_node_defrag_ratio;
-#endif
-
-#ifdef CONFIG_SLAB_FREELIST_RANDOM
-	unsigned int *random_seq;
-#endif
-
-#ifdef CONFIG_KASAN_GENERIC
-	struct kasan_cache kasan_info;
-#endif
-
-#ifdef CONFIG_HARDENED_USERCOPY
-	unsigned int useroffset;	/* Usercopy region offset */
-	unsigned int usersize;		/* Usercopy region size */
-#endif
-
-	struct kmem_cache_node *node[MAX_NUMNODES];
-};
-
-#if defined(CONFIG_SYSFS) && !defined(CONFIG_SLUB_TINY)
-#define SLAB_SUPPORTS_SYSFS
-void sysfs_slab_unlink(struct kmem_cache *);
-void sysfs_slab_release(struct kmem_cache *);
-#else
-static inline void sysfs_slab_unlink(struct kmem_cache *s)
-{
-}
-static inline void sysfs_slab_release(struct kmem_cache *s)
-{
-}
-#endif
-
-void *fixup_red_left(struct kmem_cache *s, void *p);
-
-static inline void *nearest_obj(struct kmem_cache *cache, const struct slab *slab,
-				void *x) {
-	void *object = x - (x - slab_address(slab)) % cache->size;
-	void *last_object = slab_address(slab) +
-		(slab->objects - 1) * cache->size;
-	void *result = (unlikely(object > last_object)) ? last_object : object;
-
-	result = fixup_red_left(cache, result);
-	return result;
-}
-
-/* Determine object index from a given position */
-static inline unsigned int __obj_to_index(const struct kmem_cache *cache,
-					  void *addr, void *obj)
-{
-	return reciprocal_divide(kasan_reset_tag(obj) - addr,
-				 cache->reciprocal_size);
-}
-
-static inline unsigned int obj_to_index(const struct kmem_cache *cache,
-					const struct slab *slab, void *obj)
-{
-	if (is_kfence_address(obj))
-		return 0;
-	return __obj_to_index(cache, slab_address(slab), obj);
-}
-
-static inline int objs_per_slab(const struct kmem_cache *cache,
-				     const struct slab *slab)
-{
-	return slab->objects;
-}
-#endif /* _LINUX_SLUB_DEF_H */
diff --git a/mm/slab.h b/mm/slab.h
index 014c36ea51fa..3a8d13c099fa 100644
--- a/mm/slab.h
+++ b/mm/slab.h
@@ -209,7 +209,143 @@ static inline size_t slab_size(const struct slab *slab)
 	return PAGE_SIZE << slab_order(slab);
 }
 
-#include <linux/slub_def.h>
+#include <linux/kfence.h>
+#include <linux/kobject.h>
+#include <linux/reciprocal_div.h>
+#include <linux/local_lock.h>
+
+#ifdef CONFIG_SLUB_CPU_PARTIAL
+#define slub_percpu_partial(c)			((c)->partial)
+
+#define slub_set_percpu_partial(c, p)		\
+({						\
+	slub_percpu_partial(c) = (p)->next;	\
+})
+
+#define slub_percpu_partial_read_once(c)	READ_ONCE(slub_percpu_partial(c))
+#else
+#define slub_percpu_partial(c)			NULL
+
+#define slub_set_percpu_partial(c, p)
+
+#define slub_percpu_partial_read_once(c)	NULL
+#endif // CONFIG_SLUB_CPU_PARTIAL
+
+/*
+ * Word size structure that can be atomically updated or read and that
+ * contains both the order and the number of objects that a slab of the
+ * given order would contain.
+ */
+struct kmem_cache_order_objects {
+	unsigned int x;
+};
+
+/*
+ * Slab cache management.
+ */
+struct kmem_cache {
+#ifndef CONFIG_SLUB_TINY
+	struct kmem_cache_cpu __percpu *cpu_slab;
+#endif
+	/* Used for retrieving partial slabs, etc. */
+	slab_flags_t flags;
+	unsigned long min_partial;
+	unsigned int size;		/* Object size including metadata */
+	unsigned int object_size;	/* Object size without metadata */
+	struct reciprocal_value reciprocal_size;
+	unsigned int offset;		/* Free pointer offset */
+#ifdef CONFIG_SLUB_CPU_PARTIAL
+	/* Number of per cpu partial objects to keep around */
+	unsigned int cpu_partial;
+	/* Number of per cpu partial slabs to keep around */
+	unsigned int cpu_partial_slabs;
+#endif
+	struct kmem_cache_order_objects oo;
+
+	/* Allocation and freeing of slabs */
+	struct kmem_cache_order_objects min;
+	gfp_t allocflags;		/* gfp flags to use on each alloc */
+	int refcount;			/* Refcount for slab cache destroy */
+	void (*ctor)(void *object);	/* Object constructor */
+	unsigned int inuse;		/* Offset to metadata */
+	unsigned int align;		/* Alignment */
+	unsigned int red_left_pad;	/* Left redzone padding size */
+	const char *name;		/* Name (only for display!) */
+	struct list_head list;		/* List of slab caches */
+#ifdef CONFIG_SYSFS
+	struct kobject kobj;		/* For sysfs */
+#endif
+#ifdef CONFIG_SLAB_FREELIST_HARDENED
+	unsigned long random;
+#endif
+
+#ifdef CONFIG_NUMA
+	/*
+	 * Defragmentation by allocating from a remote node.
+	 */
+	unsigned int remote_node_defrag_ratio;
+#endif
+
+#ifdef CONFIG_SLAB_FREELIST_RANDOM
+	unsigned int *random_seq;
+#endif
+
+#ifdef CONFIG_KASAN_GENERIC
+	struct kasan_cache kasan_info;
+#endif
+
+#ifdef CONFIG_HARDENED_USERCOPY
+	unsigned int useroffset;	/* Usercopy region offset */
+	unsigned int usersize;		/* Usercopy region size */
+#endif
+
+	struct kmem_cache_node *node[MAX_NUMNODES];
+};
+
+#if defined(CONFIG_SYSFS) && !defined(CONFIG_SLUB_TINY)
+#define SLAB_SUPPORTS_SYSFS
+void sysfs_slab_unlink(struct kmem_cache *s);
+void sysfs_slab_release(struct kmem_cache *s);
+#else
+static inline void sysfs_slab_unlink(struct kmem_cache *s) { }
+static inline void sysfs_slab_release(struct kmem_cache *s) { }
+#endif
+
+void *fixup_red_left(struct kmem_cache *s, void *p);
+
+static inline void *nearest_obj(struct kmem_cache *cache,
+				const struct slab *slab, void *x)
+{
+	void *object = x - (x - slab_address(slab)) % cache->size;
+	void *last_object = slab_address(slab) +
+		(slab->objects - 1) * cache->size;
+	void *result = (unlikely(object > last_object)) ? last_object : object;
+
+	result = fixup_red_left(cache, result);
+	return result;
+}
+
+/* Determine object index from a given position */
+static inline unsigned int __obj_to_index(const struct kmem_cache *cache,
+					  void *addr, void *obj)
+{
+	return reciprocal_divide(kasan_reset_tag(obj) - addr,
+				 cache->reciprocal_size);
+}
+
+static inline unsigned int obj_to_index(const struct kmem_cache *cache,
+					const struct slab *slab, void *obj)
+{
+	if (is_kfence_address(obj))
+		return 0;
+	return __obj_to_index(cache, slab_address(slab), obj);
+}
+
+static inline int objs_per_slab(const struct kmem_cache *cache,
+				const struct slab *slab)
+{
+	return slab->objects;
+}
 
 #include <linux/memcontrol.h>
 #include <linux/fault-inject.h>
-- 
cgit v1.2.3


From 9396c4ee93f9ac03cd0cea0bb345fbc657772943 Mon Sep 17 00:00:00 2001
From: Dmitry Safonov <dima@arista.com>
Date: Mon, 4 Dec 2023 19:00:44 +0000
Subject: net/tcp: Don't store TCP-AO maclen on reqsk

This extra check doesn't work for a handshake when SYN segment has
(current_key.maclen != rnext_key.maclen). It could be amended to
preserve rnext_key.maclen instead of current_key.maclen, but that
requires a lookup on listen socket.

Originally, this extra maclen check was introduced just because it was
cheap. Drop it and convert tcp_request_sock::maclen into boolean
tcp_request_sock::used_tcp_ao.

Fixes: 06b22ef29591 ("net/tcp: Wire TCP-AO to request sockets")
Signed-off-by: Dmitry Safonov <dima@arista.com>
Reviewed-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
 include/linux/tcp.h   | 8 ++------
 net/ipv4/tcp_ao.c     | 4 ++--
 net/ipv4/tcp_input.c  | 5 +++--
 net/ipv4/tcp_output.c | 9 +++------
 4 files changed, 10 insertions(+), 16 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/tcp.h b/include/linux/tcp.h
index 68f3d315d2e1..b646b574b060 100644
--- a/include/linux/tcp.h
+++ b/include/linux/tcp.h
@@ -169,7 +169,7 @@ struct tcp_request_sock {
 #ifdef CONFIG_TCP_AO
 	u8				ao_keyid;
 	u8				ao_rcv_next;
-	u8				maclen;
+	bool				used_tcp_ao;
 #endif
 };
 
@@ -180,14 +180,10 @@ static inline struct tcp_request_sock *tcp_rsk(const struct request_sock *req)
 
 static inline bool tcp_rsk_used_ao(const struct request_sock *req)
 {
-	/* The real length of MAC is saved in the request socket,
-	 * signing anything with zero-length makes no sense, so here is
-	 * a little hack..
-	 */
 #ifndef CONFIG_TCP_AO
 	return false;
 #else
-	return tcp_rsk(req)->maclen != 0;
+	return tcp_rsk(req)->used_tcp_ao;
 #endif
 }
 
diff --git a/net/ipv4/tcp_ao.c b/net/ipv4/tcp_ao.c
index 18dacfef7a07..f8308d3f565e 100644
--- a/net/ipv4/tcp_ao.c
+++ b/net/ipv4/tcp_ao.c
@@ -851,7 +851,7 @@ void tcp_ao_syncookie(struct sock *sk, const struct sk_buff *skb,
 	const struct tcp_ao_hdr *aoh;
 	struct tcp_ao_key *key;
 
-	treq->maclen = 0;
+	treq->used_tcp_ao = false;
 
 	if (tcp_parse_auth_options(th, NULL, &aoh) || !aoh)
 		return;
@@ -863,7 +863,7 @@ void tcp_ao_syncookie(struct sock *sk, const struct sk_buff *skb,
 
 	treq->ao_rcv_next = aoh->keyid;
 	treq->ao_keyid = aoh->rnext_keyid;
-	treq->maclen = tcp_ao_maclen(key);
+	treq->used_tcp_ao = true;
 }
 
 static enum skb_drop_reason
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index bcb55d98004c..337c8bb07ccc 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -7182,11 +7182,12 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops,
 	if (tcp_parse_auth_options(tcp_hdr(skb), NULL, &aoh))
 		goto drop_and_release; /* Invalid TCP options */
 	if (aoh) {
-		tcp_rsk(req)->maclen = aoh->length - sizeof(struct tcp_ao_hdr);
+		tcp_rsk(req)->used_tcp_ao = true;
 		tcp_rsk(req)->ao_rcv_next = aoh->keyid;
 		tcp_rsk(req)->ao_keyid = aoh->rnext_keyid;
+
 	} else {
-		tcp_rsk(req)->maclen = 0;
+		tcp_rsk(req)->used_tcp_ao = false;
 	}
 #endif
 	tcp_rsk(req)->snt_isn = isn;
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 93eef1dbbc55..f5ef15e1d9ac 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -3720,7 +3720,6 @@ struct sk_buff *tcp_make_synack(const struct sock *sk, struct dst_entry *dst,
 	if (tcp_rsk_used_ao(req)) {
 #ifdef CONFIG_TCP_AO
 		struct tcp_ao_key *ao_key = NULL;
-		u8 maclen = tcp_rsk(req)->maclen;
 		u8 keyid = tcp_rsk(req)->ao_keyid;
 
 		ao_key = tcp_sk(sk)->af_specific->ao_lookup(sk, req_to_sk(req),
@@ -3730,13 +3729,11 @@ struct sk_buff *tcp_make_synack(const struct sock *sk, struct dst_entry *dst,
 		 * for another peer-matching key, but the peer has requested
 		 * ao_keyid (RFC5925 RNextKeyID), so let's keep it simple here.
 		 */
-		if (unlikely(!ao_key || tcp_ao_maclen(ao_key) != maclen)) {
-			u8 key_maclen = ao_key ? tcp_ao_maclen(ao_key) : 0;
-
+		if (unlikely(!ao_key)) {
 			rcu_read_unlock();
 			kfree_skb(skb);
-			net_warn_ratelimited("TCP-AO: the keyid %u with maclen %u|%u from SYN packet is not present - not sending SYNACK\n",
-					     keyid, maclen, key_maclen);
+			net_warn_ratelimited("TCP-AO: the keyid %u from SYN packet is not present - not sending SYNACK\n",
+					     keyid);
 			return NULL;
 		}
 		key.ao_key = ao_key;
-- 
cgit v1.2.3


From 16e5ac127d8d18adf85fe5ba847d77b58d1ed418 Mon Sep 17 00:00:00 2001
From: Naresh Solanki <naresh.solanki@9elements.com>
Date: Tue, 5 Dec 2023 16:22:04 +0530
Subject: regulator: event: Add regulator netlink event support

This commit introduces netlink event support to the regulator subsystem.

Changes:
- Introduce event.c and regnl.h for netlink event handling.
- Implement reg_generate_netlink_event to broadcast regulator events.
- Update Makefile to include the new event.c file.

Signed-off-by: Naresh Solanki <naresh.solanki@9elements.com>
Link: https://lore.kernel.org/r/20231205105207.1262928-1-naresh.solanki@9elements.com
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 drivers/regulator/Kconfig          | 10 +++++
 drivers/regulator/Makefile         |  1 +
 drivers/regulator/core.c           | 19 +++++++-
 drivers/regulator/event.c          | 91 ++++++++++++++++++++++++++++++++++++++
 drivers/regulator/regnl.h          | 13 ++++++
 include/linux/regulator/consumer.h | 47 +-------------------
 include/uapi/regulator/regulator.h | 90 +++++++++++++++++++++++++++++++++++++
 7 files changed, 224 insertions(+), 47 deletions(-)
 create mode 100644 drivers/regulator/event.c
 create mode 100644 drivers/regulator/regnl.h
 create mode 100644 include/uapi/regulator/regulator.h

(limited to 'include/linux')

diff --git a/drivers/regulator/Kconfig b/drivers/regulator/Kconfig
index f3ec24691378..550145f82726 100644
--- a/drivers/regulator/Kconfig
+++ b/drivers/regulator/Kconfig
@@ -56,6 +56,16 @@ config REGULATOR_USERSPACE_CONSUMER
 
 	  If unsure, say no.
 
+config REGULATOR_NETLINK_EVENTS
+	bool "Enable support for receiving regulator events via netlink"
+	depends on NET
+	help
+	  Enabling this option allows the kernel to broadcast regulator events using
+	  the netlink mechanism. User-space applications can subscribe to these events
+	  for real-time updates on various regulator events.
+
+	  If unsure, say no.
+
 config REGULATOR_88PG86X
 	tristate "Marvell 88PG86X voltage regulators"
 	depends on I2C
diff --git a/drivers/regulator/Makefile b/drivers/regulator/Makefile
index b2b059b5ee56..46fb569e6be8 100644
--- a/drivers/regulator/Makefile
+++ b/drivers/regulator/Makefile
@@ -5,6 +5,7 @@
 
 
 obj-$(CONFIG_REGULATOR) += core.o dummy.o fixed-helper.o helpers.o devres.o irq_helpers.o
+obj-$(CONFIG_REGULATOR_NETLINK_EVENTS) += event.o
 obj-$(CONFIG_OF) += of_regulator.o
 obj-$(CONFIG_REGULATOR_FIXED_VOLTAGE) += fixed.o
 obj-$(CONFIG_REGULATOR_VIRTUAL_CONSUMER) += virtual.o
diff --git a/drivers/regulator/core.c b/drivers/regulator/core.c
index 4aa9ec8c22f3..a968dabb48f5 100644
--- a/drivers/regulator/core.c
+++ b/drivers/regulator/core.c
@@ -33,6 +33,7 @@
 
 #include "dummy.h"
 #include "internal.h"
+#include "regnl.h"
 
 static DEFINE_WW_CLASS(regulator_ww_class);
 static DEFINE_MUTEX(regulator_nesting_mutex);
@@ -4854,7 +4855,23 @@ static int _notifier_call_chain(struct regulator_dev *rdev,
 				  unsigned long event, void *data)
 {
 	/* call rdev chain first */
-	return blocking_notifier_call_chain(&rdev->notifier, event, data);
+	int ret =  blocking_notifier_call_chain(&rdev->notifier, event, data);
+
+	if (IS_REACHABLE(CONFIG_REGULATOR_NETLINK_EVENTS)) {
+		struct device *parent = rdev->dev.parent;
+		const char *rname = rdev_get_name(rdev);
+		char name[32];
+
+		/* Avoid duplicate debugfs directory names */
+		if (parent && rname == rdev->desc->name) {
+			snprintf(name, sizeof(name), "%s-%s", dev_name(parent),
+				 rname);
+			rname = name;
+		}
+		reg_generate_netlink_event(rname, event);
+	}
+
+	return ret;
 }
 
 int _regulator_bulk_get(struct device *dev, int num_consumers,
diff --git a/drivers/regulator/event.c b/drivers/regulator/event.c
new file mode 100644
index 000000000000..0ec58f306b38
--- /dev/null
+++ b/drivers/regulator/event.c
@@ -0,0 +1,91 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Regulator event over netlink
+ *
+ * Author: Naresh Solanki <Naresh.Solanki@9elements.com>
+ */
+
+#include <regulator/regulator.h>
+#include <net/netlink.h>
+#include <net/genetlink.h>
+
+#include "regnl.h"
+
+static unsigned int reg_event_seqnum;
+
+static const struct genl_multicast_group reg_event_mcgrps[] = {
+	{ .name = REG_GENL_MCAST_GROUP_NAME, },
+};
+
+static struct genl_family reg_event_genl_family __ro_after_init = {
+	.module = THIS_MODULE,
+	.name = REG_GENL_FAMILY_NAME,
+	.version = REG_GENL_VERSION,
+	.maxattr = REG_GENL_ATTR_MAX,
+	.mcgrps = reg_event_mcgrps,
+	.n_mcgrps = ARRAY_SIZE(reg_event_mcgrps),
+};
+
+int reg_generate_netlink_event(const char *reg_name, u64 event)
+{
+	struct sk_buff *skb;
+	struct nlattr *attr;
+	struct reg_genl_event *edata;
+	void *msg_header;
+	int size;
+
+	/* allocate memory */
+	size = nla_total_size(sizeof(struct reg_genl_event)) +
+	    nla_total_size(0);
+
+	skb = genlmsg_new(size, GFP_ATOMIC);
+	if (!skb)
+		return -ENOMEM;
+
+	/* add the genetlink message header */
+	msg_header = genlmsg_put(skb, 0, reg_event_seqnum++,
+				 &reg_event_genl_family, 0,
+				 REG_GENL_CMD_EVENT);
+	if (!msg_header) {
+		nlmsg_free(skb);
+		return -ENOMEM;
+	}
+
+	/* fill the data */
+	attr = nla_reserve(skb, REG_GENL_ATTR_EVENT, sizeof(struct reg_genl_event));
+	if (!attr) {
+		nlmsg_free(skb);
+		return -EINVAL;
+	}
+
+	edata = nla_data(attr);
+	memset(edata, 0, sizeof(struct reg_genl_event));
+
+	strscpy(edata->reg_name, reg_name, sizeof(edata->reg_name));
+	edata->event = event;
+
+	/* send multicast genetlink message */
+	genlmsg_end(skb, msg_header);
+	size = genlmsg_multicast(&reg_event_genl_family, skb, 0, 0, GFP_ATOMIC);
+
+	return size;
+}
+
+static int __init reg_event_genetlink_init(void)
+{
+	return genl_register_family(&reg_event_genl_family);
+}
+
+static int __init reg_event_init(void)
+{
+	int error;
+
+	/* create genetlink for acpi event */
+	error = reg_event_genetlink_init();
+	if (error)
+		pr_warn("Failed to create genetlink family for reg event\n");
+
+	return 0;
+}
+
+fs_initcall(reg_event_init);
diff --git a/drivers/regulator/regnl.h b/drivers/regulator/regnl.h
new file mode 100644
index 000000000000..bcba16cc05cc
--- /dev/null
+++ b/drivers/regulator/regnl.h
@@ -0,0 +1,13 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * Regulator event over netlink
+ *
+ * Author: Naresh Solanki <Naresh.Solanki@9elements.com>
+ */
+
+#ifndef __REGULATOR_EVENT_H
+#define __REGULATOR_EVENT_H
+
+int reg_generate_netlink_event(const char *reg_name, u64 event);
+
+#endif
diff --git a/include/linux/regulator/consumer.h b/include/linux/regulator/consumer.h
index 39b666b40ea6..4660582a3302 100644
--- a/include/linux/regulator/consumer.h
+++ b/include/linux/regulator/consumer.h
@@ -33,6 +33,7 @@
 
 #include <linux/err.h>
 #include <linux/suspend.h>
+#include <regulator/regulator.h>
 
 struct device;
 struct notifier_block;
@@ -84,52 +85,6 @@ struct regulator_dev;
 #define REGULATOR_MODE_IDLE			0x4
 #define REGULATOR_MODE_STANDBY			0x8
 
-/*
- * Regulator notifier events.
- *
- * UNDER_VOLTAGE  Regulator output is under voltage.
- * OVER_CURRENT   Regulator output current is too high.
- * REGULATION_OUT Regulator output is out of regulation.
- * FAIL           Regulator output has failed.
- * OVER_TEMP      Regulator over temp.
- * FORCE_DISABLE  Regulator forcibly shut down by software.
- * VOLTAGE_CHANGE Regulator voltage changed.
- *                Data passed is old voltage cast to (void *).
- * DISABLE        Regulator was disabled.
- * PRE_VOLTAGE_CHANGE   Regulator is about to have voltage changed.
- *                      Data passed is "struct pre_voltage_change_data"
- * ABORT_VOLTAGE_CHANGE Regulator voltage change failed for some reason.
- *                      Data passed is old voltage cast to (void *).
- * PRE_DISABLE    Regulator is about to be disabled
- * ABORT_DISABLE  Regulator disable failed for some reason
- *
- * NOTE: These events can be OR'ed together when passed into handler.
- */
-
-#define REGULATOR_EVENT_UNDER_VOLTAGE		0x01
-#define REGULATOR_EVENT_OVER_CURRENT		0x02
-#define REGULATOR_EVENT_REGULATION_OUT		0x04
-#define REGULATOR_EVENT_FAIL			0x08
-#define REGULATOR_EVENT_OVER_TEMP		0x10
-#define REGULATOR_EVENT_FORCE_DISABLE		0x20
-#define REGULATOR_EVENT_VOLTAGE_CHANGE		0x40
-#define REGULATOR_EVENT_DISABLE			0x80
-#define REGULATOR_EVENT_PRE_VOLTAGE_CHANGE	0x100
-#define REGULATOR_EVENT_ABORT_VOLTAGE_CHANGE	0x200
-#define REGULATOR_EVENT_PRE_DISABLE		0x400
-#define REGULATOR_EVENT_ABORT_DISABLE		0x800
-#define REGULATOR_EVENT_ENABLE			0x1000
-/*
- * Following notifications should be emitted only if detected condition
- * is such that the HW is likely to still be working but consumers should
- * take a recovery action to prevent problems esacalating into errors.
- */
-#define REGULATOR_EVENT_UNDER_VOLTAGE_WARN	0x2000
-#define REGULATOR_EVENT_OVER_CURRENT_WARN	0x4000
-#define REGULATOR_EVENT_OVER_VOLTAGE_WARN	0x8000
-#define REGULATOR_EVENT_OVER_TEMP_WARN		0x10000
-#define REGULATOR_EVENT_WARN_MASK		0x1E000
-
 /*
  * Regulator errors that can be queried using regulator_get_error_flags
  *
diff --git a/include/uapi/regulator/regulator.h b/include/uapi/regulator/regulator.h
new file mode 100644
index 000000000000..d2b5612198b6
--- /dev/null
+++ b/include/uapi/regulator/regulator.h
@@ -0,0 +1,90 @@
+/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
+/*
+ * Regulator uapi header
+ *
+ * Author: Naresh Solanki <Naresh.Solanki@9elements.com>
+ */
+
+#ifndef _UAPI_REGULATOR_H
+#define _UAPI_REGULATOR_H
+
+#ifdef __KERNEL__
+#include <linux/types.h>
+#else
+#include <stdint.h>
+#endif
+
+/*
+ * Regulator notifier events.
+ *
+ * UNDER_VOLTAGE  Regulator output is under voltage.
+ * OVER_CURRENT   Regulator output current is too high.
+ * REGULATION_OUT Regulator output is out of regulation.
+ * FAIL           Regulator output has failed.
+ * OVER_TEMP      Regulator over temp.
+ * FORCE_DISABLE  Regulator forcibly shut down by software.
+ * VOLTAGE_CHANGE Regulator voltage changed.
+ *                Data passed is old voltage cast to (void *).
+ * DISABLE        Regulator was disabled.
+ * PRE_VOLTAGE_CHANGE   Regulator is about to have voltage changed.
+ *                      Data passed is "struct pre_voltage_change_data"
+ * ABORT_VOLTAGE_CHANGE Regulator voltage change failed for some reason.
+ *                      Data passed is old voltage cast to (void *).
+ * PRE_DISABLE    Regulator is about to be disabled
+ * ABORT_DISABLE  Regulator disable failed for some reason
+ *
+ * NOTE: These events can be OR'ed together when passed into handler.
+ */
+
+#define REGULATOR_EVENT_UNDER_VOLTAGE		0x01
+#define REGULATOR_EVENT_OVER_CURRENT		0x02
+#define REGULATOR_EVENT_REGULATION_OUT		0x04
+#define REGULATOR_EVENT_FAIL			0x08
+#define REGULATOR_EVENT_OVER_TEMP		0x10
+#define REGULATOR_EVENT_FORCE_DISABLE		0x20
+#define REGULATOR_EVENT_VOLTAGE_CHANGE		0x40
+#define REGULATOR_EVENT_DISABLE			0x80
+#define REGULATOR_EVENT_PRE_VOLTAGE_CHANGE	0x100
+#define REGULATOR_EVENT_ABORT_VOLTAGE_CHANGE	0x200
+#define REGULATOR_EVENT_PRE_DISABLE		0x400
+#define REGULATOR_EVENT_ABORT_DISABLE		0x800
+#define REGULATOR_EVENT_ENABLE			0x1000
+/*
+ * Following notifications should be emitted only if detected condition
+ * is such that the HW is likely to still be working but consumers should
+ * take a recovery action to prevent problems esacalating into errors.
+ */
+#define REGULATOR_EVENT_UNDER_VOLTAGE_WARN	0x2000
+#define REGULATOR_EVENT_OVER_CURRENT_WARN	0x4000
+#define REGULATOR_EVENT_OVER_VOLTAGE_WARN	0x8000
+#define REGULATOR_EVENT_OVER_TEMP_WARN		0x10000
+#define REGULATOR_EVENT_WARN_MASK		0x1E000
+
+struct reg_genl_event {
+	char reg_name[32];
+	uint64_t event;
+};
+
+/* attributes of reg_genl_family */
+enum {
+	REG_GENL_ATTR_UNSPEC,
+	REG_GENL_ATTR_EVENT,	/* reg event info needed by user space */
+	__REG_GENL_ATTR_MAX,
+};
+
+#define REG_GENL_ATTR_MAX (__REG_GENL_ATTR_MAX - 1)
+
+/* commands supported by the reg_genl_family */
+enum {
+	REG_GENL_CMD_UNSPEC,
+	REG_GENL_CMD_EVENT,	/* kernel->user notifications for reg events */
+	__REG_GENL_CMD_MAX,
+};
+
+#define REG_GENL_CMD_MAX (__REG_GENL_CMD_MAX - 1)
+
+#define REG_GENL_FAMILY_NAME		"reg_event"
+#define REG_GENL_VERSION		0x01
+#define REG_GENL_MCAST_GROUP_NAME	"reg_mc_group"
+
+#endif /* _UAPI_REGULATOR_H */
-- 
cgit v1.2.3


From 15bece7bec0df91a8ed1c185483d67708425ca8e Mon Sep 17 00:00:00 2001
From: Zenghui Yu <yuzenghui@huawei.com>
Date: Fri, 24 Nov 2023 20:16:15 +0800
Subject: cpu/hotplug: Remove unused CPU hotplug states

There are unused hotplug states which either have never been used or the
removal of the usage did not remove the state constant.

Drop them to reduce the size of the cpuhp_hp_states array.

Signed-off-by: Zenghui Yu <yuzenghui@huawei.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Link: https://lore.kernel.org/r/20231124121615.1604-1-yuzenghui@huawei.com
---
 include/linux/cpuhotplug.h | 14 --------------
 1 file changed, 14 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/cpuhotplug.h b/include/linux/cpuhotplug.h
index efc0c0b07efb..af6c21aab985 100644
--- a/include/linux/cpuhotplug.h
+++ b/include/linux/cpuhotplug.h
@@ -66,15 +66,12 @@ enum cpuhp_state {
 	CPUHP_PERF_POWER,
 	CPUHP_PERF_SUPERH,
 	CPUHP_X86_HPET_DEAD,
-	CPUHP_X86_APB_DEAD,
 	CPUHP_X86_MCE_DEAD,
 	CPUHP_VIRT_NET_DEAD,
 	CPUHP_IBMVNIC_DEAD,
 	CPUHP_SLUB_DEAD,
 	CPUHP_DEBUG_OBJ_DEAD,
 	CPUHP_MM_WRITEBACK_DEAD,
-	/* Must be after CPUHP_MM_VMSTAT_DEAD */
-	CPUHP_MM_DEMOTION_DEAD,
 	CPUHP_MM_VMSTAT_DEAD,
 	CPUHP_SOFTIRQ_DEAD,
 	CPUHP_NET_MVNETA_DEAD,
@@ -96,7 +93,6 @@ enum cpuhp_state {
 	CPUHP_NET_DEV_DEAD,
 	CPUHP_PCI_XGENE_DEAD,
 	CPUHP_IOMMU_IOVA_DEAD,
-	CPUHP_LUSTRE_CFS_DEAD,
 	CPUHP_AP_ARM_CACHE_B15_RAC_DEAD,
 	CPUHP_PADATA_DEAD,
 	CPUHP_AP_DTPM_CPU_DEAD,
@@ -118,7 +114,6 @@ enum cpuhp_state {
 	CPUHP_XEN_EVTCHN_PREPARE,
 	CPUHP_ARM_SHMOBILE_SCU_PREPARE,
 	CPUHP_SH_SH3X_PREPARE,
-	CPUHP_NET_FLOW_PREPARE,
 	CPUHP_TOPOLOGY_PREPARE,
 	CPUHP_NET_IUCV_PREPARE,
 	CPUHP_ARM_BL_PREPARE,
@@ -151,18 +146,14 @@ enum cpuhp_state {
 	CPUHP_AP_IRQ_ARMADA_XP_STARTING,
 	CPUHP_AP_IRQ_BCM2836_STARTING,
 	CPUHP_AP_IRQ_MIPS_GIC_STARTING,
-	CPUHP_AP_IRQ_RISCV_STARTING,
 	CPUHP_AP_IRQ_LOONGARCH_STARTING,
 	CPUHP_AP_IRQ_SIFIVE_PLIC_STARTING,
 	CPUHP_AP_ARM_MVEBU_COHERENCY,
-	CPUHP_AP_MICROCODE_LOADER,
 	CPUHP_AP_PERF_X86_AMD_UNCORE_STARTING,
 	CPUHP_AP_PERF_X86_STARTING,
 	CPUHP_AP_PERF_X86_AMD_IBS_STARTING,
-	CPUHP_AP_PERF_X86_CQM_STARTING,
 	CPUHP_AP_PERF_X86_CSTATE_STARTING,
 	CPUHP_AP_PERF_XTENSA_STARTING,
-	CPUHP_AP_MIPS_OP_LOONGSON3_STARTING,
 	CPUHP_AP_ARM_VFP_STARTING,
 	CPUHP_AP_ARM64_DEBUG_MONITORS_STARTING,
 	CPUHP_AP_PERF_ARM_HW_BREAKPOINT_STARTING,
@@ -179,7 +170,6 @@ enum cpuhp_state {
 	CPUHP_AP_QCOM_TIMER_STARTING,
 	CPUHP_AP_TEGRA_TIMER_STARTING,
 	CPUHP_AP_ARMADA_TIMER_STARTING,
-	CPUHP_AP_MARCO_TIMER_STARTING,
 	CPUHP_AP_MIPS_GIC_TIMER_STARTING,
 	CPUHP_AP_ARC_TIMER_STARTING,
 	CPUHP_AP_RISCV_TIMER_STARTING,
@@ -217,9 +207,7 @@ enum cpuhp_state {
 	CPUHP_AP_PERF_X86_AMD_UNCORE_ONLINE,
 	CPUHP_AP_PERF_X86_AMD_POWER_ONLINE,
 	CPUHP_AP_PERF_X86_RAPL_ONLINE,
-	CPUHP_AP_PERF_X86_CQM_ONLINE,
 	CPUHP_AP_PERF_X86_CSTATE_ONLINE,
-	CPUHP_AP_PERF_X86_IDXD_ONLINE,
 	CPUHP_AP_PERF_S390_CF_ONLINE,
 	CPUHP_AP_PERF_S390_SF_ONLINE,
 	CPUHP_AP_PERF_ARM_CCI_ONLINE,
@@ -252,8 +240,6 @@ enum cpuhp_state {
 	CPUHP_AP_BASE_CACHEINFO_ONLINE,
 	CPUHP_AP_ONLINE_DYN,
 	CPUHP_AP_ONLINE_DYN_END		= CPUHP_AP_ONLINE_DYN + 30,
-	/* Must be after CPUHP_AP_ONLINE_DYN for node_states[N_CPU] update */
-	CPUHP_AP_MM_DEMOTION_ONLINE,
 	CPUHP_AP_X86_HPET_ONLINE,
 	CPUHP_AP_X86_KVM_CLK_ONLINE,
 	CPUHP_AP_ACTIVE,
-- 
cgit v1.2.3


From 57b8543ceee82ea72be1745a6dc3a9111d55a151 Mon Sep 17 00:00:00 2001
From: Raag Jadav <raag.jadav@intel.com>
Date: Thu, 23 Nov 2023 15:36:13 +0530
Subject: ACPI: bus: update acpi_dev_uid_match() to support multiple types

According to the ACPI specification, a _UID object can evaluate to
either a numeric value or a string.

Update acpi_dev_uid_match() to support _UID matching for both integer
and string types.

Suggested-by: Mika Westerberg <mika.westerberg@linux.intel.com>
Signed-off-by: Raag Jadav <raag.jadav@intel.com>
[ rjw: Rename auxiliary macros, relocate kerneldoc comment ]
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/acpi/utils.c    | 19 -------------------
 include/acpi/acpi_bus.h | 41 ++++++++++++++++++++++++++++++++++++++++-
 include/linux/acpi.h    |  8 +++-----
 3 files changed, 43 insertions(+), 25 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/acpi/utils.c b/drivers/acpi/utils.c
index 28c75242fca9..fe7e850c6479 100644
--- a/drivers/acpi/utils.c
+++ b/drivers/acpi/utils.c
@@ -824,25 +824,6 @@ bool acpi_check_dsm(acpi_handle handle, const guid_t *guid, u64 rev, u64 funcs)
 }
 EXPORT_SYMBOL(acpi_check_dsm);
 
-/**
- * acpi_dev_uid_match - Match device by supplied UID
- * @adev: ACPI device to match.
- * @uid2: Unique ID of the device.
- *
- * Matches UID in @adev with given @uid2.
- *
- * Returns:
- *  - %true if matches.
- *  - %false otherwise.
- */
-bool acpi_dev_uid_match(struct acpi_device *adev, const char *uid2)
-{
-	const char *uid1 = acpi_device_uid(adev);
-
-	return uid1 && uid2 && !strcmp(uid1, uid2);
-}
-EXPORT_SYMBOL_GPL(acpi_dev_uid_match);
-
 /**
  * acpi_dev_hid_uid_match - Match device by supplied HID and UID
  * @adev: ACPI device to match.
diff --git a/include/acpi/acpi_bus.h b/include/acpi/acpi_bus.h
index 1216d72c650f..4bde0b417476 100644
--- a/include/acpi/acpi_bus.h
+++ b/include/acpi/acpi_bus.h
@@ -764,10 +764,49 @@ static inline bool acpi_device_can_poweroff(struct acpi_device *adev)
 		adev->power.states[ACPI_STATE_D3_HOT].flags.explicit_set);
 }
 
-bool acpi_dev_uid_match(struct acpi_device *adev, const char *uid2);
 bool acpi_dev_hid_uid_match(struct acpi_device *adev, const char *hid2, const char *uid2);
 int acpi_dev_uid_to_integer(struct acpi_device *adev, u64 *integer);
 
+static inline bool acpi_str_uid_match(struct acpi_device *adev, const char *uid2)
+{
+	const char *uid1 = acpi_device_uid(adev);
+
+	return uid1 && uid2 && !strcmp(uid1, uid2);
+}
+
+static inline bool acpi_int_uid_match(struct acpi_device *adev, u64 uid2)
+{
+	u64 uid1;
+
+	return !acpi_dev_uid_to_integer(adev, &uid1) && uid1 == uid2;
+}
+
+#define TYPE_ENTRY(type, x)			\
+	const type: x,				\
+	type: x
+
+#define ACPI_STR_TYPES(match)			\
+	TYPE_ENTRY(unsigned char *, match),	\
+	TYPE_ENTRY(signed char *, match),		\
+	TYPE_ENTRY(char *, match),		\
+	TYPE_ENTRY(void *, match)
+
+/**
+ * acpi_dev_uid_match - Match device by supplied UID
+ * @adev: ACPI device to match.
+ * @uid2: Unique ID of the device.
+ *
+ * Matches UID in @adev with given @uid2.
+ *
+ * Returns: %true if matches, %false otherwise.
+ */
+#define acpi_dev_uid_match(adev, uid2)					\
+	_Generic(uid2,							\
+		 /* Treat @uid2 as a string for acpi string types */	\
+		 ACPI_STR_TYPES(acpi_str_uid_match),			\
+		 /* Treat as an integer otherwise */			\
+		 default: acpi_int_uid_match)(adev, uid2)
+
 void acpi_dev_clear_dependencies(struct acpi_device *supplier);
 bool acpi_dev_ready_for_enumeration(const struct acpi_device *device);
 struct acpi_device *acpi_dev_get_next_consumer_dev(struct acpi_device *supplier,
diff --git a/include/linux/acpi.h b/include/linux/acpi.h
index 4db54e928b36..2abe81f074de 100644
--- a/include/linux/acpi.h
+++ b/include/linux/acpi.h
@@ -756,6 +756,9 @@ const char *acpi_get_subsystem_id(acpi_handle handle);
 #define ACPI_HANDLE(dev)		(NULL)
 #define ACPI_HANDLE_FWNODE(fwnode)	(NULL)
 
+/* Get rid of the -Wunused-variable for adev */
+#define acpi_dev_uid_match(adev, uid2)			(adev && false)
+
 #include <acpi/acpi_numa.h>
 
 struct fwnode_handle;
@@ -772,11 +775,6 @@ static inline bool acpi_dev_present(const char *hid, const char *uid, s64 hrv)
 
 struct acpi_device;
 
-static inline bool acpi_dev_uid_match(struct acpi_device *adev, const char *uid2)
-{
-	return false;
-}
-
 static inline bool
 acpi_dev_hid_uid_match(struct acpi_device *adev, const char *hid2, const char *uid2)
 {
-- 
cgit v1.2.3


From b2b32a1738815155d4a0039bb7a6092d40f23e81 Mon Sep 17 00:00:00 2001
From: Raag Jadav <raag.jadav@intel.com>
Date: Thu, 23 Nov 2023 15:36:14 +0530
Subject: ACPI: bus: update acpi_dev_hid_uid_match() to support multiple types

Now that we have _UID matching support for both integer and string types,
we can support them into acpi_dev_hid_uid_match() helper as well.

Signed-off-by: Raag Jadav <raag.jadav@intel.com>
Reviewed-by: Mika Westerberg <mika.westerberg@linux.intel.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/acpi/utils.c    | 29 -----------------------------
 include/acpi/acpi_bus.h | 24 +++++++++++++++++++++++-
 include/linux/acpi.h    |  7 +------
 3 files changed, 24 insertions(+), 36 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/acpi/utils.c b/drivers/acpi/utils.c
index fe7e850c6479..03f6de9a0807 100644
--- a/drivers/acpi/utils.c
+++ b/drivers/acpi/utils.c
@@ -824,35 +824,6 @@ bool acpi_check_dsm(acpi_handle handle, const guid_t *guid, u64 rev, u64 funcs)
 }
 EXPORT_SYMBOL(acpi_check_dsm);
 
-/**
- * acpi_dev_hid_uid_match - Match device by supplied HID and UID
- * @adev: ACPI device to match.
- * @hid2: Hardware ID of the device.
- * @uid2: Unique ID of the device, pass NULL to not check _UID.
- *
- * Matches HID and UID in @adev with given @hid2 and @uid2. Absence of @uid2
- * will be treated as a match. If user wants to validate @uid2, it should be
- * done before calling this function.
- *
- * Returns:
- *  - %true if matches or @uid2 is NULL.
- *  - %false otherwise.
- */
-bool acpi_dev_hid_uid_match(struct acpi_device *adev,
-			    const char *hid2, const char *uid2)
-{
-	const char *hid1 = acpi_device_hid(adev);
-
-	if (strcmp(hid1, hid2))
-		return false;
-
-	if (!uid2)
-		return true;
-
-	return acpi_dev_uid_match(adev, uid2);
-}
-EXPORT_SYMBOL(acpi_dev_hid_uid_match);
-
 /**
  * acpi_dev_uid_to_integer - treat ACPI device _UID as integer
  * @adev: ACPI device to get _UID from
diff --git a/include/acpi/acpi_bus.h b/include/acpi/acpi_bus.h
index 4bde0b417476..aae31552c574 100644
--- a/include/acpi/acpi_bus.h
+++ b/include/acpi/acpi_bus.h
@@ -764,9 +764,15 @@ static inline bool acpi_device_can_poweroff(struct acpi_device *adev)
 		adev->power.states[ACPI_STATE_D3_HOT].flags.explicit_set);
 }
 
-bool acpi_dev_hid_uid_match(struct acpi_device *adev, const char *hid2, const char *uid2);
 int acpi_dev_uid_to_integer(struct acpi_device *adev, u64 *integer);
 
+static inline bool acpi_dev_hid_match(struct acpi_device *adev, const char *hid2)
+{
+	const char *hid1 = acpi_device_hid(adev);
+
+	return hid1 && hid2 && !strcmp(hid1, hid2);
+}
+
 static inline bool acpi_str_uid_match(struct acpi_device *adev, const char *uid2)
 {
 	const char *uid1 = acpi_device_uid(adev);
@@ -807,6 +813,22 @@ static inline bool acpi_int_uid_match(struct acpi_device *adev, u64 uid2)
 		 /* Treat as an integer otherwise */			\
 		 default: acpi_int_uid_match)(adev, uid2)
 
+/**
+ * acpi_dev_hid_uid_match - Match device by supplied HID and UID
+ * @adev: ACPI device to match.
+ * @hid2: Hardware ID of the device.
+ * @uid2: Unique ID of the device, pass 0 or NULL to not check _UID.
+ *
+ * Matches HID and UID in @adev with given @hid2 and @uid2. Absence of @uid2
+ * will be treated as a match. If user wants to validate @uid2, it should be
+ * done before calling this function.
+ *
+ * Returns: %true if matches or @uid2 is 0 or NULL, %false otherwise.
+ */
+#define acpi_dev_hid_uid_match(adev, hid2, uid2)			\
+	(acpi_dev_hid_match(adev, hid2) &&				\
+		(!(uid2) || acpi_dev_uid_match(adev, uid2)))
+
 void acpi_dev_clear_dependencies(struct acpi_device *supplier);
 bool acpi_dev_ready_for_enumeration(const struct acpi_device *device);
 struct acpi_device *acpi_dev_get_next_consumer_dev(struct acpi_device *supplier,
diff --git a/include/linux/acpi.h b/include/linux/acpi.h
index 2abe81f074de..75274585656c 100644
--- a/include/linux/acpi.h
+++ b/include/linux/acpi.h
@@ -758,6 +758,7 @@ const char *acpi_get_subsystem_id(acpi_handle handle);
 
 /* Get rid of the -Wunused-variable for adev */
 #define acpi_dev_uid_match(adev, uid2)			(adev && false)
+#define acpi_dev_hid_uid_match(adev, hid2, uid2)	(adev && false)
 
 #include <acpi/acpi_numa.h>
 
@@ -775,12 +776,6 @@ static inline bool acpi_dev_present(const char *hid, const char *uid, s64 hrv)
 
 struct acpi_device;
 
-static inline bool
-acpi_dev_hid_uid_match(struct acpi_device *adev, const char *hid2, const char *uid2)
-{
-	return false;
-}
-
 static inline int acpi_dev_uid_to_integer(struct acpi_device *adev, u64 *integer)
 {
 	return -ENODEV;
-- 
cgit v1.2.3


From 40bba140c60fbb3ee8df6203c82fbd3de9f19d95 Mon Sep 17 00:00:00 2001
From: Andrii Nakryiko <andrii@kernel.org>
Date: Thu, 30 Nov 2023 10:52:14 -0800
Subject: bpf: add BPF token delegation mount options to BPF FS

Add few new mount options to BPF FS that allow to specify that a given
BPF FS instance allows creation of BPF token (added in the next patch),
and what sort of operations are allowed under BPF token. As such, we get
4 new mount options, each is a bit mask
  - `delegate_cmds` allow to specify which bpf() syscall commands are
    allowed with BPF token derived from this BPF FS instance;
  - if BPF_MAP_CREATE command is allowed, `delegate_maps` specifies
    a set of allowable BPF map types that could be created with BPF token;
  - if BPF_PROG_LOAD command is allowed, `delegate_progs` specifies
    a set of allowable BPF program types that could be loaded with BPF token;
  - if BPF_PROG_LOAD command is allowed, `delegate_attachs` specifies
    a set of allowable BPF program attach types that could be loaded with
    BPF token; delegate_progs and delegate_attachs are meant to be used
    together, as full BPF program type is, in general, determined
    through both program type and program attach type.

Currently, these mount options accept the following forms of values:
  - a special value "any", that enables all possible values of a given
  bit set;
  - numeric value (decimal or hexadecimal, determined by kernel
  automatically) that specifies a bit mask value directly;
  - all the values for a given mount option are combined, if specified
  multiple times. E.g., `mount -t bpf nodev /path/to/mount -o
  delegate_maps=0x1 -o delegate_maps=0x2` will result in a combined 0x3
  mask.

Ideally, more convenient (for humans) symbolic form derived from
corresponding UAPI enums would be accepted (e.g., `-o
delegate_progs=kprobe|tracepoint`) and I intend to implement this, but
it requires a bunch of UAPI header churn, so I postponed it until this
feature lands upstream or at least there is a definite consensus that
this feature is acceptable and is going to make it, just to minimize
amount of wasted effort and not increase amount of non-essential code to
be reviewed.

Attentive reader will notice that BPF FS is now marked as
FS_USERNS_MOUNT, which theoretically makes it mountable inside non-init
user namespace as long as the process has sufficient *namespaced*
capabilities within that user namespace. But in reality we still
restrict BPF FS to be mountable only by processes with CAP_SYS_ADMIN *in
init userns* (extra check in bpf_fill_super()). FS_USERNS_MOUNT is added
to allow creating BPF FS context object (i.e., fsopen("bpf")) from
inside unprivileged process inside non-init userns, to capture that
userns as the owning userns. It will still be required to pass this
context object back to privileged process to instantiate and mount it.

This manipulation is important, because capturing non-init userns as the
owning userns of BPF FS instance (super block) allows to use that userns
to constraint BPF token to that userns later on (see next patch). So
creating BPF FS with delegation inside unprivileged userns will restrict
derived BPF token objects to only "work" inside that intended userns,
making it scoped to a intended "container". Also, setting these
delegation options requires capable(CAP_SYS_ADMIN), so unprivileged
process cannot set this up without involvement of a privileged process.

There is a set of selftests at the end of the patch set that simulates
this sequence of steps and validates that everything works as intended.
But careful review is requested to make sure there are no missed gaps in
the implementation and testing.

This somewhat subtle set of aspects is the result of previous
discussions ([0]) about various user namespace implications and
interactions with BPF token functionality and is necessary to contain
BPF token inside intended user namespace.

  [0] https://lore.kernel.org/bpf/20230704-hochverdient-lehne-eeb9eeef785e@brauner/

Acked-by: Christian Brauner <brauner@kernel.org>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/r/20231130185229.2688956-3-andrii@kernel.org
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/bpf.h | 10 ++++++
 kernel/bpf/inode.c  | 88 +++++++++++++++++++++++++++++++++++++++++++++++------
 2 files changed, 88 insertions(+), 10 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 10e5e4d8a00f..d3c9acc593ea 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -1581,6 +1581,16 @@ struct bpf_link_primer {
 	u32 id;
 };
 
+struct bpf_mount_opts {
+	umode_t mode;
+
+	/* BPF token-related delegation options */
+	u64 delegate_cmds;
+	u64 delegate_maps;
+	u64 delegate_progs;
+	u64 delegate_attachs;
+};
+
 struct bpf_struct_ops_value;
 struct btf_member;
 
diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c
index 1aafb2ff2e95..220fe0f99095 100644
--- a/kernel/bpf/inode.c
+++ b/kernel/bpf/inode.c
@@ -20,6 +20,7 @@
 #include <linux/filter.h>
 #include <linux/bpf.h>
 #include <linux/bpf_trace.h>
+#include <linux/kstrtox.h>
 #include "preload/bpf_preload.h"
 
 enum bpf_type {
@@ -599,10 +600,31 @@ EXPORT_SYMBOL(bpf_prog_get_type_path);
  */
 static int bpf_show_options(struct seq_file *m, struct dentry *root)
 {
+	struct bpf_mount_opts *opts = root->d_sb->s_fs_info;
 	umode_t mode = d_inode(root)->i_mode & S_IALLUGO & ~S_ISVTX;
 
 	if (mode != S_IRWXUGO)
 		seq_printf(m, ",mode=%o", mode);
+
+	if (opts->delegate_cmds == ~0ULL)
+		seq_printf(m, ",delegate_cmds=any");
+	else if (opts->delegate_cmds)
+		seq_printf(m, ",delegate_cmds=0x%llx", opts->delegate_cmds);
+
+	if (opts->delegate_maps == ~0ULL)
+		seq_printf(m, ",delegate_maps=any");
+	else if (opts->delegate_maps)
+		seq_printf(m, ",delegate_maps=0x%llx", opts->delegate_maps);
+
+	if (opts->delegate_progs == ~0ULL)
+		seq_printf(m, ",delegate_progs=any");
+	else if (opts->delegate_progs)
+		seq_printf(m, ",delegate_progs=0x%llx", opts->delegate_progs);
+
+	if (opts->delegate_attachs == ~0ULL)
+		seq_printf(m, ",delegate_attachs=any");
+	else if (opts->delegate_attachs)
+		seq_printf(m, ",delegate_attachs=0x%llx", opts->delegate_attachs);
 	return 0;
 }
 
@@ -626,22 +648,27 @@ static const struct super_operations bpf_super_ops = {
 
 enum {
 	OPT_MODE,
+	OPT_DELEGATE_CMDS,
+	OPT_DELEGATE_MAPS,
+	OPT_DELEGATE_PROGS,
+	OPT_DELEGATE_ATTACHS,
 };
 
 static const struct fs_parameter_spec bpf_fs_parameters[] = {
 	fsparam_u32oct	("mode",			OPT_MODE),
+	fsparam_string	("delegate_cmds",		OPT_DELEGATE_CMDS),
+	fsparam_string	("delegate_maps",		OPT_DELEGATE_MAPS),
+	fsparam_string	("delegate_progs",		OPT_DELEGATE_PROGS),
+	fsparam_string	("delegate_attachs",		OPT_DELEGATE_ATTACHS),
 	{}
 };
 
-struct bpf_mount_opts {
-	umode_t mode;
-};
-
 static int bpf_parse_param(struct fs_context *fc, struct fs_parameter *param)
 {
-	struct bpf_mount_opts *opts = fc->fs_private;
+	struct bpf_mount_opts *opts = fc->s_fs_info;
 	struct fs_parse_result result;
-	int opt;
+	int opt, err;
+	u64 msk;
 
 	opt = fs_parse(fc, bpf_fs_parameters, param, &result);
 	if (opt < 0) {
@@ -665,6 +692,28 @@ static int bpf_parse_param(struct fs_context *fc, struct fs_parameter *param)
 	case OPT_MODE:
 		opts->mode = result.uint_32 & S_IALLUGO;
 		break;
+	case OPT_DELEGATE_CMDS:
+	case OPT_DELEGATE_MAPS:
+	case OPT_DELEGATE_PROGS:
+	case OPT_DELEGATE_ATTACHS:
+		if (strcmp(param->string, "any") == 0) {
+			msk = ~0ULL;
+		} else {
+			err = kstrtou64(param->string, 0, &msk);
+			if (err)
+				return err;
+		}
+		/* Setting delegation mount options requires privileges */
+		if (msk && !capable(CAP_SYS_ADMIN))
+			return -EPERM;
+		switch (opt) {
+		case OPT_DELEGATE_CMDS: opts->delegate_cmds |= msk; break;
+		case OPT_DELEGATE_MAPS: opts->delegate_maps |= msk; break;
+		case OPT_DELEGATE_PROGS: opts->delegate_progs |= msk; break;
+		case OPT_DELEGATE_ATTACHS: opts->delegate_attachs |= msk; break;
+		default: return -EINVAL;
+		}
+		break;
 	}
 
 	return 0;
@@ -739,10 +788,14 @@ out:
 static int bpf_fill_super(struct super_block *sb, struct fs_context *fc)
 {
 	static const struct tree_descr bpf_rfiles[] = { { "" } };
-	struct bpf_mount_opts *opts = fc->fs_private;
+	struct bpf_mount_opts *opts = sb->s_fs_info;
 	struct inode *inode;
 	int ret;
 
+	/* Mounting an instance of BPF FS requires privileges */
+	if (fc->user_ns != &init_user_ns && !capable(CAP_SYS_ADMIN))
+		return -EPERM;
+
 	ret = simple_fill_super(sb, BPF_FS_MAGIC, bpf_rfiles);
 	if (ret)
 		return ret;
@@ -764,7 +817,7 @@ static int bpf_get_tree(struct fs_context *fc)
 
 static void bpf_free_fc(struct fs_context *fc)
 {
-	kfree(fc->fs_private);
+	kfree(fc->s_fs_info);
 }
 
 static const struct fs_context_operations bpf_context_ops = {
@@ -786,17 +839,32 @@ static int bpf_init_fs_context(struct fs_context *fc)
 
 	opts->mode = S_IRWXUGO;
 
-	fc->fs_private = opts;
+	/* start out with no BPF token delegation enabled */
+	opts->delegate_cmds = 0;
+	opts->delegate_maps = 0;
+	opts->delegate_progs = 0;
+	opts->delegate_attachs = 0;
+
+	fc->s_fs_info = opts;
 	fc->ops = &bpf_context_ops;
 	return 0;
 }
 
+static void bpf_kill_super(struct super_block *sb)
+{
+	struct bpf_mount_opts *opts = sb->s_fs_info;
+
+	kill_litter_super(sb);
+	kfree(opts);
+}
+
 static struct file_system_type bpf_fs_type = {
 	.owner		= THIS_MODULE,
 	.name		= "bpf",
 	.init_fs_context = bpf_init_fs_context,
 	.parameters	= bpf_fs_parameters,
-	.kill_sb	= kill_litter_super,
+	.kill_sb	= bpf_kill_super,
+	.fs_flags	= FS_USERNS_MOUNT,
 };
 
 static int __init bpf_init(void)
-- 
cgit v1.2.3


From 4527358b76861dfd64ee34aba45d81648fbc8a61 Mon Sep 17 00:00:00 2001
From: Andrii Nakryiko <andrii@kernel.org>
Date: Thu, 30 Nov 2023 10:52:15 -0800
Subject: bpf: introduce BPF token object

Add new kind of BPF kernel object, BPF token. BPF token is meant to
allow delegating privileged BPF functionality, like loading a BPF
program or creating a BPF map, from privileged process to a *trusted*
unprivileged process, all while having a good amount of control over which
privileged operations could be performed using provided BPF token.

This is achieved through mounting BPF FS instance with extra delegation
mount options, which determine what operations are delegatable, and also
constraining it to the owning user namespace (as mentioned in the
previous patch).

BPF token itself is just a derivative from BPF FS and can be created
through a new bpf() syscall command, BPF_TOKEN_CREATE, which accepts BPF
FS FD, which can be attained through open() API by opening BPF FS mount
point. Currently, BPF token "inherits" delegated command, map types,
prog type, and attach type bit sets from BPF FS as is. In the future,
having an BPF token as a separate object with its own FD, we can allow
to further restrict BPF token's allowable set of things either at the
creation time or after the fact, allowing the process to guard itself
further from unintentionally trying to load undesired kind of BPF
programs. But for now we keep things simple and just copy bit sets as is.

When BPF token is created from BPF FS mount, we take reference to the
BPF super block's owning user namespace, and then use that namespace for
checking all the {CAP_BPF, CAP_PERFMON, CAP_NET_ADMIN, CAP_SYS_ADMIN}
capabilities that are normally only checked against init userns (using
capable()), but now we check them using ns_capable() instead (if BPF
token is provided). See bpf_token_capable() for details.

Such setup means that BPF token in itself is not sufficient to grant BPF
functionality. User namespaced process has to *also* have necessary
combination of capabilities inside that user namespace. So while
previously CAP_BPF was useless when granted within user namespace, now
it gains a meaning and allows container managers and sys admins to have
a flexible control over which processes can and need to use BPF
functionality within the user namespace (i.e., container in practice).
And BPF FS delegation mount options and derived BPF tokens serve as
a per-container "flag" to grant overall ability to use bpf() (plus further
restrict on which parts of bpf() syscalls are treated as namespaced).

Note also, BPF_TOKEN_CREATE command itself requires ns_capable(CAP_BPF)
within the BPF FS owning user namespace, rounding up the ns_capable()
story of BPF token.

Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/r/20231130185229.2688956-4-andrii@kernel.org
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/bpf.h            |  41 ++++++++
 include/uapi/linux/bpf.h       |  37 +++++++
 kernel/bpf/Makefile            |   2 +-
 kernel/bpf/inode.c             |  12 ++-
 kernel/bpf/syscall.c           |  17 ++++
 kernel/bpf/token.c             | 214 +++++++++++++++++++++++++++++++++++++++++
 tools/include/uapi/linux/bpf.h |  37 +++++++
 7 files changed, 354 insertions(+), 6 deletions(-)
 create mode 100644 kernel/bpf/token.c

(limited to 'include/linux')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index d3c9acc593ea..aa9cf8e5fab1 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -51,6 +51,10 @@ struct module;
 struct bpf_func_state;
 struct ftrace_ops;
 struct cgroup;
+struct bpf_token;
+struct user_namespace;
+struct super_block;
+struct inode;
 
 extern struct idr btf_idr;
 extern spinlock_t btf_idr_lock;
@@ -1591,6 +1595,13 @@ struct bpf_mount_opts {
 	u64 delegate_attachs;
 };
 
+struct bpf_token {
+	struct work_struct work;
+	atomic64_t refcnt;
+	struct user_namespace *userns;
+	u64 allowed_cmds;
+};
+
 struct bpf_struct_ops_value;
 struct btf_member;
 
@@ -2048,6 +2059,7 @@ static inline void bpf_enable_instrumentation(void)
 	migrate_enable();
 }
 
+extern const struct super_operations bpf_super_ops;
 extern const struct file_operations bpf_map_fops;
 extern const struct file_operations bpf_prog_fops;
 extern const struct file_operations bpf_iter_fops;
@@ -2182,6 +2194,8 @@ static inline void bpf_map_dec_elem_count(struct bpf_map *map)
 
 extern int sysctl_unprivileged_bpf_disabled;
 
+bool bpf_token_capable(const struct bpf_token *token, int cap);
+
 static inline bool bpf_allow_ptr_leaks(void)
 {
 	return perfmon_capable();
@@ -2216,8 +2230,17 @@ int bpf_link_new_fd(struct bpf_link *link);
 struct bpf_link *bpf_link_get_from_fd(u32 ufd);
 struct bpf_link *bpf_link_get_curr_or_next(u32 *id);
 
+void bpf_token_inc(struct bpf_token *token);
+void bpf_token_put(struct bpf_token *token);
+int bpf_token_create(union bpf_attr *attr);
+struct bpf_token *bpf_token_get_from_fd(u32 ufd);
+
+bool bpf_token_allow_cmd(const struct bpf_token *token, enum bpf_cmd cmd);
+
 int bpf_obj_pin_user(u32 ufd, int path_fd, const char __user *pathname);
 int bpf_obj_get_user(int path_fd, const char __user *pathname, int flags);
+struct inode *bpf_get_inode(struct super_block *sb, const struct inode *dir,
+			    umode_t mode);
 
 #define BPF_ITER_FUNC_PREFIX "bpf_iter_"
 #define DEFINE_BPF_ITER_FUNC(target, args...)			\
@@ -2580,6 +2603,24 @@ static inline int bpf_obj_get_user(const char __user *pathname, int flags)
 	return -EOPNOTSUPP;
 }
 
+static inline bool bpf_token_capable(const struct bpf_token *token, int cap)
+{
+	return capable(cap) || (cap != CAP_SYS_ADMIN && capable(CAP_SYS_ADMIN));
+}
+
+static inline void bpf_token_inc(struct bpf_token *token)
+{
+}
+
+static inline void bpf_token_put(struct bpf_token *token)
+{
+}
+
+static inline struct bpf_token *bpf_token_get_from_fd(u32 ufd)
+{
+	return ERR_PTR(-EOPNOTSUPP);
+}
+
 static inline void __dev_flush(void)
 {
 }
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index e88746ba7d21..d4a567e5bc3c 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -847,6 +847,36 @@ union bpf_iter_link_info {
  *		Returns zero on success. On error, -1 is returned and *errno*
  *		is set appropriately.
  *
+ * BPF_TOKEN_CREATE
+ *	Description
+ *		Create BPF token with embedded information about what
+ *		BPF-related functionality it allows:
+ *		- a set of allowed bpf() syscall commands;
+ *		- a set of allowed BPF map types to be created with
+ *		BPF_MAP_CREATE command, if BPF_MAP_CREATE itself is allowed;
+ *		- a set of allowed BPF program types and BPF program attach
+ *		types to be loaded with BPF_PROG_LOAD command, if
+ *		BPF_PROG_LOAD itself is allowed.
+ *
+ *		BPF token is created (derived) from an instance of BPF FS,
+ *		assuming it has necessary delegation mount options specified.
+ *		This BPF token can be passed as an extra parameter to various
+ *		bpf() syscall commands to grant BPF subsystem functionality to
+ *		unprivileged processes.
+ *
+ *		When created, BPF token is "associated" with the owning
+ *		user namespace of BPF FS instance (super block) that it was
+ *		derived from, and subsequent BPF operations performed with
+ *		BPF token would be performing capabilities checks (i.e.,
+ *		CAP_BPF, CAP_PERFMON, CAP_NET_ADMIN, CAP_SYS_ADMIN) within
+ *		that user namespace. Without BPF token, such capabilities
+ *		have to be granted in init user namespace, making bpf()
+ *		syscall incompatible with user namespace, for the most part.
+ *
+ *	Return
+ *		A new file descriptor (a nonnegative integer), or -1 if an
+ *		error occurred (in which case, *errno* is set appropriately).
+ *
  * NOTES
  *	eBPF objects (maps and programs) can be shared between processes.
  *
@@ -901,6 +931,8 @@ enum bpf_cmd {
 	BPF_ITER_CREATE,
 	BPF_LINK_DETACH,
 	BPF_PROG_BIND_MAP,
+	BPF_TOKEN_CREATE,
+	__MAX_BPF_CMD,
 };
 
 enum bpf_map_type {
@@ -1712,6 +1744,11 @@ union bpf_attr {
 		__u32		flags;		/* extra flags */
 	} prog_bind_map;
 
+	struct { /* struct used by BPF_TOKEN_CREATE command */
+		__u32		flags;
+		__u32		bpffs_fd;
+	} token_create;
+
 } __attribute__((aligned(8)));
 
 /* The description below is an attempt at providing documentation to eBPF
diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile
index f526b7573e97..4ce95acfcaa7 100644
--- a/kernel/bpf/Makefile
+++ b/kernel/bpf/Makefile
@@ -6,7 +6,7 @@ cflags-nogcse-$(CONFIG_X86)$(CONFIG_CC_IS_GCC) := -fno-gcse
 endif
 CFLAGS_core.o += $(call cc-disable-warning, override-init) $(cflags-nogcse-yy)
 
-obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o tnum.o log.o
+obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o tnum.o log.o token.o
 obj-$(CONFIG_BPF_SYSCALL) += bpf_iter.o map_iter.o task_iter.o prog_iter.o link_iter.o
 obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o percpu_freelist.o bpf_lru_list.o lpm_trie.o map_in_map.o bloom_filter.o
 obj-$(CONFIG_BPF_SYSCALL) += local_storage.o queue_stack_maps.o ringbuf.o
diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c
index 220fe0f99095..6ce3f9696e72 100644
--- a/kernel/bpf/inode.c
+++ b/kernel/bpf/inode.c
@@ -99,9 +99,9 @@ static const struct inode_operations bpf_prog_iops = { };
 static const struct inode_operations bpf_map_iops  = { };
 static const struct inode_operations bpf_link_iops  = { };
 
-static struct inode *bpf_get_inode(struct super_block *sb,
-				   const struct inode *dir,
-				   umode_t mode)
+struct inode *bpf_get_inode(struct super_block *sb,
+			    const struct inode *dir,
+			    umode_t mode)
 {
 	struct inode *inode;
 
@@ -602,11 +602,13 @@ static int bpf_show_options(struct seq_file *m, struct dentry *root)
 {
 	struct bpf_mount_opts *opts = root->d_sb->s_fs_info;
 	umode_t mode = d_inode(root)->i_mode & S_IALLUGO & ~S_ISVTX;
+	u64 mask;
 
 	if (mode != S_IRWXUGO)
 		seq_printf(m, ",mode=%o", mode);
 
-	if (opts->delegate_cmds == ~0ULL)
+	mask = (1ULL << __MAX_BPF_CMD) - 1;
+	if ((opts->delegate_cmds & mask) == mask)
 		seq_printf(m, ",delegate_cmds=any");
 	else if (opts->delegate_cmds)
 		seq_printf(m, ",delegate_cmds=0x%llx", opts->delegate_cmds);
@@ -639,7 +641,7 @@ static void bpf_free_inode(struct inode *inode)
 	free_inode_nonrcu(inode);
 }
 
-static const struct super_operations bpf_super_ops = {
+const struct super_operations bpf_super_ops = {
 	.statfs		= simple_statfs,
 	.drop_inode	= generic_delete_inode,
 	.show_options	= bpf_show_options,
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index ee33a52abf18..a156d549b356 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -5377,6 +5377,20 @@ out_prog_put:
 	return ret;
 }
 
+#define BPF_TOKEN_CREATE_LAST_FIELD token_create.bpffs_fd
+
+static int token_create(union bpf_attr *attr)
+{
+	if (CHECK_ATTR(BPF_TOKEN_CREATE))
+		return -EINVAL;
+
+	/* no flags are supported yet */
+	if (attr->token_create.flags)
+		return -EINVAL;
+
+	return bpf_token_create(attr);
+}
+
 static int __sys_bpf(int cmd, bpfptr_t uattr, unsigned int size)
 {
 	union bpf_attr attr;
@@ -5510,6 +5524,9 @@ static int __sys_bpf(int cmd, bpfptr_t uattr, unsigned int size)
 	case BPF_PROG_BIND_MAP:
 		err = bpf_prog_bind_map(&attr);
 		break;
+	case BPF_TOKEN_CREATE:
+		err = token_create(&attr);
+		break;
 	default:
 		err = -EINVAL;
 		break;
diff --git a/kernel/bpf/token.c b/kernel/bpf/token.c
new file mode 100644
index 000000000000..e18aaecc67e9
--- /dev/null
+++ b/kernel/bpf/token.c
@@ -0,0 +1,214 @@
+#include <linux/bpf.h>
+#include <linux/vmalloc.h>
+#include <linux/fdtable.h>
+#include <linux/file.h>
+#include <linux/fs.h>
+#include <linux/kernel.h>
+#include <linux/idr.h>
+#include <linux/namei.h>
+#include <linux/user_namespace.h>
+
+bool bpf_token_capable(const struct bpf_token *token, int cap)
+{
+	/* BPF token allows ns_capable() level of capabilities, but only if
+	 * token's userns is *exactly* the same as current user's userns
+	 */
+	if (token && current_user_ns() == token->userns) {
+		if (ns_capable(token->userns, cap))
+			return true;
+		if (cap != CAP_SYS_ADMIN && ns_capable(token->userns, CAP_SYS_ADMIN))
+			return true;
+	}
+	/* otherwise fallback to capable() checks */
+	return capable(cap) || (cap != CAP_SYS_ADMIN && capable(CAP_SYS_ADMIN));
+}
+
+void bpf_token_inc(struct bpf_token *token)
+{
+	atomic64_inc(&token->refcnt);
+}
+
+static void bpf_token_free(struct bpf_token *token)
+{
+	put_user_ns(token->userns);
+	kvfree(token);
+}
+
+static void bpf_token_put_deferred(struct work_struct *work)
+{
+	struct bpf_token *token = container_of(work, struct bpf_token, work);
+
+	bpf_token_free(token);
+}
+
+void bpf_token_put(struct bpf_token *token)
+{
+	if (!token)
+		return;
+
+	if (!atomic64_dec_and_test(&token->refcnt))
+		return;
+
+	INIT_WORK(&token->work, bpf_token_put_deferred);
+	schedule_work(&token->work);
+}
+
+static int bpf_token_release(struct inode *inode, struct file *filp)
+{
+	struct bpf_token *token = filp->private_data;
+
+	bpf_token_put(token);
+	return 0;
+}
+
+static void bpf_token_show_fdinfo(struct seq_file *m, struct file *filp)
+{
+	struct bpf_token *token = filp->private_data;
+	u64 mask;
+
+	BUILD_BUG_ON(__MAX_BPF_CMD >= 64);
+	mask = (1ULL << __MAX_BPF_CMD) - 1;
+	if ((token->allowed_cmds & mask) == mask)
+		seq_printf(m, "allowed_cmds:\tany\n");
+	else
+		seq_printf(m, "allowed_cmds:\t0x%llx\n", token->allowed_cmds);
+}
+
+#define BPF_TOKEN_INODE_NAME "bpf-token"
+
+static const struct inode_operations bpf_token_iops = { };
+
+static const struct file_operations bpf_token_fops = {
+	.release	= bpf_token_release,
+	.show_fdinfo	= bpf_token_show_fdinfo,
+};
+
+int bpf_token_create(union bpf_attr *attr)
+{
+	struct bpf_mount_opts *mnt_opts;
+	struct bpf_token *token = NULL;
+	struct user_namespace *userns;
+	struct inode *inode;
+	struct file *file;
+	struct path path;
+	struct fd f;
+	umode_t mode;
+	int err, fd;
+
+	f = fdget(attr->token_create.bpffs_fd);
+	if (!f.file)
+		return -EBADF;
+
+	path = f.file->f_path;
+	path_get(&path);
+	fdput(f);
+
+	if (path.dentry != path.mnt->mnt_sb->s_root) {
+		err = -EINVAL;
+		goto out_path;
+	}
+	if (path.mnt->mnt_sb->s_op != &bpf_super_ops) {
+		err = -EINVAL;
+		goto out_path;
+	}
+	err = path_permission(&path, MAY_ACCESS);
+	if (err)
+		goto out_path;
+
+	userns = path.dentry->d_sb->s_user_ns;
+	/*
+	 * Enforce that creators of BPF tokens are in the same user
+	 * namespace as the BPF FS instance. This makes reasoning about
+	 * permissions a lot easier and we can always relax this later.
+	 */
+	if (current_user_ns() != userns) {
+		err = -EPERM;
+		goto out_path;
+	}
+	if (!ns_capable(userns, CAP_BPF)) {
+		err = -EPERM;
+		goto out_path;
+	}
+
+	mode = S_IFREG | ((S_IRUSR | S_IWUSR) & ~current_umask());
+	inode = bpf_get_inode(path.mnt->mnt_sb, NULL, mode);
+	if (IS_ERR(inode)) {
+		err = PTR_ERR(inode);
+		goto out_path;
+	}
+
+	inode->i_op = &bpf_token_iops;
+	inode->i_fop = &bpf_token_fops;
+	clear_nlink(inode); /* make sure it is unlinked */
+
+	file = alloc_file_pseudo(inode, path.mnt, BPF_TOKEN_INODE_NAME, O_RDWR, &bpf_token_fops);
+	if (IS_ERR(file)) {
+		iput(inode);
+		err = PTR_ERR(file);
+		goto out_path;
+	}
+
+	token = kvzalloc(sizeof(*token), GFP_USER);
+	if (!token) {
+		err = -ENOMEM;
+		goto out_file;
+	}
+
+	atomic64_set(&token->refcnt, 1);
+
+	/* remember bpffs owning userns for future ns_capable() checks */
+	token->userns = get_user_ns(userns);
+
+	mnt_opts = path.dentry->d_sb->s_fs_info;
+	token->allowed_cmds = mnt_opts->delegate_cmds;
+
+	fd = get_unused_fd_flags(O_CLOEXEC);
+	if (fd < 0) {
+		err = fd;
+		goto out_token;
+	}
+
+	file->private_data = token;
+	fd_install(fd, file);
+
+	path_put(&path);
+	return fd;
+
+out_token:
+	bpf_token_free(token);
+out_file:
+	fput(file);
+out_path:
+	path_put(&path);
+	return err;
+}
+
+struct bpf_token *bpf_token_get_from_fd(u32 ufd)
+{
+	struct fd f = fdget(ufd);
+	struct bpf_token *token;
+
+	if (!f.file)
+		return ERR_PTR(-EBADF);
+	if (f.file->f_op != &bpf_token_fops) {
+		fdput(f);
+		return ERR_PTR(-EINVAL);
+	}
+
+	token = f.file->private_data;
+	bpf_token_inc(token);
+	fdput(f);
+
+	return token;
+}
+
+bool bpf_token_allow_cmd(const struct bpf_token *token, enum bpf_cmd cmd)
+{
+	/* BPF token can be used only within exactly the same userns in which
+	 * it was created
+	 */
+	if (!token || current_user_ns() != token->userns)
+		return false;
+
+	return token->allowed_cmds & (1ULL << cmd);
+}
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index e88746ba7d21..d4a567e5bc3c 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -847,6 +847,36 @@ union bpf_iter_link_info {
  *		Returns zero on success. On error, -1 is returned and *errno*
  *		is set appropriately.
  *
+ * BPF_TOKEN_CREATE
+ *	Description
+ *		Create BPF token with embedded information about what
+ *		BPF-related functionality it allows:
+ *		- a set of allowed bpf() syscall commands;
+ *		- a set of allowed BPF map types to be created with
+ *		BPF_MAP_CREATE command, if BPF_MAP_CREATE itself is allowed;
+ *		- a set of allowed BPF program types and BPF program attach
+ *		types to be loaded with BPF_PROG_LOAD command, if
+ *		BPF_PROG_LOAD itself is allowed.
+ *
+ *		BPF token is created (derived) from an instance of BPF FS,
+ *		assuming it has necessary delegation mount options specified.
+ *		This BPF token can be passed as an extra parameter to various
+ *		bpf() syscall commands to grant BPF subsystem functionality to
+ *		unprivileged processes.
+ *
+ *		When created, BPF token is "associated" with the owning
+ *		user namespace of BPF FS instance (super block) that it was
+ *		derived from, and subsequent BPF operations performed with
+ *		BPF token would be performing capabilities checks (i.e.,
+ *		CAP_BPF, CAP_PERFMON, CAP_NET_ADMIN, CAP_SYS_ADMIN) within
+ *		that user namespace. Without BPF token, such capabilities
+ *		have to be granted in init user namespace, making bpf()
+ *		syscall incompatible with user namespace, for the most part.
+ *
+ *	Return
+ *		A new file descriptor (a nonnegative integer), or -1 if an
+ *		error occurred (in which case, *errno* is set appropriately).
+ *
  * NOTES
  *	eBPF objects (maps and programs) can be shared between processes.
  *
@@ -901,6 +931,8 @@ enum bpf_cmd {
 	BPF_ITER_CREATE,
 	BPF_LINK_DETACH,
 	BPF_PROG_BIND_MAP,
+	BPF_TOKEN_CREATE,
+	__MAX_BPF_CMD,
 };
 
 enum bpf_map_type {
@@ -1712,6 +1744,11 @@ union bpf_attr {
 		__u32		flags;		/* extra flags */
 	} prog_bind_map;
 
+	struct { /* struct used by BPF_TOKEN_CREATE command */
+		__u32		flags;
+		__u32		bpffs_fd;
+	} token_create;
+
 } __attribute__((aligned(8)));
 
 /* The description below is an attempt at providing documentation to eBPF
-- 
cgit v1.2.3


From 688b7270b3cb75e8ac78123d719967db40336e5b Mon Sep 17 00:00:00 2001
From: Andrii Nakryiko <andrii@kernel.org>
Date: Thu, 30 Nov 2023 10:52:16 -0800
Subject: bpf: add BPF token support to BPF_MAP_CREATE command

Allow providing token_fd for BPF_MAP_CREATE command to allow controlled
BPF map creation from unprivileged process through delegated BPF token.

Wire through a set of allowed BPF map types to BPF token, derived from
BPF FS at BPF token creation time. This, in combination with allowed_cmds
allows to create a narrowly-focused BPF token (controlled by privileged
agent) with a restrictive set of BPF maps that application can attempt
to create.

Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/r/20231130185229.2688956-5-andrii@kernel.org
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/bpf.h                                |  2 +
 include/uapi/linux/bpf.h                           |  2 +
 kernel/bpf/inode.c                                 |  3 +-
 kernel/bpf/syscall.c                               | 52 ++++++++++++++++------
 kernel/bpf/token.c                                 | 16 +++++++
 tools/include/uapi/linux/bpf.h                     |  2 +
 .../selftests/bpf/prog_tests/libbpf_probes.c       |  2 +
 .../testing/selftests/bpf/prog_tests/libbpf_str.c  |  3 ++
 8 files changed, 67 insertions(+), 15 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index aa9cf8e5fab1..e08e8436df38 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -1600,6 +1600,7 @@ struct bpf_token {
 	atomic64_t refcnt;
 	struct user_namespace *userns;
 	u64 allowed_cmds;
+	u64 allowed_maps;
 };
 
 struct bpf_struct_ops_value;
@@ -2236,6 +2237,7 @@ int bpf_token_create(union bpf_attr *attr);
 struct bpf_token *bpf_token_get_from_fd(u32 ufd);
 
 bool bpf_token_allow_cmd(const struct bpf_token *token, enum bpf_cmd cmd);
+bool bpf_token_allow_map_type(const struct bpf_token *token, enum bpf_map_type type);
 
 int bpf_obj_pin_user(u32 ufd, int path_fd, const char __user *pathname);
 int bpf_obj_get_user(int path_fd, const char __user *pathname, int flags);
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index d4a567e5bc3c..0bba3392b17a 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -983,6 +983,7 @@ enum bpf_map_type {
 	BPF_MAP_TYPE_BLOOM_FILTER,
 	BPF_MAP_TYPE_USER_RINGBUF,
 	BPF_MAP_TYPE_CGRP_STORAGE,
+	__MAX_BPF_MAP_TYPE
 };
 
 /* Note that tracing related programs such as
@@ -1433,6 +1434,7 @@ union bpf_attr {
 		 * to using 5 hash functions).
 		 */
 		__u64	map_extra;
+		__u32	map_token_fd;
 	};
 
 	struct { /* anonymous struct used by BPF_MAP_*_ELEM commands */
diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c
index 6ce3f9696e72..9c7865d1c53d 100644
--- a/kernel/bpf/inode.c
+++ b/kernel/bpf/inode.c
@@ -613,7 +613,8 @@ static int bpf_show_options(struct seq_file *m, struct dentry *root)
 	else if (opts->delegate_cmds)
 		seq_printf(m, ",delegate_cmds=0x%llx", opts->delegate_cmds);
 
-	if (opts->delegate_maps == ~0ULL)
+	mask = (1ULL << __MAX_BPF_MAP_TYPE) - 1;
+	if ((opts->delegate_maps & mask) == mask)
 		seq_printf(m, ",delegate_maps=any");
 	else if (opts->delegate_maps)
 		seq_printf(m, ",delegate_maps=0x%llx", opts->delegate_maps);
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index a156d549b356..22e14124cd61 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -1009,8 +1009,8 @@ int map_check_no_btf(const struct bpf_map *map,
 	return -ENOTSUPP;
 }
 
-static int map_check_btf(struct bpf_map *map, const struct btf *btf,
-			 u32 btf_key_id, u32 btf_value_id)
+static int map_check_btf(struct bpf_map *map, struct bpf_token *token,
+			 const struct btf *btf, u32 btf_key_id, u32 btf_value_id)
 {
 	const struct btf_type *key_type, *value_type;
 	u32 key_size, value_size;
@@ -1038,7 +1038,7 @@ static int map_check_btf(struct bpf_map *map, const struct btf *btf,
 	if (!IS_ERR_OR_NULL(map->record)) {
 		int i;
 
-		if (!bpf_capable()) {
+		if (!bpf_token_capable(token, CAP_BPF)) {
 			ret = -EPERM;
 			goto free_map_tab;
 		}
@@ -1126,11 +1126,12 @@ static bool bpf_net_capable(void)
 	return capable(CAP_NET_ADMIN) || capable(CAP_SYS_ADMIN);
 }
 
-#define BPF_MAP_CREATE_LAST_FIELD map_extra
+#define BPF_MAP_CREATE_LAST_FIELD map_token_fd
 /* called via syscall */
 static int map_create(union bpf_attr *attr)
 {
 	const struct bpf_map_ops *ops;
+	struct bpf_token *token = NULL;
 	int numa_node = bpf_map_attr_numa_node(attr);
 	u32 map_type = attr->map_type;
 	struct bpf_map *map;
@@ -1181,14 +1182,32 @@ static int map_create(union bpf_attr *attr)
 	if (!ops->map_mem_usage)
 		return -EINVAL;
 
+	if (attr->map_token_fd) {
+		token = bpf_token_get_from_fd(attr->map_token_fd);
+		if (IS_ERR(token))
+			return PTR_ERR(token);
+
+		/* if current token doesn't grant map creation permissions,
+		 * then we can't use this token, so ignore it and rely on
+		 * system-wide capabilities checks
+		 */
+		if (!bpf_token_allow_cmd(token, BPF_MAP_CREATE) ||
+		    !bpf_token_allow_map_type(token, attr->map_type)) {
+			bpf_token_put(token);
+			token = NULL;
+		}
+	}
+
+	err = -EPERM;
+
 	/* Intent here is for unprivileged_bpf_disabled to block BPF map
 	 * creation for unprivileged users; other actions depend
 	 * on fd availability and access to bpffs, so are dependent on
 	 * object creation success. Even with unprivileged BPF disabled,
 	 * capability checks are still carried out.
 	 */
-	if (sysctl_unprivileged_bpf_disabled && !bpf_capable())
-		return -EPERM;
+	if (sysctl_unprivileged_bpf_disabled && !bpf_token_capable(token, CAP_BPF))
+		goto put_token;
 
 	/* check privileged map type permissions */
 	switch (map_type) {
@@ -1221,25 +1240,27 @@ static int map_create(union bpf_attr *attr)
 	case BPF_MAP_TYPE_LRU_PERCPU_HASH:
 	case BPF_MAP_TYPE_STRUCT_OPS:
 	case BPF_MAP_TYPE_CPUMAP:
-		if (!bpf_capable())
-			return -EPERM;
+		if (!bpf_token_capable(token, CAP_BPF))
+			goto put_token;
 		break;
 	case BPF_MAP_TYPE_SOCKMAP:
 	case BPF_MAP_TYPE_SOCKHASH:
 	case BPF_MAP_TYPE_DEVMAP:
 	case BPF_MAP_TYPE_DEVMAP_HASH:
 	case BPF_MAP_TYPE_XSKMAP:
-		if (!bpf_net_capable())
-			return -EPERM;
+		if (!bpf_token_capable(token, CAP_NET_ADMIN))
+			goto put_token;
 		break;
 	default:
 		WARN(1, "unsupported map type %d", map_type);
-		return -EPERM;
+		goto put_token;
 	}
 
 	map = ops->map_alloc(attr);
-	if (IS_ERR(map))
-		return PTR_ERR(map);
+	if (IS_ERR(map)) {
+		err = PTR_ERR(map);
+		goto put_token;
+	}
 	map->ops = ops;
 	map->map_type = map_type;
 
@@ -1276,7 +1297,7 @@ static int map_create(union bpf_attr *attr)
 		map->btf = btf;
 
 		if (attr->btf_value_type_id) {
-			err = map_check_btf(map, btf, attr->btf_key_type_id,
+			err = map_check_btf(map, token, btf, attr->btf_key_type_id,
 					    attr->btf_value_type_id);
 			if (err)
 				goto free_map;
@@ -1297,6 +1318,7 @@ static int map_create(union bpf_attr *attr)
 		goto free_map_sec;
 
 	bpf_map_save_memcg(map);
+	bpf_token_put(token);
 
 	err = bpf_map_new_fd(map, f_flags);
 	if (err < 0) {
@@ -1317,6 +1339,8 @@ free_map_sec:
 free_map:
 	btf_put(map->btf);
 	map->ops->map_free(map);
+put_token:
+	bpf_token_put(token);
 	return err;
 }
 
diff --git a/kernel/bpf/token.c b/kernel/bpf/token.c
index e18aaecc67e9..06c34dae658e 100644
--- a/kernel/bpf/token.c
+++ b/kernel/bpf/token.c
@@ -72,6 +72,13 @@ static void bpf_token_show_fdinfo(struct seq_file *m, struct file *filp)
 		seq_printf(m, "allowed_cmds:\tany\n");
 	else
 		seq_printf(m, "allowed_cmds:\t0x%llx\n", token->allowed_cmds);
+
+	BUILD_BUG_ON(__MAX_BPF_MAP_TYPE >= 64);
+	mask = (1ULL << __MAX_BPF_MAP_TYPE) - 1;
+	if ((token->allowed_maps & mask) == mask)
+		seq_printf(m, "allowed_maps:\tany\n");
+	else
+		seq_printf(m, "allowed_maps:\t0x%llx\n", token->allowed_maps);
 }
 
 #define BPF_TOKEN_INODE_NAME "bpf-token"
@@ -161,6 +168,7 @@ int bpf_token_create(union bpf_attr *attr)
 
 	mnt_opts = path.dentry->d_sb->s_fs_info;
 	token->allowed_cmds = mnt_opts->delegate_cmds;
+	token->allowed_maps = mnt_opts->delegate_maps;
 
 	fd = get_unused_fd_flags(O_CLOEXEC);
 	if (fd < 0) {
@@ -212,3 +220,11 @@ bool bpf_token_allow_cmd(const struct bpf_token *token, enum bpf_cmd cmd)
 
 	return token->allowed_cmds & (1ULL << cmd);
 }
+
+bool bpf_token_allow_map_type(const struct bpf_token *token, enum bpf_map_type type)
+{
+	if (!token || type >= __MAX_BPF_MAP_TYPE)
+		return false;
+
+	return token->allowed_maps & (1ULL << type);
+}
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index d4a567e5bc3c..0bba3392b17a 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -983,6 +983,7 @@ enum bpf_map_type {
 	BPF_MAP_TYPE_BLOOM_FILTER,
 	BPF_MAP_TYPE_USER_RINGBUF,
 	BPF_MAP_TYPE_CGRP_STORAGE,
+	__MAX_BPF_MAP_TYPE
 };
 
 /* Note that tracing related programs such as
@@ -1433,6 +1434,7 @@ union bpf_attr {
 		 * to using 5 hash functions).
 		 */
 		__u64	map_extra;
+		__u32	map_token_fd;
 	};
 
 	struct { /* anonymous struct used by BPF_MAP_*_ELEM commands */
diff --git a/tools/testing/selftests/bpf/prog_tests/libbpf_probes.c b/tools/testing/selftests/bpf/prog_tests/libbpf_probes.c
index 9f766ddd946a..573249a2814d 100644
--- a/tools/testing/selftests/bpf/prog_tests/libbpf_probes.c
+++ b/tools/testing/selftests/bpf/prog_tests/libbpf_probes.c
@@ -68,6 +68,8 @@ void test_libbpf_probe_map_types(void)
 
 		if (map_type == BPF_MAP_TYPE_UNSPEC)
 			continue;
+		if (strcmp(map_type_name, "__MAX_BPF_MAP_TYPE") == 0)
+			continue;
 
 		if (!test__start_subtest(map_type_name))
 			continue;
diff --git a/tools/testing/selftests/bpf/prog_tests/libbpf_str.c b/tools/testing/selftests/bpf/prog_tests/libbpf_str.c
index c440ea3311ed..2a0633f43c73 100644
--- a/tools/testing/selftests/bpf/prog_tests/libbpf_str.c
+++ b/tools/testing/selftests/bpf/prog_tests/libbpf_str.c
@@ -132,6 +132,9 @@ static void test_libbpf_bpf_map_type_str(void)
 		const char *map_type_str;
 		char buf[256];
 
+		if (map_type == __MAX_BPF_MAP_TYPE)
+			continue;
+
 		map_type_name = btf__str_by_offset(btf, e->name_off);
 		map_type_str = libbpf_bpf_map_type_str(map_type);
 		ASSERT_OK_PTR(map_type_str, map_type_name);
-- 
cgit v1.2.3


From e1cef620f598853a90f17701fcb1057a6768f7b8 Mon Sep 17 00:00:00 2001
From: Andrii Nakryiko <andrii@kernel.org>
Date: Thu, 30 Nov 2023 10:52:18 -0800
Subject: bpf: add BPF token support to BPF_PROG_LOAD command

Add basic support of BPF token to BPF_PROG_LOAD. Wire through a set of
allowed BPF program types and attach types, derived from BPF FS at BPF
token creation time. Then make sure we perform bpf_token_capable()
checks everywhere where it's relevant.

Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/r/20231130185229.2688956-7-andrii@kernel.org
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/bpf.h                                |  6 ++
 include/uapi/linux/bpf.h                           |  2 +
 kernel/bpf/core.c                                  |  1 +
 kernel/bpf/inode.c                                 |  6 +-
 kernel/bpf/syscall.c                               | 87 ++++++++++++++++------
 kernel/bpf/token.c                                 | 27 +++++++
 tools/include/uapi/linux/bpf.h                     |  2 +
 .../selftests/bpf/prog_tests/libbpf_probes.c       |  2 +
 .../testing/selftests/bpf/prog_tests/libbpf_str.c  |  3 +
 9 files changed, 110 insertions(+), 26 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index e08e8436df38..20af87b59d70 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -1461,6 +1461,7 @@ struct bpf_prog_aux {
 #ifdef CONFIG_SECURITY
 	void *security;
 #endif
+	struct bpf_token *token;
 	struct bpf_prog_offload *offload;
 	struct btf *btf;
 	struct bpf_func_info *func_info;
@@ -1601,6 +1602,8 @@ struct bpf_token {
 	struct user_namespace *userns;
 	u64 allowed_cmds;
 	u64 allowed_maps;
+	u64 allowed_progs;
+	u64 allowed_attachs;
 };
 
 struct bpf_struct_ops_value;
@@ -2238,6 +2241,9 @@ struct bpf_token *bpf_token_get_from_fd(u32 ufd);
 
 bool bpf_token_allow_cmd(const struct bpf_token *token, enum bpf_cmd cmd);
 bool bpf_token_allow_map_type(const struct bpf_token *token, enum bpf_map_type type);
+bool bpf_token_allow_prog_type(const struct bpf_token *token,
+			       enum bpf_prog_type prog_type,
+			       enum bpf_attach_type attach_type);
 
 int bpf_obj_pin_user(u32 ufd, int path_fd, const char __user *pathname);
 int bpf_obj_get_user(int path_fd, const char __user *pathname, int flags);
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 9f9989e0d062..4df2d025c784 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -1028,6 +1028,7 @@ enum bpf_prog_type {
 	BPF_PROG_TYPE_SK_LOOKUP,
 	BPF_PROG_TYPE_SYSCALL, /* a program that can execute syscalls */
 	BPF_PROG_TYPE_NETFILTER,
+	__MAX_BPF_PROG_TYPE
 };
 
 enum bpf_attach_type {
@@ -1504,6 +1505,7 @@ union bpf_attr {
 		 * truncated), or smaller (if log buffer wasn't filled completely).
 		 */
 		__u32		log_true_size;
+		__u32		prog_token_fd;
 	};
 
 	struct { /* anonymous struct used by BPF_OBJ_* commands */
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index 4b813da8d6c0..47085839af8d 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -2751,6 +2751,7 @@ void bpf_prog_free(struct bpf_prog *fp)
 
 	if (aux->dst_prog)
 		bpf_prog_put(aux->dst_prog);
+	bpf_token_put(aux->token);
 	INIT_WORK(&aux->work, bpf_prog_free_deferred);
 	schedule_work(&aux->work);
 }
diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c
index 9c7865d1c53d..5359a0929c35 100644
--- a/kernel/bpf/inode.c
+++ b/kernel/bpf/inode.c
@@ -619,12 +619,14 @@ static int bpf_show_options(struct seq_file *m, struct dentry *root)
 	else if (opts->delegate_maps)
 		seq_printf(m, ",delegate_maps=0x%llx", opts->delegate_maps);
 
-	if (opts->delegate_progs == ~0ULL)
+	mask = (1ULL << __MAX_BPF_PROG_TYPE) - 1;
+	if ((opts->delegate_progs & mask) == mask)
 		seq_printf(m, ",delegate_progs=any");
 	else if (opts->delegate_progs)
 		seq_printf(m, ",delegate_progs=0x%llx", opts->delegate_progs);
 
-	if (opts->delegate_attachs == ~0ULL)
+	mask = (1ULL << __MAX_BPF_ATTACH_TYPE) - 1;
+	if ((opts->delegate_attachs & mask) == mask)
 		seq_printf(m, ",delegate_attachs=any");
 	else if (opts->delegate_attachs)
 		seq_printf(m, ",delegate_attachs=0x%llx", opts->delegate_attachs);
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index d87c5c27cde3..2c8393c21b8c 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -2608,13 +2608,15 @@ static bool is_perfmon_prog_type(enum bpf_prog_type prog_type)
 }
 
 /* last field in 'union bpf_attr' used by this command */
-#define	BPF_PROG_LOAD_LAST_FIELD log_true_size
+#define BPF_PROG_LOAD_LAST_FIELD prog_token_fd
 
 static int bpf_prog_load(union bpf_attr *attr, bpfptr_t uattr, u32 uattr_size)
 {
 	enum bpf_prog_type type = attr->prog_type;
 	struct bpf_prog *prog, *dst_prog = NULL;
 	struct btf *attach_btf = NULL;
+	struct bpf_token *token = NULL;
+	bool bpf_cap;
 	int err;
 	char license[128];
 
@@ -2631,10 +2633,31 @@ static int bpf_prog_load(union bpf_attr *attr, bpfptr_t uattr, u32 uattr_size)
 				 BPF_F_TEST_REG_INVARIANTS))
 		return -EINVAL;
 
+	bpf_prog_load_fixup_attach_type(attr);
+
+	if (attr->prog_token_fd) {
+		token = bpf_token_get_from_fd(attr->prog_token_fd);
+		if (IS_ERR(token))
+			return PTR_ERR(token);
+		/* if current token doesn't grant prog loading permissions,
+		 * then we can't use this token, so ignore it and rely on
+		 * system-wide capabilities checks
+		 */
+		if (!bpf_token_allow_cmd(token, BPF_PROG_LOAD) ||
+		    !bpf_token_allow_prog_type(token, attr->prog_type,
+					       attr->expected_attach_type)) {
+			bpf_token_put(token);
+			token = NULL;
+		}
+	}
+
+	bpf_cap = bpf_token_capable(token, CAP_BPF);
+	err = -EPERM;
+
 	if (!IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) &&
 	    (attr->prog_flags & BPF_F_ANY_ALIGNMENT) &&
-	    !bpf_capable())
-		return -EPERM;
+	    !bpf_cap)
+		goto put_token;
 
 	/* Intent here is for unprivileged_bpf_disabled to block BPF program
 	 * creation for unprivileged users; other actions depend
@@ -2643,21 +2666,23 @@ static int bpf_prog_load(union bpf_attr *attr, bpfptr_t uattr, u32 uattr_size)
 	 * capability checks are still carried out for these
 	 * and other operations.
 	 */
-	if (sysctl_unprivileged_bpf_disabled && !bpf_capable())
-		return -EPERM;
+	if (sysctl_unprivileged_bpf_disabled && !bpf_cap)
+		goto put_token;
 
 	if (attr->insn_cnt == 0 ||
-	    attr->insn_cnt > (bpf_capable() ? BPF_COMPLEXITY_LIMIT_INSNS : BPF_MAXINSNS))
-		return -E2BIG;
+	    attr->insn_cnt > (bpf_cap ? BPF_COMPLEXITY_LIMIT_INSNS : BPF_MAXINSNS)) {
+		err = -E2BIG;
+		goto put_token;
+	}
 	if (type != BPF_PROG_TYPE_SOCKET_FILTER &&
 	    type != BPF_PROG_TYPE_CGROUP_SKB &&
-	    !bpf_capable())
-		return -EPERM;
+	    !bpf_cap)
+		goto put_token;
 
-	if (is_net_admin_prog_type(type) && !bpf_net_capable())
-		return -EPERM;
-	if (is_perfmon_prog_type(type) && !perfmon_capable())
-		return -EPERM;
+	if (is_net_admin_prog_type(type) && !bpf_token_capable(token, CAP_NET_ADMIN))
+		goto put_token;
+	if (is_perfmon_prog_type(type) && !bpf_token_capable(token, CAP_PERFMON))
+		goto put_token;
 
 	/* attach_prog_fd/attach_btf_obj_fd can specify fd of either bpf_prog
 	 * or btf, we need to check which one it is
@@ -2667,27 +2692,33 @@ static int bpf_prog_load(union bpf_attr *attr, bpfptr_t uattr, u32 uattr_size)
 		if (IS_ERR(dst_prog)) {
 			dst_prog = NULL;
 			attach_btf = btf_get_by_fd(attr->attach_btf_obj_fd);
-			if (IS_ERR(attach_btf))
-				return -EINVAL;
+			if (IS_ERR(attach_btf)) {
+				err = -EINVAL;
+				goto put_token;
+			}
 			if (!btf_is_kernel(attach_btf)) {
 				/* attaching through specifying bpf_prog's BTF
 				 * objects directly might be supported eventually
 				 */
 				btf_put(attach_btf);
-				return -ENOTSUPP;
+				err = -ENOTSUPP;
+				goto put_token;
 			}
 		}
 	} else if (attr->attach_btf_id) {
 		/* fall back to vmlinux BTF, if BTF type ID is specified */
 		attach_btf = bpf_get_btf_vmlinux();
-		if (IS_ERR(attach_btf))
-			return PTR_ERR(attach_btf);
-		if (!attach_btf)
-			return -EINVAL;
+		if (IS_ERR(attach_btf)) {
+			err = PTR_ERR(attach_btf);
+			goto put_token;
+		}
+		if (!attach_btf) {
+			err = -EINVAL;
+			goto put_token;
+		}
 		btf_get(attach_btf);
 	}
 
-	bpf_prog_load_fixup_attach_type(attr);
 	if (bpf_prog_load_check_attach(type, attr->expected_attach_type,
 				       attach_btf, attr->attach_btf_id,
 				       dst_prog)) {
@@ -2695,7 +2726,8 @@ static int bpf_prog_load(union bpf_attr *attr, bpfptr_t uattr, u32 uattr_size)
 			bpf_prog_put(dst_prog);
 		if (attach_btf)
 			btf_put(attach_btf);
-		return -EINVAL;
+		err = -EINVAL;
+		goto put_token;
 	}
 
 	/* plain bpf_prog allocation */
@@ -2705,7 +2737,8 @@ static int bpf_prog_load(union bpf_attr *attr, bpfptr_t uattr, u32 uattr_size)
 			bpf_prog_put(dst_prog);
 		if (attach_btf)
 			btf_put(attach_btf);
-		return -ENOMEM;
+		err = -EINVAL;
+		goto put_token;
 	}
 
 	prog->expected_attach_type = attr->expected_attach_type;
@@ -2716,6 +2749,10 @@ static int bpf_prog_load(union bpf_attr *attr, bpfptr_t uattr, u32 uattr_size)
 	prog->aux->sleepable = attr->prog_flags & BPF_F_SLEEPABLE;
 	prog->aux->xdp_has_frags = attr->prog_flags & BPF_F_XDP_HAS_FRAGS;
 
+	/* move token into prog->aux, reuse taken refcnt */
+	prog->aux->token = token;
+	token = NULL;
+
 	err = security_bpf_prog_alloc(prog->aux);
 	if (err)
 		goto free_prog;
@@ -2817,6 +2854,8 @@ free_prog:
 	if (prog->aux->attach_btf)
 		btf_put(prog->aux->attach_btf);
 	bpf_prog_free(prog);
+put_token:
+	bpf_token_put(token);
 	return err;
 }
 
@@ -3806,7 +3845,7 @@ static int bpf_prog_attach_check_attach_type(const struct bpf_prog *prog,
 	case BPF_PROG_TYPE_SK_LOOKUP:
 		return attach_type == prog->expected_attach_type ? 0 : -EINVAL;
 	case BPF_PROG_TYPE_CGROUP_SKB:
-		if (!bpf_net_capable())
+		if (!bpf_token_capable(prog->aux->token, CAP_NET_ADMIN))
 			/* cg-skb progs can be loaded by unpriv user.
 			 * check permissions at attach time.
 			 */
diff --git a/kernel/bpf/token.c b/kernel/bpf/token.c
index 06c34dae658e..5a51e6b8f6bf 100644
--- a/kernel/bpf/token.c
+++ b/kernel/bpf/token.c
@@ -79,6 +79,20 @@ static void bpf_token_show_fdinfo(struct seq_file *m, struct file *filp)
 		seq_printf(m, "allowed_maps:\tany\n");
 	else
 		seq_printf(m, "allowed_maps:\t0x%llx\n", token->allowed_maps);
+
+	BUILD_BUG_ON(__MAX_BPF_PROG_TYPE >= 64);
+	mask = (1ULL << __MAX_BPF_PROG_TYPE) - 1;
+	if ((token->allowed_progs & mask) == mask)
+		seq_printf(m, "allowed_progs:\tany\n");
+	else
+		seq_printf(m, "allowed_progs:\t0x%llx\n", token->allowed_progs);
+
+	BUILD_BUG_ON(__MAX_BPF_ATTACH_TYPE >= 64);
+	mask = (1ULL << __MAX_BPF_ATTACH_TYPE) - 1;
+	if ((token->allowed_attachs & mask) == mask)
+		seq_printf(m, "allowed_attachs:\tany\n");
+	else
+		seq_printf(m, "allowed_attachs:\t0x%llx\n", token->allowed_attachs);
 }
 
 #define BPF_TOKEN_INODE_NAME "bpf-token"
@@ -169,6 +183,8 @@ int bpf_token_create(union bpf_attr *attr)
 	mnt_opts = path.dentry->d_sb->s_fs_info;
 	token->allowed_cmds = mnt_opts->delegate_cmds;
 	token->allowed_maps = mnt_opts->delegate_maps;
+	token->allowed_progs = mnt_opts->delegate_progs;
+	token->allowed_attachs = mnt_opts->delegate_attachs;
 
 	fd = get_unused_fd_flags(O_CLOEXEC);
 	if (fd < 0) {
@@ -228,3 +244,14 @@ bool bpf_token_allow_map_type(const struct bpf_token *token, enum bpf_map_type t
 
 	return token->allowed_maps & (1ULL << type);
 }
+
+bool bpf_token_allow_prog_type(const struct bpf_token *token,
+			       enum bpf_prog_type prog_type,
+			       enum bpf_attach_type attach_type)
+{
+	if (!token || prog_type >= __MAX_BPF_PROG_TYPE || attach_type >= __MAX_BPF_ATTACH_TYPE)
+		return false;
+
+	return (token->allowed_progs & (1ULL << prog_type)) &&
+	       (token->allowed_attachs & (1ULL << attach_type));
+}
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index 9f9989e0d062..4df2d025c784 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -1028,6 +1028,7 @@ enum bpf_prog_type {
 	BPF_PROG_TYPE_SK_LOOKUP,
 	BPF_PROG_TYPE_SYSCALL, /* a program that can execute syscalls */
 	BPF_PROG_TYPE_NETFILTER,
+	__MAX_BPF_PROG_TYPE
 };
 
 enum bpf_attach_type {
@@ -1504,6 +1505,7 @@ union bpf_attr {
 		 * truncated), or smaller (if log buffer wasn't filled completely).
 		 */
 		__u32		log_true_size;
+		__u32		prog_token_fd;
 	};
 
 	struct { /* anonymous struct used by BPF_OBJ_* commands */
diff --git a/tools/testing/selftests/bpf/prog_tests/libbpf_probes.c b/tools/testing/selftests/bpf/prog_tests/libbpf_probes.c
index 573249a2814d..4ed46ed58a7b 100644
--- a/tools/testing/selftests/bpf/prog_tests/libbpf_probes.c
+++ b/tools/testing/selftests/bpf/prog_tests/libbpf_probes.c
@@ -30,6 +30,8 @@ void test_libbpf_probe_prog_types(void)
 
 		if (prog_type == BPF_PROG_TYPE_UNSPEC)
 			continue;
+		if (strcmp(prog_type_name, "__MAX_BPF_PROG_TYPE") == 0)
+			continue;
 
 		if (!test__start_subtest(prog_type_name))
 			continue;
diff --git a/tools/testing/selftests/bpf/prog_tests/libbpf_str.c b/tools/testing/selftests/bpf/prog_tests/libbpf_str.c
index 2a0633f43c73..384bc1f7a65e 100644
--- a/tools/testing/selftests/bpf/prog_tests/libbpf_str.c
+++ b/tools/testing/selftests/bpf/prog_tests/libbpf_str.c
@@ -189,6 +189,9 @@ static void test_libbpf_bpf_prog_type_str(void)
 		const char *prog_type_str;
 		char buf[256];
 
+		if (prog_type == __MAX_BPF_PROG_TYPE)
+			continue;
+
 		prog_type_name = btf__str_by_offset(btf, e->name_off);
 		prog_type_str = libbpf_bpf_prog_type_str(prog_type);
 		ASSERT_OK_PTR(prog_type_str, prog_type_name);
-- 
cgit v1.2.3


From 4cbb270e115bc197ff2046aeb54cc951666b16ec Mon Sep 17 00:00:00 2001
From: Andrii Nakryiko <andrii@kernel.org>
Date: Thu, 30 Nov 2023 10:52:19 -0800
Subject: bpf: take into account BPF token when fetching helper protos

Instead of performing unconditional system-wide bpf_capable() and
perfmon_capable() calls inside bpf_base_func_proto() function (and other
similar ones) to determine eligibility of a given BPF helper for a given
program, use previously recorded BPF token during BPF_PROG_LOAD command
handling to inform the decision.

Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/r/20231130185229.2688956-8-andrii@kernel.org
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 drivers/media/rc/bpf-lirc.c |  2 +-
 include/linux/bpf.h         |  5 +++--
 kernel/bpf/cgroup.c         |  6 +++---
 kernel/bpf/helpers.c        |  6 +++---
 kernel/bpf/syscall.c        |  5 +++--
 kernel/trace/bpf_trace.c    |  2 +-
 net/core/filter.c           | 32 ++++++++++++++++----------------
 net/ipv4/bpf_tcp_ca.c       |  2 +-
 net/netfilter/nf_bpf_link.c |  2 +-
 9 files changed, 32 insertions(+), 30 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/media/rc/bpf-lirc.c b/drivers/media/rc/bpf-lirc.c
index fe17c7f98e81..6d07693c6b9f 100644
--- a/drivers/media/rc/bpf-lirc.c
+++ b/drivers/media/rc/bpf-lirc.c
@@ -110,7 +110,7 @@ lirc_mode2_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 	case BPF_FUNC_get_prandom_u32:
 		return &bpf_get_prandom_u32_proto;
 	case BPF_FUNC_trace_printk:
-		if (perfmon_capable())
+		if (bpf_token_capable(prog->aux->token, CAP_PERFMON))
 			return bpf_get_trace_printk_proto();
 		fallthrough;
 	default:
diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 20af87b59d70..2a3ab4f3dd8c 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -2492,7 +2492,8 @@ const char *btf_find_decl_tag_value(const struct btf *btf, const struct btf_type
 struct bpf_prog *bpf_prog_by_id(u32 id);
 struct bpf_link *bpf_link_by_id(u32 id);
 
-const struct bpf_func_proto *bpf_base_func_proto(enum bpf_func_id func_id);
+const struct bpf_func_proto *bpf_base_func_proto(enum bpf_func_id func_id,
+						 const struct bpf_prog *prog);
 void bpf_task_storage_free(struct task_struct *task);
 void bpf_cgrp_storage_free(struct cgroup *cgroup);
 bool bpf_prog_has_kfunc_call(const struct bpf_prog *prog);
@@ -2752,7 +2753,7 @@ static inline int btf_struct_access(struct bpf_verifier_log *log,
 }
 
 static inline const struct bpf_func_proto *
-bpf_base_func_proto(enum bpf_func_id func_id)
+bpf_base_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 {
 	return NULL;
 }
diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c
index 491d20038cbe..98e0e3835b28 100644
--- a/kernel/bpf/cgroup.c
+++ b/kernel/bpf/cgroup.c
@@ -1630,7 +1630,7 @@ cgroup_dev_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 	case BPF_FUNC_perf_event_output:
 		return &bpf_event_output_data_proto;
 	default:
-		return bpf_base_func_proto(func_id);
+		return bpf_base_func_proto(func_id, prog);
 	}
 }
 
@@ -2191,7 +2191,7 @@ sysctl_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 	case BPF_FUNC_perf_event_output:
 		return &bpf_event_output_data_proto;
 	default:
-		return bpf_base_func_proto(func_id);
+		return bpf_base_func_proto(func_id, prog);
 	}
 }
 
@@ -2348,7 +2348,7 @@ cg_sockopt_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 	case BPF_FUNC_perf_event_output:
 		return &bpf_event_output_data_proto;
 	default:
-		return bpf_base_func_proto(func_id);
+		return bpf_base_func_proto(func_id, prog);
 	}
 }
 
diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
index ee9bdf29246a..b3be5742d6f1 100644
--- a/kernel/bpf/helpers.c
+++ b/kernel/bpf/helpers.c
@@ -1679,7 +1679,7 @@ const struct bpf_func_proto bpf_probe_read_kernel_str_proto __weak;
 const struct bpf_func_proto bpf_task_pt_regs_proto __weak;
 
 const struct bpf_func_proto *
-bpf_base_func_proto(enum bpf_func_id func_id)
+bpf_base_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 {
 	switch (func_id) {
 	case BPF_FUNC_map_lookup_elem:
@@ -1730,7 +1730,7 @@ bpf_base_func_proto(enum bpf_func_id func_id)
 		break;
 	}
 
-	if (!bpf_capable())
+	if (!bpf_token_capable(prog->aux->token, CAP_BPF))
 		return NULL;
 
 	switch (func_id) {
@@ -1788,7 +1788,7 @@ bpf_base_func_proto(enum bpf_func_id func_id)
 		break;
 	}
 
-	if (!perfmon_capable())
+	if (!bpf_token_capable(prog->aux->token, CAP_PERFMON))
 		return NULL;
 
 	switch (func_id) {
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 2c8393c21b8c..1cc03f08c9cd 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -5712,7 +5712,7 @@ static const struct bpf_func_proto bpf_sys_bpf_proto = {
 const struct bpf_func_proto * __weak
 tracing_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 {
-	return bpf_base_func_proto(func_id);
+	return bpf_base_func_proto(func_id, prog);
 }
 
 BPF_CALL_1(bpf_sys_close, u32, fd)
@@ -5762,7 +5762,8 @@ syscall_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 {
 	switch (func_id) {
 	case BPF_FUNC_sys_bpf:
-		return !perfmon_capable() ? NULL : &bpf_sys_bpf_proto;
+		return !bpf_token_capable(prog->aux->token, CAP_PERFMON)
+		       ? NULL : &bpf_sys_bpf_proto;
 	case BPF_FUNC_btf_find_by_name_kind:
 		return &bpf_btf_find_by_name_kind_proto;
 	case BPF_FUNC_sys_close:
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index 1648bde28f01..774cf476a892 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -1626,7 +1626,7 @@ bpf_tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 	case BPF_FUNC_trace_vprintk:
 		return bpf_get_trace_vprintk_proto();
 	default:
-		return bpf_base_func_proto(func_id);
+		return bpf_base_func_proto(func_id, prog);
 	}
 }
 
diff --git a/net/core/filter.c b/net/core/filter.c
index 0adaa4afa35f..0bf2a03d8203 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -87,7 +87,7 @@
 #include "dev.h"
 
 static const struct bpf_func_proto *
-bpf_sk_base_func_proto(enum bpf_func_id func_id);
+bpf_sk_base_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog);
 
 int copy_bpf_fprog_from_user(struct sock_fprog *dst, sockptr_t src, int len)
 {
@@ -7841,7 +7841,7 @@ sock_filter_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 	case BPF_FUNC_ktime_get_coarse_ns:
 		return &bpf_ktime_get_coarse_ns_proto;
 	default:
-		return bpf_base_func_proto(func_id);
+		return bpf_base_func_proto(func_id, prog);
 	}
 }
 
@@ -7934,7 +7934,7 @@ sock_addr_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 			return NULL;
 		}
 	default:
-		return bpf_sk_base_func_proto(func_id);
+		return bpf_sk_base_func_proto(func_id, prog);
 	}
 }
 
@@ -7953,7 +7953,7 @@ sk_filter_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 	case BPF_FUNC_perf_event_output:
 		return &bpf_skb_event_output_proto;
 	default:
-		return bpf_sk_base_func_proto(func_id);
+		return bpf_sk_base_func_proto(func_id, prog);
 	}
 }
 
@@ -8140,7 +8140,7 @@ tc_cls_act_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 #endif
 #endif
 	default:
-		return bpf_sk_base_func_proto(func_id);
+		return bpf_sk_base_func_proto(func_id, prog);
 	}
 }
 
@@ -8199,7 +8199,7 @@ xdp_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 #endif
 #endif
 	default:
-		return bpf_sk_base_func_proto(func_id);
+		return bpf_sk_base_func_proto(func_id, prog);
 	}
 
 #if IS_MODULE(CONFIG_NF_CONNTRACK) && IS_ENABLED(CONFIG_DEBUG_INFO_BTF_MODULES)
@@ -8260,7 +8260,7 @@ sock_ops_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 		return &bpf_tcp_sock_proto;
 #endif /* CONFIG_INET */
 	default:
-		return bpf_sk_base_func_proto(func_id);
+		return bpf_sk_base_func_proto(func_id, prog);
 	}
 }
 
@@ -8302,7 +8302,7 @@ sk_msg_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 		return &bpf_get_cgroup_classid_curr_proto;
 #endif
 	default:
-		return bpf_sk_base_func_proto(func_id);
+		return bpf_sk_base_func_proto(func_id, prog);
 	}
 }
 
@@ -8346,7 +8346,7 @@ sk_skb_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 		return &bpf_skc_lookup_tcp_proto;
 #endif
 	default:
-		return bpf_sk_base_func_proto(func_id);
+		return bpf_sk_base_func_proto(func_id, prog);
 	}
 }
 
@@ -8357,7 +8357,7 @@ flow_dissector_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 	case BPF_FUNC_skb_load_bytes:
 		return &bpf_flow_dissector_load_bytes_proto;
 	default:
-		return bpf_sk_base_func_proto(func_id);
+		return bpf_sk_base_func_proto(func_id, prog);
 	}
 }
 
@@ -8384,7 +8384,7 @@ lwt_out_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 	case BPF_FUNC_skb_under_cgroup:
 		return &bpf_skb_under_cgroup_proto;
 	default:
-		return bpf_sk_base_func_proto(func_id);
+		return bpf_sk_base_func_proto(func_id, prog);
 	}
 }
 
@@ -11215,7 +11215,7 @@ sk_reuseport_func_proto(enum bpf_func_id func_id,
 	case BPF_FUNC_ktime_get_coarse_ns:
 		return &bpf_ktime_get_coarse_ns_proto;
 	default:
-		return bpf_base_func_proto(func_id);
+		return bpf_base_func_proto(func_id, prog);
 	}
 }
 
@@ -11397,7 +11397,7 @@ sk_lookup_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 	case BPF_FUNC_sk_release:
 		return &bpf_sk_release_proto;
 	default:
-		return bpf_sk_base_func_proto(func_id);
+		return bpf_sk_base_func_proto(func_id, prog);
 	}
 }
 
@@ -11731,7 +11731,7 @@ const struct bpf_func_proto bpf_sock_from_file_proto = {
 };
 
 static const struct bpf_func_proto *
-bpf_sk_base_func_proto(enum bpf_func_id func_id)
+bpf_sk_base_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 {
 	const struct bpf_func_proto *func;
 
@@ -11760,10 +11760,10 @@ bpf_sk_base_func_proto(enum bpf_func_id func_id)
 	case BPF_FUNC_ktime_get_coarse_ns:
 		return &bpf_ktime_get_coarse_ns_proto;
 	default:
-		return bpf_base_func_proto(func_id);
+		return bpf_base_func_proto(func_id, prog);
 	}
 
-	if (!perfmon_capable())
+	if (!bpf_token_capable(prog->aux->token, CAP_PERFMON))
 		return NULL;
 
 	return func;
diff --git a/net/ipv4/bpf_tcp_ca.c b/net/ipv4/bpf_tcp_ca.c
index 39dcccf0f174..c7bbd8f3c708 100644
--- a/net/ipv4/bpf_tcp_ca.c
+++ b/net/ipv4/bpf_tcp_ca.c
@@ -191,7 +191,7 @@ bpf_tcp_ca_get_func_proto(enum bpf_func_id func_id,
 	case BPF_FUNC_ktime_get_coarse_ns:
 		return &bpf_ktime_get_coarse_ns_proto;
 	default:
-		return bpf_base_func_proto(func_id);
+		return bpf_base_func_proto(func_id, prog);
 	}
 }
 
diff --git a/net/netfilter/nf_bpf_link.c b/net/netfilter/nf_bpf_link.c
index e502ec00b2fe..1969facac91c 100644
--- a/net/netfilter/nf_bpf_link.c
+++ b/net/netfilter/nf_bpf_link.c
@@ -314,7 +314,7 @@ static bool nf_is_valid_access(int off, int size, enum bpf_access_type type,
 static const struct bpf_func_proto *
 bpf_nf_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 {
-	return bpf_base_func_proto(func_id);
+	return bpf_base_func_proto(func_id, prog);
 }
 
 const struct bpf_verifier_ops netfilter_verifier_ops = {
-- 
cgit v1.2.3


From 8062fb12de99b2da33754c6a3be1bfc30d9a35f4 Mon Sep 17 00:00:00 2001
From: Andrii Nakryiko <andrii@kernel.org>
Date: Thu, 30 Nov 2023 10:52:20 -0800
Subject: bpf: consistently use BPF token throughout BPF verifier logic

Remove remaining direct queries to perfmon_capable() and bpf_capable()
in BPF verifier logic and instead use BPF token (if available) to make
decisions about privileges.

Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/r/20231130185229.2688956-9-andrii@kernel.org
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/bpf.h    | 16 ++++++++--------
 include/linux/filter.h |  2 +-
 kernel/bpf/arraymap.c  |  2 +-
 kernel/bpf/core.c      |  2 +-
 kernel/bpf/verifier.c  | 13 ++++++-------
 net/core/filter.c      |  4 ++--
 6 files changed, 19 insertions(+), 20 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 2a3ab4f3dd8c..435abad3cc61 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -2200,24 +2200,24 @@ extern int sysctl_unprivileged_bpf_disabled;
 
 bool bpf_token_capable(const struct bpf_token *token, int cap);
 
-static inline bool bpf_allow_ptr_leaks(void)
+static inline bool bpf_allow_ptr_leaks(const struct bpf_token *token)
 {
-	return perfmon_capable();
+	return bpf_token_capable(token, CAP_PERFMON);
 }
 
-static inline bool bpf_allow_uninit_stack(void)
+static inline bool bpf_allow_uninit_stack(const struct bpf_token *token)
 {
-	return perfmon_capable();
+	return bpf_token_capable(token, CAP_PERFMON);
 }
 
-static inline bool bpf_bypass_spec_v1(void)
+static inline bool bpf_bypass_spec_v1(const struct bpf_token *token)
 {
-	return cpu_mitigations_off() || perfmon_capable();
+	return cpu_mitigations_off() || bpf_token_capable(token, CAP_PERFMON);
 }
 
-static inline bool bpf_bypass_spec_v4(void)
+static inline bool bpf_bypass_spec_v4(const struct bpf_token *token)
 {
-	return cpu_mitigations_off() || perfmon_capable();
+	return cpu_mitigations_off() || bpf_token_capable(token, CAP_PERFMON);
 }
 
 int bpf_map_new_fd(struct bpf_map *map, int flags);
diff --git a/include/linux/filter.h b/include/linux/filter.h
index a4953fafc8cb..14354605ad26 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -1139,7 +1139,7 @@ static inline bool bpf_jit_blinding_enabled(struct bpf_prog *prog)
 		return false;
 	if (!bpf_jit_harden)
 		return false;
-	if (bpf_jit_harden == 1 && bpf_capable())
+	if (bpf_jit_harden == 1 && bpf_token_capable(prog->aux->token, CAP_BPF))
 		return false;
 
 	return true;
diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c
index 4a4a67956e21..8d365bda9a8b 100644
--- a/kernel/bpf/arraymap.c
+++ b/kernel/bpf/arraymap.c
@@ -82,7 +82,7 @@ static struct bpf_map *array_map_alloc(union bpf_attr *attr)
 	bool percpu = attr->map_type == BPF_MAP_TYPE_PERCPU_ARRAY;
 	int numa_node = bpf_map_attr_numa_node(attr);
 	u32 elem_size, index_mask, max_entries;
-	bool bypass_spec_v1 = bpf_bypass_spec_v1();
+	bool bypass_spec_v1 = bpf_bypass_spec_v1(NULL);
 	u64 array_size, mask64;
 	struct bpf_array *array;
 
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index 47085839af8d..ced511f44174 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -675,7 +675,7 @@ static bool bpf_prog_kallsyms_candidate(const struct bpf_prog *fp)
 void bpf_prog_kallsyms_add(struct bpf_prog *fp)
 {
 	if (!bpf_prog_kallsyms_candidate(fp) ||
-	    !bpf_capable())
+	    !bpf_token_capable(fp->aux->token, CAP_BPF))
 		return;
 
 	bpf_prog_ksym_set_addr(fp);
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index e5ce530641ba..45e85fb76d82 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -20597,7 +20597,12 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, bpfptr_t uattr, __u3
 	env->prog = *prog;
 	env->ops = bpf_verifier_ops[env->prog->type];
 	env->fd_array = make_bpfptr(attr->fd_array, uattr.is_kernel);
-	is_priv = bpf_capable();
+
+	env->allow_ptr_leaks = bpf_allow_ptr_leaks(env->prog->aux->token);
+	env->allow_uninit_stack = bpf_allow_uninit_stack(env->prog->aux->token);
+	env->bypass_spec_v1 = bpf_bypass_spec_v1(env->prog->aux->token);
+	env->bypass_spec_v4 = bpf_bypass_spec_v4(env->prog->aux->token);
+	env->bpf_capable = is_priv = bpf_token_capable(env->prog->aux->token, CAP_BPF);
 
 	bpf_get_btf_vmlinux();
 
@@ -20629,12 +20634,6 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, bpfptr_t uattr, __u3
 	if (attr->prog_flags & BPF_F_ANY_ALIGNMENT)
 		env->strict_alignment = false;
 
-	env->allow_ptr_leaks = bpf_allow_ptr_leaks();
-	env->allow_uninit_stack = bpf_allow_uninit_stack();
-	env->bypass_spec_v1 = bpf_bypass_spec_v1();
-	env->bypass_spec_v4 = bpf_bypass_spec_v4();
-	env->bpf_capable = bpf_capable();
-
 	if (is_priv)
 		env->test_state_freq = attr->prog_flags & BPF_F_TEST_STATE_FREQ;
 	env->test_reg_invariants = attr->prog_flags & BPF_F_TEST_REG_INVARIANTS;
diff --git a/net/core/filter.c b/net/core/filter.c
index 0bf2a03d8203..adcfc2c25754 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -8559,7 +8559,7 @@ static bool cg_skb_is_valid_access(int off, int size,
 		return false;
 	case bpf_ctx_range(struct __sk_buff, data):
 	case bpf_ctx_range(struct __sk_buff, data_end):
-		if (!bpf_capable())
+		if (!bpf_token_capable(prog->aux->token, CAP_BPF))
 			return false;
 		break;
 	}
@@ -8571,7 +8571,7 @@ static bool cg_skb_is_valid_access(int off, int size,
 		case bpf_ctx_range_till(struct __sk_buff, cb[0], cb[4]):
 			break;
 		case bpf_ctx_range(struct __sk_buff, tstamp):
-			if (!bpf_capable())
+			if (!bpf_token_capable(prog->aux->token, CAP_BPF))
 				return false;
 			break;
 		default:
-- 
cgit v1.2.3


From c3dd6e94df7193f33f45d33303f5e85afb2a72dc Mon Sep 17 00:00:00 2001
From: Andrii Nakryiko <andrii@kernel.org>
Date: Thu, 30 Nov 2023 10:52:21 -0800
Subject: bpf,lsm: refactor bpf_prog_alloc/bpf_prog_free LSM hooks

Based on upstream discussion ([0]), rework existing
bpf_prog_alloc_security LSM hook. Rename it to bpf_prog_load and instead
of passing bpf_prog_aux, pass proper bpf_prog pointer for a full BPF
program struct. Also, we pass bpf_attr union with all the user-provided
arguments for BPF_PROG_LOAD command.  This will give LSMs as much
information as we can basically provide.

The hook is also BPF token-aware now, and optional bpf_token struct is
passed as a third argument. bpf_prog_load LSM hook is called after
a bunch of sanity checks were performed, bpf_prog and bpf_prog_aux were
allocated and filled out, but right before performing full-fledged BPF
verification step.

bpf_prog_free LSM hook is now accepting struct bpf_prog argument, for
consistency. SELinux code is adjusted to all new names, types, and
signatures.

Note, given that bpf_prog_load (previously bpf_prog_alloc) hook can be
used by some LSMs to allocate extra security blob, but also by other
LSMs to reject BPF program loading, we need to make sure that
bpf_prog_free LSM hook is called after bpf_prog_load/bpf_prog_alloc one
*even* if the hook itself returned error. If we don't do that, we run
the risk of leaking memory. This seems to be possible today when
combining SELinux and BPF LSM, as one example, depending on their
relative ordering.

Also, for BPF LSM setup, add bpf_prog_load and bpf_prog_free to
sleepable LSM hooks list, as they are both executed in sleepable
context. Also drop bpf_prog_load hook from untrusted, as there is no
issue with refcount or anything else anymore, that originally forced us
to add it to untrusted list in c0c852dd1876 ("bpf: Do not mark certain LSM
hook arguments as trusted"). We now trigger this hook much later and it
should not be an issue anymore.

  [0] https://lore.kernel.org/bpf/9fe88aef7deabbe87d3fc38c4aea3c69.paul@paul-moore.com/

Acked-by: Paul Moore <paul@paul-moore.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/r/20231130185229.2688956-10-andrii@kernel.org
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/lsm_hook_defs.h |  5 +++--
 include/linux/security.h      | 12 +++++++-----
 kernel/bpf/bpf_lsm.c          |  5 +++--
 kernel/bpf/syscall.c          | 25 +++++++++++++------------
 security/security.c           | 25 +++++++++++++++----------
 security/selinux/hooks.c      | 15 ++++++++-------
 6 files changed, 49 insertions(+), 38 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/lsm_hook_defs.h b/include/linux/lsm_hook_defs.h
index ff217a5ce552..41ec4a7c070e 100644
--- a/include/linux/lsm_hook_defs.h
+++ b/include/linux/lsm_hook_defs.h
@@ -400,8 +400,9 @@ LSM_HOOK(int, 0, bpf_map, struct bpf_map *map, fmode_t fmode)
 LSM_HOOK(int, 0, bpf_prog, struct bpf_prog *prog)
 LSM_HOOK(int, 0, bpf_map_alloc_security, struct bpf_map *map)
 LSM_HOOK(void, LSM_RET_VOID, bpf_map_free_security, struct bpf_map *map)
-LSM_HOOK(int, 0, bpf_prog_alloc_security, struct bpf_prog_aux *aux)
-LSM_HOOK(void, LSM_RET_VOID, bpf_prog_free_security, struct bpf_prog_aux *aux)
+LSM_HOOK(int, 0, bpf_prog_load, struct bpf_prog *prog, union bpf_attr *attr,
+	 struct bpf_token *token)
+LSM_HOOK(void, LSM_RET_VOID, bpf_prog_free, struct bpf_prog *prog)
 #endif /* CONFIG_BPF_SYSCALL */
 
 LSM_HOOK(int, 0, locked_down, enum lockdown_reason what)
diff --git a/include/linux/security.h b/include/linux/security.h
index 1d1df326c881..65467eef6678 100644
--- a/include/linux/security.h
+++ b/include/linux/security.h
@@ -2020,15 +2020,16 @@ static inline void securityfs_remove(struct dentry *dentry)
 union bpf_attr;
 struct bpf_map;
 struct bpf_prog;
-struct bpf_prog_aux;
+struct bpf_token;
 #ifdef CONFIG_SECURITY
 extern int security_bpf(int cmd, union bpf_attr *attr, unsigned int size);
 extern int security_bpf_map(struct bpf_map *map, fmode_t fmode);
 extern int security_bpf_prog(struct bpf_prog *prog);
 extern int security_bpf_map_alloc(struct bpf_map *map);
 extern void security_bpf_map_free(struct bpf_map *map);
-extern int security_bpf_prog_alloc(struct bpf_prog_aux *aux);
-extern void security_bpf_prog_free(struct bpf_prog_aux *aux);
+extern int security_bpf_prog_load(struct bpf_prog *prog, union bpf_attr *attr,
+				  struct bpf_token *token);
+extern void security_bpf_prog_free(struct bpf_prog *prog);
 #else
 static inline int security_bpf(int cmd, union bpf_attr *attr,
 					     unsigned int size)
@@ -2054,12 +2055,13 @@ static inline int security_bpf_map_alloc(struct bpf_map *map)
 static inline void security_bpf_map_free(struct bpf_map *map)
 { }
 
-static inline int security_bpf_prog_alloc(struct bpf_prog_aux *aux)
+static inline int security_bpf_prog_load(struct bpf_prog *prog, union bpf_attr *attr,
+					 struct bpf_token *token)
 {
 	return 0;
 }
 
-static inline void security_bpf_prog_free(struct bpf_prog_aux *aux)
+static inline void security_bpf_prog_free(struct bpf_prog *prog)
 { }
 #endif /* CONFIG_SECURITY */
 #endif /* CONFIG_BPF_SYSCALL */
diff --git a/kernel/bpf/bpf_lsm.c b/kernel/bpf/bpf_lsm.c
index e14c822f8911..3e956f6302f3 100644
--- a/kernel/bpf/bpf_lsm.c
+++ b/kernel/bpf/bpf_lsm.c
@@ -263,6 +263,8 @@ BTF_ID(func, bpf_lsm_bpf_map)
 BTF_ID(func, bpf_lsm_bpf_map_alloc_security)
 BTF_ID(func, bpf_lsm_bpf_map_free_security)
 BTF_ID(func, bpf_lsm_bpf_prog)
+BTF_ID(func, bpf_lsm_bpf_prog_load)
+BTF_ID(func, bpf_lsm_bpf_prog_free)
 BTF_ID(func, bpf_lsm_bprm_check_security)
 BTF_ID(func, bpf_lsm_bprm_committed_creds)
 BTF_ID(func, bpf_lsm_bprm_committing_creds)
@@ -346,8 +348,7 @@ BTF_SET_END(sleepable_lsm_hooks)
 
 BTF_SET_START(untrusted_lsm_hooks)
 BTF_ID(func, bpf_lsm_bpf_map_free_security)
-BTF_ID(func, bpf_lsm_bpf_prog_alloc_security)
-BTF_ID(func, bpf_lsm_bpf_prog_free_security)
+BTF_ID(func, bpf_lsm_bpf_prog_free)
 BTF_ID(func, bpf_lsm_file_alloc_security)
 BTF_ID(func, bpf_lsm_file_free_security)
 #ifdef CONFIG_SECURITY_NETWORK
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 1cc03f08c9cd..7717c7c7b95d 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -2162,7 +2162,7 @@ static void __bpf_prog_put_rcu(struct rcu_head *rcu)
 	kvfree(aux->func_info);
 	kfree(aux->func_info_aux);
 	free_uid(aux->user);
-	security_bpf_prog_free(aux);
+	security_bpf_prog_free(aux->prog);
 	bpf_prog_free(aux->prog);
 }
 
@@ -2753,10 +2753,6 @@ static int bpf_prog_load(union bpf_attr *attr, bpfptr_t uattr, u32 uattr_size)
 	prog->aux->token = token;
 	token = NULL;
 
-	err = security_bpf_prog_alloc(prog->aux);
-	if (err)
-		goto free_prog;
-
 	prog->aux->user = get_current_user();
 	prog->len = attr->insn_cnt;
 
@@ -2764,12 +2760,12 @@ static int bpf_prog_load(union bpf_attr *attr, bpfptr_t uattr, u32 uattr_size)
 	if (copy_from_bpfptr(prog->insns,
 			     make_bpfptr(attr->insns, uattr.is_kernel),
 			     bpf_prog_insn_size(prog)) != 0)
-		goto free_prog_sec;
+		goto free_prog;
 	/* copy eBPF program license from user space */
 	if (strncpy_from_bpfptr(license,
 				make_bpfptr(attr->license, uattr.is_kernel),
 				sizeof(license) - 1) < 0)
-		goto free_prog_sec;
+		goto free_prog;
 	license[sizeof(license) - 1] = 0;
 
 	/* eBPF programs must be GPL compatible to use GPL-ed functions */
@@ -2783,25 +2779,29 @@ static int bpf_prog_load(union bpf_attr *attr, bpfptr_t uattr, u32 uattr_size)
 	if (bpf_prog_is_dev_bound(prog->aux)) {
 		err = bpf_prog_dev_bound_init(prog, attr);
 		if (err)
-			goto free_prog_sec;
+			goto free_prog;
 	}
 
 	if (type == BPF_PROG_TYPE_EXT && dst_prog &&
 	    bpf_prog_is_dev_bound(dst_prog->aux)) {
 		err = bpf_prog_dev_bound_inherit(prog, dst_prog);
 		if (err)
-			goto free_prog_sec;
+			goto free_prog;
 	}
 
 	/* find program type: socket_filter vs tracing_filter */
 	err = find_prog_type(type, prog);
 	if (err < 0)
-		goto free_prog_sec;
+		goto free_prog;
 
 	prog->aux->load_time = ktime_get_boottime_ns();
 	err = bpf_obj_name_cpy(prog->aux->name, attr->prog_name,
 			       sizeof(attr->prog_name));
 	if (err < 0)
+		goto free_prog;
+
+	err = security_bpf_prog_load(prog, attr, token);
+	if (err)
 		goto free_prog_sec;
 
 	/* run eBPF verifier */
@@ -2847,10 +2847,11 @@ free_used_maps:
 	 */
 	__bpf_prog_put_noref(prog, prog->aux->real_func_cnt);
 	return err;
+
 free_prog_sec:
-	free_uid(prog->aux->user);
-	security_bpf_prog_free(prog->aux);
+	security_bpf_prog_free(prog);
 free_prog:
+	free_uid(prog->aux->user);
 	if (prog->aux->attach_btf)
 		btf_put(prog->aux->attach_btf);
 	bpf_prog_free(prog);
diff --git a/security/security.c b/security/security.c
index dcb3e7014f9b..c8a1c66cfaad 100644
--- a/security/security.c
+++ b/security/security.c
@@ -5180,16 +5180,21 @@ int security_bpf_map_alloc(struct bpf_map *map)
 }
 
 /**
- * security_bpf_prog_alloc() - Allocate a bpf program LSM blob
- * @aux: bpf program aux info struct
+ * security_bpf_prog_load() - Check if loading of BPF program is allowed
+ * @prog: BPF program object
+ * @attr: BPF syscall attributes used to create BPF program
+ * @token: BPF token used to grant user access to BPF subsystem
  *
- * Initialize the security field inside bpf program.
+ * Perform an access control check when the kernel loads a BPF program and
+ * allocates associated BPF program object. This hook is also responsible for
+ * allocating any required LSM state for the BPF program.
  *
  * Return: Returns 0 on success, error on failure.
  */
-int security_bpf_prog_alloc(struct bpf_prog_aux *aux)
+int security_bpf_prog_load(struct bpf_prog *prog, union bpf_attr *attr,
+			   struct bpf_token *token)
 {
-	return call_int_hook(bpf_prog_alloc_security, 0, aux);
+	return call_int_hook(bpf_prog_load, 0, prog, attr, token);
 }
 
 /**
@@ -5204,14 +5209,14 @@ void security_bpf_map_free(struct bpf_map *map)
 }
 
 /**
- * security_bpf_prog_free() - Free a bpf program's LSM blob
- * @aux: bpf program aux info struct
+ * security_bpf_prog_free() - Free a BPF program's LSM blob
+ * @prog: BPF program struct
  *
- * Clean up the security information stored inside bpf prog.
+ * Clean up the security information stored inside BPF program.
  */
-void security_bpf_prog_free(struct bpf_prog_aux *aux)
+void security_bpf_prog_free(struct bpf_prog *prog)
 {
-	call_void_hook(bpf_prog_free_security, aux);
+	call_void_hook(bpf_prog_free, prog);
 }
 #endif /* CONFIG_BPF_SYSCALL */
 
diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c
index feda711c6b7b..eabee39e983c 100644
--- a/security/selinux/hooks.c
+++ b/security/selinux/hooks.c
@@ -6805,7 +6805,8 @@ static void selinux_bpf_map_free(struct bpf_map *map)
 	kfree(bpfsec);
 }
 
-static int selinux_bpf_prog_alloc(struct bpf_prog_aux *aux)
+static int selinux_bpf_prog_load(struct bpf_prog *prog, union bpf_attr *attr,
+				 struct bpf_token *token)
 {
 	struct bpf_security_struct *bpfsec;
 
@@ -6814,16 +6815,16 @@ static int selinux_bpf_prog_alloc(struct bpf_prog_aux *aux)
 		return -ENOMEM;
 
 	bpfsec->sid = current_sid();
-	aux->security = bpfsec;
+	prog->aux->security = bpfsec;
 
 	return 0;
 }
 
-static void selinux_bpf_prog_free(struct bpf_prog_aux *aux)
+static void selinux_bpf_prog_free(struct bpf_prog *prog)
 {
-	struct bpf_security_struct *bpfsec = aux->security;
+	struct bpf_security_struct *bpfsec = prog->aux->security;
 
-	aux->security = NULL;
+	prog->aux->security = NULL;
 	kfree(bpfsec);
 }
 #endif
@@ -7180,7 +7181,7 @@ static struct security_hook_list selinux_hooks[] __ro_after_init = {
 	LSM_HOOK_INIT(bpf_map, selinux_bpf_map),
 	LSM_HOOK_INIT(bpf_prog, selinux_bpf_prog),
 	LSM_HOOK_INIT(bpf_map_free_security, selinux_bpf_map_free),
-	LSM_HOOK_INIT(bpf_prog_free_security, selinux_bpf_prog_free),
+	LSM_HOOK_INIT(bpf_prog_free, selinux_bpf_prog_free),
 #endif
 
 #ifdef CONFIG_PERF_EVENTS
@@ -7238,7 +7239,7 @@ static struct security_hook_list selinux_hooks[] __ro_after_init = {
 #endif
 #ifdef CONFIG_BPF_SYSCALL
 	LSM_HOOK_INIT(bpf_map_alloc_security, selinux_bpf_map_alloc),
-	LSM_HOOK_INIT(bpf_prog_alloc_security, selinux_bpf_prog_alloc),
+	LSM_HOOK_INIT(bpf_prog_load, selinux_bpf_prog_load),
 #endif
 #ifdef CONFIG_PERF_EVENTS
 	LSM_HOOK_INIT(perf_event_alloc, selinux_perf_event_alloc),
-- 
cgit v1.2.3


From 66d636d70a79c1d37e3eea67ab50969e6aaef983 Mon Sep 17 00:00:00 2001
From: Andrii Nakryiko <andrii@kernel.org>
Date: Thu, 30 Nov 2023 10:52:22 -0800
Subject: bpf,lsm: refactor bpf_map_alloc/bpf_map_free LSM hooks

Similarly to bpf_prog_alloc LSM hook, rename and extend bpf_map_alloc
hook into bpf_map_create, taking not just struct bpf_map, but also
bpf_attr and bpf_token, to give a fuller context to LSMs.

Unlike bpf_prog_alloc, there is no need to move the hook around, as it
currently is firing right before allocating BPF map ID and FD, which
seems to be a sweet spot.

But like bpf_prog_alloc/bpf_prog_free combo, make sure that bpf_map_free
LSM hook is called even if bpf_map_create hook returned error, as if few
LSMs are combined together it could be that one LSM successfully
allocated security blob for its needs, while subsequent LSM rejected BPF
map creation. The former LSM would still need to free up LSM blob, so we
need to ensure security_bpf_map_free() is called regardless of the
outcome.

Acked-by: Paul Moore <paul@paul-moore.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/r/20231130185229.2688956-11-andrii@kernel.org
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/lsm_hook_defs.h |  5 +++--
 include/linux/security.h      |  6 ++++--
 kernel/bpf/bpf_lsm.c          |  6 +++---
 kernel/bpf/syscall.c          |  4 ++--
 security/security.c           | 16 ++++++++++------
 security/selinux/hooks.c      |  7 ++++---
 6 files changed, 26 insertions(+), 18 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/lsm_hook_defs.h b/include/linux/lsm_hook_defs.h
index 41ec4a7c070e..adb25cc63ce3 100644
--- a/include/linux/lsm_hook_defs.h
+++ b/include/linux/lsm_hook_defs.h
@@ -398,8 +398,9 @@ LSM_HOOK(void, LSM_RET_VOID, audit_rule_free, void *lsmrule)
 LSM_HOOK(int, 0, bpf, int cmd, union bpf_attr *attr, unsigned int size)
 LSM_HOOK(int, 0, bpf_map, struct bpf_map *map, fmode_t fmode)
 LSM_HOOK(int, 0, bpf_prog, struct bpf_prog *prog)
-LSM_HOOK(int, 0, bpf_map_alloc_security, struct bpf_map *map)
-LSM_HOOK(void, LSM_RET_VOID, bpf_map_free_security, struct bpf_map *map)
+LSM_HOOK(int, 0, bpf_map_create, struct bpf_map *map, union bpf_attr *attr,
+	 struct bpf_token *token)
+LSM_HOOK(void, LSM_RET_VOID, bpf_map_free, struct bpf_map *map)
 LSM_HOOK(int, 0, bpf_prog_load, struct bpf_prog *prog, union bpf_attr *attr,
 	 struct bpf_token *token)
 LSM_HOOK(void, LSM_RET_VOID, bpf_prog_free, struct bpf_prog *prog)
diff --git a/include/linux/security.h b/include/linux/security.h
index 65467eef6678..08fd777cbe94 100644
--- a/include/linux/security.h
+++ b/include/linux/security.h
@@ -2025,7 +2025,8 @@ struct bpf_token;
 extern int security_bpf(int cmd, union bpf_attr *attr, unsigned int size);
 extern int security_bpf_map(struct bpf_map *map, fmode_t fmode);
 extern int security_bpf_prog(struct bpf_prog *prog);
-extern int security_bpf_map_alloc(struct bpf_map *map);
+extern int security_bpf_map_create(struct bpf_map *map, union bpf_attr *attr,
+				   struct bpf_token *token);
 extern void security_bpf_map_free(struct bpf_map *map);
 extern int security_bpf_prog_load(struct bpf_prog *prog, union bpf_attr *attr,
 				  struct bpf_token *token);
@@ -2047,7 +2048,8 @@ static inline int security_bpf_prog(struct bpf_prog *prog)
 	return 0;
 }
 
-static inline int security_bpf_map_alloc(struct bpf_map *map)
+static inline int security_bpf_map_create(struct bpf_map *map, union bpf_attr *attr,
+					  struct bpf_token *token)
 {
 	return 0;
 }
diff --git a/kernel/bpf/bpf_lsm.c b/kernel/bpf/bpf_lsm.c
index 3e956f6302f3..9e4e615f11eb 100644
--- a/kernel/bpf/bpf_lsm.c
+++ b/kernel/bpf/bpf_lsm.c
@@ -260,8 +260,8 @@ bpf_lsm_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 BTF_SET_START(sleepable_lsm_hooks)
 BTF_ID(func, bpf_lsm_bpf)
 BTF_ID(func, bpf_lsm_bpf_map)
-BTF_ID(func, bpf_lsm_bpf_map_alloc_security)
-BTF_ID(func, bpf_lsm_bpf_map_free_security)
+BTF_ID(func, bpf_lsm_bpf_map_create)
+BTF_ID(func, bpf_lsm_bpf_map_free)
 BTF_ID(func, bpf_lsm_bpf_prog)
 BTF_ID(func, bpf_lsm_bpf_prog_load)
 BTF_ID(func, bpf_lsm_bpf_prog_free)
@@ -347,7 +347,7 @@ BTF_ID(func, bpf_lsm_userns_create)
 BTF_SET_END(sleepable_lsm_hooks)
 
 BTF_SET_START(untrusted_lsm_hooks)
-BTF_ID(func, bpf_lsm_bpf_map_free_security)
+BTF_ID(func, bpf_lsm_bpf_map_free)
 BTF_ID(func, bpf_lsm_bpf_prog_free)
 BTF_ID(func, bpf_lsm_file_alloc_security)
 BTF_ID(func, bpf_lsm_file_free_security)
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 7717c7c7b95d..aff045eed375 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -1309,9 +1309,9 @@ static int map_create(union bpf_attr *attr)
 			attr->btf_vmlinux_value_type_id;
 	}
 
-	err = security_bpf_map_alloc(map);
+	err = security_bpf_map_create(map, attr, token);
 	if (err)
-		goto free_map;
+		goto free_map_sec;
 
 	err = bpf_map_alloc_id(map);
 	if (err)
diff --git a/security/security.c b/security/security.c
index c8a1c66cfaad..ad24cf36da94 100644
--- a/security/security.c
+++ b/security/security.c
@@ -5167,16 +5167,20 @@ int security_bpf_prog(struct bpf_prog *prog)
 }
 
 /**
- * security_bpf_map_alloc() - Allocate a bpf map LSM blob
- * @map: bpf map
+ * security_bpf_map_create() - Check if BPF map creation is allowed
+ * @map: BPF map object
+ * @attr: BPF syscall attributes used to create BPF map
+ * @token: BPF token used to grant user access
  *
- * Initialize the security field inside bpf map.
+ * Do a check when the kernel creates a new BPF map. This is also the
+ * point where LSM blob is allocated for LSMs that need them.
  *
  * Return: Returns 0 on success, error on failure.
  */
-int security_bpf_map_alloc(struct bpf_map *map)
+int security_bpf_map_create(struct bpf_map *map, union bpf_attr *attr,
+			    struct bpf_token *token)
 {
-	return call_int_hook(bpf_map_alloc_security, 0, map);
+	return call_int_hook(bpf_map_create, 0, map, attr, token);
 }
 
 /**
@@ -5205,7 +5209,7 @@ int security_bpf_prog_load(struct bpf_prog *prog, union bpf_attr *attr,
  */
 void security_bpf_map_free(struct bpf_map *map)
 {
-	call_void_hook(bpf_map_free_security, map);
+	call_void_hook(bpf_map_free, map);
 }
 
 /**
diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c
index eabee39e983c..002351ab67b7 100644
--- a/security/selinux/hooks.c
+++ b/security/selinux/hooks.c
@@ -6783,7 +6783,8 @@ static int selinux_bpf_prog(struct bpf_prog *prog)
 			    BPF__PROG_RUN, NULL);
 }
 
-static int selinux_bpf_map_alloc(struct bpf_map *map)
+static int selinux_bpf_map_create(struct bpf_map *map, union bpf_attr *attr,
+				  struct bpf_token *token)
 {
 	struct bpf_security_struct *bpfsec;
 
@@ -7180,7 +7181,7 @@ static struct security_hook_list selinux_hooks[] __ro_after_init = {
 	LSM_HOOK_INIT(bpf, selinux_bpf),
 	LSM_HOOK_INIT(bpf_map, selinux_bpf_map),
 	LSM_HOOK_INIT(bpf_prog, selinux_bpf_prog),
-	LSM_HOOK_INIT(bpf_map_free_security, selinux_bpf_map_free),
+	LSM_HOOK_INIT(bpf_map_free, selinux_bpf_map_free),
 	LSM_HOOK_INIT(bpf_prog_free, selinux_bpf_prog_free),
 #endif
 
@@ -7238,7 +7239,7 @@ static struct security_hook_list selinux_hooks[] __ro_after_init = {
 	LSM_HOOK_INIT(audit_rule_init, selinux_audit_rule_init),
 #endif
 #ifdef CONFIG_BPF_SYSCALL
-	LSM_HOOK_INIT(bpf_map_alloc_security, selinux_bpf_map_alloc),
+	LSM_HOOK_INIT(bpf_map_create, selinux_bpf_map_create),
 	LSM_HOOK_INIT(bpf_prog_load, selinux_bpf_prog_load),
 #endif
 #ifdef CONFIG_PERF_EVENTS
-- 
cgit v1.2.3


From d734ca7b33dbf60eb15dcf7c44f3da7073356777 Mon Sep 17 00:00:00 2001
From: Andrii Nakryiko <andrii@kernel.org>
Date: Thu, 30 Nov 2023 10:52:23 -0800
Subject: bpf,lsm: add BPF token LSM hooks

Wire up bpf_token_create and bpf_token_free LSM hooks, which allow to
allocate LSM security blob (we add `void *security` field to struct
bpf_token for that), but also control who can instantiate BPF token.
This follows existing pattern for BPF map and BPF prog.

Also add security_bpf_token_allow_cmd() and security_bpf_token_capable()
LSM hooks that allow LSM implementation to control and negate (if
necessary) BPF token's delegation of a specific bpf_cmd and capability,
respectively.

Acked-by: Paul Moore <paul@paul-moore.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/r/20231130185229.2688956-12-andrii@kernel.org
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/bpf.h           |  3 +++
 include/linux/lsm_hook_defs.h |  5 ++++
 include/linux/security.h      | 25 ++++++++++++++++++
 kernel/bpf/bpf_lsm.c          |  4 +++
 kernel/bpf/token.c            | 18 ++++++++-----
 security/security.c           | 60 +++++++++++++++++++++++++++++++++++++++++++
 6 files changed, 109 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 435abad3cc61..7a483f6b6d5f 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -1604,6 +1604,9 @@ struct bpf_token {
 	u64 allowed_maps;
 	u64 allowed_progs;
 	u64 allowed_attachs;
+#ifdef CONFIG_SECURITY
+	void *security;
+#endif
 };
 
 struct bpf_struct_ops_value;
diff --git a/include/linux/lsm_hook_defs.h b/include/linux/lsm_hook_defs.h
index adb25cc63ce3..3fdd00b452ac 100644
--- a/include/linux/lsm_hook_defs.h
+++ b/include/linux/lsm_hook_defs.h
@@ -404,6 +404,11 @@ LSM_HOOK(void, LSM_RET_VOID, bpf_map_free, struct bpf_map *map)
 LSM_HOOK(int, 0, bpf_prog_load, struct bpf_prog *prog, union bpf_attr *attr,
 	 struct bpf_token *token)
 LSM_HOOK(void, LSM_RET_VOID, bpf_prog_free, struct bpf_prog *prog)
+LSM_HOOK(int, 0, bpf_token_create, struct bpf_token *token, union bpf_attr *attr,
+	 struct path *path)
+LSM_HOOK(void, LSM_RET_VOID, bpf_token_free, struct bpf_token *token)
+LSM_HOOK(int, 0, bpf_token_cmd, const struct bpf_token *token, enum bpf_cmd cmd)
+LSM_HOOK(int, 0, bpf_token_capable, const struct bpf_token *token, int cap)
 #endif /* CONFIG_BPF_SYSCALL */
 
 LSM_HOOK(int, 0, locked_down, enum lockdown_reason what)
diff --git a/include/linux/security.h b/include/linux/security.h
index 08fd777cbe94..00809d2d5c38 100644
--- a/include/linux/security.h
+++ b/include/linux/security.h
@@ -32,6 +32,7 @@
 #include <linux/string.h>
 #include <linux/mm.h>
 #include <linux/sockptr.h>
+#include <linux/bpf.h>
 
 struct linux_binprm;
 struct cred;
@@ -2031,6 +2032,11 @@ extern void security_bpf_map_free(struct bpf_map *map);
 extern int security_bpf_prog_load(struct bpf_prog *prog, union bpf_attr *attr,
 				  struct bpf_token *token);
 extern void security_bpf_prog_free(struct bpf_prog *prog);
+extern int security_bpf_token_create(struct bpf_token *token, union bpf_attr *attr,
+				     struct path *path);
+extern void security_bpf_token_free(struct bpf_token *token);
+extern int security_bpf_token_cmd(const struct bpf_token *token, enum bpf_cmd cmd);
+extern int security_bpf_token_capable(const struct bpf_token *token, int cap);
 #else
 static inline int security_bpf(int cmd, union bpf_attr *attr,
 					     unsigned int size)
@@ -2065,6 +2071,25 @@ static inline int security_bpf_prog_load(struct bpf_prog *prog, union bpf_attr *
 
 static inline void security_bpf_prog_free(struct bpf_prog *prog)
 { }
+
+static inline int security_bpf_token_create(struct bpf_token *token, union bpf_attr *attr,
+				     struct path *path)
+{
+	return 0;
+}
+
+static inline void security_bpf_token_free(struct bpf_token *token)
+{ }
+
+static inline int security_bpf_token_cmd(const struct bpf_token *token, enum bpf_cmd cmd)
+{
+	return 0;
+}
+
+static inline int security_bpf_token_capable(const struct bpf_token *token, int cap)
+{
+	return 0;
+}
 #endif /* CONFIG_SECURITY */
 #endif /* CONFIG_BPF_SYSCALL */
 
diff --git a/kernel/bpf/bpf_lsm.c b/kernel/bpf/bpf_lsm.c
index 9e4e615f11eb..7d2f96413a57 100644
--- a/kernel/bpf/bpf_lsm.c
+++ b/kernel/bpf/bpf_lsm.c
@@ -265,6 +265,10 @@ BTF_ID(func, bpf_lsm_bpf_map_free)
 BTF_ID(func, bpf_lsm_bpf_prog)
 BTF_ID(func, bpf_lsm_bpf_prog_load)
 BTF_ID(func, bpf_lsm_bpf_prog_free)
+BTF_ID(func, bpf_lsm_bpf_token_create)
+BTF_ID(func, bpf_lsm_bpf_token_free)
+BTF_ID(func, bpf_lsm_bpf_token_cmd)
+BTF_ID(func, bpf_lsm_bpf_token_capable)
 BTF_ID(func, bpf_lsm_bprm_check_security)
 BTF_ID(func, bpf_lsm_bprm_committed_creds)
 BTF_ID(func, bpf_lsm_bprm_committing_creds)
diff --git a/kernel/bpf/token.c b/kernel/bpf/token.c
index 5a51e6b8f6bf..17212efcde60 100644
--- a/kernel/bpf/token.c
+++ b/kernel/bpf/token.c
@@ -7,6 +7,7 @@
 #include <linux/idr.h>
 #include <linux/namei.h>
 #include <linux/user_namespace.h>
+#include <linux/security.h>
 
 bool bpf_token_capable(const struct bpf_token *token, int cap)
 {
@@ -14,10 +15,9 @@ bool bpf_token_capable(const struct bpf_token *token, int cap)
 	 * token's userns is *exactly* the same as current user's userns
 	 */
 	if (token && current_user_ns() == token->userns) {
-		if (ns_capable(token->userns, cap))
-			return true;
-		if (cap != CAP_SYS_ADMIN && ns_capable(token->userns, CAP_SYS_ADMIN))
-			return true;
+		if (ns_capable(token->userns, cap) ||
+		    (cap != CAP_SYS_ADMIN && ns_capable(token->userns, CAP_SYS_ADMIN)))
+			return security_bpf_token_capable(token, cap) == 0;
 	}
 	/* otherwise fallback to capable() checks */
 	return capable(cap) || (cap != CAP_SYS_ADMIN && capable(CAP_SYS_ADMIN));
@@ -30,6 +30,7 @@ void bpf_token_inc(struct bpf_token *token)
 
 static void bpf_token_free(struct bpf_token *token)
 {
+	security_bpf_token_free(token);
 	put_user_ns(token->userns);
 	kvfree(token);
 }
@@ -186,6 +187,10 @@ int bpf_token_create(union bpf_attr *attr)
 	token->allowed_progs = mnt_opts->delegate_progs;
 	token->allowed_attachs = mnt_opts->delegate_attachs;
 
+	err = security_bpf_token_create(token, attr, &path);
+	if (err)
+		goto out_token;
+
 	fd = get_unused_fd_flags(O_CLOEXEC);
 	if (fd < 0) {
 		err = fd;
@@ -233,8 +238,9 @@ bool bpf_token_allow_cmd(const struct bpf_token *token, enum bpf_cmd cmd)
 	 */
 	if (!token || current_user_ns() != token->userns)
 		return false;
-
-	return token->allowed_cmds & (1ULL << cmd);
+	if (!(token->allowed_cmds & (1ULL << cmd)))
+		return false;
+	return security_bpf_token_cmd(token, cmd) == 0;
 }
 
 bool bpf_token_allow_map_type(const struct bpf_token *token, enum bpf_map_type type)
diff --git a/security/security.c b/security/security.c
index ad24cf36da94..088a79c35c26 100644
--- a/security/security.c
+++ b/security/security.c
@@ -5201,6 +5201,55 @@ int security_bpf_prog_load(struct bpf_prog *prog, union bpf_attr *attr,
 	return call_int_hook(bpf_prog_load, 0, prog, attr, token);
 }
 
+/**
+ * security_bpf_token_create() - Check if creating of BPF token is allowed
+ * @token: BPF token object
+ * @attr: BPF syscall attributes used to create BPF token
+ * @path: path pointing to BPF FS mount point from which BPF token is created
+ *
+ * Do a check when the kernel instantiates a new BPF token object from BPF FS
+ * instance. This is also the point where LSM blob can be allocated for LSMs.
+ *
+ * Return: Returns 0 on success, error on failure.
+ */
+int security_bpf_token_create(struct bpf_token *token, union bpf_attr *attr,
+			      struct path *path)
+{
+	return call_int_hook(bpf_token_create, 0, token, attr, path);
+}
+
+/**
+ * security_bpf_token_cmd() - Check if BPF token is allowed to delegate
+ * requested BPF syscall command
+ * @token: BPF token object
+ * @cmd: BPF syscall command requested to be delegated by BPF token
+ *
+ * Do a check when the kernel decides whether provided BPF token should allow
+ * delegation of requested BPF syscall command.
+ *
+ * Return: Returns 0 on success, error on failure.
+ */
+int security_bpf_token_cmd(const struct bpf_token *token, enum bpf_cmd cmd)
+{
+	return call_int_hook(bpf_token_cmd, 0, token, cmd);
+}
+
+/**
+ * security_bpf_token_capable() - Check if BPF token is allowed to delegate
+ * requested BPF-related capability
+ * @token: BPF token object
+ * @cap: capabilities requested to be delegated by BPF token
+ *
+ * Do a check when the kernel decides whether provided BPF token should allow
+ * delegation of requested BPF-related capabilities.
+ *
+ * Return: Returns 0 on success, error on failure.
+ */
+int security_bpf_token_capable(const struct bpf_token *token, int cap)
+{
+	return call_int_hook(bpf_token_capable, 0, token, cap);
+}
+
 /**
  * security_bpf_map_free() - Free a bpf map's LSM blob
  * @map: bpf map
@@ -5222,6 +5271,17 @@ void security_bpf_prog_free(struct bpf_prog *prog)
 {
 	call_void_hook(bpf_prog_free, prog);
 }
+
+/**
+ * security_bpf_token_free() - Free a BPF token's LSM blob
+ * @token: BPF token struct
+ *
+ * Clean up the security information stored inside BPF token.
+ */
+void security_bpf_token_free(struct bpf_token *token)
+{
+	call_void_hook(bpf_token_free, token);
+}
 #endif /* CONFIG_BPF_SYSCALL */
 
 /**
-- 
cgit v1.2.3


From 3232e7aad11e541da86bbb1fa5ea5737b30bd006 Mon Sep 17 00:00:00 2001
From: Waiman Long <longman@redhat.com>
Date: Tue, 5 Dec 2023 17:21:14 -0500
Subject: cgroup/cpuset: Include isolated cpuset CPUs in cpu_is_isolated()
 check

Currently, the cpu_is_isolated() function checks only the statically
isolated CPUs specified via the "isolcpus" and "nohz_full" kernel
command line options. This function is used by vmstat and memcg to
reduce interference with isolated CPUs by not doing stat flushing
or scheduling works on those CPUs.

Workloads running on isolated CPUs within isolated cpuset
partitions should receive the same treatment to reduce unnecessary
interference. This patch introduces a new cpuset_cpu_is_isolated()
function to be called by cpu_is_isolated() so that the set of dynamically
created cpuset isolated CPUs will be included in the check.

Assuming that testing a bit in a cpumask is atomic, no synchronization
primitive is currently used to synchronize access to the cpuset's
isolated_cpus mask.

Signed-off-by: Waiman Long <longman@redhat.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
---
 include/linux/cpuset.h          |  6 ++++++
 include/linux/sched/isolation.h |  4 +++-
 kernel/cgroup/cpuset.c          | 11 +++++++++++
 3 files changed, 20 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/cpuset.h b/include/linux/cpuset.h
index d629094fac6e..875d12598bd2 100644
--- a/include/linux/cpuset.h
+++ b/include/linux/cpuset.h
@@ -77,6 +77,7 @@ extern void cpuset_lock(void);
 extern void cpuset_unlock(void);
 extern void cpuset_cpus_allowed(struct task_struct *p, struct cpumask *mask);
 extern bool cpuset_cpus_allowed_fallback(struct task_struct *p);
+extern bool cpuset_cpu_is_isolated(int cpu);
 extern nodemask_t cpuset_mems_allowed(struct task_struct *p);
 #define cpuset_current_mems_allowed (current->mems_allowed)
 void cpuset_init_current_mems_allowed(void);
@@ -207,6 +208,11 @@ static inline bool cpuset_cpus_allowed_fallback(struct task_struct *p)
 	return false;
 }
 
+static inline bool cpuset_cpu_is_isolated(int cpu)
+{
+	return false;
+}
+
 static inline nodemask_t cpuset_mems_allowed(struct task_struct *p)
 {
 	return node_possible_map;
diff --git a/include/linux/sched/isolation.h b/include/linux/sched/isolation.h
index fe1a46f30d24..2b461129d1fa 100644
--- a/include/linux/sched/isolation.h
+++ b/include/linux/sched/isolation.h
@@ -2,6 +2,7 @@
 #define _LINUX_SCHED_ISOLATION_H
 
 #include <linux/cpumask.h>
+#include <linux/cpuset.h>
 #include <linux/init.h>
 #include <linux/tick.h>
 
@@ -67,7 +68,8 @@ static inline bool housekeeping_cpu(int cpu, enum hk_type type)
 static inline bool cpu_is_isolated(int cpu)
 {
 	return !housekeeping_test_cpu(cpu, HK_TYPE_DOMAIN) ||
-		 !housekeeping_test_cpu(cpu, HK_TYPE_TICK);
+	       !housekeeping_test_cpu(cpu, HK_TYPE_TICK) ||
+	       cpuset_cpu_is_isolated(cpu);
 }
 
 #endif /* _LINUX_SCHED_ISOLATION_H */
diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
index 2a16df86c55c..dfbb16aca9f4 100644
--- a/kernel/cgroup/cpuset.c
+++ b/kernel/cgroup/cpuset.c
@@ -1518,6 +1518,17 @@ static void update_unbound_workqueue_cpumask(bool isolcpus_updated)
 	WARN_ON_ONCE(ret < 0);
 }
 
+/**
+ * cpuset_cpu_is_isolated - Check if the given CPU is isolated
+ * @cpu: the CPU number to be checked
+ * Return: true if CPU is used in an isolated partition, false otherwise
+ */
+bool cpuset_cpu_is_isolated(int cpu)
+{
+	return cpumask_test_cpu(cpu, isolated_cpus);
+}
+EXPORT_SYMBOL_GPL(cpuset_cpu_is_isolated);
+
 /*
  * compute_effective_exclusive_cpumask - compute effective exclusive CPUs
  * @cs: cpuset
-- 
cgit v1.2.3


From be0a3600aa1ebe9d23243c91d41ab1a2d5091a9b Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rafael.j.wysocki@intel.com>
Date: Tue, 5 Dec 2023 13:24:08 +0100
Subject: thermal: sysfs: Rework the handling of trip point updates

Both trip_point_temp_store() and trip_point_hyst_store() use
thermal_zone_set_trip() to update a given trip point, but none of them
actually needs to change more than one field in struct thermal_trip
representing it.  However, each of them effectively calls
__thermal_zone_get_trip() twice in a row for the same trip index value,
once directly and once via thermal_zone_set_trip(), which is not
particularly efficient, and the way in which thermal_zone_set_trip()
carries out the update is not particularly straightforward.

Moreover, input processing need not be done under the thermal zone lock
in any of these functions.

Rework trip_point_temp_store() and trip_point_hyst_store() to address
the above, move the part of thermal_zone_set_trip() that is still
useful to a new function called thermal_zone_trip_updated() and drop
the rest of it.

While at it, make trip_point_hyst_store() reject negative hysteresis
values.

Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Reviewed-by: Daniel Lezcano <daniel.lezcano@linaro.org>
---
 drivers/thermal/thermal_core.h  |  2 ++
 drivers/thermal/thermal_sysfs.c | 52 ++++++++++++++++++++++++++++-------------
 drivers/thermal/thermal_trip.c  | 45 +++++++----------------------------
 include/linux/thermal.h         |  4 ----
 4 files changed, 47 insertions(+), 56 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/thermal/thermal_core.h b/drivers/thermal/thermal_core.h
index 0a3b3ec5120b..7dfe6c8deb8e 100644
--- a/drivers/thermal/thermal_core.h
+++ b/drivers/thermal/thermal_core.h
@@ -124,6 +124,8 @@ int __thermal_zone_get_trip(struct thermal_zone_device *tz, int trip_id,
 			    struct thermal_trip *trip);
 int thermal_zone_trip_id(struct thermal_zone_device *tz,
 			 const struct thermal_trip *trip);
+void thermal_zone_trip_updated(struct thermal_zone_device *tz,
+			       const struct thermal_trip *trip);
 int __thermal_zone_get_temp(struct thermal_zone_device *tz, int *temp);
 
 /* sysfs I/F */
diff --git a/drivers/thermal/thermal_sysfs.c b/drivers/thermal/thermal_sysfs.c
index eef40d4f3063..06202aa50060 100644
--- a/drivers/thermal/thermal_sysfs.c
+++ b/drivers/thermal/thermal_sysfs.c
@@ -120,8 +120,13 @@ trip_point_temp_store(struct device *dev, struct device_attribute *attr,
 		      const char *buf, size_t count)
 {
 	struct thermal_zone_device *tz = to_thermal_zone(dev);
-	struct thermal_trip trip;
+	struct thermal_trip *trip;
 	int trip_id, ret;
+	int temp;
+
+	ret = kstrtoint(buf, 10, &temp);
+	if (ret)
+		return -EINVAL;
 
 	if (sscanf(attr->attr.name, "trip_point_%d_temp", &trip_id) != 1)
 		return -EINVAL;
@@ -133,15 +138,20 @@ trip_point_temp_store(struct device *dev, struct device_attribute *attr,
 		goto unlock;
 	}
 
-	ret = __thermal_zone_get_trip(tz, trip_id, &trip);
-	if (ret)
-		goto unlock;
+	trip = &tz->trips[trip_id];
 
-	ret = kstrtoint(buf, 10, &trip.temperature);
-	if (ret)
-		goto unlock;
+	if (temp != trip->temperature) {
+		if (tz->ops->set_trip_temp) {
+			ret = tz->ops->set_trip_temp(tz, trip_id, temp);
+			if (ret)
+				goto unlock;
+		}
+
+		trip->temperature = temp;
+
+		thermal_zone_trip_updated(tz, trip);
+	}
 
-	ret = thermal_zone_set_trip(tz, trip_id, &trip);
 unlock:
 	mutex_unlock(&tz->lock);
 	
@@ -179,8 +189,13 @@ trip_point_hyst_store(struct device *dev, struct device_attribute *attr,
 		      const char *buf, size_t count)
 {
 	struct thermal_zone_device *tz = to_thermal_zone(dev);
-	struct thermal_trip trip;
+	struct thermal_trip *trip;
 	int trip_id, ret;
+	int hyst;
+
+	ret = kstrtoint(buf, 10, &hyst);
+	if (ret || hyst < 0)
+		return -EINVAL;
 
 	if (sscanf(attr->attr.name, "trip_point_%d_hyst", &trip_id) != 1)
 		return -EINVAL;
@@ -192,15 +207,20 @@ trip_point_hyst_store(struct device *dev, struct device_attribute *attr,
 		goto unlock;
 	}
 
-	ret = __thermal_zone_get_trip(tz, trip_id, &trip);
-	if (ret)
-		goto unlock;
+	trip = &tz->trips[trip_id];
 
-	ret = kstrtoint(buf, 10, &trip.hysteresis);
-	if (ret)
-		goto unlock;
+	if (hyst != trip->hysteresis) {
+		if (tz->ops->set_trip_hyst) {
+			ret = tz->ops->set_trip_hyst(tz, trip_id, hyst);
+			if (ret)
+				goto unlock;
+		}
+
+		trip->hysteresis = hyst;
+
+		thermal_zone_trip_updated(tz, trip);
+	}
 
-	ret = thermal_zone_set_trip(tz, trip_id, &trip);
 unlock:
 	mutex_unlock(&tz->lock);
 
diff --git a/drivers/thermal/thermal_trip.c b/drivers/thermal/thermal_trip.c
index e3dd583234dd..90861dec7eb0 100644
--- a/drivers/thermal/thermal_trip.c
+++ b/drivers/thermal/thermal_trip.c
@@ -147,42 +147,6 @@ int thermal_zone_get_trip(struct thermal_zone_device *tz, int trip_id,
 }
 EXPORT_SYMBOL_GPL(thermal_zone_get_trip);
 
-int thermal_zone_set_trip(struct thermal_zone_device *tz, int trip_id,
-			  const struct thermal_trip *trip)
-{
-	struct thermal_trip t;
-	int ret;
-
-	ret = __thermal_zone_get_trip(tz, trip_id, &t);
-	if (ret)
-		return ret;
-
-	if (t.type != trip->type)
-		return -EINVAL;
-
-	if (t.temperature != trip->temperature && tz->ops->set_trip_temp) {
-		ret = tz->ops->set_trip_temp(tz, trip_id, trip->temperature);
-		if (ret)
-			return ret;
-	}
-
-	if (t.hysteresis != trip->hysteresis && tz->ops->set_trip_hyst) {
-		ret = tz->ops->set_trip_hyst(tz, trip_id, trip->hysteresis);
-		if (ret)
-			return ret;
-	}
-
-	if (tz->trips && (t.temperature != trip->temperature || t.hysteresis != trip->hysteresis))
-		tz->trips[trip_id] = *trip;
-
-	thermal_notify_tz_trip_change(tz->id, trip_id, trip->type,
-				      trip->temperature, trip->hysteresis);
-
-	__thermal_zone_device_update(tz, THERMAL_TRIP_CHANGED);
-
-	return 0;
-}
-
 int thermal_zone_trip_id(struct thermal_zone_device *tz,
 			 const struct thermal_trip *trip)
 {
@@ -192,3 +156,12 @@ int thermal_zone_trip_id(struct thermal_zone_device *tz,
 	 */
 	return trip - tz->trips;
 }
+
+void thermal_zone_trip_updated(struct thermal_zone_device *tz,
+			       const struct thermal_trip *trip)
+{
+	thermal_notify_tz_trip_change(tz->id, thermal_zone_trip_id(tz, trip),
+				      trip->type, trip->temperature,
+				      trip->hysteresis);
+	__thermal_zone_device_update(tz, THERMAL_TRIP_CHANGED);
+}
diff --git a/include/linux/thermal.h b/include/linux/thermal.h
index 1f9ee869f9f9..0ea99f50d57c 100644
--- a/include/linux/thermal.h
+++ b/include/linux/thermal.h
@@ -282,10 +282,6 @@ int __thermal_zone_get_trip(struct thermal_zone_device *tz, int trip_id,
 			    struct thermal_trip *trip);
 int thermal_zone_get_trip(struct thermal_zone_device *tz, int trip_id,
 			  struct thermal_trip *trip);
-
-int thermal_zone_set_trip(struct thermal_zone_device *tz, int trip_id,
-			  const struct thermal_trip *trip);
-
 int for_each_thermal_trip(struct thermal_zone_device *tz,
 			  int (*cb)(struct thermal_trip *, void *),
 			  void *data);
-- 
cgit v1.2.3


From 4b7de801606e504e69689df71475d27e35336fb3 Mon Sep 17 00:00:00 2001
From: Jiri Olsa <jolsa@kernel.org>
Date: Wed, 6 Dec 2023 09:30:40 +0100
Subject: bpf: Fix prog_array_map_poke_run map poke update

Lee pointed out issue found by syscaller [0] hitting BUG in prog array
map poke update in prog_array_map_poke_run function due to error value
returned from bpf_arch_text_poke function.

There's race window where bpf_arch_text_poke can fail due to missing
bpf program kallsym symbols, which is accounted for with check for
-EINVAL in that BUG_ON call.

The problem is that in such case we won't update the tail call jump
and cause imbalance for the next tail call update check which will
fail with -EBUSY in bpf_arch_text_poke.

I'm hitting following race during the program load:

  CPU 0                             CPU 1

  bpf_prog_load
    bpf_check
      do_misc_fixups
        prog_array_map_poke_track

                                    map_update_elem
                                      bpf_fd_array_map_update_elem
                                        prog_array_map_poke_run

                                          bpf_arch_text_poke returns -EINVAL

    bpf_prog_kallsyms_add

After bpf_arch_text_poke (CPU 1) fails to update the tail call jump, the next
poke update fails on expected jump instruction check in bpf_arch_text_poke
with -EBUSY and triggers the BUG_ON in prog_array_map_poke_run.

Similar race exists on the program unload.

Fixing this by moving the update to bpf_arch_poke_desc_update function which
makes sure we call __bpf_arch_text_poke that skips the bpf address check.

Each architecture has slightly different approach wrt looking up bpf address
in bpf_arch_text_poke, so instead of splitting the function or adding new
'checkip' argument in previous version, it seems best to move the whole
map_poke_run update as arch specific code.

  [0] https://syzkaller.appspot.com/bug?extid=97a4fe20470e9bc30810

Fixes: ebf7d1f508a7 ("bpf, x64: rework pro/epilogue and tailcall handling in JIT")
Reported-by: syzbot+97a4fe20470e9bc30810@syzkaller.appspotmail.com
Signed-off-by: Jiri Olsa <jolsa@kernel.org>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Yonghong Song <yonghong.song@linux.dev>
Cc: Lee Jones <lee@kernel.org>
Cc: Maciej Fijalkowski <maciej.fijalkowski@intel.com>
Link: https://lore.kernel.org/bpf/20231206083041.1306660-2-jolsa@kernel.org
---
 arch/x86/net/bpf_jit_comp.c | 46 +++++++++++++++++++++++++++++++++++
 include/linux/bpf.h         |  3 +++
 kernel/bpf/arraymap.c       | 58 ++++++++-------------------------------------
 3 files changed, 59 insertions(+), 48 deletions(-)

(limited to 'include/linux')

diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c
index 8c10d9abc239..e89e415aa743 100644
--- a/arch/x86/net/bpf_jit_comp.c
+++ b/arch/x86/net/bpf_jit_comp.c
@@ -3025,3 +3025,49 @@ void arch_bpf_stack_walk(bool (*consume_fn)(void *cookie, u64 ip, u64 sp, u64 bp
 #endif
 	WARN(1, "verification of programs using bpf_throw should have failed\n");
 }
+
+void bpf_arch_poke_desc_update(struct bpf_jit_poke_descriptor *poke,
+			       struct bpf_prog *new, struct bpf_prog *old)
+{
+	u8 *old_addr, *new_addr, *old_bypass_addr;
+	int ret;
+
+	old_bypass_addr = old ? NULL : poke->bypass_addr;
+	old_addr = old ? (u8 *)old->bpf_func + poke->adj_off : NULL;
+	new_addr = new ? (u8 *)new->bpf_func + poke->adj_off : NULL;
+
+	/*
+	 * On program loading or teardown, the program's kallsym entry
+	 * might not be in place, so we use __bpf_arch_text_poke to skip
+	 * the kallsyms check.
+	 */
+	if (new) {
+		ret = __bpf_arch_text_poke(poke->tailcall_target,
+					   BPF_MOD_JUMP,
+					   old_addr, new_addr);
+		BUG_ON(ret < 0);
+		if (!old) {
+			ret = __bpf_arch_text_poke(poke->tailcall_bypass,
+						   BPF_MOD_JUMP,
+						   poke->bypass_addr,
+						   NULL);
+			BUG_ON(ret < 0);
+		}
+	} else {
+		ret = __bpf_arch_text_poke(poke->tailcall_bypass,
+					   BPF_MOD_JUMP,
+					   old_bypass_addr,
+					   poke->bypass_addr);
+		BUG_ON(ret < 0);
+		/* let other CPUs finish the execution of program
+		 * so that it will not possible to expose them
+		 * to invalid nop, stack unwind, nop state
+		 */
+		if (!ret)
+			synchronize_rcu();
+		ret = __bpf_arch_text_poke(poke->tailcall_target,
+					   BPF_MOD_JUMP,
+					   old_addr, NULL);
+		BUG_ON(ret < 0);
+	}
+}
diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 6762dac3ef76..cff5bb08820e 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -3175,6 +3175,9 @@ enum bpf_text_poke_type {
 int bpf_arch_text_poke(void *ip, enum bpf_text_poke_type t,
 		       void *addr1, void *addr2);
 
+void bpf_arch_poke_desc_update(struct bpf_jit_poke_descriptor *poke,
+			       struct bpf_prog *new, struct bpf_prog *old);
+
 void *bpf_arch_text_copy(void *dst, void *src, size_t len);
 int bpf_arch_text_invalidate(void *dst, size_t len);
 
diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c
index 2058e89b5ddd..c85ff9162a5c 100644
--- a/kernel/bpf/arraymap.c
+++ b/kernel/bpf/arraymap.c
@@ -1012,11 +1012,16 @@ static void prog_array_map_poke_untrack(struct bpf_map *map,
 	mutex_unlock(&aux->poke_mutex);
 }
 
+void __weak bpf_arch_poke_desc_update(struct bpf_jit_poke_descriptor *poke,
+				      struct bpf_prog *new, struct bpf_prog *old)
+{
+	WARN_ON_ONCE(1);
+}
+
 static void prog_array_map_poke_run(struct bpf_map *map, u32 key,
 				    struct bpf_prog *old,
 				    struct bpf_prog *new)
 {
-	u8 *old_addr, *new_addr, *old_bypass_addr;
 	struct prog_poke_elem *elem;
 	struct bpf_array_aux *aux;
 
@@ -1025,7 +1030,7 @@ static void prog_array_map_poke_run(struct bpf_map *map, u32 key,
 
 	list_for_each_entry(elem, &aux->poke_progs, list) {
 		struct bpf_jit_poke_descriptor *poke;
-		int i, ret;
+		int i;
 
 		for (i = 0; i < elem->aux->size_poke_tab; i++) {
 			poke = &elem->aux->poke_tab[i];
@@ -1044,21 +1049,10 @@ static void prog_array_map_poke_run(struct bpf_map *map, u32 key,
 			 *    activated, so tail call updates can arrive from here
 			 *    while JIT is still finishing its final fixup for
 			 *    non-activated poke entries.
-			 * 3) On program teardown, the program's kallsym entry gets
-			 *    removed out of RCU callback, but we can only untrack
-			 *    from sleepable context, therefore bpf_arch_text_poke()
-			 *    might not see that this is in BPF text section and
-			 *    bails out with -EINVAL. As these are unreachable since
-			 *    RCU grace period already passed, we simply skip them.
-			 * 4) Also programs reaching refcount of zero while patching
+			 * 3) Also programs reaching refcount of zero while patching
 			 *    is in progress is okay since we're protected under
 			 *    poke_mutex and untrack the programs before the JIT
-			 *    buffer is freed. When we're still in the middle of
-			 *    patching and suddenly kallsyms entry of the program
-			 *    gets evicted, we just skip the rest which is fine due
-			 *    to point 3).
-			 * 5) Any other error happening below from bpf_arch_text_poke()
-			 *    is a unexpected bug.
+			 *    buffer is freed.
 			 */
 			if (!READ_ONCE(poke->tailcall_target_stable))
 				continue;
@@ -1068,39 +1062,7 @@ static void prog_array_map_poke_run(struct bpf_map *map, u32 key,
 			    poke->tail_call.key != key)
 				continue;
 
-			old_bypass_addr = old ? NULL : poke->bypass_addr;
-			old_addr = old ? (u8 *)old->bpf_func + poke->adj_off : NULL;
-			new_addr = new ? (u8 *)new->bpf_func + poke->adj_off : NULL;
-
-			if (new) {
-				ret = bpf_arch_text_poke(poke->tailcall_target,
-							 BPF_MOD_JUMP,
-							 old_addr, new_addr);
-				BUG_ON(ret < 0 && ret != -EINVAL);
-				if (!old) {
-					ret = bpf_arch_text_poke(poke->tailcall_bypass,
-								 BPF_MOD_JUMP,
-								 poke->bypass_addr,
-								 NULL);
-					BUG_ON(ret < 0 && ret != -EINVAL);
-				}
-			} else {
-				ret = bpf_arch_text_poke(poke->tailcall_bypass,
-							 BPF_MOD_JUMP,
-							 old_bypass_addr,
-							 poke->bypass_addr);
-				BUG_ON(ret < 0 && ret != -EINVAL);
-				/* let other CPUs finish the execution of program
-				 * so that it will not possible to expose them
-				 * to invalid nop, stack unwind, nop state
-				 */
-				if (!ret)
-					synchronize_rcu();
-				ret = bpf_arch_text_poke(poke->tailcall_target,
-							 BPF_MOD_JUMP,
-							 old_addr, NULL);
-				BUG_ON(ret < 0 && ret != -EINVAL);
-			}
+			bpf_arch_poke_desc_update(poke, new, old);
 		}
 	}
 }
-- 
cgit v1.2.3


From 187da0f8250aa94bd96266096aef6f694e0b4cd2 Mon Sep 17 00:00:00 2001
From: Mike Kravetz <mike.kravetz@oracle.com>
Date: Mon, 13 Nov 2023 17:20:33 -0800
Subject: hugetlb: fix null-ptr-deref in hugetlb_vma_lock_write

The routine __vma_private_lock tests for the existence of a reserve map
associated with a private hugetlb mapping.  A pointer to the reserve map
is in vma->vm_private_data.  __vma_private_lock was checking the pointer
for NULL.  However, it is possible that the low bits of the pointer could
be used as flags.  In such instances, vm_private_data is not NULL and not
a valid pointer.  This results in the null-ptr-deref reported by syzbot:

general protection fault, probably for non-canonical address 0xdffffc000000001d:
 0000 [#1] PREEMPT SMP KASAN
KASAN: null-ptr-deref in range [0x00000000000000e8-0x00000000000000ef]
CPU: 0 PID: 5048 Comm: syz-executor139 Not tainted 6.6.0-rc7-syzkaller-00142-g88
8cf78c29e2 #0
Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 1
0/09/2023
RIP: 0010:__lock_acquire+0x109/0x5de0 kernel/locking/lockdep.c:5004
...
Call Trace:
 <TASK>
 lock_acquire kernel/locking/lockdep.c:5753 [inline]
 lock_acquire+0x1ae/0x510 kernel/locking/lockdep.c:5718
 down_write+0x93/0x200 kernel/locking/rwsem.c:1573
 hugetlb_vma_lock_write mm/hugetlb.c:300 [inline]
 hugetlb_vma_lock_write+0xae/0x100 mm/hugetlb.c:291
 __hugetlb_zap_begin+0x1e9/0x2b0 mm/hugetlb.c:5447
 hugetlb_zap_begin include/linux/hugetlb.h:258 [inline]
 unmap_vmas+0x2f4/0x470 mm/memory.c:1733
 exit_mmap+0x1ad/0xa60 mm/mmap.c:3230
 __mmput+0x12a/0x4d0 kernel/fork.c:1349
 mmput+0x62/0x70 kernel/fork.c:1371
 exit_mm kernel/exit.c:567 [inline]
 do_exit+0x9ad/0x2a20 kernel/exit.c:861
 __do_sys_exit kernel/exit.c:991 [inline]
 __se_sys_exit kernel/exit.c:989 [inline]
 __x64_sys_exit+0x42/0x50 kernel/exit.c:989
 do_syscall_x64 arch/x86/entry/common.c:50 [inline]
 do_syscall_64+0x38/0xb0 arch/x86/entry/common.c:80
 entry_SYSCALL_64_after_hwframe+0x63/0xcd

Mask off low bit flags before checking for NULL pointer.  In addition, the
reserve map only 'belongs' to the OWNER (parent in parent/child
relationships) so also check for the OWNER flag.

Link: https://lkml.kernel.org/r/20231114012033.259600-1-mike.kravetz@oracle.com
Reported-by: syzbot+6ada951e7c0f7bc8a71e@syzkaller.appspotmail.com
Closes: https://lore.kernel.org/linux-mm/00000000000078d1e00608d7878b@google.com/
Fixes: bf4916922c60 ("hugetlbfs: extend hugetlb_vma_lock to private VMAs")
Signed-off-by: Mike Kravetz <mike.kravetz@oracle.com>
Reviewed-by: Rik van Riel <riel@surriel.com>
Cc: Edward Adam Davis <eadavis@qq.com>
Cc: Muchun Song <muchun.song@linux.dev>
Cc: Nathan Chancellor <nathan@kernel.org>
Cc: Nick Desaulniers <ndesaulniers@google.com>
Cc: Tom Rix <trix@redhat.com>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/hugetlb.h | 5 +----
 mm/hugetlb.c            | 7 +++++++
 2 files changed, 8 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index d3acecc5db4b..236ec7b63c54 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -1268,10 +1268,7 @@ static inline bool __vma_shareable_lock(struct vm_area_struct *vma)
 	return (vma->vm_flags & VM_MAYSHARE) && vma->vm_private_data;
 }
 
-static inline bool __vma_private_lock(struct vm_area_struct *vma)
-{
-	return (!(vma->vm_flags & VM_MAYSHARE)) && vma->vm_private_data;
-}
+bool __vma_private_lock(struct vm_area_struct *vma);
 
 /*
  * Safe version of huge_pte_offset() to check the locks.  See comments
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 1169ef2f2176..6feb3e0630d1 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -1182,6 +1182,13 @@ static int is_vma_resv_set(struct vm_area_struct *vma, unsigned long flag)
 	return (get_vma_private_data(vma) & flag) != 0;
 }
 
+bool __vma_private_lock(struct vm_area_struct *vma)
+{
+	return !(vma->vm_flags & VM_MAYSHARE) &&
+		get_vma_private_data(vma) & ~HPAGE_RESV_MASK &&
+		is_vma_resv_set(vma, HPAGE_RESV_OWNER);
+}
+
 void hugetlb_dup_vma_private(struct vm_area_struct *vma)
 {
 	VM_BUG_ON_VMA(!is_vm_hugetlb_page(vma), vma);
-- 
cgit v1.2.3


From 8e92157d7f6190c86bfd6144a409001469827100 Mon Sep 17 00:00:00 2001
From: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Date: Tue, 28 Nov 2023 19:44:03 +0200
Subject: units: add missing header

BITS_PER_BYTE is defined in bits.h.

Link: https://lkml.kernel.org/r/20231128174404.393393-1-andriy.shevchenko@linux.intel.com
Fixes: e8eed5f7366f ("units: Add BYTES_PER_*BIT")
Signed-off-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Reviewed-by: Randy Dunlap <rdunlap@infradead.org>
Cc: Damian Muszynski <damian.muszynski@intel.com>
Cc: Rasmus Villemoes <linux@rasmusvillemoes.dk>
Cc: Herbert Xu <herbert@gondor.apana.org.au>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/units.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/units.h b/include/linux/units.h
index ff1bd6b5f5b3..45110daaf8d3 100644
--- a/include/linux/units.h
+++ b/include/linux/units.h
@@ -2,6 +2,7 @@
 #ifndef _LINUX_UNITS_H
 #define _LINUX_UNITS_H
 
+#include <linux/bits.h>
 #include <linux/math.h>
 
 /* Metric prefixes in accordance with Système international (d'unités) */
-- 
cgit v1.2.3


From 73424d00dc63ba681856e06cfb0a5abbdb62e2b5 Mon Sep 17 00:00:00 2001
From: Su Hui <suhui@nfschina.com>
Date: Thu, 30 Nov 2023 11:40:18 +0800
Subject: highmem: fix a memory copy problem in memcpy_from_folio

Clang static checker complains that value stored to 'from' is never read.
And memcpy_from_folio() only copy the last chunk memory from folio to
destination.  Use 'to += chunk' to replace 'from += chunk' to fix this
typo problem.

Link: https://lkml.kernel.org/r/20231130034017.1210429-1-suhui@nfschina.com
Fixes: b23d03ef7af5 ("highmem: add memcpy_to_folio() and memcpy_from_folio()")
Signed-off-by: Su Hui <suhui@nfschina.com>
Reviewed-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Ira Weiny <ira.weiny@intel.com>
Cc: Jiaqi Yan <jiaqiyan@google.com>
Cc: Nathan Chancellor <nathan@kernel.org>
Cc: Nick Desaulniers <ndesaulniers@google.com>
Cc: Peter Collingbourne <pcc@google.com>
Cc: Tom Rix <trix@redhat.com>
Cc: Tony Luck <tony.luck@intel.com>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/highmem.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/highmem.h b/include/linux/highmem.h
index 4cacc0e43b51..be20cff4ba73 100644
--- a/include/linux/highmem.h
+++ b/include/linux/highmem.h
@@ -454,7 +454,7 @@ static inline void memcpy_from_folio(char *to, struct folio *folio,
 		memcpy(to, from, chunk);
 		kunmap_local(from);
 
-		from += chunk;
+		to += chunk;
 		offset += chunk;
 		len -= chunk;
 	} while (len > 0);
-- 
cgit v1.2.3


From f08a1c658257c73697a819c4ded3a84b6f0ead74 Mon Sep 17 00:00:00 2001
From: Song Liu <song@kernel.org>
Date: Wed, 6 Dec 2023 14:40:48 -0800
Subject: bpf: Let bpf_prog_pack_free handle any pointer
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Currently, bpf_prog_pack_free only can only free pointer to struct
bpf_binary_header, which is not flexible. Add a size argument to
bpf_prog_pack_free so that it can handle any pointer.

Signed-off-by: Song Liu <song@kernel.org>
Acked-by: Ilya Leoshkevich <iii@linux.ibm.com>
Tested-by: Ilya Leoshkevich <iii@linux.ibm.com>  # on s390x
Reviewed-by: Björn Töpel <bjorn@rivosinc.com>
Acked-by: Jiri Olsa <jolsa@kernel.org>
Link: https://lore.kernel.org/r/20231206224054.492250-2-song@kernel.org
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/filter.h  |  2 +-
 kernel/bpf/core.c       | 21 ++++++++++-----------
 kernel/bpf/dispatcher.c |  5 +----
 3 files changed, 12 insertions(+), 16 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/filter.h b/include/linux/filter.h
index 14354605ad26..12d907f17d36 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -1067,7 +1067,7 @@ struct bpf_binary_header *
 bpf_jit_binary_pack_hdr(const struct bpf_prog *fp);
 
 void *bpf_prog_pack_alloc(u32 size, bpf_jit_fill_hole_t bpf_fill_ill_insns);
-void bpf_prog_pack_free(struct bpf_binary_header *hdr);
+void bpf_prog_pack_free(void *ptr, u32 size);
 
 static inline bool bpf_prog_kallsyms_verify_off(const struct bpf_prog *fp)
 {
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index ced511f44174..c34513d645c4 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -928,20 +928,20 @@ out:
 	return ptr;
 }
 
-void bpf_prog_pack_free(struct bpf_binary_header *hdr)
+void bpf_prog_pack_free(void *ptr, u32 size)
 {
 	struct bpf_prog_pack *pack = NULL, *tmp;
 	unsigned int nbits;
 	unsigned long pos;
 
 	mutex_lock(&pack_mutex);
-	if (hdr->size > BPF_PROG_PACK_SIZE) {
-		bpf_jit_free_exec(hdr);
+	if (size > BPF_PROG_PACK_SIZE) {
+		bpf_jit_free_exec(ptr);
 		goto out;
 	}
 
 	list_for_each_entry(tmp, &pack_list, list) {
-		if ((void *)hdr >= tmp->ptr && (tmp->ptr + BPF_PROG_PACK_SIZE) > (void *)hdr) {
+		if (ptr >= tmp->ptr && (tmp->ptr + BPF_PROG_PACK_SIZE) > ptr) {
 			pack = tmp;
 			break;
 		}
@@ -950,10 +950,10 @@ void bpf_prog_pack_free(struct bpf_binary_header *hdr)
 	if (WARN_ONCE(!pack, "bpf_prog_pack bug\n"))
 		goto out;
 
-	nbits = BPF_PROG_SIZE_TO_NBITS(hdr->size);
-	pos = ((unsigned long)hdr - (unsigned long)pack->ptr) >> BPF_PROG_CHUNK_SHIFT;
+	nbits = BPF_PROG_SIZE_TO_NBITS(size);
+	pos = ((unsigned long)ptr - (unsigned long)pack->ptr) >> BPF_PROG_CHUNK_SHIFT;
 
-	WARN_ONCE(bpf_arch_text_invalidate(hdr, hdr->size),
+	WARN_ONCE(bpf_arch_text_invalidate(ptr, size),
 		  "bpf_prog_pack bug: missing bpf_arch_text_invalidate?\n");
 
 	bitmap_clear(pack->bitmap, pos, nbits);
@@ -1100,8 +1100,7 @@ bpf_jit_binary_pack_alloc(unsigned int proglen, u8 **image_ptr,
 
 	*rw_header = kvmalloc(size, GFP_KERNEL);
 	if (!*rw_header) {
-		bpf_arch_text_copy(&ro_header->size, &size, sizeof(size));
-		bpf_prog_pack_free(ro_header);
+		bpf_prog_pack_free(ro_header, size);
 		bpf_jit_uncharge_modmem(size);
 		return NULL;
 	}
@@ -1132,7 +1131,7 @@ int bpf_jit_binary_pack_finalize(struct bpf_prog *prog,
 	kvfree(rw_header);
 
 	if (IS_ERR(ptr)) {
-		bpf_prog_pack_free(ro_header);
+		bpf_prog_pack_free(ro_header, ro_header->size);
 		return PTR_ERR(ptr);
 	}
 	return 0;
@@ -1153,7 +1152,7 @@ void bpf_jit_binary_pack_free(struct bpf_binary_header *ro_header,
 {
 	u32 size = ro_header->size;
 
-	bpf_prog_pack_free(ro_header);
+	bpf_prog_pack_free(ro_header, size);
 	kvfree(rw_header);
 	bpf_jit_uncharge_modmem(size);
 }
diff --git a/kernel/bpf/dispatcher.c b/kernel/bpf/dispatcher.c
index fa3e9225aedc..56760fc10e78 100644
--- a/kernel/bpf/dispatcher.c
+++ b/kernel/bpf/dispatcher.c
@@ -150,10 +150,7 @@ void bpf_dispatcher_change_prog(struct bpf_dispatcher *d, struct bpf_prog *from,
 			goto out;
 		d->rw_image = bpf_jit_alloc_exec(PAGE_SIZE);
 		if (!d->rw_image) {
-			u32 size = PAGE_SIZE;
-
-			bpf_arch_text_copy(d->image, &size, sizeof(size));
-			bpf_prog_pack_free((struct bpf_binary_header *)d->image);
+			bpf_prog_pack_free(d->image, PAGE_SIZE);
 			d->image = NULL;
 			goto out;
 		}
-- 
cgit v1.2.3


From 7a3d9a159b178e87306a6e989071ed9a114a1a31 Mon Sep 17 00:00:00 2001
From: Song Liu <song@kernel.org>
Date: Wed, 6 Dec 2023 14:40:49 -0800
Subject: bpf: Adjust argument names of arch_prepare_bpf_trampoline()

We are using "im" for "struct bpf_tramp_image" and "tr" for "struct
bpf_trampoline" in most of the code base. The only exception is the
prototype and fallback version of arch_prepare_bpf_trampoline(). Update
them to match the rest of the code base.

We mix "orig_call" and "func_addr" for the argument in different versions
of arch_prepare_bpf_trampoline(). s/orig_call/func_addr/g so they match.

Signed-off-by: Song Liu <song@kernel.org>
Acked-by: Ilya Leoshkevich <iii@linux.ibm.com>
Tested-by: Ilya Leoshkevich <iii@linux.ibm.com>  # on s390x
Acked-by: Jiri Olsa <jolsa@kernel.org>
Link: https://lore.kernel.org/r/20231206224054.492250-3-song@kernel.org
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 arch/arm64/net/bpf_jit_comp.c | 10 +++++-----
 include/linux/bpf.h           |  4 ++--
 kernel/bpf/trampoline.c       |  4 ++--
 3 files changed, 9 insertions(+), 9 deletions(-)

(limited to 'include/linux')

diff --git a/arch/arm64/net/bpf_jit_comp.c b/arch/arm64/net/bpf_jit_comp.c
index 7d4af64e3982..d81b886ea4df 100644
--- a/arch/arm64/net/bpf_jit_comp.c
+++ b/arch/arm64/net/bpf_jit_comp.c
@@ -1828,7 +1828,7 @@ static void restore_args(struct jit_ctx *ctx, int args_off, int nregs)
  *
  */
 static int prepare_trampoline(struct jit_ctx *ctx, struct bpf_tramp_image *im,
-			      struct bpf_tramp_links *tlinks, void *orig_call,
+			      struct bpf_tramp_links *tlinks, void *func_addr,
 			      int nregs, u32 flags)
 {
 	int i;
@@ -1926,7 +1926,7 @@ static int prepare_trampoline(struct jit_ctx *ctx, struct bpf_tramp_image *im,
 
 	if (flags & BPF_TRAMP_F_IP_ARG) {
 		/* save ip address of the traced function */
-		emit_addr_mov_i64(A64_R(10), (const u64)orig_call, ctx);
+		emit_addr_mov_i64(A64_R(10), (const u64)func_addr, ctx);
 		emit(A64_STR64I(A64_R(10), A64_SP, ip_off), ctx);
 	}
 
@@ -2029,7 +2029,7 @@ static int prepare_trampoline(struct jit_ctx *ctx, struct bpf_tramp_image *im,
 int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *image,
 				void *image_end, const struct btf_func_model *m,
 				u32 flags, struct bpf_tramp_links *tlinks,
-				void *orig_call)
+				void *func_addr)
 {
 	int i, ret;
 	int nregs = m->nr_args;
@@ -2050,7 +2050,7 @@ int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *image,
 	if (nregs > 8)
 		return -ENOTSUPP;
 
-	ret = prepare_trampoline(&ctx, im, tlinks, orig_call, nregs, flags);
+	ret = prepare_trampoline(&ctx, im, tlinks, func_addr, nregs, flags);
 	if (ret < 0)
 		return ret;
 
@@ -2061,7 +2061,7 @@ int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *image,
 	ctx.idx = 0;
 
 	jit_fill_hole(image, (unsigned int)(image_end - image));
-	ret = prepare_trampoline(&ctx, im, tlinks, orig_call, nregs, flags);
+	ret = prepare_trampoline(&ctx, im, tlinks, func_addr, nregs, flags);
 
 	if (ret > 0 && validate_code(&ctx) < 0)
 		ret = -EINVAL;
diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 7a483f6b6d5f..17eb6d905204 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -1098,10 +1098,10 @@ struct bpf_tramp_run_ctx;
  *      fexit = a set of program to run after original function
  */
 struct bpf_tramp_image;
-int arch_prepare_bpf_trampoline(struct bpf_tramp_image *tr, void *image, void *image_end,
+int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *image, void *image_end,
 				const struct btf_func_model *m, u32 flags,
 				struct bpf_tramp_links *tlinks,
-				void *orig_call);
+				void *func_addr);
 u64 notrace __bpf_prog_enter_sleepable_recur(struct bpf_prog *prog,
 					     struct bpf_tramp_run_ctx *run_ctx);
 void notrace __bpf_prog_exit_sleepable_recur(struct bpf_prog *prog, u64 start,
diff --git a/kernel/bpf/trampoline.c b/kernel/bpf/trampoline.c
index e97aeda3a86b..e114a1c7961e 100644
--- a/kernel/bpf/trampoline.c
+++ b/kernel/bpf/trampoline.c
@@ -1032,10 +1032,10 @@ bpf_trampoline_exit_t bpf_trampoline_exit(const struct bpf_prog *prog)
 }
 
 int __weak
-arch_prepare_bpf_trampoline(struct bpf_tramp_image *tr, void *image, void *image_end,
+arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *image, void *image_end,
 			    const struct btf_func_model *m, u32 flags,
 			    struct bpf_tramp_links *tlinks,
-			    void *orig_call)
+			    void *func_addr)
 {
 	return -ENOTSUPP;
 }
-- 
cgit v1.2.3


From 82583daa2efc2e336962b231a46bad03a280b3e0 Mon Sep 17 00:00:00 2001
From: Song Liu <song@kernel.org>
Date: Wed, 6 Dec 2023 14:40:50 -0800
Subject: bpf: Add helpers for trampoline image management

As BPF trampoline of different archs moves from bpf_jit_[alloc|free]_exec()
to bpf_prog_pack_[alloc|free](), we need to use different _alloc, _free for
different archs during the transition. Add the following helpers for this
transition:

void *arch_alloc_bpf_trampoline(unsigned int size);
void arch_free_bpf_trampoline(void *image, unsigned int size);
void arch_protect_bpf_trampoline(void *image, unsigned int size);
void arch_unprotect_bpf_trampoline(void *image, unsigned int size);

The fallback version of these helpers require size <= PAGE_SIZE, but they
are only called with size == PAGE_SIZE. They will be called with size <
PAGE_SIZE when arch_bpf_trampoline_size() helper is introduced later.

Signed-off-by: Song Liu <song@kernel.org>
Acked-by: Ilya Leoshkevich <iii@linux.ibm.com>
Tested-by: Ilya Leoshkevich <iii@linux.ibm.com>  # on s390x
Acked-by: Jiri Olsa <jolsa@kernel.org>
Link: https://lore.kernel.org/r/20231206224054.492250-4-song@kernel.org
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/bpf.h            |  5 +++++
 kernel/bpf/bpf_struct_ops.c    | 12 +++++------
 kernel/bpf/trampoline.c        | 46 +++++++++++++++++++++++++++++++++++-------
 net/bpf/bpf_dummy_struct_ops.c |  7 +++----
 4 files changed, 52 insertions(+), 18 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 17eb6d905204..b7fca151cf1b 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -1102,6 +1102,11 @@ int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *image, void *i
 				const struct btf_func_model *m, u32 flags,
 				struct bpf_tramp_links *tlinks,
 				void *func_addr);
+void *arch_alloc_bpf_trampoline(unsigned int size);
+void arch_free_bpf_trampoline(void *image, unsigned int size);
+void arch_protect_bpf_trampoline(void *image, unsigned int size);
+void arch_unprotect_bpf_trampoline(void *image, unsigned int size);
+
 u64 notrace __bpf_prog_enter_sleepable_recur(struct bpf_prog *prog,
 					     struct bpf_tramp_run_ctx *run_ctx);
 void notrace __bpf_prog_exit_sleepable_recur(struct bpf_prog *prog, u64 start,
diff --git a/kernel/bpf/bpf_struct_ops.c b/kernel/bpf/bpf_struct_ops.c
index db6176fb64dc..e9e95879bce2 100644
--- a/kernel/bpf/bpf_struct_ops.c
+++ b/kernel/bpf/bpf_struct_ops.c
@@ -515,7 +515,7 @@ static long bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key,
 			if (err)
 				goto reset_unlock;
 		}
-		set_memory_rox((long)st_map->image, 1);
+		arch_protect_bpf_trampoline(st_map->image, PAGE_SIZE);
 		/* Let bpf_link handle registration & unregistration.
 		 *
 		 * Pair with smp_load_acquire() during lookup_elem().
@@ -524,7 +524,7 @@ static long bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key,
 		goto unlock;
 	}
 
-	set_memory_rox((long)st_map->image, 1);
+	arch_protect_bpf_trampoline(st_map->image, PAGE_SIZE);
 	err = st_ops->reg(kdata);
 	if (likely(!err)) {
 		/* This refcnt increment on the map here after
@@ -547,8 +547,7 @@ static long bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key,
 	 * there was a race in registering the struct_ops (under the same name) to
 	 * a sub-system through different struct_ops's maps.
 	 */
-	set_memory_nx((long)st_map->image, 1);
-	set_memory_rw((long)st_map->image, 1);
+	arch_unprotect_bpf_trampoline(st_map->image, PAGE_SIZE);
 
 reset_unlock:
 	bpf_struct_ops_map_put_progs(st_map);
@@ -616,7 +615,7 @@ static void __bpf_struct_ops_map_free(struct bpf_map *map)
 		bpf_struct_ops_map_put_progs(st_map);
 	bpf_map_area_free(st_map->links);
 	if (st_map->image) {
-		bpf_jit_free_exec(st_map->image);
+		arch_free_bpf_trampoline(st_map->image, PAGE_SIZE);
 		bpf_jit_uncharge_modmem(PAGE_SIZE);
 	}
 	bpf_map_area_free(st_map->uvalue);
@@ -691,7 +690,7 @@ static struct bpf_map *bpf_struct_ops_map_alloc(union bpf_attr *attr)
 		return ERR_PTR(ret);
 	}
 
-	st_map->image = bpf_jit_alloc_exec(PAGE_SIZE);
+	st_map->image = arch_alloc_bpf_trampoline(PAGE_SIZE);
 	if (!st_map->image) {
 		/* __bpf_struct_ops_map_free() uses st_map->image as flag
 		 * for "charged or not". In this case, we need to unchange
@@ -711,7 +710,6 @@ static struct bpf_map *bpf_struct_ops_map_alloc(union bpf_attr *attr)
 	}
 
 	mutex_init(&st_map->lock);
-	set_vm_flush_reset_perms(st_map->image);
 	bpf_map_init_from_attr(map, attr);
 
 	return map;
diff --git a/kernel/bpf/trampoline.c b/kernel/bpf/trampoline.c
index e114a1c7961e..affbcbf7e76e 100644
--- a/kernel/bpf/trampoline.c
+++ b/kernel/bpf/trampoline.c
@@ -254,7 +254,7 @@ bpf_trampoline_get_progs(const struct bpf_trampoline *tr, int *total, bool *ip_a
 static void bpf_tramp_image_free(struct bpf_tramp_image *im)
 {
 	bpf_image_ksym_del(&im->ksym);
-	bpf_jit_free_exec(im->image);
+	arch_free_bpf_trampoline(im->image, PAGE_SIZE);
 	bpf_jit_uncharge_modmem(PAGE_SIZE);
 	percpu_ref_exit(&im->pcref);
 	kfree_rcu(im, rcu);
@@ -365,10 +365,9 @@ static struct bpf_tramp_image *bpf_tramp_image_alloc(u64 key)
 		goto out_free_im;
 
 	err = -ENOMEM;
-	im->image = image = bpf_jit_alloc_exec(PAGE_SIZE);
+	im->image = image = arch_alloc_bpf_trampoline(PAGE_SIZE);
 	if (!image)
 		goto out_uncharge;
-	set_vm_flush_reset_perms(image);
 
 	err = percpu_ref_init(&im->pcref, __bpf_tramp_image_release, 0, GFP_KERNEL);
 	if (err)
@@ -381,7 +380,7 @@ static struct bpf_tramp_image *bpf_tramp_image_alloc(u64 key)
 	return im;
 
 out_free_image:
-	bpf_jit_free_exec(im->image);
+	arch_free_bpf_trampoline(im->image, PAGE_SIZE);
 out_uncharge:
 	bpf_jit_uncharge_modmem(PAGE_SIZE);
 out_free_im:
@@ -444,7 +443,7 @@ again:
 	if (err < 0)
 		goto out_free;
 
-	set_memory_rox((long)im->image, 1);
+	arch_protect_bpf_trampoline(im->image, PAGE_SIZE);
 
 	WARN_ON(tr->cur_image && total == 0);
 	if (tr->cur_image)
@@ -465,8 +464,7 @@ again:
 		tr->fops->trampoline = 0;
 
 		/* reset im->image memory attr for arch_prepare_bpf_trampoline */
-		set_memory_nx((long)im->image, 1);
-		set_memory_rw((long)im->image, 1);
+		arch_unprotect_bpf_trampoline(im->image, PAGE_SIZE);
 		goto again;
 	}
 #endif
@@ -1040,6 +1038,40 @@ arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *image, void *image
 	return -ENOTSUPP;
 }
 
+void * __weak arch_alloc_bpf_trampoline(unsigned int size)
+{
+	void *image;
+
+	if (WARN_ON_ONCE(size > PAGE_SIZE))
+		return NULL;
+	image = bpf_jit_alloc_exec(PAGE_SIZE);
+	if (image)
+		set_vm_flush_reset_perms(image);
+	return image;
+}
+
+void __weak arch_free_bpf_trampoline(void *image, unsigned int size)
+{
+	WARN_ON_ONCE(size > PAGE_SIZE);
+	/* bpf_jit_free_exec doesn't need "size", but
+	 * bpf_prog_pack_free() needs it.
+	 */
+	bpf_jit_free_exec(image);
+}
+
+void __weak arch_protect_bpf_trampoline(void *image, unsigned int size)
+{
+	WARN_ON_ONCE(size > PAGE_SIZE);
+	set_memory_rox((long)image, 1);
+}
+
+void __weak arch_unprotect_bpf_trampoline(void *image, unsigned int size)
+{
+	WARN_ON_ONCE(size > PAGE_SIZE);
+	set_memory_nx((long)image, 1);
+	set_memory_rw((long)image, 1);
+}
+
 static int __init init_trampolines(void)
 {
 	int i;
diff --git a/net/bpf/bpf_dummy_struct_ops.c b/net/bpf/bpf_dummy_struct_ops.c
index 5918d1b32e19..2748f9d77b18 100644
--- a/net/bpf/bpf_dummy_struct_ops.c
+++ b/net/bpf/bpf_dummy_struct_ops.c
@@ -101,12 +101,11 @@ int bpf_struct_ops_test_run(struct bpf_prog *prog, const union bpf_attr *kattr,
 		goto out;
 	}
 
-	image = bpf_jit_alloc_exec(PAGE_SIZE);
+	image = arch_alloc_bpf_trampoline(PAGE_SIZE);
 	if (!image) {
 		err = -ENOMEM;
 		goto out;
 	}
-	set_vm_flush_reset_perms(image);
 
 	link = kzalloc(sizeof(*link), GFP_USER);
 	if (!link) {
@@ -124,7 +123,7 @@ int bpf_struct_ops_test_run(struct bpf_prog *prog, const union bpf_attr *kattr,
 	if (err < 0)
 		goto out;
 
-	set_memory_rox((long)image, 1);
+	arch_protect_bpf_trampoline(image, PAGE_SIZE);
 	prog_ret = dummy_ops_call_op(image, args);
 
 	err = dummy_ops_copy_args(args);
@@ -134,7 +133,7 @@ int bpf_struct_ops_test_run(struct bpf_prog *prog, const union bpf_attr *kattr,
 		err = -EFAULT;
 out:
 	kfree(args);
-	bpf_jit_free_exec(image);
+	arch_free_bpf_trampoline(image, PAGE_SIZE);
 	if (link)
 		bpf_link_put(&link->link);
 	kfree(tlinks);
-- 
cgit v1.2.3


From 96d1b7c081c0c96cbe8901045f4ff15a2e9974a2 Mon Sep 17 00:00:00 2001
From: Song Liu <song@kernel.org>
Date: Wed, 6 Dec 2023 14:40:52 -0800
Subject: bpf: Add arch_bpf_trampoline_size()
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This helper will be used to calculate the size of the trampoline before
allocating the memory.

arch_prepare_bpf_trampoline() for arm64 and riscv64 can use
arch_bpf_trampoline_size() to check the trampoline fits in the image.

OTOH, arch_prepare_bpf_trampoline() for s390 has to call the JIT process
twice, so it cannot use arch_bpf_trampoline_size().

Signed-off-by: Song Liu <song@kernel.org>
Acked-by: Ilya Leoshkevich <iii@linux.ibm.com>
Tested-by: Ilya Leoshkevich <iii@linux.ibm.com>  # on s390x
Acked-by: Jiri Olsa <jolsa@kernel.org>
Acked-by: Björn Töpel <bjorn@rivosinc.com>
Tested-by: Björn Töpel <bjorn@rivosinc.com> # on riscv
Link: https://lore.kernel.org/r/20231206224054.492250-6-song@kernel.org
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 arch/arm64/net/bpf_jit_comp.c   | 56 ++++++++++++++++++++++++++++++-----------
 arch/riscv/net/bpf_jit_comp64.c | 22 ++++++++++++----
 arch/s390/net/bpf_jit_comp.c    | 56 +++++++++++++++++++++++++----------------
 arch/x86/net/bpf_jit_comp.c     | 40 ++++++++++++++++++++++++++---
 include/linux/bpf.h             |  2 ++
 kernel/bpf/trampoline.c         |  6 +++++
 6 files changed, 136 insertions(+), 46 deletions(-)

(limited to 'include/linux')

diff --git a/arch/arm64/net/bpf_jit_comp.c b/arch/arm64/net/bpf_jit_comp.c
index d81b886ea4df..a6671253b7ed 100644
--- a/arch/arm64/net/bpf_jit_comp.c
+++ b/arch/arm64/net/bpf_jit_comp.c
@@ -2026,18 +2026,10 @@ static int prepare_trampoline(struct jit_ctx *ctx, struct bpf_tramp_image *im,
 	return ctx->idx;
 }
 
-int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *image,
-				void *image_end, const struct btf_func_model *m,
-				u32 flags, struct bpf_tramp_links *tlinks,
-				void *func_addr)
+static int btf_func_model_nregs(const struct btf_func_model *m)
 {
-	int i, ret;
 	int nregs = m->nr_args;
-	int max_insns = ((long)image_end - (long)image) / AARCH64_INSN_SIZE;
-	struct jit_ctx ctx = {
-		.image = NULL,
-		.idx = 0,
-	};
+	int i;
 
 	/* extra registers needed for struct argument */
 	for (i = 0; i < MAX_BPF_FUNC_ARGS; i++) {
@@ -2046,19 +2038,53 @@ int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *image,
 			nregs += (m->arg_size[i] + 7) / 8 - 1;
 	}
 
+	return nregs;
+}
+
+int arch_bpf_trampoline_size(const struct btf_func_model *m, u32 flags,
+			     struct bpf_tramp_links *tlinks, void *func_addr)
+{
+	struct jit_ctx ctx = {
+		.image = NULL,
+		.idx = 0,
+	};
+	struct bpf_tramp_image im;
+	int nregs, ret;
+
+	nregs = btf_func_model_nregs(m);
 	/* the first 8 registers are used for arguments */
 	if (nregs > 8)
 		return -ENOTSUPP;
 
-	ret = prepare_trampoline(&ctx, im, tlinks, func_addr, nregs, flags);
+	ret = prepare_trampoline(&ctx, &im, tlinks, func_addr, nregs, flags);
 	if (ret < 0)
 		return ret;
 
-	if (ret > max_insns)
-		return -EFBIG;
+	return ret < 0 ? ret : ret * AARCH64_INSN_SIZE;
+}
 
-	ctx.image = image;
-	ctx.idx = 0;
+int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *image,
+				void *image_end, const struct btf_func_model *m,
+				u32 flags, struct bpf_tramp_links *tlinks,
+				void *func_addr)
+{
+	int ret, nregs;
+	struct jit_ctx ctx = {
+		.image = image,
+		.idx = 0,
+	};
+
+	nregs = btf_func_model_nregs(m);
+	/* the first 8 registers are used for arguments */
+	if (nregs > 8)
+		return -ENOTSUPP;
+
+	ret = arch_bpf_trampoline_size(m, flags, tlinks, func_addr);
+	if (ret < 0)
+		return ret;
+
+	if (ret > ((long)image_end - (long)image))
+		return -EFBIG;
 
 	jit_fill_hole(image, (unsigned int)(image_end - image));
 	ret = prepare_trampoline(&ctx, im, tlinks, func_addr, nregs, flags);
diff --git a/arch/riscv/net/bpf_jit_comp64.c b/arch/riscv/net/bpf_jit_comp64.c
index 8581693e62d3..35747fafde57 100644
--- a/arch/riscv/net/bpf_jit_comp64.c
+++ b/arch/riscv/net/bpf_jit_comp64.c
@@ -1029,6 +1029,21 @@ out:
 	return ret;
 }
 
+int arch_bpf_trampoline_size(const struct btf_func_model *m, u32 flags,
+			     struct bpf_tramp_links *tlinks, void *func_addr)
+{
+	struct bpf_tramp_image im;
+	struct rv_jit_context ctx;
+	int ret;
+
+	ctx.ninsns = 0;
+	ctx.insns = NULL;
+	ctx.ro_insns = NULL;
+	ret = __arch_prepare_bpf_trampoline(&im, m, tlinks, func_addr, flags, &ctx);
+
+	return ret < 0 ? ret : ninsns_rvoff(ctx.ninsns);
+}
+
 int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *image,
 				void *image_end, const struct btf_func_model *m,
 				u32 flags, struct bpf_tramp_links *tlinks,
@@ -1037,14 +1052,11 @@ int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *image,
 	int ret;
 	struct rv_jit_context ctx;
 
-	ctx.ninsns = 0;
-	ctx.insns = NULL;
-	ctx.ro_insns = NULL;
-	ret = __arch_prepare_bpf_trampoline(im, m, tlinks, func_addr, flags, &ctx);
+	ret = arch_bpf_trampoline_size(im, m, flags, tlinks, func_addr);
 	if (ret < 0)
 		return ret;
 
-	if (ninsns_rvoff(ret) > (long)image_end - (long)image)
+	if (ret > (long)image_end - (long)image)
 		return -EFBIG;
 
 	ctx.ninsns = 0;
diff --git a/arch/s390/net/bpf_jit_comp.c b/arch/s390/net/bpf_jit_comp.c
index bf06b7283f0c..cc129617480a 100644
--- a/arch/s390/net/bpf_jit_comp.c
+++ b/arch/s390/net/bpf_jit_comp.c
@@ -2637,6 +2637,21 @@ static int __arch_prepare_bpf_trampoline(struct bpf_tramp_image *im,
 	return 0;
 }
 
+int arch_bpf_trampoline_size(const struct btf_func_model *m, u32 flags,
+			     struct bpf_tramp_links *tlinks, void *orig_call)
+{
+	struct bpf_tramp_image im;
+	struct bpf_tramp_jit tjit;
+	int ret;
+
+	memset(&tjit, 0, sizeof(tjit));
+
+	ret = __arch_prepare_bpf_trampoline(&im, &tjit, m, flags,
+					    tlinks, orig_call);
+
+	return ret < 0 ? ret : tjit.common.prg;
+}
+
 int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *image,
 				void *image_end, const struct btf_func_model *m,
 				u32 flags, struct bpf_tramp_links *tlinks,
@@ -2644,30 +2659,27 @@ int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *image,
 {
 	struct bpf_tramp_jit tjit;
 	int ret;
-	int i;
 
-	for (i = 0; i < 2; i++) {
-		if (i == 0) {
-			/* Compute offsets, check whether the code fits. */
-			memset(&tjit, 0, sizeof(tjit));
-		} else {
-			/* Generate the code. */
-			tjit.common.prg = 0;
-			tjit.common.prg_buf = image;
-		}
-		ret = __arch_prepare_bpf_trampoline(im, &tjit, m, flags,
-						    tlinks, func_addr);
-		if (ret < 0)
-			return ret;
-		if (tjit.common.prg > (char *)image_end - (char *)image)
-			/*
-			 * Use the same error code as for exceeding
-			 * BPF_MAX_TRAMP_LINKS.
-			 */
-			return -E2BIG;
-	}
+	/* Compute offsets, check whether the code fits. */
+	memset(&tjit, 0, sizeof(tjit));
+	ret = __arch_prepare_bpf_trampoline(im, &tjit, m, flags,
+					    tlinks, func_addr);
+
+	if (ret < 0)
+		return ret;
+	if (tjit.common.prg > (char *)image_end - (char *)image)
+		/*
+		 * Use the same error code as for exceeding
+		 * BPF_MAX_TRAMP_LINKS.
+		 */
+		return -E2BIG;
+
+	tjit.common.prg = 0;
+	tjit.common.prg_buf = image;
+	ret = __arch_prepare_bpf_trampoline(im, &tjit, m, flags,
+					    tlinks, func_addr);
 
-	return tjit.common.prg;
+	return ret < 0 ? ret : tjit.common.prg;
 }
 
 bool bpf_jit_supports_subprog_tailcalls(void)
diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c
index 5f7528cac344..5d75069fdcc2 100644
--- a/arch/x86/net/bpf_jit_comp.c
+++ b/arch/x86/net/bpf_jit_comp.c
@@ -2422,10 +2422,10 @@ static int invoke_bpf_mod_ret(const struct btf_func_model *m, u8 **pprog,
  * add rsp, 8                      // skip eth_type_trans's frame
  * ret                             // return to its caller
  */
-int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *image, void *image_end,
-				const struct btf_func_model *m, u32 flags,
-				struct bpf_tramp_links *tlinks,
-				void *func_addr)
+static int __arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *image, void *image_end,
+					 const struct btf_func_model *m, u32 flags,
+					 struct bpf_tramp_links *tlinks,
+					 void *func_addr)
 {
 	int i, ret, nr_regs = m->nr_args, stack_size = 0;
 	int regs_off, nregs_off, ip_off, run_ctx_off, arg_stack_off, rbx_off;
@@ -2678,6 +2678,38 @@ cleanup:
 	return ret;
 }
 
+int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *image, void *image_end,
+				const struct btf_func_model *m, u32 flags,
+				struct bpf_tramp_links *tlinks,
+				void *func_addr)
+{
+	return __arch_prepare_bpf_trampoline(im, image, image_end, m, flags, tlinks, func_addr);
+}
+
+int arch_bpf_trampoline_size(const struct btf_func_model *m, u32 flags,
+			     struct bpf_tramp_links *tlinks, void *func_addr)
+{
+	struct bpf_tramp_image im;
+	void *image;
+	int ret;
+
+	/* Allocate a temporary buffer for __arch_prepare_bpf_trampoline().
+	 * This will NOT cause fragmentation in direct map, as we do not
+	 * call set_memory_*() on this buffer.
+	 *
+	 * We cannot use kvmalloc here, because we need image to be in
+	 * module memory range.
+	 */
+	image = bpf_jit_alloc_exec(PAGE_SIZE);
+	if (!image)
+		return -ENOMEM;
+
+	ret = __arch_prepare_bpf_trampoline(&im, image, image + PAGE_SIZE, m, flags,
+					    tlinks, func_addr);
+	bpf_jit_free_exec(image);
+	return ret;
+}
+
 static int emit_bpf_dispatcher(u8 **pprog, int a, int b, s64 *progs, u8 *image, u8 *buf)
 {
 	u8 *jg_reloc, *prog = *pprog;
diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index b7fca151cf1b..2332ddeb396b 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -1106,6 +1106,8 @@ void *arch_alloc_bpf_trampoline(unsigned int size);
 void arch_free_bpf_trampoline(void *image, unsigned int size);
 void arch_protect_bpf_trampoline(void *image, unsigned int size);
 void arch_unprotect_bpf_trampoline(void *image, unsigned int size);
+int arch_bpf_trampoline_size(const struct btf_func_model *m, u32 flags,
+			     struct bpf_tramp_links *tlinks, void *func_addr);
 
 u64 notrace __bpf_prog_enter_sleepable_recur(struct bpf_prog *prog,
 					     struct bpf_tramp_run_ctx *run_ctx);
diff --git a/kernel/bpf/trampoline.c b/kernel/bpf/trampoline.c
index affbcbf7e76e..b553cbd89e55 100644
--- a/kernel/bpf/trampoline.c
+++ b/kernel/bpf/trampoline.c
@@ -1072,6 +1072,12 @@ void __weak arch_unprotect_bpf_trampoline(void *image, unsigned int size)
 	set_memory_rw((long)image, 1);
 }
 
+int __weak arch_bpf_trampoline_size(const struct btf_func_model *m, u32 flags,
+				    struct bpf_tramp_links *tlinks, void *func_addr)
+{
+	return -ENOTSUPP;
+}
+
 static int __init init_trampolines(void)
 {
 	int i;
-- 
cgit v1.2.3


From 26ef208c209a0e6eed8942a5d191b39dccfa6e38 Mon Sep 17 00:00:00 2001
From: Song Liu <song@kernel.org>
Date: Wed, 6 Dec 2023 14:40:53 -0800
Subject: bpf: Use arch_bpf_trampoline_size
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Instead of blindly allocating PAGE_SIZE for each trampoline, check the size
of the trampoline with arch_bpf_trampoline_size(). This size is saved in
bpf_tramp_image->size, and used for modmem charge/uncharge. The fallback
arch_alloc_bpf_trampoline() still allocates a whole page because we need to
use set_memory_* to protect the memory.

struct_ops trampoline still uses a whole page for multiple trampolines.

With this size check at caller (regular trampoline and struct_ops
trampoline), remove arch_bpf_trampoline_size() from
arch_prepare_bpf_trampoline() in archs.

Also, update bpf_image_ksym_add() to handle symbol of different sizes.

Signed-off-by: Song Liu <song@kernel.org>
Acked-by: Ilya Leoshkevich <iii@linux.ibm.com>
Tested-by: Ilya Leoshkevich <iii@linux.ibm.com>  # on s390x
Acked-by: Jiri Olsa <jolsa@kernel.org>
Acked-by: Björn Töpel <bjorn@rivosinc.com>
Tested-by: Björn Töpel <bjorn@rivosinc.com> # on riscv
Link: https://lore.kernel.org/r/20231206224054.492250-7-song@kernel.org
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 arch/arm64/net/bpf_jit_comp.c   |  7 ------
 arch/riscv/net/bpf_jit_comp64.c |  7 ------
 include/linux/bpf.h             |  3 ++-
 kernel/bpf/bpf_struct_ops.c     |  7 ++++++
 kernel/bpf/dispatcher.c         |  2 +-
 kernel/bpf/trampoline.c         | 55 +++++++++++++++++++++++++----------------
 6 files changed, 44 insertions(+), 37 deletions(-)

(limited to 'include/linux')

diff --git a/arch/arm64/net/bpf_jit_comp.c b/arch/arm64/net/bpf_jit_comp.c
index a6671253b7ed..8955da5c47cf 100644
--- a/arch/arm64/net/bpf_jit_comp.c
+++ b/arch/arm64/net/bpf_jit_comp.c
@@ -2079,13 +2079,6 @@ int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *image,
 	if (nregs > 8)
 		return -ENOTSUPP;
 
-	ret = arch_bpf_trampoline_size(m, flags, tlinks, func_addr);
-	if (ret < 0)
-		return ret;
-
-	if (ret > ((long)image_end - (long)image))
-		return -EFBIG;
-
 	jit_fill_hole(image, (unsigned int)(image_end - image));
 	ret = prepare_trampoline(&ctx, im, tlinks, func_addr, nregs, flags);
 
diff --git a/arch/riscv/net/bpf_jit_comp64.c b/arch/riscv/net/bpf_jit_comp64.c
index 35747fafde57..58dc64dd94a8 100644
--- a/arch/riscv/net/bpf_jit_comp64.c
+++ b/arch/riscv/net/bpf_jit_comp64.c
@@ -1052,13 +1052,6 @@ int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *image,
 	int ret;
 	struct rv_jit_context ctx;
 
-	ret = arch_bpf_trampoline_size(im, m, flags, tlinks, func_addr);
-	if (ret < 0)
-		return ret;
-
-	if (ret > (long)image_end - (long)image)
-		return -EFBIG;
-
 	ctx.ninsns = 0;
 	/*
 	 * The bpf_int_jit_compile() uses a RW buffer (ctx.insns) to write the
diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 2332ddeb396b..c1a06263a4f3 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -1141,6 +1141,7 @@ enum bpf_tramp_prog_type {
 
 struct bpf_tramp_image {
 	void *image;
+	int size;
 	struct bpf_ksym ksym;
 	struct percpu_ref pcref;
 	void *ip_after_call;
@@ -1325,7 +1326,7 @@ int arch_prepare_bpf_dispatcher(void *image, void *buf, s64 *funcs, int num_func
 void bpf_dispatcher_change_prog(struct bpf_dispatcher *d, struct bpf_prog *from,
 				struct bpf_prog *to);
 /* Called only from JIT-enabled code, so there's no need for stubs. */
-void bpf_image_ksym_add(void *data, struct bpf_ksym *ksym);
+void bpf_image_ksym_add(void *data, unsigned int size, struct bpf_ksym *ksym);
 void bpf_image_ksym_del(struct bpf_ksym *ksym);
 void bpf_ksym_add(struct bpf_ksym *ksym);
 void bpf_ksym_del(struct bpf_ksym *ksym);
diff --git a/kernel/bpf/bpf_struct_ops.c b/kernel/bpf/bpf_struct_ops.c
index e9e95879bce2..4d53c53fc5aa 100644
--- a/kernel/bpf/bpf_struct_ops.c
+++ b/kernel/bpf/bpf_struct_ops.c
@@ -355,6 +355,7 @@ int bpf_struct_ops_prepare_trampoline(struct bpf_tramp_links *tlinks,
 				      void *image, void *image_end)
 {
 	u32 flags;
+	int size;
 
 	tlinks[BPF_TRAMP_FENTRY].links[0] = link;
 	tlinks[BPF_TRAMP_FENTRY].nr_links = 1;
@@ -362,6 +363,12 @@ int bpf_struct_ops_prepare_trampoline(struct bpf_tramp_links *tlinks,
 	 * and it must be used alone.
 	 */
 	flags = model->ret_size > 0 ? BPF_TRAMP_F_RET_FENTRY_RET : 0;
+
+	size = arch_bpf_trampoline_size(model, flags, tlinks, NULL);
+	if (size < 0)
+		return size;
+	if (size > (unsigned long)image_end - (unsigned long)image)
+		return -E2BIG;
 	return arch_prepare_bpf_trampoline(NULL, image, image_end,
 					   model, flags, tlinks, NULL);
 }
diff --git a/kernel/bpf/dispatcher.c b/kernel/bpf/dispatcher.c
index 56760fc10e78..70fb82bf1637 100644
--- a/kernel/bpf/dispatcher.c
+++ b/kernel/bpf/dispatcher.c
@@ -154,7 +154,7 @@ void bpf_dispatcher_change_prog(struct bpf_dispatcher *d, struct bpf_prog *from,
 			d->image = NULL;
 			goto out;
 		}
-		bpf_image_ksym_add(d->image, &d->ksym);
+		bpf_image_ksym_add(d->image, PAGE_SIZE, &d->ksym);
 	}
 
 	prev_num_progs = d->num_progs;
diff --git a/kernel/bpf/trampoline.c b/kernel/bpf/trampoline.c
index b553cbd89e55..d382f5ebe06c 100644
--- a/kernel/bpf/trampoline.c
+++ b/kernel/bpf/trampoline.c
@@ -115,10 +115,10 @@ bool bpf_prog_has_trampoline(const struct bpf_prog *prog)
 		(ptype == BPF_PROG_TYPE_LSM && eatype == BPF_LSM_MAC);
 }
 
-void bpf_image_ksym_add(void *data, struct bpf_ksym *ksym)
+void bpf_image_ksym_add(void *data, unsigned int size, struct bpf_ksym *ksym)
 {
 	ksym->start = (unsigned long) data;
-	ksym->end = ksym->start + PAGE_SIZE;
+	ksym->end = ksym->start + size;
 	bpf_ksym_add(ksym);
 	perf_event_ksymbol(PERF_RECORD_KSYMBOL_TYPE_BPF, ksym->start,
 			   PAGE_SIZE, false, ksym->name);
@@ -254,8 +254,8 @@ bpf_trampoline_get_progs(const struct bpf_trampoline *tr, int *total, bool *ip_a
 static void bpf_tramp_image_free(struct bpf_tramp_image *im)
 {
 	bpf_image_ksym_del(&im->ksym);
-	arch_free_bpf_trampoline(im->image, PAGE_SIZE);
-	bpf_jit_uncharge_modmem(PAGE_SIZE);
+	arch_free_bpf_trampoline(im->image, im->size);
+	bpf_jit_uncharge_modmem(im->size);
 	percpu_ref_exit(&im->pcref);
 	kfree_rcu(im, rcu);
 }
@@ -349,7 +349,7 @@ static void bpf_tramp_image_put(struct bpf_tramp_image *im)
 	call_rcu_tasks_trace(&im->rcu, __bpf_tramp_image_put_rcu_tasks);
 }
 
-static struct bpf_tramp_image *bpf_tramp_image_alloc(u64 key)
+static struct bpf_tramp_image *bpf_tramp_image_alloc(u64 key, int size)
 {
 	struct bpf_tramp_image *im;
 	struct bpf_ksym *ksym;
@@ -360,12 +360,13 @@ static struct bpf_tramp_image *bpf_tramp_image_alloc(u64 key)
 	if (!im)
 		goto out;
 
-	err = bpf_jit_charge_modmem(PAGE_SIZE);
+	err = bpf_jit_charge_modmem(size);
 	if (err)
 		goto out_free_im;
+	im->size = size;
 
 	err = -ENOMEM;
-	im->image = image = arch_alloc_bpf_trampoline(PAGE_SIZE);
+	im->image = image = arch_alloc_bpf_trampoline(size);
 	if (!image)
 		goto out_uncharge;
 
@@ -376,13 +377,13 @@ static struct bpf_tramp_image *bpf_tramp_image_alloc(u64 key)
 	ksym = &im->ksym;
 	INIT_LIST_HEAD_RCU(&ksym->lnode);
 	snprintf(ksym->name, KSYM_NAME_LEN, "bpf_trampoline_%llu", key);
-	bpf_image_ksym_add(image, ksym);
+	bpf_image_ksym_add(image, size, ksym);
 	return im;
 
 out_free_image:
-	arch_free_bpf_trampoline(im->image, PAGE_SIZE);
+	arch_free_bpf_trampoline(im->image, im->size);
 out_uncharge:
-	bpf_jit_uncharge_modmem(PAGE_SIZE);
+	bpf_jit_uncharge_modmem(size);
 out_free_im:
 	kfree(im);
 out:
@@ -395,7 +396,7 @@ static int bpf_trampoline_update(struct bpf_trampoline *tr, bool lock_direct_mut
 	struct bpf_tramp_links *tlinks;
 	u32 orig_flags = tr->flags;
 	bool ip_arg = false;
-	int err, total;
+	int err, total, size;
 
 	tlinks = bpf_trampoline_get_progs(tr, &total, &ip_arg);
 	if (IS_ERR(tlinks))
@@ -408,12 +409,6 @@ static int bpf_trampoline_update(struct bpf_trampoline *tr, bool lock_direct_mut
 		goto out;
 	}
 
-	im = bpf_tramp_image_alloc(tr->key);
-	if (IS_ERR(im)) {
-		err = PTR_ERR(im);
-		goto out;
-	}
-
 	/* clear all bits except SHARE_IPMODIFY and TAIL_CALL_CTX */
 	tr->flags &= (BPF_TRAMP_F_SHARE_IPMODIFY | BPF_TRAMP_F_TAIL_CALL_CTX);
 
@@ -437,13 +432,31 @@ again:
 		tr->flags |= BPF_TRAMP_F_ORIG_STACK;
 #endif
 
-	err = arch_prepare_bpf_trampoline(im, im->image, im->image + PAGE_SIZE,
+	size = arch_bpf_trampoline_size(&tr->func.model, tr->flags,
+					tlinks, tr->func.addr);
+	if (size < 0) {
+		err = size;
+		goto out;
+	}
+
+	if (size > PAGE_SIZE) {
+		err = -E2BIG;
+		goto out;
+	}
+
+	im = bpf_tramp_image_alloc(tr->key, size);
+	if (IS_ERR(im)) {
+		err = PTR_ERR(im);
+		goto out;
+	}
+
+	err = arch_prepare_bpf_trampoline(im, im->image, im->image + size,
 					  &tr->func.model, tr->flags, tlinks,
 					  tr->func.addr);
 	if (err < 0)
 		goto out_free;
 
-	arch_protect_bpf_trampoline(im->image, PAGE_SIZE);
+	arch_protect_bpf_trampoline(im->image, im->size);
 
 	WARN_ON(tr->cur_image && total == 0);
 	if (tr->cur_image)
@@ -463,8 +476,8 @@ again:
 		tr->fops->func = NULL;
 		tr->fops->trampoline = 0;
 
-		/* reset im->image memory attr for arch_prepare_bpf_trampoline */
-		arch_unprotect_bpf_trampoline(im->image, PAGE_SIZE);
+		/* free im memory and reallocate later */
+		bpf_tramp_image_free(im);
 		goto again;
 	}
 #endif
-- 
cgit v1.2.3


From f922b16aa5fad7284e2b7fd7c22bab13c0e418b6 Mon Sep 17 00:00:00 2001
From: Jay Buddhabhatti <jay.buddhabhatti@amd.com>
Date: Wed, 29 Nov 2023 03:27:09 -0800
Subject: firmware: xilinx: Update firmware call interface to support
 additional args

System-level platform management layer (do_fw_call()) has support for
maximum of 5 arguments as of now (1 EEMI API ID + 4 command arguments).
In order to support new EEMI PM_IOCTL IDs (Secure Read/Write), this
support must be extended to support one additional argument, which
results in a configuration of - 1 EEMI API ID + 5 command arguments.

Update zynqmp_pm_invoke_fn() and do_fw_call() with this new definition
containing variable arguments. As a result, update all the references
to pm invoke function with the updated definition.

Co-developed-by: Izhar Ameer Shaikh <izhar.ameer.shaikh@amd.com>
Signed-off-by: Izhar Ameer Shaikh <izhar.ameer.shaikh@amd.com>
Signed-off-by: Jay Buddhabhatti <jay.buddhabhatti@amd.com>
Link: https://lore.kernel.org/r/20231129112713.22718-2-jay.buddhabhatti@amd.com
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/firmware/xilinx/zynqmp.c        | 278 +++++++++++++++-----------------
 drivers/soc/xilinx/xlnx_event_manager.c |   2 +-
 drivers/soc/xilinx/zynqmp_power.c       |   2 +-
 include/linux/firmware/xlnx-zynqmp.h    |   3 +-
 4 files changed, 132 insertions(+), 153 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/firmware/xilinx/zynqmp.c b/drivers/firmware/xilinx/zynqmp.c
index 7156b4884b41..2a7fd3fbac8c 100644
--- a/drivers/firmware/xilinx/zynqmp.c
+++ b/drivers/firmware/xilinx/zynqmp.c
@@ -106,8 +106,7 @@ static int zynqmp_pm_ret_code(u32 ret_status)
 	}
 }
 
-static noinline int do_fw_call_fail(u64 arg0, u64 arg1, u64 arg2,
-				    u32 *ret_payload)
+static noinline int do_fw_call_fail(u32 *ret_payload, u32 num_args, ...)
 {
 	return -ENODEV;
 }
@@ -116,25 +115,35 @@ static noinline int do_fw_call_fail(u64 arg0, u64 arg1, u64 arg2,
  * PM function call wrapper
  * Invoke do_fw_call_smc or do_fw_call_hvc, depending on the configuration
  */
-static int (*do_fw_call)(u64, u64, u64, u32 *ret_payload) = do_fw_call_fail;
+static int (*do_fw_call)(u32 *ret_payload, u32, ...) = do_fw_call_fail;
 
 /**
  * do_fw_call_smc() - Call system-level platform management layer (SMC)
- * @arg0:		Argument 0 to SMC call
- * @arg1:		Argument 1 to SMC call
- * @arg2:		Argument 2 to SMC call
+ * @num_args:		Number of variable arguments should be <= 8
  * @ret_payload:	Returned value array
  *
  * Invoke platform management function via SMC call (no hypervisor present).
  *
  * Return: Returns status, either success or error+reason
  */
-static noinline int do_fw_call_smc(u64 arg0, u64 arg1, u64 arg2,
-				   u32 *ret_payload)
+static noinline int do_fw_call_smc(u32 *ret_payload, u32 num_args, ...)
 {
 	struct arm_smccc_res res;
+	u64 args[8] = {0};
+	va_list arg_list;
+	u8 i;
 
-	arm_smccc_smc(arg0, arg1, arg2, 0, 0, 0, 0, 0, &res);
+	if (num_args > 8)
+		return -EINVAL;
+
+	va_start(arg_list, num_args);
+
+	for (i = 0; i < num_args; i++)
+		args[i] = va_arg(arg_list, u64);
+
+	va_end(arg_list);
+
+	arm_smccc_smc(args[0], args[1], args[2], args[3], args[4], args[5], args[6], args[7], &res);
 
 	if (ret_payload) {
 		ret_payload[0] = lower_32_bits(res.a0);
@@ -148,9 +157,7 @@ static noinline int do_fw_call_smc(u64 arg0, u64 arg1, u64 arg2,
 
 /**
  * do_fw_call_hvc() - Call system-level platform management layer (HVC)
- * @arg0:		Argument 0 to HVC call
- * @arg1:		Argument 1 to HVC call
- * @arg2:		Argument 2 to HVC call
+ * @num_args:		Number of variable arguments should be <= 8
  * @ret_payload:	Returned value array
  *
  * Invoke platform management function via HVC
@@ -159,12 +166,24 @@ static noinline int do_fw_call_smc(u64 arg0, u64 arg1, u64 arg2,
  *
  * Return: Returns status, either success or error+reason
  */
-static noinline int do_fw_call_hvc(u64 arg0, u64 arg1, u64 arg2,
-				   u32 *ret_payload)
+static noinline int do_fw_call_hvc(u32 *ret_payload, u32 num_args, ...)
 {
 	struct arm_smccc_res res;
+	u64 args[8] = {0};
+	va_list arg_list;
+	u8 i;
+
+	if (num_args > 8)
+		return -EINVAL;
+
+	va_start(arg_list, num_args);
+
+	for (i = 0; i < num_args; i++)
+		args[i] = va_arg(arg_list, u64);
 
-	arm_smccc_hvc(arg0, arg1, arg2, 0, 0, 0, 0, 0, &res);
+	va_end(arg_list);
+
+	arm_smccc_hvc(args[0], args[1], args[2], args[3], args[4], args[5], args[6], args[7], &res);
 
 	if (ret_payload) {
 		ret_payload[0] = lower_32_bits(res.a0);
@@ -184,7 +203,7 @@ static int __do_feature_check_call(const u32 api_id, u32 *ret_payload)
 	smc_arg[0] = PM_SIP_SVC | PM_FEATURE_CHECK;
 	smc_arg[1] = api_id;
 
-	ret = do_fw_call(smc_arg[0], smc_arg[1], 0, ret_payload);
+	ret = do_fw_call(ret_payload, 2, smc_arg[0], smc_arg[1]);
 	if (ret)
 		ret = -EOPNOTSUPP;
 	else
@@ -295,11 +314,8 @@ EXPORT_SYMBOL_GPL(zynqmp_pm_is_function_supported);
  * zynqmp_pm_invoke_fn() - Invoke the system-level platform management layer
  *			   caller function depending on the configuration
  * @pm_api_id:		Requested PM-API call
- * @arg0:		Argument 0 to requested PM-API call
- * @arg1:		Argument 1 to requested PM-API call
- * @arg2:		Argument 2 to requested PM-API call
- * @arg3:		Argument 3 to requested PM-API call
  * @ret_payload:	Returned value array
+ * @num_args:		Number of arguments to requested PM-API call
  *
  * Invoke platform management function for SMC or HVC call, depending on
  * configuration.
@@ -316,26 +332,38 @@ EXPORT_SYMBOL_GPL(zynqmp_pm_is_function_supported);
  *
  * Return: Returns status, either success or error+reason
  */
-int zynqmp_pm_invoke_fn(u32 pm_api_id, u32 arg0, u32 arg1,
-			u32 arg2, u32 arg3, u32 *ret_payload)
+int zynqmp_pm_invoke_fn(u32 pm_api_id, u32 *ret_payload, u32 num_args, ...)
 {
 	/*
 	 * Added SIP service call Function Identifier
 	 * Make sure to stay in x0 register
 	 */
-	u64 smc_arg[4];
-	int ret;
+	u64 smc_arg[8];
+	int ret, i;
+	va_list arg_list;
+	u32 args[14] = {0};
+
+	if (num_args > 14)
+		return -EINVAL;
+
+	va_start(arg_list, num_args);
 
 	/* Check if feature is supported or not */
 	ret = zynqmp_pm_feature(pm_api_id);
 	if (ret < 0)
 		return ret;
 
+	for (i = 0; i < num_args; i++)
+		args[i] = va_arg(arg_list, u32);
+
+	va_end(arg_list);
+
 	smc_arg[0] = PM_SIP_SVC | pm_api_id;
-	smc_arg[1] = ((u64)arg1 << 32) | arg0;
-	smc_arg[2] = ((u64)arg3 << 32) | arg2;
+	for (i = 0; i < 7; i++)
+		smc_arg[i + 1] = ((u64)args[(i * 2) + 1] << 32) | args[i * 2];
 
-	return do_fw_call(smc_arg[0], smc_arg[1], smc_arg[2], ret_payload);
+	return do_fw_call(ret_payload, 8, smc_arg[0], smc_arg[1], smc_arg[2], smc_arg[3],
+			  smc_arg[4], smc_arg[5], smc_arg[6], smc_arg[7]);
 }
 
 static u32 pm_api_version;
@@ -347,14 +375,12 @@ int zynqmp_pm_register_sgi(u32 sgi_num, u32 reset)
 {
 	int ret;
 
-	ret = zynqmp_pm_invoke_fn(TF_A_PM_REGISTER_SGI, sgi_num, reset, 0, 0,
-				  NULL);
+	ret = zynqmp_pm_invoke_fn(TF_A_PM_REGISTER_SGI, NULL, 2, sgi_num, reset);
 	if (!ret)
 		return ret;
 
 	/* try old implementation as fallback strategy if above fails */
-	return zynqmp_pm_invoke_fn(PM_IOCTL, 0, IOCTL_REGISTER_SGI, sgi_num,
-				   reset, NULL);
+	return zynqmp_pm_invoke_fn(PM_IOCTL, NULL, 3, IOCTL_REGISTER_SGI, sgi_num, reset);
 }
 
 /**
@@ -376,7 +402,7 @@ int zynqmp_pm_get_api_version(u32 *version)
 		*version = pm_api_version;
 		return 0;
 	}
-	ret = zynqmp_pm_invoke_fn(PM_GET_API_VERSION, 0, 0, 0, 0, ret_payload);
+	ret = zynqmp_pm_invoke_fn(PM_GET_API_VERSION, ret_payload, 0);
 	*version = ret_payload[1];
 
 	return ret;
@@ -399,7 +425,7 @@ int zynqmp_pm_get_chipid(u32 *idcode, u32 *version)
 	if (!idcode || !version)
 		return -EINVAL;
 
-	ret = zynqmp_pm_invoke_fn(PM_GET_CHIPID, 0, 0, 0, 0, ret_payload);
+	ret = zynqmp_pm_invoke_fn(PM_GET_CHIPID, ret_payload, 0);
 	*idcode = ret_payload[1];
 	*version = ret_payload[2];
 
@@ -427,7 +453,7 @@ static int zynqmp_pm_get_family_info(u32 *family, u32 *subfamily)
 		return 0;
 	}
 
-	ret = zynqmp_pm_invoke_fn(PM_GET_CHIPID, 0, 0, 0, 0, ret_payload);
+	ret = zynqmp_pm_invoke_fn(PM_GET_CHIPID, ret_payload, 0);
 	if (ret < 0)
 		return ret;
 
@@ -459,8 +485,7 @@ static int zynqmp_pm_get_trustzone_version(u32 *version)
 		*version = pm_tz_version;
 		return 0;
 	}
-	ret = zynqmp_pm_invoke_fn(PM_GET_TRUSTZONE_VERSION, 0, 0,
-				  0, 0, ret_payload);
+	ret = zynqmp_pm_invoke_fn(PM_GET_TRUSTZONE_VERSION, ret_payload, 0);
 	*version = ret_payload[1];
 
 	return ret;
@@ -507,8 +532,8 @@ int zynqmp_pm_query_data(struct zynqmp_pm_query_data qdata, u32 *out)
 {
 	int ret;
 
-	ret = zynqmp_pm_invoke_fn(PM_QUERY_DATA, qdata.qid, qdata.arg1,
-				  qdata.arg2, qdata.arg3, out);
+	ret = zynqmp_pm_invoke_fn(PM_QUERY_DATA, out, 4, qdata.qid, qdata.arg1, qdata.arg2,
+				  qdata.arg3);
 
 	/*
 	 * For clock name query, all bytes in SMC response are clock name
@@ -530,7 +555,7 @@ EXPORT_SYMBOL_GPL(zynqmp_pm_query_data);
  */
 int zynqmp_pm_clock_enable(u32 clock_id)
 {
-	return zynqmp_pm_invoke_fn(PM_CLOCK_ENABLE, clock_id, 0, 0, 0, NULL);
+	return zynqmp_pm_invoke_fn(PM_CLOCK_ENABLE, NULL, 1, clock_id);
 }
 EXPORT_SYMBOL_GPL(zynqmp_pm_clock_enable);
 
@@ -545,7 +570,7 @@ EXPORT_SYMBOL_GPL(zynqmp_pm_clock_enable);
  */
 int zynqmp_pm_clock_disable(u32 clock_id)
 {
-	return zynqmp_pm_invoke_fn(PM_CLOCK_DISABLE, clock_id, 0, 0, 0, NULL);
+	return zynqmp_pm_invoke_fn(PM_CLOCK_DISABLE, NULL, 1, clock_id);
 }
 EXPORT_SYMBOL_GPL(zynqmp_pm_clock_disable);
 
@@ -564,8 +589,7 @@ int zynqmp_pm_clock_getstate(u32 clock_id, u32 *state)
 	u32 ret_payload[PAYLOAD_ARG_CNT];
 	int ret;
 
-	ret = zynqmp_pm_invoke_fn(PM_CLOCK_GETSTATE, clock_id, 0,
-				  0, 0, ret_payload);
+	ret = zynqmp_pm_invoke_fn(PM_CLOCK_GETSTATE, ret_payload, 1, clock_id);
 	*state = ret_payload[1];
 
 	return ret;
@@ -584,8 +608,7 @@ EXPORT_SYMBOL_GPL(zynqmp_pm_clock_getstate);
  */
 int zynqmp_pm_clock_setdivider(u32 clock_id, u32 divider)
 {
-	return zynqmp_pm_invoke_fn(PM_CLOCK_SETDIVIDER, clock_id, divider,
-				   0, 0, NULL);
+	return zynqmp_pm_invoke_fn(PM_CLOCK_SETDIVIDER, NULL, 2, clock_id, divider);
 }
 EXPORT_SYMBOL_GPL(zynqmp_pm_clock_setdivider);
 
@@ -604,8 +627,7 @@ int zynqmp_pm_clock_getdivider(u32 clock_id, u32 *divider)
 	u32 ret_payload[PAYLOAD_ARG_CNT];
 	int ret;
 
-	ret = zynqmp_pm_invoke_fn(PM_CLOCK_GETDIVIDER, clock_id, 0,
-				  0, 0, ret_payload);
+	ret = zynqmp_pm_invoke_fn(PM_CLOCK_GETDIVIDER, ret_payload, 1, clock_id);
 	*divider = ret_payload[1];
 
 	return ret;
@@ -623,10 +645,8 @@ EXPORT_SYMBOL_GPL(zynqmp_pm_clock_getdivider);
  */
 int zynqmp_pm_clock_setrate(u32 clock_id, u64 rate)
 {
-	return zynqmp_pm_invoke_fn(PM_CLOCK_SETRATE, clock_id,
-				   lower_32_bits(rate),
-				   upper_32_bits(rate),
-				   0, NULL);
+	return zynqmp_pm_invoke_fn(PM_CLOCK_SETRATE, NULL, 3, clock_id, lower_32_bits(rate),
+				   upper_32_bits(rate));
 }
 EXPORT_SYMBOL_GPL(zynqmp_pm_clock_setrate);
 
@@ -645,8 +665,7 @@ int zynqmp_pm_clock_getrate(u32 clock_id, u64 *rate)
 	u32 ret_payload[PAYLOAD_ARG_CNT];
 	int ret;
 
-	ret = zynqmp_pm_invoke_fn(PM_CLOCK_GETRATE, clock_id, 0,
-				  0, 0, ret_payload);
+	ret = zynqmp_pm_invoke_fn(PM_CLOCK_GETRATE, ret_payload, 1, clock_id);
 	*rate = ((u64)ret_payload[2] << 32) | ret_payload[1];
 
 	return ret;
@@ -664,8 +683,7 @@ EXPORT_SYMBOL_GPL(zynqmp_pm_clock_getrate);
  */
 int zynqmp_pm_clock_setparent(u32 clock_id, u32 parent_id)
 {
-	return zynqmp_pm_invoke_fn(PM_CLOCK_SETPARENT, clock_id,
-				   parent_id, 0, 0, NULL);
+	return zynqmp_pm_invoke_fn(PM_CLOCK_SETPARENT, NULL, 2, clock_id, parent_id);
 }
 EXPORT_SYMBOL_GPL(zynqmp_pm_clock_setparent);
 
@@ -684,8 +702,7 @@ int zynqmp_pm_clock_getparent(u32 clock_id, u32 *parent_id)
 	u32 ret_payload[PAYLOAD_ARG_CNT];
 	int ret;
 
-	ret = zynqmp_pm_invoke_fn(PM_CLOCK_GETPARENT, clock_id, 0,
-				  0, 0, ret_payload);
+	ret = zynqmp_pm_invoke_fn(PM_CLOCK_GETPARENT, ret_payload, 1, clock_id);
 	*parent_id = ret_payload[1];
 
 	return ret;
@@ -704,8 +721,7 @@ EXPORT_SYMBOL_GPL(zynqmp_pm_clock_getparent);
  */
 int zynqmp_pm_set_pll_frac_mode(u32 clk_id, u32 mode)
 {
-	return zynqmp_pm_invoke_fn(PM_IOCTL, 0, IOCTL_SET_PLL_FRAC_MODE,
-				   clk_id, mode, NULL);
+	return zynqmp_pm_invoke_fn(PM_IOCTL, NULL, 4, 0, IOCTL_SET_PLL_FRAC_MODE, clk_id, mode);
 }
 EXPORT_SYMBOL_GPL(zynqmp_pm_set_pll_frac_mode);
 
@@ -721,8 +737,7 @@ EXPORT_SYMBOL_GPL(zynqmp_pm_set_pll_frac_mode);
  */
 int zynqmp_pm_get_pll_frac_mode(u32 clk_id, u32 *mode)
 {
-	return zynqmp_pm_invoke_fn(PM_IOCTL, 0, IOCTL_GET_PLL_FRAC_MODE,
-				   clk_id, 0, mode);
+	return zynqmp_pm_invoke_fn(PM_IOCTL, mode, 3, 0, IOCTL_GET_PLL_FRAC_MODE, clk_id);
 }
 EXPORT_SYMBOL_GPL(zynqmp_pm_get_pll_frac_mode);
 
@@ -739,8 +754,7 @@ EXPORT_SYMBOL_GPL(zynqmp_pm_get_pll_frac_mode);
  */
 int zynqmp_pm_set_pll_frac_data(u32 clk_id, u32 data)
 {
-	return zynqmp_pm_invoke_fn(PM_IOCTL, 0, IOCTL_SET_PLL_FRAC_DATA,
-				   clk_id, data, NULL);
+	return zynqmp_pm_invoke_fn(PM_IOCTL, NULL, 4, 0, IOCTL_SET_PLL_FRAC_DATA, clk_id, data);
 }
 EXPORT_SYMBOL_GPL(zynqmp_pm_set_pll_frac_data);
 
@@ -756,8 +770,7 @@ EXPORT_SYMBOL_GPL(zynqmp_pm_set_pll_frac_data);
  */
 int zynqmp_pm_get_pll_frac_data(u32 clk_id, u32 *data)
 {
-	return zynqmp_pm_invoke_fn(PM_IOCTL, 0, IOCTL_GET_PLL_FRAC_DATA,
-				   clk_id, 0, data);
+	return zynqmp_pm_invoke_fn(PM_IOCTL, data, 3, 0, IOCTL_GET_PLL_FRAC_DATA, clk_id);
 }
 EXPORT_SYMBOL_GPL(zynqmp_pm_get_pll_frac_data);
 
@@ -778,9 +791,8 @@ int zynqmp_pm_set_sd_tapdelay(u32 node_id, u32 type, u32 value)
 	u32 mask = (node_id == NODE_SD_0) ? GENMASK(15, 0) : GENMASK(31, 16);
 
 	if (value) {
-		return zynqmp_pm_invoke_fn(PM_IOCTL, node_id,
-					   IOCTL_SET_SD_TAPDELAY,
-					   type, value, NULL);
+		return zynqmp_pm_invoke_fn(PM_IOCTL, NULL, 4, node_id, IOCTL_SET_SD_TAPDELAY, type,
+					   value);
 	}
 
 	/*
@@ -798,7 +810,7 @@ int zynqmp_pm_set_sd_tapdelay(u32 node_id, u32 type, u32 value)
 	 * Use PM_MMIO_READ/PM_MMIO_WRITE to re-implement the missing counter
 	 * part of IOCTL_SET_SD_TAPDELAY which clears SDx_ITAPDLYENA bits.
 	 */
-	return zynqmp_pm_invoke_fn(PM_MMIO_WRITE, reg, mask, 0, 0, NULL);
+	return zynqmp_pm_invoke_fn(PM_MMIO_WRITE, NULL, 2, reg, mask);
 }
 EXPORT_SYMBOL_GPL(zynqmp_pm_set_sd_tapdelay);
 
@@ -814,8 +826,7 @@ EXPORT_SYMBOL_GPL(zynqmp_pm_set_sd_tapdelay);
  */
 int zynqmp_pm_sd_dll_reset(u32 node_id, u32 type)
 {
-	return zynqmp_pm_invoke_fn(PM_IOCTL, node_id, IOCTL_SD_DLL_RESET,
-				   type, 0, NULL);
+	return zynqmp_pm_invoke_fn(PM_IOCTL, NULL, 3, node_id, IOCTL_SD_DLL_RESET, type);
 }
 EXPORT_SYMBOL_GPL(zynqmp_pm_sd_dll_reset);
 
@@ -831,8 +842,7 @@ EXPORT_SYMBOL_GPL(zynqmp_pm_sd_dll_reset);
  */
 int zynqmp_pm_ospi_mux_select(u32 dev_id, u32 select)
 {
-	return zynqmp_pm_invoke_fn(PM_IOCTL, dev_id, IOCTL_OSPI_MUX_SELECT,
-				   select, 0, NULL);
+	return zynqmp_pm_invoke_fn(PM_IOCTL, NULL, 3, dev_id, IOCTL_OSPI_MUX_SELECT, select);
 }
 EXPORT_SYMBOL_GPL(zynqmp_pm_ospi_mux_select);
 
@@ -847,8 +857,7 @@ EXPORT_SYMBOL_GPL(zynqmp_pm_ospi_mux_select);
  */
 int zynqmp_pm_write_ggs(u32 index, u32 value)
 {
-	return zynqmp_pm_invoke_fn(PM_IOCTL, 0, IOCTL_WRITE_GGS,
-				   index, value, NULL);
+	return zynqmp_pm_invoke_fn(PM_IOCTL, NULL, 4, 0, IOCTL_WRITE_GGS, index, value);
 }
 EXPORT_SYMBOL_GPL(zynqmp_pm_write_ggs);
 
@@ -863,8 +872,7 @@ EXPORT_SYMBOL_GPL(zynqmp_pm_write_ggs);
  */
 int zynqmp_pm_read_ggs(u32 index, u32 *value)
 {
-	return zynqmp_pm_invoke_fn(PM_IOCTL, 0, IOCTL_READ_GGS,
-				   index, 0, value);
+	return zynqmp_pm_invoke_fn(PM_IOCTL, value, 3, 0, IOCTL_READ_GGS, index);
 }
 EXPORT_SYMBOL_GPL(zynqmp_pm_read_ggs);
 
@@ -880,8 +888,7 @@ EXPORT_SYMBOL_GPL(zynqmp_pm_read_ggs);
  */
 int zynqmp_pm_write_pggs(u32 index, u32 value)
 {
-	return zynqmp_pm_invoke_fn(PM_IOCTL, 0, IOCTL_WRITE_PGGS, index, value,
-				   NULL);
+	return zynqmp_pm_invoke_fn(PM_IOCTL, NULL, 4, 0, IOCTL_WRITE_PGGS, index, value);
 }
 EXPORT_SYMBOL_GPL(zynqmp_pm_write_pggs);
 
@@ -897,15 +904,13 @@ EXPORT_SYMBOL_GPL(zynqmp_pm_write_pggs);
  */
 int zynqmp_pm_read_pggs(u32 index, u32 *value)
 {
-	return zynqmp_pm_invoke_fn(PM_IOCTL, 0, IOCTL_READ_PGGS, index, 0,
-				   value);
+	return zynqmp_pm_invoke_fn(PM_IOCTL, value, 3, 0, IOCTL_READ_PGGS, index);
 }
 EXPORT_SYMBOL_GPL(zynqmp_pm_read_pggs);
 
 int zynqmp_pm_set_tapdelay_bypass(u32 index, u32 value)
 {
-	return zynqmp_pm_invoke_fn(PM_IOCTL, 0, IOCTL_SET_TAPDELAY_BYPASS,
-				   index, value, NULL);
+	return zynqmp_pm_invoke_fn(PM_IOCTL, NULL, 4, 0, IOCTL_SET_TAPDELAY_BYPASS, index, value);
 }
 EXPORT_SYMBOL_GPL(zynqmp_pm_set_tapdelay_bypass);
 
@@ -920,8 +925,7 @@ EXPORT_SYMBOL_GPL(zynqmp_pm_set_tapdelay_bypass);
  */
 int zynqmp_pm_set_boot_health_status(u32 value)
 {
-	return zynqmp_pm_invoke_fn(PM_IOCTL, 0, IOCTL_SET_BOOT_HEALTH_STATUS,
-				   value, 0, NULL);
+	return zynqmp_pm_invoke_fn(PM_IOCTL, NULL, 3, 0, IOCTL_SET_BOOT_HEALTH_STATUS, value);
 }
 
 /**
@@ -935,8 +939,7 @@ int zynqmp_pm_set_boot_health_status(u32 value)
 int zynqmp_pm_reset_assert(const enum zynqmp_pm_reset reset,
 			   const enum zynqmp_pm_reset_action assert_flag)
 {
-	return zynqmp_pm_invoke_fn(PM_RESET_ASSERT, reset, assert_flag,
-				   0, 0, NULL);
+	return zynqmp_pm_invoke_fn(PM_RESET_ASSERT, NULL, 2, reset, assert_flag);
 }
 EXPORT_SYMBOL_GPL(zynqmp_pm_reset_assert);
 
@@ -955,8 +958,7 @@ int zynqmp_pm_reset_get_status(const enum zynqmp_pm_reset reset, u32 *status)
 	if (!status)
 		return -EINVAL;
 
-	ret = zynqmp_pm_invoke_fn(PM_RESET_GET_STATUS, reset, 0,
-				  0, 0, ret_payload);
+	ret = zynqmp_pm_invoke_fn(PM_RESET_GET_STATUS, ret_payload, 1, reset);
 	*status = ret_payload[1];
 
 	return ret;
@@ -981,9 +983,8 @@ int zynqmp_pm_fpga_load(const u64 address, const u32 size, const u32 flags)
 	u32 ret_payload[PAYLOAD_ARG_CNT];
 	int ret;
 
-	ret = zynqmp_pm_invoke_fn(PM_FPGA_LOAD, lower_32_bits(address),
-				  upper_32_bits(address), size, flags,
-				  ret_payload);
+	ret = zynqmp_pm_invoke_fn(PM_FPGA_LOAD, ret_payload, 4, lower_32_bits(address),
+				  upper_32_bits(address), size, flags);
 	if (ret_payload[0])
 		return -ret_payload[0];
 
@@ -1008,7 +1009,7 @@ int zynqmp_pm_fpga_get_status(u32 *value)
 	if (!value)
 		return -EINVAL;
 
-	ret = zynqmp_pm_invoke_fn(PM_FPGA_GET_STATUS, 0, 0, 0, 0, ret_payload);
+	ret = zynqmp_pm_invoke_fn(PM_FPGA_GET_STATUS, ret_payload, 0);
 	*value = ret_payload[1];
 
 	return ret;
@@ -1036,11 +1037,9 @@ int zynqmp_pm_fpga_get_config_status(u32 *value)
 	lower_addr = lower_32_bits((u64)&buf);
 	upper_addr = upper_32_bits((u64)&buf);
 
-	ret = zynqmp_pm_invoke_fn(PM_FPGA_READ,
-				  XILINX_ZYNQMP_PM_FPGA_CONFIG_STAT_OFFSET,
-				  lower_addr, upper_addr,
-				  XILINX_ZYNQMP_PM_FPGA_READ_CONFIG_REG,
-				  ret_payload);
+	ret = zynqmp_pm_invoke_fn(PM_FPGA_READ, ret_payload, 4,
+				  XILINX_ZYNQMP_PM_FPGA_CONFIG_STAT_OFFSET, lower_addr, upper_addr,
+				  XILINX_ZYNQMP_PM_FPGA_READ_CONFIG_REG);
 
 	*value = ret_payload[1];
 
@@ -1058,7 +1057,7 @@ EXPORT_SYMBOL_GPL(zynqmp_pm_fpga_get_config_status);
  */
 int zynqmp_pm_pinctrl_request(const u32 pin)
 {
-	return zynqmp_pm_invoke_fn(PM_PINCTRL_REQUEST, pin, 0, 0, 0, NULL);
+	return zynqmp_pm_invoke_fn(PM_PINCTRL_REQUEST, NULL, 1, pin);
 }
 EXPORT_SYMBOL_GPL(zynqmp_pm_pinctrl_request);
 
@@ -1072,7 +1071,7 @@ EXPORT_SYMBOL_GPL(zynqmp_pm_pinctrl_request);
  */
 int zynqmp_pm_pinctrl_release(const u32 pin)
 {
-	return zynqmp_pm_invoke_fn(PM_PINCTRL_RELEASE, pin, 0, 0, 0, NULL);
+	return zynqmp_pm_invoke_fn(PM_PINCTRL_RELEASE, NULL, 1, pin);
 }
 EXPORT_SYMBOL_GPL(zynqmp_pm_pinctrl_release);
 
@@ -1093,8 +1092,7 @@ int zynqmp_pm_pinctrl_get_function(const u32 pin, u32 *id)
 	if (!id)
 		return -EINVAL;
 
-	ret = zynqmp_pm_invoke_fn(PM_PINCTRL_GET_FUNCTION, pin, 0,
-				  0, 0, ret_payload);
+	ret = zynqmp_pm_invoke_fn(PM_PINCTRL_GET_FUNCTION, ret_payload, 1, pin);
 	*id = ret_payload[1];
 
 	return ret;
@@ -1112,8 +1110,7 @@ EXPORT_SYMBOL_GPL(zynqmp_pm_pinctrl_get_function);
  */
 int zynqmp_pm_pinctrl_set_function(const u32 pin, const u32 id)
 {
-	return zynqmp_pm_invoke_fn(PM_PINCTRL_SET_FUNCTION, pin, id,
-				   0, 0, NULL);
+	return zynqmp_pm_invoke_fn(PM_PINCTRL_SET_FUNCTION, NULL, 2, pin, id);
 }
 EXPORT_SYMBOL_GPL(zynqmp_pm_pinctrl_set_function);
 
@@ -1136,8 +1133,7 @@ int zynqmp_pm_pinctrl_get_config(const u32 pin, const u32 param,
 	if (!value)
 		return -EINVAL;
 
-	ret = zynqmp_pm_invoke_fn(PM_PINCTRL_CONFIG_PARAM_GET, pin, param,
-				  0, 0, ret_payload);
+	ret = zynqmp_pm_invoke_fn(PM_PINCTRL_CONFIG_PARAM_GET, ret_payload, 2, pin, param);
 	*value = ret_payload[1];
 
 	return ret;
@@ -1166,8 +1162,7 @@ int zynqmp_pm_pinctrl_set_config(const u32 pin, const u32 param,
 			return -EOPNOTSUPP;
 	}
 
-	return zynqmp_pm_invoke_fn(PM_PINCTRL_CONFIG_PARAM_SET, pin,
-				   param, value, 0, NULL);
+	return zynqmp_pm_invoke_fn(PM_PINCTRL_CONFIG_PARAM_SET, NULL, 3, pin, param, value);
 }
 EXPORT_SYMBOL_GPL(zynqmp_pm_pinctrl_set_config);
 
@@ -1185,8 +1180,7 @@ unsigned int zynqmp_pm_bootmode_read(u32 *ps_mode)
 	unsigned int ret;
 	u32 ret_payload[PAYLOAD_ARG_CNT];
 
-	ret = zynqmp_pm_invoke_fn(PM_MMIO_READ, CRL_APB_BOOT_PIN_CTRL, 0,
-				  0, 0, ret_payload);
+	ret = zynqmp_pm_invoke_fn(PM_MMIO_READ, ret_payload, 1, CRL_APB_BOOT_PIN_CTRL);
 
 	*ps_mode = ret_payload[1];
 
@@ -1205,8 +1199,8 @@ EXPORT_SYMBOL_GPL(zynqmp_pm_bootmode_read);
  */
 int zynqmp_pm_bootmode_write(u32 ps_mode)
 {
-	return zynqmp_pm_invoke_fn(PM_MMIO_WRITE, CRL_APB_BOOT_PIN_CTRL,
-				   CRL_APB_BOOTPIN_CTRL_MASK, ps_mode, 0, NULL);
+	return zynqmp_pm_invoke_fn(PM_MMIO_WRITE, NULL, 3, CRL_APB_BOOT_PIN_CTRL,
+				   CRL_APB_BOOTPIN_CTRL_MASK, ps_mode);
 }
 EXPORT_SYMBOL_GPL(zynqmp_pm_bootmode_write);
 
@@ -1221,7 +1215,7 @@ EXPORT_SYMBOL_GPL(zynqmp_pm_bootmode_write);
  */
 int zynqmp_pm_init_finalize(void)
 {
-	return zynqmp_pm_invoke_fn(PM_PM_INIT_FINALIZE, 0, 0, 0, 0, NULL);
+	return zynqmp_pm_invoke_fn(PM_PM_INIT_FINALIZE, NULL, 0);
 }
 EXPORT_SYMBOL_GPL(zynqmp_pm_init_finalize);
 
@@ -1235,7 +1229,7 @@ EXPORT_SYMBOL_GPL(zynqmp_pm_init_finalize);
  */
 int zynqmp_pm_set_suspend_mode(u32 mode)
 {
-	return zynqmp_pm_invoke_fn(PM_SET_SUSPEND_MODE, mode, 0, 0, 0, NULL);
+	return zynqmp_pm_invoke_fn(PM_SET_SUSPEND_MODE, NULL, 1, mode);
 }
 EXPORT_SYMBOL_GPL(zynqmp_pm_set_suspend_mode);
 
@@ -1254,8 +1248,7 @@ EXPORT_SYMBOL_GPL(zynqmp_pm_set_suspend_mode);
 int zynqmp_pm_request_node(const u32 node, const u32 capabilities,
 			   const u32 qos, const enum zynqmp_pm_request_ack ack)
 {
-	return zynqmp_pm_invoke_fn(PM_REQUEST_NODE, node, capabilities,
-				   qos, ack, NULL);
+	return zynqmp_pm_invoke_fn(PM_REQUEST_NODE, NULL, 4, node, capabilities, qos, ack);
 }
 EXPORT_SYMBOL_GPL(zynqmp_pm_request_node);
 
@@ -1271,7 +1264,7 @@ EXPORT_SYMBOL_GPL(zynqmp_pm_request_node);
  */
 int zynqmp_pm_release_node(const u32 node)
 {
-	return zynqmp_pm_invoke_fn(PM_RELEASE_NODE, node, 0, 0, 0, NULL);
+	return zynqmp_pm_invoke_fn(PM_RELEASE_NODE, NULL, 1, node);
 }
 EXPORT_SYMBOL_GPL(zynqmp_pm_release_node);
 
@@ -1290,8 +1283,7 @@ int zynqmp_pm_get_rpu_mode(u32 node_id, enum rpu_oper_mode *rpu_mode)
 	u32 ret_payload[PAYLOAD_ARG_CNT];
 	int ret;
 
-	ret = zynqmp_pm_invoke_fn(PM_IOCTL, node_id,
-				  IOCTL_GET_RPU_OPER_MODE, 0, 0, ret_payload);
+	ret = zynqmp_pm_invoke_fn(PM_IOCTL, ret_payload, 2, node_id, IOCTL_GET_RPU_OPER_MODE);
 
 	/* only set rpu_mode if no error */
 	if (ret == XST_PM_SUCCESS)
@@ -1313,9 +1305,8 @@ EXPORT_SYMBOL_GPL(zynqmp_pm_get_rpu_mode);
  */
 int zynqmp_pm_set_rpu_mode(u32 node_id, enum rpu_oper_mode rpu_mode)
 {
-	return zynqmp_pm_invoke_fn(PM_IOCTL, node_id,
-				   IOCTL_SET_RPU_OPER_MODE, (u32)rpu_mode,
-				   0, NULL);
+	return zynqmp_pm_invoke_fn(PM_IOCTL, NULL, 3, node_id, IOCTL_SET_RPU_OPER_MODE,
+				   (u32)rpu_mode);
 }
 EXPORT_SYMBOL_GPL(zynqmp_pm_set_rpu_mode);
 
@@ -1331,9 +1322,8 @@ EXPORT_SYMBOL_GPL(zynqmp_pm_set_rpu_mode);
  */
 int zynqmp_pm_set_tcm_config(u32 node_id, enum rpu_tcm_comb tcm_mode)
 {
-	return zynqmp_pm_invoke_fn(PM_IOCTL, node_id,
-				   IOCTL_TCM_COMB_CONFIG, (u32)tcm_mode, 0,
-				   NULL);
+	return zynqmp_pm_invoke_fn(PM_IOCTL, NULL, 3, node_id, IOCTL_TCM_COMB_CONFIG,
+				   (u32)tcm_mode);
 }
 EXPORT_SYMBOL_GPL(zynqmp_pm_set_tcm_config);
 
@@ -1348,7 +1338,7 @@ EXPORT_SYMBOL_GPL(zynqmp_pm_set_tcm_config);
 int zynqmp_pm_force_pwrdwn(const u32 node,
 			   const enum zynqmp_pm_request_ack ack)
 {
-	return zynqmp_pm_invoke_fn(PM_FORCE_POWERDOWN, node, ack, 0, 0, NULL);
+	return zynqmp_pm_invoke_fn(PM_FORCE_POWERDOWN, NULL, 2, node, ack);
 }
 EXPORT_SYMBOL_GPL(zynqmp_pm_force_pwrdwn);
 
@@ -1367,8 +1357,8 @@ int zynqmp_pm_request_wake(const u32 node,
 			   const enum zynqmp_pm_request_ack ack)
 {
 	/* set_addr flag is encoded into 1st bit of address */
-	return zynqmp_pm_invoke_fn(PM_REQUEST_WAKEUP, node, address | set_addr,
-				   address >> 32, ack, NULL);
+	return zynqmp_pm_invoke_fn(PM_REQUEST_WAKEUP, NULL, 4, node, address | set_addr,
+				   address >> 32, ack);
 }
 EXPORT_SYMBOL_GPL(zynqmp_pm_request_wake);
 
@@ -1388,8 +1378,7 @@ int zynqmp_pm_set_requirement(const u32 node, const u32 capabilities,
 			      const u32 qos,
 			      const enum zynqmp_pm_request_ack ack)
 {
-	return zynqmp_pm_invoke_fn(PM_SET_REQUIREMENT, node, capabilities,
-				   qos, ack, NULL);
+	return zynqmp_pm_invoke_fn(PM_SET_REQUIREMENT, NULL, 4, node, capabilities, qos, ack);
 }
 EXPORT_SYMBOL_GPL(zynqmp_pm_set_requirement);
 
@@ -1404,9 +1393,8 @@ EXPORT_SYMBOL_GPL(zynqmp_pm_set_requirement);
  */
 int zynqmp_pm_load_pdi(const u32 src, const u64 address)
 {
-	return zynqmp_pm_invoke_fn(PM_LOAD_PDI, src,
-				   lower_32_bits(address),
-				   upper_32_bits(address), 0, NULL);
+	return zynqmp_pm_invoke_fn(PM_LOAD_PDI, NULL, 3, src, lower_32_bits(address),
+				   upper_32_bits(address));
 }
 EXPORT_SYMBOL_GPL(zynqmp_pm_load_pdi);
 
@@ -1426,9 +1414,8 @@ int zynqmp_pm_aes_engine(const u64 address, u32 *out)
 	if (!out)
 		return -EINVAL;
 
-	ret = zynqmp_pm_invoke_fn(PM_SECURE_AES, upper_32_bits(address),
-				  lower_32_bits(address),
-				  0, 0, ret_payload);
+	ret = zynqmp_pm_invoke_fn(PM_SECURE_AES, ret_payload, 2, upper_32_bits(address),
+				  lower_32_bits(address));
 	*out = ret_payload[1];
 
 	return ret;
@@ -1456,8 +1443,7 @@ int zynqmp_pm_sha_hash(const u64 address, const u32 size, const u32 flags)
 	u32 lower_addr = lower_32_bits(address);
 	u32 upper_addr = upper_32_bits(address);
 
-	return zynqmp_pm_invoke_fn(PM_SECURE_SHA, upper_addr, lower_addr,
-				   size, flags, NULL);
+	return zynqmp_pm_invoke_fn(PM_SECURE_SHA, NULL, 4, upper_addr, lower_addr, size, flags);
 }
 EXPORT_SYMBOL_GPL(zynqmp_pm_sha_hash);
 
@@ -1479,8 +1465,7 @@ EXPORT_SYMBOL_GPL(zynqmp_pm_sha_hash);
 int zynqmp_pm_register_notifier(const u32 node, const u32 event,
 				const u32 wake, const u32 enable)
 {
-	return zynqmp_pm_invoke_fn(PM_REGISTER_NOTIFIER, node, event,
-				   wake, enable, NULL);
+	return zynqmp_pm_invoke_fn(PM_REGISTER_NOTIFIER, NULL, 4, node, event, wake, enable);
 }
 EXPORT_SYMBOL_GPL(zynqmp_pm_register_notifier);
 
@@ -1493,8 +1478,7 @@ EXPORT_SYMBOL_GPL(zynqmp_pm_register_notifier);
  */
 int zynqmp_pm_system_shutdown(const u32 type, const u32 subtype)
 {
-	return zynqmp_pm_invoke_fn(PM_SYSTEM_SHUTDOWN, type, subtype,
-				   0, 0, NULL);
+	return zynqmp_pm_invoke_fn(PM_SYSTEM_SHUTDOWN, NULL, 2, type, subtype);
 }
 
 /**
@@ -1506,8 +1490,7 @@ int zynqmp_pm_system_shutdown(const u32 type, const u32 subtype)
  */
 int zynqmp_pm_set_feature_config(enum pm_feature_config_id id, u32 value)
 {
-	return zynqmp_pm_invoke_fn(PM_IOCTL, 0, IOCTL_SET_FEATURE_CONFIG,
-				   id, value, NULL);
+	return zynqmp_pm_invoke_fn(PM_IOCTL, NULL, 4, 0, IOCTL_SET_FEATURE_CONFIG, id, value);
 }
 
 /**
@@ -1520,8 +1503,7 @@ int zynqmp_pm_set_feature_config(enum pm_feature_config_id id, u32 value)
 int zynqmp_pm_get_feature_config(enum pm_feature_config_id id,
 				 u32 *payload)
 {
-	return zynqmp_pm_invoke_fn(PM_IOCTL, 0, IOCTL_GET_FEATURE_CONFIG,
-				   id, 0, payload);
+	return zynqmp_pm_invoke_fn(PM_IOCTL, payload, 3, 0, IOCTL_GET_FEATURE_CONFIG, id);
 }
 
 /**
@@ -1534,8 +1516,7 @@ int zynqmp_pm_get_feature_config(enum pm_feature_config_id id,
  */
 int zynqmp_pm_set_sd_config(u32 node, enum pm_sd_config_type config, u32 value)
 {
-	return zynqmp_pm_invoke_fn(PM_IOCTL, node, IOCTL_SET_SD_CONFIG,
-				   config, value, NULL);
+	return zynqmp_pm_invoke_fn(PM_IOCTL, NULL, 4, node, IOCTL_SET_SD_CONFIG, config, value);
 }
 EXPORT_SYMBOL_GPL(zynqmp_pm_set_sd_config);
 
@@ -1550,8 +1531,7 @@ EXPORT_SYMBOL_GPL(zynqmp_pm_set_sd_config);
 int zynqmp_pm_set_gem_config(u32 node, enum pm_gem_config_type config,
 			     u32 value)
 {
-	return zynqmp_pm_invoke_fn(PM_IOCTL, node, IOCTL_SET_GEM_CONFIG,
-				   config, value, NULL);
+	return zynqmp_pm_invoke_fn(PM_IOCTL, NULL, 4, node, IOCTL_SET_GEM_CONFIG, config, value);
 }
 EXPORT_SYMBOL_GPL(zynqmp_pm_set_gem_config);
 
diff --git a/drivers/soc/xilinx/xlnx_event_manager.c b/drivers/soc/xilinx/xlnx_event_manager.c
index 86a048a10a13..adb875f89829 100644
--- a/drivers/soc/xilinx/xlnx_event_manager.c
+++ b/drivers/soc/xilinx/xlnx_event_manager.c
@@ -483,7 +483,7 @@ static void xlnx_call_notify_cb_handler(const u32 *payload)
 
 static void xlnx_get_event_callback_data(u32 *buf)
 {
-	zynqmp_pm_invoke_fn(GET_CALLBACK_DATA, 0, 0, 0, 0, buf);
+	zynqmp_pm_invoke_fn(GET_CALLBACK_DATA, buf, 0);
 }
 
 static irqreturn_t xlnx_event_handler(int irq, void *dev_id)
diff --git a/drivers/soc/xilinx/zynqmp_power.c b/drivers/soc/xilinx/zynqmp_power.c
index c2c819701eec..0de0acf80ef8 100644
--- a/drivers/soc/xilinx/zynqmp_power.c
+++ b/drivers/soc/xilinx/zynqmp_power.c
@@ -51,7 +51,7 @@ static enum pm_suspend_mode suspend_mode = PM_SUSPEND_MODE_STD;
 
 static void zynqmp_pm_get_callback_data(u32 *buf)
 {
-	zynqmp_pm_invoke_fn(GET_CALLBACK_DATA, 0, 0, 0, 0, buf);
+	zynqmp_pm_invoke_fn(GET_CALLBACK_DATA, buf, 0);
 }
 
 static void suspend_event_callback(const u32 *payload, void *data)
diff --git a/include/linux/firmware/xlnx-zynqmp.h b/include/linux/firmware/xlnx-zynqmp.h
index d1ea3898564c..41190bbed8ed 100644
--- a/include/linux/firmware/xlnx-zynqmp.h
+++ b/include/linux/firmware/xlnx-zynqmp.h
@@ -509,8 +509,7 @@ struct zynqmp_pm_query_data {
 	u32 arg3;
 };
 
-int zynqmp_pm_invoke_fn(u32 pm_api_id, u32 arg0, u32 arg1,
-			u32 arg2, u32 arg3, u32 *ret_payload);
+int zynqmp_pm_invoke_fn(u32 pm_api_id, u32 *ret_payload, u32 num_args, ...);
 
 #if IS_REACHABLE(CONFIG_ZYNQMP_FIRMWARE)
 int zynqmp_pm_get_api_version(u32 *version);
-- 
cgit v1.2.3


From f689a0ca45fcdf4139727a3a02a49efbb1902306 Mon Sep 17 00:00:00 2001
From: Jay Buddhabhatti <jay.buddhabhatti@amd.com>
Date: Wed, 29 Nov 2023 03:27:10 -0800
Subject: firmware: xilinx: Expand feature check to support all PLM modules

To support feature check for all modules, append the module id of the API
that is being checked to the feature check API so it could be routed to
the target module for processing.

There is no need to check compatible string because the board information
is taken via firmware interface.

Co-developed-by: Saeed Nowshadi <saeed.nowshadi@amd.com>
Signed-off-by: Saeed Nowshadi <saeed.nowshadi@amd.com>
Signed-off-by: Jay Buddhabhatti <jay.buddhabhatti@amd.com>
Link: https://lore.kernel.org/r/20231129112713.22718-3-jay.buddhabhatti@amd.com
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/firmware/xilinx/zynqmp.c     | 41 +++++++++++++++++++++---------------
 include/linux/firmware/xlnx-zynqmp.h | 11 ++++++++++
 2 files changed, 35 insertions(+), 17 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/firmware/xilinx/zynqmp.c b/drivers/firmware/xilinx/zynqmp.c
index 2a7fd3fbac8c..ec77aefee17e 100644
--- a/drivers/firmware/xilinx/zynqmp.c
+++ b/drivers/firmware/xilinx/zynqmp.c
@@ -199,9 +199,29 @@ static int __do_feature_check_call(const u32 api_id, u32 *ret_payload)
 {
 	int ret;
 	u64 smc_arg[2];
+	u32 module_id;
+	u32 feature_check_api_id;
 
-	smc_arg[0] = PM_SIP_SVC | PM_FEATURE_CHECK;
-	smc_arg[1] = api_id;
+	module_id = FIELD_GET(MODULE_ID_MASK, api_id);
+
+	/*
+	 * Feature check of APIs belonging to PM, XSEM, and TF-A are handled by calling
+	 * PM_FEATURE_CHECK API. For other modules, call PM_API_FEATURES API.
+	 */
+	if (module_id == PM_MODULE_ID || module_id == XSEM_MODULE_ID || module_id == TF_A_MODULE_ID)
+		feature_check_api_id = PM_FEATURE_CHECK;
+	else
+		feature_check_api_id = PM_API_FEATURES;
+
+	/*
+	 * Feature check of TF-A APIs is done in the TF-A layer and it expects for
+	 * MODULE_ID_MASK bits of SMC's arg[0] to be the same as PM_MODULE_ID.
+	 */
+	if (module_id == TF_A_MODULE_ID)
+		module_id = PM_MODULE_ID;
+
+	smc_arg[0] = PM_SIP_SVC | FIELD_PREP(MODULE_ID_MASK, module_id) | feature_check_api_id;
+	smc_arg[1] = (api_id & API_ID_MASK);
 
 	ret = do_fw_call(ret_payload, 2, smc_arg[0], smc_arg[1]);
 	if (ret)
@@ -1904,22 +1924,9 @@ static int zynqmp_firmware_probe(struct platform_device *pdev)
 	if (ret)
 		return ret;
 
-	np = of_find_compatible_node(NULL, NULL, "xlnx,zynqmp");
-	if (!np) {
-		np = of_find_compatible_node(NULL, NULL, "xlnx,versal");
-		if (!np)
-			return 0;
-
+	ret = do_feature_check_call(PM_FEATURE_CHECK);
+	if (ret >= 0 && ((ret & FIRMWARE_VERSION_MASK) >= PM_API_VERSION_1))
 		feature_check_enabled = true;
-	}
-
-	if (!feature_check_enabled) {
-		ret = do_feature_check_call(PM_FEATURE_CHECK);
-		if (ret >= 0)
-			feature_check_enabled = true;
-	}
-
-	of_node_put(np);
 
 	devinfo = devm_kzalloc(dev, sizeof(*devinfo), GFP_KERNEL);
 	if (!devinfo)
diff --git a/include/linux/firmware/xlnx-zynqmp.h b/include/linux/firmware/xlnx-zynqmp.h
index 41190bbed8ed..e9a7fece5efe 100644
--- a/include/linux/firmware/xlnx-zynqmp.h
+++ b/include/linux/firmware/xlnx-zynqmp.h
@@ -32,6 +32,7 @@
 #define PM_SIP_SVC			0xC2000000
 
 /* PM API versions */
+#define PM_API_VERSION_1	1
 #define PM_API_VERSION_2	2
 
 #define PM_PINCTRL_PARAM_SET_VERSION	2
@@ -47,6 +48,9 @@
 #define FAMILY_CODE_MASK	GENMASK(27, 21)
 #define SUB_FAMILY_CODE_MASK	GENMASK(20, 19)
 
+#define API_ID_MASK		GENMASK(7, 0)
+#define MODULE_ID_MASK		GENMASK(11, 8)
+
 /* ATF only commands */
 #define TF_A_PM_REGISTER_SGI		0xa04
 #define PM_GET_TRUSTZONE_VERSION	0xa03
@@ -112,6 +116,12 @@
 #define XPM_EVENT_ERROR_MASK_NOC_NCR		BIT(13)
 #define XPM_EVENT_ERROR_MASK_NOC_CR		BIT(12)
 
+enum pm_module_id {
+	PM_MODULE_ID = 0x0,
+	XSEM_MODULE_ID = 0x3,
+	TF_A_MODULE_ID = 0xa,
+};
+
 enum pm_api_cb_id {
 	PM_INIT_SUSPEND_CB = 30,
 	PM_ACKNOWLEDGE_CB = 31,
@@ -119,6 +129,7 @@ enum pm_api_cb_id {
 };
 
 enum pm_api_id {
+	PM_API_FEATURES = 0,
 	PM_GET_API_VERSION = 1,
 	PM_REGISTER_NOTIFIER = 5,
 	PM_FORCE_POWERDOWN = 8,
-- 
cgit v1.2.3


From 8c016c807a90535432543204dbbb032e4a709009 Mon Sep 17 00:00:00 2001
From: Jay Buddhabhatti <jay.buddhabhatti@amd.com>
Date: Wed, 29 Nov 2023 03:27:12 -0800
Subject: drivers: soc: xilinx: Fix error message on SGI registration failure

Failure to register SGI for firmware event notification is non-fatal error
when feature is not supported by other modules such as Xen and TF-A. Add
_info level log message for such special case.

Also add XST_PM_INVALID_VERSION error code and map it to -EOPNOSUPP Linux
kernel error code. If feature is not supported or EEMI API version is
mismatch, firmware can return XST_PM_INVALID_VERSION = 4 or
XST_PM_NO_FEATURE = 19 error code.

Co-developed-by: Tanmay Shah <tanmay.shah@amd.com>
Signed-off-by: Tanmay Shah <tanmay.shah@amd.com>
Signed-off-by: Jay Buddhabhatti <jay.buddhabhatti@amd.com>
Link: https://lore.kernel.org/r/20231129112713.22718-5-jay.buddhabhatti@amd.com
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/firmware/xilinx/zynqmp.c        | 4 +++-
 drivers/soc/xilinx/xlnx_event_manager.c | 6 +++++-
 include/linux/firmware/xlnx-zynqmp.h    | 1 +
 3 files changed, 9 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/firmware/xilinx/zynqmp.c b/drivers/firmware/xilinx/zynqmp.c
index 533679a07b57..077b49e440b5 100644
--- a/drivers/firmware/xilinx/zynqmp.c
+++ b/drivers/firmware/xilinx/zynqmp.c
@@ -92,6 +92,8 @@ static int zynqmp_pm_ret_code(u32 ret_status)
 		return 0;
 	case XST_PM_NO_FEATURE:
 		return -ENOTSUPP;
+	case XST_PM_INVALID_VERSION:
+		return -EOPNOTSUPP;
 	case XST_PM_NO_ACCESS:
 		return -EACCES;
 	case XST_PM_ABORT_SUSPEND:
@@ -396,7 +398,7 @@ int zynqmp_pm_register_sgi(u32 sgi_num, u32 reset)
 	int ret;
 
 	ret = zynqmp_pm_invoke_fn(TF_A_PM_REGISTER_SGI, NULL, 2, sgi_num, reset);
-	if (!ret)
+	if (ret != -EOPNOTSUPP && !ret)
 		return ret;
 
 	/* try old implementation as fallback strategy if above fails */
diff --git a/drivers/soc/xilinx/xlnx_event_manager.c b/drivers/soc/xilinx/xlnx_event_manager.c
index adb875f89829..27a8d89a0a09 100644
--- a/drivers/soc/xilinx/xlnx_event_manager.c
+++ b/drivers/soc/xilinx/xlnx_event_manager.c
@@ -653,7 +653,11 @@ static int xlnx_event_manager_probe(struct platform_device *pdev)
 
 	ret = zynqmp_pm_register_sgi(sgi_num, 0);
 	if (ret) {
-		dev_err(&pdev->dev, "SGI %d Registration over TF-A failed with %d\n", sgi_num, ret);
+		if (ret == -EOPNOTSUPP)
+			dev_err(&pdev->dev, "SGI registration not supported by TF-A or Xen\n");
+		else
+			dev_err(&pdev->dev, "SGI %d registration failed, err %d\n", sgi_num, ret);
+
 		xlnx_event_cleanup_sgi(pdev);
 		return ret;
 	}
diff --git a/include/linux/firmware/xlnx-zynqmp.h b/include/linux/firmware/xlnx-zynqmp.h
index e9a7fece5efe..ec1800c86006 100644
--- a/include/linux/firmware/xlnx-zynqmp.h
+++ b/include/linux/firmware/xlnx-zynqmp.h
@@ -172,6 +172,7 @@ enum pm_api_id {
 /* PMU-FW return status codes */
 enum pm_ret_status {
 	XST_PM_SUCCESS = 0,
+	XST_PM_INVALID_VERSION = 4,
 	XST_PM_NO_FEATURE = 19,
 	XST_PM_INTERNAL = 2000,
 	XST_PM_CONFLICT = 2001,
-- 
cgit v1.2.3


From 5dac2a98f6542ae1ce78b702374ea4be3f5ee07d Mon Sep 17 00:00:00 2001
From: Jay Buddhabhatti <jay.buddhabhatti@amd.com>
Date: Wed, 29 Nov 2023 03:27:13 -0800
Subject: firmware: zynqmp: Add support to handle IPI CRC failure

Added new PM error code XST_PM_INVALID_CRC to handle CRC validation failure
during IPI communication.

Co-developed-by: Naman Trivedi Manojbhai <naman.trivedimanojbhai@amd.com>
Signed-off-by: Naman Trivedi Manojbhai <naman.trivedimanojbhai@amd.com>
Signed-off-by: Jay Buddhabhatti <jay.buddhabhatti@amd.com>
Link: https://lore.kernel.org/r/20231129112713.22718-6-jay.buddhabhatti@amd.com
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/firmware/xilinx/zynqmp.c     | 1 +
 include/linux/firmware/xlnx-zynqmp.h | 1 +
 2 files changed, 2 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/firmware/xilinx/zynqmp.c b/drivers/firmware/xilinx/zynqmp.c
index 077b49e440b5..caa0952bfc42 100644
--- a/drivers/firmware/xilinx/zynqmp.c
+++ b/drivers/firmware/xilinx/zynqmp.c
@@ -103,6 +103,7 @@ static int zynqmp_pm_ret_code(u32 ret_status)
 	case XST_PM_INTERNAL:
 	case XST_PM_CONFLICT:
 	case XST_PM_INVALID_NODE:
+	case XST_PM_INVALID_CRC:
 	default:
 		return -EINVAL;
 	}
diff --git a/include/linux/firmware/xlnx-zynqmp.h b/include/linux/firmware/xlnx-zynqmp.h
index ec1800c86006..6b48294f3c92 100644
--- a/include/linux/firmware/xlnx-zynqmp.h
+++ b/include/linux/firmware/xlnx-zynqmp.h
@@ -174,6 +174,7 @@ enum pm_ret_status {
 	XST_PM_SUCCESS = 0,
 	XST_PM_INVALID_VERSION = 4,
 	XST_PM_NO_FEATURE = 19,
+	XST_PM_INVALID_CRC = 301,
 	XST_PM_INTERNAL = 2000,
 	XST_PM_CONFLICT = 2001,
 	XST_PM_NO_ACCESS = 2002,
-- 
cgit v1.2.3


From 5ec42bf04d72fd6d0a6855810cc779e0ee31dfd7 Mon Sep 17 00:00:00 2001
From: Pierre-Louis Bossart <pierre-louis.bossart@linux.intel.com>
Date: Mon, 4 Dec 2023 15:27:06 -0600
Subject: PCI: add INTEL_HDA_ARL to pci_ids.h
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The PCI ID insertion follows the increasing order in the table, but
this hardware follows MTL (MeteorLake).

Signed-off-by: Pierre-Louis Bossart <pierre-louis.bossart@linux.intel.com>
Reviewed-by: Péter Ujfalusi <peter.ujfalusi@linux.intel.com>
Reviewed-by: Kai Vehmanen <kai.vehmanen@linux.intel.com>
Acked-by: Mark Brown <broonie@kernel.org>
Link: https://lore.kernel.org/r/20231204212710.185976-2-pierre-louis.bossart@linux.intel.com
Signed-off-by: Takashi Iwai <tiwai@suse.de>
---
 include/linux/pci_ids.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/pci_ids.h b/include/linux/pci_ids.h
index 275799b5f535..97cc0baad0f4 100644
--- a/include/linux/pci_ids.h
+++ b/include/linux/pci_ids.h
@@ -3065,6 +3065,7 @@
 #define PCI_DEVICE_ID_INTEL_82443GX_0	0x71a0
 #define PCI_DEVICE_ID_INTEL_82443GX_2	0x71a2
 #define PCI_DEVICE_ID_INTEL_82372FB_1	0x7601
+#define PCI_DEVICE_ID_INTEL_HDA_ARL	0x7728
 #define PCI_DEVICE_ID_INTEL_HDA_RPL_S	0x7a50
 #define PCI_DEVICE_ID_INTEL_HDA_ADL_S	0x7ad0
 #define PCI_DEVICE_ID_INTEL_HDA_MTL	0x7e28
-- 
cgit v1.2.3


From fa422b353d212373fb2b2857a5ea5a6fa4876f9c Mon Sep 17 00:00:00 2001
From: Shiyang Ruan <ruansy.fnst@fujitsu.com>
Date: Mon, 23 Oct 2023 15:20:46 +0800
Subject: mm, pmem, xfs: Introduce MF_MEM_PRE_REMOVE for unbind

Now, if we suddenly remove a PMEM device(by calling unbind) which
contains FSDAX while programs are still accessing data in this device,
e.g.:
```
 $FSSTRESS_PROG -d $SCRATCH_MNT -n 99999 -p 4 &
 # $FSX_PROG -N 1000000 -o 8192 -l 500000 $SCRATCH_MNT/t001 &
 echo "pfn1.1" > /sys/bus/nd/drivers/nd_pmem/unbind
```
it could come into an unacceptable state:
  1. device has gone but mount point still exists, and umount will fail
       with "target is busy"
  2. programs will hang and cannot be killed
  3. may crash with NULL pointer dereference

To fix this, we introduce a MF_MEM_PRE_REMOVE flag to let it know that we
are going to remove the whole device, and make sure all related processes
could be notified so that they could end up gracefully.

This patch is inspired by Dan's "mm, dax, pmem: Introduce
dev_pagemap_failure()"[1].  With the help of dax_holder and
->notify_failure() mechanism, the pmem driver is able to ask filesystem
on it to unmap all files in use, and notify processes who are using
those files.

Call trace:
trigger unbind
 -> unbind_store()
  -> ... (skip)
   -> devres_release_all()
    -> kill_dax()
     -> dax_holder_notify_failure(dax_dev, 0, U64_MAX, MF_MEM_PRE_REMOVE)
      -> xfs_dax_notify_failure()
      `-> freeze_super()             // freeze (kernel call)
      `-> do xfs rmap
      ` -> mf_dax_kill_procs()
      `  -> collect_procs_fsdax()    // all associated processes
      `  -> unmap_and_kill()
      ` -> invalidate_inode_pages2_range() // drop file's cache
      `-> thaw_super()               // thaw (both kernel & user call)

Introduce MF_MEM_PRE_REMOVE to let filesystem know this is a remove
event.  Use the exclusive freeze/thaw[2] to lock the filesystem to prevent
new dax mapping from being created.  Do not shutdown filesystem directly
if configuration is not supported, or if failure range includes metadata
area.  Make sure all files and processes(not only the current progress)
are handled correctly.  Also drop the cache of associated files before
pmem is removed.

[1]: https://lore.kernel.org/linux-mm/161604050314.1463742.14151665140035795571.stgit@dwillia2-desk3.amr.corp.intel.com/
[2]: https://lore.kernel.org/linux-xfs/169116275623.3187159.16862410128731457358.stg-ugh@frogsfrogsfrogs/

Signed-off-by: Shiyang Ruan <ruansy.fnst@fujitsu.com>
Reviewed-by: Darrick J. Wong <djwong@kernel.org>
Reviewed-by: Dan Williams <dan.j.williams@intel.com>
Signed-off-by: Chandan Babu R <chandanbabu@kernel.org>
---
 drivers/dax/super.c         |   3 +-
 fs/xfs/xfs_notify_failure.c | 108 +++++++++++++++++++++++++++++++++++++++++---
 include/linux/mm.h          |   1 +
 mm/memory-failure.c         |  21 +++++++--
 4 files changed, 122 insertions(+), 11 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/dax/super.c b/drivers/dax/super.c
index 0da9232ea175..f4b635526345 100644
--- a/drivers/dax/super.c
+++ b/drivers/dax/super.c
@@ -326,7 +326,8 @@ void kill_dax(struct dax_device *dax_dev)
 		return;
 
 	if (dax_dev->holder_data != NULL)
-		dax_holder_notify_failure(dax_dev, 0, U64_MAX, 0);
+		dax_holder_notify_failure(dax_dev, 0, U64_MAX,
+				MF_MEM_PRE_REMOVE);
 
 	clear_bit(DAXDEV_ALIVE, &dax_dev->flags);
 	synchronize_srcu(&dax_srcu);
diff --git a/fs/xfs/xfs_notify_failure.c b/fs/xfs/xfs_notify_failure.c
index a7daa522e00f..fa50e5308292 100644
--- a/fs/xfs/xfs_notify_failure.c
+++ b/fs/xfs/xfs_notify_failure.c
@@ -22,6 +22,7 @@
 
 #include <linux/mm.h>
 #include <linux/dax.h>
+#include <linux/fs.h>
 
 struct xfs_failure_info {
 	xfs_agblock_t		startblock;
@@ -73,10 +74,16 @@ xfs_dax_failure_fn(
 	struct xfs_mount		*mp = cur->bc_mp;
 	struct xfs_inode		*ip;
 	struct xfs_failure_info		*notify = data;
+	struct address_space		*mapping;
+	pgoff_t				pgoff;
+	unsigned long			pgcnt;
 	int				error = 0;
 
 	if (XFS_RMAP_NON_INODE_OWNER(rec->rm_owner) ||
 	    (rec->rm_flags & (XFS_RMAP_ATTR_FORK | XFS_RMAP_BMBT_BLOCK))) {
+		/* Continue the query because this isn't a failure. */
+		if (notify->mf_flags & MF_MEM_PRE_REMOVE)
+			return 0;
 		notify->want_shutdown = true;
 		return 0;
 	}
@@ -92,14 +99,60 @@ xfs_dax_failure_fn(
 		return 0;
 	}
 
-	error = mf_dax_kill_procs(VFS_I(ip)->i_mapping,
-				  xfs_failure_pgoff(mp, rec, notify),
-				  xfs_failure_pgcnt(mp, rec, notify),
-				  notify->mf_flags);
+	mapping = VFS_I(ip)->i_mapping;
+	pgoff = xfs_failure_pgoff(mp, rec, notify);
+	pgcnt = xfs_failure_pgcnt(mp, rec, notify);
+
+	/* Continue the rmap query if the inode isn't a dax file. */
+	if (dax_mapping(mapping))
+		error = mf_dax_kill_procs(mapping, pgoff, pgcnt,
+					  notify->mf_flags);
+
+	/* Invalidate the cache in dax pages. */
+	if (notify->mf_flags & MF_MEM_PRE_REMOVE)
+		invalidate_inode_pages2_range(mapping, pgoff,
+					      pgoff + pgcnt - 1);
+
 	xfs_irele(ip);
 	return error;
 }
 
+static int
+xfs_dax_notify_failure_freeze(
+	struct xfs_mount	*mp)
+{
+	struct super_block	*sb = mp->m_super;
+	int			error;
+
+	error = freeze_super(sb, FREEZE_HOLDER_KERNEL);
+	if (error)
+		xfs_emerg(mp, "already frozen by kernel, err=%d", error);
+
+	return error;
+}
+
+static void
+xfs_dax_notify_failure_thaw(
+	struct xfs_mount	*mp,
+	bool			kernel_frozen)
+{
+	struct super_block	*sb = mp->m_super;
+	int			error;
+
+	if (kernel_frozen) {
+		error = thaw_super(sb, FREEZE_HOLDER_KERNEL);
+		if (error)
+			xfs_emerg(mp, "still frozen after notify failure, err=%d",
+				error);
+	}
+
+	/*
+	 * Also thaw userspace call anyway because the device is about to be
+	 * removed immediately.
+	 */
+	thaw_super(sb, FREEZE_HOLDER_USERSPACE);
+}
+
 static int
 xfs_dax_notify_ddev_failure(
 	struct xfs_mount	*mp,
@@ -112,15 +165,29 @@ xfs_dax_notify_ddev_failure(
 	struct xfs_btree_cur	*cur = NULL;
 	struct xfs_buf		*agf_bp = NULL;
 	int			error = 0;
+	bool			kernel_frozen = false;
 	xfs_fsblock_t		fsbno = XFS_DADDR_TO_FSB(mp, daddr);
 	xfs_agnumber_t		agno = XFS_FSB_TO_AGNO(mp, fsbno);
 	xfs_fsblock_t		end_fsbno = XFS_DADDR_TO_FSB(mp,
 							     daddr + bblen - 1);
 	xfs_agnumber_t		end_agno = XFS_FSB_TO_AGNO(mp, end_fsbno);
 
+	if (mf_flags & MF_MEM_PRE_REMOVE) {
+		xfs_info(mp, "Device is about to be removed!");
+		/*
+		 * Freeze fs to prevent new mappings from being created.
+		 * - Keep going on if others already hold the kernel forzen.
+		 * - Keep going on if other errors too because this device is
+		 *   starting to fail.
+		 * - If kernel frozen state is hold successfully here, thaw it
+		 *   here as well at the end.
+		 */
+		kernel_frozen = xfs_dax_notify_failure_freeze(mp) == 0;
+	}
+
 	error = xfs_trans_alloc_empty(mp, &tp);
 	if (error)
-		return error;
+		goto out;
 
 	for (; agno <= end_agno; agno++) {
 		struct xfs_rmap_irec	ri_low = { };
@@ -165,11 +232,26 @@ xfs_dax_notify_ddev_failure(
 	}
 
 	xfs_trans_cancel(tp);
-	if (error || notify.want_shutdown) {
+
+	/*
+	 * Shutdown fs from a force umount in pre-remove case which won't fail,
+	 * so errors can be ignored.  Otherwise, shutdown the filesystem with
+	 * CORRUPT flag if error occured or notify.want_shutdown was set during
+	 * RMAP querying.
+	 */
+	if (mf_flags & MF_MEM_PRE_REMOVE)
+		xfs_force_shutdown(mp, SHUTDOWN_FORCE_UMOUNT);
+	else if (error || notify.want_shutdown) {
 		xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_ONDISK);
 		if (!error)
 			error = -EFSCORRUPTED;
 	}
+
+out:
+	/* Thaw the fs if it has been frozen before. */
+	if (mf_flags & MF_MEM_PRE_REMOVE)
+		xfs_dax_notify_failure_thaw(mp, kernel_frozen);
+
 	return error;
 }
 
@@ -197,6 +279,14 @@ xfs_dax_notify_failure(
 
 	if (mp->m_logdev_targp && mp->m_logdev_targp->bt_daxdev == dax_dev &&
 	    mp->m_logdev_targp != mp->m_ddev_targp) {
+		/*
+		 * In the pre-remove case the failure notification is attempting
+		 * to trigger a force unmount.  The expectation is that the
+		 * device is still present, but its removal is in progress and
+		 * can not be cancelled, proceed with accessing the log device.
+		 */
+		if (mf_flags & MF_MEM_PRE_REMOVE)
+			return 0;
 		xfs_err(mp, "ondisk log corrupt, shutting down fs!");
 		xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_ONDISK);
 		return -EFSCORRUPTED;
@@ -210,6 +300,12 @@ xfs_dax_notify_failure(
 	ddev_start = mp->m_ddev_targp->bt_dax_part_off;
 	ddev_end = ddev_start + bdev_nr_bytes(mp->m_ddev_targp->bt_bdev) - 1;
 
+	/* Notify failure on the whole device. */
+	if (offset == 0 && len == U64_MAX) {
+		offset = ddev_start;
+		len = bdev_nr_bytes(mp->m_ddev_targp->bt_bdev);
+	}
+
 	/* Ignore the range out of filesystem area */
 	if (offset + len - 1 < ddev_start)
 		return -ENXIO;
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 418d26608ece..caf13e94260e 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -3904,6 +3904,7 @@ enum mf_flags {
 	MF_UNPOISON = 1 << 4,
 	MF_SW_SIMULATED = 1 << 5,
 	MF_NO_RETRY = 1 << 6,
+	MF_MEM_PRE_REMOVE = 1 << 7,
 };
 int mf_dax_kill_procs(struct address_space *mapping, pgoff_t index,
 		      unsigned long count, int mf_flags);
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index 660c21859118..cff3bda60691 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -679,7 +679,7 @@ static void add_to_kill_fsdax(struct task_struct *tsk, struct page *p,
  */
 static void collect_procs_fsdax(struct page *page,
 		struct address_space *mapping, pgoff_t pgoff,
-		struct list_head *to_kill)
+		struct list_head *to_kill, bool pre_remove)
 {
 	struct vm_area_struct *vma;
 	struct task_struct *tsk;
@@ -687,8 +687,15 @@ static void collect_procs_fsdax(struct page *page,
 	i_mmap_lock_read(mapping);
 	rcu_read_lock();
 	for_each_process(tsk) {
-		struct task_struct *t = task_early_kill(tsk, true);
+		struct task_struct *t = tsk;
 
+		/*
+		 * Search for all tasks while MF_MEM_PRE_REMOVE is set, because
+		 * the current may not be the one accessing the fsdax page.
+		 * Otherwise, search for the current task.
+		 */
+		if (!pre_remove)
+			t = task_early_kill(tsk, true);
 		if (!t)
 			continue;
 		vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) {
@@ -1795,6 +1802,7 @@ int mf_dax_kill_procs(struct address_space *mapping, pgoff_t index,
 	dax_entry_t cookie;
 	struct page *page;
 	size_t end = index + count;
+	bool pre_remove = mf_flags & MF_MEM_PRE_REMOVE;
 
 	mf_flags |= MF_ACTION_REQUIRED | MF_MUST_KILL;
 
@@ -1806,9 +1814,14 @@ int mf_dax_kill_procs(struct address_space *mapping, pgoff_t index,
 		if (!page)
 			goto unlock;
 
-		SetPageHWPoison(page);
+		if (!pre_remove)
+			SetPageHWPoison(page);
 
-		collect_procs_fsdax(page, mapping, index, &to_kill);
+		/*
+		 * The pre_remove case is revoking access, the memory is still
+		 * good and could theoretically be put back into service.
+		 */
+		collect_procs_fsdax(page, mapping, index, &to_kill, pre_remove);
 		unmap_and_kill(&to_kill, page_to_pfn(page), mapping,
 				index, mf_flags);
 unlock:
-- 
cgit v1.2.3


From 3bc05faf37876f99e2a7baffa9c66fdcfb11d1f7 Mon Sep 17 00:00:00 2001
From: Vladimir Oltean <vladimir.oltean@nxp.com>
Date: Tue, 5 Dec 2023 17:42:30 +0100
Subject: net: dsa: microchip: properly support platform_data probing

The ksz driver has bits and pieces of platform_data probing support, but
it doesn't work.

The conventional thing to do is to have an encapsulating structure for
struct dsa_chip_data that gets put into dev->platform_data. This driver
expects a struct ksz_platform_data, but that doesn't contain a struct
dsa_chip_data as first element, which will obviously not work with
dsa_switch_probe() -> dsa_switch_parse().

Pointing dev->platform_data to a struct dsa_chip_data directly is in
principle possible, but that doesn't work either. The driver has
ksz_switch_detect() to read the device ID from hardware, followed by
ksz_check_device_id() to compare it against a predetermined expected
value. This protects against early errors in the SPI/I2C communication.
With platform_data, the mechanism in ksz_check_device_id() doesn't work
and even leads to NULL pointer dereferences, since of_device_get_match_data()
doesn't work in that probe path.

So obviously, the platform_data support is actually missing, and the
existing handling of struct ksz_platform_data is bogus. Complete the
support by adding a struct dsa_chip_data as first element, and fixing up
ksz_check_device_id() to pick up the platform_data instead of the
unavailable of_device_get_match_data().

The early dev->chip_id assignment from ksz_switch_register() is also
bogus, because ksz_switch_detect() sets it to an initial value. So
remove it.

Also, ksz_platform_data :: enabled_ports isn't used anywhere, delete it.

Link: https://lore.kernel.org/netdev/20231204154315.3906267-1-dd@embedd.com/
Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com>
Signed-off-by: Daniel Danzberger <dd@embedd.com>
Reviewed-by: Andrew Lunn <andrew@lunn.ch>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/dsa/microchip/ksz_common.c      | 21 +++++++++++++--------
 include/linux/platform_data/microchip-ksz.h |  4 +++-
 2 files changed, 16 insertions(+), 9 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/dsa/microchip/ksz_common.c b/drivers/net/dsa/microchip/ksz_common.c
index 9545aed905f5..db1bbcf3a5f2 100644
--- a/drivers/net/dsa/microchip/ksz_common.c
+++ b/drivers/net/dsa/microchip/ksz_common.c
@@ -1673,15 +1673,23 @@ static const struct ksz_chip_data *ksz_lookup_info(unsigned int prod_num)
 
 static int ksz_check_device_id(struct ksz_device *dev)
 {
-	const struct ksz_chip_data *dt_chip_data;
+	const struct ksz_chip_data *expected_chip_data;
+	u32 expected_chip_id;
 
-	dt_chip_data = of_device_get_match_data(dev->dev);
+	if (dev->pdata) {
+		expected_chip_id = dev->pdata->chip_id;
+		expected_chip_data = ksz_lookup_info(expected_chip_id);
+		if (WARN_ON(!expected_chip_data))
+			return -ENODEV;
+	} else {
+		expected_chip_data = of_device_get_match_data(dev->dev);
+		expected_chip_id = expected_chip_data->chip_id;
+	}
 
-	/* Check for Device Tree and Chip ID */
-	if (dt_chip_data->chip_id != dev->chip_id) {
+	if (expected_chip_id != dev->chip_id) {
 		dev_err(dev->dev,
 			"Device tree specifies chip %s but found %s, please fix it!\n",
-			dt_chip_data->dev_name, dev->info->dev_name);
+			expected_chip_data->dev_name, dev->info->dev_name);
 		return -ENODEV;
 	}
 
@@ -4156,9 +4164,6 @@ int ksz_switch_register(struct ksz_device *dev)
 	int ret;
 	int i;
 
-	if (dev->pdata)
-		dev->chip_id = dev->pdata->chip_id;
-
 	dev->reset_gpio = devm_gpiod_get_optional(dev->dev, "reset",
 						  GPIOD_OUT_LOW);
 	if (IS_ERR(dev->reset_gpio))
diff --git a/include/linux/platform_data/microchip-ksz.h b/include/linux/platform_data/microchip-ksz.h
index ea1cc6d829e9..6480bf4af0fb 100644
--- a/include/linux/platform_data/microchip-ksz.h
+++ b/include/linux/platform_data/microchip-ksz.h
@@ -20,10 +20,12 @@
 #define __MICROCHIP_KSZ_H
 
 #include <linux/types.h>
+#include <linux/platform_data/dsa.h>
 
 struct ksz_platform_data {
+	/* Must be first such that dsa_register_switch() can access it */
+	struct dsa_chip_data cd;
 	u32 chip_id;
-	u16 enabled_ports;
 };
 
 #endif
-- 
cgit v1.2.3


From d16f1096b320d42e41ad9dee4d4098afd140d3e1 Mon Sep 17 00:00:00 2001
From: Daniel Danzberger <dd@embedd.com>
Date: Tue, 5 Dec 2023 17:42:31 +0100
Subject: net: dsa: microchip: move ksz_chip_id enum to platform include

With the ksz_chip_id enums moved to the platform include file for ksz
switches, platform code that instantiates a device can now use these to
set ksz_platform_data::chip_id.

Signed-off-by: Daniel Danzberger <dd@embedd.com>
Reviewed-by: Andrew Lunn <andrew@lunn.ch>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/dsa/microchip/ksz_common.h      | 20 +-------------------
 include/linux/platform_data/microchip-ksz.h | 19 +++++++++++++++++++
 2 files changed, 20 insertions(+), 19 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/dsa/microchip/ksz_common.h b/drivers/net/dsa/microchip/ksz_common.h
index d1a1c876ba0d..15612101a155 100644
--- a/drivers/net/dsa/microchip/ksz_common.h
+++ b/drivers/net/dsa/microchip/ksz_common.h
@@ -14,6 +14,7 @@
 #include <linux/regmap.h>
 #include <net/dsa.h>
 #include <linux/irq.h>
+#include <linux/platform_data/microchip-ksz.h>
 
 #include "ksz_ptp.h"
 
@@ -203,25 +204,6 @@ enum ksz_model {
 	LAN9374,
 };
 
-enum ksz_chip_id {
-	KSZ8563_CHIP_ID = 0x8563,
-	KSZ8795_CHIP_ID = 0x8795,
-	KSZ8794_CHIP_ID = 0x8794,
-	KSZ8765_CHIP_ID = 0x8765,
-	KSZ8830_CHIP_ID = 0x8830,
-	KSZ9477_CHIP_ID = 0x00947700,
-	KSZ9896_CHIP_ID = 0x00989600,
-	KSZ9897_CHIP_ID = 0x00989700,
-	KSZ9893_CHIP_ID = 0x00989300,
-	KSZ9563_CHIP_ID = 0x00956300,
-	KSZ9567_CHIP_ID = 0x00956700,
-	LAN9370_CHIP_ID = 0x00937000,
-	LAN9371_CHIP_ID = 0x00937100,
-	LAN9372_CHIP_ID = 0x00937200,
-	LAN9373_CHIP_ID = 0x00937300,
-	LAN9374_CHIP_ID = 0x00937400,
-};
-
 enum ksz_regs {
 	REG_SW_MAC_ADDR,
 	REG_IND_CTRL_0,
diff --git a/include/linux/platform_data/microchip-ksz.h b/include/linux/platform_data/microchip-ksz.h
index 6480bf4af0fb..f177416635a2 100644
--- a/include/linux/platform_data/microchip-ksz.h
+++ b/include/linux/platform_data/microchip-ksz.h
@@ -22,6 +22,25 @@
 #include <linux/types.h>
 #include <linux/platform_data/dsa.h>
 
+enum ksz_chip_id {
+	KSZ8563_CHIP_ID = 0x8563,
+	KSZ8795_CHIP_ID = 0x8795,
+	KSZ8794_CHIP_ID = 0x8794,
+	KSZ8765_CHIP_ID = 0x8765,
+	KSZ8830_CHIP_ID = 0x8830,
+	KSZ9477_CHIP_ID = 0x00947700,
+	KSZ9896_CHIP_ID = 0x00989600,
+	KSZ9897_CHIP_ID = 0x00989700,
+	KSZ9893_CHIP_ID = 0x00989300,
+	KSZ9563_CHIP_ID = 0x00956300,
+	KSZ9567_CHIP_ID = 0x00956700,
+	LAN9370_CHIP_ID = 0x00937000,
+	LAN9371_CHIP_ID = 0x00937100,
+	LAN9372_CHIP_ID = 0x00937200,
+	LAN9373_CHIP_ID = 0x00937300,
+	LAN9374_CHIP_ID = 0x00937400,
+};
+
 struct ksz_platform_data {
 	/* Must be first such that dsa_register_switch() can access it */
 	struct dsa_chip_data cd;
-- 
cgit v1.2.3


From 37c8ceb6d92c955f5dd8223c3f6c90b277322210 Mon Sep 17 00:00:00 2001
From: Avri Altman <avri.altman@wdc.com>
Date: Mon, 30 Oct 2023 08:22:26 +0200
Subject: mmc: core: Remove packed command leftovers

Packed commands support was removed long time ago, but some bits got
left behind. Remove them.

Signed-off-by: Avri Altman <avri.altman@wdc.com>
Link: https://lore.kernel.org/r/20231030062226.1895692-1-avri.altman@wdc.com
Signed-off-by: Ulf Hansson <ulf.hansson@linaro.org>
---
 drivers/mmc/core/mmc.c   |  5 -----
 include/linux/mmc/card.h |  3 ---
 include/linux/mmc/core.h |  1 -
 include/linux/mmc/mmc.h  | 10 ----------
 4 files changed, 19 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/mmc/core/mmc.c b/drivers/mmc/core/mmc.c
index 705942edacc6..7b996db570c9 100644
--- a/drivers/mmc/core/mmc.c
+++ b/drivers/mmc/core/mmc.c
@@ -613,11 +613,6 @@ static int mmc_decode_ext_csd(struct mmc_card *card, u8 *ext_csd)
 		} else {
 			card->ext_csd.data_tag_unit_size = 0;
 		}
-
-		card->ext_csd.max_packed_writes =
-			ext_csd[EXT_CSD_MAX_PACKED_WRITES];
-		card->ext_csd.max_packed_reads =
-			ext_csd[EXT_CSD_MAX_PACKED_READS];
 	} else {
 		card->ext_csd.data_sector_size = 512;
 	}
diff --git a/include/linux/mmc/card.h b/include/linux/mmc/card.h
index 7b12eebc5586..47eeb122524c 100644
--- a/include/linux/mmc/card.h
+++ b/include/linux/mmc/card.h
@@ -52,9 +52,6 @@ struct mmc_ext_csd {
 	u8			part_config;
 	u8			cache_ctrl;
 	u8			rst_n_function;
-	u8			max_packed_writes;
-	u8			max_packed_reads;
-	u8			packed_event_en;
 	unsigned int		part_time;		/* Units: ms */
 	unsigned int		sa_timeout;		/* Units: 100ns */
 	unsigned int		generic_cmd6_time;	/* Units: 10ms */
diff --git a/include/linux/mmc/core.h b/include/linux/mmc/core.h
index 6efec0b9820c..2c7928a50907 100644
--- a/include/linux/mmc/core.h
+++ b/include/linux/mmc/core.h
@@ -27,7 +27,6 @@ struct mmc_command {
 	u32			opcode;
 	u32			arg;
 #define MMC_CMD23_ARG_REL_WR	(1 << 31)
-#define MMC_CMD23_ARG_PACKED	((0 << 31) | (1 << 30))
 #define MMC_CMD23_ARG_TAG_REQ	(1 << 29)
 	u32			resp[4];
 	unsigned int		flags;		/* expected response type */
diff --git a/include/linux/mmc/mmc.h b/include/linux/mmc/mmc.h
index 6f7993803ee7..cf2bcb5da30d 100644
--- a/include/linux/mmc/mmc.h
+++ b/include/linux/mmc/mmc.h
@@ -257,8 +257,6 @@ static inline bool mmc_ready_for_data(u32 status)
 #define EXT_CSD_FLUSH_CACHE		32      /* W */
 #define EXT_CSD_CACHE_CTRL		33      /* R/W */
 #define EXT_CSD_POWER_OFF_NOTIFICATION	34	/* R/W */
-#define EXT_CSD_PACKED_FAILURE_INDEX	35	/* RO */
-#define EXT_CSD_PACKED_CMD_STATUS	36	/* RO */
 #define EXT_CSD_EXP_EVENTS_STATUS	54	/* RO, 2 bytes */
 #define EXT_CSD_EXP_EVENTS_CTRL		56	/* R/W, 2 bytes */
 #define EXT_CSD_DATA_SECTOR_SIZE	61	/* R */
@@ -321,8 +319,6 @@ static inline bool mmc_ready_for_data(u32 status)
 #define EXT_CSD_SUPPORTED_MODE		493	/* RO */
 #define EXT_CSD_TAG_UNIT_SIZE		498	/* RO */
 #define EXT_CSD_DATA_TAG_SUPPORT	499	/* RO */
-#define EXT_CSD_MAX_PACKED_WRITES	500	/* RO */
-#define EXT_CSD_MAX_PACKED_READS	501	/* RO */
 #define EXT_CSD_BKOPS_SUPPORT		502	/* RO */
 #define EXT_CSD_HPI_FEATURES		503	/* RO */
 
@@ -402,18 +398,12 @@ static inline bool mmc_ready_for_data(u32 status)
 #define EXT_CSD_PWR_CL_8BIT_SHIFT	4
 #define EXT_CSD_PWR_CL_4BIT_SHIFT	0
 
-#define EXT_CSD_PACKED_EVENT_EN	BIT(3)
-
 /*
  * EXCEPTION_EVENT_STATUS field
  */
 #define EXT_CSD_URGENT_BKOPS		BIT(0)
 #define EXT_CSD_DYNCAP_NEEDED		BIT(1)
 #define EXT_CSD_SYSPOOL_EXHAUSTED	BIT(2)
-#define EXT_CSD_PACKED_FAILURE		BIT(3)
-
-#define EXT_CSD_PACKED_GENERIC_ERROR	BIT(0)
-#define EXT_CSD_PACKED_INDEXED_ERROR	BIT(1)
 
 /*
  * BKOPS status level
-- 
cgit v1.2.3


From 9c0a5b3f9e55cf9a3dc85843666cae28adfdf7e3 Mon Sep 17 00:00:00 2001
From: Uwe Kleine-König <u.kleine-koenig@pengutronix.de>
Date: Mon, 4 Dec 2023 23:05:21 +0100
Subject: w1: gpio: Don't use platform data for driver data
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

struct device's .platform_data isn't for drivers to write to. For
driver-specific data there is .driver_data instead.

As there is no in-tree platform that provides w1_gpio_platform_data,
drop the include file and replace it by a local struct w1_gpio_ddata.

Signed-off-by: Uwe Kleine-König <u.kleine-koenig@pengutronix.de>
Link: https://lore.kernel.org/r/8f7ebe03ddaa5a5c6e2b36fecdf59da7fc373527.1701727212.git.u.kleine-koenig@pengutronix.de
Signed-off-by: Krzysztof Kozlowski <krzysztof.kozlowski@linaro.org>
---
 drivers/w1/masters/w1-gpio.c | 56 +++++++++++++++++++++-----------------------
 include/linux/w1-gpio.h      | 22 -----------------
 2 files changed, 27 insertions(+), 51 deletions(-)
 delete mode 100644 include/linux/w1-gpio.h

(limited to 'include/linux')

diff --git a/drivers/w1/masters/w1-gpio.c b/drivers/w1/masters/w1-gpio.c
index e45acb6d916e..8d65db65178c 100644
--- a/drivers/w1/masters/w1-gpio.c
+++ b/drivers/w1/masters/w1-gpio.c
@@ -9,7 +9,6 @@
 #include <linux/module.h>
 #include <linux/platform_device.h>
 #include <linux/slab.h>
-#include <linux/w1-gpio.h>
 #include <linux/gpio/consumer.h>
 #include <linux/of_platform.h>
 #include <linux/err.h>
@@ -18,9 +17,16 @@
 
 #include <linux/w1.h>
 
+struct w1_gpio_ddata {
+	struct gpio_desc *gpiod;
+	struct gpio_desc *pullup_gpiod;
+	void (*enable_external_pullup)(int enable);
+	unsigned int pullup_duration;
+};
+
 static u8 w1_gpio_set_pullup(void *data, int delay)
 {
-	struct w1_gpio_platform_data *pdata = data;
+	struct w1_gpio_ddata *pdata = data;
 
 	if (delay) {
 		pdata->pullup_duration = delay;
@@ -46,14 +52,14 @@ static u8 w1_gpio_set_pullup(void *data, int delay)
 
 static void w1_gpio_write_bit(void *data, u8 bit)
 {
-	struct w1_gpio_platform_data *pdata = data;
+	struct w1_gpio_ddata *pdata = data;
 
 	gpiod_set_value(pdata->gpiod, bit);
 }
 
 static u8 w1_gpio_read_bit(void *data)
 {
-	struct w1_gpio_platform_data *pdata = data;
+	struct w1_gpio_ddata *pdata = data;
 
 	return gpiod_get_value(pdata->gpiod) ? 1 : 0;
 }
@@ -69,35 +75,25 @@ MODULE_DEVICE_TABLE(of, w1_gpio_dt_ids);
 static int w1_gpio_probe(struct platform_device *pdev)
 {
 	struct w1_bus_master *master;
-	struct w1_gpio_platform_data *pdata;
+	struct w1_gpio_ddata *pdata;
 	struct device *dev = &pdev->dev;
 	struct device_node *np = dev->of_node;
 	/* Enforce open drain mode by default */
 	enum gpiod_flags gflags = GPIOD_OUT_LOW_OPEN_DRAIN;
 	int err;
 
-	if (of_have_populated_dt()) {
-		pdata = devm_kzalloc(&pdev->dev, sizeof(*pdata), GFP_KERNEL);
-		if (!pdata)
-			return -ENOMEM;
-
-		/*
-		 * This parameter means that something else than the gpiolib has
-		 * already set the line into open drain mode, so we should just
-		 * driver it high/low like we are in full control of the line and
-		 * open drain will happen transparently.
-		 */
-		if (of_property_present(np, "linux,open-drain"))
-			gflags = GPIOD_OUT_LOW;
-
-		pdev->dev.platform_data = pdata;
-	}
-	pdata = dev_get_platdata(dev);
+	pdata = devm_kzalloc(&pdev->dev, sizeof(*pdata), GFP_KERNEL);
+	if (!pdata)
+		return -ENOMEM;
 
-	if (!pdata) {
-		dev_err(dev, "No configuration data\n");
-		return -ENXIO;
-	}
+	/*
+	 * This parameter means that something else than the gpiolib has
+	 * already set the line into open drain mode, so we should just
+	 * driver it high/low like we are in full control of the line and
+	 * open drain will happen transparently.
+	 */
+	if (of_property_present(np, "linux,open-drain"))
+		gflags = GPIOD_OUT_LOW;
 
 	master = devm_kzalloc(dev, sizeof(struct w1_bus_master),
 			GFP_KERNEL);
@@ -152,7 +148,7 @@ static int w1_gpio_probe(struct platform_device *pdev)
 static int w1_gpio_remove(struct platform_device *pdev)
 {
 	struct w1_bus_master *master = platform_get_drvdata(pdev);
-	struct w1_gpio_platform_data *pdata = dev_get_platdata(&pdev->dev);
+	struct w1_gpio_ddata *pdata = master->data;
 
 	if (pdata->enable_external_pullup)
 		pdata->enable_external_pullup(0);
@@ -167,7 +163,8 @@ static int w1_gpio_remove(struct platform_device *pdev)
 
 static int __maybe_unused w1_gpio_suspend(struct device *dev)
 {
-	struct w1_gpio_platform_data *pdata = dev_get_platdata(dev);
+	struct w1_bus_master *master = dev_get_drvdata(dev);
+	struct w1_gpio_ddata *pdata = master->data;
 
 	if (pdata->enable_external_pullup)
 		pdata->enable_external_pullup(0);
@@ -177,7 +174,8 @@ static int __maybe_unused w1_gpio_suspend(struct device *dev)
 
 static int __maybe_unused w1_gpio_resume(struct device *dev)
 {
-	struct w1_gpio_platform_data *pdata = dev_get_platdata(dev);
+	struct w1_bus_master *master = dev_get_drvdata(dev);
+	struct w1_gpio_ddata *pdata = master->data;
 
 	if (pdata->enable_external_pullup)
 		pdata->enable_external_pullup(1);
diff --git a/include/linux/w1-gpio.h b/include/linux/w1-gpio.h
deleted file mode 100644
index 3495fd0dc790..000000000000
--- a/include/linux/w1-gpio.h
+++ /dev/null
@@ -1,22 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-only */
-/*
- * w1-gpio interface to platform code
- *
- * Copyright (C) 2007 Ville Syrjala <syrjala@sci.fi>
- */
-#ifndef _LINUX_W1_GPIO_H
-#define _LINUX_W1_GPIO_H
-
-struct gpio_desc;
-
-/**
- * struct w1_gpio_platform_data - Platform-dependent data for w1-gpio
- */
-struct w1_gpio_platform_data {
-	struct gpio_desc *gpiod;
-	struct gpio_desc *pullup_gpiod;
-	void (*enable_external_pullup)(int enable);
-	unsigned int pullup_duration;
-};
-
-#endif /* _LINUX_W1_GPIO_H */
-- 
cgit v1.2.3


From 1fe13d83e2873b0aedeb5b9a299ca763bd37d75f Mon Sep 17 00:00:00 2001
From: Kaihua Zhong <zhongkaihua@kylinos.cn>
Date: Wed, 29 Nov 2023 09:55:26 +0800
Subject: mfd: Fix a few spelling mistakes in PMIC header file comments

Fix four comment typos in MFD PMIC header files.

Reported-by: k2ci <kernel-bot@kylinos.cn>
Signed-off-by: Kaihua Zhong <zhongkaihua@kylinos.cn>
Reviewed-by: Randy Dunlap <rdunlap@infradead.org>
Link: https://lore.kernel.org/r/20231129015526.3302865-1-zhongkaihua@kylinos.cn
Signed-off-by: Lee Jones <lee@kernel.org>
---
 include/linux/mfd/max77693-private.h | 2 +-
 include/linux/mfd/max77843-private.h | 2 +-
 include/linux/mfd/si476x-platform.h  | 2 +-
 include/linux/mfd/tps65910.h         | 2 +-
 4 files changed, 4 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mfd/max77693-private.h b/include/linux/mfd/max77693-private.h
index 311f7d3d2323..54444ff2a5de 100644
--- a/include/linux/mfd/max77693-private.h
+++ b/include/linux/mfd/max77693-private.h
@@ -405,7 +405,7 @@ enum max77693_haptic_reg {
 	MAX77693_HAPTIC_REG_END,
 };
 
-/* max77693-pmic LSCNFG configuraton register */
+/* max77693-pmic LSCNFG configuration register */
 #define MAX77693_PMIC_LOW_SYS_MASK      0x80
 #define MAX77693_PMIC_LOW_SYS_SHIFT     7
 
diff --git a/include/linux/mfd/max77843-private.h b/include/linux/mfd/max77843-private.h
index 0bc7454c4dbe..2fb4db67f110 100644
--- a/include/linux/mfd/max77843-private.h
+++ b/include/linux/mfd/max77843-private.h
@@ -198,7 +198,7 @@ enum max77843_irq_muic {
 #define MAX77843_MCONFIG_MEN_MASK		BIT(MCONFIG_MEN_SHIFT)
 #define MAX77843_MCONFIG_PDIV_MASK		(0x3 << MCONFIG_PDIV_SHIFT)
 
-/* Max77843 charger insterrupts */
+/* Max77843 charger interrupts */
 #define MAX77843_CHG_BYP_I			BIT(0)
 #define MAX77843_CHG_BATP_I			BIT(2)
 #define MAX77843_CHG_BAT_I			BIT(3)
diff --git a/include/linux/mfd/si476x-platform.h b/include/linux/mfd/si476x-platform.h
index 18363b773d07..cb99e16ca947 100644
--- a/include/linux/mfd/si476x-platform.h
+++ b/include/linux/mfd/si476x-platform.h
@@ -10,7 +10,7 @@
 #ifndef __SI476X_PLATFORM_H__
 #define __SI476X_PLATFORM_H__
 
-/* It is possible to select one of the four adresses using pins A0
+/* It is possible to select one of the four addresses using pins A0
  * and A1 on SI476x */
 #define SI476X_I2C_ADDR_1	0x60
 #define SI476X_I2C_ADDR_2	0x61
diff --git a/include/linux/mfd/tps65910.h b/include/linux/mfd/tps65910.h
index 701925db75b3..f67ef0a4e041 100644
--- a/include/linux/mfd/tps65910.h
+++ b/include/linux/mfd/tps65910.h
@@ -749,7 +749,7 @@
 #define VDDCTRL_ST_SHIFT                                 0
 
 
-/*Register VDDCTRL_OP  (0x28) bit definitios */
+/*Register VDDCTRL_OP  (0x28) bit definitions */
 #define VDDCTRL_OP_CMD_MASK                              0x80
 #define VDDCTRL_OP_CMD_SHIFT                             7
 #define VDDCTRL_OP_SEL_MASK                              0x7F
-- 
cgit v1.2.3


From 4d8ff6b0991d5e86b17b235fc46ec62e9195cb9b Mon Sep 17 00:00:00 2001
From: Amit Kumar Mahapatra <amit.kumar-mahapatra@amd.com>
Date: Sat, 25 Nov 2023 14:51:30 +0530
Subject: spi: Add multi-cs memories support in SPI core

AMD-Xilinx GQSPI controller has two advanced mode that allows the
controller to consider two flashes as one single device.

One of these two mode is the parallel mode in which each byte of data is
stored in both devices, the even bits in the lower flash & the odd bits in
the upper flash. The byte split is automatically handled by the QSPI
controller.

The other mode is the stacked mode in which both the flashes share the
same SPI bus but each of the device contain half of the data. In this mode,
the controller does not follow CS requests but instead internally wires the
two CS levels with the value of the most significant address bit.

For supporting both these modes SPI core need to be updated for providing
multiple CS for a single SPI device.

For adding multi CS support the SPI device need to be aware of all the CS
values. So, the "chip_select" member in the spi_device structure is now an
array that holds all the CS values.

spi_device structure now has a "cs_index_mask" member. This acts as an
index to the chip_select array. If nth bit of spi->cs_index_mask is set
then the driver would assert spi->chip_select[n].

In parallel mode all the chip selects are asserted/de-asserted
simultaneously and each byte of data is stored in both devices, the even
bits in one, the odd bits in the other. The split is automatically handled
by the GQSPI controller. The GQSPI controller supports a maximum of two
flashes connected in parallel mode. A SPI_CONTROLLER_MULTI_CS flag bit is
added in the spi controller flags, through ctlr->flags the spi core
will make sure that the controller is capable of handling multiple chip
selects at once.

For supporting multiple CS via GPIO the cs_gpiod member of the spi_device
structure is now an array that holds the gpio descriptor for each
chipselect.

CS GPIO is not tested on our hardware, but it has been tested by @Stefan
https://lore.kernel.org/all/005001da1efc$619ad5a0$24d080e0$@opensource.cirrus.com/

Signed-off-by: Amit Kumar Mahapatra <amit.kumar-mahapatra@amd.com>
Tested-by: Stefan Binding <sbinding@opensource.cirrus.com>
Link: https://lore.kernel.org/r/20231125092137.2948-4-amit.kumar-mahapatra@amd.com
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 drivers/spi/spi.c       | 259 ++++++++++++++++++++++++++++++++++++++++--------
 include/linux/spi/spi.h |  51 ++++++++--
 2 files changed, 258 insertions(+), 52 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/spi/spi.c b/drivers/spi/spi.c
index 8ead7acb99f3..45b6898cf0ee 100644
--- a/drivers/spi/spi.c
+++ b/drivers/spi/spi.c
@@ -612,10 +612,21 @@ static int spi_dev_check(struct device *dev, void *data)
 {
 	struct spi_device *spi = to_spi_device(dev);
 	struct spi_device *new_spi = data;
-
-	if (spi->controller == new_spi->controller &&
-	    spi_get_chipselect(spi, 0) == spi_get_chipselect(new_spi, 0))
-		return -EBUSY;
+	int idx, nw_idx;
+	u8 cs, cs_nw;
+
+	if (spi->controller == new_spi->controller) {
+		for (idx = 0; idx < SPI_CS_CNT_MAX; idx++) {
+			cs = spi_get_chipselect(spi, idx);
+			for (nw_idx = 0; nw_idx < SPI_CS_CNT_MAX; nw_idx++) {
+				cs_nw = spi_get_chipselect(new_spi, nw_idx);
+				if (cs != 0xFF && cs_nw != 0xFF && cs == cs_nw) {
+					dev_err(dev, "chipselect %d already in use\n", cs_nw);
+					return -EBUSY;
+				}
+			}
+		}
+	}
 	return 0;
 }
 
@@ -629,13 +640,32 @@ static int __spi_add_device(struct spi_device *spi)
 {
 	struct spi_controller *ctlr = spi->controller;
 	struct device *dev = ctlr->dev.parent;
-	int status;
+	int status, idx, nw_idx;
+	u8 cs, nw_cs;
+
+	for (idx = 0; idx < SPI_CS_CNT_MAX; idx++) {
+		/* Chipselects are numbered 0..max; validate. */
+		cs = spi_get_chipselect(spi, idx);
+		if (cs != 0xFF && cs >= ctlr->num_chipselect) {
+			dev_err(dev, "cs%d >= max %d\n", spi_get_chipselect(spi, idx),
+				ctlr->num_chipselect);
+			return -EINVAL;
+		}
+	}
 
-	/* Chipselects are numbered 0..max; validate. */
-	if (spi_get_chipselect(spi, 0) >= ctlr->num_chipselect) {
-		dev_err(dev, "cs%d >= max %d\n", spi_get_chipselect(spi, 0),
-			ctlr->num_chipselect);
-		return -EINVAL;
+	/*
+	 * Make sure that multiple logical CS doesn't map to the same physical CS.
+	 * For example, spi->chip_select[0] != spi->chip_select[1] and so on.
+	 */
+	for (idx = 0; idx < SPI_CS_CNT_MAX; idx++) {
+		cs = spi_get_chipselect(spi, idx);
+		for (nw_idx = idx + 1; nw_idx < SPI_CS_CNT_MAX; nw_idx++) {
+			nw_cs = spi_get_chipselect(spi, nw_idx);
+			if (cs != 0xFF && nw_cs != 0xFF && cs == nw_cs) {
+				dev_err(dev, "chipselect %d already in use\n", nw_cs);
+				return -EBUSY;
+			}
+		}
 	}
 
 	/* Set the bus ID string */
@@ -647,11 +677,8 @@ static int __spi_add_device(struct spi_device *spi)
 	 * its configuration.
 	 */
 	status = bus_for_each_dev(&spi_bus_type, NULL, spi, spi_dev_check);
-	if (status) {
-		dev_err(dev, "chipselect %d already in use\n",
-				spi_get_chipselect(spi, 0));
+	if (status)
 		return status;
-	}
 
 	/* Controller may unregister concurrently */
 	if (IS_ENABLED(CONFIG_SPI_DYNAMIC) &&
@@ -659,8 +686,15 @@ static int __spi_add_device(struct spi_device *spi)
 		return -ENODEV;
 	}
 
-	if (ctlr->cs_gpiods)
-		spi_set_csgpiod(spi, 0, ctlr->cs_gpiods[spi_get_chipselect(spi, 0)]);
+	if (ctlr->cs_gpiods) {
+		u8 cs;
+
+		for (idx = 0; idx < SPI_CS_CNT_MAX; idx++) {
+			cs = spi_get_chipselect(spi, idx);
+			if (cs != 0xFF)
+				spi_set_csgpiod(spi, idx, ctlr->cs_gpiods[cs]);
+		}
+	}
 
 	/*
 	 * Drivers may modify this initial i/o setup, but will
@@ -701,6 +735,9 @@ int spi_add_device(struct spi_device *spi)
 	struct spi_controller *ctlr = spi->controller;
 	int status;
 
+	/* Set the bus ID string */
+	spi_dev_set_name(spi);
+
 	mutex_lock(&ctlr->add_lock);
 	status = __spi_add_device(spi);
 	mutex_unlock(&ctlr->add_lock);
@@ -727,6 +764,7 @@ struct spi_device *spi_new_device(struct spi_controller *ctlr,
 {
 	struct spi_device	*proxy;
 	int			status;
+	u8                      idx;
 
 	/*
 	 * NOTE:  caller did any chip->bus_num checks necessary.
@@ -742,6 +780,18 @@ struct spi_device *spi_new_device(struct spi_controller *ctlr,
 
 	WARN_ON(strlen(chip->modalias) >= sizeof(proxy->modalias));
 
+	/*
+	 * Zero(0) is a valid physical CS value and can be located at any
+	 * logical CS in the spi->chip_select[]. If all the physical CS
+	 * are initialized to 0 then It would be difficult to differentiate
+	 * between a valid physical CS 0 & an unused logical CS whose physical
+	 * CS can be 0. As a solution to this issue initialize all the CS to 0xFF.
+	 * Now all the unused logical CS will have 0xFF physical CS value & can be
+	 * ignore while performing physical CS validity checks.
+	 */
+	for (idx = 0; idx < SPI_CS_CNT_MAX; idx++)
+		spi_set_chipselect(proxy, idx, 0xFF);
+
 	spi_set_chipselect(proxy, 0, chip->chip_select);
 	proxy->max_speed_hz = chip->max_speed_hz;
 	proxy->mode = chip->mode;
@@ -750,6 +800,15 @@ struct spi_device *spi_new_device(struct spi_controller *ctlr,
 	proxy->dev.platform_data = (void *) chip->platform_data;
 	proxy->controller_data = chip->controller_data;
 	proxy->controller_state = NULL;
+	/*
+	 * spi->chip_select[i] gives the corresponding physical CS for logical CS i
+	 * logical CS number is represented by setting the ith bit in spi->cs_index_mask
+	 * So, for example, if spi->cs_index_mask = 0x01 then logical CS number is 0 and
+	 * spi->chip_select[0] will give the physical CS.
+	 * By default spi->chip_select[0] will hold the physical CS number so, set
+	 * spi->cs_index_mask as 0x01.
+	 */
+	proxy->cs_index_mask = 0x01;
 
 	if (chip->swnode) {
 		status = device_add_software_node(&proxy->dev, chip->swnode);
@@ -942,32 +1001,51 @@ static void spi_res_release(struct spi_controller *ctlr, struct spi_message *mes
 }
 
 /*-------------------------------------------------------------------------*/
+static inline bool spi_is_last_cs(struct spi_device *spi)
+{
+	u8 idx;
+	bool last = false;
+
+	for (idx = 0; idx < SPI_CS_CNT_MAX; idx++) {
+		if ((spi->cs_index_mask >> idx) & 0x01) {
+			if (spi->controller->last_cs[idx] == spi_get_chipselect(spi, idx))
+				last = true;
+		}
+	}
+	return last;
+}
+
 
 static void spi_set_cs(struct spi_device *spi, bool enable, bool force)
 {
 	bool activate = enable;
+	u8 idx;
 
 	/*
 	 * Avoid calling into the driver (or doing delays) if the chip select
 	 * isn't actually changing from the last time this was called.
 	 */
-	if (!force && ((enable && spi->controller->last_cs == spi_get_chipselect(spi, 0)) ||
-		       (!enable && spi->controller->last_cs != spi_get_chipselect(spi, 0))) &&
+	if (!force && ((enable && spi->controller->last_cs_index_mask == spi->cs_index_mask &&
+			spi_is_last_cs(spi)) ||
+		       (!enable && spi->controller->last_cs_index_mask == spi->cs_index_mask &&
+			!spi_is_last_cs(spi))) &&
 	    (spi->controller->last_cs_mode_high == (spi->mode & SPI_CS_HIGH)))
 		return;
 
 	trace_spi_set_cs(spi, activate);
 
-	spi->controller->last_cs = enable ? spi_get_chipselect(spi, 0) : -1;
+	spi->controller->last_cs_index_mask = spi->cs_index_mask;
+	for (idx = 0; idx < SPI_CS_CNT_MAX; idx++)
+		spi->controller->last_cs[idx] = enable ? spi_get_chipselect(spi, 0) : -1;
 	spi->controller->last_cs_mode_high = spi->mode & SPI_CS_HIGH;
 
-	if ((spi_get_csgpiod(spi, 0) || !spi->controller->set_cs_timing) && !activate)
-		spi_delay_exec(&spi->cs_hold, NULL);
-
 	if (spi->mode & SPI_CS_HIGH)
 		enable = !enable;
 
-	if (spi_get_csgpiod(spi, 0)) {
+	if (spi_is_csgpiod(spi)) {
+		if (!spi->controller->set_cs_timing && !activate)
+			spi_delay_exec(&spi->cs_hold, NULL);
+
 		if (!(spi->mode & SPI_NO_CS)) {
 			/*
 			 * Historically ACPI has no means of the GPIO polarity and
@@ -979,26 +1057,38 @@ static void spi_set_cs(struct spi_device *spi, bool enable, bool force)
 			 * ambiguity. That's why we use enable, that takes SPI_CS_HIGH
 			 * into account.
 			 */
-			if (has_acpi_companion(&spi->dev))
-				gpiod_set_value_cansleep(spi_get_csgpiod(spi, 0), !enable);
-			else
-				/* Polarity handled by GPIO library */
-				gpiod_set_value_cansleep(spi_get_csgpiod(spi, 0), activate);
+			for (idx = 0; idx < SPI_CS_CNT_MAX; idx++) {
+				if (((spi->cs_index_mask >> idx) & 0x01) &&
+				    spi_get_csgpiod(spi, idx)) {
+					if (has_acpi_companion(&spi->dev))
+						gpiod_set_value_cansleep(spi_get_csgpiod(spi, idx),
+									 !enable);
+					else
+						/* Polarity handled by GPIO library */
+						gpiod_set_value_cansleep(spi_get_csgpiod(spi, idx),
+									 activate);
+
+					if (activate)
+						spi_delay_exec(&spi->cs_setup, NULL);
+					else
+						spi_delay_exec(&spi->cs_inactive, NULL);
+				}
+			}
 		}
 		/* Some SPI masters need both GPIO CS & slave_select */
 		if ((spi->controller->flags & SPI_CONTROLLER_GPIO_SS) &&
 		    spi->controller->set_cs)
 			spi->controller->set_cs(spi, !enable);
+
+		if (!spi->controller->set_cs_timing) {
+			if (activate)
+				spi_delay_exec(&spi->cs_setup, NULL);
+			else
+				spi_delay_exec(&spi->cs_inactive, NULL);
+		}
 	} else if (spi->controller->set_cs) {
 		spi->controller->set_cs(spi, !enable);
 	}
-
-	if (spi_get_csgpiod(spi, 0) || !spi->controller->set_cs_timing) {
-		if (activate)
-			spi_delay_exec(&spi->cs_setup, NULL);
-		else
-			spi_delay_exec(&spi->cs_inactive, NULL);
-	}
 }
 
 #ifdef CONFIG_HAS_DMA
@@ -2222,8 +2312,8 @@ static void of_spi_parse_dt_cs_delay(struct device_node *nc,
 static int of_spi_parse_dt(struct spi_controller *ctlr, struct spi_device *spi,
 			   struct device_node *nc)
 {
-	u32 value;
-	int rc;
+	u32 value, cs[SPI_CS_CNT_MAX];
+	int rc, idx;
 
 	/* Mode (clock phase/polarity/etc.) */
 	if (of_property_read_bool(nc, "spi-cpha"))
@@ -2295,14 +2385,53 @@ static int of_spi_parse_dt(struct spi_controller *ctlr, struct spi_device *spi,
 		return 0;
 	}
 
+	if (ctlr->num_chipselect > SPI_CS_CNT_MAX) {
+		dev_err(&ctlr->dev, "No. of CS is more than max. no. of supported CS\n");
+		return -EINVAL;
+	}
+
+	/*
+	 * Zero(0) is a valid physical CS value and can be located at any
+	 * logical CS in the spi->chip_select[]. If all the physical CS
+	 * are initialized to 0 then It would be difficult to differentiate
+	 * between a valid physical CS 0 & an unused logical CS whose physical
+	 * CS can be 0. As a solution to this issue initialize all the CS to 0xFF.
+	 * Now all the unused logical CS will have 0xFF physical CS value & can be
+	 * ignore while performing physical CS validity checks.
+	 */
+	for (idx = 0; idx < SPI_CS_CNT_MAX; idx++)
+		spi_set_chipselect(spi, idx, 0xFF);
+
 	/* Device address */
-	rc = of_property_read_u32(nc, "reg", &value);
-	if (rc) {
+	rc = of_property_read_variable_u32_array(nc, "reg", &cs[0], 1,
+						 SPI_CS_CNT_MAX);
+	if (rc < 0) {
 		dev_err(&ctlr->dev, "%pOF has no valid 'reg' property (%d)\n",
 			nc, rc);
 		return rc;
 	}
-	spi_set_chipselect(spi, 0, value);
+	if (rc > ctlr->num_chipselect) {
+		dev_err(&ctlr->dev, "%pOF has number of CS > ctlr->num_chipselect (%d)\n",
+			nc, rc);
+		return rc;
+	}
+	if ((of_property_read_bool(nc, "parallel-memories")) &&
+	    (!(ctlr->flags & SPI_CONTROLLER_MULTI_CS))) {
+		dev_err(&ctlr->dev, "SPI controller doesn't support multi CS\n");
+		return -EINVAL;
+	}
+	for (idx = 0; idx < rc; idx++)
+		spi_set_chipselect(spi, idx, cs[idx]);
+
+	/*
+	 * spi->chip_select[i] gives the corresponding physical CS for logical CS i
+	 * logical CS number is represented by setting the ith bit in spi->cs_index_mask
+	 * So, for example, if spi->cs_index_mask = 0x01 then logical CS number is 0 and
+	 * spi->chip_select[0] will give the physical CS.
+	 * By default spi->chip_select[0] will hold the physical CS number so, set
+	 * spi->cs_index_mask as 0x01.
+	 */
+	spi->cs_index_mask = 0x01;
 
 	/* Device speed */
 	if (!of_property_read_u32(nc, "spi-max-frequency", &value))
@@ -2408,6 +2537,7 @@ struct spi_device *spi_new_ancillary_device(struct spi_device *spi,
 	struct spi_controller *ctlr = spi->controller;
 	struct spi_device *ancillary;
 	int rc = 0;
+	u8 idx;
 
 	/* Alloc an spi_device */
 	ancillary = spi_alloc_device(ctlr);
@@ -2418,12 +2548,33 @@ struct spi_device *spi_new_ancillary_device(struct spi_device *spi,
 
 	strscpy(ancillary->modalias, "dummy", sizeof(ancillary->modalias));
 
+	/*
+	 * Zero(0) is a valid physical CS value and can be located at any
+	 * logical CS in the spi->chip_select[]. If all the physical CS
+	 * are initialized to 0 then It would be difficult to differentiate
+	 * between a valid physical CS 0 & an unused logical CS whose physical
+	 * CS can be 0. As a solution to this issue initialize all the CS to 0xFF.
+	 * Now all the unused logical CS will have 0xFF physical CS value & can be
+	 * ignore while performing physical CS validity checks.
+	 */
+	for (idx = 0; idx < SPI_CS_CNT_MAX; idx++)
+		spi_set_chipselect(ancillary, idx, 0xFF);
+
 	/* Use provided chip-select for ancillary device */
 	spi_set_chipselect(ancillary, 0, chip_select);
 
 	/* Take over SPI mode/speed from SPI main device */
 	ancillary->max_speed_hz = spi->max_speed_hz;
 	ancillary->mode = spi->mode;
+	/*
+	 * spi->chip_select[i] gives the corresponding physical CS for logical CS i
+	 * logical CS number is represented by setting the ith bit in spi->cs_index_mask
+	 * So, for example, if spi->cs_index_mask = 0x01 then logical CS number is 0 and
+	 * spi->chip_select[0] will give the physical CS.
+	 * By default spi->chip_select[0] will hold the physical CS number so, set
+	 * spi->cs_index_mask as 0x01.
+	 */
+	ancillary->cs_index_mask = 0x01;
 
 	WARN_ON(!mutex_is_locked(&ctlr->add_lock));
 
@@ -2626,6 +2777,7 @@ struct spi_device *acpi_spi_device_alloc(struct spi_controller *ctlr,
 	struct acpi_spi_lookup lookup = {};
 	struct spi_device *spi;
 	int ret;
+	u8 idx;
 
 	if (!ctlr && index == -1)
 		return ERR_PTR(-EINVAL);
@@ -2661,12 +2813,33 @@ struct spi_device *acpi_spi_device_alloc(struct spi_controller *ctlr,
 		return ERR_PTR(-ENOMEM);
 	}
 
+	/*
+	 * Zero(0) is a valid physical CS value and can be located at any
+	 * logical CS in the spi->chip_select[]. If all the physical CS
+	 * are initialized to 0 then It would be difficult to differentiate
+	 * between a valid physical CS 0 & an unused logical CS whose physical
+	 * CS can be 0. As a solution to this issue initialize all the CS to 0xFF.
+	 * Now all the unused logical CS will have 0xFF physical CS value & can be
+	 * ignore while performing physical CS validity checks.
+	 */
+	for (idx = 0; idx < SPI_CS_CNT_MAX; idx++)
+		spi_set_chipselect(spi, idx, 0xFF);
+
 	ACPI_COMPANION_SET(&spi->dev, adev);
 	spi->max_speed_hz	= lookup.max_speed_hz;
 	spi->mode		|= lookup.mode;
 	spi->irq		= lookup.irq;
 	spi->bits_per_word	= lookup.bits_per_word;
 	spi_set_chipselect(spi, 0, lookup.chip_select);
+	/*
+	 * spi->chip_select[i] gives the corresponding physical CS for logical CS i
+	 * logical CS number is represented by setting the ith bit in spi->cs_index_mask
+	 * So, for example, if spi->cs_index_mask = 0x01 then logical CS number is 0 and
+	 * spi->chip_select[0] will give the physical CS.
+	 * By default spi->chip_select[0] will hold the physical CS number so, set
+	 * spi->cs_index_mask as 0x01.
+	 */
+	spi->cs_index_mask	= 0x01;
 
 	return spi;
 }
@@ -3100,6 +3273,7 @@ int spi_register_controller(struct spi_controller *ctlr)
 	struct boardinfo	*bi;
 	int			first_dynamic;
 	int			status;
+	int			idx;
 
 	if (!dev)
 		return -ENODEV;
@@ -3164,7 +3338,8 @@ int spi_register_controller(struct spi_controller *ctlr)
 	}
 
 	/* Setting last_cs to -1 means no chip selected */
-	ctlr->last_cs = -1;
+	for (idx = 0; idx < SPI_CS_CNT_MAX; idx++)
+		ctlr->last_cs[idx] = -1;
 
 	status = device_add(&ctlr->dev);
 	if (status < 0)
@@ -3889,7 +4064,7 @@ static int __spi_validate(struct spi_device *spi, struct spi_message *message)
 	 * cs_change is set for each transfer.
 	 */
 	if ((spi->mode & SPI_CS_WORD) && (!(ctlr->mode_bits & SPI_CS_WORD) ||
-					  spi_get_csgpiod(spi, 0))) {
+					  spi_is_csgpiod(spi))) {
 		size_t maxsize = BITS_TO_BYTES(spi->bits_per_word);
 		int ret;
 
diff --git a/include/linux/spi/spi.h b/include/linux/spi/spi.h
index 255a0562aea5..50622054b6af 100644
--- a/include/linux/spi/spi.h
+++ b/include/linux/spi/spi.h
@@ -20,6 +20,9 @@
 
 #include <uapi/linux/spi/spi.h>
 
+/* Max no. of CS supported per spi device */
+#define SPI_CS_CNT_MAX 4
+
 struct dma_chan;
 struct software_node;
 struct ptp_system_timestamp;
@@ -132,7 +135,8 @@ extern void spi_transfer_cs_change_delay_exec(struct spi_message *msg,
  * @max_speed_hz: Maximum clock rate to be used with this chip
  *	(on this board); may be changed by the device's driver.
  *	The spi_transfer.speed_hz can override this for each transfer.
- * @chip_select: Chipselect, distinguishing chips handled by @controller.
+ * @chip_select: Array of physical chipselect, spi->chipselect[i] gives
+ *	the corresponding physical CS for logical CS i.
  * @mode: The spi mode defines how data is clocked out and in.
  *	This may be changed by the device's driver.
  *	The "active low" default for chipselect mode can be overridden
@@ -157,8 +161,8 @@ extern void spi_transfer_cs_change_delay_exec(struct spi_message *msg,
  *	the device will bind to the named driver and only the named driver.
  *	Do not set directly, because core frees it; use driver_set_override() to
  *	set or clear it.
- * @cs_gpiod: GPIO descriptor of the chipselect line (optional, NULL when
- *	not using a GPIO line)
+ * @cs_gpiod: Array of GPIO descriptors of the corresponding chipselect lines
+ *	(optional, NULL when not using a GPIO line)
  * @word_delay: delay to be inserted between consecutive
  *	words of a transfer
  * @cs_setup: delay to be introduced by the controller after CS is asserted
@@ -167,6 +171,7 @@ extern void spi_transfer_cs_change_delay_exec(struct spi_message *msg,
  *	deasserted. If @cs_change_delay is used from @spi_transfer, then the
  *	two delays will be added up.
  * @pcpu_statistics: statistics for the spi_device
+ * @cs_index_mask: Bit mask of the active chipselect(s) in the chipselect array
  *
  * A @spi_device is used to interchange data between an SPI slave
  * (usually a discrete chip) and CPU memory.
@@ -182,7 +187,7 @@ struct spi_device {
 	struct spi_controller	*controller;
 	struct spi_controller	*master;	/* Compatibility layer */
 	u32			max_speed_hz;
-	u8			chip_select;
+	u8			chip_select[SPI_CS_CNT_MAX];
 	u8			bits_per_word;
 	bool			rt;
 #define SPI_NO_TX		BIT(31)		/* No transmit wire */
@@ -213,7 +218,7 @@ struct spi_device {
 	void			*controller_data;
 	char			modalias[SPI_NAME_SIZE];
 	const char		*driver_override;
-	struct gpio_desc	*cs_gpiod;	/* Chip select GPIO descriptor */
+	struct gpio_desc	*cs_gpiod[SPI_CS_CNT_MAX];	/* Chip select gpio desc */
 	struct spi_delay	word_delay; /* Inter-word delay */
 	/* CS delays */
 	struct spi_delay	cs_setup;
@@ -223,6 +228,13 @@ struct spi_device {
 	/* The statistics */
 	struct spi_statistics __percpu	*pcpu_statistics;
 
+	/* Bit mask of the chipselect(s) that the driver need to use from
+	 * the chipselect array.When the controller is capable to handle
+	 * multiple chip selects & memories are connected in parallel
+	 * then more than one bit need to be set in cs_index_mask.
+	 */
+	u32			cs_index_mask : SPI_CS_CNT_MAX;
+
 	/*
 	 * Likely need more hooks for more protocol options affecting how
 	 * the controller talks to each chip, like:
@@ -279,22 +291,33 @@ static inline void *spi_get_drvdata(const struct spi_device *spi)
 
 static inline u8 spi_get_chipselect(const struct spi_device *spi, u8 idx)
 {
-	return spi->chip_select;
+	return spi->chip_select[idx];
 }
 
 static inline void spi_set_chipselect(struct spi_device *spi, u8 idx, u8 chipselect)
 {
-	spi->chip_select = chipselect;
+	spi->chip_select[idx] = chipselect;
 }
 
 static inline struct gpio_desc *spi_get_csgpiod(const struct spi_device *spi, u8 idx)
 {
-	return spi->cs_gpiod;
+	return spi->cs_gpiod[idx];
 }
 
 static inline void spi_set_csgpiod(struct spi_device *spi, u8 idx, struct gpio_desc *csgpiod)
 {
-	spi->cs_gpiod = csgpiod;
+	spi->cs_gpiod[idx] = csgpiod;
+}
+
+static inline bool spi_is_csgpiod(struct spi_device *spi)
+{
+	u8 idx;
+
+	for (idx = 0; idx < SPI_CS_CNT_MAX; idx++) {
+		if (spi_get_csgpiod(spi, idx))
+			return true;
+	}
+	return false;
 }
 
 /**
@@ -399,6 +422,8 @@ extern struct spi_device *spi_new_ancillary_device(struct spi_device *spi, u8 ch
  * @bus_lock_spinlock: spinlock for SPI bus locking
  * @bus_lock_mutex: mutex for exclusion of multiple callers
  * @bus_lock_flag: indicates that the SPI bus is locked for exclusive use
+ * @multi_cs_cap: indicates that the SPI Controller can assert/de-assert
+ *	more than one chip select at once.
  * @setup: updates the device mode and clocking records used by a
  *	device's SPI controller; protocol code may call this.  This
  *	must fail if an unrecognized or unsupported mode is requested.
@@ -567,6 +592,11 @@ struct spi_controller {
 #define SPI_CONTROLLER_MUST_TX		BIT(4)	/* Requires tx */
 #define SPI_CONTROLLER_GPIO_SS		BIT(5)	/* GPIO CS must select slave */
 #define SPI_CONTROLLER_SUSPENDED	BIT(6)	/* Currently suspended */
+	/*
+	 * The spi-controller has multi chip select capability and can
+	 * assert/de-assert more than one chip select at once.
+	 */
+#define SPI_CONTROLLER_MULTI_CS		BIT(7)
 
 	/* Flag indicating if the allocation of this struct is devres-managed */
 	bool			devm_allocated;
@@ -677,7 +707,8 @@ struct spi_controller {
 	bool				rt;
 	bool				auto_runtime_pm;
 	bool				cur_msg_mapped;
-	char				last_cs;
+	char				last_cs[SPI_CS_CNT_MAX];
+	char				last_cs_index_mask;
 	bool				last_cs_mode_high;
 	bool                            fallback;
 	struct completion               xfer_completion;
-- 
cgit v1.2.3


From a7fb0423c201ba12815877a0b5a68a6a1710b23a Mon Sep 17 00:00:00 2001
From: Waiman Long <longman@redhat.com>
Date: Thu, 7 Dec 2023 08:46:14 -0500
Subject: cgroup: Move rcu_head up near the top of cgroup_root
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Commit d23b5c577715 ("cgroup: Make operations on the cgroup root_list RCU
safe") adds a new rcu_head to the cgroup_root structure and kvfree_rcu()
for freeing the cgroup_root.

The current implementation of kvfree_rcu(), however, has the limitation
that the offset of the rcu_head structure within the larger data
structure must be less than 4096 or the compilation will fail. See the
macro definition of __is_kvfree_rcu_offset() in include/linux/rcupdate.h
for more information.

By putting rcu_head below the large cgroup structure, any change to the
cgroup structure that makes it larger run the risk of causing build
failure under certain configurations. Commit 77070eeb8821 ("cgroup:
Avoid false cacheline sharing of read mostly rstat_cpu") happens to be
the last straw that breaks it. Fix this problem by moving the rcu_head
structure up before the cgroup structure.

Fixes: d23b5c577715 ("cgroup: Make operations on the cgroup root_list RCU safe")
Reported-by: Stephen Rothwell <sfr@canb.auug.org.au>
Closes: https://lore.kernel.org/lkml/20231207143806.114e0a74@canb.auug.org.au/
Signed-off-by: Waiman Long <longman@redhat.com>
Acked-by: Yafang Shao <laoar.shao@gmail.com>
Reviewed-by: Yosry Ahmed <yosryahmed@google.com>
Reviewed-by: Michal Koutný <mkoutny@suse.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
---
 include/linux/cgroup-defs.h | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h
index 5a97ea95b564..ea48c861cd36 100644
--- a/include/linux/cgroup-defs.h
+++ b/include/linux/cgroup-defs.h
@@ -562,6 +562,10 @@ struct cgroup_root {
 	/* Unique id for this hierarchy. */
 	int hierarchy_id;
 
+	/* A list running through the active hierarchies */
+	struct list_head root_list;
+	struct rcu_head rcu;	/* Must be near the top */
+
 	/*
 	 * The root cgroup. The containing cgroup_root will be destroyed on its
 	 * release. cgrp->ancestors[0] will be used overflowing into the
@@ -575,10 +579,6 @@ struct cgroup_root {
 	/* Number of cgroups in the hierarchy, used only for /proc/cgroups */
 	atomic_t nr_cgrps;
 
-	/* A list running through the active hierarchies */
-	struct list_head root_list;
-	struct rcu_head rcu;
-
 	/* Hierarchy-specific flags */
 	unsigned int flags;
 
-- 
cgit v1.2.3


From f76f0d7f20672611974d3cc705996751fc403734 Mon Sep 17 00:00:00 2001
From: Wenkai Lin <linwenkai6@hisilicon.com>
Date: Sat, 2 Dec 2023 17:17:18 +0800
Subject: crypto: hisilicon/qm - add a function to set qm algs

Extract a public function to set qm algs and remove
the similar code for setting qm algs in each module.

Signed-off-by: Wenkai Lin <linwenkai6@hisilicon.com>
Signed-off-by: Hao Fang <fanghao11@huawei.com>
Signed-off-by: Zhiqi Song <songzhiqi1@huawei.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 drivers/crypto/hisilicon/hpre/hpre_main.c | 42 +++-----------------------
 drivers/crypto/hisilicon/qm.c             | 36 +++++++++++++++++++++++
 drivers/crypto/hisilicon/sec2/sec_main.c  | 47 +++++------------------------
 drivers/crypto/hisilicon/zip/zip_main.c   | 49 +++++--------------------------
 include/linux/hisi_acc_qm.h               |  8 ++++-
 5 files changed, 62 insertions(+), 120 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/crypto/hisilicon/hpre/hpre_main.c b/drivers/crypto/hisilicon/hpre/hpre_main.c
index 56777099ef69..84c92d85d23d 100644
--- a/drivers/crypto/hisilicon/hpre/hpre_main.c
+++ b/drivers/crypto/hisilicon/hpre/hpre_main.c
@@ -118,8 +118,6 @@
 #define HPRE_DFX_COMMON2_LEN		0xE
 #define HPRE_DFX_CORE_LEN		0x43
 
-#define HPRE_DEV_ALG_MAX_LEN	256
-
 static const char hpre_name[] = "hisi_hpre";
 static struct dentry *hpre_debugfs_root;
 static const struct pci_device_id hpre_dev_ids[] = {
@@ -135,12 +133,7 @@ struct hpre_hw_error {
 	const char *msg;
 };
 
-struct hpre_dev_alg {
-	u32 alg_msk;
-	const char *alg;
-};
-
-static const struct hpre_dev_alg hpre_dev_algs[] = {
+static const struct qm_dev_alg hpre_dev_algs[] = {
 	{
 		.alg_msk = BIT(0),
 		.alg = "rsa\n"
@@ -362,35 +355,6 @@ bool hpre_check_alg_support(struct hisi_qm *qm, u32 alg)
 	return false;
 }
 
-static int hpre_set_qm_algs(struct hisi_qm *qm)
-{
-	struct device *dev = &qm->pdev->dev;
-	char *algs, *ptr;
-	u32 alg_msk;
-	int i;
-
-	if (!qm->use_sva)
-		return 0;
-
-	algs = devm_kzalloc(dev, HPRE_DEV_ALG_MAX_LEN * sizeof(char), GFP_KERNEL);
-	if (!algs)
-		return -ENOMEM;
-
-	alg_msk = hisi_qm_get_hw_info(qm, hpre_basic_info, HPRE_DEV_ALG_BITMAP_CAP, qm->cap_ver);
-
-	for (i = 0; i < ARRAY_SIZE(hpre_dev_algs); i++)
-		if (alg_msk & hpre_dev_algs[i].alg_msk)
-			strcat(algs, hpre_dev_algs[i].alg);
-
-	ptr = strrchr(algs, '\n');
-	if (ptr)
-		*ptr = '\0';
-
-	qm->uacce->algs = algs;
-
-	return 0;
-}
-
 static int hpre_diff_regs_show(struct seq_file *s, void *unused)
 {
 	struct hisi_qm *qm = s->private;
@@ -1141,6 +1105,7 @@ static void hpre_debugfs_exit(struct hisi_qm *qm)
 
 static int hpre_qm_init(struct hisi_qm *qm, struct pci_dev *pdev)
 {
+	u64 alg_msk;
 	int ret;
 
 	if (pdev->revision == QM_HW_V1) {
@@ -1171,7 +1136,8 @@ static int hpre_qm_init(struct hisi_qm *qm, struct pci_dev *pdev)
 		return ret;
 	}
 
-	ret = hpre_set_qm_algs(qm);
+	alg_msk = hisi_qm_get_hw_info(qm, hpre_basic_info, HPRE_DEV_ALG_BITMAP_CAP, qm->cap_ver);
+	ret = hisi_qm_set_algs(qm, alg_msk, hpre_dev_algs, ARRAY_SIZE(hpre_dev_algs));
 	if (ret) {
 		pci_err(pdev, "Failed to set hpre algs!\n");
 		hisi_qm_uninit(qm);
diff --git a/drivers/crypto/hisilicon/qm.c b/drivers/crypto/hisilicon/qm.c
index 7ed079a9c929..4170c24c2ed1 100644
--- a/drivers/crypto/hisilicon/qm.c
+++ b/drivers/crypto/hisilicon/qm.c
@@ -234,6 +234,8 @@
 #define QM_QOS_MAX_CIR_U		6
 #define QM_AUTOSUSPEND_DELAY		3000
 
+#define QM_DEV_ALG_MAX_LEN		256
+
 #define QM_MK_CQC_DW3_V1(hop_num, pg_sz, buf_sz, cqe_sz) \
 	(((hop_num) << QM_CQ_HOP_NUM_SHIFT) | \
 	((pg_sz) << QM_CQ_PAGE_SIZE_SHIFT) | \
@@ -749,6 +751,40 @@ static void qm_get_xqc_depth(struct hisi_qm *qm, u16 *low_bits,
 	*high_bits = (depth >> QM_XQ_DEPTH_SHIFT) & QM_XQ_DEPTH_MASK;
 }
 
+int hisi_qm_set_algs(struct hisi_qm *qm, u64 alg_msk, const struct qm_dev_alg *dev_algs,
+		     u32 dev_algs_size)
+{
+	struct device *dev = &qm->pdev->dev;
+	char *algs, *ptr;
+	int i;
+
+	if (!qm->uacce)
+		return 0;
+
+	if (dev_algs_size >= QM_DEV_ALG_MAX_LEN) {
+		dev_err(dev, "algs size %u is equal or larger than %d.\n",
+			dev_algs_size, QM_DEV_ALG_MAX_LEN);
+		return -EINVAL;
+	}
+
+	algs = devm_kzalloc(dev, QM_DEV_ALG_MAX_LEN * sizeof(char), GFP_KERNEL);
+	if (!algs)
+		return -ENOMEM;
+
+	for (i = 0; i < dev_algs_size; i++)
+		if (alg_msk & dev_algs[i].alg_msk)
+			strcat(algs, dev_algs[i].alg);
+
+	ptr = strrchr(algs, '\n');
+	if (ptr) {
+		*ptr = '\0';
+		qm->uacce->algs = algs;
+	}
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(hisi_qm_set_algs);
+
 static u32 qm_get_irq_num(struct hisi_qm *qm)
 {
 	if (qm->fun_type == QM_HW_PF)
diff --git a/drivers/crypto/hisilicon/sec2/sec_main.c b/drivers/crypto/hisilicon/sec2/sec_main.c
index 0e56a47eb862..2eceab7600ca 100644
--- a/drivers/crypto/hisilicon/sec2/sec_main.c
+++ b/drivers/crypto/hisilicon/sec2/sec_main.c
@@ -120,7 +120,6 @@
 					GENMASK_ULL(42, 25))
 #define SEC_AEAD_BITMAP			(GENMASK_ULL(7, 6) | GENMASK_ULL(18, 17) | \
 					GENMASK_ULL(45, 43))
-#define SEC_DEV_ALG_MAX_LEN		256
 
 struct sec_hw_error {
 	u32 int_msk;
@@ -132,11 +131,6 @@ struct sec_dfx_item {
 	u32 offset;
 };
 
-struct sec_dev_alg {
-	u64 alg_msk;
-	const char *algs;
-};
-
 static const char sec_name[] = "hisi_sec2";
 static struct dentry *sec_debugfs_root;
 
@@ -173,15 +167,15 @@ static const struct hisi_qm_cap_info sec_basic_info[] = {
 	{SEC_CORE4_ALG_BITMAP_HIGH, 0x3170, 0, GENMASK(31, 0), 0x3FFF, 0x3FFF, 0x3FFF},
 };
 
-static const struct sec_dev_alg sec_dev_algs[] = { {
+static const struct qm_dev_alg sec_dev_algs[] = { {
 		.alg_msk = SEC_CIPHER_BITMAP,
-		.algs = "cipher\n",
+		.alg = "cipher\n",
 	}, {
 		.alg_msk = SEC_DIGEST_BITMAP,
-		.algs = "digest\n",
+		.alg = "digest\n",
 	}, {
 		.alg_msk = SEC_AEAD_BITMAP,
-		.algs = "aead\n",
+		.alg = "aead\n",
 	},
 };
 
@@ -1077,37 +1071,9 @@ static int sec_pf_probe_init(struct sec_dev *sec)
 	return ret;
 }
 
-static int sec_set_qm_algs(struct hisi_qm *qm)
-{
-	struct device *dev = &qm->pdev->dev;
-	char *algs, *ptr;
-	u64 alg_mask;
-	int i;
-
-	if (!qm->use_sva)
-		return 0;
-
-	algs = devm_kzalloc(dev, SEC_DEV_ALG_MAX_LEN * sizeof(char), GFP_KERNEL);
-	if (!algs)
-		return -ENOMEM;
-
-	alg_mask = sec_get_alg_bitmap(qm, SEC_DEV_ALG_BITMAP_HIGH, SEC_DEV_ALG_BITMAP_LOW);
-
-	for (i = 0; i < ARRAY_SIZE(sec_dev_algs); i++)
-		if (alg_mask & sec_dev_algs[i].alg_msk)
-			strcat(algs, sec_dev_algs[i].algs);
-
-	ptr = strrchr(algs, '\n');
-	if (ptr)
-		*ptr = '\0';
-
-	qm->uacce->algs = algs;
-
-	return 0;
-}
-
 static int sec_qm_init(struct hisi_qm *qm, struct pci_dev *pdev)
 {
+	u64 alg_msk;
 	int ret;
 
 	qm->pdev = pdev;
@@ -1142,7 +1108,8 @@ static int sec_qm_init(struct hisi_qm *qm, struct pci_dev *pdev)
 		return ret;
 	}
 
-	ret = sec_set_qm_algs(qm);
+	alg_msk = sec_get_alg_bitmap(qm, SEC_DEV_ALG_BITMAP_HIGH, SEC_DEV_ALG_BITMAP_LOW);
+	ret = hisi_qm_set_algs(qm, alg_msk, sec_dev_algs, ARRAY_SIZE(sec_dev_algs));
 	if (ret) {
 		pci_err(qm->pdev, "Failed to set sec algs!\n");
 		hisi_qm_uninit(qm);
diff --git a/drivers/crypto/hisilicon/zip/zip_main.c b/drivers/crypto/hisilicon/zip/zip_main.c
index 07ab61c113ab..2934de25efa4 100644
--- a/drivers/crypto/hisilicon/zip/zip_main.c
+++ b/drivers/crypto/hisilicon/zip/zip_main.c
@@ -74,7 +74,6 @@
 #define HZIP_AXI_SHUTDOWN_ENABLE	BIT(14)
 #define HZIP_WR_PORT			BIT(11)
 
-#define HZIP_DEV_ALG_MAX_LEN		256
 #define HZIP_ALG_ZLIB_BIT		GENMASK(1, 0)
 #define HZIP_ALG_GZIP_BIT		GENMASK(3, 2)
 #define HZIP_ALG_DEFLATE_BIT		GENMASK(5, 4)
@@ -128,23 +127,18 @@ struct zip_dfx_item {
 	u32 offset;
 };
 
-struct zip_dev_alg {
-	u32 alg_msk;
-	const char *algs;
-};
-
-static const struct zip_dev_alg zip_dev_algs[] = { {
+static const struct qm_dev_alg zip_dev_algs[] = { {
 		.alg_msk = HZIP_ALG_ZLIB_BIT,
-		.algs = "zlib\n",
+		.alg = "zlib\n",
 	}, {
 		.alg_msk = HZIP_ALG_GZIP_BIT,
-		.algs = "gzip\n",
+		.alg = "gzip\n",
 	}, {
 		.alg_msk = HZIP_ALG_DEFLATE_BIT,
-		.algs = "deflate\n",
+		.alg = "deflate\n",
 	}, {
 		.alg_msk = HZIP_ALG_LZ77_BIT,
-		.algs = "lz77_zstd\n",
+		.alg = "lz77_zstd\n",
 	},
 };
 
@@ -478,35 +472,6 @@ static int hisi_zip_set_high_perf(struct hisi_qm *qm)
 	return ret;
 }
 
-static int hisi_zip_set_qm_algs(struct hisi_qm *qm)
-{
-	struct device *dev = &qm->pdev->dev;
-	char *algs, *ptr;
-	u32 alg_mask;
-	int i;
-
-	if (!qm->use_sva)
-		return 0;
-
-	algs = devm_kzalloc(dev, HZIP_DEV_ALG_MAX_LEN * sizeof(char), GFP_KERNEL);
-	if (!algs)
-		return -ENOMEM;
-
-	alg_mask = hisi_qm_get_hw_info(qm, zip_basic_cap_info, ZIP_DEV_ALG_BITMAP, qm->cap_ver);
-
-	for (i = 0; i < ARRAY_SIZE(zip_dev_algs); i++)
-		if (alg_mask & zip_dev_algs[i].alg_msk)
-			strcat(algs, zip_dev_algs[i].algs);
-
-	ptr = strrchr(algs, '\n');
-	if (ptr)
-		*ptr = '\0';
-
-	qm->uacce->algs = algs;
-
-	return 0;
-}
-
 static void hisi_zip_open_sva_prefetch(struct hisi_qm *qm)
 {
 	u32 val;
@@ -1193,6 +1158,7 @@ static int hisi_zip_pf_probe_init(struct hisi_zip *hisi_zip)
 
 static int hisi_zip_qm_init(struct hisi_qm *qm, struct pci_dev *pdev)
 {
+	u64 alg_msk;
 	int ret;
 
 	qm->pdev = pdev;
@@ -1228,7 +1194,8 @@ static int hisi_zip_qm_init(struct hisi_qm *qm, struct pci_dev *pdev)
 		return ret;
 	}
 
-	ret = hisi_zip_set_qm_algs(qm);
+	alg_msk = hisi_qm_get_hw_info(qm, zip_basic_cap_info, ZIP_DEV_ALG_BITMAP, qm->cap_ver);
+	ret = hisi_qm_set_algs(qm, alg_msk, zip_dev_algs, ARRAY_SIZE(zip_dev_algs));
 	if (ret) {
 		pci_err(qm->pdev, "Failed to set zip algs!\n");
 		hisi_qm_uninit(qm);
diff --git a/include/linux/hisi_acc_qm.h b/include/linux/hisi_acc_qm.h
index e3c0a1297b2c..cdc979f66dba 100644
--- a/include/linux/hisi_acc_qm.h
+++ b/include/linux/hisi_acc_qm.h
@@ -156,6 +156,11 @@ enum qm_cap_bits {
 	QM_SUPPORT_RPM,
 };
 
+struct qm_dev_alg {
+	u64 alg_msk;
+	const char *alg;
+};
+
 struct dfx_diff_registers {
 	u32 *regs;
 	u32 reg_offset;
@@ -361,7 +366,6 @@ struct hisi_qm {
 	struct work_struct rst_work;
 	struct work_struct cmd_process;
 
-	const char *algs;
 	bool use_sva;
 
 	resource_size_t phys_base;
@@ -559,6 +563,8 @@ void hisi_qm_regs_dump(struct seq_file *s, struct debugfs_regset32 *regset);
 u32 hisi_qm_get_hw_info(struct hisi_qm *qm,
 			const struct hisi_qm_cap_info *info_table,
 			u32 index, bool is_read);
+int hisi_qm_set_algs(struct hisi_qm *qm, u64 alg_msk, const struct qm_dev_alg *dev_algs,
+		     u32 dev_algs_size);
 
 /* Used by VFIO ACC live migration driver */
 struct pci_driver *hisi_sec_get_pf_driver(void);
-- 
cgit v1.2.3


From cabe13d0bd2efb8dd50ed2310f57b33e1a69a0d4 Mon Sep 17 00:00:00 2001
From: Zhiqi Song <songzhiqi1@huawei.com>
Date: Sat, 2 Dec 2023 17:17:19 +0800
Subject: crypto: hisilicon/qm - save capability registers in qm init process

In previous capability register implementation, qm irq related values
were read from capability registers dynamically when needed. But in
abnormal scenario, e.g. the core is timeout and the device needs to
soft reset and reset failed after disabling the MSE, the device can
not be removed normally, causing the following call trace:

	| Call trace:
        |  pci_irq_vector+0xfc/0x140
        |  hisi_qm_uninit+0x278/0x3b0 [hisi_qm]
        |  hpre_remove+0x16c/0x1c0 [hisi_hpre]
        |  pci_device_remove+0x6c/0x264
        |  device_release_driver_internal+0x1ec/0x3e0
        |  device_release_driver+0x3c/0x60
        |  pci_stop_bus_device+0xfc/0x22c
        |  pci_stop_and_remove_bus_device+0x38/0x70
        |  pci_iov_remove_virtfn+0x108/0x1c0
        |  sriov_disable+0x7c/0x1e4
        |  pci_disable_sriov+0x4c/0x6c
        |  hisi_qm_sriov_disable+0x90/0x160 [hisi_qm]
        |  hpre_remove+0x1a8/0x1c0 [hisi_hpre]
        |  pci_device_remove+0x6c/0x264
        |  device_release_driver_internal+0x1ec/0x3e0
        |  driver_detach+0x168/0x2d0
        |  bus_remove_driver+0xc0/0x230
        |  driver_unregister+0x58/0xdc
        |  pci_unregister_driver+0x40/0x220
        |  hpre_exit+0x34/0x64 [hisi_hpre]
        |  __arm64_sys_delete_module+0x374/0x620
        [...]

        | Call trace:
        |  free_msi_irqs+0x25c/0x300
        |  pci_disable_msi+0x19c/0x264
        |  pci_free_irq_vectors+0x4c/0x70
        |  hisi_qm_pci_uninit+0x44/0x90 [hisi_qm]
        |  hisi_qm_uninit+0x28c/0x3b0 [hisi_qm]
        |  hpre_remove+0x16c/0x1c0 [hisi_hpre]
        |  pci_device_remove+0x6c/0x264
        [...]

The reason for this call trace is that when the MSE is disabled, the value
of capability registers in the BAR space become invalid. This will make the
subsequent unregister process get the wrong irq vector through capability
registers and get the wrong irq number by pci_irq_vector().

So add a capability table structure to pre-store the valid value of the irq
information capability register in qm init process, avoid obtaining invalid
capability register value after the MSE is disabled.

Fixes: 3536cc55cada ("crypto: hisilicon/qm - support get device irq information from hardware registers")
Signed-off-by: Zhiqi Song <songzhiqi1@huawei.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 drivers/crypto/hisilicon/qm.c | 62 ++++++++++++++++++++++++++++++++++++-------
 include/linux/hisi_acc_qm.h   | 12 +++++++++
 2 files changed, 64 insertions(+), 10 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/crypto/hisilicon/qm.c b/drivers/crypto/hisilicon/qm.c
index 4170c24c2ed1..d05ad2b16d0a 100644
--- a/drivers/crypto/hisilicon/qm.c
+++ b/drivers/crypto/hisilicon/qm.c
@@ -301,6 +301,13 @@ enum qm_basic_type {
 	QM_VF_IRQ_NUM_CAP,
 };
 
+enum qm_pre_store_cap_idx {
+	QM_EQ_IRQ_TYPE_CAP_IDX = 0x0,
+	QM_AEQ_IRQ_TYPE_CAP_IDX,
+	QM_ABN_IRQ_TYPE_CAP_IDX,
+	QM_PF2VF_IRQ_TYPE_CAP_IDX,
+};
+
 static const struct hisi_qm_cap_info qm_cap_info_comm[] = {
 	{QM_SUPPORT_DB_ISOLATION, 0x30,   0, BIT(0),  0x0, 0x0, 0x0},
 	{QM_SUPPORT_FUNC_QOS,     0x3100, 0, BIT(8),  0x0, 0x0, 0x1},
@@ -330,6 +337,13 @@ static const struct hisi_qm_cap_info qm_basic_info[] = {
 	{QM_VF_IRQ_NUM_CAP,     0x311c,   0,  GENMASK(15, 0), 0x1,       0x2,       0x3},
 };
 
+static const u32 qm_pre_store_caps[] = {
+	QM_EQ_IRQ_TYPE_CAP,
+	QM_AEQ_IRQ_TYPE_CAP,
+	QM_ABN_IRQ_TYPE_CAP,
+	QM_PF2VF_IRQ_TYPE_CAP,
+};
+
 struct qm_mailbox {
 	__le16 w0;
 	__le16 queue_num;
@@ -4774,7 +4788,7 @@ static void qm_unregister_abnormal_irq(struct hisi_qm *qm)
 	if (qm->fun_type == QM_HW_VF)
 		return;
 
-	val = hisi_qm_get_hw_info(qm, qm_basic_info, QM_ABN_IRQ_TYPE_CAP, qm->cap_ver);
+	val = qm->cap_tables.qm_cap_table[QM_ABN_IRQ_TYPE_CAP_IDX].cap_val;
 	if (!((val >> QM_IRQ_TYPE_SHIFT) & QM_ABN_IRQ_TYPE_MASK))
 		return;
 
@@ -4791,7 +4805,7 @@ static int qm_register_abnormal_irq(struct hisi_qm *qm)
 	if (qm->fun_type == QM_HW_VF)
 		return 0;
 
-	val = hisi_qm_get_hw_info(qm, qm_basic_info, QM_ABN_IRQ_TYPE_CAP, qm->cap_ver);
+	val = qm->cap_tables.qm_cap_table[QM_ABN_IRQ_TYPE_CAP_IDX].cap_val;
 	if (!((val >> QM_IRQ_TYPE_SHIFT) & QM_ABN_IRQ_TYPE_MASK))
 		return 0;
 
@@ -4808,7 +4822,7 @@ static void qm_unregister_mb_cmd_irq(struct hisi_qm *qm)
 	struct pci_dev *pdev = qm->pdev;
 	u32 irq_vector, val;
 
-	val = hisi_qm_get_hw_info(qm, qm_basic_info, QM_PF2VF_IRQ_TYPE_CAP, qm->cap_ver);
+	val = qm->cap_tables.qm_cap_table[QM_PF2VF_IRQ_TYPE_CAP_IDX].cap_val;
 	if (!((val >> QM_IRQ_TYPE_SHIFT) & QM_IRQ_TYPE_MASK))
 		return;
 
@@ -4822,7 +4836,7 @@ static int qm_register_mb_cmd_irq(struct hisi_qm *qm)
 	u32 irq_vector, val;
 	int ret;
 
-	val = hisi_qm_get_hw_info(qm, qm_basic_info, QM_PF2VF_IRQ_TYPE_CAP, qm->cap_ver);
+	val = qm->cap_tables.qm_cap_table[QM_PF2VF_IRQ_TYPE_CAP_IDX].cap_val;
 	if (!((val >> QM_IRQ_TYPE_SHIFT) & QM_IRQ_TYPE_MASK))
 		return 0;
 
@@ -4839,7 +4853,7 @@ static void qm_unregister_aeq_irq(struct hisi_qm *qm)
 	struct pci_dev *pdev = qm->pdev;
 	u32 irq_vector, val;
 
-	val = hisi_qm_get_hw_info(qm, qm_basic_info, QM_AEQ_IRQ_TYPE_CAP, qm->cap_ver);
+	val = qm->cap_tables.qm_cap_table[QM_AEQ_IRQ_TYPE_CAP_IDX].cap_val;
 	if (!((val >> QM_IRQ_TYPE_SHIFT) & QM_IRQ_TYPE_MASK))
 		return;
 
@@ -4853,7 +4867,7 @@ static int qm_register_aeq_irq(struct hisi_qm *qm)
 	u32 irq_vector, val;
 	int ret;
 
-	val = hisi_qm_get_hw_info(qm, qm_basic_info, QM_AEQ_IRQ_TYPE_CAP, qm->cap_ver);
+	val = qm->cap_tables.qm_cap_table[QM_AEQ_IRQ_TYPE_CAP_IDX].cap_val;
 	if (!((val >> QM_IRQ_TYPE_SHIFT) & QM_IRQ_TYPE_MASK))
 		return 0;
 
@@ -4871,7 +4885,7 @@ static void qm_unregister_eq_irq(struct hisi_qm *qm)
 	struct pci_dev *pdev = qm->pdev;
 	u32 irq_vector, val;
 
-	val = hisi_qm_get_hw_info(qm, qm_basic_info, QM_EQ_IRQ_TYPE_CAP, qm->cap_ver);
+	val = qm->cap_tables.qm_cap_table[QM_EQ_IRQ_TYPE_CAP_IDX].cap_val;
 	if (!((val >> QM_IRQ_TYPE_SHIFT) & QM_IRQ_TYPE_MASK))
 		return;
 
@@ -4885,7 +4899,7 @@ static int qm_register_eq_irq(struct hisi_qm *qm)
 	u32 irq_vector, val;
 	int ret;
 
-	val = hisi_qm_get_hw_info(qm, qm_basic_info, QM_EQ_IRQ_TYPE_CAP, qm->cap_ver);
+	val = qm->cap_tables.qm_cap_table[QM_EQ_IRQ_TYPE_CAP_IDX].cap_val;
 	if (!((val >> QM_IRQ_TYPE_SHIFT) & QM_IRQ_TYPE_MASK))
 		return 0;
 
@@ -4973,7 +4987,29 @@ static int qm_get_qp_num(struct hisi_qm *qm)
 	return 0;
 }
 
-static void qm_get_hw_caps(struct hisi_qm *qm)
+static int qm_pre_store_irq_type_caps(struct hisi_qm *qm)
+{
+	struct hisi_qm_cap_record *qm_cap;
+	struct pci_dev *pdev = qm->pdev;
+	size_t i, size;
+
+	size = ARRAY_SIZE(qm_pre_store_caps);
+	qm_cap = devm_kzalloc(&pdev->dev, sizeof(*qm_cap) * size, GFP_KERNEL);
+	if (!qm_cap)
+		return -ENOMEM;
+
+	for (i = 0; i < size; i++) {
+		qm_cap[i].type = qm_pre_store_caps[i];
+		qm_cap[i].cap_val = hisi_qm_get_hw_info(qm, qm_basic_info,
+							qm_pre_store_caps[i], qm->cap_ver);
+	}
+
+	qm->cap_tables.qm_cap_table = qm_cap;
+
+	return 0;
+}
+
+static int qm_get_hw_caps(struct hisi_qm *qm)
 {
 	const struct hisi_qm_cap_info *cap_info = qm->fun_type == QM_HW_PF ?
 						  qm_cap_info_pf : qm_cap_info_vf;
@@ -5004,6 +5040,9 @@ static void qm_get_hw_caps(struct hisi_qm *qm)
 		if (val)
 			set_bit(cap_info[i].type, &qm->caps);
 	}
+
+	/* Fetch and save the value of irq type related capability registers */
+	return qm_pre_store_irq_type_caps(qm);
 }
 
 static int qm_get_pci_res(struct hisi_qm *qm)
@@ -5025,7 +5064,10 @@ static int qm_get_pci_res(struct hisi_qm *qm)
 		goto err_request_mem_regions;
 	}
 
-	qm_get_hw_caps(qm);
+	ret = qm_get_hw_caps(qm);
+	if (ret)
+		goto err_ioremap;
+
 	if (test_bit(QM_SUPPORT_DB_ISOLATION, &qm->caps)) {
 		qm->db_interval = QM_QP_DB_INTERVAL;
 		qm->db_phys_base = pci_resource_start(pdev, PCI_BAR_4);
diff --git a/include/linux/hisi_acc_qm.h b/include/linux/hisi_acc_qm.h
index cdc979f66dba..5f4c74facf6a 100644
--- a/include/linux/hisi_acc_qm.h
+++ b/include/linux/hisi_acc_qm.h
@@ -266,6 +266,16 @@ struct hisi_qm_cap_info {
 	u32 v3_val;
 };
 
+struct hisi_qm_cap_record {
+	u32 type;
+	u32 cap_val;
+};
+
+struct hisi_qm_cap_tables {
+	struct hisi_qm_cap_record *qm_cap_table;
+	struct hisi_qm_cap_record *dev_cap_table;
+};
+
 struct hisi_qm_list {
 	struct mutex lock;
 	struct list_head list;
@@ -376,6 +386,8 @@ struct hisi_qm {
 	u32 mb_qos;
 	u32 type_rate;
 	struct qm_err_isolate isolate_data;
+
+	struct hisi_qm_cap_tables cap_tables;
 };
 
 struct hisi_qp_status {
-- 
cgit v1.2.3


From ee25fba76acd8324f9de6628872c8c612a684209 Mon Sep 17 00:00:00 2001
From: Bartosz Golaszewski <bartosz.golaszewski@linaro.org>
Date: Mon, 4 Dec 2023 10:35:00 +0100
Subject: gpiolib: provide gpiochip_dup_line_label()

gpiochip_is_requested() not only has a misleading name but it returns
a pointer to a string that is freed when the descriptor is released.

Provide a new helper meant to replace it, which returns a copy of the
label string instead.

Signed-off-by: Bartosz Golaszewski <bartosz.golaszewski@linaro.org>
Acked-by: Linus Walleij <linus.walleij@linaro.org>
---
 drivers/gpio/gpiolib.c      | 29 +++++++++++++++++++++++++++++
 include/linux/gpio/driver.h |  1 +
 2 files changed, 30 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/gpio/gpiolib.c b/drivers/gpio/gpiolib.c
index 95d2a7b2ea3e..0147c900afea 100644
--- a/drivers/gpio/gpiolib.c
+++ b/drivers/gpio/gpiolib.c
@@ -2386,6 +2386,35 @@ const char *gpiochip_is_requested(struct gpio_chip *gc, unsigned int offset)
 }
 EXPORT_SYMBOL_GPL(gpiochip_is_requested);
 
+/**
+ * gpiochip_dup_line_label - Get a copy of the consumer label.
+ * @gc: GPIO chip controlling this line.
+ * @offset: Hardware offset of the line.
+ *
+ * Returns:
+ * Pointer to a copy of the consumer label if the line is requested or NULL
+ * if it's not. If a valid pointer was returned, it must be freed using
+ * kfree(). In case of a memory allocation error, the function returns %ENOMEM.
+ *
+ * Must not be called from atomic context.
+ */
+char *gpiochip_dup_line_label(struct gpio_chip *gc, unsigned int offset)
+{
+	const char *label;
+	char *copy;
+
+	label = gpiochip_is_requested(gc, offset);
+	if (!label)
+		return NULL;
+
+	copy = kstrdup(label, GFP_KERNEL);
+	if (!copy)
+		return ERR_PTR(-ENOMEM);
+
+	return copy;
+}
+EXPORT_SYMBOL_GPL(gpiochip_dup_line_label);
+
 /**
  * gpiochip_request_own_desc - Allow GPIO chip to request its own descriptor
  * @gc: GPIO chip
diff --git a/include/linux/gpio/driver.h b/include/linux/gpio/driver.h
index 0aed62f0c633..5ac6dc30c547 100644
--- a/include/linux/gpio/driver.h
+++ b/include/linux/gpio/driver.h
@@ -532,6 +532,7 @@ struct gpio_chip {
 };
 
 const char *gpiochip_is_requested(struct gpio_chip *gc, unsigned int offset);
+char *gpiochip_dup_line_label(struct gpio_chip *gc, unsigned int offset);
 
 /**
  * for_each_requested_gpio_in_range - iterates over requested GPIOs in a given range
-- 
cgit v1.2.3


From 6fd9c9933475a3efd7eed2f80c7778908a560a1f Mon Sep 17 00:00:00 2001
From: Bartosz Golaszewski <bartosz.golaszewski@linaro.org>
Date: Mon, 4 Dec 2023 10:35:08 +0100
Subject: gpiolib: use gpiochip_dup_line_label() in for_each helpers

Rework for_each_requested_gpio_in_range() to use the new helper to
retrieve a dynamically allocated copy of the descriptor label and free
it at the end of each iteration. We need to leverage the CLASS()'
destructor to make sure that the label is freed even when breaking out
of the loop.

Signed-off-by: Bartosz Golaszewski <bartosz.golaszewski@linaro.org>
Acked-by: Linus Walleij <linus.walleij@linaro.org>
---
 include/linux/gpio/driver.h | 37 +++++++++++++++++++++++++++++--------
 1 file changed, 29 insertions(+), 8 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/gpio/driver.h b/include/linux/gpio/driver.h
index 5ac6dc30c547..cae4cdaa87db 100644
--- a/include/linux/gpio/driver.h
+++ b/include/linux/gpio/driver.h
@@ -534,17 +534,38 @@ struct gpio_chip {
 const char *gpiochip_is_requested(struct gpio_chip *gc, unsigned int offset);
 char *gpiochip_dup_line_label(struct gpio_chip *gc, unsigned int offset);
 
+
+struct _gpiochip_for_each_data {
+	const char **label;
+	unsigned int *i;
+};
+
+DEFINE_CLASS(_gpiochip_for_each_data,
+	     struct _gpiochip_for_each_data,
+	     if (*_T.label) kfree(*_T.label),
+	     ({
+		struct _gpiochip_for_each_data _data = { label, i };
+		*_data.i = 0;
+		_data;
+	     }),
+	     const char **label, int *i)
+
 /**
  * for_each_requested_gpio_in_range - iterates over requested GPIOs in a given range
- * @chip:	the chip to query
- * @i:		loop variable
- * @base:	first GPIO in the range
- * @size:	amount of GPIOs to check starting from @base
- * @label:	label of current GPIO
+ * @_chip:	the chip to query
+ * @_i:		loop variable
+ * @_base:	first GPIO in the range
+ * @_size:	amount of GPIOs to check starting from @base
+ * @_label:	label of current GPIO
  */
-#define for_each_requested_gpio_in_range(chip, i, base, size, label)			\
-	for (i = 0; i < size; i++)							\
-		if ((label = gpiochip_is_requested(chip, base + i)) == NULL) {} else
+#define for_each_requested_gpio_in_range(_chip, _i, _base, _size, _label)		\
+	for (CLASS(_gpiochip_for_each_data, _data)(&_label, &_i);			\
+	     *_data.i < _size;								\
+	     (*_data.i)++, kfree(*(_data.label)), *_data.label = NULL)			\
+		if ((*_data.label =							\
+			gpiochip_dup_line_label(_chip, _base + *_data.i)) == NULL) {}	\
+		else if (IS_ERR(*_data.label)) {}					\
+		else
 
 /* Iterates over all requested GPIO of the given @chip */
 #define for_each_requested_gpio(chip, i, label)						\
-- 
cgit v1.2.3


From f8d05e276b45e3097dfddd628fa991ce69c05c99 Mon Sep 17 00:00:00 2001
From: Bartosz Golaszewski <bartosz.golaszewski@linaro.org>
Date: Mon, 4 Dec 2023 10:35:09 +0100
Subject: gpiolib: remove gpiochip_is_requested()

We have no external users of gpiochip_is_requested(). Let's remove it
and replace its internal calls with direct testing of the REQUESTED flag.

Signed-off-by: Bartosz Golaszewski <bartosz.golaszewski@linaro.org>
Acked-by: Linus Walleij <linus.walleij@linaro.org>
---
 drivers/gpio/gpiolib.c      | 48 ++++++++++++---------------------------------
 include/linux/gpio/driver.h |  1 -
 2 files changed, 13 insertions(+), 36 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/gpio/gpiolib.c b/drivers/gpio/gpiolib.c
index 0147c900afea..0ca33397812c 100644
--- a/drivers/gpio/gpiolib.c
+++ b/drivers/gpio/gpiolib.c
@@ -1071,7 +1071,7 @@ void gpiochip_remove(struct gpio_chip *gc)
 
 	spin_lock_irqsave(&gpio_lock, flags);
 	for (i = 0; i < gdev->ngpio; i++) {
-		if (gpiochip_is_requested(gc, i))
+		if (test_bit(FLAG_REQUESTED, &gdev->descs[i].flags))
 			break;
 	}
 	spin_unlock_irqrestore(&gpio_lock, flags);
@@ -2359,33 +2359,6 @@ void gpiod_free(struct gpio_desc *desc)
 	gpio_device_put(desc->gdev);
 }
 
-/**
- * gpiochip_is_requested - return string iff signal was requested
- * @gc: controller managing the signal
- * @offset: of signal within controller's 0..(ngpio - 1) range
- *
- * Returns NULL if the GPIO is not currently requested, else a string.
- * The string returned is the label passed to gpio_request(); if none has been
- * passed it is a meaningless, non-NULL constant.
- *
- * This function is for use by GPIO controller drivers.  The label can
- * help with diagnostics, and knowing that the signal is used as a GPIO
- * can help avoid accidentally multiplexing it to another controller.
- */
-const char *gpiochip_is_requested(struct gpio_chip *gc, unsigned int offset)
-{
-	struct gpio_desc *desc;
-
-	desc = gpiochip_get_desc(gc, offset);
-	if (IS_ERR(desc))
-		return NULL;
-
-	if (test_bit(FLAG_REQUESTED, &desc->flags) == 0)
-		return NULL;
-	return desc->label;
-}
-EXPORT_SYMBOL_GPL(gpiochip_is_requested);
-
 /**
  * gpiochip_dup_line_label - Get a copy of the consumer label.
  * @gc: GPIO chip controlling this line.
@@ -2400,18 +2373,23 @@ EXPORT_SYMBOL_GPL(gpiochip_is_requested);
  */
 char *gpiochip_dup_line_label(struct gpio_chip *gc, unsigned int offset)
 {
-	const char *label;
-	char *copy;
+	struct gpio_desc *desc;
+	char *label;
 
-	label = gpiochip_is_requested(gc, offset);
-	if (!label)
+	desc = gpiochip_get_desc(gc, offset);
+	if (IS_ERR(desc))
 		return NULL;
 
-	copy = kstrdup(label, GFP_KERNEL);
-	if (!copy)
+	guard(spinlock_irqsave)(&gpio_lock);
+
+	if (!test_bit(FLAG_REQUESTED, &desc->flags))
+		return NULL;
+
+	label = kstrdup(desc->label, GFP_KERNEL);
+	if (!label)
 		return ERR_PTR(-ENOMEM);
 
-	return copy;
+	return label;
 }
 EXPORT_SYMBOL_GPL(gpiochip_dup_line_label);
 
diff --git a/include/linux/gpio/driver.h b/include/linux/gpio/driver.h
index cae4cdaa87db..d1a3cb061927 100644
--- a/include/linux/gpio/driver.h
+++ b/include/linux/gpio/driver.h
@@ -531,7 +531,6 @@ struct gpio_chip {
 #endif /* CONFIG_OF_GPIO */
 };
 
-const char *gpiochip_is_requested(struct gpio_chip *gc, unsigned int offset);
 char *gpiochip_dup_line_label(struct gpio_chip *gc, unsigned int offset);
 
 
-- 
cgit v1.2.3


From 2a48c635fd9a48699805bbfeee1e4b94b8fe819d Mon Sep 17 00:00:00 2001
From: "justinstitt@google.com" <justinstitt@google.com>
Date: Wed, 6 Dec 2023 23:16:10 +0000
Subject: ethtool: Implement ethtool_puts()

Use strscpy() to implement ethtool_puts().

Functionally the same as ethtool_sprintf() when it's used with two
arguments or with just "%s" format specifier.

Signed-off-by: Justin Stitt <justinstitt@google.com>
Reviewed-by: Przemek Kitszel <przemyslaw.kitszel@intel.com>
Reviewed-by: Andrew Lunn <andrew@lunn.ch>
Reviewed-by: Madhuri Sripada <madhuri.sripada@microchip.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/ethtool.h | 13 +++++++++++++
 net/ethtool/ioctl.c     |  7 +++++++
 2 files changed, 20 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/ethtool.h b/include/linux/ethtool.h
index c2bb74143eda..deb683d3360f 100644
--- a/include/linux/ethtool.h
+++ b/include/linux/ethtool.h
@@ -1061,6 +1061,19 @@ int ethtool_get_ts_info_by_layer(struct net_device *dev, struct ethtool_ts_info
  */
 extern __printf(2, 3) void ethtool_sprintf(u8 **data, const char *fmt, ...);
 
+/**
+ * ethtool_puts - Write string to ethtool string data
+ * @data: Pointer to a pointer to the start of string to update
+ * @str: String to write
+ *
+ * Write string to *data without a trailing newline. Update *data
+ * to point at start of next string.
+ *
+ * Prefer this function to ethtool_sprintf() when given only
+ * two arguments or if @fmt is just "%s".
+ */
+extern void ethtool_puts(u8 **data, const char *str);
+
 /* Link mode to forced speed capabilities maps */
 struct ethtool_forced_speed_map {
 	u32		speed;
diff --git a/net/ethtool/ioctl.c b/net/ethtool/ioctl.c
index a977f8903467..755a4be01bef 100644
--- a/net/ethtool/ioctl.c
+++ b/net/ethtool/ioctl.c
@@ -1994,6 +1994,13 @@ __printf(2, 3) void ethtool_sprintf(u8 **data, const char *fmt, ...)
 }
 EXPORT_SYMBOL(ethtool_sprintf);
 
+void ethtool_puts(u8 **data, const char *str)
+{
+	strscpy(*data, str, ETH_GSTRING_LEN);
+	*data += ETH_GSTRING_LEN;
+}
+EXPORT_SYMBOL(ethtool_puts);
+
 static int ethtool_phys_id(struct net_device *dev, void __user *useraddr)
 {
 	struct ethtool_value id;
-- 
cgit v1.2.3


From 675bf8ef209cc8da28ffefd7d8a93c53735cc84a Mon Sep 17 00:00:00 2001
From: "Jiri Slaby (SUSE)" <jirislaby@kernel.org>
Date: Thu, 30 Nov 2023 12:30:01 +0100
Subject: tty: virtio: drop virtio_cons_early_init()

The last user of virtio_cons_early_init() was dropped in commit
7fb2b2d51244 ("s390/virtio: remove the old KVM virtio transport").

So now, drop virtio_cons_early_init() and the logic and headers behind
too.

Signed-off-by: Jiri Slaby (SUSE) <jirislaby@kernel.org>
Acked-by: Jason Wang <jasowang@redhat.com>
Cc: Richard Henderson <richard.henderson@linaro.org>
Cc: Ivan Kokshaysky <ink@jurassic.park.msu.ru>
Cc: Matt Turner <mattst88@gmail.com>
Cc: Amit Shah <amit@kernel.org>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: "Michael S. Tsirkin" <mst@redhat.com>
Cc: Xuan Zhuo <xuanzhuo@linux.alibaba.com>
Cc: linux-alpha@vger.kernel.org
Cc: virtualization@lists.linux.dev
Link: https://lore.kernel.org/r/20231130113001.29154-1-jirislaby@kernel.org
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 MAINTAINERS                    |  1 -
 drivers/char/virtio_console.c  | 48 ------------------------------------------
 include/linux/virtio_console.h | 38 ---------------------------------
 3 files changed, 87 deletions(-)
 delete mode 100644 include/linux/virtio_console.h

(limited to 'include/linux')

diff --git a/MAINTAINERS b/MAINTAINERS
index ea790149af79..d60009f3ef9c 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -23018,7 +23018,6 @@ M:	Amit Shah <amit@kernel.org>
 L:	virtualization@lists.linux.dev
 S:	Maintained
 F:	drivers/char/virtio_console.c
-F:	include/linux/virtio_console.h
 F:	include/uapi/linux/virtio_console.h
 
 VIRTIO CORE AND NET DRIVERS
diff --git a/drivers/char/virtio_console.c b/drivers/char/virtio_console.c
index 431e9e5bf9c1..8abe599c1c07 100644
--- a/drivers/char/virtio_console.c
+++ b/drivers/char/virtio_console.c
@@ -230,9 +230,6 @@ struct port {
 	bool guest_connected;
 };
 
-/* This is the very early arch-specified put chars function. */
-static int (*early_put_chars)(u32, const char *, int);
-
 static struct port *find_port_by_vtermno(u32 vtermno)
 {
 	struct port *port;
@@ -1114,9 +1111,6 @@ static int put_chars(u32 vtermno, const char *buf, int count)
 	void *data;
 	int ret;
 
-	if (unlikely(early_put_chars))
-		return early_put_chars(vtermno, buf, count);
-
 	port = find_port_by_vtermno(vtermno);
 	if (!port)
 		return -EPIPE;
@@ -1142,10 +1136,6 @@ static int get_chars(u32 vtermno, char *buf, int count)
 {
 	struct port *port;
 
-	/* If we've not set up the port yet, we have no input to give. */
-	if (unlikely(early_put_chars))
-		return 0;
-
 	port = find_port_by_vtermno(vtermno);
 	if (!port)
 		return -EPIPE;
@@ -1201,21 +1191,6 @@ static const struct hv_ops hv_ops = {
 	.notifier_hangup = notifier_del_vio,
 };
 
-/*
- * Console drivers are initialized very early so boot messages can go
- * out, so we do things slightly differently from the generic virtio
- * initialization of the net and block drivers.
- *
- * At this stage, the console is output-only.  It's too early to set
- * up a virtqueue, so we let the drivers do some boutique early-output
- * thing.
- */
-int __init virtio_cons_early_init(int (*put_chars)(u32, const char *, int))
-{
-	early_put_chars = put_chars;
-	return hvc_instantiate(0, 0, &hv_ops);
-}
-
 static int init_port_console(struct port *port)
 {
 	int ret;
@@ -1256,13 +1231,6 @@ static int init_port_console(struct port *port)
 	spin_unlock_irq(&pdrvdata_lock);
 	port->guest_connected = true;
 
-	/*
-	 * Start using the new console output if this is the first
-	 * console to come up.
-	 */
-	if (early_put_chars)
-		early_put_chars = NULL;
-
 	/* Notify host of port being opened */
 	send_control_msg(port, VIRTIO_CONSOLE_PORT_OPEN, 1);
 
@@ -1999,7 +1967,6 @@ static int virtcons_probe(struct virtio_device *vdev)
 	struct ports_device *portdev;
 	int err;
 	bool multiport;
-	bool early = early_put_chars != NULL;
 
 	/* We only need a config space if features are offered */
 	if (!vdev->config->get &&
@@ -2010,9 +1977,6 @@ static int virtcons_probe(struct virtio_device *vdev)
 		return -EINVAL;
 	}
 
-	/* Ensure to read early_put_chars now */
-	barrier();
-
 	portdev = kmalloc(sizeof(*portdev), GFP_KERNEL);
 	if (!portdev) {
 		err = -ENOMEM;
@@ -2100,18 +2064,6 @@ static int virtcons_probe(struct virtio_device *vdev)
 	__send_control_msg(portdev, VIRTIO_CONSOLE_BAD_ID,
 			   VIRTIO_CONSOLE_DEVICE_READY, 1);
 
-	/*
-	 * If there was an early virtio console, assume that there are no
-	 * other consoles. We need to wait until the hvc_alloc matches the
-	 * hvc_instantiate, otherwise tty_open will complain, resulting in
-	 * a "Warning: unable to open an initial console" boot failure.
-	 * Without multiport this is done in add_port above. With multiport
-	 * this might take some host<->guest communication - thus we have to
-	 * wait.
-	 */
-	if (multiport && early)
-		wait_for_completion(&early_console_added);
-
 	return 0;
 
 free_chrdev:
diff --git a/include/linux/virtio_console.h b/include/linux/virtio_console.h
deleted file mode 100644
index d2e2785af602..000000000000
--- a/include/linux/virtio_console.h
+++ /dev/null
@@ -1,38 +0,0 @@
-/*
- * This header, excluding the #ifdef __KERNEL__ part, is BSD licensed so
- * anyone can use the definitions to implement compatible drivers/servers:
- *
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- * 1. Redistributions of source code must retain the above copyright
- *    notice, this list of conditions and the following disclaimer.
- * 2. Redistributions in binary form must reproduce the above copyright
- *    notice, this list of conditions and the following disclaimer in the
- *    documentation and/or other materials provided with the distribution.
- * 3. Neither the name of IBM nor the names of its contributors
- *    may be used to endorse or promote products derived from this software
- *    without specific prior written permission.
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS ``AS IS'' AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED.  IN NO EVENT SHALL IBM OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
- * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
- * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
- * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
- * SUCH DAMAGE.
- *
- * Copyright (C) Red Hat, Inc., 2009, 2010, 2011
- * Copyright (C) Amit Shah <amit.shah@redhat.com>, 2009, 2010, 2011
- */
-#ifndef _LINUX_VIRTIO_CONSOLE_H
-#define _LINUX_VIRTIO_CONSOLE_H
-
-#include <uapi/linux/virtio_console.h>
-
-int __init virtio_cons_early_init(int (*put_chars)(u32, const char *, int));
-#endif /* _LINUX_VIRTIO_CONSOLE_H */
-- 
cgit v1.2.3


From 093258a9963bfac043244995bff87dc2c931b9b5 Mon Sep 17 00:00:00 2001
From: Théo Lebrun <theo.lebrun@bootlin.com>
Date: Thu, 30 Nov 2023 15:07:13 +0100
Subject: tty: serial: amba: cleanup whitespace
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Fix whitespace in include/linux/amba/serial.h to match current kernel
coding standards. Fixes about:

 - CHECK: spaces preferred around that '|' (ctx:VxV)
 - ERROR: code indent should use tabs where possible
 - WARNING: Unnecessary space before function pointer arguments
 - WARNING: please, no spaces at the start of a line

Reviewed-by: Linus Walleij <linus.walleij@linaro.org>
Reviewed-by: Ilpo Järvinen <ilpo.jarvinen@linux.intel.com>
Signed-off-by: Théo Lebrun <theo.lebrun@bootlin.com>
Link: https://lore.kernel.org/r/20231130-mbly-uart-v5-1-6566703a04b5@bootlin.com
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 include/linux/amba/serial.h | 28 ++++++++++++++--------------
 1 file changed, 14 insertions(+), 14 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/amba/serial.h b/include/linux/amba/serial.h
index a1307b58cc2c..27003ec52114 100644
--- a/include/linux/amba/serial.h
+++ b/include/linux/amba/serial.h
@@ -75,10 +75,10 @@
 #define UART011_DR_PE		(1 << 9)
 #define UART011_DR_FE		(1 << 8)
 
-#define UART01x_RSR_OE 		0x08
-#define UART01x_RSR_BE 		0x04
-#define UART01x_RSR_PE 		0x02
-#define UART01x_RSR_FE 		0x01
+#define UART01x_RSR_OE		0x08
+#define UART01x_RSR_BE		0x04
+#define UART01x_RSR_PE		0x02
+#define UART01x_RSR_FE		0x01
 
 #define UART011_FR_RI		0x100
 #define UART011_FR_TXFE		0x080
@@ -86,9 +86,9 @@
 #define UART01x_FR_TXFF		0x020
 #define UART01x_FR_RXFE		0x010
 #define UART01x_FR_BUSY		0x008
-#define UART01x_FR_DCD 		0x004
-#define UART01x_FR_DSR 		0x002
-#define UART01x_FR_CTS 		0x001
+#define UART01x_FR_DCD		0x004
+#define UART01x_FR_DSR		0x002
+#define UART01x_FR_CTS		0x001
 #define UART01x_FR_TMSK		(UART01x_FR_TXFF + UART01x_FR_BUSY)
 
 /*
@@ -110,14 +110,14 @@
 #define UART011_CR_TXE		0x0100	/* transmit enable */
 #define UART011_CR_LBE		0x0080	/* loopback enable */
 #define UART010_CR_RTIE		0x0040
-#define UART010_CR_TIE 		0x0020
-#define UART010_CR_RIE 		0x0010
+#define UART010_CR_TIE		0x0020
+#define UART010_CR_RIE		0x0010
 #define UART010_CR_MSIE		0x0008
 #define ST_UART011_CR_OVSFACT	0x0008	/* Oversampling factor */
 #define UART01x_CR_IIRLP	0x0004	/* SIR low power mode */
 #define UART01x_CR_SIREN	0x0002	/* SIR enable */
 #define UART01x_CR_UARTEN	0x0001	/* UART enable */
- 
+
 #define UART011_LCRH_SPS	0x80
 #define UART01x_LCRH_WLEN_8	0x60
 #define UART01x_LCRH_WLEN_7	0x40
@@ -203,8 +203,8 @@
 #define UART011_TXDMAE		(1 << 1)	/* enable transmit dma */
 #define UART011_RXDMAE		(1 << 0)	/* enable receive dma */
 
-#define UART01x_RSR_ANY		(UART01x_RSR_OE|UART01x_RSR_BE|UART01x_RSR_PE|UART01x_RSR_FE)
-#define UART01x_FR_MODEM_ANY	(UART01x_FR_DCD|UART01x_FR_DSR|UART01x_FR_CTS)
+#define UART01x_RSR_ANY		(UART01x_RSR_OE | UART01x_RSR_BE | UART01x_RSR_PE | UART01x_RSR_FE)
+#define UART01x_FR_MODEM_ANY	(UART01x_FR_DCD | UART01x_FR_DSR | UART01x_FR_CTS)
 
 #ifndef __ASSEMBLY__
 struct amba_device; /* in uncompress this is included but amba/bus.h is not */
@@ -220,8 +220,8 @@ struct amba_pl011_data {
 	bool dma_rx_poll_enable;
 	unsigned int dma_rx_poll_rate;
 	unsigned int dma_rx_poll_timeout;
-        void (*init) (void);
-	void (*exit) (void);
+	void (*init)(void);
+	void (*exit)(void);
 };
 #endif
 
-- 
cgit v1.2.3


From 49943393c9f0be61fd494a884851aa117cd72382 Mon Sep 17 00:00:00 2001
From: "Jiri Slaby (SUSE)" <jirislaby@kernel.org>
Date: Wed, 6 Dec 2023 08:36:48 +0100
Subject: tty: switch tty_port::xmit_* to u8

Both xmit_buf and xmit_fifo of struct tty_port should be u8. To conform
to characters in the rest of the tty layer.

Signed-off-by: "Jiri Slaby (SUSE)" <jirislaby@kernel.org>
Link: https://lore.kernel.org/r/20231206073712.17776-4-jirislaby@kernel.org
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/tty/tty_port.c   | 2 +-
 include/linux/tty_port.h | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/tty/tty_port.c b/drivers/tty/tty_port.c
index ef72d2e4b928..14cca33d2269 100644
--- a/drivers/tty/tty_port.c
+++ b/drivers/tty/tty_port.c
@@ -247,7 +247,7 @@ int tty_port_alloc_xmit_buf(struct tty_port *port)
 	/* We may sleep in get_zeroed_page() */
 	mutex_lock(&port->buf_mutex);
 	if (port->xmit_buf == NULL) {
-		port->xmit_buf = (unsigned char *)get_zeroed_page(GFP_KERNEL);
+		port->xmit_buf = (u8 *)get_zeroed_page(GFP_KERNEL);
 		if (port->xmit_buf)
 			kfifo_init(&port->xmit_fifo, port->xmit_buf, PAGE_SIZE);
 	}
diff --git a/include/linux/tty_port.h b/include/linux/tty_port.h
index 3276311a7f38..1b861f2100b6 100644
--- a/include/linux/tty_port.h
+++ b/include/linux/tty_port.h
@@ -114,8 +114,8 @@ struct tty_port {
 	unsigned char		console:1;
 	struct mutex		mutex;
 	struct mutex		buf_mutex;
-	unsigned char		*xmit_buf;
-	DECLARE_KFIFO_PTR(xmit_fifo, unsigned char);
+	u8			*xmit_buf;
+	DECLARE_KFIFO_PTR(xmit_fifo, u8);
 	unsigned int		close_delay;
 	unsigned int		closing_wait;
 	int			drain_delay;
-- 
cgit v1.2.3


From 3a00da027946cd08db1c1be2de4620950bbdf074 Mon Sep 17 00:00:00 2001
From: "Jiri Slaby (SUSE)" <jirislaby@kernel.org>
Date: Wed, 6 Dec 2023 08:36:49 +0100
Subject: tty: make tty_operations::send_xchar accept u8 char

tty_operations::send_xchar is one of the last users of 'char' type for
characters in the tty layer. Convert it to u8 now.

Signed-off-by: "Jiri Slaby (SUSE)" <jirislaby@kernel.org>
Cc: Karsten Keil <isdn@linux-pingi.de>
Cc: Ulf Hansson <ulf.hansson@linaro.org>
Cc: Marcel Holtmann <marcel@holtmann.org>
Cc: Johan Hedberg <johan.hedberg@gmail.com>
Cc: Luiz Augusto von Dentz <luiz.dentz@gmail.com>
Cc: netdev@vger.kernel.org
Cc: linux-mmc@vger.kernel.org
Cc: linux-bluetooth@vger.kernel.org
Reviewed-by: Geert Uytterhoeven <geert@linux-m68k.org>
Link: https://lore.kernel.org/r/20231206073712.17776-5-jirislaby@kernel.org
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/isdn/capi/capi.c         | 4 ++--
 drivers/mmc/core/sdio_uart.c     | 2 +-
 drivers/tty/amiserial.c          | 2 +-
 drivers/tty/serial/serial_core.c | 2 +-
 drivers/tty/tty_io.c             | 2 +-
 include/linux/tty.h              | 2 +-
 include/linux/tty_driver.h       | 4 ++--
 net/bluetooth/rfcomm/tty.c       | 2 +-
 8 files changed, 10 insertions(+), 10 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/isdn/capi/capi.c b/drivers/isdn/capi/capi.c
index 2f3789515445..6e80d7bd3c4d 100644
--- a/drivers/isdn/capi/capi.c
+++ b/drivers/isdn/capi/capi.c
@@ -1231,9 +1231,9 @@ static void capinc_tty_hangup(struct tty_struct *tty)
 	tty_port_hangup(&mp->port);
 }
 
-static void capinc_tty_send_xchar(struct tty_struct *tty, char ch)
+static void capinc_tty_send_xchar(struct tty_struct *tty, u8 ch)
 {
-	pr_debug("capinc_tty_send_xchar(%d)\n", ch);
+	pr_debug("capinc_tty_send_xchar(%u)\n", ch);
 }
 
 static const struct tty_operations capinc_ops = {
diff --git a/drivers/mmc/core/sdio_uart.c b/drivers/mmc/core/sdio_uart.c
index a05322f15771..370fadf1d6d1 100644
--- a/drivers/mmc/core/sdio_uart.c
+++ b/drivers/mmc/core/sdio_uart.c
@@ -792,7 +792,7 @@ static unsigned int sdio_uart_chars_in_buffer(struct tty_struct *tty)
 	return kfifo_len(&port->xmit_fifo);
 }
 
-static void sdio_uart_send_xchar(struct tty_struct *tty, char ch)
+static void sdio_uart_send_xchar(struct tty_struct *tty, u8 ch)
 {
 	struct sdio_uart_port *port = tty->driver_data;
 
diff --git a/drivers/tty/amiserial.c b/drivers/tty/amiserial.c
index a80f059f77bf..a30dc054ffbf 100644
--- a/drivers/tty/amiserial.c
+++ b/drivers/tty/amiserial.c
@@ -811,7 +811,7 @@ static void rs_flush_buffer(struct tty_struct *tty)
  * This function is used to send a high-priority XON/XOFF character to
  * the device
  */
-static void rs_send_xchar(struct tty_struct *tty, char ch)
+static void rs_send_xchar(struct tty_struct *tty, u8 ch)
 {
 	struct serial_state *info = tty->driver_data;
         unsigned long flags;
diff --git a/drivers/tty/serial/serial_core.c b/drivers/tty/serial/serial_core.c
index 0393853b9947..80085b151b34 100644
--- a/drivers/tty/serial/serial_core.c
+++ b/drivers/tty/serial/serial_core.c
@@ -687,7 +687,7 @@ EXPORT_SYMBOL_GPL(uart_xchar_out);
  * This function is used to send a high-priority XON/XOFF character to
  * the device
  */
-static void uart_send_xchar(struct tty_struct *tty, char ch)
+static void uart_send_xchar(struct tty_struct *tty, u8 ch)
 {
 	struct uart_state *state = tty->driver_data;
 	struct uart_port *port;
diff --git a/drivers/tty/tty_io.c b/drivers/tty/tty_io.c
index 005d91c63707..6a502110da61 100644
--- a/drivers/tty/tty_io.c
+++ b/drivers/tty/tty_io.c
@@ -1149,7 +1149,7 @@ ssize_t redirected_tty_write(struct kiocb *iocb, struct iov_iter *iter)
  *
  * Locking: none for xchar method, write ordering for write method.
  */
-int tty_send_xchar(struct tty_struct *tty, char ch)
+int tty_send_xchar(struct tty_struct *tty, u8 ch)
 {
 	bool was_stopped = tty->flow.stopped;
 
diff --git a/include/linux/tty.h b/include/linux/tty.h
index e96c85f4f91e..d3bedcc08738 100644
--- a/include/linux/tty.h
+++ b/include/linux/tty.h
@@ -410,7 +410,7 @@ void tty_wait_until_sent(struct tty_struct *tty, long timeout);
 void stop_tty(struct tty_struct *tty);
 void start_tty(struct tty_struct *tty);
 void tty_write_message(struct tty_struct *tty, char *msg);
-int tty_send_xchar(struct tty_struct *tty, char ch);
+int tty_send_xchar(struct tty_struct *tty, u8 ch);
 int tty_put_char(struct tty_struct *tty, unsigned char c);
 unsigned int tty_chars_in_buffer(struct tty_struct *tty);
 unsigned int tty_write_room(struct tty_struct *tty);
diff --git a/include/linux/tty_driver.h b/include/linux/tty_driver.h
index f428c1b784a2..7372124fbf90 100644
--- a/include/linux/tty_driver.h
+++ b/include/linux/tty_driver.h
@@ -242,7 +242,7 @@ struct serial_struct;
  *	Optional: If not provided, the device is assumed to have no FIFO.
  *	Usually correct to invoke via tty_wait_until_sent(). May sleep.
  *
- * @send_xchar: ``void ()(struct tty_struct *tty, char ch)``
+ * @send_xchar: ``void ()(struct tty_struct *tty, u8 ch)``
  *
  *	This routine is used to send a high-priority XON/XOFF character (@ch)
  *	to the @tty device.
@@ -374,7 +374,7 @@ struct tty_operations {
 	void (*flush_buffer)(struct tty_struct *tty);
 	void (*set_ldisc)(struct tty_struct *tty);
 	void (*wait_until_sent)(struct tty_struct *tty, int timeout);
-	void (*send_xchar)(struct tty_struct *tty, char ch);
+	void (*send_xchar)(struct tty_struct *tty, u8 ch);
 	int (*tiocmget)(struct tty_struct *tty);
 	int (*tiocmset)(struct tty_struct *tty,
 			unsigned int set, unsigned int clear);
diff --git a/net/bluetooth/rfcomm/tty.c b/net/bluetooth/rfcomm/tty.c
index 94ec913dfb76..69c75c041fe1 100644
--- a/net/bluetooth/rfcomm/tty.c
+++ b/net/bluetooth/rfcomm/tty.c
@@ -1041,7 +1041,7 @@ static void rfcomm_tty_flush_buffer(struct tty_struct *tty)
 	tty_wakeup(tty);
 }
 
-static void rfcomm_tty_send_xchar(struct tty_struct *tty, char ch)
+static void rfcomm_tty_send_xchar(struct tty_struct *tty, u8 ch)
 {
 	BT_DBG("tty %p ch %c", tty, ch);
 }
-- 
cgit v1.2.3


From 4e8d8878145f1478886e1630c44113ad2c2eb99d Mon Sep 17 00:00:00 2001
From: "Jiri Slaby (SUSE)" <jirislaby@kernel.org>
Date: Wed, 6 Dec 2023 08:36:50 +0100
Subject: tty: core: the rest to u8

There are still last minor users in the tty core that still reference
characters by the 'char' type. Switch them to u8.

Signed-off-by: "Jiri Slaby (SUSE)" <jirislaby@kernel.org>
Link: https://lore.kernel.org/r/20231206073712.17776-6-jirislaby@kernel.org
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/tty/tty_io.c | 10 +++++-----
 include/linux/tty.h  |  4 ++--
 2 files changed, 7 insertions(+), 7 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/tty/tty_io.c b/drivers/tty/tty_io.c
index 6a502110da61..f3ca2105b66d 100644
--- a/drivers/tty/tty_io.c
+++ b/drivers/tty/tty_io.c
@@ -852,9 +852,9 @@ static ssize_t iterate_tty_read(struct tty_ldisc *ld, struct tty_struct *tty,
 {
 	void *cookie = NULL;
 	unsigned long offset = 0;
-	char kernel_buf[64];
 	ssize_t retval = 0;
 	size_t copied, count = iov_iter_count(to);
+	u8 kernel_buf[64];
 
 	do {
 		ssize_t size = min(count, sizeof(kernel_buf));
@@ -995,7 +995,7 @@ static ssize_t iterate_tty_write(struct tty_ldisc *ld, struct tty_struct *tty,
 
 	/* write_buf/write_cnt is protected by the atomic_write_lock mutex */
 	if (tty->write_cnt < chunk) {
-		unsigned char *buf_chunk;
+		u8 *buf_chunk;
 
 		if (chunk < 1024)
 			chunk = 1024;
@@ -2278,10 +2278,10 @@ static bool tty_legacy_tiocsti __read_mostly = IS_ENABLED(CONFIG_LEGACY_TIOCSTI)
  *  * Called functions take tty_ldiscs_lock
  *  * current->signal->tty check is safe without locks
  */
-static int tiocsti(struct tty_struct *tty, char __user *p)
+static int tiocsti(struct tty_struct *tty, u8 __user *p)
 {
-	char ch;
 	struct tty_ldisc *ld;
+	u8 ch;
 
 	if (!tty_legacy_tiocsti && !capable(CAP_SYS_ADMIN))
 		return -EIO;
@@ -3142,7 +3142,7 @@ struct tty_struct *alloc_tty_struct(struct tty_driver *driver, int idx)
  *
  * Return: the number of characters successfully output.
  */
-int tty_put_char(struct tty_struct *tty, unsigned char ch)
+int tty_put_char(struct tty_struct *tty, u8 ch)
 {
 	if (tty->ops->put_char)
 		return tty->ops->put_char(tty, ch);
diff --git a/include/linux/tty.h b/include/linux/tty.h
index d3bedcc08738..cc08f7e1c122 100644
--- a/include/linux/tty.h
+++ b/include/linux/tty.h
@@ -242,7 +242,7 @@ struct tty_struct {
 	void *driver_data;
 	spinlock_t files_lock;
 	int write_cnt;
-	unsigned char *write_buf;
+	u8 *write_buf;
 
 	struct list_head tty_files;
 
@@ -411,7 +411,7 @@ void stop_tty(struct tty_struct *tty);
 void start_tty(struct tty_struct *tty);
 void tty_write_message(struct tty_struct *tty, char *msg);
 int tty_send_xchar(struct tty_struct *tty, u8 ch);
-int tty_put_char(struct tty_struct *tty, unsigned char c);
+int tty_put_char(struct tty_struct *tty, u8 c);
 unsigned int tty_chars_in_buffer(struct tty_struct *tty);
 unsigned int tty_write_room(struct tty_struct *tty);
 void tty_driver_flush_buffer(struct tty_struct *tty);
-- 
cgit v1.2.3


From f2470d2bc4327c2c1a604c6e247442dbb14c90c5 Mon Sep 17 00:00:00 2001
From: "Jiri Slaby (SUSE)" <jirislaby@kernel.org>
Date: Wed, 6 Dec 2023 08:37:07 +0100
Subject: tty: serdev: convert to u8 and size_t in serdev_controller_ops

Switch character types to u8 and sizes to size_t. To conform to
characters/sizes in the rest of the tty layer.

In this patch, only struct serdev_controller_ops hooks. The rest will
follow.

Signed-off-by: "Jiri Slaby (SUSE)" <jirislaby@kernel.org>
Cc: Rob Herring <robh@kernel.org>
Link: https://lore.kernel.org/r/20231206073712.17776-23-jirislaby@kernel.org
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/tty/serdev/serdev-ttyport.c | 2 +-
 include/linux/serdev.h              | 8 ++++----
 2 files changed, 5 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/tty/serdev/serdev-ttyport.c b/drivers/tty/serdev/serdev-ttyport.c
index c5381fe15e1c..e94e090cf0a1 100644
--- a/drivers/tty/serdev/serdev-ttyport.c
+++ b/drivers/tty/serdev/serdev-ttyport.c
@@ -74,7 +74,7 @@ static const struct tty_port_client_operations client_ops = {
  * Callback functions from the serdev core.
  */
 
-static int ttyport_write_buf(struct serdev_controller *ctrl, const unsigned char *data, size_t len)
+static ssize_t ttyport_write_buf(struct serdev_controller *ctrl, const u8 *data, size_t len)
 {
 	struct serport *serport = serdev_controller_get_drvdata(ctrl);
 	struct tty_struct *tty = serport->tty;
diff --git a/include/linux/serdev.h b/include/linux/serdev.h
index 0ebf53bb254f..8cdab2c3b6d5 100644
--- a/include/linux/serdev.h
+++ b/include/linux/serdev.h
@@ -82,7 +82,7 @@ enum serdev_parity {
  * serdev controller structures
  */
 struct serdev_controller_ops {
-	int (*write_buf)(struct serdev_controller *, const unsigned char *, size_t);
+	ssize_t (*write_buf)(struct serdev_controller *, const u8 *, size_t);
 	void (*write_flush)(struct serdev_controller *);
 	int (*write_room)(struct serdev_controller *);
 	int (*open)(struct serdev_controller *);
@@ -185,9 +185,9 @@ static inline void serdev_controller_write_wakeup(struct serdev_controller *ctrl
 	serdev->ops->write_wakeup(serdev);
 }
 
-static inline int serdev_controller_receive_buf(struct serdev_controller *ctrl,
-					      const unsigned char *data,
-					      size_t count)
+static inline ssize_t serdev_controller_receive_buf(struct serdev_controller *ctrl,
+						    const u8 *data,
+						    size_t count)
 {
 	struct serdev_device *serdev = ctrl->serdev;
 
-- 
cgit v1.2.3


From 475fc6e2de6fec0ff3c9a74ddcfd2b52c90adc0d Mon Sep 17 00:00:00 2001
From: "Jiri Slaby (SUSE)" <jirislaby@kernel.org>
Date: Wed, 6 Dec 2023 08:37:08 +0100
Subject: tty: serdev: convert to u8 and size_t

Switch character types to u8 and sizes to size_t. To conform to
characters/sizes in the rest of the tty layer.

This patch converts struct serdev_device_ops hooks and its
instantiations.

Signed-off-by: "Jiri Slaby (SUSE)" <jirislaby@kernel.org>
Cc: Rob Herring <robh@kernel.org>
Acked-by: Johan Hovold <johan@kernel.org>
Link: https://lore.kernel.org/r/20231206073712.17776-24-jirislaby@kernel.org
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/bluetooth/btmtkuart.c                          |  4 ++--
 drivers/bluetooth/btnxpuart.c                          |  4 ++--
 drivers/bluetooth/hci_serdev.c                         |  4 ++--
 drivers/gnss/serial.c                                  |  4 ++--
 drivers/gnss/sirf.c                                    |  4 ++--
 drivers/greybus/gb-beagleplay.c                        |  5 +++--
 drivers/iio/chemical/pms7003.c                         |  6 +++---
 drivers/iio/chemical/scd30_serial.c                    |  6 +++---
 drivers/iio/chemical/sps30_serial.c                    | 18 +++++++++---------
 drivers/iio/imu/bno055/bno055_ser_core.c               |  6 +++---
 drivers/mfd/rave-sp.c                                  | 10 +++++-----
 drivers/net/ethernet/qualcomm/qca_uart.c               |  5 ++---
 drivers/nfc/pn533/uart.c                               |  4 ++--
 drivers/nfc/s3fwrn5/uart.c                             |  5 ++---
 drivers/platform/chrome/cros_ec_uart.c                 |  5 ++---
 drivers/platform/surface/aggregator/controller.h       |  4 ++--
 drivers/platform/surface/aggregator/core.c             |  4 ++--
 drivers/platform/surface/aggregator/ssh_packet_layer.c |  4 ++--
 drivers/platform/surface/aggregator/ssh_packet_layer.h |  2 +-
 drivers/tty/serdev/core.c                              | 12 +++++-------
 include/linux/serdev.h                                 | 13 +++++++------
 sound/drivers/serial-generic.c                         |  4 ++--
 22 files changed, 65 insertions(+), 68 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/bluetooth/btmtkuart.c b/drivers/bluetooth/btmtkuart.c
index 935feab815d9..d4f4e40ac5d2 100644
--- a/drivers/bluetooth/btmtkuart.c
+++ b/drivers/bluetooth/btmtkuart.c
@@ -385,8 +385,8 @@ static int btmtkuart_recv(struct hci_dev *hdev, const u8 *data, size_t count)
 	return 0;
 }
 
-static int btmtkuart_receive_buf(struct serdev_device *serdev, const u8 *data,
-				 size_t count)
+static ssize_t btmtkuart_receive_buf(struct serdev_device *serdev,
+				     const u8 *data, size_t count)
 {
 	struct btmtkuart_dev *bdev = serdev_device_get_drvdata(serdev);
 	int err;
diff --git a/drivers/bluetooth/btnxpuart.c b/drivers/bluetooth/btnxpuart.c
index b7e66b7ac570..d75487a90bba 100644
--- a/drivers/bluetooth/btnxpuart.c
+++ b/drivers/bluetooth/btnxpuart.c
@@ -1264,8 +1264,8 @@ static const struct h4_recv_pkt nxp_recv_pkts[] = {
 	{ NXP_RECV_FW_REQ_V3,   .recv = nxp_recv_fw_req_v3 },
 };
 
-static int btnxpuart_receive_buf(struct serdev_device *serdev, const u8 *data,
-				 size_t count)
+static ssize_t btnxpuart_receive_buf(struct serdev_device *serdev,
+				     const u8 *data, size_t count)
 {
 	struct btnxpuart_dev *nxpdev = serdev_device_get_drvdata(serdev);
 
diff --git a/drivers/bluetooth/hci_serdev.c b/drivers/bluetooth/hci_serdev.c
index f16fd79bc02b..39c8b567da3c 100644
--- a/drivers/bluetooth/hci_serdev.c
+++ b/drivers/bluetooth/hci_serdev.c
@@ -271,8 +271,8 @@ static void hci_uart_write_wakeup(struct serdev_device *serdev)
  *
  * Return: number of processed bytes
  */
-static int hci_uart_receive_buf(struct serdev_device *serdev, const u8 *data,
-				   size_t count)
+static ssize_t hci_uart_receive_buf(struct serdev_device *serdev,
+				    const u8 *data, size_t count)
 {
 	struct hci_uart *hu = serdev_device_get_drvdata(serdev);
 
diff --git a/drivers/gnss/serial.c b/drivers/gnss/serial.c
index 5d8e9bfb24d0..baa956494e79 100644
--- a/drivers/gnss/serial.c
+++ b/drivers/gnss/serial.c
@@ -80,8 +80,8 @@ static const struct gnss_operations gnss_serial_gnss_ops = {
 	.write_raw	= gnss_serial_write_raw,
 };
 
-static int gnss_serial_receive_buf(struct serdev_device *serdev,
-					const unsigned char *buf, size_t count)
+static ssize_t gnss_serial_receive_buf(struct serdev_device *serdev,
+				       const u8 *buf, size_t count)
 {
 	struct gnss_serial *gserial = serdev_device_get_drvdata(serdev);
 	struct gnss_device *gdev = gserial->gdev;
diff --git a/drivers/gnss/sirf.c b/drivers/gnss/sirf.c
index bcb53ccfee4d..6801a8fb2040 100644
--- a/drivers/gnss/sirf.c
+++ b/drivers/gnss/sirf.c
@@ -160,8 +160,8 @@ static const struct gnss_operations sirf_gnss_ops = {
 	.write_raw	= sirf_write_raw,
 };
 
-static int sirf_receive_buf(struct serdev_device *serdev,
-				const unsigned char *buf, size_t count)
+static ssize_t sirf_receive_buf(struct serdev_device *serdev,
+				const u8 *buf, size_t count)
 {
 	struct sirf_data *data = serdev_device_get_drvdata(serdev);
 	struct gnss_device *gdev = data->gdev;
diff --git a/drivers/greybus/gb-beagleplay.c b/drivers/greybus/gb-beagleplay.c
index 43318c1993ba..2da37ff92cf1 100644
--- a/drivers/greybus/gb-beagleplay.c
+++ b/drivers/greybus/gb-beagleplay.c
@@ -257,7 +257,7 @@ static void hdlc_rx_frame(struct gb_beagleplay *bg)
 	}
 }
 
-static int hdlc_rx(struct gb_beagleplay *bg, const u8 *data, size_t count)
+static ssize_t hdlc_rx(struct gb_beagleplay *bg, const u8 *data, size_t count)
 {
 	size_t i;
 	u8 c;
@@ -317,7 +317,8 @@ static void hdlc_deinit(struct gb_beagleplay *bg)
 	flush_work(&bg->tx_work);
 }
 
-static int gb_tty_receive(struct serdev_device *sd, const unsigned char *data, size_t count)
+static ssize_t gb_tty_receive(struct serdev_device *sd, const u8 *data,
+			      size_t count)
 {
 	struct gb_beagleplay *bg = serdev_device_get_drvdata(sd);
 
diff --git a/drivers/iio/chemical/pms7003.c b/drivers/iio/chemical/pms7003.c
index e9857d93b307..b5cf15a515d2 100644
--- a/drivers/iio/chemical/pms7003.c
+++ b/drivers/iio/chemical/pms7003.c
@@ -211,13 +211,13 @@ static bool pms7003_frame_is_okay(struct pms7003_frame *frame)
 	return checksum == pms7003_calc_checksum(frame);
 }
 
-static int pms7003_receive_buf(struct serdev_device *serdev,
-			       const unsigned char *buf, size_t size)
+static ssize_t pms7003_receive_buf(struct serdev_device *serdev, const u8 *buf,
+				   size_t size)
 {
 	struct iio_dev *indio_dev = serdev_device_get_drvdata(serdev);
 	struct pms7003_state *state = iio_priv(indio_dev);
 	struct pms7003_frame *frame = &state->frame;
-	int num;
+	size_t num;
 
 	if (!frame->expected_length) {
 		u16 magic;
diff --git a/drivers/iio/chemical/scd30_serial.c b/drivers/iio/chemical/scd30_serial.c
index 3c519103d30b..a47654591e55 100644
--- a/drivers/iio/chemical/scd30_serial.c
+++ b/drivers/iio/chemical/scd30_serial.c
@@ -174,13 +174,13 @@ static int scd30_serdev_command(struct scd30_state *state, enum scd30_cmd cmd, u
 	return 0;
 }
 
-static int scd30_serdev_receive_buf(struct serdev_device *serdev,
-				    const unsigned char *buf, size_t size)
+static ssize_t scd30_serdev_receive_buf(struct serdev_device *serdev,
+					const u8 *buf, size_t size)
 {
 	struct iio_dev *indio_dev = serdev_device_get_drvdata(serdev);
 	struct scd30_serdev_priv *priv;
 	struct scd30_state *state;
-	int num;
+	size_t num;
 
 	if (!indio_dev)
 		return 0;
diff --git a/drivers/iio/chemical/sps30_serial.c b/drivers/iio/chemical/sps30_serial.c
index 164f4b3e025c..3afa89f8acc3 100644
--- a/drivers/iio/chemical/sps30_serial.c
+++ b/drivers/iio/chemical/sps30_serial.c
@@ -74,8 +74,8 @@ static int sps30_serial_xfer(struct sps30_state *state, const unsigned char *buf
 }
 
 static const struct {
-	unsigned char byte;
-	unsigned char byte2;
+	u8 byte;
+	u8 byte2;
 } sps30_serial_bytes[] = {
 	{ 0x11, 0x31 },
 	{ 0x13, 0x33 },
@@ -83,7 +83,7 @@ static const struct {
 	{ 0x7d, 0x5d },
 };
 
-static int sps30_serial_put_byte(unsigned char *buf, unsigned char byte)
+static int sps30_serial_put_byte(u8 *buf, u8 byte)
 {
 	int i;
 
@@ -102,7 +102,7 @@ static int sps30_serial_put_byte(unsigned char *buf, unsigned char byte)
 	return 1;
 }
 
-static char sps30_serial_get_byte(bool escaped, unsigned char byte2)
+static u8 sps30_serial_get_byte(bool escaped, u8 byte2)
 {
 	int i;
 
@@ -130,8 +130,8 @@ static unsigned char sps30_serial_calc_chksum(const unsigned char *buf, size_t n
 	return ~chksum;
 }
 
-static int sps30_serial_prep_frame(unsigned char *buf, unsigned char cmd,
-				   const unsigned char *arg, size_t arg_size)
+static int sps30_serial_prep_frame(u8 *buf, u8 cmd, const u8 *arg,
+				   size_t arg_size)
 {
 	unsigned char chksum;
 	int num = 0;
@@ -210,14 +210,14 @@ static int sps30_serial_command(struct sps30_state *state, unsigned char cmd,
 	return rsp_size;
 }
 
-static int sps30_serial_receive_buf(struct serdev_device *serdev,
-				    const unsigned char *buf, size_t size)
+static ssize_t sps30_serial_receive_buf(struct serdev_device *serdev,
+					const u8 *buf, size_t size)
 {
 	struct iio_dev *indio_dev = dev_get_drvdata(&serdev->dev);
 	struct sps30_serial_priv *priv;
 	struct sps30_state *state;
-	unsigned char byte;
 	size_t i;
+	u8 byte;
 
 	if (!indio_dev)
 		return 0;
diff --git a/drivers/iio/imu/bno055/bno055_ser_core.c b/drivers/iio/imu/bno055/bno055_ser_core.c
index 57728a568471..5677bdf4f846 100644
--- a/drivers/iio/imu/bno055/bno055_ser_core.c
+++ b/drivers/iio/imu/bno055/bno055_ser_core.c
@@ -378,12 +378,12 @@ static void bno055_ser_handle_rx(struct bno055_ser_priv *priv, int status)
  * Also, we assume to RX one pkt per time (i.e. the HW doesn't send anything
  * unless we require to AND we don't queue more than one request per time).
  */
-static int bno055_ser_receive_buf(struct serdev_device *serdev,
-				  const unsigned char *buf, size_t size)
+static ssize_t bno055_ser_receive_buf(struct serdev_device *serdev,
+				      const u8 *buf, size_t size)
 {
 	int status;
 	struct bno055_ser_priv *priv = serdev_device_get_drvdata(serdev);
-	int remaining = size;
+	size_t remaining = size;
 
 	if (size == 0)
 		return 0;
diff --git a/drivers/mfd/rave-sp.c b/drivers/mfd/rave-sp.c
index da50eba10014..b1229bb143ee 100644
--- a/drivers/mfd/rave-sp.c
+++ b/drivers/mfd/rave-sp.c
@@ -471,17 +471,17 @@ static void rave_sp_receive_frame(struct rave_sp *sp,
 		rave_sp_receive_reply(sp, data, length);
 }
 
-static int rave_sp_receive_buf(struct serdev_device *serdev,
-			       const unsigned char *buf, size_t size)
+static ssize_t rave_sp_receive_buf(struct serdev_device *serdev,
+				   const u8 *buf, size_t size)
 {
 	struct device *dev = &serdev->dev;
 	struct rave_sp *sp = dev_get_drvdata(dev);
 	struct rave_sp_deframer *deframer = &sp->deframer;
-	const unsigned char *src = buf;
-	const unsigned char *end = buf + size;
+	const u8 *src = buf;
+	const u8 *end = buf + size;
 
 	while (src < end) {
-		const unsigned char byte = *src++;
+		const u8 byte = *src++;
 
 		switch (deframer->state) {
 		case RAVE_SP_EXPECT_SOF:
diff --git a/drivers/net/ethernet/qualcomm/qca_uart.c b/drivers/net/ethernet/qualcomm/qca_uart.c
index 9adec91f35e9..223321897b96 100644
--- a/drivers/net/ethernet/qualcomm/qca_uart.c
+++ b/drivers/net/ethernet/qualcomm/qca_uart.c
@@ -58,9 +58,8 @@ struct qcauart {
 	unsigned char *tx_buffer;
 };
 
-static int
-qca_tty_receive(struct serdev_device *serdev, const unsigned char *data,
-		size_t count)
+static ssize_t
+qca_tty_receive(struct serdev_device *serdev, const u8 *data, size_t count)
 {
 	struct qcauart *qca = serdev_device_get_drvdata(serdev);
 	struct net_device *netdev = qca->net_dev;
diff --git a/drivers/nfc/pn533/uart.c b/drivers/nfc/pn533/uart.c
index a556acdb947b..2eb5978bd79e 100644
--- a/drivers/nfc/pn533/uart.c
+++ b/drivers/nfc/pn533/uart.c
@@ -203,8 +203,8 @@ static int pn532_uart_rx_is_frame(struct sk_buff *skb)
 	return 0;
 }
 
-static int pn532_receive_buf(struct serdev_device *serdev,
-		const unsigned char *data, size_t count)
+static ssize_t pn532_receive_buf(struct serdev_device *serdev,
+				 const u8 *data, size_t count)
 {
 	struct pn532_uart_phy *dev = serdev_device_get_drvdata(serdev);
 	size_t i;
diff --git a/drivers/nfc/s3fwrn5/uart.c b/drivers/nfc/s3fwrn5/uart.c
index 82ea35d748a5..456d3947116c 100644
--- a/drivers/nfc/s3fwrn5/uart.c
+++ b/drivers/nfc/s3fwrn5/uart.c
@@ -51,9 +51,8 @@ static const struct s3fwrn5_phy_ops uart_phy_ops = {
 	.write = s3fwrn82_uart_write,
 };
 
-static int s3fwrn82_uart_read(struct serdev_device *serdev,
-			      const unsigned char *data,
-			      size_t count)
+static ssize_t s3fwrn82_uart_read(struct serdev_device *serdev,
+				  const u8 *data, size_t count)
 {
 	struct s3fwrn82_uart_phy *phy = serdev_device_get_drvdata(serdev);
 	size_t i;
diff --git a/drivers/platform/chrome/cros_ec_uart.c b/drivers/platform/chrome/cros_ec_uart.c
index 788246559bbb..68d80559fddc 100644
--- a/drivers/platform/chrome/cros_ec_uart.c
+++ b/drivers/platform/chrome/cros_ec_uart.c
@@ -81,9 +81,8 @@ struct cros_ec_uart {
 	struct response_info response;
 };
 
-static int cros_ec_uart_rx_bytes(struct serdev_device *serdev,
-				 const u8 *data,
-				 size_t count)
+static ssize_t cros_ec_uart_rx_bytes(struct serdev_device *serdev,
+				     const u8 *data, size_t count)
 {
 	struct ec_host_response *host_response;
 	struct cros_ec_device *ec_dev = serdev_device_get_drvdata(serdev);
diff --git a/drivers/platform/surface/aggregator/controller.h b/drivers/platform/surface/aggregator/controller.h
index f0d987abc51e..f1638c2081e8 100644
--- a/drivers/platform/surface/aggregator/controller.h
+++ b/drivers/platform/surface/aggregator/controller.h
@@ -238,8 +238,8 @@ struct ssam_controller {
  * layer of the controller has been shut down, %-ESHUTDOWN.
  */
 static inline
-int ssam_controller_receive_buf(struct ssam_controller *ctrl,
-				const unsigned char *buf, size_t n)
+ssize_t ssam_controller_receive_buf(struct ssam_controller *ctrl, const u8 *buf,
+				    size_t n)
 {
 	return ssh_ptl_rx_rcvbuf(&ctrl->rtl.ptl, buf, n);
 }
diff --git a/drivers/platform/surface/aggregator/core.c b/drivers/platform/surface/aggregator/core.c
index 1a6373dea109..72d7314a4efd 100644
--- a/drivers/platform/surface/aggregator/core.c
+++ b/drivers/platform/surface/aggregator/core.c
@@ -227,8 +227,8 @@ EXPORT_SYMBOL_GPL(ssam_client_bind);
 
 /* -- Glue layer (serdev_device -> ssam_controller). ------------------------ */
 
-static int ssam_receive_buf(struct serdev_device *dev, const unsigned char *buf,
-			    size_t n)
+static ssize_t ssam_receive_buf(struct serdev_device *dev, const u8 *buf,
+				size_t n)
 {
 	struct ssam_controller *ctrl;
 
diff --git a/drivers/platform/surface/aggregator/ssh_packet_layer.c b/drivers/platform/surface/aggregator/ssh_packet_layer.c
index def8d7ac541f..d726b1a86319 100644
--- a/drivers/platform/surface/aggregator/ssh_packet_layer.c
+++ b/drivers/platform/surface/aggregator/ssh_packet_layer.c
@@ -1887,9 +1887,9 @@ int ssh_ptl_rx_stop(struct ssh_ptl *ptl)
  * Return: Returns the number of bytes transferred (positive or zero) on
  * success. Returns %-ESHUTDOWN if the packet layer has been shut down.
  */
-int ssh_ptl_rx_rcvbuf(struct ssh_ptl *ptl, const u8 *buf, size_t n)
+ssize_t ssh_ptl_rx_rcvbuf(struct ssh_ptl *ptl, const u8 *buf, size_t n)
 {
-	int used;
+	size_t used;
 
 	if (test_bit(SSH_PTL_SF_SHUTDOWN_BIT, &ptl->state))
 		return -ESHUTDOWN;
diff --git a/drivers/platform/surface/aggregator/ssh_packet_layer.h b/drivers/platform/surface/aggregator/ssh_packet_layer.h
index 64633522f971..c80e822070df 100644
--- a/drivers/platform/surface/aggregator/ssh_packet_layer.h
+++ b/drivers/platform/surface/aggregator/ssh_packet_layer.h
@@ -162,7 +162,7 @@ void ssh_ptl_shutdown(struct ssh_ptl *ptl);
 int ssh_ptl_submit(struct ssh_ptl *ptl, struct ssh_packet *p);
 void ssh_ptl_cancel(struct ssh_packet *p);
 
-int ssh_ptl_rx_rcvbuf(struct ssh_ptl *ptl, const u8 *buf, size_t n);
+ssize_t ssh_ptl_rx_rcvbuf(struct ssh_ptl *ptl, const u8 *buf, size_t n);
 
 /**
  * ssh_ptl_tx_wakeup_transfer() - Wake up packet transmitter thread for
diff --git a/drivers/tty/serdev/core.c b/drivers/tty/serdev/core.c
index 3090e3454c44..de8d87d4858d 100644
--- a/drivers/tty/serdev/core.c
+++ b/drivers/tty/serdev/core.c
@@ -225,8 +225,7 @@ EXPORT_SYMBOL_GPL(serdev_device_write_wakeup);
  * Return: The number of bytes written (less than count if not enough room in
  * the write buffer), or a negative errno on errors.
  */
-int serdev_device_write_buf(struct serdev_device *serdev,
-			    const unsigned char *buf, size_t count)
+int serdev_device_write_buf(struct serdev_device *serdev, const u8 *buf, size_t count)
 {
 	struct serdev_controller *ctrl = serdev->ctrl;
 
@@ -259,13 +258,12 @@ EXPORT_SYMBOL_GPL(serdev_device_write_buf);
  * -ETIMEDOUT or -ERESTARTSYS if interrupted before any bytes were written, or
  * a negative errno on errors.
  */
-int serdev_device_write(struct serdev_device *serdev,
-			const unsigned char *buf, size_t count,
-			long timeout)
+ssize_t serdev_device_write(struct serdev_device *serdev, const u8 *buf,
+			    size_t count, long timeout)
 {
 	struct serdev_controller *ctrl = serdev->ctrl;
-	int written = 0;
-	int ret;
+	size_t written = 0;
+	ssize_t ret;
 
 	if (!ctrl || !ctrl->ops->write_buf || !serdev->ops->write_wakeup)
 		return -EINVAL;
diff --git a/include/linux/serdev.h b/include/linux/serdev.h
index 8cdab2c3b6d5..3fab88ba265e 100644
--- a/include/linux/serdev.h
+++ b/include/linux/serdev.h
@@ -27,7 +27,7 @@ struct serdev_device;
  *			not sleep.
  */
 struct serdev_device_ops {
-	int (*receive_buf)(struct serdev_device *, const unsigned char *, size_t);
+	ssize_t (*receive_buf)(struct serdev_device *, const u8 *, size_t);
 	void (*write_wakeup)(struct serdev_device *);
 };
 
@@ -204,13 +204,13 @@ void serdev_device_close(struct serdev_device *);
 int devm_serdev_device_open(struct device *, struct serdev_device *);
 unsigned int serdev_device_set_baudrate(struct serdev_device *, unsigned int);
 void serdev_device_set_flow_control(struct serdev_device *, bool);
-int serdev_device_write_buf(struct serdev_device *, const unsigned char *, size_t);
+int serdev_device_write_buf(struct serdev_device *, const u8 *, size_t);
 void serdev_device_wait_until_sent(struct serdev_device *, long);
 int serdev_device_get_tiocm(struct serdev_device *);
 int serdev_device_set_tiocm(struct serdev_device *, int, int);
 int serdev_device_break_ctl(struct serdev_device *serdev, int break_state);
 void serdev_device_write_wakeup(struct serdev_device *);
-int serdev_device_write(struct serdev_device *, const unsigned char *, size_t, long);
+ssize_t serdev_device_write(struct serdev_device *, const u8 *, size_t, long);
 void serdev_device_write_flush(struct serdev_device *);
 int serdev_device_write_room(struct serdev_device *);
 
@@ -248,7 +248,7 @@ static inline unsigned int serdev_device_set_baudrate(struct serdev_device *sdev
 }
 static inline void serdev_device_set_flow_control(struct serdev_device *sdev, bool enable) {}
 static inline int serdev_device_write_buf(struct serdev_device *serdev,
-					  const unsigned char *buf,
+					  const u8 *buf,
 					  size_t count)
 {
 	return -ENODEV;
@@ -266,8 +266,9 @@ static inline int serdev_device_break_ctl(struct serdev_device *serdev, int brea
 {
 	return -EOPNOTSUPP;
 }
-static inline int serdev_device_write(struct serdev_device *sdev, const unsigned char *buf,
-				      size_t count, unsigned long timeout)
+static inline ssize_t serdev_device_write(struct serdev_device *sdev,
+					  const u8 *buf, size_t count,
+					  unsigned long timeout)
 {
 	return -ENODEV;
 }
diff --git a/sound/drivers/serial-generic.c b/sound/drivers/serial-generic.c
index c8db6c75d133..d6e5aafd697c 100644
--- a/sound/drivers/serial-generic.c
+++ b/sound/drivers/serial-generic.c
@@ -100,8 +100,8 @@ static void snd_serial_generic_write_wakeup(struct serdev_device *serdev)
 	snd_serial_generic_tx_wakeup(drvdata);
 }
 
-static int snd_serial_generic_receive_buf(struct serdev_device *serdev,
-				const unsigned char *buf, size_t count)
+static ssize_t snd_serial_generic_receive_buf(struct serdev_device *serdev,
+					      const u8 *buf, size_t count)
 {
 	int ret;
 	struct snd_serial_generic *drvdata = serdev_device_get_drvdata(serdev);
-- 
cgit v1.2.3


From 8132d887a7023b212f242a51ae89281c69fde996 Mon Sep 17 00:00:00 2001
From: Paolo Bonzini <pbonzini@redhat.com>
Date: Wed, 18 Oct 2023 12:11:56 -0400
Subject: KVM: remove CONFIG_HAVE_KVM_EVENTFD

virt/kvm/eventfd.c is compiled unconditionally, meaning that the ioeventfds
member of struct kvm is accessed unconditionally.  CONFIG_HAVE_KVM_EVENTFD
therefore must be defined for KVM common code to compile successfully,
remove it.

Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/arm64/kvm/Kconfig     |  1 -
 arch/loongarch/kvm/Kconfig |  1 -
 arch/mips/kvm/Kconfig      |  1 -
 arch/powerpc/kvm/Kconfig   |  1 -
 arch/riscv/kvm/Kconfig     |  1 -
 arch/s390/kvm/Kconfig      |  1 -
 arch/x86/kvm/Kconfig       |  1 -
 include/linux/kvm_host.h   | 30 +-----------------------------
 virt/kvm/Kconfig           |  5 +----
 9 files changed, 2 insertions(+), 40 deletions(-)

(limited to 'include/linux')

diff --git a/arch/arm64/kvm/Kconfig b/arch/arm64/kvm/Kconfig
index 1a777715199f..87bd96c1254f 100644
--- a/arch/arm64/kvm/Kconfig
+++ b/arch/arm64/kvm/Kconfig
@@ -29,7 +29,6 @@ menuconfig KVM
 	select KVM_GENERIC_DIRTYLOG_READ_PROTECT
 	select KVM_XFER_TO_GUEST_WORK
 	select KVM_VFIO
-	select HAVE_KVM_EVENTFD
 	select HAVE_KVM_IRQFD
 	select HAVE_KVM_DIRTY_RING_ACQ_REL
 	select NEED_KVM_DIRTY_RING_WITH_BITMAP
diff --git a/arch/loongarch/kvm/Kconfig b/arch/loongarch/kvm/Kconfig
index f22bae89b07d..daba4cd5e87d 100644
--- a/arch/loongarch/kvm/Kconfig
+++ b/arch/loongarch/kvm/Kconfig
@@ -22,7 +22,6 @@ config KVM
 	depends on AS_HAS_LVZ_EXTENSION
 	depends on HAVE_KVM
 	select HAVE_KVM_DIRTY_RING_ACQ_REL
-	select HAVE_KVM_EVENTFD
 	select HAVE_KVM_VCPU_ASYNC_IOCTL
 	select KVM_GENERIC_DIRTYLOG_READ_PROTECT
 	select KVM_GENERIC_HARDWARE_ENABLING
diff --git a/arch/mips/kvm/Kconfig b/arch/mips/kvm/Kconfig
index c04987d2ed2e..428141b0b48f 100644
--- a/arch/mips/kvm/Kconfig
+++ b/arch/mips/kvm/Kconfig
@@ -22,7 +22,6 @@ config KVM
 	select EXPORT_UASM
 	select PREEMPT_NOTIFIERS
 	select KVM_GENERIC_DIRTYLOG_READ_PROTECT
-	select HAVE_KVM_EVENTFD
 	select HAVE_KVM_VCPU_ASYNC_IOCTL
 	select KVM_MMIO
 	select KVM_GENERIC_MMU_NOTIFIER
diff --git a/arch/powerpc/kvm/Kconfig b/arch/powerpc/kvm/Kconfig
index b33358ee6424..0f4e6e7ba35d 100644
--- a/arch/powerpc/kvm/Kconfig
+++ b/arch/powerpc/kvm/Kconfig
@@ -20,7 +20,6 @@ if VIRTUALIZATION
 config KVM
 	bool
 	select PREEMPT_NOTIFIERS
-	select HAVE_KVM_EVENTFD
 	select HAVE_KVM_VCPU_ASYNC_IOCTL
 	select KVM_VFIO
 	select IRQ_BYPASS_MANAGER
diff --git a/arch/riscv/kvm/Kconfig b/arch/riscv/kvm/Kconfig
index ae2e05f050ec..be264b78487d 100644
--- a/arch/riscv/kvm/Kconfig
+++ b/arch/riscv/kvm/Kconfig
@@ -20,7 +20,6 @@ if VIRTUALIZATION
 config KVM
 	tristate "Kernel-based Virtual Machine (KVM) support (EXPERIMENTAL)"
 	depends on RISCV_SBI && MMU
-	select HAVE_KVM_EVENTFD
 	select HAVE_KVM_IRQCHIP
 	select HAVE_KVM_IRQFD
 	select HAVE_KVM_IRQ_ROUTING
diff --git a/arch/s390/kvm/Kconfig b/arch/s390/kvm/Kconfig
index 45fdf2a9b2e3..ed567b858535 100644
--- a/arch/s390/kvm/Kconfig
+++ b/arch/s390/kvm/Kconfig
@@ -23,7 +23,6 @@ config KVM
 	select PREEMPT_NOTIFIERS
 	select HAVE_KVM_CPU_RELAX_INTERCEPT
 	select HAVE_KVM_VCPU_ASYNC_IOCTL
-	select HAVE_KVM_EVENTFD
 	select KVM_ASYNC_PF
 	select KVM_ASYNC_PF_SYNC
 	select HAVE_KVM_IRQCHIP
diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig
index c1716e83d176..088c66bee98f 100644
--- a/arch/x86/kvm/Kconfig
+++ b/arch/x86/kvm/Kconfig
@@ -33,7 +33,6 @@ config KVM
 	select IRQ_BYPASS_MANAGER
 	select HAVE_KVM_IRQ_BYPASS
 	select HAVE_KVM_IRQ_ROUTING
-	select HAVE_KVM_EVENTFD
 	select KVM_ASYNC_PF
 	select USER_RETURN_NOTIFIER
 	select KVM_MMIO
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index ea1523a7b83a..3fe5a6be7768 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -782,7 +782,6 @@ struct kvm {
 	struct list_head vm_list;
 	struct mutex lock;
 	struct kvm_io_bus __rcu *buses[KVM_NR_BUSES];
-#ifdef CONFIG_HAVE_KVM_EVENTFD
 	struct {
 		spinlock_t        lock;
 		struct list_head  items;
@@ -791,7 +790,6 @@ struct kvm {
 		struct mutex      resampler_lock;
 	} irqfds;
 	struct list_head ioeventfds;
-#endif
 	struct kvm_vm_stat stat;
 	struct kvm_arch arch;
 	refcount_t users_count;
@@ -2056,8 +2054,6 @@ static inline void kvm_free_irq_routing(struct kvm *kvm) {}
 
 int kvm_send_userspace_msi(struct kvm *kvm, struct kvm_msi *msi);
 
-#ifdef CONFIG_HAVE_KVM_EVENTFD
-
 void kvm_eventfd_init(struct kvm *kvm);
 int kvm_ioeventfd(struct kvm *kvm, struct kvm_ioeventfd *args);
 
@@ -2082,31 +2078,7 @@ static inline bool kvm_notify_irqfd_resampler(struct kvm *kvm,
 {
 	return false;
 }
-#endif
-
-#else
-
-static inline void kvm_eventfd_init(struct kvm *kvm) {}
-
-static inline int kvm_irqfd(struct kvm *kvm, struct kvm_irqfd *args)
-{
-	return -EINVAL;
-}
-
-static inline void kvm_irqfd_release(struct kvm *kvm) {}
-
-#ifdef CONFIG_HAVE_KVM_IRQCHIP
-static inline void kvm_irq_routing_update(struct kvm *kvm)
-{
-}
-#endif
-
-static inline int kvm_ioeventfd(struct kvm *kvm, struct kvm_ioeventfd *args)
-{
-	return -ENOSYS;
-}
-
-#endif /* CONFIG_HAVE_KVM_EVENTFD */
+#endif /* CONFIG_HAVE_KVM_IRQFD */
 
 void kvm_arch_irq_routing_update(struct kvm *kvm);
 
diff --git a/virt/kvm/Kconfig b/virt/kvm/Kconfig
index 2c964586aa14..72e0cac864f9 100644
--- a/virt/kvm/Kconfig
+++ b/virt/kvm/Kconfig
@@ -3,6 +3,7 @@
 
 config HAVE_KVM
        bool
+       select EVENTFD
 
 config HAVE_KVM_PFNCACHE
        bool
@@ -39,10 +40,6 @@ config NEED_KVM_DIRTY_RING_WITH_BITMAP
 	bool
 	depends on HAVE_KVM_DIRTY_RING
 
-config HAVE_KVM_EVENTFD
-       bool
-       select EVENTFD
-
 config KVM_MMIO
        bool
 
-- 
cgit v1.2.3


From c5b31cc2371728ddefe9baf1d036aeb630a25d96 Mon Sep 17 00:00:00 2001
From: Paolo Bonzini <pbonzini@redhat.com>
Date: Wed, 18 Oct 2023 12:07:32 -0400
Subject: KVM: remove CONFIG_HAVE_KVM_IRQFD

All platforms with a kernel irqchip have support for irqfd.  Unify the
two configuration items so that userspace can expect to use irqfd to
inject interrupts into the irqchip.

Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/arm64/kvm/Kconfig     | 1 -
 arch/powerpc/kvm/Kconfig   | 2 --
 arch/powerpc/kvm/powerpc.c | 2 +-
 arch/riscv/kvm/Kconfig     | 1 -
 arch/s390/kvm/Kconfig      | 1 -
 arch/x86/kvm/Kconfig       | 1 -
 include/linux/kvm_host.h   | 9 ++++-----
 include/trace/events/kvm.h | 8 ++++----
 virt/kvm/Kconfig           | 3 ---
 virt/kvm/eventfd.c         | 6 +++---
 virt/kvm/kvm_main.c        | 4 ++--
 11 files changed, 14 insertions(+), 24 deletions(-)

(limited to 'include/linux')

diff --git a/arch/arm64/kvm/Kconfig b/arch/arm64/kvm/Kconfig
index 87bd96c1254f..b07c60c9737d 100644
--- a/arch/arm64/kvm/Kconfig
+++ b/arch/arm64/kvm/Kconfig
@@ -29,7 +29,6 @@ menuconfig KVM
 	select KVM_GENERIC_DIRTYLOG_READ_PROTECT
 	select KVM_XFER_TO_GUEST_WORK
 	select KVM_VFIO
-	select HAVE_KVM_IRQFD
 	select HAVE_KVM_DIRTY_RING_ACQ_REL
 	select NEED_KVM_DIRTY_RING_WITH_BITMAP
 	select HAVE_KVM_MSI
diff --git a/arch/powerpc/kvm/Kconfig b/arch/powerpc/kvm/Kconfig
index 0f4e6e7ba35d..b47196085a42 100644
--- a/arch/powerpc/kvm/Kconfig
+++ b/arch/powerpc/kvm/Kconfig
@@ -224,7 +224,6 @@ config KVM_MPIC
 	bool "KVM in-kernel MPIC emulation"
 	depends on KVM && PPC_E500
 	select HAVE_KVM_IRQCHIP
-	select HAVE_KVM_IRQFD
 	select HAVE_KVM_IRQ_ROUTING
 	select HAVE_KVM_MSI
 	help
@@ -237,7 +236,6 @@ config KVM_XICS
 	bool "KVM in-kernel XICS emulation"
 	depends on KVM_BOOK3S_64 && !KVM_MPIC
 	select HAVE_KVM_IRQCHIP
-	select HAVE_KVM_IRQFD
 	default y
 	help
 	  Include support for the XICS (eXternal Interrupt Controller
diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c
index c39945a7fce3..c3f82e238b70 100644
--- a/arch/powerpc/kvm/powerpc.c
+++ b/arch/powerpc/kvm/powerpc.c
@@ -578,7 +578,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
 		break;
 #endif
 
-#ifdef CONFIG_HAVE_KVM_IRQFD
+#ifdef CONFIG_HAVE_KVM_IRQCHIP
 	case KVM_CAP_IRQFD_RESAMPLE:
 		r = !xive_enabled();
 		break;
diff --git a/arch/riscv/kvm/Kconfig b/arch/riscv/kvm/Kconfig
index be264b78487d..2b0b51035302 100644
--- a/arch/riscv/kvm/Kconfig
+++ b/arch/riscv/kvm/Kconfig
@@ -21,7 +21,6 @@ config KVM
 	tristate "Kernel-based Virtual Machine (KVM) support (EXPERIMENTAL)"
 	depends on RISCV_SBI && MMU
 	select HAVE_KVM_IRQCHIP
-	select HAVE_KVM_IRQFD
 	select HAVE_KVM_IRQ_ROUTING
 	select HAVE_KVM_MSI
 	select HAVE_KVM_VCPU_ASYNC_IOCTL
diff --git a/arch/s390/kvm/Kconfig b/arch/s390/kvm/Kconfig
index ed567b858535..bb6d90351119 100644
--- a/arch/s390/kvm/Kconfig
+++ b/arch/s390/kvm/Kconfig
@@ -26,7 +26,6 @@ config KVM
 	select KVM_ASYNC_PF
 	select KVM_ASYNC_PF_SYNC
 	select HAVE_KVM_IRQCHIP
-	select HAVE_KVM_IRQFD
 	select HAVE_KVM_IRQ_ROUTING
 	select HAVE_KVM_INVALID_WAKEUPS
 	select HAVE_KVM_NO_POLL
diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig
index 088c66bee98f..b07247b0b958 100644
--- a/arch/x86/kvm/Kconfig
+++ b/arch/x86/kvm/Kconfig
@@ -27,7 +27,6 @@ config KVM
 	select KVM_GENERIC_MMU_NOTIFIER
 	select HAVE_KVM_IRQCHIP
 	select HAVE_KVM_PFNCACHE
-	select HAVE_KVM_IRQFD
 	select HAVE_KVM_DIRTY_RING_TSO
 	select HAVE_KVM_DIRTY_RING_ACQ_REL
 	select IRQ_BYPASS_MANAGER
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 3fe5a6be7768..1bba24a13ec9 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -805,8 +805,7 @@ struct kvm {
 	 * Update side is protected by irq_lock.
 	 */
 	struct kvm_irq_routing_table __rcu *irq_routing;
-#endif
-#ifdef CONFIG_HAVE_KVM_IRQFD
+
 	struct hlist_head irq_ack_notifier_list;
 #endif
 
@@ -996,7 +995,7 @@ static inline void kvm_arch_post_irq_routing_update(struct kvm *kvm)
 }
 #endif
 
-#ifdef CONFIG_HAVE_KVM_IRQFD
+#ifdef CONFIG_HAVE_KVM_IRQCHIP
 int kvm_irqfd_init(void);
 void kvm_irqfd_exit(void);
 #else
@@ -2057,7 +2056,7 @@ int kvm_send_userspace_msi(struct kvm *kvm, struct kvm_msi *msi);
 void kvm_eventfd_init(struct kvm *kvm);
 int kvm_ioeventfd(struct kvm *kvm, struct kvm_ioeventfd *args);
 
-#ifdef CONFIG_HAVE_KVM_IRQFD
+#ifdef CONFIG_HAVE_KVM_IRQCHIP
 int kvm_irqfd(struct kvm *kvm, struct kvm_irqfd *args);
 void kvm_irqfd_release(struct kvm *kvm);
 bool kvm_notify_irqfd_resampler(struct kvm *kvm,
@@ -2078,7 +2077,7 @@ static inline bool kvm_notify_irqfd_resampler(struct kvm *kvm,
 {
 	return false;
 }
-#endif /* CONFIG_HAVE_KVM_IRQFD */
+#endif /* CONFIG_HAVE_KVM_IRQCHIP */
 
 void kvm_arch_irq_routing_update(struct kvm *kvm);
 
diff --git a/include/trace/events/kvm.h b/include/trace/events/kvm.h
index 3bd31ea23fee..011fba6b5552 100644
--- a/include/trace/events/kvm.h
+++ b/include/trace/events/kvm.h
@@ -62,7 +62,7 @@ TRACE_EVENT(kvm_vcpu_wakeup,
 		  __entry->valid ? "valid" : "invalid")
 );
 
-#if defined(CONFIG_HAVE_KVM_IRQFD)
+#if defined(CONFIG_HAVE_KVM_IRQCHIP)
 TRACE_EVENT(kvm_set_irq,
 	TP_PROTO(unsigned int gsi, int level, int irq_source_id),
 	TP_ARGS(gsi, level, irq_source_id),
@@ -82,7 +82,7 @@ TRACE_EVENT(kvm_set_irq,
 	TP_printk("gsi %u level %d source %d",
 		  __entry->gsi, __entry->level, __entry->irq_source_id)
 );
-#endif /* defined(CONFIG_HAVE_KVM_IRQFD) */
+#endif /* defined(CONFIG_HAVE_KVM_IRQCHIP) */
 
 #if defined(__KVM_HAVE_IOAPIC)
 #define kvm_deliver_mode		\
@@ -170,7 +170,7 @@ TRACE_EVENT(kvm_msi_set_irq,
 
 #endif /* defined(__KVM_HAVE_IOAPIC) */
 
-#if defined(CONFIG_HAVE_KVM_IRQFD)
+#if defined(CONFIG_HAVE_KVM_IRQCHIP)
 
 #ifdef kvm_irqchips
 #define kvm_ack_irq_string "irqchip %s pin %u"
@@ -197,7 +197,7 @@ TRACE_EVENT(kvm_ack_irq,
 	TP_printk(kvm_ack_irq_string, kvm_ack_irq_parm)
 );
 
-#endif /* defined(CONFIG_HAVE_KVM_IRQFD) */
+#endif /* defined(CONFIG_HAVE_KVM_IRQCHIP) */
 
 
diff --git a/virt/kvm/Kconfig b/virt/kvm/Kconfig
index 72e0cac864f9..6793211a0b64 100644
--- a/virt/kvm/Kconfig
+++ b/virt/kvm/Kconfig
@@ -11,9 +11,6 @@ config HAVE_KVM_PFNCACHE
 config HAVE_KVM_IRQCHIP
        bool
 
-config HAVE_KVM_IRQFD
-       bool
-
 config HAVE_KVM_IRQ_ROUTING
        bool
 
diff --git a/virt/kvm/eventfd.c b/virt/kvm/eventfd.c
index 89912a17f5d5..19534156d48c 100644
--- a/virt/kvm/eventfd.c
+++ b/virt/kvm/eventfd.c
@@ -28,7 +28,7 @@
 
 #include <kvm/iodev.h>
 
-#ifdef CONFIG_HAVE_KVM_IRQFD
+#ifdef CONFIG_HAVE_KVM_IRQCHIP
 
 static struct workqueue_struct *irqfd_cleanup_wq;
 
@@ -531,7 +531,7 @@ void kvm_unregister_irq_ack_notifier(struct kvm *kvm,
 void
 kvm_eventfd_init(struct kvm *kvm)
 {
-#ifdef CONFIG_HAVE_KVM_IRQFD
+#ifdef CONFIG_HAVE_KVM_IRQCHIP
 	spin_lock_init(&kvm->irqfds.lock);
 	INIT_LIST_HEAD(&kvm->irqfds.items);
 	INIT_LIST_HEAD(&kvm->irqfds.resampler_list);
@@ -540,7 +540,7 @@ kvm_eventfd_init(struct kvm *kvm)
 	INIT_LIST_HEAD(&kvm->ioeventfds);
 }
 
-#ifdef CONFIG_HAVE_KVM_IRQFD
+#ifdef CONFIG_HAVE_KVM_IRQCHIP
 /*
  * shutdown any irqfd's that match fd+gsi
  */
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 8758cb799e18..a20cf1f9ad29 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -1273,7 +1273,7 @@ static struct kvm *kvm_create_vm(unsigned long type, const char *fdname)
 	if (r)
 		goto out_err_no_disable;
 
-#ifdef CONFIG_HAVE_KVM_IRQFD
+#ifdef CONFIG_HAVE_KVM_IRQCHIP
 	INIT_HLIST_HEAD(&kvm->irq_ack_notifier_list);
 #endif
 
@@ -4826,7 +4826,7 @@ static int kvm_vm_ioctl_check_extension_generic(struct kvm *kvm, long arg)
 #ifdef CONFIG_HAVE_KVM_MSI
 	case KVM_CAP_SIGNAL_MSI:
 #endif
-#ifdef CONFIG_HAVE_KVM_IRQFD
+#ifdef CONFIG_HAVE_KVM_IRQCHIP
 	case KVM_CAP_IRQFD:
 #endif
 	case KVM_CAP_IOEVENTFD_ANY_LENGTH:
-- 
cgit v1.2.3


From 8ed26ab8d59111c2f7b86d200d1eb97d2a458fd1 Mon Sep 17 00:00:00 2001
From: Paolo Bonzini <pbonzini@redhat.com>
Date: Wed, 18 Oct 2023 12:18:00 -0400
Subject: KVM: clean up directives to compile out irqfds

Keep all #ifdef CONFIG_HAVE_KVM_IRQCHIP parts of eventfd.c together, and
compile out the irqfds field of struct kvm if the symbol is not defined.

Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 include/linux/kvm_host.h |  2 ++
 virt/kvm/eventfd.c       | 26 ++++++++++++--------------
 2 files changed, 14 insertions(+), 14 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 1bba24a13ec9..7e7fd25b09b3 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -782,6 +782,7 @@ struct kvm {
 	struct list_head vm_list;
 	struct mutex lock;
 	struct kvm_io_bus __rcu *buses[KVM_NR_BUSES];
+#ifdef CONFIG_HAVE_KVM_IRQCHIP
 	struct {
 		spinlock_t        lock;
 		struct list_head  items;
@@ -789,6 +790,7 @@ struct kvm {
 		struct list_head  resampler_list;
 		struct mutex      resampler_lock;
 	} irqfds;
+#endif
 	struct list_head ioeventfds;
 	struct kvm_vm_stat stat;
 	struct kvm_arch arch;
diff --git a/virt/kvm/eventfd.c b/virt/kvm/eventfd.c
index 19534156d48c..d516e6d33f56 100644
--- a/virt/kvm/eventfd.c
+++ b/virt/kvm/eventfd.c
@@ -526,21 +526,7 @@ void kvm_unregister_irq_ack_notifier(struct kvm *kvm,
 	synchronize_srcu(&kvm->irq_srcu);
 	kvm_arch_post_irq_ack_notifier_list_update(kvm);
 }
-#endif
-
-void
-kvm_eventfd_init(struct kvm *kvm)
-{
-#ifdef CONFIG_HAVE_KVM_IRQCHIP
-	spin_lock_init(&kvm->irqfds.lock);
-	INIT_LIST_HEAD(&kvm->irqfds.items);
-	INIT_LIST_HEAD(&kvm->irqfds.resampler_list);
-	mutex_init(&kvm->irqfds.resampler_lock);
-#endif
-	INIT_LIST_HEAD(&kvm->ioeventfds);
-}
 
-#ifdef CONFIG_HAVE_KVM_IRQCHIP
 /*
  * shutdown any irqfd's that match fd+gsi
  */
@@ -1012,3 +998,15 @@ kvm_ioeventfd(struct kvm *kvm, struct kvm_ioeventfd *args)
 
 	return kvm_assign_ioeventfd(kvm, args);
 }
+
+void
+kvm_eventfd_init(struct kvm *kvm)
+{
+#ifdef CONFIG_HAVE_KVM_IRQCHIP
+	spin_lock_init(&kvm->irqfds.lock);
+	INIT_LIST_HEAD(&kvm->irqfds.items);
+	INIT_LIST_HEAD(&kvm->irqfds.resampler_list);
+	mutex_init(&kvm->irqfds.resampler_lock);
+#endif
+	INIT_LIST_HEAD(&kvm->ioeventfds);
+}
-- 
cgit v1.2.3


From 92e1567ee3e3f6f160e320890ac77eec50bf8e7d Mon Sep 17 00:00:00 2001
From: Andrei Matei <andreimatei1@gmail.com>
Date: Thu, 7 Dec 2023 22:25:17 -0500
Subject: bpf: Add some comments to stack representation

Add comments to the datastructure tracking the stack state, as the
mapping between each stack slot and where its state is stored is not
entirely obvious.

Signed-off-by: Andrei Matei <andreimatei1@gmail.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Acked-by: Eduard Zingerman <eddyz87@gmail.com>
Link: https://lore.kernel.org/bpf/20231208032519.260451-2-andreimatei1@gmail.com
---
 include/linux/bpf_verifier.h | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h
index bada59812e00..314b679fb494 100644
--- a/include/linux/bpf_verifier.h
+++ b/include/linux/bpf_verifier.h
@@ -321,7 +321,17 @@ struct bpf_func_state {
 	/* The following fields should be last. See copy_func_state() */
 	int acquired_refs;
 	struct bpf_reference_state *refs;
+	/* The state of the stack. Each element of the array describes BPF_REG_SIZE
+	 * (i.e. 8) bytes worth of stack memory.
+	 * stack[0] represents bytes [*(r10-8)..*(r10-1)]
+	 * stack[1] represents bytes [*(r10-16)..*(r10-9)]
+	 * ...
+	 * stack[allocated_stack/8 - 1] represents [*(r10-allocated_stack)..*(r10-allocated_stack+7)]
+	 */
 	struct bpf_stack_state *stack;
+	/* Size of the current stack, in bytes. The stack state is tracked below, in
+	 * `stack`. allocated_stack is always a multiple of BPF_REG_SIZE.
+	 */
 	int allocated_stack;
 };
 
@@ -658,6 +668,10 @@ struct bpf_verifier_env {
 	int exception_callback_subprog;
 	bool explore_alu_limits;
 	bool allow_ptr_leaks;
+	/* Allow access to uninitialized stack memory. Writes with fixed offset are
+	 * always allowed, so this refers to reads (with fixed or variable offset),
+	 * to writes with variable offset and to indirect (helper) accesses.
+	 */
 	bool allow_uninit_stack;
 	bool bpf_capable;
 	bool bypass_spec_v1;
-- 
cgit v1.2.3


From 5b20755b7780464fea3e54af0af744258dcc2841 Mon Sep 17 00:00:00 2001
From: Masahiro Yamada <masahiroy@kernel.org>
Date: Sun, 26 Nov 2023 16:19:14 +0900
Subject: init: move THIS_MODULE from <linux/export.h> to <linux/init.h>

Commit f50169324df4 ("module.h: split out the EXPORT_SYMBOL into
export.h") appropriately separated EXPORT_SYMBOL into <linux/export.h>
because modules and EXPORT_SYMBOL are orthogonal; modules are symbol
consumers, while EXPORT_SYMBOL are used by symbol providers, which
may not be necessarily a module.

However, that commit also relocated THIS_MODULE. As explained in the
commit description, the intention was to define THIS_MODULE in a
lightweight header, but I do not believe <linux/export.h> was the
best location because EXPORT_SYMBOL and THIS_MODULE are unrelated.

Move it to another lightweight header, <linux/init.h>. The reason for
choosing <linux/init.h> is to make <linux/moduleparam.h> self-contained
without relying on <linux/linkage.h> incorrectly including
<linux/export.h>.

With this adjustment, the role of <linux/export.h> becomes clearer as
it only defines EXPORT_SYMBOL.

Signed-off-by: Masahiro Yamada <masahiroy@kernel.org>
Reviewed-by: Luis Chamberlain <mcgrof@kernel.org>
---
 include/linux/export.h | 18 ------------------
 include/linux/init.h   |  7 +++++++
 2 files changed, 7 insertions(+), 18 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/export.h b/include/linux/export.h
index 9911508a9604..0bbd02fd351d 100644
--- a/include/linux/export.h
+++ b/include/linux/export.h
@@ -6,15 +6,6 @@
 #include <linux/linkage.h>
 #include <linux/stringify.h>
 
-/*
- * Export symbols from the kernel to modules.  Forked from module.h
- * to reduce the amount of pointless cruft we feed to gcc when only
- * exporting a simple symbol or two.
- *
- * Try not to add #includes here.  It slows compilation and makes kernel
- * hackers place grumpy comments in header files.
- */
-
 /*
  * This comment block is used by fixdep. Please do not remove.
  *
@@ -23,15 +14,6 @@
  * side effect of the *.o build rule.
  */
 
-#ifndef __ASSEMBLY__
-#ifdef MODULE
-extern struct module __this_module;
-#define THIS_MODULE (&__this_module)
-#else
-#define THIS_MODULE ((struct module *)0)
-#endif
-#endif /* __ASSEMBLY__ */
-
 #ifdef CONFIG_64BIT
 #define __EXPORT_SYMBOL_REF(sym)			\
 	.balign 8				ASM_NL	\
diff --git a/include/linux/init.h b/include/linux/init.h
index 01b52c9c7526..3fa3f6241350 100644
--- a/include/linux/init.h
+++ b/include/linux/init.h
@@ -179,6 +179,13 @@ extern void (*late_time_init)(void);
 
 extern bool initcall_debug;
 
+#ifdef MODULE
+extern struct module __this_module;
+#define THIS_MODULE (&__this_module)
+#else
+#define THIS_MODULE ((struct module *)0)
+#endif
+
 #endif
   
 #ifndef MODULE
-- 
cgit v1.2.3


From b73f08bb7fe5a0901646ca5ceaa1e7a2d5ee6293 Mon Sep 17 00:00:00 2001
From: Nuno Sa <nuno.sa@analog.com>
Date: Thu, 7 Dec 2023 13:39:27 +0100
Subject: iio: adc: ad9467: fix scale setting

When reading in_voltage_scale we can get something like:

root@analog:/sys/bus/iio/devices/iio:device2# cat in_voltage_scale
0.038146

However, when reading the available options:

root@analog:/sys/bus/iio/devices/iio:device2# cat
in_voltage_scale_available
2000.000000 2100.000006 2200.000007 2300.000008 2400.000009 2500.000010

which does not make sense. Moreover, when trying to set a new scale we
get an error because there's no call to __ad9467_get_scale() to give us
values as given when reading in_voltage_scale. Fix it by computing the
available scales during probe and properly pass the list when
.read_available() is called.

While at it, change to use .read_available() from iio_info. Also note
that to properly fix this, adi-axi-adc.c has to be changed accordingly.

Fixes: ad6797120238 ("iio: adc: ad9467: add support AD9467 ADC")
Signed-off-by: Nuno Sa <nuno.sa@analog.com>
Reviewed-by: David Lechner <dlechner@baylibre.com>
Link: https://lore.kernel.org/r/20231207-iio-backend-prep-v2-4-a4a33bc4d70e@analog.com
Signed-off-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
---
 drivers/iio/adc/ad9467.c            | 47 +++++++++++++++++++++++
 drivers/iio/adc/adi-axi-adc.c       | 74 ++++++++-----------------------------
 include/linux/iio/adc/adi-axi-adc.h |  4 ++
 3 files changed, 66 insertions(+), 59 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/iio/adc/ad9467.c b/drivers/iio/adc/ad9467.c
index 104c6d394a3e..f668313730cb 100644
--- a/drivers/iio/adc/ad9467.c
+++ b/drivers/iio/adc/ad9467.c
@@ -120,6 +120,7 @@ struct ad9467_state {
 	struct spi_device		*spi;
 	struct clk			*clk;
 	unsigned int			output_mode;
+	unsigned int                    (*scales)[2];
 
 	struct gpio_desc		*pwrdown_gpio;
 	/* ensure consistent state obtained on multiple related accesses */
@@ -216,6 +217,7 @@ static void __ad9467_get_scale(struct adi_axi_adc_conv *conv, int index,
 	.channel = _chan,						\
 	.info_mask_shared_by_type = BIT(IIO_CHAN_INFO_SCALE) |		\
 		BIT(IIO_CHAN_INFO_SAMP_FREQ),				\
+	.info_mask_shared_by_type_available = BIT(IIO_CHAN_INFO_SCALE), \
 	.scan_index = _si,						\
 	.scan_type = {							\
 		.sign = _sign,						\
@@ -370,6 +372,26 @@ static int ad9467_write_raw(struct adi_axi_adc_conv *conv,
 	}
 }
 
+static int ad9467_read_avail(struct adi_axi_adc_conv *conv,
+			     struct iio_chan_spec const *chan,
+			     const int **vals, int *type, int *length,
+			     long mask)
+{
+	const struct adi_axi_adc_chip_info *info = conv->chip_info;
+	struct ad9467_state *st = adi_axi_adc_conv_priv(conv);
+
+	switch (mask) {
+	case IIO_CHAN_INFO_SCALE:
+		*vals = (const int *)st->scales;
+		*type = IIO_VAL_INT_PLUS_MICRO;
+		/* Values are stored in a 2D matrix */
+		*length = info->num_scales * 2;
+		return IIO_AVAIL_LIST;
+	default:
+		return -EINVAL;
+	}
+}
+
 static int ad9467_outputmode_set(struct spi_device *spi, unsigned int mode)
 {
 	int ret;
@@ -382,6 +404,26 @@ static int ad9467_outputmode_set(struct spi_device *spi, unsigned int mode)
 				AN877_ADC_TRANSFER_SYNC);
 }
 
+static int ad9467_scale_fill(struct adi_axi_adc_conv *conv)
+{
+	const struct adi_axi_adc_chip_info *info = conv->chip_info;
+	struct ad9467_state *st = adi_axi_adc_conv_priv(conv);
+	unsigned int i, val1, val2;
+
+	st->scales = devm_kmalloc_array(&st->spi->dev, info->num_scales,
+					sizeof(*st->scales), GFP_KERNEL);
+	if (!st->scales)
+		return -ENOMEM;
+
+	for (i = 0; i < info->num_scales; i++) {
+		__ad9467_get_scale(conv, i, &val1, &val2);
+		st->scales[i][0] = val1;
+		st->scales[i][1] = val2;
+	}
+
+	return 0;
+}
+
 static int ad9467_preenable_setup(struct adi_axi_adc_conv *conv)
 {
 	struct ad9467_state *st = adi_axi_adc_conv_priv(conv);
@@ -440,6 +482,10 @@ static int ad9467_probe(struct spi_device *spi)
 
 	conv->chip_info = &info->axi_adc_info;
 
+	ret = ad9467_scale_fill(conv);
+	if (ret)
+		return ret;
+
 	id = ad9467_spi_read(spi, AN877_ADC_REG_CHIP_ID);
 	if (id != conv->chip_info->id) {
 		dev_err(&spi->dev, "Mismatch CHIP_ID, got 0x%X, expected 0x%X\n",
@@ -450,6 +496,7 @@ static int ad9467_probe(struct spi_device *spi)
 	conv->reg_access = ad9467_reg_access;
 	conv->write_raw = ad9467_write_raw;
 	conv->read_raw = ad9467_read_raw;
+	conv->read_avail = ad9467_read_avail;
 	conv->preenable_setup = ad9467_preenable_setup;
 
 	st->output_mode = info->default_output_mode |
diff --git a/drivers/iio/adc/adi-axi-adc.c b/drivers/iio/adc/adi-axi-adc.c
index aff0532a974a..ae83ada7f9f2 100644
--- a/drivers/iio/adc/adi-axi-adc.c
+++ b/drivers/iio/adc/adi-axi-adc.c
@@ -144,6 +144,20 @@ static int adi_axi_adc_write_raw(struct iio_dev *indio_dev,
 	return conv->write_raw(conv, chan, val, val2, mask);
 }
 
+static int adi_axi_adc_read_avail(struct iio_dev *indio_dev,
+				  struct iio_chan_spec const *chan,
+				  const int **vals, int *type, int *length,
+				  long mask)
+{
+	struct adi_axi_adc_state *st = iio_priv(indio_dev);
+	struct adi_axi_adc_conv *conv = &st->client->conv;
+
+	if (!conv->read_avail)
+		return -EOPNOTSUPP;
+
+	return conv->read_avail(conv, chan, vals, type, length, mask);
+}
+
 static int adi_axi_adc_update_scan_mode(struct iio_dev *indio_dev,
 					const unsigned long *scan_mask)
 {
@@ -228,69 +242,11 @@ struct adi_axi_adc_conv *devm_adi_axi_adc_conv_register(struct device *dev,
 }
 EXPORT_SYMBOL_NS_GPL(devm_adi_axi_adc_conv_register, IIO_ADI_AXI);
 
-static ssize_t in_voltage_scale_available_show(struct device *dev,
-					       struct device_attribute *attr,
-					       char *buf)
-{
-	struct iio_dev *indio_dev = dev_to_iio_dev(dev);
-	struct adi_axi_adc_state *st = iio_priv(indio_dev);
-	struct adi_axi_adc_conv *conv = &st->client->conv;
-	size_t len = 0;
-	int i;
-
-	for (i = 0; i < conv->chip_info->num_scales; i++) {
-		const unsigned int *s = conv->chip_info->scale_table[i];
-
-		len += scnprintf(buf + len, PAGE_SIZE - len,
-				 "%u.%06u ", s[0], s[1]);
-	}
-	buf[len - 1] = '\n';
-
-	return len;
-}
-
-static IIO_DEVICE_ATTR_RO(in_voltage_scale_available, 0);
-
-enum {
-	ADI_AXI_ATTR_SCALE_AVAIL,
-};
-
-#define ADI_AXI_ATTR(_en_, _file_)			\
-	[ADI_AXI_ATTR_##_en_] = &iio_dev_attr_##_file_.dev_attr.attr
-
-static struct attribute *adi_axi_adc_attributes[] = {
-	ADI_AXI_ATTR(SCALE_AVAIL, in_voltage_scale_available),
-	NULL
-};
-
-static umode_t axi_adc_attr_is_visible(struct kobject *kobj,
-				       struct attribute *attr, int n)
-{
-	struct device *dev = kobj_to_dev(kobj);
-	struct iio_dev *indio_dev = dev_to_iio_dev(dev);
-	struct adi_axi_adc_state *st = iio_priv(indio_dev);
-	struct adi_axi_adc_conv *conv = &st->client->conv;
-
-	switch (n) {
-	case ADI_AXI_ATTR_SCALE_AVAIL:
-		if (!conv->chip_info->num_scales)
-			return 0;
-		return attr->mode;
-	default:
-		return attr->mode;
-	}
-}
-
-static const struct attribute_group adi_axi_adc_attribute_group = {
-	.attrs = adi_axi_adc_attributes,
-	.is_visible = axi_adc_attr_is_visible,
-};
-
 static const struct iio_info adi_axi_adc_info = {
 	.read_raw = &adi_axi_adc_read_raw,
 	.write_raw = &adi_axi_adc_write_raw,
-	.attrs = &adi_axi_adc_attribute_group,
 	.update_scan_mode = &adi_axi_adc_update_scan_mode,
+	.read_avail = &adi_axi_adc_read_avail,
 };
 
 static const struct adi_axi_adc_core_info adi_axi_adc_10_0_a_info = {
diff --git a/include/linux/iio/adc/adi-axi-adc.h b/include/linux/iio/adc/adi-axi-adc.h
index 52620e5b8052..b7904992d561 100644
--- a/include/linux/iio/adc/adi-axi-adc.h
+++ b/include/linux/iio/adc/adi-axi-adc.h
@@ -41,6 +41,7 @@ struct adi_axi_adc_chip_info {
  * @reg_access		IIO debugfs_reg_access hook for the client ADC
  * @read_raw		IIO read_raw hook for the client ADC
  * @write_raw		IIO write_raw hook for the client ADC
+ * @read_avail		IIO read_avail hook for the client ADC
  */
 struct adi_axi_adc_conv {
 	const struct adi_axi_adc_chip_info		*chip_info;
@@ -54,6 +55,9 @@ struct adi_axi_adc_conv {
 	int (*write_raw)(struct adi_axi_adc_conv *conv,
 			 struct iio_chan_spec const *chan,
 			 int val, int val2, long mask);
+	int (*read_avail)(struct adi_axi_adc_conv *conv,
+			  struct iio_chan_spec const *chan,
+			  const int **val, int *type, int *length, long mask);
 };
 
 struct adi_axi_adc_conv *devm_adi_axi_adc_conv_register(struct device *dev,
-- 
cgit v1.2.3


From 23e9f0138963ceef2a252d887534923a0502b2da Mon Sep 17 00:00:00 2001
From: Li Zhijian <lizhijian@fujitsu.com>
Date: Fri, 3 Nov 2023 11:14:50 +0800
Subject: mm/vmstat: move pgdemote_* to per-node stats

Demotion will migrate pages across nodes.  Previously, only the global
demotion statistics were accounted for.  Changed them to per-node
statistics, making it easier to observe where demotion occurs on each
node.

This will help to identify which nodes are under pressure.

This patch also make pgdemote_* behind CONFIG_NUMA_BALANCING, since
demotion is not available for !CONFIG_NUMA_BALANCING

With this patch, here is a sample where node0 node1 are DRAM,
node3 is PMEM:
Global stats:
$ grep demote /proc/vmstat
pgdemote_kswapd 254288
pgdemote_direct 113497
pgdemote_khugepaged 0

Per-node stats:
$ grep demote /sys/devices/system/node/node0/vmstat # demotion source
pgdemote_kswapd 68454
pgdemote_direct 83431
pgdemote_khugepaged 0
$ grep demote /sys/devices/system/node/node1/vmstat # demotion source
pgdemote_kswapd 185834
pgdemote_direct 30066
pgdemote_khugepaged 0
$ grep demote /sys/devices/system/node/node3/vmstat # demotion target
pgdemote_kswapd 0
pgdemote_direct 0
pgdemote_khugepaged 0

Link: https://lkml.kernel.org/r/20231103031450.1456523-1-lizhijian@fujitsu.com
Signed-off-by: Li Zhijian <lizhijian@fujitsu.com>
Acked-by: "Huang, Ying" <ying.huang@intel.com>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: "Rafael J. Wysocki" <rafael@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/mmzone.h        |  4 ++++
 include/linux/vm_event_item.h |  3 ---
 mm/vmscan.c                   | 12 ++++++++----
 mm/vmstat.c                   |  6 +++---
 4 files changed, 15 insertions(+), 10 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 3c25226beeed..14faffa4354f 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -206,6 +206,10 @@ enum node_stat_item {
 #ifdef CONFIG_NUMA_BALANCING
 	PGPROMOTE_SUCCESS,	/* promote successfully */
 	PGPROMOTE_CANDIDATE,	/* candidate pages to promote */
+	/* PGDEMOTE_*: pages demoted */
+	PGDEMOTE_KSWAPD,
+	PGDEMOTE_DIRECT,
+	PGDEMOTE_KHUGEPAGED,
 #endif
 	NR_VM_NODE_STAT_ITEMS
 };
diff --git a/include/linux/vm_event_item.h b/include/linux/vm_event_item.h
index 8abfa1240040..d1b847502f09 100644
--- a/include/linux/vm_event_item.h
+++ b/include/linux/vm_event_item.h
@@ -41,9 +41,6 @@ enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT,
 		PGSTEAL_KSWAPD,
 		PGSTEAL_DIRECT,
 		PGSTEAL_KHUGEPAGED,
-		PGDEMOTE_KSWAPD,
-		PGDEMOTE_DIRECT,
-		PGDEMOTE_KHUGEPAGED,
 		PGSCAN_KSWAPD,
 		PGSCAN_DIRECT,
 		PGSCAN_KHUGEPAGED,
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 506f8220c5fe..5dc581cac225 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -409,12 +409,14 @@ void drop_slab(void)
 
 static int reclaimer_offset(void)
 {
+#ifdef CONFIG_NUMA_BALANCING
 	BUILD_BUG_ON(PGSTEAL_DIRECT - PGSTEAL_KSWAPD !=
 			PGDEMOTE_DIRECT - PGDEMOTE_KSWAPD);
-	BUILD_BUG_ON(PGSTEAL_DIRECT - PGSTEAL_KSWAPD !=
-			PGSCAN_DIRECT - PGSCAN_KSWAPD);
 	BUILD_BUG_ON(PGSTEAL_KHUGEPAGED - PGSTEAL_KSWAPD !=
 			PGDEMOTE_KHUGEPAGED - PGDEMOTE_KSWAPD);
+#endif
+	BUILD_BUG_ON(PGSTEAL_DIRECT - PGSTEAL_KSWAPD !=
+			PGSCAN_DIRECT - PGSCAN_KSWAPD);
 	BUILD_BUG_ON(PGSTEAL_KHUGEPAGED - PGSTEAL_KSWAPD !=
 			PGSCAN_KHUGEPAGED - PGSCAN_KSWAPD);
 
@@ -976,8 +978,10 @@ static unsigned int demote_folio_list(struct list_head *demote_folios,
 	migrate_pages(demote_folios, alloc_demote_folio, NULL,
 		      (unsigned long)&mtc, MIGRATE_ASYNC, MR_DEMOTION,
 		      &nr_succeeded);
-
-	__count_vm_events(PGDEMOTE_KSWAPD + reclaimer_offset(), nr_succeeded);
+#ifdef CONFIG_NUMA_BALANCING
+	mod_node_page_state(pgdat, PGDEMOTE_KSWAPD + reclaimer_offset(),
+			    nr_succeeded);
+#endif
 
 	return nr_succeeded;
 }
diff --git a/mm/vmstat.c b/mm/vmstat.c
index 359460deb377..afa5a38fcc9c 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -1248,6 +1248,9 @@ const char * const vmstat_text[] = {
 #ifdef CONFIG_NUMA_BALANCING
 	"pgpromote_success",
 	"pgpromote_candidate",
+	"pgdemote_kswapd",
+	"pgdemote_direct",
+	"pgdemote_khugepaged",
 #endif
 
 	/* enum writeback_stat_item counters */
@@ -1279,9 +1282,6 @@ const char * const vmstat_text[] = {
 	"pgsteal_kswapd",
 	"pgsteal_direct",
 	"pgsteal_khugepaged",
-	"pgdemote_kswapd",
-	"pgdemote_direct",
-	"pgdemote_khugepaged",
 	"pgscan_kswapd",
 	"pgscan_direct",
 	"pgscan_khugepaged",
-- 
cgit v1.2.3


From b2472efe4316b2687c153919c1513a098bd82c17 Mon Sep 17 00:00:00 2001
From: Peng Zhang <zhangpeng.00@bytedance.com>
Date: Fri, 27 Oct 2023 11:38:37 +0800
Subject: maple_tree: introduce {mtree,mas}_lock_nested()

In some cases, nested locks may be needed, so {mtree,mas}_lock_nested is
introduced.  For example, when duplicating maple tree, we need to hold the
locks of two trees, in which case nested locks are needed.

At the same time, add the definition of spin_lock_nested() in tools for
testing.

Link: https://lkml.kernel.org/r/20231027033845.90608-3-zhangpeng.00@bytedance.com
Signed-off-by: Peng Zhang <zhangpeng.00@bytedance.com>
Reviewed-by: Liam R. Howlett <Liam.Howlett@oracle.com>
Cc: Christian Brauner <brauner@kernel.org>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Mateusz Guzik <mjguzik@gmail.com>
Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: Michael S. Tsirkin <mst@redhat.com>
Cc: Mike Christie <michael.christie@oracle.com>
Cc: Nicholas Piggin <npiggin@gmail.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Suren Baghdasaryan <surenb@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/maple_tree.h     | 4 ++++
 tools/include/linux/spinlock.h | 1 +
 2 files changed, 5 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/maple_tree.h b/include/linux/maple_tree.h
index d01e850b570f..f91dbc7fe091 100644
--- a/include/linux/maple_tree.h
+++ b/include/linux/maple_tree.h
@@ -256,6 +256,8 @@ struct maple_tree {
 	struct maple_tree name = MTREE_INIT(name, 0)
 
 #define mtree_lock(mt)		spin_lock((&(mt)->ma_lock))
+#define mtree_lock_nested(mas, subclass) \
+		spin_lock_nested((&(mt)->ma_lock), subclass)
 #define mtree_unlock(mt)	spin_unlock((&(mt)->ma_lock))
 
 /*
@@ -406,6 +408,8 @@ struct ma_wr_state {
 };
 
 #define mas_lock(mas)           spin_lock(&((mas)->tree->ma_lock))
+#define mas_lock_nested(mas, subclass) \
+		spin_lock_nested(&((mas)->tree->ma_lock), subclass)
 #define mas_unlock(mas)         spin_unlock(&((mas)->tree->ma_lock))
 
 
diff --git a/tools/include/linux/spinlock.h b/tools/include/linux/spinlock.h
index 622266b197d0..a6cdf25b6b9d 100644
--- a/tools/include/linux/spinlock.h
+++ b/tools/include/linux/spinlock.h
@@ -11,6 +11,7 @@
 #define spin_lock_init(x)	pthread_mutex_init(x, NULL)
 
 #define spin_lock(x)			pthread_mutex_lock(x)
+#define spin_lock_nested(x, subclass)	pthread_mutex_lock(x)
 #define spin_unlock(x)			pthread_mutex_unlock(x)
 #define spin_lock_bh(x)			pthread_mutex_lock(x)
 #define spin_unlock_bh(x)		pthread_mutex_unlock(x)
-- 
cgit v1.2.3


From fd32e4e9b7646510ee9010e0d5f8b8857d48a6f7 Mon Sep 17 00:00:00 2001
From: Peng Zhang <zhangpeng.00@bytedance.com>
Date: Fri, 27 Oct 2023 11:38:38 +0800
Subject: maple_tree: introduce interfaces __mt_dup() and mtree_dup()

Introduce interfaces __mt_dup() and mtree_dup(), which are used to
duplicate a maple tree.  They duplicate a maple tree in Depth-First Search
(DFS) pre-order traversal.  It uses memcopy() to copy nodes in the source
tree and allocate new child nodes in non-leaf nodes.  The new node is
exactly the same as the source node except for all the addresses stored in
it.  It will be faster than traversing all elements in the source tree and
inserting them one by one into the new tree.  The time complexity of these
two functions is O(n).

The difference between __mt_dup() and mtree_dup() is that mtree_dup()
handles locks internally.

Analysis of the average time complexity of this algorithm:

For simplicity, let's assume that the maximum branching factor of all
non-leaf nodes is 16 (in allocation mode, it is 10), and the tree is a
full tree.

Under the given conditions, if there is a maple tree with n elements, the
number of its leaves is n/16.  From bottom to top, the number of nodes in
each level is 1/16 of the number of nodes in the level below.  So the
total number of nodes in the entire tree is given by the sum of n/16 +
n/16^2 + n/16^3 + ...  + 1.  This is a geometric series, and it has log(n)
terms with base 16.  According to the formula for the sum of a geometric
series, the sum of this series can be calculated as (n-1)/15.  Each node
has only one parent node pointer, which can be considered as an edge.  In
total, there are (n-1)/15-1 edges.

This algorithm consists of two operations:

1. Traversing all nodes in DFS order.
2. For each node, making a copy and performing necessary modifications
   to create a new node.

For the first part, DFS traversal will visit each edge twice.  Let
T(ascend) represent the cost of taking one step downwards, and T(descend)
represent the cost of taking one step upwards.  And both of them are
constants (although mas_ascend() may not be, as it contains a loop, but
here we ignore it and treat it as a constant).  So the time spent on the
first part can be represented as ((n-1)/15-1) * (T(ascend) + T(descend)).

For the second part, each node will be copied, and the cost of copying a
node is denoted as T(copy_node).  For each non-leaf node, it is necessary
to reallocate all child nodes, and the cost of this operation is denoted
as T(dup_alloc).  The behavior behind memory allocation is complex and not
specific to the maple tree operation.  Here, we assume that the time
required for a single allocation is constant.  Since the size of a node is
fixed, both of these symbols are also constants.  We can calculate that
the time spent on the second part is ((n-1)/15) * T(copy_node) + ((n-1)/15
- n/16) * T(dup_alloc).

Adding both parts together, the total time spent by the algorithm can be
represented as:

((n-1)/15) * (T(ascend) + T(descend) + T(copy_node) + T(dup_alloc)) -
n/16 * T(dup_alloc) - (T(ascend) + T(descend))

Let C1 = T(ascend) + T(descend) + T(copy_node) + T(dup_alloc)
Let C2 = T(dup_alloc)
Let C3 = T(ascend) + T(descend)

Finally, the expression can be simplified as:
((16 * C1 - 15 * C2) / (15 * 16)) * n - (C1 / 15 + C3).

This is a linear function, so the average time complexity is O(n).

Link: https://lkml.kernel.org/r/20231027033845.90608-4-zhangpeng.00@bytedance.com
Signed-off-by: Peng Zhang <zhangpeng.00@bytedance.com>
Suggested-by: Liam R. Howlett <Liam.Howlett@oracle.com>
Cc: Christian Brauner <brauner@kernel.org>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Mateusz Guzik <mjguzik@gmail.com>
Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: Michael S. Tsirkin <mst@redhat.com>
Cc: Mike Christie <michael.christie@oracle.com>
Cc: Nicholas Piggin <npiggin@gmail.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Suren Baghdasaryan <surenb@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/maple_tree.h |   3 +
 lib/maple_tree.c           | 274 +++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 277 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/maple_tree.h b/include/linux/maple_tree.h
index f91dbc7fe091..a452dd8a1e5c 100644
--- a/include/linux/maple_tree.h
+++ b/include/linux/maple_tree.h
@@ -329,6 +329,9 @@ int mtree_store(struct maple_tree *mt, unsigned long index,
 		void *entry, gfp_t gfp);
 void *mtree_erase(struct maple_tree *mt, unsigned long index);
 
+int mtree_dup(struct maple_tree *mt, struct maple_tree *new, gfp_t gfp);
+int __mt_dup(struct maple_tree *mt, struct maple_tree *new, gfp_t gfp);
+
 void mtree_destroy(struct maple_tree *mt);
 void __mt_destroy(struct maple_tree *mt);
 
diff --git a/lib/maple_tree.c b/lib/maple_tree.c
index ca7039633844..718a222cc090 100644
--- a/lib/maple_tree.c
+++ b/lib/maple_tree.c
@@ -4,6 +4,8 @@
  * Copyright (c) 2018-2022 Oracle Corporation
  * Authors: Liam R. Howlett <Liam.Howlett@oracle.com>
  *	    Matthew Wilcox <willy@infradead.org>
+ * Copyright (c) 2023 ByteDance
+ * Author: Peng Zhang <zhangpeng.00@bytedance.com>
  */
 
 /*
@@ -6475,6 +6477,278 @@ void *mtree_erase(struct maple_tree *mt, unsigned long index)
 }
 EXPORT_SYMBOL(mtree_erase);
 
+/*
+ * mas_dup_free() - Free an incomplete duplication of a tree.
+ * @mas: The maple state of a incomplete tree.
+ *
+ * The parameter @mas->node passed in indicates that the allocation failed on
+ * this node. This function frees all nodes starting from @mas->node in the
+ * reverse order of mas_dup_build(). There is no need to hold the source tree
+ * lock at this time.
+ */
+static void mas_dup_free(struct ma_state *mas)
+{
+	struct maple_node *node;
+	enum maple_type type;
+	void __rcu **slots;
+	unsigned char count, i;
+
+	/* Maybe the first node allocation failed. */
+	if (mas_is_none(mas))
+		return;
+
+	while (!mte_is_root(mas->node)) {
+		mas_ascend(mas);
+		if (mas->offset) {
+			mas->offset--;
+			do {
+				mas_descend(mas);
+				mas->offset = mas_data_end(mas);
+			} while (!mte_is_leaf(mas->node));
+
+			mas_ascend(mas);
+		}
+
+		node = mte_to_node(mas->node);
+		type = mte_node_type(mas->node);
+		slots = ma_slots(node, type);
+		count = mas_data_end(mas) + 1;
+		for (i = 0; i < count; i++)
+			((unsigned long *)slots)[i] &= ~MAPLE_NODE_MASK;
+		mt_free_bulk(count, slots);
+	}
+
+	node = mte_to_node(mas->node);
+	mt_free_one(node);
+}
+
+/*
+ * mas_copy_node() - Copy a maple node and replace the parent.
+ * @mas: The maple state of source tree.
+ * @new_mas: The maple state of new tree.
+ * @parent: The parent of the new node.
+ *
+ * Copy @mas->node to @new_mas->node, set @parent to be the parent of
+ * @new_mas->node. If memory allocation fails, @mas is set to -ENOMEM.
+ */
+static inline void mas_copy_node(struct ma_state *mas, struct ma_state *new_mas,
+		struct maple_pnode *parent)
+{
+	struct maple_node *node = mte_to_node(mas->node);
+	struct maple_node *new_node = mte_to_node(new_mas->node);
+	unsigned long val;
+
+	/* Copy the node completely. */
+	memcpy(new_node, node, sizeof(struct maple_node));
+	/* Update the parent node pointer. */
+	val = (unsigned long)node->parent & MAPLE_NODE_MASK;
+	new_node->parent = ma_parent_ptr(val | (unsigned long)parent);
+}
+
+/*
+ * mas_dup_alloc() - Allocate child nodes for a maple node.
+ * @mas: The maple state of source tree.
+ * @new_mas: The maple state of new tree.
+ * @gfp: The GFP_FLAGS to use for allocations.
+ *
+ * This function allocates child nodes for @new_mas->node during the duplication
+ * process. If memory allocation fails, @mas is set to -ENOMEM.
+ */
+static inline void mas_dup_alloc(struct ma_state *mas, struct ma_state *new_mas,
+		gfp_t gfp)
+{
+	struct maple_node *node = mte_to_node(mas->node);
+	struct maple_node *new_node = mte_to_node(new_mas->node);
+	enum maple_type type;
+	unsigned char request, count, i;
+	void __rcu **slots;
+	void __rcu **new_slots;
+	unsigned long val;
+
+	/* Allocate memory for child nodes. */
+	type = mte_node_type(mas->node);
+	new_slots = ma_slots(new_node, type);
+	request = mas_data_end(mas) + 1;
+	count = mt_alloc_bulk(gfp, request, (void **)new_slots);
+	if (unlikely(count < request)) {
+		memset(new_slots, 0, request * sizeof(void *));
+		mas_set_err(mas, -ENOMEM);
+		return;
+	}
+
+	/* Restore node type information in slots. */
+	slots = ma_slots(node, type);
+	for (i = 0; i < count; i++) {
+		val = (unsigned long)mt_slot_locked(mas->tree, slots, i);
+		val &= MAPLE_NODE_MASK;
+		((unsigned long *)new_slots)[i] |= val;
+	}
+}
+
+/*
+ * mas_dup_build() - Build a new maple tree from a source tree
+ * @mas: The maple state of source tree, need to be in MAS_START state.
+ * @new_mas: The maple state of new tree, need to be in MAS_START state.
+ * @gfp: The GFP_FLAGS to use for allocations.
+ *
+ * This function builds a new tree in DFS preorder. If the memory allocation
+ * fails, the error code -ENOMEM will be set in @mas, and @new_mas points to the
+ * last node. mas_dup_free() will free the incomplete duplication of a tree.
+ *
+ * Note that the attributes of the two trees need to be exactly the same, and the
+ * new tree needs to be empty, otherwise -EINVAL will be set in @mas.
+ */
+static inline void mas_dup_build(struct ma_state *mas, struct ma_state *new_mas,
+		gfp_t gfp)
+{
+	struct maple_node *node;
+	struct maple_pnode *parent = NULL;
+	struct maple_enode *root;
+	enum maple_type type;
+
+	if (unlikely(mt_attr(mas->tree) != mt_attr(new_mas->tree)) ||
+	    unlikely(!mtree_empty(new_mas->tree))) {
+		mas_set_err(mas, -EINVAL);
+		return;
+	}
+
+	root = mas_start(mas);
+	if (mas_is_ptr(mas) || mas_is_none(mas))
+		goto set_new_tree;
+
+	node = mt_alloc_one(gfp);
+	if (!node) {
+		new_mas->node = MAS_NONE;
+		mas_set_err(mas, -ENOMEM);
+		return;
+	}
+
+	type = mte_node_type(mas->node);
+	root = mt_mk_node(node, type);
+	new_mas->node = root;
+	new_mas->min = 0;
+	new_mas->max = ULONG_MAX;
+	root = mte_mk_root(root);
+	while (1) {
+		mas_copy_node(mas, new_mas, parent);
+		if (!mte_is_leaf(mas->node)) {
+			/* Only allocate child nodes for non-leaf nodes. */
+			mas_dup_alloc(mas, new_mas, gfp);
+			if (unlikely(mas_is_err(mas)))
+				return;
+		} else {
+			/*
+			 * This is the last leaf node and duplication is
+			 * completed.
+			 */
+			if (mas->max == ULONG_MAX)
+				goto done;
+
+			/* This is not the last leaf node and needs to go up. */
+			do {
+				mas_ascend(mas);
+				mas_ascend(new_mas);
+			} while (mas->offset == mas_data_end(mas));
+
+			/* Move to the next subtree. */
+			mas->offset++;
+			new_mas->offset++;
+		}
+
+		mas_descend(mas);
+		parent = ma_parent_ptr(mte_to_node(new_mas->node));
+		mas_descend(new_mas);
+		mas->offset = 0;
+		new_mas->offset = 0;
+	}
+done:
+	/* Specially handle the parent of the root node. */
+	mte_to_node(root)->parent = ma_parent_ptr(mas_tree_parent(new_mas));
+set_new_tree:
+	/* Make them the same height */
+	new_mas->tree->ma_flags = mas->tree->ma_flags;
+	rcu_assign_pointer(new_mas->tree->ma_root, root);
+}
+
+/**
+ * __mt_dup(): Duplicate an entire maple tree
+ * @mt: The source maple tree
+ * @new: The new maple tree
+ * @gfp: The GFP_FLAGS to use for allocations
+ *
+ * This function duplicates a maple tree in Depth-First Search (DFS) pre-order
+ * traversal. It uses memcpy() to copy nodes in the source tree and allocate
+ * new child nodes in non-leaf nodes. The new node is exactly the same as the
+ * source node except for all the addresses stored in it. It will be faster than
+ * traversing all elements in the source tree and inserting them one by one into
+ * the new tree.
+ * The user needs to ensure that the attributes of the source tree and the new
+ * tree are the same, and the new tree needs to be an empty tree, otherwise
+ * -EINVAL will be returned.
+ * Note that the user needs to manually lock the source tree and the new tree.
+ *
+ * Return: 0 on success, -ENOMEM if memory could not be allocated, -EINVAL If
+ * the attributes of the two trees are different or the new tree is not an empty
+ * tree.
+ */
+int __mt_dup(struct maple_tree *mt, struct maple_tree *new, gfp_t gfp)
+{
+	int ret = 0;
+	MA_STATE(mas, mt, 0, 0);
+	MA_STATE(new_mas, new, 0, 0);
+
+	mas_dup_build(&mas, &new_mas, gfp);
+	if (unlikely(mas_is_err(&mas))) {
+		ret = xa_err(mas.node);
+		if (ret == -ENOMEM)
+			mas_dup_free(&new_mas);
+	}
+
+	return ret;
+}
+EXPORT_SYMBOL(__mt_dup);
+
+/**
+ * mtree_dup(): Duplicate an entire maple tree
+ * @mt: The source maple tree
+ * @new: The new maple tree
+ * @gfp: The GFP_FLAGS to use for allocations
+ *
+ * This function duplicates a maple tree in Depth-First Search (DFS) pre-order
+ * traversal. It uses memcpy() to copy nodes in the source tree and allocate
+ * new child nodes in non-leaf nodes. The new node is exactly the same as the
+ * source node except for all the addresses stored in it. It will be faster than
+ * traversing all elements in the source tree and inserting them one by one into
+ * the new tree.
+ * The user needs to ensure that the attributes of the source tree and the new
+ * tree are the same, and the new tree needs to be an empty tree, otherwise
+ * -EINVAL will be returned.
+ *
+ * Return: 0 on success, -ENOMEM if memory could not be allocated, -EINVAL If
+ * the attributes of the two trees are different or the new tree is not an empty
+ * tree.
+ */
+int mtree_dup(struct maple_tree *mt, struct maple_tree *new, gfp_t gfp)
+{
+	int ret = 0;
+	MA_STATE(mas, mt, 0, 0);
+	MA_STATE(new_mas, new, 0, 0);
+
+	mas_lock(&new_mas);
+	mas_lock_nested(&mas, SINGLE_DEPTH_NESTING);
+	mas_dup_build(&mas, &new_mas, gfp);
+	mas_unlock(&mas);
+	if (unlikely(mas_is_err(&mas))) {
+		ret = xa_err(mas.node);
+		if (ret == -ENOMEM)
+			mas_dup_free(&new_mas);
+	}
+
+	mas_unlock(&new_mas);
+	return ret;
+}
+EXPORT_SYMBOL(mtree_dup);
+
 /**
  * __mt_destroy() - Walk and free all nodes of a locked maple tree.
  * @mt: The maple tree
-- 
cgit v1.2.3


From d2406291483775ecddaee929231a39c70c08fda2 Mon Sep 17 00:00:00 2001
From: Peng Zhang <zhangpeng.00@bytedance.com>
Date: Fri, 27 Oct 2023 11:38:45 +0800
Subject: fork: use __mt_dup() to duplicate maple tree in dup_mmap()

In dup_mmap(), using __mt_dup() to duplicate the old maple tree and then
directly replacing the entries of VMAs in the new maple tree can result in
better performance.  __mt_dup() uses DFS pre-order to duplicate the maple
tree, so it is efficient.

The average time complexity of __mt_dup() is O(n), where n is the number
of VMAs.  The proof of the time complexity is provided in the commit log
that introduces __mt_dup().  After duplicating the maple tree, each
element is traversed and replaced (ignoring the cases of deletion, which
are rare).  Since it is only a replacement operation for each element,
this process is also O(n).

Analyzing the exact time complexity of the previous algorithm is
challenging because each insertion can involve appending to a node,
pushing data to adjacent nodes, or even splitting nodes.  The frequency of
each action is difficult to calculate.  The worst-case scenario for a
single insertion is when the tree undergoes splitting at every level.  If
we consider each insertion as the worst-case scenario, we can determine
that the upper bound of the time complexity is O(n*log(n)), although this
is a loose upper bound.  However, based on the test data, it appears that
the actual time complexity is likely to be O(n).

As the entire maple tree is duplicated using __mt_dup(), if dup_mmap()
fails, there will be a portion of VMAs that have not been duplicated in
the maple tree.  To handle this, we mark the failure point with
XA_ZERO_ENTRY.  In exit_mmap(), if this marker is encountered, stop
releasing VMAs that have not been duplicated after this point.

There is a "spawn" in byte-unixbench[1], which can be used to test the
performance of fork().  I modified it slightly to make it work with
different number of VMAs.

Below are the test results.  The first row shows the number of VMAs.  The
second and third rows show the number of fork() calls per ten seconds,
corresponding to next-20231006 and the this patchset, respectively.  The
test results were obtained with CPU binding to avoid scheduler load
balancing that could cause unstable results.  There are still some
fluctuations in the test results, but at least they are better than the
original performance.

21     121   221    421    821    1621   3221   6421   12821  25621  51221
112100 76261 54227  34035  20195  11112  6017   3161   1606   802    393
114558 83067 65008  45824  28751  16072  8922   4747   2436   1233   599
2.19%  8.92% 19.88% 34.64% 42.37% 44.64% 48.28% 50.17% 51.68% 53.74% 52.42%

[1] https://github.com/kdlucas/byte-unixbench/tree/master

Link: https://lkml.kernel.org/r/20231027033845.90608-11-zhangpeng.00@bytedance.com
Signed-off-by: Peng Zhang <zhangpeng.00@bytedance.com>
Suggested-by: Liam R. Howlett <Liam.Howlett@oracle.com>
Reviewed-by: Liam R. Howlett <Liam.Howlett@oracle.com>
Cc: Christian Brauner <brauner@kernel.org>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Mateusz Guzik <mjguzik@gmail.com>
Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: Michael S. Tsirkin <mst@redhat.com>
Cc: Mike Christie <michael.christie@oracle.com>
Cc: Nicholas Piggin <npiggin@gmail.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Suren Baghdasaryan <surenb@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/mm.h | 11 +++++++++++
 kernel/fork.c      | 40 +++++++++++++++++++++++++++++-----------
 mm/internal.h      | 11 -----------
 mm/memory.c        |  7 ++++++-
 mm/mmap.c          |  9 ++++++---
 5 files changed, 52 insertions(+), 26 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 418d26608ece..64cd1ee4aacc 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -994,6 +994,17 @@ static inline int vma_iter_bulk_alloc(struct vma_iterator *vmi,
 	return mas_expected_entries(&vmi->mas, count);
 }
 
+static inline int vma_iter_clear_gfp(struct vma_iterator *vmi,
+			unsigned long start, unsigned long end, gfp_t gfp)
+{
+	__mas_set_range(&vmi->mas, start, end - 1);
+	mas_store_gfp(&vmi->mas, NULL, gfp);
+	if (unlikely(mas_is_err(&vmi->mas)))
+		return -ENOMEM;
+
+	return 0;
+}
+
 /* Free any unused preallocations */
 static inline void vma_iter_free(struct vma_iterator *vmi)
 {
diff --git a/kernel/fork.c b/kernel/fork.c
index 10917c3e1f03..93924392a5c3 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -650,7 +650,6 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm,
 	int retval;
 	unsigned long charge = 0;
 	LIST_HEAD(uf);
-	VMA_ITERATOR(old_vmi, oldmm, 0);
 	VMA_ITERATOR(vmi, mm, 0);
 
 	uprobe_start_dup_mmap();
@@ -678,16 +677,22 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm,
 		goto out;
 	khugepaged_fork(mm, oldmm);
 
-	retval = vma_iter_bulk_alloc(&vmi, oldmm->map_count);
-	if (retval)
+	/* Use __mt_dup() to efficiently build an identical maple tree. */
+	retval = __mt_dup(&oldmm->mm_mt, &mm->mm_mt, GFP_KERNEL);
+	if (unlikely(retval))
 		goto out;
 
 	mt_clear_in_rcu(vmi.mas.tree);
-	for_each_vma(old_vmi, mpnt) {
+	for_each_vma(vmi, mpnt) {
 		struct file *file;
 
 		vma_start_write(mpnt);
 		if (mpnt->vm_flags & VM_DONTCOPY) {
+			retval = vma_iter_clear_gfp(&vmi, mpnt->vm_start,
+						    mpnt->vm_end, GFP_KERNEL);
+			if (retval)
+				goto loop_out;
+
 			vm_stat_account(mm, mpnt->vm_flags, -vma_pages(mpnt));
 			continue;
 		}
@@ -749,9 +754,11 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm,
 		if (is_vm_hugetlb_page(tmp))
 			hugetlb_dup_vma_private(tmp);
 
-		/* Link the vma into the MT */
-		if (vma_iter_bulk_store(&vmi, tmp))
-			goto fail_nomem_vmi_store;
+		/*
+		 * Link the vma into the MT. After using __mt_dup(), memory
+		 * allocation is not necessary here, so it cannot fail.
+		 */
+		vma_iter_bulk_store(&vmi, tmp);
 
 		mm->map_count++;
 		if (!(tmp->vm_flags & VM_WIPEONFORK))
@@ -760,15 +767,28 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm,
 		if (tmp->vm_ops && tmp->vm_ops->open)
 			tmp->vm_ops->open(tmp);
 
-		if (retval)
+		if (retval) {
+			mpnt = vma_next(&vmi);
 			goto loop_out;
+		}
 	}
 	/* a new mm has just been created */
 	retval = arch_dup_mmap(oldmm, mm);
 loop_out:
 	vma_iter_free(&vmi);
-	if (!retval)
+	if (!retval) {
 		mt_set_in_rcu(vmi.mas.tree);
+	} else if (mpnt) {
+		/*
+		 * The entire maple tree has already been duplicated. If the
+		 * mmap duplication fails, mark the failure point with
+		 * XA_ZERO_ENTRY. In exit_mmap(), if this marker is encountered,
+		 * stop releasing VMAs that have not been duplicated after this
+		 * point.
+		 */
+		mas_set_range(&vmi.mas, mpnt->vm_start, mpnt->vm_end - 1);
+		mas_store(&vmi.mas, XA_ZERO_ENTRY);
+	}
 out:
 	mmap_write_unlock(mm);
 	flush_tlb_mm(oldmm);
@@ -778,8 +798,6 @@ fail_uprobe_end:
 	uprobe_end_dup_mmap();
 	return retval;
 
-fail_nomem_vmi_store:
-	unlink_anon_vmas(tmp);
 fail_nomem_anon_vma_fork:
 	mpol_put(vma_policy(tmp));
 fail_nomem_policy:
diff --git a/mm/internal.h b/mm/internal.h
index b61034bd50f5..89a5a794d68f 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -1154,17 +1154,6 @@ static inline void vma_iter_clear(struct vma_iterator *vmi)
 	mas_store_prealloc(&vmi->mas, NULL);
 }
 
-static inline int vma_iter_clear_gfp(struct vma_iterator *vmi,
-			unsigned long start, unsigned long end, gfp_t gfp)
-{
-	__mas_set_range(&vmi->mas, start, end - 1);
-	mas_store_gfp(&vmi->mas, NULL, gfp);
-	if (unlikely(mas_is_err(&vmi->mas)))
-		return -ENOMEM;
-
-	return 0;
-}
-
 static inline struct vm_area_struct *vma_iter_load(struct vma_iterator *vmi)
 {
 	return mas_walk(&vmi->mas);
diff --git a/mm/memory.c b/mm/memory.c
index 5c757fba8858..a7025ed5c65b 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -374,6 +374,8 @@ void free_pgtables(struct mmu_gather *tlb, struct ma_state *mas,
 		 * be 0.  This will underflow and is okay.
 		 */
 		next = mas_find(mas, ceiling - 1);
+		if (unlikely(xa_is_zero(next)))
+			next = NULL;
 
 		/*
 		 * Hide vma from rmap and truncate_pagecache before freeing
@@ -395,6 +397,8 @@ void free_pgtables(struct mmu_gather *tlb, struct ma_state *mas,
 			       && !is_vm_hugetlb_page(next)) {
 				vma = next;
 				next = mas_find(mas, ceiling - 1);
+				if (unlikely(xa_is_zero(next)))
+					next = NULL;
 				if (mm_wr_locked)
 					vma_start_write(vma);
 				unlink_anon_vmas(vma);
@@ -1744,7 +1748,8 @@ void unmap_vmas(struct mmu_gather *tlb, struct ma_state *mas,
 		unmap_single_vma(tlb, vma, start, end, &details,
 				 mm_wr_locked);
 		hugetlb_zap_end(vma, &details);
-	} while ((vma = mas_find(mas, tree_end - 1)) != NULL);
+		vma = mas_find(mas, tree_end - 1);
+	} while (vma && likely(!xa_is_zero(vma)));
 	mmu_notifier_invalidate_range_end(&range);
 }
 
diff --git a/mm/mmap.c b/mm/mmap.c
index 1971bfffcc03..4f1cb814586d 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -3294,10 +3294,11 @@ void exit_mmap(struct mm_struct *mm)
 	arch_exit_mmap(mm);
 
 	vma = mas_find(&mas, ULONG_MAX);
-	if (!vma) {
+	if (!vma || unlikely(xa_is_zero(vma))) {
 		/* Can happen if dup_mmap() received an OOM */
 		mmap_read_unlock(mm);
-		return;
+		mmap_write_lock(mm);
+		goto destroy;
 	}
 
 	lru_add_drain();
@@ -3332,11 +3333,13 @@ void exit_mmap(struct mm_struct *mm)
 		remove_vma(vma, true);
 		count++;
 		cond_resched();
-	} while ((vma = mas_find(&mas, ULONG_MAX)) != NULL);
+		vma = mas_find(&mas, ULONG_MAX);
+	} while (vma && likely(!xa_is_zero(vma)));
 
 	BUG_ON(count != mm->map_count);
 
 	trace_exit_mmap(mm);
+destroy:
 	__mt_destroy(&mm->mm_mt);
 	mmap_write_unlock(mm);
 	vm_unacct_memory(nr_accounted);
-- 
cgit v1.2.3


From ff6c3d81f2e86b63a3a530683f89ef393882782a Mon Sep 17 00:00:00 2001
From: Liam Ni <zhiguangni01@gmail.com>
Date: Thu, 26 Oct 2023 10:03:29 +0800
Subject: NUMA: optimize detection of memory with no node id assigned by
 firmware

Sanity check that makes sure the nodes cover all memory loops over
numa_meminfo to count the pages that have node id assigned by the
firmware, then loops again over memblock.memory to find the total amount
of memory and in the end checks that the difference between the total
memory and memory that covered by nodes is less than some threshold.
Worse, the loop over numa_meminfo calls __absent_pages_in_range() that
also partially traverses memblock.memory.

It's much simpler and more efficient to have a single traversal of
memblock.memory that verifies that amount of memory not covered by nodes
is less than a threshold.

Introduce memblock_validate_numa_coverage() that does exactly that and use
it instead of numa_meminfo_cover_memory().

Link: https://lkml.kernel.org/r/20231026020329.327329-1-zhiguangni01@gmail.com
Signed-off-by: Liam Ni <zhiguangni01@gmail.com>
Reviewed-by: Mike Rapoport (IBM) <rppt@kernel.org>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Bibo Mao <maobibo@loongson.cn>
Cc: Binbin Zhou <zhoubinbin@loongson.cn>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: Feiyang Chen <chenfeiyang@loongson.cn>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Huacai Chen <chenhuacai@kernel.org>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: WANG Xuerui <kernel@xen0n.name>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 arch/loongarch/kernel/numa.c | 28 +---------------------------
 arch/x86/mm/numa.c           | 34 ++--------------------------------
 include/linux/memblock.h     |  1 +
 mm/memblock.c                | 34 ++++++++++++++++++++++++++++++++++
 4 files changed, 38 insertions(+), 59 deletions(-)

(limited to 'include/linux')

diff --git a/arch/loongarch/kernel/numa.c b/arch/loongarch/kernel/numa.c
index 6e65ff12d5c7..8fe21f868f72 100644
--- a/arch/loongarch/kernel/numa.c
+++ b/arch/loongarch/kernel/numa.c
@@ -226,32 +226,6 @@ static void __init node_mem_init(unsigned int node)
 
 #ifdef CONFIG_ACPI_NUMA
 
-/*
- * Sanity check to catch more bad NUMA configurations (they are amazingly
- * common).  Make sure the nodes cover all memory.
- */
-static bool __init numa_meminfo_cover_memory(const struct numa_meminfo *mi)
-{
-	int i;
-	u64 numaram, biosram;
-
-	numaram = 0;
-	for (i = 0; i < mi->nr_blks; i++) {
-		u64 s = mi->blk[i].start >> PAGE_SHIFT;
-		u64 e = mi->blk[i].end >> PAGE_SHIFT;
-
-		numaram += e - s;
-		numaram -= __absent_pages_in_range(mi->blk[i].nid, s, e);
-		if ((s64)numaram < 0)
-			numaram = 0;
-	}
-	max_pfn = max_low_pfn;
-	biosram = max_pfn - absent_pages_in_range(0, max_pfn);
-
-	BUG_ON((s64)(biosram - numaram) >= (1 << (20 - PAGE_SHIFT)));
-	return true;
-}
-
 static void __init add_node_intersection(u32 node, u64 start, u64 size, u32 type)
 {
 	static unsigned long num_physpages;
@@ -396,7 +370,7 @@ int __init init_numa_memory(void)
 		return -EINVAL;
 
 	init_node_memblock();
-	if (numa_meminfo_cover_memory(&numa_meminfo) == false)
+	if (!memblock_validate_numa_coverage(SZ_1M))
 		return -EINVAL;
 
 	for_each_node_mask(node, node_possible_map) {
diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c
index b29ceb19e46e..adc497b93f03 100644
--- a/arch/x86/mm/numa.c
+++ b/arch/x86/mm/numa.c
@@ -449,37 +449,6 @@ int __node_distance(int from, int to)
 }
 EXPORT_SYMBOL(__node_distance);
 
-/*
- * Sanity check to catch more bad NUMA configurations (they are amazingly
- * common).  Make sure the nodes cover all memory.
- */
-static bool __init numa_meminfo_cover_memory(const struct numa_meminfo *mi)
-{
-	u64 numaram, e820ram;
-	int i;
-
-	numaram = 0;
-	for (i = 0; i < mi->nr_blks; i++) {
-		u64 s = mi->blk[i].start >> PAGE_SHIFT;
-		u64 e = mi->blk[i].end >> PAGE_SHIFT;
-		numaram += e - s;
-		numaram -= __absent_pages_in_range(mi->blk[i].nid, s, e);
-		if ((s64)numaram < 0)
-			numaram = 0;
-	}
-
-	e820ram = max_pfn - absent_pages_in_range(0, max_pfn);
-
-	/* We seem to lose 3 pages somewhere. Allow 1M of slack. */
-	if ((s64)(e820ram - numaram) >= (1 << (20 - PAGE_SHIFT))) {
-		printk(KERN_ERR "NUMA: nodes only cover %LuMB of your %LuMB e820 RAM. Not used.\n",
-		       (numaram << PAGE_SHIFT) >> 20,
-		       (e820ram << PAGE_SHIFT) >> 20);
-		return false;
-	}
-	return true;
-}
-
 /*
  * Mark all currently memblock-reserved physical memory (which covers the
  * kernel's own memory ranges) as hot-unswappable.
@@ -585,7 +554,8 @@ static int __init numa_register_memblks(struct numa_meminfo *mi)
 			return -EINVAL;
 		}
 	}
-	if (!numa_meminfo_cover_memory(mi))
+
+	if (!memblock_validate_numa_coverage(SZ_1M))
 		return -EINVAL;
 
 	/* Finally register nodes. */
diff --git a/include/linux/memblock.h b/include/linux/memblock.h
index ae3bde302f70..b695f9e946da 100644
--- a/include/linux/memblock.h
+++ b/include/linux/memblock.h
@@ -123,6 +123,7 @@ int memblock_physmem_add(phys_addr_t base, phys_addr_t size);
 void memblock_trim_memory(phys_addr_t align);
 bool memblock_overlaps_region(struct memblock_type *type,
 			      phys_addr_t base, phys_addr_t size);
+bool memblock_validate_numa_coverage(unsigned long threshold_bytes);
 int memblock_mark_hotplug(phys_addr_t base, phys_addr_t size);
 int memblock_clear_hotplug(phys_addr_t base, phys_addr_t size);
 int memblock_mark_mirror(phys_addr_t base, phys_addr_t size);
diff --git a/mm/memblock.c b/mm/memblock.c
index 5a88d6d24d79..4a62f7774b65 100644
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -734,6 +734,40 @@ int __init_memblock memblock_add(phys_addr_t base, phys_addr_t size)
 	return memblock_add_range(&memblock.memory, base, size, MAX_NUMNODES, 0);
 }
 
+/**
+ * memblock_validate_numa_coverage - check if amount of memory with
+ * no node ID assigned is less than a threshold
+ * @threshold_bytes: maximal number of pages that can have unassigned node
+ * ID (in bytes).
+ *
+ * A buggy firmware may report memory that does not belong to any node.
+ * Check if amount of such memory is below @threshold_bytes.
+ *
+ * Return: true on success, false on failure.
+ */
+bool __init_memblock memblock_validate_numa_coverage(unsigned long threshold_bytes)
+{
+	unsigned long nr_pages = 0;
+	unsigned long start_pfn, end_pfn, mem_size_mb;
+	int nid, i;
+
+	/* calculate lose page */
+	for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid) {
+		if (nid == NUMA_NO_NODE)
+			nr_pages += end_pfn - start_pfn;
+	}
+
+	if ((nr_pages << PAGE_SHIFT) >= threshold_bytes) {
+		mem_size_mb = memblock_phys_mem_size() >> 20;
+		pr_err("NUMA: no nodes coverage for %luMB of %luMB RAM\n",
+		       (nr_pages << PAGE_SHIFT) >> 20, mem_size_mb);
+		return false;
+	}
+
+	return true;
+}
+
+
 /**
  * memblock_isolate_range - isolate given range into disjoint memblocks
  * @type: memblock type to isolate range for
-- 
cgit v1.2.3


From a4fc4a0c45f2617c3aa8b693739de264e0c09909 Mon Sep 17 00:00:00 2001
From: "Matthew Wilcox (Oracle)" <willy@infradead.org>
Date: Tue, 7 Nov 2023 21:26:40 +0000
Subject: mm: add folio_zero_tail() and use it in ext4

Patch series "Add folio_zero_tail() and folio_fill_tail()".

I'm trying to make it easier for filesystems with tailpacking / stuffing /
inline data to use folios.  The primary function here is
folio_fill_tail().  You give it a pointer to memory where the data
currently is, and it takes care of copying it into the folio at that
offset.  That works for gfs2 & iomap.  Then There's Ext4.  Rather than gin
up some kind of specialist "Here's a two pointers to two blocks of memory"
routine, just let it do its current thing, and let it call
folio_zero_tail(), which is also called by folio_fill_tail().

Other filesystems can be converted later; these ones seemed like good
examples as they're already partly or completely converted to folios.


This patch (of 3):

Instead of unmapping the folio after copying the data to it, then mapping
it again to zero the tail, provide folio_zero_tail() to zero the tail of
an already-mapped folio.

[akpm@linux-foundation.org: fix kerneldoc argument ordering]
Link: https://lkml.kernel.org/r/20231107212643.3490372-1-willy@infradead.org
Link: https://lkml.kernel.org/r/20231107212643.3490372-2-willy@infradead.org
Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Reviewed-by: Andreas Gruenbacher <agruenba@redhat.com>
Cc: Darrick J. Wong <djwong@kernel.org>
Cc: Theodore Ts'o <tytso@mit.edu>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 fs/ext4/inline.c        |  3 +--
 include/linux/highmem.h | 38 ++++++++++++++++++++++++++++++++++++++
 2 files changed, 39 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c
index 9a84a5f9fef4..d5bd1e3a5d36 100644
--- a/fs/ext4/inline.c
+++ b/fs/ext4/inline.c
@@ -502,9 +502,8 @@ static int ext4_read_inline_folio(struct inode *inode, struct folio *folio)
 	BUG_ON(len > PAGE_SIZE);
 	kaddr = kmap_local_folio(folio, 0);
 	ret = ext4_read_inline_data(inode, kaddr, len, &iloc);
-	flush_dcache_folio(folio);
+	kaddr = folio_zero_tail(folio, len, kaddr + len);
 	kunmap_local(kaddr);
-	folio_zero_segment(folio, len, folio_size(folio));
 	folio_mark_uptodate(folio);
 	brelse(iloc.bh);
 
diff --git a/include/linux/highmem.h b/include/linux/highmem.h
index be20cff4ba73..5ebd5e4dfbf8 100644
--- a/include/linux/highmem.h
+++ b/include/linux/highmem.h
@@ -483,6 +483,44 @@ static inline void memcpy_to_folio(struct folio *folio, size_t offset,
 	flush_dcache_folio(folio);
 }
 
+/**
+ * folio_zero_tail - Zero the tail of a folio.
+ * @folio: The folio to zero.
+ * @offset: The byte offset in the folio to start zeroing at.
+ * @kaddr: The address the folio is currently mapped to.
+ *
+ * If you have already used kmap_local_folio() to map a folio, written
+ * some data to it and now need to zero the end of the folio (and flush
+ * the dcache), you can use this function.  If you do not have the
+ * folio kmapped (eg the folio has been partially populated by DMA),
+ * use folio_zero_range() or folio_zero_segment() instead.
+ *
+ * Return: An address which can be passed to kunmap_local().
+ */
+static inline __must_check void *folio_zero_tail(struct folio *folio,
+		size_t offset, void *kaddr)
+{
+	size_t len = folio_size(folio) - offset;
+
+	if (folio_test_highmem(folio)) {
+		size_t max = PAGE_SIZE - offset_in_page(offset);
+
+		while (len > max) {
+			memset(kaddr, 0, max);
+			kunmap_local(kaddr);
+			len -= max;
+			offset += max;
+			max = PAGE_SIZE;
+			kaddr = kmap_local_folio(folio, offset);
+		}
+	}
+
+	memset(kaddr, 0, len);
+	flush_dcache_folio(folio);
+
+	return kaddr;
+}
+
 /**
  * memcpy_from_file_folio - Copy some bytes from a file folio.
  * @to: The destination buffer.
-- 
cgit v1.2.3


From 6eaa266b54660f6b3654ad8902b4f7027054f55a Mon Sep 17 00:00:00 2001
From: "Matthew Wilcox (Oracle)" <willy@infradead.org>
Date: Tue, 7 Nov 2023 21:26:41 +0000
Subject: mm: add folio_fill_tail() and use it in iomap

The iomap code was limited to PAGE_SIZE bytes; generalise it to cover
an arbitrary-sized folio, and move it to be a common helper.

[akpm@linux-foundation.org: fix folio_fill_tail(), per Andreas Gruenbacher]
Link: https://lkml.kernel.org/r/20231107212643.3490372-3-willy@infradead.org
Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Reviewed-by: Andreas Gruenbacher <agruenba@redhat.com>
Cc: Andreas Dilger <adilger.kernel@dilger.ca>
Cc: Darrick J. Wong <djwong@kernel.org>
Cc: Theodore Ts'o <tytso@mit.edu>
Cc: Andreas Gruenbacher <agruenba@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 fs/iomap/buffered-io.c  | 14 ++------------
 include/linux/highmem.h | 38 ++++++++++++++++++++++++++++++++++++++
 2 files changed, 40 insertions(+), 12 deletions(-)

(limited to 'include/linux')

diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c
index f72df2babe56..093c4515b22a 100644
--- a/fs/iomap/buffered-io.c
+++ b/fs/iomap/buffered-io.c
@@ -305,28 +305,18 @@ static int iomap_read_inline_data(const struct iomap_iter *iter,
 {
 	const struct iomap *iomap = iomap_iter_srcmap(iter);
 	size_t size = i_size_read(iter->inode) - iomap->offset;
-	size_t poff = offset_in_page(iomap->offset);
 	size_t offset = offset_in_folio(folio, iomap->offset);
-	void *addr;
 
 	if (folio_test_uptodate(folio))
 		return 0;
 
-	if (WARN_ON_ONCE(size > PAGE_SIZE - poff))
-		return -EIO;
-	if (WARN_ON_ONCE(size > PAGE_SIZE -
-			 offset_in_page(iomap->inline_data)))
-		return -EIO;
 	if (WARN_ON_ONCE(size > iomap->length))
 		return -EIO;
 	if (offset > 0)
 		ifs_alloc(iter->inode, folio, iter->flags);
 
-	addr = kmap_local_folio(folio, offset);
-	memcpy(addr, iomap->inline_data, size);
-	memset(addr + size, 0, PAGE_SIZE - poff - size);
-	kunmap_local(addr);
-	iomap_set_range_uptodate(folio, offset, PAGE_SIZE - poff);
+	folio_fill_tail(folio, offset, iomap->inline_data, size);
+	iomap_set_range_uptodate(folio, offset, folio_size(folio) - offset);
 	return 0;
 }
 
diff --git a/include/linux/highmem.h b/include/linux/highmem.h
index 5ebd5e4dfbf8..451c1dff0e87 100644
--- a/include/linux/highmem.h
+++ b/include/linux/highmem.h
@@ -521,6 +521,44 @@ static inline __must_check void *folio_zero_tail(struct folio *folio,
 	return kaddr;
 }
 
+/**
+ * folio_fill_tail - Copy some data to a folio and pad with zeroes.
+ * @folio: The destination folio.
+ * @offset: The offset into @folio at which to start copying.
+ * @from: The data to copy.
+ * @len: How many bytes of data to copy.
+ *
+ * This function is most useful for filesystems which support inline data.
+ * When they want to copy data from the inode into the page cache, this
+ * function does everything for them.  It supports large folios even on
+ * HIGHMEM configurations.
+ */
+static inline void folio_fill_tail(struct folio *folio, size_t offset,
+		const char *from, size_t len)
+{
+	char *to = kmap_local_folio(folio, offset);
+
+	VM_BUG_ON(offset + len > folio_size(folio));
+
+	if (folio_test_highmem(folio)) {
+		size_t max = PAGE_SIZE - offset_in_page(offset);
+
+		while (len > max) {
+			memcpy(to, from, max);
+			kunmap_local(to);
+			len -= max;
+			from += max;
+			offset += max;
+			max = PAGE_SIZE;
+			to = kmap_local_folio(folio, offset);
+		}
+	}
+
+	memcpy(to, from, len);
+	to = folio_zero_tail(folio, offset + len, to + len);
+	kunmap_local(to);
+}
+
 /**
  * memcpy_from_file_folio - Copy some bytes from a file folio.
  * @to: The destination buffer.
-- 
cgit v1.2.3


From c36f9d3d2c3e17f9eef1d2f47a63c91d51d55e87 Mon Sep 17 00:00:00 2001
From: "Matthew Wilcox (Oracle)" <willy@infradead.org>
Date: Wed, 8 Nov 2023 20:46:02 +0000
Subject: mm: remove test_set_page_writeback()

Patch series "Make folio_start_writeback return void".

Most of the folio flag-setting functions return void.
folio_start_writeback is gratuitously different; the only two filesystems
that do anything with the return value emit debug messages if it's already
set, and we can (and should) do that internally without bothering the
filesystem to do it.


This patch (of 4):

There are no more callers of this wrapper.

Link: https://lkml.kernel.org/r/20231108204605.745109-1-willy@infradead.org
Link: https://lkml.kernel.org/r/20231108204605.745109-2-willy@infradead.org
Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: David Howells <dhowells@redhat.com>
Cc: Steve French <sfrench@samba.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/page-flags.h | 5 -----
 1 file changed, 5 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h
index a88e64acebfe..a440062e9386 100644
--- a/include/linux/page-flags.h
+++ b/include/linux/page-flags.h
@@ -780,11 +780,6 @@ bool set_page_writeback(struct page *page);
 #define folio_start_writeback_keepwrite(folio)	\
 	__folio_start_writeback(folio, true)
 
-static inline bool test_set_page_writeback(struct page *page)
-{
-	return set_page_writeback(page);
-}
-
 static __always_inline bool folio_test_head(struct folio *folio)
 {
 	return test_bit(PG_head, folio_flags(folio, FOLIO_PF_ANY));
-- 
cgit v1.2.3


From b5612c368648a7be52411b288d09593e5945d1aa Mon Sep 17 00:00:00 2001
From: "Matthew Wilcox (Oracle)" <willy@infradead.org>
Date: Wed, 8 Nov 2023 20:46:05 +0000
Subject: mm: return void from folio_start_writeback() and related functions

Nobody now checks the return value from any of these functions, so
add an assertion at the beginning of the function and return void.

Link: https://lkml.kernel.org/r/20231108204605.745109-5-willy@infradead.org
Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Reviewed-by: Josef Bacik <josef@toxicpanda.com>
Cc: David Howells <dhowells@redhat.com>
Cc: Steve French <sfrench@samba.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/page-flags.h |  4 ++--
 mm/folio-compat.c          |  4 ++--
 mm/page-writeback.c        | 54 +++++++++++++++++++++-------------------------
 3 files changed, 29 insertions(+), 33 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h
index a440062e9386..735cddc13d20 100644
--- a/include/linux/page-flags.h
+++ b/include/linux/page-flags.h
@@ -772,8 +772,8 @@ static __always_inline void SetPageUptodate(struct page *page)
 
 CLEARPAGEFLAG(Uptodate, uptodate, PF_NO_TAIL)
 
-bool __folio_start_writeback(struct folio *folio, bool keep_write);
-bool set_page_writeback(struct page *page);
+void __folio_start_writeback(struct folio *folio, bool keep_write);
+void set_page_writeback(struct page *page);
 
 #define folio_start_writeback(folio)			\
 	__folio_start_writeback(folio, false)
diff --git a/mm/folio-compat.c b/mm/folio-compat.c
index 10c3247542cb..aee3b9a16828 100644
--- a/mm/folio-compat.c
+++ b/mm/folio-compat.c
@@ -46,9 +46,9 @@ void mark_page_accessed(struct page *page)
 }
 EXPORT_SYMBOL(mark_page_accessed);
 
-bool set_page_writeback(struct page *page)
+void set_page_writeback(struct page *page)
 {
-	return folio_start_writeback(page_folio(page));
+	folio_start_writeback(page_folio(page));
 }
 EXPORT_SYMBOL(set_page_writeback);
 
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index ee2fd6a6af40..ca64bd513fa2 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -2982,67 +2982,63 @@ bool __folio_end_writeback(struct folio *folio)
 	return ret;
 }
 
-bool __folio_start_writeback(struct folio *folio, bool keep_write)
+void __folio_start_writeback(struct folio *folio, bool keep_write)
 {
 	long nr = folio_nr_pages(folio);
 	struct address_space *mapping = folio_mapping(folio);
-	bool ret;
 	int access_ret;
 
+	VM_BUG_ON_FOLIO(folio_test_writeback(folio), folio);
+
 	folio_memcg_lock(folio);
 	if (mapping && mapping_use_writeback_tags(mapping)) {
 		XA_STATE(xas, &mapping->i_pages, folio_index(folio));
 		struct inode *inode = mapping->host;
 		struct backing_dev_info *bdi = inode_to_bdi(inode);
 		unsigned long flags;
+		bool on_wblist;
 
 		xas_lock_irqsave(&xas, flags);
 		xas_load(&xas);
-		ret = folio_test_set_writeback(folio);
-		if (!ret) {
-			bool on_wblist;
+		folio_test_set_writeback(folio);
 
-			on_wblist = mapping_tagged(mapping,
-						   PAGECACHE_TAG_WRITEBACK);
+		on_wblist = mapping_tagged(mapping, PAGECACHE_TAG_WRITEBACK);
 
-			xas_set_mark(&xas, PAGECACHE_TAG_WRITEBACK);
-			if (bdi->capabilities & BDI_CAP_WRITEBACK_ACCT) {
-				struct bdi_writeback *wb = inode_to_wb(inode);
-
-				wb_stat_mod(wb, WB_WRITEBACK, nr);
-				if (!on_wblist)
-					wb_inode_writeback_start(wb);
-			}
+		xas_set_mark(&xas, PAGECACHE_TAG_WRITEBACK);
+		if (bdi->capabilities & BDI_CAP_WRITEBACK_ACCT) {
+			struct bdi_writeback *wb = inode_to_wb(inode);
 
-			/*
-			 * We can come through here when swapping
-			 * anonymous folios, so we don't necessarily
-			 * have an inode to track for sync.
-			 */
-			if (mapping->host && !on_wblist)
-				sb_mark_inode_writeback(mapping->host);
+			wb_stat_mod(wb, WB_WRITEBACK, nr);
+			if (!on_wblist)
+				wb_inode_writeback_start(wb);
 		}
+
+		/*
+		 * We can come through here when swapping anonymous
+		 * folios, so we don't necessarily have an inode to
+		 * track for sync.
+		 */
+		if (mapping->host && !on_wblist)
+			sb_mark_inode_writeback(mapping->host);
 		if (!folio_test_dirty(folio))
 			xas_clear_mark(&xas, PAGECACHE_TAG_DIRTY);
 		if (!keep_write)
 			xas_clear_mark(&xas, PAGECACHE_TAG_TOWRITE);
 		xas_unlock_irqrestore(&xas, flags);
 	} else {
-		ret = folio_test_set_writeback(folio);
-	}
-	if (!ret) {
-		lruvec_stat_mod_folio(folio, NR_WRITEBACK, nr);
-		zone_stat_mod_folio(folio, NR_ZONE_WRITE_PENDING, nr);
+		folio_test_set_writeback(folio);
 	}
+
+	lruvec_stat_mod_folio(folio, NR_WRITEBACK, nr);
+	zone_stat_mod_folio(folio, NR_ZONE_WRITE_PENDING, nr);
 	folio_memcg_unlock(folio);
+
 	access_ret = arch_make_folio_accessible(folio);
 	/*
 	 * If writeback has been triggered on a page that cannot be made
 	 * accessible, it is too late to recover here.
 	 */
 	VM_BUG_ON_FOLIO(access_ret != 0, folio);
-
-	return ret;
 }
 EXPORT_SYMBOL(__folio_start_writeback);
 
-- 
cgit v1.2.3


From 16f5dfbc851b55b87101a20e181d4a14be3007d6 Mon Sep 17 00:00:00 2001
From: "Matthew Wilcox (Oracle)" <willy@infradead.org>
Date: Thu, 9 Nov 2023 21:15:07 +0000
Subject: gfp: include __GFP_NOWARN in GFP_NOWAIT

GFP_NOWAIT callers are always prepared for their allocations to fail
because they fail so frequently.  Forcing the callers to remember to add
__GFP_NOWARN is just annoying and leads to an endless stream of patches
for the places where we forgot to add it.

We can now remove __GFP_NOWARN from all the callers which specify
GFP_NOWAIT, but I'd rather wait a cycle and send patches to each
maintainer instead of creating a big pile of merge conflicts.

Link: https://lkml.kernel.org/r/20231109211507.2262419-1-willy@infradead.org
Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/gfp_types.h | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/gfp_types.h b/include/linux/gfp_types.h
index 6583a58670c5..ae994534a12a 100644
--- a/include/linux/gfp_types.h
+++ b/include/linux/gfp_types.h
@@ -274,7 +274,8 @@ typedef unsigned int __bitwise gfp_t;
  * accounted to kmemcg.
  *
  * %GFP_NOWAIT is for kernel allocations that should not stall for direct
- * reclaim, start physical IO or use any filesystem callback.
+ * reclaim, start physical IO or use any filesystem callback.  It is very
+ * likely to fail to allocate memory, even for very small allocations.
  *
  * %GFP_NOIO will use direct reclaim to discard clean pages or slab pages
  * that do not require the starting of any physical IO.
@@ -325,7 +326,7 @@ typedef unsigned int __bitwise gfp_t;
 #define GFP_ATOMIC	(__GFP_HIGH|__GFP_KSWAPD_RECLAIM)
 #define GFP_KERNEL	(__GFP_RECLAIM | __GFP_IO | __GFP_FS)
 #define GFP_KERNEL_ACCOUNT (GFP_KERNEL | __GFP_ACCOUNT)
-#define GFP_NOWAIT	(__GFP_KSWAPD_RECLAIM)
+#define GFP_NOWAIT	(__GFP_KSWAPD_RECLAIM | __GFP_NOWARN)
 #define GFP_NOIO	(__GFP_RECLAIM)
 #define GFP_NOFS	(__GFP_RECLAIM | __GFP_IO)
 #define GFP_USER	(__GFP_RECLAIM | __GFP_IO | __GFP_FS | __GFP_HARDWALL)
-- 
cgit v1.2.3


From af7628d6ec196999175ecb3fdb38336489b0f88a Mon Sep 17 00:00:00 2001
From: "Matthew Wilcox (Oracle)" <willy@infradead.org>
Date: Fri, 17 Nov 2023 16:14:47 +0000
Subject: fs: convert error_remove_page to error_remove_folio

There were already assertions that we were not passing a tail page to
error_remove_page(), so make the compiler enforce that by converting
everything to pass and use a folio.

Link: https://lkml.kernel.org/r/20231117161447.2461643-7-willy@infradead.org
Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Naoya Horiguchi <naoya.horiguchi@nec.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 Documentation/filesystems/locking.rst |  4 ++--
 Documentation/filesystems/vfs.rst     |  6 +++---
 block/fops.c                          |  2 +-
 fs/afs/write.c                        |  2 +-
 fs/bcachefs/fs.c                      |  2 +-
 fs/btrfs/inode.c                      |  2 +-
 fs/ceph/addr.c                        |  4 ++--
 fs/ext2/inode.c                       |  2 +-
 fs/ext4/inode.c                       |  6 +++---
 fs/f2fs/compress.c                    |  2 +-
 fs/f2fs/inode.c                       |  2 +-
 fs/gfs2/aops.c                        |  4 ++--
 fs/hugetlbfs/inode.c                  |  6 +++---
 fs/nfs/file.c                         |  2 +-
 fs/ntfs/aops.c                        |  6 +++---
 fs/ocfs2/aops.c                       |  2 +-
 fs/xfs/xfs_aops.c                     |  2 +-
 fs/zonefs/file.c                      |  2 +-
 include/linux/fs.h                    |  2 +-
 include/linux/mm.h                    |  3 ++-
 mm/memory-failure.c                   | 10 +++++-----
 mm/shmem.c                            |  6 +++---
 mm/truncate.c                         |  9 ++++-----
 23 files changed, 44 insertions(+), 44 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/filesystems/locking.rst b/Documentation/filesystems/locking.rst
index 7be2900806c8..421daf837940 100644
--- a/Documentation/filesystems/locking.rst
+++ b/Documentation/filesystems/locking.rst
@@ -261,7 +261,7 @@ prototypes::
 			struct folio *src, enum migrate_mode);
 	int (*launder_folio)(struct folio *);
 	bool (*is_partially_uptodate)(struct folio *, size_t from, size_t count);
-	int (*error_remove_page)(struct address_space *, struct page *);
+	int (*error_remove_folio)(struct address_space *, struct folio *);
 	int (*swap_activate)(struct swap_info_struct *sis, struct file *f, sector_t *span)
 	int (*swap_deactivate)(struct file *);
 	int (*swap_rw)(struct kiocb *iocb, struct iov_iter *iter);
@@ -287,7 +287,7 @@ direct_IO:
 migrate_folio:		yes (both)
 launder_folio:		yes
 is_partially_uptodate:	yes
-error_remove_page:	yes
+error_remove_folio:	yes
 swap_activate:		no
 swap_deactivate:	no
 swap_rw:		yes, unlocks
diff --git a/Documentation/filesystems/vfs.rst b/Documentation/filesystems/vfs.rst
index 99acc2e98673..dd99ce5912d8 100644
--- a/Documentation/filesystems/vfs.rst
+++ b/Documentation/filesystems/vfs.rst
@@ -823,7 +823,7 @@ cache in your filesystem.  The following members are defined:
 		bool (*is_partially_uptodate) (struct folio *, size_t from,
 					       size_t count);
 		void (*is_dirty_writeback)(struct folio *, bool *, bool *);
-		int (*error_remove_page) (struct mapping *mapping, struct page *page);
+		int (*error_remove_folio)(struct mapping *mapping, struct folio *);
 		int (*swap_activate)(struct swap_info_struct *sis, struct file *f, sector_t *span)
 		int (*swap_deactivate)(struct file *);
 		int (*swap_rw)(struct kiocb *iocb, struct iov_iter *iter);
@@ -1034,8 +1034,8 @@ cache in your filesystem.  The following members are defined:
 	VM if a folio should be treated as dirty or writeback for the
 	purposes of stalling.
 
-``error_remove_page``
-	normally set to generic_error_remove_page if truncation is ok
+``error_remove_folio``
+	normally set to generic_error_remove_folio if truncation is ok
 	for this address space.  Used for memory failure handling.
 	Setting this implies you deal with pages going away under you,
 	unless you have them locked or reference counts increased.
diff --git a/block/fops.c b/block/fops.c
index 0abaac705daf..0bdad1e8d514 100644
--- a/block/fops.c
+++ b/block/fops.c
@@ -500,7 +500,7 @@ const struct address_space_operations def_blk_aops = {
 	.readahead		= blkdev_readahead,
 	.writepages		= blkdev_writepages,
 	.is_partially_uptodate  = iomap_is_partially_uptodate,
-	.error_remove_page	= generic_error_remove_page,
+	.error_remove_folio	= generic_error_remove_folio,
 	.migrate_folio		= filemap_migrate_folio,
 };
 #endif /* CONFIG_BUFFER_HEAD */
diff --git a/fs/afs/write.c b/fs/afs/write.c
index 57d05d67f0c2..e87b52b1f34c 100644
--- a/fs/afs/write.c
+++ b/fs/afs/write.c
@@ -242,7 +242,7 @@ static void afs_kill_pages(struct address_space *mapping,
 		folio_clear_uptodate(folio);
 		folio_end_writeback(folio);
 		folio_lock(folio);
-		generic_error_remove_page(mapping, &folio->page);
+		generic_error_remove_folio(mapping, folio);
 		folio_unlock(folio);
 		folio_put(folio);
 
diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c
index 4d51be813509..df4a97b6637b 100644
--- a/fs/bcachefs/fs.c
+++ b/fs/bcachefs/fs.c
@@ -1103,7 +1103,7 @@ static const struct address_space_operations bch_address_space_operations = {
 #ifdef CONFIG_MIGRATION
 	.migrate_folio	= filemap_migrate_folio,
 #endif
-	.error_remove_page = generic_error_remove_page,
+	.error_remove_folio = generic_error_remove_folio,
 };
 
 struct bcachefs_fid {
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 9f5a9894f88f..ff7b4efca24f 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -10930,7 +10930,7 @@ static const struct address_space_operations btrfs_aops = {
 	.release_folio	= btrfs_release_folio,
 	.migrate_folio	= btrfs_migrate_folio,
 	.dirty_folio	= filemap_dirty_folio,
-	.error_remove_page = generic_error_remove_page,
+	.error_remove_folio = generic_error_remove_folio,
 	.swap_activate	= btrfs_swap_activate,
 	.swap_deactivate = btrfs_swap_deactivate,
 };
diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c
index 85be3bf18cdf..13af429ab030 100644
--- a/fs/ceph/addr.c
+++ b/fs/ceph/addr.c
@@ -907,8 +907,8 @@ static void writepages_finish(struct ceph_osd_request *req)
 			doutc(cl, "unlocking %p\n", page);
 
 			if (remove_page)
-				generic_error_remove_page(inode->i_mapping,
-							  page);
+				generic_error_remove_folio(inode->i_mapping,
+							  page_folio(page));
 
 			unlock_page(page);
 		}
diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c
index 464faf6c217e..5a4272b2c6b0 100644
--- a/fs/ext2/inode.c
+++ b/fs/ext2/inode.c
@@ -969,7 +969,7 @@ const struct address_space_operations ext2_aops = {
 	.writepages		= ext2_writepages,
 	.migrate_folio		= buffer_migrate_folio,
 	.is_partially_uptodate	= block_is_partially_uptodate,
-	.error_remove_page	= generic_error_remove_page,
+	.error_remove_folio	= generic_error_remove_folio,
 };
 
 static const struct address_space_operations ext2_dax_aops = {
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 61277f7f8722..d7729b17a66b 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -3564,7 +3564,7 @@ static const struct address_space_operations ext4_aops = {
 	.direct_IO		= noop_direct_IO,
 	.migrate_folio		= buffer_migrate_folio,
 	.is_partially_uptodate  = block_is_partially_uptodate,
-	.error_remove_page	= generic_error_remove_page,
+	.error_remove_folio	= generic_error_remove_folio,
 	.swap_activate		= ext4_iomap_swap_activate,
 };
 
@@ -3581,7 +3581,7 @@ static const struct address_space_operations ext4_journalled_aops = {
 	.direct_IO		= noop_direct_IO,
 	.migrate_folio		= buffer_migrate_folio_norefs,
 	.is_partially_uptodate  = block_is_partially_uptodate,
-	.error_remove_page	= generic_error_remove_page,
+	.error_remove_folio	= generic_error_remove_folio,
 	.swap_activate		= ext4_iomap_swap_activate,
 };
 
@@ -3598,7 +3598,7 @@ static const struct address_space_operations ext4_da_aops = {
 	.direct_IO		= noop_direct_IO,
 	.migrate_folio		= buffer_migrate_folio,
 	.is_partially_uptodate  = block_is_partially_uptodate,
-	.error_remove_page	= generic_error_remove_page,
+	.error_remove_folio	= generic_error_remove_folio,
 	.swap_activate		= ext4_iomap_swap_activate,
 };
 
diff --git a/fs/f2fs/compress.c b/fs/f2fs/compress.c
index 36e5dab6baae..6b2af514660d 100644
--- a/fs/f2fs/compress.c
+++ b/fs/f2fs/compress.c
@@ -1944,7 +1944,7 @@ void f2fs_invalidate_compress_pages(struct f2fs_sb_info *sbi, nid_t ino)
 				continue;
 			}
 
-			generic_error_remove_page(mapping, &folio->page);
+			generic_error_remove_folio(mapping, folio);
 			folio_unlock(folio);
 		}
 		folio_batch_release(&fbatch);
diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c
index 560bfcad1af2..a9eb3891f417 100644
--- a/fs/f2fs/inode.c
+++ b/fs/f2fs/inode.c
@@ -600,7 +600,7 @@ make_now:
 #ifdef CONFIG_F2FS_FS_COMPRESSION
 		inode->i_mapping->a_ops = &f2fs_compress_aops;
 		/*
-		 * generic_error_remove_page only truncates pages of regular
+		 * generic_error_remove_folio only truncates pages of regular
 		 * inode
 		 */
 		inode->i_mode |= S_IFREG;
diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c
index ba8742dc91f8..5cffb079b87c 100644
--- a/fs/gfs2/aops.c
+++ b/fs/gfs2/aops.c
@@ -745,7 +745,7 @@ static const struct address_space_operations gfs2_aops = {
 	.bmap = gfs2_bmap,
 	.migrate_folio = filemap_migrate_folio,
 	.is_partially_uptodate = iomap_is_partially_uptodate,
-	.error_remove_page = generic_error_remove_page,
+	.error_remove_folio = generic_error_remove_folio,
 };
 
 static const struct address_space_operations gfs2_jdata_aops = {
@@ -758,7 +758,7 @@ static const struct address_space_operations gfs2_jdata_aops = {
 	.invalidate_folio = gfs2_invalidate_folio,
 	.release_folio = gfs2_release_folio,
 	.is_partially_uptodate = block_is_partially_uptodate,
-	.error_remove_page = generic_error_remove_page,
+	.error_remove_folio = generic_error_remove_folio,
 };
 
 void gfs2_set_aops(struct inode *inode)
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index f757d4f7ad98..36132c9125f9 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -1129,8 +1129,8 @@ static int hugetlbfs_migrate_folio(struct address_space *mapping,
 #define hugetlbfs_migrate_folio NULL
 #endif
 
-static int hugetlbfs_error_remove_page(struct address_space *mapping,
-				struct page *page)
+static int hugetlbfs_error_remove_folio(struct address_space *mapping,
+				struct folio *folio)
 {
 	return 0;
 }
@@ -1277,7 +1277,7 @@ static const struct address_space_operations hugetlbfs_aops = {
 	.write_end	= hugetlbfs_write_end,
 	.dirty_folio	= noop_dirty_folio,
 	.migrate_folio  = hugetlbfs_migrate_folio,
-	.error_remove_page	= hugetlbfs_error_remove_page,
+	.error_remove_folio	= hugetlbfs_error_remove_folio,
 };
 
 
diff --git a/fs/nfs/file.c b/fs/nfs/file.c
index 3f9768810427..e8cccb94b927 100644
--- a/fs/nfs/file.c
+++ b/fs/nfs/file.c
@@ -567,7 +567,7 @@ const struct address_space_operations nfs_file_aops = {
 	.migrate_folio = nfs_migrate_folio,
 	.launder_folio = nfs_launder_folio,
 	.is_dirty_writeback = nfs_check_dirty_writeback,
-	.error_remove_page = generic_error_remove_page,
+	.error_remove_folio = generic_error_remove_folio,
 	.swap_activate = nfs_swap_activate,
 	.swap_deactivate = nfs_swap_deactivate,
 	.swap_rw = nfs_swap_rw,
diff --git a/fs/ntfs/aops.c b/fs/ntfs/aops.c
index 71e31e789b29..70479ce915e8 100644
--- a/fs/ntfs/aops.c
+++ b/fs/ntfs/aops.c
@@ -1644,7 +1644,7 @@ const struct address_space_operations ntfs_normal_aops = {
 	.bmap		= ntfs_bmap,
 	.migrate_folio	= buffer_migrate_folio,
 	.is_partially_uptodate = block_is_partially_uptodate,
-	.error_remove_page = generic_error_remove_page,
+	.error_remove_folio = generic_error_remove_folio,
 };
 
 /*
@@ -1658,7 +1658,7 @@ const struct address_space_operations ntfs_compressed_aops = {
 #endif /* NTFS_RW */
 	.migrate_folio	= buffer_migrate_folio,
 	.is_partially_uptodate = block_is_partially_uptodate,
-	.error_remove_page = generic_error_remove_page,
+	.error_remove_folio = generic_error_remove_folio,
 };
 
 /*
@@ -1673,7 +1673,7 @@ const struct address_space_operations ntfs_mst_aops = {
 #endif /* NTFS_RW */
 	.migrate_folio	= buffer_migrate_folio,
 	.is_partially_uptodate	= block_is_partially_uptodate,
-	.error_remove_page = generic_error_remove_page,
+	.error_remove_folio = generic_error_remove_folio,
 };
 
 #ifdef NTFS_RW
diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index ba790219d528..795997806326 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -2480,5 +2480,5 @@ const struct address_space_operations ocfs2_aops = {
 	.release_folio		= ocfs2_release_folio,
 	.migrate_folio		= buffer_migrate_folio,
 	.is_partially_uptodate	= block_is_partially_uptodate,
-	.error_remove_page	= generic_error_remove_page,
+	.error_remove_folio	= generic_error_remove_folio,
 };
diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
index 465d7630bb21..813f85156b0c 100644
--- a/fs/xfs/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -584,7 +584,7 @@ const struct address_space_operations xfs_address_space_operations = {
 	.bmap			= xfs_vm_bmap,
 	.migrate_folio		= filemap_migrate_folio,
 	.is_partially_uptodate  = iomap_is_partially_uptodate,
-	.error_remove_page	= generic_error_remove_page,
+	.error_remove_folio	= generic_error_remove_folio,
 	.swap_activate		= xfs_iomap_swapfile_activate,
 };
 
diff --git a/fs/zonefs/file.c b/fs/zonefs/file.c
index b2c9b35df8f7..6ab2318a9c8e 100644
--- a/fs/zonefs/file.c
+++ b/fs/zonefs/file.c
@@ -180,7 +180,7 @@ const struct address_space_operations zonefs_file_aops = {
 	.invalidate_folio	= iomap_invalidate_folio,
 	.migrate_folio		= filemap_migrate_folio,
 	.is_partially_uptodate	= iomap_is_partially_uptodate,
-	.error_remove_page	= generic_error_remove_page,
+	.error_remove_folio	= generic_error_remove_folio,
 	.swap_activate		= zonefs_swap_activate,
 };
 
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 98b7a7a8c42e..31b2cf963db9 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -434,7 +434,7 @@ struct address_space_operations {
 	bool (*is_partially_uptodate) (struct folio *, size_t from,
 			size_t count);
 	void (*is_dirty_writeback) (struct folio *, bool *dirty, bool *wb);
-	int (*error_remove_page)(struct address_space *, struct page *);
+	int (*error_remove_folio)(struct address_space *, struct folio *);
 
 	/* swapfile support */
 	int (*swap_activate)(struct swap_info_struct *sis, struct file *file,
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 64cd1ee4aacc..13a090271716 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -2384,7 +2384,8 @@ extern void truncate_pagecache(struct inode *inode, loff_t new);
 extern void truncate_setsize(struct inode *inode, loff_t newsize);
 void pagecache_isize_extended(struct inode *inode, loff_t from, loff_t to);
 void truncate_pagecache_range(struct inode *inode, loff_t offset, loff_t end);
-int generic_error_remove_page(struct address_space *mapping, struct page *page);
+int generic_error_remove_folio(struct address_space *mapping,
+		struct folio *folio);
 
 struct vm_area_struct *lock_mm_and_find_vma(struct mm_struct *mm,
 		unsigned long address, struct pt_regs *regs);
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index 6aec94821fda..d8c853b35dbb 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -927,13 +927,13 @@ static int delete_from_lru_cache(struct folio *folio)
 	return -EIO;
 }
 
-static int truncate_error_page(struct folio *folio, unsigned long pfn,
+static int truncate_error_folio(struct folio *folio, unsigned long pfn,
 				struct address_space *mapping)
 {
 	int ret = MF_FAILED;
 
-	if (mapping->a_ops->error_remove_page) {
-		int err = mapping->a_ops->error_remove_page(mapping, &folio->page);
+	if (mapping->a_ops->error_remove_folio) {
+		int err = mapping->a_ops->error_remove_folio(mapping, folio);
 
 		if (err != 0)
 			pr_info("%#lx: Failed to punch page: %d\n", pfn, err);
@@ -1054,7 +1054,7 @@ static int me_pagecache_clean(struct page_state *ps, struct page *p)
 	 *
 	 * Open: to take i_rwsem or not for this? Right now we don't.
 	 */
-	ret = truncate_error_page(folio, page_to_pfn(p), mapping);
+	ret = truncate_error_folio(folio, page_to_pfn(p), mapping);
 	if (has_extra_refcount(ps, p, extra_pins))
 		ret = MF_FAILED;
 
@@ -1188,7 +1188,7 @@ static int me_huge_page(struct page_state *ps, struct page *p)
 
 	mapping = folio_mapping(folio);
 	if (mapping) {
-		res = truncate_error_page(folio, page_to_pfn(p), mapping);
+		res = truncate_error_folio(folio, page_to_pfn(p), mapping);
 		/* The page is kept in page cache. */
 		extra_pins = true;
 		folio_unlock(folio);
diff --git a/mm/shmem.c b/mm/shmem.c
index 91e2620148b2..97bc622da774 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -4445,8 +4445,8 @@ static void __init shmem_destroy_inodecache(void)
 }
 
 /* Keep the page in page cache instead of truncating it */
-static int shmem_error_remove_page(struct address_space *mapping,
-				   struct page *page)
+static int shmem_error_remove_folio(struct address_space *mapping,
+				   struct folio *folio)
 {
 	return 0;
 }
@@ -4461,7 +4461,7 @@ const struct address_space_operations shmem_aops = {
 #ifdef CONFIG_MIGRATION
 	.migrate_folio	= migrate_folio,
 #endif
-	.error_remove_page = shmem_error_remove_page,
+	.error_remove_folio = shmem_error_remove_folio,
 };
 EXPORT_SYMBOL(shmem_aops);
 
diff --git a/mm/truncate.c b/mm/truncate.c
index 52e3a703e7b2..725b150e47ac 100644
--- a/mm/truncate.c
+++ b/mm/truncate.c
@@ -250,10 +250,9 @@ bool truncate_inode_partial_folio(struct folio *folio, loff_t start, loff_t end)
 /*
  * Used to get rid of pages on hardware memory corruption.
  */
-int generic_error_remove_page(struct address_space *mapping, struct page *page)
+int generic_error_remove_folio(struct address_space *mapping,
+		struct folio *folio)
 {
-	VM_BUG_ON_PAGE(PageTail(page), page);
-
 	if (!mapping)
 		return -EINVAL;
 	/*
@@ -262,9 +261,9 @@ int generic_error_remove_page(struct address_space *mapping, struct page *page)
 	 */
 	if (!S_ISREG(mapping->host->i_mode))
 		return -EIO;
-	return truncate_inode_folio(mapping, page_folio(page));
+	return truncate_inode_folio(mapping, folio);
 }
-EXPORT_SYMBOL(generic_error_remove_page);
+EXPORT_SYMBOL(generic_error_remove_folio);
 
 /**
  * mapping_evict_folio() - Remove an unused folio from the page-cache.
-- 
cgit v1.2.3


From 022012dcf44209074af97b6ae531a10c08736b31 Mon Sep 17 00:00:00 2001
From: Andrey Konovalov <andreyknvl@google.com>
Date: Mon, 20 Nov 2023 18:47:13 +0100
Subject: lib/stackdepot, kasan: add flags to __stack_depot_save and rename

Change the bool can_alloc argument of __stack_depot_save to a u32
  argument that accepts a set of flags.

The following patch will add another flag to stack_depot_save_flags
  besides the existing STACK_DEPOT_FLAG_CAN_ALLOC.

Also rename the function to stack_depot_save_flags, as
  __stack_depot_save is a cryptic name,

Link: https://lkml.kernel.org/r/645fa15239621eebbd3a10331e5864b718839512.1700502145.git.andreyknvl@google.com
Signed-off-by: Andrey Konovalov <andreyknvl@google.com>
Reviewed-by: Alexander Potapenko <glider@google.com>
Cc: Dmitry Vyukov <dvyukov@google.com>
Cc: Evgenii Stepanov <eugenis@google.com>
Cc: Marco Elver <elver@google.com>
Cc: Oscar Salvador <osalvador@suse.de>
Cc: Vlastimil Babka <vbabka@suse.cz>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/stackdepot.h | 36 +++++++++++++++++++++++++-----------
 lib/stackdepot.c           | 16 +++++++++++-----
 mm/kasan/common.c          |  7 ++++---
 mm/kasan/generic.c         |  9 +++++----
 mm/kasan/kasan.h           |  2 +-
 mm/kasan/tags.c            |  3 ++-
 6 files changed, 48 insertions(+), 25 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/stackdepot.h b/include/linux/stackdepot.h
index e58306783d8e..0b262e14144e 100644
--- a/include/linux/stackdepot.h
+++ b/include/linux/stackdepot.h
@@ -32,6 +32,17 @@ typedef u32 depot_stack_handle_t;
  */
 #define STACK_DEPOT_EXTRA_BITS 5
 
+typedef u32 depot_flags_t;
+
+/*
+ * Flags that can be passed to stack_depot_save_flags(); see the comment next
+ * to its declaration for more details.
+ */
+#define STACK_DEPOT_FLAG_CAN_ALLOC	((depot_flags_t)0x0001)
+
+#define STACK_DEPOT_FLAGS_NUM	1
+#define STACK_DEPOT_FLAGS_MASK	((depot_flags_t)((1 << STACK_DEPOT_FLAGS_NUM) - 1))
+
 /*
  * Using stack depot requires its initialization, which can be done in 3 ways:
  *
@@ -69,31 +80,34 @@ static inline int stack_depot_early_init(void)	{ return 0; }
 #endif
 
 /**
- * __stack_depot_save - Save a stack trace to stack depot
+ * stack_depot_save_flags - Save a stack trace to stack depot
  *
  * @entries:		Pointer to the stack trace
  * @nr_entries:		Number of frames in the stack
  * @alloc_flags:	Allocation GFP flags
- * @can_alloc:		Allocate stack pools (increased chance of failure if false)
+ * @depot_flags:	Stack depot flags
+ *
+ * Saves a stack trace from @entries array of size @nr_entries.
  *
- * Saves a stack trace from @entries array of size @nr_entries. If @can_alloc is
- * %true, stack depot can replenish the stack pools in case no space is left
- * (allocates using GFP flags of @alloc_flags). If @can_alloc is %false, avoids
- * any allocations and fails if no space is left to store the stack trace.
+ * If STACK_DEPOT_FLAG_CAN_ALLOC is set in @depot_flags, stack depot can
+ * replenish the stack pools in case no space is left (allocates using GFP
+ * flags of @alloc_flags). Otherwise, stack depot avoids any allocations and
+ * fails if no space is left to store the stack trace.
  *
  * If the provided stack trace comes from the interrupt context, only the part
  * up to the interrupt entry is saved.
  *
- * Context: Any context, but setting @can_alloc to %false is required if
+ * Context: Any context, but setting STACK_DEPOT_FLAG_CAN_ALLOC is required if
  *          alloc_pages() cannot be used from the current context. Currently
  *          this is the case for contexts where neither %GFP_ATOMIC nor
  *          %GFP_NOWAIT can be used (NMI, raw_spin_lock).
  *
  * Return: Handle of the stack struct stored in depot, 0 on failure
  */
-depot_stack_handle_t __stack_depot_save(unsigned long *entries,
-					unsigned int nr_entries,
-					gfp_t gfp_flags, bool can_alloc);
+depot_stack_handle_t stack_depot_save_flags(unsigned long *entries,
+					    unsigned int nr_entries,
+					    gfp_t gfp_flags,
+					    depot_flags_t depot_flags);
 
 /**
  * stack_depot_save - Save a stack trace to stack depot
@@ -103,7 +117,7 @@ depot_stack_handle_t __stack_depot_save(unsigned long *entries,
  * @alloc_flags:	Allocation GFP flags
  *
  * Context: Contexts where allocations via alloc_pages() are allowed.
- *          See __stack_depot_save() for more details.
+ *          See stack_depot_save_flags() for more details.
  *
  * Return: Handle of the stack trace stored in depot, 0 on failure
  */
diff --git a/lib/stackdepot.c b/lib/stackdepot.c
index 4bb0af423f82..59d61d5c09a7 100644
--- a/lib/stackdepot.c
+++ b/lib/stackdepot.c
@@ -450,19 +450,24 @@ static inline struct stack_record *find_stack(struct list_head *bucket,
 	return NULL;
 }
 
-depot_stack_handle_t __stack_depot_save(unsigned long *entries,
-					unsigned int nr_entries,
-					gfp_t alloc_flags, bool can_alloc)
+depot_stack_handle_t stack_depot_save_flags(unsigned long *entries,
+					    unsigned int nr_entries,
+					    gfp_t alloc_flags,
+					    depot_flags_t depot_flags)
 {
 	struct list_head *bucket;
 	struct stack_record *found = NULL;
 	depot_stack_handle_t handle = 0;
 	struct page *page = NULL;
 	void *prealloc = NULL;
+	bool can_alloc = depot_flags & STACK_DEPOT_FLAG_CAN_ALLOC;
 	bool need_alloc = false;
 	unsigned long flags;
 	u32 hash;
 
+	if (WARN_ON(depot_flags & ~STACK_DEPOT_FLAGS_MASK))
+		return 0;
+
 	/*
 	 * If this stack trace is from an interrupt, including anything before
 	 * interrupt entry usually leads to unbounded stack depot growth.
@@ -541,13 +546,14 @@ exit:
 		handle = found->handle.handle;
 	return handle;
 }
-EXPORT_SYMBOL_GPL(__stack_depot_save);
+EXPORT_SYMBOL_GPL(stack_depot_save_flags);
 
 depot_stack_handle_t stack_depot_save(unsigned long *entries,
 				      unsigned int nr_entries,
 				      gfp_t alloc_flags)
 {
-	return __stack_depot_save(entries, nr_entries, alloc_flags, true);
+	return stack_depot_save_flags(entries, nr_entries, alloc_flags,
+				      STACK_DEPOT_FLAG_CAN_ALLOC);
 }
 EXPORT_SYMBOL_GPL(stack_depot_save);
 
diff --git a/mm/kasan/common.c b/mm/kasan/common.c
index 256930da578a..825a0240ec02 100644
--- a/mm/kasan/common.c
+++ b/mm/kasan/common.c
@@ -22,6 +22,7 @@
 #include <linux/sched.h>
 #include <linux/sched/task_stack.h>
 #include <linux/slab.h>
+#include <linux/stackdepot.h>
 #include <linux/stacktrace.h>
 #include <linux/string.h>
 #include <linux/types.h>
@@ -37,19 +38,19 @@ struct slab *kasan_addr_to_slab(const void *addr)
 	return NULL;
 }
 
-depot_stack_handle_t kasan_save_stack(gfp_t flags, bool can_alloc)
+depot_stack_handle_t kasan_save_stack(gfp_t flags, depot_flags_t depot_flags)
 {
 	unsigned long entries[KASAN_STACK_DEPTH];
 	unsigned int nr_entries;
 
 	nr_entries = stack_trace_save(entries, ARRAY_SIZE(entries), 0);
-	return __stack_depot_save(entries, nr_entries, flags, can_alloc);
+	return stack_depot_save_flags(entries, nr_entries, flags, depot_flags);
 }
 
 void kasan_set_track(struct kasan_track *track, gfp_t flags)
 {
 	track->pid = current->pid;
-	track->stack = kasan_save_stack(flags, true);
+	track->stack = kasan_save_stack(flags, STACK_DEPOT_FLAG_CAN_ALLOC);
 }
 
 #if defined(CONFIG_KASAN_GENERIC) || defined(CONFIG_KASAN_SW_TAGS)
diff --git a/mm/kasan/generic.c b/mm/kasan/generic.c
index 4d837ab83f08..5d168c9afb32 100644
--- a/mm/kasan/generic.c
+++ b/mm/kasan/generic.c
@@ -25,6 +25,7 @@
 #include <linux/sched.h>
 #include <linux/sched/task_stack.h>
 #include <linux/slab.h>
+#include <linux/stackdepot.h>
 #include <linux/stacktrace.h>
 #include <linux/string.h>
 #include <linux/types.h>
@@ -472,7 +473,7 @@ size_t kasan_metadata_size(struct kmem_cache *cache, bool in_object)
 			sizeof(struct kasan_free_meta) : 0);
 }
 
-static void __kasan_record_aux_stack(void *addr, bool can_alloc)
+static void __kasan_record_aux_stack(void *addr, depot_flags_t depot_flags)
 {
 	struct slab *slab = kasan_addr_to_slab(addr);
 	struct kmem_cache *cache;
@@ -489,17 +490,17 @@ static void __kasan_record_aux_stack(void *addr, bool can_alloc)
 		return;
 
 	alloc_meta->aux_stack[1] = alloc_meta->aux_stack[0];
-	alloc_meta->aux_stack[0] = kasan_save_stack(0, can_alloc);
+	alloc_meta->aux_stack[0] = kasan_save_stack(0, depot_flags);
 }
 
 void kasan_record_aux_stack(void *addr)
 {
-	return __kasan_record_aux_stack(addr, true);
+	return __kasan_record_aux_stack(addr, STACK_DEPOT_FLAG_CAN_ALLOC);
 }
 
 void kasan_record_aux_stack_noalloc(void *addr)
 {
-	return __kasan_record_aux_stack(addr, false);
+	return __kasan_record_aux_stack(addr, 0);
 }
 
 void kasan_save_alloc_info(struct kmem_cache *cache, void *object, gfp_t flags)
diff --git a/mm/kasan/kasan.h b/mm/kasan/kasan.h
index 8b06bab5c406..b29d46b83d1f 100644
--- a/mm/kasan/kasan.h
+++ b/mm/kasan/kasan.h
@@ -368,7 +368,7 @@ static inline void kasan_init_cache_meta(struct kmem_cache *cache, unsigned int
 static inline void kasan_init_object_meta(struct kmem_cache *cache, const void *object) { }
 #endif
 
-depot_stack_handle_t kasan_save_stack(gfp_t flags, bool can_alloc);
+depot_stack_handle_t kasan_save_stack(gfp_t flags, depot_flags_t depot_flags);
 void kasan_set_track(struct kasan_track *track, gfp_t flags);
 void kasan_save_alloc_info(struct kmem_cache *cache, void *object, gfp_t flags);
 void kasan_save_free_info(struct kmem_cache *cache, void *object);
diff --git a/mm/kasan/tags.c b/mm/kasan/tags.c
index 7dcfe341d48e..4fd32121b0fd 100644
--- a/mm/kasan/tags.c
+++ b/mm/kasan/tags.c
@@ -13,6 +13,7 @@
 #include <linux/memblock.h>
 #include <linux/memory.h>
 #include <linux/mm.h>
+#include <linux/stackdepot.h>
 #include <linux/static_key.h>
 #include <linux/string.h>
 #include <linux/types.h>
@@ -101,7 +102,7 @@ static void save_stack_info(struct kmem_cache *cache, void *object,
 	struct kasan_stack_ring_entry *entry;
 	void *old_ptr;
 
-	stack = kasan_save_stack(gfp_flags, true);
+	stack = kasan_save_stack(gfp_flags, STACK_DEPOT_FLAG_CAN_ALLOC);
 
 	/*
 	 * Prevent save_stack_info() from modifying stack ring
-- 
cgit v1.2.3


From 410b764f89f59cce858d94fc781b68c1f27a0ca9 Mon Sep 17 00:00:00 2001
From: Andrey Konovalov <andreyknvl@google.com>
Date: Mon, 20 Nov 2023 18:47:14 +0100
Subject: lib/stackdepot: add refcount for records

Add a reference counter for how many times a stack records has been
  added to stack depot.

Add a new STACK_DEPOT_FLAG_GET flag to stack_depot_save_flags that
  instructs the stack depot to increment the refcount.

Do not yet decrement the refcount; this is implemented in one of the
  following patches.

Do not yet enable any users to use the flag to avoid overflowing the
  refcount.

This is preparatory patch for implementing the eviction of stack records
  from the stack depot.

Link: https://lkml.kernel.org/r/a3fc14a2359d019d2a008d4ff8b46a665371ffee.1700502145.git.andreyknvl@google.com
Signed-off-by: Andrey Konovalov <andreyknvl@google.com>
Reviewed-by: Alexander Potapenko <glider@google.com>
Cc: Dmitry Vyukov <dvyukov@google.com>
Cc: Evgenii Stepanov <eugenis@google.com>
Cc: Marco Elver <elver@google.com>
Cc: Oscar Salvador <osalvador@suse.de>
Cc: Vlastimil Babka <vbabka@suse.cz>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/stackdepot.h | 13 ++++++++++---
 lib/stackdepot.c           | 12 ++++++++++--
 2 files changed, 20 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/stackdepot.h b/include/linux/stackdepot.h
index 0b262e14144e..611716702d73 100644
--- a/include/linux/stackdepot.h
+++ b/include/linux/stackdepot.h
@@ -39,8 +39,9 @@ typedef u32 depot_flags_t;
  * to its declaration for more details.
  */
 #define STACK_DEPOT_FLAG_CAN_ALLOC	((depot_flags_t)0x0001)
+#define STACK_DEPOT_FLAG_GET		((depot_flags_t)0x0002)
 
-#define STACK_DEPOT_FLAGS_NUM	1
+#define STACK_DEPOT_FLAGS_NUM	2
 #define STACK_DEPOT_FLAGS_MASK	((depot_flags_t)((1 << STACK_DEPOT_FLAGS_NUM) - 1))
 
 /*
@@ -94,6 +95,9 @@ static inline int stack_depot_early_init(void)	{ return 0; }
  * flags of @alloc_flags). Otherwise, stack depot avoids any allocations and
  * fails if no space is left to store the stack trace.
  *
+ * If STACK_DEPOT_FLAG_GET is set in @depot_flags, stack depot will increment
+ * the refcount on the saved stack trace if it already exists in stack depot.
+ *
  * If the provided stack trace comes from the interrupt context, only the part
  * up to the interrupt entry is saved.
  *
@@ -116,8 +120,11 @@ depot_stack_handle_t stack_depot_save_flags(unsigned long *entries,
  * @nr_entries:		Number of frames in the stack
  * @alloc_flags:	Allocation GFP flags
  *
- * Context: Contexts where allocations via alloc_pages() are allowed.
- *          See stack_depot_save_flags() for more details.
+ * Does not increment the refcount on the saved stack trace; see
+ * stack_depot_save_flags() for more details.
+ *
+ * Context: Contexts where allocations via alloc_pages() are allowed;
+ *          see stack_depot_save_flags() for more details.
  *
  * Return: Handle of the stack trace stored in depot, 0 on failure
  */
diff --git a/lib/stackdepot.c b/lib/stackdepot.c
index 59d61d5c09a7..911dee11bf39 100644
--- a/lib/stackdepot.c
+++ b/lib/stackdepot.c
@@ -23,6 +23,7 @@
 #include <linux/mutex.h>
 #include <linux/percpu.h>
 #include <linux/printk.h>
+#include <linux/refcount.h>
 #include <linux/slab.h>
 #include <linux/spinlock.h>
 #include <linux/stacktrace.h>
@@ -60,6 +61,7 @@ struct stack_record {
 	u32 hash;			/* Hash in hash table */
 	u32 size;			/* Number of stored frames */
 	union handle_parts handle;
+	refcount_t count;
 	unsigned long entries[CONFIG_STACKDEPOT_MAX_FRAMES];	/* Frames */
 };
 
@@ -373,6 +375,7 @@ depot_alloc_stack(unsigned long *entries, int size, u32 hash, void **prealloc)
 	stack->hash = hash;
 	stack->size = size;
 	/* stack->handle is already filled in by depot_init_pool(). */
+	refcount_set(&stack->count, 1);
 	memcpy(stack->entries, entries, flex_array_size(stack, entries, size));
 
 	/*
@@ -489,6 +492,8 @@ depot_stack_handle_t stack_depot_save_flags(unsigned long *entries,
 	/* Fast path: look the stack trace up without full locking. */
 	found = find_stack(bucket, entries, nr_entries, hash);
 	if (found) {
+		if (depot_flags & STACK_DEPOT_FLAG_GET)
+			refcount_inc(&found->count);
 		read_unlock_irqrestore(&pool_rwlock, flags);
 		goto exit;
 	}
@@ -528,12 +533,15 @@ depot_stack_handle_t stack_depot_save_flags(unsigned long *entries,
 			list_add(&new->list, bucket);
 			found = new;
 		}
-	} else if (prealloc) {
+	} else {
+		if (depot_flags & STACK_DEPOT_FLAG_GET)
+			refcount_inc(&found->count);
 		/*
 		 * Stack depot already contains this stack trace, but let's
 		 * keep the preallocated memory for future.
 		 */
-		depot_keep_new_pool(&prealloc);
+		if (prealloc)
+			depot_keep_new_pool(&prealloc);
 	}
 
 	write_unlock_irqrestore(&pool_rwlock, flags);
-- 
cgit v1.2.3


From 108be8def46e9422f5a5abc96b0ab8fb6b3fb344 Mon Sep 17 00:00:00 2001
From: Andrey Konovalov <andreyknvl@google.com>
Date: Mon, 20 Nov 2023 18:47:15 +0100
Subject: lib/stackdepot: allow users to evict stack traces

Add stack_depot_put, a function that decrements the reference counter on a
stack record and removes it from the stack depot once the counter reaches
0.

Internally, when removing a stack record, the function unlinks it from the
hash table bucket and returns to the freelist.

With this change, the users of stack depot can call stack_depot_put when
keeping a stack trace in the stack depot is not needed anymore.  This
allows avoiding polluting the stack depot with irrelevant stack traces and
thus have more space to store the relevant ones before the stack depot
reaches its capacity.

Link: https://lkml.kernel.org/r/1d1ad5692ee43d4fc2b3fd9d221331d30b36123f.1700502145.git.andreyknvl@google.com
Signed-off-by: Andrey Konovalov <andreyknvl@google.com>
Cc: Alexander Potapenko <glider@google.com>
Cc: Dmitry Vyukov <dvyukov@google.com>
Cc: Evgenii Stepanov <eugenis@google.com>
Cc: Marco Elver <elver@google.com>
Cc: Oscar Salvador <osalvador@suse.de>
Cc: Vlastimil Babka <vbabka@suse.cz>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/stackdepot.h | 14 ++++++++++++++
 lib/stackdepot.c           | 37 ++++++++++++++++++++++++++++++++++++-
 2 files changed, 50 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/stackdepot.h b/include/linux/stackdepot.h
index 611716702d73..a6796f178913 100644
--- a/include/linux/stackdepot.h
+++ b/include/linux/stackdepot.h
@@ -97,6 +97,8 @@ static inline int stack_depot_early_init(void)	{ return 0; }
  *
  * If STACK_DEPOT_FLAG_GET is set in @depot_flags, stack depot will increment
  * the refcount on the saved stack trace if it already exists in stack depot.
+ * Users of this flag must also call stack_depot_put() when keeping the stack
+ * trace is no longer required to avoid overflowing the refcount.
  *
  * If the provided stack trace comes from the interrupt context, only the part
  * up to the interrupt entry is saved.
@@ -162,6 +164,18 @@ void stack_depot_print(depot_stack_handle_t stack);
 int stack_depot_snprint(depot_stack_handle_t handle, char *buf, size_t size,
 		       int spaces);
 
+/**
+ * stack_depot_put - Drop a reference to a stack trace from stack depot
+ *
+ * @handle:	Stack depot handle returned from stack_depot_save()
+ *
+ * The stack trace is evicted from stack depot once all references to it have
+ * been dropped (once the number of stack_depot_evict() calls matches the
+ * number of stack_depot_save_flags() calls with STACK_DEPOT_FLAG_GET set for
+ * this stack trace).
+ */
+void stack_depot_put(depot_stack_handle_t handle);
+
 /**
  * stack_depot_set_extra_bits - Set extra bits in a stack depot handle
  *
diff --git a/lib/stackdepot.c b/lib/stackdepot.c
index 911dee11bf39..c1b31160f4b4 100644
--- a/lib/stackdepot.c
+++ b/lib/stackdepot.c
@@ -394,7 +394,7 @@ static struct stack_record *depot_fetch_stack(depot_stack_handle_t handle)
 	size_t offset = parts.offset << DEPOT_STACK_ALIGN;
 	struct stack_record *stack;
 
-	lockdep_assert_held_read(&pool_rwlock);
+	lockdep_assert_held(&pool_rwlock);
 
 	if (parts.pool_index > pools_num) {
 		WARN(1, "pool index %d out of bounds (%d) for stack id %08x\n",
@@ -410,6 +410,14 @@ static struct stack_record *depot_fetch_stack(depot_stack_handle_t handle)
 	return stack;
 }
 
+/* Links stack into the freelist. */
+static void depot_free_stack(struct stack_record *stack)
+{
+	lockdep_assert_held_write(&pool_rwlock);
+
+	list_add(&stack->list, &free_stacks);
+}
+
 /* Calculates the hash for a stack. */
 static inline u32 hash_stack(unsigned long *entries, unsigned int size)
 {
@@ -592,6 +600,33 @@ unsigned int stack_depot_fetch(depot_stack_handle_t handle,
 }
 EXPORT_SYMBOL_GPL(stack_depot_fetch);
 
+void stack_depot_put(depot_stack_handle_t handle)
+{
+	struct stack_record *stack;
+	unsigned long flags;
+
+	if (!handle || stack_depot_disabled)
+		return;
+
+	write_lock_irqsave(&pool_rwlock, flags);
+
+	stack = depot_fetch_stack(handle);
+	if (WARN_ON(!stack))
+		goto out;
+
+	if (refcount_dec_and_test(&stack->count)) {
+		/* Unlink stack from the hash table. */
+		list_del(&stack->list);
+
+		/* Free stack. */
+		depot_free_stack(stack);
+	}
+
+out:
+	write_unlock_irqrestore(&pool_rwlock, flags);
+}
+EXPORT_SYMBOL_GPL(stack_depot_put);
+
 void stack_depot_print(depot_stack_handle_t stack)
 {
 	unsigned long *entries;
-- 
cgit v1.2.3


From 95a2ac937013cc3aaaea02abcdd167b96874548d Mon Sep 17 00:00:00 2001
From: Sumanth Korikkar <sumanthk@linux.ibm.com>
Date: Mon, 20 Nov 2023 15:53:54 +0100
Subject: mm: use vmem_altmap code without CONFIG_ZONE_DEVICE

vmem_altmap_free() and vmem_altmap_offset() could be utlized without
CONFIG_ZONE_DEVICE enabled.  For example,
mm/memory_hotplug.c:__add_pages() relies on that.  The altmap is no longer
restricted to ZONE_DEVICE handling, but instead depends on
CONFIG_SPARSEMEM_VMEMMAP.

When CONFIG_SPARSEMEM_VMEMMAP is disabled, these functions are defined as
inline stubs, ensuring compatibility with configurations that do not use
sparsemem vmemmap.  Without it, lkp reported the following:

ld: arch/x86/mm/init_64.o: in function `remove_pagetable':
init_64.c:(.meminit.text+0xfc7): undefined reference to
`vmem_altmap_free'

Link: https://lkml.kernel.org/r/20231120145354.308999-4-sumanthk@linux.ibm.com
Signed-off-by: Sumanth Korikkar <sumanthk@linux.ibm.com>
Reported-by: kernel test robot <lkp@intel.com>
Closes: https://lore.kernel.org/oe-kbuild-all/202311180545.VeyRXEDq-lkp@intel.com/
Reviewed-by: Gerald Schaefer <gerald.schaefer@linux.ibm.com>
Acked-by: David Hildenbrand <david@redhat.com>
Cc: Alexander Gordeev <agordeev@linux.ibm.com>
Cc: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
Cc: Anshuman Khandual <anshuman.khandual@arm.com>
Cc: Heiko Carstens <hca@linux.ibm.com>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Oscar Salvador <osalvador@suse.de>
Cc: Vasily Gorbik <gor@linux.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/memremap.h | 12 ------------
 include/linux/mm.h       | 26 ++++++++++++++++++++++++++
 mm/memremap.c            | 14 +-------------
 3 files changed, 27 insertions(+), 25 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/memremap.h b/include/linux/memremap.h
index 1314d9c5f05b..744c830f4b13 100644
--- a/include/linux/memremap.h
+++ b/include/linux/memremap.h
@@ -196,8 +196,6 @@ struct dev_pagemap *get_dev_pagemap(unsigned long pfn,
 		struct dev_pagemap *pgmap);
 bool pgmap_pfn_valid(struct dev_pagemap *pgmap, unsigned long pfn);
 
-unsigned long vmem_altmap_offset(struct vmem_altmap *altmap);
-void vmem_altmap_free(struct vmem_altmap *altmap, unsigned long nr_pfns);
 unsigned long memremap_compat_align(void);
 #else
 static inline void *devm_memremap_pages(struct device *dev,
@@ -228,16 +226,6 @@ static inline bool pgmap_pfn_valid(struct dev_pagemap *pgmap, unsigned long pfn)
 	return false;
 }
 
-static inline unsigned long vmem_altmap_offset(struct vmem_altmap *altmap)
-{
-	return 0;
-}
-
-static inline void vmem_altmap_free(struct vmem_altmap *altmap,
-		unsigned long nr_pfns)
-{
-}
-
 /* when memremap_pages() is disabled all archs can remap a single page */
 static inline unsigned long memremap_compat_align(void)
 {
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 13a090271716..a422cc123a2d 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -3871,6 +3871,32 @@ void vmemmap_free(unsigned long start, unsigned long end,
 		struct vmem_altmap *altmap);
 #endif
 
+#ifdef CONFIG_SPARSEMEM_VMEMMAP
+static inline unsigned long vmem_altmap_offset(struct vmem_altmap *altmap)
+{
+	/* number of pfns from base where pfn_to_page() is valid */
+	if (altmap)
+		return altmap->reserve + altmap->free;
+	return 0;
+}
+
+static inline void vmem_altmap_free(struct vmem_altmap *altmap,
+				    unsigned long nr_pfns)
+{
+	altmap->alloc -= nr_pfns;
+}
+#else
+static inline unsigned long vmem_altmap_offset(struct vmem_altmap *altmap)
+{
+	return 0;
+}
+
+static inline void vmem_altmap_free(struct vmem_altmap *altmap,
+				    unsigned long nr_pfns)
+{
+}
+#endif
+
 #define VMEMMAP_RESERVE_NR	2
 #ifdef CONFIG_ARCH_WANT_OPTIMIZE_DAX_VMEMMAP
 static inline bool __vmemmap_can_optimize(struct vmem_altmap *altmap,
diff --git a/mm/memremap.c b/mm/memremap.c
index bee85560a243..9531faa92a7c 100644
--- a/mm/memremap.c
+++ b/mm/memremap.c
@@ -7,6 +7,7 @@
 #include <linux/memremap.h>
 #include <linux/pfn_t.h>
 #include <linux/swap.h>
+#include <linux/mm.h>
 #include <linux/mmzone.h>
 #include <linux/swapops.h>
 #include <linux/types.h>
@@ -422,19 +423,6 @@ void devm_memunmap_pages(struct device *dev, struct dev_pagemap *pgmap)
 }
 EXPORT_SYMBOL_GPL(devm_memunmap_pages);
 
-unsigned long vmem_altmap_offset(struct vmem_altmap *altmap)
-{
-	/* number of pfns from base where pfn_to_page() is valid */
-	if (altmap)
-		return altmap->reserve + altmap->free;
-	return 0;
-}
-
-void vmem_altmap_free(struct vmem_altmap *altmap, unsigned long nr_pfns)
-{
-	altmap->alloc -= nr_pfns;
-}
-
 /**
  * get_dev_pagemap() - take a new live reference on the dev_pagemap for @pfn
  * @pfn: page frame number to lookup page_map
-- 
cgit v1.2.3


From 38ca8a185389716e9f7566bce4bb0085f71da61d Mon Sep 17 00:00:00 2001
From: Alexander Gordeev <agordeev@linux.ibm.com>
Date: Tue, 21 Nov 2023 20:43:49 +0100
Subject: pgtable: fix s390 ptdesc field comments

Patch series "minor ptdesc updates", v3.


This patch (of 2):

Since commit d08d4e7cd6bf ("s390/mm: use full 4KB page for 2KB PTE") there
is no fragmented page tracking on s390.  Fix the corresponding comments.

Link: https://lkml.kernel.org/r/cover.1700594815.git.agordeev@linux.ibm.com
Link: https://lkml.kernel.org/r/2eead241f3a45bed26c7911cf66bded1e35670b8.1700594815.git.agordeev@linux.ibm.com
Signed-off-by: Alexander Gordeev <agordeev@linux.ibm.com>
Suggested-by: Heiko Carstens <hca@linux.ibm.com>
Cc: Gerald Schaefer <gerald.schaefer@linux.ibm.com>
Cc: Vishal Moola (Oracle) <vishal.moola@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/mm_types.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 957ce38768b2..fbec64036baa 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -401,11 +401,11 @@ FOLIO_MATCH(compound_head, _head_2a);
  * @pmd_huge_pte:     Protected by ptdesc->ptl, used for THPs.
  * @__page_mapping:   Aliases with page->mapping. Unused for page tables.
  * @pt_mm:            Used for x86 pgds.
- * @pt_frag_refcount: For fragmented page table tracking. Powerpc and s390 only.
+ * @pt_frag_refcount: For fragmented page table tracking. Powerpc only.
  * @_pt_pad_2:        Padding to ensure proper alignment.
  * @ptl:              Lock for the page table.
  * @__page_type:      Same as page->page_type. Unused for page tables.
- * @_refcount:        Same as page refcount. Used for s390 page tables.
+ * @_refcount:        Same as page refcount.
  * @pt_memcg_data:    Memcg data. Tracked for page tables here.
  *
  * This struct overlays struct page for now. Do not modify without a good
-- 
cgit v1.2.3


From f7dd74ac239aad5ef7575ea03c45fd7956e00285 Mon Sep 17 00:00:00 2001
From: Alexander Gordeev <agordeev@linux.ibm.com>
Date: Tue, 21 Nov 2023 20:43:50 +0100
Subject: pgtable: rename ptdesc _refcount field to __page_refcount

Rename ptdesc _refcount field to __page_refcount similar to the other
unused page fields.

Link: https://lkml.kernel.org/r/982bdc652ba79a606c3d01c905766e7e076b3315.1700594815.git.agordeev@linux.ibm.com
Signed-off-by: Alexander Gordeev <agordeev@linux.ibm.com>
Suggested-by: Vishal Moola <vishal.moola@gmail.com>
Cc: Gerald Schaefer <gerald.schaefer@linux.ibm.com>
Cc: Heiko Carstens <hca@linux.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/mm_types.h | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index fbec64036baa..ef18d2b25378 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -405,7 +405,7 @@ FOLIO_MATCH(compound_head, _head_2a);
  * @_pt_pad_2:        Padding to ensure proper alignment.
  * @ptl:              Lock for the page table.
  * @__page_type:      Same as page->page_type. Unused for page tables.
- * @_refcount:        Same as page refcount.
+ * @__page_refcount:  Same as page refcount.
  * @pt_memcg_data:    Memcg data. Tracked for page tables here.
  *
  * This struct overlays struct page for now. Do not modify without a good
@@ -438,7 +438,7 @@ struct ptdesc {
 #endif
 	};
 	unsigned int __page_type;
-	atomic_t _refcount;
+	atomic_t __page_refcount;
 #ifdef CONFIG_MEMCG
 	unsigned long pt_memcg_data;
 #endif
@@ -452,7 +452,7 @@ TABLE_MATCH(compound_head, _pt_pad_1);
 TABLE_MATCH(mapping, __page_mapping);
 TABLE_MATCH(rcu_head, pt_rcu_head);
 TABLE_MATCH(page_type, __page_type);
-TABLE_MATCH(_refcount, _refcount);
+TABLE_MATCH(_refcount, __page_refcount);
 #ifdef CONFIG_MEMCG
 TABLE_MATCH(memcg_data, pt_memcg_data);
 #endif
-- 
cgit v1.2.3


From 7679e14098c9c3c8118a7130d6e1e9cfe2565c04 Mon Sep 17 00:00:00 2001
From: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Date: Thu, 23 Nov 2023 19:23:17 +0200
Subject: mm: list_lru: Update kernel documentation to follow the requirements

kernel-doc is not happy about documentation in list_lru.h:

list_lru.h:90: warning: Function parameter or member 'lru' not described in 'list_lru_add'
list_lru.h:90: warning: Excess function parameter 'list_lru' description in 'list_lru_add'
list_lru.h:90: warning: No description found for return value of 'list_lru_add'
list_lru.h:103: warning: Function parameter or member 'lru' not described in 'list_lru_del'
list_lru.h:103: warning: Excess function parameter 'list_lru' description in 'list_lru_del'
list_lru.h:103: warning: No description found for return value of 'list_lru_del'
list_lru.h:116: warning: No description found for return value of 'list_lru_count_one'
list_lru.h:168: warning: No description found for return value of 'list_lru_walk_one'
list_lru.h:185: warning: No description found for return value of 'list_lru_walk_one_irq'

Fix the documentation accordingly.

While at it, fix the references to the parameters in functions
inside the long descriptions, on which the above script is not
complaining (yet?).

Link: https://lkml.kernel.org/r/20231123172320.2434780-1-andriy.shevchenko@linux.intel.com
Signed-off-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Cc: Rasmus Villemoes <linux@rasmusvillemoes.dk>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/list_lru.h | 36 +++++++++++++++++++-----------------
 1 file changed, 19 insertions(+), 17 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/list_lru.h b/include/linux/list_lru.h
index b35968ee9fb5..db86ad78d428 100644
--- a/include/linux/list_lru.h
+++ b/include/linux/list_lru.h
@@ -73,7 +73,7 @@ void memcg_reparent_list_lrus(struct mem_cgroup *memcg, struct mem_cgroup *paren
 
 /**
  * list_lru_add: add an element to the lru list's tail
- * @list_lru: the lru pointer
+ * @lru: the lru pointer
  * @item: the item to be added.
  *
  * If the element is already part of a list, this function returns doing
@@ -83,22 +83,22 @@ void memcg_reparent_list_lrus(struct mem_cgroup *memcg, struct mem_cgroup *paren
  * the caller organize itself in a way that elements can be in more than
  * one type of list, it is up to the caller to fully remove the item from
  * the previous list (with list_lru_del() for instance) before moving it
- * to @list_lru
+ * to @lru.
  *
- * Return value: true if the list was updated, false otherwise
+ * Return: true if the list was updated, false otherwise
  */
 bool list_lru_add(struct list_lru *lru, struct list_head *item);
 
 /**
  * list_lru_del: delete an element to the lru list
- * @list_lru: the lru pointer
+ * @lru: the lru pointer
  * @item: the item to be deleted.
  *
- * This function works analogously as list_lru_add in terms of list
+ * This function works analogously as list_lru_add() in terms of list
  * manipulation. The comments about an element already pertaining to
- * a list are also valid for list_lru_del.
+ * a list are also valid for list_lru_del().
  *
- * Return value: true if the list was updated, false otherwise
+ * Return: true if the list was updated, false otherwise
  */
 bool list_lru_del(struct list_lru *lru, struct list_head *item);
 
@@ -108,9 +108,11 @@ bool list_lru_del(struct list_lru *lru, struct list_head *item);
  * @nid: the node id to count from.
  * @memcg: the cgroup to count from.
  *
- * Always return a non-negative number, 0 for empty lists. There is no
- * guarantee that the list is not updated while the count is being computed.
- * Callers that want such a guarantee need to provide an outer lock.
+ * There is no guarantee that the list is not updated while the count is being
+ * computed. Callers that want such a guarantee need to provide an outer lock.
+ *
+ * Return: 0 for empty lists, otherwise the number of objects
+ * currently held by @lru.
  */
 unsigned long list_lru_count_one(struct list_lru *lru,
 				 int nid, struct mem_cgroup *memcg);
@@ -141,7 +143,7 @@ typedef enum lru_status (*list_lru_walk_cb)(struct list_head *item,
 		struct list_lru_one *list, spinlock_t *lock, void *cb_arg);
 
 /**
- * list_lru_walk_one: walk a list_lru, isolating and disposing freeable items.
+ * list_lru_walk_one: walk a @lru, isolating and disposing freeable items.
  * @lru: the lru pointer.
  * @nid: the node id to scan from.
  * @memcg: the cgroup to scan from.
@@ -150,24 +152,24 @@ typedef enum lru_status (*list_lru_walk_cb)(struct list_head *item,
  * @cb_arg: opaque type that will be passed to @isolate
  * @nr_to_walk: how many items to scan.
  *
- * This function will scan all elements in a particular list_lru, calling the
+ * This function will scan all elements in a particular @lru, calling the
  * @isolate callback for each of those items, along with the current list
  * spinlock and a caller-provided opaque. The @isolate callback can choose to
  * drop the lock internally, but *must* return with the lock held. The callback
- * will return an enum lru_status telling the list_lru infrastructure what to
+ * will return an enum lru_status telling the @lru infrastructure what to
  * do with the object being scanned.
  *
- * Please note that nr_to_walk does not mean how many objects will be freed,
+ * Please note that @nr_to_walk does not mean how many objects will be freed,
  * just how many objects will be scanned.
  *
- * Return value: the number of objects effectively removed from the LRU.
+ * Return: the number of objects effectively removed from the LRU.
  */
 unsigned long list_lru_walk_one(struct list_lru *lru,
 				int nid, struct mem_cgroup *memcg,
 				list_lru_walk_cb isolate, void *cb_arg,
 				unsigned long *nr_to_walk);
 /**
- * list_lru_walk_one_irq: walk a list_lru, isolating and disposing freeable items.
+ * list_lru_walk_one_irq: walk a @lru, isolating and disposing freeable items.
  * @lru: the lru pointer.
  * @nid: the node id to scan from.
  * @memcg: the cgroup to scan from.
@@ -176,7 +178,7 @@ unsigned long list_lru_walk_one(struct list_lru *lru,
  * @cb_arg: opaque type that will be passed to @isolate
  * @nr_to_walk: how many items to scan.
  *
- * Same as @list_lru_walk_one except that the spinlock is acquired with
+ * Same as list_lru_walk_one() except that the spinlock is acquired with
  * spin_lock_irq().
  */
 unsigned long list_lru_walk_one_irq(struct list_lru *lru,
-- 
cgit v1.2.3


From 61a7a5e25fe79b6c43f1c49705a0294be113c4a5 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@redhat.com>
Date: Mon, 30 Oct 2023 16:57:10 +0100
Subject: introduce for_other_threads(p, t)

Cosmetic, but imho it makes the usage look more clear and simple, the new
helper doesn't require to initialize "t".

After this change while_each_thread() has only 3 users, and it is only
used in the do/while loops.

Link: https://lkml.kernel.org/r/20231030155710.GA9095@redhat.com
Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Reviewed-by: Christian Brauner <brauner@kernel.org>
Cc: "Eric W. Biederman" <ebiederm@xmission.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 fs/exec.c                    |  3 +--
 include/linux/sched/signal.h |  3 +++
 kernel/signal.c              | 11 ++++-------
 3 files changed, 8 insertions(+), 9 deletions(-)

(limited to 'include/linux')

diff --git a/fs/exec.c b/fs/exec.c
index 4aa19b24f281..ee43597cb453 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -1578,11 +1578,10 @@ static void check_unsafe_exec(struct linux_binprm *bprm)
 	 * will be able to manipulate the current directory, etc.
 	 * It would be nice to force an unshare instead...
 	 */
-	t = p;
 	n_fs = 1;
 	spin_lock(&p->fs->lock);
 	rcu_read_lock();
-	while_each_thread(p, t) {
+	for_other_threads(p, t) {
 		if (t->fs == p->fs)
 			n_fs++;
 	}
diff --git a/include/linux/sched/signal.h b/include/linux/sched/signal.h
index 3499c1a8b929..41d6759d6a4a 100644
--- a/include/linux/sched/signal.h
+++ b/include/linux/sched/signal.h
@@ -646,6 +646,9 @@ extern bool current_is_single_threaded(void);
 #define while_each_thread(g, t) \
 	while ((t = next_thread(t)) != g)
 
+#define for_other_threads(p, t)	\
+	for (t = p; (t = next_thread(t)) != p; )
+
 #define __for_each_thread(signal, t)	\
 	list_for_each_entry_rcu(t, &(signal)->thread_head, thread_node, \
 		lockdep_is_held(&tasklist_lock))
diff --git a/kernel/signal.c b/kernel/signal.c
index 47a7602dfe8d..5aa216e841a2 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -1376,12 +1376,12 @@ int force_sig_info(struct kernel_siginfo *info)
  */
 int zap_other_threads(struct task_struct *p)
 {
-	struct task_struct *t = p;
+	struct task_struct *t;
 	int count = 0;
 
 	p->signal->group_stop_count = 0;
 
-	while_each_thread(p, t) {
+	for_other_threads(p, t) {
 		task_clear_jobctl_pending(t, JOBCTL_PENDING_MASK);
 		/* Don't require de_thread to wait for the vhost_worker */
 		if ((t->flags & (PF_IO_WORKER | PF_USER_WORKER)) != PF_USER_WORKER)
@@ -2465,12 +2465,10 @@ static bool do_signal_stop(int signr)
 			sig->group_exit_code = signr;
 
 		sig->group_stop_count = 0;
-
 		if (task_set_jobctl_pending(current, signr | gstop))
 			sig->group_stop_count++;
 
-		t = current;
-		while_each_thread(current, t) {
+		for_other_threads(current, t) {
 			/*
 			 * Setting state to TASK_STOPPED for a group
 			 * stop is always done with the siglock held,
@@ -2966,8 +2964,7 @@ static void retarget_shared_pending(struct task_struct *tsk, sigset_t *which)
 	if (sigisemptyset(&retarget))
 		return;
 
-	t = tsk;
-	while_each_thread(tsk, t) {
+	for_other_threads(tsk, t) {
 		if (t->flags & PF_EXITING)
 			continue;
 
-- 
cgit v1.2.3


From 0eb5085c38749f2a91e5bd8cbebb1ebf3398343c Mon Sep 17 00:00:00 2001
From: Heiko Carstens <hca@linux.ibm.com>
Date: Thu, 16 Nov 2023 14:36:38 +0100
Subject: arch: remove ARCH_TASK_STRUCT_ON_STACK

IA-64 was the only architecture which selected ARCH_TASK_STRUCT_ON_STACK.
IA-64 was removed with commit cf8e8658100d ("arch: Remove Itanium (IA-64)
architecture"). Therefore remove support for ARCH_TASK_STRUCT_ON_STACK
as well.

Note: this also reveals a potential bug in powerpc code, which makes use of
__init_task_data without selecting ARCH_TASK_STRUCT_ON_STACK which makes
__init_task_data a no-op. This is broken since commit d11ed3ab3166 ("Expand
INIT_TASK() in init/init_task.c and remove") from 2018 and needs to be
addressed separately.

Link: https://lkml.kernel.org/r/20231116133638.1636277-4-hca@linux.ibm.com
Signed-off-by: Heiko Carstens <hca@linux.ibm.com>
Reviewed-by: Arnd Bergmann <arnd@arndb.de>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: Nicholas Piggin <npiggin@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 arch/Kconfig                 |  4 ----
 arch/powerpc/kexec/core_64.c |  3 +--
 include/linux/init_task.h    |  7 -------
 include/linux/sched.h        |  2 --
 init/init_task.c             | 10 ++--------
 5 files changed, 3 insertions(+), 23 deletions(-)

(limited to 'include/linux')

diff --git a/arch/Kconfig b/arch/Kconfig
index c2f87ef9f0ae..bfcc7c2dc039 100644
--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -301,10 +301,6 @@ config ARCH_HAS_DMA_CLEAR_UNCACHED
 config ARCH_HAS_CPU_FINALIZE_INIT
 	bool
 
-# Select if arch init_task must go in the __init_task_data section
-config ARCH_TASK_STRUCT_ON_STACK
-	bool
-
 config HAVE_ARCH_THREAD_STRUCT_WHITELIST
 	bool
 	help
diff --git a/arch/powerpc/kexec/core_64.c b/arch/powerpc/kexec/core_64.c
index 0bee7ca9a77c..762e4d09aacf 100644
--- a/arch/powerpc/kexec/core_64.c
+++ b/arch/powerpc/kexec/core_64.c
@@ -283,8 +283,7 @@ static void kexec_prepare_cpus(void)
  * We could use a smaller stack if we don't care about anything using
  * current, but that audit has not been performed.
  */
-static union thread_union kexec_stack __init_task_data =
-	{ };
+static union thread_union kexec_stack = { };
 
 /*
  * For similar reasons to the stack above, the kexecing CPU needs to be on a
diff --git a/include/linux/init_task.h b/include/linux/init_task.h
index 40fc5813cf93..bccb3f1f6262 100644
--- a/include/linux/init_task.h
+++ b/include/linux/init_task.h
@@ -37,13 +37,6 @@ extern struct cred init_cred;
 
 #define INIT_TASK_COMM "swapper"
 
-/* Attach to the init_task data structure for proper alignment */
-#ifdef CONFIG_ARCH_TASK_STRUCT_ON_STACK
-#define __init_task_data __section(".data..init_task")
-#else
-#define __init_task_data /**/
-#endif
-
 /* Attach to the thread_info data structure for proper alignment */
 #define __init_thread_info __section(".data..init_thread_info")
 
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 292c31697248..c2ecb2e06046 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1955,9 +1955,7 @@ extern void ia64_set_curr_task(int cpu, struct task_struct *p);
 void yield(void);
 
 union thread_union {
-#ifndef CONFIG_ARCH_TASK_STRUCT_ON_STACK
 	struct task_struct task;
-#endif
 #ifndef CONFIG_THREAD_INFO_IN_TASK
 	struct thread_info thread_info;
 #endif
diff --git a/init/init_task.c b/init/init_task.c
index 5727d42149c3..6f6485d554df 100644
--- a/init/init_task.c
+++ b/init/init_task.c
@@ -51,8 +51,7 @@ static struct sighand_struct init_sighand = {
 };
 
 #ifdef CONFIG_SHADOW_CALL_STACK
-unsigned long init_shadow_call_stack[SCS_SIZE / sizeof(long)]
-		__init_task_data = {
+unsigned long init_shadow_call_stack[SCS_SIZE / sizeof(long)] = {
 	[(SCS_SIZE / sizeof(long)) - 1] = SCS_END_MAGIC
 };
 #endif
@@ -61,12 +60,7 @@ unsigned long init_shadow_call_stack[SCS_SIZE / sizeof(long)]
  * Set up the first task table, touch at your own risk!. Base=0,
  * limit=0x1fffff (=2MB)
  */
-struct task_struct init_task
-#ifdef CONFIG_ARCH_TASK_STRUCT_ON_STACK
-	__init_task_data
-#endif
-	__aligned(L1_CACHE_BYTES)
-= {
+struct task_struct init_task __aligned(L1_CACHE_BYTES) = {
 #ifdef CONFIG_THREAD_INFO_IN_TASK
 	.thread_info	= INIT_THREAD_INFO(init_task),
 	.stack_refcount	= REFCOUNT_INIT(1),
-- 
cgit v1.2.3


From b454ec29225cda9ae85ed0a154f4228f1922c872 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@redhat.com>
Date: Mon, 20 Nov 2023 16:16:49 +0100
Subject: kernel/signal.c: simplify force_sig_info_to_task(), kill
 recalc_sigpending_and_wake()

The purpose of recalc_sigpending_and_wake() is not clear, it looks
"obviously unneeded" because we are going to send the signal which can't
be blocked or ignored.

Add the comment to explain why we can't rely on send_signal_locked() and
make this logic more simple/explicit.  recalc_sigpending_and_wake() has no
other users, it can die.

In fact I think we don't even need signal_wake_up(), the target task must
be either current or a TASK_TRACED child, otherwise the usage of siglock
is not safe.  But this needs another change.

Link: https://lkml.kernel.org/r/20231120151649.GA15995@redhat.com
Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Cc: Eric Biederman <ebiederm@xmission.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/sched/signal.h |  1 -
 kernel/signal.c              | 17 ++++-------------
 2 files changed, 4 insertions(+), 14 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sched/signal.h b/include/linux/sched/signal.h
index 41d6759d6a4a..015c0e3a3e1d 100644
--- a/include/linux/sched/signal.h
+++ b/include/linux/sched/signal.h
@@ -432,7 +432,6 @@ static inline bool fault_signal_pending(vm_fault_t fault_flags,
  * This is required every time the blocked sigset_t changes.
  * callers must hold sighand->siglock.
  */
-extern void recalc_sigpending_and_wake(struct task_struct *t);
 extern void recalc_sigpending(void);
 extern void calculate_sigpending(void);
 
diff --git a/kernel/signal.c b/kernel/signal.c
index 5aa216e841a2..c9c57d053ce4 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -171,16 +171,6 @@ static bool recalc_sigpending_tsk(struct task_struct *t)
 	return false;
 }
 
-/*
- * After recalculating TIF_SIGPENDING, we need to make sure the task wakes up.
- * This is superfluous when called on current, the wakeup is a harmless no-op.
- */
-void recalc_sigpending_and_wake(struct task_struct *t)
-{
-	if (recalc_sigpending_tsk(t))
-		signal_wake_up(t, 0);
-}
-
 void recalc_sigpending(void)
 {
 	if (!recalc_sigpending_tsk(current) && !freezing(current))
@@ -1348,10 +1338,8 @@ force_sig_info_to_task(struct kernel_siginfo *info, struct task_struct *t,
 		action->sa.sa_handler = SIG_DFL;
 		if (handler == HANDLER_EXIT)
 			action->sa.sa_flags |= SA_IMMUTABLE;
-		if (blocked) {
+		if (blocked)
 			sigdelset(&t->blocked, sig);
-			recalc_sigpending_and_wake(t);
-		}
 	}
 	/*
 	 * Don't clear SIGNAL_UNKILLABLE for traced tasks, users won't expect
@@ -1361,6 +1349,9 @@ force_sig_info_to_task(struct kernel_siginfo *info, struct task_struct *t,
 	    (!t->ptrace || (handler == HANDLER_EXIT)))
 		t->signal->flags &= ~SIGNAL_UNKILLABLE;
 	ret = send_signal_locked(sig, info, t, PIDTYPE_PID);
+	/* This can happen if the signal was already pending and blocked */
+	if (!task_sigpending(t))
+		signal_wake_up(t, 0);
 	spin_unlock_irqrestore(&t->sighand->siglock, flags);
 
 	return ret;
-- 
cgit v1.2.3


From 7acf164b259d9007264d9d8501da1023f140a3b4 Mon Sep 17 00:00:00 2001
From: Baoquan He <bhe@redhat.com>
Date: Wed, 15 Nov 2023 21:00:27 +0800
Subject: resource: add walk_system_ram_res_rev()

This function, being a variant of walk_system_ram_res() introduced in
commit 8c86e70acead ("resource: provide new functions to walk through
resources"), walks through a list of all the resources of System RAM in
reversed order, i.e., from higher to lower.

It will be used in kexec_file code to load kernel, initrd etc when
preparing kexec reboot.

Link: https://lkml.kernel.org/r/ZVTA6z/06cLnWKUz@MiWiFi-R3L-srv
Signed-off-by: AKASHI Takahiro <takahiro.akashi@linaro.org>
Signed-off-by: Baoquan He <bhe@redhat.com>
Cc: Eric Biederman <ebiederm@xmission.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/ioport.h |  3 +++
 kernel/resource.c      | 57 ++++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 60 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/ioport.h b/include/linux/ioport.h
index 14f5cfabbbc8..db7fe25f3370 100644
--- a/include/linux/ioport.h
+++ b/include/linux/ioport.h
@@ -331,6 +331,9 @@ extern int
 walk_system_ram_res(u64 start, u64 end, void *arg,
 		    int (*func)(struct resource *, void *));
 extern int
+walk_system_ram_res_rev(u64 start, u64 end, void *arg,
+			int (*func)(struct resource *, void *));
+extern int
 walk_iomem_res_desc(unsigned long desc, unsigned long flags, u64 start, u64 end,
 		    void *arg, int (*func)(struct resource *, void *));
 
diff --git a/kernel/resource.c b/kernel/resource.c
index 866ef3663a0b..e8a244300e5b 100644
--- a/kernel/resource.c
+++ b/kernel/resource.c
@@ -27,6 +27,8 @@
 #include <linux/mount.h>
 #include <linux/resource_ext.h>
 #include <uapi/linux/magic.h>
+#include <linux/string.h>
+#include <linux/vmalloc.h>
 #include <asm/io.h>
 
 
@@ -429,6 +431,61 @@ int walk_system_ram_res(u64 start, u64 end, void *arg,
 				     func);
 }
 
+/*
+ * This function, being a variant of walk_system_ram_res(), calls the @func
+ * callback against all memory ranges of type System RAM which are marked as
+ * IORESOURCE_SYSTEM_RAM and IORESOUCE_BUSY in reversed order, i.e., from
+ * higher to lower.
+ */
+int walk_system_ram_res_rev(u64 start, u64 end, void *arg,
+				int (*func)(struct resource *, void *))
+{
+	struct resource res, *rams;
+	int rams_size = 16, i;
+	unsigned long flags;
+	int ret = -1;
+
+	/* create a list */
+	rams = kvcalloc(rams_size, sizeof(struct resource), GFP_KERNEL);
+	if (!rams)
+		return ret;
+
+	flags = IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY;
+	i = 0;
+	while ((start < end) &&
+		(!find_next_iomem_res(start, end, flags, IORES_DESC_NONE, &res))) {
+		if (i >= rams_size) {
+			/* re-alloc */
+			struct resource *rams_new;
+
+			rams_new = kvrealloc(rams, rams_size * sizeof(struct resource),
+					     (rams_size + 16) * sizeof(struct resource),
+					     GFP_KERNEL);
+			if (!rams_new)
+				goto out;
+
+			rams = rams_new;
+			rams_size += 16;
+		}
+
+		rams[i].start = res.start;
+		rams[i++].end = res.end;
+
+		start = res.end + 1;
+	}
+
+	/* go reverse */
+	for (i--; i >= 0; i--) {
+		ret = (*func)(&rams[i], arg);
+		if (ret)
+			break;
+	}
+
+out:
+	kvfree(rams);
+	return ret;
+}
+
 /*
  * This function calls the @func callback against all memory ranges, which
  * are ranges marked as IORESOURCE_MEM and IORESOUCE_BUSY.
-- 
cgit v1.2.3


From 7973be94724464222ae0b1860a25be04ab7b0132 Mon Sep 17 00:00:00 2001
From: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Date: Fri, 8 Dec 2023 18:52:38 +0200
Subject: clk: x86: lpss-atom: Drop unneeded 'extern' in the header
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

'extern' for the functions is not needed, drop it.

Signed-off-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Reviewed-by: Ilpo Järvinen <ilpo.jarvinen@linux.intel.com>
Link: https://lore.kernel.org/r/20231208165238.3309058-1-andriy.shevchenko@linux.intel.com
Reviewed-by: Hans de Goede <hdegoede@redhat.com>
Signed-off-by: Hans de Goede <hdegoede@redhat.com>
---
 include/linux/platform_data/x86/clk-lpss.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/platform_data/x86/clk-lpss.h b/include/linux/platform_data/x86/clk-lpss.h
index 41df326583f9..7f132029316a 100644
--- a/include/linux/platform_data/x86/clk-lpss.h
+++ b/include/linux/platform_data/x86/clk-lpss.h
@@ -15,6 +15,6 @@ struct lpss_clk_data {
 	struct clk *clk;
 };
 
-extern int lpss_atom_clk_init(void);
+int lpss_atom_clk_init(void);
 
 #endif /* __CLK_LPSS_H */
-- 
cgit v1.2.3


From 72dd14d241e1c6e241fc5b265746c59f306c6aa3 Mon Sep 17 00:00:00 2001
From: Srinivas Pandruvada <srinivas.pandruvada@linux.intel.com>
Date: Mon, 4 Dec 2023 14:17:37 -0800
Subject: platform/x86/intel/tpmi: Modify external interface to get read/write
 state
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Modify the external interface tpmi_get_feature_status() to get read
and write blocked instead of locked and disabled. Since auxiliary device
is not created when disabled, no use of returning disabled state. Also
locked state is not useful as feature driver can't use locked state
in a meaningful way.

Using read and write state, feature driver can decide which operations
to restrict for that feature.

Signed-off-by: Srinivas Pandruvada <srinivas.pandruvada@linux.intel.com>
Reviewed-by: Ilpo Järvinen <ilpo.jarvinen@linux.intel.com>
Link: https://lore.kernel.org/r/20231204221740.3645130-3-srinivas.pandruvada@linux.intel.com
Signed-off-by: Hans de Goede <hdegoede@redhat.com>
---
 drivers/platform/x86/intel/tpmi.c | 8 ++++----
 include/linux/intel_tpmi.h        | 5 ++---
 2 files changed, 6 insertions(+), 7 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/platform/x86/intel/tpmi.c b/drivers/platform/x86/intel/tpmi.c
index 09972e2b8a06..92c5c3a90d53 100644
--- a/drivers/platform/x86/intel/tpmi.c
+++ b/drivers/platform/x86/intel/tpmi.c
@@ -345,8 +345,8 @@ err_unlock:
 	return ret;
 }
 
-int tpmi_get_feature_status(struct auxiliary_device *auxdev, int feature_id,
-			    int *locked, int *disabled)
+int tpmi_get_feature_status(struct auxiliary_device *auxdev,
+			    int feature_id, bool *read_blocked, bool *write_blocked)
 {
 	struct intel_vsec_device *intel_vsec_dev = dev_to_ivdev(auxdev->dev.parent);
 	struct intel_tpmi_info *tpmi_info = auxiliary_get_drvdata(&intel_vsec_dev->auxdev);
@@ -357,8 +357,8 @@ int tpmi_get_feature_status(struct auxiliary_device *auxdev, int feature_id,
 	if (ret)
 		return ret;
 
-	*locked = feature_state.locked;
-	*disabled = !feature_state.enabled;
+	*read_blocked = feature_state.read_blocked;
+	*write_blocked = feature_state.write_blocked;
 
 	return 0;
 }
diff --git a/include/linux/intel_tpmi.h b/include/linux/intel_tpmi.h
index ee07393445f9..4f89c5bd8663 100644
--- a/include/linux/intel_tpmi.h
+++ b/include/linux/intel_tpmi.h
@@ -32,7 +32,6 @@ struct intel_tpmi_plat_info {
 struct intel_tpmi_plat_info *tpmi_get_platform_data(struct auxiliary_device *auxdev);
 struct resource *tpmi_get_resource_at_index(struct auxiliary_device *auxdev, int index);
 int tpmi_get_resource_count(struct auxiliary_device *auxdev);
-
-int tpmi_get_feature_status(struct auxiliary_device *auxdev, int feature_id, int *locked,
-			    int *disabled);
+int tpmi_get_feature_status(struct auxiliary_device *auxdev, int feature_id, bool *read_blocked,
+			    bool *write_blocked);
 #endif
-- 
cgit v1.2.3


From 046d7be6210e7f870e53eb38fd410237e9d1d88f Mon Sep 17 00:00:00 2001
From: Srinivas Pandruvada <srinivas.pandruvada@linux.intel.com>
Date: Mon, 4 Dec 2023 14:17:38 -0800
Subject: platform/x86/intel/tpmi: Move TPMI ID definition
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Move TPMI ID definitions to common include file. In this way other
feature drivers don't have to redefine.

Signed-off-by: Srinivas Pandruvada <srinivas.pandruvada@linux.intel.com>
Reviewed-by: Ilpo Järvinen <ilpo.jarvinen@linux.intel.com>
Link: https://lore.kernel.org/r/20231204221740.3645130-4-srinivas.pandruvada@linux.intel.com
Signed-off-by: Hans de Goede <hdegoede@redhat.com>
---
 drivers/platform/x86/intel/tpmi.c | 13 -------------
 include/linux/intel_tpmi.h        | 13 +++++++++++++
 2 files changed, 13 insertions(+), 13 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/platform/x86/intel/tpmi.c b/drivers/platform/x86/intel/tpmi.c
index 92c5c3a90d53..e73cdea67fff 100644
--- a/drivers/platform/x86/intel/tpmi.c
+++ b/drivers/platform/x86/intel/tpmi.c
@@ -170,19 +170,6 @@ struct tpmi_feature_state {
 	u32 locked:1;
 } __packed;
 
-/*
- * List of supported TMPI IDs.
- * Some TMPI IDs are not used by Linux, so the numbers are not consecutive.
- */
-enum intel_tpmi_id {
-	TPMI_ID_RAPL = 0, /* Running Average Power Limit */
-	TPMI_ID_PEM = 1, /* Power and Perf excursion Monitor */
-	TPMI_ID_UNCORE = 2, /* Uncore Frequency Scaling */
-	TPMI_ID_SST = 5, /* Speed Select Technology */
-	TPMI_CONTROL_ID = 0x80, /* Special ID for getting feature status */
-	TPMI_INFO_ID = 0x81, /* Special ID for PCI BDF and Package ID information */
-};
-
 /*
  * The size from hardware is in u32 units. This size is from a trusted hardware,
  * but better to verify for pre silicon platforms. Set size to 0, when invalid.
diff --git a/include/linux/intel_tpmi.h b/include/linux/intel_tpmi.h
index 4f89c5bd8663..a3529b962be6 100644
--- a/include/linux/intel_tpmi.h
+++ b/include/linux/intel_tpmi.h
@@ -12,6 +12,19 @@
 #define TPMI_MINOR_VERSION(val)	FIELD_GET(GENMASK(4, 0), val)
 #define TPMI_MAJOR_VERSION(val)	FIELD_GET(GENMASK(7, 5), val)
 
+/*
+ * List of supported TMPI IDs.
+ * Some TMPI IDs are not used by Linux, so the numbers are not consecutive.
+ */
+enum intel_tpmi_id {
+	TPMI_ID_RAPL = 0,	/* Running Average Power Limit */
+	TPMI_ID_PEM = 1,	/* Power and Perf excursion Monitor */
+	TPMI_ID_UNCORE = 2,	/* Uncore Frequency Scaling */
+	TPMI_ID_SST = 5,	/* Speed Select Technology */
+	TPMI_CONTROL_ID = 0x80,	/* Special ID for getting feature status */
+	TPMI_INFO_ID = 0x81,	/* Special ID for PCI BDF and Package ID information */
+};
+
 /**
  * struct intel_tpmi_plat_info - Platform information for a TPMI device instance
  * @package_id:	CPU Package id
-- 
cgit v1.2.3


From 6bb3703aa52c9b5bb9716cbeae7350247b675209 Mon Sep 17 00:00:00 2001
From: Masahisa Kojima <masahisa.kojima@linaro.org>
Date: Tue, 7 Nov 2023 14:40:52 +0900
Subject: efi: expose efivar generic ops register function

This is a preparation for supporting efivar operations provided by other
than efi subsystem.  Both register and unregister functions are exposed
so that non-efi subsystem can revert the efi generic operation.

Acked-by: Sumit Garg <sumit.garg@linaro.org>
Co-developed-by: Ilias Apalodimas <ilias.apalodimas@linaro.org>
Signed-off-by: Ilias Apalodimas <ilias.apalodimas@linaro.org>
Signed-off-by: Masahisa Kojima <masahisa.kojima@linaro.org>
Signed-off-by: Ard Biesheuvel <ardb@kernel.org>
---
 drivers/firmware/efi/efi.c | 12 ++++++++++++
 include/linux/efi.h        |  3 +++
 2 files changed, 15 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/firmware/efi/efi.c b/drivers/firmware/efi/efi.c
index 9d3910d1abe1..32a67c61c3b8 100644
--- a/drivers/firmware/efi/efi.c
+++ b/drivers/firmware/efi/efi.c
@@ -231,6 +231,18 @@ static void generic_ops_unregister(void)
 	efivars_unregister(&generic_efivars);
 }
 
+void efivars_generic_ops_register(void)
+{
+	generic_ops_register();
+}
+EXPORT_SYMBOL_GPL(efivars_generic_ops_register);
+
+void efivars_generic_ops_unregister(void)
+{
+	generic_ops_unregister();
+}
+EXPORT_SYMBOL_GPL(efivars_generic_ops_unregister);
+
 #ifdef CONFIG_EFI_CUSTOM_SSDT_OVERLAYS
 #define EFIVAR_SSDT_NAME_MAX	16UL
 static char efivar_ssdt[EFIVAR_SSDT_NAME_MAX] __initdata;
diff --git a/include/linux/efi.h b/include/linux/efi.h
index 9cc5bf32f6f2..1b2f50efb98c 100644
--- a/include/linux/efi.h
+++ b/include/linux/efi.h
@@ -1348,4 +1348,7 @@ bool efi_config_table_is_usable(const efi_guid_t *guid, unsigned long table)
 
 umode_t efi_attr_is_visible(struct kobject *kobj, struct attribute *attr, int n);
 
+void efivars_generic_ops_register(void);
+void efivars_generic_ops_unregister(void);
+
 #endif /* _LINUX_EFI_H */
-- 
cgit v1.2.3


From 1f71f37fbbd065b3326d9b7d8bb5ae688cd653d0 Mon Sep 17 00:00:00 2001
From: Masahisa Kojima <masahisa.kojima@linaro.org>
Date: Tue, 7 Nov 2023 14:40:53 +0900
Subject: efi: Add EFI_ACCESS_DENIED status code

This commit adds the EFI_ACCESS_DENIED status code.

Acked-by: Sumit Garg <sumit.garg@linaro.org>
Co-developed-by: Ilias Apalodimas <ilias.apalodimas@linaro.org>
Signed-off-by: Ilias Apalodimas <ilias.apalodimas@linaro.org>
Signed-off-by: Masahisa Kojima <masahisa.kojima@linaro.org>
Signed-off-by: Ard Biesheuvel <ardb@kernel.org>
---
 include/linux/efi.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/efi.h b/include/linux/efi.h
index 1b2f50efb98c..3668aa204c47 100644
--- a/include/linux/efi.h
+++ b/include/linux/efi.h
@@ -40,6 +40,7 @@ struct screen_info;
 #define EFI_WRITE_PROTECTED	( 8 | (1UL << (BITS_PER_LONG-1)))
 #define EFI_OUT_OF_RESOURCES	( 9 | (1UL << (BITS_PER_LONG-1)))
 #define EFI_NOT_FOUND		(14 | (1UL << (BITS_PER_LONG-1)))
+#define EFI_ACCESS_DENIED	(15 | (1UL << (BITS_PER_LONG-1)))
 #define EFI_TIMEOUT		(18 | (1UL << (BITS_PER_LONG-1)))
 #define EFI_ABORTED		(21 | (1UL << (BITS_PER_LONG-1)))
 #define EFI_SECURITY_VIOLATION	(26 | (1UL << (BITS_PER_LONG-1)))
-- 
cgit v1.2.3


From 94f7f6182c72ba642c1f20111681f9cc8621c95f Mon Sep 17 00:00:00 2001
From: Masahisa Kojima <masahisa.kojima@linaro.org>
Date: Tue, 7 Nov 2023 14:40:55 +0900
Subject: efivarfs: automatically update super block flag

efivar operation is updated when the tee_stmm_efi module is probed.
tee_stmm_efi module supports SetVariable runtime service, but user needs
to manually remount the efivarfs as RW to enable the write access if the
previous efivar operation does not support SetVariable and efivarfs is
mounted as read-only.

This commit notifies the update of efivar operation to efivarfs
subsystem, then drops SB_RDONLY flag if the efivar operation supports
SetVariable.

Signed-off-by: Masahisa Kojima <masahisa.kojima@linaro.org>
[ardb: use per-superblock instance of the notifier block]
Signed-off-by: Ard Biesheuvel <ardb@kernel.org>
---
 drivers/firmware/efi/efi.c  |  6 ++++++
 drivers/firmware/efi/vars.c |  8 ++++++++
 fs/efivarfs/internal.h      |  2 ++
 fs/efivarfs/super.c         | 27 +++++++++++++++++++++++++++
 include/linux/efi.h         |  8 ++++++++
 5 files changed, 51 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/firmware/efi/efi.c b/drivers/firmware/efi/efi.c
index 32a67c61c3b8..4fcda50acfa4 100644
--- a/drivers/firmware/efi/efi.c
+++ b/drivers/firmware/efi/efi.c
@@ -32,6 +32,7 @@
 #include <linux/ucs2_string.h>
 #include <linux/memblock.h>
 #include <linux/security.h>
+#include <linux/notifier.h>
 
 #include <asm/early_ioremap.h>
 
@@ -187,6 +188,9 @@ static const struct attribute_group efi_subsys_attr_group = {
 	.is_visible = efi_attr_is_visible,
 };
 
+struct blocking_notifier_head efivar_ops_nh;
+EXPORT_SYMBOL_GPL(efivar_ops_nh);
+
 static struct efivars generic_efivars;
 static struct efivar_operations generic_ops;
 
@@ -431,6 +435,8 @@ static int __init efisubsys_init(void)
 		platform_device_register_simple("efivars", 0, NULL, 0);
 	}
 
+	BLOCKING_INIT_NOTIFIER_HEAD(&efivar_ops_nh);
+
 	error = sysfs_create_group(efi_kobj, &efi_subsys_attr_group);
 	if (error) {
 		pr_err("efi: Sysfs attribute export failed with error %d.\n",
diff --git a/drivers/firmware/efi/vars.c b/drivers/firmware/efi/vars.c
index e9dc7116daf1..f654e6f6af87 100644
--- a/drivers/firmware/efi/vars.c
+++ b/drivers/firmware/efi/vars.c
@@ -63,6 +63,7 @@ int efivars_register(struct efivars *efivars,
 		     const struct efivar_operations *ops)
 {
 	int rv;
+	int event;
 
 	if (down_interruptible(&efivars_lock))
 		return -EINTR;
@@ -77,6 +78,13 @@ int efivars_register(struct efivars *efivars,
 
 	__efivars = efivars;
 
+	if (efivar_supports_writes())
+		event = EFIVAR_OPS_RDWR;
+	else
+		event = EFIVAR_OPS_RDONLY;
+
+	blocking_notifier_call_chain(&efivar_ops_nh, event, NULL);
+
 	pr_info("Registered efivars operations\n");
 	rv = 0;
 out:
diff --git a/fs/efivarfs/internal.h b/fs/efivarfs/internal.h
index 1dc0ccce3cc3..169252e6dc46 100644
--- a/fs/efivarfs/internal.h
+++ b/fs/efivarfs/internal.h
@@ -17,6 +17,8 @@ struct efivarfs_mount_opts {
 struct efivarfs_fs_info {
 	struct efivarfs_mount_opts mount_opts;
 	struct list_head efivarfs_list;
+	struct super_block *sb;
+	struct notifier_block nb;
 };
 
 struct efi_variable {
diff --git a/fs/efivarfs/super.c b/fs/efivarfs/super.c
index cee325b5bbdd..6038dd39367a 100644
--- a/fs/efivarfs/super.c
+++ b/fs/efivarfs/super.c
@@ -15,10 +15,30 @@
 #include <linux/slab.h>
 #include <linux/magic.h>
 #include <linux/statfs.h>
+#include <linux/notifier.h>
 #include <linux/printk.h>
 
 #include "internal.h"
 
+static int efivarfs_ops_notifier(struct notifier_block *nb, unsigned long event,
+				 void *data)
+{
+	struct efivarfs_fs_info *sfi = container_of(nb, struct efivarfs_fs_info, nb);
+
+	switch (event) {
+	case EFIVAR_OPS_RDONLY:
+		sfi->sb->s_flags |= SB_RDONLY;
+		break;
+	case EFIVAR_OPS_RDWR:
+		sfi->sb->s_flags &= ~SB_RDONLY;
+		break;
+	default:
+		return NOTIFY_DONE;
+	}
+
+	return NOTIFY_OK;
+}
+
 static void efivarfs_evict_inode(struct inode *inode)
 {
 	clear_inode(inode);
@@ -317,6 +337,12 @@ static int efivarfs_fill_super(struct super_block *sb, struct fs_context *fc)
 	if (!root)
 		return -ENOMEM;
 
+	sfi->sb = sb;
+	sfi->nb.notifier_call = efivarfs_ops_notifier;
+	err = blocking_notifier_chain_register(&efivar_ops_nh, &sfi->nb);
+	if (err)
+		return err;
+
 	err = efivar_init(efivarfs_callback, (void *)sb, true,
 			  &sfi->efivarfs_list);
 	if (err)
@@ -371,6 +397,7 @@ static void efivarfs_kill_sb(struct super_block *sb)
 {
 	struct efivarfs_fs_info *sfi = sb->s_fs_info;
 
+	blocking_notifier_chain_unregister(&efivar_ops_nh, &sfi->nb);
 	kill_litter_super(sb);
 
 	/* Remove all entries and destroy */
diff --git a/include/linux/efi.h b/include/linux/efi.h
index 3668aa204c47..c74f47711f0b 100644
--- a/include/linux/efi.h
+++ b/include/linux/efi.h
@@ -1349,6 +1349,14 @@ bool efi_config_table_is_usable(const efi_guid_t *guid, unsigned long table)
 
 umode_t efi_attr_is_visible(struct kobject *kobj, struct attribute *attr, int n);
 
+/*
+ * efivar ops event type
+ */
+#define EFIVAR_OPS_RDONLY 0
+#define EFIVAR_OPS_RDWR 1
+
+extern struct blocking_notifier_head efivar_ops_nh;
+
 void efivars_generic_ops_register(void);
 void efivars_generic_ops_unregister(void);
 
-- 
cgit v1.2.3


From 704af3a40747e395b67892127943e6ffd5e2b642 Mon Sep 17 00:00:00 2001
From: Armin Wolf <W_Armin@gmx.de>
Date: Sun, 10 Dec 2023 21:24:43 +0100
Subject: platform/x86: wmi: Remove chardev interface
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The design of the WMI chardev interface is broken:
- it assumes that WMI drivers are not instantiated twice
- it offers next to no abstractions, the WMI driver gets
a raw byte buffer
- it is only used by a single driver, something which is
unlikely to change

Since the only user (dell-smbios-wmi) has been migrated
to his own ioctl interface, remove it.

Reviewed-by: Ilpo Järvinen <ilpo.jarvinen@linux.intel.com>
Signed-off-by: Armin Wolf <W_Armin@gmx.de>
Link: https://lore.kernel.org/r/20231210202443.646427-6-W_Armin@gmx.de
Reviewed-by: Hans de Goede <hdegoede@redhat.com>
Signed-off-by: Hans de Goede <hdegoede@redhat.com>
---
 drivers/platform/x86/wmi.c | 180 ++-------------------------------------------
 include/linux/wmi.h        |   8 --
 2 files changed, 5 insertions(+), 183 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/platform/x86/wmi.c b/drivers/platform/x86/wmi.c
index 7df5b5ee7983..7303702290e5 100644
--- a/drivers/platform/x86/wmi.c
+++ b/drivers/platform/x86/wmi.c
@@ -23,17 +23,14 @@
 #include <linux/init.h>
 #include <linux/kernel.h>
 #include <linux/list.h>
-#include <linux/miscdevice.h>
 #include <linux/module.h>
 #include <linux/platform_device.h>
 #include <linux/slab.h>
 #include <linux/sysfs.h>
 #include <linux/types.h>
-#include <linux/uaccess.h>
 #include <linux/uuid.h>
 #include <linux/wmi.h>
 #include <linux/fs.h>
-#include <uapi/linux/wmi.h>
 
 MODULE_AUTHOR("Carlos Corbacho");
 MODULE_DESCRIPTION("ACPI-WMI Mapping Driver");
@@ -66,12 +63,9 @@ struct wmi_block {
 	struct wmi_device dev;
 	struct list_head list;
 	struct guid_block gblock;
-	struct miscdevice char_dev;
-	struct mutex char_mutex;
 	struct acpi_device *acpi_device;
 	wmi_notify_handler handler;
 	void *handler_data;
-	u64 req_buf_size;
 	unsigned long flags;
 };
 
@@ -256,26 +250,6 @@ static void wmi_device_put(struct wmi_device *wdev)
  * Exported WMI functions
  */
 
-/**
- * set_required_buffer_size - Sets the buffer size needed for performing IOCTL
- * @wdev: A wmi bus device from a driver
- * @length: Required buffer size
- *
- * Allocates memory needed for buffer, stores the buffer size in that memory.
- *
- * Return: 0 on success or a negative error code for failure.
- */
-int set_required_buffer_size(struct wmi_device *wdev, u64 length)
-{
-	struct wmi_block *wblock;
-
-	wblock = container_of(wdev, struct wmi_block, dev);
-	wblock->req_buf_size = length;
-
-	return 0;
-}
-EXPORT_SYMBOL_GPL(set_required_buffer_size);
-
 /**
  * wmi_instance_count - Get number of WMI object instances
  * @guid_string: 36 char string of the form fa50ff2b-f2e8-45de-83fa-65417f2f49ba
@@ -884,111 +858,12 @@ static int wmi_dev_match(struct device *dev, struct device_driver *driver)
 
 	return 0;
 }
-static int wmi_char_open(struct inode *inode, struct file *filp)
-{
-	/*
-	 * The miscdevice already stores a pointer to itself
-	 * inside filp->private_data
-	 */
-	struct wmi_block *wblock = container_of(filp->private_data, struct wmi_block, char_dev);
-
-	filp->private_data = wblock;
-
-	return nonseekable_open(inode, filp);
-}
-
-static ssize_t wmi_char_read(struct file *filp, char __user *buffer,
-			     size_t length, loff_t *offset)
-{
-	struct wmi_block *wblock = filp->private_data;
-
-	return simple_read_from_buffer(buffer, length, offset,
-				       &wblock->req_buf_size,
-				       sizeof(wblock->req_buf_size));
-}
-
-static long wmi_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
-{
-	struct wmi_ioctl_buffer __user *input =
-		(struct wmi_ioctl_buffer __user *) arg;
-	struct wmi_block *wblock = filp->private_data;
-	struct wmi_ioctl_buffer *buf;
-	struct wmi_driver *wdriver;
-	int ret;
-
-	if (_IOC_TYPE(cmd) != WMI_IOC)
-		return -ENOTTY;
-
-	/* make sure we're not calling a higher instance than exists*/
-	if (_IOC_NR(cmd) >= wblock->gblock.instance_count)
-		return -EINVAL;
-
-	mutex_lock(&wblock->char_mutex);
-	buf = wblock->handler_data;
-	if (get_user(buf->length, &input->length)) {
-		dev_dbg(&wblock->dev.dev, "Read length from user failed\n");
-		ret = -EFAULT;
-		goto out_ioctl;
-	}
-	/* if it's too small, abort */
-	if (buf->length < wblock->req_buf_size) {
-		dev_err(&wblock->dev.dev,
-			"Buffer %lld too small, need at least %lld\n",
-			buf->length, wblock->req_buf_size);
-		ret = -EINVAL;
-		goto out_ioctl;
-	}
-	/* if it's too big, warn, driver will only use what is needed */
-	if (buf->length > wblock->req_buf_size)
-		dev_warn(&wblock->dev.dev,
-			"Buffer %lld is bigger than required %lld\n",
-			buf->length, wblock->req_buf_size);
-
-	/* copy the structure from userspace */
-	if (copy_from_user(buf, input, wblock->req_buf_size)) {
-		dev_dbg(&wblock->dev.dev, "Copy %llu from user failed\n",
-			wblock->req_buf_size);
-		ret = -EFAULT;
-		goto out_ioctl;
-	}
-
-	/* let the driver do any filtering and do the call */
-	wdriver = drv_to_wdrv(wblock->dev.dev.driver);
-	if (!try_module_get(wdriver->driver.owner)) {
-		ret = -EBUSY;
-		goto out_ioctl;
-	}
-	ret = wdriver->filter_callback(&wblock->dev, cmd, buf);
-	module_put(wdriver->driver.owner);
-	if (ret)
-		goto out_ioctl;
-
-	/* return the result (only up to our internal buffer size) */
-	if (copy_to_user(input, buf, wblock->req_buf_size)) {
-		dev_dbg(&wblock->dev.dev, "Copy %llu to user failed\n",
-			wblock->req_buf_size);
-		ret = -EFAULT;
-	}
-
-out_ioctl:
-	mutex_unlock(&wblock->char_mutex);
-	return ret;
-}
-
-static const struct file_operations wmi_fops = {
-	.owner		= THIS_MODULE,
-	.read		= wmi_char_read,
-	.open		= wmi_char_open,
-	.unlocked_ioctl	= wmi_ioctl,
-	.compat_ioctl	= compat_ptr_ioctl,
-};
 
 static int wmi_dev_probe(struct device *dev)
 {
 	struct wmi_block *wblock = dev_to_wblock(dev);
 	struct wmi_driver *wdriver = drv_to_wdrv(dev->driver);
 	int ret = 0;
-	char *buf;
 
 	if (ACPI_FAILURE(wmi_method_enable(wblock, true)))
 		dev_warn(dev, "failed to enable device -- probing anyway\n");
@@ -996,55 +871,17 @@ static int wmi_dev_probe(struct device *dev)
 	if (wdriver->probe) {
 		ret = wdriver->probe(dev_to_wdev(dev),
 				find_guid_context(wblock, wdriver));
-		if (ret != 0)
-			goto probe_failure;
-	}
-
-	/* driver wants a character device made */
-	if (wdriver->filter_callback) {
-		/* check that required buffer size declared by driver or MOF */
-		if (!wblock->req_buf_size) {
-			dev_err(&wblock->dev.dev,
-				"Required buffer size not set\n");
-			ret = -EINVAL;
-			goto probe_failure;
-		}
+		if (!ret) {
+			if (ACPI_FAILURE(wmi_method_enable(wblock, false)))
+				dev_warn(dev, "Failed to disable device\n");
 
-		wblock->handler_data = kmalloc(wblock->req_buf_size,
-					       GFP_KERNEL);
-		if (!wblock->handler_data) {
-			ret = -ENOMEM;
-			goto probe_failure;
-		}
-
-		buf = kasprintf(GFP_KERNEL, "wmi/%s", wdriver->driver.name);
-		if (!buf) {
-			ret = -ENOMEM;
-			goto probe_string_failure;
-		}
-		wblock->char_dev.minor = MISC_DYNAMIC_MINOR;
-		wblock->char_dev.name = buf;
-		wblock->char_dev.fops = &wmi_fops;
-		wblock->char_dev.mode = 0444;
-		ret = misc_register(&wblock->char_dev);
-		if (ret) {
-			dev_warn(dev, "failed to register char dev: %d\n", ret);
-			ret = -ENOMEM;
-			goto probe_misc_failure;
+			return ret;
 		}
 	}
 
 	set_bit(WMI_PROBED, &wblock->flags);
-	return 0;
 
-probe_misc_failure:
-	kfree(buf);
-probe_string_failure:
-	kfree(wblock->handler_data);
-probe_failure:
-	if (ACPI_FAILURE(wmi_method_enable(wblock, false)))
-		dev_warn(dev, "failed to disable device\n");
-	return ret;
+	return 0;
 }
 
 static void wmi_dev_remove(struct device *dev)
@@ -1054,12 +891,6 @@ static void wmi_dev_remove(struct device *dev)
 
 	clear_bit(WMI_PROBED, &wblock->flags);
 
-	if (wdriver->filter_callback) {
-		misc_deregister(&wblock->char_dev);
-		kfree(wblock->char_dev.name);
-		kfree(wblock->handler_data);
-	}
-
 	if (wdriver->remove)
 		wdriver->remove(dev_to_wdev(dev));
 
@@ -1131,7 +962,6 @@ static int wmi_create_device(struct device *wmi_bus_dev,
 
 	if (wblock->gblock.flags & ACPI_WMI_METHOD) {
 		wblock->dev.dev.type = &wmi_type_method;
-		mutex_init(&wblock->char_mutex);
 		goto out_init;
 	}
 
diff --git a/include/linux/wmi.h b/include/linux/wmi.h
index 8a643c39fcce..50f7f1e4fd4f 100644
--- a/include/linux/wmi.h
+++ b/include/linux/wmi.h
@@ -11,7 +11,6 @@
 #include <linux/device.h>
 #include <linux/acpi.h>
 #include <linux/mod_devicetable.h>
-#include <uapi/linux/wmi.h>
 
 /**
  * struct wmi_device - WMI device structure
@@ -47,8 +46,6 @@ acpi_status wmidev_block_set(struct wmi_device *wdev, u8 instance, const struct
 
 u8 wmidev_instance_count(struct wmi_device *wdev);
 
-extern int set_required_buffer_size(struct wmi_device *wdev, u64 length);
-
 /**
  * struct wmi_driver - WMI driver structure
  * @driver: Driver model structure
@@ -57,11 +54,8 @@ extern int set_required_buffer_size(struct wmi_device *wdev, u64 length);
  * @probe: Callback for device binding
  * @remove: Callback for device unbinding
  * @notify: Callback for receiving WMI events
- * @filter_callback: Callback for filtering device IOCTLs
  *
  * This represents WMI drivers which handle WMI devices.
- * @filter_callback is only necessary for drivers which
- * want to set up a WMI IOCTL interface.
  */
 struct wmi_driver {
 	struct device_driver driver;
@@ -71,8 +65,6 @@ struct wmi_driver {
 	int (*probe)(struct wmi_device *wdev, const void *context);
 	void (*remove)(struct wmi_device *wdev);
 	void (*notify)(struct wmi_device *device, union acpi_object *data);
-	long (*filter_callback)(struct wmi_device *wdev, unsigned int cmd,
-				struct wmi_ioctl_buffer *arg);
 };
 
 extern int __must_check __wmi_driver_register(struct wmi_driver *driver,
-- 
cgit v1.2.3


From 58e82a62669da52e688f4a8b89922c1839bf1001 Mon Sep 17 00:00:00 2001
From: Ma Jun <Jun.Ma2@amd.com>
Date: Mon, 11 Dec 2023 18:06:23 +0800
Subject: platform/x86/amd: Add support for AMD ACPI based Wifi band RFI
 mitigation feature

Due to electrical and mechanical constraints in certain platform designs
there may be likely interference of relatively high-powered harmonics of
the (G-)DDR memory clocks with local radio module frequency bands used
by Wifi 6/6e/7.

To mitigate this, AMD has introduced a mechanism that devices can use to
notify active use of particular frequencies so that other devices can make
relative internal adjustments as necessary to avoid this resonance.

Co-developed-by: Evan Quan <quanliangl@hotmail.com>
Signed-off-by: Evan Quan <quanliangl@hotmail.com>
Signed-off-by: Ma Jun <Jun.Ma2@amd.com>
Reviewed-by: Mario Limonciello <mario.limonciello@amd.com>
Reviewed-by: Hans de Goede <hdegoede@redhat.com>
Signed-off-by: Hans de Goede <hdegoede@redhat.com>
---
 drivers/platform/x86/amd/Kconfig  |  14 ++
 drivers/platform/x86/amd/Makefile |   1 +
 drivers/platform/x86/amd/wbrf.c   | 317 ++++++++++++++++++++++++++++++++++++++
 include/linux/acpi_amd_wbrf.h     |  91 +++++++++++
 4 files changed, 423 insertions(+)
 create mode 100644 drivers/platform/x86/amd/wbrf.c
 create mode 100644 include/linux/acpi_amd_wbrf.h

(limited to 'include/linux')

diff --git a/drivers/platform/x86/amd/Kconfig b/drivers/platform/x86/amd/Kconfig
index 55f3a2fc6aec..54753213cc61 100644
--- a/drivers/platform/x86/amd/Kconfig
+++ b/drivers/platform/x86/amd/Kconfig
@@ -18,3 +18,17 @@ config AMD_HSMP
 
 	  If you choose to compile this driver as a module the module will be
 	  called amd_hsmp.
+
+config AMD_WBRF
+	bool "AMD Wifi RF Band mitigations (WBRF)"
+	depends on ACPI
+	help
+	  WBRF(Wifi Band RFI mitigation) mechanism allows Wifi drivers
+	  to notify the frequencies they are using so that other hardware
+	  can be reconfigured to avoid harmonic conflicts.
+
+	  AMD provides an ACPI based mechanism to support WBRF on platform with
+	  appropriate underlying support.
+
+	  This mechanism will only be activated on platforms that advertise a
+	  need for it.
diff --git a/drivers/platform/x86/amd/Makefile b/drivers/platform/x86/amd/Makefile
index f04932b7a7d1..dcec0a46f8af 100644
--- a/drivers/platform/x86/amd/Makefile
+++ b/drivers/platform/x86/amd/Makefile
@@ -8,3 +8,4 @@ obj-$(CONFIG_AMD_PMC)		+= pmc/
 amd_hsmp-y			:= hsmp.o
 obj-$(CONFIG_AMD_HSMP)		+= amd_hsmp.o
 obj-$(CONFIG_AMD_PMF)		+= pmf/
+obj-$(CONFIG_AMD_WBRF)		+= wbrf.o
diff --git a/drivers/platform/x86/amd/wbrf.c b/drivers/platform/x86/amd/wbrf.c
new file mode 100644
index 000000000000..dd197b3aebe0
--- /dev/null
+++ b/drivers/platform/x86/amd/wbrf.c
@@ -0,0 +1,317 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Wifi Frequency Band Manage Interface
+ * Copyright (C) 2023 Advanced Micro Devices
+ */
+
+#include <linux/acpi.h>
+#include <linux/acpi_amd_wbrf.h>
+
+/*
+ * Functions bit vector for WBRF method
+ *
+ * Bit 0: WBRF supported.
+ * Bit 1: Function 1 (Add / Remove frequency) is supported.
+ * Bit 2: Function 2 (Get frequency list) is supported.
+ */
+#define WBRF_ENABLED		0x0
+#define WBRF_RECORD			0x1
+#define WBRF_RETRIEVE		0x2
+
+#define WBRF_REVISION		0x1
+
+/*
+ * The data structure used for WBRF_RETRIEVE is not naturally aligned.
+ * And unfortunately the design has been settled down.
+ */
+struct amd_wbrf_ranges_out {
+	u32			num_of_ranges;
+	struct freq_band_range	band_list[MAX_NUM_OF_WBRF_RANGES];
+} __packed;
+
+static const guid_t wifi_acpi_dsm_guid =
+	GUID_INIT(0x7b7656cf, 0xdc3d, 0x4c1c,
+		  0x83, 0xe9, 0x66, 0xe7, 0x21, 0xde, 0x30, 0x70);
+
+/*
+ * Used to notify consumer (amdgpu driver currently) about
+ * the wifi frequency is change.
+ */
+static BLOCKING_NOTIFIER_HEAD(wbrf_chain_head);
+
+static int wbrf_record(struct acpi_device *adev, uint8_t action, struct wbrf_ranges_in_out *in)
+{
+	union acpi_object argv4;
+	union acpi_object *tmp;
+	union acpi_object *obj;
+	u32 num_of_ranges = 0;
+	u32 num_of_elements;
+	u32 arg_idx = 0;
+	int ret;
+	u32 i;
+
+	if (!in)
+		return -EINVAL;
+
+	for (i = 0; i < ARRAY_SIZE(in->band_list); i++) {
+		if (in->band_list[i].start && in->band_list[i].end)
+			num_of_ranges++;
+	}
+
+	/*
+	 * The num_of_ranges value in the "in" object supplied by
+	 * the caller is required to be equal to the number of
+	 * entries in the band_list array in there.
+	 */
+	if (num_of_ranges != in->num_of_ranges)
+		return -EINVAL;
+
+	/*
+	 * Every input frequency band comes with two end points(start/end)
+	 * and each is accounted as an element. Meanwhile the range count
+	 * and action type are accounted as an element each.
+	 * So, the total element count = 2 * num_of_ranges + 1 + 1.
+	 */
+	num_of_elements = 2 * num_of_ranges + 2;
+
+	tmp = kcalloc(num_of_elements, sizeof(*tmp), GFP_KERNEL);
+	if (!tmp)
+		return -ENOMEM;
+
+	argv4.package.type = ACPI_TYPE_PACKAGE;
+	argv4.package.count = num_of_elements;
+	argv4.package.elements = tmp;
+
+	/* save the number of ranges*/
+	tmp[0].integer.type = ACPI_TYPE_INTEGER;
+	tmp[0].integer.value = num_of_ranges;
+
+	/* save the action(WBRF_RECORD_ADD/REMOVE/RETRIEVE) */
+	tmp[1].integer.type = ACPI_TYPE_INTEGER;
+	tmp[1].integer.value = action;
+
+	arg_idx = 2;
+	for (i = 0; i < ARRAY_SIZE(in->band_list); i++) {
+		if (!in->band_list[i].start || !in->band_list[i].end)
+			continue;
+
+		tmp[arg_idx].integer.type = ACPI_TYPE_INTEGER;
+		tmp[arg_idx++].integer.value = in->band_list[i].start;
+		tmp[arg_idx].integer.type = ACPI_TYPE_INTEGER;
+		tmp[arg_idx++].integer.value = in->band_list[i].end;
+	}
+
+	obj = acpi_evaluate_dsm(adev->handle, &wifi_acpi_dsm_guid,
+				WBRF_REVISION, WBRF_RECORD, &argv4);
+
+	if (!obj)
+		return -EINVAL;
+
+	if (obj->type != ACPI_TYPE_INTEGER) {
+		ret = -EINVAL;
+		goto out;
+	}
+
+	ret = obj->integer.value;
+	if (ret)
+		ret = -EINVAL;
+
+out:
+	ACPI_FREE(obj);
+	kfree(tmp);
+
+	return ret;
+}
+
+/**
+ * acpi_amd_wbrf_add_remove - add or remove the frequency band the device is using
+ *
+ * @dev: device pointer
+ * @action: remove or add the frequency band into bios
+ * @in: input structure containing the frequency band the device is using
+ *
+ * Broadcast to other consumers the frequency band the device starts
+ * to use. Underneath the surface the information is cached into an
+ * internal buffer first. Then a notification is sent to all those
+ * registered consumers. So then they can retrieve that buffer to
+ * know the latest active frequency bands. Consumers that haven't
+ * yet been registered can retrieve the information from the cache
+ * when they register.
+ *
+ * Return:
+ * 0 for success add/remove wifi frequency band.
+ * Returns a negative error code for failure.
+ */
+int acpi_amd_wbrf_add_remove(struct device *dev, uint8_t action, struct wbrf_ranges_in_out *in)
+{
+	struct acpi_device *adev;
+	int ret;
+
+	adev = ACPI_COMPANION(dev);
+	if (!adev)
+		return -ENODEV;
+
+	ret = wbrf_record(adev, action, in);
+	if (ret)
+		return ret;
+
+	blocking_notifier_call_chain(&wbrf_chain_head, WBRF_CHANGED, NULL);
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(acpi_amd_wbrf_add_remove);
+
+/**
+ * acpi_amd_wbrf_supported_producer - determine if the WBRF can be enabled
+ *                                    for the device as a producer
+ *
+ * @dev: device pointer
+ *
+ * Check if the platform equipped with necessary implementations to
+ * support WBRF for the device as a producer.
+ *
+ * Return:
+ * true if WBRF is supported, otherwise returns false
+ */
+bool acpi_amd_wbrf_supported_producer(struct device *dev)
+{
+	struct acpi_device *adev;
+
+	adev = ACPI_COMPANION(dev);
+	if (!adev)
+		return false;
+
+	return acpi_check_dsm(adev->handle, &wifi_acpi_dsm_guid,
+			      WBRF_REVISION, BIT(WBRF_RECORD));
+}
+EXPORT_SYMBOL_GPL(acpi_amd_wbrf_supported_producer);
+
+/**
+ * acpi_amd_wbrf_supported_consumer - determine if the WBRF can be enabled
+ *                                    for the device as a consumer
+ *
+ * @dev: device pointer
+ *
+ * Determine if the platform equipped with necessary implementations to
+ * support WBRF for the device as a consumer.
+ *
+ * Return:
+ * true if WBRF is supported, otherwise returns false.
+ */
+bool acpi_amd_wbrf_supported_consumer(struct device *dev)
+{
+	struct acpi_device *adev;
+
+	adev = ACPI_COMPANION(dev);
+	if (!adev)
+		return false;
+
+	return acpi_check_dsm(adev->handle, &wifi_acpi_dsm_guid,
+			      WBRF_REVISION, BIT(WBRF_RETRIEVE));
+}
+EXPORT_SYMBOL_GPL(acpi_amd_wbrf_supported_consumer);
+
+/**
+ * amd_wbrf_retrieve_freq_band - retrieve current active frequency bands
+ *
+ * @dev: device pointer
+ * @out: output structure containing all the active frequency bands
+ *
+ * Retrieve the current active frequency bands which were broadcasted
+ * by other producers. The consumer who calls this API should take
+ * proper actions if any of the frequency band may cause RFI with its
+ * own frequency band used.
+ *
+ * Return:
+ * 0 for getting wifi freq band successfully.
+ * Returns a negative error code for failure.
+ */
+int amd_wbrf_retrieve_freq_band(struct device *dev, struct wbrf_ranges_in_out *out)
+{
+	struct amd_wbrf_ranges_out acpi_out = {0};
+	struct acpi_device *adev;
+	union acpi_object *obj;
+	union acpi_object param;
+	int ret = 0;
+
+	adev = ACPI_COMPANION(dev);
+	if (!adev)
+		return -ENODEV;
+
+	param.type = ACPI_TYPE_STRING;
+	param.string.length = 0;
+	param.string.pointer = NULL;
+
+	obj = acpi_evaluate_dsm(adev->handle, &wifi_acpi_dsm_guid,
+							WBRF_REVISION, WBRF_RETRIEVE, &param);
+	if (!obj)
+		return -EINVAL;
+
+	/*
+	 * The return buffer is with variable length and the format below:
+	 * number_of_entries(1 DWORD):       Number of entries
+	 * start_freq of 1st entry(1 QWORD): Start frequency of the 1st entry
+	 * end_freq of 1st entry(1 QWORD):   End frequency of the 1st entry
+	 * ...
+	 * ...
+	 * start_freq of the last entry(1 QWORD)
+	 * end_freq of the last entry(1 QWORD)
+	 *
+	 * Thus the buffer length is determined by the number of entries.
+	 * - For zero entry scenario, the buffer length will be 4 bytes.
+	 * - For one entry scenario, the buffer length will be 20 bytes.
+	 */
+	if (obj->buffer.length > sizeof(acpi_out) || obj->buffer.length < 4) {
+		dev_err(dev, "Wrong sized WBRT information");
+		ret = -EINVAL;
+		goto out;
+	}
+	memcpy(&acpi_out, obj->buffer.pointer, obj->buffer.length);
+
+	out->num_of_ranges = acpi_out.num_of_ranges;
+	memcpy(out->band_list, acpi_out.band_list, sizeof(acpi_out.band_list));
+
+out:
+	ACPI_FREE(obj);
+	return ret;
+}
+EXPORT_SYMBOL_GPL(amd_wbrf_retrieve_freq_band);
+
+/**
+ * amd_wbrf_register_notifier - register for notifications of frequency
+ *                                   band update
+ *
+ * @nb: driver notifier block
+ *
+ * The consumer should register itself via this API so that it can get
+ * notified on the frequency band updates from other producers.
+ *
+ * Return:
+ * 0 for registering a consumer driver successfully.
+ * Returns a negative error code for failure.
+ */
+int amd_wbrf_register_notifier(struct notifier_block *nb)
+{
+	return blocking_notifier_chain_register(&wbrf_chain_head, nb);
+}
+EXPORT_SYMBOL_GPL(amd_wbrf_register_notifier);
+
+/**
+ * amd_wbrf_unregister_notifier - unregister for notifications of
+ *                                     frequency band update
+ *
+ * @nb: driver notifier block
+ *
+ * The consumer should call this API when it is longer interested with
+ * the frequency band updates from other producers. Usually, this should
+ * be performed during driver cleanup.
+ *
+ * Return:
+ * 0 for unregistering a consumer driver.
+ * Returns a negative error code for failure.
+ */
+int amd_wbrf_unregister_notifier(struct notifier_block *nb)
+{
+	return blocking_notifier_chain_unregister(&wbrf_chain_head, nb);
+}
+EXPORT_SYMBOL_GPL(amd_wbrf_unregister_notifier);
diff --git a/include/linux/acpi_amd_wbrf.h b/include/linux/acpi_amd_wbrf.h
new file mode 100644
index 000000000000..898f31d536d4
--- /dev/null
+++ b/include/linux/acpi_amd_wbrf.h
@@ -0,0 +1,91 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Wifi Band Exclusion Interface (AMD ACPI Implementation)
+ * Copyright (C) 2023 Advanced Micro Devices
+ */
+
+#ifndef _ACPI_AMD_WBRF_H
+#define _ACPI_AMD_WBRF_H
+
+#include <linux/device.h>
+#include <linux/notifier.h>
+
+/* The maximum number of frequency band ranges */
+#define MAX_NUM_OF_WBRF_RANGES		11
+
+/* Record actions */
+#define WBRF_RECORD_ADD		0x0
+#define WBRF_RECORD_REMOVE	0x1
+
+/**
+ * struct freq_band_range - Wifi frequency band range definition
+ * @start: start frequency point (in Hz)
+ * @end: end frequency point (in Hz)
+ */
+struct freq_band_range {
+	u64		start;
+	u64		end;
+};
+
+/**
+ * struct wbrf_ranges_in_out - wbrf ranges info
+ * @num_of_ranges: total number of band ranges in this struct
+ * @band_list: array of Wifi band ranges
+ */
+struct wbrf_ranges_in_out {
+	u64			num_of_ranges;
+	struct freq_band_range	band_list[MAX_NUM_OF_WBRF_RANGES];
+};
+
+/**
+ * enum wbrf_notifier_actions - wbrf notifier actions index
+ * @WBRF_CHANGED: there was some frequency band updates. The consumers
+ *               should retrieve the latest active frequency bands.
+ */
+enum wbrf_notifier_actions {
+	WBRF_CHANGED,
+};
+
+#if IS_ENABLED(CONFIG_AMD_WBRF)
+bool acpi_amd_wbrf_supported_producer(struct device *dev);
+int acpi_amd_wbrf_add_remove(struct device *dev, uint8_t action, struct wbrf_ranges_in_out *in);
+bool acpi_amd_wbrf_supported_consumer(struct device *dev);
+int amd_wbrf_retrieve_freq_band(struct device *dev, struct wbrf_ranges_in_out *out);
+int amd_wbrf_register_notifier(struct notifier_block *nb);
+int amd_wbrf_unregister_notifier(struct notifier_block *nb);
+#else
+static inline
+bool acpi_amd_wbrf_supported_consumer(struct device *dev)
+{
+	return false;
+}
+
+static inline
+int acpi_amd_wbrf_add_remove(struct device *dev, uint8_t action, struct wbrf_ranges_in_out *in)
+{
+	return -ENODEV;
+}
+
+static inline
+bool acpi_amd_wbrf_supported_producer(struct device *dev)
+{
+	return false;
+}
+static inline
+int amd_wbrf_retrieve_freq_band(struct device *dev, struct wbrf_ranges_in_out *out)
+{
+	return -ENODEV;
+}
+static inline
+int amd_wbrf_register_notifier(struct notifier_block *nb)
+{
+	return -ENODEV;
+}
+static inline
+int amd_wbrf_unregister_notifier(struct notifier_block *nb)
+{
+	return -ENODEV;
+}
+#endif /* CONFIG_AMD_WBRF */
+
+#endif /* _ACPI_AMD_WBRF_H */
-- 
cgit v1.2.3


From 2ffdd4773d98b1f7488f8e37bd881bbecec24d85 Mon Sep 17 00:00:00 2001
From: Hsiao Chien Sung <shawn.sung@mediatek.com>
Date: Tue, 24 Oct 2023 21:00:33 +0800
Subject: soc: mediatek: Support MT8188 VDOSYS1 Padding in mtk-mmsys

- Add Padding components
- Add Mutex module definitions for Padding

Reviewed-by: AngeloGioacchino Del Regno <angelogioacchino.delregno@collabora.com>
Signed-off-by: Hsiao Chien Sung <shawn.sung@mediatek.com>
Signed-off-by: AngeloGioacchino Del Regno <angelogioacchino.delregno@collabora.com>
---
 drivers/soc/mediatek/mtk-mutex.c       | 16 ++++++++++++++++
 include/linux/soc/mediatek/mtk-mmsys.h |  8 ++++++++
 2 files changed, 24 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/soc/mediatek/mtk-mutex.c b/drivers/soc/mediatek/mtk-mutex.c
index e76722289175..73c256d3950b 100644
--- a/drivers/soc/mediatek/mtk-mutex.c
+++ b/drivers/soc/mediatek/mtk-mutex.c
@@ -141,6 +141,14 @@
 #define MT8188_MUTEX_MOD_DISP1_MDP_RDMA5	5
 #define MT8188_MUTEX_MOD_DISP1_MDP_RDMA6	6
 #define MT8188_MUTEX_MOD_DISP1_MDP_RDMA7	7
+#define MT8188_MUTEX_MOD_DISP1_PADDING0		8
+#define MT8188_MUTEX_MOD_DISP1_PADDING1		9
+#define MT8188_MUTEX_MOD_DISP1_PADDING2		10
+#define MT8188_MUTEX_MOD_DISP1_PADDING3		11
+#define MT8188_MUTEX_MOD_DISP1_PADDING4		12
+#define MT8188_MUTEX_MOD_DISP1_PADDING5		13
+#define MT8188_MUTEX_MOD_DISP1_PADDING6		14
+#define MT8188_MUTEX_MOD_DISP1_PADDING7		15
 #define MT8188_MUTEX_MOD_DISP1_VPP_MERGE0	20
 #define MT8188_MUTEX_MOD_DISP1_VPP_MERGE1	21
 #define MT8188_MUTEX_MOD_DISP1_VPP_MERGE2	22
@@ -473,6 +481,14 @@ static const unsigned int mt8188_mutex_mod[DDP_COMPONENT_ID_MAX] = {
 	[DDP_COMPONENT_MDP_RDMA5] = MT8188_MUTEX_MOD_DISP1_MDP_RDMA5,
 	[DDP_COMPONENT_MDP_RDMA6] = MT8188_MUTEX_MOD_DISP1_MDP_RDMA6,
 	[DDP_COMPONENT_MDP_RDMA7] = MT8188_MUTEX_MOD_DISP1_MDP_RDMA7,
+	[DDP_COMPONENT_PADDING0] = MT8188_MUTEX_MOD_DISP1_PADDING0,
+	[DDP_COMPONENT_PADDING1] = MT8188_MUTEX_MOD_DISP1_PADDING1,
+	[DDP_COMPONENT_PADDING2] = MT8188_MUTEX_MOD_DISP1_PADDING2,
+	[DDP_COMPONENT_PADDING3] = MT8188_MUTEX_MOD_DISP1_PADDING3,
+	[DDP_COMPONENT_PADDING4] = MT8188_MUTEX_MOD_DISP1_PADDING4,
+	[DDP_COMPONENT_PADDING5] = MT8188_MUTEX_MOD_DISP1_PADDING5,
+	[DDP_COMPONENT_PADDING6] = MT8188_MUTEX_MOD_DISP1_PADDING6,
+	[DDP_COMPONENT_PADDING7] = MT8188_MUTEX_MOD_DISP1_PADDING7,
 	[DDP_COMPONENT_MERGE1] = MT8188_MUTEX_MOD_DISP1_VPP_MERGE0,
 	[DDP_COMPONENT_MERGE2] = MT8188_MUTEX_MOD_DISP1_VPP_MERGE1,
 	[DDP_COMPONENT_MERGE3] = MT8188_MUTEX_MOD_DISP1_VPP_MERGE2,
diff --git a/include/linux/soc/mediatek/mtk-mmsys.h b/include/linux/soc/mediatek/mtk-mmsys.h
index 2475ef914746..4885b065b849 100644
--- a/include/linux/soc/mediatek/mtk-mmsys.h
+++ b/include/linux/soc/mediatek/mtk-mmsys.h
@@ -62,6 +62,14 @@ enum mtk_ddp_comp_id {
 	DDP_COMPONENT_OVL_2L1,
 	DDP_COMPONENT_OVL_2L2,
 	DDP_COMPONENT_OVL1,
+	DDP_COMPONENT_PADDING0,
+	DDP_COMPONENT_PADDING1,
+	DDP_COMPONENT_PADDING2,
+	DDP_COMPONENT_PADDING3,
+	DDP_COMPONENT_PADDING4,
+	DDP_COMPONENT_PADDING5,
+	DDP_COMPONENT_PADDING6,
+	DDP_COMPONENT_PADDING7,
 	DDP_COMPONENT_POSTMASK0,
 	DDP_COMPONENT_PWM0,
 	DDP_COMPONENT_PWM1,
-- 
cgit v1.2.3


From d1c371035c8204112d84266e6bde7537f25448f7 Mon Sep 17 00:00:00 2001
From: Chao Yu <chao@kernel.org>
Date: Sun, 10 Dec 2023 10:50:28 +0800
Subject: quota: convert dquot_claim_space_nodirty() to return void

dquot_claim_space_nodirty() always return zero, let's convert it
to return void, then, its caller can get rid of handling failure
case.

Signed-off-by: Chao Yu <chao@kernel.org>
Signed-off-by: Jan Kara <jack@suse.cz>
Message-Id: <20231210025028.3262900-1-chao@kernel.org>
---
 fs/quota/dquot.c         |  6 +++---
 include/linux/quotaops.h | 15 +++++----------
 2 files changed, 8 insertions(+), 13 deletions(-)

(limited to 'include/linux')

diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c
index 58b5de081b57..44ff2813ae51 100644
--- a/fs/quota/dquot.c
+++ b/fs/quota/dquot.c
@@ -1787,7 +1787,7 @@ EXPORT_SYMBOL(dquot_alloc_inode);
 /*
  * Convert in-memory reserved quotas to real consumed quotas
  */
-int dquot_claim_space_nodirty(struct inode *inode, qsize_t number)
+void dquot_claim_space_nodirty(struct inode *inode, qsize_t number)
 {
 	struct dquot **dquots;
 	int cnt, index;
@@ -1797,7 +1797,7 @@ int dquot_claim_space_nodirty(struct inode *inode, qsize_t number)
 		*inode_reserved_space(inode) -= number;
 		__inode_add_bytes(inode, number);
 		spin_unlock(&inode->i_lock);
-		return 0;
+		return;
 	}
 
 	dquots = i_dquot(inode);
@@ -1822,7 +1822,7 @@ int dquot_claim_space_nodirty(struct inode *inode, qsize_t number)
 	spin_unlock(&inode->i_lock);
 	mark_all_dquot_dirty(dquots);
 	srcu_read_unlock(&dquot_srcu, index);
-	return 0;
+	return;
 }
 EXPORT_SYMBOL(dquot_claim_space_nodirty);
 
diff --git a/include/linux/quotaops.h b/include/linux/quotaops.h
index 4fa4ef0a173a..06cc8888199e 100644
--- a/include/linux/quotaops.h
+++ b/include/linux/quotaops.h
@@ -74,7 +74,7 @@ void __dquot_free_space(struct inode *inode, qsize_t number, int flags);
 
 int dquot_alloc_inode(struct inode *inode);
 
-int dquot_claim_space_nodirty(struct inode *inode, qsize_t number);
+void dquot_claim_space_nodirty(struct inode *inode, qsize_t number);
 void dquot_free_inode(struct inode *inode);
 void dquot_reclaim_space_nodirty(struct inode *inode, qsize_t number);
 
@@ -257,10 +257,9 @@ static inline void __dquot_free_space(struct inode *inode, qsize_t number,
 		inode_sub_bytes(inode, number);
 }
 
-static inline int dquot_claim_space_nodirty(struct inode *inode, qsize_t number)
+static inline void dquot_claim_space_nodirty(struct inode *inode, qsize_t number)
 {
 	inode_add_bytes(inode, number);
-	return 0;
 }
 
 static inline int dquot_reclaim_space_nodirty(struct inode *inode,
@@ -358,14 +357,10 @@ static inline int dquot_reserve_block(struct inode *inode, qsize_t nr)
 				DQUOT_SPACE_WARN|DQUOT_SPACE_RESERVE);
 }
 
-static inline int dquot_claim_block(struct inode *inode, qsize_t nr)
+static inline void dquot_claim_block(struct inode *inode, qsize_t nr)
 {
-	int ret;
-
-	ret = dquot_claim_space_nodirty(inode, nr << inode->i_blkbits);
-	if (!ret)
-		mark_inode_dirty_sync(inode);
-	return ret;
+	dquot_claim_space_nodirty(inode, nr << inode->i_blkbits);
+	mark_inode_dirty_sync(inode);
 }
 
 static inline void dquot_reclaim_block(struct inode *inode, qsize_t nr)
-- 
cgit v1.2.3


From 2ebe81c814355d000fe49d9c4213983844dcb32b Mon Sep 17 00:00:00 2001
From: Aleksander Lobakin <aleksander.lobakin@intel.com>
Date: Wed, 6 Dec 2023 21:59:19 +0100
Subject: net, xdp: Allow metadata > 32

32 bytes may be not enough for some custom metadata. Relax the restriction,
allow metadata larger than 32 bytes and make __skb_metadata_differs() work
with bigger lengths.

Now size of metadata is only limited by the fact it is stored as u8 in
skb_shared_info, so maximum possible value is 255. Size still has to be
aligned to 4, so the actual upper limit becomes 252. Most driver
implementations will offer less, none can offer more.

Other important conditions, such as having enough space for xdp_frame
building, are already checked in bpf_xdp_adjust_meta().

Signed-off-by: Aleksander Lobakin <aleksander.lobakin@intel.com>
Signed-off-by: Larysa Zaremba <larysa.zaremba@intel.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Link: https://lore.kernel.org/bpf/eb87653c-8ff8-447d-a7a1-25961f60518a@kernel.org
Link: https://lore.kernel.org/bpf/20231206205919.404415-3-larysa.zaremba@intel.com
---
 include/linux/skbuff.h | 13 ++++++++-----
 include/net/xdp.h      |  7 ++++++-
 2 files changed, 14 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index b370eb8d70f7..df6ef42639d8 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -4247,10 +4247,13 @@ static inline bool __skb_metadata_differs(const struct sk_buff *skb_a,
 {
 	const void *a = skb_metadata_end(skb_a);
 	const void *b = skb_metadata_end(skb_b);
-	/* Using more efficient varaiant than plain call to memcmp(). */
-#if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) && BITS_PER_LONG == 64
 	u64 diffs = 0;
 
+	if (!IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) ||
+	    BITS_PER_LONG != 64)
+		goto slow;
+
+	/* Using more efficient variant than plain call to memcmp(). */
 	switch (meta_len) {
 #define __it(x, op) (x -= sizeof(u##op))
 #define __it_diff(a, b, op) (*(u##op *)__it(a, op)) ^ (*(u##op *)__it(b, op))
@@ -4270,11 +4273,11 @@ static inline bool __skb_metadata_differs(const struct sk_buff *skb_a,
 		fallthrough;
 	case  4: diffs |= __it_diff(a, b, 32);
 		break;
+	default:
+slow:
+		return memcmp(a - meta_len, b - meta_len, meta_len);
 	}
 	return diffs;
-#else
-	return memcmp(a - meta_len, b - meta_len, meta_len);
-#endif
 }
 
 static inline bool skb_metadata_differs(const struct sk_buff *skb_a,
diff --git a/include/net/xdp.h b/include/net/xdp.h
index 349c36fb5fd8..5d3673afc037 100644
--- a/include/net/xdp.h
+++ b/include/net/xdp.h
@@ -369,7 +369,12 @@ xdp_data_meta_unsupported(const struct xdp_buff *xdp)
 
 static inline bool xdp_metalen_invalid(unsigned long metalen)
 {
-	return (metalen & (sizeof(__u32) - 1)) || (metalen > 32);
+	unsigned long meta_max;
+
+	meta_max = type_max(typeof_member(struct skb_shared_info, meta_len));
+	BUILD_BUG_ON(!__builtin_constant_p(meta_max));
+
+	return !IS_ALIGNED(metalen, sizeof(u32)) || metalen > meta_max;
 }
 
 struct xdp_attachment_info {
-- 
cgit v1.2.3


From 718ab8226636a1a3a7d281f5d6a7ad7c925efe5a Mon Sep 17 00:00:00 2001
From: Johan Hovold <johan+linaro@kernel.org>
Date: Tue, 28 Nov 2023 09:15:07 +0100
Subject: PCI/ASPM: Add pci_enable_link_state_locked()

Add pci_enable_link_state_locked() for enabling link states that can be
used in contexts where a pci_bus_sem read lock is already held (e.g. from
pci_walk_bus()).

This helper will be used to fix a couple of potential deadlocks where
the current helper is called with the lock already held, hence the CC
stable tag.

Fixes: f492edb40b54 ("PCI: vmd: Add quirk to configure PCIe ASPM and LTR")
Link: https://lore.kernel.org/r/20231128081512.19387-2-johan+linaro@kernel.org
Signed-off-by: Johan Hovold <johan+linaro@kernel.org>
[bhelgaas: include helper name in subject, commit log]
Signed-off-by: Bjorn Helgaas <bhelgaas@google.com>
Reviewed-by: Manivannan Sadhasivam <manivannan.sadhasivam@linaro.org>
Cc: <stable@vger.kernel.org>	# 6.3
Cc: Michael Bottini <michael.a.bottini@linux.intel.com>
Cc: David E. Box <david.e.box@linux.intel.com>
---
 drivers/pci/pcie/aspm.c | 53 +++++++++++++++++++++++++++++++++++++------------
 include/linux/pci.h     |  3 +++
 2 files changed, 43 insertions(+), 13 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/pci/pcie/aspm.c b/drivers/pci/pcie/aspm.c
index 50b04ae5c394..5eb462772354 100644
--- a/drivers/pci/pcie/aspm.c
+++ b/drivers/pci/pcie/aspm.c
@@ -1109,17 +1109,7 @@ int pci_disable_link_state(struct pci_dev *pdev, int state)
 }
 EXPORT_SYMBOL(pci_disable_link_state);
 
-/**
- * pci_enable_link_state - Clear and set the default device link state so that
- * the link may be allowed to enter the specified states. Note that if the
- * BIOS didn't grant ASPM control to the OS, this does nothing because we can't
- * touch the LNKCTL register. Also note that this does not enable states
- * disabled by pci_disable_link_state(). Return 0 or a negative errno.
- *
- * @pdev: PCI device
- * @state: Mask of ASPM link states to enable
- */
-int pci_enable_link_state(struct pci_dev *pdev, int state)
+static int __pci_enable_link_state(struct pci_dev *pdev, int state, bool locked)
 {
 	struct pcie_link_state *link = pcie_aspm_get_link(pdev);
 
@@ -1136,7 +1126,8 @@ int pci_enable_link_state(struct pci_dev *pdev, int state)
 		return -EPERM;
 	}
 
-	down_read(&pci_bus_sem);
+	if (!locked)
+		down_read(&pci_bus_sem);
 	mutex_lock(&aspm_lock);
 	link->aspm_default = 0;
 	if (state & PCIE_LINK_STATE_L0S)
@@ -1157,12 +1148,48 @@ int pci_enable_link_state(struct pci_dev *pdev, int state)
 	link->clkpm_default = (state & PCIE_LINK_STATE_CLKPM) ? 1 : 0;
 	pcie_set_clkpm(link, policy_to_clkpm_state(link));
 	mutex_unlock(&aspm_lock);
-	up_read(&pci_bus_sem);
+	if (!locked)
+		up_read(&pci_bus_sem);
 
 	return 0;
 }
+
+/**
+ * pci_enable_link_state - Clear and set the default device link state so that
+ * the link may be allowed to enter the specified states. Note that if the
+ * BIOS didn't grant ASPM control to the OS, this does nothing because we can't
+ * touch the LNKCTL register. Also note that this does not enable states
+ * disabled by pci_disable_link_state(). Return 0 or a negative errno.
+ *
+ * @pdev: PCI device
+ * @state: Mask of ASPM link states to enable
+ */
+int pci_enable_link_state(struct pci_dev *pdev, int state)
+{
+	return __pci_enable_link_state(pdev, state, false);
+}
 EXPORT_SYMBOL(pci_enable_link_state);
 
+/**
+ * pci_enable_link_state_locked - Clear and set the default device link state
+ * so that the link may be allowed to enter the specified states. Note that if
+ * the BIOS didn't grant ASPM control to the OS, this does nothing because we
+ * can't touch the LNKCTL register. Also note that this does not enable states
+ * disabled by pci_disable_link_state(). Return 0 or a negative errno.
+ *
+ * @pdev: PCI device
+ * @state: Mask of ASPM link states to enable
+ *
+ * Context: Caller holds pci_bus_sem read lock.
+ */
+int pci_enable_link_state_locked(struct pci_dev *pdev, int state)
+{
+	lockdep_assert_held_read(&pci_bus_sem);
+
+	return __pci_enable_link_state(pdev, state, true);
+}
+EXPORT_SYMBOL(pci_enable_link_state_locked);
+
 static int pcie_aspm_set_policy(const char *val,
 				const struct kernel_param *kp)
 {
diff --git a/include/linux/pci.h b/include/linux/pci.h
index 60ca768bc867..dea043bc1e38 100644
--- a/include/linux/pci.h
+++ b/include/linux/pci.h
@@ -1829,6 +1829,7 @@ extern bool pcie_ports_native;
 int pci_disable_link_state(struct pci_dev *pdev, int state);
 int pci_disable_link_state_locked(struct pci_dev *pdev, int state);
 int pci_enable_link_state(struct pci_dev *pdev, int state);
+int pci_enable_link_state_locked(struct pci_dev *pdev, int state);
 void pcie_no_aspm(void);
 bool pcie_aspm_support_enabled(void);
 bool pcie_aspm_enabled(struct pci_dev *pdev);
@@ -1839,6 +1840,8 @@ static inline int pci_disable_link_state_locked(struct pci_dev *pdev, int state)
 { return 0; }
 static inline int pci_enable_link_state(struct pci_dev *pdev, int state)
 { return 0; }
+static inline int pci_enable_link_state_locked(struct pci_dev *pdev, int state)
+{ return 0; }
 static inline void pcie_no_aspm(void) { }
 static inline bool pcie_aspm_support_enabled(void) { return false; }
 static inline bool pcie_aspm_enabled(struct pci_dev *pdev) { return false; }
-- 
cgit v1.2.3


From 46eae99ef73302f9fb3dddcd67c374b3dffe8fd6 Mon Sep 17 00:00:00 2001
From: Miklos Szeredi <mszeredi@redhat.com>
Date: Wed, 25 Oct 2023 16:02:02 +0200
Subject: add statmount(2) syscall

Add a way to query attributes of a single mount instead of having to parse
the complete /proc/$PID/mountinfo, which might be huge.

Lookup the mount the new 64bit mount ID.  If a mount needs to be queried
based on path, then statx(2) can be used to first query the mount ID
belonging to the path.

Design is based on a suggestion by Linus:

  "So I'd suggest something that is very much like "statfsat()", which gets
   a buffer and a length, and returns an extended "struct statfs" *AND*
   just a string description at the end."

The interface closely mimics that of statx.

Handle ASCII attributes by appending after the end of the structure (as per
above suggestion).  Pointers to strings are stored in u64 members to make
the structure the same regardless of pointer size.  Strings are nul
terminated.

Link: https://lore.kernel.org/all/CAHk-=wh5YifP7hzKSbwJj94+DZ2czjrZsczy6GBimiogZws=rg@mail.gmail.com/
Signed-off-by: Miklos Szeredi <mszeredi@redhat.com>
Link: https://lore.kernel.org/r/20231025140205.3586473-5-mszeredi@redhat.com
Reviewed-by: Ian Kent <raven@themaw.net>
[Christian Brauner <brauner@kernel.org>: various minor changes]
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 fs/namespace.c             | 281 +++++++++++++++++++++++++++++++++++++++++++++
 include/linux/syscalls.h   |   5 +
 include/uapi/linux/mount.h |  53 +++++++++
 3 files changed, 339 insertions(+)

(limited to 'include/linux')

diff --git a/fs/namespace.c b/fs/namespace.c
index d3665d025acb..ae35d8b6aca8 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -4683,6 +4683,287 @@ int show_path(struct seq_file *m, struct dentry *root)
 	return 0;
 }
 
+static struct vfsmount *lookup_mnt_in_ns(u64 id, struct mnt_namespace *ns)
+{
+	struct mount *mnt = mnt_find_id_at(ns, id);
+
+	if (!mnt || mnt->mnt_id_unique != id)
+		return NULL;
+
+	return &mnt->mnt;
+}
+
+struct kstatmount {
+	struct statmount __user *const buf;
+	size_t const bufsize;
+	struct vfsmount *const mnt;
+	u64 const mask;
+	struct seq_file seq;
+	struct path root;
+	struct statmount sm;
+	size_t pos;
+	int err;
+};
+
+typedef int (*statmount_func_t)(struct kstatmount *);
+
+static int statmount_string_seq(struct kstatmount *s, statmount_func_t func)
+{
+	size_t rem = s->bufsize - s->pos - sizeof(s->sm);
+	struct seq_file *seq = &s->seq;
+	int ret;
+
+	seq->count = 0;
+	seq->size = min(seq->size, rem);
+	seq->buf = kvmalloc(seq->size, GFP_KERNEL_ACCOUNT);
+	if (!seq->buf)
+		return -ENOMEM;
+
+	ret = func(s);
+	if (ret)
+		return ret;
+
+	if (seq_has_overflowed(seq)) {
+		if (seq->size == rem)
+			return -EOVERFLOW;
+		seq->size *= 2;
+		if (seq->size > MAX_RW_COUNT)
+			return -ENOMEM;
+		kvfree(seq->buf);
+		return 0;
+	}
+
+	/* Done */
+	return 1;
+}
+
+static void statmount_string(struct kstatmount *s, u64 mask, statmount_func_t func,
+		       u32 *str)
+{
+	int ret = s->pos + sizeof(s->sm) >= s->bufsize ? -EOVERFLOW : 0;
+	struct statmount *sm = &s->sm;
+	struct seq_file *seq = &s->seq;
+
+	if (s->err || !(s->mask & mask))
+		return;
+
+	seq->size = PAGE_SIZE;
+	while (!ret)
+		ret = statmount_string_seq(s, func);
+
+	if (ret < 0) {
+		s->err = ret;
+	} else {
+		seq->buf[seq->count++] = '\0';
+		if (copy_to_user(s->buf->str + s->pos, seq->buf, seq->count)) {
+			s->err = -EFAULT;
+		} else {
+			*str = s->pos;
+			s->pos += seq->count;
+		}
+	}
+	kvfree(seq->buf);
+	sm->mask |= mask;
+}
+
+static void statmount_numeric(struct kstatmount *s, u64 mask, statmount_func_t func)
+{
+	if (s->err || !(s->mask & mask))
+		return;
+
+	s->err = func(s);
+	s->sm.mask |= mask;
+}
+
+static u64 mnt_to_attr_flags(struct vfsmount *mnt)
+{
+	unsigned int mnt_flags = READ_ONCE(mnt->mnt_flags);
+	u64 attr_flags = 0;
+
+	if (mnt_flags & MNT_READONLY)
+		attr_flags |= MOUNT_ATTR_RDONLY;
+	if (mnt_flags & MNT_NOSUID)
+		attr_flags |= MOUNT_ATTR_NOSUID;
+	if (mnt_flags & MNT_NODEV)
+		attr_flags |= MOUNT_ATTR_NODEV;
+	if (mnt_flags & MNT_NOEXEC)
+		attr_flags |= MOUNT_ATTR_NOEXEC;
+	if (mnt_flags & MNT_NODIRATIME)
+		attr_flags |= MOUNT_ATTR_NODIRATIME;
+	if (mnt_flags & MNT_NOSYMFOLLOW)
+		attr_flags |= MOUNT_ATTR_NOSYMFOLLOW;
+
+	if (mnt_flags & MNT_NOATIME)
+		attr_flags |= MOUNT_ATTR_NOATIME;
+	else if (mnt_flags & MNT_RELATIME)
+		attr_flags |= MOUNT_ATTR_RELATIME;
+	else
+		attr_flags |= MOUNT_ATTR_STRICTATIME;
+
+	if (is_idmapped_mnt(mnt))
+		attr_flags |= MOUNT_ATTR_IDMAP;
+
+	return attr_flags;
+}
+
+static u64 mnt_to_propagation_flags(struct mount *m)
+{
+	u64 propagation = 0;
+
+	if (IS_MNT_SHARED(m))
+		propagation |= MS_SHARED;
+	if (IS_MNT_SLAVE(m))
+		propagation |= MS_SLAVE;
+	if (IS_MNT_UNBINDABLE(m))
+		propagation |= MS_UNBINDABLE;
+	if (!propagation)
+		propagation |= MS_PRIVATE;
+
+	return propagation;
+}
+
+static int statmount_sb_basic(struct kstatmount *s)
+{
+	struct super_block *sb = s->mnt->mnt_sb;
+
+	s->sm.sb_dev_major = MAJOR(sb->s_dev);
+	s->sm.sb_dev_minor = MINOR(sb->s_dev);
+	s->sm.sb_magic = sb->s_magic;
+	s->sm.sb_flags = sb->s_flags & (SB_RDONLY|SB_SYNCHRONOUS|SB_DIRSYNC|SB_LAZYTIME);
+
+	return 0;
+}
+
+static int statmount_mnt_basic(struct kstatmount *s)
+{
+	struct mount *m = real_mount(s->mnt);
+
+	s->sm.mnt_id = m->mnt_id_unique;
+	s->sm.mnt_parent_id = m->mnt_parent->mnt_id_unique;
+	s->sm.mnt_id_old = m->mnt_id;
+	s->sm.mnt_parent_id_old = m->mnt_parent->mnt_id;
+	s->sm.mnt_attr = mnt_to_attr_flags(&m->mnt);
+	s->sm.mnt_propagation = mnt_to_propagation_flags(m);
+	s->sm.mnt_peer_group = IS_MNT_SHARED(m) ? m->mnt_group_id : 0;
+	s->sm.mnt_master = IS_MNT_SLAVE(m) ? m->mnt_master->mnt_group_id : 0;
+
+	return 0;
+}
+
+static int statmount_propagate_from(struct kstatmount *s)
+{
+	struct mount *m = real_mount(s->mnt);
+
+	if (!IS_MNT_SLAVE(m))
+		return 0;
+
+	s->sm.propagate_from = get_dominating_id(m, &current->fs->root);
+
+	return 0;
+}
+
+static int statmount_mnt_root(struct kstatmount *s)
+{
+	struct seq_file *seq = &s->seq;
+	int err = show_path(seq, s->mnt->mnt_root);
+
+	if (!err && !seq_has_overflowed(seq)) {
+		seq->buf[seq->count] = '\0';
+		seq->count = string_unescape_inplace(seq->buf, UNESCAPE_OCTAL);
+	}
+	return err;
+}
+
+static int statmount_mnt_point(struct kstatmount *s)
+{
+	struct vfsmount *mnt = s->mnt;
+	struct path mnt_path = { .dentry = mnt->mnt_root, .mnt = mnt };
+	int err = seq_path_root(&s->seq, &mnt_path, &s->root, "");
+
+	return err == SEQ_SKIP ? 0 : err;
+}
+
+static int statmount_fs_type(struct kstatmount *s)
+{
+	struct seq_file *seq = &s->seq;
+	struct super_block *sb = s->mnt->mnt_sb;
+
+	seq_puts(seq, sb->s_type->name);
+	return 0;
+}
+
+static int do_statmount(struct kstatmount *s)
+{
+	struct statmount *sm = &s->sm;
+	struct mount *m = real_mount(s->mnt);
+	size_t copysize = min_t(size_t, s->bufsize, sizeof(*sm));
+	int err;
+
+	/*
+	 * Don't trigger audit denials. We just want to determine what
+	 * mounts to show users.
+	 */
+	if (!is_path_reachable(m, m->mnt.mnt_root, &s->root) &&
+	    !ns_capable_noaudit(&init_user_ns, CAP_SYS_ADMIN))
+		return -EPERM;
+
+	err = security_sb_statfs(s->mnt->mnt_root);
+	if (err)
+		return err;
+
+	statmount_numeric(s, STATMOUNT_SB_BASIC, statmount_sb_basic);
+	statmount_numeric(s, STATMOUNT_MNT_BASIC, statmount_mnt_basic);
+	statmount_numeric(s, STATMOUNT_PROPAGATE_FROM, statmount_propagate_from);
+	statmount_string(s, STATMOUNT_FS_TYPE, statmount_fs_type, &sm->fs_type);
+	statmount_string(s, STATMOUNT_MNT_ROOT, statmount_mnt_root, &sm->mnt_root);
+	statmount_string(s, STATMOUNT_MNT_POINT, statmount_mnt_point, &sm->mnt_point);
+
+	if (s->err)
+		return s->err;
+
+	/* Return the number of bytes copied to the buffer */
+	sm->size = copysize + s->pos;
+
+	if (copy_to_user(s->buf, sm, copysize))
+		return -EFAULT;
+
+	return 0;
+}
+
+SYSCALL_DEFINE4(statmount, const struct mnt_id_req __user *, req,
+		struct statmount __user *, buf, size_t, bufsize,
+		unsigned int, flags)
+{
+	struct vfsmount *mnt;
+	struct mnt_id_req kreq;
+	int ret;
+
+	if (flags)
+		return -EINVAL;
+
+	if (copy_from_user(&kreq, req, sizeof(kreq)))
+		return -EFAULT;
+
+	down_read(&namespace_sem);
+	mnt = lookup_mnt_in_ns(kreq.mnt_id, current->nsproxy->mnt_ns);
+	ret = -ENOENT;
+	if (mnt) {
+		struct kstatmount s = {
+			.mask = kreq.request_mask,
+			.buf = buf,
+			.bufsize = bufsize,
+			.mnt = mnt,
+		};
+
+		get_fs_root(current->fs, &s.root);
+		ret = do_statmount(&s);
+		path_put(&s.root);
+	}
+	up_read(&namespace_sem);
+
+	return ret;
+}
+
 static void __init init_mount_tree(void)
 {
 	struct vfsmount *mnt;
diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index fd9d12de7e92..530ca9adf5f1 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -74,6 +74,8 @@ struct landlock_ruleset_attr;
 enum landlock_rule_type;
 struct cachestat_range;
 struct cachestat;
+struct statmount;
+struct mnt_id_req;
 
 #include <linux/types.h>
 #include <linux/aio_abi.h>
@@ -407,6 +409,9 @@ asmlinkage long sys_statfs64(const char __user *path, size_t sz,
 asmlinkage long sys_fstatfs(unsigned int fd, struct statfs __user *buf);
 asmlinkage long sys_fstatfs64(unsigned int fd, size_t sz,
 				struct statfs64 __user *buf);
+asmlinkage long sys_statmount(const struct mnt_id_req __user *req,
+			      struct statmount __user *buf, size_t bufsize,
+			      unsigned int flags);
 asmlinkage long sys_truncate(const char __user *path, long length);
 asmlinkage long sys_ftruncate(unsigned int fd, unsigned long length);
 #if BITS_PER_LONG == 32
diff --git a/include/uapi/linux/mount.h b/include/uapi/linux/mount.h
index bb242fdcfe6b..afdf4f2f6672 100644
--- a/include/uapi/linux/mount.h
+++ b/include/uapi/linux/mount.h
@@ -138,4 +138,57 @@ struct mount_attr {
 /* List of all mount_attr versions. */
 #define MOUNT_ATTR_SIZE_VER0	32 /* sizeof first published struct */
 
+
+/*
+ * Structure for getting mount/superblock/filesystem info with statmount(2).
+ *
+ * The interface is similar to statx(2): individual fields or groups can be
+ * selected with the @mask argument of statmount().  Kernel will set the @mask
+ * field according to the supported fields.
+ *
+ * If string fields are selected, then the caller needs to pass a buffer that
+ * has space after the fixed part of the structure.  Nul terminated strings are
+ * copied there and offsets relative to @str are stored in the relevant fields.
+ * If the buffer is too small, then EOVERFLOW is returned.  The actually used
+ * size is returned in @size.
+ */
+struct statmount {
+	__u32 size;		/* Total size, including strings */
+	__u32 __spare1;
+	__u64 mask;		/* What results were written */
+	__u32 sb_dev_major;	/* Device ID */
+	__u32 sb_dev_minor;
+	__u64 sb_magic;		/* ..._SUPER_MAGIC */
+	__u32 sb_flags;		/* SB_{RDONLY,SYNCHRONOUS,DIRSYNC,LAZYTIME} */
+	__u32 fs_type;		/* [str] Filesystem type */
+	__u64 mnt_id;		/* Unique ID of mount */
+	__u64 mnt_parent_id;	/* Unique ID of parent (for root == mnt_id) */
+	__u32 mnt_id_old;	/* Reused IDs used in proc/.../mountinfo */
+	__u32 mnt_parent_id_old;
+	__u64 mnt_attr;		/* MOUNT_ATTR_... */
+	__u64 mnt_propagation;	/* MS_{SHARED,SLAVE,PRIVATE,UNBINDABLE} */
+	__u64 mnt_peer_group;	/* ID of shared peer group */
+	__u64 mnt_master;	/* Mount receives propagation from this ID */
+	__u64 propagate_from;	/* Propagation from in current namespace */
+	__u32 mnt_root;		/* [str] Root of mount relative to root of fs */
+	__u32 mnt_point;	/* [str] Mountpoint relative to current root */
+	__u64 __spare2[50];
+	char str[];		/* Variable size part containing strings */
+};
+
+struct mnt_id_req {
+	__u64 mnt_id;
+	__u64 request_mask;
+};
+
+/*
+ * @mask bits for statmount(2)
+ */
+#define STATMOUNT_SB_BASIC		0x00000001U     /* Want/got sb_... */
+#define STATMOUNT_MNT_BASIC		0x00000002U	/* Want/got mnt_... */
+#define STATMOUNT_PROPAGATE_FROM	0x00000004U	/* Want/got propagate_from */
+#define STATMOUNT_MNT_ROOT		0x00000008U	/* Want/got mnt_root  */
+#define STATMOUNT_MNT_POINT		0x00000010U	/* Want/got mnt_point */
+#define STATMOUNT_FS_TYPE		0x00000020U	/* Want/got fs_type */
+
 #endif /* _UAPI_LINUX_MOUNT_H */
-- 
cgit v1.2.3


From 5bc2ea60897e0f899fb93930dd867dae7c8eb11f Mon Sep 17 00:00:00 2001
From: Javier Carrasco <javier.carrasco.cruz@gmail.com>
Date: Mon, 11 Dec 2023 20:27:47 +0800
Subject: iio: core: introduce trough info element for minimum values

The IIO_CHAN_INFO_PEAK info element is used for maximum values and
currently there is no equivalent for minimum values. Instead of
overloading the existing peak info element, a new info element can
be added.

In principle there is no need to add a _TROUGH_SCALE element as the
scale will be the same as the one required for INFO_PEAK, which in
turn is sometimes omitted if a single scale for peaks and raw values
is required.

Add an IIO_CHAN_INFO_TROUGH info element for minimum values.

Signed-off-by: Javier Carrasco <javier.carrasco.cruz@gmail.com>
Link: https://lore.kernel.org/r/20231211122747.9723-1-579lpy@gmail.com
Signed-off-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
---
 drivers/iio/industrialio-core.c | 1 +
 include/linux/iio/types.h       | 1 +
 2 files changed, 2 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/iio/industrialio-core.c b/drivers/iio/industrialio-core.c
index f6a123d397db..9a85752124dd 100644
--- a/drivers/iio/industrialio-core.c
+++ b/drivers/iio/industrialio-core.c
@@ -184,6 +184,7 @@ static const char * const iio_chan_info_postfix[] = {
 	[IIO_CHAN_INFO_THERMOCOUPLE_TYPE] = "thermocouple_type",
 	[IIO_CHAN_INFO_CALIBAMBIENT] = "calibambient",
 	[IIO_CHAN_INFO_ZEROPOINT] = "zeropoint",
+	[IIO_CHAN_INFO_TROUGH] = "trough_raw",
 };
 /**
  * iio_device_id() - query the unique ID for the device
diff --git a/include/linux/iio/types.h b/include/linux/iio/types.h
index 117bde7d6ad7..d89982c98368 100644
--- a/include/linux/iio/types.h
+++ b/include/linux/iio/types.h
@@ -68,6 +68,7 @@ enum iio_chan_info_enum {
 	IIO_CHAN_INFO_THERMOCOUPLE_TYPE,
 	IIO_CHAN_INFO_CALIBAMBIENT,
 	IIO_CHAN_INFO_ZEROPOINT,
+	IIO_CHAN_INFO_TROUGH,
 };
 
 #endif /* _IIO_TYPES_H_ */
-- 
cgit v1.2.3


From 4649620d9404d3aceb25891c24bab77143e3f21c Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rafael.j.wysocki@intel.com>
Date: Fri, 8 Dec 2023 20:13:44 +0100
Subject: thermal: core: Make thermal_zone_device_unregister() return after
 freeing the zone

Make thermal_zone_device_unregister() wait until all of the references
to the given thermal zone object have been dropped and free it before
returning.

This guarantees that when thermal_zone_device_unregister() returns,
there is no leftover activity regarding the thermal zone in question
which is required by some of its callers (for instance, modular driver
code that wants to know when it is safe to let the module go away).

Subsequently, this will allow some confusing device_is_registered()
checks to be dropped from the thermal sysfs and core code.

Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Reviewed-and-tested-by: Lukasz Luba <lukasz.luba@arm.com>
Acked-by: Daniel Lezcano <daniel.lezcano@linaro.org>
---
 drivers/thermal/thermal_core.c | 6 +++++-
 include/linux/thermal.h        | 2 ++
 2 files changed, 7 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/drivers/thermal/thermal_core.c b/drivers/thermal/thermal_core.c
index 625ba07cbe2f..70a294d12187 100644
--- a/drivers/thermal/thermal_core.c
+++ b/drivers/thermal/thermal_core.c
@@ -822,7 +822,7 @@ static void thermal_release(struct device *dev)
 		tz = to_thermal_zone(dev);
 		thermal_zone_destroy_device_groups(tz);
 		mutex_destroy(&tz->lock);
-		kfree(tz);
+		complete(&tz->removal);
 	} else if (!strncmp(dev_name(dev), "cooling_device",
 			    sizeof("cooling_device") - 1)) {
 		cdev = to_cooling_device(dev);
@@ -1315,6 +1315,7 @@ thermal_zone_device_register_with_trips(const char *type, struct thermal_trip *t
 	INIT_LIST_HEAD(&tz->thermal_instances);
 	ida_init(&tz->ida);
 	mutex_init(&tz->lock);
+	init_completion(&tz->removal);
 	id = ida_alloc(&thermal_tz_ida, GFP_KERNEL);
 	if (id < 0) {
 		result = id;
@@ -1494,6 +1495,9 @@ void thermal_zone_device_unregister(struct thermal_zone_device *tz)
 	put_device(&tz->device);
 
 	thermal_notify_tz_delete(tz_id);
+
+	wait_for_completion(&tz->removal);
+	kfree(tz);
 }
 EXPORT_SYMBOL_GPL(thermal_zone_device_unregister);
 
diff --git a/include/linux/thermal.h b/include/linux/thermal.h
index 0ea99f50d57c..bedbaec9a42e 100644
--- a/include/linux/thermal.h
+++ b/include/linux/thermal.h
@@ -117,6 +117,7 @@ struct thermal_cooling_device {
  * @id:		unique id number for each thermal zone
  * @type:	the thermal zone device type
  * @device:	&struct device for this thermal zone
+ * @removal:	removal completion
  * @trip_temp_attrs:	attributes for trip points for sysfs: trip temperature
  * @trip_type_attrs:	attributes for trip points for sysfs: trip type
  * @trip_hyst_attrs:	attributes for trip points for sysfs: trip hysteresis
@@ -156,6 +157,7 @@ struct thermal_zone_device {
 	int id;
 	char type[THERMAL_NAME_LENGTH];
 	struct device device;
+	struct completion removal;
 	struct attribute_group trips_attribute_group;
 	struct thermal_attr *trip_temp_attrs;
 	struct thermal_attr *trip_type_attrs;
-- 
cgit v1.2.3


From 4e58aaeebb3c27993c734c99eae6881b196b1ddb Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@kernel.org>
Date: Wed, 1 Nov 2023 18:28:38 -0700
Subject: rcu: Restrict access to RCU CPU stall notifiers

Although the RCU CPU stall notifiers can be useful for dumping state when
tracking down delicate forward-progress bugs where NUMA effects cause
cache lines to be delivered to a given CPU regularly, but always in a
state that prevents that CPU from making forward progress.  These bugs can
be detected by the RCU CPU stall-warning mechanism, but in some cases,
the stall-warnings printk()s disrupt the forward-progress bug before
any useful state can be obtained.

Unfortunately, the notifier mechanism added by commit 5b404fdabacf ("rcu:
Add RCU CPU stall notifier") can make matters worse if used at all
carelessly. For example, if the stall warning was caused by a lock not
being released, then any attempt to acquire that lock in the notifier
will hang. This will prevent not only the notifier from producing any
useful output, but it will also prevent the stall-warning message from
ever appearing.

This commit therefore hides this new RCU CPU stall notifier
mechanism under a new RCU_CPU_STALL_NOTIFIER Kconfig option that
depends on both DEBUG_KERNEL and RCU_EXPERT.  In addition, the
rcupdate.rcu_cpu_stall_notifiers=1 kernel boot parameter must also
be specified.  The RCU_CPU_STALL_NOTIFIER Kconfig option's help text
contains a warning and explains the dangers of careless use, recommending
lockless notifier code.  In addition, a WARN() is triggered each time
that an attempt is made to register a stall-warning notifier in kernels
built with CONFIG_RCU_CPU_STALL_NOTIFIER=y.

This combination of measures will keep use of this mechanism confined to
debug kernels and away from routine deployments.

[ paulmck: Apply Dan Carpenter feedback. ]

Fixes: 5b404fdabacf ("rcu: Add RCU CPU stall notifier")
Reported-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
Reviewed-by: Joel Fernandes (Google) <joel@joelfernandes.org>
Signed-off-by: Neeraj Upadhyay (AMD) <neeraj.iitr10@gmail.com>
---
 Documentation/admin-guide/kernel-parameters.txt |  6 ++++++
 include/linux/rcu_notifier.h                    |  6 +++---
 kernel/rcu/Kconfig.debug                        | 25 +++++++++++++++++++++++++
 kernel/rcu/rcu.h                                |  8 +++++---
 kernel/rcu/rcutorture.c                         | 12 +++++++-----
 kernel/rcu/tree_stall.h                         | 11 ++++++++++-
 kernel/rcu/update.c                             |  6 ++++++
 7 files changed, 62 insertions(+), 12 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
index 65731b060e3f..b72e2049c487 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -5302,6 +5302,12 @@
 			Dump ftrace buffer after reporting RCU CPU
 			stall warning.
 
+	rcupdate.rcu_cpu_stall_notifiers= [KNL]
+			Provide RCU CPU stall notifiers, but see the
+			warnings in the RCU_CPU_STALL_NOTIFIER Kconfig
+			option's help text.  TL;DR:  You almost certainly
+			do not want rcupdate.rcu_cpu_stall_notifiers.
+
 	rcupdate.rcu_cpu_stall_suppress= [KNL]
 			Suppress RCU CPU stall warning messages.
 
diff --git a/include/linux/rcu_notifier.h b/include/linux/rcu_notifier.h
index ebf371364581..5640f024773b 100644
--- a/include/linux/rcu_notifier.h
+++ b/include/linux/rcu_notifier.h
@@ -13,7 +13,7 @@
 #define RCU_STALL_NOTIFY_NORM	1
 #define RCU_STALL_NOTIFY_EXP	2
 
-#ifdef CONFIG_RCU_STALL_COMMON
+#if defined(CONFIG_RCU_STALL_COMMON) && defined(CONFIG_RCU_CPU_STALL_NOTIFIER)
 
 #include <linux/notifier.h>
 #include <linux/types.h>
@@ -21,12 +21,12 @@
 int rcu_stall_chain_notifier_register(struct notifier_block *n);
 int rcu_stall_chain_notifier_unregister(struct notifier_block *n);
 
-#else // #ifdef CONFIG_RCU_STALL_COMMON
+#else // #if defined(CONFIG_RCU_STALL_COMMON) && defined(CONFIG_RCU_CPU_STALL_NOTIFIER)
 
 // No RCU CPU stall warnings in Tiny RCU.
 static inline int rcu_stall_chain_notifier_register(struct notifier_block *n) { return -EEXIST; }
 static inline int rcu_stall_chain_notifier_unregister(struct notifier_block *n) { return -ENOENT; }
 
-#endif // #else // #ifdef CONFIG_RCU_STALL_COMMON
+#endif // #else // #if defined(CONFIG_RCU_STALL_COMMON) && defined(CONFIG_RCU_CPU_STALL_NOTIFIER)
 
 #endif /* __LINUX_RCU_NOTIFIER_H */
diff --git a/kernel/rcu/Kconfig.debug b/kernel/rcu/Kconfig.debug
index 2984de629f74..9b0b52e1836f 100644
--- a/kernel/rcu/Kconfig.debug
+++ b/kernel/rcu/Kconfig.debug
@@ -105,6 +105,31 @@ config RCU_CPU_STALL_CPUTIME
 	  The boot option rcupdate.rcu_cpu_stall_cputime has the same function
 	  as this one, but will override this if it exists.
 
+config RCU_CPU_STALL_NOTIFIER
+	bool "Provide RCU CPU-stall notifiers"
+	depends on RCU_STALL_COMMON
+	depends on DEBUG_KERNEL
+	depends on RCU_EXPERT
+	default n
+	help
+	  WARNING:  You almost certainly do not want this!!!
+
+	  Enable RCU CPU-stall notifiers, which are invoked just before
+	  printing the RCU CPU stall warning.  As such, bugs in notifier
+	  callbacks can prevent stall warnings from being printed.
+	  And the whole reason that a stall warning is being printed is
+	  that something is hung up somewhere.	Therefore, the notifier
+	  callbacks must be written extremely carefully, preferably
+	  containing only lockless code.  After all, it is quite possible
+	  that the whole reason that the RCU CPU stall is happening in
+	  the first place is that someone forgot to release whatever lock
+	  that you are thinking of acquiring.  In which case, having your
+	  notifier callback acquire that lock will hang, preventing the
+	  RCU CPU stall warning from appearing.
+
+	  Say Y here if you want RCU CPU stall notifiers (you don't want them)
+	  Say N if you are unsure.
+
 config RCU_TRACE
 	bool "Enable tracing for RCU"
 	depends on DEBUG_KERNEL
diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h
index b531c33e9545..f94f65877f2b 100644
--- a/kernel/rcu/rcu.h
+++ b/kernel/rcu/rcu.h
@@ -262,6 +262,8 @@ static inline bool rcu_stall_is_suppressed_at_boot(void)
 	return rcu_cpu_stall_suppress_at_boot && !rcu_inkernel_boot_has_ended();
 }
 
+extern int rcu_cpu_stall_notifiers;
+
 #ifdef CONFIG_RCU_STALL_COMMON
 
 extern int rcu_cpu_stall_ftrace_dump;
@@ -659,10 +661,10 @@ static inline bool rcu_cpu_beenfullyonline(int cpu) { return true; }
 bool rcu_cpu_beenfullyonline(int cpu);
 #endif
 
-#ifdef CONFIG_RCU_STALL_COMMON
+#if defined(CONFIG_RCU_STALL_COMMON) && defined(CONFIG_RCU_CPU_STALL_NOTIFIER)
 int rcu_stall_notifier_call_chain(unsigned long val, void *v);
-#else // #ifdef CONFIG_RCU_STALL_COMMON
+#else // #if defined(CONFIG_RCU_STALL_COMMON) && defined(CONFIG_RCU_CPU_STALL_NOTIFIER)
 static inline int rcu_stall_notifier_call_chain(unsigned long val, void *v) { return NOTIFY_DONE; }
-#endif // #else // #ifdef CONFIG_RCU_STALL_COMMON
+#endif // #else // #if defined(CONFIG_RCU_STALL_COMMON) && defined(CONFIG_RCU_CPU_STALL_NOTIFIER)
 
 #endif /* __LINUX_RCU_H */
diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c
index 30fc9d34e329..07a6a183c555 100644
--- a/kernel/rcu/rcutorture.c
+++ b/kernel/rcu/rcutorture.c
@@ -2450,10 +2450,12 @@ static int rcu_torture_stall(void *args)
 	unsigned long stop_at;
 
 	VERBOSE_TOROUT_STRING("rcu_torture_stall task started");
-	ret = rcu_stall_chain_notifier_register(&rcu_torture_stall_block);
-	if (ret)
-		pr_info("%s: rcu_stall_chain_notifier_register() returned %d, %sexpected.\n",
-			__func__, ret, !IS_ENABLED(CONFIG_RCU_STALL_COMMON) ? "un" : "");
+	if (rcu_cpu_stall_notifiers) {
+		ret = rcu_stall_chain_notifier_register(&rcu_torture_stall_block);
+		if (ret)
+			pr_info("%s: rcu_stall_chain_notifier_register() returned %d, %sexpected.\n",
+				__func__, ret, !IS_ENABLED(CONFIG_RCU_STALL_COMMON) ? "un" : "");
+	}
 	if (stall_cpu_holdoff > 0) {
 		VERBOSE_TOROUT_STRING("rcu_torture_stall begin holdoff");
 		schedule_timeout_interruptible(stall_cpu_holdoff * HZ);
@@ -2497,7 +2499,7 @@ static int rcu_torture_stall(void *args)
 		cur_ops->readunlock(idx);
 	}
 	pr_alert("%s end.\n", __func__);
-	if (!ret) {
+	if (rcu_cpu_stall_notifiers && !ret) {
 		ret = rcu_stall_chain_notifier_unregister(&rcu_torture_stall_block);
 		if (ret)
 			pr_info("%s: rcu_stall_chain_notifier_unregister() returned %d.\n", __func__, ret);
diff --git a/kernel/rcu/tree_stall.h b/kernel/rcu/tree_stall.h
index ac8e86babe44..5d666428546b 100644
--- a/kernel/rcu/tree_stall.h
+++ b/kernel/rcu/tree_stall.h
@@ -1061,6 +1061,7 @@ static int __init rcu_sysrq_init(void)
 }
 early_initcall(rcu_sysrq_init);
 
+#ifdef CONFIG_RCU_CPU_STALL_NOTIFIER
 
 //////////////////////////////////////////////////////////////////////////////
 //
@@ -1081,7 +1082,13 @@ static ATOMIC_NOTIFIER_HEAD(rcu_cpu_stall_notifier_list);
  */
 int rcu_stall_chain_notifier_register(struct notifier_block *n)
 {
-	return atomic_notifier_chain_register(&rcu_cpu_stall_notifier_list, n);
+	int rcsn = rcu_cpu_stall_notifiers;
+
+	WARN(1, "Adding %pS() to RCU stall notifier list (%s).\n", n->notifier_call,
+	     rcsn ? "possibly suppressing RCU CPU stall warnings" : "failed, so all is well");
+	if (rcsn)
+		return atomic_notifier_chain_register(&rcu_cpu_stall_notifier_list, n);
+	return -EEXIST;
 }
 EXPORT_SYMBOL_GPL(rcu_stall_chain_notifier_register);
 
@@ -1115,3 +1122,5 @@ int rcu_stall_notifier_call_chain(unsigned long val, void *v)
 {
 	return atomic_notifier_call_chain(&rcu_cpu_stall_notifier_list, val, v);
 }
+
+#endif // #ifdef CONFIG_RCU_CPU_STALL_NOTIFIER
diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c
index c534d6806d3d..46aaaa9fe339 100644
--- a/kernel/rcu/update.c
+++ b/kernel/rcu/update.c
@@ -538,9 +538,15 @@ long torture_sched_setaffinity(pid_t pid, const struct cpumask *in_mask)
 EXPORT_SYMBOL_GPL(torture_sched_setaffinity);
 #endif
 
+int rcu_cpu_stall_notifiers __read_mostly; // !0 = provide stall notifiers (rarely useful)
+EXPORT_SYMBOL_GPL(rcu_cpu_stall_notifiers);
+
 #ifdef CONFIG_RCU_STALL_COMMON
 int rcu_cpu_stall_ftrace_dump __read_mostly;
 module_param(rcu_cpu_stall_ftrace_dump, int, 0644);
+#ifdef CONFIG_RCU_CPU_STALL_NOTIFIER
+module_param(rcu_cpu_stall_notifiers, int, 0444);
+#endif // #ifdef CONFIG_RCU_CPU_STALL_NOTIFIER
 int rcu_cpu_stall_suppress __read_mostly; // !0 = suppress stall warnings.
 EXPORT_SYMBOL_GPL(rcu_cpu_stall_suppress);
 module_param(rcu_cpu_stall_suppress, int, 0644);
-- 
cgit v1.2.3


From 23d90b2404050c00c15058710d56bb46e1c5ab36 Mon Sep 17 00:00:00 2001
From: Pedro Falcato <pedro.falcato@gmail.com>
Date: Fri, 20 Oct 2023 18:30:15 +0100
Subject: rcu: Remove unused macros from rcupdate.h

ulong2long, USHORT_CMP_GE and USHORT_CMP_LT are redundant and have been
unused for quite a few releases.

Signed-off-by: Pedro Falcato <pedro.falcato@gmail.com>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
Signed-off-by: Neeraj Upadhyay (AMD) <neeraj.iitr10@gmail.com>
---
 include/linux/rcupdate.h | 3 ---
 1 file changed, 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h
index f7206b2623c9..aa87c82236dd 100644
--- a/include/linux/rcupdate.h
+++ b/include/linux/rcupdate.h
@@ -34,9 +34,6 @@
 
 #define ULONG_CMP_GE(a, b)	(ULONG_MAX / 2 >= (a) - (b))
 #define ULONG_CMP_LT(a, b)	(ULONG_MAX / 2 < (a) - (b))
-#define ulong2long(a)		(*(long *)(&(a)))
-#define USHORT_CMP_GE(a, b)	(USHRT_MAX / 2 >= (unsigned short)((a) - (b)))
-#define USHORT_CMP_LT(a, b)	(USHRT_MAX / 2 < (unsigned short)((a) - (b)))
 
 /* Exported common interfaces */
 void call_rcu(struct rcu_head *head, rcu_callback_t func);
-- 
cgit v1.2.3


From c5e2a973448d958feb7881e4d875eac59fdeff3d Mon Sep 17 00:00:00 2001
From: Jamal Hadi Salim <jhs@mojatatu.com>
Date: Fri, 8 Dec 2023 16:28:41 -0300
Subject: rtnl: add helper to check if rtnl group has listeners

As of today, rtnl code creates a new skb and unconditionally fills and
broadcasts it to the relevant group. For most operations this is okay
and doesn't waste resources in general.

When operations are done without the rtnl_lock, as in tc-flower, such
skb allocation, message fill and no-op broadcasting can happen in all
cores of the system, which contributes to system pressure and wastes
precious cpu cycles when no one will receive the built message.

Introduce this helper so rtnetlink operations can simply check if someone
is listening and then proceed if necessary.

Reviewed-by: Jiri Pirko <jiri@nvidia.com>
Reviewed-by: Simon Horman <horms@kernel.org>
Signed-off-by: Jamal Hadi Salim <jhs@mojatatu.com>
Signed-off-by: Victor Nogueira <victor@mojatatu.com>
Signed-off-by: Pedro Tammela <pctammela@mojatatu.com>
Link: https://lore.kernel.org/r/20231208192847.714940-2-pctammela@mojatatu.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/linux/rtnetlink.h | 7 +++++++
 1 file changed, 7 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/rtnetlink.h b/include/linux/rtnetlink.h
index 3d6cf306cd55..a7d757e96c55 100644
--- a/include/linux/rtnetlink.h
+++ b/include/linux/rtnetlink.h
@@ -130,4 +130,11 @@ extern int ndo_dflt_bridge_getlink(struct sk_buff *skb, u32 pid, u32 seq,
 
 extern void rtnl_offload_xstats_notify(struct net_device *dev);
 
+static inline int rtnl_has_listeners(const struct net *net, u32 group)
+{
+	struct sock *rtnl = net->rtnl;
+
+	return netlink_has_listeners(rtnl, group);
+}
+
 #endif	/* __LINUX_RTNETLINK_H */
-- 
cgit v1.2.3


From 8439109b76a3c405808383bf9dd532fc4b9c2dbd Mon Sep 17 00:00:00 2001
From: Victor Nogueira <victor@mojatatu.com>
Date: Fri, 8 Dec 2023 16:28:42 -0300
Subject: rtnl: add helper to check if a notification is needed

Building on the rtnl_has_listeners helper, add the rtnl_notify_needed
helper to check if we can bail out early in the notification routines.

Reviewed-by: Jiri Pirko <jiri@nvidia.com>
Reviewed-by: Simon Horman <horms@kernel.org>
Signed-off-by: Victor Nogueira <victor@mojatatu.com>
Signed-off-by: Pedro Tammela <pctammela@mojatatu.com>
Link: https://lore.kernel.org/r/20231208192847.714940-3-pctammela@mojatatu.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/linux/rtnetlink.h | 15 +++++++++++++++
 1 file changed, 15 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/rtnetlink.h b/include/linux/rtnetlink.h
index a7d757e96c55..0cbbbded0331 100644
--- a/include/linux/rtnetlink.h
+++ b/include/linux/rtnetlink.h
@@ -137,4 +137,19 @@ static inline int rtnl_has_listeners(const struct net *net, u32 group)
 	return netlink_has_listeners(rtnl, group);
 }
 
+/**
+ * rtnl_notify_needed - check if notification is needed
+ * @net: Pointer to the net namespace
+ * @nlflags: netlink ingress message flags
+ * @group: rtnl group
+ *
+ * Based on the ingress message flags and rtnl group, returns true
+ * if a notification is needed, false otherwise.
+ */
+static inline bool
+rtnl_notify_needed(const struct net *net, u16 nlflags, u32 group)
+{
+	return (nlflags & NLM_F_ECHO) || rtnl_has_listeners(net, group);
+}
+
 #endif	/* __LINUX_RTNETLINK_H */
-- 
cgit v1.2.3


From ddb6b284bdc32b6e218b3d90b5a745ea26620812 Mon Sep 17 00:00:00 2001
From: Pedro Tammela <pctammela@mojatatu.com>
Date: Fri, 8 Dec 2023 16:28:43 -0300
Subject: rtnl: add helper to send if skb is not null

This is a convenience helper for routines handling conditional rtnl
events, that is code that might send a notification depending on
rtnl_has_listeners/rtnl_notify_needed.

Instead of:
   if (skb)
      rtnetlink_send(...)

Use:
      rtnetlink_maybe_send(...)

Reviewed-by: Jiri Pirko <jiri@nvidia.com>
Reviewed-by: Simon Horman <horms@kernel.org>
Signed-off-by: Pedro Tammela <pctammela@mojatatu.com>
Link: https://lore.kernel.org/r/20231208192847.714940-4-pctammela@mojatatu.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/linux/rtnetlink.h | 7 +++++++
 1 file changed, 7 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/rtnetlink.h b/include/linux/rtnetlink.h
index 0cbbbded0331..6a8543b34e2c 100644
--- a/include/linux/rtnetlink.h
+++ b/include/linux/rtnetlink.h
@@ -10,6 +10,13 @@
 #include <uapi/linux/rtnetlink.h>
 
 extern int rtnetlink_send(struct sk_buff *skb, struct net *net, u32 pid, u32 group, int echo);
+
+static inline int rtnetlink_maybe_send(struct sk_buff *skb, struct net *net,
+				       u32 pid, u32 group, int echo)
+{
+	return !skb ? 0 : rtnetlink_send(skb, net, pid, group, echo);
+}
+
 extern int rtnl_unicast(struct sk_buff *skb, struct net *net, u32 pid);
 extern void rtnl_notify(struct sk_buff *skb, struct net *net, u32 pid,
 			u32 group, const struct nlmsghdr *nlh, gfp_t flags);
-- 
cgit v1.2.3


From 1a1ad782dcbbacd9e8d4e2e7ff1bf14d1db80727 Mon Sep 17 00:00:00 2001
From: Andrii Nakryiko <andrii@kernel.org>
Date: Mon, 4 Dec 2023 15:39:21 -0800
Subject: bpf: tidy up exception callback management a bit

Use the fact that we are passing subprog index around and have
a corresponding struct bpf_subprog_info in bpf_verifier_env for each
subprogram. We don't need to separately pass around a flag whether
subprog is exception callback or not, each relevant verifier function
can determine this using provided subprog index if we maintain
bpf_subprog_info properly.

Also move out exception callback-specific logic from
btf_prepare_func_args(), keeping it generic. We can enforce all these
restriction right before exception callback verification pass. We add
out parameter, arg_cnt, for now, but this will be unnecessary with
subsequent refactoring and will be removed.

Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Acked-by: Eduard Zingerman <eddyz87@gmail.com>
Link: https://lore.kernel.org/r/20231204233931.49758-4-andrii@kernel.org
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/bpf.h   |  2 +-
 kernel/bpf/btf.c      | 11 ++---------
 kernel/bpf/verifier.c | 52 ++++++++++++++++++++++++++++++++++++++-------------
 3 files changed, 42 insertions(+), 23 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index c1a06263a4f3..0bd4889e917a 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -2494,7 +2494,7 @@ int btf_check_subprog_arg_match(struct bpf_verifier_env *env, int subprog,
 int btf_check_subprog_call(struct bpf_verifier_env *env, int subprog,
 			   struct bpf_reg_state *regs);
 int btf_prepare_func_args(struct bpf_verifier_env *env, int subprog,
-			  struct bpf_reg_state *reg, bool is_ex_cb);
+			  struct bpf_reg_state *reg, u32 *nargs);
 int btf_check_type_match(struct bpf_verifier_log *log, const struct bpf_prog *prog,
 			 struct btf *btf, const struct btf_type *t);
 const char *btf_find_decl_tag_value(const struct btf *btf, const struct btf_type *pt,
diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
index 63cf4128fc05..d56433bf8aba 100644
--- a/kernel/bpf/btf.c
+++ b/kernel/bpf/btf.c
@@ -6956,7 +6956,7 @@ int btf_check_subprog_call(struct bpf_verifier_env *env, int subprog,
  * (either PTR_TO_CTX or SCALAR_VALUE).
  */
 int btf_prepare_func_args(struct bpf_verifier_env *env, int subprog,
-			  struct bpf_reg_state *regs, bool is_ex_cb)
+			  struct bpf_reg_state *regs, u32 *arg_cnt)
 {
 	struct bpf_verifier_log *log = &env->log;
 	struct bpf_prog *prog = env->prog;
@@ -7013,6 +7013,7 @@ int btf_prepare_func_args(struct bpf_verifier_env *env, int subprog,
 			tname, nargs, MAX_BPF_FUNC_REG_ARGS);
 		return -EINVAL;
 	}
+	*arg_cnt = nargs;
 	/* check that function returns int, exception cb also requires this */
 	t = btf_type_by_id(btf, t->type);
 	while (btf_type_is_modifier(t))
@@ -7062,14 +7063,6 @@ int btf_prepare_func_args(struct bpf_verifier_env *env, int subprog,
 			i, btf_type_str(t), tname);
 		return -EINVAL;
 	}
-	/* We have already ensured that the callback returns an integer, just
-	 * like all global subprogs. We need to determine it only has a single
-	 * scalar argument.
-	 */
-	if (is_ex_cb && (nargs != 1 || regs[BPF_REG_1].type != SCALAR_VALUE)) {
-		bpf_log(log, "exception cb only supports single integer argument\n");
-		return -EINVAL;
-	}
 	return 0;
 }
 
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 727a59e4a647..d1755db1b503 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -442,6 +442,25 @@ static struct bpf_func_info_aux *subprog_aux(const struct bpf_verifier_env *env,
 	return &env->prog->aux->func_info_aux[subprog];
 }
 
+static struct bpf_subprog_info *subprog_info(struct bpf_verifier_env *env, int subprog)
+{
+	return &env->subprog_info[subprog];
+}
+
+static void mark_subprog_exc_cb(struct bpf_verifier_env *env, int subprog)
+{
+	struct bpf_subprog_info *info = subprog_info(env, subprog);
+
+	info->is_cb = true;
+	info->is_async_cb = true;
+	info->is_exception_cb = true;
+}
+
+static bool subprog_is_exc_cb(struct bpf_verifier_env *env, int subprog)
+{
+	return subprog_info(env, subprog)->is_exception_cb;
+}
+
 static bool reg_may_point_to_spin_lock(const struct bpf_reg_state *reg)
 {
 	return btf_record_has_field(reg_btf_record(reg), BPF_SPIN_LOCK);
@@ -2892,6 +2911,7 @@ static int add_subprog_and_kfunc(struct bpf_verifier_env *env)
 			if (env->subprog_info[i].start != ex_cb_insn)
 				continue;
 			env->exception_callback_subprog = i;
+			mark_subprog_exc_cb(env, i);
 			break;
 		}
 	}
@@ -19166,9 +19186,7 @@ static int do_misc_fixups(struct bpf_verifier_env *env)
 
 		env->exception_callback_subprog = env->subprog_cnt - 1;
 		/* Don't update insn_cnt, as add_hidden_subprog always appends insns */
-		env->subprog_info[env->exception_callback_subprog].is_cb = true;
-		env->subprog_info[env->exception_callback_subprog].is_async_cb = true;
-		env->subprog_info[env->exception_callback_subprog].is_exception_cb = true;
+		mark_subprog_exc_cb(env, env->exception_callback_subprog);
 	}
 
 	for (i = 0; i < insn_cnt; i++, insn++) {
@@ -19868,7 +19886,7 @@ static void free_states(struct bpf_verifier_env *env)
 	}
 }
 
-static int do_check_common(struct bpf_verifier_env *env, int subprog, bool is_ex_cb)
+static int do_check_common(struct bpf_verifier_env *env, int subprog)
 {
 	bool pop_log = !(env->log.level & BPF_LOG_LEVEL2);
 	struct bpf_verifier_state *state;
@@ -19899,9 +19917,23 @@ static int do_check_common(struct bpf_verifier_env *env, int subprog, bool is_ex
 
 	regs = state->frame[state->curframe]->regs;
 	if (subprog || env->prog->type == BPF_PROG_TYPE_EXT) {
-		ret = btf_prepare_func_args(env, subprog, regs, is_ex_cb);
+		u32 nargs;
+
+		ret = btf_prepare_func_args(env, subprog, regs, &nargs);
 		if (ret)
 			goto out;
+		if (subprog_is_exc_cb(env, subprog)) {
+			state->frame[0]->in_exception_callback_fn = true;
+			/* We have already ensured that the callback returns an integer, just
+			 * like all global subprogs. We need to determine it only has a single
+			 * scalar argument.
+			 */
+			if (nargs != 1 || regs[BPF_REG_1].type != SCALAR_VALUE) {
+				verbose(env, "exception cb only supports single integer argument\n");
+				ret = -EINVAL;
+				goto out;
+			}
+		}
 		for (i = BPF_REG_1; i <= BPF_REG_5; i++) {
 			if (regs[i].type == PTR_TO_CTX)
 				mark_reg_known_zero(env, regs, i);
@@ -19915,12 +19947,6 @@ static int do_check_common(struct bpf_verifier_env *env, int subprog, bool is_ex
 				regs[i].id = ++env->id_gen;
 			}
 		}
-		if (is_ex_cb) {
-			state->frame[0]->in_exception_callback_fn = true;
-			env->subprog_info[subprog].is_cb = true;
-			env->subprog_info[subprog].is_async_cb = true;
-			env->subprog_info[subprog].is_exception_cb = true;
-		}
 	} else {
 		/* 1st arg to a function */
 		regs[BPF_REG_1].type = PTR_TO_CTX;
@@ -20000,7 +20026,7 @@ again:
 
 		env->insn_idx = env->subprog_info[i].start;
 		WARN_ON_ONCE(env->insn_idx == 0);
-		ret = do_check_common(env, i, env->exception_callback_subprog == i);
+		ret = do_check_common(env, i);
 		if (ret) {
 			return ret;
 		} else if (env->log.level & BPF_LOG_LEVEL) {
@@ -20030,7 +20056,7 @@ static int do_check_main(struct bpf_verifier_env *env)
 	int ret;
 
 	env->insn_idx = 0;
-	ret = do_check_common(env, 0, false);
+	ret = do_check_common(env, 0);
 	if (!ret)
 		env->prog->aux->stack_depth = env->subprog_info[0].stack_depth;
 	return ret;
-- 
cgit v1.2.3


From 406a6fa44bfbc8563f0612b08d43df2fa65e8bc5 Mon Sep 17 00:00:00 2001
From: Andrii Nakryiko <andrii@kernel.org>
Date: Mon, 4 Dec 2023 15:39:22 -0800
Subject: bpf: use bitfields for simple per-subprog bool flags

We have a bunch of bool flags for each subprog. Instead of wasting bytes
for them, use bitfields instead.

Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Acked-by: Eduard Zingerman <eddyz87@gmail.com>
Link: https://lore.kernel.org/r/20231204233931.49758-5-andrii@kernel.org
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/bpf_verifier.h | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h
index 314b679fb494..c2819a6579a5 100644
--- a/include/linux/bpf_verifier.h
+++ b/include/linux/bpf_verifier.h
@@ -611,12 +611,12 @@ struct bpf_subprog_info {
 	u32 start; /* insn idx of function entry point */
 	u32 linfo_idx; /* The idx to the main_prog->aux->linfo */
 	u16 stack_depth; /* max. stack depth used by this function */
-	bool has_tail_call;
-	bool tail_call_reachable;
-	bool has_ld_abs;
-	bool is_cb;
-	bool is_async_cb;
-	bool is_exception_cb;
+	bool has_tail_call: 1;
+	bool tail_call_reachable: 1;
+	bool has_ld_abs: 1;
+	bool is_cb: 1;
+	bool is_async_cb: 1;
+	bool is_exception_cb: 1;
 };
 
 struct bpf_verifier_env;
-- 
cgit v1.2.3


From 1ca51628e7303718fdabe29c7d36f582500d5cf2 Mon Sep 17 00:00:00 2001
From: Shun Hao <shunh@nvidia.com>
Date: Wed, 6 Dec 2023 16:01:34 +0200
Subject: net/mlx5: Introduce indirect-sw-encap ICM properties

Add new fields for device memory capabilities, in order to support
creation of new ICM memory type of SW encap.

Signed-off-by: Shun Hao <shunh@nvidia.com>
Link: https://lore.kernel.org/r/107cca7dd6a932a1704abf6ebd1b801105546a8e.1701871118.git.leon@kernel.org
Signed-off-by: Leon Romanovsky <leon@kernel.org>
---
 include/linux/mlx5/mlx5_ifc.h | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h
index 6f3631425f38..02b25dc36143 100644
--- a/include/linux/mlx5/mlx5_ifc.h
+++ b/include/linux/mlx5/mlx5_ifc.h
@@ -1193,7 +1193,8 @@ struct mlx5_ifc_device_mem_cap_bits {
 	u8         log_sw_icm_alloc_granularity[0x6];
 	u8         log_steering_sw_icm_size[0x8];
 
-	u8         reserved_at_120[0x18];
+	u8         log_indirect_encap_sw_icm_size[0x8];
+	u8         reserved_at_128[0x10];
 	u8         log_header_modify_pattern_sw_icm_size[0x8];
 
 	u8         header_modify_sw_icm_start_address[0x40];
@@ -1204,7 +1205,11 @@ struct mlx5_ifc_device_mem_cap_bits {
 
 	u8         memic_operations[0x20];
 
-	u8         reserved_at_220[0x5e0];
+	u8         reserved_at_220[0x20];
+
+	u8         indirect_encap_sw_icm_start_address[0x40];
+
+	u8         reserved_at_280[0x580];
 };
 
 struct mlx5_ifc_device_event_cap_bits {
-- 
cgit v1.2.3


From a429ec96c07f3020af12029acefc46f42ff5c91c Mon Sep 17 00:00:00 2001
From: Shun Hao <shunh@nvidia.com>
Date: Wed, 6 Dec 2023 16:01:35 +0200
Subject: RDMA/mlx5: Support handling of SW encap ICM area

New type for this ICM area, now the user can allocate/deallocate
the new type of SW encap ICM memory, to store the encap header data
which are managed by SW.

Signed-off-by: Shun Hao <shunh@nvidia.com>
Link: https://lore.kernel.org/r/546fe43fc700240709e30acf7713ec6834d652bd.1701871118.git.leon@kernel.org
Signed-off-by: Leon Romanovsky <leon@kernel.org>
---
 drivers/infiniband/hw/mlx5/dm.c           | 5 +++++
 drivers/infiniband/hw/mlx5/mr.c           | 1 +
 include/linux/mlx5/driver.h               | 1 +
 include/uapi/rdma/mlx5_user_ioctl_verbs.h | 1 +
 4 files changed, 8 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/infiniband/hw/mlx5/dm.c b/drivers/infiniband/hw/mlx5/dm.c
index 3669c90b2dad..b4c97fb62abf 100644
--- a/drivers/infiniband/hw/mlx5/dm.c
+++ b/drivers/infiniband/hw/mlx5/dm.c
@@ -341,6 +341,8 @@ static enum mlx5_sw_icm_type get_icm_type(int uapi_type)
 		return MLX5_SW_ICM_TYPE_HEADER_MODIFY;
 	case MLX5_IB_UAPI_DM_TYPE_HEADER_MODIFY_PATTERN_SW_ICM:
 		return MLX5_SW_ICM_TYPE_HEADER_MODIFY_PATTERN;
+	case MLX5_IB_UAPI_DM_TYPE_ENCAP_SW_ICM:
+		return MLX5_SW_ICM_TYPE_SW_ENCAP;
 	case MLX5_IB_UAPI_DM_TYPE_STEERING_SW_ICM:
 	default:
 		return MLX5_SW_ICM_TYPE_STEERING;
@@ -364,6 +366,7 @@ static struct ib_dm *handle_alloc_dm_sw_icm(struct ib_ucontext *ctx,
 	switch (type) {
 	case MLX5_IB_UAPI_DM_TYPE_STEERING_SW_ICM:
 	case MLX5_IB_UAPI_DM_TYPE_HEADER_MODIFY_SW_ICM:
+	case MLX5_IB_UAPI_DM_TYPE_ENCAP_SW_ICM:
 		if (!(MLX5_CAP_FLOWTABLE_NIC_RX(dev, sw_owner) ||
 		      MLX5_CAP_FLOWTABLE_NIC_TX(dev, sw_owner) ||
 		      MLX5_CAP_FLOWTABLE_NIC_RX(dev, sw_owner_v2) ||
@@ -438,6 +441,7 @@ struct ib_dm *mlx5_ib_alloc_dm(struct ib_device *ibdev,
 	case MLX5_IB_UAPI_DM_TYPE_STEERING_SW_ICM:
 	case MLX5_IB_UAPI_DM_TYPE_HEADER_MODIFY_SW_ICM:
 	case MLX5_IB_UAPI_DM_TYPE_HEADER_MODIFY_PATTERN_SW_ICM:
+	case MLX5_IB_UAPI_DM_TYPE_ENCAP_SW_ICM:
 		return handle_alloc_dm_sw_icm(context, attr, attrs, type);
 	default:
 		return ERR_PTR(-EOPNOTSUPP);
@@ -491,6 +495,7 @@ static int mlx5_ib_dealloc_dm(struct ib_dm *ibdm,
 	case MLX5_IB_UAPI_DM_TYPE_STEERING_SW_ICM:
 	case MLX5_IB_UAPI_DM_TYPE_HEADER_MODIFY_SW_ICM:
 	case MLX5_IB_UAPI_DM_TYPE_HEADER_MODIFY_PATTERN_SW_ICM:
+	case MLX5_IB_UAPI_DM_TYPE_ENCAP_SW_ICM:
 		return mlx5_dm_icm_dealloc(ctx, to_icm(ibdm));
 	default:
 		return -EOPNOTSUPP;
diff --git a/drivers/infiniband/hw/mlx5/mr.c b/drivers/infiniband/hw/mlx5/mr.c
index 18e459b55746..a8ee2ca1f4a1 100644
--- a/drivers/infiniband/hw/mlx5/mr.c
+++ b/drivers/infiniband/hw/mlx5/mr.c
@@ -1347,6 +1347,7 @@ struct ib_mr *mlx5_ib_reg_dm_mr(struct ib_pd *pd, struct ib_dm *dm,
 	case MLX5_IB_UAPI_DM_TYPE_STEERING_SW_ICM:
 	case MLX5_IB_UAPI_DM_TYPE_HEADER_MODIFY_SW_ICM:
 	case MLX5_IB_UAPI_DM_TYPE_HEADER_MODIFY_PATTERN_SW_ICM:
+	case MLX5_IB_UAPI_DM_TYPE_ENCAP_SW_ICM:
 		if (attr->access_flags & ~MLX5_IB_DM_SW_ICM_ALLOWED_ACCESS)
 			return ERR_PTR(-EINVAL);
 
diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h
index d2b8d4a74a30..96cb8845682d 100644
--- a/include/linux/mlx5/driver.h
+++ b/include/linux/mlx5/driver.h
@@ -688,6 +688,7 @@ enum mlx5_sw_icm_type {
 	MLX5_SW_ICM_TYPE_STEERING,
 	MLX5_SW_ICM_TYPE_HEADER_MODIFY,
 	MLX5_SW_ICM_TYPE_HEADER_MODIFY_PATTERN,
+	MLX5_SW_ICM_TYPE_SW_ENCAP,
 };
 
 #define MLX5_MAX_RESERVED_GIDS 8
diff --git a/include/uapi/rdma/mlx5_user_ioctl_verbs.h b/include/uapi/rdma/mlx5_user_ioctl_verbs.h
index 7af9e09ea556..3189c7f08d17 100644
--- a/include/uapi/rdma/mlx5_user_ioctl_verbs.h
+++ b/include/uapi/rdma/mlx5_user_ioctl_verbs.h
@@ -64,6 +64,7 @@ enum mlx5_ib_uapi_dm_type {
 	MLX5_IB_UAPI_DM_TYPE_STEERING_SW_ICM,
 	MLX5_IB_UAPI_DM_TYPE_HEADER_MODIFY_SW_ICM,
 	MLX5_IB_UAPI_DM_TYPE_HEADER_MODIFY_PATTERN_SW_ICM,
+	MLX5_IB_UAPI_DM_TYPE_ENCAP_SW_ICM,
 };
 
 enum mlx5_ib_uapi_devx_create_event_channel_flags {
-- 
cgit v1.2.3


From eb524d0fd46249b0b9e5d52372dc65d8b32430c3 Mon Sep 17 00:00:00 2001
From: Mark Bloch <mbloch@nvidia.com>
Date: Wed, 6 Dec 2023 16:01:37 +0200
Subject: net/mlx5: E-Switch, expose eswitch manager vport

Expose the ability the query the eswitch manager vport number.
Next patch will utilize this capability to reveal the correct
register C0 value to the users.

Signed-off-by: Mark Bloch <mbloch@nvidia.com>
Link: https://lore.kernel.org/r/614fb0e216250e2ce3340471ec141b83ec45c7f4.1701871118.git.leon@kernel.org
Signed-off-by: Leon Romanovsky <leon@kernel.org>
---
 drivers/net/ethernet/mellanox/mlx5/core/eswitch.h | 7 -------
 include/linux/mlx5/eswitch.h                      | 8 ++++++++
 2 files changed, 8 insertions(+), 7 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h
index 37ab66e7b403..60a9a6cba0b1 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h
@@ -616,13 +616,6 @@ static inline bool mlx5_esw_allowed(const struct mlx5_eswitch *esw)
 	return esw && MLX5_ESWITCH_MANAGER(esw->dev);
 }
 
-/* The returned number is valid only when the dev is eswitch manager. */
-static inline u16 mlx5_eswitch_manager_vport(struct mlx5_core_dev *dev)
-{
-	return mlx5_core_is_ecpf_esw_manager(dev) ?
-		MLX5_VPORT_ECPF : MLX5_VPORT_PF;
-}
-
 static inline bool
 mlx5_esw_is_manager_vport(const struct mlx5_eswitch *esw, u16 vport_num)
 {
diff --git a/include/linux/mlx5/eswitch.h b/include/linux/mlx5/eswitch.h
index 950d2431a53c..df73a2ccc9af 100644
--- a/include/linux/mlx5/eswitch.h
+++ b/include/linux/mlx5/eswitch.h
@@ -7,6 +7,7 @@
 #define _MLX5_ESWITCH_
 
 #include <linux/mlx5/driver.h>
+#include <linux/mlx5/vport.h>
 #include <net/devlink.h>
 
 #define MLX5_ESWITCH_MANAGER(mdev) MLX5_CAP_GEN(mdev, eswitch_manager)
@@ -210,4 +211,11 @@ static inline bool is_mdev_switchdev_mode(struct mlx5_core_dev *dev)
 	return mlx5_eswitch_mode(dev) == MLX5_ESWITCH_OFFLOADS;
 }
 
+/* The returned number is valid only when the dev is eswitch manager. */
+static inline u16 mlx5_eswitch_manager_vport(struct mlx5_core_dev *dev)
+{
+	return mlx5_core_is_ecpf_esw_manager(dev) ?
+		MLX5_VPORT_ECPF : MLX5_VPORT_PF;
+}
+
 #endif
-- 
cgit v1.2.3


From 98fb9b9680c9f3895ced02d6a73e27f5d7b5892b Mon Sep 17 00:00:00 2001
From: Johannes Berg <johannes.berg@intel.com>
Date: Wed, 6 Dec 2023 22:37:57 +0100
Subject: wifi: ieee80211: don't require protected vendor action frames

For vendor action frames, whether a protected one should be
used or not is clearly up to the individual vendor and frame,
so even though a protected dual is defined, it may not get
used. Thus, don't require protection for vendor action frames
when they're used in a connection.

Since we obviously don't process frames unknown to the kernel
in the kernel, it may makes sense to invert this list to have
all the ones the kernel processes and knows to be requiring
protection, but that'd be a different change.

Fixes: 91535613b609 ("wifi: mac80211: don't drop all unprotected public action frames")
Reported-by: Jouni Malinen <j@w1.fi>
Link: https://msgid.link/20231206223801.f6a2cf4e67ec.Ifa6acc774bd67801d3dafb405278f297683187aa@changeid
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/linux/ieee80211.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h
index c2ac9e9e7ee9..2b5e500bf093 100644
--- a/include/linux/ieee80211.h
+++ b/include/linux/ieee80211.h
@@ -4447,7 +4447,8 @@ ieee80211_is_protected_dual_of_public_action(struct sk_buff *skb)
 		action != WLAN_PUB_ACTION_LOC_TRACK_NOTI &&
 		action != WLAN_PUB_ACTION_FTM_REQUEST &&
 		action != WLAN_PUB_ACTION_FTM_RESPONSE &&
-		action != WLAN_PUB_ACTION_FILS_DISCOVERY;
+		action != WLAN_PUB_ACTION_FILS_DISCOVERY &&
+		action != WLAN_PUB_ACTION_VENDOR_SPECIFIC;
 }
 
 /**
-- 
cgit v1.2.3


From 8f23f5dba6b4693448144bde4dd6f537543442c2 Mon Sep 17 00:00:00 2001
From: Jason Gunthorpe <jgg@nvidia.com>
Date: Fri, 27 Oct 2023 08:05:20 +0800
Subject: iommu: Change kconfig around IOMMU_SVA

Linus suggested that the kconfig here is confusing:

https://lore.kernel.org/all/CAHk-=wgUiAtiszwseM1p2fCJ+sC4XWQ+YN4TanFhUgvUqjr9Xw@mail.gmail.com/

Let's break it into three kconfigs controlling distinct things:

 - CONFIG_IOMMU_MM_DATA controls if the mm_struct has the additional
   fields for the IOMMU. Currently only PASID, but later patches store
   a struct iommu_mm_data *

 - CONFIG_ARCH_HAS_CPU_PASID controls if the arch needs the scheduling bit
   for keeping track of the ENQCMD instruction. x86 will select this if
   IOMMU_SVA is enabled

 - IOMMU_SVA controls if the IOMMU core compiles in the SVA support code
   for iommu driver use and the IOMMU exported API

This way ARM will not enable CONFIG_ARCH_HAS_CPU_PASID

Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
Link: https://lore.kernel.org/r/20231027000525.1278806-2-tina.zhang@intel.com
Signed-off-by: Joerg Roedel <jroedel@suse.de>
---
 arch/Kconfig             | 5 +++++
 arch/x86/Kconfig         | 1 +
 arch/x86/kernel/traps.c  | 2 +-
 drivers/iommu/Kconfig    | 1 +
 include/linux/iommu.h    | 2 +-
 include/linux/mm_types.h | 2 +-
 include/linux/sched.h    | 2 +-
 kernel/fork.c            | 2 +-
 mm/Kconfig               | 3 +++
 mm/init-mm.c             | 2 +-
 10 files changed, 16 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/arch/Kconfig b/arch/Kconfig
index f4b210ab0612..3e49f862670e 100644
--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -301,6 +301,11 @@ config ARCH_HAS_DMA_CLEAR_UNCACHED
 config ARCH_HAS_CPU_FINALIZE_INIT
 	bool
 
+# The architecture has a per-task state that includes the mm's PASID
+config ARCH_HAS_CPU_PASID
+	bool
+	select IOMMU_MM_DATA
+
 # Select if arch init_task must go in the __init_task_data section
 config ARCH_TASK_STRUCT_ON_STACK
 	bool
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 3762f41bb092..68a2ec36a46e 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -71,6 +71,7 @@ config X86
 	select ARCH_HAS_CACHE_LINE_SIZE
 	select ARCH_HAS_CPU_CACHE_INVALIDATE_MEMREGION
 	select ARCH_HAS_CPU_FINALIZE_INIT
+	select ARCH_HAS_CPU_PASID		if IOMMU_SVA
 	select ARCH_HAS_CURRENT_STACK_POINTER
 	select ARCH_HAS_DEBUG_VIRTUAL
 	select ARCH_HAS_DEBUG_VM_PGTABLE	if !X86_PAE
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index c876f1d36a81..2b62dbb3396a 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -565,7 +565,7 @@ static bool fixup_iopl_exception(struct pt_regs *regs)
  */
 static bool try_fixup_enqcmd_gp(void)
 {
-#ifdef CONFIG_IOMMU_SVA
+#ifdef CONFIG_ARCH_HAS_CPU_PASID
 	u32 pasid;
 
 	/*
diff --git a/drivers/iommu/Kconfig b/drivers/iommu/Kconfig
index 7673bb82945b..9a29d742617e 100644
--- a/drivers/iommu/Kconfig
+++ b/drivers/iommu/Kconfig
@@ -160,6 +160,7 @@ config IOMMU_DMA
 
 # Shared Virtual Addressing
 config IOMMU_SVA
+	select IOMMU_MM_DATA
 	bool
 
 config FSL_PAMU
diff --git a/include/linux/iommu.h b/include/linux/iommu.h
index c7394b39599c..cd3f398095bf 100644
--- a/include/linux/iommu.h
+++ b/include/linux/iommu.h
@@ -1337,7 +1337,7 @@ static inline bool tegra_dev_iommu_get_stream_id(struct device *dev, u32 *stream
 	return false;
 }
 
-#ifdef CONFIG_IOMMU_SVA
+#ifdef CONFIG_IOMMU_MM_DATA
 static inline void mm_pasid_init(struct mm_struct *mm)
 {
 	mm->pasid = IOMMU_PASID_INVALID;
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 957ce38768b2..41f248608dd9 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -938,7 +938,7 @@ struct mm_struct {
 #endif
 		struct work_struct async_put_work;
 
-#ifdef CONFIG_IOMMU_SVA
+#ifdef CONFIG_IOMMU_MM_DATA
 		u32 pasid;
 #endif
 #ifdef CONFIG_KSM
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 292c31697248..70888a36677b 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -954,7 +954,7 @@ struct task_struct {
 	/* Recursion prevention for eventfd_signal() */
 	unsigned			in_eventfd:1;
 #endif
-#ifdef CONFIG_IOMMU_SVA
+#ifdef CONFIG_ARCH_HAS_CPU_PASID
 	unsigned			pasid_activated:1;
 #endif
 #ifdef	CONFIG_CPU_SUP_INTEL
diff --git a/kernel/fork.c b/kernel/fork.c
index 10917c3e1f03..43fd9bc1a522 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1179,7 +1179,7 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node)
 	tsk->use_memdelay = 0;
 #endif
 
-#ifdef CONFIG_IOMMU_SVA
+#ifdef CONFIG_ARCH_HAS_CPU_PASID
 	tsk->pasid_activated = 0;
 #endif
 
diff --git a/mm/Kconfig b/mm/Kconfig
index 89971a894b60..0143f4d905c9 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -1270,6 +1270,9 @@ config LOCK_MM_AND_FIND_VMA
 	bool
 	depends on !STACK_GROWSUP
 
+config IOMMU_MM_DATA
+	bool
+
 source "mm/damon/Kconfig"
 
 endmenu
diff --git a/mm/init-mm.c b/mm/init-mm.c
index cfd367822cdd..c52dc2740a3d 100644
--- a/mm/init-mm.c
+++ b/mm/init-mm.c
@@ -44,7 +44,7 @@ struct mm_struct init_mm = {
 #endif
 	.user_ns	= &init_user_ns,
 	.cpu_bitmap	= CPU_BITS_NONE,
-#ifdef CONFIG_IOMMU_SVA
+#ifdef CONFIG_IOMMU_MM_DATA
 	.pasid		= IOMMU_PASID_INVALID,
 #endif
 	INIT_MM_CONTEXT(init_mm)
-- 
cgit v1.2.3


From 2396046d75d3c0b2cfead852a77efd023f8539dc Mon Sep 17 00:00:00 2001
From: Tina Zhang <tina.zhang@intel.com>
Date: Fri, 27 Oct 2023 08:05:22 +0800
Subject: iommu: Add mm_get_enqcmd_pasid() helper function

mm_get_enqcmd_pasid() should be used by architecture code and closely
related to learn the PASID value that the x86 ENQCMD operation should
use for the mm.

For the moment SMMUv3 uses this without any connection to ENQCMD, it
will be cleaned up similar to how the prior patch made VT-d use the
PASID argument of set_dev_pasid().

The motivation is to replace mm->pasid with an iommu private data
structure that is introduced in a later patch.

Reviewed-by: Lu Baolu <baolu.lu@linux.intel.com>
Reviewed-by: Jason Gunthorpe <jgg@nvidia.com>
Tested-by: Nicolin Chen <nicolinc@nvidia.com>
Signed-off-by: Tina Zhang <tina.zhang@intel.com>
Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
Link: https://lore.kernel.org/r/20231027000525.1278806-4-tina.zhang@intel.com
Signed-off-by: Joerg Roedel <jroedel@suse.de>
---
 arch/x86/kernel/traps.c                         |  2 +-
 drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c | 23 +++++++++++++++--------
 drivers/iommu/iommu-sva.c                       |  2 +-
 include/linux/iommu.h                           | 12 ++++++++++++
 4 files changed, 29 insertions(+), 10 deletions(-)

(limited to 'include/linux')

diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index 2b62dbb3396a..5944d759afe7 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -591,7 +591,7 @@ static bool try_fixup_enqcmd_gp(void)
 	if (!mm_valid_pasid(current->mm))
 		return false;
 
-	pasid = current->mm->pasid;
+	pasid = mm_get_enqcmd_pasid(current->mm);
 
 	/*
 	 * Did this thread already have its PASID activated?
diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c
index 353248ab18e7..05722121f00e 100644
--- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c
+++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c
@@ -246,7 +246,8 @@ static void arm_smmu_mm_arch_invalidate_secondary_tlbs(struct mmu_notifier *mn,
 						    smmu_domain);
 	}
 
-	arm_smmu_atc_inv_domain(smmu_domain, mm->pasid, start, size);
+	arm_smmu_atc_inv_domain(smmu_domain, mm_get_enqcmd_pasid(mm), start,
+				size);
 }
 
 static void arm_smmu_mm_release(struct mmu_notifier *mn, struct mm_struct *mm)
@@ -264,10 +265,11 @@ static void arm_smmu_mm_release(struct mmu_notifier *mn, struct mm_struct *mm)
 	 * DMA may still be running. Keep the cd valid to avoid C_BAD_CD events,
 	 * but disable translation.
 	 */
-	arm_smmu_update_ctx_desc_devices(smmu_domain, mm->pasid, &quiet_cd);
+	arm_smmu_update_ctx_desc_devices(smmu_domain, mm_get_enqcmd_pasid(mm),
+					 &quiet_cd);
 
 	arm_smmu_tlb_inv_asid(smmu_domain->smmu, smmu_mn->cd->asid);
-	arm_smmu_atc_inv_domain(smmu_domain, mm->pasid, 0, 0);
+	arm_smmu_atc_inv_domain(smmu_domain, mm_get_enqcmd_pasid(mm), 0, 0);
 
 	smmu_mn->cleared = true;
 	mutex_unlock(&sva_lock);
@@ -325,10 +327,13 @@ arm_smmu_mmu_notifier_get(struct arm_smmu_domain *smmu_domain,
 
 	spin_lock_irqsave(&smmu_domain->devices_lock, flags);
 	list_for_each_entry(master, &smmu_domain->devices, domain_head) {
-		ret = arm_smmu_write_ctx_desc(master, mm->pasid, cd);
+		ret = arm_smmu_write_ctx_desc(master, mm_get_enqcmd_pasid(mm),
+					      cd);
 		if (ret) {
-			list_for_each_entry_from_reverse(master, &smmu_domain->devices, domain_head)
-				arm_smmu_write_ctx_desc(master, mm->pasid, NULL);
+			list_for_each_entry_from_reverse(
+				master, &smmu_domain->devices, domain_head)
+				arm_smmu_write_ctx_desc(
+					master, mm_get_enqcmd_pasid(mm), NULL);
 			break;
 		}
 	}
@@ -358,7 +363,8 @@ static void arm_smmu_mmu_notifier_put(struct arm_smmu_mmu_notifier *smmu_mn)
 
 	list_del(&smmu_mn->list);
 
-	arm_smmu_update_ctx_desc_devices(smmu_domain, mm->pasid, NULL);
+	arm_smmu_update_ctx_desc_devices(smmu_domain, mm_get_enqcmd_pasid(mm),
+					 NULL);
 
 	/*
 	 * If we went through clear(), we've already invalidated, and no
@@ -366,7 +372,8 @@ static void arm_smmu_mmu_notifier_put(struct arm_smmu_mmu_notifier *smmu_mn)
 	 */
 	if (!smmu_mn->cleared) {
 		arm_smmu_tlb_inv_asid(smmu_domain->smmu, cd->asid);
-		arm_smmu_atc_inv_domain(smmu_domain, mm->pasid, 0, 0);
+		arm_smmu_atc_inv_domain(smmu_domain, mm_get_enqcmd_pasid(mm), 0,
+					0);
 	}
 
 	/* Frees smmu_mn */
diff --git a/drivers/iommu/iommu-sva.c b/drivers/iommu/iommu-sva.c
index b78671a8a914..4a2f5699747f 100644
--- a/drivers/iommu/iommu-sva.c
+++ b/drivers/iommu/iommu-sva.c
@@ -141,7 +141,7 @@ u32 iommu_sva_get_pasid(struct iommu_sva *handle)
 {
 	struct iommu_domain *domain = handle->domain;
 
-	return domain->mm->pasid;
+	return mm_get_enqcmd_pasid(domain->mm);
 }
 EXPORT_SYMBOL_GPL(iommu_sva_get_pasid);
 
diff --git a/include/linux/iommu.h b/include/linux/iommu.h
index cd3f398095bf..4fb239c6ca8d 100644
--- a/include/linux/iommu.h
+++ b/include/linux/iommu.h
@@ -1346,6 +1346,12 @@ static inline bool mm_valid_pasid(struct mm_struct *mm)
 {
 	return mm->pasid != IOMMU_PASID_INVALID;
 }
+
+static inline u32 mm_get_enqcmd_pasid(struct mm_struct *mm)
+{
+	return mm->pasid;
+}
+
 void mm_pasid_drop(struct mm_struct *mm);
 struct iommu_sva *iommu_sva_bind_device(struct device *dev,
 					struct mm_struct *mm);
@@ -1368,6 +1374,12 @@ static inline u32 iommu_sva_get_pasid(struct iommu_sva *handle)
 }
 static inline void mm_pasid_init(struct mm_struct *mm) {}
 static inline bool mm_valid_pasid(struct mm_struct *mm) { return false; }
+
+static inline u32 mm_get_enqcmd_pasid(struct mm_struct *mm)
+{
+	return IOMMU_PASID_INVALID;
+}
+
 static inline void mm_pasid_drop(struct mm_struct *mm) {}
 #endif /* CONFIG_IOMMU_SVA */
 
-- 
cgit v1.2.3


From 541a3e257d48c16b77d19f39ed939ef5832046df Mon Sep 17 00:00:00 2001
From: Tina Zhang <tina.zhang@intel.com>
Date: Fri, 27 Oct 2023 08:05:23 +0800
Subject: mm: Add structure to keep sva information

Introduce iommu_mm_data structure to keep sva information (pasid and the
related sva domains). Add iommu_mm pointer, pointing to an instance of
iommu_mm_data structure, to mm.

Reviewed-by: Vasant Hegde <vasant.hegde@amd.com>
Reviewed-by: Lu Baolu <baolu.lu@linux.intel.com>
Reviewed-by: Jason Gunthorpe <jgg@nvidia.com>
Tested-by: Nicolin Chen <nicolinc@nvidia.com>
Signed-off-by: Tina Zhang <tina.zhang@intel.com>
Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
Link: https://lore.kernel.org/r/20231027000525.1278806-5-tina.zhang@intel.com
Signed-off-by: Joerg Roedel <jroedel@suse.de>
---
 include/linux/iommu.h    | 5 +++++
 include/linux/mm_types.h | 2 ++
 2 files changed, 7 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/iommu.h b/include/linux/iommu.h
index 4fb239c6ca8d..f7b1b469e98d 100644
--- a/include/linux/iommu.h
+++ b/include/linux/iommu.h
@@ -812,6 +812,11 @@ struct iommu_sva {
 	struct iommu_domain		*domain;
 };
 
+struct iommu_mm_data {
+	u32			pasid;
+	struct list_head	sva_domains;
+};
+
 int iommu_fwspec_init(struct device *dev, struct fwnode_handle *iommu_fwnode,
 		      const struct iommu_ops *ops);
 void iommu_fwspec_free(struct device *dev);
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 41f248608dd9..0b4314fab478 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -727,6 +727,7 @@ struct mm_cid {
 #endif
 
 struct kioctx_table;
+struct iommu_mm_data;
 struct mm_struct {
 	struct {
 		/*
@@ -940,6 +941,7 @@ struct mm_struct {
 
 #ifdef CONFIG_IOMMU_MM_DATA
 		u32 pasid;
+		struct iommu_mm_data *iommu_mm;
 #endif
 #ifdef CONFIG_KSM
 		/*
-- 
cgit v1.2.3


From 092edaddb660376648acb97678570ed5d8299768 Mon Sep 17 00:00:00 2001
From: Tina Zhang <tina.zhang@intel.com>
Date: Fri, 27 Oct 2023 08:05:24 +0800
Subject: iommu: Support mm PASID 1:n with sva domains

Each mm bound to devices gets a PASID and corresponding sva domains
allocated in iommu_sva_bind_device(), which are referenced by iommu_mm
field of the mm. The PASID is released in __mmdrop(), while a sva domain
is released when no one is using it (the reference count is decremented
in iommu_sva_unbind_device()). However, although sva domains and their
PASID are separate objects such that their own life cycles could be
handled independently, an enqcmd use case may require releasing the
PASID in releasing the mm (i.e., once a PASID is allocated for a mm, it
will be permanently used by the mm and won't be released until the end
of mm) and only allows to drop the PASID after the sva domains are
released. To this end, mmgrab() is called in iommu_sva_domain_alloc() to
increment the mm reference count and mmdrop() is invoked in
iommu_domain_free() to decrement the mm reference count.

Since the required info of PASID and sva domains is kept in struct
iommu_mm_data of a mm, use mm->iommu_mm field instead of the old pasid
field in mm struct. The sva domain list is protected by iommu_sva_lock.

Besides, this patch removes mm_pasid_init(), as with the introduced
iommu_mm structure, initializing mm pasid in mm_init() is unnecessary.

Reviewed-by: Lu Baolu <baolu.lu@linux.intel.com>
Reviewed-by: Vasant Hegde <vasant.hegde@amd.com>
Reviewed-by: Jason Gunthorpe <jgg@nvidia.com>
Tested-by: Nicolin Chen <nicolinc@nvidia.com>
Signed-off-by: Tina Zhang <tina.zhang@intel.com>
Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
Link: https://lore.kernel.org/r/20231027000525.1278806-6-tina.zhang@intel.com
Signed-off-by: Joerg Roedel <jroedel@suse.de>
---
 drivers/iommu/iommu-sva.c | 92 +++++++++++++++++++++++++++--------------------
 include/linux/iommu.h     | 23 ++++++++++--
 2 files changed, 74 insertions(+), 41 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/iommu/iommu-sva.c b/drivers/iommu/iommu-sva.c
index 4a2f5699747f..5175e8d85247 100644
--- a/drivers/iommu/iommu-sva.c
+++ b/drivers/iommu/iommu-sva.c
@@ -12,32 +12,42 @@
 static DEFINE_MUTEX(iommu_sva_lock);
 
 /* Allocate a PASID for the mm within range (inclusive) */
-static int iommu_sva_alloc_pasid(struct mm_struct *mm, struct device *dev)
+static struct iommu_mm_data *iommu_alloc_mm_data(struct mm_struct *mm, struct device *dev)
 {
+	struct iommu_mm_data *iommu_mm;
 	ioasid_t pasid;
-	int ret = 0;
+
+	lockdep_assert_held(&iommu_sva_lock);
 
 	if (!arch_pgtable_dma_compat(mm))
-		return -EBUSY;
+		return ERR_PTR(-EBUSY);
 
-	mutex_lock(&iommu_sva_lock);
+	iommu_mm = mm->iommu_mm;
 	/* Is a PASID already associated with this mm? */
-	if (mm_valid_pasid(mm)) {
-		if (mm->pasid >= dev->iommu->max_pasids)
-			ret = -EOVERFLOW;
-		goto out;
+	if (iommu_mm) {
+		if (iommu_mm->pasid >= dev->iommu->max_pasids)
+			return ERR_PTR(-EOVERFLOW);
+		return iommu_mm;
 	}
 
+	iommu_mm = kzalloc(sizeof(struct iommu_mm_data), GFP_KERNEL);
+	if (!iommu_mm)
+		return ERR_PTR(-ENOMEM);
+
 	pasid = iommu_alloc_global_pasid(dev);
 	if (pasid == IOMMU_PASID_INVALID) {
-		ret = -ENOSPC;
-		goto out;
+		kfree(iommu_mm);
+		return ERR_PTR(-ENOSPC);
 	}
-	mm->pasid = pasid;
-	ret = 0;
-out:
-	mutex_unlock(&iommu_sva_lock);
-	return ret;
+	iommu_mm->pasid = pasid;
+	INIT_LIST_HEAD(&iommu_mm->sva_domains);
+	/*
+	 * Make sure the write to mm->iommu_mm is not reordered in front of
+	 * initialization to iommu_mm fields. If it does, readers may see a
+	 * valid iommu_mm with uninitialized values.
+	 */
+	smp_store_release(&mm->iommu_mm, iommu_mm);
+	return iommu_mm;
 }
 
 /**
@@ -58,31 +68,33 @@ out:
  */
 struct iommu_sva *iommu_sva_bind_device(struct device *dev, struct mm_struct *mm)
 {
+	struct iommu_mm_data *iommu_mm;
 	struct iommu_domain *domain;
 	struct iommu_sva *handle;
 	int ret;
 
+	mutex_lock(&iommu_sva_lock);
+
 	/* Allocate mm->pasid if necessary. */
-	ret = iommu_sva_alloc_pasid(mm, dev);
-	if (ret)
-		return ERR_PTR(ret);
+	iommu_mm = iommu_alloc_mm_data(mm, dev);
+	if (IS_ERR(iommu_mm)) {
+		ret = PTR_ERR(iommu_mm);
+		goto out_unlock;
+	}
 
 	handle = kzalloc(sizeof(*handle), GFP_KERNEL);
-	if (!handle)
-		return ERR_PTR(-ENOMEM);
-
-	mutex_lock(&iommu_sva_lock);
-	/* Search for an existing domain. */
-	domain = iommu_get_domain_for_dev_pasid(dev, mm->pasid,
-						IOMMU_DOMAIN_SVA);
-	if (IS_ERR(domain)) {
-		ret = PTR_ERR(domain);
+	if (!handle) {
+		ret = -ENOMEM;
 		goto out_unlock;
 	}
 
-	if (domain) {
-		domain->users++;
-		goto out;
+	/* Search for an existing domain. */
+	list_for_each_entry(domain, &mm->iommu_mm->sva_domains, next) {
+		ret = iommu_attach_device_pasid(domain, dev, iommu_mm->pasid);
+		if (!ret) {
+			domain->users++;
+			goto out;
+		}
 	}
 
 	/* Allocate a new domain and set it on device pasid. */
@@ -92,23 +104,23 @@ struct iommu_sva *iommu_sva_bind_device(struct device *dev, struct mm_struct *mm
 		goto out_unlock;
 	}
 
-	ret = iommu_attach_device_pasid(domain, dev, mm->pasid);
+	ret = iommu_attach_device_pasid(domain, dev, iommu_mm->pasid);
 	if (ret)
 		goto out_free_domain;
 	domain->users = 1;
+	list_add(&domain->next, &mm->iommu_mm->sva_domains);
+
 out:
 	mutex_unlock(&iommu_sva_lock);
 	handle->dev = dev;
 	handle->domain = domain;
-
 	return handle;
 
 out_free_domain:
 	iommu_domain_free(domain);
+	kfree(handle);
 out_unlock:
 	mutex_unlock(&iommu_sva_lock);
-	kfree(handle);
-
 	return ERR_PTR(ret);
 }
 EXPORT_SYMBOL_GPL(iommu_sva_bind_device);
@@ -124,12 +136,13 @@ EXPORT_SYMBOL_GPL(iommu_sva_bind_device);
 void iommu_sva_unbind_device(struct iommu_sva *handle)
 {
 	struct iommu_domain *domain = handle->domain;
-	ioasid_t pasid = domain->mm->pasid;
+	struct iommu_mm_data *iommu_mm = domain->mm->iommu_mm;
 	struct device *dev = handle->dev;
 
 	mutex_lock(&iommu_sva_lock);
+	iommu_detach_device_pasid(domain, dev, iommu_mm->pasid);
 	if (--domain->users == 0) {
-		iommu_detach_device_pasid(domain, dev, pasid);
+		list_del(&domain->next);
 		iommu_domain_free(domain);
 	}
 	mutex_unlock(&iommu_sva_lock);
@@ -205,8 +218,11 @@ out_put_mm:
 
 void mm_pasid_drop(struct mm_struct *mm)
 {
-	if (likely(!mm_valid_pasid(mm)))
+	struct iommu_mm_data *iommu_mm = mm->iommu_mm;
+
+	if (!iommu_mm)
 		return;
 
-	iommu_free_global_pasid(mm->pasid);
+	iommu_free_global_pasid(iommu_mm->pasid);
+	kfree(iommu_mm);
 }
diff --git a/include/linux/iommu.h b/include/linux/iommu.h
index f7b1b469e98d..c6bbbe0901d0 100644
--- a/include/linux/iommu.h
+++ b/include/linux/iommu.h
@@ -121,6 +121,11 @@ struct iommu_domain {
 		struct {	/* IOMMU_DOMAIN_SVA */
 			struct mm_struct *mm;
 			int users;
+			/*
+			 * Next iommu_domain in mm->iommu_mm->sva-domains list
+			 * protected by iommu_sva_lock.
+			 */
+			struct list_head next;
 		};
 	};
 };
@@ -1345,16 +1350,28 @@ static inline bool tegra_dev_iommu_get_stream_id(struct device *dev, u32 *stream
 #ifdef CONFIG_IOMMU_MM_DATA
 static inline void mm_pasid_init(struct mm_struct *mm)
 {
-	mm->pasid = IOMMU_PASID_INVALID;
+	/*
+	 * During dup_mm(), a new mm will be memcpy'd from an old one and that makes
+	 * the new mm and the old one point to a same iommu_mm instance. When either
+	 * one of the two mms gets released, the iommu_mm instance is freed, leaving
+	 * the other mm running into a use-after-free/double-free problem. To avoid
+	 * the problem, zeroing the iommu_mm pointer of a new mm is needed here.
+	 */
+	mm->iommu_mm = NULL;
 }
+
 static inline bool mm_valid_pasid(struct mm_struct *mm)
 {
-	return mm->pasid != IOMMU_PASID_INVALID;
+	return READ_ONCE(mm->iommu_mm);
 }
 
 static inline u32 mm_get_enqcmd_pasid(struct mm_struct *mm)
 {
-	return mm->pasid;
+	struct iommu_mm_data *iommu_mm = READ_ONCE(mm->iommu_mm);
+
+	if (!iommu_mm)
+		return IOMMU_PASID_INVALID;
+	return iommu_mm->pasid;
 }
 
 void mm_pasid_drop(struct mm_struct *mm);
-- 
cgit v1.2.3


From 1fa05c932dc71c474da38e4fd0456131128f8486 Mon Sep 17 00:00:00 2001
From: Tina Zhang <tina.zhang@intel.com>
Date: Fri, 27 Oct 2023 08:05:25 +0800
Subject: mm: Deprecate pasid field

Drop the pasid field, as all the information needed for sva domain
management has been moved to the newly added iommu_mm field.

Reviewed-by: Lu Baolu <baolu.lu@linux.intel.com>
Reviewed-by: Vasant Hegde <vasant.hegde@amd.com>
Reviewed-by: Jason Gunthorpe <jgg@nvidia.com>
Signed-off-by: Tina Zhang <tina.zhang@intel.com>
Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
Link: https://lore.kernel.org/r/20231027000525.1278806-7-tina.zhang@intel.com
Signed-off-by: Joerg Roedel <jroedel@suse.de>
---
 include/linux/mm_types.h | 1 -
 mm/init-mm.c             | 3 ---
 2 files changed, 4 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 0b4314fab478..ec71c91e210b 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -940,7 +940,6 @@ struct mm_struct {
 		struct work_struct async_put_work;
 
 #ifdef CONFIG_IOMMU_MM_DATA
-		u32 pasid;
 		struct iommu_mm_data *iommu_mm;
 #endif
 #ifdef CONFIG_KSM
diff --git a/mm/init-mm.c b/mm/init-mm.c
index c52dc2740a3d..24c809379274 100644
--- a/mm/init-mm.c
+++ b/mm/init-mm.c
@@ -44,9 +44,6 @@ struct mm_struct init_mm = {
 #endif
 	.user_ns	= &init_user_ns,
 	.cpu_bitmap	= CPU_BITS_NONE,
-#ifdef CONFIG_IOMMU_MM_DATA
-	.pasid		= IOMMU_PASID_INVALID,
-#endif
 	INIT_MM_CONTEXT(init_mm)
 };
 
-- 
cgit v1.2.3


From 4720287c7bf76e59d19d4dfbdc3f54eeea6fd46b Mon Sep 17 00:00:00 2001
From: Jason Gunthorpe <jgg@nvidia.com>
Date: Thu, 7 Dec 2023 14:03:08 -0400
Subject: iommu: Remove struct iommu_ops *iommu from arch_setup_dma_ops()

This is not being used to pass ops, it is just a way to tell if an
iommu driver was probed. These days this can be detected directly via
device_iommu_mapped(). Call device_iommu_mapped() in the two places that
need to check it and remove the iommu parameter everywhere.

Reviewed-by: Jerry Snitselaar <jsnitsel@redhat.com>
Reviewed-by: Lu Baolu <baolu.lu@linux.intel.com>
Reviewed-by: Moritz Fischer <mdf@kernel.org>
Acked-by: Christoph Hellwig <hch@lst.de>
Acked-by: Rob Herring <robh@kernel.org>
Tested-by: Hector Martin <marcan@marcan.st>
Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
Link: https://lore.kernel.org/r/1-v2-16e4def25ebb+820-iommu_fwspec_p1_jgg@nvidia.com
Signed-off-by: Joerg Roedel <jroedel@suse.de>
---
 arch/arc/mm/dma.c               |  2 +-
 arch/arm/mm/dma-mapping-nommu.c |  2 +-
 arch/arm/mm/dma-mapping.c       | 10 +++++-----
 arch/arm64/mm/dma-mapping.c     |  4 ++--
 arch/mips/mm/dma-noncoherent.c  |  2 +-
 arch/riscv/mm/dma-noncoherent.c |  2 +-
 drivers/acpi/scan.c             |  3 +--
 drivers/hv/hv_common.c          |  2 +-
 drivers/of/device.c             |  2 +-
 include/linux/dma-map-ops.h     |  4 ++--
 10 files changed, 16 insertions(+), 17 deletions(-)

(limited to 'include/linux')

diff --git a/arch/arc/mm/dma.c b/arch/arc/mm/dma.c
index 2a7fbbb83b70..197707bc7658 100644
--- a/arch/arc/mm/dma.c
+++ b/arch/arc/mm/dma.c
@@ -91,7 +91,7 @@ void arch_sync_dma_for_cpu(phys_addr_t paddr, size_t size,
  * Plug in direct dma map ops.
  */
 void arch_setup_dma_ops(struct device *dev, u64 dma_base, u64 size,
-			const struct iommu_ops *iommu, bool coherent)
+			bool coherent)
 {
 	/*
 	 * IOC hardware snoops all DMA traffic keeping the caches consistent
diff --git a/arch/arm/mm/dma-mapping-nommu.c b/arch/arm/mm/dma-mapping-nommu.c
index cfd9c933d2f0..b94850b57995 100644
--- a/arch/arm/mm/dma-mapping-nommu.c
+++ b/arch/arm/mm/dma-mapping-nommu.c
@@ -34,7 +34,7 @@ void arch_sync_dma_for_cpu(phys_addr_t paddr, size_t size,
 }
 
 void arch_setup_dma_ops(struct device *dev, u64 dma_base, u64 size,
-			const struct iommu_ops *iommu, bool coherent)
+			bool coherent)
 {
 	if (IS_ENABLED(CONFIG_CPU_V7M)) {
 		/*
diff --git a/arch/arm/mm/dma-mapping.c b/arch/arm/mm/dma-mapping.c
index 5409225b4abc..6c359a3af8d9 100644
--- a/arch/arm/mm/dma-mapping.c
+++ b/arch/arm/mm/dma-mapping.c
@@ -1713,7 +1713,7 @@ void arm_iommu_detach_device(struct device *dev)
 EXPORT_SYMBOL_GPL(arm_iommu_detach_device);
 
 static void arm_setup_iommu_dma_ops(struct device *dev, u64 dma_base, u64 size,
-				    const struct iommu_ops *iommu, bool coherent)
+				    bool coherent)
 {
 	struct dma_iommu_mapping *mapping;
 
@@ -1748,7 +1748,7 @@ static void arm_teardown_iommu_dma_ops(struct device *dev)
 #else
 
 static void arm_setup_iommu_dma_ops(struct device *dev, u64 dma_base, u64 size,
-				    const struct iommu_ops *iommu, bool coherent)
+				    bool coherent)
 {
 }
 
@@ -1757,7 +1757,7 @@ static void arm_teardown_iommu_dma_ops(struct device *dev) { }
 #endif	/* CONFIG_ARM_DMA_USE_IOMMU */
 
 void arch_setup_dma_ops(struct device *dev, u64 dma_base, u64 size,
-			const struct iommu_ops *iommu, bool coherent)
+			bool coherent)
 {
 	/*
 	 * Due to legacy code that sets the ->dma_coherent flag from a bus
@@ -1776,8 +1776,8 @@ void arch_setup_dma_ops(struct device *dev, u64 dma_base, u64 size,
 	if (dev->dma_ops)
 		return;
 
-	if (iommu)
-		arm_setup_iommu_dma_ops(dev, dma_base, size, iommu, coherent);
+	if (device_iommu_mapped(dev))
+		arm_setup_iommu_dma_ops(dev, dma_base, size, coherent);
 
 	xen_setup_dma_ops(dev);
 	dev->archdata.dma_ops_setup = true;
diff --git a/arch/arm64/mm/dma-mapping.c b/arch/arm64/mm/dma-mapping.c
index 3cb101e8cb29..61886e43e3a1 100644
--- a/arch/arm64/mm/dma-mapping.c
+++ b/arch/arm64/mm/dma-mapping.c
@@ -47,7 +47,7 @@ void arch_teardown_dma_ops(struct device *dev)
 #endif
 
 void arch_setup_dma_ops(struct device *dev, u64 dma_base, u64 size,
-			const struct iommu_ops *iommu, bool coherent)
+			bool coherent)
 {
 	int cls = cache_line_size_of_cpu();
 
@@ -58,7 +58,7 @@ void arch_setup_dma_ops(struct device *dev, u64 dma_base, u64 size,
 		   ARCH_DMA_MINALIGN, cls);
 
 	dev->dma_coherent = coherent;
-	if (iommu)
+	if (device_iommu_mapped(dev))
 		iommu_setup_dma_ops(dev, dma_base, dma_base + size - 1);
 
 	xen_setup_dma_ops(dev);
diff --git a/arch/mips/mm/dma-noncoherent.c b/arch/mips/mm/dma-noncoherent.c
index 3c4fc97b9f39..0f3cec663a12 100644
--- a/arch/mips/mm/dma-noncoherent.c
+++ b/arch/mips/mm/dma-noncoherent.c
@@ -138,7 +138,7 @@ void arch_sync_dma_for_cpu(phys_addr_t paddr, size_t size,
 
 #ifdef CONFIG_ARCH_HAS_SETUP_DMA_OPS
 void arch_setup_dma_ops(struct device *dev, u64 dma_base, u64 size,
-		const struct iommu_ops *iommu, bool coherent)
+		bool coherent)
 {
 	dev->dma_coherent = coherent;
 }
diff --git a/arch/riscv/mm/dma-noncoherent.c b/arch/riscv/mm/dma-noncoherent.c
index 4e4e469b8dd6..843107f834b2 100644
--- a/arch/riscv/mm/dma-noncoherent.c
+++ b/arch/riscv/mm/dma-noncoherent.c
@@ -129,7 +129,7 @@ void arch_dma_prep_coherent(struct page *page, size_t size)
 }
 
 void arch_setup_dma_ops(struct device *dev, u64 dma_base, u64 size,
-		const struct iommu_ops *iommu, bool coherent)
+			bool coherent)
 {
 	WARN_TAINT(!coherent && riscv_cbom_block_size > ARCH_DMA_MINALIGN,
 		   TAINT_CPU_OUT_OF_SPEC,
diff --git a/drivers/acpi/scan.c b/drivers/acpi/scan.c
index 02bb2cce423f..444a0b3c72f2 100644
--- a/drivers/acpi/scan.c
+++ b/drivers/acpi/scan.c
@@ -1641,8 +1641,7 @@ int acpi_dma_configure_id(struct device *dev, enum dev_dma_attr attr,
 	if (PTR_ERR(iommu) == -EPROBE_DEFER)
 		return -EPROBE_DEFER;
 
-	arch_setup_dma_ops(dev, 0, U64_MAX,
-				iommu, attr == DEV_DMA_COHERENT);
+	arch_setup_dma_ops(dev, 0, U64_MAX, attr == DEV_DMA_COHERENT);
 
 	return 0;
 }
diff --git a/drivers/hv/hv_common.c b/drivers/hv/hv_common.c
index 4372f5d146ab..0285a74363b3 100644
--- a/drivers/hv/hv_common.c
+++ b/drivers/hv/hv_common.c
@@ -488,7 +488,7 @@ void hv_setup_dma_ops(struct device *dev, bool coherent)
 	 * Hyper-V does not offer a vIOMMU in the guest
 	 * VM, so pass 0/NULL for the IOMMU settings
 	 */
-	arch_setup_dma_ops(dev, 0, 0, NULL, coherent);
+	arch_setup_dma_ops(dev, 0, 0, coherent);
 }
 EXPORT_SYMBOL_GPL(hv_setup_dma_ops);
 
diff --git a/drivers/of/device.c b/drivers/of/device.c
index 1ca42ad9dd15..65c71be71a8d 100644
--- a/drivers/of/device.c
+++ b/drivers/of/device.c
@@ -193,7 +193,7 @@ int of_dma_configure_id(struct device *dev, struct device_node *np,
 	dev_dbg(dev, "device is%sbehind an iommu\n",
 		iommu ? " " : " not ");
 
-	arch_setup_dma_ops(dev, dma_start, size, iommu, coherent);
+	arch_setup_dma_ops(dev, dma_start, size, coherent);
 
 	if (!iommu)
 		of_dma_set_restricted_buffer(dev, np);
diff --git a/include/linux/dma-map-ops.h b/include/linux/dma-map-ops.h
index a52e508d1869..e9cc317e9d7d 100644
--- a/include/linux/dma-map-ops.h
+++ b/include/linux/dma-map-ops.h
@@ -427,10 +427,10 @@ bool arch_dma_unmap_sg_direct(struct device *dev, struct scatterlist *sg,
 
 #ifdef CONFIG_ARCH_HAS_SETUP_DMA_OPS
 void arch_setup_dma_ops(struct device *dev, u64 dma_base, u64 size,
-		const struct iommu_ops *iommu, bool coherent);
+		bool coherent);
 #else
 static inline void arch_setup_dma_ops(struct device *dev, u64 dma_base,
-		u64 size, const struct iommu_ops *iommu, bool coherent)
+		u64 size, bool coherent)
 {
 }
 #endif /* CONFIG_ARCH_HAS_SETUP_DMA_OPS */
-- 
cgit v1.2.3


From 6ff6e184f1f4d4993d45ca3f934c8288890965fe Mon Sep 17 00:00:00 2001
From: Jason Gunthorpe <jgg@nvidia.com>
Date: Thu, 7 Dec 2023 14:03:09 -0400
Subject: iommmu/of: Do not return struct iommu_ops from of_iommu_configure()

Nothing needs this pointer. Return a normal error code with the usual
IOMMU semantic that ENODEV means 'there is no IOMMU driver'.

Reviewed-by: Jerry Snitselaar <jsnitsel@redhat.com>
Reviewed-by: Lu Baolu <baolu.lu@linux.intel.com>
Acked-by: Rob Herring <robh@kernel.org>
Tested-by: Hector Martin <marcan@marcan.st>
Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
Link: https://lore.kernel.org/r/2-v2-16e4def25ebb+820-iommu_fwspec_p1_jgg@nvidia.com
Signed-off-by: Joerg Roedel <jroedel@suse.de>
---
 drivers/iommu/of_iommu.c | 31 +++++++++++++++++++------------
 drivers/of/device.c      | 22 +++++++++++++++-------
 include/linux/of_iommu.h | 13 ++++++-------
 3 files changed, 40 insertions(+), 26 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/iommu/of_iommu.c b/drivers/iommu/of_iommu.c
index 5ecca53847d3..c6510d7e7b24 100644
--- a/drivers/iommu/of_iommu.c
+++ b/drivers/iommu/of_iommu.c
@@ -107,16 +107,22 @@ static int of_iommu_configure_device(struct device_node *master_np,
 		      of_iommu_configure_dev(master_np, dev);
 }
 
-const struct iommu_ops *of_iommu_configure(struct device *dev,
-					   struct device_node *master_np,
-					   const u32 *id)
+/*
+ * Returns:
+ *  0 on success, an iommu was configured
+ *  -ENODEV if the device does not have any IOMMU
+ *  -EPROBEDEFER if probing should be tried again
+ *  -errno fatal errors
+ */
+int of_iommu_configure(struct device *dev, struct device_node *master_np,
+		       const u32 *id)
 {
 	const struct iommu_ops *ops = NULL;
 	struct iommu_fwspec *fwspec;
 	int err = NO_IOMMU;
 
 	if (!master_np)
-		return NULL;
+		return -ENODEV;
 
 	/* Serialise to make dev->iommu stable under our potential fwspec */
 	mutex_lock(&iommu_probe_device_lock);
@@ -124,7 +130,7 @@ const struct iommu_ops *of_iommu_configure(struct device *dev,
 	if (fwspec) {
 		if (fwspec->ops) {
 			mutex_unlock(&iommu_probe_device_lock);
-			return fwspec->ops;
+			return 0;
 		}
 		/* In the deferred case, start again from scratch */
 		iommu_fwspec_free(dev);
@@ -169,14 +175,15 @@ const struct iommu_ops *of_iommu_configure(struct device *dev,
 		err = iommu_probe_device(dev);
 
 	/* Ignore all other errors apart from EPROBE_DEFER */
-	if (err == -EPROBE_DEFER) {
-		ops = ERR_PTR(err);
-	} else if (err < 0) {
-		dev_dbg(dev, "Adding to IOMMU failed: %d\n", err);
-		ops = NULL;
+	if (err < 0) {
+		if (err == -EPROBE_DEFER)
+			return err;
+		dev_dbg(dev, "Adding to IOMMU failed: %pe\n", ERR_PTR(err));
+		return err;
 	}
-
-	return ops;
+	if (!ops)
+		return -ENODEV;
+	return 0;
 }
 
 static enum iommu_resv_type __maybe_unused
diff --git a/drivers/of/device.c b/drivers/of/device.c
index 65c71be71a8d..873d933e8e6d 100644
--- a/drivers/of/device.c
+++ b/drivers/of/device.c
@@ -93,12 +93,12 @@ of_dma_set_restricted_buffer(struct device *dev, struct device_node *np)
 int of_dma_configure_id(struct device *dev, struct device_node *np,
 			bool force_dma, const u32 *id)
 {
-	const struct iommu_ops *iommu;
 	const struct bus_dma_region *map = NULL;
 	struct device_node *bus_np;
 	u64 dma_start = 0;
 	u64 mask, end, size = 0;
 	bool coherent;
+	int iommu_ret;
 	int ret;
 
 	if (np == dev->of_node)
@@ -181,21 +181,29 @@ int of_dma_configure_id(struct device *dev, struct device_node *np,
 	dev_dbg(dev, "device is%sdma coherent\n",
 		coherent ? " " : " not ");
 
-	iommu = of_iommu_configure(dev, np, id);
-	if (PTR_ERR(iommu) == -EPROBE_DEFER) {
+	iommu_ret = of_iommu_configure(dev, np, id);
+	if (iommu_ret == -EPROBE_DEFER) {
 		/* Don't touch range map if it wasn't set from a valid dma-ranges */
 		if (!ret)
 			dev->dma_range_map = NULL;
 		kfree(map);
 		return -EPROBE_DEFER;
-	}
+	} else if (iommu_ret == -ENODEV) {
+		dev_dbg(dev, "device is not behind an iommu\n");
+	} else if (iommu_ret) {
+		dev_err(dev, "iommu configuration for device failed with %pe\n",
+			ERR_PTR(iommu_ret));
 
-	dev_dbg(dev, "device is%sbehind an iommu\n",
-		iommu ? " " : " not ");
+		/*
+		 * Historically this routine doesn't fail driver probing
+		 * due to errors in of_iommu_configure()
+		 */
+	} else
+		dev_dbg(dev, "device is behind an iommu\n");
 
 	arch_setup_dma_ops(dev, dma_start, size, coherent);
 
-	if (!iommu)
+	if (iommu_ret)
 		of_dma_set_restricted_buffer(dev, np);
 
 	return 0;
diff --git a/include/linux/of_iommu.h b/include/linux/of_iommu.h
index 9a5e6b410dd2..e61cbbe12dac 100644
--- a/include/linux/of_iommu.h
+++ b/include/linux/of_iommu.h
@@ -8,20 +8,19 @@ struct iommu_ops;
 
 #ifdef CONFIG_OF_IOMMU
 
-extern const struct iommu_ops *of_iommu_configure(struct device *dev,
-					struct device_node *master_np,
-					const u32 *id);
+extern int of_iommu_configure(struct device *dev, struct device_node *master_np,
+			      const u32 *id);
 
 extern void of_iommu_get_resv_regions(struct device *dev,
 				      struct list_head *list);
 
 #else
 
-static inline const struct iommu_ops *of_iommu_configure(struct device *dev,
-					 struct device_node *master_np,
-					 const u32 *id)
+static inline int of_iommu_configure(struct device *dev,
+				     struct device_node *master_np,
+				     const u32 *id)
 {
-	return NULL;
+	return -ENODEV;
 }
 
 static inline void of_iommu_get_resv_regions(struct device *dev,
-- 
cgit v1.2.3


From eda1a94caf6b05482bbf57dc244e7a31a9dba77c Mon Sep 17 00:00:00 2001
From: Jason Gunthorpe <jgg@nvidia.com>
Date: Thu, 7 Dec 2023 14:03:12 -0400
Subject: iommu: Mark dev_iommu_priv_set() with a lockdep

A perfect driver would only call dev_iommu_priv_set() from its probe
callback. We've made it functionally correct to call it from the of_xlate
by adding a lock around that call.

lockdep assert that iommu_probe_device_lock is held to discourage misuse.

Exclude PPC kernels with CONFIG_FSL_PAMU turned on because FSL_PAMU uses a
global static for its priv and abuses priv for its domain.

Remove the pointless stores of NULL, all these are on paths where the core
code will free dev->iommu after the op returns.

Reviewed-by: Lu Baolu <baolu.lu@linux.intel.com>
Reviewed-by: Jerry Snitselaar <jsnitsel@redhat.com>
Tested-by: Hector Martin <marcan@marcan.st>
Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
Link: https://lore.kernel.org/r/5-v2-16e4def25ebb+820-iommu_fwspec_p1_jgg@nvidia.com
Signed-off-by: Joerg Roedel <jroedel@suse.de>
---
 drivers/iommu/amd/iommu.c                   | 2 --
 drivers/iommu/apple-dart.c                  | 1 -
 drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c | 1 -
 drivers/iommu/arm/arm-smmu/arm-smmu.c       | 1 -
 drivers/iommu/intel/iommu.c                 | 2 --
 drivers/iommu/iommu.c                       | 9 +++++++++
 drivers/iommu/omap-iommu.c                  | 1 -
 include/linux/iommu.h                       | 5 +----
 8 files changed, 10 insertions(+), 12 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/iommu/amd/iommu.c b/drivers/iommu/amd/iommu.c
index fcc987f5d4ed..8199c678c2dc 100644
--- a/drivers/iommu/amd/iommu.c
+++ b/drivers/iommu/amd/iommu.c
@@ -551,8 +551,6 @@ static void amd_iommu_uninit_device(struct device *dev)
 	if (dev_data->domain)
 		detach_device(dev);
 
-	dev_iommu_priv_set(dev, NULL);
-
 	/*
 	 * We keep dev_data around for unplugged devices and reuse it when the
 	 * device is re-plugged - not doing so would introduce a ton of races.
diff --git a/drivers/iommu/apple-dart.c b/drivers/iommu/apple-dart.c
index ee05f4824bfa..56cfc33042e0 100644
--- a/drivers/iommu/apple-dart.c
+++ b/drivers/iommu/apple-dart.c
@@ -740,7 +740,6 @@ static void apple_dart_release_device(struct device *dev)
 {
 	struct apple_dart_master_cfg *cfg = dev_iommu_priv_get(dev);
 
-	dev_iommu_priv_set(dev, NULL);
 	kfree(cfg);
 }
 
diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
index fc4317c25b6d..1855d3892b15 100644
--- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
+++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
@@ -2695,7 +2695,6 @@ static struct iommu_device *arm_smmu_probe_device(struct device *dev)
 
 err_free_master:
 	kfree(master);
-	dev_iommu_priv_set(dev, NULL);
 	return ERR_PTR(ret);
 }
 
diff --git a/drivers/iommu/arm/arm-smmu/arm-smmu.c b/drivers/iommu/arm/arm-smmu/arm-smmu.c
index 4d09c0047892..adc7937fd8a3 100644
--- a/drivers/iommu/arm/arm-smmu/arm-smmu.c
+++ b/drivers/iommu/arm/arm-smmu/arm-smmu.c
@@ -1420,7 +1420,6 @@ static void arm_smmu_release_device(struct device *dev)
 
 	arm_smmu_rpm_put(cfg->smmu);
 
-	dev_iommu_priv_set(dev, NULL);
 	kfree(cfg);
 }
 
diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c
index 897159dba47d..511589341074 100644
--- a/drivers/iommu/intel/iommu.c
+++ b/drivers/iommu/intel/iommu.c
@@ -4461,7 +4461,6 @@ static struct iommu_device *intel_iommu_probe_device(struct device *dev)
 		ret = intel_pasid_alloc_table(dev);
 		if (ret) {
 			dev_err(dev, "PASID table allocation failed\n");
-			dev_iommu_priv_set(dev, NULL);
 			kfree(info);
 			return ERR_PTR(ret);
 		}
@@ -4479,7 +4478,6 @@ static void intel_iommu_release_device(struct device *dev)
 	dmar_remove_one_dev_info(dev);
 	intel_pasid_free_table(dev);
 	intel_iommu_debugfs_remove_dev(info);
-	dev_iommu_priv_set(dev, NULL);
 	kfree(info);
 	set_dma_ops(dev, NULL);
 }
diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c
index df58025c001b..68e648b55767 100644
--- a/drivers/iommu/iommu.c
+++ b/drivers/iommu/iommu.c
@@ -387,6 +387,15 @@ static u32 dev_iommu_get_max_pasids(struct device *dev)
 	return min_t(u32, max_pasids, dev->iommu->iommu_dev->max_pasids);
 }
 
+void dev_iommu_priv_set(struct device *dev, void *priv)
+{
+	/* FSL_PAMU does something weird */
+	if (!IS_ENABLED(CONFIG_FSL_PAMU))
+		lockdep_assert_held(&iommu_probe_device_lock);
+	dev->iommu->priv = priv;
+}
+EXPORT_SYMBOL_GPL(dev_iommu_priv_set);
+
 /*
  * Init the dev->iommu and dev->iommu_group in the struct device and get the
  * driver probed
diff --git a/drivers/iommu/omap-iommu.c b/drivers/iommu/omap-iommu.c
index c66b070841dd..c9528065a59a 100644
--- a/drivers/iommu/omap-iommu.c
+++ b/drivers/iommu/omap-iommu.c
@@ -1719,7 +1719,6 @@ static void omap_iommu_release_device(struct device *dev)
 	if (!dev->of_node || !arch_data)
 		return;
 
-	dev_iommu_priv_set(dev, NULL);
 	kfree(arch_data);
 
 }
diff --git a/include/linux/iommu.h b/include/linux/iommu.h
index c6bbbe0901d0..3a556996fea7 100644
--- a/include/linux/iommu.h
+++ b/include/linux/iommu.h
@@ -850,10 +850,7 @@ static inline void *dev_iommu_priv_get(struct device *dev)
 		return NULL;
 }
 
-static inline void dev_iommu_priv_set(struct device *dev, void *priv)
-{
-	dev->iommu->priv = priv;
-}
+void dev_iommu_priv_set(struct device *dev, void *priv);
 
 extern struct mutex iommu_probe_device_lock;
 int iommu_probe_device(struct device *dev);
-- 
cgit v1.2.3


From 62e1f212e5fe7624249212813ee96202e0c31430 Mon Sep 17 00:00:00 2001
From: James Clark <james.clark@arm.com>
Date: Mon, 11 Dec 2023 16:13:14 +0000
Subject: arm: perf/kvm: Use GENMASK for ARMV8_PMU_PMCR_N

This is so that FIELD_GET and FIELD_PREP can be used and that the fields
are in a consistent format to arm64/tools/sysreg

Signed-off-by: James Clark <james.clark@arm.com>
Link: https://lore.kernel.org/r/20231211161331.1277825-3-james.clark@arm.com
Signed-off-by: Will Deacon <will@kernel.org>
---
 arch/arm64/kvm/pmu-emul.c      | 8 +++-----
 arch/arm64/kvm/sys_regs.c      | 4 ++--
 drivers/perf/arm_pmuv3.c       | 4 ++--
 include/linux/perf/arm_pmuv3.h | 3 +--
 4 files changed, 8 insertions(+), 11 deletions(-)

(limited to 'include/linux')

diff --git a/arch/arm64/kvm/pmu-emul.c b/arch/arm64/kvm/pmu-emul.c
index fe99b3dab6ce..3d9467ff73bc 100644
--- a/arch/arm64/kvm/pmu-emul.c
+++ b/arch/arm64/kvm/pmu-emul.c
@@ -267,9 +267,8 @@ void kvm_pmu_vcpu_destroy(struct kvm_vcpu *vcpu)
 
 u64 kvm_pmu_valid_counter_mask(struct kvm_vcpu *vcpu)
 {
-	u64 val = kvm_vcpu_read_pmcr(vcpu) >> ARMV8_PMU_PMCR_N_SHIFT;
+	u64 val = FIELD_GET(ARMV8_PMU_PMCR_N, kvm_vcpu_read_pmcr(vcpu));
 
-	val &= ARMV8_PMU_PMCR_N_MASK;
 	if (val == 0)
 		return BIT(ARMV8_PMU_CYCLE_IDX);
 	else
@@ -1136,8 +1135,7 @@ u8 kvm_arm_pmu_get_pmuver_limit(void)
  */
 u64 kvm_vcpu_read_pmcr(struct kvm_vcpu *vcpu)
 {
-	u64 pmcr = __vcpu_sys_reg(vcpu, PMCR_EL0) &
-			~(ARMV8_PMU_PMCR_N_MASK << ARMV8_PMU_PMCR_N_SHIFT);
+	u64 pmcr = __vcpu_sys_reg(vcpu, PMCR_EL0);
 
-	return pmcr | ((u64)vcpu->kvm->arch.pmcr_n << ARMV8_PMU_PMCR_N_SHIFT);
+	return u64_replace_bits(pmcr, vcpu->kvm->arch.pmcr_n, ARMV8_PMU_PMCR_N);
 }
diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c
index 4735e1b37fb3..ff45d688bd7d 100644
--- a/arch/arm64/kvm/sys_regs.c
+++ b/arch/arm64/kvm/sys_regs.c
@@ -877,7 +877,7 @@ static bool pmu_counter_idx_valid(struct kvm_vcpu *vcpu, u64 idx)
 	u64 pmcr, val;
 
 	pmcr = kvm_vcpu_read_pmcr(vcpu);
-	val = (pmcr >> ARMV8_PMU_PMCR_N_SHIFT) & ARMV8_PMU_PMCR_N_MASK;
+	val = FIELD_GET(ARMV8_PMU_PMCR_N, pmcr);
 	if (idx >= val && idx != ARMV8_PMU_CYCLE_IDX) {
 		kvm_inject_undefined(vcpu);
 		return false;
@@ -1143,7 +1143,7 @@ static int get_pmcr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r,
 static int set_pmcr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r,
 		    u64 val)
 {
-	u8 new_n = (val >> ARMV8_PMU_PMCR_N_SHIFT) & ARMV8_PMU_PMCR_N_MASK;
+	u8 new_n = FIELD_GET(ARMV8_PMU_PMCR_N, val);
 	struct kvm *kvm = vcpu->kvm;
 
 	mutex_lock(&kvm->arch.config_lock);
diff --git a/drivers/perf/arm_pmuv3.c b/drivers/perf/arm_pmuv3.c
index 09478e2b825e..374e973a4e42 100644
--- a/drivers/perf/arm_pmuv3.c
+++ b/drivers/perf/arm_pmuv3.c
@@ -15,6 +15,7 @@
 #include <clocksource/arm_arch_timer.h>
 
 #include <linux/acpi.h>
+#include <linux/bitfield.h>
 #include <linux/clocksource.h>
 #include <linux/of.h>
 #include <linux/perf/arm_pmu.h>
@@ -1111,8 +1112,7 @@ static void __armv8pmu_probe_pmu(void *info)
 	probe->present = true;
 
 	/* Read the nb of CNTx counters supported from PMNC */
-	cpu_pmu->num_events = (armv8pmu_pmcr_read() >> ARMV8_PMU_PMCR_N_SHIFT)
-		& ARMV8_PMU_PMCR_N_MASK;
+	cpu_pmu->num_events = FIELD_GET(ARMV8_PMU_PMCR_N, armv8pmu_pmcr_read());
 
 	/* Add the CPU cycles counter */
 	cpu_pmu->num_events += 1;
diff --git a/include/linux/perf/arm_pmuv3.h b/include/linux/perf/arm_pmuv3.h
index 9c226adf938a..ed62bd75cec7 100644
--- a/include/linux/perf/arm_pmuv3.h
+++ b/include/linux/perf/arm_pmuv3.h
@@ -215,8 +215,7 @@
 #define ARMV8_PMU_PMCR_DP	(1 << 5) /* Disable CCNT if non-invasive debug*/
 #define ARMV8_PMU_PMCR_LC	(1 << 6) /* Overflow on 64 bit cycle counter */
 #define ARMV8_PMU_PMCR_LP	(1 << 7) /* Long event counter enable */
-#define ARMV8_PMU_PMCR_N_SHIFT	11  /* Number of counters supported */
-#define ARMV8_PMU_PMCR_N_MASK	0x1f
+#define ARMV8_PMU_PMCR_N	GENMASK(15, 11) /* Number of counters supported */
 #define ARMV8_PMU_PMCR_MASK	0xff    /* Mask for writable bits */
 
 /*
-- 
cgit v1.2.3


From 2f6a00f30600417ee2737f2b1229c75663f1e3c9 Mon Sep 17 00:00:00 2001
From: James Clark <james.clark@arm.com>
Date: Mon, 11 Dec 2023 16:13:15 +0000
Subject: arm: perf: Use GENMASK for PMMIR fields

This is so that FIELD_GET and FIELD_PREP can be used and that the fields
are in a consistent format to arm64/tools/sysreg

Signed-off-by: James Clark <james.clark@arm.com>
Link: https://lore.kernel.org/r/20231211161331.1277825-4-james.clark@arm.com
Signed-off-by: Will Deacon <will@kernel.org>
---
 drivers/perf/arm_pmuv3.c       | 8 +++-----
 include/linux/perf/arm_pmuv3.h | 9 +++------
 2 files changed, 6 insertions(+), 11 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/perf/arm_pmuv3.c b/drivers/perf/arm_pmuv3.c
index 374e973a4e42..36bc00494f56 100644
--- a/drivers/perf/arm_pmuv3.c
+++ b/drivers/perf/arm_pmuv3.c
@@ -332,7 +332,7 @@ static ssize_t slots_show(struct device *dev, struct device_attribute *attr,
 {
 	struct pmu *pmu = dev_get_drvdata(dev);
 	struct arm_pmu *cpu_pmu = container_of(pmu, struct arm_pmu, pmu);
-	u32 slots = cpu_pmu->reg_pmmir & ARMV8_PMU_SLOTS_MASK;
+	u32 slots = FIELD_GET(ARMV8_PMU_SLOTS, cpu_pmu->reg_pmmir);
 
 	return sysfs_emit(page, "0x%08x\n", slots);
 }
@@ -344,8 +344,7 @@ static ssize_t bus_slots_show(struct device *dev, struct device_attribute *attr,
 {
 	struct pmu *pmu = dev_get_drvdata(dev);
 	struct arm_pmu *cpu_pmu = container_of(pmu, struct arm_pmu, pmu);
-	u32 bus_slots = (cpu_pmu->reg_pmmir >> ARMV8_PMU_BUS_SLOTS_SHIFT)
-			& ARMV8_PMU_BUS_SLOTS_MASK;
+	u32 bus_slots = FIELD_GET(ARMV8_PMU_BUS_SLOTS, cpu_pmu->reg_pmmir);
 
 	return sysfs_emit(page, "0x%08x\n", bus_slots);
 }
@@ -357,8 +356,7 @@ static ssize_t bus_width_show(struct device *dev, struct device_attribute *attr,
 {
 	struct pmu *pmu = dev_get_drvdata(dev);
 	struct arm_pmu *cpu_pmu = container_of(pmu, struct arm_pmu, pmu);
-	u32 bus_width = (cpu_pmu->reg_pmmir >> ARMV8_PMU_BUS_WIDTH_SHIFT)
-			& ARMV8_PMU_BUS_WIDTH_MASK;
+	u32 bus_width = FIELD_GET(ARMV8_PMU_BUS_WIDTH, cpu_pmu->reg_pmmir);
 	u32 val = 0;
 
 	/* Encoded as Log2(number of bytes), plus one */
diff --git a/include/linux/perf/arm_pmuv3.h b/include/linux/perf/arm_pmuv3.h
index ed62bd75cec7..1bc7678c10d4 100644
--- a/include/linux/perf/arm_pmuv3.h
+++ b/include/linux/perf/arm_pmuv3.h
@@ -250,12 +250,9 @@
 #define ARMV8_PMU_USERENR_ER	(1 << 3) /* Event counter can be read at EL0 */
 
 /* PMMIR_EL1.SLOTS mask */
-#define ARMV8_PMU_SLOTS_MASK	0xff
-
-#define ARMV8_PMU_BUS_SLOTS_SHIFT 8
-#define ARMV8_PMU_BUS_SLOTS_MASK 0xff
-#define ARMV8_PMU_BUS_WIDTH_SHIFT 16
-#define ARMV8_PMU_BUS_WIDTH_MASK 0xf
+#define ARMV8_PMU_SLOTS		GENMASK(7, 0)
+#define ARMV8_PMU_BUS_SLOTS	GENMASK(15, 8)
+#define ARMV8_PMU_BUS_WIDTH	GENMASK(19, 16)
 
 /*
  * This code is really good
-- 
cgit v1.2.3


From d30f09b6d7de5d159dbb537f9d67dceb67409420 Mon Sep 17 00:00:00 2001
From: James Clark <james.clark@arm.com>
Date: Mon, 11 Dec 2023 16:13:16 +0000
Subject: arm: perf: Convert remaining fields to use GENMASK

Convert the remaining fields to use either GENMASK or be built from
other fields. These all already started at bit 0 so don't need a code
change for the lack of _SHIFT.

Signed-off-by: James Clark <james.clark@arm.com>
Link: https://lore.kernel.org/r/20231211161331.1277825-5-james.clark@arm.com
Signed-off-by: Will Deacon <will@kernel.org>
---
 drivers/perf/arm_pmuv3.c       |  2 +-
 include/linux/perf/arm_pmuv3.h | 18 +++++++++++++-----
 2 files changed, 14 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/perf/arm_pmuv3.c b/drivers/perf/arm_pmuv3.c
index 36bc00494f56..a93b4cf88562 100644
--- a/drivers/perf/arm_pmuv3.c
+++ b/drivers/perf/arm_pmuv3.c
@@ -675,7 +675,7 @@ static u32 armv8pmu_getreset_flags(void)
 	value = read_pmovsclr();
 
 	/* Write to clear flags */
-	value &= ARMV8_PMU_OVSR_MASK;
+	value &= ARMV8_PMU_OVERFLOWED_MASK;
 	write_pmovsclr(value);
 
 	return value;
diff --git a/include/linux/perf/arm_pmuv3.h b/include/linux/perf/arm_pmuv3.h
index 1bc7678c10d4..daa63542242d 100644
--- a/include/linux/perf/arm_pmuv3.h
+++ b/include/linux/perf/arm_pmuv3.h
@@ -216,19 +216,25 @@
 #define ARMV8_PMU_PMCR_LC	(1 << 6) /* Overflow on 64 bit cycle counter */
 #define ARMV8_PMU_PMCR_LP	(1 << 7) /* Long event counter enable */
 #define ARMV8_PMU_PMCR_N	GENMASK(15, 11) /* Number of counters supported */
-#define ARMV8_PMU_PMCR_MASK	0xff    /* Mask for writable bits */
+/* Mask for writable bits */
+#define ARMV8_PMU_PMCR_MASK	(ARMV8_PMU_PMCR_E | ARMV8_PMU_PMCR_P | \
+				 ARMV8_PMU_PMCR_C | ARMV8_PMU_PMCR_D | \
+				 ARMV8_PMU_PMCR_X | ARMV8_PMU_PMCR_DP | \
+				 ARMV8_PMU_PMCR_LC | ARMV8_PMU_PMCR_LP)
 
 /*
  * PMOVSR: counters overflow flag status reg
  */
-#define ARMV8_PMU_OVSR_MASK		0xffffffff	/* Mask for writable bits */
-#define ARMV8_PMU_OVERFLOWED_MASK	ARMV8_PMU_OVSR_MASK
+#define ARMV8_PMU_OVSR_P		GENMASK(30, 0)
+#define ARMV8_PMU_OVSR_C		BIT(31)
+/* Mask for writable bits is both P and C fields */
+#define ARMV8_PMU_OVERFLOWED_MASK	(ARMV8_PMU_OVSR_P | ARMV8_PMU_OVSR_C)
 
 /*
  * PMXEVTYPER: Event selection reg
  */
 #define ARMV8_PMU_EVTYPE_MASK	0xc800ffff	/* Mask for writable bits */
-#define ARMV8_PMU_EVTYPE_EVENT	0xffff		/* Mask for EVENT bits */
+#define ARMV8_PMU_EVTYPE_EVENT	GENMASK(15, 0)	/* Mask for EVENT bits */
 
 /*
  * Event filters for PMUv3
@@ -243,11 +249,13 @@
 /*
  * PMUSERENR: user enable reg
  */
-#define ARMV8_PMU_USERENR_MASK	0xf		/* Mask for writable bits */
 #define ARMV8_PMU_USERENR_EN	(1 << 0) /* PMU regs can be accessed at EL0 */
 #define ARMV8_PMU_USERENR_SW	(1 << 1) /* PMSWINC can be written at EL0 */
 #define ARMV8_PMU_USERENR_CR	(1 << 2) /* Cycle counter can be read at EL0 */
 #define ARMV8_PMU_USERENR_ER	(1 << 3) /* Event counter can be read at EL0 */
+/* Mask for writable bits */
+#define ARMV8_PMU_USERENR_MASK	(ARMV8_PMU_USERENR_EN | ARMV8_PMU_USERENR_SW | \
+				 ARMV8_PMU_USERENR_CR | ARMV8_PMU_USERENR_ER)
 
 /* PMMIR_EL1.SLOTS mask */
 #define ARMV8_PMU_SLOTS		GENMASK(7, 0)
-- 
cgit v1.2.3


From 3115ee021bfb04efde2e96507bfcc1330261a6a1 Mon Sep 17 00:00:00 2001
From: James Clark <james.clark@arm.com>
Date: Mon, 11 Dec 2023 16:13:17 +0000
Subject: arm64: perf: Include threshold control fields in PMEVTYPER mask

FEAT_PMUv3_TH (Armv8.8) adds two new fields to PMEVTYPER, so include
them in the mask. These aren't writable on 32 bit kernels as they are in
the high part of the register, so only include them for arm64.

It would be difficult to do this statically in the asm header files for
each platform without resulting in circular includes or #ifdefs inline
in the code. For that reason the ARMV8_PMU_EVTYPE_MASK definition has
been removed and the mask is constructed programmatically.

Reviewed-by: Suzuki K Poulose <suzuki.poulose@arm.com>
Reviewed-by: Anshuman Khandual <anshuman.khandual@arm.com>
Signed-off-by: James Clark <james.clark@arm.com>
Link: https://lore.kernel.org/r/20231211161331.1277825-6-james.clark@arm.com
Signed-off-by: Will Deacon <will@kernel.org>
---
 drivers/perf/arm_pmuv3.c       | 9 ++++++++-
 include/linux/perf/arm_pmuv3.h | 3 ++-
 2 files changed, 10 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/perf/arm_pmuv3.c b/drivers/perf/arm_pmuv3.c
index a93b4cf88562..441bf73ee3d5 100644
--- a/drivers/perf/arm_pmuv3.c
+++ b/drivers/perf/arm_pmuv3.c
@@ -558,8 +558,15 @@ static void armv8pmu_write_counter(struct perf_event *event, u64 value)
 static void armv8pmu_write_evtype(int idx, u32 val)
 {
 	u32 counter = ARMV8_IDX_TO_COUNTER(idx);
+	unsigned long mask = ARMV8_PMU_EVTYPE_EVENT |
+			     ARMV8_PMU_INCLUDE_EL2 |
+			     ARMV8_PMU_EXCLUDE_EL0 |
+			     ARMV8_PMU_EXCLUDE_EL1;
 
-	val &= ARMV8_PMU_EVTYPE_MASK;
+	if (IS_ENABLED(CONFIG_ARM64))
+		mask |= ARMV8_PMU_EVTYPE_TC | ARMV8_PMU_EVTYPE_TH;
+
+	val &= mask;
 	write_pmevtypern(counter, val);
 }
 
diff --git a/include/linux/perf/arm_pmuv3.h b/include/linux/perf/arm_pmuv3.h
index daa63542242d..91957b3468e9 100644
--- a/include/linux/perf/arm_pmuv3.h
+++ b/include/linux/perf/arm_pmuv3.h
@@ -233,8 +233,9 @@
 /*
  * PMXEVTYPER: Event selection reg
  */
-#define ARMV8_PMU_EVTYPE_MASK	0xc800ffff	/* Mask for writable bits */
 #define ARMV8_PMU_EVTYPE_EVENT	GENMASK(15, 0)	/* Mask for EVENT bits */
+#define ARMV8_PMU_EVTYPE_TH	GENMASK(43, 32)
+#define ARMV8_PMU_EVTYPE_TC	GENMASK(63, 61)
 
 /*
  * Event filters for PMUv3
-- 
cgit v1.2.3


From f6da86969a3c284466ab6080764b2ed91689f262 Mon Sep 17 00:00:00 2001
From: James Clark <james.clark@arm.com>
Date: Mon, 11 Dec 2023 16:13:18 +0000
Subject: arm: pmu: Share user ABI format mechanism with SPE

This mechanism makes it much easier to define and read new attributes
so move it to the arm_pmu.h header so that it can be shared. At the same
time update the existing format attributes to use it.

GENMASK has to be changed to GENMASK_ULL because the config fields are
64 bits even on arm32 where this will also be used now.

Signed-off-by: James Clark <james.clark@arm.com>
Link: https://lore.kernel.org/r/20231211161331.1277825-7-james.clark@arm.com
Signed-off-by: Will Deacon <will@kernel.org>
---
 drivers/perf/arm_pmuv3.c     | 21 ++++++++++++++++-----
 drivers/perf/arm_spe_pmu.c   | 22 ----------------------
 include/linux/perf/arm_pmu.h | 22 ++++++++++++++++++++++
 3 files changed, 38 insertions(+), 27 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/perf/arm_pmuv3.c b/drivers/perf/arm_pmuv3.c
index 441bf73ee3d5..8a573db81da1 100644
--- a/drivers/perf/arm_pmuv3.c
+++ b/drivers/perf/arm_pmuv3.c
@@ -299,20 +299,31 @@ static const struct attribute_group armv8_pmuv3_events_attr_group = {
 	.is_visible = armv8pmu_event_attr_is_visible,
 };
 
-PMU_FORMAT_ATTR(event, "config:0-15");
-PMU_FORMAT_ATTR(long, "config1:0");
-PMU_FORMAT_ATTR(rdpmc, "config1:1");
+/* User ABI */
+#define ATTR_CFG_FLD_event_CFG		config
+#define ATTR_CFG_FLD_event_LO		0
+#define ATTR_CFG_FLD_event_HI		15
+#define ATTR_CFG_FLD_long_CFG		config1
+#define ATTR_CFG_FLD_long_LO		0
+#define ATTR_CFG_FLD_long_HI		0
+#define ATTR_CFG_FLD_rdpmc_CFG		config1
+#define ATTR_CFG_FLD_rdpmc_LO		1
+#define ATTR_CFG_FLD_rdpmc_HI		1
+
+GEN_PMU_FORMAT_ATTR(event);
+GEN_PMU_FORMAT_ATTR(long);
+GEN_PMU_FORMAT_ATTR(rdpmc);
 
 static int sysctl_perf_user_access __read_mostly;
 
 static bool armv8pmu_event_is_64bit(struct perf_event *event)
 {
-	return event->attr.config1 & 0x1;
+	return ATTR_CFG_GET_FLD(&event->attr, long);
 }
 
 static bool armv8pmu_event_want_user_access(struct perf_event *event)
 {
-	return event->attr.config1 & 0x2;
+	return ATTR_CFG_GET_FLD(&event->attr, rdpmc);
 }
 
 static struct attribute *armv8_pmuv3_format_attrs[] = {
diff --git a/drivers/perf/arm_spe_pmu.c b/drivers/perf/arm_spe_pmu.c
index d2b0cbf0e0c4..b622d75d8c9e 100644
--- a/drivers/perf/arm_spe_pmu.c
+++ b/drivers/perf/arm_spe_pmu.c
@@ -206,28 +206,6 @@ static const struct attribute_group arm_spe_pmu_cap_group = {
 #define ATTR_CFG_FLD_inv_event_filter_LO	0
 #define ATTR_CFG_FLD_inv_event_filter_HI	63
 
-/* Why does everything I do descend into this? */
-#define __GEN_PMU_FORMAT_ATTR(cfg, lo, hi)				\
-	(lo) == (hi) ? #cfg ":" #lo "\n" : #cfg ":" #lo "-" #hi
-
-#define _GEN_PMU_FORMAT_ATTR(cfg, lo, hi)				\
-	__GEN_PMU_FORMAT_ATTR(cfg, lo, hi)
-
-#define GEN_PMU_FORMAT_ATTR(name)					\
-	PMU_FORMAT_ATTR(name,						\
-	_GEN_PMU_FORMAT_ATTR(ATTR_CFG_FLD_##name##_CFG,			\
-			     ATTR_CFG_FLD_##name##_LO,			\
-			     ATTR_CFG_FLD_##name##_HI))
-
-#define _ATTR_CFG_GET_FLD(attr, cfg, lo, hi)				\
-	((((attr)->cfg) >> lo) & GENMASK(hi - lo, 0))
-
-#define ATTR_CFG_GET_FLD(attr, name)					\
-	_ATTR_CFG_GET_FLD(attr,						\
-			  ATTR_CFG_FLD_##name##_CFG,			\
-			  ATTR_CFG_FLD_##name##_LO,			\
-			  ATTR_CFG_FLD_##name##_HI)
-
 GEN_PMU_FORMAT_ATTR(ts_enable);
 GEN_PMU_FORMAT_ATTR(pa_enable);
 GEN_PMU_FORMAT_ATTR(pct_enable);
diff --git a/include/linux/perf/arm_pmu.h b/include/linux/perf/arm_pmu.h
index e2503d48ddee..b3b34f6670cf 100644
--- a/include/linux/perf/arm_pmu.h
+++ b/include/linux/perf/arm_pmu.h
@@ -183,4 +183,26 @@ void armpmu_free_irq(int irq, int cpu);
 #define ARMV8_SPE_PDEV_NAME "arm,spe-v1"
 #define ARMV8_TRBE_PDEV_NAME "arm,trbe"
 
+/* Why does everything I do descend into this? */
+#define __GEN_PMU_FORMAT_ATTR(cfg, lo, hi)				\
+	(lo) == (hi) ? #cfg ":" #lo "\n" : #cfg ":" #lo "-" #hi
+
+#define _GEN_PMU_FORMAT_ATTR(cfg, lo, hi)				\
+	__GEN_PMU_FORMAT_ATTR(cfg, lo, hi)
+
+#define GEN_PMU_FORMAT_ATTR(name)					\
+	PMU_FORMAT_ATTR(name,						\
+	_GEN_PMU_FORMAT_ATTR(ATTR_CFG_FLD_##name##_CFG,			\
+			     ATTR_CFG_FLD_##name##_LO,			\
+			     ATTR_CFG_FLD_##name##_HI))
+
+#define _ATTR_CFG_GET_FLD(attr, cfg, lo, hi)				\
+	((((attr)->cfg) >> lo) & GENMASK_ULL(hi - lo, 0))
+
+#define ATTR_CFG_GET_FLD(attr, name)					\
+	_ATTR_CFG_GET_FLD(attr,						\
+			  ATTR_CFG_FLD_##name##_CFG,			\
+			  ATTR_CFG_FLD_##name##_LO,			\
+			  ATTR_CFG_FLD_##name##_HI)
+
 #endif /* __ARM_PMU_H__ */
-- 
cgit v1.2.3


From 816c26754447e8b28d6c604e1f5b1d205b2586ee Mon Sep 17 00:00:00 2001
From: James Clark <james.clark@arm.com>
Date: Mon, 11 Dec 2023 16:13:22 +0000
Subject: arm64: perf: Add support for event counting threshold

FEAT_PMUv3_TH (Armv8.8) permits a PMU counter to increment only on
events whose count meets a specified threshold condition. For example if
PMEVTYPERn.TC (Threshold Control) is set to 0b101 (Greater than or
equal, count), and the threshold is set to 2, then the PMU counter will
now only increment by 1 when an event would have previously incremented
the PMU counter by 2 or more on a single processor cycle.

Three new Perf event config fields, 'threshold', 'threshold_compare' and
'threshold_count' have been added to control the feature.
threshold_compare maps to the upper two bits of PMEVTYPERn.TC and
threshold_count maps to the first bit of TC. These separate attributes
have been picked rather than enumerating all the possible combinations
of the TC field as in the Arm ARM. The attributes would be used on a
Perf command line like this:

  $ perf stat -e stall_slot/threshold=2,threshold_compare=2/

A new capability for reading out the maximum supported threshold value
has also been added:

  $ cat /sys/bus/event_source/devices/armv8_pmuv3/caps/threshold_max

  0x000000ff

If a threshold higher than threshold_max is provided, then an error is
generated. If FEAT_PMUv3_TH isn't implemented or a 32 bit kernel is
running, then threshold_max reads zero, and attempting to set a
threshold value will also result in an error.

The threshold is per PMU counter, and there are potentially different
threshold_max values per PMU type on heterogeneous systems.

Bits higher than 32 now need to be written into PMEVTYPER, so
armv8pmu_write_evtype() has to be updated to take an unsigned long value
rather than u32 which gives the correct behavior on both aarch32 and 64.

Signed-off-by: James Clark <james.clark@arm.com>
Link: https://lore.kernel.org/r/20231211161331.1277825-11-james.clark@arm.com
Signed-off-by: Will Deacon <will@kernel.org>
---
 drivers/perf/arm_pmuv3.c       | 79 +++++++++++++++++++++++++++++++++++++++++-
 include/linux/perf/arm_pmuv3.h |  1 +
 2 files changed, 79 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/drivers/perf/arm_pmuv3.c b/drivers/perf/arm_pmuv3.c
index 2ba215f74d92..23fa6c5da82c 100644
--- a/drivers/perf/arm_pmuv3.c
+++ b/drivers/perf/arm_pmuv3.c
@@ -309,10 +309,22 @@ static const struct attribute_group armv8_pmuv3_events_attr_group = {
 #define ATTR_CFG_FLD_rdpmc_CFG		config1
 #define ATTR_CFG_FLD_rdpmc_LO		1
 #define ATTR_CFG_FLD_rdpmc_HI		1
+#define ATTR_CFG_FLD_threshold_count_CFG	config1 /* PMEVTYPER.TC[0] */
+#define ATTR_CFG_FLD_threshold_count_LO		2
+#define ATTR_CFG_FLD_threshold_count_HI		2
+#define ATTR_CFG_FLD_threshold_compare_CFG	config1 /* PMEVTYPER.TC[2:1] */
+#define ATTR_CFG_FLD_threshold_compare_LO	3
+#define ATTR_CFG_FLD_threshold_compare_HI	4
+#define ATTR_CFG_FLD_threshold_CFG		config1 /* PMEVTYPER.TH */
+#define ATTR_CFG_FLD_threshold_LO		5
+#define ATTR_CFG_FLD_threshold_HI		16
 
 GEN_PMU_FORMAT_ATTR(event);
 GEN_PMU_FORMAT_ATTR(long);
 GEN_PMU_FORMAT_ATTR(rdpmc);
+GEN_PMU_FORMAT_ATTR(threshold_count);
+GEN_PMU_FORMAT_ATTR(threshold_compare);
+GEN_PMU_FORMAT_ATTR(threshold);
 
 static int sysctl_perf_user_access __read_mostly;
 
@@ -326,10 +338,27 @@ static bool armv8pmu_event_want_user_access(struct perf_event *event)
 	return ATTR_CFG_GET_FLD(&event->attr, rdpmc);
 }
 
+static u8 armv8pmu_event_threshold_control(struct perf_event_attr *attr)
+{
+	u8 th_compare = ATTR_CFG_GET_FLD(attr, threshold_compare);
+	u8 th_count = ATTR_CFG_GET_FLD(attr, threshold_count);
+
+	/*
+	 * The count bit is always the bottom bit of the full control field, and
+	 * the comparison is the upper two bits, but it's not explicitly
+	 * labelled in the Arm ARM. For the Perf interface we split it into two
+	 * fields, so reconstruct it here.
+	 */
+	return (th_compare << 1) | th_count;
+}
+
 static struct attribute *armv8_pmuv3_format_attrs[] = {
 	&format_attr_event.attr,
 	&format_attr_long.attr,
 	&format_attr_rdpmc.attr,
+	&format_attr_threshold.attr,
+	&format_attr_threshold_compare.attr,
+	&format_attr_threshold_count.attr,
 	NULL,
 };
 
@@ -379,10 +408,38 @@ static ssize_t bus_width_show(struct device *dev, struct device_attribute *attr,
 
 static DEVICE_ATTR_RO(bus_width);
 
+static u32 threshold_max(struct arm_pmu *cpu_pmu)
+{
+	/*
+	 * PMMIR.THWIDTH is readable and non-zero on aarch32, but it would be
+	 * impossible to write the threshold in the upper 32 bits of PMEVTYPER.
+	 */
+	if (IS_ENABLED(CONFIG_ARM))
+		return 0;
+
+	/*
+	 * The largest value that can be written to PMEVTYPER<n>_EL0.TH is
+	 * (2 ^ PMMIR.THWIDTH) - 1.
+	 */
+	return (1 << FIELD_GET(ARMV8_PMU_THWIDTH, cpu_pmu->reg_pmmir)) - 1;
+}
+
+static ssize_t threshold_max_show(struct device *dev,
+				  struct device_attribute *attr, char *page)
+{
+	struct pmu *pmu = dev_get_drvdata(dev);
+	struct arm_pmu *cpu_pmu = container_of(pmu, struct arm_pmu, pmu);
+
+	return sysfs_emit(page, "0x%08x\n", threshold_max(cpu_pmu));
+}
+
+static DEVICE_ATTR_RO(threshold_max);
+
 static struct attribute *armv8_pmuv3_caps_attrs[] = {
 	&dev_attr_slots.attr,
 	&dev_attr_bus_slots.attr,
 	&dev_attr_bus_width.attr,
+	&dev_attr_threshold_max.attr,
 	NULL,
 };
 
@@ -566,7 +623,7 @@ static void armv8pmu_write_counter(struct perf_event *event, u64 value)
 		armv8pmu_write_hw_counter(event, value);
 }
 
-static void armv8pmu_write_evtype(int idx, u32 val)
+static void armv8pmu_write_evtype(int idx, unsigned long val)
 {
 	u32 counter = ARMV8_IDX_TO_COUNTER(idx);
 	unsigned long mask = ARMV8_PMU_EVTYPE_EVENT |
@@ -935,6 +992,10 @@ static int armv8pmu_set_event_filter(struct hw_perf_event *event,
 				     struct perf_event_attr *attr)
 {
 	unsigned long config_base = 0;
+	struct perf_event *perf_event = container_of(attr, struct perf_event,
+						     attr);
+	struct arm_pmu *cpu_pmu = to_arm_pmu(perf_event->pmu);
+	u32 th;
 
 	if (attr->exclude_idle) {
 		pr_debug("ARM performance counters do not support mode exclusion\n");
@@ -968,6 +1029,22 @@ static int armv8pmu_set_event_filter(struct hw_perf_event *event,
 	if (attr->exclude_user)
 		config_base |= ARMV8_PMU_EXCLUDE_EL0;
 
+	/*
+	 * If FEAT_PMUv3_TH isn't implemented, then THWIDTH (threshold_max) will
+	 * be 0 and will also trigger this check, preventing it from being used.
+	 */
+	th = ATTR_CFG_GET_FLD(attr, threshold);
+	if (th > threshold_max(cpu_pmu)) {
+		pr_debug("PMU event threshold exceeds max value\n");
+		return -EINVAL;
+	}
+
+	if (IS_ENABLED(CONFIG_ARM64) && th) {
+		config_base |= FIELD_PREP(ARMV8_PMU_EVTYPE_TH, th);
+		config_base |= FIELD_PREP(ARMV8_PMU_EVTYPE_TC,
+					  armv8pmu_event_threshold_control(attr));
+	}
+
 	/*
 	 * Install the filter into config_base as this is used to
 	 * construct the event type.
diff --git a/include/linux/perf/arm_pmuv3.h b/include/linux/perf/arm_pmuv3.h
index 91957b3468e9..0f4d62ef3a9a 100644
--- a/include/linux/perf/arm_pmuv3.h
+++ b/include/linux/perf/arm_pmuv3.h
@@ -262,6 +262,7 @@
 #define ARMV8_PMU_SLOTS		GENMASK(7, 0)
 #define ARMV8_PMU_BUS_SLOTS	GENMASK(15, 8)
 #define ARMV8_PMU_BUS_WIDTH	GENMASK(19, 16)
+#define ARMV8_PMU_THWIDTH	GENMASK(23, 20)
 
 /*
  * This code is really good
-- 
cgit v1.2.3


From 253ca8678d30bcf94410b54476fc1e0f1627a137 Mon Sep 17 00:00:00 2001
From: Linus Torvalds <torvalds@linux-foundation.org>
Date: Sun, 26 Nov 2023 12:24:38 -0800
Subject: Improve __fget_files_rcu() code generation (and thus __fget_light())

Commit 0ede61d8589c ("file: convert to SLAB_TYPESAFE_BY_RCU") caused a
performance regression as reported by the kernel test robot.

The __fget_light() function is one of those critical ones for some
loads, and the code generation was unnecessarily impacted.  Let's just
write that function to better.

Reported-by: kernel test robot <oliver.sang@intel.com>
Cc: Christian Brauner <brauner@kernel.org>
Cc: Jann Horn <jannh@google.com>
Cc: Mateusz Guzik <mjguzik@gmail.com>
Closes: https://lore.kernel.org/oe-lkp/202311201406.2022ca3f-oliver.sang@intel.com
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Link: https://lore.kernel.org/r/CAHk-=wiCJtLbFWNURB34b9a_R_unaH3CiMRXfkR0-iihB_z68A@mail.gmail.com
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 fs/file.c               | 51 ++++++++++++++++++++++++++++++++-----------------
 include/linux/fdtable.h | 17 +++++++++++------
 2 files changed, 44 insertions(+), 24 deletions(-)

(limited to 'include/linux')

diff --git a/fs/file.c b/fs/file.c
index 5fb0b146e79e..50df31e104a5 100644
--- a/fs/file.c
+++ b/fs/file.c
@@ -959,31 +959,45 @@ static inline struct file *__fget_files_rcu(struct files_struct *files,
 		struct file *file;
 		struct fdtable *fdt = rcu_dereference_raw(files->fdt);
 		struct file __rcu **fdentry;
+		unsigned long nospec_mask;
 
-		if (unlikely(fd >= fdt->max_fds))
-			return NULL;
-
-		fdentry = fdt->fd + array_index_nospec(fd, fdt->max_fds);
+		/* Mask is a 0 for invalid fd's, ~0 for valid ones */
+		nospec_mask = array_index_mask_nospec(fd, fdt->max_fds);
 
 		/*
-		 * Ok, we have a file pointer. However, because we do
-		 * this all locklessly under RCU, we may be racing with
-		 * that file being closed.
-		 *
-		 * Such a race can take two forms:
-		 *
-		 *  (a) the file ref already went down to zero and the
-		 *      file hasn't been reused yet or the file count
-		 *      isn't zero but the file has already been reused.
+		 * fdentry points to the 'fd' offset, or fdt->fd[0].
+		 * Loading from fdt->fd[0] is always safe, because the
+		 * array always exists.
 		 */
-		file = __get_file_rcu(fdentry);
+		fdentry = fdt->fd + (fd & nospec_mask);
+
+		/* Do the load, then mask any invalid result */
+		file = rcu_dereference_raw(*fdentry);
+		file = (void *)(nospec_mask & (unsigned long)file);
 		if (unlikely(!file))
 			return NULL;
 
-		if (unlikely(IS_ERR(file)))
+		/*
+		 * Ok, we have a file pointer that was valid at
+		 * some point, but it might have become stale since.
+		 *
+		 * We need to confirm it by incrementing the refcount
+		 * and then check the lookup again.
+		 *
+		 * atomic_long_inc_not_zero() gives us a full memory
+		 * barrier. We only really need an 'acquire' one to
+		 * protect the loads below, but we don't have that.
+		 */
+		if (unlikely(!atomic_long_inc_not_zero(&file->f_count)))
 			continue;
 
 		/*
+		 * Such a race can take two forms:
+		 *
+		 *  (a) the file ref already went down to zero and the
+		 *      file hasn't been reused yet or the file count
+		 *      isn't zero but the file has already been reused.
+		 *
 		 *  (b) the file table entry has changed under us.
 		 *       Note that we don't need to re-check the 'fdt->fd'
 		 *       pointer having changed, because it always goes
@@ -991,7 +1005,8 @@ static inline struct file *__fget_files_rcu(struct files_struct *files,
 		 *
 		 * If so, we need to put our ref and try again.
 		 */
-		if (unlikely(rcu_dereference_raw(files->fdt) != fdt)) {
+		if (unlikely(file != rcu_dereference_raw(*fdentry)) ||
+		    unlikely(rcu_dereference_raw(files->fdt) != fdt)) {
 			fput(file);
 			continue;
 		}
@@ -1128,13 +1143,13 @@ static unsigned long __fget_light(unsigned int fd, fmode_t mask)
 	 * atomic_read_acquire() pairs with atomic_dec_and_test() in
 	 * put_files_struct().
 	 */
-	if (atomic_read_acquire(&files->count) == 1) {
+	if (likely(atomic_read_acquire(&files->count) == 1)) {
 		file = files_lookup_fd_raw(files, fd);
 		if (!file || unlikely(file->f_mode & mask))
 			return 0;
 		return (unsigned long)file;
 	} else {
-		file = __fget(fd, mask);
+		file = __fget_files(files, fd, mask);
 		if (!file)
 			return 0;
 		return FDPUT_FPUT | (unsigned long)file;
diff --git a/include/linux/fdtable.h b/include/linux/fdtable.h
index bc4c3287a65e..80bd7789bab1 100644
--- a/include/linux/fdtable.h
+++ b/include/linux/fdtable.h
@@ -83,12 +83,17 @@ struct dentry;
 static inline struct file *files_lookup_fd_raw(struct files_struct *files, unsigned int fd)
 {
 	struct fdtable *fdt = rcu_dereference_raw(files->fdt);
-
-	if (fd < fdt->max_fds) {
-		fd = array_index_nospec(fd, fdt->max_fds);
-		return rcu_dereference_raw(fdt->fd[fd]);
-	}
-	return NULL;
+	unsigned long mask = array_index_mask_nospec(fd, fdt->max_fds);
+	struct file *needs_masking;
+
+	/*
+	 * 'mask' is zero for an out-of-bounds fd, all ones for ok.
+	 * 'fd&mask' is 'fd' for ok, or 0 for out of bounds.
+	 *
+	 * Accessing fdt->fd[0] is ok, but needs masking of the result.
+	 */
+	needs_masking = rcu_dereference_raw(fdt->fd[fd&mask]);
+	return (struct file *)(mask & (unsigned long)needs_masking);
 }
 
 static inline struct file *files_lookup_fd_locked(struct files_struct *files, unsigned int fd)
-- 
cgit v1.2.3


From a88c955fcfb49727d0ed86b47410f6555a8e69e4 Mon Sep 17 00:00:00 2001
From: Christian Brauner <brauner@kernel.org>
Date: Thu, 30 Nov 2023 13:49:07 +0100
Subject: file: s/close_fd_get_file()/file_close_fd()/g

That really shouldn't have "get" in there as that implies we're bumping
the reference count which we don't do at all. We used to but not anmore.
Now we're just closing the fd and pick that file from the fdtable
without bumping the reference count. Update the wrong documentation
while at it.

Link: https://lore.kernel.org/r/20231130-vfs-files-fixes-v1-1-e73ca6f4ea83@kernel.org
Reviewed-by: Jan Kara <jack@suse.cz>
Reviewed-by: Jens Axboe <axboe@kernel.dk>
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 drivers/android/binder.c |  2 +-
 fs/file.c                | 14 +++++++++-----
 fs/open.c                |  2 +-
 include/linux/fdtable.h  |  2 +-
 4 files changed, 12 insertions(+), 8 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/android/binder.c b/drivers/android/binder.c
index 92128aae2d06..7658103ba760 100644
--- a/drivers/android/binder.c
+++ b/drivers/android/binder.c
@@ -1921,7 +1921,7 @@ static void binder_deferred_fd_close(int fd)
 	if (!twcb)
 		return;
 	init_task_work(&twcb->twork, binder_do_fd_close);
-	twcb->file = close_fd_get_file(fd);
+	twcb->file = file_close_fd(fd);
 	if (twcb->file) {
 		// pin it until binder_do_fd_close(); see comments there
 		get_file(twcb->file);
diff --git a/fs/file.c b/fs/file.c
index 50df31e104a5..66f04442a384 100644
--- a/fs/file.c
+++ b/fs/file.c
@@ -796,7 +796,7 @@ int __close_range(unsigned fd, unsigned max_fd, unsigned int flags)
 }
 
 /*
- * See close_fd_get_file() below, this variant assumes current->files->file_lock
+ * See file_close_fd() below, this variant assumes current->files->file_lock
  * is held.
  */
 struct file *__close_fd_get_file(unsigned int fd)
@@ -804,11 +804,15 @@ struct file *__close_fd_get_file(unsigned int fd)
 	return pick_file(current->files, fd);
 }
 
-/*
- * variant of close_fd that gets a ref on the file for later fput.
- * The caller must ensure that filp_close() called on the file.
+/**
+ * file_close_fd - return file associated with fd
+ * @fd: file descriptor to retrieve file for
+ *
+ * Doesn't take a separate reference count.
+ *
+ * Returns: The file associated with @fd (NULL if @fd is not open)
  */
-struct file *close_fd_get_file(unsigned int fd)
+struct file *file_close_fd(unsigned int fd)
 {
 	struct files_struct *files = current->files;
 	struct file *file;
diff --git a/fs/open.c b/fs/open.c
index 02dc608d40d8..48775329f3f1 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -1577,7 +1577,7 @@ SYSCALL_DEFINE1(close, unsigned int, fd)
 	int retval;
 	struct file *file;
 
-	file = close_fd_get_file(fd);
+	file = file_close_fd(fd);
 	if (!file)
 		return -EBADF;
 
diff --git a/include/linux/fdtable.h b/include/linux/fdtable.h
index 80bd7789bab1..78c8326d74ae 100644
--- a/include/linux/fdtable.h
+++ b/include/linux/fdtable.h
@@ -119,7 +119,7 @@ int iterate_fd(struct files_struct *, unsigned,
 
 extern int close_fd(unsigned int fd);
 extern int __close_range(unsigned int fd, unsigned int max_fd, unsigned int flags);
-extern struct file *close_fd_get_file(unsigned int fd);
+extern struct file *file_close_fd(unsigned int fd);
 extern int unshare_fd(unsigned long unshare_flags, unsigned int max_fds,
 		      struct files_struct **new_fdp);
 
-- 
cgit v1.2.3


From 372a34e66fb7f95124fadae9c600b231c35696a7 Mon Sep 17 00:00:00 2001
From: Christian Brauner <brauner@kernel.org>
Date: Thu, 30 Nov 2023 13:49:09 +0100
Subject: fs: replace f_rcuhead with f_task_work

The naming is actively misleading since we switched to
SLAB_TYPESAFE_BY_RCU. rcu_head is #define callback_head. Use
callback_head directly and rename f_rcuhead to f_task_work.

Add comments in there to explain what it's used for.

Link: https://lore.kernel.org/r/20231130-vfs-files-fixes-v1-3-e73ca6f4ea83@kernel.org
Reviewed-by: Jan Kara <jack@suse.cz>
Reviewed-by: Jens Axboe <axboe@kernel.dk>
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 fs/file_table.c    | 6 +++---
 include/linux/fs.h | 4 +++-
 2 files changed, 6 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/fs/file_table.c b/fs/file_table.c
index 6deac386486d..3ba764d73fc9 100644
--- a/fs/file_table.c
+++ b/fs/file_table.c
@@ -407,7 +407,7 @@ static void delayed_fput(struct work_struct *unused)
 
 static void ____fput(struct callback_head *work)
 {
-	__fput(container_of(work, struct file, f_rcuhead));
+	__fput(container_of(work, struct file, f_task_work));
 }
 
 /*
@@ -438,8 +438,8 @@ void fput(struct file *file)
 			return;
 		}
 		if (likely(!in_interrupt() && !(task->flags & PF_KTHREAD))) {
-			init_task_work(&file->f_rcuhead, ____fput);
-			if (!task_work_add(task, &file->f_rcuhead, TWA_RESUME))
+			init_task_work(&file->f_task_work, ____fput);
+			if (!task_work_add(task, &file->f_task_work, TWA_RESUME))
 				return;
 			/*
 			 * After this task has run exit_task_work(),
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 98b7a7a8c42e..354fd02e0e11 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -991,8 +991,10 @@ static inline int ra_has_index(struct file_ra_state *ra, pgoff_t index)
  */
 struct file {
 	union {
+		/* fput() uses task work when closing and freeing file (default). */
+		struct callback_head 	f_task_work;
+		/* fput() must use workqueue (most kernel threads). */
 		struct llist_node	f_llist;
-		struct rcu_head 	f_rcuhead;
 		unsigned int 		f_iocb_flags;
 	};
 
-- 
cgit v1.2.3


From eac9189c96196574a83a553ca5a7543dd9f5fe3e Mon Sep 17 00:00:00 2001
From: Christian Brauner <brauner@kernel.org>
Date: Thu, 30 Nov 2023 13:49:10 +0100
Subject: file: stop exposing receive_fd_user()

Not every subsystem needs to have their own specialized helper.
Just us the __receive_fd() helper.

Link: https://lore.kernel.org/r/20231130-vfs-files-fixes-v1-4-e73ca6f4ea83@kernel.org
Reviewed-by: Jan Kara <jack@suse.cz>
Reviewed-by: Jens Axboe <axboe@kernel.dk>
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 include/linux/file.h | 7 -------
 include/net/scm.h    | 9 +++++++++
 net/compat.c         | 2 +-
 net/core/scm.c       | 2 +-
 4 files changed, 11 insertions(+), 9 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/file.h b/include/linux/file.h
index 6e9099d29343..c0d5219c2852 100644
--- a/include/linux/file.h
+++ b/include/linux/file.h
@@ -101,13 +101,6 @@ extern int __receive_fd(struct file *file, int __user *ufd,
 
 extern int receive_fd(struct file *file, unsigned int o_flags);
 
-static inline int receive_fd_user(struct file *file, int __user *ufd,
-				  unsigned int o_flags)
-{
-	if (ufd == NULL)
-		return -EFAULT;
-	return __receive_fd(file, ufd, o_flags);
-}
 int receive_fd_replace(int new_fd, struct file *file, unsigned int o_flags);
 
 extern void flush_delayed_fput(void);
diff --git a/include/net/scm.h b/include/net/scm.h
index e8c76b4be2fe..8aae2468bae0 100644
--- a/include/net/scm.h
+++ b/include/net/scm.h
@@ -5,6 +5,7 @@
 #include <linux/limits.h>
 #include <linux/net.h>
 #include <linux/cred.h>
+#include <linux/file.h>
 #include <linux/security.h>
 #include <linux/pid.h>
 #include <linux/nsproxy.h>
@@ -208,5 +209,13 @@ static inline void scm_recv_unix(struct socket *sock, struct msghdr *msg,
 	scm_destroy_cred(scm);
 }
 
+static inline int scm_recv_one_fd(struct file *f, int __user *ufd,
+				  unsigned int flags)
+{
+	if (!ufd)
+		return -EFAULT;
+	return __receive_fd(f, ufd, flags);
+}
+
 #endif /* __LINUX_NET_SCM_H */
 
diff --git a/net/compat.c b/net/compat.c
index 6564720f32b7..485db8ee9b28 100644
--- a/net/compat.c
+++ b/net/compat.c
@@ -297,7 +297,7 @@ void scm_detach_fds_compat(struct msghdr *msg, struct scm_cookie *scm)
 	int err = 0, i;
 
 	for (i = 0; i < fdmax; i++) {
-		err = receive_fd_user(scm->fp->fp[i], cmsg_data + i, o_flags);
+		err = scm_recv_one_fd(scm->fp->fp[i], cmsg_data + i, o_flags);
 		if (err < 0)
 			break;
 	}
diff --git a/net/core/scm.c b/net/core/scm.c
index 880027ecf516..eec78e312550 100644
--- a/net/core/scm.c
+++ b/net/core/scm.c
@@ -319,7 +319,7 @@ void scm_detach_fds(struct msghdr *msg, struct scm_cookie *scm)
 	}
 
 	for (i = 0; i < fdmax; i++) {
-		err = receive_fd_user(scm->fp->fp[i], cmsg_data + i, o_flags);
+		err = scm_recv_one_fd(scm->fp->fp[i], cmsg_data + i, o_flags);
 		if (err < 0)
 			break;
 	}
-- 
cgit v1.2.3


From 4e94ddfe2aab72139acb8d5372fac9e6c3f3e383 Mon Sep 17 00:00:00 2001
From: Christian Brauner <brauner@kernel.org>
Date: Thu, 30 Nov 2023 13:49:11 +0100
Subject: file: remove __receive_fd()

Honestly, there's little value in having a helper with and without that
int __user *ufd argument. It's just messy and doesn't really give us
anything. Just expose receive_fd() with that argument and get rid of
that helper.

Link: https://lore.kernel.org/r/20231130-vfs-files-fixes-v1-5-e73ca6f4ea83@kernel.org
Reviewed-by: Jan Kara <jack@suse.cz>
Reviewed-by: Jens Axboe <axboe@kernel.dk>
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 drivers/vdpa/vdpa_user/vduse_dev.c |  2 +-
 fs/file.c                          | 11 +++--------
 include/linux/file.h               |  5 +----
 include/net/scm.h                  |  2 +-
 kernel/pid.c                       |  2 +-
 kernel/seccomp.c                   |  2 +-
 6 files changed, 8 insertions(+), 16 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/vdpa/vdpa_user/vduse_dev.c b/drivers/vdpa/vdpa_user/vduse_dev.c
index 0ddd4b8abecb..fafd4610b185 100644
--- a/drivers/vdpa/vdpa_user/vduse_dev.c
+++ b/drivers/vdpa/vdpa_user/vduse_dev.c
@@ -1157,7 +1157,7 @@ static long vduse_dev_ioctl(struct file *file, unsigned int cmd,
 			fput(f);
 			break;
 		}
-		ret = receive_fd(f, perm_to_file_flags(entry.perm));
+		ret = receive_fd(f, NULL, perm_to_file_flags(entry.perm));
 		fput(f);
 		break;
 	}
diff --git a/fs/file.c b/fs/file.c
index c8eaa0b29a08..3b683b9101d8 100644
--- a/fs/file.c
+++ b/fs/file.c
@@ -1296,7 +1296,7 @@ out_unlock:
 }
 
 /**
- * __receive_fd() - Install received file into file descriptor table
+ * receive_fd() - Install received file into file descriptor table
  * @file: struct file that was received from another process
  * @ufd: __user pointer to write new fd number to
  * @o_flags: the O_* flags to apply to the new fd entry
@@ -1310,7 +1310,7 @@ out_unlock:
  *
  * Returns newly install fd or -ve on error.
  */
-int __receive_fd(struct file *file, int __user *ufd, unsigned int o_flags)
+int receive_fd(struct file *file, int __user *ufd, unsigned int o_flags)
 {
 	int new_fd;
 	int error;
@@ -1335,6 +1335,7 @@ int __receive_fd(struct file *file, int __user *ufd, unsigned int o_flags)
 	__receive_sock(file);
 	return new_fd;
 }
+EXPORT_SYMBOL_GPL(receive_fd);
 
 int receive_fd_replace(int new_fd, struct file *file, unsigned int o_flags)
 {
@@ -1350,12 +1351,6 @@ int receive_fd_replace(int new_fd, struct file *file, unsigned int o_flags)
 	return new_fd;
 }
 
-int receive_fd(struct file *file, unsigned int o_flags)
-{
-	return __receive_fd(file, NULL, o_flags);
-}
-EXPORT_SYMBOL_GPL(receive_fd);
-
 static int ksys_dup3(unsigned int oldfd, unsigned int newfd, int flags)
 {
 	int err = -EBADF;
diff --git a/include/linux/file.h b/include/linux/file.h
index c0d5219c2852..6834a29338c4 100644
--- a/include/linux/file.h
+++ b/include/linux/file.h
@@ -96,10 +96,7 @@ DEFINE_CLASS(get_unused_fd, int, if (_T >= 0) put_unused_fd(_T),
 
 extern void fd_install(unsigned int fd, struct file *file);
 
-extern int __receive_fd(struct file *file, int __user *ufd,
-			unsigned int o_flags);
-
-extern int receive_fd(struct file *file, unsigned int o_flags);
+int receive_fd(struct file *file, int __user *ufd, unsigned int o_flags);
 
 int receive_fd_replace(int new_fd, struct file *file, unsigned int o_flags);
 
diff --git a/include/net/scm.h b/include/net/scm.h
index 8aae2468bae0..cf68acec4d70 100644
--- a/include/net/scm.h
+++ b/include/net/scm.h
@@ -214,7 +214,7 @@ static inline int scm_recv_one_fd(struct file *f, int __user *ufd,
 {
 	if (!ufd)
 		return -EFAULT;
-	return __receive_fd(f, ufd, flags);
+	return receive_fd(f, ufd, flags);
 }
 
 #endif /* __LINUX_NET_SCM_H */
diff --git a/kernel/pid.c b/kernel/pid.c
index 6500ef956f2f..b52b10865454 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -700,7 +700,7 @@ static int pidfd_getfd(struct pid *pid, int fd)
 	if (IS_ERR(file))
 		return PTR_ERR(file);
 
-	ret = receive_fd(file, O_CLOEXEC);
+	ret = receive_fd(file, NULL, O_CLOEXEC);
 	fput(file);
 
 	return ret;
diff --git a/kernel/seccomp.c b/kernel/seccomp.c
index 255999ba9190..aca7b437882e 100644
--- a/kernel/seccomp.c
+++ b/kernel/seccomp.c
@@ -1072,7 +1072,7 @@ static void seccomp_handle_addfd(struct seccomp_kaddfd *addfd, struct seccomp_kn
 	 */
 	list_del_init(&addfd->list);
 	if (!addfd->setfd)
-		fd = receive_fd(addfd->file, addfd->flags);
+		fd = receive_fd(addfd->file, NULL, addfd->flags);
 	else
 		fd = receive_fd_replace(addfd->fd, addfd->file, addfd->flags);
 	addfd->ret = fd;
-- 
cgit v1.2.3


From b66509b8497f2b002a2654e386a440f1274ddcc7 Mon Sep 17 00:00:00 2001
From: Pavel Begunkov <asml.silence@gmail.com>
Date: Fri, 1 Dec 2023 00:57:35 +0000
Subject: io_uring: split out cmd api into a separate header

linux/io_uring.h is slowly becoming a rubbish bin where we put
anything exposed to other subsystems. For instance, the task exit
hooks and io_uring cmd infra are completely orthogonal and don't need
each other's definitions. Start cleaning it up by splitting out all
command bits into a new header file.

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Link: https://lore.kernel.org/r/7ec50bae6e21f371d3850796e716917fc141225a.1701391955.git.asml.silence@gmail.com
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 MAINTAINERS                    |  1 +
 drivers/block/ublk_drv.c       |  2 +-
 drivers/nvme/host/ioctl.c      |  2 +-
 include/linux/io_uring.h       | 89 +-----------------------------------------
 include/linux/io_uring/cmd.h   | 81 ++++++++++++++++++++++++++++++++++++++
 include/linux/io_uring_types.h | 20 ++++++++++
 io_uring/io_uring.c            |  1 +
 io_uring/rw.c                  |  2 +-
 io_uring/uring_cmd.c           |  2 +-
 security/selinux/hooks.c       |  2 +-
 security/smack/smack_lsm.c     |  2 +-
 11 files changed, 110 insertions(+), 94 deletions(-)
 create mode 100644 include/linux/io_uring/cmd.h

(limited to 'include/linux')

diff --git a/MAINTAINERS b/MAINTAINERS
index e2c6187a3ac8..eb404a6b4a11 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -11102,6 +11102,7 @@ L:	io-uring@vger.kernel.org
 S:	Maintained
 T:	git git://git.kernel.dk/linux-block
 T:	git git://git.kernel.dk/liburing
+F:	include/linux/io_uring/
 F:	include/linux/io_uring.h
 F:	include/linux/io_uring_types.h
 F:	include/trace/events/io_uring.h
diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c
index 83600b45e12a..909377068a87 100644
--- a/drivers/block/ublk_drv.c
+++ b/drivers/block/ublk_drv.c
@@ -36,7 +36,7 @@
 #include <linux/sched/mm.h>
 #include <linux/uaccess.h>
 #include <linux/cdev.h>
-#include <linux/io_uring.h>
+#include <linux/io_uring/cmd.h>
 #include <linux/blk-mq.h>
 #include <linux/delay.h>
 #include <linux/mm.h>
diff --git a/drivers/nvme/host/ioctl.c b/drivers/nvme/host/ioctl.c
index 4939ed35638f..276788bbb96d 100644
--- a/drivers/nvme/host/ioctl.c
+++ b/drivers/nvme/host/ioctl.c
@@ -5,7 +5,7 @@
  */
 #include <linux/ptrace.h>	/* for force_successful_syscall_return */
 #include <linux/nvme_ioctl.h>
-#include <linux/io_uring.h>
+#include <linux/io_uring/cmd.h>
 #include "nvme.h"
 
 enum {
diff --git a/include/linux/io_uring.h b/include/linux/io_uring.h
index aefb73eeeebf..d8fc93492dc5 100644
--- a/include/linux/io_uring.h
+++ b/include/linux/io_uring.h
@@ -6,71 +6,13 @@
 #include <linux/xarray.h>
 #include <uapi/linux/io_uring.h>
 
-enum io_uring_cmd_flags {
-	IO_URING_F_COMPLETE_DEFER	= 1,
-	IO_URING_F_UNLOCKED		= 2,
-	/* the request is executed from poll, it should not be freed */
-	IO_URING_F_MULTISHOT		= 4,
-	/* executed by io-wq */
-	IO_URING_F_IOWQ			= 8,
-	/* int's last bit, sign checks are usually faster than a bit test */
-	IO_URING_F_NONBLOCK		= INT_MIN,
-
-	/* ctx state flags, for URING_CMD */
-	IO_URING_F_SQE128		= (1 << 8),
-	IO_URING_F_CQE32		= (1 << 9),
-	IO_URING_F_IOPOLL		= (1 << 10),
-
-	/* set when uring wants to cancel a previously issued command */
-	IO_URING_F_CANCEL		= (1 << 11),
-	IO_URING_F_COMPAT		= (1 << 12),
-};
-
-/* only top 8 bits of sqe->uring_cmd_flags for kernel internal use */
-#define IORING_URING_CMD_CANCELABLE	(1U << 30)
-#define IORING_URING_CMD_POLLED		(1U << 31)
-
-struct io_uring_cmd {
-	struct file	*file;
-	const struct io_uring_sqe *sqe;
-	union {
-		/* callback to defer completions to task context */
-		void (*task_work_cb)(struct io_uring_cmd *cmd, unsigned);
-		/* used for polled completion */
-		void *cookie;
-	};
-	u32		cmd_op;
-	u32		flags;
-	u8		pdu[32]; /* available inline for free use */
-};
-
-static inline const void *io_uring_sqe_cmd(const struct io_uring_sqe *sqe)
-{
-	return sqe->cmd;
-}
-
 #if defined(CONFIG_IO_URING)
-int io_uring_cmd_import_fixed(u64 ubuf, unsigned long len, int rw,
-			      struct iov_iter *iter, void *ioucmd);
-void io_uring_cmd_done(struct io_uring_cmd *cmd, ssize_t ret, ssize_t res2,
-			unsigned issue_flags);
 struct sock *io_uring_get_socket(struct file *file);
 void __io_uring_cancel(bool cancel_all);
 void __io_uring_free(struct task_struct *tsk);
 void io_uring_unreg_ringfd(void);
 const char *io_uring_get_opcode(u8 opcode);
-void __io_uring_cmd_do_in_task(struct io_uring_cmd *ioucmd,
-			    void (*task_work_cb)(struct io_uring_cmd *, unsigned),
-			    unsigned flags);
-/* users should follow semantics of IOU_F_TWQ_LAZY_WAKE */
-void io_uring_cmd_do_in_task_lazy(struct io_uring_cmd *ioucmd,
-			void (*task_work_cb)(struct io_uring_cmd *, unsigned));
-
-static inline void io_uring_cmd_complete_in_task(struct io_uring_cmd *ioucmd,
-			void (*task_work_cb)(struct io_uring_cmd *, unsigned))
-{
-	__io_uring_cmd_do_in_task(ioucmd, task_work_cb, 0);
-}
+int io_uring_cmd_sock(struct io_uring_cmd *cmd, unsigned int issue_flags);
 
 static inline void io_uring_files_cancel(void)
 {
@@ -89,28 +31,7 @@ static inline void io_uring_free(struct task_struct *tsk)
 	if (tsk->io_uring)
 		__io_uring_free(tsk);
 }
-int io_uring_cmd_sock(struct io_uring_cmd *cmd, unsigned int issue_flags);
-void io_uring_cmd_mark_cancelable(struct io_uring_cmd *cmd,
-		unsigned int issue_flags);
-struct task_struct *io_uring_cmd_get_task(struct io_uring_cmd *cmd);
 #else
-static inline int io_uring_cmd_import_fixed(u64 ubuf, unsigned long len, int rw,
-			      struct iov_iter *iter, void *ioucmd)
-{
-	return -EOPNOTSUPP;
-}
-static inline void io_uring_cmd_done(struct io_uring_cmd *cmd, ssize_t ret,
-		ssize_t ret2, unsigned issue_flags)
-{
-}
-static inline void io_uring_cmd_complete_in_task(struct io_uring_cmd *ioucmd,
-			void (*task_work_cb)(struct io_uring_cmd *, unsigned))
-{
-}
-static inline void io_uring_cmd_do_in_task_lazy(struct io_uring_cmd *ioucmd,
-			void (*task_work_cb)(struct io_uring_cmd *, unsigned))
-{
-}
 static inline struct sock *io_uring_get_socket(struct file *file)
 {
 	return NULL;
@@ -133,14 +54,6 @@ static inline int io_uring_cmd_sock(struct io_uring_cmd *cmd,
 {
 	return -EOPNOTSUPP;
 }
-static inline void io_uring_cmd_mark_cancelable(struct io_uring_cmd *cmd,
-		unsigned int issue_flags)
-{
-}
-static inline struct task_struct *io_uring_cmd_get_task(struct io_uring_cmd *cmd)
-{
-	return NULL;
-}
 #endif
 
 #endif
diff --git a/include/linux/io_uring/cmd.h b/include/linux/io_uring/cmd.h
new file mode 100644
index 000000000000..62fcfaf6fcc9
--- /dev/null
+++ b/include/linux/io_uring/cmd.h
@@ -0,0 +1,81 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+#ifndef _LINUX_IO_URING_CMD_H
+#define _LINUX_IO_URING_CMD_H
+
+#include <uapi/linux/io_uring.h>
+#include <linux/io_uring_types.h>
+
+/* only top 8 bits of sqe->uring_cmd_flags for kernel internal use */
+#define IORING_URING_CMD_CANCELABLE	(1U << 30)
+#define IORING_URING_CMD_POLLED		(1U << 31)
+
+struct io_uring_cmd {
+	struct file	*file;
+	const struct io_uring_sqe *sqe;
+	union {
+		/* callback to defer completions to task context */
+		void (*task_work_cb)(struct io_uring_cmd *cmd, unsigned);
+		/* used for polled completion */
+		void *cookie;
+	};
+	u32		cmd_op;
+	u32		flags;
+	u8		pdu[32]; /* available inline for free use */
+};
+
+static inline const void *io_uring_sqe_cmd(const struct io_uring_sqe *sqe)
+{
+	return sqe->cmd;
+}
+
+#if defined(CONFIG_IO_URING)
+int io_uring_cmd_import_fixed(u64 ubuf, unsigned long len, int rw,
+			      struct iov_iter *iter, void *ioucmd);
+void io_uring_cmd_done(struct io_uring_cmd *cmd, ssize_t ret, ssize_t res2,
+			unsigned issue_flags);
+void __io_uring_cmd_do_in_task(struct io_uring_cmd *ioucmd,
+			    void (*task_work_cb)(struct io_uring_cmd *, unsigned),
+			    unsigned flags);
+/* users should follow semantics of IOU_F_TWQ_LAZY_WAKE */
+void io_uring_cmd_do_in_task_lazy(struct io_uring_cmd *ioucmd,
+			void (*task_work_cb)(struct io_uring_cmd *, unsigned));
+
+static inline void io_uring_cmd_complete_in_task(struct io_uring_cmd *ioucmd,
+			void (*task_work_cb)(struct io_uring_cmd *, unsigned))
+{
+	__io_uring_cmd_do_in_task(ioucmd, task_work_cb, 0);
+}
+
+void io_uring_cmd_mark_cancelable(struct io_uring_cmd *cmd,
+		unsigned int issue_flags);
+struct task_struct *io_uring_cmd_get_task(struct io_uring_cmd *cmd);
+
+#else
+static inline int io_uring_cmd_import_fixed(u64 ubuf, unsigned long len, int rw,
+			      struct iov_iter *iter, void *ioucmd)
+{
+	return -EOPNOTSUPP;
+}
+static inline void io_uring_cmd_done(struct io_uring_cmd *cmd, ssize_t ret,
+		ssize_t ret2, unsigned issue_flags)
+{
+}
+static inline void io_uring_cmd_complete_in_task(struct io_uring_cmd *ioucmd,
+			void (*task_work_cb)(struct io_uring_cmd *, unsigned))
+{
+}
+static inline void io_uring_cmd_do_in_task_lazy(struct io_uring_cmd *ioucmd,
+			void (*task_work_cb)(struct io_uring_cmd *, unsigned))
+{
+}
+static inline void io_uring_cmd_mark_cancelable(struct io_uring_cmd *cmd,
+		unsigned int issue_flags)
+{
+}
+static inline struct task_struct *io_uring_cmd_get_task(struct io_uring_cmd *cmd)
+{
+	return NULL;
+}
+#endif
+
+#endif /* _LINUX_IO_URING_CMD_H */
diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h
index 805bb635cdf5..8c807bcc8b2b 100644
--- a/include/linux/io_uring_types.h
+++ b/include/linux/io_uring_types.h
@@ -7,6 +7,26 @@
 #include <linux/llist.h>
 #include <uapi/linux/io_uring.h>
 
+enum io_uring_cmd_flags {
+	IO_URING_F_COMPLETE_DEFER	= 1,
+	IO_URING_F_UNLOCKED		= 2,
+	/* the request is executed from poll, it should not be freed */
+	IO_URING_F_MULTISHOT		= 4,
+	/* executed by io-wq */
+	IO_URING_F_IOWQ			= 8,
+	/* int's last bit, sign checks are usually faster than a bit test */
+	IO_URING_F_NONBLOCK		= INT_MIN,
+
+	/* ctx state flags, for URING_CMD */
+	IO_URING_F_SQE128		= (1 << 8),
+	IO_URING_F_CQE32		= (1 << 9),
+	IO_URING_F_IOPOLL		= (1 << 10),
+
+	/* set when uring wants to cancel a previously issued command */
+	IO_URING_F_CANCEL		= (1 << 11),
+	IO_URING_F_COMPAT		= (1 << 12),
+};
+
 struct io_wq_work_node {
 	struct io_wq_work_node *next;
 };
diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index 0c269652ad26..5c946f3ed91c 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -70,6 +70,7 @@
 #include <linux/fadvise.h>
 #include <linux/task_work.h>
 #include <linux/io_uring.h>
+#include <linux/io_uring/cmd.h>
 #include <linux/audit.h>
 #include <linux/security.h>
 #include <asm/shmparam.h>
diff --git a/io_uring/rw.c b/io_uring/rw.c
index 64390d4e20c1..4943d683508b 100644
--- a/io_uring/rw.c
+++ b/io_uring/rw.c
@@ -10,7 +10,7 @@
 #include <linux/poll.h>
 #include <linux/nospec.h>
 #include <linux/compat.h>
-#include <linux/io_uring.h>
+#include <linux/io_uring/cmd.h>
 
 #include <uapi/linux/io_uring.h>
 
diff --git a/io_uring/uring_cmd.c b/io_uring/uring_cmd.c
index acbc2924ecd2..4ed0c66e3aae 100644
--- a/io_uring/uring_cmd.c
+++ b/io_uring/uring_cmd.c
@@ -2,7 +2,7 @@
 #include <linux/kernel.h>
 #include <linux/errno.h>
 #include <linux/file.h>
-#include <linux/io_uring.h>
+#include <linux/io_uring/cmd.h>
 #include <linux/security.h>
 #include <linux/nospec.h>
 
diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c
index feda711c6b7b..17ec5e109aec 100644
--- a/security/selinux/hooks.c
+++ b/security/selinux/hooks.c
@@ -91,7 +91,7 @@
 #include <uapi/linux/mount.h>
 #include <linux/fsnotify.h>
 #include <linux/fanotify.h>
-#include <linux/io_uring.h>
+#include <linux/io_uring/cmd.h>
 
 #include "avc.h"
 #include "objsec.h"
diff --git a/security/smack/smack_lsm.c b/security/smack/smack_lsm.c
index 65130a791f57..2cdaa46088a0 100644
--- a/security/smack/smack_lsm.c
+++ b/security/smack/smack_lsm.c
@@ -42,7 +42,7 @@
 #include <linux/fs_context.h>
 #include <linux/fs_parser.h>
 #include <linux/watch_queue.h>
-#include <linux/io_uring.h>
+#include <linux/io_uring/cmd.h>
 #include "smack.h"
 
 #define TRANS_TRUE	"TRUE"
-- 
cgit v1.2.3


From 6b04a3737057ddfed396c954f9e4be4fe6d53c62 Mon Sep 17 00:00:00 2001
From: Pavel Begunkov <asml.silence@gmail.com>
Date: Fri, 1 Dec 2023 00:57:36 +0000
Subject: io_uring/cmd: inline io_uring_cmd_do_in_task_lazy

Now as we can easily include io_uring_types.h, move IOU_F_TWQ_LAZY_WAKE
and inline io_uring_cmd_do_in_task_lazy().

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Reviewed-by: Ming Lei <ming.lei@redhat.com>
Link: https://lore.kernel.org/r/2ec9fb31dd192d1c5cf26d0a2dec5657d88a8e48.1701391955.git.asml.silence@gmail.com
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/io_uring/cmd.h   | 31 ++++++++++++++++---------------
 include/linux/io_uring_types.h | 11 +++++++++++
 io_uring/io_uring.h            | 10 ----------
 io_uring/uring_cmd.c           |  7 -------
 4 files changed, 27 insertions(+), 32 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/io_uring/cmd.h b/include/linux/io_uring/cmd.h
index 62fcfaf6fcc9..ee9b3bc3a4af 100644
--- a/include/linux/io_uring/cmd.h
+++ b/include/linux/io_uring/cmd.h
@@ -36,15 +36,6 @@ void io_uring_cmd_done(struct io_uring_cmd *cmd, ssize_t ret, ssize_t res2,
 void __io_uring_cmd_do_in_task(struct io_uring_cmd *ioucmd,
 			    void (*task_work_cb)(struct io_uring_cmd *, unsigned),
 			    unsigned flags);
-/* users should follow semantics of IOU_F_TWQ_LAZY_WAKE */
-void io_uring_cmd_do_in_task_lazy(struct io_uring_cmd *ioucmd,
-			void (*task_work_cb)(struct io_uring_cmd *, unsigned));
-
-static inline void io_uring_cmd_complete_in_task(struct io_uring_cmd *ioucmd,
-			void (*task_work_cb)(struct io_uring_cmd *, unsigned))
-{
-	__io_uring_cmd_do_in_task(ioucmd, task_work_cb, 0);
-}
 
 void io_uring_cmd_mark_cancelable(struct io_uring_cmd *cmd,
 		unsigned int issue_flags);
@@ -60,12 +51,9 @@ static inline void io_uring_cmd_done(struct io_uring_cmd *cmd, ssize_t ret,
 		ssize_t ret2, unsigned issue_flags)
 {
 }
-static inline void io_uring_cmd_complete_in_task(struct io_uring_cmd *ioucmd,
-			void (*task_work_cb)(struct io_uring_cmd *, unsigned))
-{
-}
-static inline void io_uring_cmd_do_in_task_lazy(struct io_uring_cmd *ioucmd,
-			void (*task_work_cb)(struct io_uring_cmd *, unsigned))
+static inline void __io_uring_cmd_do_in_task(struct io_uring_cmd *ioucmd,
+			    void (*task_work_cb)(struct io_uring_cmd *, unsigned),
+			    unsigned flags)
 {
 }
 static inline void io_uring_cmd_mark_cancelable(struct io_uring_cmd *cmd,
@@ -78,4 +66,17 @@ static inline struct task_struct *io_uring_cmd_get_task(struct io_uring_cmd *cmd
 }
 #endif
 
+/* users must follow the IOU_F_TWQ_LAZY_WAKE semantics */
+static inline void io_uring_cmd_do_in_task_lazy(struct io_uring_cmd *ioucmd,
+			void (*task_work_cb)(struct io_uring_cmd *, unsigned))
+{
+	__io_uring_cmd_do_in_task(ioucmd, task_work_cb, IOU_F_TWQ_LAZY_WAKE);
+}
+
+static inline void io_uring_cmd_complete_in_task(struct io_uring_cmd *ioucmd,
+			void (*task_work_cb)(struct io_uring_cmd *, unsigned))
+{
+	__io_uring_cmd_do_in_task(ioucmd, task_work_cb, 0);
+}
+
 #endif /* _LINUX_IO_URING_CMD_H */
diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h
index 8c807bcc8b2b..bebab36abce8 100644
--- a/include/linux/io_uring_types.h
+++ b/include/linux/io_uring_types.h
@@ -7,6 +7,17 @@
 #include <linux/llist.h>
 #include <uapi/linux/io_uring.h>
 
+enum {
+	/*
+	 * A hint to not wake right away but delay until there are enough of
+	 * tw's queued to match the number of CQEs the task is waiting for.
+	 *
+	 * Must not be used wirh requests generating more than one CQE.
+	 * It's also ignored unless IORING_SETUP_DEFER_TASKRUN is set.
+	 */
+	IOU_F_TWQ_LAZY_WAKE			= 1,
+};
+
 enum io_uring_cmd_flags {
 	IO_URING_F_COMPLETE_DEFER	= 1,
 	IO_URING_F_UNLOCKED		= 2,
diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h
index ed84f2737b3a..ef783a2444ac 100644
--- a/io_uring/io_uring.h
+++ b/io_uring/io_uring.h
@@ -15,16 +15,6 @@
 #include <trace/events/io_uring.h>
 #endif
 
-enum {
-	/*
-	 * A hint to not wake right away but delay until there are enough of
-	 * tw's queued to match the number of CQEs the task is waiting for.
-	 *
-	 * Must not be used wirh requests generating more than one CQE.
-	 * It's also ignored unless IORING_SETUP_DEFER_TASKRUN is set.
-	 */
-	IOU_F_TWQ_LAZY_WAKE			= 1,
-};
 
 enum {
 	IOU_OK			= 0,
diff --git a/io_uring/uring_cmd.c b/io_uring/uring_cmd.c
index 4ed0c66e3aae..94e9de1c24aa 100644
--- a/io_uring/uring_cmd.c
+++ b/io_uring/uring_cmd.c
@@ -78,13 +78,6 @@ void __io_uring_cmd_do_in_task(struct io_uring_cmd *ioucmd,
 }
 EXPORT_SYMBOL_GPL(__io_uring_cmd_do_in_task);
 
-void io_uring_cmd_do_in_task_lazy(struct io_uring_cmd *ioucmd,
-			void (*task_work_cb)(struct io_uring_cmd *, unsigned))
-{
-	__io_uring_cmd_do_in_task(ioucmd, task_work_cb, IOU_F_TWQ_LAZY_WAKE);
-}
-EXPORT_SYMBOL_GPL(io_uring_cmd_do_in_task_lazy);
-
 static inline void io_req_set_cqe32_extra(struct io_kiocb *req,
 					  u64 extra1, u64 extra2)
 {
-- 
cgit v1.2.3


From 055c15626a45b1ebc9f2f34981e705e1af171236 Mon Sep 17 00:00:00 2001
From: Pavel Begunkov <asml.silence@gmail.com>
Date: Fri, 1 Dec 2023 00:57:37 +0000
Subject: io_uring/cmd: inline io_uring_cmd_get_task

With io_uring_types.h we see all required definitions to inline
io_uring_cmd_get_task().

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Reviewed-by: Ming Lei <ming.lei@redhat.com>
Link: https://lore.kernel.org/r/aa8e317f09e651a5f3e72f8c0ad3902084c1f930.1701391955.git.asml.silence@gmail.com
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/io_uring/cmd.h | 10 +++++-----
 io_uring/uring_cmd.c         |  6 ------
 2 files changed, 5 insertions(+), 11 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/io_uring/cmd.h b/include/linux/io_uring/cmd.h
index ee9b3bc3a4af..d69b4038aa3e 100644
--- a/include/linux/io_uring/cmd.h
+++ b/include/linux/io_uring/cmd.h
@@ -39,7 +39,6 @@ void __io_uring_cmd_do_in_task(struct io_uring_cmd *ioucmd,
 
 void io_uring_cmd_mark_cancelable(struct io_uring_cmd *cmd,
 		unsigned int issue_flags);
-struct task_struct *io_uring_cmd_get_task(struct io_uring_cmd *cmd);
 
 #else
 static inline int io_uring_cmd_import_fixed(u64 ubuf, unsigned long len, int rw,
@@ -60,10 +59,6 @@ static inline void io_uring_cmd_mark_cancelable(struct io_uring_cmd *cmd,
 		unsigned int issue_flags)
 {
 }
-static inline struct task_struct *io_uring_cmd_get_task(struct io_uring_cmd *cmd)
-{
-	return NULL;
-}
 #endif
 
 /* users must follow the IOU_F_TWQ_LAZY_WAKE semantics */
@@ -79,4 +74,9 @@ static inline void io_uring_cmd_complete_in_task(struct io_uring_cmd *ioucmd,
 	__io_uring_cmd_do_in_task(ioucmd, task_work_cb, 0);
 }
 
+static inline struct task_struct *io_uring_cmd_get_task(struct io_uring_cmd *cmd)
+{
+	return cmd_to_io_kiocb(cmd)->task;
+}
+
 #endif /* _LINUX_IO_URING_CMD_H */
diff --git a/io_uring/uring_cmd.c b/io_uring/uring_cmd.c
index 94e9de1c24aa..34030583b9b2 100644
--- a/io_uring/uring_cmd.c
+++ b/io_uring/uring_cmd.c
@@ -52,12 +52,6 @@ void io_uring_cmd_mark_cancelable(struct io_uring_cmd *cmd,
 }
 EXPORT_SYMBOL_GPL(io_uring_cmd_mark_cancelable);
 
-struct task_struct *io_uring_cmd_get_task(struct io_uring_cmd *cmd)
-{
-	return cmd_to_io_kiocb(cmd)->task;
-}
-EXPORT_SYMBOL_GPL(io_uring_cmd_get_task);
-
 static void io_uring_cmd_work(struct io_kiocb *req, struct io_tw_state *ts)
 {
 	struct io_uring_cmd *ioucmd = io_kiocb_to_cmd(req, struct io_uring_cmd);
-- 
cgit v1.2.3


From 0f292086c22b43202daffc14b585d3b54b9a1206 Mon Sep 17 00:00:00 2001
From: Amir Goldstein <amir73il@gmail.com>
Date: Tue, 12 Dec 2023 11:44:36 +0200
Subject: splice: return type ssize_t from all helpers

Not sure why some splice helpers return long, maybe historic reasons.
Change them all to return ssize_t to conform to the splice methods and
to the rest of the helpers.

Suggested-by: Christian Brauner <brauner@kernel.org>
Link: https://lore.kernel.org/r/20231208-horchen-helium-d3ec1535ede5@brauner/
Signed-off-by: Amir Goldstein <amir73il@gmail.com>
Link: https://lore.kernel.org/r/20231212094440.250945-2-amir73il@gmail.com
Reviewed-by: Jan Kara <jack@suse.cz>
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 fs/internal.h          |   8 ++--
 fs/overlayfs/copy_up.c |   2 +-
 fs/read_write.c        |   2 +-
 fs/splice.c            | 104 +++++++++++++++++++++++++------------------------
 include/linux/splice.h |  43 ++++++++++----------
 io_uring/splice.c      |   4 +-
 6 files changed, 82 insertions(+), 81 deletions(-)

(limited to 'include/linux')

diff --git a/fs/internal.h b/fs/internal.h
index 58e43341aebf..4eaa758e3bfd 100644
--- a/fs/internal.h
+++ b/fs/internal.h
@@ -243,10 +243,10 @@ int do_statx(int dfd, struct filename *filename, unsigned int flags,
 /*
  * fs/splice.c:
  */
-long splice_file_to_pipe(struct file *in,
-			 struct pipe_inode_info *opipe,
-			 loff_t *offset,
-			 size_t len, unsigned int flags);
+ssize_t splice_file_to_pipe(struct file *in,
+			    struct pipe_inode_info *opipe,
+			    loff_t *offset,
+			    size_t len, unsigned int flags);
 
 /*
  * fs/xattr.c:
diff --git a/fs/overlayfs/copy_up.c b/fs/overlayfs/copy_up.c
index 2f587ee0b334..070d36ba0f09 100644
--- a/fs/overlayfs/copy_up.c
+++ b/fs/overlayfs/copy_up.c
@@ -285,7 +285,7 @@ static int ovl_copy_up_file(struct ovl_fs *ofs, struct dentry *dentry,
 
 	while (len) {
 		size_t this_len = OVL_COPY_UP_CHUNK_SIZE;
-		long bytes;
+		ssize_t bytes;
 
 		if (len < this_len)
 			this_len = len;
diff --git a/fs/read_write.c b/fs/read_write.c
index 01a14570015b..7783b8522693 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -1214,7 +1214,7 @@ COMPAT_SYSCALL_DEFINE6(pwritev2, compat_ulong_t, fd,
 #endif /* CONFIG_COMPAT */
 
 static ssize_t do_sendfile(int out_fd, int in_fd, loff_t *ppos,
-		  	   size_t count, loff_t max)
+			   size_t count, loff_t max)
 {
 	struct fd in, out;
 	struct inode *in_inode, *out_inode;
diff --git a/fs/splice.c b/fs/splice.c
index 7cda013e5a1e..c39d1abf23c8 100644
--- a/fs/splice.c
+++ b/fs/splice.c
@@ -201,7 +201,8 @@ ssize_t splice_to_pipe(struct pipe_inode_info *pipe,
 	unsigned int tail = pipe->tail;
 	unsigned int head = pipe->head;
 	unsigned int mask = pipe->ring_size - 1;
-	int ret = 0, page_nr = 0;
+	ssize_t ret = 0;
+	int page_nr = 0;
 
 	if (!spd_pages)
 		return 0;
@@ -932,8 +933,8 @@ static int warn_unsupported(struct file *file, const char *op)
 /*
  * Attempt to initiate a splice from pipe to file.
  */
-static long do_splice_from(struct pipe_inode_info *pipe, struct file *out,
-			   loff_t *ppos, size_t len, unsigned int flags)
+static ssize_t do_splice_from(struct pipe_inode_info *pipe, struct file *out,
+			      loff_t *ppos, size_t len, unsigned int flags)
 {
 	if (unlikely(!out->f_op->splice_write))
 		return warn_unsupported(out, "write");
@@ -955,9 +956,9 @@ static void do_splice_eof(struct splice_desc *sd)
  * Callers already called rw_verify_area() on the entire range.
  * No need to call it for sub ranges.
  */
-static long do_splice_read(struct file *in, loff_t *ppos,
-			   struct pipe_inode_info *pipe, size_t len,
-			   unsigned int flags)
+static ssize_t do_splice_read(struct file *in, loff_t *ppos,
+			      struct pipe_inode_info *pipe, size_t len,
+			      unsigned int flags)
 {
 	unsigned int p_space;
 
@@ -999,11 +1000,11 @@ static long do_splice_read(struct file *in, loff_t *ppos,
  * If successful, it returns the amount of data spliced, 0 if it hit the EOF or
  * a hole and a negative error code otherwise.
  */
-long vfs_splice_read(struct file *in, loff_t *ppos,
-		     struct pipe_inode_info *pipe, size_t len,
-		     unsigned int flags)
+ssize_t vfs_splice_read(struct file *in, loff_t *ppos,
+			struct pipe_inode_info *pipe, size_t len,
+			unsigned int flags)
 {
-	int ret;
+	ssize_t ret;
 
 	ret = rw_verify_area(READ, in, ppos, len);
 	if (unlikely(ret < 0))
@@ -1030,7 +1031,7 @@ ssize_t splice_direct_to_actor(struct file *in, struct splice_desc *sd,
 			       splice_direct_actor *actor)
 {
 	struct pipe_inode_info *pipe;
-	long ret, bytes;
+	ssize_t ret, bytes;
 	size_t len;
 	int i, flags, more;
 
@@ -1181,10 +1182,10 @@ static void direct_file_splice_eof(struct splice_desc *sd)
 		file->f_op->splice_eof(file);
 }
 
-static long do_splice_direct_actor(struct file *in, loff_t *ppos,
-				   struct file *out, loff_t *opos,
-				   size_t len, unsigned int flags,
-				   splice_direct_actor *actor)
+static ssize_t do_splice_direct_actor(struct file *in, loff_t *ppos,
+				      struct file *out, loff_t *opos,
+				      size_t len, unsigned int flags,
+				      splice_direct_actor *actor)
 {
 	struct splice_desc sd = {
 		.len		= len,
@@ -1195,7 +1196,7 @@ static long do_splice_direct_actor(struct file *in, loff_t *ppos,
 		.splice_eof	= direct_file_splice_eof,
 		.opos		= opos,
 	};
-	long ret;
+	ssize_t ret;
 
 	if (unlikely(!(out->f_mode & FMODE_WRITE)))
 		return -EBADF;
@@ -1226,8 +1227,8 @@ static long do_splice_direct_actor(struct file *in, loff_t *ppos,
  *
  * Callers already called rw_verify_area() on the entire range.
  */
-long do_splice_direct(struct file *in, loff_t *ppos, struct file *out,
-		      loff_t *opos, size_t len, unsigned int flags)
+ssize_t do_splice_direct(struct file *in, loff_t *ppos, struct file *out,
+			 loff_t *opos, size_t len, unsigned int flags)
 {
 	return do_splice_direct_actor(in, ppos, out, opos, len, flags,
 				      direct_splice_actor);
@@ -1249,8 +1250,8 @@ EXPORT_SYMBOL(do_splice_direct);
  *
  * Callers already called rw_verify_area() on the entire range.
  */
-long splice_file_range(struct file *in, loff_t *ppos, struct file *out,
-		       loff_t *opos, size_t len)
+ssize_t splice_file_range(struct file *in, loff_t *ppos, struct file *out,
+			  loff_t *opos, size_t len)
 {
 	lockdep_assert(file_write_started(out));
 
@@ -1280,12 +1281,12 @@ static int splice_pipe_to_pipe(struct pipe_inode_info *ipipe,
 			       struct pipe_inode_info *opipe,
 			       size_t len, unsigned int flags);
 
-long splice_file_to_pipe(struct file *in,
-			 struct pipe_inode_info *opipe,
-			 loff_t *offset,
-			 size_t len, unsigned int flags)
+ssize_t splice_file_to_pipe(struct file *in,
+			    struct pipe_inode_info *opipe,
+			    loff_t *offset,
+			    size_t len, unsigned int flags)
 {
-	long ret;
+	ssize_t ret;
 
 	pipe_lock(opipe);
 	ret = wait_for_space(opipe, flags);
@@ -1300,13 +1301,13 @@ long splice_file_to_pipe(struct file *in,
 /*
  * Determine where to splice to/from.
  */
-long do_splice(struct file *in, loff_t *off_in, struct file *out,
-	       loff_t *off_out, size_t len, unsigned int flags)
+ssize_t do_splice(struct file *in, loff_t *off_in, struct file *out,
+		  loff_t *off_out, size_t len, unsigned int flags)
 {
 	struct pipe_inode_info *ipipe;
 	struct pipe_inode_info *opipe;
 	loff_t offset;
-	long ret;
+	ssize_t ret;
 
 	if (unlikely(!(in->f_mode & FMODE_READ) ||
 		     !(out->f_mode & FMODE_WRITE)))
@@ -1397,14 +1398,14 @@ long do_splice(struct file *in, loff_t *off_in, struct file *out,
 	return ret;
 }
 
-static long __do_splice(struct file *in, loff_t __user *off_in,
-			struct file *out, loff_t __user *off_out,
-			size_t len, unsigned int flags)
+static ssize_t __do_splice(struct file *in, loff_t __user *off_in,
+			   struct file *out, loff_t __user *off_out,
+			   size_t len, unsigned int flags)
 {
 	struct pipe_inode_info *ipipe;
 	struct pipe_inode_info *opipe;
 	loff_t offset, *__off_in = NULL, *__off_out = NULL;
-	long ret;
+	ssize_t ret;
 
 	ipipe = get_pipe_info(in, true);
 	opipe = get_pipe_info(out, true);
@@ -1443,16 +1444,16 @@ static long __do_splice(struct file *in, loff_t __user *off_in,
 	return ret;
 }
 
-static int iter_to_pipe(struct iov_iter *from,
-			struct pipe_inode_info *pipe,
-			unsigned flags)
+static ssize_t iter_to_pipe(struct iov_iter *from,
+			    struct pipe_inode_info *pipe,
+			    unsigned int flags)
 {
 	struct pipe_buffer buf = {
 		.ops = &user_page_pipe_buf_ops,
 		.flags = flags
 	};
 	size_t total = 0;
-	int ret = 0;
+	ssize_t ret = 0;
 
 	while (iov_iter_count(from)) {
 		struct page *pages[16];
@@ -1501,8 +1502,8 @@ static int pipe_to_user(struct pipe_inode_info *pipe, struct pipe_buffer *buf,
  * For lack of a better implementation, implement vmsplice() to userspace
  * as a simple copy of the pipes pages to the user iov.
  */
-static long vmsplice_to_user(struct file *file, struct iov_iter *iter,
-			     unsigned int flags)
+static ssize_t vmsplice_to_user(struct file *file, struct iov_iter *iter,
+				unsigned int flags)
 {
 	struct pipe_inode_info *pipe = get_pipe_info(file, true);
 	struct splice_desc sd = {
@@ -1510,7 +1511,7 @@ static long vmsplice_to_user(struct file *file, struct iov_iter *iter,
 		.flags = flags,
 		.u.data = iter
 	};
-	long ret = 0;
+	ssize_t ret = 0;
 
 	if (!pipe)
 		return -EBADF;
@@ -1534,11 +1535,11 @@ static long vmsplice_to_user(struct file *file, struct iov_iter *iter,
  * as splice-from-memory, where the regular splice is splice-from-file (or
  * to file). In both cases the output is a pipe, naturally.
  */
-static long vmsplice_to_pipe(struct file *file, struct iov_iter *iter,
-			     unsigned int flags)
+static ssize_t vmsplice_to_pipe(struct file *file, struct iov_iter *iter,
+				unsigned int flags)
 {
 	struct pipe_inode_info *pipe;
-	long ret = 0;
+	ssize_t ret = 0;
 	unsigned buf_flag = 0;
 
 	if (flags & SPLICE_F_GIFT)
@@ -1634,7 +1635,7 @@ SYSCALL_DEFINE6(splice, int, fd_in, loff_t __user *, off_in,
 		size_t, len, unsigned int, flags)
 {
 	struct fd in, out;
-	long error;
+	ssize_t error;
 
 	if (unlikely(!len))
 		return 0;
@@ -1648,7 +1649,7 @@ SYSCALL_DEFINE6(splice, int, fd_in, loff_t __user *, off_in,
 		out = fdget(fd_out);
 		if (out.file) {
 			error = __do_splice(in.file, off_in, out.file, off_out,
-						len, flags);
+					    len, flags);
 			fdput(out);
 		}
 		fdput(in);
@@ -1871,15 +1872,15 @@ retry:
 /*
  * Link contents of ipipe to opipe.
  */
-static int link_pipe(struct pipe_inode_info *ipipe,
-		     struct pipe_inode_info *opipe,
-		     size_t len, unsigned int flags)
+static ssize_t link_pipe(struct pipe_inode_info *ipipe,
+			 struct pipe_inode_info *opipe,
+			 size_t len, unsigned int flags)
 {
 	struct pipe_buffer *ibuf, *obuf;
 	unsigned int i_head, o_head;
 	unsigned int i_tail, o_tail;
 	unsigned int i_mask, o_mask;
-	int ret = 0;
+	ssize_t ret = 0;
 
 	/*
 	 * Potential ABBA deadlock, work around it by ordering lock
@@ -1962,11 +1963,12 @@ static int link_pipe(struct pipe_inode_info *ipipe,
  * The 'flags' used are the SPLICE_F_* variants, currently the only
  * applicable one is SPLICE_F_NONBLOCK.
  */
-long do_tee(struct file *in, struct file *out, size_t len, unsigned int flags)
+ssize_t do_tee(struct file *in, struct file *out, size_t len,
+	       unsigned int flags)
 {
 	struct pipe_inode_info *ipipe = get_pipe_info(in, true);
 	struct pipe_inode_info *opipe = get_pipe_info(out, true);
-	int ret = -EINVAL;
+	ssize_t ret = -EINVAL;
 
 	if (unlikely(!(in->f_mode & FMODE_READ) ||
 		     !(out->f_mode & FMODE_WRITE)))
@@ -2003,7 +2005,7 @@ long do_tee(struct file *in, struct file *out, size_t len, unsigned int flags)
 SYSCALL_DEFINE4(tee, int, fdin, int, fdout, size_t, len, unsigned int, flags)
 {
 	struct fd in, out;
-	int error;
+	ssize_t error;
 
 	if (unlikely(flags & ~SPLICE_F_ALL))
 		return -EINVAL;
diff --git a/include/linux/splice.h b/include/linux/splice.h
index 49532d5dda52..068a8e8ffd73 100644
--- a/include/linux/splice.h
+++ b/include/linux/splice.h
@@ -68,31 +68,30 @@ typedef int (splice_actor)(struct pipe_inode_info *, struct pipe_buffer *,
 typedef int (splice_direct_actor)(struct pipe_inode_info *,
 				  struct splice_desc *);
 
-extern ssize_t splice_from_pipe(struct pipe_inode_info *, struct file *,
-				loff_t *, size_t, unsigned int,
-				splice_actor *);
-extern ssize_t __splice_from_pipe(struct pipe_inode_info *,
-				  struct splice_desc *, splice_actor *);
-extern ssize_t splice_to_pipe(struct pipe_inode_info *,
-			      struct splice_pipe_desc *);
-extern ssize_t add_to_pipe(struct pipe_inode_info *,
-			      struct pipe_buffer *);
-long vfs_splice_read(struct file *in, loff_t *ppos,
-		     struct pipe_inode_info *pipe, size_t len,
-		     unsigned int flags);
+ssize_t splice_from_pipe(struct pipe_inode_info *pipe, struct file *out,
+			 loff_t *ppos, size_t len, unsigned int flags,
+			 splice_actor *actor);
+ssize_t __splice_from_pipe(struct pipe_inode_info *pipe,
+			   struct splice_desc *sd, splice_actor *actor);
+ssize_t splice_to_pipe(struct pipe_inode_info *pipe,
+			      struct splice_pipe_desc *spd);
+ssize_t add_to_pipe(struct pipe_inode_info *pipe, struct pipe_buffer *buf);
+ssize_t vfs_splice_read(struct file *in, loff_t *ppos,
+			struct pipe_inode_info *pipe, size_t len,
+			unsigned int flags);
 ssize_t splice_direct_to_actor(struct file *file, struct splice_desc *sd,
 			       splice_direct_actor *actor);
-long do_splice(struct file *in, loff_t *off_in, struct file *out,
-	       loff_t *off_out, size_t len, unsigned int flags);
-long do_splice_direct(struct file *in, loff_t *ppos, struct file *out,
-		      loff_t *opos, size_t len, unsigned int flags);
-long splice_file_range(struct file *in, loff_t *ppos, struct file *out,
-		       loff_t *opos, size_t len);
+ssize_t do_splice(struct file *in, loff_t *off_in, struct file *out,
+		  loff_t *off_out, size_t len, unsigned int flags);
+ssize_t do_splice_direct(struct file *in, loff_t *ppos, struct file *out,
+			 loff_t *opos, size_t len, unsigned int flags);
+ssize_t splice_file_range(struct file *in, loff_t *ppos, struct file *out,
+			  loff_t *opos, size_t len);
 
-extern long do_tee(struct file *in, struct file *out, size_t len,
-		   unsigned int flags);
-extern ssize_t splice_to_socket(struct pipe_inode_info *pipe, struct file *out,
-				loff_t *ppos, size_t len, unsigned int flags);
+ssize_t do_tee(struct file *in, struct file *out, size_t len,
+	       unsigned int flags);
+ssize_t splice_to_socket(struct pipe_inode_info *pipe, struct file *out,
+			 loff_t *ppos, size_t len, unsigned int flags);
 
 /*
  * for dynamic pipe sizing
diff --git a/io_uring/splice.c b/io_uring/splice.c
index 7c4469e9540e..3b659cd23e9d 100644
--- a/io_uring/splice.c
+++ b/io_uring/splice.c
@@ -51,7 +51,7 @@ int io_tee(struct io_kiocb *req, unsigned int issue_flags)
 	struct file *out = sp->file_out;
 	unsigned int flags = sp->flags & ~SPLICE_F_FD_IN_FIXED;
 	struct file *in;
-	long ret = 0;
+	ssize_t ret = 0;
 
 	WARN_ON_ONCE(issue_flags & IO_URING_F_NONBLOCK);
 
@@ -92,7 +92,7 @@ int io_splice(struct io_kiocb *req, unsigned int issue_flags)
 	unsigned int flags = sp->flags & ~SPLICE_F_FD_IN_FIXED;
 	loff_t *poff_in, *poff_out;
 	struct file *in;
-	long ret = 0;
+	ssize_t ret = 0;
 
 	WARN_ON_ONCE(issue_flags & IO_URING_F_NONBLOCK);
 
-- 
cgit v1.2.3


From 705bcfcbde38b9dd4db00cd3deb0b98bddb0dd4a Mon Sep 17 00:00:00 2001
From: Amir Goldstein <amir73il@gmail.com>
Date: Tue, 12 Dec 2023 11:44:37 +0200
Subject: fs: use splice_copy_file_range() inline helper

generic_copy_file_range() is just a wrapper around splice_file_range(),
which caps the maximum copy length.

The only caller of splice_file_range(), namely __ceph_copy_file_range()
is already ready to cope with short copy.

Move the length capping into splice_file_range() and replace the exported
symbol generic_copy_file_range() with a simple inline helper.

Suggested-by: Christoph Hellwig <hch@lst.de>
Link: https://lore.kernel.org/linux-fsdevel/20231204083849.GC32438@lst.de/
Reviewed-by: Jan Kara <jack@suse.cz>
Signed-off-by: Amir Goldstein <amir73il@gmail.com>
Link: https://lore.kernel.org/r/20231212094440.250945-3-amir73il@gmail.com
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 fs/ceph/file.c         |  4 ++--
 fs/fuse/file.c         |  5 +++--
 fs/nfs/nfs4file.c      |  5 +++--
 fs/read_write.c        | 34 ----------------------------------
 fs/smb/client/cifsfs.c |  5 +++--
 fs/splice.c            |  7 ++++---
 include/linux/fs.h     |  3 ---
 include/linux/splice.h |  7 +++++++
 8 files changed, 22 insertions(+), 48 deletions(-)

(limited to 'include/linux')

diff --git a/fs/ceph/file.c b/fs/ceph/file.c
index f11de6e1f1c1..d380d9dad0e0 100644
--- a/fs/ceph/file.c
+++ b/fs/ceph/file.c
@@ -3090,8 +3090,8 @@ static ssize_t ceph_copy_file_range(struct file *src_file, loff_t src_off,
 				     len, flags);
 
 	if (ret == -EOPNOTSUPP || ret == -EXDEV)
-		ret = generic_copy_file_range(src_file, src_off, dst_file,
-					      dst_off, len, flags);
+		ret = splice_copy_file_range(src_file, src_off, dst_file,
+					     dst_off, len);
 	return ret;
 }
 
diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index 1cdb6327511e..7d0860ef63d0 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -19,6 +19,7 @@
 #include <linux/uio.h>
 #include <linux/fs.h>
 #include <linux/filelock.h>
+#include <linux/splice.h>
 
 static int fuse_send_open(struct fuse_mount *fm, u64 nodeid,
 			  unsigned int open_flags, int opcode,
@@ -3193,8 +3194,8 @@ static ssize_t fuse_copy_file_range(struct file *src_file, loff_t src_off,
 				     len, flags);
 
 	if (ret == -EOPNOTSUPP || ret == -EXDEV)
-		ret = generic_copy_file_range(src_file, src_off, dst_file,
-					      dst_off, len, flags);
+		ret = splice_copy_file_range(src_file, src_off, dst_file,
+					     dst_off, len);
 	return ret;
 }
 
diff --git a/fs/nfs/nfs4file.c b/fs/nfs/nfs4file.c
index 02788c3c85e5..e238abc78a13 100644
--- a/fs/nfs/nfs4file.c
+++ b/fs/nfs/nfs4file.c
@@ -10,6 +10,7 @@
 #include <linux/mount.h>
 #include <linux/nfs_fs.h>
 #include <linux/nfs_ssc.h>
+#include <linux/splice.h>
 #include "delegation.h"
 #include "internal.h"
 #include "iostat.h"
@@ -195,8 +196,8 @@ static ssize_t nfs4_copy_file_range(struct file *file_in, loff_t pos_in,
 	ret = __nfs4_copy_file_range(file_in, pos_in, file_out, pos_out, count,
 				     flags);
 	if (ret == -EOPNOTSUPP || ret == -EXDEV)
-		ret = generic_copy_file_range(file_in, pos_in, file_out,
-					      pos_out, count, flags);
+		ret = splice_copy_file_range(file_in, pos_in, file_out,
+					     pos_out, count);
 	return ret;
 }
 
diff --git a/fs/read_write.c b/fs/read_write.c
index 7783b8522693..e3abf603eaaf 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -1396,40 +1396,6 @@ COMPAT_SYSCALL_DEFINE4(sendfile64, int, out_fd, int, in_fd,
 }
 #endif
 
-/**
- * generic_copy_file_range - copy data between two files
- * @file_in:	file structure to read from
- * @pos_in:	file offset to read from
- * @file_out:	file structure to write data to
- * @pos_out:	file offset to write data to
- * @len:	amount of data to copy
- * @flags:	copy flags
- *
- * This is a generic filesystem helper to copy data from one file to another.
- * It has no constraints on the source or destination file owners - the files
- * can belong to different superblocks and different filesystem types. Short
- * copies are allowed.
- *
- * This should be called from the @file_out filesystem, as per the
- * ->copy_file_range() method.
- *
- * Returns the number of bytes copied or a negative error indicating the
- * failure.
- */
-
-ssize_t generic_copy_file_range(struct file *file_in, loff_t pos_in,
-				struct file *file_out, loff_t pos_out,
-				size_t len, unsigned int flags)
-{
-	/* May only be called from within ->copy_file_range() methods */
-	if (WARN_ON_ONCE(flags))
-		return -EINVAL;
-
-	return splice_file_range(file_in, &pos_in, file_out, &pos_out,
-				 min_t(size_t, len, MAX_RW_COUNT));
-}
-EXPORT_SYMBOL(generic_copy_file_range);
-
 /*
  * Performs necessary checks before doing a file copy
  *
diff --git a/fs/smb/client/cifsfs.c b/fs/smb/client/cifsfs.c
index ea3a7a668b45..ee461bf0ef63 100644
--- a/fs/smb/client/cifsfs.c
+++ b/fs/smb/client/cifsfs.c
@@ -25,6 +25,7 @@
 #include <linux/freezer.h>
 #include <linux/namei.h>
 #include <linux/random.h>
+#include <linux/splice.h>
 #include <linux/uuid.h>
 #include <linux/xattr.h>
 #include <uapi/linux/magic.h>
@@ -1362,8 +1363,8 @@ static ssize_t cifs_copy_file_range(struct file *src_file, loff_t off,
 	free_xid(xid);
 
 	if (rc == -EOPNOTSUPP || rc == -EXDEV)
-		rc = generic_copy_file_range(src_file, off, dst_file,
-					     destoff, len, flags);
+		rc = splice_copy_file_range(src_file, off, dst_file,
+					    destoff, len);
 	return rc;
 }
 
diff --git a/fs/splice.c b/fs/splice.c
index c39d1abf23c8..218e24b1ac40 100644
--- a/fs/splice.c
+++ b/fs/splice.c
@@ -1244,7 +1244,7 @@ EXPORT_SYMBOL(do_splice_direct);
  * @len:	number of bytes to splice
  *
  * Description:
- *    For use by generic_copy_file_range() and ->copy_file_range() methods.
+ *    For use by ->copy_file_range() methods.
  *    Like do_splice_direct(), but vfs_copy_file_range() already holds
  *    start_file_write() on @out file.
  *
@@ -1255,8 +1255,9 @@ ssize_t splice_file_range(struct file *in, loff_t *ppos, struct file *out,
 {
 	lockdep_assert(file_write_started(out));
 
-	return do_splice_direct_actor(in, ppos, out, opos, len, 0,
-				      splice_file_range_actor);
+	return do_splice_direct_actor(in, ppos, out, opos,
+				      min_t(size_t, len, MAX_RW_COUNT),
+				      0, splice_file_range_actor);
 }
 EXPORT_SYMBOL(splice_file_range);
 
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 04422a0eccdd..900d0cd55b50 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -2090,9 +2090,6 @@ extern ssize_t vfs_read(struct file *, char __user *, size_t, loff_t *);
 extern ssize_t vfs_write(struct file *, const char __user *, size_t, loff_t *);
 extern ssize_t vfs_copy_file_range(struct file *, loff_t , struct file *,
 				   loff_t, size_t, unsigned int);
-extern ssize_t generic_copy_file_range(struct file *file_in, loff_t pos_in,
-				       struct file *file_out, loff_t pos_out,
-				       size_t len, unsigned int flags);
 int __generic_remap_file_range_prep(struct file *file_in, loff_t pos_in,
 				    struct file *file_out, loff_t pos_out,
 				    loff_t *len, unsigned int remap_flags,
diff --git a/include/linux/splice.h b/include/linux/splice.h
index 068a8e8ffd73..9dec4861d09f 100644
--- a/include/linux/splice.h
+++ b/include/linux/splice.h
@@ -88,6 +88,13 @@ ssize_t do_splice_direct(struct file *in, loff_t *ppos, struct file *out,
 ssize_t splice_file_range(struct file *in, loff_t *ppos, struct file *out,
 			  loff_t *opos, size_t len);
 
+static inline long splice_copy_file_range(struct file *in, loff_t pos_in,
+					  struct file *out, loff_t pos_out,
+					  size_t len)
+{
+	return splice_file_range(in, &pos_in, out, &pos_out, len);
+}
+
 ssize_t do_tee(struct file *in, struct file *out, size_t len,
 	       unsigned int flags);
 ssize_t splice_to_socket(struct pipe_inode_info *pipe, struct file *out,
-- 
cgit v1.2.3


From 36e28c42187c95eb148873ffb059bfdcb8cdb75b Mon Sep 17 00:00:00 2001
From: Amir Goldstein <amir73il@gmail.com>
Date: Tue, 12 Dec 2023 11:44:38 +0200
Subject: fsnotify: split fsnotify_perm() into two hooks

We would like to make changes to the fsnotify access permission hook -
add file range arguments and add the pre modify event.

In preparation for these changes, split the fsnotify_perm() hook into
fsnotify_open_perm() and fsnotify_file_perm().

This is needed for fanotify "pre content" events.

Reviewed-by: Josef Bacik <josef@toxicpanda.com>
Reviewed-by: Jan Kara <jack@suse.cz>
Signed-off-by: Amir Goldstein <amir73il@gmail.com>
Link: https://lore.kernel.org/r/20231212094440.250945-4-amir73il@gmail.com
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 include/linux/fsnotify.h | 34 +++++++++++++++++++---------------
 security/security.c      |  4 ++--
 2 files changed, 21 insertions(+), 17 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/fsnotify.h b/include/linux/fsnotify.h
index bcb6609b54b3..926bb4461b9e 100644
--- a/include/linux/fsnotify.h
+++ b/include/linux/fsnotify.h
@@ -100,29 +100,33 @@ static inline int fsnotify_file(struct file *file, __u32 mask)
 	return fsnotify_parent(path->dentry, mask, path, FSNOTIFY_EVENT_PATH);
 }
 
-/* Simple call site for access decisions */
-static inline int fsnotify_perm(struct file *file, int mask)
+/*
+ * fsnotify_file_perm - permission hook before file access
+ */
+static inline int fsnotify_file_perm(struct file *file, int perm_mask)
 {
-	int ret;
-	__u32 fsnotify_mask = 0;
+	__u32 fsnotify_mask = FS_ACCESS_PERM;
 
-	if (!(mask & (MAY_READ | MAY_OPEN)))
+	if (!(perm_mask & MAY_READ))
 		return 0;
 
-	if (mask & MAY_OPEN) {
-		fsnotify_mask = FS_OPEN_PERM;
+	return fsnotify_file(file, fsnotify_mask);
+}
 
-		if (file->f_flags & __FMODE_EXEC) {
-			ret = fsnotify_file(file, FS_OPEN_EXEC_PERM);
+/*
+ * fsnotify_open_perm - permission hook before file open
+ */
+static inline int fsnotify_open_perm(struct file *file)
+{
+	int ret;
 
-			if (ret)
-				return ret;
-		}
-	} else if (mask & MAY_READ) {
-		fsnotify_mask = FS_ACCESS_PERM;
+	if (file->f_flags & __FMODE_EXEC) {
+		ret = fsnotify_file(file, FS_OPEN_EXEC_PERM);
+		if (ret)
+			return ret;
 	}
 
-	return fsnotify_file(file, fsnotify_mask);
+	return fsnotify_file(file, FS_OPEN_PERM);
 }
 
 /*
diff --git a/security/security.c b/security/security.c
index dcb3e7014f9b..d7f3703c5905 100644
--- a/security/security.c
+++ b/security/security.c
@@ -2586,7 +2586,7 @@ int security_file_permission(struct file *file, int mask)
 	if (ret)
 		return ret;
 
-	return fsnotify_perm(file, mask);
+	return fsnotify_file_perm(file, mask);
 }
 
 /**
@@ -2837,7 +2837,7 @@ int security_file_open(struct file *file)
 	if (ret)
 		return ret;
 
-	return fsnotify_perm(file, MAY_OPEN);
+	return fsnotify_open_perm(file);
 }
 
 /**
-- 
cgit v1.2.3


From cb383f06686734ef04daf63a4369566800717b7b Mon Sep 17 00:00:00 2001
From: Amir Goldstein <amir73il@gmail.com>
Date: Tue, 12 Dec 2023 11:44:39 +0200
Subject: fsnotify: assert that file_start_write() is not held in permission
 hooks

filesystem may be modified in the context of fanotify permission events
(e.g. by HSM service), so assert that sb freeze protection is not held.

If the assertion fails, then the following deadlock would be possible:

CPU0				CPU1			CPU2
-------------------------------------------------------------------------
file_start_write()#0
...
  fsnotify_perm()
    fanotify_get_response() =>	(read event and fill file)
				...
				...			freeze_super()
				...			  sb_wait_write()
				...
				vfs_write()
				  file_start_write()#1

This example demonstrates a use case of an hierarchical storage management
(HSM) service that uses fanotify permission events to fill the content of
a file before access, while a 3rd process starts fsfreeze.

This creates a circular dependeny:
  file_start_write()#0 => fanotify_get_response =>
    file_start_write()#1 =>
      sb_wait_write() =>
        file_end_write()#0

Where file_end_write()#0 can never be called and none of the threads can
make progress.

The assertion is checked for both MAY_READ and MAY_WRITE permission
hooks in preparation for a pre-modify permission event.

The assertion is not checked for an open permission event, because
do_open() takes mnt_want_write() in O_TRUNC case, meaning that it is not
safe to write to filesystem in the content of an open permission event.

Reviewed-by: Josef Bacik <josef@toxicpanda.com>
Reviewed-by: Jan Kara <jack@suse.cz>
Signed-off-by: Amir Goldstein <amir73il@gmail.com>
Link: https://lore.kernel.org/r/20231212094440.250945-5-amir73il@gmail.com
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 include/linux/fsnotify.h | 7 +++++++
 1 file changed, 7 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/fsnotify.h b/include/linux/fsnotify.h
index 926bb4461b9e..0a9d6a8a747a 100644
--- a/include/linux/fsnotify.h
+++ b/include/linux/fsnotify.h
@@ -107,6 +107,13 @@ static inline int fsnotify_file_perm(struct file *file, int perm_mask)
 {
 	__u32 fsnotify_mask = FS_ACCESS_PERM;
 
+	/*
+	 * filesystem may be modified in the context of permission events
+	 * (e.g. by HSM filling a file on access), so sb freeze protection
+	 * must not be held.
+	 */
+	lockdep_assert_once(file_write_not_started(file));
+
 	if (!(perm_mask & MAY_READ))
 		return 0;
 
-- 
cgit v1.2.3


From d9e5d31084b024734e64307521414ef0ae1d5333 Mon Sep 17 00:00:00 2001
From: Amir Goldstein <amir73il@gmail.com>
Date: Tue, 12 Dec 2023 11:44:40 +0200
Subject: fsnotify: optionally pass access range in file permission hooks

In preparation for pre-content permission events with file access range,
move fsnotify_file_perm() hook out of security_file_permission() and into
the callers.

Callers that have the access range information call the new hook
fsnotify_file_area_perm() with the access range.

Reviewed-by: Jan Kara <jack@suse.cz>
Signed-off-by: Amir Goldstein <amir73il@gmail.com>
Link: https://lore.kernel.org/r/20231212094440.250945-6-amir73il@gmail.com
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 fs/open.c                |  4 ++++
 fs/read_write.c          | 10 ++++++++--
 fs/readdir.c             |  4 ++++
 fs/remap_range.c         |  8 +++++++-
 include/linux/fsnotify.h | 13 +++++++++++--
 security/security.c      |  8 +-------
 6 files changed, 35 insertions(+), 12 deletions(-)

(limited to 'include/linux')

diff --git a/fs/open.c b/fs/open.c
index 02dc608d40d8..d877228d5939 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -304,6 +304,10 @@ int vfs_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
 	if (ret)
 		return ret;
 
+	ret = fsnotify_file_area_perm(file, MAY_WRITE, &offset, len);
+	if (ret)
+		return ret;
+
 	if (S_ISFIFO(inode->i_mode))
 		return -ESPIPE;
 
diff --git a/fs/read_write.c b/fs/read_write.c
index e3abf603eaaf..d4c036e82b6c 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -354,6 +354,9 @@ out_putf:
 
 int rw_verify_area(int read_write, struct file *file, const loff_t *ppos, size_t count)
 {
+	int mask = read_write == READ ? MAY_READ : MAY_WRITE;
+	int ret;
+
 	if (unlikely((ssize_t) count < 0))
 		return -EINVAL;
 
@@ -371,8 +374,11 @@ int rw_verify_area(int read_write, struct file *file, const loff_t *ppos, size_t
 		}
 	}
 
-	return security_file_permission(file,
-				read_write == READ ? MAY_READ : MAY_WRITE);
+	ret = security_file_permission(file, mask);
+	if (ret)
+		return ret;
+
+	return fsnotify_file_area_perm(file, mask, ppos, count);
 }
 EXPORT_SYMBOL(rw_verify_area);
 
diff --git a/fs/readdir.c b/fs/readdir.c
index c8c46e294431..278bc0254732 100644
--- a/fs/readdir.c
+++ b/fs/readdir.c
@@ -96,6 +96,10 @@ int iterate_dir(struct file *file, struct dir_context *ctx)
 	if (res)
 		goto out;
 
+	res = fsnotify_file_perm(file, MAY_READ);
+	if (res)
+		goto out;
+
 	res = down_read_killable(&inode->i_rwsem);
 	if (res)
 		goto out;
diff --git a/fs/remap_range.c b/fs/remap_range.c
index 12131f2a6c9e..f8c1120b8311 100644
--- a/fs/remap_range.c
+++ b/fs/remap_range.c
@@ -102,7 +102,9 @@ static int generic_remap_checks(struct file *file_in, loff_t pos_in,
 static int remap_verify_area(struct file *file, loff_t pos, loff_t len,
 			     bool write)
 {
+	int mask = write ? MAY_WRITE : MAY_READ;
 	loff_t tmp;
+	int ret;
 
 	if (unlikely(pos < 0 || len < 0))
 		return -EINVAL;
@@ -110,7 +112,11 @@ static int remap_verify_area(struct file *file, loff_t pos, loff_t len,
 	if (unlikely(check_add_overflow(pos, len, &tmp)))
 		return -EINVAL;
 
-	return security_file_permission(file, write ? MAY_WRITE : MAY_READ);
+	ret = security_file_permission(file, mask);
+	if (ret)
+		return ret;
+
+	return fsnotify_file_area_perm(file, mask, &pos, len);
 }
 
 /*
diff --git a/include/linux/fsnotify.h b/include/linux/fsnotify.h
index 0a9d6a8a747a..11e6434b8e71 100644
--- a/include/linux/fsnotify.h
+++ b/include/linux/fsnotify.h
@@ -101,9 +101,10 @@ static inline int fsnotify_file(struct file *file, __u32 mask)
 }
 
 /*
- * fsnotify_file_perm - permission hook before file access
+ * fsnotify_file_area_perm - permission hook before access to file range
  */
-static inline int fsnotify_file_perm(struct file *file, int perm_mask)
+static inline int fsnotify_file_area_perm(struct file *file, int perm_mask,
+					  const loff_t *ppos, size_t count)
 {
 	__u32 fsnotify_mask = FS_ACCESS_PERM;
 
@@ -120,6 +121,14 @@ static inline int fsnotify_file_perm(struct file *file, int perm_mask)
 	return fsnotify_file(file, fsnotify_mask);
 }
 
+/*
+ * fsnotify_file_perm - permission hook before file access
+ */
+static inline int fsnotify_file_perm(struct file *file, int perm_mask)
+{
+	return fsnotify_file_area_perm(file, perm_mask, NULL, 0);
+}
+
 /*
  * fsnotify_open_perm - permission hook before file open
  */
diff --git a/security/security.c b/security/security.c
index d7f3703c5905..2a7fc7881cbc 100644
--- a/security/security.c
+++ b/security/security.c
@@ -2580,13 +2580,7 @@ int security_kernfs_init_security(struct kernfs_node *kn_dir,
  */
 int security_file_permission(struct file *file, int mask)
 {
-	int ret;
-
-	ret = call_int_hook(file_permission, 0, file, mask);
-	if (ret)
-		return ret;
-
-	return fsnotify_file_perm(file, mask);
+	return call_int_hook(file_permission, 0, file, mask);
 }
 
 /**
-- 
cgit v1.2.3


From bf857ddd21d0bffc1edafc317e8e2ce0d6d5950c Mon Sep 17 00:00:00 2001
From: "Liam R. Howlett" <Liam.Howlett@oracle.com>
Date: Wed, 1 Nov 2023 13:16:20 -0400
Subject: maple_tree: move debug check to __mas_set_range()

__mas_set_range() was created to shortcut resetting the maple state and a
debug check was added to the caller (the vma iterator) to ensure the
internal maple state remains safe to use.  Move the debug check from the
vma iterator into the maple tree itself so other users do not incorrectly
use the advanced maple state modification.

Fallout from this change include a large amount of debug setup needed to
be moved to earlier in the header, and the maple_tree.h radix-tree test
code needed to move the inclusion of the header to after the atomic
define.  None of those changes have functional changes.

Link: https://lkml.kernel.org/r/20231101171629.3612299-4-Liam.Howlett@oracle.com
Signed-off-by: Liam R. Howlett <Liam.Howlett@oracle.com>
Cc: Peng Zhang <zhangpeng.00@bytedance.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/maple_tree.h                  | 255 ++++++++++++++--------------
 mm/internal.h                               |   2 -
 tools/testing/radix-tree/linux/maple_tree.h |   2 +-
 3 files changed, 130 insertions(+), 129 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/maple_tree.h b/include/linux/maple_tree.h
index a452dd8a1e5c..b5d5992578c9 100644
--- a/include/linux/maple_tree.h
+++ b/include/linux/maple_tree.h
@@ -557,6 +557,131 @@ static inline void mas_reset(struct ma_state *mas)
  */
 #define mas_for_each(__mas, __entry, __max) \
 	while (((__entry) = mas_find((__mas), (__max))) != NULL)
+
+#ifdef CONFIG_DEBUG_MAPLE_TREE
+enum mt_dump_format {
+	mt_dump_dec,
+	mt_dump_hex,
+};
+
+extern atomic_t maple_tree_tests_run;
+extern atomic_t maple_tree_tests_passed;
+
+void mt_dump(const struct maple_tree *mt, enum mt_dump_format format);
+void mas_dump(const struct ma_state *mas);
+void mas_wr_dump(const struct ma_wr_state *wr_mas);
+void mt_validate(struct maple_tree *mt);
+void mt_cache_shrink(void);
+#define MT_BUG_ON(__tree, __x) do {					\
+	atomic_inc(&maple_tree_tests_run);				\
+	if (__x) {							\
+		pr_info("BUG at %s:%d (%u)\n",				\
+		__func__, __LINE__, __x);				\
+		mt_dump(__tree, mt_dump_hex);				\
+		pr_info("Pass: %u Run:%u\n",				\
+			atomic_read(&maple_tree_tests_passed),		\
+			atomic_read(&maple_tree_tests_run));		\
+		dump_stack();						\
+	} else {							\
+		atomic_inc(&maple_tree_tests_passed);			\
+	}								\
+} while (0)
+
+#define MAS_BUG_ON(__mas, __x) do {					\
+	atomic_inc(&maple_tree_tests_run);				\
+	if (__x) {							\
+		pr_info("BUG at %s:%d (%u)\n",				\
+		__func__, __LINE__, __x);				\
+		mas_dump(__mas);					\
+		mt_dump((__mas)->tree, mt_dump_hex);			\
+		pr_info("Pass: %u Run:%u\n",				\
+			atomic_read(&maple_tree_tests_passed),		\
+			atomic_read(&maple_tree_tests_run));		\
+		dump_stack();						\
+	} else {							\
+		atomic_inc(&maple_tree_tests_passed);			\
+	}								\
+} while (0)
+
+#define MAS_WR_BUG_ON(__wrmas, __x) do {				\
+	atomic_inc(&maple_tree_tests_run);				\
+	if (__x) {							\
+		pr_info("BUG at %s:%d (%u)\n",				\
+		__func__, __LINE__, __x);				\
+		mas_wr_dump(__wrmas);					\
+		mas_dump((__wrmas)->mas);				\
+		mt_dump((__wrmas)->mas->tree, mt_dump_hex);		\
+		pr_info("Pass: %u Run:%u\n",				\
+			atomic_read(&maple_tree_tests_passed),		\
+			atomic_read(&maple_tree_tests_run));		\
+		dump_stack();						\
+	} else {							\
+		atomic_inc(&maple_tree_tests_passed);			\
+	}								\
+} while (0)
+
+#define MT_WARN_ON(__tree, __x)  ({					\
+	int ret = !!(__x);						\
+	atomic_inc(&maple_tree_tests_run);				\
+	if (ret) {							\
+		pr_info("WARN at %s:%d (%u)\n",				\
+		__func__, __LINE__, __x);				\
+		mt_dump(__tree, mt_dump_hex);				\
+		pr_info("Pass: %u Run:%u\n",				\
+			atomic_read(&maple_tree_tests_passed),		\
+			atomic_read(&maple_tree_tests_run));		\
+		dump_stack();						\
+	} else {							\
+		atomic_inc(&maple_tree_tests_passed);			\
+	}								\
+	unlikely(ret);							\
+})
+
+#define MAS_WARN_ON(__mas, __x) ({					\
+	int ret = !!(__x);						\
+	atomic_inc(&maple_tree_tests_run);				\
+	if (ret) {							\
+		pr_info("WARN at %s:%d (%u)\n",				\
+		__func__, __LINE__, __x);				\
+		mas_dump(__mas);					\
+		mt_dump((__mas)->tree, mt_dump_hex);			\
+		pr_info("Pass: %u Run:%u\n",				\
+			atomic_read(&maple_tree_tests_passed),		\
+			atomic_read(&maple_tree_tests_run));		\
+		dump_stack();						\
+	} else {							\
+		atomic_inc(&maple_tree_tests_passed);			\
+	}								\
+	unlikely(ret);							\
+})
+
+#define MAS_WR_WARN_ON(__wrmas, __x) ({					\
+	int ret = !!(__x);						\
+	atomic_inc(&maple_tree_tests_run);				\
+	if (ret) {							\
+		pr_info("WARN at %s:%d (%u)\n",				\
+		__func__, __LINE__, __x);				\
+		mas_wr_dump(__wrmas);					\
+		mas_dump((__wrmas)->mas);				\
+		mt_dump((__wrmas)->mas->tree, mt_dump_hex);		\
+		pr_info("Pass: %u Run:%u\n",				\
+			atomic_read(&maple_tree_tests_passed),		\
+			atomic_read(&maple_tree_tests_run));		\
+		dump_stack();						\
+	} else {							\
+		atomic_inc(&maple_tree_tests_passed);			\
+	}								\
+	unlikely(ret);							\
+})
+#else
+#define MT_BUG_ON(__tree, __x)		BUG_ON(__x)
+#define MAS_BUG_ON(__mas, __x)		BUG_ON(__x)
+#define MAS_WR_BUG_ON(__mas, __x)	BUG_ON(__x)
+#define MT_WARN_ON(__tree, __x)		WARN_ON(__x)
+#define MAS_WARN_ON(__mas, __x)		WARN_ON(__x)
+#define MAS_WR_WARN_ON(__mas, __x)	WARN_ON(__x)
+#endif /* CONFIG_DEBUG_MAPLE_TREE */
+
 /**
  * __mas_set_range() - Set up Maple Tree operation state to a sub-range of the
  * current location.
@@ -570,6 +695,9 @@ static inline void mas_reset(struct ma_state *mas)
 static inline void __mas_set_range(struct ma_state *mas, unsigned long start,
 		unsigned long last)
 {
+	/* Ensure the range starts within the current slot */
+	MAS_WARN_ON(mas, mas_is_active(mas) &&
+		   (mas->index > start || mas->last < start));
 	mas->index = start;
 	mas->last = last;
 }
@@ -587,8 +715,8 @@ static inline void __mas_set_range(struct ma_state *mas, unsigned long start,
 static inline
 void mas_set_range(struct ma_state *mas, unsigned long start, unsigned long last)
 {
-	__mas_set_range(mas, start, last);
 	mas->node = MAS_START;
+	__mas_set_range(mas, start, last);
 }
 
 /**
@@ -713,129 +841,4 @@ void *mt_next(struct maple_tree *mt, unsigned long index, unsigned long max);
 	for (__entry = mt_find(__tree, &(__index), __max); \
 		__entry; __entry = mt_find_after(__tree, &(__index), __max))
 
-
-#ifdef CONFIG_DEBUG_MAPLE_TREE
-enum mt_dump_format {
-	mt_dump_dec,
-	mt_dump_hex,
-};
-
-extern atomic_t maple_tree_tests_run;
-extern atomic_t maple_tree_tests_passed;
-
-void mt_dump(const struct maple_tree *mt, enum mt_dump_format format);
-void mas_dump(const struct ma_state *mas);
-void mas_wr_dump(const struct ma_wr_state *wr_mas);
-void mt_validate(struct maple_tree *mt);
-void mt_cache_shrink(void);
-#define MT_BUG_ON(__tree, __x) do {					\
-	atomic_inc(&maple_tree_tests_run);				\
-	if (__x) {							\
-		pr_info("BUG at %s:%d (%u)\n",				\
-		__func__, __LINE__, __x);				\
-		mt_dump(__tree, mt_dump_hex);				\
-		pr_info("Pass: %u Run:%u\n",				\
-			atomic_read(&maple_tree_tests_passed),		\
-			atomic_read(&maple_tree_tests_run));		\
-		dump_stack();						\
-	} else {							\
-		atomic_inc(&maple_tree_tests_passed);			\
-	}								\
-} while (0)
-
-#define MAS_BUG_ON(__mas, __x) do {					\
-	atomic_inc(&maple_tree_tests_run);				\
-	if (__x) {							\
-		pr_info("BUG at %s:%d (%u)\n",				\
-		__func__, __LINE__, __x);				\
-		mas_dump(__mas);					\
-		mt_dump((__mas)->tree, mt_dump_hex);			\
-		pr_info("Pass: %u Run:%u\n",				\
-			atomic_read(&maple_tree_tests_passed),		\
-			atomic_read(&maple_tree_tests_run));		\
-		dump_stack();						\
-	} else {							\
-		atomic_inc(&maple_tree_tests_passed);			\
-	}								\
-} while (0)
-
-#define MAS_WR_BUG_ON(__wrmas, __x) do {				\
-	atomic_inc(&maple_tree_tests_run);				\
-	if (__x) {							\
-		pr_info("BUG at %s:%d (%u)\n",				\
-		__func__, __LINE__, __x);				\
-		mas_wr_dump(__wrmas);					\
-		mas_dump((__wrmas)->mas);				\
-		mt_dump((__wrmas)->mas->tree, mt_dump_hex);		\
-		pr_info("Pass: %u Run:%u\n",				\
-			atomic_read(&maple_tree_tests_passed),		\
-			atomic_read(&maple_tree_tests_run));		\
-		dump_stack();						\
-	} else {							\
-		atomic_inc(&maple_tree_tests_passed);			\
-	}								\
-} while (0)
-
-#define MT_WARN_ON(__tree, __x)  ({					\
-	int ret = !!(__x);						\
-	atomic_inc(&maple_tree_tests_run);				\
-	if (ret) {							\
-		pr_info("WARN at %s:%d (%u)\n",				\
-		__func__, __LINE__, __x);				\
-		mt_dump(__tree, mt_dump_hex);				\
-		pr_info("Pass: %u Run:%u\n",				\
-			atomic_read(&maple_tree_tests_passed),		\
-			atomic_read(&maple_tree_tests_run));		\
-		dump_stack();						\
-	} else {							\
-		atomic_inc(&maple_tree_tests_passed);			\
-	}								\
-	unlikely(ret);							\
-})
-
-#define MAS_WARN_ON(__mas, __x) ({					\
-	int ret = !!(__x);						\
-	atomic_inc(&maple_tree_tests_run);				\
-	if (ret) {							\
-		pr_info("WARN at %s:%d (%u)\n",				\
-		__func__, __LINE__, __x);				\
-		mas_dump(__mas);					\
-		mt_dump((__mas)->tree, mt_dump_hex);			\
-		pr_info("Pass: %u Run:%u\n",				\
-			atomic_read(&maple_tree_tests_passed),		\
-			atomic_read(&maple_tree_tests_run));		\
-		dump_stack();						\
-	} else {							\
-		atomic_inc(&maple_tree_tests_passed);			\
-	}								\
-	unlikely(ret);							\
-})
-
-#define MAS_WR_WARN_ON(__wrmas, __x) ({					\
-	int ret = !!(__x);						\
-	atomic_inc(&maple_tree_tests_run);				\
-	if (ret) {							\
-		pr_info("WARN at %s:%d (%u)\n",				\
-		__func__, __LINE__, __x);				\
-		mas_wr_dump(__wrmas);					\
-		mas_dump((__wrmas)->mas);				\
-		mt_dump((__wrmas)->mas->tree, mt_dump_hex);		\
-		pr_info("Pass: %u Run:%u\n",				\
-			atomic_read(&maple_tree_tests_passed),		\
-			atomic_read(&maple_tree_tests_run));		\
-		dump_stack();						\
-	} else {							\
-		atomic_inc(&maple_tree_tests_passed);			\
-	}								\
-	unlikely(ret);							\
-})
-#else
-#define MT_BUG_ON(__tree, __x)		BUG_ON(__x)
-#define MAS_BUG_ON(__mas, __x)		BUG_ON(__x)
-#define MAS_WR_BUG_ON(__mas, __x)	BUG_ON(__x)
-#define MT_WARN_ON(__tree, __x)		WARN_ON(__x)
-#define MAS_WARN_ON(__mas, __x)		WARN_ON(__x)
-#define MAS_WR_WARN_ON(__mas, __x)	WARN_ON(__x)
-#endif /* CONFIG_DEBUG_MAPLE_TREE */
-
 #endif /*_LINUX_MAPLE_TREE_H */
diff --git a/mm/internal.h b/mm/internal.h
index 2bc9ff8db393..0005b8adbd5c 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -1135,8 +1135,6 @@ static inline bool vma_soft_dirty_enabled(struct vm_area_struct *vma)
 static inline void vma_iter_config(struct vma_iterator *vmi,
 		unsigned long index, unsigned long last)
 {
-	MAS_BUG_ON(&vmi->mas, vmi->mas.node != MAS_START &&
-		   (vmi->mas.index > index || vmi->mas.last < index));
 	__mas_set_range(&vmi->mas, index, last - 1);
 }
 
diff --git a/tools/testing/radix-tree/linux/maple_tree.h b/tools/testing/radix-tree/linux/maple_tree.h
index 7d8d1f445b89..06c89bdcc515 100644
--- a/tools/testing/radix-tree/linux/maple_tree.h
+++ b/tools/testing/radix-tree/linux/maple_tree.h
@@ -1,7 +1,7 @@
 /* SPDX-License-Identifier: GPL-2.0+ */
 #define atomic_t int32_t
-#include "../../../../include/linux/maple_tree.h"
 #define atomic_inc(x) uatomic_inc(x)
 #define atomic_read(x) uatomic_read(x)
 #define atomic_set(x, y) do {} while (0)
 #define U8_MAX UCHAR_MAX
+#include "../../../../include/linux/maple_tree.h"
-- 
cgit v1.2.3


From 31c532a8af57513228c2b12d281104198ff412b8 Mon Sep 17 00:00:00 2001
From: "Liam R. Howlett" <Liam.Howlett@oracle.com>
Date: Wed, 1 Nov 2023 13:16:21 -0400
Subject: maple_tree: add end of node tracking to the maple state

Analysis of the mas_for_each() iteration showed that there is a
significant time spent finding the end of a node.  This time can be
greatly reduced if the end of the node is cached in the maple state.  Care
must be taken to update & invalidate as necessary.

Link: https://lkml.kernel.org/r/20231101171629.3612299-5-Liam.Howlett@oracle.com
Signed-off-by: Liam R. Howlett <Liam.Howlett@oracle.com>
Cc: Peng Zhang <zhangpeng.00@bytedance.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/maple_tree.h       | 1 +
 lib/maple_tree.c                 | 7 +++++++
 tools/testing/radix-tree/maple.c | 1 +
 3 files changed, 9 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/maple_tree.h b/include/linux/maple_tree.h
index b5d5992578c9..0b82efe0cf1e 100644
--- a/include/linux/maple_tree.h
+++ b/include/linux/maple_tree.h
@@ -393,6 +393,7 @@ struct ma_state {
 	unsigned char depth;		/* depth of tree descent during write */
 	unsigned char offset;
 	unsigned char mas_flags;
+	unsigned char end;		/* The end of the node */
 };
 
 struct ma_wr_state {
diff --git a/lib/maple_tree.c b/lib/maple_tree.c
index 8d379d34ea0a..ea0a36341fed 100644
--- a/lib/maple_tree.c
+++ b/lib/maple_tree.c
@@ -2841,6 +2841,7 @@ next:
 			goto dead_node;
 	} while (!ma_is_leaf(type));
 
+	mas->end = end;
 	mas->offset = offset;
 	mas->index = min;
 	mas->last = max;
@@ -3507,6 +3508,7 @@ static noinline_for_kasan int mas_commit_b_node(struct ma_wr_state *wr_mas,
 	mas_replace_node(wr_mas->mas, old_enode);
 reuse_node:
 	mas_update_gap(wr_mas->mas);
+	wr_mas->mas->end = b_end;
 	return 1;
 }
 
@@ -4010,6 +4012,7 @@ done:
 	}
 	trace_ma_write(__func__, mas, 0, wr_mas->entry);
 	mas_update_gap(mas);
+	mas->end = new_end;
 	return true;
 }
 
@@ -4190,6 +4193,7 @@ static inline bool mas_wr_append(struct ma_wr_state *wr_mas,
 	if (!wr_mas->content || !wr_mas->entry)
 		mas_update_gap(mas);
 
+	mas->end = new_end;
 	trace_ma_write(__func__, mas, new_end, wr_mas->entry);
 	return  true;
 }
@@ -4428,6 +4432,7 @@ static inline int mas_prev_node(struct ma_state *mas, unsigned long min)
 	if (unlikely(mte_dead_node(mas->node)))
 		return 1;
 
+	mas->end = mas->offset;
 	return 0;
 
 no_entry:
@@ -5074,6 +5079,7 @@ int mas_empty_area(struct ma_state *mas, unsigned long min,
 	if (mas->index < min)
 		mas->index = min;
 	mas->last = mas->index + size - 1;
+	mas->end = mas_data_end(mas);
 	return 0;
 }
 EXPORT_SYMBOL_GPL(mas_empty_area);
@@ -5134,6 +5140,7 @@ int mas_empty_area_rev(struct ma_state *mas, unsigned long min,
 		mas->last = max;
 
 	mas->index = mas->last - size + 1;
+	mas->end = mas_data_end(mas);
 	return 0;
 }
 EXPORT_SYMBOL_GPL(mas_empty_area_rev);
diff --git a/tools/testing/radix-tree/maple.c b/tools/testing/radix-tree/maple.c
index cb5358674521..7095fb0ec026 100644
--- a/tools/testing/radix-tree/maple.c
+++ b/tools/testing/radix-tree/maple.c
@@ -945,6 +945,7 @@ retry:
 		goto retry;
 	}
 
+	mas->end = mas_data_end(mas);
 	return ret;
 
 not_found:
-- 
cgit v1.2.3


From 067311d33e650adfe7ae23765959ddcc1ba18510 Mon Sep 17 00:00:00 2001
From: "Liam R. Howlett" <Liam.Howlett@oracle.com>
Date: Wed, 1 Nov 2023 13:16:25 -0400
Subject: maple_tree: separate ma_state node from status

The maple tree node is overloaded to keep status as well as the active
node.  This, unfortunately, results in a re-walk on underflow or overflow.
Since the maple state has room, the status can be placed in its own enum
in the structure.  Once an underflow/overflow is detected, certain modes
can restore the status to active and others may need to re-walk just that
one node to see the entry.

The status being an enum has the benefit of detecting unhandled status in
switch statements.

[Liam.Howlett@oracle.com: fix comments about MAS_*]
  Link: https://lkml.kernel.org/r/20231106154124.614247-1-Liam.Howlett@oracle.com
[Liam.Howlett@oracle.com: update forking to separate maple state and node]
  Link: https://lkml.kernel.org/r/20231106154551.615042-1-Liam.Howlett@oracle.com
[Liam.Howlett@oracle.com: fix mas_prev() state separation code]
  Link: https://lkml.kernel.org/r/20231207193319.4025462-1-Liam.Howlett@oracle.com
Link: https://lkml.kernel.org/r/20231101171629.3612299-9-Liam.Howlett@oracle.com
Signed-off-by: Liam R. Howlett <Liam.Howlett@oracle.com>
Cc: Peng Zhang <zhangpeng.00@bytedance.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/maple_tree.h       |  87 ++++----
 include/linux/mm_types.h         |   3 +-
 lib/maple_tree.c                 | 459 +++++++++++++++++++++++----------------
 lib/test_maple_tree.c            | 189 ++++++++--------
 mm/internal.h                    |   8 +-
 tools/testing/radix-tree/maple.c |  26 ++-
 6 files changed, 445 insertions(+), 327 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/maple_tree.h b/include/linux/maple_tree.h
index 0b82efe0cf1e..4dd668f7b111 100644
--- a/include/linux/maple_tree.h
+++ b/include/linux/maple_tree.h
@@ -349,6 +349,36 @@ static inline bool mtree_empty(const struct maple_tree *mt)
 
 /* Advanced API */
 
+/*
+ * Maple State Status
+ * ma_active means the maple state is pointing to a node and offset and can
+ * continue operating on the tree.
+ * ma_start means we have not searched the tree.
+ * ma_root means we have searched the tree and the entry we found lives in
+ * the root of the tree (ie it has index 0, length 1 and is the only entry in
+ * the tree).
+ * ma_none means we have searched the tree and there is no node in the
+ * tree for this entry.  For example, we searched for index 1 in an empty
+ * tree.  Or we have a tree which points to a full leaf node and we
+ * searched for an entry which is larger than can be contained in that
+ * leaf node.
+ * ma_pause means the data within the maple state may be stale, restart the
+ * operation
+ * ma_overflow means the search has reached the upper limit of the search
+ * ma_underflow means the search has reached the lower limit of the search
+ * ma_error means there was an error, check the node for the error number.
+ */
+enum maple_status {
+	ma_active,
+	ma_start,
+	ma_root,
+	ma_none,
+	ma_pause,
+	ma_overflow,
+	ma_underflow,
+	ma_error,
+};
+
 /*
  * The maple state is defined in the struct ma_state and is used to keep track
  * of information during operations, and even between operations when using the
@@ -381,6 +411,13 @@ static inline bool mtree_empty(const struct maple_tree *mt)
  * When returning a value the maple state index and last respectively contain
  * the start and end of the range for the entry.  Ranges are inclusive in the
  * Maple Tree.
+ *
+ * The status of the state is used to determine how the next action should treat
+ * the state.  For instance, if the status is ma_start then the next action
+ * should start at the root of the tree and walk down.  If the status is
+ * ma_pause then the node may be stale data and should be discarded.  If the
+ * status is ma_overflow, then the last action hit the upper limit.
+ *
  */
 struct ma_state {
 	struct maple_tree *tree;	/* The tree we're operating in */
@@ -390,6 +427,7 @@ struct ma_state {
 	unsigned long min;		/* The minimum index of this node - implied pivot min */
 	unsigned long max;		/* The maximum index of this node - implied pivot max */
 	struct maple_alloc *alloc;	/* Allocated nodes for this operation */
+	enum maple_status status;	/* The status of the state (active, start, none, etc) */
 	unsigned char depth;		/* depth of tree descent during write */
 	unsigned char offset;
 	unsigned char mas_flags;
@@ -416,28 +454,12 @@ struct ma_wr_state {
 		spin_lock_nested(&((mas)->tree->ma_lock), subclass)
 #define mas_unlock(mas)         spin_unlock(&((mas)->tree->ma_lock))
 
-
 /*
  * Special values for ma_state.node.
- * MAS_START means we have not searched the tree.
- * MAS_ROOT means we have searched the tree and the entry we found lives in
- * the root of the tree (ie it has index 0, length 1 and is the only entry in
- * the tree).
- * MAS_NONE means we have searched the tree and there is no node in the
- * tree for this entry.  For example, we searched for index 1 in an empty
- * tree.  Or we have a tree which points to a full leaf node and we
- * searched for an entry which is larger than can be contained in that
- * leaf node.
  * MA_ERROR represents an errno.  After dropping the lock and attempting
  * to resolve the error, the walk would have to be restarted from the
  * top of the tree as the tree may have been modified.
  */
-#define MAS_START	((struct maple_enode *)1UL)
-#define MAS_ROOT	((struct maple_enode *)5UL)
-#define MAS_NONE	((struct maple_enode *)9UL)
-#define MAS_PAUSE	((struct maple_enode *)17UL)
-#define MAS_OVERFLOW	((struct maple_enode *)33UL)
-#define MAS_UNDERFLOW	((struct maple_enode *)65UL)
 #define MA_ERROR(err) \
 		((struct maple_enode *)(((unsigned long)err << 2) | 2UL))
 
@@ -446,7 +468,8 @@ struct ma_wr_state {
 		.tree = mt,						\
 		.index = first,						\
 		.last = end,						\
-		.node = MAS_START,					\
+		.node = NULL,						\
+		.status = ma_start,					\
 		.min = 0,						\
 		.max = ULONG_MAX,					\
 		.alloc = NULL,						\
@@ -477,7 +500,6 @@ void *mas_find_range(struct ma_state *mas, unsigned long max);
 void *mas_find_rev(struct ma_state *mas, unsigned long min);
 void *mas_find_range_rev(struct ma_state *mas, unsigned long max);
 int mas_preallocate(struct ma_state *mas, void *entry, gfp_t gfp);
-bool mas_is_err(struct ma_state *mas);
 
 bool mas_nomem(struct ma_state *mas, gfp_t gfp);
 void mas_pause(struct ma_state *mas);
@@ -506,28 +528,18 @@ static inline void mas_init(struct ma_state *mas, struct maple_tree *tree,
 	mas->tree = tree;
 	mas->index = mas->last = addr;
 	mas->max = ULONG_MAX;
-	mas->node = MAS_START;
+	mas->status = ma_start;
+	mas->node = NULL;
 }
 
-/* Checks if a mas has not found anything */
-static inline bool mas_is_none(const struct ma_state *mas)
-{
-	return mas->node == MAS_NONE;
-}
-
-/* Checks if a mas has been paused */
-static inline bool mas_is_paused(const struct ma_state *mas)
+static inline bool mas_is_active(struct ma_state *mas)
 {
-	return mas->node == MAS_PAUSE;
+	return mas->status == ma_active;
 }
 
-/* Check if the mas is pointing to a node or not */
-static inline bool mas_is_active(struct ma_state *mas)
+static inline bool mas_is_err(struct ma_state *mas)
 {
-	if ((unsigned long)mas->node >= MAPLE_RESERVED_RANGE)
-		return true;
-
-	return false;
+	return mas->status == ma_error;
 }
 
 /**
@@ -540,9 +552,10 @@ static inline bool mas_is_active(struct ma_state *mas)
  *
  * Context: Any context.
  */
-static inline void mas_reset(struct ma_state *mas)
+static __always_inline void mas_reset(struct ma_state *mas)
 {
-	mas->node = MAS_START;
+	mas->status = ma_start;
+	mas->node = NULL;
 }
 
 /**
@@ -716,7 +729,7 @@ static inline void __mas_set_range(struct ma_state *mas, unsigned long start,
 static inline
 void mas_set_range(struct ma_state *mas, unsigned long start, unsigned long last)
 {
-	mas->node = MAS_START;
+	mas_reset(mas);
 	__mas_set_range(mas, start, last);
 }
 
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index ef18d2b25378..a66534c78c4d 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -1071,7 +1071,8 @@ struct vma_iterator {
 		.mas = {						\
 			.tree = &(__mm)->mm_mt,				\
 			.index = __addr,				\
-			.node = MAS_START,				\
+			.node = NULL,					\
+			.status = ma_start,				\
 		},							\
 	}
 
diff --git a/lib/maple_tree.c b/lib/maple_tree.c
index f0d2aea91351..187a9796188e 100644
--- a/lib/maple_tree.c
+++ b/lib/maple_tree.c
@@ -249,40 +249,40 @@ static __always_inline bool mt_is_reserved(const void *entry)
 		xa_is_internal(entry);
 }
 
-static inline void mas_set_err(struct ma_state *mas, long err)
+static __always_inline void mas_set_err(struct ma_state *mas, long err)
 {
 	mas->node = MA_ERROR(err);
+	mas->status = ma_error;
 }
 
-static inline bool mas_is_ptr(const struct ma_state *mas)
+static __always_inline bool mas_is_ptr(const struct ma_state *mas)
 {
-	return mas->node == MAS_ROOT;
+	return mas->status == ma_root;
 }
 
-static inline bool mas_is_start(const struct ma_state *mas)
+static __always_inline bool mas_is_start(const struct ma_state *mas)
 {
-	return mas->node == MAS_START;
+	return mas->status == ma_start;
 }
 
-bool mas_is_err(struct ma_state *mas)
+static __always_inline bool mas_is_none(const struct ma_state *mas)
 {
-	return xa_is_err(mas->node);
+	return mas->status == ma_none;
 }
 
-static __always_inline bool mas_is_overflow(struct ma_state *mas)
+static __always_inline bool mas_is_paused(const struct ma_state *mas)
 {
-	if (unlikely(mas->node == MAS_OVERFLOW))
-		return true;
-
-	return false;
+	return mas->status == ma_pause;
 }
 
-static __always_inline bool mas_is_underflow(struct ma_state *mas)
+static __always_inline bool mas_is_overflow(struct ma_state *mas)
 {
-	if (unlikely(mas->node == MAS_UNDERFLOW))
-		return true;
+	return mas->status == ma_overflow;
+}
 
-	return false;
+static inline bool mas_is_underflow(struct ma_state *mas)
+{
+	return mas->status == ma_underflow;
 }
 
 static inline bool mas_searchable(struct ma_state *mas)
@@ -1274,6 +1274,7 @@ static inline void mas_alloc_nodes(struct ma_state *mas, gfp_t gfp)
 	if (mas->mas_flags & MA_STATE_PREALLOC) {
 		if (allocated)
 			return;
+		BUG_ON(!allocated);
 		WARN_ON(!allocated);
 	}
 
@@ -1379,14 +1380,14 @@ static void mas_node_count(struct ma_state *mas, int count)
  * mas_start() - Sets up maple state for operations.
  * @mas: The maple state.
  *
- * If mas->node == MAS_START, then set the min, max and depth to
+ * If mas->status == mas_start, then set the min, max and depth to
  * defaults.
  *
  * Return:
- * - If mas->node is an error or not MAS_START, return NULL.
- * - If it's an empty tree:     NULL & mas->node == MAS_NONE
- * - If it's a single entry:    The entry & mas->node == MAS_ROOT
- * - If it's a tree:            NULL & mas->node == safe root node.
+ * - If mas->node is an error or not mas_start, return NULL.
+ * - If it's an empty tree:     NULL & mas->status == ma_none
+ * - If it's a single entry:    The entry & mas->status == mas_root
+ * - If it's a tree:            NULL & mas->status == safe root node.
  */
 static inline struct maple_enode *mas_start(struct ma_state *mas)
 {
@@ -1402,6 +1403,7 @@ retry:
 		/* Tree with nodes */
 		if (likely(xa_is_node(root))) {
 			mas->depth = 1;
+			mas->status = ma_active;
 			mas->node = mte_safe_root(root);
 			mas->offset = 0;
 			if (mte_dead_node(mas->node))
@@ -1412,13 +1414,14 @@ retry:
 
 		/* empty tree */
 		if (unlikely(!root)) {
-			mas->node = MAS_NONE;
+			mas->node = NULL;
+			mas->status = ma_none;
 			mas->offset = MAPLE_NODE_SLOTS;
 			return NULL;
 		}
 
 		/* Single entry tree */
-		mas->node = MAS_ROOT;
+		mas->status = ma_root;
 		mas->offset = MAPLE_NODE_SLOTS;
 
 		/* Single entry tree. */
@@ -2225,19 +2228,21 @@ static inline bool mas_next_sibling(struct ma_state *mas)
 }
 
 /*
- * mte_node_or_node() - Return the encoded node or MAS_NONE.
+ * mte_node_or_none() - Set the enode and state.
  * @enode: The encoded maple node.
  *
- * Shorthand to avoid setting %NULLs in the tree or maple_subtree_state.
- *
- * Return: @enode or MAS_NONE
+ * Set the node to the enode and the status.
  */
-static inline struct maple_enode *mte_node_or_none(struct maple_enode *enode)
+static inline void mas_node_or_none(struct ma_state *mas,
+		struct maple_enode *enode)
 {
-	if (enode)
-		return enode;
-
-	return ma_enode_ptr(MAS_NONE);
+	if (enode) {
+		mas->node = enode;
+		mas->status = ma_active;
+	} else {
+		mas->node = NULL;
+		mas->status = ma_none;
+	}
 }
 
 /*
@@ -2557,13 +2562,15 @@ static inline void mast_set_split_parents(struct maple_subtree_state *mast,
  * The node will either be RCU freed or pushed back on the maple state.
  */
 static inline void mas_topiary_node(struct ma_state *mas,
-		struct maple_enode *enode, bool in_rcu)
+		struct ma_state *tmp_mas, bool in_rcu)
 {
 	struct maple_node *tmp;
+	struct maple_enode *enode;
 
-	if (enode == MAS_NONE)
+	if (mas_is_none(tmp_mas))
 		return;
 
+	enode = tmp_mas->node;
 	tmp = mte_to_node(enode);
 	mte_set_node_dead(enode);
 	if (in_rcu)
@@ -2603,8 +2610,8 @@ static inline void mas_topiary_replace(struct ma_state *mas,
 	/* Update the parent pointers in the tree */
 	tmp[0] = *mas;
 	tmp[0].offset = 0;
-	tmp[1].node = MAS_NONE;
-	tmp[2].node = MAS_NONE;
+	tmp[1].status = ma_none;
+	tmp[2].status = ma_none;
 	while (!mte_is_leaf(tmp[0].node)) {
 		n = 0;
 		for (i = 0; i < 3; i++) {
@@ -2624,7 +2631,7 @@ static inline void mas_topiary_replace(struct ma_state *mas,
 			break;
 
 		while (n < 3)
-			tmp_next[n++].node = MAS_NONE;
+			tmp_next[n++].status = ma_none;
 
 		for (i = 0; i < 3; i++)
 			tmp[i] = tmp_next[i];
@@ -2637,8 +2644,8 @@ static inline void mas_topiary_replace(struct ma_state *mas,
 	tmp[0] = *mas;
 	tmp[0].offset = 0;
 	tmp[0].node = old_enode;
-	tmp[1].node = MAS_NONE;
-	tmp[2].node = MAS_NONE;
+	tmp[1].status = ma_none;
+	tmp[2].status = ma_none;
 	in_rcu = mt_in_rcu(mas->tree);
 	do {
 		n = 0;
@@ -2653,7 +2660,7 @@ static inline void mas_topiary_replace(struct ma_state *mas,
 				if ((tmp_next[n].min >= tmp_next->index) &&
 				    (tmp_next[n].max <= tmp_next->last)) {
 					mat_add(&subtrees, tmp_next[n].node);
-					tmp_next[n].node = MAS_NONE;
+					tmp_next[n].status = ma_none;
 				} else {
 					n++;
 				}
@@ -2664,16 +2671,16 @@ static inline void mas_topiary_replace(struct ma_state *mas,
 			break;
 
 		while (n < 3)
-			tmp_next[n++].node = MAS_NONE;
+			tmp_next[n++].status = ma_none;
 
 		for (i = 0; i < 3; i++) {
-			mas_topiary_node(mas, tmp[i].node, in_rcu);
+			mas_topiary_node(mas, &tmp[i], in_rcu);
 			tmp[i] = tmp_next[i];
 		}
 	} while (!mte_is_leaf(tmp[0].node));
 
 	for (i = 0; i < 3; i++)
-		mas_topiary_node(mas, tmp[i].node, in_rcu);
+		mas_topiary_node(mas, &tmp[i], in_rcu);
 
 	mas_mat_destroy(mas, &subtrees);
 }
@@ -2712,9 +2719,9 @@ static inline void mast_cp_to_nodes(struct maple_subtree_state *mast,
 {
 	bool new_lmax = true;
 
-	mast->l->node = mte_node_or_none(left);
-	mast->m->node = mte_node_or_none(middle);
-	mast->r->node = mte_node_or_none(right);
+	mas_node_or_none(mast->l, left);
+	mas_node_or_none(mast->m, middle);
+	mas_node_or_none(mast->r, right);
 
 	mast->l->min = mast->orig_l->min;
 	if (split == mast->bn->b_end) {
@@ -2894,7 +2901,7 @@ static int mas_spanning_rebalance(struct ma_state *mas,
 	mast->l = &l_mas;
 	mast->m = &m_mas;
 	mast->r = &r_mas;
-	l_mas.node = r_mas.node = m_mas.node = MAS_NONE;
+	l_mas.status = r_mas.status = m_mas.status = ma_none;
 
 	/* Check if this is not root and has sufficient data.  */
 	if (((mast->orig_l->min != 0) || (mast->orig_r->max != ULONG_MAX)) &&
@@ -3421,7 +3428,6 @@ static int mas_split(struct ma_state *mas, struct maple_big_node *b_node)
 		/* Try to push left. */
 		if (mas_push_data(mas, height, &mast, true))
 			break;
-
 		/* Try to push right. */
 		if (mas_push_data(mas, height, &mast, false))
 			break;
@@ -3537,6 +3543,7 @@ static inline int mas_root_expand(struct ma_state *mas, void *entry)
 	slots = ma_slots(node, type);
 	node->parent = ma_parent_ptr(mas_tree_parent(mas));
 	mas->node = mt_mk_node(node, type);
+	mas->status = ma_active;
 
 	if (mas->index) {
 		if (contents) {
@@ -3569,7 +3576,7 @@ static inline void mas_store_root(struct ma_state *mas, void *entry)
 		mas_root_expand(mas, entry);
 	else {
 		rcu_assign_pointer(mas->tree->ma_root, entry);
-		mas->node = MAS_START;
+		mas->status = ma_start;
 	}
 }
 
@@ -3801,7 +3808,7 @@ static inline int mas_new_root(struct ma_state *mas, void *entry)
 		mas->depth = 0;
 		mas_set_height(mas);
 		rcu_assign_pointer(mas->tree->ma_root, entry);
-		mas->node = MAS_START;
+		mas->status = ma_start;
 		goto done;
 	}
 
@@ -3814,6 +3821,7 @@ static inline int mas_new_root(struct ma_state *mas, void *entry)
 	slots = ma_slots(node, type);
 	node->parent = ma_parent_ptr(mas_tree_parent(mas));
 	mas->node = mt_mk_node(node, type);
+	mas->status = ma_active;
 	rcu_assign_pointer(slots[0], entry);
 	pivots[0] = mas->last;
 	mas->depth = 1;
@@ -4367,11 +4375,13 @@ static __always_inline bool mas_rewalk_if_dead(struct ma_state *mas,
 
 /*
  * mas_prev_node() - Find the prev non-null entry at the same level in the
- * tree.  The prev value will be mas->node[mas->offset] or MAS_NONE.
+ * tree.  The prev value will be mas->node[mas->offset] or the status will be
+ * ma_none.
  * @mas: The maple state
  * @min: The lower limit to search
  *
- * The prev node value will be mas->node[mas->offset] or MAS_NONE.
+ * The prev node value will be mas->node[mas->offset] or the status will be
+ * ma_none.
  * Return: 1 if the node is dead, 0 otherwise.
  */
 static int mas_prev_node(struct ma_state *mas, unsigned long min)
@@ -4441,7 +4451,7 @@ no_entry:
 	if (unlikely(ma_dead_node(node)))
 		return 1;
 
-	mas->node = MAS_NONE;
+	mas->status = ma_underflow;
 	return 0;
 }
 
@@ -4455,8 +4465,7 @@ no_entry:
  *
  * Return: The entry in the previous slot which is possibly NULL
  */
-static void *mas_prev_slot(struct ma_state *mas, unsigned long min, bool empty,
-			   bool set_underflow)
+static void *mas_prev_slot(struct ma_state *mas, unsigned long min, bool empty)
 {
 	void *entry;
 	void __rcu **slots;
@@ -4489,13 +4498,16 @@ again:
 		mas->last = mas->index - 1;
 		mas->index = mas_safe_min(mas, pivots, mas->offset);
 	} else  {
+		if (mas->index <= min)
+			goto underflow;
+
 		if (mas_prev_node(mas, min)) {
 			mas_rewalk(mas, save_point);
 			goto retry;
 		}
 
-		if (mas_is_none(mas))
-			goto underflow;
+		if (WARN_ON_ONCE(mas_is_underflow(mas)))
+			return NULL;
 
 		mas->last = mas->max;
 		node = mas_mn(mas);
@@ -4509,12 +4521,15 @@ again:
 	if (unlikely(mas_rewalk_if_dead(mas, node, save_point)))
 		goto retry;
 
+
 	if (likely(entry))
 		return entry;
 
 	if (!empty) {
-		if (mas->index <= min)
-			goto underflow;
+		if (mas->index <= min) {
+			mas->status = ma_underflow;
+			return NULL;
+		}
 
 		goto again;
 	}
@@ -4522,8 +4537,7 @@ again:
 	return entry;
 
 underflow:
-	if (set_underflow)
-		mas->node = MAS_UNDERFLOW;
+	mas->status = ma_underflow;
 	return NULL;
 }
 
@@ -4532,7 +4546,8 @@ underflow:
  * @mas: The maple state
  * @max: The maximum pivot value to check.
  *
- * The next value will be mas->node[mas->offset] or MAS_NONE.
+ * The next value will be mas->node[mas->offset] or the status will have
+ * overflowed.
  * Return: 1 on dead node, 0 otherwise.
  */
 static int mas_next_node(struct ma_state *mas, struct maple_node *node,
@@ -4548,13 +4563,13 @@ static int mas_next_node(struct ma_state *mas, struct maple_node *node,
 	void __rcu **slots;
 
 	if (mas->max >= max)
-		goto no_entry;
+		goto overflow;
 
 	min = mas->max + 1;
 	level = 0;
 	do {
 		if (ma_is_root(node))
-			goto no_entry;
+			goto overflow;
 
 		/* Walk up. */
 		if (unlikely(mas_ascend(mas)))
@@ -4605,11 +4620,11 @@ static int mas_next_node(struct ma_state *mas, struct maple_node *node,
 	mas->min = min;
 	return 0;
 
-no_entry:
+overflow:
 	if (unlikely(ma_dead_node(node)))
 		return 1;
 
-	mas->node = MAS_NONE;
+	mas->status = ma_overflow;
 	return 0;
 }
 
@@ -4624,8 +4639,7 @@ no_entry:
  *
  * Return: The entry in the next slot which is possibly NULL
  */
-static void *mas_next_slot(struct ma_state *mas, unsigned long max, bool empty,
-			   bool set_overflow)
+static void *mas_next_slot(struct ma_state *mas, unsigned long max, bool empty)
 {
 	void __rcu **slots;
 	unsigned long *pivots;
@@ -4646,13 +4660,15 @@ retry:
 		if (likely(mas->offset < mas->end))
 			pivot = pivots[mas->offset];
 		else
-			goto overflow;
+			pivot = mas->max;
 
 		if (unlikely(mas_rewalk_if_dead(mas, node, save_point)))
 			goto retry;
 
-		if (pivot >= max)
-			goto overflow;
+		if (pivot >= max) { /* Was at the limit, next will extend beyond */
+			mas->status = ma_overflow;
+			return NULL;
+		}
 	}
 
 	if (likely(mas->offset < mas->end)) {
@@ -4664,16 +4680,18 @@ again:
 		else
 			mas->last = mas->max;
 	} else  {
+		if (mas->last >= max) {
+			mas->status = ma_overflow;
+			return NULL;
+		}
+
 		if (mas_next_node(mas, node, max)) {
 			mas_rewalk(mas, save_point);
 			goto retry;
 		}
 
-		if (WARN_ON_ONCE(mas_is_none(mas))) {
-			mas->node = MAS_OVERFLOW;
+		if (WARN_ON_ONCE(mas_is_overflow(mas)))
 			return NULL;
-			goto overflow;
-		}
 
 		mas->offset = 0;
 		mas->index = mas->min;
@@ -4691,20 +4709,18 @@ again:
 	if (entry)
 		return entry;
 
+
 	if (!empty) {
-		if (mas->last >= max)
-			goto overflow;
+		if (mas->last >= max) {
+			mas->status = ma_overflow;
+			return NULL;
+		}
 
 		mas->index = mas->last + 1;
 		goto again;
 	}
 
 	return entry;
-
-overflow:
-	if (set_overflow)
-		mas->node = MAS_OVERFLOW;
-	return NULL;
 }
 
 /*
@@ -4723,11 +4739,11 @@ overflow:
 static inline void *mas_next_entry(struct ma_state *mas, unsigned long limit)
 {
 	if (mas->last >= limit) {
-		mas->node = MAS_OVERFLOW;
+		mas->status = ma_overflow;
 		return NULL;
 	}
 
-	return mas_next_slot(mas, limit, false, true);
+	return mas_next_slot(mas, limit, false);
 }
 
 /*
@@ -4895,7 +4911,7 @@ done:
  * @mas: The maple state.
  *
  * mas->index and mas->last will be set to the range if there is a value.  If
- * mas->node is MAS_NONE, reset to MAS_START.
+ * mas->status is ma_none, reset to ma_start
  *
  * Return: the entry at the location or %NULL.
  */
@@ -4904,7 +4920,7 @@ void *mas_walk(struct ma_state *mas)
 	void *entry;
 
 	if (!mas_is_active(mas) || !mas_is_start(mas))
-		mas->node = MAS_START;
+		mas->status = ma_start;
 retry:
 	entry = mas_state_walk(mas);
 	if (mas_is_start(mas)) {
@@ -4920,7 +4936,7 @@ retry:
 
 		mas->index = 1;
 		mas->last = ULONG_MAX;
-		mas->node = MAS_NONE;
+		mas->status = ma_none;
 		return NULL;
 	}
 
@@ -5672,27 +5688,40 @@ static bool mas_next_setup(struct ma_state *mas, unsigned long max,
 	bool was_none = mas_is_none(mas);
 
 	if (unlikely(mas->last >= max)) {
-		mas->node = MAS_OVERFLOW;
+		mas->status = ma_overflow;
 		return true;
 	}
 
-	if (mas_is_active(mas))
+	switch (mas->status) {
+	case ma_active:
 		return false;
-
-	if (mas_is_none(mas) || mas_is_paused(mas)) {
-		mas->node = MAS_START;
-	} else if (mas_is_overflow(mas)) {
+	case ma_none:
+		fallthrough;
+	case ma_pause:
+		mas->status = ma_start;
+		fallthrough;
+	case ma_start:
+		mas_walk(mas); /* Retries on dead nodes handled by mas_walk */
+		break;
+	case ma_overflow:
 		/* Overflowed before, but the max changed */
-		mas->node = MAS_START;
-	} else if (mas_is_underflow(mas)) {
-		mas->node = MAS_START;
+		mas->status = ma_active;
+		break;
+	case ma_underflow:
+		/* The user expects the mas to be one before where it is */
+		mas->status = ma_active;
 		*entry = mas_walk(mas);
 		if (*entry)
 			return true;
+		break;
+	case ma_root:
+		break;
+	case ma_error:
+		return true;
 	}
 
-	if (mas_is_start(mas))
-		*entry = mas_walk(mas); /* Retries on dead nodes handled by mas_walk */
+	if (likely(mas_is_active(mas))) /* Fast path */
+		return false;
 
 	if (mas_is_ptr(mas)) {
 		*entry = NULL;
@@ -5702,7 +5731,7 @@ static bool mas_next_setup(struct ma_state *mas, unsigned long max,
 		}
 		mas->index = 1;
 		mas->last = ULONG_MAX;
-		mas->node = MAS_NONE;
+		mas->status = ma_none;
 		return true;
 	}
 
@@ -5731,7 +5760,7 @@ void *mas_next(struct ma_state *mas, unsigned long max)
 		return entry;
 
 	/* Retries on dead nodes handled by mas_next_slot */
-	return mas_next_slot(mas, max, false, true);
+	return mas_next_slot(mas, max, false);
 }
 EXPORT_SYMBOL_GPL(mas_next);
 
@@ -5754,7 +5783,7 @@ void *mas_next_range(struct ma_state *mas, unsigned long max)
 		return entry;
 
 	/* Retries on dead nodes handled by mas_next_slot */
-	return mas_next_slot(mas, max, true, true);
+	return mas_next_slot(mas, max, true);
 }
 EXPORT_SYMBOL_GPL(mas_next_range);
 
@@ -5785,33 +5814,45 @@ EXPORT_SYMBOL_GPL(mt_next);
 static bool mas_prev_setup(struct ma_state *mas, unsigned long min, void **entry)
 {
 	if (unlikely(mas->index <= min)) {
-		mas->node = MAS_UNDERFLOW;
+		mas->status = ma_underflow;
 		return true;
 	}
 
-	if (mas_is_active(mas))
+	switch (mas->status) {
+	case ma_active:
 		return false;
-
-	if (mas_is_overflow(mas)) {
-		mas->node = MAS_START;
+	case ma_start:
+		break;
+	case ma_none:
+		fallthrough;
+	case ma_pause:
+		mas->status = ma_start;
+		break;
+	case ma_underflow:
+		/* underflowed before but the min changed */
+		mas->status = ma_active;
+		break;
+	case ma_overflow:
+		/* User expects mas to be one after where it is */
+		mas->status = ma_active;
 		*entry = mas_walk(mas);
 		if (*entry)
 			return true;
-	}
-
-	if (mas_is_none(mas) || mas_is_paused(mas)) {
-		mas->node = MAS_START;
-	} else if (mas_is_underflow(mas)) {
-		/* underflowed before but the min changed */
-		mas->node = MAS_START;
+		break;
+	case ma_root:
+		break;
+	case ma_error:
+		return true;
 	}
 
 	if (mas_is_start(mas))
 		mas_walk(mas);
 
 	if (unlikely(mas_is_ptr(mas))) {
-		if (!mas->index)
-			goto none;
+		if (!mas->index) {
+			mas->status = ma_none;
+			return true;
+		}
 		mas->index = mas->last = 0;
 		*entry = mas_root(mas);
 		return true;
@@ -5821,7 +5862,7 @@ static bool mas_prev_setup(struct ma_state *mas, unsigned long min, void **entry
 		if (mas->index) {
 			/* Walked to out-of-range pointer? */
 			mas->index = mas->last = 0;
-			mas->node = MAS_ROOT;
+			mas->status = ma_root;
 			*entry = mas_root(mas);
 			return true;
 		}
@@ -5829,10 +5870,6 @@ static bool mas_prev_setup(struct ma_state *mas, unsigned long min, void **entry
 	}
 
 	return false;
-
-none:
-	mas->node = MAS_NONE;
-	return true;
 }
 
 /**
@@ -5841,7 +5878,7 @@ none:
  * @min: The minimum value to check.
  *
  * Must hold rcu_read_lock or the write lock.
- * Will reset mas to MAS_START if the node is MAS_NONE.  Will stop on not
+ * Will reset mas to ma_start if the status is ma_none.  Will stop on not
  * searchable nodes.
  *
  * Return: the previous value or %NULL.
@@ -5853,7 +5890,7 @@ void *mas_prev(struct ma_state *mas, unsigned long min)
 	if (mas_prev_setup(mas, min, &entry))
 		return entry;
 
-	return mas_prev_slot(mas, min, false, true);
+	return mas_prev_slot(mas, min, false);
 }
 EXPORT_SYMBOL_GPL(mas_prev);
 
@@ -5864,7 +5901,7 @@ EXPORT_SYMBOL_GPL(mas_prev);
  *
  * Sets @mas->index and @mas->last to the range.
  * Must hold rcu_read_lock or the write lock.
- * Will reset mas to MAS_START if the node is MAS_NONE.  Will stop on not
+ * Will reset mas to ma_start if the node is ma_none.  Will stop on not
  * searchable nodes.
  *
  * Return: the previous value or %NULL.
@@ -5876,7 +5913,7 @@ void *mas_prev_range(struct ma_state *mas, unsigned long min)
 	if (mas_prev_setup(mas, min, &entry))
 		return entry;
 
-	return mas_prev_slot(mas, min, true, true);
+	return mas_prev_slot(mas, min, true);
 }
 EXPORT_SYMBOL_GPL(mas_prev_range);
 
@@ -5919,7 +5956,8 @@ EXPORT_SYMBOL_GPL(mt_prev);
  */
 void mas_pause(struct ma_state *mas)
 {
-	mas->node = MAS_PAUSE;
+	mas->status = ma_pause;
+	mas->node = NULL;
 }
 EXPORT_SYMBOL_GPL(mas_pause);
 
@@ -5933,32 +5971,52 @@ EXPORT_SYMBOL_GPL(mas_pause);
  */
 static __always_inline bool mas_find_setup(struct ma_state *mas, unsigned long max, void **entry)
 {
-	if (mas_is_active(mas)) {
+	switch (mas->status) {
+	case ma_active:
 		if (mas->last < max)
 			return false;
-
 		return true;
-	}
-
-	if (mas_is_paused(mas)) {
+	case ma_start:
+		break;
+	case ma_pause:
 		if (unlikely(mas->last >= max))
 			return true;
 
 		mas->index = ++mas->last;
-		mas->node = MAS_START;
-	} else if (mas_is_none(mas)) {
+		mas->status = ma_start;
+		break;
+	case ma_none:
 		if (unlikely(mas->last >= max))
 			return true;
 
 		mas->index = mas->last;
-		mas->node = MAS_START;
-	} else if (mas_is_overflow(mas) || mas_is_underflow(mas)) {
-		if (mas->index > max) {
-			mas->node = MAS_OVERFLOW;
+		mas->status = ma_start;
+		break;
+	case ma_underflow:
+		/* mas is pointing at entry before unable to go lower */
+		if (unlikely(mas->index >= max)) {
+			mas->status = ma_overflow;
 			return true;
 		}
 
-		mas->node = MAS_START;
+		mas->status = ma_active;
+		*entry = mas_walk(mas);
+		if (*entry)
+			return true;
+		break;
+	case ma_overflow:
+		if (unlikely(mas->last >= max))
+			return true;
+
+		mas->status = ma_active;
+		*entry = mas_walk(mas);
+		if (*entry)
+			return true;
+		break;
+	case ma_root:
+		break;
+	case ma_error:
+		return true;
 	}
 
 	if (mas_is_start(mas)) {
@@ -5985,7 +6043,7 @@ static __always_inline bool mas_find_setup(struct ma_state *mas, unsigned long m
 	return false;
 
 ptr_out_of_range:
-	mas->node = MAS_NONE;
+	mas->status = ma_none;
 	mas->index = 1;
 	mas->last = ULONG_MAX;
 	return true;
@@ -5999,7 +6057,7 @@ ptr_out_of_range:
  *
  * Must hold rcu_read_lock or the write lock.
  * If an entry exists, last and index are updated accordingly.
- * May set @mas->node to MAS_NONE.
+ * May set @mas->status to ma_overflow.
  *
  * Return: The entry or %NULL.
  */
@@ -6011,7 +6069,10 @@ void *mas_find(struct ma_state *mas, unsigned long max)
 		return entry;
 
 	/* Retries on dead nodes handled by mas_next_slot */
-	return mas_next_slot(mas, max, false, false);
+	entry = mas_next_slot(mas, max, false);
+	/* Ignore overflow */
+	mas->status = ma_active;
+	return entry;
 }
 EXPORT_SYMBOL_GPL(mas_find);
 
@@ -6023,7 +6084,7 @@ EXPORT_SYMBOL_GPL(mas_find);
  *
  * Must hold rcu_read_lock or the write lock.
  * If an entry exists, last and index are updated accordingly.
- * May set @mas->node to MAS_NONE.
+ * May set @mas->status to ma_overflow.
  *
  * Return: The entry or %NULL.
  */
@@ -6035,7 +6096,7 @@ void *mas_find_range(struct ma_state *mas, unsigned long max)
 		return entry;
 
 	/* Retries on dead nodes handled by mas_next_slot */
-	return mas_next_slot(mas, max, true, false);
+	return mas_next_slot(mas, max, true);
 }
 EXPORT_SYMBOL_GPL(mas_find_range);
 
@@ -6050,33 +6111,45 @@ EXPORT_SYMBOL_GPL(mas_find_range);
 static bool mas_find_rev_setup(struct ma_state *mas, unsigned long min,
 		void **entry)
 {
-	if (mas_is_active(mas)) {
-		if (mas->index > min)
-			return false;
-
-		return true;
-	}
 
-	if (mas_is_paused(mas)) {
+	switch (mas->status) {
+	case ma_active:
+		goto active;
+	case ma_start:
+		break;
+	case ma_pause:
 		if (unlikely(mas->index <= min)) {
-			mas->node = MAS_NONE;
+			mas->status = ma_underflow;
 			return true;
 		}
-		mas->node = MAS_START;
 		mas->last = --mas->index;
-	} else if (mas_is_none(mas)) {
+		mas->status = ma_start;
+		break;
+	case ma_none:
 		if (mas->index <= min)
 			goto none;
 
 		mas->last = mas->index;
-		mas->node = MAS_START;
-	} else if (mas_is_underflow(mas) || mas_is_overflow(mas)) {
-		if (mas->last <= min) {
-			mas->node = MAS_UNDERFLOW;
+		mas->status = ma_start;
+		break;
+	case ma_overflow: /* user expects the mas to be one after where it is */
+		if (unlikely(mas->index <= min)) {
+			mas->status = ma_underflow;
 			return true;
 		}
 
-		mas->node = MAS_START;
+		mas->status = ma_active;
+		break;
+	case ma_underflow: /* user expects the mas to be one before where it is */
+		if (unlikely(mas->index <= min))
+			return true;
+
+		mas->status = ma_active;
+		break;
+	case ma_root:
+		break;
+	case ma_error:
+		return true;
 	}
 
 	if (mas_is_start(mas)) {
@@ -6099,19 +6172,20 @@ static bool mas_find_rev_setup(struct ma_state *mas, unsigned long min,
 			 * previous location is 0.
 			 */
 			mas->last = mas->index = 0;
-			mas->node = MAS_ROOT;
+			mas->status = ma_root;
 			*entry = mas_root(mas);
 			return true;
 		}
 	}
 
+active:
 	if (mas->index < min)
 		return true;
 
 	return false;
 
 none:
-	mas->node = MAS_NONE;
+	mas->status = ma_none;
 	return true;
 }
 
@@ -6124,7 +6198,7 @@ none:
  *
  * Must hold rcu_read_lock or the write lock.
  * If an entry exists, last and index are updated accordingly.
- * May set @mas->node to MAS_NONE.
+ * May set @mas->status to ma_underflow.
  *
  * Return: The entry or %NULL.
  */
@@ -6136,7 +6210,7 @@ void *mas_find_rev(struct ma_state *mas, unsigned long min)
 		return entry;
 
 	/* Retries on dead nodes handled by mas_prev_slot */
-	return mas_prev_slot(mas, min, false, false);
+	return mas_prev_slot(mas, min, false);
 
 }
 EXPORT_SYMBOL_GPL(mas_find_rev);
@@ -6150,7 +6224,7 @@ EXPORT_SYMBOL_GPL(mas_find_rev);
  *
  * Must hold rcu_read_lock or the write lock.
  * If an entry exists, last and index are updated accordingly.
- * May set @mas->node to MAS_NONE.
+ * May set @mas->status to ma_underflow.
  *
  * Return: The entry or %NULL.
  */
@@ -6162,7 +6236,7 @@ void *mas_find_range_rev(struct ma_state *mas, unsigned long min)
 		return entry;
 
 	/* Retries on dead nodes handled by mas_prev_slot */
-	return mas_prev_slot(mas, min, true, false);
+	return mas_prev_slot(mas, min, true);
 }
 EXPORT_SYMBOL_GPL(mas_find_range_rev);
 
@@ -6183,7 +6257,7 @@ void *mas_erase(struct ma_state *mas)
 	MA_WR_STATE(wr_mas, mas, NULL);
 
 	if (!mas_is_active(mas) || !mas_is_start(mas))
-		mas->node = MAS_START;
+		mas->status = ma_start;
 
 	/* Retry unnecessary when holding the write lock. */
 	entry = mas_state_walk(mas);
@@ -6228,7 +6302,7 @@ bool mas_nomem(struct ma_state *mas, gfp_t gfp)
 	if (!mas_allocated(mas))
 		return false;
 
-	mas->node = MAS_START;
+	mas->status = ma_start;
 	return true;
 }
 
@@ -6627,7 +6701,7 @@ static inline void mas_dup_build(struct ma_state *mas, struct ma_state *new_mas,
 
 	node = mt_alloc_one(gfp);
 	if (!node) {
-		new_mas->node = MAS_NONE;
+		new_mas->status = ma_none;
 		mas_set_err(mas, -ENOMEM);
 		return;
 	}
@@ -6971,11 +7045,11 @@ static inline struct maple_enode *mas_get_slot(struct ma_state *mas,
 static void mas_dfs_postorder(struct ma_state *mas, unsigned long max)
 {
 
-	struct maple_enode *p = MAS_NONE, *mn = mas->node;
+	struct maple_enode *p, *mn = mas->node;
 	unsigned long p_min, p_max;
 
 	mas_next_node(mas, mas_mn(mas), max);
-	if (!mas_is_none(mas))
+	if (!mas_is_overflow(mas))
 		return;
 
 	if (mte_is_root(mn))
@@ -6988,7 +7062,7 @@ static void mas_dfs_postorder(struct ma_state *mas, unsigned long max)
 		p_min = mas->min;
 		p_max = mas->max;
 		mas_prev_node(mas, 0);
-	} while (!mas_is_none(mas));
+	} while (!mas_is_underflow(mas));
 
 	mas->node = p;
 	mas->max = p_max;
@@ -7443,7 +7517,7 @@ static void mt_validate_nulls(struct maple_tree *mt)
 	MA_STATE(mas, mt, 0, 0);
 
 	mas_start(&mas);
-	if (mas_is_none(&mas) || (mas.node == MAS_ROOT))
+	if (mas_is_none(&mas) || (mas_is_ptr(&mas)))
 		return;
 
 	while (!mte_is_leaf(mas.node))
@@ -7460,7 +7534,7 @@ static void mt_validate_nulls(struct maple_tree *mt)
 		last = entry;
 		if (offset == mas_data_end(&mas)) {
 			mas_next_node(&mas, mas_mn(&mas), ULONG_MAX);
-			if (mas_is_none(&mas))
+			if (mas_is_overflow(&mas))
 				return;
 			offset = 0;
 			slots = ma_slots(mte_to_node(mas.node),
@@ -7469,7 +7543,7 @@ static void mt_validate_nulls(struct maple_tree *mt)
 			offset++;
 		}
 
-	} while (!mas_is_none(&mas));
+	} while (!mas_is_overflow(&mas));
 }
 
 /*
@@ -7490,7 +7564,7 @@ void mt_validate(struct maple_tree *mt)
 	while (!mte_is_leaf(mas.node))
 		mas_descend(&mas);
 
-	while (!mas_is_none(&mas)) {
+	while (!mas_is_overflow(&mas)) {
 		MAS_WARN_ON(&mas, mte_dead_node(mas.node));
 		end = mas_data_end(&mas);
 		if (MAS_WARN_ON(&mas, (end < mt_min_slot_count(mas.node)) &&
@@ -7515,16 +7589,35 @@ EXPORT_SYMBOL_GPL(mt_validate);
 void mas_dump(const struct ma_state *mas)
 {
 	pr_err("MAS: tree=%p enode=%p ", mas->tree, mas->node);
-	if (mas_is_none(mas))
-		pr_err("(MAS_NONE) ");
-	else if (mas_is_ptr(mas))
-		pr_err("(MAS_ROOT) ");
-	else if (mas_is_start(mas))
-		 pr_err("(MAS_START) ");
-	else if (mas_is_paused(mas))
-		pr_err("(MAS_PAUSED) ");
-
-	pr_err("[%u] index=%lx last=%lx\n", mas->offset, mas->index, mas->last);
+	switch (mas->status) {
+	case ma_active:
+		pr_err("(ma_active)");
+		break;
+	case ma_none:
+		pr_err("(ma_none)");
+		break;
+	case ma_root:
+		pr_err("(ma_root)");
+		break;
+	case ma_start:
+		pr_err("(ma_start) ");
+		break;
+	case ma_pause:
+		pr_err("(ma_pause) ");
+		break;
+	case ma_overflow:
+		pr_err("(ma_overflow) ");
+		break;
+	case ma_underflow:
+		pr_err("(ma_underflow) ");
+		break;
+	case ma_error:
+		pr_err("(ma_error) ");
+		break;
+	}
+
+	pr_err("[%u/%u] index=%lx last=%lx\n", mas->offset, mas->end,
+	       mas->index, mas->last);
 	pr_err("     min=%lx max=%lx alloc=%p, depth=%u, flags=%x\n",
 	       mas->min, mas->max, mas->alloc, mas->depth, mas->mas_flags);
 	if (mas->index > mas->last)
diff --git a/lib/test_maple_tree.c b/lib/test_maple_tree.c
index 3e4597fb49d3..e7a5d688c9e0 100644
--- a/lib/test_maple_tree.c
+++ b/lib/test_maple_tree.c
@@ -54,6 +54,11 @@ atomic_t maple_tree_tests_passed;
 #else
 #define cond_resched()			do {} while (0)
 #endif
+
+#define mas_is_none(x)		((x)->status == ma_none)
+#define mas_is_overflow(x)	((x)->status == ma_overflow)
+#define mas_is_underflow(x)	((x)->status == ma_underflow)
+
 static int __init mtree_insert_index(struct maple_tree *mt,
 				     unsigned long index, gfp_t gfp)
 {
@@ -582,7 +587,7 @@ static noinline void __init check_find(struct maple_tree *mt)
 	MT_BUG_ON(mt, last != mas.last);
 
 
-	mas.node = MAS_NONE;
+	mas.status = ma_none;
 	mas.index = ULONG_MAX;
 	mas.last = ULONG_MAX;
 	entry2 = mas_prev(&mas, 0);
@@ -2178,7 +2183,7 @@ static noinline void __init next_prev_test(struct maple_tree *mt)
 	MT_BUG_ON(mt, val != NULL);
 	MT_BUG_ON(mt, mas.index != 0);
 	MT_BUG_ON(mt, mas.last != 5);
-	MT_BUG_ON(mt, mas.node != MAS_UNDERFLOW);
+	MT_BUG_ON(mt, !mas_is_underflow(&mas));
 
 	mas.index = 0;
 	mas.last = 5;
@@ -3042,10 +3047,6 @@ static noinline void __init check_empty_area_fill(struct maple_tree *mt)
  *		DNE	active		active		range of NULL
  */
 
-#define mas_active(x)		(((x).node != MAS_ROOT) && \
-				 ((x).node != MAS_START) && \
-				 ((x).node != MAS_PAUSE) && \
-				 ((x).node != MAS_NONE))
 static noinline void __init check_state_handling(struct maple_tree *mt)
 {
 	MA_STATE(mas, mt, 0, 0);
@@ -3060,7 +3061,7 @@ static noinline void __init check_state_handling(struct maple_tree *mt)
 	/* prev: Start -> underflow*/
 	entry = mas_prev(&mas, 0);
 	MT_BUG_ON(mt, entry != NULL);
-	MT_BUG_ON(mt, mas.node != MAS_UNDERFLOW);
+	MT_BUG_ON(mt, mas.status != ma_underflow);
 
 	/* prev: Start -> root */
 	mas_set(&mas, 10);
@@ -3068,7 +3069,7 @@ static noinline void __init check_state_handling(struct maple_tree *mt)
 	MT_BUG_ON(mt, entry != ptr);
 	MT_BUG_ON(mt, mas.index != 0);
 	MT_BUG_ON(mt, mas.last != 0);
-	MT_BUG_ON(mt, mas.node != MAS_ROOT);
+	MT_BUG_ON(mt, mas.status != ma_root);
 
 	/* prev: pause -> root */
 	mas_set(&mas, 10);
@@ -3077,7 +3078,7 @@ static noinline void __init check_state_handling(struct maple_tree *mt)
 	MT_BUG_ON(mt, entry != ptr);
 	MT_BUG_ON(mt, mas.index != 0);
 	MT_BUG_ON(mt, mas.last != 0);
-	MT_BUG_ON(mt, mas.node != MAS_ROOT);
+	MT_BUG_ON(mt, mas.status != ma_root);
 
 	/* next: start -> none */
 	mas_set(&mas, 0);
@@ -3085,7 +3086,7 @@ static noinline void __init check_state_handling(struct maple_tree *mt)
 	MT_BUG_ON(mt, mas.index != 1);
 	MT_BUG_ON(mt, mas.last != ULONG_MAX);
 	MT_BUG_ON(mt, entry != NULL);
-	MT_BUG_ON(mt, mas.node != MAS_NONE);
+	MT_BUG_ON(mt, mas.status != ma_none);
 
 	/* next: start -> none*/
 	mas_set(&mas, 10);
@@ -3093,7 +3094,7 @@ static noinline void __init check_state_handling(struct maple_tree *mt)
 	MT_BUG_ON(mt, mas.index != 1);
 	MT_BUG_ON(mt, mas.last != ULONG_MAX);
 	MT_BUG_ON(mt, entry != NULL);
-	MT_BUG_ON(mt, mas.node != MAS_NONE);
+	MT_BUG_ON(mt, mas.status != ma_none);
 
 	/* find: start -> root */
 	mas_set(&mas, 0);
@@ -3101,21 +3102,21 @@ static noinline void __init check_state_handling(struct maple_tree *mt)
 	MT_BUG_ON(mt, entry != ptr);
 	MT_BUG_ON(mt, mas.index != 0);
 	MT_BUG_ON(mt, mas.last != 0);
-	MT_BUG_ON(mt, mas.node != MAS_ROOT);
+	MT_BUG_ON(mt, mas.status != ma_root);
 
 	/* find: root -> none */
 	entry = mas_find(&mas, ULONG_MAX);
 	MT_BUG_ON(mt, entry != NULL);
 	MT_BUG_ON(mt, mas.index != 1);
 	MT_BUG_ON(mt, mas.last != ULONG_MAX);
-	MT_BUG_ON(mt, mas.node != MAS_NONE);
+	MT_BUG_ON(mt, mas.status != ma_none);
 
 	/* find: none -> none */
 	entry = mas_find(&mas, ULONG_MAX);
 	MT_BUG_ON(mt, entry != NULL);
 	MT_BUG_ON(mt, mas.index != 1);
 	MT_BUG_ON(mt, mas.last != ULONG_MAX);
-	MT_BUG_ON(mt, mas.node != MAS_NONE);
+	MT_BUG_ON(mt, mas.status != ma_none);
 
 	/* find: start -> none */
 	mas_set(&mas, 10);
@@ -3123,14 +3124,14 @@ static noinline void __init check_state_handling(struct maple_tree *mt)
 	MT_BUG_ON(mt, entry != NULL);
 	MT_BUG_ON(mt, mas.index != 1);
 	MT_BUG_ON(mt, mas.last != ULONG_MAX);
-	MT_BUG_ON(mt, mas.node != MAS_NONE);
+	MT_BUG_ON(mt, mas.status != ma_none);
 
 	/* find_rev: none -> root */
 	entry = mas_find_rev(&mas, 0);
 	MT_BUG_ON(mt, entry != ptr);
 	MT_BUG_ON(mt, mas.index != 0);
 	MT_BUG_ON(mt, mas.last != 0);
-	MT_BUG_ON(mt, mas.node != MAS_ROOT);
+	MT_BUG_ON(mt, mas.status != ma_root);
 
 	/* find_rev: start -> root */
 	mas_set(&mas, 0);
@@ -3138,21 +3139,21 @@ static noinline void __init check_state_handling(struct maple_tree *mt)
 	MT_BUG_ON(mt, entry != ptr);
 	MT_BUG_ON(mt, mas.index != 0);
 	MT_BUG_ON(mt, mas.last != 0);
-	MT_BUG_ON(mt, mas.node != MAS_ROOT);
+	MT_BUG_ON(mt, mas.status != ma_root);
 
 	/* find_rev: root -> none */
 	entry = mas_find_rev(&mas, 0);
 	MT_BUG_ON(mt, entry != NULL);
 	MT_BUG_ON(mt, mas.index != 0);
 	MT_BUG_ON(mt, mas.last != 0);
-	MT_BUG_ON(mt, mas.node != MAS_NONE);
+	MT_BUG_ON(mt, mas.status != ma_none);
 
 	/* find_rev: none -> none */
 	entry = mas_find_rev(&mas, 0);
 	MT_BUG_ON(mt, entry != NULL);
 	MT_BUG_ON(mt, mas.index != 0);
 	MT_BUG_ON(mt, mas.last != 0);
-	MT_BUG_ON(mt, mas.node != MAS_NONE);
+	MT_BUG_ON(mt, mas.status != ma_none);
 
 	/* find_rev: start -> root */
 	mas_set(&mas, 10);
@@ -3160,7 +3161,7 @@ static noinline void __init check_state_handling(struct maple_tree *mt)
 	MT_BUG_ON(mt, entry != ptr);
 	MT_BUG_ON(mt, mas.index != 0);
 	MT_BUG_ON(mt, mas.last != 0);
-	MT_BUG_ON(mt, mas.node != MAS_ROOT);
+	MT_BUG_ON(mt, mas.status != ma_root);
 
 	/* walk: start -> none */
 	mas_set(&mas, 10);
@@ -3168,7 +3169,7 @@ static noinline void __init check_state_handling(struct maple_tree *mt)
 	MT_BUG_ON(mt, entry != NULL);
 	MT_BUG_ON(mt, mas.index != 1);
 	MT_BUG_ON(mt, mas.last != ULONG_MAX);
-	MT_BUG_ON(mt, mas.node != MAS_NONE);
+	MT_BUG_ON(mt, mas.status != ma_none);
 
 	/* walk: pause -> none*/
 	mas_set(&mas, 10);
@@ -3177,7 +3178,7 @@ static noinline void __init check_state_handling(struct maple_tree *mt)
 	MT_BUG_ON(mt, entry != NULL);
 	MT_BUG_ON(mt, mas.index != 1);
 	MT_BUG_ON(mt, mas.last != ULONG_MAX);
-	MT_BUG_ON(mt, mas.node != MAS_NONE);
+	MT_BUG_ON(mt, mas.status != ma_none);
 
 	/* walk: none -> none */
 	mas.index = mas.last = 10;
@@ -3185,14 +3186,14 @@ static noinline void __init check_state_handling(struct maple_tree *mt)
 	MT_BUG_ON(mt, entry != NULL);
 	MT_BUG_ON(mt, mas.index != 1);
 	MT_BUG_ON(mt, mas.last != ULONG_MAX);
-	MT_BUG_ON(mt, mas.node != MAS_NONE);
+	MT_BUG_ON(mt, mas.status != ma_none);
 
 	/* walk: none -> none */
 	entry = mas_walk(&mas);
 	MT_BUG_ON(mt, entry != NULL);
 	MT_BUG_ON(mt, mas.index != 1);
 	MT_BUG_ON(mt, mas.last != ULONG_MAX);
-	MT_BUG_ON(mt, mas.node != MAS_NONE);
+	MT_BUG_ON(mt, mas.status != ma_none);
 
 	/* walk: start -> root */
 	mas_set(&mas, 0);
@@ -3200,7 +3201,7 @@ static noinline void __init check_state_handling(struct maple_tree *mt)
 	MT_BUG_ON(mt, entry != ptr);
 	MT_BUG_ON(mt, mas.index != 0);
 	MT_BUG_ON(mt, mas.last != 0);
-	MT_BUG_ON(mt, mas.node != MAS_ROOT);
+	MT_BUG_ON(mt, mas.status != ma_root);
 
 	/* walk: pause -> root */
 	mas_set(&mas, 0);
@@ -3209,22 +3210,22 @@ static noinline void __init check_state_handling(struct maple_tree *mt)
 	MT_BUG_ON(mt, entry != ptr);
 	MT_BUG_ON(mt, mas.index != 0);
 	MT_BUG_ON(mt, mas.last != 0);
-	MT_BUG_ON(mt, mas.node != MAS_ROOT);
+	MT_BUG_ON(mt, mas.status != ma_root);
 
 	/* walk: none -> root */
-	mas.node = MAS_NONE;
+	mas.status = ma_none;
 	entry = mas_walk(&mas);
 	MT_BUG_ON(mt, entry != ptr);
 	MT_BUG_ON(mt, mas.index != 0);
 	MT_BUG_ON(mt, mas.last != 0);
-	MT_BUG_ON(mt, mas.node != MAS_ROOT);
+	MT_BUG_ON(mt, mas.status != ma_root);
 
 	/* walk: root -> root */
 	entry = mas_walk(&mas);
 	MT_BUG_ON(mt, entry != ptr);
 	MT_BUG_ON(mt, mas.index != 0);
 	MT_BUG_ON(mt, mas.last != 0);
-	MT_BUG_ON(mt, mas.node != MAS_ROOT);
+	MT_BUG_ON(mt, mas.status != ma_root);
 
 	/* walk: root -> none */
 	mas_set(&mas, 10);
@@ -3232,7 +3233,7 @@ static noinline void __init check_state_handling(struct maple_tree *mt)
 	MT_BUG_ON(mt, entry != NULL);
 	MT_BUG_ON(mt, mas.index != 1);
 	MT_BUG_ON(mt, mas.last != ULONG_MAX);
-	MT_BUG_ON(mt, mas.node != MAS_NONE);
+	MT_BUG_ON(mt, mas.status != ma_none);
 
 	/* walk: none -> root */
 	mas.index = mas.last = 0;
@@ -3240,7 +3241,7 @@ static noinline void __init check_state_handling(struct maple_tree *mt)
 	MT_BUG_ON(mt, entry != ptr);
 	MT_BUG_ON(mt, mas.index != 0);
 	MT_BUG_ON(mt, mas.last != 0);
-	MT_BUG_ON(mt, mas.node != MAS_ROOT);
+	MT_BUG_ON(mt, mas.status != ma_root);
 
 	mas_unlock(&mas);
 
@@ -3258,7 +3259,7 @@ static noinline void __init check_state_handling(struct maple_tree *mt)
 	MT_BUG_ON(mt, entry != ptr);
 	MT_BUG_ON(mt, mas.index != 0x1000);
 	MT_BUG_ON(mt, mas.last != 0x1500);
-	MT_BUG_ON(mt, !mas_active(mas));
+	MT_BUG_ON(mt, !mas_is_active(&mas));
 
 	/* next: pause ->active */
 	mas_set(&mas, 0);
@@ -3267,126 +3268,132 @@ static noinline void __init check_state_handling(struct maple_tree *mt)
 	MT_BUG_ON(mt, entry != ptr);
 	MT_BUG_ON(mt, mas.index != 0x1000);
 	MT_BUG_ON(mt, mas.last != 0x1500);
-	MT_BUG_ON(mt, !mas_active(mas));
+	MT_BUG_ON(mt, !mas_is_active(&mas));
 
 	/* next: none ->active */
 	mas.index = mas.last = 0;
 	mas.offset = 0;
-	mas.node = MAS_NONE;
+	mas.status = ma_none;
 	entry = mas_next(&mas, ULONG_MAX);
 	MT_BUG_ON(mt, entry != ptr);
 	MT_BUG_ON(mt, mas.index != 0x1000);
 	MT_BUG_ON(mt, mas.last != 0x1500);
-	MT_BUG_ON(mt, !mas_active(mas));
+	MT_BUG_ON(mt, !mas_is_active(&mas));
 
-	/* next:active ->active */
-	entry = mas_next(&mas, ULONG_MAX);
+	/* next:active ->active (spanning limit) */
+	entry = mas_next(&mas, 0x2100);
 	MT_BUG_ON(mt, entry != ptr2);
 	MT_BUG_ON(mt, mas.index != 0x2000);
 	MT_BUG_ON(mt, mas.last != 0x2500);
-	MT_BUG_ON(mt, !mas_active(mas));
+	MT_BUG_ON(mt, !mas_is_active(&mas));
 
-	/* next:active -> active beyond data */
+	/* next:active -> overflow (limit reached) beyond data */
 	entry = mas_next(&mas, 0x2999);
 	MT_BUG_ON(mt, entry != NULL);
 	MT_BUG_ON(mt, mas.index != 0x2501);
 	MT_BUG_ON(mt, mas.last != 0x2fff);
-	MT_BUG_ON(mt, !mas_active(mas));
+	MT_BUG_ON(mt, !mas_is_overflow(&mas));
 
-	/* Continue after last range ends after max */
+	/* next:overflow -> active (limit changed) */
 	entry = mas_next(&mas, ULONG_MAX);
 	MT_BUG_ON(mt, entry != ptr3);
 	MT_BUG_ON(mt, mas.index != 0x3000);
 	MT_BUG_ON(mt, mas.last != 0x3500);
-	MT_BUG_ON(mt, !mas_active(mas));
+	MT_BUG_ON(mt, !mas_is_active(&mas));
 
-	/* next:active -> active continued */
+	/* next:active ->  overflow (limit reached) */
 	entry = mas_next(&mas, ULONG_MAX);
 	MT_BUG_ON(mt, entry != NULL);
 	MT_BUG_ON(mt, mas.index != 0x3501);
 	MT_BUG_ON(mt, mas.last != ULONG_MAX);
-	MT_BUG_ON(mt, !mas_active(mas));
-
-	/* next:active -> overflow  */
-	entry = mas_next(&mas, ULONG_MAX);
-	MT_BUG_ON(mt, entry != NULL);
-	MT_BUG_ON(mt, mas.index != 0x3501);
-	MT_BUG_ON(mt, mas.last != ULONG_MAX);
-	MT_BUG_ON(mt, mas.node != MAS_OVERFLOW);
+	MT_BUG_ON(mt, !mas_is_overflow(&mas));
 
 	/* next:overflow -> overflow  */
 	entry = mas_next(&mas, ULONG_MAX);
 	MT_BUG_ON(mt, entry != NULL);
 	MT_BUG_ON(mt, mas.index != 0x3501);
 	MT_BUG_ON(mt, mas.last != ULONG_MAX);
-	MT_BUG_ON(mt, mas.node != MAS_OVERFLOW);
+	MT_BUG_ON(mt, !mas_is_overflow(&mas));
 
 	/* prev:overflow -> active  */
 	entry = mas_prev(&mas, 0);
 	MT_BUG_ON(mt, entry != ptr3);
 	MT_BUG_ON(mt, mas.index != 0x3000);
 	MT_BUG_ON(mt, mas.last != 0x3500);
-	MT_BUG_ON(mt, !mas_active(mas));
+	MT_BUG_ON(mt, !mas_is_active(&mas));
 
 	/* next: none -> active, skip value at location */
 	mas_set(&mas, 0);
 	entry = mas_next(&mas, ULONG_MAX);
-	mas.node = MAS_NONE;
+	mas.status = ma_none;
 	mas.offset = 0;
 	entry = mas_next(&mas, ULONG_MAX);
 	MT_BUG_ON(mt, entry != ptr2);
 	MT_BUG_ON(mt, mas.index != 0x2000);
 	MT_BUG_ON(mt, mas.last != 0x2500);
-	MT_BUG_ON(mt, !mas_active(mas));
+	MT_BUG_ON(mt, !mas_is_active(&mas));
 
 	/* prev:active ->active */
 	entry = mas_prev(&mas, 0);
 	MT_BUG_ON(mt, entry != ptr);
 	MT_BUG_ON(mt, mas.index != 0x1000);
 	MT_BUG_ON(mt, mas.last != 0x1500);
-	MT_BUG_ON(mt, !mas_active(mas));
+	MT_BUG_ON(mt, !mas_is_active(&mas));
 
-	/* prev:active -> active spanning end range */
+	/* prev:active -> underflow (span limit) */
+	mas_next(&mas, ULONG_MAX);
+	entry = mas_prev(&mas, 0x1200);
+	MT_BUG_ON(mt, entry != ptr);
+	MT_BUG_ON(mt, mas.index != 0x1000);
+	MT_BUG_ON(mt, mas.last != 0x1500);
+	MT_BUG_ON(mt, !mas_is_active(&mas)); /* spanning limit */
+	entry = mas_prev(&mas, 0x1200); /* underflow */
+	MT_BUG_ON(mt, entry != NULL);
+	MT_BUG_ON(mt, mas.index != 0x1000);
+	MT_BUG_ON(mt, mas.last != 0x1500);
+	MT_BUG_ON(mt, !mas_is_underflow(&mas));
+
+	/* prev:underflow -> underflow (lower limit) spanning end range */
 	entry = mas_prev(&mas, 0x0100);
 	MT_BUG_ON(mt, entry != NULL);
 	MT_BUG_ON(mt, mas.index != 0);
 	MT_BUG_ON(mt, mas.last != 0x0FFF);
-	MT_BUG_ON(mt, !mas_active(mas));
+	MT_BUG_ON(mt, !mas_is_underflow(&mas));
 
-	/* prev:active -> underflow */
+	/* prev:underflow -> underflow */
 	entry = mas_prev(&mas, 0);
 	MT_BUG_ON(mt, entry != NULL);
 	MT_BUG_ON(mt, mas.index != 0);
 	MT_BUG_ON(mt, mas.last != 0x0FFF);
-	MT_BUG_ON(mt, mas.node != MAS_UNDERFLOW);
+	MT_BUG_ON(mt, !mas_is_underflow(&mas));
 
 	/* prev:underflow -> underflow */
 	entry = mas_prev(&mas, 0);
 	MT_BUG_ON(mt, entry != NULL);
 	MT_BUG_ON(mt, mas.index != 0);
 	MT_BUG_ON(mt, mas.last != 0x0FFF);
-	MT_BUG_ON(mt, mas.node != MAS_UNDERFLOW);
+	MT_BUG_ON(mt, !mas_is_underflow(&mas));
 
 	/* next:underflow -> active */
 	entry = mas_next(&mas, ULONG_MAX);
 	MT_BUG_ON(mt, entry != ptr);
 	MT_BUG_ON(mt, mas.index != 0x1000);
 	MT_BUG_ON(mt, mas.last != 0x1500);
-	MT_BUG_ON(mt, !mas_active(mas));
+	MT_BUG_ON(mt, !mas_is_active(&mas));
 
 	/* prev:first value -> underflow */
 	entry = mas_prev(&mas, 0x1000);
 	MT_BUG_ON(mt, entry != NULL);
 	MT_BUG_ON(mt, mas.index != 0x1000);
 	MT_BUG_ON(mt, mas.last != 0x1500);
-	MT_BUG_ON(mt, mas.node != MAS_UNDERFLOW);
+	MT_BUG_ON(mt, !mas_is_underflow(&mas));
 
 	/* find:underflow -> first value */
 	entry = mas_find(&mas, ULONG_MAX);
 	MT_BUG_ON(mt, entry != ptr);
 	MT_BUG_ON(mt, mas.index != 0x1000);
 	MT_BUG_ON(mt, mas.last != 0x1500);
-	MT_BUG_ON(mt, !mas_active(mas));
+	MT_BUG_ON(mt, !mas_is_active(&mas));
 
 	/* prev: pause ->active */
 	mas_set(&mas, 0x3600);
@@ -3397,21 +3404,21 @@ static noinline void __init check_state_handling(struct maple_tree *mt)
 	MT_BUG_ON(mt, entry != ptr2);
 	MT_BUG_ON(mt, mas.index != 0x2000);
 	MT_BUG_ON(mt, mas.last != 0x2500);
-	MT_BUG_ON(mt, !mas_active(mas));
+	MT_BUG_ON(mt, !mas_is_active(&mas));
 
-	/* prev:active -> active spanning min */
+	/* prev:active -> underflow spanning min */
 	entry = mas_prev(&mas, 0x1600);
 	MT_BUG_ON(mt, entry != NULL);
 	MT_BUG_ON(mt, mas.index != 0x1501);
 	MT_BUG_ON(mt, mas.last != 0x1FFF);
-	MT_BUG_ON(mt, !mas_active(mas));
+	MT_BUG_ON(mt, !mas_is_underflow(&mas));
 
 	/* prev: active ->active, continue */
 	entry = mas_prev(&mas, 0);
 	MT_BUG_ON(mt, entry != ptr);
 	MT_BUG_ON(mt, mas.index != 0x1000);
 	MT_BUG_ON(mt, mas.last != 0x1500);
-	MT_BUG_ON(mt, !mas_active(mas));
+	MT_BUG_ON(mt, !mas_is_active(&mas));
 
 	/* find: start ->active */
 	mas_set(&mas, 0);
@@ -3419,7 +3426,7 @@ static noinline void __init check_state_handling(struct maple_tree *mt)
 	MT_BUG_ON(mt, entry != ptr);
 	MT_BUG_ON(mt, mas.index != 0x1000);
 	MT_BUG_ON(mt, mas.last != 0x1500);
-	MT_BUG_ON(mt, !mas_active(mas));
+	MT_BUG_ON(mt, !mas_is_active(&mas));
 
 	/* find: pause ->active */
 	mas_set(&mas, 0);
@@ -3428,7 +3435,7 @@ static noinline void __init check_state_handling(struct maple_tree *mt)
 	MT_BUG_ON(mt, entry != ptr);
 	MT_BUG_ON(mt, mas.index != 0x1000);
 	MT_BUG_ON(mt, mas.last != 0x1500);
-	MT_BUG_ON(mt, !mas_active(mas));
+	MT_BUG_ON(mt, !mas_is_active(&mas));
 
 	/* find: start ->active on value */;
 	mas_set(&mas, 1200);
@@ -3436,14 +3443,14 @@ static noinline void __init check_state_handling(struct maple_tree *mt)
 	MT_BUG_ON(mt, entry != ptr);
 	MT_BUG_ON(mt, mas.index != 0x1000);
 	MT_BUG_ON(mt, mas.last != 0x1500);
-	MT_BUG_ON(mt, !mas_active(mas));
+	MT_BUG_ON(mt, !mas_is_active(&mas));
 
 	/* find:active ->active */
 	entry = mas_find(&mas, ULONG_MAX);
 	MT_BUG_ON(mt, entry != ptr2);
 	MT_BUG_ON(mt, mas.index != 0x2000);
 	MT_BUG_ON(mt, mas.last != 0x2500);
-	MT_BUG_ON(mt, !mas_active(mas));
+	MT_BUG_ON(mt, !mas_is_active(&mas));
 
 
 	/* find:active -> active (NULL)*/
@@ -3451,35 +3458,35 @@ static noinline void __init check_state_handling(struct maple_tree *mt)
 	MT_BUG_ON(mt, entry != NULL);
 	MT_BUG_ON(mt, mas.index != 0x2501);
 	MT_BUG_ON(mt, mas.last != 0x2FFF);
-	MT_BUG_ON(mt, !mas_active(mas));
+	MAS_BUG_ON(&mas, !mas_is_active(&mas));
 
 	/* find: overflow ->active */
 	entry = mas_find(&mas, 0x5000);
 	MT_BUG_ON(mt, entry != ptr3);
 	MT_BUG_ON(mt, mas.index != 0x3000);
 	MT_BUG_ON(mt, mas.last != 0x3500);
-	MT_BUG_ON(mt, !mas_active(mas));
+	MT_BUG_ON(mt, !mas_is_active(&mas));
 
 	/* find:active -> active (NULL) end*/
 	entry = mas_find(&mas, ULONG_MAX);
 	MT_BUG_ON(mt, entry != NULL);
 	MT_BUG_ON(mt, mas.index != 0x3501);
 	MT_BUG_ON(mt, mas.last != ULONG_MAX);
-	MT_BUG_ON(mt, !mas_active(mas));
+	MAS_BUG_ON(&mas, !mas_is_active(&mas));
 
 	/* find_rev: active (END) ->active */
 	entry = mas_find_rev(&mas, 0);
 	MT_BUG_ON(mt, entry != ptr3);
 	MT_BUG_ON(mt, mas.index != 0x3000);
 	MT_BUG_ON(mt, mas.last != 0x3500);
-	MT_BUG_ON(mt, !mas_active(mas));
+	MT_BUG_ON(mt, !mas_is_active(&mas));
 
 	/* find_rev:active ->active */
 	entry = mas_find_rev(&mas, 0);
 	MT_BUG_ON(mt, entry != ptr2);
 	MT_BUG_ON(mt, mas.index != 0x2000);
 	MT_BUG_ON(mt, mas.last != 0x2500);
-	MT_BUG_ON(mt, !mas_active(mas));
+	MT_BUG_ON(mt, !mas_is_active(&mas));
 
 	/* find_rev: pause ->active */
 	mas_pause(&mas);
@@ -3487,14 +3494,14 @@ static noinline void __init check_state_handling(struct maple_tree *mt)
 	MT_BUG_ON(mt, entry != ptr);
 	MT_BUG_ON(mt, mas.index != 0x1000);
 	MT_BUG_ON(mt, mas.last != 0x1500);
-	MT_BUG_ON(mt, !mas_active(mas));
+	MT_BUG_ON(mt, !mas_is_active(&mas));
 
-	/* find_rev:active -> active */
+	/* find_rev:active -> underflow */
 	entry = mas_find_rev(&mas, 0);
 	MT_BUG_ON(mt, entry != NULL);
 	MT_BUG_ON(mt, mas.index != 0);
 	MT_BUG_ON(mt, mas.last != 0x0FFF);
-	MT_BUG_ON(mt, !mas_active(mas));
+	MT_BUG_ON(mt, !mas_is_underflow(&mas));
 
 	/* find_rev: start ->active */
 	mas_set(&mas, 0x1200);
@@ -3502,7 +3509,7 @@ static noinline void __init check_state_handling(struct maple_tree *mt)
 	MT_BUG_ON(mt, entry != ptr);
 	MT_BUG_ON(mt, mas.index != 0x1000);
 	MT_BUG_ON(mt, mas.last != 0x1500);
-	MT_BUG_ON(mt, !mas_active(mas));
+	MT_BUG_ON(mt, !mas_is_active(&mas));
 
 	/* mas_walk start ->active */
 	mas_set(&mas, 0x1200);
@@ -3510,7 +3517,7 @@ static noinline void __init check_state_handling(struct maple_tree *mt)
 	MT_BUG_ON(mt, entry != ptr);
 	MT_BUG_ON(mt, mas.index != 0x1000);
 	MT_BUG_ON(mt, mas.last != 0x1500);
-	MT_BUG_ON(mt, !mas_active(mas));
+	MT_BUG_ON(mt, !mas_is_active(&mas));
 
 	/* mas_walk start ->active */
 	mas_set(&mas, 0x1600);
@@ -3518,7 +3525,7 @@ static noinline void __init check_state_handling(struct maple_tree *mt)
 	MT_BUG_ON(mt, entry != NULL);
 	MT_BUG_ON(mt, mas.index != 0x1501);
 	MT_BUG_ON(mt, mas.last != 0x1fff);
-	MT_BUG_ON(mt, !mas_active(mas));
+	MT_BUG_ON(mt, !mas_is_active(&mas));
 
 	/* mas_walk pause ->active */
 	mas_set(&mas, 0x1200);
@@ -3527,7 +3534,7 @@ static noinline void __init check_state_handling(struct maple_tree *mt)
 	MT_BUG_ON(mt, entry != ptr);
 	MT_BUG_ON(mt, mas.index != 0x1000);
 	MT_BUG_ON(mt, mas.last != 0x1500);
-	MT_BUG_ON(mt, !mas_active(mas));
+	MT_BUG_ON(mt, !mas_is_active(&mas));
 
 	/* mas_walk pause -> active */
 	mas_set(&mas, 0x1600);
@@ -3536,25 +3543,25 @@ static noinline void __init check_state_handling(struct maple_tree *mt)
 	MT_BUG_ON(mt, entry != NULL);
 	MT_BUG_ON(mt, mas.index != 0x1501);
 	MT_BUG_ON(mt, mas.last != 0x1fff);
-	MT_BUG_ON(mt, !mas_active(mas));
+	MT_BUG_ON(mt, !mas_is_active(&mas));
 
 	/* mas_walk none -> active */
 	mas_set(&mas, 0x1200);
-	mas.node = MAS_NONE;
+	mas.status = ma_none;
 	entry = mas_walk(&mas);
 	MT_BUG_ON(mt, entry != ptr);
 	MT_BUG_ON(mt, mas.index != 0x1000);
 	MT_BUG_ON(mt, mas.last != 0x1500);
-	MT_BUG_ON(mt, !mas_active(mas));
+	MT_BUG_ON(mt, !mas_is_active(&mas));
 
 	/* mas_walk none -> active */
 	mas_set(&mas, 0x1600);
-	mas.node = MAS_NONE;
+	mas.status = ma_none;
 	entry = mas_walk(&mas);
 	MT_BUG_ON(mt, entry != NULL);
 	MT_BUG_ON(mt, mas.index != 0x1501);
 	MT_BUG_ON(mt, mas.last != 0x1fff);
-	MT_BUG_ON(mt, !mas_active(mas));
+	MT_BUG_ON(mt, !mas_is_active(&mas));
 
 	/* mas_walk active -> active */
 	mas.index = 0x1200;
@@ -3564,7 +3571,7 @@ static noinline void __init check_state_handling(struct maple_tree *mt)
 	MT_BUG_ON(mt, entry != ptr);
 	MT_BUG_ON(mt, mas.index != 0x1000);
 	MT_BUG_ON(mt, mas.last != 0x1500);
-	MT_BUG_ON(mt, !mas_active(mas));
+	MT_BUG_ON(mt, !mas_is_active(&mas));
 
 	/* mas_walk active -> active */
 	mas.index = 0x1600;
@@ -3573,7 +3580,7 @@ static noinline void __init check_state_handling(struct maple_tree *mt)
 	MT_BUG_ON(mt, entry != NULL);
 	MT_BUG_ON(mt, mas.index != 0x1501);
 	MT_BUG_ON(mt, mas.last != 0x1fff);
-	MT_BUG_ON(mt, !mas_active(mas));
+	MT_BUG_ON(mt, !mas_is_active(&mas));
 
 	mas_unlock(&mas);
 }
diff --git a/mm/internal.h b/mm/internal.h
index 0005b8adbd5c..8450562744cf 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -1163,13 +1163,13 @@ static inline void vma_iter_store(struct vma_iterator *vmi,
 {
 
 #if defined(CONFIG_DEBUG_VM_MAPLE_TREE)
-	if (MAS_WARN_ON(&vmi->mas, vmi->mas.node != MAS_START &&
+	if (MAS_WARN_ON(&vmi->mas, vmi->mas.status != ma_start &&
 			vmi->mas.index > vma->vm_start)) {
 		pr_warn("%lx > %lx\n store vma %lx-%lx\n into slot %lx-%lx\n",
 			vmi->mas.index, vma->vm_start, vma->vm_start,
 			vma->vm_end, vmi->mas.index, vmi->mas.last);
 	}
-	if (MAS_WARN_ON(&vmi->mas, vmi->mas.node != MAS_START &&
+	if (MAS_WARN_ON(&vmi->mas, vmi->mas.status != ma_start &&
 			vmi->mas.last <  vma->vm_start)) {
 		pr_warn("%lx < %lx\nstore vma %lx-%lx\ninto slot %lx-%lx\n",
 		       vmi->mas.last, vma->vm_start, vma->vm_start, vma->vm_end,
@@ -1177,7 +1177,7 @@ static inline void vma_iter_store(struct vma_iterator *vmi,
 	}
 #endif
 
-	if (vmi->mas.node != MAS_START &&
+	if (vmi->mas.status != ma_start &&
 	    ((vmi->mas.index > vma->vm_start) || (vmi->mas.last < vma->vm_start)))
 		vma_iter_invalidate(vmi);
 
@@ -1188,7 +1188,7 @@ static inline void vma_iter_store(struct vma_iterator *vmi,
 static inline int vma_iter_store_gfp(struct vma_iterator *vmi,
 			struct vm_area_struct *vma, gfp_t gfp)
 {
-	if (vmi->mas.node != MAS_START &&
+	if (vmi->mas.status != ma_start &&
 	    ((vmi->mas.index > vma->vm_start) || (vmi->mas.last < vma->vm_start)))
 		vma_iter_invalidate(vmi);
 
diff --git a/tools/testing/radix-tree/maple.c b/tools/testing/radix-tree/maple.c
index 7095fb0ec026..857c439e6bbc 100644
--- a/tools/testing/radix-tree/maple.c
+++ b/tools/testing/radix-tree/maple.c
@@ -118,6 +118,7 @@ static noinline void __init check_new_node(struct maple_tree *mt)
 	MT_BUG_ON(mt, mas.alloc == NULL);
 	MT_BUG_ON(mt, mas.alloc->slot[0] == NULL);
 	mas_push_node(&mas, mn);
+	mas_reset(&mas);
 	mas_nomem(&mas, GFP_KERNEL); /* free */
 	mtree_unlock(mt);
 
@@ -141,7 +142,7 @@ static noinline void __init check_new_node(struct maple_tree *mt)
 
 	mn->parent = ma_parent_ptr(mn);
 	ma_free_rcu(mn);
-	mas.node = MAS_START;
+	mas.status = ma_start;
 	mas_nomem(&mas, GFP_KERNEL);
 	/* Allocate 3 nodes, will fail. */
 	mas_node_count(&mas, 3);
@@ -158,6 +159,7 @@ static noinline void __init check_new_node(struct maple_tree *mt)
 	/* Ensure we counted 3. */
 	MT_BUG_ON(mt, mas_allocated(&mas) != 3);
 	/* Free. */
+	mas_reset(&mas);
 	mas_nomem(&mas, GFP_KERNEL);
 
 	/* Set allocation request to 1. */
@@ -272,6 +274,7 @@ static noinline void __init check_new_node(struct maple_tree *mt)
 			ma_free_rcu(mn);
 			MT_BUG_ON(mt, mas_allocated(&mas) != i - j - 1);
 		}
+		mas_reset(&mas);
 		MT_BUG_ON(mt, mas_nomem(&mas, GFP_KERNEL));
 
 	}
@@ -294,6 +297,7 @@ static noinline void __init check_new_node(struct maple_tree *mt)
 		smn = smn->slot[0]; /* next. */
 	}
 	MT_BUG_ON(mt, mas_allocated(&mas) != total);
+	mas_reset(&mas);
 	mas_nomem(&mas, GFP_KERNEL); /* Free. */
 
 	MT_BUG_ON(mt, mas_allocated(&mas) != 0);
@@ -441,7 +445,7 @@ static noinline void __init check_new_node(struct maple_tree *mt)
 	mas.node = MA_ERROR(-ENOMEM);
 	mas_node_count(&mas, 10); /* Request */
 	mas_nomem(&mas, GFP_KERNEL); /* Fill request */
-	mas.node = MAS_START;
+	mas.status = ma_start;
 	MT_BUG_ON(mt, mas_allocated(&mas) != 10);
 	mas_destroy(&mas);
 
@@ -452,7 +456,7 @@ static noinline void __init check_new_node(struct maple_tree *mt)
 	mas.node = MA_ERROR(-ENOMEM);
 	mas_node_count(&mas, 10 + MAPLE_ALLOC_SLOTS - 1); /* Request */
 	mas_nomem(&mas, GFP_KERNEL); /* Fill request */
-	mas.node = MAS_START;
+	mas.status = ma_start;
 	MT_BUG_ON(mt, mas_allocated(&mas) != 10 + MAPLE_ALLOC_SLOTS - 1);
 	mas_destroy(&mas);
 
@@ -941,7 +945,7 @@ retry:
 
 	ret = mas_descend_walk(mas, range_min, range_max);
 	if (unlikely(mte_dead_node(mas->node))) {
-		mas->node = MAS_START;
+		mas->status = ma_start;
 		goto retry;
 	}
 
@@ -961,10 +965,10 @@ static inline void *mas_range_load(struct ma_state *mas,
 	unsigned long index = mas->index;
 
 	if (mas_is_none(mas) || mas_is_paused(mas))
-		mas->node = MAS_START;
+		mas->status = ma_start;
 retry:
 	if (mas_tree_walk(mas, range_min, range_max))
-		if (unlikely(mas->node == MAS_ROOT))
+		if (unlikely(mas->status == ma_root))
 			return mas_root(mas);
 
 	if (likely(mas->offset != MAPLE_NODE_SLOTS))
@@ -35337,7 +35341,7 @@ static void mas_dfs_preorder(struct ma_state *mas)
 	unsigned char end, slot = 0;
 	unsigned long *pivots;
 
-	if (mas->node == MAS_START) {
+	if (mas->status == ma_start) {
 		mas_start(mas);
 		return;
 	}
@@ -35374,7 +35378,7 @@ walk_up:
 
 	return;
 done:
-	mas->node = MAS_NONE;
+	mas->status = ma_none;
 }
 
 
@@ -35833,7 +35837,7 @@ static noinline void __init check_nomem(struct maple_tree *mt)
 	mas_store(&ms, &ms); /* insert 1 -> &ms, fails. */
 	MT_BUG_ON(mt, ms.node != MA_ERROR(-ENOMEM));
 	mas_nomem(&ms, GFP_KERNEL); /* Node allocated in here. */
-	MT_BUG_ON(mt, ms.node != MAS_START);
+	MT_BUG_ON(mt, ms.status != ma_start);
 	mtree_unlock(mt);
 	MT_BUG_ON(mt, mtree_insert(mt, 2, mt, GFP_KERNEL) != 0);
 	mtree_lock(mt);
@@ -35952,7 +35956,7 @@ static int __init compare_tree(struct maple_tree *mt_a, struct maple_tree *mt_b)
 
 	if (mas_is_ptr(&mas_a) || mas_is_ptr(&mas_b)) {
 		if (!(mas_is_ptr(&mas_a) && mas_is_ptr(&mas_b))) {
-			pr_err("One is MAS_ROOT and the other is not.\n");
+			pr_err("One is ma_root and the other is not.\n");
 			return -1;
 		}
 		return 0;
@@ -35961,7 +35965,7 @@ static int __init compare_tree(struct maple_tree *mt_a, struct maple_tree *mt_b)
 	while (!mas_is_none(&mas_a) || !mas_is_none(&mas_b)) {
 
 		if (mas_is_none(&mas_a) || mas_is_none(&mas_b)) {
-			pr_err("One is MAS_NONE and the other is not.\n");
+			pr_err("One is ma_none and the other is not.\n");
 			return -1;
 		}
 
-- 
cgit v1.2.3


From 0de56e38b307b0cb2ac825e8e7cb371a28daf844 Mon Sep 17 00:00:00 2001
From: "Liam R. Howlett" <Liam.Howlett@oracle.com>
Date: Wed, 1 Nov 2023 13:16:27 -0400
Subject: maple_tree: use maple state end for write operations

ma_wr_state was previously tracking the end of the node for writing.
Since the implementation of the ma_state end tracking, this is duplicated
work.  This patch removes the maple write state tracking of the end of the
node and uses the maple state end instead.

Link: https://lkml.kernel.org/r/20231101171629.3612299-11-Liam.Howlett@oracle.com
Signed-off-by: Liam R. Howlett <Liam.Howlett@oracle.com>
Cc: Peng Zhang <zhangpeng.00@bytedance.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/maple_tree.h |  1 -
 lib/maple_tree.c           | 46 ++++++++++++++++++++++++----------------------
 2 files changed, 24 insertions(+), 23 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/maple_tree.h b/include/linux/maple_tree.h
index 4dd668f7b111..b3d63123b945 100644
--- a/include/linux/maple_tree.h
+++ b/include/linux/maple_tree.h
@@ -441,7 +441,6 @@ struct ma_wr_state {
 	unsigned long r_max;		/* range max */
 	enum maple_type type;		/* mas->node type */
 	unsigned char offset_end;	/* The offset where the write ends */
-	unsigned char node_end;		/* mas->node end */
 	unsigned long *pivots;		/* mas->node->pivots pointer */
 	unsigned long end_piv;		/* The pivot at the offset end */
 	void __rcu **slots;		/* mas->node->slots pointer */
diff --git a/lib/maple_tree.c b/lib/maple_tree.c
index c7016066f12b..59500fe6988b 100644
--- a/lib/maple_tree.c
+++ b/lib/maple_tree.c
@@ -2158,11 +2158,11 @@ static noinline_for_kasan void mas_store_b_node(struct ma_wr_state *wr_mas,
 	}
 
 	slot = offset_end + 1;
-	if (slot > wr_mas->node_end)
+	if (slot > mas->end)
 		goto b_end;
 
 	/* Copy end data to the end of the node. */
-	mas_mab_cp(mas, slot, wr_mas->node_end + 1, b_node, ++b_end);
+	mas_mab_cp(mas, slot, mas->end + 1, b_node, ++b_end);
 	b_node->b_end--;
 	return;
 
@@ -2253,8 +2253,8 @@ static inline void mas_wr_node_walk(struct ma_wr_state *wr_mas)
 
 	wr_mas->node = mas_mn(wr_mas->mas);
 	wr_mas->pivots = ma_pivots(wr_mas->node, wr_mas->type);
-	count = wr_mas->node_end = ma_data_end(wr_mas->node, wr_mas->type,
-					       wr_mas->pivots, mas->max);
+	count = mas->end = ma_data_end(wr_mas->node, wr_mas->type,
+				       wr_mas->pivots, mas->max);
 	offset = mas->offset;
 
 	while (offset < count && mas->index > wr_mas->pivots[offset])
@@ -3904,10 +3904,10 @@ static inline int mas_wr_spanning_store(struct ma_wr_state *wr_mas)
 
 	memset(&b_node, 0, sizeof(struct maple_big_node));
 	/* Copy l_mas and store the value in b_node. */
-	mas_store_b_node(&l_wr_mas, &b_node, l_wr_mas.node_end);
+	mas_store_b_node(&l_wr_mas, &b_node, l_mas.end);
 	/* Copy r_mas into b_node. */
-	if (r_mas.offset <= r_wr_mas.node_end)
-		mas_mab_cp(&r_mas, r_mas.offset, r_wr_mas.node_end,
+	if (r_mas.offset <= r_mas.end)
+		mas_mab_cp(&r_mas, r_mas.offset, r_mas.end,
 			   &b_node, b_node.b_end + 1);
 	else
 		b_node.b_end++;
@@ -3949,7 +3949,7 @@ static inline bool mas_wr_node_store(struct ma_wr_state *wr_mas,
 	if (mas->last == wr_mas->end_piv)
 		offset_end++; /* don't copy this offset */
 	else if (unlikely(wr_mas->r_max == ULONG_MAX))
-		mas_bulk_rebalance(mas, wr_mas->node_end, wr_mas->type);
+		mas_bulk_rebalance(mas, mas->end, wr_mas->type);
 
 	/* set up node. */
 	if (in_rcu) {
@@ -3985,12 +3985,12 @@ static inline bool mas_wr_node_store(struct ma_wr_state *wr_mas,
 	 * this range wrote to the end of the node or it overwrote the rest of
 	 * the data
 	 */
-	if (offset_end > wr_mas->node_end)
+	if (offset_end > mas->end)
 		goto done;
 
 	dst_offset = mas->offset + 1;
 	/* Copy to the end of node if necessary. */
-	copy_size = wr_mas->node_end - offset_end + 1;
+	copy_size = mas->end - offset_end + 1;
 	memcpy(dst_slots + dst_offset, wr_mas->slots + offset_end,
 	       sizeof(void *) * copy_size);
 	memcpy(dst_pivots + dst_offset, wr_mas->pivots + offset_end,
@@ -4077,10 +4077,10 @@ static inline void mas_wr_extend_null(struct ma_wr_state *wr_mas)
 	} else {
 		/* Check next slot(s) if we are overwriting the end */
 		if ((mas->last == wr_mas->end_piv) &&
-		    (wr_mas->node_end != wr_mas->offset_end) &&
+		    (mas->end != wr_mas->offset_end) &&
 		    !wr_mas->slots[wr_mas->offset_end + 1]) {
 			wr_mas->offset_end++;
-			if (wr_mas->offset_end == wr_mas->node_end)
+			if (wr_mas->offset_end == mas->end)
 				mas->last = mas->max;
 			else
 				mas->last = wr_mas->pivots[wr_mas->offset_end];
@@ -4105,11 +4105,11 @@ static inline void mas_wr_extend_null(struct ma_wr_state *wr_mas)
 
 static inline void mas_wr_end_piv(struct ma_wr_state *wr_mas)
 {
-	while ((wr_mas->offset_end < wr_mas->node_end) &&
+	while ((wr_mas->offset_end < wr_mas->mas->end) &&
 	       (wr_mas->mas->last > wr_mas->pivots[wr_mas->offset_end]))
 		wr_mas->offset_end++;
 
-	if (wr_mas->offset_end < wr_mas->node_end)
+	if (wr_mas->offset_end < wr_mas->mas->end)
 		wr_mas->end_piv = wr_mas->pivots[wr_mas->offset_end];
 	else
 		wr_mas->end_piv = wr_mas->mas->max;
@@ -4121,7 +4121,7 @@ static inline void mas_wr_end_piv(struct ma_wr_state *wr_mas)
 static inline unsigned char mas_wr_new_end(struct ma_wr_state *wr_mas)
 {
 	struct ma_state *mas = wr_mas->mas;
-	unsigned char new_end = wr_mas->node_end + 2;
+	unsigned char new_end = mas->end + 2;
 
 	new_end -= wr_mas->offset_end - mas->offset;
 	if (wr_mas->r_min == mas->index)
@@ -4155,10 +4155,10 @@ static inline bool mas_wr_append(struct ma_wr_state *wr_mas,
 	if (mt_in_rcu(mas->tree))
 		return false;
 
-	if (mas->offset != wr_mas->node_end)
+	if (mas->offset != mas->end)
 		return false;
 
-	end = wr_mas->node_end;
+	end = mas->end;
 	if (mas->offset != end)
 		return false;
 
@@ -4210,7 +4210,7 @@ static void mas_wr_bnode(struct ma_wr_state *wr_mas)
 	trace_ma_write(__func__, wr_mas->mas, 0, wr_mas->entry);
 	memset(&b_node, 0, sizeof(struct maple_big_node));
 	mas_store_b_node(wr_mas, &b_node, wr_mas->offset_end);
-	mas_commit_b_node(wr_mas, &b_node, wr_mas->node_end);
+	mas_commit_b_node(wr_mas, &b_node, wr_mas->mas->end);
 }
 
 static inline void mas_wr_modify(struct ma_wr_state *wr_mas)
@@ -4238,7 +4238,7 @@ static inline void mas_wr_modify(struct ma_wr_state *wr_mas)
 	if (mas_wr_append(wr_mas, new_end))
 		return;
 
-	if (new_end == wr_mas->node_end && mas_wr_slot_store(wr_mas))
+	if (new_end == mas->end && mas_wr_slot_store(wr_mas))
 		return;
 
 	if (mas_wr_node_store(wr_mas, new_end))
@@ -5052,6 +5052,7 @@ int mas_empty_area(struct ma_state *mas, unsigned long min,
 	unsigned char offset;
 	unsigned long *pivots;
 	enum maple_type mt;
+	struct maple_node *node;
 
 	if (min > max)
 		return -EINVAL;
@@ -5082,13 +5083,14 @@ int mas_empty_area(struct ma_state *mas, unsigned long min,
 	if (unlikely(offset == MAPLE_NODE_SLOTS))
 		return -EBUSY;
 
+	node = mas_mn(mas);
 	mt = mte_node_type(mas->node);
-	pivots = ma_pivots(mas_mn(mas), mt);
+	pivots = ma_pivots(node, mt);
 	min = mas_safe_min(mas, pivots, offset);
 	if (mas->index < min)
 		mas->index = min;
 	mas->last = mas->index + size - 1;
-	mas->end = mas_data_end(mas);
+	mas->end = ma_data_end(node, mt, pivots, mas->max);
 	return 0;
 }
 EXPORT_SYMBOL_GPL(mas_empty_area);
@@ -7596,7 +7598,7 @@ void mas_wr_dump(const struct ma_wr_state *wr_mas)
 	pr_err("WR_MAS: node=%p r_min=%lx r_max=%lx\n",
 	       wr_mas->node, wr_mas->r_min, wr_mas->r_max);
 	pr_err("        type=%u off_end=%u, node_end=%u, end_piv=%lx\n",
-	       wr_mas->type, wr_mas->offset_end, wr_mas->node_end,
+	       wr_mas->type, wr_mas->offset_end, wr_mas->mas->end,
 	       wr_mas->end_piv);
 }
 EXPORT_SYMBOL_GPL(mas_wr_dump);
-- 
cgit v1.2.3


From 0a97c01cd20bb96359d8c9dedad92a061ed34e0b Mon Sep 17 00:00:00 2001
From: Nhat Pham <nphamcs@gmail.com>
Date: Thu, 30 Nov 2023 11:40:18 -0800
Subject: list_lru: allow explicit memcg and NUMA node selection

Patch series "workload-specific and memory pressure-driven zswap
writeback", v8.

There are currently several issues with zswap writeback:

1. There is only a single global LRU for zswap, making it impossible to
   perform worload-specific shrinking - an memcg under memory pressure
   cannot determine which pages in the pool it owns, and often ends up
   writing pages from other memcgs. This issue has been previously
   observed in practice and mitigated by simply disabling
   memcg-initiated shrinking:

   https://lore.kernel.org/all/20230530232435.3097106-1-nphamcs@gmail.com/T/#u

   But this solution leaves a lot to be desired, as we still do not
   have an avenue for an memcg to free up its own memory locked up in
   the zswap pool.

2. We only shrink the zswap pool when the user-defined limit is hit.
   This means that if we set the limit too high, cold data that are
   unlikely to be used again will reside in the pool, wasting precious
   memory. It is hard to predict how much zswap space will be needed
   ahead of time, as this depends on the workload (specifically, on
   factors such as memory access patterns and compressibility of the
   memory pages).

This patch series solves these issues by separating the global zswap LRU
into per-memcg and per-NUMA LRUs, and performs workload-specific (i.e
memcg- and NUMA-aware) zswap writeback under memory pressure.  The new
shrinker does not have any parameter that must be tuned by the user, and
can be opted in or out on a per-memcg basis.

As a proof of concept, we ran the following synthetic benchmark: build the
linux kernel in a memory-limited cgroup, and allocate some cold data in
tmpfs to see if the shrinker could write them out and improved the overall
performance.  Depending on the amount of cold data generated, we observe
from 14% to 35% reduction in kernel CPU time used in the kernel builds.


This patch (of 6):

The interface of list_lru is based on the assumption that the list node
and the data it represents belong to the same allocated on the correct
node/memcg.  While this assumption is valid for existing slab objects LRU
such as dentries and inodes, it is undocumented, and rather inflexible for
certain potential list_lru users (such as the upcoming zswap shrinker and
the THP shrinker).  It has caused us a lot of issues during our
development.

This patch changes list_lru interface so that the caller must explicitly
specify numa node and memcg when adding and removing objects.  The old
list_lru_add() and list_lru_del() are renamed to list_lru_add_obj() and
list_lru_del_obj(), respectively.

It also extends the list_lru API with a new function, list_lru_putback,
which undoes a previous list_lru_isolate call.  Unlike list_lru_add, it
does not increment the LRU node count (as list_lru_isolate does not
decrement the node count).  list_lru_putback also allows for explicit
memcg and NUMA node selection.

Link: https://lkml.kernel.org/r/20231130194023.4102148-1-nphamcs@gmail.com
Link: https://lkml.kernel.org/r/20231130194023.4102148-2-nphamcs@gmail.com
Signed-off-by: Nhat Pham <nphamcs@gmail.com>
Suggested-by: Johannes Weiner <hannes@cmpxchg.org>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Tested-by: Bagas Sanjaya <bagasdotme@gmail.com>
Cc: Chris Li <chrisl@kernel.org>
Cc: Dan Streetman <ddstreet@ieee.org>
Cc: Domenico Cerasuolo <cerasuolodomenico@gmail.com>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Muchun Song <muchun.song@linux.dev>
Cc: Roman Gushchin <roman.gushchin@linux.dev>
Cc: Seth Jennings <sjenning@redhat.com>
Cc: Shakeel Butt <shakeelb@google.com>
Cc: Shuah Khan <shuah@kernel.org>
Cc: Vitaly Wool <vitaly.wool@konsulko.com>
Cc: Yosry Ahmed <yosryahmed@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 drivers/android/binder_alloc.c |  7 +++---
 fs/dcache.c                    |  8 ++++---
 fs/gfs2/quota.c                |  6 ++---
 fs/inode.c                     |  4 ++--
 fs/nfs/nfs42xattr.c            |  8 +++----
 fs/nfsd/filecache.c            |  4 ++--
 fs/xfs/xfs_buf.c               |  6 ++---
 fs/xfs/xfs_dquot.c             |  2 +-
 fs/xfs/xfs_qm.c                |  2 +-
 include/linux/list_lru.h       | 54 +++++++++++++++++++++++++++++++++++++++---
 mm/list_lru.c                  | 48 ++++++++++++++++++++++++++++++-------
 mm/workingset.c                |  4 ++--
 12 files changed, 117 insertions(+), 36 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/android/binder_alloc.c b/drivers/android/binder_alloc.c
index 138f6d43d13b..f69d30c9f50f 100644
--- a/drivers/android/binder_alloc.c
+++ b/drivers/android/binder_alloc.c
@@ -234,7 +234,7 @@ static int binder_update_page_range(struct binder_alloc *alloc, int allocate,
 		if (page->page_ptr) {
 			trace_binder_alloc_lru_start(alloc, index);
 
-			on_lru = list_lru_del(&binder_alloc_lru, &page->lru);
+			on_lru = list_lru_del_obj(&binder_alloc_lru, &page->lru);
 			WARN_ON(!on_lru);
 
 			trace_binder_alloc_lru_end(alloc, index);
@@ -285,7 +285,7 @@ free_range:
 
 		trace_binder_free_lru_start(alloc, index);
 
-		ret = list_lru_add(&binder_alloc_lru, &page->lru);
+		ret = list_lru_add_obj(&binder_alloc_lru, &page->lru);
 		WARN_ON(!ret);
 
 		trace_binder_free_lru_end(alloc, index);
@@ -848,7 +848,7 @@ void binder_alloc_deferred_release(struct binder_alloc *alloc)
 			if (!alloc->pages[i].page_ptr)
 				continue;
 
-			on_lru = list_lru_del(&binder_alloc_lru,
+			on_lru = list_lru_del_obj(&binder_alloc_lru,
 					      &alloc->pages[i].lru);
 			page_addr = alloc->buffer + i * PAGE_SIZE;
 			binder_alloc_debug(BINDER_DEBUG_BUFFER_ALLOC,
@@ -1287,4 +1287,3 @@ int binder_alloc_copy_from_buffer(struct binder_alloc *alloc,
 	return binder_alloc_do_buffer_copy(alloc, false, buffer, buffer_offset,
 					   dest, bytes);
 }
-
diff --git a/fs/dcache.c b/fs/dcache.c
index c82ae731df9a..2ba37643b9c5 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -428,7 +428,8 @@ static void d_lru_add(struct dentry *dentry)
 	this_cpu_inc(nr_dentry_unused);
 	if (d_is_negative(dentry))
 		this_cpu_inc(nr_dentry_negative);
-	WARN_ON_ONCE(!list_lru_add(&dentry->d_sb->s_dentry_lru, &dentry->d_lru));
+	WARN_ON_ONCE(!list_lru_add_obj(
+			&dentry->d_sb->s_dentry_lru, &dentry->d_lru));
 }
 
 static void d_lru_del(struct dentry *dentry)
@@ -438,7 +439,8 @@ static void d_lru_del(struct dentry *dentry)
 	this_cpu_dec(nr_dentry_unused);
 	if (d_is_negative(dentry))
 		this_cpu_dec(nr_dentry_negative);
-	WARN_ON_ONCE(!list_lru_del(&dentry->d_sb->s_dentry_lru, &dentry->d_lru));
+	WARN_ON_ONCE(!list_lru_del_obj(
+			&dentry->d_sb->s_dentry_lru, &dentry->d_lru));
 }
 
 static void d_shrink_del(struct dentry *dentry)
@@ -1240,7 +1242,7 @@ static enum lru_status dentry_lru_isolate(struct list_head *item,
 		 *
 		 * This is guaranteed by the fact that all LRU management
 		 * functions are intermediated by the LRU API calls like
-		 * list_lru_add and list_lru_del. List movement in this file
+		 * list_lru_add_obj and list_lru_del_obj. List movement in this file
 		 * only ever occur through this functions or through callbacks
 		 * like this one, that are called from the LRU API.
 		 *
diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c
index 95dae7838b4e..b57f8c7b35be 100644
--- a/fs/gfs2/quota.c
+++ b/fs/gfs2/quota.c
@@ -271,7 +271,7 @@ static struct gfs2_quota_data *gfs2_qd_search_bucket(unsigned int hash,
 		if (qd->qd_sbd != sdp)
 			continue;
 		if (lockref_get_not_dead(&qd->qd_lockref)) {
-			list_lru_del(&gfs2_qd_lru, &qd->qd_lru);
+			list_lru_del_obj(&gfs2_qd_lru, &qd->qd_lru);
 			return qd;
 		}
 	}
@@ -344,7 +344,7 @@ static void qd_put(struct gfs2_quota_data *qd)
 	}
 
 	qd->qd_lockref.count = 0;
-	list_lru_add(&gfs2_qd_lru, &qd->qd_lru);
+	list_lru_add_obj(&gfs2_qd_lru, &qd->qd_lru);
 	spin_unlock(&qd->qd_lockref.lock);
 }
 
@@ -1517,7 +1517,7 @@ void gfs2_quota_cleanup(struct gfs2_sbd *sdp)
 		lockref_mark_dead(&qd->qd_lockref);
 		spin_unlock(&qd->qd_lockref.lock);
 
-		list_lru_del(&gfs2_qd_lru, &qd->qd_lru);
+		list_lru_del_obj(&gfs2_qd_lru, &qd->qd_lru);
 		list_add(&qd->qd_lru, &dispose);
 	}
 	spin_unlock(&qd_lock);
diff --git a/fs/inode.c b/fs/inode.c
index f238d987dec9..ef2034a985e0 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -464,7 +464,7 @@ static void __inode_add_lru(struct inode *inode, bool rotate)
 	if (!mapping_shrinkable(&inode->i_data))
 		return;
 
-	if (list_lru_add(&inode->i_sb->s_inode_lru, &inode->i_lru))
+	if (list_lru_add_obj(&inode->i_sb->s_inode_lru, &inode->i_lru))
 		this_cpu_inc(nr_unused);
 	else if (rotate)
 		inode->i_state |= I_REFERENCED;
@@ -482,7 +482,7 @@ void inode_add_lru(struct inode *inode)
 
 static void inode_lru_list_del(struct inode *inode)
 {
-	if (list_lru_del(&inode->i_sb->s_inode_lru, &inode->i_lru))
+	if (list_lru_del_obj(&inode->i_sb->s_inode_lru, &inode->i_lru))
 		this_cpu_dec(nr_unused);
 }
 
diff --git a/fs/nfs/nfs42xattr.c b/fs/nfs/nfs42xattr.c
index 2ad66a8922f4..49aaf28a6950 100644
--- a/fs/nfs/nfs42xattr.c
+++ b/fs/nfs/nfs42xattr.c
@@ -132,7 +132,7 @@ nfs4_xattr_entry_lru_add(struct nfs4_xattr_entry *entry)
 	lru = (entry->flags & NFS4_XATTR_ENTRY_EXTVAL) ?
 	    &nfs4_xattr_large_entry_lru : &nfs4_xattr_entry_lru;
 
-	return list_lru_add(lru, &entry->lru);
+	return list_lru_add_obj(lru, &entry->lru);
 }
 
 static bool
@@ -143,7 +143,7 @@ nfs4_xattr_entry_lru_del(struct nfs4_xattr_entry *entry)
 	lru = (entry->flags & NFS4_XATTR_ENTRY_EXTVAL) ?
 	    &nfs4_xattr_large_entry_lru : &nfs4_xattr_entry_lru;
 
-	return list_lru_del(lru, &entry->lru);
+	return list_lru_del_obj(lru, &entry->lru);
 }
 
 /*
@@ -349,7 +349,7 @@ nfs4_xattr_cache_unlink(struct inode *inode)
 
 	oldcache = nfsi->xattr_cache;
 	if (oldcache != NULL) {
-		list_lru_del(&nfs4_xattr_cache_lru, &oldcache->lru);
+		list_lru_del_obj(&nfs4_xattr_cache_lru, &oldcache->lru);
 		oldcache->inode = NULL;
 	}
 	nfsi->xattr_cache = NULL;
@@ -474,7 +474,7 @@ nfs4_xattr_get_cache(struct inode *inode, int add)
 			kref_get(&cache->ref);
 			nfsi->xattr_cache = cache;
 			cache->inode = inode;
-			list_lru_add(&nfs4_xattr_cache_lru, &cache->lru);
+			list_lru_add_obj(&nfs4_xattr_cache_lru, &cache->lru);
 		}
 
 		spin_unlock(&inode->i_lock);
diff --git a/fs/nfsd/filecache.c b/fs/nfsd/filecache.c
index ef063f93fde9..6c2decfdeb4b 100644
--- a/fs/nfsd/filecache.c
+++ b/fs/nfsd/filecache.c
@@ -322,7 +322,7 @@ nfsd_file_check_writeback(struct nfsd_file *nf)
 static bool nfsd_file_lru_add(struct nfsd_file *nf)
 {
 	set_bit(NFSD_FILE_REFERENCED, &nf->nf_flags);
-	if (list_lru_add(&nfsd_file_lru, &nf->nf_lru)) {
+	if (list_lru_add_obj(&nfsd_file_lru, &nf->nf_lru)) {
 		trace_nfsd_file_lru_add(nf);
 		return true;
 	}
@@ -331,7 +331,7 @@ static bool nfsd_file_lru_add(struct nfsd_file *nf)
 
 static bool nfsd_file_lru_remove(struct nfsd_file *nf)
 {
-	if (list_lru_del(&nfsd_file_lru, &nf->nf_lru)) {
+	if (list_lru_del_obj(&nfsd_file_lru, &nf->nf_lru)) {
 		trace_nfsd_file_lru_del(nf);
 		return true;
 	}
diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c
index 545c7991b9b5..669332849680 100644
--- a/fs/xfs/xfs_buf.c
+++ b/fs/xfs/xfs_buf.c
@@ -169,7 +169,7 @@ xfs_buf_stale(
 
 	atomic_set(&bp->b_lru_ref, 0);
 	if (!(bp->b_state & XFS_BSTATE_DISPOSE) &&
-	    (list_lru_del(&bp->b_target->bt_lru, &bp->b_lru)))
+	    (list_lru_del_obj(&bp->b_target->bt_lru, &bp->b_lru)))
 		atomic_dec(&bp->b_hold);
 
 	ASSERT(atomic_read(&bp->b_hold) >= 1);
@@ -1047,7 +1047,7 @@ xfs_buf_rele(
 		 * buffer for the LRU and clear the (now stale) dispose list
 		 * state flag
 		 */
-		if (list_lru_add(&bp->b_target->bt_lru, &bp->b_lru)) {
+		if (list_lru_add_obj(&bp->b_target->bt_lru, &bp->b_lru)) {
 			bp->b_state &= ~XFS_BSTATE_DISPOSE;
 			atomic_inc(&bp->b_hold);
 		}
@@ -1060,7 +1060,7 @@ xfs_buf_rele(
 		 * was on was the disposal list
 		 */
 		if (!(bp->b_state & XFS_BSTATE_DISPOSE)) {
-			list_lru_del(&bp->b_target->bt_lru, &bp->b_lru);
+			list_lru_del_obj(&bp->b_target->bt_lru, &bp->b_lru);
 		} else {
 			ASSERT(list_empty(&bp->b_lru));
 		}
diff --git a/fs/xfs/xfs_dquot.c b/fs/xfs/xfs_dquot.c
index a013b87ab8d5..61a45a86ffe8 100644
--- a/fs/xfs/xfs_dquot.c
+++ b/fs/xfs/xfs_dquot.c
@@ -1065,7 +1065,7 @@ xfs_qm_dqput(
 		struct xfs_quotainfo	*qi = dqp->q_mount->m_quotainfo;
 		trace_xfs_dqput_free(dqp);
 
-		if (list_lru_add(&qi->qi_lru, &dqp->q_lru))
+		if (list_lru_add_obj(&qi->qi_lru, &dqp->q_lru))
 			XFS_STATS_INC(dqp->q_mount, xs_qm_dquot_unused);
 	}
 	xfs_dqunlock(dqp);
diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c
index 94a7932ac570..67d0a8564ff3 100644
--- a/fs/xfs/xfs_qm.c
+++ b/fs/xfs/xfs_qm.c
@@ -171,7 +171,7 @@ xfs_qm_dqpurge(
 	 * hits zero, so it really should be on the freelist here.
 	 */
 	ASSERT(!list_empty(&dqp->q_lru));
-	list_lru_del(&qi->qi_lru, &dqp->q_lru);
+	list_lru_del_obj(&qi->qi_lru, &dqp->q_lru);
 	XFS_STATS_DEC(dqp->q_mount, xs_qm_dquot_unused);
 
 	xfs_qm_dqdestroy(dqp);
diff --git a/include/linux/list_lru.h b/include/linux/list_lru.h
index db86ad78d428..7675a48a0701 100644
--- a/include/linux/list_lru.h
+++ b/include/linux/list_lru.h
@@ -75,6 +75,8 @@ void memcg_reparent_list_lrus(struct mem_cgroup *memcg, struct mem_cgroup *paren
  * list_lru_add: add an element to the lru list's tail
  * @lru: the lru pointer
  * @item: the item to be added.
+ * @nid: the node id of the sublist to add the item to.
+ * @memcg: the cgroup of the sublist to add the item to.
  *
  * If the element is already part of a list, this function returns doing
  * nothing. Therefore the caller does not need to keep state about whether or
@@ -87,12 +89,28 @@ void memcg_reparent_list_lrus(struct mem_cgroup *memcg, struct mem_cgroup *paren
  *
  * Return: true if the list was updated, false otherwise
  */
-bool list_lru_add(struct list_lru *lru, struct list_head *item);
+bool list_lru_add(struct list_lru *lru, struct list_head *item, int nid,
+		    struct mem_cgroup *memcg);
 
 /**
- * list_lru_del: delete an element to the lru list
+ * list_lru_add_obj: add an element to the lru list's tail
+ * @lru: the lru pointer
+ * @item: the item to be added.
+ *
+ * This function is similar to list_lru_add(), but the NUMA node and the
+ * memcg of the sublist is determined by @item list_head. This assumption is
+ * valid for slab objects LRU such as dentries, inodes, etc.
+ *
+ * Return value: true if the list was updated, false otherwise
+ */
+bool list_lru_add_obj(struct list_lru *lru, struct list_head *item);
+
+/**
+ * list_lru_del: delete an element from the lru list
  * @lru: the lru pointer
  * @item: the item to be deleted.
+ * @nid: the node id of the sublist to delete the item from.
+ * @memcg: the cgroup of the sublist to delete the item from.
  *
  * This function works analogously as list_lru_add() in terms of list
  * manipulation. The comments about an element already pertaining to
@@ -100,7 +118,21 @@ bool list_lru_add(struct list_lru *lru, struct list_head *item);
  *
  * Return: true if the list was updated, false otherwise
  */
-bool list_lru_del(struct list_lru *lru, struct list_head *item);
+bool list_lru_del(struct list_lru *lru, struct list_head *item, int nid,
+		    struct mem_cgroup *memcg);
+
+/**
+ * list_lru_del_obj: delete an element from the lru list
+ * @lru: the lru pointer
+ * @item: the item to be deleted.
+ *
+ * This function is similar to list_lru_del(), but the NUMA node and the
+ * memcg of the sublist is determined by @item list_head. This assumption is
+ * valid for slab objects LRU such as dentries, inodes, etc.
+ *
+ * Return value: true if the list was updated, false otherwise.
+ */
+bool list_lru_del_obj(struct list_lru *lru, struct list_head *item);
 
 /**
  * list_lru_count_one: return the number of objects currently held by @lru
@@ -138,6 +170,22 @@ static inline unsigned long list_lru_count(struct list_lru *lru)
 void list_lru_isolate(struct list_lru_one *list, struct list_head *item);
 void list_lru_isolate_move(struct list_lru_one *list, struct list_head *item,
 			   struct list_head *head);
+/**
+ * list_lru_putback: undo list_lru_isolate
+ * @lru: the lru pointer.
+ * @item: the item to put back.
+ * @nid: the node id of the sublist to put the item back to.
+ * @memcg: the cgroup of the sublist to put the item back to.
+ *
+ * Put back an isolated item into its original LRU. Note that unlike
+ * list_lru_add, this does not increment the node LRU count (as
+ * list_lru_isolate does not originally decrement this count).
+ *
+ * Since we might have dropped the LRU lock in between, recompute list_lru_one
+ * from the node's id and memcg.
+ */
+void list_lru_putback(struct list_lru *lru, struct list_head *item, int nid,
+		      struct mem_cgroup *memcg);
 
 typedef enum lru_status (*list_lru_walk_cb)(struct list_head *item,
 		struct list_lru_one *list, spinlock_t *lock, void *cb_arg);
diff --git a/mm/list_lru.c b/mm/list_lru.c
index a05e5bef3b40..fcca67ac26ec 100644
--- a/mm/list_lru.c
+++ b/mm/list_lru.c
@@ -116,21 +116,19 @@ list_lru_from_kmem(struct list_lru *lru, int nid, void *ptr,
 }
 #endif /* CONFIG_MEMCG_KMEM */
 
-bool list_lru_add(struct list_lru *lru, struct list_head *item)
+bool list_lru_add(struct list_lru *lru, struct list_head *item, int nid,
+		    struct mem_cgroup *memcg)
 {
-	int nid = page_to_nid(virt_to_page(item));
 	struct list_lru_node *nlru = &lru->node[nid];
-	struct mem_cgroup *memcg;
 	struct list_lru_one *l;
 
 	spin_lock(&nlru->lock);
 	if (list_empty(item)) {
-		l = list_lru_from_kmem(lru, nid, item, &memcg);
+		l = list_lru_from_memcg_idx(lru, nid, memcg_kmem_id(memcg));
 		list_add_tail(item, &l->list);
 		/* Set shrinker bit if the first element was added */
 		if (!l->nr_items++)
-			set_shrinker_bit(memcg, nid,
-					 lru_shrinker_id(lru));
+			set_shrinker_bit(memcg, nid, lru_shrinker_id(lru));
 		nlru->nr_items++;
 		spin_unlock(&nlru->lock);
 		return true;
@@ -140,15 +138,25 @@ bool list_lru_add(struct list_lru *lru, struct list_head *item)
 }
 EXPORT_SYMBOL_GPL(list_lru_add);
 
-bool list_lru_del(struct list_lru *lru, struct list_head *item)
+bool list_lru_add_obj(struct list_lru *lru, struct list_head *item)
 {
 	int nid = page_to_nid(virt_to_page(item));
+	struct mem_cgroup *memcg = list_lru_memcg_aware(lru) ?
+		mem_cgroup_from_slab_obj(item) : NULL;
+
+	return list_lru_add(lru, item, nid, memcg);
+}
+EXPORT_SYMBOL_GPL(list_lru_add_obj);
+
+bool list_lru_del(struct list_lru *lru, struct list_head *item, int nid,
+		    struct mem_cgroup *memcg)
+{
 	struct list_lru_node *nlru = &lru->node[nid];
 	struct list_lru_one *l;
 
 	spin_lock(&nlru->lock);
 	if (!list_empty(item)) {
-		l = list_lru_from_kmem(lru, nid, item, NULL);
+		l = list_lru_from_memcg_idx(lru, nid, memcg_kmem_id(memcg));
 		list_del_init(item);
 		l->nr_items--;
 		nlru->nr_items--;
@@ -160,6 +168,16 @@ bool list_lru_del(struct list_lru *lru, struct list_head *item)
 }
 EXPORT_SYMBOL_GPL(list_lru_del);
 
+bool list_lru_del_obj(struct list_lru *lru, struct list_head *item)
+{
+	int nid = page_to_nid(virt_to_page(item));
+	struct mem_cgroup *memcg = list_lru_memcg_aware(lru) ?
+		mem_cgroup_from_slab_obj(item) : NULL;
+
+	return list_lru_del(lru, item, nid, memcg);
+}
+EXPORT_SYMBOL_GPL(list_lru_del_obj);
+
 void list_lru_isolate(struct list_lru_one *list, struct list_head *item)
 {
 	list_del_init(item);
@@ -175,6 +193,20 @@ void list_lru_isolate_move(struct list_lru_one *list, struct list_head *item,
 }
 EXPORT_SYMBOL_GPL(list_lru_isolate_move);
 
+void list_lru_putback(struct list_lru *lru, struct list_head *item, int nid,
+		      struct mem_cgroup *memcg)
+{
+	struct list_lru_one *list =
+		list_lru_from_memcg_idx(lru, nid, memcg_kmem_id(memcg));
+
+	if (list_empty(item)) {
+		list_add_tail(item, &list->list);
+		if (!list->nr_items++)
+			set_shrinker_bit(memcg, nid, lru_shrinker_id(lru));
+	}
+}
+EXPORT_SYMBOL_GPL(list_lru_putback);
+
 unsigned long list_lru_count_one(struct list_lru *lru,
 				 int nid, struct mem_cgroup *memcg)
 {
diff --git a/mm/workingset.c b/mm/workingset.c
index b192e44a0e7c..c17d45c6f29b 100644
--- a/mm/workingset.c
+++ b/mm/workingset.c
@@ -615,12 +615,12 @@ void workingset_update_node(struct xa_node *node)
 
 	if (node->count && node->count == node->nr_values) {
 		if (list_empty(&node->private_list)) {
-			list_lru_add(&shadow_nodes, &node->private_list);
+			list_lru_add_obj(&shadow_nodes, &node->private_list);
 			__inc_lruvec_kmem_state(node, WORKINGSET_NODES);
 		}
 	} else {
 		if (!list_empty(&node->private_list)) {
-			list_lru_del(&shadow_nodes, &node->private_list);
+			list_lru_del_obj(&shadow_nodes, &node->private_list);
 			__dec_lruvec_kmem_state(node, WORKINGSET_NODES);
 		}
 	}
-- 
cgit v1.2.3


From fdc4161ff6a5e96222e159c1f1b28d31a985130d Mon Sep 17 00:00:00 2001
From: Nhat Pham <nphamcs@gmail.com>
Date: Thu, 30 Nov 2023 11:40:19 -0800
Subject: memcontrol: implement mem_cgroup_tryget_online()

This patch implements a helper function that try to get a reference to an
memcg's css, as well as checking if it is online.  This new function is
almost exactly the same as the existing mem_cgroup_tryget(), except for
the onlineness check.  In the !CONFIG_MEMCG case, it always returns true,
analogous to mem_cgroup_tryget().  This is useful for e.g to the new zswap
writeback scheme, where we need to select the next online memcg as a
candidate for the global limit reclaim.

Link: https://lkml.kernel.org/r/20231130194023.4102148-3-nphamcs@gmail.com
Signed-off-by: Nhat Pham <nphamcs@gmail.com>
Tested-by: Bagas Sanjaya <bagasdotme@gmail.com>
Reviewed-by: Yosry Ahmed <yosryahmed@google.com>
Cc: Chris Li <chrisl@kernel.org>
Cc: Dan Streetman <ddstreet@ieee.org>
Cc: Domenico Cerasuolo <cerasuolodomenico@gmail.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Muchun Song <muchun.song@linux.dev>
Cc: Roman Gushchin <roman.gushchin@linux.dev>
Cc: Seth Jennings <sjenning@redhat.com>
Cc: Shakeel Butt <shakeelb@google.com>
Cc: Shuah Khan <shuah@kernel.org>
Cc: Vitaly Wool <vitaly.wool@konsulko.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/memcontrol.h | 10 ++++++++++
 1 file changed, 10 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 7bdcf3020d7a..2bd7d14ace78 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -821,6 +821,11 @@ static inline bool mem_cgroup_tryget(struct mem_cgroup *memcg)
 	return !memcg || css_tryget(&memcg->css);
 }
 
+static inline bool mem_cgroup_tryget_online(struct mem_cgroup *memcg)
+{
+	return !memcg || css_tryget_online(&memcg->css);
+}
+
 static inline void mem_cgroup_put(struct mem_cgroup *memcg)
 {
 	if (memcg)
@@ -1349,6 +1354,11 @@ static inline bool mem_cgroup_tryget(struct mem_cgroup *memcg)
 	return true;
 }
 
+static inline bool mem_cgroup_tryget_online(struct mem_cgroup *memcg)
+{
+	return true;
+}
+
 static inline void mem_cgroup_put(struct mem_cgroup *memcg)
 {
 }
-- 
cgit v1.2.3


From a65b0e7607ccb5e5184591f73e48512f25c76061 Mon Sep 17 00:00:00 2001
From: Domenico Cerasuolo <cerasuolodomenico@gmail.com>
Date: Thu, 30 Nov 2023 11:40:20 -0800
Subject: zswap: make shrinking memcg-aware

Currently, we only have a single global LRU for zswap.  This makes it
impossible to perform worload-specific shrinking - an memcg cannot
determine which pages in the pool it owns, and often ends up writing pages
from other memcgs.  This issue has been previously observed in practice
and mitigated by simply disabling memcg-initiated shrinking:

https://lore.kernel.org/all/20230530232435.3097106-1-nphamcs@gmail.com/T/#u

This patch fully resolves the issue by replacing the global zswap LRU
with memcg- and NUMA-specific LRUs, and modify the reclaim logic:

a) When a store attempt hits an memcg limit, it now triggers a
   synchronous reclaim attempt that, if successful, allows the new
   hotter page to be accepted by zswap.
b) If the store attempt instead hits the global zswap limit, it will
   trigger an asynchronous reclaim attempt, in which an memcg is
   selected for reclaim in a round-robin-like fashion.

[nphamcs@gmail.com: use correct function for the onlineness check, use mem_cgroup_iter_break()]
  Link: https://lkml.kernel.org/r/20231205195419.2563217-1-nphamcs@gmail.com
[nphamcs@gmail.com: drop the pool's reference at the end of the writeback step]
  Link: https://lkml.kernel.org/r/20231206030627.4155634-1-nphamcs@gmail.com
Link: https://lkml.kernel.org/r/20231130194023.4102148-4-nphamcs@gmail.com
Signed-off-by: Domenico Cerasuolo <cerasuolodomenico@gmail.com>
Co-developed-by: Nhat Pham <nphamcs@gmail.com>
Signed-off-by: Nhat Pham <nphamcs@gmail.com>
Tested-by: Bagas Sanjaya <bagasdotme@gmail.com>
Cc: Chris Li <chrisl@kernel.org>
Cc: Dan Streetman <ddstreet@ieee.org>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Muchun Song <muchun.song@linux.dev>
Cc: Roman Gushchin <roman.gushchin@linux.dev>
Cc: Seth Jennings <sjenning@redhat.com>
Cc: Shakeel Butt <shakeelb@google.com>
Cc: Shuah Khan <shuah@kernel.org>
Cc: Vitaly Wool <vitaly.wool@konsulko.com>
Cc: Yosry Ahmed <yosryahmed@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/memcontrol.h |   5 +
 include/linux/zswap.h      |   2 +
 mm/memcontrol.c            |   2 +
 mm/swap.h                  |   3 +-
 mm/swap_state.c            |  24 +++-
 mm/zswap.c                 | 268 ++++++++++++++++++++++++++++++++++++---------
 6 files changed, 245 insertions(+), 59 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 2bd7d14ace78..a308c8eacf20 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -1192,6 +1192,11 @@ static inline struct mem_cgroup *page_memcg_check(struct page *page)
 	return NULL;
 }
 
+static inline struct mem_cgroup *get_mem_cgroup_from_objcg(struct obj_cgroup *objcg)
+{
+	return NULL;
+}
+
 static inline bool folio_memcg_kmem(struct folio *folio)
 {
 	return false;
diff --git a/include/linux/zswap.h b/include/linux/zswap.h
index 2a60ce39cfde..e571e393669b 100644
--- a/include/linux/zswap.h
+++ b/include/linux/zswap.h
@@ -15,6 +15,7 @@ bool zswap_load(struct folio *folio);
 void zswap_invalidate(int type, pgoff_t offset);
 void zswap_swapon(int type);
 void zswap_swapoff(int type);
+void zswap_memcg_offline_cleanup(struct mem_cgroup *memcg);
 
 #else
 
@@ -31,6 +32,7 @@ static inline bool zswap_load(struct folio *folio)
 static inline void zswap_invalidate(int type, pgoff_t offset) {}
 static inline void zswap_swapon(int type) {}
 static inline void zswap_swapoff(int type) {}
+static inline void zswap_memcg_offline_cleanup(struct mem_cgroup *memcg) {}
 
 #endif
 
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 592572d4842e..ce75e504fe8b 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -5614,6 +5614,8 @@ static void mem_cgroup_css_offline(struct cgroup_subsys_state *css)
 	page_counter_set_min(&memcg->memory, 0);
 	page_counter_set_low(&memcg->memory, 0);
 
+	zswap_memcg_offline_cleanup(memcg);
+
 	memcg_offline_kmem(memcg);
 	reparent_shrinker_deferred(memcg);
 	wb_memcg_offline(memcg);
diff --git a/mm/swap.h b/mm/swap.h
index 73c332ee4d91..c0dc73e10e91 100644
--- a/mm/swap.h
+++ b/mm/swap.h
@@ -51,7 +51,8 @@ struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
 				   struct swap_iocb **plug);
 struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
 				     struct mempolicy *mpol, pgoff_t ilx,
-				     bool *new_page_allocated);
+				     bool *new_page_allocated,
+				     bool skip_if_exists);
 struct page *swap_cluster_readahead(swp_entry_t entry, gfp_t flag,
 				    struct mempolicy *mpol, pgoff_t ilx);
 struct page *swapin_readahead(swp_entry_t entry, gfp_t flag,
diff --git a/mm/swap_state.c b/mm/swap_state.c
index 85d9e5806a6a..6c84236382f3 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -412,7 +412,8 @@ struct folio *filemap_get_incore_folio(struct address_space *mapping,
 
 struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
 				     struct mempolicy *mpol, pgoff_t ilx,
-				     bool *new_page_allocated)
+				     bool *new_page_allocated,
+				     bool skip_if_exists)
 {
 	struct swap_info_struct *si;
 	struct folio *folio;
@@ -470,6 +471,17 @@ struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
 		if (err != -EEXIST)
 			goto fail_put_swap;
 
+		/*
+		 * Protect against a recursive call to __read_swap_cache_async()
+		 * on the same entry waiting forever here because SWAP_HAS_CACHE
+		 * is set but the folio is not the swap cache yet. This can
+		 * happen today if mem_cgroup_swapin_charge_folio() below
+		 * triggers reclaim through zswap, which may call
+		 * __read_swap_cache_async() in the writeback path.
+		 */
+		if (skip_if_exists)
+			goto fail_put_swap;
+
 		/*
 		 * We might race against __delete_from_swap_cache(), and
 		 * stumble across a swap_map entry whose SWAP_HAS_CACHE
@@ -537,7 +549,7 @@ struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
 
 	mpol = get_vma_policy(vma, addr, 0, &ilx);
 	page = __read_swap_cache_async(entry, gfp_mask, mpol, ilx,
-					&page_allocated);
+					&page_allocated, false);
 	mpol_cond_put(mpol);
 
 	if (page_allocated)
@@ -654,7 +666,7 @@ struct page *swap_cluster_readahead(swp_entry_t entry, gfp_t gfp_mask,
 		/* Ok, do the async read-ahead now */
 		page = __read_swap_cache_async(
 				swp_entry(swp_type(entry), offset),
-				gfp_mask, mpol, ilx, &page_allocated);
+				gfp_mask, mpol, ilx, &page_allocated, false);
 		if (!page)
 			continue;
 		if (page_allocated) {
@@ -672,7 +684,7 @@ struct page *swap_cluster_readahead(swp_entry_t entry, gfp_t gfp_mask,
 skip:
 	/* The page was likely read above, so no need for plugging here */
 	page = __read_swap_cache_async(entry, gfp_mask, mpol, ilx,
-					&page_allocated);
+					&page_allocated, false);
 	if (unlikely(page_allocated))
 		swap_readpage(page, false, NULL);
 	return page;
@@ -827,7 +839,7 @@ static struct page *swap_vma_readahead(swp_entry_t targ_entry, gfp_t gfp_mask,
 		pte_unmap(pte);
 		pte = NULL;
 		page = __read_swap_cache_async(entry, gfp_mask, mpol, ilx,
-						&page_allocated);
+						&page_allocated, false);
 		if (!page)
 			continue;
 		if (page_allocated) {
@@ -847,7 +859,7 @@ static struct page *swap_vma_readahead(swp_entry_t targ_entry, gfp_t gfp_mask,
 skip:
 	/* The page was likely read above, so no need for plugging here */
 	page = __read_swap_cache_async(targ_entry, gfp_mask, mpol, targ_ilx,
-					&page_allocated);
+					&page_allocated, false);
 	if (unlikely(page_allocated))
 		swap_readpage(page, false, NULL);
 	return page;
diff --git a/mm/zswap.c b/mm/zswap.c
index 699c6ee11222..213626e0f659 100644
--- a/mm/zswap.c
+++ b/mm/zswap.c
@@ -35,6 +35,7 @@
 #include <linux/writeback.h>
 #include <linux/pagemap.h>
 #include <linux/workqueue.h>
+#include <linux/list_lru.h>
 
 #include "swap.h"
 #include "internal.h"
@@ -174,8 +175,8 @@ struct zswap_pool {
 	struct work_struct shrink_work;
 	struct hlist_node node;
 	char tfm_name[CRYPTO_MAX_ALG_NAME];
-	struct list_head lru;
-	spinlock_t lru_lock;
+	struct list_lru list_lru;
+	struct mem_cgroup *next_shrink;
 };
 
 /*
@@ -291,15 +292,46 @@ static void zswap_update_total_size(void)
 	zswap_pool_total_size = total;
 }
 
+/* should be called under RCU */
+#ifdef CONFIG_MEMCG
+static inline struct mem_cgroup *mem_cgroup_from_entry(struct zswap_entry *entry)
+{
+	return entry->objcg ? obj_cgroup_memcg(entry->objcg) : NULL;
+}
+#else
+static inline struct mem_cgroup *mem_cgroup_from_entry(struct zswap_entry *entry)
+{
+	return NULL;
+}
+#endif
+
+static inline int entry_to_nid(struct zswap_entry *entry)
+{
+	return page_to_nid(virt_to_page(entry));
+}
+
+void zswap_memcg_offline_cleanup(struct mem_cgroup *memcg)
+{
+	struct zswap_pool *pool;
+
+	/* lock out zswap pools list modification */
+	spin_lock(&zswap_pools_lock);
+	list_for_each_entry(pool, &zswap_pools, list) {
+		if (pool->next_shrink == memcg)
+			pool->next_shrink = mem_cgroup_iter(NULL, pool->next_shrink, NULL);
+	}
+	spin_unlock(&zswap_pools_lock);
+}
+
 /*********************************
 * zswap entry functions
 **********************************/
 static struct kmem_cache *zswap_entry_cache;
 
-static struct zswap_entry *zswap_entry_cache_alloc(gfp_t gfp)
+static struct zswap_entry *zswap_entry_cache_alloc(gfp_t gfp, int nid)
 {
 	struct zswap_entry *entry;
-	entry = kmem_cache_alloc(zswap_entry_cache, gfp);
+	entry = kmem_cache_alloc_node(zswap_entry_cache, gfp, nid);
 	if (!entry)
 		return NULL;
 	entry->refcount = 1;
@@ -312,6 +344,61 @@ static void zswap_entry_cache_free(struct zswap_entry *entry)
 	kmem_cache_free(zswap_entry_cache, entry);
 }
 
+/*********************************
+* lru functions
+**********************************/
+static void zswap_lru_add(struct list_lru *list_lru, struct zswap_entry *entry)
+{
+	int nid = entry_to_nid(entry);
+	struct mem_cgroup *memcg;
+
+	/*
+	 * Note that it is safe to use rcu_read_lock() here, even in the face of
+	 * concurrent memcg offlining. Thanks to the memcg->kmemcg_id indirection
+	 * used in list_lru lookup, only two scenarios are possible:
+	 *
+	 * 1. list_lru_add() is called before memcg->kmemcg_id is updated. The
+	 *    new entry will be reparented to memcg's parent's list_lru.
+	 * 2. list_lru_add() is called after memcg->kmemcg_id is updated. The
+	 *    new entry will be added directly to memcg's parent's list_lru.
+	 *
+	 * Similar reasoning holds for list_lru_del() and list_lru_putback().
+	 */
+	rcu_read_lock();
+	memcg = mem_cgroup_from_entry(entry);
+	/* will always succeed */
+	list_lru_add(list_lru, &entry->lru, nid, memcg);
+	rcu_read_unlock();
+}
+
+static void zswap_lru_del(struct list_lru *list_lru, struct zswap_entry *entry)
+{
+	int nid = entry_to_nid(entry);
+	struct mem_cgroup *memcg;
+
+	rcu_read_lock();
+	memcg = mem_cgroup_from_entry(entry);
+	/* will always succeed */
+	list_lru_del(list_lru, &entry->lru, nid, memcg);
+	rcu_read_unlock();
+}
+
+static void zswap_lru_putback(struct list_lru *list_lru,
+		struct zswap_entry *entry)
+{
+	int nid = entry_to_nid(entry);
+	spinlock_t *lock = &list_lru->node[nid].lock;
+	struct mem_cgroup *memcg;
+
+	rcu_read_lock();
+	memcg = mem_cgroup_from_entry(entry);
+	spin_lock(lock);
+	/* we cannot use list_lru_add here, because it increments node's lru count */
+	list_lru_putback(list_lru, &entry->lru, nid, memcg);
+	spin_unlock(lock);
+	rcu_read_unlock();
+}
+
 /*********************************
 * rbtree functions
 **********************************/
@@ -396,9 +483,7 @@ static void zswap_free_entry(struct zswap_entry *entry)
 	if (!entry->length)
 		atomic_dec(&zswap_same_filled_pages);
 	else {
-		spin_lock(&entry->pool->lru_lock);
-		list_del(&entry->lru);
-		spin_unlock(&entry->pool->lru_lock);
+		zswap_lru_del(&entry->pool->list_lru, entry);
 		zpool_free(zswap_find_zpool(entry), entry->handle);
 		zswap_pool_put(entry->pool);
 	}
@@ -632,21 +717,15 @@ static void zswap_invalidate_entry(struct zswap_tree *tree,
 		zswap_entry_put(tree, entry);
 }
 
-static int zswap_reclaim_entry(struct zswap_pool *pool)
+static enum lru_status shrink_memcg_cb(struct list_head *item, struct list_lru_one *l,
+				       spinlock_t *lock, void *arg)
 {
-	struct zswap_entry *entry;
+	struct zswap_entry *entry = container_of(item, struct zswap_entry, lru);
 	struct zswap_tree *tree;
 	pgoff_t swpoffset;
-	int ret;
+	enum lru_status ret = LRU_REMOVED_RETRY;
+	int writeback_result;
 
-	/* Get an entry off the LRU */
-	spin_lock(&pool->lru_lock);
-	if (list_empty(&pool->lru)) {
-		spin_unlock(&pool->lru_lock);
-		return -EINVAL;
-	}
-	entry = list_last_entry(&pool->lru, struct zswap_entry, lru);
-	list_del_init(&entry->lru);
 	/*
 	 * Once the lru lock is dropped, the entry might get freed. The
 	 * swpoffset is copied to the stack, and entry isn't deref'd again
@@ -654,28 +733,32 @@ static int zswap_reclaim_entry(struct zswap_pool *pool)
 	 */
 	swpoffset = swp_offset(entry->swpentry);
 	tree = zswap_trees[swp_type(entry->swpentry)];
-	spin_unlock(&pool->lru_lock);
+	list_lru_isolate(l, item);
+	/*
+	 * It's safe to drop the lock here because we return either
+	 * LRU_REMOVED_RETRY or LRU_RETRY.
+	 */
+	spin_unlock(lock);
 
 	/* Check for invalidate() race */
 	spin_lock(&tree->lock);
-	if (entry != zswap_rb_search(&tree->rbroot, swpoffset)) {
-		ret = -EAGAIN;
+	if (entry != zswap_rb_search(&tree->rbroot, swpoffset))
 		goto unlock;
-	}
+
 	/* Hold a reference to prevent a free during writeback */
 	zswap_entry_get(entry);
 	spin_unlock(&tree->lock);
 
-	ret = zswap_writeback_entry(entry, tree);
+	writeback_result = zswap_writeback_entry(entry, tree);
 
 	spin_lock(&tree->lock);
-	if (ret) {
-		/* Writeback failed, put entry back on LRU */
-		spin_lock(&pool->lru_lock);
-		list_move(&entry->lru, &pool->lru);
-		spin_unlock(&pool->lru_lock);
+	if (writeback_result) {
+		zswap_reject_reclaim_fail++;
+		zswap_lru_putback(&entry->pool->list_lru, entry);
+		ret = LRU_RETRY;
 		goto put_unlock;
 	}
+	zswap_written_back_pages++;
 
 	/*
 	 * Writeback started successfully, the page now belongs to the
@@ -689,24 +772,91 @@ put_unlock:
 	zswap_entry_put(tree, entry);
 unlock:
 	spin_unlock(&tree->lock);
-	return ret ? -EAGAIN : 0;
+	spin_lock(lock);
+	return ret;
+}
+
+static int shrink_memcg(struct mem_cgroup *memcg)
+{
+	struct zswap_pool *pool;
+	int nid, shrunk = 0;
+
+	/*
+	 * Skip zombies because their LRUs are reparented and we would be
+	 * reclaiming from the parent instead of the dead memcg.
+	 */
+	if (memcg && !mem_cgroup_online(memcg))
+		return -ENOENT;
+
+	pool = zswap_pool_current_get();
+	if (!pool)
+		return -EINVAL;
+
+	for_each_node_state(nid, N_NORMAL_MEMORY) {
+		unsigned long nr_to_walk = 1;
+
+		shrunk += list_lru_walk_one(&pool->list_lru, nid, memcg,
+					    &shrink_memcg_cb, NULL, &nr_to_walk);
+	}
+	zswap_pool_put(pool);
+	return shrunk ? 0 : -EAGAIN;
 }
 
 static void shrink_worker(struct work_struct *w)
 {
 	struct zswap_pool *pool = container_of(w, typeof(*pool),
 						shrink_work);
+	struct mem_cgroup *memcg;
 	int ret, failures = 0;
 
+	/* global reclaim will select cgroup in a round-robin fashion. */
 	do {
-		ret = zswap_reclaim_entry(pool);
-		if (ret) {
-			zswap_reject_reclaim_fail++;
-			if (ret != -EAGAIN)
+		spin_lock(&zswap_pools_lock);
+		pool->next_shrink = mem_cgroup_iter(NULL, pool->next_shrink, NULL);
+		memcg = pool->next_shrink;
+
+		/*
+		 * We need to retry if we have gone through a full round trip, or if we
+		 * got an offline memcg (or else we risk undoing the effect of the
+		 * zswap memcg offlining cleanup callback). This is not catastrophic
+		 * per se, but it will keep the now offlined memcg hostage for a while.
+		 *
+		 * Note that if we got an online memcg, we will keep the extra
+		 * reference in case the original reference obtained by mem_cgroup_iter
+		 * is dropped by the zswap memcg offlining callback, ensuring that the
+		 * memcg is not killed when we are reclaiming.
+		 */
+		if (!memcg) {
+			spin_unlock(&zswap_pools_lock);
+			if (++failures == MAX_RECLAIM_RETRIES)
 				break;
+
+			goto resched;
+		}
+
+		if (!mem_cgroup_tryget_online(memcg)) {
+			/* drop the reference from mem_cgroup_iter() */
+			mem_cgroup_iter_break(NULL, memcg);
+			pool->next_shrink = NULL;
+			spin_unlock(&zswap_pools_lock);
+
 			if (++failures == MAX_RECLAIM_RETRIES)
 				break;
+
+			goto resched;
 		}
+		spin_unlock(&zswap_pools_lock);
+
+		ret = shrink_memcg(memcg);
+		/* drop the extra reference */
+		mem_cgroup_put(memcg);
+
+		if (ret == -EINVAL)
+			break;
+		if (ret && ++failures == MAX_RECLAIM_RETRIES)
+			break;
+
+resched:
 		cond_resched();
 	} while (!zswap_can_accept());
 	zswap_pool_put(pool);
@@ -767,8 +917,7 @@ static struct zswap_pool *zswap_pool_create(char *type, char *compressor)
 	 */
 	kref_init(&pool->kref);
 	INIT_LIST_HEAD(&pool->list);
-	INIT_LIST_HEAD(&pool->lru);
-	spin_lock_init(&pool->lru_lock);
+	list_lru_init_memcg(&pool->list_lru, NULL);
 	INIT_WORK(&pool->shrink_work, shrink_worker);
 
 	zswap_pool_debug("created", pool);
@@ -834,6 +983,13 @@ static void zswap_pool_destroy(struct zswap_pool *pool)
 
 	cpuhp_state_remove_instance(CPUHP_MM_ZSWP_POOL_PREPARE, &pool->node);
 	free_percpu(pool->acomp_ctx);
+	list_lru_destroy(&pool->list_lru);
+
+	spin_lock(&zswap_pools_lock);
+	mem_cgroup_iter_break(NULL, pool->next_shrink);
+	pool->next_shrink = NULL;
+	spin_unlock(&zswap_pools_lock);
+
 	for (i = 0; i < ZSWAP_NR_ZPOOLS; i++)
 		zpool_destroy_pool(pool->zpools[i]);
 	kfree(pool);
@@ -1081,7 +1237,7 @@ static int zswap_writeback_entry(struct zswap_entry *entry,
 	/* try to allocate swap cache page */
 	mpol = get_task_policy(current);
 	page = __read_swap_cache_async(swpentry, GFP_KERNEL, mpol,
-				NO_INTERLEAVE_INDEX, &page_was_allocated);
+				NO_INTERLEAVE_INDEX, &page_was_allocated, true);
 	if (!page) {
 		ret = -ENOMEM;
 		goto fail;
@@ -1147,7 +1303,6 @@ static int zswap_writeback_entry(struct zswap_entry *entry,
 	/* start writeback */
 	__swap_writepage(page, &wbc);
 	put_page(page);
-	zswap_written_back_pages++;
 
 	return ret;
 
@@ -1204,6 +1359,7 @@ bool zswap_store(struct folio *folio)
 	struct scatterlist input, output;
 	struct crypto_acomp_ctx *acomp_ctx;
 	struct obj_cgroup *objcg = NULL;
+	struct mem_cgroup *memcg = NULL;
 	struct zswap_pool *pool;
 	struct zpool *zpool;
 	unsigned int dlen = PAGE_SIZE;
@@ -1235,15 +1391,15 @@ bool zswap_store(struct folio *folio)
 		zswap_invalidate_entry(tree, dupentry);
 	}
 	spin_unlock(&tree->lock);
-
-	/*
-	 * XXX: zswap reclaim does not work with cgroups yet. Without a
-	 * cgroup-aware entry LRU, we will push out entries system-wide based on
-	 * local cgroup limits.
-	 */
 	objcg = get_obj_cgroup_from_folio(folio);
-	if (objcg && !obj_cgroup_may_zswap(objcg))
-		goto reject;
+	if (objcg && !obj_cgroup_may_zswap(objcg)) {
+		memcg = get_mem_cgroup_from_objcg(objcg);
+		if (shrink_memcg(memcg)) {
+			mem_cgroup_put(memcg);
+			goto reject;
+		}
+		mem_cgroup_put(memcg);
+	}
 
 	/* reclaim space if needed */
 	if (zswap_is_full()) {
@@ -1260,7 +1416,7 @@ bool zswap_store(struct folio *folio)
 	}
 
 	/* allocate entry */
-	entry = zswap_entry_cache_alloc(GFP_KERNEL);
+	entry = zswap_entry_cache_alloc(GFP_KERNEL, page_to_nid(page));
 	if (!entry) {
 		zswap_reject_kmemcache_fail++;
 		goto reject;
@@ -1287,6 +1443,15 @@ bool zswap_store(struct folio *folio)
 	if (!entry->pool)
 		goto freepage;
 
+	if (objcg) {
+		memcg = get_mem_cgroup_from_objcg(objcg);
+		if (memcg_list_lru_alloc(memcg, &entry->pool->list_lru, GFP_KERNEL)) {
+			mem_cgroup_put(memcg);
+			goto put_pool;
+		}
+		mem_cgroup_put(memcg);
+	}
+
 	/* compress */
 	acomp_ctx = raw_cpu_ptr(entry->pool->acomp_ctx);
 
@@ -1365,9 +1530,8 @@ insert_entry:
 		zswap_invalidate_entry(tree, dupentry);
 	}
 	if (entry->length) {
-		spin_lock(&entry->pool->lru_lock);
-		list_add(&entry->lru, &entry->pool->lru);
-		spin_unlock(&entry->pool->lru_lock);
+		INIT_LIST_HEAD(&entry->lru);
+		zswap_lru_add(&entry->pool->list_lru, entry);
 	}
 	spin_unlock(&tree->lock);
 
@@ -1380,6 +1544,7 @@ insert_entry:
 
 put_dstmem:
 	mutex_unlock(acomp_ctx->mutex);
+put_pool:
 	zswap_pool_put(entry->pool);
 freepage:
 	zswap_entry_cache_free(entry);
@@ -1474,9 +1639,8 @@ freeentry:
 		zswap_invalidate_entry(tree, entry);
 		folio_mark_dirty(folio);
 	} else if (entry->length) {
-		spin_lock(&entry->pool->lru_lock);
-		list_move(&entry->lru, &entry->pool->lru);
-		spin_unlock(&entry->pool->lru_lock);
+		zswap_lru_del(&entry->pool->list_lru, entry);
+		zswap_lru_add(&entry->pool->list_lru, entry);
 	}
 	zswap_entry_put(tree, entry);
 	spin_unlock(&tree->lock);
-- 
cgit v1.2.3


From 7108cc3f765cafd48a6a35f8add140beaecfa75b Mon Sep 17 00:00:00 2001
From: Domenico Cerasuolo <cerasuolodomenico@gmail.com>
Date: Thu, 30 Nov 2023 11:40:21 -0800
Subject: mm: memcg: add per-memcg zswap writeback stat

Since zswap now writes back pages from memcg-specific LRUs, we now need a
new stat to show writebacks count for each memcg.

[nphamcs@gmail.com: rename ZSWP_WB to ZSWPWB]
  Link: https://lkml.kernel.org/r/20231205193307.2432803-1-nphamcs@gmail.com
Link: https://lkml.kernel.org/r/20231130194023.4102148-5-nphamcs@gmail.com
Suggested-by: Nhat Pham <nphamcs@gmail.com>
Signed-off-by: Domenico Cerasuolo <cerasuolodomenico@gmail.com>
Signed-off-by: Nhat Pham <nphamcs@gmail.com>
Tested-by: Bagas Sanjaya <bagasdotme@gmail.com>
Reviewed-by: Yosry Ahmed <yosryahmed@google.com>
Cc: Chris Li <chrisl@kernel.org>
Cc: Dan Streetman <ddstreet@ieee.org>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Muchun Song <muchun.song@linux.dev>
Cc: Roman Gushchin <roman.gushchin@linux.dev>
Cc: Seth Jennings <sjenning@redhat.com>
Cc: Shakeel Butt <shakeelb@google.com>
Cc: Shuah Khan <shuah@kernel.org>
Cc: Vitaly Wool <vitaly.wool@konsulko.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/vm_event_item.h | 1 +
 mm/memcontrol.c               | 1 +
 mm/vmstat.c                   | 1 +
 mm/zswap.c                    | 4 ++++
 4 files changed, 7 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/vm_event_item.h b/include/linux/vm_event_item.h
index d1b847502f09..747943bc8cc2 100644
--- a/include/linux/vm_event_item.h
+++ b/include/linux/vm_event_item.h
@@ -142,6 +142,7 @@ enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT,
 #ifdef CONFIG_ZSWAP
 		ZSWPIN,
 		ZSWPOUT,
+		ZSWPWB,
 #endif
 #ifdef CONFIG_X86
 		DIRECT_MAP_LEVEL2_SPLIT,
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index ce75e504fe8b..69b0ad455242 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -703,6 +703,7 @@ static const unsigned int memcg_vm_event_stat[] = {
 #if defined(CONFIG_MEMCG_KMEM) && defined(CONFIG_ZSWAP)
 	ZSWPIN,
 	ZSWPOUT,
+	ZSWPWB,
 #endif
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
 	THP_FAULT_ALLOC,
diff --git a/mm/vmstat.c b/mm/vmstat.c
index afa5a38fcc9c..cfd8d8256f8e 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -1401,6 +1401,7 @@ const char * const vmstat_text[] = {
 #ifdef CONFIG_ZSWAP
 	"zswpin",
 	"zswpout",
+	"zswpwb",
 #endif
 #ifdef CONFIG_X86
 	"direct_map_level2_splits",
diff --git a/mm/zswap.c b/mm/zswap.c
index 213626e0f659..c329fca810c8 100644
--- a/mm/zswap.c
+++ b/mm/zswap.c
@@ -760,6 +760,10 @@ static enum lru_status shrink_memcg_cb(struct list_head *item, struct list_lru_o
 	}
 	zswap_written_back_pages++;
 
+	if (entry->objcg)
+		count_objcg_event(entry->objcg, ZSWPWB);
+
+	count_vm_event(ZSWPWB);
 	/*
 	 * Writeback started successfully, the page now belongs to the
 	 * swapcache. Drop the entry from zswap - unless invalidate already
-- 
cgit v1.2.3


From b5ba474f3f518701249598b35c581b92a3c95b48 Mon Sep 17 00:00:00 2001
From: Nhat Pham <nphamcs@gmail.com>
Date: Thu, 30 Nov 2023 11:40:23 -0800
Subject: zswap: shrink zswap pool based on memory pressure

Currently, we only shrink the zswap pool when the user-defined limit is
hit.  This means that if we set the limit too high, cold data that are
unlikely to be used again will reside in the pool, wasting precious
memory.  It is hard to predict how much zswap space will be needed ahead
of time, as this depends on the workload (specifically, on factors such as
memory access patterns and compressibility of the memory pages).

This patch implements a memcg- and NUMA-aware shrinker for zswap, that is
initiated when there is memory pressure.  The shrinker does not have any
parameter that must be tuned by the user, and can be opted in or out on a
per-memcg basis.

Furthermore, to make it more robust for many workloads and prevent
overshrinking (i.e evicting warm pages that might be refaulted into
memory), we build in the following heuristics:

* Estimate the number of warm pages residing in zswap, and attempt to
  protect this region of the zswap LRU.
* Scale the number of freeable objects by an estimate of the memory
  saving factor. The better zswap compresses the data, the fewer pages
  we will evict to swap (as we will otherwise incur IO for relatively
  small memory saving).
* During reclaim, if the shrinker encounters a page that is also being
  brought into memory, the shrinker will cautiously terminate its
  shrinking action, as this is a sign that it is touching the warmer
  region of the zswap LRU.

As a proof of concept, we ran the following synthetic benchmark: build the
linux kernel in a memory-limited cgroup, and allocate some cold data in
tmpfs to see if the shrinker could write them out and improved the overall
performance.  Depending on the amount of cold data generated, we observe
from 14% to 35% reduction in kernel CPU time used in the kernel builds.

[nphamcs@gmail.com: check shrinker enablement early, use less costly stat flushing]
  Link: https://lkml.kernel.org/r/20231206194456.3234203-1-nphamcs@gmail.com
Link: https://lkml.kernel.org/r/20231130194023.4102148-7-nphamcs@gmail.com
Signed-off-by: Nhat Pham <nphamcs@gmail.com>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Tested-by: Bagas Sanjaya <bagasdotme@gmail.com>
Cc: Chris Li <chrisl@kernel.org>
Cc: Dan Streetman <ddstreet@ieee.org>
Cc: Domenico Cerasuolo <cerasuolodomenico@gmail.com>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Muchun Song <muchun.song@linux.dev>
Cc: Roman Gushchin <roman.gushchin@linux.dev>
Cc: Seth Jennings <sjenning@redhat.com>
Cc: Shakeel Butt <shakeelb@google.com>
Cc: Shuah Khan <shuah@kernel.org>
Cc: Vitaly Wool <vitaly.wool@konsulko.com>
Cc: Yosry Ahmed <yosryahmed@google.com>
Cc: Chengming Zhou <chengming.zhou@linux.dev>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 Documentation/admin-guide/mm/zswap.rst |  10 ++
 include/linux/mmzone.h                 |   2 +
 include/linux/zswap.h                  |  25 ++++-
 mm/Kconfig                             |  14 +++
 mm/mmzone.c                            |   1 +
 mm/swap_state.c                        |   2 +
 mm/zswap.c                             | 192 ++++++++++++++++++++++++++++++++-
 7 files changed, 240 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/admin-guide/mm/zswap.rst b/Documentation/admin-guide/mm/zswap.rst
index 45b98390e938..62fc244ec702 100644
--- a/Documentation/admin-guide/mm/zswap.rst
+++ b/Documentation/admin-guide/mm/zswap.rst
@@ -153,6 +153,16 @@ attribute, e. g.::
 
 Setting this parameter to 100 will disable the hysteresis.
 
+When there is a sizable amount of cold memory residing in the zswap pool, it
+can be advantageous to proactively write these cold pages to swap and reclaim
+the memory for other use cases. By default, the zswap shrinker is disabled.
+User can enable it as follows:
+
+  echo Y > /sys/module/zswap/parameters/shrinker_enabled
+
+This can be enabled at the boot time if ``CONFIG_ZSWAP_SHRINKER_DEFAULT_ON`` is
+selected.
+
 A debugfs interface is provided for various statistic about pool size, number
 of pages stored, same-value filled pages and various counters for the reasons
 pages are rejected.
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 14faffa4354f..9ef9d010bff0 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -22,6 +22,7 @@
 #include <linux/mm_types.h>
 #include <linux/page-flags.h>
 #include <linux/local_lock.h>
+#include <linux/zswap.h>
 #include <asm/page.h>
 
 /* Free memory management - zoned buddy allocator.  */
@@ -641,6 +642,7 @@ struct lruvec {
 #ifdef CONFIG_MEMCG
 	struct pglist_data *pgdat;
 #endif
+	struct zswap_lruvec_state zswap_lruvec_state;
 };
 
 /* Isolate for asynchronous migration */
diff --git a/include/linux/zswap.h b/include/linux/zswap.h
index e571e393669b..08c240e16a01 100644
--- a/include/linux/zswap.h
+++ b/include/linux/zswap.h
@@ -5,20 +5,40 @@
 #include <linux/types.h>
 #include <linux/mm_types.h>
 
+struct lruvec;
+
 extern u64 zswap_pool_total_size;
 extern atomic_t zswap_stored_pages;
 
 #ifdef CONFIG_ZSWAP
 
+struct zswap_lruvec_state {
+	/*
+	 * Number of pages in zswap that should be protected from the shrinker.
+	 * This number is an estimate of the following counts:
+	 *
+	 * a) Recent page faults.
+	 * b) Recent insertion to the zswap LRU. This includes new zswap stores,
+	 *    as well as recent zswap LRU rotations.
+	 *
+	 * These pages are likely to be warm, and might incur IO if the are written
+	 * to swap.
+	 */
+	atomic_long_t nr_zswap_protected;
+};
+
 bool zswap_store(struct folio *folio);
 bool zswap_load(struct folio *folio);
 void zswap_invalidate(int type, pgoff_t offset);
 void zswap_swapon(int type);
 void zswap_swapoff(int type);
 void zswap_memcg_offline_cleanup(struct mem_cgroup *memcg);
-
+void zswap_lruvec_state_init(struct lruvec *lruvec);
+void zswap_page_swapin(struct page *page);
 #else
 
+struct zswap_lruvec_state {};
+
 static inline bool zswap_store(struct folio *folio)
 {
 	return false;
@@ -33,7 +53,8 @@ static inline void zswap_invalidate(int type, pgoff_t offset) {}
 static inline void zswap_swapon(int type) {}
 static inline void zswap_swapoff(int type) {}
 static inline void zswap_memcg_offline_cleanup(struct mem_cgroup *memcg) {}
-
+static inline void zswap_lruvec_state_init(struct lruvec *lruvec) {}
+static inline void zswap_page_swapin(struct page *page) {}
 #endif
 
 #endif /* _LINUX_ZSWAP_H */
diff --git a/mm/Kconfig b/mm/Kconfig
index 57cd378c73d6..ca87cdb72f11 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -61,6 +61,20 @@ config ZSWAP_EXCLUSIVE_LOADS_DEFAULT_ON
 	  The cost is that if the page was never dirtied and needs to be
 	  swapped out again, it will be re-compressed.
 
+config ZSWAP_SHRINKER_DEFAULT_ON
+	bool "Shrink the zswap pool on memory pressure"
+	depends on ZSWAP
+	default n
+	help
+	  If selected, the zswap shrinker will be enabled, and the pages
+	  stored in the zswap pool will become available for reclaim (i.e
+	  written back to the backing swap device) on memory pressure.
+
+	  This means that zswap writeback could happen even if the pool is
+	  not yet full, or the cgroup zswap limit has not been reached,
+	  reducing the chance that cold pages will reside in the zswap pool
+	  and consume memory indefinitely.
+
 choice
 	prompt "Default compressor"
 	depends on ZSWAP
diff --git a/mm/mmzone.c b/mm/mmzone.c
index b594d3f268fe..c01896eca736 100644
--- a/mm/mmzone.c
+++ b/mm/mmzone.c
@@ -78,6 +78,7 @@ void lruvec_init(struct lruvec *lruvec)
 
 	memset(lruvec, 0, sizeof(struct lruvec));
 	spin_lock_init(&lruvec->lru_lock);
+	zswap_lruvec_state_init(lruvec);
 
 	for_each_lru(lru)
 		INIT_LIST_HEAD(&lruvec->lists[lru]);
diff --git a/mm/swap_state.c b/mm/swap_state.c
index 6c84236382f3..c597cec606e4 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -687,6 +687,7 @@ skip:
 					&page_allocated, false);
 	if (unlikely(page_allocated))
 		swap_readpage(page, false, NULL);
+	zswap_page_swapin(page);
 	return page;
 }
 
@@ -862,6 +863,7 @@ skip:
 					&page_allocated, false);
 	if (unlikely(page_allocated))
 		swap_readpage(page, false, NULL);
+	zswap_page_swapin(page);
 	return page;
 }
 
diff --git a/mm/zswap.c b/mm/zswap.c
index c329fca810c8..015425ed9003 100644
--- a/mm/zswap.c
+++ b/mm/zswap.c
@@ -148,6 +148,11 @@ module_param_named(exclusive_loads, zswap_exclusive_loads_enabled, bool, 0644);
 /* Number of zpools in zswap_pool (empirically determined for scalability) */
 #define ZSWAP_NR_ZPOOLS 32
 
+/* Enable/disable memory pressure-based shrinker. */
+static bool zswap_shrinker_enabled = IS_ENABLED(
+		CONFIG_ZSWAP_SHRINKER_DEFAULT_ON);
+module_param_named(shrinker_enabled, zswap_shrinker_enabled, bool, 0644);
+
 /*********************************
 * data structures
 **********************************/
@@ -177,6 +182,8 @@ struct zswap_pool {
 	char tfm_name[CRYPTO_MAX_ALG_NAME];
 	struct list_lru list_lru;
 	struct mem_cgroup *next_shrink;
+	struct shrinker *shrinker;
+	atomic_t nr_stored;
 };
 
 /*
@@ -275,17 +282,26 @@ static bool zswap_can_accept(void)
 			DIV_ROUND_UP(zswap_pool_total_size, PAGE_SIZE);
 }
 
+static u64 get_zswap_pool_size(struct zswap_pool *pool)
+{
+	u64 pool_size = 0;
+	int i;
+
+	for (i = 0; i < ZSWAP_NR_ZPOOLS; i++)
+		pool_size += zpool_get_total_size(pool->zpools[i]);
+
+	return pool_size;
+}
+
 static void zswap_update_total_size(void)
 {
 	struct zswap_pool *pool;
 	u64 total = 0;
-	int i;
 
 	rcu_read_lock();
 
 	list_for_each_entry_rcu(pool, &zswap_pools, list)
-		for (i = 0; i < ZSWAP_NR_ZPOOLS; i++)
-			total += zpool_get_total_size(pool->zpools[i]);
+		total += get_zswap_pool_size(pool);
 
 	rcu_read_unlock();
 
@@ -344,13 +360,34 @@ static void zswap_entry_cache_free(struct zswap_entry *entry)
 	kmem_cache_free(zswap_entry_cache, entry);
 }
 
+/*********************************
+* zswap lruvec functions
+**********************************/
+void zswap_lruvec_state_init(struct lruvec *lruvec)
+{
+	atomic_long_set(&lruvec->zswap_lruvec_state.nr_zswap_protected, 0);
+}
+
+void zswap_page_swapin(struct page *page)
+{
+	struct lruvec *lruvec;
+
+	if (page) {
+		lruvec = folio_lruvec(page_folio(page));
+		atomic_long_inc(&lruvec->zswap_lruvec_state.nr_zswap_protected);
+	}
+}
+
 /*********************************
 * lru functions
 **********************************/
 static void zswap_lru_add(struct list_lru *list_lru, struct zswap_entry *entry)
 {
+	atomic_long_t *nr_zswap_protected;
+	unsigned long lru_size, old, new;
 	int nid = entry_to_nid(entry);
 	struct mem_cgroup *memcg;
+	struct lruvec *lruvec;
 
 	/*
 	 * Note that it is safe to use rcu_read_lock() here, even in the face of
@@ -368,6 +405,19 @@ static void zswap_lru_add(struct list_lru *list_lru, struct zswap_entry *entry)
 	memcg = mem_cgroup_from_entry(entry);
 	/* will always succeed */
 	list_lru_add(list_lru, &entry->lru, nid, memcg);
+
+	/* Update the protection area */
+	lru_size = list_lru_count_one(list_lru, nid, memcg);
+	lruvec = mem_cgroup_lruvec(memcg, NODE_DATA(nid));
+	nr_zswap_protected = &lruvec->zswap_lruvec_state.nr_zswap_protected;
+	old = atomic_long_inc_return(nr_zswap_protected);
+	/*
+	 * Decay to avoid overflow and adapt to changing workloads.
+	 * This is based on LRU reclaim cost decaying heuristics.
+	 */
+	do {
+		new = old > lru_size / 4 ? old / 2 : old;
+	} while (!atomic_long_try_cmpxchg(nr_zswap_protected, &old, new));
 	rcu_read_unlock();
 }
 
@@ -389,6 +439,7 @@ static void zswap_lru_putback(struct list_lru *list_lru,
 	int nid = entry_to_nid(entry);
 	spinlock_t *lock = &list_lru->node[nid].lock;
 	struct mem_cgroup *memcg;
+	struct lruvec *lruvec;
 
 	rcu_read_lock();
 	memcg = mem_cgroup_from_entry(entry);
@@ -396,6 +447,10 @@ static void zswap_lru_putback(struct list_lru *list_lru,
 	/* we cannot use list_lru_add here, because it increments node's lru count */
 	list_lru_putback(list_lru, &entry->lru, nid, memcg);
 	spin_unlock(lock);
+
+	lruvec = mem_cgroup_lruvec(memcg, NODE_DATA(entry_to_nid(entry)));
+	/* increment the protection area to account for the LRU rotation. */
+	atomic_long_inc(&lruvec->zswap_lruvec_state.nr_zswap_protected);
 	rcu_read_unlock();
 }
 
@@ -485,6 +540,7 @@ static void zswap_free_entry(struct zswap_entry *entry)
 	else {
 		zswap_lru_del(&entry->pool->list_lru, entry);
 		zpool_free(zswap_find_zpool(entry), entry->handle);
+		atomic_dec(&entry->pool->nr_stored);
 		zswap_pool_put(entry->pool);
 	}
 	zswap_entry_cache_free(entry);
@@ -526,6 +582,109 @@ static struct zswap_entry *zswap_entry_find_get(struct rb_root *root,
 	return entry;
 }
 
+/*********************************
+* shrinker functions
+**********************************/
+static enum lru_status shrink_memcg_cb(struct list_head *item, struct list_lru_one *l,
+				       spinlock_t *lock, void *arg);
+
+static unsigned long zswap_shrinker_scan(struct shrinker *shrinker,
+		struct shrink_control *sc)
+{
+	struct lruvec *lruvec = mem_cgroup_lruvec(sc->memcg, NODE_DATA(sc->nid));
+	unsigned long shrink_ret, nr_protected, lru_size;
+	struct zswap_pool *pool = shrinker->private_data;
+	bool encountered_page_in_swapcache = false;
+
+	if (!zswap_shrinker_enabled) {
+		sc->nr_scanned = 0;
+		return SHRINK_STOP;
+	}
+
+	nr_protected =
+		atomic_long_read(&lruvec->zswap_lruvec_state.nr_zswap_protected);
+	lru_size = list_lru_shrink_count(&pool->list_lru, sc);
+
+	/*
+	 * Abort if we are shrinking into the protected region.
+	 *
+	 * This short-circuiting is necessary because if we have too many multiple
+	 * concurrent reclaimers getting the freeable zswap object counts at the
+	 * same time (before any of them made reasonable progress), the total
+	 * number of reclaimed objects might be more than the number of unprotected
+	 * objects (i.e the reclaimers will reclaim into the protected area of the
+	 * zswap LRU).
+	 */
+	if (nr_protected >= lru_size - sc->nr_to_scan) {
+		sc->nr_scanned = 0;
+		return SHRINK_STOP;
+	}
+
+	shrink_ret = list_lru_shrink_walk(&pool->list_lru, sc, &shrink_memcg_cb,
+		&encountered_page_in_swapcache);
+
+	if (encountered_page_in_swapcache)
+		return SHRINK_STOP;
+
+	return shrink_ret ? shrink_ret : SHRINK_STOP;
+}
+
+static unsigned long zswap_shrinker_count(struct shrinker *shrinker,
+		struct shrink_control *sc)
+{
+	struct zswap_pool *pool = shrinker->private_data;
+	struct mem_cgroup *memcg = sc->memcg;
+	struct lruvec *lruvec = mem_cgroup_lruvec(memcg, NODE_DATA(sc->nid));
+	unsigned long nr_backing, nr_stored, nr_freeable, nr_protected;
+
+	if (!zswap_shrinker_enabled)
+		return 0;
+
+#ifdef CONFIG_MEMCG_KMEM
+	mem_cgroup_flush_stats();
+	nr_backing = memcg_page_state(memcg, MEMCG_ZSWAP_B) >> PAGE_SHIFT;
+	nr_stored = memcg_page_state(memcg, MEMCG_ZSWAPPED);
+#else
+	/* use pool stats instead of memcg stats */
+	nr_backing = get_zswap_pool_size(pool) >> PAGE_SHIFT;
+	nr_stored = atomic_read(&pool->nr_stored);
+#endif
+
+	if (!nr_stored)
+		return 0;
+
+	nr_protected =
+		atomic_long_read(&lruvec->zswap_lruvec_state.nr_zswap_protected);
+	nr_freeable = list_lru_shrink_count(&pool->list_lru, sc);
+	/*
+	 * Subtract the lru size by an estimate of the number of pages
+	 * that should be protected.
+	 */
+	nr_freeable = nr_freeable > nr_protected ? nr_freeable - nr_protected : 0;
+
+	/*
+	 * Scale the number of freeable pages by the memory saving factor.
+	 * This ensures that the better zswap compresses memory, the fewer
+	 * pages we will evict to swap (as it will otherwise incur IO for
+	 * relatively small memory saving).
+	 */
+	return mult_frac(nr_freeable, nr_backing, nr_stored);
+}
+
+static void zswap_alloc_shrinker(struct zswap_pool *pool)
+{
+	pool->shrinker =
+		shrinker_alloc(SHRINKER_NUMA_AWARE | SHRINKER_MEMCG_AWARE, "mm-zswap");
+	if (!pool->shrinker)
+		return;
+
+	pool->shrinker->private_data = pool;
+	pool->shrinker->scan_objects = zswap_shrinker_scan;
+	pool->shrinker->count_objects = zswap_shrinker_count;
+	pool->shrinker->batch = 0;
+	pool->shrinker->seeks = DEFAULT_SEEKS;
+}
+
 /*********************************
 * per-cpu code
 **********************************/
@@ -721,6 +880,7 @@ static enum lru_status shrink_memcg_cb(struct list_head *item, struct list_lru_o
 				       spinlock_t *lock, void *arg)
 {
 	struct zswap_entry *entry = container_of(item, struct zswap_entry, lru);
+	bool *encountered_page_in_swapcache = (bool *)arg;
 	struct zswap_tree *tree;
 	pgoff_t swpoffset;
 	enum lru_status ret = LRU_REMOVED_RETRY;
@@ -756,6 +916,17 @@ static enum lru_status shrink_memcg_cb(struct list_head *item, struct list_lru_o
 		zswap_reject_reclaim_fail++;
 		zswap_lru_putback(&entry->pool->list_lru, entry);
 		ret = LRU_RETRY;
+
+		/*
+		 * Encountering a page already in swap cache is a sign that we are shrinking
+		 * into the warmer region. We should terminate shrinking (if we're in the dynamic
+		 * shrinker context).
+		 */
+		if (writeback_result == -EEXIST && encountered_page_in_swapcache) {
+			ret = LRU_SKIP;
+			*encountered_page_in_swapcache = true;
+		}
+
 		goto put_unlock;
 	}
 	zswap_written_back_pages++;
@@ -914,6 +1085,11 @@ static struct zswap_pool *zswap_pool_create(char *type, char *compressor)
 				       &pool->node);
 	if (ret)
 		goto error;
+
+	zswap_alloc_shrinker(pool);
+	if (!pool->shrinker)
+		goto error;
+
 	pr_debug("using %s compressor\n", pool->tfm_name);
 
 	/* being the current pool takes 1 ref; this func expects the
@@ -921,13 +1097,19 @@ static struct zswap_pool *zswap_pool_create(char *type, char *compressor)
 	 */
 	kref_init(&pool->kref);
 	INIT_LIST_HEAD(&pool->list);
-	list_lru_init_memcg(&pool->list_lru, NULL);
+	if (list_lru_init_memcg(&pool->list_lru, pool->shrinker))
+		goto lru_fail;
+	shrinker_register(pool->shrinker);
 	INIT_WORK(&pool->shrink_work, shrink_worker);
+	atomic_set(&pool->nr_stored, 0);
 
 	zswap_pool_debug("created", pool);
 
 	return pool;
 
+lru_fail:
+	list_lru_destroy(&pool->list_lru);
+	shrinker_free(pool->shrinker);
 error:
 	if (pool->acomp_ctx)
 		free_percpu(pool->acomp_ctx);
@@ -985,6 +1167,7 @@ static void zswap_pool_destroy(struct zswap_pool *pool)
 
 	zswap_pool_debug("destroying", pool);
 
+	shrinker_free(pool->shrinker);
 	cpuhp_state_remove_instance(CPUHP_MM_ZSWP_POOL_PREPARE, &pool->node);
 	free_percpu(pool->acomp_ctx);
 	list_lru_destroy(&pool->list_lru);
@@ -1536,6 +1719,7 @@ insert_entry:
 	if (entry->length) {
 		INIT_LIST_HEAD(&entry->lru);
 		zswap_lru_add(&entry->pool->list_lru, entry);
+		atomic_inc(&entry->pool->nr_stored);
 	}
 	spin_unlock(&tree->lock);
 
-- 
cgit v1.2.3


From 9294a037c01564786abb15436529fae3863268a2 Mon Sep 17 00:00:00 2001
From: SeongJae Park <sj@kernel.org>
Date: Thu, 30 Nov 2023 02:36:44 +0000
Subject: mm/damon/core: implement goal-oriented feedback-driven quota
 auto-tuning

Patch series "mm/damon: let users feed and tame/auto-tune DAMOS".

Introduce Aim-oriented Feedback-driven DAMOS Aggressiveness Auto-tuning.
It makes DAMOS self-tuned with periodic simple user feedback.

Background: DAMOS Control Difficulty
====================================

DAMOS helps users easily implement access pattern aware system operations.
However, controlling DAMOS in the wild is not that easy.

The basic way for DAMOS control is specifying the target access pattern.
In this approach, the user is assumed to well understand the access
pattern and the characteristics of the system and the workloads.  Though
there are useful tools for that, it takes time and effort depending on the
complexity and the dynamicity of the system and the workloads.  After all,
the access pattern consists of three ranges, namely the size, the access
rate, and the age of the regions.  It means users need to tune six
parameters, which is anyway not a simple task.

One of the worst cases would be DAMOS being too aggressive like a
berserker, and therefore consuming too much system resource and making
unwanted radical system operations.  To let users avoid such cases, DAMOS
allows users to set the upper-limit of the schemes' aggressiveness, namely
DAMOS quota.  DAMOS further provides its best-effort under the limit by
prioritizing regions based on the access pattern of the regions.  For
example, users can ask DAMOS to page out up to 100 MiB of memory regions
per second.  Then DAMOS pages out regions that are not accessed for a
longer time (colder) first under the limit.  This allows users to set the
target access pattern a bit naive with wider ranges, and focus on tuning
only one parameter, the quota.  In other words, the number of parameters
to tune can be reduced from six to one.

Still, however, the optimum value for the quota depends on the system and
the workloads' characteristics, so not that simple.  The number of
parameters to tune can also increase again if the user needs to run
multiple schemes.

Aim-oriented Feedback-driven DAMOS Aggressiveness Auto Tuning
=============================================================

Users would use DAMOS since they want to achieve something with it.  They
will likely have measurable metrics representing the achievement and the
target number of the metric like SLO, and continuously measure that
anyway.  While the additional cost of getting the information is nearly
zero, it could be useful for DAMOS to understand how appropriate its
current aggressiveness is set, and adjust it on its own to make the metric
value more close to the target.

Based on this idea, we introduce a new way of tuning DAMOS with nearly
zero additional effort, namely Aim-oriented Feedback-driven DAMOS
Aggressiveness Auto Tuning.  It asks users to provide feedback
representing how well DAMOS is doing relative to the users' aim.  Then
DAMOS adjusts its aggressiveness, specifically the quota that provides
the best effort result under the limit, based on the current level of
the aggressiveness and the users' feedback.

Implementation
==============

The implementation asks users to represent the feedback with score
numbers.  The scores could be anything including user-space specific
metrics including latency and throughput of special user-space workloads,
and system metrics including free memory ratio, memory pressure stall time
(PSI), and active to inactive LRU lists size ratio.  The feedback scores
and the aggressiveness of the given DAMOS scheme are assumed to be
positively proportional, though.  Selecting metrics of the assumption is
the users' responsibility.

The core logic uses the below simple feedback loop algorithm to calculate
the next aggressiveness level of the scheme from the current
aggressiveness level and the current feedback (target_score and
current_score).  It calculates the compensation for next aggressiveness as
a proportion of current aggressiveness and distance to the target score.
As a result, it arrives at the near-goal state in a short time using big
steps when it's far from the goal, but avoids making unnecessarily radical
changes that could turn out to be a bad decision using small steps when
its near to the goal.

    f(n) = max(1, f(n - 1) * ((target_score - current_score) / target_score + 1))

Note that the compensation value becomes negative when it's over
achieving the goal.  That's why the feedback metric and the
aggressiveness of the scheme should be positively proportional.  The
distance-adaptive speed manipulation is simply applied.

Example Use Cases
=================

If users want to reduce the memory footprint of the system as much as
possible as long as the time spent for handling the resulting memory
pressure is within a threshold, they could use DAMOS scheme that reclaims
cold memory regions aiming for a little level of memory pressure stall
time.

If users want the active/inactive LRU lists well balanced to reduce the
performance impact due to possible future memory pressure, they could use
two schemes.  The first one would be set to locate hot pages in the active
LRU list, aiming for a specific active-to-inactive LRU list size ratio,
say, 70%.  The second one would be to locate cold pages in the inactive
LRU list, aiming for a specific inactive-to-active LRU list size ratio,
say, 30%.  Then, DAMOS will balance the two schemes based on the goal and
feedback.

This aim-oriented auto tuning could also be useful for general
balancing-required access aware system operations such as system memory
auto scaling[3] and tiered memory management[4].  These two example usages
are not what current DAMOS implementation is already supporting, but
require additional DAMOS action developments, though.

Evaluation: subtle memory pressure aiming proactive reclamation
===============================================================

To show if the implementation works as expected, we prepare four different
system configurations on AWS i3.metal instances.  The first setup
(original) runs the workload without any DAMOS scheme.  The second setup
(not-tuned) runs the workload with a virtual address space-based proactive
reclamation scheme that pages out memory regions that are not accessed for
five seconds or more.  The third setup (offline-tuned) runs the same
proactive reclamation DAMOS scheme, but after making it tuned for each
workload offline, using our previous user-space driven automatic tuning
approach, namely DAMOOS[1].  The fourth and final setup (AFDAA) runs the
scheme that is the same as that of 'not-tuned' setup, but aims to keep
0.5% of 'some' memory pressure stall time (PSI) for the last 10 seconds
using the aiming-oriented auto tuning.

For each setup, we run realistic workloads from PARSEC3 and SPLASH-2X
benchmark suites.  For each run, we measure RSS and runtime of the
workload, and 'some' memory pressure stall time (PSI) of the system.  We
repeat the runs five times and use averaged measurements.

For simple comparison of the results, we normalize the measurements to
those of 'original'.  In the case of the PSI, though, the measurement for
'original' was zero, so we normalize the value to that of 'not-tuned'
scheme's result.  The normalized results are shown below.

            Not-tuned         Offline-tuned     AFDAA
    RSS     0.622688178226118 0.787950678944904 0.740093483278979
    runtime 1.11767826657912  1.0564674983585   1.0910833880499
    PSI     1                 0.727521443794069 0.308498846350299

The 'not-tuned' scheme achieves about 38.7% memory saving but incur about
11.7% runtime slowdown.  The 'offline-tuned' scheme achieves about 22.2%
memory saving with about 5.5% runtime slowdown.  It also achieves about
28.2% memory pressure stall time saving.  AFDAA achieves about 26% memory
saving with about 9.1% runtime slowdown.  It also achieves about 69.1%
memory pressure stall time saving.  We repeat this test multiple times,
and get consistent results.  AFDAA is now integrated in our daily DAMON
performance test setup.

Apparently the aggressiveness of 'AFDAA' setup is somewhere between those
of 'not-tuned' and 'offline-tuned' setup, since its memory saving and
runtime overhead are between those of the other two setups.  Actually we
set the memory pressure stall time goal aiming for this middle
aggressiveness.  The difference in the two metrics are not significant,
though.  However, it shows significant saving of the memory pressure stall
time, which was the goal of the auto-tuning, over the two variants.
Hence, we conclude the automatic tuning is working as expected.

Please note that the AFDAA setup is only for the evaluation, and
therefore intentionally set a bit aggressive.  It might not be
appropriate for production environments.

The test code is also available[2], so you could reproduce it on your
system and workloads.

Patches Sequence
================

The first four patches implement the core logic and user interfaces for
the auto tuning.  The first patch implements the core logic for the auto
tuning, and the API for DAMOS users in the kernel space.  The second
patch implements basic file operations of DAMON sysfs directories and
files that will be used for setting the goals and providing the
feedback.  The third patch connects the quota goals files inputs to the
DAMOS core logic.  Finally the fourth patch implements a dedicated DAMOS
sysfs command for efficiently committing the quota goals feedback.

Two patches for simple tests of the logic and interfaces follow.  The
fifth patch implements the core logic unit test.  The sixth patch
implements a selftest for the DAMON Sysfs interface for the goals.

Finally, three patches for documentation follows.  The seventh patch
documents the design of the feature.  The eighth patch updates the API
doc for the new sysfs files.  The final eighth patch updates the usage
document for the features.

References
==========

[1] DAOS paper:
    https://www.amazon.science/publications/daos-data-access-aware-operating-system
[2] Evaluation code:
    https://github.com/damonitor/damon-tests/commit/3f884e61193f0166b8724554b6d06b0c449a712d
[3] Memory auto scaling RFC idea:
    https://lore.kernel.org/damon/20231112195114.61474-1-sj@kernel.org/
[4] DAMON-based tiered memory management RFC idea:
    https://lore.kernel.org/damon/20231112195602.61525-1-sj@kernel.org/


This patch (of 9)

Users can effectively control the upper-limit aggressiveness of DAMOS
schemes using the quota feature.  The quota provides best result under the
limit by prioritizing regions based on the access pattern.  That said,
finding the best value, which could depend on dynamic characteristics of
the system and the workloads, is still challenging.

Implement a simple feedback-driven tuning mechanism and use it for
automatic tuning of DAMOS quota.  The implementation allows users to
provide the feedback by setting a feedback score returning callback
function.  Then DAMOS periodically calls the function back and adjusts the
quota based on the return value of the callback and current quota value.

Note that the absolute-value based time/size quotas still work as the
maximum hard limits of the scheme's aggressiveness.  The feedback-driven
auto-tuned quota is applied only if it is not exceeding the manually set
maximum limits.  Same for the scheme-target access pattern and filters
like other features.

[sj@kernel.org: document get_score_arg field of struct damos_quota]
  Link: https://lkml.kernel.org/r/20231204170106.60992-1-sj@kernel.org
Link: https://lkml.kernel.org/r/20231130023652.50284-1-sj@kernel.org
Link: https://lkml.kernel.org/r/20231130023652.50284-2-sj@kernel.org
Signed-off-by: SeongJae Park <sj@kernel.org>
Cc: Brendan Higgins <brendanhiggins@google.com>
Cc: David Gow <davidgow@google.com>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Shuah Khan <shuah@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/damon.h | 20 +++++++++++++++
 mm/damon/core.c       | 68 ++++++++++++++++++++++++++++++++++++++++++++-------
 2 files changed, 79 insertions(+), 9 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/damon.h b/include/linux/damon.h
index ab2f17d9926b..aa34ab433bc5 100644
--- a/include/linux/damon.h
+++ b/include/linux/damon.h
@@ -136,6 +136,9 @@ enum damos_action {
  * @weight_nr_accesses:	Weight of the region's nr_accesses for prioritization.
  * @weight_age:		Weight of the region's age for prioritization.
  *
+ * @get_score:		Feedback function for self-tuning quota.
+ * @get_score_arg:	Parameter for @get_score
+ *
  * To avoid consuming too much CPU time or IO resources for applying the
  * &struct damos->action to large memory, DAMON allows users to set time and/or
  * size quotas.  The quotas can be set by writing non-zero values to &ms and
@@ -153,6 +156,17 @@ enum damos_action {
  * You could customize the prioritization logic by setting &weight_sz,
  * &weight_nr_accesses, and &weight_age, because monitoring operations are
  * encouraged to respect those.
+ *
+ * If @get_score function pointer is set, DAMON calls it back with
+ * @get_score_arg and get the return value of it for every @reset_interval.
+ * Then, DAMON adjusts the effective quota using the return value as a feedback
+ * score to the current quota, using its internal feedback loop algorithm.
+ *
+ * The feedback loop algorithem assumes the quota input and the feedback score
+ * output are in a positive proportional relationship, and the goal of the
+ * tuning is getting the feedback screo value of 10,000.  If @ms and/or @sz are
+ * set together, those work as a hard limit quota.  If neither @ms nor @sz are
+ * set, the mechanism starts from the quota of one byte.
  */
 struct damos_quota {
 	unsigned long ms;
@@ -163,6 +177,9 @@ struct damos_quota {
 	unsigned int weight_nr_accesses;
 	unsigned int weight_age;
 
+	unsigned long (*get_score)(void *arg);
+	void *get_score_arg;
+
 /* private: */
 	/* For throughput estimation */
 	unsigned long total_charged_sz;
@@ -179,6 +196,9 @@ struct damos_quota {
 	/* For prioritization */
 	unsigned long histogram[DAMOS_MAX_SCORE + 1];
 	unsigned int min_score;
+
+	/* For feedback loop */
+	unsigned long esz_bp;
 };
 
 /**
diff --git a/mm/damon/core.c b/mm/damon/core.c
index ce1562783e7e..f91715a58dc7 100644
--- a/mm/damon/core.c
+++ b/mm/damon/core.c
@@ -1038,26 +1038,76 @@ static void damon_do_apply_schemes(struct damon_ctx *c,
 	}
 }
 
-/* Shouldn't be called if quota->ms and quota->sz are zero */
+/*
+ * damon_feed_loop_next_input() - get next input to achieve a target score.
+ * @last_input	The last input.
+ * @score	Current score that made with @last_input.
+ *
+ * Calculate next input to achieve the target score, based on the last input
+ * and current score.  Assuming the input and the score are positively
+ * proportional, calculate how much compensation should be added to or
+ * subtracted from the last input as a proportion of the last input.  Avoid
+ * next input always being zero by setting it non-zero always.  In short form
+ * (assuming support of float and signed calculations), the algorithm is as
+ * below.
+ *
+ * next_input = max(last_input * ((goal - current) / goal + 1), 1)
+ *
+ * For simple implementation, we assume the target score is always 10,000.  The
+ * caller should adjust @score for this.
+ *
+ * Returns next input that assumed to achieve the target score.
+ */
+static unsigned long damon_feed_loop_next_input(unsigned long last_input,
+		unsigned long score)
+{
+	const unsigned long goal = 10000;
+	unsigned long score_goal_diff = max(goal, score) - min(goal, score);
+	unsigned long score_goal_diff_bp = score_goal_diff * 10000 / goal;
+	unsigned long compensation = last_input * score_goal_diff_bp / 10000;
+	/* Set minimum input as 10000 to avoid compensation be zero */
+	const unsigned long min_input = 10000;
+
+	if (goal > score)
+		return last_input + compensation;
+	if (last_input > compensation + min_input)
+		return last_input - compensation;
+	return min_input;
+}
+
+/* Shouldn't be called if quota->ms, quota->sz, and quota->get_score unset */
 static void damos_set_effective_quota(struct damos_quota *quota)
 {
 	unsigned long throughput;
 	unsigned long esz;
 
-	if (!quota->ms) {
+	if (!quota->ms && !quota->get_score) {
 		quota->esz = quota->sz;
 		return;
 	}
 
-	if (quota->total_charged_ns)
-		throughput = quota->total_charged_sz * 1000000 /
-			quota->total_charged_ns;
-	else
-		throughput = PAGE_SIZE * 1024;
-	esz = throughput * quota->ms;
+	if (quota->get_score) {
+		quota->esz_bp = damon_feed_loop_next_input(
+				max(quota->esz_bp, 10000UL),
+				quota->get_score(quota->get_score_arg));
+		esz = quota->esz_bp / 10000;
+	}
+
+	if (quota->ms) {
+		if (quota->total_charged_ns)
+			throughput = quota->total_charged_sz * 1000000 /
+				quota->total_charged_ns;
+		else
+			throughput = PAGE_SIZE * 1024;
+		if (quota->get_score)
+			esz = min(throughput * quota->ms, esz);
+		else
+			esz = throughput * quota->ms;
+	}
 
 	if (quota->sz && quota->sz < esz)
 		esz = quota->sz;
+
 	quota->esz = esz;
 }
 
@@ -1069,7 +1119,7 @@ static void damos_adjust_quota(struct damon_ctx *c, struct damos *s)
 	unsigned long cumulated_sz;
 	unsigned int score, max_score = 0;
 
-	if (!quota->ms && !quota->sz)
+	if (!quota->ms && !quota->sz && !quota->get_score)
 		return;
 
 	/* New charge window starts */
-- 
cgit v1.2.3


From 1486fb50136f4799946f5ecfe050094574647153 Mon Sep 17 00:00:00 2001
From: Kefeng Wang <wangkefeng.wang@huawei.com>
Date: Sat, 18 Nov 2023 10:32:28 +0800
Subject: mm: ksm: use more folio api in ksm_might_need_to_copy()

Patch series "mm: cleanup and use more folio in page fault", v3.

Rename page_copy_prealloc() to folio_prealloc(), which is used by more
functions, also do more folio conversion in page fault.


This patch (of 5):

Since ksm only support normal page, no swapout/in for ksm large folio too,
add large folio check in ksm_might_need_to_copy(), also convert
page->index to folio->index as page->index is going away.

Then convert ksm_might_need_to_copy() to use more folio api to save nine
compound_head() calls, short 'address' to reduce max-line-length.

Link: https://lkml.kernel.org/r/20231118023232.1409103-1-wangkefeng.wang@huawei.com
Link: https://lkml.kernel.org/r/20231118023232.1409103-2-wangkefeng.wang@huawei.com
Signed-off-by: Kefeng Wang <wangkefeng.wang@huawei.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Sidhartha Kumar <sidhartha.kumar@oracle.com>
Cc: Vishal Moola (Oracle) <vishal.moola@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/ksm.h |  4 ++--
 mm/ksm.c            | 39 +++++++++++++++++++++------------------
 2 files changed, 23 insertions(+), 20 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/ksm.h b/include/linux/ksm.h
index c2dd786a30e1..4643d5244e77 100644
--- a/include/linux/ksm.h
+++ b/include/linux/ksm.h
@@ -77,7 +77,7 @@ static inline void ksm_exit(struct mm_struct *mm)
  * but what if the vma was unmerged while the page was swapped out?
  */
 struct page *ksm_might_need_to_copy(struct page *page,
-			struct vm_area_struct *vma, unsigned long address);
+			struct vm_area_struct *vma, unsigned long addr);
 
 void rmap_walk_ksm(struct folio *folio, struct rmap_walk_control *rwc);
 void folio_migrate_ksm(struct folio *newfolio, struct folio *folio);
@@ -130,7 +130,7 @@ static inline int ksm_madvise(struct vm_area_struct *vma, unsigned long start,
 }
 
 static inline struct page *ksm_might_need_to_copy(struct page *page,
-			struct vm_area_struct *vma, unsigned long address)
+			struct vm_area_struct *vma, unsigned long addr)
 {
 	return page;
 }
diff --git a/mm/ksm.c b/mm/ksm.c
index 5d60d5385de6..b93389a3780e 100644
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -2876,48 +2876,51 @@ void __ksm_exit(struct mm_struct *mm)
 }
 
 struct page *ksm_might_need_to_copy(struct page *page,
-			struct vm_area_struct *vma, unsigned long address)
+			struct vm_area_struct *vma, unsigned long addr)
 {
 	struct folio *folio = page_folio(page);
 	struct anon_vma *anon_vma = folio_anon_vma(folio);
-	struct page *new_page;
+	struct folio *new_folio;
 
-	if (PageKsm(page)) {
-		if (page_stable_node(page) &&
+	if (folio_test_large(folio))
+		return page;
+
+	if (folio_test_ksm(folio)) {
+		if (folio_stable_node(folio) &&
 		    !(ksm_run & KSM_RUN_UNMERGE))
 			return page;	/* no need to copy it */
 	} else if (!anon_vma) {
 		return page;		/* no need to copy it */
-	} else if (page->index == linear_page_index(vma, address) &&
+	} else if (folio->index == linear_page_index(vma, addr) &&
 			anon_vma->root == vma->anon_vma->root) {
 		return page;		/* still no need to copy it */
 	}
 	if (PageHWPoison(page))
 		return ERR_PTR(-EHWPOISON);
-	if (!PageUptodate(page))
+	if (!folio_test_uptodate(folio))
 		return page;		/* let do_swap_page report the error */
 
-	new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);
-	if (new_page &&
-	    mem_cgroup_charge(page_folio(new_page), vma->vm_mm, GFP_KERNEL)) {
-		put_page(new_page);
-		new_page = NULL;
+	new_folio = vma_alloc_folio(GFP_HIGHUSER_MOVABLE, 0, vma, addr, false);
+	if (new_folio &&
+	    mem_cgroup_charge(new_folio, vma->vm_mm, GFP_KERNEL)) {
+		folio_put(new_folio);
+		new_folio = NULL;
 	}
-	if (new_page) {
-		if (copy_mc_user_highpage(new_page, page, address, vma)) {
-			put_page(new_page);
+	if (new_folio) {
+		if (copy_mc_user_highpage(&new_folio->page, page, addr, vma)) {
+			folio_put(new_folio);
 			memory_failure_queue(page_to_pfn(page), 0);
 			return ERR_PTR(-EHWPOISON);
 		}
-		SetPageDirty(new_page);
-		__SetPageUptodate(new_page);
-		__SetPageLocked(new_page);
+		folio_set_dirty(new_folio);
+		__folio_mark_uptodate(new_folio);
+		__folio_set_locked(new_folio);
 #ifdef CONFIG_SWAP
 		count_vm_event(KSM_SWPIN_COPY);
 #endif
 	}
 
-	return new_page;
+	return new_folio ? &new_folio->page : NULL;
 }
 
 void rmap_walk_ksm(struct folio *folio, struct rmap_walk_control *rwc)
-- 
cgit v1.2.3


From f67f8d4a8c1e1ebc85a6cbdb9a7266f14863461c Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (Google)" <rostedt@goodmis.org>
Date: Fri, 1 Dec 2023 14:59:36 -0500
Subject: mm/rmap: fix misplaced parenthesis of a likely()

Running my yearly branch profiler to see where likely/unlikely annotation
may be added or removed, I discovered this:

correct incorrect  %        Function                  File              Line
 ------- ---------  -        --------                  ----              ----
       0   457918 100 page_try_dup_anon_rmap         rmap.h               264
[..]
  458021        0   0 page_try_dup_anon_rmap         rmap.h               265

I thought it was interesting that line 264 of rmap.h had a 100% incorrect
annotation, but the line directly below it was 100% correct. Looking at the
code:

	if (likely(!is_device_private_page(page) &&
	    unlikely(page_needs_cow_for_dma(vma, page))))

It didn't make sense. The "likely()" was around the entire if statement
(not just the "!is_device_private_page(page)"), which also included the
"unlikely()" portion of that if condition.

If the unlikely portion is unlikely to be true, that would make the entire
if condition unlikely to be true, so it made no sense at all to say the
entire if condition is true.

What is more likely to be likely is just the first part of the if statement
before the && operation. It's likely to be a misplaced parenthesis. And
after making the if condition broken into a likely() && unlikely(), both
now appear to be correct!

Link: https://lkml.kernel.org/r/20231201145936.5ddfdb50@gandalf.local.home
Fixes:fb3d824d1a46c ("mm/rmap: split page_dup_rmap() into page_dup_file_rmap() and page_try_dup_anon_rmap()")
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
Acked-by: Vlastimil Babka <vbabka@suse.cz>
Cc: David Hildenbrand <david@redhat.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/rmap.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/rmap.h b/include/linux/rmap.h
index b26fe858fd44..3c2fc291b071 100644
--- a/include/linux/rmap.h
+++ b/include/linux/rmap.h
@@ -261,8 +261,8 @@ static inline int page_try_dup_anon_rmap(struct page *page, bool compound,
 	 * guarantee the pinned page won't be randomly replaced in the
 	 * future on write faults.
 	 */
-	if (likely(!is_device_private_page(page) &&
-	    unlikely(page_needs_cow_for_dma(vma, page))))
+	if (likely(!is_device_private_page(page)) &&
+	    unlikely(page_needs_cow_for_dma(vma, page)))
 		return -EBUSY;
 
 	ClearPageAnonExclusive(page);
-- 
cgit v1.2.3


From 82c944d05b1a24c76948ee9d6bb1d7de1ebb8b3a Mon Sep 17 00:00:00 2001
From: Herve Codina <herve.codina@bootlin.com>
Date: Tue, 28 Nov 2023 14:25:30 +0100
Subject: net: wan: Add framer framework support

A framer is a component in charge of an E1/T1 line interface.
Connected usually to a TDM bus, it converts TDM frames to/from E1/T1
frames. It also provides information related to the E1/T1 line.

The framer framework provides a set of APIs for the framer drivers
(framer provider) to create/destroy a framer and APIs for the framer
users (framer consumer) to obtain a reference to the framer, and
use the framer.

This basic implementation provides a framer abstraction for:
 - power on/off the framer
 - get the framer status (line state)
 - be notified on framer status changes
 - get/set the framer configuration

Signed-off-by: Herve Codina <herve.codina@bootlin.com>
Reviewed-by: Christophe Leroy <christophe.leroy@csgroup.eu>
Acked-by: Jakub Kicinski <kuba@kernel.org>
Link: https://lore.kernel.org/r/20231128132534.258459-2-herve.codina@bootlin.com
Signed-off-by: Linus Walleij <linus.walleij@linaro.org>
---
 drivers/net/wan/Kconfig                |   2 +
 drivers/net/wan/Makefile               |   2 +
 drivers/net/wan/framer/Kconfig         |  25 +
 drivers/net/wan/framer/Makefile        |   6 +
 drivers/net/wan/framer/framer-core.c   | 882 +++++++++++++++++++++++++++++++++
 include/linux/framer/framer-provider.h | 194 ++++++++
 include/linux/framer/framer.h          | 205 ++++++++
 7 files changed, 1316 insertions(+)
 create mode 100644 drivers/net/wan/framer/Kconfig
 create mode 100644 drivers/net/wan/framer/Makefile
 create mode 100644 drivers/net/wan/framer/framer-core.c
 create mode 100644 include/linux/framer/framer-provider.h
 create mode 100644 include/linux/framer/framer.h

(limited to 'include/linux')

diff --git a/drivers/net/wan/Kconfig b/drivers/net/wan/Kconfig
index dcb069dde66b..7dda87756d3f 100644
--- a/drivers/net/wan/Kconfig
+++ b/drivers/net/wan/Kconfig
@@ -95,6 +95,8 @@ config HDLC_X25
 comment "X.25/LAPB support is disabled"
 	depends on HDLC && (LAPB!=m || HDLC!=m) && LAPB!=y
 
+source "drivers/net/wan/framer/Kconfig"
+
 config PCI200SYN
 	tristate "Goramo PCI200SYN support"
 	depends on HDLC && PCI
diff --git a/drivers/net/wan/Makefile b/drivers/net/wan/Makefile
index 5bec8fae47f8..8119b49d1da9 100644
--- a/drivers/net/wan/Makefile
+++ b/drivers/net/wan/Makefile
@@ -14,6 +14,8 @@ obj-$(CONFIG_HDLC_FR)		+= hdlc_fr.o
 obj-$(CONFIG_HDLC_PPP)		+= hdlc_ppp.o
 obj-$(CONFIG_HDLC_X25)		+= hdlc_x25.o
 
+obj-y				+= framer/
+
 obj-$(CONFIG_FARSYNC)		+= farsync.o
 
 obj-$(CONFIG_LAPBETHER)		+= lapbether.o
diff --git a/drivers/net/wan/framer/Kconfig b/drivers/net/wan/framer/Kconfig
new file mode 100644
index 000000000000..57fe7ba6aa37
--- /dev/null
+++ b/drivers/net/wan/framer/Kconfig
@@ -0,0 +1,25 @@
+# SPDX-License-Identifier: GPL-2.0-only
+#
+# FRAMER
+#
+
+menuconfig FRAMER
+	tristate "Framer Subsystem"
+	help
+	  A framer is a component in charge of an E1/T1 line interface.
+	  Connected usually to a TDM bus, it converts TDM frames to/from E1/T1
+	  frames. It also provides information related to the E1/T1 line.
+	  Used with HDLC, the network can be reached through the E1/T1 line.
+
+	  This framework is designed to provide a generic interface for framer
+	  devices present in the kernel. This layer will have the generic
+	  API by which framer drivers can create framer using the framer
+	  framework and framer users can obtain reference to the framer.
+	  All the users of this framework should select this config.
+
+if FRAMER
+
+config GENERIC_FRAMER
+	bool
+
+endif # FRAMER
diff --git a/drivers/net/wan/framer/Makefile b/drivers/net/wan/framer/Makefile
new file mode 100644
index 000000000000..78dbd8e563d0
--- /dev/null
+++ b/drivers/net/wan/framer/Makefile
@@ -0,0 +1,6 @@
+# SPDX-License-Identifier: GPL-2.0
+#
+# Makefile for the framer drivers.
+#
+
+obj-$(CONFIG_GENERIC_FRAMER)			+= framer-core.o
diff --git a/drivers/net/wan/framer/framer-core.c b/drivers/net/wan/framer/framer-core.c
new file mode 100644
index 000000000000..c04dc88bda6c
--- /dev/null
+++ b/drivers/net/wan/framer/framer-core.c
@@ -0,0 +1,882 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Generic Framer framework.
+ *
+ * Copyright 2023 CS GROUP France
+ *
+ * Author: Herve Codina <herve.codina@bootlin.com>
+ */
+
+#include <linux/device.h>
+#include <linux/framer/framer.h>
+#include <linux/framer/framer-provider.h>
+#include <linux/idr.h>
+#include <linux/module.h>
+#include <linux/notifier.h>
+#include <linux/of.h>
+#include <linux/pm_runtime.h>
+#include <linux/regulator/consumer.h>
+#include <linux/slab.h>
+
+static struct class *framer_class;
+static DEFINE_MUTEX(framer_provider_mutex);
+static LIST_HEAD(framer_provider_list);
+static DEFINE_IDA(framer_ida);
+
+#define dev_to_framer(a)	(container_of((a), struct framer, dev))
+
+int framer_pm_runtime_get(struct framer *framer)
+{
+	int ret;
+
+	if (!pm_runtime_enabled(&framer->dev))
+		return -EOPNOTSUPP;
+
+	ret = pm_runtime_get(&framer->dev);
+	if (ret < 0 && ret != -EINPROGRESS)
+		pm_runtime_put_noidle(&framer->dev);
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(framer_pm_runtime_get);
+
+int framer_pm_runtime_get_sync(struct framer *framer)
+{
+	int ret;
+
+	if (!pm_runtime_enabled(&framer->dev))
+		return -EOPNOTSUPP;
+
+	ret = pm_runtime_get_sync(&framer->dev);
+	if (ret < 0)
+		pm_runtime_put_sync(&framer->dev);
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(framer_pm_runtime_get_sync);
+
+int framer_pm_runtime_put(struct framer *framer)
+{
+	if (!pm_runtime_enabled(&framer->dev))
+		return -EOPNOTSUPP;
+
+	return pm_runtime_put(&framer->dev);
+}
+EXPORT_SYMBOL_GPL(framer_pm_runtime_put);
+
+int framer_pm_runtime_put_sync(struct framer *framer)
+{
+	if (!pm_runtime_enabled(&framer->dev))
+		return -EOPNOTSUPP;
+
+	return pm_runtime_put_sync(&framer->dev);
+}
+EXPORT_SYMBOL_GPL(framer_pm_runtime_put_sync);
+
+/**
+ * framer_init - framer internal initialization before framer operation
+ * @framer: the framer returned by framer_get()
+ *
+ * Used to allow framer's driver to perform framer internal initialization,
+ * such as PLL block powering, clock initialization or anything that's
+ * is required by the framer to perform the start of operation.
+ * Must be called before framer_power_on().
+ *
+ * Return: %0 if successful, a negative error code otherwise
+ */
+int framer_init(struct framer *framer)
+{
+	bool start_polling = false;
+	int ret;
+
+	ret = framer_pm_runtime_get_sync(framer);
+	if (ret < 0 && ret != -EOPNOTSUPP)
+		return ret;
+	ret = 0; /* Override possible ret == -EOPNOTSUPP */
+
+	mutex_lock(&framer->mutex);
+	if (framer->power_count > framer->init_count)
+		dev_warn(&framer->dev, "framer_power_on was called before framer init\n");
+
+	if (framer->init_count == 0) {
+		if (framer->ops->init) {
+			ret = framer->ops->init(framer);
+			if (ret < 0) {
+				dev_err(&framer->dev, "framer init failed --> %d\n", ret);
+				goto out;
+			}
+		}
+		if (framer->ops->flags & FRAMER_FLAG_POLL_STATUS)
+			start_polling = true;
+	}
+	++framer->init_count;
+
+out:
+	mutex_unlock(&framer->mutex);
+
+	if (!ret && start_polling) {
+		ret = framer_get_status(framer, &framer->prev_status);
+		if (ret < 0) {
+			dev_warn(&framer->dev, "framer get status failed --> %d\n", ret);
+			/* Will be retried on polling_work */
+			ret = 0;
+		}
+		queue_delayed_work(system_power_efficient_wq, &framer->polling_work, 1 * HZ);
+	}
+
+	framer_pm_runtime_put(framer);
+	return ret;
+}
+EXPORT_SYMBOL_GPL(framer_init);
+
+/**
+ * framer_exit - Framer internal un-initialization
+ * @framer: the framer returned by framer_get()
+ *
+ * Must be called after framer_power_off().
+ */
+int framer_exit(struct framer *framer)
+{
+	int ret;
+
+	ret = framer_pm_runtime_get_sync(framer);
+	if (ret < 0 && ret != -EOPNOTSUPP)
+		return ret;
+	ret = 0; /* Override possible ret == -EOPNOTSUPP */
+
+	mutex_lock(&framer->mutex);
+	--framer->init_count;
+	if (framer->init_count == 0) {
+		if (framer->ops->flags & FRAMER_FLAG_POLL_STATUS) {
+			mutex_unlock(&framer->mutex);
+			cancel_delayed_work_sync(&framer->polling_work);
+			mutex_lock(&framer->mutex);
+		}
+
+		if (framer->ops->exit)
+			framer->ops->exit(framer);
+	}
+
+	mutex_unlock(&framer->mutex);
+	framer_pm_runtime_put(framer);
+	return ret;
+}
+EXPORT_SYMBOL_GPL(framer_exit);
+
+/**
+ * framer_power_on - Enable the framer and enter proper operation
+ * @framer: the framer returned by framer_get()
+ *
+ * Must be called after framer_init().
+ *
+ * Return: %0 if successful, a negative error code otherwise
+ */
+int framer_power_on(struct framer *framer)
+{
+	int ret;
+
+	if (framer->pwr) {
+		ret = regulator_enable(framer->pwr);
+		if (ret)
+			return ret;
+	}
+
+	ret = framer_pm_runtime_get_sync(framer);
+	if (ret < 0 && ret != -EOPNOTSUPP)
+		goto err_pm_sync;
+
+	mutex_lock(&framer->mutex);
+	if (framer->power_count == 0 && framer->ops->power_on) {
+		ret = framer->ops->power_on(framer);
+		if (ret < 0) {
+			dev_err(&framer->dev, "framer poweron failed --> %d\n", ret);
+			goto err_pwr_on;
+		}
+	}
+	++framer->power_count;
+	mutex_unlock(&framer->mutex);
+	return 0;
+
+err_pwr_on:
+	mutex_unlock(&framer->mutex);
+	framer_pm_runtime_put_sync(framer);
+err_pm_sync:
+	if (framer->pwr)
+		regulator_disable(framer->pwr);
+	return ret;
+}
+EXPORT_SYMBOL_GPL(framer_power_on);
+
+/**
+ * framer_power_off - Disable the framer.
+ * @framer: the framer returned by framer_get()
+ *
+ * Must be called before framer_exit().
+ *
+ * Return: %0 if successful, a negative error code otherwise
+ */
+int framer_power_off(struct framer *framer)
+{
+	int ret;
+
+	mutex_lock(&framer->mutex);
+	if (framer->power_count == 1 && framer->ops->power_off) {
+		ret = framer->ops->power_off(framer);
+		if (ret < 0) {
+			dev_err(&framer->dev, "framer poweroff failed --> %d\n", ret);
+			mutex_unlock(&framer->mutex);
+			return ret;
+		}
+	}
+	--framer->power_count;
+	mutex_unlock(&framer->mutex);
+	framer_pm_runtime_put(framer);
+
+	if (framer->pwr)
+		regulator_disable(framer->pwr);
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(framer_power_off);
+
+/**
+ * framer_get_status() - Gets the framer status
+ * @framer: the framer returned by framer_get()
+ * @status: the status to retrieve
+ *
+ * Used to get the framer status. framer_init() must have been called
+ * on the framer.
+ *
+ * Return: %0 if successful, a negative error code otherwise
+ */
+int framer_get_status(struct framer *framer, struct framer_status *status)
+{
+	int ret;
+
+	if (!framer->ops->get_status)
+		return -EOPNOTSUPP;
+
+	/* Be sure to have known values (struct padding and future extensions) */
+	memset(status, 0, sizeof(*status));
+
+	mutex_lock(&framer->mutex);
+	ret = framer->ops->get_status(framer, status);
+	mutex_unlock(&framer->mutex);
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(framer_get_status);
+
+/**
+ * framer_set_config() - Sets the framer configuration
+ * @framer: the framer returned by framer_get()
+ * @config: the configuration to set
+ *
+ * Used to set the framer configuration. framer_init() must have been called
+ * on the framer.
+ *
+ * Return: %0 if successful, a negative error code otherwise
+ */
+int framer_set_config(struct framer *framer, const struct framer_config *config)
+{
+	int ret;
+
+	if (!framer->ops->set_config)
+		return -EOPNOTSUPP;
+
+	mutex_lock(&framer->mutex);
+	ret = framer->ops->set_config(framer, config);
+	mutex_unlock(&framer->mutex);
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(framer_set_config);
+
+/**
+ * framer_get_config() - Gets the framer configuration
+ * @framer: the framer returned by framer_get()
+ * @config: the configuration to retrieve
+ *
+ * Used to get the framer configuration. framer_init() must have been called
+ * on the framer.
+ *
+ * Return: %0 if successful, a negative error code otherwise
+ */
+int framer_get_config(struct framer *framer, struct framer_config *config)
+{
+	int ret;
+
+	if (!framer->ops->get_config)
+		return -EOPNOTSUPP;
+
+	mutex_lock(&framer->mutex);
+	ret = framer->ops->get_config(framer, config);
+	mutex_unlock(&framer->mutex);
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(framer_get_config);
+
+static void framer_polling_work(struct work_struct *work)
+{
+	struct framer *framer = container_of(work, struct framer, polling_work.work);
+	struct framer_status status;
+	int ret;
+
+	ret = framer_get_status(framer, &status);
+	if (ret) {
+		dev_err(&framer->dev, "polling, get status failed (%d)\n", ret);
+		goto end;
+	}
+	if (memcmp(&framer->prev_status, &status, sizeof(status))) {
+		blocking_notifier_call_chain(&framer->notifier_list,
+					     FRAMER_EVENT_STATUS, NULL);
+		memcpy(&framer->prev_status, &status, sizeof(status));
+	}
+
+end:
+	/* Re-schedule task in 1 sec */
+	queue_delayed_work(system_power_efficient_wq, &framer->polling_work, 1 * HZ);
+}
+
+/**
+ * framer_notifier_register() - Registers a notifier
+ * @framer: the framer returned by framer_get()
+ * @nb: the notifier block to register
+ *
+ * Used to register a notifier block on framer events. framer_init() must have
+ * been called on the framer.
+ * The available framer events are present in enum framer_events.
+ *
+ * Return: %0 if successful, a negative error code otherwise
+ */
+int framer_notifier_register(struct framer *framer, struct notifier_block *nb)
+{
+	return blocking_notifier_chain_register(&framer->notifier_list, nb);
+}
+EXPORT_SYMBOL_GPL(framer_notifier_register);
+
+/**
+ * framer_notifier_unregister() - Unregisters a notifier
+ * @framer: the framer returned by framer_get()
+ * @nb: the notifier block to unregister
+ *
+ * Used to unregister a notifier block. framer_init() must have
+ * been called on the framer.
+ *
+ * Return: %0 if successful, a negative error code otherwise
+ */
+int framer_notifier_unregister(struct framer *framer, struct notifier_block *nb)
+{
+	return blocking_notifier_chain_unregister(&framer->notifier_list, nb);
+}
+EXPORT_SYMBOL_GPL(framer_notifier_unregister);
+
+static struct framer_provider *framer_provider_of_lookup(const struct device_node *node)
+{
+	struct framer_provider *framer_provider;
+
+	list_for_each_entry(framer_provider, &framer_provider_list, list) {
+		if (device_match_of_node(framer_provider->dev, node))
+			return framer_provider;
+	}
+
+	return ERR_PTR(-EPROBE_DEFER);
+}
+
+static struct framer *framer_of_get_from_provider(struct of_phandle_args *args)
+{
+	struct framer_provider *framer_provider;
+	struct framer *framer;
+
+	mutex_lock(&framer_provider_mutex);
+	framer_provider = framer_provider_of_lookup(args->np);
+	if (IS_ERR(framer_provider) || !try_module_get(framer_provider->owner)) {
+		framer = ERR_PTR(-EPROBE_DEFER);
+		goto end;
+	}
+
+	framer = framer_provider->of_xlate(framer_provider->dev, args);
+
+	module_put(framer_provider->owner);
+
+end:
+	mutex_unlock(&framer_provider_mutex);
+
+	return framer;
+}
+
+static struct framer *framer_of_get_byphandle(struct device_node *np, const char *propname,
+					      int index)
+{
+	struct of_phandle_args args;
+	struct framer *framer;
+	int ret;
+
+	ret = of_parse_phandle_with_optional_args(np, propname, "#framer-cells", index, &args);
+	if (ret)
+		return ERR_PTR(-ENODEV);
+
+	if (!of_device_is_available(args.np)) {
+		framer = ERR_PTR(-ENODEV);
+		goto out_node_put;
+	}
+
+	framer = framer_of_get_from_provider(&args);
+
+out_node_put:
+	of_node_put(args.np);
+
+	return framer;
+}
+
+static struct framer *framer_of_get_byparent(struct device_node *np, int index)
+{
+	struct of_phandle_args args;
+	struct framer *framer;
+
+	args.np = of_get_parent(np);
+	args.args_count = 1;
+	args.args[0] = index;
+
+	while (args.np) {
+		framer = framer_of_get_from_provider(&args);
+		if (IS_ERR(framer) && PTR_ERR(framer) != -EPROBE_DEFER) {
+			args.np = of_get_next_parent(args.np);
+			continue;
+		}
+		of_node_put(args.np);
+		return framer;
+	}
+
+	return ERR_PTR(-ENODEV);
+}
+
+/**
+ * framer_get() - lookup and obtain a reference to a framer.
+ * @dev: device that requests the framer
+ * @con_id: name of the framer from device's point of view
+ *
+ * Returns the framer driver, after getting a refcount to it; or
+ * -ENODEV if there is no such framer. The caller is responsible for
+ * calling framer_put() to release that count.
+ */
+struct framer *framer_get(struct device *dev, const char *con_id)
+{
+	struct framer *framer = ERR_PTR(-ENODEV);
+	struct device_link *link;
+	int ret;
+
+	if (dev->of_node) {
+		if (con_id)
+			framer = framer_of_get_byphandle(dev->of_node, con_id, 0);
+		else
+			framer = framer_of_get_byparent(dev->of_node, 0);
+	}
+
+	if (IS_ERR(framer))
+		return framer;
+
+	get_device(&framer->dev);
+
+	if (!try_module_get(framer->ops->owner)) {
+		ret = -EPROBE_DEFER;
+		goto err_put_device;
+	}
+
+	link = device_link_add(dev, &framer->dev, DL_FLAG_STATELESS);
+	if (!link) {
+		dev_err(dev, "failed to create device_link to %s\n", dev_name(&framer->dev));
+		ret = -EPROBE_DEFER;
+		goto err_module_put;
+	}
+
+	return framer;
+
+err_module_put:
+	module_put(framer->ops->owner);
+err_put_device:
+	put_device(&framer->dev);
+	return ERR_PTR(ret);
+}
+EXPORT_SYMBOL_GPL(framer_get);
+
+/**
+ * framer_put() - release the framer
+ * @dev: device that wants to release this framer
+ * @framer: the framer returned by framer_get()
+ *
+ * Releases a refcount the caller received from framer_get().
+ */
+void framer_put(struct device *dev, struct framer *framer)
+{
+	device_link_remove(dev, &framer->dev);
+
+	module_put(framer->ops->owner);
+	put_device(&framer->dev);
+}
+EXPORT_SYMBOL_GPL(framer_put);
+
+static void devm_framer_put(struct device *dev, void *res)
+{
+	struct framer *framer = *(struct framer **)res;
+
+	framer_put(dev, framer);
+}
+
+/**
+ * devm_framer_get() - lookup and obtain a reference to a framer.
+ * @dev: device that requests this framer
+ * @con_id: name of the framer from device's point of view
+ *
+ * Gets the framer using framer_get(), and associates a device with it using
+ * devres. On driver detach, framer_put() function is invoked on the devres
+ * data, then, devres data is freed.
+ */
+struct framer *devm_framer_get(struct device *dev, const char *con_id)
+{
+	struct framer **ptr, *framer;
+
+	ptr = devres_alloc(devm_framer_put, sizeof(*ptr), GFP_KERNEL);
+	if (!ptr)
+		return ERR_PTR(-ENOMEM);
+
+	framer = framer_get(dev, con_id);
+	if (!IS_ERR(framer)) {
+		*ptr = framer;
+		devres_add(dev, ptr);
+	} else {
+		devres_free(ptr);
+		return framer;
+	}
+
+	return framer;
+}
+EXPORT_SYMBOL_GPL(devm_framer_get);
+
+/**
+ * devm_framer_optional_get() - lookup and obtain a reference to an optional
+ * framer.
+ * @dev: device that requests this framer
+ * @con_id: name of the framer from device's point of view
+ *
+ * Same as devm_framer_get() except that if the framer does not exist, it is not
+ * considered an error and -ENODEV will not be returned. Instead the NULL framer
+ * is returned.
+ */
+struct framer *devm_framer_optional_get(struct device *dev, const char *con_id)
+{
+	struct framer *framer = devm_framer_get(dev, con_id);
+
+	if (PTR_ERR(framer) == -ENODEV)
+		framer = NULL;
+
+	return framer;
+}
+EXPORT_SYMBOL_GPL(devm_framer_optional_get);
+
+static void framer_notify_status_work(struct work_struct *work)
+{
+	struct framer *framer = container_of(work, struct framer, notify_status_work);
+
+	blocking_notifier_call_chain(&framer->notifier_list, FRAMER_EVENT_STATUS, NULL);
+}
+
+void framer_notify_status_change(struct framer *framer)
+{
+	/* Can be called from atomic context -> just schedule a task to call
+	 * blocking notifiers
+	 */
+	queue_work(system_power_efficient_wq, &framer->notify_status_work);
+}
+EXPORT_SYMBOL_GPL(framer_notify_status_change);
+
+/**
+ * framer_create() - create a new framer
+ * @dev: device that is creating the new framer
+ * @node: device node of the framer. default to dev->of_node.
+ * @ops: function pointers for performing framer operations
+ *
+ * Called to create a framer using framer framework.
+ */
+struct framer *framer_create(struct device *dev, struct device_node *node,
+			     const struct framer_ops *ops)
+{
+	struct framer *framer;
+	int ret;
+	int id;
+
+	/* get_status() is mandatory if the provider ask for polling status */
+	if (WARN_ON((ops->flags & FRAMER_FLAG_POLL_STATUS) && !ops->get_status))
+		return ERR_PTR(-EINVAL);
+
+	framer = kzalloc(sizeof(*framer), GFP_KERNEL);
+	if (!framer)
+		return ERR_PTR(-ENOMEM);
+
+	id = ida_alloc(&framer_ida, GFP_KERNEL);
+	if (id < 0) {
+		dev_err(dev, "unable to get id\n");
+		ret = id;
+		goto free_framer;
+	}
+
+	device_initialize(&framer->dev);
+	mutex_init(&framer->mutex);
+	INIT_WORK(&framer->notify_status_work, framer_notify_status_work);
+	INIT_DELAYED_WORK(&framer->polling_work, framer_polling_work);
+	BLOCKING_INIT_NOTIFIER_HEAD(&framer->notifier_list);
+
+	framer->dev.class = framer_class;
+	framer->dev.parent = dev;
+	framer->dev.of_node = node ? node : dev->of_node;
+	framer->id = id;
+	framer->ops = ops;
+
+	ret = dev_set_name(&framer->dev, "framer-%s.%d", dev_name(dev), id);
+	if (ret)
+		goto put_dev;
+
+	/* framer-supply */
+	framer->pwr = regulator_get_optional(&framer->dev, "framer");
+	if (IS_ERR(framer->pwr)) {
+		ret = PTR_ERR(framer->pwr);
+		if (ret == -EPROBE_DEFER)
+			goto put_dev;
+
+		framer->pwr = NULL;
+	}
+
+	ret = device_add(&framer->dev);
+	if (ret)
+		goto put_dev;
+
+	if (pm_runtime_enabled(dev)) {
+		pm_runtime_enable(&framer->dev);
+		pm_runtime_no_callbacks(&framer->dev);
+	}
+
+	return framer;
+
+put_dev:
+	put_device(&framer->dev);  /* calls framer_release() which frees resources */
+	return ERR_PTR(ret);
+
+free_framer:
+	kfree(framer);
+	return ERR_PTR(ret);
+}
+EXPORT_SYMBOL_GPL(framer_create);
+
+/**
+ * framer_destroy() - destroy the framer
+ * @framer: the framer to be destroyed
+ *
+ * Called to destroy the framer.
+ */
+void framer_destroy(struct framer *framer)
+{
+	/* polling_work should already be stopped but if framer_exit() was not
+	 * called (bug), here it's the last time to do that ...
+	 */
+	cancel_delayed_work_sync(&framer->polling_work);
+	cancel_work_sync(&framer->notify_status_work);
+	pm_runtime_disable(&framer->dev);
+	device_unregister(&framer->dev); /* calls framer_release() which frees resources */
+}
+EXPORT_SYMBOL_GPL(framer_destroy);
+
+static void devm_framer_destroy(struct device *dev, void *res)
+{
+	struct framer *framer = *(struct framer **)res;
+
+	framer_destroy(framer);
+}
+
+/**
+ * devm_framer_create() - create a new framer
+ * @dev: device that is creating the new framer
+ * @node: device node of the framer
+ * @ops: function pointers for performing framer operations
+ *
+ * Creates a new framer device adding it to the framer class.
+ * While at that, it also associates the device with the framer using devres.
+ * On driver detach, release function is invoked on the devres data,
+ * then, devres data is freed.
+ */
+struct framer *devm_framer_create(struct device *dev, struct device_node *node,
+				  const struct framer_ops *ops)
+{
+	struct framer **ptr, *framer;
+
+	ptr = devres_alloc(devm_framer_destroy, sizeof(*ptr), GFP_KERNEL);
+	if (!ptr)
+		return ERR_PTR(-ENOMEM);
+
+	framer = framer_create(dev, node, ops);
+	if (!IS_ERR(framer)) {
+		*ptr = framer;
+		devres_add(dev, ptr);
+	} else {
+		devres_free(ptr);
+	}
+
+	return framer;
+}
+EXPORT_SYMBOL_GPL(devm_framer_create);
+
+/**
+ * framer_provider_simple_of_xlate() - returns the framer instance from framer provider
+ * @dev: the framer provider device
+ * @args: of_phandle_args (not used here)
+ *
+ * Intended to be used by framer provider for the common case where #framer-cells is
+ * 0. For other cases where #framer-cells is greater than '0', the framer provider
+ * should provide a custom of_xlate function that reads the *args* and returns
+ * the appropriate framer.
+ */
+struct framer *framer_provider_simple_of_xlate(struct device *dev, struct of_phandle_args *args)
+{
+	struct class_dev_iter iter;
+	struct framer *framer;
+
+	class_dev_iter_init(&iter, framer_class, NULL, NULL);
+	while ((dev = class_dev_iter_next(&iter))) {
+		framer = dev_to_framer(dev);
+		if (args->np != framer->dev.of_node)
+			continue;
+
+		class_dev_iter_exit(&iter);
+		return framer;
+	}
+
+	class_dev_iter_exit(&iter);
+	return ERR_PTR(-ENODEV);
+}
+EXPORT_SYMBOL_GPL(framer_provider_simple_of_xlate);
+
+/**
+ * __framer_provider_of_register() - create/register framer provider with the framework
+ * @dev: struct device of the framer provider
+ * @owner: the module owner containing of_xlate
+ * @of_xlate: function pointer to obtain framer instance from framer provider
+ *
+ * Creates struct framer_provider from dev and of_xlate function pointer.
+ * This is used in the case of dt boot for finding the framer instance from
+ * framer provider.
+ */
+struct framer_provider *
+__framer_provider_of_register(struct device *dev, struct module *owner,
+			      struct framer *(*of_xlate)(struct device *dev,
+							 struct of_phandle_args *args))
+{
+	struct framer_provider *framer_provider;
+
+	framer_provider = kzalloc(sizeof(*framer_provider), GFP_KERNEL);
+	if (!framer_provider)
+		return ERR_PTR(-ENOMEM);
+
+	framer_provider->dev = dev;
+	framer_provider->owner = owner;
+	framer_provider->of_xlate = of_xlate;
+
+	of_node_get(framer_provider->dev->of_node);
+
+	mutex_lock(&framer_provider_mutex);
+	list_add_tail(&framer_provider->list, &framer_provider_list);
+	mutex_unlock(&framer_provider_mutex);
+
+	return framer_provider;
+}
+EXPORT_SYMBOL_GPL(__framer_provider_of_register);
+
+/**
+ * framer_provider_of_unregister() - unregister framer provider from the framework
+ * @framer_provider: framer provider returned by framer_provider_of_register()
+ *
+ * Removes the framer_provider created using framer_provider_of_register().
+ */
+void framer_provider_of_unregister(struct framer_provider *framer_provider)
+{
+	mutex_lock(&framer_provider_mutex);
+	list_del(&framer_provider->list);
+	mutex_unlock(&framer_provider_mutex);
+
+	of_node_put(framer_provider->dev->of_node);
+	kfree(framer_provider);
+}
+EXPORT_SYMBOL_GPL(framer_provider_of_unregister);
+
+static void devm_framer_provider_of_unregister(struct device *dev, void *res)
+{
+	struct framer_provider *framer_provider = *(struct framer_provider **)res;
+
+	framer_provider_of_unregister(framer_provider);
+}
+
+/**
+ * __devm_framer_provider_of_register() - create/register framer provider with
+ * the framework
+ * @dev: struct device of the framer provider
+ * @owner: the module owner containing of_xlate
+ * @of_xlate: function pointer to obtain framer instance from framer provider
+ *
+ * Creates struct framer_provider from dev and of_xlate function pointer.
+ * This is used in the case of dt boot for finding the framer instance from
+ * framer provider. While at that, it also associates the device with the
+ * framer provider using devres. On driver detach, release function is invoked
+ * on the devres data, then, devres data is freed.
+ */
+struct framer_provider *
+__devm_framer_provider_of_register(struct device *dev, struct module *owner,
+				   struct framer *(*of_xlate)(struct device *dev,
+							      struct of_phandle_args *args))
+{
+	struct framer_provider **ptr, *framer_provider;
+
+	ptr = devres_alloc(devm_framer_provider_of_unregister, sizeof(*ptr), GFP_KERNEL);
+	if (!ptr)
+		return ERR_PTR(-ENOMEM);
+
+	framer_provider = __framer_provider_of_register(dev, owner, of_xlate);
+	if (!IS_ERR(framer_provider)) {
+		*ptr = framer_provider;
+		devres_add(dev, ptr);
+	} else {
+		devres_free(ptr);
+	}
+
+	return framer_provider;
+}
+EXPORT_SYMBOL_GPL(__devm_framer_provider_of_register);
+
+/**
+ * framer_release() - release the framer
+ * @dev: the dev member within framer
+ *
+ * When the last reference to the device is removed, it is called
+ * from the embedded kobject as release method.
+ */
+static void framer_release(struct device *dev)
+{
+	struct framer *framer;
+
+	framer = dev_to_framer(dev);
+	regulator_put(framer->pwr);
+	ida_free(&framer_ida, framer->id);
+	kfree(framer);
+}
+
+static int __init framer_core_init(void)
+{
+	framer_class = class_create("framer");
+	if (IS_ERR(framer_class)) {
+		pr_err("failed to create framer class (%pe)\n", framer_class);
+		return PTR_ERR(framer_class);
+	}
+
+	framer_class->dev_release = framer_release;
+
+	return 0;
+}
+device_initcall(framer_core_init);
diff --git a/include/linux/framer/framer-provider.h b/include/linux/framer/framer-provider.h
new file mode 100644
index 000000000000..782cd5fc83d5
--- /dev/null
+++ b/include/linux/framer/framer-provider.h
@@ -0,0 +1,194 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * Generic framer profider header file
+ *
+ * Copyright 2023 CS GROUP France
+ *
+ * Author: Herve Codina <herve.codina@bootlin.com>
+ */
+
+#ifndef __DRIVERS_PROVIDER_FRAMER_H
+#define __DRIVERS_PROVIDER_FRAMER_H
+
+#include <linux/export.h>
+#include <linux/framer/framer.h>
+#include <linux/types.h>
+
+#define FRAMER_FLAG_POLL_STATUS  BIT(0)
+
+/**
+ * struct framer_ops - set of function pointers for performing framer operations
+ * @init: operation to be performed for initializing the framer
+ * @exit: operation to be performed while exiting
+ * @power_on: powering on the framer
+ * @power_off: powering off the framer
+ * @flags: OR-ed flags (FRAMER_FLAG_*) to ask for core functionality
+ *          - @FRAMER_FLAG_POLL_STATUS:
+ *            Ask the core to perform a polling to get the framer status and
+ *            notify consumers on change.
+ *            The framer should call @framer_notify_status_change() when it
+ *            detects a status change. This is usually done using interrupts.
+ *            If the framer cannot detect this change, it can ask the core for
+ *            a status polling. The core will call @get_status() periodically
+ *            and, on change detected, it will notify the consumer.
+ *            the @get_status()
+ * @owner: the module owner containing the ops
+ */
+struct framer_ops {
+	int	(*init)(struct framer *framer);
+	void	(*exit)(struct framer *framer);
+	int	(*power_on)(struct framer *framer);
+	int	(*power_off)(struct framer *framer);
+
+	/**
+	 * @get_status:
+	 *
+	 * Optional.
+	 *
+	 * Used to get the framer status. framer_init() must have
+	 * been called on the framer.
+	 *
+	 * Returns: 0 if successful, an negative error code otherwise
+	 */
+	int	(*get_status)(struct framer *framer, struct framer_status *status);
+
+	/**
+	 * @set_config:
+	 *
+	 * Optional.
+	 *
+	 * Used to set the framer configuration. framer_init() must have
+	 * been called on the framer.
+	 *
+	 * Returns: 0 if successful, an negative error code otherwise
+	 */
+	int	(*set_config)(struct framer *framer, const struct framer_config *config);
+
+	/**
+	 * @get_config:
+	 *
+	 * Optional.
+	 *
+	 * Used to get the framer configuration. framer_init() must have
+	 * been called on the framer.
+	 *
+	 * Returns: 0 if successful, an negative error code otherwise
+	 */
+	int	(*get_config)(struct framer *framer, struct framer_config *config);
+
+	u32 flags;
+	struct module *owner;
+};
+
+/**
+ * struct framer_provider - represents the framer provider
+ * @dev: framer provider device
+ * @children: can be used to override the default (dev->of_node) child node
+ * @owner: the module owner having of_xlate
+ * @list: to maintain a linked list of framer providers
+ * @of_xlate: function pointer to obtain framer instance from framer pointer
+ */
+struct framer_provider {
+	struct device		*dev;
+	struct module		*owner;
+	struct list_head	list;
+	struct framer * (*of_xlate)(struct device *dev,
+				    struct of_phandle_args *args);
+};
+
+static inline void framer_set_drvdata(struct framer *framer, void *data)
+{
+	dev_set_drvdata(&framer->dev, data);
+}
+
+static inline void *framer_get_drvdata(struct framer *framer)
+{
+	return dev_get_drvdata(&framer->dev);
+}
+
+#if IS_ENABLED(CONFIG_GENERIC_FRAMER)
+
+/* Create and destroy a framer */
+struct framer *framer_create(struct device *dev, struct device_node *node,
+			     const struct framer_ops *ops);
+void framer_destroy(struct framer *framer);
+
+/* devm version */
+struct framer *devm_framer_create(struct device *dev, struct device_node *node,
+				  const struct framer_ops *ops);
+
+struct framer *framer_provider_simple_of_xlate(struct device *dev,
+					       struct of_phandle_args *args);
+
+struct framer_provider *
+__framer_provider_of_register(struct device *dev, struct module *owner,
+			      struct framer *(*of_xlate)(struct device *dev,
+							 struct of_phandle_args *args));
+
+void framer_provider_of_unregister(struct framer_provider *framer_provider);
+
+struct framer_provider *
+__devm_framer_provider_of_register(struct device *dev, struct module *owner,
+				   struct framer *(*of_xlate)(struct device *dev,
+							      struct of_phandle_args *args));
+
+void framer_notify_status_change(struct framer *framer);
+
+#else /* IS_ENABLED(CONFIG_GENERIC_FRAMER) */
+
+static inline struct framer *framer_create(struct device *dev, struct device_node *node,
+					   const struct framer_ops *ops)
+{
+	return ERR_PTR(-ENOSYS);
+}
+
+static inline void framer_destroy(struct framer *framer)
+{
+}
+
+/* devm version */
+static inline struct framer *devm_framer_create(struct device *dev, struct device_node *node,
+						const struct framer_ops *ops)
+{
+	return ERR_PTR(-ENOSYS);
+}
+
+static inline struct framer *framer_provider_simple_of_xlate(struct device *dev,
+							     struct of_phandle_args *args)
+{
+	return ERR_PTR(-ENOSYS);
+}
+
+static inline struct framer_provider *
+__framer_provider_of_register(struct device *dev, struct module *owner,
+			      struct framer *(*of_xlate)(struct device *dev,
+							 struct of_phandle_args *args))
+{
+	return ERR_PTR(-ENOSYS);
+}
+
+void framer_provider_of_unregister(struct framer_provider *framer_provider)
+{
+}
+
+static inline struct framer_provider *
+__devm_framer_provider_of_register(struct device *dev, struct module *owner,
+				   struct framer *(*of_xlate)(struct device *dev,
+							      struct of_phandle_args *args))
+{
+	return ERR_PTR(-ENOSYS);
+}
+
+void framer_notify_status_change(struct framer *framer)
+{
+}
+
+#endif /* IS_ENABLED(CONFIG_GENERIC_FRAMER) */
+
+#define framer_provider_of_register(dev, xlate)		\
+	__framer_provider_of_register((dev), THIS_MODULE, (xlate))
+
+#define devm_framer_provider_of_register(dev, xlate)	\
+	__devm_framer_provider_of_register((dev), THIS_MODULE, (xlate))
+
+#endif /* __DRIVERS_PROVIDER_FRAMER_H */
diff --git a/include/linux/framer/framer.h b/include/linux/framer/framer.h
new file mode 100644
index 000000000000..9a9b88962c29
--- /dev/null
+++ b/include/linux/framer/framer.h
@@ -0,0 +1,205 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * Generic framer header file
+ *
+ * Copyright 2023 CS GROUP France
+ *
+ * Author: Herve Codina <herve.codina@bootlin.com>
+ */
+
+#ifndef __DRIVERS_FRAMER_H
+#define __DRIVERS_FRAMER_H
+
+#include <linux/err.h>
+#include <linux/mutex.h>
+#include <linux/notifier.h>
+#include <linux/of.h>
+#include <linux/device.h>
+#include <linux/workqueue.h>
+
+/**
+ * enum framer_iface - Framer interface
+ * @FRAMER_IFACE_E1: E1 interface
+ * @FRAMER_IFACE_T1: T1 interface
+ */
+enum framer_iface {
+	FRAMER_IFACE_E1,
+	FRAMER_IFACE_T1,
+};
+
+/**
+ * enum framer_clock_type - Framer clock type
+ * @FRAMER_CLOCK_EXT: External clock
+ * @FRAMER_CLOCK_INT: Internal clock
+ */
+enum framer_clock_type {
+	FRAMER_CLOCK_EXT,
+	FRAMER_CLOCK_INT,
+};
+
+/**
+ * struct framer_config - Framer configuration
+ * @iface: Framer line interface
+ * @clock_type: Framer clock type
+ * @line_clock_rate: Framer line clock rate
+ */
+struct framer_config {
+	enum framer_iface iface;
+	enum framer_clock_type clock_type;
+	unsigned long line_clock_rate;
+};
+
+/**
+ * struct framer_status - Framer status
+ * @link_is_on: Framer link state. true, the link is on, false, the link is off.
+ */
+struct framer_status {
+	bool link_is_on;
+};
+
+/**
+ * enum framer_event - Event available for notification
+ * @FRAMER_EVENT_STATUS: Event notified on framer_status changes
+ */
+enum framer_event {
+	FRAMER_EVENT_STATUS,
+};
+
+/**
+ * struct framer - represents the framer device
+ * @dev: framer device
+ * @id: id of the framer device
+ * @ops: function pointers for performing framer operations
+ * @mutex: mutex to protect framer_ops
+ * @init_count: used to protect when the framer is used by multiple consumers
+ * @power_count: used to protect when the framer is used by multiple consumers
+ * @pwr: power regulator associated with the framer
+ * @notify_status_work: work structure used for status notifications
+ * @notifier_list: notifier list used for notifications
+ * @polling_work: delayed work structure used for the polling task
+ * @prev_status: previous read status used by the polling task to detect changes
+ */
+struct framer {
+	struct device			dev;
+	int				id;
+	const struct framer_ops		*ops;
+	struct mutex			mutex;	/* Protect framer */
+	int				init_count;
+	int				power_count;
+	struct regulator		*pwr;
+	struct work_struct		notify_status_work;
+	struct blocking_notifier_head	notifier_list;
+	struct delayed_work		polling_work;
+	struct framer_status		prev_status;
+};
+
+#if IS_ENABLED(CONFIG_GENERIC_FRAMER)
+int framer_pm_runtime_get(struct framer *framer);
+int framer_pm_runtime_get_sync(struct framer *framer);
+int framer_pm_runtime_put(struct framer *framer);
+int framer_pm_runtime_put_sync(struct framer *framer);
+int framer_init(struct framer *framer);
+int framer_exit(struct framer *framer);
+int framer_power_on(struct framer *framer);
+int framer_power_off(struct framer *framer);
+int framer_get_status(struct framer *framer, struct framer_status *status);
+int framer_get_config(struct framer *framer, struct framer_config *config);
+int framer_set_config(struct framer *framer, const struct framer_config *config);
+int framer_notifier_register(struct framer *framer, struct notifier_block *nb);
+int framer_notifier_unregister(struct framer *framer, struct notifier_block *nb);
+
+struct framer *framer_get(struct device *dev, const char *con_id);
+void framer_put(struct device *dev, struct framer *framer);
+
+struct framer *devm_framer_get(struct device *dev, const char *con_id);
+struct framer *devm_framer_optional_get(struct device *dev, const char *con_id);
+#else
+static inline int framer_pm_runtime_get(struct framer *framer)
+{
+	return -ENOSYS;
+}
+
+static inline int framer_pm_runtime_get_sync(struct framer *framer)
+{
+	return -ENOSYS;
+}
+
+static inline int framer_pm_runtime_put(struct framer *framer)
+{
+	return -ENOSYS;
+}
+
+static inline int framer_pm_runtime_put_sync(struct framer *framer)
+{
+	return -ENOSYS;
+}
+
+static inline int framer_init(struct framer *framer)
+{
+	return -ENOSYS;
+}
+
+static inline int framer_exit(struct framer *framer)
+{
+	return -ENOSYS;
+}
+
+static inline int framer_power_on(struct framer *framer)
+{
+	return -ENOSYS;
+}
+
+static inline int framer_power_off(struct framer *framer)
+{
+	return -ENOSYS;
+}
+
+static inline int framer_get_status(struct framer *framer, struct framer_status *status)
+{
+	return -ENOSYS;
+}
+
+static inline int framer_get_config(struct framer *framer, struct framer_config *config)
+{
+	return -ENOSYS;
+}
+
+static inline int framer_set_config(struct framer *framer, const struct framer_config *config)
+{
+	return -ENOSYS;
+}
+
+static inline int framer_notifier_register(struct framer *framer,
+					   struct notifier_block *nb)
+{
+	return -ENOSYS;
+}
+
+static inline int framer_notifier_unregister(struct framer *framer,
+					     struct notifier_block *nb)
+{
+	return -ENOSYS;
+}
+
+struct framer *framer_get(struct device *dev, const char *con_id)
+{
+	return ERR_PTR(-ENOSYS);
+}
+
+void framer_put(struct device *dev, struct framer *framer)
+{
+}
+
+static inline struct framer *devm_framer_get(struct device *dev, const char *con_id)
+{
+	return ERR_PTR(-ENOSYS);
+}
+
+static inline struct framer *devm_framer_optional_get(struct device *dev, const char *con_id)
+{
+	return NULL;
+}
+
+#endif
+
+#endif /* __DRIVERS_FRAMER_H */
-- 
cgit v1.2.3


From c96e976d9a05d559f4ac4f617ea0f798c75a1799 Mon Sep 17 00:00:00 2001
From: Herve Codina <herve.codina@bootlin.com>
Date: Tue, 28 Nov 2023 14:25:32 +0100
Subject: net: wan: framer: Add support for the Lantiq PEF2256 framer

The Lantiq PEF2256 is a framer and line interface component designed to
fulfill all required interfacing between an analog E1/T1/J1 line and the
digital PCM system highway/H.100 bus.

Signed-off-by: Herve Codina <herve.codina@bootlin.com>
Reviewed-by: Christophe Leroy <christophe.leroy@csgroup.eu>
Reviewed-by: Linus Walleij <linus.walleij@linaro.org>
Acked-by: Jakub Kicinski <kuba@kernel.org>
Link: https://lore.kernel.org/r/20231128132534.258459-4-herve.codina@bootlin.com
Signed-off-by: Linus Walleij <linus.walleij@linaro.org>
---
 drivers/net/wan/framer/Kconfig                |  17 +
 drivers/net/wan/framer/Makefile               |   1 +
 drivers/net/wan/framer/pef2256/Makefile       |   8 +
 drivers/net/wan/framer/pef2256/pef2256-regs.h | 250 ++++++++
 drivers/net/wan/framer/pef2256/pef2256.c      | 880 ++++++++++++++++++++++++++
 include/linux/framer/pef2256.h                |  31 +
 6 files changed, 1187 insertions(+)
 create mode 100644 drivers/net/wan/framer/pef2256/Makefile
 create mode 100644 drivers/net/wan/framer/pef2256/pef2256-regs.h
 create mode 100644 drivers/net/wan/framer/pef2256/pef2256.c
 create mode 100644 include/linux/framer/pef2256.h

(limited to 'include/linux')

diff --git a/drivers/net/wan/framer/Kconfig b/drivers/net/wan/framer/Kconfig
index 57fe7ba6aa37..dc83caef9485 100644
--- a/drivers/net/wan/framer/Kconfig
+++ b/drivers/net/wan/framer/Kconfig
@@ -22,4 +22,21 @@ if FRAMER
 config GENERIC_FRAMER
 	bool
 
+config FRAMER_PEF2256
+	tristate "Lantiq PEF2256"
+	depends on OF
+	depends on HAS_IOMEM
+	select GENERIC_FRAMER
+	select MFD_CORE
+	select REGMAP_MMIO
+	help
+	  Enable support for the Lantiq PEF2256 (FALC56) framer.
+	  The PEF2256 is a framer and line interface between analog E1/T1/J1
+	  line and a digital PCM bus.
+
+	  If unsure, say N.
+
+	  To compile this driver as a module, choose M here: the
+	  module will be called framer-pef2256.
+
 endif # FRAMER
diff --git a/drivers/net/wan/framer/Makefile b/drivers/net/wan/framer/Makefile
index 78dbd8e563d0..3403f2b14534 100644
--- a/drivers/net/wan/framer/Makefile
+++ b/drivers/net/wan/framer/Makefile
@@ -4,3 +4,4 @@
 #
 
 obj-$(CONFIG_GENERIC_FRAMER)			+= framer-core.o
+obj-$(CONFIG_FRAMER_PEF2256)			+= pef2256/
diff --git a/drivers/net/wan/framer/pef2256/Makefile b/drivers/net/wan/framer/pef2256/Makefile
new file mode 100644
index 000000000000..f4d1208dd8a4
--- /dev/null
+++ b/drivers/net/wan/framer/pef2256/Makefile
@@ -0,0 +1,8 @@
+# SPDX-License-Identifier: GPL-2.0
+#
+# Makefile for the pef2256 driver.
+#
+
+obj-$(CONFIG_FRAMER_PEF2256)		+= framer-pef2256.o
+
+framer-pef2256-objs	:= pef2256.o
diff --git a/drivers/net/wan/framer/pef2256/pef2256-regs.h b/drivers/net/wan/framer/pef2256/pef2256-regs.h
new file mode 100644
index 000000000000..5d3183c91714
--- /dev/null
+++ b/drivers/net/wan/framer/pef2256/pef2256-regs.h
@@ -0,0 +1,250 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * PEF2256 registers definition
+ *
+ * Copyright 2023 CS GROUP France
+ *
+ * Author: Herve Codina <herve.codina@bootlin.com>
+ */
+#ifndef __PEF2256_REGS_H__
+#define __PEF2256_REGS_H__
+
+#include "linux/bitfield.h"
+
+/* Command Register */
+#define PEF2256_CMDR		0x02
+#define PEF2256_CMDR_RRES	BIT(6)
+#define PEF2256_CMDR_XRES	BIT(4)
+#define PEF2256_CMDR_SRES	BIT(0)
+
+/* Interrupt Mask Register 0..5 */
+#define PEF2256_IMR0	        0x14
+#define PEF2256_IMR1	        0x15
+#define PEF2256_IMR2	        0x16
+#define PEF2256_IMR3	        0x17
+#define PEF2256_IMR4	        0x18
+#define PEF2256_IMR5	        0x19
+
+/* Framer Mode Register 0 */
+#define PEF2256_FMR0		0x1C
+#define PEF2256_FMR0_XC_MASK	GENMASK(7, 6)
+#define PEF2256_FMR0_XC_NRZ	FIELD_PREP_CONST(PEF2256_FMR0_XC_MASK, 0x0)
+#define PEF2256_FMR0_XC_CMI	FIELD_PREP_CONST(PEF2256_FMR0_XC_MASK, 0x1)
+#define PEF2256_FMR0_XC_AMI	FIELD_PREP_CONST(PEF2256_FMR0_XC_MASK, 0x2)
+#define PEF2256_FMR0_XC_HDB3	FIELD_PREP_CONST(PEF2256_FMR0_XC_MASK, 0x3)
+#define PEF2256_FMR0_RC_MASK	GENMASK(5, 4)
+#define PEF2256_FMR0_RC_NRZ	FIELD_PREP_CONST(PEF2256_FMR0_RC_MASK, 0x0)
+#define PEF2256_FMR0_RC_CMI	FIELD_PREP_CONST(PEF2256_FMR0_RC_MASK, 0x1)
+#define PEF2256_FMR0_RC_AMI	FIELD_PREP_CONST(PEF2256_FMR0_RC_MASK, 0x2)
+#define PEF2256_FMR0_RC_HDB3	FIELD_PREP_CONST(PEF2256_FMR0_RC_MASK, 0x3)
+
+/* Framer Mode Register 1 */
+#define PEF2256_FMR1		0x1D
+#define PEF2256_FMR1_XFS	BIT(3)
+#define PEF2256_FMR1_ECM	BIT(2)
+/* SSD is defined on 2 bits. The other bit is on SIC1 register */
+#define PEF2256_FMR1_SSD_MASK	GENMASK(1, 1)
+#define PEF2256_FMR1_SSD_2048	FIELD_PREP_CONST(PEF2256_FMR1_SSD_MASK, 0x0)
+#define PEF2256_FMR1_SSD_4096	FIELD_PREP_CONST(PEF2256_FMR1_SSD_MASK, 0x1)
+#define PEF2256_FMR1_SSD_8192	FIELD_PREP_CONST(PEF2256_FMR1_SSD_MASK, 0x0)
+#define PEF2256_FMR1_SSD_16384	FIELD_PREP_CONST(PEF2256_FMR1_SSD_MASK, 0x1)
+
+/* Framer Mode Register 2 */
+#define PEF2256_FMR2			  0x1E
+#define PEF2256_FMR2_RFS_MASK		  GENMASK(7, 6)
+#define PEF2256_FMR2_RFS_DOUBLEFRAME	  FIELD_PREP_CONST(PEF2256_FMR2_RFS_MASK, 0x0)
+#define PEF2256_FMR2_RFS_CRC4_MULTIFRAME  FIELD_PREP_CONST(PEF2256_FMR2_RFS_MASK, 0x2)
+#define PEF2256_FMR2_RFS_AUTO_MULTIFRAME  FIELD_PREP_CONST(PEF2256_FMR2_RFS_MASK, 0x3)
+#define PEF2256_FMR2_AXRA		  BIT(1)
+
+/* Transmit Service Word */
+#define PEF2256_XSW		0x20
+#define PEF2256_XSW_XSIS	BIT(7)
+#define PEF2256_XSW_XTM		BIT(6)
+#define PEF2256_XSW_XY_MASK	GENMASK(5, 0)
+#define PEF2256_XSW_XY(_v)	FIELD_PREP(PEF2256_XSW_XY_MASK, _v)
+
+/* Transmit Spare Bits */
+#define PEF2256_XSP	        0x21
+#define PEF2256_XSP_XSIF	BIT(2)
+
+/* Transmit Control 0..1 */
+#define PEF2256_XC0		0x22
+#define PEF2256_XC1		0x23
+
+/* Receive Control 0 */
+#define PEF2256_RC0		0x24
+#define PEF2256_RC0_SWD		BIT(7)
+#define PEF2256_RC0_ASY4	BIT(6)
+
+/* Receive Control 1 */
+#define PEF2256_RC1		0x25
+
+/* Transmit Pulse Mask 0..1 */
+#define PEF2256_XPM0		0x26
+#define PEF2256_XPM1		0x27
+
+/* Transmit Pulse Mask 2 */
+#define PEF2256_XPM2		0x28
+#define PEF2256_XPM2_XLT	BIT(6)
+
+/* Transparent Service Word Mask */
+#define PEF2256_TSWM		0x29
+
+/* Line Interface Mode 0 */
+#define PEF2256_LIM0		0x36
+#define PEF2256_2X_LIM0_BIT3	BIT(3) /* v2.x, described as a forced '1' bit */
+#define PEF2256_LIM0_MAS	BIT(0)
+
+/* Line Interface Mode 1 */
+#define PEF2256_LIM1		  0x37
+#define PEF2256_12_LIM1_RIL_MASK  GENMASK(6, 4)
+#define PEF2256_12_LIM1_RIL_910	  FIELD_PREP_CONST(PEF2256_12_LIM1_RIL_MASK, 0x0)
+#define PEF2256_12_LIM1_RIL_740	  FIELD_PREP_CONST(PEF2256_12_LIM1_RIL_MASK, 0x1)
+#define PEF2256_12_LIM1_RIL_590	  FIELD_PREP_CONST(PEF2256_12_LIM1_RIL_MASK, 0x2)
+#define PEF2256_12_LIM1_RIL_420	  FIELD_PREP_CONST(PEF2256_12_LIM1_RIL_MASK, 0x3)
+#define PEF2256_12_LIM1_RIL_320	  FIELD_PREP_CONST(PEF2256_12_LIM1_RIL_MASK, 0x4)
+#define PEF2256_12_LIM1_RIL_210	  FIELD_PREP_CONST(PEF2256_12_LIM1_RIL_MASK, 0x5)
+#define PEF2256_12_LIM1_RIL_160	  FIELD_PREP_CONST(PEF2256_12_LIM1_RIL_MASK, 0x6)
+#define PEF2256_12_LIM1_RIL_100	  FIELD_PREP_CONST(PEF2256_12_LIM1_RIL_MASK, 0x7)
+#define PEF2256_2X_LIM1_RIL_MASK  GENMASK(6, 4)
+#define PEF2256_2X_LIM1_RIL_2250  FIELD_PREP_CONST(PEF2256_2X_LIM1_RIL_MASK, 0x0)
+#define PEF2256_2X_LIM1_RIL_1100  FIELD_PREP_CONST(PEF2256_2X_LIM1_RIL_MASK, 0x1)
+#define PEF2256_2X_LIM1_RIL_600	  FIELD_PREP_CONST(PEF2256_2X_LIM1_RIL_MASK, 0x2)
+#define PEF2256_2X_LIM1_RIL_350	  FIELD_PREP_CONST(PEF2256_2X_LIM1_RIL_MASK, 0x3)
+#define PEF2256_2X_LIM1_RIL_210	  FIELD_PREP_CONST(PEF2256_2X_LIM1_RIL_MASK, 0x4)
+#define PEF2256_2X_LIM1_RIL_140	  FIELD_PREP_CONST(PEF2256_2X_LIM1_RIL_MASK, 0x5)
+#define PEF2256_2X_LIM1_RIL_100	  FIELD_PREP_CONST(PEF2256_2X_LIM1_RIL_MASK, 0x6)
+#define PEF2256_2X_LIM1_RIL_50	  FIELD_PREP_CONST(PEF2256_2X_LIM1_RIL_MASK, 0x7)
+
+/* Pulse Count Detection */
+#define PEF2256_PCD		0x38
+
+ /* Pulse Count Recovery */
+#define PEF2256_PCR		0x39
+
+ /* Line Interface Mode 2 */
+#define PEF2256_LIM2		0x3A
+#define PEF2256_LIM2_SLT_MASK	GENMASK(5, 4)
+#define PEF2256_LIM2_SLT_THR55	FIELD_PREP_CONST(PEF2256_LIM2_SLT_MASK, 0x0)
+#define PEF2256_LIM2_SLT_THR67	FIELD_PREP_CONST(PEF2256_LIM2_SLT_MASK, 0x1)
+#define PEF2256_LIM2_SLT_THR50	FIELD_PREP_CONST(PEF2256_LIM2_SLT_MASK, 0x2)
+#define PEF2256_LIM2_SLT_THR45	FIELD_PREP_CONST(PEF2256_LIM2_SLT_MASK, 0x3)
+#define PEF2256_LIM2_ELT	BIT(2)
+
+/* System Interface Control 1 */
+#define PEF2256_SIC1	          0x3E
+#define PEF2256_SIC1_SSC_MASK	  (BIT(7) |  BIT(3))
+#define PEF2256_SIC1_SSC_2048	  (0)
+#define PEF2256_SIC1_SSC_4096	  BIT(3)
+#define PEF2256_SIC1_SSC_8192	  BIT(7)
+#define PEF2256_SIC1_SSC_16384	  (BIT(7) |  BIT(3))
+/* SSD is defined on 2 bits. The other bit is on FMR1 register */
+#define PEF2256_SIC1_SSD_MASK	  GENMASK(6, 6)
+#define PEF2256_SIC1_SSD_2048	  FIELD_PREP_CONST(PEF2256_SIC1_SSD_MASK, 0x0)
+#define PEF2256_SIC1_SSD_4096	  FIELD_PREP_CONST(PEF2256_SIC1_SSD_MASK, 0x0)
+#define PEF2256_SIC1_SSD_8192	  FIELD_PREP_CONST(PEF2256_SIC1_SSD_MASK, 0x1)
+#define PEF2256_SIC1_SSD_16384	  FIELD_PREP_CONST(PEF2256_SIC1_SSD_MASK, 0x1)
+#define PEF2256_SIC1_RBS_MASK	  GENMASK(5, 4)
+#define PEF2256_SIC1_RBS_2FRAMES  FIELD_PREP_CONST(PEF2256_SIC1_RBS_MASK, 0x0)
+#define PEF2256_SIC1_RBS_1FRAME	  FIELD_PREP_CONST(PEF2256_SIC1_RBS_MASK, 0x1)
+#define PEF2256_SIC1_RBS_96BITS	  FIELD_PREP_CONST(PEF2256_SIC1_RBS_MASK, 0x2)
+#define PEF2256_SIC1_RBS_BYPASS	  FIELD_PREP_CONST(PEF2256_SIC1_RBS_MASK, 0x3)
+#define PEF2256_SIC1_XBS_MASK	  GENMASK(1, 0)
+#define PEF2256_SIC1_XBS_BYPASS	  FIELD_PREP_CONST(PEF2256_SIC1_XBS_MASK, 0x0)
+#define PEF2256_SIC1_XBS_1FRAME	  FIELD_PREP_CONST(PEF2256_SIC1_XBS_MASK, 0x1)
+#define PEF2256_SIC1_XBS_2FRAMES  FIELD_PREP_CONST(PEF2256_SIC1_XBS_MASK, 0x2)
+#define PEF2256_SIC1_XBS_96BITS	  FIELD_PREP_CONST(PEF2256_SIC1_XBS_MASK, 0x3)
+
+/* System Interface Control 2 */
+#define PEF2256_SIC2		0x3F
+#define PEF2256_SIC2_SICS_MASK	GENMASK(3, 1)
+#define PEF2256_SIC2_SICS(_v)	FIELD_PREP(PEF2256_SIC2_SICS_MASK, _v)
+
+/* System Interface Control 3 */
+#define PEF2256_SIC3		0x40
+#define PEF2256_SIC3_RTRI	BIT(5)
+#define PEF2256_SIC3_RESX	BIT(3)
+#define PEF2256_SIC3_RESR	BIT(2)
+
+/* Clock Mode Register 1 */
+#define PEF2256_CMR1			0x44
+#define PEF2256_CMR1_RS_MASK		GENMASK(5, 4)
+#define PEF2256_CMR1_RS_DPLL		FIELD_PREP_CONST(PEF2256_CMR1_RS_MASK, 0x0)
+#define PEF2256_CMR1_RS_DPLL_LOS_HIGH	FIELD_PREP_CONST(PEF2256_CMR1_RS_MASK, 0x1)
+#define PEF2256_CMR1_RS_DCOR_2048	FIELD_PREP_CONST(PEF2256_CMR1_RS_MASK, 0x2)
+#define PEF2256_CMR1_RS_DCOR_8192	FIELD_PREP_CONST(PEF2256_CMR1_RS_MASK, 0x3)
+#define PEF2256_CMR1_DCS		BIT(3)
+
+/* Clock Mode Register 2 */
+#define PEF2256_CMR2		0x45
+#define PEF2256_CMR2_DCOXC	BIT(5)
+
+/* Global Configuration Register */
+#define PEF2256_GCR		0x46
+#define PEF2256_GCR_SCI		BIT(6)
+#define PEF2256_GCR_ECMC	BIT(4)
+
+/* Port Configuration 5 */
+#define PEF2256_PC5		0x84
+#define PEF2256_PC5_CRP         BIT(0)
+
+/* Global Port Configuration 1 */
+#define PEF2256_GPC1			0x85
+#define PEF2256_GPC1_CSFP_MASK		GENMASK(7, 5)
+#define PEF2256_GPC1_CSFP_SEC_IN_HIGH	FIELD_PREP_CONST(PEF2256_GPC1_CSFP_MASK, 0x0)
+#define PEF2256_GPC1_CSFP_SEC_OUT_HIGH	FIELD_PREP_CONST(PEF2256_GPC1_CSFP_MASK, 0x1)
+#define PEF2256_GPC1_CSFP_FSC_OUT_HIGH	FIELD_PREP_CONST(PEF2256_GPC1_CSFP_MASK, 0x2)
+#define PEF2256_GPC1_CSFP_FSC_OUT_LOW	FIELD_PREP_CONST(PEF2256_GPC1_CSFP_MASK, 0x3)
+
+/* Port Configuration 6 */
+#define PEF2256_PC6		0x86
+
+/* Global Counter Mode n=1..8 */
+#define PEF2256_GCM(_n)         (0x92 + (_n) - 1)
+#define PEF2256_GCM1	        0x92
+#define PEF2256_GCM2	        0x93
+#define PEF2256_GCM3	        0x94
+#define PEF2256_GCM4	        0x95
+#define PEF2256_GCM5	        0x96
+#define PEF2256_GCM6	        0x97
+#define PEF2256_GCM7	        0x98
+#define PEF2256_GCM8	        0x99
+
+/* Version Status Register */
+#define PEF2256_VSTR		 0x4A
+#define PEF2256_VSTR_VERSION_12	 0x00
+#define PEF2256_VSTR_VERSION_21	 0x10
+#define PEF2256_VSTR_VERSION_2x	 0x05
+
+/* Framer Receive Status 0 */
+#define PEF2256_FRS0		0x4C
+#define PEF2256_FRS0_LOS	BIT(7)
+#define PEF2256_FRS0_AIS	BIT(6)
+
+/* Interrupt Status Register 0..5 */
+#define PEF2256_ISR(_n)		(0x68 + (_n))
+#define PEF2256_ISR0		0x68
+#define PEF2256_ISR1		0x69
+#define PEF2256_ISR2		0x6A
+#define PEF2256_ISR3		0x6B
+#define PEF2256_ISR4		0x6C
+#define PEF2256_ISR5		0x6D
+
+/* Global Interrupt Status */
+#define PEF2256_GIS		0x6E
+#define PEF2256_GIS_ISR(_n)	BIT(_n)
+
+/* Wafer Identification Register */
+#define PEF2256_WID		   0xEC
+#define PEF2256_12_WID_MASK	   GENMASK(1, 0)
+#define PEF2256_12_WID_VERSION_12  FIELD_PREP_CONST(PEF2256_12_WID_MASK, 0x3)
+#define PEF2256_2X_WID_MASK	   GENMASK(7, 6)
+#define PEF2256_2X_WID_VERSION_21  FIELD_PREP_CONST(PEF2256_2X_WID_MASK, 0x0)
+#define PEF2256_2X_WID_VERSION_22  FIELD_PREP_CONST(PEF2256_2X_WID_MASK, 0x1)
+
+/* IMR2/ISR2 Interrupts common bits */
+#define PEF2256_INT2_AIS	BIT(3)
+#define PEF2256_INT2_LOS	BIT(2)
+
+#endif /* __PEF2256_REGS_H__ */
diff --git a/drivers/net/wan/framer/pef2256/pef2256.c b/drivers/net/wan/framer/pef2256/pef2256.c
new file mode 100644
index 000000000000..4f81053ee4f0
--- /dev/null
+++ b/drivers/net/wan/framer/pef2256/pef2256.c
@@ -0,0 +1,880 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * PEF2256 also known as FALC56 driver
+ *
+ * Copyright 2023 CS GROUP France
+ *
+ * Author: Herve Codina <herve.codina@bootlin.com>
+ */
+
+#include <linux/framer/pef2256.h>
+#include <linux/clk.h>
+#include <linux/framer/framer-provider.h>
+#include <linux/gpio/consumer.h>
+#include <linux/interrupt.h>
+#include <linux/io.h>
+#include <linux/mfd/core.h>
+#include <linux/module.h>
+#include <linux/notifier.h>
+#include <linux/of.h>
+#include <linux/of_platform.h>
+#include <linux/platform_device.h>
+#include <linux/regmap.h>
+#include <linux/slab.h>
+#include "pef2256-regs.h"
+
+enum pef2256_frame_type {
+	PEF2256_FRAME_E1_DOUBLEFRAME,
+	PEF2256_FRAME_E1_CRC4_MULTIFRAME,
+	PEF2256_FRAME_E1_AUTO_MULTIFRAME,
+	PEF2256_FRAME_T1J1_4FRAME,
+	PEF2256_FRAME_T1J1_12FRAME,
+	PEF2256_FRAME_T1J1_24FRAME,
+	PEF2256_FRAME_T1J1_72FRAME,
+};
+
+struct pef2256 {
+	struct device *dev;
+	struct regmap *regmap;
+	enum pef2256_version version;
+	struct clk *mclk;
+	struct clk *sclkr;
+	struct clk *sclkx;
+	struct gpio_desc *reset_gpio;
+	unsigned long sysclk_rate;
+	u32 data_rate;
+	bool is_tx_falling_edge;
+	bool is_subordinate;
+	enum pef2256_frame_type frame_type;
+	u8 channel_phase;
+	atomic_t carrier;
+	struct framer *framer;
+};
+
+static u8 pef2256_read8(struct pef2256 *pef2256, int offset)
+{
+	int val;
+
+	regmap_read(pef2256->regmap, offset, &val);
+	return val;
+}
+
+static void pef2256_write8(struct pef2256 *pef2256, int offset, u8 val)
+{
+	regmap_write(pef2256->regmap, offset, val);
+}
+
+static void pef2256_clrbits8(struct pef2256 *pef2256, int offset, u8 clr)
+{
+	regmap_clear_bits(pef2256->regmap, offset, clr);
+}
+
+static void pef2256_setbits8(struct pef2256 *pef2256, int offset, u8 set)
+{
+	regmap_set_bits(pef2256->regmap, offset, set);
+}
+
+static void pef2256_clrsetbits8(struct pef2256 *pef2256, int offset, u8 clr, u8 set)
+{
+	regmap_update_bits(pef2256->regmap, offset, clr | set, set);
+}
+
+enum pef2256_version pef2256_get_version(struct pef2256 *pef2256)
+{
+	enum pef2256_version version = PEF2256_VERSION_UNKNOWN;
+	u8 vstr, wid;
+
+	vstr = pef2256_read8(pef2256, PEF2256_VSTR);
+	wid = pef2256_read8(pef2256, PEF2256_WID);
+
+	switch (vstr) {
+	case PEF2256_VSTR_VERSION_12:
+		if ((wid & PEF2256_12_WID_MASK) == PEF2256_12_WID_VERSION_12)
+			version = PEF2256_VERSION_1_2;
+		break;
+	case PEF2256_VSTR_VERSION_2x:
+		switch (wid & PEF2256_2X_WID_MASK) {
+		case PEF2256_2X_WID_VERSION_21:
+			version = PEF2256_VERSION_2_1;
+			break;
+		case PEF2256_2X_WID_VERSION_22:
+			version = PEF2256_VERSION_2_2;
+			break;
+		}
+		break;
+	case PEF2256_VSTR_VERSION_21:
+		version = PEF2256_VERSION_2_1;
+		break;
+	}
+
+	if (version == PEF2256_VERSION_UNKNOWN)
+		dev_err(pef2256->dev, "Unknown version (0x%02x, 0x%02x)\n", vstr, wid);
+
+	return version;
+}
+EXPORT_SYMBOL_GPL(pef2256_get_version);
+
+enum pef2256_gcm_config_item {
+	PEF2256_GCM_CONFIG_1544000 = 0,
+	PEF2256_GCM_CONFIG_2048000,
+	PEF2256_GCM_CONFIG_8192000,
+	PEF2256_GCM_CONFIG_10000000,
+	PEF2256_GCM_CONFIG_12352000,
+	PEF2256_GCM_CONFIG_16384000,
+};
+
+struct pef2256_gcm_config {
+	u8 gcm_12[6];
+	u8 gcm_2x[8];
+};
+
+static const struct pef2256_gcm_config pef2256_gcm_configs[] = {
+	[PEF2256_GCM_CONFIG_1544000] = {
+		.gcm_12 = {0xF0, 0x51, 0x00, 0x80, 0x00, 0x15},
+		.gcm_2x = {0x00, 0x15, 0x00, 0x08, 0x00, 0x3F, 0x9C, 0xDF},
+	},
+	[PEF2256_GCM_CONFIG_2048000] = {
+		.gcm_12 = {0x00, 0x58, 0xD2, 0xC2, 0x00, 0x10},
+		.gcm_2x = {0x00, 0x18, 0xFB, 0x0B, 0x00, 0x2F, 0xDB, 0xDF},
+	},
+	[PEF2256_GCM_CONFIG_8192000] = {
+		.gcm_12 = {0x00, 0x58, 0xD2, 0xC2, 0x03, 0x10},
+		.gcm_2x = {0x00, 0x18, 0xFB, 0x0B, 0x00, 0x0B, 0xDB, 0xDF},
+	},
+	[PEF2256_GCM_CONFIG_10000000] = {
+		.gcm_12 = {0x90, 0x51, 0x81, 0x8F, 0x04, 0x10},
+		.gcm_2x = {0x40, 0x1B, 0x3D, 0x0A, 0x00, 0x07, 0xC9, 0xDC},
+	},
+	[PEF2256_GCM_CONFIG_12352000] = {
+		.gcm_12 = {0xF0, 0x51, 0x00, 0x80, 0x07, 0x15},
+		.gcm_2x = {0x00, 0x19, 0x00, 0x08, 0x01, 0x0A, 0x98, 0xDA},
+	},
+	[PEF2256_GCM_CONFIG_16384000] = {
+		.gcm_12 = {0x00, 0x58, 0xD2, 0xC2, 0x07, 0x10},
+		.gcm_2x = {0x00, 0x18, 0xFB, 0x0B, 0x01, 0x0B, 0xDB, 0xDF},
+	},
+};
+
+static int pef2256_setup_gcm(struct pef2256 *pef2256)
+{
+	enum pef2256_gcm_config_item item;
+	unsigned long mclk_rate;
+	const u8 *gcm;
+	int i, count;
+
+	mclk_rate = clk_get_rate(pef2256->mclk);
+	switch (mclk_rate) {
+	case 1544000:
+		item = PEF2256_GCM_CONFIG_1544000;
+		break;
+	case 2048000:
+		item = PEF2256_GCM_CONFIG_2048000;
+		break;
+	case 8192000:
+		item = PEF2256_GCM_CONFIG_8192000;
+		break;
+	case 10000000:
+		item = PEF2256_GCM_CONFIG_10000000;
+		break;
+	case 12352000:
+		item = PEF2256_GCM_CONFIG_12352000;
+		break;
+	case 16384000:
+		item = PEF2256_GCM_CONFIG_16384000;
+		break;
+	default:
+		dev_err(pef2256->dev, "Unsupported v2.x MCLK rate %lu\n", mclk_rate);
+		return -EINVAL;
+	}
+
+	BUILD_BUG_ON(item >= ARRAY_SIZE(pef2256_gcm_configs));
+
+	if (pef2256->version == PEF2256_VERSION_1_2) {
+		gcm = pef2256_gcm_configs[item].gcm_12;
+		count = ARRAY_SIZE(pef2256_gcm_configs[item].gcm_12);
+	} else {
+		gcm = pef2256_gcm_configs[item].gcm_2x;
+		count = ARRAY_SIZE(pef2256_gcm_configs[item].gcm_2x);
+	}
+
+	for (i = 0; i < count; i++)
+		pef2256_write8(pef2256, PEF2256_GCM(i + 1), *(gcm + i));
+
+	return 0;
+}
+
+static int pef2256_setup_e1_line(struct pef2256 *pef2256)
+{
+	u8 fmr1, fmr2;
+
+	/* RCLK output : DPLL clock, DCO-X enabled, DCO-X internal ref clock */
+	pef2256_write8(pef2256, PEF2256_CMR1, 0x00);
+
+	/* SCLKR selected, SCLKX selected,
+	 * receive synchro pulse sourced by SYPR,
+	 * transmit synchro pulse sourced by SYPX,
+	 * DCO-X center frequency enabled
+	 */
+	pef2256_write8(pef2256, PEF2256_CMR2, PEF2256_CMR2_DCOXC);
+
+	if (pef2256->is_subordinate) {
+		/* select RCLK source = 2M,  disable switching from RCLK to SYNC */
+		pef2256_clrsetbits8(pef2256, PEF2256_CMR1, PEF2256_CMR1_RS_MASK,
+				    PEF2256_CMR1_RS_DCOR_2048 | PEF2256_CMR1_DCS);
+	}
+
+	/* slave mode, local loop off, mode short-haul
+	 * In v2.x, bit3 is a forced 1 bit in the datasheet -> Need to be set.
+	 */
+	if (pef2256->version == PEF2256_VERSION_1_2)
+		pef2256_write8(pef2256, PEF2256_LIM0, 0x00);
+	else
+		pef2256_write8(pef2256, PEF2256_LIM0, PEF2256_2X_LIM0_BIT3);
+
+	/* "master" mode */
+	if (!pef2256->is_subordinate)
+		pef2256_setbits8(pef2256, PEF2256_LIM0, PEF2256_LIM0_MAS);
+
+	/* analog interface selected, remote loop off */
+	pef2256_write8(pef2256, PEF2256_LIM1, 0x00);
+
+	/* receive input threshold = 0,21V */
+	if (pef2256->version == PEF2256_VERSION_1_2)
+		pef2256_clrsetbits8(pef2256, PEF2256_LIM1, PEF2256_12_LIM1_RIL_MASK,
+				    PEF2256_12_LIM1_RIL_210);
+	else
+		pef2256_clrsetbits8(pef2256, PEF2256_LIM1, PEF2256_2X_LIM1_RIL_MASK,
+				    PEF2256_2X_LIM1_RIL_210);
+
+	/* transmit pulse mask, default value from datasheet
+	 * transmit line in normal operation
+	 */
+	if (pef2256->version == PEF2256_VERSION_1_2)
+		pef2256_write8(pef2256, PEF2256_XPM0, 0x7B);
+	else
+		pef2256_write8(pef2256, PEF2256_XPM0, 0x9C);
+	pef2256_write8(pef2256, PEF2256_XPM1, 0x03);
+	pef2256_write8(pef2256, PEF2256_XPM2, 0x00);
+
+	/* HDB3 coding, no alarm simulation */
+	pef2256_write8(pef2256, PEF2256_FMR0, PEF2256_FMR0_XC_HDB3 | PEF2256_FMR0_RC_HDB3);
+
+	/* E1, frame format, 2 Mbit/s system data rate, no AIS
+	 * transmission to remote end or system interface, payload loop
+	 * off, transmit remote alarm on
+	 */
+	fmr1 = 0x00;
+	fmr2 = PEF2256_FMR2_AXRA;
+	switch (pef2256->frame_type) {
+	case PEF2256_FRAME_E1_DOUBLEFRAME:
+		fmr2 |= PEF2256_FMR2_RFS_DOUBLEFRAME;
+		break;
+	case PEF2256_FRAME_E1_CRC4_MULTIFRAME:
+		fmr1 |= PEF2256_FMR1_XFS;
+		fmr2 |= PEF2256_FMR2_RFS_CRC4_MULTIFRAME;
+		break;
+	case PEF2256_FRAME_E1_AUTO_MULTIFRAME:
+		fmr1 |= PEF2256_FMR1_XFS;
+		fmr2 |= PEF2256_FMR2_RFS_AUTO_MULTIFRAME;
+		break;
+	default:
+		dev_err(pef2256->dev, "Unsupported frame type %d\n", pef2256->frame_type);
+		return -EINVAL;
+	}
+	pef2256_clrsetbits8(pef2256, PEF2256_FMR1, PEF2256_FMR1_XFS, fmr1);
+	pef2256_write8(pef2256, PEF2256_FMR2, fmr2);
+
+	if (!pef2256->is_subordinate) {
+		/* SEC input, active high */
+		pef2256_write8(pef2256, PEF2256_GPC1, PEF2256_GPC1_CSFP_SEC_IN_HIGH);
+	} else {
+		/* FSC output, active high */
+		pef2256_write8(pef2256, PEF2256_GPC1, PEF2256_GPC1_CSFP_FSC_OUT_HIGH);
+	}
+
+	/* SCLKR, SCLKX, RCLK configured to inputs,
+	 * XFMS active low, CLK1 and CLK2 pin configuration
+	 */
+	pef2256_write8(pef2256, PEF2256_PC5, 0x00);
+	pef2256_write8(pef2256, PEF2256_PC6, 0x00);
+
+	/* port RCLK is output */
+	pef2256_setbits8(pef2256, PEF2256_PC5, PEF2256_PC5_CRP);
+
+	return 0;
+}
+
+static void pef2256_setup_e1_los(struct pef2256 *pef2256)
+{
+	/* detection of LOS alarm = 176 pulses (ie (10 + 1) * 16) */
+	pef2256_write8(pef2256, PEF2256_PCD, 10);
+	/* recovery of LOS alarm = 22 pulses (ie 21 + 1) */
+	pef2256_write8(pef2256, PEF2256_PCR, 21);
+	/* E1 default for the receive slicer threshold */
+	pef2256_write8(pef2256, PEF2256_LIM2, PEF2256_LIM2_SLT_THR50);
+	if (pef2256->is_subordinate) {
+		/* Loop-timed */
+		pef2256_setbits8(pef2256, PEF2256_LIM2, PEF2256_LIM2_ELT);
+	}
+}
+
+static int pef2256_setup_e1_system(struct pef2256 *pef2256)
+{
+	u8 sic1, fmr1;
+
+	/* 2.048 MHz system clocking rate, receive buffer 2 frames, transmit
+	 * buffer bypass, data sampled and transmitted on the falling edge of
+	 * SCLKR/X, automatic freeze signaling, data is active in the first
+	 * channel phase
+	 */
+	pef2256_write8(pef2256, PEF2256_SIC1, 0x00);
+	pef2256_write8(pef2256, PEF2256_SIC2, 0x00);
+	pef2256_write8(pef2256, PEF2256_SIC3, 0x00);
+
+	if (pef2256->is_subordinate) {
+		/* transmit buffer size = 2 frames, transparent mode */
+		pef2256_clrsetbits8(pef2256, PEF2256_SIC1, PEF2256_SIC1_XBS_MASK,
+				    PEF2256_SIC1_XBS_2FRAMES);
+	}
+
+	if (pef2256->version != PEF2256_VERSION_1_2) {
+		/* during inactive channel phase switch RDO/RSIG into tri-state */
+		pef2256_setbits8(pef2256, PEF2256_SIC3, PEF2256_SIC3_RTRI);
+	}
+
+	if (pef2256->is_tx_falling_edge) {
+		/* falling edge sync pulse transmit, rising edge sync pulse receive */
+		pef2256_clrsetbits8(pef2256, PEF2256_SIC3, PEF2256_SIC3_RESX, PEF2256_SIC3_RESR);
+	} else {
+		/* rising edge sync pulse transmit, falling edge sync pulse receive */
+		pef2256_clrsetbits8(pef2256, PEF2256_SIC3, PEF2256_SIC3_RESR, PEF2256_SIC3_RESX);
+	}
+
+	/* transmit offset counter (XCO10..0) = 4 */
+	pef2256_write8(pef2256, PEF2256_XC0, 0);
+	pef2256_write8(pef2256, PEF2256_XC1, 4);
+	/* receive offset counter (RCO10..0) = 4 */
+	pef2256_write8(pef2256, PEF2256_RC0, 0);
+	pef2256_write8(pef2256, PEF2256_RC1, 4);
+
+	/* system clock rate */
+	switch (pef2256->sysclk_rate) {
+	case 2048000:
+		sic1 = PEF2256_SIC1_SSC_2048;
+		break;
+	case 4096000:
+		sic1 = PEF2256_SIC1_SSC_4096;
+		break;
+	case 8192000:
+		sic1 = PEF2256_SIC1_SSC_8192;
+		break;
+	case 16384000:
+		sic1 = PEF2256_SIC1_SSC_16384;
+		break;
+	default:
+		dev_err(pef2256->dev, "Unsupported sysclk rate %lu\n", pef2256->sysclk_rate);
+		return -EINVAL;
+	}
+	pef2256_clrsetbits8(pef2256, PEF2256_SIC1, PEF2256_SIC1_SSC_MASK, sic1);
+
+	/* data clock rate */
+	switch (pef2256->data_rate) {
+	case 2048000:
+		fmr1 = PEF2256_FMR1_SSD_2048;
+		sic1 = PEF2256_SIC1_SSD_2048;
+		break;
+	case 4096000:
+		fmr1 = PEF2256_FMR1_SSD_4096;
+		sic1 = PEF2256_SIC1_SSD_4096;
+		break;
+	case 8192000:
+		fmr1 = PEF2256_FMR1_SSD_8192;
+		sic1 = PEF2256_SIC1_SSD_8192;
+		break;
+	case 16384000:
+		fmr1 = PEF2256_FMR1_SSD_16384;
+		sic1 = PEF2256_SIC1_SSD_16384;
+		break;
+	default:
+		dev_err(pef2256->dev, "Unsupported data rate %u\n", pef2256->data_rate);
+		return -EINVAL;
+	}
+	pef2256_clrsetbits8(pef2256, PEF2256_FMR1, PEF2256_FMR1_SSD_MASK, fmr1);
+	pef2256_clrsetbits8(pef2256, PEF2256_SIC1, PEF2256_SIC1_SSD_MASK, sic1);
+
+	/* channel phase */
+	pef2256_clrsetbits8(pef2256, PEF2256_SIC2, PEF2256_SIC2_SICS_MASK,
+			    PEF2256_SIC2_SICS(pef2256->channel_phase));
+
+	return 0;
+}
+
+static void pef2256_setup_e1_signaling(struct pef2256 *pef2256)
+{
+	/* All bits of the transmitted service word are cleared */
+	pef2256_write8(pef2256, PEF2256_XSW, PEF2256_XSW_XY(0x1F));
+
+	/* CAS disabled and clear spare bit values */
+	pef2256_write8(pef2256, PEF2256_XSP, 0x00);
+
+	if (pef2256->is_subordinate) {
+		/* transparent mode */
+		pef2256_setbits8(pef2256, PEF2256_XSW, PEF2256_XSW_XTM);
+	}
+
+	/* Si-Bit, Spare bit For International, FAS word */
+	pef2256_setbits8(pef2256, PEF2256_XSW, PEF2256_XSW_XSIS);
+	pef2256_setbits8(pef2256, PEF2256_XSP, PEF2256_XSP_XSIF);
+
+	/* no transparent mode active */
+	pef2256_write8(pef2256, PEF2256_TSWM, 0x00);
+}
+
+static void pef2256_setup_e1_errors(struct pef2256 *pef2256)
+{
+	/* error counter latched every 1s */
+	pef2256_setbits8(pef2256, PEF2256_FMR1, PEF2256_FMR1_ECM);
+
+	/* error counter mode COFA */
+	pef2256_setbits8(pef2256, PEF2256_GCR, PEF2256_GCR_ECMC);
+
+	/* errors in service words have no influence */
+	pef2256_setbits8(pef2256, PEF2256_RC0, PEF2256_RC0_SWD);
+
+	/* 4 consecutive incorrect FAS causes loss of sync */
+	pef2256_setbits8(pef2256, PEF2256_RC0, PEF2256_RC0_ASY4);
+}
+
+static int pef2256_setup_e1(struct pef2256 *pef2256)
+{
+	int ret;
+
+	/* Setup, Master clocking mode (GCM8..1) */
+	ret = pef2256_setup_gcm(pef2256);
+	if (ret)
+		return ret;
+
+	/* Select E1 mode */
+	pef2256_write8(pef2256, PEF2256_FMR1, 0x00);
+
+	/* internal second timer, power on */
+	pef2256_write8(pef2256, PEF2256_GCR, 0x00);
+
+	/* Setup line interface */
+	ret = pef2256_setup_e1_line(pef2256);
+	if (ret)
+		return ret;
+
+	/* Setup Loss-of-signal detection and recovery */
+	pef2256_setup_e1_los(pef2256);
+
+	/* Setup system interface */
+	ret = pef2256_setup_e1_system(pef2256);
+	if (ret)
+		return ret;
+
+	/* Setup signaling */
+	pef2256_setup_e1_signaling(pef2256);
+
+	/* Setup errors counters and condition */
+	pef2256_setup_e1_errors(pef2256);
+
+	/* status changed interrupt at both up and down */
+	pef2256_setbits8(pef2256, PEF2256_GCR, PEF2256_GCR_SCI);
+
+	/* Clear any ISR2 pending interrupts and unmask needed interrupts */
+	pef2256_read8(pef2256, PEF2256_ISR2);
+	pef2256_clrbits8(pef2256, PEF2256_IMR2, PEF2256_INT2_LOS | PEF2256_INT2_AIS);
+
+	/* reset lines */
+	pef2256_write8(pef2256, PEF2256_CMDR, PEF2256_CMDR_RRES | PEF2256_CMDR_XRES);
+	return 0;
+}
+
+static void pef2256_isr_default_handler(struct pef2256 *pef2256, u8 nbr, u8 isr)
+{
+	dev_warn_ratelimited(pef2256->dev, "ISR%u: 0x%02x not handled\n", nbr, isr);
+}
+
+static bool pef2256_is_carrier_on(struct pef2256 *pef2256)
+{
+	u8 frs0;
+
+	frs0 = pef2256_read8(pef2256, PEF2256_FRS0);
+	return !(frs0 & (PEF2256_FRS0_LOS | PEF2256_FRS0_AIS));
+}
+
+static void pef2256_isr2_handler(struct pef2256 *pef2256, u8 nbr, u8 isr)
+{
+	bool carrier;
+
+	if (isr & (PEF2256_INT2_LOS | PEF2256_INT2_AIS)) {
+		carrier = pef2256_is_carrier_on(pef2256);
+		if (atomic_xchg(&pef2256->carrier, carrier) != carrier)
+			framer_notify_status_change(pef2256->framer);
+	}
+}
+
+static irqreturn_t pef2256_irq_handler(int irq, void *priv)
+{
+	static void (*pef2256_isr_handler[])(struct pef2256 *, u8, u8) = {
+		[0] = pef2256_isr_default_handler,
+		[1] = pef2256_isr_default_handler,
+		[2] = pef2256_isr2_handler,
+		[3] = pef2256_isr_default_handler,
+		[4] = pef2256_isr_default_handler,
+		[5] = pef2256_isr_default_handler
+	};
+	struct pef2256 *pef2256 = (struct pef2256 *)priv;
+	u8 gis;
+	u8 isr;
+	u8 n;
+
+	gis = pef2256_read8(pef2256, PEF2256_GIS);
+
+	for (n = 0; n < ARRAY_SIZE(pef2256_isr_handler); n++) {
+		if (gis & PEF2256_GIS_ISR(n)) {
+			isr = pef2256_read8(pef2256, PEF2256_ISR(n));
+			pef2256_isr_handler[n](pef2256, n, isr);
+		}
+	}
+
+	return IRQ_HANDLED;
+}
+
+static int pef2256_check_rates(struct pef2256 *pef2256, unsigned long sysclk_rate,
+			       unsigned long data_rate)
+{
+	unsigned long rate;
+
+	switch (sysclk_rate) {
+	case 2048000:
+	case 4096000:
+	case 8192000:
+	case 16384000:
+		break;
+	default:
+		dev_err(pef2256->dev, "Unsupported system clock rate %lu\n", sysclk_rate);
+		return -EINVAL;
+	}
+
+	for (rate = data_rate; rate <= data_rate * 4; rate *= 2) {
+		if (rate == sysclk_rate)
+			return 0;
+	}
+	dev_err(pef2256->dev, "Unsupported data rate %lu with system clock rate %lu\n",
+		data_rate, sysclk_rate);
+	return -EINVAL;
+}
+
+static int pef2556_of_parse(struct pef2256 *pef2256, struct device_node *np)
+{
+	int ret;
+
+	pef2256->data_rate = 2048000;
+	ret = of_property_read_u32(np, "lantiq,data-rate-bps", &pef2256->data_rate);
+	if (ret && ret != -EINVAL) {
+		dev_err(pef2256->dev, "%pOF: failed to read lantiq,data-rate-bps\n", np);
+		return ret;
+	}
+
+	ret =  pef2256_check_rates(pef2256, pef2256->sysclk_rate, pef2256->data_rate);
+	if (ret)
+		return ret;
+
+	pef2256->is_tx_falling_edge = of_property_read_bool(np, "lantiq,clock-falling-edge");
+
+	pef2256->channel_phase = 0;
+	ret = of_property_read_u8(np, "lantiq,channel-phase", &pef2256->channel_phase);
+	if (ret && ret != -EINVAL) {
+		dev_err(pef2256->dev, "%pOF: failed to read lantiq,channel-phase\n",
+			np);
+		return ret;
+	}
+	if (pef2256->channel_phase >= pef2256->sysclk_rate / pef2256->data_rate) {
+		dev_err(pef2256->dev, "%pOF: Invalid lantiq,channel-phase %u\n",
+			np, pef2256->channel_phase);
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+static const struct regmap_config pef2256_regmap_config = {
+	.reg_bits = 32,
+	.val_bits = 8,
+	.max_register = 0xff,
+};
+
+static const struct mfd_cell pef2256_devs[] = {
+	{ .name = "lantiq-pef2256-pinctrl", },
+};
+
+static int pef2256_add_audio_devices(struct pef2256 *pef2256)
+{
+	const char *compatible = "lantiq,pef2256-codec";
+	struct mfd_cell *audio_devs;
+	struct device_node *np;
+	unsigned int count = 0;
+	unsigned int i;
+	int ret;
+
+	for_each_available_child_of_node(pef2256->dev->of_node, np) {
+		if (of_device_is_compatible(np, compatible))
+			count++;
+	}
+
+	if (!count)
+		return 0;
+
+	audio_devs = kcalloc(count, sizeof(*audio_devs), GFP_KERNEL);
+	if (!audio_devs)
+		return -ENOMEM;
+
+	for (i = 0; i < count; i++) {
+		audio_devs[i].name = "framer-codec";
+		audio_devs[i].of_compatible = compatible;
+		audio_devs[i].id = i;
+	}
+
+	ret = mfd_add_devices(pef2256->dev, 0, audio_devs, count, NULL, 0, NULL);
+	kfree(audio_devs);
+	return ret;
+}
+
+static int pef2256_framer_get_status(struct framer *framer, struct framer_status *status)
+{
+	struct pef2256 *pef2256 = framer_get_drvdata(framer);
+
+	status->link_is_on = !!atomic_read(&pef2256->carrier);
+	return 0;
+}
+
+static int pef2256_framer_set_config(struct framer *framer, const struct framer_config *config)
+{
+	struct pef2256 *pef2256 = framer_get_drvdata(framer);
+
+	if (config->iface != FRAMER_IFACE_E1) {
+		dev_err(pef2256->dev, "Only E1 line is currently supported\n");
+		return -EOPNOTSUPP;
+	}
+
+	switch (config->clock_type) {
+	case FRAMER_CLOCK_EXT:
+		pef2256->is_subordinate = true;
+		break;
+	case FRAMER_CLOCK_INT:
+		pef2256->is_subordinate = false;
+		break;
+	default:
+		return -EINVAL;
+	}
+
+	/* Apply the new settings */
+	return pef2256_setup_e1(pef2256);
+}
+
+static int pef2256_framer_get_config(struct framer *framer, struct framer_config *config)
+{
+	struct pef2256 *pef2256 = framer_get_drvdata(framer);
+
+	config->iface = FRAMER_IFACE_E1;
+	config->clock_type = pef2256->is_subordinate ? FRAMER_CLOCK_EXT : FRAMER_CLOCK_INT;
+	config->line_clock_rate = 2048000;
+	return 0;
+}
+
+static const struct framer_ops pef2256_framer_ops = {
+	.owner = THIS_MODULE,
+	.get_status = pef2256_framer_get_status,
+	.get_config = pef2256_framer_get_config,
+	.set_config = pef2256_framer_set_config,
+};
+
+static int pef2256_probe(struct platform_device *pdev)
+{
+	struct device_node *np = pdev->dev.of_node;
+	unsigned long sclkr_rate, sclkx_rate;
+	struct framer_provider *framer_provider;
+	struct pef2256 *pef2256;
+	const char *version_txt;
+	void __iomem *iomem;
+	int ret;
+	int irq;
+
+	pef2256 = devm_kzalloc(&pdev->dev, sizeof(*pef2256), GFP_KERNEL);
+	if (!pef2256)
+		return -ENOMEM;
+
+	pef2256->dev = &pdev->dev;
+	atomic_set(&pef2256->carrier, 0);
+
+	pef2256->is_subordinate = true;
+	pef2256->frame_type = PEF2256_FRAME_E1_DOUBLEFRAME;
+
+	iomem = devm_platform_ioremap_resource(pdev, 0);
+	if (IS_ERR(iomem))
+		return PTR_ERR(iomem);
+
+	pef2256->regmap = devm_regmap_init_mmio(&pdev->dev, iomem,
+						&pef2256_regmap_config);
+	if (IS_ERR(pef2256->regmap)) {
+		dev_err(&pdev->dev, "Failed to initialise Regmap (%ld)\n",
+			PTR_ERR(pef2256->regmap));
+		return PTR_ERR(pef2256->regmap);
+	}
+
+	pef2256->mclk = devm_clk_get_enabled(&pdev->dev, "mclk");
+	if (IS_ERR(pef2256->mclk))
+		return PTR_ERR(pef2256->mclk);
+
+	pef2256->sclkr = devm_clk_get_enabled(&pdev->dev, "sclkr");
+	if (IS_ERR(pef2256->sclkr))
+		return PTR_ERR(pef2256->sclkr);
+
+	pef2256->sclkx = devm_clk_get_enabled(&pdev->dev, "sclkx");
+	if (IS_ERR(pef2256->sclkx))
+		return PTR_ERR(pef2256->sclkx);
+
+	/* Both SCLKR (receive) and SCLKX (transmit) must have the same rate,
+	 * stored as sysclk_rate.
+	 * The exact value will be checked at pef2256_check_rates()
+	 */
+	sclkr_rate = clk_get_rate(pef2256->sclkr);
+	sclkx_rate = clk_get_rate(pef2256->sclkx);
+	if (sclkr_rate != sclkx_rate) {
+		dev_err(pef2256->dev, "clk rate mismatch. sclkr %lu Hz, sclkx %lu Hz\n",
+			sclkr_rate, sclkx_rate);
+		return -EINVAL;
+	}
+	pef2256->sysclk_rate = sclkr_rate;
+
+	/* Reset the component. The MCLK clock must be active during reset */
+	pef2256->reset_gpio = devm_gpiod_get_optional(&pdev->dev, "reset", GPIOD_OUT_LOW);
+	if (IS_ERR(pef2256->reset_gpio))
+		return PTR_ERR(pef2256->reset_gpio);
+	if (pef2256->reset_gpio) {
+		gpiod_set_value_cansleep(pef2256->reset_gpio, 1);
+		usleep_range(10, 20);
+		gpiod_set_value_cansleep(pef2256->reset_gpio, 0);
+		usleep_range(10, 20);
+	}
+
+	pef2256->version = pef2256_get_version(pef2256);
+	switch (pef2256->version) {
+	case PEF2256_VERSION_1_2:
+		version_txt = "1.2";
+		break;
+	case PEF2256_VERSION_2_1:
+		version_txt = "2.1";
+		break;
+	case PEF2256_VERSION_2_2:
+		version_txt = "2.2";
+		break;
+	default:
+		return -ENODEV;
+	}
+	dev_info(pef2256->dev, "Version %s detected\n", version_txt);
+
+	ret = pef2556_of_parse(pef2256, np);
+	if (ret)
+		return ret;
+
+	/* Create the framer. It can be used on interrupts */
+	pef2256->framer = devm_framer_create(pef2256->dev, NULL, &pef2256_framer_ops);
+	if (IS_ERR(pef2256->framer))
+		return PTR_ERR(pef2256->framer);
+
+	framer_set_drvdata(pef2256->framer, pef2256);
+
+	/* Disable interrupts */
+	pef2256_write8(pef2256, PEF2256_IMR0, 0xff);
+	pef2256_write8(pef2256, PEF2256_IMR1, 0xff);
+	pef2256_write8(pef2256, PEF2256_IMR2, 0xff);
+	pef2256_write8(pef2256, PEF2256_IMR3, 0xff);
+	pef2256_write8(pef2256, PEF2256_IMR4, 0xff);
+	pef2256_write8(pef2256, PEF2256_IMR5, 0xff);
+
+	/* Clear any pending interrupts */
+	pef2256_read8(pef2256, PEF2256_ISR0);
+	pef2256_read8(pef2256, PEF2256_ISR1);
+	pef2256_read8(pef2256, PEF2256_ISR2);
+	pef2256_read8(pef2256, PEF2256_ISR3);
+	pef2256_read8(pef2256, PEF2256_ISR4);
+	pef2256_read8(pef2256, PEF2256_ISR5);
+
+	irq = platform_get_irq(pdev, 0);
+	if (irq < 0)
+		return irq;
+	ret = devm_request_irq(pef2256->dev, irq, pef2256_irq_handler, 0, "pef2256", pef2256);
+	if (ret < 0)
+		return ret;
+
+	platform_set_drvdata(pdev, pef2256);
+
+	ret = mfd_add_devices(pef2256->dev, 0, pef2256_devs,
+			      ARRAY_SIZE(pef2256_devs), NULL, 0, NULL);
+	if (ret) {
+		dev_err(pef2256->dev, "add devices failed (%d)\n", ret);
+		return ret;
+	}
+
+	ret = pef2256_setup_e1(pef2256);
+	if (ret)
+		return ret;
+
+	framer_provider = devm_framer_provider_of_register(pef2256->dev,
+							   framer_provider_simple_of_xlate);
+	if (IS_ERR(framer_provider))
+		return PTR_ERR(framer_provider);
+
+	/* Add audio devices */
+	ret = pef2256_add_audio_devices(pef2256);
+	if (ret < 0) {
+		dev_err(pef2256->dev, "add audio devices failed (%d)\n", ret);
+		return ret;
+	}
+
+	return 0;
+}
+
+static int pef2256_remove(struct platform_device *pdev)
+{
+	struct pef2256 *pef2256 = platform_get_drvdata(pdev);
+
+	/* Disable interrupts */
+	pef2256_write8(pef2256, PEF2256_IMR0, 0xff);
+	pef2256_write8(pef2256, PEF2256_IMR1, 0xff);
+	pef2256_write8(pef2256, PEF2256_IMR2, 0xff);
+	pef2256_write8(pef2256, PEF2256_IMR3, 0xff);
+	pef2256_write8(pef2256, PEF2256_IMR4, 0xff);
+	pef2256_write8(pef2256, PEF2256_IMR5, 0xff);
+
+	return 0;
+}
+
+static const struct of_device_id pef2256_id_table[] = {
+	{ .compatible = "lantiq,pef2256" },
+	{} /* sentinel */
+};
+MODULE_DEVICE_TABLE(of, pef2256_id_table);
+
+static struct platform_driver pef2256_driver = {
+	.driver = {
+		.name = "lantiq-pef2256",
+		.of_match_table = pef2256_id_table,
+	},
+	.probe = pef2256_probe,
+	.remove = pef2256_remove,
+};
+module_platform_driver(pef2256_driver);
+
+struct regmap *pef2256_get_regmap(struct pef2256 *pef2256)
+{
+	return pef2256->regmap;
+}
+EXPORT_SYMBOL_GPL(pef2256_get_regmap);
+
+MODULE_AUTHOR("Herve Codina <herve.codina@bootlin.com>");
+MODULE_DESCRIPTION("PEF2256 driver");
+MODULE_LICENSE("GPL");
diff --git a/include/linux/framer/pef2256.h b/include/linux/framer/pef2256.h
new file mode 100644
index 000000000000..71d80af58c40
--- /dev/null
+++ b/include/linux/framer/pef2256.h
@@ -0,0 +1,31 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * PEF2256 consumer API
+ *
+ * Copyright 2023 CS GROUP France
+ *
+ * Author: Herve Codina <herve.codina@bootlin.com>
+ */
+#ifndef __PEF2256_H__
+#define __PEF2256_H__
+
+#include <linux/types.h>
+
+struct pef2256;
+struct regmap;
+
+/* Retrieve the PEF2256 regmap */
+struct regmap *pef2256_get_regmap(struct pef2256 *pef2256);
+
+/* PEF2256 hardware versions */
+enum pef2256_version {
+	PEF2256_VERSION_UNKNOWN,
+	PEF2256_VERSION_1_2,
+	PEF2256_VERSION_2_1,
+	PEF2256_VERSION_2_2,
+};
+
+/* Get the PEF2256 hardware version */
+enum pef2256_version pef2256_get_version(struct pef2256 *pef2256);
+
+#endif /* __PEF2256_H__ */
-- 
cgit v1.2.3


From d3bb89ea9c13e5a98d2b7a0ba8e50a77893132cb Mon Sep 17 00:00:00 2001
From: Kefeng Wang <wangkefeng.wang@huawei.com>
Date: Thu, 7 Dec 2023 23:25:25 +0800
Subject: mm: fix VMA heap bounds checking

After converting selinux to VMA heap check helper, the gcl triggers an
execheap SELinux denial, which is caused by a changed logic check.

Previously selinux only checked that the VMA range was within the VMA heap
range, and the implementation checks the intersection between the two
ranges, but the corner case (vm_end=start_brk, brk=vm_start) isn't handled
correctly.

Since commit 11250fd12eb8 ("mm: factor out VMA stack and heap checks") was
only a function extraction, it seems that the issue was introduced by
commit 0db0c01b53a1 ("procfs: fix /proc/<pid>/maps heap check").  Let's
fix above corner cases, meanwhile, correct the wrong indentation of the
stack and heap check helpers.

Fixes: 11250fd12eb8 ("mm: factor out VMA stack and heap checks")
Signed-off-by: Kefeng Wang <wangkefeng.wang@huawei.com>
Reported-by: Ondrej Mosnacek <omosnace@redhat.com>
Closes: https://lore.kernel.org/selinux/CAFqZXNv0SVT0fkOK6neP9AXbj3nxJ61JAY4+zJzvxqJaeuhbFw@mail.gmail.com/
Tested-by: Ondrej Mosnacek <omosnace@redhat.com>
Link: https://lkml.kernel.org/r/20231207152525.2607420-1-wangkefeng.wang@huawei.com
Cc: David Hildenbrand <david@redhat.com>
Cc: Paul Moore <paul@paul-moore.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Stephen Smalley <stephen.smalley.work@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/mm.h | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 418d26608ece..da5219b48d52 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -886,8 +886,8 @@ static inline bool vma_is_anonymous(struct vm_area_struct *vma)
  */
 static inline bool vma_is_initial_heap(const struct vm_area_struct *vma)
 {
-       return vma->vm_start <= vma->vm_mm->brk &&
-		vma->vm_end >= vma->vm_mm->start_brk;
+	return vma->vm_start < vma->vm_mm->brk &&
+		vma->vm_end > vma->vm_mm->start_brk;
 }
 
 /*
@@ -901,8 +901,8 @@ static inline bool vma_is_initial_stack(const struct vm_area_struct *vma)
 	 * its "stack".  It's not even well-defined for programs written
 	 * languages like Go.
 	 */
-       return vma->vm_start <= vma->vm_mm->start_stack &&
-	       vma->vm_end >= vma->vm_mm->start_stack;
+	return vma->vm_start <= vma->vm_mm->start_stack &&
+		vma->vm_end >= vma->vm_mm->start_stack;
 }
 
 static inline bool vma_is_temporary_stack(struct vm_area_struct *vma)
-- 
cgit v1.2.3


From 6376a824595607e99d032a39ba3394988b4fce96 Mon Sep 17 00:00:00 2001
From: SeongJae Park <sj@kernel.org>
Date: Fri, 8 Dec 2023 17:50:18 +0000
Subject: mm/damon/core: make damon_start() waits until kdamond_fn() starts

The cleanup tasks of kdamond threads including reset of corresponding
DAMON context's ->kdamond field and decrease of global nr_running_ctxs
counter is supposed to be executed by kdamond_fn().  However, commit
0f91d13366a4 ("mm/damon: simplify stop mechanism") made neither
damon_start() nor damon_stop() ensure the corresponding kdamond has
started the execution of kdamond_fn().

As a result, the cleanup can be skipped if damon_stop() is called fast
enough after the previous damon_start().  Especially the skipped reset
of ->kdamond could cause a use-after-free.

Fix it by waiting for start of kdamond_fn() execution from
damon_start().

Link: https://lkml.kernel.org/r/20231208175018.63880-1-sj@kernel.org
Fixes: 0f91d13366a4 ("mm/damon: simplify stop mechanism")
Signed-off-by: SeongJae Park <sj@kernel.org>
Reported-by: Jakub Acs <acsjakub@amazon.de>
Cc: Changbin Du <changbin.du@intel.com>
Cc: Jakub Acs <acsjakub@amazon.de>
Cc: <stable@vger.kernel.org> # 5.15.x
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/damon.h | 2 ++
 mm/damon/core.c       | 6 ++++++
 2 files changed, 8 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/damon.h b/include/linux/damon.h
index ab2f17d9926b..e00ddf1ed39c 100644
--- a/include/linux/damon.h
+++ b/include/linux/damon.h
@@ -559,6 +559,8 @@ struct damon_ctx {
 	 * update
 	 */
 	unsigned long next_ops_update_sis;
+	/* for waiting until the execution of the kdamond_fn is started */
+	struct completion kdamond_started;
 
 /* public: */
 	struct task_struct *kdamond;
diff --git a/mm/damon/core.c b/mm/damon/core.c
index ce1562783e7e..3a05e71509b9 100644
--- a/mm/damon/core.c
+++ b/mm/damon/core.c
@@ -445,6 +445,8 @@ struct damon_ctx *damon_new_ctx(void)
 	if (!ctx)
 		return NULL;
 
+	init_completion(&ctx->kdamond_started);
+
 	ctx->attrs.sample_interval = 5 * 1000;
 	ctx->attrs.aggr_interval = 100 * 1000;
 	ctx->attrs.ops_update_interval = 60 * 1000 * 1000;
@@ -668,11 +670,14 @@ static int __damon_start(struct damon_ctx *ctx)
 	mutex_lock(&ctx->kdamond_lock);
 	if (!ctx->kdamond) {
 		err = 0;
+		reinit_completion(&ctx->kdamond_started);
 		ctx->kdamond = kthread_run(kdamond_fn, ctx, "kdamond.%d",
 				nr_running_ctxs);
 		if (IS_ERR(ctx->kdamond)) {
 			err = PTR_ERR(ctx->kdamond);
 			ctx->kdamond = NULL;
+		} else {
+			wait_for_completion(&ctx->kdamond_started);
 		}
 	}
 	mutex_unlock(&ctx->kdamond_lock);
@@ -1433,6 +1438,7 @@ static int kdamond_fn(void *data)
 
 	pr_debug("kdamond (%d) starts\n", current->pid);
 
+	complete(&ctx->kdamond_started);
 	kdamond_init_intervals_sis(ctx);
 
 	if (ctx->ops.init)
-- 
cgit v1.2.3


From 081488051d28d32569ebb7c7a23572778b2e7d57 Mon Sep 17 00:00:00 2001
From: Yu Zhao <yuzhao@google.com>
Date: Thu, 7 Dec 2023 23:14:04 -0700
Subject: mm/mglru: fix underprotected page cache

Unmapped folios accessed through file descriptors can be underprotected.
Those folios are added to the oldest generation based on:

1. The fact that they are less costly to reclaim (no need to walk the
   rmap and flush the TLB) and have less impact on performance (don't
   cause major PFs and can be non-blocking if needed again).
2. The observation that they are likely to be single-use. E.g., for
   client use cases like Android, its apps parse configuration files
   and store the data in heap (anon); for server use cases like MySQL,
   it reads from InnoDB files and holds the cached data for tables in
   buffer pools (anon).

However, the oldest generation can be very short lived, and if so, it
doesn't provide the PID controller with enough time to respond to a surge
of refaults.  (Note that the PID controller uses weighted refaults and
those from evicted generations only take a half of the whole weight.) In
other words, for a short lived generation, the moving average smooths out
the spike quickly.

To fix the problem:
1. For folios that are already on LRU, if they can be beyond the
   tracking range of tiers, i.e., five accesses through file
   descriptors, move them to the second oldest generation to give them
   more time to age. (Note that tiers are used by the PID controller
   to statistically determine whether folios accessed multiple times
   through file descriptors are worth protecting.)
2. When adding unmapped folios to LRU, adjust the placement of them so
   that they are not too close to the tail. The effect of this is
   similar to the above.

On Android, launching 55 apps sequentially:
                           Before     After      Change
  workingset_refault_anon  25641024   25598972   0%
  workingset_refault_file  115016834  106178438  -8%

Link: https://lkml.kernel.org/r/20231208061407.2125867-1-yuzhao@google.com
Fixes: ac35a4902374 ("mm: multi-gen LRU: minimal implementation")
Signed-off-by: Yu Zhao <yuzhao@google.com>
Reported-by: Charan Teja Kalla <quic_charante@quicinc.com>
Tested-by: Kalesh Singh <kaleshsingh@google.com>
Cc: T.J. Mercier <tjmercier@google.com>
Cc: Kairui Song <ryncsn@gmail.com>
Cc: Hillf Danton <hdanton@sina.com>
Cc: Jaroslav Pulchart <jaroslav.pulchart@gooddata.com>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/mm_inline.h | 23 ++++++++++++++---------
 mm/vmscan.c               |  2 +-
 mm/workingset.c           |  6 +++---
 3 files changed, 18 insertions(+), 13 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mm_inline.h b/include/linux/mm_inline.h
index 9ae7def16cb2..f4fe593c1400 100644
--- a/include/linux/mm_inline.h
+++ b/include/linux/mm_inline.h
@@ -232,22 +232,27 @@ static inline bool lru_gen_add_folio(struct lruvec *lruvec, struct folio *folio,
 	if (folio_test_unevictable(folio) || !lrugen->enabled)
 		return false;
 	/*
-	 * There are three common cases for this page:
-	 * 1. If it's hot, e.g., freshly faulted in or previously hot and
-	 *    migrated, add it to the youngest generation.
-	 * 2. If it's cold but can't be evicted immediately, i.e., an anon page
-	 *    not in swapcache or a dirty page pending writeback, add it to the
-	 *    second oldest generation.
-	 * 3. Everything else (clean, cold) is added to the oldest generation.
+	 * There are four common cases for this page:
+	 * 1. If it's hot, i.e., freshly faulted in, add it to the youngest
+	 *    generation, and it's protected over the rest below.
+	 * 2. If it can't be evicted immediately, i.e., a dirty page pending
+	 *    writeback, add it to the second youngest generation.
+	 * 3. If it should be evicted first, e.g., cold and clean from
+	 *    folio_rotate_reclaimable(), add it to the oldest generation.
+	 * 4. Everything else falls between 2 & 3 above and is added to the
+	 *    second oldest generation if it's considered inactive, or the
+	 *    oldest generation otherwise. See lru_gen_is_active().
 	 */
 	if (folio_test_active(folio))
 		seq = lrugen->max_seq;
 	else if ((type == LRU_GEN_ANON && !folio_test_swapcache(folio)) ||
 		 (folio_test_reclaim(folio) &&
 		  (folio_test_dirty(folio) || folio_test_writeback(folio))))
-		seq = lrugen->min_seq[type] + 1;
-	else
+		seq = lrugen->max_seq - 1;
+	else if (reclaiming || lrugen->min_seq[type] + MIN_NR_GENS >= lrugen->max_seq)
 		seq = lrugen->min_seq[type];
+	else
+		seq = lrugen->min_seq[type] + 1;
 
 	gen = lru_gen_from_seq(seq);
 	flags = (gen + 1UL) << LRU_GEN_PGOFF;
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 506f8220c5fe..540333f5415c 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -4232,7 +4232,7 @@ static bool sort_folio(struct lruvec *lruvec, struct folio *folio, struct scan_c
 	}
 
 	/* protected */
-	if (tier > tier_idx) {
+	if (tier > tier_idx || refs == BIT(LRU_REFS_WIDTH)) {
 		int hist = lru_hist_from_seq(lrugen->min_seq[type]);
 
 		gen = folio_inc_gen(lruvec, folio, false);
diff --git a/mm/workingset.c b/mm/workingset.c
index b192e44a0e7c..33baad203277 100644
--- a/mm/workingset.c
+++ b/mm/workingset.c
@@ -313,10 +313,10 @@ static void lru_gen_refault(struct folio *folio, void *shadow)
 	 * 1. For pages accessed through page tables, hotter pages pushed out
 	 *    hot pages which refaulted immediately.
 	 * 2. For pages accessed multiple times through file descriptors,
-	 *    numbers of accesses might have been out of the range.
+	 *    they would have been protected by sort_folio().
 	 */
-	if (lru_gen_in_fault() || refs == BIT(LRU_REFS_WIDTH)) {
-		folio_set_workingset(folio);
+	if (lru_gen_in_fault() || refs >= BIT(LRU_REFS_WIDTH) - 1) {
+		set_mask_bits(&folio->flags, 0, LRU_REFS_MASK | BIT(PG_workingset));
 		mod_lruvec_state(lruvec, WORKINGSET_RESTORE_BASE + type, delta);
 	}
 unlock:
-- 
cgit v1.2.3


From 8aa420617918d12d1f5d55030a503c9418e73c2c Mon Sep 17 00:00:00 2001
From: Yu Zhao <yuzhao@google.com>
Date: Thu, 7 Dec 2023 23:14:06 -0700
Subject: mm/mglru: respect min_ttl_ms with memcgs

While investigating kswapd "consuming 100% CPU" [1] (also see "mm/mglru:
try to stop at high watermarks"), it was discovered that the memcg LRU can
breach the thrashing protection imposed by min_ttl_ms.

Before the memcg LRU:
  kswapd()
    shrink_node_memcgs()
      mem_cgroup_iter()
        inc_max_seq()  // always hit a different memcg
    lru_gen_age_node()
      mem_cgroup_iter()
        check the timestamp of the oldest generation

After the memcg LRU:
  kswapd()
    shrink_many()
      restart:
        iterate the memcg LRU:
          inc_max_seq()  // occasionally hit the same memcg
          if raced with lru_gen_rotate_memcg():
            goto restart
    lru_gen_age_node()
      mem_cgroup_iter()
        check the timestamp of the oldest generation

Specifically, when the restart happens in shrink_many(), it needs to stick
with the (memcg LRU) generation it began with.  In other words, it should
neither re-read memcg_lru->seq nor age an lruvec of a different
generation.  Otherwise it can hit the same memcg multiple times without
giving lru_gen_age_node() a chance to check the timestamp of that memcg's
oldest generation (against min_ttl_ms).

[1] https://lore.kernel.org/CAK8fFZ4DY+GtBA40Pm7Nn5xCHy+51w3sfxPqkqpqakSXYyX+Wg@mail.gmail.com/

Link: https://lkml.kernel.org/r/20231208061407.2125867-3-yuzhao@google.com
Fixes: e4dde56cd208 ("mm: multi-gen LRU: per-node lru_gen_folio lists")
Signed-off-by: Yu Zhao <yuzhao@google.com>
Tested-by: T.J. Mercier <tjmercier@google.com>
Cc: Charan Teja Kalla <quic_charante@quicinc.com>
Cc: Hillf Danton <hdanton@sina.com>
Cc: Jaroslav Pulchart <jaroslav.pulchart@gooddata.com>
Cc: Kairui Song <ryncsn@gmail.com>
Cc: Kalesh Singh <kaleshsingh@google.com>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/mmzone.h | 30 +++++++++++++++++-------------
 mm/vmscan.c            | 30 ++++++++++++++++--------------
 2 files changed, 33 insertions(+), 27 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 3c25226beeed..23533b12bee2 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -505,33 +505,37 @@ void lru_gen_look_around(struct page_vma_mapped_walk *pvmw);
  * the old generation, is incremented when all its bins become empty.
  *
  * There are four operations:
- * 1. MEMCG_LRU_HEAD, which moves an memcg to the head of a random bin in its
+ * 1. MEMCG_LRU_HEAD, which moves a memcg to the head of a random bin in its
  *    current generation (old or young) and updates its "seg" to "head";
- * 2. MEMCG_LRU_TAIL, which moves an memcg to the tail of a random bin in its
+ * 2. MEMCG_LRU_TAIL, which moves a memcg to the tail of a random bin in its
  *    current generation (old or young) and updates its "seg" to "tail";
- * 3. MEMCG_LRU_OLD, which moves an memcg to the head of a random bin in the old
+ * 3. MEMCG_LRU_OLD, which moves a memcg to the head of a random bin in the old
  *    generation, updates its "gen" to "old" and resets its "seg" to "default";
- * 4. MEMCG_LRU_YOUNG, which moves an memcg to the tail of a random bin in the
+ * 4. MEMCG_LRU_YOUNG, which moves a memcg to the tail of a random bin in the
  *    young generation, updates its "gen" to "young" and resets its "seg" to
  *    "default".
  *
  * The events that trigger the above operations are:
  * 1. Exceeding the soft limit, which triggers MEMCG_LRU_HEAD;
- * 2. The first attempt to reclaim an memcg below low, which triggers
+ * 2. The first attempt to reclaim a memcg below low, which triggers
  *    MEMCG_LRU_TAIL;
- * 3. The first attempt to reclaim an memcg below reclaimable size threshold,
+ * 3. The first attempt to reclaim a memcg below reclaimable size threshold,
  *    which triggers MEMCG_LRU_TAIL;
- * 4. The second attempt to reclaim an memcg below reclaimable size threshold,
+ * 4. The second attempt to reclaim a memcg below reclaimable size threshold,
  *    which triggers MEMCG_LRU_YOUNG;
- * 5. Attempting to reclaim an memcg below min, which triggers MEMCG_LRU_YOUNG;
+ * 5. Attempting to reclaim a memcg below min, which triggers MEMCG_LRU_YOUNG;
  * 6. Finishing the aging on the eviction path, which triggers MEMCG_LRU_YOUNG;
- * 7. Offlining an memcg, which triggers MEMCG_LRU_OLD.
+ * 7. Offlining a memcg, which triggers MEMCG_LRU_OLD.
  *
- * Note that memcg LRU only applies to global reclaim, and the round-robin
- * incrementing of their max_seq counters ensures the eventual fairness to all
- * eligible memcgs. For memcg reclaim, it still relies on mem_cgroup_iter().
+ * Notes:
+ * 1. Memcg LRU only applies to global reclaim, and the round-robin incrementing
+ *    of their max_seq counters ensures the eventual fairness to all eligible
+ *    memcgs. For memcg reclaim, it still relies on mem_cgroup_iter().
+ * 2. There are only two valid generations: old (seq) and young (seq+1).
+ *    MEMCG_NR_GENS is set to three so that when reading the generation counter
+ *    locklessly, a stale value (seq-1) does not wraparound to young.
  */
-#define MEMCG_NR_GENS	2
+#define MEMCG_NR_GENS	3
 #define MEMCG_NR_BINS	8
 
 struct lru_gen_memcg {
diff --git a/mm/vmscan.c b/mm/vmscan.c
index bb7dacd8f753..e5fe4a94345f 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -4089,6 +4089,9 @@ static void lru_gen_rotate_memcg(struct lruvec *lruvec, int op)
 	else
 		VM_WARN_ON_ONCE(true);
 
+	WRITE_ONCE(lruvec->lrugen.seg, seg);
+	WRITE_ONCE(lruvec->lrugen.gen, new);
+
 	hlist_nulls_del_rcu(&lruvec->lrugen.list);
 
 	if (op == MEMCG_LRU_HEAD || op == MEMCG_LRU_OLD)
@@ -4099,9 +4102,6 @@ static void lru_gen_rotate_memcg(struct lruvec *lruvec, int op)
 	pgdat->memcg_lru.nr_memcgs[old]--;
 	pgdat->memcg_lru.nr_memcgs[new]++;
 
-	lruvec->lrugen.gen = new;
-	WRITE_ONCE(lruvec->lrugen.seg, seg);
-
 	if (!pgdat->memcg_lru.nr_memcgs[old] && old == get_memcg_gen(pgdat->memcg_lru.seq))
 		WRITE_ONCE(pgdat->memcg_lru.seq, pgdat->memcg_lru.seq + 1);
 
@@ -4124,11 +4124,11 @@ void lru_gen_online_memcg(struct mem_cgroup *memcg)
 
 		gen = get_memcg_gen(pgdat->memcg_lru.seq);
 
+		lruvec->lrugen.gen = gen;
+
 		hlist_nulls_add_tail_rcu(&lruvec->lrugen.list, &pgdat->memcg_lru.fifo[gen][bin]);
 		pgdat->memcg_lru.nr_memcgs[gen]++;
 
-		lruvec->lrugen.gen = gen;
-
 		spin_unlock_irq(&pgdat->memcg_lru.lock);
 	}
 }
@@ -4635,7 +4635,7 @@ static long get_nr_to_scan(struct lruvec *lruvec, struct scan_control *sc, bool
 	DEFINE_MAX_SEQ(lruvec);
 
 	if (mem_cgroup_below_min(sc->target_mem_cgroup, memcg))
-		return 0;
+		return -1;
 
 	if (!should_run_aging(lruvec, max_seq, sc, can_swap, &nr_to_scan))
 		return nr_to_scan;
@@ -4710,7 +4710,7 @@ static bool try_to_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
 		cond_resched();
 	}
 
-	/* whether try_to_inc_max_seq() was successful */
+	/* whether this lruvec should be rotated */
 	return nr_to_scan < 0;
 }
 
@@ -4764,13 +4764,13 @@ static void shrink_many(struct pglist_data *pgdat, struct scan_control *sc)
 	struct lruvec *lruvec;
 	struct lru_gen_folio *lrugen;
 	struct mem_cgroup *memcg;
-	const struct hlist_nulls_node *pos;
+	struct hlist_nulls_node *pos;
 
+	gen = get_memcg_gen(READ_ONCE(pgdat->memcg_lru.seq));
 	bin = first_bin = get_random_u32_below(MEMCG_NR_BINS);
 restart:
 	op = 0;
 	memcg = NULL;
-	gen = get_memcg_gen(READ_ONCE(pgdat->memcg_lru.seq));
 
 	rcu_read_lock();
 
@@ -4781,6 +4781,10 @@ restart:
 		}
 
 		mem_cgroup_put(memcg);
+		memcg = NULL;
+
+		if (gen != READ_ONCE(lrugen->gen))
+			continue;
 
 		lruvec = container_of(lrugen, struct lruvec, lrugen);
 		memcg = lruvec_memcg(lruvec);
@@ -4865,16 +4869,14 @@ static void set_initial_priority(struct pglist_data *pgdat, struct scan_control
 	if (sc->priority != DEF_PRIORITY || sc->nr_to_reclaim < MIN_LRU_BATCH)
 		return;
 	/*
-	 * Determine the initial priority based on ((total / MEMCG_NR_GENS) >>
-	 * priority) * reclaimed_to_scanned_ratio = nr_to_reclaim, where the
-	 * estimated reclaimed_to_scanned_ratio = inactive / total.
+	 * Determine the initial priority based on
+	 * (total >> priority) * reclaimed_to_scanned_ratio = nr_to_reclaim,
+	 * where reclaimed_to_scanned_ratio = inactive / total.
 	 */
 	reclaimable = node_page_state(pgdat, NR_INACTIVE_FILE);
 	if (get_swappiness(lruvec, sc))
 		reclaimable += node_page_state(pgdat, NR_INACTIVE_ANON);
 
-	reclaimable /= MEMCG_NR_GENS;
-
 	/* round down reclaimable and round up sc->nr_to_reclaim */
 	priority = fls_long(reclaimable) - 1 - fls_long(sc->nr_to_reclaim - 1);
 
-- 
cgit v1.2.3


From 4376807bf2d5371c3e00080c972be568c3f8a7d1 Mon Sep 17 00:00:00 2001
From: Yu Zhao <yuzhao@google.com>
Date: Thu, 7 Dec 2023 23:14:07 -0700
Subject: mm/mglru: reclaim offlined memcgs harder

In the effort to reduce zombie memcgs [1], it was discovered that the
memcg LRU doesn't apply enough pressure on offlined memcgs.  Specifically,
instead of rotating them to the tail of the current generation
(MEMCG_LRU_TAIL) for a second attempt, it moves them to the next
generation (MEMCG_LRU_YOUNG) after the first attempt.

Not applying enough pressure on offlined memcgs can cause them to build
up, and this can be particularly harmful to memory-constrained systems.

On Pixel 8 Pro, launching apps for 50 cycles:
                 Before  After  Change
  Zombie memcgs  45      35     -22%

[1] https://lore.kernel.org/CABdmKX2M6koq4Q0Cmp_-=wbP0Qa190HdEGGaHfxNS05gAkUtPA@mail.gmail.com/

Link: https://lkml.kernel.org/r/20231208061407.2125867-4-yuzhao@google.com
Fixes: e4dde56cd208 ("mm: multi-gen LRU: per-node lru_gen_folio lists")
Signed-off-by: Yu Zhao <yuzhao@google.com>
Reported-by: T.J. Mercier <tjmercier@google.com>
Tested-by: T.J. Mercier <tjmercier@google.com>
Cc: Charan Teja Kalla <quic_charante@quicinc.com>
Cc: Hillf Danton <hdanton@sina.com>
Cc: Jaroslav Pulchart <jaroslav.pulchart@gooddata.com>
Cc: Kairui Song <ryncsn@gmail.com>
Cc: Kalesh Singh <kaleshsingh@google.com>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/mmzone.h |  8 ++++----
 mm/vmscan.c            | 24 ++++++++++++++++--------
 2 files changed, 20 insertions(+), 12 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 23533b12bee2..9db36e197712 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -519,10 +519,10 @@ void lru_gen_look_around(struct page_vma_mapped_walk *pvmw);
  * 1. Exceeding the soft limit, which triggers MEMCG_LRU_HEAD;
  * 2. The first attempt to reclaim a memcg below low, which triggers
  *    MEMCG_LRU_TAIL;
- * 3. The first attempt to reclaim a memcg below reclaimable size threshold,
- *    which triggers MEMCG_LRU_TAIL;
- * 4. The second attempt to reclaim a memcg below reclaimable size threshold,
- *    which triggers MEMCG_LRU_YOUNG;
+ * 3. The first attempt to reclaim a memcg offlined or below reclaimable size
+ *    threshold, which triggers MEMCG_LRU_TAIL;
+ * 4. The second attempt to reclaim a memcg offlined or below reclaimable size
+ *    threshold, which triggers MEMCG_LRU_YOUNG;
  * 5. Attempting to reclaim a memcg below min, which triggers MEMCG_LRU_YOUNG;
  * 6. Finishing the aging on the eviction path, which triggers MEMCG_LRU_YOUNG;
  * 7. Offlining a memcg, which triggers MEMCG_LRU_OLD.
diff --git a/mm/vmscan.c b/mm/vmscan.c
index e5fe4a94345f..9dd8977de5a2 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -4598,7 +4598,12 @@ static bool should_run_aging(struct lruvec *lruvec, unsigned long max_seq,
 	}
 
 	/* try to scrape all its memory if this memcg was deleted */
-	*nr_to_scan = mem_cgroup_online(memcg) ? (total >> sc->priority) : total;
+	if (!mem_cgroup_online(memcg)) {
+		*nr_to_scan = total;
+		return false;
+	}
+
+	*nr_to_scan = total >> sc->priority;
 
 	/*
 	 * The aging tries to be lazy to reduce the overhead, while the eviction
@@ -4719,14 +4724,9 @@ static int shrink_one(struct lruvec *lruvec, struct scan_control *sc)
 	bool success;
 	unsigned long scanned = sc->nr_scanned;
 	unsigned long reclaimed = sc->nr_reclaimed;
-	int seg = lru_gen_memcg_seg(lruvec);
 	struct mem_cgroup *memcg = lruvec_memcg(lruvec);
 	struct pglist_data *pgdat = lruvec_pgdat(lruvec);
 
-	/* see the comment on MEMCG_NR_GENS */
-	if (!lruvec_is_sizable(lruvec, sc))
-		return seg != MEMCG_LRU_TAIL ? MEMCG_LRU_TAIL : MEMCG_LRU_YOUNG;
-
 	mem_cgroup_calculate_protection(NULL, memcg);
 
 	if (mem_cgroup_below_min(NULL, memcg))
@@ -4734,7 +4734,7 @@ static int shrink_one(struct lruvec *lruvec, struct scan_control *sc)
 
 	if (mem_cgroup_below_low(NULL, memcg)) {
 		/* see the comment on MEMCG_NR_GENS */
-		if (seg != MEMCG_LRU_TAIL)
+		if (lru_gen_memcg_seg(lruvec) != MEMCG_LRU_TAIL)
 			return MEMCG_LRU_TAIL;
 
 		memcg_memory_event(memcg, MEMCG_LOW);
@@ -4750,7 +4750,15 @@ static int shrink_one(struct lruvec *lruvec, struct scan_control *sc)
 
 	flush_reclaim_state(sc);
 
-	return success ? MEMCG_LRU_YOUNG : 0;
+	if (success && mem_cgroup_online(memcg))
+		return MEMCG_LRU_YOUNG;
+
+	if (!success && lruvec_is_sizable(lruvec, sc))
+		return 0;
+
+	/* one retry if offlined or too small */
+	return lru_gen_memcg_seg(lruvec) != MEMCG_LRU_TAIL ?
+	       MEMCG_LRU_TAIL : MEMCG_LRU_YOUNG;
 }
 
 #ifdef CONFIG_MEMCG
-- 
cgit v1.2.3


From 05ce71929efc79f5978589e0456a54eb0fe6485e Mon Sep 17 00:00:00 2001
From: Ulf Hansson <ulf.hansson@linaro.org>
Date: Mon, 27 Nov 2023 16:19:31 +0100
Subject: PM: domains: Drop the unused pm_genpd_opp_to_performance_state()

Since commit 7c41cdcd3bbe ("OPP: Simplify the over-designed pstate <->
level dance"), there is no longer any users of the
pm_genpd_opp_to_performance_state() API. Let's therefore drop it and its
corresponding ->opp_to_performance_state() callback, which also no longer
has any users.

Signed-off-by: Ulf Hansson <ulf.hansson@linaro.org>
Acked-by: Rafael J. Wysocki <rafael@kernel.org>
Link: https://lore.kernel.org/r/20231127151931.47055-1-ulf.hansson@linaro.org
---
 drivers/base/power/domain.c | 32 --------------------------------
 include/linux/pm_domain.h   | 12 ------------
 2 files changed, 44 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/base/power/domain.c b/drivers/base/power/domain.c
index da1777e39eaa..cf65b5a9783c 100644
--- a/drivers/base/power/domain.c
+++ b/drivers/base/power/domain.c
@@ -3042,38 +3042,6 @@ int of_genpd_parse_idle_states(struct device_node *dn,
 }
 EXPORT_SYMBOL_GPL(of_genpd_parse_idle_states);
 
-/**
- * pm_genpd_opp_to_performance_state - Gets performance state of the genpd from its OPP node.
- *
- * @genpd_dev: Genpd's device for which the performance-state needs to be found.
- * @opp: struct dev_pm_opp of the OPP for which we need to find performance
- *	state.
- *
- * Returns performance state encoded in the OPP of the genpd. This calls
- * platform specific genpd->opp_to_performance_state() callback to translate
- * power domain OPP to performance state.
- *
- * Returns performance state on success and 0 on failure.
- */
-unsigned int pm_genpd_opp_to_performance_state(struct device *genpd_dev,
-					       struct dev_pm_opp *opp)
-{
-	struct generic_pm_domain *genpd = NULL;
-	int state;
-
-	genpd = container_of(genpd_dev, struct generic_pm_domain, dev);
-
-	if (unlikely(!genpd->opp_to_performance_state))
-		return 0;
-
-	genpd_lock(genpd);
-	state = genpd->opp_to_performance_state(genpd, opp);
-	genpd_unlock(genpd);
-
-	return state;
-}
-EXPORT_SYMBOL_GPL(pm_genpd_opp_to_performance_state);
-
 static int __init genpd_bus_init(void)
 {
 	return bus_register(&genpd_bus_type);
diff --git a/include/linux/pm_domain.h b/include/linux/pm_domain.h
index 34663d0d5c55..b97c5e9820f9 100644
--- a/include/linux/pm_domain.h
+++ b/include/linux/pm_domain.h
@@ -118,7 +118,6 @@ struct genpd_power_state {
 };
 
 struct genpd_lock_ops;
-struct dev_pm_opp;
 struct opp_table;
 
 struct generic_pm_domain {
@@ -146,8 +145,6 @@ struct generic_pm_domain {
 	int (*power_on)(struct generic_pm_domain *domain);
 	struct raw_notifier_head power_notifiers; /* Power on/off notifiers */
 	struct opp_table *opp_table;	/* OPP table of the genpd */
-	unsigned int (*opp_to_performance_state)(struct generic_pm_domain *genpd,
-						 struct dev_pm_opp *opp);
 	int (*set_performance_state)(struct generic_pm_domain *genpd,
 				     unsigned int state);
 	struct gpd_dev_ops dev_ops;
@@ -348,8 +345,6 @@ int of_genpd_remove_subdomain(struct of_phandle_args *parent_spec,
 struct generic_pm_domain *of_genpd_remove_last(struct device_node *np);
 int of_genpd_parse_idle_states(struct device_node *dn,
 			       struct genpd_power_state **states, int *n);
-unsigned int pm_genpd_opp_to_performance_state(struct device *genpd_dev,
-					       struct dev_pm_opp *opp);
 
 int genpd_dev_pm_attach(struct device *dev);
 struct device *genpd_dev_pm_attach_by_id(struct device *dev,
@@ -395,13 +390,6 @@ static inline int of_genpd_parse_idle_states(struct device_node *dn,
 	return -ENODEV;
 }
 
-static inline unsigned int
-pm_genpd_opp_to_performance_state(struct device *genpd_dev,
-				  struct dev_pm_opp *opp)
-{
-	return 0;
-}
-
 static inline int genpd_dev_pm_attach(struct device *dev)
 {
 	return 0;
-- 
cgit v1.2.3


From 4f7aa122bc9219baca0bfface5917062d6c45ee8 Mon Sep 17 00:00:00 2001
From: Jiri Pirko <jiri@nvidia.com>
Date: Thu, 7 Dec 2023 16:12:04 +0100
Subject: dpll: remove leftover mode_supported() op and use mode_get() instead

Mode supported is currently reported to the user exactly the same, as
the current mode. That's because mode changing is not implemented.
Remove the leftover mode_supported() op and use mode_get() to fill up
the supported mode exposed to user.

One, if even, mode changing is going to be introduced, this could be
very easily taken back. In the meantime, prevent drivers form
implementing this in wrong way (as for example recent netdevsim
implementation attempt intended to do).

Signed-off-by: Jiri Pirko <jiri@nvidia.com>
Reviewed-by: Simon Horman <horms@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/dpll/dpll_netlink.c                    | 16 ++++++++++------
 drivers/net/ethernet/intel/ice/ice_dpll.c      | 26 --------------------------
 drivers/net/ethernet/mellanox/mlx5/core/dpll.c |  9 ---------
 drivers/ptp/ptp_ocp.c                          |  8 --------
 include/linux/dpll.h                           |  3 ---
 5 files changed, 10 insertions(+), 52 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/dpll/dpll_netlink.c b/drivers/dpll/dpll_netlink.c
index 442a0ebeb953..e1a4737500f5 100644
--- a/drivers/dpll/dpll_netlink.c
+++ b/drivers/dpll/dpll_netlink.c
@@ -101,13 +101,17 @@ dpll_msg_add_mode_supported(struct sk_buff *msg, struct dpll_device *dpll,
 {
 	const struct dpll_device_ops *ops = dpll_device_ops(dpll);
 	enum dpll_mode mode;
+	int ret;
 
-	if (!ops->mode_supported)
-		return 0;
-	for (mode = DPLL_MODE_MANUAL; mode <= DPLL_MODE_MAX; mode++)
-		if (ops->mode_supported(dpll, dpll_priv(dpll), mode, extack))
-			if (nla_put_u32(msg, DPLL_A_MODE_SUPPORTED, mode))
-				return -EMSGSIZE;
+	/* No mode change is supported now, so the only supported mode is the
+	 * one obtained by mode_get().
+	 */
+
+	ret = ops->mode_get(dpll, dpll_priv(dpll), &mode, extack);
+	if (ret)
+		return ret;
+	if (nla_put_u32(msg, DPLL_A_MODE_SUPPORTED, mode))
+		return -EMSGSIZE;
 
 	return 0;
 }
diff --git a/drivers/net/ethernet/intel/ice/ice_dpll.c b/drivers/net/ethernet/intel/ice/ice_dpll.c
index 86b180cb32a0..b9c5eced6326 100644
--- a/drivers/net/ethernet/intel/ice/ice_dpll.c
+++ b/drivers/net/ethernet/intel/ice/ice_dpll.c
@@ -512,31 +512,6 @@ ice_dpll_lock_status_get(const struct dpll_device *dpll, void *dpll_priv,
 	return 0;
 }
 
-/**
- * ice_dpll_mode_supported - check if dpll's working mode is supported
- * @dpll: registered dpll pointer
- * @dpll_priv: private data pointer passed on dpll registration
- * @mode: mode to be checked for support
- * @extack: error reporting
- *
- * Dpll subsystem callback. Provides information if working mode is supported
- * by dpll.
- *
- * Return:
- * * true - mode is supported
- * * false - mode is not supported
- */
-static bool ice_dpll_mode_supported(const struct dpll_device *dpll,
-				    void *dpll_priv,
-				    enum dpll_mode mode,
-				    struct netlink_ext_ack *extack)
-{
-	if (mode == DPLL_MODE_AUTOMATIC)
-		return true;
-
-	return false;
-}
-
 /**
  * ice_dpll_mode_get - get dpll's working mode
  * @dpll: registered dpll pointer
@@ -1197,7 +1172,6 @@ static const struct dpll_pin_ops ice_dpll_output_ops = {
 
 static const struct dpll_device_ops ice_dpll_ops = {
 	.lock_status_get = ice_dpll_lock_status_get,
-	.mode_supported = ice_dpll_mode_supported,
 	.mode_get = ice_dpll_mode_get,
 };
 
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/dpll.c b/drivers/net/ethernet/mellanox/mlx5/core/dpll.c
index 2cd81bb32c66..a7ffd61fe248 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/dpll.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/dpll.c
@@ -128,18 +128,9 @@ static int mlx5_dpll_device_mode_get(const struct dpll_device *dpll,
 	return 0;
 }
 
-static bool mlx5_dpll_device_mode_supported(const struct dpll_device *dpll,
-					    void *priv,
-					    enum dpll_mode mode,
-					    struct netlink_ext_ack *extack)
-{
-	return mode == DPLL_MODE_MANUAL;
-}
-
 static const struct dpll_device_ops mlx5_dpll_device_ops = {
 	.lock_status_get = mlx5_dpll_device_lock_status_get,
 	.mode_get = mlx5_dpll_device_mode_get,
-	.mode_supported = mlx5_dpll_device_mode_supported,
 };
 
 static int mlx5_dpll_pin_direction_get(const struct dpll_pin *pin,
diff --git a/drivers/ptp/ptp_ocp.c b/drivers/ptp/ptp_ocp.c
index 4021d3d325f9..b022af3d20fe 100644
--- a/drivers/ptp/ptp_ocp.c
+++ b/drivers/ptp/ptp_ocp.c
@@ -4260,13 +4260,6 @@ static int ptp_ocp_dpll_mode_get(const struct dpll_device *dpll, void *priv,
 	return 0;
 }
 
-static bool ptp_ocp_dpll_mode_supported(const struct dpll_device *dpll,
-					void *priv, const enum dpll_mode mode,
-					struct netlink_ext_ack *extack)
-{
-	return mode == DPLL_MODE_AUTOMATIC;
-}
-
 static int ptp_ocp_dpll_direction_get(const struct dpll_pin *pin,
 				      void *pin_priv,
 				      const struct dpll_device *dpll,
@@ -4350,7 +4343,6 @@ static int ptp_ocp_dpll_frequency_get(const struct dpll_pin *pin,
 static const struct dpll_device_ops dpll_ops = {
 	.lock_status_get = ptp_ocp_dpll_lock_status_get,
 	.mode_get = ptp_ocp_dpll_mode_get,
-	.mode_supported = ptp_ocp_dpll_mode_supported,
 };
 
 static const struct dpll_pin_ops dpll_pins_ops = {
diff --git a/include/linux/dpll.h b/include/linux/dpll.h
index 578fc5fa3750..b1a5f9ca8ee5 100644
--- a/include/linux/dpll.h
+++ b/include/linux/dpll.h
@@ -17,9 +17,6 @@ struct dpll_pin;
 struct dpll_device_ops {
 	int (*mode_get)(const struct dpll_device *dpll, void *dpll_priv,
 			enum dpll_mode *mode, struct netlink_ext_ack *extack);
-	bool (*mode_supported)(const struct dpll_device *dpll, void *dpll_priv,
-			       const enum dpll_mode mode,
-			       struct netlink_ext_ack *extack);
 	int (*lock_status_get)(const struct dpll_device *dpll, void *dpll_priv,
 			       enum dpll_lock_status *status,
 			       struct netlink_ext_ack *extack);
-- 
cgit v1.2.3


From 59b3e31e73322ec195e45e0a1da712c752ee1b0c Mon Sep 17 00:00:00 2001
From: Daniel Golle <daniel@makrotopia.org>
Date: Tue, 28 Nov 2023 04:00:10 +0000
Subject: leds: trigger: netdev: Extend speeds up to 10G

Add 2.5G, 5G and 10G as available speeds to the netdev LED trigger.

Signed-off-by: Daniel Golle <daniel@makrotopia.org>
Reviewed-by: Andrew Lunn <andrew@lunn.ch>
Link: https://lore.kernel.org/r/99e7d3304c6bba7f4863a4a80764a869855f2085.1701143925.git.daniel@makrotopia.org
Signed-off-by: Lee Jones <lee@kernel.org>
---
 drivers/leds/trigger/ledtrig-netdev.c | 32 +++++++++++++++++++++++++++++++-
 include/linux/leds.h                  |  3 +++
 2 files changed, 34 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/drivers/leds/trigger/ledtrig-netdev.c b/drivers/leds/trigger/ledtrig-netdev.c
index e358e77e4b38..bd68da15c723 100644
--- a/drivers/leds/trigger/ledtrig-netdev.c
+++ b/drivers/leds/trigger/ledtrig-netdev.c
@@ -99,6 +99,18 @@ static void set_baseline_state(struct led_netdev_data *trigger_data)
 		    trigger_data->link_speed == SPEED_1000)
 			blink_on = true;
 
+		if (test_bit(TRIGGER_NETDEV_LINK_2500, &trigger_data->mode) &&
+		    trigger_data->link_speed == SPEED_2500)
+			blink_on = true;
+
+		if (test_bit(TRIGGER_NETDEV_LINK_5000, &trigger_data->mode) &&
+		    trigger_data->link_speed == SPEED_5000)
+			blink_on = true;
+
+		if (test_bit(TRIGGER_NETDEV_LINK_10000, &trigger_data->mode) &&
+		    trigger_data->link_speed == SPEED_10000)
+			blink_on = true;
+
 		if (test_bit(TRIGGER_NETDEV_HALF_DUPLEX, &trigger_data->mode) &&
 		    trigger_data->duplex == DUPLEX_HALF)
 			blink_on = true;
@@ -286,6 +298,9 @@ static ssize_t netdev_led_attr_show(struct device *dev, char *buf,
 	case TRIGGER_NETDEV_LINK_10:
 	case TRIGGER_NETDEV_LINK_100:
 	case TRIGGER_NETDEV_LINK_1000:
+	case TRIGGER_NETDEV_LINK_2500:
+	case TRIGGER_NETDEV_LINK_5000:
+	case TRIGGER_NETDEV_LINK_10000:
 	case TRIGGER_NETDEV_HALF_DUPLEX:
 	case TRIGGER_NETDEV_FULL_DUPLEX:
 	case TRIGGER_NETDEV_TX:
@@ -316,6 +331,9 @@ static ssize_t netdev_led_attr_store(struct device *dev, const char *buf,
 	case TRIGGER_NETDEV_LINK_10:
 	case TRIGGER_NETDEV_LINK_100:
 	case TRIGGER_NETDEV_LINK_1000:
+	case TRIGGER_NETDEV_LINK_2500:
+	case TRIGGER_NETDEV_LINK_5000:
+	case TRIGGER_NETDEV_LINK_10000:
 	case TRIGGER_NETDEV_HALF_DUPLEX:
 	case TRIGGER_NETDEV_FULL_DUPLEX:
 	case TRIGGER_NETDEV_TX:
@@ -334,7 +352,10 @@ static ssize_t netdev_led_attr_store(struct device *dev, const char *buf,
 	if (test_bit(TRIGGER_NETDEV_LINK, &mode) &&
 	    (test_bit(TRIGGER_NETDEV_LINK_10, &mode) ||
 	     test_bit(TRIGGER_NETDEV_LINK_100, &mode) ||
-	     test_bit(TRIGGER_NETDEV_LINK_1000, &mode)))
+	     test_bit(TRIGGER_NETDEV_LINK_1000, &mode) ||
+	     test_bit(TRIGGER_NETDEV_LINK_2500, &mode) ||
+	     test_bit(TRIGGER_NETDEV_LINK_5000, &mode) ||
+	     test_bit(TRIGGER_NETDEV_LINK_10000, &mode)))
 		return -EINVAL;
 
 	cancel_delayed_work_sync(&trigger_data->work);
@@ -364,6 +385,9 @@ DEFINE_NETDEV_TRIGGER(link, TRIGGER_NETDEV_LINK);
 DEFINE_NETDEV_TRIGGER(link_10, TRIGGER_NETDEV_LINK_10);
 DEFINE_NETDEV_TRIGGER(link_100, TRIGGER_NETDEV_LINK_100);
 DEFINE_NETDEV_TRIGGER(link_1000, TRIGGER_NETDEV_LINK_1000);
+DEFINE_NETDEV_TRIGGER(link_2500, TRIGGER_NETDEV_LINK_2500);
+DEFINE_NETDEV_TRIGGER(link_5000, TRIGGER_NETDEV_LINK_5000);
+DEFINE_NETDEV_TRIGGER(link_10000, TRIGGER_NETDEV_LINK_10000);
 DEFINE_NETDEV_TRIGGER(half_duplex, TRIGGER_NETDEV_HALF_DUPLEX);
 DEFINE_NETDEV_TRIGGER(full_duplex, TRIGGER_NETDEV_FULL_DUPLEX);
 DEFINE_NETDEV_TRIGGER(tx, TRIGGER_NETDEV_TX);
@@ -422,6 +446,9 @@ static struct attribute *netdev_trig_attrs[] = {
 	&dev_attr_link_10.attr,
 	&dev_attr_link_100.attr,
 	&dev_attr_link_1000.attr,
+	&dev_attr_link_2500.attr,
+	&dev_attr_link_5000.attr,
+	&dev_attr_link_10000.attr,
 	&dev_attr_full_duplex.attr,
 	&dev_attr_half_duplex.attr,
 	&dev_attr_rx.attr,
@@ -519,6 +546,9 @@ static void netdev_trig_work(struct work_struct *work)
 			 test_bit(TRIGGER_NETDEV_LINK_10, &trigger_data->mode) ||
 			 test_bit(TRIGGER_NETDEV_LINK_100, &trigger_data->mode) ||
 			 test_bit(TRIGGER_NETDEV_LINK_1000, &trigger_data->mode) ||
+			 test_bit(TRIGGER_NETDEV_LINK_2500, &trigger_data->mode) ||
+			 test_bit(TRIGGER_NETDEV_LINK_5000, &trigger_data->mode) ||
+			 test_bit(TRIGGER_NETDEV_LINK_10000, &trigger_data->mode) ||
 			 test_bit(TRIGGER_NETDEV_HALF_DUPLEX, &trigger_data->mode) ||
 			 test_bit(TRIGGER_NETDEV_FULL_DUPLEX, &trigger_data->mode);
 		interval = jiffies_to_msecs(
diff --git a/include/linux/leds.h b/include/linux/leds.h
index aa16dc2a8230..1bdf7f5a0d7c 100644
--- a/include/linux/leds.h
+++ b/include/linux/leds.h
@@ -588,6 +588,9 @@ enum led_trigger_netdev_modes {
 	TRIGGER_NETDEV_LINK_10,
 	TRIGGER_NETDEV_LINK_100,
 	TRIGGER_NETDEV_LINK_1000,
+	TRIGGER_NETDEV_LINK_2500,
+	TRIGGER_NETDEV_LINK_5000,
+	TRIGGER_NETDEV_LINK_10000,
 	TRIGGER_NETDEV_HALF_DUPLEX,
 	TRIGGER_NETDEV_FULL_DUPLEX,
 	TRIGGER_NETDEV_TX,
-- 
cgit v1.2.3


From 4ff4379ce6eefe81695bcc2e021ce1dac3d707d2 Mon Sep 17 00:00:00 2001
From: Florian Eckert <fe@dev.tdt.de>
Date: Mon, 27 Nov 2023 12:03:08 +0100
Subject: tty: add new helper function tty_get_tiocm

There is no in-kernel function to get the status register of a tty device
like the TIOCMGET ioctl returns to userspace. Create a new function,
tty_get_tiocm(), to obtain the status register that other portions of the
kernel can call if they need this information, and move the existing
internal tty_tiocmget() function to use this interface.

Signed-off-by: Florian Eckert <fe@dev.tdt.de>
Reviewed-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Link: https://lore.kernel.org/r/20231127110311.3583957-2-fe@dev.tdt.de
Signed-off-by: Lee Jones <lee@kernel.org>
---
 drivers/tty/tty_io.c | 28 ++++++++++++++++++++++------
 include/linux/tty.h  |  1 +
 2 files changed, 23 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/tty/tty_io.c b/drivers/tty/tty_io.c
index 06414e43e0b5..e2e93404133e 100644
--- a/drivers/tty/tty_io.c
+++ b/drivers/tty/tty_io.c
@@ -2498,6 +2498,24 @@ static int send_break(struct tty_struct *tty, unsigned int duration)
 	return retval;
 }
 
+/**
+ * tty_get_tiocm - get tiocm status register
+ * @tty: tty device
+ *
+ * Obtain the modem status bits from the tty driver if the feature
+ * is supported.
+ */
+int tty_get_tiocm(struct tty_struct *tty)
+{
+	int retval = -ENOTTY;
+
+	if (tty->ops->tiocmget)
+		retval = tty->ops->tiocmget(tty);
+
+	return retval;
+}
+EXPORT_SYMBOL_GPL(tty_get_tiocm);
+
 /**
  * tty_tiocmget - get modem status
  * @tty: tty device
@@ -2510,14 +2528,12 @@ static int send_break(struct tty_struct *tty, unsigned int duration)
  */
 static int tty_tiocmget(struct tty_struct *tty, int __user *p)
 {
-	int retval = -ENOTTY;
+	int retval;
 
-	if (tty->ops->tiocmget) {
-		retval = tty->ops->tiocmget(tty);
+	retval = tty_get_tiocm(tty);
+	if (retval >= 0)
+		retval = put_user(retval, p);
 
-		if (retval >= 0)
-			retval = put_user(retval, p);
-	}
 	return retval;
 }
 
diff --git a/include/linux/tty.h b/include/linux/tty.h
index 4b6340ac2af2..d219a11e3fe0 100644
--- a/include/linux/tty.h
+++ b/include/linux/tty.h
@@ -419,6 +419,7 @@ bool tty_unthrottle_safe(struct tty_struct *tty);
 int tty_do_resize(struct tty_struct *tty, struct winsize *ws);
 int tty_get_icount(struct tty_struct *tty,
 		struct serial_icounter_struct *icount);
+int tty_get_tiocm(struct tty_struct *tty);
 int is_current_pgrp_orphaned(void);
 void tty_hangup(struct tty_struct *tty);
 void tty_vhangup(struct tty_struct *tty);
-- 
cgit v1.2.3


From bdc22c8d52d70fc5655ab4dbf72fa79b034bb7b5 Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rafael.j.wysocki@intel.com>
Date: Tue, 5 Dec 2023 20:18:39 +0100
Subject: thermal: trip: Send trip change notifications on all trip updates

The _store callbacks of the trip point temperature and hysteresis sysfs
attributes invoke thermal_notify_tz_trip_change() to send a notification
regarding the trip point change, but when trip points are updated by the
platform firmware, trip point change notifications are not sent.

To make the behavior after a trip point change more consistent,
modify all of the 3 places where trip point temperature is updated
to use a new function called thermal_zone_set_trip_temp() for this
purpose and make that function call thermal_notify_tz_trip_change().

Note that trip point hysteresis can only be updated via sysfs and
trip_point_hyst_store() calls thermal_notify_tz_trip_change() already,
so this code path need not be changed.

Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Acked-by: Daniel Lezcano <daniel.lezcano@linaro.org>
---
 drivers/acpi/thermal.c                                     |  7 +++++--
 .../thermal/intel/int340x_thermal/int340x_thermal_zone.c   |  8 +++++---
 drivers/thermal/thermal_sysfs.c                            |  4 ++--
 drivers/thermal/thermal_trip.c                             | 14 +++++++++++++-
 include/linux/thermal.h                                    |  2 ++
 5 files changed, 27 insertions(+), 8 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/acpi/thermal.c b/drivers/acpi/thermal.c
index f74d81abdbfc..3e679e9feec5 100644
--- a/drivers/acpi/thermal.c
+++ b/drivers/acpi/thermal.c
@@ -297,6 +297,7 @@ static int acpi_thermal_adjust_trip(struct thermal_trip *trip, void *data)
 	struct acpi_thermal_trip *acpi_trip = trip->priv;
 	struct adjust_trip_data *atd = data;
 	struct acpi_thermal *tz = atd->tz;
+	int temp;
 
 	if (!acpi_trip || !acpi_thermal_trip_valid(acpi_trip))
 		return 0;
@@ -307,9 +308,11 @@ static int acpi_thermal_adjust_trip(struct thermal_trip *trip, void *data)
 		acpi_thermal_update_trip_devices(tz, trip);
 
 	if (acpi_thermal_trip_valid(acpi_trip))
-		trip->temperature = acpi_thermal_temp(tz, acpi_trip->temp_dk);
+		temp = acpi_thermal_temp(tz, acpi_trip->temp_dk);
 	else
-		trip->temperature = THERMAL_TEMP_INVALID;
+		temp = THERMAL_TEMP_INVALID;
+
+	thermal_zone_set_trip_temp(tz->thermal_zone, trip, temp);
 
 	return 0;
 }
diff --git a/drivers/thermal/intel/int340x_thermal/int340x_thermal_zone.c b/drivers/thermal/intel/int340x_thermal/int340x_thermal_zone.c
index a03b67579dd9..3e4bfe817fac 100644
--- a/drivers/thermal/intel/int340x_thermal/int340x_thermal_zone.c
+++ b/drivers/thermal/intel/int340x_thermal/int340x_thermal_zone.c
@@ -225,7 +225,8 @@ EXPORT_SYMBOL_GPL(int340x_thermal_zone_remove);
 
 static int int340x_update_one_trip(struct thermal_trip *trip, void *arg)
 {
-	struct acpi_device *zone_adev = arg;
+	struct int34x_thermal_zone *int34x_zone = arg;
+	struct acpi_device *zone_adev = int34x_zone->adev;
 	int temp, err;
 
 	switch (trip->type) {
@@ -249,14 +250,15 @@ static int int340x_update_one_trip(struct thermal_trip *trip, void *arg)
 	if (err)
 		temp = THERMAL_TEMP_INVALID;
 
-	trip->temperature = temp;
+	thermal_zone_set_trip_temp(int34x_zone->zone, trip, temp);
+
 	return 0;
 }
 
 void int340x_thermal_update_trips(struct int34x_thermal_zone *int34x_zone)
 {
 	thermal_zone_for_each_trip(int34x_zone->zone, int340x_update_one_trip,
-				   int34x_zone->adev);
+				   int34x_zone);
 }
 EXPORT_SYMBOL_GPL(int340x_thermal_update_trips);
 
diff --git a/drivers/thermal/thermal_sysfs.c b/drivers/thermal/thermal_sysfs.c
index f52af8a3b4b5..d8ff74a4338a 100644
--- a/drivers/thermal/thermal_sysfs.c
+++ b/drivers/thermal/thermal_sysfs.c
@@ -129,9 +129,9 @@ trip_point_temp_store(struct device *dev, struct device_attribute *attr,
 				goto unlock;
 		}
 
-		trip->temperature = temp;
+		thermal_zone_set_trip_temp(tz, trip, temp);
 
-		thermal_zone_trip_updated(tz, trip);
+		__thermal_zone_device_update(tz, THERMAL_TRIP_CHANGED);
 	}
 
 unlock:
diff --git a/drivers/thermal/thermal_trip.c b/drivers/thermal/thermal_trip.c
index 581f1cd883e5..a1ad345c0741 100644
--- a/drivers/thermal/thermal_trip.c
+++ b/drivers/thermal/thermal_trip.c
@@ -152,7 +152,6 @@ int thermal_zone_trip_id(struct thermal_zone_device *tz,
 	 */
 	return trip - tz->trips;
 }
-
 void thermal_zone_trip_updated(struct thermal_zone_device *tz,
 			       const struct thermal_trip *trip)
 {
@@ -161,3 +160,16 @@ void thermal_zone_trip_updated(struct thermal_zone_device *tz,
 				      trip->hysteresis);
 	__thermal_zone_device_update(tz, THERMAL_TRIP_CHANGED);
 }
+
+void thermal_zone_set_trip_temp(struct thermal_zone_device *tz,
+				struct thermal_trip *trip, int temp)
+{
+	if (trip->temperature == temp)
+		return;
+
+	trip->temperature = temp;
+	thermal_notify_tz_trip_change(tz->id, thermal_zone_trip_id(tz, trip),
+				      trip->type, trip->temperature,
+				      trip->hysteresis);
+}
+EXPORT_SYMBOL_GPL(thermal_zone_set_trip_temp);
diff --git a/include/linux/thermal.h b/include/linux/thermal.h
index bedbaec9a42e..09f6eb82c191 100644
--- a/include/linux/thermal.h
+++ b/include/linux/thermal.h
@@ -291,6 +291,8 @@ int thermal_zone_for_each_trip(struct thermal_zone_device *tz,
 			       int (*cb)(struct thermal_trip *, void *),
 			       void *data);
 int thermal_zone_get_num_trips(struct thermal_zone_device *tz);
+void thermal_zone_set_trip_temp(struct thermal_zone_device *tz,
+				struct thermal_trip *trip, int temp);
 
 int thermal_zone_get_crit_temp(struct thermal_zone_device *tz, int *temp);
 
-- 
cgit v1.2.3


From ad6534c626fedd818718d76c36d69c7d8e7b61cc Mon Sep 17 00:00:00 2001
From: Shuai Xue <xueshuai@linux.alibaba.com>
Date: Fri, 8 Dec 2023 10:56:49 +0800
Subject: PCI: Add Alibaba Vendor ID to linux/pci_ids.h

The Alibaba Vendor ID (0x1ded) is now used by Alibaba elasticRDMA ("erdma")
and will be shared with the upcoming PCIe PMU ("dwc_pcie_pmu"). Move the
Vendor ID to linux/pci_ids.h so that it can shared by several drivers
later.

Signed-off-by: Shuai Xue <xueshuai@linux.alibaba.com>
Acked-by: Bjorn Helgaas <bhelgaas@google.com>	# pci_ids.h
Tested-by: Ilkka Koskinen <ilkka@os.amperecomputing.com>
Link: https://lore.kernel.org/r/20231208025652.87192-3-xueshuai@linux.alibaba.com
Signed-off-by: Will Deacon <will@kernel.org>
---
 drivers/infiniband/hw/erdma/erdma_hw.h | 2 --
 include/linux/pci_ids.h                | 2 ++
 2 files changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/infiniband/hw/erdma/erdma_hw.h b/drivers/infiniband/hw/erdma/erdma_hw.h
index 9d316fdc6f9a..a155519a862f 100644
--- a/drivers/infiniband/hw/erdma/erdma_hw.h
+++ b/drivers/infiniband/hw/erdma/erdma_hw.h
@@ -11,8 +11,6 @@
 #include <linux/types.h>
 
 /* PCIe device related definition. */
-#define PCI_VENDOR_ID_ALIBABA 0x1ded
-
 #define ERDMA_PCI_WIDTH 64
 #define ERDMA_FUNC_BAR 0
 #define ERDMA_MISX_BAR 2
diff --git a/include/linux/pci_ids.h b/include/linux/pci_ids.h
index 275799b5f535..844ffdac8d7d 100644
--- a/include/linux/pci_ids.h
+++ b/include/linux/pci_ids.h
@@ -2605,6 +2605,8 @@
 #define PCI_VENDOR_ID_TEKRAM		0x1de1
 #define PCI_DEVICE_ID_TEKRAM_DC290	0xdc29
 
+#define PCI_VENDOR_ID_ALIBABA		0x1ded
+
 #define PCI_VENDOR_ID_TEHUTI		0x1fc9
 #define PCI_DEVICE_ID_TEHUTI_3009	0x3009
 #define PCI_DEVICE_ID_TEHUTI_3010	0x3010
-- 
cgit v1.2.3


From ac16087134b837d42b75bb1c741070b6c142f258 Mon Sep 17 00:00:00 2001
From: Shuai Xue <xueshuai@linux.alibaba.com>
Date: Fri, 8 Dec 2023 10:56:50 +0800
Subject: PCI: Move pci_clear_and_set_dword() helper to PCI header

The clear and set pattern is commonly used for accessing PCI config,
move the helper pci_clear_and_set_dword() from aspm.c into PCI header.
In addition, rename to pci_clear_and_set_config_dword() to retain the
"config" information and match the other accessors.

No functional change intended.

Signed-off-by: Shuai Xue <xueshuai@linux.alibaba.com>
Acked-by: Bjorn Helgaas <bhelgaas@google.com>
Tested-by: Ilkka Koskinen <ilkka@os.amperecomputing.com>
Link: https://lore.kernel.org/r/20231208025652.87192-4-xueshuai@linux.alibaba.com
Signed-off-by: Will Deacon <will@kernel.org>
---
 drivers/pci/access.c    | 12 +++++++++
 drivers/pci/pcie/aspm.c | 65 +++++++++++++++++++++++--------------------------
 include/linux/pci.h     |  2 ++
 3 files changed, 44 insertions(+), 35 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/pci/access.c b/drivers/pci/access.c
index 6554a2e89d36..6449056b57dd 100644
--- a/drivers/pci/access.c
+++ b/drivers/pci/access.c
@@ -598,3 +598,15 @@ int pci_write_config_dword(const struct pci_dev *dev, int where,
 	return pci_bus_write_config_dword(dev->bus, dev->devfn, where, val);
 }
 EXPORT_SYMBOL(pci_write_config_dword);
+
+void pci_clear_and_set_config_dword(const struct pci_dev *dev, int pos,
+				    u32 clear, u32 set)
+{
+	u32 val;
+
+	pci_read_config_dword(dev, pos, &val);
+	val &= ~clear;
+	val |= set;
+	pci_write_config_dword(dev, pos, val);
+}
+EXPORT_SYMBOL(pci_clear_and_set_config_dword);
diff --git a/drivers/pci/pcie/aspm.c b/drivers/pci/pcie/aspm.c
index 50b04ae5c394..a720a8e3ced2 100644
--- a/drivers/pci/pcie/aspm.c
+++ b/drivers/pci/pcie/aspm.c
@@ -426,17 +426,6 @@ static void pcie_aspm_check_latency(struct pci_dev *endpoint)
 	}
 }
 
-static void pci_clear_and_set_dword(struct pci_dev *pdev, int pos,
-				    u32 clear, u32 set)
-{
-	u32 val;
-
-	pci_read_config_dword(pdev, pos, &val);
-	val &= ~clear;
-	val |= set;
-	pci_write_config_dword(pdev, pos, val);
-}
-
 /* Calculate L1.2 PM substate timing parameters */
 static void aspm_calc_l12_info(struct pcie_link_state *link,
 				u32 parent_l1ss_cap, u32 child_l1ss_cap)
@@ -501,10 +490,12 @@ static void aspm_calc_l12_info(struct pcie_link_state *link,
 	cl1_2_enables = cctl1 & PCI_L1SS_CTL1_L1_2_MASK;
 
 	if (pl1_2_enables || cl1_2_enables) {
-		pci_clear_and_set_dword(child, child->l1ss + PCI_L1SS_CTL1,
-					PCI_L1SS_CTL1_L1_2_MASK, 0);
-		pci_clear_and_set_dword(parent, parent->l1ss + PCI_L1SS_CTL1,
-					PCI_L1SS_CTL1_L1_2_MASK, 0);
+		pci_clear_and_set_config_dword(child,
+					       child->l1ss + PCI_L1SS_CTL1,
+					       PCI_L1SS_CTL1_L1_2_MASK, 0);
+		pci_clear_and_set_config_dword(parent,
+					       parent->l1ss + PCI_L1SS_CTL1,
+					       PCI_L1SS_CTL1_L1_2_MASK, 0);
 	}
 
 	/* Program T_POWER_ON times in both ports */
@@ -512,22 +503,26 @@ static void aspm_calc_l12_info(struct pcie_link_state *link,
 	pci_write_config_dword(child, child->l1ss + PCI_L1SS_CTL2, ctl2);
 
 	/* Program Common_Mode_Restore_Time in upstream device */
-	pci_clear_and_set_dword(parent, parent->l1ss + PCI_L1SS_CTL1,
-				PCI_L1SS_CTL1_CM_RESTORE_TIME, ctl1);
+	pci_clear_and_set_config_dword(parent, parent->l1ss + PCI_L1SS_CTL1,
+				       PCI_L1SS_CTL1_CM_RESTORE_TIME, ctl1);
 
 	/* Program LTR_L1.2_THRESHOLD time in both ports */
-	pci_clear_and_set_dword(parent,	parent->l1ss + PCI_L1SS_CTL1,
-				PCI_L1SS_CTL1_LTR_L12_TH_VALUE |
-				PCI_L1SS_CTL1_LTR_L12_TH_SCALE, ctl1);
-	pci_clear_and_set_dword(child, child->l1ss + PCI_L1SS_CTL1,
-				PCI_L1SS_CTL1_LTR_L12_TH_VALUE |
-				PCI_L1SS_CTL1_LTR_L12_TH_SCALE, ctl1);
+	pci_clear_and_set_config_dword(parent, parent->l1ss + PCI_L1SS_CTL1,
+				       PCI_L1SS_CTL1_LTR_L12_TH_VALUE |
+				       PCI_L1SS_CTL1_LTR_L12_TH_SCALE,
+				       ctl1);
+	pci_clear_and_set_config_dword(child, child->l1ss + PCI_L1SS_CTL1,
+				       PCI_L1SS_CTL1_LTR_L12_TH_VALUE |
+				       PCI_L1SS_CTL1_LTR_L12_TH_SCALE,
+				       ctl1);
 
 	if (pl1_2_enables || cl1_2_enables) {
-		pci_clear_and_set_dword(parent, parent->l1ss + PCI_L1SS_CTL1, 0,
-					pl1_2_enables);
-		pci_clear_and_set_dword(child, child->l1ss + PCI_L1SS_CTL1, 0,
-					cl1_2_enables);
+		pci_clear_and_set_config_dword(parent,
+					       parent->l1ss + PCI_L1SS_CTL1, 0,
+					       pl1_2_enables);
+		pci_clear_and_set_config_dword(child,
+					       child->l1ss + PCI_L1SS_CTL1, 0,
+					       cl1_2_enables);
 	}
 }
 
@@ -687,10 +682,10 @@ static void pcie_config_aspm_l1ss(struct pcie_link_state *link, u32 state)
 	 */
 
 	/* Disable all L1 substates */
-	pci_clear_and_set_dword(child, child->l1ss + PCI_L1SS_CTL1,
-				PCI_L1SS_CTL1_L1SS_MASK, 0);
-	pci_clear_and_set_dword(parent, parent->l1ss + PCI_L1SS_CTL1,
-				PCI_L1SS_CTL1_L1SS_MASK, 0);
+	pci_clear_and_set_config_dword(child, child->l1ss + PCI_L1SS_CTL1,
+				       PCI_L1SS_CTL1_L1SS_MASK, 0);
+	pci_clear_and_set_config_dword(parent, parent->l1ss + PCI_L1SS_CTL1,
+				       PCI_L1SS_CTL1_L1SS_MASK, 0);
 	/*
 	 * If needed, disable L1, and it gets enabled later
 	 * in pcie_config_aspm_link().
@@ -713,10 +708,10 @@ static void pcie_config_aspm_l1ss(struct pcie_link_state *link, u32 state)
 		val |= PCI_L1SS_CTL1_PCIPM_L1_2;
 
 	/* Enable what we need to enable */
-	pci_clear_and_set_dword(parent, parent->l1ss + PCI_L1SS_CTL1,
-				PCI_L1SS_CTL1_L1SS_MASK, val);
-	pci_clear_and_set_dword(child, child->l1ss + PCI_L1SS_CTL1,
-				PCI_L1SS_CTL1_L1SS_MASK, val);
+	pci_clear_and_set_config_dword(parent, parent->l1ss + PCI_L1SS_CTL1,
+				       PCI_L1SS_CTL1_L1SS_MASK, val);
+	pci_clear_and_set_config_dword(child, child->l1ss + PCI_L1SS_CTL1,
+				       PCI_L1SS_CTL1_L1SS_MASK, val);
 }
 
 static void pcie_config_aspm_dev(struct pci_dev *pdev, u32 val)
diff --git a/include/linux/pci.h b/include/linux/pci.h
index 60ca768bc867..268c4bd98ef3 100644
--- a/include/linux/pci.h
+++ b/include/linux/pci.h
@@ -1239,6 +1239,8 @@ int pci_read_config_dword(const struct pci_dev *dev, int where, u32 *val);
 int pci_write_config_byte(const struct pci_dev *dev, int where, u8 val);
 int pci_write_config_word(const struct pci_dev *dev, int where, u16 val);
 int pci_write_config_dword(const struct pci_dev *dev, int where, u32 val);
+void pci_clear_and_set_config_dword(const struct pci_dev *dev, int pos,
+				    u32 clear, u32 set);
 
 int pcie_capability_read_word(struct pci_dev *dev, int pos, u16 *val);
 int pcie_capability_read_dword(struct pci_dev *dev, int pos, u32 *val);
-- 
cgit v1.2.3


From c82a1662d4548c454de5343b88f69b9fc82266b3 Mon Sep 17 00:00:00 2001
From: Heiner Kallweit <hkallweit1@gmail.com>
Date: Fri, 8 Dec 2023 23:56:41 +0100
Subject: leds: trigger: Remove unused function led_trigger_rename_static()

This function was added with a8df7b1ab70b ("leds: add led_trigger_rename
function") 11 yrs ago, but it has no users. So remove it.

Signed-off-by: Heiner Kallweit <hkallweit1@gmail.com>
Link: https://lore.kernel.org/r/d90f30be-f661-4db7-b0b5-d09d07a78a68@gmail.com
Signed-off-by: Lee Jones <lee@kernel.org>
---
 drivers/leds/led-triggers.c | 13 -------------
 include/linux/leds.h        | 17 -----------------
 2 files changed, 30 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/leds/led-triggers.c b/drivers/leds/led-triggers.c
index 6a5e1f41f9a4..bd59a14a4a90 100644
--- a/drivers/leds/led-triggers.c
+++ b/drivers/leds/led-triggers.c
@@ -269,19 +269,6 @@ void led_trigger_set_default(struct led_classdev *led_cdev)
 }
 EXPORT_SYMBOL_GPL(led_trigger_set_default);
 
-void led_trigger_rename_static(const char *name, struct led_trigger *trig)
-{
-	/* new name must be on a temporary string to prevent races */
-	BUG_ON(name == trig->name);
-
-	down_write(&triggers_list_lock);
-	/* this assumes that trig->name was originaly allocated to
-	 * non constant storage */
-	strcpy((char *)trig->name, name);
-	up_write(&triggers_list_lock);
-}
-EXPORT_SYMBOL_GPL(led_trigger_rename_static);
-
 /* LED Trigger Interface */
 
 int led_trigger_register(struct led_trigger *trig)
diff --git a/include/linux/leds.h b/include/linux/leds.h
index 1bdf7f5a0d7c..4754b02d3a2c 100644
--- a/include/linux/leds.h
+++ b/include/linux/leds.h
@@ -527,23 +527,6 @@ static inline void *led_get_trigger_data(struct led_classdev *led_cdev)
 	return led_cdev->trigger_data;
 }
 
-/**
- * led_trigger_rename_static - rename a trigger
- * @name: the new trigger name
- * @trig: the LED trigger to rename
- *
- * Change a LED trigger name by copying the string passed in
- * name into current trigger name, which MUST be large
- * enough for the new string.
- *
- * Note that name must NOT point to the same string used
- * during LED registration, as that could lead to races.
- *
- * This is meant to be used on triggers with statically
- * allocated name.
- */
-void led_trigger_rename_static(const char *name, struct led_trigger *trig);
-
 #define module_led_trigger(__led_trigger) \
 	module_driver(__led_trigger, led_trigger_register, \
 		      led_trigger_unregister)
-- 
cgit v1.2.3


From 595e52284d24adc376890d3fc93bdca4707d9aca Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Wed, 13 Dec 2023 08:58:15 -0700
Subject: io_uring/poll: don't enable lazy wake for POLLEXCLUSIVE

There are a few quirks around using lazy wake for poll unconditionally,
and one of them is related the EPOLLEXCLUSIVE. Those may trigger
exclusive wakeups, which wake a limited number of entries in the wait
queue. If that wake number is less than the number of entries someone is
waiting for (and that someone is also using DEFER_TASKRUN), then we can
get stuck waiting for more entries while we should be processing the ones
we already got.

If we're doing exclusive poll waits, flag the request as not being
compatible with lazy wakeups.

Reported-by: Pavel Begunkov <asml.silence@gmail.com>
Fixes: 6ce4a93dbb5b ("io_uring/poll: use IOU_F_TWQ_LAZY_WAKE for wakeups")
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/io_uring_types.h |  3 +++
 io_uring/poll.c                | 20 +++++++++++++++++---
 2 files changed, 20 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h
index 805bb635cdf5..239a4f68801b 100644
--- a/include/linux/io_uring_types.h
+++ b/include/linux/io_uring_types.h
@@ -434,6 +434,7 @@ enum {
 	/* keep async read/write and isreg together and in order */
 	REQ_F_SUPPORT_NOWAIT_BIT,
 	REQ_F_ISREG_BIT,
+	REQ_F_POLL_NO_LAZY_BIT,
 
 	/* not a real bit, just to check we're not overflowing the space */
 	__REQ_F_LAST_BIT,
@@ -501,6 +502,8 @@ enum {
 	REQ_F_CLEAR_POLLIN	= BIT(REQ_F_CLEAR_POLLIN_BIT),
 	/* hashed into ->cancel_hash_locked, protected by ->uring_lock */
 	REQ_F_HASH_LOCKED	= BIT(REQ_F_HASH_LOCKED_BIT),
+	/* don't use lazy poll wake for this request */
+	REQ_F_POLL_NO_LAZY	= BIT(REQ_F_POLL_NO_LAZY_BIT),
 };
 
 typedef void (*io_req_tw_func_t)(struct io_kiocb *req, struct io_tw_state *ts);
diff --git a/io_uring/poll.c b/io_uring/poll.c
index d38d05edb4fa..d59b74a99d4e 100644
--- a/io_uring/poll.c
+++ b/io_uring/poll.c
@@ -366,11 +366,16 @@ void io_poll_task_func(struct io_kiocb *req, struct io_tw_state *ts)
 
 static void __io_poll_execute(struct io_kiocb *req, int mask)
 {
+	unsigned flags = 0;
+
 	io_req_set_res(req, mask, 0);
 	req->io_task_work.func = io_poll_task_func;
 
 	trace_io_uring_task_add(req, mask);
-	__io_req_task_work_add(req, IOU_F_TWQ_LAZY_WAKE);
+
+	if (!(req->flags & REQ_F_POLL_NO_LAZY))
+		flags = IOU_F_TWQ_LAZY_WAKE;
+	__io_req_task_work_add(req, flags);
 }
 
 static inline void io_poll_execute(struct io_kiocb *req, int res)
@@ -526,10 +531,19 @@ static void __io_queue_proc(struct io_poll *poll, struct io_poll_table *pt,
 	poll->head = head;
 	poll->wait.private = (void *) wqe_private;
 
-	if (poll->events & EPOLLEXCLUSIVE)
+	if (poll->events & EPOLLEXCLUSIVE) {
+		/*
+		 * Exclusive waits may only wake a limited amount of entries
+		 * rather than all of them, this may interfere with lazy
+		 * wake if someone does wait(events > 1). Ensure we don't do
+		 * lazy wake for those, as we need to process each one as they
+		 * come in.
+		 */
+		req->flags |= REQ_F_POLL_NO_LAZY;
 		add_wait_queue_exclusive(head, &poll->wait);
-	else
+	} else {
 		add_wait_queue(head, &poll->wait);
+	}
 }
 
 static void io_poll_queue_proc(struct file *file, struct wait_queue_head *head,
-- 
cgit v1.2.3


From 3c6b0c1c28184038d90dffe8eb542bedcb8ccf98 Mon Sep 17 00:00:00 2001
From: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Date: Thu, 30 Nov 2023 14:27:29 +0100
Subject: srcu: Use try-lock lockdep annotation for NMI-safe access.

It is claimed that srcu_read_lock_nmisafe() NMI-safe. However it
triggers a lockdep if used from NMI because lockdep expects a deadlock
since nothing disables NMIs while the lock is acquired.

This is because commit f0f44752f5f61 ("rcu: Annotate SRCU's update-side
lockdep dependencies") annotates synchronize_srcu() as a write lock
usage. This helps to detect a deadlocks such as
	srcu_read_lock();
	synchronize_srcu();
	srcu_read_unlock();

The side effect is that the lock srcu_struct now has a USED usage in normal
contexts, so it conflicts with a USED_READ usage in NMI. But this shouldn't
cause a real deadlock because the write lock usage from synchronize_srcu()
is a fake one and only used for read/write deadlock detection.

Use a try-lock annotation for srcu_read_lock_nmisafe() to avoid lockdep
complains if used from NMI.

Fixes: f0f44752f5f6 ("rcu: Annotate SRCU's update-side lockdep dependencies")
Link: https://lore.kernel.org/r/20230927160231.XRCDDSK4@linutronix.de
Reviewed-by: Boqun Feng <boqun.feng@gmail.com>
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
Signed-off-by: Neeraj Upadhyay (AMD) <neeraj.iitr10@gmail.com>
---
 include/linux/rcupdate.h | 6 ++++++
 include/linux/srcu.h     | 2 +-
 2 files changed, 7 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h
index f7206b2623c9..31d523c4e089 100644
--- a/include/linux/rcupdate.h
+++ b/include/linux/rcupdate.h
@@ -301,6 +301,11 @@ static inline void rcu_lock_acquire(struct lockdep_map *map)
 	lock_acquire(map, 0, 0, 2, 0, NULL, _THIS_IP_);
 }
 
+static inline void rcu_try_lock_acquire(struct lockdep_map *map)
+{
+	lock_acquire(map, 0, 1, 2, 0, NULL, _THIS_IP_);
+}
+
 static inline void rcu_lock_release(struct lockdep_map *map)
 {
 	lock_release(map, _THIS_IP_);
@@ -315,6 +320,7 @@ int rcu_read_lock_any_held(void);
 #else /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */
 
 # define rcu_lock_acquire(a)		do { } while (0)
+# define rcu_try_lock_acquire(a)	do { } while (0)
 # define rcu_lock_release(a)		do { } while (0)
 
 static inline int rcu_read_lock_held(void)
diff --git a/include/linux/srcu.h b/include/linux/srcu.h
index 127ef3b2e607..236610e4a8fa 100644
--- a/include/linux/srcu.h
+++ b/include/linux/srcu.h
@@ -229,7 +229,7 @@ static inline int srcu_read_lock_nmisafe(struct srcu_struct *ssp) __acquires(ssp
 
 	srcu_check_nmi_safety(ssp, true);
 	retval = __srcu_read_lock_nmisafe(ssp);
-	rcu_lock_acquire(&ssp->dep_map);
+	rcu_try_lock_acquire(&ssp->dep_map);
 	return retval;
 }
 
-- 
cgit v1.2.3


From 493dffa3ab07b5d2c0b7bd5de5bff6e85f01f52a Mon Sep 17 00:00:00 2001
From: Philipp Stanner <pstanner@redhat.com>
Date: Wed, 20 Sep 2023 11:22:12 +0200
Subject: rculist.h: docs: Fix wrong function summary

The brief summary in the docstring for function list_next_or_null_rcu()
states that the function is supposed to provide the "first" member of a
list, whereas in truth it returns the next member.

Change the docstring so it describes what the function actually does.

Signed-off-by: Philipp Stanner <pstanner@redhat.com>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
Signed-off-by: Neeraj Upadhyay (AMD) <neeraj.iitr10@gmail.com>
---
 include/linux/rculist.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/rculist.h b/include/linux/rculist.h
index d29740be4833..3dc1e58865f7 100644
--- a/include/linux/rculist.h
+++ b/include/linux/rculist.h
@@ -355,7 +355,7 @@ static inline void list_splice_tail_init_rcu(struct list_head *list,
 })
 
 /**
- * list_next_or_null_rcu - get the first element from a list
+ * list_next_or_null_rcu - get the next element from a list
  * @head:	the head for the list.
  * @ptr:        the list head to take the next element from.
  * @type:       the type of the struct this is embedded in.
-- 
cgit v1.2.3


From 750e785796bb72423b97cac21ecd0fa3b3b65610 Mon Sep 17 00:00:00 2001
From: Jie Jiang <jiejiang@chromium.org>
Date: Tue, 12 Dec 2023 09:39:23 +0000
Subject: bpf: Support uid and gid when mounting bpffs

Parse uid and gid in bpf_parse_param() so that they can be passed in as
the `data` parameter when mount() bpffs. This will be useful when we
want to control which user/group has the control to the mounted bpffs,
otherwise a separate chown() call will be needed.

Signed-off-by: Jie Jiang <jiejiang@chromium.org>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Acked-by: Mike Frysinger <vapier@chromium.org>
Acked-by: Christian Brauner <brauner@kernel.org>
Acked-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/bpf/20231212093923.497838-1-jiejiang@chromium.org
---
 include/linux/bpf.h |  2 ++
 kernel/bpf/inode.c  | 50 +++++++++++++++++++++++++++++++++++++++++++++++++-
 2 files changed, 51 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 0bd4889e917a..c87c608a3689 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -1595,6 +1595,8 @@ struct bpf_link_primer {
 };
 
 struct bpf_mount_opts {
+	kuid_t uid;
+	kgid_t gid;
 	umode_t mode;
 
 	/* BPF token-related delegation options */
diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c
index 5359a0929c35..0a8e1188ea46 100644
--- a/kernel/bpf/inode.c
+++ b/kernel/bpf/inode.c
@@ -601,9 +601,16 @@ EXPORT_SYMBOL(bpf_prog_get_type_path);
 static int bpf_show_options(struct seq_file *m, struct dentry *root)
 {
 	struct bpf_mount_opts *opts = root->d_sb->s_fs_info;
-	umode_t mode = d_inode(root)->i_mode & S_IALLUGO & ~S_ISVTX;
+	struct inode *inode = d_inode(root);
+	umode_t mode = inode->i_mode & S_IALLUGO & ~S_ISVTX;
 	u64 mask;
 
+	if (!uid_eq(inode->i_uid, GLOBAL_ROOT_UID))
+		seq_printf(m, ",uid=%u",
+			   from_kuid_munged(&init_user_ns, inode->i_uid));
+	if (!gid_eq(inode->i_gid, GLOBAL_ROOT_GID))
+		seq_printf(m, ",gid=%u",
+			   from_kgid_munged(&init_user_ns, inode->i_gid));
 	if (mode != S_IRWXUGO)
 		seq_printf(m, ",mode=%o", mode);
 
@@ -652,6 +659,8 @@ const struct super_operations bpf_super_ops = {
 };
 
 enum {
+	OPT_UID,
+	OPT_GID,
 	OPT_MODE,
 	OPT_DELEGATE_CMDS,
 	OPT_DELEGATE_MAPS,
@@ -660,6 +669,8 @@ enum {
 };
 
 static const struct fs_parameter_spec bpf_fs_parameters[] = {
+	fsparam_u32	("uid",				OPT_UID),
+	fsparam_u32	("gid",				OPT_GID),
 	fsparam_u32oct	("mode",			OPT_MODE),
 	fsparam_string	("delegate_cmds",		OPT_DELEGATE_CMDS),
 	fsparam_string	("delegate_maps",		OPT_DELEGATE_MAPS),
@@ -672,6 +683,8 @@ static int bpf_parse_param(struct fs_context *fc, struct fs_parameter *param)
 {
 	struct bpf_mount_opts *opts = fc->s_fs_info;
 	struct fs_parse_result result;
+	kuid_t uid;
+	kgid_t gid;
 	int opt, err;
 	u64 msk;
 
@@ -694,6 +707,34 @@ static int bpf_parse_param(struct fs_context *fc, struct fs_parameter *param)
 	}
 
 	switch (opt) {
+	case OPT_UID:
+		uid = make_kuid(current_user_ns(), result.uint_32);
+		if (!uid_valid(uid))
+			goto bad_value;
+
+		/*
+		 * The requested uid must be representable in the
+		 * filesystem's idmapping.
+		 */
+		if (!kuid_has_mapping(fc->user_ns, uid))
+			goto bad_value;
+
+		opts->uid = uid;
+		break;
+	case OPT_GID:
+		gid = make_kgid(current_user_ns(), result.uint_32);
+		if (!gid_valid(gid))
+			goto bad_value;
+
+		/*
+		 * The requested gid must be representable in the
+		 * filesystem's idmapping.
+		 */
+		if (!kgid_has_mapping(fc->user_ns, gid))
+			goto bad_value;
+
+		opts->gid = gid;
+		break;
 	case OPT_MODE:
 		opts->mode = result.uint_32 & S_IALLUGO;
 		break;
@@ -722,6 +763,9 @@ static int bpf_parse_param(struct fs_context *fc, struct fs_parameter *param)
 	}
 
 	return 0;
+
+bad_value:
+	return invalfc(fc, "Bad value for '%s'", param->key);
 }
 
 struct bpf_preload_ops *bpf_preload_ops;
@@ -808,6 +852,8 @@ static int bpf_fill_super(struct super_block *sb, struct fs_context *fc)
 	sb->s_op = &bpf_super_ops;
 
 	inode = sb->s_root->d_inode;
+	inode->i_uid = opts->uid;
+	inode->i_gid = opts->gid;
 	inode->i_op = &bpf_dir_iops;
 	inode->i_mode &= ~S_IALLUGO;
 	populate_bpffs(sb->s_root);
@@ -843,6 +889,8 @@ static int bpf_init_fs_context(struct fs_context *fc)
 		return -ENOMEM;
 
 	opts->mode = S_IRWXUGO;
+	opts->uid = current_fsuid();
+	opts->gid = current_fsgid();
 
 	/* start out with no BPF token delegation enabled */
 	opts->delegate_cmds = 0;
-- 
cgit v1.2.3


From 537fec0733c4a72e2a2b69fee365459c5b75d92e Mon Sep 17 00:00:00 2001
From: Larysa Zaremba <larysa.zaremba@intel.com>
Date: Tue, 5 Dec 2023 22:08:42 +0100
Subject: net: make vlan_get_tag() return -ENODATA instead of -EINVAL

__vlan_hwaccel_get_tag() is used in veth XDP hints implementation,
its return value (-EINVAL if skb is not VLAN tagged) is passed to bpf code,
but XDP hints specification requires drivers to return -ENODATA, if a hint
cannot be provided for a particular packet.

Solve this inconsistency by changing error return value of
__vlan_hwaccel_get_tag() from -EINVAL to -ENODATA, do the same thing to
__vlan_get_tag(), because this function is supposed to follow the same
convention. This, in turn, makes -ENODATA the only non-zero value
vlan_get_tag() can return. We can do this with no side effects, because
none of the users of the 3 above-mentioned functions rely on the exact
value.

Suggested-by: Jesper Dangaard Brouer <jbrouer@redhat.com>
Acked-by: Stanislav Fomichev <sdf@google.com>
Signed-off-by: Larysa Zaremba <larysa.zaremba@intel.com>
Link: https://lore.kernel.org/r/20231205210847.28460-14-larysa.zaremba@intel.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/if_vlan.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/if_vlan.h b/include/linux/if_vlan.h
index 3028af87716e..c1645c86eed9 100644
--- a/include/linux/if_vlan.h
+++ b/include/linux/if_vlan.h
@@ -540,7 +540,7 @@ static inline int __vlan_get_tag(const struct sk_buff *skb, u16 *vlan_tci)
 	struct vlan_ethhdr *veth = skb_vlan_eth_hdr(skb);
 
 	if (!eth_type_vlan(veth->h_vlan_proto))
-		return -EINVAL;
+		return -ENODATA;
 
 	*vlan_tci = ntohs(veth->h_vlan_TCI);
 	return 0;
@@ -561,7 +561,7 @@ static inline int __vlan_hwaccel_get_tag(const struct sk_buff *skb,
 		return 0;
 	} else {
 		*vlan_tci = 0;
-		return -EINVAL;
+		return -ENODATA;
 	}
 }
 
-- 
cgit v1.2.3


From 7978bad4b6b9265a1e808a5f679ee428d1dd6523 Mon Sep 17 00:00:00 2001
From: Larysa Zaremba <larysa.zaremba@intel.com>
Date: Tue, 5 Dec 2023 22:08:43 +0100
Subject: mlx5: implement VLAN tag XDP hint

Implement the newly added .xmo_rx_vlan_tag() hint function.

Reviewed-by: Tariq Toukan <tariqt@nvidia.com>
Signed-off-by: Larysa Zaremba <larysa.zaremba@intel.com>
Acked-by: Jesper Dangaard Brouer <hawk@kernel.org>
Link: https://lore.kernel.org/r/20231205210847.28460-15-larysa.zaremba@intel.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 drivers/net/ethernet/mellanox/mlx5/core/en/xdp.c | 15 +++++++++++++++
 include/linux/mlx5/device.h                      |  2 +-
 2 files changed, 16 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/xdp.c b/drivers/net/ethernet/mellanox/mlx5/core/en/xdp.c
index e2e7d82cfca4..9e695ed122ee 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en/xdp.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en/xdp.c
@@ -256,9 +256,24 @@ static int mlx5e_xdp_rx_hash(const struct xdp_md *ctx, u32 *hash,
 	return 0;
 }
 
+static int mlx5e_xdp_rx_vlan_tag(const struct xdp_md *ctx, __be16 *vlan_proto,
+				 u16 *vlan_tci)
+{
+	const struct mlx5e_xdp_buff *_ctx = (void *)ctx;
+	const struct mlx5_cqe64 *cqe = _ctx->cqe;
+
+	if (!cqe_has_vlan(cqe))
+		return -ENODATA;
+
+	*vlan_proto = htons(ETH_P_8021Q);
+	*vlan_tci = be16_to_cpu(cqe->vlan_info);
+	return 0;
+}
+
 const struct xdp_metadata_ops mlx5e_xdp_metadata_ops = {
 	.xmo_rx_timestamp		= mlx5e_xdp_rx_timestamp,
 	.xmo_rx_hash			= mlx5e_xdp_rx_hash,
+	.xmo_rx_vlan_tag		= mlx5e_xdp_rx_vlan_tag,
 };
 
 struct mlx5e_xsk_tx_complete {
diff --git a/include/linux/mlx5/device.h b/include/linux/mlx5/device.h
index 820bca965fb6..01275c6e8468 100644
--- a/include/linux/mlx5/device.h
+++ b/include/linux/mlx5/device.h
@@ -918,7 +918,7 @@ static inline u8 get_cqe_tls_offload(struct mlx5_cqe64 *cqe)
 	return (cqe->tls_outer_l3_tunneled >> 3) & 0x3;
 }
 
-static inline bool cqe_has_vlan(struct mlx5_cqe64 *cqe)
+static inline bool cqe_has_vlan(const struct mlx5_cqe64 *cqe)
 {
 	return cqe->l4_l3_hdr_type & 0x1;
 }
-- 
cgit v1.2.3


From 0c476157085fe2ad13b9bec70ea672e86647fa1a Mon Sep 17 00:00:00 2001
From: Oleksij Rempel <o.rempel@pengutronix.de>
Date: Tue, 12 Dec 2023 06:41:43 +0100
Subject: net: phy: c45: add genphy_c45_pma_read_ext_abilities() function

Move part of the genphy_c45_pma_read_abilities() code to a separate
function.

Some PHYs do not implement PMA/PMD status 2 register (Register 1.8) but
do implement PMA/PMD extended ability register (Register 1.11). To make
use of it, we need to be able to access this part of code separately.

Signed-off-by: Oleksij Rempel <o.rempel@pengutronix.de>
Reviewed-by: Andrew Lunn <andrew@lunn.ch>
Reviewed-by: Russell King (Oracle) <rmk+kernel@armlinux.org.uk>
Link: https://lore.kernel.org/r/20231212054144.87527-2-o.rempel@pengutronix.de
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 drivers/net/phy/phy-c45.c | 129 ++++++++++++++++++++++++++--------------------
 include/linux/phy.h       |   1 +
 2 files changed, 75 insertions(+), 55 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/phy/phy-c45.c b/drivers/net/phy/phy-c45.c
index 8e6fd4962c48..747d14bf152c 100644
--- a/drivers/net/phy/phy-c45.c
+++ b/drivers/net/phy/phy-c45.c
@@ -919,6 +919,79 @@ int genphy_c45_pma_baset1_read_abilities(struct phy_device *phydev)
 }
 EXPORT_SYMBOL_GPL(genphy_c45_pma_baset1_read_abilities);
 
+/**
+ * genphy_c45_pma_read_ext_abilities - read supported link modes from PMA
+ * @phydev: target phy_device struct
+ *
+ * Read the supported link modes from the PMA/PMD extended ability register
+ * (Register 1.11).
+ */
+int genphy_c45_pma_read_ext_abilities(struct phy_device *phydev)
+{
+	int val;
+
+	val = phy_read_mmd(phydev, MDIO_MMD_PMAPMD, MDIO_PMA_EXTABLE);
+	if (val < 0)
+		return val;
+
+	linkmode_mod_bit(ETHTOOL_LINK_MODE_10000baseLRM_Full_BIT,
+			 phydev->supported,
+			 val & MDIO_PMA_EXTABLE_10GBLRM);
+	linkmode_mod_bit(ETHTOOL_LINK_MODE_10000baseT_Full_BIT,
+			 phydev->supported,
+			 val & MDIO_PMA_EXTABLE_10GBT);
+	linkmode_mod_bit(ETHTOOL_LINK_MODE_10000baseKX4_Full_BIT,
+			 phydev->supported,
+			 val & MDIO_PMA_EXTABLE_10GBKX4);
+	linkmode_mod_bit(ETHTOOL_LINK_MODE_10000baseKR_Full_BIT,
+			 phydev->supported,
+			 val & MDIO_PMA_EXTABLE_10GBKR);
+	linkmode_mod_bit(ETHTOOL_LINK_MODE_1000baseT_Full_BIT,
+			 phydev->supported,
+			 val & MDIO_PMA_EXTABLE_1000BT);
+	linkmode_mod_bit(ETHTOOL_LINK_MODE_1000baseKX_Full_BIT,
+			 phydev->supported,
+			 val & MDIO_PMA_EXTABLE_1000BKX);
+
+	linkmode_mod_bit(ETHTOOL_LINK_MODE_100baseT_Full_BIT,
+			 phydev->supported,
+			 val & MDIO_PMA_EXTABLE_100BTX);
+	linkmode_mod_bit(ETHTOOL_LINK_MODE_100baseT_Half_BIT,
+			 phydev->supported,
+			 val & MDIO_PMA_EXTABLE_100BTX);
+
+	linkmode_mod_bit(ETHTOOL_LINK_MODE_10baseT_Full_BIT,
+			 phydev->supported,
+			 val & MDIO_PMA_EXTABLE_10BT);
+	linkmode_mod_bit(ETHTOOL_LINK_MODE_10baseT_Half_BIT,
+			 phydev->supported,
+			 val & MDIO_PMA_EXTABLE_10BT);
+
+	if (val & MDIO_PMA_EXTABLE_NBT) {
+		val = phy_read_mmd(phydev, MDIO_MMD_PMAPMD,
+				   MDIO_PMA_NG_EXTABLE);
+		if (val < 0)
+			return val;
+
+		linkmode_mod_bit(ETHTOOL_LINK_MODE_2500baseT_Full_BIT,
+				 phydev->supported,
+				 val & MDIO_PMA_NG_EXTABLE_2_5GBT);
+
+		linkmode_mod_bit(ETHTOOL_LINK_MODE_5000baseT_Full_BIT,
+				 phydev->supported,
+				 val & MDIO_PMA_NG_EXTABLE_5GBT);
+	}
+
+	if (val & MDIO_PMA_EXTABLE_BT1) {
+		val = genphy_c45_pma_baset1_read_abilities(phydev);
+		if (val < 0)
+			return val;
+	}
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(genphy_c45_pma_read_ext_abilities);
+
 /**
  * genphy_c45_pma_read_abilities - read supported link modes from PMA
  * @phydev: target phy_device struct
@@ -962,63 +1035,9 @@ int genphy_c45_pma_read_abilities(struct phy_device *phydev)
 			 val & MDIO_PMA_STAT2_10GBER);
 
 	if (val & MDIO_PMA_STAT2_EXTABLE) {
-		val = phy_read_mmd(phydev, MDIO_MMD_PMAPMD, MDIO_PMA_EXTABLE);
+		val = genphy_c45_pma_read_ext_abilities(phydev);
 		if (val < 0)
 			return val;
-
-		linkmode_mod_bit(ETHTOOL_LINK_MODE_10000baseLRM_Full_BIT,
-				 phydev->supported,
-				 val & MDIO_PMA_EXTABLE_10GBLRM);
-		linkmode_mod_bit(ETHTOOL_LINK_MODE_10000baseT_Full_BIT,
-				 phydev->supported,
-				 val & MDIO_PMA_EXTABLE_10GBT);
-		linkmode_mod_bit(ETHTOOL_LINK_MODE_10000baseKX4_Full_BIT,
-				 phydev->supported,
-				 val & MDIO_PMA_EXTABLE_10GBKX4);
-		linkmode_mod_bit(ETHTOOL_LINK_MODE_10000baseKR_Full_BIT,
-				 phydev->supported,
-				 val & MDIO_PMA_EXTABLE_10GBKR);
-		linkmode_mod_bit(ETHTOOL_LINK_MODE_1000baseT_Full_BIT,
-				 phydev->supported,
-				 val & MDIO_PMA_EXTABLE_1000BT);
-		linkmode_mod_bit(ETHTOOL_LINK_MODE_1000baseKX_Full_BIT,
-				 phydev->supported,
-				 val & MDIO_PMA_EXTABLE_1000BKX);
-
-		linkmode_mod_bit(ETHTOOL_LINK_MODE_100baseT_Full_BIT,
-				 phydev->supported,
-				 val & MDIO_PMA_EXTABLE_100BTX);
-		linkmode_mod_bit(ETHTOOL_LINK_MODE_100baseT_Half_BIT,
-				 phydev->supported,
-				 val & MDIO_PMA_EXTABLE_100BTX);
-
-		linkmode_mod_bit(ETHTOOL_LINK_MODE_10baseT_Full_BIT,
-				 phydev->supported,
-				 val & MDIO_PMA_EXTABLE_10BT);
-		linkmode_mod_bit(ETHTOOL_LINK_MODE_10baseT_Half_BIT,
-				 phydev->supported,
-				 val & MDIO_PMA_EXTABLE_10BT);
-
-		if (val & MDIO_PMA_EXTABLE_NBT) {
-			val = phy_read_mmd(phydev, MDIO_MMD_PMAPMD,
-					   MDIO_PMA_NG_EXTABLE);
-			if (val < 0)
-				return val;
-
-			linkmode_mod_bit(ETHTOOL_LINK_MODE_2500baseT_Full_BIT,
-					 phydev->supported,
-					 val & MDIO_PMA_NG_EXTABLE_2_5GBT);
-
-			linkmode_mod_bit(ETHTOOL_LINK_MODE_5000baseT_Full_BIT,
-					 phydev->supported,
-					 val & MDIO_PMA_NG_EXTABLE_5GBT);
-		}
-
-		if (val & MDIO_PMA_EXTABLE_BT1) {
-			val = genphy_c45_pma_baset1_read_abilities(phydev);
-			if (val < 0)
-				return val;
-		}
 	}
 
 	/* This is optional functionality. If not supported, we may get an error
diff --git a/include/linux/phy.h b/include/linux/phy.h
index 6e7ebcc50b85..dbb5e13e3e1b 100644
--- a/include/linux/phy.h
+++ b/include/linux/phy.h
@@ -1866,6 +1866,7 @@ int genphy_c45_an_config_aneg(struct phy_device *phydev);
 int genphy_c45_an_disable_aneg(struct phy_device *phydev);
 int genphy_c45_read_mdix(struct phy_device *phydev);
 int genphy_c45_pma_read_abilities(struct phy_device *phydev);
+int genphy_c45_pma_read_ext_abilities(struct phy_device *phydev);
 int genphy_c45_pma_baset1_read_abilities(struct phy_device *phydev);
 int genphy_c45_read_eee_abilities(struct phy_device *phydev);
 int genphy_c45_pma_baset1_read_master_slave(struct phy_device *phydev);
-- 
cgit v1.2.3


From 13049408a4bd29c92227ca2d6befab80dbb96663 Mon Sep 17 00:00:00 2001
From: Tariq Toukan <tariqt@nvidia.com>
Date: Sun, 7 May 2023 16:47:42 +0300
Subject: net/mlx5: Add mlx5_ifc bits used for supporting single netdev
 Socket-Direct

Multiple device caps and features are required to support
single netdev Socket-Direct.
Add them here in preparation for the feature implementation.

Signed-off-by: Tariq Toukan <tariqt@nvidia.com>
Reviewed-by: Gal Pressman <gal@nvidia.com>
Signed-off-by: Saeed Mahameed <saeedm@nvidia.com>
---
 include/linux/mlx5/mlx5_ifc.h | 31 +++++++++++++++++++++++++------
 1 file changed, 25 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h
index ce2e71cd6d2a..405d141b4a08 100644
--- a/include/linux/mlx5/mlx5_ifc.h
+++ b/include/linux/mlx5/mlx5_ifc.h
@@ -435,7 +435,7 @@ struct mlx5_ifc_flow_table_prop_layout_bits {
 	u8         flow_table_modify[0x1];
 	u8         reformat[0x1];
 	u8         decap[0x1];
-	u8         reserved_at_9[0x1];
+	u8         reset_root_to_default[0x1];
 	u8         pop_vlan[0x1];
 	u8         push_vlan[0x1];
 	u8         reserved_at_c[0x1];
@@ -1801,7 +1801,8 @@ struct mlx5_ifc_cmd_hca_cap_bits {
 	u8         disable_local_lb_uc[0x1];
 	u8         disable_local_lb_mc[0x1];
 	u8         log_min_hairpin_wq_data_sz[0x5];
-	u8         reserved_at_3e8[0x2];
+	u8         reserved_at_3e8[0x1];
+	u8         silent_mode[0x1];
 	u8         vhca_state[0x1];
 	u8         log_max_vlan_list[0x5];
 	u8         reserved_at_3f0[0x3];
@@ -1818,7 +1819,7 @@ struct mlx5_ifc_cmd_hca_cap_bits {
 
 	u8         reserved_at_460[0x1];
 	u8         ats[0x1];
-	u8         reserved_at_462[0x1];
+	u8         cross_vhca_rqt[0x1];
 	u8         log_max_uctx[0x5];
 	u8         reserved_at_468[0x1];
 	u8         crypto[0x1];
@@ -1943,6 +1944,7 @@ struct mlx5_ifc_cmd_hca_cap_bits {
 
 enum {
 	MLX5_CROSS_VHCA_OBJ_TO_OBJ_SUPPORTED_LOCAL_FLOW_TABLE_TO_REMOTE_FLOW_TABLE_MISS  = 0x80000,
+	MLX5_CROSS_VHCA_OBJ_TO_OBJ_SUPPORTED_LOCAL_FLOW_TABLE_ROOT_TO_REMOTE_FLOW_TABLE  = (1ULL << 20),
 };
 
 enum {
@@ -1992,7 +1994,11 @@ struct mlx5_ifc_cmd_hca_cap_2_bits {
 	u8	   reserved_at_260[0x120];
 	u8	   reserved_at_380[0x10];
 	u8	   ec_vf_vport_base[0x10];
-	u8	   reserved_at_3a0[0x460];
+
+	u8	   reserved_at_3a0[0x10];
+	u8	   max_rqt_vhca_id[0x10];
+
+	u8	   reserved_at_3c0[0x440];
 };
 
 enum mlx5_ifc_flow_destination_type {
@@ -2151,6 +2157,13 @@ struct mlx5_ifc_rq_num_bits {
 	u8         rq_num[0x18];
 };
 
+struct mlx5_ifc_rq_vhca_bits {
+	u8         reserved_at_0[0x8];
+	u8         rq_num[0x18];
+	u8         reserved_at_20[0x10];
+	u8         rq_vhca_id[0x10];
+};
+
 struct mlx5_ifc_mac_address_layout_bits {
 	u8         reserved_at_0[0x10];
 	u8         mac_addr_47_32[0x10];
@@ -3901,7 +3914,10 @@ struct mlx5_ifc_rqtc_bits {
 
 	u8    reserved_at_e0[0x6a0];
 
-	struct mlx5_ifc_rq_num_bits rq_num[];
+	union {
+		DECLARE_FLEX_ARRAY(struct mlx5_ifc_rq_num_bits, rq_num);
+		DECLARE_FLEX_ARRAY(struct mlx5_ifc_rq_vhca_bits, rq_vhca);
+	};
 };
 
 enum {
@@ -4744,7 +4760,10 @@ struct mlx5_ifc_set_l2_table_entry_in_bits {
 
 	u8         reserved_at_c0[0x20];
 
-	u8         reserved_at_e0[0x13];
+	u8         reserved_at_e0[0x10];
+	u8         silent_mode_valid[0x1];
+	u8         silent_mode[0x1];
+	u8         reserved_at_f2[0x1];
 	u8         vlan_valid[0x1];
 	u8         vlan[0xc];
 
-- 
cgit v1.2.3


From f5e956329960903d908668d7a20bbc08e0a8b92b Mon Sep 17 00:00:00 2001
From: Tariq Toukan <tariqt@nvidia.com>
Date: Mon, 7 Aug 2023 09:05:34 +0300
Subject: net/mlx5: Expose Management PCIe Index Register (MPIR)

MPIR register allows to query the PCIe indexes
and Socket-Direct related parameters.

Signed-off-by: Tariq Toukan <tariqt@nvidia.com>
Reviewed-by: Gal Pressman <gal@nvidia.com>
Signed-off-by: Saeed Mahameed <saeedm@nvidia.com>
---
 drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h |  1 +
 drivers/net/ethernet/mellanox/mlx5/core/port.c      | 10 ++++++++++
 include/linux/mlx5/driver.h                         |  1 +
 include/linux/mlx5/mlx5_ifc.h                       | 14 ++++++++++++++
 4 files changed, 26 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h b/drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h
index 6b14e347d914..a79b7959361b 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h
@@ -243,6 +243,7 @@ int mlx5_query_mcam_reg(struct mlx5_core_dev *dev, u32 *mcap, u8 feature_group,
 			u8 access_reg_group);
 int mlx5_query_qcam_reg(struct mlx5_core_dev *mdev, u32 *qcam,
 			u8 feature_group, u8 access_reg_group);
+int mlx5_query_mpir_reg(struct mlx5_core_dev *dev, u32 *mpir);
 
 void mlx5_lag_add_netdev(struct mlx5_core_dev *dev, struct net_device *netdev);
 void mlx5_lag_remove_netdev(struct mlx5_core_dev *dev, struct net_device *netdev);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/port.c b/drivers/net/ethernet/mellanox/mlx5/core/port.c
index 7d8c732818f2..7fba1c46e2ac 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/port.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/port.c
@@ -1206,3 +1206,13 @@ int mlx5_port_max_linkspeed(struct mlx5_core_dev *mdev, u32 *speed)
 	*speed = max_speed;
 	return 0;
 }
+
+int mlx5_query_mpir_reg(struct mlx5_core_dev *dev, u32 *mpir)
+{
+	u32 in[MLX5_ST_SZ_DW(mpir_reg)] = {};
+	int sz = MLX5_ST_SZ_BYTES(mpir_reg);
+
+	MLX5_SET(mpir_reg, in, local_port, 1);
+
+	return mlx5_core_access_reg(dev, in, sz, mpir, sz, MLX5_REG_MPIR, 0, 0);
+}
diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h
index d2b8d4a74a30..2f67cec1a898 100644
--- a/include/linux/mlx5/driver.h
+++ b/include/linux/mlx5/driver.h
@@ -150,6 +150,7 @@ enum {
 	MLX5_REG_MTPPSE		 = 0x9054,
 	MLX5_REG_MTUTC		 = 0x9055,
 	MLX5_REG_MPEGC		 = 0x9056,
+	MLX5_REG_MPIR		 = 0x9059,
 	MLX5_REG_MCQS		 = 0x9060,
 	MLX5_REG_MCQI		 = 0x9061,
 	MLX5_REG_MCC		 = 0x9062,
diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h
index 405d141b4a08..828938368fb7 100644
--- a/include/linux/mlx5/mlx5_ifc.h
+++ b/include/linux/mlx5/mlx5_ifc.h
@@ -10108,6 +10108,20 @@ struct mlx5_ifc_mpegc_reg_bits {
 	u8         reserved_at_60[0x100];
 };
 
+struct mlx5_ifc_mpir_reg_bits {
+	u8         sdm[0x1];
+	u8         reserved_at_1[0x1b];
+	u8         host_buses[0x4];
+
+	u8         reserved_at_20[0x20];
+
+	u8         local_port[0x8];
+	u8         reserved_at_28[0x15];
+	u8         sd_group[0x3];
+
+	u8         reserved_at_60[0x20];
+};
+
 enum {
 	MLX5_MTUTC_FREQ_ADJ_UNITS_PPB          = 0x0,
 	MLX5_MTUTC_FREQ_ADJ_UNITS_SCALED_PPM   = 0x1,
-- 
cgit v1.2.3


From b25bd37c859f32e50a436ab9d2078b76e433008e Mon Sep 17 00:00:00 2001
From: Tariq Toukan <tariqt@nvidia.com>
Date: Sun, 6 Aug 2023 14:01:10 +0300
Subject: net/mlx5: Move TISes from priv to mdev HW resources

The transport interface send (TIS) object is responsible for performing
all transport related operations of the transmit side. Messages from
Send Queues get segmented and transmitted by the TIS including all
transport required implications, e.g. in the case of large send offload,
the TIS is responsible for the segmentation.

These are stateless objects and can be used by multiple netdevs (e.g.
representors) who share the same core device.

Providing the TISes as a service from the core layer to the netdev layer
reduces the number of replecated TIS objects (in case of multiple
netdevs), and will ease the transition to netdev with multiple mdevs.

Signed-off-by: Tariq Toukan <tariqt@nvidia.com>
Reviewed-by: Gal Pressman <gal@nvidia.com>
Signed-off-by: Saeed Mahameed <saeedm@nvidia.com>
---
 drivers/net/ethernet/mellanox/mlx5/core/en.h       |  19 ++--
 drivers/net/ethernet/mellanox/mlx5/core/en/ptp.c   |   6 +-
 drivers/net/ethernet/mellanox/mlx5/core/en/ptp.h   |   2 +-
 drivers/net/ethernet/mellanox/mlx5/core/en/qos.c   |   7 +-
 .../net/ethernet/mellanox/mlx5/core/en_common.c    |  74 +++++++++++++++
 drivers/net/ethernet/mellanox/mlx5/core/en_main.c  | 102 ++++-----------------
 drivers/net/ethernet/mellanox/mlx5/core/en_rep.c   |  10 +-
 .../net/ethernet/mellanox/mlx5/core/ipoib/ipoib.c  |  19 +++-
 .../net/ethernet/mellanox/mlx5/core/ipoib/ipoib.h  |   2 +
 .../ethernet/mellanox/mlx5/core/ipoib/ipoib_vlan.c |   7 +-
 include/linux/mlx5/driver.h                        |   2 +
 11 files changed, 140 insertions(+), 110 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en.h b/drivers/net/ethernet/mellanox/mlx5/core/en.h
index 43f027bf2da3..6808e0d82944 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en.h
@@ -72,7 +72,6 @@ struct page_pool;
 #define MLX5E_HW2SW_MTU(params, hwmtu) ((hwmtu) - ((params)->hard_mtu))
 #define MLX5E_SW2HW_MTU(params, swmtu) ((swmtu) + ((params)->hard_mtu))
 
-#define MLX5E_MAX_NUM_TC	8
 #define MLX5E_MAX_NUM_MQPRIO_CH_TC TC_QOPT_MAX_QUEUE
 
 #define MLX5_RX_HEADROOM NET_SKB_PAD
@@ -758,7 +757,7 @@ struct mlx5e_channel {
 	/* data path */
 	struct mlx5e_rq            rq;
 	struct mlx5e_xdpsq         rq_xdpsq;
-	struct mlx5e_txqsq         sq[MLX5E_MAX_NUM_TC];
+	struct mlx5e_txqsq         sq[MLX5_MAX_NUM_TC];
 	struct mlx5e_icosq         icosq;   /* internal control operations */
 	struct mlx5e_txqsq __rcu * __rcu *qos_sqs;
 	bool                       xdp;
@@ -808,7 +807,7 @@ struct mlx5e_channels {
 
 struct mlx5e_channel_stats {
 	struct mlx5e_ch_stats ch;
-	struct mlx5e_sq_stats sq[MLX5E_MAX_NUM_TC];
+	struct mlx5e_sq_stats sq[MLX5_MAX_NUM_TC];
 	struct mlx5e_rq_stats rq;
 	struct mlx5e_rq_stats xskrq;
 	struct mlx5e_xdpsq_stats rq_xdpsq;
@@ -818,8 +817,8 @@ struct mlx5e_channel_stats {
 
 struct mlx5e_ptp_stats {
 	struct mlx5e_ch_stats ch;
-	struct mlx5e_sq_stats sq[MLX5E_MAX_NUM_TC];
-	struct mlx5e_ptp_cq_stats cq[MLX5E_MAX_NUM_TC];
+	struct mlx5e_sq_stats sq[MLX5_MAX_NUM_TC];
+	struct mlx5e_ptp_cq_stats cq[MLX5_MAX_NUM_TC];
 	struct mlx5e_rq_stats rq;
 } ____cacheline_aligned_in_smp;
 
@@ -886,7 +885,6 @@ struct mlx5e_priv {
 	struct mlx5e_rq            drop_rq;
 
 	struct mlx5e_channels      channels;
-	u32                        tisn[MLX5_MAX_PORTS][MLX5E_MAX_NUM_TC];
 	struct mlx5e_rx_res       *rx_res;
 	u32                       *tx_rates;
 
@@ -984,6 +982,8 @@ struct mlx5e_profile {
 	void	(*update_stats)(struct mlx5e_priv *priv);
 	void	(*update_carrier)(struct mlx5e_priv *priv);
 	int	(*max_nch_limit)(struct mlx5_core_dev *mdev);
+	u32	(*get_tisn)(struct mlx5_core_dev *mdev, struct mlx5e_priv *priv,
+			    u8 lag_port, u8 tc);
 	unsigned int (*stats_grps_num)(struct mlx5e_priv *priv);
 	mlx5e_stats_grp_t *stats_grps;
 	const struct mlx5e_rx_handlers *rx_handlers;
@@ -991,6 +991,11 @@ struct mlx5e_profile {
 	u32     features;
 };
 
+u32 mlx5e_profile_get_tisn(struct mlx5_core_dev *mdev,
+			   struct mlx5e_priv *priv,
+			   const struct mlx5e_profile *profile,
+			   u8 lag_port, u8 tc);
+
 #define mlx5e_profile_feature_cap(profile, feature)	\
 	((profile)->features & BIT(MLX5E_PROFILE_FEATURE_##feature))
 
@@ -1132,8 +1137,6 @@ void mlx5e_close_drop_rq(struct mlx5e_rq *drop_rq);
 int mlx5e_create_tis(struct mlx5_core_dev *mdev, void *in, u32 *tisn);
 void mlx5e_destroy_tis(struct mlx5_core_dev *mdev, u32 tisn);
 
-int mlx5e_create_tises(struct mlx5e_priv *priv);
-void mlx5e_destroy_tises(struct mlx5e_priv *priv);
 int mlx5e_update_nic_rx(struct mlx5e_priv *priv);
 void mlx5e_update_carrier(struct mlx5e_priv *priv);
 int mlx5e_close(struct net_device *netdev);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/ptp.c b/drivers/net/ethernet/mellanox/mlx5/core/en/ptp.c
index af3928eddafd..04cec76c1ac4 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en/ptp.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en/ptp.c
@@ -518,9 +518,11 @@ static int mlx5e_ptp_open_txqsqs(struct mlx5e_ptp *c,
 
 	for (tc = 0; tc < num_tc; tc++) {
 		int txq_ix = ix_base + tc;
+		u32 tisn;
 
-		err = mlx5e_ptp_open_txqsq(c, c->priv->tisn[c->lag_port][tc], txq_ix,
-					   cparams, tc, &c->ptpsq[tc]);
+		tisn = mlx5e_profile_get_tisn(c->mdev, c->priv, c->priv->profile,
+					      c->lag_port, tc);
+		err = mlx5e_ptp_open_txqsq(c, tisn, txq_ix, cparams, tc, &c->ptpsq[tc]);
 		if (err)
 			goto close_txqsq;
 	}
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/ptp.h b/drivers/net/ethernet/mellanox/mlx5/core/en/ptp.h
index 7b700d0f956a..86f1854698b4 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en/ptp.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en/ptp.h
@@ -49,7 +49,7 @@ enum {
 
 struct mlx5e_ptp {
 	/* data path */
-	struct mlx5e_ptpsq         ptpsq[MLX5E_MAX_NUM_TC];
+	struct mlx5e_ptpsq         ptpsq[MLX5_MAX_NUM_TC];
 	struct mlx5e_rq            rq;
 	struct napi_struct         napi;
 	struct device             *pdev;
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/qos.c b/drivers/net/ethernet/mellanox/mlx5/core/en/qos.c
index 244bc15a42ab..9e2211f0c3a4 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en/qos.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en/qos.c
@@ -77,6 +77,7 @@ int mlx5e_open_qos_sq(struct mlx5e_priv *priv, struct mlx5e_channels *chs,
 	struct mlx5e_params *params;
 	struct mlx5e_channel *c;
 	struct mlx5e_txqsq *sq;
+	u32 tisn;
 
 	params = &chs->params;
 
@@ -126,8 +127,10 @@ int mlx5e_open_qos_sq(struct mlx5e_priv *priv, struct mlx5e_channels *chs,
 	err = mlx5e_open_cq(priv, params->tx_cq_moderation, &param_cq, &ccp, &sq->cq);
 	if (err)
 		goto err_free_sq;
-	err = mlx5e_open_txqsq(c, priv->tisn[c->lag_port][0], txq_ix, params,
-			       &param_sq, sq, 0, hw_id,
+
+	tisn = mlx5e_profile_get_tisn(c->mdev, c->priv, c->priv->profile,
+				      c->lag_port, 0);
+	err = mlx5e_open_txqsq(c, tisn, txq_ix, params, &param_sq, sq, 0, hw_id,
 			       priv->htb_qos_sq_stats[node_qid]);
 	if (err)
 		goto err_close_cq;
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_common.c b/drivers/net/ethernet/mellanox/mlx5/core/en_common.c
index 41c396e76457..67f546683e85 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_common.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_common.c
@@ -74,6 +74,72 @@ int mlx5e_create_mkey(struct mlx5_core_dev *mdev, u32 pdn, u32 *mkey)
 	return err;
 }
 
+int mlx5e_create_tis(struct mlx5_core_dev *mdev, void *in, u32 *tisn)
+{
+	void *tisc = MLX5_ADDR_OF(create_tis_in, in, ctx);
+
+	MLX5_SET(tisc, tisc, transport_domain, mdev->mlx5e_res.hw_objs.td.tdn);
+
+	if (mlx5_lag_is_lacp_owner(mdev))
+		MLX5_SET(tisc, tisc, strict_lag_tx_port_affinity, 1);
+
+	return mlx5_core_create_tis(mdev, in, tisn);
+}
+
+void mlx5e_destroy_tis(struct mlx5_core_dev *mdev, u32 tisn)
+{
+	mlx5_core_destroy_tis(mdev, tisn);
+}
+
+static void mlx5e_destroy_tises(struct mlx5_core_dev *mdev, u32 tisn[MLX5_MAX_PORTS][MLX5_MAX_NUM_TC])
+{
+	int tc, i;
+
+	for (i = 0; i < MLX5_MAX_PORTS; i++)
+		for (tc = 0; tc < MLX5_MAX_NUM_TC; tc++)
+			mlx5e_destroy_tis(mdev, tisn[i][tc]);
+}
+
+static bool mlx5_lag_should_assign_affinity(struct mlx5_core_dev *mdev)
+{
+	return MLX5_CAP_GEN(mdev, lag_tx_port_affinity) && mlx5e_get_num_lag_ports(mdev) > 1;
+}
+
+static int mlx5e_create_tises(struct mlx5_core_dev *mdev, u32 tisn[MLX5_MAX_PORTS][MLX5_MAX_NUM_TC])
+{
+	int tc, i;
+	int err;
+
+	for (i = 0; i < MLX5_MAX_PORTS; i++) {
+		for (tc = 0; tc < MLX5_MAX_NUM_TC; tc++) {
+			u32 in[MLX5_ST_SZ_DW(create_tis_in)] = {};
+			void *tisc;
+
+			tisc = MLX5_ADDR_OF(create_tis_in, in, ctx);
+
+			MLX5_SET(tisc, tisc, prio, tc << 1);
+
+			if (mlx5_lag_should_assign_affinity(mdev))
+				MLX5_SET(tisc, tisc, lag_tx_port_affinity, i + 1);
+
+			err = mlx5e_create_tis(mdev, in, &tisn[i][tc]);
+			if (err)
+				goto err_close_tises;
+		}
+	}
+
+	return 0;
+
+err_close_tises:
+	for (; i >= 0; i--) {
+		for (tc--; tc >= 0; tc--)
+			mlx5e_destroy_tis(mdev, tisn[i][tc]);
+		tc = MLX5_MAX_NUM_TC;
+	}
+
+	return err;
+}
+
 int mlx5e_create_mdev_resources(struct mlx5_core_dev *mdev)
 {
 	struct mlx5e_hw_objs *res = &mdev->mlx5e_res.hw_objs;
@@ -103,6 +169,11 @@ int mlx5e_create_mdev_resources(struct mlx5_core_dev *mdev)
 		goto err_destroy_mkey;
 	}
 
+	err = mlx5e_create_tises(mdev, res->tisn);
+	if (err) {
+		mlx5_core_err(mdev, "alloc tises failed, %d\n", err);
+		goto err_destroy_bfreg;
+	}
 	INIT_LIST_HEAD(&res->td.tirs_list);
 	mutex_init(&res->td.list_lock);
 
@@ -115,6 +186,8 @@ int mlx5e_create_mdev_resources(struct mlx5_core_dev *mdev)
 
 	return 0;
 
+err_destroy_bfreg:
+	mlx5_free_bfreg(mdev, &res->bfreg);
 err_destroy_mkey:
 	mlx5_core_destroy_mkey(mdev, res->mkey);
 err_dealloc_transport_domain:
@@ -130,6 +203,7 @@ void mlx5e_destroy_mdev_resources(struct mlx5_core_dev *mdev)
 
 	mlx5_crypto_dek_cleanup(mdev->mlx5e_res.dek_priv);
 	mdev->mlx5e_res.dek_priv = NULL;
+	mlx5e_destroy_tises(mdev, res->tisn);
 	mlx5_free_bfreg(mdev, &res->bfreg);
 	mlx5_core_destroy_mkey(mdev, res->mkey);
 	mlx5_core_dealloc_transport_domain(mdev, res->td.tdn);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
index b49b7c28863c..ecb40950ec8d 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
@@ -1352,6 +1352,17 @@ void mlx5e_close_rq(struct mlx5e_rq *rq)
 	mlx5e_free_rq(rq);
 }
 
+u32 mlx5e_profile_get_tisn(struct mlx5_core_dev *mdev,
+			   struct mlx5e_priv *priv,
+			   const struct mlx5e_profile *profile,
+			   u8 lag_port, u8 tc)
+{
+	if (profile->get_tisn)
+		return profile->get_tisn(mdev, priv, lag_port, tc);
+
+	return mdev->mlx5e_res.hw_objs.tisn[lag_port][tc];
+}
+
 static void mlx5e_free_xdpsq_db(struct mlx5e_xdpsq *sq)
 {
 	kvfree(sq->db.xdpi_fifo.xi);
@@ -1920,7 +1931,8 @@ int mlx5e_open_xdpsq(struct mlx5e_channel *c, struct mlx5e_params *params,
 		return err;
 
 	csp.tis_lst_sz      = 1;
-	csp.tisn            = c->priv->tisn[c->lag_port][0]; /* tc = 0 */
+	csp.tisn            = mlx5e_profile_get_tisn(c->mdev, c->priv, c->priv->profile,
+						     c->lag_port, 0); /* tc = 0 */
 	csp.cqn             = sq->cq.mcq.cqn;
 	csp.wq_ctrl         = &sq->wq_ctrl;
 	csp.min_inline_mode = sq->min_inline_mode;
@@ -2204,12 +2216,15 @@ static int mlx5e_open_sqs(struct mlx5e_channel *c,
 	for (tc = 0; tc < mlx5e_get_dcb_num_tc(params); tc++) {
 		int txq_ix = c->ix + tc * params->num_channels;
 		u32 qos_queue_group_id;
+		u32 tisn;
 
+		tisn = mlx5e_profile_get_tisn(c->mdev, c->priv, c->priv->profile,
+					      c->lag_port, tc);
 		err = mlx5e_txq_get_qos_node_hw_id(params, txq_ix, &qos_queue_group_id);
 		if (err)
 			goto err_close_sqs;
 
-		err = mlx5e_open_txqsq(c, c->priv->tisn[c->lag_port][tc], txq_ix,
+		err = mlx5e_open_txqsq(c, tisn, txq_ix,
 				       params, &cparam->txq_sq, &c->sq[tc], tc,
 				       qos_queue_group_id,
 				       &c->priv->channel_stats[c->ix]->sq[tc]);
@@ -3351,72 +3366,6 @@ void mlx5e_close_drop_rq(struct mlx5e_rq *drop_rq)
 	mlx5e_free_cq(&drop_rq->cq);
 }
 
-int mlx5e_create_tis(struct mlx5_core_dev *mdev, void *in, u32 *tisn)
-{
-	void *tisc = MLX5_ADDR_OF(create_tis_in, in, ctx);
-
-	MLX5_SET(tisc, tisc, transport_domain, mdev->mlx5e_res.hw_objs.td.tdn);
-
-	if (mlx5_lag_is_lacp_owner(mdev))
-		MLX5_SET(tisc, tisc, strict_lag_tx_port_affinity, 1);
-
-	return mlx5_core_create_tis(mdev, in, tisn);
-}
-
-void mlx5e_destroy_tis(struct mlx5_core_dev *mdev, u32 tisn)
-{
-	mlx5_core_destroy_tis(mdev, tisn);
-}
-
-void mlx5e_destroy_tises(struct mlx5e_priv *priv)
-{
-	int tc, i;
-
-	for (i = 0; i < mlx5e_get_num_lag_ports(priv->mdev); i++)
-		for (tc = 0; tc < priv->profile->max_tc; tc++)
-			mlx5e_destroy_tis(priv->mdev, priv->tisn[i][tc]);
-}
-
-static bool mlx5e_lag_should_assign_affinity(struct mlx5_core_dev *mdev)
-{
-	return MLX5_CAP_GEN(mdev, lag_tx_port_affinity) && mlx5e_get_num_lag_ports(mdev) > 1;
-}
-
-int mlx5e_create_tises(struct mlx5e_priv *priv)
-{
-	int tc, i;
-	int err;
-
-	for (i = 0; i < mlx5e_get_num_lag_ports(priv->mdev); i++) {
-		for (tc = 0; tc < priv->profile->max_tc; tc++) {
-			u32 in[MLX5_ST_SZ_DW(create_tis_in)] = {};
-			void *tisc;
-
-			tisc = MLX5_ADDR_OF(create_tis_in, in, ctx);
-
-			MLX5_SET(tisc, tisc, prio, tc << 1);
-
-			if (mlx5e_lag_should_assign_affinity(priv->mdev))
-				MLX5_SET(tisc, tisc, lag_tx_port_affinity, i + 1);
-
-			err = mlx5e_create_tis(priv->mdev, in, &priv->tisn[i][tc]);
-			if (err)
-				goto err_close_tises;
-		}
-	}
-
-	return 0;
-
-err_close_tises:
-	for (; i >= 0; i--) {
-		for (tc--; tc >= 0; tc--)
-			mlx5e_destroy_tis(priv->mdev, priv->tisn[i][tc]);
-		tc = priv->profile->max_tc;
-	}
-
-	return err;
-}
-
 static void mlx5e_cleanup_nic_tx(struct mlx5e_priv *priv)
 {
 	if (priv->mqprio_rl) {
@@ -3425,7 +3374,6 @@ static void mlx5e_cleanup_nic_tx(struct mlx5e_priv *priv)
 		priv->mqprio_rl = NULL;
 	}
 	mlx5e_accel_cleanup_tx(priv);
-	mlx5e_destroy_tises(priv);
 }
 
 static int mlx5e_modify_channels_vsd(struct mlx5e_channels *chs, bool vsd)
@@ -3527,7 +3475,7 @@ static int mlx5e_setup_tc_mqprio_dcb(struct mlx5e_priv *priv,
 
 	mqprio->hw = TC_MQPRIO_HW_OFFLOAD_TCS;
 
-	if (tc && tc != MLX5E_MAX_NUM_TC)
+	if (tc && tc != MLX5_MAX_NUM_TC)
 		return -EINVAL;
 
 	new_params = priv->channels.params;
@@ -5482,23 +5430,13 @@ static int mlx5e_init_nic_tx(struct mlx5e_priv *priv)
 {
 	int err;
 
-	err = mlx5e_create_tises(priv);
-	if (err) {
-		mlx5_core_warn(priv->mdev, "create tises failed, %d\n", err);
-		return err;
-	}
-
 	err = mlx5e_accel_init_tx(priv);
 	if (err)
-		goto err_destroy_tises;
+		return err;
 
 	mlx5e_set_mqprio_rl(priv);
 	mlx5e_dcbnl_initialize(priv);
 	return 0;
-
-err_destroy_tises:
-	mlx5e_destroy_tises(priv);
-	return err;
 }
 
 static void mlx5e_nic_enable(struct mlx5e_priv *priv)
@@ -5593,7 +5531,7 @@ static const struct mlx5e_profile mlx5e_nic_profile = {
 	.update_stats	   = mlx5e_stats_update_ndo_stats,
 	.update_carrier	   = mlx5e_update_carrier,
 	.rx_handlers       = &mlx5e_rx_handlers_nic,
-	.max_tc		   = MLX5E_MAX_NUM_TC,
+	.max_tc		   = MLX5_MAX_NUM_TC,
 	.stats_grps	   = mlx5e_nic_stats_grps,
 	.stats_grps_num	   = mlx5e_nic_stats_grps_num,
 	.features          = BIT(MLX5E_PROFILE_FEATURE_PTP_RX) |
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c b/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c
index fe0726c7b847..e3018a141b6f 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c
@@ -1180,12 +1180,6 @@ static int mlx5e_init_rep_tx(struct mlx5e_priv *priv)
 	struct mlx5e_rep_priv *rpriv = priv->ppriv;
 	int err;
 
-	err = mlx5e_create_tises(priv);
-	if (err) {
-		mlx5_core_warn(priv->mdev, "create tises failed, %d\n", err);
-		return err;
-	}
-
 	err = mlx5e_rep_neigh_init(rpriv);
 	if (err)
 		goto err_neigh_init;
@@ -1208,7 +1202,6 @@ err_ht_init:
 err_init_tx:
 	mlx5e_rep_neigh_cleanup(rpriv);
 err_neigh_init:
-	mlx5e_destroy_tises(priv);
 	return err;
 }
 
@@ -1222,7 +1215,6 @@ static void mlx5e_cleanup_rep_tx(struct mlx5e_priv *priv)
 		mlx5e_cleanup_uplink_rep_tx(rpriv);
 
 	mlx5e_rep_neigh_cleanup(rpriv);
-	mlx5e_destroy_tises(priv);
 }
 
 static void mlx5e_rep_enable(struct mlx5e_priv *priv)
@@ -1452,7 +1444,7 @@ static const struct mlx5e_profile mlx5e_uplink_rep_profile = {
 	.update_stats           = mlx5e_stats_update_ndo_stats,
 	.update_carrier	        = mlx5e_update_carrier,
 	.rx_handlers            = &mlx5e_rx_handlers_rep,
-	.max_tc			= MLX5E_MAX_NUM_TC,
+	.max_tc			= MLX5_MAX_NUM_TC,
 	.stats_grps		= mlx5e_ul_rep_stats_grps,
 	.stats_grps_num		= mlx5e_ul_rep_stats_grps_num,
 };
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/ipoib/ipoib.c b/drivers/net/ethernet/mellanox/mlx5/core/ipoib/ipoib.c
index 2bf77a5251b4..58845121954c 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/ipoib/ipoib.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/ipoib/ipoib.c
@@ -339,7 +339,7 @@ static int mlx5i_init_tx(struct mlx5e_priv *priv)
 		return err;
 	}
 
-	err = mlx5i_create_tis(priv->mdev, ipriv->qpn, &priv->tisn[0][0]);
+	err = mlx5i_create_tis(priv->mdev, ipriv->qpn, &ipriv->tisn);
 	if (err) {
 		mlx5_core_warn(priv->mdev, "create tis failed, %d\n", err);
 		goto err_destroy_underlay_qp;
@@ -356,7 +356,7 @@ static void mlx5i_cleanup_tx(struct mlx5e_priv *priv)
 {
 	struct mlx5i_priv *ipriv = priv->ppriv;
 
-	mlx5e_destroy_tis(priv->mdev, priv->tisn[0][0]);
+	mlx5e_destroy_tis(priv->mdev, ipriv->tisn);
 	mlx5i_destroy_underlay_qp(priv->mdev, ipriv->qpn);
 }
 
@@ -483,6 +483,18 @@ static unsigned int mlx5i_stats_grps_num(struct mlx5e_priv *priv)
 	return ARRAY_SIZE(mlx5i_stats_grps);
 }
 
+u32 mlx5i_get_tisn(struct mlx5_core_dev *mdev, struct mlx5e_priv *priv, u8 lag_port, u8 tc)
+{
+	struct mlx5i_priv *ipriv = priv->ppriv;
+
+	if (WARN(lag_port || tc,
+		 "IPoIB unexpected non-zero value: lag_port (%u), tc (%u)\n",
+		 lag_port, tc))
+		return 0;
+
+	return ipriv->tisn;
+}
+
 static const struct mlx5e_profile mlx5i_nic_profile = {
 	.init		   = mlx5i_init,
 	.cleanup	   = mlx5i_cleanup,
@@ -499,6 +511,7 @@ static const struct mlx5e_profile mlx5i_nic_profile = {
 	.max_tc		   = MLX5I_MAX_NUM_TC,
 	.stats_grps        = mlx5i_stats_grps,
 	.stats_grps_num    = mlx5i_stats_grps_num,
+	.get_tisn          = mlx5i_get_tisn,
 };
 
 /* mlx5i netdev NDos */
@@ -829,7 +842,7 @@ int mlx5_rdma_rn_get_params(struct mlx5_core_dev *mdev,
 	*params = (struct rdma_netdev_alloc_params){
 		.sizeof_priv = sizeof(struct mlx5i_priv) +
 			       sizeof(struct mlx5e_priv),
-		.txqs = nch * MLX5E_MAX_NUM_TC,
+		.txqs = nch * MLX5_MAX_NUM_TC,
 		.rxqs = nch,
 		.param = mdev,
 		.initialize_rdma_netdev = mlx5_rdma_setup_rn,
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/ipoib/ipoib.h b/drivers/net/ethernet/mellanox/mlx5/core/ipoib/ipoib.h
index f3f2af972020..2ab6437a1c49 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/ipoib/ipoib.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/ipoib/ipoib.h
@@ -53,6 +53,7 @@ extern const struct mlx5e_rx_handlers mlx5i_rx_handlers;
 struct mlx5i_priv {
 	struct rdma_netdev rn; /* keep this first */
 	u32 qpn;
+	u32 tisn;
 	bool   sub_interface;
 	u32    num_sub_interfaces;
 	u32    qkey;
@@ -63,6 +64,7 @@ struct mlx5i_priv {
 };
 
 int mlx5i_create_tis(struct mlx5_core_dev *mdev, u32 underlay_qpn, u32 *tisn);
+u32 mlx5i_get_tisn(struct mlx5_core_dev *mdev, struct mlx5e_priv *priv, u8 lag_port, u8 tc);
 
 /* Underlay QP create/destroy functions */
 int mlx5i_create_underlay_qp(struct mlx5e_priv *priv);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/ipoib/ipoib_vlan.c b/drivers/net/ethernet/mellanox/mlx5/core/ipoib/ipoib_vlan.c
index 03e681297937..f87471306f6b 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/ipoib/ipoib_vlan.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/ipoib/ipoib_vlan.c
@@ -218,7 +218,7 @@ static int mlx5i_pkey_open(struct net_device *netdev)
 		goto err_unint_underlay_qp;
 	}
 
-	err = mlx5i_create_tis(mdev, ipriv->qpn, &epriv->tisn[0][0]);
+	err = mlx5i_create_tis(mdev, ipriv->qpn, &ipriv->tisn);
 	if (err) {
 		mlx5_core_warn(mdev, "create child tis failed, %d\n", err);
 		goto err_remove_rx_uderlay_qp;
@@ -240,7 +240,7 @@ static int mlx5i_pkey_open(struct net_device *netdev)
 err_close_channels:
 	mlx5e_close_channels(&epriv->channels);
 err_clear_state_opened_flag:
-	mlx5e_destroy_tis(mdev, epriv->tisn[0][0]);
+	mlx5e_destroy_tis(mdev, ipriv->tisn);
 err_remove_rx_uderlay_qp:
 	mlx5_fs_remove_rx_underlay_qpn(mdev, ipriv->qpn);
 err_unint_underlay_qp:
@@ -269,7 +269,7 @@ static int mlx5i_pkey_close(struct net_device *netdev)
 	mlx5i_uninit_underlay_qp(priv);
 	mlx5e_deactivate_priv_channels(priv);
 	mlx5e_close_channels(&priv->channels);
-	mlx5e_destroy_tis(mdev, priv->tisn[0][0]);
+	mlx5e_destroy_tis(mdev, ipriv->tisn);
 unlock:
 	mutex_unlock(&priv->state_lock);
 	return 0;
@@ -361,6 +361,7 @@ static const struct mlx5e_profile mlx5i_pkey_nic_profile = {
 	.update_stats	   = NULL,
 	.rx_handlers       = &mlx5i_rx_handlers,
 	.max_tc		   = MLX5I_MAX_NUM_TC,
+	.get_tisn          = mlx5i_get_tisn,
 };
 
 const struct mlx5e_profile *mlx5i_pkey_get_profile(void)
diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h
index 2f67cec1a898..7ee5b79ff3d6 100644
--- a/include/linux/mlx5/driver.h
+++ b/include/linux/mlx5/driver.h
@@ -679,6 +679,8 @@ struct mlx5e_resources {
 		struct mlx5_td             td;
 		u32			   mkey;
 		struct mlx5_sq_bfreg       bfreg;
+#define MLX5_MAX_NUM_TC 8
+		u32                        tisn[MLX5_MAX_PORTS][MLX5_MAX_NUM_TC];
 	} hw_objs;
 	struct net_device *uplink_netdev;
 	struct mutex uplink_netdev_lock;
-- 
cgit v1.2.3


From 50d73710715de7d1a2c88194562f520816af9c2a Mon Sep 17 00:00:00 2001
From: Alexander Lobakin <aleksander.lobakin@intel.com>
Date: Tue, 12 Dec 2023 15:27:51 +0100
Subject: ethtool: add SET for TCP_DATA_SPLIT ringparam

Follow up commit 9690ae604290 ("ethtool: add header/data split
indication") and add the set part of Ethtool's header split, i.e.
ability to enable/disable header split via the Ethtool Netlink
interface. This might be helpful to optimize the setup for particular
workloads, for example, to avoid XDP frags, and so on.
A driver should advertise ``ETHTOOL_RING_USE_TCP_DATA_SPLIT`` in its
ops->supported_ring_params to allow doing that. "Unknown" passed from
the userspace when the header split is supported means the driver is
free to choose the preferred state.

Reviewed-by: Przemek Kitszel <przemyslaw.kitszel@intel.com>
Signed-off-by: Alexander Lobakin <aleksander.lobakin@intel.com>
Link: https://lore.kernel.org/r/20231212142752.935000-2-aleksander.lobakin@intel.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/linux/ethtool.h |  2 ++
 net/ethtool/rings.c     | 12 ++++++++++++
 2 files changed, 14 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/ethtool.h b/include/linux/ethtool.h
index deb683d3360f..67b30940234b 100644
--- a/include/linux/ethtool.h
+++ b/include/linux/ethtool.h
@@ -95,6 +95,7 @@ struct kernel_ethtool_ringparam {
  * @ETHTOOL_RING_USE_TX_PUSH: capture for setting tx_push
  * @ETHTOOL_RING_USE_RX_PUSH: capture for setting rx_push
  * @ETHTOOL_RING_USE_TX_PUSH_BUF_LEN: capture for setting tx_push_buf_len
+ * @ETHTOOL_RING_USE_TCP_DATA_SPLIT: capture for setting tcp_data_split
  */
 enum ethtool_supported_ring_param {
 	ETHTOOL_RING_USE_RX_BUF_LEN		= BIT(0),
@@ -102,6 +103,7 @@ enum ethtool_supported_ring_param {
 	ETHTOOL_RING_USE_TX_PUSH		= BIT(2),
 	ETHTOOL_RING_USE_RX_PUSH		= BIT(3),
 	ETHTOOL_RING_USE_TX_PUSH_BUF_LEN	= BIT(4),
+	ETHTOOL_RING_USE_TCP_DATA_SPLIT		= BIT(5),
 };
 
 #define __ETH_RSS_HASH_BIT(bit)	((u32)1 << (bit))
diff --git a/net/ethtool/rings.c b/net/ethtool/rings.c
index fb09f774ea01..b7865a14fdf8 100644
--- a/net/ethtool/rings.c
+++ b/net/ethtool/rings.c
@@ -124,6 +124,8 @@ const struct nla_policy ethnl_rings_set_policy[] = {
 	[ETHTOOL_A_RINGS_RX_JUMBO]		= { .type = NLA_U32 },
 	[ETHTOOL_A_RINGS_TX]			= { .type = NLA_U32 },
 	[ETHTOOL_A_RINGS_RX_BUF_LEN]            = NLA_POLICY_MIN(NLA_U32, 1),
+	[ETHTOOL_A_RINGS_TCP_DATA_SPLIT]	=
+		NLA_POLICY_MAX(NLA_U8, ETHTOOL_TCP_DATA_SPLIT_ENABLED),
 	[ETHTOOL_A_RINGS_CQE_SIZE]		= NLA_POLICY_MIN(NLA_U32, 1),
 	[ETHTOOL_A_RINGS_TX_PUSH]		= NLA_POLICY_MAX(NLA_U8, 1),
 	[ETHTOOL_A_RINGS_RX_PUSH]		= NLA_POLICY_MAX(NLA_U8, 1),
@@ -145,6 +147,14 @@ ethnl_set_rings_validate(struct ethnl_req_info *req_info,
 		return -EOPNOTSUPP;
 	}
 
+	if (tb[ETHTOOL_A_RINGS_TCP_DATA_SPLIT] &&
+	    !(ops->supported_ring_params & ETHTOOL_RING_USE_TCP_DATA_SPLIT)) {
+		NL_SET_ERR_MSG_ATTR(info->extack,
+				    tb[ETHTOOL_A_RINGS_TCP_DATA_SPLIT],
+				    "setting TCP data split is not supported");
+		return -EOPNOTSUPP;
+	}
+
 	if (tb[ETHTOOL_A_RINGS_CQE_SIZE] &&
 	    !(ops->supported_ring_params & ETHTOOL_RING_USE_CQE_SIZE)) {
 		NL_SET_ERR_MSG_ATTR(info->extack,
@@ -202,6 +212,8 @@ ethnl_set_rings(struct ethnl_req_info *req_info, struct genl_info *info)
 	ethnl_update_u32(&ringparam.tx_pending, tb[ETHTOOL_A_RINGS_TX], &mod);
 	ethnl_update_u32(&kernel_ringparam.rx_buf_len,
 			 tb[ETHTOOL_A_RINGS_RX_BUF_LEN], &mod);
+	ethnl_update_u8(&kernel_ringparam.tcp_data_split,
+			tb[ETHTOOL_A_RINGS_TCP_DATA_SPLIT], &mod);
 	ethnl_update_u32(&kernel_ringparam.cqe_size,
 			 tb[ETHTOOL_A_RINGS_CQE_SIZE], &mod);
 	ethnl_update_u8(&kernel_ringparam.tx_push,
-- 
cgit v1.2.3


From 0a149ab78ee220c75eef797abea7a29f4490e226 Mon Sep 17 00:00:00 2001
From: Liang Chen <liangchen.linux@gmail.com>
Date: Tue, 12 Dec 2023 12:46:11 +0800
Subject: page_pool: transition to reference count management after page
 draining

To support multiple users referencing the same fragment,
'pp_frag_count' is renamed to 'pp_ref_count', transitioning pp pages
from fragment management to reference count management after draining
based on the suggestion from [1].

The idea is that the concept of fragmenting exists before the page is
drained, and all related functions retain their current names.
However, once the page is drained, its management shifts to being
governed by 'pp_ref_count'. Therefore, all functions associated with
that lifecycle stage of a pp page are renamed.

[1]
http://lore.kernel.org/netdev/f71d9448-70c8-8793-dc9a-0eb48a570300@huawei.com

Signed-off-by: Liang Chen <liangchen.linux@gmail.com>
Reviewed-by: Yunsheng Lin <linyunsheng@huawei.com>
Reviewed-by: Ilias Apalodimas <ilias.apalodimas@linaro.org>
Reviewed-by: Mina Almasry <almasrymina@google.com>
Link: https://lore.kernel.org/r/20231212044614.42733-2-liangchen.linux@gmail.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 drivers/net/ethernet/mellanox/mlx5/core/en_rx.c |  4 +-
 include/linux/mm_types.h                        |  2 +-
 include/net/page_pool/helpers.h                 | 60 ++++++++++++++-----------
 include/net/page_pool/types.h                   |  6 +--
 net/core/page_pool.c                            | 12 ++---
 5 files changed, 46 insertions(+), 38 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c
index 8d9743a5e42c..98d33ac7ec64 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c
@@ -298,8 +298,8 @@ static void mlx5e_page_release_fragmented(struct mlx5e_rq *rq,
 	u16 drain_count = MLX5E_PAGECNT_BIAS_MAX - frag_page->frags;
 	struct page *page = frag_page->page;
 
-	if (page_pool_defrag_page(page, drain_count) == 0)
-		page_pool_put_defragged_page(rq->page_pool, page, -1, true);
+	if (page_pool_unref_page(page, drain_count) == 0)
+		page_pool_put_unrefed_page(rq->page_pool, page, -1, true);
 }
 
 static inline int mlx5e_get_rx_frag(struct mlx5e_rq *rq,
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 957ce38768b2..64e4572ef06d 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -125,7 +125,7 @@ struct page {
 			struct page_pool *pp;
 			unsigned long _pp_mapping_pad;
 			unsigned long dma_addr;
-			atomic_long_t pp_frag_count;
+			atomic_long_t pp_ref_count;
 		};
 		struct {	/* Tail pages of compound page */
 			unsigned long compound_head;	/* Bit zero is set */
diff --git a/include/net/page_pool/helpers.h b/include/net/page_pool/helpers.h
index 7dc65774cde5..841e0a930bd7 100644
--- a/include/net/page_pool/helpers.h
+++ b/include/net/page_pool/helpers.h
@@ -29,7 +29,7 @@
  * page allocated from page pool. Page splitting enables memory saving and thus
  * avoids TLB/cache miss for data access, but there also is some cost to
  * implement page splitting, mainly some cache line dirtying/bouncing for
- * 'struct page' and atomic operation for page->pp_frag_count.
+ * 'struct page' and atomic operation for page->pp_ref_count.
  *
  * The API keeps track of in-flight pages, in order to let API users know when
  * it is safe to free a page_pool object, the API users must call
@@ -210,69 +210,77 @@ inline enum dma_data_direction page_pool_get_dma_dir(struct page_pool *pool)
 	return pool->p.dma_dir;
 }
 
-/* pp_frag_count represents the number of writers who can update the page
- * either by updating skb->data or via DMA mappings for the device.
- * We can't rely on the page refcnt for that as we don't know who might be
- * holding page references and we can't reliably destroy or sync DMA mappings
- * of the fragments.
+/**
+ * page_pool_fragment_page() - split a fresh page into fragments
+ * @page:	page to split
+ * @nr:		references to set
+ *
+ * pp_ref_count represents the number of outstanding references to the page,
+ * which will be freed using page_pool APIs (rather than page allocator APIs
+ * like put_page()). Such references are usually held by page_pool-aware
+ * objects like skbs marked for page pool recycling.
  *
- * When pp_frag_count reaches 0 we can either recycle the page if the page
- * refcnt is 1 or return it back to the memory allocator and destroy any
- * mappings we have.
+ * This helper allows the caller to take (set) multiple references to a
+ * freshly allocated page. The page must be freshly allocated (have a
+ * pp_ref_count of 1). This is commonly done by drivers and
+ * "fragment allocators" to save atomic operations - either when they know
+ * upfront how many references they will need; or to take MAX references and
+ * return the unused ones with a single atomic dec(), instead of performing
+ * multiple atomic inc() operations.
  */
 static inline void page_pool_fragment_page(struct page *page, long nr)
 {
-	atomic_long_set(&page->pp_frag_count, nr);
+	atomic_long_set(&page->pp_ref_count, nr);
 }
 
-static inline long page_pool_defrag_page(struct page *page, long nr)
+static inline long page_pool_unref_page(struct page *page, long nr)
 {
 	long ret;
 
-	/* If nr == pp_frag_count then we have cleared all remaining
+	/* If nr == pp_ref_count then we have cleared all remaining
 	 * references to the page:
 	 * 1. 'n == 1': no need to actually overwrite it.
 	 * 2. 'n != 1': overwrite it with one, which is the rare case
-	 *              for pp_frag_count draining.
+	 *              for pp_ref_count draining.
 	 *
 	 * The main advantage to doing this is that not only we avoid a atomic
 	 * update, as an atomic_read is generally a much cheaper operation than
 	 * an atomic update, especially when dealing with a page that may be
-	 * partitioned into only 2 or 3 pieces; but also unify the pp_frag_count
+	 * referenced by only 2 or 3 users; but also unify the pp_ref_count
 	 * handling by ensuring all pages have partitioned into only 1 piece
 	 * initially, and only overwrite it when the page is partitioned into
 	 * more than one piece.
 	 */
-	if (atomic_long_read(&page->pp_frag_count) == nr) {
+	if (atomic_long_read(&page->pp_ref_count) == nr) {
 		/* As we have ensured nr is always one for constant case using
 		 * the BUILD_BUG_ON(), only need to handle the non-constant case
-		 * here for pp_frag_count draining, which is a rare case.
+		 * here for pp_ref_count draining, which is a rare case.
 		 */
 		BUILD_BUG_ON(__builtin_constant_p(nr) && nr != 1);
 		if (!__builtin_constant_p(nr))
-			atomic_long_set(&page->pp_frag_count, 1);
+			atomic_long_set(&page->pp_ref_count, 1);
 
 		return 0;
 	}
 
-	ret = atomic_long_sub_return(nr, &page->pp_frag_count);
+	ret = atomic_long_sub_return(nr, &page->pp_ref_count);
 	WARN_ON(ret < 0);
 
-	/* We are the last user here too, reset pp_frag_count back to 1 to
+	/* We are the last user here too, reset pp_ref_count back to 1 to
 	 * ensure all pages have been partitioned into 1 piece initially,
 	 * this should be the rare case when the last two fragment users call
-	 * page_pool_defrag_page() currently.
+	 * page_pool_unref_page() currently.
 	 */
 	if (unlikely(!ret))
-		atomic_long_set(&page->pp_frag_count, 1);
+		atomic_long_set(&page->pp_ref_count, 1);
 
 	return ret;
 }
 
-static inline bool page_pool_is_last_frag(struct page *page)
+static inline bool page_pool_is_last_ref(struct page *page)
 {
-	/* If page_pool_defrag_page() returns 0, we were the last user */
-	return page_pool_defrag_page(page, 1) == 0;
+	/* If page_pool_unref_page() returns 0, we were the last user */
+	return page_pool_unref_page(page, 1) == 0;
 }
 
 /**
@@ -297,10 +305,10 @@ static inline void page_pool_put_page(struct page_pool *pool,
 	 * allow registering MEM_TYPE_PAGE_POOL, but shield linker.
 	 */
 #ifdef CONFIG_PAGE_POOL
-	if (!page_pool_is_last_frag(page))
+	if (!page_pool_is_last_ref(page))
 		return;
 
-	page_pool_put_defragged_page(pool, page, dma_sync_size, allow_direct);
+	page_pool_put_unrefed_page(pool, page, dma_sync_size, allow_direct);
 #endif
 }
 
diff --git a/include/net/page_pool/types.h b/include/net/page_pool/types.h
index ac286ea8ce2d..76481c465375 100644
--- a/include/net/page_pool/types.h
+++ b/include/net/page_pool/types.h
@@ -234,9 +234,9 @@ static inline void page_pool_put_page_bulk(struct page_pool *pool, void **data,
 }
 #endif
 
-void page_pool_put_defragged_page(struct page_pool *pool, struct page *page,
-				  unsigned int dma_sync_size,
-				  bool allow_direct);
+void page_pool_put_unrefed_page(struct page_pool *pool, struct page *page,
+				unsigned int dma_sync_size,
+				bool allow_direct);
 
 static inline bool is_page_pool_compiled_in(void)
 {
diff --git a/net/core/page_pool.c b/net/core/page_pool.c
index c2e7c9a6efbe..c92b757369f2 100644
--- a/net/core/page_pool.c
+++ b/net/core/page_pool.c
@@ -677,8 +677,8 @@ __page_pool_put_page(struct page_pool *pool, struct page *page,
 	return NULL;
 }
 
-void page_pool_put_defragged_page(struct page_pool *pool, struct page *page,
-				  unsigned int dma_sync_size, bool allow_direct)
+void page_pool_put_unrefed_page(struct page_pool *pool, struct page *page,
+				unsigned int dma_sync_size, bool allow_direct)
 {
 	page = __page_pool_put_page(pool, page, dma_sync_size, allow_direct);
 	if (page && !page_pool_recycle_in_ring(pool, page)) {
@@ -687,7 +687,7 @@ void page_pool_put_defragged_page(struct page_pool *pool, struct page *page,
 		page_pool_return_page(pool, page);
 	}
 }
-EXPORT_SYMBOL(page_pool_put_defragged_page);
+EXPORT_SYMBOL(page_pool_put_unrefed_page);
 
 /**
  * page_pool_put_page_bulk() - release references on multiple pages
@@ -714,7 +714,7 @@ void page_pool_put_page_bulk(struct page_pool *pool, void **data,
 		struct page *page = virt_to_head_page(data[i]);
 
 		/* It is not the last user for the page frag case */
-		if (!page_pool_is_last_frag(page))
+		if (!page_pool_is_last_ref(page))
 			continue;
 
 		page = __page_pool_put_page(pool, page, -1, false);
@@ -756,7 +756,7 @@ static struct page *page_pool_drain_frag(struct page_pool *pool,
 	long drain_count = BIAS_MAX - pool->frag_users;
 
 	/* Some user is still using the page frag */
-	if (likely(page_pool_defrag_page(page, drain_count)))
+	if (likely(page_pool_unref_page(page, drain_count)))
 		return NULL;
 
 	if (page_ref_count(page) == 1 && !page_is_pfmemalloc(page)) {
@@ -777,7 +777,7 @@ static void page_pool_free_frag(struct page_pool *pool)
 
 	pool->frag_page = NULL;
 
-	if (!page || page_pool_defrag_page(page, drain_count))
+	if (!page || page_pool_unref_page(page, drain_count))
 		return;
 
 	page_pool_return_page(pool, page);
-- 
cgit v1.2.3


From 6caa290684255991ffeebf228b2fd9e7e4da8f34 Mon Sep 17 00:00:00 2001
From: Linus Walleij <linus.walleij@linaro.org>
Date: Tue, 12 Dec 2023 23:02:56 -0800
Subject: Input: navpoint - convert to use GPIO descriptor

The Navpoint driver uses a GPIO line, convert this to use
a GPIO descriptor. There are no in-kernel users but out of tree
users can easily be added or converted using a GPIO descriptor
table as with numerous other drivers.

Signed-off-by: Linus Walleij <linus.walleij@linaro.org>
Link: https://lore.kernel.org/r/20231129-descriptors-input-v1-1-9433162914a3@linaro.org
Signed-off-by: Dmitry Torokhov <dmitry.torokhov@gmail.com>
---
 drivers/input/mouse/navpoint.c | 41 +++++++++++++++--------------------------
 include/linux/input/navpoint.h |  1 -
 2 files changed, 15 insertions(+), 27 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/input/mouse/navpoint.c b/drivers/input/mouse/navpoint.c
index c00dc1275da2..ba757783c258 100644
--- a/drivers/input/mouse/navpoint.c
+++ b/drivers/input/mouse/navpoint.c
@@ -10,7 +10,7 @@
 #include <linux/platform_device.h>
 #include <linux/clk.h>
 #include <linux/delay.h>
-#include <linux/gpio.h>
+#include <linux/gpio/consumer.h>
 #include <linux/input.h>
 #include <linux/input/navpoint.h>
 #include <linux/interrupt.h>
@@ -32,7 +32,7 @@ struct navpoint {
 	struct ssp_device	*ssp;
 	struct input_dev	*input;
 	struct device		*dev;
-	int			gpio;
+	struct gpio_desc	*gpiod;
 	int			index;
 	u8			data[1 + HEADER_LENGTH(0xff)];
 };
@@ -170,16 +170,14 @@ static void navpoint_up(struct navpoint *navpoint)
 		dev_err(navpoint->dev,
 			"timeout waiting for SSSR[CSS] to clear\n");
 
-	if (gpio_is_valid(navpoint->gpio))
-		gpio_set_value(navpoint->gpio, 1);
+	gpiod_set_value(navpoint->gpiod, 1);
 }
 
 static void navpoint_down(struct navpoint *navpoint)
 {
 	struct ssp_device *ssp = navpoint->ssp;
 
-	if (gpio_is_valid(navpoint->gpio))
-		gpio_set_value(navpoint->gpio, 0);
+	gpiod_set_value(navpoint->gpiod, 0);
 
 	pxa_ssp_write_reg(ssp, SSCR0, 0);
 
@@ -216,18 +214,9 @@ static int navpoint_probe(struct platform_device *pdev)
 		return -EINVAL;
 	}
 
-	if (gpio_is_valid(pdata->gpio)) {
-		error = gpio_request_one(pdata->gpio, GPIOF_OUT_INIT_LOW,
-					 "SYNAPTICS_ON");
-		if (error)
-			return error;
-	}
-
 	ssp = pxa_ssp_request(pdata->port, pdev->name);
-	if (!ssp) {
-		error = -ENODEV;
-		goto err_free_gpio;
-	}
+	if (!ssp)
+		return -ENODEV;
 
 	/* HaRET does not disable devices before jumping into Linux */
 	if (pxa_ssp_read_reg(ssp, SSCR0) & SSCR0_SSE) {
@@ -242,10 +231,18 @@ static int navpoint_probe(struct platform_device *pdev)
 		goto err_free_mem;
 	}
 
+	navpoint->gpiod = gpiod_get_optional(&pdev->dev,
+					     NULL, GPIOD_OUT_LOW);
+	if (IS_ERR(navpoint->gpiod)) {
+		error = PTR_ERR(navpoint->gpiod);
+		dev_err(&pdev->dev, "error getting GPIO\n");
+		goto err_free_mem;
+	}
+	gpiod_set_consumer_name(navpoint->gpiod, "SYNAPTICS_ON");
+
 	navpoint->ssp = ssp;
 	navpoint->input = input;
 	navpoint->dev = &pdev->dev;
-	navpoint->gpio = pdata->gpio;
 
 	input->name = pdev->name;
 	input->dev.parent = &pdev->dev;
@@ -288,17 +285,12 @@ err_free_mem:
 	input_free_device(input);
 	kfree(navpoint);
 	pxa_ssp_free(ssp);
-err_free_gpio:
-	if (gpio_is_valid(pdata->gpio))
-		gpio_free(pdata->gpio);
 
 	return error;
 }
 
 static void navpoint_remove(struct platform_device *pdev)
 {
-	const struct navpoint_platform_data *pdata =
-					dev_get_platdata(&pdev->dev);
 	struct navpoint *navpoint = platform_get_drvdata(pdev);
 	struct ssp_device *ssp = navpoint->ssp;
 
@@ -308,9 +300,6 @@ static void navpoint_remove(struct platform_device *pdev)
 	kfree(navpoint);
 
 	pxa_ssp_free(ssp);
-
-	if (gpio_is_valid(pdata->gpio))
-		gpio_free(pdata->gpio);
 }
 
 static int navpoint_suspend(struct device *dev)
diff --git a/include/linux/input/navpoint.h b/include/linux/input/navpoint.h
index d464ffb4db52..5192ae3f5ec1 100644
--- a/include/linux/input/navpoint.h
+++ b/include/linux/input/navpoint.h
@@ -5,5 +5,4 @@
 
 struct navpoint_platform_data {
 	int		port;		/* PXA SSP port for pxa_ssp_request() */
-	int		gpio;		/* GPIO for power on/off */
 };
-- 
cgit v1.2.3


From e53c18da99c75f080bd99436c57824f2ab657f03 Mon Sep 17 00:00:00 2001
From: Linus Walleij <linus.walleij@linaro.org>
Date: Tue, 12 Dec 2023 23:06:55 -0800
Subject: Input: omap-keypad - drop optional GPIO support

The driver supports passing some GPIO lines for rows and columns
through the driver data, but there is no in-kernel user of this.

Further the use seems convoluted because the GPIO lines are unused
in the driver, then explicitly free:ed when removing it without
being requested when probing it, which is assymetric and just
a recepie for disaster.

Remove the support for these unused GPIOs, if need be support can
be reestablished in an organized fashion using GPIO descriptors.

Signed-off-by: Linus Walleij <linus.walleij@linaro.org>
Reviewed-by: Tony Lindgren <tony@atomide.com>
Link: https://lore.kernel.org/r/20231129-descriptors-input-v1-3-9433162914a3@linaro.org
Signed-off-by: Dmitry Torokhov <dmitry.torokhov@gmail.com>
---
 drivers/input/keyboard/omap-keypad.c      | 19 +------------------
 include/linux/platform_data/keypad-omap.h |  3 ---
 2 files changed, 1 insertion(+), 21 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/input/keyboard/omap-keypad.c b/drivers/input/keyboard/omap-keypad.c
index 454fb8675657..16f936db7305 100644
--- a/drivers/input/keyboard/omap-keypad.c
+++ b/drivers/input/keyboard/omap-keypad.c
@@ -21,7 +21,6 @@
 #include <linux/mutex.h>
 #include <linux/errno.h>
 #include <linux/slab.h>
-#include <linux/gpio.h>
 #include <linux/platform_data/gpio-omap.h>
 #include <linux/platform_data/keypad-omap.h>
 #include <linux/soc/ti/omap1-io.h>
@@ -49,9 +48,6 @@ struct omap_kp {
 
 static DECLARE_TASKLET_DISABLED_OLD(kp_tasklet, omap_kp_tasklet);
 
-static unsigned int *row_gpios;
-static unsigned int *col_gpios;
-
 static irqreturn_t omap_kp_interrupt(int irq, void *dev_id)
 {
 	/* disable keyboard interrupt and schedule for handling */
@@ -180,7 +176,7 @@ static int omap_kp_probe(struct platform_device *pdev)
 	struct omap_kp *omap_kp;
 	struct input_dev *input_dev;
 	struct omap_kp_platform_data *pdata = dev_get_platdata(&pdev->dev);
-	int i, col_idx, row_idx, ret;
+	int ret;
 	unsigned int row_shift, keycodemax;
 
 	if (!pdata->rows || !pdata->cols || !pdata->keymap_data) {
@@ -209,17 +205,9 @@ static int omap_kp_probe(struct platform_device *pdev)
 	if (pdata->delay)
 		omap_kp->delay = pdata->delay;
 
-	if (pdata->row_gpios && pdata->col_gpios) {
-		row_gpios = pdata->row_gpios;
-		col_gpios = pdata->col_gpios;
-	}
-
 	omap_kp->rows = pdata->rows;
 	omap_kp->cols = pdata->cols;
 
-	col_idx = 0;
-	row_idx = 0;
-
 	timer_setup(&omap_kp->timer, omap_kp_timer, 0);
 
 	/* get the irq and init timer*/
@@ -276,11 +264,6 @@ err4:
 err3:
 	device_remove_file(&pdev->dev, &dev_attr_enable);
 err2:
-	for (i = row_idx - 1; i >= 0; i--)
-		gpio_free(row_gpios[i]);
-	for (i = col_idx - 1; i >= 0; i--)
-		gpio_free(col_gpios[i]);
-
 	kfree(omap_kp);
 	input_free_device(input_dev);
 
diff --git a/include/linux/platform_data/keypad-omap.h b/include/linux/platform_data/keypad-omap.h
index 3e7c64c854f4..f3f1311cdf3a 100644
--- a/include/linux/platform_data/keypad-omap.h
+++ b/include/linux/platform_data/keypad-omap.h
@@ -19,9 +19,6 @@ struct omap_kp_platform_data {
 	bool rep;
 	unsigned long delay;
 	bool dbounce;
-	/* specific to OMAP242x*/
-	unsigned int *row_gpios;
-	unsigned int *col_gpios;
 };
 
 /* Group (0..3) -- when multiple keys are pressed, only the
-- 
cgit v1.2.3


From 7395de647e87476f5b5d2f9a9fe80cee86b4e7cc Mon Sep 17 00:00:00 2001
From: Linus Walleij <linus.walleij@linaro.org>
Date: Tue, 12 Dec 2023 23:04:47 -0800
Subject: Input: as5011 - convert to GPIO descriptor

This driver does not have any in-tree users but is passing a
legacy GPIO number through platform data.

Convert it to use a GPIO descriptor, new users or outoftree
users can easily be implemented using GPIO descriptor tables
or software nodes.

Signed-off-by: Linus Walleij <linus.walleij@linaro.org>
Link: https://lore.kernel.org/r/20231129-descriptors-input-v1-4-9433162914a3@linaro.org
Signed-off-by: Dmitry Torokhov <dmitry.torokhov@gmail.com>
---
 drivers/input/joystick/as5011.c | 24 +++++++++++-------------
 include/linux/input/as5011.h    |  1 -
 2 files changed, 11 insertions(+), 14 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/input/joystick/as5011.c b/drivers/input/joystick/as5011.c
index bf8b1cc0ea9c..f1822c19a289 100644
--- a/drivers/input/joystick/as5011.c
+++ b/drivers/input/joystick/as5011.c
@@ -13,7 +13,7 @@
 #include <linux/i2c.h>
 #include <linux/interrupt.h>
 #include <linux/input.h>
-#include <linux/gpio.h>
+#include <linux/gpio/consumer.h>
 #include <linux/delay.h>
 #include <linux/input/as5011.h>
 #include <linux/slab.h>
@@ -61,7 +61,7 @@ MODULE_LICENSE("GPL");
 struct as5011_device {
 	struct input_dev *input_dev;
 	struct i2c_client *i2c_client;
-	unsigned int button_gpio;
+	struct gpio_desc *button_gpiod;
 	unsigned int button_irq;
 	unsigned int axis_irq;
 };
@@ -114,7 +114,7 @@ static int as5011_i2c_read(struct i2c_client *client,
 static irqreturn_t as5011_button_interrupt(int irq, void *dev_id)
 {
 	struct as5011_device *as5011 = dev_id;
-	int val = gpio_get_value_cansleep(as5011->button_gpio);
+	int val = gpiod_get_value_cansleep(as5011->button_gpiod);
 
 	input_report_key(as5011->input_dev, BTN_JOYSTICK, !val);
 	input_sync(as5011->input_dev);
@@ -248,7 +248,6 @@ static int as5011_probe(struct i2c_client *client)
 
 	as5011->i2c_client = client;
 	as5011->input_dev = input_dev;
-	as5011->button_gpio = plat_data->button_gpio;
 	as5011->axis_irq = plat_data->axis_irq;
 
 	input_dev->name = "Austria Microsystem as5011 joystick";
@@ -262,18 +261,20 @@ static int as5011_probe(struct i2c_client *client)
 	input_set_abs_params(as5011->input_dev, ABS_Y,
 		AS5011_MIN_AXIS, AS5011_MAX_AXIS, AS5011_FUZZ, AS5011_FLAT);
 
-	error = gpio_request(as5011->button_gpio, "AS5011 button");
-	if (error < 0) {
-		dev_err(&client->dev, "Failed to request button gpio\n");
+	as5011->button_gpiod = devm_gpiod_get(&client->dev, NULL, GPIOD_IN);
+	if (IS_ERR(as5011->button_gpiod)) {
+		error = PTR_ERR(as5011->button_gpiod);
+		dev_err(&client->dev, "Failed to request button GPIO\n");
 		goto err_free_mem;
 	}
+	gpiod_set_consumer_name(as5011->button_gpiod, "AS5011 button");
 
-	irq = gpio_to_irq(as5011->button_gpio);
+	irq = gpiod_to_irq(as5011->button_gpiod);
 	if (irq < 0) {
 		dev_err(&client->dev,
 			"Failed to get irq number for button gpio\n");
 		error = irq;
-		goto err_free_button_gpio;
+		goto err_free_mem;
 	}
 
 	as5011->button_irq = irq;
@@ -286,7 +287,7 @@ static int as5011_probe(struct i2c_client *client)
 	if (error < 0) {
 		dev_err(&client->dev,
 			"Can't allocate button irq %d\n", as5011->button_irq);
-		goto err_free_button_gpio;
+		goto err_free_mem;
 	}
 
 	error = as5011_configure_chip(as5011, plat_data);
@@ -317,8 +318,6 @@ err_free_axis_irq:
 	free_irq(as5011->axis_irq, as5011);
 err_free_button_irq:
 	free_irq(as5011->button_irq, as5011);
-err_free_button_gpio:
-	gpio_free(as5011->button_gpio);
 err_free_mem:
 	input_free_device(input_dev);
 	kfree(as5011);
@@ -332,7 +331,6 @@ static void as5011_remove(struct i2c_client *client)
 
 	free_irq(as5011->axis_irq, as5011);
 	free_irq(as5011->button_irq, as5011);
-	gpio_free(as5011->button_gpio);
 
 	input_unregister_device(as5011->input_dev);
 	kfree(as5011);
diff --git a/include/linux/input/as5011.h b/include/linux/input/as5011.h
index 5fba52a56cd6..5705d5de3aea 100644
--- a/include/linux/input/as5011.h
+++ b/include/linux/input/as5011.h
@@ -7,7 +7,6 @@
  */
 
 struct as5011_platform_data {
-	unsigned int button_gpio;
 	unsigned int axis_irq; /* irq number */
 	unsigned long axis_irqflags;
 	char xp, xn; /* threshold for x axis */
-- 
cgit v1.2.3


From 6ab3d50b106c9aea123a80551a6c9deace83b914 Mon Sep 17 00:00:00 2001
From: Qiang Yu <quic_qianyu@quicinc.com>
Date: Tue, 7 Nov 2023 16:14:49 +0800
Subject: bus: mhi: host: Add a separate timeout parameter for waiting ready

Some devices(eg. SDX75) take longer than expected (default, 8 seconds) to
set ready after reboot. Hence add optional ready timeout parameter and pass
the appropriate timeout value to mhi_poll_reg_field() to wait enough for
device ready as part of power up sequence.

Signed-off-by: Qiang Yu <quic_qianyu@quicinc.com>
Reviewed-by: Manivannan Sadhasivam <manivannan.sadhasivam@linaro.org>
Link: https://lore.kernel.org/r/1699344890-87076-2-git-send-email-quic_qianyu@quicinc.com
Signed-off-by: Manivannan Sadhasivam <manivannan.sadhasivam@linaro.org>
---
 drivers/bus/mhi/host/init.c     |  1 +
 drivers/bus/mhi/host/internal.h |  2 +-
 drivers/bus/mhi/host/main.c     |  5 +++--
 drivers/bus/mhi/host/pm.c       | 24 +++++++++++++++++-------
 include/linux/mhi.h             |  4 ++++
 5 files changed, 26 insertions(+), 10 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/bus/mhi/host/init.c b/drivers/bus/mhi/host/init.c
index f78aefd2d7a3..65ceac1837f9 100644
--- a/drivers/bus/mhi/host/init.c
+++ b/drivers/bus/mhi/host/init.c
@@ -881,6 +881,7 @@ static int parse_config(struct mhi_controller *mhi_cntrl,
 	if (!mhi_cntrl->timeout_ms)
 		mhi_cntrl->timeout_ms = MHI_TIMEOUT_MS;
 
+	mhi_cntrl->ready_timeout_ms = config->ready_timeout_ms;
 	mhi_cntrl->bounce_buf = config->use_bounce_buf;
 	mhi_cntrl->buffer_len = config->buf_len;
 	if (!mhi_cntrl->buffer_len)
diff --git a/drivers/bus/mhi/host/internal.h b/drivers/bus/mhi/host/internal.h
index 2e139e76de4c..30ac415a3000 100644
--- a/drivers/bus/mhi/host/internal.h
+++ b/drivers/bus/mhi/host/internal.h
@@ -321,7 +321,7 @@ int __must_check mhi_read_reg_field(struct mhi_controller *mhi_cntrl,
 				    u32 *out);
 int __must_check mhi_poll_reg_field(struct mhi_controller *mhi_cntrl,
 				    void __iomem *base, u32 offset, u32 mask,
-				    u32 val, u32 delayus);
+				    u32 val, u32 delayus, u32 timeout_ms);
 void mhi_write_reg(struct mhi_controller *mhi_cntrl, void __iomem *base,
 		   u32 offset, u32 val);
 int __must_check mhi_write_reg_field(struct mhi_controller *mhi_cntrl,
diff --git a/drivers/bus/mhi/host/main.c b/drivers/bus/mhi/host/main.c
index dcf627b36e82..6cf11457380b 100644
--- a/drivers/bus/mhi/host/main.c
+++ b/drivers/bus/mhi/host/main.c
@@ -40,10 +40,11 @@ int __must_check mhi_read_reg_field(struct mhi_controller *mhi_cntrl,
 
 int __must_check mhi_poll_reg_field(struct mhi_controller *mhi_cntrl,
 				    void __iomem *base, u32 offset,
-				    u32 mask, u32 val, u32 delayus)
+				    u32 mask, u32 val, u32 delayus,
+				    u32 timeout_ms)
 {
 	int ret;
-	u32 out, retry = (mhi_cntrl->timeout_ms * 1000) / delayus;
+	u32 out, retry = (timeout_ms * 1000) / delayus;
 
 	while (retry--) {
 		ret = mhi_read_reg_field(mhi_cntrl, base, offset, mask, &out);
diff --git a/drivers/bus/mhi/host/pm.c b/drivers/bus/mhi/host/pm.c
index 8a4362d75fc4..a2f2feef1476 100644
--- a/drivers/bus/mhi/host/pm.c
+++ b/drivers/bus/mhi/host/pm.c
@@ -163,6 +163,7 @@ int mhi_ready_state_transition(struct mhi_controller *mhi_cntrl)
 	enum mhi_pm_state cur_state;
 	struct device *dev = &mhi_cntrl->mhi_dev->dev;
 	u32 interval_us = 25000; /* poll register field every 25 milliseconds */
+	u32 timeout_ms;
 	int ret, i;
 
 	/* Check if device entered error state */
@@ -173,14 +174,18 @@ int mhi_ready_state_transition(struct mhi_controller *mhi_cntrl)
 
 	/* Wait for RESET to be cleared and READY bit to be set by the device */
 	ret = mhi_poll_reg_field(mhi_cntrl, mhi_cntrl->regs, MHICTRL,
-				 MHICTRL_RESET_MASK, 0, interval_us);
+				 MHICTRL_RESET_MASK, 0, interval_us,
+				 mhi_cntrl->timeout_ms);
 	if (ret) {
 		dev_err(dev, "Device failed to clear MHI Reset\n");
 		return ret;
 	}
 
+	timeout_ms = mhi_cntrl->ready_timeout_ms ?
+		mhi_cntrl->ready_timeout_ms : mhi_cntrl->timeout_ms;
 	ret = mhi_poll_reg_field(mhi_cntrl, mhi_cntrl->regs, MHISTATUS,
-				 MHISTATUS_READY_MASK, 1, interval_us);
+				 MHISTATUS_READY_MASK, 1, interval_us,
+				 timeout_ms);
 	if (ret) {
 		dev_err(dev, "Device failed to enter MHI Ready\n");
 		return ret;
@@ -479,7 +484,7 @@ static void mhi_pm_disable_transition(struct mhi_controller *mhi_cntrl)
 
 		/* Wait for the reset bit to be cleared by the device */
 		ret = mhi_poll_reg_field(mhi_cntrl, mhi_cntrl->regs, MHICTRL,
-				 MHICTRL_RESET_MASK, 0, 25000);
+				 MHICTRL_RESET_MASK, 0, 25000, mhi_cntrl->timeout_ms);
 		if (ret)
 			dev_err(dev, "Device failed to clear MHI Reset\n");
 
@@ -492,8 +497,8 @@ static void mhi_pm_disable_transition(struct mhi_controller *mhi_cntrl)
 		if (!MHI_IN_PBL(mhi_get_exec_env(mhi_cntrl))) {
 			/* wait for ready to be set */
 			ret = mhi_poll_reg_field(mhi_cntrl, mhi_cntrl->regs,
-						 MHISTATUS,
-						 MHISTATUS_READY_MASK, 1, 25000);
+						 MHISTATUS, MHISTATUS_READY_MASK,
+						 1, 25000, mhi_cntrl->timeout_ms);
 			if (ret)
 				dev_err(dev, "Device failed to enter READY state\n");
 		}
@@ -1111,7 +1116,8 @@ int mhi_async_power_up(struct mhi_controller *mhi_cntrl)
 	if (state == MHI_STATE_SYS_ERR) {
 		mhi_set_mhi_state(mhi_cntrl, MHI_STATE_RESET);
 		ret = mhi_poll_reg_field(mhi_cntrl, mhi_cntrl->regs, MHICTRL,
-				 MHICTRL_RESET_MASK, 0, interval_us);
+				 MHICTRL_RESET_MASK, 0, interval_us,
+				 mhi_cntrl->timeout_ms);
 		if (ret) {
 			dev_info(dev, "Failed to reset MHI due to syserr state\n");
 			goto error_exit;
@@ -1202,14 +1208,18 @@ EXPORT_SYMBOL_GPL(mhi_power_down);
 int mhi_sync_power_up(struct mhi_controller *mhi_cntrl)
 {
 	int ret = mhi_async_power_up(mhi_cntrl);
+	u32 timeout_ms;
 
 	if (ret)
 		return ret;
 
+	/* Some devices need more time to set ready during power up */
+	timeout_ms = mhi_cntrl->ready_timeout_ms ?
+		mhi_cntrl->ready_timeout_ms : mhi_cntrl->timeout_ms;
 	wait_event_timeout(mhi_cntrl->state_event,
 			   MHI_IN_MISSION_MODE(mhi_cntrl->ee) ||
 			   MHI_PM_IN_ERROR_STATE(mhi_cntrl->pm_state),
-			   msecs_to_jiffies(mhi_cntrl->timeout_ms));
+			   msecs_to_jiffies(timeout_ms));
 
 	ret = (MHI_IN_MISSION_MODE(mhi_cntrl->ee)) ? 0 : -ETIMEDOUT;
 	if (ret)
diff --git a/include/linux/mhi.h b/include/linux/mhi.h
index 039943ec4d4e..d0f9b522f328 100644
--- a/include/linux/mhi.h
+++ b/include/linux/mhi.h
@@ -266,6 +266,7 @@ struct mhi_event_config {
  * struct mhi_controller_config - Root MHI controller configuration
  * @max_channels: Maximum number of channels supported
  * @timeout_ms: Timeout value for operations. 0 means use default
+ * @ready_timeout_ms: Timeout value for waiting device to be ready (optional)
  * @buf_len: Size of automatically allocated buffers. 0 means use default
  * @num_channels: Number of channels defined in @ch_cfg
  * @ch_cfg: Array of defined channels
@@ -277,6 +278,7 @@ struct mhi_event_config {
 struct mhi_controller_config {
 	u32 max_channels;
 	u32 timeout_ms;
+	u32 ready_timeout_ms;
 	u32 buf_len;
 	u32 num_channels;
 	const struct mhi_channel_config *ch_cfg;
@@ -330,6 +332,7 @@ struct mhi_controller_config {
  * @pm_mutex: Mutex for suspend/resume operation
  * @pm_lock: Lock for protecting MHI power management state
  * @timeout_ms: Timeout in ms for state transitions
+ * @ready_timeout_ms: Timeout in ms for waiting device to be ready (optional)
  * @pm_state: MHI power management state
  * @db_access: DB access states
  * @ee: MHI device execution environment
@@ -419,6 +422,7 @@ struct mhi_controller {
 	struct mutex pm_mutex;
 	rwlock_t pm_lock;
 	u32 timeout_ms;
+	u32 ready_timeout_ms;
 	u32 pm_state;
 	u32 db_access;
 	enum mhi_ee_type ee;
-- 
cgit v1.2.3


From fb6e30a72539ce28c1323aef4190d35aac106f6f Mon Sep 17 00:00:00 2001
From: Ahmed Zaki <ahmed.zaki@intel.com>
Date: Tue, 12 Dec 2023 17:33:14 -0700
Subject: net: ethtool: pass a pointer to parameters to get/set_rxfh ethtool
 ops

The get/set_rxfh ethtool ops currently takes the rxfh (RSS) parameters
as direct function arguments. This will force us to change the API (and
all drivers' functions) every time some new parameters are added.

This is part 1/2 of the fix, as suggested in [1]:

- First simplify the code by always providing a pointer to all params
   (indir, key and func); the fact that some of them may be NULL seems
   like a weird historic thing or a premature optimization.
   It will simplify the drivers if all pointers are always present.

 - Then make the functions take a dev pointer, and a pointer to a
   single struct wrapping all arguments. The set_* should also take
   an extack.

Link: https://lore.kernel.org/netdev/20231121152906.2dd5f487@kernel.org/ [1]
Suggested-by: Jakub Kicinski <kuba@kernel.org>
Suggested-by: Jacob Keller <jacob.e.keller@intel.com>
Signed-off-by: Ahmed Zaki <ahmed.zaki@intel.com>
Link: https://lore.kernel.org/r/20231213003321.605376-2-ahmed.zaki@intel.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 drivers/net/ethernet/amazon/ena/ena_ethtool.c      |  28 ++---
 drivers/net/ethernet/amd/xgbe/xgbe-ethtool.c       |  33 +++---
 .../net/ethernet/aquantia/atlantic/aq_ethtool.c    |  31 ++---
 .../net/ethernet/broadcom/bnx2x/bnx2x_ethtool.c    |  25 ++--
 drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c  |  28 ++---
 drivers/net/ethernet/broadcom/tg3.c                |  22 ++--
 .../net/ethernet/cavium/thunder/nicvf_ethtool.c    |  31 ++---
 drivers/net/ethernet/chelsio/cxgb4/cxgb4_ethtool.c |  24 ++--
 drivers/net/ethernet/cisco/enic/enic_ethtool.c     |  25 ++--
 drivers/net/ethernet/emulex/benet/be_ethtool.c     |  28 ++---
 .../net/ethernet/freescale/enetc/enetc_ethtool.c   |  31 ++---
 .../net/ethernet/fungible/funeth/funeth_ethtool.c  |  40 +++----
 drivers/net/ethernet/hisilicon/hns/hns_ethtool.c   |  17 +--
 drivers/net/ethernet/hisilicon/hns3/hns3_ethtool.c |  23 ++--
 drivers/net/ethernet/huawei/hinic/hinic_ethtool.c  |  40 +++----
 drivers/net/ethernet/intel/fm10k/fm10k_ethtool.c   |  26 +++--
 drivers/net/ethernet/intel/i40e/i40e_ethtool.c     |  38 +++----
 drivers/net/ethernet/intel/iavf/iavf_ethtool.c     |  42 ++++---
 drivers/net/ethernet/intel/ice/ice_ethtool.c       |  44 ++++---
 drivers/net/ethernet/intel/idpf/idpf_ethtool.c     |  40 ++++---
 drivers/net/ethernet/intel/igb/igb_ethtool.c       |  27 ++---
 drivers/net/ethernet/intel/igc/igc_ethtool.c       |  27 ++---
 drivers/net/ethernet/intel/ixgbe/ixgbe_ethtool.c   |  35 +++---
 drivers/net/ethernet/intel/ixgbevf/ethtool.c       |  27 ++---
 drivers/net/ethernet/marvell/mvneta.c              |  25 ++--
 drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c    |  50 ++++----
 .../ethernet/marvell/octeontx2/nic/otx2_ethtool.c  |  41 +++----
 drivers/net/ethernet/mellanox/mlx4/en_ethtool.c    |  40 +++----
 drivers/net/ethernet/mellanox/mlx5/core/en.h       |   6 +-
 .../net/ethernet/mellanox/mlx5/core/en_ethtool.c   |  28 +++--
 drivers/net/ethernet/microchip/lan743x_ethtool.c   |  34 +++---
 drivers/net/ethernet/microsoft/mana/mana_ethtool.c |  33 +++---
 .../net/ethernet/netronome/nfp/nfp_net_ethtool.c   |  38 +++----
 .../net/ethernet/pensando/ionic/ionic_ethtool.c    |  26 +++--
 drivers/net/ethernet/qlogic/qede/qede_ethtool.c    |  32 +++---
 drivers/net/ethernet/sfc/ethtool_common.c          |  53 +++++----
 drivers/net/ethernet/sfc/ethtool_common.h          |  17 +--
 drivers/net/ethernet/sfc/falcon/ethtool.c          |  26 +++--
 drivers/net/ethernet/sfc/siena/ethtool_common.c    |  53 +++++----
 drivers/net/ethernet/sfc/siena/ethtool_common.h    |  17 +--
 .../net/ethernet/stmicro/stmmac/stmmac_ethtool.c   |  31 ++---
 drivers/net/hyperv/netvsc_drv.c                    |  32 +++---
 drivers/net/virtio_net.c                           |  29 ++---
 drivers/net/vmxnet3/vmxnet3_ethtool.c              |  22 ++--
 include/linux/ethtool.h                            |  38 +++++--
 net/ethtool/common.c                               |  12 +-
 net/ethtool/ioctl.c                                | 126 +++++++++++----------
 net/ethtool/rss.c                                  |  16 +--
 48 files changed, 827 insertions(+), 730 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/ethernet/amazon/ena/ena_ethtool.c b/drivers/net/ethernet/amazon/ena/ena_ethtool.c
index e3ef081aa42b..2d8c5a2841b8 100644
--- a/drivers/net/ethernet/amazon/ena/ena_ethtool.c
+++ b/drivers/net/ethernet/amazon/ena/ena_ethtool.c
@@ -802,15 +802,15 @@ static int ena_indirection_table_get(struct ena_adapter *adapter, u32 *indir)
 	return rc;
 }
 
-static int ena_get_rxfh(struct net_device *netdev, u32 *indir, u8 *key,
-			u8 *hfunc)
+static int ena_get_rxfh(struct net_device *netdev,
+			struct ethtool_rxfh_param *rxfh)
 {
 	struct ena_adapter *adapter = netdev_priv(netdev);
 	enum ena_admin_hash_functions ena_func;
 	u8 func;
 	int rc;
 
-	rc = ena_indirection_table_get(adapter, indir);
+	rc = ena_indirection_table_get(adapter, rxfh->indir);
 	if (rc)
 		return rc;
 
@@ -825,7 +825,7 @@ static int ena_get_rxfh(struct net_device *netdev, u32 *indir, u8 *key,
 		return rc;
 	}
 
-	rc = ena_com_get_hash_key(adapter->ena_dev, key);
+	rc = ena_com_get_hash_key(adapter->ena_dev, rxfh->key);
 	if (rc)
 		return rc;
 
@@ -842,27 +842,27 @@ static int ena_get_rxfh(struct net_device *netdev, u32 *indir, u8 *key,
 		return -EOPNOTSUPP;
 	}
 
-	if (hfunc)
-		*hfunc = func;
+	rxfh->hfunc = func;
 
 	return 0;
 }
 
-static int ena_set_rxfh(struct net_device *netdev, const u32 *indir,
-			const u8 *key, const u8 hfunc)
+static int ena_set_rxfh(struct net_device *netdev,
+			struct ethtool_rxfh_param *rxfh,
+			struct netlink_ext_ack *extack)
 {
 	struct ena_adapter *adapter = netdev_priv(netdev);
 	struct ena_com_dev *ena_dev = adapter->ena_dev;
 	enum ena_admin_hash_functions func = 0;
 	int rc;
 
-	if (indir) {
-		rc = ena_indirection_table_set(adapter, indir);
+	if (rxfh->indir) {
+		rc = ena_indirection_table_set(adapter, rxfh->indir);
 		if (rc)
 			return rc;
 	}
 
-	switch (hfunc) {
+	switch (rxfh->hfunc) {
 	case ETH_RSS_HASH_NO_CHANGE:
 		func = ena_com_get_current_hash_function(ena_dev);
 		break;
@@ -874,12 +874,12 @@ static int ena_set_rxfh(struct net_device *netdev, const u32 *indir,
 		break;
 	default:
 		netif_err(adapter, drv, netdev, "Unsupported hfunc %d\n",
-			  hfunc);
+			  rxfh->hfunc);
 		return -EOPNOTSUPP;
 	}
 
-	if (key || func) {
-		rc = ena_com_fill_hash_function(ena_dev, func, key,
+	if (rxfh->key || func) {
+		rc = ena_com_fill_hash_function(ena_dev, func, rxfh->key,
 						ENA_HASH_KEY_SIZE,
 						0xFFFFFFFF);
 		if (unlikely(rc)) {
diff --git a/drivers/net/ethernet/amd/xgbe/xgbe-ethtool.c b/drivers/net/ethernet/amd/xgbe/xgbe-ethtool.c
index 32fab5e77246..58e7e88aae5b 100644
--- a/drivers/net/ethernet/amd/xgbe/xgbe-ethtool.c
+++ b/drivers/net/ethernet/amd/xgbe/xgbe-ethtool.c
@@ -527,47 +527,48 @@ static u32 xgbe_get_rxfh_indir_size(struct net_device *netdev)
 	return ARRAY_SIZE(pdata->rss_table);
 }
 
-static int xgbe_get_rxfh(struct net_device *netdev, u32 *indir, u8 *key,
-			 u8 *hfunc)
+static int xgbe_get_rxfh(struct net_device *netdev,
+			 struct ethtool_rxfh_param *rxfh)
 {
 	struct xgbe_prv_data *pdata = netdev_priv(netdev);
 	unsigned int i;
 
-	if (indir) {
+	if (rxfh->indir) {
 		for (i = 0; i < ARRAY_SIZE(pdata->rss_table); i++)
-			indir[i] = XGMAC_GET_BITS(pdata->rss_table[i],
-						  MAC_RSSDR, DMCH);
+			rxfh->indir[i] = XGMAC_GET_BITS(pdata->rss_table[i],
+							MAC_RSSDR, DMCH);
 	}
 
-	if (key)
-		memcpy(key, pdata->rss_key, sizeof(pdata->rss_key));
+	if (rxfh->key)
+		memcpy(rxfh->key, pdata->rss_key, sizeof(pdata->rss_key));
 
-	if (hfunc)
-		*hfunc = ETH_RSS_HASH_TOP;
+	rxfh->hfunc = ETH_RSS_HASH_TOP;
 
 	return 0;
 }
 
-static int xgbe_set_rxfh(struct net_device *netdev, const u32 *indir,
-			 const u8 *key, const u8 hfunc)
+static int xgbe_set_rxfh(struct net_device *netdev,
+			 struct ethtool_rxfh_param *rxfh,
+			 struct netlink_ext_ack *extack)
 {
 	struct xgbe_prv_data *pdata = netdev_priv(netdev);
 	struct xgbe_hw_if *hw_if = &pdata->hw_if;
 	unsigned int ret;
 
-	if (hfunc != ETH_RSS_HASH_NO_CHANGE && hfunc != ETH_RSS_HASH_TOP) {
+	if (rxfh->hfunc != ETH_RSS_HASH_NO_CHANGE &&
+	    rxfh->hfunc != ETH_RSS_HASH_TOP) {
 		netdev_err(netdev, "unsupported hash function\n");
 		return -EOPNOTSUPP;
 	}
 
-	if (indir) {
-		ret = hw_if->set_rss_lookup_table(pdata, indir);
+	if (rxfh->indir) {
+		ret = hw_if->set_rss_lookup_table(pdata, rxfh->indir);
 		if (ret)
 			return ret;
 	}
 
-	if (key) {
-		ret = hw_if->set_rss_hash_key(pdata, key);
+	if (rxfh->key) {
+		ret = hw_if->set_rss_hash_key(pdata, rxfh->key);
 		if (ret)
 			return ret;
 	}
diff --git a/drivers/net/ethernet/aquantia/atlantic/aq_ethtool.c b/drivers/net/ethernet/aquantia/atlantic/aq_ethtool.c
index ac4ea93bd8dd..18a6c8d99fa0 100644
--- a/drivers/net/ethernet/aquantia/atlantic/aq_ethtool.c
+++ b/drivers/net/ethernet/aquantia/atlantic/aq_ethtool.c
@@ -447,8 +447,8 @@ static u32 aq_ethtool_get_rss_key_size(struct net_device *ndev)
 	return sizeof(cfg->aq_rss.hash_secret_key);
 }
 
-static int aq_ethtool_get_rss(struct net_device *ndev, u32 *indir, u8 *key,
-			      u8 *hfunc)
+static int aq_ethtool_get_rss(struct net_device *ndev,
+			      struct ethtool_rxfh_param *rxfh)
 {
 	struct aq_nic_s *aq_nic = netdev_priv(ndev);
 	struct aq_nic_cfg_s *cfg;
@@ -456,21 +456,21 @@ static int aq_ethtool_get_rss(struct net_device *ndev, u32 *indir, u8 *key,
 
 	cfg = aq_nic_get_cfg(aq_nic);
 
-	if (hfunc)
-		*hfunc = ETH_RSS_HASH_TOP; /* Toeplitz */
-	if (indir) {
+	rxfh->hfunc = ETH_RSS_HASH_TOP; /* Toeplitz */
+	if (rxfh->indir) {
 		for (i = 0; i < AQ_CFG_RSS_INDIRECTION_TABLE_MAX; i++)
-			indir[i] = cfg->aq_rss.indirection_table[i];
+			rxfh->indir[i] = cfg->aq_rss.indirection_table[i];
 	}
-	if (key)
-		memcpy(key, cfg->aq_rss.hash_secret_key,
+	if (rxfh->key)
+		memcpy(rxfh->key, cfg->aq_rss.hash_secret_key,
 		       sizeof(cfg->aq_rss.hash_secret_key));
 
 	return 0;
 }
 
-static int aq_ethtool_set_rss(struct net_device *netdev, const u32 *indir,
-			      const u8 *key, const u8 hfunc)
+static int aq_ethtool_set_rss(struct net_device *netdev,
+			      struct ethtool_rxfh_param *rxfh,
+			      struct netlink_ext_ack *extack)
 {
 	struct aq_nic_s *aq_nic = netdev_priv(netdev);
 	struct aq_nic_cfg_s *cfg;
@@ -482,16 +482,17 @@ static int aq_ethtool_set_rss(struct net_device *netdev, const u32 *indir,
 	rss_entries = cfg->aq_rss.indirection_table_size;
 
 	/* We do not allow change in unsupported parameters */
-	if (hfunc != ETH_RSS_HASH_NO_CHANGE && hfunc != ETH_RSS_HASH_TOP)
+	if (rxfh->hfunc != ETH_RSS_HASH_NO_CHANGE &&
+	    rxfh->hfunc != ETH_RSS_HASH_TOP)
 		return -EOPNOTSUPP;
 	/* Fill out the redirection table */
-	if (indir)
+	if (rxfh->indir)
 		for (i = 0; i < rss_entries; i++)
-			cfg->aq_rss.indirection_table[i] = indir[i];
+			cfg->aq_rss.indirection_table[i] = rxfh->indir[i];
 
 	/* Fill out the rss hash key */
-	if (key) {
-		memcpy(cfg->aq_rss.hash_secret_key, key,
+	if (rxfh->key) {
+		memcpy(cfg->aq_rss.hash_secret_key, rxfh->key,
 		       sizeof(cfg->aq_rss.hash_secret_key));
 		err = aq_nic->aq_hw_ops->hw_rss_hash_set(aq_nic->aq_hw,
 			&cfg->aq_rss);
diff --git a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_ethtool.c b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_ethtool.c
index bda3ccc28eca..81d232e6d05f 100644
--- a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_ethtool.c
+++ b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_ethtool.c
@@ -3486,16 +3486,15 @@ static u32 bnx2x_get_rxfh_indir_size(struct net_device *dev)
 	return T_ETH_INDIRECTION_TABLE_SIZE;
 }
 
-static int bnx2x_get_rxfh(struct net_device *dev, u32 *indir, u8 *key,
-			  u8 *hfunc)
+static int bnx2x_get_rxfh(struct net_device *dev,
+			  struct ethtool_rxfh_param *rxfh)
 {
 	struct bnx2x *bp = netdev_priv(dev);
 	u8 ind_table[T_ETH_INDIRECTION_TABLE_SIZE] = {0};
 	size_t i;
 
-	if (hfunc)
-		*hfunc = ETH_RSS_HASH_TOP;
-	if (!indir)
+	rxfh->hfunc = ETH_RSS_HASH_TOP;
+	if (!rxfh->indir)
 		return 0;
 
 	/* Get the current configuration of the RSS indirection table */
@@ -3511,13 +3510,14 @@ static int bnx2x_get_rxfh(struct net_device *dev, u32 *indir, u8 *key,
 	 * queue.
 	 */
 	for (i = 0; i < T_ETH_INDIRECTION_TABLE_SIZE; i++)
-		indir[i] = ind_table[i] - bp->fp->cl_id;
+		rxfh->indir[i] = ind_table[i] - bp->fp->cl_id;
 
 	return 0;
 }
 
-static int bnx2x_set_rxfh(struct net_device *dev, const u32 *indir,
-			  const u8 *key, const u8 hfunc)
+static int bnx2x_set_rxfh(struct net_device *dev,
+			  struct ethtool_rxfh_param *rxfh,
+			  struct netlink_ext_ack *extack)
 {
 	struct bnx2x *bp = netdev_priv(dev);
 	size_t i;
@@ -3525,11 +3525,12 @@ static int bnx2x_set_rxfh(struct net_device *dev, const u32 *indir,
 	/* We require at least one supported parameter to be changed and no
 	 * change in any of the unsupported parameters
 	 */
-	if (key ||
-	    (hfunc != ETH_RSS_HASH_NO_CHANGE && hfunc != ETH_RSS_HASH_TOP))
+	if (rxfh->key ||
+	    (rxfh->hfunc != ETH_RSS_HASH_NO_CHANGE &&
+	     rxfh->hfunc != ETH_RSS_HASH_TOP))
 		return -EOPNOTSUPP;
 
-	if (!indir)
+	if (!rxfh->indir)
 		return 0;
 
 	for (i = 0; i < T_ETH_INDIRECTION_TABLE_SIZE; i++) {
@@ -3542,7 +3543,7 @@ static int bnx2x_set_rxfh(struct net_device *dev, const u32 *indir,
 		 * align the received table to the Client ID of the leading RSS
 		 * queue
 		 */
-		bp->rss_conf_obj.ind_table[i] = indir[i] + bp->fp->cl_id;
+		bp->rss_conf_obj.ind_table[i] = rxfh->indir[i] + bp->fp->cl_id;
 	}
 
 	if (bp->state == BNX2X_STATE_OPEN)
diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c b/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c
index 45ce7e2e3662..2742d9decc73 100644
--- a/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c
@@ -1333,49 +1333,49 @@ static u32 bnxt_get_rxfh_key_size(struct net_device *dev)
 	return HW_HASH_KEY_SIZE;
 }
 
-static int bnxt_get_rxfh(struct net_device *dev, u32 *indir, u8 *key,
-			 u8 *hfunc)
+static int bnxt_get_rxfh(struct net_device *dev,
+			 struct ethtool_rxfh_param *rxfh)
 {
 	struct bnxt *bp = netdev_priv(dev);
 	struct bnxt_vnic_info *vnic;
 	u32 i, tbl_size;
 
-	if (hfunc)
-		*hfunc = ETH_RSS_HASH_TOP;
+	rxfh->hfunc = ETH_RSS_HASH_TOP;
 
 	if (!bp->vnic_info)
 		return 0;
 
 	vnic = &bp->vnic_info[0];
-	if (indir && bp->rss_indir_tbl) {
+	if (rxfh->indir && bp->rss_indir_tbl) {
 		tbl_size = bnxt_get_rxfh_indir_size(dev);
 		for (i = 0; i < tbl_size; i++)
-			indir[i] = bp->rss_indir_tbl[i];
+			rxfh->indir[i] = bp->rss_indir_tbl[i];
 	}
 
-	if (key && vnic->rss_hash_key)
-		memcpy(key, vnic->rss_hash_key, HW_HASH_KEY_SIZE);
+	if (rxfh->key && vnic->rss_hash_key)
+		memcpy(rxfh->key, vnic->rss_hash_key, HW_HASH_KEY_SIZE);
 
 	return 0;
 }
 
-static int bnxt_set_rxfh(struct net_device *dev, const u32 *indir,
-			 const u8 *key, const u8 hfunc)
+static int bnxt_set_rxfh(struct net_device *dev,
+			 struct ethtool_rxfh_param *rxfh,
+			 struct netlink_ext_ack *extack)
 {
 	struct bnxt *bp = netdev_priv(dev);
 	int rc = 0;
 
-	if (hfunc && hfunc != ETH_RSS_HASH_TOP)
+	if (rxfh->hfunc && rxfh->hfunc != ETH_RSS_HASH_TOP)
 		return -EOPNOTSUPP;
 
-	if (key)
+	if (rxfh->key)
 		return -EOPNOTSUPP;
 
-	if (indir) {
+	if (rxfh->indir) {
 		u32 i, pad, tbl_size = bnxt_get_rxfh_indir_size(dev);
 
 		for (i = 0; i < tbl_size; i++)
-			bp->rss_indir_tbl[i] = indir[i];
+			bp->rss_indir_tbl[i] = rxfh->indir[i];
 		pad = bp->rss_indir_tbl_entries - tbl_size;
 		if (pad)
 			memset(&bp->rss_indir_tbl[i], 0, pad * sizeof(u16));
diff --git a/drivers/net/ethernet/broadcom/tg3.c b/drivers/net/ethernet/broadcom/tg3.c
index f52830dfb26a..04964bbe08cf 100644
--- a/drivers/net/ethernet/broadcom/tg3.c
+++ b/drivers/net/ethernet/broadcom/tg3.c
@@ -12745,24 +12745,23 @@ static u32 tg3_get_rxfh_indir_size(struct net_device *dev)
 	return size;
 }
 
-static int tg3_get_rxfh(struct net_device *dev, u32 *indir, u8 *key, u8 *hfunc)
+static int tg3_get_rxfh(struct net_device *dev, struct ethtool_rxfh_param *rxfh)
 {
 	struct tg3 *tp = netdev_priv(dev);
 	int i;
 
-	if (hfunc)
-		*hfunc = ETH_RSS_HASH_TOP;
-	if (!indir)
+	rxfh->hfunc = ETH_RSS_HASH_TOP;
+	if (!rxfh->indir)
 		return 0;
 
 	for (i = 0; i < TG3_RSS_INDIR_TBL_SIZE; i++)
-		indir[i] = tp->rss_ind_tbl[i];
+		rxfh->indir[i] = tp->rss_ind_tbl[i];
 
 	return 0;
 }
 
-static int tg3_set_rxfh(struct net_device *dev, const u32 *indir, const u8 *key,
-			const u8 hfunc)
+static int tg3_set_rxfh(struct net_device *dev, struct ethtool_rxfh_param *rxfh,
+			struct netlink_ext_ack *extack)
 {
 	struct tg3 *tp = netdev_priv(dev);
 	size_t i;
@@ -12770,15 +12769,16 @@ static int tg3_set_rxfh(struct net_device *dev, const u32 *indir, const u8 *key,
 	/* We require at least one supported parameter to be changed and no
 	 * change in any of the unsupported parameters
 	 */
-	if (key ||
-	    (hfunc != ETH_RSS_HASH_NO_CHANGE && hfunc != ETH_RSS_HASH_TOP))
+	if (rxfh->key ||
+	    (rxfh->hfunc != ETH_RSS_HASH_NO_CHANGE &&
+	     rxfh->hfunc != ETH_RSS_HASH_TOP))
 		return -EOPNOTSUPP;
 
-	if (!indir)
+	if (!rxfh->indir)
 		return 0;
 
 	for (i = 0; i < TG3_RSS_INDIR_TBL_SIZE; i++)
-		tp->rss_ind_tbl[i] = indir[i];
+		tp->rss_ind_tbl[i] = rxfh->indir[i];
 
 	if (!netif_running(dev) || !tg3_flag(tp, ENABLE_RSS))
 		return 0;
diff --git a/drivers/net/ethernet/cavium/thunder/nicvf_ethtool.c b/drivers/net/ethernet/cavium/thunder/nicvf_ethtool.c
index d8d71bf97983..34125b8cd935 100644
--- a/drivers/net/ethernet/cavium/thunder/nicvf_ethtool.c
+++ b/drivers/net/ethernet/cavium/thunder/nicvf_ethtool.c
@@ -653,35 +653,36 @@ static u32 nicvf_get_rxfh_indir_size(struct net_device *dev)
 	return nic->rss_info.rss_size;
 }
 
-static int nicvf_get_rxfh(struct net_device *dev, u32 *indir, u8 *hkey,
-			  u8 *hfunc)
+static int nicvf_get_rxfh(struct net_device *dev,
+			  struct ethtool_rxfh_param *rxfh)
 {
 	struct nicvf *nic = netdev_priv(dev);
 	struct nicvf_rss_info *rss = &nic->rss_info;
 	int idx;
 
-	if (indir) {
+	if (rxfh->indir) {
 		for (idx = 0; idx < rss->rss_size; idx++)
-			indir[idx] = rss->ind_tbl[idx];
+			rxfh->indir[idx] = rss->ind_tbl[idx];
 	}
 
-	if (hkey)
-		memcpy(hkey, rss->key, RSS_HASH_KEY_SIZE * sizeof(u64));
+	if (rxfh->key)
+		memcpy(rxfh->key, rss->key, RSS_HASH_KEY_SIZE * sizeof(u64));
 
-	if (hfunc)
-		*hfunc = ETH_RSS_HASH_TOP;
+	rxfh->hfunc = ETH_RSS_HASH_TOP;
 
 	return 0;
 }
 
-static int nicvf_set_rxfh(struct net_device *dev, const u32 *indir,
-			  const u8 *hkey, const u8 hfunc)
+static int nicvf_set_rxfh(struct net_device *dev,
+			  struct ethtool_rxfh_param *rxfh,
+			  struct netlink_ext_ack *extack)
 {
 	struct nicvf *nic = netdev_priv(dev);
 	struct nicvf_rss_info *rss = &nic->rss_info;
 	int idx;
 
-	if (hfunc != ETH_RSS_HASH_NO_CHANGE && hfunc != ETH_RSS_HASH_TOP)
+	if (rxfh->hfunc != ETH_RSS_HASH_NO_CHANGE &&
+	    rxfh->hfunc != ETH_RSS_HASH_TOP)
 		return -EOPNOTSUPP;
 
 	if (!rss->enable) {
@@ -690,13 +691,13 @@ static int nicvf_set_rxfh(struct net_device *dev, const u32 *indir,
 		return -EIO;
 	}
 
-	if (indir) {
+	if (rxfh->indir) {
 		for (idx = 0; idx < rss->rss_size; idx++)
-			rss->ind_tbl[idx] = indir[idx];
+			rss->ind_tbl[idx] = rxfh->indir[idx];
 	}
 
-	if (hkey) {
-		memcpy(rss->key, hkey, RSS_HASH_KEY_SIZE * sizeof(u64));
+	if (rxfh->key) {
+		memcpy(rss->key, rxfh->key, RSS_HASH_KEY_SIZE * sizeof(u64));
 		nicvf_set_rss_key(nic);
 	}
 
diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_ethtool.c b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_ethtool.c
index 8477a93cee6b..47eecde36285 100644
--- a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_ethtool.c
+++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_ethtool.c
@@ -1588,22 +1588,23 @@ static u32 get_rss_table_size(struct net_device *dev)
 	return pi->rss_size;
 }
 
-static int get_rss_table(struct net_device *dev, u32 *p, u8 *key, u8 *hfunc)
+static int get_rss_table(struct net_device *dev,
+			 struct ethtool_rxfh_param *rxfh)
 {
 	const struct port_info *pi = netdev_priv(dev);
 	unsigned int n = pi->rss_size;
 
-	if (hfunc)
-		*hfunc = ETH_RSS_HASH_TOP;
-	if (!p)
+	rxfh->hfunc = ETH_RSS_HASH_TOP;
+	if (!rxfh->indir)
 		return 0;
 	while (n--)
-		p[n] = pi->rss[n];
+		rxfh->indir[n] = pi->rss[n];
 	return 0;
 }
 
-static int set_rss_table(struct net_device *dev, const u32 *p, const u8 *key,
-			 const u8 hfunc)
+static int set_rss_table(struct net_device *dev,
+			 struct ethtool_rxfh_param *rxfh,
+			 struct netlink_ext_ack *extack)
 {
 	unsigned int i;
 	struct port_info *pi = netdev_priv(dev);
@@ -1611,16 +1612,17 @@ static int set_rss_table(struct net_device *dev, const u32 *p, const u8 *key,
 	/* We require at least one supported parameter to be changed and no
 	 * change in any of the unsupported parameters
 	 */
-	if (key ||
-	    (hfunc != ETH_RSS_HASH_NO_CHANGE && hfunc != ETH_RSS_HASH_TOP))
+	if (rxfh->key ||
+	    (rxfh->hfunc != ETH_RSS_HASH_NO_CHANGE &&
+	     rxfh->hfunc != ETH_RSS_HASH_TOP))
 		return -EOPNOTSUPP;
-	if (!p)
+	if (!rxfh->indir)
 		return 0;
 
 	/* Interface must be brought up atleast once */
 	if (pi->adapter->flags & CXGB4_FULL_INIT_DONE) {
 		for (i = 0; i < pi->rss_size; i++)
-			pi->rss[i] = p[i];
+			pi->rss[i] = rxfh->indir[i];
 
 		return cxgb4_write_rss(pi, pi->rss);
 	}
diff --git a/drivers/net/ethernet/cisco/enic/enic_ethtool.c b/drivers/net/ethernet/cisco/enic/enic_ethtool.c
index 08b7cc0a1809..241906697019 100644
--- a/drivers/net/ethernet/cisco/enic/enic_ethtool.c
+++ b/drivers/net/ethernet/cisco/enic/enic_ethtool.c
@@ -568,31 +568,32 @@ static u32 enic_get_rxfh_key_size(struct net_device *netdev)
 	return ENIC_RSS_LEN;
 }
 
-static int enic_get_rxfh(struct net_device *netdev, u32 *indir, u8 *hkey,
-			 u8 *hfunc)
+static int enic_get_rxfh(struct net_device *netdev,
+			 struct ethtool_rxfh_param *rxfh)
 {
 	struct enic *enic = netdev_priv(netdev);
 
-	if (hkey)
-		memcpy(hkey, enic->rss_key, ENIC_RSS_LEN);
+	if (rxfh->key)
+		memcpy(rxfh->key, enic->rss_key, ENIC_RSS_LEN);
 
-	if (hfunc)
-		*hfunc = ETH_RSS_HASH_TOP;
+	rxfh->hfunc = ETH_RSS_HASH_TOP;
 
 	return 0;
 }
 
-static int enic_set_rxfh(struct net_device *netdev, const u32 *indir,
-			 const u8 *hkey, const u8 hfunc)
+static int enic_set_rxfh(struct net_device *netdev,
+			 struct ethtool_rxfh_param *rxfh,
+			 struct netlink_ext_ack *extack)
 {
 	struct enic *enic = netdev_priv(netdev);
 
-	if ((hfunc != ETH_RSS_HASH_NO_CHANGE && hfunc != ETH_RSS_HASH_TOP) ||
-	    indir)
+	if (rxfh->indir ||
+	    (rxfh->hfunc != ETH_RSS_HASH_NO_CHANGE &&
+	     rxfh->hfunc != ETH_RSS_HASH_TOP))
 		return -EINVAL;
 
-	if (hkey)
-		memcpy(enic->rss_key, hkey, ENIC_RSS_LEN);
+	if (rxfh->key)
+		memcpy(enic->rss_key, rxfh->key, ENIC_RSS_LEN);
 
 	return __enic_set_rsskey(enic);
 }
diff --git a/drivers/net/ethernet/emulex/benet/be_ethtool.c b/drivers/net/ethernet/emulex/benet/be_ethtool.c
index a29de29bdf23..f001a649f58f 100644
--- a/drivers/net/ethernet/emulex/benet/be_ethtool.c
+++ b/drivers/net/ethernet/emulex/benet/be_ethtool.c
@@ -1271,43 +1271,45 @@ static u32 be_get_rxfh_key_size(struct net_device *netdev)
 	return RSS_HASH_KEY_LEN;
 }
 
-static int be_get_rxfh(struct net_device *netdev, u32 *indir, u8 *hkey,
-		       u8 *hfunc)
+static int be_get_rxfh(struct net_device *netdev,
+		       struct ethtool_rxfh_param *rxfh)
 {
 	struct be_adapter *adapter = netdev_priv(netdev);
 	int i;
 	struct rss_info *rss = &adapter->rss_info;
 
-	if (indir) {
+	if (rxfh->indir) {
 		for (i = 0; i < RSS_INDIR_TABLE_LEN; i++)
-			indir[i] = rss->rss_queue[i];
+			rxfh->indir[i] = rss->rss_queue[i];
 	}
 
-	if (hkey)
-		memcpy(hkey, rss->rss_hkey, RSS_HASH_KEY_LEN);
+	if (rxfh->key)
+		memcpy(rxfh->key, rss->rss_hkey, RSS_HASH_KEY_LEN);
 
-	if (hfunc)
-		*hfunc = ETH_RSS_HASH_TOP;
+	rxfh->hfunc = ETH_RSS_HASH_TOP;
 
 	return 0;
 }
 
-static int be_set_rxfh(struct net_device *netdev, const u32 *indir,
-		       const u8 *hkey, const u8 hfunc)
+static int be_set_rxfh(struct net_device *netdev,
+		       struct ethtool_rxfh_param *rxfh,
+		       struct netlink_ext_ack *extack)
 {
 	int rc = 0, i, j;
 	struct be_adapter *adapter = netdev_priv(netdev);
+	u8 *hkey = rxfh->key;
 	u8 rsstable[RSS_INDIR_TABLE_LEN];
 
 	/* We do not allow change in unsupported parameters */
-	if (hfunc != ETH_RSS_HASH_NO_CHANGE && hfunc != ETH_RSS_HASH_TOP)
+	if (rxfh->hfunc != ETH_RSS_HASH_NO_CHANGE &&
+	    rxfh->hfunc != ETH_RSS_HASH_TOP)
 		return -EOPNOTSUPP;
 
-	if (indir) {
+	if (rxfh->indir) {
 		struct be_rx_obj *rxo;
 
 		for (i = 0; i < RSS_INDIR_TABLE_LEN; i++) {
-			j = indir[i];
+			j = rxfh->indir[i];
 			rxo = &adapter->rx_obj[j];
 			rsstable[i] = rxo->rss_id;
 			adapter->rss_info.rss_queue[i] = j;
diff --git a/drivers/net/ethernet/freescale/enetc/enetc_ethtool.c b/drivers/net/ethernet/freescale/enetc/enetc_ethtool.c
index e993ed04ab57..f7753ea5b57e 100644
--- a/drivers/net/ethernet/freescale/enetc/enetc_ethtool.c
+++ b/drivers/net/ethernet/freescale/enetc/enetc_ethtool.c
@@ -690,25 +690,26 @@ static u32 enetc_get_rxfh_indir_size(struct net_device *ndev)
 	return priv->si->num_rss;
 }
 
-static int enetc_get_rxfh(struct net_device *ndev, u32 *indir, u8 *key,
-			  u8 *hfunc)
+static int enetc_get_rxfh(struct net_device *ndev,
+			  struct ethtool_rxfh_param *rxfh)
 {
 	struct enetc_ndev_priv *priv = netdev_priv(ndev);
 	struct enetc_hw *hw = &priv->si->hw;
 	int err = 0, i;
 
 	/* return hash function */
-	if (hfunc)
-		*hfunc = ETH_RSS_HASH_TOP;
+	rxfh->hfunc = ETH_RSS_HASH_TOP;
 
 	/* return hash key */
-	if (key && hw->port)
+	if (rxfh->key && hw->port)
 		for (i = 0; i < ENETC_RSSHASH_KEY_SIZE / 4; i++)
-			((u32 *)key)[i] = enetc_port_rd(hw, ENETC_PRSSK(i));
+			((u32 *)rxfh->key)[i] = enetc_port_rd(hw,
+							      ENETC_PRSSK(i));
 
 	/* return RSS table */
-	if (indir)
-		err = enetc_get_rss_table(priv->si, indir, priv->si->num_rss);
+	if (rxfh->indir)
+		err = enetc_get_rss_table(priv->si, rxfh->indir,
+					  priv->si->num_rss);
 
 	return err;
 }
@@ -722,20 +723,22 @@ void enetc_set_rss_key(struct enetc_hw *hw, const u8 *bytes)
 }
 EXPORT_SYMBOL_GPL(enetc_set_rss_key);
 
-static int enetc_set_rxfh(struct net_device *ndev, const u32 *indir,
-			  const u8 *key, const u8 hfunc)
+static int enetc_set_rxfh(struct net_device *ndev,
+			  struct ethtool_rxfh_param *rxfh,
+			  struct netlink_ext_ack *extack)
 {
 	struct enetc_ndev_priv *priv = netdev_priv(ndev);
 	struct enetc_hw *hw = &priv->si->hw;
 	int err = 0;
 
 	/* set hash key, if PF */
-	if (key && hw->port)
-		enetc_set_rss_key(hw, key);
+	if (rxfh->key && hw->port)
+		enetc_set_rss_key(hw, rxfh->key);
 
 	/* set RSS table */
-	if (indir)
-		err = enetc_set_rss_table(priv->si, indir, priv->si->num_rss);
+	if (rxfh->indir)
+		err = enetc_set_rss_table(priv->si, rxfh->indir,
+					  priv->si->num_rss);
 
 	return err;
 }
diff --git a/drivers/net/ethernet/fungible/funeth/funeth_ethtool.c b/drivers/net/ethernet/fungible/funeth/funeth_ethtool.c
index 091c93bd7587..4edd0adfc6c7 100644
--- a/drivers/net/ethernet/fungible/funeth/funeth_ethtool.c
+++ b/drivers/net/ethernet/fungible/funeth/funeth_ethtool.c
@@ -977,44 +977,44 @@ static u32 fun_get_rxfh_key_size(struct net_device *netdev)
 	return sizeof(fp->rss_key);
 }
 
-static int fun_get_rxfh(struct net_device *netdev, u32 *indir, u8 *key,
-			u8 *hfunc)
+static int fun_get_rxfh(struct net_device *netdev,
+			struct ethtool_rxfh_param *rxfh)
 {
 	const struct funeth_priv *fp = netdev_priv(netdev);
 
 	if (!fp->rss_cfg)
 		return -EOPNOTSUPP;
 
-	if (indir)
-		memcpy(indir, fp->indir_table,
+	if (rxfh->indir)
+		memcpy(rxfh->indir, fp->indir_table,
 		       sizeof(u32) * fp->indir_table_nentries);
 
-	if (key)
-		memcpy(key, fp->rss_key, sizeof(fp->rss_key));
+	if (rxfh->key)
+		memcpy(rxfh->key, fp->rss_key, sizeof(fp->rss_key));
 
-	if (hfunc)
-		*hfunc = fp->hash_algo == FUN_ETH_RSS_ALG_TOEPLITZ ?
-				ETH_RSS_HASH_TOP : ETH_RSS_HASH_CRC32;
+	rxfh->hfunc = fp->hash_algo == FUN_ETH_RSS_ALG_TOEPLITZ ?
+			ETH_RSS_HASH_TOP : ETH_RSS_HASH_CRC32;
 
 	return 0;
 }
 
-static int fun_set_rxfh(struct net_device *netdev, const u32 *indir,
-			const u8 *key, const u8 hfunc)
+static int fun_set_rxfh(struct net_device *netdev,
+			struct ethtool_rxfh_param *rxfh,
+			struct netlink_ext_ack *extack)
 {
 	struct funeth_priv *fp = netdev_priv(netdev);
-	const u32 *rss_indir = indir ? indir : fp->indir_table;
-	const u8 *rss_key = key ? key : fp->rss_key;
+	const u32 *rss_indir = rxfh->indir ? rxfh->indir : fp->indir_table;
+	const u8 *rss_key = rxfh->key ? rxfh->key : fp->rss_key;
 	enum fun_eth_hash_alg algo;
 
 	if (!fp->rss_cfg)
 		return -EOPNOTSUPP;
 
-	if (hfunc == ETH_RSS_HASH_NO_CHANGE)
+	if (rxfh->hfunc == ETH_RSS_HASH_NO_CHANGE)
 		algo = fp->hash_algo;
-	else if (hfunc == ETH_RSS_HASH_CRC32)
+	else if (rxfh->hfunc == ETH_RSS_HASH_CRC32)
 		algo = FUN_ETH_RSS_ALG_CRC32;
-	else if (hfunc == ETH_RSS_HASH_TOP)
+	else if (rxfh->hfunc == ETH_RSS_HASH_TOP)
 		algo = FUN_ETH_RSS_ALG_TOEPLITZ;
 	else
 		return -EINVAL;
@@ -1031,10 +1031,10 @@ static int fun_set_rxfh(struct net_device *netdev, const u32 *indir,
 	}
 
 	fp->hash_algo = algo;
-	if (key)
-		memcpy(fp->rss_key, key, sizeof(fp->rss_key));
-	if (indir)
-		memcpy(fp->indir_table, indir,
+	if (rxfh->key)
+		memcpy(fp->rss_key, rxfh->key, sizeof(fp->rss_key));
+	if (rxfh->indir)
+		memcpy(fp->indir_table, rxfh->indir,
 		       sizeof(u32) * fp->indir_table_nentries);
 	return 0;
 }
diff --git a/drivers/net/ethernet/hisilicon/hns/hns_ethtool.c b/drivers/net/ethernet/hisilicon/hns/hns_ethtool.c
index fe40cceb0f79..a5bb306b2cf1 100644
--- a/drivers/net/ethernet/hisilicon/hns/hns_ethtool.c
+++ b/drivers/net/ethernet/hisilicon/hns/hns_ethtool.c
@@ -1186,7 +1186,7 @@ hns_get_rss_indir_size(struct net_device *netdev)
 }
 
 static int
-hns_get_rss(struct net_device *netdev, u32 *indir, u8 *key, u8 *hfunc)
+hns_get_rss(struct net_device *netdev, struct ethtool_rxfh_param *rxfh)
 {
 	struct hns_nic_priv *priv = netdev_priv(netdev);
 	struct hnae_ae_ops *ops;
@@ -1199,15 +1199,16 @@ hns_get_rss(struct net_device *netdev, u32 *indir, u8 *key, u8 *hfunc)
 
 	ops = priv->ae_handle->dev->ops;
 
-	if (!indir)
+	if (!rxfh->indir)
 		return 0;
 
-	return ops->get_rss(priv->ae_handle, indir, key, hfunc);
+	return ops->get_rss(priv->ae_handle,
+			    rxfh->indir, rxfh->key, &rxfh->hfunc);
 }
 
 static int
-hns_set_rss(struct net_device *netdev, const u32 *indir, const u8 *key,
-	    const u8 hfunc)
+hns_set_rss(struct net_device *netdev, struct ethtool_rxfh_param *rxfh,
+	    struct netlink_ext_ack *extack)
 {
 	struct hns_nic_priv *priv = netdev_priv(netdev);
 	struct hnae_ae_ops *ops;
@@ -1220,12 +1221,14 @@ hns_set_rss(struct net_device *netdev, const u32 *indir, const u8 *key,
 
 	ops = priv->ae_handle->dev->ops;
 
-	if (hfunc != ETH_RSS_HASH_NO_CHANGE && hfunc != ETH_RSS_HASH_TOP) {
+	if (rxfh->hfunc != ETH_RSS_HASH_NO_CHANGE &&
+	    rxfh->hfunc != ETH_RSS_HASH_TOP) {
 		netdev_err(netdev, "Invalid hfunc!\n");
 		return -EOPNOTSUPP;
 	}
 
-	return ops->set_rss(priv->ae_handle, indir, key, hfunc);
+	return ops->set_rss(priv->ae_handle,
+			    rxfh->indir, rxfh->key, rxfh->hfunc);
 }
 
 static int hns_get_rxnfc(struct net_device *netdev,
diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3_ethtool.c b/drivers/net/ethernet/hisilicon/hns3/hns3_ethtool.c
index 682239f33082..999a0ee162a6 100644
--- a/drivers/net/ethernet/hisilicon/hns3/hns3_ethtool.c
+++ b/drivers/net/ethernet/hisilicon/hns3/hns3_ethtool.c
@@ -941,19 +941,21 @@ static u32 hns3_get_rss_indir_size(struct net_device *netdev)
 	return ae_dev->dev_specs.rss_ind_tbl_size;
 }
 
-static int hns3_get_rss(struct net_device *netdev, u32 *indir, u8 *key,
-			u8 *hfunc)
+static int hns3_get_rss(struct net_device *netdev,
+			struct ethtool_rxfh_param *rxfh)
 {
 	struct hnae3_handle *h = hns3_get_handle(netdev);
 
 	if (!h->ae_algo->ops->get_rss)
 		return -EOPNOTSUPP;
 
-	return h->ae_algo->ops->get_rss(h, indir, key, hfunc);
+	return h->ae_algo->ops->get_rss(h, rxfh->indir, rxfh->key,
+					&rxfh->hfunc);
 }
 
-static int hns3_set_rss(struct net_device *netdev, const u32 *indir,
-			const u8 *key, const u8 hfunc)
+static int hns3_set_rss(struct net_device *netdev,
+			struct ethtool_rxfh_param *rxfh,
+			struct netlink_ext_ack *extack)
 {
 	struct hnae3_handle *h = hns3_get_handle(netdev);
 	struct hnae3_ae_dev *ae_dev = pci_get_drvdata(h->pdev);
@@ -962,19 +964,22 @@ static int hns3_set_rss(struct net_device *netdev, const u32 *indir,
 		return -EOPNOTSUPP;
 
 	if ((ae_dev->dev_version < HNAE3_DEVICE_VERSION_V2 &&
-	     hfunc != ETH_RSS_HASH_TOP) || (hfunc != ETH_RSS_HASH_NO_CHANGE &&
-	     hfunc != ETH_RSS_HASH_TOP && hfunc != ETH_RSS_HASH_XOR)) {
+	     rxfh->hfunc != ETH_RSS_HASH_TOP) ||
+	    (rxfh->hfunc != ETH_RSS_HASH_NO_CHANGE &&
+	     rxfh->hfunc != ETH_RSS_HASH_TOP &&
+	     rxfh->hfunc != ETH_RSS_HASH_XOR)) {
 		netdev_err(netdev, "hash func not supported\n");
 		return -EOPNOTSUPP;
 	}
 
-	if (!indir) {
+	if (!rxfh->indir) {
 		netdev_err(netdev,
 			   "set rss failed for indir is empty\n");
 		return -EOPNOTSUPP;
 	}
 
-	return h->ae_algo->ops->set_rss(h, indir, key, hfunc);
+	return h->ae_algo->ops->set_rss(h, rxfh->indir, rxfh->key,
+					rxfh->hfunc);
 }
 
 static int hns3_get_rxnfc(struct net_device *netdev,
diff --git a/drivers/net/ethernet/huawei/hinic/hinic_ethtool.c b/drivers/net/ethernet/huawei/hinic/hinic_ethtool.c
index f4b680286911..0304f03d4093 100644
--- a/drivers/net/ethernet/huawei/hinic/hinic_ethtool.c
+++ b/drivers/net/ethernet/huawei/hinic/hinic_ethtool.c
@@ -1137,7 +1137,7 @@ static int hinic_set_rxnfc(struct net_device *netdev, struct ethtool_rxnfc *cmd)
 }
 
 static int hinic_get_rxfh(struct net_device *netdev,
-			  u32 *indir, u8 *key, u8 *hfunc)
+			  struct ethtool_rxfh_param *rxfh)
 {
 	struct hinic_dev *nic_dev = netdev_priv(netdev);
 	u8 hash_engine_type = 0;
@@ -1146,32 +1146,33 @@ static int hinic_get_rxfh(struct net_device *netdev,
 	if (!(nic_dev->flags & HINIC_RSS_ENABLE))
 		return -EOPNOTSUPP;
 
-	if (hfunc) {
-		err = hinic_rss_get_hash_engine(nic_dev,
-						nic_dev->rss_tmpl_idx,
-						&hash_engine_type);
-		if (err)
-			return -EFAULT;
+	err = hinic_rss_get_hash_engine(nic_dev,
+					nic_dev->rss_tmpl_idx,
+					&hash_engine_type);
+	if (err)
+		return -EFAULT;
 
-		*hfunc = hash_engine_type ? ETH_RSS_HASH_TOP : ETH_RSS_HASH_XOR;
-	}
+	rxfh->hfunc = hash_engine_type ? ETH_RSS_HASH_TOP : ETH_RSS_HASH_XOR;
 
-	if (indir) {
+	if (rxfh->indir) {
 		err = hinic_rss_get_indir_tbl(nic_dev,
-					      nic_dev->rss_tmpl_idx, indir);
+					      nic_dev->rss_tmpl_idx,
+					      rxfh->indir);
 		if (err)
 			return -EFAULT;
 	}
 
-	if (key)
+	if (rxfh->key)
 		err = hinic_rss_get_template_tbl(nic_dev,
-						 nic_dev->rss_tmpl_idx, key);
+						 nic_dev->rss_tmpl_idx,
+						 rxfh->key);
 
 	return err;
 }
 
-static int hinic_set_rxfh(struct net_device *netdev, const u32 *indir,
-			  const u8 *key, const u8 hfunc)
+static int hinic_set_rxfh(struct net_device *netdev,
+			  struct ethtool_rxfh_param *rxfh,
+			  struct netlink_ext_ack *extack)
 {
 	struct hinic_dev *nic_dev = netdev_priv(netdev);
 	int err = 0;
@@ -1179,11 +1180,12 @@ static int hinic_set_rxfh(struct net_device *netdev, const u32 *indir,
 	if (!(nic_dev->flags & HINIC_RSS_ENABLE))
 		return -EOPNOTSUPP;
 
-	if (hfunc != ETH_RSS_HASH_NO_CHANGE) {
-		if (hfunc != ETH_RSS_HASH_TOP && hfunc != ETH_RSS_HASH_XOR)
+	if (rxfh->hfunc != ETH_RSS_HASH_NO_CHANGE) {
+		if (rxfh->hfunc != ETH_RSS_HASH_TOP &&
+		    rxfh->hfunc != ETH_RSS_HASH_XOR)
 			return -EOPNOTSUPP;
 
-		nic_dev->rss_hash_engine = (hfunc == ETH_RSS_HASH_XOR) ?
+		nic_dev->rss_hash_engine = (rxfh->hfunc == ETH_RSS_HASH_XOR) ?
 			HINIC_RSS_HASH_ENGINE_TYPE_XOR :
 			HINIC_RSS_HASH_ENGINE_TYPE_TOEP;
 		err = hinic_rss_set_hash_engine
@@ -1193,7 +1195,7 @@ static int hinic_set_rxfh(struct net_device *netdev, const u32 *indir,
 			return -EFAULT;
 	}
 
-	err = __set_rss_rxfh(netdev, indir, key);
+	err = __set_rss_rxfh(netdev, rxfh->indir, rxfh->key);
 
 	return err;
 }
diff --git a/drivers/net/ethernet/intel/fm10k/fm10k_ethtool.c b/drivers/net/ethernet/intel/fm10k/fm10k_ethtool.c
index 13a05604dcc0..1bc5b6c0b897 100644
--- a/drivers/net/ethernet/intel/fm10k/fm10k_ethtool.c
+++ b/drivers/net/ethernet/intel/fm10k/fm10k_ethtool.c
@@ -1057,16 +1057,16 @@ static u32 fm10k_get_rssrk_size(struct net_device __always_unused *netdev)
 	return FM10K_RSSRK_SIZE * FM10K_RSSRK_ENTRIES_PER_REG;
 }
 
-static int fm10k_get_rssh(struct net_device *netdev, u32 *indir, u8 *key,
-			  u8 *hfunc)
+static int fm10k_get_rssh(struct net_device *netdev,
+			  struct ethtool_rxfh_param *rxfh)
 {
 	struct fm10k_intfc *interface = netdev_priv(netdev);
+	u8 *key = rxfh->key;
 	int i, err;
 
-	if (hfunc)
-		*hfunc = ETH_RSS_HASH_TOP;
+	rxfh->hfunc = ETH_RSS_HASH_TOP;
 
-	err = fm10k_get_reta(netdev, indir);
+	err = fm10k_get_reta(netdev, rxfh->indir);
 	if (err || !key)
 		return err;
 
@@ -1076,23 +1076,25 @@ static int fm10k_get_rssh(struct net_device *netdev, u32 *indir, u8 *key,
 	return 0;
 }
 
-static int fm10k_set_rssh(struct net_device *netdev, const u32 *indir,
-			  const u8 *key, const u8 hfunc)
+static int fm10k_set_rssh(struct net_device *netdev,
+			  struct ethtool_rxfh_param *rxfh,
+			  struct netlink_ext_ack *extack)
 {
 	struct fm10k_intfc *interface = netdev_priv(netdev);
 	struct fm10k_hw *hw = &interface->hw;
 	int i, err;
 
 	/* We do not allow change in unsupported parameters */
-	if (hfunc != ETH_RSS_HASH_NO_CHANGE && hfunc != ETH_RSS_HASH_TOP)
+	if (rxfh->hfunc != ETH_RSS_HASH_NO_CHANGE &&
+	    rxfh->hfunc != ETH_RSS_HASH_TOP)
 		return -EOPNOTSUPP;
 
-	err = fm10k_set_reta(netdev, indir);
-	if (err || !key)
+	err = fm10k_set_reta(netdev, rxfh->indir);
+	if (err || !rxfh->key)
 		return err;
 
-	for (i = 0; i < FM10K_RSSRK_SIZE; i++, key += 4) {
-		u32 rssrk = le32_to_cpu(*(__le32 *)key);
+	for (i = 0; i < FM10K_RSSRK_SIZE; i++, rxfh->key += 4) {
+		u32 rssrk = le32_to_cpu(*(__le32 *)rxfh->key);
 
 		if (interface->rssrk[i] == rssrk)
 			continue;
diff --git a/drivers/net/ethernet/intel/i40e/i40e_ethtool.c b/drivers/net/ethernet/intel/i40e/i40e_ethtool.c
index a0b10230268d..53820d63e6fa 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_ethtool.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_ethtool.c
@@ -5120,15 +5120,13 @@ static u32 i40e_get_rxfh_indir_size(struct net_device *netdev)
 /**
  * i40e_get_rxfh - get the rx flow hash indirection table
  * @netdev: network interface device structure
- * @indir: indirection table
- * @key: hash key
- * @hfunc: hash function
+ * @rxfh: pointer to param struct (indir, key, hfunc)
  *
  * Reads the indirection table directly from the hardware. Returns 0 on
  * success.
  **/
-static int i40e_get_rxfh(struct net_device *netdev, u32 *indir, u8 *key,
-			 u8 *hfunc)
+static int i40e_get_rxfh(struct net_device *netdev,
+			 struct ethtool_rxfh_param *rxfh)
 {
 	struct i40e_netdev_priv *np = netdev_priv(netdev);
 	struct i40e_vsi *vsi = np->vsi;
@@ -5136,13 +5134,12 @@ static int i40e_get_rxfh(struct net_device *netdev, u32 *indir, u8 *key,
 	int ret;
 	u16 i;
 
-	if (hfunc)
-		*hfunc = ETH_RSS_HASH_TOP;
+	rxfh->hfunc = ETH_RSS_HASH_TOP;
 
-	if (!indir)
+	if (!rxfh->indir)
 		return 0;
 
-	seed = key;
+	seed = rxfh->key;
 	lut = kzalloc(I40E_HLUT_ARRAY_SIZE, GFP_KERNEL);
 	if (!lut)
 		return -ENOMEM;
@@ -5150,7 +5147,7 @@ static int i40e_get_rxfh(struct net_device *netdev, u32 *indir, u8 *key,
 	if (ret)
 		goto out;
 	for (i = 0; i < I40E_HLUT_ARRAY_SIZE; i++)
-		indir[i] = (u32)(lut[i]);
+		rxfh->indir[i] = (u32)(lut[i]);
 
 out:
 	kfree(lut);
@@ -5161,15 +5158,15 @@ out:
 /**
  * i40e_set_rxfh - set the rx flow hash indirection table
  * @netdev: network interface device structure
- * @indir: indirection table
- * @key: hash key
- * @hfunc: hash function to use
+ * @rxfh: pointer to param struct (indir, key, hfunc)
+ * @extack: extended ACK from the Netlink message
  *
  * Returns -EINVAL if the table specifies an invalid queue id, otherwise
  * returns 0 after programming the table.
  **/
-static int i40e_set_rxfh(struct net_device *netdev, const u32 *indir,
-			 const u8 *key, const u8 hfunc)
+static int i40e_set_rxfh(struct net_device *netdev,
+			 struct ethtool_rxfh_param *rxfh,
+			 struct netlink_ext_ack *extack)
 {
 	struct i40e_netdev_priv *np = netdev_priv(netdev);
 	struct i40e_vsi *vsi = np->vsi;
@@ -5177,17 +5174,18 @@ static int i40e_set_rxfh(struct net_device *netdev, const u32 *indir,
 	u8 *seed = NULL;
 	u16 i;
 
-	if (hfunc != ETH_RSS_HASH_NO_CHANGE && hfunc != ETH_RSS_HASH_TOP)
+	if (rxfh->hfunc != ETH_RSS_HASH_NO_CHANGE &&
+	    rxfh->hfunc != ETH_RSS_HASH_TOP)
 		return -EOPNOTSUPP;
 
-	if (key) {
+	if (rxfh->key) {
 		if (!vsi->rss_hkey_user) {
 			vsi->rss_hkey_user = kzalloc(I40E_HKEY_ARRAY_SIZE,
 						     GFP_KERNEL);
 			if (!vsi->rss_hkey_user)
 				return -ENOMEM;
 		}
-		memcpy(vsi->rss_hkey_user, key, I40E_HKEY_ARRAY_SIZE);
+		memcpy(vsi->rss_hkey_user, rxfh->key, I40E_HKEY_ARRAY_SIZE);
 		seed = vsi->rss_hkey_user;
 	}
 	if (!vsi->rss_lut_user) {
@@ -5197,9 +5195,9 @@ static int i40e_set_rxfh(struct net_device *netdev, const u32 *indir,
 	}
 
 	/* Each 32 bits pointed by 'indir' is stored with a lut entry */
-	if (indir)
+	if (rxfh->indir)
 		for (i = 0; i < I40E_HLUT_ARRAY_SIZE; i++)
-			vsi->rss_lut_user[i] = (u8)(indir[i]);
+			vsi->rss_lut_user[i] = (u8)(rxfh->indir[i]);
 	else
 		i40e_fill_rss_lut(pf, vsi->rss_lut_user, I40E_HLUT_ARRAY_SIZE,
 				  vsi->rss_size);
diff --git a/drivers/net/ethernet/intel/iavf/iavf_ethtool.c b/drivers/net/ethernet/intel/iavf/iavf_ethtool.c
index 631c7cfdc814..c376a489e4c2 100644
--- a/drivers/net/ethernet/intel/iavf/iavf_ethtool.c
+++ b/drivers/net/ethernet/intel/iavf/iavf_ethtool.c
@@ -1894,27 +1894,24 @@ static u32 iavf_get_rxfh_indir_size(struct net_device *netdev)
 /**
  * iavf_get_rxfh - get the rx flow hash indirection table
  * @netdev: network interface device structure
- * @indir: indirection table
- * @key: hash key
- * @hfunc: hash function in use
+ * @rxfh: pointer to param struct (indir, key, hfunc)
  *
  * Reads the indirection table directly from the hardware. Always returns 0.
  **/
-static int iavf_get_rxfh(struct net_device *netdev, u32 *indir, u8 *key,
-			 u8 *hfunc)
+static int iavf_get_rxfh(struct net_device *netdev,
+			 struct ethtool_rxfh_param *rxfh)
 {
 	struct iavf_adapter *adapter = netdev_priv(netdev);
 	u16 i;
 
-	if (hfunc)
-		*hfunc = ETH_RSS_HASH_TOP;
-	if (key)
-		memcpy(key, adapter->rss_key, adapter->rss_key_size);
+	rxfh->hfunc = ETH_RSS_HASH_TOP;
+	if (rxfh->key)
+		memcpy(rxfh->key, adapter->rss_key, adapter->rss_key_size);
 
-	if (indir)
+	if (rxfh->indir)
 		/* Each 32 bits pointed by 'indir' is stored with a lut entry */
 		for (i = 0; i < adapter->rss_lut_size; i++)
-			indir[i] = (u32)adapter->rss_lut[i];
+			rxfh->indir[i] = (u32)adapter->rss_lut[i];
 
 	return 0;
 }
@@ -1922,33 +1919,34 @@ static int iavf_get_rxfh(struct net_device *netdev, u32 *indir, u8 *key,
 /**
  * iavf_set_rxfh - set the rx flow hash indirection table
  * @netdev: network interface device structure
- * @indir: indirection table
- * @key: hash key
- * @hfunc: hash function to use
+ * @rxfh: pointer to param struct (indir, key, hfunc)
+ * @extack: extended ACK from the Netlink message
  *
  * Returns -EINVAL if the table specifies an invalid queue id, otherwise
  * returns 0 after programming the table.
  **/
-static int iavf_set_rxfh(struct net_device *netdev, const u32 *indir,
-			 const u8 *key, const u8 hfunc)
+static int iavf_set_rxfh(struct net_device *netdev,
+			 struct ethtool_rxfh_param *rxfh,
+			 struct netlink_ext_ack *extack)
 {
 	struct iavf_adapter *adapter = netdev_priv(netdev);
 	u16 i;
 
 	/* Only support toeplitz hash function */
-	if (hfunc != ETH_RSS_HASH_NO_CHANGE && hfunc != ETH_RSS_HASH_TOP)
+	if (rxfh->hfunc != ETH_RSS_HASH_NO_CHANGE &&
+	    rxfh->hfunc != ETH_RSS_HASH_TOP)
 		return -EOPNOTSUPP;
 
-	if (!key && !indir)
+	if (!rxfh->key && !rxfh->indir)
 		return 0;
 
-	if (key)
-		memcpy(adapter->rss_key, key, adapter->rss_key_size);
+	if (rxfh->key)
+		memcpy(adapter->rss_key, rxfh->key, adapter->rss_key_size);
 
-	if (indir) {
+	if (rxfh->indir) {
 		/* Each 32 bits pointed by 'indir' is stored with a lut entry */
 		for (i = 0; i < adapter->rss_lut_size; i++)
-			adapter->rss_lut[i] = (u8)(indir[i]);
+			adapter->rss_lut[i] = (u8)(rxfh->indir[i]);
 	}
 
 	return iavf_config_rss(adapter);
diff --git a/drivers/net/ethernet/intel/ice/ice_ethtool.c b/drivers/net/ethernet/intel/ice/ice_ethtool.c
index 98c9317581e0..70cbd8092aea 100644
--- a/drivers/net/ethernet/intel/ice/ice_ethtool.c
+++ b/drivers/net/ethernet/intel/ice/ice_ethtool.c
@@ -3196,8 +3196,8 @@ static u32 ice_get_rxfh_indir_size(struct net_device *netdev)
 }
 
 static int
-ice_get_rxfh_context(struct net_device *netdev, u32 *indir,
-		     u8 *key, u8 *hfunc, u32 rss_context)
+ice_get_rxfh_context(struct net_device *netdev,
+		     struct ethtool_rxfh_param *rxfh, u32 rss_context)
 {
 	struct ice_netdev_priv *np = netdev_priv(netdev);
 	struct ice_vsi *vsi = np->vsi;
@@ -3230,17 +3230,16 @@ ice_get_rxfh_context(struct net_device *netdev, u32 *indir,
 		vsi = vsi->tc_map_vsi[rss_context];
 	}
 
-	if (hfunc)
-		*hfunc = ETH_RSS_HASH_TOP;
+	rxfh->hfunc = ETH_RSS_HASH_TOP;
 
-	if (!indir)
+	if (!rxfh->indir)
 		return 0;
 
 	lut = kzalloc(vsi->rss_table_size, GFP_KERNEL);
 	if (!lut)
 		return -ENOMEM;
 
-	err = ice_get_rss_key(vsi, key);
+	err = ice_get_rss_key(vsi, rxfh->key);
 	if (err)
 		goto out;
 
@@ -3250,12 +3249,12 @@ ice_get_rxfh_context(struct net_device *netdev, u32 *indir,
 
 	if (ice_is_adq_active(pf)) {
 		for (i = 0; i < vsi->rss_table_size; i++)
-			indir[i] = offset + lut[i] % qcount;
+			rxfh->indir[i] = offset + lut[i] % qcount;
 		goto out;
 	}
 
 	for (i = 0; i < vsi->rss_table_size; i++)
-		indir[i] = lut[i];
+		rxfh->indir[i] = lut[i];
 
 out:
 	kfree(lut);
@@ -3265,31 +3264,28 @@ out:
 /**
  * ice_get_rxfh - get the Rx flow hash indirection table
  * @netdev: network interface device structure
- * @indir: indirection table
- * @key: hash key
- * @hfunc: hash function
+ * @rxfh: pointer to param struct (indir, key, hfunc)
  *
  * Reads the indirection table directly from the hardware.
  */
 static int
-ice_get_rxfh(struct net_device *netdev, u32 *indir, u8 *key, u8 *hfunc)
+ice_get_rxfh(struct net_device *netdev, struct ethtool_rxfh_param *rxfh)
 {
-	return ice_get_rxfh_context(netdev, indir, key, hfunc, 0);
+	return ice_get_rxfh_context(netdev, rxfh, 0);
 }
 
 /**
  * ice_set_rxfh - set the Rx flow hash indirection table
  * @netdev: network interface device structure
- * @indir: indirection table
- * @key: hash key
- * @hfunc: hash function
+ * @rxfh: pointer to param struct (indir, key, hfunc)
+ * @extack: extended ACK from the Netlink message
  *
  * Returns -EINVAL if the table specifies an invalid queue ID, otherwise
  * returns 0 after programming the table.
  */
 static int
-ice_set_rxfh(struct net_device *netdev, const u32 *indir, const u8 *key,
-	     const u8 hfunc)
+ice_set_rxfh(struct net_device *netdev, struct ethtool_rxfh_param *rxfh,
+	     struct netlink_ext_ack *extack)
 {
 	struct ice_netdev_priv *np = netdev_priv(netdev);
 	struct ice_vsi *vsi = np->vsi;
@@ -3298,7 +3294,8 @@ ice_set_rxfh(struct net_device *netdev, const u32 *indir, const u8 *key,
 	int err;
 
 	dev = ice_pf_to_dev(pf);
-	if (hfunc != ETH_RSS_HASH_NO_CHANGE && hfunc != ETH_RSS_HASH_TOP)
+	if (rxfh->hfunc != ETH_RSS_HASH_NO_CHANGE &&
+	    rxfh->hfunc != ETH_RSS_HASH_TOP)
 		return -EOPNOTSUPP;
 
 	if (!test_bit(ICE_FLAG_RSS_ENA, pf->flags)) {
@@ -3312,7 +3309,7 @@ ice_set_rxfh(struct net_device *netdev, const u32 *indir, const u8 *key,
 		return -EOPNOTSUPP;
 	}
 
-	if (key) {
+	if (rxfh->key) {
 		if (!vsi->rss_hkey_user) {
 			vsi->rss_hkey_user =
 				devm_kzalloc(dev, ICE_VSIQF_HKEY_ARRAY_SIZE,
@@ -3320,7 +3317,8 @@ ice_set_rxfh(struct net_device *netdev, const u32 *indir, const u8 *key,
 			if (!vsi->rss_hkey_user)
 				return -ENOMEM;
 		}
-		memcpy(vsi->rss_hkey_user, key, ICE_VSIQF_HKEY_ARRAY_SIZE);
+		memcpy(vsi->rss_hkey_user, rxfh->key,
+		       ICE_VSIQF_HKEY_ARRAY_SIZE);
 
 		err = ice_set_rss_key(vsi, vsi->rss_hkey_user);
 		if (err)
@@ -3335,11 +3333,11 @@ ice_set_rxfh(struct net_device *netdev, const u32 *indir, const u8 *key,
 	}
 
 	/* Each 32 bits pointed by 'indir' is stored with a lut entry */
-	if (indir) {
+	if (rxfh->indir) {
 		int i;
 
 		for (i = 0; i < vsi->rss_table_size; i++)
-			vsi->rss_lut_user[i] = (u8)(indir[i]);
+			vsi->rss_lut_user[i] = (u8)(rxfh->indir[i]);
 	} else {
 		ice_fill_rss_lut(vsi->rss_lut_user, vsi->rss_table_size,
 				 vsi->rss_size);
diff --git a/drivers/net/ethernet/intel/idpf/idpf_ethtool.c b/drivers/net/ethernet/intel/idpf/idpf_ethtool.c
index d549d3c5910d..986d429d1175 100644
--- a/drivers/net/ethernet/intel/idpf/idpf_ethtool.c
+++ b/drivers/net/ethernet/intel/idpf/idpf_ethtool.c
@@ -75,14 +75,12 @@ static u32 idpf_get_rxfh_indir_size(struct net_device *netdev)
 /**
  * idpf_get_rxfh - get the rx flow hash indirection table
  * @netdev: network interface device structure
- * @indir: indirection table
- * @key: hash key
- * @hfunc: hash function in use
+ * @rxfh: pointer to param struct (indir, key, hfunc)
  *
  * Reads the indirection table directly from the hardware. Always returns 0.
  */
-static int idpf_get_rxfh(struct net_device *netdev, u32 *indir, u8 *key,
-			 u8 *hfunc)
+static int idpf_get_rxfh(struct net_device *netdev,
+			 struct ethtool_rxfh_param *rxfh)
 {
 	struct idpf_netdev_priv *np = netdev_priv(netdev);
 	struct idpf_rss_data *rss_data;
@@ -103,15 +101,14 @@ static int idpf_get_rxfh(struct net_device *netdev, u32 *indir, u8 *key,
 	if (np->state != __IDPF_VPORT_UP)
 		goto unlock_mutex;
 
-	if (hfunc)
-		*hfunc = ETH_RSS_HASH_TOP;
+	rxfh->hfunc = ETH_RSS_HASH_TOP;
 
-	if (key)
-		memcpy(key, rss_data->rss_key, rss_data->rss_key_size);
+	if (rxfh->key)
+		memcpy(rxfh->key, rss_data->rss_key, rss_data->rss_key_size);
 
-	if (indir) {
+	if (rxfh->indir) {
 		for (i = 0; i < rss_data->rss_lut_size; i++)
-			indir[i] = rss_data->rss_lut[i];
+			rxfh->indir[i] = rss_data->rss_lut[i];
 	}
 
 unlock_mutex:
@@ -123,15 +120,15 @@ unlock_mutex:
 /**
  * idpf_set_rxfh - set the rx flow hash indirection table
  * @netdev: network interface device structure
- * @indir: indirection table
- * @key: hash key
- * @hfunc: hash function to use
+ * @rxfh: pointer to param struct (indir, key, hfunc)
+ * @extack: extended ACK from the Netlink message
  *
  * Returns -EINVAL if the table specifies an invalid queue id, otherwise
  * returns 0 after programming the table.
  */
-static int idpf_set_rxfh(struct net_device *netdev, const u32 *indir,
-			 const u8 *key, const u8 hfunc)
+static int idpf_set_rxfh(struct net_device *netdev,
+			 struct ethtool_rxfh_param *rxfh,
+			 struct netlink_ext_ack *extack)
 {
 	struct idpf_netdev_priv *np = netdev_priv(netdev);
 	struct idpf_rss_data *rss_data;
@@ -154,17 +151,18 @@ static int idpf_set_rxfh(struct net_device *netdev, const u32 *indir,
 	if (np->state != __IDPF_VPORT_UP)
 		goto unlock_mutex;
 
-	if (hfunc != ETH_RSS_HASH_NO_CHANGE && hfunc != ETH_RSS_HASH_TOP) {
+	if (rxfh->hfunc != ETH_RSS_HASH_NO_CHANGE &&
+	    rxfh->hfunc != ETH_RSS_HASH_TOP) {
 		err = -EOPNOTSUPP;
 		goto unlock_mutex;
 	}
 
-	if (key)
-		memcpy(rss_data->rss_key, key, rss_data->rss_key_size);
+	if (rxfh->key)
+		memcpy(rss_data->rss_key, rxfh->key, rss_data->rss_key_size);
 
-	if (indir) {
+	if (rxfh->indir) {
 		for (lut = 0; lut < rss_data->rss_lut_size; lut++)
-			rss_data->rss_lut[lut] = indir[lut];
+			rss_data->rss_lut[lut] = rxfh->indir[lut];
 	}
 
 	err = idpf_config_rss(vport);
diff --git a/drivers/net/ethernet/intel/igb/igb_ethtool.c b/drivers/net/ethernet/intel/igb/igb_ethtool.c
index 89dac7b215e5..d868a70732f4 100644
--- a/drivers/net/ethernet/intel/igb/igb_ethtool.c
+++ b/drivers/net/ethernet/intel/igb/igb_ethtool.c
@@ -3280,18 +3280,17 @@ static u32 igb_get_rxfh_indir_size(struct net_device *netdev)
 	return IGB_RETA_SIZE;
 }
 
-static int igb_get_rxfh(struct net_device *netdev, u32 *indir, u8 *key,
-			u8 *hfunc)
+static int igb_get_rxfh(struct net_device *netdev,
+			struct ethtool_rxfh_param *rxfh)
 {
 	struct igb_adapter *adapter = netdev_priv(netdev);
 	int i;
 
-	if (hfunc)
-		*hfunc = ETH_RSS_HASH_TOP;
-	if (!indir)
+	rxfh->hfunc = ETH_RSS_HASH_TOP;
+	if (!rxfh->indir)
 		return 0;
 	for (i = 0; i < IGB_RETA_SIZE; i++)
-		indir[i] = adapter->rss_indir_tbl[i];
+		rxfh->indir[i] = adapter->rss_indir_tbl[i];
 
 	return 0;
 }
@@ -3331,8 +3330,9 @@ void igb_write_rss_indir_tbl(struct igb_adapter *adapter)
 	}
 }
 
-static int igb_set_rxfh(struct net_device *netdev, const u32 *indir,
-			const u8 *key, const u8 hfunc)
+static int igb_set_rxfh(struct net_device *netdev,
+			struct ethtool_rxfh_param *rxfh,
+			struct netlink_ext_ack *extack)
 {
 	struct igb_adapter *adapter = netdev_priv(netdev);
 	struct e1000_hw *hw = &adapter->hw;
@@ -3340,10 +3340,11 @@ static int igb_set_rxfh(struct net_device *netdev, const u32 *indir,
 	u32 num_queues;
 
 	/* We do not allow change in unsupported parameters */
-	if (key ||
-	    (hfunc != ETH_RSS_HASH_NO_CHANGE && hfunc != ETH_RSS_HASH_TOP))
+	if (rxfh->key ||
+	    (rxfh->hfunc != ETH_RSS_HASH_NO_CHANGE &&
+	     rxfh->hfunc != ETH_RSS_HASH_TOP))
 		return -EOPNOTSUPP;
-	if (!indir)
+	if (!rxfh->indir)
 		return 0;
 
 	num_queues = adapter->rss_queues;
@@ -3360,12 +3361,12 @@ static int igb_set_rxfh(struct net_device *netdev, const u32 *indir,
 
 	/* Verify user input. */
 	for (i = 0; i < IGB_RETA_SIZE; i++)
-		if (indir[i] >= num_queues)
+		if (rxfh->indir[i] >= num_queues)
 			return -EINVAL;
 
 
 	for (i = 0; i < IGB_RETA_SIZE; i++)
-		adapter->rss_indir_tbl[i] = indir[i];
+		adapter->rss_indir_tbl[i] = rxfh->indir[i];
 
 	igb_write_rss_indir_tbl(adapter);
 
diff --git a/drivers/net/ethernet/intel/igc/igc_ethtool.c b/drivers/net/ethernet/intel/igc/igc_ethtool.c
index 2ed92bf34059..684f6f2a0572 100644
--- a/drivers/net/ethernet/intel/igc/igc_ethtool.c
+++ b/drivers/net/ethernet/intel/igc/igc_ethtool.c
@@ -1426,45 +1426,46 @@ static u32 igc_ethtool_get_rxfh_indir_size(struct net_device *netdev)
 	return IGC_RETA_SIZE;
 }
 
-static int igc_ethtool_get_rxfh(struct net_device *netdev, u32 *indir, u8 *key,
-				u8 *hfunc)
+static int igc_ethtool_get_rxfh(struct net_device *netdev,
+				struct ethtool_rxfh_param *rxfh)
 {
 	struct igc_adapter *adapter = netdev_priv(netdev);
 	int i;
 
-	if (hfunc)
-		*hfunc = ETH_RSS_HASH_TOP;
-	if (!indir)
+	rxfh->hfunc = ETH_RSS_HASH_TOP;
+	if (!rxfh->indir)
 		return 0;
 	for (i = 0; i < IGC_RETA_SIZE; i++)
-		indir[i] = adapter->rss_indir_tbl[i];
+		rxfh->indir[i] = adapter->rss_indir_tbl[i];
 
 	return 0;
 }
 
-static int igc_ethtool_set_rxfh(struct net_device *netdev, const u32 *indir,
-				const u8 *key, const u8 hfunc)
+static int igc_ethtool_set_rxfh(struct net_device *netdev,
+				struct ethtool_rxfh_param *rxfh,
+				struct netlink_ext_ack *extack)
 {
 	struct igc_adapter *adapter = netdev_priv(netdev);
 	u32 num_queues;
 	int i;
 
 	/* We do not allow change in unsupported parameters */
-	if (key ||
-	    (hfunc != ETH_RSS_HASH_NO_CHANGE && hfunc != ETH_RSS_HASH_TOP))
+	if (rxfh->key ||
+	    (rxfh->hfunc != ETH_RSS_HASH_NO_CHANGE &&
+	     rxfh->hfunc != ETH_RSS_HASH_TOP))
 		return -EOPNOTSUPP;
-	if (!indir)
+	if (!rxfh->indir)
 		return 0;
 
 	num_queues = adapter->rss_queues;
 
 	/* Verify user input. */
 	for (i = 0; i < IGC_RETA_SIZE; i++)
-		if (indir[i] >= num_queues)
+		if (rxfh->indir[i] >= num_queues)
 			return -EINVAL;
 
 	for (i = 0; i < IGC_RETA_SIZE; i++)
-		adapter->rss_indir_tbl[i] = indir[i];
+		adapter->rss_indir_tbl[i] = rxfh->indir[i];
 
 	igc_write_rss_indir_tbl(adapter);
 
diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_ethtool.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_ethtool.c
index dd722b0381e0..af55f19d5bba 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_ethtool.c
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_ethtool.c
@@ -3107,35 +3107,37 @@ static void ixgbe_get_reta(struct ixgbe_adapter *adapter, u32 *indir)
 		indir[i] = adapter->rss_indir_tbl[i] & rss_m;
 }
 
-static int ixgbe_get_rxfh(struct net_device *netdev, u32 *indir, u8 *key,
-			  u8 *hfunc)
+static int ixgbe_get_rxfh(struct net_device *netdev,
+			  struct ethtool_rxfh_param *rxfh)
 {
 	struct ixgbe_adapter *adapter = netdev_priv(netdev);
 
-	if (hfunc)
-		*hfunc = ETH_RSS_HASH_TOP;
+	rxfh->hfunc = ETH_RSS_HASH_TOP;
 
-	if (indir)
-		ixgbe_get_reta(adapter, indir);
+	if (rxfh->indir)
+		ixgbe_get_reta(adapter, rxfh->indir);
 
-	if (key)
-		memcpy(key, adapter->rss_key, ixgbe_get_rxfh_key_size(netdev));
+	if (rxfh->key)
+		memcpy(rxfh->key, adapter->rss_key,
+		       ixgbe_get_rxfh_key_size(netdev));
 
 	return 0;
 }
 
-static int ixgbe_set_rxfh(struct net_device *netdev, const u32 *indir,
-			  const u8 *key, const u8 hfunc)
+static int ixgbe_set_rxfh(struct net_device *netdev,
+			  struct ethtool_rxfh_param *rxfh,
+			  struct netlink_ext_ack *extack)
 {
 	struct ixgbe_adapter *adapter = netdev_priv(netdev);
 	int i;
 	u32 reta_entries = ixgbe_rss_indir_tbl_entries(adapter);
 
-	if (hfunc != ETH_RSS_HASH_NO_CHANGE && hfunc != ETH_RSS_HASH_TOP)
+	if (rxfh->hfunc != ETH_RSS_HASH_NO_CHANGE &&
+	    rxfh->hfunc != ETH_RSS_HASH_TOP)
 		return -EOPNOTSUPP;
 
 	/* Fill out the redirection table */
-	if (indir) {
+	if (rxfh->indir) {
 		int max_queues = min_t(int, adapter->num_rx_queues,
 				       ixgbe_rss_indir_tbl_max(adapter));
 
@@ -3146,18 +3148,19 @@ static int ixgbe_set_rxfh(struct net_device *netdev, const u32 *indir,
 
 		/* Verify user input. */
 		for (i = 0; i < reta_entries; i++)
-			if (indir[i] >= max_queues)
+			if (rxfh->indir[i] >= max_queues)
 				return -EINVAL;
 
 		for (i = 0; i < reta_entries; i++)
-			adapter->rss_indir_tbl[i] = indir[i];
+			adapter->rss_indir_tbl[i] = rxfh->indir[i];
 
 		ixgbe_store_reta(adapter);
 	}
 
 	/* Fill out the rss hash key */
-	if (key) {
-		memcpy(adapter->rss_key, key, ixgbe_get_rxfh_key_size(netdev));
+	if (rxfh->key) {
+		memcpy(adapter->rss_key, rxfh->key,
+		       ixgbe_get_rxfh_key_size(netdev));
 		ixgbe_store_key(adapter);
 	}
 
diff --git a/drivers/net/ethernet/intel/ixgbevf/ethtool.c b/drivers/net/ethernet/intel/ixgbevf/ethtool.c
index 296915414a7c..7ac53171b041 100644
--- a/drivers/net/ethernet/intel/ixgbevf/ethtool.c
+++ b/drivers/net/ethernet/intel/ixgbevf/ethtool.c
@@ -897,40 +897,41 @@ static u32 ixgbevf_get_rxfh_key_size(struct net_device *netdev)
 	return IXGBEVF_RSS_HASH_KEY_SIZE;
 }
 
-static int ixgbevf_get_rxfh(struct net_device *netdev, u32 *indir, u8 *key,
-			    u8 *hfunc)
+static int ixgbevf_get_rxfh(struct net_device *netdev,
+			    struct ethtool_rxfh_param *rxfh)
 {
 	struct ixgbevf_adapter *adapter = netdev_priv(netdev);
 	int err = 0;
 
-	if (hfunc)
-		*hfunc = ETH_RSS_HASH_TOP;
+	rxfh->hfunc = ETH_RSS_HASH_TOP;
 
 	if (adapter->hw.mac.type >= ixgbe_mac_X550_vf) {
-		if (key)
-			memcpy(key, adapter->rss_key,
+		if (rxfh->key)
+			memcpy(rxfh->key, adapter->rss_key,
 			       ixgbevf_get_rxfh_key_size(netdev));
 
-		if (indir) {
+		if (rxfh->indir) {
 			int i;
 
 			for (i = 0; i < IXGBEVF_X550_VFRETA_SIZE; i++)
-				indir[i] = adapter->rss_indir_tbl[i];
+				rxfh->indir[i] = adapter->rss_indir_tbl[i];
 		}
 	} else {
 		/* If neither indirection table nor hash key was requested
 		 *  - just return a success avoiding taking any locks.
 		 */
-		if (!indir && !key)
+		if (!rxfh->indir && !rxfh->key)
 			return 0;
 
 		spin_lock_bh(&adapter->mbx_lock);
-		if (indir)
-			err = ixgbevf_get_reta_locked(&adapter->hw, indir,
+		if (rxfh->indir)
+			err = ixgbevf_get_reta_locked(&adapter->hw,
+						      rxfh->indir,
 						      adapter->num_rx_queues);
 
-		if (!err && key)
-			err = ixgbevf_get_rss_key_locked(&adapter->hw, key);
+		if (!err && rxfh->key)
+			err = ixgbevf_get_rss_key_locked(&adapter->hw,
+							 rxfh->key);
 
 		spin_unlock_bh(&adapter->mbx_lock);
 	}
diff --git a/drivers/net/ethernet/marvell/mvneta.c b/drivers/net/ethernet/marvell/mvneta.c
index 29aac327574d..a641b3534ca3 100644
--- a/drivers/net/ethernet/marvell/mvneta.c
+++ b/drivers/net/ethernet/marvell/mvneta.c
@@ -5030,8 +5030,9 @@ static int  mvneta_config_rss(struct mvneta_port *pp)
 	return 0;
 }
 
-static int mvneta_ethtool_set_rxfh(struct net_device *dev, const u32 *indir,
-				   const u8 *key, const u8 hfunc)
+static int mvneta_ethtool_set_rxfh(struct net_device *dev,
+				   struct ethtool_rxfh_param *rxfh,
+				   struct netlink_ext_ack *extack)
 {
 	struct mvneta_port *pp = netdev_priv(dev);
 
@@ -5042,20 +5043,21 @@ static int mvneta_ethtool_set_rxfh(struct net_device *dev, const u32 *indir,
 	/* We require at least one supported parameter to be changed
 	 * and no change in any of the unsupported parameters
 	 */
-	if (key ||
-	    (hfunc != ETH_RSS_HASH_NO_CHANGE && hfunc != ETH_RSS_HASH_TOP))
+	if (rxfh->key ||
+	    (rxfh->hfunc != ETH_RSS_HASH_NO_CHANGE &&
+	     rxfh->hfunc != ETH_RSS_HASH_TOP))
 		return -EOPNOTSUPP;
 
-	if (!indir)
+	if (!rxfh->indir)
 		return 0;
 
-	memcpy(pp->indir, indir, MVNETA_RSS_LU_TABLE_SIZE);
+	memcpy(pp->indir, rxfh->indir, MVNETA_RSS_LU_TABLE_SIZE);
 
 	return mvneta_config_rss(pp);
 }
 
-static int mvneta_ethtool_get_rxfh(struct net_device *dev, u32 *indir, u8 *key,
-				   u8 *hfunc)
+static int mvneta_ethtool_get_rxfh(struct net_device *dev,
+				   struct ethtool_rxfh_param *rxfh)
 {
 	struct mvneta_port *pp = netdev_priv(dev);
 
@@ -5063,13 +5065,12 @@ static int mvneta_ethtool_get_rxfh(struct net_device *dev, u32 *indir, u8 *key,
 	if (pp->neta_armada3700)
 		return -EOPNOTSUPP;
 
-	if (hfunc)
-		*hfunc = ETH_RSS_HASH_TOP;
+	rxfh->hfunc = ETH_RSS_HASH_TOP;
 
-	if (!indir)
+	if (!rxfh->indir)
 		return 0;
 
-	memcpy(indir, pp->indir, MVNETA_RSS_LU_TABLE_SIZE);
+	memcpy(rxfh->indir, pp->indir, MVNETA_RSS_LU_TABLE_SIZE);
 
 	return 0;
 }
diff --git a/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c b/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c
index 93137606869e..ceef48ddd26e 100644
--- a/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c
+++ b/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c
@@ -5634,8 +5634,8 @@ static u32 mvpp2_ethtool_get_rxfh_indir_size(struct net_device *dev)
 	return mvpp22_rss_is_supported(port) ? MVPP22_RSS_TABLE_ENTRIES : 0;
 }
 
-static int mvpp2_ethtool_get_rxfh(struct net_device *dev, u32 *indir, u8 *key,
-				  u8 *hfunc)
+static int mvpp2_ethtool_get_rxfh(struct net_device *dev,
+				  struct ethtool_rxfh_param *rxfh)
 {
 	struct mvpp2_port *port = netdev_priv(dev);
 	int ret = 0;
@@ -5643,17 +5643,17 @@ static int mvpp2_ethtool_get_rxfh(struct net_device *dev, u32 *indir, u8 *key,
 	if (!mvpp22_rss_is_supported(port))
 		return -EOPNOTSUPP;
 
-	if (indir)
-		ret = mvpp22_port_rss_ctx_indir_get(port, 0, indir);
+	if (rxfh->indir)
+		ret = mvpp22_port_rss_ctx_indir_get(port, 0, rxfh->indir);
 
-	if (hfunc)
-		*hfunc = ETH_RSS_HASH_CRC32;
+	rxfh->hfunc = ETH_RSS_HASH_CRC32;
 
 	return ret;
 }
 
-static int mvpp2_ethtool_set_rxfh(struct net_device *dev, const u32 *indir,
-				  const u8 *key, const u8 hfunc)
+static int mvpp2_ethtool_set_rxfh(struct net_device *dev,
+				  struct ethtool_rxfh_param *rxfh,
+				  struct netlink_ext_ack *extack)
 {
 	struct mvpp2_port *port = netdev_priv(dev);
 	int ret = 0;
@@ -5661,20 +5661,22 @@ static int mvpp2_ethtool_set_rxfh(struct net_device *dev, const u32 *indir,
 	if (!mvpp22_rss_is_supported(port))
 		return -EOPNOTSUPP;
 
-	if (hfunc != ETH_RSS_HASH_NO_CHANGE && hfunc != ETH_RSS_HASH_CRC32)
+	if (rxfh->hfunc != ETH_RSS_HASH_NO_CHANGE &&
+	    rxfh->hfunc != ETH_RSS_HASH_CRC32)
 		return -EOPNOTSUPP;
 
-	if (key)
+	if (rxfh->key)
 		return -EOPNOTSUPP;
 
-	if (indir)
-		ret = mvpp22_port_rss_ctx_indir_set(port, 0, indir);
+	if (rxfh->indir)
+		ret = mvpp22_port_rss_ctx_indir_set(port, 0, rxfh->indir);
 
 	return ret;
 }
 
-static int mvpp2_ethtool_get_rxfh_context(struct net_device *dev, u32 *indir,
-					  u8 *key, u8 *hfunc, u32 rss_context)
+static int mvpp2_ethtool_get_rxfh_context(struct net_device *dev,
+					  struct ethtool_rxfh_param *rxfh,
+					  u32 rss_context)
 {
 	struct mvpp2_port *port = netdev_priv(dev);
 	int ret = 0;
@@ -5684,19 +5686,18 @@ static int mvpp2_ethtool_get_rxfh_context(struct net_device *dev, u32 *indir,
 	if (rss_context >= MVPP22_N_RSS_TABLES)
 		return -EINVAL;
 
-	if (hfunc)
-		*hfunc = ETH_RSS_HASH_CRC32;
+	rxfh->hfunc = ETH_RSS_HASH_CRC32;
 
-	if (indir)
-		ret = mvpp22_port_rss_ctx_indir_get(port, rss_context, indir);
+	if (rxfh->indir)
+		ret = mvpp22_port_rss_ctx_indir_get(port, rss_context,
+						    rxfh->indir);
 
 	return ret;
 }
 
 static int mvpp2_ethtool_set_rxfh_context(struct net_device *dev,
-					  const u32 *indir, const u8 *key,
-					  const u8 hfunc, u32 *rss_context,
-					  bool delete)
+					  struct ethtool_rxfh_param *rxfh,
+					  u32 *rss_context, bool delete)
 {
 	struct mvpp2_port *port = netdev_priv(dev);
 	int ret;
@@ -5704,10 +5705,11 @@ static int mvpp2_ethtool_set_rxfh_context(struct net_device *dev,
 	if (!mvpp22_rss_is_supported(port))
 		return -EOPNOTSUPP;
 
-	if (hfunc != ETH_RSS_HASH_NO_CHANGE && hfunc != ETH_RSS_HASH_CRC32)
+	if (rxfh->hfunc != ETH_RSS_HASH_NO_CHANGE &&
+	    rxfh->hfunc != ETH_RSS_HASH_CRC32)
 		return -EOPNOTSUPP;
 
-	if (key)
+	if (rxfh->key)
 		return -EOPNOTSUPP;
 
 	if (delete)
@@ -5719,7 +5721,7 @@ static int mvpp2_ethtool_set_rxfh_context(struct net_device *dev,
 			return ret;
 	}
 
-	return mvpp22_port_rss_ctx_indir_set(port, *rss_context, indir);
+	return mvpp22_port_rss_ctx_indir_set(port, *rss_context, rxfh->indir);
 }
 /* Device ops */
 
diff --git a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_ethtool.c b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_ethtool.c
index 53f6258a973c..a2c182d7bfc4 100644
--- a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_ethtool.c
+++ b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_ethtool.c
@@ -836,8 +836,8 @@ static int otx2_rss_ctx_create(struct otx2_nic *pfvf,
 }
 
 /* RSS context configuration */
-static int otx2_set_rxfh_context(struct net_device *dev, const u32 *indir,
-				 const u8 *hkey, const u8 hfunc,
+static int otx2_set_rxfh_context(struct net_device *dev,
+				 struct ethtool_rxfh_param *rxfh,
 				 u32 *rss_context, bool delete)
 {
 	struct otx2_nic *pfvf = netdev_priv(dev);
@@ -845,7 +845,8 @@ static int otx2_set_rxfh_context(struct net_device *dev, const u32 *indir,
 	struct otx2_rss_info *rss;
 	int ret, idx;
 
-	if (hfunc != ETH_RSS_HASH_NO_CHANGE && hfunc != ETH_RSS_HASH_TOP)
+	if (rxfh->hfunc != ETH_RSS_HASH_NO_CHANGE &&
+	    rxfh->hfunc != ETH_RSS_HASH_TOP)
 		return -EOPNOTSUPP;
 
 	if (*rss_context != ETH_RXFH_CONTEXT_ALLOC &&
@@ -859,8 +860,8 @@ static int otx2_set_rxfh_context(struct net_device *dev, const u32 *indir,
 		return -EIO;
 	}
 
-	if (hkey) {
-		memcpy(rss->key, hkey, sizeof(rss->key));
+	if (rxfh->key) {
+		memcpy(rss->key, rxfh->key, sizeof(rss->key));
 		otx2_set_rss_key(pfvf);
 	}
 	if (delete)
@@ -871,28 +872,29 @@ static int otx2_set_rxfh_context(struct net_device *dev, const u32 *indir,
 		if (ret)
 			return ret;
 	}
-	if (indir) {
+	if (rxfh->indir) {
 		rss_ctx = rss->rss_ctx[*rss_context];
 		for (idx = 0; idx < rss->rss_size; idx++)
-			rss_ctx->ind_tbl[idx] = indir[idx];
+			rss_ctx->ind_tbl[idx] = rxfh->indir[idx];
 	}
 	otx2_set_rss_table(pfvf, *rss_context);
 
 	return 0;
 }
 
-static int otx2_get_rxfh_context(struct net_device *dev, u32 *indir,
-				 u8 *hkey, u8 *hfunc, u32 rss_context)
+static int otx2_get_rxfh_context(struct net_device *dev,
+				 struct ethtool_rxfh_param *rxfh,
+				 u32 rss_context)
 {
 	struct otx2_nic *pfvf = netdev_priv(dev);
 	struct otx2_rss_ctx *rss_ctx;
 	struct otx2_rss_info *rss;
+	u32 *indir = rxfh->indir;
 	int idx, rx_queues;
 
 	rss = &pfvf->hw.rss_info;
 
-	if (hfunc)
-		*hfunc = ETH_RSS_HASH_TOP;
+	rxfh->hfunc = ETH_RSS_HASH_TOP;
 
 	if (!indir)
 		return 0;
@@ -914,28 +916,29 @@ static int otx2_get_rxfh_context(struct net_device *dev, u32 *indir,
 		for (idx = 0; idx < rss->rss_size; idx++)
 			indir[idx] = rss_ctx->ind_tbl[idx];
 	}
-	if (hkey)
-		memcpy(hkey, rss->key, sizeof(rss->key));
+	if (rxfh->key)
+		memcpy(rxfh->key, rss->key, sizeof(rss->key));
 
 	return 0;
 }
 
 /* Get RSS configuration */
-static int otx2_get_rxfh(struct net_device *dev, u32 *indir,
-			 u8 *hkey, u8 *hfunc)
+static int otx2_get_rxfh(struct net_device *dev,
+			 struct ethtool_rxfh_param *rxfh)
 {
-	return otx2_get_rxfh_context(dev, indir, hkey, hfunc,
+	return otx2_get_rxfh_context(dev, rxfh,
 				     DEFAULT_RSS_CONTEXT_GROUP);
 }
 
 /* Configure RSS table and hash key */
-static int otx2_set_rxfh(struct net_device *dev, const u32 *indir,
-			 const u8 *hkey, const u8 hfunc)
+static int otx2_set_rxfh(struct net_device *dev,
+			 struct ethtool_rxfh_param *rxfh,
+			 struct netlink_ext_ack *extack)
 {
 
 	u32 rss_context = DEFAULT_RSS_CONTEXT_GROUP;
 
-	return otx2_set_rxfh_context(dev, indir, hkey, hfunc, &rss_context, 0);
+	return otx2_set_rxfh_context(dev, rxfh, &rss_context, 0);
 }
 
 static u32 otx2_get_msglevel(struct net_device *netdev)
diff --git a/drivers/net/ethernet/mellanox/mlx4/en_ethtool.c b/drivers/net/ethernet/mellanox/mlx4/en_ethtool.c
index 164a13272faa..619e1c3ef7f9 100644
--- a/drivers/net/ethernet/mellanox/mlx4/en_ethtool.c
+++ b/drivers/net/ethernet/mellanox/mlx4/en_ethtool.c
@@ -1258,8 +1258,8 @@ static int mlx4_en_check_rxfh_func(struct net_device *dev, u8 hfunc)
 	return -EINVAL;
 }
 
-static int mlx4_en_get_rxfh(struct net_device *dev, u32 *ring_index, u8 *key,
-			    u8 *hfunc)
+static int mlx4_en_get_rxfh(struct net_device *dev,
+			    struct ethtool_rxfh_param *rxfh)
 {
 	struct mlx4_en_priv *priv = netdev_priv(dev);
 	u32 n = mlx4_en_get_rxfh_indir_size(dev);
@@ -1269,19 +1269,19 @@ static int mlx4_en_get_rxfh(struct net_device *dev, u32 *ring_index, u8 *key,
 	rss_rings = rounddown_pow_of_two(rss_rings);
 
 	for (i = 0; i < n; i++) {
-		if (!ring_index)
+		if (!rxfh->indir)
 			break;
-		ring_index[i] = i % rss_rings;
+		rxfh->indir[i] = i % rss_rings;
 	}
-	if (key)
-		memcpy(key, priv->rss_key, MLX4_EN_RSS_KEY_SIZE);
-	if (hfunc)
-		*hfunc = priv->rss_hash_fn;
+	if (rxfh->key)
+		memcpy(rxfh->key, priv->rss_key, MLX4_EN_RSS_KEY_SIZE);
+	rxfh->hfunc = priv->rss_hash_fn;
 	return 0;
 }
 
-static int mlx4_en_set_rxfh(struct net_device *dev, const u32 *ring_index,
-			    const u8 *key, const u8 hfunc)
+static int mlx4_en_set_rxfh(struct net_device *dev,
+			    struct ethtool_rxfh_param *rxfh,
+			    struct netlink_ext_ack *extack)
 {
 	struct mlx4_en_priv *priv = netdev_priv(dev);
 	u32 n = mlx4_en_get_rxfh_indir_size(dev);
@@ -1295,12 +1295,12 @@ static int mlx4_en_set_rxfh(struct net_device *dev, const u32 *ring_index,
 	 * between rings
 	 */
 	for (i = 0; i < n; i++) {
-		if (!ring_index)
+		if (!rxfh->indir)
 			break;
-		if (i > 0 && !ring_index[i] && !rss_rings)
+		if (i > 0 && !rxfh->indir[i] && !rss_rings)
 			rss_rings = i;
 
-		if (ring_index[i] != (i % (rss_rings ?: n)))
+		if (rxfh->indir[i] != (i % (rss_rings ?: n)))
 			return -EINVAL;
 	}
 
@@ -1311,8 +1311,8 @@ static int mlx4_en_set_rxfh(struct net_device *dev, const u32 *ring_index,
 	if (!is_power_of_2(rss_rings))
 		return -EINVAL;
 
-	if (hfunc != ETH_RSS_HASH_NO_CHANGE) {
-		err = mlx4_en_check_rxfh_func(dev, hfunc);
+	if (rxfh->hfunc != ETH_RSS_HASH_NO_CHANGE) {
+		err = mlx4_en_check_rxfh_func(dev, rxfh->hfunc);
 		if (err)
 			return err;
 	}
@@ -1323,12 +1323,12 @@ static int mlx4_en_set_rxfh(struct net_device *dev, const u32 *ring_index,
 		mlx4_en_stop_port(dev, 1);
 	}
 
-	if (ring_index)
+	if (rxfh->indir)
 		priv->prof->rss_rings = rss_rings;
-	if (key)
-		memcpy(priv->rss_key, key, MLX4_EN_RSS_KEY_SIZE);
-	if (hfunc !=  ETH_RSS_HASH_NO_CHANGE)
-		priv->rss_hash_fn = hfunc;
+	if (rxfh->key)
+		memcpy(priv->rss_key, rxfh->key, MLX4_EN_RSS_KEY_SIZE);
+	if (rxfh->hfunc !=  ETH_RSS_HASH_NO_CHANGE)
+		priv->rss_hash_fn = rxfh->hfunc;
 
 	if (port_up) {
 		err = mlx4_en_start_port(dev);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en.h b/drivers/net/ethernet/mellanox/mlx5/core/en.h
index 43f027bf2da3..2cb47268bac6 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en.h
@@ -1175,9 +1175,9 @@ int mlx5e_ethtool_get_link_ksettings(struct mlx5e_priv *priv,
 				     struct ethtool_link_ksettings *link_ksettings);
 int mlx5e_ethtool_set_link_ksettings(struct mlx5e_priv *priv,
 				     const struct ethtool_link_ksettings *link_ksettings);
-int mlx5e_get_rxfh(struct net_device *netdev, u32 *indir, u8 *key, u8 *hfunc);
-int mlx5e_set_rxfh(struct net_device *dev, const u32 *indir, const u8 *key,
-		   const u8 hfunc);
+int mlx5e_get_rxfh(struct net_device *dev, struct ethtool_rxfh_param *rxfh);
+int mlx5e_set_rxfh(struct net_device *dev, struct ethtool_rxfh_param *rxfh,
+		   struct netlink_ext_ack *extack);
 u32 mlx5e_ethtool_get_rxfh_key_size(struct mlx5e_priv *priv);
 u32 mlx5e_ethtool_get_rxfh_indir_size(struct mlx5e_priv *priv);
 int mlx5e_ethtool_get_ts_info(struct mlx5e_priv *priv,
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c b/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c
index 792a0ea544cd..110cb7973eae 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c
@@ -1262,23 +1262,26 @@ static u32 mlx5e_get_rxfh_indir_size(struct net_device *netdev)
 	return mlx5e_ethtool_get_rxfh_indir_size(priv);
 }
 
-static int mlx5e_get_rxfh_context(struct net_device *dev, u32 *indir,
-				  u8 *key, u8 *hfunc, u32 rss_context)
+static int mlx5e_get_rxfh_context(struct net_device *dev,
+				  struct ethtool_rxfh_param *rxfh,
+				  u32 rss_context)
 {
 	struct mlx5e_priv *priv = netdev_priv(dev);
 	int err;
 
 	mutex_lock(&priv->state_lock);
-	err = mlx5e_rx_res_rss_get_rxfh(priv->rx_res, rss_context, indir, key, hfunc);
+	err = mlx5e_rx_res_rss_get_rxfh(priv->rx_res, rss_context,
+					rxfh->indir, rxfh->key, &rxfh->hfunc);
 	mutex_unlock(&priv->state_lock);
 	return err;
 }
 
-static int mlx5e_set_rxfh_context(struct net_device *dev, const u32 *indir,
-				  const u8 *key, const u8 hfunc,
+static int mlx5e_set_rxfh_context(struct net_device *dev,
+				  struct ethtool_rxfh_param *rxfh,
 				  u32 *rss_context, bool delete)
 {
 	struct mlx5e_priv *priv = netdev_priv(dev);
+	u8 hfunc = rxfh->hfunc;
 	int err;
 
 	mutex_lock(&priv->state_lock);
@@ -1295,7 +1298,8 @@ static int mlx5e_set_rxfh_context(struct net_device *dev, const u32 *indir,
 			goto unlock;
 	}
 
-	err = mlx5e_rx_res_rss_set_rxfh(priv->rx_res, *rss_context, indir, key,
+	err = mlx5e_rx_res_rss_set_rxfh(priv->rx_res, *rss_context,
+					rxfh->indir, rxfh->key,
 					hfunc == ETH_RSS_HASH_NO_CHANGE ? NULL : &hfunc);
 
 unlock:
@@ -1303,20 +1307,20 @@ unlock:
 	return err;
 }
 
-int mlx5e_get_rxfh(struct net_device *netdev, u32 *indir, u8 *key,
-		   u8 *hfunc)
+int mlx5e_get_rxfh(struct net_device *netdev, struct ethtool_rxfh_param *rxfh)
 {
-	return mlx5e_get_rxfh_context(netdev, indir, key, hfunc, 0);
+	return mlx5e_get_rxfh_context(netdev, rxfh, 0);
 }
 
-int mlx5e_set_rxfh(struct net_device *dev, const u32 *indir,
-		   const u8 *key, const u8 hfunc)
+int mlx5e_set_rxfh(struct net_device *dev, struct ethtool_rxfh_param *rxfh,
+		   struct netlink_ext_ack *extack)
 {
 	struct mlx5e_priv *priv = netdev_priv(dev);
+	u8 hfunc = rxfh->hfunc;
 	int err;
 
 	mutex_lock(&priv->state_lock);
-	err = mlx5e_rx_res_rss_set_rxfh(priv->rx_res, 0, indir, key,
+	err = mlx5e_rx_res_rss_set_rxfh(priv->rx_res, 0, rxfh->indir, rxfh->key,
 					hfunc == ETH_RSS_HASH_NO_CHANGE ? NULL : &hfunc);
 	mutex_unlock(&priv->state_lock);
 	return err;
diff --git a/drivers/net/ethernet/microchip/lan743x_ethtool.c b/drivers/net/ethernet/microchip/lan743x_ethtool.c
index 6961cfc55fb9..8c4a2bb6a537 100644
--- a/drivers/net/ethernet/microchip/lan743x_ethtool.c
+++ b/drivers/net/ethernet/microchip/lan743x_ethtool.c
@@ -934,11 +934,11 @@ static u32 lan743x_ethtool_get_rxfh_indir_size(struct net_device *netdev)
 }
 
 static int lan743x_ethtool_get_rxfh(struct net_device *netdev,
-				    u32 *indir, u8 *key, u8 *hfunc)
+				    struct ethtool_rxfh_param *rxfh)
 {
 	struct lan743x_adapter *adapter = netdev_priv(netdev);
 
-	if (indir) {
+	if (rxfh->indir) {
 		int dw_index;
 		int byte_index = 0;
 
@@ -947,17 +947,17 @@ static int lan743x_ethtool_get_rxfh(struct net_device *netdev,
 				lan743x_csr_read(adapter, RFE_INDX(dw_index));
 
 			byte_index = dw_index << 2;
-			indir[byte_index + 0] =
+			rxfh->indir[byte_index + 0] =
 				((four_entries >> 0) & 0x000000FF);
-			indir[byte_index + 1] =
+			rxfh->indir[byte_index + 1] =
 				((four_entries >> 8) & 0x000000FF);
-			indir[byte_index + 2] =
+			rxfh->indir[byte_index + 2] =
 				((four_entries >> 16) & 0x000000FF);
-			indir[byte_index + 3] =
+			rxfh->indir[byte_index + 3] =
 				((four_entries >> 24) & 0x000000FF);
 		}
 	}
-	if (key) {
+	if (rxfh->key) {
 		int dword_index;
 		int byte_index = 0;
 
@@ -967,28 +967,30 @@ static int lan743x_ethtool_get_rxfh(struct net_device *netdev,
 						 RFE_HASH_KEY(dword_index));
 
 			byte_index = dword_index << 2;
-			key[byte_index + 0] =
+			rxfh->key[byte_index + 0] =
 				((four_entries >> 0) & 0x000000FF);
-			key[byte_index + 1] =
+			rxfh->key[byte_index + 1] =
 				((four_entries >> 8) & 0x000000FF);
-			key[byte_index + 2] =
+			rxfh->key[byte_index + 2] =
 				((four_entries >> 16) & 0x000000FF);
-			key[byte_index + 3] =
+			rxfh->key[byte_index + 3] =
 				((four_entries >> 24) & 0x000000FF);
 		}
 	}
-	if (hfunc)
-		(*hfunc) = ETH_RSS_HASH_TOP;
+	rxfh->hfunc = ETH_RSS_HASH_TOP;
 	return 0;
 }
 
 static int lan743x_ethtool_set_rxfh(struct net_device *netdev,
-				    const u32 *indir, const u8 *key,
-				    const u8 hfunc)
+				    struct ethtool_rxfh_param *rxfh,
+				    struct netlink_ext_ack *extack)
 {
 	struct lan743x_adapter *adapter = netdev_priv(netdev);
+	u32 *indir = rxfh->indir;
+	u8 *key = rxfh->key;
 
-	if (hfunc != ETH_RSS_HASH_NO_CHANGE && hfunc != ETH_RSS_HASH_TOP)
+	if (rxfh->hfunc != ETH_RSS_HASH_NO_CHANGE &&
+	    rxfh->hfunc != ETH_RSS_HASH_TOP)
 		return -EOPNOTSUPP;
 
 	if (indir) {
diff --git a/drivers/net/ethernet/microsoft/mana/mana_ethtool.c b/drivers/net/ethernet/microsoft/mana/mana_ethtool.c
index 777e65b8223d..ab2413d71f6c 100644
--- a/drivers/net/ethernet/microsoft/mana/mana_ethtool.c
+++ b/drivers/net/ethernet/microsoft/mana/mana_ethtool.c
@@ -248,28 +248,28 @@ static u32 mana_rss_indir_size(struct net_device *ndev)
 	return MANA_INDIRECT_TABLE_SIZE;
 }
 
-static int mana_get_rxfh(struct net_device *ndev, u32 *indir, u8 *key,
-			 u8 *hfunc)
+static int mana_get_rxfh(struct net_device *ndev,
+			 struct ethtool_rxfh_param *rxfh)
 {
 	struct mana_port_context *apc = netdev_priv(ndev);
 	int i;
 
-	if (hfunc)
-		*hfunc = ETH_RSS_HASH_TOP; /* Toeplitz */
+	rxfh->hfunc = ETH_RSS_HASH_TOP; /* Toeplitz */
 
-	if (indir) {
+	if (rxfh->indir) {
 		for (i = 0; i < MANA_INDIRECT_TABLE_SIZE; i++)
-			indir[i] = apc->indir_table[i];
+			rxfh->indir[i] = apc->indir_table[i];
 	}
 
-	if (key)
-		memcpy(key, apc->hashkey, MANA_HASH_KEY_SIZE);
+	if (rxfh->key)
+		memcpy(rxfh->key, apc->hashkey, MANA_HASH_KEY_SIZE);
 
 	return 0;
 }
 
-static int mana_set_rxfh(struct net_device *ndev, const u32 *indir,
-			 const u8 *key, const u8 hfunc)
+static int mana_set_rxfh(struct net_device *ndev,
+			 struct ethtool_rxfh_param *rxfh,
+			 struct netlink_ext_ack *extack)
 {
 	struct mana_port_context *apc = netdev_priv(ndev);
 	bool update_hash = false, update_table = false;
@@ -280,25 +280,26 @@ static int mana_set_rxfh(struct net_device *ndev, const u32 *indir,
 	if (!apc->port_is_up)
 		return -EOPNOTSUPP;
 
-	if (hfunc != ETH_RSS_HASH_NO_CHANGE && hfunc != ETH_RSS_HASH_TOP)
+	if (rxfh->hfunc != ETH_RSS_HASH_NO_CHANGE &&
+	    rxfh->hfunc != ETH_RSS_HASH_TOP)
 		return -EOPNOTSUPP;
 
-	if (indir) {
+	if (rxfh->indir) {
 		for (i = 0; i < MANA_INDIRECT_TABLE_SIZE; i++)
-			if (indir[i] >= apc->num_queues)
+			if (rxfh->indir[i] >= apc->num_queues)
 				return -EINVAL;
 
 		update_table = true;
 		for (i = 0; i < MANA_INDIRECT_TABLE_SIZE; i++) {
 			save_table[i] = apc->indir_table[i];
-			apc->indir_table[i] = indir[i];
+			apc->indir_table[i] = rxfh->indir[i];
 		}
 	}
 
-	if (key) {
+	if (rxfh->key) {
 		update_hash = true;
 		memcpy(save_key, apc->hashkey, MANA_HASH_KEY_SIZE);
-		memcpy(apc->hashkey, key, MANA_HASH_KEY_SIZE);
+		memcpy(apc->hashkey, rxfh->key, MANA_HASH_KEY_SIZE);
 	}
 
 	err = mana_config_rss(apc, TRI_STATE_TRUE, update_hash, update_table);
diff --git a/drivers/net/ethernet/netronome/nfp/nfp_net_ethtool.c b/drivers/net/ethernet/netronome/nfp/nfp_net_ethtool.c
index ff9f39969215..fbca8d0efd85 100644
--- a/drivers/net/ethernet/netronome/nfp/nfp_net_ethtool.c
+++ b/drivers/net/ethernet/netronome/nfp/nfp_net_ethtool.c
@@ -1794,8 +1794,8 @@ static u32 nfp_net_get_rxfh_key_size(struct net_device *netdev)
 	return nfp_net_rss_key_sz(nn);
 }
 
-static int nfp_net_get_rxfh(struct net_device *netdev, u32 *indir, u8 *key,
-			    u8 *hfunc)
+static int nfp_net_get_rxfh(struct net_device *netdev,
+			    struct ethtool_rxfh_param *rxfh)
 {
 	struct nfp_net *nn = netdev_priv(netdev);
 	int i;
@@ -1803,41 +1803,41 @@ static int nfp_net_get_rxfh(struct net_device *netdev, u32 *indir, u8 *key,
 	if (!(nn->cap & NFP_NET_CFG_CTRL_RSS_ANY))
 		return -EOPNOTSUPP;
 
-	if (indir)
+	if (rxfh->indir)
 		for (i = 0; i < ARRAY_SIZE(nn->rss_itbl); i++)
-			indir[i] = nn->rss_itbl[i];
-	if (key)
-		memcpy(key, nn->rss_key, nfp_net_rss_key_sz(nn));
-	if (hfunc) {
-		*hfunc = nn->rss_hfunc;
-		if (*hfunc >= 1 << ETH_RSS_HASH_FUNCS_COUNT)
-			*hfunc = ETH_RSS_HASH_UNKNOWN;
-	}
+			rxfh->indir[i] = nn->rss_itbl[i];
+	if (rxfh->key)
+		memcpy(rxfh->key, nn->rss_key, nfp_net_rss_key_sz(nn));
+
+	rxfh->hfunc = nn->rss_hfunc;
+	if (rxfh->hfunc >= 1 << ETH_RSS_HASH_FUNCS_COUNT)
+		rxfh->hfunc = ETH_RSS_HASH_UNKNOWN;
 
 	return 0;
 }
 
 static int nfp_net_set_rxfh(struct net_device *netdev,
-			    const u32 *indir, const u8 *key,
-			    const u8 hfunc)
+			    struct ethtool_rxfh_param *rxfh,
+			    struct netlink_ext_ack *extack)
 {
 	struct nfp_net *nn = netdev_priv(netdev);
 	int i;
 
 	if (!(nn->cap & NFP_NET_CFG_CTRL_RSS_ANY) ||
-	    !(hfunc == ETH_RSS_HASH_NO_CHANGE || hfunc == nn->rss_hfunc))
+	    !(rxfh->hfunc == ETH_RSS_HASH_NO_CHANGE ||
+	      rxfh->hfunc == nn->rss_hfunc))
 		return -EOPNOTSUPP;
 
-	if (!key && !indir)
+	if (!rxfh->key && !rxfh->indir)
 		return 0;
 
-	if (key) {
-		memcpy(nn->rss_key, key, nfp_net_rss_key_sz(nn));
+	if (rxfh->key) {
+		memcpy(nn->rss_key, rxfh->key, nfp_net_rss_key_sz(nn));
 		nfp_net_rss_write_key(nn);
 	}
-	if (indir) {
+	if (rxfh->indir) {
 		for (i = 0; i < ARRAY_SIZE(nn->rss_itbl); i++)
-			nn->rss_itbl[i] = indir[i];
+			nn->rss_itbl[i] = rxfh->indir[i];
 
 		nfp_net_rss_write_itbl(nn);
 	}
diff --git a/drivers/net/ethernet/pensando/ionic/ionic_ethtool.c b/drivers/net/ethernet/pensando/ionic/ionic_ethtool.c
index 3a6b0a9bc241..cd3c0b01402e 100644
--- a/drivers/net/ethernet/pensando/ionic/ionic_ethtool.c
+++ b/drivers/net/ethernet/pensando/ionic/ionic_ethtool.c
@@ -823,36 +823,38 @@ static u32 ionic_get_rxfh_key_size(struct net_device *netdev)
 	return IONIC_RSS_HASH_KEY_SIZE;
 }
 
-static int ionic_get_rxfh(struct net_device *netdev, u32 *indir, u8 *key,
-			  u8 *hfunc)
+static int ionic_get_rxfh(struct net_device *netdev,
+			  struct ethtool_rxfh_param *rxfh)
 {
 	struct ionic_lif *lif = netdev_priv(netdev);
 	unsigned int i, tbl_sz;
 
-	if (indir) {
+	if (rxfh->indir) {
 		tbl_sz = le16_to_cpu(lif->ionic->ident.lif.eth.rss_ind_tbl_sz);
 		for (i = 0; i < tbl_sz; i++)
-			indir[i] = lif->rss_ind_tbl[i];
+			rxfh->indir[i] = lif->rss_ind_tbl[i];
 	}
 
-	if (key)
-		memcpy(key, lif->rss_hash_key, IONIC_RSS_HASH_KEY_SIZE);
+	if (rxfh->key)
+		memcpy(rxfh->key, lif->rss_hash_key, IONIC_RSS_HASH_KEY_SIZE);
 
-	if (hfunc)
-		*hfunc = ETH_RSS_HASH_TOP;
+	rxfh->hfunc = ETH_RSS_HASH_TOP;
 
 	return 0;
 }
 
-static int ionic_set_rxfh(struct net_device *netdev, const u32 *indir,
-			  const u8 *key, const u8 hfunc)
+static int ionic_set_rxfh(struct net_device *netdev,
+			  struct ethtool_rxfh_param *rxfh,
+			  struct netlink_ext_ack *extack)
 {
 	struct ionic_lif *lif = netdev_priv(netdev);
 
-	if (hfunc != ETH_RSS_HASH_NO_CHANGE && hfunc != ETH_RSS_HASH_TOP)
+	if (rxfh->hfunc != ETH_RSS_HASH_NO_CHANGE &&
+	    rxfh->hfunc != ETH_RSS_HASH_TOP)
 		return -EOPNOTSUPP;
 
-	return ionic_lif_rss_config(lif, lif->rss_types, key, indir);
+	return ionic_lif_rss_config(lif, lif->rss_types,
+				    rxfh->key, rxfh->indir);
 }
 
 static int ionic_set_tunable(struct net_device *dev,
diff --git a/drivers/net/ethernet/qlogic/qede/qede_ethtool.c b/drivers/net/ethernet/qlogic/qede/qede_ethtool.c
index b6b849a079ed..0e240b5ab8d4 100644
--- a/drivers/net/ethernet/qlogic/qede/qede_ethtool.c
+++ b/drivers/net/ethernet/qlogic/qede/qede_ethtool.c
@@ -1370,28 +1370,29 @@ static u32 qede_get_rxfh_key_size(struct net_device *dev)
 	return sizeof(edev->rss_key);
 }
 
-static int qede_get_rxfh(struct net_device *dev, u32 *indir, u8 *key, u8 *hfunc)
+static int qede_get_rxfh(struct net_device *dev,
+			 struct ethtool_rxfh_param *rxfh)
 {
 	struct qede_dev *edev = netdev_priv(dev);
 	int i;
 
-	if (hfunc)
-		*hfunc = ETH_RSS_HASH_TOP;
+	rxfh->hfunc = ETH_RSS_HASH_TOP;
 
-	if (!indir)
+	if (!rxfh->indir)
 		return 0;
 
 	for (i = 0; i < QED_RSS_IND_TABLE_SIZE; i++)
-		indir[i] = edev->rss_ind_table[i];
+		rxfh->indir[i] = edev->rss_ind_table[i];
 
-	if (key)
-		memcpy(key, edev->rss_key, qede_get_rxfh_key_size(dev));
+	if (rxfh->key)
+		memcpy(rxfh->key, edev->rss_key, qede_get_rxfh_key_size(dev));
 
 	return 0;
 }
 
-static int qede_set_rxfh(struct net_device *dev, const u32 *indir,
-			 const u8 *key, const u8 hfunc)
+static int qede_set_rxfh(struct net_device *dev,
+			 struct ethtool_rxfh_param *rxfh,
+			 struct netlink_ext_ack *extack)
 {
 	struct qed_update_vport_params *vport_update_params;
 	struct qede_dev *edev = netdev_priv(dev);
@@ -1403,20 +1404,21 @@ static int qede_set_rxfh(struct net_device *dev, const u32 *indir,
 		return -EOPNOTSUPP;
 	}
 
-	if (hfunc != ETH_RSS_HASH_NO_CHANGE && hfunc != ETH_RSS_HASH_TOP)
+	if (rxfh->hfunc != ETH_RSS_HASH_NO_CHANGE &&
+	    rxfh->hfunc != ETH_RSS_HASH_TOP)
 		return -EOPNOTSUPP;
 
-	if (!indir && !key)
+	if (!rxfh->indir && !rxfh->key)
 		return 0;
 
-	if (indir) {
+	if (rxfh->indir) {
 		for (i = 0; i < QED_RSS_IND_TABLE_SIZE; i++)
-			edev->rss_ind_table[i] = indir[i];
+			edev->rss_ind_table[i] = rxfh->indir[i];
 		edev->rss_params_inited |= QEDE_RSS_INDIR_INITED;
 	}
 
-	if (key) {
-		memcpy(&edev->rss_key, key, qede_get_rxfh_key_size(dev));
+	if (rxfh->key) {
+		memcpy(&edev->rss_key, rxfh->key, qede_get_rxfh_key_size(dev));
 		edev->rss_params_inited |= QEDE_RSS_KEY_INITED;
 	}
 
diff --git a/drivers/net/ethernet/sfc/ethtool_common.c b/drivers/net/ethernet/sfc/ethtool_common.c
index a8cbceeb301b..e32ae9038d9e 100644
--- a/drivers/net/ethernet/sfc/ethtool_common.c
+++ b/drivers/net/ethernet/sfc/ethtool_common.c
@@ -1163,8 +1163,8 @@ u32 efx_ethtool_get_rxfh_key_size(struct net_device *net_dev)
 	return efx->type->rx_hash_key_size;
 }
 
-int efx_ethtool_get_rxfh(struct net_device *net_dev, u32 *indir, u8 *key,
-			 u8 *hfunc)
+int efx_ethtool_get_rxfh(struct net_device *net_dev,
+			 struct ethtool_rxfh_param *rxfh)
 {
 	struct efx_nic *efx = efx_netdev_priv(net_dev);
 	int rc;
@@ -1173,24 +1173,27 @@ int efx_ethtool_get_rxfh(struct net_device *net_dev, u32 *indir, u8 *key,
 	if (rc)
 		return rc;
 
-	if (hfunc)
-		*hfunc = ETH_RSS_HASH_TOP;
-	if (indir)
-		memcpy(indir, efx->rss_context.rx_indir_table,
+	rxfh->hfunc = ETH_RSS_HASH_TOP;
+	if (rxfh->indir)
+		memcpy(rxfh->indir, efx->rss_context.rx_indir_table,
 		       sizeof(efx->rss_context.rx_indir_table));
-	if (key)
-		memcpy(key, efx->rss_context.rx_hash_key,
+	if (rxfh->key)
+		memcpy(rxfh->key, efx->rss_context.rx_hash_key,
 		       efx->type->rx_hash_key_size);
 	return 0;
 }
 
-int efx_ethtool_set_rxfh(struct net_device *net_dev, const u32 *indir,
-			 const u8 *key, const u8 hfunc)
+int efx_ethtool_set_rxfh(struct net_device *net_dev,
+			 struct ethtool_rxfh_param *rxfh,
+			 struct netlink_ext_ack *extack)
 {
 	struct efx_nic *efx = efx_netdev_priv(net_dev);
+	u32 *indir = rxfh->indir;
+	u8 *key = rxfh->key;
 
 	/* Hash function is Toeplitz, cannot be changed */
-	if (hfunc != ETH_RSS_HASH_NO_CHANGE && hfunc != ETH_RSS_HASH_TOP)
+	if (rxfh->hfunc != ETH_RSS_HASH_NO_CHANGE &&
+	    rxfh->hfunc != ETH_RSS_HASH_TOP)
 		return -EOPNOTSUPP;
 	if (!indir && !key)
 		return 0;
@@ -1203,8 +1206,9 @@ int efx_ethtool_set_rxfh(struct net_device *net_dev, const u32 *indir,
 	return efx->type->rx_push_rss_config(efx, true, indir, key);
 }
 
-int efx_ethtool_get_rxfh_context(struct net_device *net_dev, u32 *indir,
-				 u8 *key, u8 *hfunc, u32 rss_context)
+int efx_ethtool_get_rxfh_context(struct net_device *net_dev,
+				 struct ethtool_rxfh_param *rxfh,
+				 u32 rss_context)
 {
 	struct efx_nic *efx = efx_netdev_priv(net_dev);
 	struct efx_rss_context *ctx;
@@ -1223,31 +1227,34 @@ int efx_ethtool_get_rxfh_context(struct net_device *net_dev, u32 *indir,
 	if (rc)
 		goto out_unlock;
 
-	if (hfunc)
-		*hfunc = ETH_RSS_HASH_TOP;
-	if (indir)
-		memcpy(indir, ctx->rx_indir_table, sizeof(ctx->rx_indir_table));
-	if (key)
-		memcpy(key, ctx->rx_hash_key, efx->type->rx_hash_key_size);
+	rxfh->hfunc = ETH_RSS_HASH_TOP;
+	if (rxfh->indir)
+		memcpy(rxfh->indir, ctx->rx_indir_table,
+		       sizeof(ctx->rx_indir_table));
+	if (rxfh->key)
+		memcpy(rxfh->key, ctx->rx_hash_key,
+		       efx->type->rx_hash_key_size);
 out_unlock:
 	mutex_unlock(&efx->rss_lock);
 	return rc;
 }
 
 int efx_ethtool_set_rxfh_context(struct net_device *net_dev,
-				 const u32 *indir, const u8 *key,
-				 const u8 hfunc, u32 *rss_context,
-				 bool delete)
+				 struct ethtool_rxfh_param *rxfh,
+				 u32 *rss_context, bool delete)
 {
 	struct efx_nic *efx = efx_netdev_priv(net_dev);
 	struct efx_rss_context *ctx;
+	u32 *indir = rxfh->indir;
 	bool allocated = false;
+	u8 *key = rxfh->key;
 	int rc;
 
 	if (!efx->type->rx_push_rss_context_config)
 		return -EOPNOTSUPP;
 	/* Hash function is Toeplitz, cannot be changed */
-	if (hfunc != ETH_RSS_HASH_NO_CHANGE && hfunc != ETH_RSS_HASH_TOP)
+	if (rxfh->hfunc != ETH_RSS_HASH_NO_CHANGE &&
+	    rxfh->hfunc != ETH_RSS_HASH_TOP)
 		return -EOPNOTSUPP;
 
 	mutex_lock(&efx->rss_lock);
diff --git a/drivers/net/ethernet/sfc/ethtool_common.h b/drivers/net/ethernet/sfc/ethtool_common.h
index 659491932101..384318ee1127 100644
--- a/drivers/net/ethernet/sfc/ethtool_common.h
+++ b/drivers/net/ethernet/sfc/ethtool_common.h
@@ -44,16 +44,17 @@ int efx_ethtool_set_rxnfc(struct net_device *net_dev,
 			  struct ethtool_rxnfc *info);
 u32 efx_ethtool_get_rxfh_indir_size(struct net_device *net_dev);
 u32 efx_ethtool_get_rxfh_key_size(struct net_device *net_dev);
-int efx_ethtool_get_rxfh(struct net_device *net_dev, u32 *indir, u8 *key,
-			 u8 *hfunc);
+int efx_ethtool_get_rxfh(struct net_device *net_dev,
+			 struct ethtool_rxfh_param *rxfh);
 int efx_ethtool_set_rxfh(struct net_device *net_dev,
-			 const u32 *indir, const u8 *key, const u8 hfunc);
-int efx_ethtool_get_rxfh_context(struct net_device *net_dev, u32 *indir,
-				 u8 *key, u8 *hfunc, u32 rss_context);
+			 struct ethtool_rxfh_param *rxfh,
+			 struct netlink_ext_ack *extack);
+int efx_ethtool_get_rxfh_context(struct net_device *net_dev,
+				 struct ethtool_rxfh_param *rxfh,
+				 u32 rss_context);
 int efx_ethtool_set_rxfh_context(struct net_device *net_dev,
-				 const u32 *indir, const u8 *key,
-				 const u8 hfunc, u32 *rss_context,
-				 bool delete);
+				 struct ethtool_rxfh_param *rxfh,
+				 u32 *rss_context, bool delete);
 int efx_ethtool_reset(struct net_device *net_dev, u32 *flags);
 int efx_ethtool_get_module_eeprom(struct net_device *net_dev,
 				  struct ethtool_eeprom *ee,
diff --git a/drivers/net/ethernet/sfc/falcon/ethtool.c b/drivers/net/ethernet/sfc/falcon/ethtool.c
index 3976a333f7e3..f4db683b80f7 100644
--- a/drivers/net/ethernet/sfc/falcon/ethtool.c
+++ b/drivers/net/ethernet/sfc/falcon/ethtool.c
@@ -1257,31 +1257,33 @@ static u32 ef4_ethtool_get_rxfh_indir_size(struct net_device *net_dev)
 		0 : ARRAY_SIZE(efx->rx_indir_table));
 }
 
-static int ef4_ethtool_get_rxfh(struct net_device *net_dev, u32 *indir, u8 *key,
-				u8 *hfunc)
+static int ef4_ethtool_get_rxfh(struct net_device *net_dev,
+				struct ethtool_rxfh_param *rxfh)
 {
 	struct ef4_nic *efx = netdev_priv(net_dev);
 
-	if (hfunc)
-		*hfunc = ETH_RSS_HASH_TOP;
-	if (indir)
-		memcpy(indir, efx->rx_indir_table, sizeof(efx->rx_indir_table));
+	rxfh->hfunc = ETH_RSS_HASH_TOP;
+	if (rxfh->indir)
+		memcpy(rxfh->indir, efx->rx_indir_table,
+		       sizeof(efx->rx_indir_table));
 	return 0;
 }
 
-static int ef4_ethtool_set_rxfh(struct net_device *net_dev, const u32 *indir,
-				const u8 *key, const u8 hfunc)
+static int ef4_ethtool_set_rxfh(struct net_device *net_dev,
+				struct ethtool_rxfh_param *rxfh,
+				struct netlink_ext_ack *extack)
 {
 	struct ef4_nic *efx = netdev_priv(net_dev);
 
 	/* We do not allow change in unsupported parameters */
-	if (key ||
-	    (hfunc != ETH_RSS_HASH_NO_CHANGE && hfunc != ETH_RSS_HASH_TOP))
+	if (rxfh->key ||
+	    (rxfh->hfunc != ETH_RSS_HASH_NO_CHANGE &&
+	     rxfh->hfunc != ETH_RSS_HASH_TOP))
 		return -EOPNOTSUPP;
-	if (!indir)
+	if (!rxfh->indir)
 		return 0;
 
-	return efx->type->rx_push_rss_config(efx, true, indir);
+	return efx->type->rx_push_rss_config(efx, true, rxfh->indir);
 }
 
 static int ef4_ethtool_get_module_eeprom(struct net_device *net_dev,
diff --git a/drivers/net/ethernet/sfc/siena/ethtool_common.c b/drivers/net/ethernet/sfc/siena/ethtool_common.c
index f590e87e5a23..20b64e3521cb 100644
--- a/drivers/net/ethernet/sfc/siena/ethtool_common.c
+++ b/drivers/net/ethernet/sfc/siena/ethtool_common.c
@@ -1164,8 +1164,8 @@ u32 efx_siena_ethtool_get_rxfh_key_size(struct net_device *net_dev)
 	return efx->type->rx_hash_key_size;
 }
 
-int efx_siena_ethtool_get_rxfh(struct net_device *net_dev, u32 *indir, u8 *key,
-			       u8 *hfunc)
+int efx_siena_ethtool_get_rxfh(struct net_device *net_dev,
+			       struct ethtool_rxfh_param *rxfh)
 {
 	struct efx_nic *efx = netdev_priv(net_dev);
 	int rc;
@@ -1174,24 +1174,27 @@ int efx_siena_ethtool_get_rxfh(struct net_device *net_dev, u32 *indir, u8 *key,
 	if (rc)
 		return rc;
 
-	if (hfunc)
-		*hfunc = ETH_RSS_HASH_TOP;
-	if (indir)
-		memcpy(indir, efx->rss_context.rx_indir_table,
+	rxfh->hfunc = ETH_RSS_HASH_TOP;
+	if (rxfh->indir)
+		memcpy(rxfh->indir, efx->rss_context.rx_indir_table,
 		       sizeof(efx->rss_context.rx_indir_table));
-	if (key)
-		memcpy(key, efx->rss_context.rx_hash_key,
+	if (rxfh->key)
+		memcpy(rxfh->key, efx->rss_context.rx_hash_key,
 		       efx->type->rx_hash_key_size);
 	return 0;
 }
 
-int efx_siena_ethtool_set_rxfh(struct net_device *net_dev, const u32 *indir,
-			       const u8 *key, const u8 hfunc)
+int efx_siena_ethtool_set_rxfh(struct net_device *net_dev,
+			       struct ethtool_rxfh_param *rxfh,
+			       struct netlink_ext_ack *extack)
 {
 	struct efx_nic *efx = netdev_priv(net_dev);
+	u32 *indir = rxfh->indir;
+	u8 *key = rxfh->key;
 
 	/* Hash function is Toeplitz, cannot be changed */
-	if (hfunc != ETH_RSS_HASH_NO_CHANGE && hfunc != ETH_RSS_HASH_TOP)
+	if (rxfh->hfunc != ETH_RSS_HASH_NO_CHANGE &&
+	    rxfh->hfunc != ETH_RSS_HASH_TOP)
 		return -EOPNOTSUPP;
 	if (!indir && !key)
 		return 0;
@@ -1204,8 +1207,9 @@ int efx_siena_ethtool_set_rxfh(struct net_device *net_dev, const u32 *indir,
 	return efx->type->rx_push_rss_config(efx, true, indir, key);
 }
 
-int efx_siena_ethtool_get_rxfh_context(struct net_device *net_dev, u32 *indir,
-				       u8 *key, u8 *hfunc, u32 rss_context)
+int efx_siena_ethtool_get_rxfh_context(struct net_device *net_dev,
+				       struct ethtool_rxfh_param *rxfh,
+				       u32 rss_context)
 {
 	struct efx_nic *efx = netdev_priv(net_dev);
 	struct efx_rss_context *ctx;
@@ -1224,31 +1228,34 @@ int efx_siena_ethtool_get_rxfh_context(struct net_device *net_dev, u32 *indir,
 	if (rc)
 		goto out_unlock;
 
-	if (hfunc)
-		*hfunc = ETH_RSS_HASH_TOP;
-	if (indir)
-		memcpy(indir, ctx->rx_indir_table, sizeof(ctx->rx_indir_table));
-	if (key)
-		memcpy(key, ctx->rx_hash_key, efx->type->rx_hash_key_size);
+	rxfh->hfunc = ETH_RSS_HASH_TOP;
+	if (rxfh->indir)
+		memcpy(rxfh->indir, ctx->rx_indir_table,
+		       sizeof(ctx->rx_indir_table));
+	if (rxfh->key)
+		memcpy(rxfh->key, ctx->rx_hash_key,
+		       efx->type->rx_hash_key_size);
 out_unlock:
 	mutex_unlock(&efx->rss_lock);
 	return rc;
 }
 
 int efx_siena_ethtool_set_rxfh_context(struct net_device *net_dev,
-				       const u32 *indir, const u8 *key,
-				       const u8 hfunc, u32 *rss_context,
-				       bool delete)
+				       struct ethtool_rxfh_param *rxfh,
+				       u32 *rss_context, bool delete)
 {
 	struct efx_nic *efx = netdev_priv(net_dev);
 	struct efx_rss_context *ctx;
+	u32 *indir = rxfh->indir;
 	bool allocated = false;
+	u8 *key = rxfh->key;
 	int rc;
 
 	if (!efx->type->rx_push_rss_context_config)
 		return -EOPNOTSUPP;
 	/* Hash function is Toeplitz, cannot be changed */
-	if (hfunc != ETH_RSS_HASH_NO_CHANGE && hfunc != ETH_RSS_HASH_TOP)
+	if (rxfh->hfunc != ETH_RSS_HASH_NO_CHANGE &&
+	    rxfh->hfunc != ETH_RSS_HASH_TOP)
 		return -EOPNOTSUPP;
 
 	mutex_lock(&efx->rss_lock);
diff --git a/drivers/net/ethernet/sfc/siena/ethtool_common.h b/drivers/net/ethernet/sfc/siena/ethtool_common.h
index 04b375dc6800..2f892f8eff70 100644
--- a/drivers/net/ethernet/sfc/siena/ethtool_common.h
+++ b/drivers/net/ethernet/sfc/siena/ethtool_common.h
@@ -41,16 +41,17 @@ int efx_siena_ethtool_set_rxnfc(struct net_device *net_dev,
 				struct ethtool_rxnfc *info);
 u32 efx_siena_ethtool_get_rxfh_indir_size(struct net_device *net_dev);
 u32 efx_siena_ethtool_get_rxfh_key_size(struct net_device *net_dev);
-int efx_siena_ethtool_get_rxfh(struct net_device *net_dev, u32 *indir, u8 *key,
-			       u8 *hfunc);
+int efx_siena_ethtool_get_rxfh(struct net_device *net_dev,
+			       struct ethtool_rxfh_param *rxfh);
 int efx_siena_ethtool_set_rxfh(struct net_device *net_dev,
-			       const u32 *indir, const u8 *key, const u8 hfunc);
-int efx_siena_ethtool_get_rxfh_context(struct net_device *net_dev, u32 *indir,
-				       u8 *key, u8 *hfunc, u32 rss_context);
+			       struct ethtool_rxfh_param *rxfh,
+			       struct netlink_ext_ack *extack);
+int efx_siena_ethtool_get_rxfh_context(struct net_device *net_dev,
+				       struct ethtool_rxfh_param *rxfh,
+				       u32 rss_context);
 int efx_siena_ethtool_set_rxfh_context(struct net_device *net_dev,
-				       const u32 *indir, const u8 *key,
-				       const u8 hfunc, u32 *rss_context,
-				       bool delete);
+				       struct ethtool_rxfh_param *rxfh,
+				       u32 *rss_context, bool delete);
 int efx_siena_ethtool_reset(struct net_device *net_dev, u32 *flags);
 int efx_siena_ethtool_get_module_eeprom(struct net_device *net_dev,
 					struct ethtool_eeprom *ee,
diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_ethtool.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_ethtool.c
index b9b80e0fceeb..8105ce47c6ad 100644
--- a/drivers/net/ethernet/stmicro/stmmac/stmmac_ethtool.c
+++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_ethtool.c
@@ -1087,41 +1087,42 @@ static u32 stmmac_get_rxfh_indir_size(struct net_device *dev)
 	return ARRAY_SIZE(priv->rss.table);
 }
 
-static int stmmac_get_rxfh(struct net_device *dev, u32 *indir, u8 *key,
-			   u8 *hfunc)
+static int stmmac_get_rxfh(struct net_device *dev,
+			   struct ethtool_rxfh_param *rxfh)
 {
 	struct stmmac_priv *priv = netdev_priv(dev);
 	int i;
 
-	if (indir) {
+	if (rxfh->indir) {
 		for (i = 0; i < ARRAY_SIZE(priv->rss.table); i++)
-			indir[i] = priv->rss.table[i];
+			rxfh->indir[i] = priv->rss.table[i];
 	}
 
-	if (key)
-		memcpy(key, priv->rss.key, sizeof(priv->rss.key));
-	if (hfunc)
-		*hfunc = ETH_RSS_HASH_TOP;
+	if (rxfh->key)
+		memcpy(rxfh->key, priv->rss.key, sizeof(priv->rss.key));
+	rxfh->hfunc = ETH_RSS_HASH_TOP;
 
 	return 0;
 }
 
-static int stmmac_set_rxfh(struct net_device *dev, const u32 *indir,
-			   const u8 *key, const u8 hfunc)
+static int stmmac_set_rxfh(struct net_device *dev,
+			   struct ethtool_rxfh_param *rxfh,
+			   struct netlink_ext_ack *extack)
 {
 	struct stmmac_priv *priv = netdev_priv(dev);
 	int i;
 
-	if ((hfunc != ETH_RSS_HASH_NO_CHANGE) && (hfunc != ETH_RSS_HASH_TOP))
+	if (rxfh->hfunc != ETH_RSS_HASH_NO_CHANGE &&
+	    rxfh->hfunc != ETH_RSS_HASH_TOP)
 		return -EOPNOTSUPP;
 
-	if (indir) {
+	if (rxfh->indir) {
 		for (i = 0; i < ARRAY_SIZE(priv->rss.table); i++)
-			priv->rss.table[i] = indir[i];
+			priv->rss.table[i] = rxfh->indir[i];
 	}
 
-	if (key)
-		memcpy(priv->rss.key, key, sizeof(priv->rss.key));
+	if (rxfh->key)
+		memcpy(priv->rss.key, rxfh->key, sizeof(priv->rss.key));
 
 	return stmmac_rss_configure(priv, priv->hw, &priv->rss,
 				    priv->plat->rx_queues_to_use);
diff --git a/drivers/net/hyperv/netvsc_drv.c b/drivers/net/hyperv/netvsc_drv.c
index 1e2fa0ec2327..4406427d4617 100644
--- a/drivers/net/hyperv/netvsc_drv.c
+++ b/drivers/net/hyperv/netvsc_drv.c
@@ -1752,8 +1752,8 @@ static u32 netvsc_rss_indir_size(struct net_device *dev)
 	return ndc->rx_table_sz;
 }
 
-static int netvsc_get_rxfh(struct net_device *dev, u32 *indir, u8 *key,
-			   u8 *hfunc)
+static int netvsc_get_rxfh(struct net_device *dev,
+			   struct ethtool_rxfh_param *rxfh)
 {
 	struct net_device_context *ndc = netdev_priv(dev);
 	struct netvsc_device *ndev = rtnl_dereference(ndc->nvdev);
@@ -1763,47 +1763,49 @@ static int netvsc_get_rxfh(struct net_device *dev, u32 *indir, u8 *key,
 	if (!ndev)
 		return -ENODEV;
 
-	if (hfunc)
-		*hfunc = ETH_RSS_HASH_TOP;	/* Toeplitz */
+	rxfh->hfunc = ETH_RSS_HASH_TOP;	/* Toeplitz */
 
 	rndis_dev = ndev->extension;
-	if (indir) {
+	if (rxfh->indir) {
 		for (i = 0; i < ndc->rx_table_sz; i++)
-			indir[i] = ndc->rx_table[i];
+			rxfh->indir[i] = ndc->rx_table[i];
 	}
 
-	if (key)
-		memcpy(key, rndis_dev->rss_key, NETVSC_HASH_KEYLEN);
+	if (rxfh->key)
+		memcpy(rxfh->key, rndis_dev->rss_key, NETVSC_HASH_KEYLEN);
 
 	return 0;
 }
 
-static int netvsc_set_rxfh(struct net_device *dev, const u32 *indir,
-			   const u8 *key, const u8 hfunc)
+static int netvsc_set_rxfh(struct net_device *dev,
+			   struct ethtool_rxfh_param *rxfh,
+			   struct netlink_ext_ack *extack)
 {
 	struct net_device_context *ndc = netdev_priv(dev);
 	struct netvsc_device *ndev = rtnl_dereference(ndc->nvdev);
 	struct rndis_device *rndis_dev;
+	u8 *key = rxfh->key;
 	int i;
 
 	if (!ndev)
 		return -ENODEV;
 
-	if (hfunc != ETH_RSS_HASH_NO_CHANGE && hfunc != ETH_RSS_HASH_TOP)
+	if (rxfh->hfunc != ETH_RSS_HASH_NO_CHANGE &&
+	    rxfh->hfunc != ETH_RSS_HASH_TOP)
 		return -EOPNOTSUPP;
 
 	rndis_dev = ndev->extension;
-	if (indir) {
+	if (rxfh->indir) {
 		for (i = 0; i < ndc->rx_table_sz; i++)
-			if (indir[i] >= ndev->num_chn)
+			if (rxfh->indir[i] >= ndev->num_chn)
 				return -EINVAL;
 
 		for (i = 0; i < ndc->rx_table_sz; i++)
-			ndc->rx_table[i] = indir[i];
+			ndc->rx_table[i] = rxfh->indir[i];
 	}
 
 	if (!key) {
-		if (!indir)
+		if (!rxfh->indir)
 			return 0;
 
 		key = rndis_dev->rss_key;
diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
index 10614e9f7cad..25cf44ce95dd 100644
--- a/drivers/net/virtio_net.c
+++ b/drivers/net/virtio_net.c
@@ -3731,39 +3731,42 @@ static u32 virtnet_get_rxfh_indir_size(struct net_device *dev)
 	return ((struct virtnet_info *)netdev_priv(dev))->rss_indir_table_size;
 }
 
-static int virtnet_get_rxfh(struct net_device *dev, u32 *indir, u8 *key, u8 *hfunc)
+static int virtnet_get_rxfh(struct net_device *dev,
+			    struct ethtool_rxfh_param *rxfh)
 {
 	struct virtnet_info *vi = netdev_priv(dev);
 	int i;
 
-	if (indir) {
+	if (rxfh->indir) {
 		for (i = 0; i < vi->rss_indir_table_size; ++i)
-			indir[i] = vi->ctrl->rss.indirection_table[i];
+			rxfh->indir[i] = vi->ctrl->rss.indirection_table[i];
 	}
 
-	if (key)
-		memcpy(key, vi->ctrl->rss.key, vi->rss_key_size);
+	if (rxfh->key)
+		memcpy(rxfh->key, vi->ctrl->rss.key, vi->rss_key_size);
 
-	if (hfunc)
-		*hfunc = ETH_RSS_HASH_TOP;
+	rxfh->hfunc = ETH_RSS_HASH_TOP;
 
 	return 0;
 }
 
-static int virtnet_set_rxfh(struct net_device *dev, const u32 *indir, const u8 *key, const u8 hfunc)
+static int virtnet_set_rxfh(struct net_device *dev,
+			    struct ethtool_rxfh_param *rxfh,
+			    struct netlink_ext_ack *extack)
 {
 	struct virtnet_info *vi = netdev_priv(dev);
 	int i;
 
-	if (hfunc != ETH_RSS_HASH_NO_CHANGE && hfunc != ETH_RSS_HASH_TOP)
+	if (rxfh->hfunc != ETH_RSS_HASH_NO_CHANGE &&
+	    rxfh->hfunc != ETH_RSS_HASH_TOP)
 		return -EOPNOTSUPP;
 
-	if (indir) {
+	if (rxfh->indir) {
 		for (i = 0; i < vi->rss_indir_table_size; ++i)
-			vi->ctrl->rss.indirection_table[i] = indir[i];
+			vi->ctrl->rss.indirection_table[i] = rxfh->indir[i];
 	}
-	if (key)
-		memcpy(vi->ctrl->rss.key, key, vi->rss_key_size);
+	if (rxfh->key)
+		memcpy(vi->ctrl->rss.key, rxfh->key, vi->rss_key_size);
 
 	virtnet_commit_rss_command(vi);
 
diff --git a/drivers/net/vmxnet3/vmxnet3_ethtool.c b/drivers/net/vmxnet3/vmxnet3_ethtool.c
index 8f5f202cde39..7e8008d5378a 100644
--- a/drivers/net/vmxnet3/vmxnet3_ethtool.c
+++ b/drivers/net/vmxnet3/vmxnet3_ethtool.c
@@ -1136,27 +1136,26 @@ vmxnet3_get_rss_indir_size(struct net_device *netdev)
 }
 
 static int
-vmxnet3_get_rss(struct net_device *netdev, u32 *p, u8 *key, u8 *hfunc)
+vmxnet3_get_rss(struct net_device *netdev, struct ethtool_rxfh_param *rxfh)
 {
 	struct vmxnet3_adapter *adapter = netdev_priv(netdev);
 	struct UPT1_RSSConf *rssConf = adapter->rss_conf;
 	unsigned int n = rssConf->indTableSize;
 
-	if (hfunc)
-		*hfunc = ETH_RSS_HASH_TOP;
-	if (!p)
+	rxfh->hfunc = ETH_RSS_HASH_TOP;
+	if (!rxfh->indir)
 		return 0;
 	if (n > UPT1_RSS_MAX_IND_TABLE_SIZE)
 		return 0;
 	while (n--)
-		p[n] = rssConf->indTable[n];
+		rxfh->indir[n] = rssConf->indTable[n];
 	return 0;
 
 }
 
 static int
-vmxnet3_set_rss(struct net_device *netdev, const u32 *p, const u8 *key,
-		const u8 hfunc)
+vmxnet3_set_rss(struct net_device *netdev, struct ethtool_rxfh_param *rxfh,
+		struct netlink_ext_ack *extack)
 {
 	unsigned int i;
 	unsigned long flags;
@@ -1164,13 +1163,14 @@ vmxnet3_set_rss(struct net_device *netdev, const u32 *p, const u8 *key,
 	struct UPT1_RSSConf *rssConf = adapter->rss_conf;
 
 	/* We do not allow change in unsupported parameters */
-	if (key ||
-	    (hfunc != ETH_RSS_HASH_NO_CHANGE && hfunc != ETH_RSS_HASH_TOP))
+	if (rxfh->key ||
+	    (rxfh->hfunc != ETH_RSS_HASH_NO_CHANGE &&
+	     rxfh->hfunc != ETH_RSS_HASH_TOP))
 		return -EOPNOTSUPP;
-	if (!p)
+	if (!rxfh->indir)
 		return 0;
 	for (i = 0; i < rssConf->indTableSize; i++)
-		rssConf->indTable[i] = p[i];
+		rssConf->indTable[i] = rxfh->indir[i];
 
 	spin_lock_irqsave(&adapter->cmd_lock, flags);
 	VMXNET3_WRITE_BAR1_REG(adapter, VMXNET3_REG_CMD,
diff --git a/include/linux/ethtool.h b/include/linux/ethtool.h
index 67b30940234b..3ab2b6a90419 100644
--- a/include/linux/ethtool.h
+++ b/include/linux/ethtool.h
@@ -596,6 +596,28 @@ struct ethtool_mm_stats {
 	u64 MACMergeHoldCount;
 };
 
+/**
+ * struct ethtool_rxfh_param - RXFH (RSS) parameters
+ * @hfunc: Defines the current RSS hash function used by HW (or to be set to).
+ *	Valid values are one of the %ETH_RSS_HASH_*.
+ * @indir_size: On SET, the array size of the user buffer for the
+ *	indirection table, which may be zero, or
+ *	%ETH_RXFH_INDIR_NO_CHANGE.  On GET (read from the driver),
+ *	the array size of the hardware indirection table.
+ * @indir: The indirection table of size @indir_size entries.
+ * @key_size: On SET, the array size of the user buffer for the hash key,
+ *	which may be zero.  On GET (read from the driver), the size of the
+ *	hardware hash key.
+ * @key: The hash key of size @key_size bytes.
+ */
+struct ethtool_rxfh_param {
+	u8	hfunc;
+	u32	indir_size;
+	u32	*indir;
+	u32	key_size;
+	u8	*key;
+};
+
 /**
  * struct ethtool_ops - optional netdev operations
  * @cap_link_lanes_supported: indicates if the driver supports lanes
@@ -846,14 +868,14 @@ struct ethtool_ops {
 	int	(*reset)(struct net_device *, u32 *);
 	u32	(*get_rxfh_key_size)(struct net_device *);
 	u32	(*get_rxfh_indir_size)(struct net_device *);
-	int	(*get_rxfh)(struct net_device *, u32 *indir, u8 *key,
-			    u8 *hfunc);
-	int	(*set_rxfh)(struct net_device *, const u32 *indir,
-			    const u8 *key, const u8 hfunc);
-	int	(*get_rxfh_context)(struct net_device *, u32 *indir, u8 *key,
-				    u8 *hfunc, u32 rss_context);
-	int	(*set_rxfh_context)(struct net_device *, const u32 *indir,
-				    const u8 *key, const u8 hfunc,
+	int	(*get_rxfh)(struct net_device *, struct ethtool_rxfh_param *);
+	int	(*set_rxfh)(struct net_device *, struct ethtool_rxfh_param *,
+			    struct netlink_ext_ack *extack);
+	int	(*get_rxfh_context)(struct net_device *,
+				    struct ethtool_rxfh_param *,
+				    u32 rss_context);
+	int	(*set_rxfh_context)(struct net_device *,
+				    struct ethtool_rxfh_param *,
 				    u32 *rss_context, bool delete);
 	void	(*get_channels)(struct net_device *, struct ethtool_channels *);
 	int	(*set_channels)(struct net_device *, struct ethtool_channels *);
diff --git a/net/ethtool/common.c b/net/ethtool/common.c
index 11d8797f63f6..6b2a360dcdf0 100644
--- a/net/ethtool/common.c
+++ b/net/ethtool/common.c
@@ -589,8 +589,8 @@ err_free_info:
 
 int ethtool_get_max_rxfh_channel(struct net_device *dev, u32 *max)
 {
+	struct ethtool_rxfh_param rxfh = {};
 	u32 dev_size, current_max = 0;
-	u32 *indir;
 	int ret;
 
 	if (!dev->ethtool_ops->get_rxfh_indir_size ||
@@ -600,21 +600,21 @@ int ethtool_get_max_rxfh_channel(struct net_device *dev, u32 *max)
 	if (dev_size == 0)
 		return -EOPNOTSUPP;
 
-	indir = kcalloc(dev_size, sizeof(indir[0]), GFP_USER);
-	if (!indir)
+	rxfh.indir = kcalloc(dev_size, sizeof(rxfh.indir[0]), GFP_USER);
+	if (!rxfh.indir)
 		return -ENOMEM;
 
-	ret = dev->ethtool_ops->get_rxfh(dev, indir, NULL, NULL);
+	ret = dev->ethtool_ops->get_rxfh(dev, &rxfh);
 	if (ret)
 		goto out;
 
 	while (dev_size--)
-		current_max = max(current_max, indir[dev_size]);
+		current_max = max(current_max, rxfh.indir[dev_size]);
 
 	*max = current_max;
 
 out:
-	kfree(indir);
+	kfree(rxfh.indir);
 	return ret;
 }
 
diff --git a/net/ethtool/ioctl.c b/net/ethtool/ioctl.c
index 755a4be01bef..5f49a105fc73 100644
--- a/net/ethtool/ioctl.c
+++ b/net/ethtool/ioctl.c
@@ -1061,15 +1061,15 @@ EXPORT_SYMBOL(netdev_rss_key_fill);
 static noinline_for_stack int ethtool_get_rxfh_indir(struct net_device *dev,
 						     void __user *useraddr)
 {
-	u32 user_size, dev_size;
-	u32 *indir;
+	struct ethtool_rxfh_param rxfh = {};
+	u32 user_size;
 	int ret;
 
 	if (!dev->ethtool_ops->get_rxfh_indir_size ||
 	    !dev->ethtool_ops->get_rxfh)
 		return -EOPNOTSUPP;
-	dev_size = dev->ethtool_ops->get_rxfh_indir_size(dev);
-	if (dev_size == 0)
+	rxfh.indir_size = dev->ethtool_ops->get_rxfh_indir_size(dev);
+	if (rxfh.indir_size == 0)
 		return -EOPNOTSUPP;
 
 	if (copy_from_user(&user_size,
@@ -1078,41 +1078,41 @@ static noinline_for_stack int ethtool_get_rxfh_indir(struct net_device *dev,
 		return -EFAULT;
 
 	if (copy_to_user(useraddr + offsetof(struct ethtool_rxfh_indir, size),
-			 &dev_size, sizeof(dev_size)))
+			 &rxfh.indir_size, sizeof(rxfh.indir_size)))
 		return -EFAULT;
 
 	/* If the user buffer size is 0, this is just a query for the
 	 * device table size.  Otherwise, if it's smaller than the
 	 * device table size it's an error.
 	 */
-	if (user_size < dev_size)
+	if (user_size < rxfh.indir_size)
 		return user_size == 0 ? 0 : -EINVAL;
 
-	indir = kcalloc(dev_size, sizeof(indir[0]), GFP_USER);
-	if (!indir)
+	rxfh.indir = kcalloc(rxfh.indir_size, sizeof(rxfh.indir[0]), GFP_USER);
+	if (!rxfh.indir)
 		return -ENOMEM;
 
-	ret = dev->ethtool_ops->get_rxfh(dev, indir, NULL, NULL);
+	ret = dev->ethtool_ops->get_rxfh(dev, &rxfh);
 	if (ret)
 		goto out;
-
 	if (copy_to_user(useraddr +
 			 offsetof(struct ethtool_rxfh_indir, ring_index[0]),
-			 indir, dev_size * sizeof(indir[0])))
+			 rxfh.indir, rxfh.indir_size * sizeof(*rxfh.indir)))
 		ret = -EFAULT;
 
 out:
-	kfree(indir);
+	kfree(rxfh.indir);
 	return ret;
 }
 
 static noinline_for_stack int ethtool_set_rxfh_indir(struct net_device *dev,
 						     void __user *useraddr)
 {
-	struct ethtool_rxnfc rx_rings;
-	u32 user_size, dev_size, i;
-	u32 *indir;
 	const struct ethtool_ops *ops = dev->ethtool_ops;
+	struct ethtool_rxfh_param rxfh_dev = {};
+	struct netlink_ext_ack *extack = NULL;
+	struct ethtool_rxnfc rx_rings;
+	u32 user_size, i;
 	int ret;
 	u32 ringidx_offset = offsetof(struct ethtool_rxfh_indir, ring_index[0]);
 
@@ -1120,8 +1120,8 @@ static noinline_for_stack int ethtool_set_rxfh_indir(struct net_device *dev,
 	    !ops->get_rxnfc)
 		return -EOPNOTSUPP;
 
-	dev_size = ops->get_rxfh_indir_size(dev);
-	if (dev_size == 0)
+	rxfh_dev.indir_size = ops->get_rxfh_indir_size(dev);
+	if (rxfh_dev.indir_size == 0)
 		return -EOPNOTSUPP;
 
 	if (copy_from_user(&user_size,
@@ -1129,11 +1129,12 @@ static noinline_for_stack int ethtool_set_rxfh_indir(struct net_device *dev,
 			   sizeof(user_size)))
 		return -EFAULT;
 
-	if (user_size != 0 && user_size != dev_size)
+	if (user_size != 0 && user_size != rxfh_dev.indir_size)
 		return -EINVAL;
 
-	indir = kcalloc(dev_size, sizeof(indir[0]), GFP_USER);
-	if (!indir)
+	rxfh_dev.indir = kcalloc(rxfh_dev.indir_size,
+				 sizeof(rxfh_dev.indir[0]), GFP_USER);
+	if (!rxfh_dev.indir)
 		return -ENOMEM;
 
 	rx_rings.cmd = ETHTOOL_GRXRINGS;
@@ -1142,18 +1143,21 @@ static noinline_for_stack int ethtool_set_rxfh_indir(struct net_device *dev,
 		goto out;
 
 	if (user_size == 0) {
-		for (i = 0; i < dev_size; i++)
+		u32 *indir = rxfh_dev.indir;
+
+		for (i = 0; i < rxfh_dev.indir_size; i++)
 			indir[i] = ethtool_rxfh_indir_default(i, rx_rings.data);
 	} else {
-		ret = ethtool_copy_validate_indir(indir,
+		ret = ethtool_copy_validate_indir(rxfh_dev.indir,
 						  useraddr + ringidx_offset,
 						  &rx_rings,
-						  dev_size);
+						  rxfh_dev.indir_size);
 		if (ret)
 			goto out;
 	}
 
-	ret = ops->set_rxfh(dev, indir, NULL, ETH_RSS_HASH_NO_CHANGE);
+	rxfh_dev.hfunc = ETH_RSS_HASH_NO_CHANGE;
+	ret = ops->set_rxfh(dev, &rxfh_dev, extack);
 	if (ret)
 		goto out;
 
@@ -1164,32 +1168,29 @@ static noinline_for_stack int ethtool_set_rxfh_indir(struct net_device *dev,
 		dev->priv_flags |= IFF_RXFH_CONFIGURED;
 
 out:
-	kfree(indir);
+	kfree(rxfh_dev.indir);
 	return ret;
 }
 
 static noinline_for_stack int ethtool_get_rxfh(struct net_device *dev,
 					       void __user *useraddr)
 {
-	int ret;
 	const struct ethtool_ops *ops = dev->ethtool_ops;
+	struct ethtool_rxfh_param rxfh_dev = {};
 	u32 user_indir_size, user_key_size;
-	u32 dev_indir_size = 0, dev_key_size = 0;
 	struct ethtool_rxfh rxfh;
-	u32 total_size;
 	u32 indir_bytes;
-	u32 *indir = NULL;
-	u8 dev_hfunc = 0;
-	u8 *hkey = NULL;
 	u8 *rss_config;
+	u32 total_size;
+	int ret;
 
 	if (!ops->get_rxfh)
 		return -EOPNOTSUPP;
 
 	if (ops->get_rxfh_indir_size)
-		dev_indir_size = ops->get_rxfh_indir_size(dev);
+		rxfh_dev.indir_size = ops->get_rxfh_indir_size(dev);
 	if (ops->get_rxfh_key_size)
-		dev_key_size = ops->get_rxfh_key_size(dev);
+		rxfh_dev.key_size = ops->get_rxfh_key_size(dev);
 
 	if (copy_from_user(&rxfh, useraddr, sizeof(rxfh)))
 		return -EFAULT;
@@ -1203,38 +1204,37 @@ static noinline_for_stack int ethtool_get_rxfh(struct net_device *dev,
 	if (rxfh.rss_context && !ops->get_rxfh_context)
 		return -EOPNOTSUPP;
 
-	rxfh.indir_size = dev_indir_size;
-	rxfh.key_size = dev_key_size;
+	rxfh.indir_size = rxfh_dev.indir_size;
+	rxfh.key_size = rxfh_dev.key_size;
 	if (copy_to_user(useraddr, &rxfh, sizeof(rxfh)))
 		return -EFAULT;
 
-	if ((user_indir_size && (user_indir_size != dev_indir_size)) ||
-	    (user_key_size && (user_key_size != dev_key_size)))
+	if ((user_indir_size && user_indir_size != rxfh_dev.indir_size) ||
+	    (user_key_size && user_key_size != rxfh_dev.key_size))
 		return -EINVAL;
 
-	indir_bytes = user_indir_size * sizeof(indir[0]);
+	indir_bytes = user_indir_size * sizeof(rxfh_dev.indir[0]);
 	total_size = indir_bytes + user_key_size;
 	rss_config = kzalloc(total_size, GFP_USER);
 	if (!rss_config)
 		return -ENOMEM;
 
 	if (user_indir_size)
-		indir = (u32 *)rss_config;
+		rxfh_dev.indir = (u32 *)rss_config;
 
 	if (user_key_size)
-		hkey = rss_config + indir_bytes;
+		rxfh_dev.key = rss_config + indir_bytes;
 
 	if (rxfh.rss_context)
-		ret = dev->ethtool_ops->get_rxfh_context(dev, indir, hkey,
-							 &dev_hfunc,
+		ret = dev->ethtool_ops->get_rxfh_context(dev, &rxfh_dev,
 							 rxfh.rss_context);
 	else
-		ret = dev->ethtool_ops->get_rxfh(dev, indir, hkey, &dev_hfunc);
+		ret = dev->ethtool_ops->get_rxfh(dev, &rxfh_dev);
 	if (ret)
 		goto out;
 
 	if (copy_to_user(useraddr + offsetof(struct ethtool_rxfh, hfunc),
-			 &dev_hfunc, sizeof(rxfh.hfunc))) {
+			 &rxfh_dev.hfunc, sizeof(rxfh.hfunc))) {
 		ret = -EFAULT;
 	} else if (copy_to_user(useraddr +
 			      offsetof(struct ethtool_rxfh, rss_config[0]),
@@ -1250,16 +1250,17 @@ out:
 static noinline_for_stack int ethtool_set_rxfh(struct net_device *dev,
 					       void __user *useraddr)
 {
-	int ret;
+	u32 rss_cfg_offset = offsetof(struct ethtool_rxfh, rss_config[0]);
 	const struct ethtool_ops *ops = dev->ethtool_ops;
+	u32 dev_indir_size = 0, dev_key_size = 0, i;
+	struct ethtool_rxfh_param rxfh_dev = {};
+	struct netlink_ext_ack *extack = NULL;
 	struct ethtool_rxnfc rx_rings;
 	struct ethtool_rxfh rxfh;
-	u32 dev_indir_size = 0, dev_key_size = 0, i;
-	u32 *indir = NULL, indir_bytes = 0;
-	u8 *hkey = NULL;
-	u8 *rss_config;
-	u32 rss_cfg_offset = offsetof(struct ethtool_rxfh, rss_config[0]);
 	bool delete = false;
+	u32 indir_bytes = 0;
+	u8 *rss_config;
+	int ret;
 
 	if (!ops->get_rxnfc || !ops->set_rxfh)
 		return -EOPNOTSUPP;
@@ -1291,7 +1292,7 @@ static noinline_for_stack int ethtool_set_rxfh(struct net_device *dev,
 		return -EINVAL;
 
 	if (rxfh.indir_size != ETH_RXFH_INDIR_NO_CHANGE)
-		indir_bytes = dev_indir_size * sizeof(indir[0]);
+		indir_bytes = dev_indir_size * sizeof(rxfh_dev.indir[0]);
 
 	rss_config = kzalloc(indir_bytes + rxfh.key_size, GFP_USER);
 	if (!rss_config)
@@ -1308,8 +1309,9 @@ static noinline_for_stack int ethtool_set_rxfh(struct net_device *dev,
 	 */
 	if (rxfh.indir_size &&
 	    rxfh.indir_size != ETH_RXFH_INDIR_NO_CHANGE) {
-		indir = (u32 *)rss_config;
-		ret = ethtool_copy_validate_indir(indir,
+		rxfh_dev.indir = (u32 *)rss_config;
+		rxfh_dev.indir_size = dev_indir_size;
+		ret = ethtool_copy_validate_indir(rxfh_dev.indir,
 						  useraddr + rss_cfg_offset,
 						  &rx_rings,
 						  rxfh.indir_size);
@@ -1317,7 +1319,11 @@ static noinline_for_stack int ethtool_set_rxfh(struct net_device *dev,
 			goto out;
 	} else if (rxfh.indir_size == 0) {
 		if (rxfh.rss_context == 0) {
-			indir = (u32 *)rss_config;
+			u32 *indir;
+
+			rxfh_dev.indir = (u32 *)rss_config;
+			rxfh_dev.indir_size = dev_indir_size;
+			indir = rxfh_dev.indir;
 			for (i = 0; i < dev_indir_size; i++)
 				indir[i] = ethtool_rxfh_indir_default(i, rx_rings.data);
 		} else {
@@ -1326,8 +1332,9 @@ static noinline_for_stack int ethtool_set_rxfh(struct net_device *dev,
 	}
 
 	if (rxfh.key_size) {
-		hkey = rss_config + indir_bytes;
-		if (copy_from_user(hkey,
+		rxfh_dev.key_size = dev_key_size;
+		rxfh_dev.key = rss_config + indir_bytes;
+		if (copy_from_user(rxfh_dev.key,
 				   useraddr + rss_cfg_offset + indir_bytes,
 				   rxfh.key_size)) {
 			ret = -EFAULT;
@@ -1335,11 +1342,14 @@ static noinline_for_stack int ethtool_set_rxfh(struct net_device *dev,
 		}
 	}
 
+	rxfh_dev.hfunc = rxfh.hfunc;
+
 	if (rxfh.rss_context)
-		ret = ops->set_rxfh_context(dev, indir, hkey, rxfh.hfunc,
+		ret = ops->set_rxfh_context(dev, &rxfh_dev,
 					    &rxfh.rss_context, delete);
 	else
-		ret = ops->set_rxfh(dev, indir, hkey, rxfh.hfunc);
+		ret = ops->set_rxfh(dev, &rxfh_dev, extack);
+
 	if (ret)
 		goto out;
 
diff --git a/net/ethtool/rss.c b/net/ethtool/rss.c
index 5764202e6cb6..56fae7d5c0f7 100644
--- a/net/ethtool/rss.c
+++ b/net/ethtool/rss.c
@@ -48,9 +48,9 @@ rss_prepare_data(const struct ethnl_req_info *req_base,
 	struct rss_reply_data *data = RSS_REPDATA(reply_base);
 	struct rss_req_info *request = RSS_REQINFO(req_base);
 	struct net_device *dev = reply_base->dev;
+	struct ethtool_rxfh_param rxfh = {};
 	const struct ethtool_ops *ops;
 	u32 total_size, indir_bytes;
-	u8 dev_hfunc = 0;
 	u8 *rss_config;
 	int ret;
 
@@ -83,21 +83,23 @@ rss_prepare_data(const struct ethnl_req_info *req_base,
 
 	if (data->indir_size)
 		data->indir_table = (u32 *)rss_config;
-
 	if (data->hkey_size)
 		data->hkey = rss_config + indir_bytes;
 
+	rxfh.indir_size = data->indir_size;
+	rxfh.indir = data->indir_table;
+	rxfh.key_size = data->hkey_size;
+	rxfh.key = data->hkey;
+
 	if (request->rss_context)
-		ret = ops->get_rxfh_context(dev, data->indir_table, data->hkey,
-					    &dev_hfunc, request->rss_context);
+		ret = ops->get_rxfh_context(dev, &rxfh, request->rss_context);
 	else
-		ret = ops->get_rxfh(dev, data->indir_table, data->hkey,
-				    &dev_hfunc);
+		ret = ops->get_rxfh(dev, &rxfh);
 
 	if (ret)
 		goto out_ops;
 
-	data->hfunc = dev_hfunc;
+	data->hfunc = rxfh.hfunc;
 out_ops:
 	ethnl_ops_complete(dev);
 	return ret;
-- 
cgit v1.2.3


From dcd8dbf9e734eb334113ea43186c1c26e9f497bb Mon Sep 17 00:00:00 2001
From: Ahmed Zaki <ahmed.zaki@intel.com>
Date: Tue, 12 Dec 2023 17:33:15 -0700
Subject: net: ethtool: get rid of get/set_rxfh_context functions

Add the RSS context parameters to struct ethtool_rxfh_param and use the
get/set_rxfh to handle the RSS contexts as well.

This is part 2/2 of the fix suggested in [1]:

 - Add a rss_context member to the argument struct and a capability
   like cap_link_lanes_supported to indicate whether driver supports
   rss contexts, then you can remove *et_rxfh_context functions,
   and instead call *et_rxfh() with a non-zero rss_context.

Link: https://lore.kernel.org/netdev/20231121152906.2dd5f487@kernel.org/ [1]
CC: Jesse Brandeburg <jesse.brandeburg@intel.com>
CC: Tony Nguyen <anthony.l.nguyen@intel.com>
CC: Marcin Wojtas <mw@semihalf.com>
CC: Russell King <linux@armlinux.org.uk>
CC: Sunil Goutham <sgoutham@marvell.com>
CC: Geetha sowjanya <gakula@marvell.com>
CC: Subbaraya Sundeep <sbhatta@marvell.com>
CC: hariprasad <hkelam@marvell.com>
CC: Saeed Mahameed <saeedm@nvidia.com>
CC: Leon Romanovsky <leon@kernel.org>
CC: Edward Cree <ecree.xilinx@gmail.com>
CC: Martin Habets <habetsm.xilinx@gmail.com>
Suggested-by: Jakub Kicinski <kuba@kernel.org>
Signed-off-by: Ahmed Zaki <ahmed.zaki@intel.com>
Link: https://lore.kernel.org/r/20231213003321.605376-3-ahmed.zaki@intel.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 drivers/net/ethernet/intel/ice/ice_ethtool.c       |  29 +++---
 drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c    |  63 +++--------
 .../ethernet/marvell/octeontx2/nic/otx2_ethtool.c  |  63 +++++------
 .../net/ethernet/mellanox/mlx5/core/en_ethtool.c   |  37 ++-----
 drivers/net/ethernet/sfc/ef100_ethtool.c           |   3 +-
 drivers/net/ethernet/sfc/ethtool.c                 |   3 +-
 drivers/net/ethernet/sfc/ethtool_common.c          | 115 +++++++++++----------
 drivers/net/ethernet/sfc/ethtool_common.h          |   6 --
 drivers/net/ethernet/sfc/siena/ethtool.c           |   3 +-
 drivers/net/ethernet/sfc/siena/ethtool_common.c    | 115 +++++++++++----------
 drivers/net/ethernet/sfc/siena/ethtool_common.h    |   6 --
 include/linux/ethtool.h                            |  26 ++---
 net/ethtool/ioctl.c                                |  27 ++---
 net/ethtool/rss.c                                  |   9 +-
 14 files changed, 205 insertions(+), 300 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/ethernet/intel/ice/ice_ethtool.c b/drivers/net/ethernet/intel/ice/ice_ethtool.c
index 70cbd8092aea..c77605b42833 100644
--- a/drivers/net/ethernet/intel/ice/ice_ethtool.c
+++ b/drivers/net/ethernet/intel/ice/ice_ethtool.c
@@ -3195,11 +3195,18 @@ static u32 ice_get_rxfh_indir_size(struct net_device *netdev)
 	return np->vsi->rss_table_size;
 }
 
+/**
+ * ice_get_rxfh - get the Rx flow hash indirection table
+ * @netdev: network interface device structure
+ * @rxfh: pointer to param struct (indir, key, hfunc)
+ *
+ * Reads the indirection table directly from the hardware.
+ */
 static int
-ice_get_rxfh_context(struct net_device *netdev,
-		     struct ethtool_rxfh_param *rxfh, u32 rss_context)
+ice_get_rxfh(struct net_device *netdev, struct ethtool_rxfh_param *rxfh)
 {
 	struct ice_netdev_priv *np = netdev_priv(netdev);
+	u32 rss_context = rxfh->rss_context;
 	struct ice_vsi *vsi = np->vsi;
 	struct ice_pf *pf = vsi->back;
 	u16 qcount, offset;
@@ -3261,19 +3268,6 @@ out:
 	return err;
 }
 
-/**
- * ice_get_rxfh - get the Rx flow hash indirection table
- * @netdev: network interface device structure
- * @rxfh: pointer to param struct (indir, key, hfunc)
- *
- * Reads the indirection table directly from the hardware.
- */
-static int
-ice_get_rxfh(struct net_device *netdev, struct ethtool_rxfh_param *rxfh)
-{
-	return ice_get_rxfh_context(netdev, rxfh, 0);
-}
-
 /**
  * ice_set_rxfh - set the Rx flow hash indirection table
  * @netdev: network interface device structure
@@ -3298,6 +3292,9 @@ ice_set_rxfh(struct net_device *netdev, struct ethtool_rxfh_param *rxfh,
 	    rxfh->hfunc != ETH_RSS_HASH_TOP)
 		return -EOPNOTSUPP;
 
+	if (rxfh->rss_context)
+		return -EOPNOTSUPP;
+
 	if (!test_bit(ICE_FLAG_RSS_ENA, pf->flags)) {
 		/* RSS not supported return error here */
 		netdev_warn(netdev, "RSS is not configured on this VSI!\n");
@@ -4215,6 +4212,7 @@ ice_get_module_eeprom(struct net_device *netdev,
 }
 
 static const struct ethtool_ops ice_ethtool_ops = {
+	.cap_rss_ctx_supported  = true,
 	.supported_coalesce_params = ETHTOOL_COALESCE_USECS |
 				     ETHTOOL_COALESCE_USE_ADAPTIVE |
 				     ETHTOOL_COALESCE_RX_USECS_HIGH,
@@ -4248,7 +4246,6 @@ static const struct ethtool_ops ice_ethtool_ops = {
 	.set_pauseparam		= ice_set_pauseparam,
 	.get_rxfh_key_size	= ice_get_rxfh_key_size,
 	.get_rxfh_indir_size	= ice_get_rxfh_indir_size,
-	.get_rxfh_context	= ice_get_rxfh_context,
 	.get_rxfh		= ice_get_rxfh,
 	.set_rxfh		= ice_set_rxfh,
 	.get_channels		= ice_get_channels,
diff --git a/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c b/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c
index ceef48ddd26e..9193cf61de26 100644
--- a/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c
+++ b/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c
@@ -5638,47 +5638,7 @@ static int mvpp2_ethtool_get_rxfh(struct net_device *dev,
 				  struct ethtool_rxfh_param *rxfh)
 {
 	struct mvpp2_port *port = netdev_priv(dev);
-	int ret = 0;
-
-	if (!mvpp22_rss_is_supported(port))
-		return -EOPNOTSUPP;
-
-	if (rxfh->indir)
-		ret = mvpp22_port_rss_ctx_indir_get(port, 0, rxfh->indir);
-
-	rxfh->hfunc = ETH_RSS_HASH_CRC32;
-
-	return ret;
-}
-
-static int mvpp2_ethtool_set_rxfh(struct net_device *dev,
-				  struct ethtool_rxfh_param *rxfh,
-				  struct netlink_ext_ack *extack)
-{
-	struct mvpp2_port *port = netdev_priv(dev);
-	int ret = 0;
-
-	if (!mvpp22_rss_is_supported(port))
-		return -EOPNOTSUPP;
-
-	if (rxfh->hfunc != ETH_RSS_HASH_NO_CHANGE &&
-	    rxfh->hfunc != ETH_RSS_HASH_CRC32)
-		return -EOPNOTSUPP;
-
-	if (rxfh->key)
-		return -EOPNOTSUPP;
-
-	if (rxfh->indir)
-		ret = mvpp22_port_rss_ctx_indir_set(port, 0, rxfh->indir);
-
-	return ret;
-}
-
-static int mvpp2_ethtool_get_rxfh_context(struct net_device *dev,
-					  struct ethtool_rxfh_param *rxfh,
-					  u32 rss_context)
-{
-	struct mvpp2_port *port = netdev_priv(dev);
+	u32 rss_context = rxfh->rss_context;
 	int ret = 0;
 
 	if (!mvpp22_rss_is_supported(port))
@@ -5695,12 +5655,13 @@ static int mvpp2_ethtool_get_rxfh_context(struct net_device *dev,
 	return ret;
 }
 
-static int mvpp2_ethtool_set_rxfh_context(struct net_device *dev,
-					  struct ethtool_rxfh_param *rxfh,
-					  u32 *rss_context, bool delete)
+static int mvpp2_ethtool_set_rxfh(struct net_device *dev,
+				  struct ethtool_rxfh_param *rxfh,
+				  struct netlink_ext_ack *extack)
 {
 	struct mvpp2_port *port = netdev_priv(dev);
-	int ret;
+	u32 *rss_context = &rxfh->rss_context;
+	int ret = 0;
 
 	if (!mvpp22_rss_is_supported(port))
 		return -EOPNOTSUPP;
@@ -5712,7 +5673,7 @@ static int mvpp2_ethtool_set_rxfh_context(struct net_device *dev,
 	if (rxfh->key)
 		return -EOPNOTSUPP;
 
-	if (delete)
+	if (*rss_context && rxfh->rss_delete)
 		return mvpp22_port_rss_ctx_delete(port, *rss_context);
 
 	if (*rss_context == ETH_RXFH_CONTEXT_ALLOC) {
@@ -5721,8 +5682,13 @@ static int mvpp2_ethtool_set_rxfh_context(struct net_device *dev,
 			return ret;
 	}
 
-	return mvpp22_port_rss_ctx_indir_set(port, *rss_context, rxfh->indir);
+	if (rxfh->indir)
+		ret = mvpp22_port_rss_ctx_indir_set(port, *rss_context,
+						    rxfh->indir);
+
+	return ret;
 }
+
 /* Device ops */
 
 static const struct net_device_ops mvpp2_netdev_ops = {
@@ -5742,6 +5708,7 @@ static const struct net_device_ops mvpp2_netdev_ops = {
 };
 
 static const struct ethtool_ops mvpp2_eth_tool_ops = {
+	.cap_rss_ctx_supported	= true,
 	.supported_coalesce_params = ETHTOOL_COALESCE_USECS |
 				     ETHTOOL_COALESCE_MAX_FRAMES,
 	.nway_reset		= mvpp2_ethtool_nway_reset,
@@ -5764,8 +5731,6 @@ static const struct ethtool_ops mvpp2_eth_tool_ops = {
 	.get_rxfh_indir_size	= mvpp2_ethtool_get_rxfh_indir_size,
 	.get_rxfh		= mvpp2_ethtool_get_rxfh,
 	.set_rxfh		= mvpp2_ethtool_set_rxfh,
-	.get_rxfh_context	= mvpp2_ethtool_get_rxfh_context,
-	.set_rxfh_context	= mvpp2_ethtool_set_rxfh_context,
 };
 
 /* Used for PPv2.1, or PPv2.2 with the old Device Tree binding that
diff --git a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_ethtool.c b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_ethtool.c
index a2c182d7bfc4..2928898c7f8d 100644
--- a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_ethtool.c
+++ b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_ethtool.c
@@ -835,11 +835,12 @@ static int otx2_rss_ctx_create(struct otx2_nic *pfvf,
 	return 0;
 }
 
-/* RSS context configuration */
-static int otx2_set_rxfh_context(struct net_device *dev,
-				 struct ethtool_rxfh_param *rxfh,
-				 u32 *rss_context, bool delete)
+/* Configure RSS table and hash key */
+static int otx2_set_rxfh(struct net_device *dev,
+			 struct ethtool_rxfh_param *rxfh,
+			 struct netlink_ext_ack *extack)
 {
+	u32 rss_context = DEFAULT_RSS_CONTEXT_GROUP;
 	struct otx2_nic *pfvf = netdev_priv(dev);
 	struct otx2_rss_ctx *rss_ctx;
 	struct otx2_rss_info *rss;
@@ -849,8 +850,11 @@ static int otx2_set_rxfh_context(struct net_device *dev,
 	    rxfh->hfunc != ETH_RSS_HASH_TOP)
 		return -EOPNOTSUPP;
 
-	if (*rss_context != ETH_RXFH_CONTEXT_ALLOC &&
-	    *rss_context >= MAX_RSS_GROUPS)
+	if (rxfh->rss_context)
+		rss_context = rxfh->rss_context;
+
+	if (rss_context != ETH_RXFH_CONTEXT_ALLOC &&
+	    rss_context >= MAX_RSS_GROUPS)
 		return -EINVAL;
 
 	rss = &pfvf->hw.rss_info;
@@ -864,28 +868,30 @@ static int otx2_set_rxfh_context(struct net_device *dev,
 		memcpy(rss->key, rxfh->key, sizeof(rss->key));
 		otx2_set_rss_key(pfvf);
 	}
-	if (delete)
-		return otx2_rss_ctx_delete(pfvf, *rss_context);
+	if (rxfh->rss_delete)
+		return otx2_rss_ctx_delete(pfvf, rss_context);
 
-	if (*rss_context == ETH_RXFH_CONTEXT_ALLOC) {
-		ret = otx2_rss_ctx_create(pfvf, rss_context);
+	if (rss_context == ETH_RXFH_CONTEXT_ALLOC) {
+		ret = otx2_rss_ctx_create(pfvf, &rss_context);
+		rxfh->rss_context = rss_context;
 		if (ret)
 			return ret;
 	}
 	if (rxfh->indir) {
-		rss_ctx = rss->rss_ctx[*rss_context];
+		rss_ctx = rss->rss_ctx[rss_context];
 		for (idx = 0; idx < rss->rss_size; idx++)
 			rss_ctx->ind_tbl[idx] = rxfh->indir[idx];
 	}
-	otx2_set_rss_table(pfvf, *rss_context);
+	otx2_set_rss_table(pfvf, rss_context);
 
 	return 0;
 }
 
-static int otx2_get_rxfh_context(struct net_device *dev,
-				 struct ethtool_rxfh_param *rxfh,
-				 u32 rss_context)
+/* Get RSS configuration */
+static int otx2_get_rxfh(struct net_device *dev,
+			 struct ethtool_rxfh_param *rxfh)
 {
+	u32 rss_context = DEFAULT_RSS_CONTEXT_GROUP;
 	struct otx2_nic *pfvf = netdev_priv(dev);
 	struct otx2_rss_ctx *rss_ctx;
 	struct otx2_rss_info *rss;
@@ -895,6 +901,8 @@ static int otx2_get_rxfh_context(struct net_device *dev,
 	rss = &pfvf->hw.rss_info;
 
 	rxfh->hfunc = ETH_RSS_HASH_TOP;
+	if (rxfh->rss_context)
+		rss_context = rxfh->rss_context;
 
 	if (!indir)
 		return 0;
@@ -922,25 +930,6 @@ static int otx2_get_rxfh_context(struct net_device *dev,
 	return 0;
 }
 
-/* Get RSS configuration */
-static int otx2_get_rxfh(struct net_device *dev,
-			 struct ethtool_rxfh_param *rxfh)
-{
-	return otx2_get_rxfh_context(dev, rxfh,
-				     DEFAULT_RSS_CONTEXT_GROUP);
-}
-
-/* Configure RSS table and hash key */
-static int otx2_set_rxfh(struct net_device *dev,
-			 struct ethtool_rxfh_param *rxfh,
-			 struct netlink_ext_ack *extack)
-{
-
-	u32 rss_context = DEFAULT_RSS_CONTEXT_GROUP;
-
-	return otx2_set_rxfh_context(dev, rxfh, &rss_context, 0);
-}
-
 static u32 otx2_get_msglevel(struct net_device *netdev)
 {
 	struct otx2_nic *pfvf = netdev_priv(netdev);
@@ -1321,6 +1310,7 @@ static void otx2_get_fec_stats(struct net_device *netdev,
 }
 
 static const struct ethtool_ops otx2_ethtool_ops = {
+	.cap_rss_ctx_supported	= true,
 	.supported_coalesce_params = ETHTOOL_COALESCE_USECS |
 				     ETHTOOL_COALESCE_MAX_FRAMES |
 				     ETHTOOL_COALESCE_USE_ADAPTIVE,
@@ -1343,8 +1333,6 @@ static const struct ethtool_ops otx2_ethtool_ops = {
 	.get_rxfh_indir_size	= otx2_get_rxfh_indir_size,
 	.get_rxfh		= otx2_get_rxfh,
 	.set_rxfh		= otx2_set_rxfh,
-	.get_rxfh_context	= otx2_get_rxfh_context,
-	.set_rxfh_context	= otx2_set_rxfh_context,
 	.get_msglevel		= otx2_get_msglevel,
 	.set_msglevel		= otx2_set_msglevel,
 	.get_pauseparam		= otx2_get_pauseparam,
@@ -1444,6 +1432,7 @@ static int otx2vf_get_link_ksettings(struct net_device *netdev,
 }
 
 static const struct ethtool_ops otx2vf_ethtool_ops = {
+	.cap_rss_ctx_supported	= true,
 	.supported_coalesce_params = ETHTOOL_COALESCE_USECS |
 				     ETHTOOL_COALESCE_MAX_FRAMES |
 				     ETHTOOL_COALESCE_USE_ADAPTIVE,
@@ -1462,8 +1451,6 @@ static const struct ethtool_ops otx2vf_ethtool_ops = {
 	.get_rxfh_indir_size	= otx2_get_rxfh_indir_size,
 	.get_rxfh		= otx2_get_rxfh,
 	.set_rxfh		= otx2_set_rxfh,
-	.get_rxfh_context	= otx2_get_rxfh_context,
-	.set_rxfh_context	= otx2_set_rxfh_context,
 	.get_ringparam		= otx2_get_ringparam,
 	.set_ringparam		= otx2_set_ringparam,
 	.get_coalesce		= otx2_get_coalesce,
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c b/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c
index 110cb7973eae..0fe7ea88d567 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c
@@ -1262,11 +1262,10 @@ static u32 mlx5e_get_rxfh_indir_size(struct net_device *netdev)
 	return mlx5e_ethtool_get_rxfh_indir_size(priv);
 }
 
-static int mlx5e_get_rxfh_context(struct net_device *dev,
-				  struct ethtool_rxfh_param *rxfh,
-				  u32 rss_context)
+int mlx5e_get_rxfh(struct net_device *netdev, struct ethtool_rxfh_param *rxfh)
 {
-	struct mlx5e_priv *priv = netdev_priv(dev);
+	struct mlx5e_priv *priv = netdev_priv(netdev);
+	u32 rss_context = rxfh->rss_context;
 	int err;
 
 	mutex_lock(&priv->state_lock);
@@ -1276,16 +1275,16 @@ static int mlx5e_get_rxfh_context(struct net_device *dev,
 	return err;
 }
 
-static int mlx5e_set_rxfh_context(struct net_device *dev,
-				  struct ethtool_rxfh_param *rxfh,
-				  u32 *rss_context, bool delete)
+int mlx5e_set_rxfh(struct net_device *dev, struct ethtool_rxfh_param *rxfh,
+		   struct netlink_ext_ack *extack)
 {
 	struct mlx5e_priv *priv = netdev_priv(dev);
+	u32 *rss_context = &rxfh->rss_context;
 	u8 hfunc = rxfh->hfunc;
 	int err;
 
 	mutex_lock(&priv->state_lock);
-	if (delete) {
+	if (*rss_context && rxfh->rss_delete) {
 		err = mlx5e_rx_res_rss_destroy(priv->rx_res, *rss_context);
 		goto unlock;
 	}
@@ -1307,25 +1306,6 @@ unlock:
 	return err;
 }
 
-int mlx5e_get_rxfh(struct net_device *netdev, struct ethtool_rxfh_param *rxfh)
-{
-	return mlx5e_get_rxfh_context(netdev, rxfh, 0);
-}
-
-int mlx5e_set_rxfh(struct net_device *dev, struct ethtool_rxfh_param *rxfh,
-		   struct netlink_ext_ack *extack)
-{
-	struct mlx5e_priv *priv = netdev_priv(dev);
-	u8 hfunc = rxfh->hfunc;
-	int err;
-
-	mutex_lock(&priv->state_lock);
-	err = mlx5e_rx_res_rss_set_rxfh(priv->rx_res, 0, rxfh->indir, rxfh->key,
-					hfunc == ETH_RSS_HASH_NO_CHANGE ? NULL : &hfunc);
-	mutex_unlock(&priv->state_lock);
-	return err;
-}
-
 #define MLX5E_PFC_PREVEN_AUTO_TOUT_MSEC		100
 #define MLX5E_PFC_PREVEN_TOUT_MAX_MSEC		8000
 #define MLX5E_PFC_PREVEN_MINOR_PRECENT		85
@@ -2402,6 +2382,7 @@ static void mlx5e_get_rmon_stats(struct net_device *netdev,
 }
 
 const struct ethtool_ops mlx5e_ethtool_ops = {
+	.cap_rss_ctx_supported	= true,
 	.supported_coalesce_params = ETHTOOL_COALESCE_USECS |
 				     ETHTOOL_COALESCE_MAX_FRAMES |
 				     ETHTOOL_COALESCE_USE_ADAPTIVE |
@@ -2424,8 +2405,6 @@ const struct ethtool_ops mlx5e_ethtool_ops = {
 	.get_rxfh_indir_size = mlx5e_get_rxfh_indir_size,
 	.get_rxfh          = mlx5e_get_rxfh,
 	.set_rxfh          = mlx5e_set_rxfh,
-	.get_rxfh_context  = mlx5e_get_rxfh_context,
-	.set_rxfh_context  = mlx5e_set_rxfh_context,
 	.get_rxnfc         = mlx5e_get_rxnfc,
 	.set_rxnfc         = mlx5e_set_rxnfc,
 	.get_tunable       = mlx5e_get_tunable,
diff --git a/drivers/net/ethernet/sfc/ef100_ethtool.c b/drivers/net/ethernet/sfc/ef100_ethtool.c
index 702abbe59b76..cf55202b3a7b 100644
--- a/drivers/net/ethernet/sfc/ef100_ethtool.c
+++ b/drivers/net/ethernet/sfc/ef100_ethtool.c
@@ -37,6 +37,7 @@ ef100_ethtool_get_ringparam(struct net_device *net_dev,
 /*	Ethtool options available
  */
 const struct ethtool_ops ef100_ethtool_ops = {
+	.cap_rss_ctx_supported	= true,
 	.get_drvinfo		= efx_ethtool_get_drvinfo,
 	.get_msglevel		= efx_ethtool_get_msglevel,
 	.set_msglevel		= efx_ethtool_set_msglevel,
@@ -60,8 +61,6 @@ const struct ethtool_ops ef100_ethtool_ops = {
 	.get_rxfh_key_size	= efx_ethtool_get_rxfh_key_size,
 	.get_rxfh		= efx_ethtool_get_rxfh,
 	.set_rxfh		= efx_ethtool_set_rxfh,
-	.get_rxfh_context	= efx_ethtool_get_rxfh_context,
-	.set_rxfh_context	= efx_ethtool_set_rxfh_context,
 
 	.get_module_info	= efx_ethtool_get_module_info,
 	.get_module_eeprom	= efx_ethtool_get_module_eeprom,
diff --git a/drivers/net/ethernet/sfc/ethtool.c b/drivers/net/ethernet/sfc/ethtool.c
index 364323599f7b..37c69c8d90b1 100644
--- a/drivers/net/ethernet/sfc/ethtool.c
+++ b/drivers/net/ethernet/sfc/ethtool.c
@@ -240,6 +240,7 @@ static int efx_ethtool_get_ts_info(struct net_device *net_dev,
 }
 
 const struct ethtool_ops efx_ethtool_ops = {
+	.cap_rss_ctx_supported	= true,
 	.supported_coalesce_params = ETHTOOL_COALESCE_USECS |
 				     ETHTOOL_COALESCE_USECS_IRQ |
 				     ETHTOOL_COALESCE_USE_ADAPTIVE_RX,
@@ -269,8 +270,6 @@ const struct ethtool_ops efx_ethtool_ops = {
 	.get_rxfh_key_size	= efx_ethtool_get_rxfh_key_size,
 	.get_rxfh		= efx_ethtool_get_rxfh,
 	.set_rxfh		= efx_ethtool_set_rxfh,
-	.get_rxfh_context	= efx_ethtool_get_rxfh_context,
-	.set_rxfh_context	= efx_ethtool_set_rxfh_context,
 	.get_ts_info		= efx_ethtool_get_ts_info,
 	.get_module_info	= efx_ethtool_get_module_info,
 	.get_module_eeprom	= efx_ethtool_get_module_eeprom,
diff --git a/drivers/net/ethernet/sfc/ethtool_common.c b/drivers/net/ethernet/sfc/ethtool_common.c
index e32ae9038d9e..7d5e5db4eac5 100644
--- a/drivers/net/ethernet/sfc/ethtool_common.c
+++ b/drivers/net/ethernet/sfc/ethtool_common.c
@@ -1163,52 +1163,8 @@ u32 efx_ethtool_get_rxfh_key_size(struct net_device *net_dev)
 	return efx->type->rx_hash_key_size;
 }
 
-int efx_ethtool_get_rxfh(struct net_device *net_dev,
-			 struct ethtool_rxfh_param *rxfh)
-{
-	struct efx_nic *efx = efx_netdev_priv(net_dev);
-	int rc;
-
-	rc = efx->type->rx_pull_rss_config(efx);
-	if (rc)
-		return rc;
-
-	rxfh->hfunc = ETH_RSS_HASH_TOP;
-	if (rxfh->indir)
-		memcpy(rxfh->indir, efx->rss_context.rx_indir_table,
-		       sizeof(efx->rss_context.rx_indir_table));
-	if (rxfh->key)
-		memcpy(rxfh->key, efx->rss_context.rx_hash_key,
-		       efx->type->rx_hash_key_size);
-	return 0;
-}
-
-int efx_ethtool_set_rxfh(struct net_device *net_dev,
-			 struct ethtool_rxfh_param *rxfh,
-			 struct netlink_ext_ack *extack)
-{
-	struct efx_nic *efx = efx_netdev_priv(net_dev);
-	u32 *indir = rxfh->indir;
-	u8 *key = rxfh->key;
-
-	/* Hash function is Toeplitz, cannot be changed */
-	if (rxfh->hfunc != ETH_RSS_HASH_NO_CHANGE &&
-	    rxfh->hfunc != ETH_RSS_HASH_TOP)
-		return -EOPNOTSUPP;
-	if (!indir && !key)
-		return 0;
-
-	if (!key)
-		key = efx->rss_context.rx_hash_key;
-	if (!indir)
-		indir = efx->rss_context.rx_indir_table;
-
-	return efx->type->rx_push_rss_config(efx, true, indir, key);
-}
-
-int efx_ethtool_get_rxfh_context(struct net_device *net_dev,
-				 struct ethtool_rxfh_param *rxfh,
-				 u32 rss_context)
+static int efx_ethtool_get_rxfh_context(struct net_device *net_dev,
+					struct ethtool_rxfh_param *rxfh)
 {
 	struct efx_nic *efx = efx_netdev_priv(net_dev);
 	struct efx_rss_context *ctx;
@@ -1218,7 +1174,7 @@ int efx_ethtool_get_rxfh_context(struct net_device *net_dev,
 		return -EOPNOTSUPP;
 
 	mutex_lock(&efx->rss_lock);
-	ctx = efx_find_rss_context_entry(efx, rss_context);
+	ctx = efx_find_rss_context_entry(efx, rxfh->rss_context);
 	if (!ctx) {
 		rc = -ENOENT;
 		goto out_unlock;
@@ -1239,11 +1195,35 @@ out_unlock:
 	return rc;
 }
 
-int efx_ethtool_set_rxfh_context(struct net_device *net_dev,
-				 struct ethtool_rxfh_param *rxfh,
-				 u32 *rss_context, bool delete)
+int efx_ethtool_get_rxfh(struct net_device *net_dev,
+			 struct ethtool_rxfh_param *rxfh)
 {
 	struct efx_nic *efx = efx_netdev_priv(net_dev);
+	int rc;
+
+	if (rxfh->rss_context)
+		return efx_ethtool_get_rxfh_context(net_dev, rxfh);
+
+	rc = efx->type->rx_pull_rss_config(efx);
+	if (rc)
+		return rc;
+
+	rxfh->hfunc = ETH_RSS_HASH_TOP;
+	if (rxfh->indir)
+		memcpy(rxfh->indir, efx->rss_context.rx_indir_table,
+		       sizeof(efx->rss_context.rx_indir_table));
+	if (rxfh->key)
+		memcpy(rxfh->key, efx->rss_context.rx_hash_key,
+		       efx->type->rx_hash_key_size);
+	return 0;
+}
+
+static int efx_ethtool_set_rxfh_context(struct net_device *net_dev,
+					struct ethtool_rxfh_param *rxfh,
+					struct netlink_ext_ack *extack)
+{
+	struct efx_nic *efx = efx_netdev_priv(net_dev);
+	u32 *rss_context = &rxfh->rss_context;
 	struct efx_rss_context *ctx;
 	u32 *indir = rxfh->indir;
 	bool allocated = false;
@@ -1252,15 +1232,11 @@ int efx_ethtool_set_rxfh_context(struct net_device *net_dev,
 
 	if (!efx->type->rx_push_rss_context_config)
 		return -EOPNOTSUPP;
-	/* Hash function is Toeplitz, cannot be changed */
-	if (rxfh->hfunc != ETH_RSS_HASH_NO_CHANGE &&
-	    rxfh->hfunc != ETH_RSS_HASH_TOP)
-		return -EOPNOTSUPP;
 
 	mutex_lock(&efx->rss_lock);
 
 	if (*rss_context == ETH_RXFH_CONTEXT_ALLOC) {
-		if (delete) {
+		if (rxfh->rss_delete) {
 			/* alloc + delete == Nothing to do */
 			rc = -EINVAL;
 			goto out_unlock;
@@ -1283,7 +1259,7 @@ int efx_ethtool_set_rxfh_context(struct net_device *net_dev,
 		}
 	}
 
-	if (delete) {
+	if (rxfh->rss_delete) {
 		/* delete this context */
 		rc = efx->type->rx_push_rss_context_config(efx, ctx, NULL, NULL);
 		if (!rc)
@@ -1306,6 +1282,33 @@ out_unlock:
 	return rc;
 }
 
+int efx_ethtool_set_rxfh(struct net_device *net_dev,
+			 struct ethtool_rxfh_param *rxfh,
+			 struct netlink_ext_ack *extack)
+{
+	struct efx_nic *efx = efx_netdev_priv(net_dev);
+	u32 *indir = rxfh->indir;
+	u8 *key = rxfh->key;
+
+	/* Hash function is Toeplitz, cannot be changed */
+	if (rxfh->hfunc != ETH_RSS_HASH_NO_CHANGE &&
+	    rxfh->hfunc != ETH_RSS_HASH_TOP)
+		return -EOPNOTSUPP;
+
+	if (rxfh->rss_context)
+		return efx_ethtool_set_rxfh_context(net_dev, rxfh, extack);
+
+	if (!indir && !key)
+		return 0;
+
+	if (!key)
+		key = efx->rss_context.rx_hash_key;
+	if (!indir)
+		indir = efx->rss_context.rx_indir_table;
+
+	return efx->type->rx_push_rss_config(efx, true, indir, key);
+}
+
 int efx_ethtool_reset(struct net_device *net_dev, u32 *flags)
 {
 	struct efx_nic *efx = efx_netdev_priv(net_dev);
diff --git a/drivers/net/ethernet/sfc/ethtool_common.h b/drivers/net/ethernet/sfc/ethtool_common.h
index 384318ee1127..a680e5980213 100644
--- a/drivers/net/ethernet/sfc/ethtool_common.h
+++ b/drivers/net/ethernet/sfc/ethtool_common.h
@@ -49,12 +49,6 @@ int efx_ethtool_get_rxfh(struct net_device *net_dev,
 int efx_ethtool_set_rxfh(struct net_device *net_dev,
 			 struct ethtool_rxfh_param *rxfh,
 			 struct netlink_ext_ack *extack);
-int efx_ethtool_get_rxfh_context(struct net_device *net_dev,
-				 struct ethtool_rxfh_param *rxfh,
-				 u32 rss_context);
-int efx_ethtool_set_rxfh_context(struct net_device *net_dev,
-				 struct ethtool_rxfh_param *rxfh,
-				 u32 *rss_context, bool delete);
 int efx_ethtool_reset(struct net_device *net_dev, u32 *flags);
 int efx_ethtool_get_module_eeprom(struct net_device *net_dev,
 				  struct ethtool_eeprom *ee,
diff --git a/drivers/net/ethernet/sfc/siena/ethtool.c b/drivers/net/ethernet/sfc/siena/ethtool.c
index e4ec589216c1..14dd3893bdef 100644
--- a/drivers/net/ethernet/sfc/siena/ethtool.c
+++ b/drivers/net/ethernet/sfc/siena/ethtool.c
@@ -240,6 +240,7 @@ static int efx_ethtool_get_ts_info(struct net_device *net_dev,
 }
 
 const struct ethtool_ops efx_siena_ethtool_ops = {
+	.cap_rss_ctx_supported	= true,
 	.supported_coalesce_params = ETHTOOL_COALESCE_USECS |
 				     ETHTOOL_COALESCE_USECS_IRQ |
 				     ETHTOOL_COALESCE_USE_ADAPTIVE_RX,
@@ -269,8 +270,6 @@ const struct ethtool_ops efx_siena_ethtool_ops = {
 	.get_rxfh_key_size	= efx_siena_ethtool_get_rxfh_key_size,
 	.get_rxfh		= efx_siena_ethtool_get_rxfh,
 	.set_rxfh		= efx_siena_ethtool_set_rxfh,
-	.get_rxfh_context	= efx_siena_ethtool_get_rxfh_context,
-	.set_rxfh_context	= efx_siena_ethtool_set_rxfh_context,
 	.get_ts_info		= efx_ethtool_get_ts_info,
 	.get_module_info	= efx_siena_ethtool_get_module_info,
 	.get_module_eeprom	= efx_siena_ethtool_get_module_eeprom,
diff --git a/drivers/net/ethernet/sfc/siena/ethtool_common.c b/drivers/net/ethernet/sfc/siena/ethtool_common.c
index 20b64e3521cb..5f0a8127e967 100644
--- a/drivers/net/ethernet/sfc/siena/ethtool_common.c
+++ b/drivers/net/ethernet/sfc/siena/ethtool_common.c
@@ -1164,52 +1164,8 @@ u32 efx_siena_ethtool_get_rxfh_key_size(struct net_device *net_dev)
 	return efx->type->rx_hash_key_size;
 }
 
-int efx_siena_ethtool_get_rxfh(struct net_device *net_dev,
-			       struct ethtool_rxfh_param *rxfh)
-{
-	struct efx_nic *efx = netdev_priv(net_dev);
-	int rc;
-
-	rc = efx->type->rx_pull_rss_config(efx);
-	if (rc)
-		return rc;
-
-	rxfh->hfunc = ETH_RSS_HASH_TOP;
-	if (rxfh->indir)
-		memcpy(rxfh->indir, efx->rss_context.rx_indir_table,
-		       sizeof(efx->rss_context.rx_indir_table));
-	if (rxfh->key)
-		memcpy(rxfh->key, efx->rss_context.rx_hash_key,
-		       efx->type->rx_hash_key_size);
-	return 0;
-}
-
-int efx_siena_ethtool_set_rxfh(struct net_device *net_dev,
-			       struct ethtool_rxfh_param *rxfh,
-			       struct netlink_ext_ack *extack)
-{
-	struct efx_nic *efx = netdev_priv(net_dev);
-	u32 *indir = rxfh->indir;
-	u8 *key = rxfh->key;
-
-	/* Hash function is Toeplitz, cannot be changed */
-	if (rxfh->hfunc != ETH_RSS_HASH_NO_CHANGE &&
-	    rxfh->hfunc != ETH_RSS_HASH_TOP)
-		return -EOPNOTSUPP;
-	if (!indir && !key)
-		return 0;
-
-	if (!key)
-		key = efx->rss_context.rx_hash_key;
-	if (!indir)
-		indir = efx->rss_context.rx_indir_table;
-
-	return efx->type->rx_push_rss_config(efx, true, indir, key);
-}
-
-int efx_siena_ethtool_get_rxfh_context(struct net_device *net_dev,
-				       struct ethtool_rxfh_param *rxfh,
-				       u32 rss_context)
+static int efx_siena_ethtool_get_rxfh_context(struct net_device *net_dev,
+					      struct ethtool_rxfh_param *rxfh)
 {
 	struct efx_nic *efx = netdev_priv(net_dev);
 	struct efx_rss_context *ctx;
@@ -1219,7 +1175,7 @@ int efx_siena_ethtool_get_rxfh_context(struct net_device *net_dev,
 		return -EOPNOTSUPP;
 
 	mutex_lock(&efx->rss_lock);
-	ctx = efx_siena_find_rss_context_entry(efx, rss_context);
+	ctx = efx_siena_find_rss_context_entry(efx, rxfh->rss_context);
 	if (!ctx) {
 		rc = -ENOENT;
 		goto out_unlock;
@@ -1240,11 +1196,35 @@ out_unlock:
 	return rc;
 }
 
-int efx_siena_ethtool_set_rxfh_context(struct net_device *net_dev,
-				       struct ethtool_rxfh_param *rxfh,
-				       u32 *rss_context, bool delete)
+int efx_siena_ethtool_get_rxfh(struct net_device *net_dev,
+			       struct ethtool_rxfh_param *rxfh)
 {
 	struct efx_nic *efx = netdev_priv(net_dev);
+	int rc;
+
+	if (rxfh->rss_context)
+		return efx_siena_ethtool_get_rxfh_context(net_dev, rxfh);
+
+	rc = efx->type->rx_pull_rss_config(efx);
+	if (rc)
+		return rc;
+
+	rxfh->hfunc = ETH_RSS_HASH_TOP;
+	if (rxfh->indir)
+		memcpy(rxfh->indir, efx->rss_context.rx_indir_table,
+		       sizeof(efx->rss_context.rx_indir_table));
+	if (rxfh->key)
+		memcpy(rxfh->key, efx->rss_context.rx_hash_key,
+		       efx->type->rx_hash_key_size);
+	return 0;
+}
+
+static int efx_siena_ethtool_set_rxfh_context(struct net_device *net_dev,
+					      struct ethtool_rxfh_param *rxfh,
+					      struct netlink_ext_ack *extack)
+{
+	struct efx_nic *efx = netdev_priv(net_dev);
+	u32 *rss_context = &rxfh->rss_context;
 	struct efx_rss_context *ctx;
 	u32 *indir = rxfh->indir;
 	bool allocated = false;
@@ -1253,15 +1233,11 @@ int efx_siena_ethtool_set_rxfh_context(struct net_device *net_dev,
 
 	if (!efx->type->rx_push_rss_context_config)
 		return -EOPNOTSUPP;
-	/* Hash function is Toeplitz, cannot be changed */
-	if (rxfh->hfunc != ETH_RSS_HASH_NO_CHANGE &&
-	    rxfh->hfunc != ETH_RSS_HASH_TOP)
-		return -EOPNOTSUPP;
 
 	mutex_lock(&efx->rss_lock);
 
 	if (*rss_context == ETH_RXFH_CONTEXT_ALLOC) {
-		if (delete) {
+		if (rxfh->rss_delete) {
 			/* alloc + delete == Nothing to do */
 			rc = -EINVAL;
 			goto out_unlock;
@@ -1284,7 +1260,7 @@ int efx_siena_ethtool_set_rxfh_context(struct net_device *net_dev,
 		}
 	}
 
-	if (delete) {
+	if (rxfh->rss_delete) {
 		/* delete this context */
 		rc = efx->type->rx_push_rss_context_config(efx, ctx, NULL, NULL);
 		if (!rc)
@@ -1307,6 +1283,33 @@ out_unlock:
 	return rc;
 }
 
+int efx_siena_ethtool_set_rxfh(struct net_device *net_dev,
+			       struct ethtool_rxfh_param *rxfh,
+			       struct netlink_ext_ack *extack)
+{
+	struct efx_nic *efx = netdev_priv(net_dev);
+	u32 *indir = rxfh->indir;
+	u8 *key = rxfh->key;
+
+	/* Hash function is Toeplitz, cannot be changed */
+	if (rxfh->hfunc != ETH_RSS_HASH_NO_CHANGE &&
+	    rxfh->hfunc != ETH_RSS_HASH_TOP)
+		return -EOPNOTSUPP;
+
+	if (rxfh->rss_context)
+		efx_siena_ethtool_set_rxfh_context(net_dev, rxfh, extack);
+
+	if (!indir && !key)
+		return 0;
+
+	if (!key)
+		key = efx->rss_context.rx_hash_key;
+	if (!indir)
+		indir = efx->rss_context.rx_indir_table;
+
+	return efx->type->rx_push_rss_config(efx, true, indir, key);
+}
+
 int efx_siena_ethtool_reset(struct net_device *net_dev, u32 *flags)
 {
 	struct efx_nic *efx = netdev_priv(net_dev);
diff --git a/drivers/net/ethernet/sfc/siena/ethtool_common.h b/drivers/net/ethernet/sfc/siena/ethtool_common.h
index 2f892f8eff70..d674bab0f65b 100644
--- a/drivers/net/ethernet/sfc/siena/ethtool_common.h
+++ b/drivers/net/ethernet/sfc/siena/ethtool_common.h
@@ -46,12 +46,6 @@ int efx_siena_ethtool_get_rxfh(struct net_device *net_dev,
 int efx_siena_ethtool_set_rxfh(struct net_device *net_dev,
 			       struct ethtool_rxfh_param *rxfh,
 			       struct netlink_ext_ack *extack);
-int efx_siena_ethtool_get_rxfh_context(struct net_device *net_dev,
-				       struct ethtool_rxfh_param *rxfh,
-				       u32 rss_context);
-int efx_siena_ethtool_set_rxfh_context(struct net_device *net_dev,
-				       struct ethtool_rxfh_param *rxfh,
-				       u32 *rss_context, bool delete);
 int efx_siena_ethtool_reset(struct net_device *net_dev, u32 *flags);
 int efx_siena_ethtool_get_module_eeprom(struct net_device *net_dev,
 					struct ethtool_eeprom *ee,
diff --git a/include/linux/ethtool.h b/include/linux/ethtool.h
index 3ab2b6a90419..66fe254c3e51 100644
--- a/include/linux/ethtool.h
+++ b/include/linux/ethtool.h
@@ -609,6 +609,12 @@ struct ethtool_mm_stats {
  *	which may be zero.  On GET (read from the driver), the size of the
  *	hardware hash key.
  * @key: The hash key of size @key_size bytes.
+ * @rss_context: RSS context identifier.  Context 0 is the default for normal
+ *	traffic; other contexts can be referenced as the destination for RX flow
+ *	classification rules.  On SET, %ETH_RXFH_CONTEXT_ALLOC is used
+ *	to allocate a new RSS context; on return this field will
+ *	contain the ID of the newly allocated context.
+ * @rss_delete: Set to non-ZERO to remove the @rss_context context.
  */
 struct ethtool_rxfh_param {
 	u8	hfunc;
@@ -616,12 +622,16 @@ struct ethtool_rxfh_param {
 	u32	*indir;
 	u32	key_size;
 	u8	*key;
+	u32	rss_context;
+	u8	rss_delete;
 };
 
 /**
  * struct ethtool_ops - optional netdev operations
  * @cap_link_lanes_supported: indicates if the driver supports lanes
  *	parameter.
+ * @cap_rss_ctx_supported: indicates if the driver supports RSS
+ *	contexts.
  * @supported_coalesce_params: supported types of interrupt coalescing.
  * @supported_ring_params: supported ring params.
  * @get_drvinfo: Report driver/device information. Modern drivers no
@@ -718,15 +728,6 @@ struct ethtool_rxfh_param {
  *	will remain unchanged.
  *	Returns a negative error code or zero. An error code must be returned
  *	if at least one unsupported change was requested.
- * @get_rxfh_context: Get the contents of the RX flow hash indirection table,
- *	hash key, and/or hash function assiciated to the given rss context.
- *	Returns a negative error code or zero.
- * @set_rxfh_context: Create, remove and configure RSS contexts. Allows setting
- *	the contents of the RX flow hash indirection table, hash key, and/or
- *	hash function associated to the given context. Arguments which are set
- *	to %NULL or zero will remain unchanged.
- *	Returns a negative error code or zero. An error code must be returned
- *	if at least one unsupported change was requested.
  * @get_channels: Get number of channels.
  * @set_channels: Set number of channels.  Returns a negative error code or
  *	zero.
@@ -809,6 +810,7 @@ struct ethtool_rxfh_param {
  */
 struct ethtool_ops {
 	u32     cap_link_lanes_supported:1;
+	u32     cap_rss_ctx_supported:1;
 	u32	supported_coalesce_params;
 	u32	supported_ring_params;
 	void	(*get_drvinfo)(struct net_device *, struct ethtool_drvinfo *);
@@ -871,12 +873,6 @@ struct ethtool_ops {
 	int	(*get_rxfh)(struct net_device *, struct ethtool_rxfh_param *);
 	int	(*set_rxfh)(struct net_device *, struct ethtool_rxfh_param *,
 			    struct netlink_ext_ack *extack);
-	int	(*get_rxfh_context)(struct net_device *,
-				    struct ethtool_rxfh_param *,
-				    u32 rss_context);
-	int	(*set_rxfh_context)(struct net_device *,
-				    struct ethtool_rxfh_param *,
-				    u32 *rss_context, bool delete);
 	void	(*get_channels)(struct net_device *, struct ethtool_channels *);
 	int	(*set_channels)(struct net_device *, struct ethtool_channels *);
 	int	(*get_dump_flag)(struct net_device *, struct ethtool_dump *);
diff --git a/net/ethtool/ioctl.c b/net/ethtool/ioctl.c
index 5f49a105fc73..86e5fc64b711 100644
--- a/net/ethtool/ioctl.c
+++ b/net/ethtool/ioctl.c
@@ -1201,7 +1201,7 @@ static noinline_for_stack int ethtool_get_rxfh(struct net_device *dev,
 	if (rxfh.rsvd8[0] || rxfh.rsvd8[1] || rxfh.rsvd8[2] || rxfh.rsvd32)
 		return -EINVAL;
 	/* Most drivers don't handle rss_context, check it's 0 as well */
-	if (rxfh.rss_context && !ops->get_rxfh_context)
+	if (rxfh.rss_context && !ops->cap_rss_ctx_supported)
 		return -EOPNOTSUPP;
 
 	rxfh.indir_size = rxfh_dev.indir_size;
@@ -1225,11 +1225,9 @@ static noinline_for_stack int ethtool_get_rxfh(struct net_device *dev,
 	if (user_key_size)
 		rxfh_dev.key = rss_config + indir_bytes;
 
-	if (rxfh.rss_context)
-		ret = dev->ethtool_ops->get_rxfh_context(dev, &rxfh_dev,
-							 rxfh.rss_context);
-	else
-		ret = dev->ethtool_ops->get_rxfh(dev, &rxfh_dev);
+	rxfh_dev.rss_context = rxfh.rss_context;
+
+	ret = dev->ethtool_ops->get_rxfh(dev, &rxfh_dev);
 	if (ret)
 		goto out;
 
@@ -1257,7 +1255,6 @@ static noinline_for_stack int ethtool_set_rxfh(struct net_device *dev,
 	struct netlink_ext_ack *extack = NULL;
 	struct ethtool_rxnfc rx_rings;
 	struct ethtool_rxfh rxfh;
-	bool delete = false;
 	u32 indir_bytes = 0;
 	u8 *rss_config;
 	int ret;
@@ -1277,7 +1274,7 @@ static noinline_for_stack int ethtool_set_rxfh(struct net_device *dev,
 	if (rxfh.rsvd8[0] || rxfh.rsvd8[1] || rxfh.rsvd8[2] || rxfh.rsvd32)
 		return -EINVAL;
 	/* Most drivers don't handle rss_context, check it's 0 as well */
-	if (rxfh.rss_context && !ops->set_rxfh_context)
+	if (rxfh.rss_context && !ops->cap_rss_ctx_supported)
 		return -EOPNOTSUPP;
 
 	/* If either indir, hash key or function is valid, proceed further.
@@ -1327,7 +1324,7 @@ static noinline_for_stack int ethtool_set_rxfh(struct net_device *dev,
 			for (i = 0; i < dev_indir_size; i++)
 				indir[i] = ethtool_rxfh_indir_default(i, rx_rings.data);
 		} else {
-			delete = true;
+			rxfh_dev.rss_delete = true;
 		}
 	}
 
@@ -1343,21 +1340,17 @@ static noinline_for_stack int ethtool_set_rxfh(struct net_device *dev,
 	}
 
 	rxfh_dev.hfunc = rxfh.hfunc;
+	rxfh_dev.rss_context = rxfh.rss_context;
 
-	if (rxfh.rss_context)
-		ret = ops->set_rxfh_context(dev, &rxfh_dev,
-					    &rxfh.rss_context, delete);
-	else
-		ret = ops->set_rxfh(dev, &rxfh_dev, extack);
-
+	ret = ops->set_rxfh(dev, &rxfh_dev, extack);
 	if (ret)
 		goto out;
 
 	if (copy_to_user(useraddr + offsetof(struct ethtool_rxfh, rss_context),
-			 &rxfh.rss_context, sizeof(rxfh.rss_context)))
+			 &rxfh_dev.rss_context, sizeof(rxfh_dev.rss_context)))
 		ret = -EFAULT;
 
-	if (!rxfh.rss_context) {
+	if (!rxfh_dev.rss_context) {
 		/* indicate whether rxfh was set to default */
 		if (rxfh.indir_size == 0)
 			dev->priv_flags &= ~IFF_RXFH_CONFIGURED;
diff --git a/net/ethtool/rss.c b/net/ethtool/rss.c
index 56fae7d5c0f7..efc9f4409e40 100644
--- a/net/ethtool/rss.c
+++ b/net/ethtool/rss.c
@@ -59,7 +59,7 @@ rss_prepare_data(const struct ethnl_req_info *req_base,
 		return -EOPNOTSUPP;
 
 	/* Some drivers don't handle rss_context */
-	if (request->rss_context && !ops->get_rxfh_context)
+	if (request->rss_context && !ops->cap_rss_ctx_supported)
 		return -EOPNOTSUPP;
 
 	ret = ethnl_ops_begin(dev);
@@ -90,12 +90,9 @@ rss_prepare_data(const struct ethnl_req_info *req_base,
 	rxfh.indir = data->indir_table;
 	rxfh.key_size = data->hkey_size;
 	rxfh.key = data->hkey;
+	rxfh.rss_context = request->rss_context;
 
-	if (request->rss_context)
-		ret = ops->get_rxfh_context(dev, &rxfh, request->rss_context);
-	else
-		ret = ops->get_rxfh(dev, &rxfh);
-
+	ret = ops->get_rxfh(dev, &rxfh);
 	if (ret)
 		goto out_ops;
 
-- 
cgit v1.2.3


From 13e59344fb9d3c9d3acd138ae320b5b67b658694 Mon Sep 17 00:00:00 2001
From: Ahmed Zaki <ahmed.zaki@intel.com>
Date: Tue, 12 Dec 2023 17:33:16 -0700
Subject: net: ethtool: add support for symmetric-xor RSS hash

Symmetric RSS hash functions are beneficial in applications that monitor
both Tx and Rx packets of the same flow (IDS, software firewalls, ..etc).
Getting all traffic of the same flow on the same RX queue results in
higher CPU cache efficiency.

A NIC that supports "symmetric-xor" can achieve this RSS hash symmetry
by XORing the source and destination fields and pass the values to the
RSS hash algorithm.

The user may request RSS hash symmetry for a specific algorithm, via:

    # ethtool -X eth0 hfunc <hash_alg> symmetric-xor

or turn symmetry off (asymmetric) by:

    # ethtool -X eth0 hfunc <hash_alg>

The specific fields for each flow type should then be specified as usual
via:
    # ethtool -N|-U eth0 rx-flow-hash <flow_type> s|d|f|n

Reviewed-by: Wojciech Drewek <wojciech.drewek@intel.com>
Signed-off-by: Ahmed Zaki <ahmed.zaki@intel.com>
Link: https://lore.kernel.org/r/20231213003321.605376-4-ahmed.zaki@intel.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 Documentation/netlink/specs/ethtool.yaml     |  4 ++++
 Documentation/networking/ethtool-netlink.rst |  6 +++++-
 Documentation/networking/scaling.rst         | 15 ++++++++++++++
 include/linux/ethtool.h                      |  6 ++++++
 include/uapi/linux/ethtool.h                 | 13 +++++++++++-
 include/uapi/linux/ethtool_netlink.h         |  1 +
 net/ethtool/ioctl.c                          | 30 ++++++++++++++++++++++++----
 net/ethtool/rss.c                            |  5 +++++
 8 files changed, 74 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/netlink/specs/ethtool.yaml b/Documentation/netlink/specs/ethtool.yaml
index 5c7a65b009b4..197208f419dc 100644
--- a/Documentation/netlink/specs/ethtool.yaml
+++ b/Documentation/netlink/specs/ethtool.yaml
@@ -908,6 +908,9 @@ attribute-sets:
       -
         name: hkey
         type: binary
+      -
+        name: input_xfrm
+        type: u32
   -
     name: plca
     attributes:
@@ -1598,6 +1601,7 @@ operations:
             - hfunc
             - indir
             - hkey
+            - input_xfrm
       dump: *rss-get-op
     -
       name: plca-get-cfg
diff --git a/Documentation/networking/ethtool-netlink.rst b/Documentation/networking/ethtool-netlink.rst
index 6a49624a9cbf..d583d9abf2f8 100644
--- a/Documentation/networking/ethtool-netlink.rst
+++ b/Documentation/networking/ethtool-netlink.rst
@@ -1774,12 +1774,16 @@ Kernel response contents:
   ``ETHTOOL_A_RSS_HFUNC``              u32     RSS hash func
   ``ETHTOOL_A_RSS_INDIR``              binary  Indir table bytes
   ``ETHTOOL_A_RSS_HKEY``               binary  Hash key bytes
+  ``ETHTOOL_A_RSS_INPUT_XFRM``         u32     RSS input data transformation
 =====================================  ======  ==========================
 
 ETHTOOL_A_RSS_HFUNC attribute is bitmap indicating the hash function
 being used. Current supported options are toeplitz, xor or crc32.
-ETHTOOL_A_RSS_INDIR attribute returns RSS indrection table where each byte
+ETHTOOL_A_RSS_INDIR attribute returns RSS indirection table where each byte
 indicates queue number.
+ETHTOOL_A_RSS_INPUT_XFRM attribute is a bitmap indicating the type of
+transformation applied to the input protocol fields before given to the RSS
+hfunc. Current supported option is symmetric-xor.
 
 PLCA_GET_CFG
 ============
diff --git a/Documentation/networking/scaling.rst b/Documentation/networking/scaling.rst
index 03ae19a689fc..4eb50bcb9d42 100644
--- a/Documentation/networking/scaling.rst
+++ b/Documentation/networking/scaling.rst
@@ -44,6 +44,21 @@ by masking out the low order seven bits of the computed hash for the
 packet (usually a Toeplitz hash), taking this number as a key into the
 indirection table and reading the corresponding value.
 
+Some NICs support symmetric RSS hashing where, if the IP (source address,
+destination address) and TCP/UDP (source port, destination port) tuples
+are swapped, the computed hash is the same. This is beneficial in some
+applications that monitor TCP/IP flows (IDS, firewalls, ...etc) and need
+both directions of the flow to land on the same Rx queue (and CPU). The
+"Symmetric-XOR" is a type of RSS algorithms that achieves this hash
+symmetry by XORing the input source and destination fields of the IP
+and/or L4 protocols. This, however, results in reduced input entropy and
+could potentially be exploited. Specifically, the algorithm XORs the input
+as follows::
+
+    # (SRC_IP ^ DST_IP, SRC_IP ^ DST_IP, SRC_PORT ^ DST_PORT, SRC_PORT ^ DST_PORT)
+
+The result is then fed to the underlying RSS algorithm.
+
 Some advanced NICs allow steering packets to queues based on
 programmable filters. For example, webserver bound TCP port 80 packets
 can be directed to their own receive queue. Such “n-tuple” filters can
diff --git a/include/linux/ethtool.h b/include/linux/ethtool.h
index 66fe254c3e51..cfcd952a1d4f 100644
--- a/include/linux/ethtool.h
+++ b/include/linux/ethtool.h
@@ -615,6 +615,8 @@ struct ethtool_mm_stats {
  *	to allocate a new RSS context; on return this field will
  *	contain the ID of the newly allocated context.
  * @rss_delete: Set to non-ZERO to remove the @rss_context context.
+ * @input_xfrm: Defines how the input data is transformed. Valid values are one
+ *	of %RXH_XFRM_*.
  */
 struct ethtool_rxfh_param {
 	u8	hfunc;
@@ -624,6 +626,7 @@ struct ethtool_rxfh_param {
 	u8	*key;
 	u32	rss_context;
 	u8	rss_delete;
+	u8	input_xfrm;
 };
 
 /**
@@ -632,6 +635,8 @@ struct ethtool_rxfh_param {
  *	parameter.
  * @cap_rss_ctx_supported: indicates if the driver supports RSS
  *	contexts.
+ * @cap_rss_sym_xor_supported: indicates if the driver supports symmetric-xor
+ *	RSS.
  * @supported_coalesce_params: supported types of interrupt coalescing.
  * @supported_ring_params: supported ring params.
  * @get_drvinfo: Report driver/device information. Modern drivers no
@@ -811,6 +816,7 @@ struct ethtool_rxfh_param {
 struct ethtool_ops {
 	u32     cap_link_lanes_supported:1;
 	u32     cap_rss_ctx_supported:1;
+	u32	cap_rss_sym_xor_supported:1;
 	u32	supported_coalesce_params;
 	u32	supported_ring_params;
 	void	(*get_drvinfo)(struct net_device *, struct ethtool_drvinfo *);
diff --git a/include/uapi/linux/ethtool.h b/include/uapi/linux/ethtool.h
index f7fba0dc87e5..0787d561ace0 100644
--- a/include/uapi/linux/ethtool.h
+++ b/include/uapi/linux/ethtool.h
@@ -1266,6 +1266,8 @@ struct ethtool_rxfh_indir {
  *	hardware hash key.
  * @hfunc: Defines the current RSS hash function used by HW (or to be set to).
  *	Valid values are one of the %ETH_RSS_HASH_*.
+ * @input_xfrm: Defines how the input data is transformed. Valid values are one
+ *	of %RXH_XFRM_*.
  * @rsvd8: Reserved for future use; see the note on reserved space.
  * @rsvd32: Reserved for future use; see the note on reserved space.
  * @rss_config: RX ring/queue index for each hash value i.e., indirection table
@@ -1285,7 +1287,8 @@ struct ethtool_rxfh {
 	__u32   indir_size;
 	__u32   key_size;
 	__u8	hfunc;
-	__u8	rsvd8[3];
+	__u8	input_xfrm;
+	__u8	rsvd8[2];
 	__u32	rsvd32;
 	__u32   rss_config[];
 };
@@ -1992,6 +1995,14 @@ static inline int ethtool_validate_duplex(__u8 duplex)
 
 #define WOL_MODE_COUNT		8
 
+/* RSS hash function data
+ * XOR the corresponding source and destination fields of each specified
+ * protocol. Both copies of the XOR'ed fields are fed into the RSS and RXHASH
+ * calculation. Note that this XORing reduces the input set entropy and could
+ * be exploited to reduce the RSS queue spread.
+ */
+#define	RXH_XFRM_SYM_XOR	(1 << 0)
+
 /* L2-L4 network traffic flow types */
 #define	TCP_V4_FLOW	0x01	/* hash or spec (tcp_ip4_spec) */
 #define	UDP_V4_FLOW	0x02	/* hash or spec (udp_ip4_spec) */
diff --git a/include/uapi/linux/ethtool_netlink.h b/include/uapi/linux/ethtool_netlink.h
index 73e2c10dc2cc..3f89074aa06c 100644
--- a/include/uapi/linux/ethtool_netlink.h
+++ b/include/uapi/linux/ethtool_netlink.h
@@ -908,6 +908,7 @@ enum {
 	ETHTOOL_A_RSS_HFUNC,		/* u32 */
 	ETHTOOL_A_RSS_INDIR,		/* binary */
 	ETHTOOL_A_RSS_HKEY,		/* binary */
+	ETHTOOL_A_RSS_INPUT_XFRM,	/* u32 */
 
 	__ETHTOOL_A_RSS_CNT,
 	ETHTOOL_A_RSS_MAX = (__ETHTOOL_A_RSS_CNT - 1),
diff --git a/net/ethtool/ioctl.c b/net/ethtool/ioctl.c
index 86e5fc64b711..86d47425038b 100644
--- a/net/ethtool/ioctl.c
+++ b/net/ethtool/ioctl.c
@@ -972,18 +972,35 @@ static int ethtool_rxnfc_copy_to_user(void __user *useraddr,
 static noinline_for_stack int ethtool_set_rxnfc(struct net_device *dev,
 						u32 cmd, void __user *useraddr)
 {
+	const struct ethtool_ops *ops = dev->ethtool_ops;
+	struct ethtool_rxfh_param rxfh = {};
 	struct ethtool_rxnfc info;
 	size_t info_size = sizeof(info);
 	int rc;
 
-	if (!dev->ethtool_ops->set_rxnfc)
+	if (!ops->set_rxnfc || !ops->get_rxfh)
 		return -EOPNOTSUPP;
 
 	rc = ethtool_rxnfc_copy_struct(cmd, &info, &info_size, useraddr);
 	if (rc)
 		return rc;
 
-	rc = dev->ethtool_ops->set_rxnfc(dev, &info);
+	rc = ops->get_rxfh(dev, &rxfh);
+	if (rc)
+		return rc;
+
+	/* Sanity check: if symmetric-xor is set, then:
+	 * 1 - no other fields besides IP src/dst and/or L4 src/dst
+	 * 2 - If src is set, dst must also be set
+	 */
+	if ((rxfh.input_xfrm & RXH_XFRM_SYM_XOR) &&
+	    ((info.data & ~(RXH_IP_SRC | RXH_IP_DST |
+			    RXH_L4_B_0_1 | RXH_L4_B_2_3)) ||
+	     (!!(info.data & RXH_IP_SRC) ^ !!(info.data & RXH_IP_DST)) ||
+	     (!!(info.data & RXH_L4_B_0_1) ^ !!(info.data & RXH_L4_B_2_3))))
+		return -EINVAL;
+
+	rc = ops->set_rxnfc(dev, &info);
 	if (rc)
 		return rc;
 
@@ -1198,7 +1215,7 @@ static noinline_for_stack int ethtool_get_rxfh(struct net_device *dev,
 	user_key_size = rxfh.key_size;
 
 	/* Check that reserved fields are 0 for now */
-	if (rxfh.rsvd8[0] || rxfh.rsvd8[1] || rxfh.rsvd8[2] || rxfh.rsvd32)
+	if (rxfh.rsvd8[0] || rxfh.rsvd8[1] || rxfh.rsvd32)
 		return -EINVAL;
 	/* Most drivers don't handle rss_context, check it's 0 as well */
 	if (rxfh.rss_context && !ops->cap_rss_ctx_supported)
@@ -1271,11 +1288,15 @@ static noinline_for_stack int ethtool_set_rxfh(struct net_device *dev,
 		return -EFAULT;
 
 	/* Check that reserved fields are 0 for now */
-	if (rxfh.rsvd8[0] || rxfh.rsvd8[1] || rxfh.rsvd8[2] || rxfh.rsvd32)
+	if (rxfh.rsvd8[0] || rxfh.rsvd8[1] || rxfh.rsvd32)
 		return -EINVAL;
 	/* Most drivers don't handle rss_context, check it's 0 as well */
 	if (rxfh.rss_context && !ops->cap_rss_ctx_supported)
 		return -EOPNOTSUPP;
+	/* Check input data transformation capabilities */
+	if ((rxfh.input_xfrm & RXH_XFRM_SYM_XOR) &&
+	    !ops->cap_rss_sym_xor_supported)
+		return -EOPNOTSUPP;
 
 	/* If either indir, hash key or function is valid, proceed further.
 	 * Must request at least one change: indir size, hash key or function.
@@ -1341,6 +1362,7 @@ static noinline_for_stack int ethtool_set_rxfh(struct net_device *dev,
 
 	rxfh_dev.hfunc = rxfh.hfunc;
 	rxfh_dev.rss_context = rxfh.rss_context;
+	rxfh_dev.input_xfrm = rxfh.input_xfrm;
 
 	ret = ops->set_rxfh(dev, &rxfh_dev, extack);
 	if (ret)
diff --git a/net/ethtool/rss.c b/net/ethtool/rss.c
index efc9f4409e40..71679137eff2 100644
--- a/net/ethtool/rss.c
+++ b/net/ethtool/rss.c
@@ -13,6 +13,7 @@ struct rss_reply_data {
 	u32				indir_size;
 	u32				hkey_size;
 	u32				hfunc;
+	u32				input_xfrm;
 	u32				*indir_table;
 	u8				*hkey;
 };
@@ -97,6 +98,7 @@ rss_prepare_data(const struct ethnl_req_info *req_base,
 		goto out_ops;
 
 	data->hfunc = rxfh.hfunc;
+	data->input_xfrm = rxfh.input_xfrm;
 out_ops:
 	ethnl_ops_complete(dev);
 	return ret;
@@ -110,6 +112,7 @@ rss_reply_size(const struct ethnl_req_info *req_base,
 	int len;
 
 	len = nla_total_size(sizeof(u32)) +	/* _RSS_HFUNC */
+	      nla_total_size(sizeof(u32)) +	/* _RSS_INPUT_XFRM */
 	      nla_total_size(sizeof(u32) * data->indir_size) + /* _RSS_INDIR */
 	      nla_total_size(data->hkey_size);	/* _RSS_HKEY */
 
@@ -124,6 +127,8 @@ rss_fill_reply(struct sk_buff *skb, const struct ethnl_req_info *req_base,
 
 	if ((data->hfunc &&
 	     nla_put_u32(skb, ETHTOOL_A_RSS_HFUNC, data->hfunc)) ||
+	    (data->input_xfrm &&
+	     nla_put_u32(skb, ETHTOOL_A_RSS_INPUT_XFRM, data->input_xfrm)) ||
 	    (data->indir_size &&
 	     nla_put(skb, ETHTOOL_A_RSS_INDIR,
 		     sizeof(u32) * data->indir_size, data->indir_table)) ||
-- 
cgit v1.2.3


From dc6e44c9d6d68e8aa5de78d15f43f93145719b72 Mon Sep 17 00:00:00 2001
From: Qi Zhang <qi.z.zhang@intel.com>
Date: Tue, 12 Dec 2023 17:33:18 -0700
Subject: ice: refactor RSS configuration

Refactor the driver to use a communication data structure for RSS
config. To do so we introduce the new ice_rss_hash_cfg struct, and then
pass it as an argument to several functions.

Also introduce enum ice_rss_cfg_hdr_type to specify a more granular and
flexible RSS configuration:

ICE_RSS_OUTER_HEADERS - take outer layer as RSS input set
ICE_RSS_INNER_HEADERS - take inner layer as RSS input set
ICE_RSS_INNER_HEADERS_W_OUTER_IPV4 - take inner layer as RSS input set for
                                     packet with outer IPV4
ICE_RSS_INNER_HEADERS_W_OUTER_IPV6 - take inner layer as RSS input set for
                                     packet with outer IPV6
ICE_RSS_ANY_HEADERS - try with outer first then inner (same as the
		      behaviour without this change)

Finally, move the virtchnl_rss_algorithm enum to be with the other RSS
related structures in the virtchnl.h file.

There should be no functional change due to this patch.

Reviewed-by: Wojciech Drewek <wojciech.drewek@intel.com>
Signed-off-by: Qi Zhang <qi.z.zhang@intel.com>
Co-developed-by: Jesse Brandeburg <jesse.brandeburg@intel.com>
Signed-off-by: Jesse Brandeburg <jesse.brandeburg@intel.com>
Co-developed-by: Ahmed Zaki <ahmed.zaki@intel.com>
Signed-off-by: Ahmed Zaki <ahmed.zaki@intel.com>
Link: https://lore.kernel.org/r/20231213003321.605376-6-ahmed.zaki@intel.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 drivers/net/ethernet/intel/ice/ice_ethtool.c  |   6 +-
 drivers/net/ethernet/intel/ice/ice_flow.c     | 232 ++++++++++++++++----------
 drivers/net/ethernet/intel/ice/ice_flow.h     |  33 +++-
 drivers/net/ethernet/intel/ice/ice_lib.c      | 100 +++++------
 drivers/net/ethernet/intel/ice/ice_virtchnl.c |  36 ++--
 include/linux/avf/virtchnl.h                  |  16 +-
 6 files changed, 246 insertions(+), 177 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/ethernet/intel/ice/ice_ethtool.c b/drivers/net/ethernet/intel/ice/ice_ethtool.c
index c77605b42833..fcd8678bea66 100644
--- a/drivers/net/ethernet/intel/ice/ice_ethtool.c
+++ b/drivers/net/ethernet/intel/ice/ice_ethtool.c
@@ -2591,6 +2591,7 @@ static int
 ice_set_rss_hash_opt(struct ice_vsi *vsi, struct ethtool_rxnfc *nfc)
 {
 	struct ice_pf *pf = vsi->back;
+	struct ice_rss_hash_cfg cfg;
 	struct device *dev;
 	u64 hashed_flds;
 	int status;
@@ -2617,7 +2618,10 @@ ice_set_rss_hash_opt(struct ice_vsi *vsi, struct ethtool_rxnfc *nfc)
 		return -EINVAL;
 	}
 
-	status = ice_add_rss_cfg(&pf->hw, vsi->idx, hashed_flds, hdrs);
+	cfg.hash_flds = hashed_flds;
+	cfg.addl_hdrs = hdrs;
+	cfg.hdr_type = ICE_RSS_ANY_HEADERS;
+	status = ice_add_rss_cfg(&pf->hw, vsi->idx, &cfg);
 	if (status) {
 		dev_dbg(dev, "ice_add_rss_cfg failed, vsi num = %d, error = %d\n",
 			vsi->vsi_num, status);
diff --git a/drivers/net/ethernet/intel/ice/ice_flow.c b/drivers/net/ethernet/intel/ice/ice_flow.c
index fb8b925aaf8b..c34be72f99b3 100644
--- a/drivers/net/ethernet/intel/ice/ice_flow.c
+++ b/drivers/net/ethernet/intel/ice/ice_flow.c
@@ -1855,37 +1855,49 @@ int ice_flow_rem_vsi_prof(struct ice_hw *hw, u16 vsi_handle, u64 prof_id)
 /**
  * ice_flow_set_rss_seg_info - setup packet segments for RSS
  * @segs: pointer to the flow field segment(s)
- * @hash_fields: fields to be hashed on for the segment(s)
- * @flow_hdr: protocol header fields within a packet segment
+ * @seg_cnt: segment count
+ * @cfg: configure parameters
  *
  * Helper function to extract fields from hash bitmap and use flow
  * header value to set flow field segment for further use in flow
  * profile entry or removal.
  */
 static int
-ice_flow_set_rss_seg_info(struct ice_flow_seg_info *segs, u64 hash_fields,
-			  u32 flow_hdr)
+ice_flow_set_rss_seg_info(struct ice_flow_seg_info *segs, u8 seg_cnt,
+			  const struct ice_rss_hash_cfg *cfg)
 {
+	struct ice_flow_seg_info *seg;
 	u64 val;
-	u8 i;
+	u16 i;
+
+	/* set inner most segment */
+	seg = &segs[seg_cnt - 1];
 
-	for_each_set_bit(i, (unsigned long *)&hash_fields,
-			 ICE_FLOW_FIELD_IDX_MAX)
-		ice_flow_set_fld(segs, (enum ice_flow_field)i,
+	for_each_set_bit(i, (const unsigned long *)&cfg->hash_flds,
+			 (u16)ICE_FLOW_FIELD_IDX_MAX)
+		ice_flow_set_fld(seg, (enum ice_flow_field)i,
 				 ICE_FLOW_FLD_OFF_INVAL, ICE_FLOW_FLD_OFF_INVAL,
 				 ICE_FLOW_FLD_OFF_INVAL, false);
 
-	ICE_FLOW_SET_HDRS(segs, flow_hdr);
+	ICE_FLOW_SET_HDRS(seg, cfg->addl_hdrs);
 
-	if (segs->hdrs & ~ICE_FLOW_RSS_SEG_HDR_VAL_MASKS &
+	/* set outer most header */
+	if (cfg->hdr_type == ICE_RSS_INNER_HEADERS_W_OUTER_IPV4)
+		segs[ICE_RSS_OUTER_HEADERS].hdrs |= ICE_FLOW_SEG_HDR_IPV4 |
+						    ICE_FLOW_SEG_HDR_IPV_OTHER;
+	else if (cfg->hdr_type == ICE_RSS_INNER_HEADERS_W_OUTER_IPV6)
+		segs[ICE_RSS_OUTER_HEADERS].hdrs |= ICE_FLOW_SEG_HDR_IPV6 |
+						    ICE_FLOW_SEG_HDR_IPV_OTHER;
+
+	if (seg->hdrs & ~ICE_FLOW_RSS_SEG_HDR_VAL_MASKS &
 	    ~ICE_FLOW_RSS_HDRS_INNER_MASK & ~ICE_FLOW_SEG_HDR_IPV_OTHER)
 		return -EINVAL;
 
-	val = (u64)(segs->hdrs & ICE_FLOW_RSS_SEG_HDR_L3_MASKS);
+	val = (u64)(seg->hdrs & ICE_FLOW_RSS_SEG_HDR_L3_MASKS);
 	if (val && !is_power_of_2(val))
 		return -EIO;
 
-	val = (u64)(segs->hdrs & ICE_FLOW_RSS_SEG_HDR_L4_MASKS);
+	val = (u64)(seg->hdrs & ICE_FLOW_RSS_SEG_HDR_L4_MASKS);
 	if (val && !is_power_of_2(val))
 		return -EIO;
 
@@ -1955,6 +1967,39 @@ int ice_rem_vsi_rss_cfg(struct ice_hw *hw, u16 vsi_handle)
 	return status;
 }
 
+/**
+ * ice_get_rss_hdr_type - get a RSS profile's header type
+ * @prof: RSS flow profile
+ */
+static enum ice_rss_cfg_hdr_type
+ice_get_rss_hdr_type(struct ice_flow_prof *prof)
+{
+	if (prof->segs_cnt == ICE_FLOW_SEG_SINGLE) {
+		return ICE_RSS_OUTER_HEADERS;
+	} else if (prof->segs_cnt == ICE_FLOW_SEG_MAX) {
+		const struct ice_flow_seg_info *s;
+
+		s = &prof->segs[ICE_RSS_OUTER_HEADERS];
+		if (s->hdrs == ICE_FLOW_SEG_HDR_NONE)
+			return ICE_RSS_INNER_HEADERS;
+		if (s->hdrs & ICE_FLOW_SEG_HDR_IPV4)
+			return ICE_RSS_INNER_HEADERS_W_OUTER_IPV4;
+		if (s->hdrs & ICE_FLOW_SEG_HDR_IPV6)
+			return ICE_RSS_INNER_HEADERS_W_OUTER_IPV6;
+	}
+
+	return ICE_RSS_ANY_HEADERS;
+}
+
+static bool
+ice_rss_match_prof(struct ice_rss_cfg *r, struct ice_flow_prof *prof,
+		   enum ice_rss_cfg_hdr_type hdr_type)
+{
+	return (r->hash.hdr_type == hdr_type &&
+		r->hash.hash_flds == prof->segs[prof->segs_cnt - 1].match &&
+		r->hash.addl_hdrs == prof->segs[prof->segs_cnt - 1].hdrs);
+}
+
 /**
  * ice_rem_rss_list - remove RSS configuration from list
  * @hw: pointer to the hardware structure
@@ -1966,15 +2011,16 @@ int ice_rem_vsi_rss_cfg(struct ice_hw *hw, u16 vsi_handle)
 static void
 ice_rem_rss_list(struct ice_hw *hw, u16 vsi_handle, struct ice_flow_prof *prof)
 {
+	enum ice_rss_cfg_hdr_type hdr_type;
 	struct ice_rss_cfg *r, *tmp;
 
 	/* Search for RSS hash fields associated to the VSI that match the
 	 * hash configurations associated to the flow profile. If found
 	 * remove from the RSS entry list of the VSI context and delete entry.
 	 */
+	hdr_type = ice_get_rss_hdr_type(prof);
 	list_for_each_entry_safe(r, tmp, &hw->rss_list_head, l_entry)
-		if (r->hashed_flds == prof->segs[prof->segs_cnt - 1].match &&
-		    r->packet_hdr == prof->segs[prof->segs_cnt - 1].hdrs) {
+		if (ice_rss_match_prof(r, prof, hdr_type)) {
 			clear_bit(vsi_handle, r->vsis);
 			if (bitmap_empty(r->vsis, ICE_MAX_VSI)) {
 				list_del(&r->l_entry);
@@ -1995,11 +2041,12 @@ ice_rem_rss_list(struct ice_hw *hw, u16 vsi_handle, struct ice_flow_prof *prof)
 static int
 ice_add_rss_list(struct ice_hw *hw, u16 vsi_handle, struct ice_flow_prof *prof)
 {
+	enum ice_rss_cfg_hdr_type hdr_type;
 	struct ice_rss_cfg *r, *rss_cfg;
 
+	hdr_type = ice_get_rss_hdr_type(prof);
 	list_for_each_entry(r, &hw->rss_list_head, l_entry)
-		if (r->hashed_flds == prof->segs[prof->segs_cnt - 1].match &&
-		    r->packet_hdr == prof->segs[prof->segs_cnt - 1].hdrs) {
+		if (ice_rss_match_prof(r, prof, hdr_type)) {
 			set_bit(vsi_handle, r->vsis);
 			return 0;
 		}
@@ -2009,8 +2056,9 @@ ice_add_rss_list(struct ice_hw *hw, u16 vsi_handle, struct ice_flow_prof *prof)
 	if (!rss_cfg)
 		return -ENOMEM;
 
-	rss_cfg->hashed_flds = prof->segs[prof->segs_cnt - 1].match;
-	rss_cfg->packet_hdr = prof->segs[prof->segs_cnt - 1].hdrs;
+	rss_cfg->hash.hash_flds = prof->segs[prof->segs_cnt - 1].match;
+	rss_cfg->hash.addl_hdrs = prof->segs[prof->segs_cnt - 1].hdrs;
+	rss_cfg->hash.hdr_type = hdr_type;
 	set_bit(vsi_handle, rss_cfg->vsis);
 
 	list_add_tail(&rss_cfg->l_entry, &hw->rss_list_head);
@@ -2019,54 +2067,55 @@ ice_add_rss_list(struct ice_hw *hw, u16 vsi_handle, struct ice_flow_prof *prof)
 }
 
 #define ICE_FLOW_PROF_HASH_S	0
-#define ICE_FLOW_PROF_HASH_M	(0xFFFFFFFFULL << ICE_FLOW_PROF_HASH_S)
+#define ICE_FLOW_PROF_HASH_M	GENMASK_ULL(31, 0)
 #define ICE_FLOW_PROF_HDR_S	32
-#define ICE_FLOW_PROF_HDR_M	(0x3FFFFFFFULL << ICE_FLOW_PROF_HDR_S)
-#define ICE_FLOW_PROF_ENCAP_S	63
-#define ICE_FLOW_PROF_ENCAP_M	(BIT_ULL(ICE_FLOW_PROF_ENCAP_S))
-
-#define ICE_RSS_OUTER_HEADERS	1
-#define ICE_RSS_INNER_HEADERS	2
+#define ICE_FLOW_PROF_HDR_M	GENMASK_ULL(61, 32)
+#define ICE_FLOW_PROF_ENCAP_S	62
+#define ICE_FLOW_PROF_ENCAP_M	GENMASK_ULL(63, 62)
 
 /* Flow profile ID format:
  * [0:31] - Packet match fields
- * [32:62] - Protocol header
- * [63] - Encapsulation flag, 0 if non-tunneled, 1 if tunneled
+ * [32:61] - Protocol header
+ * [62:63] - Encapsulation flag:
+ *	     0 if non-tunneled
+ *	     1 if tunneled
+ *	     2 for tunneled with outer ipv4
+ *	     3 for tunneled with outer ipv6
  */
-#define ICE_FLOW_GEN_PROFID(hash, hdr, segs_cnt) \
-	((u64)(((u64)(hash) & ICE_FLOW_PROF_HASH_M) | \
+#define ICE_FLOW_GEN_PROFID(hash, hdr, encap)                                \
+	((u64)(((u64)(hash) & ICE_FLOW_PROF_HASH_M) |                        \
 	       (((u64)(hdr) << ICE_FLOW_PROF_HDR_S) & ICE_FLOW_PROF_HDR_M) | \
-	       ((u8)((segs_cnt) - 1) ? ICE_FLOW_PROF_ENCAP_M : 0)))
+	       (((u64)(encap) << ICE_FLOW_PROF_ENCAP_S) &                    \
+		ICE_FLOW_PROF_ENCAP_M)))
 
 /**
  * ice_add_rss_cfg_sync - add an RSS configuration
  * @hw: pointer to the hardware structure
  * @vsi_handle: software VSI handle
- * @hashed_flds: hash bit fields (ICE_FLOW_HASH_*) to configure
- * @addl_hdrs: protocol header fields
- * @segs_cnt: packet segment count
+ * @cfg: configure parameters
  *
  * Assumption: lock has already been acquired for RSS list
  */
 static int
-ice_add_rss_cfg_sync(struct ice_hw *hw, u16 vsi_handle, u64 hashed_flds,
-		     u32 addl_hdrs, u8 segs_cnt)
+ice_add_rss_cfg_sync(struct ice_hw *hw, u16 vsi_handle,
+		     const struct ice_rss_hash_cfg *cfg)
 {
 	const enum ice_block blk = ICE_BLK_RSS;
 	struct ice_flow_prof *prof = NULL;
 	struct ice_flow_seg_info *segs;
+	u8 segs_cnt;
 	int status;
 
-	if (!segs_cnt || segs_cnt > ICE_FLOW_SEG_MAX)
-		return -EINVAL;
+	segs_cnt = (cfg->hdr_type == ICE_RSS_OUTER_HEADERS) ?
+			   ICE_FLOW_SEG_SINGLE :
+			   ICE_FLOW_SEG_MAX;
 
 	segs = kcalloc(segs_cnt, sizeof(*segs), GFP_KERNEL);
 	if (!segs)
 		return -ENOMEM;
 
 	/* Construct the packet segment info from the hashed fields */
-	status = ice_flow_set_rss_seg_info(&segs[segs_cnt - 1], hashed_flds,
-					   addl_hdrs);
+	status = ice_flow_set_rss_seg_info(segs, segs_cnt, cfg);
 	if (status)
 		goto exit;
 
@@ -2120,9 +2169,9 @@ ice_add_rss_cfg_sync(struct ice_hw *hw, u16 vsi_handle, u64 hashed_flds,
 	 * segment information.
 	 */
 	status = ice_flow_add_prof(hw, blk, ICE_FLOW_RX,
-				   ICE_FLOW_GEN_PROFID(hashed_flds,
+				   ICE_FLOW_GEN_PROFID(cfg->hash_flds,
 						       segs[segs_cnt - 1].hdrs,
-						       segs_cnt),
+						       cfg->hdr_type),
 				   segs, segs_cnt, &prof);
 	if (status)
 		goto exit;
@@ -2147,29 +2196,37 @@ exit:
  * ice_add_rss_cfg - add an RSS configuration with specified hashed fields
  * @hw: pointer to the hardware structure
  * @vsi_handle: software VSI handle
- * @hashed_flds: hash bit fields (ICE_FLOW_HASH_*) to configure
- * @addl_hdrs: protocol header fields
+ * @cfg: configure parameters
  *
  * This function will generate a flow profile based on fields associated with
  * the input fields to hash on, the flow type and use the VSI number to add
  * a flow entry to the profile.
  */
 int
-ice_add_rss_cfg(struct ice_hw *hw, u16 vsi_handle, u64 hashed_flds,
-		u32 addl_hdrs)
+ice_add_rss_cfg(struct ice_hw *hw, u16 vsi_handle,
+		const struct ice_rss_hash_cfg *cfg)
 {
+	struct ice_rss_hash_cfg local_cfg;
 	int status;
 
-	if (hashed_flds == ICE_HASH_INVALID ||
-	    !ice_is_vsi_valid(hw, vsi_handle))
+	if (!ice_is_vsi_valid(hw, vsi_handle) || !cfg ||
+	    cfg->hdr_type > ICE_RSS_ANY_HEADERS ||
+	    cfg->hash_flds == ICE_HASH_INVALID)
 		return -EINVAL;
 
 	mutex_lock(&hw->rss_locks);
-	status = ice_add_rss_cfg_sync(hw, vsi_handle, hashed_flds, addl_hdrs,
-				      ICE_RSS_OUTER_HEADERS);
-	if (!status)
-		status = ice_add_rss_cfg_sync(hw, vsi_handle, hashed_flds,
-					      addl_hdrs, ICE_RSS_INNER_HEADERS);
+	local_cfg = *cfg;
+	if (cfg->hdr_type < ICE_RSS_ANY_HEADERS) {
+		status = ice_add_rss_cfg_sync(hw, vsi_handle, &local_cfg);
+	} else {
+		local_cfg.hdr_type = ICE_RSS_OUTER_HEADERS;
+		status = ice_add_rss_cfg_sync(hw, vsi_handle, &local_cfg);
+		if (!status) {
+			local_cfg.hdr_type = ICE_RSS_INNER_HEADERS;
+			status = ice_add_rss_cfg_sync(hw, vsi_handle,
+						      &local_cfg);
+		}
+	}
 	mutex_unlock(&hw->rss_locks);
 
 	return status;
@@ -2179,28 +2236,29 @@ ice_add_rss_cfg(struct ice_hw *hw, u16 vsi_handle, u64 hashed_flds,
  * ice_rem_rss_cfg_sync - remove an existing RSS configuration
  * @hw: pointer to the hardware structure
  * @vsi_handle: software VSI handle
- * @hashed_flds: Packet hash types (ICE_FLOW_HASH_*) to remove
- * @addl_hdrs: Protocol header fields within a packet segment
- * @segs_cnt: packet segment count
+ * @cfg: configure parameters
  *
  * Assumption: lock has already been acquired for RSS list
  */
 static int
-ice_rem_rss_cfg_sync(struct ice_hw *hw, u16 vsi_handle, u64 hashed_flds,
-		     u32 addl_hdrs, u8 segs_cnt)
+ice_rem_rss_cfg_sync(struct ice_hw *hw, u16 vsi_handle,
+		     const struct ice_rss_hash_cfg *cfg)
 {
 	const enum ice_block blk = ICE_BLK_RSS;
 	struct ice_flow_seg_info *segs;
 	struct ice_flow_prof *prof;
+	u8 segs_cnt;
 	int status;
 
+	segs_cnt = (cfg->hdr_type == ICE_RSS_OUTER_HEADERS) ?
+			   ICE_FLOW_SEG_SINGLE :
+			   ICE_FLOW_SEG_MAX;
 	segs = kcalloc(segs_cnt, sizeof(*segs), GFP_KERNEL);
 	if (!segs)
 		return -ENOMEM;
 
 	/* Construct the packet segment info from the hashed fields */
-	status = ice_flow_set_rss_seg_info(&segs[segs_cnt - 1], hashed_flds,
-					   addl_hdrs);
+	status = ice_flow_set_rss_seg_info(segs, segs_cnt, cfg);
 	if (status)
 		goto out;
 
@@ -2233,31 +2291,39 @@ out:
  * ice_rem_rss_cfg - remove an existing RSS config with matching hashed fields
  * @hw: pointer to the hardware structure
  * @vsi_handle: software VSI handle
- * @hashed_flds: Packet hash types (ICE_FLOW_HASH_*) to remove
- * @addl_hdrs: Protocol header fields within a packet segment
+ * @cfg: configure parameters
  *
  * This function will lookup the flow profile based on the input
  * hash field bitmap, iterate through the profile entry list of
  * that profile and find entry associated with input VSI to be
- * removed. Calls are made to underlying flow s which will APIs
+ * removed. Calls are made to underlying flow apis which will in
  * turn build or update buffers for RSS XLT1 section.
  */
-int __maybe_unused
-ice_rem_rss_cfg(struct ice_hw *hw, u16 vsi_handle, u64 hashed_flds,
-		u32 addl_hdrs)
+int
+ice_rem_rss_cfg(struct ice_hw *hw, u16 vsi_handle,
+		const struct ice_rss_hash_cfg *cfg)
 {
+	struct ice_rss_hash_cfg local_cfg;
 	int status;
 
-	if (hashed_flds == ICE_HASH_INVALID ||
-	    !ice_is_vsi_valid(hw, vsi_handle))
+	if (!ice_is_vsi_valid(hw, vsi_handle) || !cfg ||
+	    cfg->hdr_type > ICE_RSS_ANY_HEADERS ||
+	    cfg->hash_flds == ICE_HASH_INVALID)
 		return -EINVAL;
 
 	mutex_lock(&hw->rss_locks);
-	status = ice_rem_rss_cfg_sync(hw, vsi_handle, hashed_flds, addl_hdrs,
-				      ICE_RSS_OUTER_HEADERS);
-	if (!status)
-		status = ice_rem_rss_cfg_sync(hw, vsi_handle, hashed_flds,
-					      addl_hdrs, ICE_RSS_INNER_HEADERS);
+	local_cfg = *cfg;
+	if (cfg->hdr_type < ICE_RSS_ANY_HEADERS) {
+		status = ice_rem_rss_cfg_sync(hw, vsi_handle, &local_cfg);
+	} else {
+		local_cfg.hdr_type = ICE_RSS_OUTER_HEADERS;
+		status = ice_rem_rss_cfg_sync(hw, vsi_handle, &local_cfg);
+		if (!status) {
+			local_cfg.hdr_type = ICE_RSS_INNER_HEADERS;
+			status = ice_rem_rss_cfg_sync(hw, vsi_handle,
+						      &local_cfg);
+		}
+	}
 	mutex_unlock(&hw->rss_locks);
 
 	return status;
@@ -2307,6 +2373,7 @@ ice_rem_rss_cfg(struct ice_hw *hw, u16 vsi_handle, u64 hashed_flds,
  */
 int ice_add_avf_rss_cfg(struct ice_hw *hw, u16 vsi_handle, u64 avf_hash)
 {
+	struct ice_rss_hash_cfg hcfg;
 	int status = 0;
 	u64 hash_flds;
 
@@ -2379,8 +2446,10 @@ int ice_add_avf_rss_cfg(struct ice_hw *hw, u16 vsi_handle, u64 avf_hash)
 		if (rss_hash == ICE_HASH_INVALID)
 			return -EIO;
 
-		status = ice_add_rss_cfg(hw, vsi_handle, rss_hash,
-					 ICE_FLOW_SEG_HDR_NONE);
+		hcfg.addl_hdrs = ICE_FLOW_SEG_HDR_NONE;
+		hcfg.hash_flds = rss_hash;
+		hcfg.hdr_type = ICE_RSS_ANY_HEADERS;
+		status = ice_add_rss_cfg(hw, vsi_handle, &hcfg);
 		if (status)
 			break;
 	}
@@ -2404,16 +2473,7 @@ int ice_replay_rss_cfg(struct ice_hw *hw, u16 vsi_handle)
 	mutex_lock(&hw->rss_locks);
 	list_for_each_entry(r, &hw->rss_list_head, l_entry) {
 		if (test_bit(vsi_handle, r->vsis)) {
-			status = ice_add_rss_cfg_sync(hw, vsi_handle,
-						      r->hashed_flds,
-						      r->packet_hdr,
-						      ICE_RSS_OUTER_HEADERS);
-			if (status)
-				break;
-			status = ice_add_rss_cfg_sync(hw, vsi_handle,
-						      r->hashed_flds,
-						      r->packet_hdr,
-						      ICE_RSS_INNER_HEADERS);
+			status = ice_add_rss_cfg_sync(hw, vsi_handle, &r->hash);
 			if (status)
 				break;
 		}
@@ -2444,8 +2504,8 @@ u64 ice_get_rss_cfg(struct ice_hw *hw, u16 vsi_handle, u32 hdrs)
 	mutex_lock(&hw->rss_locks);
 	list_for_each_entry(r, &hw->rss_list_head, l_entry)
 		if (test_bit(vsi_handle, r->vsis) &&
-		    r->packet_hdr == hdrs) {
-			rss_hash = r->hashed_flds;
+		    r->hash.addl_hdrs == hdrs) {
+			rss_hash = r->hash.hash_flds;
 			break;
 		}
 	mutex_unlock(&hw->rss_locks);
diff --git a/drivers/net/ethernet/intel/ice/ice_flow.h b/drivers/net/ethernet/intel/ice/ice_flow.h
index 96923ef0a5a8..d1de84b78386 100644
--- a/drivers/net/ethernet/intel/ice/ice_flow.h
+++ b/drivers/net/ethernet/intel/ice/ice_flow.h
@@ -34,6 +34,8 @@
 #define ICE_HASH_TCP_IPV6	(ICE_FLOW_HASH_IPV6 | ICE_FLOW_HASH_TCP_PORT)
 #define ICE_HASH_UDP_IPV4	(ICE_FLOW_HASH_IPV4 | ICE_FLOW_HASH_UDP_PORT)
 #define ICE_HASH_UDP_IPV6	(ICE_FLOW_HASH_IPV6 | ICE_FLOW_HASH_UDP_PORT)
+#define ICE_HASH_SCTP_IPV4	(ICE_FLOW_HASH_IPV4 | ICE_FLOW_HASH_SCTP_PORT)
+#define ICE_HASH_SCTP_IPV6	(ICE_FLOW_HASH_IPV6 | ICE_FLOW_HASH_SCTP_PORT)
 
 #define ICE_FLOW_HASH_GTP_TEID \
 	(BIT_ULL(ICE_FLOW_FIELD_IDX_GTPC_TEID))
@@ -279,6 +281,23 @@ enum ice_flow_avf_hdr_field {
 	BIT_ULL(ICE_AVF_FLOW_FIELD_UNICAST_IPV6_UDP) | \
 	BIT_ULL(ICE_AVF_FLOW_FIELD_MULTICAST_IPV6_UDP))
 
+enum ice_rss_cfg_hdr_type {
+	ICE_RSS_OUTER_HEADERS, /* take outer headers as inputset. */
+	ICE_RSS_INNER_HEADERS, /* take inner headers as inputset. */
+	/* take inner headers as inputset for packet with outer ipv4. */
+	ICE_RSS_INNER_HEADERS_W_OUTER_IPV4,
+	/* take inner headers as inputset for packet with outer ipv6. */
+	ICE_RSS_INNER_HEADERS_W_OUTER_IPV6,
+	/* take outer headers first then inner headers as inputset */
+	ICE_RSS_ANY_HEADERS
+};
+
+struct ice_rss_hash_cfg {
+	u32 addl_hdrs; /* protocol header fields */
+	u64 hash_flds; /* hash bit field (ICE_FLOW_HASH_*) to configure */
+	enum ice_rss_cfg_hdr_type hdr_type; /* to specify inner or outer */
+};
+
 enum ice_flow_dir {
 	ICE_FLOW_RX		= 0x02,
 };
@@ -289,6 +308,7 @@ enum ice_flow_priority {
 	ICE_FLOW_PRIO_HIGH
 };
 
+#define ICE_FLOW_SEG_SINGLE		1
 #define ICE_FLOW_SEG_MAX		2
 #define ICE_FLOW_SEG_RAW_FLD_MAX	2
 #define ICE_FLOW_FV_EXTRACT_SZ		2
@@ -378,8 +398,7 @@ struct ice_rss_cfg {
 	struct list_head l_entry;
 	/* bitmap of VSIs added to the RSS entry */
 	DECLARE_BITMAP(vsis, ICE_MAX_VSI);
-	u64 hashed_flds;
-	u32 packet_hdr;
+	struct ice_rss_hash_cfg hash;
 };
 
 int
@@ -403,11 +422,9 @@ void ice_rem_vsi_rss_list(struct ice_hw *hw, u16 vsi_handle);
 int ice_replay_rss_cfg(struct ice_hw *hw, u16 vsi_handle);
 int ice_add_avf_rss_cfg(struct ice_hw *hw, u16 vsi_handle, u64 hashed_flds);
 int ice_rem_vsi_rss_cfg(struct ice_hw *hw, u16 vsi_handle);
-int
-ice_add_rss_cfg(struct ice_hw *hw, u16 vsi_handle, u64 hashed_flds,
-		u32 addl_hdrs);
-int
-ice_rem_rss_cfg(struct ice_hw *hw, u16 vsi_handle, u64 hashed_flds,
-		u32 addl_hdrs);
+int ice_add_rss_cfg(struct ice_hw *hw, u16 vsi_handle,
+		    const struct ice_rss_hash_cfg *cfg);
+int ice_rem_rss_cfg(struct ice_hw *hw, u16 vsi_handle,
+		    const struct ice_rss_hash_cfg *cfg);
 u64 ice_get_rss_cfg(struct ice_hw *hw, u16 vsi_handle, u32 hdrs);
 #endif /* _ICE_FLOW_H_ */
diff --git a/drivers/net/ethernet/intel/ice/ice_lib.c b/drivers/net/ethernet/intel/ice/ice_lib.c
index bb6151e798e4..b22f1b861831 100644
--- a/drivers/net/ethernet/intel/ice/ice_lib.c
+++ b/drivers/net/ethernet/intel/ice/ice_lib.c
@@ -1611,6 +1611,38 @@ static void ice_vsi_set_vf_rss_flow_fld(struct ice_vsi *vsi)
 			vsi->vsi_num, status);
 }
 
+static const struct ice_rss_hash_cfg default_rss_cfgs[] = {
+	/* configure RSS for IPv4 with input set IP src/dst */
+	{ICE_FLOW_SEG_HDR_IPV4, ICE_FLOW_HASH_IPV4, ICE_RSS_ANY_HEADERS},
+	/* configure RSS for IPv6 with input set IPv6 src/dst */
+	{ICE_FLOW_SEG_HDR_IPV6, ICE_FLOW_HASH_IPV6, ICE_RSS_ANY_HEADERS},
+	/* configure RSS for tcp4 with input set IP src/dst, TCP src/dst */
+	{ICE_FLOW_SEG_HDR_TCP | ICE_FLOW_SEG_HDR_IPV4,
+				ICE_HASH_TCP_IPV4,  ICE_RSS_ANY_HEADERS},
+	/* configure RSS for udp4 with input set IP src/dst, UDP src/dst */
+	{ICE_FLOW_SEG_HDR_UDP | ICE_FLOW_SEG_HDR_IPV4,
+				ICE_HASH_UDP_IPV4,  ICE_RSS_ANY_HEADERS},
+	/* configure RSS for sctp4 with input set IP src/dst - only support
+	 * RSS on SCTPv4 on outer headers (non-tunneled)
+	 */
+	{ICE_FLOW_SEG_HDR_SCTP | ICE_FLOW_SEG_HDR_IPV4,
+		ICE_HASH_SCTP_IPV4, ICE_RSS_OUTER_HEADERS},
+	/* configure RSS for tcp6 with input set IPv6 src/dst, TCP src/dst */
+	{ICE_FLOW_SEG_HDR_TCP | ICE_FLOW_SEG_HDR_IPV6,
+				ICE_HASH_TCP_IPV6,  ICE_RSS_ANY_HEADERS},
+	/* configure RSS for udp6 with input set IPv6 src/dst, UDP src/dst */
+	{ICE_FLOW_SEG_HDR_UDP | ICE_FLOW_SEG_HDR_IPV6,
+				ICE_HASH_UDP_IPV6,  ICE_RSS_ANY_HEADERS},
+	/* configure RSS for sctp6 with input set IPv6 src/dst - only support
+	 * RSS on SCTPv6 on outer headers (non-tunneled)
+	 */
+	{ICE_FLOW_SEG_HDR_SCTP | ICE_FLOW_SEG_HDR_IPV6,
+		ICE_HASH_SCTP_IPV6, ICE_RSS_OUTER_HEADERS},
+	/* configure RSS for IPSEC ESP SPI with input set MAC_IPV4_SPI */
+	{ICE_FLOW_SEG_HDR_ESP,
+		ICE_FLOW_HASH_ESP_SPI, ICE_RSS_OUTER_HEADERS},
+};
+
 /**
  * ice_vsi_set_rss_flow_fld - Sets RSS input set for different flows
  * @vsi: VSI to be configured
@@ -1629,6 +1661,7 @@ static void ice_vsi_set_rss_flow_fld(struct ice_vsi *vsi)
 	struct ice_hw *hw = &pf->hw;
 	struct device *dev;
 	int status;
+	u32 i;
 
 	dev = ice_pf_to_dev(pf);
 	if (ice_is_safe_mode(pf)) {
@@ -1636,67 +1669,14 @@ static void ice_vsi_set_rss_flow_fld(struct ice_vsi *vsi)
 			vsi_num);
 		return;
 	}
-	/* configure RSS for IPv4 with input set IP src/dst */
-	status = ice_add_rss_cfg(hw, vsi_handle, ICE_FLOW_HASH_IPV4,
-				 ICE_FLOW_SEG_HDR_IPV4);
-	if (status)
-		dev_dbg(dev, "ice_add_rss_cfg failed for ipv4 flow, vsi = %d, error = %d\n",
-			vsi_num, status);
-
-	/* configure RSS for IPv6 with input set IPv6 src/dst */
-	status = ice_add_rss_cfg(hw, vsi_handle, ICE_FLOW_HASH_IPV6,
-				 ICE_FLOW_SEG_HDR_IPV6);
-	if (status)
-		dev_dbg(dev, "ice_add_rss_cfg failed for ipv6 flow, vsi = %d, error = %d\n",
-			vsi_num, status);
-
-	/* configure RSS for tcp4 with input set IP src/dst, TCP src/dst */
-	status = ice_add_rss_cfg(hw, vsi_handle, ICE_HASH_TCP_IPV4,
-				 ICE_FLOW_SEG_HDR_TCP | ICE_FLOW_SEG_HDR_IPV4);
-	if (status)
-		dev_dbg(dev, "ice_add_rss_cfg failed for tcp4 flow, vsi = %d, error = %d\n",
-			vsi_num, status);
+	for (i = 0; i < ARRAY_SIZE(default_rss_cfgs); i++) {
+		const struct ice_rss_hash_cfg *cfg = &default_rss_cfgs[i];
 
-	/* configure RSS for udp4 with input set IP src/dst, UDP src/dst */
-	status = ice_add_rss_cfg(hw, vsi_handle, ICE_HASH_UDP_IPV4,
-				 ICE_FLOW_SEG_HDR_UDP | ICE_FLOW_SEG_HDR_IPV4);
-	if (status)
-		dev_dbg(dev, "ice_add_rss_cfg failed for udp4 flow, vsi = %d, error = %d\n",
-			vsi_num, status);
-
-	/* configure RSS for sctp4 with input set IP src/dst */
-	status = ice_add_rss_cfg(hw, vsi_handle, ICE_FLOW_HASH_IPV4,
-				 ICE_FLOW_SEG_HDR_SCTP | ICE_FLOW_SEG_HDR_IPV4);
-	if (status)
-		dev_dbg(dev, "ice_add_rss_cfg failed for sctp4 flow, vsi = %d, error = %d\n",
-			vsi_num, status);
-
-	/* configure RSS for tcp6 with input set IPv6 src/dst, TCP src/dst */
-	status = ice_add_rss_cfg(hw, vsi_handle, ICE_HASH_TCP_IPV6,
-				 ICE_FLOW_SEG_HDR_TCP | ICE_FLOW_SEG_HDR_IPV6);
-	if (status)
-		dev_dbg(dev, "ice_add_rss_cfg failed for tcp6 flow, vsi = %d, error = %d\n",
-			vsi_num, status);
-
-	/* configure RSS for udp6 with input set IPv6 src/dst, UDP src/dst */
-	status = ice_add_rss_cfg(hw, vsi_handle, ICE_HASH_UDP_IPV6,
-				 ICE_FLOW_SEG_HDR_UDP | ICE_FLOW_SEG_HDR_IPV6);
-	if (status)
-		dev_dbg(dev, "ice_add_rss_cfg failed for udp6 flow, vsi = %d, error = %d\n",
-			vsi_num, status);
-
-	/* configure RSS for sctp6 with input set IPv6 src/dst */
-	status = ice_add_rss_cfg(hw, vsi_handle, ICE_FLOW_HASH_IPV6,
-				 ICE_FLOW_SEG_HDR_SCTP | ICE_FLOW_SEG_HDR_IPV6);
-	if (status)
-		dev_dbg(dev, "ice_add_rss_cfg failed for sctp6 flow, vsi = %d, error = %d\n",
-			vsi_num, status);
-
-	status = ice_add_rss_cfg(hw, vsi_handle, ICE_FLOW_HASH_ESP_SPI,
-				 ICE_FLOW_SEG_HDR_ESP);
-	if (status)
-		dev_dbg(dev, "ice_add_rss_cfg failed for esp/spi flow, vsi = %d, error = %d\n",
-			vsi_num, status);
+		status = ice_add_rss_cfg(hw, vsi_handle, cfg);
+		if (status)
+			dev_dbg(dev, "ice_add_rss_cfg failed, addl_hdrs = %x, hash_flds = %llx, hdr_type = %d\n",
+				cfg->addl_hdrs, cfg->hash_flds, cfg->hdr_type);
+	}
 }
 
 /**
diff --git a/drivers/net/ethernet/intel/ice/ice_virtchnl.c b/drivers/net/ethernet/intel/ice/ice_virtchnl.c
index 8872f7a4f432..0e7b5984a7c7 100644
--- a/drivers/net/ethernet/intel/ice/ice_virtchnl.c
+++ b/drivers/net/ethernet/intel/ice/ice_virtchnl.c
@@ -689,9 +689,7 @@ out:
  * a specific virtchnl RSS cfg
  * @hw: pointer to the hardware
  * @rss_cfg: pointer to the virtchnl RSS cfg
- * @addl_hdrs: pointer to the protocol header fields (ICE_FLOW_SEG_HDR_*)
- * to configure
- * @hash_flds: pointer to the hash bit fields (ICE_FLOW_HASH_*) to configure
+ * @hash_cfg: pointer to the HW hash configuration
  *
  * Return true if all the protocol header and hash fields in the RSS cfg could
  * be parsed, else return false
@@ -699,13 +697,18 @@ out:
  * This function parses the virtchnl RSS cfg to be the intended
  * hash fields and the intended header for RSS configuration
  */
-static bool
-ice_vc_parse_rss_cfg(struct ice_hw *hw, struct virtchnl_rss_cfg *rss_cfg,
-		     u32 *addl_hdrs, u64 *hash_flds)
+static bool ice_vc_parse_rss_cfg(struct ice_hw *hw,
+				 struct virtchnl_rss_cfg *rss_cfg,
+				 struct ice_rss_hash_cfg *hash_cfg)
 {
 	const struct ice_vc_hash_field_match_type *hf_list;
 	const struct ice_vc_hdr_match_type *hdr_list;
 	int i, hf_list_len, hdr_list_len;
+	u32 *addl_hdrs = &hash_cfg->addl_hdrs;
+	u64 *hash_flds = &hash_cfg->hash_flds;
+
+	/* set outer layer RSS as default */
+	hash_cfg->hdr_type = ICE_RSS_OUTER_HEADERS;
 
 	hf_list = ice_vc_hash_field_list;
 	hf_list_len = ARRAY_SIZE(ice_vc_hash_field_list);
@@ -856,18 +859,24 @@ static int ice_vc_handle_rss_cfg(struct ice_vf *vf, u8 *msg, bool add)
 
 		kfree(ctx);
 	} else {
-		u32 addl_hdrs = ICE_FLOW_SEG_HDR_NONE;
-		u64 hash_flds = ICE_HASH_INVALID;
+		struct ice_rss_hash_cfg cfg;
+
+		/* Only check for none raw pattern case */
+		if (!ice_vc_validate_pattern(vf, &rss_cfg->proto_hdrs)) {
+			v_ret = VIRTCHNL_STATUS_ERR_PARAM;
+			goto error_param;
+		}
+		cfg.addl_hdrs = ICE_FLOW_SEG_HDR_NONE;
+		cfg.hash_flds = ICE_HASH_INVALID;
+		cfg.hdr_type = ICE_RSS_ANY_HEADERS;
 
-		if (!ice_vc_parse_rss_cfg(hw, rss_cfg, &addl_hdrs,
-					  &hash_flds)) {
+		if (!ice_vc_parse_rss_cfg(hw, rss_cfg, &cfg)) {
 			v_ret = VIRTCHNL_STATUS_ERR_PARAM;
 			goto error_param;
 		}
 
 		if (add) {
-			if (ice_add_rss_cfg(hw, vsi->idx, hash_flds,
-					    addl_hdrs)) {
+			if (ice_add_rss_cfg(hw, vsi->idx, &cfg)) {
 				v_ret = VIRTCHNL_STATUS_ERR_PARAM;
 				dev_err(dev, "ice_add_rss_cfg failed for vsi = %d, v_ret = %d\n",
 					vsi->vsi_num, v_ret);
@@ -875,8 +884,7 @@ static int ice_vc_handle_rss_cfg(struct ice_vf *vf, u8 *msg, bool add)
 		} else {
 			int status;
 
-			status = ice_rem_rss_cfg(hw, vsi->idx, hash_flds,
-						 addl_hdrs);
+			status = ice_rem_rss_cfg(hw, vsi->idx, &cfg);
 			/* We just ignore -ENOENT, because if two configurations
 			 * share the same profile remove one of them actually
 			 * removes both, since the profile is deleted.
diff --git a/include/linux/avf/virtchnl.h b/include/linux/avf/virtchnl.h
index 6b3acf15be5c..b0e060cc79ac 100644
--- a/include/linux/avf/virtchnl.h
+++ b/include/linux/avf/virtchnl.h
@@ -911,6 +911,14 @@ struct virtchnl_rss_hena {
 
 VIRTCHNL_CHECK_STRUCT_LEN(8, virtchnl_rss_hena);
 
+/* Type of RSS algorithm */
+enum virtchnl_rss_algorithm {
+	VIRTCHNL_RSS_ALG_TOEPLITZ_ASYMMETRIC	= 0,
+	VIRTCHNL_RSS_ALG_R_ASYMMETRIC		= 1,
+	VIRTCHNL_RSS_ALG_TOEPLITZ_SYMMETRIC	= 2,
+	VIRTCHNL_RSS_ALG_XOR_SYMMETRIC		= 3,
+};
+
 /* VIRTCHNL_OP_ENABLE_CHANNELS
  * VIRTCHNL_OP_DISABLE_CHANNELS
  * VF sends these messages to enable or disable channels based on
@@ -1095,14 +1103,6 @@ enum virtchnl_vfr_states {
 	VIRTCHNL_VFR_VFACTIVE,
 };
 
-/* Type of RSS algorithm */
-enum virtchnl_rss_algorithm {
-	VIRTCHNL_RSS_ALG_TOEPLITZ_ASYMMETRIC	= 0,
-	VIRTCHNL_RSS_ALG_R_ASYMMETRIC		= 1,
-	VIRTCHNL_RSS_ALG_TOEPLITZ_SYMMETRIC	= 2,
-	VIRTCHNL_RSS_ALG_XOR_SYMMETRIC		= 3,
-};
-
 #define VIRTCHNL_MAX_NUM_PROTO_HDRS	32
 #define PROTO_HDR_SHIFT			5
 #define PROTO_HDR_FIELD_START(proto_hdr_type) ((proto_hdr_type) << PROTO_HDR_SHIFT)
-- 
cgit v1.2.3


From 4a3de3fb0eb6897488dd510006abd9673f1fb34c Mon Sep 17 00:00:00 2001
From: Ahmed Zaki <ahmed.zaki@intel.com>
Date: Tue, 12 Dec 2023 17:33:21 -0700
Subject: iavf: enable symmetric-xor RSS for Toeplitz hash function

Allow the user to set the symmetric Toeplitz hash function via:

    # ethtool -X eth0 hfunc toeplitz symmetric-xor

The driver will reject any new RSS configuration if a field other than
(IP src/dst and L4 src/dst ports) is requested for hashing.

The symmetric RSS will not be supported on PFs not advertising the ADV RSS
Offload flag (ADV_RSS_SUPPORT()), for example the E700 series (i40e).

Reviewed-by: Madhu Chittim <madhu.chittim@intel.com>
Signed-off-by: Ahmed Zaki <ahmed.zaki@intel.com>
Link: https://lore.kernel.org/r/20231213003321.605376-9-ahmed.zaki@intel.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 drivers/net/ethernet/intel/iavf/iavf.h             |  5 ++-
 drivers/net/ethernet/intel/iavf/iavf_adv_rss.c     |  8 +++-
 drivers/net/ethernet/intel/iavf/iavf_adv_rss.h     |  3 +-
 drivers/net/ethernet/intel/iavf/iavf_ethtool.c     | 32 ++++++++++++--
 drivers/net/ethernet/intel/iavf/iavf_main.c        |  4 ++
 drivers/net/ethernet/intel/iavf/iavf_virtchnl.c    | 41 ++++++++++++++++++
 drivers/net/ethernet/intel/ice/ice_virtchnl.c      | 50 ++++++++++++++++++++++
 drivers/net/ethernet/intel/ice/ice_virtchnl.h      |  1 +
 .../ethernet/intel/ice/ice_virtchnl_allowlist.c    |  1 +
 include/linux/avf/virtchnl.h                       | 19 ++++++++
 10 files changed, 156 insertions(+), 8 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/ethernet/intel/iavf/iavf.h b/drivers/net/ethernet/intel/iavf/iavf.h
index e7ab89dc883a..f83fbcc72075 100644
--- a/drivers/net/ethernet/intel/iavf/iavf.h
+++ b/drivers/net/ethernet/intel/iavf/iavf.h
@@ -312,7 +312,8 @@ struct iavf_adapter {
 #define IAVF_FLAG_AQ_SET_HENA			BIT_ULL(12)
 #define IAVF_FLAG_AQ_SET_RSS_KEY		BIT_ULL(13)
 #define IAVF_FLAG_AQ_SET_RSS_LUT		BIT_ULL(14)
-#define IAVF_FLAG_AQ_CONFIGURE_PROMISC_MODE	BIT_ULL(15)
+#define IAVF_FLAG_AQ_SET_RSS_HFUNC		BIT_ULL(15)
+#define IAVF_FLAG_AQ_CONFIGURE_PROMISC_MODE	BIT_ULL(16)
 #define IAVF_FLAG_AQ_ENABLE_VLAN_STRIPPING	BIT_ULL(19)
 #define IAVF_FLAG_AQ_DISABLE_VLAN_STRIPPING	BIT_ULL(20)
 #define IAVF_FLAG_AQ_ENABLE_CHANNELS		BIT_ULL(21)
@@ -414,6 +415,7 @@ struct iavf_adapter {
 	struct iavf_vsi vsi;
 	u32 aq_wait_count;
 	/* RSS stuff */
+	enum virtchnl_rss_algorithm hfunc;
 	u64 hena;
 	u16 rss_key_size;
 	u16 rss_lut_size;
@@ -539,6 +541,7 @@ void iavf_get_hena(struct iavf_adapter *adapter);
 void iavf_set_hena(struct iavf_adapter *adapter);
 void iavf_set_rss_key(struct iavf_adapter *adapter);
 void iavf_set_rss_lut(struct iavf_adapter *adapter);
+void iavf_set_rss_hfunc(struct iavf_adapter *adapter);
 void iavf_enable_vlan_stripping(struct iavf_adapter *adapter);
 void iavf_disable_vlan_stripping(struct iavf_adapter *adapter);
 void iavf_virtchnl_completion(struct iavf_adapter *adapter,
diff --git a/drivers/net/ethernet/intel/iavf/iavf_adv_rss.c b/drivers/net/ethernet/intel/iavf/iavf_adv_rss.c
index 6edbf134b73f..a9e1da35e248 100644
--- a/drivers/net/ethernet/intel/iavf/iavf_adv_rss.c
+++ b/drivers/net/ethernet/intel/iavf/iavf_adv_rss.c
@@ -95,17 +95,21 @@ iavf_fill_adv_rss_sctp_hdr(struct virtchnl_proto_hdr *hdr, u64 hash_flds)
  * @rss_cfg: the virtchnl message to be filled with RSS configuration setting
  * @packet_hdrs: the RSS configuration protocol header types
  * @hash_flds: the RSS configuration protocol hash fields
+ * @symm: if true, symmetric hash is required
  *
  * Returns 0 if the RSS configuration virtchnl message is filled successfully
  */
 int
 iavf_fill_adv_rss_cfg_msg(struct virtchnl_rss_cfg *rss_cfg,
-			  u32 packet_hdrs, u64 hash_flds)
+			  u32 packet_hdrs, u64 hash_flds, bool symm)
 {
 	struct virtchnl_proto_hdrs *proto_hdrs = &rss_cfg->proto_hdrs;
 	struct virtchnl_proto_hdr *hdr;
 
-	rss_cfg->rss_algorithm = VIRTCHNL_RSS_ALG_TOEPLITZ_ASYMMETRIC;
+	if (symm)
+		rss_cfg->rss_algorithm = VIRTCHNL_RSS_ALG_TOEPLITZ_SYMMETRIC;
+	else
+		rss_cfg->rss_algorithm = VIRTCHNL_RSS_ALG_TOEPLITZ_ASYMMETRIC;
 
 	proto_hdrs->tunnel_level = 0;	/* always outer layer */
 
diff --git a/drivers/net/ethernet/intel/iavf/iavf_adv_rss.h b/drivers/net/ethernet/intel/iavf/iavf_adv_rss.h
index 4d3be11af7aa..e31eb2afebea 100644
--- a/drivers/net/ethernet/intel/iavf/iavf_adv_rss.h
+++ b/drivers/net/ethernet/intel/iavf/iavf_adv_rss.h
@@ -80,13 +80,14 @@ struct iavf_adv_rss {
 
 	u32 packet_hdrs;
 	u64 hash_flds;
+	bool symm;
 
 	struct virtchnl_rss_cfg cfg_msg;
 };
 
 int
 iavf_fill_adv_rss_cfg_msg(struct virtchnl_rss_cfg *rss_cfg,
-			  u32 packet_hdrs, u64 hash_flds);
+			  u32 packet_hdrs, u64 hash_flds, bool symm);
 struct iavf_adv_rss *
 iavf_find_adv_rss_cfg_by_hdrs(struct iavf_adapter *adapter, u32 packet_hdrs);
 void
diff --git a/drivers/net/ethernet/intel/iavf/iavf_ethtool.c b/drivers/net/ethernet/intel/iavf/iavf_ethtool.c
index c376a489e4c2..f147743792fb 100644
--- a/drivers/net/ethernet/intel/iavf/iavf_ethtool.c
+++ b/drivers/net/ethernet/intel/iavf/iavf_ethtool.c
@@ -1529,11 +1529,12 @@ static u32 iavf_adv_rss_parse_hdrs(struct ethtool_rxnfc *cmd)
 /**
  * iavf_adv_rss_parse_hash_flds - parses hash fields from RSS hash input
  * @cmd: ethtool rxnfc command
+ * @symm: true if Symmetric Topelitz is set
  *
  * This function parses the rxnfc command and returns intended hash fields for
  * RSS configuration
  */
-static u64 iavf_adv_rss_parse_hash_flds(struct ethtool_rxnfc *cmd)
+static u64 iavf_adv_rss_parse_hash_flds(struct ethtool_rxnfc *cmd, bool symm)
 {
 	u64 hfld = IAVF_ADV_RSS_HASH_INVALID;
 
@@ -1605,17 +1606,20 @@ iavf_set_adv_rss_hash_opt(struct iavf_adapter *adapter,
 	struct iavf_adv_rss *rss_old, *rss_new;
 	bool rss_new_add = false;
 	int count = 50, err = 0;
+	bool symm = false;
 	u64 hash_flds;
 	u32 hdrs;
 
 	if (!ADV_RSS_SUPPORT(adapter))
 		return -EOPNOTSUPP;
 
+	symm = !!(adapter->hfunc == VIRTCHNL_RSS_ALG_TOEPLITZ_SYMMETRIC);
+
 	hdrs = iavf_adv_rss_parse_hdrs(cmd);
 	if (hdrs == IAVF_ADV_RSS_FLOW_SEG_HDR_NONE)
 		return -EINVAL;
 
-	hash_flds = iavf_adv_rss_parse_hash_flds(cmd);
+	hash_flds = iavf_adv_rss_parse_hash_flds(cmd, symm);
 	if (hash_flds == IAVF_ADV_RSS_HASH_INVALID)
 		return -EINVAL;
 
@@ -1623,7 +1627,8 @@ iavf_set_adv_rss_hash_opt(struct iavf_adapter *adapter,
 	if (!rss_new)
 		return -ENOMEM;
 
-	if (iavf_fill_adv_rss_cfg_msg(&rss_new->cfg_msg, hdrs, hash_flds)) {
+	if (iavf_fill_adv_rss_cfg_msg(&rss_new->cfg_msg, hdrs, hash_flds,
+				      symm)) {
 		kfree(rss_new);
 		return -EINVAL;
 	}
@@ -1642,9 +1647,11 @@ iavf_set_adv_rss_hash_opt(struct iavf_adapter *adapter,
 	if (rss_old) {
 		if (rss_old->state != IAVF_ADV_RSS_ACTIVE) {
 			err = -EBUSY;
-		} else if (rss_old->hash_flds != hash_flds) {
+		} else if (rss_old->hash_flds != hash_flds ||
+			   rss_old->symm != symm) {
 			rss_old->state = IAVF_ADV_RSS_ADD_REQUEST;
 			rss_old->hash_flds = hash_flds;
+			rss_old->symm = symm;
 			memcpy(&rss_old->cfg_msg, &rss_new->cfg_msg,
 			       sizeof(rss_new->cfg_msg));
 		} else {
@@ -1655,6 +1662,7 @@ iavf_set_adv_rss_hash_opt(struct iavf_adapter *adapter,
 		rss_new->state = IAVF_ADV_RSS_ADD_REQUEST;
 		rss_new->packet_hdrs = hdrs;
 		rss_new->hash_flds = hash_flds;
+		rss_new->symm = symm;
 		list_add_tail(&rss_new->list, &adapter->adv_rss_list_head);
 	}
 	spin_unlock_bh(&adapter->adv_rss_lock);
@@ -1905,6 +1913,9 @@ static int iavf_get_rxfh(struct net_device *netdev,
 	u16 i;
 
 	rxfh->hfunc = ETH_RSS_HASH_TOP;
+	if (adapter->hfunc == VIRTCHNL_RSS_ALG_TOEPLITZ_SYMMETRIC)
+		rxfh->input_xfrm |= RXH_XFRM_SYM_XOR;
+
 	if (rxfh->key)
 		memcpy(rxfh->key, adapter->rss_key, adapter->rss_key_size);
 
@@ -1937,6 +1948,18 @@ static int iavf_set_rxfh(struct net_device *netdev,
 	    rxfh->hfunc != ETH_RSS_HASH_TOP)
 		return -EOPNOTSUPP;
 
+	if ((rxfh->input_xfrm & RXH_XFRM_SYM_XOR) &&
+	    adapter->hfunc != VIRTCHNL_RSS_ALG_TOEPLITZ_SYMMETRIC) {
+		if (!ADV_RSS_SUPPORT(adapter))
+			return -EOPNOTSUPP;
+		adapter->hfunc = VIRTCHNL_RSS_ALG_TOEPLITZ_SYMMETRIC;
+		adapter->aq_required |= IAVF_FLAG_AQ_SET_RSS_HFUNC;
+	} else if (!(rxfh->input_xfrm & RXH_XFRM_SYM_XOR) &&
+		    adapter->hfunc != VIRTCHNL_RSS_ALG_TOEPLITZ_ASYMMETRIC) {
+		adapter->hfunc = VIRTCHNL_RSS_ALG_TOEPLITZ_ASYMMETRIC;
+		adapter->aq_required |= IAVF_FLAG_AQ_SET_RSS_HFUNC;
+	}
+
 	if (!rxfh->key && !rxfh->indir)
 		return 0;
 
@@ -1955,6 +1978,7 @@ static int iavf_set_rxfh(struct net_device *netdev,
 static const struct ethtool_ops iavf_ethtool_ops = {
 	.supported_coalesce_params = ETHTOOL_COALESCE_USECS |
 				     ETHTOOL_COALESCE_USE_ADAPTIVE,
+	.cap_rss_sym_xor_supported = true,
 	.get_drvinfo		= iavf_get_drvinfo,
 	.get_link		= ethtool_op_get_link,
 	.get_ringparam		= iavf_get_ringparam,
diff --git a/drivers/net/ethernet/intel/iavf/iavf_main.c b/drivers/net/ethernet/intel/iavf/iavf_main.c
index 06a87030c163..0b3b33acf1bd 100644
--- a/drivers/net/ethernet/intel/iavf/iavf_main.c
+++ b/drivers/net/ethernet/intel/iavf/iavf_main.c
@@ -2166,6 +2166,10 @@ static int iavf_process_aq_command(struct iavf_adapter *adapter)
 		iavf_set_rss_lut(adapter);
 		return 0;
 	}
+	if (adapter->aq_required & IAVF_FLAG_AQ_SET_RSS_HFUNC) {
+		iavf_set_rss_hfunc(adapter);
+		return 0;
+	}
 
 	if (adapter->aq_required & IAVF_FLAG_AQ_CONFIGURE_PROMISC_MODE) {
 		iavf_set_promiscuous(adapter);
diff --git a/drivers/net/ethernet/intel/iavf/iavf_virtchnl.c b/drivers/net/ethernet/intel/iavf/iavf_virtchnl.c
index 64c4443dbef9..64a351e70a56 100644
--- a/drivers/net/ethernet/intel/iavf/iavf_virtchnl.c
+++ b/drivers/net/ethernet/intel/iavf/iavf_virtchnl.c
@@ -1141,6 +1141,34 @@ void iavf_set_rss_lut(struct iavf_adapter *adapter)
 	kfree(vrl);
 }
 
+/**
+ * iavf_set_rss_hfunc
+ * @adapter: adapter structure
+ *
+ * Request the PF to set our RSS Hash function
+ **/
+void iavf_set_rss_hfunc(struct iavf_adapter *adapter)
+{
+	struct virtchnl_rss_hfunc *vrh;
+	int len = sizeof(*vrh);
+
+	if (adapter->current_op != VIRTCHNL_OP_UNKNOWN) {
+		/* bail because we already have a command pending */
+		dev_err(&adapter->pdev->dev, "Cannot set RSS Hash function, command %d pending\n",
+			adapter->current_op);
+		return;
+	}
+	vrh = kzalloc(len, GFP_KERNEL);
+	if (!vrh)
+		return;
+	vrh->vsi_id = adapter->vsi.id;
+	vrh->rss_algorithm = adapter->hfunc;
+	adapter->current_op = VIRTCHNL_OP_CONFIG_RSS_HFUNC;
+	adapter->aq_required &= ~IAVF_FLAG_AQ_SET_RSS_HFUNC;
+	iavf_send_pf_msg(adapter, VIRTCHNL_OP_CONFIG_RSS_HFUNC, (u8 *)vrh, len);
+	kfree(vrh);
+}
+
 /**
  * iavf_enable_vlan_stripping
  * @adapter: adapter structure
@@ -2142,6 +2170,19 @@ void iavf_virtchnl_completion(struct iavf_adapter *adapter,
 			dev_warn(&adapter->pdev->dev, "Failed to add VLAN filter, error %s\n",
 				 iavf_stat_str(&adapter->hw, v_retval));
 			break;
+		case VIRTCHNL_OP_CONFIG_RSS_HFUNC:
+			dev_warn(&adapter->pdev->dev, "Failed to configure hash function, error %s\n",
+				 iavf_stat_str(&adapter->hw, v_retval));
+
+			if (adapter->hfunc ==
+					VIRTCHNL_RSS_ALG_TOEPLITZ_SYMMETRIC)
+				adapter->hfunc =
+					VIRTCHNL_RSS_ALG_TOEPLITZ_ASYMMETRIC;
+			else
+				adapter->hfunc =
+					VIRTCHNL_RSS_ALG_TOEPLITZ_SYMMETRIC;
+
+			break;
 		default:
 			dev_err(&adapter->pdev->dev, "PF returned error %d (%s) to our request %d\n",
 				v_retval, iavf_stat_str(&adapter->hw, v_retval),
diff --git a/drivers/net/ethernet/intel/ice/ice_virtchnl.c b/drivers/net/ethernet/intel/ice/ice_virtchnl.c
index 482f8e8b1951..d4ad0739b57b 100644
--- a/drivers/net/ethernet/intel/ice/ice_virtchnl.c
+++ b/drivers/net/ethernet/intel/ice/ice_virtchnl.c
@@ -999,6 +999,51 @@ error_param:
 				     NULL, 0);
 }
 
+/**
+ * ice_vc_config_rss_hfunc
+ * @vf: pointer to the VF info
+ * @msg: pointer to the msg buffer
+ *
+ * Configure the VF's RSS Hash function
+ */
+static int ice_vc_config_rss_hfunc(struct ice_vf *vf, u8 *msg)
+{
+	struct virtchnl_rss_hfunc *vrh = (struct virtchnl_rss_hfunc *)msg;
+	enum virtchnl_status_code v_ret = VIRTCHNL_STATUS_SUCCESS;
+	u8 hfunc = ICE_AQ_VSI_Q_OPT_RSS_HASH_TPLZ;
+	struct ice_vsi *vsi;
+
+	if (!test_bit(ICE_VF_STATE_ACTIVE, vf->vf_states)) {
+		v_ret = VIRTCHNL_STATUS_ERR_PARAM;
+		goto error_param;
+	}
+
+	if (!ice_vc_isvalid_vsi_id(vf, vrh->vsi_id)) {
+		v_ret = VIRTCHNL_STATUS_ERR_PARAM;
+		goto error_param;
+	}
+
+	if (!test_bit(ICE_FLAG_RSS_ENA, vf->pf->flags)) {
+		v_ret = VIRTCHNL_STATUS_ERR_PARAM;
+		goto error_param;
+	}
+
+	vsi = ice_get_vf_vsi(vf);
+	if (!vsi) {
+		v_ret = VIRTCHNL_STATUS_ERR_PARAM;
+		goto error_param;
+	}
+
+	if (vrh->rss_algorithm == VIRTCHNL_RSS_ALG_TOEPLITZ_SYMMETRIC)
+		hfunc = ICE_AQ_VSI_Q_OPT_RSS_HASH_SYM_TPLZ;
+
+	if (ice_set_rss_hfunc(vsi, hfunc))
+		v_ret = VIRTCHNL_STATUS_ERR_ADMIN_QUEUE_ERROR;
+error_param:
+	return ice_vc_send_msg_to_vf(vf, VIRTCHNL_OP_CONFIG_RSS_HFUNC, v_ret,
+				     NULL, 0);
+}
+
 /**
  * ice_vc_cfg_promiscuous_mode_msg
  * @vf: pointer to the VF info
@@ -3766,6 +3811,7 @@ static const struct ice_virtchnl_ops ice_virtchnl_dflt_ops = {
 	.cfg_irq_map_msg = ice_vc_cfg_irq_map_msg,
 	.config_rss_key = ice_vc_config_rss_key,
 	.config_rss_lut = ice_vc_config_rss_lut,
+	.config_rss_hfunc = ice_vc_config_rss_hfunc,
 	.get_stats_msg = ice_vc_get_stats_msg,
 	.cfg_promiscuous_mode_msg = ice_vc_cfg_promiscuous_mode_msg,
 	.add_vlan_msg = ice_vc_add_vlan_msg,
@@ -3895,6 +3941,7 @@ static const struct ice_virtchnl_ops ice_virtchnl_repr_ops = {
 	.cfg_irq_map_msg = ice_vc_cfg_irq_map_msg,
 	.config_rss_key = ice_vc_config_rss_key,
 	.config_rss_lut = ice_vc_config_rss_lut,
+	.config_rss_hfunc = ice_vc_config_rss_hfunc,
 	.get_stats_msg = ice_vc_get_stats_msg,
 	.cfg_promiscuous_mode_msg = ice_vc_repr_cfg_promiscuous_mode,
 	.add_vlan_msg = ice_vc_add_vlan_msg,
@@ -4077,6 +4124,9 @@ error_handler:
 	case VIRTCHNL_OP_CONFIG_RSS_LUT:
 		err = ops->config_rss_lut(vf, msg);
 		break;
+	case VIRTCHNL_OP_CONFIG_RSS_HFUNC:
+		err = ops->config_rss_hfunc(vf, msg);
+		break;
 	case VIRTCHNL_OP_GET_STATS:
 		err = ops->get_stats_msg(vf, msg);
 		break;
diff --git a/drivers/net/ethernet/intel/ice/ice_virtchnl.h b/drivers/net/ethernet/intel/ice/ice_virtchnl.h
index cd747718de73..60dfbe05980a 100644
--- a/drivers/net/ethernet/intel/ice/ice_virtchnl.h
+++ b/drivers/net/ethernet/intel/ice/ice_virtchnl.h
@@ -32,6 +32,7 @@ struct ice_virtchnl_ops {
 	int (*cfg_irq_map_msg)(struct ice_vf *vf, u8 *msg);
 	int (*config_rss_key)(struct ice_vf *vf, u8 *msg);
 	int (*config_rss_lut)(struct ice_vf *vf, u8 *msg);
+	int (*config_rss_hfunc)(struct ice_vf *vf, u8 *msg);
 	int (*get_stats_msg)(struct ice_vf *vf, u8 *msg);
 	int (*cfg_promiscuous_mode_msg)(struct ice_vf *vf, u8 *msg);
 	int (*add_vlan_msg)(struct ice_vf *vf, u8 *msg);
diff --git a/drivers/net/ethernet/intel/ice/ice_virtchnl_allowlist.c b/drivers/net/ethernet/intel/ice/ice_virtchnl_allowlist.c
index 7d547fa616fa..5e19d48a05b4 100644
--- a/drivers/net/ethernet/intel/ice/ice_virtchnl_allowlist.c
+++ b/drivers/net/ethernet/intel/ice/ice_virtchnl_allowlist.c
@@ -68,6 +68,7 @@ static const u32 vlan_v2_allowlist_opcodes[] = {
 static const u32 rss_pf_allowlist_opcodes[] = {
 	VIRTCHNL_OP_CONFIG_RSS_KEY, VIRTCHNL_OP_CONFIG_RSS_LUT,
 	VIRTCHNL_OP_GET_RSS_HENA_CAPS, VIRTCHNL_OP_SET_RSS_HENA,
+	VIRTCHNL_OP_CONFIG_RSS_HFUNC,
 };
 
 /* VIRTCHNL_VF_OFFLOAD_RX_FLEX_DESC */
diff --git a/include/linux/avf/virtchnl.h b/include/linux/avf/virtchnl.h
index b0e060cc79ac..a44d9dc7e3eb 100644
--- a/include/linux/avf/virtchnl.h
+++ b/include/linux/avf/virtchnl.h
@@ -118,6 +118,7 @@ enum virtchnl_ops {
 	VIRTCHNL_OP_GET_STATS = 15,
 	VIRTCHNL_OP_RSVD = 16,
 	VIRTCHNL_OP_EVENT = 17, /* must ALWAYS be 17 */
+	VIRTCHNL_OP_CONFIG_RSS_HFUNC = 18,
 	/* opcode 19 is reserved */
 	VIRTCHNL_OP_IWARP = 20, /* advanced opcode */
 	VIRTCHNL_OP_RDMA = VIRTCHNL_OP_IWARP,
@@ -919,6 +920,21 @@ enum virtchnl_rss_algorithm {
 	VIRTCHNL_RSS_ALG_XOR_SYMMETRIC		= 3,
 };
 
+/* VIRTCHNL_OP_CONFIG_RSS_HFUNC
+ * VF sends this message to configure the RSS hash function. Only supported
+ * if both PF and VF drivers set the VIRTCHNL_VF_OFFLOAD_RSS_PF bit during
+ * configuration negotiation.
+ * The hash function is initialized to VIRTCHNL_RSS_ALG_TOEPLITZ_ASYMMETRIC
+ * by the PF.
+ */
+struct virtchnl_rss_hfunc {
+	u16 vsi_id;
+	u16 rss_algorithm; /* enum virtchnl_rss_algorithm */
+	u32 reserved;
+};
+
+VIRTCHNL_CHECK_STRUCT_LEN(8, virtchnl_rss_hfunc);
+
 /* VIRTCHNL_OP_ENABLE_CHANNELS
  * VIRTCHNL_OP_DISABLE_CHANNELS
  * VF sends these messages to enable or disable channels based on
@@ -1542,6 +1558,9 @@ virtchnl_vc_validate_vf_msg(struct virtchnl_version_info *ver, u32 v_opcode,
 							 vrl->lut_entries);
 		}
 		break;
+	case VIRTCHNL_OP_CONFIG_RSS_HFUNC:
+		valid_len = sizeof(struct virtchnl_rss_hfunc);
+		break;
 	case VIRTCHNL_OP_GET_RSS_HENA_CAPS:
 		break;
 	case VIRTCHNL_OP_SET_RSS_HENA:
-- 
cgit v1.2.3


From 62210a26cd4f8ad52683a71c0226dfe85de1144d Mon Sep 17 00:00:00 2001
From: Manivannan Sadhasivam <manivannan.sadhasivam@linaro.org>
Date: Wed, 18 Oct 2023 17:58:12 +0530
Subject: bus: mhi: ep: Use slab allocator where applicable

Use slab allocator for allocating the memory for objects used frequently
and are of fixed size. This reduces the overheard associated with
kmalloc().

Suggested-by: Alex Elder <elder@linaro.org>
Link: https://lore.kernel.org/r/20231018122812.47261-1-manivannan.sadhasivam@linaro.org
Signed-off-by: Manivannan Sadhasivam <manivannan.sadhasivam@linaro.org>
---
 drivers/bus/mhi/ep/main.c | 66 +++++++++++++++++++++++++++++++++++------------
 include/linux/mhi_ep.h    |  3 +++
 2 files changed, 52 insertions(+), 17 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/bus/mhi/ep/main.c b/drivers/bus/mhi/ep/main.c
index e2513f5f47a6..517279600645 100644
--- a/drivers/bus/mhi/ep/main.c
+++ b/drivers/bus/mhi/ep/main.c
@@ -74,7 +74,7 @@ static int mhi_ep_send_completion_event(struct mhi_ep_cntrl *mhi_cntrl, struct m
 	struct mhi_ring_element *event;
 	int ret;
 
-	event = kzalloc(sizeof(struct mhi_ring_element), GFP_KERNEL);
+	event = kmem_cache_zalloc(mhi_cntrl->ev_ring_el_cache, GFP_KERNEL | GFP_DMA);
 	if (!event)
 		return -ENOMEM;
 
@@ -83,7 +83,7 @@ static int mhi_ep_send_completion_event(struct mhi_ep_cntrl *mhi_cntrl, struct m
 	event->dword[1] = MHI_TRE_EV_DWORD1(ring->ch_id, MHI_PKT_TYPE_TX_EVENT);
 
 	ret = mhi_ep_send_event(mhi_cntrl, ring->er_index, event, MHI_TRE_DATA_GET_BEI(tre));
-	kfree(event);
+	kmem_cache_free(mhi_cntrl->ev_ring_el_cache, event);
 
 	return ret;
 }
@@ -93,7 +93,7 @@ int mhi_ep_send_state_change_event(struct mhi_ep_cntrl *mhi_cntrl, enum mhi_stat
 	struct mhi_ring_element *event;
 	int ret;
 
-	event = kzalloc(sizeof(struct mhi_ring_element), GFP_KERNEL);
+	event = kmem_cache_zalloc(mhi_cntrl->ev_ring_el_cache, GFP_KERNEL | GFP_DMA);
 	if (!event)
 		return -ENOMEM;
 
@@ -101,7 +101,7 @@ int mhi_ep_send_state_change_event(struct mhi_ep_cntrl *mhi_cntrl, enum mhi_stat
 	event->dword[1] = MHI_SC_EV_DWORD1(MHI_PKT_TYPE_STATE_CHANGE_EVENT);
 
 	ret = mhi_ep_send_event(mhi_cntrl, 0, event, 0);
-	kfree(event);
+	kmem_cache_free(mhi_cntrl->ev_ring_el_cache, event);
 
 	return ret;
 }
@@ -111,7 +111,7 @@ int mhi_ep_send_ee_event(struct mhi_ep_cntrl *mhi_cntrl, enum mhi_ee_type exec_e
 	struct mhi_ring_element *event;
 	int ret;
 
-	event = kzalloc(sizeof(struct mhi_ring_element), GFP_KERNEL);
+	event = kmem_cache_zalloc(mhi_cntrl->ev_ring_el_cache, GFP_KERNEL | GFP_DMA);
 	if (!event)
 		return -ENOMEM;
 
@@ -119,7 +119,7 @@ int mhi_ep_send_ee_event(struct mhi_ep_cntrl *mhi_cntrl, enum mhi_ee_type exec_e
 	event->dword[1] = MHI_SC_EV_DWORD1(MHI_PKT_TYPE_EE_EVENT);
 
 	ret = mhi_ep_send_event(mhi_cntrl, 0, event, 0);
-	kfree(event);
+	kmem_cache_free(mhi_cntrl->ev_ring_el_cache, event);
 
 	return ret;
 }
@@ -130,7 +130,7 @@ static int mhi_ep_send_cmd_comp_event(struct mhi_ep_cntrl *mhi_cntrl, enum mhi_e
 	struct mhi_ring_element *event;
 	int ret;
 
-	event = kzalloc(sizeof(struct mhi_ring_element), GFP_KERNEL);
+	event = kmem_cache_zalloc(mhi_cntrl->ev_ring_el_cache, GFP_KERNEL | GFP_DMA);
 	if (!event)
 		return -ENOMEM;
 
@@ -139,7 +139,7 @@ static int mhi_ep_send_cmd_comp_event(struct mhi_ep_cntrl *mhi_cntrl, enum mhi_e
 	event->dword[1] = MHI_CC_EV_DWORD1(MHI_PKT_TYPE_CMD_COMPLETION_EVENT);
 
 	ret = mhi_ep_send_event(mhi_cntrl, 0, event, 0);
-	kfree(event);
+	kmem_cache_free(mhi_cntrl->ev_ring_el_cache, event);
 
 	return ret;
 }
@@ -451,7 +451,7 @@ static int mhi_ep_process_ch_ring(struct mhi_ep_ring *ring, struct mhi_ring_elem
 		mhi_chan->xfer_cb(mhi_chan->mhi_dev, &result);
 	} else {
 		/* UL channel */
-		result.buf_addr = kzalloc(len, GFP_KERNEL);
+		result.buf_addr = kmem_cache_zalloc(mhi_cntrl->tre_buf_cache, GFP_KERNEL | GFP_DMA);
 		if (!result.buf_addr)
 			return -ENOMEM;
 
@@ -459,7 +459,7 @@ static int mhi_ep_process_ch_ring(struct mhi_ep_ring *ring, struct mhi_ring_elem
 			ret = mhi_ep_read_channel(mhi_cntrl, ring, &result, len);
 			if (ret < 0) {
 				dev_err(&mhi_chan->mhi_dev->dev, "Failed to read channel\n");
-				kfree(result.buf_addr);
+				kmem_cache_free(mhi_cntrl->tre_buf_cache, result.buf_addr);
 				return ret;
 			}
 
@@ -471,7 +471,7 @@ static int mhi_ep_process_ch_ring(struct mhi_ep_ring *ring, struct mhi_ring_elem
 			/* Read until the ring becomes empty */
 		} while (!mhi_ep_queue_is_empty(mhi_chan->mhi_dev, DMA_TO_DEVICE));
 
-		kfree(result.buf_addr);
+		kmem_cache_free(mhi_cntrl->tre_buf_cache, result.buf_addr);
 	}
 
 	return 0;
@@ -780,14 +780,14 @@ static void mhi_ep_ch_ring_worker(struct work_struct *work)
 		if (ret) {
 			dev_err(dev, "Error updating write offset for ring\n");
 			mutex_unlock(&chan->lock);
-			kfree(itr);
+			kmem_cache_free(mhi_cntrl->ring_item_cache, itr);
 			continue;
 		}
 
 		/* Sanity check to make sure there are elements in the ring */
 		if (ring->rd_offset == ring->wr_offset) {
 			mutex_unlock(&chan->lock);
-			kfree(itr);
+			kmem_cache_free(mhi_cntrl->ring_item_cache, itr);
 			continue;
 		}
 
@@ -799,12 +799,12 @@ static void mhi_ep_ch_ring_worker(struct work_struct *work)
 			dev_err(dev, "Error processing ring for channel (%u): %d\n",
 				ring->ch_id, ret);
 			mutex_unlock(&chan->lock);
-			kfree(itr);
+			kmem_cache_free(mhi_cntrl->ring_item_cache, itr);
 			continue;
 		}
 
 		mutex_unlock(&chan->lock);
-		kfree(itr);
+		kmem_cache_free(mhi_cntrl->ring_item_cache, itr);
 	}
 }
 
@@ -860,7 +860,7 @@ static void mhi_ep_queue_channel_db(struct mhi_ep_cntrl *mhi_cntrl, unsigned lon
 		u32 ch_id = ch_idx + i;
 
 		ring = &mhi_cntrl->mhi_chan[ch_id].ring;
-		item = kzalloc(sizeof(*item), GFP_ATOMIC);
+		item = kmem_cache_zalloc(mhi_cntrl->ring_item_cache, GFP_ATOMIC);
 		if (!item)
 			return;
 
@@ -1407,6 +1407,29 @@ int mhi_ep_register_controller(struct mhi_ep_cntrl *mhi_cntrl,
 		goto err_free_ch;
 	}
 
+	mhi_cntrl->ev_ring_el_cache = kmem_cache_create("mhi_ep_event_ring_el",
+							sizeof(struct mhi_ring_element), 0,
+							SLAB_CACHE_DMA, NULL);
+	if (!mhi_cntrl->ev_ring_el_cache) {
+		ret = -ENOMEM;
+		goto err_free_cmd;
+	}
+
+	mhi_cntrl->tre_buf_cache = kmem_cache_create("mhi_ep_tre_buf", MHI_EP_DEFAULT_MTU, 0,
+						      SLAB_CACHE_DMA, NULL);
+	if (!mhi_cntrl->tre_buf_cache) {
+		ret = -ENOMEM;
+		goto err_destroy_ev_ring_el_cache;
+	}
+
+	mhi_cntrl->ring_item_cache = kmem_cache_create("mhi_ep_ring_item",
+							sizeof(struct mhi_ep_ring_item), 0,
+							0, NULL);
+	if (!mhi_cntrl->ev_ring_el_cache) {
+		ret = -ENOMEM;
+		goto err_destroy_tre_buf_cache;
+	}
+
 	INIT_WORK(&mhi_cntrl->state_work, mhi_ep_state_worker);
 	INIT_WORK(&mhi_cntrl->reset_work, mhi_ep_reset_worker);
 	INIT_WORK(&mhi_cntrl->cmd_ring_work, mhi_ep_cmd_ring_worker);
@@ -1415,7 +1438,7 @@ int mhi_ep_register_controller(struct mhi_ep_cntrl *mhi_cntrl,
 	mhi_cntrl->wq = alloc_workqueue("mhi_ep_wq", 0, 0);
 	if (!mhi_cntrl->wq) {
 		ret = -ENOMEM;
-		goto err_free_cmd;
+		goto err_destroy_ring_item_cache;
 	}
 
 	INIT_LIST_HEAD(&mhi_cntrl->st_transition_list);
@@ -1474,6 +1497,12 @@ err_ida_free:
 	ida_free(&mhi_ep_cntrl_ida, mhi_cntrl->index);
 err_destroy_wq:
 	destroy_workqueue(mhi_cntrl->wq);
+err_destroy_ring_item_cache:
+	kmem_cache_destroy(mhi_cntrl->ring_item_cache);
+err_destroy_ev_ring_el_cache:
+	kmem_cache_destroy(mhi_cntrl->ev_ring_el_cache);
+err_destroy_tre_buf_cache:
+	kmem_cache_destroy(mhi_cntrl->tre_buf_cache);
 err_free_cmd:
 	kfree(mhi_cntrl->mhi_cmd);
 err_free_ch:
@@ -1495,6 +1524,9 @@ void mhi_ep_unregister_controller(struct mhi_ep_cntrl *mhi_cntrl)
 
 	free_irq(mhi_cntrl->irq, mhi_cntrl);
 
+	kmem_cache_destroy(mhi_cntrl->tre_buf_cache);
+	kmem_cache_destroy(mhi_cntrl->ev_ring_el_cache);
+	kmem_cache_destroy(mhi_cntrl->ring_item_cache);
 	kfree(mhi_cntrl->mhi_cmd);
 	kfree(mhi_cntrl->mhi_chan);
 
diff --git a/include/linux/mhi_ep.h b/include/linux/mhi_ep.h
index f198a8ac7ee7..ce85d42b685d 100644
--- a/include/linux/mhi_ep.h
+++ b/include/linux/mhi_ep.h
@@ -128,6 +128,9 @@ struct mhi_ep_cntrl {
 	struct work_struct reset_work;
 	struct work_struct cmd_ring_work;
 	struct work_struct ch_ring_work;
+	struct kmem_cache *ring_item_cache;
+	struct kmem_cache *ev_ring_el_cache;
+	struct kmem_cache *tre_buf_cache;
 
 	void (*raise_irq)(struct mhi_ep_cntrl *mhi_cntrl, u32 vector);
 	int (*alloc_map)(struct mhi_ep_cntrl *mhi_cntrl, u64 pci_addr, phys_addr_t *phys_ptr,
-- 
cgit v1.2.3


From b08ded2ef2e98768d5ee5f71da8fe768b1f7774b Mon Sep 17 00:00:00 2001
From: Manivannan Sadhasivam <manivannan.sadhasivam@linaro.org>
Date: Thu, 17 Aug 2023 23:24:52 +0530
Subject: bus: mhi: ep: Pass mhi_ep_buf_info struct to read/write APIs

In the preparation of DMA async support, let's pass the parameters to
read_from_host() and write_to_host() APIs using mhi_ep_buf_info structure.

No functional change.

Signed-off-by: Manivannan Sadhasivam <manivannan.sadhasivam@linaro.org>
---
 drivers/bus/mhi/ep/main.c                    | 23 +++++-----
 drivers/bus/mhi/ep/ring.c                    | 41 +++++++++--------
 drivers/pci/endpoint/functions/pci-epf-mhi.c | 66 +++++++++++++++-------------
 include/linux/mhi_ep.h                       | 16 ++++++-
 4 files changed, 84 insertions(+), 62 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/bus/mhi/ep/main.c b/drivers/bus/mhi/ep/main.c
index c759183943ed..b9334d548c41 100644
--- a/drivers/bus/mhi/ep/main.c
+++ b/drivers/bus/mhi/ep/main.c
@@ -340,10 +340,9 @@ static int mhi_ep_read_channel(struct mhi_ep_cntrl *mhi_cntrl,
 	struct mhi_ep_chan *mhi_chan = &mhi_cntrl->mhi_chan[ring->ch_id];
 	struct device *dev = &mhi_cntrl->mhi_dev->dev;
 	size_t tr_len, read_offset, write_offset;
+	struct mhi_ep_buf_info buf_info = {};
 	struct mhi_ring_element *el;
 	bool tr_done = false;
-	void *write_addr;
-	u64 read_addr;
 	u32 buf_left;
 	int ret;
 
@@ -372,11 +371,13 @@ static int mhi_ep_read_channel(struct mhi_ep_cntrl *mhi_cntrl,
 
 		read_offset = mhi_chan->tre_size - mhi_chan->tre_bytes_left;
 		write_offset = len - buf_left;
-		read_addr = mhi_chan->tre_loc + read_offset;
-		write_addr = result->buf_addr + write_offset;
+
+		buf_info.host_addr = mhi_chan->tre_loc + read_offset;
+		buf_info.dev_addr = result->buf_addr + write_offset;
+		buf_info.size = tr_len;
 
 		dev_dbg(dev, "Reading %zd bytes from channel (%u)\n", tr_len, ring->ch_id);
-		ret = mhi_cntrl->read_from_host(mhi_cntrl, read_addr, write_addr, tr_len);
+		ret = mhi_cntrl->read_from_host(mhi_cntrl, &buf_info);
 		if (ret < 0) {
 			dev_err(&mhi_chan->mhi_dev->dev, "Error reading from channel\n");
 			return ret;
@@ -499,12 +500,11 @@ int mhi_ep_queue_skb(struct mhi_ep_device *mhi_dev, struct sk_buff *skb)
 	struct mhi_ep_cntrl *mhi_cntrl = mhi_dev->mhi_cntrl;
 	struct mhi_ep_chan *mhi_chan = mhi_dev->dl_chan;
 	struct device *dev = &mhi_chan->mhi_dev->dev;
+	struct mhi_ep_buf_info buf_info = {};
 	struct mhi_ring_element *el;
 	u32 buf_left, read_offset;
 	struct mhi_ep_ring *ring;
 	enum mhi_ev_ccs code;
-	void *read_addr;
-	u64 write_addr;
 	size_t tr_len;
 	u32 tre_len;
 	int ret;
@@ -533,11 +533,13 @@ int mhi_ep_queue_skb(struct mhi_ep_device *mhi_dev, struct sk_buff *skb)
 
 		tr_len = min(buf_left, tre_len);
 		read_offset = skb->len - buf_left;
-		read_addr = skb->data + read_offset;
-		write_addr = MHI_TRE_DATA_GET_PTR(el);
+
+		buf_info.dev_addr = skb->data + read_offset;
+		buf_info.host_addr = MHI_TRE_DATA_GET_PTR(el);
+		buf_info.size = tr_len;
 
 		dev_dbg(dev, "Writing %zd bytes to channel (%u)\n", tr_len, ring->ch_id);
-		ret = mhi_cntrl->write_to_host(mhi_cntrl, read_addr, write_addr, tr_len);
+		ret = mhi_cntrl->write_to_host(mhi_cntrl, &buf_info);
 		if (ret < 0) {
 			dev_err(dev, "Error writing to the channel\n");
 			goto err_exit;
@@ -1445,7 +1447,6 @@ int mhi_ep_register_controller(struct mhi_ep_cntrl *mhi_cntrl,
 		ret = -ENOMEM;
 		goto err_destroy_tre_buf_cache;
 	}
-
 	INIT_WORK(&mhi_cntrl->state_work, mhi_ep_state_worker);
 	INIT_WORK(&mhi_cntrl->reset_work, mhi_ep_reset_worker);
 	INIT_WORK(&mhi_cntrl->cmd_ring_work, mhi_ep_cmd_ring_worker);
diff --git a/drivers/bus/mhi/ep/ring.c b/drivers/bus/mhi/ep/ring.c
index a1071c13761b..7ea952860def 100644
--- a/drivers/bus/mhi/ep/ring.c
+++ b/drivers/bus/mhi/ep/ring.c
@@ -30,7 +30,8 @@ static int __mhi_ep_cache_ring(struct mhi_ep_ring *ring, size_t end)
 {
 	struct mhi_ep_cntrl *mhi_cntrl = ring->mhi_cntrl;
 	struct device *dev = &mhi_cntrl->mhi_dev->dev;
-	size_t start, copy_size;
+	struct mhi_ep_buf_info buf_info = {};
+	size_t start;
 	int ret;
 
 	/* Don't proceed in the case of event ring. This happens during mhi_ep_ring_start(). */
@@ -43,30 +44,34 @@ static int __mhi_ep_cache_ring(struct mhi_ep_ring *ring, size_t end)
 
 	start = ring->wr_offset;
 	if (start < end) {
-		copy_size = (end - start) * sizeof(struct mhi_ring_element);
-		ret = mhi_cntrl->read_from_host(mhi_cntrl, ring->rbase +
-						(start * sizeof(struct mhi_ring_element)),
-						&ring->ring_cache[start], copy_size);
+		buf_info.size = (end - start) * sizeof(struct mhi_ring_element);
+		buf_info.host_addr = ring->rbase + (start * sizeof(struct mhi_ring_element));
+		buf_info.dev_addr = &ring->ring_cache[start];
+
+		ret = mhi_cntrl->read_from_host(mhi_cntrl, &buf_info);
 		if (ret < 0)
 			return ret;
 	} else {
-		copy_size = (ring->ring_size - start) * sizeof(struct mhi_ring_element);
-		ret = mhi_cntrl->read_from_host(mhi_cntrl, ring->rbase +
-						(start * sizeof(struct mhi_ring_element)),
-						&ring->ring_cache[start], copy_size);
+		buf_info.size = (ring->ring_size - start) * sizeof(struct mhi_ring_element);
+		buf_info.host_addr = ring->rbase + (start * sizeof(struct mhi_ring_element));
+		buf_info.dev_addr = &ring->ring_cache[start];
+
+		ret = mhi_cntrl->read_from_host(mhi_cntrl, &buf_info);
 		if (ret < 0)
 			return ret;
 
 		if (end) {
-			ret = mhi_cntrl->read_from_host(mhi_cntrl, ring->rbase,
-							&ring->ring_cache[0],
-							end * sizeof(struct mhi_ring_element));
+			buf_info.host_addr = ring->rbase;
+			buf_info.dev_addr = &ring->ring_cache[0];
+			buf_info.size = end * sizeof(struct mhi_ring_element);
+
+			ret = mhi_cntrl->read_from_host(mhi_cntrl, &buf_info);
 			if (ret < 0)
 				return ret;
 		}
 	}
 
-	dev_dbg(dev, "Cached ring: start %zu end %zu size %zu\n", start, end, copy_size);
+	dev_dbg(dev, "Cached ring: start %zu end %zu size %zu\n", start, end, buf_info.size);
 
 	return 0;
 }
@@ -102,6 +107,7 @@ int mhi_ep_ring_add_element(struct mhi_ep_ring *ring, struct mhi_ring_element *e
 {
 	struct mhi_ep_cntrl *mhi_cntrl = ring->mhi_cntrl;
 	struct device *dev = &mhi_cntrl->mhi_dev->dev;
+	struct mhi_ep_buf_info buf_info = {};
 	size_t old_offset = 0;
 	u32 num_free_elem;
 	__le64 rp;
@@ -133,12 +139,11 @@ int mhi_ep_ring_add_element(struct mhi_ep_ring *ring, struct mhi_ring_element *e
 	rp = cpu_to_le64(ring->rd_offset * sizeof(*el) + ring->rbase);
 	memcpy_toio((void __iomem *) &ring->ring_ctx->generic.rp, &rp, sizeof(u64));
 
-	ret = mhi_cntrl->write_to_host(mhi_cntrl, el, ring->rbase + (old_offset * sizeof(*el)),
-				       sizeof(*el));
-	if (ret < 0)
-		return ret;
+	buf_info.host_addr = ring->rbase + (old_offset * sizeof(*el));
+	buf_info.dev_addr = el;
+	buf_info.size = sizeof(*el);
 
-	return 0;
+	return mhi_cntrl->write_to_host(mhi_cntrl, &buf_info);
 }
 
 void mhi_ep_ring_init(struct mhi_ep_ring *ring, enum mhi_ep_ring_type type, u32 id)
diff --git a/drivers/pci/endpoint/functions/pci-epf-mhi.c b/drivers/pci/endpoint/functions/pci-epf-mhi.c
index b7b9d3e21f97..ec5f4a38178b 100644
--- a/drivers/pci/endpoint/functions/pci-epf-mhi.c
+++ b/drivers/pci/endpoint/functions/pci-epf-mhi.c
@@ -209,28 +209,28 @@ static void pci_epf_mhi_raise_irq(struct mhi_ep_cntrl *mhi_cntrl, u32 vector)
 			  vector + 1);
 }
 
-static int pci_epf_mhi_iatu_read(struct mhi_ep_cntrl *mhi_cntrl, u64 from,
-				 void *to, size_t size)
+static int pci_epf_mhi_iatu_read(struct mhi_ep_cntrl *mhi_cntrl,
+				 struct mhi_ep_buf_info *buf_info)
 {
 	struct pci_epf_mhi *epf_mhi = to_epf_mhi(mhi_cntrl);
-	size_t offset = get_align_offset(epf_mhi, from);
+	size_t offset = get_align_offset(epf_mhi, buf_info->host_addr);
 	void __iomem *tre_buf;
 	phys_addr_t tre_phys;
 	int ret;
 
 	mutex_lock(&epf_mhi->lock);
 
-	ret = __pci_epf_mhi_alloc_map(mhi_cntrl, from, &tre_phys, &tre_buf,
-				      offset, size);
+	ret = __pci_epf_mhi_alloc_map(mhi_cntrl, buf_info->host_addr, &tre_phys,
+				      &tre_buf, offset, buf_info->size);
 	if (ret) {
 		mutex_unlock(&epf_mhi->lock);
 		return ret;
 	}
 
-	memcpy_fromio(to, tre_buf, size);
+	memcpy_fromio(buf_info->dev_addr, tre_buf, buf_info->size);
 
-	__pci_epf_mhi_unmap_free(mhi_cntrl, from, tre_phys, tre_buf, offset,
-				 size);
+	__pci_epf_mhi_unmap_free(mhi_cntrl, buf_info->host_addr, tre_phys,
+				 tre_buf, offset, buf_info->size);
 
 	mutex_unlock(&epf_mhi->lock);
 
@@ -238,27 +238,27 @@ static int pci_epf_mhi_iatu_read(struct mhi_ep_cntrl *mhi_cntrl, u64 from,
 }
 
 static int pci_epf_mhi_iatu_write(struct mhi_ep_cntrl *mhi_cntrl,
-				  void *from, u64 to, size_t size)
+				  struct mhi_ep_buf_info *buf_info)
 {
 	struct pci_epf_mhi *epf_mhi = to_epf_mhi(mhi_cntrl);
-	size_t offset = get_align_offset(epf_mhi, to);
+	size_t offset = get_align_offset(epf_mhi, buf_info->host_addr);
 	void __iomem *tre_buf;
 	phys_addr_t tre_phys;
 	int ret;
 
 	mutex_lock(&epf_mhi->lock);
 
-	ret = __pci_epf_mhi_alloc_map(mhi_cntrl, to, &tre_phys, &tre_buf,
-				      offset, size);
+	ret = __pci_epf_mhi_alloc_map(mhi_cntrl, buf_info->host_addr, &tre_phys,
+				      &tre_buf, offset, buf_info->size);
 	if (ret) {
 		mutex_unlock(&epf_mhi->lock);
 		return ret;
 	}
 
-	memcpy_toio(tre_buf, from, size);
+	memcpy_toio(tre_buf, buf_info->dev_addr, buf_info->size);
 
-	__pci_epf_mhi_unmap_free(mhi_cntrl, to, tre_phys, tre_buf, offset,
-				 size);
+	__pci_epf_mhi_unmap_free(mhi_cntrl, buf_info->host_addr, tre_phys,
+				 tre_buf, offset, buf_info->size);
 
 	mutex_unlock(&epf_mhi->lock);
 
@@ -270,8 +270,8 @@ static void pci_epf_mhi_dma_callback(void *param)
 	complete(param);
 }
 
-static int pci_epf_mhi_edma_read(struct mhi_ep_cntrl *mhi_cntrl, u64 from,
-				 void *to, size_t size)
+static int pci_epf_mhi_edma_read(struct mhi_ep_cntrl *mhi_cntrl,
+				 struct mhi_ep_buf_info *buf_info)
 {
 	struct pci_epf_mhi *epf_mhi = to_epf_mhi(mhi_cntrl);
 	struct device *dma_dev = epf_mhi->epf->epc->dev.parent;
@@ -284,13 +284,13 @@ static int pci_epf_mhi_edma_read(struct mhi_ep_cntrl *mhi_cntrl, u64 from,
 	dma_addr_t dst_addr;
 	int ret;
 
-	if (size < SZ_4K)
-		return pci_epf_mhi_iatu_read(mhi_cntrl, from, to, size);
+	if (buf_info->size < SZ_4K)
+		return pci_epf_mhi_iatu_read(mhi_cntrl, buf_info);
 
 	mutex_lock(&epf_mhi->lock);
 
 	config.direction = DMA_DEV_TO_MEM;
-	config.src_addr = from;
+	config.src_addr = buf_info->host_addr;
 
 	ret = dmaengine_slave_config(chan, &config);
 	if (ret) {
@@ -298,14 +298,16 @@ static int pci_epf_mhi_edma_read(struct mhi_ep_cntrl *mhi_cntrl, u64 from,
 		goto err_unlock;
 	}
 
-	dst_addr = dma_map_single(dma_dev, to, size, DMA_FROM_DEVICE);
+	dst_addr = dma_map_single(dma_dev, buf_info->dev_addr, buf_info->size,
+				  DMA_FROM_DEVICE);
 	ret = dma_mapping_error(dma_dev, dst_addr);
 	if (ret) {
 		dev_err(dev, "Failed to map remote memory\n");
 		goto err_unlock;
 	}
 
-	desc = dmaengine_prep_slave_single(chan, dst_addr, size, DMA_DEV_TO_MEM,
+	desc = dmaengine_prep_slave_single(chan, dst_addr, buf_info->size,
+					   DMA_DEV_TO_MEM,
 					   DMA_CTRL_ACK | DMA_PREP_INTERRUPT);
 	if (!desc) {
 		dev_err(dev, "Failed to prepare DMA\n");
@@ -332,15 +334,15 @@ static int pci_epf_mhi_edma_read(struct mhi_ep_cntrl *mhi_cntrl, u64 from,
 	}
 
 err_unmap:
-	dma_unmap_single(dma_dev, dst_addr, size, DMA_FROM_DEVICE);
+	dma_unmap_single(dma_dev, dst_addr, buf_info->size, DMA_FROM_DEVICE);
 err_unlock:
 	mutex_unlock(&epf_mhi->lock);
 
 	return ret;
 }
 
-static int pci_epf_mhi_edma_write(struct mhi_ep_cntrl *mhi_cntrl, void *from,
-				  u64 to, size_t size)
+static int pci_epf_mhi_edma_write(struct mhi_ep_cntrl *mhi_cntrl,
+				  struct mhi_ep_buf_info *buf_info)
 {
 	struct pci_epf_mhi *epf_mhi = to_epf_mhi(mhi_cntrl);
 	struct device *dma_dev = epf_mhi->epf->epc->dev.parent;
@@ -353,13 +355,13 @@ static int pci_epf_mhi_edma_write(struct mhi_ep_cntrl *mhi_cntrl, void *from,
 	dma_addr_t src_addr;
 	int ret;
 
-	if (size < SZ_4K)
-		return pci_epf_mhi_iatu_write(mhi_cntrl, from, to, size);
+	if (buf_info->size < SZ_4K)
+		return pci_epf_mhi_iatu_write(mhi_cntrl, buf_info);
 
 	mutex_lock(&epf_mhi->lock);
 
 	config.direction = DMA_MEM_TO_DEV;
-	config.dst_addr = to;
+	config.dst_addr = buf_info->host_addr;
 
 	ret = dmaengine_slave_config(chan, &config);
 	if (ret) {
@@ -367,14 +369,16 @@ static int pci_epf_mhi_edma_write(struct mhi_ep_cntrl *mhi_cntrl, void *from,
 		goto err_unlock;
 	}
 
-	src_addr = dma_map_single(dma_dev, from, size, DMA_TO_DEVICE);
+	src_addr = dma_map_single(dma_dev, buf_info->dev_addr, buf_info->size,
+				  DMA_TO_DEVICE);
 	ret = dma_mapping_error(dma_dev, src_addr);
 	if (ret) {
 		dev_err(dev, "Failed to map remote memory\n");
 		goto err_unlock;
 	}
 
-	desc = dmaengine_prep_slave_single(chan, src_addr, size, DMA_MEM_TO_DEV,
+	desc = dmaengine_prep_slave_single(chan, src_addr, buf_info->size,
+					   DMA_MEM_TO_DEV,
 					   DMA_CTRL_ACK | DMA_PREP_INTERRUPT);
 	if (!desc) {
 		dev_err(dev, "Failed to prepare DMA\n");
@@ -401,7 +405,7 @@ static int pci_epf_mhi_edma_write(struct mhi_ep_cntrl *mhi_cntrl, void *from,
 	}
 
 err_unmap:
-	dma_unmap_single(dma_dev, src_addr, size, DMA_FROM_DEVICE);
+	dma_unmap_single(dma_dev, src_addr, buf_info->size, DMA_FROM_DEVICE);
 err_unlock:
 	mutex_unlock(&epf_mhi->lock);
 
diff --git a/include/linux/mhi_ep.h b/include/linux/mhi_ep.h
index ce85d42b685d..96f3a133540d 100644
--- a/include/linux/mhi_ep.h
+++ b/include/linux/mhi_ep.h
@@ -49,6 +49,18 @@ struct mhi_ep_db_info {
 	u32 status;
 };
 
+/**
+ * struct mhi_ep_buf_info - MHI Endpoint transfer buffer info
+ * @dev_addr: Address of the buffer in endpoint
+ * @host_addr: Address of the bufffer in host
+ * @size: Size of the buffer
+ */
+struct mhi_ep_buf_info {
+	void *dev_addr;
+	u64 host_addr;
+	size_t size;
+};
+
 /**
  * struct mhi_ep_cntrl - MHI Endpoint controller structure
  * @cntrl_dev: Pointer to the struct device of physical bus acting as the MHI
@@ -137,8 +149,8 @@ struct mhi_ep_cntrl {
 			 void __iomem **virt, size_t size);
 	void (*unmap_free)(struct mhi_ep_cntrl *mhi_cntrl, u64 pci_addr, phys_addr_t phys,
 			   void __iomem *virt, size_t size);
-	int (*read_from_host)(struct mhi_ep_cntrl *mhi_cntrl, u64 from, void *to, size_t size);
-	int (*write_to_host)(struct mhi_ep_cntrl *mhi_cntrl, void *from, u64 to, size_t size);
+	int (*read_from_host)(struct mhi_ep_cntrl *mhi_cntrl, struct mhi_ep_buf_info *buf_info);
+	int (*write_to_host)(struct mhi_ep_cntrl *mhi_cntrl, struct mhi_ep_buf_info *buf_info);
 
 	enum mhi_state mhi_state;
 
-- 
cgit v1.2.3


From 927105244f8bc48e6841826a5644c6a961e03b5d Mon Sep 17 00:00:00 2001
From: Manivannan Sadhasivam <manivannan.sadhasivam@linaro.org>
Date: Mon, 27 Nov 2023 13:57:37 +0530
Subject: bus: mhi: ep: Rename read_from_host() and write_to_host() APIs

In the preparation for adding async API support, let's rename the existing
APIs to read_sync() and write_sync() to make it explicit that these APIs
are used for synchronous read/write.

Signed-off-by: Manivannan Sadhasivam <manivannan.sadhasivam@linaro.org>
---
 drivers/bus/mhi/ep/main.c                    | 4 ++--
 drivers/bus/mhi/ep/ring.c                    | 8 ++++----
 drivers/pci/endpoint/functions/pci-epf-mhi.c | 8 ++++----
 include/linux/mhi_ep.h                       | 8 ++++----
 4 files changed, 14 insertions(+), 14 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/bus/mhi/ep/main.c b/drivers/bus/mhi/ep/main.c
index b9334d548c41..88e189a8463a 100644
--- a/drivers/bus/mhi/ep/main.c
+++ b/drivers/bus/mhi/ep/main.c
@@ -377,7 +377,7 @@ static int mhi_ep_read_channel(struct mhi_ep_cntrl *mhi_cntrl,
 		buf_info.size = tr_len;
 
 		dev_dbg(dev, "Reading %zd bytes from channel (%u)\n", tr_len, ring->ch_id);
-		ret = mhi_cntrl->read_from_host(mhi_cntrl, &buf_info);
+		ret = mhi_cntrl->read_sync(mhi_cntrl, &buf_info);
 		if (ret < 0) {
 			dev_err(&mhi_chan->mhi_dev->dev, "Error reading from channel\n");
 			return ret;
@@ -539,7 +539,7 @@ int mhi_ep_queue_skb(struct mhi_ep_device *mhi_dev, struct sk_buff *skb)
 		buf_info.size = tr_len;
 
 		dev_dbg(dev, "Writing %zd bytes to channel (%u)\n", tr_len, ring->ch_id);
-		ret = mhi_cntrl->write_to_host(mhi_cntrl, &buf_info);
+		ret = mhi_cntrl->write_sync(mhi_cntrl, &buf_info);
 		if (ret < 0) {
 			dev_err(dev, "Error writing to the channel\n");
 			goto err_exit;
diff --git a/drivers/bus/mhi/ep/ring.c b/drivers/bus/mhi/ep/ring.c
index 7ea952860def..aeb53b2c34a8 100644
--- a/drivers/bus/mhi/ep/ring.c
+++ b/drivers/bus/mhi/ep/ring.c
@@ -48,7 +48,7 @@ static int __mhi_ep_cache_ring(struct mhi_ep_ring *ring, size_t end)
 		buf_info.host_addr = ring->rbase + (start * sizeof(struct mhi_ring_element));
 		buf_info.dev_addr = &ring->ring_cache[start];
 
-		ret = mhi_cntrl->read_from_host(mhi_cntrl, &buf_info);
+		ret = mhi_cntrl->read_sync(mhi_cntrl, &buf_info);
 		if (ret < 0)
 			return ret;
 	} else {
@@ -56,7 +56,7 @@ static int __mhi_ep_cache_ring(struct mhi_ep_ring *ring, size_t end)
 		buf_info.host_addr = ring->rbase + (start * sizeof(struct mhi_ring_element));
 		buf_info.dev_addr = &ring->ring_cache[start];
 
-		ret = mhi_cntrl->read_from_host(mhi_cntrl, &buf_info);
+		ret = mhi_cntrl->read_sync(mhi_cntrl, &buf_info);
 		if (ret < 0)
 			return ret;
 
@@ -65,7 +65,7 @@ static int __mhi_ep_cache_ring(struct mhi_ep_ring *ring, size_t end)
 			buf_info.dev_addr = &ring->ring_cache[0];
 			buf_info.size = end * sizeof(struct mhi_ring_element);
 
-			ret = mhi_cntrl->read_from_host(mhi_cntrl, &buf_info);
+			ret = mhi_cntrl->read_sync(mhi_cntrl, &buf_info);
 			if (ret < 0)
 				return ret;
 		}
@@ -143,7 +143,7 @@ int mhi_ep_ring_add_element(struct mhi_ep_ring *ring, struct mhi_ring_element *e
 	buf_info.dev_addr = el;
 	buf_info.size = sizeof(*el);
 
-	return mhi_cntrl->write_to_host(mhi_cntrl, &buf_info);
+	return mhi_cntrl->write_sync(mhi_cntrl, &buf_info);
 }
 
 void mhi_ep_ring_init(struct mhi_ep_ring *ring, enum mhi_ep_ring_type type, u32 id)
diff --git a/drivers/pci/endpoint/functions/pci-epf-mhi.c b/drivers/pci/endpoint/functions/pci-epf-mhi.c
index ec5f4a38178b..2c5b627eb5ee 100644
--- a/drivers/pci/endpoint/functions/pci-epf-mhi.c
+++ b/drivers/pci/endpoint/functions/pci-epf-mhi.c
@@ -536,11 +536,11 @@ static int pci_epf_mhi_link_up(struct pci_epf *epf)
 	mhi_cntrl->alloc_map = pci_epf_mhi_alloc_map;
 	mhi_cntrl->unmap_free = pci_epf_mhi_unmap_free;
 	if (info->flags & MHI_EPF_USE_DMA) {
-		mhi_cntrl->read_from_host = pci_epf_mhi_edma_read;
-		mhi_cntrl->write_to_host = pci_epf_mhi_edma_write;
+		mhi_cntrl->read_sync = pci_epf_mhi_edma_read;
+		mhi_cntrl->write_sync = pci_epf_mhi_edma_write;
 	} else {
-		mhi_cntrl->read_from_host = pci_epf_mhi_iatu_read;
-		mhi_cntrl->write_to_host = pci_epf_mhi_iatu_write;
+		mhi_cntrl->read_sync = pci_epf_mhi_iatu_read;
+		mhi_cntrl->write_sync = pci_epf_mhi_iatu_write;
 	}
 
 	/* Register the MHI EP controller */
diff --git a/include/linux/mhi_ep.h b/include/linux/mhi_ep.h
index 96f3a133540d..b96b543bf2f6 100644
--- a/include/linux/mhi_ep.h
+++ b/include/linux/mhi_ep.h
@@ -94,8 +94,8 @@ struct mhi_ep_buf_info {
  * @raise_irq: CB function for raising IRQ to the host
  * @alloc_map: CB function for allocating memory in endpoint for storing host context and mapping it
  * @unmap_free: CB function to unmap and free the allocated memory in endpoint for storing host context
- * @read_from_host: CB function for reading from host memory from endpoint
- * @write_to_host: CB function for writing to host memory from endpoint
+ * @read_sync: CB function for reading from host memory synchronously
+ * @write_sync: CB function for writing to host memory synchronously
  * @mhi_state: MHI Endpoint state
  * @max_chan: Maximum channels supported by the endpoint controller
  * @mru: MRU (Maximum Receive Unit) value of the endpoint controller
@@ -149,8 +149,8 @@ struct mhi_ep_cntrl {
 			 void __iomem **virt, size_t size);
 	void (*unmap_free)(struct mhi_ep_cntrl *mhi_cntrl, u64 pci_addr, phys_addr_t phys,
 			   void __iomem *virt, size_t size);
-	int (*read_from_host)(struct mhi_ep_cntrl *mhi_cntrl, struct mhi_ep_buf_info *buf_info);
-	int (*write_to_host)(struct mhi_ep_cntrl *mhi_cntrl, struct mhi_ep_buf_info *buf_info);
+	int (*read_sync)(struct mhi_ep_cntrl *mhi_cntrl, struct mhi_ep_buf_info *buf_info);
+	int (*write_sync)(struct mhi_ep_cntrl *mhi_cntrl, struct mhi_ep_buf_info *buf_info);
 
 	enum mhi_state mhi_state;
 
-- 
cgit v1.2.3


From 8b786ed8fb089e347af21d13ba5677325fcd4cd8 Mon Sep 17 00:00:00 2001
From: Manivannan Sadhasivam <manivannan.sadhasivam@linaro.org>
Date: Mon, 27 Nov 2023 15:35:50 +0530
Subject: bus: mhi: ep: Introduce async read/write callbacks

These callbacks can be implemented by the controller drivers to perform
async read/write operation that increases the throughput.

For aiding the async operation, a completion callback is also introduced.

Signed-off-by: Manivannan Sadhasivam <manivannan.sadhasivam@linaro.org>
---
 include/linux/mhi_ep.h | 9 +++++++++
 1 file changed, 9 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/mhi_ep.h b/include/linux/mhi_ep.h
index b96b543bf2f6..14c6e8d3f573 100644
--- a/include/linux/mhi_ep.h
+++ b/include/linux/mhi_ep.h
@@ -54,11 +54,16 @@ struct mhi_ep_db_info {
  * @dev_addr: Address of the buffer in endpoint
  * @host_addr: Address of the bufffer in host
  * @size: Size of the buffer
+ * @cb: Callback to be executed by controller drivers after transfer completion (async)
+ * @cb_buf: Opaque buffer to be passed to the callback
  */
 struct mhi_ep_buf_info {
 	void *dev_addr;
 	u64 host_addr;
 	size_t size;
+
+	void (*cb)(struct mhi_ep_buf_info *buf_info);
+	void *cb_buf;
 };
 
 /**
@@ -96,6 +101,8 @@ struct mhi_ep_buf_info {
  * @unmap_free: CB function to unmap and free the allocated memory in endpoint for storing host context
  * @read_sync: CB function for reading from host memory synchronously
  * @write_sync: CB function for writing to host memory synchronously
+ * @read_async: CB function for reading from host memory asynchronously
+ * @write_async: CB function for writing to host memory asynchronously
  * @mhi_state: MHI Endpoint state
  * @max_chan: Maximum channels supported by the endpoint controller
  * @mru: MRU (Maximum Receive Unit) value of the endpoint controller
@@ -151,6 +158,8 @@ struct mhi_ep_cntrl {
 			   void __iomem *virt, size_t size);
 	int (*read_sync)(struct mhi_ep_cntrl *mhi_cntrl, struct mhi_ep_buf_info *buf_info);
 	int (*write_sync)(struct mhi_ep_cntrl *mhi_cntrl, struct mhi_ep_buf_info *buf_info);
+	int (*read_async)(struct mhi_ep_cntrl *mhi_cntrl, struct mhi_ep_buf_info *buf_info);
+	int (*write_async)(struct mhi_ep_cntrl *mhi_cntrl, struct mhi_ep_buf_info *buf_info);
 
 	enum mhi_state mhi_state;
 
-- 
cgit v1.2.3


From b4c2bea8ceaa50cd42a8f73667389d801a3ecf2d Mon Sep 17 00:00:00 2001
From: Miklos Szeredi <mszeredi@redhat.com>
Date: Wed, 25 Oct 2023 16:02:03 +0200
Subject: add listmount(2) syscall

Add way to query the children of a particular mount.  This is a more
flexible way to iterate the mount tree than having to parse
/proc/self/mountinfo.

Lookup the mount by the new 64bit mount ID. If a mount needs to be
queried based on path, then statx(2) can be used to first query the
mount ID belonging to the path.

Return an array of new (64bit) mount ID's. Without privileges only
mounts are listed which are reachable from the task's root.

Folded into this patch are several later improvements. Keeping them
separate would make the history pointlessly confusing:

* Recursive listing of mounts is the default now (cf. [1]).
* Remove explicit LISTMOUNT_UNREACHABLE flag (cf. [1]) and fail if mount
  is unreachable from current root. This also makes permission checking
  consistent with statmount() (cf. [3]).
* Start listing mounts in unique mount ID order (cf. [2]) to allow
  continuing listmount() from a midpoint.
* Allow to continue listmount(). The @request_mask parameter is renamed
  and to @param to be usable by both statmount() and listmount().
  If @param is set to a mount id then listmount() will continue listing
  mounts from that id on. This allows listing mounts in multiple
  listmount invocations without having to resize the buffer. If @param
  is zero then the listing starts from the beginning (cf. [4]).
* Don't return EOVERFLOW, instead return the buffer size which allows to
  detect a full buffer as well (cf. [4]).

Signed-off-by: Miklos Szeredi <mszeredi@redhat.com>
Link: https://lore.kernel.org/r/20231025140205.3586473-6-mszeredi@redhat.com
Reviewed-by: Ian Kent <raven@themaw.net>
Link: https://lore.kernel.org/r/20231128160337.29094-2-mszeredi@redhat.com [1] (folded)
Link: https://lore.kernel.org/r/20231128160337.29094-3-mszeredi@redhat.com [2] (folded)
Link: https://lore.kernel.org/r/20231128160337.29094-4-mszeredi@redhat.com [3] (folded)
Link: https://lore.kernel.org/r/20231128160337.29094-5-mszeredi@redhat.com [4] (folded)
[Christian Brauner <brauner@kernel.org>: various smaller fixes]
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 fs/namespace.c             | 86 ++++++++++++++++++++++++++++++++++++++++++++--
 include/linux/syscalls.h   |  3 ++
 include/uapi/linux/mount.h | 14 +++++++-
 3 files changed, 100 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/fs/namespace.c b/fs/namespace.c
index 7f1618ed2aba..873185b8a84b 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -32,6 +32,7 @@
 #include <linux/fs_context.h>
 #include <linux/shmem_fs.h>
 #include <linux/mnt_idmapping.h>
+#include <linux/nospec.h>
 
 #include "pnode.h"
 #include "internal.h"
@@ -1009,7 +1010,7 @@ void mnt_change_mountpoint(struct mount *parent, struct mountpoint *mp, struct m
 
 static inline struct mount *node_to_mount(struct rb_node *node)
 {
-	return rb_entry(node, struct mount, mnt_node);
+	return node ? rb_entry(node, struct mount, mnt_node) : NULL;
 }
 
 static void mnt_add_to_ns(struct mnt_namespace *ns, struct mount *mnt)
@@ -4945,7 +4946,7 @@ static int prepare_kstatmount(struct kstatmount *ks, struct mnt_id_req *kreq,
 		return -EFAULT;
 
 	memset(ks, 0, sizeof(*ks));
-	ks->mask = kreq->request_mask;
+	ks->mask = kreq->param;
 	ks->buf = buf;
 	ks->bufsize = bufsize;
 	ks->seq.size = seq_size;
@@ -4999,6 +5000,87 @@ retry:
 	return ret;
 }
 
+static struct mount *listmnt_next(struct mount *curr)
+{
+	return node_to_mount(rb_next(&curr->mnt_node));
+}
+
+static ssize_t do_listmount(struct mount *first, struct path *orig, u64 mnt_id,
+			    u64 __user *buf, size_t bufsize,
+			    const struct path *root)
+{
+	struct mount *r;
+	ssize_t ctr;
+	int err;
+
+	/*
+	 * Don't trigger audit denials. We just want to determine what
+	 * mounts to show users.
+	 */
+	if (!is_path_reachable(real_mount(orig->mnt), orig->dentry, root) &&
+	    !ns_capable_noaudit(&init_user_ns, CAP_SYS_ADMIN))
+		return -EPERM;
+
+	err = security_sb_statfs(orig->dentry);
+	if (err)
+		return err;
+
+	for (ctr = 0, r = first; r && ctr < bufsize; r = listmnt_next(r)) {
+		if (r->mnt_id_unique == mnt_id)
+			continue;
+		if (!is_path_reachable(r, r->mnt.mnt_root, orig))
+			continue;
+		ctr = array_index_nospec(ctr, bufsize);
+		if (put_user(r->mnt_id_unique, buf + ctr))
+			return -EFAULT;
+		if (check_add_overflow(ctr, 1, &ctr))
+			return -ERANGE;
+	}
+	return ctr;
+}
+
+SYSCALL_DEFINE4(listmount, const struct mnt_id_req __user *, req,
+		u64 __user *, buf, size_t, bufsize, unsigned int, flags)
+{
+	struct mnt_namespace *ns = current->nsproxy->mnt_ns;
+	struct mnt_id_req kreq;
+	struct mount *first;
+	struct path root, orig;
+	u64 mnt_id, last_mnt_id;
+	ssize_t ret;
+
+	if (flags)
+		return -EINVAL;
+
+	if (copy_from_user(&kreq, req, sizeof(kreq)))
+		return -EFAULT;
+	mnt_id = kreq.mnt_id;
+	last_mnt_id = kreq.param;
+
+	down_read(&namespace_sem);
+	get_fs_root(current->fs, &root);
+	if (mnt_id == LSMT_ROOT) {
+		orig = root;
+	} else {
+		ret = -ENOENT;
+		orig.mnt  = lookup_mnt_in_ns(mnt_id, ns);
+		if (!orig.mnt)
+			goto err;
+		orig.dentry = orig.mnt->mnt_root;
+	}
+	if (!last_mnt_id)
+		first = node_to_mount(rb_first(&ns->mounts));
+	else
+		first = mnt_find_id_at(ns, last_mnt_id + 1);
+
+	ret = do_listmount(first, &orig, mnt_id, buf, bufsize, &root);
+err:
+	path_put(&root);
+	up_read(&namespace_sem);
+	return ret;
+}
+
+
 static void __init init_mount_tree(void)
 {
 	struct vfsmount *mnt;
diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index 530ca9adf5f1..2d6d3e76e3f7 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -412,6 +412,9 @@ asmlinkage long sys_fstatfs64(unsigned int fd, size_t sz,
 asmlinkage long sys_statmount(const struct mnt_id_req __user *req,
 			      struct statmount __user *buf, size_t bufsize,
 			      unsigned int flags);
+asmlinkage long sys_listmount(const struct mnt_id_req __user *req,
+			      u64 __user *buf, size_t bufsize,
+			      unsigned int flags);
 asmlinkage long sys_truncate(const char __user *path, long length);
 asmlinkage long sys_ftruncate(unsigned int fd, unsigned long length);
 #if BITS_PER_LONG == 32
diff --git a/include/uapi/linux/mount.h b/include/uapi/linux/mount.h
index afdf4f2f6672..dc9a0112d819 100644
--- a/include/uapi/linux/mount.h
+++ b/include/uapi/linux/mount.h
@@ -176,9 +176,16 @@ struct statmount {
 	char str[];		/* Variable size part containing strings */
 };
 
+/*
+ * Structure for passing mount ID and miscellaneous parameters to statmount(2)
+ * and listmount(2).
+ *
+ * For statmount(2) @param represents the request mask.
+ * For listmount(2) @param represents the last listed mount id (or zero).
+ */
 struct mnt_id_req {
 	__u64 mnt_id;
-	__u64 request_mask;
+	__u64 param;
 };
 
 /*
@@ -191,4 +198,9 @@ struct mnt_id_req {
 #define STATMOUNT_MNT_POINT		0x00000010U	/* Want/got mnt_point */
 #define STATMOUNT_FS_TYPE		0x00000020U	/* Want/got fs_type */
 
+/*
+ * Special @mnt_id values that can be passed to listmount
+ */
+#define LSMT_ROOT		0xffffffffffffffff	/* root mount */
+
 #endif /* _UAPI_LINUX_MOUNT_H */
-- 
cgit v1.2.3


From bf873a800ac3234eba991603a450eaa517d27022 Mon Sep 17 00:00:00 2001
From: Randy Dunlap <rdunlap@infradead.org>
Date: Tue, 12 Dec 2023 20:35:11 -0800
Subject: net: skbuff: fix spelling errors

Correct spelling as reported by codespell.

Signed-off-by: Randy Dunlap <rdunlap@infradead.org>
Reviewed-by: Simon Horman <horms@kernel.org>
Link: https://lore.kernel.org/r/20231213043511.10357-1-rdunlap@infradead.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/linux/skbuff.h | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index b370eb8d70f7..7ce38874dbd1 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -1069,7 +1069,7 @@ struct sk_buff {
 	refcount_t		users;
 
 #ifdef CONFIG_SKB_EXTENSIONS
-	/* only useable after checking ->active_extensions != 0 */
+	/* only usable after checking ->active_extensions != 0 */
 	struct skb_ext		*extensions;
 #endif
 };
@@ -3311,7 +3311,7 @@ static inline struct page *__dev_alloc_pages(gfp_t gfp_mask,
 					     unsigned int order)
 {
 	/* This piece of code contains several assumptions.
-	 * 1.  This is for device Rx, therefor a cold page is preferred.
+	 * 1.  This is for device Rx, therefore a cold page is preferred.
 	 * 2.  The expectation is the user wants a compound page.
 	 * 3.  If requesting a order 0 page it will not be compound
 	 *     due to the check to see if order has a value in prep_new_page
@@ -4247,7 +4247,7 @@ static inline bool __skb_metadata_differs(const struct sk_buff *skb_a,
 {
 	const void *a = skb_metadata_end(skb_a);
 	const void *b = skb_metadata_end(skb_b);
-	/* Using more efficient varaiant than plain call to memcmp(). */
+	/* Using more efficient variant than plain call to memcmp(). */
 #if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) && BITS_PER_LONG == 64
 	u64 diffs = 0;
 
-- 
cgit v1.2.3


From ee08acb58fe47fc3bc2c137965985cdb1df40b35 Mon Sep 17 00:00:00 2001
From: Manivannan Sadhasivam <manivannan.sadhasivam@linaro.org>
Date: Thu, 2 Nov 2023 20:33:18 +0530
Subject: bus: mhi: ep: Add support for async DMA write operation

In order to optimize the data transfer, let's use the async DMA operation
for writing (queuing) data to the host.

In the async path, the completion event for the transfer ring will only be
sent to the host when the controller driver notifies the MHI stack of the
actual transfer completion using the callback (mhi_ep_skb_completion)
supplied in "struct mhi_ep_buf_info".

Also to accommodate the async operation, the transfer ring read offset
(ring->rd_offset) is cached in the "struct mhi_ep_chan" and updated locally
to let the stack queue further ring items to the controller driver. But the
actual read offset of the transfer ring will only be updated in the
completion callback.

Signed-off-by: Manivannan Sadhasivam <manivannan.sadhasivam@linaro.org>
---
 drivers/bus/mhi/ep/internal.h |  1 +
 drivers/bus/mhi/ep/main.c     | 74 +++++++++++++++++++++++++++++++------------
 include/linux/mhi_ep.h        |  4 +++
 3 files changed, 59 insertions(+), 20 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/bus/mhi/ep/internal.h b/drivers/bus/mhi/ep/internal.h
index 8c5cf2b67951..577965f95fda 100644
--- a/drivers/bus/mhi/ep/internal.h
+++ b/drivers/bus/mhi/ep/internal.h
@@ -162,6 +162,7 @@ struct mhi_ep_chan {
 	void (*xfer_cb)(struct mhi_ep_device *mhi_dev, struct mhi_result *result);
 	enum mhi_ch_state state;
 	enum dma_data_direction dir;
+	size_t rd_offset;
 	u64 tre_loc;
 	u32 tre_size;
 	u32 tre_bytes_left;
diff --git a/drivers/bus/mhi/ep/main.c b/drivers/bus/mhi/ep/main.c
index 88e189a8463a..48ef61ccf737 100644
--- a/drivers/bus/mhi/ep/main.c
+++ b/drivers/bus/mhi/ep/main.c
@@ -199,6 +199,8 @@ static int mhi_ep_process_cmd_ring(struct mhi_ep_ring *ring, struct mhi_ring_ele
 
 				goto err_unlock;
 			}
+
+			mhi_chan->rd_offset = ch_ring->rd_offset;
 		}
 
 		/* Set channel state to RUNNING */
@@ -328,7 +330,7 @@ bool mhi_ep_queue_is_empty(struct mhi_ep_device *mhi_dev, enum dma_data_directio
 	struct mhi_ep_cntrl *mhi_cntrl = mhi_dev->mhi_cntrl;
 	struct mhi_ep_ring *ring = &mhi_cntrl->mhi_chan[mhi_chan->chan].ring;
 
-	return !!(ring->rd_offset == ring->wr_offset);
+	return !!(mhi_chan->rd_offset == ring->wr_offset);
 }
 EXPORT_SYMBOL_GPL(mhi_ep_queue_is_empty);
 
@@ -355,7 +357,7 @@ static int mhi_ep_read_channel(struct mhi_ep_cntrl *mhi_cntrl,
 			return -ENODEV;
 		}
 
-		el = &ring->ring_cache[ring->rd_offset];
+		el = &ring->ring_cache[mhi_chan->rd_offset];
 
 		/* Check if there is data pending to be read from previous read operation */
 		if (mhi_chan->tre_bytes_left) {
@@ -434,6 +436,7 @@ static int mhi_ep_read_channel(struct mhi_ep_cntrl *mhi_cntrl,
 				tr_done = true;
 			}
 
+			mhi_chan->rd_offset = (mhi_chan->rd_offset + 1) % ring->ring_size;
 			mhi_ep_ring_inc_index(ring);
 		}
 
@@ -494,6 +497,35 @@ static int mhi_ep_process_ch_ring(struct mhi_ep_ring *ring, struct mhi_ring_elem
 	return 0;
 }
 
+static void mhi_ep_skb_completion(struct mhi_ep_buf_info *buf_info)
+{
+	struct mhi_ep_device *mhi_dev = buf_info->mhi_dev;
+	struct mhi_ep_cntrl *mhi_cntrl = mhi_dev->mhi_cntrl;
+	struct mhi_ep_chan *mhi_chan = mhi_dev->dl_chan;
+	struct mhi_ep_ring *ring = &mhi_cntrl->mhi_chan[mhi_chan->chan].ring;
+	struct mhi_ring_element *el = &ring->ring_cache[ring->rd_offset];
+	struct device *dev = &mhi_dev->dev;
+	struct mhi_result result = {};
+	int ret;
+
+	if (mhi_chan->xfer_cb) {
+		result.buf_addr = buf_info->cb_buf;
+		result.dir = mhi_chan->dir;
+		result.bytes_xferd = buf_info->size;
+
+		mhi_chan->xfer_cb(mhi_dev, &result);
+	}
+
+	ret = mhi_ep_send_completion_event(mhi_cntrl, ring, el, buf_info->size,
+					   buf_info->code);
+	if (ret) {
+		dev_err(dev, "Error sending transfer completion event\n");
+		return;
+	}
+
+	mhi_ep_ring_inc_index(ring);
+}
+
 /* TODO: Handle partially formed TDs */
 int mhi_ep_queue_skb(struct mhi_ep_device *mhi_dev, struct sk_buff *skb)
 {
@@ -504,7 +536,6 @@ int mhi_ep_queue_skb(struct mhi_ep_device *mhi_dev, struct sk_buff *skb)
 	struct mhi_ring_element *el;
 	u32 buf_left, read_offset;
 	struct mhi_ep_ring *ring;
-	enum mhi_ev_ccs code;
 	size_t tr_len;
 	u32 tre_len;
 	int ret;
@@ -528,7 +559,7 @@ int mhi_ep_queue_skb(struct mhi_ep_device *mhi_dev, struct sk_buff *skb)
 			goto err_exit;
 		}
 
-		el = &ring->ring_cache[ring->rd_offset];
+		el = &ring->ring_cache[mhi_chan->rd_offset];
 		tre_len = MHI_TRE_DATA_GET_LEN(el);
 
 		tr_len = min(buf_left, tre_len);
@@ -537,33 +568,35 @@ int mhi_ep_queue_skb(struct mhi_ep_device *mhi_dev, struct sk_buff *skb)
 		buf_info.dev_addr = skb->data + read_offset;
 		buf_info.host_addr = MHI_TRE_DATA_GET_PTR(el);
 		buf_info.size = tr_len;
+		buf_info.cb = mhi_ep_skb_completion;
+		buf_info.cb_buf = skb;
+		buf_info.mhi_dev = mhi_dev;
 
-		dev_dbg(dev, "Writing %zd bytes to channel (%u)\n", tr_len, ring->ch_id);
-		ret = mhi_cntrl->write_sync(mhi_cntrl, &buf_info);
-		if (ret < 0) {
-			dev_err(dev, "Error writing to the channel\n");
-			goto err_exit;
-		}
-
-		buf_left -= tr_len;
 		/*
 		 * For all TREs queued by the host for DL channel, only the EOT flag will be set.
 		 * If the packet doesn't fit into a single TRE, send the OVERFLOW event to
 		 * the host so that the host can adjust the packet boundary to next TREs. Else send
 		 * the EOT event to the host indicating the packet boundary.
 		 */
-		if (buf_left)
-			code = MHI_EV_CC_OVERFLOW;
+		if (buf_left - tr_len)
+			buf_info.code = MHI_EV_CC_OVERFLOW;
 		else
-			code = MHI_EV_CC_EOT;
+			buf_info.code = MHI_EV_CC_EOT;
 
-		ret = mhi_ep_send_completion_event(mhi_cntrl, ring, el, tr_len, code);
-		if (ret) {
-			dev_err(dev, "Error sending transfer completion event\n");
+		dev_dbg(dev, "Writing %zd bytes to channel (%u)\n", tr_len, ring->ch_id);
+		ret = mhi_cntrl->write_async(mhi_cntrl, &buf_info);
+		if (ret < 0) {
+			dev_err(dev, "Error writing to the channel\n");
 			goto err_exit;
 		}
 
-		mhi_ep_ring_inc_index(ring);
+		buf_left -= tr_len;
+
+		/*
+		 * Update the read offset cached in mhi_chan. Actual read offset
+		 * will be updated by the completion handler.
+		 */
+		mhi_chan->rd_offset = (mhi_chan->rd_offset + 1) % ring->ring_size;
 	} while (buf_left);
 
 	mutex_unlock(&mhi_chan->lock);
@@ -803,7 +836,7 @@ static void mhi_ep_ch_ring_worker(struct work_struct *work)
 		}
 
 		/* Sanity check to make sure there are elements in the ring */
-		if (ring->rd_offset == ring->wr_offset) {
+		if (chan->rd_offset == ring->wr_offset) {
 			mutex_unlock(&chan->lock);
 			kmem_cache_free(mhi_cntrl->ring_item_cache, itr);
 			continue;
@@ -1447,6 +1480,7 @@ int mhi_ep_register_controller(struct mhi_ep_cntrl *mhi_cntrl,
 		ret = -ENOMEM;
 		goto err_destroy_tre_buf_cache;
 	}
+
 	INIT_WORK(&mhi_cntrl->state_work, mhi_ep_state_worker);
 	INIT_WORK(&mhi_cntrl->reset_work, mhi_ep_reset_worker);
 	INIT_WORK(&mhi_cntrl->cmd_ring_work, mhi_ep_cmd_ring_worker);
diff --git a/include/linux/mhi_ep.h b/include/linux/mhi_ep.h
index 14c6e8d3f573..11bf3212f782 100644
--- a/include/linux/mhi_ep.h
+++ b/include/linux/mhi_ep.h
@@ -51,16 +51,20 @@ struct mhi_ep_db_info {
 
 /**
  * struct mhi_ep_buf_info - MHI Endpoint transfer buffer info
+ * @mhi_dev: MHI device associated with this buffer
  * @dev_addr: Address of the buffer in endpoint
  * @host_addr: Address of the bufffer in host
  * @size: Size of the buffer
+ * @code: Transfer completion code
  * @cb: Callback to be executed by controller drivers after transfer completion (async)
  * @cb_buf: Opaque buffer to be passed to the callback
  */
 struct mhi_ep_buf_info {
+	struct mhi_ep_device *mhi_dev;
 	void *dev_addr;
 	u64 host_addr;
 	size_t size;
+	int code;
 
 	void (*cb)(struct mhi_ep_buf_info *buf_info);
 	void *cb_buf;
-- 
cgit v1.2.3


From 0fe1798968115488c0c02f4633032a015b1faf97 Mon Sep 17 00:00:00 2001
From: Arseniy Krasnov <avkrasnov@salutedevices.com>
Date: Thu, 14 Dec 2023 15:52:29 +0300
Subject: virtio/vsock: send credit update during setting SO_RCVLOWAT

Send credit update message when SO_RCVLOWAT is updated and it is bigger
than number of bytes in rx queue. It is needed, because 'poll()' will
wait until number of bytes in rx queue will be not smaller than
O_RCVLOWAT, so kick sender to send more data. Otherwise mutual hungup
for tx/rx is possible: sender waits for free space and receiver is
waiting data in 'poll()'.

Rename 'set_rcvlowat' callback to 'notify_set_rcvlowat' and set
'sk->sk_rcvlowat' only in one place (i.e. 'vsock_set_rcvlowat'), so the
transport doesn't need to do it.

Fixes: b89d882dc9fc ("vsock/virtio: reduce credit update messages")
Signed-off-by: Arseniy Krasnov <avkrasnov@salutedevices.com>
Reviewed-by: Stefano Garzarella <sgarzare@redhat.com>
Acked-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/vhost/vsock.c                   |  1 +
 include/linux/virtio_vsock.h            |  1 +
 include/net/af_vsock.h                  |  2 +-
 net/vmw_vsock/af_vsock.c                |  9 +++++++--
 net/vmw_vsock/hyperv_transport.c        |  4 ++--
 net/vmw_vsock/virtio_transport.c        |  1 +
 net/vmw_vsock/virtio_transport_common.c | 30 ++++++++++++++++++++++++++++++
 net/vmw_vsock/vsock_loopback.c          |  1 +
 8 files changed, 44 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/vhost/vsock.c b/drivers/vhost/vsock.c
index f75731396b7e..ec20ecff85c7 100644
--- a/drivers/vhost/vsock.c
+++ b/drivers/vhost/vsock.c
@@ -449,6 +449,7 @@ static struct virtio_transport vhost_transport = {
 		.notify_send_pre_enqueue  = virtio_transport_notify_send_pre_enqueue,
 		.notify_send_post_enqueue = virtio_transport_notify_send_post_enqueue,
 		.notify_buffer_size       = virtio_transport_notify_buffer_size,
+		.notify_set_rcvlowat      = virtio_transport_notify_set_rcvlowat,
 
 		.read_skb = virtio_transport_read_skb,
 	},
diff --git a/include/linux/virtio_vsock.h b/include/linux/virtio_vsock.h
index ebb3ce63d64d..c82089dee0c8 100644
--- a/include/linux/virtio_vsock.h
+++ b/include/linux/virtio_vsock.h
@@ -256,4 +256,5 @@ void virtio_transport_put_credit(struct virtio_vsock_sock *vvs, u32 credit);
 void virtio_transport_deliver_tap_pkt(struct sk_buff *skb);
 int virtio_transport_purge_skbs(void *vsk, struct sk_buff_head *list);
 int virtio_transport_read_skb(struct vsock_sock *vsk, skb_read_actor_t read_actor);
+int virtio_transport_notify_set_rcvlowat(struct vsock_sock *vsk, int val);
 #endif /* _LINUX_VIRTIO_VSOCK_H */
diff --git a/include/net/af_vsock.h b/include/net/af_vsock.h
index e302c0e804d0..535701efc1e5 100644
--- a/include/net/af_vsock.h
+++ b/include/net/af_vsock.h
@@ -137,7 +137,6 @@ struct vsock_transport {
 	u64 (*stream_rcvhiwat)(struct vsock_sock *);
 	bool (*stream_is_active)(struct vsock_sock *);
 	bool (*stream_allow)(u32 cid, u32 port);
-	int (*set_rcvlowat)(struct vsock_sock *vsk, int val);
 
 	/* SEQ_PACKET. */
 	ssize_t (*seqpacket_dequeue)(struct vsock_sock *vsk, struct msghdr *msg,
@@ -168,6 +167,7 @@ struct vsock_transport {
 		struct vsock_transport_send_notify_data *);
 	/* sk_lock held by the caller */
 	void (*notify_buffer_size)(struct vsock_sock *, u64 *);
+	int (*notify_set_rcvlowat)(struct vsock_sock *vsk, int val);
 
 	/* Shutdown. */
 	int (*shutdown)(struct vsock_sock *, int);
diff --git a/net/vmw_vsock/af_vsock.c b/net/vmw_vsock/af_vsock.c
index 816725af281f..54ba7316f808 100644
--- a/net/vmw_vsock/af_vsock.c
+++ b/net/vmw_vsock/af_vsock.c
@@ -2264,8 +2264,13 @@ static int vsock_set_rcvlowat(struct sock *sk, int val)
 
 	transport = vsk->transport;
 
-	if (transport && transport->set_rcvlowat)
-		return transport->set_rcvlowat(vsk, val);
+	if (transport && transport->notify_set_rcvlowat) {
+		int err;
+
+		err = transport->notify_set_rcvlowat(vsk, val);
+		if (err)
+			return err;
+	}
 
 	WRITE_ONCE(sk->sk_rcvlowat, val ? : 1);
 	return 0;
diff --git a/net/vmw_vsock/hyperv_transport.c b/net/vmw_vsock/hyperv_transport.c
index 7cb1a9d2cdb4..e2157e387217 100644
--- a/net/vmw_vsock/hyperv_transport.c
+++ b/net/vmw_vsock/hyperv_transport.c
@@ -816,7 +816,7 @@ int hvs_notify_send_post_enqueue(struct vsock_sock *vsk, ssize_t written,
 }
 
 static
-int hvs_set_rcvlowat(struct vsock_sock *vsk, int val)
+int hvs_notify_set_rcvlowat(struct vsock_sock *vsk, int val)
 {
 	return -EOPNOTSUPP;
 }
@@ -856,7 +856,7 @@ static struct vsock_transport hvs_transport = {
 	.notify_send_pre_enqueue  = hvs_notify_send_pre_enqueue,
 	.notify_send_post_enqueue = hvs_notify_send_post_enqueue,
 
-	.set_rcvlowat             = hvs_set_rcvlowat
+	.notify_set_rcvlowat      = hvs_notify_set_rcvlowat
 };
 
 static bool hvs_check_transport(struct vsock_sock *vsk)
diff --git a/net/vmw_vsock/virtio_transport.c b/net/vmw_vsock/virtio_transport.c
index af5bab1acee1..f495b9e5186b 100644
--- a/net/vmw_vsock/virtio_transport.c
+++ b/net/vmw_vsock/virtio_transport.c
@@ -537,6 +537,7 @@ static struct virtio_transport virtio_transport = {
 		.notify_send_pre_enqueue  = virtio_transport_notify_send_pre_enqueue,
 		.notify_send_post_enqueue = virtio_transport_notify_send_post_enqueue,
 		.notify_buffer_size       = virtio_transport_notify_buffer_size,
+		.notify_set_rcvlowat      = virtio_transport_notify_set_rcvlowat,
 
 		.read_skb = virtio_transport_read_skb,
 	},
diff --git a/net/vmw_vsock/virtio_transport_common.c b/net/vmw_vsock/virtio_transport_common.c
index b35306dfcebe..16ff976a86e3 100644
--- a/net/vmw_vsock/virtio_transport_common.c
+++ b/net/vmw_vsock/virtio_transport_common.c
@@ -1690,6 +1690,36 @@ int virtio_transport_read_skb(struct vsock_sock *vsk, skb_read_actor_t recv_acto
 }
 EXPORT_SYMBOL_GPL(virtio_transport_read_skb);
 
+int virtio_transport_notify_set_rcvlowat(struct vsock_sock *vsk, int val)
+{
+	struct virtio_vsock_sock *vvs = vsk->trans;
+	bool send_update;
+
+	spin_lock_bh(&vvs->rx_lock);
+
+	/* If number of available bytes is less than new SO_RCVLOWAT value,
+	 * kick sender to send more data, because sender may sleep in its
+	 * 'send()' syscall waiting for enough space at our side. Also
+	 * don't send credit update when peer already knows actual value -
+	 * such transmission will be useless.
+	 */
+	send_update = (vvs->rx_bytes < val) &&
+		      (vvs->fwd_cnt != vvs->last_fwd_cnt);
+
+	spin_unlock_bh(&vvs->rx_lock);
+
+	if (send_update) {
+		int err;
+
+		err = virtio_transport_send_credit_update(vsk);
+		if (err < 0)
+			return err;
+	}
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(virtio_transport_notify_set_rcvlowat);
+
 MODULE_LICENSE("GPL v2");
 MODULE_AUTHOR("Asias He");
 MODULE_DESCRIPTION("common code for virtio vsock");
diff --git a/net/vmw_vsock/vsock_loopback.c b/net/vmw_vsock/vsock_loopback.c
index 048640167411..6dea6119f5b2 100644
--- a/net/vmw_vsock/vsock_loopback.c
+++ b/net/vmw_vsock/vsock_loopback.c
@@ -96,6 +96,7 @@ static struct virtio_transport loopback_transport = {
 		.notify_send_pre_enqueue  = virtio_transport_notify_send_pre_enqueue,
 		.notify_send_post_enqueue = virtio_transport_notify_send_post_enqueue,
 		.notify_buffer_size       = virtio_transport_notify_buffer_size,
+		.notify_set_rcvlowat      = virtio_transport_notify_set_rcvlowat,
 
 		.read_skb = virtio_transport_read_skb,
 	},
-- 
cgit v1.2.3


From 4ad4c1f394b84f9941a10aa8aaf11102478a390b Mon Sep 17 00:00:00 2001
From: Robin Murphy <robin.murphy@arm.com>
Date: Fri, 24 Nov 2023 18:10:03 +0000
Subject: dma-mapping: don't store redundant offsets

A bus_dma_region necessarily stores both CPU and DMA base addresses for
a range, so there's no need to also store the difference between them.

Signed-off-by: Robin Murphy <robin.murphy@arm.com>
Acked-by: Rob Herring <robh@kernel.org>
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 drivers/acpi/scan.c        |  1 -
 drivers/of/address.c       |  1 -
 include/linux/dma-direct.h | 19 ++++++++++++-------
 kernel/dma/direct.c        |  1 -
 4 files changed, 12 insertions(+), 10 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/acpi/scan.c b/drivers/acpi/scan.c
index 02bb2cce423f..ee88a727f200 100644
--- a/drivers/acpi/scan.c
+++ b/drivers/acpi/scan.c
@@ -1532,7 +1532,6 @@ int acpi_dma_get_range(struct device *dev, const struct bus_dma_region **map)
 			r->cpu_start = rentry->res->start;
 			r->dma_start = rentry->res->start - rentry->offset;
 			r->size = resource_size(rentry->res);
-			r->offset = rentry->offset;
 			r++;
 		}
 	}
diff --git a/drivers/of/address.c b/drivers/of/address.c
index b59956310f66..ae46a3605904 100644
--- a/drivers/of/address.c
+++ b/drivers/of/address.c
@@ -955,7 +955,6 @@ int of_dma_get_range(struct device_node *np, const struct bus_dma_region **map)
 		r->cpu_start = range.cpu_addr;
 		r->dma_start = range.bus_addr;
 		r->size = range.size;
-		r->offset = range.cpu_addr - range.bus_addr;
 		r++;
 	}
 out:
diff --git a/include/linux/dma-direct.h b/include/linux/dma-direct.h
index 18aade195884..3eb3589ff43e 100644
--- a/include/linux/dma-direct.h
+++ b/include/linux/dma-direct.h
@@ -21,7 +21,6 @@ struct bus_dma_region {
 	phys_addr_t	cpu_start;
 	dma_addr_t	dma_start;
 	u64		size;
-	u64		offset;
 };
 
 static inline dma_addr_t translate_phys_to_dma(struct device *dev,
@@ -29,9 +28,12 @@ static inline dma_addr_t translate_phys_to_dma(struct device *dev,
 {
 	const struct bus_dma_region *m;
 
-	for (m = dev->dma_range_map; m->size; m++)
-		if (paddr >= m->cpu_start && paddr - m->cpu_start < m->size)
-			return (dma_addr_t)paddr - m->offset;
+	for (m = dev->dma_range_map; m->size; m++) {
+		u64 offset = paddr - m->cpu_start;
+
+		if (paddr >= m->cpu_start && offset < m->size)
+			return m->dma_start + offset;
+	}
 
 	/* make sure dma_capable fails when no translation is available */
 	return DMA_MAPPING_ERROR;
@@ -42,9 +44,12 @@ static inline phys_addr_t translate_dma_to_phys(struct device *dev,
 {
 	const struct bus_dma_region *m;
 
-	for (m = dev->dma_range_map; m->size; m++)
-		if (dma_addr >= m->dma_start && dma_addr - m->dma_start < m->size)
-			return (phys_addr_t)dma_addr + m->offset;
+	for (m = dev->dma_range_map; m->size; m++) {
+		u64 offset = dma_addr - m->dma_start;
+
+		if (dma_addr >= m->dma_start && offset < m->size)
+			return m->cpu_start + offset;
+	}
 
 	return (phys_addr_t)-1;
 }
diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c
index 73c95815789a..98b2e192fd69 100644
--- a/kernel/dma/direct.c
+++ b/kernel/dma/direct.c
@@ -677,7 +677,6 @@ int dma_direct_set_offset(struct device *dev, phys_addr_t cpu_start,
 		return -ENOMEM;
 	map[0].cpu_start = cpu_start;
 	map[0].dma_start = dma_start;
-	map[0].offset = offset;
 	map[0].size = size;
 	dev->dma_range_map = map;
 	return 0;
-- 
cgit v1.2.3


From 134c6eaa6087d78c0e289931ca15ae7a5007670d Mon Sep 17 00:00:00 2001
From: Dan Williams <dan.j.williams@intel.com>
Date: Wed, 13 Dec 2023 15:02:35 -0800
Subject: driver core: Add a guard() definition for the device_lock()

At present there are ~200 usages of device_lock() in the kernel. Some of
those usages lead to "goto unlock;" patterns which have proven to be
error prone. Define a "device" guard() definition to allow for those to
be cleaned up and prevent new ones from appearing.

Link: http://lore.kernel.org/r/657897453dda8_269bd29492@dwillia2-mobl3.amr.corp.intel.com.notmuch
Link: http://lore.kernel.org/r/6577b0c2a02df_a04c5294bb@dwillia2-xfh.jf.intel.com.notmuch
Cc: Vishal Verma <vishal.l.verma@intel.com>
Cc: Ira Weiny <ira.weiny@intel.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
Reviewed-by: Ira Weiny <ira.weiny@intel.com>
Reviewed-by: Dave Jiang <dave.jiang@intel.com>
Reviewed-by: Vishal Verma <vishal.l.verma@intel.com>
Link: https://lore.kernel.org/r/170250854466.1522182.17555361077409628655.stgit@dwillia2-xfh.jf.intel.com
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 include/linux/device.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/device.h b/include/linux/device.h
index d7a72a8749ea..6c83294395ac 100644
--- a/include/linux/device.h
+++ b/include/linux/device.h
@@ -1007,6 +1007,8 @@ static inline void device_unlock(struct device *dev)
 	mutex_unlock(&dev->mutex);
 }
 
+DEFINE_GUARD(device, struct device *, device_lock(_T), device_unlock(_T))
+
 static inline void device_lock_assert(struct device *dev)
 {
 	lockdep_assert_held(&dev->mutex);
-- 
cgit v1.2.3


From 7f38b70042fcaa49219045bd1a9a2836e27a58ac Mon Sep 17 00:00:00 2001
From: Miquel Raynal <miquel.raynal@bootlin.com>
Date: Fri, 15 Dec 2023 11:15:27 +0000
Subject: of: device: Export of_device_make_bus_id()

This helper is really handy to create unique device names based on their
device tree path, we may need it outside of the OF core (in the NVMEM
subsystem) so let's export it. As this helper has nothing patform
specific, let's move it to of/device.c instead of of/platform.c so we
can add its prototype to of_device.h.

Signed-off-by: Miquel Raynal <miquel.raynal@bootlin.com>
Acked-by: Rob Herring <robh@kernel.org>
Signed-off-by: Srinivas Kandagatla <srinivas.kandagatla@linaro.org>
Link: https://lore.kernel.org/r/20231215111536.316972-2-srinivas.kandagatla@linaro.org
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/of/device.c       | 41 +++++++++++++++++++++++++++++++++++++++++
 drivers/of/platform.c     | 40 ----------------------------------------
 include/linux/of_device.h |  6 ++++++
 3 files changed, 47 insertions(+), 40 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/of/device.c b/drivers/of/device.c
index 1ca42ad9dd15..6e9572c4af83 100644
--- a/drivers/of/device.c
+++ b/drivers/of/device.c
@@ -304,3 +304,44 @@ int of_device_uevent_modalias(const struct device *dev, struct kobj_uevent_env *
 	return 0;
 }
 EXPORT_SYMBOL_GPL(of_device_uevent_modalias);
+
+/**
+ * of_device_make_bus_id - Use the device node data to assign a unique name
+ * @dev: pointer to device structure that is linked to a device tree node
+ *
+ * This routine will first try using the translated bus address to
+ * derive a unique name. If it cannot, then it will prepend names from
+ * parent nodes until a unique name can be derived.
+ */
+void of_device_make_bus_id(struct device *dev)
+{
+	struct device_node *node = dev->of_node;
+	const __be32 *reg;
+	u64 addr;
+	u32 mask;
+
+	/* Construct the name, using parent nodes if necessary to ensure uniqueness */
+	while (node->parent) {
+		/*
+		 * If the address can be translated, then that is as much
+		 * uniqueness as we need. Make it the first component and return
+		 */
+		reg = of_get_property(node, "reg", NULL);
+		if (reg && (addr = of_translate_address(node, reg)) != OF_BAD_ADDR) {
+			if (!of_property_read_u32(node, "mask", &mask))
+				dev_set_name(dev, dev_name(dev) ? "%llx.%x.%pOFn:%s" : "%llx.%x.%pOFn",
+					     addr, ffs(mask) - 1, node, dev_name(dev));
+
+			else
+				dev_set_name(dev, dev_name(dev) ? "%llx.%pOFn:%s" : "%llx.%pOFn",
+					     addr, node, dev_name(dev));
+			return;
+		}
+
+		/* format arguments only used if dev_name() resolves to NULL */
+		dev_set_name(dev, dev_name(dev) ? "%s:%s" : "%s",
+			     kbasename(node->full_name), dev_name(dev));
+		node = node->parent;
+	}
+}
+EXPORT_SYMBOL_GPL(of_device_make_bus_id);
diff --git a/drivers/of/platform.c b/drivers/of/platform.c
index 126d265aa7d8..c39e49a1eba0 100644
--- a/drivers/of/platform.c
+++ b/drivers/of/platform.c
@@ -97,46 +97,6 @@ static const struct of_device_id of_skipped_node_table[] = {
  * mechanism for creating devices from device tree nodes.
  */
 
-/**
- * of_device_make_bus_id - Use the device node data to assign a unique name
- * @dev: pointer to device structure that is linked to a device tree node
- *
- * This routine will first try using the translated bus address to
- * derive a unique name. If it cannot, then it will prepend names from
- * parent nodes until a unique name can be derived.
- */
-static void of_device_make_bus_id(struct device *dev)
-{
-	struct device_node *node = dev->of_node;
-	const __be32 *reg;
-	u64 addr;
-	u32 mask;
-
-	/* Construct the name, using parent nodes if necessary to ensure uniqueness */
-	while (node->parent) {
-		/*
-		 * If the address can be translated, then that is as much
-		 * uniqueness as we need. Make it the first component and return
-		 */
-		reg = of_get_property(node, "reg", NULL);
-		if (reg && (addr = of_translate_address(node, reg)) != OF_BAD_ADDR) {
-			if (!of_property_read_u32(node, "mask", &mask))
-				dev_set_name(dev, dev_name(dev) ? "%llx.%x.%pOFn:%s" : "%llx.%x.%pOFn",
-					     addr, ffs(mask) - 1, node, dev_name(dev));
-
-			else
-				dev_set_name(dev, dev_name(dev) ? "%llx.%pOFn:%s" : "%llx.%pOFn",
-					     addr, node, dev_name(dev));
-			return;
-		}
-
-		/* format arguments only used if dev_name() resolves to NULL */
-		dev_set_name(dev, dev_name(dev) ? "%s:%s" : "%s",
-			     kbasename(node->full_name), dev_name(dev));
-		node = node->parent;
-	}
-}
-
 /**
  * of_device_alloc - Allocate and initialize an of_device
  * @np: device node to assign to device
diff --git a/include/linux/of_device.h b/include/linux/of_device.h
index 2c7a3d4bc775..a72661e47faa 100644
--- a/include/linux/of_device.h
+++ b/include/linux/of_device.h
@@ -40,6 +40,9 @@ static inline int of_dma_configure(struct device *dev,
 {
 	return of_dma_configure_id(dev, np, force_dma, NULL);
 }
+
+void of_device_make_bus_id(struct device *dev);
+
 #else /* CONFIG_OF */
 
 static inline int of_driver_match_device(struct device *dev,
@@ -82,6 +85,9 @@ static inline int of_dma_configure(struct device *dev,
 {
 	return 0;
 }
+
+static inline void of_device_make_bus_id(struct device *dev) {}
+
 #endif /* CONFIG_OF */
 
 #endif /* _LINUX_OF_DEVICE_H */
-- 
cgit v1.2.3


From 4a1a40233b4a9fc159a5c7a27dc34c5c7bc5be55 Mon Sep 17 00:00:00 2001
From: Miquel Raynal <miquel.raynal@bootlin.com>
Date: Fri, 15 Dec 2023 11:15:28 +0000
Subject: nvmem: Move of_nvmem_layout_get_container() in another header

nvmem-consumer.h is included by consumer devices, extracting data from
NVMEM devices whereas nvmem-provider.h is included by devices providing
NVMEM content.

The only users of of_nvmem_layout_get_container() outside of the core
are layout drivers, so better move its prototype to nvmem-provider.h.

While we do so, we also move the kdoc associated with the function to
the header rather than the .c file.

Signed-off-by: Miquel Raynal <miquel.raynal@bootlin.com>
Signed-off-by: Srinivas Kandagatla <srinivas.kandagatla@linaro.org>
Link: https://lore.kernel.org/r/20231215111536.316972-3-srinivas.kandagatla@linaro.org
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/nvmem/core.c           |  8 --------
 include/linux/nvmem-consumer.h |  7 -------
 include/linux/nvmem-provider.h | 21 +++++++++++++++++++++
 3 files changed, 21 insertions(+), 15 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/nvmem/core.c b/drivers/nvmem/core.c
index 608b352a7d91..b5b6ec8e04bb 100644
--- a/drivers/nvmem/core.c
+++ b/drivers/nvmem/core.c
@@ -847,14 +847,6 @@ static int nvmem_add_cells_from_layout(struct nvmem_device *nvmem)
 }
 
 #if IS_ENABLED(CONFIG_OF)
-/**
- * of_nvmem_layout_get_container() - Get OF node to layout container.
- *
- * @nvmem: nvmem device.
- *
- * Return: a node pointer with refcount incremented or NULL if no
- * container exists. Use of_node_put() on it when done.
- */
 struct device_node *of_nvmem_layout_get_container(struct nvmem_device *nvmem)
 {
 	return of_get_child_by_name(nvmem->dev.of_node, "nvmem-layout");
diff --git a/include/linux/nvmem-consumer.h b/include/linux/nvmem-consumer.h
index 6ec4b9743e25..2d306fa13b1a 100644
--- a/include/linux/nvmem-consumer.h
+++ b/include/linux/nvmem-consumer.h
@@ -247,7 +247,6 @@ struct nvmem_cell *of_nvmem_cell_get(struct device_node *np,
 				     const char *id);
 struct nvmem_device *of_nvmem_device_get(struct device_node *np,
 					 const char *name);
-struct device_node *of_nvmem_layout_get_container(struct nvmem_device *nvmem);
 #else
 static inline struct nvmem_cell *of_nvmem_cell_get(struct device_node *np,
 						   const char *id)
@@ -260,12 +259,6 @@ static inline struct nvmem_device *of_nvmem_device_get(struct device_node *np,
 {
 	return ERR_PTR(-EOPNOTSUPP);
 }
-
-static inline struct device_node *
-of_nvmem_layout_get_container(struct nvmem_device *nvmem)
-{
-	return NULL;
-}
 #endif /* CONFIG_NVMEM && CONFIG_OF */
 
 #endif  /* ifndef _LINUX_NVMEM_CONSUMER_H */
diff --git a/include/linux/nvmem-provider.h b/include/linux/nvmem-provider.h
index e3930835235b..e5de21516387 100644
--- a/include/linux/nvmem-provider.h
+++ b/include/linux/nvmem-provider.h
@@ -244,6 +244,27 @@ nvmem_layout_get_match_data(struct nvmem_device *nvmem,
 
 #endif /* CONFIG_NVMEM */
 
+#if IS_ENABLED(CONFIG_NVMEM) && IS_ENABLED(CONFIG_OF)
+
+/**
+ * of_nvmem_layout_get_container() - Get OF node of layout container
+ *
+ * @nvmem: nvmem device
+ *
+ * Return: a node pointer with refcount incremented or NULL if no
+ * container exists. Use of_node_put() on it when done.
+ */
+struct device_node *of_nvmem_layout_get_container(struct nvmem_device *nvmem);
+
+#else  /* CONFIG_NVMEM && CONFIG_OF */
+
+static inline struct device_node *of_nvmem_layout_get_container(struct nvmem_device *nvmem)
+{
+	return NULL;
+}
+
+#endif /* CONFIG_NVMEM && CONFIG_OF */
+
 #define module_nvmem_layout_driver(__layout_driver)		\
 	module_driver(__layout_driver, nvmem_layout_register,	\
 		      nvmem_layout_unregister)
-- 
cgit v1.2.3


From 1b7c298a4ecbc28cc6ee94005734bff55eb83d22 Mon Sep 17 00:00:00 2001
From: Miquel Raynal <miquel.raynal@bootlin.com>
Date: Fri, 15 Dec 2023 11:15:30 +0000
Subject: nvmem: Simplify the ->add_cells() hook

The layout entry is not used and will anyway be made useless by the new
layout bus infrastructure coming next, so drop it. While at it, clarify
the kdoc entry.

Signed-off-by: Miquel Raynal <miquel.raynal@bootlin.com>
Signed-off-by: Srinivas Kandagatla <srinivas.kandagatla@linaro.org>
Link: https://lore.kernel.org/r/20231215111536.316972-5-srinivas.kandagatla@linaro.org
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/nvmem/core.c             | 2 +-
 drivers/nvmem/layouts/onie-tlv.c | 3 +--
 drivers/nvmem/layouts/sl28vpd.c  | 3 +--
 include/linux/nvmem-provider.h   | 8 +++-----
 4 files changed, 6 insertions(+), 10 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/nvmem/core.c b/drivers/nvmem/core.c
index b5e5ce67398f..f63db5e01fca 100644
--- a/drivers/nvmem/core.c
+++ b/drivers/nvmem/core.c
@@ -816,7 +816,7 @@ static int nvmem_add_cells_from_layout(struct nvmem_device *nvmem)
 	int ret;
 
 	if (layout && layout->add_cells) {
-		ret = layout->add_cells(&nvmem->dev, nvmem, layout);
+		ret = layout->add_cells(&nvmem->dev, nvmem);
 		if (ret)
 			return ret;
 	}
diff --git a/drivers/nvmem/layouts/onie-tlv.c b/drivers/nvmem/layouts/onie-tlv.c
index 59fc87ccfcff..defd42d4375c 100644
--- a/drivers/nvmem/layouts/onie-tlv.c
+++ b/drivers/nvmem/layouts/onie-tlv.c
@@ -182,8 +182,7 @@ static bool onie_tlv_crc_is_valid(struct device *dev, size_t table_len, u8 *tabl
 	return true;
 }
 
-static int onie_tlv_parse_table(struct device *dev, struct nvmem_device *nvmem,
-				struct nvmem_layout *layout)
+static int onie_tlv_parse_table(struct device *dev, struct nvmem_device *nvmem)
 {
 	struct onie_tlv_hdr hdr;
 	size_t table_len, data_len, hdr_len;
diff --git a/drivers/nvmem/layouts/sl28vpd.c b/drivers/nvmem/layouts/sl28vpd.c
index 05671371f631..26c7cf21b523 100644
--- a/drivers/nvmem/layouts/sl28vpd.c
+++ b/drivers/nvmem/layouts/sl28vpd.c
@@ -80,8 +80,7 @@ static int sl28vpd_v1_check_crc(struct device *dev, struct nvmem_device *nvmem)
 	return 0;
 }
 
-static int sl28vpd_add_cells(struct device *dev, struct nvmem_device *nvmem,
-			     struct nvmem_layout *layout)
+static int sl28vpd_add_cells(struct device *dev, struct nvmem_device *nvmem)
 {
 	const struct nvmem_cell_info *pinfo;
 	struct nvmem_cell_info info = {0};
diff --git a/include/linux/nvmem-provider.h b/include/linux/nvmem-provider.h
index e5de21516387..3939991b3c5f 100644
--- a/include/linux/nvmem-provider.h
+++ b/include/linux/nvmem-provider.h
@@ -156,9 +156,8 @@ struct nvmem_cell_table {
  *
  * @name:		Layout name.
  * @of_match_table:	Open firmware match table.
- * @add_cells:		Will be called if a nvmem device is found which
- *			has this layout. The function will add layout
- *			specific cells with nvmem_add_one_cell().
+ * @add_cells:		Called to populate the layout using
+ *			nvmem_add_one_cell().
  * @fixup_cell_info:	Will be called before a cell is added. Can be
  *			used to modify the nvmem_cell_info.
  * @owner:		Pointer to struct module.
@@ -172,8 +171,7 @@ struct nvmem_cell_table {
 struct nvmem_layout {
 	const char *name;
 	const struct of_device_id *of_match_table;
-	int (*add_cells)(struct device *dev, struct nvmem_device *nvmem,
-			 struct nvmem_layout *layout);
+	int (*add_cells)(struct device *dev, struct nvmem_device *nvmem);
 	void (*fixup_cell_info)(struct nvmem_device *nvmem,
 				struct nvmem_layout *layout,
 				struct nvmem_cell_info *cell);
-- 
cgit v1.2.3


From 1172460e716784ac7e1049a537bdca8edbf97360 Mon Sep 17 00:00:00 2001
From: Miquel Raynal <miquel.raynal@bootlin.com>
Date: Fri, 15 Dec 2023 11:15:31 +0000
Subject: nvmem: Move and rename ->fixup_cell_info()

This hook is meant to be used by any provider and instantiating a layout
just for this is useless. Let's instead move this hook to the nvmem
device and add it to the config structure to be easily shared by the
providers.

While at moving this hook, rename it ->fixup_dt_cell_info() to clarify
its main intended purpose.

Signed-off-by: Miquel Raynal <miquel.raynal@bootlin.com>
Signed-off-by: Srinivas Kandagatla <srinivas.kandagatla@linaro.org>
Link: https://lore.kernel.org/r/20231215111536.316972-6-srinivas.kandagatla@linaro.org
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/nvmem/core.c           |  6 +++---
 drivers/nvmem/imx-ocotp.c      | 11 +++--------
 drivers/nvmem/internals.h      |  2 ++
 drivers/nvmem/mtk-efuse.c      | 11 +++--------
 include/linux/nvmem-provider.h |  9 ++++-----
 5 files changed, 15 insertions(+), 24 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/nvmem/core.c b/drivers/nvmem/core.c
index f63db5e01fca..a885713d6b4b 100644
--- a/drivers/nvmem/core.c
+++ b/drivers/nvmem/core.c
@@ -675,7 +675,6 @@ static int nvmem_validate_keepouts(struct nvmem_device *nvmem)
 
 static int nvmem_add_cells_from_dt(struct nvmem_device *nvmem, struct device_node *np)
 {
-	struct nvmem_layout *layout = nvmem->layout;
 	struct device *dev = &nvmem->dev;
 	struct device_node *child;
 	const __be32 *addr;
@@ -705,8 +704,8 @@ static int nvmem_add_cells_from_dt(struct nvmem_device *nvmem, struct device_nod
 
 		info.np = of_node_get(child);
 
-		if (layout && layout->fixup_cell_info)
-			layout->fixup_cell_info(nvmem, layout, &info);
+		if (nvmem->fixup_dt_cell_info)
+			nvmem->fixup_dt_cell_info(nvmem, &info);
 
 		ret = nvmem_add_one_cell(nvmem, &info);
 		kfree(info.name);
@@ -895,6 +894,7 @@ struct nvmem_device *nvmem_register(const struct nvmem_config *config)
 
 	kref_init(&nvmem->refcnt);
 	INIT_LIST_HEAD(&nvmem->cells);
+	nvmem->fixup_dt_cell_info = config->fixup_dt_cell_info;
 
 	nvmem->owner = config->owner;
 	if (!nvmem->owner && config->dev->driver)
diff --git a/drivers/nvmem/imx-ocotp.c b/drivers/nvmem/imx-ocotp.c
index f1e202efaa49..79dd4fda0329 100644
--- a/drivers/nvmem/imx-ocotp.c
+++ b/drivers/nvmem/imx-ocotp.c
@@ -583,17 +583,12 @@ static const struct of_device_id imx_ocotp_dt_ids[] = {
 };
 MODULE_DEVICE_TABLE(of, imx_ocotp_dt_ids);
 
-static void imx_ocotp_fixup_cell_info(struct nvmem_device *nvmem,
-				      struct nvmem_layout *layout,
-				      struct nvmem_cell_info *cell)
+static void imx_ocotp_fixup_dt_cell_info(struct nvmem_device *nvmem,
+					 struct nvmem_cell_info *cell)
 {
 	cell->read_post_process = imx_ocotp_cell_pp;
 }
 
-static struct nvmem_layout imx_ocotp_layout = {
-	.fixup_cell_info = imx_ocotp_fixup_cell_info,
-};
-
 static int imx_ocotp_probe(struct platform_device *pdev)
 {
 	struct device *dev = &pdev->dev;
@@ -619,7 +614,7 @@ static int imx_ocotp_probe(struct platform_device *pdev)
 	imx_ocotp_nvmem_config.size = 4 * priv->params->nregs;
 	imx_ocotp_nvmem_config.dev = dev;
 	imx_ocotp_nvmem_config.priv = priv;
-	imx_ocotp_nvmem_config.layout = &imx_ocotp_layout;
+	imx_ocotp_nvmem_config.fixup_dt_cell_info = &imx_ocotp_fixup_dt_cell_info;
 
 	priv->config = &imx_ocotp_nvmem_config;
 
diff --git a/drivers/nvmem/internals.h b/drivers/nvmem/internals.h
index ce353831cd65..893553fbdf51 100644
--- a/drivers/nvmem/internals.h
+++ b/drivers/nvmem/internals.h
@@ -23,6 +23,8 @@ struct nvmem_device {
 	struct bin_attribute	eeprom;
 	struct device		*base_dev;
 	struct list_head	cells;
+	void (*fixup_dt_cell_info)(struct nvmem_device *nvmem,
+				   struct nvmem_cell_info *cell);
 	const struct nvmem_keepout *keepout;
 	unsigned int		nkeepout;
 	nvmem_reg_read_t	reg_read;
diff --git a/drivers/nvmem/mtk-efuse.c b/drivers/nvmem/mtk-efuse.c
index 87c94686cfd2..84f05b40a411 100644
--- a/drivers/nvmem/mtk-efuse.c
+++ b/drivers/nvmem/mtk-efuse.c
@@ -45,9 +45,8 @@ static int mtk_efuse_gpu_speedbin_pp(void *context, const char *id, int index,
 	return 0;
 }
 
-static void mtk_efuse_fixup_cell_info(struct nvmem_device *nvmem,
-				      struct nvmem_layout *layout,
-				      struct nvmem_cell_info *cell)
+static void mtk_efuse_fixup_dt_cell_info(struct nvmem_device *nvmem,
+					 struct nvmem_cell_info *cell)
 {
 	size_t sz = strlen(cell->name);
 
@@ -61,10 +60,6 @@ static void mtk_efuse_fixup_cell_info(struct nvmem_device *nvmem,
 		cell->read_post_process = mtk_efuse_gpu_speedbin_pp;
 }
 
-static struct nvmem_layout mtk_efuse_layout = {
-	.fixup_cell_info = mtk_efuse_fixup_cell_info,
-};
-
 static int mtk_efuse_probe(struct platform_device *pdev)
 {
 	struct device *dev = &pdev->dev;
@@ -91,7 +86,7 @@ static int mtk_efuse_probe(struct platform_device *pdev)
 	econfig.priv = priv;
 	econfig.dev = dev;
 	if (pdata->uses_post_processing)
-		econfig.layout = &mtk_efuse_layout;
+		econfig.fixup_dt_cell_info = &mtk_efuse_fixup_dt_cell_info;
 	nvmem = devm_nvmem_register(dev, &econfig);
 
 	return PTR_ERR_OR_ZERO(nvmem);
diff --git a/include/linux/nvmem-provider.h b/include/linux/nvmem-provider.h
index 3939991b3c5f..36415a602d9e 100644
--- a/include/linux/nvmem-provider.h
+++ b/include/linux/nvmem-provider.h
@@ -83,6 +83,8 @@ struct nvmem_cell_info {
  * @cells:	Optional array of pre-defined NVMEM cells.
  * @ncells:	Number of elements in cells.
  * @add_legacy_fixed_of_cells:	Read fixed NVMEM cells from old OF syntax.
+ * @fixup_dt_cell_info: Will be called before a cell is added. Can be
+ *		used to modify the nvmem_cell_info.
  * @keepout:	Optional array of keepout ranges (sorted ascending by start).
  * @nkeepout:	Number of elements in the keepout array.
  * @type:	Type of the nvmem storage
@@ -113,6 +115,8 @@ struct nvmem_config {
 	const struct nvmem_cell_info	*cells;
 	int			ncells;
 	bool			add_legacy_fixed_of_cells;
+	void (*fixup_dt_cell_info)(struct nvmem_device *nvmem,
+				   struct nvmem_cell_info *cell);
 	const struct nvmem_keepout *keepout;
 	unsigned int		nkeepout;
 	enum nvmem_type		type;
@@ -158,8 +162,6 @@ struct nvmem_cell_table {
  * @of_match_table:	Open firmware match table.
  * @add_cells:		Called to populate the layout using
  *			nvmem_add_one_cell().
- * @fixup_cell_info:	Will be called before a cell is added. Can be
- *			used to modify the nvmem_cell_info.
  * @owner:		Pointer to struct module.
  * @node:		List node.
  *
@@ -172,9 +174,6 @@ struct nvmem_layout {
 	const char *name;
 	const struct of_device_id *of_match_table;
 	int (*add_cells)(struct device *dev, struct nvmem_device *nvmem);
-	void (*fixup_cell_info)(struct nvmem_device *nvmem,
-				struct nvmem_layout *layout,
-				struct nvmem_cell_info *cell);
 
 	/* private */
 	struct module *owner;
-- 
cgit v1.2.3


From fc29fd821d9ac2ae3d32a722fac39ce874efb883 Mon Sep 17 00:00:00 2001
From: Miquel Raynal <miquel.raynal@bootlin.com>
Date: Fri, 15 Dec 2023 11:15:32 +0000
Subject: nvmem: core: Rework layouts to become regular devices
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Current layout support was initially written without modules support in
mind. When the requirement for module support rose, the existing base
was improved to adopt modularization support, but kind of a design flaw
was introduced. With the existing implementation, when a storage device
registers into NVMEM, the core tries to hook a layout (if any) and
populates its cells immediately. This means, if the hardware description
expects a layout to be hooked up, but no driver was provided for that,
the storage medium will fail to probe and try later from
scratch. Even if we consider that the hardware description shall be
correct, we could still probe the storage device (especially if it
contains the rootfs).

One way to overcome this situation is to consider the layouts as
devices, and leverage the native notifier mechanism. When a new NVMEM
device is registered, we can populate its nvmem-layout child, if any,
and wait for the matching to be done in order to get the cells (the
waiting can be easily done with the NVMEM notifiers). If the layout
driver is compiled as a module, it should automatically be loaded. This
way, there is no strong order to enforce, any NVMEM device creation
or NVMEM layout driver insertion will be observed as a new event which
may lead to the creation of additional cells, without disturbing the
probes with costly (and sometimes endless) deferrals.

In order to achieve that goal we create a new bus for the nvmem-layouts
with minimal logic to match nvmem-layout devices with nvmem-layout
drivers. All this infrastructure code is created in the layouts.c file.

Signed-off-by: Miquel Raynal <miquel.raynal@bootlin.com>
Tested-by: Rafał Miłecki <rafal@milecki.pl>
Signed-off-by: Srinivas Kandagatla <srinivas.kandagatla@linaro.org>
Link: https://lore.kernel.org/r/20231215111536.316972-7-srinivas.kandagatla@linaro.org
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/nvmem/Kconfig            |   1 +
 drivers/nvmem/Makefile           |   2 +
 drivers/nvmem/core.c             | 170 ++++++++++++---------------------
 drivers/nvmem/internals.h        |  21 ++++
 drivers/nvmem/layouts.c          | 201 +++++++++++++++++++++++++++++++++++++++
 drivers/nvmem/layouts/Kconfig    |   8 ++
 drivers/nvmem/layouts/onie-tlv.c |  24 ++++-
 drivers/nvmem/layouts/sl28vpd.c  |  24 ++++-
 include/linux/nvmem-provider.h   |  38 ++++----
 9 files changed, 354 insertions(+), 135 deletions(-)
 create mode 100644 drivers/nvmem/layouts.c

(limited to 'include/linux')

diff --git a/drivers/nvmem/Kconfig b/drivers/nvmem/Kconfig
index 5bc9c4874fe3..283134498fbc 100644
--- a/drivers/nvmem/Kconfig
+++ b/drivers/nvmem/Kconfig
@@ -1,6 +1,7 @@
 # SPDX-License-Identifier: GPL-2.0-only
 menuconfig NVMEM
 	bool "NVMEM Support"
+	imply NVMEM_LAYOUTS
 	help
 	  Support for NVMEM(Non Volatile Memory) devices like EEPROM, EFUSES...
 
diff --git a/drivers/nvmem/Makefile b/drivers/nvmem/Makefile
index 423baf089515..cdd01fbf1313 100644
--- a/drivers/nvmem/Makefile
+++ b/drivers/nvmem/Makefile
@@ -5,6 +5,8 @@
 
 obj-$(CONFIG_NVMEM)		+= nvmem_core.o
 nvmem_core-y			:= core.o
+obj-$(CONFIG_NVMEM_LAYOUTS)	+= nvmem_layouts.o
+nvmem_layouts-y			:= layouts.o
 obj-y				+= layouts/
 
 # Devices
diff --git a/drivers/nvmem/core.c b/drivers/nvmem/core.c
index a885713d6b4b..4c2154ddf8a7 100644
--- a/drivers/nvmem/core.c
+++ b/drivers/nvmem/core.c
@@ -55,9 +55,6 @@ static LIST_HEAD(nvmem_lookup_list);
 
 static BLOCKING_NOTIFIER_HEAD(nvmem_notifier);
 
-static DEFINE_SPINLOCK(nvmem_layout_lock);
-static LIST_HEAD(nvmem_layouts);
-
 static int __nvmem_reg_read(struct nvmem_device *nvmem, unsigned int offset,
 			    void *val, size_t bytes)
 {
@@ -740,97 +737,22 @@ static int nvmem_add_cells_from_fixed_layout(struct nvmem_device *nvmem)
 	return err;
 }
 
-int __nvmem_layout_register(struct nvmem_layout *layout, struct module *owner)
+int nvmem_layout_register(struct nvmem_layout *layout)
 {
-	layout->owner = owner;
-
-	spin_lock(&nvmem_layout_lock);
-	list_add(&layout->node, &nvmem_layouts);
-	spin_unlock(&nvmem_layout_lock);
-
-	blocking_notifier_call_chain(&nvmem_notifier, NVMEM_LAYOUT_ADD, layout);
+	if (!layout->add_cells)
+		return -EINVAL;
 
-	return 0;
+	/* Populate the cells */
+	return layout->add_cells(&layout->nvmem->dev, layout->nvmem);
 }
-EXPORT_SYMBOL_GPL(__nvmem_layout_register);
+EXPORT_SYMBOL_GPL(nvmem_layout_register);
 
 void nvmem_layout_unregister(struct nvmem_layout *layout)
 {
-	blocking_notifier_call_chain(&nvmem_notifier, NVMEM_LAYOUT_REMOVE, layout);
-
-	spin_lock(&nvmem_layout_lock);
-	list_del(&layout->node);
-	spin_unlock(&nvmem_layout_lock);
+	/* Keep the API even with an empty stub in case we need it later */
 }
 EXPORT_SYMBOL_GPL(nvmem_layout_unregister);
 
-static struct nvmem_layout *nvmem_layout_get(struct nvmem_device *nvmem)
-{
-	struct device_node *layout_np;
-	struct nvmem_layout *l, *layout = ERR_PTR(-EPROBE_DEFER);
-
-	layout_np = of_nvmem_layout_get_container(nvmem);
-	if (!layout_np)
-		return NULL;
-
-	/* Fixed layouts don't have a matching driver */
-	if (of_device_is_compatible(layout_np, "fixed-layout")) {
-		of_node_put(layout_np);
-		return NULL;
-	}
-
-	/*
-	 * In case the nvmem device was built-in while the layout was built as a
-	 * module, we shall manually request the layout driver loading otherwise
-	 * we'll never have any match.
-	 */
-	of_request_module(layout_np);
-
-	spin_lock(&nvmem_layout_lock);
-
-	list_for_each_entry(l, &nvmem_layouts, node) {
-		if (of_match_node(l->of_match_table, layout_np)) {
-			if (try_module_get(l->owner))
-				layout = l;
-
-			break;
-		}
-	}
-
-	spin_unlock(&nvmem_layout_lock);
-	of_node_put(layout_np);
-
-	return layout;
-}
-
-static void nvmem_layout_put(struct nvmem_layout *layout)
-{
-	if (layout)
-		module_put(layout->owner);
-}
-
-static int nvmem_add_cells_from_layout(struct nvmem_device *nvmem)
-{
-	struct nvmem_layout *layout = nvmem->layout;
-	int ret;
-
-	if (layout && layout->add_cells) {
-		ret = layout->add_cells(&nvmem->dev, nvmem);
-		if (ret)
-			return ret;
-	}
-
-	return 0;
-}
-
-#if IS_ENABLED(CONFIG_OF)
-struct device_node *of_nvmem_layout_get_container(struct nvmem_device *nvmem)
-{
-	return of_get_child_by_name(nvmem->dev.of_node, "nvmem-layout");
-}
-EXPORT_SYMBOL_GPL(of_nvmem_layout_get_container);
-#endif
-
 const void *nvmem_layout_get_match_data(struct nvmem_device *nvmem,
 					struct nvmem_layout *layout)
 {
@@ -838,7 +760,7 @@ const void *nvmem_layout_get_match_data(struct nvmem_device *nvmem,
 	const struct of_device_id *match;
 
 	layout_np = of_nvmem_layout_get_container(nvmem);
-	match = of_match_node(layout->of_match_table, layout_np);
+	match = of_match_node(layout->dev.driver->of_match_table, layout_np);
 
 	return match ? match->data : NULL;
 }
@@ -950,19 +872,6 @@ struct nvmem_device *nvmem_register(const struct nvmem_config *config)
 			goto err_put_device;
 	}
 
-	/*
-	 * If the driver supplied a layout by config->layout, the module
-	 * pointer will be NULL and nvmem_layout_put() will be a noop.
-	 */
-	nvmem->layout = config->layout ?: nvmem_layout_get(nvmem);
-	if (IS_ERR(nvmem->layout)) {
-		rval = PTR_ERR(nvmem->layout);
-		nvmem->layout = NULL;
-
-		if (rval == -EPROBE_DEFER)
-			goto err_teardown_compat;
-	}
-
 	if (config->cells) {
 		rval = nvmem_add_cells(nvmem, config->cells, config->ncells);
 		if (rval)
@@ -983,24 +892,24 @@ struct nvmem_device *nvmem_register(const struct nvmem_config *config)
 	if (rval)
 		goto err_remove_cells;
 
-	rval = nvmem_add_cells_from_layout(nvmem);
-	if (rval)
-		goto err_remove_cells;
-
 	dev_dbg(&nvmem->dev, "Registering nvmem device %s\n", config->name);
 
 	rval = device_add(&nvmem->dev);
 	if (rval)
 		goto err_remove_cells;
 
+	rval = nvmem_populate_layout(nvmem);
+	if (rval)
+		goto err_remove_dev;
+
 	blocking_notifier_call_chain(&nvmem_notifier, NVMEM_ADD, nvmem);
 
 	return nvmem;
 
+err_remove_dev:
+	device_del(&nvmem->dev);
 err_remove_cells:
 	nvmem_device_remove_all_cells(nvmem);
-	nvmem_layout_put(nvmem->layout);
-err_teardown_compat:
 	if (config->compat)
 		nvmem_sysfs_remove_compat(nvmem, config);
 err_put_device:
@@ -1022,7 +931,7 @@ static void nvmem_device_release(struct kref *kref)
 		device_remove_bin_file(nvmem->base_dev, &nvmem->eeprom);
 
 	nvmem_device_remove_all_cells(nvmem);
-	nvmem_layout_put(nvmem->layout);
+	nvmem_destroy_layout(nvmem);
 	device_unregister(&nvmem->dev);
 }
 
@@ -1324,6 +1233,12 @@ nvmem_cell_get_from_lookup(struct device *dev, const char *con_id)
 	return cell;
 }
 
+static void nvmem_layout_module_put(struct nvmem_device *nvmem)
+{
+	if (nvmem->layout && nvmem->layout->dev.driver)
+		module_put(nvmem->layout->dev.driver->owner);
+}
+
 #if IS_ENABLED(CONFIG_OF)
 static struct nvmem_cell_entry *
 nvmem_find_cell_entry_by_node(struct nvmem_device *nvmem, struct device_node *np)
@@ -1342,6 +1257,18 @@ nvmem_find_cell_entry_by_node(struct nvmem_device *nvmem, struct device_node *np
 	return cell;
 }
 
+static int nvmem_layout_module_get_optional(struct nvmem_device *nvmem)
+{
+	if (!nvmem->layout)
+		return 0;
+
+	if (!nvmem->layout->dev.driver ||
+	    !try_module_get(nvmem->layout->dev.driver->owner))
+		return -EPROBE_DEFER;
+
+	return 0;
+}
+
 /**
  * of_nvmem_cell_get() - Get a nvmem cell from given device node and cell id
  *
@@ -1404,16 +1331,29 @@ struct nvmem_cell *of_nvmem_cell_get(struct device_node *np, const char *id)
 		return ERR_CAST(nvmem);
 	}
 
+	ret = nvmem_layout_module_get_optional(nvmem);
+	if (ret) {
+		of_node_put(cell_np);
+		__nvmem_device_put(nvmem);
+		return ERR_PTR(ret);
+	}
+
 	cell_entry = nvmem_find_cell_entry_by_node(nvmem, cell_np);
 	of_node_put(cell_np);
 	if (!cell_entry) {
 		__nvmem_device_put(nvmem);
-		return ERR_PTR(-ENOENT);
+		nvmem_layout_module_put(nvmem);
+		if (nvmem->layout)
+			return ERR_PTR(-EPROBE_DEFER);
+		else
+			return ERR_PTR(-ENOENT);
 	}
 
 	cell = nvmem_create_cell(cell_entry, id, cell_index);
-	if (IS_ERR(cell))
+	if (IS_ERR(cell)) {
 		__nvmem_device_put(nvmem);
+		nvmem_layout_module_put(nvmem);
+	}
 
 	return cell;
 }
@@ -1527,6 +1467,7 @@ void nvmem_cell_put(struct nvmem_cell *cell)
 
 	kfree(cell);
 	__nvmem_device_put(nvmem);
+	nvmem_layout_module_put(nvmem);
 }
 EXPORT_SYMBOL_GPL(nvmem_cell_put);
 
@@ -2104,11 +2045,22 @@ EXPORT_SYMBOL_GPL(nvmem_dev_name);
 
 static int __init nvmem_init(void)
 {
-	return bus_register(&nvmem_bus_type);
+	int ret;
+
+	ret = bus_register(&nvmem_bus_type);
+	if (ret)
+		return ret;
+
+	ret = nvmem_layout_bus_register();
+	if (ret)
+		bus_unregister(&nvmem_bus_type);
+
+	return ret;
 }
 
 static void __exit nvmem_exit(void)
 {
+	nvmem_layout_bus_unregister();
 	bus_unregister(&nvmem_bus_type);
 }
 
diff --git a/drivers/nvmem/internals.h b/drivers/nvmem/internals.h
index 893553fbdf51..4946456c76c7 100644
--- a/drivers/nvmem/internals.h
+++ b/drivers/nvmem/internals.h
@@ -34,4 +34,25 @@ struct nvmem_device {
 	void *priv;
 };
 
+#if IS_ENABLED(CONFIG_OF)
+int nvmem_layout_bus_register(void);
+void nvmem_layout_bus_unregister(void);
+int nvmem_populate_layout(struct nvmem_device *nvmem);
+void nvmem_destroy_layout(struct nvmem_device *nvmem);
+#else /* CONFIG_OF */
+static inline int nvmem_layout_bus_register(void)
+{
+	return 0;
+}
+
+static inline void nvmem_layout_bus_unregister(void) {}
+
+static inline int nvmem_populate_layout(struct nvmem_device *nvmem)
+{
+	return 0;
+}
+
+static inline void nvmem_destroy_layout(struct nvmem_device *nvmem) { }
+#endif /* CONFIG_OF */
+
 #endif  /* ifndef _LINUX_NVMEM_INTERNALS_H */
diff --git a/drivers/nvmem/layouts.c b/drivers/nvmem/layouts.c
new file mode 100644
index 000000000000..6a6aa58369ff
--- /dev/null
+++ b/drivers/nvmem/layouts.c
@@ -0,0 +1,201 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * NVMEM layout bus handling
+ *
+ * Copyright (C) 2023 Bootlin
+ * Author: Miquel Raynal <miquel.raynal@bootlin.com
+ */
+
+#include <linux/device.h>
+#include <linux/dma-mapping.h>
+#include <linux/nvmem-consumer.h>
+#include <linux/nvmem-provider.h>
+#include <linux/of.h>
+#include <linux/of_device.h>
+#include <linux/of_irq.h>
+
+#include "internals.h"
+
+#define to_nvmem_layout_driver(drv) \
+	(container_of((drv), struct nvmem_layout_driver, driver))
+#define to_nvmem_layout_device(_dev) \
+	container_of((_dev), struct nvmem_layout, dev)
+
+static int nvmem_layout_bus_match(struct device *dev, struct device_driver *drv)
+{
+	return of_driver_match_device(dev, drv);
+}
+
+static int nvmem_layout_bus_probe(struct device *dev)
+{
+	struct nvmem_layout_driver *drv = to_nvmem_layout_driver(dev->driver);
+	struct nvmem_layout *layout = to_nvmem_layout_device(dev);
+
+	if (!drv->probe || !drv->remove)
+		return -EINVAL;
+
+	return drv->probe(layout);
+}
+
+static void nvmem_layout_bus_remove(struct device *dev)
+{
+	struct nvmem_layout_driver *drv = to_nvmem_layout_driver(dev->driver);
+	struct nvmem_layout *layout = to_nvmem_layout_device(dev);
+
+	return drv->remove(layout);
+}
+
+static struct bus_type nvmem_layout_bus_type = {
+	.name		= "nvmem-layout",
+	.match		= nvmem_layout_bus_match,
+	.probe		= nvmem_layout_bus_probe,
+	.remove		= nvmem_layout_bus_remove,
+};
+
+int nvmem_layout_driver_register(struct nvmem_layout_driver *drv)
+{
+	drv->driver.bus = &nvmem_layout_bus_type;
+
+	return driver_register(&drv->driver);
+}
+EXPORT_SYMBOL_GPL(nvmem_layout_driver_register);
+
+void nvmem_layout_driver_unregister(struct nvmem_layout_driver *drv)
+{
+	driver_unregister(&drv->driver);
+}
+EXPORT_SYMBOL_GPL(nvmem_layout_driver_unregister);
+
+static void nvmem_layout_release_device(struct device *dev)
+{
+	struct nvmem_layout *layout = to_nvmem_layout_device(dev);
+
+	of_node_put(layout->dev.of_node);
+	kfree(layout);
+}
+
+static int nvmem_layout_create_device(struct nvmem_device *nvmem,
+				      struct device_node *np)
+{
+	struct nvmem_layout *layout;
+	struct device *dev;
+	int ret;
+
+	layout = kzalloc(sizeof(*layout), GFP_KERNEL);
+	if (!layout)
+		return -ENOMEM;
+
+	/* Create a bidirectional link */
+	layout->nvmem = nvmem;
+	nvmem->layout = layout;
+
+	/* Device model registration */
+	dev = &layout->dev;
+	device_initialize(dev);
+	dev->parent = &nvmem->dev;
+	dev->bus = &nvmem_layout_bus_type;
+	dev->release = nvmem_layout_release_device;
+	dev->coherent_dma_mask = DMA_BIT_MASK(32);
+	dev->dma_mask = &dev->coherent_dma_mask;
+	device_set_node(dev, of_fwnode_handle(of_node_get(np)));
+	of_device_make_bus_id(dev);
+	of_msi_configure(dev, dev->of_node);
+
+	ret = device_add(dev);
+	if (ret) {
+		put_device(dev);
+		return ret;
+	}
+
+	return 0;
+}
+
+static const struct of_device_id of_nvmem_layout_skip_table[] = {
+	{ .compatible = "fixed-layout", },
+	{}
+};
+
+static int nvmem_layout_bus_populate(struct nvmem_device *nvmem,
+				     struct device_node *layout_dn)
+{
+	int ret;
+
+	/* Make sure it has a compatible property */
+	if (!of_get_property(layout_dn, "compatible", NULL)) {
+		pr_debug("%s() - skipping %pOF, no compatible prop\n",
+			 __func__, layout_dn);
+		return 0;
+	}
+
+	/* Fixed layouts are parsed manually somewhere else for now */
+	if (of_match_node(of_nvmem_layout_skip_table, layout_dn)) {
+		pr_debug("%s() - skipping %pOF node\n", __func__, layout_dn);
+		return 0;
+	}
+
+	if (of_node_check_flag(layout_dn, OF_POPULATED_BUS)) {
+		pr_debug("%s() - skipping %pOF, already populated\n",
+			 __func__, layout_dn);
+
+		return 0;
+	}
+
+	/* NVMEM layout buses expect only a single device representing the layout */
+	ret = nvmem_layout_create_device(nvmem, layout_dn);
+	if (ret)
+		return ret;
+
+	of_node_set_flag(layout_dn, OF_POPULATED_BUS);
+
+	return 0;
+}
+
+struct device_node *of_nvmem_layout_get_container(struct nvmem_device *nvmem)
+{
+	return of_get_child_by_name(nvmem->dev.of_node, "nvmem-layout");
+}
+EXPORT_SYMBOL_GPL(of_nvmem_layout_get_container);
+
+/*
+ * Returns the number of devices populated, 0 if the operation was not relevant
+ * for this nvmem device, an error code otherwise.
+ */
+int nvmem_populate_layout(struct nvmem_device *nvmem)
+{
+	struct device_node *layout_dn;
+	int ret;
+
+	layout_dn = of_nvmem_layout_get_container(nvmem);
+	if (!layout_dn)
+		return 0;
+
+	/* Populate the layout device */
+	device_links_supplier_sync_state_pause();
+	ret = nvmem_layout_bus_populate(nvmem, layout_dn);
+	device_links_supplier_sync_state_resume();
+
+	of_node_put(layout_dn);
+	return ret;
+}
+
+void nvmem_destroy_layout(struct nvmem_device *nvmem)
+{
+	struct device *dev;
+
+	if (!nvmem->layout)
+		return;
+
+	dev = &nvmem->layout->dev;
+	of_node_clear_flag(dev->of_node, OF_POPULATED_BUS);
+	device_unregister(dev);
+}
+
+int nvmem_layout_bus_register(void)
+{
+	return bus_register(&nvmem_layout_bus_type);
+}
+
+void nvmem_layout_bus_unregister(void)
+{
+	bus_unregister(&nvmem_layout_bus_type);
+}
diff --git a/drivers/nvmem/layouts/Kconfig b/drivers/nvmem/layouts/Kconfig
index 7ff1ee1c1f05..9c6e672fc350 100644
--- a/drivers/nvmem/layouts/Kconfig
+++ b/drivers/nvmem/layouts/Kconfig
@@ -1,5 +1,11 @@
 # SPDX-License-Identifier: GPL-2.0
 
+config NVMEM_LAYOUTS
+	bool
+	depends on OF
+
+if NVMEM_LAYOUTS
+
 menu "Layout Types"
 
 config NVMEM_LAYOUT_SL28_VPD
@@ -21,3 +27,5 @@ config NVMEM_LAYOUT_ONIE_TLV
 	  If unsure, say N.
 
 endmenu
+
+endif
diff --git a/drivers/nvmem/layouts/onie-tlv.c b/drivers/nvmem/layouts/onie-tlv.c
index defd42d4375c..b24cc5dcc6ee 100644
--- a/drivers/nvmem/layouts/onie-tlv.c
+++ b/drivers/nvmem/layouts/onie-tlv.c
@@ -225,16 +225,32 @@ static int onie_tlv_parse_table(struct device *dev, struct nvmem_device *nvmem)
 	return 0;
 }
 
+static int onie_tlv_probe(struct nvmem_layout *layout)
+{
+	layout->add_cells = onie_tlv_parse_table;
+
+	return nvmem_layout_register(layout);
+}
+
+static void onie_tlv_remove(struct nvmem_layout *layout)
+{
+	nvmem_layout_unregister(layout);
+}
+
 static const struct of_device_id onie_tlv_of_match_table[] = {
 	{ .compatible = "onie,tlv-layout", },
 	{},
 };
 MODULE_DEVICE_TABLE(of, onie_tlv_of_match_table);
 
-static struct nvmem_layout onie_tlv_layout = {
-	.name = "ONIE tlv layout",
-	.of_match_table = onie_tlv_of_match_table,
-	.add_cells = onie_tlv_parse_table,
+static struct nvmem_layout_driver onie_tlv_layout = {
+	.driver = {
+		.owner = THIS_MODULE,
+		.name = "onie-tlv-layout",
+		.of_match_table = onie_tlv_of_match_table,
+	},
+	.probe = onie_tlv_probe,
+	.remove = onie_tlv_remove,
 };
 module_nvmem_layout_driver(onie_tlv_layout);
 
diff --git a/drivers/nvmem/layouts/sl28vpd.c b/drivers/nvmem/layouts/sl28vpd.c
index 26c7cf21b523..b8ffae646cc2 100644
--- a/drivers/nvmem/layouts/sl28vpd.c
+++ b/drivers/nvmem/layouts/sl28vpd.c
@@ -134,16 +134,32 @@ static int sl28vpd_add_cells(struct device *dev, struct nvmem_device *nvmem)
 	return 0;
 }
 
+static int sl28vpd_probe(struct nvmem_layout *layout)
+{
+	layout->add_cells = sl28vpd_add_cells;
+
+	return nvmem_layout_register(layout);
+}
+
+static void sl28vpd_remove(struct nvmem_layout *layout)
+{
+	nvmem_layout_unregister(layout);
+}
+
 static const struct of_device_id sl28vpd_of_match_table[] = {
 	{ .compatible = "kontron,sl28-vpd" },
 	{},
 };
 MODULE_DEVICE_TABLE(of, sl28vpd_of_match_table);
 
-static struct nvmem_layout sl28vpd_layout = {
-	.name = "sl28-vpd",
-	.of_match_table = sl28vpd_of_match_table,
-	.add_cells = sl28vpd_add_cells,
+static struct nvmem_layout_driver sl28vpd_layout = {
+	.driver = {
+		.owner = THIS_MODULE,
+		.name = "kontron-sl28vpd-layout",
+		.of_match_table = sl28vpd_of_match_table,
+	},
+	.probe = sl28vpd_probe,
+	.remove = sl28vpd_remove,
 };
 module_nvmem_layout_driver(sl28vpd_layout);
 
diff --git a/include/linux/nvmem-provider.h b/include/linux/nvmem-provider.h
index 36415a602d9e..6fe65b35ea97 100644
--- a/include/linux/nvmem-provider.h
+++ b/include/linux/nvmem-provider.h
@@ -9,6 +9,7 @@
 #ifndef _LINUX_NVMEM_PROVIDER_H
 #define _LINUX_NVMEM_PROVIDER_H
 
+#include <linux/device.h>
 #include <linux/device/driver.h>
 #include <linux/err.h>
 #include <linux/errno.h>
@@ -158,12 +159,11 @@ struct nvmem_cell_table {
 /**
  * struct nvmem_layout - NVMEM layout definitions
  *
- * @name:		Layout name.
- * @of_match_table:	Open firmware match table.
- * @add_cells:		Called to populate the layout using
- *			nvmem_add_one_cell().
- * @owner:		Pointer to struct module.
- * @node:		List node.
+ * @dev:		Device-model layout device.
+ * @nvmem:		The underlying NVMEM device
+ * @add_cells:		Will be called if a nvmem device is found which
+ *			has this layout. The function will add layout
+ *			specific cells with nvmem_add_one_cell().
  *
  * A nvmem device can hold a well defined structure which can just be
  * evaluated during runtime. For example a TLV list, or a list of "name=val"
@@ -171,13 +171,15 @@ struct nvmem_cell_table {
  * cells.
  */
 struct nvmem_layout {
-	const char *name;
-	const struct of_device_id *of_match_table;
+	struct device dev;
+	struct nvmem_device *nvmem;
 	int (*add_cells)(struct device *dev, struct nvmem_device *nvmem);
+};
 
-	/* private */
-	struct module *owner;
-	struct list_head node;
+struct nvmem_layout_driver {
+	struct device_driver driver;
+	int (*probe)(struct nvmem_layout *layout);
+	void (*remove)(struct nvmem_layout *layout);
 };
 
 #if IS_ENABLED(CONFIG_NVMEM)
@@ -194,11 +196,15 @@ void nvmem_del_cell_table(struct nvmem_cell_table *table);
 int nvmem_add_one_cell(struct nvmem_device *nvmem,
 		       const struct nvmem_cell_info *info);
 
-int __nvmem_layout_register(struct nvmem_layout *layout, struct module *owner);
-#define nvmem_layout_register(layout) \
-	__nvmem_layout_register(layout, THIS_MODULE)
+int nvmem_layout_register(struct nvmem_layout *layout);
 void nvmem_layout_unregister(struct nvmem_layout *layout);
 
+int nvmem_layout_driver_register(struct nvmem_layout_driver *drv);
+void nvmem_layout_driver_unregister(struct nvmem_layout_driver *drv);
+#define module_nvmem_layout_driver(__nvmem_layout_driver)		\
+	module_driver(__nvmem_layout_driver, nvmem_layout_driver_register, \
+		      nvmem_layout_driver_unregister)
+
 const void *nvmem_layout_get_match_data(struct nvmem_device *nvmem,
 					struct nvmem_layout *layout);
 
@@ -262,8 +268,4 @@ static inline struct device_node *of_nvmem_layout_get_container(struct nvmem_dev
 
 #endif /* CONFIG_NVMEM && CONFIG_OF */
 
-#define module_nvmem_layout_driver(__layout_driver)		\
-	module_driver(__layout_driver, nvmem_layout_register,	\
-		      nvmem_layout_unregister)
-
 #endif  /* ifndef _LINUX_NVMEM_PROVIDER_H */
-- 
cgit v1.2.3


From 1f78c56007ba61b7b8c3f7dbb6787b6af116d3f0 Mon Sep 17 00:00:00 2001
From: Théo Lebrun <theo.lebrun@bootlin.com>
Date: Thu, 7 Dec 2023 18:56:06 +0100
Subject: tty: serial: amba: Use linux/{bits,bitfield}.h macros
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The driver uses bit shifts and hexadecimal expressions to declare
constants. Replace that with the BIT(), GENMASK() & FIELD_PREP_CONST()
macros to clarify intent.

include/linux/amba/serial.h gets included from arch/arm/include/debug/pl01x.S.
Avoid includes and macro tricks for the four defines that are involved:
UART01x_DR, UART01x_FR, UART01x_FR_TXFF and UART01x_FR_BUSY.

Reviewed-by: Linus Walleij <linus.walleij@linaro.org>
Reviewed-by: Ilpo Järvinen <ilpo.jarvinen@linux.intel.com>
Signed-off-by: Théo Lebrun <theo.lebrun@bootlin.com>
Link: https://lore.kernel.org/r/20231207-mbly-uart-v6-1-e384afa5e78c@bootlin.com
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 include/linux/amba/serial.h | 251 +++++++++++++++++++++++---------------------
 1 file changed, 130 insertions(+), 121 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/amba/serial.h b/include/linux/amba/serial.h
index 27003ec52114..9120de05ead0 100644
--- a/include/linux/amba/serial.h
+++ b/include/linux/amba/serial.h
@@ -10,6 +10,11 @@
 #ifndef ASM_ARM_HARDWARE_SERIAL_AMBA_H
 #define ASM_ARM_HARDWARE_SERIAL_AMBA_H
 
+#ifndef __ASSEMBLY__
+#include <linux/bitfield.h>
+#include <linux/bits.h>
+#endif
+
 #include <linux/types.h>
 
 /* -------------------------------------------------------------------------------
@@ -70,138 +75,142 @@
 #define ZX_UART011_ICR		0x4c
 #define ZX_UART011_DMACR	0x50
 
-#define UART011_DR_OE		(1 << 11)
-#define UART011_DR_BE		(1 << 10)
-#define UART011_DR_PE		(1 << 9)
-#define UART011_DR_FE		(1 << 8)
-
-#define UART01x_RSR_OE		0x08
-#define UART01x_RSR_BE		0x04
-#define UART01x_RSR_PE		0x02
-#define UART01x_RSR_FE		0x01
-
-#define UART011_FR_RI		0x100
-#define UART011_FR_TXFE		0x080
-#define UART011_FR_RXFF		0x040
-#define UART01x_FR_TXFF		0x020
-#define UART01x_FR_RXFE		0x010
-#define UART01x_FR_BUSY		0x008
-#define UART01x_FR_DCD		0x004
-#define UART01x_FR_DSR		0x002
-#define UART01x_FR_CTS		0x001
+#define UART011_DR_OE		BIT(11)
+#define UART011_DR_BE		BIT(10)
+#define UART011_DR_PE		BIT(9)
+#define UART011_DR_FE		BIT(8)
+
+#define UART01x_RSR_OE		BIT(3)
+#define UART01x_RSR_BE		BIT(2)
+#define UART01x_RSR_PE		BIT(1)
+#define UART01x_RSR_FE		BIT(0)
+
+#define UART011_FR_RI		BIT(8)
+#define UART011_FR_TXFE		BIT(7)
+#define UART011_FR_RXFF		BIT(6)
+#define UART01x_FR_TXFF		(1 << 5)	/* used in ASM */
+#define UART01x_FR_RXFE		BIT(4)
+#define UART01x_FR_BUSY		(1 << 3)	/* used in ASM */
+#define UART01x_FR_DCD		BIT(2)
+#define UART01x_FR_DSR		BIT(1)
+#define UART01x_FR_CTS		BIT(0)
 #define UART01x_FR_TMSK		(UART01x_FR_TXFF + UART01x_FR_BUSY)
 
 /*
  * Some bits of Flag Register on ZTE device have different position from
  * standard ones.
  */
-#define ZX_UART01x_FR_BUSY	0x100
-#define ZX_UART01x_FR_DSR	0x008
-#define ZX_UART01x_FR_CTS	0x002
-#define ZX_UART011_FR_RI	0x001
-
-#define UART011_CR_CTSEN	0x8000	/* CTS hardware flow control */
-#define UART011_CR_RTSEN	0x4000	/* RTS hardware flow control */
-#define UART011_CR_OUT2		0x2000	/* OUT2 */
-#define UART011_CR_OUT1		0x1000	/* OUT1 */
-#define UART011_CR_RTS		0x0800	/* RTS */
-#define UART011_CR_DTR		0x0400	/* DTR */
-#define UART011_CR_RXE		0x0200	/* receive enable */
-#define UART011_CR_TXE		0x0100	/* transmit enable */
-#define UART011_CR_LBE		0x0080	/* loopback enable */
-#define UART010_CR_RTIE		0x0040
-#define UART010_CR_TIE		0x0020
-#define UART010_CR_RIE		0x0010
-#define UART010_CR_MSIE		0x0008
-#define ST_UART011_CR_OVSFACT	0x0008	/* Oversampling factor */
-#define UART01x_CR_IIRLP	0x0004	/* SIR low power mode */
-#define UART01x_CR_SIREN	0x0002	/* SIR enable */
-#define UART01x_CR_UARTEN	0x0001	/* UART enable */
-
-#define UART011_LCRH_SPS	0x80
+#define ZX_UART01x_FR_BUSY	BIT(8)
+#define ZX_UART01x_FR_DSR	BIT(3)
+#define ZX_UART01x_FR_CTS	BIT(1)
+#define ZX_UART011_FR_RI	BIT(0)
+
+#define UART011_CR_CTSEN	BIT(15)	/* CTS hardware flow control */
+#define UART011_CR_RTSEN	BIT(14)	/* RTS hardware flow control */
+#define UART011_CR_OUT2		BIT(13)	/* OUT2 */
+#define UART011_CR_OUT1		BIT(12)	/* OUT1 */
+#define UART011_CR_RTS		BIT(11)	/* RTS */
+#define UART011_CR_DTR		BIT(10)	/* DTR */
+#define UART011_CR_RXE		BIT(9)	/* receive enable */
+#define UART011_CR_TXE		BIT(8)	/* transmit enable */
+#define UART011_CR_LBE		BIT(7)	/* loopback enable */
+#define UART010_CR_RTIE		BIT(6)
+#define UART010_CR_TIE		BIT(5)
+#define UART010_CR_RIE		BIT(4)
+#define UART010_CR_MSIE		BIT(3)
+#define ST_UART011_CR_OVSFACT	BIT(3)	/* Oversampling factor */
+#define UART01x_CR_IIRLP	BIT(2)	/* SIR low power mode */
+#define UART01x_CR_SIREN	BIT(1)	/* SIR enable */
+#define UART01x_CR_UARTEN	BIT(0)	/* UART enable */
+
+#define UART011_LCRH_SPS	BIT(7)
 #define UART01x_LCRH_WLEN_8	0x60
 #define UART01x_LCRH_WLEN_7	0x40
 #define UART01x_LCRH_WLEN_6	0x20
 #define UART01x_LCRH_WLEN_5	0x00
-#define UART01x_LCRH_FEN	0x10
-#define UART01x_LCRH_STP2	0x08
-#define UART01x_LCRH_EPS	0x04
-#define UART01x_LCRH_PEN	0x02
-#define UART01x_LCRH_BRK	0x01
-
-#define ST_UART011_DMAWM_RX_1	(0 << 3)
-#define ST_UART011_DMAWM_RX_2	(1 << 3)
-#define ST_UART011_DMAWM_RX_4	(2 << 3)
-#define ST_UART011_DMAWM_RX_8	(3 << 3)
-#define ST_UART011_DMAWM_RX_16	(4 << 3)
-#define ST_UART011_DMAWM_RX_32	(5 << 3)
-#define ST_UART011_DMAWM_RX_48	(6 << 3)
-#define ST_UART011_DMAWM_TX_1	0
-#define ST_UART011_DMAWM_TX_2	1
-#define ST_UART011_DMAWM_TX_4	2
-#define ST_UART011_DMAWM_TX_8	3
-#define ST_UART011_DMAWM_TX_16	4
-#define ST_UART011_DMAWM_TX_32	5
-#define ST_UART011_DMAWM_TX_48	6
-
-#define UART010_IIR_RTIS	0x08
-#define UART010_IIR_TIS		0x04
-#define UART010_IIR_RIS		0x02
-#define UART010_IIR_MIS		0x01
-
-#define UART011_IFLS_RX1_8	(0 << 3)
-#define UART011_IFLS_RX2_8	(1 << 3)
-#define UART011_IFLS_RX4_8	(2 << 3)
-#define UART011_IFLS_RX6_8	(3 << 3)
-#define UART011_IFLS_RX7_8	(4 << 3)
-#define UART011_IFLS_TX1_8	(0 << 0)
-#define UART011_IFLS_TX2_8	(1 << 0)
-#define UART011_IFLS_TX4_8	(2 << 0)
-#define UART011_IFLS_TX6_8	(3 << 0)
-#define UART011_IFLS_TX7_8	(4 << 0)
+#define UART01x_LCRH_FEN	BIT(4)
+#define UART01x_LCRH_STP2	BIT(3)
+#define UART01x_LCRH_EPS	BIT(2)
+#define UART01x_LCRH_PEN	BIT(1)
+#define UART01x_LCRH_BRK	BIT(0)
+
+#define ST_UART011_DMAWM_RX	GENMASK(5, 3)
+#define ST_UART011_DMAWM_RX_1	FIELD_PREP_CONST(ST_UART011_DMAWM_RX, 0)
+#define ST_UART011_DMAWM_RX_2	FIELD_PREP_CONST(ST_UART011_DMAWM_RX, 1)
+#define ST_UART011_DMAWM_RX_4	FIELD_PREP_CONST(ST_UART011_DMAWM_RX, 2)
+#define ST_UART011_DMAWM_RX_8	FIELD_PREP_CONST(ST_UART011_DMAWM_RX, 3)
+#define ST_UART011_DMAWM_RX_16	FIELD_PREP_CONST(ST_UART011_DMAWM_RX, 4)
+#define ST_UART011_DMAWM_RX_32	FIELD_PREP_CONST(ST_UART011_DMAWM_RX, 5)
+#define ST_UART011_DMAWM_RX_48	FIELD_PREP_CONST(ST_UART011_DMAWM_RX, 6)
+#define ST_UART011_DMAWM_TX	GENMASK(2, 0)
+#define ST_UART011_DMAWM_TX_1	FIELD_PREP_CONST(ST_UART011_DMAWM_TX, 0)
+#define ST_UART011_DMAWM_TX_2	FIELD_PREP_CONST(ST_UART011_DMAWM_TX, 1)
+#define ST_UART011_DMAWM_TX_4	FIELD_PREP_CONST(ST_UART011_DMAWM_TX, 2)
+#define ST_UART011_DMAWM_TX_8	FIELD_PREP_CONST(ST_UART011_DMAWM_TX, 3)
+#define ST_UART011_DMAWM_TX_16	FIELD_PREP_CONST(ST_UART011_DMAWM_TX, 4)
+#define ST_UART011_DMAWM_TX_32	FIELD_PREP_CONST(ST_UART011_DMAWM_TX, 5)
+#define ST_UART011_DMAWM_TX_48	FIELD_PREP_CONST(ST_UART011_DMAWM_TX, 6)
+
+#define UART010_IIR_RTIS	BIT(3)
+#define UART010_IIR_TIS		BIT(2)
+#define UART010_IIR_RIS		BIT(1)
+#define UART010_IIR_MIS		BIT(0)
+
+#define UART011_IFLS_RXIFLSEL	GENMASK(5, 3)
+#define UART011_IFLS_RX1_8	FIELD_PREP_CONST(UART011_IFLS_RXIFLSEL, 0)
+#define UART011_IFLS_RX2_8	FIELD_PREP_CONST(UART011_IFLS_RXIFLSEL, 1)
+#define UART011_IFLS_RX4_8	FIELD_PREP_CONST(UART011_IFLS_RXIFLSEL, 2)
+#define UART011_IFLS_RX6_8	FIELD_PREP_CONST(UART011_IFLS_RXIFLSEL, 3)
+#define UART011_IFLS_RX7_8	FIELD_PREP_CONST(UART011_IFLS_RXIFLSEL, 4)
+#define UART011_IFLS_TXIFLSEL	GENMASK(2, 0)
+#define UART011_IFLS_TX1_8	FIELD_PREP_CONST(UART011_IFLS_TXIFLSEL, 0)
+#define UART011_IFLS_TX2_8	FIELD_PREP_CONST(UART011_IFLS_TXIFLSEL, 1)
+#define UART011_IFLS_TX4_8	FIELD_PREP_CONST(UART011_IFLS_TXIFLSEL, 2)
+#define UART011_IFLS_TX6_8	FIELD_PREP_CONST(UART011_IFLS_TXIFLSEL, 3)
+#define UART011_IFLS_TX7_8	FIELD_PREP_CONST(UART011_IFLS_TXIFLSEL, 4)
 /* special values for ST vendor with deeper fifo */
-#define UART011_IFLS_RX_HALF	(5 << 3)
-#define UART011_IFLS_TX_HALF	(5 << 0)
-
-#define UART011_OEIM		(1 << 10)	/* overrun error interrupt mask */
-#define UART011_BEIM		(1 << 9)	/* break error interrupt mask */
-#define UART011_PEIM		(1 << 8)	/* parity error interrupt mask */
-#define UART011_FEIM		(1 << 7)	/* framing error interrupt mask */
-#define UART011_RTIM		(1 << 6)	/* receive timeout interrupt mask */
-#define UART011_TXIM		(1 << 5)	/* transmit interrupt mask */
-#define UART011_RXIM		(1 << 4)	/* receive interrupt mask */
-#define UART011_DSRMIM		(1 << 3)	/* DSR interrupt mask */
-#define UART011_DCDMIM		(1 << 2)	/* DCD interrupt mask */
-#define UART011_CTSMIM		(1 << 1)	/* CTS interrupt mask */
-#define UART011_RIMIM		(1 << 0)	/* RI interrupt mask */
-
-#define UART011_OEIS		(1 << 10)	/* overrun error interrupt status */
-#define UART011_BEIS		(1 << 9)	/* break error interrupt status */
-#define UART011_PEIS		(1 << 8)	/* parity error interrupt status */
-#define UART011_FEIS		(1 << 7)	/* framing error interrupt status */
-#define UART011_RTIS		(1 << 6)	/* receive timeout interrupt status */
-#define UART011_TXIS		(1 << 5)	/* transmit interrupt status */
-#define UART011_RXIS		(1 << 4)	/* receive interrupt status */
-#define UART011_DSRMIS		(1 << 3)	/* DSR interrupt status */
-#define UART011_DCDMIS		(1 << 2)	/* DCD interrupt status */
-#define UART011_CTSMIS		(1 << 1)	/* CTS interrupt status */
-#define UART011_RIMIS		(1 << 0)	/* RI interrupt status */
-
-#define UART011_OEIC		(1 << 10)	/* overrun error interrupt clear */
-#define UART011_BEIC		(1 << 9)	/* break error interrupt clear */
-#define UART011_PEIC		(1 << 8)	/* parity error interrupt clear */
-#define UART011_FEIC		(1 << 7)	/* framing error interrupt clear */
-#define UART011_RTIC		(1 << 6)	/* receive timeout interrupt clear */
-#define UART011_TXIC		(1 << 5)	/* transmit interrupt clear */
-#define UART011_RXIC		(1 << 4)	/* receive interrupt clear */
-#define UART011_DSRMIC		(1 << 3)	/* DSR interrupt clear */
-#define UART011_DCDMIC		(1 << 2)	/* DCD interrupt clear */
-#define UART011_CTSMIC		(1 << 1)	/* CTS interrupt clear */
-#define UART011_RIMIC		(1 << 0)	/* RI interrupt clear */
-
-#define UART011_DMAONERR	(1 << 2)	/* disable dma on error */
-#define UART011_TXDMAE		(1 << 1)	/* enable transmit dma */
-#define UART011_RXDMAE		(1 << 0)	/* enable receive dma */
+#define UART011_IFLS_RX_HALF	FIELD_PREP_CONST(UART011_IFLS_RXIFLSEL, 5)
+#define UART011_IFLS_TX_HALF	FIELD_PREP_CONST(UART011_IFLS_TXIFLSEL, 5)
+
+#define UART011_OEIM		BIT(10)	/* overrun error interrupt mask */
+#define UART011_BEIM		BIT(9)	/* break error interrupt mask */
+#define UART011_PEIM		BIT(8)	/* parity error interrupt mask */
+#define UART011_FEIM		BIT(7)	/* framing error interrupt mask */
+#define UART011_RTIM		BIT(6)	/* receive timeout interrupt mask */
+#define UART011_TXIM		BIT(5)	/* transmit interrupt mask */
+#define UART011_RXIM		BIT(4)	/* receive interrupt mask */
+#define UART011_DSRMIM		BIT(3)	/* DSR interrupt mask */
+#define UART011_DCDMIM		BIT(2)	/* DCD interrupt mask */
+#define UART011_CTSMIM		BIT(1)	/* CTS interrupt mask */
+#define UART011_RIMIM		BIT(0)	/* RI interrupt mask */
+
+#define UART011_OEIS		BIT(10)	/* overrun error interrupt status */
+#define UART011_BEIS		BIT(9)	/* break error interrupt status */
+#define UART011_PEIS		BIT(8)	/* parity error interrupt status */
+#define UART011_FEIS		BIT(7)	/* framing error interrupt status */
+#define UART011_RTIS		BIT(6)	/* receive timeout interrupt status */
+#define UART011_TXIS		BIT(5)	/* transmit interrupt status */
+#define UART011_RXIS		BIT(4)	/* receive interrupt status */
+#define UART011_DSRMIS		BIT(3)	/* DSR interrupt status */
+#define UART011_DCDMIS		BIT(2)	/* DCD interrupt status */
+#define UART011_CTSMIS		BIT(1)	/* CTS interrupt status */
+#define UART011_RIMIS		BIT(0)	/* RI interrupt status */
+
+#define UART011_OEIC		BIT(10)	/* overrun error interrupt clear */
+#define UART011_BEIC		BIT(9)	/* break error interrupt clear */
+#define UART011_PEIC		BIT(8)	/* parity error interrupt clear */
+#define UART011_FEIC		BIT(7)	/* framing error interrupt clear */
+#define UART011_RTIC		BIT(6)	/* receive timeout interrupt clear */
+#define UART011_TXIC		BIT(5)	/* transmit interrupt clear */
+#define UART011_RXIC		BIT(4)	/* receive interrupt clear */
+#define UART011_DSRMIC		BIT(3)	/* DSR interrupt clear */
+#define UART011_DCDMIC		BIT(2)	/* DCD interrupt clear */
+#define UART011_CTSMIC		BIT(1)	/* CTS interrupt clear */
+#define UART011_RIMIC		BIT(0)	/* RI interrupt clear */
+
+#define UART011_DMAONERR	BIT(2)	/* disable dma on error */
+#define UART011_TXDMAE		BIT(1)	/* enable transmit dma */
+#define UART011_RXDMAE		BIT(0)	/* enable receive dma */
 
 #define UART01x_RSR_ANY		(UART01x_RSR_OE | UART01x_RSR_BE | UART01x_RSR_PE | UART01x_RSR_FE)
 #define UART01x_FR_MODEM_ANY	(UART01x_FR_DCD | UART01x_FR_DSR | UART01x_FR_CTS)
-- 
cgit v1.2.3


From 0c734c5ea76e333fbb8dd83b5bab46291b38096b Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Thu, 14 Dec 2023 11:08:15 -0700
Subject: block: improve struct request_queue layout

It's clearly been a while since someone looked at this, so I gave it a
quick shot. There are few issues in here:

- Random bundling of members that are mostly read-only and often written
- Random holes that need not be there

This moves the most frequently used bits into cacheline 1 and 2, with
the 2nd one being more write intensive than the first one, which is
basically read-only.

Outside of making this work a bit more efficiently, it also reduces the
size of struct request_queue for my test setup from 864 bytes (spanning
14 cachelines!) to 832 bytes and 13 cachelines.

Reviewed-by: Christoph Hellwig <hch@lst.de>
Link: https://lore.kernel.org/r/d2b7b61c-4868-45c0-9060-4f9c73de9d7e@kernel.dk
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/blkdev.h | 89 ++++++++++++++++++++++++++------------------------
 1 file changed, 46 insertions(+), 43 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 17c0a7d0d319..185ed3770e3a 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -367,59 +367,51 @@ struct blk_independent_access_ranges {
 };
 
 struct request_queue {
-	struct request		*last_merge;
-	struct elevator_queue	*elevator;
-
-	struct percpu_ref	q_usage_counter;
+	/*
+	 * The queue owner gets to use this for whatever they like.
+	 * ll_rw_blk doesn't touch it.
+	 */
+	void			*queuedata;
 
-	struct blk_queue_stats	*stats;
-	struct rq_qos		*rq_qos;
-	struct mutex		rq_qos_mutex;
+	struct elevator_queue	*elevator;
 
 	const struct blk_mq_ops	*mq_ops;
 
 	/* sw queues */
 	struct blk_mq_ctx __percpu	*queue_ctx;
 
+	/*
+	 * various queue flags, see QUEUE_* below
+	 */
+	unsigned long		queue_flags;
+
+	unsigned int		rq_timeout;
+
 	unsigned int		queue_depth;
 
+	refcount_t		refs;
+
 	/* hw dispatch queues */
-	struct xarray		hctx_table;
 	unsigned int		nr_hw_queues;
+	struct xarray		hctx_table;
 
-	/*
-	 * The queue owner gets to use this for whatever they like.
-	 * ll_rw_blk doesn't touch it.
-	 */
-	void			*queuedata;
-
-	/*
-	 * various queue flags, see QUEUE_* below
-	 */
-	unsigned long		queue_flags;
-	/*
-	 * Number of contexts that have called blk_set_pm_only(). If this
-	 * counter is above zero then only RQF_PM requests are processed.
-	 */
-	atomic_t		pm_only;
+	struct percpu_ref	q_usage_counter;
 
-	/*
-	 * ida allocated id for this queue.  Used to index queues from
-	 * ioctx.
-	 */
-	int			id;
+	struct request		*last_merge;
 
 	spinlock_t		queue_lock;
 
-	struct gendisk		*disk;
+	int			quiesce_depth;
 
-	refcount_t		refs;
+	struct gendisk		*disk;
 
 	/*
 	 * mq queue kobject
 	 */
 	struct kobject *mq_kobj;
 
+	struct queue_limits	limits;
+
 #ifdef  CONFIG_BLK_DEV_INTEGRITY
 	struct blk_integrity integrity;
 #endif	/* CONFIG_BLK_DEV_INTEGRITY */
@@ -430,24 +422,40 @@ struct request_queue {
 #endif
 
 	/*
-	 * queue settings
+	 * Number of contexts that have called blk_set_pm_only(). If this
+	 * counter is above zero then only RQF_PM requests are processed.
 	 */
-	unsigned long		nr_requests;	/* Max # of requests */
+	atomic_t		pm_only;
+
+	struct blk_queue_stats	*stats;
+	struct rq_qos		*rq_qos;
+	struct mutex		rq_qos_mutex;
+
+	/*
+	 * ida allocated id for this queue.  Used to index queues from
+	 * ioctx.
+	 */
+	int			id;
 
 	unsigned int		dma_pad_mask;
 
+	/*
+	 * queue settings
+	 */
+	unsigned long		nr_requests;	/* Max # of requests */
+
 #ifdef CONFIG_BLK_INLINE_ENCRYPTION
 	struct blk_crypto_profile *crypto_profile;
 	struct kobject *crypto_kobject;
 #endif
 
-	unsigned int		rq_timeout;
-
 	struct timer_list	timeout;
 	struct work_struct	timeout_work;
 
 	atomic_t		nr_active_requests_shared_tags;
 
+	unsigned int		required_elevator_features;
+
 	struct blk_mq_tags	*sched_shared_tags;
 
 	struct list_head	icq_list;
@@ -458,11 +466,12 @@ struct request_queue {
 	struct mutex		blkcg_mutex;
 #endif
 
-	struct queue_limits	limits;
+	int			node;
 
-	unsigned int		required_elevator_features;
+	spinlock_t		requeue_lock;
+	struct list_head	requeue_list;
+	struct delayed_work	requeue_work;
 
-	int			node;
 #ifdef CONFIG_BLK_DEV_IO_TRACE
 	struct blk_trace __rcu	*blk_trace;
 #endif
@@ -472,10 +481,6 @@ struct request_queue {
 	struct blk_flush_queue	*fq;
 	struct list_head	flush_list;
 
-	struct list_head	requeue_list;
-	spinlock_t		requeue_lock;
-	struct delayed_work	requeue_work;
-
 	struct mutex		sysfs_lock;
 	struct mutex		sysfs_dir_lock;
 
@@ -500,8 +505,6 @@ struct request_queue {
 	 */
 	struct mutex		mq_freeze_lock;
 
-	int			quiesce_depth;
-
 	struct blk_mq_tag_set	*tag_set;
 	struct list_head	tag_set_list;
 
-- 
cgit v1.2.3


From 826a5d8c9df9605fb4fdefa45432f95580241a1f Mon Sep 17 00:00:00 2001
From: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Date: Wed, 25 Oct 2023 21:42:57 +0300
Subject: device property: Implement device_is_big_endian()

Some users want to use the struct device pointer to see if the
device is big endian in terms of Open Firmware specifications,
i.e. if it has a "big-endian" property, or if the kernel was
compiled for BE *and* the device has a "native-endian" property.

Provide inline helper for the users.

Signed-off-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Acked-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Reviewed-by: Linus Walleij <linus.walleij@linaro.org>
Link: https://lore.kernel.org/r/20231025184259.250588-2-andriy.shevchenko@linux.intel.com
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 include/linux/property.h | 26 ++++++++++++++++++++++++++
 1 file changed, 26 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/property.h b/include/linux/property.h
index 9f2585d705a8..55c2692ffa8c 100644
--- a/include/linux/property.h
+++ b/include/linux/property.h
@@ -80,12 +80,38 @@ int fwnode_property_match_string(const struct fwnode_handle *fwnode,
 
 bool fwnode_device_is_available(const struct fwnode_handle *fwnode);
 
+static inline bool fwnode_device_is_big_endian(const struct fwnode_handle *fwnode)
+{
+	if (fwnode_property_present(fwnode, "big-endian"))
+		return true;
+	if (IS_ENABLED(CONFIG_CPU_BIG_ENDIAN) &&
+	    fwnode_property_present(fwnode, "native-endian"))
+		return true;
+	return false;
+}
+
 static inline
 bool fwnode_device_is_compatible(const struct fwnode_handle *fwnode, const char *compat)
 {
 	return fwnode_property_match_string(fwnode, "compatible", compat) >= 0;
 }
 
+/**
+ * device_is_big_endian - check if a device has BE registers
+ * @dev: Pointer to the struct device
+ *
+ * Returns: true if the device has a "big-endian" property, or if the kernel
+ * was compiled for BE *and* the device has a "native-endian" property.
+ * Returns false otherwise.
+ *
+ * Callers would nominally use ioread32be/iowrite32be if
+ * device_is_big_endian() == true, or readl/writel otherwise.
+ */
+static inline bool device_is_big_endian(const struct device *dev)
+{
+	return fwnode_device_is_big_endian(dev_fwnode(dev));
+}
+
 /**
  * device_is_compatible - match 'compatible' property of the device with a given string
  * @dev: Pointer to the struct device
-- 
cgit v1.2.3


From c27dfca4555bf74dd7dd7161d8ef2790ec1c7283 Mon Sep 17 00:00:00 2001
From: Ricky Wu <ricky_wu@realtek.com>
Date: Fri, 8 Dec 2023 11:21:43 +0800
Subject: misc: rtsx: add to support new card reader rts5264 new definition and
 function

in order to support NEW chip rts5264, the definitions of some internal
registers are define in new file rts5264.h, and some callback functions
and the workflow for rts5264 are define in new file rts5264.c

also add rts5264.o to Makefile

Signed-off-by: Ricky Wu <ricky_wu@realtek.com>
Link: https://lore.kernel.org/r/20231208032145.2143580-2-ricky_wu@realtek.com
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/misc/cardreader/Makefile  |   2 +-
 drivers/misc/cardreader/rts5264.c | 886 ++++++++++++++++++++++++++++++++++++++
 drivers/misc/cardreader/rts5264.h | 278 ++++++++++++
 include/linux/rtsx_pci.h          |   8 +
 4 files changed, 1173 insertions(+), 1 deletion(-)
 create mode 100644 drivers/misc/cardreader/rts5264.c
 create mode 100644 drivers/misc/cardreader/rts5264.h

(limited to 'include/linux')

diff --git a/drivers/misc/cardreader/Makefile b/drivers/misc/cardreader/Makefile
index 895128475d83..1e1bca6b0b22 100644
--- a/drivers/misc/cardreader/Makefile
+++ b/drivers/misc/cardreader/Makefile
@@ -1,5 +1,5 @@
 # SPDX-License-Identifier: GPL-2.0-only
 obj-$(CONFIG_MISC_ALCOR_PCI)	+= alcor_pci.o
 obj-$(CONFIG_MISC_RTSX_PCI)	+= rtsx_pci.o
-rtsx_pci-objs := rtsx_pcr.o rts5209.o rts5229.o rtl8411.o rts5227.o rts5249.o rts5260.o rts5261.o rts5228.o
+rtsx_pci-objs := rtsx_pcr.o rts5209.o rts5229.o rtl8411.o rts5227.o rts5249.o rts5260.o rts5261.o rts5228.o rts5264.o
 obj-$(CONFIG_MISC_RTSX_USB)	+= rtsx_usb.o
diff --git a/drivers/misc/cardreader/rts5264.c b/drivers/misc/cardreader/rts5264.c
new file mode 100644
index 000000000000..8be4ed7d9d47
--- /dev/null
+++ b/drivers/misc/cardreader/rts5264.c
@@ -0,0 +1,886 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/* Driver for Realtek PCI-Express card reader
+ *
+ * Copyright(c) 2009-2013 Realtek Semiconductor Corp. All rights reserved.
+ *
+ * Author:
+ *   Ricky Wu <ricky_wu@realtek.com>
+ */
+
+#include <linux/module.h>
+#include <linux/delay.h>
+#include <linux/rtsx_pci.h>
+
+#include "rts5264.h"
+#include "rtsx_pcr.h"
+
+static u8 rts5264_get_ic_version(struct rtsx_pcr *pcr)
+{
+	u8 val;
+
+	rtsx_pci_read_register(pcr, DUMMY_REG_RESET_0, &val);
+	return val & 0x0F;
+}
+
+static void rts5264_fill_driving(struct rtsx_pcr *pcr, u8 voltage)
+{
+	u8 driving_3v3[4][3] = {
+		{0x88, 0x88, 0x88},
+		{0x77, 0x77, 0x77},
+		{0x99, 0x99, 0x99},
+		{0x66, 0x66, 0x66},
+	};
+	u8 driving_1v8[4][3] = {
+		{0x99, 0x99, 0x99},
+		{0x77, 0x77, 0x77},
+		{0xBB, 0xBB, 0xBB},
+		{0x65, 0x65, 0x65},
+	};
+	u8 (*driving)[3], drive_sel;
+
+	if (voltage == OUTPUT_3V3) {
+		driving = driving_3v3;
+		drive_sel = pcr->sd30_drive_sel_3v3;
+	} else {
+		driving = driving_1v8;
+		drive_sel = pcr->sd30_drive_sel_1v8;
+	}
+
+	rtsx_pci_write_register(pcr, SD30_CLK_DRIVE_SEL,
+			0xFF, driving[drive_sel][0]);
+	rtsx_pci_write_register(pcr, SD30_CMD_DRIVE_SEL,
+			0xFF, driving[drive_sel][1]);
+	rtsx_pci_write_register(pcr, SD30_DAT_DRIVE_SEL,
+			0xFF, driving[drive_sel][2]);
+}
+
+static void rts5264_force_power_down(struct rtsx_pcr *pcr, u8 pm_state, bool runtime)
+{
+	/* Set relink_time to 0 */
+	rtsx_pci_write_register(pcr, AUTOLOAD_CFG_BASE + 1, MASK_8_BIT_DEF, 0);
+	rtsx_pci_write_register(pcr, AUTOLOAD_CFG_BASE + 2, MASK_8_BIT_DEF, 0);
+	rtsx_pci_write_register(pcr, AUTOLOAD_CFG_BASE + 3,
+				RELINK_TIME_MASK, 0);
+
+	if (pm_state == HOST_ENTER_S3)
+		rtsx_pci_write_register(pcr, pcr->reg_pm_ctrl3,
+					D3_DELINK_MODE_EN, D3_DELINK_MODE_EN);
+
+	if (!runtime) {
+		rtsx_pci_write_register(pcr, RTS5264_AUTOLOAD_CFG1,
+				CD_RESUME_EN_MASK, 0);
+		rtsx_pci_write_register(pcr, pcr->reg_pm_ctrl3, 0x01, 0x00);
+		rtsx_pci_write_register(pcr, RTS5264_REG_PME_FORCE_CTL,
+				FORCE_PM_CONTROL | FORCE_PM_VALUE, FORCE_PM_CONTROL);
+	} else {
+		rtsx_pci_write_register(pcr, RTS5264_REG_PME_FORCE_CTL,
+				FORCE_PM_CONTROL | FORCE_PM_VALUE, 0);
+		rtsx_pci_write_register(pcr, pcr->reg_pm_ctrl3, 0x01, 0x01);
+		rtsx_pci_write_register(pcr, pcr->reg_pm_ctrl3,
+					D3_DELINK_MODE_EN, 0);
+		rtsx_pci_write_register(pcr, RTS5264_FW_CTL,
+				RTS5264_INFORM_RTD3_COLD, RTS5264_INFORM_RTD3_COLD);
+		rtsx_pci_write_register(pcr, RTS5264_AUTOLOAD_CFG4,
+				RTS5264_FORCE_PRSNT_LOW, RTS5264_FORCE_PRSNT_LOW);
+	}
+
+	rtsx_pci_write_register(pcr, RTS5264_REG_FPDCTL,
+		SSC_POWER_DOWN, SSC_POWER_DOWN);
+}
+
+static int rts5264_enable_auto_blink(struct rtsx_pcr *pcr)
+{
+	return rtsx_pci_write_register(pcr, OLT_LED_CTL,
+		LED_SHINE_MASK, LED_SHINE_EN);
+}
+
+static int rts5264_disable_auto_blink(struct rtsx_pcr *pcr)
+{
+	return rtsx_pci_write_register(pcr, OLT_LED_CTL,
+		LED_SHINE_MASK, LED_SHINE_DISABLE);
+}
+
+static int rts5264_turn_on_led(struct rtsx_pcr *pcr)
+{
+	return rtsx_pci_write_register(pcr, GPIO_CTL,
+		0x02, 0x02);
+}
+
+static int rts5264_turn_off_led(struct rtsx_pcr *pcr)
+{
+	return rtsx_pci_write_register(pcr, GPIO_CTL,
+		0x02, 0x00);
+}
+
+/* SD Pull Control Enable:
+ *     SD_DAT[3:0] ==> pull up
+ *     SD_CD       ==> pull up
+ *     SD_WP       ==> pull up
+ *     SD_CMD      ==> pull up
+ *     SD_CLK      ==> pull down
+ */
+static const u32 rts5264_sd_pull_ctl_enable_tbl[] = {
+	RTSX_REG_PAIR(CARD_PULL_CTL2, 0xAA),
+	RTSX_REG_PAIR(CARD_PULL_CTL3, 0xE9),
+	0,
+};
+
+/* SD Pull Control Disable:
+ *     SD_DAT[3:0] ==> pull down
+ *     SD_CD       ==> pull up
+ *     SD_WP       ==> pull down
+ *     SD_CMD      ==> pull down
+ *     SD_CLK      ==> pull down
+ */
+static const u32 rts5264_sd_pull_ctl_disable_tbl[] = {
+	RTSX_REG_PAIR(CARD_PULL_CTL2, 0x55),
+	RTSX_REG_PAIR(CARD_PULL_CTL3, 0xD5),
+	0,
+};
+
+static int rts5264_sd_set_sample_push_timing_sd30(struct rtsx_pcr *pcr)
+{
+	rtsx_pci_write_register(pcr, SD_CFG1, SD_MODE_SELECT_MASK
+		| SD_ASYNC_FIFO_NOT_RST, SD_30_MODE | SD_ASYNC_FIFO_NOT_RST);
+	rtsx_pci_write_register(pcr, CLK_CTL, CLK_LOW_FREQ, CLK_LOW_FREQ);
+	rtsx_pci_write_register(pcr, CARD_CLK_SOURCE, 0xFF,
+			CRC_VAR_CLK0 | SD30_FIX_CLK | SAMPLE_VAR_CLK1);
+	rtsx_pci_write_register(pcr, CLK_CTL, CLK_LOW_FREQ, 0);
+
+	return 0;
+}
+
+static int rts5264_card_power_on(struct rtsx_pcr *pcr, int card)
+{
+	struct rtsx_cr_option *option = &pcr->option;
+
+	if (option->ocp_en)
+		rtsx_pci_enable_ocp(pcr);
+
+	rtsx_pci_write_register(pcr, REG_CRC_DUMMY_0,
+		CFG_SD_POW_AUTO_PD, CFG_SD_POW_AUTO_PD);
+
+	rtsx_pci_write_register(pcr, RTS5264_LDO1_CFG1,
+			RTS5264_LDO1_TUNE_MASK, RTS5264_LDO1_33);
+	rtsx_pci_write_register(pcr, RTS5264_LDO1233318_POW_CTL,
+			RTS5264_LDO1_POWERON, RTS5264_LDO1_POWERON);
+	rtsx_pci_write_register(pcr, RTS5264_LDO1233318_POW_CTL,
+			RTS5264_LDO3318_POWERON, RTS5264_LDO3318_POWERON);
+
+	msleep(20);
+
+	rtsx_pci_write_register(pcr, CARD_OE, SD_OUTPUT_EN, SD_OUTPUT_EN);
+
+	/* Initialize SD_CFG1 register */
+	rtsx_pci_write_register(pcr, SD_CFG1, 0xFF,
+			SD_CLK_DIVIDE_128 | SD_20_MODE | SD_BUS_WIDTH_1BIT);
+	rtsx_pci_write_register(pcr, SD_SAMPLE_POINT_CTL,
+			0xFF, SD20_RX_POS_EDGE);
+	rtsx_pci_write_register(pcr, SD_PUSH_POINT_CTL, 0xFF, 0);
+	rtsx_pci_write_register(pcr, CARD_STOP, SD_STOP | SD_CLR_ERR,
+			SD_STOP | SD_CLR_ERR);
+
+	/* Reset SD_CFG3 register */
+	rtsx_pci_write_register(pcr, SD_CFG3, SD30_CLK_END_EN, 0);
+	rtsx_pci_write_register(pcr, REG_SD_STOP_SDCLK_CFG,
+			SD30_CLK_STOP_CFG_EN | SD30_CLK_STOP_CFG1 |
+			SD30_CLK_STOP_CFG0, 0);
+
+	if (pcr->extra_caps & EXTRA_CAPS_SD_SDR50 ||
+	    pcr->extra_caps & EXTRA_CAPS_SD_SDR104)
+		rts5264_sd_set_sample_push_timing_sd30(pcr);
+
+	return 0;
+}
+
+static int rts5264_switch_output_voltage(struct rtsx_pcr *pcr, u8 voltage)
+{
+	rtsx_pci_write_register(pcr, RTS5264_CARD_PWR_CTL,
+			RTS5264_PUPDC, RTS5264_PUPDC);
+
+	switch (voltage) {
+	case OUTPUT_3V3:
+		rtsx_pci_write_register(pcr, RTS5264_LDO1233318_POW_CTL,
+				RTS5264_TUNE_REF_LDO3318, RTS5264_TUNE_REF_LDO3318);
+		rtsx_pci_write_register(pcr, RTS5264_DV3318_CFG,
+				RTS5264_DV3318_TUNE_MASK, RTS5264_DV3318_33);
+		rtsx_pci_write_register(pcr, SD_PAD_CTL,
+				SD_IO_USING_1V8, 0);
+		break;
+	case OUTPUT_1V8:
+		rtsx_pci_write_register(pcr, RTS5264_LDO1233318_POW_CTL,
+				RTS5264_TUNE_REF_LDO3318, RTS5264_TUNE_REF_LDO3318_DFT);
+		rtsx_pci_write_register(pcr, RTS5264_DV3318_CFG,
+				RTS5264_DV3318_TUNE_MASK, RTS5264_DV3318_18);
+		rtsx_pci_write_register(pcr, SD_PAD_CTL,
+				SD_IO_USING_1V8, SD_IO_USING_1V8);
+		break;
+	default:
+		return -EINVAL;
+	}
+
+	/* set pad drive */
+	rts5264_fill_driving(pcr, voltage);
+
+	return 0;
+}
+
+static void rts5264_stop_cmd(struct rtsx_pcr *pcr)
+{
+	rtsx_pci_writel(pcr, RTSX_HCBCTLR, STOP_CMD);
+	rtsx_pci_writel(pcr, RTSX_HDBCTLR, STOP_DMA);
+	rtsx_pci_write_register(pcr, DMACTL, DMA_RST, DMA_RST);
+	rtsx_pci_write_register(pcr, RBCTL, RB_FLUSH, RB_FLUSH);
+}
+
+static void rts5264_card_before_power_off(struct rtsx_pcr *pcr)
+{
+	rts5264_stop_cmd(pcr);
+	rts5264_switch_output_voltage(pcr, OUTPUT_3V3);
+}
+
+static int rts5264_card_power_off(struct rtsx_pcr *pcr, int card)
+{
+	int err = 0;
+
+	rts5264_card_before_power_off(pcr);
+	err = rtsx_pci_write_register(pcr, RTS5264_LDO1233318_POW_CTL,
+				RTS5264_LDO_POWERON_MASK, 0);
+
+	rtsx_pci_write_register(pcr, REG_CRC_DUMMY_0,
+		CFG_SD_POW_AUTO_PD, 0);
+	if (pcr->option.ocp_en)
+		rtsx_pci_disable_ocp(pcr);
+
+	return err;
+}
+
+static void rts5264_enable_ocp(struct rtsx_pcr *pcr)
+{
+	u8 mask = 0;
+	u8 val = 0;
+
+	rtsx_pci_write_register(pcr, RTS5264_LDO1_CFG0,
+			RTS5264_LDO1_OCP_EN | RTS5264_LDO1_OCP_LMT_EN,
+			RTS5264_LDO1_OCP_EN | RTS5264_LDO1_OCP_LMT_EN);
+	rtsx_pci_write_register(pcr, RTS5264_LDO2_CFG0,
+			RTS5264_LDO2_OCP_EN | RTS5264_LDO2_OCP_LMT_EN,
+			RTS5264_LDO2_OCP_EN | RTS5264_LDO2_OCP_LMT_EN);
+	rtsx_pci_write_register(pcr, RTS5264_LDO3_CFG0,
+			RTS5264_LDO3_OCP_EN | RTS5264_LDO3_OCP_LMT_EN,
+			RTS5264_LDO3_OCP_EN | RTS5264_LDO3_OCP_LMT_EN);
+	rtsx_pci_write_register(pcr, RTS5264_OVP_DET,
+			RTS5264_POW_VDET, RTS5264_POW_VDET);
+
+	mask = SD_OCP_INT_EN | SD_DETECT_EN;
+	mask |= SDVIO_OCP_INT_EN | SDVIO_DETECT_EN;
+	val = mask;
+	rtsx_pci_write_register(pcr, REG_OCPCTL, mask, val);
+
+	mask = SD_VDD3_OCP_INT_EN | SD_VDD3_DETECT_EN;
+	val = mask;
+	rtsx_pci_write_register(pcr, RTS5264_OCP_VDD3_CTL, mask, val);
+
+	mask = RTS5264_OVP_INT_EN | RTS5264_OVP_DETECT_EN;
+	val = mask;
+	rtsx_pci_write_register(pcr, RTS5264_OVP_CTL, mask, val);
+}
+
+static void rts5264_disable_ocp(struct rtsx_pcr *pcr)
+{
+	u8 mask = 0;
+
+	mask = SD_OCP_INT_EN | SD_DETECT_EN;
+	mask |= SDVIO_OCP_INT_EN | SDVIO_DETECT_EN;
+	rtsx_pci_write_register(pcr, REG_OCPCTL, mask, 0);
+
+	mask = SD_VDD3_OCP_INT_EN | SD_VDD3_DETECT_EN;
+	rtsx_pci_write_register(pcr, RTS5264_OCP_VDD3_CTL, mask, 0);
+
+	mask = RTS5264_OVP_INT_EN | RTS5264_OVP_DETECT_EN;
+	rtsx_pci_write_register(pcr, RTS5264_OVP_CTL, mask, 0);
+
+	rtsx_pci_write_register(pcr, RTS5264_LDO1_CFG0,
+			RTS5264_LDO1_OCP_EN | RTS5264_LDO1_OCP_LMT_EN, 0);
+	rtsx_pci_write_register(pcr, RTS5264_LDO2_CFG0,
+			RTS5264_LDO2_OCP_EN | RTS5264_LDO2_OCP_LMT_EN, 0);
+	rtsx_pci_write_register(pcr, RTS5264_LDO3_CFG0,
+			RTS5264_LDO3_OCP_EN | RTS5264_LDO3_OCP_LMT_EN, 0);
+	rtsx_pci_write_register(pcr, RTS5264_OVP_DET, RTS5264_POW_VDET, 0);
+}
+
+static void rts5264_init_ocp(struct rtsx_pcr *pcr)
+{
+	struct rtsx_cr_option *option = &pcr->option;
+
+	if (option->ocp_en) {
+		u8 mask, val;
+
+		rtsx_pci_write_register(pcr, RTS5264_LDO1_CFG0,
+			RTS5264_LDO1_OCP_THD_MASK, option->sd_800mA_ocp_thd);
+		rtsx_pci_write_register(pcr, RTS5264_LDO1_CFG0,
+			RTS5264_LDO1_OCP_LMT_THD_MASK,
+			RTS5264_LDO1_LMT_THD_2000);
+
+		rtsx_pci_write_register(pcr, RTS5264_LDO2_CFG0,
+			RTS5264_LDO2_OCP_THD_MASK, RTS5264_LDO2_OCP_THD_950);
+		rtsx_pci_write_register(pcr, RTS5264_LDO2_CFG0,
+			RTS5264_LDO2_OCP_LMT_THD_MASK,
+			RTS5264_LDO2_LMT_THD_2000);
+
+		rtsx_pci_write_register(pcr, RTS5264_LDO3_CFG0,
+			RTS5264_LDO3_OCP_THD_MASK, RTS5264_LDO3_OCP_THD_710);
+		rtsx_pci_write_register(pcr, RTS5264_LDO3_CFG0,
+			RTS5264_LDO3_OCP_LMT_THD_MASK,
+			RTS5264_LDO3_LMT_THD_1500);
+
+		rtsx_pci_write_register(pcr, RTS5264_OVP_DET,
+			RTS5264_TUNE_VROV_MASK, RTS5264_TUNE_VROV_1V6);
+
+		mask = SD_OCP_GLITCH_MASK | SDVIO_OCP_GLITCH_MASK;
+		val = pcr->hw_param.ocp_glitch;
+		rtsx_pci_write_register(pcr, REG_OCPGLITCH, mask, val);
+
+	} else {
+		rtsx_pci_write_register(pcr, RTS5264_LDO1_CFG0,
+			RTS5264_LDO1_OCP_EN | RTS5264_LDO1_OCP_LMT_EN, 0);
+		rtsx_pci_write_register(pcr, RTS5264_LDO2_CFG0,
+			RTS5264_LDO2_OCP_EN | RTS5264_LDO2_OCP_LMT_EN, 0);
+		rtsx_pci_write_register(pcr, RTS5264_LDO3_CFG0,
+			RTS5264_LDO3_OCP_EN | RTS5264_LDO3_OCP_LMT_EN, 0);
+		rtsx_pci_write_register(pcr, RTS5264_OVP_DET,
+			RTS5264_POW_VDET, 0);
+	}
+}
+
+static int rts5264_get_ocpstat2(struct rtsx_pcr *pcr, u8 *val)
+{
+	return rtsx_pci_read_register(pcr, RTS5264_OCP_VDD3_STS, val);
+}
+
+static int rts5264_get_ovpstat(struct rtsx_pcr *pcr, u8 *val)
+{
+	return rtsx_pci_read_register(pcr, RTS5264_OVP_STS, val);
+}
+
+static void rts5264_clear_ocpstat(struct rtsx_pcr *pcr)
+{
+	u8 mask = 0;
+	u8 val = 0;
+
+	mask = SD_OCP_INT_CLR | SD_OC_CLR;
+	mask |= SDVIO_OCP_INT_CLR | SDVIO_OC_CLR;
+	val = mask;
+	rtsx_pci_write_register(pcr, REG_OCPCTL, mask, val);
+	rtsx_pci_write_register(pcr, RTS5264_OCP_VDD3_CTL,
+		SD_VDD3_OCP_INT_CLR | SD_VDD3_OC_CLR,
+		SD_VDD3_OCP_INT_CLR | SD_VDD3_OC_CLR);
+	rtsx_pci_write_register(pcr, RTS5264_OVP_CTL,
+		RTS5264_OVP_INT_CLR | RTS5264_OVP_CLR,
+		RTS5264_OVP_INT_CLR | RTS5264_OVP_CLR);
+
+	udelay(1000);
+
+	rtsx_pci_write_register(pcr, REG_OCPCTL, mask, 0);
+	rtsx_pci_write_register(pcr, RTS5264_OCP_VDD3_CTL,
+		SD_VDD3_OCP_INT_CLR | SD_VDD3_OC_CLR, 0);
+	rtsx_pci_write_register(pcr, RTS5264_OVP_CTL,
+		RTS5264_OVP_INT_CLR | RTS5264_OVP_CLR, 0);
+}
+
+static void rts5264_process_ocp(struct rtsx_pcr *pcr)
+{
+	if (!pcr->option.ocp_en)
+		return;
+
+	rtsx_pci_get_ocpstat(pcr, &pcr->ocp_stat);
+	rts5264_get_ocpstat2(pcr, &pcr->ocp_stat2);
+	rts5264_get_ovpstat(pcr, &pcr->ovp_stat);
+
+	if ((pcr->ocp_stat & (SD_OC_NOW | SD_OC_EVER | SDVIO_OC_NOW | SDVIO_OC_EVER)) ||
+			(pcr->ocp_stat2 & (SD_VDD3_OC_NOW | SD_VDD3_OC_EVER)) ||
+			(pcr->ovp_stat & (RTS5264_OVP_NOW | RTS5264_OVP_EVER))) {
+		rts5264_clear_ocpstat(pcr);
+		rts5264_card_power_off(pcr, RTSX_SD_CARD);
+		rtsx_pci_write_register(pcr, CARD_OE, SD_OUTPUT_EN, 0);
+		pcr->ocp_stat = 0;
+		pcr->ocp_stat2 = 0;
+		pcr->ovp_stat = 0;
+	}
+}
+
+static void rts5264_init_from_hw(struct rtsx_pcr *pcr)
+{
+	struct pci_dev *pdev = pcr->pci;
+	u32 lval1, lval2, i;
+	u16 setting_reg1, setting_reg2;
+	u8 valid, efuse_valid, tmp;
+
+	rtsx_pci_write_register(pcr, RTS5264_REG_PME_FORCE_CTL,
+		REG_EFUSE_POR | REG_EFUSE_POWER_MASK,
+		REG_EFUSE_POR | REG_EFUSE_POWERON);
+	udelay(1);
+	rtsx_pci_write_register(pcr, RTS5264_EFUSE_ADDR,
+		RTS5264_EFUSE_ADDR_MASK, 0x00);
+	rtsx_pci_write_register(pcr, RTS5264_EFUSE_CTL,
+		RTS5264_EFUSE_ENABLE | RTS5264_EFUSE_MODE_MASK,
+		RTS5264_EFUSE_ENABLE);
+
+	/* Wait transfer end */
+	for (i = 0; i < MAX_RW_REG_CNT; i++) {
+		rtsx_pci_read_register(pcr, RTS5264_EFUSE_CTL, &tmp);
+		if ((tmp & 0x80) == 0)
+			break;
+	}
+	rtsx_pci_read_register(pcr, RTS5264_EFUSE_READ_DATA, &tmp);
+	efuse_valid = ((tmp & 0x0C) >> 2);
+	pcr_dbg(pcr, "Load efuse valid: 0x%x\n", efuse_valid);
+
+	pci_read_config_dword(pdev, PCR_SETTING_REG2, &lval2);
+	pcr_dbg(pcr, "Cfg 0x%x: 0x%x\n", PCR_SETTING_REG2, lval2);
+	/* 0x816 */
+	valid = (u8)((lval2 >> 16) & 0x03);
+
+	rtsx_pci_write_register(pcr, RTS5264_REG_PME_FORCE_CTL,
+		REG_EFUSE_POR, 0);
+	pcr_dbg(pcr, "Disable efuse por!\n");
+
+	if (efuse_valid == 2 || efuse_valid == 3) {
+		if (valid == 3) {
+			/* Bypass efuse */
+			setting_reg1 = PCR_SETTING_REG1;
+			setting_reg2 = PCR_SETTING_REG2;
+		} else {
+			/* Use efuse data */
+			setting_reg1 = PCR_SETTING_REG4;
+			setting_reg2 = PCR_SETTING_REG5;
+		}
+	} else if (efuse_valid == 0) {
+		// default
+		setting_reg1 = PCR_SETTING_REG1;
+		setting_reg2 = PCR_SETTING_REG2;
+	} else {
+		return;
+	}
+
+	pci_read_config_dword(pdev, setting_reg2, &lval2);
+	pcr_dbg(pcr, "Cfg 0x%x: 0x%x\n", setting_reg2, lval2);
+
+	if (!rts5264_vendor_setting_valid(lval2)) {
+		pcr_dbg(pcr, "skip fetch vendor setting\n");
+		return;
+	}
+
+	pcr->rtd3_en = rts5264_reg_to_rtd3(lval2);
+
+	if (rts5264_reg_check_reverse_socket(lval2))
+		pcr->flags |= PCR_REVERSE_SOCKET;
+
+	pci_read_config_dword(pdev, setting_reg1, &lval1);
+	pcr_dbg(pcr, "Cfg 0x%x: 0x%x\n", setting_reg1, lval1);
+
+	pcr->aspm_en = rts5264_reg_to_aspm(lval1);
+	pcr->sd30_drive_sel_1v8 = rts5264_reg_to_sd30_drive_sel_1v8(lval1);
+	pcr->sd30_drive_sel_3v3 = rts5264_reg_to_sd30_drive_sel_3v3(lval1);
+
+	if (setting_reg1 == PCR_SETTING_REG1) {
+		/* store setting */
+		rtsx_pci_write_register(pcr, 0xFF0C, 0xFF, (u8)(lval1 & 0xFF));
+		rtsx_pci_write_register(pcr, 0xFF0D, 0xFF, (u8)((lval1 >> 8) & 0xFF));
+		rtsx_pci_write_register(pcr, 0xFF0E, 0xFF, (u8)((lval1 >> 16) & 0xFF));
+		rtsx_pci_write_register(pcr, 0xFF0F, 0xFF, (u8)((lval1 >> 24) & 0xFF));
+		rtsx_pci_write_register(pcr, 0xFF10, 0xFF, (u8)(lval2 & 0xFF));
+		rtsx_pci_write_register(pcr, 0xFF11, 0xFF, (u8)((lval2 >> 8) & 0xFF));
+		rtsx_pci_write_register(pcr, 0xFF12, 0xFF, (u8)((lval2 >> 16) & 0xFF));
+
+		pci_write_config_dword(pdev, PCR_SETTING_REG4, lval1);
+		lval2 = lval2 & 0x00FFFFFF;
+		pci_write_config_dword(pdev, PCR_SETTING_REG5, lval2);
+	}
+}
+
+static void rts5264_init_from_cfg(struct rtsx_pcr *pcr)
+{
+	struct rtsx_cr_option *option = &pcr->option;
+
+	if (rtsx_check_dev_flag(pcr, ASPM_L1_1_EN | ASPM_L1_2_EN
+				| PM_L1_1_EN | PM_L1_2_EN))
+		rtsx_pci_disable_oobs_polling(pcr);
+	else
+		rtsx_pci_enable_oobs_polling(pcr);
+
+	rtsx_pci_write_register(pcr, ASPM_FORCE_CTL, 0xFF, 0);
+
+	if (option->ltr_en) {
+		if (option->ltr_enabled)
+			rtsx_set_ltr_latency(pcr, option->ltr_active_latency);
+	}
+}
+
+static int rts5264_extra_init_hw(struct rtsx_pcr *pcr)
+{
+	struct rtsx_cr_option *option = &pcr->option;
+
+	rtsx_pci_write_register(pcr, RTS5264_AUTOLOAD_CFG1,
+			CD_RESUME_EN_MASK, CD_RESUME_EN_MASK);
+	rtsx_pci_write_register(pcr, REG_VREF, PWD_SUSPND_EN, PWD_SUSPND_EN);
+
+	rts5264_init_from_cfg(pcr);
+	rts5264_init_from_hw(pcr);
+
+	/* power off efuse */
+	rtsx_pci_write_register(pcr, RTS5264_REG_PME_FORCE_CTL,
+			REG_EFUSE_POWER_MASK, REG_EFUSE_POWEROFF);
+	rtsx_pci_write_register(pcr, RTS5264_AUTOLOAD_CFG2,
+			RTS5264_CHIP_RST_N_SEL, 0);
+	rtsx_pci_write_register(pcr, RTS5264_REG_LDO12_CFG,
+			RTS5264_LDO12_SR_MASK, RTS5264_LDO12_SR_0_0_MS);
+	rtsx_pci_write_register(pcr, CDGW, 0xFF, 0x01);
+	rtsx_pci_write_register(pcr, RTS5264_CKMUX_MBIAS_PWR,
+			RTS5264_POW_CKMUX, RTS5264_POW_CKMUX);
+	rtsx_pci_write_register(pcr, RTS5264_CMD_OE_START_EARLY,
+			RTS5264_CMD_OE_EARLY_EN | RTS5264_CMD_OE_EARLY_CYCLE_MASK,
+			RTS5264_CMD_OE_EARLY_EN);
+	rtsx_pci_write_register(pcr, RTS5264_DAT_OE_START_EARLY,
+			RTS5264_DAT_OE_EARLY_EN | RTS5264_DAT_OE_EARLY_CYCLE_MASK,
+			RTS5264_DAT_OE_EARLY_EN);
+	rtsx_pci_write_register(pcr, SSC_DIV_N_0, 0xFF, 0x5D);
+
+	rtsx_pci_write_register(pcr, RTS5264_PWR_CUT,
+			RTS5264_CFG_MEM_PD, RTS5264_CFG_MEM_PD);
+	rtsx_pci_write_register(pcr, L1SUB_CONFIG1,
+			AUX_CLK_ACTIVE_SEL_MASK, MAC_CKSW_DONE);
+	rtsx_pci_write_register(pcr, L1SUB_CONFIG3, 0xFF, 0);
+	rtsx_pci_write_register(pcr, RTS5264_AUTOLOAD_CFG4,
+			RTS5264_AUX_CLK_16M_EN, 0);
+
+	/* Release PRSNT# */
+	rtsx_pci_write_register(pcr, RTS5264_AUTOLOAD_CFG4,
+			RTS5264_FORCE_PRSNT_LOW, 0);
+	rtsx_pci_write_register(pcr, PCLK_CTL,
+			PCLK_MODE_SEL, PCLK_MODE_SEL);
+
+	/* LED shine disabled, set initial shine cycle period */
+	rtsx_pci_write_register(pcr, OLT_LED_CTL, 0x0F, 0x02);
+
+	/* Configure driving */
+	rts5264_fill_driving(pcr, OUTPUT_3V3);
+
+	if (pcr->flags & PCR_REVERSE_SOCKET)
+		rtsx_pci_write_register(pcr, PETXCFG, 0x30, 0x30);
+	else
+		rtsx_pci_write_register(pcr, PETXCFG, 0x30, 0x00);
+
+	/*
+	 * If u_force_clkreq_0 is enabled, CLKREQ# PIN will be forced
+	 * to drive low, and we forcibly request clock.
+	 */
+	if (option->force_clkreq_0)
+		rtsx_pci_write_register(pcr, PETXCFG,
+				 FORCE_CLKREQ_DELINK_MASK, FORCE_CLKREQ_LOW);
+	else
+		rtsx_pci_write_register(pcr, PETXCFG,
+				 FORCE_CLKREQ_DELINK_MASK, FORCE_CLKREQ_HIGH);
+
+	rtsx_pci_write_register(pcr, PWD_SUSPEND_EN, 0xFF, 0xFF);
+	rtsx_pci_write_register(pcr, RBCTL, U_AUTO_DMA_EN_MASK, 0);
+	rtsx_pci_write_register(pcr, RTS5264_AUTOLOAD_CFG4,
+			RTS5264_F_HIGH_RC_MASK, RTS5264_F_HIGH_RC_400K);
+
+	if (pcr->rtd3_en) {
+		rtsx_pci_write_register(pcr, pcr->reg_pm_ctrl3, 0x01, 0x00);
+		rtsx_pci_write_register(pcr, RTS5264_REG_PME_FORCE_CTL,
+				FORCE_PM_CONTROL | FORCE_PM_VALUE, 0);
+	} else {
+		rtsx_pci_write_register(pcr, pcr->reg_pm_ctrl3, 0x01, 0x00);
+		rtsx_pci_write_register(pcr, RTS5264_REG_PME_FORCE_CTL,
+				FORCE_PM_CONTROL | FORCE_PM_VALUE, FORCE_PM_CONTROL);
+	}
+	rtsx_pci_write_register(pcr, pcr->reg_pm_ctrl3, D3_DELINK_MODE_EN, 0x00);
+
+	/* Clear Enter RTD3_cold Information*/
+	rtsx_pci_write_register(pcr, RTS5264_FW_CTL,
+		RTS5264_INFORM_RTD3_COLD, 0);
+
+	return 0;
+}
+
+static void rts5264_enable_aspm(struct rtsx_pcr *pcr, bool enable)
+{
+	u8 val = FORCE_ASPM_CTL0 | FORCE_ASPM_CTL1;
+	u8 mask = FORCE_ASPM_VAL_MASK | FORCE_ASPM_CTL0 | FORCE_ASPM_CTL1;
+
+	if (pcr->aspm_enabled == enable)
+		return;
+
+	val |= (pcr->aspm_en & 0x02);
+	rtsx_pci_write_register(pcr, ASPM_FORCE_CTL, mask, val);
+	pcie_capability_clear_and_set_word(pcr->pci, PCI_EXP_LNKCTL,
+					   PCI_EXP_LNKCTL_ASPMC, pcr->aspm_en);
+	pcr->aspm_enabled = enable;
+}
+
+static void rts5264_disable_aspm(struct rtsx_pcr *pcr, bool enable)
+{
+	u8 val = FORCE_ASPM_CTL0 | FORCE_ASPM_CTL1;
+	u8 mask = FORCE_ASPM_VAL_MASK | FORCE_ASPM_CTL0 | FORCE_ASPM_CTL1;
+
+	if (pcr->aspm_enabled == enable)
+		return;
+
+	pcie_capability_clear_and_set_word(pcr->pci, PCI_EXP_LNKCTL,
+					   PCI_EXP_LNKCTL_ASPMC, 0);
+	rtsx_pci_write_register(pcr, ASPM_FORCE_CTL, mask, val);
+	rtsx_pci_write_register(pcr, SD_CFG1, SD_ASYNC_FIFO_NOT_RST, 0);
+	udelay(10);
+	pcr->aspm_enabled = enable;
+}
+
+static void rts5264_set_aspm(struct rtsx_pcr *pcr, bool enable)
+{
+	if (enable)
+		rts5264_enable_aspm(pcr, true);
+	else
+		rts5264_disable_aspm(pcr, false);
+}
+
+static void rts5264_set_l1off_cfg_sub_d0(struct rtsx_pcr *pcr, int active)
+{
+	struct rtsx_cr_option *option = &(pcr->option);
+
+	u32 interrupt = rtsx_pci_readl(pcr, RTSX_BIPR);
+	int card_exist = (interrupt & SD_EXIST);
+	int aspm_L1_1, aspm_L1_2;
+	u8 val = 0;
+
+	aspm_L1_1 = rtsx_check_dev_flag(pcr, ASPM_L1_1_EN);
+	aspm_L1_2 = rtsx_check_dev_flag(pcr, ASPM_L1_2_EN);
+
+	if (active) {
+		/* Run, latency: 60us */
+		if (aspm_L1_1)
+			val = option->ltr_l1off_snooze_sspwrgate;
+	} else {
+		/* L1off, latency: 300us */
+		if (aspm_L1_2)
+			val = option->ltr_l1off_sspwrgate;
+	}
+
+	if (aspm_L1_1 || aspm_L1_2) {
+		if (rtsx_check_dev_flag(pcr,
+					LTR_L1SS_PWR_GATE_CHECK_CARD_EN)) {
+			if (card_exist)
+				val &= ~L1OFF_MBIAS2_EN_5250;
+			else
+				val |= L1OFF_MBIAS2_EN_5250;
+		}
+	}
+	rtsx_set_l1off_sub(pcr, val);
+}
+
+static const struct pcr_ops rts5264_pcr_ops = {
+	.turn_on_led = rts5264_turn_on_led,
+	.turn_off_led = rts5264_turn_off_led,
+	.extra_init_hw = rts5264_extra_init_hw,
+	.enable_auto_blink = rts5264_enable_auto_blink,
+	.disable_auto_blink = rts5264_disable_auto_blink,
+	.card_power_on = rts5264_card_power_on,
+	.card_power_off = rts5264_card_power_off,
+	.switch_output_voltage = rts5264_switch_output_voltage,
+	.force_power_down = rts5264_force_power_down,
+	.stop_cmd = rts5264_stop_cmd,
+	.set_aspm = rts5264_set_aspm,
+	.set_l1off_cfg_sub_d0 = rts5264_set_l1off_cfg_sub_d0,
+	.enable_ocp = rts5264_enable_ocp,
+	.disable_ocp = rts5264_disable_ocp,
+	.init_ocp = rts5264_init_ocp,
+	.process_ocp = rts5264_process_ocp,
+	.clear_ocpstat = rts5264_clear_ocpstat,
+};
+
+static inline u8 double_ssc_depth(u8 depth)
+{
+	return ((depth > 1) ? (depth - 1) : depth);
+}
+
+int rts5264_pci_switch_clock(struct rtsx_pcr *pcr, unsigned int card_clock,
+		u8 ssc_depth, bool initial_mode, bool double_clk, bool vpclk)
+{
+	int err, clk;
+	u16 n;
+	u8 clk_divider, mcu_cnt, div;
+	static const u8 depth[] = {
+		[RTSX_SSC_DEPTH_4M] = RTS5264_SSC_DEPTH_4M,
+		[RTSX_SSC_DEPTH_2M] = RTS5264_SSC_DEPTH_2M,
+		[RTSX_SSC_DEPTH_1M] = RTS5264_SSC_DEPTH_1M,
+		[RTSX_SSC_DEPTH_500K] = RTS5264_SSC_DEPTH_512K,
+	};
+
+	if (initial_mode) {
+		/* We use 250k(around) here, in initial stage */
+		clk_divider = SD_CLK_DIVIDE_128;
+		card_clock = 30000000;
+	} else {
+		clk_divider = SD_CLK_DIVIDE_0;
+	}
+	err = rtsx_pci_write_register(pcr, SD_CFG1,
+			SD_CLK_DIVIDE_MASK, clk_divider);
+	if (err < 0)
+		return err;
+
+	card_clock /= 1000000;
+	pcr_dbg(pcr, "Switch card clock to %dMHz\n", card_clock);
+
+	clk = card_clock;
+	if (!initial_mode && double_clk)
+		clk = card_clock * 2;
+	pcr_dbg(pcr, "Internal SSC clock: %dMHz (cur_clock = %d)\n",
+		clk, pcr->cur_clock);
+
+	if (clk == pcr->cur_clock)
+		return 0;
+
+	if (pcr->ops->conv_clk_and_div_n)
+		n = pcr->ops->conv_clk_and_div_n(clk, CLK_TO_DIV_N);
+	else
+		n = clk - 4;
+	if ((clk <= 4) || (n > 396))
+		return -EINVAL;
+
+	mcu_cnt = 125/clk + 3;
+	if (mcu_cnt > 15)
+		mcu_cnt = 15;
+
+	div = CLK_DIV_1;
+	while ((n < MIN_DIV_N_PCR - 4) && (div < CLK_DIV_8)) {
+		if (pcr->ops->conv_clk_and_div_n) {
+			int dbl_clk = pcr->ops->conv_clk_and_div_n(n,
+					DIV_N_TO_CLK) * 2;
+			n = pcr->ops->conv_clk_and_div_n(dbl_clk,
+					CLK_TO_DIV_N);
+		} else {
+			n = (n + 4) * 2 - 4;
+		}
+		div++;
+	}
+
+	n = (n / 2) - 1;
+	pcr_dbg(pcr, "n = %d, div = %d\n", n, div);
+
+	ssc_depth = depth[ssc_depth];
+	if (double_clk)
+		ssc_depth = double_ssc_depth(ssc_depth);
+
+	if (ssc_depth) {
+		if (div == CLK_DIV_2) {
+			if (ssc_depth > 1)
+				ssc_depth -= 1;
+			else
+				ssc_depth = RTS5264_SSC_DEPTH_8M;
+		} else if (div == CLK_DIV_4) {
+			if (ssc_depth > 2)
+				ssc_depth -= 2;
+			else
+				ssc_depth = RTS5264_SSC_DEPTH_8M;
+		} else if (div == CLK_DIV_8) {
+			if (ssc_depth > 3)
+				ssc_depth -= 3;
+			else
+				ssc_depth = RTS5264_SSC_DEPTH_8M;
+		}
+	} else {
+		ssc_depth = 0;
+	}
+	pcr_dbg(pcr, "ssc_depth = %d\n", ssc_depth);
+
+	rtsx_pci_init_cmd(pcr);
+	rtsx_pci_add_cmd(pcr, WRITE_REG_CMD, CLK_CTL,
+				CHANGE_CLK, CHANGE_CLK);
+	rtsx_pci_add_cmd(pcr, WRITE_REG_CMD, CLK_DIV,
+			0xFF, (div << 4) | mcu_cnt);
+	rtsx_pci_add_cmd(pcr, WRITE_REG_CMD, SSC_CTL1, SSC_RSTB, 0);
+	rtsx_pci_add_cmd(pcr, WRITE_REG_CMD, SSC_CTL2,
+			SSC_DEPTH_MASK, ssc_depth);
+	rtsx_pci_add_cmd(pcr, WRITE_REG_CMD, SSC_DIV_N_0, 0xFF, n);
+
+	if (is_version(pcr, 0x5264, IC_VER_A)) {
+		rtsx_pci_add_cmd(pcr, WRITE_REG_CMD, SSC_CTL1, SSC_RSTB, 0);
+		rtsx_pci_add_cmd(pcr, WRITE_REG_CMD, RTS5264_CARD_CLK_SRC2,
+			RTS5264_REG_BIG_KVCO_A, 0);
+	} else {
+		rtsx_pci_add_cmd(pcr, WRITE_REG_CMD, SSC_CTL1, SSC_RSTB, SSC_RSTB);
+		rtsx_pci_add_cmd(pcr, WRITE_REG_CMD, RTS5264_SYS_DUMMY_1,
+			RTS5264_REG_BIG_KVCO, 0);
+	}
+
+	if (vpclk) {
+		rtsx_pci_add_cmd(pcr, WRITE_REG_CMD, SD_VPCLK0_CTL,
+				PHASE_NOT_RESET, 0);
+		rtsx_pci_add_cmd(pcr, WRITE_REG_CMD, SD_VPCLK1_CTL,
+				PHASE_NOT_RESET, 0);
+		rtsx_pci_add_cmd(pcr, WRITE_REG_CMD, SD_VPCLK0_CTL,
+				PHASE_NOT_RESET, PHASE_NOT_RESET);
+		rtsx_pci_add_cmd(pcr, WRITE_REG_CMD, SD_VPCLK1_CTL,
+				PHASE_NOT_RESET, PHASE_NOT_RESET);
+	}
+
+	err = rtsx_pci_send_cmd(pcr, 2000);
+	if (err < 0)
+		return err;
+
+	/* Wait SSC clock stable */
+	udelay(SSC_CLOCK_STABLE_WAIT);
+	err = rtsx_pci_write_register(pcr, CLK_CTL, CHANGE_CLK, 0);
+	if (err < 0)
+		return err;
+
+	pcr->cur_clock = clk;
+	return 0;
+}
+
+void rts5264_init_params(struct rtsx_pcr *pcr)
+{
+	struct rtsx_cr_option *option = &pcr->option;
+	struct rtsx_hw_param *hw_param = &pcr->hw_param;
+	u8 val;
+
+	pcr->extra_caps = EXTRA_CAPS_SD_SDR50 | EXTRA_CAPS_SD_SDR104;
+	pcr->extra_caps |= EXTRA_CAPS_NO_MMC;
+	rtsx_pci_read_register(pcr, RTS5264_FW_STATUS, &val);
+	if (!(val & RTS5264_EXPRESS_LINK_FAIL_MASK))
+		pcr->extra_caps |= EXTRA_CAPS_SD_EXPRESS;
+	pcr->num_slots = 1;
+	pcr->ops = &rts5264_pcr_ops;
+
+	pcr->flags = 0;
+	pcr->card_drive_sel = RTSX_CARD_DRIVE_DEFAULT;
+	pcr->sd30_drive_sel_1v8 = 0x00;
+	pcr->sd30_drive_sel_3v3 = 0x00;
+	pcr->aspm_en = ASPM_L1_EN;
+	pcr->aspm_mode = ASPM_MODE_REG;
+	pcr->tx_initial_phase = SET_CLOCK_PHASE(24, 24, 11);
+	pcr->rx_initial_phase = SET_CLOCK_PHASE(24, 6, 5);
+
+	pcr->ic_version = rts5264_get_ic_version(pcr);
+	pcr->sd_pull_ctl_enable_tbl = rts5264_sd_pull_ctl_enable_tbl;
+	pcr->sd_pull_ctl_disable_tbl = rts5264_sd_pull_ctl_disable_tbl;
+
+	pcr->reg_pm_ctrl3 = RTS5264_AUTOLOAD_CFG3;
+
+	option->dev_flags = (LTR_L1SS_PWR_GATE_CHECK_CARD_EN
+				| LTR_L1SS_PWR_GATE_EN);
+	option->ltr_en = true;
+
+	/* init latency of active, idle, L1OFF to 60us, 300us, 3ms */
+	option->ltr_active_latency = LTR_ACTIVE_LATENCY_DEF;
+	option->ltr_idle_latency = LTR_IDLE_LATENCY_DEF;
+	option->ltr_l1off_latency = LTR_L1OFF_LATENCY_DEF;
+	option->l1_snooze_delay = L1_SNOOZE_DELAY_DEF;
+	option->ltr_l1off_sspwrgate = 0x7F;
+	option->ltr_l1off_snooze_sspwrgate = 0x78;
+
+	option->ocp_en = 1;
+	hw_param->interrupt_en |= (SD_OC_INT_EN | SD_OVP_INT_EN);
+	hw_param->ocp_glitch =  SD_OCP_GLITCH_800U | SDVIO_OCP_GLITCH_800U;
+	option->sd_800mA_ocp_thd =  RTS5264_LDO1_OCP_THD_1150;
+}
diff --git a/drivers/misc/cardreader/rts5264.h b/drivers/misc/cardreader/rts5264.h
new file mode 100644
index 000000000000..e3cbbf2fe1a4
--- /dev/null
+++ b/drivers/misc/cardreader/rts5264.h
@@ -0,0 +1,278 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/* Driver for Realtek PCI-Express card reader
+ *
+ * Copyright(c) 2018-2019 Realtek Semiconductor Corp. All rights reserved.
+ *
+ * Author:
+ *   Ricky Wu <ricky_wu@realtek.com>
+ */
+#ifndef RTS5264_H
+#define RTS5264_H
+
+/*New add*/
+#define rts5264_vendor_setting_valid(reg)	((reg) & 0x010000)
+#define rts5264_reg_to_aspm(reg) \
+	(((~(reg) >> 28) & 0x02) | (((reg) >> 28) & 0x01))
+#define rts5264_reg_check_reverse_socket(reg)	((reg) & 0x04)
+#define rts5264_reg_to_sd30_drive_sel_1v8(reg)	(((reg) >> 22) & 0x03)
+#define rts5264_reg_to_sd30_drive_sel_3v3(reg)	(((reg) >> 16) & 0x03)
+#define rts5264_reg_to_rtd3(reg)		((reg) & 0x08)
+
+#define RTS5264_AUTOLOAD_CFG0		0xFF7B
+#define RTS5264_AUTOLOAD_CFG1		0xFF7C
+#define RTS5264_AUTOLOAD_CFG3		0xFF7E
+#define RTS5264_AUTOLOAD_CFG4		0xFF7F
+#define RTS5264_FORCE_PRSNT_LOW		(1 << 6)
+#define RTS5264_AUX_CLK_16M_EN		(1 << 5)
+#define RTS5264_F_HIGH_RC_MASK		(1 << 4)
+#define RTS5264_F_HIGH_RC_1_6M		(1 << 4)
+#define RTS5264_F_HIGH_RC_400K		(0 << 4)
+
+/* SSC_CTL2 0xFC12 */
+#define RTS5264_SSC_DEPTH_MASK		0x07
+#define RTS5264_SSC_DEPTH_DISALBE	0x00
+#define RTS5264_SSC_DEPTH_8M		0x01
+#define RTS5264_SSC_DEPTH_4M		0x02
+#define RTS5264_SSC_DEPTH_2M		0x03
+#define RTS5264_SSC_DEPTH_1M		0x04
+#define RTS5264_SSC_DEPTH_512K		0x05
+#define RTS5264_SSC_DEPTH_256K		0x06
+#define RTS5264_SSC_DEPTH_128K		0x07
+
+#define RTS5264_CARD_CLK_SRC2		0xFC2F
+#define RTS5264_REG_BIG_KVCO_A		0x20
+
+/* efuse control register*/
+#define RTS5264_EFUSE_CTL		0xFC30
+#define RTS5264_EFUSE_ENABLE		0x80
+/* EFUSE_MODE: 0=READ 1=PROGRAM */
+#define RTS5264_EFUSE_MODE_MASK		0x40
+#define RTS5264_EFUSE_PROGRAM		0x40
+
+#define RTS5264_EFUSE_ADDR		0xFC31
+#define	RTS5264_EFUSE_ADDR_MASK		0x3F
+
+#define RTS5264_EFUSE_WRITE_DATA	0xFC32
+#define RTS5264_EFUSE_READ_DATA		0xFC34
+
+#define RTS5264_SYS_DUMMY_1		0xFC35
+#define RTS5264_REG_BIG_KVCO		0x04
+
+/* DMACTL 0xFE2C */
+#define RTS5264_DMA_PACK_SIZE_MASK	0x70
+
+#define RTS5264_FW_CFG1			0xFF55
+#define RTS5264_SYS_CLK_SEL_MCU_CLK	(0x01<<7)
+#define RTS5264_CRC_CLK_SEL_MCU_CLK	(0x01<<6)
+#define RTS5264_FAKE_MCU_CLOCK_GATING	(0x01<<5)
+#define RTS5264_MCU_BUS_SEL_MASK	(0x01<<4)
+
+/* FW status register */
+#define RTS5264_FW_STATUS		0xFF56
+#define RTS5264_EXPRESS_LINK_FAIL_MASK	(0x01<<7)
+
+/* FW control register */
+#define RTS5264_FW_CTL			0xFF5F
+#define RTS5264_INFORM_RTD3_COLD	(0x01<<5)
+
+#define RTS5264_REG_FPDCTL		0xFF60
+
+#define RTS5264_REG_LDO12_CFG		0xFF6E
+#define RTS5264_LDO12_SR_MASK		(0x03<<6)
+#define RTS5264_LDO12_SR_1_0_MS		(0x03<<6)
+#define RTS5264_LDO12_SR_0_5_MS		(0x02<<6)
+#define RTS5264_LDO12_SR_0_2_5_MS	(0x01<<6)
+#define RTS5264_LDO12_SR_0_0_MS		(0x00<<6)
+#define RTS5264_LDO12_VO_TUNE_MASK	(0x07<<1)
+#define RTS5264_LDO12_115		(0x03<<1)
+#define RTS5264_LDO12_120		(0x04<<1)
+#define RTS5264_LDO12_125		(0x05<<1)
+#define RTS5264_LDO12_130		(0x06<<1)
+#define RTS5264_LDO12_135		(0x07<<1)
+
+/* LDO control register */
+#define RTS5264_CARD_PWR_CTL		0xFD50
+#define RTS5264_SD_CLK_ISO		(0x01<<7)
+#define RTS5264_PAD_SD_DAT_FW_CTRL	(0x01<<6)
+#define RTS5264_PUPDC			(0x01<<5)
+#define RTS5264_SD_CMD_ISO		(0x01<<4)
+
+#define RTS5264_OCP_VDD3_CTL		0xFD89
+#define SD_VDD3_DETECT_EN		0x08
+#define SD_VDD3_OCP_INT_EN		0x04
+#define SD_VDD3_OCP_INT_CLR		0x02
+#define SD_VDD3_OC_CLR			0x01
+
+#define RTS5264_OCP_VDD3_STS		0xFD8A
+#define SD_VDD3_OCP_DETECT		0x08
+#define SD_VDD3_OC_NOW			0x04
+#define SD_VDD3_OC_EVER			0x02
+
+#define RTS5264_OVP_CTL			0xFD8D
+#define RTS5264_OVP_TIME_MASK		0xF0
+#define RTS5264_OVP_TIME_DFT		0x50
+#define RTS5264_OVP_DETECT_EN		0x08
+#define RTS5264_OVP_INT_EN		0x04
+#define RTS5264_OVP_INT_CLR		0x02
+#define RTS5264_OVP_CLR			0x01
+
+#define RTS5264_OVP_STS			0xFD8E
+#define RTS5264_OVP_GLTCH_TIME_MASK	0xF0
+#define RTS5264_OVP_GLTCH_TIME_DFT	0x50
+#define RTS5264_VOVER_DET		0x08
+#define RTS5264_OVP_NOW			0x04
+#define RTS5264_OVP_EVER		0x02
+
+#define RTS5264_CMD_OE_START_EARLY		0xFDCB
+#define RTS5264_CMD_OE_EARLY_LEAVE		0x08
+#define RTS5264_CMD_OE_EARLY_CYCLE_MASK		0x06
+#define RTS5264_CMD_OE_EARLY_4CYCLE		0x06
+#define RTS5264_CMD_OE_EARLY_3CYCLE		0x04
+#define RTS5264_CMD_OE_EARLY_2CYCLE		0x02
+#define RTS5264_CMD_OE_EARLY_1CYCLE		0x00
+#define RTS5264_CMD_OE_EARLY_EN			0x01
+
+#define RTS5264_DAT_OE_START_EARLY		0xFDCC
+#define RTS5264_DAT_OE_EARLY_LEAVE		0x08
+#define RTS5264_DAT_OE_EARLY_CYCLE_MASK		0x06
+#define RTS5264_DAT_OE_EARLY_4CYCLE		0x06
+#define RTS5264_DAT_OE_EARLY_3CYCLE		0x04
+#define RTS5264_DAT_OE_EARLY_2CYCLE		0x02
+#define RTS5264_DAT_OE_EARLY_1CYCLE		0x00
+#define RTS5264_DAT_OE_EARLY_EN			0x01
+
+#define RTS5264_LDO1233318_POW_CTL	0xFF70
+#define RTS5264_TUNE_REF_LDO3318	(0x03<<6)
+#define RTS5264_TUNE_REF_LDO3318_DFT	(0x02<<6)
+#define RTS5264_LDO3318_POWERON		(0x01<<3)
+#define RTS5264_LDO3_POWERON		(0x01<<2)
+#define RTS5264_LDO2_POWERON		(0x01<<1)
+#define RTS5264_LDO1_POWERON		(0x01<<0)
+#define RTS5264_LDO_POWERON_MASK	(0x0F<<0)
+
+#define RTS5264_DV3318_CFG		0xFF71
+#define RTS5264_DV3318_TUNE_MASK	(0x07<<4)
+#define RTS5264_DV3318_18		(0x02<<4)
+#define RTS5264_DV3318_19		(0x04<<4)
+#define RTS5264_DV3318_33		(0x07<<4)
+
+#define RTS5264_LDO1_CFG0		0xFF72
+#define RTS5264_LDO1_OCP_THD_MASK	(0x07 << 5)
+#define RTS5264_LDO1_OCP_EN		(0x01 << 4)
+#define RTS5264_LDO1_OCP_LMT_THD_MASK	(0x03 << 2)
+#define RTS5264_LDO1_OCP_LMT_EN		(0x01 << 1)
+
+#define RTS5264_LDO1_OCP_THD_850	(0x00<<5)
+#define RTS5264_LDO1_OCP_THD_950	(0x01<<5)
+#define RTS5264_LDO1_OCP_THD_1050	(0x02<<5)
+#define RTS5264_LDO1_OCP_THD_1100	(0x03<<5)
+#define RTS5264_LDO1_OCP_THD_1150	(0x04<<5)
+#define RTS5264_LDO1_OCP_THD_1200	(0x05<<5)
+#define RTS5264_LDO1_OCP_THD_1300	(0x06<<5)
+#define RTS5264_LDO1_OCP_THD_1350	(0x07<<5)
+
+#define RTS5264_LDO1_LMT_THD_1700	(0x00<<2)
+#define RTS5264_LDO1_LMT_THD_1800	(0x01<<2)
+#define RTS5264_LDO1_LMT_THD_1900	(0x02<<2)
+#define RTS5264_LDO1_LMT_THD_2000	(0x03<<2)
+
+#define RTS5264_LDO1_CFG1		0xFF73
+#define RTS5264_LDO1_TUNE_MASK		(0x07<<1)
+#define RTS5264_LDO1_18			(0x05<<1)
+#define RTS5264_LDO1_33			(0x07<<1)
+#define RTS5264_LDO1_PWD_MASK		(0x01<<0)
+
+#define RTS5264_LDO2_CFG0		0xFF74
+#define RTS5264_LDO2_OCP_THD_MASK	(0x07<<5)
+#define RTS5264_LDO2_OCP_EN		(0x01<<4)
+#define RTS5264_LDO2_OCP_LMT_THD_MASK	(0x03<<2)
+#define RTS5264_LDO2_OCP_LMT_EN		(0x01<<1)
+
+#define RTS5264_LDO2_OCP_THD_750	(0x00<<5)
+#define RTS5264_LDO2_OCP_THD_850	(0x01<<5)
+#define RTS5264_LDO2_OCP_THD_900	(0x02<<5)
+#define RTS5264_LDO2_OCP_THD_950	(0x03<<5)
+#define RTS5264_LDO2_OCP_THD_1050	(0x04<<5)
+#define RTS5264_LDO2_OCP_THD_1100	(0x05<<5)
+#define RTS5264_LDO2_OCP_THD_1150	(0x06<<5)
+#define RTS5264_LDO2_OCP_THD_1200	(0x07<<5)
+
+#define RTS5264_LDO2_LMT_THD_1700	(0x00<<2)
+#define RTS5264_LDO2_LMT_THD_1800	(0x01<<2)
+#define RTS5264_LDO2_LMT_THD_1900	(0x02<<2)
+#define RTS5264_LDO2_LMT_THD_2000	(0x03<<2)
+
+#define RTS5264_LDO2_CFG1		0xFF75
+#define RTS5264_LDO2_TUNE_MASK		(0x07<<1)
+#define RTS5264_LDO2_18			(0x02<<1)
+#define RTS5264_LDO2_185		(0x03<<1)
+#define RTS5264_LDO2_19			(0x04<<1)
+#define RTS5264_LDO2_195		(0x05<<1)
+#define RTS5264_LDO2_33			(0x07<<1)
+#define RTS5264_LDO2_PWD_MASK		(0x01<<0)
+
+#define RTS5264_LDO3_CFG0		0xFF76
+#define RTS5264_LDO3_OCP_THD_MASK	(0x07<<5)
+#define RTS5264_LDO3_OCP_EN		(0x01<<4)
+#define RTS5264_LDO3_OCP_LMT_THD_MASK	(0x03<<2)
+#define RTS5264_LDO3_OCP_LMT_EN		(0x01<<1)
+
+#define RTS5264_LDO3_OCP_THD_610	(0x00<<5)
+#define RTS5264_LDO3_OCP_THD_630	(0x01<<5)
+#define RTS5264_LDO3_OCP_THD_670	(0x02<<5)
+#define RTS5264_LDO3_OCP_THD_710	(0x03<<5)
+#define RTS5264_LDO3_OCP_THD_750	(0x04<<5)
+#define RTS5264_LDO3_OCP_THD_770	(0x05<<5)
+#define RTS5264_LDO3_OCP_THD_810	(0x06<<5)
+#define RTS5264_LDO3_OCP_THD_850	(0x07<<5)
+
+#define RTS5264_LDO3_LMT_THD_1200	(0x00<<2)
+#define RTS5264_LDO3_LMT_THD_1300	(0x01<<2)
+#define RTS5264_LDO3_LMT_THD_1400	(0x02<<2)
+#define RTS5264_LDO3_LMT_THD_1500	(0x03<<2)
+
+#define RTS5264_LDO3_CFG1		0xFF77
+#define RTS5264_LDO3_TUNE_MASK		(0x07<<1)
+#define RTS5264_LDO3_12			(0x02<<1)
+#define RTS5264_LDO3_125		(0x03<<1)
+#define RTS5264_LDO3_13			(0x04<<1)
+#define RTS5264_LDO3_135		(0x05<<1)
+#define RTS5264_LDO3_33			(0x07<<1)
+#define RTS5264_LDO3_PWD_MASK		(0x01<<0)
+
+#define RTS5264_REG_PME_FORCE_CTL	0xFF78
+#define FORCE_PM_CONTROL		0x20
+#define FORCE_PM_VALUE			0x10
+#define REG_EFUSE_BYPASS		0x08
+#define REG_EFUSE_POR			0x04
+#define REG_EFUSE_POWER_MASK		0x03
+#define REG_EFUSE_POWERON		0x03
+#define REG_EFUSE_POWEROFF		0x00
+
+#define RTS5264_PWR_CUT			0xFF81
+#define RTS5264_CFG_MEM_PD		0xF0
+
+#define RTS5264_OVP_DET			0xFF8A
+#define RTS5264_POW_VDET		0x04
+#define RTS5264_TUNE_VROV_MASK		0x03
+#define RTS5264_TUNE_VROV_2V		0x03
+#define RTS5264_TUNE_VROV_1V8		0x02
+#define RTS5264_TUNE_VROV_1V6		0x01
+#define RTS5264_TUNE_VROV_1V4		0x00
+
+#define RTS5264_CKMUX_MBIAS_PWR		0xFF8B
+#define RTS5264_NON_XTAL_SEL		0x80
+#define RTS5264_POW_CKMUX		0x40
+#define RTS5264_LVD_MASK		0x04
+#define RTS5264_POW_PSW_MASK		0x03
+#define RTS5264_POW_PSW_DFT		0x03
+
+/* Single LUN, support SD/SD EXPRESS */
+#define DEFAULT_SINGLE		0
+#define SD_LUN			1
+#define SD_EXPRESS_LUN		2
+
+int rts5264_pci_switch_clock(struct rtsx_pcr *pcr, unsigned int card_clock,
+		u8 ssc_depth, bool initial_mode, bool double_clk, bool vpclk);
+
+#endif /* RTS5264_H */
diff --git a/include/linux/rtsx_pci.h b/include/linux/rtsx_pci.h
index 534038d962e4..4612ef09a0c7 100644
--- a/include/linux/rtsx_pci.h
+++ b/include/linux/rtsx_pci.h
@@ -60,6 +60,7 @@
 #define   SD_EXIST			(1 << 16)
 #define   DELINK_INT			GPIO0_INT
 #define   MS_OC_INT			(1 << 23)
+#define   SD_OVP_INT		(1 << 23)
 #define   SD_OC_INT			(1 << 22)
 
 #define CARD_INT		(XD_INT | MS_INT | SD_INT)
@@ -80,6 +81,7 @@
 #define   OC_INT_EN			(1 << 23)
 #define   DELINK_INT_EN			GPIO0_INT_EN
 #define   MS_OC_INT_EN			(1 << 23)
+#define   SD_OVP_INT_EN			(1 << 23)
 #define   SD_OC_INT_EN			(1 << 22)
 
 #define RTSX_DUM_REG			0x1C
@@ -583,6 +585,7 @@
 #define   OBFF_DISABLE			0x00
 
 #define CDRESUMECTL			0xFE52
+#define CDGW				0xFE53
 #define WAKE_SEL_CTL			0xFE54
 #define PCLK_CTL			0xFE55
 #define   PCLK_MODE_SEL			0x20
@@ -764,6 +767,9 @@
 #define   SD_VIO_LDO_1V8		0x40
 #define   SD_VIO_LDO_3V3		0x70
 
+#define RTS5264_AUTOLOAD_CFG2		0xFF7D
+#define RTS5264_CHIP_RST_N_SEL		(1 << 6)
+
 #define RTS5260_AUTOLOAD_CFG4		0xFF7F
 #define   RTS5260_MIMO_DISABLE		0x8A
 /*RTS5261*/
@@ -1261,6 +1267,7 @@ struct rtsx_pcr {
 	u8				dma_error_count;
 	u8			ocp_stat;
 	u8			ocp_stat2;
+	u8			ovp_stat;
 	u8			rtd3_en;
 };
 
@@ -1271,6 +1278,7 @@ struct rtsx_pcr {
 #define PID_5260	0x5260
 #define PID_5261	0x5261
 #define PID_5228	0x5228
+#define PID_5264	0x5264
 
 #define CHK_PCI_PID(pcr, pid)		((pcr)->pci->device == (pid))
 #define PCI_VID(pcr)			((pcr)->pci->vendor)
-- 
cgit v1.2.3


From b6e53731e07db7e8d35b789fd83565fe75540180 Mon Sep 17 00:00:00 2001
From: Fei Shao <fshao@chromium.org>
Date: Wed, 6 Dec 2023 15:17:26 -0800
Subject: spmi: Introduce device-managed functions

Utilize the managed resource (devres) framework and add the following
devm_* helpers for the SPMI driver:

- devm_spmi_controller_alloc()
- devm_spmi_controller_add()

[sboyd@kernel.org: Rename to spmi-devres for module niceness, slap on
GPL module license]

Signed-off-by: Fei Shao <fshao@chromium.org>
Link: https://lore.kernel.org/r/20230824104101.4083400-2-fshao@chromium.org
Reviewed-by: AngeloGioacchino Del Regno <angelogioacchino.delregno@collabora.com>
Signed-off-by: Stephen Boyd <sboyd@kernel.org>
Link: https://lore.kernel.org/r/20231206231733.4031901-4-sboyd@kernel.org
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/spmi/Makefile      |  2 +-
 drivers/spmi/spmi-devres.c | 64 ++++++++++++++++++++++++++++++++++++++++++++++
 include/linux/spmi.h       |  3 +++
 3 files changed, 68 insertions(+), 1 deletion(-)
 create mode 100644 drivers/spmi/spmi-devres.c

(limited to 'include/linux')

diff --git a/drivers/spmi/Makefile b/drivers/spmi/Makefile
index 9d974424c8c1..7f152167bb05 100644
--- a/drivers/spmi/Makefile
+++ b/drivers/spmi/Makefile
@@ -2,7 +2,7 @@
 #
 # Makefile for kernel SPMI framework.
 #
-obj-$(CONFIG_SPMI)	+= spmi.o
+obj-$(CONFIG_SPMI)	+= spmi.o spmi-devres.o
 
 obj-$(CONFIG_SPMI_HISI3670)	+= hisi-spmi-controller.o
 obj-$(CONFIG_SPMI_MSM_PMIC_ARB)	+= spmi-pmic-arb.o
diff --git a/drivers/spmi/spmi-devres.c b/drivers/spmi/spmi-devres.c
new file mode 100644
index 000000000000..7683e6fdb18f
--- /dev/null
+++ b/drivers/spmi/spmi-devres.c
@@ -0,0 +1,64 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright 2023 Google LLC.
+ */
+
+#include <linux/device.h>
+#include <linux/spmi.h>
+
+static void devm_spmi_controller_release(struct device *parent, void *res)
+{
+	spmi_controller_put(*(struct spmi_controller **)res);
+}
+
+struct spmi_controller *devm_spmi_controller_alloc(struct device *parent, size_t size)
+{
+	struct spmi_controller **ptr, *ctrl;
+
+	ptr = devres_alloc(devm_spmi_controller_release, sizeof(*ptr), GFP_KERNEL);
+	if (!ptr)
+		return ERR_PTR(-ENOMEM);
+
+	ctrl = spmi_controller_alloc(parent, size);
+	if (!ctrl) {
+		devres_free(ptr);
+		return ERR_PTR(-ENOMEM);
+	}
+
+	*ptr = ctrl;
+	devres_add(parent, ptr);
+
+	return ctrl;
+}
+EXPORT_SYMBOL_GPL(devm_spmi_controller_alloc);
+
+static void devm_spmi_controller_remove(struct device *parent, void *res)
+{
+	spmi_controller_remove(*(struct spmi_controller **)res);
+}
+
+int devm_spmi_controller_add(struct device *parent, struct spmi_controller *ctrl)
+{
+	struct spmi_controller **ptr;
+	int ret;
+
+	ptr = devres_alloc(devm_spmi_controller_remove, sizeof(*ptr), GFP_KERNEL);
+	if (!ptr)
+		return -ENOMEM;
+
+	ret = spmi_controller_add(ctrl);
+	if (ret) {
+		devres_free(ptr);
+		return ret;
+	}
+
+	*ptr = ctrl;
+	devres_add(parent, ptr);
+
+	return 0;
+
+}
+EXPORT_SYMBOL_GPL(devm_spmi_controller_add);
+
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("SPMI devres helpers");
diff --git a/include/linux/spmi.h b/include/linux/spmi.h
index 2a4ce4144f9f..28e8c8bd3944 100644
--- a/include/linux/spmi.h
+++ b/include/linux/spmi.h
@@ -120,6 +120,9 @@ static inline void spmi_controller_put(struct spmi_controller *ctrl)
 int spmi_controller_add(struct spmi_controller *ctrl);
 void spmi_controller_remove(struct spmi_controller *ctrl);
 
+struct spmi_controller *devm_spmi_controller_alloc(struct device *parent, size_t size);
+int devm_spmi_controller_add(struct device *parent, struct spmi_controller *ctrl);
+
 /**
  * struct spmi_driver - SPMI slave device driver
  * @driver:	SPMI device drivers should initialize name and owner field of
-- 
cgit v1.2.3


From 8d6608e4f89a0a21caadcf32fb5ed700e2f5682d Mon Sep 17 00:00:00 2001
From: Michal Simek <michal.simek@amd.com>
Date: Thu, 14 Dec 2023 15:53:47 +0100
Subject: firmware: xilinx: Remove clock_setrate and clock_getrate api

As per the current code base, PM_CLOCK_SETRATE and PM_CLOCK_GETRATE
APIs are not supported for the runtime operations. In the case of
ZynqMP returning an error from TF-A when there is any request to
access these APIs and for Versal also it is returning an error like
NO_ACCESS from the firmware. So, just removing the unused code to
avoid the confusion around these APIs.

Also, there is no issue with the backward compatibility as these APIs
were never used since implemented. Hence no need to bump up the
version of the feature check API as well.

Signed-off-by: Ronak Jain <ronak.jain@amd.com>
Signed-off-by: Michal Simek <michal.simek@amd.com>
Link: https://lore.kernel.org/r/6ccbffbafd1f0f48f6574d5a3bf2db6a5603fdb0.1702565618.git.michal.simek@amd.com
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/firmware/xilinx/zynqmp.c     | 38 ------------------------------------
 include/linux/firmware/xlnx-zynqmp.h | 14 -------------
 2 files changed, 52 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/firmware/xilinx/zynqmp.c b/drivers/firmware/xilinx/zynqmp.c
index caa0952bfc42..2ae9da268215 100644
--- a/drivers/firmware/xilinx/zynqmp.c
+++ b/drivers/firmware/xilinx/zynqmp.c
@@ -657,44 +657,6 @@ int zynqmp_pm_clock_getdivider(u32 clock_id, u32 *divider)
 }
 EXPORT_SYMBOL_GPL(zynqmp_pm_clock_getdivider);
 
-/**
- * zynqmp_pm_clock_setrate() - Set the clock rate for given id
- * @clock_id:	ID of the clock
- * @rate:	rate value in hz
- *
- * This function is used by master to set rate for any clock.
- *
- * Return: Returns status, either success or error+reason
- */
-int zynqmp_pm_clock_setrate(u32 clock_id, u64 rate)
-{
-	return zynqmp_pm_invoke_fn(PM_CLOCK_SETRATE, NULL, 3, clock_id, lower_32_bits(rate),
-				   upper_32_bits(rate));
-}
-EXPORT_SYMBOL_GPL(zynqmp_pm_clock_setrate);
-
-/**
- * zynqmp_pm_clock_getrate() - Get the clock rate for given id
- * @clock_id:	ID of the clock
- * @rate:	rate value in hz
- *
- * This function is used by master to get rate
- * for any clock.
- *
- * Return: Returns status, either success or error+reason
- */
-int zynqmp_pm_clock_getrate(u32 clock_id, u64 *rate)
-{
-	u32 ret_payload[PAYLOAD_ARG_CNT];
-	int ret;
-
-	ret = zynqmp_pm_invoke_fn(PM_CLOCK_GETRATE, ret_payload, 1, clock_id);
-	*rate = ((u64)ret_payload[2] << 32) | ret_payload[1];
-
-	return ret;
-}
-EXPORT_SYMBOL_GPL(zynqmp_pm_clock_getrate);
-
 /**
  * zynqmp_pm_clock_setparent() - Set the clock parent for given id
  * @clock_id:	ID of the clock
diff --git a/include/linux/firmware/xlnx-zynqmp.h b/include/linux/firmware/xlnx-zynqmp.h
index 6b48294f3c92..c6a7fb1f980b 100644
--- a/include/linux/firmware/xlnx-zynqmp.h
+++ b/include/linux/firmware/xlnx-zynqmp.h
@@ -160,8 +160,6 @@ enum pm_api_id {
 	PM_CLOCK_GETSTATE = 38,
 	PM_CLOCK_SETDIVIDER = 39,
 	PM_CLOCK_GETDIVIDER = 40,
-	PM_CLOCK_SETRATE = 41,
-	PM_CLOCK_GETRATE = 42,
 	PM_CLOCK_SETPARENT = 43,
 	PM_CLOCK_GETPARENT = 44,
 	PM_FPGA_READ = 46,
@@ -533,8 +531,6 @@ int zynqmp_pm_clock_disable(u32 clock_id);
 int zynqmp_pm_clock_getstate(u32 clock_id, u32 *state);
 int zynqmp_pm_clock_setdivider(u32 clock_id, u32 divider);
 int zynqmp_pm_clock_getdivider(u32 clock_id, u32 *divider);
-int zynqmp_pm_clock_setrate(u32 clock_id, u64 rate);
-int zynqmp_pm_clock_getrate(u32 clock_id, u64 *rate);
 int zynqmp_pm_clock_setparent(u32 clock_id, u32 parent_id);
 int zynqmp_pm_clock_getparent(u32 clock_id, u32 *parent_id);
 int zynqmp_pm_set_pll_frac_mode(u32 clk_id, u32 mode);
@@ -639,16 +635,6 @@ static inline int zynqmp_pm_clock_getdivider(u32 clock_id, u32 *divider)
 	return -ENODEV;
 }
 
-static inline int zynqmp_pm_clock_setrate(u32 clock_id, u64 rate)
-{
-	return -ENODEV;
-}
-
-static inline int zynqmp_pm_clock_getrate(u32 clock_id, u64 *rate)
-{
-	return -ENODEV;
-}
-
 static inline int zynqmp_pm_clock_setparent(u32 clock_id, u32 parent_id)
 {
 	return -ENODEV;
-- 
cgit v1.2.3


From b9ae996210163e89a2a9aece7c582fb43694485a Mon Sep 17 00:00:00 2001
From: Michal Simek <michal.simek@amd.com>
Date: Thu, 14 Dec 2023 15:53:48 +0100
Subject: firmware: xilinx: Remove zynqmp_pm_pinctrl_get_function()

There is no user for this interface that's why remove it.

Signed-off-by: Michal Simek <michal.simek@amd.com>
Link: https://lore.kernel.org/r/e52a415a004e28a43e6d08e9e22d9e8fef3737df.1702565618.git.michal.simek@amd.com
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/firmware/xilinx/zynqmp.c     | 24 ------------------------
 include/linux/firmware/xlnx-zynqmp.h |  7 -------
 2 files changed, 31 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/firmware/xilinx/zynqmp.c b/drivers/firmware/xilinx/zynqmp.c
index 2ae9da268215..a55dfd9bae6b 100644
--- a/drivers/firmware/xilinx/zynqmp.c
+++ b/drivers/firmware/xilinx/zynqmp.c
@@ -1060,30 +1060,6 @@ int zynqmp_pm_pinctrl_release(const u32 pin)
 }
 EXPORT_SYMBOL_GPL(zynqmp_pm_pinctrl_release);
 
-/**
- * zynqmp_pm_pinctrl_get_function - Read function id set for the given pin
- * @pin: Pin number
- * @id: Buffer to store function ID
- *
- * This function provides the function currently set for the given pin.
- *
- * Return: Returns status, either success or error+reason
- */
-int zynqmp_pm_pinctrl_get_function(const u32 pin, u32 *id)
-{
-	u32 ret_payload[PAYLOAD_ARG_CNT];
-	int ret;
-
-	if (!id)
-		return -EINVAL;
-
-	ret = zynqmp_pm_invoke_fn(PM_PINCTRL_GET_FUNCTION, ret_payload, 1, pin);
-	*id = ret_payload[1];
-
-	return ret;
-}
-EXPORT_SYMBOL_GPL(zynqmp_pm_pinctrl_get_function);
-
 /**
  * zynqmp_pm_pinctrl_set_function - Set requested function for the pin
  * @pin: Pin number
diff --git a/include/linux/firmware/xlnx-zynqmp.h b/include/linux/firmware/xlnx-zynqmp.h
index c6a7fb1f980b..1478f691cc10 100644
--- a/include/linux/firmware/xlnx-zynqmp.h
+++ b/include/linux/firmware/xlnx-zynqmp.h
@@ -149,7 +149,6 @@ enum pm_api_id {
 	PM_SECURE_SHA = 26,
 	PM_PINCTRL_REQUEST = 28,
 	PM_PINCTRL_RELEASE = 29,
-	PM_PINCTRL_GET_FUNCTION = 30,
 	PM_PINCTRL_SET_FUNCTION = 31,
 	PM_PINCTRL_CONFIG_PARAM_GET = 32,
 	PM_PINCTRL_CONFIG_PARAM_SET = 33,
@@ -567,7 +566,6 @@ int zynqmp_pm_system_shutdown(const u32 type, const u32 subtype);
 int zynqmp_pm_set_boot_health_status(u32 value);
 int zynqmp_pm_pinctrl_request(const u32 pin);
 int zynqmp_pm_pinctrl_release(const u32 pin);
-int zynqmp_pm_pinctrl_get_function(const u32 pin, u32 *id);
 int zynqmp_pm_pinctrl_set_function(const u32 pin, const u32 id);
 int zynqmp_pm_pinctrl_get_config(const u32 pin, const u32 param,
 				 u32 *value);
@@ -804,11 +802,6 @@ static inline int zynqmp_pm_pinctrl_release(const u32 pin)
 	return -ENODEV;
 }
 
-static inline int zynqmp_pm_pinctrl_get_function(const u32 pin, u32 *id)
-{
-	return -ENODEV;
-}
-
 static inline int zynqmp_pm_is_function_supported(const u32 api_id, const u32 id)
 {
 	return -ENODEV;
-- 
cgit v1.2.3


From f8fa5d76925991976b3e7076f9d1052515ec1fca Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Fri, 15 Dec 2023 13:24:10 -0700
Subject: cred: switch to using atomic_long_t

There are multiple ways to grab references to credentials, and the only
protection we have against overflowing it is the memory required to do
so.

With memory sizes only moving in one direction, let's bump the reference
count to 64-bit and move it outside the realm of feasibly overflowing.

Signed-off-by: Jens Axboe <axboe@kernel.dk>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/cred.h |  8 +++----
 kernel/cred.c        | 64 ++++++++++++++++++++++++++--------------------------
 2 files changed, 36 insertions(+), 36 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/cred.h b/include/linux/cred.h
index af8d353a4b86..a3383f8efb8f 100644
--- a/include/linux/cred.h
+++ b/include/linux/cred.h
@@ -109,7 +109,7 @@ static inline int groups_search(const struct group_info *group_info, kgid_t grp)
  * same context as task->real_cred.
  */
 struct cred {
-	atomic_t	usage;
+	atomic_long_t	usage;
 #ifdef CONFIG_DEBUG_CREDENTIALS
 	atomic_t	subscribers;	/* number of processes subscribed */
 	void		*put_addr;
@@ -229,7 +229,7 @@ static inline bool cap_ambient_invariant_ok(const struct cred *cred)
  */
 static inline struct cred *get_new_cred_many(struct cred *cred, int nr)
 {
-	atomic_add(nr, &cred->usage);
+	atomic_long_add(nr, &cred->usage);
 	return cred;
 }
 
@@ -288,7 +288,7 @@ static inline const struct cred *get_cred_rcu(const struct cred *cred)
 	struct cred *nonconst_cred = (struct cred *) cred;
 	if (!cred)
 		return NULL;
-	if (!atomic_inc_not_zero(&nonconst_cred->usage))
+	if (!atomic_long_inc_not_zero(&nonconst_cred->usage))
 		return NULL;
 	validate_creds(cred);
 	nonconst_cred->non_rcu = 0;
@@ -313,7 +313,7 @@ static inline void put_cred_many(const struct cred *_cred, int nr)
 
 	if (cred) {
 		validate_creds(cred);
-		if (atomic_sub_and_test(nr, &cred->usage))
+		if (atomic_long_sub_and_test(nr, &cred->usage))
 			__put_cred(cred);
 	}
 }
diff --git a/kernel/cred.c b/kernel/cred.c
index 3c714cb31660..4a6cd0f0fef5 100644
--- a/kernel/cred.c
+++ b/kernel/cred.c
@@ -102,17 +102,17 @@ static void put_cred_rcu(struct rcu_head *rcu)
 
 #ifdef CONFIG_DEBUG_CREDENTIALS
 	if (cred->magic != CRED_MAGIC_DEAD ||
-	    atomic_read(&cred->usage) != 0 ||
+	    atomic_long_read(&cred->usage) != 0 ||
 	    read_cred_subscribers(cred) != 0)
 		panic("CRED: put_cred_rcu() sees %p with"
-		      " mag %x, put %p, usage %d, subscr %d\n",
+		      " mag %x, put %p, usage %ld, subscr %d\n",
 		      cred, cred->magic, cred->put_addr,
-		      atomic_read(&cred->usage),
+		      atomic_long_read(&cred->usage),
 		      read_cred_subscribers(cred));
 #else
-	if (atomic_read(&cred->usage) != 0)
-		panic("CRED: put_cred_rcu() sees %p with usage %d\n",
-		      cred, atomic_read(&cred->usage));
+	if (atomic_long_read(&cred->usage) != 0)
+		panic("CRED: put_cred_rcu() sees %p with usage %ld\n",
+		      cred, atomic_long_read(&cred->usage));
 #endif
 
 	security_cred_free(cred);
@@ -137,11 +137,11 @@ static void put_cred_rcu(struct rcu_head *rcu)
  */
 void __put_cred(struct cred *cred)
 {
-	kdebug("__put_cred(%p{%d,%d})", cred,
-	       atomic_read(&cred->usage),
+	kdebug("__put_cred(%p{%ld,%d})", cred,
+	       atomic_long_read(&cred->usage),
 	       read_cred_subscribers(cred));
 
-	BUG_ON(atomic_read(&cred->usage) != 0);
+	BUG_ON(atomic_long_read(&cred->usage) != 0);
 #ifdef CONFIG_DEBUG_CREDENTIALS
 	BUG_ON(read_cred_subscribers(cred) != 0);
 	cred->magic = CRED_MAGIC_DEAD;
@@ -164,8 +164,8 @@ void exit_creds(struct task_struct *tsk)
 {
 	struct cred *real_cred, *cred;
 
-	kdebug("exit_creds(%u,%p,%p,{%d,%d})", tsk->pid, tsk->real_cred, tsk->cred,
-	       atomic_read(&tsk->cred->usage),
+	kdebug("exit_creds(%u,%p,%p,{%ld,%d})", tsk->pid, tsk->real_cred, tsk->cred,
+	       atomic_long_read(&tsk->cred->usage),
 	       read_cred_subscribers(tsk->cred));
 
 	real_cred = (struct cred *) tsk->real_cred;
@@ -230,7 +230,7 @@ struct cred *cred_alloc_blank(void)
 	if (!new)
 		return NULL;
 
-	atomic_set(&new->usage, 1);
+	atomic_long_set(&new->usage, 1);
 #ifdef CONFIG_DEBUG_CREDENTIALS
 	new->magic = CRED_MAGIC;
 #endif
@@ -276,7 +276,7 @@ struct cred *prepare_creds(void)
 	memcpy(new, old, sizeof(struct cred));
 
 	new->non_rcu = 0;
-	atomic_set(&new->usage, 1);
+	atomic_long_set(&new->usage, 1);
 	set_cred_subscribers(new, 0);
 	get_group_info(new->group_info);
 	get_uid(new->user);
@@ -363,8 +363,8 @@ int copy_creds(struct task_struct *p, unsigned long clone_flags)
 	    ) {
 		p->real_cred = get_cred_many(p->cred, 2);
 		alter_cred_subscribers(p->cred, 2);
-		kdebug("share_creds(%p{%d,%d})",
-		       p->cred, atomic_read(&p->cred->usage),
+		kdebug("share_creds(%p{%ld,%d})",
+		       p->cred, atomic_long_read(&p->cred->usage),
 		       read_cred_subscribers(p->cred));
 		inc_rlimit_ucounts(task_ucounts(p), UCOUNT_RLIMIT_NPROC, 1);
 		return 0;
@@ -457,8 +457,8 @@ int commit_creds(struct cred *new)
 	struct task_struct *task = current;
 	const struct cred *old = task->real_cred;
 
-	kdebug("commit_creds(%p{%d,%d})", new,
-	       atomic_read(&new->usage),
+	kdebug("commit_creds(%p{%ld,%d})", new,
+	       atomic_long_read(&new->usage),
 	       read_cred_subscribers(new));
 
 	BUG_ON(task->cred != old);
@@ -467,7 +467,7 @@ int commit_creds(struct cred *new)
 	validate_creds(old);
 	validate_creds(new);
 #endif
-	BUG_ON(atomic_read(&new->usage) < 1);
+	BUG_ON(atomic_long_read(&new->usage) < 1);
 
 	get_cred(new); /* we will require a ref for the subj creds too */
 
@@ -539,14 +539,14 @@ EXPORT_SYMBOL(commit_creds);
  */
 void abort_creds(struct cred *new)
 {
-	kdebug("abort_creds(%p{%d,%d})", new,
-	       atomic_read(&new->usage),
+	kdebug("abort_creds(%p{%ld,%d})", new,
+	       atomic_long_read(&new->usage),
 	       read_cred_subscribers(new));
 
 #ifdef CONFIG_DEBUG_CREDENTIALS
 	BUG_ON(read_cred_subscribers(new) != 0);
 #endif
-	BUG_ON(atomic_read(&new->usage) < 1);
+	BUG_ON(atomic_long_read(&new->usage) < 1);
 	put_cred(new);
 }
 EXPORT_SYMBOL(abort_creds);
@@ -562,8 +562,8 @@ const struct cred *override_creds(const struct cred *new)
 {
 	const struct cred *old = current->cred;
 
-	kdebug("override_creds(%p{%d,%d})", new,
-	       atomic_read(&new->usage),
+	kdebug("override_creds(%p{%ld,%d})", new,
+	       atomic_long_read(&new->usage),
 	       read_cred_subscribers(new));
 
 	validate_creds(old);
@@ -585,8 +585,8 @@ const struct cred *override_creds(const struct cred *new)
 	rcu_assign_pointer(current->cred, new);
 	alter_cred_subscribers(old, -1);
 
-	kdebug("override_creds() = %p{%d,%d}", old,
-	       atomic_read(&old->usage),
+	kdebug("override_creds() = %p{%ld,%d}", old,
+	       atomic_long_read(&old->usage),
 	       read_cred_subscribers(old));
 	return old;
 }
@@ -603,8 +603,8 @@ void revert_creds(const struct cred *old)
 {
 	const struct cred *override = current->cred;
 
-	kdebug("revert_creds(%p{%d,%d})", old,
-	       atomic_read(&old->usage),
+	kdebug("revert_creds(%p{%ld,%d})", old,
+	       atomic_long_read(&old->usage),
 	       read_cred_subscribers(old));
 
 	validate_creds(old);
@@ -735,7 +735,7 @@ struct cred *prepare_kernel_cred(struct task_struct *daemon)
 
 	*new = *old;
 	new->non_rcu = 0;
-	atomic_set(&new->usage, 1);
+	atomic_long_set(&new->usage, 1);
 	set_cred_subscribers(new, 0);
 	get_uid(new->user);
 	get_user_ns(new->user_ns);
@@ -849,8 +849,8 @@ static void dump_invalid_creds(const struct cred *cred, const char *label,
 	       cred == tsk->cred ? "[eff]" : "");
 	pr_err("->magic=%x, put_addr=%p\n",
 	       cred->magic, cred->put_addr);
-	pr_err("->usage=%d, subscr=%d\n",
-	       atomic_read(&cred->usage),
+	pr_err("->usage=%ld, subscr=%d\n",
+	       atomic_long_read(&cred->usage),
 	       read_cred_subscribers(cred));
 	pr_err("->*uid = { %d,%d,%d,%d }\n",
 		from_kuid_munged(&init_user_ns, cred->uid),
@@ -922,9 +922,9 @@ EXPORT_SYMBOL(__validate_process_creds);
  */
 void validate_creds_for_do_exit(struct task_struct *tsk)
 {
-	kdebug("validate_creds_for_do_exit(%p,%p{%d,%d})",
+	kdebug("validate_creds_for_do_exit(%p,%p{%ld,%d})",
 	       tsk->real_cred, tsk->cred,
-	       atomic_read(&tsk->cred->usage),
+	       atomic_long_read(&tsk->cred->usage),
 	       read_cred_subscribers(tsk->cred));
 
 	__validate_process_creds(tsk, __FILE__, __LINE__);
-- 
cgit v1.2.3


From ae1914174a63a558113e80d24ccac2773f9f7b2b Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Fri, 15 Dec 2023 13:40:57 -0700
Subject: cred: get rid of CONFIG_DEBUG_CREDENTIALS

This code is rarely (never?) enabled by distros, and it hasn't caught
anything in decades. Let's kill off this legacy debug code.

Suggested-by: Linus Torvalds <torvalds@linuxfoundation.org>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/powerpc/configs/skiroot_defconfig    |   1 -
 arch/s390/configs/debug_defconfig         |   1 -
 fs/nfsd/auth.c                            |   4 -
 fs/nfsd/nfssvc.c                          |   1 -
 fs/nfsd/vfs.c                             |   9 +-
 fs/open.c                                 |   3 -
 include/linux/cred.h                      |  50 -------
 kernel/cred.c                             | 231 +++---------------------------
 kernel/exit.c                             |   3 -
 lib/Kconfig.debug                         |  15 --
 net/sunrpc/auth.c                         |   3 -
 security/selinux/hooks.c                  |   6 -
 tools/objtool/noreturns.h                 |   1 -
 tools/testing/selftests/bpf/config.x86_64 |   1 -
 tools/testing/selftests/hid/config.common |   1 -
 15 files changed, 17 insertions(+), 313 deletions(-)

(limited to 'include/linux')

diff --git a/arch/powerpc/configs/skiroot_defconfig b/arch/powerpc/configs/skiroot_defconfig
index 8d3eacb50d56..9d44e6630908 100644
--- a/arch/powerpc/configs/skiroot_defconfig
+++ b/arch/powerpc/configs/skiroot_defconfig
@@ -301,7 +301,6 @@ CONFIG_WQ_WATCHDOG=y
 CONFIG_DEBUG_SG=y
 CONFIG_DEBUG_NOTIFIERS=y
 CONFIG_BUG_ON_DATA_CORRUPTION=y
-CONFIG_DEBUG_CREDENTIALS=y
 # CONFIG_FTRACE is not set
 CONFIG_XMON=y
 # CONFIG_RUNTIME_TESTING_MENU is not set
diff --git a/arch/s390/configs/debug_defconfig b/arch/s390/configs/debug_defconfig
index 438cd92e6080..dd0608629310 100644
--- a/arch/s390/configs/debug_defconfig
+++ b/arch/s390/configs/debug_defconfig
@@ -834,7 +834,6 @@ CONFIG_DEBUG_IRQFLAGS=y
 CONFIG_DEBUG_LIST=y
 CONFIG_DEBUG_SG=y
 CONFIG_DEBUG_NOTIFIERS=y
-CONFIG_DEBUG_CREDENTIALS=y
 CONFIG_RCU_TORTURE_TEST=m
 CONFIG_RCU_REF_SCALE_TEST=m
 CONFIG_RCU_CPU_STALL_TIMEOUT=300
diff --git a/fs/nfsd/auth.c b/fs/nfsd/auth.c
index fdf2aad73470..e6beaaf4f170 100644
--- a/fs/nfsd/auth.c
+++ b/fs/nfsd/auth.c
@@ -26,8 +26,6 @@ int nfsd_setuser(struct svc_rqst *rqstp, struct svc_export *exp)
 	int i;
 	int flags = nfsexp_flags(rqstp, exp);
 
-	validate_process_creds();
-
 	/* discard any old override before preparing the new set */
 	revert_creds(get_cred(current_real_cred()));
 	new = prepare_creds();
@@ -81,10 +79,8 @@ int nfsd_setuser(struct svc_rqst *rqstp, struct svc_export *exp)
 	else
 		new->cap_effective = cap_raise_nfsd_set(new->cap_effective,
 							new->cap_permitted);
-	validate_process_creds();
 	put_cred(override_creds(new));
 	put_cred(new);
-	validate_process_creds();
 	return 0;
 
 oom:
diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c
index fe61d9bbcc1f..5014ab87d313 100644
--- a/fs/nfsd/nfssvc.c
+++ b/fs/nfsd/nfssvc.c
@@ -955,7 +955,6 @@ nfsd(void *vrqstp)
 		rqstp->rq_server->sv_maxconn = nn->max_connections;
 
 		svc_recv(rqstp);
-		validate_process_creds();
 	}
 
 	atomic_dec(&nfsdstats.th_cnt);
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index fbbea7498f02..e01e4e2acbd9 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -901,7 +901,6 @@ nfsd_open(struct svc_rqst *rqstp, struct svc_fh *fhp, umode_t type,
 	int host_err;
 	bool retried = false;
 
-	validate_process_creds();
 	/*
 	 * If we get here, then the client has already done an "open",
 	 * and (hopefully) checked permission - so allow OWNER_OVERRIDE
@@ -926,7 +925,6 @@ retry:
 		}
 		err = nfserrno(host_err);
 	}
-	validate_process_creds();
 	return err;
 }
 
@@ -943,12 +941,7 @@ int
 nfsd_open_verified(struct svc_rqst *rqstp, struct svc_fh *fhp, int may_flags,
 		   struct file **filp)
 {
-	int err;
-
-	validate_process_creds();
-	err = __nfsd_open(rqstp, fhp, S_IFREG, may_flags, filp);
-	validate_process_creds();
-	return err;
+	return __nfsd_open(rqstp, fhp, S_IFREG, may_flags, filp);
 }
 
 /*
diff --git a/fs/open.c b/fs/open.c
index 02dc608d40d8..3494a9cd8046 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -1088,8 +1088,6 @@ struct file *dentry_open(const struct path *path, int flags,
 	int error;
 	struct file *f;
 
-	validate_creds(cred);
-
 	/* We must always pass in a valid mount pointer. */
 	BUG_ON(!path->mnt);
 
@@ -1128,7 +1126,6 @@ struct file *dentry_create(const struct path *path, int flags, umode_t mode,
 	struct file *f;
 	int error;
 
-	validate_creds(cred);
 	f = alloc_empty_file(flags, cred);
 	if (IS_ERR(f))
 		return f;
diff --git a/include/linux/cred.h b/include/linux/cred.h
index a3383f8efb8f..2976f534a7a3 100644
--- a/include/linux/cred.h
+++ b/include/linux/cred.h
@@ -110,13 +110,6 @@ static inline int groups_search(const struct group_info *group_info, kgid_t grp)
  */
 struct cred {
 	atomic_long_t	usage;
-#ifdef CONFIG_DEBUG_CREDENTIALS
-	atomic_t	subscribers;	/* number of processes subscribed */
-	void		*put_addr;
-	unsigned	magic;
-#define CRED_MAGIC	0x43736564
-#define CRED_MAGIC_DEAD	0x44656144
-#endif
 	kuid_t		uid;		/* real UID of the task */
 	kgid_t		gid;		/* real GID of the task */
 	kuid_t		suid;		/* saved UID of the task */
@@ -172,46 +165,6 @@ extern int cred_fscmp(const struct cred *, const struct cred *);
 extern void __init cred_init(void);
 extern int set_cred_ucounts(struct cred *);
 
-/*
- * check for validity of credentials
- */
-#ifdef CONFIG_DEBUG_CREDENTIALS
-extern void __noreturn __invalid_creds(const struct cred *, const char *, unsigned);
-extern void __validate_process_creds(struct task_struct *,
-				     const char *, unsigned);
-
-extern bool creds_are_invalid(const struct cred *cred);
-
-static inline void __validate_creds(const struct cred *cred,
-				    const char *file, unsigned line)
-{
-	if (unlikely(creds_are_invalid(cred)))
-		__invalid_creds(cred, file, line);
-}
-
-#define validate_creds(cred)				\
-do {							\
-	__validate_creds((cred), __FILE__, __LINE__);	\
-} while(0)
-
-#define validate_process_creds()				\
-do {								\
-	__validate_process_creds(current, __FILE__, __LINE__);	\
-} while(0)
-
-extern void validate_creds_for_do_exit(struct task_struct *);
-#else
-static inline void validate_creds(const struct cred *cred)
-{
-}
-static inline void validate_creds_for_do_exit(struct task_struct *tsk)
-{
-}
-static inline void validate_process_creds(void)
-{
-}
-#endif
-
 static inline bool cap_ambient_invariant_ok(const struct cred *cred)
 {
 	return cap_issubset(cred->cap_ambient,
@@ -264,7 +217,6 @@ static inline const struct cred *get_cred_many(const struct cred *cred, int nr)
 	struct cred *nonconst_cred = (struct cred *) cred;
 	if (!cred)
 		return cred;
-	validate_creds(cred);
 	nonconst_cred->non_rcu = 0;
 	return get_new_cred_many(nonconst_cred, nr);
 }
@@ -290,7 +242,6 @@ static inline const struct cred *get_cred_rcu(const struct cred *cred)
 		return NULL;
 	if (!atomic_long_inc_not_zero(&nonconst_cred->usage))
 		return NULL;
-	validate_creds(cred);
 	nonconst_cred->non_rcu = 0;
 	return cred;
 }
@@ -312,7 +263,6 @@ static inline void put_cred_many(const struct cred *_cred, int nr)
 	struct cred *cred = (struct cred *) _cred;
 
 	if (cred) {
-		validate_creds(cred);
 		if (atomic_long_sub_and_test(nr, &cred->usage))
 			__put_cred(cred);
 	}
diff --git a/kernel/cred.c b/kernel/cred.c
index 4a6cd0f0fef5..c033a201c808 100644
--- a/kernel/cred.c
+++ b/kernel/cred.c
@@ -43,10 +43,6 @@ static struct group_info init_groups = { .usage = REFCOUNT_INIT(2) };
  */
 struct cred init_cred = {
 	.usage			= ATOMIC_INIT(4),
-#ifdef CONFIG_DEBUG_CREDENTIALS
-	.subscribers		= ATOMIC_INIT(2),
-	.magic			= CRED_MAGIC,
-#endif
 	.uid			= GLOBAL_ROOT_UID,
 	.gid			= GLOBAL_ROOT_GID,
 	.suid			= GLOBAL_ROOT_UID,
@@ -66,31 +62,6 @@ struct cred init_cred = {
 	.ucounts		= &init_ucounts,
 };
 
-static inline void set_cred_subscribers(struct cred *cred, int n)
-{
-#ifdef CONFIG_DEBUG_CREDENTIALS
-	atomic_set(&cred->subscribers, n);
-#endif
-}
-
-static inline int read_cred_subscribers(const struct cred *cred)
-{
-#ifdef CONFIG_DEBUG_CREDENTIALS
-	return atomic_read(&cred->subscribers);
-#else
-	return 0;
-#endif
-}
-
-static inline void alter_cred_subscribers(const struct cred *_cred, int n)
-{
-#ifdef CONFIG_DEBUG_CREDENTIALS
-	struct cred *cred = (struct cred *) _cred;
-
-	atomic_add(n, &cred->subscribers);
-#endif
-}
-
 /*
  * The RCU callback to actually dispose of a set of credentials
  */
@@ -100,20 +71,9 @@ static void put_cred_rcu(struct rcu_head *rcu)
 
 	kdebug("put_cred_rcu(%p)", cred);
 
-#ifdef CONFIG_DEBUG_CREDENTIALS
-	if (cred->magic != CRED_MAGIC_DEAD ||
-	    atomic_long_read(&cred->usage) != 0 ||
-	    read_cred_subscribers(cred) != 0)
-		panic("CRED: put_cred_rcu() sees %p with"
-		      " mag %x, put %p, usage %ld, subscr %d\n",
-		      cred, cred->magic, cred->put_addr,
-		      atomic_long_read(&cred->usage),
-		      read_cred_subscribers(cred));
-#else
 	if (atomic_long_read(&cred->usage) != 0)
 		panic("CRED: put_cred_rcu() sees %p with usage %ld\n",
 		      cred, atomic_long_read(&cred->usage));
-#endif
 
 	security_cred_free(cred);
 	key_put(cred->session_keyring);
@@ -137,16 +97,10 @@ static void put_cred_rcu(struct rcu_head *rcu)
  */
 void __put_cred(struct cred *cred)
 {
-	kdebug("__put_cred(%p{%ld,%d})", cred,
-	       atomic_long_read(&cred->usage),
-	       read_cred_subscribers(cred));
+	kdebug("__put_cred(%p{%ld})", cred,
+	       atomic_long_read(&cred->usage));
 
 	BUG_ON(atomic_long_read(&cred->usage) != 0);
-#ifdef CONFIG_DEBUG_CREDENTIALS
-	BUG_ON(read_cred_subscribers(cred) != 0);
-	cred->magic = CRED_MAGIC_DEAD;
-	cred->put_addr = __builtin_return_address(0);
-#endif
 	BUG_ON(cred == current->cred);
 	BUG_ON(cred == current->real_cred);
 
@@ -164,9 +118,8 @@ void exit_creds(struct task_struct *tsk)
 {
 	struct cred *real_cred, *cred;
 
-	kdebug("exit_creds(%u,%p,%p,{%ld,%d})", tsk->pid, tsk->real_cred, tsk->cred,
-	       atomic_long_read(&tsk->cred->usage),
-	       read_cred_subscribers(tsk->cred));
+	kdebug("exit_creds(%u,%p,%p,{%ld})", tsk->pid, tsk->real_cred, tsk->cred,
+	       atomic_long_read(&tsk->cred->usage));
 
 	real_cred = (struct cred *) tsk->real_cred;
 	tsk->real_cred = NULL;
@@ -174,15 +127,10 @@ void exit_creds(struct task_struct *tsk)
 	cred = (struct cred *) tsk->cred;
 	tsk->cred = NULL;
 
-	validate_creds(cred);
 	if (real_cred == cred) {
-		alter_cred_subscribers(cred, -2);
 		put_cred_many(cred, 2);
 	} else {
-		validate_creds(real_cred);
-		alter_cred_subscribers(real_cred, -1);
 		put_cred(real_cred);
-		alter_cred_subscribers(cred, -1);
 		put_cred(cred);
 	}
 
@@ -231,9 +179,6 @@ struct cred *cred_alloc_blank(void)
 		return NULL;
 
 	atomic_long_set(&new->usage, 1);
-#ifdef CONFIG_DEBUG_CREDENTIALS
-	new->magic = CRED_MAGIC;
-#endif
 	if (security_cred_alloc_blank(new, GFP_KERNEL_ACCOUNT) < 0)
 		goto error;
 
@@ -264,8 +209,6 @@ struct cred *prepare_creds(void)
 	const struct cred *old;
 	struct cred *new;
 
-	validate_process_creds();
-
 	new = kmem_cache_alloc(cred_jar, GFP_KERNEL);
 	if (!new)
 		return NULL;
@@ -277,7 +220,6 @@ struct cred *prepare_creds(void)
 
 	new->non_rcu = 0;
 	atomic_long_set(&new->usage, 1);
-	set_cred_subscribers(new, 0);
 	get_group_info(new->group_info);
 	get_uid(new->user);
 	get_user_ns(new->user_ns);
@@ -300,7 +242,6 @@ struct cred *prepare_creds(void)
 	if (security_prepare_creds(new, old, GFP_KERNEL_ACCOUNT) < 0)
 		goto error;
 
-	validate_creds(new);
 	return new;
 
 error:
@@ -362,10 +303,8 @@ int copy_creds(struct task_struct *p, unsigned long clone_flags)
 		clone_flags & CLONE_THREAD
 	    ) {
 		p->real_cred = get_cred_many(p->cred, 2);
-		alter_cred_subscribers(p->cred, 2);
-		kdebug("share_creds(%p{%ld,%d})",
-		       p->cred, atomic_long_read(&p->cred->usage),
-		       read_cred_subscribers(p->cred));
+		kdebug("share_creds(%p{%ld})",
+		       p->cred, atomic_long_read(&p->cred->usage));
 		inc_rlimit_ucounts(task_ucounts(p), UCOUNT_RLIMIT_NPROC, 1);
 		return 0;
 	}
@@ -404,8 +343,6 @@ int copy_creds(struct task_struct *p, unsigned long clone_flags)
 
 	p->cred = p->real_cred = get_cred(new);
 	inc_rlimit_ucounts(task_ucounts(p), UCOUNT_RLIMIT_NPROC, 1);
-	alter_cred_subscribers(new, 2);
-	validate_creds(new);
 	return 0;
 
 error_put:
@@ -457,16 +394,10 @@ int commit_creds(struct cred *new)
 	struct task_struct *task = current;
 	const struct cred *old = task->real_cred;
 
-	kdebug("commit_creds(%p{%ld,%d})", new,
-	       atomic_long_read(&new->usage),
-	       read_cred_subscribers(new));
+	kdebug("commit_creds(%p{%ld})", new,
+	       atomic_long_read(&new->usage));
 
 	BUG_ON(task->cred != old);
-#ifdef CONFIG_DEBUG_CREDENTIALS
-	BUG_ON(read_cred_subscribers(old) < 2);
-	validate_creds(old);
-	validate_creds(new);
-#endif
 	BUG_ON(atomic_long_read(&new->usage) < 1);
 
 	get_cred(new); /* we will require a ref for the subj creds too */
@@ -502,14 +433,12 @@ int commit_creds(struct cred *new)
 	 * RLIMIT_NPROC limits on user->processes have already been checked
 	 * in set_user().
 	 */
-	alter_cred_subscribers(new, 2);
 	if (new->user != old->user || new->user_ns != old->user_ns)
 		inc_rlimit_ucounts(new->ucounts, UCOUNT_RLIMIT_NPROC, 1);
 	rcu_assign_pointer(task->real_cred, new);
 	rcu_assign_pointer(task->cred, new);
 	if (new->user != old->user || new->user_ns != old->user_ns)
 		dec_rlimit_ucounts(old->ucounts, UCOUNT_RLIMIT_NPROC, 1);
-	alter_cred_subscribers(old, -2);
 
 	/* send notifications */
 	if (!uid_eq(new->uid,   old->uid)  ||
@@ -539,13 +468,9 @@ EXPORT_SYMBOL(commit_creds);
  */
 void abort_creds(struct cred *new)
 {
-	kdebug("abort_creds(%p{%ld,%d})", new,
-	       atomic_long_read(&new->usage),
-	       read_cred_subscribers(new));
+	kdebug("abort_creds(%p{%ld})", new,
+	       atomic_long_read(&new->usage));
 
-#ifdef CONFIG_DEBUG_CREDENTIALS
-	BUG_ON(read_cred_subscribers(new) != 0);
-#endif
 	BUG_ON(atomic_long_read(&new->usage) < 1);
 	put_cred(new);
 }
@@ -562,12 +487,8 @@ const struct cred *override_creds(const struct cred *new)
 {
 	const struct cred *old = current->cred;
 
-	kdebug("override_creds(%p{%ld,%d})", new,
-	       atomic_long_read(&new->usage),
-	       read_cred_subscribers(new));
-
-	validate_creds(old);
-	validate_creds(new);
+	kdebug("override_creds(%p{%ld})", new,
+	       atomic_long_read(&new->usage));
 
 	/*
 	 * NOTE! This uses 'get_new_cred()' rather than 'get_cred()'.
@@ -576,18 +497,12 @@ const struct cred *override_creds(const struct cred *new)
 	 * we are only installing the cred into the thread-synchronous
 	 * '->cred' pointer, not the '->real_cred' pointer that is
 	 * visible to other threads under RCU.
-	 *
-	 * Also note that we did validate_creds() manually, not depending
-	 * on the validation in 'get_cred()'.
 	 */
 	get_new_cred((struct cred *)new);
-	alter_cred_subscribers(new, 1);
 	rcu_assign_pointer(current->cred, new);
-	alter_cred_subscribers(old, -1);
 
-	kdebug("override_creds() = %p{%ld,%d}", old,
-	       atomic_long_read(&old->usage),
-	       read_cred_subscribers(old));
+	kdebug("override_creds() = %p{%ld}", old,
+	       atomic_long_read(&old->usage));
 	return old;
 }
 EXPORT_SYMBOL(override_creds);
@@ -603,15 +518,10 @@ void revert_creds(const struct cred *old)
 {
 	const struct cred *override = current->cred;
 
-	kdebug("revert_creds(%p{%ld,%d})", old,
-	       atomic_long_read(&old->usage),
-	       read_cred_subscribers(old));
+	kdebug("revert_creds(%p{%ld})", old,
+	       atomic_long_read(&old->usage));
 
-	validate_creds(old);
-	validate_creds(override);
-	alter_cred_subscribers(old, 1);
 	rcu_assign_pointer(current->cred, old);
-	alter_cred_subscribers(override, -1);
 	put_cred(override);
 }
 EXPORT_SYMBOL(revert_creds);
@@ -731,12 +641,10 @@ struct cred *prepare_kernel_cred(struct task_struct *daemon)
 	kdebug("prepare_kernel_cred() alloc %p", new);
 
 	old = get_task_cred(daemon);
-	validate_creds(old);
 
 	*new = *old;
 	new->non_rcu = 0;
 	atomic_long_set(&new->usage, 1);
-	set_cred_subscribers(new, 0);
 	get_uid(new->user);
 	get_user_ns(new->user_ns);
 	get_group_info(new->group_info);
@@ -760,7 +668,6 @@ struct cred *prepare_kernel_cred(struct task_struct *daemon)
 		goto error;
 
 	put_cred(old);
-	validate_creds(new);
 	return new;
 
 error:
@@ -825,109 +732,3 @@ int set_create_files_as(struct cred *new, struct inode *inode)
 	return security_kernel_create_files_as(new, inode);
 }
 EXPORT_SYMBOL(set_create_files_as);
-
-#ifdef CONFIG_DEBUG_CREDENTIALS
-
-bool creds_are_invalid(const struct cred *cred)
-{
-	if (cred->magic != CRED_MAGIC)
-		return true;
-	return false;
-}
-EXPORT_SYMBOL(creds_are_invalid);
-
-/*
- * dump invalid credentials
- */
-static void dump_invalid_creds(const struct cred *cred, const char *label,
-			       const struct task_struct *tsk)
-{
-	pr_err("%s credentials: %p %s%s%s\n",
-	       label, cred,
-	       cred == &init_cred ? "[init]" : "",
-	       cred == tsk->real_cred ? "[real]" : "",
-	       cred == tsk->cred ? "[eff]" : "");
-	pr_err("->magic=%x, put_addr=%p\n",
-	       cred->magic, cred->put_addr);
-	pr_err("->usage=%ld, subscr=%d\n",
-	       atomic_long_read(&cred->usage),
-	       read_cred_subscribers(cred));
-	pr_err("->*uid = { %d,%d,%d,%d }\n",
-		from_kuid_munged(&init_user_ns, cred->uid),
-		from_kuid_munged(&init_user_ns, cred->euid),
-		from_kuid_munged(&init_user_ns, cred->suid),
-		from_kuid_munged(&init_user_ns, cred->fsuid));
-	pr_err("->*gid = { %d,%d,%d,%d }\n",
-		from_kgid_munged(&init_user_ns, cred->gid),
-		from_kgid_munged(&init_user_ns, cred->egid),
-		from_kgid_munged(&init_user_ns, cred->sgid),
-		from_kgid_munged(&init_user_ns, cred->fsgid));
-#ifdef CONFIG_SECURITY
-	pr_err("->security is %p\n", cred->security);
-	if ((unsigned long) cred->security >= PAGE_SIZE &&
-	    (((unsigned long) cred->security & 0xffffff00) !=
-	     (POISON_FREE << 24 | POISON_FREE << 16 | POISON_FREE << 8)))
-		pr_err("->security {%x, %x}\n",
-		       ((u32*)cred->security)[0],
-		       ((u32*)cred->security)[1]);
-#endif
-}
-
-/*
- * report use of invalid credentials
- */
-void __noreturn __invalid_creds(const struct cred *cred, const char *file, unsigned line)
-{
-	pr_err("Invalid credentials\n");
-	pr_err("At %s:%u\n", file, line);
-	dump_invalid_creds(cred, "Specified", current);
-	BUG();
-}
-EXPORT_SYMBOL(__invalid_creds);
-
-/*
- * check the credentials on a process
- */
-void __validate_process_creds(struct task_struct *tsk,
-			      const char *file, unsigned line)
-{
-	if (tsk->cred == tsk->real_cred) {
-		if (unlikely(read_cred_subscribers(tsk->cred) < 2 ||
-			     creds_are_invalid(tsk->cred)))
-			goto invalid_creds;
-	} else {
-		if (unlikely(read_cred_subscribers(tsk->real_cred) < 1 ||
-			     read_cred_subscribers(tsk->cred) < 1 ||
-			     creds_are_invalid(tsk->real_cred) ||
-			     creds_are_invalid(tsk->cred)))
-			goto invalid_creds;
-	}
-	return;
-
-invalid_creds:
-	pr_err("Invalid process credentials\n");
-	pr_err("At %s:%u\n", file, line);
-
-	dump_invalid_creds(tsk->real_cred, "Real", tsk);
-	if (tsk->cred != tsk->real_cred)
-		dump_invalid_creds(tsk->cred, "Effective", tsk);
-	else
-		pr_err("Effective creds == Real creds\n");
-	BUG();
-}
-EXPORT_SYMBOL(__validate_process_creds);
-
-/*
- * check creds for do_exit()
- */
-void validate_creds_for_do_exit(struct task_struct *tsk)
-{
-	kdebug("validate_creds_for_do_exit(%p,%p{%ld,%d})",
-	       tsk->real_cred, tsk->cred,
-	       atomic_long_read(&tsk->cred->usage),
-	       read_cred_subscribers(tsk->cred));
-
-	__validate_process_creds(tsk, __FILE__, __LINE__);
-}
-
-#endif /* CONFIG_DEBUG_CREDENTIALS */
diff --git a/kernel/exit.c b/kernel/exit.c
index ee9f43bed49a..aedc0832c9f4 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -824,8 +824,6 @@ void __noreturn do_exit(long code)
 	ptrace_event(PTRACE_EVENT_EXIT, code);
 	user_events_exit(tsk);
 
-	validate_creds_for_do_exit(tsk);
-
 	io_uring_files_cancel();
 	exit_signals(tsk);  /* sets PF_EXITING */
 
@@ -909,7 +907,6 @@ void __noreturn do_exit(long code)
 	if (tsk->task_frag.page)
 		put_page(tsk->task_frag.page);
 
-	validate_creds_for_do_exit(tsk);
 	exit_task_stack_account(tsk);
 
 	check_stack_usage();
diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
index cc7d53d9dc01..4405f81248fb 100644
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -1739,21 +1739,6 @@ config DEBUG_MAPLE_TREE
 
 endmenu
 
-config DEBUG_CREDENTIALS
-	bool "Debug credential management"
-	depends on DEBUG_KERNEL
-	help
-	  Enable this to turn on some debug checking for credential
-	  management.  The additional code keeps track of the number of
-	  pointers from task_structs to any given cred struct, and checks to
-	  see that this number never exceeds the usage count of the cred
-	  struct.
-
-	  Furthermore, if SELinux is enabled, this also checks that the
-	  security pointer in the cred struct is never seen to be invalid.
-
-	  If unsure, say N.
-
 source "kernel/rcu/Kconfig.debug"
 
 config DEBUG_WQ_FORCE_RR_CPU
diff --git a/net/sunrpc/auth.c b/net/sunrpc/auth.c
index 7bfe7d9a32aa..04534ea537c8 100644
--- a/net/sunrpc/auth.c
+++ b/net/sunrpc/auth.c
@@ -40,9 +40,6 @@ static unsigned long number_cred_unused;
 
 static struct cred machine_cred = {
 	.usage = ATOMIC_INIT(1),
-#ifdef CONFIG_DEBUG_CREDENTIALS
-	.magic = CRED_MAGIC,
-#endif
 };
 
 /*
diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c
index feda711c6b7b..340b2bbbb2dd 100644
--- a/security/selinux/hooks.c
+++ b/security/selinux/hooks.c
@@ -1660,8 +1660,6 @@ static int inode_has_perm(const struct cred *cred,
 	struct inode_security_struct *isec;
 	u32 sid;
 
-	validate_creds(cred);
-
 	if (unlikely(IS_PRIVATE(inode)))
 		return 0;
 
@@ -3056,8 +3054,6 @@ static int selinux_inode_follow_link(struct dentry *dentry, struct inode *inode,
 	struct inode_security_struct *isec;
 	u32 sid;
 
-	validate_creds(cred);
-
 	ad.type = LSM_AUDIT_DATA_DENTRY;
 	ad.u.dentry = dentry;
 	sid = cred_sid(cred);
@@ -3101,8 +3097,6 @@ static int selinux_inode_permission(struct inode *inode, int mask)
 	if (!mask)
 		return 0;
 
-	validate_creds(cred);
-
 	if (unlikely(IS_PRIVATE(inode)))
 		return 0;
 
diff --git a/tools/objtool/noreturns.h b/tools/objtool/noreturns.h
index 649ebdef9c3f..1685d7ea6a9f 100644
--- a/tools/objtool/noreturns.h
+++ b/tools/objtool/noreturns.h
@@ -6,7 +6,6 @@
  *
  * Yes, this is unfortunate.  A better solution is in the works.
  */
-NORETURN(__invalid_creds)
 NORETURN(__kunit_abort)
 NORETURN(__module_put_and_kthread_exit)
 NORETURN(__reiserfs_panic)
diff --git a/tools/testing/selftests/bpf/config.x86_64 b/tools/testing/selftests/bpf/config.x86_64
index 2e70a6048278..49a29dbc1910 100644
--- a/tools/testing/selftests/bpf/config.x86_64
+++ b/tools/testing/selftests/bpf/config.x86_64
@@ -50,7 +50,6 @@ CONFIG_CRYPTO_SEQIV=y
 CONFIG_CRYPTO_XXHASH=y
 CONFIG_DCB=y
 CONFIG_DEBUG_ATOMIC_SLEEP=y
-CONFIG_DEBUG_CREDENTIALS=y
 CONFIG_DEBUG_INFO_BTF=y
 CONFIG_DEBUG_INFO_DWARF_TOOLCHAIN_DEFAULT=y
 CONFIG_DEBUG_MEMORY_INIT=y
diff --git a/tools/testing/selftests/hid/config.common b/tools/testing/selftests/hid/config.common
index 0617275d93cc..0f456dbab62f 100644
--- a/tools/testing/selftests/hid/config.common
+++ b/tools/testing/selftests/hid/config.common
@@ -46,7 +46,6 @@ CONFIG_CRYPTO_SEQIV=y
 CONFIG_CRYPTO_XXHASH=y
 CONFIG_DCB=y
 CONFIG_DEBUG_ATOMIC_SLEEP=y
-CONFIG_DEBUG_CREDENTIALS=y
 CONFIG_DEBUG_INFO_DWARF_TOOLCHAIN_DEFAULT=y
 CONFIG_DEBUG_MEMORY_INIT=y
 CONFIG_DEFAULT_FQ_CODEL=y
-- 
cgit v1.2.3


From 4382159696c9af67ee047ed55f2dbf05480f52f6 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Fri, 15 Dec 2023 10:12:17 +0100
Subject: cfi: Flip headers

Normal include order is that linux/foo.h should include asm/foo.h, CFI has it
the wrong way around.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Sami Tolvanen <samitolvanen@google.com>
Link: https://lore.kernel.org/r/20231215092707.231038174@infradead.org
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 arch/riscv/include/asm/cfi.h | 3 ++-
 arch/riscv/kernel/cfi.c      | 2 +-
 arch/x86/include/asm/cfi.h   | 3 ++-
 arch/x86/kernel/cfi.c        | 4 ++--
 include/asm-generic/Kbuild   | 1 +
 include/asm-generic/cfi.h    | 5 +++++
 include/linux/cfi.h          | 1 +
 7 files changed, 14 insertions(+), 5 deletions(-)
 create mode 100644 include/asm-generic/cfi.h

(limited to 'include/linux')

diff --git a/arch/riscv/include/asm/cfi.h b/arch/riscv/include/asm/cfi.h
index 56bf9d69d5e3..8f7a62257044 100644
--- a/arch/riscv/include/asm/cfi.h
+++ b/arch/riscv/include/asm/cfi.h
@@ -7,8 +7,9 @@
  *
  * Copyright (C) 2023 Google LLC
  */
+#include <linux/bug.h>
 
-#include <linux/cfi.h>
+struct pt_regs;
 
 #ifdef CONFIG_CFI_CLANG
 enum bug_trap_type handle_cfi_failure(struct pt_regs *regs);
diff --git a/arch/riscv/kernel/cfi.c b/arch/riscv/kernel/cfi.c
index 820158d7a291..6ec9dbd7292e 100644
--- a/arch/riscv/kernel/cfi.c
+++ b/arch/riscv/kernel/cfi.c
@@ -4,7 +4,7 @@
  *
  * Copyright (C) 2023 Google LLC
  */
-#include <asm/cfi.h>
+#include <linux/cfi.h>
 #include <asm/insn.h>
 
 /*
diff --git a/arch/x86/include/asm/cfi.h b/arch/x86/include/asm/cfi.h
index 58dacd90daef..2a494643089d 100644
--- a/arch/x86/include/asm/cfi.h
+++ b/arch/x86/include/asm/cfi.h
@@ -7,8 +7,9 @@
  *
  * Copyright (C) 2022 Google LLC
  */
+#include <linux/bug.h>
 
-#include <linux/cfi.h>
+struct pt_regs;
 
 #ifdef CONFIG_CFI_CLANG
 enum bug_trap_type handle_cfi_failure(struct pt_regs *regs);
diff --git a/arch/x86/kernel/cfi.c b/arch/x86/kernel/cfi.c
index 8674a5c0c031..e6bf78fac146 100644
--- a/arch/x86/kernel/cfi.c
+++ b/arch/x86/kernel/cfi.c
@@ -4,10 +4,10 @@
  *
  * Copyright (C) 2022 Google LLC
  */
-#include <asm/cfi.h>
+#include <linux/string.h>
+#include <linux/cfi.h>
 #include <asm/insn.h>
 #include <asm/insn-eval.h>
-#include <linux/string.h>
 
 /*
  * Returns the target address and the expected type when regs->ip points
diff --git a/include/asm-generic/Kbuild b/include/asm-generic/Kbuild
index def242528b1d..d436bee4d129 100644
--- a/include/asm-generic/Kbuild
+++ b/include/asm-generic/Kbuild
@@ -11,6 +11,7 @@ mandatory-y += bitops.h
 mandatory-y += bug.h
 mandatory-y += bugs.h
 mandatory-y += cacheflush.h
+mandatory-y += cfi.h
 mandatory-y += checksum.h
 mandatory-y += compat.h
 mandatory-y += current.h
diff --git a/include/asm-generic/cfi.h b/include/asm-generic/cfi.h
new file mode 100644
index 000000000000..41fac3537bf9
--- /dev/null
+++ b/include/asm-generic/cfi.h
@@ -0,0 +1,5 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef __ASM_GENERIC_CFI_H
+#define __ASM_GENERIC_CFI_H
+
+#endif /* __ASM_GENERIC_CFI_H */
diff --git a/include/linux/cfi.h b/include/linux/cfi.h
index 3552ec82b725..2309d74e77e6 100644
--- a/include/linux/cfi.h
+++ b/include/linux/cfi.h
@@ -9,6 +9,7 @@
 
 #include <linux/bug.h>
 #include <linux/module.h>
+#include <asm/cfi.h>
 
 #ifdef CONFIG_CFI_CLANG
 enum bug_trap_type report_cfi_failure(struct pt_regs *regs, unsigned long addr,
-- 
cgit v1.2.3


From 4f9087f16651aca4a5f32da840a53f6660f0579a Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Fri, 15 Dec 2023 10:12:18 +0100
Subject: x86/cfi,bpf: Fix BPF JIT call

The current BPF call convention is __nocfi, except when it calls !JIT things,
then it calls regular C functions.

It so happens that with FineIBT the __nocfi and C calling conventions are
incompatible. Specifically __nocfi will call at func+0, while FineIBT will have
endbr-poison there, which is not a valid indirect target. Causing #CP.

Notably this only triggers on IBT enabled hardware, which is probably why this
hasn't been reported (also, most people will have JIT on anyway).

Implement proper CFI prologues for the BPF JIT codegen and drop __nocfi for
x86.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lore.kernel.org/r/20231215092707.345270396@infradead.org
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 arch/x86/include/asm/cfi.h    | 110 ++++++++++++++++++++++++++++++++++++++++++
 arch/x86/kernel/alternative.c |  47 +++++++++++++++---
 arch/x86/net/bpf_jit_comp.c   |  82 +++++++++++++++++++++++++++++--
 include/linux/bpf.h           |  12 ++++-
 include/linux/cfi.h           |   7 +++
 kernel/bpf/core.c             |  25 ++++++++++
 6 files changed, 269 insertions(+), 14 deletions(-)

(limited to 'include/linux')

diff --git a/arch/x86/include/asm/cfi.h b/arch/x86/include/asm/cfi.h
index 2a494643089d..7a7b0b823a98 100644
--- a/arch/x86/include/asm/cfi.h
+++ b/arch/x86/include/asm/cfi.h
@@ -9,15 +9,125 @@
  */
 #include <linux/bug.h>
 
+/*
+ * An overview of the various calling conventions...
+ *
+ * Traditional:
+ *
+ * foo:
+ *   ... code here ...
+ *   ret
+ *
+ * direct caller:
+ *   call foo
+ *
+ * indirect caller:
+ *   lea foo(%rip), %r11
+ *   ...
+ *   call *%r11
+ *
+ *
+ * IBT:
+ *
+ * foo:
+ *   endbr64
+ *   ... code here ...
+ *   ret
+ *
+ * direct caller:
+ *   call foo / call foo+4
+ *
+ * indirect caller:
+ *   lea foo(%rip), %r11
+ *   ...
+ *   call *%r11
+ *
+ *
+ * kCFI:
+ *
+ * __cfi_foo:
+ *   movl $0x12345678, %eax
+ *				# 11 nops when CONFIG_CALL_PADDING
+ * foo:
+ *   endbr64			# when IBT
+ *   ... code here ...
+ *   ret
+ *
+ * direct call:
+ *   call foo			# / call foo+4 when IBT
+ *
+ * indirect call:
+ *   lea foo(%rip), %r11
+ *   ...
+ *   movl $(-0x12345678), %r10d
+ *   addl -4(%r11), %r10d	# -15 when CONFIG_CALL_PADDING
+ *   jz   1f
+ *   ud2
+ * 1:call *%r11
+ *
+ *
+ * FineIBT (builds as kCFI + CALL_PADDING + IBT + RETPOLINE and runtime patches into):
+ *
+ * __cfi_foo:
+ *   endbr64
+ *   subl 0x12345678, %r10d
+ *   jz   foo
+ *   ud2
+ *   nop
+ * foo:
+ *   osp nop3			# was endbr64
+ *   ... code here ...
+ *   ret
+ *
+ * direct caller:
+ *   call foo / call foo+4
+ *
+ * indirect caller:
+ *   lea foo(%rip), %r11
+ *   ...
+ *   movl $0x12345678, %r10d
+ *   subl $16, %r11
+ *   nop4
+ *   call *%r11
+ *
+ */
+enum cfi_mode {
+	CFI_DEFAULT,	/* FineIBT if hardware has IBT, otherwise kCFI */
+	CFI_OFF,	/* Taditional / IBT depending on .config */
+	CFI_KCFI,	/* Optionally CALL_PADDING, IBT, RETPOLINE */
+	CFI_FINEIBT,	/* see arch/x86/kernel/alternative.c */
+};
+
+extern enum cfi_mode cfi_mode;
+
 struct pt_regs;
 
 #ifdef CONFIG_CFI_CLANG
 enum bug_trap_type handle_cfi_failure(struct pt_regs *regs);
+#define __bpfcall
+extern u32 cfi_bpf_hash;
+
+static inline int cfi_get_offset(void)
+{
+	switch (cfi_mode) {
+	case CFI_FINEIBT:
+		return 16;
+	case CFI_KCFI:
+		if (IS_ENABLED(CONFIG_CALL_PADDING))
+			return 16;
+		return 5;
+	default:
+		return 0;
+	}
+}
+#define cfi_get_offset cfi_get_offset
+
 #else
 static inline enum bug_trap_type handle_cfi_failure(struct pt_regs *regs)
 {
 	return BUG_TRAP_TYPE_NONE;
 }
+#define cfi_bpf_hash 0U
 #endif /* CONFIG_CFI_CLANG */
 
 #endif /* _ASM_X86_CFI_H */
diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c
index 73be3931e4f0..d808d3aaec7e 100644
--- a/arch/x86/kernel/alternative.c
+++ b/arch/x86/kernel/alternative.c
@@ -30,6 +30,7 @@
 #include <asm/fixmap.h>
 #include <asm/paravirt.h>
 #include <asm/asm-prototypes.h>
+#include <asm/cfi.h>
 
 int __read_mostly alternatives_patched;
 
@@ -832,15 +833,43 @@ void __init_or_module apply_seal_endbr(s32 *start, s32 *end) { }
 #endif /* CONFIG_X86_KERNEL_IBT */
 
 #ifdef CONFIG_FINEIBT
+#define __CFI_DEFAULT	CFI_DEFAULT
+#elif defined(CONFIG_CFI_CLANG)
+#define __CFI_DEFAULT	CFI_KCFI
+#else
+#define __CFI_DEFAULT	CFI_OFF
+#endif
 
-enum cfi_mode {
-	CFI_DEFAULT,
-	CFI_OFF,
-	CFI_KCFI,
-	CFI_FINEIBT,
-};
+enum cfi_mode cfi_mode __ro_after_init = __CFI_DEFAULT;
+
+#ifdef CONFIG_CFI_CLANG
+struct bpf_insn;
+
+/* Must match bpf_func_t / DEFINE_BPF_PROG_RUN() */
+extern unsigned int __bpf_prog_runX(const void *ctx,
+				    const struct bpf_insn *insn);
+
+/*
+ * Force a reference to the external symbol so the compiler generates
+ * __kcfi_typid.
+ */
+__ADDRESSABLE(__bpf_prog_runX);
+
+/* u32 __ro_after_init cfi_bpf_hash = __kcfi_typeid___bpf_prog_runX; */
+asm (
+"	.pushsection	.data..ro_after_init,\"aw\",@progbits	\n"
+"	.type	cfi_bpf_hash,@object				\n"
+"	.globl	cfi_bpf_hash					\n"
+"	.p2align	2, 0x0					\n"
+"cfi_bpf_hash:							\n"
+"	.long	__kcfi_typeid___bpf_prog_runX			\n"
+"	.size	cfi_bpf_hash, 4					\n"
+"	.popsection						\n"
+);
+#endif
+
+#ifdef CONFIG_FINEIBT
 
-static enum cfi_mode cfi_mode __ro_after_init = CFI_DEFAULT;
 static bool cfi_rand __ro_after_init = true;
 static u32  cfi_seed __ro_after_init;
 
@@ -1149,8 +1178,10 @@ static void __apply_fineibt(s32 *start_retpoline, s32 *end_retpoline,
 		goto err;
 
 	if (cfi_rand) {
-		if (builtin)
+		if (builtin) {
 			cfi_seed = get_random_u32();
+			cfi_bpf_hash = cfi_rehash(cfi_bpf_hash);
+		}
 
 		ret = cfi_rand_preamble(start_cfi, end_cfi);
 		if (ret)
diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c
index af4a5de7d93a..5d5b967b111d 100644
--- a/arch/x86/net/bpf_jit_comp.c
+++ b/arch/x86/net/bpf_jit_comp.c
@@ -17,6 +17,7 @@
 #include <asm/nospec-branch.h>
 #include <asm/text-patching.h>
 #include <asm/unwind.h>
+#include <asm/cfi.h>
 
 static bool all_callee_regs_used[4] = {true, true, true, true};
 
@@ -51,9 +52,11 @@ static u8 *emit_code(u8 *ptr, u32 bytes, unsigned int len)
 	do { EMIT4(b1, b2, b3, b4); EMIT(off, 4); } while (0)
 
 #ifdef CONFIG_X86_KERNEL_IBT
-#define EMIT_ENDBR()	EMIT(gen_endbr(), 4)
+#define EMIT_ENDBR()		EMIT(gen_endbr(), 4)
+#define EMIT_ENDBR_POISON()	EMIT(gen_endbr_poison(), 4)
 #else
 #define EMIT_ENDBR()
+#define EMIT_ENDBR_POISON()
 #endif
 
 static bool is_imm8(int value)
@@ -304,6 +307,69 @@ static void pop_callee_regs(u8 **pprog, bool *callee_regs_used)
 	*pprog = prog;
 }
 
+/*
+ * Emit the various CFI preambles, see asm/cfi.h and the comments about FineIBT
+ * in arch/x86/kernel/alternative.c
+ */
+
+static void emit_fineibt(u8 **pprog)
+{
+	u8 *prog = *pprog;
+
+	EMIT_ENDBR();
+	EMIT3_off32(0x41, 0x81, 0xea, cfi_bpf_hash);	/* subl $hash, %r10d	*/
+	EMIT2(0x74, 0x07);				/* jz.d8 +7		*/
+	EMIT2(0x0f, 0x0b);				/* ud2			*/
+	EMIT1(0x90);					/* nop			*/
+	EMIT_ENDBR_POISON();
+
+	*pprog = prog;
+}
+
+static void emit_kcfi(u8 **pprog)
+{
+	u8 *prog = *pprog;
+
+	EMIT1_off32(0xb8, cfi_bpf_hash);		/* movl $hash, %eax	*/
+#ifdef CONFIG_CALL_PADDING
+	EMIT1(0x90);
+	EMIT1(0x90);
+	EMIT1(0x90);
+	EMIT1(0x90);
+	EMIT1(0x90);
+	EMIT1(0x90);
+	EMIT1(0x90);
+	EMIT1(0x90);
+	EMIT1(0x90);
+	EMIT1(0x90);
+	EMIT1(0x90);
+#endif
+	EMIT_ENDBR();
+
+	*pprog = prog;
+}
+
+static void emit_cfi(u8 **pprog)
+{
+	u8 *prog = *pprog;
+
+	switch (cfi_mode) {
+	case CFI_FINEIBT:
+		emit_fineibt(&prog);
+		break;
+
+	case CFI_KCFI:
+		emit_kcfi(&prog);
+		break;
+
+	default:
+		EMIT_ENDBR();
+		break;
+	}
+
+	*pprog = prog;
+}
+
 /*
  * Emit x86-64 prologue code for BPF program.
  * bpf_tail_call helper will skip the first X86_TAIL_CALL_OFFSET bytes
@@ -315,10 +381,10 @@ static void emit_prologue(u8 **pprog, u32 stack_depth, bool ebpf_from_cbpf,
 {
 	u8 *prog = *pprog;
 
+	emit_cfi(&prog);
 	/* BPF trampoline can be made to work without these nops,
 	 * but let's waste 5 bytes for now and optimize later
 	 */
-	EMIT_ENDBR();
 	memcpy(prog, x86_nops[5], X86_PATCH_SIZE);
 	prog += X86_PATCH_SIZE;
 	if (!ebpf_from_cbpf) {
@@ -3013,9 +3079,16 @@ out_image:
 			jit_data->header = header;
 			jit_data->rw_header = rw_header;
 		}
-		prog->bpf_func = (void *)image;
+		/*
+		 * ctx.prog_offset is used when CFI preambles put code *before*
+		 * the function. See emit_cfi(). For FineIBT specifically this code
+		 * can also be executed and bpf_prog_kallsyms_add() will
+		 * generate an additional symbol to cover this, hence also
+		 * decrement proglen.
+		 */
+		prog->bpf_func = (void *)image + cfi_get_offset();
 		prog->jited = 1;
-		prog->jited_len = proglen;
+		prog->jited_len = proglen - cfi_get_offset();
 	} else {
 		prog = orig_prog;
 	}
@@ -3070,6 +3143,7 @@ void bpf_jit_free(struct bpf_prog *prog)
 			kvfree(jit_data->addrs);
 			kfree(jit_data);
 		}
+		prog->bpf_func = (void *)prog->bpf_func - cfi_get_offset();
 		hdr = bpf_jit_binary_pack_hdr(prog);
 		bpf_jit_binary_pack_free(hdr, NULL);
 		WARN_ON_ONCE(!bpf_prog_kallsyms_verify_off(prog));
diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index c87c608a3689..9d84c376851a 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -29,6 +29,7 @@
 #include <linux/rcupdate_trace.h>
 #include <linux/static_call.h>
 #include <linux/memcontrol.h>
+#include <linux/cfi.h>
 
 struct bpf_verifier_env;
 struct bpf_verifier_log;
@@ -1211,7 +1212,11 @@ struct bpf_dispatcher {
 #endif
 };
 
-static __always_inline __nocfi unsigned int bpf_dispatcher_nop_func(
+#ifndef __bpfcall
+#define __bpfcall __nocfi
+#endif
+
+static __always_inline __bpfcall unsigned int bpf_dispatcher_nop_func(
 	const void *ctx,
 	const struct bpf_insn *insnsi,
 	bpf_func_t bpf_func)
@@ -1303,7 +1308,7 @@ int arch_prepare_bpf_dispatcher(void *image, void *buf, s64 *funcs, int num_func
 
 #define DEFINE_BPF_DISPATCHER(name)					\
 	__BPF_DISPATCHER_SC(name);					\
-	noinline __nocfi unsigned int bpf_dispatcher_##name##_func(	\
+	noinline __bpfcall unsigned int bpf_dispatcher_##name##_func(	\
 		const void *ctx,					\
 		const struct bpf_insn *insnsi,				\
 		bpf_func_t bpf_func)					\
@@ -1453,6 +1458,9 @@ struct bpf_prog_aux {
 	struct bpf_kfunc_desc_tab *kfunc_tab;
 	struct bpf_kfunc_btf_tab *kfunc_btf_tab;
 	u32 size_poke_tab;
+#ifdef CONFIG_FINEIBT
+	struct bpf_ksym ksym_prefix;
+#endif
 	struct bpf_ksym ksym;
 	const struct bpf_prog_ops *ops;
 	struct bpf_map **used_maps;
diff --git a/include/linux/cfi.h b/include/linux/cfi.h
index 2309d74e77e6..1ed2d96c0cfc 100644
--- a/include/linux/cfi.h
+++ b/include/linux/cfi.h
@@ -11,6 +11,13 @@
 #include <linux/module.h>
 #include <asm/cfi.h>
 
+#ifndef cfi_get_offset
+static inline int cfi_get_offset(void)
+{
+	return 0;
+}
+#endif
+
 #ifdef CONFIG_CFI_CLANG
 enum bug_trap_type report_cfi_failure(struct pt_regs *regs, unsigned long addr,
 				      unsigned long *target, u32 type);
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index c34513d645c4..5aa6863ac33b 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -121,6 +121,9 @@ struct bpf_prog *bpf_prog_alloc_no_stats(unsigned int size, gfp_t gfp_extra_flag
 #endif
 
 	INIT_LIST_HEAD_RCU(&fp->aux->ksym.lnode);
+#ifdef CONFIG_FINEIBT
+	INIT_LIST_HEAD_RCU(&fp->aux->ksym_prefix.lnode);
+#endif
 	mutex_init(&fp->aux->used_maps_mutex);
 	mutex_init(&fp->aux->dst_mutex);
 
@@ -683,6 +686,23 @@ void bpf_prog_kallsyms_add(struct bpf_prog *fp)
 	fp->aux->ksym.prog = true;
 
 	bpf_ksym_add(&fp->aux->ksym);
+
+#ifdef CONFIG_FINEIBT
+	/*
+	 * When FineIBT, code in the __cfi_foo() symbols can get executed
+	 * and hence unwinder needs help.
+	 */
+	if (cfi_mode != CFI_FINEIBT)
+		return;
+
+	snprintf(fp->aux->ksym_prefix.name, KSYM_NAME_LEN,
+		 "__cfi_%s", fp->aux->ksym.name);
+
+	fp->aux->ksym_prefix.start = (unsigned long) fp->bpf_func - 16;
+	fp->aux->ksym_prefix.end   = (unsigned long) fp->bpf_func;
+
+	bpf_ksym_add(&fp->aux->ksym_prefix);
+#endif
 }
 
 void bpf_prog_kallsyms_del(struct bpf_prog *fp)
@@ -691,6 +711,11 @@ void bpf_prog_kallsyms_del(struct bpf_prog *fp)
 		return;
 
 	bpf_ksym_del(&fp->aux->ksym);
+#ifdef CONFIG_FINEIBT
+	if (cfi_mode != CFI_FINEIBT)
+		return;
+	bpf_ksym_del(&fp->aux->ksym_prefix);
+#endif
 }
 
 static struct bpf_ksym *bpf_ksym_find(unsigned long addr)
-- 
cgit v1.2.3


From 2cd3e3772e41377f32d6eea643e0590774e9187c Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Fri, 15 Dec 2023 10:12:20 +0100
Subject: x86/cfi,bpf: Fix bpf_struct_ops CFI

BPF struct_ops uses __arch_prepare_bpf_trampoline() to write
trampolines for indirect function calls. These tramplines much have
matching CFI.

In order to obtain the correct CFI hash for the various methods, add a
matching structure that contains stub functions, the compiler will
generate correct CFI which we can pilfer for the trampolines.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lore.kernel.org/r/20231215092707.566977112@infradead.org
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 arch/x86/include/asm/cfi.h     |  6 ++++
 arch/x86/kernel/alternative.c  | 22 ++++++++++++++
 arch/x86/net/bpf_jit_comp.c    | 66 ++++++++++++++++++++++++++--------------
 include/linux/bpf.h            | 13 ++++++++
 kernel/bpf/bpf_struct_ops.c    | 16 +++++-----
 net/bpf/bpf_dummy_struct_ops.c | 31 ++++++++++++++++++-
 net/ipv4/bpf_tcp_ca.c          | 69 ++++++++++++++++++++++++++++++++++++++++++
 7 files changed, 191 insertions(+), 32 deletions(-)

(limited to 'include/linux')

diff --git a/arch/x86/include/asm/cfi.h b/arch/x86/include/asm/cfi.h
index 8779abd217b7..1a50b2cd4713 100644
--- a/arch/x86/include/asm/cfi.h
+++ b/arch/x86/include/asm/cfi.h
@@ -123,6 +123,8 @@ static inline int cfi_get_offset(void)
 }
 #define cfi_get_offset cfi_get_offset
 
+extern u32 cfi_get_func_hash(void *func);
+
 #else
 static inline enum bug_trap_type handle_cfi_failure(struct pt_regs *regs)
 {
@@ -130,6 +132,10 @@ static inline enum bug_trap_type handle_cfi_failure(struct pt_regs *regs)
 }
 #define cfi_bpf_hash 0U
 #define cfi_bpf_subprog_hash 0U
+static inline u32 cfi_get_func_hash(void *func)
+{
+	return 0;
+}
 #endif /* CONFIG_CFI_CLANG */
 
 #endif /* _ASM_X86_CFI_H */
diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c
index cb393efd5ccd..49c2a62ba5e4 100644
--- a/arch/x86/kernel/alternative.c
+++ b/arch/x86/kernel/alternative.c
@@ -883,6 +883,28 @@ asm (
 "	.size	cfi_bpf_subprog_hash, 4				\n"
 "	.popsection						\n"
 );
+
+u32 cfi_get_func_hash(void *func)
+{
+	u32 hash;
+
+	func -= cfi_get_offset();
+	switch (cfi_mode) {
+	case CFI_FINEIBT:
+		func += 7;
+		break;
+	case CFI_KCFI:
+		func += 1;
+		break;
+	default:
+		return 0;
+	}
+
+	if (get_kernel_nofault(hash, func))
+		return 0;
+
+	return hash;
+}
 #endif
 
 #ifdef CONFIG_FINEIBT
diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c
index 43b8c08fdf8d..c89a4abdd726 100644
--- a/arch/x86/net/bpf_jit_comp.c
+++ b/arch/x86/net/bpf_jit_comp.c
@@ -312,9 +312,8 @@ static void pop_callee_regs(u8 **pprog, bool *callee_regs_used)
  * in arch/x86/kernel/alternative.c
  */
 
-static void emit_fineibt(u8 **pprog, bool is_subprog)
+static void emit_fineibt(u8 **pprog, u32 hash)
 {
-	u32 hash = is_subprog ? cfi_bpf_subprog_hash : cfi_bpf_hash;
 	u8 *prog = *pprog;
 
 	EMIT_ENDBR();
@@ -327,9 +326,8 @@ static void emit_fineibt(u8 **pprog, bool is_subprog)
 	*pprog = prog;
 }
 
-static void emit_kcfi(u8 **pprog, bool is_subprog)
+static void emit_kcfi(u8 **pprog, u32 hash)
 {
-	u32 hash = is_subprog ? cfi_bpf_subprog_hash : cfi_bpf_hash;
 	u8 *prog = *pprog;
 
 	EMIT1_off32(0xb8, hash);			/* movl $hash, %eax	*/
@@ -351,17 +349,17 @@ static void emit_kcfi(u8 **pprog, bool is_subprog)
 	*pprog = prog;
 }
 
-static void emit_cfi(u8 **pprog, bool is_subprog)
+static void emit_cfi(u8 **pprog, u32 hash)
 {
 	u8 *prog = *pprog;
 
 	switch (cfi_mode) {
 	case CFI_FINEIBT:
-		emit_fineibt(&prog, is_subprog);
+		emit_fineibt(&prog, hash);
 		break;
 
 	case CFI_KCFI:
-		emit_kcfi(&prog, is_subprog);
+		emit_kcfi(&prog, hash);
 		break;
 
 	default:
@@ -383,7 +381,7 @@ static void emit_prologue(u8 **pprog, u32 stack_depth, bool ebpf_from_cbpf,
 {
 	u8 *prog = *pprog;
 
-	emit_cfi(&prog, is_subprog);
+	emit_cfi(&prog, is_subprog ? cfi_bpf_subprog_hash : cfi_bpf_hash);
 	/* BPF trampoline can be made to work without these nops,
 	 * but let's waste 5 bytes for now and optimize later
 	 */
@@ -2510,10 +2508,19 @@ static int __arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *rw_im
 	u8 *prog;
 	bool save_ret;
 
+	/*
+	 * F_INDIRECT is only compatible with F_RET_FENTRY_RET, it is
+	 * explicitly incompatible with F_CALL_ORIG | F_SKIP_FRAME | F_IP_ARG
+	 * because @func_addr.
+	 */
+	WARN_ON_ONCE((flags & BPF_TRAMP_F_INDIRECT) &&
+		     (flags & ~(BPF_TRAMP_F_INDIRECT | BPF_TRAMP_F_RET_FENTRY_RET)));
+
 	/* extra registers for struct arguments */
-	for (i = 0; i < m->nr_args; i++)
+	for (i = 0; i < m->nr_args; i++) {
 		if (m->arg_flags[i] & BTF_FMODEL_STRUCT_ARG)
 			nr_regs += (m->arg_size[i] + 7) / 8 - 1;
+	}
 
 	/* x86-64 supports up to MAX_BPF_FUNC_ARGS arguments. 1-6
 	 * are passed through regs, the remains are through stack.
@@ -2596,20 +2603,27 @@ static int __arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *rw_im
 
 	prog = rw_image;
 
-	EMIT_ENDBR();
-	/*
-	 * This is the direct-call trampoline, as such it needs accounting
-	 * for the __fentry__ call.
-	 */
-	x86_call_depth_emit_accounting(&prog, NULL);
+	if (flags & BPF_TRAMP_F_INDIRECT) {
+		/*
+		 * Indirect call for bpf_struct_ops
+		 */
+		emit_cfi(&prog, cfi_get_func_hash(func_addr));
+	} else {
+		/*
+		 * Direct-call fentry stub, as such it needs accounting for the
+		 * __fentry__ call.
+		 */
+		x86_call_depth_emit_accounting(&prog, NULL);
+	}
 	EMIT1(0x55);		 /* push rbp */
 	EMIT3(0x48, 0x89, 0xE5); /* mov rbp, rsp */
-	if (!is_imm8(stack_size))
+	if (!is_imm8(stack_size)) {
 		/* sub rsp, stack_size */
 		EMIT3_off32(0x48, 0x81, 0xEC, stack_size);
-	else
+	} else {
 		/* sub rsp, stack_size */
 		EMIT4(0x48, 0x83, 0xEC, stack_size);
+	}
 	if (flags & BPF_TRAMP_F_TAIL_CALL_CTX)
 		EMIT1(0x50);		/* push rax */
 	/* mov QWORD PTR [rbp - rbx_off], rbx */
@@ -2643,10 +2657,11 @@ static int __arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *rw_im
 		}
 	}
 
-	if (fentry->nr_links)
+	if (fentry->nr_links) {
 		if (invoke_bpf(m, &prog, fentry, regs_off, run_ctx_off,
 			       flags & BPF_TRAMP_F_RET_FENTRY_RET, image, rw_image))
 			return -EINVAL;
+	}
 
 	if (fmod_ret->nr_links) {
 		branches = kcalloc(fmod_ret->nr_links, sizeof(u8 *),
@@ -2665,11 +2680,12 @@ static int __arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *rw_im
 		restore_regs(m, &prog, regs_off);
 		save_args(m, &prog, arg_stack_off, true);
 
-		if (flags & BPF_TRAMP_F_TAIL_CALL_CTX)
+		if (flags & BPF_TRAMP_F_TAIL_CALL_CTX) {
 			/* Before calling the original function, restore the
 			 * tail_call_cnt from stack to rax.
 			 */
 			RESTORE_TAIL_CALL_CNT(stack_size);
+		}
 
 		if (flags & BPF_TRAMP_F_ORIG_STACK) {
 			emit_ldx(&prog, BPF_DW, BPF_REG_6, BPF_REG_FP, 8);
@@ -2698,17 +2714,19 @@ static int __arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *rw_im
 		/* Update the branches saved in invoke_bpf_mod_ret with the
 		 * aligned address of do_fexit.
 		 */
-		for (i = 0; i < fmod_ret->nr_links; i++)
+		for (i = 0; i < fmod_ret->nr_links; i++) {
 			emit_cond_near_jump(&branches[i], image + (prog - (u8 *)rw_image),
 					    image + (branches[i] - (u8 *)rw_image), X86_JNE);
+		}
 	}
 
-	if (fexit->nr_links)
+	if (fexit->nr_links) {
 		if (invoke_bpf(m, &prog, fexit, regs_off, run_ctx_off,
 			       false, image, rw_image)) {
 			ret = -EINVAL;
 			goto cleanup;
 		}
+	}
 
 	if (flags & BPF_TRAMP_F_RESTORE_REGS)
 		restore_regs(m, &prog, regs_off);
@@ -2725,11 +2743,12 @@ static int __arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *rw_im
 			ret = -EINVAL;
 			goto cleanup;
 		}
-	} else if (flags & BPF_TRAMP_F_TAIL_CALL_CTX)
+	} else if (flags & BPF_TRAMP_F_TAIL_CALL_CTX) {
 		/* Before running the original function, restore the
 		 * tail_call_cnt from stack to rax.
 		 */
 		RESTORE_TAIL_CALL_CNT(stack_size);
+	}
 
 	/* restore return value of orig_call or fentry prog back into RAX */
 	if (save_ret)
@@ -2737,9 +2756,10 @@ static int __arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *rw_im
 
 	emit_ldx(&prog, BPF_DW, BPF_REG_6, BPF_REG_FP, -rbx_off);
 	EMIT1(0xC9); /* leave */
-	if (flags & BPF_TRAMP_F_SKIP_FRAME)
+	if (flags & BPF_TRAMP_F_SKIP_FRAME) {
 		/* skip our return address and return to parent */
 		EMIT4(0x48, 0x83, 0xC4, 8); /* add rsp, 8 */
+	}
 	emit_return(&prog, image + (prog - (u8 *)rw_image));
 	/* Make sure the trampoline generation logic doesn't overflow */
 	if (WARN_ON_ONCE(prog > (u8 *)rw_image_end - BPF_INSN_SAFETY)) {
diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 9d84c376851a..db46b3359bf5 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -1060,6 +1060,17 @@ struct btf_func_model {
  */
 #define BPF_TRAMP_F_TAIL_CALL_CTX	BIT(7)
 
+/*
+ * Indicate the trampoline should be suitable to receive indirect calls;
+ * without this indirectly calling the generated code can result in #UD/#CP,
+ * depending on the CFI options.
+ *
+ * Used by bpf_struct_ops.
+ *
+ * Incompatible with FENTRY usage, overloads @func_addr argument.
+ */
+#define BPF_TRAMP_F_INDIRECT		BIT(8)
+
 /* Each call __bpf_prog_enter + call bpf_func + call __bpf_prog_exit is ~50
  * bytes on x86.
  */
@@ -1697,6 +1708,7 @@ struct bpf_struct_ops {
 	struct btf_func_model func_models[BPF_STRUCT_OPS_MAX_NR_MEMBERS];
 	u32 type_id;
 	u32 value_id;
+	void *cfi_stubs;
 };
 
 #if defined(CONFIG_BPF_JIT) && defined(CONFIG_BPF_SYSCALL)
@@ -1710,6 +1722,7 @@ int bpf_struct_ops_map_sys_lookup_elem(struct bpf_map *map, void *key,
 int bpf_struct_ops_prepare_trampoline(struct bpf_tramp_links *tlinks,
 				      struct bpf_tramp_link *link,
 				      const struct btf_func_model *model,
+				      void *stub_func,
 				      void *image, void *image_end);
 static inline bool bpf_try_module_get(const void *data, struct module *owner)
 {
diff --git a/kernel/bpf/bpf_struct_ops.c b/kernel/bpf/bpf_struct_ops.c
index 4d53c53fc5aa..02068bd0e4d9 100644
--- a/kernel/bpf/bpf_struct_ops.c
+++ b/kernel/bpf/bpf_struct_ops.c
@@ -352,17 +352,16 @@ const struct bpf_link_ops bpf_struct_ops_link_lops = {
 int bpf_struct_ops_prepare_trampoline(struct bpf_tramp_links *tlinks,
 				      struct bpf_tramp_link *link,
 				      const struct btf_func_model *model,
-				      void *image, void *image_end)
+				      void *stub_func, void *image, void *image_end)
 {
-	u32 flags;
+	u32 flags = BPF_TRAMP_F_INDIRECT;
 	int size;
 
 	tlinks[BPF_TRAMP_FENTRY].links[0] = link;
 	tlinks[BPF_TRAMP_FENTRY].nr_links = 1;
-	/* BPF_TRAMP_F_RET_FENTRY_RET is only used by bpf_struct_ops,
-	 * and it must be used alone.
-	 */
-	flags = model->ret_size > 0 ? BPF_TRAMP_F_RET_FENTRY_RET : 0;
+
+	if (model->ret_size > 0)
+		flags |= BPF_TRAMP_F_RET_FENTRY_RET;
 
 	size = arch_bpf_trampoline_size(model, flags, tlinks, NULL);
 	if (size < 0)
@@ -370,7 +369,7 @@ int bpf_struct_ops_prepare_trampoline(struct bpf_tramp_links *tlinks,
 	if (size > (unsigned long)image_end - (unsigned long)image)
 		return -E2BIG;
 	return arch_prepare_bpf_trampoline(NULL, image, image_end,
-					   model, flags, tlinks, NULL);
+					   model, flags, tlinks, stub_func);
 }
 
 static long bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key,
@@ -504,11 +503,12 @@ static long bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key,
 
 		err = bpf_struct_ops_prepare_trampoline(tlinks, link,
 							&st_ops->func_models[i],
+							*(void **)(st_ops->cfi_stubs + moff),
 							image, image_end);
 		if (err < 0)
 			goto reset_unlock;
 
-		*(void **)(kdata + moff) = image;
+		*(void **)(kdata + moff) = image + cfi_get_offset();
 		image += err;
 
 		/* put prog_id to udata */
diff --git a/net/bpf/bpf_dummy_struct_ops.c b/net/bpf/bpf_dummy_struct_ops.c
index 2748f9d77b18..8906f7bdf4a9 100644
--- a/net/bpf/bpf_dummy_struct_ops.c
+++ b/net/bpf/bpf_dummy_struct_ops.c
@@ -12,6 +12,11 @@ extern struct bpf_struct_ops bpf_bpf_dummy_ops;
 /* A common type for test_N with return value in bpf_dummy_ops */
 typedef int (*dummy_ops_test_ret_fn)(struct bpf_dummy_ops_state *state, ...);
 
+static int dummy_ops_test_ret_function(struct bpf_dummy_ops_state *state, ...)
+{
+	return 0;
+}
+
 struct bpf_dummy_ops_test_args {
 	u64 args[MAX_BPF_FUNC_ARGS];
 	struct bpf_dummy_ops_state state;
@@ -62,7 +67,7 @@ static int dummy_ops_copy_args(struct bpf_dummy_ops_test_args *args)
 
 static int dummy_ops_call_op(void *image, struct bpf_dummy_ops_test_args *args)
 {
-	dummy_ops_test_ret_fn test = (void *)image;
+	dummy_ops_test_ret_fn test = (void *)image + cfi_get_offset();
 	struct bpf_dummy_ops_state *state = NULL;
 
 	/* state needs to be NULL if args[0] is 0 */
@@ -119,6 +124,7 @@ int bpf_struct_ops_test_run(struct bpf_prog *prog, const union bpf_attr *kattr,
 	op_idx = prog->expected_attach_type;
 	err = bpf_struct_ops_prepare_trampoline(tlinks, link,
 						&st_ops->func_models[op_idx],
+						&dummy_ops_test_ret_function,
 						image, image + PAGE_SIZE);
 	if (err < 0)
 		goto out;
@@ -219,6 +225,28 @@ static void bpf_dummy_unreg(void *kdata)
 {
 }
 
+static int bpf_dummy_test_1(struct bpf_dummy_ops_state *cb)
+{
+	return 0;
+}
+
+static int bpf_dummy_test_2(struct bpf_dummy_ops_state *cb, int a1, unsigned short a2,
+			    char a3, unsigned long a4)
+{
+	return 0;
+}
+
+static int bpf_dummy_test_sleepable(struct bpf_dummy_ops_state *cb)
+{
+	return 0;
+}
+
+static struct bpf_dummy_ops __bpf_bpf_dummy_ops = {
+	.test_1 = bpf_dummy_test_1,
+	.test_2 = bpf_dummy_test_2,
+	.test_sleepable = bpf_dummy_test_sleepable,
+};
+
 struct bpf_struct_ops bpf_bpf_dummy_ops = {
 	.verifier_ops = &bpf_dummy_verifier_ops,
 	.init = bpf_dummy_init,
@@ -227,4 +255,5 @@ struct bpf_struct_ops bpf_bpf_dummy_ops = {
 	.reg = bpf_dummy_reg,
 	.unreg = bpf_dummy_unreg,
 	.name = "bpf_dummy_ops",
+	.cfi_stubs = &__bpf_bpf_dummy_ops,
 };
diff --git a/net/ipv4/bpf_tcp_ca.c b/net/ipv4/bpf_tcp_ca.c
index c7bbd8f3c708..634cfafa583d 100644
--- a/net/ipv4/bpf_tcp_ca.c
+++ b/net/ipv4/bpf_tcp_ca.c
@@ -271,6 +271,74 @@ static int bpf_tcp_ca_validate(void *kdata)
 	return tcp_validate_congestion_control(kdata);
 }
 
+static u32 bpf_tcp_ca_ssthresh(struct sock *sk)
+{
+	return 0;
+}
+
+static void bpf_tcp_ca_cong_avoid(struct sock *sk, u32 ack, u32 acked)
+{
+}
+
+static void bpf_tcp_ca_set_state(struct sock *sk, u8 new_state)
+{
+}
+
+static void bpf_tcp_ca_cwnd_event(struct sock *sk, enum tcp_ca_event ev)
+{
+}
+
+static void bpf_tcp_ca_in_ack_event(struct sock *sk, u32 flags)
+{
+}
+
+static void bpf_tcp_ca_pkts_acked(struct sock *sk, const struct ack_sample *sample)
+{
+}
+
+static u32 bpf_tcp_ca_min_tso_segs(struct sock *sk)
+{
+	return 0;
+}
+
+static void bpf_tcp_ca_cong_control(struct sock *sk, const struct rate_sample *rs)
+{
+}
+
+static u32 bpf_tcp_ca_undo_cwnd(struct sock *sk)
+{
+	return 0;
+}
+
+static u32 bpf_tcp_ca_sndbuf_expand(struct sock *sk)
+{
+	return 0;
+}
+
+static void __bpf_tcp_ca_init(struct sock *sk)
+{
+}
+
+static void __bpf_tcp_ca_release(struct sock *sk)
+{
+}
+
+static struct tcp_congestion_ops __bpf_ops_tcp_congestion_ops = {
+	.ssthresh = bpf_tcp_ca_ssthresh,
+	.cong_avoid = bpf_tcp_ca_cong_avoid,
+	.set_state = bpf_tcp_ca_set_state,
+	.cwnd_event = bpf_tcp_ca_cwnd_event,
+	.in_ack_event = bpf_tcp_ca_in_ack_event,
+	.pkts_acked = bpf_tcp_ca_pkts_acked,
+	.min_tso_segs = bpf_tcp_ca_min_tso_segs,
+	.cong_control = bpf_tcp_ca_cong_control,
+	.undo_cwnd = bpf_tcp_ca_undo_cwnd,
+	.sndbuf_expand = bpf_tcp_ca_sndbuf_expand,
+
+	.init = __bpf_tcp_ca_init,
+	.release = __bpf_tcp_ca_release,
+};
+
 struct bpf_struct_ops bpf_tcp_congestion_ops = {
 	.verifier_ops = &bpf_tcp_ca_verifier_ops,
 	.reg = bpf_tcp_ca_reg,
@@ -281,6 +349,7 @@ struct bpf_struct_ops bpf_tcp_congestion_ops = {
 	.init = bpf_tcp_ca_init,
 	.validate = bpf_tcp_ca_validate,
 	.name = "tcp_congestion_ops",
+	.cfi_stubs = &__bpf_ops_tcp_congestion_ops,
 };
 
 static int __init bpf_tcp_ca_kfunc_init(void)
-- 
cgit v1.2.3


From e9d13b9d2f99ccf7afeab490d97eaa5ac9846598 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Fri, 15 Dec 2023 10:12:21 +0100
Subject: cfi: Add CFI_NOSEAL()

Add a CFI_NOSEAL() helper to mark functions that need to retain their
CFI information, despite not otherwise leaking their address.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lore.kernel.org/r/20231215092707.669401084@infradead.org
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 arch/x86/include/asm/cfi.h | 5 +++++
 include/linux/cfi.h        | 4 ++++
 2 files changed, 9 insertions(+)

(limited to 'include/linux')

diff --git a/arch/x86/include/asm/cfi.h b/arch/x86/include/asm/cfi.h
index 1a50b2cd4713..7cd752557905 100644
--- a/arch/x86/include/asm/cfi.h
+++ b/arch/x86/include/asm/cfi.h
@@ -8,6 +8,7 @@
  * Copyright (C) 2022 Google LLC
  */
 #include <linux/bug.h>
+#include <asm/ibt.h>
 
 /*
  * An overview of the various calling conventions...
@@ -138,4 +139,8 @@ static inline u32 cfi_get_func_hash(void *func)
 }
 #endif /* CONFIG_CFI_CLANG */
 
+#if HAS_KERNEL_IBT == 1
+#define CFI_NOSEAL(x)	asm(IBT_NOSEAL(__stringify(x)))
+#endif
+
 #endif /* _ASM_X86_CFI_H */
diff --git a/include/linux/cfi.h b/include/linux/cfi.h
index 1ed2d96c0cfc..f0df518e11dd 100644
--- a/include/linux/cfi.h
+++ b/include/linux/cfi.h
@@ -46,4 +46,8 @@ static inline void module_cfi_finalize(const Elf_Ehdr *hdr,
 #endif /* CONFIG_ARCH_USES_CFI_TRAPS */
 #endif /* CONFIG_MODULES */
 
+#ifndef CFI_NOSEAL
+#define CFI_NOSEAL(x)
+#endif
+
 #endif /* _LINUX_CFI_H */
-- 
cgit v1.2.3


From 852486b35f344887786d63250946dd921a05d7e8 Mon Sep 17 00:00:00 2001
From: Alexei Starovoitov <alexei.starovoitov@gmail.com>
Date: Fri, 15 Dec 2023 10:12:23 +0100
Subject: x86/cfi,bpf: Fix bpf_exception_cb() signature

As per the earlier patches, BPF sub-programs have bpf_callback_t
signature and CFI expects callers to have matching signature. This is
violated by bpf_prog_aux::bpf_exception_cb().

[peterz: Changelog]
Reported-by: Peter Zijlstra <peterz@infradead.org>
Signed-off-by: Alexei Starovoitov <alexei.starovoitov@gmail.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lkml.kernel.org/r/CAADnVQ+Z7UcXXBBhMubhcMM=R-dExk-uHtfOLtoLxQ1XxEpqEA@mail.gmail.com
Link: https://lore.kernel.org/r/20231215092707.910319166@infradead.org
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/bpf.h  | 2 +-
 kernel/bpf/helpers.c | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index db46b3359bf5..5e694934cf37 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -1484,7 +1484,7 @@ struct bpf_prog_aux {
 	int cgroup_atype; /* enum cgroup_bpf_attach_type */
 	struct bpf_map *cgroup_storage[MAX_BPF_CGROUP_STORAGE_TYPE];
 	char name[BPF_OBJ_NAME_LEN];
-	unsigned int (*bpf_exception_cb)(u64 cookie, u64 sp, u64 bp);
+	u64 (*bpf_exception_cb)(u64 cookie, u64 sp, u64 bp, u64, u64);
 #ifdef CONFIG_SECURITY
 	void *security;
 #endif
diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
index e0c0e3676df8..07fd4b5704f3 100644
--- a/kernel/bpf/helpers.c
+++ b/kernel/bpf/helpers.c
@@ -2537,7 +2537,7 @@ __bpf_kfunc void bpf_throw(u64 cookie)
 	 * which skips compiler generated instrumentation to do the same.
 	 */
 	kasan_unpoison_task_stack_below((void *)(long)ctx.sp);
-	ctx.aux->bpf_exception_cb(cookie, ctx.sp, ctx.bp);
+	ctx.aux->bpf_exception_cb(cookie, ctx.sp, ctx.bp, 0, 0);
 	WARN(1, "A call to BPF exception callback should never return\n");
 }
 
-- 
cgit v1.2.3


From 117211aa739a926e6555cfea883be84bee6f1695 Mon Sep 17 00:00:00 2001
From: Jiri Olsa <jolsa@kernel.org>
Date: Sat, 16 Dec 2023 00:05:02 +0100
Subject: bpf: Add missing BPF_LINK_TYPE invocations

Pengfei Xu reported [1] Syzkaller/KASAN issue found in bpf_link_show_fdinfo.

The reason is missing BPF_LINK_TYPE invocation for uprobe multi
link and for several other links, adding that.

[1] https://lore.kernel.org/bpf/ZXptoKRSLspnk2ie@xpf.sh.intel.com/

Fixes: 89ae89f53d20 ("bpf: Add multi uprobe link")
Fixes: e420bed02507 ("bpf: Add fd-based tcx multi-prog infra with link support")
Fixes: 84601d6ee68a ("bpf: add bpf_link support for BPF_NETFILTER programs")
Fixes: 35dfaad7188c ("netkit, bpf: Add bpf programmable net device")
Reported-by: Pengfei Xu <pengfei.xu@intel.com>
Signed-off-by: Jiri Olsa <jolsa@kernel.org>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Tested-by: Pengfei Xu <pengfei.xu@intel.com>
Acked-by: Hou Tao <houtao1@huawei.com>
Link: https://lore.kernel.org/bpf/20231215230502.2769743-1-jolsa@kernel.org
---
 include/linux/bpf_types.h | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h
index fc0d6f32c687..94baced5a1ad 100644
--- a/include/linux/bpf_types.h
+++ b/include/linux/bpf_types.h
@@ -142,9 +142,13 @@ BPF_LINK_TYPE(BPF_LINK_TYPE_ITER, iter)
 #ifdef CONFIG_NET
 BPF_LINK_TYPE(BPF_LINK_TYPE_NETNS, netns)
 BPF_LINK_TYPE(BPF_LINK_TYPE_XDP, xdp)
+BPF_LINK_TYPE(BPF_LINK_TYPE_NETFILTER, netfilter)
+BPF_LINK_TYPE(BPF_LINK_TYPE_TCX, tcx)
+BPF_LINK_TYPE(BPF_LINK_TYPE_NETKIT, netkit)
 #endif
 #ifdef CONFIG_PERF_EVENTS
 BPF_LINK_TYPE(BPF_LINK_TYPE_PERF_EVENT, perf)
 #endif
 BPF_LINK_TYPE(BPF_LINK_TYPE_KPROBE_MULTI, kprobe_multi)
 BPF_LINK_TYPE(BPF_LINK_TYPE_STRUCT_OPS, struct_ops)
+BPF_LINK_TYPE(BPF_LINK_TYPE_UPROBE_MULTI, uprobe_multi)
-- 
cgit v1.2.3


From bb339db4d363c84e0a8d70827df591397ccd7312 Mon Sep 17 00:00:00 2001
From: James Clark <james.clark@arm.com>
Date: Fri, 15 Dec 2023 17:56:48 +0000
Subject: arm: perf: Fix ARCH=arm build with GCC
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

LLVM ignores everything inside the if statement and doesn't generate
errors, but GCC doesn't ignore it, resulting in the following error:

  drivers/perf/arm_pmuv3.c: In function ‘armv8pmu_write_evtype’:
  include/linux/bits.h:34:29: error: left shift count >= width of type [-Werror=shift-count-overflow]
  34 |         (((~UL(0)) - (UL(1) << (l)) + 1) & \

Fix it by using GENMASK_ULL which doesn't overflow on arm32 (even though
the value is never used there).

Fixes: 3115ee021bfb ("arm64: perf: Include threshold control fields in PMEVTYPER mask")
Reported-by: Uwe Kleine-König <u.kleine-koenig@pengutronix.de>
Closes: https://lore.kernel.org/linux-arm-kernel/20231215120817.h2f3akgv72zhrtqo@pengutronix.de/
Signed-off-by: James Clark <james.clark@arm.com>
Acked-by: Mark Rutland <mark.rutland@arm.com>
Reviewed-by: Uwe Kleine-König <u.kleine-koenig@pengutronix.de>
Link: https://lore.kernel.org/r/20231215175648.3397170-2-james.clark@arm.com
Signed-off-by: Will Deacon <will@kernel.org>
---
 include/linux/perf/arm_pmuv3.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/perf/arm_pmuv3.h b/include/linux/perf/arm_pmuv3.h
index 0f4d62ef3a9a..46377e134d67 100644
--- a/include/linux/perf/arm_pmuv3.h
+++ b/include/linux/perf/arm_pmuv3.h
@@ -234,8 +234,8 @@
  * PMXEVTYPER: Event selection reg
  */
 #define ARMV8_PMU_EVTYPE_EVENT	GENMASK(15, 0)	/* Mask for EVENT bits */
-#define ARMV8_PMU_EVTYPE_TH	GENMASK(43, 32)
-#define ARMV8_PMU_EVTYPE_TC	GENMASK(63, 61)
+#define ARMV8_PMU_EVTYPE_TH	GENMASK_ULL(43, 32) /* arm64 only */
+#define ARMV8_PMU_EVTYPE_TC	GENMASK_ULL(63, 61) /* arm64 only */
 
 /*
  * Event filters for PMUv3
-- 
cgit v1.2.3


From ebb30ccbbdbd6fae5177b676da4f4ac92bb4f635 Mon Sep 17 00:00:00 2001
From: Christian Marangi <ansuelsmth@gmail.com>
Date: Fri, 15 Dec 2023 14:15:31 +0100
Subject: net: phy: make addr type u8 in phy_package_shared struct

Switch addr type in phy_package_shared struct to u8.

The value is already checked to be non negative and to be less than
PHY_MAX_ADDR, hence u8 is better suited than using int.

Signed-off-by: Christian Marangi <ansuelsmth@gmail.com>
Reviewed-by: Russell King (Oracle) <rmk+kernel@armlinux.org.uk>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/phy.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/phy.h b/include/linux/phy.h
index dbb5e13e3e1b..4b13cc85c4f5 100644
--- a/include/linux/phy.h
+++ b/include/linux/phy.h
@@ -338,7 +338,7 @@ struct mdio_bus_stats {
  * phy_package_leave().
  */
 struct phy_package_shared {
-	int addr;
+	u8 addr;
 	refcount_t refcnt;
 	unsigned long flags;
 	size_t priv_size;
-- 
cgit v1.2.3


From 9eea577eb1155fe4a183bc5e7bf269b0b2e7a6ba Mon Sep 17 00:00:00 2001
From: Christian Marangi <ansuelsmth@gmail.com>
Date: Fri, 15 Dec 2023 14:15:32 +0100
Subject: net: phy: extend PHY package API to support multiple global address

Current API for PHY package are limited to single address to configure
global settings for the PHY package.

It was found that some PHY package (for example the qca807x, a PHY
package that is shipped with a bundle of 5 PHY) requires multiple PHY
address to configure global settings. An example scenario is a PHY that
have a dedicated PHY for PSGMII/serdes calibrarion and have a specific
PHY in the package where the global PHY mode is set and affects every
other PHY in the package.

Change the API in the following way:
- Change phy_package_join() to take the base addr of the PHY package
  instead of the global PHY addr.
- Make __/phy_package_write/read() require an additional arg that
  select what global PHY address to use by passing the offset from the
  base addr passed on phy_package_join().

Each user of this API is updated to follow this new implementation
following a pattern where an enum is defined to declare the offset of the
addr.

We also drop the check if shared is defined as any user of the
phy_package_read/write is expected to use phy_package_join first. Misuse
of this will correctly trigger a kernel panic for NULL pointer
exception.

Signed-off-by: Christian Marangi <ansuelsmth@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/phy/bcm54140.c       | 16 +++++++---
 drivers/net/phy/mscc/mscc.h      |  5 ++++
 drivers/net/phy/mscc/mscc_main.c |  4 +--
 drivers/net/phy/phy_device.c     | 35 ++++++++++++----------
 include/linux/phy.h              | 64 ++++++++++++++++++++++++++--------------
 5 files changed, 80 insertions(+), 44 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/phy/bcm54140.c b/drivers/net/phy/bcm54140.c
index d43076592f81..2eea3d09b1e6 100644
--- a/drivers/net/phy/bcm54140.c
+++ b/drivers/net/phy/bcm54140.c
@@ -128,6 +128,10 @@
 #define BCM54140_DEFAULT_DOWNSHIFT 5
 #define BCM54140_MAX_DOWNSHIFT 9
 
+enum bcm54140_global_phy {
+	BCM54140_BASE_ADDR = 0,
+};
+
 struct bcm54140_priv {
 	int port;
 	int base_addr;
@@ -429,11 +433,13 @@ static int bcm54140_base_read_rdb(struct phy_device *phydev, u16 rdb)
 	int ret;
 
 	phy_lock_mdio_bus(phydev);
-	ret = __phy_package_write(phydev, MII_BCM54XX_RDB_ADDR, rdb);
+	ret = __phy_package_write(phydev, BCM54140_BASE_ADDR,
+				  MII_BCM54XX_RDB_ADDR, rdb);
 	if (ret < 0)
 		goto out;
 
-	ret = __phy_package_read(phydev, MII_BCM54XX_RDB_DATA);
+	ret = __phy_package_read(phydev, BCM54140_BASE_ADDR,
+				 MII_BCM54XX_RDB_DATA);
 
 out:
 	phy_unlock_mdio_bus(phydev);
@@ -446,11 +452,13 @@ static int bcm54140_base_write_rdb(struct phy_device *phydev,
 	int ret;
 
 	phy_lock_mdio_bus(phydev);
-	ret = __phy_package_write(phydev, MII_BCM54XX_RDB_ADDR, rdb);
+	ret = __phy_package_write(phydev, BCM54140_BASE_ADDR,
+				  MII_BCM54XX_RDB_ADDR, rdb);
 	if (ret < 0)
 		goto out;
 
-	ret = __phy_package_write(phydev, MII_BCM54XX_RDB_DATA, val);
+	ret = __phy_package_write(phydev, BCM54140_BASE_ADDR,
+				  MII_BCM54XX_RDB_DATA, val);
 
 out:
 	phy_unlock_mdio_bus(phydev);
diff --git a/drivers/net/phy/mscc/mscc.h b/drivers/net/phy/mscc/mscc.h
index 7a962050a4d4..6a3d8a754eb8 100644
--- a/drivers/net/phy/mscc/mscc.h
+++ b/drivers/net/phy/mscc/mscc.h
@@ -416,6 +416,11 @@ struct vsc8531_private {
  * gpio_lock: used for PHC operations. Common for all PHYs as the load/save GPIO
  * is shared.
  */
+
+enum vsc85xx_global_phy {
+	VSC88XX_BASE_ADDR = 0,
+};
+
 struct vsc85xx_shared_private {
 	struct mutex gpio_lock;
 };
diff --git a/drivers/net/phy/mscc/mscc_main.c b/drivers/net/phy/mscc/mscc_main.c
index 4171f01d34e5..6f74ce0ab1aa 100644
--- a/drivers/net/phy/mscc/mscc_main.c
+++ b/drivers/net/phy/mscc/mscc_main.c
@@ -711,7 +711,7 @@ int phy_base_write(struct phy_device *phydev, u32 regnum, u16 val)
 		dump_stack();
 	}
 
-	return __phy_package_write(phydev, regnum, val);
+	return __phy_package_write(phydev, VSC88XX_BASE_ADDR, regnum, val);
 }
 
 /* phydev->bus->mdio_lock should be locked when using this function */
@@ -722,7 +722,7 @@ int phy_base_read(struct phy_device *phydev, u32 regnum)
 		dump_stack();
 	}
 
-	return __phy_package_read(phydev, regnum);
+	return __phy_package_read(phydev, VSC88XX_BASE_ADDR, regnum);
 }
 
 u32 vsc85xx_csr_read(struct phy_device *phydev,
diff --git a/drivers/net/phy/phy_device.c b/drivers/net/phy/phy_device.c
index d8e9335d415c..0c52a9eff188 100644
--- a/drivers/net/phy/phy_device.c
+++ b/drivers/net/phy/phy_device.c
@@ -1651,20 +1651,22 @@ EXPORT_SYMBOL_GPL(phy_driver_is_genphy_10g);
 /**
  * phy_package_join - join a common PHY group
  * @phydev: target phy_device struct
- * @addr: cookie and PHY address for global register access
+ * @base_addr: cookie and base PHY address of PHY package for offset
+ *   calculation of global register access
  * @priv_size: if non-zero allocate this amount of bytes for private data
  *
  * This joins a PHY group and provides a shared storage for all phydevs in
  * this group. This is intended to be used for packages which contain
  * more than one PHY, for example a quad PHY transceiver.
  *
- * The addr parameter serves as a cookie which has to have the same value
- * for all members of one group and as a PHY address to access generic
- * registers of a PHY package. Usually, one of the PHY addresses of the
- * different PHYs in the package provides access to these global registers.
+ * The base_addr parameter serves as cookie which has to have the same values
+ * for all members of one group and as the base PHY address of the PHY package
+ * for offset calculation to access generic registers of a PHY package.
+ * Usually, one of the PHY addresses of the different PHYs in the package
+ * provides access to these global registers.
  * The address which is given here, will be used in the phy_package_read()
- * and phy_package_write() convenience functions. If your PHY doesn't have
- * global registers you can just pick any of the PHY addresses.
+ * and phy_package_write() convenience functions as base and added to the
+ * passed offset in those functions.
  *
  * This will set the shared pointer of the phydev to the shared storage.
  * If this is the first call for a this cookie the shared storage will be
@@ -1674,17 +1676,17 @@ EXPORT_SYMBOL_GPL(phy_driver_is_genphy_10g);
  * Returns < 1 on error, 0 on success. Esp. calling phy_package_join()
  * with the same cookie but a different priv_size is an error.
  */
-int phy_package_join(struct phy_device *phydev, int addr, size_t priv_size)
+int phy_package_join(struct phy_device *phydev, int base_addr, size_t priv_size)
 {
 	struct mii_bus *bus = phydev->mdio.bus;
 	struct phy_package_shared *shared;
 	int ret;
 
-	if (addr < 0 || addr >= PHY_MAX_ADDR)
+	if (base_addr < 0 || base_addr >= PHY_MAX_ADDR)
 		return -EINVAL;
 
 	mutex_lock(&bus->shared_lock);
-	shared = bus->shared[addr];
+	shared = bus->shared[base_addr];
 	if (!shared) {
 		ret = -ENOMEM;
 		shared = kzalloc(sizeof(*shared), GFP_KERNEL);
@@ -1696,9 +1698,9 @@ int phy_package_join(struct phy_device *phydev, int addr, size_t priv_size)
 				goto err_free;
 			shared->priv_size = priv_size;
 		}
-		shared->addr = addr;
+		shared->base_addr = base_addr;
 		refcount_set(&shared->refcnt, 1);
-		bus->shared[addr] = shared;
+		bus->shared[base_addr] = shared;
 	} else {
 		ret = -EINVAL;
 		if (priv_size && priv_size != shared->priv_size)
@@ -1736,7 +1738,7 @@ void phy_package_leave(struct phy_device *phydev)
 		return;
 
 	if (refcount_dec_and_mutex_lock(&shared->refcnt, &bus->shared_lock)) {
-		bus->shared[shared->addr] = NULL;
+		bus->shared[shared->base_addr] = NULL;
 		mutex_unlock(&bus->shared_lock);
 		kfree(shared->priv);
 		kfree(shared);
@@ -1755,7 +1757,8 @@ static void devm_phy_package_leave(struct device *dev, void *res)
  * devm_phy_package_join - resource managed phy_package_join()
  * @dev: device that is registering this PHY package
  * @phydev: target phy_device struct
- * @addr: cookie and PHY address for global register access
+ * @base_addr: cookie and base PHY address of PHY package for offset
+ *   calculation of global register access
  * @priv_size: if non-zero allocate this amount of bytes for private data
  *
  * Managed phy_package_join(). Shared storage fetched by this function,
@@ -1763,7 +1766,7 @@ static void devm_phy_package_leave(struct device *dev, void *res)
  * phy_package_join() for more information.
  */
 int devm_phy_package_join(struct device *dev, struct phy_device *phydev,
-			  int addr, size_t priv_size)
+			  int base_addr, size_t priv_size)
 {
 	struct phy_device **ptr;
 	int ret;
@@ -1773,7 +1776,7 @@ int devm_phy_package_join(struct device *dev, struct phy_device *phydev,
 	if (!ptr)
 		return -ENOMEM;
 
-	ret = phy_package_join(phydev, addr, priv_size);
+	ret = phy_package_join(phydev, base_addr, priv_size);
 
 	if (!ret) {
 		*ptr = phydev;
diff --git a/include/linux/phy.h b/include/linux/phy.h
index 4b13cc85c4f5..d653f660c39d 100644
--- a/include/linux/phy.h
+++ b/include/linux/phy.h
@@ -327,7 +327,8 @@ struct mdio_bus_stats {
 
 /**
  * struct phy_package_shared - Shared information in PHY packages
- * @addr: Common PHY address used to combine PHYs in one package
+ * @base_addr: Base PHY address of PHY package used to combine PHYs
+ *   in one package and for offset calculation of phy_package_read/write
  * @refcnt: Number of PHYs connected to this shared data
  * @flags: Initialization of PHY package
  * @priv_size: Size of the shared private data @priv
@@ -338,7 +339,7 @@ struct mdio_bus_stats {
  * phy_package_leave().
  */
 struct phy_package_shared {
-	u8 addr;
+	u8 base_addr;
 	refcount_t refcnt;
 	unsigned long flags;
 	size_t priv_size;
@@ -1976,10 +1977,10 @@ int phy_ethtool_get_link_ksettings(struct net_device *ndev,
 int phy_ethtool_set_link_ksettings(struct net_device *ndev,
 				   const struct ethtool_link_ksettings *cmd);
 int phy_ethtool_nway_reset(struct net_device *ndev);
-int phy_package_join(struct phy_device *phydev, int addr, size_t priv_size);
+int phy_package_join(struct phy_device *phydev, int base_addr, size_t priv_size);
 void phy_package_leave(struct phy_device *phydev);
 int devm_phy_package_join(struct device *dev, struct phy_device *phydev,
-			  int addr, size_t priv_size);
+			  int base_addr, size_t priv_size);
 
 int __init mdio_bus_init(void);
 void mdio_bus_exit(void);
@@ -2002,46 +2003,65 @@ int __phy_hwtstamp_set(struct phy_device *phydev,
 		       struct kernel_hwtstamp_config *config,
 		       struct netlink_ext_ack *extack);
 
-static inline int phy_package_read(struct phy_device *phydev, u32 regnum)
+static inline int phy_package_address(struct phy_device *phydev,
+				      unsigned int addr_offset)
 {
 	struct phy_package_shared *shared = phydev->shared;
+	u8 base_addr = shared->base_addr;
 
-	if (!shared)
+	if (addr_offset >= PHY_MAX_ADDR - base_addr)
 		return -EIO;
 
-	return mdiobus_read(phydev->mdio.bus, shared->addr, regnum);
+	/* we know that addr will be in the range 0..31 and thus the
+	 * implicit cast to a signed int is not a problem.
+	 */
+	return base_addr + addr_offset;
 }
 
-static inline int __phy_package_read(struct phy_device *phydev, u32 regnum)
+static inline int phy_package_read(struct phy_device *phydev,
+				   unsigned int addr_offset, u32 regnum)
 {
-	struct phy_package_shared *shared = phydev->shared;
+	int addr = phy_package_address(phydev, addr_offset);
 
-	if (!shared)
-		return -EIO;
+	if (addr < 0)
+		return addr;
+
+	return mdiobus_read(phydev->mdio.bus, addr, regnum);
+}
+
+static inline int __phy_package_read(struct phy_device *phydev,
+				     unsigned int addr_offset, u32 regnum)
+{
+	int addr = phy_package_address(phydev, addr_offset);
+
+	if (addr < 0)
+		return addr;
 
-	return __mdiobus_read(phydev->mdio.bus, shared->addr, regnum);
+	return __mdiobus_read(phydev->mdio.bus, addr, regnum);
 }
 
 static inline int phy_package_write(struct phy_device *phydev,
-				    u32 regnum, u16 val)
+				    unsigned int addr_offset, u32 regnum,
+				    u16 val)
 {
-	struct phy_package_shared *shared = phydev->shared;
+	int addr = phy_package_address(phydev, addr_offset);
 
-	if (!shared)
-		return -EIO;
+	if (addr < 0)
+		return addr;
 
-	return mdiobus_write(phydev->mdio.bus, shared->addr, regnum, val);
+	return mdiobus_write(phydev->mdio.bus, addr, regnum, val);
 }
 
 static inline int __phy_package_write(struct phy_device *phydev,
-				      u32 regnum, u16 val)
+				      unsigned int addr_offset, u32 regnum,
+				      u16 val)
 {
-	struct phy_package_shared *shared = phydev->shared;
+	int addr = phy_package_address(phydev, addr_offset);
 
-	if (!shared)
-		return -EIO;
+	if (addr < 0)
+		return addr;
 
-	return __mdiobus_write(phydev->mdio.bus, shared->addr, regnum, val);
+	return __mdiobus_write(phydev->mdio.bus, addr, regnum, val);
 }
 
 static inline bool __phy_package_set_once(struct phy_device *phydev,
-- 
cgit v1.2.3


From d63710fc0f1a501fd75a7025e3070a96ffa1645f Mon Sep 17 00:00:00 2001
From: Christian Marangi <ansuelsmth@gmail.com>
Date: Fri, 15 Dec 2023 14:15:34 +0100
Subject: net: phy: add support for PHY package MMD read/write

Some PHY in PHY package may require to read/write MMD regs to correctly
configure the PHY package.

Add support for these additional required function in both lock and no
lock variant.

It's assumed that the entire PHY package is either C22 or C45. We use
C22 or C45 way of writing/reading to mmd regs based on the passed phydev
whether it's C22 or C45.

Signed-off-by: Christian Marangi <ansuelsmth@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/phy/phy-core.c | 140 +++++++++++++++++++++++++++++++++++++++++++++
 include/linux/phy.h        |  16 ++++++
 2 files changed, 156 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/net/phy/phy-core.c b/drivers/net/phy/phy-core.c
index b729ac8b2640..15f349e5995a 100644
--- a/drivers/net/phy/phy-core.c
+++ b/drivers/net/phy/phy-core.c
@@ -650,6 +650,146 @@ int phy_write_mmd(struct phy_device *phydev, int devad, u32 regnum, u16 val)
 }
 EXPORT_SYMBOL(phy_write_mmd);
 
+/**
+ * __phy_package_read_mmd - read MMD reg relative to PHY package base addr
+ * @phydev: The phy_device struct
+ * @addr_offset: The offset to be added to PHY package base_addr
+ * @devad: The MMD to read from
+ * @regnum: The register on the MMD to read
+ *
+ * Convenience helper for reading a register of an MMD on a given PHY
+ * using the PHY package base address. The base address is added to
+ * the addr_offset value.
+ *
+ * Same calling rules as for __phy_read();
+ *
+ * NOTE: It's assumed that the entire PHY package is either C22 or C45.
+ */
+int __phy_package_read_mmd(struct phy_device *phydev,
+			   unsigned int addr_offset, int devad,
+			   u32 regnum)
+{
+	int addr = phy_package_address(phydev, addr_offset);
+
+	if (addr < 0)
+		return addr;
+
+	if (regnum > (u16)~0 || devad > 32)
+		return -EINVAL;
+
+	return mmd_phy_read(phydev->mdio.bus, addr, phydev->is_c45, devad,
+			    regnum);
+}
+EXPORT_SYMBOL(__phy_package_read_mmd);
+
+/**
+ * phy_package_read_mmd - read MMD reg relative to PHY package base addr
+ * @phydev: The phy_device struct
+ * @addr_offset: The offset to be added to PHY package base_addr
+ * @devad: The MMD to read from
+ * @regnum: The register on the MMD to read
+ *
+ * Convenience helper for reading a register of an MMD on a given PHY
+ * using the PHY package base address. The base address is added to
+ * the addr_offset value.
+ *
+ * Same calling rules as for phy_read();
+ *
+ * NOTE: It's assumed that the entire PHY package is either C22 or C45.
+ */
+int phy_package_read_mmd(struct phy_device *phydev,
+			 unsigned int addr_offset, int devad,
+			 u32 regnum)
+{
+	int addr = phy_package_address(phydev, addr_offset);
+	int val;
+
+	if (addr < 0)
+		return addr;
+
+	if (regnum > (u16)~0 || devad > 32)
+		return -EINVAL;
+
+	phy_lock_mdio_bus(phydev);
+	val = mmd_phy_read(phydev->mdio.bus, addr, phydev->is_c45, devad,
+			   regnum);
+	phy_unlock_mdio_bus(phydev);
+
+	return val;
+}
+EXPORT_SYMBOL(phy_package_read_mmd);
+
+/**
+ * __phy_package_write_mmd - write MMD reg relative to PHY package base addr
+ * @phydev: The phy_device struct
+ * @addr_offset: The offset to be added to PHY package base_addr
+ * @devad: The MMD to write to
+ * @regnum: The register on the MMD to write
+ * @val: value to write to @regnum
+ *
+ * Convenience helper for writing a register of an MMD on a given PHY
+ * using the PHY package base address. The base address is added to
+ * the addr_offset value.
+ *
+ * Same calling rules as for __phy_write();
+ *
+ * NOTE: It's assumed that the entire PHY package is either C22 or C45.
+ */
+int __phy_package_write_mmd(struct phy_device *phydev,
+			    unsigned int addr_offset, int devad,
+			    u32 regnum, u16 val)
+{
+	int addr = phy_package_address(phydev, addr_offset);
+
+	if (addr < 0)
+		return addr;
+
+	if (regnum > (u16)~0 || devad > 32)
+		return -EINVAL;
+
+	return mmd_phy_write(phydev->mdio.bus, addr, phydev->is_c45, devad,
+			     regnum, val);
+}
+EXPORT_SYMBOL(__phy_package_write_mmd);
+
+/**
+ * phy_package_write_mmd - write MMD reg relative to PHY package base addr
+ * @phydev: The phy_device struct
+ * @addr_offset: The offset to be added to PHY package base_addr
+ * @devad: The MMD to write to
+ * @regnum: The register on the MMD to write
+ * @val: value to write to @regnum
+ *
+ * Convenience helper for writing a register of an MMD on a given PHY
+ * using the PHY package base address. The base address is added to
+ * the addr_offset value.
+ *
+ * Same calling rules as for phy_write();
+ *
+ * NOTE: It's assumed that the entire PHY package is either C22 or C45.
+ */
+int phy_package_write_mmd(struct phy_device *phydev,
+			  unsigned int addr_offset, int devad,
+			  u32 regnum, u16 val)
+{
+	int addr = phy_package_address(phydev, addr_offset);
+	int ret;
+
+	if (addr < 0)
+		return addr;
+
+	if (regnum > (u16)~0 || devad > 32)
+		return -EINVAL;
+
+	phy_lock_mdio_bus(phydev);
+	ret = mmd_phy_write(phydev->mdio.bus, addr, phydev->is_c45, devad,
+			    regnum, val);
+	phy_unlock_mdio_bus(phydev);
+
+	return ret;
+}
+EXPORT_SYMBOL(phy_package_write_mmd);
+
 /**
  * phy_modify_changed - Function for modifying a PHY register
  * @phydev: the phy_device struct
diff --git a/include/linux/phy.h b/include/linux/phy.h
index d653f660c39d..e9e85d347587 100644
--- a/include/linux/phy.h
+++ b/include/linux/phy.h
@@ -2064,6 +2064,22 @@ static inline int __phy_package_write(struct phy_device *phydev,
 	return __mdiobus_write(phydev->mdio.bus, addr, regnum, val);
 }
 
+int __phy_package_read_mmd(struct phy_device *phydev,
+			   unsigned int addr_offset, int devad,
+			   u32 regnum);
+
+int phy_package_read_mmd(struct phy_device *phydev,
+			 unsigned int addr_offset, int devad,
+			 u32 regnum);
+
+int __phy_package_write_mmd(struct phy_device *phydev,
+			    unsigned int addr_offset, int devad,
+			    u32 regnum, u16 val);
+
+int phy_package_write_mmd(struct phy_device *phydev,
+			  unsigned int addr_offset, int devad,
+			  u32 regnum, u16 val);
+
 static inline bool __phy_package_set_once(struct phy_device *phydev,
 					  unsigned int b)
 {
-- 
cgit v1.2.3


From 120931db07b49252aba2073096b595482d71857c Mon Sep 17 00:00:00 2001
From: Mario Limonciello <mario.limonciello@amd.com>
Date: Mon, 27 Nov 2023 23:36:52 -0600
Subject: rtc: Add support for configuring the UIP timeout for RTC reads
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The UIP timeout is hardcoded to 10ms for all RTC reads, but in some
contexts this might not be enough time. Add a timeout parameter to
mc146818_get_time() and mc146818_get_time_callback().

If UIP timeout is configured by caller to be >=100 ms and a call
takes this long, log a warning.

Make all callers use 10ms to ensure no functional changes.

Cc:  <stable@vger.kernel.org> # 6.1.y
Fixes: ec5895c0f2d8 ("rtc: mc146818-lib: extract mc146818_avoid_UIP")
Signed-off-by: Mario Limonciello <mario.limonciello@amd.com>
Tested-by: Mateusz Jończyk <mat.jonczyk@o2.pl>
Reviewed-by: Mateusz Jończyk <mat.jonczyk@o2.pl>
Acked-by: Mateusz Jończyk <mat.jonczyk@o2.pl>
Link: https://lore.kernel.org/r/20231128053653.101798-4-mario.limonciello@amd.com
Signed-off-by: Alexandre Belloni <alexandre.belloni@bootlin.com>
---
 arch/alpha/kernel/rtc.c        |  2 +-
 arch/x86/kernel/hpet.c         |  2 +-
 arch/x86/kernel/rtc.c          |  2 +-
 drivers/base/power/trace.c     |  2 +-
 drivers/rtc/rtc-cmos.c         |  6 +++---
 drivers/rtc/rtc-mc146818-lib.c | 37 +++++++++++++++++++++++++++++--------
 include/linux/mc146818rtc.h    |  3 ++-
 7 files changed, 38 insertions(+), 16 deletions(-)

(limited to 'include/linux')

diff --git a/arch/alpha/kernel/rtc.c b/arch/alpha/kernel/rtc.c
index fb3025396ac9..cfdf90bc8b3f 100644
--- a/arch/alpha/kernel/rtc.c
+++ b/arch/alpha/kernel/rtc.c
@@ -80,7 +80,7 @@ init_rtc_epoch(void)
 static int
 alpha_rtc_read_time(struct device *dev, struct rtc_time *tm)
 {
-	int ret = mc146818_get_time(tm);
+	int ret = mc146818_get_time(tm, 10);
 
 	if (ret < 0) {
 		dev_err_ratelimited(dev, "unable to read current time\n");
diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c
index 41eecf180b7f..17adad4cbe78 100644
--- a/arch/x86/kernel/hpet.c
+++ b/arch/x86/kernel/hpet.c
@@ -1438,7 +1438,7 @@ irqreturn_t hpet_rtc_interrupt(int irq, void *dev_id)
 	memset(&curr_time, 0, sizeof(struct rtc_time));
 
 	if (hpet_rtc_flags & (RTC_UIE | RTC_AIE)) {
-		if (unlikely(mc146818_get_time(&curr_time) < 0)) {
+		if (unlikely(mc146818_get_time(&curr_time, 10) < 0)) {
 			pr_err_ratelimited("unable to read current time from RTC\n");
 			return IRQ_HANDLED;
 		}
diff --git a/arch/x86/kernel/rtc.c b/arch/x86/kernel/rtc.c
index 1309b9b05338..961ebc7f1872 100644
--- a/arch/x86/kernel/rtc.c
+++ b/arch/x86/kernel/rtc.c
@@ -67,7 +67,7 @@ void mach_get_cmos_time(struct timespec64 *now)
 		return;
 	}
 
-	if (mc146818_get_time(&tm)) {
+	if (mc146818_get_time(&tm, 10)) {
 		pr_err("Unable to read current time from RTC\n");
 		now->tv_sec = now->tv_nsec = 0;
 		return;
diff --git a/drivers/base/power/trace.c b/drivers/base/power/trace.c
index 72b7a92337b1..c2e925357474 100644
--- a/drivers/base/power/trace.c
+++ b/drivers/base/power/trace.c
@@ -120,7 +120,7 @@ static unsigned int read_magic_time(void)
 	struct rtc_time time;
 	unsigned int val;
 
-	if (mc146818_get_time(&time) < 0) {
+	if (mc146818_get_time(&time, 10) < 0) {
 		pr_err("Unable to read current time from RTC\n");
 		return 0;
 	}
diff --git a/drivers/rtc/rtc-cmos.c b/drivers/rtc/rtc-cmos.c
index 2a1a83caf081..468ccd99f218 100644
--- a/drivers/rtc/rtc-cmos.c
+++ b/drivers/rtc/rtc-cmos.c
@@ -231,7 +231,7 @@ static int cmos_read_time(struct device *dev, struct rtc_time *t)
 	if (!pm_trace_rtc_valid())
 		return -EIO;
 
-	ret = mc146818_get_time(t);
+	ret = mc146818_get_time(t, 10);
 	if (ret < 0) {
 		dev_err_ratelimited(dev, "unable to read current time\n");
 		return ret;
@@ -307,7 +307,7 @@ static int cmos_read_alarm(struct device *dev, struct rtc_wkalrm *t)
 	 *
 	 * Use the mc146818_avoid_UIP() function to avoid this.
 	 */
-	if (!mc146818_avoid_UIP(cmos_read_alarm_callback, &p))
+	if (!mc146818_avoid_UIP(cmos_read_alarm_callback, 10, &p))
 		return -EIO;
 
 	if (!(p.rtc_control & RTC_DM_BINARY) || RTC_ALWAYS_BCD) {
@@ -556,7 +556,7 @@ static int cmos_set_alarm(struct device *dev, struct rtc_wkalrm *t)
 	 *
 	 * Use mc146818_avoid_UIP() to avoid this.
 	 */
-	if (!mc146818_avoid_UIP(cmos_set_alarm_callback, &p))
+	if (!mc146818_avoid_UIP(cmos_set_alarm_callback, 10, &p))
 		return -ETIMEDOUT;
 
 	cmos->alarm_expires = rtc_tm_to_time64(&t->time);
diff --git a/drivers/rtc/rtc-mc146818-lib.c b/drivers/rtc/rtc-mc146818-lib.c
index 43a28e82674e..4c17b3cef11e 100644
--- a/drivers/rtc/rtc-mc146818-lib.c
+++ b/drivers/rtc/rtc-mc146818-lib.c
@@ -8,26 +8,31 @@
 #include <linux/acpi.h>
 #endif
 
+#define UIP_RECHECK_DELAY		100	/* usec */
+#define UIP_RECHECK_DELAY_MS		(USEC_PER_MSEC / UIP_RECHECK_DELAY)
+#define UIP_RECHECK_LOOPS_MS(x)		(x / UIP_RECHECK_DELAY_MS)
+
 /*
  * Execute a function while the UIP (Update-in-progress) bit of the RTC is
- * unset.
+ * unset. The timeout is configurable by the caller in ms.
  *
  * Warning: callback may be executed more then once.
  */
 bool mc146818_avoid_UIP(void (*callback)(unsigned char seconds, void *param),
+			int timeout,
 			void *param)
 {
 	int i;
 	unsigned long flags;
 	unsigned char seconds;
 
-	for (i = 0; i < 100; i++) {
+	for (i = 0; UIP_RECHECK_LOOPS_MS(i) < timeout; i++) {
 		spin_lock_irqsave(&rtc_lock, flags);
 
 		/*
 		 * Check whether there is an update in progress during which the
 		 * readout is unspecified. The maximum update time is ~2ms. Poll
-		 * every 100 usec for completion.
+		 * for completion.
 		 *
 		 * Store the second value before checking UIP so a long lasting
 		 * NMI which happens to hit after the UIP check cannot make
@@ -37,7 +42,7 @@ bool mc146818_avoid_UIP(void (*callback)(unsigned char seconds, void *param),
 
 		if (CMOS_READ(RTC_FREQ_SELECT) & RTC_UIP) {
 			spin_unlock_irqrestore(&rtc_lock, flags);
-			udelay(100);
+			udelay(UIP_RECHECK_DELAY);
 			continue;
 		}
 
@@ -56,7 +61,7 @@ bool mc146818_avoid_UIP(void (*callback)(unsigned char seconds, void *param),
 		 */
 		if (CMOS_READ(RTC_FREQ_SELECT) & RTC_UIP) {
 			spin_unlock_irqrestore(&rtc_lock, flags);
-			udelay(100);
+			udelay(UIP_RECHECK_DELAY);
 			continue;
 		}
 
@@ -72,6 +77,10 @@ bool mc146818_avoid_UIP(void (*callback)(unsigned char seconds, void *param),
 		}
 		spin_unlock_irqrestore(&rtc_lock, flags);
 
+		if (UIP_RECHECK_LOOPS_MS(i) >= 100)
+			pr_warn("Reading current time from RTC took around %li ms\n",
+				UIP_RECHECK_LOOPS_MS(i));
+
 		return true;
 	}
 	return false;
@@ -84,7 +93,7 @@ EXPORT_SYMBOL_GPL(mc146818_avoid_UIP);
  */
 bool mc146818_does_rtc_work(void)
 {
-	return mc146818_avoid_UIP(NULL, NULL);
+	return mc146818_avoid_UIP(NULL, 10, NULL);
 }
 EXPORT_SYMBOL_GPL(mc146818_does_rtc_work);
 
@@ -130,13 +139,25 @@ static void mc146818_get_time_callback(unsigned char seconds, void *param_in)
 	p->ctrl = CMOS_READ(RTC_CONTROL);
 }
 
-int mc146818_get_time(struct rtc_time *time)
+/**
+ * mc146818_get_time - Get the current time from the RTC
+ * @time: pointer to struct rtc_time to store the current time
+ * @timeout: timeout value in ms
+ *
+ * This function reads the current time from the RTC and stores it in the
+ * provided struct rtc_time. The timeout parameter specifies the maximum
+ * time to wait for the RTC to become ready.
+ *
+ * Return: 0 on success, -ETIMEDOUT if the RTC did not become ready within
+ * the specified timeout, or another error code if an error occurred.
+ */
+int mc146818_get_time(struct rtc_time *time, int timeout)
 {
 	struct mc146818_get_time_callback_param p = {
 		.time = time
 	};
 
-	if (!mc146818_avoid_UIP(mc146818_get_time_callback, &p)) {
+	if (!mc146818_avoid_UIP(mc146818_get_time_callback, timeout, &p)) {
 		memset(time, 0, sizeof(*time));
 		return -ETIMEDOUT;
 	}
diff --git a/include/linux/mc146818rtc.h b/include/linux/mc146818rtc.h
index b0da04fe087b..34dfcc77f505 100644
--- a/include/linux/mc146818rtc.h
+++ b/include/linux/mc146818rtc.h
@@ -126,10 +126,11 @@ struct cmos_rtc_board_info {
 #endif /* ARCH_RTC_LOCATION */
 
 bool mc146818_does_rtc_work(void);
-int mc146818_get_time(struct rtc_time *time);
+int mc146818_get_time(struct rtc_time *time, int timeout);
 int mc146818_set_time(struct rtc_time *time);
 
 bool mc146818_avoid_UIP(void (*callback)(unsigned char seconds, void *param),
+			int timeout,
 			void *param);
 
 #endif /* _MC146818RTC_H */
-- 
cgit v1.2.3


From 32da0f00ddcb101730cf242289b2b10ede0e1156 Mon Sep 17 00:00:00 2001
From: Jamal Hadi Salim <jhs@mojatatu.com>
Date: Fri, 15 Dec 2023 14:57:10 -0300
Subject: net: rtnl: introduce rcu_replace_pointer_rtnl

Introduce the rcu_replace_pointer_rtnl helper to lockdep check rtnl lock
rcu replacements, alongside the already existing helpers.

This is a quality of life helper so instead of using:
   rcu_replace_pointer(rp, p, lockdep_rtnl_is_held())
   .. or the open coded..
   rtnl_dereference() / rcu_assign_pointer()
   .. or the lazy check version ..
   rcu_replace_pointer(rp, p, 1)
Use:
   rcu_replace_pointer_rtnl(rp, p)

Signed-off-by: Jamal Hadi Salim <jhs@mojatatu.com>
Signed-off-by: Victor Nogueira <victor@mojatatu.com>
Signed-off-by: Pedro Tammela <pctammela@mojatatu.com>
Reviewed-by: Ido Schimmel <idosch@nvidia.com>
Reviewed-by: Nikolay Aleksandrov <razor@blackwall.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/rtnetlink.h | 12 ++++++++++++
 1 file changed, 12 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/rtnetlink.h b/include/linux/rtnetlink.h
index 6a8543b34e2c..410529fca18b 100644
--- a/include/linux/rtnetlink.h
+++ b/include/linux/rtnetlink.h
@@ -79,6 +79,18 @@ static inline bool lockdep_rtnl_is_held(void)
 #define rtnl_dereference(p)					\
 	rcu_dereference_protected(p, lockdep_rtnl_is_held())
 
+/**
+ * rcu_replace_pointer_rtnl - replace an RCU pointer under rtnl_lock, returning
+ * its old value
+ * @rp: RCU pointer, whose value is returned
+ * @p: regular pointer
+ *
+ * Perform a replacement under rtnl_lock, where @rp is an RCU-annotated
+ * pointer. The old value of @rp is returned, and @rp is set to @p
+ */
+#define rcu_replace_pointer_rtnl(rp, p)			\
+	rcu_replace_pointer(rp, p, lockdep_rtnl_is_held())
+
 static inline struct netdev_queue *dev_ingress_queue(struct net_device *dev)
 {
 	return rtnl_dereference(dev->ingress_queue);
-- 
cgit v1.2.3


From b2adbc9cea752539f6421e9d4642408f666c1251 Mon Sep 17 00:00:00 2001
From: Alvin Šipraga <alsi@bang-olufsen.dk>
Date: Fri, 24 Nov 2023 14:17:44 +0100
Subject: clk: si5351: allow PLLs to be adjusted without reset
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Introduce a new PLL reset mode flag which controls whether or not to
reset a PLL after adjusting its rate. The mode can be configured through
platform data or device tree.

Since commit 6dc669a22c77 ("clk: si5351: Add PLL soft reset"), the
driver unconditionally resets a PLL whenever its rate is adjusted.
The rationale was that a PLL reset was required to get three outputs
working at the same time. Before this change, the driver never reset the
PLLs.

Commit b26ff127c52c ("clk: si5351: Apply PLL soft reset before enabling
the outputs") subsequently introduced an option to reset the PLL when
enabling a clock output that sourced it. Here, the rationale was that
this is required to get a deterministic phase relationship between
multiple output clocks.

This clearly shows that it is useful to reset the PLLs in applications
where multiple clock outputs are used. However, the Si5351 also allows
for glitch-free rate adjustment of its PLLs if one avoids resetting the
PLL. In our audio application where a single Si5351 clock output is used
to supply a runtime adjustable bit clock, this unconditional PLL reset
behaviour introduces unwanted glitches in the clock output.

It would appear that the problem being solved in the former commit
may be solved by using the optional device tree property introduced in
the latter commit, obviating the need for an unconditional PLL reset
after rate adjustment. But it's not OK to break the default behaviour of
the driver, and it cannot be assumed that all device trees are using the
property introduced in the latter commit. Hence, the new behaviour is
made opt-in.

Cc: Sebastian Hesselbarth <sebastian.hesselbarth@gmail.com>
Cc: Rabeeh Khoury <rabeeh@solid-run.com>
Cc: Jacob Siverskog <jacob@teenage.engineering>
Cc: Sergej Sawazki <sergej@taudac.com>
Signed-off-by: Alvin Šipraga <alsi@bang-olufsen.dk>
Acked-by: Sebastian Hesselbarth <sebastian.hesselbarth@gmail.com>
Link: https://lore.kernel.org/r/20231124-alvin-clk-si5351-no-pll-reset-v6-3-69b82311cb90@bang-olufsen.dk
Signed-off-by: Stephen Boyd <sboyd@kernel.org>
---
 drivers/clk/clk-si5351.c             | 47 +++++++++++++++++++++++++++++++++---
 include/linux/platform_data/si5351.h |  2 ++
 2 files changed, 46 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/clk/clk-si5351.c b/drivers/clk/clk-si5351.c
index a9a0bc448a4b..4ce83c5265b8 100644
--- a/drivers/clk/clk-si5351.c
+++ b/drivers/clk/clk-si5351.c
@@ -506,6 +506,8 @@ static int si5351_pll_set_rate(struct clk_hw *hw, unsigned long rate,
 {
 	struct si5351_hw_data *hwdata =
 		container_of(hw, struct si5351_hw_data, hw);
+	struct si5351_platform_data *pdata =
+		hwdata->drvdata->client->dev.platform_data;
 	u8 reg = (hwdata->num == 0) ? SI5351_PLLA_PARAMETERS :
 		SI5351_PLLB_PARAMETERS;
 
@@ -518,9 +520,10 @@ static int si5351_pll_set_rate(struct clk_hw *hw, unsigned long rate,
 		(hwdata->params.p2 == 0) ? SI5351_CLK_INTEGER_MODE : 0);
 
 	/* Do a pll soft reset on the affected pll */
-	si5351_reg_write(hwdata->drvdata, SI5351_PLL_RESET,
-			 hwdata->num == 0 ? SI5351_PLL_RESET_A :
-					    SI5351_PLL_RESET_B);
+	if (pdata->pll_reset[hwdata->num])
+		si5351_reg_write(hwdata->drvdata, SI5351_PLL_RESET,
+				 hwdata->num == 0 ? SI5351_PLL_RESET_A :
+						    SI5351_PLL_RESET_B);
 
 	dev_dbg(&hwdata->drvdata->client->dev,
 		"%s - %s: p1 = %lu, p2 = %lu, p3 = %lu, parent_rate = %lu, rate = %lu\n",
@@ -1222,6 +1225,44 @@ static int si5351_dt_parse(struct i2c_client *client,
 		}
 	}
 
+	/*
+	 * Parse PLL reset mode. For compatibility with older device trees, the
+	 * default is to always reset a PLL after setting its rate.
+	 */
+	pdata->pll_reset[0] = true;
+	pdata->pll_reset[1] = true;
+
+	of_property_for_each_u32(np, "silabs,pll-reset-mode", prop, p, num) {
+		if (num >= 2) {
+			dev_err(&client->dev,
+				"invalid pll %d on pll-reset-mode prop\n", num);
+			return -EINVAL;
+		}
+
+		p = of_prop_next_u32(prop, p, &val);
+		if (!p) {
+			dev_err(&client->dev,
+				"missing pll-reset-mode for pll %d\n", num);
+			return -EINVAL;
+		}
+
+		switch (val) {
+		case 0:
+			/* Reset PLL whenever its rate is adjusted */
+			pdata->pll_reset[num] = true;
+			break;
+		case 1:
+			/* Don't reset PLL whenever its rate is adjusted */
+			pdata->pll_reset[num] = false;
+			break;
+		default:
+			dev_err(&client->dev,
+				"invalid pll-reset-mode %d for pll %d\n", val,
+				num);
+			return -EINVAL;
+		}
+	}
+
 	/* per clkout properties */
 	for_each_child_of_node(np, child) {
 		if (of_property_read_u32(child, "reg", &num)) {
diff --git a/include/linux/platform_data/si5351.h b/include/linux/platform_data/si5351.h
index c71a2dd66143..5f412a615532 100644
--- a/include/linux/platform_data/si5351.h
+++ b/include/linux/platform_data/si5351.h
@@ -105,10 +105,12 @@ struct si5351_clkout_config {
  * @clk_xtal: xtal input clock
  * @clk_clkin: clkin input clock
  * @pll_src: array of pll source clock setting
+ * @pll_reset: array indicating if plls should be reset after setting the rate
  * @clkout: array of clkout configuration
  */
 struct si5351_platform_data {
 	enum si5351_pll_src pll_src[2];
+	bool pll_reset[2];
 	struct si5351_clkout_config clkout[8];
 };
 
-- 
cgit v1.2.3


From 5a2a2cda916335fff4d804e58f36b2305926841e Mon Sep 17 00:00:00 2001
From: Wang Jinchao <wangjinchao@xfusion.com>
Date: Mon, 18 Dec 2023 15:16:16 +0800
Subject: gpiolib: remove duplicate inclusions

Remove second `#include <linux/err.h>`. Remove `#include <asm/errno.h>`
too as it's included by `err.h`.

Signed-off-by: Wang Jinchao <wangjinchao@xfusion.com>
Signed-off-by: Bartosz Golaszewski <bartosz.golaszewski@linaro.org>
---
 include/linux/gpio/driver.h | 3 ---
 1 file changed, 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/gpio/driver.h b/include/linux/gpio/driver.h
index bd9bea7cb270..e846bd4e7559 100644
--- a/include/linux/gpio/driver.h
+++ b/include/linux/gpio/driver.h
@@ -722,7 +722,6 @@ int gpiochip_irqchip_add_domain(struct gpio_chip *gc,
 #else
 
 #include <asm/bug.h>
-#include <asm/errno.h>
 
 static inline int gpiochip_irqchip_add_domain(struct gpio_chip *gc,
 					      struct irq_domain *domain)
@@ -811,8 +810,6 @@ const char *gpio_device_get_label(struct gpio_device *gdev);
 
 #else /* CONFIG_GPIOLIB */
 
-#include <linux/err.h>
-
 #include <asm/bug.h>
 
 static inline struct gpio_chip *gpiod_to_chip(const struct gpio_desc *desc)
-- 
cgit v1.2.3


From 58ff9c5acb4aef58e118bbf39736cc4d6c11a3d3 Mon Sep 17 00:00:00 2001
From: Bjorn Helgaas <bhelgaas@google.com>
Date: Wed, 22 Nov 2023 15:03:51 +0900
Subject: PCI: Rename PCI_IRQ_LEGACY to PCI_IRQ_INTX

Rename PCI_IRQ_LEGACY to PCI_IRQ_INTX to be more explicit about the type
of IRQ being referenced as well as to match the PCI specifications
terms. Redefine PCI_IRQ_LEGACY as an alias to PCI_IRQ_INTX to avoid the
need for doing the renaming tree-wide. New drivers and new code should
now prefer using PCI_IRQ_INTX instead of PCI_IRQ_LEGACY.

Link: https://lore.kernel.org/r/20231122060406.14695-2-dlemoal@kernel.org
Signed-off-by: Bjorn Helgaas <bhelgaas@google.com>
Signed-off-by: Damien Le Moal <dlemoal@kernel.org>
Signed-off-by: Lorenzo Pieralisi <lpieralisi@kernel.org>
Reviewed-by: Yoshihiro Shimoda <yoshihiro.shimoda.uh@renesas.com>
Reviewed-by: Serge Semin <fancer.lancer@gmail.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Acked-by: Manivannan Sadhasivam <manivannan.sadhasivam@linaro.org>
---
 include/linux/pci.h | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/pci.h b/include/linux/pci.h
index 60ca768bc867..9ab4b46c0d19 100644
--- a/include/linux/pci.h
+++ b/include/linux/pci.h
@@ -1073,11 +1073,13 @@ enum {
 	PCI_SCAN_ALL_PCIE_DEVS	= 0x00000040,	/* Scan all, not just dev 0 */
 };
 
-#define PCI_IRQ_LEGACY		(1 << 0) /* Allow legacy interrupts */
+#define PCI_IRQ_INTX		(1 << 0) /* Allow INTx interrupts */
 #define PCI_IRQ_MSI		(1 << 1) /* Allow MSI interrupts */
 #define PCI_IRQ_MSIX		(1 << 2) /* Allow MSI-X interrupts */
 #define PCI_IRQ_AFFINITY	(1 << 3) /* Auto-assign affinity */
 
+#define PCI_IRQ_LEGACY		PCI_IRQ_INTX /* Deprecated! Use PCI_IRQ_INTX */
+
 /* These external functions are only available when PCI support is enabled */
 #ifdef CONFIG_PCI
 
-- 
cgit v1.2.3


From 74955cb8ccc38539f8c029336e07e6b43b6a942e Mon Sep 17 00:00:00 2001
From: Damien Le Moal <dlemoal@kernel.org>
Date: Wed, 22 Nov 2023 15:03:52 +0900
Subject: PCI: endpoint: Drop PCI_EPC_IRQ_XXX definitions

linux/pci.h defines the IRQ flags PCI_IRQ_INTX, PCI_IRQ_MSI and
PCI_IRQ_MSIX. Let's use these flags directly instead of the endpoint
definitions provided by enum pci_epc_irq_type. This removes the need
for defining this enum type completely.

Link: https://lore.kernel.org/r/20231122060406.14695-3-dlemoal@kernel.org
Signed-off-by: Damien Le Moal <dlemoal@kernel.org>
Signed-off-by: Lorenzo Pieralisi <lpieralisi@kernel.org>
Reviewed-by: Serge Semin <fancer.lancer@gmail.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Acked-by: Manivannan Sadhasivam <manivannan.sadhasivam@linaro.org>
---
 drivers/pci/controller/cadence/pcie-cadence-ep.c  |  9 ++++-----
 drivers/pci/controller/dwc/pci-dra7xx.c           |  6 +++---
 drivers/pci/controller/dwc/pci-imx6.c             |  9 ++++-----
 drivers/pci/controller/dwc/pci-keystone.c         |  9 ++++-----
 drivers/pci/controller/dwc/pci-layerscape-ep.c    |  8 ++++----
 drivers/pci/controller/dwc/pcie-artpec6.c         |  8 ++++----
 drivers/pci/controller/dwc/pcie-designware-ep.c   |  2 +-
 drivers/pci/controller/dwc/pcie-designware-plat.c |  9 ++++-----
 drivers/pci/controller/dwc/pcie-designware.h      |  2 +-
 drivers/pci/controller/dwc/pcie-keembay.c         | 13 ++++++-------
 drivers/pci/controller/dwc/pcie-qcom-ep.c         |  6 +++---
 drivers/pci/controller/dwc/pcie-rcar-gen4.c       |  7 +++----
 drivers/pci/controller/dwc/pcie-tegra194.c        |  9 ++++-----
 drivers/pci/controller/dwc/pcie-uniphier-ep.c     |  7 +++----
 drivers/pci/controller/pcie-rcar-ep.c             |  7 +++----
 drivers/pci/controller/pcie-rockchip-ep.c         |  7 +++----
 drivers/pci/endpoint/functions/pci-epf-mhi.c      |  2 +-
 drivers/pci/endpoint/functions/pci-epf-ntb.c      |  4 ++--
 drivers/pci/endpoint/functions/pci-epf-test.c     |  6 +++---
 drivers/pci/endpoint/functions/pci-epf-vntb.c     |  7 ++-----
 drivers/pci/endpoint/pci-epc-core.c               |  2 +-
 include/linux/pci-epc.h                           | 11 ++---------
 22 files changed, 65 insertions(+), 85 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/pci/controller/cadence/pcie-cadence-ep.c b/drivers/pci/controller/cadence/pcie-cadence-ep.c
index 3142feb8ac19..3d71d687ea64 100644
--- a/drivers/pci/controller/cadence/pcie-cadence-ep.c
+++ b/drivers/pci/controller/cadence/pcie-cadence-ep.c
@@ -532,25 +532,24 @@ static int cdns_pcie_ep_send_msix_irq(struct cdns_pcie_ep *ep, u8 fn, u8 vfn,
 }
 
 static int cdns_pcie_ep_raise_irq(struct pci_epc *epc, u8 fn, u8 vfn,
-				  enum pci_epc_irq_type type,
-				  u16 interrupt_num)
+				  unsigned int type, u16 interrupt_num)
 {
 	struct cdns_pcie_ep *ep = epc_get_drvdata(epc);
 	struct cdns_pcie *pcie = &ep->pcie;
 	struct device *dev = pcie->dev;
 
 	switch (type) {
-	case PCI_EPC_IRQ_LEGACY:
+	case PCI_IRQ_INTX:
 		if (vfn > 0) {
 			dev_err(dev, "Cannot raise legacy interrupts for VF\n");
 			return -EINVAL;
 		}
 		return cdns_pcie_ep_send_legacy_irq(ep, fn, vfn, 0);
 
-	case PCI_EPC_IRQ_MSI:
+	case PCI_IRQ_MSI:
 		return cdns_pcie_ep_send_msi_irq(ep, fn, vfn, interrupt_num);
 
-	case PCI_EPC_IRQ_MSIX:
+	case PCI_IRQ_MSIX:
 		return cdns_pcie_ep_send_msix_irq(ep, fn, vfn, interrupt_num);
 
 	default:
diff --git a/drivers/pci/controller/dwc/pci-dra7xx.c b/drivers/pci/controller/dwc/pci-dra7xx.c
index b445ffe95e3f..f257a42f3314 100644
--- a/drivers/pci/controller/dwc/pci-dra7xx.c
+++ b/drivers/pci/controller/dwc/pci-dra7xx.c
@@ -404,16 +404,16 @@ static void dra7xx_pcie_raise_msi_irq(struct dra7xx_pcie *dra7xx,
 }
 
 static int dra7xx_pcie_raise_irq(struct dw_pcie_ep *ep, u8 func_no,
-				 enum pci_epc_irq_type type, u16 interrupt_num)
+				 unsigned int type, u16 interrupt_num)
 {
 	struct dw_pcie *pci = to_dw_pcie_from_ep(ep);
 	struct dra7xx_pcie *dra7xx = to_dra7xx_pcie(pci);
 
 	switch (type) {
-	case PCI_EPC_IRQ_LEGACY:
+	case PCI_IRQ_INTX:
 		dra7xx_pcie_raise_legacy_irq(dra7xx);
 		break;
-	case PCI_EPC_IRQ_MSI:
+	case PCI_IRQ_MSI:
 		dra7xx_pcie_raise_msi_irq(dra7xx, interrupt_num);
 		break;
 	default:
diff --git a/drivers/pci/controller/dwc/pci-imx6.c b/drivers/pci/controller/dwc/pci-imx6.c
index 74703362aeec..a5365ab8897e 100644
--- a/drivers/pci/controller/dwc/pci-imx6.c
+++ b/drivers/pci/controller/dwc/pci-imx6.c
@@ -1058,17 +1058,16 @@ static void imx6_pcie_ep_init(struct dw_pcie_ep *ep)
 }
 
 static int imx6_pcie_ep_raise_irq(struct dw_pcie_ep *ep, u8 func_no,
-				  enum pci_epc_irq_type type,
-				  u16 interrupt_num)
+				  unsigned int type, u16 interrupt_num)
 {
 	struct dw_pcie *pci = to_dw_pcie_from_ep(ep);
 
 	switch (type) {
-	case PCI_EPC_IRQ_LEGACY:
+	case PCI_IRQ_INTX:
 		return dw_pcie_ep_raise_legacy_irq(ep, func_no);
-	case PCI_EPC_IRQ_MSI:
+	case PCI_IRQ_MSI:
 		return dw_pcie_ep_raise_msi_irq(ep, func_no, interrupt_num);
-	case PCI_EPC_IRQ_MSIX:
+	case PCI_IRQ_MSIX:
 		return dw_pcie_ep_raise_msix_irq(ep, func_no, interrupt_num);
 	default:
 		dev_err(pci->dev, "UNKNOWN IRQ type\n");
diff --git a/drivers/pci/controller/dwc/pci-keystone.c b/drivers/pci/controller/dwc/pci-keystone.c
index 0def919f89fa..84f6a4acee07 100644
--- a/drivers/pci/controller/dwc/pci-keystone.c
+++ b/drivers/pci/controller/dwc/pci-keystone.c
@@ -900,20 +900,19 @@ static void ks_pcie_am654_raise_legacy_irq(struct keystone_pcie *ks_pcie)
 }
 
 static int ks_pcie_am654_raise_irq(struct dw_pcie_ep *ep, u8 func_no,
-				   enum pci_epc_irq_type type,
-				   u16 interrupt_num)
+				   unsigned int type, u16 interrupt_num)
 {
 	struct dw_pcie *pci = to_dw_pcie_from_ep(ep);
 	struct keystone_pcie *ks_pcie = to_keystone_pcie(pci);
 
 	switch (type) {
-	case PCI_EPC_IRQ_LEGACY:
+	case PCI_IRQ_INTX:
 		ks_pcie_am654_raise_legacy_irq(ks_pcie);
 		break;
-	case PCI_EPC_IRQ_MSI:
+	case PCI_IRQ_MSI:
 		dw_pcie_ep_raise_msi_irq(ep, func_no, interrupt_num);
 		break;
-	case PCI_EPC_IRQ_MSIX:
+	case PCI_IRQ_MSIX:
 		dw_pcie_ep_raise_msix_irq(ep, func_no, interrupt_num);
 		break;
 	default:
diff --git a/drivers/pci/controller/dwc/pci-layerscape-ep.c b/drivers/pci/controller/dwc/pci-layerscape-ep.c
index 3d3c50ef4b6f..5f78a9415286 100644
--- a/drivers/pci/controller/dwc/pci-layerscape-ep.c
+++ b/drivers/pci/controller/dwc/pci-layerscape-ep.c
@@ -166,16 +166,16 @@ static void ls_pcie_ep_init(struct dw_pcie_ep *ep)
 }
 
 static int ls_pcie_ep_raise_irq(struct dw_pcie_ep *ep, u8 func_no,
-				enum pci_epc_irq_type type, u16 interrupt_num)
+				unsigned int type, u16 interrupt_num)
 {
 	struct dw_pcie *pci = to_dw_pcie_from_ep(ep);
 
 	switch (type) {
-	case PCI_EPC_IRQ_LEGACY:
+	case PCI_IRQ_INTX:
 		return dw_pcie_ep_raise_legacy_irq(ep, func_no);
-	case PCI_EPC_IRQ_MSI:
+	case PCI_IRQ_MSI:
 		return dw_pcie_ep_raise_msi_irq(ep, func_no, interrupt_num);
-	case PCI_EPC_IRQ_MSIX:
+	case PCI_IRQ_MSIX:
 		return dw_pcie_ep_raise_msix_irq_doorbell(ep, func_no,
 							  interrupt_num);
 	default:
diff --git a/drivers/pci/controller/dwc/pcie-artpec6.c b/drivers/pci/controller/dwc/pcie-artpec6.c
index 9b572a2b2c9a..fc426182443a 100644
--- a/drivers/pci/controller/dwc/pcie-artpec6.c
+++ b/drivers/pci/controller/dwc/pcie-artpec6.c
@@ -352,15 +352,15 @@ static void artpec6_pcie_ep_init(struct dw_pcie_ep *ep)
 }
 
 static int artpec6_pcie_raise_irq(struct dw_pcie_ep *ep, u8 func_no,
-				  enum pci_epc_irq_type type, u16 interrupt_num)
+				  unsigned int type, u16 interrupt_num)
 {
 	struct dw_pcie *pci = to_dw_pcie_from_ep(ep);
 
 	switch (type) {
-	case PCI_EPC_IRQ_LEGACY:
-		dev_err(pci->dev, "EP cannot trigger legacy IRQs\n");
+	case PCI_IRQ_INTX:
+		dev_err(pci->dev, "EP cannot trigger INTx IRQs\n");
 		return -EINVAL;
-	case PCI_EPC_IRQ_MSI:
+	case PCI_IRQ_MSI:
 		return dw_pcie_ep_raise_msi_irq(ep, func_no, interrupt_num);
 	default:
 		dev_err(pci->dev, "UNKNOWN IRQ type\n");
diff --git a/drivers/pci/controller/dwc/pcie-designware-ep.c b/drivers/pci/controller/dwc/pcie-designware-ep.c
index f6207989fc6a..87759c899fab 100644
--- a/drivers/pci/controller/dwc/pcie-designware-ep.c
+++ b/drivers/pci/controller/dwc/pcie-designware-ep.c
@@ -443,7 +443,7 @@ static int dw_pcie_ep_set_msix(struct pci_epc *epc, u8 func_no, u8 vfunc_no,
 }
 
 static int dw_pcie_ep_raise_irq(struct pci_epc *epc, u8 func_no, u8 vfunc_no,
-				enum pci_epc_irq_type type, u16 interrupt_num)
+				unsigned int type, u16 interrupt_num)
 {
 	struct dw_pcie_ep *ep = epc_get_drvdata(epc);
 
diff --git a/drivers/pci/controller/dwc/pcie-designware-plat.c b/drivers/pci/controller/dwc/pcie-designware-plat.c
index b625841e98aa..c83968aa0149 100644
--- a/drivers/pci/controller/dwc/pcie-designware-plat.c
+++ b/drivers/pci/controller/dwc/pcie-designware-plat.c
@@ -42,17 +42,16 @@ static void dw_plat_pcie_ep_init(struct dw_pcie_ep *ep)
 }
 
 static int dw_plat_pcie_ep_raise_irq(struct dw_pcie_ep *ep, u8 func_no,
-				     enum pci_epc_irq_type type,
-				     u16 interrupt_num)
+				     unsigned int type, u16 interrupt_num)
 {
 	struct dw_pcie *pci = to_dw_pcie_from_ep(ep);
 
 	switch (type) {
-	case PCI_EPC_IRQ_LEGACY:
+	case PCI_IRQ_INTX:
 		return dw_pcie_ep_raise_legacy_irq(ep, func_no);
-	case PCI_EPC_IRQ_MSI:
+	case PCI_IRQ_MSI:
 		return dw_pcie_ep_raise_msi_irq(ep, func_no, interrupt_num);
-	case PCI_EPC_IRQ_MSIX:
+	case PCI_IRQ_MSIX:
 		return dw_pcie_ep_raise_msix_irq(ep, func_no, interrupt_num);
 	default:
 		dev_err(pci->dev, "UNKNOWN IRQ type\n");
diff --git a/drivers/pci/controller/dwc/pcie-designware.h b/drivers/pci/controller/dwc/pcie-designware.h
index 55ff76e3d384..ffb9a62f3179 100644
--- a/drivers/pci/controller/dwc/pcie-designware.h
+++ b/drivers/pci/controller/dwc/pcie-designware.h
@@ -335,7 +335,7 @@ struct dw_pcie_ep_ops {
 	void	(*ep_init)(struct dw_pcie_ep *ep);
 	void	(*deinit)(struct dw_pcie_ep *ep);
 	int	(*raise_irq)(struct dw_pcie_ep *ep, u8 func_no,
-			     enum pci_epc_irq_type type, u16 interrupt_num);
+			     unsigned int type, u16 interrupt_num);
 	const struct pci_epc_features* (*get_features)(struct dw_pcie_ep *ep);
 	/*
 	 * Provide a method to implement the different func config space
diff --git a/drivers/pci/controller/dwc/pcie-keembay.c b/drivers/pci/controller/dwc/pcie-keembay.c
index 289bff99d762..8e0e2e28ef67 100644
--- a/drivers/pci/controller/dwc/pcie-keembay.c
+++ b/drivers/pci/controller/dwc/pcie-keembay.c
@@ -289,19 +289,18 @@ static void keembay_pcie_ep_init(struct dw_pcie_ep *ep)
 }
 
 static int keembay_pcie_ep_raise_irq(struct dw_pcie_ep *ep, u8 func_no,
-				     enum pci_epc_irq_type type,
-				     u16 interrupt_num)
+				     unsigned int type, u16 interrupt_num)
 {
 	struct dw_pcie *pci = to_dw_pcie_from_ep(ep);
 
 	switch (type) {
-	case PCI_EPC_IRQ_LEGACY:
-		/* Legacy interrupts are not supported in Keem Bay */
-		dev_err(pci->dev, "Legacy IRQ is not supported\n");
+	case PCI_IRQ_INTX:
+		/* INTx interrupts are not supported in Keem Bay */
+		dev_err(pci->dev, "INTx IRQ is not supported\n");
 		return -EINVAL;
-	case PCI_EPC_IRQ_MSI:
+	case PCI_IRQ_MSI:
 		return dw_pcie_ep_raise_msi_irq(ep, func_no, interrupt_num);
-	case PCI_EPC_IRQ_MSIX:
+	case PCI_IRQ_MSIX:
 		return dw_pcie_ep_raise_msix_irq(ep, func_no, interrupt_num);
 	default:
 		dev_err(pci->dev, "Unknown IRQ type %d\n", type);
diff --git a/drivers/pci/controller/dwc/pcie-qcom-ep.c b/drivers/pci/controller/dwc/pcie-qcom-ep.c
index 9e58f055199a..2e5ab5fef310 100644
--- a/drivers/pci/controller/dwc/pcie-qcom-ep.c
+++ b/drivers/pci/controller/dwc/pcie-qcom-ep.c
@@ -726,14 +726,14 @@ static int qcom_pcie_ep_enable_irq_resources(struct platform_device *pdev,
 }
 
 static int qcom_pcie_ep_raise_irq(struct dw_pcie_ep *ep, u8 func_no,
-				  enum pci_epc_irq_type type, u16 interrupt_num)
+				  unsigned int type, u16 interrupt_num)
 {
 	struct dw_pcie *pci = to_dw_pcie_from_ep(ep);
 
 	switch (type) {
-	case PCI_EPC_IRQ_LEGACY:
+	case PCI_IRQ_INTX:
 		return dw_pcie_ep_raise_legacy_irq(ep, func_no);
-	case PCI_EPC_IRQ_MSI:
+	case PCI_IRQ_MSI:
 		return dw_pcie_ep_raise_msi_irq(ep, func_no, interrupt_num);
 	default:
 		dev_err(pci->dev, "Unknown IRQ type\n");
diff --git a/drivers/pci/controller/dwc/pcie-rcar-gen4.c b/drivers/pci/controller/dwc/pcie-rcar-gen4.c
index 3bc45e513b3d..25354a82674d 100644
--- a/drivers/pci/controller/dwc/pcie-rcar-gen4.c
+++ b/drivers/pci/controller/dwc/pcie-rcar-gen4.c
@@ -362,15 +362,14 @@ static void rcar_gen4_pcie_ep_deinit(struct dw_pcie_ep *ep)
 }
 
 static int rcar_gen4_pcie_ep_raise_irq(struct dw_pcie_ep *ep, u8 func_no,
-				       enum pci_epc_irq_type type,
-				       u16 interrupt_num)
+				       unsigned int type, u16 interrupt_num)
 {
 	struct dw_pcie *dw = to_dw_pcie_from_ep(ep);
 
 	switch (type) {
-	case PCI_EPC_IRQ_LEGACY:
+	case PCI_IRQ_INTX:
 		return dw_pcie_ep_raise_legacy_irq(ep, func_no);
-	case PCI_EPC_IRQ_MSI:
+	case PCI_IRQ_MSI:
 		return dw_pcie_ep_raise_msi_irq(ep, func_no, interrupt_num);
 	default:
 		dev_err(dw->dev, "Unknown IRQ type\n");
diff --git a/drivers/pci/controller/dwc/pcie-tegra194.c b/drivers/pci/controller/dwc/pcie-tegra194.c
index 0fe113598ebb..a1f37d2d7798 100644
--- a/drivers/pci/controller/dwc/pcie-tegra194.c
+++ b/drivers/pci/controller/dwc/pcie-tegra194.c
@@ -1979,20 +1979,19 @@ static int tegra_pcie_ep_raise_msix_irq(struct tegra_pcie_dw *pcie, u16 irq)
 }
 
 static int tegra_pcie_ep_raise_irq(struct dw_pcie_ep *ep, u8 func_no,
-				   enum pci_epc_irq_type type,
-				   u16 interrupt_num)
+				   unsigned int type, u16 interrupt_num)
 {
 	struct dw_pcie *pci = to_dw_pcie_from_ep(ep);
 	struct tegra_pcie_dw *pcie = to_tegra_pcie(pci);
 
 	switch (type) {
-	case PCI_EPC_IRQ_LEGACY:
+	case PCI_IRQ_INTX:
 		return tegra_pcie_ep_raise_legacy_irq(pcie, interrupt_num);
 
-	case PCI_EPC_IRQ_MSI:
+	case PCI_IRQ_MSI:
 		return tegra_pcie_ep_raise_msi_irq(pcie, interrupt_num);
 
-	case PCI_EPC_IRQ_MSIX:
+	case PCI_IRQ_MSIX:
 		return tegra_pcie_ep_raise_msix_irq(pcie, interrupt_num);
 
 	default:
diff --git a/drivers/pci/controller/dwc/pcie-uniphier-ep.c b/drivers/pci/controller/dwc/pcie-uniphier-ep.c
index cba3c88fcf39..d47236d5678d 100644
--- a/drivers/pci/controller/dwc/pcie-uniphier-ep.c
+++ b/drivers/pci/controller/dwc/pcie-uniphier-ep.c
@@ -256,15 +256,14 @@ static int uniphier_pcie_ep_raise_msi_irq(struct dw_pcie_ep *ep,
 }
 
 static int uniphier_pcie_ep_raise_irq(struct dw_pcie_ep *ep, u8 func_no,
-				      enum pci_epc_irq_type type,
-				      u16 interrupt_num)
+				      unsigned int type, u16 interrupt_num)
 {
 	struct dw_pcie *pci = to_dw_pcie_from_ep(ep);
 
 	switch (type) {
-	case PCI_EPC_IRQ_LEGACY:
+	case PCI_IRQ_INTX:
 		return uniphier_pcie_ep_raise_legacy_irq(ep);
-	case PCI_EPC_IRQ_MSI:
+	case PCI_IRQ_MSI:
 		return uniphier_pcie_ep_raise_msi_irq(ep, func_no,
 						      interrupt_num);
 	default:
diff --git a/drivers/pci/controller/pcie-rcar-ep.c b/drivers/pci/controller/pcie-rcar-ep.c
index 7034c0ff23d0..e6909271def7 100644
--- a/drivers/pci/controller/pcie-rcar-ep.c
+++ b/drivers/pci/controller/pcie-rcar-ep.c
@@ -402,16 +402,15 @@ static int rcar_pcie_ep_assert_msi(struct rcar_pcie *pcie,
 }
 
 static int rcar_pcie_ep_raise_irq(struct pci_epc *epc, u8 fn, u8 vfn,
-				  enum pci_epc_irq_type type,
-				  u16 interrupt_num)
+				  unsigned int type, u16 interrupt_num)
 {
 	struct rcar_pcie_endpoint *ep = epc_get_drvdata(epc);
 
 	switch (type) {
-	case PCI_EPC_IRQ_LEGACY:
+	case PCI_IRQ_INTX:
 		return rcar_pcie_ep_assert_intx(ep, fn, 0);
 
-	case PCI_EPC_IRQ_MSI:
+	case PCI_IRQ_MSI:
 		return rcar_pcie_ep_assert_msi(&ep->pcie, fn, interrupt_num);
 
 	default:
diff --git a/drivers/pci/controller/pcie-rockchip-ep.c b/drivers/pci/controller/pcie-rockchip-ep.c
index 0af0e965fb57..95b1c8ef59c3 100644
--- a/drivers/pci/controller/pcie-rockchip-ep.c
+++ b/drivers/pci/controller/pcie-rockchip-ep.c
@@ -407,15 +407,14 @@ static int rockchip_pcie_ep_send_msi_irq(struct rockchip_pcie_ep *ep, u8 fn,
 }
 
 static int rockchip_pcie_ep_raise_irq(struct pci_epc *epc, u8 fn, u8 vfn,
-				      enum pci_epc_irq_type type,
-				      u16 interrupt_num)
+				      unsigned int type, u16 interrupt_num)
 {
 	struct rockchip_pcie_ep *ep = epc_get_drvdata(epc);
 
 	switch (type) {
-	case PCI_EPC_IRQ_LEGACY:
+	case PCI_IRQ_INTX:
 		return rockchip_pcie_ep_send_legacy_irq(ep, fn, 0);
-	case PCI_EPC_IRQ_MSI:
+	case PCI_IRQ_MSI:
 		return rockchip_pcie_ep_send_msi_irq(ep, fn, interrupt_num);
 	default:
 		return -EINVAL;
diff --git a/drivers/pci/endpoint/functions/pci-epf-mhi.c b/drivers/pci/endpoint/functions/pci-epf-mhi.c
index b7b9d3e21f97..b9f8d2858cb7 100644
--- a/drivers/pci/endpoint/functions/pci-epf-mhi.c
+++ b/drivers/pci/endpoint/functions/pci-epf-mhi.c
@@ -205,7 +205,7 @@ static void pci_epf_mhi_raise_irq(struct mhi_ep_cntrl *mhi_cntrl, u32 vector)
 	 * MHI supplies 0 based MSI vectors but the API expects the vector
 	 * number to start from 1, so we need to increment the vector by 1.
 	 */
-	pci_epc_raise_irq(epc, epf->func_no, epf->vfunc_no, PCI_EPC_IRQ_MSI,
+	pci_epc_raise_irq(epc, epf->func_no, epf->vfunc_no, PCI_IRQ_MSI,
 			  vector + 1);
 }
 
diff --git a/drivers/pci/endpoint/functions/pci-epf-ntb.c b/drivers/pci/endpoint/functions/pci-epf-ntb.c
index 9aac2c6f3bb9..fad00b1a8335 100644
--- a/drivers/pci/endpoint/functions/pci-epf-ntb.c
+++ b/drivers/pci/endpoint/functions/pci-epf-ntb.c
@@ -140,9 +140,9 @@ static struct pci_epf_header epf_ntb_header = {
 static int epf_ntb_link_up(struct epf_ntb *ntb, bool link_up)
 {
 	enum pci_epc_interface_type type;
-	enum pci_epc_irq_type irq_type;
 	struct epf_ntb_epc *ntb_epc;
 	struct epf_ntb_ctrl *ctrl;
+	unsigned int irq_type;
 	struct pci_epc *epc;
 	u8 func_no, vfunc_no;
 	bool is_msix;
@@ -159,7 +159,7 @@ static int epf_ntb_link_up(struct epf_ntb *ntb, bool link_up)
 			ctrl->link_status |= LINK_STATUS_UP;
 		else
 			ctrl->link_status &= ~LINK_STATUS_UP;
-		irq_type = is_msix ? PCI_EPC_IRQ_MSIX : PCI_EPC_IRQ_MSI;
+		irq_type = is_msix ? PCI_IRQ_MSIX : PCI_IRQ_MSI;
 		ret = pci_epc_raise_irq(epc, func_no, vfunc_no, irq_type, 1);
 		if (ret) {
 			dev_err(&epc->dev,
diff --git a/drivers/pci/endpoint/functions/pci-epf-test.c b/drivers/pci/endpoint/functions/pci-epf-test.c
index 1f0d2b84296a..9d39fda5c348 100644
--- a/drivers/pci/endpoint/functions/pci-epf-test.c
+++ b/drivers/pci/endpoint/functions/pci-epf-test.c
@@ -602,7 +602,7 @@ static void pci_epf_test_raise_irq(struct pci_epf_test *epf_test,
 	switch (reg->irq_type) {
 	case IRQ_TYPE_LEGACY:
 		pci_epc_raise_irq(epc, epf->func_no, epf->vfunc_no,
-				  PCI_EPC_IRQ_LEGACY, 0);
+				  PCI_IRQ_INTX, 0);
 		break;
 	case IRQ_TYPE_MSI:
 		count = pci_epc_get_msi(epc, epf->func_no, epf->vfunc_no);
@@ -612,7 +612,7 @@ static void pci_epf_test_raise_irq(struct pci_epf_test *epf_test,
 			return;
 		}
 		pci_epc_raise_irq(epc, epf->func_no, epf->vfunc_no,
-				  PCI_EPC_IRQ_MSI, reg->irq_number);
+				  PCI_IRQ_MSI, reg->irq_number);
 		break;
 	case IRQ_TYPE_MSIX:
 		count = pci_epc_get_msix(epc, epf->func_no, epf->vfunc_no);
@@ -622,7 +622,7 @@ static void pci_epf_test_raise_irq(struct pci_epf_test *epf_test,
 			return;
 		}
 		pci_epc_raise_irq(epc, epf->func_no, epf->vfunc_no,
-				  PCI_EPC_IRQ_MSIX, reg->irq_number);
+				  PCI_IRQ_MSIX, reg->irq_number);
 		break;
 	default:
 		dev_err(dev, "Failed to raise IRQ, unknown type\n");
diff --git a/drivers/pci/endpoint/functions/pci-epf-vntb.c b/drivers/pci/endpoint/functions/pci-epf-vntb.c
index 3f60128560ed..e6a0bcc177e4 100644
--- a/drivers/pci/endpoint/functions/pci-epf-vntb.c
+++ b/drivers/pci/endpoint/functions/pci-epf-vntb.c
@@ -1172,11 +1172,8 @@ static int vntb_epf_peer_db_set(struct ntb_dev *ndev, u64 db_bits)
 	func_no = ntb->epf->func_no;
 	vfunc_no = ntb->epf->vfunc_no;
 
-	ret = pci_epc_raise_irq(ntb->epf->epc,
-				func_no,
-				vfunc_no,
-				PCI_EPC_IRQ_MSI,
-				interrupt_num + 1);
+	ret = pci_epc_raise_irq(ntb->epf->epc, func_no, vfunc_no,
+				PCI_IRQ_MSI, interrupt_num + 1);
 	if (ret)
 		dev_err(&ntb->ntb.dev, "Failed to raise IRQ\n");
 
diff --git a/drivers/pci/endpoint/pci-epc-core.c b/drivers/pci/endpoint/pci-epc-core.c
index 56e1184bc6c2..0810420df42c 100644
--- a/drivers/pci/endpoint/pci-epc-core.c
+++ b/drivers/pci/endpoint/pci-epc-core.c
@@ -217,7 +217,7 @@ EXPORT_SYMBOL_GPL(pci_epc_start);
  * Invoke to raise an legacy, MSI or MSI-X interrupt
  */
 int pci_epc_raise_irq(struct pci_epc *epc, u8 func_no, u8 vfunc_no,
-		      enum pci_epc_irq_type type, u16 interrupt_num)
+		      unsigned int type, u16 interrupt_num)
 {
 	int ret;
 
diff --git a/include/linux/pci-epc.h b/include/linux/pci-epc.h
index 5cb694031072..f498f9aa2ab0 100644
--- a/include/linux/pci-epc.h
+++ b/include/linux/pci-epc.h
@@ -19,13 +19,6 @@ enum pci_epc_interface_type {
 	SECONDARY_INTERFACE,
 };
 
-enum pci_epc_irq_type {
-	PCI_EPC_IRQ_UNKNOWN,
-	PCI_EPC_IRQ_LEGACY,
-	PCI_EPC_IRQ_MSI,
-	PCI_EPC_IRQ_MSIX,
-};
-
 static inline const char *
 pci_epc_interface_string(enum pci_epc_interface_type type)
 {
@@ -79,7 +72,7 @@ struct pci_epc_ops {
 			    u16 interrupts, enum pci_barno, u32 offset);
 	int	(*get_msix)(struct pci_epc *epc, u8 func_no, u8 vfunc_no);
 	int	(*raise_irq)(struct pci_epc *epc, u8 func_no, u8 vfunc_no,
-			     enum pci_epc_irq_type type, u16 interrupt_num);
+			     unsigned int type, u16 interrupt_num);
 	int	(*map_msi_irq)(struct pci_epc *epc, u8 func_no, u8 vfunc_no,
 			       phys_addr_t phys_addr, u8 interrupt_num,
 			       u32 entry_size, u32 *msi_data,
@@ -229,7 +222,7 @@ int pci_epc_map_msi_irq(struct pci_epc *epc, u8 func_no, u8 vfunc_no,
 			phys_addr_t phys_addr, u8 interrupt_num,
 			u32 entry_size, u32 *msi_data, u32 *msi_addr_offset);
 int pci_epc_raise_irq(struct pci_epc *epc, u8 func_no, u8 vfunc_no,
-		      enum pci_epc_irq_type type, u16 interrupt_num);
+		      unsigned int type, u16 interrupt_num);
 int pci_epc_start(struct pci_epc *epc);
 void pci_epc_stop(struct pci_epc *epc);
 const struct pci_epc_features *pci_epc_get_features(struct pci_epc *epc,
-- 
cgit v1.2.3


From 3314f2097dee43defc20554f961a8b17f4787e2d Mon Sep 17 00:00:00 2001
From: Jesse Brandeburg <jesse.brandeburg@intel.com>
Date: Tue, 5 Dec 2023 17:01:01 -0800
Subject: intel: add bit macro includes where needed

This series is introducing the use of FIELD_GET and FIELD_PREP which
requires bitfield.h to be included. Fix all the includes in this one
change, and rearrange includes into alphabetical order to ease
readability and future maintenance.

virtchnl.h and it's usage was modified to have it's own includes as it
should. This required including bits.h for virtchnl.h.

Reviewed-by: Marcin Szycik <marcin.szycik@linux.intel.com>
Signed-off-by: Jesse Brandeburg <jesse.brandeburg@intel.com>
Signed-off-by: Tony Nguyen <anthony.l.nguyen@intel.com>
---
 drivers/net/ethernet/intel/e1000/e1000_hw.c    |  1 +
 drivers/net/ethernet/intel/fm10k/fm10k_pf.c    |  1 +
 drivers/net/ethernet/intel/fm10k/fm10k_vf.c    |  1 +
 drivers/net/ethernet/intel/i40e/i40e_common.c  |  1 +
 drivers/net/ethernet/intel/i40e/i40e_dcb.c     |  2 ++
 drivers/net/ethernet/intel/i40e/i40e_nvm.c     |  1 +
 drivers/net/ethernet/intel/iavf/iavf_common.c  |  3 ++-
 drivers/net/ethernet/intel/iavf/iavf_ethtool.c |  5 +++--
 drivers/net/ethernet/intel/iavf/iavf_fdir.c    |  1 +
 drivers/net/ethernet/intel/iavf/iavf_txrx.c    |  1 +
 drivers/net/ethernet/intel/igb/e1000_i210.c    |  4 ++--
 drivers/net/ethernet/intel/igb/e1000_nvm.c     |  4 ++--
 drivers/net/ethernet/intel/igb/e1000_phy.c     |  4 ++--
 drivers/net/ethernet/intel/igbvf/netdev.c      | 28 +++++++++++++-------------
 drivers/net/ethernet/intel/igc/igc_i225.c      |  1 +
 drivers/net/ethernet/intel/igc/igc_phy.c       |  1 +
 include/linux/avf/virtchnl.h                   |  1 +
 17 files changed, 37 insertions(+), 23 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/ethernet/intel/e1000/e1000_hw.c b/drivers/net/ethernet/intel/e1000/e1000_hw.c
index 4542e2bc28e8..4576511c99f5 100644
--- a/drivers/net/ethernet/intel/e1000/e1000_hw.c
+++ b/drivers/net/ethernet/intel/e1000/e1000_hw.c
@@ -5,6 +5,7 @@
  * Shared functions for accessing and configuring the MAC
  */
 
+#include <linux/bitfield.h>
 #include "e1000.h"
 
 static s32 e1000_check_downshift(struct e1000_hw *hw);
diff --git a/drivers/net/ethernet/intel/fm10k/fm10k_pf.c b/drivers/net/ethernet/intel/fm10k/fm10k_pf.c
index af1b0cde3670..ae700a1807c6 100644
--- a/drivers/net/ethernet/intel/fm10k/fm10k_pf.c
+++ b/drivers/net/ethernet/intel/fm10k/fm10k_pf.c
@@ -1,6 +1,7 @@
 // SPDX-License-Identifier: GPL-2.0
 /* Copyright(c) 2013 - 2019 Intel Corporation. */
 
+#include <linux/bitfield.h>
 #include "fm10k_pf.h"
 #include "fm10k_vf.h"
 
diff --git a/drivers/net/ethernet/intel/fm10k/fm10k_vf.c b/drivers/net/ethernet/intel/fm10k/fm10k_vf.c
index dc8ccd378ec9..c50928ec14ff 100644
--- a/drivers/net/ethernet/intel/fm10k/fm10k_vf.c
+++ b/drivers/net/ethernet/intel/fm10k/fm10k_vf.c
@@ -1,6 +1,7 @@
 // SPDX-License-Identifier: GPL-2.0
 /* Copyright(c) 2013 - 2019 Intel Corporation. */
 
+#include <linux/bitfield.h>
 #include "fm10k_vf.h"
 
 /**
diff --git a/drivers/net/ethernet/intel/i40e/i40e_common.c b/drivers/net/ethernet/intel/i40e/i40e_common.c
index bd52b73cf61f..522cf2e5f365 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_common.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_common.c
@@ -2,6 +2,7 @@
 /* Copyright(c) 2013 - 2021 Intel Corporation. */
 
 #include <linux/avf/virtchnl.h>
+#include <linux/bitfield.h>
 #include <linux/delay.h>
 #include <linux/etherdevice.h>
 #include <linux/pci.h>
diff --git a/drivers/net/ethernet/intel/i40e/i40e_dcb.c b/drivers/net/ethernet/intel/i40e/i40e_dcb.c
index 498728e16a37..0d334036ab8b 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_dcb.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_dcb.c
@@ -1,6 +1,8 @@
 // SPDX-License-Identifier: GPL-2.0
 /* Copyright(c) 2013 - 2021 Intel Corporation. */
 
+#include <linux/bitfield.h>
+#include "i40e_adminq.h"
 #include "i40e_alloc.h"
 #include "i40e_dcb.h"
 #include "i40e_prototype.h"
diff --git a/drivers/net/ethernet/intel/i40e/i40e_nvm.c b/drivers/net/ethernet/intel/i40e/i40e_nvm.c
index 62eb34871094..157eacfdc918 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_nvm.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_nvm.c
@@ -1,6 +1,7 @@
 // SPDX-License-Identifier: GPL-2.0
 /* Copyright(c) 2013 - 2018 Intel Corporation. */
 
+#include <linux/bitfield.h>
 #include <linux/delay.h>
 #include "i40e_alloc.h"
 #include "i40e_prototype.h"
diff --git a/drivers/net/ethernet/intel/iavf/iavf_common.c b/drivers/net/ethernet/intel/iavf/iavf_common.c
index 89d2bce529ae..af5cc69f26e3 100644
--- a/drivers/net/ethernet/intel/iavf/iavf_common.c
+++ b/drivers/net/ethernet/intel/iavf/iavf_common.c
@@ -1,10 +1,11 @@
 // SPDX-License-Identifier: GPL-2.0
 /* Copyright(c) 2013 - 2018 Intel Corporation. */
 
+#include <linux/avf/virtchnl.h>
+#include <linux/bitfield.h>
 #include "iavf_type.h"
 #include "iavf_adminq.h"
 #include "iavf_prototype.h"
-#include <linux/avf/virtchnl.h>
 
 /**
  * iavf_aq_str - convert AQ err code to a string
diff --git a/drivers/net/ethernet/intel/iavf/iavf_ethtool.c b/drivers/net/ethernet/intel/iavf/iavf_ethtool.c
index cdb4849f5db4..bffb45802dab 100644
--- a/drivers/net/ethernet/intel/iavf/iavf_ethtool.c
+++ b/drivers/net/ethernet/intel/iavf/iavf_ethtool.c
@@ -1,11 +1,12 @@
 // SPDX-License-Identifier: GPL-2.0
 /* Copyright(c) 2013 - 2018 Intel Corporation. */
 
+#include <linux/bitfield.h>
+#include <linux/uaccess.h>
+
 /* ethtool support for iavf */
 #include "iavf.h"
 
-#include <linux/uaccess.h>
-
 /* ethtool statistics helpers */
 
 /**
diff --git a/drivers/net/ethernet/intel/iavf/iavf_fdir.c b/drivers/net/ethernet/intel/iavf/iavf_fdir.c
index 03e774bd2a5b..65ddcd81c993 100644
--- a/drivers/net/ethernet/intel/iavf/iavf_fdir.c
+++ b/drivers/net/ethernet/intel/iavf/iavf_fdir.c
@@ -3,6 +3,7 @@
 
 /* flow director ethtool support for iavf */
 
+#include <linux/bitfield.h>
 #include "iavf.h"
 
 #define GTPU_PORT	2152
diff --git a/drivers/net/ethernet/intel/iavf/iavf_txrx.c b/drivers/net/ethernet/intel/iavf/iavf_txrx.c
index d64c4997136b..fb7edba9c2f8 100644
--- a/drivers/net/ethernet/intel/iavf/iavf_txrx.c
+++ b/drivers/net/ethernet/intel/iavf/iavf_txrx.c
@@ -1,6 +1,7 @@
 // SPDX-License-Identifier: GPL-2.0
 /* Copyright(c) 2013 - 2018 Intel Corporation. */
 
+#include <linux/bitfield.h>
 #include <linux/prefetch.h>
 
 #include "iavf.h"
diff --git a/drivers/net/ethernet/intel/igb/e1000_i210.c b/drivers/net/ethernet/intel/igb/e1000_i210.c
index b9b9d35494d2..53b396fd194a 100644
--- a/drivers/net/ethernet/intel/igb/e1000_i210.c
+++ b/drivers/net/ethernet/intel/igb/e1000_i210.c
@@ -5,9 +5,9 @@
  * e1000_i211
  */
 
-#include <linux/types.h>
+#include <linux/bitfield.h>
 #include <linux/if_ether.h>
-
+#include <linux/types.h>
 #include "e1000_hw.h"
 #include "e1000_i210.h"
 
diff --git a/drivers/net/ethernet/intel/igb/e1000_nvm.c b/drivers/net/ethernet/intel/igb/e1000_nvm.c
index fa136e6e9328..0da57e89593a 100644
--- a/drivers/net/ethernet/intel/igb/e1000_nvm.c
+++ b/drivers/net/ethernet/intel/igb/e1000_nvm.c
@@ -1,9 +1,9 @@
 // SPDX-License-Identifier: GPL-2.0
 /* Copyright(c) 2007 - 2018 Intel Corporation. */
 
-#include <linux/if_ether.h>
+#include <linux/bitfield.h>
 #include <linux/delay.h>
-
+#include <linux/if_ether.h>
 #include "e1000_mac.h"
 #include "e1000_nvm.h"
 
diff --git a/drivers/net/ethernet/intel/igb/e1000_phy.c b/drivers/net/ethernet/intel/igb/e1000_phy.c
index a018000f7db9..3c1b562a3271 100644
--- a/drivers/net/ethernet/intel/igb/e1000_phy.c
+++ b/drivers/net/ethernet/intel/igb/e1000_phy.c
@@ -1,9 +1,9 @@
 // SPDX-License-Identifier: GPL-2.0
 /* Copyright(c) 2007 - 2018 Intel Corporation. */
 
-#include <linux/if_ether.h>
+#include <linux/bitfield.h>
 #include <linux/delay.h>
-
+#include <linux/if_ether.h>
 #include "e1000_mac.h"
 #include "e1000_phy.h"
 
diff --git a/drivers/net/ethernet/intel/igbvf/netdev.c b/drivers/net/ethernet/intel/igbvf/netdev.c
index fd712585af27..e6c1fbee049e 100644
--- a/drivers/net/ethernet/intel/igbvf/netdev.c
+++ b/drivers/net/ethernet/intel/igbvf/netdev.c
@@ -3,25 +3,25 @@
 
 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
 
-#include <linux/module.h>
-#include <linux/types.h>
-#include <linux/init.h>
-#include <linux/pci.h>
-#include <linux/vmalloc.h>
-#include <linux/pagemap.h>
+#include <linux/bitfield.h>
 #include <linux/delay.h>
-#include <linux/netdevice.h>
-#include <linux/tcp.h>
-#include <linux/ipv6.h>
-#include <linux/slab.h>
-#include <net/checksum.h>
-#include <net/ip6_checksum.h>
-#include <linux/mii.h>
 #include <linux/ethtool.h>
 #include <linux/if_vlan.h>
+#include <linux/init.h>
+#include <linux/ipv6.h>
+#include <linux/mii.h>
+#include <linux/module.h>
+#include <linux/netdevice.h>
+#include <linux/pagemap.h>
+#include <linux/pci.h>
 #include <linux/prefetch.h>
 #include <linux/sctp.h>
-
+#include <linux/slab.h>
+#include <linux/tcp.h>
+#include <linux/types.h>
+#include <linux/vmalloc.h>
+#include <net/checksum.h>
+#include <net/ip6_checksum.h>
 #include "igbvf.h"
 
 char igbvf_driver_name[] = "igbvf";
diff --git a/drivers/net/ethernet/intel/igc/igc_i225.c b/drivers/net/ethernet/intel/igc/igc_i225.c
index 17546a035ab1..d2562c8e8015 100644
--- a/drivers/net/ethernet/intel/igc/igc_i225.c
+++ b/drivers/net/ethernet/intel/igc/igc_i225.c
@@ -1,6 +1,7 @@
 // SPDX-License-Identifier: GPL-2.0
 /* Copyright (c)  2018 Intel Corporation */
 
+#include <linux/bitfield.h>
 #include <linux/delay.h>
 
 #include "igc_hw.h"
diff --git a/drivers/net/ethernet/intel/igc/igc_phy.c b/drivers/net/ethernet/intel/igc/igc_phy.c
index 53b77c969c85..d0d9e7170154 100644
--- a/drivers/net/ethernet/intel/igc/igc_phy.c
+++ b/drivers/net/ethernet/intel/igc/igc_phy.c
@@ -1,6 +1,7 @@
 // SPDX-License-Identifier: GPL-2.0
 /* Copyright (c)  2018 Intel Corporation */
 
+#include <linux/bitfield.h>
 #include "igc_phy.h"
 
 /**
diff --git a/include/linux/avf/virtchnl.h b/include/linux/avf/virtchnl.h
index a44d9dc7e3eb..8e177b67e82f 100644
--- a/include/linux/avf/virtchnl.h
+++ b/include/linux/avf/virtchnl.h
@@ -5,6 +5,7 @@
 #define _VIRTCHNL_H_
 
 #include <linux/bitops.h>
+#include <linux/bits.h>
 #include <linux/overflow.h>
 #include <uapi/linux/if_ether.h>
 
-- 
cgit v1.2.3


From d81f0d7b8b23ec79f80be602ed6129ded27862e8 Mon Sep 17 00:00:00 2001
From: Rae Moar <rmoar@google.com>
Date: Wed, 13 Dec 2023 19:44:17 +0000
Subject: kunit: add KUNIT_INIT_TABLE to init linker section

Add KUNIT_INIT_TABLE to the INIT_DATA linker section.

Alter the KUnit macros to create init tests:
kunit_test_init_section_suites

Update lib/kunit/executor.c to run both the suites in KUNIT_TABLE and
KUNIT_INIT_TABLE.

Reviewed-by: David Gow <davidgow@google.com>
Signed-off-by: Rae Moar <rmoar@google.com>
Signed-off-by: Shuah Khan <skhan@linuxfoundation.org>
---
 include/asm-generic/vmlinux.lds.h |  9 +++++-
 include/kunit/test.h              | 30 +++++++++++-------
 include/linux/module.h            |  2 ++
 kernel/module/main.c              |  3 ++
 lib/kunit/executor.c              | 64 +++++++++++++++++++++++++++++++++++----
 lib/kunit/test.c                  | 26 +++++++++++-----
 6 files changed, 109 insertions(+), 25 deletions(-)

(limited to 'include/linux')

diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h
index 1107905d37fc..5dd3a61d673d 100644
--- a/include/asm-generic/vmlinux.lds.h
+++ b/include/asm-generic/vmlinux.lds.h
@@ -700,7 +700,8 @@
 	THERMAL_TABLE(governor)						\
 	EARLYCON_TABLE()						\
 	LSM_TABLE()							\
-	EARLY_LSM_TABLE()
+	EARLY_LSM_TABLE()						\
+	KUNIT_INIT_TABLE()
 
 #define INIT_TEXT							\
 	*(.init.text .init.text.*)					\
@@ -926,6 +927,12 @@
 		. = ALIGN(8);						\
 		BOUNDED_SECTION_POST_LABEL(.kunit_test_suites, __kunit_suites, _start, _end)
 
+/* Alignment must be consistent with (kunit_suite *) in include/kunit/test.h */
+#define KUNIT_INIT_TABLE()						\
+		. = ALIGN(8);						\
+		BOUNDED_SECTION_POST_LABEL(.kunit_init_test_suites, \
+				__kunit_init_suites, _start, _end)
+
 #ifdef CONFIG_BLK_DEV_INITRD
 #define INIT_RAM_FS							\
 	. = ALIGN(4);							\
diff --git a/include/kunit/test.h b/include/kunit/test.h
index 20ed9f9275c9..fe79cd736e94 100644
--- a/include/kunit/test.h
+++ b/include/kunit/test.h
@@ -337,6 +337,9 @@ void __kunit_test_suites_exit(struct kunit_suite **suites, int num_suites);
 void kunit_exec_run_tests(struct kunit_suite_set *suite_set, bool builtin);
 void kunit_exec_list_tests(struct kunit_suite_set *suite_set, bool include_attr);
 
+struct kunit_suite_set kunit_merge_suite_sets(struct kunit_suite_set init_suite_set,
+		struct kunit_suite_set suite_set);
+
 #if IS_BUILTIN(CONFIG_KUNIT)
 int kunit_run_all_tests(void);
 #else
@@ -371,6 +374,11 @@ static inline int kunit_run_all_tests(void)
 
 #define kunit_test_suite(suite)	kunit_test_suites(&suite)
 
+#define __kunit_init_test_suites(unique_array, ...)			       \
+	static struct kunit_suite *unique_array[]			       \
+	__aligned(sizeof(struct kunit_suite *))				       \
+	__used __section(".kunit_init_test_suites") = { __VA_ARGS__ }
+
 /**
  * kunit_test_init_section_suites() - used to register one or more &struct
  *				      kunit_suite containing init functions or
@@ -378,21 +386,21 @@ static inline int kunit_run_all_tests(void)
  *
  * @__suites: a statically allocated list of &struct kunit_suite.
  *
- * This functions identically as kunit_test_suites() except that it suppresses
- * modpost warnings for referencing functions marked __init or data marked
- * __initdata; this is OK because currently KUnit only runs tests upon boot
- * during the init phase or upon loading a module during the init phase.
+ * This functions similar to kunit_test_suites() except that it compiles the
+ * list of suites during init phase.
+ *
+ * This macro also suffixes the array and suite declarations it makes with
+ * _probe; so that modpost suppresses warnings about referencing init data
+ * for symbols named in this manner.
  *
- * NOTE TO KUNIT DEVS: If we ever allow KUnit tests to be run after boot, these
- * tests must be excluded.
+ * Note: these init tests are not able to be run after boot so there is no
+ * "run" debugfs file generated for these tests.
  *
- * The only thing this macro does that's different from kunit_test_suites is
- * that it suffixes the array and suite declarations it makes with _probe;
- * modpost suppresses warnings about referencing init data for symbols named in
- * this manner.
+ * Also, do not mark the suite or test case structs with __initdata because
+ * they will be used after the init phase with debugfs.
  */
 #define kunit_test_init_section_suites(__suites...)			\
-	__kunit_test_suites(CONCATENATE(__UNIQUE_ID(array), _probe),	\
+	__kunit_init_test_suites(CONCATENATE(__UNIQUE_ID(array), _probe), \
 			    ##__suites)
 
 #define kunit_test_init_section_suite(suite)	\
diff --git a/include/linux/module.h b/include/linux/module.h
index a98e188cf37b..9cd0009bd050 100644
--- a/include/linux/module.h
+++ b/include/linux/module.h
@@ -540,6 +540,8 @@ struct module {
 	struct static_call_site *static_call_sites;
 #endif
 #if IS_ENABLED(CONFIG_KUNIT)
+	int num_kunit_init_suites;
+	struct kunit_suite **kunit_init_suites;
 	int num_kunit_suites;
 	struct kunit_suite **kunit_suites;
 #endif
diff --git a/kernel/module/main.c b/kernel/module/main.c
index 98fedfdb8db5..36681911c05a 100644
--- a/kernel/module/main.c
+++ b/kernel/module/main.c
@@ -2199,6 +2199,9 @@ static int find_module_sections(struct module *mod, struct load_info *info)
 	mod->kunit_suites = section_objs(info, ".kunit_test_suites",
 					      sizeof(*mod->kunit_suites),
 					      &mod->num_kunit_suites);
+	mod->kunit_init_suites = section_objs(info, ".kunit_init_test_suites",
+					      sizeof(*mod->kunit_init_suites),
+					      &mod->num_kunit_init_suites);
 #endif
 
 	mod->extable = section_objs(info, "__ex_table",
diff --git a/lib/kunit/executor.c b/lib/kunit/executor.c
index 1236b3cd2fbb..847329c51e91 100644
--- a/lib/kunit/executor.c
+++ b/lib/kunit/executor.c
@@ -12,6 +12,8 @@
  */
 extern struct kunit_suite * const __kunit_suites_start[];
 extern struct kunit_suite * const __kunit_suites_end[];
+extern struct kunit_suite * const __kunit_init_suites_start[];
+extern struct kunit_suite * const __kunit_init_suites_end[];
 
 static char *action_param;
 
@@ -292,6 +294,33 @@ void kunit_exec_list_tests(struct kunit_suite_set *suite_set, bool include_attr)
 	}
 }
 
+struct kunit_suite_set kunit_merge_suite_sets(struct kunit_suite_set init_suite_set,
+		struct kunit_suite_set suite_set)
+{
+	struct kunit_suite_set total_suite_set = {NULL, NULL};
+	struct kunit_suite **total_suite_start = NULL;
+	size_t init_num_suites, num_suites, suite_size;
+
+	init_num_suites = init_suite_set.end - init_suite_set.start;
+	num_suites = suite_set.end - suite_set.start;
+	suite_size = sizeof(suite_set.start);
+
+	/* Allocate memory for array of all kunit suites */
+	total_suite_start = kmalloc_array(init_num_suites + num_suites, suite_size, GFP_KERNEL);
+	if (!total_suite_start)
+		return total_suite_set;
+
+	/* Append init suites and then all other kunit suites */
+	memcpy(total_suite_start, init_suite_set.start, init_num_suites * suite_size);
+	memcpy(total_suite_start + init_num_suites, suite_set.start, num_suites * suite_size);
+
+	/* Set kunit suite set start and end */
+	total_suite_set.start = total_suite_start;
+	total_suite_set.end = total_suite_start + (init_num_suites + num_suites);
+
+	return total_suite_set;
+}
+
 #if IS_BUILTIN(CONFIG_KUNIT)
 
 static char *kunit_shutdown;
@@ -313,21 +342,41 @@ static void kunit_handle_shutdown(void)
 
 int kunit_run_all_tests(void)
 {
-	struct kunit_suite_set suite_set = {
+	struct kunit_suite_set suite_set = {NULL, NULL};
+	struct kunit_suite_set filtered_suite_set = {NULL, NULL};
+	struct kunit_suite_set init_suite_set = {
+		__kunit_init_suites_start, __kunit_init_suites_end,
+	};
+	struct kunit_suite_set normal_suite_set = {
 		__kunit_suites_start, __kunit_suites_end,
 	};
+	size_t init_num_suites = init_suite_set.end - init_suite_set.start;
 	int err = 0;
+
+	if (init_num_suites > 0) {
+		suite_set = kunit_merge_suite_sets(init_suite_set, normal_suite_set);
+		if (!suite_set.start)
+			goto out;
+	} else
+		suite_set = normal_suite_set;
+
 	if (!kunit_enabled()) {
 		pr_info("kunit: disabled\n");
-		goto out;
+		goto free_out;
 	}
 
 	if (filter_glob_param || filter_param) {
-		suite_set = kunit_filter_suites(&suite_set, filter_glob_param,
+		filtered_suite_set = kunit_filter_suites(&suite_set, filter_glob_param,
 				filter_param, filter_action_param, &err);
+
+		/* Free original suite set before using filtered suite set */
+		if (init_num_suites > 0)
+			kfree(suite_set.start);
+		suite_set = filtered_suite_set;
+
 		if (err) {
 			pr_err("kunit executor: error filtering suites: %d\n", err);
-			goto out;
+			goto free_out;
 		}
 	}
 
@@ -340,9 +389,12 @@ int kunit_run_all_tests(void)
 	else
 		pr_err("kunit executor: unknown action '%s'\n", action_param);
 
-	if (filter_glob_param || filter_param) { /* a copy was made of each suite */
+free_out:
+	if (filter_glob_param || filter_param)
 		kunit_free_suite_set(suite_set);
-	}
+	else if (init_num_suites > 0)
+		/* Don't use kunit_free_suite_set because suites aren't individually allocated */
+		kfree(suite_set.start);
 
 out:
 	kunit_handle_shutdown();
diff --git a/lib/kunit/test.c b/lib/kunit/test.c
index 7deee3701d20..6b60d85ce108 100644
--- a/lib/kunit/test.c
+++ b/lib/kunit/test.c
@@ -742,28 +742,40 @@ EXPORT_SYMBOL_GPL(__kunit_test_suites_exit);
 #ifdef CONFIG_MODULES
 static void kunit_module_init(struct module *mod)
 {
-	struct kunit_suite_set suite_set = {
+	struct kunit_suite_set suite_set, filtered_set;
+	struct kunit_suite_set normal_suite_set = {
 		mod->kunit_suites, mod->kunit_suites + mod->num_kunit_suites,
 	};
+	struct kunit_suite_set init_suite_set = {
+		mod->kunit_init_suites, mod->kunit_init_suites + mod->num_kunit_init_suites,
+	};
 	const char *action = kunit_action();
 	int err = 0;
 
-	suite_set = kunit_filter_suites(&suite_set,
+	if (mod->num_kunit_init_suites > 0)
+		suite_set = kunit_merge_suite_sets(init_suite_set, normal_suite_set);
+	else
+		suite_set = normal_suite_set;
+
+	filtered_set = kunit_filter_suites(&suite_set,
 					kunit_filter_glob() ?: "*.*",
 					kunit_filter(), kunit_filter_action(),
 					&err);
 	if (err)
 		pr_err("kunit module: error filtering suites: %d\n", err);
 
-	mod->kunit_suites = (struct kunit_suite **)suite_set.start;
-	mod->num_kunit_suites = suite_set.end - suite_set.start;
+	mod->kunit_suites = (struct kunit_suite **)filtered_set.start;
+	mod->num_kunit_suites = filtered_set.end - filtered_set.start;
+
+	if (mod->num_kunit_init_suites > 0)
+		kfree(suite_set.start);
 
 	if (!action)
-		kunit_exec_run_tests(&suite_set, false);
+		kunit_exec_run_tests(&filtered_set, false);
 	else if (!strcmp(action, "list"))
-		kunit_exec_list_tests(&suite_set, false);
+		kunit_exec_list_tests(&filtered_set, false);
 	else if (!strcmp(action, "list_attr"))
-		kunit_exec_list_tests(&suite_set, true);
+		kunit_exec_list_tests(&filtered_set, true);
 	else
 		pr_err("kunit: unknown action '%s'\n", action);
 }
-- 
cgit v1.2.3


From 86362293044b382aece355f9e4e3f7116dcd1eae Mon Sep 17 00:00:00 2001
From: Lars-Peter Clausen <lars@metafoo.de>
Date: Sat, 22 Jul 2023 16:08:44 -0700
Subject: PCI: endpoint: Make struct pci_epf_ops in pci_epf_driver const
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The pci_epf_ops struct contains a set of callbacks that are used by the
pci_epf_driver, and is never modified by the EPF core itself.

Marking the struct pointer const allows EPF drivers to declare their
pci_epf_ops struct to be const.

This allows the struct to be placed in the read-only section. Which
for example brings some security benefits as the callbacks can not be
overwritten.

[kwilczynski: commit log]
Link: https://lore.kernel.org/linux-pci/20230722230848.589428-1-lars@metafoo.de
Signed-off-by: Lars-Peter Clausen <lars@metafoo.de>
Signed-off-by: Krzysztof Wilczyński <kwilczynski@kernel.org>
Reviewed-by: Manivannan Sadhasivam <mani@kernel.org>
---
 include/linux/pci-epf.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/pci-epf.h b/include/linux/pci-epf.h
index 3f44b6aec477..764dc62b3acc 100644
--- a/include/linux/pci-epf.h
+++ b/include/linux/pci-epf.h
@@ -98,7 +98,7 @@ struct pci_epf_driver {
 	void	(*remove)(struct pci_epf *epf);
 
 	struct device_driver	driver;
-	struct pci_epf_ops	*ops;
+	const struct pci_epf_ops *ops;
 	struct module		*owner;
 	struct list_head	epf_group;
 	const struct pci_epf_device_id	*id_table;
-- 
cgit v1.2.3


From d23569979ca1cd139a42c410e0c7b9e6014c3b3a Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (Google)" <rostedt@goodmis.org>
Date: Wed, 13 Dec 2023 09:37:01 -0500
Subject: tracing: Allow creating instances with specified system events

A trace instance may only need to enable specific events. As the eventfs
directory of an instance currently creates all events which adds overhead,
allow internal instances to be created with just the events in systems
that they care about. This currently only deals with systems and not
individual events, but this should bring down the overhead of creating
instances for specific use cases quite bit.

The trace_array_get_by_name() now has another parameter "systems". This
parameter is a const string pointer of a comma/space separated list of
event systems that should be created by the trace_array. (Note if the
trace_array already exists, this parameter is ignored).

The list of systems is saved and if a module is loaded, its events will
not be added unless the system for those events also match the systems
string.

Link: https://lore.kernel.org/linux-trace-kernel/20231213093701.03fddec0@gandalf.local.home

Cc: Masami Hiramatsu <mhiramat@kernel.org>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Cc: Sean Paul <seanpaul@chromium.org>
Cc: Arun Easi   <aeasi@marvell.com>
Cc: Daniel Wagner <dwagner@suse.de>
Tested-by: Dmytro Maluka <dmaluka@chromium.org>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
 drivers/scsi/qla2xxx/qla_os.c       |  2 +-
 include/linux/trace.h               |  4 ++--
 kernel/trace/trace.c                | 23 ++++++++++++++----
 kernel/trace/trace.h                |  1 +
 kernel/trace/trace_boot.c           |  2 +-
 kernel/trace/trace_events.c         | 48 ++++++++++++++++++++++++++++++++++---
 samples/ftrace/sample-trace-array.c |  2 +-
 7 files changed, 70 insertions(+), 12 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/scsi/qla2xxx/qla_os.c b/drivers/scsi/qla2xxx/qla_os.c
index 03348f605c2e..dd674378f2f3 100644
--- a/drivers/scsi/qla2xxx/qla_os.c
+++ b/drivers/scsi/qla2xxx/qla_os.c
@@ -2889,7 +2889,7 @@ static void qla2x00_iocb_work_fn(struct work_struct *work)
 static void
 qla_trace_init(void)
 {
-	qla_trc_array = trace_array_get_by_name("qla2xxx");
+	qla_trc_array = trace_array_get_by_name("qla2xxx", NULL);
 	if (!qla_trc_array) {
 		ql_log(ql_log_fatal, NULL, 0x0001,
 		       "Unable to create qla2xxx trace instance, instance logging will be disabled.\n");
diff --git a/include/linux/trace.h b/include/linux/trace.h
index 2a70a447184c..fdcd76b7be83 100644
--- a/include/linux/trace.h
+++ b/include/linux/trace.h
@@ -51,7 +51,7 @@ int trace_array_printk(struct trace_array *tr, unsigned long ip,
 		       const char *fmt, ...);
 int trace_array_init_printk(struct trace_array *tr);
 void trace_array_put(struct trace_array *tr);
-struct trace_array *trace_array_get_by_name(const char *name);
+struct trace_array *trace_array_get_by_name(const char *name, const char *systems);
 int trace_array_destroy(struct trace_array *tr);
 
 /* For osnoise tracer */
@@ -84,7 +84,7 @@ static inline int trace_array_init_printk(struct trace_array *tr)
 static inline void trace_array_put(struct trace_array *tr)
 {
 }
-static inline struct trace_array *trace_array_get_by_name(const char *name)
+static inline struct trace_array *trace_array_get_by_name(const char *name, const char *systems)
 {
 	return NULL;
 }
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 199df497db07..59e39b652afb 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -9490,7 +9490,8 @@ static int trace_array_create_dir(struct trace_array *tr)
 	return ret;
 }
 
-static struct trace_array *trace_array_create(const char *name)
+static struct trace_array *
+trace_array_create_systems(const char *name, const char *systems)
 {
 	struct trace_array *tr;
 	int ret;
@@ -9510,6 +9511,12 @@ static struct trace_array *trace_array_create(const char *name)
 	if (!zalloc_cpumask_var(&tr->pipe_cpumask, GFP_KERNEL))
 		goto out_free_tr;
 
+	if (systems) {
+		tr->system_names = kstrdup_const(systems, GFP_KERNEL);
+		if (!tr->system_names)
+			goto out_free_tr;
+	}
+
 	tr->trace_flags = global_trace.trace_flags & ~ZEROED_TRACE_FLAGS;
 
 	cpumask_copy(tr->tracing_cpumask, cpu_all_mask);
@@ -9556,12 +9563,18 @@ static struct trace_array *trace_array_create(const char *name)
 	free_trace_buffers(tr);
 	free_cpumask_var(tr->pipe_cpumask);
 	free_cpumask_var(tr->tracing_cpumask);
+	kfree_const(tr->system_names);
 	kfree(tr->name);
 	kfree(tr);
 
 	return ERR_PTR(ret);
 }
 
+static struct trace_array *trace_array_create(const char *name)
+{
+	return trace_array_create_systems(name, NULL);
+}
+
 static int instance_mkdir(const char *name)
 {
 	struct trace_array *tr;
@@ -9587,6 +9600,7 @@ out_unlock:
 /**
  * trace_array_get_by_name - Create/Lookup a trace array, given its name.
  * @name: The name of the trace array to be looked up/created.
+ * @systems: A list of systems to create event directories for (NULL for all)
  *
  * Returns pointer to trace array with given name.
  * NULL, if it cannot be created.
@@ -9600,7 +9614,7 @@ out_unlock:
  * trace_array_put() is called, user space can not delete it.
  *
  */
-struct trace_array *trace_array_get_by_name(const char *name)
+struct trace_array *trace_array_get_by_name(const char *name, const char *systems)
 {
 	struct trace_array *tr;
 
@@ -9612,7 +9626,7 @@ struct trace_array *trace_array_get_by_name(const char *name)
 			goto out_unlock;
 	}
 
-	tr = trace_array_create(name);
+	tr = trace_array_create_systems(name, systems);
 
 	if (IS_ERR(tr))
 		tr = NULL;
@@ -9659,6 +9673,7 @@ static int __remove_instance(struct trace_array *tr)
 
 	free_cpumask_var(tr->pipe_cpumask);
 	free_cpumask_var(tr->tracing_cpumask);
+	kfree_const(tr->system_names);
 	kfree(tr->name);
 	kfree(tr);
 
@@ -10377,7 +10392,7 @@ __init static void enable_instances(void)
 		if (IS_ENABLED(CONFIG_TRACER_MAX_TRACE))
 			do_allocate_snapshot(tok);
 
-		tr = trace_array_get_by_name(tok);
+		tr = trace_array_get_by_name(tok, NULL);
 		if (!tr) {
 			pr_warn("Failed to create instance buffer %s\n", curr_str);
 			continue;
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 0489e72c8169..79180aed13ee 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -377,6 +377,7 @@ struct trace_array {
 	unsigned char		trace_flags_index[TRACE_FLAGS_MAX_SIZE];
 	unsigned int		flags;
 	raw_spinlock_t		start_lock;
+	const char		*system_names;
 	struct list_head	err_log;
 	struct dentry		*dir;
 	struct dentry		*options;
diff --git a/kernel/trace/trace_boot.c b/kernel/trace/trace_boot.c
index 7ccc7a8e155b..dbe29b4c6a7a 100644
--- a/kernel/trace/trace_boot.c
+++ b/kernel/trace/trace_boot.c
@@ -633,7 +633,7 @@ trace_boot_init_instances(struct xbc_node *node)
 		if (!p || *p == '\0')
 			continue;
 
-		tr = trace_array_get_by_name(p);
+		tr = trace_array_get_by_name(p, NULL);
 		if (!tr) {
 			pr_err("Failed to get trace instance %s\n", p);
 			continue;
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index f29e815ca5b2..b70d03818038 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -2896,6 +2896,27 @@ void trace_event_eval_update(struct trace_eval_map **map, int len)
 	up_write(&trace_event_sem);
 }
 
+static bool event_in_systems(struct trace_event_call *call,
+			     const char *systems)
+{
+	const char *system;
+	const char *p;
+
+	if (!systems)
+		return true;
+
+	system = call->class->system;
+	p = strstr(systems, system);
+	if (!p)
+		return false;
+
+	if (p != systems && !isspace(*(p - 1)) && *(p - 1) != ',')
+		return false;
+
+	p += strlen(system);
+	return !*p || isspace(*p) || *p == ',';
+}
+
 static struct trace_event_file *
 trace_create_new_event(struct trace_event_call *call,
 		       struct trace_array *tr)
@@ -2905,9 +2926,12 @@ trace_create_new_event(struct trace_event_call *call,
 	struct trace_event_file *file;
 	unsigned int first;
 
+	if (!event_in_systems(call, tr->system_names))
+		return NULL;
+
 	file = kmem_cache_alloc(file_cachep, GFP_TRACE);
 	if (!file)
-		return NULL;
+		return ERR_PTR(-ENOMEM);
 
 	pid_list = rcu_dereference_protected(tr->filtered_pids,
 					     lockdep_is_held(&event_mutex));
@@ -2972,8 +2996,17 @@ __trace_add_new_event(struct trace_event_call *call, struct trace_array *tr)
 	struct trace_event_file *file;
 
 	file = trace_create_new_event(call, tr);
+	/*
+	 * trace_create_new_event() returns ERR_PTR(-ENOMEM) if failed
+	 * allocation, or NULL if the event is not part of the tr->system_names.
+	 * When the event is not part of the tr->system_names, return zero, not
+	 * an error.
+	 */
 	if (!file)
-		return -ENOMEM;
+		return 0;
+
+	if (IS_ERR(file))
+		return PTR_ERR(file);
 
 	if (eventdir_initialized)
 		return event_create_dir(tr->event_dir, file);
@@ -3012,8 +3045,17 @@ __trace_early_add_new_event(struct trace_event_call *call,
 	int ret;
 
 	file = trace_create_new_event(call, tr);
+	/*
+	 * trace_create_new_event() returns ERR_PTR(-ENOMEM) if failed
+	 * allocation, or NULL if the event is not part of the tr->system_names.
+	 * When the event is not part of the tr->system_names, return zero, not
+	 * an error.
+	 */
 	if (!file)
-		return -ENOMEM;
+		return 0;
+
+	if (IS_ERR(file))
+		return PTR_ERR(file);
 
 	ret = event_define_fields(call);
 	if (ret)
diff --git a/samples/ftrace/sample-trace-array.c b/samples/ftrace/sample-trace-array.c
index 6aba02a31c96..d0ee9001c7b3 100644
--- a/samples/ftrace/sample-trace-array.c
+++ b/samples/ftrace/sample-trace-array.c
@@ -105,7 +105,7 @@ static int __init sample_trace_array_init(void)
 	 * NOTE: This function increments the reference counter
 	 * associated with the trace array - "tr".
 	 */
-	tr = trace_array_get_by_name("sample-instance");
+	tr = trace_array_get_by_name("sample-instance", "sched,timer,kprobes");
 
 	if (!tr)
 		return -1;
-- 
cgit v1.2.3


From 8ec90be7f15fac42992ea821be929d3b06cd0fd9 Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (Google)" <rostedt@goodmis.org>
Date: Tue, 12 Dec 2023 13:19:01 -0500
Subject: tracing: Allow for max buffer data size trace_marker writes

Allow a trace write to be as big as the ring buffer tracing data will
allow. Currently, it only allows writes of 1KB in size, but there's no
reason that it cannot allow what the ring buffer can hold.

Link: https://lore.kernel.org/linux-trace-kernel/20231212131901.5f501e72@gandalf.local.home

Cc: Masami Hiramatsu <mhiramat@kernel.org>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
 include/linux/ring_buffer.h |  1 +
 kernel/trace/ring_buffer.c  | 15 +++++++++++++++
 kernel/trace/trace.c        | 31 ++++++++++++++++++++++++-------
 3 files changed, 40 insertions(+), 7 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/ring_buffer.h b/include/linux/ring_buffer.h
index 782e14f62201..b1b03b2c0f08 100644
--- a/include/linux/ring_buffer.h
+++ b/include/linux/ring_buffer.h
@@ -141,6 +141,7 @@ int ring_buffer_iter_empty(struct ring_buffer_iter *iter);
 bool ring_buffer_iter_dropped(struct ring_buffer_iter *iter);
 
 unsigned long ring_buffer_size(struct trace_buffer *buffer, int cpu);
+unsigned long ring_buffer_max_event_size(struct trace_buffer *buffer);
 
 void ring_buffer_reset_cpu(struct trace_buffer *buffer, int cpu);
 void ring_buffer_reset_online_cpus(struct trace_buffer *buffer);
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index bfe2697a92ee..16b640d824f9 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -5190,6 +5190,21 @@ unsigned long ring_buffer_size(struct trace_buffer *buffer, int cpu)
 }
 EXPORT_SYMBOL_GPL(ring_buffer_size);
 
+/**
+ * ring_buffer_max_event_size - return the max data size of an event
+ * @buffer: The ring buffer.
+ *
+ * Returns the maximum size an event can be.
+ */
+unsigned long ring_buffer_max_event_size(struct trace_buffer *buffer)
+{
+	/* If abs timestamp is requested, events have a timestamp too */
+	if (ring_buffer_time_stamp_abs(buffer))
+		return BUF_MAX_DATA_SIZE - RB_LEN_TIME_EXTEND;
+	return BUF_MAX_DATA_SIZE;
+}
+EXPORT_SYMBOL_GPL(ring_buffer_max_event_size);
+
 static void rb_clear_buffer_page(struct buffer_page *page)
 {
 	local_set(&page->write, 0);
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 59e39b652afb..dba1328e454b 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -7278,8 +7278,9 @@ tracing_mark_write(struct file *filp, const char __user *ubuf,
 	enum event_trigger_type tt = ETT_NONE;
 	struct trace_buffer *buffer;
 	struct print_entry *entry;
+	int meta_size;
 	ssize_t written;
-	int size;
+	size_t size;
 	int len;
 
 /* Used in tracing_mark_raw_write() as well */
@@ -7292,12 +7293,12 @@ tracing_mark_write(struct file *filp, const char __user *ubuf,
 	if (!(tr->trace_flags & TRACE_ITER_MARKERS))
 		return -EINVAL;
 
-	if (cnt > TRACE_BUF_SIZE)
-		cnt = TRACE_BUF_SIZE;
-
-	BUILD_BUG_ON(TRACE_BUF_SIZE >= PAGE_SIZE);
+	if ((ssize_t)cnt < 0)
+		return -EINVAL;
 
-	size = sizeof(*entry) + cnt + 2; /* add '\0' and possible '\n' */
+	meta_size = sizeof(*entry) + 2;  /* add '\0' and possible '\n' */
+ again:
+	size = cnt + meta_size;
 
 	/* If less than "<faulted>", then make sure we can still add that */
 	if (cnt < FAULTED_SIZE)
@@ -7306,9 +7307,25 @@ tracing_mark_write(struct file *filp, const char __user *ubuf,
 	buffer = tr->array_buffer.buffer;
 	event = __trace_buffer_lock_reserve(buffer, TRACE_PRINT, size,
 					    tracing_gen_ctx());
-	if (unlikely(!event))
+	if (unlikely(!event)) {
+		/*
+		 * If the size was greater than what was allowed, then
+		 * make it smaller and try again.
+		 */
+		if (size > ring_buffer_max_event_size(buffer)) {
+			/* cnt < FAULTED size should never be bigger than max */
+			if (WARN_ON_ONCE(cnt < FAULTED_SIZE))
+				return -EBADF;
+			cnt = ring_buffer_max_event_size(buffer) - meta_size;
+			/* The above should only happen once */
+			if (WARN_ON_ONCE(cnt + meta_size == size))
+				return -EBADF;
+			goto again;
+		}
+
 		/* Ring buffer disabled, return as if not open for write */
 		return -EBADF;
+	}
 
 	entry = ring_buffer_event_data(event);
 	entry->ip = _THIS_IP_;
-- 
cgit v1.2.3


From 40fc60e36c60ba85b2974e507b67df40c94e9578 Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (Google)" <rostedt@goodmis.org>
Date: Sat, 9 Dec 2023 17:52:20 -0500
Subject: trace_seq: Increase the buffer size to almost two pages

Now that trace_marker can hold more than 1KB string, and can write as much
as the ring buffer can hold, the trace_seq is not big enough to hold
writes:

 ~# a="1234567890"
 ~# cnt=4080
 ~# s=""
 ~# while [ $cnt -gt 10 ]; do
 ~#	s="${s}${a}"
 ~#	cnt=$((cnt-10))
 ~# done
 ~# echo $s > trace_marker
 ~# cat trace
 # tracer: nop
 #
 # entries-in-buffer/entries-written: 2/2   #P:8
 #
 #                                _-----=> irqs-off/BH-disabled
 #                               / _----=> need-resched
 #                              | / _---=> hardirq/softirq
 #                              || / _--=> preempt-depth
 #                              ||| / _-=> migrate-disable
 #                              |||| /     delay
 #           TASK-PID     CPU#  |||||  TIMESTAMP  FUNCTION
 #              | |         |   |||||     |         |
            <...>-860     [002] .....   105.543465: tracing_mark_write[LINE TOO BIG]
            <...>-860     [002] .....   105.543496: tracing_mark_write: 789012345678901234567890

By increasing the trace_seq buffer to almost two pages, it can now print
out the first line.

This also subtracts the rest of the trace_seq fields from the buffer, so
that the entire trace_seq is now PAGE_SIZE aligned.

Link: https://lore.kernel.org/linux-trace-kernel/20231209175220.19867af4@gandalf.local.home

Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Reviewed-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
 include/linux/trace_seq.h | 9 ++++++---
 kernel/trace/trace.c      | 6 +++---
 kernel/trace/trace_seq.c  | 3 ---
 3 files changed, 9 insertions(+), 9 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/trace_seq.h b/include/linux/trace_seq.h
index 3691e0e76a1a..9ec229dfddaa 100644
--- a/include/linux/trace_seq.h
+++ b/include/linux/trace_seq.h
@@ -8,11 +8,14 @@
 
 /*
  * Trace sequences are used to allow a function to call several other functions
- * to create a string of data to use (up to a max of PAGE_SIZE).
+ * to create a string of data to use.
  */
 
+#define TRACE_SEQ_BUFFER_SIZE	(PAGE_SIZE * 2 - \
+	(sizeof(struct seq_buf) + sizeof(size_t) + sizeof(int)))
+
 struct trace_seq {
-	char			buffer[PAGE_SIZE];
+	char			buffer[TRACE_SEQ_BUFFER_SIZE];
 	struct seq_buf		seq;
 	size_t			readpos;
 	int			full;
@@ -21,7 +24,7 @@ struct trace_seq {
 static inline void
 trace_seq_init(struct trace_seq *s)
 {
-	seq_buf_init(&s->seq, s->buffer, PAGE_SIZE);
+	seq_buf_init(&s->seq, s->buffer, TRACE_SEQ_BUFFER_SIZE);
 	s->full = 0;
 	s->readpos = 0;
 }
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index dba1328e454b..0be30cccabb4 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -3753,7 +3753,7 @@ static bool trace_safe_str(struct trace_iterator *iter, const char *str,
 
 	/* OK if part of the temp seq buffer */
 	if ((addr >= (unsigned long)iter->tmp_seq.buffer) &&
-	    (addr < (unsigned long)iter->tmp_seq.buffer + PAGE_SIZE))
+	    (addr < (unsigned long)iter->tmp_seq.buffer + TRACE_SEQ_BUFFER_SIZE))
 		return true;
 
 	/* Core rodata can not be freed */
@@ -6932,8 +6932,8 @@ waitagain:
 		goto out;
 	}
 
-	if (cnt >= PAGE_SIZE)
-		cnt = PAGE_SIZE - 1;
+	if (cnt >= TRACE_SEQ_BUFFER_SIZE)
+		cnt = TRACE_SEQ_BUFFER_SIZE - 1;
 
 	/* reset all but tr, trace, and overruns */
 	trace_iterator_reset(iter);
diff --git a/kernel/trace/trace_seq.c b/kernel/trace/trace_seq.c
index 7be97229ddf8..c158d65a8a88 100644
--- a/kernel/trace/trace_seq.c
+++ b/kernel/trace/trace_seq.c
@@ -13,9 +13,6 @@
  * trace_seq_init() more than once to reset the trace_seq to start
  * from scratch.
  * 
- * The buffer size is currently PAGE_SIZE, although it may become dynamic
- * in the future.
- *
  * A write to the buffer will either succeed or fail. That is, unlike
  * sprintf() there will not be a partial write (well it may write into
  * the buffer but it wont update the pointers). This allows users to
-- 
cgit v1.2.3


From b9670ee2e975e1cb6751019d5dc5c193aecd8ba2 Mon Sep 17 00:00:00 2001
From: Srinivas Pandruvada <srinivas.pandruvada@linux.intel.com>
Date: Sun, 17 Dec 2023 12:07:02 -0800
Subject: Revert "iio: hid-sensor-als: Add light chromaticity support"
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This reverts commit ee3710f39f9d0ae5137a866138d005fe1ad18132.

This commit assumes that every HID descriptor for ALS sensor has
presence of usage id ID HID_USAGE_SENSOR_LIGHT_CHROMATICITY_X and
HID_USAGE_SENSOR_LIGHT_CHROMATICITY_Y. When the above usage ids are
absent,  driver probe fails. This breaks ALS sensor functionality on
many platforms.

Till we have a good solution, revert this commit.

Reported-by: Thomas Weißschuh <thomas@t-8ch.de>
Closes: https://bugzilla.kernel.org/show_bug.cgi?id=218223
Signed-off-by: Srinivas Pandruvada <srinivas.pandruvada@linux.intel.com>
Cc:  <stable@vger.kernel.org>
Acked-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
Link: https://lore.kernel.org/r/20231217200703.719876-2-srinivas.pandruvada@linux.intel.com
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/iio/light/hid-sensor-als.c | 63 --------------------------------------
 include/linux/hid-sensor-ids.h     |  3 --
 2 files changed, 66 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/iio/light/hid-sensor-als.c b/drivers/iio/light/hid-sensor-als.c
index f17304b54468..d44b3f30ae4a 100644
--- a/drivers/iio/light/hid-sensor-als.c
+++ b/drivers/iio/light/hid-sensor-als.c
@@ -17,8 +17,6 @@ enum {
 	CHANNEL_SCAN_INDEX_INTENSITY,
 	CHANNEL_SCAN_INDEX_ILLUM,
 	CHANNEL_SCAN_INDEX_COLOR_TEMP,
-	CHANNEL_SCAN_INDEX_CHROMATICITY_X,
-	CHANNEL_SCAN_INDEX_CHROMATICITY_Y,
 	CHANNEL_SCAN_INDEX_MAX
 };
 
@@ -78,30 +76,6 @@ static const struct iio_chan_spec als_channels[] = {
 		BIT(IIO_CHAN_INFO_HYSTERESIS_RELATIVE),
 		.scan_index = CHANNEL_SCAN_INDEX_COLOR_TEMP,
 	},
-	{
-		.type = IIO_CHROMATICITY,
-		.modified = 1,
-		.channel2 = IIO_MOD_X,
-		.info_mask_separate = BIT(IIO_CHAN_INFO_RAW),
-		.info_mask_shared_by_type = BIT(IIO_CHAN_INFO_OFFSET) |
-		BIT(IIO_CHAN_INFO_SCALE) |
-		BIT(IIO_CHAN_INFO_SAMP_FREQ) |
-		BIT(IIO_CHAN_INFO_HYSTERESIS) |
-		BIT(IIO_CHAN_INFO_HYSTERESIS_RELATIVE),
-		.scan_index = CHANNEL_SCAN_INDEX_CHROMATICITY_X,
-	},
-	{
-		.type = IIO_CHROMATICITY,
-		.modified = 1,
-		.channel2 = IIO_MOD_Y,
-		.info_mask_separate = BIT(IIO_CHAN_INFO_RAW),
-		.info_mask_shared_by_type = BIT(IIO_CHAN_INFO_OFFSET) |
-		BIT(IIO_CHAN_INFO_SCALE) |
-		BIT(IIO_CHAN_INFO_SAMP_FREQ) |
-		BIT(IIO_CHAN_INFO_HYSTERESIS) |
-		BIT(IIO_CHAN_INFO_HYSTERESIS_RELATIVE),
-		.scan_index = CHANNEL_SCAN_INDEX_CHROMATICITY_Y,
-	},
 	IIO_CHAN_SOFT_TIMESTAMP(CHANNEL_SCAN_INDEX_TIMESTAMP)
 };
 
@@ -145,16 +119,6 @@ static int als_read_raw(struct iio_dev *indio_dev,
 			min = als_state->als[chan->scan_index].logical_minimum;
 			address = HID_USAGE_SENSOR_LIGHT_COLOR_TEMPERATURE;
 			break;
-		case  CHANNEL_SCAN_INDEX_CHROMATICITY_X:
-			report_id = als_state->als[chan->scan_index].report_id;
-			min = als_state->als[chan->scan_index].logical_minimum;
-			address = HID_USAGE_SENSOR_LIGHT_CHROMATICITY_X;
-			break;
-		case  CHANNEL_SCAN_INDEX_CHROMATICITY_Y:
-			report_id = als_state->als[chan->scan_index].report_id;
-			min = als_state->als[chan->scan_index].logical_minimum;
-			address = HID_USAGE_SENSOR_LIGHT_CHROMATICITY_Y;
-			break;
 		default:
 			report_id = -1;
 			break;
@@ -279,14 +243,6 @@ static int als_capture_sample(struct hid_sensor_hub_device *hsdev,
 		als_state->scan.illum[CHANNEL_SCAN_INDEX_COLOR_TEMP] = sample_data;
 		ret = 0;
 		break;
-	case HID_USAGE_SENSOR_LIGHT_CHROMATICITY_X:
-		als_state->scan.illum[CHANNEL_SCAN_INDEX_CHROMATICITY_X] = sample_data;
-		ret = 0;
-		break;
-	case HID_USAGE_SENSOR_LIGHT_CHROMATICITY_Y:
-		als_state->scan.illum[CHANNEL_SCAN_INDEX_CHROMATICITY_Y] = sample_data;
-		ret = 0;
-		break;
 	case HID_USAGE_SENSOR_TIME_TIMESTAMP:
 		als_state->timestamp = hid_sensor_convert_timestamp(&als_state->common_attributes,
 								    *(s64 *)raw_data);
@@ -335,25 +291,6 @@ static int als_parse_report(struct platform_device *pdev,
 		st->als[CHANNEL_SCAN_INDEX_COLOR_TEMP].index,
 		st->als[CHANNEL_SCAN_INDEX_COLOR_TEMP].report_id);
 
-	for (i = 0; i < 2; i++) {
-		int next_scan_index = CHANNEL_SCAN_INDEX_CHROMATICITY_X + i;
-
-		ret = sensor_hub_input_get_attribute_info(hsdev,
-				HID_INPUT_REPORT, usage_id,
-				HID_USAGE_SENSOR_LIGHT_CHROMATICITY_X + i,
-				&st->als[next_scan_index]);
-		if (ret < 0)
-			return ret;
-
-		als_adjust_channel_bit_mask(channels,
-					CHANNEL_SCAN_INDEX_CHROMATICITY_X + i,
-					st->als[next_scan_index].size);
-
-		dev_dbg(&pdev->dev, "als %x:%x\n",
-			st->als[next_scan_index].index,
-			st->als[next_scan_index].report_id);
-	}
-
 	st->scale_precision = hid_sensor_format_scale(usage_id,
 				&st->als[CHANNEL_SCAN_INDEX_INTENSITY],
 				&st->scale_pre_decml, &st->scale_post_decml);
diff --git a/include/linux/hid-sensor-ids.h b/include/linux/hid-sensor-ids.h
index 6730ee900ee1..8af4fb3e0254 100644
--- a/include/linux/hid-sensor-ids.h
+++ b/include/linux/hid-sensor-ids.h
@@ -22,9 +22,6 @@
 #define HID_USAGE_SENSOR_DATA_LIGHT				0x2004d0
 #define HID_USAGE_SENSOR_LIGHT_ILLUM				0x2004d1
 #define HID_USAGE_SENSOR_LIGHT_COLOR_TEMPERATURE		0x2004d2
-#define HID_USAGE_SENSOR_LIGHT_CHROMATICITY			0x2004d3
-#define HID_USAGE_SENSOR_LIGHT_CHROMATICITY_X			0x2004d4
-#define HID_USAGE_SENSOR_LIGHT_CHROMATICITY_Y			0x2004d5
 
 /* PROX (200011) */
 #define HID_USAGE_SENSOR_PROX                                   0x200011
-- 
cgit v1.2.3


From d4005431673929a1259ad791db87408fcf85d2cc Mon Sep 17 00:00:00 2001
From: Srinivas Pandruvada <srinivas.pandruvada@linux.intel.com>
Date: Sun, 17 Dec 2023 12:07:03 -0800
Subject: Revert "iio: hid-sensor-als: Add light color temperature support"
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This reverts commit 5f05285df691b1e82108eead7165feae238c95ef.

This commit assumes that every HID descriptor for ALS sensor has
presence of usage id ID HID_USAGE_SENSOR_LIGHT_COLOR_TEMPERATURE.
When the above usage id is absent,  driver probe fails. This breaks
ALS sensor functionality on many platforms.

Till we have a good solution, revert this commit.

Reported-by: Thomas Weißschuh <thomas@t-8ch.de>
Closes: https://bugzilla.kernel.org/show_bug.cgi?id=218223
Signed-off-by: Srinivas Pandruvada <srinivas.pandruvada@linux.intel.com>
Cc:  <stable@vger.kernel.org>
Acked-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
Link: https://lore.kernel.org/r/20231217200703.719876-3-srinivas.pandruvada@linux.intel.com
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/iio/light/hid-sensor-als.c | 37 ++-----------------------------------
 include/linux/hid-sensor-ids.h     |  1 -
 2 files changed, 2 insertions(+), 36 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/iio/light/hid-sensor-als.c b/drivers/iio/light/hid-sensor-als.c
index d44b3f30ae4a..5cd27f04b45e 100644
--- a/drivers/iio/light/hid-sensor-als.c
+++ b/drivers/iio/light/hid-sensor-als.c
@@ -14,9 +14,8 @@
 #include "../common/hid-sensors/hid-sensor-trigger.h"
 
 enum {
-	CHANNEL_SCAN_INDEX_INTENSITY,
-	CHANNEL_SCAN_INDEX_ILLUM,
-	CHANNEL_SCAN_INDEX_COLOR_TEMP,
+	CHANNEL_SCAN_INDEX_INTENSITY = 0,
+	CHANNEL_SCAN_INDEX_ILLUM = 1,
 	CHANNEL_SCAN_INDEX_MAX
 };
 
@@ -66,16 +65,6 @@ static const struct iio_chan_spec als_channels[] = {
 		BIT(IIO_CHAN_INFO_HYSTERESIS_RELATIVE),
 		.scan_index = CHANNEL_SCAN_INDEX_ILLUM,
 	},
-	{
-		.type = IIO_COLORTEMP,
-		.info_mask_separate = BIT(IIO_CHAN_INFO_RAW),
-		.info_mask_shared_by_type = BIT(IIO_CHAN_INFO_OFFSET) |
-		BIT(IIO_CHAN_INFO_SCALE) |
-		BIT(IIO_CHAN_INFO_SAMP_FREQ) |
-		BIT(IIO_CHAN_INFO_HYSTERESIS) |
-		BIT(IIO_CHAN_INFO_HYSTERESIS_RELATIVE),
-		.scan_index = CHANNEL_SCAN_INDEX_COLOR_TEMP,
-	},
 	IIO_CHAN_SOFT_TIMESTAMP(CHANNEL_SCAN_INDEX_TIMESTAMP)
 };
 
@@ -114,11 +103,6 @@ static int als_read_raw(struct iio_dev *indio_dev,
 			min = als_state->als[chan->scan_index].logical_minimum;
 			address = HID_USAGE_SENSOR_LIGHT_ILLUM;
 			break;
-		case  CHANNEL_SCAN_INDEX_COLOR_TEMP:
-			report_id = als_state->als[chan->scan_index].report_id;
-			min = als_state->als[chan->scan_index].logical_minimum;
-			address = HID_USAGE_SENSOR_LIGHT_COLOR_TEMPERATURE;
-			break;
 		default:
 			report_id = -1;
 			break;
@@ -239,10 +223,6 @@ static int als_capture_sample(struct hid_sensor_hub_device *hsdev,
 		als_state->scan.illum[CHANNEL_SCAN_INDEX_ILLUM] = sample_data;
 		ret = 0;
 		break;
-	case HID_USAGE_SENSOR_LIGHT_COLOR_TEMPERATURE:
-		als_state->scan.illum[CHANNEL_SCAN_INDEX_COLOR_TEMP] = sample_data;
-		ret = 0;
-		break;
 	case HID_USAGE_SENSOR_TIME_TIMESTAMP:
 		als_state->timestamp = hid_sensor_convert_timestamp(&als_state->common_attributes,
 								    *(s64 *)raw_data);
@@ -278,19 +258,6 @@ static int als_parse_report(struct platform_device *pdev,
 			st->als[i].report_id);
 	}
 
-	ret = sensor_hub_input_get_attribute_info(hsdev, HID_INPUT_REPORT,
-				usage_id,
-				HID_USAGE_SENSOR_LIGHT_COLOR_TEMPERATURE,
-				&st->als[CHANNEL_SCAN_INDEX_COLOR_TEMP]);
-	if (ret < 0)
-		return ret;
-	als_adjust_channel_bit_mask(channels, CHANNEL_SCAN_INDEX_COLOR_TEMP,
-				st->als[CHANNEL_SCAN_INDEX_COLOR_TEMP].size);
-
-	dev_dbg(&pdev->dev, "als %x:%x\n",
-		st->als[CHANNEL_SCAN_INDEX_COLOR_TEMP].index,
-		st->als[CHANNEL_SCAN_INDEX_COLOR_TEMP].report_id);
-
 	st->scale_precision = hid_sensor_format_scale(usage_id,
 				&st->als[CHANNEL_SCAN_INDEX_INTENSITY],
 				&st->scale_pre_decml, &st->scale_post_decml);
diff --git a/include/linux/hid-sensor-ids.h b/include/linux/hid-sensor-ids.h
index 8af4fb3e0254..13b1e65fbdcc 100644
--- a/include/linux/hid-sensor-ids.h
+++ b/include/linux/hid-sensor-ids.h
@@ -21,7 +21,6 @@
 #define HID_USAGE_SENSOR_ALS					0x200041
 #define HID_USAGE_SENSOR_DATA_LIGHT				0x2004d0
 #define HID_USAGE_SENSOR_LIGHT_ILLUM				0x2004d1
-#define HID_USAGE_SENSOR_LIGHT_COLOR_TEMPERATURE		0x2004d2
 
 /* PROX (200011) */
 #define HID_USAGE_SENSOR_PROX                                   0x200011
-- 
cgit v1.2.3


From 7259eb7b534735b9c1153654c0bb4c5f059c0dd3 Mon Sep 17 00:00:00 2001
From: Moti Haimovski <mhaimovski@habana.ai>
Date: Sun, 12 Nov 2023 18:07:10 +0200
Subject: accel/habanalabs/gaudi2: add signed dev info uAPI

User will provide a nonce via the INFO ioctl, and will retrieve
the signed device info generated using given nonce.

Signed-off-by: Moti Haimovski <mhaimovski@habana.ai>
Reviewed-by: Oded Gabbay <ogabbay@kernel.org>
Signed-off-by: Oded Gabbay <ogabbay@kernel.org>
---
 drivers/accel/habanalabs/common/firmware_if.c      |  8 ++++
 drivers/accel/habanalabs/common/habanalabs.h       |  2 +
 drivers/accel/habanalabs/common/habanalabs_ioctl.c | 53 ++++++++++++++++++++++
 include/linux/habanalabs/cpucp_if.h                |  8 +++-
 include/uapi/drm/habanalabs_accel.h                | 28 ++++++++++++
 5 files changed, 98 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/drivers/accel/habanalabs/common/firmware_if.c b/drivers/accel/habanalabs/common/firmware_if.c
index 9e9dfe013659..3558a6a8e192 100644
--- a/drivers/accel/habanalabs/common/firmware_if.c
+++ b/drivers/accel/habanalabs/common/firmware_if.c
@@ -3244,6 +3244,14 @@ int hl_fw_get_sec_attest_info(struct hl_device *hdev, struct cpucp_sec_attest_in
 					HL_CPUCP_SEC_ATTEST_INFO_TINEOUT_USEC);
 }
 
+int hl_fw_get_dev_info_signed(struct hl_device *hdev,
+			      struct cpucp_dev_info_signed *dev_info_signed, u32 nonce)
+{
+	return hl_fw_get_sec_attest_data(hdev, CPUCP_PACKET_INFO_SIGNED_GET, dev_info_signed,
+					 sizeof(struct cpucp_dev_info_signed), nonce,
+					 HL_CPUCP_SEC_ATTEST_INFO_TINEOUT_USEC);
+}
+
 int hl_fw_send_generic_request(struct hl_device *hdev, enum hl_passthrough_type sub_opcode,
 						dma_addr_t buff, u32 *size)
 {
diff --git a/drivers/accel/habanalabs/common/habanalabs.h b/drivers/accel/habanalabs/common/habanalabs.h
index 7b0209e5bad6..dd3fe3ddc00a 100644
--- a/drivers/accel/habanalabs/common/habanalabs.h
+++ b/drivers/accel/habanalabs/common/habanalabs.h
@@ -3964,6 +3964,8 @@ long hl_fw_get_max_power(struct hl_device *hdev);
 void hl_fw_set_max_power(struct hl_device *hdev);
 int hl_fw_get_sec_attest_info(struct hl_device *hdev, struct cpucp_sec_attest_info *sec_attest_info,
 				u32 nonce);
+int hl_fw_get_dev_info_signed(struct hl_device *hdev,
+			      struct cpucp_dev_info_signed *dev_info_signed, u32 nonce);
 int hl_set_voltage(struct hl_device *hdev, int sensor_index, u32 attr, long value);
 int hl_set_current(struct hl_device *hdev, int sensor_index, u32 attr, long value);
 int hl_set_power(struct hl_device *hdev, int sensor_index, u32 attr, long value);
diff --git a/drivers/accel/habanalabs/common/habanalabs_ioctl.c b/drivers/accel/habanalabs/common/habanalabs_ioctl.c
index 8ef36effb95b..a92713e0e580 100644
--- a/drivers/accel/habanalabs/common/habanalabs_ioctl.c
+++ b/drivers/accel/habanalabs/common/habanalabs_ioctl.c
@@ -19,6 +19,9 @@
 
 #include <asm/msr.h>
 
+/* make sure there is space for all the signed info */
+static_assert(sizeof(struct cpucp_info) <= SEC_DEV_INFO_BUF_SZ);
+
 static u32 hl_debug_struct_size[HL_DEBUG_OP_TIMESTAMP + 1] = {
 	[HL_DEBUG_OP_ETR] = sizeof(struct hl_debug_params_etr),
 	[HL_DEBUG_OP_ETF] = sizeof(struct hl_debug_params_etf),
@@ -719,6 +722,53 @@ free_sec_attest_info:
 	return rc;
 }
 
+static int dev_info_signed(struct hl_fpriv *hpriv, struct hl_info_args *args)
+{
+	void __user *out = (void __user *) (uintptr_t) args->return_pointer;
+	struct cpucp_dev_info_signed *dev_info_signed;
+	struct hl_info_signed *info;
+	u32 max_size = args->return_size;
+	int rc;
+
+	if ((!max_size) || (!out))
+		return -EINVAL;
+
+	dev_info_signed = kzalloc(sizeof(*dev_info_signed), GFP_KERNEL);
+	if (!dev_info_signed)
+		return -ENOMEM;
+
+	info = kzalloc(sizeof(*info), GFP_KERNEL);
+	if (!info) {
+		rc = -ENOMEM;
+		goto free_dev_info_signed;
+	}
+
+	rc = hl_fw_get_dev_info_signed(hpriv->hdev,
+					dev_info_signed, args->sec_attest_nonce);
+	if (rc)
+		goto free_info;
+
+	info->nonce = le32_to_cpu(dev_info_signed->nonce);
+	info->info_sig_len = dev_info_signed->info_sig_len;
+	info->pub_data_len = le16_to_cpu(dev_info_signed->pub_data_len);
+	info->certificate_len = le16_to_cpu(dev_info_signed->certificate_len);
+	info->dev_info_len = sizeof(struct cpucp_info);
+	memcpy(&info->info_sig, &dev_info_signed->info_sig, sizeof(info->info_sig));
+	memcpy(&info->public_data, &dev_info_signed->public_data, sizeof(info->public_data));
+	memcpy(&info->certificate, &dev_info_signed->certificate, sizeof(info->certificate));
+	memcpy(&info->dev_info, &dev_info_signed->info, info->dev_info_len);
+
+	rc = copy_to_user(out, info, min_t(size_t, max_size, sizeof(*info))) ? -EFAULT : 0;
+
+free_info:
+	kfree(info);
+free_dev_info_signed:
+	kfree(dev_info_signed);
+
+	return rc;
+}
+
+
 static int eventfd_register(struct hl_fpriv *hpriv, struct hl_info_args *args)
 {
 	int rc;
@@ -1089,6 +1139,9 @@ static int _hl_info_ioctl(struct hl_fpriv *hpriv, void *data,
 	case HL_INFO_FW_GENERIC_REQ:
 		return send_fw_generic_request(hdev, args);
 
+	case HL_INFO_DEV_SIGNED:
+		return dev_info_signed(hpriv, args);
+
 	default:
 		dev_err(dev, "Invalid request %d\n", args->op);
 		rc = -EINVAL;
diff --git a/include/linux/habanalabs/cpucp_if.h b/include/linux/habanalabs/cpucp_if.h
index 86ea7c63a0d2..f316c8d0f3fc 100644
--- a/include/linux/habanalabs/cpucp_if.h
+++ b/include/linux/habanalabs/cpucp_if.h
@@ -659,6 +659,12 @@ enum pq_init_status {
  *       number (nonce) provided by the host to prevent replay attacks.
  *       public key and certificate also provided as part of the FW response.
  *
+ * CPUCP_PACKET_INFO_SIGNED_GET -
+ *       Get the device information signed by the Trusted Platform device.
+ *       device info data is also hashed with some unique number (nonce) provided
+ *       by the host to prevent replay attacks. public key and certificate also
+ *       provided as part of the FW response.
+ *
  * CPUCP_PACKET_MONITOR_DUMP_GET -
  *       Get monitors registers dump from the CpuCP kernel.
  *       The CPU will put the registers dump in the a buffer allocated by the driver
@@ -733,7 +739,7 @@ enum cpucp_packet_id {
 	CPUCP_PACKET_ENGINE_CORE_ASID_SET,	/* internal */
 	CPUCP_PACKET_RESERVED2,			/* not used */
 	CPUCP_PACKET_SEC_ATTEST_GET,		/* internal */
-	CPUCP_PACKET_RESERVED3,			/* not used */
+	CPUCP_PACKET_INFO_SIGNED_GET,		/* internal */
 	CPUCP_PACKET_RESERVED4,			/* not used */
 	CPUCP_PACKET_MONITOR_DUMP_GET,		/* debugfs */
 	CPUCP_PACKET_RESERVED5,			/* not used */
diff --git a/include/uapi/drm/habanalabs_accel.h b/include/uapi/drm/habanalabs_accel.h
index 347c7b62e60e..a512dc4cffd0 100644
--- a/include/uapi/drm/habanalabs_accel.h
+++ b/include/uapi/drm/habanalabs_accel.h
@@ -846,6 +846,7 @@ enum hl_server_type {
 #define HL_INFO_HW_ERR_EVENT			36
 #define HL_INFO_FW_ERR_EVENT			37
 #define HL_INFO_USER_ENGINE_ERR_EVENT		38
+#define HL_INFO_DEV_SIGNED			40
 
 #define HL_INFO_VERSION_MAX_LEN			128
 #define HL_INFO_CARD_NAME_MAX_LEN		16
@@ -1256,6 +1257,7 @@ struct hl_info_dev_memalloc_page_sizes {
 #define SEC_SIGNATURE_BUF_SZ	255	/* (256 - 1) 1 byte used for size */
 #define SEC_PUB_DATA_BUF_SZ	510	/* (512 - 2) 2 bytes used for size */
 #define SEC_CERTIFICATE_BUF_SZ	2046	/* (2048 - 2) 2 bytes used for size */
+#define SEC_DEV_INFO_BUF_SZ	5120
 
 /*
  * struct hl_info_sec_attest - attestation report of the boot
@@ -1290,6 +1292,32 @@ struct hl_info_sec_attest {
 	__u8 pad0[2];
 };
 
+/*
+ * struct hl_info_signed - device information signed by a secured device.
+ * @nonce: number only used once. random number provided by host. this also passed to the quote
+ *         command as a qualifying data.
+ * @pub_data_len: length of the public data (bytes)
+ * @certificate_len: length of the certificate (bytes)
+ * @info_sig_len: length of the attestation signature (bytes)
+ * @public_data: public key info signed info data (outPublic + name + qualifiedName)
+ * @certificate: certificate for the signing key
+ * @info_sig: signature of the info + nonce data.
+ * @dev_info_len: length of device info (bytes)
+ * @dev_info: device info as byte array.
+ */
+struct hl_info_signed {
+	__u32 nonce;
+	__u16 pub_data_len;
+	__u16 certificate_len;
+	__u8 info_sig_len;
+	__u8 public_data[SEC_PUB_DATA_BUF_SZ];
+	__u8 certificate[SEC_CERTIFICATE_BUF_SZ];
+	__u8 info_sig[SEC_SIGNATURE_BUF_SZ];
+	__u16 dev_info_len;
+	__u8 dev_info[SEC_DEV_INFO_BUF_SZ];
+	__u8 pad[2];
+};
+
 /**
  * struct hl_page_fault_info - page fault information.
  * @timestamp: timestamp of page fault.
-- 
cgit v1.2.3


From 403863e985e8eba608d53b2907caaf37b6176290 Mon Sep 17 00:00:00 2001
From: Jiri Pirko <jiri@nvidia.com>
Date: Sat, 16 Dec 2023 13:29:58 +0100
Subject: netlink: introduce typedef for filter function

Make the code using filter function a bit nicer by consolidating the
filter function arguments using typedef.

Suggested-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Signed-off-by: Jiri Pirko <jiri@nvidia.com>
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
 drivers/connector/connector.c | 5 ++---
 include/linux/connector.h     | 3 +--
 include/linux/netlink.h       | 6 ++++--
 net/netlink/af_netlink.c      | 3 +--
 4 files changed, 8 insertions(+), 9 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/connector/connector.c b/drivers/connector/connector.c
index 7f7b94f616a6..4028e8eeba82 100644
--- a/drivers/connector/connector.c
+++ b/drivers/connector/connector.c
@@ -59,9 +59,8 @@ static int cn_already_initialized;
  * both, or if both are zero then the group is looked up and sent there.
  */
 int cn_netlink_send_mult(struct cn_msg *msg, u16 len, u32 portid, u32 __group,
-	gfp_t gfp_mask,
-	int (*filter)(struct sock *dsk, struct sk_buff *skb, void *data),
-	void *filter_data)
+			 gfp_t gfp_mask, netlink_filter_fn filter,
+			 void *filter_data)
 {
 	struct cn_callback_entry *__cbq;
 	unsigned int size;
diff --git a/include/linux/connector.h b/include/linux/connector.h
index cec2d99ae902..70bc1160f3d8 100644
--- a/include/linux/connector.h
+++ b/include/linux/connector.h
@@ -100,8 +100,7 @@ void cn_del_callback(const struct cb_id *id);
  */
 int cn_netlink_send_mult(struct cn_msg *msg, u16 len, u32 portid,
 			 u32 group, gfp_t gfp_mask,
-			 int (*filter)(struct sock *dsk, struct sk_buff *skb,
-				       void *data),
+			 netlink_filter_fn filter,
 			 void *filter_data);
 
 /**
diff --git a/include/linux/netlink.h b/include/linux/netlink.h
index abe91ed6b9aa..1a4445bf2ab9 100644
--- a/include/linux/netlink.h
+++ b/include/linux/netlink.h
@@ -228,10 +228,12 @@ bool netlink_strict_get_check(struct sk_buff *skb);
 int netlink_unicast(struct sock *ssk, struct sk_buff *skb, __u32 portid, int nonblock);
 int netlink_broadcast(struct sock *ssk, struct sk_buff *skb, __u32 portid,
 		      __u32 group, gfp_t allocation);
+
+typedef int (*netlink_filter_fn)(struct sock *dsk, struct sk_buff *skb, void *data);
+
 int netlink_broadcast_filtered(struct sock *ssk, struct sk_buff *skb,
 			       __u32 portid, __u32 group, gfp_t allocation,
-			       int (*filter)(struct sock *dsk,
-					     struct sk_buff *skb, void *data),
+			       netlink_filter_fn filter,
 			       void *filter_data);
 int netlink_set_err(struct sock *ssk, __u32 portid, __u32 group, int code);
 int netlink_register_notifier(struct notifier_block *nb);
diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c
index 177126fb0484..4ed8ffd58ff3 100644
--- a/net/netlink/af_netlink.c
+++ b/net/netlink/af_netlink.c
@@ -1519,8 +1519,7 @@ out:
 int netlink_broadcast_filtered(struct sock *ssk, struct sk_buff *skb,
 			       u32 portid,
 			       u32 group, gfp_t allocation,
-			       int (*filter)(struct sock *dsk,
-					     struct sk_buff *skb, void *data),
+			       netlink_filter_fn filter,
 			       void *filter_data)
 {
 	struct net *net = sock_net(ssk);
-- 
cgit v1.2.3


From 67ba055dd7758c34f6e64c9d35132362c1e1f0b5 Mon Sep 17 00:00:00 2001
From: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Date: Tue, 19 Dec 2023 17:40:12 +0200
Subject: regulator: Reuse LINEAR_RANGE() in REGULATOR_LINEAR_RANGE()

REGULATOR_LINEAR_RANGE() repeats what LINEAR_RANGE() provides.
Deduplicate the former by using the latter. No functional change
intended.

Signed-off-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Link: https://msgid.link/r/20231219154012.2478688-1-andriy.shevchenko@linux.intel.com
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 include/linux/regulator/driver.h | 7 +------
 1 file changed, 1 insertion(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/regulator/driver.h b/include/linux/regulator/driver.h
index 4b7eceb3828b..22a07c0900a4 100644
--- a/include/linux/regulator/driver.h
+++ b/include/linux/regulator/driver.h
@@ -51,12 +51,7 @@ enum regulator_detection_severity {
 
 /* Initialize struct linear_range for regulators */
 #define REGULATOR_LINEAR_RANGE(_min_uV, _min_sel, _max_sel, _step_uV)	\
-{									\
-	.min		= _min_uV,					\
-	.min_sel	= _min_sel,					\
-	.max_sel	= _max_sel,					\
-	.step		= _step_uV,					\
-}
+	LINEAR_RANGE(_min_uV, _min_sel, _max_sel, _step_uV)
 
 /**
  * struct regulator_ops - regulator operations.
-- 
cgit v1.2.3


From d17aff807f845cf93926c28705216639c7279110 Mon Sep 17 00:00:00 2001
From: Andrii Nakryiko <andrii@kernel.org>
Date: Tue, 19 Dec 2023 07:37:35 -0800
Subject: Revert BPF token-related functionality

This patch includes the following revert (one  conflicting BPF FS
patch and three token patch sets, represented by merge commits):
  - revert 0f5d5454c723 "Merge branch 'bpf-fs-mount-options-parsing-follow-ups'";
  - revert 750e785796bb "bpf: Support uid and gid when mounting bpffs";
  - revert 733763285acf "Merge branch 'bpf-token-support-in-libbpf-s-bpf-object'";
  - revert c35919dcce28 "Merge branch 'bpf-token-and-bpf-fs-based-delegation'".

Link: https://lore.kernel.org/bpf/CAHk-=wg7JuFYwGy=GOMbRCtOL+jwSQsdUaBsRWkDVYbxipbM5A@mail.gmail.com
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
---
 drivers/media/rc/bpf-lirc.c                        |    2 +-
 include/linux/bpf.h                                |   85 +-
 include/linux/filter.h                             |    2 +-
 include/linux/lsm_hook_defs.h                      |   15 +-
 include/linux/security.h                           |   43 +-
 include/uapi/linux/bpf.h                           |   42 -
 kernel/bpf/Makefile                                |    2 +-
 kernel/bpf/arraymap.c                              |    2 +-
 kernel/bpf/bpf_lsm.c                               |   15 +-
 kernel/bpf/cgroup.c                                |    6 +-
 kernel/bpf/core.c                                  |    3 +-
 kernel/bpf/helpers.c                               |    6 +-
 kernel/bpf/inode.c                                 |  326 +------
 kernel/bpf/syscall.c                               |  215 ++--
 kernel/bpf/token.c                                 |  271 -----
 kernel/bpf/verifier.c                              |   13 +-
 kernel/trace/bpf_trace.c                           |    2 +-
 net/core/filter.c                                  |   36 +-
 net/ipv4/bpf_tcp_ca.c                              |    2 +-
 net/netfilter/nf_bpf_link.c                        |    2 +-
 security/security.c                                |  101 +-
 security/selinux/hooks.c                           |   47 +-
 tools/include/uapi/linux/bpf.h                     |   42 -
 tools/lib/bpf/Build                                |    2 +-
 tools/lib/bpf/bpf.c                                |   37 +-
 tools/lib/bpf/bpf.h                                |   35 +-
 tools/lib/bpf/btf.c                                |    7 +-
 tools/lib/bpf/elf.c                                |    2 +
 tools/lib/bpf/features.c                           |  478 ---------
 tools/lib/bpf/libbpf.c                             |  573 ++++++++---
 tools/lib/bpf/libbpf.h                             |   37 +-
 tools/lib/bpf/libbpf.map                           |    1 -
 tools/lib/bpf/libbpf_internal.h                    |   36 +-
 tools/lib/bpf/libbpf_probes.c                      |    8 +-
 tools/lib/bpf/str_error.h                          |    3 -
 .../selftests/bpf/prog_tests/libbpf_probes.c       |    4 -
 .../testing/selftests/bpf/prog_tests/libbpf_str.c  |    6 -
 tools/testing/selftests/bpf/prog_tests/token.c     | 1031 --------------------
 tools/testing/selftests/bpf/progs/priv_map.c       |   13 -
 tools/testing/selftests/bpf/progs/priv_prog.c      |   13 -
 40 files changed, 641 insertions(+), 2925 deletions(-)
 delete mode 100644 kernel/bpf/token.c
 delete mode 100644 tools/lib/bpf/features.c
 delete mode 100644 tools/testing/selftests/bpf/prog_tests/token.c
 delete mode 100644 tools/testing/selftests/bpf/progs/priv_map.c
 delete mode 100644 tools/testing/selftests/bpf/progs/priv_prog.c

(limited to 'include/linux')

diff --git a/drivers/media/rc/bpf-lirc.c b/drivers/media/rc/bpf-lirc.c
index 6d07693c6b9f..fe17c7f98e81 100644
--- a/drivers/media/rc/bpf-lirc.c
+++ b/drivers/media/rc/bpf-lirc.c
@@ -110,7 +110,7 @@ lirc_mode2_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 	case BPF_FUNC_get_prandom_u32:
 		return &bpf_get_prandom_u32_proto;
 	case BPF_FUNC_trace_printk:
-		if (bpf_token_capable(prog->aux->token, CAP_PERFMON))
+		if (perfmon_capable())
 			return bpf_get_trace_printk_proto();
 		fallthrough;
 	default:
diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 2f54cc0436c4..7a8d4c81a39a 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -52,10 +52,6 @@ struct module;
 struct bpf_func_state;
 struct ftrace_ops;
 struct cgroup;
-struct bpf_token;
-struct user_namespace;
-struct super_block;
-struct inode;
 
 extern struct idr btf_idr;
 extern spinlock_t btf_idr_lock;
@@ -1488,7 +1484,6 @@ struct bpf_prog_aux {
 #ifdef CONFIG_SECURITY
 	void *security;
 #endif
-	struct bpf_token *token;
 	struct bpf_prog_offload *offload;
 	struct btf *btf;
 	struct bpf_func_info *func_info;
@@ -1613,31 +1608,6 @@ struct bpf_link_primer {
 	u32 id;
 };
 
-struct bpf_mount_opts {
-	kuid_t uid;
-	kgid_t gid;
-	umode_t mode;
-
-	/* BPF token-related delegation options */
-	u64 delegate_cmds;
-	u64 delegate_maps;
-	u64 delegate_progs;
-	u64 delegate_attachs;
-};
-
-struct bpf_token {
-	struct work_struct work;
-	atomic64_t refcnt;
-	struct user_namespace *userns;
-	u64 allowed_cmds;
-	u64 allowed_maps;
-	u64 allowed_progs;
-	u64 allowed_attachs;
-#ifdef CONFIG_SECURITY
-	void *security;
-#endif
-};
-
 struct bpf_struct_ops_value;
 struct btf_member;
 
@@ -2097,7 +2067,6 @@ static inline void bpf_enable_instrumentation(void)
 	migrate_enable();
 }
 
-extern const struct super_operations bpf_super_ops;
 extern const struct file_operations bpf_map_fops;
 extern const struct file_operations bpf_prog_fops;
 extern const struct file_operations bpf_iter_fops;
@@ -2232,26 +2201,24 @@ static inline void bpf_map_dec_elem_count(struct bpf_map *map)
 
 extern int sysctl_unprivileged_bpf_disabled;
 
-bool bpf_token_capable(const struct bpf_token *token, int cap);
-
-static inline bool bpf_allow_ptr_leaks(const struct bpf_token *token)
+static inline bool bpf_allow_ptr_leaks(void)
 {
-	return bpf_token_capable(token, CAP_PERFMON);
+	return perfmon_capable();
 }
 
-static inline bool bpf_allow_uninit_stack(const struct bpf_token *token)
+static inline bool bpf_allow_uninit_stack(void)
 {
-	return bpf_token_capable(token, CAP_PERFMON);
+	return perfmon_capable();
 }
 
-static inline bool bpf_bypass_spec_v1(const struct bpf_token *token)
+static inline bool bpf_bypass_spec_v1(void)
 {
-	return cpu_mitigations_off() || bpf_token_capable(token, CAP_PERFMON);
+	return cpu_mitigations_off() || perfmon_capable();
 }
 
-static inline bool bpf_bypass_spec_v4(const struct bpf_token *token)
+static inline bool bpf_bypass_spec_v4(void)
 {
-	return cpu_mitigations_off() || bpf_token_capable(token, CAP_PERFMON);
+	return cpu_mitigations_off() || perfmon_capable();
 }
 
 int bpf_map_new_fd(struct bpf_map *map, int flags);
@@ -2268,21 +2235,8 @@ int bpf_link_new_fd(struct bpf_link *link);
 struct bpf_link *bpf_link_get_from_fd(u32 ufd);
 struct bpf_link *bpf_link_get_curr_or_next(u32 *id);
 
-void bpf_token_inc(struct bpf_token *token);
-void bpf_token_put(struct bpf_token *token);
-int bpf_token_create(union bpf_attr *attr);
-struct bpf_token *bpf_token_get_from_fd(u32 ufd);
-
-bool bpf_token_allow_cmd(const struct bpf_token *token, enum bpf_cmd cmd);
-bool bpf_token_allow_map_type(const struct bpf_token *token, enum bpf_map_type type);
-bool bpf_token_allow_prog_type(const struct bpf_token *token,
-			       enum bpf_prog_type prog_type,
-			       enum bpf_attach_type attach_type);
-
 int bpf_obj_pin_user(u32 ufd, int path_fd, const char __user *pathname);
 int bpf_obj_get_user(int path_fd, const char __user *pathname, int flags);
-struct inode *bpf_get_inode(struct super_block *sb, const struct inode *dir,
-			    umode_t mode);
 
 #define BPF_ITER_FUNC_PREFIX "bpf_iter_"
 #define DEFINE_BPF_ITER_FUNC(target, args...)			\
@@ -2526,8 +2480,7 @@ const char *btf_find_decl_tag_value(const struct btf *btf, const struct btf_type
 struct bpf_prog *bpf_prog_by_id(u32 id);
 struct bpf_link *bpf_link_by_id(u32 id);
 
-const struct bpf_func_proto *bpf_base_func_proto(enum bpf_func_id func_id,
-						 const struct bpf_prog *prog);
+const struct bpf_func_proto *bpf_base_func_proto(enum bpf_func_id func_id);
 void bpf_task_storage_free(struct task_struct *task);
 void bpf_cgrp_storage_free(struct cgroup *cgroup);
 bool bpf_prog_has_kfunc_call(const struct bpf_prog *prog);
@@ -2646,24 +2599,6 @@ static inline int bpf_obj_get_user(const char __user *pathname, int flags)
 	return -EOPNOTSUPP;
 }
 
-static inline bool bpf_token_capable(const struct bpf_token *token, int cap)
-{
-	return capable(cap) || (cap != CAP_SYS_ADMIN && capable(CAP_SYS_ADMIN));
-}
-
-static inline void bpf_token_inc(struct bpf_token *token)
-{
-}
-
-static inline void bpf_token_put(struct bpf_token *token)
-{
-}
-
-static inline struct bpf_token *bpf_token_get_from_fd(u32 ufd)
-{
-	return ERR_PTR(-EOPNOTSUPP);
-}
-
 static inline void __dev_flush(void)
 {
 }
@@ -2787,7 +2722,7 @@ static inline int btf_struct_access(struct bpf_verifier_log *log,
 }
 
 static inline const struct bpf_func_proto *
-bpf_base_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
+bpf_base_func_proto(enum bpf_func_id func_id)
 {
 	return NULL;
 }
diff --git a/include/linux/filter.h b/include/linux/filter.h
index 12d907f17d36..68fb6c8142fe 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -1139,7 +1139,7 @@ static inline bool bpf_jit_blinding_enabled(struct bpf_prog *prog)
 		return false;
 	if (!bpf_jit_harden)
 		return false;
-	if (bpf_jit_harden == 1 && bpf_token_capable(prog->aux->token, CAP_BPF))
+	if (bpf_jit_harden == 1 && bpf_capable())
 		return false;
 
 	return true;
diff --git a/include/linux/lsm_hook_defs.h b/include/linux/lsm_hook_defs.h
index 3fdd00b452ac..ff217a5ce552 100644
--- a/include/linux/lsm_hook_defs.h
+++ b/include/linux/lsm_hook_defs.h
@@ -398,17 +398,10 @@ LSM_HOOK(void, LSM_RET_VOID, audit_rule_free, void *lsmrule)
 LSM_HOOK(int, 0, bpf, int cmd, union bpf_attr *attr, unsigned int size)
 LSM_HOOK(int, 0, bpf_map, struct bpf_map *map, fmode_t fmode)
 LSM_HOOK(int, 0, bpf_prog, struct bpf_prog *prog)
-LSM_HOOK(int, 0, bpf_map_create, struct bpf_map *map, union bpf_attr *attr,
-	 struct bpf_token *token)
-LSM_HOOK(void, LSM_RET_VOID, bpf_map_free, struct bpf_map *map)
-LSM_HOOK(int, 0, bpf_prog_load, struct bpf_prog *prog, union bpf_attr *attr,
-	 struct bpf_token *token)
-LSM_HOOK(void, LSM_RET_VOID, bpf_prog_free, struct bpf_prog *prog)
-LSM_HOOK(int, 0, bpf_token_create, struct bpf_token *token, union bpf_attr *attr,
-	 struct path *path)
-LSM_HOOK(void, LSM_RET_VOID, bpf_token_free, struct bpf_token *token)
-LSM_HOOK(int, 0, bpf_token_cmd, const struct bpf_token *token, enum bpf_cmd cmd)
-LSM_HOOK(int, 0, bpf_token_capable, const struct bpf_token *token, int cap)
+LSM_HOOK(int, 0, bpf_map_alloc_security, struct bpf_map *map)
+LSM_HOOK(void, LSM_RET_VOID, bpf_map_free_security, struct bpf_map *map)
+LSM_HOOK(int, 0, bpf_prog_alloc_security, struct bpf_prog_aux *aux)
+LSM_HOOK(void, LSM_RET_VOID, bpf_prog_free_security, struct bpf_prog_aux *aux)
 #endif /* CONFIG_BPF_SYSCALL */
 
 LSM_HOOK(int, 0, locked_down, enum lockdown_reason what)
diff --git a/include/linux/security.h b/include/linux/security.h
index 00809d2d5c38..1d1df326c881 100644
--- a/include/linux/security.h
+++ b/include/linux/security.h
@@ -32,7 +32,6 @@
 #include <linux/string.h>
 #include <linux/mm.h>
 #include <linux/sockptr.h>
-#include <linux/bpf.h>
 
 struct linux_binprm;
 struct cred;
@@ -2021,22 +2020,15 @@ static inline void securityfs_remove(struct dentry *dentry)
 union bpf_attr;
 struct bpf_map;
 struct bpf_prog;
-struct bpf_token;
+struct bpf_prog_aux;
 #ifdef CONFIG_SECURITY
 extern int security_bpf(int cmd, union bpf_attr *attr, unsigned int size);
 extern int security_bpf_map(struct bpf_map *map, fmode_t fmode);
 extern int security_bpf_prog(struct bpf_prog *prog);
-extern int security_bpf_map_create(struct bpf_map *map, union bpf_attr *attr,
-				   struct bpf_token *token);
+extern int security_bpf_map_alloc(struct bpf_map *map);
 extern void security_bpf_map_free(struct bpf_map *map);
-extern int security_bpf_prog_load(struct bpf_prog *prog, union bpf_attr *attr,
-				  struct bpf_token *token);
-extern void security_bpf_prog_free(struct bpf_prog *prog);
-extern int security_bpf_token_create(struct bpf_token *token, union bpf_attr *attr,
-				     struct path *path);
-extern void security_bpf_token_free(struct bpf_token *token);
-extern int security_bpf_token_cmd(const struct bpf_token *token, enum bpf_cmd cmd);
-extern int security_bpf_token_capable(const struct bpf_token *token, int cap);
+extern int security_bpf_prog_alloc(struct bpf_prog_aux *aux);
+extern void security_bpf_prog_free(struct bpf_prog_aux *aux);
 #else
 static inline int security_bpf(int cmd, union bpf_attr *attr,
 					     unsigned int size)
@@ -2054,8 +2046,7 @@ static inline int security_bpf_prog(struct bpf_prog *prog)
 	return 0;
 }
 
-static inline int security_bpf_map_create(struct bpf_map *map, union bpf_attr *attr,
-					  struct bpf_token *token)
+static inline int security_bpf_map_alloc(struct bpf_map *map)
 {
 	return 0;
 }
@@ -2063,33 +2054,13 @@ static inline int security_bpf_map_create(struct bpf_map *map, union bpf_attr *a
 static inline void security_bpf_map_free(struct bpf_map *map)
 { }
 
-static inline int security_bpf_prog_load(struct bpf_prog *prog, union bpf_attr *attr,
-					 struct bpf_token *token)
+static inline int security_bpf_prog_alloc(struct bpf_prog_aux *aux)
 {
 	return 0;
 }
 
-static inline void security_bpf_prog_free(struct bpf_prog *prog)
+static inline void security_bpf_prog_free(struct bpf_prog_aux *aux)
 { }
-
-static inline int security_bpf_token_create(struct bpf_token *token, union bpf_attr *attr,
-				     struct path *path)
-{
-	return 0;
-}
-
-static inline void security_bpf_token_free(struct bpf_token *token)
-{ }
-
-static inline int security_bpf_token_cmd(const struct bpf_token *token, enum bpf_cmd cmd)
-{
-	return 0;
-}
-
-static inline int security_bpf_token_capable(const struct bpf_token *token, int cap)
-{
-	return 0;
-}
 #endif /* CONFIG_SECURITY */
 #endif /* CONFIG_BPF_SYSCALL */
 
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 42f4d3090efe..754e68ca8744 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -847,36 +847,6 @@ union bpf_iter_link_info {
  *		Returns zero on success. On error, -1 is returned and *errno*
  *		is set appropriately.
  *
- * BPF_TOKEN_CREATE
- *	Description
- *		Create BPF token with embedded information about what
- *		BPF-related functionality it allows:
- *		- a set of allowed bpf() syscall commands;
- *		- a set of allowed BPF map types to be created with
- *		BPF_MAP_CREATE command, if BPF_MAP_CREATE itself is allowed;
- *		- a set of allowed BPF program types and BPF program attach
- *		types to be loaded with BPF_PROG_LOAD command, if
- *		BPF_PROG_LOAD itself is allowed.
- *
- *		BPF token is created (derived) from an instance of BPF FS,
- *		assuming it has necessary delegation mount options specified.
- *		This BPF token can be passed as an extra parameter to various
- *		bpf() syscall commands to grant BPF subsystem functionality to
- *		unprivileged processes.
- *
- *		When created, BPF token is "associated" with the owning
- *		user namespace of BPF FS instance (super block) that it was
- *		derived from, and subsequent BPF operations performed with
- *		BPF token would be performing capabilities checks (i.e.,
- *		CAP_BPF, CAP_PERFMON, CAP_NET_ADMIN, CAP_SYS_ADMIN) within
- *		that user namespace. Without BPF token, such capabilities
- *		have to be granted in init user namespace, making bpf()
- *		syscall incompatible with user namespace, for the most part.
- *
- *	Return
- *		A new file descriptor (a nonnegative integer), or -1 if an
- *		error occurred (in which case, *errno* is set appropriately).
- *
  * NOTES
  *	eBPF objects (maps and programs) can be shared between processes.
  *
@@ -931,8 +901,6 @@ enum bpf_cmd {
 	BPF_ITER_CREATE,
 	BPF_LINK_DETACH,
 	BPF_PROG_BIND_MAP,
-	BPF_TOKEN_CREATE,
-	__MAX_BPF_CMD,
 };
 
 enum bpf_map_type {
@@ -983,7 +951,6 @@ enum bpf_map_type {
 	BPF_MAP_TYPE_BLOOM_FILTER,
 	BPF_MAP_TYPE_USER_RINGBUF,
 	BPF_MAP_TYPE_CGRP_STORAGE,
-	__MAX_BPF_MAP_TYPE
 };
 
 /* Note that tracing related programs such as
@@ -1028,7 +995,6 @@ enum bpf_prog_type {
 	BPF_PROG_TYPE_SK_LOOKUP,
 	BPF_PROG_TYPE_SYSCALL, /* a program that can execute syscalls */
 	BPF_PROG_TYPE_NETFILTER,
-	__MAX_BPF_PROG_TYPE
 };
 
 enum bpf_attach_type {
@@ -1437,7 +1403,6 @@ union bpf_attr {
 		 * to using 5 hash functions).
 		 */
 		__u64	map_extra;
-		__u32	map_token_fd;
 	};
 
 	struct { /* anonymous struct used by BPF_MAP_*_ELEM commands */
@@ -1507,7 +1472,6 @@ union bpf_attr {
 		 * truncated), or smaller (if log buffer wasn't filled completely).
 		 */
 		__u32		log_true_size;
-		__u32		prog_token_fd;
 	};
 
 	struct { /* anonymous struct used by BPF_OBJ_* commands */
@@ -1620,7 +1584,6 @@ union bpf_attr {
 		 * truncated), or smaller (if log buffer wasn't filled completely).
 		 */
 		__u32		btf_log_true_size;
-		__u32		btf_token_fd;
 	};
 
 	struct {
@@ -1751,11 +1714,6 @@ union bpf_attr {
 		__u32		flags;		/* extra flags */
 	} prog_bind_map;
 
-	struct { /* struct used by BPF_TOKEN_CREATE command */
-		__u32		flags;
-		__u32		bpffs_fd;
-	} token_create;
-
 } __attribute__((aligned(8)));
 
 /* The description below is an attempt at providing documentation to eBPF
diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile
index 4ce95acfcaa7..f526b7573e97 100644
--- a/kernel/bpf/Makefile
+++ b/kernel/bpf/Makefile
@@ -6,7 +6,7 @@ cflags-nogcse-$(CONFIG_X86)$(CONFIG_CC_IS_GCC) := -fno-gcse
 endif
 CFLAGS_core.o += $(call cc-disable-warning, override-init) $(cflags-nogcse-yy)
 
-obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o tnum.o log.o token.o
+obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o tnum.o log.o
 obj-$(CONFIG_BPF_SYSCALL) += bpf_iter.o map_iter.o task_iter.o prog_iter.o link_iter.o
 obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o percpu_freelist.o bpf_lru_list.o lpm_trie.o map_in_map.o bloom_filter.o
 obj-$(CONFIG_BPF_SYSCALL) += local_storage.o queue_stack_maps.o ringbuf.o
diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c
index 13358675ff2e..0bdbbbeab155 100644
--- a/kernel/bpf/arraymap.c
+++ b/kernel/bpf/arraymap.c
@@ -82,7 +82,7 @@ static struct bpf_map *array_map_alloc(union bpf_attr *attr)
 	bool percpu = attr->map_type == BPF_MAP_TYPE_PERCPU_ARRAY;
 	int numa_node = bpf_map_attr_numa_node(attr);
 	u32 elem_size, index_mask, max_entries;
-	bool bypass_spec_v1 = bpf_bypass_spec_v1(NULL);
+	bool bypass_spec_v1 = bpf_bypass_spec_v1();
 	u64 array_size, mask64;
 	struct bpf_array *array;
 
diff --git a/kernel/bpf/bpf_lsm.c b/kernel/bpf/bpf_lsm.c
index 63b4dc495125..e8e910395bf6 100644
--- a/kernel/bpf/bpf_lsm.c
+++ b/kernel/bpf/bpf_lsm.c
@@ -260,15 +260,9 @@ bpf_lsm_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 BTF_SET_START(sleepable_lsm_hooks)
 BTF_ID(func, bpf_lsm_bpf)
 BTF_ID(func, bpf_lsm_bpf_map)
-BTF_ID(func, bpf_lsm_bpf_map_create)
-BTF_ID(func, bpf_lsm_bpf_map_free)
+BTF_ID(func, bpf_lsm_bpf_map_alloc_security)
+BTF_ID(func, bpf_lsm_bpf_map_free_security)
 BTF_ID(func, bpf_lsm_bpf_prog)
-BTF_ID(func, bpf_lsm_bpf_prog_load)
-BTF_ID(func, bpf_lsm_bpf_prog_free)
-BTF_ID(func, bpf_lsm_bpf_token_create)
-BTF_ID(func, bpf_lsm_bpf_token_free)
-BTF_ID(func, bpf_lsm_bpf_token_cmd)
-BTF_ID(func, bpf_lsm_bpf_token_capable)
 BTF_ID(func, bpf_lsm_bprm_check_security)
 BTF_ID(func, bpf_lsm_bprm_committed_creds)
 BTF_ID(func, bpf_lsm_bprm_committing_creds)
@@ -363,8 +357,9 @@ BTF_ID(func, bpf_lsm_userns_create)
 BTF_SET_END(sleepable_lsm_hooks)
 
 BTF_SET_START(untrusted_lsm_hooks)
-BTF_ID(func, bpf_lsm_bpf_map_free)
-BTF_ID(func, bpf_lsm_bpf_prog_free)
+BTF_ID(func, bpf_lsm_bpf_map_free_security)
+BTF_ID(func, bpf_lsm_bpf_prog_alloc_security)
+BTF_ID(func, bpf_lsm_bpf_prog_free_security)
 BTF_ID(func, bpf_lsm_file_alloc_security)
 BTF_ID(func, bpf_lsm_file_free_security)
 #ifdef CONFIG_SECURITY_NETWORK
diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c
index 98e0e3835b28..491d20038cbe 100644
--- a/kernel/bpf/cgroup.c
+++ b/kernel/bpf/cgroup.c
@@ -1630,7 +1630,7 @@ cgroup_dev_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 	case BPF_FUNC_perf_event_output:
 		return &bpf_event_output_data_proto;
 	default:
-		return bpf_base_func_proto(func_id, prog);
+		return bpf_base_func_proto(func_id);
 	}
 }
 
@@ -2191,7 +2191,7 @@ sysctl_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 	case BPF_FUNC_perf_event_output:
 		return &bpf_event_output_data_proto;
 	default:
-		return bpf_base_func_proto(func_id, prog);
+		return bpf_base_func_proto(func_id);
 	}
 }
 
@@ -2348,7 +2348,7 @@ cg_sockopt_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 	case BPF_FUNC_perf_event_output:
 		return &bpf_event_output_data_proto;
 	default:
-		return bpf_base_func_proto(func_id, prog);
+		return bpf_base_func_proto(func_id);
 	}
 }
 
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index 14ace23d517b..ea6843be2616 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -682,7 +682,7 @@ static bool bpf_prog_kallsyms_candidate(const struct bpf_prog *fp)
 void bpf_prog_kallsyms_add(struct bpf_prog *fp)
 {
 	if (!bpf_prog_kallsyms_candidate(fp) ||
-	    !bpf_token_capable(fp->aux->token, CAP_BPF))
+	    !bpf_capable())
 		return;
 
 	bpf_prog_ksym_set_addr(fp);
@@ -2779,7 +2779,6 @@ void bpf_prog_free(struct bpf_prog *fp)
 
 	if (aux->dst_prog)
 		bpf_prog_put(aux->dst_prog);
-	bpf_token_put(aux->token);
 	INIT_WORK(&aux->work, bpf_prog_free_deferred);
 	schedule_work(&aux->work);
 }
diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
index 07fd4b5704f3..be72824f32b2 100644
--- a/kernel/bpf/helpers.c
+++ b/kernel/bpf/helpers.c
@@ -1679,7 +1679,7 @@ const struct bpf_func_proto bpf_probe_read_kernel_str_proto __weak;
 const struct bpf_func_proto bpf_task_pt_regs_proto __weak;
 
 const struct bpf_func_proto *
-bpf_base_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
+bpf_base_func_proto(enum bpf_func_id func_id)
 {
 	switch (func_id) {
 	case BPF_FUNC_map_lookup_elem:
@@ -1730,7 +1730,7 @@ bpf_base_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 		break;
 	}
 
-	if (!bpf_token_capable(prog->aux->token, CAP_BPF))
+	if (!bpf_capable())
 		return NULL;
 
 	switch (func_id) {
@@ -1788,7 +1788,7 @@ bpf_base_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 		break;
 	}
 
-	if (!bpf_token_capable(prog->aux->token, CAP_PERFMON))
+	if (!perfmon_capable())
 		return NULL;
 
 	switch (func_id) {
diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c
index 4383b3d13a55..1aafb2ff2e95 100644
--- a/kernel/bpf/inode.c
+++ b/kernel/bpf/inode.c
@@ -20,7 +20,6 @@
 #include <linux/filter.h>
 #include <linux/bpf.h>
 #include <linux/bpf_trace.h>
-#include <linux/kstrtox.h>
 #include "preload/bpf_preload.h"
 
 enum bpf_type {
@@ -99,9 +98,9 @@ static const struct inode_operations bpf_prog_iops = { };
 static const struct inode_operations bpf_map_iops  = { };
 static const struct inode_operations bpf_link_iops  = { };
 
-struct inode *bpf_get_inode(struct super_block *sb,
-			    const struct inode *dir,
-			    umode_t mode)
+static struct inode *bpf_get_inode(struct super_block *sb,
+				   const struct inode *dir,
+				   umode_t mode)
 {
 	struct inode *inode;
 
@@ -595,183 +594,15 @@ struct bpf_prog *bpf_prog_get_type_path(const char *name, enum bpf_prog_type typ
 }
 EXPORT_SYMBOL(bpf_prog_get_type_path);
 
-struct bpffs_btf_enums {
-	const struct btf *btf;
-	const struct btf_type *cmd_t;
-	const struct btf_type *map_t;
-	const struct btf_type *prog_t;
-	const struct btf_type *attach_t;
-};
-
-static int find_bpffs_btf_enums(struct bpffs_btf_enums *info)
-{
-	const struct btf *btf;
-	const struct btf_type *t;
-	const char *name;
-	int i, n;
-
-	memset(info, 0, sizeof(*info));
-
-	btf = bpf_get_btf_vmlinux();
-	if (IS_ERR(btf))
-		return PTR_ERR(btf);
-	if (!btf)
-		return -ENOENT;
-
-	info->btf = btf;
-
-	for (i = 1, n = btf_nr_types(btf); i < n; i++) {
-		t = btf_type_by_id(btf, i);
-		if (!btf_type_is_enum(t))
-			continue;
-
-		name = btf_name_by_offset(btf, t->name_off);
-		if (!name)
-			continue;
-
-		if (strcmp(name, "bpf_cmd") == 0)
-			info->cmd_t = t;
-		else if (strcmp(name, "bpf_map_type") == 0)
-			info->map_t = t;
-		else if (strcmp(name, "bpf_prog_type") == 0)
-			info->prog_t = t;
-		else if (strcmp(name, "bpf_attach_type") == 0)
-			info->attach_t = t;
-		else
-			continue;
-
-		if (info->cmd_t && info->map_t && info->prog_t && info->attach_t)
-			return 0;
-	}
-
-	return -ESRCH;
-}
-
-static bool find_btf_enum_const(const struct btf *btf, const struct btf_type *enum_t,
-				const char *prefix, const char *str, int *value)
-{
-	const struct btf_enum *e;
-	const char *name;
-	int i, n, pfx_len = strlen(prefix);
-
-	*value = 0;
-
-	if (!btf || !enum_t)
-		return false;
-
-	for (i = 0, n = btf_vlen(enum_t); i < n; i++) {
-		e = &btf_enum(enum_t)[i];
-
-		name = btf_name_by_offset(btf, e->name_off);
-		if (!name || strncasecmp(name, prefix, pfx_len) != 0)
-			continue;
-
-		/* match symbolic name case insensitive and ignoring prefix */
-		if (strcasecmp(name + pfx_len, str) == 0) {
-			*value = e->val;
-			return true;
-		}
-	}
-
-	return false;
-}
-
-static void seq_print_delegate_opts(struct seq_file *m,
-				    const char *opt_name,
-				    const struct btf *btf,
-				    const struct btf_type *enum_t,
-				    const char *prefix,
-				    u64 delegate_msk, u64 any_msk)
-{
-	const struct btf_enum *e;
-	bool first = true;
-	const char *name;
-	u64 msk;
-	int i, n, pfx_len = strlen(prefix);
-
-	delegate_msk &= any_msk; /* clear unknown bits */
-
-	if (delegate_msk == 0)
-		return;
-
-	seq_printf(m, ",%s", opt_name);
-	if (delegate_msk == any_msk) {
-		seq_printf(m, "=any");
-		return;
-	}
-
-	if (btf && enum_t) {
-		for (i = 0, n = btf_vlen(enum_t); i < n; i++) {
-			e = &btf_enum(enum_t)[i];
-			name = btf_name_by_offset(btf, e->name_off);
-			if (!name || strncasecmp(name, prefix, pfx_len) != 0)
-				continue;
-			msk = 1ULL << e->val;
-			if (delegate_msk & msk) {
-				/* emit lower-case name without prefix */
-				seq_printf(m, "%c", first ? '=' : ':');
-				name += pfx_len;
-				while (*name) {
-					seq_printf(m, "%c", tolower(*name));
-					name++;
-				}
-
-				delegate_msk &= ~msk;
-				first = false;
-			}
-		}
-	}
-	if (delegate_msk)
-		seq_printf(m, "%c0x%llx", first ? '=' : ':', delegate_msk);
-}
-
 /*
  * Display the mount options in /proc/mounts.
  */
 static int bpf_show_options(struct seq_file *m, struct dentry *root)
 {
-	struct bpf_mount_opts *opts = root->d_sb->s_fs_info;
-	struct inode *inode = d_inode(root);
-	umode_t mode = inode->i_mode & S_IALLUGO & ~S_ISVTX;
-	u64 mask;
-
-	if (!uid_eq(inode->i_uid, GLOBAL_ROOT_UID))
-		seq_printf(m, ",uid=%u",
-			   from_kuid_munged(&init_user_ns, inode->i_uid));
-	if (!gid_eq(inode->i_gid, GLOBAL_ROOT_GID))
-		seq_printf(m, ",gid=%u",
-			   from_kgid_munged(&init_user_ns, inode->i_gid));
+	umode_t mode = d_inode(root)->i_mode & S_IALLUGO & ~S_ISVTX;
+
 	if (mode != S_IRWXUGO)
 		seq_printf(m, ",mode=%o", mode);
-
-	if (opts->delegate_cmds || opts->delegate_maps ||
-	    opts->delegate_progs || opts->delegate_attachs) {
-		struct bpffs_btf_enums info;
-
-		/* ignore errors, fallback to hex */
-		(void)find_bpffs_btf_enums(&info);
-
-		mask = (1ULL << __MAX_BPF_CMD) - 1;
-		seq_print_delegate_opts(m, "delegate_cmds",
-					info.btf, info.cmd_t, "BPF_",
-					opts->delegate_cmds, mask);
-
-		mask = (1ULL << __MAX_BPF_MAP_TYPE) - 1;
-		seq_print_delegate_opts(m, "delegate_maps",
-					info.btf, info.map_t, "BPF_MAP_TYPE_",
-					opts->delegate_maps, mask);
-
-		mask = (1ULL << __MAX_BPF_PROG_TYPE) - 1;
-		seq_print_delegate_opts(m, "delegate_progs",
-					info.btf, info.prog_t, "BPF_PROG_TYPE_",
-					opts->delegate_progs, mask);
-
-		mask = (1ULL << __MAX_BPF_ATTACH_TYPE) - 1;
-		seq_print_delegate_opts(m, "delegate_attachs",
-					info.btf, info.attach_t, "BPF_",
-					opts->delegate_attachs, mask);
-	}
-
 	return 0;
 }
 
@@ -786,7 +617,7 @@ static void bpf_free_inode(struct inode *inode)
 	free_inode_nonrcu(inode);
 }
 
-const struct super_operations bpf_super_ops = {
+static const struct super_operations bpf_super_ops = {
 	.statfs		= simple_statfs,
 	.drop_inode	= generic_delete_inode,
 	.show_options	= bpf_show_options,
@@ -794,33 +625,23 @@ const struct super_operations bpf_super_ops = {
 };
 
 enum {
-	OPT_UID,
-	OPT_GID,
 	OPT_MODE,
-	OPT_DELEGATE_CMDS,
-	OPT_DELEGATE_MAPS,
-	OPT_DELEGATE_PROGS,
-	OPT_DELEGATE_ATTACHS,
 };
 
 static const struct fs_parameter_spec bpf_fs_parameters[] = {
-	fsparam_u32	("uid",				OPT_UID),
-	fsparam_u32	("gid",				OPT_GID),
 	fsparam_u32oct	("mode",			OPT_MODE),
-	fsparam_string	("delegate_cmds",		OPT_DELEGATE_CMDS),
-	fsparam_string	("delegate_maps",		OPT_DELEGATE_MAPS),
-	fsparam_string	("delegate_progs",		OPT_DELEGATE_PROGS),
-	fsparam_string	("delegate_attachs",		OPT_DELEGATE_ATTACHS),
 	{}
 };
 
+struct bpf_mount_opts {
+	umode_t mode;
+};
+
 static int bpf_parse_param(struct fs_context *fc, struct fs_parameter *param)
 {
-	struct bpf_mount_opts *opts = fc->s_fs_info;
+	struct bpf_mount_opts *opts = fc->fs_private;
 	struct fs_parse_result result;
-	kuid_t uid;
-	kgid_t gid;
-	int opt, err;
+	int opt;
 
 	opt = fs_parse(fc, bpf_fs_parameters, param, &result);
 	if (opt < 0) {
@@ -841,104 +662,12 @@ static int bpf_parse_param(struct fs_context *fc, struct fs_parameter *param)
 	}
 
 	switch (opt) {
-	case OPT_UID:
-		uid = make_kuid(current_user_ns(), result.uint_32);
-		if (!uid_valid(uid))
-			goto bad_value;
-
-		/*
-		 * The requested uid must be representable in the
-		 * filesystem's idmapping.
-		 */
-		if (!kuid_has_mapping(fc->user_ns, uid))
-			goto bad_value;
-
-		opts->uid = uid;
-		break;
-	case OPT_GID:
-		gid = make_kgid(current_user_ns(), result.uint_32);
-		if (!gid_valid(gid))
-			goto bad_value;
-
-		/*
-		 * The requested gid must be representable in the
-		 * filesystem's idmapping.
-		 */
-		if (!kgid_has_mapping(fc->user_ns, gid))
-			goto bad_value;
-
-		opts->gid = gid;
-		break;
 	case OPT_MODE:
 		opts->mode = result.uint_32 & S_IALLUGO;
 		break;
-	case OPT_DELEGATE_CMDS:
-	case OPT_DELEGATE_MAPS:
-	case OPT_DELEGATE_PROGS:
-	case OPT_DELEGATE_ATTACHS: {
-		struct bpffs_btf_enums info;
-		const struct btf_type *enum_t;
-		const char *enum_pfx;
-		u64 *delegate_msk, msk = 0;
-		char *p;
-		int val;
-
-		/* ignore errors, fallback to hex */
-		(void)find_bpffs_btf_enums(&info);
-
-		switch (opt) {
-		case OPT_DELEGATE_CMDS:
-			delegate_msk = &opts->delegate_cmds;
-			enum_t = info.cmd_t;
-			enum_pfx = "BPF_";
-			break;
-		case OPT_DELEGATE_MAPS:
-			delegate_msk = &opts->delegate_maps;
-			enum_t = info.map_t;
-			enum_pfx = "BPF_MAP_TYPE_";
-			break;
-		case OPT_DELEGATE_PROGS:
-			delegate_msk = &opts->delegate_progs;
-			enum_t = info.prog_t;
-			enum_pfx = "BPF_PROG_TYPE_";
-			break;
-		case OPT_DELEGATE_ATTACHS:
-			delegate_msk = &opts->delegate_attachs;
-			enum_t = info.attach_t;
-			enum_pfx = "BPF_";
-			break;
-		default:
-			return -EINVAL;
-		}
-
-		while ((p = strsep(&param->string, ":"))) {
-			if (strcmp(p, "any") == 0) {
-				msk |= ~0ULL;
-			} else if (find_btf_enum_const(info.btf, enum_t, enum_pfx, p, &val)) {
-				msk |= 1ULL << val;
-			} else {
-				err = kstrtou64(p, 0, &msk);
-				if (err)
-					return err;
-			}
-		}
-
-		/* Setting delegation mount options requires privileges */
-		if (msk && !capable(CAP_SYS_ADMIN))
-			return -EPERM;
-
-		*delegate_msk |= msk;
-		break;
-	}
-	default:
-		/* ignore unknown mount options */
-		break;
 	}
 
 	return 0;
-
-bad_value:
-	return invalfc(fc, "Bad value for '%s'", param->key);
 }
 
 struct bpf_preload_ops *bpf_preload_ops;
@@ -1010,14 +739,10 @@ out:
 static int bpf_fill_super(struct super_block *sb, struct fs_context *fc)
 {
 	static const struct tree_descr bpf_rfiles[] = { { "" } };
-	struct bpf_mount_opts *opts = sb->s_fs_info;
+	struct bpf_mount_opts *opts = fc->fs_private;
 	struct inode *inode;
 	int ret;
 
-	/* Mounting an instance of BPF FS requires privileges */
-	if (fc->user_ns != &init_user_ns && !capable(CAP_SYS_ADMIN))
-		return -EPERM;
-
 	ret = simple_fill_super(sb, BPF_FS_MAGIC, bpf_rfiles);
 	if (ret)
 		return ret;
@@ -1025,8 +750,6 @@ static int bpf_fill_super(struct super_block *sb, struct fs_context *fc)
 	sb->s_op = &bpf_super_ops;
 
 	inode = sb->s_root->d_inode;
-	inode->i_uid = opts->uid;
-	inode->i_gid = opts->gid;
 	inode->i_op = &bpf_dir_iops;
 	inode->i_mode &= ~S_IALLUGO;
 	populate_bpffs(sb->s_root);
@@ -1041,7 +764,7 @@ static int bpf_get_tree(struct fs_context *fc)
 
 static void bpf_free_fc(struct fs_context *fc)
 {
-	kfree(fc->s_fs_info);
+	kfree(fc->fs_private);
 }
 
 static const struct fs_context_operations bpf_context_ops = {
@@ -1062,35 +785,18 @@ static int bpf_init_fs_context(struct fs_context *fc)
 		return -ENOMEM;
 
 	opts->mode = S_IRWXUGO;
-	opts->uid = current_fsuid();
-	opts->gid = current_fsgid();
-
-	/* start out with no BPF token delegation enabled */
-	opts->delegate_cmds = 0;
-	opts->delegate_maps = 0;
-	opts->delegate_progs = 0;
-	opts->delegate_attachs = 0;
 
-	fc->s_fs_info = opts;
+	fc->fs_private = opts;
 	fc->ops = &bpf_context_ops;
 	return 0;
 }
 
-static void bpf_kill_super(struct super_block *sb)
-{
-	struct bpf_mount_opts *opts = sb->s_fs_info;
-
-	kill_litter_super(sb);
-	kfree(opts);
-}
-
 static struct file_system_type bpf_fs_type = {
 	.owner		= THIS_MODULE,
 	.name		= "bpf",
 	.init_fs_context = bpf_init_fs_context,
 	.parameters	= bpf_fs_parameters,
-	.kill_sb	= bpf_kill_super,
-	.fs_flags	= FS_USERNS_MOUNT,
+	.kill_sb	= kill_litter_super,
 };
 
 static int __init bpf_init(void)
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 8faa1a20edf8..1bf9805ee185 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -1011,8 +1011,8 @@ int map_check_no_btf(const struct bpf_map *map,
 	return -ENOTSUPP;
 }
 
-static int map_check_btf(struct bpf_map *map, struct bpf_token *token,
-			 const struct btf *btf, u32 btf_key_id, u32 btf_value_id)
+static int map_check_btf(struct bpf_map *map, const struct btf *btf,
+			 u32 btf_key_id, u32 btf_value_id)
 {
 	const struct btf_type *key_type, *value_type;
 	u32 key_size, value_size;
@@ -1040,7 +1040,7 @@ static int map_check_btf(struct bpf_map *map, struct bpf_token *token,
 	if (!IS_ERR_OR_NULL(map->record)) {
 		int i;
 
-		if (!bpf_token_capable(token, CAP_BPF)) {
+		if (!bpf_capable()) {
 			ret = -EPERM;
 			goto free_map_tab;
 		}
@@ -1123,17 +1123,11 @@ free_map_tab:
 	return ret;
 }
 
-static bool bpf_net_capable(void)
-{
-	return capable(CAP_NET_ADMIN) || capable(CAP_SYS_ADMIN);
-}
-
-#define BPF_MAP_CREATE_LAST_FIELD map_token_fd
+#define BPF_MAP_CREATE_LAST_FIELD map_extra
 /* called via syscall */
 static int map_create(union bpf_attr *attr)
 {
 	const struct bpf_map_ops *ops;
-	struct bpf_token *token = NULL;
 	int numa_node = bpf_map_attr_numa_node(attr);
 	u32 map_type = attr->map_type;
 	struct bpf_map *map;
@@ -1184,32 +1178,14 @@ static int map_create(union bpf_attr *attr)
 	if (!ops->map_mem_usage)
 		return -EINVAL;
 
-	if (attr->map_token_fd) {
-		token = bpf_token_get_from_fd(attr->map_token_fd);
-		if (IS_ERR(token))
-			return PTR_ERR(token);
-
-		/* if current token doesn't grant map creation permissions,
-		 * then we can't use this token, so ignore it and rely on
-		 * system-wide capabilities checks
-		 */
-		if (!bpf_token_allow_cmd(token, BPF_MAP_CREATE) ||
-		    !bpf_token_allow_map_type(token, attr->map_type)) {
-			bpf_token_put(token);
-			token = NULL;
-		}
-	}
-
-	err = -EPERM;
-
 	/* Intent here is for unprivileged_bpf_disabled to block BPF map
 	 * creation for unprivileged users; other actions depend
 	 * on fd availability and access to bpffs, so are dependent on
 	 * object creation success. Even with unprivileged BPF disabled,
 	 * capability checks are still carried out.
 	 */
-	if (sysctl_unprivileged_bpf_disabled && !bpf_token_capable(token, CAP_BPF))
-		goto put_token;
+	if (sysctl_unprivileged_bpf_disabled && !bpf_capable())
+		return -EPERM;
 
 	/* check privileged map type permissions */
 	switch (map_type) {
@@ -1242,27 +1218,25 @@ static int map_create(union bpf_attr *attr)
 	case BPF_MAP_TYPE_LRU_PERCPU_HASH:
 	case BPF_MAP_TYPE_STRUCT_OPS:
 	case BPF_MAP_TYPE_CPUMAP:
-		if (!bpf_token_capable(token, CAP_BPF))
-			goto put_token;
+		if (!bpf_capable())
+			return -EPERM;
 		break;
 	case BPF_MAP_TYPE_SOCKMAP:
 	case BPF_MAP_TYPE_SOCKHASH:
 	case BPF_MAP_TYPE_DEVMAP:
 	case BPF_MAP_TYPE_DEVMAP_HASH:
 	case BPF_MAP_TYPE_XSKMAP:
-		if (!bpf_token_capable(token, CAP_NET_ADMIN))
-			goto put_token;
+		if (!capable(CAP_NET_ADMIN))
+			return -EPERM;
 		break;
 	default:
 		WARN(1, "unsupported map type %d", map_type);
-		goto put_token;
+		return -EPERM;
 	}
 
 	map = ops->map_alloc(attr);
-	if (IS_ERR(map)) {
-		err = PTR_ERR(map);
-		goto put_token;
-	}
+	if (IS_ERR(map))
+		return PTR_ERR(map);
 	map->ops = ops;
 	map->map_type = map_type;
 
@@ -1299,7 +1273,7 @@ static int map_create(union bpf_attr *attr)
 		map->btf = btf;
 
 		if (attr->btf_value_type_id) {
-			err = map_check_btf(map, token, btf, attr->btf_key_type_id,
+			err = map_check_btf(map, btf, attr->btf_key_type_id,
 					    attr->btf_value_type_id);
 			if (err)
 				goto free_map;
@@ -1311,16 +1285,15 @@ static int map_create(union bpf_attr *attr)
 			attr->btf_vmlinux_value_type_id;
 	}
 
-	err = security_bpf_map_create(map, attr, token);
+	err = security_bpf_map_alloc(map);
 	if (err)
-		goto free_map_sec;
+		goto free_map;
 
 	err = bpf_map_alloc_id(map);
 	if (err)
 		goto free_map_sec;
 
 	bpf_map_save_memcg(map);
-	bpf_token_put(token);
 
 	err = bpf_map_new_fd(map, f_flags);
 	if (err < 0) {
@@ -1341,8 +1314,6 @@ free_map_sec:
 free_map:
 	btf_put(map->btf);
 	map->ops->map_free(map);
-put_token:
-	bpf_token_put(token);
 	return err;
 }
 
@@ -2173,7 +2144,7 @@ static void __bpf_prog_put_rcu(struct rcu_head *rcu)
 	kvfree(aux->func_info);
 	kfree(aux->func_info_aux);
 	free_uid(aux->user);
-	security_bpf_prog_free(aux->prog);
+	security_bpf_prog_free(aux);
 	bpf_prog_free(aux->prog);
 }
 
@@ -2619,15 +2590,13 @@ static bool is_perfmon_prog_type(enum bpf_prog_type prog_type)
 }
 
 /* last field in 'union bpf_attr' used by this command */
-#define BPF_PROG_LOAD_LAST_FIELD prog_token_fd
+#define	BPF_PROG_LOAD_LAST_FIELD log_true_size
 
 static int bpf_prog_load(union bpf_attr *attr, bpfptr_t uattr, u32 uattr_size)
 {
 	enum bpf_prog_type type = attr->prog_type;
 	struct bpf_prog *prog, *dst_prog = NULL;
 	struct btf *attach_btf = NULL;
-	struct bpf_token *token = NULL;
-	bool bpf_cap;
 	int err;
 	char license[128];
 
@@ -2644,31 +2613,10 @@ static int bpf_prog_load(union bpf_attr *attr, bpfptr_t uattr, u32 uattr_size)
 				 BPF_F_TEST_REG_INVARIANTS))
 		return -EINVAL;
 
-	bpf_prog_load_fixup_attach_type(attr);
-
-	if (attr->prog_token_fd) {
-		token = bpf_token_get_from_fd(attr->prog_token_fd);
-		if (IS_ERR(token))
-			return PTR_ERR(token);
-		/* if current token doesn't grant prog loading permissions,
-		 * then we can't use this token, so ignore it and rely on
-		 * system-wide capabilities checks
-		 */
-		if (!bpf_token_allow_cmd(token, BPF_PROG_LOAD) ||
-		    !bpf_token_allow_prog_type(token, attr->prog_type,
-					       attr->expected_attach_type)) {
-			bpf_token_put(token);
-			token = NULL;
-		}
-	}
-
-	bpf_cap = bpf_token_capable(token, CAP_BPF);
-	err = -EPERM;
-
 	if (!IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) &&
 	    (attr->prog_flags & BPF_F_ANY_ALIGNMENT) &&
-	    !bpf_cap)
-		goto put_token;
+	    !bpf_capable())
+		return -EPERM;
 
 	/* Intent here is for unprivileged_bpf_disabled to block BPF program
 	 * creation for unprivileged users; other actions depend
@@ -2677,23 +2625,21 @@ static int bpf_prog_load(union bpf_attr *attr, bpfptr_t uattr, u32 uattr_size)
 	 * capability checks are still carried out for these
 	 * and other operations.
 	 */
-	if (sysctl_unprivileged_bpf_disabled && !bpf_cap)
-		goto put_token;
+	if (sysctl_unprivileged_bpf_disabled && !bpf_capable())
+		return -EPERM;
 
 	if (attr->insn_cnt == 0 ||
-	    attr->insn_cnt > (bpf_cap ? BPF_COMPLEXITY_LIMIT_INSNS : BPF_MAXINSNS)) {
-		err = -E2BIG;
-		goto put_token;
-	}
+	    attr->insn_cnt > (bpf_capable() ? BPF_COMPLEXITY_LIMIT_INSNS : BPF_MAXINSNS))
+		return -E2BIG;
 	if (type != BPF_PROG_TYPE_SOCKET_FILTER &&
 	    type != BPF_PROG_TYPE_CGROUP_SKB &&
-	    !bpf_cap)
-		goto put_token;
+	    !bpf_capable())
+		return -EPERM;
 
-	if (is_net_admin_prog_type(type) && !bpf_token_capable(token, CAP_NET_ADMIN))
-		goto put_token;
-	if (is_perfmon_prog_type(type) && !bpf_token_capable(token, CAP_PERFMON))
-		goto put_token;
+	if (is_net_admin_prog_type(type) && !capable(CAP_NET_ADMIN) && !capable(CAP_SYS_ADMIN))
+		return -EPERM;
+	if (is_perfmon_prog_type(type) && !perfmon_capable())
+		return -EPERM;
 
 	/* attach_prog_fd/attach_btf_obj_fd can specify fd of either bpf_prog
 	 * or btf, we need to check which one it is
@@ -2703,33 +2649,27 @@ static int bpf_prog_load(union bpf_attr *attr, bpfptr_t uattr, u32 uattr_size)
 		if (IS_ERR(dst_prog)) {
 			dst_prog = NULL;
 			attach_btf = btf_get_by_fd(attr->attach_btf_obj_fd);
-			if (IS_ERR(attach_btf)) {
-				err = -EINVAL;
-				goto put_token;
-			}
+			if (IS_ERR(attach_btf))
+				return -EINVAL;
 			if (!btf_is_kernel(attach_btf)) {
 				/* attaching through specifying bpf_prog's BTF
 				 * objects directly might be supported eventually
 				 */
 				btf_put(attach_btf);
-				err = -ENOTSUPP;
-				goto put_token;
+				return -ENOTSUPP;
 			}
 		}
 	} else if (attr->attach_btf_id) {
 		/* fall back to vmlinux BTF, if BTF type ID is specified */
 		attach_btf = bpf_get_btf_vmlinux();
-		if (IS_ERR(attach_btf)) {
-			err = PTR_ERR(attach_btf);
-			goto put_token;
-		}
-		if (!attach_btf) {
-			err = -EINVAL;
-			goto put_token;
-		}
+		if (IS_ERR(attach_btf))
+			return PTR_ERR(attach_btf);
+		if (!attach_btf)
+			return -EINVAL;
 		btf_get(attach_btf);
 	}
 
+	bpf_prog_load_fixup_attach_type(attr);
 	if (bpf_prog_load_check_attach(type, attr->expected_attach_type,
 				       attach_btf, attr->attach_btf_id,
 				       dst_prog)) {
@@ -2737,8 +2677,7 @@ static int bpf_prog_load(union bpf_attr *attr, bpfptr_t uattr, u32 uattr_size)
 			bpf_prog_put(dst_prog);
 		if (attach_btf)
 			btf_put(attach_btf);
-		err = -EINVAL;
-		goto put_token;
+		return -EINVAL;
 	}
 
 	/* plain bpf_prog allocation */
@@ -2748,8 +2687,7 @@ static int bpf_prog_load(union bpf_attr *attr, bpfptr_t uattr, u32 uattr_size)
 			bpf_prog_put(dst_prog);
 		if (attach_btf)
 			btf_put(attach_btf);
-		err = -EINVAL;
-		goto put_token;
+		return -ENOMEM;
 	}
 
 	prog->expected_attach_type = attr->expected_attach_type;
@@ -2760,9 +2698,9 @@ static int bpf_prog_load(union bpf_attr *attr, bpfptr_t uattr, u32 uattr_size)
 	prog->aux->sleepable = attr->prog_flags & BPF_F_SLEEPABLE;
 	prog->aux->xdp_has_frags = attr->prog_flags & BPF_F_XDP_HAS_FRAGS;
 
-	/* move token into prog->aux, reuse taken refcnt */
-	prog->aux->token = token;
-	token = NULL;
+	err = security_bpf_prog_alloc(prog->aux);
+	if (err)
+		goto free_prog;
 
 	prog->aux->user = get_current_user();
 	prog->len = attr->insn_cnt;
@@ -2771,12 +2709,12 @@ static int bpf_prog_load(union bpf_attr *attr, bpfptr_t uattr, u32 uattr_size)
 	if (copy_from_bpfptr(prog->insns,
 			     make_bpfptr(attr->insns, uattr.is_kernel),
 			     bpf_prog_insn_size(prog)) != 0)
-		goto free_prog;
+		goto free_prog_sec;
 	/* copy eBPF program license from user space */
 	if (strncpy_from_bpfptr(license,
 				make_bpfptr(attr->license, uattr.is_kernel),
 				sizeof(license) - 1) < 0)
-		goto free_prog;
+		goto free_prog_sec;
 	license[sizeof(license) - 1] = 0;
 
 	/* eBPF programs must be GPL compatible to use GPL-ed functions */
@@ -2790,29 +2728,25 @@ static int bpf_prog_load(union bpf_attr *attr, bpfptr_t uattr, u32 uattr_size)
 	if (bpf_prog_is_dev_bound(prog->aux)) {
 		err = bpf_prog_dev_bound_init(prog, attr);
 		if (err)
-			goto free_prog;
+			goto free_prog_sec;
 	}
 
 	if (type == BPF_PROG_TYPE_EXT && dst_prog &&
 	    bpf_prog_is_dev_bound(dst_prog->aux)) {
 		err = bpf_prog_dev_bound_inherit(prog, dst_prog);
 		if (err)
-			goto free_prog;
+			goto free_prog_sec;
 	}
 
 	/* find program type: socket_filter vs tracing_filter */
 	err = find_prog_type(type, prog);
 	if (err < 0)
-		goto free_prog;
+		goto free_prog_sec;
 
 	prog->aux->load_time = ktime_get_boottime_ns();
 	err = bpf_obj_name_cpy(prog->aux->name, attr->prog_name,
 			       sizeof(attr->prog_name));
 	if (err < 0)
-		goto free_prog;
-
-	err = security_bpf_prog_load(prog, attr, token);
-	if (err)
 		goto free_prog_sec;
 
 	/* run eBPF verifier */
@@ -2858,16 +2792,13 @@ free_used_maps:
 	 */
 	__bpf_prog_put_noref(prog, prog->aux->real_func_cnt);
 	return err;
-
 free_prog_sec:
-	security_bpf_prog_free(prog);
-free_prog:
 	free_uid(prog->aux->user);
+	security_bpf_prog_free(prog->aux);
+free_prog:
 	if (prog->aux->attach_btf)
 		btf_put(prog->aux->attach_btf);
 	bpf_prog_free(prog);
-put_token:
-	bpf_token_put(token);
 	return err;
 }
 
@@ -3857,7 +3788,7 @@ static int bpf_prog_attach_check_attach_type(const struct bpf_prog *prog,
 	case BPF_PROG_TYPE_SK_LOOKUP:
 		return attach_type == prog->expected_attach_type ? 0 : -EINVAL;
 	case BPF_PROG_TYPE_CGROUP_SKB:
-		if (!bpf_token_capable(prog->aux->token, CAP_NET_ADMIN))
+		if (!capable(CAP_NET_ADMIN))
 			/* cg-skb progs can be loaded by unpriv user.
 			 * check permissions at attach time.
 			 */
@@ -4060,7 +3991,7 @@ static int bpf_prog_detach(const union bpf_attr *attr)
 static int bpf_prog_query(const union bpf_attr *attr,
 			  union bpf_attr __user *uattr)
 {
-	if (!bpf_net_capable())
+	if (!capable(CAP_NET_ADMIN))
 		return -EPERM;
 	if (CHECK_ATTR(BPF_PROG_QUERY))
 		return -EINVAL;
@@ -4828,31 +4759,15 @@ static int bpf_obj_get_info_by_fd(const union bpf_attr *attr,
 	return err;
 }
 
-#define BPF_BTF_LOAD_LAST_FIELD btf_token_fd
+#define BPF_BTF_LOAD_LAST_FIELD btf_log_true_size
 
 static int bpf_btf_load(const union bpf_attr *attr, bpfptr_t uattr, __u32 uattr_size)
 {
-	struct bpf_token *token = NULL;
-
 	if (CHECK_ATTR(BPF_BTF_LOAD))
 		return -EINVAL;
 
-	if (attr->btf_token_fd) {
-		token = bpf_token_get_from_fd(attr->btf_token_fd);
-		if (IS_ERR(token))
-			return PTR_ERR(token);
-		if (!bpf_token_allow_cmd(token, BPF_BTF_LOAD)) {
-			bpf_token_put(token);
-			token = NULL;
-		}
-	}
-
-	if (!bpf_token_capable(token, CAP_BPF)) {
-		bpf_token_put(token);
+	if (!bpf_capable())
 		return -EPERM;
-	}
-
-	bpf_token_put(token);
 
 	return btf_new_fd(attr, uattr, uattr_size);
 }
@@ -5470,20 +5385,6 @@ out_prog_put:
 	return ret;
 }
 
-#define BPF_TOKEN_CREATE_LAST_FIELD token_create.bpffs_fd
-
-static int token_create(union bpf_attr *attr)
-{
-	if (CHECK_ATTR(BPF_TOKEN_CREATE))
-		return -EINVAL;
-
-	/* no flags are supported yet */
-	if (attr->token_create.flags)
-		return -EINVAL;
-
-	return bpf_token_create(attr);
-}
-
 static int __sys_bpf(int cmd, bpfptr_t uattr, unsigned int size)
 {
 	union bpf_attr attr;
@@ -5617,9 +5518,6 @@ static int __sys_bpf(int cmd, bpfptr_t uattr, unsigned int size)
 	case BPF_PROG_BIND_MAP:
 		err = bpf_prog_bind_map(&attr);
 		break;
-	case BPF_TOKEN_CREATE:
-		err = token_create(&attr);
-		break;
 	default:
 		err = -EINVAL;
 		break;
@@ -5726,7 +5624,7 @@ static const struct bpf_func_proto bpf_sys_bpf_proto = {
 const struct bpf_func_proto * __weak
 tracing_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 {
-	return bpf_base_func_proto(func_id, prog);
+	return bpf_base_func_proto(func_id);
 }
 
 BPF_CALL_1(bpf_sys_close, u32, fd)
@@ -5776,8 +5674,7 @@ syscall_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 {
 	switch (func_id) {
 	case BPF_FUNC_sys_bpf:
-		return !bpf_token_capable(prog->aux->token, CAP_PERFMON)
-		       ? NULL : &bpf_sys_bpf_proto;
+		return !perfmon_capable() ? NULL : &bpf_sys_bpf_proto;
 	case BPF_FUNC_btf_find_by_name_kind:
 		return &bpf_btf_find_by_name_kind_proto;
 	case BPF_FUNC_sys_close:
diff --git a/kernel/bpf/token.c b/kernel/bpf/token.c
deleted file mode 100644
index a86fccd57e2d..000000000000
--- a/kernel/bpf/token.c
+++ /dev/null
@@ -1,271 +0,0 @@
-#include <linux/bpf.h>
-#include <linux/vmalloc.h>
-#include <linux/fdtable.h>
-#include <linux/file.h>
-#include <linux/fs.h>
-#include <linux/kernel.h>
-#include <linux/idr.h>
-#include <linux/namei.h>
-#include <linux/user_namespace.h>
-#include <linux/security.h>
-
-bool bpf_token_capable(const struct bpf_token *token, int cap)
-{
-	/* BPF token allows ns_capable() level of capabilities, but only if
-	 * token's userns is *exactly* the same as current user's userns
-	 */
-	if (token && current_user_ns() == token->userns) {
-		if (ns_capable(token->userns, cap) ||
-		    (cap != CAP_SYS_ADMIN && ns_capable(token->userns, CAP_SYS_ADMIN)))
-			return security_bpf_token_capable(token, cap) == 0;
-	}
-	/* otherwise fallback to capable() checks */
-	return capable(cap) || (cap != CAP_SYS_ADMIN && capable(CAP_SYS_ADMIN));
-}
-
-void bpf_token_inc(struct bpf_token *token)
-{
-	atomic64_inc(&token->refcnt);
-}
-
-static void bpf_token_free(struct bpf_token *token)
-{
-	security_bpf_token_free(token);
-	put_user_ns(token->userns);
-	kvfree(token);
-}
-
-static void bpf_token_put_deferred(struct work_struct *work)
-{
-	struct bpf_token *token = container_of(work, struct bpf_token, work);
-
-	bpf_token_free(token);
-}
-
-void bpf_token_put(struct bpf_token *token)
-{
-	if (!token)
-		return;
-
-	if (!atomic64_dec_and_test(&token->refcnt))
-		return;
-
-	INIT_WORK(&token->work, bpf_token_put_deferred);
-	schedule_work(&token->work);
-}
-
-static int bpf_token_release(struct inode *inode, struct file *filp)
-{
-	struct bpf_token *token = filp->private_data;
-
-	bpf_token_put(token);
-	return 0;
-}
-
-static void bpf_token_show_fdinfo(struct seq_file *m, struct file *filp)
-{
-	struct bpf_token *token = filp->private_data;
-	u64 mask;
-
-	BUILD_BUG_ON(__MAX_BPF_CMD >= 64);
-	mask = (1ULL << __MAX_BPF_CMD) - 1;
-	if ((token->allowed_cmds & mask) == mask)
-		seq_printf(m, "allowed_cmds:\tany\n");
-	else
-		seq_printf(m, "allowed_cmds:\t0x%llx\n", token->allowed_cmds);
-
-	BUILD_BUG_ON(__MAX_BPF_MAP_TYPE >= 64);
-	mask = (1ULL << __MAX_BPF_MAP_TYPE) - 1;
-	if ((token->allowed_maps & mask) == mask)
-		seq_printf(m, "allowed_maps:\tany\n");
-	else
-		seq_printf(m, "allowed_maps:\t0x%llx\n", token->allowed_maps);
-
-	BUILD_BUG_ON(__MAX_BPF_PROG_TYPE >= 64);
-	mask = (1ULL << __MAX_BPF_PROG_TYPE) - 1;
-	if ((token->allowed_progs & mask) == mask)
-		seq_printf(m, "allowed_progs:\tany\n");
-	else
-		seq_printf(m, "allowed_progs:\t0x%llx\n", token->allowed_progs);
-
-	BUILD_BUG_ON(__MAX_BPF_ATTACH_TYPE >= 64);
-	mask = (1ULL << __MAX_BPF_ATTACH_TYPE) - 1;
-	if ((token->allowed_attachs & mask) == mask)
-		seq_printf(m, "allowed_attachs:\tany\n");
-	else
-		seq_printf(m, "allowed_attachs:\t0x%llx\n", token->allowed_attachs);
-}
-
-#define BPF_TOKEN_INODE_NAME "bpf-token"
-
-static const struct inode_operations bpf_token_iops = { };
-
-static const struct file_operations bpf_token_fops = {
-	.release	= bpf_token_release,
-	.show_fdinfo	= bpf_token_show_fdinfo,
-};
-
-int bpf_token_create(union bpf_attr *attr)
-{
-	struct bpf_mount_opts *mnt_opts;
-	struct bpf_token *token = NULL;
-	struct user_namespace *userns;
-	struct inode *inode;
-	struct file *file;
-	struct path path;
-	struct fd f;
-	umode_t mode;
-	int err, fd;
-
-	f = fdget(attr->token_create.bpffs_fd);
-	if (!f.file)
-		return -EBADF;
-
-	path = f.file->f_path;
-	path_get(&path);
-	fdput(f);
-
-	if (path.dentry != path.mnt->mnt_sb->s_root) {
-		err = -EINVAL;
-		goto out_path;
-	}
-	if (path.mnt->mnt_sb->s_op != &bpf_super_ops) {
-		err = -EINVAL;
-		goto out_path;
-	}
-	err = path_permission(&path, MAY_ACCESS);
-	if (err)
-		goto out_path;
-
-	userns = path.dentry->d_sb->s_user_ns;
-	/*
-	 * Enforce that creators of BPF tokens are in the same user
-	 * namespace as the BPF FS instance. This makes reasoning about
-	 * permissions a lot easier and we can always relax this later.
-	 */
-	if (current_user_ns() != userns) {
-		err = -EPERM;
-		goto out_path;
-	}
-	if (!ns_capable(userns, CAP_BPF)) {
-		err = -EPERM;
-		goto out_path;
-	}
-
-	mnt_opts = path.dentry->d_sb->s_fs_info;
-	if (mnt_opts->delegate_cmds == 0 &&
-	    mnt_opts->delegate_maps == 0 &&
-	    mnt_opts->delegate_progs == 0 &&
-	    mnt_opts->delegate_attachs == 0) {
-		err = -ENOENT; /* no BPF token delegation is set up */
-		goto out_path;
-	}
-
-	mode = S_IFREG | ((S_IRUSR | S_IWUSR) & ~current_umask());
-	inode = bpf_get_inode(path.mnt->mnt_sb, NULL, mode);
-	if (IS_ERR(inode)) {
-		err = PTR_ERR(inode);
-		goto out_path;
-	}
-
-	inode->i_op = &bpf_token_iops;
-	inode->i_fop = &bpf_token_fops;
-	clear_nlink(inode); /* make sure it is unlinked */
-
-	file = alloc_file_pseudo(inode, path.mnt, BPF_TOKEN_INODE_NAME, O_RDWR, &bpf_token_fops);
-	if (IS_ERR(file)) {
-		iput(inode);
-		err = PTR_ERR(file);
-		goto out_path;
-	}
-
-	token = kvzalloc(sizeof(*token), GFP_USER);
-	if (!token) {
-		err = -ENOMEM;
-		goto out_file;
-	}
-
-	atomic64_set(&token->refcnt, 1);
-
-	/* remember bpffs owning userns for future ns_capable() checks */
-	token->userns = get_user_ns(userns);
-
-	token->allowed_cmds = mnt_opts->delegate_cmds;
-	token->allowed_maps = mnt_opts->delegate_maps;
-	token->allowed_progs = mnt_opts->delegate_progs;
-	token->allowed_attachs = mnt_opts->delegate_attachs;
-
-	err = security_bpf_token_create(token, attr, &path);
-	if (err)
-		goto out_token;
-
-	fd = get_unused_fd_flags(O_CLOEXEC);
-	if (fd < 0) {
-		err = fd;
-		goto out_token;
-	}
-
-	file->private_data = token;
-	fd_install(fd, file);
-
-	path_put(&path);
-	return fd;
-
-out_token:
-	bpf_token_free(token);
-out_file:
-	fput(file);
-out_path:
-	path_put(&path);
-	return err;
-}
-
-struct bpf_token *bpf_token_get_from_fd(u32 ufd)
-{
-	struct fd f = fdget(ufd);
-	struct bpf_token *token;
-
-	if (!f.file)
-		return ERR_PTR(-EBADF);
-	if (f.file->f_op != &bpf_token_fops) {
-		fdput(f);
-		return ERR_PTR(-EINVAL);
-	}
-
-	token = f.file->private_data;
-	bpf_token_inc(token);
-	fdput(f);
-
-	return token;
-}
-
-bool bpf_token_allow_cmd(const struct bpf_token *token, enum bpf_cmd cmd)
-{
-	/* BPF token can be used only within exactly the same userns in which
-	 * it was created
-	 */
-	if (!token || current_user_ns() != token->userns)
-		return false;
-	if (!(token->allowed_cmds & (1ULL << cmd)))
-		return false;
-	return security_bpf_token_cmd(token, cmd) == 0;
-}
-
-bool bpf_token_allow_map_type(const struct bpf_token *token, enum bpf_map_type type)
-{
-	if (!token || type >= __MAX_BPF_MAP_TYPE)
-		return false;
-
-	return token->allowed_maps & (1ULL << type);
-}
-
-bool bpf_token_allow_prog_type(const struct bpf_token *token,
-			       enum bpf_prog_type prog_type,
-			       enum bpf_attach_type attach_type)
-{
-	if (!token || prog_type >= __MAX_BPF_PROG_TYPE || attach_type >= __MAX_BPF_ATTACH_TYPE)
-		return false;
-
-	return (token->allowed_progs & (1ULL << prog_type)) &&
-	       (token->allowed_attachs & (1ULL << attach_type));
-}
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 9456ee0ad129..4ceec8c2a484 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -20594,12 +20594,7 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, bpfptr_t uattr, __u3
 	env->prog = *prog;
 	env->ops = bpf_verifier_ops[env->prog->type];
 	env->fd_array = make_bpfptr(attr->fd_array, uattr.is_kernel);
-
-	env->allow_ptr_leaks = bpf_allow_ptr_leaks(env->prog->aux->token);
-	env->allow_uninit_stack = bpf_allow_uninit_stack(env->prog->aux->token);
-	env->bypass_spec_v1 = bpf_bypass_spec_v1(env->prog->aux->token);
-	env->bypass_spec_v4 = bpf_bypass_spec_v4(env->prog->aux->token);
-	env->bpf_capable = is_priv = bpf_token_capable(env->prog->aux->token, CAP_BPF);
+	is_priv = bpf_capable();
 
 	bpf_get_btf_vmlinux();
 
@@ -20631,6 +20626,12 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, bpfptr_t uattr, __u3
 	if (attr->prog_flags & BPF_F_ANY_ALIGNMENT)
 		env->strict_alignment = false;
 
+	env->allow_ptr_leaks = bpf_allow_ptr_leaks();
+	env->allow_uninit_stack = bpf_allow_uninit_stack();
+	env->bypass_spec_v1 = bpf_bypass_spec_v1();
+	env->bypass_spec_v4 = bpf_bypass_spec_v4();
+	env->bpf_capable = bpf_capable();
+
 	if (is_priv)
 		env->test_state_freq = attr->prog_flags & BPF_F_TEST_STATE_FREQ;
 	env->test_reg_invariants = attr->prog_flags & BPF_F_TEST_REG_INVARIANTS;
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index 492d60e9c480..7ac6c52b25eb 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -1629,7 +1629,7 @@ bpf_tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 	case BPF_FUNC_trace_vprintk:
 		return bpf_get_trace_vprintk_proto();
 	default:
-		return bpf_base_func_proto(func_id, prog);
+		return bpf_base_func_proto(func_id);
 	}
 }
 
diff --git a/net/core/filter.c b/net/core/filter.c
index 3cc52b82bab8..24061f29c9dd 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -87,7 +87,7 @@
 #include "dev.h"
 
 static const struct bpf_func_proto *
-bpf_sk_base_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog);
+bpf_sk_base_func_proto(enum bpf_func_id func_id);
 
 int copy_bpf_fprog_from_user(struct sock_fprog *dst, sockptr_t src, int len)
 {
@@ -7862,7 +7862,7 @@ sock_filter_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 	case BPF_FUNC_ktime_get_coarse_ns:
 		return &bpf_ktime_get_coarse_ns_proto;
 	default:
-		return bpf_base_func_proto(func_id, prog);
+		return bpf_base_func_proto(func_id);
 	}
 }
 
@@ -7955,7 +7955,7 @@ sock_addr_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 			return NULL;
 		}
 	default:
-		return bpf_sk_base_func_proto(func_id, prog);
+		return bpf_sk_base_func_proto(func_id);
 	}
 }
 
@@ -7974,7 +7974,7 @@ sk_filter_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 	case BPF_FUNC_perf_event_output:
 		return &bpf_skb_event_output_proto;
 	default:
-		return bpf_sk_base_func_proto(func_id, prog);
+		return bpf_sk_base_func_proto(func_id);
 	}
 }
 
@@ -8161,7 +8161,7 @@ tc_cls_act_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 #endif
 #endif
 	default:
-		return bpf_sk_base_func_proto(func_id, prog);
+		return bpf_sk_base_func_proto(func_id);
 	}
 }
 
@@ -8220,7 +8220,7 @@ xdp_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 #endif
 #endif
 	default:
-		return bpf_sk_base_func_proto(func_id, prog);
+		return bpf_sk_base_func_proto(func_id);
 	}
 
 #if IS_MODULE(CONFIG_NF_CONNTRACK) && IS_ENABLED(CONFIG_DEBUG_INFO_BTF_MODULES)
@@ -8281,7 +8281,7 @@ sock_ops_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 		return &bpf_tcp_sock_proto;
 #endif /* CONFIG_INET */
 	default:
-		return bpf_sk_base_func_proto(func_id, prog);
+		return bpf_sk_base_func_proto(func_id);
 	}
 }
 
@@ -8323,7 +8323,7 @@ sk_msg_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 		return &bpf_get_cgroup_classid_curr_proto;
 #endif
 	default:
-		return bpf_sk_base_func_proto(func_id, prog);
+		return bpf_sk_base_func_proto(func_id);
 	}
 }
 
@@ -8367,7 +8367,7 @@ sk_skb_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 		return &bpf_skc_lookup_tcp_proto;
 #endif
 	default:
-		return bpf_sk_base_func_proto(func_id, prog);
+		return bpf_sk_base_func_proto(func_id);
 	}
 }
 
@@ -8378,7 +8378,7 @@ flow_dissector_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 	case BPF_FUNC_skb_load_bytes:
 		return &bpf_flow_dissector_load_bytes_proto;
 	default:
-		return bpf_sk_base_func_proto(func_id, prog);
+		return bpf_sk_base_func_proto(func_id);
 	}
 }
 
@@ -8405,7 +8405,7 @@ lwt_out_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 	case BPF_FUNC_skb_under_cgroup:
 		return &bpf_skb_under_cgroup_proto;
 	default:
-		return bpf_sk_base_func_proto(func_id, prog);
+		return bpf_sk_base_func_proto(func_id);
 	}
 }
 
@@ -8580,7 +8580,7 @@ static bool cg_skb_is_valid_access(int off, int size,
 		return false;
 	case bpf_ctx_range(struct __sk_buff, data):
 	case bpf_ctx_range(struct __sk_buff, data_end):
-		if (!bpf_token_capable(prog->aux->token, CAP_BPF))
+		if (!bpf_capable())
 			return false;
 		break;
 	}
@@ -8592,7 +8592,7 @@ static bool cg_skb_is_valid_access(int off, int size,
 		case bpf_ctx_range_till(struct __sk_buff, cb[0], cb[4]):
 			break;
 		case bpf_ctx_range(struct __sk_buff, tstamp):
-			if (!bpf_token_capable(prog->aux->token, CAP_BPF))
+			if (!bpf_capable())
 				return false;
 			break;
 		default:
@@ -11236,7 +11236,7 @@ sk_reuseport_func_proto(enum bpf_func_id func_id,
 	case BPF_FUNC_ktime_get_coarse_ns:
 		return &bpf_ktime_get_coarse_ns_proto;
 	default:
-		return bpf_base_func_proto(func_id, prog);
+		return bpf_base_func_proto(func_id);
 	}
 }
 
@@ -11418,7 +11418,7 @@ sk_lookup_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 	case BPF_FUNC_sk_release:
 		return &bpf_sk_release_proto;
 	default:
-		return bpf_sk_base_func_proto(func_id, prog);
+		return bpf_sk_base_func_proto(func_id);
 	}
 }
 
@@ -11752,7 +11752,7 @@ const struct bpf_func_proto bpf_sock_from_file_proto = {
 };
 
 static const struct bpf_func_proto *
-bpf_sk_base_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
+bpf_sk_base_func_proto(enum bpf_func_id func_id)
 {
 	const struct bpf_func_proto *func;
 
@@ -11781,10 +11781,10 @@ bpf_sk_base_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 	case BPF_FUNC_ktime_get_coarse_ns:
 		return &bpf_ktime_get_coarse_ns_proto;
 	default:
-		return bpf_base_func_proto(func_id, prog);
+		return bpf_base_func_proto(func_id);
 	}
 
-	if (!bpf_token_capable(prog->aux->token, CAP_PERFMON))
+	if (!perfmon_capable())
 		return NULL;
 
 	return func;
diff --git a/net/ipv4/bpf_tcp_ca.c b/net/ipv4/bpf_tcp_ca.c
index 634cfafa583d..ae8b15e6896f 100644
--- a/net/ipv4/bpf_tcp_ca.c
+++ b/net/ipv4/bpf_tcp_ca.c
@@ -191,7 +191,7 @@ bpf_tcp_ca_get_func_proto(enum bpf_func_id func_id,
 	case BPF_FUNC_ktime_get_coarse_ns:
 		return &bpf_ktime_get_coarse_ns_proto;
 	default:
-		return bpf_base_func_proto(func_id, prog);
+		return bpf_base_func_proto(func_id);
 	}
 }
 
diff --git a/net/netfilter/nf_bpf_link.c b/net/netfilter/nf_bpf_link.c
index 5257d5e7eb09..0e4beae421f8 100644
--- a/net/netfilter/nf_bpf_link.c
+++ b/net/netfilter/nf_bpf_link.c
@@ -314,7 +314,7 @@ static bool nf_is_valid_access(int off, int size, enum bpf_access_type type,
 static const struct bpf_func_proto *
 bpf_nf_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 {
-	return bpf_base_func_proto(func_id, prog);
+	return bpf_base_func_proto(func_id);
 }
 
 const struct bpf_verifier_ops netfilter_verifier_ops = {
diff --git a/security/security.c b/security/security.c
index 088a79c35c26..dcb3e7014f9b 100644
--- a/security/security.c
+++ b/security/security.c
@@ -5167,87 +5167,29 @@ int security_bpf_prog(struct bpf_prog *prog)
 }
 
 /**
- * security_bpf_map_create() - Check if BPF map creation is allowed
- * @map: BPF map object
- * @attr: BPF syscall attributes used to create BPF map
- * @token: BPF token used to grant user access
- *
- * Do a check when the kernel creates a new BPF map. This is also the
- * point where LSM blob is allocated for LSMs that need them.
- *
- * Return: Returns 0 on success, error on failure.
- */
-int security_bpf_map_create(struct bpf_map *map, union bpf_attr *attr,
-			    struct bpf_token *token)
-{
-	return call_int_hook(bpf_map_create, 0, map, attr, token);
-}
-
-/**
- * security_bpf_prog_load() - Check if loading of BPF program is allowed
- * @prog: BPF program object
- * @attr: BPF syscall attributes used to create BPF program
- * @token: BPF token used to grant user access to BPF subsystem
- *
- * Perform an access control check when the kernel loads a BPF program and
- * allocates associated BPF program object. This hook is also responsible for
- * allocating any required LSM state for the BPF program.
- *
- * Return: Returns 0 on success, error on failure.
- */
-int security_bpf_prog_load(struct bpf_prog *prog, union bpf_attr *attr,
-			   struct bpf_token *token)
-{
-	return call_int_hook(bpf_prog_load, 0, prog, attr, token);
-}
-
-/**
- * security_bpf_token_create() - Check if creating of BPF token is allowed
- * @token: BPF token object
- * @attr: BPF syscall attributes used to create BPF token
- * @path: path pointing to BPF FS mount point from which BPF token is created
- *
- * Do a check when the kernel instantiates a new BPF token object from BPF FS
- * instance. This is also the point where LSM blob can be allocated for LSMs.
- *
- * Return: Returns 0 on success, error on failure.
- */
-int security_bpf_token_create(struct bpf_token *token, union bpf_attr *attr,
-			      struct path *path)
-{
-	return call_int_hook(bpf_token_create, 0, token, attr, path);
-}
-
-/**
- * security_bpf_token_cmd() - Check if BPF token is allowed to delegate
- * requested BPF syscall command
- * @token: BPF token object
- * @cmd: BPF syscall command requested to be delegated by BPF token
+ * security_bpf_map_alloc() - Allocate a bpf map LSM blob
+ * @map: bpf map
  *
- * Do a check when the kernel decides whether provided BPF token should allow
- * delegation of requested BPF syscall command.
+ * Initialize the security field inside bpf map.
  *
  * Return: Returns 0 on success, error on failure.
  */
-int security_bpf_token_cmd(const struct bpf_token *token, enum bpf_cmd cmd)
+int security_bpf_map_alloc(struct bpf_map *map)
 {
-	return call_int_hook(bpf_token_cmd, 0, token, cmd);
+	return call_int_hook(bpf_map_alloc_security, 0, map);
 }
 
 /**
- * security_bpf_token_capable() - Check if BPF token is allowed to delegate
- * requested BPF-related capability
- * @token: BPF token object
- * @cap: capabilities requested to be delegated by BPF token
+ * security_bpf_prog_alloc() - Allocate a bpf program LSM blob
+ * @aux: bpf program aux info struct
  *
- * Do a check when the kernel decides whether provided BPF token should allow
- * delegation of requested BPF-related capabilities.
+ * Initialize the security field inside bpf program.
  *
  * Return: Returns 0 on success, error on failure.
  */
-int security_bpf_token_capable(const struct bpf_token *token, int cap)
+int security_bpf_prog_alloc(struct bpf_prog_aux *aux)
 {
-	return call_int_hook(bpf_token_capable, 0, token, cap);
+	return call_int_hook(bpf_prog_alloc_security, 0, aux);
 }
 
 /**
@@ -5258,29 +5200,18 @@ int security_bpf_token_capable(const struct bpf_token *token, int cap)
  */
 void security_bpf_map_free(struct bpf_map *map)
 {
-	call_void_hook(bpf_map_free, map);
-}
-
-/**
- * security_bpf_prog_free() - Free a BPF program's LSM blob
- * @prog: BPF program struct
- *
- * Clean up the security information stored inside BPF program.
- */
-void security_bpf_prog_free(struct bpf_prog *prog)
-{
-	call_void_hook(bpf_prog_free, prog);
+	call_void_hook(bpf_map_free_security, map);
 }
 
 /**
- * security_bpf_token_free() - Free a BPF token's LSM blob
- * @token: BPF token struct
+ * security_bpf_prog_free() - Free a bpf program's LSM blob
+ * @aux: bpf program aux info struct
  *
- * Clean up the security information stored inside BPF token.
+ * Clean up the security information stored inside bpf prog.
  */
-void security_bpf_token_free(struct bpf_token *token)
+void security_bpf_prog_free(struct bpf_prog_aux *aux)
 {
-	call_void_hook(bpf_token_free, token);
+	call_void_hook(bpf_prog_free_security, aux);
 }
 #endif /* CONFIG_BPF_SYSCALL */
 
diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c
index 1501e95366a1..feda711c6b7b 100644
--- a/security/selinux/hooks.c
+++ b/security/selinux/hooks.c
@@ -6783,8 +6783,7 @@ static int selinux_bpf_prog(struct bpf_prog *prog)
 			    BPF__PROG_RUN, NULL);
 }
 
-static int selinux_bpf_map_create(struct bpf_map *map, union bpf_attr *attr,
-				  struct bpf_token *token)
+static int selinux_bpf_map_alloc(struct bpf_map *map)
 {
 	struct bpf_security_struct *bpfsec;
 
@@ -6806,8 +6805,7 @@ static void selinux_bpf_map_free(struct bpf_map *map)
 	kfree(bpfsec);
 }
 
-static int selinux_bpf_prog_load(struct bpf_prog *prog, union bpf_attr *attr,
-				 struct bpf_token *token)
+static int selinux_bpf_prog_alloc(struct bpf_prog_aux *aux)
 {
 	struct bpf_security_struct *bpfsec;
 
@@ -6816,39 +6814,16 @@ static int selinux_bpf_prog_load(struct bpf_prog *prog, union bpf_attr *attr,
 		return -ENOMEM;
 
 	bpfsec->sid = current_sid();
-	prog->aux->security = bpfsec;
+	aux->security = bpfsec;
 
 	return 0;
 }
 
-static void selinux_bpf_prog_free(struct bpf_prog *prog)
+static void selinux_bpf_prog_free(struct bpf_prog_aux *aux)
 {
-	struct bpf_security_struct *bpfsec = prog->aux->security;
+	struct bpf_security_struct *bpfsec = aux->security;
 
-	prog->aux->security = NULL;
-	kfree(bpfsec);
-}
-
-static int selinux_bpf_token_create(struct bpf_token *token, union bpf_attr *attr,
-				    struct path *path)
-{
-	struct bpf_security_struct *bpfsec;
-
-	bpfsec = kzalloc(sizeof(*bpfsec), GFP_KERNEL);
-	if (!bpfsec)
-		return -ENOMEM;
-
-	bpfsec->sid = current_sid();
-	token->security = bpfsec;
-
-	return 0;
-}
-
-static void selinux_bpf_token_free(struct bpf_token *token)
-{
-	struct bpf_security_struct *bpfsec = token->security;
-
-	token->security = NULL;
+	aux->security = NULL;
 	kfree(bpfsec);
 }
 #endif
@@ -7204,9 +7179,8 @@ static struct security_hook_list selinux_hooks[] __ro_after_init = {
 	LSM_HOOK_INIT(bpf, selinux_bpf),
 	LSM_HOOK_INIT(bpf_map, selinux_bpf_map),
 	LSM_HOOK_INIT(bpf_prog, selinux_bpf_prog),
-	LSM_HOOK_INIT(bpf_map_free, selinux_bpf_map_free),
-	LSM_HOOK_INIT(bpf_prog_free, selinux_bpf_prog_free),
-	LSM_HOOK_INIT(bpf_token_free, selinux_bpf_token_free),
+	LSM_HOOK_INIT(bpf_map_free_security, selinux_bpf_map_free),
+	LSM_HOOK_INIT(bpf_prog_free_security, selinux_bpf_prog_free),
 #endif
 
 #ifdef CONFIG_PERF_EVENTS
@@ -7263,9 +7237,8 @@ static struct security_hook_list selinux_hooks[] __ro_after_init = {
 	LSM_HOOK_INIT(audit_rule_init, selinux_audit_rule_init),
 #endif
 #ifdef CONFIG_BPF_SYSCALL
-	LSM_HOOK_INIT(bpf_map_create, selinux_bpf_map_create),
-	LSM_HOOK_INIT(bpf_prog_load, selinux_bpf_prog_load),
-	LSM_HOOK_INIT(bpf_token_create, selinux_bpf_token_create),
+	LSM_HOOK_INIT(bpf_map_alloc_security, selinux_bpf_map_alloc),
+	LSM_HOOK_INIT(bpf_prog_alloc_security, selinux_bpf_prog_alloc),
 #endif
 #ifdef CONFIG_PERF_EVENTS
 	LSM_HOOK_INIT(perf_event_alloc, selinux_perf_event_alloc),
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index e0545201b55f..7f24d898efbb 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -847,36 +847,6 @@ union bpf_iter_link_info {
  *		Returns zero on success. On error, -1 is returned and *errno*
  *		is set appropriately.
  *
- * BPF_TOKEN_CREATE
- *	Description
- *		Create BPF token with embedded information about what
- *		BPF-related functionality it allows:
- *		- a set of allowed bpf() syscall commands;
- *		- a set of allowed BPF map types to be created with
- *		BPF_MAP_CREATE command, if BPF_MAP_CREATE itself is allowed;
- *		- a set of allowed BPF program types and BPF program attach
- *		types to be loaded with BPF_PROG_LOAD command, if
- *		BPF_PROG_LOAD itself is allowed.
- *
- *		BPF token is created (derived) from an instance of BPF FS,
- *		assuming it has necessary delegation mount options specified.
- *		This BPF token can be passed as an extra parameter to various
- *		bpf() syscall commands to grant BPF subsystem functionality to
- *		unprivileged processes.
- *
- *		When created, BPF token is "associated" with the owning
- *		user namespace of BPF FS instance (super block) that it was
- *		derived from, and subsequent BPF operations performed with
- *		BPF token would be performing capabilities checks (i.e.,
- *		CAP_BPF, CAP_PERFMON, CAP_NET_ADMIN, CAP_SYS_ADMIN) within
- *		that user namespace. Without BPF token, such capabilities
- *		have to be granted in init user namespace, making bpf()
- *		syscall incompatible with user namespace, for the most part.
- *
- *	Return
- *		A new file descriptor (a nonnegative integer), or -1 if an
- *		error occurred (in which case, *errno* is set appropriately).
- *
  * NOTES
  *	eBPF objects (maps and programs) can be shared between processes.
  *
@@ -931,8 +901,6 @@ enum bpf_cmd {
 	BPF_ITER_CREATE,
 	BPF_LINK_DETACH,
 	BPF_PROG_BIND_MAP,
-	BPF_TOKEN_CREATE,
-	__MAX_BPF_CMD,
 };
 
 enum bpf_map_type {
@@ -983,7 +951,6 @@ enum bpf_map_type {
 	BPF_MAP_TYPE_BLOOM_FILTER,
 	BPF_MAP_TYPE_USER_RINGBUF,
 	BPF_MAP_TYPE_CGRP_STORAGE,
-	__MAX_BPF_MAP_TYPE
 };
 
 /* Note that tracing related programs such as
@@ -1028,7 +995,6 @@ enum bpf_prog_type {
 	BPF_PROG_TYPE_SK_LOOKUP,
 	BPF_PROG_TYPE_SYSCALL, /* a program that can execute syscalls */
 	BPF_PROG_TYPE_NETFILTER,
-	__MAX_BPF_PROG_TYPE
 };
 
 enum bpf_attach_type {
@@ -1437,7 +1403,6 @@ union bpf_attr {
 		 * to using 5 hash functions).
 		 */
 		__u64	map_extra;
-		__u32	map_token_fd;
 	};
 
 	struct { /* anonymous struct used by BPF_MAP_*_ELEM commands */
@@ -1507,7 +1472,6 @@ union bpf_attr {
 		 * truncated), or smaller (if log buffer wasn't filled completely).
 		 */
 		__u32		log_true_size;
-		__u32		prog_token_fd;
 	};
 
 	struct { /* anonymous struct used by BPF_OBJ_* commands */
@@ -1620,7 +1584,6 @@ union bpf_attr {
 		 * truncated), or smaller (if log buffer wasn't filled completely).
 		 */
 		__u32		btf_log_true_size;
-		__u32		btf_token_fd;
 	};
 
 	struct {
@@ -1751,11 +1714,6 @@ union bpf_attr {
 		__u32		flags;		/* extra flags */
 	} prog_bind_map;
 
-	struct { /* struct used by BPF_TOKEN_CREATE command */
-		__u32		flags;
-		__u32		bpffs_fd;
-	} token_create;
-
 } __attribute__((aligned(8)));
 
 /* The description below is an attempt at providing documentation to eBPF
diff --git a/tools/lib/bpf/Build b/tools/lib/bpf/Build
index b6619199a706..2d0c282c8588 100644
--- a/tools/lib/bpf/Build
+++ b/tools/lib/bpf/Build
@@ -1,4 +1,4 @@
 libbpf-y := libbpf.o bpf.o nlattr.o btf.o libbpf_errno.o str_error.o \
 	    netlink.o bpf_prog_linfo.o libbpf_probes.o hashmap.o \
 	    btf_dump.o ringbuf.o strset.o linker.o gen_loader.o relo_core.o \
-	    usdt.o zip.o elf.o features.o
+	    usdt.o zip.o elf.o
diff --git a/tools/lib/bpf/bpf.c b/tools/lib/bpf/bpf.c
index 0ad8e532b3cf..9dc9625651dc 100644
--- a/tools/lib/bpf/bpf.c
+++ b/tools/lib/bpf/bpf.c
@@ -103,7 +103,7 @@ int sys_bpf_prog_load(union bpf_attr *attr, unsigned int size, int attempts)
  *   [0] https://lore.kernel.org/bpf/20201201215900.3569844-1-guro@fb.com/
  *   [1] d05512618056 ("bpf: Add bpf_ktime_get_coarse_ns helper")
  */
-int probe_memcg_account(int token_fd)
+int probe_memcg_account(void)
 {
 	const size_t attr_sz = offsetofend(union bpf_attr, attach_btf_obj_fd);
 	struct bpf_insn insns[] = {
@@ -120,7 +120,6 @@ int probe_memcg_account(int token_fd)
 	attr.insns = ptr_to_u64(insns);
 	attr.insn_cnt = insn_cnt;
 	attr.license = ptr_to_u64("GPL");
-	attr.prog_token_fd = token_fd;
 
 	prog_fd = sys_bpf_fd(BPF_PROG_LOAD, &attr, attr_sz);
 	if (prog_fd >= 0) {
@@ -147,7 +146,7 @@ int bump_rlimit_memlock(void)
 	struct rlimit rlim;
 
 	/* if kernel supports memcg-based accounting, skip bumping RLIMIT_MEMLOCK */
-	if (memlock_bumped || feat_supported(NULL, FEAT_MEMCG_ACCOUNT))
+	if (memlock_bumped || kernel_supports(NULL, FEAT_MEMCG_ACCOUNT))
 		return 0;
 
 	memlock_bumped = true;
@@ -170,7 +169,7 @@ int bpf_map_create(enum bpf_map_type map_type,
 		   __u32 max_entries,
 		   const struct bpf_map_create_opts *opts)
 {
-	const size_t attr_sz = offsetofend(union bpf_attr, map_token_fd);
+	const size_t attr_sz = offsetofend(union bpf_attr, map_extra);
 	union bpf_attr attr;
 	int fd;
 
@@ -182,7 +181,7 @@ int bpf_map_create(enum bpf_map_type map_type,
 		return libbpf_err(-EINVAL);
 
 	attr.map_type = map_type;
-	if (map_name && feat_supported(NULL, FEAT_PROG_NAME))
+	if (map_name && kernel_supports(NULL, FEAT_PROG_NAME))
 		libbpf_strlcpy(attr.map_name, map_name, sizeof(attr.map_name));
 	attr.key_size = key_size;
 	attr.value_size = value_size;
@@ -199,8 +198,6 @@ int bpf_map_create(enum bpf_map_type map_type,
 	attr.numa_node = OPTS_GET(opts, numa_node, 0);
 	attr.map_ifindex = OPTS_GET(opts, map_ifindex, 0);
 
-	attr.map_token_fd = OPTS_GET(opts, token_fd, 0);
-
 	fd = sys_bpf_fd(BPF_MAP_CREATE, &attr, attr_sz);
 	return libbpf_err_errno(fd);
 }
@@ -235,7 +232,7 @@ int bpf_prog_load(enum bpf_prog_type prog_type,
 		  const struct bpf_insn *insns, size_t insn_cnt,
 		  struct bpf_prog_load_opts *opts)
 {
-	const size_t attr_sz = offsetofend(union bpf_attr, prog_token_fd);
+	const size_t attr_sz = offsetofend(union bpf_attr, log_true_size);
 	void *finfo = NULL, *linfo = NULL;
 	const char *func_info, *line_info;
 	__u32 log_size, log_level, attach_prog_fd, attach_btf_obj_fd;
@@ -264,9 +261,8 @@ int bpf_prog_load(enum bpf_prog_type prog_type,
 	attr.prog_flags = OPTS_GET(opts, prog_flags, 0);
 	attr.prog_ifindex = OPTS_GET(opts, prog_ifindex, 0);
 	attr.kern_version = OPTS_GET(opts, kern_version, 0);
-	attr.prog_token_fd = OPTS_GET(opts, token_fd, 0);
 
-	if (prog_name && feat_supported(NULL, FEAT_PROG_NAME))
+	if (prog_name && kernel_supports(NULL, FEAT_PROG_NAME))
 		libbpf_strlcpy(attr.prog_name, prog_name, sizeof(attr.prog_name));
 	attr.license = ptr_to_u64(license);
 
@@ -1186,7 +1182,7 @@ int bpf_raw_tracepoint_open(const char *name, int prog_fd)
 
 int bpf_btf_load(const void *btf_data, size_t btf_size, struct bpf_btf_load_opts *opts)
 {
-	const size_t attr_sz = offsetofend(union bpf_attr, btf_token_fd);
+	const size_t attr_sz = offsetofend(union bpf_attr, btf_log_true_size);
 	union bpf_attr attr;
 	char *log_buf;
 	size_t log_size;
@@ -1211,8 +1207,6 @@ int bpf_btf_load(const void *btf_data, size_t btf_size, struct bpf_btf_load_opts
 
 	attr.btf = ptr_to_u64(btf_data);
 	attr.btf_size = btf_size;
-	attr.btf_token_fd = OPTS_GET(opts, token_fd, 0);
-
 	/* log_level == 0 and log_buf != NULL means "try loading without
 	 * log_buf, but retry with log_buf and log_level=1 on error", which is
 	 * consistent across low-level and high-level BTF and program loading
@@ -1293,20 +1287,3 @@ int bpf_prog_bind_map(int prog_fd, int map_fd,
 	ret = sys_bpf(BPF_PROG_BIND_MAP, &attr, attr_sz);
 	return libbpf_err_errno(ret);
 }
-
-int bpf_token_create(int bpffs_fd, struct bpf_token_create_opts *opts)
-{
-	const size_t attr_sz = offsetofend(union bpf_attr, token_create);
-	union bpf_attr attr;
-	int fd;
-
-	if (!OPTS_VALID(opts, bpf_token_create_opts))
-		return libbpf_err(-EINVAL);
-
-	memset(&attr, 0, attr_sz);
-	attr.token_create.bpffs_fd = bpffs_fd;
-	attr.token_create.flags = OPTS_GET(opts, flags, 0);
-
-	fd = sys_bpf_fd(BPF_TOKEN_CREATE, &attr, attr_sz);
-	return libbpf_err_errno(fd);
-}
diff --git a/tools/lib/bpf/bpf.h b/tools/lib/bpf/bpf.h
index 991b86bfe7e4..d0f53772bdc0 100644
--- a/tools/lib/bpf/bpf.h
+++ b/tools/lib/bpf/bpf.h
@@ -51,11 +51,8 @@ struct bpf_map_create_opts {
 
 	__u32 numa_node;
 	__u32 map_ifindex;
-
-	__u32 token_fd;
-	size_t :0;
 };
-#define bpf_map_create_opts__last_field token_fd
+#define bpf_map_create_opts__last_field map_ifindex
 
 LIBBPF_API int bpf_map_create(enum bpf_map_type map_type,
 			      const char *map_name,
@@ -105,10 +102,9 @@ struct bpf_prog_load_opts {
 	 * If kernel doesn't support this feature, log_size is left unchanged.
 	 */
 	__u32 log_true_size;
-	__u32 token_fd;
 	size_t :0;
 };
-#define bpf_prog_load_opts__last_field token_fd
+#define bpf_prog_load_opts__last_field log_true_size
 
 LIBBPF_API int bpf_prog_load(enum bpf_prog_type prog_type,
 			     const char *prog_name, const char *license,
@@ -134,10 +130,9 @@ struct bpf_btf_load_opts {
 	 * If kernel doesn't support this feature, log_size is left unchanged.
 	 */
 	__u32 log_true_size;
-	__u32 token_fd;
 	size_t :0;
 };
-#define bpf_btf_load_opts__last_field token_fd
+#define bpf_btf_load_opts__last_field log_true_size
 
 LIBBPF_API int bpf_btf_load(const void *btf_data, size_t btf_size,
 			    struct bpf_btf_load_opts *opts);
@@ -645,30 +640,6 @@ struct bpf_test_run_opts {
 LIBBPF_API int bpf_prog_test_run_opts(int prog_fd,
 				      struct bpf_test_run_opts *opts);
 
-struct bpf_token_create_opts {
-	size_t sz; /* size of this struct for forward/backward compatibility */
-	__u32 flags;
-	size_t :0;
-};
-#define bpf_token_create_opts__last_field flags
-
-/**
- * @brief **bpf_token_create()** creates a new instance of BPF token derived
- * from specified BPF FS mount point.
- *
- * BPF token created with this API can be passed to bpf() syscall for
- * commands like BPF_PROG_LOAD, BPF_MAP_CREATE, etc.
- *
- * @param bpffs_fd FD for BPF FS instance from which to derive a BPF token
- * instance.
- * @param opts optional BPF token creation options, can be NULL
- *
- * @return BPF token FD > 0, on success; negative error code, otherwise (errno
- * is also set to the error code)
- */
-LIBBPF_API int bpf_token_create(int bpffs_fd,
-				struct bpf_token_create_opts *opts);
-
 #ifdef __cplusplus
 } /* extern "C" */
 #endif
diff --git a/tools/lib/bpf/btf.c b/tools/lib/bpf/btf.c
index 63033c334320..ee95fd379d4d 100644
--- a/tools/lib/bpf/btf.c
+++ b/tools/lib/bpf/btf.c
@@ -1317,9 +1317,7 @@ struct btf *btf__parse_split(const char *path, struct btf *base_btf)
 
 static void *btf_get_raw_data(const struct btf *btf, __u32 *size, bool swap_endian);
 
-int btf_load_into_kernel(struct btf *btf,
-			 char *log_buf, size_t log_sz, __u32 log_level,
-			 int token_fd)
+int btf_load_into_kernel(struct btf *btf, char *log_buf, size_t log_sz, __u32 log_level)
 {
 	LIBBPF_OPTS(bpf_btf_load_opts, opts);
 	__u32 buf_sz = 0, raw_size;
@@ -1369,7 +1367,6 @@ retry_load:
 		opts.log_level = log_level;
 	}
 
-	opts.token_fd = token_fd;
 	btf->fd = bpf_btf_load(raw_data, raw_size, &opts);
 	if (btf->fd < 0) {
 		/* time to turn on verbose mode and try again */
@@ -1397,7 +1394,7 @@ done:
 
 int btf__load_into_kernel(struct btf *btf)
 {
-	return btf_load_into_kernel(btf, NULL, 0, 0, 0);
+	return btf_load_into_kernel(btf, NULL, 0, 0);
 }
 
 int btf__fd(const struct btf *btf)
diff --git a/tools/lib/bpf/elf.c b/tools/lib/bpf/elf.c
index c92e02394159..b02faec748a5 100644
--- a/tools/lib/bpf/elf.c
+++ b/tools/lib/bpf/elf.c
@@ -11,6 +11,8 @@
 #include "libbpf_internal.h"
 #include "str_error.h"
 
+#define STRERR_BUFSIZE  128
+
 /* A SHT_GNU_versym section holds 16-bit words. This bit is set if
  * the symbol is hidden and can only be seen when referenced using an
  * explicit version number. This is a GNU extension.
diff --git a/tools/lib/bpf/features.c b/tools/lib/bpf/features.c
deleted file mode 100644
index ce98a334be21..000000000000
--- a/tools/lib/bpf/features.c
+++ /dev/null
@@ -1,478 +0,0 @@
-// SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause)
-/* Copyright (c) 2023 Meta Platforms, Inc. and affiliates. */
-#include <linux/kernel.h>
-#include <linux/filter.h>
-#include "bpf.h"
-#include "libbpf.h"
-#include "libbpf_common.h"
-#include "libbpf_internal.h"
-#include "str_error.h"
-
-static inline __u64 ptr_to_u64(const void *ptr)
-{
-	return (__u64)(unsigned long)ptr;
-}
-
-static int probe_fd(int fd)
-{
-	if (fd >= 0)
-		close(fd);
-	return fd >= 0;
-}
-
-static int probe_kern_prog_name(int token_fd)
-{
-	const size_t attr_sz = offsetofend(union bpf_attr, prog_name);
-	struct bpf_insn insns[] = {
-		BPF_MOV64_IMM(BPF_REG_0, 0),
-		BPF_EXIT_INSN(),
-	};
-	union bpf_attr attr;
-	int ret;
-
-	memset(&attr, 0, attr_sz);
-	attr.prog_type = BPF_PROG_TYPE_SOCKET_FILTER;
-	attr.license = ptr_to_u64("GPL");
-	attr.insns = ptr_to_u64(insns);
-	attr.insn_cnt = (__u32)ARRAY_SIZE(insns);
-	attr.prog_token_fd = token_fd;
-	libbpf_strlcpy(attr.prog_name, "libbpf_nametest", sizeof(attr.prog_name));
-
-	/* make sure loading with name works */
-	ret = sys_bpf_prog_load(&attr, attr_sz, PROG_LOAD_ATTEMPTS);
-	return probe_fd(ret);
-}
-
-static int probe_kern_global_data(int token_fd)
-{
-	char *cp, errmsg[STRERR_BUFSIZE];
-	struct bpf_insn insns[] = {
-		BPF_LD_MAP_VALUE(BPF_REG_1, 0, 16),
-		BPF_ST_MEM(BPF_DW, BPF_REG_1, 0, 42),
-		BPF_MOV64_IMM(BPF_REG_0, 0),
-		BPF_EXIT_INSN(),
-	};
-	LIBBPF_OPTS(bpf_map_create_opts, map_opts, .token_fd = token_fd);
-	LIBBPF_OPTS(bpf_prog_load_opts, prog_opts, .token_fd = token_fd);
-	int ret, map, insn_cnt = ARRAY_SIZE(insns);
-
-	map = bpf_map_create(BPF_MAP_TYPE_ARRAY, "libbpf_global", sizeof(int), 32, 1, &map_opts);
-	if (map < 0) {
-		ret = -errno;
-		cp = libbpf_strerror_r(ret, errmsg, sizeof(errmsg));
-		pr_warn("Error in %s():%s(%d). Couldn't create simple array map.\n",
-			__func__, cp, -ret);
-		return ret;
-	}
-
-	insns[0].imm = map;
-
-	ret = bpf_prog_load(BPF_PROG_TYPE_SOCKET_FILTER, NULL, "GPL", insns, insn_cnt, &prog_opts);
-	close(map);
-	return probe_fd(ret);
-}
-
-static int probe_kern_btf(int token_fd)
-{
-	static const char strs[] = "\0int";
-	__u32 types[] = {
-		/* int */
-		BTF_TYPE_INT_ENC(1, BTF_INT_SIGNED, 0, 32, 4),
-	};
-
-	return probe_fd(libbpf__load_raw_btf((char *)types, sizeof(types),
-					     strs, sizeof(strs), token_fd));
-}
-
-static int probe_kern_btf_func(int token_fd)
-{
-	static const char strs[] = "\0int\0x\0a";
-	/* void x(int a) {} */
-	__u32 types[] = {
-		/* int */
-		BTF_TYPE_INT_ENC(1, BTF_INT_SIGNED, 0, 32, 4),  /* [1] */
-		/* FUNC_PROTO */                                /* [2] */
-		BTF_TYPE_ENC(0, BTF_INFO_ENC(BTF_KIND_FUNC_PROTO, 0, 1), 0),
-		BTF_PARAM_ENC(7, 1),
-		/* FUNC x */                                    /* [3] */
-		BTF_TYPE_ENC(5, BTF_INFO_ENC(BTF_KIND_FUNC, 0, 0), 2),
-	};
-
-	return probe_fd(libbpf__load_raw_btf((char *)types, sizeof(types),
-					     strs, sizeof(strs), token_fd));
-}
-
-static int probe_kern_btf_func_global(int token_fd)
-{
-	static const char strs[] = "\0int\0x\0a";
-	/* static void x(int a) {} */
-	__u32 types[] = {
-		/* int */
-		BTF_TYPE_INT_ENC(1, BTF_INT_SIGNED, 0, 32, 4),  /* [1] */
-		/* FUNC_PROTO */                                /* [2] */
-		BTF_TYPE_ENC(0, BTF_INFO_ENC(BTF_KIND_FUNC_PROTO, 0, 1), 0),
-		BTF_PARAM_ENC(7, 1),
-		/* FUNC x BTF_FUNC_GLOBAL */                    /* [3] */
-		BTF_TYPE_ENC(5, BTF_INFO_ENC(BTF_KIND_FUNC, 0, BTF_FUNC_GLOBAL), 2),
-	};
-
-	return probe_fd(libbpf__load_raw_btf((char *)types, sizeof(types),
-					     strs, sizeof(strs), token_fd));
-}
-
-static int probe_kern_btf_datasec(int token_fd)
-{
-	static const char strs[] = "\0x\0.data";
-	/* static int a; */
-	__u32 types[] = {
-		/* int */
-		BTF_TYPE_INT_ENC(0, BTF_INT_SIGNED, 0, 32, 4),  /* [1] */
-		/* VAR x */                                     /* [2] */
-		BTF_TYPE_ENC(1, BTF_INFO_ENC(BTF_KIND_VAR, 0, 0), 1),
-		BTF_VAR_STATIC,
-		/* DATASEC val */                               /* [3] */
-		BTF_TYPE_ENC(3, BTF_INFO_ENC(BTF_KIND_DATASEC, 0, 1), 4),
-		BTF_VAR_SECINFO_ENC(2, 0, 4),
-	};
-
-	return probe_fd(libbpf__load_raw_btf((char *)types, sizeof(types),
-					     strs, sizeof(strs), token_fd));
-}
-
-static int probe_kern_btf_float(int token_fd)
-{
-	static const char strs[] = "\0float";
-	__u32 types[] = {
-		/* float */
-		BTF_TYPE_FLOAT_ENC(1, 4),
-	};
-
-	return probe_fd(libbpf__load_raw_btf((char *)types, sizeof(types),
-					     strs, sizeof(strs), token_fd));
-}
-
-static int probe_kern_btf_decl_tag(int token_fd)
-{
-	static const char strs[] = "\0tag";
-	__u32 types[] = {
-		/* int */
-		BTF_TYPE_INT_ENC(0, BTF_INT_SIGNED, 0, 32, 4),  /* [1] */
-		/* VAR x */                                     /* [2] */
-		BTF_TYPE_ENC(1, BTF_INFO_ENC(BTF_KIND_VAR, 0, 0), 1),
-		BTF_VAR_STATIC,
-		/* attr */
-		BTF_TYPE_DECL_TAG_ENC(1, 2, -1),
-	};
-
-	return probe_fd(libbpf__load_raw_btf((char *)types, sizeof(types),
-					     strs, sizeof(strs), token_fd));
-}
-
-static int probe_kern_btf_type_tag(int token_fd)
-{
-	static const char strs[] = "\0tag";
-	__u32 types[] = {
-		/* int */
-		BTF_TYPE_INT_ENC(0, BTF_INT_SIGNED, 0, 32, 4),		/* [1] */
-		/* attr */
-		BTF_TYPE_TYPE_TAG_ENC(1, 1),				/* [2] */
-		/* ptr */
-		BTF_TYPE_ENC(0, BTF_INFO_ENC(BTF_KIND_PTR, 0, 0), 2),	/* [3] */
-	};
-
-	return probe_fd(libbpf__load_raw_btf((char *)types, sizeof(types),
-					     strs, sizeof(strs), token_fd));
-}
-
-static int probe_kern_array_mmap(int token_fd)
-{
-	LIBBPF_OPTS(bpf_map_create_opts, opts,
-		.map_flags = BPF_F_MMAPABLE,
-		.token_fd = token_fd,
-	);
-	int fd;
-
-	fd = bpf_map_create(BPF_MAP_TYPE_ARRAY, "libbpf_mmap", sizeof(int), sizeof(int), 1, &opts);
-	return probe_fd(fd);
-}
-
-static int probe_kern_exp_attach_type(int token_fd)
-{
-	LIBBPF_OPTS(bpf_prog_load_opts, opts,
-		.expected_attach_type = BPF_CGROUP_INET_SOCK_CREATE,
-		.token_fd = token_fd,
-	);
-	struct bpf_insn insns[] = {
-		BPF_MOV64_IMM(BPF_REG_0, 0),
-		BPF_EXIT_INSN(),
-	};
-	int fd, insn_cnt = ARRAY_SIZE(insns);
-
-	/* use any valid combination of program type and (optional)
-	 * non-zero expected attach type (i.e., not a BPF_CGROUP_INET_INGRESS)
-	 * to see if kernel supports expected_attach_type field for
-	 * BPF_PROG_LOAD command
-	 */
-	fd = bpf_prog_load(BPF_PROG_TYPE_CGROUP_SOCK, NULL, "GPL", insns, insn_cnt, &opts);
-	return probe_fd(fd);
-}
-
-static int probe_kern_probe_read_kernel(int token_fd)
-{
-	LIBBPF_OPTS(bpf_prog_load_opts, opts, .token_fd = token_fd);
-	struct bpf_insn insns[] = {
-		BPF_MOV64_REG(BPF_REG_1, BPF_REG_10),	/* r1 = r10 (fp) */
-		BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, -8),	/* r1 += -8 */
-		BPF_MOV64_IMM(BPF_REG_2, 8),		/* r2 = 8 */
-		BPF_MOV64_IMM(BPF_REG_3, 0),		/* r3 = 0 */
-		BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_probe_read_kernel),
-		BPF_EXIT_INSN(),
-	};
-	int fd, insn_cnt = ARRAY_SIZE(insns);
-
-	fd = bpf_prog_load(BPF_PROG_TYPE_TRACEPOINT, NULL, "GPL", insns, insn_cnt, &opts);
-	return probe_fd(fd);
-}
-
-static int probe_prog_bind_map(int token_fd)
-{
-	char *cp, errmsg[STRERR_BUFSIZE];
-	struct bpf_insn insns[] = {
-		BPF_MOV64_IMM(BPF_REG_0, 0),
-		BPF_EXIT_INSN(),
-	};
-	LIBBPF_OPTS(bpf_map_create_opts, map_opts, .token_fd = token_fd);
-	LIBBPF_OPTS(bpf_prog_load_opts, prog_opts, .token_fd = token_fd);
-	int ret, map, prog, insn_cnt = ARRAY_SIZE(insns);
-
-	map = bpf_map_create(BPF_MAP_TYPE_ARRAY, "libbpf_det_bind", sizeof(int), 32, 1, &map_opts);
-	if (map < 0) {
-		ret = -errno;
-		cp = libbpf_strerror_r(ret, errmsg, sizeof(errmsg));
-		pr_warn("Error in %s():%s(%d). Couldn't create simple array map.\n",
-			__func__, cp, -ret);
-		return ret;
-	}
-
-	prog = bpf_prog_load(BPF_PROG_TYPE_SOCKET_FILTER, NULL, "GPL", insns, insn_cnt, &prog_opts);
-	if (prog < 0) {
-		close(map);
-		return 0;
-	}
-
-	ret = bpf_prog_bind_map(prog, map, NULL);
-
-	close(map);
-	close(prog);
-
-	return ret >= 0;
-}
-
-static int probe_module_btf(int token_fd)
-{
-	static const char strs[] = "\0int";
-	__u32 types[] = {
-		/* int */
-		BTF_TYPE_INT_ENC(1, BTF_INT_SIGNED, 0, 32, 4),
-	};
-	struct bpf_btf_info info;
-	__u32 len = sizeof(info);
-	char name[16];
-	int fd, err;
-
-	fd = libbpf__load_raw_btf((char *)types, sizeof(types), strs, sizeof(strs), token_fd);
-	if (fd < 0)
-		return 0; /* BTF not supported at all */
-
-	memset(&info, 0, sizeof(info));
-	info.name = ptr_to_u64(name);
-	info.name_len = sizeof(name);
-
-	/* check that BPF_OBJ_GET_INFO_BY_FD supports specifying name pointer;
-	 * kernel's module BTF support coincides with support for
-	 * name/name_len fields in struct bpf_btf_info.
-	 */
-	err = bpf_btf_get_info_by_fd(fd, &info, &len);
-	close(fd);
-	return !err;
-}
-
-static int probe_perf_link(int token_fd)
-{
-	struct bpf_insn insns[] = {
-		BPF_MOV64_IMM(BPF_REG_0, 0),
-		BPF_EXIT_INSN(),
-	};
-	LIBBPF_OPTS(bpf_prog_load_opts, opts, .token_fd = token_fd);
-	int prog_fd, link_fd, err;
-
-	prog_fd = bpf_prog_load(BPF_PROG_TYPE_TRACEPOINT, NULL, "GPL",
-				insns, ARRAY_SIZE(insns), &opts);
-	if (prog_fd < 0)
-		return -errno;
-
-	/* use invalid perf_event FD to get EBADF, if link is supported;
-	 * otherwise EINVAL should be returned
-	 */
-	link_fd = bpf_link_create(prog_fd, -1, BPF_PERF_EVENT, NULL);
-	err = -errno; /* close() can clobber errno */
-
-	if (link_fd >= 0)
-		close(link_fd);
-	close(prog_fd);
-
-	return link_fd < 0 && err == -EBADF;
-}
-
-static int probe_uprobe_multi_link(int token_fd)
-{
-	LIBBPF_OPTS(bpf_prog_load_opts, load_opts,
-		.expected_attach_type = BPF_TRACE_UPROBE_MULTI,
-		.token_fd = token_fd,
-	);
-	LIBBPF_OPTS(bpf_link_create_opts, link_opts);
-	struct bpf_insn insns[] = {
-		BPF_MOV64_IMM(BPF_REG_0, 0),
-		BPF_EXIT_INSN(),
-	};
-	int prog_fd, link_fd, err;
-	unsigned long offset = 0;
-
-	prog_fd = bpf_prog_load(BPF_PROG_TYPE_KPROBE, NULL, "GPL",
-				insns, ARRAY_SIZE(insns), &load_opts);
-	if (prog_fd < 0)
-		return -errno;
-
-	/* Creating uprobe in '/' binary should fail with -EBADF. */
-	link_opts.uprobe_multi.path = "/";
-	link_opts.uprobe_multi.offsets = &offset;
-	link_opts.uprobe_multi.cnt = 1;
-
-	link_fd = bpf_link_create(prog_fd, -1, BPF_TRACE_UPROBE_MULTI, &link_opts);
-	err = -errno; /* close() can clobber errno */
-
-	if (link_fd >= 0)
-		close(link_fd);
-	close(prog_fd);
-
-	return link_fd < 0 && err == -EBADF;
-}
-
-static int probe_kern_bpf_cookie(int token_fd)
-{
-	struct bpf_insn insns[] = {
-		BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_get_attach_cookie),
-		BPF_EXIT_INSN(),
-	};
-	LIBBPF_OPTS(bpf_prog_load_opts, opts, .token_fd = token_fd);
-	int ret, insn_cnt = ARRAY_SIZE(insns);
-
-	ret = bpf_prog_load(BPF_PROG_TYPE_TRACEPOINT, NULL, "GPL", insns, insn_cnt, &opts);
-	return probe_fd(ret);
-}
-
-static int probe_kern_btf_enum64(int token_fd)
-{
-	static const char strs[] = "\0enum64";
-	__u32 types[] = {
-		BTF_TYPE_ENC(1, BTF_INFO_ENC(BTF_KIND_ENUM64, 0, 0), 8),
-	};
-
-	return probe_fd(libbpf__load_raw_btf((char *)types, sizeof(types),
-					     strs, sizeof(strs), token_fd));
-}
-
-typedef int (*feature_probe_fn)(int /* token_fd */);
-
-static struct kern_feature_cache feature_cache;
-
-static struct kern_feature_desc {
-	const char *desc;
-	feature_probe_fn probe;
-} feature_probes[__FEAT_CNT] = {
-	[FEAT_PROG_NAME] = {
-		"BPF program name", probe_kern_prog_name,
-	},
-	[FEAT_GLOBAL_DATA] = {
-		"global variables", probe_kern_global_data,
-	},
-	[FEAT_BTF] = {
-		"minimal BTF", probe_kern_btf,
-	},
-	[FEAT_BTF_FUNC] = {
-		"BTF functions", probe_kern_btf_func,
-	},
-	[FEAT_BTF_GLOBAL_FUNC] = {
-		"BTF global function", probe_kern_btf_func_global,
-	},
-	[FEAT_BTF_DATASEC] = {
-		"BTF data section and variable", probe_kern_btf_datasec,
-	},
-	[FEAT_ARRAY_MMAP] = {
-		"ARRAY map mmap()", probe_kern_array_mmap,
-	},
-	[FEAT_EXP_ATTACH_TYPE] = {
-		"BPF_PROG_LOAD expected_attach_type attribute",
-		probe_kern_exp_attach_type,
-	},
-	[FEAT_PROBE_READ_KERN] = {
-		"bpf_probe_read_kernel() helper", probe_kern_probe_read_kernel,
-	},
-	[FEAT_PROG_BIND_MAP] = {
-		"BPF_PROG_BIND_MAP support", probe_prog_bind_map,
-	},
-	[FEAT_MODULE_BTF] = {
-		"module BTF support", probe_module_btf,
-	},
-	[FEAT_BTF_FLOAT] = {
-		"BTF_KIND_FLOAT support", probe_kern_btf_float,
-	},
-	[FEAT_PERF_LINK] = {
-		"BPF perf link support", probe_perf_link,
-	},
-	[FEAT_BTF_DECL_TAG] = {
-		"BTF_KIND_DECL_TAG support", probe_kern_btf_decl_tag,
-	},
-	[FEAT_BTF_TYPE_TAG] = {
-		"BTF_KIND_TYPE_TAG support", probe_kern_btf_type_tag,
-	},
-	[FEAT_MEMCG_ACCOUNT] = {
-		"memcg-based memory accounting", probe_memcg_account,
-	},
-	[FEAT_BPF_COOKIE] = {
-		"BPF cookie support", probe_kern_bpf_cookie,
-	},
-	[FEAT_BTF_ENUM64] = {
-		"BTF_KIND_ENUM64 support", probe_kern_btf_enum64,
-	},
-	[FEAT_SYSCALL_WRAPPER] = {
-		"Kernel using syscall wrapper", probe_kern_syscall_wrapper,
-	},
-	[FEAT_UPROBE_MULTI_LINK] = {
-		"BPF multi-uprobe link support", probe_uprobe_multi_link,
-	},
-};
-
-bool feat_supported(struct kern_feature_cache *cache, enum kern_feature_id feat_id)
-{
-	struct kern_feature_desc *feat = &feature_probes[feat_id];
-	int ret;
-
-	/* assume global feature cache, unless custom one is provided */
-	if (!cache)
-		cache = &feature_cache;
-
-	if (READ_ONCE(cache->res[feat_id]) == FEAT_UNKNOWN) {
-		ret = feat->probe(cache->token_fd);
-		if (ret > 0) {
-			WRITE_ONCE(cache->res[feat_id], FEAT_SUPPORTED);
-		} else if (ret == 0) {
-			WRITE_ONCE(cache->res[feat_id], FEAT_MISSING);
-		} else {
-			pr_warn("Detection of kernel %s support failed: %d\n", feat->desc, ret);
-			WRITE_ONCE(cache->res[feat_id], FEAT_MISSING);
-		}
-	}
-
-	return READ_ONCE(cache->res[feat_id]) == FEAT_SUPPORTED;
-}
diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c
index 4b5ff9508e18..ac54ebc0629f 100644
--- a/tools/lib/bpf/libbpf.c
+++ b/tools/lib/bpf/libbpf.c
@@ -59,8 +59,6 @@
 #define BPF_FS_MAGIC		0xcafe4a11
 #endif
 
-#define BPF_FS_DEFAULT_PATH "/sys/fs/bpf"
-
 #define BPF_INSN_SZ (sizeof(struct bpf_insn))
 
 /* vsprintf() in __base_pr() uses nonliteral format string. It may break
@@ -695,10 +693,6 @@ struct bpf_object {
 
 	struct usdt_manager *usdt_man;
 
-	struct kern_feature_cache *feat_cache;
-	char *token_path;
-	int token_fd;
-
 	char path[];
 };
 
@@ -2198,7 +2192,7 @@ static int build_map_pin_path(struct bpf_map *map, const char *path)
 	int err;
 
 	if (!path)
-		path = BPF_FS_DEFAULT_PATH;
+		path = "/sys/fs/bpf";
 
 	err = pathname_concat(buf, sizeof(buf), path, bpf_map__name(map));
 	if (err)
@@ -3285,7 +3279,7 @@ skip_exception_cb:
 	} else {
 		/* currently BPF_BTF_LOAD only supports log_level 1 */
 		err = btf_load_into_kernel(kern_btf, obj->log_buf, obj->log_size,
-					   obj->log_level ? 1 : 0, obj->token_fd);
+					   obj->log_level ? 1 : 0);
 	}
 	if (sanitize) {
 		if (!err) {
@@ -4608,63 +4602,6 @@ int bpf_map__set_max_entries(struct bpf_map *map, __u32 max_entries)
 	return 0;
 }
 
-static int bpf_object_prepare_token(struct bpf_object *obj)
-{
-	const char *bpffs_path;
-	int bpffs_fd = -1, token_fd, err;
-	bool mandatory;
-	enum libbpf_print_level level;
-
-	/* token is already set up */
-	if (obj->token_fd > 0)
-		return 0;
-	/* token is explicitly prevented */
-	if (obj->token_fd < 0) {
-		pr_debug("object '%s': token is prevented, skipping...\n", obj->name);
-		/* reset to zero to avoid extra checks during map_create and prog_load steps */
-		obj->token_fd = 0;
-		return 0;
-	}
-
-	mandatory = obj->token_path != NULL;
-	level = mandatory ? LIBBPF_WARN : LIBBPF_DEBUG;
-
-	bpffs_path = obj->token_path ?: BPF_FS_DEFAULT_PATH;
-	bpffs_fd = open(bpffs_path, O_DIRECTORY, O_RDWR);
-	if (bpffs_fd < 0) {
-		err = -errno;
-		__pr(level, "object '%s': failed (%d) to open BPF FS mount at '%s'%s\n",
-		     obj->name, err, bpffs_path,
-		     mandatory ? "" : ", skipping optional step...");
-		return mandatory ? err : 0;
-	}
-
-	token_fd = bpf_token_create(bpffs_fd, 0);
-	close(bpffs_fd);
-	if (token_fd < 0) {
-		if (!mandatory && token_fd == -ENOENT) {
-			pr_debug("object '%s': BPF FS at '%s' doesn't have BPF token delegation set up, skipping...\n",
-				 obj->name, bpffs_path);
-			return 0;
-		}
-		__pr(level, "object '%s': failed (%d) to create BPF token from '%s'%s\n",
-		     obj->name, token_fd, bpffs_path,
-		     mandatory ? "" : ", skipping optional step...");
-		return mandatory ? token_fd : 0;
-	}
-
-	obj->feat_cache = calloc(1, sizeof(*obj->feat_cache));
-	if (!obj->feat_cache) {
-		close(token_fd);
-		return -ENOMEM;
-	}
-
-	obj->token_fd = token_fd;
-	obj->feat_cache->token_fd = token_fd;
-
-	return 0;
-}
-
 static int
 bpf_object__probe_loading(struct bpf_object *obj)
 {
@@ -4674,7 +4611,6 @@ bpf_object__probe_loading(struct bpf_object *obj)
 		BPF_EXIT_INSN(),
 	};
 	int ret, insn_cnt = ARRAY_SIZE(insns);
-	LIBBPF_OPTS(bpf_prog_load_opts, opts, .token_fd = obj->token_fd);
 
 	if (obj->gen_loader)
 		return 0;
@@ -4684,9 +4620,9 @@ bpf_object__probe_loading(struct bpf_object *obj)
 		pr_warn("Failed to bump RLIMIT_MEMLOCK (err = %d), you might need to do it explicitly!\n", ret);
 
 	/* make sure basic loading works */
-	ret = bpf_prog_load(BPF_PROG_TYPE_SOCKET_FILTER, NULL, "GPL", insns, insn_cnt, &opts);
+	ret = bpf_prog_load(BPF_PROG_TYPE_SOCKET_FILTER, NULL, "GPL", insns, insn_cnt, NULL);
 	if (ret < 0)
-		ret = bpf_prog_load(BPF_PROG_TYPE_TRACEPOINT, NULL, "GPL", insns, insn_cnt, &opts);
+		ret = bpf_prog_load(BPF_PROG_TYPE_TRACEPOINT, NULL, "GPL", insns, insn_cnt, NULL);
 	if (ret < 0) {
 		ret = errno;
 		cp = libbpf_strerror_r(ret, errmsg, sizeof(errmsg));
@@ -4701,18 +4637,462 @@ bpf_object__probe_loading(struct bpf_object *obj)
 	return 0;
 }
 
+static int probe_fd(int fd)
+{
+	if (fd >= 0)
+		close(fd);
+	return fd >= 0;
+}
+
+static int probe_kern_prog_name(void)
+{
+	const size_t attr_sz = offsetofend(union bpf_attr, prog_name);
+	struct bpf_insn insns[] = {
+		BPF_MOV64_IMM(BPF_REG_0, 0),
+		BPF_EXIT_INSN(),
+	};
+	union bpf_attr attr;
+	int ret;
+
+	memset(&attr, 0, attr_sz);
+	attr.prog_type = BPF_PROG_TYPE_SOCKET_FILTER;
+	attr.license = ptr_to_u64("GPL");
+	attr.insns = ptr_to_u64(insns);
+	attr.insn_cnt = (__u32)ARRAY_SIZE(insns);
+	libbpf_strlcpy(attr.prog_name, "libbpf_nametest", sizeof(attr.prog_name));
+
+	/* make sure loading with name works */
+	ret = sys_bpf_prog_load(&attr, attr_sz, PROG_LOAD_ATTEMPTS);
+	return probe_fd(ret);
+}
+
+static int probe_kern_global_data(void)
+{
+	char *cp, errmsg[STRERR_BUFSIZE];
+	struct bpf_insn insns[] = {
+		BPF_LD_MAP_VALUE(BPF_REG_1, 0, 16),
+		BPF_ST_MEM(BPF_DW, BPF_REG_1, 0, 42),
+		BPF_MOV64_IMM(BPF_REG_0, 0),
+		BPF_EXIT_INSN(),
+	};
+	int ret, map, insn_cnt = ARRAY_SIZE(insns);
+
+	map = bpf_map_create(BPF_MAP_TYPE_ARRAY, "libbpf_global", sizeof(int), 32, 1, NULL);
+	if (map < 0) {
+		ret = -errno;
+		cp = libbpf_strerror_r(ret, errmsg, sizeof(errmsg));
+		pr_warn("Error in %s():%s(%d). Couldn't create simple array map.\n",
+			__func__, cp, -ret);
+		return ret;
+	}
+
+	insns[0].imm = map;
+
+	ret = bpf_prog_load(BPF_PROG_TYPE_SOCKET_FILTER, NULL, "GPL", insns, insn_cnt, NULL);
+	close(map);
+	return probe_fd(ret);
+}
+
+static int probe_kern_btf(void)
+{
+	static const char strs[] = "\0int";
+	__u32 types[] = {
+		/* int */
+		BTF_TYPE_INT_ENC(1, BTF_INT_SIGNED, 0, 32, 4),
+	};
+
+	return probe_fd(libbpf__load_raw_btf((char *)types, sizeof(types),
+					     strs, sizeof(strs)));
+}
+
+static int probe_kern_btf_func(void)
+{
+	static const char strs[] = "\0int\0x\0a";
+	/* void x(int a) {} */
+	__u32 types[] = {
+		/* int */
+		BTF_TYPE_INT_ENC(1, BTF_INT_SIGNED, 0, 32, 4),  /* [1] */
+		/* FUNC_PROTO */                                /* [2] */
+		BTF_TYPE_ENC(0, BTF_INFO_ENC(BTF_KIND_FUNC_PROTO, 0, 1), 0),
+		BTF_PARAM_ENC(7, 1),
+		/* FUNC x */                                    /* [3] */
+		BTF_TYPE_ENC(5, BTF_INFO_ENC(BTF_KIND_FUNC, 0, 0), 2),
+	};
+
+	return probe_fd(libbpf__load_raw_btf((char *)types, sizeof(types),
+					     strs, sizeof(strs)));
+}
+
+static int probe_kern_btf_func_global(void)
+{
+	static const char strs[] = "\0int\0x\0a";
+	/* static void x(int a) {} */
+	__u32 types[] = {
+		/* int */
+		BTF_TYPE_INT_ENC(1, BTF_INT_SIGNED, 0, 32, 4),  /* [1] */
+		/* FUNC_PROTO */                                /* [2] */
+		BTF_TYPE_ENC(0, BTF_INFO_ENC(BTF_KIND_FUNC_PROTO, 0, 1), 0),
+		BTF_PARAM_ENC(7, 1),
+		/* FUNC x BTF_FUNC_GLOBAL */                    /* [3] */
+		BTF_TYPE_ENC(5, BTF_INFO_ENC(BTF_KIND_FUNC, 0, BTF_FUNC_GLOBAL), 2),
+	};
+
+	return probe_fd(libbpf__load_raw_btf((char *)types, sizeof(types),
+					     strs, sizeof(strs)));
+}
+
+static int probe_kern_btf_datasec(void)
+{
+	static const char strs[] = "\0x\0.data";
+	/* static int a; */
+	__u32 types[] = {
+		/* int */
+		BTF_TYPE_INT_ENC(0, BTF_INT_SIGNED, 0, 32, 4),  /* [1] */
+		/* VAR x */                                     /* [2] */
+		BTF_TYPE_ENC(1, BTF_INFO_ENC(BTF_KIND_VAR, 0, 0), 1),
+		BTF_VAR_STATIC,
+		/* DATASEC val */                               /* [3] */
+		BTF_TYPE_ENC(3, BTF_INFO_ENC(BTF_KIND_DATASEC, 0, 1), 4),
+		BTF_VAR_SECINFO_ENC(2, 0, 4),
+	};
+
+	return probe_fd(libbpf__load_raw_btf((char *)types, sizeof(types),
+					     strs, sizeof(strs)));
+}
+
+static int probe_kern_btf_float(void)
+{
+	static const char strs[] = "\0float";
+	__u32 types[] = {
+		/* float */
+		BTF_TYPE_FLOAT_ENC(1, 4),
+	};
+
+	return probe_fd(libbpf__load_raw_btf((char *)types, sizeof(types),
+					     strs, sizeof(strs)));
+}
+
+static int probe_kern_btf_decl_tag(void)
+{
+	static const char strs[] = "\0tag";
+	__u32 types[] = {
+		/* int */
+		BTF_TYPE_INT_ENC(0, BTF_INT_SIGNED, 0, 32, 4),  /* [1] */
+		/* VAR x */                                     /* [2] */
+		BTF_TYPE_ENC(1, BTF_INFO_ENC(BTF_KIND_VAR, 0, 0), 1),
+		BTF_VAR_STATIC,
+		/* attr */
+		BTF_TYPE_DECL_TAG_ENC(1, 2, -1),
+	};
+
+	return probe_fd(libbpf__load_raw_btf((char *)types, sizeof(types),
+					     strs, sizeof(strs)));
+}
+
+static int probe_kern_btf_type_tag(void)
+{
+	static const char strs[] = "\0tag";
+	__u32 types[] = {
+		/* int */
+		BTF_TYPE_INT_ENC(0, BTF_INT_SIGNED, 0, 32, 4),		/* [1] */
+		/* attr */
+		BTF_TYPE_TYPE_TAG_ENC(1, 1),				/* [2] */
+		/* ptr */
+		BTF_TYPE_ENC(0, BTF_INFO_ENC(BTF_KIND_PTR, 0, 0), 2),	/* [3] */
+	};
+
+	return probe_fd(libbpf__load_raw_btf((char *)types, sizeof(types),
+					     strs, sizeof(strs)));
+}
+
+static int probe_kern_array_mmap(void)
+{
+	LIBBPF_OPTS(bpf_map_create_opts, opts, .map_flags = BPF_F_MMAPABLE);
+	int fd;
+
+	fd = bpf_map_create(BPF_MAP_TYPE_ARRAY, "libbpf_mmap", sizeof(int), sizeof(int), 1, &opts);
+	return probe_fd(fd);
+}
+
+static int probe_kern_exp_attach_type(void)
+{
+	LIBBPF_OPTS(bpf_prog_load_opts, opts, .expected_attach_type = BPF_CGROUP_INET_SOCK_CREATE);
+	struct bpf_insn insns[] = {
+		BPF_MOV64_IMM(BPF_REG_0, 0),
+		BPF_EXIT_INSN(),
+	};
+	int fd, insn_cnt = ARRAY_SIZE(insns);
+
+	/* use any valid combination of program type and (optional)
+	 * non-zero expected attach type (i.e., not a BPF_CGROUP_INET_INGRESS)
+	 * to see if kernel supports expected_attach_type field for
+	 * BPF_PROG_LOAD command
+	 */
+	fd = bpf_prog_load(BPF_PROG_TYPE_CGROUP_SOCK, NULL, "GPL", insns, insn_cnt, &opts);
+	return probe_fd(fd);
+}
+
+static int probe_kern_probe_read_kernel(void)
+{
+	struct bpf_insn insns[] = {
+		BPF_MOV64_REG(BPF_REG_1, BPF_REG_10),	/* r1 = r10 (fp) */
+		BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, -8),	/* r1 += -8 */
+		BPF_MOV64_IMM(BPF_REG_2, 8),		/* r2 = 8 */
+		BPF_MOV64_IMM(BPF_REG_3, 0),		/* r3 = 0 */
+		BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_probe_read_kernel),
+		BPF_EXIT_INSN(),
+	};
+	int fd, insn_cnt = ARRAY_SIZE(insns);
+
+	fd = bpf_prog_load(BPF_PROG_TYPE_TRACEPOINT, NULL, "GPL", insns, insn_cnt, NULL);
+	return probe_fd(fd);
+}
+
+static int probe_prog_bind_map(void)
+{
+	char *cp, errmsg[STRERR_BUFSIZE];
+	struct bpf_insn insns[] = {
+		BPF_MOV64_IMM(BPF_REG_0, 0),
+		BPF_EXIT_INSN(),
+	};
+	int ret, map, prog, insn_cnt = ARRAY_SIZE(insns);
+
+	map = bpf_map_create(BPF_MAP_TYPE_ARRAY, "libbpf_det_bind", sizeof(int), 32, 1, NULL);
+	if (map < 0) {
+		ret = -errno;
+		cp = libbpf_strerror_r(ret, errmsg, sizeof(errmsg));
+		pr_warn("Error in %s():%s(%d). Couldn't create simple array map.\n",
+			__func__, cp, -ret);
+		return ret;
+	}
+
+	prog = bpf_prog_load(BPF_PROG_TYPE_SOCKET_FILTER, NULL, "GPL", insns, insn_cnt, NULL);
+	if (prog < 0) {
+		close(map);
+		return 0;
+	}
+
+	ret = bpf_prog_bind_map(prog, map, NULL);
+
+	close(map);
+	close(prog);
+
+	return ret >= 0;
+}
+
+static int probe_module_btf(void)
+{
+	static const char strs[] = "\0int";
+	__u32 types[] = {
+		/* int */
+		BTF_TYPE_INT_ENC(1, BTF_INT_SIGNED, 0, 32, 4),
+	};
+	struct bpf_btf_info info;
+	__u32 len = sizeof(info);
+	char name[16];
+	int fd, err;
+
+	fd = libbpf__load_raw_btf((char *)types, sizeof(types), strs, sizeof(strs));
+	if (fd < 0)
+		return 0; /* BTF not supported at all */
+
+	memset(&info, 0, sizeof(info));
+	info.name = ptr_to_u64(name);
+	info.name_len = sizeof(name);
+
+	/* check that BPF_OBJ_GET_INFO_BY_FD supports specifying name pointer;
+	 * kernel's module BTF support coincides with support for
+	 * name/name_len fields in struct bpf_btf_info.
+	 */
+	err = bpf_btf_get_info_by_fd(fd, &info, &len);
+	close(fd);
+	return !err;
+}
+
+static int probe_perf_link(void)
+{
+	struct bpf_insn insns[] = {
+		BPF_MOV64_IMM(BPF_REG_0, 0),
+		BPF_EXIT_INSN(),
+	};
+	int prog_fd, link_fd, err;
+
+	prog_fd = bpf_prog_load(BPF_PROG_TYPE_TRACEPOINT, NULL, "GPL",
+				insns, ARRAY_SIZE(insns), NULL);
+	if (prog_fd < 0)
+		return -errno;
+
+	/* use invalid perf_event FD to get EBADF, if link is supported;
+	 * otherwise EINVAL should be returned
+	 */
+	link_fd = bpf_link_create(prog_fd, -1, BPF_PERF_EVENT, NULL);
+	err = -errno; /* close() can clobber errno */
+
+	if (link_fd >= 0)
+		close(link_fd);
+	close(prog_fd);
+
+	return link_fd < 0 && err == -EBADF;
+}
+
+static int probe_uprobe_multi_link(void)
+{
+	LIBBPF_OPTS(bpf_prog_load_opts, load_opts,
+		.expected_attach_type = BPF_TRACE_UPROBE_MULTI,
+	);
+	LIBBPF_OPTS(bpf_link_create_opts, link_opts);
+	struct bpf_insn insns[] = {
+		BPF_MOV64_IMM(BPF_REG_0, 0),
+		BPF_EXIT_INSN(),
+	};
+	int prog_fd, link_fd, err;
+	unsigned long offset = 0;
+
+	prog_fd = bpf_prog_load(BPF_PROG_TYPE_KPROBE, NULL, "GPL",
+				insns, ARRAY_SIZE(insns), &load_opts);
+	if (prog_fd < 0)
+		return -errno;
+
+	/* Creating uprobe in '/' binary should fail with -EBADF. */
+	link_opts.uprobe_multi.path = "/";
+	link_opts.uprobe_multi.offsets = &offset;
+	link_opts.uprobe_multi.cnt = 1;
+
+	link_fd = bpf_link_create(prog_fd, -1, BPF_TRACE_UPROBE_MULTI, &link_opts);
+	err = -errno; /* close() can clobber errno */
+
+	if (link_fd >= 0)
+		close(link_fd);
+	close(prog_fd);
+
+	return link_fd < 0 && err == -EBADF;
+}
+
+static int probe_kern_bpf_cookie(void)
+{
+	struct bpf_insn insns[] = {
+		BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_get_attach_cookie),
+		BPF_EXIT_INSN(),
+	};
+	int ret, insn_cnt = ARRAY_SIZE(insns);
+
+	ret = bpf_prog_load(BPF_PROG_TYPE_KPROBE, NULL, "GPL", insns, insn_cnt, NULL);
+	return probe_fd(ret);
+}
+
+static int probe_kern_btf_enum64(void)
+{
+	static const char strs[] = "\0enum64";
+	__u32 types[] = {
+		BTF_TYPE_ENC(1, BTF_INFO_ENC(BTF_KIND_ENUM64, 0, 0), 8),
+	};
+
+	return probe_fd(libbpf__load_raw_btf((char *)types, sizeof(types),
+					     strs, sizeof(strs)));
+}
+
+static int probe_kern_syscall_wrapper(void);
+
+enum kern_feature_result {
+	FEAT_UNKNOWN = 0,
+	FEAT_SUPPORTED = 1,
+	FEAT_MISSING = 2,
+};
+
+typedef int (*feature_probe_fn)(void);
+
+static struct kern_feature_desc {
+	const char *desc;
+	feature_probe_fn probe;
+	enum kern_feature_result res;
+} feature_probes[__FEAT_CNT] = {
+	[FEAT_PROG_NAME] = {
+		"BPF program name", probe_kern_prog_name,
+	},
+	[FEAT_GLOBAL_DATA] = {
+		"global variables", probe_kern_global_data,
+	},
+	[FEAT_BTF] = {
+		"minimal BTF", probe_kern_btf,
+	},
+	[FEAT_BTF_FUNC] = {
+		"BTF functions", probe_kern_btf_func,
+	},
+	[FEAT_BTF_GLOBAL_FUNC] = {
+		"BTF global function", probe_kern_btf_func_global,
+	},
+	[FEAT_BTF_DATASEC] = {
+		"BTF data section and variable", probe_kern_btf_datasec,
+	},
+	[FEAT_ARRAY_MMAP] = {
+		"ARRAY map mmap()", probe_kern_array_mmap,
+	},
+	[FEAT_EXP_ATTACH_TYPE] = {
+		"BPF_PROG_LOAD expected_attach_type attribute",
+		probe_kern_exp_attach_type,
+	},
+	[FEAT_PROBE_READ_KERN] = {
+		"bpf_probe_read_kernel() helper", probe_kern_probe_read_kernel,
+	},
+	[FEAT_PROG_BIND_MAP] = {
+		"BPF_PROG_BIND_MAP support", probe_prog_bind_map,
+	},
+	[FEAT_MODULE_BTF] = {
+		"module BTF support", probe_module_btf,
+	},
+	[FEAT_BTF_FLOAT] = {
+		"BTF_KIND_FLOAT support", probe_kern_btf_float,
+	},
+	[FEAT_PERF_LINK] = {
+		"BPF perf link support", probe_perf_link,
+	},
+	[FEAT_BTF_DECL_TAG] = {
+		"BTF_KIND_DECL_TAG support", probe_kern_btf_decl_tag,
+	},
+	[FEAT_BTF_TYPE_TAG] = {
+		"BTF_KIND_TYPE_TAG support", probe_kern_btf_type_tag,
+	},
+	[FEAT_MEMCG_ACCOUNT] = {
+		"memcg-based memory accounting", probe_memcg_account,
+	},
+	[FEAT_BPF_COOKIE] = {
+		"BPF cookie support", probe_kern_bpf_cookie,
+	},
+	[FEAT_BTF_ENUM64] = {
+		"BTF_KIND_ENUM64 support", probe_kern_btf_enum64,
+	},
+	[FEAT_SYSCALL_WRAPPER] = {
+		"Kernel using syscall wrapper", probe_kern_syscall_wrapper,
+	},
+	[FEAT_UPROBE_MULTI_LINK] = {
+		"BPF multi-uprobe link support", probe_uprobe_multi_link,
+	},
+};
+
 bool kernel_supports(const struct bpf_object *obj, enum kern_feature_id feat_id)
 {
+	struct kern_feature_desc *feat = &feature_probes[feat_id];
+	int ret;
+
 	if (obj && obj->gen_loader)
 		/* To generate loader program assume the latest kernel
 		 * to avoid doing extra prog_load, map_create syscalls.
 		 */
 		return true;
 
-	if (obj->token_fd)
-		return feat_supported(obj->feat_cache, feat_id);
+	if (READ_ONCE(feat->res) == FEAT_UNKNOWN) {
+		ret = feat->probe();
+		if (ret > 0) {
+			WRITE_ONCE(feat->res, FEAT_SUPPORTED);
+		} else if (ret == 0) {
+			WRITE_ONCE(feat->res, FEAT_MISSING);
+		} else {
+			pr_warn("Detection of kernel %s support failed: %d\n", feat->desc, ret);
+			WRITE_ONCE(feat->res, FEAT_MISSING);
+		}
+	}
 
-	return feat_supported(NULL, feat_id);
+	return READ_ONCE(feat->res) == FEAT_SUPPORTED;
 }
 
 static bool map_is_reuse_compat(const struct bpf_map *map, int map_fd)
@@ -4831,7 +5211,6 @@ static int bpf_object__create_map(struct bpf_object *obj, struct bpf_map *map, b
 	create_attr.map_flags = def->map_flags;
 	create_attr.numa_node = map->numa_node;
 	create_attr.map_extra = map->map_extra;
-	create_attr.token_fd = obj->token_fd;
 
 	if (bpf_map__is_struct_ops(map))
 		create_attr.btf_vmlinux_value_type_id = map->btf_vmlinux_value_type_id;
@@ -6667,7 +7046,6 @@ static int bpf_object_load_prog(struct bpf_object *obj, struct bpf_program *prog
 	load_attr.attach_btf_id = prog->attach_btf_id;
 	load_attr.kern_version = kern_version;
 	load_attr.prog_ifindex = prog->prog_ifindex;
-	load_attr.token_fd = obj->token_fd;
 
 	/* specify func_info/line_info only if kernel supports them */
 	btf_fd = bpf_object__btf_fd(obj);
@@ -7129,10 +7507,10 @@ static int bpf_object_init_progs(struct bpf_object *obj, const struct bpf_object
 static struct bpf_object *bpf_object_open(const char *path, const void *obj_buf, size_t obj_buf_sz,
 					  const struct bpf_object_open_opts *opts)
 {
-	const char *obj_name, *kconfig, *btf_tmp_path, *token_path;
+	const char *obj_name, *kconfig, *btf_tmp_path;
 	struct bpf_object *obj;
 	char tmp_name[64];
-	int err, token_fd;
+	int err;
 	char *log_buf;
 	size_t log_size;
 	__u32 log_level;
@@ -7166,28 +7544,6 @@ static struct bpf_object *bpf_object_open(const char *path, const void *obj_buf,
 	if (log_size && !log_buf)
 		return ERR_PTR(-EINVAL);
 
-	token_path = OPTS_GET(opts, bpf_token_path, NULL);
-	token_fd = OPTS_GET(opts, bpf_token_fd, -1);
-	/* non-empty token path can't be combined with invalid token FD */
-	if (token_path && token_path[0] != '\0' && token_fd < 0)
-		return ERR_PTR(-EINVAL);
-	/* empty token path can't be combined with valid token FD */
-	if (token_path && token_path[0] == '\0' && token_fd > 0)
-		return ERR_PTR(-EINVAL);
-	/* if user didn't specify bpf_token_path/bpf_token_fd explicitly,
-	 * check if LIBBPF_BPF_TOKEN_PATH envvar was set and treat it as
-	 * bpf_token_path option
-	 */
-	if (token_fd == 0 && !token_path)
-		token_path = getenv("LIBBPF_BPF_TOKEN_PATH");
-	/* empty token_path is equivalent to invalid token_fd */
-	if (token_path && token_path[0] == '\0') {
-		token_path = NULL;
-		token_fd = -1;
-	}
-	if (token_path && strlen(token_path) >= PATH_MAX)
-		return ERR_PTR(-ENAMETOOLONG);
-
 	obj = bpf_object__new(path, obj_buf, obj_buf_sz, obj_name);
 	if (IS_ERR(obj))
 		return obj;
@@ -7196,19 +7552,6 @@ static struct bpf_object *bpf_object_open(const char *path, const void *obj_buf,
 	obj->log_size = log_size;
 	obj->log_level = log_level;
 
-	obj->token_fd = token_fd <= 0 ? token_fd : dup_good_fd(token_fd);
-	if (token_fd > 0 && obj->token_fd < 0) {
-		err = -errno;
-		goto out;
-	}
-	if (token_path) {
-		obj->token_path = strdup(token_path);
-		if (!obj->token_path) {
-			err = -ENOMEM;
-			goto out;
-		}
-	}
-
 	btf_tmp_path = OPTS_GET(opts, btf_custom_path, NULL);
 	if (btf_tmp_path) {
 		if (strlen(btf_tmp_path) >= PATH_MAX) {
@@ -7719,8 +8062,7 @@ static int bpf_object_load(struct bpf_object *obj, int extra_log_level, const ch
 	if (obj->gen_loader)
 		bpf_gen__init(obj->gen_loader, extra_log_level, obj->nr_programs, obj->nr_maps);
 
-	err = bpf_object_prepare_token(obj);
-	err = err ? : bpf_object__probe_loading(obj);
+	err = bpf_object__probe_loading(obj);
 	err = err ? : bpf_object__load_vmlinux_btf(obj, false);
 	err = err ? : bpf_object__resolve_externs(obj, obj->kconfig);
 	err = err ? : bpf_object__sanitize_and_load_btf(obj);
@@ -8257,11 +8599,6 @@ void bpf_object__close(struct bpf_object *obj)
 	}
 	zfree(&obj->programs);
 
-	zfree(&obj->feat_cache);
-	zfree(&obj->token_path);
-	if (obj->token_fd > 0)
-		close(obj->token_fd);
-
 	free(obj);
 }
 
@@ -10275,7 +10612,7 @@ static const char *arch_specific_syscall_pfx(void)
 #endif
 }
 
-int probe_kern_syscall_wrapper(int token_fd)
+static int probe_kern_syscall_wrapper(void)
 {
 	char syscall_name[64];
 	const char *ksys_pfx;
diff --git a/tools/lib/bpf/libbpf.h b/tools/lib/bpf/libbpf.h
index 916904bd2a7a..6cd9c501624f 100644
--- a/tools/lib/bpf/libbpf.h
+++ b/tools/lib/bpf/libbpf.h
@@ -177,45 +177,10 @@ struct bpf_object_open_opts {
 	 * logs through its print callback.
 	 */
 	__u32 kernel_log_level;
-	/* FD of a BPF token instantiated by user through bpf_token_create()
-	 * API. BPF object will keep dup()'ed FD internally, so passed token
-	 * FD can be closed after BPF object/skeleton open step.
-	 *
-	 * Setting bpf_token_fd to negative value disables libbpf's automatic
-	 * attempt to create BPF token from default BPF FS mount point
-	 * (/sys/fs/bpf), in case this default behavior is undesirable.
-	 *
-	 * If bpf_token_path and bpf_token_fd are not specified, libbpf will
-	 * consult LIBBPF_BPF_TOKEN_PATH environment variable. If set, it will
-	 * be taken as a value of bpf_token_path option and will force libbpf
-	 * to either create BPF token from provided custom BPF FS path, or
-	 * will disable implicit BPF token creation, if envvar value is an
-	 * empty string.
-	 *
-	 * bpf_token_path and bpf_token_fd are mutually exclusive and only one
-	 * of those options should be set. Either of them overrides
-	 * LIBBPF_BPF_TOKEN_PATH envvar.
-	 */
-	int bpf_token_fd;
-	/* Path to BPF FS mount point to derive BPF token from.
-	 *
-	 * Created BPF token will be used for all bpf() syscall operations
-	 * that accept BPF token (e.g., map creation, BTF and program loads,
-	 * etc) automatically within instantiated BPF object.
-	 *
-	 * Setting bpf_token_path option to empty string disables libbpf's
-	 * automatic attempt to create BPF token from default BPF FS mount
-	 * point (/sys/fs/bpf), in case this default behavior is undesirable.
-	 *
-	 * bpf_token_path and bpf_token_fd are mutually exclusive and only one
-	 * of those options should be set. Either of them overrides
-	 * LIBBPF_BPF_TOKEN_PATH envvar.
-	 */
-	const char *bpf_token_path;
 
 	size_t :0;
 };
-#define bpf_object_open_opts__last_field bpf_token_path
+#define bpf_object_open_opts__last_field kernel_log_level
 
 /**
  * @brief **bpf_object__open()** creates a bpf_object by opening
diff --git a/tools/lib/bpf/libbpf.map b/tools/lib/bpf/libbpf.map
index df7657b65c47..91c5aef7dae7 100644
--- a/tools/lib/bpf/libbpf.map
+++ b/tools/lib/bpf/libbpf.map
@@ -401,7 +401,6 @@ LIBBPF_1.3.0 {
 		bpf_program__attach_netkit;
 		bpf_program__attach_tcx;
 		bpf_program__attach_uprobe_multi;
-		bpf_token_create;
 		ring__avail_data_size;
 		ring__consume;
 		ring__consumer_pos;
diff --git a/tools/lib/bpf/libbpf_internal.h b/tools/lib/bpf/libbpf_internal.h
index 4cda32298c49..b5d334754e5d 100644
--- a/tools/lib/bpf/libbpf_internal.h
+++ b/tools/lib/bpf/libbpf_internal.h
@@ -360,32 +360,15 @@ enum kern_feature_id {
 	__FEAT_CNT,
 };
 
-enum kern_feature_result {
-	FEAT_UNKNOWN = 0,
-	FEAT_SUPPORTED = 1,
-	FEAT_MISSING = 2,
-};
-
-struct kern_feature_cache {
-	enum kern_feature_result res[__FEAT_CNT];
-	int token_fd;
-};
-
-bool feat_supported(struct kern_feature_cache *cache, enum kern_feature_id feat_id);
+int probe_memcg_account(void);
 bool kernel_supports(const struct bpf_object *obj, enum kern_feature_id feat_id);
-
-int probe_kern_syscall_wrapper(int token_fd);
-int probe_memcg_account(int token_fd);
 int bump_rlimit_memlock(void);
 
 int parse_cpu_mask_str(const char *s, bool **mask, int *mask_sz);
 int parse_cpu_mask_file(const char *fcpu, bool **mask, int *mask_sz);
 int libbpf__load_raw_btf(const char *raw_types, size_t types_len,
-			 const char *str_sec, size_t str_len,
-			 int token_fd);
-int btf_load_into_kernel(struct btf *btf,
-			 char *log_buf, size_t log_sz, __u32 log_level,
-			 int token_fd);
+			 const char *str_sec, size_t str_len);
+int btf_load_into_kernel(struct btf *btf, char *log_buf, size_t log_sz, __u32 log_level);
 
 struct btf *btf_get_from_fd(int btf_fd, struct btf *base_btf);
 void btf_get_kernel_prefix_kind(enum bpf_attach_type attach_type,
@@ -549,17 +532,6 @@ static inline bool is_ldimm64_insn(struct bpf_insn *insn)
 	return insn->code == (BPF_LD | BPF_IMM | BPF_DW);
 }
 
-/* Unconditionally dup FD, ensuring it doesn't use [0, 2] range.
- * Original FD is not closed or altered in any other way.
- * Preserves original FD value, if it's invalid (negative).
- */
-static inline int dup_good_fd(int fd)
-{
-	if (fd < 0)
-		return fd;
-	return fcntl(fd, F_DUPFD_CLOEXEC, 3);
-}
-
 /* if fd is stdin, stdout, or stderr, dup to a fd greater than 2
  * Takes ownership of the fd passed in, and closes it if calling
  * fcntl(fd, F_DUPFD_CLOEXEC, 3).
@@ -571,7 +543,7 @@ static inline int ensure_good_fd(int fd)
 	if (fd < 0)
 		return fd;
 	if (fd < 3) {
-		fd = dup_good_fd(fd);
+		fd = fcntl(fd, F_DUPFD_CLOEXEC, 3);
 		saved_errno = errno;
 		close(old_fd);
 		errno = saved_errno;
diff --git a/tools/lib/bpf/libbpf_probes.c b/tools/lib/bpf/libbpf_probes.c
index 8e7437006639..9c4db90b92b6 100644
--- a/tools/lib/bpf/libbpf_probes.c
+++ b/tools/lib/bpf/libbpf_probes.c
@@ -219,8 +219,7 @@ int libbpf_probe_bpf_prog_type(enum bpf_prog_type prog_type, const void *opts)
 }
 
 int libbpf__load_raw_btf(const char *raw_types, size_t types_len,
-			 const char *str_sec, size_t str_len,
-			 int token_fd)
+			 const char *str_sec, size_t str_len)
 {
 	struct btf_header hdr = {
 		.magic = BTF_MAGIC,
@@ -230,7 +229,6 @@ int libbpf__load_raw_btf(const char *raw_types, size_t types_len,
 		.str_off = types_len,
 		.str_len = str_len,
 	};
-	LIBBPF_OPTS(bpf_btf_load_opts, opts, .token_fd = token_fd);
 	int btf_fd, btf_len;
 	__u8 *raw_btf;
 
@@ -243,7 +241,7 @@ int libbpf__load_raw_btf(const char *raw_types, size_t types_len,
 	memcpy(raw_btf + hdr.hdr_len, raw_types, hdr.type_len);
 	memcpy(raw_btf + hdr.hdr_len + hdr.type_len, str_sec, hdr.str_len);
 
-	btf_fd = bpf_btf_load(raw_btf, btf_len, &opts);
+	btf_fd = bpf_btf_load(raw_btf, btf_len, NULL);
 
 	free(raw_btf);
 	return btf_fd;
@@ -273,7 +271,7 @@ static int load_local_storage_btf(void)
 	};
 
 	return libbpf__load_raw_btf((char *)types, sizeof(types),
-				     strs, sizeof(strs), 0);
+				     strs, sizeof(strs));
 }
 
 static int probe_map_create(enum bpf_map_type map_type)
diff --git a/tools/lib/bpf/str_error.h b/tools/lib/bpf/str_error.h
index 626d7ffb03d6..a139334d57b6 100644
--- a/tools/lib/bpf/str_error.h
+++ b/tools/lib/bpf/str_error.h
@@ -2,8 +2,5 @@
 #ifndef __LIBBPF_STR_ERROR_H
 #define __LIBBPF_STR_ERROR_H
 
-#define STRERR_BUFSIZE  128
-
 char *libbpf_strerror_r(int err, char *dst, int len);
-
 #endif /* __LIBBPF_STR_ERROR_H */
diff --git a/tools/testing/selftests/bpf/prog_tests/libbpf_probes.c b/tools/testing/selftests/bpf/prog_tests/libbpf_probes.c
index 4ed46ed58a7b..9f766ddd946a 100644
--- a/tools/testing/selftests/bpf/prog_tests/libbpf_probes.c
+++ b/tools/testing/selftests/bpf/prog_tests/libbpf_probes.c
@@ -30,8 +30,6 @@ void test_libbpf_probe_prog_types(void)
 
 		if (prog_type == BPF_PROG_TYPE_UNSPEC)
 			continue;
-		if (strcmp(prog_type_name, "__MAX_BPF_PROG_TYPE") == 0)
-			continue;
 
 		if (!test__start_subtest(prog_type_name))
 			continue;
@@ -70,8 +68,6 @@ void test_libbpf_probe_map_types(void)
 
 		if (map_type == BPF_MAP_TYPE_UNSPEC)
 			continue;
-		if (strcmp(map_type_name, "__MAX_BPF_MAP_TYPE") == 0)
-			continue;
 
 		if (!test__start_subtest(map_type_name))
 			continue;
diff --git a/tools/testing/selftests/bpf/prog_tests/libbpf_str.c b/tools/testing/selftests/bpf/prog_tests/libbpf_str.c
index 62ea855ec4d0..eb34d612d6f8 100644
--- a/tools/testing/selftests/bpf/prog_tests/libbpf_str.c
+++ b/tools/testing/selftests/bpf/prog_tests/libbpf_str.c
@@ -132,9 +132,6 @@ static void test_libbpf_bpf_map_type_str(void)
 		const char *map_type_str;
 		char buf[256];
 
-		if (map_type == __MAX_BPF_MAP_TYPE)
-			continue;
-
 		map_type_name = btf__str_by_offset(btf, e->name_off);
 		map_type_str = libbpf_bpf_map_type_str(map_type);
 		ASSERT_OK_PTR(map_type_str, map_type_name);
@@ -189,9 +186,6 @@ static void test_libbpf_bpf_prog_type_str(void)
 		const char *prog_type_str;
 		char buf[256];
 
-		if (prog_type == __MAX_BPF_PROG_TYPE)
-			continue;
-
 		prog_type_name = btf__str_by_offset(btf, e->name_off);
 		prog_type_str = libbpf_bpf_prog_type_str(prog_type);
 		ASSERT_OK_PTR(prog_type_str, prog_type_name);
diff --git a/tools/testing/selftests/bpf/prog_tests/token.c b/tools/testing/selftests/bpf/prog_tests/token.c
deleted file mode 100644
index b5dce630e0e1..000000000000
--- a/tools/testing/selftests/bpf/prog_tests/token.c
+++ /dev/null
@@ -1,1031 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/* Copyright (c) 2023 Meta Platforms, Inc. and affiliates. */
-#define _GNU_SOURCE
-#include <test_progs.h>
-#include <bpf/btf.h>
-#include "cap_helpers.h"
-#include <fcntl.h>
-#include <sched.h>
-#include <signal.h>
-#include <unistd.h>
-#include <linux/filter.h>
-#include <linux/unistd.h>
-#include <linux/mount.h>
-#include <sys/socket.h>
-#include <sys/stat.h>
-#include <sys/syscall.h>
-#include <sys/un.h>
-#include "priv_map.skel.h"
-#include "priv_prog.skel.h"
-#include "dummy_st_ops_success.skel.h"
-
-static inline int sys_mount(const char *dev_name, const char *dir_name,
-			    const char *type, unsigned long flags,
-			    const void *data)
-{
-	return syscall(__NR_mount, dev_name, dir_name, type, flags, data);
-}
-
-static inline int sys_fsopen(const char *fsname, unsigned flags)
-{
-	return syscall(__NR_fsopen, fsname, flags);
-}
-
-static inline int sys_fspick(int dfd, const char *path, unsigned flags)
-{
-	return syscall(__NR_fspick, dfd, path, flags);
-}
-
-static inline int sys_fsconfig(int fs_fd, unsigned cmd, const char *key, const void *val, int aux)
-{
-	return syscall(__NR_fsconfig, fs_fd, cmd, key, val, aux);
-}
-
-static inline int sys_fsmount(int fs_fd, unsigned flags, unsigned ms_flags)
-{
-	return syscall(__NR_fsmount, fs_fd, flags, ms_flags);
-}
-
-static inline int sys_move_mount(int from_dfd, const char *from_path,
-				 int to_dfd, const char *to_path,
-				 unsigned flags)
-{
-	return syscall(__NR_move_mount, from_dfd, from_path, to_dfd, to_path, flags);
-}
-
-static int drop_priv_caps(__u64 *old_caps)
-{
-	return cap_disable_effective((1ULL << CAP_BPF) |
-				     (1ULL << CAP_PERFMON) |
-				     (1ULL << CAP_NET_ADMIN) |
-				     (1ULL << CAP_SYS_ADMIN), old_caps);
-}
-
-static int restore_priv_caps(__u64 old_caps)
-{
-	return cap_enable_effective(old_caps, NULL);
-}
-
-static int set_delegate_mask(int fs_fd, const char *key, __u64 mask, const char *mask_str)
-{
-	char buf[32];
-	int err;
-
-	if (!mask_str) {
-		if (mask == ~0ULL) {
-			mask_str = "any";
-		} else {
-			snprintf(buf, sizeof(buf), "0x%llx", (unsigned long long)mask);
-			mask_str = buf;
-		}
-	}
-
-	err = sys_fsconfig(fs_fd, FSCONFIG_SET_STRING, key,
-			   mask_str, 0);
-	if (err < 0)
-		err = -errno;
-	return err;
-}
-
-#define zclose(fd) do { if (fd >= 0) close(fd); fd = -1; } while (0)
-
-struct bpffs_opts {
-	__u64 cmds;
-	__u64 maps;
-	__u64 progs;
-	__u64 attachs;
-	const char *cmds_str;
-	const char *maps_str;
-	const char *progs_str;
-	const char *attachs_str;
-};
-
-static int create_bpffs_fd(void)
-{
-	int fs_fd;
-
-	/* create VFS context */
-	fs_fd = sys_fsopen("bpf", 0);
-	ASSERT_GE(fs_fd, 0, "fs_fd");
-
-	return fs_fd;
-}
-
-static int materialize_bpffs_fd(int fs_fd, struct bpffs_opts *opts)
-{
-	int mnt_fd, err;
-
-	/* set up token delegation mount options */
-	err = set_delegate_mask(fs_fd, "delegate_cmds", opts->cmds, opts->cmds_str);
-	if (!ASSERT_OK(err, "fs_cfg_cmds"))
-		return err;
-	err = set_delegate_mask(fs_fd, "delegate_maps", opts->maps, opts->maps_str);
-	if (!ASSERT_OK(err, "fs_cfg_maps"))
-		return err;
-	err = set_delegate_mask(fs_fd, "delegate_progs", opts->progs, opts->progs_str);
-	if (!ASSERT_OK(err, "fs_cfg_progs"))
-		return err;
-	err = set_delegate_mask(fs_fd, "delegate_attachs", opts->attachs, opts->attachs_str);
-	if (!ASSERT_OK(err, "fs_cfg_attachs"))
-		return err;
-
-	/* instantiate FS object */
-	err = sys_fsconfig(fs_fd, FSCONFIG_CMD_CREATE, NULL, NULL, 0);
-	if (err < 0)
-		return -errno;
-
-	/* create O_PATH fd for detached mount */
-	mnt_fd = sys_fsmount(fs_fd, 0, 0);
-	if (err < 0)
-		return -errno;
-
-	return mnt_fd;
-}
-
-/* send FD over Unix domain (AF_UNIX) socket */
-static int sendfd(int sockfd, int fd)
-{
-	struct msghdr msg = {};
-	struct cmsghdr *cmsg;
-	int fds[1] = { fd }, err;
-	char iobuf[1];
-	struct iovec io = {
-		.iov_base = iobuf,
-		.iov_len = sizeof(iobuf),
-	};
-	union {
-		char buf[CMSG_SPACE(sizeof(fds))];
-		struct cmsghdr align;
-	} u;
-
-	msg.msg_iov = &io;
-	msg.msg_iovlen = 1;
-	msg.msg_control = u.buf;
-	msg.msg_controllen = sizeof(u.buf);
-	cmsg = CMSG_FIRSTHDR(&msg);
-	cmsg->cmsg_level = SOL_SOCKET;
-	cmsg->cmsg_type = SCM_RIGHTS;
-	cmsg->cmsg_len = CMSG_LEN(sizeof(fds));
-	memcpy(CMSG_DATA(cmsg), fds, sizeof(fds));
-
-	err = sendmsg(sockfd, &msg, 0);
-	if (err < 0)
-		err = -errno;
-	if (!ASSERT_EQ(err, 1, "sendmsg"))
-		return -EINVAL;
-
-	return 0;
-}
-
-/* receive FD over Unix domain (AF_UNIX) socket */
-static int recvfd(int sockfd, int *fd)
-{
-	struct msghdr msg = {};
-	struct cmsghdr *cmsg;
-	int fds[1], err;
-	char iobuf[1];
-	struct iovec io = {
-		.iov_base = iobuf,
-		.iov_len = sizeof(iobuf),
-	};
-	union {
-		char buf[CMSG_SPACE(sizeof(fds))];
-		struct cmsghdr align;
-	} u;
-
-	msg.msg_iov = &io;
-	msg.msg_iovlen = 1;
-	msg.msg_control = u.buf;
-	msg.msg_controllen = sizeof(u.buf);
-
-	err = recvmsg(sockfd, &msg, 0);
-	if (err < 0)
-		err = -errno;
-	if (!ASSERT_EQ(err, 1, "recvmsg"))
-		return -EINVAL;
-
-	cmsg = CMSG_FIRSTHDR(&msg);
-	if (!ASSERT_OK_PTR(cmsg, "cmsg_null") ||
-	    !ASSERT_EQ(cmsg->cmsg_len, CMSG_LEN(sizeof(fds)), "cmsg_len") ||
-	    !ASSERT_EQ(cmsg->cmsg_level, SOL_SOCKET, "cmsg_level") ||
-	    !ASSERT_EQ(cmsg->cmsg_type, SCM_RIGHTS, "cmsg_type"))
-		return -EINVAL;
-
-	memcpy(fds, CMSG_DATA(cmsg), sizeof(fds));
-	*fd = fds[0];
-
-	return 0;
-}
-
-static ssize_t write_nointr(int fd, const void *buf, size_t count)
-{
-	ssize_t ret;
-
-	do {
-		ret = write(fd, buf, count);
-	} while (ret < 0 && errno == EINTR);
-
-	return ret;
-}
-
-static int write_file(const char *path, const void *buf, size_t count)
-{
-	int fd;
-	ssize_t ret;
-
-	fd = open(path, O_WRONLY | O_CLOEXEC | O_NOCTTY | O_NOFOLLOW);
-	if (fd < 0)
-		return -1;
-
-	ret = write_nointr(fd, buf, count);
-	close(fd);
-	if (ret < 0 || (size_t)ret != count)
-		return -1;
-
-	return 0;
-}
-
-static int create_and_enter_userns(void)
-{
-	uid_t uid;
-	gid_t gid;
-	char map[100];
-
-	uid = getuid();
-	gid = getgid();
-
-	if (unshare(CLONE_NEWUSER))
-		return -1;
-
-	if (write_file("/proc/self/setgroups", "deny", sizeof("deny") - 1) &&
-	    errno != ENOENT)
-		return -1;
-
-	snprintf(map, sizeof(map), "0 %d 1", uid);
-	if (write_file("/proc/self/uid_map", map, strlen(map)))
-		return -1;
-
-
-	snprintf(map, sizeof(map), "0 %d 1", gid);
-	if (write_file("/proc/self/gid_map", map, strlen(map)))
-		return -1;
-
-	if (setgid(0))
-		return -1;
-
-	if (setuid(0))
-		return -1;
-
-	return 0;
-}
-
-typedef int (*child_callback_fn)(int);
-
-static void child(int sock_fd, struct bpffs_opts *opts, child_callback_fn callback)
-{
-	LIBBPF_OPTS(bpf_map_create_opts, map_opts);
-	int mnt_fd = -1, fs_fd = -1, err = 0, bpffs_fd = -1;
-
-	/* setup userns with root mappings */
-	err = create_and_enter_userns();
-	if (!ASSERT_OK(err, "create_and_enter_userns"))
-		goto cleanup;
-
-	/* setup mountns to allow creating BPF FS (fsopen("bpf")) from unpriv process */
-	err = unshare(CLONE_NEWNS);
-	if (!ASSERT_OK(err, "create_mountns"))
-		goto cleanup;
-
-	err = sys_mount(NULL, "/", NULL, MS_REC | MS_PRIVATE, 0);
-	if (!ASSERT_OK(err, "remount_root"))
-		goto cleanup;
-
-	fs_fd = create_bpffs_fd();
-	if (!ASSERT_GE(fs_fd, 0, "create_bpffs_fd")) {
-		err = -EINVAL;
-		goto cleanup;
-	}
-
-	/* ensure unprivileged child cannot set delegation options */
-	err = set_delegate_mask(fs_fd, "delegate_cmds", 0x1, NULL);
-	ASSERT_EQ(err, -EPERM, "delegate_cmd_eperm");
-	err = set_delegate_mask(fs_fd, "delegate_maps", 0x1, NULL);
-	ASSERT_EQ(err, -EPERM, "delegate_maps_eperm");
-	err = set_delegate_mask(fs_fd, "delegate_progs", 0x1, NULL);
-	ASSERT_EQ(err, -EPERM, "delegate_progs_eperm");
-	err = set_delegate_mask(fs_fd, "delegate_attachs", 0x1, NULL);
-	ASSERT_EQ(err, -EPERM, "delegate_attachs_eperm");
-
-	/* pass BPF FS context object to parent */
-	err = sendfd(sock_fd, fs_fd);
-	if (!ASSERT_OK(err, "send_fs_fd"))
-		goto cleanup;
-	zclose(fs_fd);
-
-	/* avoid mucking around with mount namespaces and mounting at
-	 * well-known path, just get detach-mounted BPF FS fd back from parent
-	 */
-	err = recvfd(sock_fd, &mnt_fd);
-	if (!ASSERT_OK(err, "recv_mnt_fd"))
-		goto cleanup;
-
-	/* try to fspick() BPF FS and try to add some delegation options */
-	fs_fd = sys_fspick(mnt_fd, "", FSPICK_EMPTY_PATH);
-	if (!ASSERT_GE(fs_fd, 0, "bpffs_fspick")) {
-		err = -EINVAL;
-		goto cleanup;
-	}
-
-	/* ensure unprivileged child cannot reconfigure to set delegation options */
-	err = set_delegate_mask(fs_fd, "delegate_cmds", 0, "any");
-	if (!ASSERT_EQ(err, -EPERM, "delegate_cmd_eperm_reconfig")) {
-		err = -EINVAL;
-		goto cleanup;
-	}
-	err = set_delegate_mask(fs_fd, "delegate_maps", 0, "any");
-	if (!ASSERT_EQ(err, -EPERM, "delegate_maps_eperm_reconfig")) {
-		err = -EINVAL;
-		goto cleanup;
-	}
-	err = set_delegate_mask(fs_fd, "delegate_progs", 0, "any");
-	if (!ASSERT_EQ(err, -EPERM, "delegate_progs_eperm_reconfig")) {
-		err = -EINVAL;
-		goto cleanup;
-	}
-	err = set_delegate_mask(fs_fd, "delegate_attachs", 0, "any");
-	if (!ASSERT_EQ(err, -EPERM, "delegate_attachs_eperm_reconfig")) {
-		err = -EINVAL;
-		goto cleanup;
-	}
-	zclose(fs_fd);
-
-	bpffs_fd = openat(mnt_fd, ".", 0, O_RDWR);
-	if (!ASSERT_GE(bpffs_fd, 0, "bpffs_open")) {
-		err = -EINVAL;
-		goto cleanup;
-	}
-
-	/* do custom test logic with customly set up BPF FS instance */
-	err = callback(bpffs_fd);
-	if (!ASSERT_OK(err, "test_callback"))
-		goto cleanup;
-
-	err = 0;
-cleanup:
-	zclose(sock_fd);
-	zclose(mnt_fd);
-	zclose(fs_fd);
-	zclose(bpffs_fd);
-
-	exit(-err);
-}
-
-static int wait_for_pid(pid_t pid)
-{
-	int status, ret;
-
-again:
-	ret = waitpid(pid, &status, 0);
-	if (ret == -1) {
-		if (errno == EINTR)
-			goto again;
-
-		return -1;
-	}
-
-	if (!WIFEXITED(status))
-		return -1;
-
-	return WEXITSTATUS(status);
-}
-
-static void parent(int child_pid, struct bpffs_opts *bpffs_opts, int sock_fd)
-{
-	int fs_fd = -1, mnt_fd = -1, err;
-
-	err = recvfd(sock_fd, &fs_fd);
-	if (!ASSERT_OK(err, "recv_bpffs_fd"))
-		goto cleanup;
-
-	mnt_fd = materialize_bpffs_fd(fs_fd, bpffs_opts);
-	if (!ASSERT_GE(mnt_fd, 0, "materialize_bpffs_fd")) {
-		err = -EINVAL;
-		goto cleanup;
-	}
-	zclose(fs_fd);
-
-	/* pass BPF FS context object to parent */
-	err = sendfd(sock_fd, mnt_fd);
-	if (!ASSERT_OK(err, "send_mnt_fd"))
-		goto cleanup;
-	zclose(mnt_fd);
-
-	err = wait_for_pid(child_pid);
-	ASSERT_OK(err, "waitpid_child");
-
-cleanup:
-	zclose(sock_fd);
-	zclose(fs_fd);
-	zclose(mnt_fd);
-
-	if (child_pid > 0)
-		(void)kill(child_pid, SIGKILL);
-}
-
-static void subtest_userns(struct bpffs_opts *bpffs_opts, child_callback_fn cb)
-{
-	int sock_fds[2] = { -1, -1 };
-	int child_pid = 0, err;
-
-	err = socketpair(AF_UNIX, SOCK_STREAM, 0, sock_fds);
-	if (!ASSERT_OK(err, "socketpair"))
-		goto cleanup;
-
-	child_pid = fork();
-	if (!ASSERT_GE(child_pid, 0, "fork"))
-		goto cleanup;
-
-	if (child_pid == 0) {
-		zclose(sock_fds[0]);
-		return child(sock_fds[1], bpffs_opts, cb);
-
-	} else {
-		zclose(sock_fds[1]);
-		return parent(child_pid, bpffs_opts, sock_fds[0]);
-	}
-
-cleanup:
-	zclose(sock_fds[0]);
-	zclose(sock_fds[1]);
-	if (child_pid > 0)
-		(void)kill(child_pid, SIGKILL);
-}
-
-static int userns_map_create(int mnt_fd)
-{
-	LIBBPF_OPTS(bpf_map_create_opts, map_opts);
-	int err, token_fd = -1, map_fd = -1;
-	__u64 old_caps = 0;
-
-	/* create BPF token from BPF FS mount */
-	token_fd = bpf_token_create(mnt_fd, NULL);
-	if (!ASSERT_GT(token_fd, 0, "token_create")) {
-		err = -EINVAL;
-		goto cleanup;
-	}
-
-	/* while inside non-init userns, we need both a BPF token *and*
-	 * CAP_BPF inside current userns to create privileged map; let's test
-	 * that neither BPF token alone nor namespaced CAP_BPF is sufficient
-	 */
-	err = drop_priv_caps(&old_caps);
-	if (!ASSERT_OK(err, "drop_caps"))
-		goto cleanup;
-
-	/* no token, no CAP_BPF -> fail */
-	map_opts.token_fd = 0;
-	map_fd = bpf_map_create(BPF_MAP_TYPE_STACK, "wo_token_wo_bpf", 0, 8, 1, &map_opts);
-	if (!ASSERT_LT(map_fd, 0, "stack_map_wo_token_wo_cap_bpf_should_fail")) {
-		err = -EINVAL;
-		goto cleanup;
-	}
-
-	/* token without CAP_BPF -> fail */
-	map_opts.token_fd = token_fd;
-	map_fd = bpf_map_create(BPF_MAP_TYPE_STACK, "w_token_wo_bpf", 0, 8, 1, &map_opts);
-	if (!ASSERT_LT(map_fd, 0, "stack_map_w_token_wo_cap_bpf_should_fail")) {
-		err = -EINVAL;
-		goto cleanup;
-	}
-
-	/* get back effective local CAP_BPF (and CAP_SYS_ADMIN) */
-	err = restore_priv_caps(old_caps);
-	if (!ASSERT_OK(err, "restore_caps"))
-		goto cleanup;
-
-	/* CAP_BPF without token -> fail */
-	map_opts.token_fd = 0;
-	map_fd = bpf_map_create(BPF_MAP_TYPE_STACK, "wo_token_w_bpf", 0, 8, 1, &map_opts);
-	if (!ASSERT_LT(map_fd, 0, "stack_map_wo_token_w_cap_bpf_should_fail")) {
-		err = -EINVAL;
-		goto cleanup;
-	}
-
-	/* finally, namespaced CAP_BPF + token -> success */
-	map_opts.token_fd = token_fd;
-	map_fd = bpf_map_create(BPF_MAP_TYPE_STACK, "w_token_w_bpf", 0, 8, 1, &map_opts);
-	if (!ASSERT_GT(map_fd, 0, "stack_map_w_token_w_cap_bpf")) {
-		err = -EINVAL;
-		goto cleanup;
-	}
-
-cleanup:
-	zclose(token_fd);
-	zclose(map_fd);
-	return err;
-}
-
-static int userns_btf_load(int mnt_fd)
-{
-	LIBBPF_OPTS(bpf_btf_load_opts, btf_opts);
-	int err, token_fd = -1, btf_fd = -1;
-	const void *raw_btf_data;
-	struct btf *btf = NULL;
-	__u32 raw_btf_size;
-	__u64 old_caps = 0;
-
-	/* create BPF token from BPF FS mount */
-	token_fd = bpf_token_create(mnt_fd, NULL);
-	if (!ASSERT_GT(token_fd, 0, "token_create")) {
-		err = -EINVAL;
-		goto cleanup;
-	}
-
-	/* while inside non-init userns, we need both a BPF token *and*
-	 * CAP_BPF inside current userns to create privileged map; let's test
-	 * that neither BPF token alone nor namespaced CAP_BPF is sufficient
-	 */
-	err = drop_priv_caps(&old_caps);
-	if (!ASSERT_OK(err, "drop_caps"))
-		goto cleanup;
-
-	/* setup a trivial BTF data to load to the kernel */
-	btf = btf__new_empty();
-	if (!ASSERT_OK_PTR(btf, "empty_btf"))
-		goto cleanup;
-
-	ASSERT_GT(btf__add_int(btf, "int", 4, 0), 0, "int_type");
-
-	raw_btf_data = btf__raw_data(btf, &raw_btf_size);
-	if (!ASSERT_OK_PTR(raw_btf_data, "raw_btf_data"))
-		goto cleanup;
-
-	/* no token + no CAP_BPF -> failure */
-	btf_opts.token_fd = 0;
-	btf_fd = bpf_btf_load(raw_btf_data, raw_btf_size, &btf_opts);
-	if (!ASSERT_LT(btf_fd, 0, "no_token_no_cap_should_fail"))
-		goto cleanup;
-
-	/* token + no CAP_BPF -> failure */
-	btf_opts.token_fd = token_fd;
-	btf_fd = bpf_btf_load(raw_btf_data, raw_btf_size, &btf_opts);
-	if (!ASSERT_LT(btf_fd, 0, "token_no_cap_should_fail"))
-		goto cleanup;
-
-	/* get back effective local CAP_BPF (and CAP_SYS_ADMIN) */
-	err = restore_priv_caps(old_caps);
-	if (!ASSERT_OK(err, "restore_caps"))
-		goto cleanup;
-
-	/* token + CAP_BPF -> success */
-	btf_opts.token_fd = token_fd;
-	btf_fd = bpf_btf_load(raw_btf_data, raw_btf_size, &btf_opts);
-	if (!ASSERT_GT(btf_fd, 0, "token_and_cap_success"))
-		goto cleanup;
-
-	err = 0;
-cleanup:
-	btf__free(btf);
-	zclose(btf_fd);
-	zclose(token_fd);
-	return err;
-}
-
-static int userns_prog_load(int mnt_fd)
-{
-	LIBBPF_OPTS(bpf_prog_load_opts, prog_opts);
-	int err, token_fd = -1, prog_fd = -1;
-	struct bpf_insn insns[] = {
-		/* bpf_jiffies64() requires CAP_BPF */
-		BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_jiffies64),
-		/* bpf_get_current_task() requires CAP_PERFMON */
-		BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_get_current_task),
-		/* r0 = 0; exit; */
-		BPF_MOV64_IMM(BPF_REG_0, 0),
-		BPF_EXIT_INSN(),
-	};
-	size_t insn_cnt = ARRAY_SIZE(insns);
-	__u64 old_caps = 0;
-
-	/* create BPF token from BPF FS mount */
-	token_fd = bpf_token_create(mnt_fd, NULL);
-	if (!ASSERT_GT(token_fd, 0, "token_create")) {
-		err = -EINVAL;
-		goto cleanup;
-	}
-
-	/* validate we can successfully load BPF program with token; this
-	 * being XDP program (CAP_NET_ADMIN) using bpf_jiffies64() (CAP_BPF)
-	 * and bpf_get_current_task() (CAP_PERFMON) helpers validates we have
-	 * BPF token wired properly in a bunch of places in the kernel
-	 */
-	prog_opts.token_fd = token_fd;
-	prog_opts.expected_attach_type = BPF_XDP;
-	prog_fd = bpf_prog_load(BPF_PROG_TYPE_XDP, "token_prog", "GPL",
-				insns, insn_cnt, &prog_opts);
-	if (!ASSERT_GT(prog_fd, 0, "prog_fd")) {
-		err = -EPERM;
-		goto cleanup;
-	}
-
-	/* no token + caps -> failure */
-	prog_opts.token_fd = 0;
-	prog_fd = bpf_prog_load(BPF_PROG_TYPE_XDP, "token_prog", "GPL",
-				insns, insn_cnt, &prog_opts);
-	if (!ASSERT_EQ(prog_fd, -EPERM, "prog_fd_eperm")) {
-		err = -EPERM;
-		goto cleanup;
-	}
-
-	err = drop_priv_caps(&old_caps);
-	if (!ASSERT_OK(err, "drop_caps"))
-		goto cleanup;
-
-	/* no caps + token -> failure */
-	prog_opts.token_fd = token_fd;
-	prog_fd = bpf_prog_load(BPF_PROG_TYPE_XDP, "token_prog", "GPL",
-				insns, insn_cnt, &prog_opts);
-	if (!ASSERT_EQ(prog_fd, -EPERM, "prog_fd_eperm")) {
-		err = -EPERM;
-		goto cleanup;
-	}
-
-	/* no caps + no token -> definitely a failure */
-	prog_opts.token_fd = 0;
-	prog_fd = bpf_prog_load(BPF_PROG_TYPE_XDP, "token_prog", "GPL",
-				insns, insn_cnt, &prog_opts);
-	if (!ASSERT_EQ(prog_fd, -EPERM, "prog_fd_eperm")) {
-		err = -EPERM;
-		goto cleanup;
-	}
-
-	err = 0;
-cleanup:
-	zclose(prog_fd);
-	zclose(token_fd);
-	return err;
-}
-
-static int userns_obj_priv_map(int mnt_fd)
-{
-	LIBBPF_OPTS(bpf_object_open_opts, opts);
-	char buf[256];
-	struct priv_map *skel;
-	int err, token_fd;
-
-	skel = priv_map__open_and_load();
-	if (!ASSERT_ERR_PTR(skel, "obj_tokenless_load")) {
-		priv_map__destroy(skel);
-		return -EINVAL;
-	}
-
-	/* use bpf_token_path to provide BPF FS path */
-	snprintf(buf, sizeof(buf), "/proc/self/fd/%d", mnt_fd);
-	opts.bpf_token_path = buf;
-	skel = priv_map__open_opts(&opts);
-	if (!ASSERT_OK_PTR(skel, "obj_token_path_open"))
-		return -EINVAL;
-
-	err = priv_map__load(skel);
-	priv_map__destroy(skel);
-	if (!ASSERT_OK(err, "obj_token_path_load"))
-		return -EINVAL;
-
-	/* create token and pass it through bpf_token_fd */
-	token_fd = bpf_token_create(mnt_fd, NULL);
-	if (!ASSERT_GT(token_fd, 0, "create_token"))
-		return -EINVAL;
-
-	opts.bpf_token_path = NULL;
-	opts.bpf_token_fd = token_fd;
-	skel = priv_map__open_opts(&opts);
-	if (!ASSERT_OK_PTR(skel, "obj_token_fd_open"))
-		return -EINVAL;
-
-	/* we can close our token FD, bpf_object owns dup()'ed FD now */
-	close(token_fd);
-
-	err = priv_map__load(skel);
-	priv_map__destroy(skel);
-	if (!ASSERT_OK(err, "obj_token_fd_load"))
-		return -EINVAL;
-
-	return 0;
-}
-
-static int userns_obj_priv_prog(int mnt_fd)
-{
-	LIBBPF_OPTS(bpf_object_open_opts, opts);
-	char buf[256];
-	struct priv_prog *skel;
-	int err;
-
-	skel = priv_prog__open_and_load();
-	if (!ASSERT_ERR_PTR(skel, "obj_tokenless_load")) {
-		priv_prog__destroy(skel);
-		return -EINVAL;
-	}
-
-	/* use bpf_token_path to provide BPF FS path */
-	snprintf(buf, sizeof(buf), "/proc/self/fd/%d", mnt_fd);
-	opts.bpf_token_path = buf;
-	skel = priv_prog__open_opts(&opts);
-	if (!ASSERT_OK_PTR(skel, "obj_token_path_open"))
-		return -EINVAL;
-
-	err = priv_prog__load(skel);
-	priv_prog__destroy(skel);
-	if (!ASSERT_OK(err, "obj_token_path_load"))
-		return -EINVAL;
-
-	return 0;
-}
-
-/* this test is called with BPF FS that doesn't delegate BPF_BTF_LOAD command,
- * which should cause struct_ops application to fail, as BTF won't be uploaded
- * into the kernel, even if STRUCT_OPS programs themselves are allowed
- */
-static int validate_struct_ops_load(int mnt_fd, bool expect_success)
-{
-	LIBBPF_OPTS(bpf_object_open_opts, opts);
-	char buf[256];
-	struct dummy_st_ops_success *skel;
-	int err;
-
-	snprintf(buf, sizeof(buf), "/proc/self/fd/%d", mnt_fd);
-	opts.bpf_token_path = buf;
-	skel = dummy_st_ops_success__open_opts(&opts);
-	if (!ASSERT_OK_PTR(skel, "obj_token_path_open"))
-		return -EINVAL;
-
-	err = dummy_st_ops_success__load(skel);
-	dummy_st_ops_success__destroy(skel);
-	if (expect_success) {
-		if (!ASSERT_OK(err, "obj_token_path_load"))
-			return -EINVAL;
-	} else /* expect failure */ {
-		if (!ASSERT_ERR(err, "obj_token_path_load"))
-			return -EINVAL;
-	}
-
-	return 0;
-}
-
-static int userns_obj_priv_btf_fail(int mnt_fd)
-{
-	return validate_struct_ops_load(mnt_fd, false /* should fail */);
-}
-
-static int userns_obj_priv_btf_success(int mnt_fd)
-{
-	return validate_struct_ops_load(mnt_fd, true /* should succeed */);
-}
-
-#define TOKEN_ENVVAR "LIBBPF_BPF_TOKEN_PATH"
-#define TOKEN_BPFFS_CUSTOM "/bpf-token-fs"
-
-static int userns_obj_priv_implicit_token(int mnt_fd)
-{
-	LIBBPF_OPTS(bpf_object_open_opts, opts);
-	struct dummy_st_ops_success *skel;
-	int err;
-
-	/* before we mount BPF FS with token delegation, struct_ops skeleton
-	 * should fail to load
-	 */
-	skel = dummy_st_ops_success__open_and_load();
-	if (!ASSERT_ERR_PTR(skel, "obj_tokenless_load")) {
-		dummy_st_ops_success__destroy(skel);
-		return -EINVAL;
-	}
-
-	/* mount custom BPF FS over /sys/fs/bpf so that libbpf can create BPF
-	 * token automatically and implicitly
-	 */
-	err = sys_move_mount(mnt_fd, "", AT_FDCWD, "/sys/fs/bpf", MOVE_MOUNT_F_EMPTY_PATH);
-	if (!ASSERT_OK(err, "move_mount_bpffs"))
-		return -EINVAL;
-
-	/* disable implicit BPF token creation by setting
-	 * LIBBPF_BPF_TOKEN_PATH envvar to empty value, load should fail
-	 */
-	err = setenv(TOKEN_ENVVAR, "", 1 /*overwrite*/);
-	if (!ASSERT_OK(err, "setenv_token_path"))
-		return -EINVAL;
-	skel = dummy_st_ops_success__open_and_load();
-	if (!ASSERT_ERR_PTR(skel, "obj_token_envvar_disabled_load")) {
-		unsetenv(TOKEN_ENVVAR);
-		dummy_st_ops_success__destroy(skel);
-		return -EINVAL;
-	}
-	unsetenv(TOKEN_ENVVAR);
-
-	/* now the same struct_ops skeleton should succeed thanks to libppf
-	 * creating BPF token from /sys/fs/bpf mount point
-	 */
-	skel = dummy_st_ops_success__open_and_load();
-	if (!ASSERT_OK_PTR(skel, "obj_implicit_token_load"))
-		return -EINVAL;
-
-	dummy_st_ops_success__destroy(skel);
-
-	/* now disable implicit token through empty bpf_token_path, should fail */
-	opts.bpf_token_path = "";
-	skel = dummy_st_ops_success__open_opts(&opts);
-	if (!ASSERT_OK_PTR(skel, "obj_empty_token_path_open"))
-		return -EINVAL;
-
-	err = dummy_st_ops_success__load(skel);
-	dummy_st_ops_success__destroy(skel);
-	if (!ASSERT_ERR(err, "obj_empty_token_path_load"))
-		return -EINVAL;
-
-	/* now disable implicit token through negative bpf_token_fd, should fail */
-	opts.bpf_token_path = NULL;
-	opts.bpf_token_fd = -1;
-	skel = dummy_st_ops_success__open_opts(&opts);
-	if (!ASSERT_OK_PTR(skel, "obj_neg_token_fd_open"))
-		return -EINVAL;
-
-	err = dummy_st_ops_success__load(skel);
-	dummy_st_ops_success__destroy(skel);
-	if (!ASSERT_ERR(err, "obj_neg_token_fd_load"))
-		return -EINVAL;
-
-	return 0;
-}
-
-static int userns_obj_priv_implicit_token_envvar(int mnt_fd)
-{
-	LIBBPF_OPTS(bpf_object_open_opts, opts);
-	struct dummy_st_ops_success *skel;
-	int err;
-
-	/* before we mount BPF FS with token delegation, struct_ops skeleton
-	 * should fail to load
-	 */
-	skel = dummy_st_ops_success__open_and_load();
-	if (!ASSERT_ERR_PTR(skel, "obj_tokenless_load")) {
-		dummy_st_ops_success__destroy(skel);
-		return -EINVAL;
-	}
-
-	/* mount custom BPF FS over custom location, so libbpf can't create
-	 * BPF token implicitly, unless pointed to it through
-	 * LIBBPF_BPF_TOKEN_PATH envvar
-	 */
-	rmdir(TOKEN_BPFFS_CUSTOM);
-	if (!ASSERT_OK(mkdir(TOKEN_BPFFS_CUSTOM, 0777), "mkdir_bpffs_custom"))
-		goto err_out;
-	err = sys_move_mount(mnt_fd, "", AT_FDCWD, TOKEN_BPFFS_CUSTOM, MOVE_MOUNT_F_EMPTY_PATH);
-	if (!ASSERT_OK(err, "move_mount_bpffs"))
-		goto err_out;
-
-	/* even though we have BPF FS with delegation, it's not at default
-	 * /sys/fs/bpf location, so we still fail to load until envvar is set up
-	 */
-	skel = dummy_st_ops_success__open_and_load();
-	if (!ASSERT_ERR_PTR(skel, "obj_tokenless_load2")) {
-		dummy_st_ops_success__destroy(skel);
-		goto err_out;
-	}
-
-	err = setenv(TOKEN_ENVVAR, TOKEN_BPFFS_CUSTOM, 1 /*overwrite*/);
-	if (!ASSERT_OK(err, "setenv_token_path"))
-		goto err_out;
-
-	/* now the same struct_ops skeleton should succeed thanks to libppf
-	 * creating BPF token from custom mount point
-	 */
-	skel = dummy_st_ops_success__open_and_load();
-	if (!ASSERT_OK_PTR(skel, "obj_implicit_token_load"))
-		goto err_out;
-
-	dummy_st_ops_success__destroy(skel);
-
-	/* now disable implicit token through empty bpf_token_path, envvar
-	 * will be ignored, should fail
-	 */
-	opts.bpf_token_path = "";
-	skel = dummy_st_ops_success__open_opts(&opts);
-	if (!ASSERT_OK_PTR(skel, "obj_empty_token_path_open"))
-		goto err_out;
-
-	err = dummy_st_ops_success__load(skel);
-	dummy_st_ops_success__destroy(skel);
-	if (!ASSERT_ERR(err, "obj_empty_token_path_load"))
-		goto err_out;
-
-	/* now disable implicit token through negative bpf_token_fd, envvar
-	 * will be ignored, should fail
-	 */
-	opts.bpf_token_path = NULL;
-	opts.bpf_token_fd = -1;
-	skel = dummy_st_ops_success__open_opts(&opts);
-	if (!ASSERT_OK_PTR(skel, "obj_neg_token_fd_open"))
-		goto err_out;
-
-	err = dummy_st_ops_success__load(skel);
-	dummy_st_ops_success__destroy(skel);
-	if (!ASSERT_ERR(err, "obj_neg_token_fd_load"))
-		goto err_out;
-
-	rmdir(TOKEN_BPFFS_CUSTOM);
-	unsetenv(TOKEN_ENVVAR);
-	return 0;
-err_out:
-	rmdir(TOKEN_BPFFS_CUSTOM);
-	unsetenv(TOKEN_ENVVAR);
-	return -EINVAL;
-}
-
-#define bit(n) (1ULL << (n))
-
-void test_token(void)
-{
-	if (test__start_subtest("map_token")) {
-		struct bpffs_opts opts = {
-			.cmds_str = "map_create",
-			.maps_str = "stack",
-		};
-
-		subtest_userns(&opts, userns_map_create);
-	}
-	if (test__start_subtest("btf_token")) {
-		struct bpffs_opts opts = {
-			.cmds = 1ULL << BPF_BTF_LOAD,
-		};
-
-		subtest_userns(&opts, userns_btf_load);
-	}
-	if (test__start_subtest("prog_token")) {
-		struct bpffs_opts opts = {
-			.cmds_str = "PROG_LOAD",
-			.progs_str = "XDP",
-			.attachs_str = "xdp",
-		};
-
-		subtest_userns(&opts, userns_prog_load);
-	}
-	if (test__start_subtest("obj_priv_map")) {
-		struct bpffs_opts opts = {
-			.cmds = bit(BPF_MAP_CREATE),
-			.maps = bit(BPF_MAP_TYPE_QUEUE),
-		};
-
-		subtest_userns(&opts, userns_obj_priv_map);
-	}
-	if (test__start_subtest("obj_priv_prog")) {
-		struct bpffs_opts opts = {
-			.cmds = bit(BPF_PROG_LOAD),
-			.progs = bit(BPF_PROG_TYPE_KPROBE),
-			.attachs = ~0ULL,
-		};
-
-		subtest_userns(&opts, userns_obj_priv_prog);
-	}
-	if (test__start_subtest("obj_priv_btf_fail")) {
-		struct bpffs_opts opts = {
-			/* disallow BTF loading */
-			.cmds = bit(BPF_MAP_CREATE) | bit(BPF_PROG_LOAD),
-			.maps = bit(BPF_MAP_TYPE_STRUCT_OPS),
-			.progs = bit(BPF_PROG_TYPE_STRUCT_OPS),
-			.attachs = ~0ULL,
-		};
-
-		subtest_userns(&opts, userns_obj_priv_btf_fail);
-	}
-	if (test__start_subtest("obj_priv_btf_success")) {
-		struct bpffs_opts opts = {
-			/* allow BTF loading */
-			.cmds = bit(BPF_BTF_LOAD) | bit(BPF_MAP_CREATE) | bit(BPF_PROG_LOAD),
-			.maps = bit(BPF_MAP_TYPE_STRUCT_OPS),
-			.progs = bit(BPF_PROG_TYPE_STRUCT_OPS),
-			.attachs = ~0ULL,
-		};
-
-		subtest_userns(&opts, userns_obj_priv_btf_success);
-	}
-	if (test__start_subtest("obj_priv_implicit_token")) {
-		struct bpffs_opts opts = {
-			/* allow BTF loading */
-			.cmds = bit(BPF_BTF_LOAD) | bit(BPF_MAP_CREATE) | bit(BPF_PROG_LOAD),
-			.maps = bit(BPF_MAP_TYPE_STRUCT_OPS),
-			.progs = bit(BPF_PROG_TYPE_STRUCT_OPS),
-			.attachs = ~0ULL,
-		};
-
-		subtest_userns(&opts, userns_obj_priv_implicit_token);
-	}
-	if (test__start_subtest("obj_priv_implicit_token_envvar")) {
-		struct bpffs_opts opts = {
-			/* allow BTF loading */
-			.cmds = bit(BPF_BTF_LOAD) | bit(BPF_MAP_CREATE) | bit(BPF_PROG_LOAD),
-			.maps = bit(BPF_MAP_TYPE_STRUCT_OPS),
-			.progs = bit(BPF_PROG_TYPE_STRUCT_OPS),
-			.attachs = ~0ULL,
-		};
-
-		subtest_userns(&opts, userns_obj_priv_implicit_token_envvar);
-	}
-}
diff --git a/tools/testing/selftests/bpf/progs/priv_map.c b/tools/testing/selftests/bpf/progs/priv_map.c
deleted file mode 100644
index 9085be50f03b..000000000000
--- a/tools/testing/selftests/bpf/progs/priv_map.c
+++ /dev/null
@@ -1,13 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/* Copyright (c) 2023 Meta Platforms, Inc. and affiliates. */
-
-#include "vmlinux.h"
-#include <bpf/bpf_helpers.h>
-
-char _license[] SEC("license") = "GPL";
-
-struct {
-	__uint(type, BPF_MAP_TYPE_QUEUE);
-	__uint(max_entries, 1);
-	__type(value, __u32);
-} priv_map SEC(".maps");
diff --git a/tools/testing/selftests/bpf/progs/priv_prog.c b/tools/testing/selftests/bpf/progs/priv_prog.c
deleted file mode 100644
index 3c7b2b618c8a..000000000000
--- a/tools/testing/selftests/bpf/progs/priv_prog.c
+++ /dev/null
@@ -1,13 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/* Copyright (c) 2023 Meta Platforms, Inc. and affiliates. */
-
-#include "vmlinux.h"
-#include <bpf/bpf_helpers.h>
-
-char _license[] SEC("license") = "GPL";
-
-SEC("kprobe")
-int kprobe_prog(void *ctx)
-{
-	return 1;
-}
-- 
cgit v1.2.3


From fd27ef6b44bec26915c5b2b22c13856d9f0ba17a Mon Sep 17 00:00:00 2001
From: Feng Liu <feliu@nvidia.com>
Date: Tue, 19 Dec 2023 11:32:40 +0200
Subject: virtio-pci: Introduce admin virtqueue

Introduce support for the admin virtqueue. By negotiating
VIRTIO_F_ADMIN_VQ feature, driver detects capability and creates one
administration virtqueue. Administration virtqueue implementation in
virtio pci generic layer, enables multiple types of upper layer
drivers such as vfio, net, blk to utilize it.

Signed-off-by: Feng Liu <feliu@nvidia.com>
Reviewed-by: Parav Pandit <parav@nvidia.com>
Reviewed-by: Jiri Pirko <jiri@nvidia.com>
Acked-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Yishai Hadas <yishaih@nvidia.com>
Link: https://lore.kernel.org/r/20231219093247.170936-3-yishaih@nvidia.com
Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
---
 drivers/virtio/virtio.c                | 37 +++++++++++++++--
 drivers/virtio/virtio_pci_common.c     |  3 ++
 drivers/virtio/virtio_pci_common.h     | 15 ++++++-
 drivers/virtio/virtio_pci_modern.c     | 75 +++++++++++++++++++++++++++++++++-
 drivers/virtio/virtio_pci_modern_dev.c | 24 ++++++++++-
 include/linux/virtio_config.h          |  4 ++
 include/linux/virtio_pci_modern.h      |  2 +
 include/uapi/linux/virtio_pci.h        |  5 +++
 8 files changed, 157 insertions(+), 8 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/virtio/virtio.c b/drivers/virtio/virtio.c
index 3893dc29eb26..f4080692b351 100644
--- a/drivers/virtio/virtio.c
+++ b/drivers/virtio/virtio.c
@@ -302,9 +302,15 @@ static int virtio_dev_probe(struct device *_d)
 	if (err)
 		goto err;
 
+	if (dev->config->create_avq) {
+		err = dev->config->create_avq(dev);
+		if (err)
+			goto err;
+	}
+
 	err = drv->probe(dev);
 	if (err)
-		goto err;
+		goto err_probe;
 
 	/* If probe didn't do it, mark device DRIVER_OK ourselves. */
 	if (!(dev->config->get_status(dev) & VIRTIO_CONFIG_S_DRIVER_OK))
@@ -316,6 +322,10 @@ static int virtio_dev_probe(struct device *_d)
 	virtio_config_enable(dev);
 
 	return 0;
+
+err_probe:
+	if (dev->config->destroy_avq)
+		dev->config->destroy_avq(dev);
 err:
 	virtio_add_status(dev, VIRTIO_CONFIG_S_FAILED);
 	return err;
@@ -331,6 +341,9 @@ static void virtio_dev_remove(struct device *_d)
 
 	drv->remove(dev);
 
+	if (dev->config->destroy_avq)
+		dev->config->destroy_avq(dev);
+
 	/* Driver should have reset device. */
 	WARN_ON_ONCE(dev->config->get_status(dev));
 
@@ -489,13 +502,20 @@ EXPORT_SYMBOL_GPL(unregister_virtio_device);
 int virtio_device_freeze(struct virtio_device *dev)
 {
 	struct virtio_driver *drv = drv_to_virtio(dev->dev.driver);
+	int ret;
 
 	virtio_config_disable(dev);
 
 	dev->failed = dev->config->get_status(dev) & VIRTIO_CONFIG_S_FAILED;
 
-	if (drv && drv->freeze)
-		return drv->freeze(dev);
+	if (drv && drv->freeze) {
+		ret = drv->freeze(dev);
+		if (ret)
+			return ret;
+	}
+
+	if (dev->config->destroy_avq)
+		dev->config->destroy_avq(dev);
 
 	return 0;
 }
@@ -532,10 +552,16 @@ int virtio_device_restore(struct virtio_device *dev)
 	if (ret)
 		goto err;
 
+	if (dev->config->create_avq) {
+		ret = dev->config->create_avq(dev);
+		if (ret)
+			goto err;
+	}
+
 	if (drv->restore) {
 		ret = drv->restore(dev);
 		if (ret)
-			goto err;
+			goto err_restore;
 	}
 
 	/* If restore didn't do it, mark device DRIVER_OK ourselves. */
@@ -546,6 +572,9 @@ int virtio_device_restore(struct virtio_device *dev)
 
 	return 0;
 
+err_restore:
+	if (dev->config->destroy_avq)
+		dev->config->destroy_avq(dev);
 err:
 	virtio_add_status(dev, VIRTIO_CONFIG_S_FAILED);
 	return ret;
diff --git a/drivers/virtio/virtio_pci_common.c b/drivers/virtio/virtio_pci_common.c
index 7a5593997e0e..fafd13d0e4d4 100644
--- a/drivers/virtio/virtio_pci_common.c
+++ b/drivers/virtio/virtio_pci_common.c
@@ -236,6 +236,9 @@ void vp_del_vqs(struct virtio_device *vdev)
 	int i;
 
 	list_for_each_entry_safe(vq, n, &vdev->vqs, list) {
+		if (vp_dev->is_avq(vdev, vq->index))
+			continue;
+
 		if (vp_dev->per_vq_vectors) {
 			int v = vp_dev->vqs[vq->index]->msix_vector;
 
diff --git a/drivers/virtio/virtio_pci_common.h b/drivers/virtio/virtio_pci_common.h
index 4b773bd7c58c..7306128e63e9 100644
--- a/drivers/virtio/virtio_pci_common.h
+++ b/drivers/virtio/virtio_pci_common.h
@@ -41,6 +41,14 @@ struct virtio_pci_vq_info {
 	unsigned int msix_vector;
 };
 
+struct virtio_pci_admin_vq {
+	/* Virtqueue info associated with this admin queue. */
+	struct virtio_pci_vq_info info;
+	/* Name of the admin queue: avq.$vq_index. */
+	char name[10];
+	u16 vq_index;
+};
+
 /* Our device structure */
 struct virtio_pci_device {
 	struct virtio_device vdev;
@@ -58,9 +66,13 @@ struct virtio_pci_device {
 	spinlock_t lock;
 	struct list_head virtqueues;
 
-	/* array of all queues for house-keeping */
+	/* Array of all virtqueues reported in the
+	 * PCI common config num_queues field
+	 */
 	struct virtio_pci_vq_info **vqs;
 
+	struct virtio_pci_admin_vq admin_vq;
+
 	/* MSI-X support */
 	int msix_enabled;
 	int intx_enabled;
@@ -86,6 +98,7 @@ struct virtio_pci_device {
 	void (*del_vq)(struct virtio_pci_vq_info *info);
 
 	u16 (*config_vector)(struct virtio_pci_device *vp_dev, u16 vector);
+	bool (*is_avq)(struct virtio_device *vdev, unsigned int index);
 };
 
 /* Constants for MSI-X */
diff --git a/drivers/virtio/virtio_pci_modern.c b/drivers/virtio/virtio_pci_modern.c
index ee6a386d250b..ce915018b5b0 100644
--- a/drivers/virtio/virtio_pci_modern.c
+++ b/drivers/virtio/virtio_pci_modern.c
@@ -19,6 +19,8 @@
 #define VIRTIO_RING_NO_LEGACY
 #include "virtio_pci_common.h"
 
+#define VIRTIO_AVQ_SGS_MAX	4
+
 static u64 vp_get_features(struct virtio_device *vdev)
 {
 	struct virtio_pci_device *vp_dev = to_vp_device(vdev);
@@ -26,6 +28,16 @@ static u64 vp_get_features(struct virtio_device *vdev)
 	return vp_modern_get_features(&vp_dev->mdev);
 }
 
+static bool vp_is_avq(struct virtio_device *vdev, unsigned int index)
+{
+	struct virtio_pci_device *vp_dev = to_vp_device(vdev);
+
+	if (!virtio_has_feature(vdev, VIRTIO_F_ADMIN_VQ))
+		return false;
+
+	return index == vp_dev->admin_vq.vq_index;
+}
+
 static void vp_transport_features(struct virtio_device *vdev, u64 features)
 {
 	struct virtio_pci_device *vp_dev = to_vp_device(vdev);
@@ -37,6 +49,9 @@ static void vp_transport_features(struct virtio_device *vdev, u64 features)
 
 	if (features & BIT_ULL(VIRTIO_F_RING_RESET))
 		__virtio_set_bit(vdev, VIRTIO_F_RING_RESET);
+
+	if (features & BIT_ULL(VIRTIO_F_ADMIN_VQ))
+		__virtio_set_bit(vdev, VIRTIO_F_ADMIN_VQ);
 }
 
 static int __vp_check_common_size_one_feature(struct virtio_device *vdev, u32 fbit,
@@ -69,6 +84,9 @@ static int vp_check_common_size(struct virtio_device *vdev)
 	if (vp_check_common_size_one_feature(vdev, VIRTIO_F_RING_RESET, queue_reset))
 		return -EINVAL;
 
+	if (vp_check_common_size_one_feature(vdev, VIRTIO_F_ADMIN_VQ, admin_queue_num))
+		return -EINVAL;
+
 	return 0;
 }
 
@@ -345,6 +363,7 @@ static struct virtqueue *setup_vq(struct virtio_pci_device *vp_dev,
 	struct virtio_pci_modern_device *mdev = &vp_dev->mdev;
 	bool (*notify)(struct virtqueue *vq);
 	struct virtqueue *vq;
+	bool is_avq;
 	u16 num;
 	int err;
 
@@ -353,11 +372,13 @@ static struct virtqueue *setup_vq(struct virtio_pci_device *vp_dev,
 	else
 		notify = vp_notify;
 
-	if (index >= vp_modern_get_num_queues(mdev))
+	is_avq = vp_is_avq(&vp_dev->vdev, index);
+	if (index >= vp_modern_get_num_queues(mdev) && !is_avq)
 		return ERR_PTR(-EINVAL);
 
+	num = is_avq ?
+		VIRTIO_AVQ_SGS_MAX : vp_modern_get_queue_size(mdev, index);
 	/* Check if queue is either not available or already active. */
-	num = vp_modern_get_queue_size(mdev, index);
 	if (!num || vp_modern_get_queue_enable(mdev, index))
 		return ERR_PTR(-ENOENT);
 
@@ -383,6 +404,9 @@ static struct virtqueue *setup_vq(struct virtio_pci_device *vp_dev,
 		goto err;
 	}
 
+	if (is_avq)
+		vp_dev->admin_vq.info.vq = vq;
+
 	return vq;
 
 err:
@@ -418,6 +442,9 @@ static void del_vq(struct virtio_pci_vq_info *info)
 	struct virtio_pci_device *vp_dev = to_vp_device(vq->vdev);
 	struct virtio_pci_modern_device *mdev = &vp_dev->mdev;
 
+	if (vp_is_avq(&vp_dev->vdev, vq->index))
+		vp_dev->admin_vq.info.vq = NULL;
+
 	if (vp_dev->msix_enabled)
 		vp_modern_queue_vector(mdev, vq->index,
 				       VIRTIO_MSI_NO_VECTOR);
@@ -527,6 +554,45 @@ static bool vp_get_shm_region(struct virtio_device *vdev,
 	return true;
 }
 
+static int vp_modern_create_avq(struct virtio_device *vdev)
+{
+	struct virtio_pci_device *vp_dev = to_vp_device(vdev);
+	struct virtio_pci_admin_vq *avq;
+	struct virtqueue *vq;
+	u16 admin_q_num;
+
+	if (!virtio_has_feature(vdev, VIRTIO_F_ADMIN_VQ))
+		return 0;
+
+	admin_q_num = vp_modern_avq_num(&vp_dev->mdev);
+	if (!admin_q_num)
+		return -EINVAL;
+
+	avq = &vp_dev->admin_vq;
+	avq->vq_index = vp_modern_avq_index(&vp_dev->mdev);
+	sprintf(avq->name, "avq.%u", avq->vq_index);
+	vq = vp_dev->setup_vq(vp_dev, &vp_dev->admin_vq.info, avq->vq_index, NULL,
+			      avq->name, NULL, VIRTIO_MSI_NO_VECTOR);
+	if (IS_ERR(vq)) {
+		dev_err(&vdev->dev, "failed to setup admin virtqueue, err=%ld",
+			PTR_ERR(vq));
+		return PTR_ERR(vq);
+	}
+
+	vp_modern_set_queue_enable(&vp_dev->mdev, avq->info.vq->index, true);
+	return 0;
+}
+
+static void vp_modern_destroy_avq(struct virtio_device *vdev)
+{
+	struct virtio_pci_device *vp_dev = to_vp_device(vdev);
+
+	if (!virtio_has_feature(vdev, VIRTIO_F_ADMIN_VQ))
+		return;
+
+	vp_dev->del_vq(&vp_dev->admin_vq.info);
+}
+
 static const struct virtio_config_ops virtio_pci_config_nodev_ops = {
 	.get		= NULL,
 	.set		= NULL,
@@ -545,6 +611,8 @@ static const struct virtio_config_ops virtio_pci_config_nodev_ops = {
 	.get_shm_region  = vp_get_shm_region,
 	.disable_vq_and_reset = vp_modern_disable_vq_and_reset,
 	.enable_vq_after_reset = vp_modern_enable_vq_after_reset,
+	.create_avq = vp_modern_create_avq,
+	.destroy_avq = vp_modern_destroy_avq,
 };
 
 static const struct virtio_config_ops virtio_pci_config_ops = {
@@ -565,6 +633,8 @@ static const struct virtio_config_ops virtio_pci_config_ops = {
 	.get_shm_region  = vp_get_shm_region,
 	.disable_vq_and_reset = vp_modern_disable_vq_and_reset,
 	.enable_vq_after_reset = vp_modern_enable_vq_after_reset,
+	.create_avq = vp_modern_create_avq,
+	.destroy_avq = vp_modern_destroy_avq,
 };
 
 /* the PCI probing function */
@@ -588,6 +658,7 @@ int virtio_pci_modern_probe(struct virtio_pci_device *vp_dev)
 	vp_dev->config_vector = vp_config_vector;
 	vp_dev->setup_vq = setup_vq;
 	vp_dev->del_vq = del_vq;
+	vp_dev->is_avq = vp_is_avq;
 	vp_dev->isr = mdev->isr;
 	vp_dev->vdev.id = mdev->id;
 
diff --git a/drivers/virtio/virtio_pci_modern_dev.c b/drivers/virtio/virtio_pci_modern_dev.c
index 7de8b1ebabac..0d3dbfaf4b23 100644
--- a/drivers/virtio/virtio_pci_modern_dev.c
+++ b/drivers/virtio/virtio_pci_modern_dev.c
@@ -207,6 +207,10 @@ static inline void check_offsets(void)
 		     offsetof(struct virtio_pci_modern_common_cfg, queue_notify_data));
 	BUILD_BUG_ON(VIRTIO_PCI_COMMON_Q_RESET !=
 		     offsetof(struct virtio_pci_modern_common_cfg, queue_reset));
+	BUILD_BUG_ON(VIRTIO_PCI_COMMON_ADM_Q_IDX !=
+		     offsetof(struct virtio_pci_modern_common_cfg, admin_queue_index));
+	BUILD_BUG_ON(VIRTIO_PCI_COMMON_ADM_Q_NUM !=
+		     offsetof(struct virtio_pci_modern_common_cfg, admin_queue_num));
 }
 
 /*
@@ -296,7 +300,7 @@ int vp_modern_probe(struct virtio_pci_modern_device *mdev)
 	mdev->common = vp_modern_map_capability(mdev, common,
 			      sizeof(struct virtio_pci_common_cfg), 4, 0,
 			      offsetofend(struct virtio_pci_modern_common_cfg,
-					  queue_reset),
+					  admin_queue_num),
 			      &mdev->common_len, NULL);
 	if (!mdev->common)
 		goto err_map_common;
@@ -719,6 +723,24 @@ void __iomem *vp_modern_map_vq_notify(struct virtio_pci_modern_device *mdev,
 }
 EXPORT_SYMBOL_GPL(vp_modern_map_vq_notify);
 
+u16 vp_modern_avq_num(struct virtio_pci_modern_device *mdev)
+{
+	struct virtio_pci_modern_common_cfg __iomem *cfg;
+
+	cfg = (struct virtio_pci_modern_common_cfg __iomem *)mdev->common;
+	return vp_ioread16(&cfg->admin_queue_num);
+}
+EXPORT_SYMBOL_GPL(vp_modern_avq_num);
+
+u16 vp_modern_avq_index(struct virtio_pci_modern_device *mdev)
+{
+	struct virtio_pci_modern_common_cfg __iomem *cfg;
+
+	cfg = (struct virtio_pci_modern_common_cfg __iomem *)mdev->common;
+	return vp_ioread16(&cfg->admin_queue_index);
+}
+EXPORT_SYMBOL_GPL(vp_modern_avq_index);
+
 MODULE_VERSION("0.1");
 MODULE_DESCRIPTION("Modern Virtio PCI Device");
 MODULE_AUTHOR("Jason Wang <jasowang@redhat.com>");
diff --git a/include/linux/virtio_config.h b/include/linux/virtio_config.h
index 2b3438de2c4d..da9b271b54db 100644
--- a/include/linux/virtio_config.h
+++ b/include/linux/virtio_config.h
@@ -93,6 +93,8 @@ typedef void vq_callback_t(struct virtqueue *);
  *	Returns 0 on success or error status
  *	If disable_vq_and_reset is set, then enable_vq_after_reset must also be
  *	set.
+ * @create_avq: create admin virtqueue resource.
+ * @destroy_avq: destroy admin virtqueue resource.
  */
 struct virtio_config_ops {
 	void (*get)(struct virtio_device *vdev, unsigned offset,
@@ -120,6 +122,8 @@ struct virtio_config_ops {
 			       struct virtio_shm_region *region, u8 id);
 	int (*disable_vq_and_reset)(struct virtqueue *vq);
 	int (*enable_vq_after_reset)(struct virtqueue *vq);
+	int (*create_avq)(struct virtio_device *vdev);
+	void (*destroy_avq)(struct virtio_device *vdev);
 };
 
 /* If driver didn't advertise the feature, it will never appear. */
diff --git a/include/linux/virtio_pci_modern.h b/include/linux/virtio_pci_modern.h
index a09e13a577a9..c0b1b1ca1163 100644
--- a/include/linux/virtio_pci_modern.h
+++ b/include/linux/virtio_pci_modern.h
@@ -125,4 +125,6 @@ int vp_modern_probe(struct virtio_pci_modern_device *mdev);
 void vp_modern_remove(struct virtio_pci_modern_device *mdev);
 int vp_modern_get_queue_reset(struct virtio_pci_modern_device *mdev, u16 index);
 void vp_modern_set_queue_reset(struct virtio_pci_modern_device *mdev, u16 index);
+u16 vp_modern_avq_num(struct virtio_pci_modern_device *mdev);
+u16 vp_modern_avq_index(struct virtio_pci_modern_device *mdev);
 #endif
diff --git a/include/uapi/linux/virtio_pci.h b/include/uapi/linux/virtio_pci.h
index 44f4dd2add18..240ddeef7eae 100644
--- a/include/uapi/linux/virtio_pci.h
+++ b/include/uapi/linux/virtio_pci.h
@@ -175,6 +175,9 @@ struct virtio_pci_modern_common_cfg {
 
 	__le16 queue_notify_data;	/* read-write */
 	__le16 queue_reset;		/* read-write */
+
+	__le16 admin_queue_index;	/* read-only */
+	__le16 admin_queue_num;		/* read-only */
 };
 
 /* Fields in VIRTIO_PCI_CAP_PCI_CFG: */
@@ -215,6 +218,8 @@ struct virtio_pci_cfg_cap {
 #define VIRTIO_PCI_COMMON_Q_USEDHI	52
 #define VIRTIO_PCI_COMMON_Q_NDATA	56
 #define VIRTIO_PCI_COMMON_Q_RESET	58
+#define VIRTIO_PCI_COMMON_ADM_Q_IDX	60
+#define VIRTIO_PCI_COMMON_ADM_Q_NUM	62
 
 #endif /* VIRTIO_PCI_NO_MODERN */
 
-- 
cgit v1.2.3


From 92792ac752aa80d5ee71bc291d90edd06cd76bd1 Mon Sep 17 00:00:00 2001
From: Feng Liu <feliu@nvidia.com>
Date: Tue, 19 Dec 2023 11:32:41 +0200
Subject: virtio-pci: Introduce admin command sending function

Add support for sending admin command through admin virtqueue interface.
Abort any inflight admin commands once device reset completes. Activate
admin queue when device becomes ready; deactivate on device reset.

To comply to the below specification statement [1], the admin virtqueue
is activated for upper layer users only after setting DRIVER_OK status.

[1] The driver MUST NOT send any buffer available notifications to the
device before setting DRIVER_OK.

Signed-off-by: Feng Liu <feliu@nvidia.com>
Reviewed-by: Parav Pandit <parav@nvidia.com>
Acked-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Yishai Hadas <yishaih@nvidia.com>
Link: https://lore.kernel.org/r/20231219093247.170936-4-yishaih@nvidia.com
Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
---
 drivers/virtio/virtio_pci_common.h |   6 ++
 drivers/virtio/virtio_pci_modern.c | 143 ++++++++++++++++++++++++++++++++++++-
 include/linux/virtio.h             |   8 +++
 include/uapi/linux/virtio_pci.h    |  22 ++++++
 4 files changed, 177 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/virtio/virtio_pci_common.h b/drivers/virtio/virtio_pci_common.h
index 7306128e63e9..282d087a9266 100644
--- a/drivers/virtio/virtio_pci_common.h
+++ b/drivers/virtio/virtio_pci_common.h
@@ -29,6 +29,7 @@
 #include <linux/virtio_pci_modern.h>
 #include <linux/highmem.h>
 #include <linux/spinlock.h>
+#include <linux/mutex.h>
 
 struct virtio_pci_vq_info {
 	/* the actual virtqueue */
@@ -44,6 +45,8 @@ struct virtio_pci_vq_info {
 struct virtio_pci_admin_vq {
 	/* Virtqueue info associated with this admin queue. */
 	struct virtio_pci_vq_info info;
+	/* serializing admin commands execution and virtqueue deletion */
+	struct mutex cmd_lock;
 	/* Name of the admin queue: avq.$vq_index. */
 	char name[10];
 	u16 vq_index;
@@ -152,4 +155,7 @@ static inline void virtio_pci_legacy_remove(struct virtio_pci_device *vp_dev)
 int virtio_pci_modern_probe(struct virtio_pci_device *);
 void virtio_pci_modern_remove(struct virtio_pci_device *);
 
+int vp_modern_admin_cmd_exec(struct virtio_device *vdev,
+			     struct virtio_admin_cmd *cmd);
+
 #endif
diff --git a/drivers/virtio/virtio_pci_modern.c b/drivers/virtio/virtio_pci_modern.c
index ce915018b5b0..9bd66300a80a 100644
--- a/drivers/virtio/virtio_pci_modern.c
+++ b/drivers/virtio/virtio_pci_modern.c
@@ -38,6 +38,132 @@ static bool vp_is_avq(struct virtio_device *vdev, unsigned int index)
 	return index == vp_dev->admin_vq.vq_index;
 }
 
+static int virtqueue_exec_admin_cmd(struct virtio_pci_admin_vq *admin_vq,
+				    struct scatterlist **sgs,
+				    unsigned int out_num,
+				    unsigned int in_num,
+				    void *data)
+{
+	struct virtqueue *vq;
+	int ret, len;
+
+	vq = admin_vq->info.vq;
+	if (!vq)
+		return -EIO;
+
+	ret = virtqueue_add_sgs(vq, sgs, out_num, in_num, data, GFP_KERNEL);
+	if (ret < 0)
+		return -EIO;
+
+	if (unlikely(!virtqueue_kick(vq)))
+		return -EIO;
+
+	while (!virtqueue_get_buf(vq, &len) &&
+	       !virtqueue_is_broken(vq))
+		cpu_relax();
+
+	if (virtqueue_is_broken(vq))
+		return -EIO;
+
+	return 0;
+}
+
+int vp_modern_admin_cmd_exec(struct virtio_device *vdev,
+			     struct virtio_admin_cmd *cmd)
+{
+	struct scatterlist *sgs[VIRTIO_AVQ_SGS_MAX], hdr, stat;
+	struct virtio_pci_device *vp_dev = to_vp_device(vdev);
+	struct virtio_admin_cmd_status *va_status;
+	unsigned int out_num = 0, in_num = 0;
+	struct virtio_admin_cmd_hdr *va_hdr;
+	u16 status;
+	int ret;
+
+	if (!virtio_has_feature(vdev, VIRTIO_F_ADMIN_VQ))
+		return -EOPNOTSUPP;
+
+	va_status = kzalloc(sizeof(*va_status), GFP_KERNEL);
+	if (!va_status)
+		return -ENOMEM;
+
+	va_hdr = kzalloc(sizeof(*va_hdr), GFP_KERNEL);
+	if (!va_hdr) {
+		ret = -ENOMEM;
+		goto err_alloc;
+	}
+
+	va_hdr->opcode = cmd->opcode;
+	va_hdr->group_type = cmd->group_type;
+	va_hdr->group_member_id = cmd->group_member_id;
+
+	/* Add header */
+	sg_init_one(&hdr, va_hdr, sizeof(*va_hdr));
+	sgs[out_num] = &hdr;
+	out_num++;
+
+	if (cmd->data_sg) {
+		sgs[out_num] = cmd->data_sg;
+		out_num++;
+	}
+
+	/* Add return status */
+	sg_init_one(&stat, va_status, sizeof(*va_status));
+	sgs[out_num + in_num] = &stat;
+	in_num++;
+
+	if (cmd->result_sg) {
+		sgs[out_num + in_num] = cmd->result_sg;
+		in_num++;
+	}
+
+	mutex_lock(&vp_dev->admin_vq.cmd_lock);
+	ret = virtqueue_exec_admin_cmd(&vp_dev->admin_vq, sgs,
+				       out_num, in_num, sgs);
+	mutex_unlock(&vp_dev->admin_vq.cmd_lock);
+
+	if (ret) {
+		dev_err(&vdev->dev,
+			"Failed to execute command on admin vq: %d\n.", ret);
+		goto err_cmd_exec;
+	}
+
+	status = le16_to_cpu(va_status->status);
+	if (status != VIRTIO_ADMIN_STATUS_OK) {
+		dev_err(&vdev->dev,
+			"admin command error: status(%#x) qualifier(%#x)\n",
+			status, le16_to_cpu(va_status->status_qualifier));
+		ret = -status;
+	}
+
+err_cmd_exec:
+	kfree(va_hdr);
+err_alloc:
+	kfree(va_status);
+	return ret;
+}
+
+static void vp_modern_avq_activate(struct virtio_device *vdev)
+{
+	struct virtio_pci_device *vp_dev = to_vp_device(vdev);
+	struct virtio_pci_admin_vq *admin_vq = &vp_dev->admin_vq;
+
+	if (!virtio_has_feature(vdev, VIRTIO_F_ADMIN_VQ))
+		return;
+
+	__virtqueue_unbreak(admin_vq->info.vq);
+}
+
+static void vp_modern_avq_deactivate(struct virtio_device *vdev)
+{
+	struct virtio_pci_device *vp_dev = to_vp_device(vdev);
+	struct virtio_pci_admin_vq *admin_vq = &vp_dev->admin_vq;
+
+	if (!virtio_has_feature(vdev, VIRTIO_F_ADMIN_VQ))
+		return;
+
+	__virtqueue_break(admin_vq->info.vq);
+}
+
 static void vp_transport_features(struct virtio_device *vdev, u64 features)
 {
 	struct virtio_pci_device *vp_dev = to_vp_device(vdev);
@@ -213,6 +339,8 @@ static void vp_set_status(struct virtio_device *vdev, u8 status)
 	/* We should never be setting status to 0. */
 	BUG_ON(status == 0);
 	vp_modern_set_status(&vp_dev->mdev, status);
+	if (status & VIRTIO_CONFIG_S_DRIVER_OK)
+		vp_modern_avq_activate(vdev);
 }
 
 static void vp_reset(struct virtio_device *vdev)
@@ -229,6 +357,9 @@ static void vp_reset(struct virtio_device *vdev)
 	 */
 	while (vp_modern_get_status(mdev))
 		msleep(1);
+
+	vp_modern_avq_deactivate(vdev);
+
 	/* Flush pending VQ/configuration callbacks. */
 	vp_synchronize_vectors(vdev);
 }
@@ -404,8 +535,11 @@ static struct virtqueue *setup_vq(struct virtio_pci_device *vp_dev,
 		goto err;
 	}
 
-	if (is_avq)
+	if (is_avq) {
+		mutex_lock(&vp_dev->admin_vq.cmd_lock);
 		vp_dev->admin_vq.info.vq = vq;
+		mutex_unlock(&vp_dev->admin_vq.cmd_lock);
+	}
 
 	return vq;
 
@@ -442,8 +576,11 @@ static void del_vq(struct virtio_pci_vq_info *info)
 	struct virtio_pci_device *vp_dev = to_vp_device(vq->vdev);
 	struct virtio_pci_modern_device *mdev = &vp_dev->mdev;
 
-	if (vp_is_avq(&vp_dev->vdev, vq->index))
+	if (vp_is_avq(&vp_dev->vdev, vq->index)) {
+		mutex_lock(&vp_dev->admin_vq.cmd_lock);
 		vp_dev->admin_vq.info.vq = NULL;
+		mutex_unlock(&vp_dev->admin_vq.cmd_lock);
+	}
 
 	if (vp_dev->msix_enabled)
 		vp_modern_queue_vector(mdev, vq->index,
@@ -662,6 +799,7 @@ int virtio_pci_modern_probe(struct virtio_pci_device *vp_dev)
 	vp_dev->isr = mdev->isr;
 	vp_dev->vdev.id = mdev->id;
 
+	mutex_init(&vp_dev->admin_vq.cmd_lock);
 	return 0;
 }
 
@@ -669,5 +807,6 @@ void virtio_pci_modern_remove(struct virtio_pci_device *vp_dev)
 {
 	struct virtio_pci_modern_device *mdev = &vp_dev->mdev;
 
+	mutex_destroy(&vp_dev->admin_vq.cmd_lock);
 	vp_modern_remove(mdev);
 }
diff --git a/include/linux/virtio.h b/include/linux/virtio.h
index 4cc614a38376..b0201747a263 100644
--- a/include/linux/virtio.h
+++ b/include/linux/virtio.h
@@ -103,6 +103,14 @@ int virtqueue_resize(struct virtqueue *vq, u32 num,
 int virtqueue_reset(struct virtqueue *vq,
 		    void (*recycle)(struct virtqueue *vq, void *buf));
 
+struct virtio_admin_cmd {
+	__le16 opcode;
+	__le16 group_type;
+	__le64 group_member_id;
+	struct scatterlist *data_sg;
+	struct scatterlist *result_sg;
+};
+
 /**
  * struct virtio_device - representation of a device using virtio
  * @index: unique position on the virtio bus
diff --git a/include/uapi/linux/virtio_pci.h b/include/uapi/linux/virtio_pci.h
index 240ddeef7eae..187fd9e34a30 100644
--- a/include/uapi/linux/virtio_pci.h
+++ b/include/uapi/linux/virtio_pci.h
@@ -223,4 +223,26 @@ struct virtio_pci_cfg_cap {
 
 #endif /* VIRTIO_PCI_NO_MODERN */
 
+/* Admin command status. */
+#define VIRTIO_ADMIN_STATUS_OK		0
+
+struct __packed virtio_admin_cmd_hdr {
+	__le16 opcode;
+	/*
+	 * 1 - SR-IOV
+	 * 2-65535 - reserved
+	 */
+	__le16 group_type;
+	/* Unused, reserved for future extensions. */
+	__u8 reserved1[12];
+	__le64 group_member_id;
+};
+
+struct __packed virtio_admin_cmd_status {
+	__le16 status;
+	__le16 status_qualifier;
+	/* Unused, reserved for future extensions. */
+	__u8 reserved2[4];
+};
+
 #endif
-- 
cgit v1.2.3


From c3fc3e098bd64c560dde49a6e72b21b055150abe Mon Sep 17 00:00:00 2001
From: Yishai Hadas <yishaih@nvidia.com>
Date: Tue, 19 Dec 2023 11:32:44 +0200
Subject: virtio-pci: Introduce APIs to execute legacy IO admin commands

Introduce APIs to execute legacy IO admin commands.

It includes: io_legacy_read/write for both common and the device
configuration, io_legacy_notify_info.

In addition, exposing an API to check whether the legacy IO commands are
supported. (i.e. virtio_pci_admin_has_legacy_io()).

Those APIs will be used by the next patches from this series.

Note:
Unlike modern drivers which support hardware virtio devices, legacy
drivers assume software-based devices: e.g. they don't use proper memory
barriers on ARM, use big endian on PPC, etc. X86 drivers are mostly ok
though, more or less by chance. For now, only support legacy IO on X86.

Acked-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Yishai Hadas <yishaih@nvidia.com>
Link: https://lore.kernel.org/r/20231219093247.170936-7-yishaih@nvidia.com
Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
---
 drivers/virtio/Makefile                     |   1 +
 drivers/virtio/virtio_pci_admin_legacy_io.c | 244 ++++++++++++++++++++++++++++
 drivers/virtio/virtio_pci_common.c          |  11 ++
 drivers/virtio/virtio_pci_common.h          |   2 +
 include/linux/virtio_pci_admin.h            |  23 +++
 5 files changed, 281 insertions(+)
 create mode 100644 drivers/virtio/virtio_pci_admin_legacy_io.c
 create mode 100644 include/linux/virtio_pci_admin.h

(limited to 'include/linux')

diff --git a/drivers/virtio/Makefile b/drivers/virtio/Makefile
index 8e98d24917cc..73ace62af440 100644
--- a/drivers/virtio/Makefile
+++ b/drivers/virtio/Makefile
@@ -7,6 +7,7 @@ obj-$(CONFIG_VIRTIO_MMIO) += virtio_mmio.o
 obj-$(CONFIG_VIRTIO_PCI) += virtio_pci.o
 virtio_pci-y := virtio_pci_modern.o virtio_pci_common.o
 virtio_pci-$(CONFIG_VIRTIO_PCI_LEGACY) += virtio_pci_legacy.o
+virtio_pci-$(CONFIG_VIRTIO_PCI_ADMIN_LEGACY) += virtio_pci_admin_legacy_io.o
 obj-$(CONFIG_VIRTIO_BALLOON) += virtio_balloon.o
 obj-$(CONFIG_VIRTIO_INPUT) += virtio_input.o
 obj-$(CONFIG_VIRTIO_VDPA) += virtio_vdpa.o
diff --git a/drivers/virtio/virtio_pci_admin_legacy_io.c b/drivers/virtio/virtio_pci_admin_legacy_io.c
new file mode 100644
index 000000000000..819cfbbc67c3
--- /dev/null
+++ b/drivers/virtio/virtio_pci_admin_legacy_io.c
@@ -0,0 +1,244 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved
+ */
+
+#include <linux/virtio_pci_admin.h>
+#include "virtio_pci_common.h"
+
+/*
+ * virtio_pci_admin_has_legacy_io - Checks whether the legacy IO
+ * commands are supported
+ * @dev: VF pci_dev
+ *
+ * Returns true on success.
+ */
+bool virtio_pci_admin_has_legacy_io(struct pci_dev *pdev)
+{
+	struct virtio_device *virtio_dev = virtio_pci_vf_get_pf_dev(pdev);
+	struct virtio_pci_device *vp_dev;
+
+	if (!virtio_dev)
+		return false;
+
+	if (!virtio_has_feature(virtio_dev, VIRTIO_F_ADMIN_VQ))
+		return false;
+
+	vp_dev = to_vp_device(virtio_dev);
+
+	if ((vp_dev->admin_vq.supported_cmds & VIRTIO_LEGACY_ADMIN_CMD_BITMAP) ==
+		VIRTIO_LEGACY_ADMIN_CMD_BITMAP)
+		return true;
+	return false;
+}
+EXPORT_SYMBOL_GPL(virtio_pci_admin_has_legacy_io);
+
+static int virtio_pci_admin_legacy_io_write(struct pci_dev *pdev, u16 opcode,
+					    u8 offset, u8 size, u8 *buf)
+{
+	struct virtio_device *virtio_dev = virtio_pci_vf_get_pf_dev(pdev);
+	struct virtio_admin_cmd_legacy_wr_data *data;
+	struct virtio_admin_cmd cmd = {};
+	struct scatterlist data_sg;
+	int vf_id;
+	int ret;
+
+	if (!virtio_dev)
+		return -ENODEV;
+
+	vf_id = pci_iov_vf_id(pdev);
+	if (vf_id < 0)
+		return vf_id;
+
+	data = kzalloc(sizeof(*data) + size, GFP_KERNEL);
+	if (!data)
+		return -ENOMEM;
+
+	data->offset = offset;
+	memcpy(data->registers, buf, size);
+	sg_init_one(&data_sg, data, sizeof(*data) + size);
+	cmd.opcode = cpu_to_le16(opcode);
+	cmd.group_type = cpu_to_le16(VIRTIO_ADMIN_GROUP_TYPE_SRIOV);
+	cmd.group_member_id = cpu_to_le64(vf_id + 1);
+	cmd.data_sg = &data_sg;
+	ret = vp_modern_admin_cmd_exec(virtio_dev, &cmd);
+
+	kfree(data);
+	return ret;
+}
+
+/*
+ * virtio_pci_admin_legacy_io_write_common - Write legacy common configuration
+ * of a member device
+ * @dev: VF pci_dev
+ * @offset: starting byte offset within the common configuration area to write to
+ * @size: size of the data to write
+ * @buf: buffer which holds the data
+ *
+ * Note: caller must serialize access for the given device.
+ * Returns 0 on success, or negative on failure.
+ */
+int virtio_pci_admin_legacy_common_io_write(struct pci_dev *pdev, u8 offset,
+					    u8 size, u8 *buf)
+{
+	return virtio_pci_admin_legacy_io_write(pdev,
+					VIRTIO_ADMIN_CMD_LEGACY_COMMON_CFG_WRITE,
+					offset, size, buf);
+}
+EXPORT_SYMBOL_GPL(virtio_pci_admin_legacy_common_io_write);
+
+/*
+ * virtio_pci_admin_legacy_io_write_device - Write legacy device configuration
+ * of a member device
+ * @dev: VF pci_dev
+ * @offset: starting byte offset within the device configuration area to write to
+ * @size: size of the data to write
+ * @buf: buffer which holds the data
+ *
+ * Note: caller must serialize access for the given device.
+ * Returns 0 on success, or negative on failure.
+ */
+int virtio_pci_admin_legacy_device_io_write(struct pci_dev *pdev, u8 offset,
+					    u8 size, u8 *buf)
+{
+	return virtio_pci_admin_legacy_io_write(pdev,
+					VIRTIO_ADMIN_CMD_LEGACY_DEV_CFG_WRITE,
+					offset, size, buf);
+}
+EXPORT_SYMBOL_GPL(virtio_pci_admin_legacy_device_io_write);
+
+static int virtio_pci_admin_legacy_io_read(struct pci_dev *pdev, u16 opcode,
+					   u8 offset, u8 size, u8 *buf)
+{
+	struct virtio_device *virtio_dev = virtio_pci_vf_get_pf_dev(pdev);
+	struct virtio_admin_cmd_legacy_rd_data *data;
+	struct scatterlist data_sg, result_sg;
+	struct virtio_admin_cmd cmd = {};
+	int vf_id;
+	int ret;
+
+	if (!virtio_dev)
+		return -ENODEV;
+
+	vf_id = pci_iov_vf_id(pdev);
+	if (vf_id < 0)
+		return vf_id;
+
+	data = kzalloc(sizeof(*data), GFP_KERNEL);
+	if (!data)
+		return -ENOMEM;
+
+	data->offset = offset;
+	sg_init_one(&data_sg, data, sizeof(*data));
+	sg_init_one(&result_sg, buf, size);
+	cmd.opcode = cpu_to_le16(opcode);
+	cmd.group_type = cpu_to_le16(VIRTIO_ADMIN_GROUP_TYPE_SRIOV);
+	cmd.group_member_id = cpu_to_le64(vf_id + 1);
+	cmd.data_sg = &data_sg;
+	cmd.result_sg = &result_sg;
+	ret = vp_modern_admin_cmd_exec(virtio_dev, &cmd);
+
+	kfree(data);
+	return ret;
+}
+
+/*
+ * virtio_pci_admin_legacy_device_io_read - Read legacy device configuration of
+ * a member device
+ * @dev: VF pci_dev
+ * @offset: starting byte offset within the device configuration area to read from
+ * @size: size of the data to be read
+ * @buf: buffer to hold the returned data
+ *
+ * Note: caller must serialize access for the given device.
+ * Returns 0 on success, or negative on failure.
+ */
+int virtio_pci_admin_legacy_device_io_read(struct pci_dev *pdev, u8 offset,
+					   u8 size, u8 *buf)
+{
+	return virtio_pci_admin_legacy_io_read(pdev,
+					VIRTIO_ADMIN_CMD_LEGACY_DEV_CFG_READ,
+					offset, size, buf);
+}
+EXPORT_SYMBOL_GPL(virtio_pci_admin_legacy_device_io_read);
+
+/*
+ * virtio_pci_admin_legacy_common_io_read - Read legacy common configuration of
+ * a member device
+ * @dev: VF pci_dev
+ * @offset: starting byte offset within the common configuration area to read from
+ * @size: size of the data to be read
+ * @buf: buffer to hold the returned data
+ *
+ * Note: caller must serialize access for the given device.
+ * Returns 0 on success, or negative on failure.
+ */
+int virtio_pci_admin_legacy_common_io_read(struct pci_dev *pdev, u8 offset,
+					   u8 size, u8 *buf)
+{
+	return virtio_pci_admin_legacy_io_read(pdev,
+					VIRTIO_ADMIN_CMD_LEGACY_COMMON_CFG_READ,
+					offset, size, buf);
+}
+EXPORT_SYMBOL_GPL(virtio_pci_admin_legacy_common_io_read);
+
+/*
+ * virtio_pci_admin_legacy_io_notify_info - Read the queue notification
+ * information for legacy interface
+ * @dev: VF pci_dev
+ * @req_bar_flags: requested bar flags
+ * @bar: on output the BAR number of the owner or member device
+ * @bar_offset: on output the offset within bar
+ *
+ * Returns 0 on success, or negative on failure.
+ */
+int virtio_pci_admin_legacy_io_notify_info(struct pci_dev *pdev,
+					   u8 req_bar_flags, u8 *bar,
+					   u64 *bar_offset)
+{
+	struct virtio_device *virtio_dev = virtio_pci_vf_get_pf_dev(pdev);
+	struct virtio_admin_cmd_notify_info_result *result;
+	struct virtio_admin_cmd cmd = {};
+	struct scatterlist result_sg;
+	int vf_id;
+	int ret;
+
+	if (!virtio_dev)
+		return -ENODEV;
+
+	vf_id = pci_iov_vf_id(pdev);
+	if (vf_id < 0)
+		return vf_id;
+
+	result = kzalloc(sizeof(*result), GFP_KERNEL);
+	if (!result)
+		return -ENOMEM;
+
+	sg_init_one(&result_sg, result, sizeof(*result));
+	cmd.opcode = cpu_to_le16(VIRTIO_ADMIN_CMD_LEGACY_NOTIFY_INFO);
+	cmd.group_type = cpu_to_le16(VIRTIO_ADMIN_GROUP_TYPE_SRIOV);
+	cmd.group_member_id = cpu_to_le64(vf_id + 1);
+	cmd.result_sg = &result_sg;
+	ret = vp_modern_admin_cmd_exec(virtio_dev, &cmd);
+	if (!ret) {
+		struct virtio_admin_cmd_notify_info_data *entry;
+		int i;
+
+		ret = -ENOENT;
+		for (i = 0; i < VIRTIO_ADMIN_CMD_MAX_NOTIFY_INFO; i++) {
+			entry = &result->entries[i];
+			if (entry->flags == VIRTIO_ADMIN_CMD_NOTIFY_INFO_FLAGS_END)
+				break;
+			if (entry->flags != req_bar_flags)
+				continue;
+			*bar = entry->bar;
+			*bar_offset = le64_to_cpu(entry->offset);
+			ret = 0;
+			break;
+		}
+	}
+
+	kfree(result);
+	return ret;
+}
+EXPORT_SYMBOL_GPL(virtio_pci_admin_legacy_io_notify_info);
diff --git a/drivers/virtio/virtio_pci_common.c b/drivers/virtio/virtio_pci_common.c
index fafd13d0e4d4..9f93330fc2cc 100644
--- a/drivers/virtio/virtio_pci_common.c
+++ b/drivers/virtio/virtio_pci_common.c
@@ -645,6 +645,17 @@ static struct pci_driver virtio_pci_driver = {
 	.sriov_configure = virtio_pci_sriov_configure,
 };
 
+struct virtio_device *virtio_pci_vf_get_pf_dev(struct pci_dev *pdev)
+{
+	struct virtio_pci_device *pf_vp_dev;
+
+	pf_vp_dev = pci_iov_get_pf_drvdata(pdev, &virtio_pci_driver);
+	if (IS_ERR(pf_vp_dev))
+		return NULL;
+
+	return &pf_vp_dev->vdev;
+}
+
 module_pci_driver(virtio_pci_driver);
 
 MODULE_AUTHOR("Anthony Liguori <aliguori@us.ibm.com>");
diff --git a/drivers/virtio/virtio_pci_common.h b/drivers/virtio/virtio_pci_common.h
index a39bffd5fd46..7fef52bee455 100644
--- a/drivers/virtio/virtio_pci_common.h
+++ b/drivers/virtio/virtio_pci_common.h
@@ -156,6 +156,8 @@ static inline void virtio_pci_legacy_remove(struct virtio_pci_device *vp_dev)
 int virtio_pci_modern_probe(struct virtio_pci_device *);
 void virtio_pci_modern_remove(struct virtio_pci_device *);
 
+struct virtio_device *virtio_pci_vf_get_pf_dev(struct pci_dev *pdev);
+
 #define VIRTIO_LEGACY_ADMIN_CMD_BITMAP \
 	(BIT_ULL(VIRTIO_ADMIN_CMD_LEGACY_COMMON_CFG_WRITE) | \
 	 BIT_ULL(VIRTIO_ADMIN_CMD_LEGACY_COMMON_CFG_READ) | \
diff --git a/include/linux/virtio_pci_admin.h b/include/linux/virtio_pci_admin.h
new file mode 100644
index 000000000000..f4a100a0fe2e
--- /dev/null
+++ b/include/linux/virtio_pci_admin.h
@@ -0,0 +1,23 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _LINUX_VIRTIO_PCI_ADMIN_H
+#define _LINUX_VIRTIO_PCI_ADMIN_H
+
+#include <linux/types.h>
+#include <linux/pci.h>
+
+#ifdef CONFIG_VIRTIO_PCI_ADMIN_LEGACY
+bool virtio_pci_admin_has_legacy_io(struct pci_dev *pdev);
+int virtio_pci_admin_legacy_common_io_write(struct pci_dev *pdev, u8 offset,
+					    u8 size, u8 *buf);
+int virtio_pci_admin_legacy_common_io_read(struct pci_dev *pdev, u8 offset,
+					   u8 size, u8 *buf);
+int virtio_pci_admin_legacy_device_io_write(struct pci_dev *pdev, u8 offset,
+					    u8 size, u8 *buf);
+int virtio_pci_admin_legacy_device_io_read(struct pci_dev *pdev, u8 offset,
+					   u8 size, u8 *buf);
+int virtio_pci_admin_legacy_io_notify_info(struct pci_dev *pdev,
+					   u8 req_bar_flags, u8 *bar,
+					   u64 *bar_offset);
+#endif
+
+#endif /* _LINUX_VIRTIO_PCI_ADMIN_H */
-- 
cgit v1.2.3


From 8bccc5b80678c69f7729ce4cd232c0aa98fa6277 Mon Sep 17 00:00:00 2001
From: Yishai Hadas <yishaih@nvidia.com>
Date: Tue, 19 Dec 2023 11:32:45 +0200
Subject: vfio/pci: Expose vfio_pci_core_setup_barmap()

Expose vfio_pci_core_setup_barmap() to be used by drivers.

This will let drivers to mmap a BAR and re-use it from both vfio and the
driver when it's applicable.

This API will be used in the next patches by the vfio/virtio coming
driver.

Reviewed-by: Jason Gunthorpe <jgg@nvidia.com>
Reviewed-by: Kevin Tian <kevin.tian@intel.com>
Signed-off-by: Yishai Hadas <yishaih@nvidia.com>
Acked-by: Michael S. Tsirkin <mst@redhat.com>
Link: https://lore.kernel.org/r/20231219093247.170936-8-yishaih@nvidia.com
Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
---
 drivers/vfio/pci/vfio_pci_rdwr.c | 7 ++++---
 include/linux/vfio_pci_core.h    | 1 +
 2 files changed, 5 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/vfio/pci/vfio_pci_rdwr.c b/drivers/vfio/pci/vfio_pci_rdwr.c
index e27de61ac9fe..a9887fd6de46 100644
--- a/drivers/vfio/pci/vfio_pci_rdwr.c
+++ b/drivers/vfio/pci/vfio_pci_rdwr.c
@@ -200,7 +200,7 @@ static ssize_t do_io_rw(struct vfio_pci_core_device *vdev, bool test_mem,
 	return done;
 }
 
-static int vfio_pci_setup_barmap(struct vfio_pci_core_device *vdev, int bar)
+int vfio_pci_core_setup_barmap(struct vfio_pci_core_device *vdev, int bar)
 {
 	struct pci_dev *pdev = vdev->pdev;
 	int ret;
@@ -223,6 +223,7 @@ static int vfio_pci_setup_barmap(struct vfio_pci_core_device *vdev, int bar)
 
 	return 0;
 }
+EXPORT_SYMBOL_GPL(vfio_pci_core_setup_barmap);
 
 ssize_t vfio_pci_bar_rw(struct vfio_pci_core_device *vdev, char __user *buf,
 			size_t count, loff_t *ppos, bool iswrite)
@@ -262,7 +263,7 @@ ssize_t vfio_pci_bar_rw(struct vfio_pci_core_device *vdev, char __user *buf,
 		}
 		x_end = end;
 	} else {
-		int ret = vfio_pci_setup_barmap(vdev, bar);
+		int ret = vfio_pci_core_setup_barmap(vdev, bar);
 		if (ret) {
 			done = ret;
 			goto out;
@@ -438,7 +439,7 @@ int vfio_pci_ioeventfd(struct vfio_pci_core_device *vdev, loff_t offset,
 		return -EINVAL;
 #endif
 
-	ret = vfio_pci_setup_barmap(vdev, bar);
+	ret = vfio_pci_core_setup_barmap(vdev, bar);
 	if (ret)
 		return ret;
 
diff --git a/include/linux/vfio_pci_core.h b/include/linux/vfio_pci_core.h
index 562e8754869d..67ac58e20e1d 100644
--- a/include/linux/vfio_pci_core.h
+++ b/include/linux/vfio_pci_core.h
@@ -127,6 +127,7 @@ int vfio_pci_core_match(struct vfio_device *core_vdev, char *buf);
 int vfio_pci_core_enable(struct vfio_pci_core_device *vdev);
 void vfio_pci_core_disable(struct vfio_pci_core_device *vdev);
 void vfio_pci_core_finish_enable(struct vfio_pci_core_device *vdev);
+int vfio_pci_core_setup_barmap(struct vfio_pci_core_device *vdev, int bar);
 pci_ers_result_t vfio_pci_core_aer_err_detected(struct pci_dev *pdev,
 						pci_channel_state_t state);
 
-- 
cgit v1.2.3


From 8486ae162b3b6cc1055366f044495cf1966231f1 Mon Sep 17 00:00:00 2001
From: Yishai Hadas <yishaih@nvidia.com>
Date: Tue, 19 Dec 2023 11:32:46 +0200
Subject: vfio/pci: Expose vfio_pci_core_iowrite/read##size()

Expose vfio_pci_core_iowrite/read##size() to let it be used by drivers.

This functionality is needed to enable direct access to some physical
BAR of the device with the proper locks/checks in place.

The next patches from this series will use this functionality on a data
path flow when a direct access to the BAR is needed.

Reviewed-by: Jason Gunthorpe <jgg@nvidia.com>
Reviewed-by: Kevin Tian <kevin.tian@intel.com>
Signed-off-by: Yishai Hadas <yishaih@nvidia.com>
Acked-by: Michael S. Tsirkin <mst@redhat.com>
Link: https://lore.kernel.org/r/20231219093247.170936-9-yishaih@nvidia.com
Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
---
 drivers/vfio/pci/vfio_pci_rdwr.c | 50 +++++++++++++++++++++-------------------
 include/linux/vfio_pci_core.h    | 19 +++++++++++++++
 2 files changed, 45 insertions(+), 24 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/vfio/pci/vfio_pci_rdwr.c b/drivers/vfio/pci/vfio_pci_rdwr.c
index a9887fd6de46..07fea08ea8a2 100644
--- a/drivers/vfio/pci/vfio_pci_rdwr.c
+++ b/drivers/vfio/pci/vfio_pci_rdwr.c
@@ -38,7 +38,7 @@
 #define vfio_iowrite8	iowrite8
 
 #define VFIO_IOWRITE(size) \
-static int vfio_pci_iowrite##size(struct vfio_pci_core_device *vdev,		\
+int vfio_pci_core_iowrite##size(struct vfio_pci_core_device *vdev,	\
 			bool test_mem, u##size val, void __iomem *io)	\
 {									\
 	if (test_mem) {							\
@@ -55,7 +55,8 @@ static int vfio_pci_iowrite##size(struct vfio_pci_core_device *vdev,		\
 		up_read(&vdev->memory_lock);				\
 									\
 	return 0;							\
-}
+}									\
+EXPORT_SYMBOL_GPL(vfio_pci_core_iowrite##size);
 
 VFIO_IOWRITE(8)
 VFIO_IOWRITE(16)
@@ -65,7 +66,7 @@ VFIO_IOWRITE(64)
 #endif
 
 #define VFIO_IOREAD(size) \
-static int vfio_pci_ioread##size(struct vfio_pci_core_device *vdev,		\
+int vfio_pci_core_ioread##size(struct vfio_pci_core_device *vdev,	\
 			bool test_mem, u##size *val, void __iomem *io)	\
 {									\
 	if (test_mem) {							\
@@ -82,7 +83,8 @@ static int vfio_pci_ioread##size(struct vfio_pci_core_device *vdev,		\
 		up_read(&vdev->memory_lock);				\
 									\
 	return 0;							\
-}
+}									\
+EXPORT_SYMBOL_GPL(vfio_pci_core_ioread##size);
 
 VFIO_IOREAD(8)
 VFIO_IOREAD(16)
@@ -119,13 +121,13 @@ static ssize_t do_io_rw(struct vfio_pci_core_device *vdev, bool test_mem,
 				if (copy_from_user(&val, buf, 4))
 					return -EFAULT;
 
-				ret = vfio_pci_iowrite32(vdev, test_mem,
-							 val, io + off);
+				ret = vfio_pci_core_iowrite32(vdev, test_mem,
+							      val, io + off);
 				if (ret)
 					return ret;
 			} else {
-				ret = vfio_pci_ioread32(vdev, test_mem,
-							&val, io + off);
+				ret = vfio_pci_core_ioread32(vdev, test_mem,
+							     &val, io + off);
 				if (ret)
 					return ret;
 
@@ -141,13 +143,13 @@ static ssize_t do_io_rw(struct vfio_pci_core_device *vdev, bool test_mem,
 				if (copy_from_user(&val, buf, 2))
 					return -EFAULT;
 
-				ret = vfio_pci_iowrite16(vdev, test_mem,
-							 val, io + off);
+				ret = vfio_pci_core_iowrite16(vdev, test_mem,
+							      val, io + off);
 				if (ret)
 					return ret;
 			} else {
-				ret = vfio_pci_ioread16(vdev, test_mem,
-							&val, io + off);
+				ret = vfio_pci_core_ioread16(vdev, test_mem,
+							     &val, io + off);
 				if (ret)
 					return ret;
 
@@ -163,13 +165,13 @@ static ssize_t do_io_rw(struct vfio_pci_core_device *vdev, bool test_mem,
 				if (copy_from_user(&val, buf, 1))
 					return -EFAULT;
 
-				ret = vfio_pci_iowrite8(vdev, test_mem,
-							val, io + off);
+				ret = vfio_pci_core_iowrite8(vdev, test_mem,
+							     val, io + off);
 				if (ret)
 					return ret;
 			} else {
-				ret = vfio_pci_ioread8(vdev, test_mem,
-						       &val, io + off);
+				ret = vfio_pci_core_ioread8(vdev, test_mem,
+							    &val, io + off);
 				if (ret)
 					return ret;
 
@@ -364,21 +366,21 @@ static void vfio_pci_ioeventfd_do_write(struct vfio_pci_ioeventfd *ioeventfd,
 {
 	switch (ioeventfd->count) {
 	case 1:
-		vfio_pci_iowrite8(ioeventfd->vdev, test_mem,
-				  ioeventfd->data, ioeventfd->addr);
+		vfio_pci_core_iowrite8(ioeventfd->vdev, test_mem,
+				       ioeventfd->data, ioeventfd->addr);
 		break;
 	case 2:
-		vfio_pci_iowrite16(ioeventfd->vdev, test_mem,
-				   ioeventfd->data, ioeventfd->addr);
+		vfio_pci_core_iowrite16(ioeventfd->vdev, test_mem,
+					ioeventfd->data, ioeventfd->addr);
 		break;
 	case 4:
-		vfio_pci_iowrite32(ioeventfd->vdev, test_mem,
-				   ioeventfd->data, ioeventfd->addr);
+		vfio_pci_core_iowrite32(ioeventfd->vdev, test_mem,
+					ioeventfd->data, ioeventfd->addr);
 		break;
 #ifdef iowrite64
 	case 8:
-		vfio_pci_iowrite64(ioeventfd->vdev, test_mem,
-				   ioeventfd->data, ioeventfd->addr);
+		vfio_pci_core_iowrite64(ioeventfd->vdev, test_mem,
+					ioeventfd->data, ioeventfd->addr);
 		break;
 #endif
 	}
diff --git a/include/linux/vfio_pci_core.h b/include/linux/vfio_pci_core.h
index 67ac58e20e1d..85e84b92751b 100644
--- a/include/linux/vfio_pci_core.h
+++ b/include/linux/vfio_pci_core.h
@@ -131,4 +131,23 @@ int vfio_pci_core_setup_barmap(struct vfio_pci_core_device *vdev, int bar);
 pci_ers_result_t vfio_pci_core_aer_err_detected(struct pci_dev *pdev,
 						pci_channel_state_t state);
 
+#define VFIO_IOWRITE_DECLATION(size) \
+int vfio_pci_core_iowrite##size(struct vfio_pci_core_device *vdev,	\
+			bool test_mem, u##size val, void __iomem *io);
+
+VFIO_IOWRITE_DECLATION(8)
+VFIO_IOWRITE_DECLATION(16)
+VFIO_IOWRITE_DECLATION(32)
+#ifdef iowrite64
+VFIO_IOWRITE_DECLATION(64)
+#endif
+
+#define VFIO_IOREAD_DECLATION(size) \
+int vfio_pci_core_ioread##size(struct vfio_pci_core_device *vdev,	\
+			bool test_mem, u##size *val, void __iomem *io);
+
+VFIO_IOREAD_DECLATION(8)
+VFIO_IOREAD_DECLATION(16)
+VFIO_IOREAD_DECLATION(32)
+
 #endif /* VFIO_PCI_CORE_H */
-- 
cgit v1.2.3


From a4104821ad651d8a0b374f0b2474c345bbb42f82 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Tue, 19 Dec 2023 12:30:43 -0700
Subject: io_uring/unix: drop usage of io_uring socket

Since we no longer allow sending io_uring fds over SCM_RIGHTS, move to
using io_is_uring_fops() to detect whether this is a io_uring fd or not.
With that done, kill off io_uring_get_socket() as nobody calls it
anymore.

This is in preparation to yanking out the rest of the core related to
unix gc with io_uring.

Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/io_uring.h | 10 +++++-----
 io_uring/io_uring.c      | 13 -------------
 io_uring/io_uring.h      |  1 -
 net/core/scm.c           |  2 +-
 net/unix/scm.c           |  4 +---
 5 files changed, 7 insertions(+), 23 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/io_uring.h b/include/linux/io_uring.h
index d8fc93492dc5..68ed6697fece 100644
--- a/include/linux/io_uring.h
+++ b/include/linux/io_uring.h
@@ -7,12 +7,12 @@
 #include <uapi/linux/io_uring.h>
 
 #if defined(CONFIG_IO_URING)
-struct sock *io_uring_get_socket(struct file *file);
 void __io_uring_cancel(bool cancel_all);
 void __io_uring_free(struct task_struct *tsk);
 void io_uring_unreg_ringfd(void);
 const char *io_uring_get_opcode(u8 opcode);
 int io_uring_cmd_sock(struct io_uring_cmd *cmd, unsigned int issue_flags);
+bool io_is_uring_fops(struct file *file);
 
 static inline void io_uring_files_cancel(void)
 {
@@ -32,10 +32,6 @@ static inline void io_uring_free(struct task_struct *tsk)
 		__io_uring_free(tsk);
 }
 #else
-static inline struct sock *io_uring_get_socket(struct file *file)
-{
-	return NULL;
-}
 static inline void io_uring_task_cancel(void)
 {
 }
@@ -54,6 +50,10 @@ static inline int io_uring_cmd_sock(struct io_uring_cmd *cmd,
 {
 	return -EOPNOTSUPP;
 }
+static inline bool io_is_uring_fops(struct file *file)
+{
+	return false;
+}
 #endif
 
 #endif
diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index 2b24ce692b0b..a9acccd45880 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -171,19 +171,6 @@ static struct ctl_table kernel_io_uring_disabled_table[] = {
 };
 #endif
 
-struct sock *io_uring_get_socket(struct file *file)
-{
-#if defined(CONFIG_UNIX)
-	if (io_is_uring_fops(file)) {
-		struct io_ring_ctx *ctx = file->private_data;
-
-		return ctx->ring_sock->sk;
-	}
-#endif
-	return NULL;
-}
-EXPORT_SYMBOL(io_uring_get_socket);
-
 static inline void io_submit_flush_completions(struct io_ring_ctx *ctx)
 {
 	if (!wq_list_empty(&ctx->submit_state.compl_reqs) ||
diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h
index 1112c198e516..04e33f25919c 100644
--- a/io_uring/io_uring.h
+++ b/io_uring/io_uring.h
@@ -44,7 +44,6 @@ struct file *io_file_get_fixed(struct io_kiocb *req, int fd,
 			       unsigned issue_flags);
 
 void __io_req_task_work_add(struct io_kiocb *req, unsigned flags);
-bool io_is_uring_fops(struct file *file);
 bool io_alloc_async_data(struct io_kiocb *req);
 void io_req_task_queue(struct io_kiocb *req);
 void io_queue_iowq(struct io_kiocb *req, struct io_tw_state *ts_dont_use);
diff --git a/net/core/scm.c b/net/core/scm.c
index db3f7cd519c2..d0e0852a24d5 100644
--- a/net/core/scm.c
+++ b/net/core/scm.c
@@ -105,7 +105,7 @@ static int scm_fp_copy(struct cmsghdr *cmsg, struct scm_fp_list **fplp)
 		if (fd < 0 || !(file = fget_raw(fd)))
 			return -EBADF;
 		/* don't allow io_uring files */
-		if (io_uring_get_socket(file)) {
+		if (io_is_uring_fops(file)) {
 			fput(file);
 			return -EINVAL;
 		}
diff --git a/net/unix/scm.c b/net/unix/scm.c
index 6ff628f2349f..822ce0d0d791 100644
--- a/net/unix/scm.c
+++ b/net/unix/scm.c
@@ -35,10 +35,8 @@ struct sock *unix_get_socket(struct file *filp)
 		/* PF_UNIX ? */
 		if (s && ops && ops->family == PF_UNIX)
 			u_sock = s;
-	} else {
-		/* Could be an io_uring instance */
-		u_sock = io_uring_get_socket(filp);
 	}
+
 	return u_sock;
 }
 EXPORT_SYMBOL(unix_get_socket);
-- 
cgit v1.2.3


From 6e5e6d274956305f1fc0340522b38f5f5be74bdb Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Tue, 19 Dec 2023 12:36:34 -0700
Subject: io_uring: drop any code related to SCM_RIGHTS

This is dead code after we dropped support for passing io_uring fds
over SCM_RIGHTS, get rid of it.

Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/io_uring_types.h |   3 -
 io_uring/filetable.c           |  11 +--
 io_uring/io_uring.c            |  32 +-------
 io_uring/rsrc.c                | 169 +----------------------------------------
 io_uring/rsrc.h                |  15 ----
 5 files changed, 10 insertions(+), 220 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h
index bebab36abce8..fc8f2570b92b 100644
--- a/include/linux/io_uring_types.h
+++ b/include/linux/io_uring_types.h
@@ -389,9 +389,6 @@ struct io_ring_ctx {
 	struct wait_queue_head		rsrc_quiesce_wq;
 	unsigned			rsrc_quiesce;
 
-	#if defined(CONFIG_UNIX)
-		struct socket		*ring_sock;
-	#endif
 	/* hashed buffered write serialization */
 	struct io_wq_hash		*hash_map;
 
diff --git a/io_uring/filetable.c b/io_uring/filetable.c
index e7d749991de4..6e86e6188dbe 100644
--- a/io_uring/filetable.c
+++ b/io_uring/filetable.c
@@ -87,13 +87,10 @@ static int io_install_fixed_file(struct io_ring_ctx *ctx, struct file *file,
 		io_file_bitmap_clear(&ctx->file_table, slot_index);
 	}
 
-	ret = io_scm_file_account(ctx, file);
-	if (!ret) {
-		*io_get_tag_slot(ctx->file_data, slot_index) = 0;
-		io_fixed_file_set(file_slot, file);
-		io_file_bitmap_set(&ctx->file_table, slot_index);
-	}
-	return ret;
+	*io_get_tag_slot(ctx->file_data, slot_index) = 0;
+	io_fixed_file_set(file_slot, file);
+	io_file_bitmap_set(&ctx->file_table, slot_index);
+	return 0;
 }
 
 int __io_fixed_fd_install(struct io_ring_ctx *ctx, struct file *file,
diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index a9acccd45880..a9a519fa9926 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -60,7 +60,6 @@
 #include <linux/net.h>
 #include <net/sock.h>
 #include <net/af_unix.h>
-#include <net/scm.h>
 #include <linux/anon_inodes.h>
 #include <linux/sched/mm.h>
 #include <linux/uaccess.h>
@@ -2866,13 +2865,6 @@ static __cold void io_ring_ctx_free(struct io_ring_ctx *ctx)
 		io_rsrc_node_destroy(ctx, ctx->rsrc_node);
 
 	WARN_ON_ONCE(!list_empty(&ctx->rsrc_ref_list));
-
-#if defined(CONFIG_UNIX)
-	if (ctx->ring_sock) {
-		ctx->ring_sock->file = NULL; /* so that iput() is called */
-		sock_release(ctx->ring_sock);
-	}
-#endif
 	WARN_ON_ONCE(!list_empty(&ctx->ltimeout_list));
 
 	io_alloc_cache_free(&ctx->rsrc_node_cache, io_rsrc_node_cache_free);
@@ -3781,32 +3773,12 @@ static int io_uring_install_fd(struct file *file)
 /*
  * Allocate an anonymous fd, this is what constitutes the application
  * visible backing of an io_uring instance. The application mmaps this
- * fd to gain access to the SQ/CQ ring details. If UNIX sockets are enabled,
- * we have to tie this fd to a socket for file garbage collection purposes.
+ * fd to gain access to the SQ/CQ ring details.
  */
 static struct file *io_uring_get_file(struct io_ring_ctx *ctx)
 {
-	struct file *file;
-#if defined(CONFIG_UNIX)
-	int ret;
-
-	ret = sock_create_kern(&init_net, PF_UNIX, SOCK_RAW, IPPROTO_IP,
-				&ctx->ring_sock);
-	if (ret)
-		return ERR_PTR(ret);
-#endif
-
-	file = anon_inode_getfile_secure("[io_uring]", &io_uring_fops, ctx,
+	return anon_inode_getfile_secure("[io_uring]", &io_uring_fops, ctx,
 					 O_RDWR | O_CLOEXEC, NULL);
-#if defined(CONFIG_UNIX)
-	if (IS_ERR(file)) {
-		sock_release(ctx->ring_sock);
-		ctx->ring_sock = NULL;
-	} else {
-		ctx->ring_sock->file = file;
-	}
-#endif
-	return file;
 }
 
 static __cold int io_uring_create(unsigned entries, struct io_uring_params *p,
diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c
index f521c5965a93..4818b79231dd 100644
--- a/io_uring/rsrc.c
+++ b/io_uring/rsrc.c
@@ -24,7 +24,6 @@ struct io_rsrc_update {
 };
 
 static void io_rsrc_buf_put(struct io_ring_ctx *ctx, struct io_rsrc_put *prsrc);
-static void io_rsrc_file_put(struct io_ring_ctx *ctx, struct io_rsrc_put *prsrc);
 static int io_sqe_buffer_register(struct io_ring_ctx *ctx, struct iovec *iov,
 				  struct io_mapped_ubuf **pimu,
 				  struct page **last_hpage);
@@ -157,7 +156,7 @@ static void io_rsrc_put_work(struct io_rsrc_node *node)
 
 	switch (node->type) {
 	case IORING_RSRC_FILE:
-		io_rsrc_file_put(node->ctx, prsrc);
+		fput(prsrc->file);
 		break;
 	case IORING_RSRC_BUFFER:
 		io_rsrc_buf_put(node->ctx, prsrc);
@@ -402,23 +401,13 @@ static int __io_sqe_files_update(struct io_ring_ctx *ctx,
 				break;
 			}
 			/*
-			 * Don't allow io_uring instances to be registered. If
-			 * UNIX isn't enabled, then this causes a reference
-			 * cycle and this instance can never get freed. If UNIX
-			 * is enabled we'll handle it just fine, but there's
-			 * still no point in allowing a ring fd as it doesn't
-			 * support regular read/write anyway.
+			 * Don't allow io_uring instances to be registered.
 			 */
 			if (io_is_uring_fops(file)) {
 				fput(file);
 				err = -EBADF;
 				break;
 			}
-			err = io_scm_file_account(ctx, file);
-			if (err) {
-				fput(file);
-				break;
-			}
 			*io_get_tag_slot(data, i) = tag;
 			io_fixed_file_set(file_slot, file);
 			io_file_bitmap_set(&ctx->file_table, i);
@@ -675,22 +664,12 @@ void __io_sqe_files_unregister(struct io_ring_ctx *ctx)
 	for (i = 0; i < ctx->nr_user_files; i++) {
 		struct file *file = io_file_from_index(&ctx->file_table, i);
 
-		/* skip scm accounted files, they'll be freed by ->ring_sock */
-		if (!file || io_file_need_scm(file))
+		if (!file)
 			continue;
 		io_file_bitmap_clear(&ctx->file_table, i);
 		fput(file);
 	}
 
-#if defined(CONFIG_UNIX)
-	if (ctx->ring_sock) {
-		struct sock *sock = ctx->ring_sock->sk;
-		struct sk_buff *skb;
-
-		while ((skb = skb_dequeue(&sock->sk_receive_queue)) != NULL)
-			kfree_skb(skb);
-	}
-#endif
 	io_free_file_tables(&ctx->file_table);
 	io_file_table_set_alloc_range(ctx, 0, 0);
 	io_rsrc_data_free(ctx->file_data);
@@ -718,137 +697,6 @@ int io_sqe_files_unregister(struct io_ring_ctx *ctx)
 	return ret;
 }
 
-/*
- * Ensure the UNIX gc is aware of our file set, so we are certain that
- * the io_uring can be safely unregistered on process exit, even if we have
- * loops in the file referencing. We account only files that can hold other
- * files because otherwise they can't form a loop and so are not interesting
- * for GC.
- */
-int __io_scm_file_account(struct io_ring_ctx *ctx, struct file *file)
-{
-#if defined(CONFIG_UNIX)
-	struct sock *sk = ctx->ring_sock->sk;
-	struct sk_buff_head *head = &sk->sk_receive_queue;
-	struct scm_fp_list *fpl;
-	struct sk_buff *skb;
-
-	if (likely(!io_file_need_scm(file)))
-		return 0;
-
-	/*
-	 * See if we can merge this file into an existing skb SCM_RIGHTS
-	 * file set. If there's no room, fall back to allocating a new skb
-	 * and filling it in.
-	 */
-	spin_lock_irq(&head->lock);
-	skb = skb_peek(head);
-	if (skb && UNIXCB(skb).fp->count < SCM_MAX_FD)
-		__skb_unlink(skb, head);
-	else
-		skb = NULL;
-	spin_unlock_irq(&head->lock);
-
-	if (!skb) {
-		fpl = kzalloc(sizeof(*fpl), GFP_KERNEL);
-		if (!fpl)
-			return -ENOMEM;
-
-		skb = alloc_skb(0, GFP_KERNEL);
-		if (!skb) {
-			kfree(fpl);
-			return -ENOMEM;
-		}
-
-		fpl->user = get_uid(current_user());
-		fpl->max = SCM_MAX_FD;
-		fpl->count = 0;
-
-		UNIXCB(skb).fp = fpl;
-		skb->sk = sk;
-		skb->destructor = io_uring_destruct_scm;
-		refcount_add(skb->truesize, &sk->sk_wmem_alloc);
-	}
-
-	fpl = UNIXCB(skb).fp;
-	fpl->fp[fpl->count++] = get_file(file);
-	unix_inflight(fpl->user, file);
-	skb_queue_head(head, skb);
-	fput(file);
-#endif
-	return 0;
-}
-
-static __cold void io_rsrc_file_scm_put(struct io_ring_ctx *ctx, struct file *file)
-{
-#if defined(CONFIG_UNIX)
-	struct sock *sock = ctx->ring_sock->sk;
-	struct sk_buff_head list, *head = &sock->sk_receive_queue;
-	struct sk_buff *skb;
-	int i;
-
-	__skb_queue_head_init(&list);
-
-	/*
-	 * Find the skb that holds this file in its SCM_RIGHTS. When found,
-	 * remove this entry and rearrange the file array.
-	 */
-	skb = skb_dequeue(head);
-	while (skb) {
-		struct scm_fp_list *fp;
-
-		fp = UNIXCB(skb).fp;
-		for (i = 0; i < fp->count; i++) {
-			int left;
-
-			if (fp->fp[i] != file)
-				continue;
-
-			unix_notinflight(fp->user, fp->fp[i]);
-			left = fp->count - 1 - i;
-			if (left) {
-				memmove(&fp->fp[i], &fp->fp[i + 1],
-						left * sizeof(struct file *));
-			}
-			fp->count--;
-			if (!fp->count) {
-				kfree_skb(skb);
-				skb = NULL;
-			} else {
-				__skb_queue_tail(&list, skb);
-			}
-			fput(file);
-			file = NULL;
-			break;
-		}
-
-		if (!file)
-			break;
-
-		__skb_queue_tail(&list, skb);
-
-		skb = skb_dequeue(head);
-	}
-
-	if (skb_peek(&list)) {
-		spin_lock_irq(&head->lock);
-		while ((skb = __skb_dequeue(&list)) != NULL)
-			__skb_queue_tail(head, skb);
-		spin_unlock_irq(&head->lock);
-	}
-#endif
-}
-
-static void io_rsrc_file_put(struct io_ring_ctx *ctx, struct io_rsrc_put *prsrc)
-{
-	struct file *file = prsrc->file;
-
-	if (likely(!io_file_need_scm(file)))
-		fput(file);
-	else
-		io_rsrc_file_scm_put(ctx, file);
-}
-
 int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg,
 			  unsigned nr_args, u64 __user *tags)
 {
@@ -897,21 +745,12 @@ int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg,
 			goto fail;
 
 		/*
-		 * Don't allow io_uring instances to be registered. If UNIX
-		 * isn't enabled, then this causes a reference cycle and this
-		 * instance can never get freed. If UNIX is enabled we'll
-		 * handle it just fine, but there's still no point in allowing
-		 * a ring fd as it doesn't support regular read/write anyway.
+		 * Don't allow io_uring instances to be registered.
 		 */
 		if (io_is_uring_fops(file)) {
 			fput(file);
 			goto fail;
 		}
-		ret = io_scm_file_account(ctx, file);
-		if (ret) {
-			fput(file);
-			goto fail;
-		}
 		file_slot = io_fixed_file_slot(&ctx->file_table, i);
 		io_fixed_file_set(file_slot, file);
 		io_file_bitmap_set(&ctx->file_table, i);
diff --git a/io_uring/rsrc.h b/io_uring/rsrc.h
index 08ac0d8e07ef..7238b9cfe33b 100644
--- a/io_uring/rsrc.h
+++ b/io_uring/rsrc.h
@@ -75,21 +75,6 @@ int io_sqe_files_unregister(struct io_ring_ctx *ctx);
 int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg,
 			  unsigned nr_args, u64 __user *tags);
 
-int __io_scm_file_account(struct io_ring_ctx *ctx, struct file *file);
-
-static inline bool io_file_need_scm(struct file *filp)
-{
-	return false;
-}
-
-static inline int io_scm_file_account(struct io_ring_ctx *ctx,
-				      struct file *file)
-{
-	if (likely(!io_file_need_scm(file)))
-		return 0;
-	return __io_scm_file_account(ctx, file);
-}
-
 int io_register_files_update(struct io_ring_ctx *ctx, void __user *arg,
 			     unsigned nr_args);
 int io_register_rsrc_update(struct io_ring_ctx *ctx, void __user *arg,
-- 
cgit v1.2.3


From ee0cf5e07f44a10fce8f1bfa9db226c0b5ecf880 Mon Sep 17 00:00:00 2001
From: Théo Lebrun <theo.lebrun@bootlin.com>
Date: Mon, 18 Dec 2023 18:14:16 +0100
Subject: clk: fixed-rate: fix
 clk_hw_register_fixed_rate_with_accuracy_parent_hw
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Add missing comma and remove extraneous NULL argument. The macro is
currently used by no one which explains why the typo slipped by.

Fixes: 2d34f09e79c9 ("clk: fixed-rate: Add support for specifying parents via DT/pointers")
Signed-off-by: Théo Lebrun <theo.lebrun@bootlin.com>
Link: https://lore.kernel.org/r/20231218-mbly-clk-v1-1-44ce54108f06@bootlin.com
Signed-off-by: Stephen Boyd <sboyd@kernel.org>
---
 include/linux/clk-provider.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/clk-provider.h b/include/linux/clk-provider.h
index ace3a4ce2fc9..1293c38ddb7f 100644
--- a/include/linux/clk-provider.h
+++ b/include/linux/clk-provider.h
@@ -448,8 +448,8 @@ struct clk *clk_register_fixed_rate(struct device *dev, const char *name,
  */
 #define clk_hw_register_fixed_rate_with_accuracy_parent_hw(dev, name,	      \
 		parent_hw, flags, fixed_rate, fixed_accuracy)		      \
-	__clk_hw_register_fixed_rate((dev), NULL, (name), NULL, (parent_hw)   \
-				     NULL, NULL, (flags), (fixed_rate),	      \
+	__clk_hw_register_fixed_rate((dev), NULL, (name), NULL, (parent_hw),  \
+				     NULL, (flags), (fixed_rate),	      \
 				     (fixed_accuracy), 0, false)
 /**
  * clk_hw_register_fixed_rate_with_accuracy_parent_data - register fixed-rate
-- 
cgit v1.2.3


From 4ba1d0f23414135e4f426dae4cb5cdc2ce246f89 Mon Sep 17 00:00:00 2001
From: Andrii Nakryiko <andrii@kernel.org>
Date: Thu, 14 Dec 2023 17:13:25 -0800
Subject: bpf: abstract away global subprog arg preparation logic from reg
 state setup

btf_prepare_func_args() is used to understand expectations and
restrictions on global subprog arguments. But current implementation is
hard to extend, as it intermixes BTF-based func prototype parsing and
interpretation logic with setting up register state at subprog entry.

Worse still, those registers are not completely set up inside
btf_prepare_func_args(), requiring some more logic later in
do_check_common(). Like calling mark_reg_unknown() and similar
initialization operations.

This intermixing of BTF interpretation and register state setup is
problematic. First, it causes duplication of BTF parsing logic for global
subprog verification (to set up initial state of global subprog) and
global subprog call sites analysis (when we need to check that whatever
is being passed into global subprog matches expectations), performed in
btf_check_subprog_call().

Given we want to extend global func argument with tags later, this
duplication is problematic. So refactor btf_prepare_func_args() to do
only BTF-based func proto and args parsing, returning high-level
argument "expectations" only, with no regard to specifics of register
state. I.e., if it's a context argument, instead of setting register
state to PTR_TO_CTX, we return ARG_PTR_TO_CTX enum for that argument as
"an argument specification" for further processing inside
do_check_common(). Similarly for SCALAR arguments, PTR_TO_MEM, etc.

This allows to reuse btf_prepare_func_args() in following patches at
global subprog call site analysis time. It also keeps register setup
code consistently in one place, do_check_common().

Besides all this, we cache this argument specs information inside
env->subprog_info, eliminating the need to redo these potentially
expensive BTF traversals, especially if BPF program's BTF is big and/or
there are lots of global subprog calls.

Acked-by: Eduard Zingerman <eddyz87@gmail.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/r/20231215011334.2307144-2-andrii@kernel.org
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/bpf.h          |  3 +--
 include/linux/bpf_verifier.h | 16 ++++++++++++++++
 kernel/bpf/btf.c             | 38 ++++++++++++++++++++------------------
 kernel/bpf/verifier.c        | 43 +++++++++++++++++++++++++++----------------
 4 files changed, 64 insertions(+), 36 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 7a8d4c81a39a..c050c82cc9a5 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -2470,8 +2470,7 @@ int btf_check_subprog_arg_match(struct bpf_verifier_env *env, int subprog,
 				struct bpf_reg_state *regs);
 int btf_check_subprog_call(struct bpf_verifier_env *env, int subprog,
 			   struct bpf_reg_state *regs);
-int btf_prepare_func_args(struct bpf_verifier_env *env, int subprog,
-			  struct bpf_reg_state *reg, u32 *nargs);
+int btf_prepare_func_args(struct bpf_verifier_env *env, int subprog);
 int btf_check_type_match(struct bpf_verifier_log *log, const struct bpf_prog *prog,
 			 struct btf *btf, const struct btf_type *t);
 const char *btf_find_decl_tag_value(const struct btf *btf, const struct btf_type *pt,
diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h
index c2819a6579a5..5742e9c0a7b8 100644
--- a/include/linux/bpf_verifier.h
+++ b/include/linux/bpf_verifier.h
@@ -606,6 +606,13 @@ static inline bool bpf_verifier_log_needed(const struct bpf_verifier_log *log)
 
 #define BPF_MAX_SUBPROGS 256
 
+struct bpf_subprog_arg_info {
+	enum bpf_arg_type arg_type;
+	union {
+		u32 mem_size;
+	};
+};
+
 struct bpf_subprog_info {
 	/* 'start' has to be the first field otherwise find_subprog() won't work */
 	u32 start; /* insn idx of function entry point */
@@ -617,6 +624,10 @@ struct bpf_subprog_info {
 	bool is_cb: 1;
 	bool is_async_cb: 1;
 	bool is_exception_cb: 1;
+	bool args_cached: 1;
+
+	u8 arg_cnt;
+	struct bpf_subprog_arg_info args[MAX_BPF_FUNC_REG_ARGS];
 };
 
 struct bpf_verifier_env;
@@ -727,6 +738,11 @@ struct bpf_verifier_env {
 	char tmp_str_buf[TMP_STR_BUF_LEN];
 };
 
+static inline struct bpf_subprog_info *subprog_info(struct bpf_verifier_env *env, int subprog)
+{
+	return &env->subprog_info[subprog];
+}
+
 __printf(2, 0) void bpf_verifier_vlog(struct bpf_verifier_log *log,
 				      const char *fmt, va_list args);
 __printf(2, 3) void bpf_verifier_log_write(struct bpf_verifier_env *env,
diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
index d56433bf8aba..be2104e5f2f5 100644
--- a/kernel/bpf/btf.c
+++ b/kernel/bpf/btf.c
@@ -6948,16 +6948,17 @@ int btf_check_subprog_call(struct bpf_verifier_env *env, int subprog,
 	return err;
 }
 
-/* Convert BTF of a function into bpf_reg_state if possible
+/* Process BTF of a function to produce high-level expectation of function
+ * arguments (like ARG_PTR_TO_CTX, or ARG_PTR_TO_MEM, etc). This information
+ * is cached in subprog info for reuse.
  * Returns:
  * EFAULT - there is a verifier bug. Abort verification.
  * EINVAL - cannot convert BTF.
- * 0 - Successfully converted BTF into bpf_reg_state
- * (either PTR_TO_CTX or SCALAR_VALUE).
+ * 0 - Successfully processed BTF and constructed argument expectations.
  */
-int btf_prepare_func_args(struct bpf_verifier_env *env, int subprog,
-			  struct bpf_reg_state *regs, u32 *arg_cnt)
+int btf_prepare_func_args(struct bpf_verifier_env *env, int subprog)
 {
+	struct bpf_subprog_info *sub = subprog_info(env, subprog);
 	struct bpf_verifier_log *log = &env->log;
 	struct bpf_prog *prog = env->prog;
 	enum bpf_prog_type prog_type = prog->type;
@@ -6967,6 +6968,9 @@ int btf_prepare_func_args(struct bpf_verifier_env *env, int subprog,
 	u32 i, nargs, btf_id;
 	const char *tname;
 
+	if (sub->args_cached)
+		return 0;
+
 	if (!prog->aux->func_info ||
 	    prog->aux->func_info_aux[subprog].linkage != BTF_FUNC_GLOBAL) {
 		bpf_log(log, "Verifier bug\n");
@@ -6990,10 +6994,6 @@ int btf_prepare_func_args(struct bpf_verifier_env *env, int subprog,
 	}
 	tname = btf_name_by_offset(btf, t->name_off);
 
-	if (log->level & BPF_LOG_LEVEL)
-		bpf_log(log, "Validating %s() func#%d...\n",
-			tname, subprog);
-
 	if (prog->aux->func_info_aux[subprog].unreliable) {
 		bpf_log(log, "Verifier bug in function %s()\n", tname);
 		return -EFAULT;
@@ -7013,7 +7013,6 @@ int btf_prepare_func_args(struct bpf_verifier_env *env, int subprog,
 			tname, nargs, MAX_BPF_FUNC_REG_ARGS);
 		return -EINVAL;
 	}
-	*arg_cnt = nargs;
 	/* check that function returns int, exception cb also requires this */
 	t = btf_type_by_id(btf, t->type);
 	while (btf_type_is_modifier(t))
@@ -7028,24 +7027,24 @@ int btf_prepare_func_args(struct bpf_verifier_env *env, int subprog,
 	 * Only PTR_TO_CTX and SCALAR are supported atm.
 	 */
 	for (i = 0; i < nargs; i++) {
-		struct bpf_reg_state *reg = &regs[i + 1];
-
 		t = btf_type_by_id(btf, args[i].type);
 		while (btf_type_is_modifier(t))
 			t = btf_type_by_id(btf, t->type);
 		if (btf_type_is_int(t) || btf_is_any_enum(t)) {
-			reg->type = SCALAR_VALUE;
+			sub->args[i].arg_type = ARG_ANYTHING;
 			continue;
 		}
 		if (btf_type_is_ptr(t)) {
+			u32 mem_size;
+
 			if (btf_get_prog_ctx_type(log, btf, t, prog_type, i)) {
-				reg->type = PTR_TO_CTX;
+				sub->args[i].arg_type = ARG_PTR_TO_CTX;
 				continue;
 			}
 
 			t = btf_type_skip_modifiers(btf, t->type, NULL);
 
-			ref_t = btf_resolve_size(btf, t, &reg->mem_size);
+			ref_t = btf_resolve_size(btf, t, &mem_size);
 			if (IS_ERR(ref_t)) {
 				bpf_log(log,
 				    "arg#%d reference type('%s %s') size cannot be determined: %ld\n",
@@ -7054,15 +7053,18 @@ int btf_prepare_func_args(struct bpf_verifier_env *env, int subprog,
 				return -EINVAL;
 			}
 
-			reg->type = PTR_TO_MEM | PTR_MAYBE_NULL;
-			reg->id = ++env->id_gen;
-
+			sub->args[i].arg_type = ARG_PTR_TO_MEM_OR_NULL;
+			sub->args[i].mem_size = mem_size;
 			continue;
 		}
 		bpf_log(log, "Arg#%d type %s in %s() is not supported yet.\n",
 			i, btf_type_str(t), tname);
 		return -EINVAL;
 	}
+
+	sub->arg_cnt = nargs;
+	sub->args_cached = true;
+
 	return 0;
 }
 
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index df1cae459c77..6555785b9f63 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -442,11 +442,6 @@ static struct bpf_func_info_aux *subprog_aux(const struct bpf_verifier_env *env,
 	return &env->prog->aux->func_info_aux[subprog];
 }
 
-static struct bpf_subprog_info *subprog_info(struct bpf_verifier_env *env, int subprog)
-{
-	return &env->subprog_info[subprog];
-}
-
 static void mark_subprog_exc_cb(struct bpf_verifier_env *env, int subprog)
 {
 	struct bpf_subprog_info *info = subprog_info(env, subprog);
@@ -19937,34 +19932,50 @@ static int do_check_common(struct bpf_verifier_env *env, int subprog)
 
 	regs = state->frame[state->curframe]->regs;
 	if (subprog || env->prog->type == BPF_PROG_TYPE_EXT) {
-		u32 nargs;
+		struct bpf_subprog_info *sub = subprog_info(env, subprog);
+		const char *sub_name = subprog_name(env, subprog);
+		struct bpf_subprog_arg_info *arg;
+		struct bpf_reg_state *reg;
 
-		ret = btf_prepare_func_args(env, subprog, regs, &nargs);
+		verbose(env, "Validating %s() func#%d...\n", sub_name, subprog);
+		ret = btf_prepare_func_args(env, subprog);
 		if (ret)
 			goto out;
+
 		if (subprog_is_exc_cb(env, subprog)) {
 			state->frame[0]->in_exception_callback_fn = true;
 			/* We have already ensured that the callback returns an integer, just
 			 * like all global subprogs. We need to determine it only has a single
 			 * scalar argument.
 			 */
-			if (nargs != 1 || regs[BPF_REG_1].type != SCALAR_VALUE) {
+			if (sub->arg_cnt != 1 || sub->args[0].arg_type != ARG_ANYTHING) {
 				verbose(env, "exception cb only supports single integer argument\n");
 				ret = -EINVAL;
 				goto out;
 			}
 		}
-		for (i = BPF_REG_1; i <= BPF_REG_5; i++) {
-			if (regs[i].type == PTR_TO_CTX)
+		for (i = BPF_REG_1; i <= sub->arg_cnt; i++) {
+			arg = &sub->args[i - BPF_REG_1];
+			reg = &regs[i];
+
+			if (arg->arg_type == ARG_PTR_TO_CTX) {
+				reg->type = PTR_TO_CTX;
 				mark_reg_known_zero(env, regs, i);
-			else if (regs[i].type == SCALAR_VALUE)
+			} else if (arg->arg_type == ARG_ANYTHING) {
+				reg->type = SCALAR_VALUE;
 				mark_reg_unknown(env, regs, i);
-			else if (base_type(regs[i].type) == PTR_TO_MEM) {
-				const u32 mem_size = regs[i].mem_size;
-
+			} else if (base_type(arg->arg_type) == ARG_PTR_TO_MEM) {
+				reg->type = PTR_TO_MEM;
+				if (arg->arg_type & PTR_MAYBE_NULL)
+					reg->type |= PTR_MAYBE_NULL;
 				mark_reg_known_zero(env, regs, i);
-				regs[i].mem_size = mem_size;
-				regs[i].id = ++env->id_gen;
+				reg->mem_size = arg->mem_size;
+				reg->id = ++env->id_gen;
+			} else {
+				WARN_ONCE(1, "BUG: unhandled arg#%d type %d\n",
+					  i - BPF_REG_1, arg->arg_type);
+				ret = -EFAULT;
+				goto out;
 			}
 		}
 	} else {
-- 
cgit v1.2.3


From 5eccd2db42d77e3570619c32d39e39bf486607cf Mon Sep 17 00:00:00 2001
From: Andrii Nakryiko <andrii@kernel.org>
Date: Thu, 14 Dec 2023 17:13:26 -0800
Subject: bpf: reuse btf_prepare_func_args() check for main program BTF
 validation

Instead of btf_check_subprog_arg_match(), use btf_prepare_func_args()
logic to validate "trustworthiness" of main BPF program's BTF information,
if it is present.

We ignored results of original BTF check anyway, often times producing
confusing and ominously-sounding "reg type unsupported for arg#0
function" message, which has no apparent effect on program correctness
and verification process.

All the -EFAULT returning sanity checks are already performed in
check_btf_info_early(), so there is zero reason to have this duplication
of logic between btf_check_subprog_call() and btf_check_subprog_arg_match().
Dropping btf_check_subprog_arg_match() simplifies
btf_check_func_arg_match() further removing `bool processing_call` flag.

One subtle bit that was done by btf_check_subprog_arg_match() was
potentially marking main program's BTF as unreliable. We do this
explicitly now with a dedicated simple check, preserving the original
behavior, but now based on well factored btf_prepare_func_args() logic.

Acked-by: Eduard Zingerman <eddyz87@gmail.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/r/20231215011334.2307144-3-andrii@kernel.org
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/bpf.h                                |  2 -
 kernel/bpf/btf.c                                   | 50 ++--------------------
 kernel/bpf/verifier.c                              | 25 ++++++-----
 tools/testing/selftests/bpf/prog_tests/log_fixup.c |  4 +-
 .../selftests/bpf/progs/cgrp_kfunc_failure.c       |  2 +-
 .../selftests/bpf/progs/task_kfunc_failure.c       |  2 +-
 6 files changed, 19 insertions(+), 66 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index c050c82cc9a5..d0d7eff22b8a 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -2466,8 +2466,6 @@ int btf_distill_func_proto(struct bpf_verifier_log *log,
 			   struct btf_func_model *m);
 
 struct bpf_reg_state;
-int btf_check_subprog_arg_match(struct bpf_verifier_env *env, int subprog,
-				struct bpf_reg_state *regs);
 int btf_check_subprog_call(struct bpf_verifier_env *env, int subprog,
 			   struct bpf_reg_state *regs);
 int btf_prepare_func_args(struct bpf_verifier_env *env, int subprog);
diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
index be2104e5f2f5..33d9a1c73f6e 100644
--- a/kernel/bpf/btf.c
+++ b/kernel/bpf/btf.c
@@ -6768,8 +6768,7 @@ int btf_check_type_match(struct bpf_verifier_log *log, const struct bpf_prog *pr
 static int btf_check_func_arg_match(struct bpf_verifier_env *env,
 				    const struct btf *btf, u32 func_id,
 				    struct bpf_reg_state *regs,
-				    bool ptr_to_mem_ok,
-				    bool processing_call)
+				    bool ptr_to_mem_ok)
 {
 	enum bpf_prog_type prog_type = resolve_prog_type(env->prog);
 	struct bpf_verifier_log *log = &env->log;
@@ -6842,7 +6841,7 @@ static int btf_check_func_arg_match(struct bpf_verifier_env *env,
 					i, btf_type_str(t));
 				return -EINVAL;
 			}
-		} else if (ptr_to_mem_ok && processing_call) {
+		} else if (ptr_to_mem_ok) {
 			const struct btf_type *resolve_ret;
 			u32 type_size;
 
@@ -6867,55 +6866,12 @@ static int btf_check_func_arg_match(struct bpf_verifier_env *env,
 	return 0;
 }
 
-/* Compare BTF of a function declaration with given bpf_reg_state.
- * Returns:
- * EFAULT - there is a verifier bug. Abort verification.
- * EINVAL - there is a type mismatch or BTF is not available.
- * 0 - BTF matches with what bpf_reg_state expects.
- * Only PTR_TO_CTX and SCALAR_VALUE states are recognized.
- */
-int btf_check_subprog_arg_match(struct bpf_verifier_env *env, int subprog,
-				struct bpf_reg_state *regs)
-{
-	struct bpf_prog *prog = env->prog;
-	struct btf *btf = prog->aux->btf;
-	bool is_global;
-	u32 btf_id;
-	int err;
-
-	if (!prog->aux->func_info)
-		return -EINVAL;
-
-	btf_id = prog->aux->func_info[subprog].type_id;
-	if (!btf_id)
-		return -EFAULT;
-
-	if (prog->aux->func_info_aux[subprog].unreliable)
-		return -EINVAL;
-
-	is_global = prog->aux->func_info_aux[subprog].linkage == BTF_FUNC_GLOBAL;
-	err = btf_check_func_arg_match(env, btf, btf_id, regs, is_global, false);
-
-	/* Compiler optimizations can remove arguments from static functions
-	 * or mismatched type can be passed into a global function.
-	 * In such cases mark the function as unreliable from BTF point of view.
-	 */
-	if (err)
-		prog->aux->func_info_aux[subprog].unreliable = true;
-	return err;
-}
-
 /* Compare BTF of a function call with given bpf_reg_state.
  * Returns:
  * EFAULT - there is a verifier bug. Abort verification.
  * EINVAL - there is a type mismatch or BTF is not available.
  * 0 - BTF matches with what bpf_reg_state expects.
  * Only PTR_TO_CTX and SCALAR_VALUE states are recognized.
- *
- * NOTE: the code is duplicated from btf_check_subprog_arg_match()
- * because btf_check_func_arg_match() is still doing both. Once that
- * function is split in 2, we can call from here btf_check_subprog_arg_match()
- * first, and then treat the calling part in a new code path.
  */
 int btf_check_subprog_call(struct bpf_verifier_env *env, int subprog,
 			   struct bpf_reg_state *regs)
@@ -6937,7 +6893,7 @@ int btf_check_subprog_call(struct bpf_verifier_env *env, int subprog,
 		return -EINVAL;
 
 	is_global = prog->aux->func_info_aux[subprog].linkage == BTF_FUNC_GLOBAL;
-	err = btf_check_func_arg_match(env, btf, btf_id, regs, is_global, true);
+	err = btf_check_func_arg_match(env, btf, btf_id, regs, is_global);
 
 	/* Compiler optimizations can remove arguments from static functions
 	 * or mismatched type can be passed into a global function.
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 6555785b9f63..c26e9ab5226c 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -19904,6 +19904,7 @@ static void free_states(struct bpf_verifier_env *env)
 static int do_check_common(struct bpf_verifier_env *env, int subprog)
 {
 	bool pop_log = !(env->log.level & BPF_LOG_LEVEL2);
+	struct bpf_subprog_info *sub = subprog_info(env, subprog);
 	struct bpf_verifier_state *state;
 	struct bpf_reg_state *regs;
 	int ret, i;
@@ -19930,9 +19931,9 @@ static int do_check_common(struct bpf_verifier_env *env, int subprog)
 	state->first_insn_idx = env->subprog_info[subprog].start;
 	state->last_insn_idx = -1;
 
+
 	regs = state->frame[state->curframe]->regs;
 	if (subprog || env->prog->type == BPF_PROG_TYPE_EXT) {
-		struct bpf_subprog_info *sub = subprog_info(env, subprog);
 		const char *sub_name = subprog_name(env, subprog);
 		struct bpf_subprog_arg_info *arg;
 		struct bpf_reg_state *reg;
@@ -19979,21 +19980,19 @@ static int do_check_common(struct bpf_verifier_env *env, int subprog)
 			}
 		}
 	} else {
+		/* if main BPF program has associated BTF info, validate that
+		 * it's matching expected signature, and otherwise mark BTF
+		 * info for main program as unreliable
+		 */
+		if (env->prog->aux->func_info_aux) {
+			ret = btf_prepare_func_args(env, 0);
+			if (ret || sub->arg_cnt != 1 || sub->args[0].arg_type != ARG_PTR_TO_CTX)
+				env->prog->aux->func_info_aux[0].unreliable = true;
+		}
+
 		/* 1st arg to a function */
 		regs[BPF_REG_1].type = PTR_TO_CTX;
 		mark_reg_known_zero(env, regs, BPF_REG_1);
-		ret = btf_check_subprog_arg_match(env, subprog, regs);
-		if (ret == -EFAULT)
-			/* unlikely verifier bug. abort.
-			 * ret == 0 and ret < 0 are sadly acceptable for
-			 * main() function due to backward compatibility.
-			 * Like socket filter program may be written as:
-			 * int bpf_prog(struct pt_regs *ctx)
-			 * and never dereference that ctx in the program.
-			 * 'struct pt_regs' is a type mismatch for socket
-			 * filter that should be using 'struct __sk_buff'.
-			 */
-			goto out;
 	}
 
 	ret = do_check(env);
diff --git a/tools/testing/selftests/bpf/prog_tests/log_fixup.c b/tools/testing/selftests/bpf/prog_tests/log_fixup.c
index effd78b2a657..7a3fa2ff567b 100644
--- a/tools/testing/selftests/bpf/prog_tests/log_fixup.c
+++ b/tools/testing/selftests/bpf/prog_tests/log_fixup.c
@@ -169,9 +169,9 @@ void test_log_fixup(void)
 	if (test__start_subtest("bad_core_relo_trunc_none"))
 		bad_core_relo(0, TRUNC_NONE /* full buf */);
 	if (test__start_subtest("bad_core_relo_trunc_partial"))
-		bad_core_relo(300, TRUNC_PARTIAL /* truncate original log a bit */);
+		bad_core_relo(280, TRUNC_PARTIAL /* truncate original log a bit */);
 	if (test__start_subtest("bad_core_relo_trunc_full"))
-		bad_core_relo(210, TRUNC_FULL  /* truncate also libbpf's message patch */);
+		bad_core_relo(220, TRUNC_FULL  /* truncate also libbpf's message patch */);
 	if (test__start_subtest("bad_core_relo_subprog"))
 		bad_core_relo_subprog();
 	if (test__start_subtest("missing_map"))
diff --git a/tools/testing/selftests/bpf/progs/cgrp_kfunc_failure.c b/tools/testing/selftests/bpf/progs/cgrp_kfunc_failure.c
index 0fa564a5cc5b..9fe9c4a4e8f6 100644
--- a/tools/testing/selftests/bpf/progs/cgrp_kfunc_failure.c
+++ b/tools/testing/selftests/bpf/progs/cgrp_kfunc_failure.c
@@ -78,7 +78,7 @@ int BPF_PROG(cgrp_kfunc_acquire_fp, struct cgroup *cgrp, const char *path)
 }
 
 SEC("kretprobe/cgroup_destroy_locked")
-__failure __msg("reg type unsupported for arg#0 function")
+__failure __msg("calling kernel function bpf_cgroup_acquire is not allowed")
 int BPF_PROG(cgrp_kfunc_acquire_unsafe_kretprobe, struct cgroup *cgrp)
 {
 	struct cgroup *acquired;
diff --git a/tools/testing/selftests/bpf/progs/task_kfunc_failure.c b/tools/testing/selftests/bpf/progs/task_kfunc_failure.c
index dcdea3127086..ad88a3796ddf 100644
--- a/tools/testing/selftests/bpf/progs/task_kfunc_failure.c
+++ b/tools/testing/selftests/bpf/progs/task_kfunc_failure.c
@@ -248,7 +248,7 @@ int BPF_PROG(task_kfunc_from_pid_no_null_check, struct task_struct *task, u64 cl
 }
 
 SEC("lsm/task_free")
-__failure __msg("reg type unsupported for arg#0 function")
+__failure __msg("R1 must be a rcu pointer")
 int BPF_PROG(task_kfunc_from_lsm_task_free, struct task_struct *task)
 {
 	struct task_struct *acquired;
-- 
cgit v1.2.3


From e26080d0da87f20222ca6712b65f95a856fadee0 Mon Sep 17 00:00:00 2001
From: Andrii Nakryiko <andrii@kernel.org>
Date: Thu, 14 Dec 2023 17:13:27 -0800
Subject: bpf: prepare btf_prepare_func_args() for handling static subprogs

Generalize btf_prepare_func_args() to support both global and static
subprogs. We are going to utilize this property in the next patch,
reusing btf_prepare_func_args() for subprog call logic instead of
reparsing BTF information in a completely separate implementation.

btf_prepare_func_args() now detects whether subprog is global or static
makes slight logic adjustments for static func cases, like not failing
fatally (-EFAULT) for conditions that are allowable for static subprogs.

Somewhat subtle (but major!) difference is the handling of pointer arguments.
Both global and static functions need to handle special context
arguments (which are pointers to predefined type names), but static
subprogs give up on any other pointers, falling back to marking subprog
as "unreliable", disabling the use of BTF type information altogether.

For global functions, though, we are assuming that such pointers to
unrecognized types are just pointers to fixed-sized memory region (or
error out if size cannot be established, like for `void *` pointers).

This patch accommodates these small differences and sets up a stage for
refactoring in the next patch, eliminating a separate BTF-based parsing
logic in btf_check_func_arg_match().

Acked-by: Eduard Zingerman <eddyz87@gmail.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/r/20231215011334.2307144-4-andrii@kernel.org
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/bpf_verifier.h |  5 +++++
 kernel/bpf/btf.c             | 18 +++++++++---------
 kernel/bpf/verifier.c        |  5 -----
 3 files changed, 14 insertions(+), 14 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h
index 5742e9c0a7b8..d3ea9ef04767 100644
--- a/include/linux/bpf_verifier.h
+++ b/include/linux/bpf_verifier.h
@@ -738,6 +738,11 @@ struct bpf_verifier_env {
 	char tmp_str_buf[TMP_STR_BUF_LEN];
 };
 
+static inline struct bpf_func_info_aux *subprog_aux(struct bpf_verifier_env *env, int subprog)
+{
+	return &env->prog->aux->func_info_aux[subprog];
+}
+
 static inline struct bpf_subprog_info *subprog_info(struct bpf_verifier_env *env, int subprog)
 {
 	return &env->subprog_info[subprog];
diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
index 33d9a1c73f6e..d321340e16f1 100644
--- a/kernel/bpf/btf.c
+++ b/kernel/bpf/btf.c
@@ -6914,6 +6914,7 @@ int btf_check_subprog_call(struct bpf_verifier_env *env, int subprog,
  */
 int btf_prepare_func_args(struct bpf_verifier_env *env, int subprog)
 {
+	bool is_global = subprog_aux(env, subprog)->linkage == BTF_FUNC_GLOBAL;
 	struct bpf_subprog_info *sub = subprog_info(env, subprog);
 	struct bpf_verifier_log *log = &env->log;
 	struct bpf_prog *prog = env->prog;
@@ -6927,14 +6928,15 @@ int btf_prepare_func_args(struct bpf_verifier_env *env, int subprog)
 	if (sub->args_cached)
 		return 0;
 
-	if (!prog->aux->func_info ||
-	    prog->aux->func_info_aux[subprog].linkage != BTF_FUNC_GLOBAL) {
+	if (!prog->aux->func_info) {
 		bpf_log(log, "Verifier bug\n");
 		return -EFAULT;
 	}
 
 	btf_id = prog->aux->func_info[subprog].type_id;
 	if (!btf_id) {
+		if (!is_global) /* not fatal for static funcs */
+			return -EINVAL;
 		bpf_log(log, "Global functions need valid BTF\n");
 		return -EFAULT;
 	}
@@ -6990,16 +6992,14 @@ int btf_prepare_func_args(struct bpf_verifier_env *env, int subprog)
 			sub->args[i].arg_type = ARG_ANYTHING;
 			continue;
 		}
-		if (btf_type_is_ptr(t)) {
+		if (btf_type_is_ptr(t) && btf_get_prog_ctx_type(log, btf, t, prog_type, i)) {
+			sub->args[i].arg_type = ARG_PTR_TO_CTX;
+			continue;
+		}
+		if (is_global && btf_type_is_ptr(t)) {
 			u32 mem_size;
 
-			if (btf_get_prog_ctx_type(log, btf, t, prog_type, i)) {
-				sub->args[i].arg_type = ARG_PTR_TO_CTX;
-				continue;
-			}
-
 			t = btf_type_skip_modifiers(btf, t->type, NULL);
-
 			ref_t = btf_resolve_size(btf, t, &mem_size);
 			if (IS_ERR(ref_t)) {
 				bpf_log(log,
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index c26e9ab5226c..6c9ecb86a8d9 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -437,11 +437,6 @@ static const char *subprog_name(const struct bpf_verifier_env *env, int subprog)
 	return btf_type_name(env->prog->aux->btf, info->type_id);
 }
 
-static struct bpf_func_info_aux *subprog_aux(const struct bpf_verifier_env *env, int subprog)
-{
-	return &env->prog->aux->func_info_aux[subprog];
-}
-
 static void mark_subprog_exc_cb(struct bpf_verifier_env *env, int subprog)
 {
 	struct bpf_subprog_info *info = subprog_info(env, subprog);
-- 
cgit v1.2.3


From c5a7244759b1eeacc59d0426fb73859afa942d0d Mon Sep 17 00:00:00 2001
From: Andrii Nakryiko <andrii@kernel.org>
Date: Thu, 14 Dec 2023 17:13:28 -0800
Subject: bpf: move subprog call logic back to verifier.c

Subprog call logic in btf_check_subprog_call() currently has both a lot
of BTF parsing logic (which is, presumably, what justified putting it
into btf.c), but also a bunch of register state checks, some of each
utilize deep verifier logic helpers, necessarily exported from
verifier.c: check_ptr_off_reg(), check_func_arg_reg_off(),
and check_mem_reg().

Going forward, btf_check_subprog_call() will have a minimum of
BTF-related logic, but will get more internal verifier logic related to
register state manipulation. So move it into verifier.c to minimize
amount of verifier-specific logic exposed to btf.c.

We do this move before refactoring btf_check_func_arg_match() to
preserve as much history post-refactoring as possible.

No functional changes.

Acked-by: Eduard Zingerman <eddyz87@gmail.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/r/20231215011334.2307144-5-andrii@kernel.org
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/bpf.h          |   2 -
 include/linux/bpf_verifier.h |   8 ---
 kernel/bpf/btf.c             | 139 ---------------------------------------
 kernel/bpf/verifier.c        | 153 +++++++++++++++++++++++++++++++++++++++++--
 4 files changed, 146 insertions(+), 156 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index d0d7eff22b8a..7671530d6e4e 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -2466,8 +2466,6 @@ int btf_distill_func_proto(struct bpf_verifier_log *log,
 			   struct btf_func_model *m);
 
 struct bpf_reg_state;
-int btf_check_subprog_call(struct bpf_verifier_env *env, int subprog,
-			   struct bpf_reg_state *regs);
 int btf_prepare_func_args(struct bpf_verifier_env *env, int subprog);
 int btf_check_type_match(struct bpf_verifier_log *log, const struct bpf_prog *prog,
 			 struct btf *btf, const struct btf_type *t);
diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h
index d3ea9ef04767..d07d857ca67f 100644
--- a/include/linux/bpf_verifier.h
+++ b/include/linux/bpf_verifier.h
@@ -785,14 +785,6 @@ bpf_prog_offload_replace_insn(struct bpf_verifier_env *env, u32 off,
 void
 bpf_prog_offload_remove_insns(struct bpf_verifier_env *env, u32 off, u32 cnt);
 
-int check_ptr_off_reg(struct bpf_verifier_env *env,
-		      const struct bpf_reg_state *reg, int regno);
-int check_func_arg_reg_off(struct bpf_verifier_env *env,
-			   const struct bpf_reg_state *reg, int regno,
-			   enum bpf_arg_type arg_type);
-int check_mem_reg(struct bpf_verifier_env *env, struct bpf_reg_state *reg,
-		   u32 regno, u32 mem_size);
-
 /* this lives here instead of in bpf.h because it needs to dereference tgt_prog */
 static inline u64 bpf_trampoline_compute_key(const struct bpf_prog *tgt_prog,
 					     struct btf *btf, u32 btf_id)
diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
index d321340e16f1..341811bcca53 100644
--- a/kernel/bpf/btf.c
+++ b/kernel/bpf/btf.c
@@ -6765,145 +6765,6 @@ int btf_check_type_match(struct bpf_verifier_log *log, const struct bpf_prog *pr
 	return btf_check_func_type_match(log, btf1, t1, btf2, t2);
 }
 
-static int btf_check_func_arg_match(struct bpf_verifier_env *env,
-				    const struct btf *btf, u32 func_id,
-				    struct bpf_reg_state *regs,
-				    bool ptr_to_mem_ok)
-{
-	enum bpf_prog_type prog_type = resolve_prog_type(env->prog);
-	struct bpf_verifier_log *log = &env->log;
-	const char *func_name, *ref_tname;
-	const struct btf_type *t, *ref_t;
-	const struct btf_param *args;
-	u32 i, nargs, ref_id;
-	int ret;
-
-	t = btf_type_by_id(btf, func_id);
-	if (!t || !btf_type_is_func(t)) {
-		/* These checks were already done by the verifier while loading
-		 * struct bpf_func_info or in add_kfunc_call().
-		 */
-		bpf_log(log, "BTF of func_id %u doesn't point to KIND_FUNC\n",
-			func_id);
-		return -EFAULT;
-	}
-	func_name = btf_name_by_offset(btf, t->name_off);
-
-	t = btf_type_by_id(btf, t->type);
-	if (!t || !btf_type_is_func_proto(t)) {
-		bpf_log(log, "Invalid BTF of func %s\n", func_name);
-		return -EFAULT;
-	}
-	args = (const struct btf_param *)(t + 1);
-	nargs = btf_type_vlen(t);
-	if (nargs > MAX_BPF_FUNC_REG_ARGS) {
-		bpf_log(log, "Function %s has %d > %d args\n", func_name, nargs,
-			MAX_BPF_FUNC_REG_ARGS);
-		return -EINVAL;
-	}
-
-	/* check that BTF function arguments match actual types that the
-	 * verifier sees.
-	 */
-	for (i = 0; i < nargs; i++) {
-		enum bpf_arg_type arg_type = ARG_DONTCARE;
-		u32 regno = i + 1;
-		struct bpf_reg_state *reg = &regs[regno];
-
-		t = btf_type_skip_modifiers(btf, args[i].type, NULL);
-		if (btf_type_is_scalar(t)) {
-			if (reg->type == SCALAR_VALUE)
-				continue;
-			bpf_log(log, "R%d is not a scalar\n", regno);
-			return -EINVAL;
-		}
-
-		if (!btf_type_is_ptr(t)) {
-			bpf_log(log, "Unrecognized arg#%d type %s\n",
-				i, btf_type_str(t));
-			return -EINVAL;
-		}
-
-		ref_t = btf_type_skip_modifiers(btf, t->type, &ref_id);
-		ref_tname = btf_name_by_offset(btf, ref_t->name_off);
-
-		ret = check_func_arg_reg_off(env, reg, regno, arg_type);
-		if (ret < 0)
-			return ret;
-
-		if (btf_get_prog_ctx_type(log, btf, t, prog_type, i)) {
-			/* If function expects ctx type in BTF check that caller
-			 * is passing PTR_TO_CTX.
-			 */
-			if (reg->type != PTR_TO_CTX) {
-				bpf_log(log,
-					"arg#%d expected pointer to ctx, but got %s\n",
-					i, btf_type_str(t));
-				return -EINVAL;
-			}
-		} else if (ptr_to_mem_ok) {
-			const struct btf_type *resolve_ret;
-			u32 type_size;
-
-			resolve_ret = btf_resolve_size(btf, ref_t, &type_size);
-			if (IS_ERR(resolve_ret)) {
-				bpf_log(log,
-					"arg#%d reference type('%s %s') size cannot be determined: %ld\n",
-					i, btf_type_str(ref_t), ref_tname,
-					PTR_ERR(resolve_ret));
-				return -EINVAL;
-			}
-
-			if (check_mem_reg(env, reg, regno, type_size))
-				return -EINVAL;
-		} else {
-			bpf_log(log, "reg type unsupported for arg#%d function %s#%d\n", i,
-				func_name, func_id);
-			return -EINVAL;
-		}
-	}
-
-	return 0;
-}
-
-/* Compare BTF of a function call with given bpf_reg_state.
- * Returns:
- * EFAULT - there is a verifier bug. Abort verification.
- * EINVAL - there is a type mismatch or BTF is not available.
- * 0 - BTF matches with what bpf_reg_state expects.
- * Only PTR_TO_CTX and SCALAR_VALUE states are recognized.
- */
-int btf_check_subprog_call(struct bpf_verifier_env *env, int subprog,
-			   struct bpf_reg_state *regs)
-{
-	struct bpf_prog *prog = env->prog;
-	struct btf *btf = prog->aux->btf;
-	bool is_global;
-	u32 btf_id;
-	int err;
-
-	if (!prog->aux->func_info)
-		return -EINVAL;
-
-	btf_id = prog->aux->func_info[subprog].type_id;
-	if (!btf_id)
-		return -EFAULT;
-
-	if (prog->aux->func_info_aux[subprog].unreliable)
-		return -EINVAL;
-
-	is_global = prog->aux->func_info_aux[subprog].linkage == BTF_FUNC_GLOBAL;
-	err = btf_check_func_arg_match(env, btf, btf_id, regs, is_global);
-
-	/* Compiler optimizations can remove arguments from static functions
-	 * or mismatched type can be passed into a global function.
-	 * In such cases mark the function as unreliable from BTF point of view.
-	 */
-	if (err)
-		prog->aux->func_info_aux[subprog].unreliable = true;
-	return err;
-}
-
 /* Process BTF of a function to produce high-level expectation of function
  * arguments (like ARG_PTR_TO_CTX, or ARG_PTR_TO_MEM, etc). This information
  * is cached in subprog info for reuse.
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 6c9ecb86a8d9..f48e49f2d482 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -5127,8 +5127,8 @@ static int __check_ptr_off_reg(struct bpf_verifier_env *env,
 	return 0;
 }
 
-int check_ptr_off_reg(struct bpf_verifier_env *env,
-		      const struct bpf_reg_state *reg, int regno)
+static int check_ptr_off_reg(struct bpf_verifier_env *env,
+		             const struct bpf_reg_state *reg, int regno)
 {
 	return __check_ptr_off_reg(env, reg, regno, false);
 }
@@ -7300,8 +7300,8 @@ static int check_mem_size_reg(struct bpf_verifier_env *env,
 	return err;
 }
 
-int check_mem_reg(struct bpf_verifier_env *env, struct bpf_reg_state *reg,
-		   u32 regno, u32 mem_size)
+static int check_mem_reg(struct bpf_verifier_env *env, struct bpf_reg_state *reg,
+			 u32 regno, u32 mem_size)
 {
 	bool may_be_null = type_may_be_null(reg->type);
 	struct bpf_reg_state saved_reg;
@@ -8286,9 +8286,9 @@ reg_find_field_offset(const struct bpf_reg_state *reg, s32 off, u32 fields)
 	return field;
 }
 
-int check_func_arg_reg_off(struct bpf_verifier_env *env,
-			   const struct bpf_reg_state *reg, int regno,
-			   enum bpf_arg_type arg_type)
+static int check_func_arg_reg_off(struct bpf_verifier_env *env,
+				  const struct bpf_reg_state *reg, int regno,
+				  enum bpf_arg_type arg_type)
 {
 	u32 type = reg->type;
 
@@ -9249,6 +9249,145 @@ err_out:
 	return err;
 }
 
+static int btf_check_func_arg_match(struct bpf_verifier_env *env,
+				    const struct btf *btf, u32 func_id,
+				    struct bpf_reg_state *regs,
+				    bool ptr_to_mem_ok)
+{
+	enum bpf_prog_type prog_type = resolve_prog_type(env->prog);
+	struct bpf_verifier_log *log = &env->log;
+	const char *func_name, *ref_tname;
+	const struct btf_type *t, *ref_t;
+	const struct btf_param *args;
+	u32 i, nargs, ref_id;
+	int ret;
+
+	t = btf_type_by_id(btf, func_id);
+	if (!t || !btf_type_is_func(t)) {
+		/* These checks were already done by the verifier while loading
+		 * struct bpf_func_info or in add_kfunc_call().
+		 */
+		bpf_log(log, "BTF of func_id %u doesn't point to KIND_FUNC\n",
+			func_id);
+		return -EFAULT;
+	}
+	func_name = btf_name_by_offset(btf, t->name_off);
+
+	t = btf_type_by_id(btf, t->type);
+	if (!t || !btf_type_is_func_proto(t)) {
+		bpf_log(log, "Invalid BTF of func %s\n", func_name);
+		return -EFAULT;
+	}
+	args = (const struct btf_param *)(t + 1);
+	nargs = btf_type_vlen(t);
+	if (nargs > MAX_BPF_FUNC_REG_ARGS) {
+		bpf_log(log, "Function %s has %d > %d args\n", func_name, nargs,
+			MAX_BPF_FUNC_REG_ARGS);
+		return -EINVAL;
+	}
+
+	/* check that BTF function arguments match actual types that the
+	 * verifier sees.
+	 */
+	for (i = 0; i < nargs; i++) {
+		enum bpf_arg_type arg_type = ARG_DONTCARE;
+		u32 regno = i + 1;
+		struct bpf_reg_state *reg = &regs[regno];
+
+		t = btf_type_skip_modifiers(btf, args[i].type, NULL);
+		if (btf_type_is_scalar(t)) {
+			if (reg->type == SCALAR_VALUE)
+				continue;
+			bpf_log(log, "R%d is not a scalar\n", regno);
+			return -EINVAL;
+		}
+
+		if (!btf_type_is_ptr(t)) {
+			bpf_log(log, "Unrecognized arg#%d type %s\n",
+				i, btf_type_str(t));
+			return -EINVAL;
+		}
+
+		ref_t = btf_type_skip_modifiers(btf, t->type, &ref_id);
+		ref_tname = btf_name_by_offset(btf, ref_t->name_off);
+
+		ret = check_func_arg_reg_off(env, reg, regno, arg_type);
+		if (ret < 0)
+			return ret;
+
+		if (btf_get_prog_ctx_type(log, btf, t, prog_type, i)) {
+			/* If function expects ctx type in BTF check that caller
+			 * is passing PTR_TO_CTX.
+			 */
+			if (reg->type != PTR_TO_CTX) {
+				bpf_log(log,
+					"arg#%d expected pointer to ctx, but got %s\n",
+					i, btf_type_str(t));
+				return -EINVAL;
+			}
+		} else if (ptr_to_mem_ok) {
+			const struct btf_type *resolve_ret;
+			u32 type_size;
+
+			resolve_ret = btf_resolve_size(btf, ref_t, &type_size);
+			if (IS_ERR(resolve_ret)) {
+				bpf_log(log,
+					"arg#%d reference type('%s %s') size cannot be determined: %ld\n",
+					i, btf_type_str(ref_t), ref_tname,
+					PTR_ERR(resolve_ret));
+				return -EINVAL;
+			}
+
+			if (check_mem_reg(env, reg, regno, type_size))
+				return -EINVAL;
+		} else {
+			bpf_log(log, "reg type unsupported for arg#%d function %s#%d\n", i,
+				func_name, func_id);
+			return -EINVAL;
+		}
+	}
+
+	return 0;
+}
+
+/* Compare BTF of a function call with given bpf_reg_state.
+ * Returns:
+ * EFAULT - there is a verifier bug. Abort verification.
+ * EINVAL - there is a type mismatch or BTF is not available.
+ * 0 - BTF matches with what bpf_reg_state expects.
+ * Only PTR_TO_CTX and SCALAR_VALUE states are recognized.
+ */
+static int btf_check_subprog_call(struct bpf_verifier_env *env, int subprog,
+			          struct bpf_reg_state *regs)
+{
+	struct bpf_prog *prog = env->prog;
+	struct btf *btf = prog->aux->btf;
+	bool is_global;
+	u32 btf_id;
+	int err;
+
+	if (!prog->aux->func_info)
+		return -EINVAL;
+
+	btf_id = prog->aux->func_info[subprog].type_id;
+	if (!btf_id)
+		return -EFAULT;
+
+	if (prog->aux->func_info_aux[subprog].unreliable)
+		return -EINVAL;
+
+	is_global = prog->aux->func_info_aux[subprog].linkage == BTF_FUNC_GLOBAL;
+	err = btf_check_func_arg_match(env, btf, btf_id, regs, is_global);
+
+	/* Compiler optimizations can remove arguments from static functions
+	 * or mismatched type can be passed into a global function.
+	 * In such cases mark the function as unreliable from BTF point of view.
+	 */
+	if (err)
+		prog->aux->func_info_aux[subprog].unreliable = true;
+	return err;
+}
+
 static int push_callback_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
 			      int insn_idx, int subprog,
 			      set_callee_state_fn set_callee_state_cb)
-- 
cgit v1.2.3


From 7437bb73f087e5f216f9c6603f5149d354e315af Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Sun, 17 Dec 2023 17:53:57 +0100
Subject: block: remove support for the host aware zone model

When zones were first added the SCSI and ATA specs, two different
models were supported (in addition to the drive managed one that
is invisible to the host):

 - host managed where non-conventional zones there is strict requirement
   to write at the write pointer, or else an error is returned
 - host aware where a write point is maintained if writes always happen
   at it, otherwise it is left in an under-defined state and the
   sequential write preferred zones behave like conventional zones
   (probably very badly performing ones, though)

Not surprisingly this lukewarm model didn't prove to be very useful and
was finally removed from the ZBC and SBC specs (NVMe never implemented
it).  Due to to the easily disappearing write pointer host software
could never rely on the write pointer to actually be useful for say
recovery.

Fortunately only a few HDD prototypes shipped using this model which
never made it to mass production.  Drop the support before it is too
late.  Note that any such host aware prototype HDD can still be used
with Linux as we'll now treat it as a conventional HDD.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Martin K. Petersen <martin.petersen@oracle.com>
Link: https://lore.kernel.org/r/20231217165359.604246-4-hch@lst.de
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/blk-settings.c           | 67 ++++++------------------------------------
 block/blk-sysfs.c              |  9 ++----
 block/partitions/core.c        | 12 ++------
 drivers/block/null_blk/zoned.c |  2 +-
 drivers/block/ublk_drv.c       |  2 +-
 drivers/block/virtio_blk.c     |  2 +-
 drivers/md/dm-kcopyd.c         |  2 +-
 drivers/md/dm-table.c          | 45 +++++++++++++---------------
 drivers/md/dm-zoned-metadata.c |  7 ++---
 drivers/md/dm-zoned-target.c   |  4 +--
 drivers/nvme/host/zns.c        |  2 +-
 drivers/scsi/scsi_debug.c      | 27 ++++++++---------
 drivers/scsi/sd.c              | 45 +++++++++++++---------------
 drivers/scsi/sd_zbc.c          | 16 +---------
 fs/btrfs/zoned.c               | 23 ++-------------
 fs/btrfs/zoned.h               |  2 +-
 fs/f2fs/data.c                 |  2 +-
 fs/f2fs/super.c                | 17 +++++------
 include/linux/blkdev.h         | 37 ++---------------------
 19 files changed, 92 insertions(+), 231 deletions(-)

(limited to 'include/linux')

diff --git a/block/blk-settings.c b/block/blk-settings.c
index 09e3a4d5e4d2..50e9efb59f67 100644
--- a/block/blk-settings.c
+++ b/block/blk-settings.c
@@ -56,7 +56,7 @@ void blk_set_default_limits(struct queue_limits *lim)
 	lim->alignment_offset = 0;
 	lim->io_opt = 0;
 	lim->misaligned = 0;
-	lim->zoned = BLK_ZONED_NONE;
+	lim->zoned = false;
 	lim->zone_write_granularity = 0;
 	lim->dma_alignment = 511;
 }
@@ -880,79 +880,30 @@ bool blk_queue_can_use_dma_map_merging(struct request_queue *q,
 }
 EXPORT_SYMBOL_GPL(blk_queue_can_use_dma_map_merging);
 
-static bool disk_has_partitions(struct gendisk *disk)
-{
-	unsigned long idx;
-	struct block_device *part;
-	bool ret = false;
-
-	rcu_read_lock();
-	xa_for_each(&disk->part_tbl, idx, part) {
-		if (bdev_is_partition(part)) {
-			ret = true;
-			break;
-		}
-	}
-	rcu_read_unlock();
-
-	return ret;
-}
-
 /**
  * disk_set_zoned - configure the zoned model for a disk
  * @disk:	the gendisk of the queue to configure
- * @model:	the zoned model to set
- *
- * Set the zoned model of @disk to @model.
+ * @zoned:	zoned or not.
  *
- * When @model is BLK_ZONED_HM (host managed), this should be called only
- * if zoned block device support is enabled (CONFIG_BLK_DEV_ZONED option).
- * If @model specifies BLK_ZONED_HA (host aware), the effective model used
- * depends on CONFIG_BLK_DEV_ZONED settings and on the existence of partitions
- * on the disk.
+ * When @zoned is %true, this should be called only if zoned block device
+ * support is enabled (CONFIG_BLK_DEV_ZONED option).
  */
-void disk_set_zoned(struct gendisk *disk, enum blk_zoned_model model)
+void disk_set_zoned(struct gendisk *disk, bool zoned)
 {
 	struct request_queue *q = disk->queue;
-	unsigned int old_model = q->limits.zoned;
 
-	switch (model) {
-	case BLK_ZONED_HM:
-		/*
-		 * Host managed devices are supported only if
-		 * CONFIG_BLK_DEV_ZONED is enabled.
-		 */
+	if (zoned) {
 		WARN_ON_ONCE(!IS_ENABLED(CONFIG_BLK_DEV_ZONED));
-		break;
-	case BLK_ZONED_HA:
-		/*
-		 * Host aware devices can be treated either as regular block
-		 * devices (similar to drive managed devices) or as zoned block
-		 * devices to take advantage of the zone command set, similarly
-		 * to host managed devices. We try the latter if there are no
-		 * partitions and zoned block device support is enabled, else
-		 * we do nothing special as far as the block layer is concerned.
-		 */
-		if (!IS_ENABLED(CONFIG_BLK_DEV_ZONED) ||
-		    disk_has_partitions(disk))
-			model = BLK_ZONED_NONE;
-		break;
-	case BLK_ZONED_NONE:
-	default:
-		if (WARN_ON_ONCE(model != BLK_ZONED_NONE))
-			model = BLK_ZONED_NONE;
-		break;
-	}
 
-	q->limits.zoned = model;
-	if (model != BLK_ZONED_NONE) {
 		/*
 		 * Set the zone write granularity to the device logical block
 		 * size by default. The driver can change this value if needed.
 		 */
+		q->limits.zoned = true;
 		blk_queue_zone_write_granularity(q,
 						queue_logical_block_size(q));
-	} else if (old_model != BLK_ZONED_NONE) {
+	} else if (q->limits.zoned) {
+		q->limits.zoned = false;
 		disk_clear_zone_settings(disk);
 	}
 }
diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
index 63e481262336..d5e669a401b0 100644
--- a/block/blk-sysfs.c
+++ b/block/blk-sysfs.c
@@ -309,14 +309,9 @@ QUEUE_SYSFS_BIT_FNS(stable_writes, STABLE_WRITES, 0);
 
 static ssize_t queue_zoned_show(struct request_queue *q, char *page)
 {
-	switch (blk_queue_zoned_model(q)) {
-	case BLK_ZONED_HA:
-		return sprintf(page, "host-aware\n");
-	case BLK_ZONED_HM:
+	if (blk_queue_is_zoned(q))
 		return sprintf(page, "host-managed\n");
-	default:
-		return sprintf(page, "none\n");
-	}
+	return sprintf(page, "none\n");
 }
 
 static ssize_t queue_nr_zones_show(struct request_queue *q, char *page)
diff --git a/block/partitions/core.c b/block/partitions/core.c
index f47ffcfdfcec..e6ac73617f3e 100644
--- a/block/partitions/core.c
+++ b/block/partitions/core.c
@@ -305,18 +305,10 @@ static struct block_device *add_partition(struct gendisk *disk, int partno,
 	 * Partitions are not supported on zoned block devices that are used as
 	 * such.
 	 */
-	switch (disk->queue->limits.zoned) {
-	case BLK_ZONED_HM:
+	if (bdev_is_zoned(disk->part0)) {
 		pr_warn("%s: partitions not supported on host managed zoned block device\n",
 			disk->disk_name);
 		return ERR_PTR(-ENXIO);
-	case BLK_ZONED_HA:
-		pr_info("%s: disabling host aware zoned block device support due to partitions\n",
-			disk->disk_name);
-		disk_set_zoned(disk, BLK_ZONED_NONE);
-		break;
-	case BLK_ZONED_NONE:
-		break;
 	}
 
 	if (xa_load(&disk->part_tbl, partno))
@@ -613,7 +605,7 @@ static int blk_add_partitions(struct gendisk *disk)
 	/*
 	 * Partitions are not supported on host managed zoned block devices.
 	 */
-	if (disk->queue->limits.zoned == BLK_ZONED_HM) {
+	if (bdev_is_zoned(disk->part0)) {
 		pr_warn("%s: ignoring partition table on host managed zoned block device\n",
 			disk->disk_name);
 		ret = 0;
diff --git a/drivers/block/null_blk/zoned.c b/drivers/block/null_blk/zoned.c
index 55c5b48bc276..369eb1e78bb5 100644
--- a/drivers/block/null_blk/zoned.c
+++ b/drivers/block/null_blk/zoned.c
@@ -159,7 +159,7 @@ int null_register_zoned_dev(struct nullb *nullb)
 	struct nullb_device *dev = nullb->dev;
 	struct request_queue *q = nullb->q;
 
-	disk_set_zoned(nullb->disk, BLK_ZONED_HM);
+	disk_set_zoned(nullb->disk, true);
 	blk_queue_flag_set(QUEUE_FLAG_ZONE_RESETALL, q);
 	blk_queue_required_elevator_features(q, ELEVATOR_F_ZBD_SEQ_WRITE);
 	blk_queue_chunk_sectors(q, dev->zone_size_sects);
diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c
index 83600b45e12a..24fb95f19d52 100644
--- a/drivers/block/ublk_drv.c
+++ b/drivers/block/ublk_drv.c
@@ -250,7 +250,7 @@ static int ublk_dev_param_zoned_apply(struct ublk_device *ub)
 {
 	const struct ublk_param_zoned *p = &ub->params.zoned;
 
-	disk_set_zoned(ub->ub_disk, BLK_ZONED_HM);
+	disk_set_zoned(ub->ub_disk, true);
 	blk_queue_flag_set(QUEUE_FLAG_ZONE_RESETALL, ub->ub_disk->queue);
 	blk_queue_required_elevator_features(ub->ub_disk->queue,
 					     ELEVATOR_F_ZBD_SEQ_WRITE);
diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c
index a28f1687066b..19a4f20bd1c2 100644
--- a/drivers/block/virtio_blk.c
+++ b/drivers/block/virtio_blk.c
@@ -730,7 +730,7 @@ static int virtblk_probe_zoned_device(struct virtio_device *vdev,
 
 	dev_dbg(&vdev->dev, "probing host-managed zoned device\n");
 
-	disk_set_zoned(vblk->disk, BLK_ZONED_HM);
+	disk_set_zoned(vblk->disk, true);
 	blk_queue_flag_set(QUEUE_FLAG_ZONE_RESETALL, q);
 
 	virtio_cread(vdev, struct virtio_blk_config,
diff --git a/drivers/md/dm-kcopyd.c b/drivers/md/dm-kcopyd.c
index d01807c50f20..36bcfdccae04 100644
--- a/drivers/md/dm-kcopyd.c
+++ b/drivers/md/dm-kcopyd.c
@@ -807,7 +807,7 @@ void dm_kcopyd_copy(struct dm_kcopyd_client *kc, struct dm_io_region *from,
 	 */
 	if (!(job->flags & BIT(DM_KCOPYD_WRITE_SEQ))) {
 		for (i = 0; i < job->num_dests; i++) {
-			if (bdev_zoned_model(dests[i].bdev) == BLK_ZONED_HM) {
+			if (bdev_is_zoned(dests[i].bdev)) {
 				job->flags |= BIT(DM_KCOPYD_WRITE_SEQ);
 				break;
 			}
diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c
index 198d38b53322..260b5b8f2b0d 100644
--- a/drivers/md/dm-table.c
+++ b/drivers/md/dm-table.c
@@ -1579,21 +1579,18 @@ bool dm_table_has_no_data_devices(struct dm_table *t)
 	return true;
 }
 
-static int device_not_zoned_model(struct dm_target *ti, struct dm_dev *dev,
-				  sector_t start, sector_t len, void *data)
+static int device_not_zoned(struct dm_target *ti, struct dm_dev *dev,
+			    sector_t start, sector_t len, void *data)
 {
-	struct request_queue *q = bdev_get_queue(dev->bdev);
-	enum blk_zoned_model *zoned_model = data;
+	bool *zoned = data;
 
-	return blk_queue_zoned_model(q) != *zoned_model;
+	return bdev_is_zoned(dev->bdev) != *zoned;
 }
 
 static int device_is_zoned_model(struct dm_target *ti, struct dm_dev *dev,
 				 sector_t start, sector_t len, void *data)
 {
-	struct request_queue *q = bdev_get_queue(dev->bdev);
-
-	return blk_queue_zoned_model(q) != BLK_ZONED_NONE;
+	return bdev_is_zoned(dev->bdev);
 }
 
 /*
@@ -1603,8 +1600,7 @@ static int device_is_zoned_model(struct dm_target *ti, struct dm_dev *dev,
  * has the DM_TARGET_MIXED_ZONED_MODEL feature set, the devices can have any
  * zoned model with all zoned devices having the same zone size.
  */
-static bool dm_table_supports_zoned_model(struct dm_table *t,
-					  enum blk_zoned_model zoned_model)
+static bool dm_table_supports_zoned(struct dm_table *t, bool zoned)
 {
 	for (unsigned int i = 0; i < t->num_targets; i++) {
 		struct dm_target *ti = dm_table_get_target(t, i);
@@ -1623,11 +1619,11 @@ static bool dm_table_supports_zoned_model(struct dm_table *t,
 
 		if (dm_target_supports_zoned_hm(ti->type)) {
 			if (!ti->type->iterate_devices ||
-			    ti->type->iterate_devices(ti, device_not_zoned_model,
-						      &zoned_model))
+			    ti->type->iterate_devices(ti, device_not_zoned,
+						      &zoned))
 				return false;
 		} else if (!dm_target_supports_mixed_zoned_model(ti->type)) {
-			if (zoned_model == BLK_ZONED_HM)
+			if (zoned)
 				return false;
 		}
 	}
@@ -1650,14 +1646,13 @@ static int device_not_matches_zone_sectors(struct dm_target *ti, struct dm_dev *
  * zone sectors, if the destination device is a zoned block device, it shall
  * have the specified zone_sectors.
  */
-static int validate_hardware_zoned_model(struct dm_table *t,
-					 enum blk_zoned_model zoned_model,
-					 unsigned int zone_sectors)
+static int validate_hardware_zoned(struct dm_table *t, bool zoned,
+				   unsigned int zone_sectors)
 {
-	if (zoned_model == BLK_ZONED_NONE)
+	if (!zoned)
 		return 0;
 
-	if (!dm_table_supports_zoned_model(t, zoned_model)) {
+	if (!dm_table_supports_zoned(t, zoned)) {
 		DMERR("%s: zoned model is not consistent across all devices",
 		      dm_device_name(t->md));
 		return -EINVAL;
@@ -1683,8 +1678,8 @@ int dm_calculate_queue_limits(struct dm_table *t,
 			      struct queue_limits *limits)
 {
 	struct queue_limits ti_limits;
-	enum blk_zoned_model zoned_model = BLK_ZONED_NONE;
 	unsigned int zone_sectors = 0;
+	bool zoned = false;
 
 	blk_set_stacking_limits(limits);
 
@@ -1706,12 +1701,12 @@ int dm_calculate_queue_limits(struct dm_table *t,
 		ti->type->iterate_devices(ti, dm_set_device_limits,
 					  &ti_limits);
 
-		if (zoned_model == BLK_ZONED_NONE && ti_limits.zoned != BLK_ZONED_NONE) {
+		if (!zoned && ti_limits.zoned) {
 			/*
 			 * After stacking all limits, validate all devices
 			 * in table support this zoned model and zone sectors.
 			 */
-			zoned_model = ti_limits.zoned;
+			zoned = ti_limits.zoned;
 			zone_sectors = ti_limits.chunk_sectors;
 		}
 
@@ -1744,18 +1739,18 @@ combine_limits:
 	 * Verify that the zoned model and zone sectors, as determined before
 	 * any .io_hints override, are the same across all devices in the table.
 	 * - this is especially relevant if .io_hints is emulating a disk-managed
-	 *   zoned model (aka BLK_ZONED_NONE) on host-managed zoned block devices.
+	 *   zoned model on host-managed zoned block devices.
 	 * BUT...
 	 */
-	if (limits->zoned != BLK_ZONED_NONE) {
+	if (limits->zoned) {
 		/*
 		 * ...IF the above limits stacking determined a zoned model
 		 * validate that all of the table's devices conform to it.
 		 */
-		zoned_model = limits->zoned;
+		zoned = limits->zoned;
 		zone_sectors = limits->chunk_sectors;
 	}
-	if (validate_hardware_zoned_model(t, zoned_model, zone_sectors))
+	if (validate_hardware_zoned(t, zoned, zone_sectors))
 		return -EINVAL;
 
 	return validate_hardware_logical_block_alignment(t, limits);
diff --git a/drivers/md/dm-zoned-metadata.c b/drivers/md/dm-zoned-metadata.c
index 60a4dc01ea18..fdfe30f7b697 100644
--- a/drivers/md/dm-zoned-metadata.c
+++ b/drivers/md/dm-zoned-metadata.c
@@ -2836,12 +2836,11 @@ static void dmz_print_dev(struct dmz_metadata *zmd, int num)
 {
 	struct dmz_dev *dev = &zmd->dev[num];
 
-	if (bdev_zoned_model(dev->bdev) == BLK_ZONED_NONE)
+	if (!bdev_is_zoned(dev->bdev))
 		dmz_dev_info(dev, "Regular block device");
 	else
-		dmz_dev_info(dev, "Host-%s zoned block device",
-			     bdev_zoned_model(dev->bdev) == BLK_ZONED_HA ?
-			     "aware" : "managed");
+		dmz_dev_info(dev, "Host-managed zoned block device");
+
 	if (zmd->sb_version > 1) {
 		sector_t sector_offset =
 			dev->zone_offset << zmd->zone_nr_sectors_shift;
diff --git a/drivers/md/dm-zoned-target.c b/drivers/md/dm-zoned-target.c
index b487f7acc860..621794a9edd6 100644
--- a/drivers/md/dm-zoned-target.c
+++ b/drivers/md/dm-zoned-target.c
@@ -702,7 +702,7 @@ static int dmz_get_zoned_device(struct dm_target *ti, char *path,
 	}
 
 	bdev = ddev->bdev;
-	if (bdev_zoned_model(bdev) == BLK_ZONED_NONE) {
+	if (!bdev_is_zoned(bdev)) {
 		if (nr_devs == 1) {
 			ti->error = "Invalid regular device";
 			goto err;
@@ -1010,7 +1010,7 @@ static void dmz_io_hints(struct dm_target *ti, struct queue_limits *limits)
 	limits->max_sectors = chunk_sectors;
 
 	/* We are exposing a drive-managed zoned block device */
-	limits->zoned = BLK_ZONED_NONE;
+	limits->zoned = false;
 }
 
 /*
diff --git a/drivers/nvme/host/zns.c b/drivers/nvme/host/zns.c
index ec8557810c21..6d4c440e97e2 100644
--- a/drivers/nvme/host/zns.c
+++ b/drivers/nvme/host/zns.c
@@ -108,7 +108,7 @@ int nvme_update_zone_info(struct nvme_ns *ns, unsigned lbaf)
 		goto free_data;
 	}
 
-	disk_set_zoned(ns->disk, BLK_ZONED_HM);
+	disk_set_zoned(ns->disk, true);
 	blk_queue_flag_set(QUEUE_FLAG_ZONE_RESETALL, q);
 	disk_set_max_open_zones(ns->disk, le32_to_cpu(id->mor) + 1);
 	disk_set_max_active_zones(ns->disk, le32_to_cpu(id->mar) + 1);
diff --git a/drivers/scsi/scsi_debug.c b/drivers/scsi/scsi_debug.c
index 6d8218a44122..d03d66f11493 100644
--- a/drivers/scsi/scsi_debug.c
+++ b/drivers/scsi/scsi_debug.c
@@ -339,7 +339,7 @@ struct sdebug_dev_info {
 	bool used;
 
 	/* For ZBC devices */
-	enum blk_zoned_model zmodel;
+	bool zoned;
 	unsigned int zcap;
 	unsigned int zsize;
 	unsigned int zsize_shift;
@@ -844,8 +844,11 @@ static bool write_since_sync;
 static bool sdebug_statistics = DEF_STATISTICS;
 static bool sdebug_wp;
 static bool sdebug_allow_restart;
-/* Following enum: 0: no zbc, def; 1: host aware; 2: host managed */
-static enum blk_zoned_model sdeb_zbc_model = BLK_ZONED_NONE;
+static enum {
+	BLK_ZONED_NONE	= 0,
+	BLK_ZONED_HA	= 1,
+	BLK_ZONED_HM	= 2,
+} sdeb_zbc_model = BLK_ZONED_NONE;
 static char *sdeb_zbc_model_s;
 
 enum sam_lun_addr_method {SAM_LUN_AM_PERIPHERAL = 0x0,
@@ -1815,8 +1818,6 @@ static int inquiry_vpd_b1(struct sdebug_dev_info *devip, unsigned char *arr)
 	arr[1] = 1;	/* non rotating medium (e.g. solid state) */
 	arr[2] = 0;
 	arr[3] = 5;	/* less than 1.8" */
-	if (devip->zmodel == BLK_ZONED_HA)
-		arr[4] = 1 << 4;	/* zoned field = 01b */
 
 	return 0x3c;
 }
@@ -1883,7 +1884,7 @@ static int resp_inquiry(struct scsi_cmnd *scp, struct sdebug_dev_info *devip)
 	if (! arr)
 		return DID_REQUEUE << 16;
 	is_disk = (sdebug_ptype == TYPE_DISK);
-	is_zbc = (devip->zmodel != BLK_ZONED_NONE);
+	is_zbc = devip->zoned;
 	is_disk_zbc = (is_disk || is_zbc);
 	have_wlun = scsi_is_wlun(scp->device->lun);
 	if (have_wlun)
@@ -2195,7 +2196,7 @@ static int resp_readcap16(struct scsi_cmnd *scp,
 	 * Since the scsi_debug READ CAPACITY implementation always reports the
 	 * total disk capacity, set RC BASIS = 1 for host-managed ZBC devices.
 	 */
-	if (devip->zmodel == BLK_ZONED_HM)
+	if (devip->zoned)
 		arr[12] |= 1 << 4;
 
 	arr[15] = sdebug_lowest_aligned & 0xff;
@@ -2648,7 +2649,7 @@ static int resp_mode_sense(struct scsi_cmnd *scp,
 	msense_6 = (MODE_SENSE == cmd[0]);
 	llbaa = msense_6 ? false : !!(cmd[1] & 0x10);
 	is_disk = (sdebug_ptype == TYPE_DISK);
-	is_zbc = (devip->zmodel != BLK_ZONED_NONE);
+	is_zbc = devip->zoned;
 	if ((is_disk || is_zbc) && !dbd)
 		bd_len = llbaa ? 16 : 8;
 	else
@@ -3194,8 +3195,6 @@ static int check_zbc_access_params(struct scsi_cmnd *scp,
 	struct sdeb_zone_state *zsp_end = zbc_zone(devip, lba + num - 1);
 
 	if (!write) {
-		if (devip->zmodel == BLK_ZONED_HA)
-			return 0;
 		/* For host-managed, reads cannot cross zone types boundaries */
 		if (zsp->z_type != zsp_end->z_type) {
 			mk_sense_buffer(scp, ILLEGAL_REQUEST,
@@ -5322,7 +5321,7 @@ static int sdebug_device_create_zones(struct sdebug_dev_info *devip)
 	if (devip->zcap < devip->zsize)
 		devip->nr_zones += devip->nr_seq_zones;
 
-	if (devip->zmodel == BLK_ZONED_HM) {
+	if (devip->zoned) {
 		/* zbc_max_open_zones can be 0, meaning "not reported" */
 		if (sdeb_zbc_max_open >= devip->nr_zones - 1)
 			devip->max_open = (devip->nr_zones - 1) / 2;
@@ -5347,7 +5346,7 @@ static int sdebug_device_create_zones(struct sdebug_dev_info *devip)
 			zsp->z_size =
 				min_t(u64, devip->zsize, capacity - zstart);
 		} else if ((zstart & (devip->zsize - 1)) == 0) {
-			if (devip->zmodel == BLK_ZONED_HM)
+			if (devip->zoned)
 				zsp->z_type = ZBC_ZTYPE_SWR;
 			else
 				zsp->z_type = ZBC_ZTYPE_SWP;
@@ -5390,13 +5389,13 @@ static struct sdebug_dev_info *sdebug_device_create(
 		}
 		devip->sdbg_host = sdbg_host;
 		if (sdeb_zbc_in_use) {
-			devip->zmodel = sdeb_zbc_model;
+			devip->zoned = sdeb_zbc_model == BLK_ZONED_HM;
 			if (sdebug_device_create_zones(devip)) {
 				kfree(devip);
 				return NULL;
 			}
 		} else {
-			devip->zmodel = BLK_ZONED_NONE;
+			devip->zoned = false;
 		}
 		devip->create_ts = ktime_get_boottime();
 		atomic_set(&devip->stopped, (sdeb_tur_ms_to_ready > 0 ? 2 : 0));
diff --git a/drivers/scsi/sd.c b/drivers/scsi/sd.c
index fa00dd503cbf..19a19eb277f5 100644
--- a/drivers/scsi/sd.c
+++ b/drivers/scsi/sd.c
@@ -3117,7 +3117,6 @@ static void sd_read_block_characteristics(struct scsi_disk *sdkp)
 	struct request_queue *q = sdkp->disk->queue;
 	struct scsi_vpd *vpd;
 	u16 rot;
-	u8 zoned;
 
 	rcu_read_lock();
 	vpd = rcu_dereference(sdkp->device->vpd_pgb1);
@@ -3128,7 +3127,7 @@ static void sd_read_block_characteristics(struct scsi_disk *sdkp)
 	}
 
 	rot = get_unaligned_be16(&vpd->data[4]);
-	zoned = (vpd->data[8] >> 4) & 3;
+	sdkp->zoned = (vpd->data[8] >> 4) & 3;
 	rcu_read_unlock();
 
 	if (rot == 1) {
@@ -3138,37 +3137,33 @@ static void sd_read_block_characteristics(struct scsi_disk *sdkp)
 
 	if (sdkp->device->type == TYPE_ZBC) {
 		/*
-		 * Host-managed: Per ZBC and ZAC specifications, writes in
-		 * sequential write required zones of host-managed devices must
-		 * be aligned to the device physical block size.
+		 * Host-managed.
+		 */
+		disk_set_zoned(sdkp->disk, true);
+
+		/*
+		 * Per ZBC and ZAC specifications, writes in sequential write
+		 * required zones of host-managed devices must be aligned to
+		 * the device physical block size.
 		 */
-		disk_set_zoned(sdkp->disk, BLK_ZONED_HM);
 		blk_queue_zone_write_granularity(q, sdkp->physical_block_size);
 	} else {
-		sdkp->zoned = zoned;
-		if (sdkp->zoned == 1) {
-			/* Host-aware */
-			disk_set_zoned(sdkp->disk, BLK_ZONED_HA);
-		} else {
-			/* Regular disk or drive managed disk */
-			disk_set_zoned(sdkp->disk, BLK_ZONED_NONE);
-		}
+		/*
+		 * Anything else.  This includes host-aware device that we treat
+		 * as conventional.
+		 */
+		disk_set_zoned(sdkp->disk, false);
 	}
 
 	if (!sdkp->first_scan)
 		return;
 
-	if (blk_queue_is_zoned(q)) {
-		sd_printk(KERN_NOTICE, sdkp, "Host-%s zoned block device\n",
-		      q->limits.zoned == BLK_ZONED_HM ? "managed" : "aware");
-	} else {
-		if (sdkp->zoned == 1)
-			sd_printk(KERN_NOTICE, sdkp,
-				  "Host-aware SMR disk used as regular disk\n");
-		else if (sdkp->zoned == 2)
-			sd_printk(KERN_NOTICE, sdkp,
-				  "Drive-managed SMR disk\n");
-	}
+	if (blk_queue_is_zoned(q))
+		sd_printk(KERN_NOTICE, sdkp, "Host-managed zoned block device\n");
+	else if (sdkp->zoned == 1)
+		sd_printk(KERN_NOTICE, sdkp, "Host-aware SMR disk used as regular disk\n");
+	else if (sdkp->zoned == 2)
+		sd_printk(KERN_NOTICE, sdkp, "Drive-managed SMR disk\n");
 }
 
 /**
diff --git a/drivers/scsi/sd_zbc.c b/drivers/scsi/sd_zbc.c
index a25215507668..26af5ab7d7c1 100644
--- a/drivers/scsi/sd_zbc.c
+++ b/drivers/scsi/sd_zbc.c
@@ -836,10 +836,7 @@ int sd_zbc_revalidate_zones(struct scsi_disk *sdkp)
 
 	/*
 	 * For all zoned disks, initialize zone append emulation data if not
-	 * already done. This is necessary also for host-aware disks used as
-	 * regular disks due to the presence of partitions as these partitions
-	 * may be deleted and the disk zoned model changed back from
-	 * BLK_ZONED_NONE to BLK_ZONED_HA.
+	 * already done.
 	 */
 	if (sd_is_zoned(sdkp) && !sdkp->zone_wp_update_buf) {
 		ret = sd_zbc_init_disk(sdkp);
@@ -932,17 +929,6 @@ int sd_zbc_read_zones(struct scsi_disk *sdkp, u8 buf[SD_BUF_SIZE])
 	sdkp->device->use_10_for_rw = 0;
 	sdkp->device->use_16_for_sync = 1;
 
-	if (!blk_queue_is_zoned(q)) {
-		/*
-		 * This can happen for a host aware disk with partitions.
-		 * The block device zone model was already cleared by
-		 * disk_set_zoned(). Only free the scsi disk zone
-		 * information and exit early.
-		 */
-		sd_zbc_free_zone_info(sdkp);
-		return 0;
-	}
-
 	/* Check zoned block device characteristics (unconstrained reads) */
 	ret = sd_zbc_check_zoned_characteristics(sdkp, buf);
 	if (ret)
diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c
index 188378ca19c7..23c1e6b19a65 100644
--- a/fs/btrfs/zoned.c
+++ b/fs/btrfs/zoned.c
@@ -578,26 +578,12 @@ int btrfs_get_dev_zone_info(struct btrfs_device *device, bool populate_cache)
 
 	kvfree(zones);
 
-	switch (bdev_zoned_model(bdev)) {
-	case BLK_ZONED_HM:
+	if (bdev_is_zoned(bdev)) {
 		model = "host-managed zoned";
 		emulated = "";
-		break;
-	case BLK_ZONED_HA:
-		model = "host-aware zoned";
-		emulated = "";
-		break;
-	case BLK_ZONED_NONE:
+	} else {
 		model = "regular";
 		emulated = "emulated ";
-		break;
-	default:
-		/* Just in case */
-		btrfs_err_in_rcu(fs_info, "zoned: unsupported model %d on %s",
-				 bdev_zoned_model(bdev),
-				 rcu_str_deref(device->name));
-		ret = -EOPNOTSUPP;
-		goto out_free_zone_info;
 	}
 
 	btrfs_info_in_rcu(fs_info,
@@ -609,9 +595,7 @@ int btrfs_get_dev_zone_info(struct btrfs_device *device, bool populate_cache)
 
 out:
 	kvfree(zones);
-out_free_zone_info:
 	btrfs_destroy_dev_zone_info(device);
-
 	return ret;
 }
 
@@ -688,8 +672,7 @@ static int btrfs_check_for_zoned_device(struct btrfs_fs_info *fs_info)
 	struct btrfs_device *device;
 
 	list_for_each_entry(device, &fs_info->fs_devices->devices, dev_list) {
-		if (device->bdev &&
-		    bdev_zoned_model(device->bdev) == BLK_ZONED_HM) {
+		if (device->bdev && bdev_is_zoned(device->bdev)) {
 			btrfs_err(fs_info,
 				"zoned: mode not enabled but zoned device found: %pg",
 				device->bdev);
diff --git a/fs/btrfs/zoned.h b/fs/btrfs/zoned.h
index b9cec523b778..bc1b540c1597 100644
--- a/fs/btrfs/zoned.h
+++ b/fs/btrfs/zoned.h
@@ -324,7 +324,7 @@ static inline bool btrfs_check_device_zone_type(const struct btrfs_fs_info *fs_i
 	}
 
 	/* Do not allow Host Manged zoned device */
-	return bdev_zoned_model(bdev) != BLK_ZONED_HM;
+	return !bdev_is_zoned(bdev);
 }
 
 static inline bool btrfs_check_super_location(struct btrfs_device *device, u64 pos)
diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c
index 4e42b5f24deb..9b62549a29ce 100644
--- a/fs/f2fs/data.c
+++ b/fs/f2fs/data.c
@@ -995,7 +995,7 @@ static bool is_end_zone_blkaddr(struct f2fs_sb_info *sbi, block_t blkaddr)
 		}
 		blkaddr -= FDEV(devi).start_blk;
 	}
-	return bdev_zoned_model(FDEV(devi).bdev) == BLK_ZONED_HM &&
+	return bdev_is_zoned(FDEV(devi).bdev) &&
 		f2fs_blkz_is_seq(sbi, devi, blkaddr) &&
 		(blkaddr % sbi->blocks_per_blkz == sbi->blocks_per_blkz - 1);
 }
diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c
index 033af907c3b1..850c87ae7d98 100644
--- a/fs/f2fs/super.c
+++ b/fs/f2fs/super.c
@@ -4282,24 +4282,21 @@ static int f2fs_scan_devices(struct f2fs_sb_info *sbi)
 			sbi->aligned_blksize = false;
 
 #ifdef CONFIG_BLK_DEV_ZONED
-		if (bdev_zoned_model(FDEV(i).bdev) == BLK_ZONED_HM &&
-				!f2fs_sb_has_blkzoned(sbi)) {
-			f2fs_err(sbi, "Zoned block device feature not enabled");
-			return -EINVAL;
-		}
-		if (bdev_zoned_model(FDEV(i).bdev) != BLK_ZONED_NONE) {
+		if (bdev_is_zoned(FDEV(i).bdev)) {
+			if (!f2fs_sb_has_blkzoned(sbi)) {
+				f2fs_err(sbi, "Zoned block device feature not enabled");
+				return -EINVAL;
+			}
 			if (init_blkz_info(sbi, i)) {
 				f2fs_err(sbi, "Failed to initialize F2FS blkzone information");
 				return -EINVAL;
 			}
 			if (max_devices == 1)
 				break;
-			f2fs_info(sbi, "Mount Device [%2d]: %20s, %8u, %8x - %8x (zone: %s)",
+			f2fs_info(sbi, "Mount Device [%2d]: %20s, %8u, %8x - %8x (zone: Host-managed)",
 				  i, FDEV(i).path,
 				  FDEV(i).total_segments,
-				  FDEV(i).start_blk, FDEV(i).end_blk,
-				  bdev_zoned_model(FDEV(i).bdev) == BLK_ZONED_HA ?
-				  "Host-aware" : "Host-managed");
+				  FDEV(i).start_blk, FDEV(i).end_blk);
 			continue;
 		}
 #endif
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 185ed3770e3a..28cda9fb239e 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -263,18 +263,6 @@ static inline bool blk_op_is_passthrough(blk_opf_t op)
 	return op == REQ_OP_DRV_IN || op == REQ_OP_DRV_OUT;
 }
 
-/*
- * Zoned block device models (zoned limit).
- *
- * Note: This needs to be ordered from the least to the most severe
- * restrictions for the inheritance in blk_stack_limits() to work.
- */
-enum blk_zoned_model {
-	BLK_ZONED_NONE = 0,	/* Regular block device */
-	BLK_ZONED_HA,		/* Host-aware zoned block device */
-	BLK_ZONED_HM,		/* Host-managed zoned block device */
-};
-
 /*
  * BLK_BOUNCE_NONE:	never bounce (default)
  * BLK_BOUNCE_HIGH:	bounce all highmem pages
@@ -316,7 +304,7 @@ struct queue_limits {
 	unsigned char		misaligned;
 	unsigned char		discard_misaligned;
 	unsigned char		raid_partial_stripes_expensive;
-	enum blk_zoned_model	zoned;
+	bool			zoned;
 
 	/*
 	 * Drivers that set dma_alignment to less than 511 must be prepared to
@@ -329,7 +317,7 @@ struct queue_limits {
 typedef int (*report_zones_cb)(struct blk_zone *zone, unsigned int idx,
 			       void *data);
 
-void disk_set_zoned(struct gendisk *disk, enum blk_zoned_model model);
+void disk_set_zoned(struct gendisk *disk, bool zoned);
 
 #define BLK_ALL_ZONES  ((unsigned int)-1)
 int blkdev_report_zones(struct block_device *bdev, sector_t sector,
@@ -617,23 +605,9 @@ static inline enum rpm_status queue_rpm_status(struct request_queue *q)
 }
 #endif
 
-static inline enum blk_zoned_model
-blk_queue_zoned_model(struct request_queue *q)
-{
-	if (IS_ENABLED(CONFIG_BLK_DEV_ZONED))
-		return q->limits.zoned;
-	return BLK_ZONED_NONE;
-}
-
 static inline bool blk_queue_is_zoned(struct request_queue *q)
 {
-	switch (blk_queue_zoned_model(q)) {
-	case BLK_ZONED_HA:
-	case BLK_ZONED_HM:
-		return true;
-	default:
-		return false;
-	}
+	return IS_ENABLED(CONFIG_BLK_DEV_ZONED) && q->limits.zoned;
 }
 
 #ifdef CONFIG_BLK_DEV_ZONED
@@ -1260,11 +1234,6 @@ static inline bool bdev_nowait(struct block_device *bdev)
 	return test_bit(QUEUE_FLAG_NOWAIT, &bdev_get_queue(bdev)->queue_flags);
 }
 
-static inline enum blk_zoned_model bdev_zoned_model(struct block_device *bdev)
-{
-	return blk_queue_zoned_model(bdev_get_queue(bdev));
-}
-
 static inline bool bdev_is_zoned(struct block_device *bdev)
 {
 	return blk_queue_is_zoned(bdev_get_queue(bdev));
-- 
cgit v1.2.3


From d73e93b4dfab10c80688b061c30048df05585c7e Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Sun, 17 Dec 2023 17:53:58 +0100
Subject: block: simplify disk_set_zoned

Only use disk_set_zoned to actually enable zoned device support.
For clearing it, call disk_clear_zoned, which is renamed from
disk_clear_zone_settings and now directly clears the zoned flag as
well.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Damien Le Moal <dlemoal@kernel.org>
Reviewed-by: Martin K. Petersen <martin.petersen@oracle.com>
Link: https://lore.kernel.org/r/20231217165359.604246-5-hch@lst.de
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/blk-settings.c           | 32 +++++++++++---------------------
 block/blk-zoned.c              |  3 ++-
 block/blk.h                    |  2 --
 drivers/block/null_blk/zoned.c |  2 +-
 drivers/block/ublk_drv.c       |  2 +-
 drivers/block/virtio_blk.c     |  2 +-
 drivers/nvme/host/zns.c        |  2 +-
 drivers/scsi/sd.c              |  7 +++++--
 include/linux/blkdev.h         |  3 ++-
 9 files changed, 24 insertions(+), 31 deletions(-)

(limited to 'include/linux')

diff --git a/block/blk-settings.c b/block/blk-settings.c
index 50e9efb59f67..bb94a3d471f4 100644
--- a/block/blk-settings.c
+++ b/block/blk-settings.c
@@ -881,31 +881,21 @@ bool blk_queue_can_use_dma_map_merging(struct request_queue *q,
 EXPORT_SYMBOL_GPL(blk_queue_can_use_dma_map_merging);
 
 /**
- * disk_set_zoned - configure the zoned model for a disk
- * @disk:	the gendisk of the queue to configure
- * @zoned:	zoned or not.
- *
- * When @zoned is %true, this should be called only if zoned block device
- * support is enabled (CONFIG_BLK_DEV_ZONED option).
+ * disk_set_zoned - inidicate a zoned device
+ * @disk:	gendisk to configure
  */
-void disk_set_zoned(struct gendisk *disk, bool zoned)
+void disk_set_zoned(struct gendisk *disk)
 {
 	struct request_queue *q = disk->queue;
 
-	if (zoned) {
-		WARN_ON_ONCE(!IS_ENABLED(CONFIG_BLK_DEV_ZONED));
-
-		/*
-		 * Set the zone write granularity to the device logical block
-		 * size by default. The driver can change this value if needed.
-		 */
-		q->limits.zoned = true;
-		blk_queue_zone_write_granularity(q,
-						queue_logical_block_size(q));
-	} else if (q->limits.zoned) {
-		q->limits.zoned = false;
-		disk_clear_zone_settings(disk);
-	}
+	WARN_ON_ONCE(!IS_ENABLED(CONFIG_BLK_DEV_ZONED));
+
+	/*
+	 * Set the zone write granularity to the device logical block
+	 * size by default. The driver can change this value if needed.
+	 */
+	q->limits.zoned = true;
+	blk_queue_zone_write_granularity(q, queue_logical_block_size(q));
 }
 EXPORT_SYMBOL_GPL(disk_set_zoned);
 
diff --git a/block/blk-zoned.c b/block/blk-zoned.c
index 619ee41a51cc..580a58e53efd 100644
--- a/block/blk-zoned.c
+++ b/block/blk-zoned.c
@@ -616,12 +616,13 @@ int blk_revalidate_disk_zones(struct gendisk *disk,
 }
 EXPORT_SYMBOL_GPL(blk_revalidate_disk_zones);
 
-void disk_clear_zone_settings(struct gendisk *disk)
+void disk_clear_zoned(struct gendisk *disk)
 {
 	struct request_queue *q = disk->queue;
 
 	blk_mq_freeze_queue(q);
 
+	q->limits.zoned = false;
 	disk_free_zone_bitmaps(disk);
 	blk_queue_flag_clear(QUEUE_FLAG_ZONE_RESETALL, q);
 	q->required_elevator_features &= ~ELEVATOR_F_ZBD_SEQ_WRITE;
diff --git a/block/blk.h b/block/blk.h
index 08a358bc0919..1ef920f72e0f 100644
--- a/block/blk.h
+++ b/block/blk.h
@@ -395,14 +395,12 @@ static inline struct bio *blk_queue_bounce(struct bio *bio,
 
 #ifdef CONFIG_BLK_DEV_ZONED
 void disk_free_zone_bitmaps(struct gendisk *disk);
-void disk_clear_zone_settings(struct gendisk *disk);
 int blkdev_report_zones_ioctl(struct block_device *bdev, unsigned int cmd,
 		unsigned long arg);
 int blkdev_zone_mgmt_ioctl(struct block_device *bdev, blk_mode_t mode,
 		unsigned int cmd, unsigned long arg);
 #else /* CONFIG_BLK_DEV_ZONED */
 static inline void disk_free_zone_bitmaps(struct gendisk *disk) {}
-static inline void disk_clear_zone_settings(struct gendisk *disk) {}
 static inline int blkdev_report_zones_ioctl(struct block_device *bdev,
 		unsigned int cmd, unsigned long arg)
 {
diff --git a/drivers/block/null_blk/zoned.c b/drivers/block/null_blk/zoned.c
index 369eb1e78bb5..6f5e0994862e 100644
--- a/drivers/block/null_blk/zoned.c
+++ b/drivers/block/null_blk/zoned.c
@@ -159,7 +159,7 @@ int null_register_zoned_dev(struct nullb *nullb)
 	struct nullb_device *dev = nullb->dev;
 	struct request_queue *q = nullb->q;
 
-	disk_set_zoned(nullb->disk, true);
+	disk_set_zoned(nullb->disk);
 	blk_queue_flag_set(QUEUE_FLAG_ZONE_RESETALL, q);
 	blk_queue_required_elevator_features(q, ELEVATOR_F_ZBD_SEQ_WRITE);
 	blk_queue_chunk_sectors(q, dev->zone_size_sects);
diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c
index 24fb95f19d52..d50d69b2c023 100644
--- a/drivers/block/ublk_drv.c
+++ b/drivers/block/ublk_drv.c
@@ -250,7 +250,7 @@ static int ublk_dev_param_zoned_apply(struct ublk_device *ub)
 {
 	const struct ublk_param_zoned *p = &ub->params.zoned;
 
-	disk_set_zoned(ub->ub_disk, true);
+	disk_set_zoned(ub->ub_disk);
 	blk_queue_flag_set(QUEUE_FLAG_ZONE_RESETALL, ub->ub_disk->queue);
 	blk_queue_required_elevator_features(ub->ub_disk->queue,
 					     ELEVATOR_F_ZBD_SEQ_WRITE);
diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c
index 19a4f20bd1c2..7d7a19b2b9a8 100644
--- a/drivers/block/virtio_blk.c
+++ b/drivers/block/virtio_blk.c
@@ -730,7 +730,7 @@ static int virtblk_probe_zoned_device(struct virtio_device *vdev,
 
 	dev_dbg(&vdev->dev, "probing host-managed zoned device\n");
 
-	disk_set_zoned(vblk->disk, true);
+	disk_set_zoned(vblk->disk);
 	blk_queue_flag_set(QUEUE_FLAG_ZONE_RESETALL, q);
 
 	virtio_cread(vdev, struct virtio_blk_config,
diff --git a/drivers/nvme/host/zns.c b/drivers/nvme/host/zns.c
index 6d4c440e97e2..3d98e435821e 100644
--- a/drivers/nvme/host/zns.c
+++ b/drivers/nvme/host/zns.c
@@ -108,7 +108,7 @@ int nvme_update_zone_info(struct nvme_ns *ns, unsigned lbaf)
 		goto free_data;
 	}
 
-	disk_set_zoned(ns->disk, true);
+	disk_set_zoned(ns->disk);
 	blk_queue_flag_set(QUEUE_FLAG_ZONE_RESETALL, q);
 	disk_set_max_open_zones(ns->disk, le32_to_cpu(id->mor) + 1);
 	disk_set_max_active_zones(ns->disk, le32_to_cpu(id->mar) + 1);
diff --git a/drivers/scsi/sd.c b/drivers/scsi/sd.c
index 19a19eb277f5..dbed075cdb98 100644
--- a/drivers/scsi/sd.c
+++ b/drivers/scsi/sd.c
@@ -3135,11 +3135,13 @@ static void sd_read_block_characteristics(struct scsi_disk *sdkp)
 		blk_queue_flag_clear(QUEUE_FLAG_ADD_RANDOM, q);
 	}
 
+
+#ifdef CONFIG_BLK_DEV_ZONED /* sd_probe rejects ZBD devices early otherwise */
 	if (sdkp->device->type == TYPE_ZBC) {
 		/*
 		 * Host-managed.
 		 */
-		disk_set_zoned(sdkp->disk, true);
+		disk_set_zoned(sdkp->disk);
 
 		/*
 		 * Per ZBC and ZAC specifications, writes in sequential write
@@ -3152,8 +3154,9 @@ static void sd_read_block_characteristics(struct scsi_disk *sdkp)
 		 * Anything else.  This includes host-aware device that we treat
 		 * as conventional.
 		 */
-		disk_set_zoned(sdkp->disk, false);
+		disk_clear_zoned(sdkp->disk);
 	}
+#endif /* CONFIG_BLK_DEV_ZONED */
 
 	if (!sdkp->first_scan)
 		return;
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 28cda9fb239e..bc236e77d85e 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -317,7 +317,8 @@ struct queue_limits {
 typedef int (*report_zones_cb)(struct blk_zone *zone, unsigned int idx,
 			       void *data);
 
-void disk_set_zoned(struct gendisk *disk, bool zoned);
+void disk_set_zoned(struct gendisk *disk);
+void disk_clear_zoned(struct gendisk *disk);
 
 #define BLK_ALL_ZONES  ((unsigned int)-1)
 int blkdev_report_zones(struct block_device *bdev, sector_t sector,
-- 
cgit v1.2.3


From 8be0c877fb3b671dac0cf56d1f1f9e65f9a9fb81 Mon Sep 17 00:00:00 2001
From: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Date: Tue, 19 Dec 2023 16:43:05 +0100
Subject: thunderbolt: make tb_bus_type const

Now that the driver core can properly handle constant struct bus_type,
move the tb_bus_type variable to be a constant structure as well,
placing it into read-only memory which can not be modified at runtime.

Cc: Andreas Noever <andreas.noever@gmail.com>
Cc: Michael Jamet <michael.jamet@intel.com>
Cc: Yehezkel Bernat <YehezkelShB@gmail.com>
Cc:  <linux-usb@vger.kernel.org>
Acked-by: Mika Westerberg <mika.westerberg@linux.intel.com>
Link: https://lore.kernel.org/r/2023121904-utopia-broadcast-06d1@gregkh
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/thunderbolt/domain.c | 2 +-
 include/linux/thunderbolt.h  | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/thunderbolt/domain.c b/drivers/thunderbolt/domain.c
index ec7b5f65804e..9fb1a64f3300 100644
--- a/drivers/thunderbolt/domain.c
+++ b/drivers/thunderbolt/domain.c
@@ -307,7 +307,7 @@ static const struct attribute_group *domain_attr_groups[] = {
 	NULL,
 };
 
-struct bus_type tb_bus_type = {
+const struct bus_type tb_bus_type = {
 	.name = "thunderbolt",
 	.match = tb_service_match,
 	.probe = tb_service_probe,
diff --git a/include/linux/thunderbolt.h b/include/linux/thunderbolt.h
index 6151c210d987..2c835e5c41f6 100644
--- a/include/linux/thunderbolt.h
+++ b/include/linux/thunderbolt.h
@@ -86,7 +86,7 @@ struct tb {
 	unsigned long privdata[];
 };
 
-extern struct bus_type tb_bus_type;
+extern const struct bus_type tb_bus_type;
 extern struct device_type tb_service_type;
 extern struct device_type tb_xdomain_type;
 
-- 
cgit v1.2.3


From 1a36e0f50f963465e9b2b980d250ab38b8fcd7a3 Mon Sep 17 00:00:00 2001
From: Ido Schimmel <idosch@nvidia.com>
Date: Sun, 17 Dec 2023 10:32:38 +0200
Subject: net: Add MDB bulk deletion device operation

Add MDB net device operation that will be invoked by rtnetlink code in
response to received 'RTM_DELMDB' messages with the 'NLM_F_BULK' flag
set. Subsequent patches will implement the operation in the bridge and
VXLAN drivers.

Signed-off-by: Ido Schimmel <idosch@nvidia.com>
Reviewed-by: Petr Machata <petrm@nvidia.com>
Acked-by: Nikolay Aleksandrov <razor@blackwall.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netdevice.h | 6 ++++++
 1 file changed, 6 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 1b935ee341b4..75c7725e5e4f 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -1329,6 +1329,9 @@ struct netdev_net_notifier {
  * int (*ndo_mdb_del)(struct net_device *dev, struct nlattr *tb[],
  *		      struct netlink_ext_ack *extack);
  *	Deletes the MDB entry from dev.
+ * int (*ndo_mdb_del_bulk)(struct net_device *dev, struct nlattr *tb[],
+ *			   struct netlink_ext_ack *extack);
+ *	Bulk deletes MDB entries from dev.
  * int (*ndo_mdb_dump)(struct net_device *dev, struct sk_buff *skb,
  *		       struct netlink_callback *cb);
  *	Dumps MDB entries from dev. The first argument (marker) in the netlink
@@ -1611,6 +1614,9 @@ struct net_device_ops {
 	int			(*ndo_mdb_del)(struct net_device *dev,
 					       struct nlattr *tb[],
 					       struct netlink_ext_ack *extack);
+	int			(*ndo_mdb_del_bulk)(struct net_device *dev,
+						    struct nlattr *tb[],
+						    struct netlink_ext_ack *extack);
 	int			(*ndo_mdb_dump)(struct net_device *dev,
 						struct sk_buff *skb,
 						struct netlink_callback *cb);
-- 
cgit v1.2.3


From 40ca4ee3136d2d09977d1cab8c0c0e1582c3359d Mon Sep 17 00:00:00 2001
From: Mimi Zohar <zohar@linux.ibm.com>
Date: Tue, 12 Dec 2023 06:12:43 -0500
Subject: evm: don't copy up 'security.evm' xattr

The security.evm HMAC and the original file signatures contain
filesystem specific data.  As a result, the HMAC and signature
are not the same on the stacked and backing filesystems.

Don't copy up 'security.evm'.

Reviewed-by: Amir Goldstein <amir73il@gmail.com>
Reviewed-by: Christian Brauner <brauner@kernel.org>
Signed-off-by: Mimi Zohar <zohar@linux.ibm.com>
---
 include/linux/evm.h               | 6 ++++++
 security/integrity/evm/evm_main.c | 7 +++++++
 security/security.c               | 2 +-
 3 files changed, 14 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/evm.h b/include/linux/evm.h
index 01fc495a83e2..36ec884320d9 100644
--- a/include/linux/evm.h
+++ b/include/linux/evm.h
@@ -31,6 +31,7 @@ extern void evm_inode_post_setxattr(struct dentry *dentry,
 				    const char *xattr_name,
 				    const void *xattr_value,
 				    size_t xattr_value_len);
+extern int evm_inode_copy_up_xattr(const char *name);
 extern int evm_inode_removexattr(struct mnt_idmap *idmap,
 				 struct dentry *dentry, const char *xattr_name);
 extern void evm_inode_post_removexattr(struct dentry *dentry,
@@ -117,6 +118,11 @@ static inline void evm_inode_post_setxattr(struct dentry *dentry,
 	return;
 }
 
+static inline int  evm_inode_copy_up_xattr(const char *name)
+{
+	return 0;
+}
+
 static inline int evm_inode_removexattr(struct mnt_idmap *idmap,
 					struct dentry *dentry,
 					const char *xattr_name)
diff --git a/security/integrity/evm/evm_main.c b/security/integrity/evm/evm_main.c
index 894570fe39bc..02adba635b02 100644
--- a/security/integrity/evm/evm_main.c
+++ b/security/integrity/evm/evm_main.c
@@ -863,6 +863,13 @@ void evm_inode_post_setattr(struct dentry *dentry, int ia_valid)
 		evm_update_evmxattr(dentry, NULL, NULL, 0);
 }
 
+int evm_inode_copy_up_xattr(const char *name)
+{
+	if (strcmp(name, XATTR_NAME_EVM) == 0)
+		return 1; /* Discard */
+	return -EOPNOTSUPP;
+}
+
 /*
  * evm_inode_init_security - initializes security.evm HMAC value
  */
diff --git a/security/security.c b/security/security.c
index dcb3e7014f9b..f00ec4d988b8 100644
--- a/security/security.c
+++ b/security/security.c
@@ -2539,7 +2539,7 @@ int security_inode_copy_up_xattr(const char *name)
 			return rc;
 	}
 
-	return LSM_RET_DEFAULT(inode_copy_up_xattr);
+	return evm_inode_copy_up_xattr(name);
 }
 EXPORT_SYMBOL(security_inode_copy_up_xattr);
 
-- 
cgit v1.2.3


From cd708c938f055c9eb5a366ec1c8edcefa28afc28 Mon Sep 17 00:00:00 2001
From: Mimi Zohar <zohar@linux.ibm.com>
Date: Mon, 18 Dec 2023 08:06:40 -0500
Subject: evm: add support to disable EVM on unsupported filesystems

Identify EVM unsupported filesystems by defining a new flag
SB_I_EVM_UNSUPPORTED.

Don't verify, write, remove or update 'security.evm' on unsupported
filesystems.

Acked-by: Amir Goldstein <amir73il@gmail.com>
Reviewed-by: Christian Brauner <brauner@kernel.org>
Signed-off-by: Mimi Zohar <zohar@linux.ibm.com>
---
 include/linux/fs.h                |  1 +
 security/integrity/evm/evm_main.c | 35 ++++++++++++++++++++++++++++++++++-
 2 files changed, 35 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/fs.h b/include/linux/fs.h
index 98b7a7a8c42e..1474f36e9b38 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1164,6 +1164,7 @@ extern int send_sigurg(struct fown_struct *fown);
 #define SB_I_USERNS_VISIBLE		0x00000010 /* fstype already mounted */
 #define SB_I_IMA_UNVERIFIABLE_SIGNATURE	0x00000020
 #define SB_I_UNTRUSTED_MOUNTER		0x00000040
+#define SB_I_EVM_UNSUPPORTED		0x00000080
 
 #define SB_I_SKIP_SYNC	0x00000100	/* Skip superblock at global sync */
 #define SB_I_PERSB_BDI	0x00000200	/* has a per-sb bdi */
diff --git a/security/integrity/evm/evm_main.c b/security/integrity/evm/evm_main.c
index 02adba635b02..cc7956d7878b 100644
--- a/security/integrity/evm/evm_main.c
+++ b/security/integrity/evm/evm_main.c
@@ -151,6 +151,17 @@ static int evm_find_protected_xattrs(struct dentry *dentry)
 	return count;
 }
 
+static int is_unsupported_fs(struct dentry *dentry)
+{
+	struct inode *inode = d_backing_inode(dentry);
+
+	if (inode->i_sb->s_iflags & SB_I_EVM_UNSUPPORTED) {
+		pr_info_once("%s not supported\n", inode->i_sb->s_type->name);
+		return 1;
+	}
+	return 0;
+}
+
 /*
  * evm_verify_hmac - calculate and compare the HMAC with the EVM xattr
  *
@@ -181,6 +192,9 @@ static enum integrity_status evm_verify_hmac(struct dentry *dentry,
 		     iint->evm_status == INTEGRITY_PASS_IMMUTABLE))
 		return iint->evm_status;
 
+	if (is_unsupported_fs(dentry))
+		return INTEGRITY_UNKNOWN;
+
 	/* if status is not PASS, try to check again - against -ENOMEM */
 
 	/* first need to know the sig type */
@@ -408,6 +422,9 @@ enum integrity_status evm_verifyxattr(struct dentry *dentry,
 	if (!evm_key_loaded() || !evm_protected_xattr(xattr_name))
 		return INTEGRITY_UNKNOWN;
 
+	if (is_unsupported_fs(dentry))
+		return INTEGRITY_UNKNOWN;
+
 	if (!iint) {
 		iint = integrity_iint_find(d_backing_inode(dentry));
 		if (!iint)
@@ -491,15 +508,21 @@ static int evm_protect_xattr(struct mnt_idmap *idmap,
 	if (strcmp(xattr_name, XATTR_NAME_EVM) == 0) {
 		if (!capable(CAP_SYS_ADMIN))
 			return -EPERM;
+		if (is_unsupported_fs(dentry))
+			return -EPERM;
 	} else if (!evm_protected_xattr(xattr_name)) {
 		if (!posix_xattr_acl(xattr_name))
 			return 0;
+		if (is_unsupported_fs(dentry))
+			return 0;
+
 		evm_status = evm_verify_current_integrity(dentry);
 		if ((evm_status == INTEGRITY_PASS) ||
 		    (evm_status == INTEGRITY_NOXATTRS))
 			return 0;
 		goto out;
-	}
+	} else if (is_unsupported_fs(dentry))
+		return 0;
 
 	evm_status = evm_verify_current_integrity(dentry);
 	if (evm_status == INTEGRITY_NOXATTRS) {
@@ -750,6 +773,9 @@ void evm_inode_post_setxattr(struct dentry *dentry, const char *xattr_name,
 	if (!(evm_initialized & EVM_INIT_HMAC))
 		return;
 
+	if (is_unsupported_fs(dentry))
+		return;
+
 	evm_update_evmxattr(dentry, xattr_name, xattr_value, xattr_value_len);
 }
 
@@ -814,8 +840,12 @@ int evm_inode_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
 	if (evm_initialized & EVM_ALLOW_METADATA_WRITES)
 		return 0;
 
+	if (is_unsupported_fs(dentry))
+		return 0;
+
 	if (!(ia_valid & (ATTR_MODE | ATTR_UID | ATTR_GID)))
 		return 0;
+
 	evm_status = evm_verify_current_integrity(dentry);
 	/*
 	 * Writing attrs is safe for portable signatures, as portable signatures
@@ -859,6 +889,9 @@ void evm_inode_post_setattr(struct dentry *dentry, int ia_valid)
 	if (!(evm_initialized & EVM_INIT_HMAC))
 		return;
 
+	if (is_unsupported_fs(dentry))
+		return;
+
 	if (ia_valid & (ATTR_MODE | ATTR_UID | ATTR_GID))
 		evm_update_evmxattr(dentry, NULL, NULL, 0);
 }
-- 
cgit v1.2.3


From 139f84002145d8624f0195fb090b3a7670744a13 Mon Sep 17 00:00:00 2001
From: "Tzvetomir Stoyanov (VMware)" <tz.stoyanov@gmail.com>
Date: Tue, 19 Dec 2023 13:54:16 -0500
Subject: ring-buffer: Page size per ring buffer

Currently the size of one sub buffer page is global for all buffers and
it is hard coded to one system page. In order to introduce configurable
ring buffer sub page size, the internal logic should be refactored to
work with sub page size per ring buffer.

Link: https://lore.kernel.org/linux-trace-devel/20211213094825.61876-3-tz.stoyanov@gmail.com
Link: https://lore.kernel.org/linux-trace-kernel/20231219185628.009147038@goodmis.org

Cc: Masami Hiramatsu <mhiramat@kernel.org>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Vincent Donnefort <vdonnefort@google.com>
Cc: Kent Overstreet <kent.overstreet@gmail.com>
Signed-off-by: Tzvetomir Stoyanov (VMware) <tz.stoyanov@gmail.com>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
 include/linux/ring_buffer.h |  2 +-
 kernel/trace/ring_buffer.c  | 68 +++++++++++++++++++++++++--------------------
 kernel/trace/trace.c        |  2 +-
 kernel/trace/trace.h        |  1 +
 kernel/trace/trace_events.c | 59 +++++++++++++++++++++++++++++----------
 5 files changed, 86 insertions(+), 46 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/ring_buffer.h b/include/linux/ring_buffer.h
index b1b03b2c0f08..ce46218ce46d 100644
--- a/include/linux/ring_buffer.h
+++ b/include/linux/ring_buffer.h
@@ -200,7 +200,7 @@ int ring_buffer_read_page(struct trace_buffer *buffer, void **data_page,
 struct trace_seq;
 
 int ring_buffer_print_entry_header(struct trace_seq *s);
-int ring_buffer_print_page_header(struct trace_seq *s);
+int ring_buffer_print_page_header(struct trace_buffer *buffer, struct trace_seq *s);
 
 enum ring_buffer_flags {
 	RB_FL_OVERWRITE		= 1 << 0,
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 2400c8e68fd3..d9f656502400 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -374,11 +374,6 @@ static inline bool test_time_stamp(u64 delta)
 	return !!(delta & TS_DELTA_TEST);
 }
 
-#define BUF_PAGE_SIZE (PAGE_SIZE - BUF_PAGE_HDR_SIZE)
-
-/* Max payload is BUF_PAGE_SIZE - header (8bytes) */
-#define BUF_MAX_DATA_SIZE (BUF_PAGE_SIZE - (sizeof(u32) * 2))
-
 struct rb_irq_work {
 	struct irq_work			work;
 	wait_queue_head_t		waiters;
@@ -510,6 +505,9 @@ struct trace_buffer {
 
 	struct rb_irq_work		irq_work;
 	bool				time_stamp_abs;
+
+	unsigned int			subbuf_size;
+	unsigned int			max_data_size;
 };
 
 struct ring_buffer_iter {
@@ -523,10 +521,11 @@ struct ring_buffer_iter {
 	u64				read_stamp;
 	u64				page_stamp;
 	struct ring_buffer_event	*event;
+	size_t				event_size;
 	int				missed_events;
 };
 
-int ring_buffer_print_page_header(struct trace_seq *s)
+int ring_buffer_print_page_header(struct trace_buffer *buffer, struct trace_seq *s)
 {
 	struct buffer_data_page field;
 
@@ -550,7 +549,7 @@ int ring_buffer_print_page_header(struct trace_seq *s)
 	trace_seq_printf(s, "\tfield: char data;\t"
 			 "offset:%u;\tsize:%u;\tsigned:%u;\n",
 			 (unsigned int)offsetof(typeof(field), data),
-			 (unsigned int)BUF_PAGE_SIZE,
+			 (unsigned int)buffer->subbuf_size,
 			 (unsigned int)is_signed_type(char));
 
 	return !trace_seq_has_overflowed(s);
@@ -1625,7 +1624,13 @@ struct trace_buffer *__ring_buffer_alloc(unsigned long size, unsigned flags,
 	if (!zalloc_cpumask_var(&buffer->cpumask, GFP_KERNEL))
 		goto fail_free_buffer;
 
-	nr_pages = DIV_ROUND_UP(size, BUF_PAGE_SIZE);
+	/* Default buffer page size - one system page */
+	buffer->subbuf_size = PAGE_SIZE - BUF_PAGE_HDR_SIZE;
+
+	/* Max payload is buffer page size - header (8bytes) */
+	buffer->max_data_size = buffer->subbuf_size - (sizeof(u32) * 2);
+
+	nr_pages = DIV_ROUND_UP(size, buffer->subbuf_size);
 	buffer->flags = flags;
 	buffer->clock = trace_clock_local;
 	buffer->reader_lock_key = key;
@@ -1944,7 +1949,7 @@ static void update_pages_handler(struct work_struct *work)
  * @size: the new size.
  * @cpu_id: the cpu buffer to resize
  *
- * Minimum size is 2 * BUF_PAGE_SIZE.
+ * Minimum size is 2 * buffer->subbuf_size.
  *
  * Returns 0 on success and < 0 on failure.
  */
@@ -1966,7 +1971,7 @@ int ring_buffer_resize(struct trace_buffer *buffer, unsigned long size,
 	    !cpumask_test_cpu(cpu_id, buffer->cpumask))
 		return 0;
 
-	nr_pages = DIV_ROUND_UP(size, BUF_PAGE_SIZE);
+	nr_pages = DIV_ROUND_UP(size, buffer->subbuf_size);
 
 	/* we need a minimum of two pages */
 	if (nr_pages < 2)
@@ -2213,7 +2218,7 @@ rb_iter_head_event(struct ring_buffer_iter *iter)
 	 */
 	barrier();
 
-	if ((iter->head + length) > commit || length > BUF_PAGE_SIZE)
+	if ((iter->head + length) > commit || length > iter->event_size)
 		/* Writer corrupted the read? */
 		goto reset;
 
@@ -2446,6 +2451,7 @@ static inline void
 rb_reset_tail(struct ring_buffer_per_cpu *cpu_buffer,
 	      unsigned long tail, struct rb_event_info *info)
 {
+	unsigned long bsize = READ_ONCE(cpu_buffer->buffer->subbuf_size);
 	struct buffer_page *tail_page = info->tail_page;
 	struct ring_buffer_event *event;
 	unsigned long length = info->length;
@@ -2454,13 +2460,13 @@ rb_reset_tail(struct ring_buffer_per_cpu *cpu_buffer,
 	 * Only the event that crossed the page boundary
 	 * must fill the old tail_page with padding.
 	 */
-	if (tail >= BUF_PAGE_SIZE) {
+	if (tail >= bsize) {
 		/*
 		 * If the page was filled, then we still need
 		 * to update the real_end. Reset it to zero
 		 * and the reader will ignore it.
 		 */
-		if (tail == BUF_PAGE_SIZE)
+		if (tail == bsize)
 			tail_page->real_end = 0;
 
 		local_sub(length, &tail_page->write);
@@ -2488,7 +2494,7 @@ rb_reset_tail(struct ring_buffer_per_cpu *cpu_buffer,
 	 * If we are less than the minimum size, we don't need to
 	 * worry about it.
 	 */
-	if (tail > (BUF_PAGE_SIZE - RB_EVNT_MIN_SIZE)) {
+	if (tail > (bsize - RB_EVNT_MIN_SIZE)) {
 		/* No room for any events */
 
 		/* Mark the rest of the page with padding */
@@ -2503,19 +2509,19 @@ rb_reset_tail(struct ring_buffer_per_cpu *cpu_buffer,
 	}
 
 	/* Put in a discarded event */
-	event->array[0] = (BUF_PAGE_SIZE - tail) - RB_EVNT_HDR_SIZE;
+	event->array[0] = (bsize - tail) - RB_EVNT_HDR_SIZE;
 	event->type_len = RINGBUF_TYPE_PADDING;
 	/* time delta must be non zero */
 	event->time_delta = 1;
 
 	/* account for padding bytes */
-	local_add(BUF_PAGE_SIZE - tail, &cpu_buffer->entries_bytes);
+	local_add(bsize - tail, &cpu_buffer->entries_bytes);
 
 	/* Make sure the padding is visible before the tail_page->write update */
 	smp_wmb();
 
 	/* Set write to end of buffer */
-	length = (tail + length) - BUF_PAGE_SIZE;
+	length = (tail + length) - bsize;
 	local_sub(length, &tail_page->write);
 }
 
@@ -3469,7 +3475,7 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
 	tail = write - info->length;
 
 	/* See if we shot pass the end of this buffer page */
-	if (unlikely(write > BUF_PAGE_SIZE)) {
+	if (unlikely(write > cpu_buffer->buffer->subbuf_size)) {
 		check_buffer(cpu_buffer, info, CHECK_FULL_PAGE);
 		return rb_move_tail(cpu_buffer, tail, info);
 	}
@@ -3600,7 +3606,7 @@ rb_reserve_next_event(struct trace_buffer *buffer,
 	if (ring_buffer_time_stamp_abs(cpu_buffer->buffer)) {
 		add_ts_default = RB_ADD_STAMP_ABSOLUTE;
 		info.length += RB_LEN_TIME_EXTEND;
-		if (info.length > BUF_MAX_DATA_SIZE)
+		if (info.length > cpu_buffer->buffer->max_data_size)
 			goto out_fail;
 	} else {
 		add_ts_default = RB_ADD_STAMP_NONE;
@@ -3675,7 +3681,7 @@ ring_buffer_lock_reserve(struct trace_buffer *buffer, unsigned long length)
 	if (unlikely(atomic_read(&cpu_buffer->record_disabled)))
 		goto out;
 
-	if (unlikely(length > BUF_MAX_DATA_SIZE))
+	if (unlikely(length > buffer->max_data_size))
 		goto out;
 
 	if (unlikely(trace_recursive_lock(cpu_buffer)))
@@ -3825,7 +3831,7 @@ int ring_buffer_write(struct trace_buffer *buffer,
 	if (atomic_read(&cpu_buffer->record_disabled))
 		goto out;
 
-	if (length > BUF_MAX_DATA_SIZE)
+	if (length > buffer->max_data_size)
 		goto out;
 
 	if (unlikely(trace_recursive_lock(cpu_buffer)))
@@ -4405,6 +4411,7 @@ static struct buffer_page *
 rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer)
 {
 	struct buffer_page *reader = NULL;
+	unsigned long bsize = READ_ONCE(cpu_buffer->buffer->subbuf_size);
 	unsigned long overwrite;
 	unsigned long flags;
 	int nr_loops = 0;
@@ -4540,7 +4547,7 @@ rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer)
 #define USECS_WAIT	1000000
         for (nr_loops = 0; nr_loops < USECS_WAIT; nr_loops++) {
 		/* If the write is past the end of page, a writer is still updating it */
-		if (likely(!reader || rb_page_write(reader) <= BUF_PAGE_SIZE))
+		if (likely(!reader || rb_page_write(reader) <= bsize))
 			break;
 
 		udelay(1);
@@ -4984,7 +4991,8 @@ ring_buffer_read_prepare(struct trace_buffer *buffer, int cpu, gfp_t flags)
 		return NULL;
 
 	/* Holds the entire event: data and meta data */
-	iter->event = kmalloc(BUF_PAGE_SIZE, flags);
+	iter->event_size = buffer->subbuf_size;
+	iter->event = kmalloc(iter->event_size, flags);
 	if (!iter->event) {
 		kfree(iter);
 		return NULL;
@@ -5102,14 +5110,14 @@ unsigned long ring_buffer_size(struct trace_buffer *buffer, int cpu)
 {
 	/*
 	 * Earlier, this method returned
-	 *	BUF_PAGE_SIZE * buffer->nr_pages
+	 *	buffer->subbuf_size * buffer->nr_pages
 	 * Since the nr_pages field is now removed, we have converted this to
 	 * return the per cpu buffer value.
 	 */
 	if (!cpumask_test_cpu(cpu, buffer->cpumask))
 		return 0;
 
-	return BUF_PAGE_SIZE * buffer->buffers[cpu]->nr_pages;
+	return buffer->subbuf_size * buffer->buffers[cpu]->nr_pages;
 }
 EXPORT_SYMBOL_GPL(ring_buffer_size);
 
@@ -5123,8 +5131,8 @@ unsigned long ring_buffer_max_event_size(struct trace_buffer *buffer)
 {
 	/* If abs timestamp is requested, events have a timestamp too */
 	if (ring_buffer_time_stamp_abs(buffer))
-		return BUF_MAX_DATA_SIZE - RB_LEN_TIME_EXTEND;
-	return BUF_MAX_DATA_SIZE;
+		return buffer->max_data_size - RB_LEN_TIME_EXTEND;
+	return buffer->max_data_size;
 }
 EXPORT_SYMBOL_GPL(ring_buffer_max_event_size);
 
@@ -5730,7 +5738,7 @@ int ring_buffer_read_page(struct trace_buffer *buffer,
 		/* If there is room at the end of the page to save the
 		 * missed events, then record it there.
 		 */
-		if (BUF_PAGE_SIZE - commit >= sizeof(missed_events)) {
+		if (buffer->subbuf_size - commit >= sizeof(missed_events)) {
 			memcpy(&bpage->data[commit], &missed_events,
 			       sizeof(missed_events));
 			local_add(RB_MISSED_STORED, &bpage->commit);
@@ -5742,8 +5750,8 @@ int ring_buffer_read_page(struct trace_buffer *buffer,
 	/*
 	 * This page may be off to user land. Zero it out here.
 	 */
-	if (commit < BUF_PAGE_SIZE)
-		memset(&bpage->data[commit], 0, BUF_PAGE_SIZE - commit);
+	if (commit < buffer->subbuf_size)
+		memset(&bpage->data[commit], 0, buffer->subbuf_size - commit);
 
  out_unlock:
 	raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 55dabee4c78b..76dd0a4c8cb5 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -5018,7 +5018,7 @@ static int tracing_release(struct inode *inode, struct file *file)
 	return 0;
 }
 
-static int tracing_release_generic_tr(struct inode *inode, struct file *file)
+int tracing_release_generic_tr(struct inode *inode, struct file *file)
 {
 	struct trace_array *tr = inode->i_private;
 
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 79180aed13ee..00f873910c5d 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -616,6 +616,7 @@ void tracing_reset_all_online_cpus(void);
 void tracing_reset_all_online_cpus_unlocked(void);
 int tracing_open_generic(struct inode *inode, struct file *filp);
 int tracing_open_generic_tr(struct inode *inode, struct file *filp);
+int tracing_release_generic_tr(struct inode *inode, struct file *file);
 int tracing_open_file_tr(struct inode *inode, struct file *filp);
 int tracing_release_file_tr(struct inode *inode, struct file *filp);
 int tracing_single_release_file_tr(struct inode *inode, struct file *filp);
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index b70d03818038..7c364b87352e 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -1893,9 +1893,9 @@ subsystem_filter_write(struct file *filp, const char __user *ubuf, size_t cnt,
 }
 
 static ssize_t
-show_header(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos)
+show_header_page_file(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos)
 {
-	int (*func)(struct trace_seq *s) = filp->private_data;
+	struct trace_array *tr = filp->private_data;
 	struct trace_seq *s;
 	int r;
 
@@ -1908,7 +1908,31 @@ show_header(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos)
 
 	trace_seq_init(s);
 
-	func(s);
+	ring_buffer_print_page_header(tr->array_buffer.buffer, s);
+	r = simple_read_from_buffer(ubuf, cnt, ppos,
+				    s->buffer, trace_seq_used(s));
+
+	kfree(s);
+
+	return r;
+}
+
+static ssize_t
+show_header_event_file(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos)
+{
+	struct trace_seq *s;
+	int r;
+
+	if (*ppos)
+		return 0;
+
+	s = kmalloc(sizeof(*s), GFP_KERNEL);
+	if (!s)
+		return -ENOMEM;
+
+	trace_seq_init(s);
+
+	ring_buffer_print_entry_header(s);
 	r = simple_read_from_buffer(ubuf, cnt, ppos,
 				    s->buffer, trace_seq_used(s));
 
@@ -2165,10 +2189,18 @@ static const struct file_operations ftrace_tr_enable_fops = {
 	.release = subsystem_release,
 };
 
-static const struct file_operations ftrace_show_header_fops = {
-	.open = tracing_open_generic,
-	.read = show_header,
+static const struct file_operations ftrace_show_header_page_fops = {
+	.open = tracing_open_generic_tr,
+	.read = show_header_page_file,
 	.llseek = default_llseek,
+	.release = tracing_release_generic_tr,
+};
+
+static const struct file_operations ftrace_show_header_event_fops = {
+	.open = tracing_open_generic_tr,
+	.read = show_header_event_file,
+	.llseek = default_llseek,
+	.release = tracing_release_generic_tr,
 };
 
 static int
@@ -3794,17 +3826,16 @@ static int events_callback(const char *name, umode_t *mode, void **data,
 		return 1;
 	}
 
-	if (strcmp(name, "header_page") == 0)
-		*data = ring_buffer_print_page_header;
-
-	else if (strcmp(name, "header_event") == 0)
-		*data = ring_buffer_print_entry_header;
+	if (strcmp(name, "header_page") == 0) {
+		*mode = TRACE_MODE_READ;
+		*fops = &ftrace_show_header_page_fops;
 
-	else
+	} else if (strcmp(name, "header_event") == 0) {
+		*mode = TRACE_MODE_READ;
+		*fops = &ftrace_show_header_event_fops;
+	} else
 		return 0;
 
-	*mode = TRACE_MODE_READ;
-	*fops = &ftrace_show_header_fops;
 	return 1;
 }
 
-- 
cgit v1.2.3


From 2808e31ec12e5fbe2ae25acc027fcdc67b1fb7f0 Mon Sep 17 00:00:00 2001
From: "Tzvetomir Stoyanov (VMware)" <tz.stoyanov@gmail.com>
Date: Tue, 19 Dec 2023 13:54:17 -0500
Subject: ring-buffer: Add interface for configuring trace sub buffer size

The trace ring buffer sub page size can be configured, per trace
instance. A new ftrace file "buffer_subbuf_order" is added to get and
set the size of the ring buffer sub page for current trace instance.
The size must be an order of system page size, that's why the new
interface works with system page order, instead of absolute page size:
0 means the ring buffer sub page is equal to 1 system page and so
forth:
0 - 1 system page
1 - 2 system pages
2 - 4 system pages
...
The ring buffer sub page size is limited between 1 and 128 system
pages. The default value is 1 system page.
New ring buffer APIs are introduced:
 ring_buffer_subbuf_order_set()
 ring_buffer_subbuf_order_get()
 ring_buffer_subbuf_size_get()

Link: https://lore.kernel.org/linux-trace-devel/20211213094825.61876-4-tz.stoyanov@gmail.com
Link: https://lore.kernel.org/linux-trace-kernel/20231219185628.298324722@goodmis.org

Cc: Masami Hiramatsu <mhiramat@kernel.org>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Vincent Donnefort <vdonnefort@google.com>
Cc: Kent Overstreet <kent.overstreet@gmail.com>
Signed-off-by: Tzvetomir Stoyanov (VMware) <tz.stoyanov@gmail.com>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
 include/linux/ring_buffer.h |  4 +++
 kernel/trace/ring_buffer.c  | 73 +++++++++++++++++++++++++++++++++++++++++++++
 kernel/trace/trace.c        | 48 +++++++++++++++++++++++++++++
 3 files changed, 125 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/ring_buffer.h b/include/linux/ring_buffer.h
index ce46218ce46d..12573306b889 100644
--- a/include/linux/ring_buffer.h
+++ b/include/linux/ring_buffer.h
@@ -202,6 +202,10 @@ struct trace_seq;
 int ring_buffer_print_entry_header(struct trace_seq *s);
 int ring_buffer_print_page_header(struct trace_buffer *buffer, struct trace_seq *s);
 
+int ring_buffer_subbuf_order_get(struct trace_buffer *buffer);
+int ring_buffer_subbuf_order_set(struct trace_buffer *buffer, int order);
+int ring_buffer_subbuf_size_get(struct trace_buffer *buffer);
+
 enum ring_buffer_flags {
 	RB_FL_OVERWRITE		= 1 << 0,
 };
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index d9f656502400..20fc0121735d 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -507,6 +507,7 @@ struct trace_buffer {
 	bool				time_stamp_abs;
 
 	unsigned int			subbuf_size;
+	unsigned int			subbuf_order;
 	unsigned int			max_data_size;
 };
 
@@ -5761,6 +5762,78 @@ int ring_buffer_read_page(struct trace_buffer *buffer,
 }
 EXPORT_SYMBOL_GPL(ring_buffer_read_page);
 
+/**
+ * ring_buffer_subbuf_size_get - get size of the sub buffer.
+ * @buffer: the buffer to get the sub buffer size from
+ *
+ * Returns size of the sub buffer, in bytes.
+ */
+int ring_buffer_subbuf_size_get(struct trace_buffer *buffer)
+{
+	return buffer->subbuf_size + BUF_PAGE_HDR_SIZE;
+}
+EXPORT_SYMBOL_GPL(ring_buffer_subbuf_size_get);
+
+/**
+ * ring_buffer_subbuf_order_get - get order of system sub pages in one buffer page.
+ * @buffer: The ring_buffer to get the system sub page order from
+ *
+ * By default, one ring buffer sub page equals to one system page. This parameter
+ * is configurable, per ring buffer. The size of the ring buffer sub page can be
+ * extended, but must be an order of system page size.
+ *
+ * Returns the order of buffer sub page size, in system pages:
+ * 0 means the sub buffer size is 1 system page and so forth.
+ * In case of an error < 0 is returned.
+ */
+int ring_buffer_subbuf_order_get(struct trace_buffer *buffer)
+{
+	if (!buffer)
+		return -EINVAL;
+
+	return buffer->subbuf_order;
+}
+EXPORT_SYMBOL_GPL(ring_buffer_subbuf_order_get);
+
+/**
+ * ring_buffer_subbuf_order_set - set the size of ring buffer sub page.
+ * @buffer: The ring_buffer to set the new page size.
+ * @order: Order of the system pages in one sub buffer page
+ *
+ * By default, one ring buffer pages equals to one system page. This API can be
+ * used to set new size of the ring buffer page. The size must be order of
+ * system page size, that's why the input parameter @order is the order of
+ * system pages that are allocated for one ring buffer page:
+ *  0 - 1 system page
+ *  1 - 2 system pages
+ *  3 - 4 system pages
+ *  ...
+ *
+ * Returns 0 on success or < 0 in case of an error.
+ */
+int ring_buffer_subbuf_order_set(struct trace_buffer *buffer, int order)
+{
+	int psize;
+
+	if (!buffer || order < 0)
+		return -EINVAL;
+
+	if (buffer->subbuf_order == order)
+		return 0;
+
+	psize = (1 << order) * PAGE_SIZE;
+	if (psize <= BUF_PAGE_HDR_SIZE)
+		return -EINVAL;
+
+	buffer->subbuf_order = order;
+	buffer->subbuf_size = psize - BUF_PAGE_HDR_SIZE;
+
+	/* Todo: reset the buffer with the new page size */
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(ring_buffer_subbuf_order_set);
+
 /*
  * We only allocate new buffers, never free them if the CPU goes down.
  * If we were to free the buffer, then the user would lose any trace that was in
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 76dd0a4c8cb5..a010aba4c4a4 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -9358,6 +9358,51 @@ static const struct file_operations buffer_percent_fops = {
 	.llseek		= default_llseek,
 };
 
+static ssize_t
+buffer_order_read(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos)
+{
+	struct trace_array *tr = filp->private_data;
+	char buf[64];
+	int r;
+
+	r = sprintf(buf, "%d\n", ring_buffer_subbuf_order_get(tr->array_buffer.buffer));
+
+	return simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
+}
+
+static ssize_t
+buffer_order_write(struct file *filp, const char __user *ubuf,
+		   size_t cnt, loff_t *ppos)
+{
+	struct trace_array *tr = filp->private_data;
+	unsigned long val;
+	int ret;
+
+	ret = kstrtoul_from_user(ubuf, cnt, 10, &val);
+	if (ret)
+		return ret;
+
+	/* limit between 1 and 128 system pages */
+	if (val < 0 || val > 7)
+		return -EINVAL;
+
+	ret = ring_buffer_subbuf_order_set(tr->array_buffer.buffer, val);
+	if (ret)
+		return ret;
+
+	(*ppos)++;
+
+	return cnt;
+}
+
+static const struct file_operations buffer_order_fops = {
+	.open		= tracing_open_generic_tr,
+	.read		= buffer_order_read,
+	.write		= buffer_order_write,
+	.release	= tracing_release_generic_tr,
+	.llseek		= default_llseek,
+};
+
 static struct dentry *trace_instance_dir;
 
 static void
@@ -9824,6 +9869,9 @@ init_tracer_tracefs(struct trace_array *tr, struct dentry *d_tracer)
 	trace_create_file("buffer_percent", TRACE_MODE_WRITE, d_tracer,
 			tr, &buffer_percent_fops);
 
+	trace_create_file("buffer_subbuf_order", TRACE_MODE_WRITE, d_tracer,
+			  tr, &buffer_order_fops);
+
 	create_trace_options_dir(tr);
 
 #ifdef CONFIG_TRACER_MAX_TRACE
-- 
cgit v1.2.3


From bce761d757452ba5eb77e11fecc37a04b67494e7 Mon Sep 17 00:00:00 2001
From: "Tzvetomir Stoyanov (VMware)" <tz.stoyanov@gmail.com>
Date: Tue, 19 Dec 2023 13:54:19 -0500
Subject: ring-buffer: Read and write to ring buffers with custom sub buffer
 size

As the size of the ring sub buffer page can be changed dynamically,
the logic that reads and writes to the buffer should be fixed to take
that into account. Some internal ring buffer APIs are changed:
 ring_buffer_alloc_read_page()
 ring_buffer_free_read_page()
 ring_buffer_read_page()
A new API is introduced:
 ring_buffer_read_page_data()

Link: https://lore.kernel.org/linux-trace-devel/20211213094825.61876-6-tz.stoyanov@gmail.com
Link: https://lore.kernel.org/linux-trace-kernel/20231219185628.875145995@goodmis.org

Cc: Masami Hiramatsu <mhiramat@kernel.org>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Vincent Donnefort <vdonnefort@google.com>
Cc: Kent Overstreet <kent.overstreet@gmail.com>
Signed-off-by: Tzvetomir Stoyanov (VMware) <tz.stoyanov@gmail.com>
[ Fixed kerneldoc on data_page parameter in ring_buffer_free_read_page() ]
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
 include/linux/ring_buffer.h          | 11 ++++--
 kernel/trace/ring_buffer.c           | 75 ++++++++++++++++++++++++++----------
 kernel/trace/ring_buffer_benchmark.c | 10 +++--
 kernel/trace/trace.c                 | 34 +++++++++-------
 4 files changed, 89 insertions(+), 41 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/ring_buffer.h b/include/linux/ring_buffer.h
index 12573306b889..fa802db216f9 100644
--- a/include/linux/ring_buffer.h
+++ b/include/linux/ring_buffer.h
@@ -192,10 +192,15 @@ bool ring_buffer_time_stamp_abs(struct trace_buffer *buffer);
 size_t ring_buffer_nr_pages(struct trace_buffer *buffer, int cpu);
 size_t ring_buffer_nr_dirty_pages(struct trace_buffer *buffer, int cpu);
 
-void *ring_buffer_alloc_read_page(struct trace_buffer *buffer, int cpu);
-void ring_buffer_free_read_page(struct trace_buffer *buffer, int cpu, void *data);
-int ring_buffer_read_page(struct trace_buffer *buffer, void **data_page,
+struct buffer_data_read_page;
+struct buffer_data_read_page *
+ring_buffer_alloc_read_page(struct trace_buffer *buffer, int cpu);
+void ring_buffer_free_read_page(struct trace_buffer *buffer, int cpu,
+				struct buffer_data_read_page *page);
+int ring_buffer_read_page(struct trace_buffer *buffer,
+			  struct buffer_data_read_page *data_page,
 			  size_t len, int cpu, int full);
+void *ring_buffer_read_page_data(struct buffer_data_read_page *page);
 
 struct trace_seq;
 
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index b928bd03dd0e..c2afcf98ea91 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -318,6 +318,11 @@ struct buffer_data_page {
 	unsigned char	 data[] RB_ALIGN_DATA;	/* data of buffer page */
 };
 
+struct buffer_data_read_page {
+	unsigned		order;	/* order of the page */
+	struct buffer_data_page	*data;	/* actual data, stored in this page */
+};
+
 /*
  * Note, the buffer_page list must be first. The buffer pages
  * are allocated in cache lines, which means that each buffer
@@ -5483,40 +5488,48 @@ EXPORT_SYMBOL_GPL(ring_buffer_swap_cpu);
  * Returns:
  *  The page allocated, or ERR_PTR
  */
-void *ring_buffer_alloc_read_page(struct trace_buffer *buffer, int cpu)
+struct buffer_data_read_page *
+ring_buffer_alloc_read_page(struct trace_buffer *buffer, int cpu)
 {
 	struct ring_buffer_per_cpu *cpu_buffer;
-	struct buffer_data_page *bpage = NULL;
+	struct buffer_data_read_page *bpage = NULL;
 	unsigned long flags;
 	struct page *page;
 
 	if (!cpumask_test_cpu(cpu, buffer->cpumask))
 		return ERR_PTR(-ENODEV);
 
+	bpage = kzalloc(sizeof(*bpage), GFP_KERNEL);
+	if (!bpage)
+		return ERR_PTR(-ENOMEM);
+
+	bpage->order = buffer->subbuf_order;
 	cpu_buffer = buffer->buffers[cpu];
 	local_irq_save(flags);
 	arch_spin_lock(&cpu_buffer->lock);
 
 	if (cpu_buffer->free_page) {
-		bpage = cpu_buffer->free_page;
+		bpage->data = cpu_buffer->free_page;
 		cpu_buffer->free_page = NULL;
 	}
 
 	arch_spin_unlock(&cpu_buffer->lock);
 	local_irq_restore(flags);
 
-	if (bpage)
+	if (bpage->data)
 		goto out;
 
 	page = alloc_pages_node(cpu_to_node(cpu), GFP_KERNEL | __GFP_NORETRY,
 				cpu_buffer->buffer->subbuf_order);
-	if (!page)
+	if (!page) {
+		kfree(bpage);
 		return ERR_PTR(-ENOMEM);
+	}
 
-	bpage = page_address(page);
+	bpage->data = page_address(page);
 
  out:
-	rb_init_page(bpage);
+	rb_init_page(bpage->data);
 
 	return bpage;
 }
@@ -5526,14 +5539,15 @@ EXPORT_SYMBOL_GPL(ring_buffer_alloc_read_page);
  * ring_buffer_free_read_page - free an allocated read page
  * @buffer: the buffer the page was allocate for
  * @cpu: the cpu buffer the page came from
- * @data: the page to free
+ * @data_page: the page to free
  *
  * Free a page allocated from ring_buffer_alloc_read_page.
  */
-void ring_buffer_free_read_page(struct trace_buffer *buffer, int cpu, void *data)
+void ring_buffer_free_read_page(struct trace_buffer *buffer, int cpu,
+				struct buffer_data_read_page *data_page)
 {
 	struct ring_buffer_per_cpu *cpu_buffer;
-	struct buffer_data_page *bpage = data;
+	struct buffer_data_page *bpage = data_page->data;
 	struct page *page = virt_to_page(bpage);
 	unsigned long flags;
 
@@ -5542,8 +5556,12 @@ void ring_buffer_free_read_page(struct trace_buffer *buffer, int cpu, void *data
 
 	cpu_buffer = buffer->buffers[cpu];
 
-	/* If the page is still in use someplace else, we can't reuse it */
-	if (page_ref_count(page) > 1)
+	/*
+	 * If the page is still in use someplace else, or order of the page
+	 * is different from the subbuffer order of the buffer -
+	 * we can't reuse it
+	 */
+	if (page_ref_count(page) > 1 || data_page->order != buffer->subbuf_order)
 		goto out;
 
 	local_irq_save(flags);
@@ -5558,7 +5576,8 @@ void ring_buffer_free_read_page(struct trace_buffer *buffer, int cpu, void *data
 	local_irq_restore(flags);
 
  out:
-	free_pages((unsigned long)bpage, buffer->subbuf_order);
+	free_pages((unsigned long)bpage, data_page->order);
+	kfree(data_page);
 }
 EXPORT_SYMBOL_GPL(ring_buffer_free_read_page);
 
@@ -5579,9 +5598,10 @@ EXPORT_SYMBOL_GPL(ring_buffer_free_read_page);
  *	rpage = ring_buffer_alloc_read_page(buffer, cpu);
  *	if (IS_ERR(rpage))
  *		return PTR_ERR(rpage);
- *	ret = ring_buffer_read_page(buffer, &rpage, len, cpu, 0);
+ *	ret = ring_buffer_read_page(buffer, rpage, len, cpu, 0);
  *	if (ret >= 0)
- *		process_page(rpage, ret);
+ *		process_page(ring_buffer_read_page_data(rpage), ret);
+ *	ring_buffer_free_read_page(buffer, cpu, rpage);
  *
  * When @full is set, the function will not return true unless
  * the writer is off the reader page.
@@ -5596,7 +5616,8 @@ EXPORT_SYMBOL_GPL(ring_buffer_free_read_page);
  *  <0 if no data has been transferred.
  */
 int ring_buffer_read_page(struct trace_buffer *buffer,
-			  void **data_page, size_t len, int cpu, int full)
+			  struct buffer_data_read_page *data_page,
+			  size_t len, int cpu, int full)
 {
 	struct ring_buffer_per_cpu *cpu_buffer = buffer->buffers[cpu];
 	struct ring_buffer_event *event;
@@ -5621,10 +5642,12 @@ int ring_buffer_read_page(struct trace_buffer *buffer,
 
 	len -= BUF_PAGE_HDR_SIZE;
 
-	if (!data_page)
+	if (!data_page || !data_page->data)
+		goto out;
+	if (data_page->order != buffer->subbuf_order)
 		goto out;
 
-	bpage = *data_page;
+	bpage = data_page->data;
 	if (!bpage)
 		goto out;
 
@@ -5718,11 +5741,11 @@ int ring_buffer_read_page(struct trace_buffer *buffer,
 		/* swap the pages */
 		rb_init_page(bpage);
 		bpage = reader->page;
-		reader->page = *data_page;
+		reader->page = data_page->data;
 		local_set(&reader->write, 0);
 		local_set(&reader->entries, 0);
 		reader->read = 0;
-		*data_page = bpage;
+		data_page->data = bpage;
 
 		/*
 		 * Use the real_end for the data size,
@@ -5767,6 +5790,18 @@ int ring_buffer_read_page(struct trace_buffer *buffer,
 }
 EXPORT_SYMBOL_GPL(ring_buffer_read_page);
 
+/**
+ * ring_buffer_read_page_data - get pointer to the data in the page.
+ * @page:  the page to get the data from
+ *
+ * Returns pointer to the actual data in this page.
+ */
+void *ring_buffer_read_page_data(struct buffer_data_read_page *page)
+{
+	return page->data;
+}
+EXPORT_SYMBOL_GPL(ring_buffer_read_page_data);
+
 /**
  * ring_buffer_subbuf_size_get - get size of the sub buffer.
  * @buffer: the buffer to get the sub buffer size from
diff --git a/kernel/trace/ring_buffer_benchmark.c b/kernel/trace/ring_buffer_benchmark.c
index aef34673d79d..008187ebd7fe 100644
--- a/kernel/trace/ring_buffer_benchmark.c
+++ b/kernel/trace/ring_buffer_benchmark.c
@@ -104,10 +104,11 @@ static enum event_status read_event(int cpu)
 
 static enum event_status read_page(int cpu)
 {
+	struct buffer_data_read_page *bpage;
 	struct ring_buffer_event *event;
 	struct rb_page *rpage;
 	unsigned long commit;
-	void *bpage;
+	int page_size;
 	int *entry;
 	int ret;
 	int inc;
@@ -117,14 +118,15 @@ static enum event_status read_page(int cpu)
 	if (IS_ERR(bpage))
 		return EVENT_DROPPED;
 
-	ret = ring_buffer_read_page(buffer, &bpage, PAGE_SIZE, cpu, 1);
+	page_size = ring_buffer_subbuf_size_get(buffer);
+	ret = ring_buffer_read_page(buffer, bpage, page_size, cpu, 1);
 	if (ret >= 0) {
-		rpage = bpage;
+		rpage = ring_buffer_read_page_data(bpage);
 		/* The commit may have missed event flags set, clear them */
 		commit = local_read(&rpage->commit) & 0xfffff;
 		for (i = 0; i < commit && !test_error ; i += inc) {
 
-			if (i >= (PAGE_SIZE - offsetof(struct rb_page, data))) {
+			if (i >= (page_size - offsetof(struct rb_page, data))) {
 				TEST_ERROR();
 				break;
 			}
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index a010aba4c4a4..711095aa731d 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -8286,6 +8286,8 @@ tracing_buffers_read(struct file *filp, char __user *ubuf,
 {
 	struct ftrace_buffer_info *info = filp->private_data;
 	struct trace_iterator *iter = &info->iter;
+	void *trace_data;
+	int page_size;
 	ssize_t ret = 0;
 	ssize_t size;
 
@@ -8297,6 +8299,8 @@ tracing_buffers_read(struct file *filp, char __user *ubuf,
 		return -EBUSY;
 #endif
 
+	page_size = ring_buffer_subbuf_size_get(iter->array_buffer->buffer);
+
 	if (!info->spare) {
 		info->spare = ring_buffer_alloc_read_page(iter->array_buffer->buffer,
 							  iter->cpu_file);
@@ -8311,13 +8315,13 @@ tracing_buffers_read(struct file *filp, char __user *ubuf,
 		return ret;
 
 	/* Do we have previous read data to read? */
-	if (info->read < PAGE_SIZE)
+	if (info->read < page_size)
 		goto read;
 
  again:
 	trace_access_lock(iter->cpu_file);
 	ret = ring_buffer_read_page(iter->array_buffer->buffer,
-				    &info->spare,
+				    info->spare,
 				    count,
 				    iter->cpu_file, 0);
 	trace_access_unlock(iter->cpu_file);
@@ -8338,11 +8342,11 @@ tracing_buffers_read(struct file *filp, char __user *ubuf,
 
 	info->read = 0;
  read:
-	size = PAGE_SIZE - info->read;
+	size = page_size - info->read;
 	if (size > count)
 		size = count;
-
-	ret = copy_to_user(ubuf, info->spare + info->read, size);
+	trace_data = ring_buffer_read_page_data(info->spare);
+	ret = copy_to_user(ubuf, trace_data + info->read, size);
 	if (ret == size)
 		return -EFAULT;
 
@@ -8453,6 +8457,7 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,
 		.spd_release	= buffer_spd_release,
 	};
 	struct buffer_ref *ref;
+	int page_size;
 	int entries, i;
 	ssize_t ret = 0;
 
@@ -8461,13 +8466,14 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,
 		return -EBUSY;
 #endif
 
-	if (*ppos & (PAGE_SIZE - 1))
+	page_size = ring_buffer_subbuf_size_get(iter->array_buffer->buffer);
+	if (*ppos & (page_size - 1))
 		return -EINVAL;
 
-	if (len & (PAGE_SIZE - 1)) {
-		if (len < PAGE_SIZE)
+	if (len & (page_size - 1)) {
+		if (len < page_size)
 			return -EINVAL;
-		len &= PAGE_MASK;
+		len &= (~(page_size - 1));
 	}
 
 	if (splice_grow_spd(pipe, &spd))
@@ -8477,7 +8483,7 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,
 	trace_access_lock(iter->cpu_file);
 	entries = ring_buffer_entries_cpu(iter->array_buffer->buffer, iter->cpu_file);
 
-	for (i = 0; i < spd.nr_pages_max && len && entries; i++, len -= PAGE_SIZE) {
+	for (i = 0; i < spd.nr_pages_max && len && entries; i++, len -= page_size) {
 		struct page *page;
 		int r;
 
@@ -8498,7 +8504,7 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,
 		}
 		ref->cpu = iter->cpu_file;
 
-		r = ring_buffer_read_page(ref->buffer, &ref->page,
+		r = ring_buffer_read_page(ref->buffer, ref->page,
 					  len, iter->cpu_file, 1);
 		if (r < 0) {
 			ring_buffer_free_read_page(ref->buffer, ref->cpu,
@@ -8507,14 +8513,14 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,
 			break;
 		}
 
-		page = virt_to_page(ref->page);
+		page = virt_to_page(ring_buffer_read_page_data(ref->page));
 
 		spd.pages[i] = page;
-		spd.partial[i].len = PAGE_SIZE;
+		spd.partial[i].len = page_size;
 		spd.partial[i].offset = 0;
 		spd.partial[i].private = (unsigned long)ref;
 		spd.nr_pages++;
-		*ppos += PAGE_SIZE;
+		*ppos += page_size;
 
 		entries = ring_buffer_entries_cpu(iter->array_buffer->buffer, iter->cpu_file);
 	}
-- 
cgit v1.2.3


From f3e25e68ceb2abaeefcac8f930c940c4494705d0 Mon Sep 17 00:00:00 2001
From: Uwe Kleine-König <u.kleine-koenig@pengutronix.de>
Date: Tue, 14 Nov 2023 12:20:11 +0100
Subject: pwm: Drop unused member "pwm" from struct pwm_device
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This member is only assigned to and never read. So drop it.

Signed-off-by: Uwe Kleine-König <u.kleine-koenig@pengutronix.de>
Signed-off-by: Thierry Reding <thierry.reding@gmail.com>
---
 drivers/pwm/core.c  | 1 -
 include/linux/pwm.h | 1 -
 2 files changed, 2 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/pwm/core.c b/drivers/pwm/core.c
index 29078486534d..24bb796d15f6 100644
--- a/drivers/pwm/core.c
+++ b/drivers/pwm/core.c
@@ -265,7 +265,6 @@ int __pwmchip_add(struct pwm_chip *chip, struct module *owner)
 		pwm = &chip->pwms[i];
 
 		pwm->chip = chip;
-		pwm->pwm = chip->base + i;
 		pwm->hwpwm = i;
 	}
 
diff --git a/include/linux/pwm.h b/include/linux/pwm.h
index cda3597b84f2..8cadf9ee8d26 100644
--- a/include/linux/pwm.h
+++ b/include/linux/pwm.h
@@ -79,7 +79,6 @@ struct pwm_device {
 	const char *label;
 	unsigned long flags;
 	unsigned int hwpwm;
-	unsigned int pwm;
 	struct pwm_chip *chip;
 
 	struct pwm_args args;
-- 
cgit v1.2.3


From 54c86dd20bba23109e32e4e2f94ff93dd9863bc3 Mon Sep 17 00:00:00 2001
From: Uwe Kleine-König <u.kleine-koenig@pengutronix.de>
Date: Tue, 14 Nov 2023 12:20:12 +0100
Subject: pwm: Replace PWM chip unique base by unique ID
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Traditionally each PWM device had a unique ID stored in the "pwm" member
of struct pwm_device. However this number was hardly used and dropped
in the previous commit. To identify a certain PWM you're supposed to use
the chip's ID and the hwpwm of the PWM device now.

With the PWM chip base gone PWM chips can get their IDs better and
simpler using an idr.

This is expected to change the numbering of PWM chips, but nothing
should rely on the numbering anyhow.

Other than that the side effects are:

 - The PWM chip IDs are smaller and in most cases consecutive.
 - The ordering in /sys/kernel/debug/pwm is ordered by ascending PWM
   chip ID.

Signed-off-by: Uwe Kleine-König <u.kleine-koenig@pengutronix.de>
Signed-off-by: Thierry Reding <thierry.reding@gmail.com>
---
 drivers/pwm/core.c  | 67 ++++++++++++++++++++---------------------------------
 drivers/pwm/sysfs.c |  2 +-
 include/linux/pwm.h |  3 +--
 3 files changed, 27 insertions(+), 45 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/pwm/core.c b/drivers/pwm/core.c
index 24bb796d15f6..ebe404dfdf5b 100644
--- a/drivers/pwm/core.c
+++ b/drivers/pwm/core.c
@@ -8,6 +8,7 @@
 
 #include <linux/acpi.h>
 #include <linux/module.h>
+#include <linux/idr.h>
 #include <linux/of.h>
 #include <linux/pwm.h>
 #include <linux/list.h>
@@ -23,52 +24,25 @@
 #define CREATE_TRACE_POINTS
 #include <trace/events/pwm.h>
 
-#define MAX_PWMS 1024
-
 static DEFINE_MUTEX(pwm_lookup_lock);
 static LIST_HEAD(pwm_lookup_list);
 
-/* protects access to pwm_chips and allocated_pwms */
+/* protects access to pwm_chips */
 static DEFINE_MUTEX(pwm_lock);
 
-static LIST_HEAD(pwm_chips);
-static DECLARE_BITMAP(allocated_pwms, MAX_PWMS);
-
-/* Called with pwm_lock held */
-static int alloc_pwms(unsigned int count)
-{
-	unsigned int start;
-
-	start = bitmap_find_next_zero_area(allocated_pwms, MAX_PWMS, 0,
-					   count, 0);
-
-	if (start + count > MAX_PWMS)
-		return -ENOSPC;
-
-	bitmap_set(allocated_pwms, start, count);
-
-	return start;
-}
-
-/* Called with pwm_lock held */
-static void free_pwms(struct pwm_chip *chip)
-{
-	bitmap_clear(allocated_pwms, chip->base, chip->npwm);
-
-	kfree(chip->pwms);
-	chip->pwms = NULL;
-}
+static DEFINE_IDR(pwm_chips);
 
 static struct pwm_chip *pwmchip_find_by_name(const char *name)
 {
 	struct pwm_chip *chip;
+	unsigned long id, tmp;
 
 	if (!name)
 		return NULL;
 
 	mutex_lock(&pwm_lock);
 
-	list_for_each_entry(chip, &pwm_chips, list) {
+	idr_for_each_entry_ul(&pwm_chips, chip, tmp, id) {
 		const char *chip_name = dev_name(chip->dev);
 
 		if (chip_name && strcmp(chip_name, name) == 0) {
@@ -252,14 +226,14 @@ int __pwmchip_add(struct pwm_chip *chip, struct module *owner)
 
 	mutex_lock(&pwm_lock);
 
-	ret = alloc_pwms(chip->npwm);
+	ret = idr_alloc(&pwm_chips, chip, 0, 0, GFP_KERNEL);
 	if (ret < 0) {
 		mutex_unlock(&pwm_lock);
 		kfree(chip->pwms);
 		return ret;
 	}
 
-	chip->base = ret;
+	chip->id = ret;
 
 	for (i = 0; i < chip->npwm; i++) {
 		pwm = &chip->pwms[i];
@@ -268,8 +242,6 @@ int __pwmchip_add(struct pwm_chip *chip, struct module *owner)
 		pwm->hwpwm = i;
 	}
 
-	list_add(&chip->list, &pwm_chips);
-
 	mutex_unlock(&pwm_lock);
 
 	if (IS_ENABLED(CONFIG_OF))
@@ -296,11 +268,11 @@ void pwmchip_remove(struct pwm_chip *chip)
 
 	mutex_lock(&pwm_lock);
 
-	list_del_init(&chip->list);
-
-	free_pwms(chip);
+	idr_remove(&pwm_chips, chip->id);
 
 	mutex_unlock(&pwm_lock);
+
+	kfree(chip->pwms);
 }
 EXPORT_SYMBOL_GPL(pwmchip_remove);
 
@@ -596,10 +568,11 @@ EXPORT_SYMBOL_GPL(pwm_adjust_config);
 static struct pwm_chip *fwnode_to_pwmchip(struct fwnode_handle *fwnode)
 {
 	struct pwm_chip *chip;
+	unsigned long id, tmp;
 
 	mutex_lock(&pwm_lock);
 
-	list_for_each_entry(chip, &pwm_chips, list)
+	idr_for_each_entry_ul(&pwm_chips, chip, tmp, id)
 		if (chip->dev && device_match_fwnode(chip->dev, fwnode)) {
 			mutex_unlock(&pwm_lock);
 			return chip;
@@ -1057,17 +1030,27 @@ static void pwm_dbg_show(struct pwm_chip *chip, struct seq_file *s)
 
 static void *pwm_seq_start(struct seq_file *s, loff_t *pos)
 {
+	unsigned long id = *pos;
+	void *ret;
+
 	mutex_lock(&pwm_lock);
 	s->private = "";
 
-	return seq_list_start(&pwm_chips, *pos);
+	ret = idr_get_next_ul(&pwm_chips, &id);
+	*pos = id;
+	return ret;
 }
 
 static void *pwm_seq_next(struct seq_file *s, void *v, loff_t *pos)
 {
+	unsigned long id = *pos + 1;
+	void *ret;
+
 	s->private = "\n";
 
-	return seq_list_next(v, &pwm_chips, pos);
+	ret = idr_get_next_ul(&pwm_chips, &id);
+	*pos = id;
+	return ret;
 }
 
 static void pwm_seq_stop(struct seq_file *s, void *v)
@@ -1077,7 +1060,7 @@ static void pwm_seq_stop(struct seq_file *s, void *v)
 
 static int pwm_seq_show(struct seq_file *s, void *v)
 {
-	struct pwm_chip *chip = list_entry(v, struct pwm_chip, list);
+	struct pwm_chip *chip = v;
 
 	seq_printf(s, "%s%s/%s, %d PWM device%s\n", (char *)s->private,
 		   chip->dev->bus ? chip->dev->bus->name : "no-bus",
diff --git a/drivers/pwm/sysfs.c b/drivers/pwm/sysfs.c
index 8d1254761e4d..4edb994fa2e1 100644
--- a/drivers/pwm/sysfs.c
+++ b/drivers/pwm/sysfs.c
@@ -510,7 +510,7 @@ void pwmchip_sysfs_export(struct pwm_chip *chip)
 	 * the kernel it's just not exported.
 	 */
 	parent = device_create(&pwm_class, chip->dev, MKDEV(0, 0), chip,
-			       "pwmchip%d", chip->base);
+			       "pwmchip%d", chip->id);
 	if (IS_ERR(parent)) {
 		dev_warn(chip->dev,
 			 "device_create failed for pwm_chip sysfs export\n");
diff --git a/include/linux/pwm.h b/include/linux/pwm.h
index 8cadf9ee8d26..c27a4bb76012 100644
--- a/include/linux/pwm.h
+++ b/include/linux/pwm.h
@@ -292,7 +292,7 @@ struct pwm_chip {
 	struct device *dev;
 	const struct pwm_ops *ops;
 	struct module *owner;
-	int base;
+	unsigned int id;
 	unsigned int npwm;
 
 	struct pwm_device * (*of_xlate)(struct pwm_chip *chip,
@@ -300,7 +300,6 @@ struct pwm_chip {
 	unsigned int of_pwm_n_cells;
 
 	/* only used internally by the PWM framework */
-	struct list_head list;
 	struct pwm_device *pwms;
 };
 
-- 
cgit v1.2.3


From 2d91123ae5614b8737abd3a519b81265309a1ac3 Mon Sep 17 00:00:00 2001
From: Uwe Kleine-König <u.kleine-koenig@pengutronix.de>
Date: Wed, 29 Nov 2023 11:18:32 +0100
Subject: pwm: Update kernel doc for struct pwm_chip
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Commit c572f3b9c8b7 ("pwm: Replace PWM chip unique base by unique ID")
changed the members of struct pwm_chip, but failed to update the
documentation accordingly. Catch up and document the new member and drop
description for the two removed ones.

Reported-by: Stephen Rothwell <sfr@canb.auug.org.au>
Signed-off-by: Uwe Kleine-König <u.kleine-koenig@pengutronix.de>
Signed-off-by: Thierry Reding <thierry.reding@gmail.com>
---
 include/linux/pwm.h | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/pwm.h b/include/linux/pwm.h
index c27a4bb76012..f87655c06c82 100644
--- a/include/linux/pwm.h
+++ b/include/linux/pwm.h
@@ -281,11 +281,10 @@ struct pwm_ops {
  * @dev: device providing the PWMs
  * @ops: callbacks for this PWM controller
  * @owner: module providing this chip
- * @base: number of first PWM controlled by this chip
+ * @id: unique number of this PWM chip
  * @npwm: number of PWMs controlled by this chip
  * @of_xlate: request a PWM device given a device tree PWM specifier
  * @of_pwm_n_cells: number of cells expected in the device tree PWM specifier
- * @list: list node for internal use
  * @pwms: array of PWM devices allocated by the framework
  */
 struct pwm_chip {
-- 
cgit v1.2.3


From c748a6d77c06a78651030e17da6beb278a1c9470 Mon Sep 17 00:00:00 2001
From: Sean Young <sean@mess.org>
Date: Tue, 19 Dec 2023 16:30:24 +0000
Subject: pwm: Rename pwm_apply_state() to pwm_apply_might_sleep()
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

In order to introduce a pwm api which can be used from atomic context,
we will need two functions for applying pwm changes:

	int pwm_apply_might_sleep(struct pwm *, struct pwm_state *);
	int pwm_apply_atomic(struct pwm *, struct pwm_state *);

This commit just deals with renaming pwm_apply_state(), a following
commit will introduce the pwm_apply_atomic() function.

Acked-by: Uwe Kleine-König <u.kleine-koenig@pengutronix.de>
Acked-by: Guenter Roeck <linux@roeck-us.net>
Acked-by: Mark Brown <broonie@kernel.org>
Acked-by: Dmitry Torokhov <dmitry.torokhov@gmail.com> # for input
Acked-by: Hans de Goede <hdegoede@redhat.com>
Acked-by: Jani Nikula <jani.nikula@intel.com>
Acked-by: Lee Jones <lee@kernel.org>
Signed-off-by: Sean Young <sean@mess.org>
Signed-off-by: Thierry Reding <thierry.reding@gmail.com>
---
 Documentation/driver-api/pwm.rst               |  8 ++++----
 MAINTAINERS                                    |  2 +-
 drivers/gpu/drm/i915/display/intel_backlight.c |  6 +++---
 drivers/gpu/drm/solomon/ssd130x.c              |  2 +-
 drivers/hwmon/pwm-fan.c                        |  8 ++++----
 drivers/input/misc/da7280.c                    |  4 ++--
 drivers/input/misc/pwm-beeper.c                |  4 ++--
 drivers/input/misc/pwm-vibra.c                 |  8 ++++----
 drivers/leds/leds-pwm.c                        |  2 +-
 drivers/leds/rgb/leds-pwm-multicolor.c         |  4 ++--
 drivers/media/rc/pwm-ir-tx.c                   |  4 ++--
 drivers/platform/x86/lenovo-yogabook.c         |  2 +-
 drivers/pwm/core.c                             | 18 ++++++++---------
 drivers/pwm/pwm-twl-led.c                      |  2 +-
 drivers/pwm/pwm-vt8500.c                       |  2 +-
 drivers/pwm/sysfs.c                            | 10 ++++-----
 drivers/regulator/pwm-regulator.c              |  4 ++--
 drivers/video/backlight/lm3630a_bl.c           |  2 +-
 drivers/video/backlight/lp855x_bl.c            |  2 +-
 drivers/video/backlight/pwm_bl.c               | 12 +++++------
 drivers/video/fbdev/ssd1307fb.c                |  2 +-
 include/linux/pwm.h                            | 28 +++++++++++++-------------
 22 files changed, 68 insertions(+), 68 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/driver-api/pwm.rst b/Documentation/driver-api/pwm.rst
index bb264490a87a..f1d8197c8c43 100644
--- a/Documentation/driver-api/pwm.rst
+++ b/Documentation/driver-api/pwm.rst
@@ -41,7 +41,7 @@ the getter, devm_pwm_get() and devm_fwnode_pwm_get(), also exist.
 
 After being requested, a PWM has to be configured using::
 
-	int pwm_apply_state(struct pwm_device *pwm, struct pwm_state *state);
+	int pwm_apply_might_sleep(struct pwm_device *pwm, struct pwm_state *state);
 
 This API controls both the PWM period/duty_cycle config and the
 enable/disable state.
@@ -57,13 +57,13 @@ If supported by the driver, the signal can be optimized, for example to improve
 EMI by phase shifting the individual channels of a chip.
 
 The pwm_config(), pwm_enable() and pwm_disable() functions are just wrappers
-around pwm_apply_state() and should not be used if the user wants to change
+around pwm_apply_might_sleep() and should not be used if the user wants to change
 several parameter at once. For example, if you see pwm_config() and
 pwm_{enable,disable}() calls in the same function, this probably means you
-should switch to pwm_apply_state().
+should switch to pwm_apply_might_sleep().
 
 The PWM user API also allows one to query the PWM state that was passed to the
-last invocation of pwm_apply_state() using pwm_get_state(). Note this is
+last invocation of pwm_apply_might_sleep() using pwm_get_state(). Note this is
 different to what the driver has actually implemented if the request cannot be
 satisfied exactly with the hardware in use. There is currently no way for
 consumers to get the actually implemented settings.
diff --git a/MAINTAINERS b/MAINTAINERS
index 97f51d5ec1cf..c58480595220 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -17576,7 +17576,7 @@ F:	drivers/video/backlight/pwm_bl.c
 F:	include/dt-bindings/pwm/
 F:	include/linux/pwm.h
 F:	include/linux/pwm_backlight.h
-K:	pwm_(config|apply_state|ops)
+K:	pwm_(config|apply_might_sleep|ops)
 
 PXA GPIO DRIVER
 M:	Robert Jarzmik <robert.jarzmik@free.fr>
diff --git a/drivers/gpu/drm/i915/display/intel_backlight.c b/drivers/gpu/drm/i915/display/intel_backlight.c
index 2e8f17c04522..ff9b9918b0a1 100644
--- a/drivers/gpu/drm/i915/display/intel_backlight.c
+++ b/drivers/gpu/drm/i915/display/intel_backlight.c
@@ -274,7 +274,7 @@ static void ext_pwm_set_backlight(const struct drm_connector_state *conn_state,
 	struct intel_panel *panel = &to_intel_connector(conn_state->connector)->panel;
 
 	pwm_set_relative_duty_cycle(&panel->backlight.pwm_state, level, 100);
-	pwm_apply_state(panel->backlight.pwm, &panel->backlight.pwm_state);
+	pwm_apply_might_sleep(panel->backlight.pwm, &panel->backlight.pwm_state);
 }
 
 static void
@@ -427,7 +427,7 @@ static void ext_pwm_disable_backlight(const struct drm_connector_state *old_conn
 	intel_backlight_set_pwm_level(old_conn_state, level);
 
 	panel->backlight.pwm_state.enabled = false;
-	pwm_apply_state(panel->backlight.pwm, &panel->backlight.pwm_state);
+	pwm_apply_might_sleep(panel->backlight.pwm, &panel->backlight.pwm_state);
 }
 
 void intel_backlight_disable(const struct drm_connector_state *old_conn_state)
@@ -749,7 +749,7 @@ static void ext_pwm_enable_backlight(const struct intel_crtc_state *crtc_state,
 
 	pwm_set_relative_duty_cycle(&panel->backlight.pwm_state, level, 100);
 	panel->backlight.pwm_state.enabled = true;
-	pwm_apply_state(panel->backlight.pwm, &panel->backlight.pwm_state);
+	pwm_apply_might_sleep(panel->backlight.pwm, &panel->backlight.pwm_state);
 }
 
 static void __intel_backlight_enable(const struct intel_crtc_state *crtc_state,
diff --git a/drivers/gpu/drm/solomon/ssd130x.c b/drivers/gpu/drm/solomon/ssd130x.c
index e0174f82e353..cce043a4a1dc 100644
--- a/drivers/gpu/drm/solomon/ssd130x.c
+++ b/drivers/gpu/drm/solomon/ssd130x.c
@@ -319,7 +319,7 @@ static int ssd130x_pwm_enable(struct ssd130x_device *ssd130x)
 
 	pwm_init_state(ssd130x->pwm, &pwmstate);
 	pwm_set_relative_duty_cycle(&pwmstate, 50, 100);
-	pwm_apply_state(ssd130x->pwm, &pwmstate);
+	pwm_apply_might_sleep(ssd130x->pwm, &pwmstate);
 
 	/* Enable the PWM */
 	pwm_enable(ssd130x->pwm);
diff --git a/drivers/hwmon/pwm-fan.c b/drivers/hwmon/pwm-fan.c
index 6e4516c2ab89..b67bc9e833c0 100644
--- a/drivers/hwmon/pwm-fan.c
+++ b/drivers/hwmon/pwm-fan.c
@@ -151,7 +151,7 @@ static int pwm_fan_power_on(struct pwm_fan_ctx *ctx)
 	}
 
 	state->enabled = true;
-	ret = pwm_apply_state(ctx->pwm, state);
+	ret = pwm_apply_might_sleep(ctx->pwm, state);
 	if (ret) {
 		dev_err(ctx->dev, "failed to enable PWM\n");
 		goto disable_regulator;
@@ -181,7 +181,7 @@ static int pwm_fan_power_off(struct pwm_fan_ctx *ctx)
 
 	state->enabled = false;
 	state->duty_cycle = 0;
-	ret = pwm_apply_state(ctx->pwm, state);
+	ret = pwm_apply_might_sleep(ctx->pwm, state);
 	if (ret) {
 		dev_err(ctx->dev, "failed to disable PWM\n");
 		return ret;
@@ -207,7 +207,7 @@ static int  __set_pwm(struct pwm_fan_ctx *ctx, unsigned long pwm)
 
 		period = state->period;
 		state->duty_cycle = DIV_ROUND_UP(pwm * (period - 1), MAX_PWM);
-		ret = pwm_apply_state(ctx->pwm, state);
+		ret = pwm_apply_might_sleep(ctx->pwm, state);
 		if (ret)
 			return ret;
 		ret = pwm_fan_power_on(ctx);
@@ -278,7 +278,7 @@ static int pwm_fan_update_enable(struct pwm_fan_ctx *ctx, long val)
 						    state,
 						    &enable_regulator);
 
-			pwm_apply_state(ctx->pwm, state);
+			pwm_apply_might_sleep(ctx->pwm, state);
 			pwm_fan_switch_power(ctx, enable_regulator);
 			pwm_fan_update_state(ctx, 0);
 		}
diff --git a/drivers/input/misc/da7280.c b/drivers/input/misc/da7280.c
index ce82548916bb..c1fa75c0f970 100644
--- a/drivers/input/misc/da7280.c
+++ b/drivers/input/misc/da7280.c
@@ -352,7 +352,7 @@ static int da7280_haptic_set_pwm(struct da7280_haptic *haptics, bool enabled)
 		state.duty_cycle = period_mag_multi;
 	}
 
-	error = pwm_apply_state(haptics->pwm_dev, &state);
+	error = pwm_apply_might_sleep(haptics->pwm_dev, &state);
 	if (error)
 		dev_err(haptics->dev, "Failed to apply pwm state: %d\n", error);
 
@@ -1175,7 +1175,7 @@ static int da7280_probe(struct i2c_client *client)
 		/* Sync up PWM state and ensure it is off. */
 		pwm_init_state(haptics->pwm_dev, &state);
 		state.enabled = false;
-		error = pwm_apply_state(haptics->pwm_dev, &state);
+		error = pwm_apply_might_sleep(haptics->pwm_dev, &state);
 		if (error) {
 			dev_err(dev, "Failed to apply PWM state: %d\n", error);
 			return error;
diff --git a/drivers/input/misc/pwm-beeper.c b/drivers/input/misc/pwm-beeper.c
index 1e731d8397c6..5b9aedf4362f 100644
--- a/drivers/input/misc/pwm-beeper.c
+++ b/drivers/input/misc/pwm-beeper.c
@@ -39,7 +39,7 @@ static int pwm_beeper_on(struct pwm_beeper *beeper, unsigned long period)
 	state.period = period;
 	pwm_set_relative_duty_cycle(&state, 50, 100);
 
-	error = pwm_apply_state(beeper->pwm, &state);
+	error = pwm_apply_might_sleep(beeper->pwm, &state);
 	if (error)
 		return error;
 
@@ -138,7 +138,7 @@ static int pwm_beeper_probe(struct platform_device *pdev)
 	/* Sync up PWM state and ensure it is off. */
 	pwm_init_state(beeper->pwm, &state);
 	state.enabled = false;
-	error = pwm_apply_state(beeper->pwm, &state);
+	error = pwm_apply_might_sleep(beeper->pwm, &state);
 	if (error) {
 		dev_err(dev, "failed to apply initial PWM state: %d\n",
 			error);
diff --git a/drivers/input/misc/pwm-vibra.c b/drivers/input/misc/pwm-vibra.c
index acac79c488aa..3e5ed685ed8f 100644
--- a/drivers/input/misc/pwm-vibra.c
+++ b/drivers/input/misc/pwm-vibra.c
@@ -56,7 +56,7 @@ static int pwm_vibrator_start(struct pwm_vibrator *vibrator)
 	pwm_set_relative_duty_cycle(&state, vibrator->level, 0xffff);
 	state.enabled = true;
 
-	err = pwm_apply_state(vibrator->pwm, &state);
+	err = pwm_apply_might_sleep(vibrator->pwm, &state);
 	if (err) {
 		dev_err(pdev, "failed to apply pwm state: %d\n", err);
 		return err;
@@ -67,7 +67,7 @@ static int pwm_vibrator_start(struct pwm_vibrator *vibrator)
 		state.duty_cycle = vibrator->direction_duty_cycle;
 		state.enabled = true;
 
-		err = pwm_apply_state(vibrator->pwm_dir, &state);
+		err = pwm_apply_might_sleep(vibrator->pwm_dir, &state);
 		if (err) {
 			dev_err(pdev, "failed to apply dir-pwm state: %d\n", err);
 			pwm_disable(vibrator->pwm);
@@ -160,7 +160,7 @@ static int pwm_vibrator_probe(struct platform_device *pdev)
 	/* Sync up PWM state and ensure it is off. */
 	pwm_init_state(vibrator->pwm, &state);
 	state.enabled = false;
-	err = pwm_apply_state(vibrator->pwm, &state);
+	err = pwm_apply_might_sleep(vibrator->pwm, &state);
 	if (err) {
 		dev_err(&pdev->dev, "failed to apply initial PWM state: %d\n",
 			err);
@@ -174,7 +174,7 @@ static int pwm_vibrator_probe(struct platform_device *pdev)
 		/* Sync up PWM state and ensure it is off. */
 		pwm_init_state(vibrator->pwm_dir, &state);
 		state.enabled = false;
-		err = pwm_apply_state(vibrator->pwm_dir, &state);
+		err = pwm_apply_might_sleep(vibrator->pwm_dir, &state);
 		if (err) {
 			dev_err(&pdev->dev, "failed to apply initial PWM state: %d\n",
 				err);
diff --git a/drivers/leds/leds-pwm.c b/drivers/leds/leds-pwm.c
index 2b3bf1353b70..4e3936a39d0e 100644
--- a/drivers/leds/leds-pwm.c
+++ b/drivers/leds/leds-pwm.c
@@ -54,7 +54,7 @@ static int led_pwm_set(struct led_classdev *led_cdev,
 
 	led_dat->pwmstate.duty_cycle = duty;
 	led_dat->pwmstate.enabled = true;
-	return pwm_apply_state(led_dat->pwm, &led_dat->pwmstate);
+	return pwm_apply_might_sleep(led_dat->pwm, &led_dat->pwmstate);
 }
 
 __attribute__((nonnull))
diff --git a/drivers/leds/rgb/leds-pwm-multicolor.c b/drivers/leds/rgb/leds-pwm-multicolor.c
index 46cd062b8b24..e1a81e0109e8 100644
--- a/drivers/leds/rgb/leds-pwm-multicolor.c
+++ b/drivers/leds/rgb/leds-pwm-multicolor.c
@@ -51,8 +51,8 @@ static int led_pwm_mc_set(struct led_classdev *cdev,
 
 		priv->leds[i].state.duty_cycle = duty;
 		priv->leds[i].state.enabled = duty > 0;
-		ret = pwm_apply_state(priv->leds[i].pwm,
-				      &priv->leds[i].state);
+		ret = pwm_apply_might_sleep(priv->leds[i].pwm,
+					    &priv->leds[i].state);
 		if (ret)
 			break;
 	}
diff --git a/drivers/media/rc/pwm-ir-tx.c b/drivers/media/rc/pwm-ir-tx.c
index c5f37c03af9c..cf51e2760975 100644
--- a/drivers/media/rc/pwm-ir-tx.c
+++ b/drivers/media/rc/pwm-ir-tx.c
@@ -68,7 +68,7 @@ static int pwm_ir_tx(struct rc_dev *dev, unsigned int *txbuf,
 
 	for (i = 0; i < count; i++) {
 		state.enabled = !(i % 2);
-		pwm_apply_state(pwm, &state);
+		pwm_apply_might_sleep(pwm, &state);
 
 		edge = ktime_add_us(edge, txbuf[i]);
 		delta = ktime_us_delta(edge, ktime_get());
@@ -77,7 +77,7 @@ static int pwm_ir_tx(struct rc_dev *dev, unsigned int *txbuf,
 	}
 
 	state.enabled = false;
-	pwm_apply_state(pwm, &state);
+	pwm_apply_might_sleep(pwm, &state);
 
 	return count;
 }
diff --git a/drivers/platform/x86/lenovo-yogabook.c b/drivers/platform/x86/lenovo-yogabook.c
index b8d0239192cb..fd62bf746ebd 100644
--- a/drivers/platform/x86/lenovo-yogabook.c
+++ b/drivers/platform/x86/lenovo-yogabook.c
@@ -435,7 +435,7 @@ static int yogabook_pdev_set_kbd_backlight(struct yogabook_data *data, u8 level)
 		.enabled = level,
 	};
 
-	pwm_apply_state(data->kbd_bl_pwm, &state);
+	pwm_apply_might_sleep(data->kbd_bl_pwm, &state);
 	gpiod_set_value(data->kbd_bl_led_enable, level ? 1 : 0);
 	return 0;
 }
diff --git a/drivers/pwm/core.c b/drivers/pwm/core.c
index b0e50ca9398a..c6228843a1a7 100644
--- a/drivers/pwm/core.c
+++ b/drivers/pwm/core.c
@@ -326,8 +326,8 @@ struct pwm_device *pwm_request_from_chip(struct pwm_chip *chip,
 }
 EXPORT_SYMBOL_GPL(pwm_request_from_chip);
 
-static void pwm_apply_state_debug(struct pwm_device *pwm,
-				  const struct pwm_state *state)
+static void pwm_apply_debug(struct pwm_device *pwm,
+			    const struct pwm_state *state)
 {
 	struct pwm_state *last = &pwm->last;
 	struct pwm_chip *chip = pwm->chip;
@@ -433,11 +433,11 @@ static void pwm_apply_state_debug(struct pwm_device *pwm,
 }
 
 /**
- * pwm_apply_state() - atomically apply a new state to a PWM device
+ * pwm_apply_might_sleep() - atomically apply a new state to a PWM device
  * @pwm: PWM device
  * @state: new state to apply
  */
-int pwm_apply_state(struct pwm_device *pwm, const struct pwm_state *state)
+int pwm_apply_might_sleep(struct pwm_device *pwm, const struct pwm_state *state)
 {
 	struct pwm_chip *chip;
 	int err;
@@ -445,7 +445,7 @@ int pwm_apply_state(struct pwm_device *pwm, const struct pwm_state *state)
 	/*
 	 * Some lowlevel driver's implementations of .apply() make use of
 	 * mutexes, also with some drivers only returning when the new
-	 * configuration is active calling pwm_apply_state() from atomic context
+	 * configuration is active calling pwm_apply_might_sleep() from atomic context
 	 * is a bad idea. So make it explicit that calling this function might
 	 * sleep.
 	 */
@@ -475,11 +475,11 @@ int pwm_apply_state(struct pwm_device *pwm, const struct pwm_state *state)
 	 * only do this after pwm->state was applied as some
 	 * implementations of .get_state depend on this
 	 */
-	pwm_apply_state_debug(pwm, state);
+	pwm_apply_debug(pwm, state);
 
 	return 0;
 }
-EXPORT_SYMBOL_GPL(pwm_apply_state);
+EXPORT_SYMBOL_GPL(pwm_apply_might_sleep);
 
 /**
  * pwm_capture() - capture and report a PWM signal
@@ -537,7 +537,7 @@ int pwm_adjust_config(struct pwm_device *pwm)
 		state.period = pargs.period;
 		state.polarity = pargs.polarity;
 
-		return pwm_apply_state(pwm, &state);
+		return pwm_apply_might_sleep(pwm, &state);
 	}
 
 	/*
@@ -560,7 +560,7 @@ int pwm_adjust_config(struct pwm_device *pwm)
 		state.duty_cycle = state.period - state.duty_cycle;
 	}
 
-	return pwm_apply_state(pwm, &state);
+	return pwm_apply_might_sleep(pwm, &state);
 }
 EXPORT_SYMBOL_GPL(pwm_adjust_config);
 
diff --git a/drivers/pwm/pwm-twl-led.c b/drivers/pwm/pwm-twl-led.c
index 8a870d0db3c6..c670ccb81653 100644
--- a/drivers/pwm/pwm-twl-led.c
+++ b/drivers/pwm/pwm-twl-led.c
@@ -172,7 +172,7 @@ static int twl4030_pwmled_apply(struct pwm_chip *chip, struct pwm_device *pwm,
 	 * We cannot skip calling ->config even if state->period ==
 	 * pwm->state.period && state->duty_cycle == pwm->state.duty_cycle
 	 * because we might have exited early in the last call to
-	 * pwm_apply_state because of !state->enabled and so the two values in
+	 * pwm_apply_might_sleep because of !state->enabled and so the two values in
 	 * pwm->state might not be configured in hardware.
 	 */
 	ret = twl4030_pwmled_config(chip, pwm,
diff --git a/drivers/pwm/pwm-vt8500.c b/drivers/pwm/pwm-vt8500.c
index bdea60389487..7bfeacee05d0 100644
--- a/drivers/pwm/pwm-vt8500.c
+++ b/drivers/pwm/pwm-vt8500.c
@@ -206,7 +206,7 @@ static int vt8500_pwm_apply(struct pwm_chip *chip, struct pwm_device *pwm,
 	 * We cannot skip calling ->config even if state->period ==
 	 * pwm->state.period && state->duty_cycle == pwm->state.duty_cycle
 	 * because we might have exited early in the last call to
-	 * pwm_apply_state because of !state->enabled and so the two values in
+	 * pwm_apply_might_sleep because of !state->enabled and so the two values in
 	 * pwm->state might not be configured in hardware.
 	 */
 	err = vt8500_pwm_config(chip, pwm, state->duty_cycle, state->period);
diff --git a/drivers/pwm/sysfs.c b/drivers/pwm/sysfs.c
index 4edb994fa2e1..1698609d91c8 100644
--- a/drivers/pwm/sysfs.c
+++ b/drivers/pwm/sysfs.c
@@ -62,7 +62,7 @@ static ssize_t period_store(struct device *child,
 	mutex_lock(&export->lock);
 	pwm_get_state(pwm, &state);
 	state.period = val;
-	ret = pwm_apply_state(pwm, &state);
+	ret = pwm_apply_might_sleep(pwm, &state);
 	mutex_unlock(&export->lock);
 
 	return ret ? : size;
@@ -97,7 +97,7 @@ static ssize_t duty_cycle_store(struct device *child,
 	mutex_lock(&export->lock);
 	pwm_get_state(pwm, &state);
 	state.duty_cycle = val;
-	ret = pwm_apply_state(pwm, &state);
+	ret = pwm_apply_might_sleep(pwm, &state);
 	mutex_unlock(&export->lock);
 
 	return ret ? : size;
@@ -144,7 +144,7 @@ static ssize_t enable_store(struct device *child,
 		goto unlock;
 	}
 
-	ret = pwm_apply_state(pwm, &state);
+	ret = pwm_apply_might_sleep(pwm, &state);
 
 unlock:
 	mutex_unlock(&export->lock);
@@ -194,7 +194,7 @@ static ssize_t polarity_store(struct device *child,
 	mutex_lock(&export->lock);
 	pwm_get_state(pwm, &state);
 	state.polarity = polarity;
-	ret = pwm_apply_state(pwm, &state);
+	ret = pwm_apply_might_sleep(pwm, &state);
 	mutex_unlock(&export->lock);
 
 	return ret ? : size;
@@ -401,7 +401,7 @@ static int pwm_class_apply_state(struct pwm_export *export,
 				 struct pwm_device *pwm,
 				 struct pwm_state *state)
 {
-	int ret = pwm_apply_state(pwm, state);
+	int ret = pwm_apply_might_sleep(pwm, state);
 
 	/* release lock taken in pwm_class_get_state */
 	mutex_unlock(&export->lock);
diff --git a/drivers/regulator/pwm-regulator.c b/drivers/regulator/pwm-regulator.c
index 2aff6db748e2..698c420e0869 100644
--- a/drivers/regulator/pwm-regulator.c
+++ b/drivers/regulator/pwm-regulator.c
@@ -90,7 +90,7 @@ static int pwm_regulator_set_voltage_sel(struct regulator_dev *rdev,
 	pwm_set_relative_duty_cycle(&pstate,
 			drvdata->duty_cycle_table[selector].dutycycle, 100);
 
-	ret = pwm_apply_state(drvdata->pwm, &pstate);
+	ret = pwm_apply_might_sleep(drvdata->pwm, &pstate);
 	if (ret) {
 		dev_err(&rdev->dev, "Failed to configure PWM: %d\n", ret);
 		return ret;
@@ -216,7 +216,7 @@ static int pwm_regulator_set_voltage(struct regulator_dev *rdev,
 
 	pwm_set_relative_duty_cycle(&pstate, dutycycle, duty_unit);
 
-	ret = pwm_apply_state(drvdata->pwm, &pstate);
+	ret = pwm_apply_might_sleep(drvdata->pwm, &pstate);
 	if (ret) {
 		dev_err(&rdev->dev, "Failed to configure PWM: %d\n", ret);
 		return ret;
diff --git a/drivers/video/backlight/lm3630a_bl.c b/drivers/video/backlight/lm3630a_bl.c
index 8fcb62be597b..a3412c936ca2 100644
--- a/drivers/video/backlight/lm3630a_bl.c
+++ b/drivers/video/backlight/lm3630a_bl.c
@@ -180,7 +180,7 @@ static int lm3630a_pwm_ctrl(struct lm3630a_chip *pchip, int br, int br_max)
 
 	pchip->pwmd_state.enabled = pchip->pwmd_state.duty_cycle ? true : false;
 
-	return pwm_apply_state(pchip->pwmd, &pchip->pwmd_state);
+	return pwm_apply_might_sleep(pchip->pwmd, &pchip->pwmd_state);
 }
 
 /* update and get brightness */
diff --git a/drivers/video/backlight/lp855x_bl.c b/drivers/video/backlight/lp855x_bl.c
index da1f124db69c..7075bfab59c4 100644
--- a/drivers/video/backlight/lp855x_bl.c
+++ b/drivers/video/backlight/lp855x_bl.c
@@ -234,7 +234,7 @@ static int lp855x_pwm_ctrl(struct lp855x *lp, int br, int max_br)
 	state.duty_cycle = div_u64(br * state.period, max_br);
 	state.enabled = state.duty_cycle;
 
-	return pwm_apply_state(lp->pwm, &state);
+	return pwm_apply_might_sleep(lp->pwm, &state);
 }
 
 static int lp855x_bl_update_status(struct backlight_device *bl)
diff --git a/drivers/video/backlight/pwm_bl.c b/drivers/video/backlight/pwm_bl.c
index 289bd9ce4d36..35c716e9043c 100644
--- a/drivers/video/backlight/pwm_bl.c
+++ b/drivers/video/backlight/pwm_bl.c
@@ -103,7 +103,7 @@ static int pwm_backlight_update_status(struct backlight_device *bl)
 		pwm_get_state(pb->pwm, &state);
 		state.duty_cycle = compute_duty_cycle(pb, brightness, &state);
 		state.enabled = true;
-		pwm_apply_state(pb->pwm, &state);
+		pwm_apply_might_sleep(pb->pwm, &state);
 
 		pwm_backlight_power_on(pb);
 	} else {
@@ -120,7 +120,7 @@ static int pwm_backlight_update_status(struct backlight_device *bl)
 		 * inactive output.
 		 */
 		state.enabled = !pb->power_supply && !pb->enable_gpio;
-		pwm_apply_state(pb->pwm, &state);
+		pwm_apply_might_sleep(pb->pwm, &state);
 	}
 
 	if (pb->notify_after)
@@ -528,7 +528,7 @@ static int pwm_backlight_probe(struct platform_device *pdev)
 	if (!state.period && (data->pwm_period_ns > 0))
 		state.period = data->pwm_period_ns;
 
-	ret = pwm_apply_state(pb->pwm, &state);
+	ret = pwm_apply_might_sleep(pb->pwm, &state);
 	if (ret) {
 		dev_err(&pdev->dev, "failed to apply initial PWM state: %d\n",
 			ret);
@@ -633,7 +633,7 @@ static void pwm_backlight_remove(struct platform_device *pdev)
 	pwm_get_state(pb->pwm, &state);
 	state.duty_cycle = 0;
 	state.enabled = false;
-	pwm_apply_state(pb->pwm, &state);
+	pwm_apply_might_sleep(pb->pwm, &state);
 
 	if (pb->exit)
 		pb->exit(&pdev->dev);
@@ -649,7 +649,7 @@ static void pwm_backlight_shutdown(struct platform_device *pdev)
 	pwm_get_state(pb->pwm, &state);
 	state.duty_cycle = 0;
 	state.enabled = false;
-	pwm_apply_state(pb->pwm, &state);
+	pwm_apply_might_sleep(pb->pwm, &state);
 }
 
 #ifdef CONFIG_PM_SLEEP
@@ -673,7 +673,7 @@ static int pwm_backlight_suspend(struct device *dev)
 	pwm_get_state(pb->pwm, &state);
 	state.duty_cycle = 0;
 	state.enabled = false;
-	pwm_apply_state(pb->pwm, &state);
+	pwm_apply_might_sleep(pb->pwm, &state);
 
 	if (pb->notify_after)
 		pb->notify_after(pb->dev, 0);
diff --git a/drivers/video/fbdev/ssd1307fb.c b/drivers/video/fbdev/ssd1307fb.c
index 5ae48e36fccb..1a4f90ea7d5a 100644
--- a/drivers/video/fbdev/ssd1307fb.c
+++ b/drivers/video/fbdev/ssd1307fb.c
@@ -347,7 +347,7 @@ static int ssd1307fb_init(struct ssd1307fb_par *par)
 
 		pwm_init_state(par->pwm, &pwmstate);
 		pwm_set_relative_duty_cycle(&pwmstate, 50, 100);
-		pwm_apply_state(par->pwm, &pwmstate);
+		pwm_apply_might_sleep(par->pwm, &pwmstate);
 
 		/* Enable the PWM */
 		pwm_enable(par->pwm);
diff --git a/include/linux/pwm.h b/include/linux/pwm.h
index f87655c06c82..b64b8a82415c 100644
--- a/include/linux/pwm.h
+++ b/include/linux/pwm.h
@@ -92,8 +92,8 @@ struct pwm_device {
  * @state: state to fill with the current PWM state
  *
  * The returned PWM state represents the state that was applied by a previous call to
- * pwm_apply_state(). Drivers may have to slightly tweak that state before programming it to
- * hardware. If pwm_apply_state() was never called, this returns either the current hardware
+ * pwm_apply_might_sleep(). Drivers may have to slightly tweak that state before programming it to
+ * hardware. If pwm_apply_might_sleep() was never called, this returns either the current hardware
  * state (if supported) or the default settings.
  */
 static inline void pwm_get_state(const struct pwm_device *pwm,
@@ -157,20 +157,20 @@ static inline void pwm_get_args(const struct pwm_device *pwm,
 }
 
 /**
- * pwm_init_state() - prepare a new state to be applied with pwm_apply_state()
+ * pwm_init_state() - prepare a new state to be applied with pwm_apply_might_sleep()
  * @pwm: PWM device
  * @state: state to fill with the prepared PWM state
  *
  * This functions prepares a state that can later be tweaked and applied
- * to the PWM device with pwm_apply_state(). This is a convenient function
+ * to the PWM device with pwm_apply_might_sleep(). This is a convenient function
  * that first retrieves the current PWM state and the replaces the period
  * and polarity fields with the reference values defined in pwm->args.
  * Once the function returns, you can adjust the ->enabled and ->duty_cycle
- * fields according to your needs before calling pwm_apply_state().
+ * fields according to your needs before calling pwm_apply_might_sleep().
  *
  * ->duty_cycle is initially set to zero to avoid cases where the current
  * ->duty_cycle value exceed the pwm_args->period one, which would trigger
- * an error if the user calls pwm_apply_state() without adjusting ->duty_cycle
+ * an error if the user calls pwm_apply_might_sleep() without adjusting ->duty_cycle
  * first.
  */
 static inline void pwm_init_state(const struct pwm_device *pwm,
@@ -226,7 +226,7 @@ pwm_get_relative_duty_cycle(const struct pwm_state *state, unsigned int scale)
  *
  * pwm_init_state(pwm, &state);
  * pwm_set_relative_duty_cycle(&state, 50, 100);
- * pwm_apply_state(pwm, &state);
+ * pwm_apply_might_sleep(pwm, &state);
  *
  * This functions returns -EINVAL if @duty_cycle and/or @scale are
  * inconsistent (@scale == 0 or @duty_cycle > @scale).
@@ -304,7 +304,7 @@ struct pwm_chip {
 
 #if IS_ENABLED(CONFIG_PWM)
 /* PWM user APIs */
-int pwm_apply_state(struct pwm_device *pwm, const struct pwm_state *state);
+int pwm_apply_might_sleep(struct pwm_device *pwm, const struct pwm_state *state);
 int pwm_adjust_config(struct pwm_device *pwm);
 
 /**
@@ -332,7 +332,7 @@ static inline int pwm_config(struct pwm_device *pwm, int duty_ns,
 
 	state.duty_cycle = duty_ns;
 	state.period = period_ns;
-	return pwm_apply_state(pwm, &state);
+	return pwm_apply_might_sleep(pwm, &state);
 }
 
 /**
@@ -353,7 +353,7 @@ static inline int pwm_enable(struct pwm_device *pwm)
 		return 0;
 
 	state.enabled = true;
-	return pwm_apply_state(pwm, &state);
+	return pwm_apply_might_sleep(pwm, &state);
 }
 
 /**
@@ -372,7 +372,7 @@ static inline void pwm_disable(struct pwm_device *pwm)
 		return;
 
 	state.enabled = false;
-	pwm_apply_state(pwm, &state);
+	pwm_apply_might_sleep(pwm, &state);
 }
 
 /* PWM provider APIs */
@@ -403,8 +403,8 @@ struct pwm_device *devm_fwnode_pwm_get(struct device *dev,
 				       struct fwnode_handle *fwnode,
 				       const char *con_id);
 #else
-static inline int pwm_apply_state(struct pwm_device *pwm,
-				  const struct pwm_state *state)
+static inline int pwm_apply_might_sleep(struct pwm_device *pwm,
+					const struct pwm_state *state)
 {
 	might_sleep();
 	return -ENOTSUPP;
@@ -521,7 +521,7 @@ static inline void pwm_apply_args(struct pwm_device *pwm)
 	state.period = pwm->args.period;
 	state.usage_power = false;
 
-	pwm_apply_state(pwm, &state);
+	pwm_apply_might_sleep(pwm, &state);
 }
 
 struct pwm_lookup {
-- 
cgit v1.2.3


From dc518b378dced419baa95d76a85f4c8c405722bc Mon Sep 17 00:00:00 2001
From: Sean Young <sean@mess.org>
Date: Tue, 19 Dec 2023 16:30:25 +0000
Subject: pwm: Replace ENOTSUPP with EOPNOTSUPP
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

According to Documentation/dev-tools/checkpatch.rst ENOTSUPP is
not recommended and EOPNOTSUPP should be used instead.

Signed-off-by: Sean Young <sean@mess.org>
Acked-by: Uwe Kleine-König <u.kleine-koenig@pengutronix.de>
Signed-off-by: Thierry Reding <thierry.reding@gmail.com>
---
 include/linux/pwm.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/pwm.h b/include/linux/pwm.h
index b64b8a82415c..c9cb87b59ac8 100644
--- a/include/linux/pwm.h
+++ b/include/linux/pwm.h
@@ -407,12 +407,12 @@ static inline int pwm_apply_might_sleep(struct pwm_device *pwm,
 					const struct pwm_state *state)
 {
 	might_sleep();
-	return -ENOTSUPP;
+	return -EOPNOTSUPP;
 }
 
 static inline int pwm_adjust_config(struct pwm_device *pwm)
 {
-	return -ENOTSUPP;
+	return -EOPNOTSUPP;
 }
 
 static inline int pwm_config(struct pwm_device *pwm, int duty_ns,
-- 
cgit v1.2.3


From 7170d3beafc2373dd76b6b5d6e617d89e4e42b8b Mon Sep 17 00:00:00 2001
From: Sean Young <sean@mess.org>
Date: Tue, 19 Dec 2023 16:30:27 +0000
Subject: pwm: Make it possible to apply PWM changes in atomic context
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Some PWM devices require sleeping, for example if the pwm device is
connected over I2C. However, many PWM devices could be used from atomic
context, e.g. memory mapped PWM. This is useful for, for example, the
pwm-ir-tx driver which requires precise timing. Sleeping causes havoc
with the generated IR signal.

Since not all PWM devices can support atomic context, we also add a
pwm_might_sleep() function to check if is not supported.

Signed-off-by: Sean Young <sean@mess.org>
Reviewed-by: Uwe Kleine-König <u.kleine-koenig@pengutronix.de>
Signed-off-by: Thierry Reding <thierry.reding@gmail.com>
---
 Documentation/driver-api/pwm.rst |  9 ++++++
 MAINTAINERS                      |  2 +-
 drivers/pwm/core.c               | 62 +++++++++++++++++++++++++++++++++-------
 include/linux/pwm.h              | 25 ++++++++++++++++
 4 files changed, 86 insertions(+), 12 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/driver-api/pwm.rst b/Documentation/driver-api/pwm.rst
index f1d8197c8c43..3c28ccc4b611 100644
--- a/Documentation/driver-api/pwm.rst
+++ b/Documentation/driver-api/pwm.rst
@@ -46,6 +46,15 @@ After being requested, a PWM has to be configured using::
 This API controls both the PWM period/duty_cycle config and the
 enable/disable state.
 
+PWM devices can be used from atomic context, if the PWM does not sleep. You
+can check if this the case with::
+
+        bool pwm_might_sleep(struct pwm_device *pwm);
+
+If false, the PWM can also be configured from atomic context with::
+
+	int pwm_apply_atomic(struct pwm_device *pwm, struct pwm_state *state);
+
 As a consumer, don't rely on the output's state for a disabled PWM. If it's
 easily possible, drivers are supposed to emit the inactive state, but some
 drivers cannot. If you rely on getting the inactive state, use .duty_cycle=0,
diff --git a/MAINTAINERS b/MAINTAINERS
index c58480595220..5342cf32d73f 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -17576,7 +17576,7 @@ F:	drivers/video/backlight/pwm_bl.c
 F:	include/dt-bindings/pwm/
 F:	include/linux/pwm.h
 F:	include/linux/pwm_backlight.h
-K:	pwm_(config|apply_might_sleep|ops)
+K:	pwm_(config|apply_might_sleep|apply_atomic|ops)
 
 PXA GPIO DRIVER
 M:	Robert Jarzmik <robert.jarzmik@free.fr>
diff --git a/drivers/pwm/core.c b/drivers/pwm/core.c
index c6228843a1a7..f1ded8ce5ea4 100644
--- a/drivers/pwm/core.c
+++ b/drivers/pwm/core.c
@@ -433,24 +433,15 @@ static void pwm_apply_debug(struct pwm_device *pwm,
 }
 
 /**
- * pwm_apply_might_sleep() - atomically apply a new state to a PWM device
+ * __pwm_apply() - atomically apply a new state to a PWM device
  * @pwm: PWM device
  * @state: new state to apply
  */
-int pwm_apply_might_sleep(struct pwm_device *pwm, const struct pwm_state *state)
+static int __pwm_apply(struct pwm_device *pwm, const struct pwm_state *state)
 {
 	struct pwm_chip *chip;
 	int err;
 
-	/*
-	 * Some lowlevel driver's implementations of .apply() make use of
-	 * mutexes, also with some drivers only returning when the new
-	 * configuration is active calling pwm_apply_might_sleep() from atomic context
-	 * is a bad idea. So make it explicit that calling this function might
-	 * sleep.
-	 */
-	might_sleep();
-
 	if (!pwm || !state || !state->period ||
 	    state->duty_cycle > state->period)
 		return -EINVAL;
@@ -479,8 +470,57 @@ int pwm_apply_might_sleep(struct pwm_device *pwm, const struct pwm_state *state)
 
 	return 0;
 }
+
+/**
+ * pwm_apply_might_sleep() - atomically apply a new state to a PWM device
+ * Cannot be used in atomic context.
+ * @pwm: PWM device
+ * @state: new state to apply
+ */
+int pwm_apply_might_sleep(struct pwm_device *pwm, const struct pwm_state *state)
+{
+	int err;
+
+	/*
+	 * Some lowlevel driver's implementations of .apply() make use of
+	 * mutexes, also with some drivers only returning when the new
+	 * configuration is active calling pwm_apply_might_sleep() from atomic context
+	 * is a bad idea. So make it explicit that calling this function might
+	 * sleep.
+	 */
+	might_sleep();
+
+	if (IS_ENABLED(CONFIG_PWM_DEBUG) && pwm->chip->atomic) {
+		/*
+		 * Catch any drivers that have been marked as atomic but
+		 * that will sleep anyway.
+		 */
+		non_block_start();
+		err = __pwm_apply(pwm, state);
+		non_block_end();
+	} else {
+		err = __pwm_apply(pwm, state);
+	}
+
+	return err;
+}
 EXPORT_SYMBOL_GPL(pwm_apply_might_sleep);
 
+/**
+ * pwm_apply_atomic() - apply a new state to a PWM device from atomic context
+ * Not all PWM devices support this function, check with pwm_might_sleep().
+ * @pwm: PWM device
+ * @state: new state to apply
+ */
+int pwm_apply_atomic(struct pwm_device *pwm, const struct pwm_state *state)
+{
+	WARN_ONCE(!pwm->chip->atomic,
+		  "sleeping PWM driver used in atomic context\n");
+
+	return __pwm_apply(pwm, state);
+}
+EXPORT_SYMBOL_GPL(pwm_apply_atomic);
+
 /**
  * pwm_capture() - capture and report a PWM signal
  * @pwm: PWM device
diff --git a/include/linux/pwm.h b/include/linux/pwm.h
index c9cb87b59ac8..495af3627939 100644
--- a/include/linux/pwm.h
+++ b/include/linux/pwm.h
@@ -285,6 +285,7 @@ struct pwm_ops {
  * @npwm: number of PWMs controlled by this chip
  * @of_xlate: request a PWM device given a device tree PWM specifier
  * @of_pwm_n_cells: number of cells expected in the device tree PWM specifier
+ * @atomic: can the driver's ->apply() be called in atomic context
  * @pwms: array of PWM devices allocated by the framework
  */
 struct pwm_chip {
@@ -297,6 +298,7 @@ struct pwm_chip {
 	struct pwm_device * (*of_xlate)(struct pwm_chip *chip,
 					const struct of_phandle_args *args);
 	unsigned int of_pwm_n_cells;
+	bool atomic;
 
 	/* only used internally by the PWM framework */
 	struct pwm_device *pwms;
@@ -305,6 +307,7 @@ struct pwm_chip {
 #if IS_ENABLED(CONFIG_PWM)
 /* PWM user APIs */
 int pwm_apply_might_sleep(struct pwm_device *pwm, const struct pwm_state *state);
+int pwm_apply_atomic(struct pwm_device *pwm, const struct pwm_state *state);
 int pwm_adjust_config(struct pwm_device *pwm);
 
 /**
@@ -375,6 +378,17 @@ static inline void pwm_disable(struct pwm_device *pwm)
 	pwm_apply_might_sleep(pwm, &state);
 }
 
+/**
+ * pwm_might_sleep() - is pwm_apply_atomic() supported?
+ * @pwm: PWM device
+ *
+ * Returns: false if pwm_apply_atomic() can be called from atomic context.
+ */
+static inline bool pwm_might_sleep(struct pwm_device *pwm)
+{
+	return !pwm->chip->atomic;
+}
+
 /* PWM provider APIs */
 int pwm_capture(struct pwm_device *pwm, struct pwm_capture *result,
 		unsigned long timeout);
@@ -403,6 +417,11 @@ struct pwm_device *devm_fwnode_pwm_get(struct device *dev,
 				       struct fwnode_handle *fwnode,
 				       const char *con_id);
 #else
+static inline bool pwm_might_sleep(struct pwm_device *pwm)
+{
+	return true;
+}
+
 static inline int pwm_apply_might_sleep(struct pwm_device *pwm,
 					const struct pwm_state *state)
 {
@@ -410,6 +429,12 @@ static inline int pwm_apply_might_sleep(struct pwm_device *pwm,
 	return -EOPNOTSUPP;
 }
 
+static inline int pwm_apply_atomic(struct pwm_device *pwm,
+				   const struct pwm_state *state)
+{
+	return -EOPNOTSUPP;
+}
+
 static inline int pwm_adjust_config(struct pwm_device *pwm)
 {
 	return -EOPNOTSUPP;
-- 
cgit v1.2.3


From fe22944cf05ede8e6f841cfecdb7093a53a3d9b3 Mon Sep 17 00:00:00 2001
From: xiaoming Wang <xiaoming.wang@intel.com>
Date: Tue, 19 Dec 2023 11:34:11 +0800
Subject: cpu/hotplug: Increase the number of dynamic states

The dynamically allocatable hotplug state space can be exhausted by
the existing drivers and infrastructure which install CPU hotplug
states dynamically. That prevents new drivers and infrastructure from
installing dynamically allocated states.

Increase the size of the CPUHP_AP_ONLINE_DYN state by 10 to make
room.

Signed-off-by: Xiaoming Wang <xiaoming.wang@intel.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Link: https://lore.kernel.org/r/20231219033411.816100-1-xiaoming.wang@intel.com
---
 include/linux/cpuhotplug.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/cpuhotplug.h b/include/linux/cpuhotplug.h
index af6c21aab985..8bd454dfe453 100644
--- a/include/linux/cpuhotplug.h
+++ b/include/linux/cpuhotplug.h
@@ -239,7 +239,7 @@ enum cpuhp_state {
 	CPUHP_AP_RCUTREE_ONLINE,
 	CPUHP_AP_BASE_CACHEINFO_ONLINE,
 	CPUHP_AP_ONLINE_DYN,
-	CPUHP_AP_ONLINE_DYN_END		= CPUHP_AP_ONLINE_DYN + 30,
+	CPUHP_AP_ONLINE_DYN_END		= CPUHP_AP_ONLINE_DYN + 40,
 	CPUHP_AP_X86_HPET_ONLINE,
 	CPUHP_AP_X86_KVM_CLK_ONLINE,
 	CPUHP_AP_ACTIVE,
-- 
cgit v1.2.3


From 80e4a9987999e682366d60f43a7b2adefc48e222 Mon Sep 17 00:00:00 2001
From: Uwe Kleine-König <u.kleine-koenig@pengutronix.de>
Date: Sun, 10 Dec 2023 00:00:47 +0100
Subject: pwm: Drop two unused API functions
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

These functions are unused. Also I think there is no valid use case
where these are correct to be called. So drop them.

Signed-off-by: Uwe Kleine-König <u.kleine-koenig@pengutronix.de>
Signed-off-by: Thierry Reding <thierry.reding@gmail.com>
---
 include/linux/pwm.h | 12 ------------
 1 file changed, 12 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/pwm.h b/include/linux/pwm.h
index 495af3627939..5dd665d8c909 100644
--- a/include/linux/pwm.h
+++ b/include/linux/pwm.h
@@ -111,12 +111,6 @@ static inline bool pwm_is_enabled(const struct pwm_device *pwm)
 	return state.enabled;
 }
 
-static inline void pwm_set_period(struct pwm_device *pwm, u64 period)
-{
-	if (pwm)
-		pwm->state.period = period;
-}
-
 static inline u64 pwm_get_period(const struct pwm_device *pwm)
 {
 	struct pwm_state state;
@@ -126,12 +120,6 @@ static inline u64 pwm_get_period(const struct pwm_device *pwm)
 	return state.period;
 }
 
-static inline void pwm_set_duty_cycle(struct pwm_device *pwm, unsigned int duty)
-{
-	if (pwm)
-		pwm->state.duty_cycle = duty;
-}
-
 static inline u64 pwm_get_duty_cycle(const struct pwm_device *pwm)
 {
 	struct pwm_state state;
-- 
cgit v1.2.3


From eba2591d99d1f14a04c8a8a845ab0795b93f5646 Mon Sep 17 00:00:00 2001
From: Alexandre Ghiti <alexghiti@rivosinc.com>
Date: Wed, 13 Dec 2023 21:29:59 +0100
Subject: mm: Introduce pudp/p4dp/pgdp_get() functions

Instead of directly dereferencing page tables entries, which can cause
issues (see commit 20a004e7b017 ("arm64: mm: Use READ_ONCE/WRITE_ONCE when
accessing page tables"), let's introduce new functions to get the
pud/p4d/pgd entries (the pte and pmd versions already exist).

Note that arm pgd_t is actually an array so pgdp_get() is defined as a
macro to avoid a build error.

Those new functions will be used in subsequent commits by the riscv
architecture.

Signed-off-by: Alexandre Ghiti <alexghiti@rivosinc.com>
Link: https://lore.kernel.org/r/20231213203001.179237-3-alexghiti@rivosinc.com
Signed-off-by: Palmer Dabbelt <palmer@rivosinc.com>
---
 arch/arm/include/asm/pgtable.h |  2 ++
 include/linux/pgtable.h        | 21 +++++++++++++++++++++
 2 files changed, 23 insertions(+)

(limited to 'include/linux')

diff --git a/arch/arm/include/asm/pgtable.h b/arch/arm/include/asm/pgtable.h
index 16b02f44c7d3..d657b84b6bf7 100644
--- a/arch/arm/include/asm/pgtable.h
+++ b/arch/arm/include/asm/pgtable.h
@@ -151,6 +151,8 @@ extern pgprot_t phys_mem_access_prot(struct file *file, unsigned long pfn,
 
 extern pgd_t swapper_pg_dir[PTRS_PER_PGD];
 
+#define pgdp_get(pgpd)		READ_ONCE(*pgdp)
+
 #define pud_page(pud)		pmd_page(__pmd(pud_val(pud)))
 #define pud_write(pud)		pmd_write(__pmd(pud_val(pud)))
 
diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h
index af7639c3b0a3..8b7daccd11be 100644
--- a/include/linux/pgtable.h
+++ b/include/linux/pgtable.h
@@ -292,6 +292,27 @@ static inline pmd_t pmdp_get(pmd_t *pmdp)
 }
 #endif
 
+#ifndef pudp_get
+static inline pud_t pudp_get(pud_t *pudp)
+{
+	return READ_ONCE(*pudp);
+}
+#endif
+
+#ifndef p4dp_get
+static inline p4d_t p4dp_get(p4d_t *p4dp)
+{
+	return READ_ONCE(*p4dp);
+}
+#endif
+
+#ifndef pgdp_get
+static inline pgd_t pgdp_get(pgd_t *pgdp)
+{
+	return READ_ONCE(*pgdp);
+}
+#endif
+
 #ifndef __HAVE_ARCH_PTEP_TEST_AND_CLEAR_YOUNG
 static inline int ptep_test_and_clear_young(struct vm_area_struct *vma,
 					    unsigned long address,
-- 
cgit v1.2.3


From 7d7ef0a4686abe43cd76a141b340a348f45ecdf2 Mon Sep 17 00:00:00 2001
From: Yosry Ahmed <yosryahmed@google.com>
Date: Wed, 29 Nov 2023 03:21:53 +0000
Subject: mm: memcg: restore subtree stats flushing

Stats flushing for memcg currently follows the following rules:
- Always flush the entire memcg hierarchy (i.e. flush the root).
- Only one flusher is allowed at a time. If someone else tries to flush
  concurrently, they skip and return immediately.
- A periodic flusher flushes all the stats every 2 seconds.

The reason this approach is followed is because all flushes are serialized
by a global rstat spinlock.  On the memcg side, flushing is invoked from
userspace reads as well as in-kernel flushers (e.g.  reclaim, refault,
etc).  This approach aims to avoid serializing all flushers on the global
lock, which can cause a significant performance hit under high
concurrency.

This approach has the following problems:
- Occasionally a userspace read of the stats of a non-root cgroup will
  be too expensive as it has to flush the entire hierarchy [1].
- Sometimes the stats accuracy are compromised if there is an ongoing
  flush, and we skip and return before the subtree of interest is
  actually flushed, yielding stale stats (by up to 2s due to periodic
  flushing). This is more visible when reading stats from userspace,
  but can also affect in-kernel flushers.

The latter problem is particulary a concern when userspace reads stats
after an event occurs, but gets stats from before the event. Examples:
- When memory usage / pressure spikes, a userspace OOM handler may look
  at the stats of different memcgs to select a victim based on various
  heuristics (e.g. how much private memory will be freed by killing
  this). Reading stale stats from before the usage spike in this case
  may cause a wrongful OOM kill.
- A proactive reclaimer may read the stats after writing to
  memory.reclaim to measure the success of the reclaim operation. Stale
  stats from before reclaim may give a false negative.
- Reading the stats of a parent and a child memcg may be inconsistent
  (child larger than parent), if the flush doesn't happen when the
  parent is read, but happens when the child is read.

As for in-kernel flushers, they will occasionally get stale stats.  No
regressions are currently known from this, but if there are regressions,
they would be very difficult to debug and link to the source of the
problem.

This patch aims to fix these problems by restoring subtree flushing, and
removing the unified/coalesced flushing logic that skips flushing if there
is an ongoing flush.  This change would introduce a significant regression
with global stats flushing thresholds.  With per-memcg stats flushing
thresholds, this seems to perform really well.  The thresholds protect the
underlying lock from unnecessary contention.

This patch was tested in two ways to ensure the latency of flushing is
up to par, on a machine with 384 cpus:

- A synthetic test with 5000 concurrent workers in 500 cgroups doing
  allocations and reclaim, as well as 1000 readers for memory.stat
  (variation of [2]). No regressions were noticed in the total runtime.
  Note that significant regressions in this test are observed with
  global stats thresholds, but not with per-memcg thresholds.

- A synthetic stress test for concurrently reading memcg stats while
  memory allocation/freeing workers are running in the background,
  provided by Wei Xu [3]. With 250k threads reading the stats every
  100ms in 50k cgroups, 99.9% of reads take <= 50us. Less than 0.01%
  of reads take more than 1ms, and no reads take more than 100ms.

[1] https://lore.kernel.org/lkml/CABWYdi0c6__rh-K7dcM_pkf9BJdTRtAU08M43KO9ME4-dsgfoQ@mail.gmail.com/
[2] https://lore.kernel.org/lkml/CAJD7tka13M-zVZTyQJYL1iUAYvuQ1fcHbCjcOBZcz6POYTV-4g@mail.gmail.com/
[3] https://lore.kernel.org/lkml/CAAPL-u9D2b=iF5Lf_cRnKxUfkiEe0AMDTu6yhrUAzX0b6a6rDg@mail.gmail.com/

[akpm@linux-foundation.org: fix mm/zswap.c]
[yosryahmed@google.com: remove stats flushing mutex]
  Link: https://lkml.kernel.org/r/CAJD7tkZgP3m-VVPn+fF_YuvXeQYK=tZZjJHj=dzD=CcSSpp2qg@mail.gmail.com
Link: https://lkml.kernel.org/r/20231129032154.3710765-6-yosryahmed@google.com
Signed-off-by: Yosry Ahmed <yosryahmed@google.com>
Tested-by: Domenico Cerasuolo <cerasuolodomenico@gmail.com>
Acked-by: Shakeel Butt <shakeelb@google.com>
Cc: Chris Li <chrisl@kernel.org>
Cc: Greg Thelen <gthelen@google.com>
Cc: Ivan Babrou <ivan@cloudflare.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Michal Koutny <mkoutny@suse.com>
Cc: Muchun Song <muchun.song@linux.dev>
Cc: Roman Gushchin <roman.gushchin@linux.dev>
Cc: Tejun Heo <tj@kernel.org>
Cc: Waiman Long <longman@redhat.com>
Cc: Wei Xu <weixugc@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/memcontrol.h |  8 +++---
 mm/memcontrol.c            | 68 ++++++++++++++++++++++++++--------------------
 mm/vmscan.c                |  2 +-
 mm/workingset.c            | 10 +++++--
 mm/zswap.c                 |  2 +-
 5 files changed, 52 insertions(+), 38 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index a308c8eacf20..43b77363ab8e 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -1051,8 +1051,8 @@ static inline unsigned long lruvec_page_state_local(struct lruvec *lruvec,
 	return x;
 }
 
-void mem_cgroup_flush_stats(void);
-void mem_cgroup_flush_stats_ratelimited(void);
+void mem_cgroup_flush_stats(struct mem_cgroup *memcg);
+void mem_cgroup_flush_stats_ratelimited(struct mem_cgroup *memcg);
 
 void __mod_memcg_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx,
 			      int val);
@@ -1563,11 +1563,11 @@ static inline unsigned long lruvec_page_state_local(struct lruvec *lruvec,
 	return node_page_state(lruvec_pgdat(lruvec), idx);
 }
 
-static inline void mem_cgroup_flush_stats(void)
+static inline void mem_cgroup_flush_stats(struct mem_cgroup *memcg)
 {
 }
 
-static inline void mem_cgroup_flush_stats_ratelimited(void)
+static inline void mem_cgroup_flush_stats_ratelimited(struct mem_cgroup *memcg)
 {
 }
 
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index c5aa0c2cb68b..b08b9cd4a3a8 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -670,7 +670,6 @@ struct memcg_vmstats {
  */
 static void flush_memcg_stats_dwork(struct work_struct *w);
 static DECLARE_DEFERRABLE_WORK(stats_flush_dwork, flush_memcg_stats_dwork);
-static atomic_t stats_flush_ongoing = ATOMIC_INIT(0);
 static u64 flush_last_time;
 
 #define FLUSH_TIME (2UL*HZ)
@@ -731,35 +730,40 @@ static inline void memcg_rstat_updated(struct mem_cgroup *memcg, int val)
 	}
 }
 
-static void do_flush_stats(void)
+static void do_flush_stats(struct mem_cgroup *memcg)
 {
-	/*
-	 * We always flush the entire tree, so concurrent flushers can just
-	 * skip. This avoids a thundering herd problem on the rstat global lock
-	 * from memcg flushers (e.g. reclaim, refault, etc).
-	 */
-	if (atomic_read(&stats_flush_ongoing) ||
-	    atomic_xchg(&stats_flush_ongoing, 1))
-		return;
-
-	WRITE_ONCE(flush_last_time, jiffies_64);
-
-	cgroup_rstat_flush(root_mem_cgroup->css.cgroup);
+	if (mem_cgroup_is_root(memcg))
+		WRITE_ONCE(flush_last_time, jiffies_64);
 
-	atomic_set(&stats_flush_ongoing, 0);
+	cgroup_rstat_flush(memcg->css.cgroup);
 }
 
-void mem_cgroup_flush_stats(void)
+/*
+ * mem_cgroup_flush_stats - flush the stats of a memory cgroup subtree
+ * @memcg: root of the subtree to flush
+ *
+ * Flushing is serialized by the underlying global rstat lock. There is also a
+ * minimum amount of work to be done even if there are no stat updates to flush.
+ * Hence, we only flush the stats if the updates delta exceeds a threshold. This
+ * avoids unnecessary work and contention on the underlying lock.
+ */
+void mem_cgroup_flush_stats(struct mem_cgroup *memcg)
 {
-	if (memcg_should_flush_stats(root_mem_cgroup))
-		do_flush_stats();
+	if (mem_cgroup_disabled())
+		return;
+
+	if (!memcg)
+		memcg = root_mem_cgroup;
+
+	if (memcg_should_flush_stats(memcg))
+		do_flush_stats(memcg);
 }
 
-void mem_cgroup_flush_stats_ratelimited(void)
+void mem_cgroup_flush_stats_ratelimited(struct mem_cgroup *memcg)
 {
 	/* Only flush if the periodic flusher is one full cycle late */
 	if (time_after64(jiffies_64, READ_ONCE(flush_last_time) + 2*FLUSH_TIME))
-		mem_cgroup_flush_stats();
+		mem_cgroup_flush_stats(memcg);
 }
 
 static void flush_memcg_stats_dwork(struct work_struct *w)
@@ -768,7 +772,7 @@ static void flush_memcg_stats_dwork(struct work_struct *w)
 	 * Deliberately ignore memcg_should_flush_stats() here so that flushing
 	 * in latency-sensitive paths is as cheap as possible.
 	 */
-	do_flush_stats();
+	do_flush_stats(root_mem_cgroup);
 	queue_delayed_work(system_unbound_wq, &stats_flush_dwork, FLUSH_TIME);
 }
 
@@ -1643,7 +1647,7 @@ static void memcg_stat_format(struct mem_cgroup *memcg, struct seq_buf *s)
 	 *
 	 * Current memory state:
 	 */
-	mem_cgroup_flush_stats();
+	mem_cgroup_flush_stats(memcg);
 
 	for (i = 0; i < ARRAY_SIZE(memory_stats); i++) {
 		u64 size;
@@ -4193,7 +4197,7 @@ static int memcg_numa_stat_show(struct seq_file *m, void *v)
 	int nid;
 	struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
 
-	mem_cgroup_flush_stats();
+	mem_cgroup_flush_stats(memcg);
 
 	for (stat = stats; stat < stats + ARRAY_SIZE(stats); stat++) {
 		seq_printf(m, "%s=%lu", stat->name,
@@ -4274,7 +4278,7 @@ static void memcg1_stat_format(struct mem_cgroup *memcg, struct seq_buf *s)
 
 	BUILD_BUG_ON(ARRAY_SIZE(memcg1_stat_names) != ARRAY_SIZE(memcg1_stats));
 
-	mem_cgroup_flush_stats();
+	mem_cgroup_flush_stats(memcg);
 
 	for (i = 0; i < ARRAY_SIZE(memcg1_stats); i++) {
 		unsigned long nr;
@@ -4770,7 +4774,7 @@ void mem_cgroup_wb_stats(struct bdi_writeback *wb, unsigned long *pfilepages,
 	struct mem_cgroup *memcg = mem_cgroup_from_css(wb->memcg_css);
 	struct mem_cgroup *parent;
 
-	mem_cgroup_flush_stats();
+	mem_cgroup_flush_stats(memcg);
 
 	*pdirty = memcg_page_state(memcg, NR_FILE_DIRTY);
 	*pwriteback = memcg_page_state(memcg, NR_WRITEBACK);
@@ -6865,7 +6869,7 @@ static int memory_numa_stat_show(struct seq_file *m, void *v)
 	int i;
 	struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
 
-	mem_cgroup_flush_stats();
+	mem_cgroup_flush_stats(memcg);
 
 	for (i = 0; i < ARRAY_SIZE(memory_stats); i++) {
 		int nid;
@@ -8096,7 +8100,11 @@ bool obj_cgroup_may_zswap(struct obj_cgroup *objcg)
 			break;
 		}
 
-		cgroup_rstat_flush(memcg->css.cgroup);
+		/*
+		 * mem_cgroup_flush_stats() ignores small changes. Use
+		 * do_flush_stats() directly to get accurate stats for charging.
+		 */
+		do_flush_stats(memcg);
 		pages = memcg_page_state(memcg, MEMCG_ZSWAP_B) / PAGE_SIZE;
 		if (pages < max)
 			continue;
@@ -8161,8 +8169,10 @@ void obj_cgroup_uncharge_zswap(struct obj_cgroup *objcg, size_t size)
 static u64 zswap_current_read(struct cgroup_subsys_state *css,
 			      struct cftype *cft)
 {
-	cgroup_rstat_flush(css->cgroup);
-	return memcg_page_state(mem_cgroup_from_css(css), MEMCG_ZSWAP_B);
+	struct mem_cgroup *memcg = mem_cgroup_from_css(css);
+
+	mem_cgroup_flush_stats(memcg);
+	return memcg_page_state(memcg, MEMCG_ZSWAP_B);
 }
 
 static int zswap_max_show(struct seq_file *m, void *v)
diff --git a/mm/vmscan.c b/mm/vmscan.c
index f0eba9ef3332..b4ca3563bcf4 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -2226,7 +2226,7 @@ static void prepare_scan_control(pg_data_t *pgdat, struct scan_control *sc)
 	 * Flush the memory cgroup stats, so that we read accurate per-memcg
 	 * lruvec stats for heuristics.
 	 */
-	mem_cgroup_flush_stats();
+	mem_cgroup_flush_stats(sc->target_mem_cgroup);
 
 	/*
 	 * Determine the scan balance between anon and file LRUs.
diff --git a/mm/workingset.c b/mm/workingset.c
index 6b9871f5a2e8..2a2a34234df9 100644
--- a/mm/workingset.c
+++ b/mm/workingset.c
@@ -464,8 +464,12 @@ bool workingset_test_recent(void *shadow, bool file, bool *workingset)
 
 	rcu_read_unlock();
 
-	/* Flush stats (and potentially sleep) outside the RCU read section */
-	mem_cgroup_flush_stats_ratelimited();
+	/*
+	 * Flush stats (and potentially sleep) outside the RCU read section.
+	 * XXX: With per-memcg flushing and thresholding, is ratelimiting
+	 * still needed here?
+	 */
+	mem_cgroup_flush_stats_ratelimited(eviction_memcg);
 
 	eviction_lruvec = mem_cgroup_lruvec(eviction_memcg, pgdat);
 	refault = atomic_long_read(&eviction_lruvec->nonresident_age);
@@ -676,7 +680,7 @@ static unsigned long count_shadow_nodes(struct shrinker *shrinker,
 		struct lruvec *lruvec;
 		int i;
 
-		mem_cgroup_flush_stats();
+		mem_cgroup_flush_stats(sc->memcg);
 		lruvec = mem_cgroup_lruvec(sc->memcg, NODE_DATA(sc->nid));
 		for (pages = 0, i = 0; i < NR_LRU_LISTS; i++)
 			pages += lruvec_page_state_local(lruvec,
diff --git a/mm/zswap.c b/mm/zswap.c
index 015425ed9003..ac31fec176e9 100644
--- a/mm/zswap.c
+++ b/mm/zswap.c
@@ -641,7 +641,7 @@ static unsigned long zswap_shrinker_count(struct shrinker *shrinker,
 		return 0;
 
 #ifdef CONFIG_MEMCG_KMEM
-	mem_cgroup_flush_stats();
+	mem_cgroup_flush_stats(memcg);
 	nr_backing = memcg_page_state(memcg, MEMCG_ZSWAP_B) >> PAGE_SHIFT;
 	nr_stored = memcg_page_state(memcg, MEMCG_ZSWAPPED);
 #else
-- 
cgit v1.2.3


From 3485b88390b0af9e05dc2c3f57e9936f41e159a0 Mon Sep 17 00:00:00 2001
From: Ryan Roberts <ryan.roberts@arm.com>
Date: Thu, 7 Dec 2023 16:12:04 +0000
Subject: mm: thp: introduce multi-size THP sysfs interface

In preparation for adding support for anonymous multi-size THP, introduce
new sysfs structure that will be used to control the new behaviours.  A
new directory is added under transparent_hugepage for each supported THP
size, and contains an `enabled` file, which can be set to "inherit" (to
inherit the global setting), "always", "madvise" or "never".  For now, the
kernel still only supports PMD-sized anonymous THP, so only 1 directory is
populated.

The first half of the change converts transhuge_vma_suitable() and
hugepage_vma_check() so that they take a bitfield of orders for which the
user wants to determine support, and the functions filter out all the
orders that can't be supported, given the current sysfs configuration and
the VMA dimensions.  The resulting functions are renamed to
thp_vma_suitable_orders() and thp_vma_allowable_orders() respectively.
Convenience functions that take a single, unencoded order and return a
boolean are also defined as thp_vma_suitable_order() and
thp_vma_allowable_order().

The second half of the change implements the new sysfs interface.  It has
been done so that each supported THP size has a `struct thpsize`, which
describes the relevant metadata and is itself a kobject.  This is pretty
minimal for now, but should make it easy to add new per-thpsize files to
the interface if needed in future (e.g.  per-size defrag).  Rather than
keep the `enabled` state directly in the struct thpsize, I've elected to
directly encode it into huge_anon_orders_[always|madvise|inherit]
bitfields since this reduces the amount of work required in
thp_vma_allowable_orders() which is called for every page fault.

See Documentation/admin-guide/mm/transhuge.rst, as modified by this
commit, for details of how the new sysfs interface works.

[ryan.roberts@arm.com: fix build warning when CONFIG_SYSFS is disabled]
  Link: https://lkml.kernel.org/r/20231211125320.3997543-1-ryan.roberts@arm.com
Link: https://lkml.kernel.org/r/20231207161211.2374093-4-ryan.roberts@arm.com
Signed-off-by: Ryan Roberts <ryan.roberts@arm.com>
Reviewed-by: Barry Song <v-songbaohua@oppo.com>
Tested-by: Kefeng Wang <wangkefeng.wang@huawei.com>
Tested-by: John Hubbard <jhubbard@nvidia.com>
Acked-by: David Hildenbrand <david@redhat.com>
Cc: Alistair Popple <apopple@nvidia.com>
Cc: Anshuman Khandual <anshuman.khandual@arm.com>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: David Rientjes <rientjes@google.com>
Cc: "Huang, Ying" <ying.huang@intel.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Itaru Kitayama <itaru.kitayama@gmail.com>
Cc: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Cc: Luis Chamberlain <mcgrof@kernel.org>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Yang Shi <shy828301@gmail.com>
Cc: Yin Fengwei <fengwei.yin@intel.com>
Cc: Yu Zhao <yuzhao@google.com>
Cc: Zi Yan <ziy@nvidia.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 Documentation/admin-guide/mm/transhuge.rst |  97 +++++++++---
 Documentation/filesystems/proc.rst         |   6 +-
 fs/proc/task_mmu.c                         |   3 +-
 include/linux/huge_mm.h                    | 181 ++++++++++++++++++----
 mm/huge_memory.c                           | 231 +++++++++++++++++++++++++----
 mm/khugepaged.c                            |  20 ++-
 mm/memory.c                                |   6 +-
 mm/page_vma_mapped.c                       |   3 +-
 8 files changed, 459 insertions(+), 88 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/admin-guide/mm/transhuge.rst b/Documentation/admin-guide/mm/transhuge.rst
index b0cc8243e093..04eb45a2f940 100644
--- a/Documentation/admin-guide/mm/transhuge.rst
+++ b/Documentation/admin-guide/mm/transhuge.rst
@@ -45,10 +45,25 @@ components:
    the two is using hugepages just because of the fact the TLB miss is
    going to run faster.
 
+Modern kernels support "multi-size THP" (mTHP), which introduces the
+ability to allocate memory in blocks that are bigger than a base page
+but smaller than traditional PMD-size (as described above), in
+increments of a power-of-2 number of pages. mTHP can back anonymous
+memory (for example 16K, 32K, 64K, etc). These THPs continue to be
+PTE-mapped, but in many cases can still provide similar benefits to
+those outlined above: Page faults are significantly reduced (by a
+factor of e.g. 4, 8, 16, etc), but latency spikes are much less
+prominent because the size of each page isn't as huge as the PMD-sized
+variant and there is less memory to clear in each page fault. Some
+architectures also employ TLB compression mechanisms to squeeze more
+entries in when a set of PTEs are virtually and physically contiguous
+and approporiately aligned. In this case, TLB misses will occur less
+often.
+
 THP can be enabled system wide or restricted to certain tasks or even
 memory ranges inside task's address space. Unless THP is completely
 disabled, there is ``khugepaged`` daemon that scans memory and
-collapses sequences of basic pages into huge pages.
+collapses sequences of basic pages into PMD-sized huge pages.
 
 The THP behaviour is controlled via :ref:`sysfs <thp_sysfs>`
 interface and using madvise(2) and prctl(2) system calls.
@@ -95,12 +110,40 @@ Global THP controls
 Transparent Hugepage Support for anonymous memory can be entirely disabled
 (mostly for debugging purposes) or only enabled inside MADV_HUGEPAGE
 regions (to avoid the risk of consuming more memory resources) or enabled
-system wide. This can be achieved with one of::
+system wide. This can be achieved per-supported-THP-size with one of::
+
+	echo always >/sys/kernel/mm/transparent_hugepage/hugepages-<size>kB/enabled
+	echo madvise >/sys/kernel/mm/transparent_hugepage/hugepages-<size>kB/enabled
+	echo never >/sys/kernel/mm/transparent_hugepage/hugepages-<size>kB/enabled
+
+where <size> is the hugepage size being addressed, the available sizes
+for which vary by system.
+
+For example::
+
+	echo always >/sys/kernel/mm/transparent_hugepage/hugepages-2048kB/enabled
+
+Alternatively it is possible to specify that a given hugepage size
+will inherit the top-level "enabled" value::
+
+	echo inherit >/sys/kernel/mm/transparent_hugepage/hugepages-<size>kB/enabled
+
+For example::
+
+	echo inherit >/sys/kernel/mm/transparent_hugepage/hugepages-2048kB/enabled
+
+The top-level setting (for use with "inherit") can be set by issuing
+one of the following commands::
 
 	echo always >/sys/kernel/mm/transparent_hugepage/enabled
 	echo madvise >/sys/kernel/mm/transparent_hugepage/enabled
 	echo never >/sys/kernel/mm/transparent_hugepage/enabled
 
+By default, PMD-sized hugepages have enabled="inherit" and all other
+hugepage sizes have enabled="never". If enabling multiple hugepage
+sizes, the kernel will select the most appropriate enabled size for a
+given allocation.
+
 It's also possible to limit defrag efforts in the VM to generate
 anonymous hugepages in case they're not immediately free to madvise
 regions or to never try to defrag memory and simply fallback to regular
@@ -146,25 +189,34 @@ madvise
 never
 	should be self-explanatory.
 
-By default kernel tries to use huge zero page on read page fault to
-anonymous mapping. It's possible to disable huge zero page by writing 0
-or enable it back by writing 1::
+By default kernel tries to use huge, PMD-mappable zero page on read
+page fault to anonymous mapping. It's possible to disable huge zero
+page by writing 0 or enable it back by writing 1::
 
 	echo 0 >/sys/kernel/mm/transparent_hugepage/use_zero_page
 	echo 1 >/sys/kernel/mm/transparent_hugepage/use_zero_page
 
-Some userspace (such as a test program, or an optimized memory allocation
-library) may want to know the size (in bytes) of a transparent hugepage::
+Some userspace (such as a test program, or an optimized memory
+allocation library) may want to know the size (in bytes) of a
+PMD-mappable transparent hugepage::
 
 	cat /sys/kernel/mm/transparent_hugepage/hpage_pmd_size
 
-khugepaged will be automatically started when
-transparent_hugepage/enabled is set to "always" or "madvise, and it'll
-be automatically shutdown if it's set to "never".
+khugepaged will be automatically started when one or more hugepage
+sizes are enabled (either by directly setting "always" or "madvise",
+or by setting "inherit" while the top-level enabled is set to "always"
+or "madvise"), and it'll be automatically shutdown when the last
+hugepage size is disabled (either by directly setting "never", or by
+setting "inherit" while the top-level enabled is set to "never").
 
 Khugepaged controls
 -------------------
 
+.. note::
+   khugepaged currently only searches for opportunities to collapse to
+   PMD-sized THP and no attempt is made to collapse to other THP
+   sizes.
+
 khugepaged runs usually at low frequency so while one may not want to
 invoke defrag algorithms synchronously during the page faults, it
 should be worth invoking defrag at least in khugepaged. However it's
@@ -282,19 +334,26 @@ force
 Need of application restart
 ===========================
 
-The transparent_hugepage/enabled values and tmpfs mount option only affect
-future behavior. So to make them effective you need to restart any
-application that could have been using hugepages. This also applies to the
-regions registered in khugepaged.
+The transparent_hugepage/enabled and
+transparent_hugepage/hugepages-<size>kB/enabled values and tmpfs mount
+option only affect future behavior. So to make them effective you need
+to restart any application that could have been using hugepages. This
+also applies to the regions registered in khugepaged.
 
 Monitoring usage
 ================
 
-The number of anonymous transparent huge pages currently used by the
+.. note::
+   Currently the below counters only record events relating to
+   PMD-sized THP. Events relating to other THP sizes are not included.
+
+The number of PMD-sized anonymous transparent huge pages currently used by the
 system is available by reading the AnonHugePages field in ``/proc/meminfo``.
-To identify what applications are using anonymous transparent huge pages,
-it is necessary to read ``/proc/PID/smaps`` and count the AnonHugePages fields
-for each mapping.
+To identify what applications are using PMD-sized anonymous transparent huge
+pages, it is necessary to read ``/proc/PID/smaps`` and count the AnonHugePages
+fields for each mapping. (Note that AnonHugePages only applies to traditional
+PMD-sized THP for historical reasons and should have been called
+AnonHugePmdMapped).
 
 The number of file transparent huge pages mapped to userspace is available
 by reading ShmemPmdMapped and ShmemHugePages fields in ``/proc/meminfo``.
@@ -413,7 +472,7 @@ for huge pages.
 Optimizing the applications
 ===========================
 
-To be guaranteed that the kernel will map a 2M page immediately in any
+To be guaranteed that the kernel will map a THP immediately in any
 memory region, the mmap region has to be hugepage naturally
 aligned. posix_memalign() can provide that guarantee.
 
diff --git a/Documentation/filesystems/proc.rst b/Documentation/filesystems/proc.rst
index 49ef12df631b..104c6d047d9b 100644
--- a/Documentation/filesystems/proc.rst
+++ b/Documentation/filesystems/proc.rst
@@ -528,9 +528,9 @@ replaced by copy-on-write) part of the underlying shmem object out on swap.
 does not take into account swapped out page of underlying shmem objects.
 "Locked" indicates whether the mapping is locked in memory or not.
 
-"THPeligible" indicates whether the mapping is eligible for allocating THP
-pages as well as the THP is PMD mappable or not - 1 if true, 0 otherwise.
-It just shows the current status.
+"THPeligible" indicates whether the mapping is eligible for allocating
+naturally aligned THP pages of any currently enabled size. 1 if true, 0
+otherwise.
 
 "VmFlags" field deserves a separate description. This member represents the
 kernel flags associated with the particular virtual memory area in two letter
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index d19924bf0a39..79855e1c5b57 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -865,7 +865,8 @@ static int show_smap(struct seq_file *m, void *v)
 	__show_smap(m, &mss, false);
 
 	seq_printf(m, "THPeligible:    %8u\n",
-		   hugepage_vma_check(vma, vma->vm_flags, true, false, true));
+		   !!thp_vma_allowable_orders(vma, vma->vm_flags, true, false,
+					      true, THP_ORDERS_ALL));
 
 	if (arch_pkeys_enabled())
 		seq_printf(m, "ProtectionKey:  %8u\n", vma_pkey(vma));
diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
index fa0350b0812a..609c153bae57 100644
--- a/include/linux/huge_mm.h
+++ b/include/linux/huge_mm.h
@@ -67,6 +67,24 @@ extern struct kobj_attribute shmem_enabled_attr;
 #define HPAGE_PMD_ORDER (HPAGE_PMD_SHIFT-PAGE_SHIFT)
 #define HPAGE_PMD_NR (1<<HPAGE_PMD_ORDER)
 
+/*
+ * Mask of all large folio orders supported for anonymous THP.
+ */
+#define THP_ORDERS_ALL_ANON	BIT(PMD_ORDER)
+
+/*
+ * Mask of all large folio orders supported for file THP.
+ */
+#define THP_ORDERS_ALL_FILE	(BIT(PMD_ORDER) | BIT(PUD_ORDER))
+
+/*
+ * Mask of all large folio orders supported for THP.
+ */
+#define THP_ORDERS_ALL		(THP_ORDERS_ALL_ANON | THP_ORDERS_ALL_FILE)
+
+#define thp_vma_allowable_order(vma, vm_flags, smaps, in_pf, enforce_sysfs, order) \
+	(!!thp_vma_allowable_orders(vma, vm_flags, smaps, in_pf, enforce_sysfs, BIT(order)))
+
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
 #define HPAGE_PMD_SHIFT PMD_SHIFT
 #define HPAGE_PMD_SIZE	((1UL) << HPAGE_PMD_SHIFT)
@@ -77,45 +95,105 @@ extern struct kobj_attribute shmem_enabled_attr;
 #define HPAGE_PUD_MASK	(~(HPAGE_PUD_SIZE - 1))
 
 extern unsigned long transparent_hugepage_flags;
+extern unsigned long huge_anon_orders_always;
+extern unsigned long huge_anon_orders_madvise;
+extern unsigned long huge_anon_orders_inherit;
 
-#define hugepage_flags_enabled()					       \
-	(transparent_hugepage_flags &				       \
-	 ((1<<TRANSPARENT_HUGEPAGE_FLAG) |		       \
-	  (1<<TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG)))
-#define hugepage_flags_always()				\
-	(transparent_hugepage_flags &			\
-	 (1<<TRANSPARENT_HUGEPAGE_FLAG))
+static inline bool hugepage_global_enabled(void)
+{
+	return transparent_hugepage_flags &
+			((1<<TRANSPARENT_HUGEPAGE_FLAG) |
+			(1<<TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG));
+}
+
+static inline bool hugepage_global_always(void)
+{
+	return transparent_hugepage_flags &
+			(1<<TRANSPARENT_HUGEPAGE_FLAG);
+}
+
+static inline bool hugepage_flags_enabled(void)
+{
+	/*
+	 * We cover both the anon and the file-backed case here; we must return
+	 * true if globally enabled, even when all anon sizes are set to never.
+	 * So we don't need to look at huge_anon_orders_inherit.
+	 */
+	return hugepage_global_enabled() ||
+	       huge_anon_orders_always ||
+	       huge_anon_orders_madvise;
+}
+
+static inline int highest_order(unsigned long orders)
+{
+	return fls_long(orders) - 1;
+}
+
+static inline int next_order(unsigned long *orders, int prev)
+{
+	*orders &= ~BIT(prev);
+	return highest_order(*orders);
+}
 
 /*
  * Do the below checks:
  *   - For file vma, check if the linear page offset of vma is
- *     HPAGE_PMD_NR aligned within the file.  The hugepage is
- *     guaranteed to be hugepage-aligned within the file, but we must
- *     check that the PMD-aligned addresses in the VMA map to
- *     PMD-aligned offsets within the file, else the hugepage will
- *     not be PMD-mappable.
- *   - For all vmas, check if the haddr is in an aligned HPAGE_PMD_SIZE
+ *     order-aligned within the file.  The hugepage is
+ *     guaranteed to be order-aligned within the file, but we must
+ *     check that the order-aligned addresses in the VMA map to
+ *     order-aligned offsets within the file, else the hugepage will
+ *     not be mappable.
+ *   - For all vmas, check if the haddr is in an aligned hugepage
  *     area.
  */
-static inline bool transhuge_vma_suitable(struct vm_area_struct *vma,
-		unsigned long addr)
+static inline bool thp_vma_suitable_order(struct vm_area_struct *vma,
+		unsigned long addr, int order)
 {
+	unsigned long hpage_size = PAGE_SIZE << order;
 	unsigned long haddr;
 
 	/* Don't have to check pgoff for anonymous vma */
 	if (!vma_is_anonymous(vma)) {
 		if (!IS_ALIGNED((vma->vm_start >> PAGE_SHIFT) - vma->vm_pgoff,
-				HPAGE_PMD_NR))
+				hpage_size >> PAGE_SHIFT))
 			return false;
 	}
 
-	haddr = addr & HPAGE_PMD_MASK;
+	haddr = ALIGN_DOWN(addr, hpage_size);
 
-	if (haddr < vma->vm_start || haddr + HPAGE_PMD_SIZE > vma->vm_end)
+	if (haddr < vma->vm_start || haddr + hpage_size > vma->vm_end)
 		return false;
 	return true;
 }
 
+/*
+ * Filter the bitfield of input orders to the ones suitable for use in the vma.
+ * See thp_vma_suitable_order().
+ * All orders that pass the checks are returned as a bitfield.
+ */
+static inline unsigned long thp_vma_suitable_orders(struct vm_area_struct *vma,
+		unsigned long addr, unsigned long orders)
+{
+	int order;
+
+	/*
+	 * Iterate over orders, highest to lowest, removing orders that don't
+	 * meet alignment requirements from the set. Exit loop at first order
+	 * that meets requirements, since all lower orders must also meet
+	 * requirements.
+	 */
+
+	order = highest_order(orders);
+
+	while (orders) {
+		if (thp_vma_suitable_order(vma, addr, order))
+			break;
+		order = next_order(&orders, order);
+	}
+
+	return orders;
+}
+
 static inline bool file_thp_enabled(struct vm_area_struct *vma)
 {
 	struct inode *inode;
@@ -130,8 +208,52 @@ static inline bool file_thp_enabled(struct vm_area_struct *vma)
 	       !inode_is_open_for_write(inode) && S_ISREG(inode->i_mode);
 }
 
-bool hugepage_vma_check(struct vm_area_struct *vma, unsigned long vm_flags,
-			bool smaps, bool in_pf, bool enforce_sysfs);
+unsigned long __thp_vma_allowable_orders(struct vm_area_struct *vma,
+					 unsigned long vm_flags, bool smaps,
+					 bool in_pf, bool enforce_sysfs,
+					 unsigned long orders);
+
+/**
+ * thp_vma_allowable_orders - determine hugepage orders that are allowed for vma
+ * @vma:  the vm area to check
+ * @vm_flags: use these vm_flags instead of vma->vm_flags
+ * @smaps: whether answer will be used for smaps file
+ * @in_pf: whether answer will be used by page fault handler
+ * @enforce_sysfs: whether sysfs config should be taken into account
+ * @orders: bitfield of all orders to consider
+ *
+ * Calculates the intersection of the requested hugepage orders and the allowed
+ * hugepage orders for the provided vma. Permitted orders are encoded as a set
+ * bit at the corresponding bit position (bit-2 corresponds to order-2, bit-3
+ * corresponds to order-3, etc). Order-0 is never considered a hugepage order.
+ *
+ * Return: bitfield of orders allowed for hugepage in the vma. 0 if no hugepage
+ * orders are allowed.
+ */
+static inline
+unsigned long thp_vma_allowable_orders(struct vm_area_struct *vma,
+				       unsigned long vm_flags, bool smaps,
+				       bool in_pf, bool enforce_sysfs,
+				       unsigned long orders)
+{
+	/* Optimization to check if required orders are enabled early. */
+	if (enforce_sysfs && vma_is_anonymous(vma)) {
+		unsigned long mask = READ_ONCE(huge_anon_orders_always);
+
+		if (vm_flags & VM_HUGEPAGE)
+			mask |= READ_ONCE(huge_anon_orders_madvise);
+		if (hugepage_global_always() ||
+		    ((vm_flags & VM_HUGEPAGE) && hugepage_global_enabled()))
+			mask |= READ_ONCE(huge_anon_orders_inherit);
+
+		orders &= mask;
+		if (!orders)
+			return 0;
+	}
+
+	return __thp_vma_allowable_orders(vma, vm_flags, smaps, in_pf,
+					  enforce_sysfs, orders);
+}
 
 #define transparent_hugepage_use_zero_page()				\
 	(transparent_hugepage_flags &					\
@@ -267,17 +389,24 @@ static inline bool folio_test_pmd_mappable(struct folio *folio)
 	return false;
 }
 
-static inline bool transhuge_vma_suitable(struct vm_area_struct *vma,
-		unsigned long addr)
+static inline bool thp_vma_suitable_order(struct vm_area_struct *vma,
+		unsigned long addr, int order)
 {
 	return false;
 }
 
-static inline bool hugepage_vma_check(struct vm_area_struct *vma,
-				      unsigned long vm_flags, bool smaps,
-				      bool in_pf, bool enforce_sysfs)
+static inline unsigned long thp_vma_suitable_orders(struct vm_area_struct *vma,
+		unsigned long addr, unsigned long orders)
 {
-	return false;
+	return 0;
+}
+
+static inline unsigned long thp_vma_allowable_orders(struct vm_area_struct *vma,
+					unsigned long vm_flags, bool smaps,
+					bool in_pf, bool enforce_sysfs,
+					unsigned long orders)
+{
+	return 0;
 }
 
 static inline void folio_prep_large_rmappable(struct folio *folio) {}
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index c848ea97ab02..387b030c7f15 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -74,12 +74,23 @@ static unsigned long deferred_split_scan(struct shrinker *shrink,
 static atomic_t huge_zero_refcount;
 struct page *huge_zero_page __read_mostly;
 unsigned long huge_zero_pfn __read_mostly = ~0UL;
+unsigned long huge_anon_orders_always __read_mostly;
+unsigned long huge_anon_orders_madvise __read_mostly;
+unsigned long huge_anon_orders_inherit __read_mostly;
+
+unsigned long __thp_vma_allowable_orders(struct vm_area_struct *vma,
+					 unsigned long vm_flags, bool smaps,
+					 bool in_pf, bool enforce_sysfs,
+					 unsigned long orders)
+{
+	/* Check the intersection of requested and supported orders. */
+	orders &= vma_is_anonymous(vma) ?
+			THP_ORDERS_ALL_ANON : THP_ORDERS_ALL_FILE;
+	if (!orders)
+		return 0;
 
-bool hugepage_vma_check(struct vm_area_struct *vma, unsigned long vm_flags,
-			bool smaps, bool in_pf, bool enforce_sysfs)
-{
 	if (!vma->vm_mm)		/* vdso */
-		return false;
+		return 0;
 
 	/*
 	 * Explicitly disabled through madvise or prctl, or some
@@ -88,16 +99,16 @@ bool hugepage_vma_check(struct vm_area_struct *vma, unsigned long vm_flags,
 	 * */
 	if ((vm_flags & VM_NOHUGEPAGE) ||
 	    test_bit(MMF_DISABLE_THP, &vma->vm_mm->flags))
-		return false;
+		return 0;
 	/*
 	 * If the hardware/firmware marked hugepage support disabled.
 	 */
 	if (transparent_hugepage_flags & (1 << TRANSPARENT_HUGEPAGE_UNSUPPORTED))
-		return false;
+		return 0;
 
 	/* khugepaged doesn't collapse DAX vma, but page fault is fine. */
 	if (vma_is_dax(vma))
-		return in_pf;
+		return in_pf ? orders : 0;
 
 	/*
 	 * khugepaged special VMA and hugetlb VMA.
@@ -105,17 +116,29 @@ bool hugepage_vma_check(struct vm_area_struct *vma, unsigned long vm_flags,
 	 * VM_MIXEDMAP set.
 	 */
 	if (!in_pf && !smaps && (vm_flags & VM_NO_KHUGEPAGED))
-		return false;
+		return 0;
 
 	/*
-	 * Check alignment for file vma and size for both file and anon vma.
+	 * Check alignment for file vma and size for both file and anon vma by
+	 * filtering out the unsuitable orders.
 	 *
 	 * Skip the check for page fault. Huge fault does the check in fault
-	 * handlers. And this check is not suitable for huge PUD fault.
+	 * handlers.
 	 */
-	if (!in_pf &&
-	    !transhuge_vma_suitable(vma, (vma->vm_end - HPAGE_PMD_SIZE)))
-		return false;
+	if (!in_pf) {
+		int order = highest_order(orders);
+		unsigned long addr;
+
+		while (orders) {
+			addr = vma->vm_end - (PAGE_SIZE << order);
+			if (thp_vma_suitable_order(vma, addr, order))
+				break;
+			order = next_order(&orders, order);
+		}
+
+		if (!orders)
+			return 0;
+	}
 
 	/*
 	 * Enabled via shmem mount options or sysfs settings.
@@ -124,29 +147,33 @@ bool hugepage_vma_check(struct vm_area_struct *vma, unsigned long vm_flags,
 	 */
 	if (!in_pf && shmem_file(vma->vm_file))
 		return shmem_is_huge(file_inode(vma->vm_file), vma->vm_pgoff,
-				     !enforce_sysfs, vma->vm_mm, vm_flags);
-
-	/* Enforce sysfs THP requirements as necessary */
-	if (enforce_sysfs &&
-	    (!hugepage_flags_enabled() || (!(vm_flags & VM_HUGEPAGE) &&
-					   !hugepage_flags_always())))
-		return false;
+				     !enforce_sysfs, vma->vm_mm, vm_flags)
+			? orders : 0;
 
 	if (!vma_is_anonymous(vma)) {
+		/*
+		 * Enforce sysfs THP requirements as necessary. Anonymous vmas
+		 * were already handled in thp_vma_allowable_orders().
+		 */
+		if (enforce_sysfs &&
+		    (!hugepage_global_enabled() || (!(vm_flags & VM_HUGEPAGE) &&
+						    !hugepage_global_always())))
+			return 0;
+
 		/*
 		 * Trust that ->huge_fault() handlers know what they are doing
 		 * in fault path.
 		 */
 		if (((in_pf || smaps)) && vma->vm_ops->huge_fault)
-			return true;
+			return orders;
 		/* Only regular file is valid in collapse path */
 		if (((!in_pf || smaps)) && file_thp_enabled(vma))
-			return true;
-		return false;
+			return orders;
+		return 0;
 	}
 
 	if (vma_is_temporary_stack(vma))
-		return false;
+		return 0;
 
 	/*
 	 * THPeligible bit of smaps should show 1 for proper VMAs even
@@ -156,9 +183,9 @@ bool hugepage_vma_check(struct vm_area_struct *vma, unsigned long vm_flags,
 	 * the first page fault.
 	 */
 	if (!vma->anon_vma)
-		return (smaps || in_pf);
+		return (smaps || in_pf) ? orders : 0;
 
-	return true;
+	return orders;
 }
 
 static bool get_huge_zero_page(void)
@@ -412,9 +439,136 @@ static const struct attribute_group hugepage_attr_group = {
 	.attrs = hugepage_attr,
 };
 
+static void hugepage_exit_sysfs(struct kobject *hugepage_kobj);
+static void thpsize_release(struct kobject *kobj);
+static DEFINE_SPINLOCK(huge_anon_orders_lock);
+static LIST_HEAD(thpsize_list);
+
+struct thpsize {
+	struct kobject kobj;
+	struct list_head node;
+	int order;
+};
+
+#define to_thpsize(kobj) container_of(kobj, struct thpsize, kobj)
+
+static ssize_t thpsize_enabled_show(struct kobject *kobj,
+				    struct kobj_attribute *attr, char *buf)
+{
+	int order = to_thpsize(kobj)->order;
+	const char *output;
+
+	if (test_bit(order, &huge_anon_orders_always))
+		output = "[always] inherit madvise never";
+	else if (test_bit(order, &huge_anon_orders_inherit))
+		output = "always [inherit] madvise never";
+	else if (test_bit(order, &huge_anon_orders_madvise))
+		output = "always inherit [madvise] never";
+	else
+		output = "always inherit madvise [never]";
+
+	return sysfs_emit(buf, "%s\n", output);
+}
+
+static ssize_t thpsize_enabled_store(struct kobject *kobj,
+				     struct kobj_attribute *attr,
+				     const char *buf, size_t count)
+{
+	int order = to_thpsize(kobj)->order;
+	ssize_t ret = count;
+
+	if (sysfs_streq(buf, "always")) {
+		spin_lock(&huge_anon_orders_lock);
+		clear_bit(order, &huge_anon_orders_inherit);
+		clear_bit(order, &huge_anon_orders_madvise);
+		set_bit(order, &huge_anon_orders_always);
+		spin_unlock(&huge_anon_orders_lock);
+	} else if (sysfs_streq(buf, "inherit")) {
+		spin_lock(&huge_anon_orders_lock);
+		clear_bit(order, &huge_anon_orders_always);
+		clear_bit(order, &huge_anon_orders_madvise);
+		set_bit(order, &huge_anon_orders_inherit);
+		spin_unlock(&huge_anon_orders_lock);
+	} else if (sysfs_streq(buf, "madvise")) {
+		spin_lock(&huge_anon_orders_lock);
+		clear_bit(order, &huge_anon_orders_always);
+		clear_bit(order, &huge_anon_orders_inherit);
+		set_bit(order, &huge_anon_orders_madvise);
+		spin_unlock(&huge_anon_orders_lock);
+	} else if (sysfs_streq(buf, "never")) {
+		spin_lock(&huge_anon_orders_lock);
+		clear_bit(order, &huge_anon_orders_always);
+		clear_bit(order, &huge_anon_orders_inherit);
+		clear_bit(order, &huge_anon_orders_madvise);
+		spin_unlock(&huge_anon_orders_lock);
+	} else
+		ret = -EINVAL;
+
+	return ret;
+}
+
+static struct kobj_attribute thpsize_enabled_attr =
+	__ATTR(enabled, 0644, thpsize_enabled_show, thpsize_enabled_store);
+
+static struct attribute *thpsize_attrs[] = {
+	&thpsize_enabled_attr.attr,
+	NULL,
+};
+
+static const struct attribute_group thpsize_attr_group = {
+	.attrs = thpsize_attrs,
+};
+
+static const struct kobj_type thpsize_ktype = {
+	.release = &thpsize_release,
+	.sysfs_ops = &kobj_sysfs_ops,
+};
+
+static struct thpsize *thpsize_create(int order, struct kobject *parent)
+{
+	unsigned long size = (PAGE_SIZE << order) / SZ_1K;
+	struct thpsize *thpsize;
+	int ret;
+
+	thpsize = kzalloc(sizeof(*thpsize), GFP_KERNEL);
+	if (!thpsize)
+		return ERR_PTR(-ENOMEM);
+
+	ret = kobject_init_and_add(&thpsize->kobj, &thpsize_ktype, parent,
+				   "hugepages-%lukB", size);
+	if (ret) {
+		kfree(thpsize);
+		return ERR_PTR(ret);
+	}
+
+	ret = sysfs_create_group(&thpsize->kobj, &thpsize_attr_group);
+	if (ret) {
+		kobject_put(&thpsize->kobj);
+		return ERR_PTR(ret);
+	}
+
+	thpsize->order = order;
+	return thpsize;
+}
+
+static void thpsize_release(struct kobject *kobj)
+{
+	kfree(to_thpsize(kobj));
+}
+
 static int __init hugepage_init_sysfs(struct kobject **hugepage_kobj)
 {
 	int err;
+	struct thpsize *thpsize;
+	unsigned long orders;
+	int order;
+
+	/*
+	 * Default to setting PMD-sized THP to inherit the global setting and
+	 * disable all other sizes. powerpc's PMD_ORDER isn't a compile-time
+	 * constant so we have to do this here.
+	 */
+	huge_anon_orders_inherit = BIT(PMD_ORDER);
 
 	*hugepage_kobj = kobject_create_and_add("transparent_hugepage", mm_kobj);
 	if (unlikely(!*hugepage_kobj)) {
@@ -434,8 +588,24 @@ static int __init hugepage_init_sysfs(struct kobject **hugepage_kobj)
 		goto remove_hp_group;
 	}
 
+	orders = THP_ORDERS_ALL_ANON;
+	order = highest_order(orders);
+	while (orders) {
+		thpsize = thpsize_create(order, *hugepage_kobj);
+		if (IS_ERR(thpsize)) {
+			pr_err("failed to create thpsize for order %d\n", order);
+			err = PTR_ERR(thpsize);
+			goto remove_all;
+		}
+		list_add(&thpsize->node, &thpsize_list);
+		order = next_order(&orders, order);
+	}
+
 	return 0;
 
+remove_all:
+	hugepage_exit_sysfs(*hugepage_kobj);
+	return err;
 remove_hp_group:
 	sysfs_remove_group(*hugepage_kobj, &hugepage_attr_group);
 delete_obj:
@@ -445,6 +615,13 @@ delete_obj:
 
 static void __init hugepage_exit_sysfs(struct kobject *hugepage_kobj)
 {
+	struct thpsize *thpsize, *tmp;
+
+	list_for_each_entry_safe(thpsize, tmp, &thpsize_list, node) {
+		list_del(&thpsize->node);
+		kobject_put(&thpsize->kobj);
+	}
+
 	sysfs_remove_group(hugepage_kobj, &khugepaged_attr_group);
 	sysfs_remove_group(hugepage_kobj, &hugepage_attr_group);
 	kobject_put(hugepage_kobj);
@@ -811,7 +988,7 @@ vm_fault_t do_huge_pmd_anonymous_page(struct vm_fault *vmf)
 	struct folio *folio;
 	unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
 
-	if (!transhuge_vma_suitable(vma, haddr))
+	if (!thp_vma_suitable_order(vma, haddr, PMD_ORDER))
 		return VM_FAULT_FALLBACK;
 	if (unlikely(anon_vma_prepare(vma)))
 		return VM_FAULT_OOM;
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index 064654717843..d72aecd3624a 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -446,7 +446,8 @@ void khugepaged_enter_vma(struct vm_area_struct *vma,
 {
 	if (!test_bit(MMF_VM_HUGEPAGE, &vma->vm_mm->flags) &&
 	    hugepage_flags_enabled()) {
-		if (hugepage_vma_check(vma, vm_flags, false, false, true))
+		if (thp_vma_allowable_order(vma, vm_flags, false, false, true,
+					    PMD_ORDER))
 			__khugepaged_enter(vma->vm_mm);
 	}
 }
@@ -922,16 +923,16 @@ static int hugepage_vma_revalidate(struct mm_struct *mm, unsigned long address,
 	if (!vma)
 		return SCAN_VMA_NULL;
 
-	if (!transhuge_vma_suitable(vma, address))
+	if (!thp_vma_suitable_order(vma, address, PMD_ORDER))
 		return SCAN_ADDRESS_RANGE;
-	if (!hugepage_vma_check(vma, vma->vm_flags, false, false,
-				cc->is_khugepaged))
+	if (!thp_vma_allowable_order(vma, vma->vm_flags, false, false,
+				     cc->is_khugepaged, PMD_ORDER))
 		return SCAN_VMA_CHECK;
 	/*
 	 * Anon VMA expected, the address may be unmapped then
 	 * remapped to file after khugepaged reaquired the mmap_lock.
 	 *
-	 * hugepage_vma_check may return true for qualified file
+	 * thp_vma_allowable_order may return true for qualified file
 	 * vmas.
 	 */
 	if (expect_anon && (!(*vmap)->anon_vma || !vma_is_anonymous(*vmap)))
@@ -1503,7 +1504,8 @@ int collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr,
 	 * and map it by a PMD, regardless of sysfs THP settings. As such, let's
 	 * analogously elide sysfs THP settings here.
 	 */
-	if (!hugepage_vma_check(vma, vma->vm_flags, false, false, false))
+	if (!thp_vma_allowable_order(vma, vma->vm_flags, false, false, false,
+				     PMD_ORDER))
 		return SCAN_VMA_CHECK;
 
 	/* Keep pmd pgtable for uffd-wp; see comment in retract_page_tables() */
@@ -2368,7 +2370,8 @@ static unsigned int khugepaged_scan_mm_slot(unsigned int pages, int *result,
 			progress++;
 			break;
 		}
-		if (!hugepage_vma_check(vma, vma->vm_flags, false, false, true)) {
+		if (!thp_vma_allowable_order(vma, vma->vm_flags, false, false,
+					     true, PMD_ORDER)) {
 skip:
 			progress++;
 			continue;
@@ -2705,7 +2708,8 @@ int madvise_collapse(struct vm_area_struct *vma, struct vm_area_struct **prev,
 
 	*prev = vma;
 
-	if (!hugepage_vma_check(vma, vma->vm_flags, false, false, false))
+	if (!thp_vma_allowable_order(vma, vma->vm_flags, false, false, false,
+				     PMD_ORDER))
 		return -EINVAL;
 
 	cc = kmalloc(sizeof(*cc), GFP_KERNEL);
diff --git a/mm/memory.c b/mm/memory.c
index 99582b188ed2..8ab2d994d997 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -4322,7 +4322,7 @@ vm_fault_t do_set_pmd(struct vm_fault *vmf, struct page *page)
 	pmd_t entry;
 	vm_fault_t ret = VM_FAULT_FALLBACK;
 
-	if (!transhuge_vma_suitable(vma, haddr))
+	if (!thp_vma_suitable_order(vma, haddr, PMD_ORDER))
 		return ret;
 
 	page = compound_head(page);
@@ -5116,7 +5116,7 @@ static vm_fault_t __handle_mm_fault(struct vm_area_struct *vma,
 		return VM_FAULT_OOM;
 retry_pud:
 	if (pud_none(*vmf.pud) &&
-	    hugepage_vma_check(vma, vm_flags, false, true, true)) {
+	    thp_vma_allowable_order(vma, vm_flags, false, true, true, PUD_ORDER)) {
 		ret = create_huge_pud(&vmf);
 		if (!(ret & VM_FAULT_FALLBACK))
 			return ret;
@@ -5150,7 +5150,7 @@ retry_pud:
 		goto retry_pud;
 
 	if (pmd_none(*vmf.pmd) &&
-	    hugepage_vma_check(vma, vm_flags, false, true, true)) {
+	    thp_vma_allowable_order(vma, vm_flags, false, true, true, PMD_ORDER)) {
 		ret = create_huge_pmd(&vmf);
 		if (!(ret & VM_FAULT_FALLBACK))
 			return ret;
diff --git a/mm/page_vma_mapped.c b/mm/page_vma_mapped.c
index e0b368e545ed..74d2de15fb5e 100644
--- a/mm/page_vma_mapped.c
+++ b/mm/page_vma_mapped.c
@@ -268,7 +268,8 @@ restart:
 			 * cleared *pmd but not decremented compound_mapcount().
 			 */
 			if ((pvmw->flags & PVMW_SYNC) &&
-			    transhuge_vma_suitable(vma, pvmw->address) &&
+			    thp_vma_suitable_order(vma, pvmw->address,
+						   PMD_ORDER) &&
 			    (pvmw->nr_pages >= HPAGE_PMD_NR)) {
 				spinlock_t *ptl = pmd_lock(mm, pvmw->pmd);
 
-- 
cgit v1.2.3


From 19eaf44954df64f9bc8dec398219e15ad0811497 Mon Sep 17 00:00:00 2001
From: Ryan Roberts <ryan.roberts@arm.com>
Date: Thu, 7 Dec 2023 16:12:05 +0000
Subject: mm: thp: support allocation of anonymous multi-size THP

Introduce the logic to allow THP to be configured (through the new sysfs
interface we just added) to allocate large folios to back anonymous
memory, which are larger than the base page size but smaller than
PMD-size.  We call this new THP extension "multi-size THP" (mTHP).

mTHP continues to be PTE-mapped, but in many cases can still provide
similar benefits to traditional PMD-sized THP: Page faults are
significantly reduced (by a factor of e.g.  4, 8, 16, etc.  depending on
the configured order), but latency spikes are much less prominent because
the size of each page isn't as huge as the PMD-sized variant and there is
less memory to clear in each page fault.  The number of per-page
operations (e.g.  ref counting, rmap management, lru list management) are
also significantly reduced since those ops now become per-folio.

Some architectures also employ TLB compression mechanisms to squeeze more
entries in when a set of PTEs are virtually and physically contiguous and
approporiately aligned.  In this case, TLB misses will occur less often.

The new behaviour is disabled by default, but can be enabled at runtime by
writing to /sys/kernel/mm/transparent_hugepage/hugepage-XXkb/enabled (see
documentation in previous commit).  The long term aim is to change the
default to include suitable lower orders, but there are some risks around
internal fragmentation that need to be better understood first.

[ryan.roberts@arm.com: resolve some multi-size THP review nits]
  Link: https://lkml.kernel.org/r/20231214160251.3574571-1-ryan.roberts@arm.com
Link: https://lkml.kernel.org/r/20231207161211.2374093-5-ryan.roberts@arm.com
Signed-off-by: Ryan Roberts <ryan.roberts@arm.com>
Tested-by: Kefeng Wang <wangkefeng.wang@huawei.com>
Tested-by: John Hubbard <jhubbard@nvidia.com>
Acked-by: David Hildenbrand <david@redhat.com>
Cc: Alistair Popple <apopple@nvidia.com>
Cc: Anshuman Khandual <anshuman.khandual@arm.com>
Cc: Barry Song <v-songbaohua@oppo.com>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: David Rientjes <rientjes@google.com>
Cc: "Huang, Ying" <ying.huang@intel.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Itaru Kitayama <itaru.kitayama@gmail.com>
Cc: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Cc: Luis Chamberlain <mcgrof@kernel.org>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Yang Shi <shy828301@gmail.com>
Cc: Yin Fengwei <fengwei.yin@intel.com>
Cc: Yu Zhao <yuzhao@google.com>
Cc: Zi Yan <ziy@nvidia.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/huge_mm.h |   6 ++-
 mm/memory.c             | 109 ++++++++++++++++++++++++++++++++++++++++++++----
 2 files changed, 104 insertions(+), 11 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
index 609c153bae57..fa7a38a30fc6 100644
--- a/include/linux/huge_mm.h
+++ b/include/linux/huge_mm.h
@@ -68,9 +68,11 @@ extern struct kobj_attribute shmem_enabled_attr;
 #define HPAGE_PMD_NR (1<<HPAGE_PMD_ORDER)
 
 /*
- * Mask of all large folio orders supported for anonymous THP.
+ * Mask of all large folio orders supported for anonymous THP; all orders up to
+ * and including PMD_ORDER, except order-0 (which is not "huge") and order-1
+ * (which is a limitation of the THP implementation).
  */
-#define THP_ORDERS_ALL_ANON	BIT(PMD_ORDER)
+#define THP_ORDERS_ALL_ANON	((BIT(PMD_ORDER + 1) - 1) & ~(BIT(0) | BIT(1)))
 
 /*
  * Mask of all large folio orders supported for file THP.
diff --git a/mm/memory.c b/mm/memory.c
index 8ab2d994d997..3c530b639559 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -4125,6 +4125,84 @@ out_release:
 	return ret;
 }
 
+static bool pte_range_none(pte_t *pte, int nr_pages)
+{
+	int i;
+
+	for (i = 0; i < nr_pages; i++) {
+		if (!pte_none(ptep_get_lockless(pte + i)))
+			return false;
+	}
+
+	return true;
+}
+
+static struct folio *alloc_anon_folio(struct vm_fault *vmf)
+{
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+	struct vm_area_struct *vma = vmf->vma;
+	unsigned long orders;
+	struct folio *folio;
+	unsigned long addr;
+	pte_t *pte;
+	gfp_t gfp;
+	int order;
+
+	/*
+	 * If uffd is active for the vma we need per-page fault fidelity to
+	 * maintain the uffd semantics.
+	 */
+	if (unlikely(userfaultfd_armed(vma)))
+		goto fallback;
+
+	/*
+	 * Get a list of all the (large) orders below PMD_ORDER that are enabled
+	 * for this vma. Then filter out the orders that can't be allocated over
+	 * the faulting address and still be fully contained in the vma.
+	 */
+	orders = thp_vma_allowable_orders(vma, vma->vm_flags, false, true, true,
+					  BIT(PMD_ORDER) - 1);
+	orders = thp_vma_suitable_orders(vma, vmf->address, orders);
+
+	if (!orders)
+		goto fallback;
+
+	pte = pte_offset_map(vmf->pmd, vmf->address & PMD_MASK);
+	if (!pte)
+		return ERR_PTR(-EAGAIN);
+
+	/*
+	 * Find the highest order where the aligned range is completely
+	 * pte_none(). Note that all remaining orders will be completely
+	 * pte_none().
+	 */
+	order = highest_order(orders);
+	while (orders) {
+		addr = ALIGN_DOWN(vmf->address, PAGE_SIZE << order);
+		if (pte_range_none(pte + pte_index(addr), 1 << order))
+			break;
+		order = next_order(&orders, order);
+	}
+
+	pte_unmap(pte);
+
+	/* Try allocating the highest of the remaining orders. */
+	gfp = vma_thp_gfp_mask(vma);
+	while (orders) {
+		addr = ALIGN_DOWN(vmf->address, PAGE_SIZE << order);
+		folio = vma_alloc_folio(gfp, order, vma, addr, true);
+		if (folio) {
+			clear_huge_page(&folio->page, vmf->address, 1 << order);
+			return folio;
+		}
+		order = next_order(&orders, order);
+	}
+
+fallback:
+#endif
+	return vma_alloc_zeroed_movable_folio(vmf->vma, vmf->address);
+}
+
 /*
  * We enter with non-exclusive mmap_lock (to exclude vma changes,
  * but allow concurrent faults), and pte mapped but not yet locked.
@@ -4134,9 +4212,12 @@ static vm_fault_t do_anonymous_page(struct vm_fault *vmf)
 {
 	bool uffd_wp = vmf_orig_pte_uffd_wp(vmf);
 	struct vm_area_struct *vma = vmf->vma;
+	unsigned long addr = vmf->address;
 	struct folio *folio;
 	vm_fault_t ret = 0;
+	int nr_pages = 1;
 	pte_t entry;
+	int i;
 
 	/* File mapping without ->vm_ops ? */
 	if (vma->vm_flags & VM_SHARED)
@@ -4176,10 +4257,16 @@ static vm_fault_t do_anonymous_page(struct vm_fault *vmf)
 	/* Allocate our own private page. */
 	if (unlikely(anon_vma_prepare(vma)))
 		goto oom;
-	folio = vma_alloc_zeroed_movable_folio(vma, vmf->address);
+	/* Returns NULL on OOM or ERR_PTR(-EAGAIN) if we must retry the fault */
+	folio = alloc_anon_folio(vmf);
+	if (IS_ERR(folio))
+		return 0;
 	if (!folio)
 		goto oom;
 
+	nr_pages = folio_nr_pages(folio);
+	addr = ALIGN_DOWN(vmf->address, nr_pages * PAGE_SIZE);
+
 	if (mem_cgroup_charge(folio, vma->vm_mm, GFP_KERNEL))
 		goto oom_free_page;
 	folio_throttle_swaprate(folio, GFP_KERNEL);
@@ -4196,12 +4283,15 @@ static vm_fault_t do_anonymous_page(struct vm_fault *vmf)
 	if (vma->vm_flags & VM_WRITE)
 		entry = pte_mkwrite(pte_mkdirty(entry), vma);
 
-	vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, vmf->address,
-			&vmf->ptl);
+	vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, addr, &vmf->ptl);
 	if (!vmf->pte)
 		goto release;
-	if (vmf_pte_changed(vmf)) {
-		update_mmu_tlb(vma, vmf->address, vmf->pte);
+	if (nr_pages == 1 && vmf_pte_changed(vmf)) {
+		update_mmu_tlb(vma, addr, vmf->pte);
+		goto release;
+	} else if (nr_pages > 1 && !pte_range_none(vmf->pte, nr_pages)) {
+		for (i = 0; i < nr_pages; i++)
+			update_mmu_tlb(vma, addr + PAGE_SIZE * i, vmf->pte + i);
 		goto release;
 	}
 
@@ -4216,16 +4306,17 @@ static vm_fault_t do_anonymous_page(struct vm_fault *vmf)
 		return handle_userfault(vmf, VM_UFFD_MISSING);
 	}
 
-	inc_mm_counter(vma->vm_mm, MM_ANONPAGES);
-	folio_add_new_anon_rmap(folio, vma, vmf->address);
+	folio_ref_add(folio, nr_pages - 1);
+	add_mm_counter(vma->vm_mm, MM_ANONPAGES, nr_pages);
+	folio_add_new_anon_rmap(folio, vma, addr);
 	folio_add_lru_vma(folio, vma);
 setpte:
 	if (uffd_wp)
 		entry = pte_mkuffd_wp(entry);
-	set_pte_at(vma->vm_mm, vmf->address, vmf->pte, entry);
+	set_ptes(vma->vm_mm, addr, vmf->pte, entry, nr_pages);
 
 	/* No need to invalidate - it was non-present before */
-	update_mmu_cache_range(vmf, vma, vmf->address, vmf->pte, 1);
+	update_mmu_cache_range(vmf, vma, addr, vmf->pte, nr_pages);
 unlock:
 	if (vmf->pte)
 		pte_unmap_unlock(vmf->pte, vmf->ptl);
-- 
cgit v1.2.3


From 6ad59a3838cd0a8536721e60b8e4fbe5fdeb233a Mon Sep 17 00:00:00 2001
From: SeongJae Park <sj@kernel.org>
Date: Wed, 13 Dec 2023 19:03:33 +0000
Subject: mm/damon: update email of SeongJae

Patch series "mm/damon: misc updates for 6.8".

Update comments, tests, and documents for DAMON.


This patch (of 6):

SeongJae is using his kernel.org account for DAMON development.  Update
the old email addresses on the comments of DAMON source files.

Link: https://lkml.kernel.org/r/20231213190338.54146-1-sj@kernel.org
Link: https://lkml.kernel.org/r/20231213190338.54146-2-sj@kernel.org
Signed-off-by: SeongJae Park <sj@kernel.org>
Cc: Jonathan Corbet <corbet@lwn.net>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/damon.h     | 2 +-
 mm/damon/core-test.h      | 2 +-
 mm/damon/core.c           | 2 +-
 mm/damon/dbgfs-test.h     | 2 +-
 mm/damon/dbgfs.c          | 2 +-
 mm/damon/modules-common.c | 2 +-
 mm/damon/vaddr-test.h     | 2 +-
 mm/damon/vaddr.c          | 2 +-
 8 files changed, 8 insertions(+), 8 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/damon.h b/include/linux/damon.h
index 12510d8c51c6..5881e4ac30be 100644
--- a/include/linux/damon.h
+++ b/include/linux/damon.h
@@ -2,7 +2,7 @@
 /*
  * DAMON api
  *
- * Author: SeongJae Park <sjpark@amazon.de>
+ * Author: SeongJae Park <sj@kernel.org>
  */
 
 #ifndef _DAMON_H_
diff --git a/mm/damon/core-test.h b/mm/damon/core-test.h
index 6e5e9502d648..876e398557b0 100644
--- a/mm/damon/core-test.h
+++ b/mm/damon/core-test.h
@@ -4,7 +4,7 @@
  *
  * Copyright 2019 Amazon.com, Inc. or its affiliates.  All rights reserved.
  *
- * Author: SeongJae Park <sjpark@amazon.de>
+ * Author: SeongJae Park <sj@kernel.org>
  */
 
 #ifdef CONFIG_DAMON_KUNIT_TEST
diff --git a/mm/damon/core.c b/mm/damon/core.c
index 2c0cc65d041e..36f6f1d21ff0 100644
--- a/mm/damon/core.c
+++ b/mm/damon/core.c
@@ -2,7 +2,7 @@
 /*
  * Data Access Monitor
  *
- * Author: SeongJae Park <sjpark@amazon.de>
+ * Author: SeongJae Park <sj@kernel.org>
  */
 
 #define pr_fmt(fmt) "damon: " fmt
diff --git a/mm/damon/dbgfs-test.h b/mm/damon/dbgfs-test.h
index 0bb0d532b159..2d85217f5ba4 100644
--- a/mm/damon/dbgfs-test.h
+++ b/mm/damon/dbgfs-test.h
@@ -2,7 +2,7 @@
 /*
  * DAMON Debugfs Interface Unit Tests
  *
- * Author: SeongJae Park <sjpark@amazon.de>
+ * Author: SeongJae Park <sj@kernel.org>
  */
 
 #ifdef CONFIG_DAMON_DBGFS_KUNIT_TEST
diff --git a/mm/damon/dbgfs.c b/mm/damon/dbgfs.c
index dc0ea1fc30ca..7dac24e69e3b 100644
--- a/mm/damon/dbgfs.c
+++ b/mm/damon/dbgfs.c
@@ -2,7 +2,7 @@
 /*
  * DAMON Debugfs Interface
  *
- * Author: SeongJae Park <sjpark@amazon.de>
+ * Author: SeongJae Park <sj@kernel.org>
  */
 
 #define pr_fmt(fmt) "damon-dbgfs: " fmt
diff --git a/mm/damon/modules-common.c b/mm/damon/modules-common.c
index b2381a8466ec..7cf96574cde7 100644
--- a/mm/damon/modules-common.c
+++ b/mm/damon/modules-common.c
@@ -2,7 +2,7 @@
 /*
  * Common Primitives for DAMON Modules
  *
- * Author: SeongJae Park <sjpark@amazon.de>
+ * Author: SeongJae Park <sj@kernel.org>
  */
 
 #include <linux/damon.h>
diff --git a/mm/damon/vaddr-test.h b/mm/damon/vaddr-test.h
index dcf1ca6b31cc..83626483f82b 100644
--- a/mm/damon/vaddr-test.h
+++ b/mm/damon/vaddr-test.h
@@ -4,7 +4,7 @@
  *
  * Copyright 2019 Amazon.com, Inc. or its affiliates.  All rights reserved.
  *
- * Author: SeongJae Park <sjpark@amazon.de>
+ * Author: SeongJae Park <sj@kernel.org>
  */
 
 #ifdef CONFIG_DAMON_VADDR_KUNIT_TEST
diff --git a/mm/damon/vaddr.c b/mm/damon/vaddr.c
index a4d1f63c5b23..a67454b825dc 100644
--- a/mm/damon/vaddr.c
+++ b/mm/damon/vaddr.c
@@ -2,7 +2,7 @@
 /*
  * DAMON Primitives for Virtual Address Spaces
  *
- * Author: SeongJae Park <sjpark@amazon.de>
+ * Author: SeongJae Park <sj@kernel.org>
  */
 
 #define pr_fmt(fmt) "damon-va: " fmt
-- 
cgit v1.2.3


From 0abfa8efad8dccc3899f64dafa985a251714a709 Mon Sep 17 00:00:00 2001
From: Randy Dunlap <rdunlap@infradead.org>
Date: Tue, 12 Dec 2023 20:33:16 -0800
Subject: gfp: gfp_types.h: fix typos & punctuation

Correct typos/spellos and punctutation.

Link: https://lkml.kernel.org/r/20231213043316.10128-1-rdunlap@infradead.org
Signed-off-by: Randy Dunlap <rdunlap@infradead.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/gfp_types.h | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/gfp_types.h b/include/linux/gfp_types.h
index ae994534a12a..1b6053da8754 100644
--- a/include/linux/gfp_types.h
+++ b/include/linux/gfp_types.h
@@ -162,25 +162,25 @@ typedef unsigned int __bitwise gfp_t;
  * %__GFP_RECLAIM is shorthand to allow/forbid both direct and kswapd reclaim.
  *
  * The default allocator behavior depends on the request size. We have a concept
- * of so called costly allocations (with order > %PAGE_ALLOC_COSTLY_ORDER).
+ * of so-called costly allocations (with order > %PAGE_ALLOC_COSTLY_ORDER).
  * !costly allocations are too essential to fail so they are implicitly
  * non-failing by default (with some exceptions like OOM victims might fail so
  * the caller still has to check for failures) while costly requests try to be
  * not disruptive and back off even without invoking the OOM killer.
  * The following three modifiers might be used to override some of these
- * implicit rules
+ * implicit rules.
  *
  * %__GFP_NORETRY: The VM implementation will try only very lightweight
  * memory direct reclaim to get some memory under memory pressure (thus
  * it can sleep). It will avoid disruptive actions like OOM killer. The
  * caller must handle the failure which is quite likely to happen under
  * heavy memory pressure. The flag is suitable when failure can easily be
- * handled at small cost, such as reduced throughput
+ * handled at small cost, such as reduced throughput.
  *
  * %__GFP_RETRY_MAYFAIL: The VM implementation will retry memory reclaim
  * procedures that have previously failed if there is some indication
- * that progress has been made else where.  It can wait for other
- * tasks to attempt high level approaches to freeing memory such as
+ * that progress has been made elsewhere.  It can wait for other
+ * tasks to attempt high-level approaches to freeing memory such as
  * compaction (which removes fragmentation) and page-out.
  * There is still a definite limit to the number of retries, but it is
  * a larger limit than with %__GFP_NORETRY.
@@ -230,7 +230,7 @@ typedef unsigned int __bitwise gfp_t;
  * is being zeroed (either via __GFP_ZERO or via init_on_alloc, provided that
  * __GFP_SKIP_ZERO is not set). This flag is intended for optimization: setting
  * memory tags at the same time as zeroing memory has minimal additional
- * performace impact.
+ * performance impact.
  *
  * %__GFP_SKIP_KASAN makes KASAN skip unpoisoning on page allocation.
  * Used for userspace and vmalloc pages; the latter are unpoisoned by
-- 
cgit v1.2.3


From cbc2fe9d9cb226347365753f50d81bc48cc3c52e Mon Sep 17 00:00:00 2001
From: Baoquan He <bhe@redhat.com>
Date: Wed, 13 Dec 2023 13:57:41 +0800
Subject: kexec_file: add kexec_file flag to control debug printing

Patch series "kexec_file: print out debugging message if required", v4.

Currently, specifying '-d' on kexec command will print a lot of debugging
informationabout kexec/kdump loading with kexec_load interface.

However, kexec_file_load prints nothing even though '-d' is specified.
It's very inconvenient to debug or analyze the kexec/kdump loading when
something wrong happened with kexec/kdump itself or develper want to check
the kexec/kdump loading.

In this patchset, a kexec_file flag is KEXEC_FILE_DEBUG added and checked
in code.  If it's passed in, debugging message of kexec_file code will be
printed out and can be seen from console and dmesg.  Otherwise, the
debugging message is printed like beofre when pr_debug() is taken.

Note:
****
=====
1) The code in kexec-tools utility also need be changed to support
passing KEXEC_FILE_DEBUG to kernel when 'kexec -s -d' is specified.
The patch link is here:
=========
[PATCH] kexec_file: add kexec_file flag to support debug printing
http://lists.infradead.org/pipermail/kexec/2023-November/028505.html

2) s390 also has kexec_file code, while I am not sure what debugging
information is necessary. So leave it to s390 developer.

Test:
****
====
Testing was done in v1 on x86_64 and arm64. For v4, tested on x86_64
again. And on x86_64, the printed messages look like below:
--------------------------------------------------------------
kexec measurement buffer for the loaded kernel at 0x207fffe000.
Loaded purgatory at 0x207fff9000
Loaded boot_param, command line and misc at 0x207fff3000 bufsz=0x1180 memsz=0x1180
Loaded 64bit kernel at 0x207c000000 bufsz=0xc88200 memsz=0x3c4a000
Loaded initrd at 0x2079e79000 bufsz=0x2186280 memsz=0x2186280
Final command line is: root=/dev/mapper/fedora_intel--knightslanding--lb--02-root ro
rd.lvm.lv=fedora_intel-knightslanding-lb-02/root console=ttyS0,115200N81 crashkernel=256M
E820 memmap:
0000000000000000-000000000009a3ff (1)
000000000009a400-000000000009ffff (2)
00000000000e0000-00000000000fffff (2)
0000000000100000-000000006ff83fff (1)
000000006ff84000-000000007ac50fff (2)
......
000000207fff6150-000000207fff615f (128)
000000207fff6160-000000207fff714f (1)
000000207fff7150-000000207fff715f (128)
000000207fff7160-000000207fff814f (1)
000000207fff8150-000000207fff815f (128)
000000207fff8160-000000207fffffff (1)
nr_segments = 5
segment[0]: buf=0x000000004e5ece74 bufsz=0x211 mem=0x207fffe000 memsz=0x1000
segment[1]: buf=0x000000009e871498 bufsz=0x4000 mem=0x207fff9000 memsz=0x5000
segment[2]: buf=0x00000000d879f1fe bufsz=0x1180 mem=0x207fff3000 memsz=0x2000
segment[3]: buf=0x000000001101cd86 bufsz=0xc88200 mem=0x207c000000 memsz=0x3c4a000
segment[4]: buf=0x00000000c6e38ac7 bufsz=0x2186280 mem=0x2079e79000 memsz=0x2187000
kexec_file_load: type:0, start:0x207fff91a0 head:0x109e004002 flags:0x8
---------------------------------------------------------------------------


This patch (of 7):

When specifying 'kexec -c -d', kexec_load interface will print loading
information, e.g the regions where kernel/initrd/purgatory/cmdline are
put, the memmap passed to 2nd kernel taken as system RAM ranges, and
printing all contents of struct kexec_segment, etc.  These are very
helpful for analyzing or positioning what's happening when kexec/kdump
itself failed.  The debugging printing for kexec_load interface is made in
user space utility kexec-tools.

Whereas, with kexec_file_load interface, 'kexec -s -d' print nothing.
Because kexec_file code is mostly implemented in kernel space, and the
debugging printing functionality is missed.  It's not convenient when
debugging kexec/kdump loading and jumping with kexec_file_load interface.

Now add KEXEC_FILE_DEBUG to kexec_file flag to control the debugging
message printing.  And add global variable kexec_file_dbg_print and macro
kexec_dprintk() to facilitate the printing.

This is a preparation, later kexec_dprintk() will be used to replace the
existing pr_debug().  Once 'kexec -s -d' is specified, it will print out
kexec/kdump loading information.  If '-d' is not specified, it regresses
to pr_debug().

Link: https://lkml.kernel.org/r/20231213055747.61826-1-bhe@redhat.com
Link: https://lkml.kernel.org/r/20231213055747.61826-2-bhe@redhat.com
Signed-off-by: Baoquan He <bhe@redhat.com>
Cc: Conor Dooley <conor@kernel.org>
Cc: Joe Perches <joe@perches.com>
Cc: Nathan Chancellor <nathan@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/kexec.h      | 9 ++++++++-
 include/uapi/linux/kexec.h | 1 +
 kernel/kexec_core.c        | 2 ++
 kernel/kexec_file.c        | 3 +++
 4 files changed, 14 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/kexec.h b/include/linux/kexec.h
index 8227455192b7..400cb6c02176 100644
--- a/include/linux/kexec.h
+++ b/include/linux/kexec.h
@@ -403,7 +403,7 @@ bool kexec_load_permitted(int kexec_image_type);
 
 /* List of defined/legal kexec file flags */
 #define KEXEC_FILE_FLAGS	(KEXEC_FILE_UNLOAD | KEXEC_FILE_ON_CRASH | \
-				 KEXEC_FILE_NO_INITRAMFS)
+				 KEXEC_FILE_NO_INITRAMFS | KEXEC_FILE_DEBUG)
 
 /* flag to track if kexec reboot is in progress */
 extern bool kexec_in_progress;
@@ -500,6 +500,13 @@ static inline int crash_hotplug_memory_support(void) { return 0; }
 static inline unsigned int crash_get_elfcorehdr_size(void) { return 0; }
 #endif
 
+extern bool kexec_file_dbg_print;
+
+#define kexec_dprintk(fmt, ...)					\
+	printk("%s" fmt,					\
+	       kexec_file_dbg_print ? KERN_INFO : KERN_DEBUG,	\
+	       ##__VA_ARGS__)
+
 #else /* !CONFIG_KEXEC_CORE */
 struct pt_regs;
 struct task_struct;
diff --git a/include/uapi/linux/kexec.h b/include/uapi/linux/kexec.h
index 01766dd839b0..c17bb096ea68 100644
--- a/include/uapi/linux/kexec.h
+++ b/include/uapi/linux/kexec.h
@@ -25,6 +25,7 @@
 #define KEXEC_FILE_UNLOAD	0x00000001
 #define KEXEC_FILE_ON_CRASH	0x00000002
 #define KEXEC_FILE_NO_INITRAMFS	0x00000004
+#define KEXEC_FILE_DEBUG	0x00000008
 
 /* These values match the ELF architecture values.
  * Unless there is a good reason that should continue to be the case.
diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c
index bc4c096ab1f3..64072acef2b6 100644
--- a/kernel/kexec_core.c
+++ b/kernel/kexec_core.c
@@ -52,6 +52,8 @@ atomic_t __kexec_lock = ATOMIC_INIT(0);
 /* Flag to indicate we are going to kexec a new kernel */
 bool kexec_in_progress = false;
 
+bool kexec_file_dbg_print;
+
 int kexec_should_crash(struct task_struct *p)
 {
 	/*
diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c
index ba3ef30921b8..3ee204474de6 100644
--- a/kernel/kexec_file.c
+++ b/kernel/kexec_file.c
@@ -123,6 +123,8 @@ void kimage_file_post_load_cleanup(struct kimage *image)
 	 */
 	kfree(image->image_loader_data);
 	image->image_loader_data = NULL;
+
+	kexec_file_dbg_print = false;
 }
 
 #ifdef CONFIG_KEXEC_SIG
@@ -278,6 +280,7 @@ kimage_file_alloc_init(struct kimage **rimage, int kernel_fd,
 	if (!image)
 		return -ENOMEM;
 
+	kexec_file_dbg_print = !!(flags & KEXEC_FILE_DEBUG);
 	image->file_mode = 1;
 
 	if (kexec_on_panic) {
-- 
cgit v1.2.3


From f6120d527b8611aeaa1a34a33337f530d78a789c Mon Sep 17 00:00:00 2001
From: Kent Overstreet <kent.overstreet@linux.dev>
Date: Fri, 15 Dec 2023 19:56:59 -0500
Subject: task_stack.h: add missing include

Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
---
 include/linux/sched/task_stack.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/sched/task_stack.h b/include/linux/sched/task_stack.h
index f158b025c175..ccd72b978e1f 100644
--- a/include/linux/sched/task_stack.h
+++ b/include/linux/sched/task_stack.h
@@ -8,6 +8,7 @@
 
 #include <linux/sched.h>
 #include <linux/magic.h>
+#include <linux/refcount.h>
 
 #ifdef CONFIG_THREAD_INFO_IN_TASK
 
-- 
cgit v1.2.3


From 6a2623b17634688cfec58dd44041c5db2143719d Mon Sep 17 00:00:00 2001
From: Kent Overstreet <kent.overstreet@linux.dev>
Date: Fri, 15 Dec 2023 20:00:15 -0500
Subject: nsproxy.h: add missing include

Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
---
 include/linux/nsproxy.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/nsproxy.h b/include/linux/nsproxy.h
index 771cb0285872..5601d14e2886 100644
--- a/include/linux/nsproxy.h
+++ b/include/linux/nsproxy.h
@@ -2,6 +2,7 @@
 #ifndef _LINUX_NSPROXY_H
 #define _LINUX_NSPROXY_H
 
+#include <linux/refcount.h>
 #include <linux/spinlock.h>
 #include <linux/sched.h>
 
-- 
cgit v1.2.3


From 316aa04d1ffa61f73ce2679d1ae1dca8747aeb1f Mon Sep 17 00:00:00 2001
From: Kent Overstreet <kent.overstreet@linux.dev>
Date: Mon, 11 Dec 2023 14:00:59 -0500
Subject: kmsan: add missing types.h dependency

more header dependency pruning/fixing

Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
---
 include/linux/kmsan_types.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/kmsan_types.h b/include/linux/kmsan_types.h
index 8bfa6c98176d..929287981afe 100644
--- a/include/linux/kmsan_types.h
+++ b/include/linux/kmsan_types.h
@@ -9,6 +9,8 @@
 #ifndef _LINUX_KMSAN_TYPES_H
 #define _LINUX_KMSAN_TYPES_H
 
+#include <linux/types.h>
+
 /* These constants are defined in the MSan LLVM instrumentation pass. */
 #define KMSAN_RETVAL_SIZE 800
 #define KMSAN_PARAM_SIZE 800
-- 
cgit v1.2.3


From 058e0529d12ae9fed34c6eeef700cd294f9622a1 Mon Sep 17 00:00:00 2001
From: Kent Overstreet <kent.overstreet@linux.dev>
Date: Fri, 15 Dec 2023 20:22:22 -0500
Subject: time_namespace.h: fix missing include

Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
---
 include/linux/time_namespace.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/time_namespace.h b/include/linux/time_namespace.h
index 03d9c5ac01d1..5258d81cef17 100644
--- a/include/linux/time_namespace.h
+++ b/include/linux/time_namespace.h
@@ -7,6 +7,7 @@
 #include <linux/nsproxy.h>
 #include <linux/ns_common.h>
 #include <linux/err.h>
+#include <linux/time64.h>
 
 struct user_namespace;
 extern struct user_namespace init_user_ns;
-- 
cgit v1.2.3


From ea115c248a478ce1acbf4776e4666fb663285b2f Mon Sep 17 00:00:00 2001
From: Kent Overstreet <kent.overstreet@linux.dev>
Date: Sun, 17 Dec 2023 20:01:01 -0500
Subject: torture: add missing dependency on hrtimer.h

Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
---
 include/linux/torture.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/torture.h b/include/linux/torture.h
index c98d0c83d117..1541454da03e 100644
--- a/include/linux/torture.h
+++ b/include/linux/torture.h
@@ -21,6 +21,7 @@
 #include <linux/debugobjects.h>
 #include <linux/bug.h>
 #include <linux/compiler.h>
+#include <linux/hrtimer.h>
 
 /* Definitions for a non-string torture-test module parameter. */
 #define torture_param(type, name, init, msg) \
-- 
cgit v1.2.3


From bea32141764bc76db2d75c9484b71ded56119ab4 Mon Sep 17 00:00:00 2001
From: Kent Overstreet <kent.overstreet@linux.dev>
Date: Tue, 6 Dec 2022 14:32:23 -0500
Subject: nodemask: Split out include/linux/nodemask_types.h

sched.h, which defines task_struct, needs nodemask_t - but sched.h is a
frequently used header and ideally shouldn't be pulling in any more code
that it needs to.

This splits out nodemask_types.h which has the definition sched.h needs,
which will avoid a circular header dependency in the alloc tagging patch
series, and as a bonus should speed up kernel build times.

Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
Signed-off-by: Suren Baghdasaryan <surenb@google.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
---
 include/linux/nodemask.h       |  2 +-
 include/linux/nodemask_types.h | 10 ++++++++++
 include/linux/sched.h          |  2 +-
 3 files changed, 12 insertions(+), 2 deletions(-)
 create mode 100644 include/linux/nodemask_types.h

(limited to 'include/linux')

diff --git a/include/linux/nodemask.h b/include/linux/nodemask.h
index 8d07116caaf1..b61438313a73 100644
--- a/include/linux/nodemask.h
+++ b/include/linux/nodemask.h
@@ -93,10 +93,10 @@
 #include <linux/threads.h>
 #include <linux/bitmap.h>
 #include <linux/minmax.h>
+#include <linux/nodemask_types.h>
 #include <linux/numa.h>
 #include <linux/random.h>
 
-typedef struct { DECLARE_BITMAP(bits, MAX_NUMNODES); } nodemask_t;
 extern nodemask_t _unused_nodemask_arg_;
 
 /**
diff --git a/include/linux/nodemask_types.h b/include/linux/nodemask_types.h
new file mode 100644
index 000000000000..6b28d97ea6ed
--- /dev/null
+++ b/include/linux/nodemask_types.h
@@ -0,0 +1,10 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef __LINUX_NODEMASK_TYPES_H
+#define __LINUX_NODEMASK_TYPES_H
+
+#include <linux/bitops.h>
+#include <linux/numa.h>
+
+typedef struct { DECLARE_BITMAP(bits, MAX_NUMNODES); } nodemask_t;
+
+#endif /* __LINUX_NODEMASK_TYPES_H */
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 292c31697248..5a5b7b122682 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -20,7 +20,7 @@
 #include <linux/hrtimer.h>
 #include <linux/irqflags.h>
 #include <linux/seccomp.h>
-#include <linux/nodemask.h>
+#include <linux/nodemask_types.h>
 #include <linux/rcupdate.h>
 #include <linux/refcount.h>
 #include <linux/resource.h>
-- 
cgit v1.2.3


From d9f29deb7fe8137fd1954871443cbbc1b6125832 Mon Sep 17 00:00:00 2001
From: Kent Overstreet <kent.overstreet@linux.dev>
Date: Mon, 23 Jan 2023 21:32:18 -0500
Subject: prandom: Remove unused include

prandom.h doesn't use percpu.h - this fixes some circular header issues.

Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
Signed-off-by: Suren Baghdasaryan <surenb@google.com>
---
 include/linux/prandom.h | 1 -
 1 file changed, 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/prandom.h b/include/linux/prandom.h
index f2ed5b72b3d6..f7f1e5251c67 100644
--- a/include/linux/prandom.h
+++ b/include/linux/prandom.h
@@ -10,7 +10,6 @@
 
 #include <linux/types.h>
 #include <linux/once.h>
-#include <linux/percpu.h>
 #include <linux/random.h>
 
 struct rnd_state {
-- 
cgit v1.2.3


From 6060ef31f1162fb91a1688fa5098b38c4b9c680c Mon Sep 17 00:00:00 2001
From: Kent Overstreet <kent.overstreet@linux.dev>
Date: Fri, 2 Dec 2022 14:39:55 -0500
Subject: timekeeping: Kill percpu.h dependency

Slimming down recursive header includes.

Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
Cc: Thomas Gleixner <tglx@linutronix.de>
---
 include/linux/hrtimer.h        | 2 +-
 include/linux/time_namespace.h | 2 ++
 2 files changed, 3 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h
index f2044d5a652b..02d264ca9dce 100644
--- a/include/linux/hrtimer.h
+++ b/include/linux/hrtimer.h
@@ -16,7 +16,7 @@
 #include <linux/rbtree.h>
 #include <linux/init.h>
 #include <linux/list.h>
-#include <linux/percpu.h>
+#include <linux/percpu-defs.h>
 #include <linux/seqlock.h>
 #include <linux/timer.h>
 #include <linux/timerqueue.h>
diff --git a/include/linux/time_namespace.h b/include/linux/time_namespace.h
index 5258d81cef17..876e31b4461d 100644
--- a/include/linux/time_namespace.h
+++ b/include/linux/time_namespace.h
@@ -12,6 +12,8 @@
 struct user_namespace;
 extern struct user_namespace init_user_ns;
 
+struct vm_area_struct;
+
 struct timens_offsets {
 	struct timespec64 monotonic;
 	struct timespec64 boottime;
-- 
cgit v1.2.3


From d7a73e3f089204aee3393687e23fd45a22657b08 Mon Sep 17 00:00:00 2001
From: Kent Overstreet <kent.overstreet@linux.dev>
Date: Mon, 11 Dec 2023 13:27:00 -0500
Subject: kernel/numa.c: Move logging out of numa.h

Moving these stub functions to a .c file means we can kill a sched.h
dependency on printk.h.

Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
---
 include/linux/numa.h | 19 ++++++-------------
 kernel/Makefile      |  1 +
 kernel/numa.c        | 26 ++++++++++++++++++++++++++
 3 files changed, 33 insertions(+), 13 deletions(-)
 create mode 100644 kernel/numa.c

(limited to 'include/linux')

diff --git a/include/linux/numa.h b/include/linux/numa.h
index a904861de800..915033a75731 100644
--- a/include/linux/numa.h
+++ b/include/linux/numa.h
@@ -1,6 +1,7 @@
 /* SPDX-License-Identifier: GPL-2.0 */
 #ifndef _LINUX_NUMA_H
 #define _LINUX_NUMA_H
+#include <linux/init.h>
 #include <linux/types.h>
 
 #ifdef CONFIG_NODES_SHIFT
@@ -22,34 +23,26 @@
 #endif
 
 #ifdef CONFIG_NUMA
-#include <linux/printk.h>
 #include <asm/sparsemem.h>
 
 /* Generic implementation available */
 int numa_nearest_node(int node, unsigned int state);
 
 #ifndef memory_add_physaddr_to_nid
-static inline int memory_add_physaddr_to_nid(u64 start)
-{
-	pr_info_once("Unknown online node for memory at 0x%llx, assuming node 0\n",
-			start);
-	return 0;
-}
+int memory_add_physaddr_to_nid(u64 start);
 #endif
+
 #ifndef phys_to_target_node
-static inline int phys_to_target_node(u64 start)
-{
-	pr_info_once("Unknown target node for memory at 0x%llx, assuming node 0\n",
-			start);
-	return 0;
-}
+int phys_to_target_node(u64 start);
 #endif
+
 #ifndef numa_fill_memblks
 static inline int __init numa_fill_memblks(u64 start, u64 end)
 {
 	return NUMA_NO_MEMBLK;
 }
 #endif
+
 #else /* !CONFIG_NUMA */
 static inline int numa_nearest_node(int node, unsigned int state)
 {
diff --git a/kernel/Makefile b/kernel/Makefile
index 3947122d618b..ce105a5558fc 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -114,6 +114,7 @@ obj-$(CONFIG_SHADOW_CALL_STACK) += scs.o
 obj-$(CONFIG_HAVE_STATIC_CALL) += static_call.o
 obj-$(CONFIG_HAVE_STATIC_CALL_INLINE) += static_call_inline.o
 obj-$(CONFIG_CFI_CLANG) += cfi.o
+obj-$(CONFIG_NUMA) += numa.o
 
 obj-$(CONFIG_PERF_EVENTS) += events/
 
diff --git a/kernel/numa.c b/kernel/numa.c
new file mode 100644
index 000000000000..67ca6b8585c0
--- /dev/null
+++ b/kernel/numa.c
@@ -0,0 +1,26 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+
+#include <linux/printk.h>
+#include <linux/numa.h>
+
+/* Stub functions: */
+
+#ifndef memory_add_physaddr_to_nid
+int memory_add_physaddr_to_nid(u64 start)
+{
+	pr_info_once("Unknown online node for memory at 0x%llx, assuming node 0\n",
+			start);
+	return 0;
+}
+EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid);
+#endif
+
+#ifndef phys_to_target_node
+int phys_to_target_node(u64 start)
+{
+	pr_info_once("Unknown target node for memory at 0x%llx, assuming node 0\n",
+			start);
+	return 0;
+}
+EXPORT_SYMBOL_GPL(phys_to_target_node);
+#endif
-- 
cgit v1.2.3


From d1d71b30e1f85e8b5d7c0d8edc16869bdc4d535f Mon Sep 17 00:00:00 2001
From: Kent Overstreet <kent.overstreet@linux.dev>
Date: Mon, 11 Dec 2023 14:05:04 -0500
Subject: sched.h: Move (spin|rwlock)_needbreak() to spinlock.h

This lets us kill the dependency on spinlock.h.

Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
---
 include/linux/sched.h    | 31 -------------------------------
 include/linux/spinlock.h | 31 +++++++++++++++++++++++++++++++
 2 files changed, 31 insertions(+), 31 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 5a5b7b122682..7501a3451a20 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -2227,37 +2227,6 @@ static inline bool preempt_model_preemptible(void)
 	return preempt_model_full() || preempt_model_rt();
 }
 
-/*
- * Does a critical section need to be broken due to another
- * task waiting?: (technically does not depend on CONFIG_PREEMPTION,
- * but a general need for low latency)
- */
-static inline int spin_needbreak(spinlock_t *lock)
-{
-#ifdef CONFIG_PREEMPTION
-	return spin_is_contended(lock);
-#else
-	return 0;
-#endif
-}
-
-/*
- * Check if a rwlock is contended.
- * Returns non-zero if there is another task waiting on the rwlock.
- * Returns zero if the lock is not contended or the system / underlying
- * rwlock implementation does not support contention detection.
- * Technically does not depend on CONFIG_PREEMPTION, but a general need
- * for low latency.
- */
-static inline int rwlock_needbreak(rwlock_t *lock)
-{
-#ifdef CONFIG_PREEMPTION
-	return rwlock_is_contended(lock);
-#else
-	return 0;
-#endif
-}
-
 static __always_inline bool need_resched(void)
 {
 	return unlikely(tif_need_resched());
diff --git a/include/linux/spinlock.h b/include/linux/spinlock.h
index 31d3d747a9db..0c71f06454d9 100644
--- a/include/linux/spinlock.h
+++ b/include/linux/spinlock.h
@@ -449,6 +449,37 @@ static __always_inline int spin_is_contended(spinlock_t *lock)
 	return raw_spin_is_contended(&lock->rlock);
 }
 
+/*
+ * Does a critical section need to be broken due to another
+ * task waiting?: (technically does not depend on CONFIG_PREEMPTION,
+ * but a general need for low latency)
+ */
+static inline int spin_needbreak(spinlock_t *lock)
+{
+#ifdef CONFIG_PREEMPTION
+	return spin_is_contended(lock);
+#else
+	return 0;
+#endif
+}
+
+/*
+ * Check if a rwlock is contended.
+ * Returns non-zero if there is another task waiting on the rwlock.
+ * Returns zero if the lock is not contended or the system / underlying
+ * rwlock implementation does not support contention detection.
+ * Technically does not depend on CONFIG_PREEMPTION, but a general need
+ * for low latency.
+ */
+static inline int rwlock_needbreak(rwlock_t *lock)
+{
+#ifdef CONFIG_PREEMPTION
+	return rwlock_is_contended(lock);
+#else
+	return 0;
+#endif
+}
+
 #define assert_spin_locked(lock)	assert_raw_spin_locked(&(lock)->rlock)
 
 #else  /* !CONFIG_PREEMPT_RT */
-- 
cgit v1.2.3


From 2e346b19aab9ee40e5e429667a0a515f1d68b714 Mon Sep 17 00:00:00 2001
From: Kent Overstreet <kent.overstreet@linux.dev>
Date: Mon, 11 Dec 2023 14:11:05 -0500
Subject: ktime.h: move ktime_t to types.h

ktime.h pulls in quite a few headers recursively (including printk.h) -
this is going to help with trimming sched.h dependencies.

Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
---
 include/linux/ktime.h | 8 +++-----
 include/linux/types.h | 3 +++
 2 files changed, 6 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/ktime.h b/include/linux/ktime.h
index 73f20deb497d..3a4e723eae0f 100644
--- a/include/linux/ktime.h
+++ b/include/linux/ktime.h
@@ -21,12 +21,10 @@
 #ifndef _LINUX_KTIME_H
 #define _LINUX_KTIME_H
 
-#include <linux/time.h>
-#include <linux/jiffies.h>
 #include <asm/bug.h>
-
-/* Nanosecond scalar representation for kernel time values */
-typedef s64	ktime_t;
+#include <linux/jiffies.h>
+#include <linux/time.h>
+#include <linux/types.h>
 
 /**
  * ktime_set - Set a ktime_t variable from a seconds/nanoseconds value
diff --git a/include/linux/types.h b/include/linux/types.h
index 253168bb3fe1..2bc8766ba20c 100644
--- a/include/linux/types.h
+++ b/include/linux/types.h
@@ -120,6 +120,9 @@ typedef s64			int64_t;
 #define aligned_be64		__aligned_be64
 #define aligned_le64		__aligned_le64
 
+/* Nanosecond scalar representation for kernel time values */
+typedef s64	ktime_t;
+
 /**
  * The type used for indexing onto a disc or disc partition.
  *
-- 
cgit v1.2.3


From 50d91c76582513852e38eb80491f54d44cfb51fc Mon Sep 17 00:00:00 2001
From: Kent Overstreet <kent.overstreet@linux.dev>
Date: Mon, 11 Dec 2023 11:52:02 -0500
Subject: hrtimers: Split out hrtimer_types.h

We need to reduce the scope of what's included in sched.h: task_struct
includes a hrtimer, so split out the core types into their own header.

Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
Cc: Thomas Gleixner <tglx@linutronix.de>
---
 include/linux/hrtimer.h       | 44 ++-----------------------------------
 include/linux/hrtimer_types.h | 50 +++++++++++++++++++++++++++++++++++++++++++
 include/linux/sched.h         |  2 +-
 3 files changed, 53 insertions(+), 43 deletions(-)
 create mode 100644 include/linux/hrtimer_types.h

(limited to 'include/linux')

diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h
index 02d264ca9dce..87e3bedf8eb0 100644
--- a/include/linux/hrtimer.h
+++ b/include/linux/hrtimer.h
@@ -13,13 +13,13 @@
 #define _LINUX_HRTIMER_H
 
 #include <linux/hrtimer_defs.h>
-#include <linux/rbtree.h>
+#include <linux/hrtimer_types.h>
 #include <linux/init.h>
 #include <linux/list.h>
 #include <linux/percpu-defs.h>
+#include <linux/rbtree.h>
 #include <linux/seqlock.h>
 #include <linux/timer.h>
-#include <linux/timerqueue.h>
 
 struct hrtimer_clock_base;
 struct hrtimer_cpu_base;
@@ -59,14 +59,6 @@ enum hrtimer_mode {
 	HRTIMER_MODE_REL_PINNED_HARD = HRTIMER_MODE_REL_PINNED | HRTIMER_MODE_HARD,
 };
 
-/*
- * Return values for the callback function
- */
-enum hrtimer_restart {
-	HRTIMER_NORESTART,	/* Timer is not restarted */
-	HRTIMER_RESTART,	/* Timer must be restarted */
-};
-
 /*
  * Values to track state of the timer
  *
@@ -94,38 +86,6 @@ enum hrtimer_restart {
 #define HRTIMER_STATE_INACTIVE	0x00
 #define HRTIMER_STATE_ENQUEUED	0x01
 
-/**
- * struct hrtimer - the basic hrtimer structure
- * @node:	timerqueue node, which also manages node.expires,
- *		the absolute expiry time in the hrtimers internal
- *		representation. The time is related to the clock on
- *		which the timer is based. Is setup by adding
- *		slack to the _softexpires value. For non range timers
- *		identical to _softexpires.
- * @_softexpires: the absolute earliest expiry time of the hrtimer.
- *		The time which was given as expiry time when the timer
- *		was armed.
- * @function:	timer expiry callback function
- * @base:	pointer to the timer base (per cpu and per clock)
- * @state:	state information (See bit values above)
- * @is_rel:	Set if the timer was armed relative
- * @is_soft:	Set if hrtimer will be expired in soft interrupt context.
- * @is_hard:	Set if hrtimer will be expired in hard interrupt context
- *		even on RT.
- *
- * The hrtimer structure must be initialized by hrtimer_init()
- */
-struct hrtimer {
-	struct timerqueue_node		node;
-	ktime_t				_softexpires;
-	enum hrtimer_restart		(*function)(struct hrtimer *);
-	struct hrtimer_clock_base	*base;
-	u8				state;
-	u8				is_rel;
-	u8				is_soft;
-	u8				is_hard;
-};
-
 /**
  * struct hrtimer_sleeper - simple sleeper structure
  * @timer:	embedded timer structure
diff --git a/include/linux/hrtimer_types.h b/include/linux/hrtimer_types.h
new file mode 100644
index 000000000000..f4ef391b96a7
--- /dev/null
+++ b/include/linux/hrtimer_types.h
@@ -0,0 +1,50 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _LINUX_HRTIMER_TYPES_H
+#define _LINUX_HRTIMER_TYPES_H
+
+#include <linux/types.h>
+#include <linux/timerqueue.h>
+
+struct hrtimer_clock_base;
+
+/*
+ * Return values for the callback function
+ */
+enum hrtimer_restart {
+	HRTIMER_NORESTART,	/* Timer is not restarted */
+	HRTIMER_RESTART,	/* Timer must be restarted */
+};
+
+/**
+ * struct hrtimer - the basic hrtimer structure
+ * @node:	timerqueue node, which also manages node.expires,
+ *		the absolute expiry time in the hrtimers internal
+ *		representation. The time is related to the clock on
+ *		which the timer is based. Is setup by adding
+ *		slack to the _softexpires value. For non range timers
+ *		identical to _softexpires.
+ * @_softexpires: the absolute earliest expiry time of the hrtimer.
+ *		The time which was given as expiry time when the timer
+ *		was armed.
+ * @function:	timer expiry callback function
+ * @base:	pointer to the timer base (per cpu and per clock)
+ * @state:	state information (See bit values above)
+ * @is_rel:	Set if the timer was armed relative
+ * @is_soft:	Set if hrtimer will be expired in soft interrupt context.
+ * @is_hard:	Set if hrtimer will be expired in hard interrupt context
+ *		even on RT.
+ *
+ * The hrtimer structure must be initialized by hrtimer_init()
+ */
+struct hrtimer {
+	struct timerqueue_node		node;
+	ktime_t				_softexpires;
+	enum hrtimer_restart		(*function)(struct hrtimer *);
+	struct hrtimer_clock_base	*base;
+	u8				state;
+	u8				is_rel;
+	u8				is_soft;
+	u8				is_hard;
+};
+
+#endif /* _LINUX_HRTIMER_TYPES_H */
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 7501a3451a20..3762809652da 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -17,7 +17,7 @@
 #include <linux/kmsan_types.h>
 #include <linux/mutex.h>
 #include <linux/plist.h>
-#include <linux/hrtimer.h>
+#include <linux/hrtimer_types.h>
 #include <linux/irqflags.h>
 #include <linux/seccomp.h>
 #include <linux/nodemask_types.h>
-- 
cgit v1.2.3


From d84f317915172c6511fd6c14ea3f70c9d67fdf67 Mon Sep 17 00:00:00 2001
From: Kent Overstreet <kent.overstreet@linux.dev>
Date: Mon, 11 Dec 2023 12:35:44 -0500
Subject: locking/mutex: split out mutex_types.h

Trimming down sched.h dependencies: we don't want to include more than
the base types.

Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Will Deacon <will@kernel.org>
Cc: Waiman Long <longman@redhat.com>
Cc: Boqun Feng <boqun.feng@gmail.com>
Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
---
 include/linux/mutex.h       | 52 +--------------------------------
 include/linux/mutex_types.h | 71 +++++++++++++++++++++++++++++++++++++++++++++
 include/linux/sched.h       |  2 +-
 3 files changed, 73 insertions(+), 52 deletions(-)
 create mode 100644 include/linux/mutex_types.h

(limited to 'include/linux')

diff --git a/include/linux/mutex.h b/include/linux/mutex.h
index a33aa9eb9fc3..0dfba5df6524 100644
--- a/include/linux/mutex.h
+++ b/include/linux/mutex.h
@@ -20,6 +20,7 @@
 #include <linux/osq_lock.h>
 #include <linux/debug_locks.h>
 #include <linux/cleanup.h>
+#include <linux/mutex_types.h>
 
 #ifdef CONFIG_DEBUG_LOCK_ALLOC
 # define __DEP_MAP_MUTEX_INITIALIZER(lockname)			\
@@ -33,49 +34,6 @@
 
 #ifndef CONFIG_PREEMPT_RT
 
-/*
- * Simple, straightforward mutexes with strict semantics:
- *
- * - only one task can hold the mutex at a time
- * - only the owner can unlock the mutex
- * - multiple unlocks are not permitted
- * - recursive locking is not permitted
- * - a mutex object must be initialized via the API
- * - a mutex object must not be initialized via memset or copying
- * - task may not exit with mutex held
- * - memory areas where held locks reside must not be freed
- * - held mutexes must not be reinitialized
- * - mutexes may not be used in hardware or software interrupt
- *   contexts such as tasklets and timers
- *
- * These semantics are fully enforced when DEBUG_MUTEXES is
- * enabled. Furthermore, besides enforcing the above rules, the mutex
- * debugging code also implements a number of additional features
- * that make lock debugging easier and faster:
- *
- * - uses symbolic names of mutexes, whenever they are printed in debug output
- * - point-of-acquire tracking, symbolic lookup of function names
- * - list of all locks held in the system, printout of them
- * - owner tracking
- * - detects self-recursing locks and prints out all relevant info
- * - detects multi-task circular deadlocks and prints out all affected
- *   locks and tasks (and only those tasks)
- */
-struct mutex {
-	atomic_long_t		owner;
-	raw_spinlock_t		wait_lock;
-#ifdef CONFIG_MUTEX_SPIN_ON_OWNER
-	struct optimistic_spin_queue osq; /* Spinner MCS lock */
-#endif
-	struct list_head	wait_list;
-#ifdef CONFIG_DEBUG_MUTEXES
-	void			*magic;
-#endif
-#ifdef CONFIG_DEBUG_LOCK_ALLOC
-	struct lockdep_map	dep_map;
-#endif
-};
-
 #ifdef CONFIG_DEBUG_MUTEXES
 
 #define __DEBUG_MUTEX_INITIALIZER(lockname)				\
@@ -131,14 +89,6 @@ extern bool mutex_is_locked(struct mutex *lock);
 /*
  * Preempt-RT variant based on rtmutexes.
  */
-#include <linux/rtmutex.h>
-
-struct mutex {
-	struct rt_mutex_base	rtmutex;
-#ifdef CONFIG_DEBUG_LOCK_ALLOC
-	struct lockdep_map	dep_map;
-#endif
-};
 
 #define __MUTEX_INITIALIZER(mutexname)					\
 {									\
diff --git a/include/linux/mutex_types.h b/include/linux/mutex_types.h
new file mode 100644
index 000000000000..fdf7f515fde8
--- /dev/null
+++ b/include/linux/mutex_types.h
@@ -0,0 +1,71 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef __LINUX_MUTEX_TYPES_H
+#define __LINUX_MUTEX_TYPES_H
+
+#include <linux/atomic.h>
+#include <linux/lockdep_types.h>
+#include <linux/osq_lock.h>
+#include <linux/spinlock_types.h>
+#include <linux/types.h>
+
+#ifndef CONFIG_PREEMPT_RT
+
+/*
+ * Simple, straightforward mutexes with strict semantics:
+ *
+ * - only one task can hold the mutex at a time
+ * - only the owner can unlock the mutex
+ * - multiple unlocks are not permitted
+ * - recursive locking is not permitted
+ * - a mutex object must be initialized via the API
+ * - a mutex object must not be initialized via memset or copying
+ * - task may not exit with mutex held
+ * - memory areas where held locks reside must not be freed
+ * - held mutexes must not be reinitialized
+ * - mutexes may not be used in hardware or software interrupt
+ *   contexts such as tasklets and timers
+ *
+ * These semantics are fully enforced when DEBUG_MUTEXES is
+ * enabled. Furthermore, besides enforcing the above rules, the mutex
+ * debugging code also implements a number of additional features
+ * that make lock debugging easier and faster:
+ *
+ * - uses symbolic names of mutexes, whenever they are printed in debug output
+ * - point-of-acquire tracking, symbolic lookup of function names
+ * - list of all locks held in the system, printout of them
+ * - owner tracking
+ * - detects self-recursing locks and prints out all relevant info
+ * - detects multi-task circular deadlocks and prints out all affected
+ *   locks and tasks (and only those tasks)
+ */
+struct mutex {
+	atomic_long_t		owner;
+	raw_spinlock_t		wait_lock;
+#ifdef CONFIG_MUTEX_SPIN_ON_OWNER
+	struct optimistic_spin_queue osq; /* Spinner MCS lock */
+#endif
+	struct list_head	wait_list;
+#ifdef CONFIG_DEBUG_MUTEXES
+	void			*magic;
+#endif
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+	struct lockdep_map	dep_map;
+#endif
+};
+
+#else /* !CONFIG_PREEMPT_RT */
+/*
+ * Preempt-RT variant based on rtmutexes.
+ */
+#include <linux/rtmutex.h>
+
+struct mutex {
+	struct rt_mutex_base	rtmutex;
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+	struct lockdep_map	dep_map;
+#endif
+};
+
+#endif /* CONFIG_PREEMPT_RT */
+
+#endif /* __LINUX_MUTEX_TYPES_H */
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 3762809652da..e8892789969b 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -15,7 +15,7 @@
 #include <linux/sem.h>
 #include <linux/shm.h>
 #include <linux/kmsan_types.h>
-#include <linux/mutex.h>
+#include <linux/mutex_types.h>
 #include <linux/plist.h>
 #include <linux/hrtimer_types.h>
 #include <linux/irqflags.h>
-- 
cgit v1.2.3


From 53d31ba842d9cc391032d051a210c3c9941f1529 Mon Sep 17 00:00:00 2001
From: Kent Overstreet <kent.overstreet@linux.dev>
Date: Mon, 11 Dec 2023 12:43:30 -0500
Subject: posix-cpu-timers: Split out posix-timers_types.h

Trimming down sched.h dependencies: we don't want to include more than
the base types.

Cc: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
---
 include/linux/posix-timers.h       | 69 ++------------------------------
 include/linux/posix-timers_types.h | 80 ++++++++++++++++++++++++++++++++++++++
 include/linux/sched.h              |  2 +-
 3 files changed, 84 insertions(+), 67 deletions(-)
 create mode 100644 include/linux/posix-timers_types.h

(limited to 'include/linux')

diff --git a/include/linux/posix-timers.h b/include/linux/posix-timers.h
index d607f51404fc..dc7b738de299 100644
--- a/include/linux/posix-timers.h
+++ b/include/linux/posix-timers.h
@@ -2,40 +2,16 @@
 #ifndef _linux_POSIX_TIMERS_H
 #define _linux_POSIX_TIMERS_H
 
-#include <linux/spinlock.h>
+#include <linux/alarmtimer.h>
 #include <linux/list.h>
 #include <linux/mutex.h>
-#include <linux/alarmtimer.h>
+#include <linux/posix-timers_types.h>
+#include <linux/spinlock.h>
 #include <linux/timerqueue.h>
 
 struct kernel_siginfo;
 struct task_struct;
 
-/*
- * Bit fields within a clockid:
- *
- * The most significant 29 bits hold either a pid or a file descriptor.
- *
- * Bit 2 indicates whether a cpu clock refers to a thread or a process.
- *
- * Bits 1 and 0 give the type: PROF=0, VIRT=1, SCHED=2, or FD=3.
- *
- * A clockid is invalid if bits 2, 1, and 0 are all set.
- */
-#define CPUCLOCK_PID(clock)		((pid_t) ~((clock) >> 3))
-#define CPUCLOCK_PERTHREAD(clock) \
-	(((clock) & (clockid_t) CPUCLOCK_PERTHREAD_MASK) != 0)
-
-#define CPUCLOCK_PERTHREAD_MASK	4
-#define CPUCLOCK_WHICH(clock)	((clock) & (clockid_t) CPUCLOCK_CLOCK_MASK)
-#define CPUCLOCK_CLOCK_MASK	3
-#define CPUCLOCK_PROF		0
-#define CPUCLOCK_VIRT		1
-#define CPUCLOCK_SCHED		2
-#define CPUCLOCK_MAX		3
-#define CLOCKFD			CPUCLOCK_MAX
-#define CLOCKFD_MASK		(CPUCLOCK_PERTHREAD_MASK|CPUCLOCK_CLOCK_MASK)
-
 static inline clockid_t make_process_cpuclock(const unsigned int pid,
 		const clockid_t clock)
 {
@@ -109,44 +85,6 @@ static inline void cpu_timer_setexpires(struct cpu_timer *ctmr, u64 exp)
 	ctmr->node.expires = exp;
 }
 
-/**
- * posix_cputimer_base - Container per posix CPU clock
- * @nextevt:		Earliest-expiration cache
- * @tqhead:		timerqueue head for cpu_timers
- */
-struct posix_cputimer_base {
-	u64			nextevt;
-	struct timerqueue_head	tqhead;
-};
-
-/**
- * posix_cputimers - Container for posix CPU timer related data
- * @bases:		Base container for posix CPU clocks
- * @timers_active:	Timers are queued.
- * @expiry_active:	Timer expiry is active. Used for
- *			process wide timers to avoid multiple
- *			task trying to handle expiry concurrently
- *
- * Used in task_struct and signal_struct
- */
-struct posix_cputimers {
-	struct posix_cputimer_base	bases[CPUCLOCK_MAX];
-	unsigned int			timers_active;
-	unsigned int			expiry_active;
-};
-
-/**
- * posix_cputimers_work - Container for task work based posix CPU timer expiry
- * @work:	The task work to be scheduled
- * @mutex:	Mutex held around expiry in context of this task work
- * @scheduled:  @work has been scheduled already, no further processing
- */
-struct posix_cputimers_work {
-	struct callback_head	work;
-	struct mutex		mutex;
-	unsigned int		scheduled;
-};
-
 static inline void posix_cputimers_init(struct posix_cputimers *pct)
 {
 	memset(pct, 0, sizeof(*pct));
@@ -179,7 +117,6 @@ static inline void posix_cputimers_rt_watchdog(struct posix_cputimers *pct,
 		.bases = INIT_CPU_TIMERBASES(s.posix_cputimers.bases),	\
 	},
 #else
-struct posix_cputimers { };
 struct cpu_timer { };
 #define INIT_CPU_TIMERS(s)
 static inline void posix_cputimers_init(struct posix_cputimers *pct) { }
diff --git a/include/linux/posix-timers_types.h b/include/linux/posix-timers_types.h
new file mode 100644
index 000000000000..4783fa17bfeb
--- /dev/null
+++ b/include/linux/posix-timers_types.h
@@ -0,0 +1,80 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _linux_POSIX_TIMERS_TYPES_H
+#define _linux_POSIX_TIMERS_TYPES_H
+
+#include <linux/mutex_types.h>
+#include <linux/timerqueue.h>
+#include <linux/types.h>
+
+/*
+ * Bit fields within a clockid:
+ *
+ * The most significant 29 bits hold either a pid or a file descriptor.
+ *
+ * Bit 2 indicates whether a cpu clock refers to a thread or a process.
+ *
+ * Bits 1 and 0 give the type: PROF=0, VIRT=1, SCHED=2, or FD=3.
+ *
+ * A clockid is invalid if bits 2, 1, and 0 are all set.
+ */
+#define CPUCLOCK_PID(clock)		((pid_t) ~((clock) >> 3))
+#define CPUCLOCK_PERTHREAD(clock) \
+	(((clock) & (clockid_t) CPUCLOCK_PERTHREAD_MASK) != 0)
+
+#define CPUCLOCK_PERTHREAD_MASK	4
+#define CPUCLOCK_WHICH(clock)	((clock) & (clockid_t) CPUCLOCK_CLOCK_MASK)
+#define CPUCLOCK_CLOCK_MASK	3
+#define CPUCLOCK_PROF		0
+#define CPUCLOCK_VIRT		1
+#define CPUCLOCK_SCHED		2
+#define CPUCLOCK_MAX		3
+#define CLOCKFD			CPUCLOCK_MAX
+#define CLOCKFD_MASK		(CPUCLOCK_PERTHREAD_MASK|CPUCLOCK_CLOCK_MASK)
+
+#ifdef CONFIG_POSIX_TIMERS
+
+/**
+ * posix_cputimer_base - Container per posix CPU clock
+ * @nextevt:		Earliest-expiration cache
+ * @tqhead:		timerqueue head for cpu_timers
+ */
+struct posix_cputimer_base {
+	u64			nextevt;
+	struct timerqueue_head	tqhead;
+};
+
+/**
+ * posix_cputimers - Container for posix CPU timer related data
+ * @bases:		Base container for posix CPU clocks
+ * @timers_active:	Timers are queued.
+ * @expiry_active:	Timer expiry is active. Used for
+ *			process wide timers to avoid multiple
+ *			task trying to handle expiry concurrently
+ *
+ * Used in task_struct and signal_struct
+ */
+struct posix_cputimers {
+	struct posix_cputimer_base	bases[CPUCLOCK_MAX];
+	unsigned int			timers_active;
+	unsigned int			expiry_active;
+};
+
+/**
+ * posix_cputimers_work - Container for task work based posix CPU timer expiry
+ * @work:	The task work to be scheduled
+ * @mutex:	Mutex held around expiry in context of this task work
+ * @scheduled:  @work has been scheduled already, no further processing
+ */
+struct posix_cputimers_work {
+	struct callback_head	work;
+	struct mutex		mutex;
+	unsigned int		scheduled;
+};
+
+#else /* CONFIG_POSIX_TIMERS */
+
+struct posix_cputimers { };
+
+#endif /* CONFIG_POSIX_TIMERS */
+
+#endif /* _linux_POSIX_TIMERS_TYPES_H */
diff --git a/include/linux/sched.h b/include/linux/sched.h
index e8892789969b..6d803d0904d9 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -31,7 +31,7 @@
 #include <linux/syscall_user_dispatch.h>
 #include <linux/mm_types_task.h>
 #include <linux/task_io_accounting.h>
-#include <linux/posix-timers.h>
+#include <linux/posix-timers_types.h>
 #include <linux/rseq.h>
 #include <linux/seqlock.h>
 #include <linux/kcsan.h>
-- 
cgit v1.2.3


From f038cc1379c0ff462d83895cae8beb75a0f6bf02 Mon Sep 17 00:00:00 2001
From: Kent Overstreet <kent.overstreet@linux.dev>
Date: Mon, 11 Dec 2023 13:01:06 -0500
Subject: locking/seqlock: Split out seqlock_types.h

Trimming down sched.h dependencies: we don't want to include more than
the base types.

Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Will Deacon <will@kernel.org>
Cc: Waiman Long <longman@redhat.com>
Cc: Boqun Feng <boqun.feng@gmail.com>
Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
---
 include/linux/sched.h         |  2 +-
 include/linux/seqlock.h       | 79 +-----------------------------------
 include/linux/seqlock_types.h | 93 +++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 96 insertions(+), 78 deletions(-)
 create mode 100644 include/linux/seqlock_types.h

(limited to 'include/linux')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 6d803d0904d9..436f7ce1450a 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -33,7 +33,7 @@
 #include <linux/task_io_accounting.h>
 #include <linux/posix-timers_types.h>
 #include <linux/rseq.h>
-#include <linux/seqlock.h>
+#include <linux/seqlock_types.h>
 #include <linux/kcsan.h>
 #include <linux/rv.h>
 #include <linux/livepatch_sched.h>
diff --git a/include/linux/seqlock.h b/include/linux/seqlock.h
index e92f9d5577ba..d90d8ee29d81 100644
--- a/include/linux/seqlock.h
+++ b/include/linux/seqlock.h
@@ -18,6 +18,7 @@
 #include <linux/lockdep.h>
 #include <linux/mutex.h>
 #include <linux/preempt.h>
+#include <linux/seqlock_types.h>
 #include <linux/spinlock.h>
 
 #include <asm/processor.h>
@@ -37,37 +38,6 @@
  */
 #define KCSAN_SEQLOCK_REGION_MAX 1000
 
-/*
- * Sequence counters (seqcount_t)
- *
- * This is the raw counting mechanism, without any writer protection.
- *
- * Write side critical sections must be serialized and non-preemptible.
- *
- * If readers can be invoked from hardirq or softirq contexts,
- * interrupts or bottom halves must also be respectively disabled before
- * entering the write section.
- *
- * This mechanism can't be used if the protected data contains pointers,
- * as the writer can invalidate a pointer that a reader is following.
- *
- * If the write serialization mechanism is one of the common kernel
- * locking primitives, use a sequence counter with associated lock
- * (seqcount_LOCKNAME_t) instead.
- *
- * If it's desired to automatically handle the sequence counter writer
- * serialization and non-preemptibility requirements, use a sequential
- * lock (seqlock_t) instead.
- *
- * See Documentation/locking/seqlock.rst
- */
-typedef struct seqcount {
-	unsigned sequence;
-#ifdef CONFIG_DEBUG_LOCK_ALLOC
-	struct lockdep_map dep_map;
-#endif
-} seqcount_t;
-
 static inline void __seqcount_init(seqcount_t *s, const char *name,
 					  struct lock_class_key *key)
 {
@@ -131,28 +101,6 @@ static inline void seqcount_lockdep_reader_access(const seqcount_t *s)
  * See Documentation/locking/seqlock.rst
  */
 
-/*
- * For PREEMPT_RT, seqcount_LOCKNAME_t write side critical sections cannot
- * disable preemption. It can lead to higher latencies, and the write side
- * sections will not be able to acquire locks which become sleeping locks
- * (e.g. spinlock_t).
- *
- * To remain preemptible while avoiding a possible livelock caused by the
- * reader preempting the writer, use a different technique: let the reader
- * detect if a seqcount_LOCKNAME_t writer is in progress. If that is the
- * case, acquire then release the associated LOCKNAME writer serialization
- * lock. This will allow any possibly-preempted writer to make progress
- * until the end of its writer serialization lock critical section.
- *
- * This lock-unlock technique must be implemented for all of PREEMPT_RT
- * sleeping locks.  See Documentation/locking/locktypes.rst
- */
-#if defined(CONFIG_LOCKDEP) || defined(CONFIG_PREEMPT_RT)
-#define __SEQ_LOCK(expr)	expr
-#else
-#define __SEQ_LOCK(expr)
-#endif
-
 /*
  * typedef seqcount_LOCKNAME_t - sequence counter with LOCKNAME associated
  * @seqcount:	The real sequence counter
@@ -194,11 +142,6 @@ static inline void seqcount_lockdep_reader_access(const seqcount_t *s)
  * @lockbase:		prefix for associated lock/unlock
  */
 #define SEQCOUNT_LOCKNAME(lockname, locktype, preemptible, lockbase)	\
-typedef struct seqcount_##lockname {					\
-	seqcount_t		seqcount;				\
-	__SEQ_LOCK(locktype	*lock);					\
-} seqcount_##lockname##_t;						\
-									\
 static __always_inline seqcount_t *					\
 __seqprop_##lockname##_ptr(seqcount_##lockname##_t *s)			\
 {									\
@@ -284,6 +227,7 @@ SEQCOUNT_LOCKNAME(raw_spinlock, raw_spinlock_t,  false,    raw_spin)
 SEQCOUNT_LOCKNAME(spinlock,     spinlock_t,      __SEQ_RT, spin)
 SEQCOUNT_LOCKNAME(rwlock,       rwlock_t,        __SEQ_RT, read)
 SEQCOUNT_LOCKNAME(mutex,        struct mutex,    true,     mutex)
+#undef SEQCOUNT_LOCKNAME
 
 /*
  * SEQCNT_LOCKNAME_ZERO - static initializer for seqcount_LOCKNAME_t
@@ -794,25 +738,6 @@ static inline void raw_write_seqcount_latch(seqcount_latch_t *s)
 	smp_wmb();      /* increment "sequence" before following stores */
 }
 
-/*
- * Sequential locks (seqlock_t)
- *
- * Sequence counters with an embedded spinlock for writer serialization
- * and non-preemptibility.
- *
- * For more info, see:
- *    - Comments on top of seqcount_t
- *    - Documentation/locking/seqlock.rst
- */
-typedef struct {
-	/*
-	 * Make sure that readers don't starve writers on PREEMPT_RT: use
-	 * seqcount_spinlock_t instead of seqcount_t. Check __SEQ_LOCK().
-	 */
-	seqcount_spinlock_t seqcount;
-	spinlock_t lock;
-} seqlock_t;
-
 #define __SEQLOCK_UNLOCKED(lockname)					\
 	{								\
 		.seqcount = SEQCNT_SPINLOCK_ZERO(lockname, &(lockname).lock), \
diff --git a/include/linux/seqlock_types.h b/include/linux/seqlock_types.h
new file mode 100644
index 000000000000..dfdf43e3fa3d
--- /dev/null
+++ b/include/linux/seqlock_types.h
@@ -0,0 +1,93 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef __LINUX_SEQLOCK_TYPES_H
+#define __LINUX_SEQLOCK_TYPES_H
+
+#include <linux/lockdep_types.h>
+#include <linux/mutex_types.h>
+#include <linux/spinlock_types.h>
+
+/*
+ * Sequence counters (seqcount_t)
+ *
+ * This is the raw counting mechanism, without any writer protection.
+ *
+ * Write side critical sections must be serialized and non-preemptible.
+ *
+ * If readers can be invoked from hardirq or softirq contexts,
+ * interrupts or bottom halves must also be respectively disabled before
+ * entering the write section.
+ *
+ * This mechanism can't be used if the protected data contains pointers,
+ * as the writer can invalidate a pointer that a reader is following.
+ *
+ * If the write serialization mechanism is one of the common kernel
+ * locking primitives, use a sequence counter with associated lock
+ * (seqcount_LOCKNAME_t) instead.
+ *
+ * If it's desired to automatically handle the sequence counter writer
+ * serialization and non-preemptibility requirements, use a sequential
+ * lock (seqlock_t) instead.
+ *
+ * See Documentation/locking/seqlock.rst
+ */
+typedef struct seqcount {
+	unsigned sequence;
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+	struct lockdep_map dep_map;
+#endif
+} seqcount_t;
+
+/*
+ * For PREEMPT_RT, seqcount_LOCKNAME_t write side critical sections cannot
+ * disable preemption. It can lead to higher latencies, and the write side
+ * sections will not be able to acquire locks which become sleeping locks
+ * (e.g. spinlock_t).
+ *
+ * To remain preemptible while avoiding a possible livelock caused by the
+ * reader preempting the writer, use a different technique: let the reader
+ * detect if a seqcount_LOCKNAME_t writer is in progress. If that is the
+ * case, acquire then release the associated LOCKNAME writer serialization
+ * lock. This will allow any possibly-preempted writer to make progress
+ * until the end of its writer serialization lock critical section.
+ *
+ * This lock-unlock technique must be implemented for all of PREEMPT_RT
+ * sleeping locks.  See Documentation/locking/locktypes.rst
+ */
+#if defined(CONFIG_LOCKDEP) || defined(CONFIG_PREEMPT_RT)
+#define __SEQ_LOCK(expr)	expr
+#else
+#define __SEQ_LOCK(expr)
+#endif
+
+#define SEQCOUNT_LOCKNAME(lockname, locktype, preemptible, lockbase)	\
+typedef struct seqcount_##lockname {					\
+	seqcount_t		seqcount;				\
+	__SEQ_LOCK(locktype	*lock);					\
+} seqcount_##lockname##_t;
+
+SEQCOUNT_LOCKNAME(raw_spinlock, raw_spinlock_t,  false,    raw_spin)
+SEQCOUNT_LOCKNAME(spinlock,     spinlock_t,      __SEQ_RT, spin)
+SEQCOUNT_LOCKNAME(rwlock,       rwlock_t,        __SEQ_RT, read)
+SEQCOUNT_LOCKNAME(mutex,        struct mutex,    true,     mutex)
+#undef SEQCOUNT_LOCKNAME
+
+/*
+ * Sequential locks (seqlock_t)
+ *
+ * Sequence counters with an embedded spinlock for writer serialization
+ * and non-preemptibility.
+ *
+ * For more info, see:
+ *    - Comments on top of seqcount_t
+ *    - Documentation/locking/seqlock.rst
+ */
+typedef struct {
+	/*
+	 * Make sure that readers don't starve writers on PREEMPT_RT: use
+	 * seqcount_spinlock_t instead of seqcount_t. Check __SEQ_LOCK().
+	 */
+	seqcount_spinlock_t seqcount;
+	spinlock_t lock;
+} seqlock_t;
+
+#endif /* __LINUX_SEQLOCK_TYPES_H */
-- 
cgit v1.2.3


From 6d5e9d63683042a8d344cd5d6f9cf23613864a29 Mon Sep 17 00:00:00 2001
From: Kent Overstreet <kent.overstreet@linux.dev>
Date: Mon, 11 Dec 2023 13:03:22 -0500
Subject: pid: Split out pid_types.h

Trimming down sched.h dependencies: we dont't want to include more than
the base types.

Cc: Kees Cook <keescook@chromium.org>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Will Drewry <wad@chromium.org>
Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
---
 drivers/target/target_core_xcopy.c |  1 +
 include/linux/pid.h                | 15 ++-------------
 include/linux/pid_types.h          | 16 ++++++++++++++++
 include/linux/sched.h              |  2 +-
 include/linux/seccomp.h            |  2 ++
 5 files changed, 22 insertions(+), 14 deletions(-)
 create mode 100644 include/linux/pid_types.h

(limited to 'include/linux')

diff --git a/drivers/target/target_core_xcopy.c b/drivers/target/target_core_xcopy.c
index 91ed015b588c..4128631c9dfd 100644
--- a/drivers/target/target_core_xcopy.c
+++ b/drivers/target/target_core_xcopy.c
@@ -15,6 +15,7 @@
 #include <linux/slab.h>
 #include <linux/spinlock.h>
 #include <linux/list.h>
+#include <linux/rculist.h>
 #include <linux/configfs.h>
 #include <linux/ratelimit.h>
 #include <scsi/scsi_proto.h>
diff --git a/include/linux/pid.h b/include/linux/pid.h
index 653a527574c4..f254c3a45b9b 100644
--- a/include/linux/pid.h
+++ b/include/linux/pid.h
@@ -2,18 +2,10 @@
 #ifndef _LINUX_PID_H
 #define _LINUX_PID_H
 
+#include <linux/pid_types.h>
 #include <linux/rculist.h>
-#include <linux/wait.h>
 #include <linux/refcount.h>
-
-enum pid_type
-{
-	PIDTYPE_PID,
-	PIDTYPE_TGID,
-	PIDTYPE_PGID,
-	PIDTYPE_SID,
-	PIDTYPE_MAX,
-};
+#include <linux/wait.h>
 
 /*
  * What is struct pid?
@@ -110,9 +102,6 @@ extern void exchange_tids(struct task_struct *task, struct task_struct *old);
 extern void transfer_pid(struct task_struct *old, struct task_struct *new,
 			 enum pid_type);
 
-struct pid_namespace;
-extern struct pid_namespace init_pid_ns;
-
 extern int pid_max;
 extern int pid_max_min, pid_max_max;
 
diff --git a/include/linux/pid_types.h b/include/linux/pid_types.h
new file mode 100644
index 000000000000..c2aee1d91dcf
--- /dev/null
+++ b/include/linux/pid_types.h
@@ -0,0 +1,16 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _LINUX_PID_TYPES_H
+#define _LINUX_PID_TYPES_H
+
+enum pid_type {
+	PIDTYPE_PID,
+	PIDTYPE_TGID,
+	PIDTYPE_PGID,
+	PIDTYPE_SID,
+	PIDTYPE_MAX,
+};
+
+struct pid_namespace;
+extern struct pid_namespace init_pid_ns;
+
+#endif /* _LINUX_PID_TYPES_H */
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 436f7ce1450a..37cc9d257073 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -11,7 +11,7 @@
 
 #include <asm/current.h>
 
-#include <linux/pid.h>
+#include <linux/pid_types.h>
 #include <linux/sem.h>
 #include <linux/shm.h>
 #include <linux/kmsan_types.h>
diff --git a/include/linux/seccomp.h b/include/linux/seccomp.h
index 175079552f68..1ec0d8dc4b69 100644
--- a/include/linux/seccomp.h
+++ b/include/linux/seccomp.h
@@ -126,6 +126,8 @@ static inline long seccomp_get_metadata(struct task_struct *task,
 
 #ifdef CONFIG_SECCOMP_CACHE_DEBUG
 struct seq_file;
+struct pid_namespace;
+struct pid;
 
 int proc_pid_seccomp_cache(struct seq_file *m, struct pid_namespace *ns,
 			   struct pid *pid, struct task_struct *task);
-- 
cgit v1.2.3


From f551103cb964e9e6f5c03b3b8723424723731e76 Mon Sep 17 00:00:00 2001
From: Kent Overstreet <kent.overstreet@linux.dev>
Date: Fri, 15 Dec 2023 17:49:24 -0500
Subject: sched.h: move pid helpers to pid.h

This is needed for killing the sched.h dependency on rcupdate.h, and
pid.h is a better place for this code anyways.

Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
---
 arch/x86/um/sysrq_64.c          |   1 +
 drivers/gpu/drm/lima/lima_ctx.c |   1 +
 drivers/irqchip/irq-gic-v4.c    |   1 +
 include/linux/pid.h             | 125 ++++++++++++++++++++++++++++++++++++++++
 include/linux/sched.h           | 122 ---------------------------------------
 include/linux/sched/signal.h    |   1 +
 ipc/util.h                      |   1 +
 kernel/async.c                  |   5 +-
 kernel/locking/spinlock_debug.c |   1 +
 9 files changed, 134 insertions(+), 124 deletions(-)

(limited to 'include/linux')

diff --git a/arch/x86/um/sysrq_64.c b/arch/x86/um/sysrq_64.c
index ef1eb4f4f612..0bf6de40abff 100644
--- a/arch/x86/um/sysrq_64.c
+++ b/arch/x86/um/sysrq_64.c
@@ -6,6 +6,7 @@
 
 #include <linux/kernel.h>
 #include <linux/module.h>
+#include <linux/pid.h>
 #include <linux/sched.h>
 #include <linux/sched/debug.h>
 #include <linux/utsname.h>
diff --git a/drivers/gpu/drm/lima/lima_ctx.c b/drivers/gpu/drm/lima/lima_ctx.c
index 891d5cd5019a..8389f2d7d021 100644
--- a/drivers/gpu/drm/lima/lima_ctx.c
+++ b/drivers/gpu/drm/lima/lima_ctx.c
@@ -1,6 +1,7 @@
 // SPDX-License-Identifier: GPL-2.0 OR MIT
 /* Copyright 2018-2019 Qiang Yu <yuq825@gmail.com> */
 
+#include <linux/pid.h>
 #include <linux/slab.h>
 
 #include "lima_device.h"
diff --git a/drivers/irqchip/irq-gic-v4.c b/drivers/irqchip/irq-gic-v4.c
index 94d56a03b175..ca32ac19d284 100644
--- a/drivers/irqchip/irq-gic-v4.c
+++ b/drivers/irqchip/irq-gic-v4.c
@@ -8,6 +8,7 @@
 #include <linux/irq.h>
 #include <linux/irqdomain.h>
 #include <linux/msi.h>
+#include <linux/pid.h>
 #include <linux/sched.h>
 
 #include <linux/irqchip/arm-gic-v4.h>
diff --git a/include/linux/pid.h b/include/linux/pid.h
index f254c3a45b9b..395cacce1179 100644
--- a/include/linux/pid.h
+++ b/include/linux/pid.h
@@ -4,7 +4,9 @@
 
 #include <linux/pid_types.h>
 #include <linux/rculist.h>
+#include <linux/rcupdate.h>
 #include <linux/refcount.h>
+#include <linux/sched.h>
 #include <linux/wait.h>
 
 /*
@@ -204,4 +206,127 @@ pid_t pid_vnr(struct pid *pid);
 		}							\
 		task = tg___;						\
 	} while_each_pid_task(pid, type, task)
+
+static inline struct pid *task_pid(struct task_struct *task)
+{
+	return task->thread_pid;
+}
+
+/*
+ * the helpers to get the task's different pids as they are seen
+ * from various namespaces
+ *
+ * task_xid_nr()     : global id, i.e. the id seen from the init namespace;
+ * task_xid_vnr()    : virtual id, i.e. the id seen from the pid namespace of
+ *                     current.
+ * task_xid_nr_ns()  : id seen from the ns specified;
+ *
+ * see also pid_nr() etc in include/linux/pid.h
+ */
+pid_t __task_pid_nr_ns(struct task_struct *task, enum pid_type type, struct pid_namespace *ns);
+
+static inline pid_t task_pid_nr(struct task_struct *tsk)
+{
+	return tsk->pid;
+}
+
+static inline pid_t task_pid_nr_ns(struct task_struct *tsk, struct pid_namespace *ns)
+{
+	return __task_pid_nr_ns(tsk, PIDTYPE_PID, ns);
+}
+
+static inline pid_t task_pid_vnr(struct task_struct *tsk)
+{
+	return __task_pid_nr_ns(tsk, PIDTYPE_PID, NULL);
+}
+
+
+static inline pid_t task_tgid_nr(struct task_struct *tsk)
+{
+	return tsk->tgid;
+}
+
+/**
+ * pid_alive - check that a task structure is not stale
+ * @p: Task structure to be checked.
+ *
+ * Test if a process is not yet dead (at most zombie state)
+ * If pid_alive fails, then pointers within the task structure
+ * can be stale and must not be dereferenced.
+ *
+ * Return: 1 if the process is alive. 0 otherwise.
+ */
+static inline int pid_alive(const struct task_struct *p)
+{
+	return p->thread_pid != NULL;
+}
+
+static inline pid_t task_pgrp_nr_ns(struct task_struct *tsk, struct pid_namespace *ns)
+{
+	return __task_pid_nr_ns(tsk, PIDTYPE_PGID, ns);
+}
+
+static inline pid_t task_pgrp_vnr(struct task_struct *tsk)
+{
+	return __task_pid_nr_ns(tsk, PIDTYPE_PGID, NULL);
+}
+
+
+static inline pid_t task_session_nr_ns(struct task_struct *tsk, struct pid_namespace *ns)
+{
+	return __task_pid_nr_ns(tsk, PIDTYPE_SID, ns);
+}
+
+static inline pid_t task_session_vnr(struct task_struct *tsk)
+{
+	return __task_pid_nr_ns(tsk, PIDTYPE_SID, NULL);
+}
+
+static inline pid_t task_tgid_nr_ns(struct task_struct *tsk, struct pid_namespace *ns)
+{
+	return __task_pid_nr_ns(tsk, PIDTYPE_TGID, ns);
+}
+
+static inline pid_t task_tgid_vnr(struct task_struct *tsk)
+{
+	return __task_pid_nr_ns(tsk, PIDTYPE_TGID, NULL);
+}
+
+static inline pid_t task_ppid_nr_ns(const struct task_struct *tsk, struct pid_namespace *ns)
+{
+	pid_t pid = 0;
+
+	rcu_read_lock();
+	if (pid_alive(tsk))
+		pid = task_tgid_nr_ns(rcu_dereference(tsk->real_parent), ns);
+	rcu_read_unlock();
+
+	return pid;
+}
+
+static inline pid_t task_ppid_nr(const struct task_struct *tsk)
+{
+	return task_ppid_nr_ns(tsk, &init_pid_ns);
+}
+
+/* Obsolete, do not use: */
+static inline pid_t task_pgrp_nr(struct task_struct *tsk)
+{
+	return task_pgrp_nr_ns(tsk, &init_pid_ns);
+}
+
+/**
+ * is_global_init - check if a task structure is init. Since init
+ * is free to have sub-threads we need to check tgid.
+ * @tsk: Task structure to be checked.
+ *
+ * Check if a task structure is the first user space task the kernel created.
+ *
+ * Return: 1 if the task structure is init. 0 otherwise.
+ */
+static inline int is_global_init(struct task_struct *tsk)
+{
+	return task_tgid_nr(tsk) == 1;
+}
+
 #endif /* _LINUX_PID_H */
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 37cc9d257073..9e2708c2cfa6 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1561,114 +1561,6 @@ struct task_struct {
 	 */
 };
 
-static inline struct pid *task_pid(struct task_struct *task)
-{
-	return task->thread_pid;
-}
-
-/*
- * the helpers to get the task's different pids as they are seen
- * from various namespaces
- *
- * task_xid_nr()     : global id, i.e. the id seen from the init namespace;
- * task_xid_vnr()    : virtual id, i.e. the id seen from the pid namespace of
- *                     current.
- * task_xid_nr_ns()  : id seen from the ns specified;
- *
- * see also pid_nr() etc in include/linux/pid.h
- */
-pid_t __task_pid_nr_ns(struct task_struct *task, enum pid_type type, struct pid_namespace *ns);
-
-static inline pid_t task_pid_nr(struct task_struct *tsk)
-{
-	return tsk->pid;
-}
-
-static inline pid_t task_pid_nr_ns(struct task_struct *tsk, struct pid_namespace *ns)
-{
-	return __task_pid_nr_ns(tsk, PIDTYPE_PID, ns);
-}
-
-static inline pid_t task_pid_vnr(struct task_struct *tsk)
-{
-	return __task_pid_nr_ns(tsk, PIDTYPE_PID, NULL);
-}
-
-
-static inline pid_t task_tgid_nr(struct task_struct *tsk)
-{
-	return tsk->tgid;
-}
-
-/**
- * pid_alive - check that a task structure is not stale
- * @p: Task structure to be checked.
- *
- * Test if a process is not yet dead (at most zombie state)
- * If pid_alive fails, then pointers within the task structure
- * can be stale and must not be dereferenced.
- *
- * Return: 1 if the process is alive. 0 otherwise.
- */
-static inline int pid_alive(const struct task_struct *p)
-{
-	return p->thread_pid != NULL;
-}
-
-static inline pid_t task_pgrp_nr_ns(struct task_struct *tsk, struct pid_namespace *ns)
-{
-	return __task_pid_nr_ns(tsk, PIDTYPE_PGID, ns);
-}
-
-static inline pid_t task_pgrp_vnr(struct task_struct *tsk)
-{
-	return __task_pid_nr_ns(tsk, PIDTYPE_PGID, NULL);
-}
-
-
-static inline pid_t task_session_nr_ns(struct task_struct *tsk, struct pid_namespace *ns)
-{
-	return __task_pid_nr_ns(tsk, PIDTYPE_SID, ns);
-}
-
-static inline pid_t task_session_vnr(struct task_struct *tsk)
-{
-	return __task_pid_nr_ns(tsk, PIDTYPE_SID, NULL);
-}
-
-static inline pid_t task_tgid_nr_ns(struct task_struct *tsk, struct pid_namespace *ns)
-{
-	return __task_pid_nr_ns(tsk, PIDTYPE_TGID, ns);
-}
-
-static inline pid_t task_tgid_vnr(struct task_struct *tsk)
-{
-	return __task_pid_nr_ns(tsk, PIDTYPE_TGID, NULL);
-}
-
-static inline pid_t task_ppid_nr_ns(const struct task_struct *tsk, struct pid_namespace *ns)
-{
-	pid_t pid = 0;
-
-	rcu_read_lock();
-	if (pid_alive(tsk))
-		pid = task_tgid_nr_ns(rcu_dereference(tsk->real_parent), ns);
-	rcu_read_unlock();
-
-	return pid;
-}
-
-static inline pid_t task_ppid_nr(const struct task_struct *tsk)
-{
-	return task_ppid_nr_ns(tsk, &init_pid_ns);
-}
-
-/* Obsolete, do not use: */
-static inline pid_t task_pgrp_nr(struct task_struct *tsk)
-{
-	return task_pgrp_nr_ns(tsk, &init_pid_ns);
-}
-
 #define TASK_REPORT_IDLE	(TASK_REPORT + 1)
 #define TASK_REPORT_MAX		(TASK_REPORT_IDLE << 1)
 
@@ -1712,20 +1604,6 @@ static inline char task_state_to_char(struct task_struct *tsk)
 	return task_index_to_char(task_state_index(tsk));
 }
 
-/**
- * is_global_init - check if a task structure is init. Since init
- * is free to have sub-threads we need to check tgid.
- * @tsk: Task structure to be checked.
- *
- * Check if a task structure is the first user space task the kernel created.
- *
- * Return: 1 if the task structure is init. 0 otherwise.
- */
-static inline int is_global_init(struct task_struct *tsk)
-{
-	return task_tgid_nr(tsk) == 1;
-}
-
 extern struct pid *cad_pid;
 
 /*
diff --git a/include/linux/sched/signal.h b/include/linux/sched/signal.h
index 3499c1a8b929..b847d8fa75a9 100644
--- a/include/linux/sched/signal.h
+++ b/include/linux/sched/signal.h
@@ -9,6 +9,7 @@
 #include <linux/sched/task.h>
 #include <linux/cred.h>
 #include <linux/refcount.h>
+#include <linux/pid.h>
 #include <linux/posix-timers.h>
 #include <linux/mm_types.h>
 #include <asm/ptrace.h>
diff --git a/ipc/util.h b/ipc/util.h
index 67bdd2aa2c28..a55d6cebe6d3 100644
--- a/ipc/util.h
+++ b/ipc/util.h
@@ -14,6 +14,7 @@
 #include <linux/unistd.h>
 #include <linux/err.h>
 #include <linux/ipc_namespace.h>
+#include <linux/pid.h>
 
 /*
  * The IPC ID contains 2 separate numbers - index and sequence number.
diff --git a/kernel/async.c b/kernel/async.c
index b2c4ba5686ee..79f6a3034b1f 100644
--- a/kernel/async.c
+++ b/kernel/async.c
@@ -46,11 +46,12 @@ asynchronous and synchronous parts of the kernel.
 
 #include <linux/async.h>
 #include <linux/atomic.h>
-#include <linux/ktime.h>
 #include <linux/export.h>
-#include <linux/wait.h>
+#include <linux/ktime.h>
+#include <linux/pid.h>
 #include <linux/sched.h>
 #include <linux/slab.h>
+#include <linux/wait.h>
 #include <linux/workqueue.h>
 
 #include "workqueue_internal.h"
diff --git a/kernel/locking/spinlock_debug.c b/kernel/locking/spinlock_debug.c
index 14235671a1a7..87b03d2e41db 100644
--- a/kernel/locking/spinlock_debug.c
+++ b/kernel/locking/spinlock_debug.c
@@ -12,6 +12,7 @@
 #include <linux/debug_locks.h>
 #include <linux/delay.h>
 #include <linux/export.h>
+#include <linux/pid.h>
 
 void __raw_spin_lock_init(raw_spinlock_t *lock, const char *name,
 			  struct lock_class_key *key, short inner)
-- 
cgit v1.2.3


From 8b7787a543cde905e53eaf29172c9472fe8a6a75 Mon Sep 17 00:00:00 2001
From: Kent Overstreet <kent.overstreet@linux.dev>
Date: Mon, 11 Dec 2023 13:12:49 -0500
Subject: plist: Split out plist_types.h

Trimming down sched.h dependencies: we don't want to include more than
the base types.

Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
---
 include/linux/plist.h       | 12 +-----------
 include/linux/plist_types.h | 17 +++++++++++++++++
 include/linux/sched.h       |  2 +-
 init/init_task.c            |  1 +
 kernel/futex/core.c         |  1 +
 kernel/futex/requeue.c      |  1 +
 kernel/futex/waitwake.c     |  1 +
 mm/swapfile.c               |  1 +
 8 files changed, 24 insertions(+), 12 deletions(-)
 create mode 100644 include/linux/plist_types.h

(limited to 'include/linux')

diff --git a/include/linux/plist.h b/include/linux/plist.h
index 0f352c1d3c80..8c1c8adf7fe9 100644
--- a/include/linux/plist.h
+++ b/include/linux/plist.h
@@ -75,20 +75,10 @@
 
 #include <linux/container_of.h>
 #include <linux/list.h>
-#include <linux/types.h>
+#include <linux/plist_types.h>
 
 #include <asm/bug.h>
 
-struct plist_head {
-	struct list_head node_list;
-};
-
-struct plist_node {
-	int			prio;
-	struct list_head	prio_list;
-	struct list_head	node_list;
-};
-
 /**
  * PLIST_HEAD_INIT - static struct plist_head initializer
  * @head:	struct plist_head variable name
diff --git a/include/linux/plist_types.h b/include/linux/plist_types.h
new file mode 100644
index 000000000000..c37e784330af
--- /dev/null
+++ b/include/linux/plist_types.h
@@ -0,0 +1,17 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+#ifndef _LINUX_PLIST_TYPES_H
+#define _LINUX_PLIST_TYPES_H
+
+#include <linux/types.h>
+
+struct plist_head {
+	struct list_head node_list;
+};
+
+struct plist_node {
+	int			prio;
+	struct list_head	prio_list;
+	struct list_head	node_list;
+};
+
+#endif /* _LINUX_PLIST_TYPES_H */
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 9e2708c2cfa6..8c230f24688b 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -16,7 +16,7 @@
 #include <linux/shm.h>
 #include <linux/kmsan_types.h>
 #include <linux/mutex_types.h>
-#include <linux/plist.h>
+#include <linux/plist_types.h>
 #include <linux/hrtimer_types.h>
 #include <linux/irqflags.h>
 #include <linux/seccomp.h>
diff --git a/init/init_task.c b/init/init_task.c
index 5727d42149c3..56220898a256 100644
--- a/init/init_task.c
+++ b/init/init_task.c
@@ -12,6 +12,7 @@
 #include <linux/audit.h>
 #include <linux/numa.h>
 #include <linux/scs.h>
+#include <linux/plist.h>
 
 #include <linux/uaccess.h>
 
diff --git a/kernel/futex/core.c b/kernel/futex/core.c
index dad981a865b8..e0e853412c15 100644
--- a/kernel/futex/core.c
+++ b/kernel/futex/core.c
@@ -34,6 +34,7 @@
 #include <linux/compat.h>
 #include <linux/jhash.h>
 #include <linux/pagemap.h>
+#include <linux/plist.h>
 #include <linux/memblock.h>
 #include <linux/fault-inject.h>
 #include <linux/slab.h>
diff --git a/kernel/futex/requeue.c b/kernel/futex/requeue.c
index eb21f065816b..b47bb764b352 100644
--- a/kernel/futex/requeue.c
+++ b/kernel/futex/requeue.c
@@ -1,5 +1,6 @@
 // SPDX-License-Identifier: GPL-2.0-or-later
 
+#include <linux/plist.h>
 #include <linux/sched/signal.h>
 
 #include "futex.h"
diff --git a/kernel/futex/waitwake.c b/kernel/futex/waitwake.c
index 61b112897a84..3a10375d9521 100644
--- a/kernel/futex/waitwake.c
+++ b/kernel/futex/waitwake.c
@@ -1,5 +1,6 @@
 // SPDX-License-Identifier: GPL-2.0-or-later
 
+#include <linux/plist.h>
 #include <linux/sched/task.h>
 #include <linux/sched/signal.h>
 #include <linux/freezer.h>
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 4bc70f459164..25019af07181 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -42,6 +42,7 @@
 #include <linux/completion.h>
 #include <linux/suspend.h>
 #include <linux/zswap.h>
+#include <linux/plist.h>
 
 #include <asm/tlbflush.h>
 #include <linux/swapops.h>
-- 
cgit v1.2.3


From 6dfeff09d5ad331905c7066207053d286d58ac83 Mon Sep 17 00:00:00 2001
From: "Matthew Wilcox (Oracle)" <willy@infradead.org>
Date: Mon, 11 Dec 2023 18:14:41 +0000
Subject: wait: Remove uapi header file from main header file

There's really no overlap between uapi/linux/wait.h and linux/wait.h.
There are two files which rely on the uapi file being implcitly included,
so explicitly include it there and remove it from the main header file.

Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
Reviewed-by: Christian Brauner <brauner@kernel.org>
---
 include/linux/wait.h   | 1 -
 kernel/exit.c          | 4 +++-
 kernel/pid_namespace.c | 1 +
 3 files changed, 4 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/wait.h b/include/linux/wait.h
index 3473b663176f..8aa3372f21a0 100644
--- a/include/linux/wait.h
+++ b/include/linux/wait.h
@@ -9,7 +9,6 @@
 #include <linux/spinlock.h>
 
 #include <asm/current.h>
-#include <uapi/linux/wait.h>
 
 typedef struct wait_queue_entry wait_queue_entry_t;
 
diff --git a/kernel/exit.c b/kernel/exit.c
index ee9f43bed49a..2ef33047371b 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -69,8 +69,10 @@
 #include <linux/rethook.h>
 #include <linux/sysfs.h>
 #include <linux/user_events.h>
-
 #include <linux/uaccess.h>
+
+#include <uapi/linux/wait.h>
+
 #include <asm/unistd.h>
 #include <asm/mmu_context.h>
 
diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c
index 3028b2218aa4..7ade20e95232 100644
--- a/kernel/pid_namespace.c
+++ b/kernel/pid_namespace.c
@@ -23,6 +23,7 @@
 #include <linux/sched/task.h>
 #include <linux/sched/signal.h>
 #include <linux/idr.h>
+#include <uapi/linux/wait.h>
 #include "pid_sysctl.h"
 
 static DEFINE_MUTEX(pid_caches_mutex);
-- 
cgit v1.2.3


From 097691960f7084ca82adb2e866d03a81753f0cb7 Mon Sep 17 00:00:00 2001
From: Kent Overstreet <kent.overstreet@linux.dev>
Date: Mon, 11 Dec 2023 13:28:46 -0500
Subject: rslib: kill bogus dependency on list.h

list_head is defined in types.h, not list.h - this kills a sched.h
dependency.

Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
---
 include/linux/rslib.h | 1 -
 1 file changed, 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/rslib.h b/include/linux/rslib.h
index 238bb85243d3..a04dacbdc8ae 100644
--- a/include/linux/rslib.h
+++ b/include/linux/rslib.h
@@ -10,7 +10,6 @@
 #ifndef _RSLIB_H_
 #define _RSLIB_H_
 
-#include <linux/list.h>
 #include <linux/types.h>	/* for gfp_t */
 #include <linux/gfp.h>		/* for GFP_KERNEL */
 
-- 
cgit v1.2.3


From eee51b0ae5c52a77ed65ad59b55002d1397b40d5 Mon Sep 17 00:00:00 2001
From: Kent Overstreet <kent.overstreet@linux.dev>
Date: Mon, 11 Dec 2023 13:32:55 -0500
Subject: timerqueue: Split out timerqueue_types.h

Trimming down sched.h dependencies: timerqueue_types can include just
rbtree_types.h instead of pulling in rbtree.h.

Cc: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
---
 include/linux/hrtimer_types.h      |  2 +-
 include/linux/posix-timers_types.h |  2 +-
 include/linux/timerqueue.h         | 13 +------------
 include/linux/timerqueue_types.h   | 17 +++++++++++++++++
 4 files changed, 20 insertions(+), 14 deletions(-)
 create mode 100644 include/linux/timerqueue_types.h

(limited to 'include/linux')

diff --git a/include/linux/hrtimer_types.h b/include/linux/hrtimer_types.h
index f4ef391b96a7..ad66a3081735 100644
--- a/include/linux/hrtimer_types.h
+++ b/include/linux/hrtimer_types.h
@@ -3,7 +3,7 @@
 #define _LINUX_HRTIMER_TYPES_H
 
 #include <linux/types.h>
-#include <linux/timerqueue.h>
+#include <linux/timerqueue_types.h>
 
 struct hrtimer_clock_base;
 
diff --git a/include/linux/posix-timers_types.h b/include/linux/posix-timers_types.h
index 4783fa17bfeb..a4712c1008c9 100644
--- a/include/linux/posix-timers_types.h
+++ b/include/linux/posix-timers_types.h
@@ -3,7 +3,7 @@
 #define _linux_POSIX_TIMERS_TYPES_H
 
 #include <linux/mutex_types.h>
-#include <linux/timerqueue.h>
+#include <linux/timerqueue_types.h>
 #include <linux/types.h>
 
 /*
diff --git a/include/linux/timerqueue.h b/include/linux/timerqueue.h
index adc80e29168e..62973f7d4610 100644
--- a/include/linux/timerqueue.h
+++ b/include/linux/timerqueue.h
@@ -3,18 +3,7 @@
 #define _LINUX_TIMERQUEUE_H
 
 #include <linux/rbtree.h>
-#include <linux/ktime.h>
-
-
-struct timerqueue_node {
-	struct rb_node node;
-	ktime_t expires;
-};
-
-struct timerqueue_head {
-	struct rb_root_cached rb_root;
-};
-
+#include <linux/timerqueue_types.h>
 
 extern bool timerqueue_add(struct timerqueue_head *head,
 			   struct timerqueue_node *node);
diff --git a/include/linux/timerqueue_types.h b/include/linux/timerqueue_types.h
new file mode 100644
index 000000000000..dc298d0923e3
--- /dev/null
+++ b/include/linux/timerqueue_types.h
@@ -0,0 +1,17 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _LINUX_TIMERQUEUE_TYPES_H
+#define _LINUX_TIMERQUEUE_TYPES_H
+
+#include <linux/rbtree_types.h>
+#include <linux/types.h>
+
+struct timerqueue_node {
+	struct rb_node node;
+	ktime_t expires;
+};
+
+struct timerqueue_head {
+	struct rb_root_cached rb_root;
+};
+
+#endif /* _LINUX_TIMERQUEUE_TYPES_H */
-- 
cgit v1.2.3


From 22c336d0d3118824fed08834069568c57c5641a6 Mon Sep 17 00:00:00 2001
From: Kent Overstreet <kent.overstreet@linux.dev>
Date: Mon, 11 Dec 2023 13:34:45 -0500
Subject: signal: Kill bogus dependency on list.h

list_head is in types.h, not list.h.

Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
---
 include/linux/signal.h       | 1 +
 include/linux/signal_types.h | 2 +-
 2 files changed, 2 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/signal.h b/include/linux/signal.h
index 3b98e7a28538..f19816832f05 100644
--- a/include/linux/signal.h
+++ b/include/linux/signal.h
@@ -3,6 +3,7 @@
 #define _LINUX_SIGNAL_H
 
 #include <linux/bug.h>
+#include <linux/list.h>
 #include <linux/signal_types.h>
 #include <linux/string.h>
 
diff --git a/include/linux/signal_types.h b/include/linux/signal_types.h
index a70b2bdbf4d9..caf4f7a59ab9 100644
--- a/include/linux/signal_types.h
+++ b/include/linux/signal_types.h
@@ -6,7 +6,7 @@
  * Basic signal handling related data type definitions:
  */
 
-#include <linux/list.h>
+#include <linux/types.h>
 #include <uapi/linux/signal.h>
 
 typedef struct kernel_siginfo {
-- 
cgit v1.2.3


From dff0fd233a5104337069603d201f8cad74bc0e5a Mon Sep 17 00:00:00 2001
From: Kent Overstreet <kent.overstreet@linux.dev>
Date: Mon, 11 Dec 2023 13:53:34 -0500
Subject: timers: Split out timer_types.h

Cutting down on sched.h dependencies: this is going to be used in
workqueue_types.h in the next patch, so we can kill the sched.h
dependency on workqueue.h.

Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
---
 include/linux/timer.h       | 16 +---------------
 include/linux/timer_types.h | 23 +++++++++++++++++++++++
 2 files changed, 24 insertions(+), 15 deletions(-)
 create mode 100644 include/linux/timer_types.h

(limited to 'include/linux')

diff --git a/include/linux/timer.h b/include/linux/timer.h
index 26a545bb0153..f18a2f1eb79e 100644
--- a/include/linux/timer.h
+++ b/include/linux/timer.h
@@ -7,21 +7,7 @@
 #include <linux/stddef.h>
 #include <linux/debugobjects.h>
 #include <linux/stringify.h>
-
-struct timer_list {
-	/*
-	 * All fields that change during normal runtime grouped to the
-	 * same cacheline
-	 */
-	struct hlist_node	entry;
-	unsigned long		expires;
-	void			(*function)(struct timer_list *);
-	u32			flags;
-
-#ifdef CONFIG_LOCKDEP
-	struct lockdep_map	lockdep_map;
-#endif
-};
+#include <linux/timer_types.h>
 
 #ifdef CONFIG_LOCKDEP
 /*
diff --git a/include/linux/timer_types.h b/include/linux/timer_types.h
new file mode 100644
index 000000000000..fae5a388f914
--- /dev/null
+++ b/include/linux/timer_types.h
@@ -0,0 +1,23 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _LINUX_TIMER_TYPES_H
+#define _LINUX_TIMER_TYPES_H
+
+#include <linux/lockdep_types.h>
+#include <linux/types.h>
+
+struct timer_list {
+	/*
+	 * All fields that change during normal runtime grouped to the
+	 * same cacheline
+	 */
+	struct hlist_node	entry;
+	unsigned long		expires;
+	void			(*function)(struct timer_list *);
+	u32			flags;
+
+#ifdef CONFIG_LOCKDEP
+	struct lockdep_map	lockdep_map;
+#endif
+};
+
+#endif /* _LINUX_TIMER_TYPES_H */
-- 
cgit v1.2.3


From b2fa8443db320c4873feca2588b957439e350890 Mon Sep 17 00:00:00 2001
From: Kent Overstreet <kent.overstreet@linux.dev>
Date: Mon, 11 Dec 2023 13:55:01 -0500
Subject: workqueue: Split out workqueue_types.h

More sched.h dependency culling - this lets us kill a rhashtable-types.h
dependency on workqueue.h.

Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
---
 include/linux/dma-fence.h        |  1 +
 include/linux/rhashtable-types.h |  2 +-
 include/linux/timekeeping.h      |  1 +
 include/linux/workqueue.h        | 16 +---------------
 include/linux/workqueue_types.h  | 25 +++++++++++++++++++++++++
 5 files changed, 29 insertions(+), 16 deletions(-)
 create mode 100644 include/linux/workqueue_types.h

(limited to 'include/linux')

diff --git a/include/linux/dma-fence.h b/include/linux/dma-fence.h
index b3772edca2e6..e06bad467f55 100644
--- a/include/linux/dma-fence.h
+++ b/include/linux/dma-fence.h
@@ -21,6 +21,7 @@
 #include <linux/sched.h>
 #include <linux/printk.h>
 #include <linux/rcupdate.h>
+#include <linux/timekeeping.h>
 
 struct dma_fence;
 struct dma_fence_ops;
diff --git a/include/linux/rhashtable-types.h b/include/linux/rhashtable-types.h
index 57467cbf4c5b..b6f3797277ff 100644
--- a/include/linux/rhashtable-types.h
+++ b/include/linux/rhashtable-types.h
@@ -12,7 +12,7 @@
 #include <linux/atomic.h>
 #include <linux/compiler.h>
 #include <linux/mutex.h>
-#include <linux/workqueue.h>
+#include <linux/workqueue_types.h>
 
 struct rhash_head {
 	struct rhash_head __rcu		*next;
diff --git a/include/linux/timekeeping.h b/include/linux/timekeeping.h
index fe1e467ba046..7c43e98cf211 100644
--- a/include/linux/timekeeping.h
+++ b/include/linux/timekeeping.h
@@ -4,6 +4,7 @@
 
 #include <linux/errno.h>
 #include <linux/clocksource_ids.h>
+#include <linux/ktime.h>
 
 /* Included from linux/ktime.h */
 
diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h
index 24b1e5070f4d..f1bb2e35301f 100644
--- a/include/linux/workqueue.h
+++ b/include/linux/workqueue.h
@@ -14,12 +14,7 @@
 #include <linux/atomic.h>
 #include <linux/cpumask.h>
 #include <linux/rcupdate.h>
-
-struct workqueue_struct;
-
-struct work_struct;
-typedef void (*work_func_t)(struct work_struct *work);
-void delayed_work_timer_fn(struct timer_list *t);
+#include <linux/workqueue_types.h>
 
 /*
  * The first word is the work queue pointer and the flags rolled into
@@ -95,15 +90,6 @@ enum {
 #define WORK_STRUCT_FLAG_MASK    ((1ul << WORK_STRUCT_FLAG_BITS) - 1)
 #define WORK_STRUCT_WQ_DATA_MASK (~WORK_STRUCT_FLAG_MASK)
 
-struct work_struct {
-	atomic_long_t data;
-	struct list_head entry;
-	work_func_t func;
-#ifdef CONFIG_LOCKDEP
-	struct lockdep_map lockdep_map;
-#endif
-};
-
 #define WORK_DATA_INIT()	ATOMIC_LONG_INIT((unsigned long)WORK_STRUCT_NO_POOL)
 #define WORK_DATA_STATIC_INIT()	\
 	ATOMIC_LONG_INIT((unsigned long)(WORK_STRUCT_NO_POOL | WORK_STRUCT_STATIC))
diff --git a/include/linux/workqueue_types.h b/include/linux/workqueue_types.h
new file mode 100644
index 000000000000..4c38824f3ab4
--- /dev/null
+++ b/include/linux/workqueue_types.h
@@ -0,0 +1,25 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _LINUX_WORKQUEUE_TYPES_H
+#define _LINUX_WORKQUEUE_TYPES_H
+
+#include <linux/atomic.h>
+#include <linux/lockdep_types.h>
+#include <linux/timer_types.h>
+#include <linux/types.h>
+
+struct workqueue_struct;
+
+struct work_struct;
+typedef void (*work_func_t)(struct work_struct *work);
+void delayed_work_timer_fn(struct timer_list *t);
+
+struct work_struct {
+	atomic_long_t data;
+	struct list_head entry;
+	work_func_t func;
+#ifdef CONFIG_LOCKDEP
+	struct lockdep_map lockdep_map;
+#endif
+};
+
+#endif /* _LINUX_WORKQUEUE_TYPES_H */
-- 
cgit v1.2.3


From bc46ef3cea3d6f63952d7e29a324e889c34970a8 Mon Sep 17 00:00:00 2001
From: Kent Overstreet <kent.overstreet@linux.dev>
Date: Mon, 11 Dec 2023 13:58:25 -0500
Subject: shm: Slim down dependencies

list_head is in types.h, not list.h., and the uapi header wasn't needed.

Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
---
 arch/parisc/mm/init.c      | 1 +
 include/linux/shm.h        | 4 ++--
 ipc/shm.c                  | 1 +
 security/selinux/hooks.c   | 1 +
 security/smack/smack_lsm.c | 1 +
 5 files changed, 6 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/arch/parisc/mm/init.c b/arch/parisc/mm/init.c
index a2a3e89f2d9a..f876af56e13f 100644
--- a/arch/parisc/mm/init.c
+++ b/arch/parisc/mm/init.c
@@ -33,6 +33,7 @@
 #include <asm/msgbuf.h>
 #include <asm/sparsemem.h>
 #include <asm/asm-offsets.h>
+#include <asm/shmbuf.h>
 
 extern int  data_start;
 extern void parisc_kernel_start(void);	/* Kernel entry point in head.S */
diff --git a/include/linux/shm.h b/include/linux/shm.h
index d8e69aed3d32..c55bef0538e5 100644
--- a/include/linux/shm.h
+++ b/include/linux/shm.h
@@ -2,12 +2,12 @@
 #ifndef _LINUX_SHM_H_
 #define _LINUX_SHM_H_
 
-#include <linux/list.h>
+#include <linux/types.h>
 #include <asm/page.h>
-#include <uapi/linux/shm.h>
 #include <asm/shmparam.h>
 
 struct file;
+struct task_struct;
 
 #ifdef CONFIG_SYSVIPC
 struct sysv_shm {
diff --git a/ipc/shm.c b/ipc/shm.c
index 222aaf035afb..a89f001a8bf0 100644
--- a/ipc/shm.c
+++ b/ipc/shm.c
@@ -29,6 +29,7 @@
 #include <linux/mm.h>
 #include <linux/hugetlb.h>
 #include <linux/shm.h>
+#include <uapi/linux/shm.h>
 #include <linux/init.h>
 #include <linux/file.h>
 #include <linux/mman.h>
diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c
index feda711c6b7b..b9ccc98421e9 100644
--- a/security/selinux/hooks.c
+++ b/security/selinux/hooks.c
@@ -85,6 +85,7 @@
 #include <linux/export.h>
 #include <linux/msg.h>
 #include <linux/shm.h>
+#include <uapi/linux/shm.h>
 #include <linux/bpf.h>
 #include <linux/kernfs.h>
 #include <linux/stringhash.h>	/* for hashlen_string() */
diff --git a/security/smack/smack_lsm.c b/security/smack/smack_lsm.c
index 65130a791f57..7a5600834f16 100644
--- a/security/smack/smack_lsm.c
+++ b/security/smack/smack_lsm.c
@@ -37,6 +37,7 @@
 #include <linux/personality.h>
 #include <linux/msg.h>
 #include <linux/shm.h>
+#include <uapi/linux/shm.h>
 #include <linux/binfmts.h>
 #include <linux/parser.h>
 #include <linux/fs_context.h>
-- 
cgit v1.2.3


From 72375a8864ebc0a20ca4a35f382441b01a0b85b9 Mon Sep 17 00:00:00 2001
From: Kent Overstreet <kent.overstreet@linux.dev>
Date: Mon, 11 Dec 2023 14:00:10 -0500
Subject: ipc: Kill bogus dependency on spinlock.h

pruning sched.h dependencies, headers shouldn't pull in more than they
need.

Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
---
 include/linux/ipc.h   | 2 +-
 include/linux/sched.h | 2 ++
 2 files changed, 3 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/ipc.h b/include/linux/ipc.h
index e1c9eea6015b..9b1434247aab 100644
--- a/include/linux/ipc.h
+++ b/include/linux/ipc.h
@@ -2,7 +2,7 @@
 #ifndef _LINUX_IPC_H
 #define _LINUX_IPC_H
 
-#include <linux/spinlock.h>
+#include <linux/spinlock_types.h>
 #include <linux/uidgid.h>
 #include <linux/rhashtable-types.h>
 #include <uapi/linux/ipc.h>
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 8c230f24688b..34400b16e57c 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -2139,6 +2139,8 @@ extern bool sched_task_on_rq(struct task_struct *p);
 extern unsigned long get_wchan(struct task_struct *p);
 extern struct task_struct *cpu_curr_snapshot(int cpu);
 
+#include <linux/spinlock.h>
+
 /*
  * In order to reduce various lock holder preemption latencies provide an
  * interface to see if a vCPU is currently running or not.
-- 
cgit v1.2.3


From 9983deb26d9021aecd971d25abf4cd263c72c385 Mon Sep 17 00:00:00 2001
From: Kent Overstreet <kent.overstreet@linux.dev>
Date: Mon, 11 Dec 2023 14:01:25 -0500
Subject: Split out irqflags_types.h

We're working on only pulling in type definitions to sched.h whenever
possible.

Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
---
 include/linux/irqflags.h       | 14 +-------------
 include/linux/irqflags_types.h | 22 ++++++++++++++++++++++
 include/linux/sched.h          |  2 +-
 3 files changed, 24 insertions(+), 14 deletions(-)
 create mode 100644 include/linux/irqflags_types.h

(limited to 'include/linux')

diff --git a/include/linux/irqflags.h b/include/linux/irqflags.h
index 2b665c32f5fe..147feebd508c 100644
--- a/include/linux/irqflags.h
+++ b/include/linux/irqflags.h
@@ -12,6 +12,7 @@
 #ifndef _LINUX_TRACE_IRQFLAGS_H
 #define _LINUX_TRACE_IRQFLAGS_H
 
+#include <linux/irqflags_types.h>
 #include <linux/typecheck.h>
 #include <linux/cleanup.h>
 #include <asm/irqflags.h>
@@ -34,19 +35,6 @@
 
 #ifdef CONFIG_TRACE_IRQFLAGS
 
-/* Per-task IRQ trace events information. */
-struct irqtrace_events {
-	unsigned int	irq_events;
-	unsigned long	hardirq_enable_ip;
-	unsigned long	hardirq_disable_ip;
-	unsigned int	hardirq_enable_event;
-	unsigned int	hardirq_disable_event;
-	unsigned long	softirq_disable_ip;
-	unsigned long	softirq_enable_ip;
-	unsigned int	softirq_disable_event;
-	unsigned int	softirq_enable_event;
-};
-
 DECLARE_PER_CPU(int, hardirqs_enabled);
 DECLARE_PER_CPU(int, hardirq_context);
 
diff --git a/include/linux/irqflags_types.h b/include/linux/irqflags_types.h
new file mode 100644
index 000000000000..c13f0d915097
--- /dev/null
+++ b/include/linux/irqflags_types.h
@@ -0,0 +1,22 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _LINUX_IRQFLAGS_TYPES_H
+#define _LINUX_IRQFLAGS_TYPES_H
+
+#ifdef CONFIG_TRACE_IRQFLAGS
+
+/* Per-task IRQ trace events information. */
+struct irqtrace_events {
+	unsigned int	irq_events;
+	unsigned long	hardirq_enable_ip;
+	unsigned long	hardirq_disable_ip;
+	unsigned int	hardirq_enable_event;
+	unsigned int	hardirq_disable_event;
+	unsigned long	softirq_disable_ip;
+	unsigned long	softirq_enable_ip;
+	unsigned int	softirq_disable_event;
+	unsigned int	softirq_enable_event;
+};
+
+#endif
+
+#endif /* _LINUX_IRQFLAGS_TYPES_H */
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 34400b16e57c..026390e6fe59 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -11,6 +11,7 @@
 
 #include <asm/current.h>
 
+#include <linux/irqflags_types.h>
 #include <linux/pid_types.h>
 #include <linux/sem.h>
 #include <linux/shm.h>
@@ -18,7 +19,6 @@
 #include <linux/mutex_types.h>
 #include <linux/plist_types.h>
 #include <linux/hrtimer_types.h>
-#include <linux/irqflags.h>
 #include <linux/seccomp.h>
 #include <linux/nodemask_types.h>
 #include <linux/rcupdate.h>
-- 
cgit v1.2.3


From 959d8dc8046186ffea5410f51fcb309880f0dfaa Mon Sep 17 00:00:00 2001
From: Kent Overstreet <kent.overstreet@linux.dev>
Date: Mon, 11 Dec 2023 14:15:35 -0500
Subject: mm_types_task.h: Trim dependencies

more sched.h header dependency trimming

Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
---
 include/linux/mm_types_task.h | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mm_types_task.h b/include/linux/mm_types_task.h
index aa44fff8bb9d..a2f6179b672b 100644
--- a/include/linux/mm_types_task.h
+++ b/include/linux/mm_types_task.h
@@ -9,9 +9,6 @@
  */
 
 #include <linux/types.h>
-#include <linux/threads.h>
-#include <linux/atomic.h>
-#include <linux/cpumask.h>
 
 #include <asm/page.h>
 
@@ -36,6 +33,8 @@ enum {
 	NR_MM_COUNTERS
 };
 
+struct page;
+
 struct page_frag {
 	struct page *page;
 #if (BITS_PER_LONG > 32) || (PAGE_SIZE >= 65536)
-- 
cgit v1.2.3


From 55b899aa3e7d0dc02ff9075b883d29eb2d0cb49a Mon Sep 17 00:00:00 2001
From: Kent Overstreet <kent.overstreet@linux.dev>
Date: Mon, 11 Dec 2023 14:25:40 -0500
Subject: syscall_user_dispatch.h: split out *_types.h

thread_info.h pulls in a lot of junk that sched.h that we don't need; in
particular, this helps to kill the printk.h dependency.

Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
---
 include/linux/sched.h                       |  2 +-
 include/linux/syscall_user_dispatch.h       |  9 +--------
 include/linux/syscall_user_dispatch_types.h | 22 ++++++++++++++++++++++
 3 files changed, 24 insertions(+), 9 deletions(-)
 create mode 100644 include/linux/syscall_user_dispatch_types.h

(limited to 'include/linux')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 026390e6fe59..925711edd7f7 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -28,7 +28,7 @@
 #include <linux/sched/prio.h>
 #include <linux/sched/types.h>
 #include <linux/signal_types.h>
-#include <linux/syscall_user_dispatch.h>
+#include <linux/syscall_user_dispatch_types.h>
 #include <linux/mm_types_task.h>
 #include <linux/task_io_accounting.h>
 #include <linux/posix-timers_types.h>
diff --git a/include/linux/syscall_user_dispatch.h b/include/linux/syscall_user_dispatch.h
index 641ca8880995..3858a6ffdd5c 100644
--- a/include/linux/syscall_user_dispatch.h
+++ b/include/linux/syscall_user_dispatch.h
@@ -6,16 +6,10 @@
 #define _SYSCALL_USER_DISPATCH_H
 
 #include <linux/thread_info.h>
+#include <linux/syscall_user_dispatch_types.h>
 
 #ifdef CONFIG_GENERIC_ENTRY
 
-struct syscall_user_dispatch {
-	char __user	*selector;
-	unsigned long	offset;
-	unsigned long	len;
-	bool		on_dispatch;
-};
-
 int set_syscall_user_dispatch(unsigned long mode, unsigned long offset,
 			      unsigned long len, char __user *selector);
 
@@ -29,7 +23,6 @@ int syscall_user_dispatch_set_config(struct task_struct *task, unsigned long siz
 				     void __user *data);
 
 #else
-struct syscall_user_dispatch {};
 
 static inline int set_syscall_user_dispatch(unsigned long mode, unsigned long offset,
 					    unsigned long len, char __user *selector)
diff --git a/include/linux/syscall_user_dispatch_types.h b/include/linux/syscall_user_dispatch_types.h
new file mode 100644
index 000000000000..3be36b06c7d7
--- /dev/null
+++ b/include/linux/syscall_user_dispatch_types.h
@@ -0,0 +1,22 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _SYSCALL_USER_DISPATCH_TYPES_H
+#define _SYSCALL_USER_DISPATCH_TYPES_H
+
+#include <linux/types.h>
+
+#ifdef CONFIG_GENERIC_ENTRY
+
+struct syscall_user_dispatch {
+	char __user	*selector;
+	unsigned long	offset;
+	unsigned long	len;
+	bool		on_dispatch;
+};
+
+#else
+
+struct syscall_user_dispatch {};
+
+#endif
+
+#endif /* _SYSCALL_USER_DISPATCH_TYPES_H */
-- 
cgit v1.2.3


From f9d6966b7f4182f612208f9dad9e2cfaaf667ba3 Mon Sep 17 00:00:00 2001
From: Kent Overstreet <kent.overstreet@linux.dev>
Date: Mon, 11 Dec 2023 15:15:38 -0500
Subject: refcount: Split out refcount_types.h

More trimming of sched.h dependencies.

Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
---
 include/linux/refcount.h       | 13 +------------
 include/linux/refcount_types.h | 19 +++++++++++++++++++
 include/linux/sched.h          |  2 +-
 include/linux/sched/task.h     |  1 +
 4 files changed, 22 insertions(+), 13 deletions(-)
 create mode 100644 include/linux/refcount_types.h

(limited to 'include/linux')

diff --git a/include/linux/refcount.h b/include/linux/refcount.h
index a62fcca97486..85c6df0d1bef 100644
--- a/include/linux/refcount.h
+++ b/include/linux/refcount.h
@@ -96,22 +96,11 @@
 #include <linux/bug.h>
 #include <linux/compiler.h>
 #include <linux/limits.h>
+#include <linux/refcount_types.h>
 #include <linux/spinlock_types.h>
 
 struct mutex;
 
-/**
- * typedef refcount_t - variant of atomic_t specialized for reference counts
- * @refs: atomic_t counter field
- *
- * The counter saturates at REFCOUNT_SATURATED and will not move once
- * there. This avoids wrapping the counter and causing 'spurious'
- * use-after-free bugs.
- */
-typedef struct refcount_struct {
-	atomic_t refs;
-} refcount_t;
-
 #define REFCOUNT_INIT(n)	{ .refs = ATOMIC_INIT(n), }
 #define REFCOUNT_MAX		INT_MAX
 #define REFCOUNT_SATURATED	(INT_MIN / 2)
diff --git a/include/linux/refcount_types.h b/include/linux/refcount_types.h
new file mode 100644
index 000000000000..162004f06edf
--- /dev/null
+++ b/include/linux/refcount_types.h
@@ -0,0 +1,19 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _LINUX_REFCOUNT_TYPES_H
+#define _LINUX_REFCOUNT_TYPES_H
+
+#include <linux/types.h>
+
+/**
+ * typedef refcount_t - variant of atomic_t specialized for reference counts
+ * @refs: atomic_t counter field
+ *
+ * The counter saturates at REFCOUNT_SATURATED and will not move once
+ * there. This avoids wrapping the counter and causing 'spurious'
+ * use-after-free bugs.
+ */
+typedef struct refcount_struct {
+	atomic_t refs;
+} refcount_t;
+
+#endif /* _LINUX_REFCOUNT_TYPES_H */
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 925711edd7f7..414e4df701ec 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -22,7 +22,7 @@
 #include <linux/seccomp.h>
 #include <linux/nodemask_types.h>
 #include <linux/rcupdate.h>
-#include <linux/refcount.h>
+#include <linux/refcount_types.h>
 #include <linux/resource.h>
 #include <linux/latencytop.h>
 #include <linux/sched/prio.h>
diff --git a/include/linux/sched/task.h b/include/linux/sched/task.h
index a23af225c898..1880ae21a9cb 100644
--- a/include/linux/sched/task.h
+++ b/include/linux/sched/task.h
@@ -7,6 +7,7 @@
  * functionality:
  */
 
+#include <linux/refcount.h>
 #include <linux/sched.h>
 #include <linux/uaccess.h>
 
-- 
cgit v1.2.3


From a6e1420ce4fc91da56c0a2444c4482245e7617d4 Mon Sep 17 00:00:00 2001
From: Kent Overstreet <kent.overstreet@linux.dev>
Date: Mon, 11 Dec 2023 15:30:14 -0500
Subject: seccomp: Split out seccomp_types.h

More pruning of sched.h dependencies.

Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
---
 include/linux/sched.h         |  2 +-
 include/linux/seccomp.h       | 22 +---------------------
 include/linux/seccomp_types.h | 35 +++++++++++++++++++++++++++++++++++
 3 files changed, 37 insertions(+), 22 deletions(-)
 create mode 100644 include/linux/seccomp_types.h

(limited to 'include/linux')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 414e4df701ec..db6e3e629606 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -19,7 +19,7 @@
 #include <linux/mutex_types.h>
 #include <linux/plist_types.h>
 #include <linux/hrtimer_types.h>
-#include <linux/seccomp.h>
+#include <linux/seccomp_types.h>
 #include <linux/nodemask_types.h>
 #include <linux/rcupdate.h>
 #include <linux/refcount_types.h>
diff --git a/include/linux/seccomp.h b/include/linux/seccomp.h
index 1ec0d8dc4b69..709ad84809e1 100644
--- a/include/linux/seccomp.h
+++ b/include/linux/seccomp.h
@@ -3,6 +3,7 @@
 #define _LINUX_SECCOMP_H
 
 #include <uapi/linux/seccomp.h>
+#include <linux/seccomp_types.h>
 
 #define SECCOMP_FILTER_FLAG_MASK	(SECCOMP_FILTER_FLAG_TSYNC | \
 					 SECCOMP_FILTER_FLAG_LOG | \
@@ -21,25 +22,6 @@
 #include <linux/atomic.h>
 #include <asm/seccomp.h>
 
-struct seccomp_filter;
-/**
- * struct seccomp - the state of a seccomp'ed process
- *
- * @mode:  indicates one of the valid values above for controlled
- *         system calls available to a process.
- * @filter_count: number of seccomp filters
- * @filter: must always point to a valid seccomp-filter or NULL as it is
- *          accessed without locking during system call entry.
- *
- *          @filter must only be accessed from the context of current as there
- *          is no read locking.
- */
-struct seccomp {
-	int mode;
-	atomic_t filter_count;
-	struct seccomp_filter *filter;
-};
-
 #ifdef CONFIG_HAVE_ARCH_SECCOMP_FILTER
 extern int __secure_computing(const struct seccomp_data *sd);
 static inline int secure_computing(void)
@@ -64,8 +46,6 @@ static inline int seccomp_mode(struct seccomp *s)
 
 #include <linux/errno.h>
 
-struct seccomp { };
-struct seccomp_filter { };
 struct seccomp_data;
 
 #ifdef CONFIG_HAVE_ARCH_SECCOMP_FILTER
diff --git a/include/linux/seccomp_types.h b/include/linux/seccomp_types.h
new file mode 100644
index 000000000000..cf0a0355024f
--- /dev/null
+++ b/include/linux/seccomp_types.h
@@ -0,0 +1,35 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _LINUX_SECCOMP_TYPES_H
+#define _LINUX_SECCOMP_TYPES_H
+
+#include <linux/types.h>
+
+#ifdef CONFIG_SECCOMP
+
+struct seccomp_filter;
+/**
+ * struct seccomp - the state of a seccomp'ed process
+ *
+ * @mode:  indicates one of the valid values above for controlled
+ *         system calls available to a process.
+ * @filter_count: number of seccomp filters
+ * @filter: must always point to a valid seccomp-filter or NULL as it is
+ *          accessed without locking during system call entry.
+ *
+ *          @filter must only be accessed from the context of current as there
+ *          is no read locking.
+ */
+struct seccomp {
+	int mode;
+	atomic_t filter_count;
+	struct seccomp_filter *filter;
+};
+
+#else
+
+struct seccomp { };
+struct seccomp_filter { };
+
+#endif
+
+#endif /* _LINUX_SECCOMP_TYPES_H */
-- 
cgit v1.2.3


From af6da56a223831cb74d1cf006f5742db6403398e Mon Sep 17 00:00:00 2001
From: Kent Overstreet <kent.overstreet@linux.dev>
Date: Mon, 11 Dec 2023 15:51:30 -0500
Subject: uidgid: Split out uidgid_types.h

More sched.h dependency pruning.

Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
Reviewed-by: Christian Brauner <brauner@kernel.org>
---
 include/linux/sched.h        |  1 +
 include/linux/uidgid.h       | 11 +----------
 include/linux/uidgid_types.h | 15 +++++++++++++++
 3 files changed, 17 insertions(+), 10 deletions(-)
 create mode 100644 include/linux/uidgid_types.h

(limited to 'include/linux')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index db6e3e629606..10e25c7fc793 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -37,6 +37,7 @@
 #include <linux/kcsan.h>
 #include <linux/rv.h>
 #include <linux/livepatch_sched.h>
+#include <linux/uidgid_types.h>
 #include <asm/kmap_size.h>
 
 /* task_struct member predeclarations (sorted alphabetically): */
diff --git a/include/linux/uidgid.h b/include/linux/uidgid.h
index b0542cd11aeb..ba20b62f13e1 100644
--- a/include/linux/uidgid.h
+++ b/include/linux/uidgid.h
@@ -12,21 +12,12 @@
  * to detect when we overlook these differences.
  *
  */
-#include <linux/types.h>
+#include <linux/uidgid_types.h>
 #include <linux/highuid.h>
 
 struct user_namespace;
 extern struct user_namespace init_user_ns;
 
-typedef struct {
-	uid_t val;
-} kuid_t;
-
-
-typedef struct {
-	gid_t val;
-} kgid_t;
-
 #define KUIDT_INIT(value) (kuid_t){ value }
 #define KGIDT_INIT(value) (kgid_t){ value }
 
diff --git a/include/linux/uidgid_types.h b/include/linux/uidgid_types.h
new file mode 100644
index 000000000000..b35ac4955a33
--- /dev/null
+++ b/include/linux/uidgid_types.h
@@ -0,0 +1,15 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _LINUX_UIDGID_TYPES_H
+#define _LINUX_UIDGID_TYPES_H
+
+#include <linux/types.h>
+
+typedef struct {
+	uid_t val;
+} kuid_t;
+
+typedef struct {
+	gid_t val;
+} kgid_t;
+
+#endif /* _LINUX_UIDGID_TYPES_H */
-- 
cgit v1.2.3


From e034d49eb01c7c83a08a3ce2a1091b55f806b26b Mon Sep 17 00:00:00 2001
From: Kent Overstreet <kent.overstreet@linux.dev>
Date: Mon, 11 Dec 2023 15:52:17 -0500
Subject: sem: Split out sem_types.h

More sched.h dependency pruning.

Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
---
 include/linux/audit.h     |  1 +
 include/linux/sched.h     |  4 +++-
 include/linux/sem.h       | 10 +---------
 include/linux/sem_types.h | 13 +++++++++++++
 4 files changed, 18 insertions(+), 10 deletions(-)
 create mode 100644 include/linux/sem_types.h

(limited to 'include/linux')

diff --git a/include/linux/audit.h b/include/linux/audit.h
index 51b1b7054a23..0050ef288ab3 100644
--- a/include/linux/audit.h
+++ b/include/linux/audit.h
@@ -36,6 +36,7 @@ struct mqstat;
 struct audit_watch;
 struct audit_tree;
 struct sk_buff;
+struct kern_ipc_perm;
 
 struct audit_krule {
 	u32			pflags;
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 10e25c7fc793..d341a6c302d6 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -11,14 +11,16 @@
 
 #include <asm/current.h>
 
+#include <linux/cache.h>
 #include <linux/irqflags_types.h>
 #include <linux/pid_types.h>
-#include <linux/sem.h>
+#include <linux/sem_types.h>
 #include <linux/shm.h>
 #include <linux/kmsan_types.h>
 #include <linux/mutex_types.h>
 #include <linux/plist_types.h>
 #include <linux/hrtimer_types.h>
+#include <linux/timer_types.h>
 #include <linux/seccomp_types.h>
 #include <linux/nodemask_types.h>
 #include <linux/rcupdate.h>
diff --git a/include/linux/sem.h b/include/linux/sem.h
index 5608a500c43e..c4deefe42aeb 100644
--- a/include/linux/sem.h
+++ b/include/linux/sem.h
@@ -3,25 +3,17 @@
 #define _LINUX_SEM_H
 
 #include <uapi/linux/sem.h>
+#include <linux/sem_types.h>
 
 struct task_struct;
-struct sem_undo_list;
 
 #ifdef CONFIG_SYSVIPC
 
-struct sysv_sem {
-	struct sem_undo_list *undo_list;
-};
-
 extern int copy_semundo(unsigned long clone_flags, struct task_struct *tsk);
 extern void exit_sem(struct task_struct *tsk);
 
 #else
 
-struct sysv_sem {
-	/* empty */
-};
-
 static inline int copy_semundo(unsigned long clone_flags, struct task_struct *tsk)
 {
 	return 0;
diff --git a/include/linux/sem_types.h b/include/linux/sem_types.h
new file mode 100644
index 000000000000..73df1971a7ae
--- /dev/null
+++ b/include/linux/sem_types.h
@@ -0,0 +1,13 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _LINUX_SEM_TYPES_H
+#define _LINUX_SEM_TYPES_H
+
+struct sem_undo_list;
+
+struct sysv_sem {
+#ifdef CONFIG_SYSVIPC
+	struct sem_undo_list *undo_list;
+#endif
+};
+
+#endif /* _LINUX_SEM_TYPES_H */
-- 
cgit v1.2.3


From 99bac36667b6b20b9b0a20dc976365d23f90628b Mon Sep 17 00:00:00 2001
From: Kent Overstreet <kent.overstreet@linux.dev>
Date: Mon, 11 Dec 2023 16:58:51 -0500
Subject: lockdep: move held_lock to lockdep_types.h

held_lock is embedded in task_struct, and we don't want sched.h pulling
in all of lockdep.h

Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
Acked-by: Waiman Long <longman@redhat.com>
---
 include/linux/lockdep.h       | 57 -------------------------------------------
 include/linux/lockdep_types.h | 57 +++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 57 insertions(+), 57 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/lockdep.h b/include/linux/lockdep.h
index dc2844b071c2..08b0d1d9d78b 100644
--- a/include/linux/lockdep.h
+++ b/include/linux/lockdep.h
@@ -82,63 +82,6 @@ struct lock_chain {
 	u64				chain_key;
 };
 
-#define MAX_LOCKDEP_KEYS_BITS		13
-#define MAX_LOCKDEP_KEYS		(1UL << MAX_LOCKDEP_KEYS_BITS)
-#define INITIAL_CHAIN_KEY		-1
-
-struct held_lock {
-	/*
-	 * One-way hash of the dependency chain up to this point. We
-	 * hash the hashes step by step as the dependency chain grows.
-	 *
-	 * We use it for dependency-caching and we skip detection
-	 * passes and dependency-updates if there is a cache-hit, so
-	 * it is absolutely critical for 100% coverage of the validator
-	 * to have a unique key value for every unique dependency path
-	 * that can occur in the system, to make a unique hash value
-	 * as likely as possible - hence the 64-bit width.
-	 *
-	 * The task struct holds the current hash value (initialized
-	 * with zero), here we store the previous hash value:
-	 */
-	u64				prev_chain_key;
-	unsigned long			acquire_ip;
-	struct lockdep_map		*instance;
-	struct lockdep_map		*nest_lock;
-#ifdef CONFIG_LOCK_STAT
-	u64 				waittime_stamp;
-	u64				holdtime_stamp;
-#endif
-	/*
-	 * class_idx is zero-indexed; it points to the element in
-	 * lock_classes this held lock instance belongs to. class_idx is in
-	 * the range from 0 to (MAX_LOCKDEP_KEYS-1) inclusive.
-	 */
-	unsigned int			class_idx:MAX_LOCKDEP_KEYS_BITS;
-	/*
-	 * The lock-stack is unified in that the lock chains of interrupt
-	 * contexts nest ontop of process context chains, but we 'separate'
-	 * the hashes by starting with 0 if we cross into an interrupt
-	 * context, and we also keep do not add cross-context lock
-	 * dependencies - the lock usage graph walking covers that area
-	 * anyway, and we'd just unnecessarily increase the number of
-	 * dependencies otherwise. [Note: hardirq and softirq contexts
-	 * are separated from each other too.]
-	 *
-	 * The following field is used to detect when we cross into an
-	 * interrupt context:
-	 */
-	unsigned int irq_context:2; /* bit 0 - soft, bit 1 - hard */
-	unsigned int trylock:1;						/* 16 bits */
-
-	unsigned int read:2;        /* see lock_acquire() comment */
-	unsigned int check:1;       /* see lock_acquire() comment */
-	unsigned int hardirqs_off:1;
-	unsigned int sync:1;
-	unsigned int references:11;					/* 32 bits */
-	unsigned int pin_count;
-};
-
 /*
  * Initialization, self-test and debugging-output methods:
  */
diff --git a/include/linux/lockdep_types.h b/include/linux/lockdep_types.h
index 2ebc323d345a..9c533c8d701e 100644
--- a/include/linux/lockdep_types.h
+++ b/include/linux/lockdep_types.h
@@ -198,6 +198,63 @@ struct lockdep_map {
 
 struct pin_cookie { unsigned int val; };
 
+#define MAX_LOCKDEP_KEYS_BITS		13
+#define MAX_LOCKDEP_KEYS		(1UL << MAX_LOCKDEP_KEYS_BITS)
+#define INITIAL_CHAIN_KEY		-1
+
+struct held_lock {
+	/*
+	 * One-way hash of the dependency chain up to this point. We
+	 * hash the hashes step by step as the dependency chain grows.
+	 *
+	 * We use it for dependency-caching and we skip detection
+	 * passes and dependency-updates if there is a cache-hit, so
+	 * it is absolutely critical for 100% coverage of the validator
+	 * to have a unique key value for every unique dependency path
+	 * that can occur in the system, to make a unique hash value
+	 * as likely as possible - hence the 64-bit width.
+	 *
+	 * The task struct holds the current hash value (initialized
+	 * with zero), here we store the previous hash value:
+	 */
+	u64				prev_chain_key;
+	unsigned long			acquire_ip;
+	struct lockdep_map		*instance;
+	struct lockdep_map		*nest_lock;
+#ifdef CONFIG_LOCK_STAT
+	u64 				waittime_stamp;
+	u64				holdtime_stamp;
+#endif
+	/*
+	 * class_idx is zero-indexed; it points to the element in
+	 * lock_classes this held lock instance belongs to. class_idx is in
+	 * the range from 0 to (MAX_LOCKDEP_KEYS-1) inclusive.
+	 */
+	unsigned int			class_idx:MAX_LOCKDEP_KEYS_BITS;
+	/*
+	 * The lock-stack is unified in that the lock chains of interrupt
+	 * contexts nest ontop of process context chains, but we 'separate'
+	 * the hashes by starting with 0 if we cross into an interrupt
+	 * context, and we also keep do not add cross-context lock
+	 * dependencies - the lock usage graph walking covers that area
+	 * anyway, and we'd just unnecessarily increase the number of
+	 * dependencies otherwise. [Note: hardirq and softirq contexts
+	 * are separated from each other too.]
+	 *
+	 * The following field is used to detect when we cross into an
+	 * interrupt context:
+	 */
+	unsigned int irq_context:2; /* bit 0 - soft, bit 1 - hard */
+	unsigned int trylock:1;						/* 16 bits */
+
+	unsigned int read:2;        /* see lock_acquire() comment */
+	unsigned int check:1;       /* see lock_acquire() comment */
+	unsigned int hardirqs_off:1;
+	unsigned int sync:1;
+	unsigned int references:11;					/* 32 bits */
+	unsigned int pin_count;
+};
+
 #else /* !CONFIG_LOCKDEP */
 
 /*
-- 
cgit v1.2.3


From cba6167f0adb07b6cd1b8758dd67718c772e108c Mon Sep 17 00:00:00 2001
From: Kent Overstreet <kent.overstreet@linux.dev>
Date: Mon, 11 Dec 2023 17:00:41 -0500
Subject: restart_block: Trim includes

We don't actually use any timekeeping types, no need to pull in
time64.h.

Also, sched.h uses restart_block; add it as a direct dependency.

Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
---
 include/linux/restart_block.h | 2 +-
 include/linux/sched.h         | 1 +
 2 files changed, 2 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/restart_block.h b/include/linux/restart_block.h
index 980a65594412..13f17676c5f4 100644
--- a/include/linux/restart_block.h
+++ b/include/linux/restart_block.h
@@ -7,8 +7,8 @@
 
 #include <linux/compiler.h>
 #include <linux/types.h>
-#include <linux/time64.h>
 
+struct __kernel_timespec;
 struct timespec;
 struct old_timespec32;
 struct pollfd;
diff --git a/include/linux/sched.h b/include/linux/sched.h
index d341a6c302d6..dd002d193726 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -34,6 +34,7 @@
 #include <linux/mm_types_task.h>
 #include <linux/task_io_accounting.h>
 #include <linux/posix-timers_types.h>
+#include <linux/restart_block.h>
 #include <linux/rseq.h>
 #include <linux/seqlock_types.h>
 #include <linux/kcsan.h>
-- 
cgit v1.2.3


From a7e7b40c4bc115dbf2a2bb453d7bbb2e0ea99703 Mon Sep 17 00:00:00 2001
From: Saeed Mahameed <saeedm@nvidia.com>
Date: Fri, 15 Dec 2023 19:31:14 -0800
Subject: net/mlx5e: Use the correct lag ports number when creating TISes

The cited commit moved the code of mlx5e_create_tises() and changed the
loop to create TISes over MLX5_MAX_PORTS constant value, instead of
getting the correct lag ports supported by the device, which can cause
FW errors on devices with less than MLX5_MAX_PORTS ports.

Change that back to mlx5e_get_num_lag_ports(mdev).

Also IPoIB interfaces create there own TISes, they don't use the eth
TISes, pass a flag to indicate that.

Fixes: b25bd37c859f ("net/mlx5: Move TISes from priv to mdev HW resources")
Signed-off-by: Saeed Mahameed <saeedm@nvidia.com>
---
 drivers/net/ethernet/mellanox/mlx5/core/en.h        |  2 +-
 drivers/net/ethernet/mellanox/mlx5/core/en_common.c | 21 +++++++++++++--------
 drivers/net/ethernet/mellanox/mlx5/core/en_main.c   |  2 +-
 .../net/ethernet/mellanox/mlx5/core/ipoib/ipoib.c   |  2 +-
 include/linux/mlx5/driver.h                         |  1 +
 5 files changed, 17 insertions(+), 11 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en.h b/drivers/net/ethernet/mellanox/mlx5/core/en.h
index 0bfe1ca8a364..55c6ace0acd5 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en.h
@@ -1124,7 +1124,7 @@ static inline bool mlx5_tx_swp_supported(struct mlx5_core_dev *mdev)
 extern const struct ethtool_ops mlx5e_ethtool_ops;
 
 int mlx5e_create_mkey(struct mlx5_core_dev *mdev, u32 pdn, u32 *mkey);
-int mlx5e_create_mdev_resources(struct mlx5_core_dev *mdev);
+int mlx5e_create_mdev_resources(struct mlx5_core_dev *mdev, bool create_tises);
 void mlx5e_destroy_mdev_resources(struct mlx5_core_dev *mdev);
 int mlx5e_refresh_tirs(struct mlx5e_priv *priv, bool enable_uc_lb,
 		       bool enable_mc_lb);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_common.c b/drivers/net/ethernet/mellanox/mlx5/core/en_common.c
index 67f546683e85..6ed3a32b7e22 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_common.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_common.c
@@ -95,7 +95,7 @@ static void mlx5e_destroy_tises(struct mlx5_core_dev *mdev, u32 tisn[MLX5_MAX_PO
 {
 	int tc, i;
 
-	for (i = 0; i < MLX5_MAX_PORTS; i++)
+	for (i = 0; i < mlx5e_get_num_lag_ports(mdev); i++)
 		for (tc = 0; tc < MLX5_MAX_NUM_TC; tc++)
 			mlx5e_destroy_tis(mdev, tisn[i][tc]);
 }
@@ -110,7 +110,7 @@ static int mlx5e_create_tises(struct mlx5_core_dev *mdev, u32 tisn[MLX5_MAX_PORT
 	int tc, i;
 	int err;
 
-	for (i = 0; i < MLX5_MAX_PORTS; i++) {
+	for (i = 0; i < mlx5e_get_num_lag_ports(mdev); i++) {
 		for (tc = 0; tc < MLX5_MAX_NUM_TC; tc++) {
 			u32 in[MLX5_ST_SZ_DW(create_tis_in)] = {};
 			void *tisc;
@@ -140,7 +140,7 @@ err_close_tises:
 	return err;
 }
 
-int mlx5e_create_mdev_resources(struct mlx5_core_dev *mdev)
+int mlx5e_create_mdev_resources(struct mlx5_core_dev *mdev, bool create_tises)
 {
 	struct mlx5e_hw_objs *res = &mdev->mlx5e_res.hw_objs;
 	int err;
@@ -169,11 +169,15 @@ int mlx5e_create_mdev_resources(struct mlx5_core_dev *mdev)
 		goto err_destroy_mkey;
 	}
 
-	err = mlx5e_create_tises(mdev, res->tisn);
-	if (err) {
-		mlx5_core_err(mdev, "alloc tises failed, %d\n", err);
-		goto err_destroy_bfreg;
+	if (create_tises) {
+		err = mlx5e_create_tises(mdev, res->tisn);
+		if (err) {
+			mlx5_core_err(mdev, "alloc tises failed, %d\n", err);
+			goto err_destroy_bfreg;
+		}
+		res->tisn_valid = true;
 	}
+
 	INIT_LIST_HEAD(&res->td.tirs_list);
 	mutex_init(&res->td.list_lock);
 
@@ -203,7 +207,8 @@ void mlx5e_destroy_mdev_resources(struct mlx5_core_dev *mdev)
 
 	mlx5_crypto_dek_cleanup(mdev->mlx5e_res.dek_priv);
 	mdev->mlx5e_res.dek_priv = NULL;
-	mlx5e_destroy_tises(mdev, res->tisn);
+	if (res->tisn_valid)
+		mlx5e_destroy_tises(mdev, res->tisn);
 	mlx5_free_bfreg(mdev, &res->bfreg);
 	mlx5_core_destroy_mkey(mdev, res->mkey);
 	mlx5_core_dealloc_transport_domain(mdev, res->td.tdn);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
index b5f1c4ca38ba..c8e8f512803e 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
@@ -5992,7 +5992,7 @@ static int mlx5e_resume(struct auxiliary_device *adev)
 	if (netif_device_present(netdev))
 		return 0;
 
-	err = mlx5e_create_mdev_resources(mdev);
+	err = mlx5e_create_mdev_resources(mdev, true);
 	if (err)
 		return err;
 
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/ipoib/ipoib.c b/drivers/net/ethernet/mellanox/mlx5/core/ipoib/ipoib.c
index 58845121954c..d77be1b4dd9c 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/ipoib/ipoib.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/ipoib/ipoib.c
@@ -783,7 +783,7 @@ static int mlx5_rdma_setup_rn(struct ib_device *ibdev, u32 port_num,
 		}
 
 		/* This should only be called once per mdev */
-		err = mlx5e_create_mdev_resources(mdev);
+		err = mlx5e_create_mdev_resources(mdev, false);
 		if (err)
 			goto destroy_ht;
 	}
diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h
index 7ee5b79ff3d6..aafb36c9e5d9 100644
--- a/include/linux/mlx5/driver.h
+++ b/include/linux/mlx5/driver.h
@@ -681,6 +681,7 @@ struct mlx5e_resources {
 		struct mlx5_sq_bfreg       bfreg;
 #define MLX5_MAX_NUM_TC 8
 		u32                        tisn[MLX5_MAX_PORTS][MLX5_MAX_NUM_TC];
+		bool			   tisn_valid;
 	} hw_objs;
 	struct net_device *uplink_netdev;
 	struct mutex uplink_netdev_lock;
-- 
cgit v1.2.3


From e04984a37398b3f4f5a79c993b94c6b1224184cc Mon Sep 17 00:00:00 2001
From: Tariq Toukan <tariqt@nvidia.com>
Date: Tue, 19 Dec 2023 14:46:20 +0200
Subject: net/mlx5: Fix query of sd_group field

The sd_group field moved in the HW spec from the MPIR register
to the vport context.
Align the query accordingly.

Fixes: f5e956329960 ("net/mlx5: Expose Management PCIe Index Register (MPIR)")
Signed-off-by: Tariq Toukan <tariqt@nvidia.com>
Signed-off-by: Saeed Mahameed <saeedm@nvidia.com>
---
 drivers/net/ethernet/mellanox/mlx5/core/vport.c | 21 +++++++++++++++++++++
 include/linux/mlx5/mlx5_ifc.h                   | 10 +++++++---
 include/linux/mlx5/vport.h                      |  1 +
 3 files changed, 29 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/vport.c b/drivers/net/ethernet/mellanox/mlx5/core/vport.c
index 5a31fb47ffa5..c95a84b7db3a 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/vport.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/vport.c
@@ -440,6 +440,27 @@ out:
 }
 EXPORT_SYMBOL_GPL(mlx5_query_nic_vport_system_image_guid);
 
+int mlx5_query_nic_vport_sd_group(struct mlx5_core_dev *mdev, u8 *sd_group)
+{
+	int outlen = MLX5_ST_SZ_BYTES(query_nic_vport_context_out);
+	u32 *out;
+	int err;
+
+	out = kvzalloc(outlen, GFP_KERNEL);
+	if (!out)
+		return -ENOMEM;
+
+	err = mlx5_query_nic_vport_context(mdev, 0, out);
+	if (err)
+		goto out;
+
+	*sd_group = MLX5_GET(query_nic_vport_context_out, out,
+			     nic_vport_context.sd_group);
+out:
+	kvfree(out);
+	return err;
+}
+
 int mlx5_query_nic_vport_node_guid(struct mlx5_core_dev *mdev, u64 *node_guid)
 {
 	u32 *out;
diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h
index fee20fc010c2..bf2d51952e48 100644
--- a/include/linux/mlx5/mlx5_ifc.h
+++ b/include/linux/mlx5/mlx5_ifc.h
@@ -4030,8 +4030,13 @@ struct mlx5_ifc_nic_vport_context_bits {
 	u8	   affiliation_criteria[0x4];
 	u8	   affiliated_vhca_id[0x10];
 
-	u8	   reserved_at_60[0xd0];
+	u8	   reserved_at_60[0xa0];
 
+	u8	   reserved_at_100[0x1];
+	u8         sd_group[0x3];
+	u8	   reserved_at_104[0x1c];
+
+	u8	   reserved_at_120[0x10];
 	u8         mtu[0x10];
 
 	u8         system_image_guid[0x40];
@@ -10116,8 +10121,7 @@ struct mlx5_ifc_mpir_reg_bits {
 	u8         reserved_at_20[0x20];
 
 	u8         local_port[0x8];
-	u8         reserved_at_28[0x15];
-	u8         sd_group[0x3];
+	u8         reserved_at_28[0x18];
 
 	u8         reserved_at_60[0x20];
 };
diff --git a/include/linux/mlx5/vport.h b/include/linux/mlx5/vport.h
index fbb9bf447889..c36cc6d82926 100644
--- a/include/linux/mlx5/vport.h
+++ b/include/linux/mlx5/vport.h
@@ -72,6 +72,7 @@ int mlx5_query_nic_vport_mtu(struct mlx5_core_dev *mdev, u16 *mtu);
 int mlx5_modify_nic_vport_mtu(struct mlx5_core_dev *mdev, u16 mtu);
 int mlx5_query_nic_vport_system_image_guid(struct mlx5_core_dev *mdev,
 					   u64 *system_image_guid);
+int mlx5_query_nic_vport_sd_group(struct mlx5_core_dev *mdev, u8 *sd_group);
 int mlx5_query_nic_vport_node_guid(struct mlx5_core_dev *mdev, u64 *node_guid);
 int mlx5_modify_nic_vport_node_guid(struct mlx5_core_dev *mdev,
 				    u16 vport, u64 node_guid);
-- 
cgit v1.2.3


From c88c49ac9c18fb7c3fa431126de1d8f8f555e912 Mon Sep 17 00:00:00 2001
From: Tariq Toukan <tariqt@nvidia.com>
Date: Tue, 5 Dec 2023 23:54:21 +0200
Subject: net/mlx5: Enable SD feature

Have an actual mlx5_sd instance in the core device, and fix the getter
accordingly. This allows SD stuff to flow, the feature becomes supported
only here.

Signed-off-by: Tariq Toukan <tariqt@nvidia.com>
Reviewed-by: Gal Pressman <gal@nvidia.com>
Signed-off-by: Saeed Mahameed <saeedm@nvidia.com>
---
 drivers/net/ethernet/mellanox/mlx5/core/lib/mlx5.h | 3 ++-
 include/linux/mlx5/driver.h                        | 1 +
 2 files changed, 3 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lib/mlx5.h b/drivers/net/ethernet/mellanox/mlx5/core/lib/mlx5.h
index 0810b92b48d0..37d5f445598c 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/lib/mlx5.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/lib/mlx5.h
@@ -59,10 +59,11 @@ struct mlx5_sd;
 
 static inline struct mlx5_sd *mlx5_get_sd(struct mlx5_core_dev *dev)
 {
-	return NULL;
+	return dev->sd;
 }
 
 static inline void mlx5_set_sd(struct mlx5_core_dev *dev, struct mlx5_sd *sd)
 {
+	dev->sd = sd;
 }
 #endif
diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h
index aafb36c9e5d9..cd286b681970 100644
--- a/include/linux/mlx5/driver.h
+++ b/include/linux/mlx5/driver.h
@@ -822,6 +822,7 @@ struct mlx5_core_dev {
 	struct blocking_notifier_head macsec_nh;
 #endif
 	u64 num_ipsec_offloads;
+	struct mlx5_sd          *sd;
 };
 
 struct mlx5_db {
-- 
cgit v1.2.3


From 22c4640698a1d47606b5a4264a584e8046641784 Mon Sep 17 00:00:00 2001
From: Armen Ratner <armeng@nvidia.com>
Date: Fri, 8 Sep 2023 14:53:09 -0500
Subject: net/mlx5: Implement management PF Ethernet profile

Add management PF modules, which introduce support for the structures
needed to create the resources for the MGMT PF to work.
Also, add the necessary calls and functions to establish this
functionality.

Signed-off-by: Armen Ratner <armeng@nvidia.com>
Reviewed-by: Tariq Toukan <tariqt@nvidia.com>
Signed-off-by: Saeed Mahameed <saeedm@nvidia.com>
Reviewed-by: Daniel Jurgens <danielj@nvidia.com>
---
 drivers/net/ethernet/mellanox/mlx5/core/Makefile   |   2 +-
 drivers/net/ethernet/mellanox/mlx5/core/dev.c      |   3 +
 drivers/net/ethernet/mellanox/mlx5/core/ecpf.c     |   6 +
 drivers/net/ethernet/mellanox/mlx5/core/en.h       |   4 +
 .../net/ethernet/mellanox/mlx5/core/en/mgmt_pf.c   | 268 +++++++++++++++++++++
 drivers/net/ethernet/mellanox/mlx5/core/en_main.c  |  24 +-
 drivers/net/ethernet/mellanox/mlx5/core/eswitch.c  |   2 +-
 include/linux/mlx5/driver.h                        |   8 +
 include/linux/mlx5/mlx5_ifc.h                      |  14 +-
 9 files changed, 323 insertions(+), 8 deletions(-)
 create mode 100644 drivers/net/ethernet/mellanox/mlx5/core/en/mgmt_pf.c

(limited to 'include/linux')

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/Makefile b/drivers/net/ethernet/mellanox/mlx5/core/Makefile
index 76dc5a9b9648..f36232dead1a 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/Makefile
+++ b/drivers/net/ethernet/mellanox/mlx5/core/Makefile
@@ -29,7 +29,7 @@ mlx5_core-$(CONFIG_MLX5_CORE_EN) += en/rqt.o en/tir.o en/rss.o en/rx_res.o \
 		en/reporter_tx.o en/reporter_rx.o en/params.o en/xsk/pool.o \
 		en/xsk/setup.o en/xsk/rx.o en/xsk/tx.o en/devlink.o en/ptp.o \
 		en/qos.o en/htb.o en/trap.o en/fs_tt_redirect.o en/selq.o \
-		lib/crypto.o lib/sd.o
+		en/mgmt_pf.o lib/crypto.o lib/sd.o
 
 #
 # Netdev extra
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/dev.c b/drivers/net/ethernet/mellanox/mlx5/core/dev.c
index cf0477f53dc4..aa1b471e13fa 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/dev.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/dev.c
@@ -190,6 +190,9 @@ bool mlx5_rdma_supported(struct mlx5_core_dev *dev)
 	if (is_mp_supported(dev))
 		return false;
 
+	if (mlx5_core_is_mgmt_pf(dev))
+		return false;
+
 	return true;
 }
 
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/ecpf.c b/drivers/net/ethernet/mellanox/mlx5/core/ecpf.c
index d000236ddbac..aa397e3ebe6d 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/ecpf.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/ecpf.c
@@ -75,6 +75,9 @@ int mlx5_ec_init(struct mlx5_core_dev *dev)
 	if (!mlx5_core_is_ecpf(dev))
 		return 0;
 
+	if (mlx5_core_is_mgmt_pf(dev))
+		return 0;
+
 	return mlx5_host_pf_init(dev);
 }
 
@@ -85,6 +88,9 @@ void mlx5_ec_cleanup(struct mlx5_core_dev *dev)
 	if (!mlx5_core_is_ecpf(dev))
 		return;
 
+	if (mlx5_core_is_mgmt_pf(dev))
+		return;
+
 	mlx5_host_pf_cleanup(dev);
 
 	err = mlx5_wait_for_pages(dev, &dev->priv.page_counters[MLX5_HOST_PF]);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en.h b/drivers/net/ethernet/mellanox/mlx5/core/en.h
index 84db05fb9389..922b63c25154 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en.h
@@ -63,6 +63,7 @@
 #include "lib/sd.h"
 
 extern const struct net_device_ops mlx5e_netdev_ops;
+extern const struct net_device_ops mlx5e_mgmt_netdev_ops;
 struct page_pool;
 
 #define MLX5E_METADATA_ETHER_TYPE (0x8CE4)
@@ -1125,6 +1126,7 @@ static inline bool mlx5_tx_swp_supported(struct mlx5_core_dev *mdev)
 }
 
 extern const struct ethtool_ops mlx5e_ethtool_ops;
+extern const struct mlx5e_profile mlx5e_mgmt_pf_nic_profile;
 
 int mlx5e_create_mkey(struct mlx5_core_dev *mdev, u32 pdn, u32 *mkey);
 int mlx5e_create_mdev_resources(struct mlx5_core_dev *mdev, bool create_tises);
@@ -1230,6 +1232,8 @@ netdev_features_t mlx5e_features_check(struct sk_buff *skb,
 				       struct net_device *netdev,
 				       netdev_features_t features);
 int mlx5e_set_features(struct net_device *netdev, netdev_features_t features);
+void mlx5e_nic_set_rx_mode(struct mlx5e_priv *priv);
+
 #ifdef CONFIG_MLX5_ESWITCH
 int mlx5e_set_vf_mac(struct net_device *dev, int vf, u8 *mac);
 int mlx5e_set_vf_rate(struct net_device *dev, int vf, int min_tx_rate, int max_tx_rate);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/mgmt_pf.c b/drivers/net/ethernet/mellanox/mlx5/core/en/mgmt_pf.c
new file mode 100644
index 000000000000..77b5805895b9
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en/mgmt_pf.c
@@ -0,0 +1,268 @@
+// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB
+// Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+
+#include <linux/kernel.h>
+#include "en/params.h"
+#include "en/health.h"
+#include "lib/eq.h"
+#include "en/dcbnl.h"
+#include "en_accel/ipsec.h"
+#include "en_accel/en_accel.h"
+#include "en/trap.h"
+#include "en/monitor_stats.h"
+#include "en/hv_vhca_stats.h"
+#include "en_rep.h"
+#include "en.h"
+
+static int mgmt_pf_async_event(struct notifier_block *nb, unsigned long event, void *data)
+{
+	struct mlx5e_priv *priv = container_of(nb, struct mlx5e_priv, events_nb);
+	struct mlx5_eqe   *eqe = data;
+
+	if (event != MLX5_EVENT_TYPE_PORT_CHANGE)
+		return NOTIFY_DONE;
+
+	switch (eqe->sub_type) {
+	case MLX5_PORT_CHANGE_SUBTYPE_DOWN:
+	case MLX5_PORT_CHANGE_SUBTYPE_ACTIVE:
+		queue_work(priv->wq, &priv->update_carrier_work);
+		break;
+	default:
+		return NOTIFY_DONE;
+	}
+
+	return NOTIFY_OK;
+}
+
+static void mlx5e_mgmt_pf_enable_async_events(struct mlx5e_priv *priv)
+{
+	priv->events_nb.notifier_call = mgmt_pf_async_event;
+	mlx5_notifier_register(priv->mdev, &priv->events_nb);
+}
+
+static void mlx5e_disable_mgmt_pf_async_events(struct mlx5e_priv *priv)
+{
+	mlx5_notifier_unregister(priv->mdev, &priv->events_nb);
+}
+
+static void mlx5e_modify_mgmt_pf_admin_state(struct mlx5_core_dev *mdev,
+					     enum mlx5_port_status state)
+{
+	struct mlx5_eswitch *esw = mdev->priv.eswitch;
+	int vport_admin_state;
+
+	mlx5_set_port_admin_status(mdev, state);
+
+	if (state == MLX5_PORT_UP)
+		vport_admin_state = MLX5_VPORT_ADMIN_STATE_AUTO;
+	else
+		vport_admin_state = MLX5_VPORT_ADMIN_STATE_DOWN;
+
+	mlx5_eswitch_set_vport_state(esw, MLX5_VPORT_UPLINK, vport_admin_state);
+}
+
+static void mlx5e_build_mgmt_pf_nic_params(struct mlx5e_priv *priv, u16 mtu)
+{
+	struct mlx5e_params *params = &priv->channels.params;
+	struct mlx5_core_dev *mdev = priv->mdev;
+	u8 rx_cq_period_mode;
+
+	params->sw_mtu = mtu;
+	params->hard_mtu = MLX5E_ETH_HARD_MTU;
+	params->num_channels = 1;
+
+	/* SQ */
+	params->log_sq_size = is_kdump_kernel() ?
+		MLX5E_PARAMS_MINIMUM_LOG_SQ_SIZE :
+		MLX5E_PARAMS_DEFAULT_LOG_SQ_SIZE;
+	MLX5E_SET_PFLAG(params, MLX5E_PFLAG_SKB_TX_MPWQE, mlx5e_tx_mpwqe_supported(mdev));
+
+	MLX5E_SET_PFLAG(params, MLX5E_PFLAG_RX_NO_CSUM_COMPLETE, false);
+
+	/* RQ */
+	mlx5e_build_rq_params(mdev, params);
+
+	/* CQ moderation params */
+	rx_cq_period_mode = MLX5_CAP_GEN(mdev, cq_period_start_from_cqe) ?
+			MLX5_CQ_PERIOD_MODE_START_FROM_CQE :
+			MLX5_CQ_PERIOD_MODE_START_FROM_EQE;
+	params->rx_dim_enabled = MLX5_CAP_GEN(mdev, cq_moderation);
+	params->tx_dim_enabled = MLX5_CAP_GEN(mdev, cq_moderation);
+	mlx5e_set_rx_cq_mode_params(params, rx_cq_period_mode);
+	mlx5e_set_tx_cq_mode_params(params, MLX5_CQ_PERIOD_MODE_START_FROM_EQE);
+
+	/* TX inline */
+	mlx5_query_min_inline(mdev, &params->tx_min_inline_mode);
+}
+
+static int mlx5e_mgmt_pf_init(struct mlx5_core_dev *mdev,
+			      struct net_device *netdev)
+{
+	struct mlx5e_priv *priv = netdev_priv(netdev);
+	struct mlx5e_flow_steering *fs;
+	int err;
+
+	mlx5e_build_mgmt_pf_nic_params(priv, netdev->mtu);
+
+	mlx5e_timestamp_init(priv);
+
+	fs = mlx5e_fs_init(priv->profile, mdev,
+			   !test_bit(MLX5E_STATE_DESTROYING, &priv->state),
+			   priv->dfs_root);
+	if (!fs) {
+		err = -ENOMEM;
+		mlx5_core_err(mdev, "FS initialization failed, %d\n", err);
+		return err;
+	}
+	priv->fs = fs;
+
+	mlx5e_health_create_reporters(priv);
+
+	return 0;
+}
+
+static void mlx5e_mgmt_pf_cleanup(struct mlx5e_priv *priv)
+{
+	mlx5e_health_destroy_reporters(priv);
+	mlx5e_fs_cleanup(priv->fs);
+	priv->fs = NULL;
+}
+
+static int mlx5e_mgmt_pf_init_rx(struct mlx5e_priv *priv)
+{
+	struct mlx5_core_dev *mdev = priv->mdev;
+	int err;
+
+	priv->rx_res = mlx5e_rx_res_create(mdev, 0, priv->max_nch, priv->drop_rq.rqn,
+					   &priv->channels.params.packet_merge,
+					   priv->channels.params.num_channels);
+	if (!priv->rx_res)
+		return -ENOMEM;
+
+	mlx5e_create_q_counters(priv);
+
+	err = mlx5e_open_drop_rq(priv, &priv->drop_rq);
+	if (err) {
+		mlx5_core_err(mdev, "open drop rq failed, %d\n", err);
+		goto err_destroy_q_counters;
+	}
+
+	err = mlx5e_create_flow_steering(priv->fs, priv->rx_res, priv->profile,
+					 priv->netdev);
+	if (err) {
+		mlx5_core_warn(mdev, "create flow steering failed, %d\n", err);
+		goto err_destroy_rx_res;
+	}
+
+	return 0;
+
+err_destroy_rx_res:
+	mlx5e_rx_res_destroy(priv->rx_res);
+	priv->rx_res = NULL;
+	mlx5e_close_drop_rq(&priv->drop_rq);
+err_destroy_q_counters:
+	mlx5e_destroy_q_counters(priv);
+	return err;
+}
+
+static void mlx5e_mgmt_pf_cleanup_rx(struct mlx5e_priv *priv)
+{
+	mlx5e_destroy_flow_steering(priv->fs, !!(priv->netdev->hw_features & NETIF_F_NTUPLE),
+				    priv->profile);
+	mlx5e_rx_res_destroy(priv->rx_res);
+	priv->rx_res = NULL;
+	mlx5e_close_drop_rq(&priv->drop_rq);
+	mlx5e_destroy_q_counters(priv);
+}
+
+static int mlx5e_mgmt_pf_init_tx(struct mlx5e_priv *priv)
+{
+	return 0;
+}
+
+static void mlx5e_mgmt_pf_cleanup_tx(struct mlx5e_priv *priv)
+{
+}
+
+static void mlx5e_mgmt_pf_enable(struct mlx5e_priv *priv)
+{
+	struct net_device *netdev = priv->netdev;
+	struct mlx5_core_dev *mdev = priv->mdev;
+
+	mlx5e_fs_init_l2_addr(priv->fs, netdev);
+
+	/* Marking the link as currently not needed by the Driver */
+	if (!netif_running(netdev))
+		mlx5e_modify_mgmt_pf_admin_state(mdev, MLX5_PORT_DOWN);
+
+	mlx5e_set_netdev_mtu_boundaries(priv);
+	mlx5e_set_dev_port_mtu(priv);
+
+	mlx5e_mgmt_pf_enable_async_events(priv);
+	if (mlx5e_monitor_counter_supported(priv))
+		mlx5e_monitor_counter_init(priv);
+
+	mlx5e_hv_vhca_stats_create(priv);
+	if (netdev->reg_state != NETREG_REGISTERED)
+		return;
+	mlx5e_dcbnl_init_app(priv);
+
+	mlx5e_nic_set_rx_mode(priv);
+
+	rtnl_lock();
+	if (netif_running(netdev))
+		mlx5e_open(netdev);
+	udp_tunnel_nic_reset_ntf(priv->netdev);
+	netif_device_attach(netdev);
+	rtnl_unlock();
+}
+
+static void mlx5e_mgmt_pf_disable(struct mlx5e_priv *priv)
+{
+	if (priv->netdev->reg_state == NETREG_REGISTERED)
+		mlx5e_dcbnl_delete_app(priv);
+
+	rtnl_lock();
+	if (netif_running(priv->netdev))
+		mlx5e_close(priv->netdev);
+	netif_device_detach(priv->netdev);
+	rtnl_unlock();
+
+	mlx5e_nic_set_rx_mode(priv);
+
+	mlx5e_hv_vhca_stats_destroy(priv);
+	if (mlx5e_monitor_counter_supported(priv))
+		mlx5e_monitor_counter_cleanup(priv);
+
+	mlx5e_disable_mgmt_pf_async_events(priv);
+	mlx5e_ipsec_cleanup(priv);
+}
+
+static int mlx5e_mgmt_pf_update_rx(struct mlx5e_priv *priv)
+{
+	return mlx5e_refresh_tirs(priv, false, false);
+}
+
+static int mlx5e_mgmt_pf_max_nch_limit(struct mlx5_core_dev *mdev)
+{
+	return 1;
+}
+
+const struct mlx5e_profile mlx5e_mgmt_pf_nic_profile = {
+	.init		   = mlx5e_mgmt_pf_init,
+	.cleanup	   = mlx5e_mgmt_pf_cleanup,
+	.init_rx	   = mlx5e_mgmt_pf_init_rx,
+	.cleanup_rx	   = mlx5e_mgmt_pf_cleanup_rx,
+	.init_tx	   = mlx5e_mgmt_pf_init_tx,
+	.cleanup_tx	   = mlx5e_mgmt_pf_cleanup_tx,
+	.enable		   = mlx5e_mgmt_pf_enable,
+	.disable	   = mlx5e_mgmt_pf_disable,
+	.update_rx	   = mlx5e_mgmt_pf_update_rx,
+	.update_stats	   = mlx5e_stats_update_ndo_stats,
+	.update_carrier	   = mlx5e_update_carrier,
+	.rx_handlers       = &mlx5e_rx_handlers_nic,
+	.max_tc		   = 1,
+	.max_nch_limit	   = mlx5e_mgmt_pf_max_nch_limit,
+	.stats_grps	   = mlx5e_nic_stats_grps,
+	.stats_grps_num	   = mlx5e_nic_stats_grps_num
+};
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
index b8f08d64f66b..40626b6108fb 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
@@ -3799,7 +3799,7 @@ mlx5e_get_stats(struct net_device *dev, struct rtnl_link_stats64 *stats)
 	stats->tx_errors = stats->tx_aborted_errors + stats->tx_carrier_errors;
 }
 
-static void mlx5e_nic_set_rx_mode(struct mlx5e_priv *priv)
+void mlx5e_nic_set_rx_mode(struct mlx5e_priv *priv)
 {
 	if (mlx5e_is_uplink_rep(priv))
 		return; /* no rx mode for uplink rep */
@@ -5004,6 +5004,15 @@ const struct net_device_ops mlx5e_netdev_ops = {
 #endif
 };
 
+const struct net_device_ops mlx5e_mgmt_netdev_ops = {
+	.ndo_open		= mlx5e_open,
+	.ndo_stop		= mlx5e_close,
+	.ndo_start_xmit		= mlx5e_xmit,
+	.ndo_get_stats64	= mlx5e_get_stats,
+	.ndo_change_mtu		= mlx5e_change_nic_mtu,
+	.ndo_set_rx_mode	= mlx5e_set_rx_mode,
+};
+
 static u32 mlx5e_choose_lro_timeout(struct mlx5_core_dev *mdev, u32 wanted_timeout)
 {
 	int i;
@@ -5143,7 +5152,11 @@ static void mlx5e_build_nic_netdev(struct net_device *netdev)
 
 	SET_NETDEV_DEV(netdev, mdev->device);
 
-	netdev->netdev_ops = &mlx5e_netdev_ops;
+	if (mlx5_core_is_mgmt_pf(mdev))
+		netdev->netdev_ops = &mlx5e_mgmt_netdev_ops;
+	else
+		netdev->netdev_ops = &mlx5e_netdev_ops;
+
 	netdev->xdp_metadata_ops = &mlx5e_xdp_metadata_ops;
 	netdev->xsk_tx_metadata_ops = &mlx5e_xsk_tx_metadata_ops;
 
@@ -6094,13 +6107,18 @@ static int mlx5e_suspend(struct auxiliary_device *adev, pm_message_t state)
 static int _mlx5e_probe(struct auxiliary_device *adev)
 {
 	struct mlx5_adev *edev = container_of(adev, struct mlx5_adev, adev);
-	const struct mlx5e_profile *profile = &mlx5e_nic_profile;
 	struct mlx5_core_dev *mdev = edev->mdev;
+	const struct mlx5e_profile *profile;
 	struct mlx5e_dev *mlx5e_dev;
 	struct net_device *netdev;
 	struct mlx5e_priv *priv;
 	int err;
 
+	if (mlx5_core_is_mgmt_pf(mdev))
+		profile = &mlx5e_mgmt_pf_nic_profile;
+	else
+		profile = &mlx5e_nic_profile;
+
 	mlx5e_dev = mlx5e_create_devlink(&adev->dev, mdev);
 	if (IS_ERR(mlx5e_dev))
 		return PTR_ERR(mlx5e_dev);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c
index 3047d7015c52..3bf419d06d53 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c
@@ -1665,7 +1665,7 @@ int mlx5_esw_sf_max_hpf_functions(struct mlx5_core_dev *dev, u16 *max_sfs, u16 *
 	void *hca_caps;
 	int err;
 
-	if (!mlx5_core_is_ecpf(dev)) {
+	if (!mlx5_core_is_ecpf(dev) || mlx5_core_is_mgmt_pf(dev)) {
 		*max_sfs = 0;
 		return 0;
 	}
diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h
index cd286b681970..2bba88c67f58 100644
--- a/include/linux/mlx5/driver.h
+++ b/include/linux/mlx5/driver.h
@@ -1224,6 +1224,14 @@ static inline bool mlx5_core_is_ecpf(const struct mlx5_core_dev *dev)
 	return dev->caps.embedded_cpu;
 }
 
+static inline bool mlx5_core_is_mgmt_pf(const struct mlx5_core_dev *dev)
+{
+	if (!MLX5_CAP_GEN_2(dev, local_mng_port_valid))
+		return false;
+
+	return MLX5_CAP_GEN_2(dev, local_mng_port);
+}
+
 static inline bool
 mlx5_core_is_ecpf_esw_manager(const struct mlx5_core_dev *dev)
 {
diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h
index bf2d51952e48..586569209254 100644
--- a/include/linux/mlx5/mlx5_ifc.h
+++ b/include/linux/mlx5/mlx5_ifc.h
@@ -1954,8 +1954,10 @@ enum {
 struct mlx5_ifc_cmd_hca_cap_2_bits {
 	u8	   reserved_at_0[0x80];
 
-	u8         migratable[0x1];
-	u8         reserved_at_81[0x1f];
+	u8	   migratable[0x1];
+	u8	   reserved_at_81[0x19];
+	u8	   local_mng_port[0x1];
+	u8	   reserved_at_9b[0x5];
 
 	u8	   max_reformat_insert_size[0x8];
 	u8	   max_reformat_insert_offset[0x8];
@@ -1973,7 +1975,13 @@ struct mlx5_ifc_cmd_hca_cap_2_bits {
 
 	u8	   allowed_object_for_other_vhca_access[0x40];
 
-	u8	   reserved_at_140[0x60];
+	u8	   reserved_at_140[0x20];
+
+	u8	   reserved_at_160[0xa];
+	u8	   local_mng_port_valid[0x1];
+	u8	   reserved_at_16b[0x15];
+
+	u8	   reserved_at_180[0x20];
 
 	u8	   flow_table_type_2_type[0x8];
 	u8	   reserved_at_1a8[0x3];
-- 
cgit v1.2.3


From ee9ec49046951eff704752669f0c388b506ddbdf Mon Sep 17 00:00:00 2001
From: Paul Cercueil <paul@crapouillou.net>
Date: Tue, 19 Dec 2023 18:50:02 +0100
Subject: iio: buffer-dma: Get rid of outgoing queue

The buffer-dma code was using two queues, incoming and outgoing, to
manage the state of the blocks in use.

While this totally works, it adds some complexity to the code,
especially since the code only manages 2 blocks. It is much easier to
just check each block's state manually, and keep a counter for the next
block to dequeue.

Since the new DMABUF based API wouldn't use the outgoing queue anyway,
getting rid of it now makes the upcoming changes simpler.

With this change, the IIO_BLOCK_STATE_DEQUEUED is now useless, and can
be removed.

Signed-off-by: Paul Cercueil <paul@crapouillou.net>
Link: https://lore.kernel.org/r/20231219175009.65482-2-paul@crapouillou.net
Signed-off-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
---
 drivers/iio/buffer/industrialio-buffer-dma.c | 44 +++++++++++++++-------------
 include/linux/iio/buffer-dma.h               |  7 ++---
 2 files changed, 26 insertions(+), 25 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/iio/buffer/industrialio-buffer-dma.c b/drivers/iio/buffer/industrialio-buffer-dma.c
index d348af8b9705..1fc91467d1aa 100644
--- a/drivers/iio/buffer/industrialio-buffer-dma.c
+++ b/drivers/iio/buffer/industrialio-buffer-dma.c
@@ -179,7 +179,7 @@ static struct iio_dma_buffer_block *iio_dma_buffer_alloc_block(
 	}
 
 	block->size = size;
-	block->state = IIO_BLOCK_STATE_DEQUEUED;
+	block->state = IIO_BLOCK_STATE_DONE;
 	block->queue = queue;
 	INIT_LIST_HEAD(&block->head);
 	kref_init(&block->kref);
@@ -191,16 +191,8 @@ static struct iio_dma_buffer_block *iio_dma_buffer_alloc_block(
 
 static void _iio_dma_buffer_block_done(struct iio_dma_buffer_block *block)
 {
-	struct iio_dma_buffer_queue *queue = block->queue;
-
-	/*
-	 * The buffer has already been freed by the application, just drop the
-	 * reference.
-	 */
-	if (block->state != IIO_BLOCK_STATE_DEAD) {
+	if (block->state != IIO_BLOCK_STATE_DEAD)
 		block->state = IIO_BLOCK_STATE_DONE;
-		list_add_tail(&block->head, &queue->outgoing);
-	}
 }
 
 /**
@@ -261,7 +253,6 @@ static bool iio_dma_block_reusable(struct iio_dma_buffer_block *block)
 	 * not support abort and has not given back the block yet.
 	 */
 	switch (block->state) {
-	case IIO_BLOCK_STATE_DEQUEUED:
 	case IIO_BLOCK_STATE_QUEUED:
 	case IIO_BLOCK_STATE_DONE:
 		return true;
@@ -317,7 +308,6 @@ int iio_dma_buffer_request_update(struct iio_buffer *buffer)
 	 * dead. This means we can reset the lists without having to fear
 	 * corrution.
 	 */
-	INIT_LIST_HEAD(&queue->outgoing);
 	spin_unlock_irq(&queue->list_lock);
 
 	INIT_LIST_HEAD(&queue->incoming);
@@ -456,14 +446,20 @@ static struct iio_dma_buffer_block *iio_dma_buffer_dequeue(
 	struct iio_dma_buffer_queue *queue)
 {
 	struct iio_dma_buffer_block *block;
+	unsigned int idx;
 
 	spin_lock_irq(&queue->list_lock);
-	block = list_first_entry_or_null(&queue->outgoing, struct
-		iio_dma_buffer_block, head);
-	if (block != NULL) {
-		list_del(&block->head);
-		block->state = IIO_BLOCK_STATE_DEQUEUED;
+
+	idx = queue->fileio.next_dequeue;
+	block = queue->fileio.blocks[idx];
+
+	if (block->state == IIO_BLOCK_STATE_DONE) {
+		idx = (idx + 1) % ARRAY_SIZE(queue->fileio.blocks);
+		queue->fileio.next_dequeue = idx;
+	} else {
+		block = NULL;
 	}
+
 	spin_unlock_irq(&queue->list_lock);
 
 	return block;
@@ -539,6 +535,7 @@ size_t iio_dma_buffer_data_available(struct iio_buffer *buf)
 	struct iio_dma_buffer_queue *queue = iio_buffer_to_queue(buf);
 	struct iio_dma_buffer_block *block;
 	size_t data_available = 0;
+	unsigned int i;
 
 	/*
 	 * For counting the available bytes we'll use the size of the block not
@@ -552,8 +549,15 @@ size_t iio_dma_buffer_data_available(struct iio_buffer *buf)
 		data_available += queue->fileio.active_block->size;
 
 	spin_lock_irq(&queue->list_lock);
-	list_for_each_entry(block, &queue->outgoing, head)
-		data_available += block->size;
+
+	for (i = 0; i < ARRAY_SIZE(queue->fileio.blocks); i++) {
+		block = queue->fileio.blocks[i];
+
+		if (block != queue->fileio.active_block
+		    && block->state == IIO_BLOCK_STATE_DONE)
+			data_available += block->size;
+	}
+
 	spin_unlock_irq(&queue->list_lock);
 	mutex_unlock(&queue->lock);
 
@@ -617,7 +621,6 @@ int iio_dma_buffer_init(struct iio_dma_buffer_queue *queue,
 	queue->ops = ops;
 
 	INIT_LIST_HEAD(&queue->incoming);
-	INIT_LIST_HEAD(&queue->outgoing);
 
 	mutex_init(&queue->lock);
 	spin_lock_init(&queue->list_lock);
@@ -645,7 +648,6 @@ void iio_dma_buffer_exit(struct iio_dma_buffer_queue *queue)
 			continue;
 		queue->fileio.blocks[i]->state = IIO_BLOCK_STATE_DEAD;
 	}
-	INIT_LIST_HEAD(&queue->outgoing);
 	spin_unlock_irq(&queue->list_lock);
 
 	INIT_LIST_HEAD(&queue->incoming);
diff --git a/include/linux/iio/buffer-dma.h b/include/linux/iio/buffer-dma.h
index 6564bdcdac66..18d3702fa95d 100644
--- a/include/linux/iio/buffer-dma.h
+++ b/include/linux/iio/buffer-dma.h
@@ -19,14 +19,12 @@ struct device;
 
 /**
  * enum iio_block_state - State of a struct iio_dma_buffer_block
- * @IIO_BLOCK_STATE_DEQUEUED: Block is not queued
  * @IIO_BLOCK_STATE_QUEUED: Block is on the incoming queue
  * @IIO_BLOCK_STATE_ACTIVE: Block is currently being processed by the DMA
  * @IIO_BLOCK_STATE_DONE: Block is on the outgoing queue
  * @IIO_BLOCK_STATE_DEAD: Block has been marked as to be freed
  */
 enum iio_block_state {
-	IIO_BLOCK_STATE_DEQUEUED,
 	IIO_BLOCK_STATE_QUEUED,
 	IIO_BLOCK_STATE_ACTIVE,
 	IIO_BLOCK_STATE_DONE,
@@ -73,12 +71,15 @@ struct iio_dma_buffer_block {
  * @active_block: Block being used in read()
  * @pos: Read offset in the active block
  * @block_size: Size of each block
+ * @next_dequeue: index of next block that will be dequeued
  */
 struct iio_dma_buffer_queue_fileio {
 	struct iio_dma_buffer_block *blocks[2];
 	struct iio_dma_buffer_block *active_block;
 	size_t pos;
 	size_t block_size;
+
+	unsigned int next_dequeue;
 };
 
 /**
@@ -93,7 +94,6 @@ struct iio_dma_buffer_queue_fileio {
  *   list and typically also a list of active blocks in the part that handles
  *   the DMA controller
  * @incoming: List of buffers on the incoming queue
- * @outgoing: List of buffers on the outgoing queue
  * @active: Whether the buffer is currently active
  * @fileio: FileIO state
  */
@@ -105,7 +105,6 @@ struct iio_dma_buffer_queue {
 	struct mutex lock;
 	spinlock_t list_lock;
 	struct list_head incoming;
-	struct list_head outgoing;
 
 	bool active;
 
-- 
cgit v1.2.3


From 3efdc78fdc21ab82694707eb234ab93f28d13ba8 Mon Sep 17 00:00:00 2001
From: Andrei Vagin <avagin@google.com>
Date: Wed, 13 Dec 2023 22:44:38 -0800
Subject: fs/proc: show correct device and inode numbers in /proc/pid/maps

/proc/pid/maps shows device and inode numbers of vma->vm_file-s. Here is
an issue. If a mapped file is on a stackable file system (e.g.,
overlayfs), vma->vm_file is a backing file whose f_inode is on the
underlying filesystem. To show correct numbers, we need to get a user
file and shows its numbers. The same trick is used to show file paths in
/proc/pid/maps.

Cc: Alexander Mikhalitsyn <alexander@mihalicyn.com>
Suggested-by: Amir Goldstein <amir73il@gmail.com>
Signed-off-by: Andrei Vagin <avagin@google.com>
Link: https://lore.kernel.org/r/20231214064439.1023011-1-avagin@google.com
Reviewed-by: Amir Goldstein <amir73il@gmail.com>
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 fs/proc/task_mmu.c |  3 ++-
 include/linux/fs.h | 18 +++++++++++++-----
 2 files changed, 15 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index ef2eb12906da..5de0f70e52ff 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -273,7 +273,8 @@ show_map_vma(struct seq_file *m, struct vm_area_struct *vma)
 	const char *name = NULL;
 
 	if (file) {
-		struct inode *inode = file_inode(vma->vm_file);
+		const struct inode *inode = file_user_inode(vma->vm_file);
+
 		dev = inode->i_sb->s_dev;
 		ino = inode->i_ino;
 		pgoff = ((loff_t)vma->vm_pgoff) << PAGE_SHIFT;
diff --git a/include/linux/fs.h b/include/linux/fs.h
index f171505940ff..a3a48a5d8728 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -2523,20 +2523,28 @@ struct file *backing_file_open(const struct path *user_path, int flags,
 struct path *backing_file_user_path(struct file *f);
 
 /*
- * file_user_path - get the path to display for memory mapped file
- *
  * When mmapping a file on a stackable filesystem (e.g., overlayfs), the file
  * stored in ->vm_file is a backing file whose f_inode is on the underlying
- * filesystem.  When the mapped file path is displayed to user (e.g. via
- * /proc/<pid>/maps), this helper should be used to get the path to display
- * to the user, which is the path of the fd that user has requested to map.
+ * filesystem.  When the mapped file path and inode number are displayed to
+ * user (e.g. via /proc/<pid>/maps), these helpers should be used to get the
+ * path and inode number to display to the user, which is the path of the fd
+ * that user has requested to map and the inode number that would be returned
+ * by fstat() on that same fd.
  */
+/* Get the path to display in /proc/<pid>/maps */
 static inline const struct path *file_user_path(struct file *f)
 {
 	if (unlikely(f->f_mode & FMODE_BACKING))
 		return backing_file_user_path(f);
 	return &f->f_path;
 }
+/* Get the inode whose inode number to display in /proc/<pid>/maps */
+static inline const struct inode *file_user_inode(struct file *f)
+{
+	if (unlikely(f->f_mode & FMODE_BACKING))
+		return d_inode(backing_file_user_path(f)->dentry);
+	return file_inode(f);
+}
 
 static inline struct file *file_clone_open(struct file *file)
 {
-- 
cgit v1.2.3


From 5ae81209491ed3718fee798db6fb2cc81214824c Mon Sep 17 00:00:00 2001
From: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Date: Tue, 19 Dec 2023 14:52:36 +0100
Subject: driver core: bus: make bus_sort_breadthfirst() take a const pointer

For some reason, during the big "clean up the driver core for a const
struct bus_type" work, the bus_sort_breadthfirst() call was missed.  Fix
this up by changing the type to be a const * as it should be.

Cc: Rafael J. Wysocki <rafael@kernel.org>
Link: https://lore.kernel.org/r/2023121935-stinking-ditzy-fd5d@gregkh
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/base/bus.c         | 2 +-
 include/linux/device/bus.h | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/base/bus.c b/drivers/base/bus.c
index 84a21084d67d..33d952695927 100644
--- a/drivers/base/bus.c
+++ b/drivers/base/bus.c
@@ -1030,7 +1030,7 @@ static void device_insertion_sort_klist(struct device *a, struct list_head *list
 	list_move_tail(&a->p->knode_bus.n_node, list);
 }
 
-void bus_sort_breadthfirst(struct bus_type *bus,
+void bus_sort_breadthfirst(const struct bus_type *bus,
 			   int (*compare)(const struct device *a,
 					  const struct device *b))
 {
diff --git a/include/linux/device/bus.h b/include/linux/device/bus.h
index ae10c4322754..25127f750349 100644
--- a/include/linux/device/bus.h
+++ b/include/linux/device/bus.h
@@ -232,7 +232,7 @@ bus_find_device_by_acpi_dev(const struct bus_type *bus, const void *adev)
 
 int bus_for_each_drv(const struct bus_type *bus, struct device_driver *start,
 		     void *data, int (*fn)(struct device_driver *, void *));
-void bus_sort_breadthfirst(struct bus_type *bus,
+void bus_sort_breadthfirst(const struct bus_type *bus,
 			   int (*compare)(const struct device *a,
 					  const struct device *b));
 /*
-- 
cgit v1.2.3


From 32f78abe59c740b6ec34c89dc10a09208eae7e1f Mon Sep 17 00:00:00 2001
From: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Date: Tue, 19 Dec 2023 14:15:09 +0100
Subject: driver core: bus: constantify subsys_register() calls

The functions subsys_register() and subsys_virtual_register() should be
taking a constant pointer to a struct bus_type, as they do not actually
modify anything in it, so fix up the function definitions to do so
properly.

This also changes the pointer type in struct subsys_interface to be
constant as well, as again, that's the proper signature of it.

Cc: Rafael J. Wysocki <rafael@kernel.org>
Reviewed-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Link: https://lore.kernel.org/r/2023121908-grove-genetics-f8af@gregkh
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/base/bus.c     | 6 +++---
 include/linux/device.h | 6 +++---
 2 files changed, 6 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/base/bus.c b/drivers/base/bus.c
index 33d952695927..daee55c9b2d9 100644
--- a/drivers/base/bus.c
+++ b/drivers/base/bus.c
@@ -1194,7 +1194,7 @@ static void system_root_device_release(struct device *dev)
 	kfree(dev);
 }
 
-static int subsys_register(struct bus_type *subsys,
+static int subsys_register(const struct bus_type *subsys,
 			   const struct attribute_group **groups,
 			   struct kobject *parent_of_root)
 {
@@ -1264,7 +1264,7 @@ err_sp:
  * directory itself and not some create fake root-device placed in
  * /sys/devices/system/<name>.
  */
-int subsys_system_register(struct bus_type *subsys,
+int subsys_system_register(const struct bus_type *subsys,
 			   const struct attribute_group **groups)
 {
 	return subsys_register(subsys, groups, &system_kset->kobj);
@@ -1282,7 +1282,7 @@ EXPORT_SYMBOL_GPL(subsys_system_register);
  * There's no restriction on device naming.  This is for kernel software
  * constructs which need sysfs interface.
  */
-int subsys_virtual_register(struct bus_type *subsys,
+int subsys_virtual_register(const struct bus_type *subsys,
 			    const struct attribute_group **groups)
 {
 	struct kobject *virtual_dir;
diff --git a/include/linux/device.h b/include/linux/device.h
index 4aa34c8d1361..aefc5ca7f1cf 100644
--- a/include/linux/device.h
+++ b/include/linux/device.h
@@ -63,7 +63,7 @@ struct msi_device_data;
  */
 struct subsys_interface {
 	const char *name;
-	struct bus_type *subsys;
+	const struct bus_type *subsys;
 	struct list_head node;
 	int (*add_dev)(struct device *dev, struct subsys_interface *sif);
 	void (*remove_dev)(struct device *dev, struct subsys_interface *sif);
@@ -72,9 +72,9 @@ struct subsys_interface {
 int subsys_interface_register(struct subsys_interface *sif);
 void subsys_interface_unregister(struct subsys_interface *sif);
 
-int subsys_system_register(struct bus_type *subsys,
+int subsys_system_register(const struct bus_type *subsys,
 			   const struct attribute_group **groups);
-int subsys_virtual_register(struct bus_type *subsys,
+int subsys_virtual_register(const struct bus_type *subsys,
 			    const struct attribute_group **groups);
 
 /*
-- 
cgit v1.2.3


From dedb868994d8308c6c4650203e190ec619005806 Mon Sep 17 00:00:00 2001
From: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Date: Tue, 19 Dec 2023 16:03:20 +0100
Subject: driver core: container: make container_subsys const

Now that the driver core can properly handle constant struct bus_type,
move the container_subsys variable to be a constant structure as well,
placing it into read-only memory which can not be modified at runtime.

Cc: "Rafael J. Wysocki" <rafael@kernel.org>
Link: https://lore.kernel.org/r/2023121919-chatter-grumbling-9ef3@gregkh
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/base/container.c  | 2 +-
 include/linux/container.h | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/base/container.c b/drivers/base/container.c
index 1ba42d2d3532..f40588ebc3f5 100644
--- a/drivers/base/container.c
+++ b/drivers/base/container.c
@@ -24,7 +24,7 @@ static int container_offline(struct device *dev)
 	return cdev->offline ? cdev->offline(cdev) : 0;
 }
 
-struct bus_type container_subsys = {
+const struct bus_type container_subsys = {
 	.name = CONTAINER_BUS_NAME,
 	.dev_name = CONTAINER_BUS_NAME,
 	.online = trivial_online,
diff --git a/include/linux/container.h b/include/linux/container.h
index 2566a1baa736..dd00cc918a92 100644
--- a/include/linux/container.h
+++ b/include/linux/container.h
@@ -12,7 +12,7 @@
 #include <linux/device.h>
 
 /* drivers/base/power/container.c */
-extern struct bus_type container_subsys;
+extern const struct bus_type container_subsys;
 
 struct container_dev {
 	struct device dev;
-- 
cgit v1.2.3


From 39299bdd2546688d92ed9db4948f6219ca1b9542 Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Sat, 9 Dec 2023 00:41:55 +0000
Subject: keys, dns: Allow key types (eg. DNS) to be reclaimed immediately on
 expiry

If a key has an expiration time, then when that time passes, the key is
left around for a certain amount of time before being collected (5 mins by
default) so that EKEYEXPIRED can be returned instead of ENOKEY.  This is a
problem for DNS keys because we want to redo the DNS lookup immediately at
that point.

Fix this by allowing key types to be marked such that keys of that type
don't have this extra period, but are reclaimed as soon as they expire and
turn this on for dns_resolver-type keys.  To make this easier to handle,
key->expiry is changed to be permanent if TIME64_MAX rather than 0.

Furthermore, give such new-style negative DNS results a 1s default expiry
if no other expiry time is set rather than allowing it to stick around
indefinitely.  This shouldn't be zero as ls will follow a failing stat call
immediately with a second with AT_SYMLINK_NOFOLLOW added.

Fixes: 1a4240f4764a ("DNS: Separate out CIFS DNS Resolver code")
Signed-off-by: David Howells <dhowells@redhat.com>
Tested-by: Markus Suvanto <markus.suvanto@gmail.com>
cc: Wang Lei <wang840925@gmail.com>
cc: Jeff Layton <jlayton@redhat.com>
cc: Steve French <smfrench@gmail.com>
cc: Marc Dionne <marc.dionne@auristor.com>
cc: Jarkko Sakkinen <jarkko@kernel.org>
cc: "David S. Miller" <davem@davemloft.net>
cc: Eric Dumazet <edumazet@google.com>
cc: Jakub Kicinski <kuba@kernel.org>
cc: Paolo Abeni <pabeni@redhat.com>
cc: linux-afs@lists.infradead.org
cc: linux-cifs@vger.kernel.org
cc: linux-nfs@vger.kernel.org
cc: ceph-devel@vger.kernel.org
cc: keyrings@vger.kernel.org
cc: netdev@vger.kernel.org
---
 include/linux/key-type.h   |  1 +
 net/dns_resolver/dns_key.c | 10 +++++++++-
 security/keys/gc.c         | 31 +++++++++++++++++++++----------
 security/keys/internal.h   | 11 ++++++++++-
 security/keys/key.c        | 15 +++++----------
 security/keys/proc.c       |  2 +-
 6 files changed, 47 insertions(+), 23 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/key-type.h b/include/linux/key-type.h
index 7d985a1dfe4a..5caf3ce82373 100644
--- a/include/linux/key-type.h
+++ b/include/linux/key-type.h
@@ -73,6 +73,7 @@ struct key_type {
 
 	unsigned int flags;
 #define KEY_TYPE_NET_DOMAIN	0x00000001 /* Keys of this type have a net namespace domain */
+#define KEY_TYPE_INSTANT_REAP	0x00000002 /* Keys of this type don't have a delay after expiring */
 
 	/* vet a description */
 	int (*vet_description)(const char *description);
diff --git a/net/dns_resolver/dns_key.c b/net/dns_resolver/dns_key.c
index 01e54b46ae0b..2a6d363763a2 100644
--- a/net/dns_resolver/dns_key.c
+++ b/net/dns_resolver/dns_key.c
@@ -91,6 +91,7 @@ const struct cred *dns_resolver_cache;
 static int
 dns_resolver_preparse(struct key_preparsed_payload *prep)
 {
+	const struct dns_server_list_v1_header *v1;
 	const struct dns_payload_header *bin;
 	struct user_key_payload *upayload;
 	unsigned long derrno;
@@ -122,6 +123,13 @@ dns_resolver_preparse(struct key_preparsed_payload *prep)
 			return -EINVAL;
 		}
 
+		v1 = (const struct dns_server_list_v1_header *)bin;
+		if ((v1->status != DNS_LOOKUP_GOOD &&
+		     v1->status != DNS_LOOKUP_GOOD_WITH_BAD)) {
+			if (prep->expiry == TIME64_MAX)
+				prep->expiry = ktime_get_real_seconds() + 1;
+		}
+
 		result_len = datalen;
 		goto store_result;
 	}
@@ -314,7 +322,7 @@ static long dns_resolver_read(const struct key *key,
 
 struct key_type key_type_dns_resolver = {
 	.name		= "dns_resolver",
-	.flags		= KEY_TYPE_NET_DOMAIN,
+	.flags		= KEY_TYPE_NET_DOMAIN | KEY_TYPE_INSTANT_REAP,
 	.preparse	= dns_resolver_preparse,
 	.free_preparse	= dns_resolver_free_preparse,
 	.instantiate	= generic_key_instantiate,
diff --git a/security/keys/gc.c b/security/keys/gc.c
index 3c90807476eb..eaddaceda14e 100644
--- a/security/keys/gc.c
+++ b/security/keys/gc.c
@@ -66,6 +66,19 @@ void key_schedule_gc(time64_t gc_at)
 	}
 }
 
+/*
+ * Set the expiration time on a key.
+ */
+void key_set_expiry(struct key *key, time64_t expiry)
+{
+	key->expiry = expiry;
+	if (expiry != TIME64_MAX) {
+		if (!(key->type->flags & KEY_TYPE_INSTANT_REAP))
+			expiry += key_gc_delay;
+		key_schedule_gc(expiry);
+	}
+}
+
 /*
  * Schedule a dead links collection run.
  */
@@ -176,7 +189,6 @@ static void key_garbage_collector(struct work_struct *work)
 	static u8 gc_state;		/* Internal persistent state */
 #define KEY_GC_REAP_AGAIN	0x01	/* - Need another cycle */
 #define KEY_GC_REAPING_LINKS	0x02	/* - We need to reap links */
-#define KEY_GC_SET_TIMER	0x04	/* - We need to restart the timer */
 #define KEY_GC_REAPING_DEAD_1	0x10	/* - We need to mark dead keys */
 #define KEY_GC_REAPING_DEAD_2	0x20	/* - We need to reap dead key links */
 #define KEY_GC_REAPING_DEAD_3	0x40	/* - We need to reap dead keys */
@@ -184,21 +196,17 @@ static void key_garbage_collector(struct work_struct *work)
 
 	struct rb_node *cursor;
 	struct key *key;
-	time64_t new_timer, limit;
+	time64_t new_timer, limit, expiry;
 
 	kenter("[%lx,%x]", key_gc_flags, gc_state);
 
 	limit = ktime_get_real_seconds();
-	if (limit > key_gc_delay)
-		limit -= key_gc_delay;
-	else
-		limit = key_gc_delay;
 
 	/* Work out what we're going to be doing in this pass */
 	gc_state &= KEY_GC_REAPING_DEAD_1 | KEY_GC_REAPING_DEAD_2;
 	gc_state <<= 1;
 	if (test_and_clear_bit(KEY_GC_KEY_EXPIRED, &key_gc_flags))
-		gc_state |= KEY_GC_REAPING_LINKS | KEY_GC_SET_TIMER;
+		gc_state |= KEY_GC_REAPING_LINKS;
 
 	if (test_and_clear_bit(KEY_GC_REAP_KEYTYPE, &key_gc_flags))
 		gc_state |= KEY_GC_REAPING_DEAD_1;
@@ -233,8 +241,11 @@ continue_scanning:
 			}
 		}
 
-		if (gc_state & KEY_GC_SET_TIMER) {
-			if (key->expiry > limit && key->expiry < new_timer) {
+		expiry = key->expiry;
+		if (expiry != TIME64_MAX) {
+			if (!(key->type->flags & KEY_TYPE_INSTANT_REAP))
+				expiry += key_gc_delay;
+			if (expiry > limit && expiry < new_timer) {
 				kdebug("will expire %x in %lld",
 				       key_serial(key), key->expiry - limit);
 				new_timer = key->expiry;
@@ -276,7 +287,7 @@ maybe_resched:
 	 */
 	kdebug("pass complete");
 
-	if (gc_state & KEY_GC_SET_TIMER && new_timer != (time64_t)TIME64_MAX) {
+	if (new_timer != TIME64_MAX) {
 		new_timer += key_gc_delay;
 		key_schedule_gc(new_timer);
 	}
diff --git a/security/keys/internal.h b/security/keys/internal.h
index 471cf36dedc0..2cffa6dc8255 100644
--- a/security/keys/internal.h
+++ b/security/keys/internal.h
@@ -167,6 +167,7 @@ extern unsigned key_gc_delay;
 extern void keyring_gc(struct key *keyring, time64_t limit);
 extern void keyring_restriction_gc(struct key *keyring,
 				   struct key_type *dead_type);
+void key_set_expiry(struct key *key, time64_t expiry);
 extern void key_schedule_gc(time64_t gc_at);
 extern void key_schedule_gc_links(void);
 extern void key_gc_keytype(struct key_type *ktype);
@@ -215,10 +216,18 @@ extern struct key *key_get_instantiation_authkey(key_serial_t target_id);
  */
 static inline bool key_is_dead(const struct key *key, time64_t limit)
 {
+	time64_t expiry = key->expiry;
+
+	if (expiry != TIME64_MAX) {
+		if (!(key->type->flags & KEY_TYPE_INSTANT_REAP))
+			expiry += key_gc_delay;
+		if (expiry <= limit)
+			return true;
+	}
+
 	return
 		key->flags & ((1 << KEY_FLAG_DEAD) |
 			      (1 << KEY_FLAG_INVALIDATED)) ||
-		(key->expiry > 0 && key->expiry <= limit) ||
 		key->domain_tag->removed;
 }
 
diff --git a/security/keys/key.c b/security/keys/key.c
index 0260a1902922..5b10641debd5 100644
--- a/security/keys/key.c
+++ b/security/keys/key.c
@@ -294,6 +294,7 @@ struct key *key_alloc(struct key_type *type, const char *desc,
 	key->uid = uid;
 	key->gid = gid;
 	key->perm = perm;
+	key->expiry = TIME64_MAX;
 	key->restrict_link = restrict_link;
 	key->last_used_at = ktime_get_real_seconds();
 
@@ -463,10 +464,7 @@ static int __key_instantiate_and_link(struct key *key,
 			if (authkey)
 				key_invalidate(authkey);
 
-			if (prep->expiry != TIME64_MAX) {
-				key->expiry = prep->expiry;
-				key_schedule_gc(prep->expiry + key_gc_delay);
-			}
+			key_set_expiry(key, prep->expiry);
 		}
 	}
 
@@ -606,8 +604,7 @@ int key_reject_and_link(struct key *key,
 		atomic_inc(&key->user->nikeys);
 		mark_key_instantiated(key, -error);
 		notify_key(key, NOTIFY_KEY_INSTANTIATED, -error);
-		key->expiry = ktime_get_real_seconds() + timeout;
-		key_schedule_gc(key->expiry + key_gc_delay);
+		key_set_expiry(key, ktime_get_real_seconds() + timeout);
 
 		if (test_and_clear_bit(KEY_FLAG_USER_CONSTRUCT, &key->flags))
 			awaken = 1;
@@ -723,16 +720,14 @@ found_kernel_type:
 
 void key_set_timeout(struct key *key, unsigned timeout)
 {
-	time64_t expiry = 0;
+	time64_t expiry = TIME64_MAX;
 
 	/* make the changes with the locks held to prevent races */
 	down_write(&key->sem);
 
 	if (timeout > 0)
 		expiry = ktime_get_real_seconds() + timeout;
-
-	key->expiry = expiry;
-	key_schedule_gc(key->expiry + key_gc_delay);
+	key_set_expiry(key, expiry);
 
 	up_write(&key->sem);
 }
diff --git a/security/keys/proc.c b/security/keys/proc.c
index d0cde6685627..4f4e2c1824f1 100644
--- a/security/keys/proc.c
+++ b/security/keys/proc.c
@@ -198,7 +198,7 @@ static int proc_keys_show(struct seq_file *m, void *v)
 
 	/* come up with a suitable timeout value */
 	expiry = READ_ONCE(key->expiry);
-	if (expiry == 0) {
+	if (expiry == TIME64_MAX) {
 		memcpy(xbuf, "perm", 5);
 	} else if (now >= expiry) {
 		memcpy(xbuf, "expd", 5);
-- 
cgit v1.2.3


From 4515d08a742c76612b65d2f47a87d12860519842 Mon Sep 17 00:00:00 2001
From: Marco Pagani <marpagan@redhat.com>
Date: Thu, 21 Dec 2023 17:58:47 +0100
Subject: kernel/module: improve documentation for try_module_get()

The sentence "this call will fail if the module is already being
removed" is potentially confusing and may contradict the rest of the
documentation. If one tries to get a module that has already been
removed using a stale pointer, the kernel will crash.

Signed-off-by: Marco Pagani <marpagan@redhat.com>
Signed-off-by: Luis Chamberlain <mcgrof@kernel.org>
---
 include/linux/module.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/module.h b/include/linux/module.h
index a98e188cf37b..08364d5cbc07 100644
--- a/include/linux/module.h
+++ b/include/linux/module.h
@@ -668,7 +668,7 @@ extern void __module_get(struct module *module);
  * @module: the module we should check for
  *
  * Only try to get a module reference count if the module is not being removed.
- * This call will fail if the module is already being removed.
+ * This call will fail if the module is in the process of being removed.
  *
  * Care must also be taken to ensure the module exists and is alive prior to
  * usage of this call. This can be gauranteed through two means:
-- 
cgit v1.2.3


From 645f3d85129d8aac3b896ba685fbc20a31c2c036 Mon Sep 17 00:00:00 2001
From: Mukesh Sisodiya <mukesh.sisodiya@intel.com>
Date: Wed, 20 Dec 2023 13:41:38 +0200
Subject: wifi: cfg80211: handle UHB AP and STA power type

UHB AP send supported power type(LPI, SP, VLP)
in beacon and probe response IE and STA should
connect to these AP only if their regulatory support
the AP power type.

Beacon/Probe response are reported to userspace
with reason "STA regulatory not supporting to connect to AP
based on transmitted power type" and it should
not connect to AP.

Signed-off-by: Mukesh Sisodiya <mukesh.sisodiya@intel.com>
Reviewed-by: Gregory Greenman <gregory.greenman@intel.com>
Signed-off-by: Miri Korenblit <miriam.rachel.korenblit@intel.com>
Link: https://msgid.link/20231220133549.cbfbef9170a9.I432f78438de18aa9f5c9006be12e41dc34cc47c5@changeid
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/linux/ieee80211.h    |  1 +
 include/net/cfg80211.h       |  6 ++++++
 include/uapi/linux/nl80211.h | 13 +++++++++++++
 net/wireless/nl80211.c       |  6 ++++++
 net/wireless/reg.c           |  4 ++++
 net/wireless/scan.c          | 38 ++++++++++++++++++++++++++++++++++++++
 6 files changed, 68 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h
index 8ad008591e32..2f5554482047 100644
--- a/include/linux/ieee80211.h
+++ b/include/linux/ieee80211.h
@@ -2720,6 +2720,7 @@ static inline bool ieee80211_he_capa_size_ok(const u8 *data, u8 len)
 
 #define IEEE80211_6GHZ_CTRL_REG_LPI_AP	0
 #define IEEE80211_6GHZ_CTRL_REG_SP_AP	1
+#define IEEE80211_6GHZ_CTRL_REG_VLP_AP	2
 
 /**
  * struct ieee80211_he_6ghz_oper - HE 6 GHz operation Information field
diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h
index 745974d45ea4..cf79656ce09c 100644
--- a/include/net/cfg80211.h
+++ b/include/net/cfg80211.h
@@ -118,6 +118,10 @@ struct wiphy;
  *	restrictions.
  * @IEEE80211_CHAN_NO_EHT: EHT operation is not permitted on this channel.
  * @IEEE80211_CHAN_DFS_CONCURRENT: See %NL80211_RRF_DFS_CONCURRENT
+ * @IEEE80211_CHAN_NO_UHB_VLP_CLIENT: Client connection with VLP AP
+ *	not permitted using this channel
+ * @IEEE80211_CHAN_NO_UHB_AFC_CLIENT: Client connection with AFC AP
+ *	not permitted using this channel
  */
 enum ieee80211_channel_flags {
 	IEEE80211_CHAN_DISABLED		= 1<<0,
@@ -142,6 +146,8 @@ enum ieee80211_channel_flags {
 	IEEE80211_CHAN_NO_320MHZ	= 1<<19,
 	IEEE80211_CHAN_NO_EHT		= 1<<20,
 	IEEE80211_CHAN_DFS_CONCURRENT	= 1<<21,
+	IEEE80211_CHAN_NO_UHB_VLP_CLIENT= 1<<22,
+	IEEE80211_CHAN_NO_UHB_AFC_CLIENT= 1<<23,
 };
 
 #define IEEE80211_CHAN_NO_HT40 \
diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h
index 466da830e65f..1ccdcae24372 100644
--- a/include/uapi/linux/nl80211.h
+++ b/include/uapi/linux/nl80211.h
@@ -4260,6 +4260,10 @@ enum nl80211_wmm_rule {
  *	allowed for peer-to-peer or adhoc communication under the control
  *	of a DFS master which operates on the same channel (FCC-594280 D01
  *	Section B.3). Should be used together with %NL80211_RRF_DFS only.
+ * @NL80211_FREQUENCY_ATTR_NO_UHB_VLP_CLIENT: Client connection to VLP AP
+ *	not allowed using this channel
+ * @NL80211_FREQUENCY_ATTR_NO_UHB_AFC_CLIENT: Client connection to AFC AP
+ *	not allowed using this channel
  * @NL80211_FREQUENCY_ATTR_MAX: highest frequency attribute number
  *	currently defined
  * @__NL80211_FREQUENCY_ATTR_AFTER_LAST: internal use
@@ -4300,6 +4304,8 @@ enum nl80211_frequency_attr {
 	NL80211_FREQUENCY_ATTR_NO_EHT,
 	NL80211_FREQUENCY_ATTR_PSD,
 	NL80211_FREQUENCY_ATTR_DFS_CONCURRENT,
+	NL80211_FREQUENCY_ATTR_NO_UHB_VLP_CLIENT,
+	NL80211_FREQUENCY_ATTR_NO_UHB_AFC_CLIENT,
 
 	/* keep last */
 	__NL80211_FREQUENCY_ATTR_AFTER_LAST,
@@ -4509,6 +4515,8 @@ enum nl80211_sched_scan_match_attr {
 	peer-to-peer or adhoc communication under the control of a DFS master
 	which operates on the same channel (FCC-594280 D01 Section B.3).
 	Should be used together with %NL80211_RRF_DFS only.
+ * @NL80211_RRF_NO_UHB_VLP_CLIENT: Client connection to VLP AP not allowed
+ * @NL80211_RRF_NO_UHB_AFC_CLIENT: Client connection to AFC AP not allowed
  */
 enum nl80211_reg_rule_flags {
 	NL80211_RRF_NO_OFDM		= 1<<0,
@@ -4531,6 +4539,8 @@ enum nl80211_reg_rule_flags {
 	NL80211_RRF_NO_EHT		= 1<<19,
 	NL80211_RRF_PSD			= 1<<20,
 	NL80211_RRF_DFS_CONCURRENT	= 1<<21,
+	NL80211_RRF_NO_UHB_VLP_CLIENT	= 1<<22,
+	NL80211_RRF_NO_UHB_AFC_CLIENT	= 1<<23,
 };
 
 #define NL80211_RRF_PASSIVE_SCAN	NL80211_RRF_NO_IR
@@ -5086,9 +5096,12 @@ enum nl80211_bss_use_for {
  *	BSS isn't possible
  * @NL80211_BSS_CANNOT_USE_NSTR_NONPRIMARY: NSTR nonprimary links aren't
  *	supported by the device, and this BSS entry represents one.
+ * @NL80211_BSS_CANNOT_USE_UHB_PWR_MISMATCH: STA is not supporting
+ *	the AP power type (SP, VLP, AP) that the AP uses.
  */
 enum nl80211_bss_cannot_use_reasons {
 	NL80211_BSS_CANNOT_USE_NSTR_NONPRIMARY	= 1 << 0,
+	NL80211_BSS_CANNOT_USE_UHB_PWR_MISMATCH	= 1 << 1,
 };
 
 /**
diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c
index 534ef3fe0696..60877b532993 100644
--- a/net/wireless/nl80211.c
+++ b/net/wireless/nl80211.c
@@ -1204,6 +1204,12 @@ static int nl80211_msg_put_channel(struct sk_buff *msg, struct wiphy *wiphy,
 		if ((chan->flags & IEEE80211_CHAN_DFS_CONCURRENT) &&
 		    nla_put_flag(msg, NL80211_FREQUENCY_ATTR_DFS_CONCURRENT))
 			goto nla_put_failure;
+		if ((chan->flags & IEEE80211_CHAN_NO_UHB_VLP_CLIENT) &&
+		    nla_put_flag(msg, NL80211_FREQUENCY_ATTR_NO_UHB_VLP_CLIENT))
+			goto nla_put_failure;
+		if ((chan->flags & IEEE80211_CHAN_NO_UHB_AFC_CLIENT) &&
+		    nla_put_flag(msg, NL80211_FREQUENCY_ATTR_NO_UHB_AFC_CLIENT))
+			goto nla_put_failure;
 	}
 
 	if (nla_put_u32(msg, NL80211_FREQUENCY_ATTR_MAX_TX_POWER,
diff --git a/net/wireless/reg.c b/net/wireless/reg.c
index 44684df64734..2741b626919a 100644
--- a/net/wireless/reg.c
+++ b/net/wireless/reg.c
@@ -1595,6 +1595,10 @@ static u32 map_regdom_flags(u32 rd_flags)
 		channel_flags |= IEEE80211_CHAN_NO_EHT;
 	if (rd_flags & NL80211_RRF_DFS_CONCURRENT)
 		channel_flags |= IEEE80211_CHAN_DFS_CONCURRENT;
+	if (rd_flags & NL80211_RRF_NO_UHB_VLP_CLIENT)
+		channel_flags |= IEEE80211_CHAN_NO_UHB_VLP_CLIENT;
+	if (rd_flags & NL80211_RRF_NO_UHB_AFC_CLIENT)
+		channel_flags |= IEEE80211_CHAN_NO_UHB_AFC_CLIENT;
 	if (rd_flags & NL80211_RRF_PSD)
 		channel_flags |= IEEE80211_CHAN_PSD;
 	return channel_flags;
diff --git a/net/wireless/scan.c b/net/wireless/scan.c
index 3d260c99c348..a601f1c7f835 100644
--- a/net/wireless/scan.c
+++ b/net/wireless/scan.c
@@ -2848,6 +2848,36 @@ cfg80211_inform_bss_data(struct wiphy *wiphy,
 }
 EXPORT_SYMBOL(cfg80211_inform_bss_data);
 
+static bool cfg80211_uhb_power_type_valid(const u8 *ie,
+					  size_t ielen,
+					  const u32 flags)
+{
+	const struct element *tmp;
+	struct ieee80211_he_operation *he_oper;
+
+	tmp = cfg80211_find_ext_elem(WLAN_EID_EXT_HE_OPERATION, ie, ielen);
+	if (tmp && tmp->datalen >= sizeof(*he_oper) + 1) {
+		const struct ieee80211_he_6ghz_oper *he_6ghz_oper;
+
+		he_oper = (void *)&tmp->data[1];
+		he_6ghz_oper = ieee80211_he_6ghz_oper(he_oper);
+
+		if (!he_6ghz_oper)
+			return false;
+
+		switch (u8_get_bits(he_6ghz_oper->control,
+				    IEEE80211_HE_6GHZ_OPER_CTRL_REG_INFO)) {
+		case IEEE80211_6GHZ_CTRL_REG_LPI_AP:
+			return true;
+		case IEEE80211_6GHZ_CTRL_REG_SP_AP:
+			return !(flags & IEEE80211_CHAN_NO_UHB_AFC_CLIENT);
+		case IEEE80211_6GHZ_CTRL_REG_VLP_AP:
+			return !(flags & IEEE80211_CHAN_NO_UHB_VLP_CLIENT);
+		}
+	}
+	return false;
+}
+
 /* cfg80211_inform_bss_width_frame helper */
 static struct cfg80211_bss *
 cfg80211_inform_single_bss_frame_data(struct wiphy *wiphy,
@@ -2906,6 +2936,14 @@ cfg80211_inform_single_bss_frame_data(struct wiphy *wiphy,
 	if (!channel)
 		return NULL;
 
+	if (channel->band == NL80211_BAND_6GHZ &&
+	    !cfg80211_uhb_power_type_valid(variable, ielen, channel->flags)) {
+		data->restrict_use = 1;
+		data->use_for = 0;
+		data->cannot_use_reasons =
+			NL80211_BSS_CANNOT_USE_UHB_PWR_MISMATCH;
+	}
+
 	if (ext) {
 		const struct ieee80211_s1g_bcn_compat_ie *compat;
 		const struct element *elem;
-- 
cgit v1.2.3


From d68019471995ba47e56a9da355df13a1cdb5bf7e Mon Sep 17 00:00:00 2001
From: Sven Schnelle <svens@linux.ibm.com>
Date: Mon, 18 Dec 2023 08:45:18 +0100
Subject: entry: Move exit to usermode functions to header file

To allow inlining, move exit_to_user_mode() to
entry-common.h.

Signed-off-by: Sven Schnelle <svens@linux.ibm.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Link: https://lore.kernel.org/r/20231218074520.1998026-2-svens@linux.ibm.com
---
 include/linux/entry-common.h | 53 +++++++++++++++++++++++++++++++++++++++++++-
 kernel/entry/common.c        | 52 ++++++++-----------------------------------
 2 files changed, 61 insertions(+), 44 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/entry-common.h b/include/linux/entry-common.h
index d95ab85f96ba..6a6e98f3805f 100644
--- a/include/linux/entry-common.h
+++ b/include/linux/entry-common.h
@@ -7,6 +7,10 @@
 #include <linux/syscalls.h>
 #include <linux/seccomp.h>
 #include <linux/sched.h>
+#include <linux/context_tracking.h>
+#include <linux/livepatch.h>
+#include <linux/resume_user_mode.h>
+#include <linux/tick.h>
 
 #include <asm/entry-common.h>
 
@@ -258,6 +262,43 @@ static __always_inline void arch_exit_to_user_mode(void) { }
  */
 void arch_do_signal_or_restart(struct pt_regs *regs);
 
+/**
+ * exit_to_user_mode_loop - do any pending work before leaving to user space
+ */
+unsigned long exit_to_user_mode_loop(struct pt_regs *regs,
+				     unsigned long ti_work);
+
+/**
+ * exit_to_user_mode_prepare - call exit_to_user_mode_loop() if required
+ * @regs:	Pointer to pt_regs on entry stack
+ *
+ * 1) check that interrupts are disabled
+ * 2) call tick_nohz_user_enter_prepare()
+ * 3) call exit_to_user_mode_loop() if any flags from
+ *    EXIT_TO_USER_MODE_WORK are set
+ * 4) check that interrupts are still disabled
+ */
+static __always_inline void exit_to_user_mode_prepare(struct pt_regs *regs)
+{
+	unsigned long ti_work;
+
+	lockdep_assert_irqs_disabled();
+
+	/* Flush pending rcuog wakeup before the last need_resched() check */
+	tick_nohz_user_enter_prepare();
+
+	ti_work = read_thread_flags();
+	if (unlikely(ti_work & EXIT_TO_USER_MODE_WORK))
+		ti_work = exit_to_user_mode_loop(regs, ti_work);
+
+	arch_exit_to_user_mode_prepare(regs, ti_work);
+
+	/* Ensure that kernel state is sane for a return to userspace */
+	kmap_assert_nomap();
+	lockdep_assert_irqs_disabled();
+	lockdep_sys_exit();
+}
+
 /**
  * exit_to_user_mode - Fixup state when exiting to user mode
  *
@@ -276,7 +317,17 @@ void arch_do_signal_or_restart(struct pt_regs *regs);
  * non-instrumentable.
  * The caller has to invoke syscall_exit_to_user_mode_work() before this.
  */
-void exit_to_user_mode(void);
+static __always_inline void exit_to_user_mode(void)
+{
+	instrumentation_begin();
+	trace_hardirqs_on_prepare();
+	lockdep_hardirqs_on_prepare();
+	instrumentation_end();
+
+	user_enter_irqoff();
+	arch_exit_to_user_mode();
+	lockdep_hardirqs_on(CALLER_ADDR0);
+}
 
 /**
  * syscall_exit_to_user_mode_work - Handle work before returning to user mode
diff --git a/kernel/entry/common.c b/kernel/entry/common.c
index d7ee4bc3f2ba..7f8f8c16140a 100644
--- a/kernel/entry/common.c
+++ b/kernel/entry/common.c
@@ -123,29 +123,16 @@ noinstr void syscall_enter_from_user_mode_prepare(struct pt_regs *regs)
 	instrumentation_end();
 }
 
-/* See comment for exit_to_user_mode() in entry-common.h */
-static __always_inline void __exit_to_user_mode(void)
-{
-	instrumentation_begin();
-	trace_hardirqs_on_prepare();
-	lockdep_hardirqs_on_prepare();
-	instrumentation_end();
-
-	user_enter_irqoff();
-	arch_exit_to_user_mode();
-	lockdep_hardirqs_on(CALLER_ADDR0);
-}
-
-void noinstr exit_to_user_mode(void)
-{
-	__exit_to_user_mode();
-}
-
 /* Workaround to allow gradual conversion of architecture code */
 void __weak arch_do_signal_or_restart(struct pt_regs *regs) { }
 
-static unsigned long exit_to_user_mode_loop(struct pt_regs *regs,
-					    unsigned long ti_work)
+/**
+ * exit_to_user_mode_loop - do any pending work before leaving to user space
+ * @regs:	Pointer to pt_regs on entry stack
+ * @ti_work:	TIF work flags as read by the caller
+ */
+__always_inline unsigned long exit_to_user_mode_loop(struct pt_regs *regs,
+						     unsigned long ti_work)
 {
 	/*
 	 * Before returning to user space ensure that all pending work
@@ -190,27 +177,6 @@ static unsigned long exit_to_user_mode_loop(struct pt_regs *regs,
 	return ti_work;
 }
 
-static void exit_to_user_mode_prepare(struct pt_regs *regs)
-{
-	unsigned long ti_work;
-
-	lockdep_assert_irqs_disabled();
-
-	/* Flush pending rcuog wakeup before the last need_resched() check */
-	tick_nohz_user_enter_prepare();
-
-	ti_work = read_thread_flags();
-	if (unlikely(ti_work & EXIT_TO_USER_MODE_WORK))
-		ti_work = exit_to_user_mode_loop(regs, ti_work);
-
-	arch_exit_to_user_mode_prepare(regs, ti_work);
-
-	/* Ensure that kernel state is sane for a return to userspace */
-	kmap_assert_nomap();
-	lockdep_assert_irqs_disabled();
-	lockdep_sys_exit();
-}
-
 /*
  * If SYSCALL_EMU is set, then the only reason to report is when
  * SINGLESTEP is set (i.e. PTRACE_SYSEMU_SINGLESTEP).  This syscall
@@ -295,7 +261,7 @@ __visible noinstr void syscall_exit_to_user_mode(struct pt_regs *regs)
 	instrumentation_begin();
 	__syscall_exit_to_user_mode_work(regs);
 	instrumentation_end();
-	__exit_to_user_mode();
+	exit_to_user_mode();
 }
 
 noinstr void irqentry_enter_from_user_mode(struct pt_regs *regs)
@@ -308,7 +274,7 @@ noinstr void irqentry_exit_to_user_mode(struct pt_regs *regs)
 	instrumentation_begin();
 	exit_to_user_mode_prepare(regs);
 	instrumentation_end();
-	__exit_to_user_mode();
+	exit_to_user_mode();
 }
 
 noinstr irqentry_state_t irqentry_enter(struct pt_regs *regs)
-- 
cgit v1.2.3


From caf4062e35b21cd7d3d35ac2f58f9765d02d32a0 Mon Sep 17 00:00:00 2001
From: Sven Schnelle <svens@linux.ibm.com>
Date: Mon, 18 Dec 2023 08:45:19 +0100
Subject: entry: Move enter_from_user_mode() to header file

To allow inlining of enter_from_user_mode(), move it to
entry-common.h.

Signed-off-by: Sven Schnelle <svens@linux.ibm.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Link: https://lore.kernel.org/r/20231218074520.1998026-3-svens@linux.ibm.com
---
 include/linux/entry-common.h | 15 ++++++++++++++-
 kernel/entry/common.c        | 26 +++-----------------------
 2 files changed, 17 insertions(+), 24 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/entry-common.h b/include/linux/entry-common.h
index 6a6e98f3805f..c4205390448e 100644
--- a/include/linux/entry-common.h
+++ b/include/linux/entry-common.h
@@ -11,6 +11,7 @@
 #include <linux/livepatch.h>
 #include <linux/resume_user_mode.h>
 #include <linux/tick.h>
+#include <linux/kmsan.h>
 
 #include <asm/entry-common.h>
 
@@ -102,7 +103,19 @@ static __always_inline void arch_enter_from_user_mode(struct pt_regs *regs) {}
  * done between establishing state and enabling interrupts. The caller must
  * enable interrupts before invoking syscall_enter_from_user_mode_work().
  */
-void enter_from_user_mode(struct pt_regs *regs);
+static __always_inline void enter_from_user_mode(struct pt_regs *regs)
+{
+	arch_enter_from_user_mode(regs);
+	lockdep_hardirqs_off(CALLER_ADDR0);
+
+	CT_WARN_ON(__ct_state() != CONTEXT_USER);
+	user_exit_irqoff();
+
+	instrumentation_begin();
+	kmsan_unpoison_entry_regs(regs);
+	trace_hardirqs_off_finish();
+	instrumentation_end();
+}
 
 /**
  * syscall_enter_from_user_mode_prepare - Establish state and enable interrupts
diff --git a/kernel/entry/common.c b/kernel/entry/common.c
index 7f8f8c16140a..0616f239da4b 100644
--- a/kernel/entry/common.c
+++ b/kernel/entry/common.c
@@ -15,26 +15,6 @@
 #define CREATE_TRACE_POINTS
 #include <trace/events/syscalls.h>
 
-/* See comment for enter_from_user_mode() in entry-common.h */
-static __always_inline void __enter_from_user_mode(struct pt_regs *regs)
-{
-	arch_enter_from_user_mode(regs);
-	lockdep_hardirqs_off(CALLER_ADDR0);
-
-	CT_WARN_ON(__ct_state() != CONTEXT_USER);
-	user_exit_irqoff();
-
-	instrumentation_begin();
-	kmsan_unpoison_entry_regs(regs);
-	trace_hardirqs_off_finish();
-	instrumentation_end();
-}
-
-void noinstr enter_from_user_mode(struct pt_regs *regs)
-{
-	__enter_from_user_mode(regs);
-}
-
 static inline void syscall_enter_audit(struct pt_regs *regs, long syscall)
 {
 	if (unlikely(audit_context())) {
@@ -105,7 +85,7 @@ noinstr long syscall_enter_from_user_mode(struct pt_regs *regs, long syscall)
 {
 	long ret;
 
-	__enter_from_user_mode(regs);
+	enter_from_user_mode(regs);
 
 	instrumentation_begin();
 	local_irq_enable();
@@ -117,7 +97,7 @@ noinstr long syscall_enter_from_user_mode(struct pt_regs *regs, long syscall)
 
 noinstr void syscall_enter_from_user_mode_prepare(struct pt_regs *regs)
 {
-	__enter_from_user_mode(regs);
+	enter_from_user_mode(regs);
 	instrumentation_begin();
 	local_irq_enable();
 	instrumentation_end();
@@ -266,7 +246,7 @@ __visible noinstr void syscall_exit_to_user_mode(struct pt_regs *regs)
 
 noinstr void irqentry_enter_from_user_mode(struct pt_regs *regs)
 {
-	__enter_from_user_mode(regs);
+	enter_from_user_mode(regs);
 }
 
 noinstr void irqentry_exit_to_user_mode(struct pt_regs *regs)
-- 
cgit v1.2.3


From 221a164035fd8b554a44bd7c4bf8e7715a497561 Mon Sep 17 00:00:00 2001
From: Sven Schnelle <svens@linux.ibm.com>
Date: Mon, 18 Dec 2023 08:45:20 +0100
Subject: entry: Move syscall_enter_from_user_mode() to header file

To allow inlining of syscall_enter_from_user_mode(), move it
to entry-common.h.

Signed-off-by: Sven Schnelle <svens@linux.ibm.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Link: https://lore.kernel.org/r/20231218074520.1998026-4-svens@linux.ibm.com
---
 include/linux/entry-common.h | 27 +++++++++++++++++++++++++--
 kernel/entry/common.c        | 32 +-------------------------------
 2 files changed, 26 insertions(+), 33 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/entry-common.h b/include/linux/entry-common.h
index c4205390448e..b0fb775a600d 100644
--- a/include/linux/entry-common.h
+++ b/include/linux/entry-common.h
@@ -134,6 +134,9 @@ static __always_inline void enter_from_user_mode(struct pt_regs *regs)
  */
 void syscall_enter_from_user_mode_prepare(struct pt_regs *regs);
 
+long syscall_trace_enter(struct pt_regs *regs, long syscall,
+			 unsigned long work);
+
 /**
  * syscall_enter_from_user_mode_work - Check and handle work before invoking
  *				       a syscall
@@ -157,7 +160,15 @@ void syscall_enter_from_user_mode_prepare(struct pt_regs *regs);
  *     ptrace_report_syscall_entry(), __secure_computing(), trace_sys_enter()
  *  2) Invocation of audit_syscall_entry()
  */
-long syscall_enter_from_user_mode_work(struct pt_regs *regs, long syscall);
+static __always_inline long syscall_enter_from_user_mode_work(struct pt_regs *regs, long syscall)
+{
+	unsigned long work = READ_ONCE(current_thread_info()->syscall_work);
+
+	if (work & SYSCALL_WORK_ENTER)
+		syscall = syscall_trace_enter(regs, syscall, work);
+
+	return syscall;
+}
 
 /**
  * syscall_enter_from_user_mode - Establish state and check and handle work
@@ -176,7 +187,19 @@ long syscall_enter_from_user_mode_work(struct pt_regs *regs, long syscall);
  * Returns: The original or a modified syscall number. See
  * syscall_enter_from_user_mode_work() for further explanation.
  */
-long syscall_enter_from_user_mode(struct pt_regs *regs, long syscall);
+static __always_inline long syscall_enter_from_user_mode(struct pt_regs *regs, long syscall)
+{
+	long ret;
+
+	enter_from_user_mode(regs);
+
+	instrumentation_begin();
+	local_irq_enable();
+	ret = syscall_enter_from_user_mode_work(regs, syscall);
+	instrumentation_end();
+
+	return ret;
+}
 
 /**
  * local_irq_enable_exit_to_user - Exit to user variant of local_irq_enable()
diff --git a/kernel/entry/common.c b/kernel/entry/common.c
index 0616f239da4b..88cb3c88aaa5 100644
--- a/kernel/entry/common.c
+++ b/kernel/entry/common.c
@@ -25,7 +25,7 @@ static inline void syscall_enter_audit(struct pt_regs *regs, long syscall)
 	}
 }
 
-static long syscall_trace_enter(struct pt_regs *regs, long syscall,
+long syscall_trace_enter(struct pt_regs *regs, long syscall,
 				unsigned long work)
 {
 	long ret = 0;
@@ -65,36 +65,6 @@ static long syscall_trace_enter(struct pt_regs *regs, long syscall,
 	return ret ? : syscall;
 }
 
-static __always_inline long
-__syscall_enter_from_user_work(struct pt_regs *regs, long syscall)
-{
-	unsigned long work = READ_ONCE(current_thread_info()->syscall_work);
-
-	if (work & SYSCALL_WORK_ENTER)
-		syscall = syscall_trace_enter(regs, syscall, work);
-
-	return syscall;
-}
-
-long syscall_enter_from_user_mode_work(struct pt_regs *regs, long syscall)
-{
-	return __syscall_enter_from_user_work(regs, syscall);
-}
-
-noinstr long syscall_enter_from_user_mode(struct pt_regs *regs, long syscall)
-{
-	long ret;
-
-	enter_from_user_mode(regs);
-
-	instrumentation_begin();
-	local_irq_enable();
-	ret = __syscall_enter_from_user_work(regs, syscall);
-	instrumentation_end();
-
-	return ret;
-}
-
 noinstr void syscall_enter_from_user_mode_prepare(struct pt_regs *regs)
 {
 	enter_from_user_mode(regs);
-- 
cgit v1.2.3


From bbcd80f53a5e8c27c2511f539fec8c373f500cf4 Mon Sep 17 00:00:00 2001
From: Miquel Raynal <miquel.raynal@bootlin.com>
Date: Fri, 15 Dec 2023 13:32:05 +0100
Subject: mtd: rawnand: Prevent crossing LUN boundaries during sequential reads
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The ONFI specification states that devices do not need to support
sequential reads across LUN boundaries. In order to prevent such event
from happening and possibly failing, let's introduce the concept of
"pause" in the sequential read to handle these cases. The first/last
pages remain the same but any time we cross a LUN boundary we will end
and restart (if relevant) the sequential read operation.

Cc: stable@vger.kernel.org
Fixes: 003fe4b9545b ("mtd: rawnand: Support for sequential cache reads")
Signed-off-by: Miquel Raynal <miquel.raynal@bootlin.com>
Tested-by: Martin Hundebøll <martin@geanix.com>
Link: https://lore.kernel.org/linux-mtd/20231215123208.516590-2-miquel.raynal@bootlin.com
---
 drivers/mtd/nand/raw/nand_base.c | 43 ++++++++++++++++++++++++++++++++++------
 include/linux/mtd/rawnand.h      |  2 ++
 2 files changed, 39 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/mtd/nand/raw/nand_base.c b/drivers/mtd/nand/raw/nand_base.c
index 9e24bedffd89..04e80ace4182 100644
--- a/drivers/mtd/nand/raw/nand_base.c
+++ b/drivers/mtd/nand/raw/nand_base.c
@@ -1207,6 +1207,23 @@ static int nand_lp_exec_read_page_op(struct nand_chip *chip, unsigned int page,
 	return nand_exec_op(chip, &op);
 }
 
+static void rawnand_cap_cont_reads(struct nand_chip *chip)
+{
+	struct nand_memory_organization *memorg;
+	unsigned int pages_per_lun, first_lun, last_lun;
+
+	memorg = nanddev_get_memorg(&chip->base);
+	pages_per_lun = memorg->pages_per_eraseblock * memorg->eraseblocks_per_lun;
+	first_lun = chip->cont_read.first_page / pages_per_lun;
+	last_lun = chip->cont_read.last_page / pages_per_lun;
+
+	/* Prevent sequential cache reads across LUN boundaries */
+	if (first_lun != last_lun)
+		chip->cont_read.pause_page = first_lun * pages_per_lun + pages_per_lun - 1;
+	else
+		chip->cont_read.pause_page = chip->cont_read.last_page;
+}
+
 static int nand_lp_exec_cont_read_page_op(struct nand_chip *chip, unsigned int page,
 					  unsigned int offset_in_page, void *buf,
 					  unsigned int len, bool check_only)
@@ -1225,7 +1242,7 @@ static int nand_lp_exec_cont_read_page_op(struct nand_chip *chip, unsigned int p
 		NAND_OP_DATA_IN(len, buf, 0),
 	};
 	struct nand_op_instr cont_instrs[] = {
-		NAND_OP_CMD(page == chip->cont_read.last_page ?
+		NAND_OP_CMD(page == chip->cont_read.pause_page ?
 			    NAND_CMD_READCACHEEND : NAND_CMD_READCACHESEQ,
 			    NAND_COMMON_TIMING_NS(conf, tWB_max)),
 		NAND_OP_WAIT_RDY(NAND_COMMON_TIMING_MS(conf, tR_max),
@@ -1262,16 +1279,29 @@ static int nand_lp_exec_cont_read_page_op(struct nand_chip *chip, unsigned int p
 	}
 
 	if (page == chip->cont_read.first_page)
-		return nand_exec_op(chip, &start_op);
+		ret = nand_exec_op(chip, &start_op);
 	else
-		return nand_exec_op(chip, &cont_op);
+		ret = nand_exec_op(chip, &cont_op);
+	if (ret)
+		return ret;
+
+	if (!chip->cont_read.ongoing)
+		return 0;
+
+	if (page == chip->cont_read.pause_page &&
+	    page != chip->cont_read.last_page) {
+		chip->cont_read.first_page = chip->cont_read.pause_page + 1;
+		rawnand_cap_cont_reads(chip);
+	} else if (page == chip->cont_read.last_page) {
+		chip->cont_read.ongoing = false;
+	}
+
+	return 0;
 }
 
 static bool rawnand_cont_read_ongoing(struct nand_chip *chip, unsigned int page)
 {
-	return chip->cont_read.ongoing &&
-		page >= chip->cont_read.first_page &&
-		page <= chip->cont_read.last_page;
+	return chip->cont_read.ongoing && page >= chip->cont_read.first_page;
 }
 
 /**
@@ -3445,6 +3475,7 @@ static void rawnand_enable_cont_reads(struct nand_chip *chip, unsigned int page,
 	if (col)
 		chip->cont_read.first_page++;
 	chip->cont_read.last_page = page + ((readlen >> chip->page_shift) & chip->pagemask);
+	rawnand_cap_cont_reads(chip);
 }
 
 /**
diff --git a/include/linux/mtd/rawnand.h b/include/linux/mtd/rawnand.h
index c29ace15a053..9d0fc5109af6 100644
--- a/include/linux/mtd/rawnand.h
+++ b/include/linux/mtd/rawnand.h
@@ -1265,6 +1265,7 @@ struct nand_secure_region {
  * @cont_read: Sequential page read internals
  * @cont_read.ongoing: Whether a continuous read is ongoing or not
  * @cont_read.first_page: Start of the continuous read operation
+ * @cont_read.pause_page: End of the current sequential cache read operation
  * @cont_read.last_page: End of the continuous read operation
  * @controller: The hardware controller	structure which is shared among multiple
  *              independent devices
@@ -1321,6 +1322,7 @@ struct nand_chip {
 	struct {
 		bool ongoing;
 		unsigned int first_page;
+		unsigned int pause_page;
 		unsigned int last_page;
 	} cont_read;
 
-- 
cgit v1.2.3


From 8e6e83d77227d9ba39e0c7b50693f1b4f8728006 Mon Sep 17 00:00:00 2001
From: Kundan Kumar <kundan.kumar@samsung.com>
Date: Fri, 22 Dec 2023 15:47:07 +0530
Subject: block: skip start/end time stamping for passthrough IO

commit 41fa722239b4 ("blk-mq: do not include passthrough requests in I/O
accounting")' disables I/O accounting for passthrough requests. Since tools
like 'iostat' do not show anything useful for passthrough I/O, it's
wasteful to do start/end time-stamping. So do away with that.

Avoiding the time-stamping improves the I/O performance by ~7%

Signed-off-by: Kundan Kumar <kundan.kumar@samsung.com>
Signed-off-by: Kanchan Joshi <joshi.k@samsung.com>
Link: https://lore.kernel.org/r/20231222101707.6921-1-kundan.kumar@samsung.com
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/blk-mq.h | 6 ++++++
 1 file changed, 6 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index 1ab3081c82ed..a676e116085f 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -830,6 +830,12 @@ void blk_mq_end_request_batch(struct io_comp_batch *ib);
  */
 static inline bool blk_mq_need_time_stamp(struct request *rq)
 {
+	/*
+	 * passthrough io doesn't use iostat accounting, cgroup stats
+	 * and io scheduler functionalities.
+	 */
+	if (blk_rq_is_passthrough(rq))
+		return false;
 	return (rq->rq_flags & (RQF_IO_STAT | RQF_STATS | RQF_USE_SCHED));
 }
 
-- 
cgit v1.2.3


From 60e43fe5285e2077ce9904d78cd42a230d03b788 Mon Sep 17 00:00:00 2001
From: Dave Jiang <dave.jiang@intel.com>
Date: Thu, 21 Dec 2023 15:02:31 -0700
Subject: lib/firmware_table: tables: Add CDAT table parsing support

The CDAT table is very similar to ACPI tables when it comes to sub-table
and entry structures. The helper functions can be also used to parse the
CDAT table. Add support to the helper functions to deal with an external
CDAT table, and also handle the endieness since CDAT can be processed by a
BE host. Export a function cdat_table_parse() for CXL driver to parse
a CDAT table.

In order to minimize ACPICA code changes, __force is being utilized to deal
with the case of a big endian (BE) host parsing a CDAT. All CDAT data
structure variables are being force casted to __leX as appropriate.

Cc: Rafael J. Wysocki <rafael@kernel.org>
Cc: Len Brown <lenb@kernel.org>
Reviewed-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
Signed-off-by: Dave Jiang <dave.jiang@intel.com>
Acked-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Link: https://lore.kernel.org/r/170319615131.2212653.10932785667981494238.stgit@djiang5-mobl3
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 drivers/acpi/tables.c    |  5 ++--
 include/linux/fw_table.h | 21 +++++++++++++-
 lib/fw_table.c           | 75 ++++++++++++++++++++++++++++++++++++++++--------
 3 files changed, 86 insertions(+), 15 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/acpi/tables.c b/drivers/acpi/tables.c
index c1516337f668..b07f7d091d13 100644
--- a/drivers/acpi/tables.c
+++ b/drivers/acpi/tables.c
@@ -251,8 +251,9 @@ int __init_or_acpilib acpi_table_parse_entries_array(
 		return -ENODEV;
 	}
 
-	count = acpi_parse_entries_array(id, table_size, table_header,
-			proc, proc_num, max_entries);
+	count = acpi_parse_entries_array(id, table_size,
+					 (union fw_table_header *)table_header,
+					 proc, proc_num, max_entries);
 
 	acpi_put_table(table_header);
 	return count;
diff --git a/include/linux/fw_table.h b/include/linux/fw_table.h
index ca49947f0a77..95421860397a 100644
--- a/include/linux/fw_table.h
+++ b/include/linux/fw_table.h
@@ -25,16 +25,35 @@ struct acpi_subtable_proc {
 	int count;
 };
 
+union fw_table_header {
+	struct acpi_table_header acpi;
+	struct acpi_table_cdat cdat;
+};
+
 union acpi_subtable_headers {
 	struct acpi_subtable_header common;
 	struct acpi_hmat_structure hmat;
 	struct acpi_prmt_module_header prmt;
 	struct acpi_cedt_header cedt;
+	struct acpi_cdat_header cdat;
 };
 
 int acpi_parse_entries_array(char *id, unsigned long table_size,
-			     struct acpi_table_header *table_header,
+			     union fw_table_header *table_header,
 			     struct acpi_subtable_proc *proc,
 			     int proc_num, unsigned int max_entries);
 
+int cdat_table_parse(enum acpi_cdat_type type,
+		     acpi_tbl_entry_handler_arg handler_arg, void *arg,
+		     struct acpi_table_cdat *table_header);
+
+/* CXL is the only non-ACPI consumer of the FIRMWARE_TABLE library */
+#if IS_ENABLED(CONFIG_ACPI) && !IS_ENABLED(CONFIG_CXL_BUS)
+#define EXPORT_SYMBOL_FWTBL_LIB(x) EXPORT_SYMBOL_ACPI_LIB(x)
+#define __init_or_fwtbl_lib __init_or_acpilib
+#else
+#define EXPORT_SYMBOL_FWTBL_LIB(x) EXPORT_SYMBOL_NS_GPL(x, CXL)
+#define __init_or_fwtbl_lib
+#endif
+
 #endif
diff --git a/lib/fw_table.c b/lib/fw_table.c
index 294df54e33b6..1e5e0b2f7012 100644
--- a/lib/fw_table.c
+++ b/lib/fw_table.c
@@ -12,12 +12,14 @@
 #include <linux/kernel.h>
 #include <linux/string.h>
 #include <linux/types.h>
+#include <linux/fw_table.h>
 
 enum acpi_subtable_type {
 	ACPI_SUBTABLE_COMMON,
 	ACPI_SUBTABLE_HMAT,
 	ACPI_SUBTABLE_PRMT,
 	ACPI_SUBTABLE_CEDT,
+	CDAT_SUBTABLE,
 };
 
 struct acpi_subtable_entry {
@@ -25,7 +27,7 @@ struct acpi_subtable_entry {
 	enum acpi_subtable_type type;
 };
 
-static unsigned long __init_or_acpilib
+static unsigned long __init_or_fwtbl_lib
 acpi_get_entry_type(struct acpi_subtable_entry *entry)
 {
 	switch (entry->type) {
@@ -37,11 +39,13 @@ acpi_get_entry_type(struct acpi_subtable_entry *entry)
 		return 0;
 	case ACPI_SUBTABLE_CEDT:
 		return entry->hdr->cedt.type;
+	case CDAT_SUBTABLE:
+		return entry->hdr->cdat.type;
 	}
 	return 0;
 }
 
-static unsigned long __init_or_acpilib
+static unsigned long __init_or_fwtbl_lib
 acpi_get_entry_length(struct acpi_subtable_entry *entry)
 {
 	switch (entry->type) {
@@ -53,11 +57,16 @@ acpi_get_entry_length(struct acpi_subtable_entry *entry)
 		return entry->hdr->prmt.length;
 	case ACPI_SUBTABLE_CEDT:
 		return entry->hdr->cedt.length;
+	case CDAT_SUBTABLE: {
+		__le16 length = (__force __le16)entry->hdr->cdat.length;
+
+		return le16_to_cpu(length);
+	}
 	}
 	return 0;
 }
 
-static unsigned long __init_or_acpilib
+static unsigned long __init_or_fwtbl_lib
 acpi_get_subtable_header_length(struct acpi_subtable_entry *entry)
 {
 	switch (entry->type) {
@@ -69,11 +78,13 @@ acpi_get_subtable_header_length(struct acpi_subtable_entry *entry)
 		return sizeof(entry->hdr->prmt);
 	case ACPI_SUBTABLE_CEDT:
 		return sizeof(entry->hdr->cedt);
+	case CDAT_SUBTABLE:
+		return sizeof(entry->hdr->cdat);
 	}
 	return 0;
 }
 
-static enum acpi_subtable_type __init_or_acpilib
+static enum acpi_subtable_type __init_or_fwtbl_lib
 acpi_get_subtable_type(char *id)
 {
 	if (strncmp(id, ACPI_SIG_HMAT, 4) == 0)
@@ -82,17 +93,32 @@ acpi_get_subtable_type(char *id)
 		return ACPI_SUBTABLE_PRMT;
 	if (strncmp(id, ACPI_SIG_CEDT, 4) == 0)
 		return ACPI_SUBTABLE_CEDT;
+	if (strncmp(id, ACPI_SIG_CDAT, 4) == 0)
+		return CDAT_SUBTABLE;
 	return ACPI_SUBTABLE_COMMON;
 }
 
-static __init_or_acpilib bool has_handler(struct acpi_subtable_proc *proc)
+static unsigned long __init_or_fwtbl_lib
+acpi_table_get_length(enum acpi_subtable_type type,
+		      union fw_table_header *header)
+{
+	if (type == CDAT_SUBTABLE) {
+		__le32 length = (__force __le32)header->cdat.length;
+
+		return le32_to_cpu(length);
+	}
+
+	return header->acpi.length;
+}
+
+static __init_or_fwtbl_lib bool has_handler(struct acpi_subtable_proc *proc)
 {
 	return proc->handler || proc->handler_arg;
 }
 
-static __init_or_acpilib int call_handler(struct acpi_subtable_proc *proc,
-					  union acpi_subtable_headers *hdr,
-					  unsigned long end)
+static __init_or_fwtbl_lib int call_handler(struct acpi_subtable_proc *proc,
+					    union acpi_subtable_headers *hdr,
+					    unsigned long end)
 {
 	if (proc->handler)
 		return proc->handler(hdr, end);
@@ -124,23 +150,26 @@ static __init_or_acpilib int call_handler(struct acpi_subtable_proc *proc,
  * On success returns sum of all matching entries for all proc handlers.
  * Otherwise, -ENODEV or -EINVAL is returned.
  */
-int __init_or_acpilib
+int __init_or_fwtbl_lib
 acpi_parse_entries_array(char *id, unsigned long table_size,
-			 struct acpi_table_header *table_header,
+			 union fw_table_header *table_header,
 			 struct acpi_subtable_proc *proc,
 			 int proc_num, unsigned int max_entries)
 {
 	unsigned long table_end, subtable_len, entry_len;
 	struct acpi_subtable_entry entry;
+	enum acpi_subtable_type type;
 	int count = 0;
 	int errs = 0;
 	int i;
 
-	table_end = (unsigned long)table_header + table_header->length;
+	type = acpi_get_subtable_type(id);
+	table_end = (unsigned long)table_header +
+		    acpi_table_get_length(type, table_header);
 
 	/* Parse all entries looking for a match. */
 
-	entry.type = acpi_get_subtable_type(id);
+	entry.type = type;
 	entry.hdr = (union acpi_subtable_headers *)
 	    ((unsigned long)table_header + table_size);
 	subtable_len = acpi_get_subtable_header_length(&entry);
@@ -186,3 +215,25 @@ acpi_parse_entries_array(char *id, unsigned long table_size,
 
 	return errs ? -EINVAL : count;
 }
+
+int __init_or_fwtbl_lib
+cdat_table_parse(enum acpi_cdat_type type,
+		 acpi_tbl_entry_handler_arg handler_arg,
+		 void *arg,
+		 struct acpi_table_cdat *table_header)
+{
+	struct acpi_subtable_proc proc = {
+		.id		= type,
+		.handler_arg	= handler_arg,
+		.arg		= arg,
+	};
+
+	if (!table_header)
+		return -EINVAL;
+
+	return acpi_parse_entries_array(ACPI_SIG_CDAT,
+					sizeof(struct acpi_table_cdat),
+					(union fw_table_header *)table_header,
+					&proc, 1, 0);
+}
+EXPORT_SYMBOL_FWTBL_LIB(cdat_table_parse);
-- 
cgit v1.2.3


From 6a954e94d038f41d79c4e04348c95774d1c9337d Mon Sep 17 00:00:00 2001
From: Dave Jiang <dave.jiang@intel.com>
Date: Thu, 21 Dec 2023 15:02:37 -0700
Subject: base/node / acpi: Change 'node_hmem_attrs' to 'access_coordinates'

Dan Williams suggested changing the struct 'node_hmem_attrs' to
'access_coordinates' [1]. The struct is a container of r/w-latency and
r/w-bandwidth numbers. Moving forward, this container will also be used by
CXL to store the performance characteristics of each link hop in
the PCIE/CXL topology. So, where node_hmem_attrs is just the access
parameters of a memory-node, access_coordinates applies more broadly
to hardware topology characteristics. The observation is that seemed like
an exercise in having the application identify "where" it falls on a
spectrum of bandwidth and latency needs. For the tuple of
read/write-latency and read/write-bandwidth, "coordinates" is not a perfect
fit. Sometimes it is just conveying values in isolation and not a
"location" relative to other performance points, but in the end this data
is used to identify the performance operation point of a given memory-node.
[2]

Link: http://lore.kernel.org/r/64471313421f7_1b66294d5@dwillia2-xfh.jf.intel.com.notmuch/
Link: https://lore.kernel.org/linux-cxl/645e6215ee0de_1e6f2945e@dwillia2-xfh.jf.intel.com.notmuch/
Suggested-by: Dan Williams <dan.j.williams@intel.com>
Reviewed-by: Dan Williams <dan.j.williams@intel.com>
Reviewed-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
Signed-off-by: Dave Jiang <dave.jiang@intel.com>
Acked-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Link: https://lore.kernel.org/r/170319615734.2212653.15319394025985499185.stgit@djiang5-mobl3
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 drivers/acpi/numa/hmat.c     | 28 ++++++++++++++--------------
 drivers/base/node.c          | 12 ++++++------
 include/linux/memory-tiers.h | 10 +++++-----
 include/linux/node.h         |  8 ++++----
 mm/memory-tiers.c            | 12 ++++++------
 5 files changed, 35 insertions(+), 35 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/acpi/numa/hmat.c b/drivers/acpi/numa/hmat.c
index 9ef5f1bdcfdb..83bc2b69401b 100644
--- a/drivers/acpi/numa/hmat.c
+++ b/drivers/acpi/numa/hmat.c
@@ -63,7 +63,7 @@ struct memory_target {
 	unsigned int memory_pxm;
 	unsigned int processor_pxm;
 	struct resource memregions;
-	struct node_hmem_attrs hmem_attrs[2];
+	struct access_coordinate coord[2];
 	struct list_head caches;
 	struct node_cache_attrs cache_attrs;
 	bool registered;
@@ -228,24 +228,24 @@ static void hmat_update_target_access(struct memory_target *target,
 {
 	switch (type) {
 	case ACPI_HMAT_ACCESS_LATENCY:
-		target->hmem_attrs[access].read_latency = value;
-		target->hmem_attrs[access].write_latency = value;
+		target->coord[access].read_latency = value;
+		target->coord[access].write_latency = value;
 		break;
 	case ACPI_HMAT_READ_LATENCY:
-		target->hmem_attrs[access].read_latency = value;
+		target->coord[access].read_latency = value;
 		break;
 	case ACPI_HMAT_WRITE_LATENCY:
-		target->hmem_attrs[access].write_latency = value;
+		target->coord[access].write_latency = value;
 		break;
 	case ACPI_HMAT_ACCESS_BANDWIDTH:
-		target->hmem_attrs[access].read_bandwidth = value;
-		target->hmem_attrs[access].write_bandwidth = value;
+		target->coord[access].read_bandwidth = value;
+		target->coord[access].write_bandwidth = value;
 		break;
 	case ACPI_HMAT_READ_BANDWIDTH:
-		target->hmem_attrs[access].read_bandwidth = value;
+		target->coord[access].read_bandwidth = value;
 		break;
 	case ACPI_HMAT_WRITE_BANDWIDTH:
-		target->hmem_attrs[access].write_bandwidth = value;
+		target->coord[access].write_bandwidth = value;
 		break;
 	default:
 		break;
@@ -681,7 +681,7 @@ static void hmat_register_target_cache(struct memory_target *target)
 static void hmat_register_target_perf(struct memory_target *target, int access)
 {
 	unsigned mem_nid = pxm_to_node(target->memory_pxm);
-	node_set_perf_attrs(mem_nid, &target->hmem_attrs[access], access);
+	node_set_perf_attrs(mem_nid, &target->coord[access], access);
 }
 
 static void hmat_register_target_devices(struct memory_target *target)
@@ -765,7 +765,7 @@ static int hmat_set_default_dram_perf(void)
 	int rc;
 	int nid, pxm;
 	struct memory_target *target;
-	struct node_hmem_attrs *attrs;
+	struct access_coordinate *attrs;
 
 	if (!default_dram_type)
 		return -EIO;
@@ -775,7 +775,7 @@ static int hmat_set_default_dram_perf(void)
 		target = find_mem_target(pxm);
 		if (!target)
 			continue;
-		attrs = &target->hmem_attrs[1];
+		attrs = &target->coord[1];
 		rc = mt_set_default_dram_perf(nid, attrs, "ACPI HMAT");
 		if (rc)
 			return rc;
@@ -789,7 +789,7 @@ static int hmat_calculate_adistance(struct notifier_block *self,
 {
 	static DECLARE_BITMAP(p_nodes, MAX_NUMNODES);
 	struct memory_target *target;
-	struct node_hmem_attrs *perf;
+	struct access_coordinate *perf;
 	int *adist = data;
 	int pxm;
 
@@ -802,7 +802,7 @@ static int hmat_calculate_adistance(struct notifier_block *self,
 	hmat_update_target_attrs(target, p_nodes, 1);
 	mutex_unlock(&target_lock);
 
-	perf = &target->hmem_attrs[1];
+	perf = &target->coord[1];
 
 	if (mt_perf_to_adistance(perf, adist))
 		return NOTIFY_OK;
diff --git a/drivers/base/node.c b/drivers/base/node.c
index 493d533f8375..cb2b6cc7f6e6 100644
--- a/drivers/base/node.c
+++ b/drivers/base/node.c
@@ -74,14 +74,14 @@ static BIN_ATTR_RO(cpulist, CPULIST_FILE_MAX_BYTES);
  * @dev:	Device for this memory access class
  * @list_node:	List element in the node's access list
  * @access:	The access class rank
- * @hmem_attrs: Heterogeneous memory performance attributes
+ * @coord:	Heterogeneous memory performance coordinates
  */
 struct node_access_nodes {
 	struct device		dev;
 	struct list_head	list_node;
 	unsigned int		access;
 #ifdef CONFIG_HMEM_REPORTING
-	struct node_hmem_attrs	hmem_attrs;
+	struct access_coordinate	coord;
 #endif
 };
 #define to_access_nodes(dev) container_of(dev, struct node_access_nodes, dev)
@@ -167,7 +167,7 @@ static ssize_t property##_show(struct device *dev,			\
 			   char *buf)					\
 {									\
 	return sysfs_emit(buf, "%u\n",					\
-			  to_access_nodes(dev)->hmem_attrs.property);	\
+			  to_access_nodes(dev)->coord.property);	\
 }									\
 static DEVICE_ATTR_RO(property)
 
@@ -187,10 +187,10 @@ static struct attribute *access_attrs[] = {
 /**
  * node_set_perf_attrs - Set the performance values for given access class
  * @nid: Node identifier to be set
- * @hmem_attrs: Heterogeneous memory performance attributes
+ * @coord: Heterogeneous memory performance coordinates
  * @access: The access class the for the given attributes
  */
-void node_set_perf_attrs(unsigned int nid, struct node_hmem_attrs *hmem_attrs,
+void node_set_perf_attrs(unsigned int nid, struct access_coordinate *coord,
 			 unsigned int access)
 {
 	struct node_access_nodes *c;
@@ -205,7 +205,7 @@ void node_set_perf_attrs(unsigned int nid, struct node_hmem_attrs *hmem_attrs,
 	if (!c)
 		return;
 
-	c->hmem_attrs = *hmem_attrs;
+	c->coord = *coord;
 	for (i = 0; access_attrs[i] != NULL; i++) {
 		if (sysfs_add_file_to_group(&c->dev.kobj, access_attrs[i],
 					    "initiators")) {
diff --git a/include/linux/memory-tiers.h b/include/linux/memory-tiers.h
index 1e39d27bee41..69e781900082 100644
--- a/include/linux/memory-tiers.h
+++ b/include/linux/memory-tiers.h
@@ -33,7 +33,7 @@ struct memory_dev_type {
 	struct kref kref;
 };
 
-struct node_hmem_attrs;
+struct access_coordinate;
 
 #ifdef CONFIG_NUMA
 extern bool numa_demotion_enabled;
@@ -45,9 +45,9 @@ void clear_node_memory_type(int node, struct memory_dev_type *memtype);
 int register_mt_adistance_algorithm(struct notifier_block *nb);
 int unregister_mt_adistance_algorithm(struct notifier_block *nb);
 int mt_calc_adistance(int node, int *adist);
-int mt_set_default_dram_perf(int nid, struct node_hmem_attrs *perf,
+int mt_set_default_dram_perf(int nid, struct access_coordinate *perf,
 			     const char *source);
-int mt_perf_to_adistance(struct node_hmem_attrs *perf, int *adist);
+int mt_perf_to_adistance(struct access_coordinate *perf, int *adist);
 #ifdef CONFIG_MIGRATION
 int next_demotion_node(int node);
 void node_get_allowed_targets(pg_data_t *pgdat, nodemask_t *targets);
@@ -126,13 +126,13 @@ static inline int mt_calc_adistance(int node, int *adist)
 	return NOTIFY_DONE;
 }
 
-static inline int mt_set_default_dram_perf(int nid, struct node_hmem_attrs *perf,
+static inline int mt_set_default_dram_perf(int nid, struct access_coordinate *perf,
 					   const char *source)
 {
 	return -EIO;
 }
 
-static inline int mt_perf_to_adistance(struct node_hmem_attrs *perf, int *adist)
+static inline int mt_perf_to_adistance(struct access_coordinate *perf, int *adist)
 {
 	return -EIO;
 }
diff --git a/include/linux/node.h b/include/linux/node.h
index 427a5975cf40..25b66d705ee2 100644
--- a/include/linux/node.h
+++ b/include/linux/node.h
@@ -20,14 +20,14 @@
 #include <linux/list.h>
 
 /**
- * struct node_hmem_attrs - heterogeneous memory performance attributes
+ * struct access_coordinate - generic performance coordinates container
  *
  * @read_bandwidth:	Read bandwidth in MB/s
  * @write_bandwidth:	Write bandwidth in MB/s
  * @read_latency:	Read latency in nanoseconds
  * @write_latency:	Write latency in nanoseconds
  */
-struct node_hmem_attrs {
+struct access_coordinate {
 	unsigned int read_bandwidth;
 	unsigned int write_bandwidth;
 	unsigned int read_latency;
@@ -65,7 +65,7 @@ struct node_cache_attrs {
 
 #ifdef CONFIG_HMEM_REPORTING
 void node_add_cache(unsigned int nid, struct node_cache_attrs *cache_attrs);
-void node_set_perf_attrs(unsigned int nid, struct node_hmem_attrs *hmem_attrs,
+void node_set_perf_attrs(unsigned int nid, struct access_coordinate *coord,
 			 unsigned access);
 #else
 static inline void node_add_cache(unsigned int nid,
@@ -74,7 +74,7 @@ static inline void node_add_cache(unsigned int nid,
 }
 
 static inline void node_set_perf_attrs(unsigned int nid,
-				       struct node_hmem_attrs *hmem_attrs,
+				       struct access_coordinate *coord,
 				       unsigned access)
 {
 }
diff --git a/mm/memory-tiers.c b/mm/memory-tiers.c
index 8d5291add2bc..5462d9e3c84c 100644
--- a/mm/memory-tiers.c
+++ b/mm/memory-tiers.c
@@ -109,7 +109,7 @@ static struct demotion_nodes *node_demotion __read_mostly;
 static BLOCKING_NOTIFIER_HEAD(mt_adistance_algorithms);
 
 static bool default_dram_perf_error;
-static struct node_hmem_attrs default_dram_perf;
+static struct access_coordinate default_dram_perf;
 static int default_dram_perf_ref_nid = NUMA_NO_NODE;
 static const char *default_dram_perf_ref_source;
 
@@ -601,15 +601,15 @@ void clear_node_memory_type(int node, struct memory_dev_type *memtype)
 }
 EXPORT_SYMBOL_GPL(clear_node_memory_type);
 
-static void dump_hmem_attrs(struct node_hmem_attrs *attrs, const char *prefix)
+static void dump_hmem_attrs(struct access_coordinate *coord, const char *prefix)
 {
 	pr_info(
 "%sread_latency: %u, write_latency: %u, read_bandwidth: %u, write_bandwidth: %u\n",
-		prefix, attrs->read_latency, attrs->write_latency,
-		attrs->read_bandwidth, attrs->write_bandwidth);
+		prefix, coord->read_latency, coord->write_latency,
+		coord->read_bandwidth, coord->write_bandwidth);
 }
 
-int mt_set_default_dram_perf(int nid, struct node_hmem_attrs *perf,
+int mt_set_default_dram_perf(int nid, struct access_coordinate *perf,
 			     const char *source)
 {
 	int rc = 0;
@@ -666,7 +666,7 @@ out:
 	return rc;
 }
 
-int mt_perf_to_adistance(struct node_hmem_attrs *perf, int *adist)
+int mt_perf_to_adistance(struct access_coordinate *perf, int *adist)
 {
 	if (default_dram_perf_error)
 		return -EIO;
-- 
cgit v1.2.3


From ca53543d8e340070fb37fde93f36ed9012c76b90 Mon Sep 17 00:00:00 2001
From: Dave Jiang <dave.jiang@intel.com>
Date: Thu, 21 Dec 2023 15:03:07 -0700
Subject: acpi: numa: Add helper function to retrieve the performance
 attributes

Add helper to retrieve the performance attributes based on the device
handle.  The helper function is exported so the CXL driver can use that
to acquire the performance data between the CPU and the CXL host bridge.

Reviewed-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
Signed-off-by: Dave Jiang <dave.jiang@intel.com>
Acked-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Link: https://lore.kernel.org/r/170319618721.2212653.5552947472849081786.stgit@djiang5-mobl3
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 drivers/acpi/numa/hmat.c | 41 +++++++++++++++++++++++++++++++++++++++++
 include/linux/acpi.h     | 11 +++++++++++
 2 files changed, 52 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/acpi/numa/hmat.c b/drivers/acpi/numa/hmat.c
index 8a1802e078f3..d6b85f0f6082 100644
--- a/drivers/acpi/numa/hmat.c
+++ b/drivers/acpi/numa/hmat.c
@@ -108,6 +108,47 @@ static struct memory_target *find_mem_target(unsigned int mem_pxm)
 	return NULL;
 }
 
+static struct memory_target *acpi_find_genport_target(u32 uid)
+{
+	struct memory_target *target;
+	u32 target_uid;
+	u8 *uid_ptr;
+
+	list_for_each_entry(target, &targets, node) {
+		uid_ptr = target->gen_port_device_handle + 8;
+		target_uid = *(u32 *)uid_ptr;
+		if (uid == target_uid)
+			return target;
+	}
+
+	return NULL;
+}
+
+/**
+ * acpi_get_genport_coordinates - Retrieve the access coordinates for a generic port
+ * @uid: ACPI unique id
+ * @coord: The access coordinates written back out for the generic port
+ *
+ * Return: 0 on success. Errno on failure.
+ *
+ * Only supports device handles that are ACPI. Assume ACPI0016 HID for CXL.
+ */
+int acpi_get_genport_coordinates(u32 uid,
+				 struct access_coordinate *coord)
+{
+	struct memory_target *target;
+
+	guard(mutex)(&target_lock);
+	target = acpi_find_genport_target(uid);
+	if (!target)
+		return -ENOENT;
+
+	*coord = target->coord[NODE_ACCESS_CLASS_GENPORT_SINK];
+
+	return 0;
+}
+EXPORT_SYMBOL_NS_GPL(acpi_get_genport_coordinates, CXL);
+
 static __init void alloc_memory_initiator(unsigned int cpu_pxm)
 {
 	struct memory_initiator *initiator;
diff --git a/include/linux/acpi.h b/include/linux/acpi.h
index 4db54e928b36..8b0761c682f9 100644
--- a/include/linux/acpi.h
+++ b/include/linux/acpi.h
@@ -15,6 +15,7 @@
 #include <linux/mod_devicetable.h>
 #include <linux/property.h>
 #include <linux/uuid.h>
+#include <linux/node.h>
 
 struct irq_domain;
 struct irq_domain_ops;
@@ -424,6 +425,16 @@ extern int acpi_blacklisted(void);
 extern void acpi_osi_setup(char *str);
 extern bool acpi_osi_is_win8(void);
 
+#ifdef CONFIG_ACPI_HMAT
+int acpi_get_genport_coordinates(u32 uid, struct access_coordinate *coord);
+#else
+static inline int acpi_get_genport_coordinates(u32 uid,
+					       struct access_coordinate *coord)
+{
+	return -EOPNOTSUPP;
+}
+#endif
+
 #ifdef CONFIG_ACPI_NUMA
 int acpi_map_pxm_to_node(int pxm);
 int acpi_get_node(acpi_handle handle);
-- 
cgit v1.2.3


From 4d07a05397c8c15c37c8c3abb7afaea1dcd2f0e7 Mon Sep 17 00:00:00 2001
From: Dave Jiang <dave.jiang@intel.com>
Date: Thu, 21 Dec 2023 15:03:39 -0700
Subject: cxl: Calculate and store PCI link latency for the downstream ports

The latency is calculated by dividing the flit size over the bandwidth. Add
support to retrieve the flit size for the CXL switch device and calculate
the latency of the PCIe link. Cache the latency number with cxl_dport.

Reviewed-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
Signed-off-by: Dave Jiang <dave.jiang@intel.com>
Link: https://lore.kernel.org/r/170319621931.2212653.6800240203604822886.stgit@djiang5-mobl3
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 drivers/cxl/core/core.h |  2 ++
 drivers/cxl/core/pci.c  | 36 ++++++++++++++++++++++++++++++++++++
 drivers/cxl/core/port.c |  6 ++++++
 drivers/cxl/cxl.h       |  4 ++++
 drivers/cxl/cxlpci.h    | 13 +++++++++++++
 drivers/pci/pci.c       | 38 ++++++++++++++++++++++++++++++++++++--
 include/linux/pci.h     |  1 +
 7 files changed, 98 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/cxl/core/core.h b/drivers/cxl/core/core.h
index 86d7ba23235e..3b64fb1b9ed0 100644
--- a/drivers/cxl/core/core.h
+++ b/drivers/cxl/core/core.h
@@ -88,4 +88,6 @@ enum cxl_poison_trace_type {
 	CXL_POISON_TRACE_CLEAR,
 };
 
+long cxl_pci_get_latency(struct pci_dev *pdev);
+
 #endif /* __CXL_CORE_H__ */
diff --git a/drivers/cxl/core/pci.c b/drivers/cxl/core/pci.c
index 37e1652afbc7..6c9c8d92f8f7 100644
--- a/drivers/cxl/core/pci.c
+++ b/drivers/cxl/core/pci.c
@@ -1,5 +1,6 @@
 // SPDX-License-Identifier: GPL-2.0-only
 /* Copyright(c) 2021 Intel Corporation. All rights reserved. */
+#include <linux/units.h>
 #include <linux/io-64-nonatomic-lo-hi.h>
 #include <linux/device.h>
 #include <linux/delay.h>
@@ -979,3 +980,38 @@ pci_ers_result_t cxl_error_detected(struct pci_dev *pdev,
 	return PCI_ERS_RESULT_NEED_RESET;
 }
 EXPORT_SYMBOL_NS_GPL(cxl_error_detected, CXL);
+
+static int cxl_flit_size(struct pci_dev *pdev)
+{
+	if (cxl_pci_flit_256(pdev))
+		return 256;
+
+	return 68;
+}
+
+/**
+ * cxl_pci_get_latency - calculate the link latency for the PCIe link
+ * @pdev: PCI device
+ *
+ * return: calculated latency or 0 for no latency
+ *
+ * CXL Memory Device SW Guide v1.0 2.11.4 Link latency calculation
+ * Link latency = LinkPropagationLatency + FlitLatency + RetimerLatency
+ * LinkProgationLatency is negligible, so 0 will be used
+ * RetimerLatency is assumed to be negligible and 0 will be used
+ * FlitLatency = FlitSize / LinkBandwidth
+ * FlitSize is defined by spec. CXL rev3.0 4.2.1.
+ * 68B flit is used up to 32GT/s. >32GT/s, 256B flit size is used.
+ * The FlitLatency is converted to picoseconds.
+ */
+long cxl_pci_get_latency(struct pci_dev *pdev)
+{
+	long bw;
+
+	bw = pcie_link_speed_mbps(pdev);
+	if (bw < 0)
+		return 0;
+	bw /= BITS_PER_BYTE;
+
+	return cxl_flit_size(pdev) * MEGA / bw;
+}
diff --git a/drivers/cxl/core/port.c b/drivers/cxl/core/port.c
index 9393cbf04652..b5ad227fe0d8 100644
--- a/drivers/cxl/core/port.c
+++ b/drivers/cxl/core/port.c
@@ -854,6 +854,9 @@ static struct cxl_port *__devm_cxl_add_port(struct device *host,
 	if (rc)
 		return ERR_PTR(rc);
 
+	if (parent_dport && dev_is_pci(uport_dev))
+		port->pci_latency = cxl_pci_get_latency(to_pci_dev(uport_dev));
+
 	return port;
 
 err:
@@ -1137,6 +1140,9 @@ __devm_cxl_add_dport(struct cxl_port *port, struct device *dport_dev,
 	if (rc)
 		return ERR_PTR(rc);
 
+	if (dev_is_pci(dport_dev))
+		dport->link_latency = cxl_pci_get_latency(to_pci_dev(dport_dev));
+
 	return dport;
 }
 
diff --git a/drivers/cxl/cxl.h b/drivers/cxl/cxl.h
index abbdcd3a7596..7da8db919a20 100644
--- a/drivers/cxl/cxl.h
+++ b/drivers/cxl/cxl.h
@@ -591,6 +591,7 @@ struct cxl_dax_region {
  * @depth: How deep this port is relative to the root. depth 0 is the root.
  * @cdat: Cached CDAT data
  * @cdat_available: Should a CDAT attribute be available in sysfs
+ * @pci_latency: Upstream latency in picoseconds
  */
 struct cxl_port {
 	struct device dev;
@@ -613,6 +614,7 @@ struct cxl_port {
 		size_t length;
 	} cdat;
 	bool cdat_available;
+	long pci_latency;
 };
 
 struct cxl_root_ops {
@@ -659,6 +661,7 @@ struct cxl_rcrb_info {
  * @port: reference to cxl_port that contains this downstream port
  * @regs: Dport parsed register blocks
  * @sw_coord: access coordinates (performance) for switch from CDAT
+ * @link_latency: calculated PCIe downstream latency
  */
 struct cxl_dport {
 	struct device *dport_dev;
@@ -669,6 +672,7 @@ struct cxl_dport {
 	struct cxl_port *port;
 	struct cxl_regs regs;
 	struct access_coordinate sw_coord;
+	long link_latency;
 };
 
 /**
diff --git a/drivers/cxl/cxlpci.h b/drivers/cxl/cxlpci.h
index 0fa4799ea316..711b05d9a370 100644
--- a/drivers/cxl/cxlpci.h
+++ b/drivers/cxl/cxlpci.h
@@ -85,6 +85,19 @@ struct cdat_entry_header {
 	__le16 length;
 } __packed;
 
+/*
+ * CXL v3.0 6.2.3 Table 6-4
+ * The table indicates that if PCIe Flit Mode is set, then CXL is in 256B flits
+ * mode, otherwise it's 68B flits mode.
+ */
+static inline bool cxl_pci_flit_256(struct pci_dev *pdev)
+{
+	u16 lnksta2;
+
+	pcie_capability_read_word(pdev, PCI_EXP_LNKSTA2, &lnksta2);
+	return lnksta2 & PCI_EXP_LNKSTA2_FLIT;
+}
+
 int devm_cxl_port_enumerate_dports(struct cxl_port *port);
 struct cxl_dev_state;
 int cxl_hdm_decode_init(struct cxl_dev_state *cxlds, struct cxl_hdm *cxlhdm,
diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c
index 55bc3576a985..00817d403ad4 100644
--- a/drivers/pci/pci.c
+++ b/drivers/pci/pci.c
@@ -6224,6 +6224,41 @@ int pcie_set_mps(struct pci_dev *dev, int mps)
 }
 EXPORT_SYMBOL(pcie_set_mps);
 
+static enum pci_bus_speed to_pcie_link_speed(u16 lnksta)
+{
+	return pcie_link_speed[FIELD_GET(PCI_EXP_LNKSTA_CLS, lnksta)];
+}
+
+int pcie_link_speed_mbps(struct pci_dev *pdev)
+{
+	u16 lnksta;
+	int err;
+
+	err = pcie_capability_read_word(pdev, PCI_EXP_LNKSTA, &lnksta);
+	if (err)
+		return err;
+
+	switch (to_pcie_link_speed(lnksta)) {
+	case PCIE_SPEED_2_5GT:
+		return 2500;
+	case PCIE_SPEED_5_0GT:
+		return 5000;
+	case PCIE_SPEED_8_0GT:
+		return 8000;
+	case PCIE_SPEED_16_0GT:
+		return 16000;
+	case PCIE_SPEED_32_0GT:
+		return 32000;
+	case PCIE_SPEED_64_0GT:
+		return 64000;
+	default:
+		break;
+	}
+
+	return -EINVAL;
+}
+EXPORT_SYMBOL(pcie_link_speed_mbps);
+
 /**
  * pcie_bandwidth_available - determine minimum link settings of a PCIe
  *			      device and its bandwidth limitation
@@ -6257,8 +6292,7 @@ u32 pcie_bandwidth_available(struct pci_dev *dev, struct pci_dev **limiting_dev,
 	while (dev) {
 		pcie_capability_read_word(dev, PCI_EXP_LNKSTA, &lnksta);
 
-		next_speed = pcie_link_speed[FIELD_GET(PCI_EXP_LNKSTA_CLS,
-						       lnksta)];
+		next_speed = to_pcie_link_speed(lnksta);
 		next_width = FIELD_GET(PCI_EXP_LNKSTA_NLW, lnksta);
 
 		next_bw = next_width * PCIE_SPEED2MBS_ENC(next_speed);
diff --git a/include/linux/pci.h b/include/linux/pci.h
index dea043bc1e38..504a4ba2c29e 100644
--- a/include/linux/pci.h
+++ b/include/linux/pci.h
@@ -1364,6 +1364,7 @@ int pcie_set_mps(struct pci_dev *dev, int mps);
 u32 pcie_bandwidth_available(struct pci_dev *dev, struct pci_dev **limiting_dev,
 			     enum pci_bus_speed *speed,
 			     enum pcie_link_width *width);
+int pcie_link_speed_mbps(struct pci_dev *pdev);
 void pcie_print_link_status(struct pci_dev *dev);
 int pcie_reset_flr(struct pci_dev *dev, bool probe);
 int pcie_flr(struct pci_dev *dev);
-- 
cgit v1.2.3


From dcc3e46472d678f4af5ce1194a23649231c5d241 Mon Sep 17 00:00:00 2001
From: Jonathan Corbet <corbet@lwn.net>
Date: Mon, 18 Dec 2023 17:26:26 -0700
Subject: net: skbuff: Remove some excess struct-member documentation

Remove documentation for nonexistent structure members, addressing these
warnings:

  ./include/linux/skbuff.h:1063: warning: Excess struct member 'sp' description in 'sk_buff'
  ./include/linux/skbuff.h:1063: warning: Excess struct member 'nf_bridge' description in 'sk_buff'

Signed-off-by: Jonathan Corbet <corbet@lwn.net>
Reviewed-by: Randy Dunlap <rdunlap@infradead.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/skbuff.h | 2 --
 1 file changed, 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index ea5c8ab3ed00..50e92c8471dc 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -754,7 +754,6 @@ typedef unsigned char *sk_buff_data_t;
  *	@dev_scratch: (aka @dev) alternate use of @dev when @dev would be %NULL
  *	@cb: Control buffer. Free for use by every layer. Put private vars here
  *	@_skb_refdst: destination entry (with norefcount bit)
- *	@sp: the security path, used for xfrm
  *	@len: Length of actual data
  *	@data_len: Data length
  *	@mac_len: Length of link layer header
@@ -788,7 +787,6 @@ typedef unsigned char *sk_buff_data_t;
  *	@tcp_tsorted_anchor: list structure for TCP (tp->tsorted_sent_queue)
  *	@_sk_redir: socket redirection information for skmsg
  *	@_nfct: Associated connection, if any (with nfctinfo bits)
- *	@nf_bridge: Saved data about a bridged frame - see br_netfilter.c
  *	@skb_iif: ifindex of device we arrived on
  *	@tc_index: Traffic control index
  *	@hash: the packet hash
-- 
cgit v1.2.3


From 520adf3ba4a4bdd41450c57b17ef01f8a069fbfe Mon Sep 17 00:00:00 2001
From: Randy Dunlap <rdunlap@infradead.org>
Date: Fri, 22 Dec 2023 21:05:22 -0800
Subject: driver core: class: fix Excess kernel-doc description warning

Remove the @p: lines to prevent the kernel-doc warning:

include/linux/device/class.h:72: warning: Excess struct member 'p' description in 'class'

Signed-off-by: Randy Dunlap <rdunlap@infradead.org>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: "Rafael J. Wysocki" <rafael@kernel.org>
Link: https://lore.kernel.org/r/20231223050522.13867-1-rdunlap@infradead.org
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 include/linux/device/class.h | 2 --
 1 file changed, 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/device/class.h b/include/linux/device/class.h
index abf3d3bfb6fe..c576b49c55c2 100644
--- a/include/linux/device/class.h
+++ b/include/linux/device/class.h
@@ -40,8 +40,6 @@ struct fwnode_handle;
  *		for the devices belonging to the class. Usually tied to
  *		device's namespace.
  * @pm:		The default device power management operations of this class.
- * @p:		The private data of the driver core, no one other than the
- *		driver core can touch this.
  *
  * A class is a higher-level view of a device that abstracts out low-level
  * implementation details. Drivers may see a SCSI disk or an ATA disk, but,
-- 
cgit v1.2.3


From ae4d90f7ca49eb71f8a3dca64d06d4c4e2193705 Mon Sep 17 00:00:00 2001
From: Randy Dunlap <rdunlap@infradead.org>
Date: Fri, 22 Dec 2023 21:05:32 -0800
Subject: driver core: device.h: fix Excess kernel-doc description warning

Remove the @knode_class: line to prevent the kernel-doc warning:

include/linux/device.h:807: warning: Excess struct member 'knode_class' description in 'device'

Signed-off-by: Randy Dunlap <rdunlap@infradead.org>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: "Rafael J. Wysocki" <rafael@kernel.org>
Link: https://lore.kernel.org/r/20231223050532.13881-1-rdunlap@infradead.org
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 include/linux/device.h | 1 -
 1 file changed, 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/device.h b/include/linux/device.h
index aefc5ca7f1cf..ed600dbf950e 100644
--- a/include/linux/device.h
+++ b/include/linux/device.h
@@ -662,7 +662,6 @@ struct device_physical_location {
  * @id:		device instance
  * @devres_lock: Spinlock to protect the resource of the device.
  * @devres_head: The resources list of the device.
- * @knode_class: The node used to add the device to the class list.
  * @class:	The class of the device.
  * @groups:	Optional attribute groups.
  * @release:	Callback to free the device after all references have
-- 
cgit v1.2.3


From 1760bfa7d7ca490cf8a61fe50ddeb1769cadd89e Mon Sep 17 00:00:00 2001
From: Randy Dunlap <rdunlap@infradead.org>
Date: Fri, 22 Dec 2023 21:06:36 -0800
Subject: usb: linux/usb.h: fix Excess kernel-doc description warning

Remove the @removable: line to prevent the kernel-doc warning:

include/linux/usb.h:732: warning: Excess struct member 'removable' description in 'usb_device'

Signed-off-by: Randy Dunlap <rdunlap@infradead.org>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: linux-usb@vger.kernel.org
Link: https://lore.kernel.org/r/20231223050636.14022-1-rdunlap@infradead.org
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 include/linux/usb.h | 1 -
 1 file changed, 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/usb.h b/include/linux/usb.h
index 618e5a0b1a22..07556341ba2b 100644
--- a/include/linux/usb.h
+++ b/include/linux/usb.h
@@ -632,7 +632,6 @@ struct usb3_lpm_parameters {
  * @reset_resume: needs reset instead of resume
  * @port_is_suspended: the upstream port is suspended (L2 or U3)
  * @slot_id: Slot ID assigned by xHCI
- * @removable: Device can be physically removed from this port
  * @l1_params: best effor service latency for USB2 L1 LPM state, and L1 timeout.
  * @u1_params: exit latencies for USB3 U1 LPM state, and hub-initiated timeout.
  * @u2_params: exit latencies for USB3 U2 LPM state, and hub-initiated timeout.
-- 
cgit v1.2.3


From f6847807c22f6944c71c981b630b9fff30801e73 Mon Sep 17 00:00:00 2001
From: Helge Deller <deller@gmx.de>
Date: Wed, 22 Nov 2023 23:18:11 +0100
Subject: linux/export: Fix alignment for 64-bit ksymtab entries

An alignment of 4 bytes is wrong for 64-bit platforms which don't define
CONFIG_HAVE_ARCH_PREL32_RELOCATIONS (which then store 64-bit pointers).
Fix their alignment to 8 bytes.

Fixes: ddb5cdbafaaa ("kbuild: generate KSYMTAB entries by modpost")
Signed-off-by: Helge Deller <deller@gmx.de>
Signed-off-by: Masahiro Yamada <masahiroy@kernel.org>
---
 include/linux/export-internal.h | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/export-internal.h b/include/linux/export-internal.h
index 69501e0ec239..cd253eb51d6c 100644
--- a/include/linux/export-internal.h
+++ b/include/linux/export-internal.h
@@ -16,10 +16,13 @@
  * and eliminates the need for absolute relocations that require runtime
  * processing on relocatable kernels.
  */
+#define __KSYM_ALIGN		".balign 4"
 #define __KSYM_REF(sym)		".long " #sym "- ."
 #elif defined(CONFIG_64BIT)
+#define __KSYM_ALIGN		".balign 8"
 #define __KSYM_REF(sym)		".quad " #sym
 #else
+#define __KSYM_ALIGN		".balign 4"
 #define __KSYM_REF(sym)		".long " #sym
 #endif
 
@@ -42,7 +45,7 @@
 	    "	.asciz \"" ns "\""					"\n"	\
 	    "	.previous"						"\n"	\
 	    "	.section \"___ksymtab" sec "+" #name "\", \"a\""	"\n"	\
-	    "	.balign	4"						"\n"	\
+		__KSYM_ALIGN						"\n"	\
 	    "__ksymtab_" #name ":"					"\n"	\
 		__KSYM_REF(sym)						"\n"	\
 		__KSYM_REF(__kstrtab_ ##name)				"\n"	\
-- 
cgit v1.2.3


From f91a704f7161c2cf0fcd41fa9fbec4355b813fff Mon Sep 17 00:00:00 2001
From: Amir Goldstein <amir73il@gmail.com>
Date: Mon, 2 Oct 2023 17:19:46 +0300
Subject: fs: prepare for stackable filesystems backing file helpers

In preparation for factoring out some backing file io helpers from
overlayfs, move backing_file_open() into a new file fs/backing-file.c
and header.

Add a MAINTAINERS entry for stackable filesystems and add a Kconfig
FS_STACK which stackable filesystems need to select.

For now, the backing_file struct, the backing_file alloc/free functions
and the backing_file_real_path() accessor remain internal to file_table.c.
We may change that in the future.

Signed-off-by: Amir Goldstein <amir73il@gmail.com>
---
 MAINTAINERS                  |  9 +++++++++
 fs/Kconfig                   |  4 ++++
 fs/Makefile                  |  1 +
 fs/backing-file.c            | 48 ++++++++++++++++++++++++++++++++++++++++++++
 fs/open.c                    | 38 -----------------------------------
 fs/overlayfs/Kconfig         |  1 +
 fs/overlayfs/file.c          |  1 +
 include/linux/backing-file.h | 17 ++++++++++++++++
 include/linux/fs.h           |  3 ---
 9 files changed, 81 insertions(+), 41 deletions(-)
 create mode 100644 fs/backing-file.c
 create mode 100644 include/linux/backing-file.h

(limited to 'include/linux')

diff --git a/MAINTAINERS b/MAINTAINERS
index 97f51d5ec1cf..d9ae2688f5c8 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -8186,6 +8186,15 @@ S:	Supported
 F:	fs/iomap/
 F:	include/linux/iomap.h
 
+FILESYSTEMS [STACKABLE]
+M:	Miklos Szeredi <miklos@szeredi.hu>
+M:	Amir Goldstein <amir73il@gmail.com>
+L:	linux-fsdevel@vger.kernel.org
+L:	linux-unionfs@vger.kernel.org
+S:	Maintained
+F:	fs/backing-file.c
+F:	include/linux/backing-file.h
+
 FINTEK F75375S HARDWARE MONITOR AND FAN CONTROLLER DRIVER
 M:	Riku Voipio <riku.voipio@iki.fi>
 L:	linux-hwmon@vger.kernel.org
diff --git a/fs/Kconfig b/fs/Kconfig
index fd1f655b4f1f..c47fa4eb9282 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -18,6 +18,10 @@ config VALIDATE_FS_PARSER
 config FS_IOMAP
 	bool
 
+# Stackable filesystems
+config FS_STACK
+	bool
+
 config BUFFER_HEAD
 	bool
 
diff --git a/fs/Makefile b/fs/Makefile
index 75522f88e763..a6962c588962 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -39,6 +39,7 @@ obj-$(CONFIG_COMPAT_BINFMT_ELF)	+= compat_binfmt_elf.o
 obj-$(CONFIG_BINFMT_ELF_FDPIC)	+= binfmt_elf_fdpic.o
 obj-$(CONFIG_BINFMT_FLAT)	+= binfmt_flat.o
 
+obj-$(CONFIG_FS_STACK)		+= backing-file.o
 obj-$(CONFIG_FS_MBCACHE)	+= mbcache.o
 obj-$(CONFIG_FS_POSIX_ACL)	+= posix_acl.o
 obj-$(CONFIG_NFS_COMMON)	+= nfs_common/
diff --git a/fs/backing-file.c b/fs/backing-file.c
new file mode 100644
index 000000000000..04b33036f709
--- /dev/null
+++ b/fs/backing-file.c
@@ -0,0 +1,48 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Common helpers for stackable filesystems and backing files.
+ *
+ * Copyright (C) 2023 CTERA Networks.
+ */
+
+#include <linux/fs.h>
+#include <linux/backing-file.h>
+
+#include "internal.h"
+
+/**
+ * backing_file_open - open a backing file for kernel internal use
+ * @user_path:	path that the user reuqested to open
+ * @flags:	open flags
+ * @real_path:	path of the backing file
+ * @cred:	credentials for open
+ *
+ * Open a backing file for a stackable filesystem (e.g., overlayfs).
+ * @user_path may be on the stackable filesystem and @real_path on the
+ * underlying filesystem.  In this case, we want to be able to return the
+ * @user_path of the stackable filesystem. This is done by embedding the
+ * returned file into a container structure that also stores the stacked
+ * file's path, which can be retrieved using backing_file_user_path().
+ */
+struct file *backing_file_open(const struct path *user_path, int flags,
+			       const struct path *real_path,
+			       const struct cred *cred)
+{
+	struct file *f;
+	int error;
+
+	f = alloc_empty_backing_file(flags, cred);
+	if (IS_ERR(f))
+		return f;
+
+	path_get(user_path);
+	*backing_file_user_path(f) = *user_path;
+	error = vfs_open(real_path, f);
+	if (error) {
+		fput(f);
+		f = ERR_PTR(error);
+	}
+
+	return f;
+}
+EXPORT_SYMBOL_GPL(backing_file_open);
diff --git a/fs/open.c b/fs/open.c
index d877228d5939..a75054237437 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -1184,44 +1184,6 @@ struct file *kernel_file_open(const struct path *path, int flags,
 }
 EXPORT_SYMBOL_GPL(kernel_file_open);
 
-/**
- * backing_file_open - open a backing file for kernel internal use
- * @user_path:	path that the user reuqested to open
- * @flags:	open flags
- * @real_path:	path of the backing file
- * @cred:	credentials for open
- *
- * Open a backing file for a stackable filesystem (e.g., overlayfs).
- * @user_path may be on the stackable filesystem and @real_path on the
- * underlying filesystem.  In this case, we want to be able to return the
- * @user_path of the stackable filesystem. This is done by embedding the
- * returned file into a container structure that also stores the stacked
- * file's path, which can be retrieved using backing_file_user_path().
- */
-struct file *backing_file_open(const struct path *user_path, int flags,
-			       const struct path *real_path,
-			       const struct cred *cred)
-{
-	struct file *f;
-	int error;
-
-	f = alloc_empty_backing_file(flags, cred);
-	if (IS_ERR(f))
-		return f;
-
-	path_get(user_path);
-	*backing_file_user_path(f) = *user_path;
-	f->f_path = *real_path;
-	error = do_dentry_open(f, d_inode(real_path->dentry), NULL);
-	if (error) {
-		fput(f);
-		f = ERR_PTR(error);
-	}
-
-	return f;
-}
-EXPORT_SYMBOL_GPL(backing_file_open);
-
 #define WILL_CREATE(flags)	(flags & (O_CREAT | __O_TMPFILE))
 #define O_PATH_FLAGS		(O_DIRECTORY | O_NOFOLLOW | O_PATH | O_CLOEXEC)
 
diff --git a/fs/overlayfs/Kconfig b/fs/overlayfs/Kconfig
index fec5020c3495..2ac67e04a6fb 100644
--- a/fs/overlayfs/Kconfig
+++ b/fs/overlayfs/Kconfig
@@ -1,6 +1,7 @@
 # SPDX-License-Identifier: GPL-2.0-only
 config OVERLAY_FS
 	tristate "Overlay filesystem support"
+	select FS_STACK
 	select EXPORTFS
 	help
 	  An overlay filesystem combines two filesystems - an 'upper' filesystem
diff --git a/fs/overlayfs/file.c b/fs/overlayfs/file.c
index 4e46420c8fdd..a6da3eaf6d4f 100644
--- a/fs/overlayfs/file.c
+++ b/fs/overlayfs/file.c
@@ -13,6 +13,7 @@
 #include <linux/security.h>
 #include <linux/mm.h>
 #include <linux/fs.h>
+#include <linux/backing-file.h>
 #include "overlayfs.h"
 
 #include "../internal.h"	/* for sb_init_dio_done_wq */
diff --git a/include/linux/backing-file.h b/include/linux/backing-file.h
new file mode 100644
index 000000000000..55c9e804f780
--- /dev/null
+++ b/include/linux/backing-file.h
@@ -0,0 +1,17 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Common helpers for stackable filesystems and backing files.
+ *
+ * Copyright (C) 2023 CTERA Networks.
+ */
+
+#ifndef _LINUX_BACKING_FILE_H
+#define _LINUX_BACKING_FILE_H
+
+#include <linux/file.h>
+
+struct file *backing_file_open(const struct path *user_path, int flags,
+			       const struct path *real_path,
+			       const struct cred *cred);
+
+#endif /* _LINUX_BACKING_FILE_H */
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 900d0cd55b50..db5d07e6e02e 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -2575,9 +2575,6 @@ struct file *dentry_open(const struct path *path, int flags,
 			 const struct cred *creds);
 struct file *dentry_create(const struct path *path, int flags, umode_t mode,
 			   const struct cred *cred);
-struct file *backing_file_open(const struct path *user_path, int flags,
-			       const struct path *real_path,
-			       const struct cred *cred);
 struct path *backing_file_user_path(struct file *f);
 
 /*
-- 
cgit v1.2.3


From a6293b3e285cd0d7692141d7981a5f144f0e2f0b Mon Sep 17 00:00:00 2001
From: Amir Goldstein <amir73il@gmail.com>
Date: Wed, 22 Nov 2023 17:48:52 +0200
Subject: fs: factor out backing_file_{read,write}_iter() helpers

Overlayfs submits files io to backing files on other filesystems.
Factor out some common helpers to perform io to backing files, into
fs/backing-file.c.

Suggested-by: Miklos Szeredi <miklos@szeredi.hu>
Link: https://lore.kernel.org/r/CAJfpeguhmZbjP3JLqtUy0AdWaHOkAPWeP827BBWwRFEAUgnUcQ@mail.gmail.com
Signed-off-by: Amir Goldstein <amir73il@gmail.com>
---
 fs/backing-file.c            | 210 +++++++++++++++++++++++++++++++++++++++++++
 fs/overlayfs/file.c          | 188 +++-----------------------------------
 fs/overlayfs/overlayfs.h     |   8 +-
 fs/overlayfs/super.c         |  11 +--
 include/linux/backing-file.h |  15 ++++
 5 files changed, 247 insertions(+), 185 deletions(-)

(limited to 'include/linux')

diff --git a/fs/backing-file.c b/fs/backing-file.c
index 04b33036f709..323187a49da3 100644
--- a/fs/backing-file.c
+++ b/fs/backing-file.c
@@ -2,6 +2,9 @@
 /*
  * Common helpers for stackable filesystems and backing files.
  *
+ * Forked from fs/overlayfs/file.c.
+ *
+ * Copyright (C) 2017 Red Hat, Inc.
  * Copyright (C) 2023 CTERA Networks.
  */
 
@@ -46,3 +49,210 @@ struct file *backing_file_open(const struct path *user_path, int flags,
 	return f;
 }
 EXPORT_SYMBOL_GPL(backing_file_open);
+
+struct backing_aio {
+	struct kiocb iocb;
+	refcount_t ref;
+	struct kiocb *orig_iocb;
+	/* used for aio completion */
+	void (*end_write)(struct file *);
+	struct work_struct work;
+	long res;
+};
+
+static struct kmem_cache *backing_aio_cachep;
+
+#define BACKING_IOCB_MASK \
+	(IOCB_NOWAIT | IOCB_HIPRI | IOCB_DSYNC | IOCB_SYNC | IOCB_APPEND)
+
+static rwf_t iocb_to_rw_flags(int flags)
+{
+	return (__force rwf_t)(flags & BACKING_IOCB_MASK);
+}
+
+static void backing_aio_put(struct backing_aio *aio)
+{
+	if (refcount_dec_and_test(&aio->ref)) {
+		fput(aio->iocb.ki_filp);
+		kmem_cache_free(backing_aio_cachep, aio);
+	}
+}
+
+static void backing_aio_cleanup(struct backing_aio *aio, long res)
+{
+	struct kiocb *iocb = &aio->iocb;
+	struct kiocb *orig_iocb = aio->orig_iocb;
+
+	if (aio->end_write)
+		aio->end_write(orig_iocb->ki_filp);
+
+	orig_iocb->ki_pos = iocb->ki_pos;
+	backing_aio_put(aio);
+}
+
+static void backing_aio_rw_complete(struct kiocb *iocb, long res)
+{
+	struct backing_aio *aio = container_of(iocb, struct backing_aio, iocb);
+	struct kiocb *orig_iocb = aio->orig_iocb;
+
+	if (iocb->ki_flags & IOCB_WRITE)
+		kiocb_end_write(iocb);
+
+	backing_aio_cleanup(aio, res);
+	orig_iocb->ki_complete(orig_iocb, res);
+}
+
+static void backing_aio_complete_work(struct work_struct *work)
+{
+	struct backing_aio *aio = container_of(work, struct backing_aio, work);
+
+	backing_aio_rw_complete(&aio->iocb, aio->res);
+}
+
+static void backing_aio_queue_completion(struct kiocb *iocb, long res)
+{
+	struct backing_aio *aio = container_of(iocb, struct backing_aio, iocb);
+
+	/*
+	 * Punt to a work queue to serialize updates of mtime/size.
+	 */
+	aio->res = res;
+	INIT_WORK(&aio->work, backing_aio_complete_work);
+	queue_work(file_inode(aio->orig_iocb->ki_filp)->i_sb->s_dio_done_wq,
+		   &aio->work);
+}
+
+static int backing_aio_init_wq(struct kiocb *iocb)
+{
+	struct super_block *sb = file_inode(iocb->ki_filp)->i_sb;
+
+	if (sb->s_dio_done_wq)
+		return 0;
+
+	return sb_init_dio_done_wq(sb);
+}
+
+
+ssize_t backing_file_read_iter(struct file *file, struct iov_iter *iter,
+			       struct kiocb *iocb, int flags,
+			       struct backing_file_ctx *ctx)
+{
+	struct backing_aio *aio = NULL;
+	const struct cred *old_cred;
+	ssize_t ret;
+
+	if (WARN_ON_ONCE(!(file->f_mode & FMODE_BACKING)))
+		return -EIO;
+
+	if (!iov_iter_count(iter))
+		return 0;
+
+	if (iocb->ki_flags & IOCB_DIRECT &&
+	    !(file->f_mode & FMODE_CAN_ODIRECT))
+		return -EINVAL;
+
+	old_cred = override_creds(ctx->cred);
+	if (is_sync_kiocb(iocb)) {
+		rwf_t rwf = iocb_to_rw_flags(flags);
+
+		ret = vfs_iter_read(file, iter, &iocb->ki_pos, rwf);
+	} else {
+		ret = -ENOMEM;
+		aio = kmem_cache_zalloc(backing_aio_cachep, GFP_KERNEL);
+		if (!aio)
+			goto out;
+
+		aio->orig_iocb = iocb;
+		kiocb_clone(&aio->iocb, iocb, get_file(file));
+		aio->iocb.ki_complete = backing_aio_rw_complete;
+		refcount_set(&aio->ref, 2);
+		ret = vfs_iocb_iter_read(file, &aio->iocb, iter);
+		backing_aio_put(aio);
+		if (ret != -EIOCBQUEUED)
+			backing_aio_cleanup(aio, ret);
+	}
+out:
+	revert_creds(old_cred);
+
+	if (ctx->accessed)
+		ctx->accessed(ctx->user_file);
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(backing_file_read_iter);
+
+ssize_t backing_file_write_iter(struct file *file, struct iov_iter *iter,
+				struct kiocb *iocb, int flags,
+				struct backing_file_ctx *ctx)
+{
+	const struct cred *old_cred;
+	ssize_t ret;
+
+	if (WARN_ON_ONCE(!(file->f_mode & FMODE_BACKING)))
+		return -EIO;
+
+	if (!iov_iter_count(iter))
+		return 0;
+
+	ret = file_remove_privs(ctx->user_file);
+	if (ret)
+		return ret;
+
+	if (iocb->ki_flags & IOCB_DIRECT &&
+	    !(file->f_mode & FMODE_CAN_ODIRECT))
+		return -EINVAL;
+
+	/*
+	 * Stacked filesystems don't support deferred completions, don't copy
+	 * this property in case it is set by the issuer.
+	 */
+	flags &= ~IOCB_DIO_CALLER_COMP;
+
+	old_cred = override_creds(ctx->cred);
+	if (is_sync_kiocb(iocb)) {
+		rwf_t rwf = iocb_to_rw_flags(flags);
+
+		ret = vfs_iter_write(file, iter, &iocb->ki_pos, rwf);
+		if (ctx->end_write)
+			ctx->end_write(ctx->user_file);
+	} else {
+		struct backing_aio *aio;
+
+		ret = backing_aio_init_wq(iocb);
+		if (ret)
+			goto out;
+
+		ret = -ENOMEM;
+		aio = kmem_cache_zalloc(backing_aio_cachep, GFP_KERNEL);
+		if (!aio)
+			goto out;
+
+		aio->orig_iocb = iocb;
+		aio->end_write = ctx->end_write;
+		kiocb_clone(&aio->iocb, iocb, get_file(file));
+		aio->iocb.ki_flags = flags;
+		aio->iocb.ki_complete = backing_aio_queue_completion;
+		refcount_set(&aio->ref, 2);
+		ret = vfs_iocb_iter_write(file, &aio->iocb, iter);
+		backing_aio_put(aio);
+		if (ret != -EIOCBQUEUED)
+			backing_aio_cleanup(aio, ret);
+	}
+out:
+	revert_creds(old_cred);
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(backing_file_write_iter);
+
+static int __init backing_aio_init(void)
+{
+	backing_aio_cachep = kmem_cache_create("backing_aio",
+					       sizeof(struct backing_aio),
+					       0, SLAB_HWCACHE_ALIGN, NULL);
+	if (!backing_aio_cachep)
+		return -ENOMEM;
+
+	return 0;
+}
+fs_initcall(backing_aio_init);
diff --git a/fs/overlayfs/file.c b/fs/overlayfs/file.c
index a6da3eaf6d4f..1b578cb27a26 100644
--- a/fs/overlayfs/file.c
+++ b/fs/overlayfs/file.c
@@ -16,19 +16,6 @@
 #include <linux/backing-file.h>
 #include "overlayfs.h"
 
-#include "../internal.h"	/* for sb_init_dio_done_wq */
-
-struct ovl_aio_req {
-	struct kiocb iocb;
-	refcount_t ref;
-	struct kiocb *orig_iocb;
-	/* used for aio completion */
-	struct work_struct work;
-	long res;
-};
-
-static struct kmem_cache *ovl_aio_request_cachep;
-
 static char ovl_whatisit(struct inode *inode, struct inode *realinode)
 {
 	if (realinode != ovl_inode_upper(inode))
@@ -275,84 +262,16 @@ static void ovl_file_accessed(struct file *file)
 	touch_atime(&file->f_path);
 }
 
-#define OVL_IOCB_MASK \
-	(IOCB_NOWAIT | IOCB_HIPRI | IOCB_DSYNC | IOCB_SYNC | IOCB_APPEND)
-
-static rwf_t iocb_to_rw_flags(int flags)
-{
-	return (__force rwf_t)(flags & OVL_IOCB_MASK);
-}
-
-static inline void ovl_aio_put(struct ovl_aio_req *aio_req)
-{
-	if (refcount_dec_and_test(&aio_req->ref)) {
-		fput(aio_req->iocb.ki_filp);
-		kmem_cache_free(ovl_aio_request_cachep, aio_req);
-	}
-}
-
-static void ovl_aio_cleanup_handler(struct ovl_aio_req *aio_req)
-{
-	struct kiocb *iocb = &aio_req->iocb;
-	struct kiocb *orig_iocb = aio_req->orig_iocb;
-
-	if (iocb->ki_flags & IOCB_WRITE)
-		ovl_file_modified(orig_iocb->ki_filp);
-
-	orig_iocb->ki_pos = iocb->ki_pos;
-	ovl_aio_put(aio_req);
-}
-
-static void ovl_aio_rw_complete(struct kiocb *iocb, long res)
-{
-	struct ovl_aio_req *aio_req = container_of(iocb,
-						   struct ovl_aio_req, iocb);
-	struct kiocb *orig_iocb = aio_req->orig_iocb;
-
-	if (iocb->ki_flags & IOCB_WRITE)
-		kiocb_end_write(iocb);
-
-	ovl_aio_cleanup_handler(aio_req);
-	orig_iocb->ki_complete(orig_iocb, res);
-}
-
-static void ovl_aio_complete_work(struct work_struct *work)
-{
-	struct ovl_aio_req *aio_req = container_of(work,
-						   struct ovl_aio_req, work);
-
-	ovl_aio_rw_complete(&aio_req->iocb, aio_req->res);
-}
-
-static void ovl_aio_queue_completion(struct kiocb *iocb, long res)
-{
-	struct ovl_aio_req *aio_req = container_of(iocb,
-						   struct ovl_aio_req, iocb);
-	struct kiocb *orig_iocb = aio_req->orig_iocb;
-
-	/*
-	 * Punt to a work queue to serialize updates of mtime/size.
-	 */
-	aio_req->res = res;
-	INIT_WORK(&aio_req->work, ovl_aio_complete_work);
-	queue_work(file_inode(orig_iocb->ki_filp)->i_sb->s_dio_done_wq,
-		   &aio_req->work);
-}
-
-static int ovl_init_aio_done_wq(struct super_block *sb)
-{
-	if (sb->s_dio_done_wq)
-		return 0;
-
-	return sb_init_dio_done_wq(sb);
-}
-
 static ssize_t ovl_read_iter(struct kiocb *iocb, struct iov_iter *iter)
 {
 	struct file *file = iocb->ki_filp;
 	struct fd real;
-	const struct cred *old_cred;
 	ssize_t ret;
+	struct backing_file_ctx ctx = {
+		.cred = ovl_creds(file_inode(file)->i_sb),
+		.user_file = file,
+		.accessed = ovl_file_accessed,
+	};
 
 	if (!iov_iter_count(iter))
 		return 0;
@@ -361,37 +280,8 @@ static ssize_t ovl_read_iter(struct kiocb *iocb, struct iov_iter *iter)
 	if (ret)
 		return ret;
 
-	ret = -EINVAL;
-	if (iocb->ki_flags & IOCB_DIRECT &&
-	    !(real.file->f_mode & FMODE_CAN_ODIRECT))
-		goto out_fdput;
-
-	old_cred = ovl_override_creds(file_inode(file)->i_sb);
-	if (is_sync_kiocb(iocb)) {
-		rwf_t rwf = iocb_to_rw_flags(iocb->ki_flags);
-
-		ret = vfs_iter_read(real.file, iter, &iocb->ki_pos, rwf);
-	} else {
-		struct ovl_aio_req *aio_req;
-
-		ret = -ENOMEM;
-		aio_req = kmem_cache_zalloc(ovl_aio_request_cachep, GFP_KERNEL);
-		if (!aio_req)
-			goto out;
-
-		aio_req->orig_iocb = iocb;
-		kiocb_clone(&aio_req->iocb, iocb, get_file(real.file));
-		aio_req->iocb.ki_complete = ovl_aio_rw_complete;
-		refcount_set(&aio_req->ref, 2);
-		ret = vfs_iocb_iter_read(real.file, &aio_req->iocb, iter);
-		ovl_aio_put(aio_req);
-		if (ret != -EIOCBQUEUED)
-			ovl_aio_cleanup_handler(aio_req);
-	}
-out:
-	revert_creds(old_cred);
-	ovl_file_accessed(file);
-out_fdput:
+	ret = backing_file_read_iter(real.file, iter, iocb, iocb->ki_flags,
+				     &ctx);
 	fdput(real);
 
 	return ret;
@@ -402,9 +292,13 @@ static ssize_t ovl_write_iter(struct kiocb *iocb, struct iov_iter *iter)
 	struct file *file = iocb->ki_filp;
 	struct inode *inode = file_inode(file);
 	struct fd real;
-	const struct cred *old_cred;
 	ssize_t ret;
 	int ifl = iocb->ki_flags;
+	struct backing_file_ctx ctx = {
+		.cred = ovl_creds(inode->i_sb),
+		.user_file = file,
+		.end_write = ovl_file_modified,
+	};
 
 	if (!iov_iter_count(iter))
 		return 0;
@@ -412,19 +306,11 @@ static ssize_t ovl_write_iter(struct kiocb *iocb, struct iov_iter *iter)
 	inode_lock(inode);
 	/* Update mode */
 	ovl_copyattr(inode);
-	ret = file_remove_privs(file);
-	if (ret)
-		goto out_unlock;
 
 	ret = ovl_real_fdget(file, &real);
 	if (ret)
 		goto out_unlock;
 
-	ret = -EINVAL;
-	if (iocb->ki_flags & IOCB_DIRECT &&
-	    !(real.file->f_mode & FMODE_CAN_ODIRECT))
-		goto out_fdput;
-
 	if (!ovl_should_sync(OVL_FS(inode->i_sb)))
 		ifl &= ~(IOCB_DSYNC | IOCB_SYNC);
 
@@ -433,39 +319,7 @@ static ssize_t ovl_write_iter(struct kiocb *iocb, struct iov_iter *iter)
 	 * this property in case it is set by the issuer.
 	 */
 	ifl &= ~IOCB_DIO_CALLER_COMP;
-
-	old_cred = ovl_override_creds(file_inode(file)->i_sb);
-	if (is_sync_kiocb(iocb)) {
-		rwf_t rwf = iocb_to_rw_flags(ifl);
-
-		ret = vfs_iter_write(real.file, iter, &iocb->ki_pos, rwf);
-		/* Update size */
-		ovl_file_modified(file);
-	} else {
-		struct ovl_aio_req *aio_req;
-
-		ret = ovl_init_aio_done_wq(inode->i_sb);
-		if (ret)
-			goto out;
-
-		ret = -ENOMEM;
-		aio_req = kmem_cache_zalloc(ovl_aio_request_cachep, GFP_KERNEL);
-		if (!aio_req)
-			goto out;
-
-		aio_req->orig_iocb = iocb;
-		kiocb_clone(&aio_req->iocb, iocb, get_file(real.file));
-		aio_req->iocb.ki_flags = ifl;
-		aio_req->iocb.ki_complete = ovl_aio_queue_completion;
-		refcount_set(&aio_req->ref, 2);
-		ret = vfs_iocb_iter_write(real.file, &aio_req->iocb, iter);
-		ovl_aio_put(aio_req);
-		if (ret != -EIOCBQUEUED)
-			ovl_aio_cleanup_handler(aio_req);
-	}
-out:
-	revert_creds(old_cred);
-out_fdput:
+	ret = backing_file_write_iter(real.file, iter, iocb, ifl, &ctx);
 	fdput(real);
 
 out_unlock:
@@ -777,19 +631,3 @@ const struct file_operations ovl_file_operations = {
 	.copy_file_range	= ovl_copy_file_range,
 	.remap_file_range	= ovl_remap_file_range,
 };
-
-int __init ovl_aio_request_cache_init(void)
-{
-	ovl_aio_request_cachep = kmem_cache_create("ovl_aio_req",
-						   sizeof(struct ovl_aio_req),
-						   0, SLAB_HWCACHE_ALIGN, NULL);
-	if (!ovl_aio_request_cachep)
-		return -ENOMEM;
-
-	return 0;
-}
-
-void ovl_aio_request_cache_destroy(void)
-{
-	kmem_cache_destroy(ovl_aio_request_cachep);
-}
diff --git a/fs/overlayfs/overlayfs.h b/fs/overlayfs/overlayfs.h
index ca88b2636a57..509a57c85fae 100644
--- a/fs/overlayfs/overlayfs.h
+++ b/fs/overlayfs/overlayfs.h
@@ -417,6 +417,12 @@ int ovl_want_write(struct dentry *dentry);
 void ovl_drop_write(struct dentry *dentry);
 struct dentry *ovl_workdir(struct dentry *dentry);
 const struct cred *ovl_override_creds(struct super_block *sb);
+
+static inline const struct cred *ovl_creds(struct super_block *sb)
+{
+	return OVL_FS(sb)->creator_cred;
+}
+
 int ovl_can_decode_fh(struct super_block *sb);
 struct dentry *ovl_indexdir(struct super_block *sb);
 bool ovl_index_all(struct super_block *sb);
@@ -829,8 +835,6 @@ struct dentry *ovl_create_temp(struct ovl_fs *ofs, struct dentry *workdir,
 
 /* file.c */
 extern const struct file_operations ovl_file_operations;
-int __init ovl_aio_request_cache_init(void);
-void ovl_aio_request_cache_destroy(void);
 int ovl_real_fileattr_get(const struct path *realpath, struct fileattr *fa);
 int ovl_real_fileattr_set(const struct path *realpath, struct fileattr *fa);
 int ovl_fileattr_get(struct dentry *dentry, struct fileattr *fa);
diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c
index a0967bb25003..bcd4c314a7eb 100644
--- a/fs/overlayfs/super.c
+++ b/fs/overlayfs/super.c
@@ -1501,14 +1501,10 @@ static int __init ovl_init(void)
 	if (ovl_inode_cachep == NULL)
 		return -ENOMEM;
 
-	err = ovl_aio_request_cache_init();
-	if (!err) {
-		err = register_filesystem(&ovl_fs_type);
-		if (!err)
-			return 0;
+	err = register_filesystem(&ovl_fs_type);
+	if (!err)
+		return 0;
 
-		ovl_aio_request_cache_destroy();
-	}
 	kmem_cache_destroy(ovl_inode_cachep);
 
 	return err;
@@ -1524,7 +1520,6 @@ static void __exit ovl_exit(void)
 	 */
 	rcu_barrier();
 	kmem_cache_destroy(ovl_inode_cachep);
-	ovl_aio_request_cache_destroy();
 }
 
 module_init(ovl_init);
diff --git a/include/linux/backing-file.h b/include/linux/backing-file.h
index 55c9e804f780..0648d548a418 100644
--- a/include/linux/backing-file.h
+++ b/include/linux/backing-file.h
@@ -9,9 +9,24 @@
 #define _LINUX_BACKING_FILE_H
 
 #include <linux/file.h>
+#include <linux/uio.h>
+#include <linux/fs.h>
+
+struct backing_file_ctx {
+	const struct cred *cred;
+	struct file *user_file;
+	void (*accessed)(struct file *);
+	void (*end_write)(struct file *);
+};
 
 struct file *backing_file_open(const struct path *user_path, int flags,
 			       const struct path *real_path,
 			       const struct cred *cred);
+ssize_t backing_file_read_iter(struct file *file, struct iov_iter *iter,
+			       struct kiocb *iocb, int flags,
+			       struct backing_file_ctx *ctx);
+ssize_t backing_file_write_iter(struct file *file, struct iov_iter *iter,
+				struct kiocb *iocb, int flags,
+				struct backing_file_ctx *ctx);
 
 #endif /* _LINUX_BACKING_FILE_H */
-- 
cgit v1.2.3


From 9b7e9e2f5d5c3d079ec46bc71b114012e362ea6e Mon Sep 17 00:00:00 2001
From: Amir Goldstein <amir73il@gmail.com>
Date: Fri, 13 Oct 2023 12:13:12 +0300
Subject: fs: factor out backing_file_splice_{read,write}() helpers

There is not much in those helpers, but it makes sense to have them
logically next to the backing_file_{read,write}_iter() helpers as they
may grow more common logic in the future.

Signed-off-by: Amir Goldstein <amir73il@gmail.com>
---
 fs/backing-file.c            | 51 ++++++++++++++++++++++++++++++++++++++++++++
 fs/overlayfs/file.c          | 33 +++++++++++-----------------
 include/linux/backing-file.h |  8 +++++++
 3 files changed, 72 insertions(+), 20 deletions(-)

(limited to 'include/linux')

diff --git a/fs/backing-file.c b/fs/backing-file.c
index 323187a49da3..ddd35c1d6c71 100644
--- a/fs/backing-file.c
+++ b/fs/backing-file.c
@@ -10,6 +10,7 @@
 
 #include <linux/fs.h>
 #include <linux/backing-file.h>
+#include <linux/splice.h>
 
 #include "internal.h"
 
@@ -245,6 +246,56 @@ out:
 }
 EXPORT_SYMBOL_GPL(backing_file_write_iter);
 
+ssize_t backing_file_splice_read(struct file *in, loff_t *ppos,
+				 struct pipe_inode_info *pipe, size_t len,
+				 unsigned int flags,
+				 struct backing_file_ctx *ctx)
+{
+	const struct cred *old_cred;
+	ssize_t ret;
+
+	if (WARN_ON_ONCE(!(in->f_mode & FMODE_BACKING)))
+		return -EIO;
+
+	old_cred = override_creds(ctx->cred);
+	ret = vfs_splice_read(in, ppos, pipe, len, flags);
+	revert_creds(old_cred);
+
+	if (ctx->accessed)
+		ctx->accessed(ctx->user_file);
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(backing_file_splice_read);
+
+ssize_t backing_file_splice_write(struct pipe_inode_info *pipe,
+				  struct file *out, loff_t *ppos, size_t len,
+				  unsigned int flags,
+				  struct backing_file_ctx *ctx)
+{
+	const struct cred *old_cred;
+	ssize_t ret;
+
+	if (WARN_ON_ONCE(!(out->f_mode & FMODE_BACKING)))
+		return -EIO;
+
+	ret = file_remove_privs(ctx->user_file);
+	if (ret)
+		return ret;
+
+	old_cred = override_creds(ctx->cred);
+	file_start_write(out);
+	ret = iter_file_splice_write(pipe, out, ppos, len, flags);
+	file_end_write(out);
+	revert_creds(old_cred);
+
+	if (ctx->end_write)
+		ctx->end_write(ctx->user_file);
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(backing_file_splice_write);
+
 static int __init backing_aio_init(void)
 {
 	backing_aio_cachep = kmem_cache_create("backing_aio",
diff --git a/fs/overlayfs/file.c b/fs/overlayfs/file.c
index 1b578cb27a26..69b52d2f9c74 100644
--- a/fs/overlayfs/file.c
+++ b/fs/overlayfs/file.c
@@ -9,7 +9,6 @@
 #include <linux/xattr.h>
 #include <linux/uio.h>
 #include <linux/uaccess.h>
-#include <linux/splice.h>
 #include <linux/security.h>
 #include <linux/mm.h>
 #include <linux/fs.h>
@@ -332,20 +331,21 @@ static ssize_t ovl_splice_read(struct file *in, loff_t *ppos,
 			       struct pipe_inode_info *pipe, size_t len,
 			       unsigned int flags)
 {
-	const struct cred *old_cred;
 	struct fd real;
 	ssize_t ret;
+	struct backing_file_ctx ctx = {
+		.cred = ovl_creds(file_inode(in)->i_sb),
+		.user_file = in,
+		.accessed = ovl_file_accessed,
+	};
 
 	ret = ovl_real_fdget(in, &real);
 	if (ret)
 		return ret;
 
-	old_cred = ovl_override_creds(file_inode(in)->i_sb);
-	ret = vfs_splice_read(real.file, ppos, pipe, len, flags);
-	revert_creds(old_cred);
-	ovl_file_accessed(in);
-
+	ret = backing_file_splice_read(real.file, ppos, pipe, len, flags, &ctx);
 	fdput(real);
+
 	return ret;
 }
 
@@ -361,30 +361,23 @@ static ssize_t ovl_splice_write(struct pipe_inode_info *pipe, struct file *out,
 				loff_t *ppos, size_t len, unsigned int flags)
 {
 	struct fd real;
-	const struct cred *old_cred;
 	struct inode *inode = file_inode(out);
 	ssize_t ret;
+	struct backing_file_ctx ctx = {
+		.cred = ovl_creds(inode->i_sb),
+		.user_file = out,
+		.end_write = ovl_file_modified,
+	};
 
 	inode_lock(inode);
 	/* Update mode */
 	ovl_copyattr(inode);
-	ret = file_remove_privs(out);
-	if (ret)
-		goto out_unlock;
 
 	ret = ovl_real_fdget(out, &real);
 	if (ret)
 		goto out_unlock;
 
-	old_cred = ovl_override_creds(inode->i_sb);
-	file_start_write(real.file);
-
-	ret = iter_file_splice_write(pipe, real.file, ppos, len, flags);
-
-	file_end_write(real.file);
-	/* Update size */
-	ovl_file_modified(out);
-	revert_creds(old_cred);
+	ret = backing_file_splice_write(pipe, real.file, ppos, len, flags, &ctx);
 	fdput(real);
 
 out_unlock:
diff --git a/include/linux/backing-file.h b/include/linux/backing-file.h
index 0648d548a418..0546d5b1c9f5 100644
--- a/include/linux/backing-file.h
+++ b/include/linux/backing-file.h
@@ -28,5 +28,13 @@ ssize_t backing_file_read_iter(struct file *file, struct iov_iter *iter,
 ssize_t backing_file_write_iter(struct file *file, struct iov_iter *iter,
 				struct kiocb *iocb, int flags,
 				struct backing_file_ctx *ctx);
+ssize_t backing_file_splice_read(struct file *in, loff_t *ppos,
+				 struct pipe_inode_info *pipe, size_t len,
+				 unsigned int flags,
+				 struct backing_file_ctx *ctx);
+ssize_t backing_file_splice_write(struct pipe_inode_info *pipe,
+				  struct file *out, loff_t *ppos, size_t len,
+				  unsigned int flags,
+				  struct backing_file_ctx *ctx);
 
 #endif /* _LINUX_BACKING_FILE_H */
-- 
cgit v1.2.3


From f567377e406c032fff0799bde4fdf4a977529b84 Mon Sep 17 00:00:00 2001
From: Amir Goldstein <amir73il@gmail.com>
Date: Fri, 13 Oct 2023 12:49:37 +0300
Subject: fs: factor out backing_file_mmap() helper

Assert that the file object is allocated in a backing_file container
so that file_user_path() could be used to display the user path and
not the backing file's path in /proc/<pid>/maps.

Signed-off-by: Amir Goldstein <amir73il@gmail.com>
---
 fs/backing-file.c            | 27 +++++++++++++++++++++++++++
 fs/overlayfs/file.c          | 23 ++++++-----------------
 include/linux/backing-file.h |  2 ++
 3 files changed, 35 insertions(+), 17 deletions(-)

(limited to 'include/linux')

diff --git a/fs/backing-file.c b/fs/backing-file.c
index ddd35c1d6c71..a681f38d84d8 100644
--- a/fs/backing-file.c
+++ b/fs/backing-file.c
@@ -11,6 +11,7 @@
 #include <linux/fs.h>
 #include <linux/backing-file.h>
 #include <linux/splice.h>
+#include <linux/mm.h>
 
 #include "internal.h"
 
@@ -296,6 +297,32 @@ ssize_t backing_file_splice_write(struct pipe_inode_info *pipe,
 }
 EXPORT_SYMBOL_GPL(backing_file_splice_write);
 
+int backing_file_mmap(struct file *file, struct vm_area_struct *vma,
+		      struct backing_file_ctx *ctx)
+{
+	const struct cred *old_cred;
+	int ret;
+
+	if (WARN_ON_ONCE(!(file->f_mode & FMODE_BACKING)) ||
+	    WARN_ON_ONCE(ctx->user_file != vma->vm_file))
+		return -EIO;
+
+	if (!file->f_op->mmap)
+		return -ENODEV;
+
+	vma_set_file(vma, file);
+
+	old_cred = override_creds(ctx->cred);
+	ret = call_mmap(vma->vm_file, vma);
+	revert_creds(old_cred);
+
+	if (ctx->accessed)
+		ctx->accessed(ctx->user_file);
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(backing_file_mmap);
+
 static int __init backing_aio_init(void)
 {
 	backing_aio_cachep = kmem_cache_create("backing_aio",
diff --git a/fs/overlayfs/file.c b/fs/overlayfs/file.c
index 69b52d2f9c74..05536964d37f 100644
--- a/fs/overlayfs/file.c
+++ b/fs/overlayfs/file.c
@@ -10,7 +10,6 @@
 #include <linux/uio.h>
 #include <linux/uaccess.h>
 #include <linux/security.h>
-#include <linux/mm.h>
 #include <linux/fs.h>
 #include <linux/backing-file.h>
 #include "overlayfs.h"
@@ -415,23 +414,13 @@ static int ovl_fsync(struct file *file, loff_t start, loff_t end, int datasync)
 static int ovl_mmap(struct file *file, struct vm_area_struct *vma)
 {
 	struct file *realfile = file->private_data;
-	const struct cred *old_cred;
-	int ret;
-
-	if (!realfile->f_op->mmap)
-		return -ENODEV;
-
-	if (WARN_ON(file != vma->vm_file))
-		return -EIO;
-
-	vma_set_file(vma, realfile);
-
-	old_cred = ovl_override_creds(file_inode(file)->i_sb);
-	ret = call_mmap(vma->vm_file, vma);
-	revert_creds(old_cred);
-	ovl_file_accessed(file);
+	struct backing_file_ctx ctx = {
+		.cred = ovl_creds(file_inode(file)->i_sb),
+		.user_file = file,
+		.accessed = ovl_file_accessed,
+	};
 
-	return ret;
+	return backing_file_mmap(realfile, vma, &ctx);
 }
 
 static long ovl_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
diff --git a/include/linux/backing-file.h b/include/linux/backing-file.h
index 0546d5b1c9f5..3f1fe1774f1b 100644
--- a/include/linux/backing-file.h
+++ b/include/linux/backing-file.h
@@ -36,5 +36,7 @@ ssize_t backing_file_splice_write(struct pipe_inode_info *pipe,
 				  struct file *out, loff_t *ppos, size_t len,
 				  unsigned int flags,
 				  struct backing_file_ctx *ctx);
+int backing_file_mmap(struct file *file, struct vm_area_struct *vma,
+		      struct backing_file_ctx *ctx);
 
 #endif /* _LINUX_BACKING_FILE_H */
-- 
cgit v1.2.3


From 9942cb22ea458c34fa17b73d143ea32d4df1caca Mon Sep 17 00:00:00 2001
From: Vincent Guittot <vincent.guittot@linaro.org>
Date: Mon, 11 Dec 2023 11:48:49 +0100
Subject: sched/topology: Add a new arch_scale_freq_ref() method

Create a new method to get a unique and fixed max frequency. Currently
cpuinfo.max_freq or the highest (or last) state of performance domain are
used as the max frequency when computing the frequency for a level of
utilization, but:

  - cpuinfo_max_freq can change at runtime. boost is one example of
    such change.

  - cpuinfo.max_freq and last item of the PD can be different leading to
    different results between cpufreq and energy model.

We need to save the reference frequency that has been used when computing
the CPUs capacity and use this fixed and coherent value to convert between
frequency and CPU's capacity.

In fact, we already save the frequency that has been used when computing
the capacity of each CPU. We extend the precision to save kHz instead of
MHz currently and we modify the type to be aligned with other variables
used when converting frequency to capacity and the other way.

[ mingo: Minor edits. ]

Signed-off-by: Vincent Guittot <vincent.guittot@linaro.org>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Tested-by: Lukasz Luba <lukasz.luba@arm.com>
Reviewed-by: Lukasz Luba <lukasz.luba@arm.com>
Acked-by: Sudeep Holla <sudeep.holla@arm.com>
Link: https://lore.kernel.org/r/20231211104855.558096-2-vincent.guittot@linaro.org
---
 arch/arm/include/asm/topology.h   |  1 +
 arch/arm64/include/asm/topology.h |  1 +
 arch/riscv/include/asm/topology.h |  1 +
 drivers/base/arch_topology.c      | 29 ++++++++++++++---------------
 include/linux/arch_topology.h     |  7 +++++++
 include/linux/sched/topology.h    |  8 ++++++++
 6 files changed, 32 insertions(+), 15 deletions(-)

(limited to 'include/linux')

diff --git a/arch/arm/include/asm/topology.h b/arch/arm/include/asm/topology.h
index c7d2510e5a78..853c4f81ba4a 100644
--- a/arch/arm/include/asm/topology.h
+++ b/arch/arm/include/asm/topology.h
@@ -13,6 +13,7 @@
 #define arch_set_freq_scale topology_set_freq_scale
 #define arch_scale_freq_capacity topology_get_freq_scale
 #define arch_scale_freq_invariant topology_scale_freq_invariant
+#define arch_scale_freq_ref topology_get_freq_ref
 #endif
 
 /* Replace task scheduler's default cpu-invariant accounting */
diff --git a/arch/arm64/include/asm/topology.h b/arch/arm64/include/asm/topology.h
index 9fab663dd2de..a323b109b9c4 100644
--- a/arch/arm64/include/asm/topology.h
+++ b/arch/arm64/include/asm/topology.h
@@ -23,6 +23,7 @@ void update_freq_counters_refs(void);
 #define arch_set_freq_scale topology_set_freq_scale
 #define arch_scale_freq_capacity topology_get_freq_scale
 #define arch_scale_freq_invariant topology_scale_freq_invariant
+#define arch_scale_freq_ref topology_get_freq_ref
 
 #ifdef CONFIG_ACPI_CPPC_LIB
 #define arch_init_invariance_cppc topology_init_cpu_capacity_cppc
diff --git a/arch/riscv/include/asm/topology.h b/arch/riscv/include/asm/topology.h
index e316ab3b77f3..61183688bdd5 100644
--- a/arch/riscv/include/asm/topology.h
+++ b/arch/riscv/include/asm/topology.h
@@ -9,6 +9,7 @@
 #define arch_set_freq_scale		topology_set_freq_scale
 #define arch_scale_freq_capacity	topology_get_freq_scale
 #define arch_scale_freq_invariant	topology_scale_freq_invariant
+#define arch_scale_freq_ref		topology_get_freq_ref
 
 /* Replace task scheduler's default cpu-invariant accounting */
 #define arch_scale_cpu_capacity	topology_get_cpu_scale
diff --git a/drivers/base/arch_topology.c b/drivers/base/arch_topology.c
index b741b5ba82bd..0c9ae5b157b1 100644
--- a/drivers/base/arch_topology.c
+++ b/drivers/base/arch_topology.c
@@ -19,6 +19,7 @@
 #include <linux/init.h>
 #include <linux/rcupdate.h>
 #include <linux/sched.h>
+#include <linux/units.h>
 
 #define CREATE_TRACE_POINTS
 #include <trace/events/thermal_pressure.h>
@@ -26,7 +27,8 @@
 static DEFINE_PER_CPU(struct scale_freq_data __rcu *, sft_data);
 static struct cpumask scale_freq_counters_mask;
 static bool scale_freq_invariant;
-static DEFINE_PER_CPU(u32, freq_factor) = 1;
+DEFINE_PER_CPU(unsigned long, capacity_freq_ref) = 1;
+EXPORT_PER_CPU_SYMBOL_GPL(capacity_freq_ref);
 
 static bool supports_scale_freq_counters(const struct cpumask *cpus)
 {
@@ -170,9 +172,9 @@ DEFINE_PER_CPU(unsigned long, thermal_pressure);
  * operating on stale data when hot-plug is used for some CPUs. The
  * @capped_freq reflects the currently allowed max CPUs frequency due to
  * thermal capping. It might be also a boost frequency value, which is bigger
- * than the internal 'freq_factor' max frequency. In such case the pressure
- * value should simply be removed, since this is an indication that there is
- * no thermal throttling. The @capped_freq must be provided in kHz.
+ * than the internal 'capacity_freq_ref' max frequency. In such case the
+ * pressure value should simply be removed, since this is an indication that
+ * there is no thermal throttling. The @capped_freq must be provided in kHz.
  */
 void topology_update_thermal_pressure(const struct cpumask *cpus,
 				      unsigned long capped_freq)
@@ -183,10 +185,7 @@ void topology_update_thermal_pressure(const struct cpumask *cpus,
 
 	cpu = cpumask_first(cpus);
 	max_capacity = arch_scale_cpu_capacity(cpu);
-	max_freq = per_cpu(freq_factor, cpu);
-
-	/* Convert to MHz scale which is used in 'freq_factor' */
-	capped_freq /= 1000;
+	max_freq = arch_scale_freq_ref(cpu);
 
 	/*
 	 * Handle properly the boost frequencies, which should simply clean
@@ -279,13 +278,13 @@ void topology_normalize_cpu_scale(void)
 
 	capacity_scale = 1;
 	for_each_possible_cpu(cpu) {
-		capacity = raw_capacity[cpu] * per_cpu(freq_factor, cpu);
+		capacity = raw_capacity[cpu] * per_cpu(capacity_freq_ref, cpu);
 		capacity_scale = max(capacity, capacity_scale);
 	}
 
 	pr_debug("cpu_capacity: capacity_scale=%llu\n", capacity_scale);
 	for_each_possible_cpu(cpu) {
-		capacity = raw_capacity[cpu] * per_cpu(freq_factor, cpu);
+		capacity = raw_capacity[cpu] * per_cpu(capacity_freq_ref, cpu);
 		capacity = div64_u64(capacity << SCHED_CAPACITY_SHIFT,
 			capacity_scale);
 		topology_set_cpu_scale(cpu, capacity);
@@ -321,15 +320,15 @@ bool __init topology_parse_cpu_capacity(struct device_node *cpu_node, int cpu)
 			cpu_node, raw_capacity[cpu]);
 
 		/*
-		 * Update freq_factor for calculating early boot cpu capacities.
+		 * Update capacity_freq_ref for calculating early boot CPU capacities.
 		 * For non-clk CPU DVFS mechanism, there's no way to get the
 		 * frequency value now, assuming they are running at the same
-		 * frequency (by keeping the initial freq_factor value).
+		 * frequency (by keeping the initial capacity_freq_ref value).
 		 */
 		cpu_clk = of_clk_get(cpu_node, 0);
 		if (!PTR_ERR_OR_ZERO(cpu_clk)) {
-			per_cpu(freq_factor, cpu) =
-				clk_get_rate(cpu_clk) / 1000;
+			per_cpu(capacity_freq_ref, cpu) =
+				clk_get_rate(cpu_clk) / HZ_PER_KHZ;
 			clk_put(cpu_clk);
 		}
 	} else {
@@ -411,7 +410,7 @@ init_cpu_capacity_callback(struct notifier_block *nb,
 	cpumask_andnot(cpus_to_visit, cpus_to_visit, policy->related_cpus);
 
 	for_each_cpu(cpu, policy->related_cpus)
-		per_cpu(freq_factor, cpu) = policy->cpuinfo.max_freq / 1000;
+		per_cpu(capacity_freq_ref, cpu) = policy->cpuinfo.max_freq;
 
 	if (cpumask_empty(cpus_to_visit)) {
 		topology_normalize_cpu_scale();
diff --git a/include/linux/arch_topology.h b/include/linux/arch_topology.h
index a07b510e7dc5..32c24ff4f2a8 100644
--- a/include/linux/arch_topology.h
+++ b/include/linux/arch_topology.h
@@ -27,6 +27,13 @@ static inline unsigned long topology_get_cpu_scale(int cpu)
 
 void topology_set_cpu_scale(unsigned int cpu, unsigned long capacity);
 
+DECLARE_PER_CPU(unsigned long, capacity_freq_ref);
+
+static inline unsigned long topology_get_freq_ref(int cpu)
+{
+	return per_cpu(capacity_freq_ref, cpu);
+}
+
 DECLARE_PER_CPU(unsigned long, arch_freq_scale);
 
 static inline unsigned long topology_get_freq_scale(int cpu)
diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h
index de545ba85218..a6e04b4a21d7 100644
--- a/include/linux/sched/topology.h
+++ b/include/linux/sched/topology.h
@@ -279,6 +279,14 @@ void arch_update_thermal_pressure(const struct cpumask *cpus,
 { }
 #endif
 
+#ifndef arch_scale_freq_ref
+static __always_inline
+unsigned int arch_scale_freq_ref(int cpu)
+{
+	return 0;
+}
+#endif
+
 static inline int task_node(const struct task_struct *p)
 {
 	return cpu_to_node(task_cpu(p));
-- 
cgit v1.2.3


From 599457ba15403037b489fe536266a3d5f9efaed7 Mon Sep 17 00:00:00 2001
From: Vincent Guittot <vincent.guittot@linaro.org>
Date: Mon, 11 Dec 2023 11:48:50 +0100
Subject: cpufreq: Use the fixed and coherent frequency for scaling capacity

cpuinfo.max_freq can change at runtime because of boost as an example. This
implies that the value could be different from the frequency that has been
used to compute the capacity of a CPU.

The new arch_scale_freq_ref() returns a fixed and coherent frequency
that can be used to compute the capacity for a given frequency.

[ Also fix a arch_set_freq_scale()  newline style wart in <linux/cpufreq.h>. ]

Signed-off-by: Vincent Guittot <vincent.guittot@linaro.org>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Tested-by: Lukasz Luba <lukasz.luba@arm.com>
Reviewed-by: Lukasz Luba <lukasz.luba@arm.com>
Acked-by: Viresh Kumar <viresh.kumar@linaro.org>
Acked-by: Rafael J. Wysocki <rafael@kernel.org>
Link: https://lore.kernel.org/r/20231211104855.558096-3-vincent.guittot@linaro.org
---
 drivers/cpufreq/cpufreq.c | 4 ++--
 include/linux/cpufreq.h   | 1 +
 2 files changed, 3 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c
index 934d35f570b7..44db4f59c4cc 100644
--- a/drivers/cpufreq/cpufreq.c
+++ b/drivers/cpufreq/cpufreq.c
@@ -454,7 +454,7 @@ void cpufreq_freq_transition_end(struct cpufreq_policy *policy,
 
 	arch_set_freq_scale(policy->related_cpus,
 			    policy->cur,
-			    policy->cpuinfo.max_freq);
+			    arch_scale_freq_ref(policy->cpu));
 
 	spin_lock(&policy->transition_lock);
 	policy->transition_ongoing = false;
@@ -2174,7 +2174,7 @@ unsigned int cpufreq_driver_fast_switch(struct cpufreq_policy *policy,
 
 	policy->cur = freq;
 	arch_set_freq_scale(policy->related_cpus, freq,
-			    policy->cpuinfo.max_freq);
+			    arch_scale_freq_ref(policy->cpu));
 	cpufreq_stats_record_transition(policy, freq);
 
 	if (trace_cpu_frequency_enabled()) {
diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h
index 1c5ca92a0555..afda5f24d3dd 100644
--- a/include/linux/cpufreq.h
+++ b/include/linux/cpufreq.h
@@ -1203,6 +1203,7 @@ void arch_set_freq_scale(const struct cpumask *cpus,
 {
 }
 #endif
+
 /* the following are really really optional */
 extern struct freq_attr cpufreq_freq_attr_scaling_available_freqs;
 extern struct freq_attr cpufreq_freq_attr_scaling_boost_freqs;
-- 
cgit v1.2.3


From 15cbbd1d317e07b4e5c6aca5d4c5579539a82784 Mon Sep 17 00:00:00 2001
From: Vincent Guittot <vincent.guittot@linaro.org>
Date: Mon, 11 Dec 2023 11:48:52 +0100
Subject: energy_model: Use a fixed reference frequency

The last item of a performance domain is not always the performance point
that has been used to compute CPU's capacity. This can lead to different
target frequency compared with other part of the system like schedutil and
would result in wrong energy estimation.

A new arch_scale_freq_ref() is available to return a fixed and coherent
frequency reference that can be used when computing the CPU's frequency
for an level of utilization. Use this function to get this reference
frequency.

Energy model is never used without defining arch_scale_freq_ref() but
can be compiled. Define a default arch_scale_freq_ref() returning 0
in such case.

Signed-off-by: Vincent Guittot <vincent.guittot@linaro.org>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Tested-by: Lukasz Luba <lukasz.luba@arm.com>
Reviewed-by: Lukasz Luba <lukasz.luba@arm.com>
Link: https://lore.kernel.org/r/20231211104855.558096-5-vincent.guittot@linaro.org
---
 include/linux/energy_model.h | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/energy_model.h b/include/linux/energy_model.h
index adec808b371a..88d91e087471 100644
--- a/include/linux/energy_model.h
+++ b/include/linux/energy_model.h
@@ -224,7 +224,7 @@ static inline unsigned long em_cpu_energy(struct em_perf_domain *pd,
 				unsigned long max_util, unsigned long sum_util,
 				unsigned long allowed_cpu_cap)
 {
-	unsigned long freq, scale_cpu;
+	unsigned long freq, ref_freq, scale_cpu;
 	struct em_perf_state *ps;
 	int cpu;
 
@@ -241,10 +241,10 @@ static inline unsigned long em_cpu_energy(struct em_perf_domain *pd,
 	 */
 	cpu = cpumask_first(to_cpumask(pd->cpus));
 	scale_cpu = arch_scale_cpu_capacity(cpu);
-	ps = &pd->table[pd->nr_perf_states - 1];
+	ref_freq = arch_scale_freq_ref(cpu);
 
 	max_util = min(max_util, allowed_cpu_cap);
-	freq = map_util_freq(max_util, ps->frequency, scale_cpu);
+	freq = map_util_freq(max_util, ref_freq, scale_cpu);
 
 	/*
 	 * Find the lowest performance state of the Energy Model above the
-- 
cgit v1.2.3


From 1f023007f5e782bda19ad9104830c404fd622c5d Mon Sep 17 00:00:00 2001
From: Vincent Guittot <vincent.guittot@linaro.org>
Date: Mon, 11 Dec 2023 11:48:55 +0100
Subject: arm64/amu: Use capacity_ref_freq() to set AMU ratio

Use the new capacity_ref_freq() method to set the ratio that is used by AMU for
computing the arch_scale_freq_capacity().
This helps to keep everything aligned using the same reference for
computing CPUs capacity.

The default value of the ratio (stored in per_cpu(arch_max_freq_scale))
ensures that arch_scale_freq_capacity() returns max capacity until it is
set to its correct value with the cpu capacity and capacity_ref_freq().

Signed-off-by: Vincent Guittot <vincent.guittot@linaro.org>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Acked-by: Sudeep Holla <sudeep.holla@arm.com>
Acked-by: Will Deacon <will@kernel.org>
Link: https://lore.kernel.org/r/20231211104855.558096-8-vincent.guittot@linaro.org
---
 arch/arm64/kernel/topology.c  | 26 +++++++++++++-------------
 drivers/base/arch_topology.c  | 12 +++++++++++-
 include/linux/arch_topology.h |  1 +
 3 files changed, 25 insertions(+), 14 deletions(-)

(limited to 'include/linux')

diff --git a/arch/arm64/kernel/topology.c b/arch/arm64/kernel/topology.c
index 817d788cd866..1a2c72f3e7f8 100644
--- a/arch/arm64/kernel/topology.c
+++ b/arch/arm64/kernel/topology.c
@@ -82,7 +82,12 @@ int __init parse_acpi_topology(void)
 #undef pr_fmt
 #define pr_fmt(fmt) "AMU: " fmt
 
-static DEFINE_PER_CPU_READ_MOSTLY(unsigned long, arch_max_freq_scale);
+/*
+ * Ensure that amu_scale_freq_tick() will return SCHED_CAPACITY_SCALE until
+ * the CPU capacity and its associated frequency have been correctly
+ * initialized.
+ */
+static DEFINE_PER_CPU_READ_MOSTLY(unsigned long, arch_max_freq_scale) =  1UL << (2 * SCHED_CAPACITY_SHIFT);
 static DEFINE_PER_CPU(u64, arch_const_cycles_prev);
 static DEFINE_PER_CPU(u64, arch_core_cycles_prev);
 static cpumask_var_t amu_fie_cpus;
@@ -112,14 +117,14 @@ static inline bool freq_counters_valid(int cpu)
 	return true;
 }
 
-static int freq_inv_set_max_ratio(int cpu, u64 max_rate, u64 ref_rate)
+void freq_inv_set_max_ratio(int cpu, u64 max_rate)
 {
-	u64 ratio;
+	u64 ratio, ref_rate = arch_timer_get_rate();
 
 	if (unlikely(!max_rate || !ref_rate)) {
-		pr_debug("CPU%d: invalid maximum or reference frequency.\n",
+		WARN_ONCE(1, "CPU%d: invalid maximum or reference frequency.\n",
 			 cpu);
-		return -EINVAL;
+		return;
 	}
 
 	/*
@@ -139,12 +144,10 @@ static int freq_inv_set_max_ratio(int cpu, u64 max_rate, u64 ref_rate)
 	ratio = div64_u64(ratio, max_rate);
 	if (!ratio) {
 		WARN_ONCE(1, "Reference frequency too low.\n");
-		return -EINVAL;
+		return;
 	}
 
-	per_cpu(arch_max_freq_scale, cpu) = (unsigned long)ratio;
-
-	return 0;
+	WRITE_ONCE(per_cpu(arch_max_freq_scale, cpu), (unsigned long)ratio);
 }
 
 static void amu_scale_freq_tick(void)
@@ -195,10 +198,7 @@ static void amu_fie_setup(const struct cpumask *cpus)
 		return;
 
 	for_each_cpu(cpu, cpus) {
-		if (!freq_counters_valid(cpu) ||
-		    freq_inv_set_max_ratio(cpu,
-					   cpufreq_get_hw_max_freq(cpu) * 1000ULL,
-					   arch_timer_get_rate()))
+		if (!freq_counters_valid(cpu))
 			return;
 	}
 
diff --git a/drivers/base/arch_topology.c b/drivers/base/arch_topology.c
index 1aa76b5c96c2..5aaa0865625d 100644
--- a/drivers/base/arch_topology.c
+++ b/drivers/base/arch_topology.c
@@ -344,6 +344,10 @@ bool __init topology_parse_cpu_capacity(struct device_node *cpu_node, int cpu)
 	return !ret;
 }
 
+void __weak freq_inv_set_max_ratio(int cpu, u64 max_rate)
+{
+}
+
 #ifdef CONFIG_ACPI_CPPC_LIB
 #include <acpi/cppc_acpi.h>
 
@@ -381,6 +385,9 @@ void topology_init_cpu_capacity_cppc(void)
 	}
 
 	for_each_possible_cpu(cpu) {
+		freq_inv_set_max_ratio(cpu,
+				       per_cpu(capacity_freq_ref, cpu) * HZ_PER_KHZ);
+
 		capacity = raw_capacity[cpu];
 		capacity = div64_u64(capacity << SCHED_CAPACITY_SHIFT,
 				     capacity_scale);
@@ -422,8 +429,11 @@ init_cpu_capacity_callback(struct notifier_block *nb,
 
 	cpumask_andnot(cpus_to_visit, cpus_to_visit, policy->related_cpus);
 
-	for_each_cpu(cpu, policy->related_cpus)
+	for_each_cpu(cpu, policy->related_cpus) {
 		per_cpu(capacity_freq_ref, cpu) = policy->cpuinfo.max_freq;
+		freq_inv_set_max_ratio(cpu,
+				       per_cpu(capacity_freq_ref, cpu) * HZ_PER_KHZ);
+	}
 
 	if (cpumask_empty(cpus_to_visit)) {
 		topology_normalize_cpu_scale();
diff --git a/include/linux/arch_topology.h b/include/linux/arch_topology.h
index 32c24ff4f2a8..a63d61ca55af 100644
--- a/include/linux/arch_topology.h
+++ b/include/linux/arch_topology.h
@@ -99,6 +99,7 @@ void update_siblings_masks(unsigned int cpu);
 void remove_cpu_topology(unsigned int cpuid);
 void reset_cpu_topology(void);
 int parse_acpi_topology(void);
+void freq_inv_set_max_ratio(int cpu, u64 max_rate);
 #endif
 
 #endif /* _LINUX_ARCH_TOPOLOGY_H_ */
-- 
cgit v1.2.3


From 11137d384996bb05cf33c8163db271e1bac3f4bf Mon Sep 17 00:00:00 2001
From: Vincent Guittot <vincent.guittot@linaro.org>
Date: Fri, 1 Dec 2023 17:16:52 +0100
Subject: sched/fair: Simplify util_est

With UTIL_EST_FASTUP now being permanent, we can take advantage of the
fact that the ewma jumps directly to a higher utilization at dequeue to
simplify util_est and remove the enqueued field.

Signed-off-by: Vincent Guittot <vincent.guittot@linaro.org>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Tested-by: Lukasz Luba <lukasz.luba@arm.com>
Reviewed-by: Lukasz Luba <lukasz.luba@arm.com>
Reviewed-by: Dietmar Eggemann <dietmar.eggemann@arm.com>
Reviewed-by: Hongyan Xia <hongyan.xia2@arm.com>
Reviewed-by: Alex Shi <alexs@kernel.org>
Link: https://lore.kernel.org/r/20231201161652.1241695-3-vincent.guittot@linaro.org
---
 include/linux/sched.h | 49 ++++++++----------------------
 kernel/sched/debug.c  |  7 ++---
 kernel/sched/fair.c   | 82 +++++++++++++++++++--------------------------------
 kernel/sched/pelt.h   |  4 +--
 4 files changed, 48 insertions(+), 94 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 8d258162deb0..03bfe9ab2951 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -415,42 +415,6 @@ struct load_weight {
 	u32				inv_weight;
 };
 
-/**
- * struct util_est - Estimation utilization of FAIR tasks
- * @enqueued: instantaneous estimated utilization of a task/cpu
- * @ewma:     the Exponential Weighted Moving Average (EWMA)
- *            utilization of a task
- *
- * Support data structure to track an Exponential Weighted Moving Average
- * (EWMA) of a FAIR task's utilization. New samples are added to the moving
- * average each time a task completes an activation. Sample's weight is chosen
- * so that the EWMA will be relatively insensitive to transient changes to the
- * task's workload.
- *
- * The enqueued attribute has a slightly different meaning for tasks and cpus:
- * - task:   the task's util_avg at last task dequeue time
- * - cfs_rq: the sum of util_est.enqueued for each RUNNABLE task on that CPU
- * Thus, the util_est.enqueued of a task represents the contribution on the
- * estimated utilization of the CPU where that task is currently enqueued.
- *
- * Only for tasks we track a moving average of the past instantaneous
- * estimated utilization. This allows to absorb sporadic drops in utilization
- * of an otherwise almost periodic task.
- *
- * The UTIL_AVG_UNCHANGED flag is used to synchronize util_est with util_avg
- * updates. When a task is dequeued, its util_est should not be updated if its
- * util_avg has not been updated in the meantime.
- * This information is mapped into the MSB bit of util_est.enqueued at dequeue
- * time. Since max value of util_est.enqueued for a task is 1024 (PELT util_avg
- * for a task) it is safe to use MSB.
- */
-struct util_est {
-	unsigned int			enqueued;
-	unsigned int			ewma;
-#define UTIL_EST_WEIGHT_SHIFT		2
-#define UTIL_AVG_UNCHANGED		0x80000000
-} __attribute__((__aligned__(sizeof(u64))));
-
 /*
  * The load/runnable/util_avg accumulates an infinite geometric series
  * (see __update_load_avg_cfs_rq() in kernel/sched/pelt.c).
@@ -505,9 +469,20 @@ struct sched_avg {
 	unsigned long			load_avg;
 	unsigned long			runnable_avg;
 	unsigned long			util_avg;
-	struct util_est			util_est;
+	unsigned int			util_est;
 } ____cacheline_aligned;
 
+/*
+ * The UTIL_AVG_UNCHANGED flag is used to synchronize util_est with util_avg
+ * updates. When a task is dequeued, its util_est should not be updated if its
+ * util_avg has not been updated in the meantime.
+ * This information is mapped into the MSB bit of util_est at dequeue time.
+ * Since max value of util_est for a task is 1024 (PELT util_avg for a task)
+ * it is safe to use MSB.
+ */
+#define UTIL_EST_WEIGHT_SHIFT		2
+#define UTIL_AVG_UNCHANGED		0x80000000
+
 struct sched_statistics {
 #ifdef CONFIG_SCHEDSTATS
 	u64				wait_start;
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index 168eecc209b4..8d5d98a5834d 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -684,8 +684,8 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
 			cfs_rq->avg.runnable_avg);
 	SEQ_printf(m, "  .%-30s: %lu\n", "util_avg",
 			cfs_rq->avg.util_avg);
-	SEQ_printf(m, "  .%-30s: %u\n", "util_est_enqueued",
-			cfs_rq->avg.util_est.enqueued);
+	SEQ_printf(m, "  .%-30s: %u\n", "util_est",
+			cfs_rq->avg.util_est);
 	SEQ_printf(m, "  .%-30s: %ld\n", "removed.load_avg",
 			cfs_rq->removed.load_avg);
 	SEQ_printf(m, "  .%-30s: %ld\n", "removed.util_avg",
@@ -1075,8 +1075,7 @@ void proc_sched_show_task(struct task_struct *p, struct pid_namespace *ns,
 	P(se.avg.runnable_avg);
 	P(se.avg.util_avg);
 	P(se.avg.last_update_time);
-	P(se.avg.util_est.ewma);
-	PM(se.avg.util_est.enqueued, ~UTIL_AVG_UNCHANGED);
+	PM(se.avg.util_est, ~UTIL_AVG_UNCHANGED);
 #endif
 #ifdef CONFIG_UCLAMP_TASK
 	__PS("uclamp.min", p->uclamp_req[UCLAMP_MIN].value);
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index e94d65da8d66..823dd76d0546 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -4781,9 +4781,7 @@ static inline unsigned long task_runnable(struct task_struct *p)
 
 static inline unsigned long _task_util_est(struct task_struct *p)
 {
-	struct util_est ue = READ_ONCE(p->se.avg.util_est);
-
-	return max(ue.ewma, (ue.enqueued & ~UTIL_AVG_UNCHANGED));
+	return READ_ONCE(p->se.avg.util_est) & ~UTIL_AVG_UNCHANGED;
 }
 
 static inline unsigned long task_util_est(struct task_struct *p)
@@ -4800,9 +4798,9 @@ static inline void util_est_enqueue(struct cfs_rq *cfs_rq,
 		return;
 
 	/* Update root cfs_rq's estimated utilization */
-	enqueued  = cfs_rq->avg.util_est.enqueued;
+	enqueued  = cfs_rq->avg.util_est;
 	enqueued += _task_util_est(p);
-	WRITE_ONCE(cfs_rq->avg.util_est.enqueued, enqueued);
+	WRITE_ONCE(cfs_rq->avg.util_est, enqueued);
 
 	trace_sched_util_est_cfs_tp(cfs_rq);
 }
@@ -4816,34 +4814,20 @@ static inline void util_est_dequeue(struct cfs_rq *cfs_rq,
 		return;
 
 	/* Update root cfs_rq's estimated utilization */
-	enqueued  = cfs_rq->avg.util_est.enqueued;
+	enqueued  = cfs_rq->avg.util_est;
 	enqueued -= min_t(unsigned int, enqueued, _task_util_est(p));
-	WRITE_ONCE(cfs_rq->avg.util_est.enqueued, enqueued);
+	WRITE_ONCE(cfs_rq->avg.util_est, enqueued);
 
 	trace_sched_util_est_cfs_tp(cfs_rq);
 }
 
 #define UTIL_EST_MARGIN (SCHED_CAPACITY_SCALE / 100)
 
-/*
- * Check if a (signed) value is within a specified (unsigned) margin,
- * based on the observation that:
- *
- *     abs(x) < y := (unsigned)(x + y - 1) < (2 * y - 1)
- *
- * NOTE: this only works when value + margin < INT_MAX.
- */
-static inline bool within_margin(int value, int margin)
-{
-	return ((unsigned int)(value + margin - 1) < (2 * margin - 1));
-}
-
 static inline void util_est_update(struct cfs_rq *cfs_rq,
 				   struct task_struct *p,
 				   bool task_sleep)
 {
-	long last_ewma_diff, last_enqueued_diff;
-	struct util_est ue;
+	unsigned int ewma, dequeued, last_ewma_diff;
 
 	if (!sched_feat(UTIL_EST))
 		return;
@@ -4855,23 +4839,25 @@ static inline void util_est_update(struct cfs_rq *cfs_rq,
 	if (!task_sleep)
 		return;
 
+	/* Get current estimate of utilization */
+	ewma = READ_ONCE(p->se.avg.util_est);
+
 	/*
 	 * If the PELT values haven't changed since enqueue time,
 	 * skip the util_est update.
 	 */
-	ue = p->se.avg.util_est;
-	if (ue.enqueued & UTIL_AVG_UNCHANGED)
+	if (ewma & UTIL_AVG_UNCHANGED)
 		return;
 
-	last_enqueued_diff = ue.enqueued;
+	/* Get utilization at dequeue */
+	dequeued = task_util(p);
 
 	/*
 	 * Reset EWMA on utilization increases, the moving average is used only
 	 * to smooth utilization decreases.
 	 */
-	ue.enqueued = task_util(p);
-	if (ue.ewma < ue.enqueued) {
-		ue.ewma = ue.enqueued;
+	if (ewma <= dequeued) {
+		ewma = dequeued;
 		goto done;
 	}
 
@@ -4879,27 +4865,22 @@ static inline void util_est_update(struct cfs_rq *cfs_rq,
 	 * Skip update of task's estimated utilization when its members are
 	 * already ~1% close to its last activation value.
 	 */
-	last_ewma_diff = ue.enqueued - ue.ewma;
-	last_enqueued_diff -= ue.enqueued;
-	if (within_margin(last_ewma_diff, UTIL_EST_MARGIN)) {
-		if (!within_margin(last_enqueued_diff, UTIL_EST_MARGIN))
-			goto done;
-
-		return;
-	}
+	last_ewma_diff = ewma - dequeued;
+	if (last_ewma_diff < UTIL_EST_MARGIN)
+		goto done;
 
 	/*
 	 * To avoid overestimation of actual task utilization, skip updates if
 	 * we cannot grant there is idle time in this CPU.
 	 */
-	if (task_util(p) > arch_scale_cpu_capacity(cpu_of(rq_of(cfs_rq))))
+	if (dequeued > arch_scale_cpu_capacity(cpu_of(rq_of(cfs_rq))))
 		return;
 
 	/*
 	 * To avoid underestimate of task utilization, skip updates of EWMA if
 	 * we cannot grant that thread got all CPU time it wanted.
 	 */
-	if ((ue.enqueued + UTIL_EST_MARGIN) < task_runnable(p))
+	if ((dequeued + UTIL_EST_MARGIN) < task_runnable(p))
 		goto done;
 
 
@@ -4907,25 +4888,24 @@ static inline void util_est_update(struct cfs_rq *cfs_rq,
 	 * Update Task's estimated utilization
 	 *
 	 * When *p completes an activation we can consolidate another sample
-	 * of the task size. This is done by storing the current PELT value
-	 * as ue.enqueued and by using this value to update the Exponential
-	 * Weighted Moving Average (EWMA):
+	 * of the task size. This is done by using this value to update the
+	 * Exponential Weighted Moving Average (EWMA):
 	 *
 	 *  ewma(t) = w *  task_util(p) + (1-w) * ewma(t-1)
 	 *          = w *  task_util(p) +         ewma(t-1)  - w * ewma(t-1)
 	 *          = w * (task_util(p) -         ewma(t-1)) +     ewma(t-1)
-	 *          = w * (      last_ewma_diff            ) +     ewma(t-1)
-	 *          = w * (last_ewma_diff  +  ewma(t-1) / w)
+	 *          = w * (      -last_ewma_diff           ) +     ewma(t-1)
+	 *          = w * (-last_ewma_diff +  ewma(t-1) / w)
 	 *
 	 * Where 'w' is the weight of new samples, which is configured to be
 	 * 0.25, thus making w=1/4 ( >>= UTIL_EST_WEIGHT_SHIFT)
 	 */
-	ue.ewma <<= UTIL_EST_WEIGHT_SHIFT;
-	ue.ewma  += last_ewma_diff;
-	ue.ewma >>= UTIL_EST_WEIGHT_SHIFT;
+	ewma <<= UTIL_EST_WEIGHT_SHIFT;
+	ewma  -= last_ewma_diff;
+	ewma >>= UTIL_EST_WEIGHT_SHIFT;
 done:
-	ue.enqueued |= UTIL_AVG_UNCHANGED;
-	WRITE_ONCE(p->se.avg.util_est, ue);
+	ewma |= UTIL_AVG_UNCHANGED;
+	WRITE_ONCE(p->se.avg.util_est, ewma);
 
 	trace_sched_util_est_se_tp(&p->se);
 }
@@ -7653,16 +7633,16 @@ cpu_util(int cpu, struct task_struct *p, int dst_cpu, int boost)
 	if (sched_feat(UTIL_EST)) {
 		unsigned long util_est;
 
-		util_est = READ_ONCE(cfs_rq->avg.util_est.enqueued);
+		util_est = READ_ONCE(cfs_rq->avg.util_est);
 
 		/*
 		 * During wake-up @p isn't enqueued yet and doesn't contribute
-		 * to any cpu_rq(cpu)->cfs.avg.util_est.enqueued.
+		 * to any cpu_rq(cpu)->cfs.avg.util_est.
 		 * If @dst_cpu == @cpu add it to "simulate" cpu_util after @p
 		 * has been enqueued.
 		 *
 		 * During exec (@dst_cpu = -1) @p is enqueued and does
-		 * contribute to cpu_rq(cpu)->cfs.util_est.enqueued.
+		 * contribute to cpu_rq(cpu)->cfs.util_est.
 		 * Remove it to "simulate" cpu_util without @p's contribution.
 		 *
 		 * Despite the task_on_rq_queued(@p) check there is still a
diff --git a/kernel/sched/pelt.h b/kernel/sched/pelt.h
index 3a0e0dc28721..9e1083465fbc 100644
--- a/kernel/sched/pelt.h
+++ b/kernel/sched/pelt.h
@@ -52,13 +52,13 @@ static inline void cfs_se_util_change(struct sched_avg *avg)
 		return;
 
 	/* Avoid store if the flag has been already reset */
-	enqueued = avg->util_est.enqueued;
+	enqueued = avg->util_est;
 	if (!(enqueued & UTIL_AVG_UNCHANGED))
 		return;
 
 	/* Reset flag to report util_avg has been updated */
 	enqueued &= ~UTIL_AVG_UNCHANGED;
-	WRITE_ONCE(avg->util_est.enqueued, enqueued);
+	WRITE_ONCE(avg->util_est, enqueued);
 }
 
 static inline u64 rq_clock_pelt(struct rq *rq)
-- 
cgit v1.2.3


From 4498a8eccc97de3d65f876b6fdeddb439ef73abc Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Mon, 20 Nov 2023 17:09:47 +0000
Subject: netfs, fscache: Remove ->begin_cache_operation

Remove ->begin_cache_operation() in favour of just calling fscache directly.

Signed-off-by: David Howells <dhowells@redhat.com>
Reviewed-by: Jeff Layton <jlayton@kernel.org>
cc: Christian Brauner <christian@brauner.io>
cc: linux-fsdevel@vger.kernel.org
cc: linux-cachefs@redhat.com
---
 Documentation/filesystems/netfs_library.rst | 23 ++++----------------
 fs/9p/vfs_addr.c                            | 16 --------------
 fs/afs/file.c                               | 13 ------------
 fs/ceph/addr.c                              |  1 -
 fs/ceph/cache.h                             | 12 -----------
 fs/netfs/buffered_read.c                    | 33 ++++++++++++++++-------------
 fs/nfs/fscache.c                            |  7 ------
 include/linux/fscache.h                     |  3 ---
 include/linux/netfs.h                       |  4 +---
 9 files changed, 23 insertions(+), 89 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/filesystems/netfs_library.rst b/Documentation/filesystems/netfs_library.rst
index 48b95d04f72d..4cc657d743f7 100644
--- a/Documentation/filesystems/netfs_library.rst
+++ b/Documentation/filesystems/netfs_library.rst
@@ -295,7 +295,6 @@ through which it can issue requests and negotiate::
 	struct netfs_request_ops {
 		void (*init_request)(struct netfs_io_request *rreq, struct file *file);
 		void (*free_request)(struct netfs_io_request *rreq);
-		int (*begin_cache_operation)(struct netfs_io_request *rreq);
 		void (*expand_readahead)(struct netfs_io_request *rreq);
 		bool (*clamp_length)(struct netfs_io_subrequest *subreq);
 		void (*issue_read)(struct netfs_io_subrequest *subreq);
@@ -317,20 +316,6 @@ The operations are as follows:
    [Optional] This is called as the request is being deallocated so that the
    filesystem can clean up any state it has attached there.
 
- * ``begin_cache_operation()``
-
-   [Optional] This is called to ask the network filesystem to call into the
-   cache (if present) to initialise the caching state for this read.  The netfs
-   library module cannot access the cache directly, so the cache should call
-   something like fscache_begin_read_operation() to do this.
-
-   The cache gets to store its state in ->cache_resources and must set a table
-   of operations of its own there (though of a different type).
-
-   This should return 0 on success and an error code otherwise.  If an error is
-   reported, the operation may proceed anyway, just without local caching (only
-   out of memory and interruption errors cause failure here).
-
  * ``expand_readahead()``
 
    [Optional] This is called to allow the filesystem to expand the size of a
@@ -460,14 +445,14 @@ When implementing a local cache to be used by the read helpers, two things are
 required: some way for the network filesystem to initialise the caching for a
 read request and a table of operations for the helpers to call.
 
-The network filesystem's ->begin_cache_operation() method is called to set up a
-cache and this must call into the cache to do the work.  If using fscache, for
-example, the cache would call::
+To begin a cache operation on an fscache object, the following function is
+called::
 
 	int fscache_begin_read_operation(struct netfs_io_request *rreq,
 					 struct fscache_cookie *cookie);
 
-passing in the request pointer and the cookie corresponding to the file.
+passing in the request pointer and the cookie corresponding to the file.  This
+fills in the cache resources mentioned below.
 
 The netfs_io_request object contains a place for the cache to hang its
 state::
diff --git a/fs/9p/vfs_addr.c b/fs/9p/vfs_addr.c
index 8a635999a7d6..39db7c01e30a 100644
--- a/fs/9p/vfs_addr.c
+++ b/fs/9p/vfs_addr.c
@@ -82,25 +82,9 @@ static void v9fs_free_request(struct netfs_io_request *rreq)
 	p9_fid_put(fid);
 }
 
-/**
- * v9fs_begin_cache_operation - Begin a cache operation for a read
- * @rreq: The read request
- */
-static int v9fs_begin_cache_operation(struct netfs_io_request *rreq)
-{
-#ifdef CONFIG_9P_FSCACHE
-	struct fscache_cookie *cookie = v9fs_inode_cookie(V9FS_I(rreq->inode));
-
-	return fscache_begin_read_operation(&rreq->cache_resources, cookie);
-#else
-	return -ENOBUFS;
-#endif
-}
-
 const struct netfs_request_ops v9fs_req_ops = {
 	.init_request		= v9fs_init_request,
 	.free_request		= v9fs_free_request,
-	.begin_cache_operation	= v9fs_begin_cache_operation,
 	.issue_read		= v9fs_issue_read,
 };
 
diff --git a/fs/afs/file.c b/fs/afs/file.c
index d37dd201752b..8c17e37c2e59 100644
--- a/fs/afs/file.c
+++ b/fs/afs/file.c
@@ -366,18 +366,6 @@ static int afs_init_request(struct netfs_io_request *rreq, struct file *file)
 	return 0;
 }
 
-static int afs_begin_cache_operation(struct netfs_io_request *rreq)
-{
-#ifdef CONFIG_AFS_FSCACHE
-	struct afs_vnode *vnode = AFS_FS_I(rreq->inode);
-
-	return fscache_begin_read_operation(&rreq->cache_resources,
-					    afs_vnode_cache(vnode));
-#else
-	return -ENOBUFS;
-#endif
-}
-
 static int afs_check_write_begin(struct file *file, loff_t pos, unsigned len,
 				 struct folio **foliop, void **_fsdata)
 {
@@ -394,7 +382,6 @@ static void afs_free_request(struct netfs_io_request *rreq)
 const struct netfs_request_ops afs_req_ops = {
 	.init_request		= afs_init_request,
 	.free_request		= afs_free_request,
-	.begin_cache_operation	= afs_begin_cache_operation,
 	.check_write_begin	= afs_check_write_begin,
 	.issue_read		= afs_issue_read,
 };
diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c
index 85be3bf18cdf..3b8641febeac 100644
--- a/fs/ceph/addr.c
+++ b/fs/ceph/addr.c
@@ -509,7 +509,6 @@ static void ceph_netfs_free_request(struct netfs_io_request *rreq)
 const struct netfs_request_ops ceph_netfs_ops = {
 	.init_request		= ceph_init_request,
 	.free_request		= ceph_netfs_free_request,
-	.begin_cache_operation	= ceph_begin_cache_operation,
 	.issue_read		= ceph_netfs_issue_read,
 	.expand_readahead	= ceph_netfs_expand_readahead,
 	.clamp_length		= ceph_netfs_clamp_length,
diff --git a/fs/ceph/cache.h b/fs/ceph/cache.h
index dc502daac49a..b804f1094764 100644
--- a/fs/ceph/cache.h
+++ b/fs/ceph/cache.h
@@ -57,13 +57,6 @@ static inline int ceph_fscache_dirty_folio(struct address_space *mapping,
 	return fscache_dirty_folio(mapping, folio, ceph_fscache_cookie(ci));
 }
 
-static inline int ceph_begin_cache_operation(struct netfs_io_request *rreq)
-{
-	struct fscache_cookie *cookie = ceph_fscache_cookie(ceph_inode(rreq->inode));
-
-	return fscache_begin_read_operation(&rreq->cache_resources, cookie);
-}
-
 static inline bool ceph_is_cache_enabled(struct inode *inode)
 {
 	return fscache_cookie_enabled(ceph_fscache_cookie(ceph_inode(inode)));
@@ -135,11 +128,6 @@ static inline bool ceph_is_cache_enabled(struct inode *inode)
 	return false;
 }
 
-static inline int ceph_begin_cache_operation(struct netfs_io_request *rreq)
-{
-	return -ENOBUFS;
-}
-
 static inline void ceph_fscache_note_page_release(struct inode *inode)
 {
 }
diff --git a/fs/netfs/buffered_read.c b/fs/netfs/buffered_read.c
index 2cd3ccf4c439..d39d0ffe75d2 100644
--- a/fs/netfs/buffered_read.c
+++ b/fs/netfs/buffered_read.c
@@ -147,6 +147,15 @@ static void netfs_rreq_expand(struct netfs_io_request *rreq,
 	}
 }
 
+/*
+ * Begin an operation, and fetch the stored zero point value from the cookie if
+ * available.
+ */
+static int netfs_begin_cache_read(struct netfs_io_request *rreq, struct netfs_inode *ctx)
+{
+	return fscache_begin_read_operation(&rreq->cache_resources, netfs_i_cookie(ctx));
+}
+
 /**
  * netfs_readahead - Helper to manage a read request
  * @ractl: The description of the readahead request
@@ -180,11 +189,9 @@ void netfs_readahead(struct readahead_control *ractl)
 	if (IS_ERR(rreq))
 		return;
 
-	if (ctx->ops->begin_cache_operation) {
-		ret = ctx->ops->begin_cache_operation(rreq);
-		if (ret == -ENOMEM || ret == -EINTR || ret == -ERESTARTSYS)
-			goto cleanup_free;
-	}
+	ret = netfs_begin_cache_read(rreq, ctx);
+	if (ret == -ENOMEM || ret == -EINTR || ret == -ERESTARTSYS)
+		goto cleanup_free;
 
 	netfs_stat(&netfs_n_rh_readahead);
 	trace_netfs_read(rreq, readahead_pos(ractl), readahead_length(ractl),
@@ -238,11 +245,9 @@ int netfs_read_folio(struct file *file, struct folio *folio)
 		goto alloc_error;
 	}
 
-	if (ctx->ops->begin_cache_operation) {
-		ret = ctx->ops->begin_cache_operation(rreq);
-		if (ret == -ENOMEM || ret == -EINTR || ret == -ERESTARTSYS)
-			goto discard;
-	}
+	ret = netfs_begin_cache_read(rreq, ctx);
+	if (ret == -ENOMEM || ret == -EINTR || ret == -ERESTARTSYS)
+		goto discard;
 
 	netfs_stat(&netfs_n_rh_readpage);
 	trace_netfs_read(rreq, rreq->start, rreq->len, netfs_read_trace_readpage);
@@ -390,11 +395,9 @@ retry:
 	rreq->no_unlock_folio	= folio_index(folio);
 	__set_bit(NETFS_RREQ_NO_UNLOCK_FOLIO, &rreq->flags);
 
-	if (ctx->ops->begin_cache_operation) {
-		ret = ctx->ops->begin_cache_operation(rreq);
-		if (ret == -ENOMEM || ret == -EINTR || ret == -ERESTARTSYS)
-			goto error_put;
-	}
+	ret = netfs_begin_cache_read(rreq, ctx);
+	if (ret == -ENOMEM || ret == -EINTR || ret == -ERESTARTSYS)
+		goto error_put;
 
 	netfs_stat(&netfs_n_rh_write_begin);
 	trace_netfs_read(rreq, pos, len, netfs_read_trace_write_begin);
diff --git a/fs/nfs/fscache.c b/fs/nfs/fscache.c
index b05717fe0d4e..2d1bfee225c3 100644
--- a/fs/nfs/fscache.c
+++ b/fs/nfs/fscache.c
@@ -274,12 +274,6 @@ static void nfs_netfs_free_request(struct netfs_io_request *rreq)
 	put_nfs_open_context(rreq->netfs_priv);
 }
 
-static inline int nfs_netfs_begin_cache_operation(struct netfs_io_request *rreq)
-{
-	return fscache_begin_read_operation(&rreq->cache_resources,
-					    netfs_i_cookie(netfs_inode(rreq->inode)));
-}
-
 static struct nfs_netfs_io_data *nfs_netfs_alloc(struct netfs_io_subrequest *sreq)
 {
 	struct nfs_netfs_io_data *netfs;
@@ -387,7 +381,6 @@ void nfs_netfs_read_completion(struct nfs_pgio_header *hdr)
 const struct netfs_request_ops nfs_netfs_ops = {
 	.init_request		= nfs_netfs_init_request,
 	.free_request		= nfs_netfs_free_request,
-	.begin_cache_operation	= nfs_netfs_begin_cache_operation,
 	.issue_read		= nfs_netfs_issue_read,
 	.clamp_length		= nfs_netfs_clamp_length
 };
diff --git a/include/linux/fscache.h b/include/linux/fscache.h
index 8e312c8323a8..9ed6696aee7a 100644
--- a/include/linux/fscache.h
+++ b/include/linux/fscache.h
@@ -437,9 +437,6 @@ const struct netfs_cache_ops *fscache_operation_valid(const struct netfs_cache_r
  * indicates the cache resources to which the operation state should be
  * attached; @cookie indicates the cache object that will be accessed.
  *
- * This is intended to be called from the ->begin_cache_operation() netfs lib
- * operation as implemented by the network filesystem.
- *
  * @cres->inval_counter is set from @cookie->inval_counter for comparison at
  * the end of the operation.  This allows invalidation during the operation to
  * be detected by the caller.
diff --git a/include/linux/netfs.h b/include/linux/netfs.h
index b11a84f6c32b..d294ff8f9ae4 100644
--- a/include/linux/netfs.h
+++ b/include/linux/netfs.h
@@ -208,7 +208,6 @@ struct netfs_io_request {
 struct netfs_request_ops {
 	int (*init_request)(struct netfs_io_request *rreq, struct file *file);
 	void (*free_request)(struct netfs_io_request *rreq);
-	int (*begin_cache_operation)(struct netfs_io_request *rreq);
 
 	void (*expand_readahead)(struct netfs_io_request *rreq);
 	bool (*clamp_length)(struct netfs_io_subrequest *subreq);
@@ -229,8 +228,7 @@ enum netfs_read_from_hole {
 };
 
 /*
- * Table of operations for access to a cache.  This is obtained by
- * rreq->ops->begin_cache_operation().
+ * Table of operations for access to a cache.
  */
 struct netfs_cache_ops {
 	/* End an operation */
-- 
cgit v1.2.3


From 7eb5b3e3a0a55f2d166ca949ef47ca6e0c704aab Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Tue, 21 Nov 2023 15:43:52 +0000
Subject: netfs, fscache: Move /proc/fs/fscache to /proc/fs/netfs and put in a
 symlink

Rename /proc/fs/fscache to "netfs" and make a symlink from fscache to that.

Signed-off-by: David Howells <dhowells@redhat.com>
Reviewed-by: Jeff Layton <jlayton@kernel.org>
cc: Christian Brauner <christian@brauner.io>
cc: linux-fsdevel@vger.kernel.org
cc: linux-cachefs@redhat.com
---
 fs/netfs/fscache_main.c  |  8 ++------
 fs/netfs/fscache_proc.c  | 23 ++++++++---------------
 fs/netfs/fscache_stats.c |  4 +---
 fs/netfs/internal.h      | 12 +++++++++++-
 fs/netfs/main.c          | 33 +++++++++++++++++++++++++++++++++
 fs/netfs/stats.c         | 13 +++++++------
 include/linux/netfs.h    |  1 -
 7 files changed, 62 insertions(+), 32 deletions(-)

(limited to 'include/linux')

diff --git a/fs/netfs/fscache_main.c b/fs/netfs/fscache_main.c
index 00600a4d9ce5..42e98bb523e3 100644
--- a/fs/netfs/fscache_main.c
+++ b/fs/netfs/fscache_main.c
@@ -62,7 +62,7 @@ unsigned int fscache_hash(unsigned int salt, const void *data, size_t len)
 /*
  * initialise the fs caching module
  */
-static int __init fscache_init(void)
+int __init fscache_init(void)
 {
 	int ret = -ENOMEM;
 
@@ -94,12 +94,10 @@ error_wq:
 	return ret;
 }
 
-fs_initcall(fscache_init);
-
 /*
  * clean up on module removal
  */
-static void __exit fscache_exit(void)
+void __exit fscache_exit(void)
 {
 	_enter("");
 
@@ -108,5 +106,3 @@ static void __exit fscache_exit(void)
 	destroy_workqueue(fscache_wq);
 	pr_notice("FS-Cache unloaded\n");
 }
-
-module_exit(fscache_exit);
diff --git a/fs/netfs/fscache_proc.c b/fs/netfs/fscache_proc.c
index dc3b0e9c8cce..ecd0d1edafaa 100644
--- a/fs/netfs/fscache_proc.c
+++ b/fs/netfs/fscache_proc.c
@@ -12,41 +12,34 @@
 #include "internal.h"
 
 /*
- * initialise the /proc/fs/fscache/ directory
+ * Add files to /proc/fs/netfs/.
  */
 int __init fscache_proc_init(void)
 {
-	if (!proc_mkdir("fs/fscache", NULL))
-		goto error_dir;
+	if (!proc_symlink("fs/fscache", NULL, "../netfs"))
+		goto error_sym;
 
-	if (!proc_create_seq("fs/fscache/caches", S_IFREG | 0444, NULL,
+	if (!proc_create_seq("fs/netfs/caches", S_IFREG | 0444, NULL,
 			     &fscache_caches_seq_ops))
 		goto error;
 
-	if (!proc_create_seq("fs/fscache/volumes", S_IFREG | 0444, NULL,
+	if (!proc_create_seq("fs/netfs/volumes", S_IFREG | 0444, NULL,
 			     &fscache_volumes_seq_ops))
 		goto error;
 
-	if (!proc_create_seq("fs/fscache/cookies", S_IFREG | 0444, NULL,
+	if (!proc_create_seq("fs/netfs/cookies", S_IFREG | 0444, NULL,
 			     &fscache_cookies_seq_ops))
 		goto error;
-
-#ifdef CONFIG_FSCACHE_STATS
-	if (!proc_create_single("fs/fscache/stats", S_IFREG | 0444, NULL,
-				fscache_stats_show))
-		goto error;
-#endif
-
 	return 0;
 
 error:
 	remove_proc_entry("fs/fscache", NULL);
-error_dir:
+error_sym:
 	return -ENOMEM;
 }
 
 /*
- * clean up the /proc/fs/fscache/ directory
+ * Clean up the /proc/fs/fscache symlink.
  */
 void fscache_proc_cleanup(void)
 {
diff --git a/fs/netfs/fscache_stats.c b/fs/netfs/fscache_stats.c
index fc94e5e79f1c..aad812ead398 100644
--- a/fs/netfs/fscache_stats.c
+++ b/fs/netfs/fscache_stats.c
@@ -52,7 +52,7 @@ EXPORT_SYMBOL(fscache_n_culled);
 /*
  * display the general statistics
  */
-int fscache_stats_show(struct seq_file *m, void *v)
+int fscache_stats_show(struct seq_file *m)
 {
 	seq_puts(m, "FS-Cache statistics\n");
 	seq_printf(m, "Cookies: n=%d v=%d vcol=%u voom=%u\n",
@@ -96,7 +96,5 @@ int fscache_stats_show(struct seq_file *m, void *v)
 	seq_printf(m, "IO     : rd=%u wr=%u\n",
 		   atomic_read(&fscache_n_read),
 		   atomic_read(&fscache_n_write));
-
-	netfs_stats_show(m);
 	return 0;
 }
diff --git a/fs/netfs/internal.h b/fs/netfs/internal.h
index 43769ac606e8..3f6e22229433 100644
--- a/fs/netfs/internal.h
+++ b/fs/netfs/internal.h
@@ -76,6 +76,7 @@ extern atomic_t netfs_n_rh_write_done;
 extern atomic_t netfs_n_rh_write_failed;
 extern atomic_t netfs_n_rh_write_zskip;
 
+int netfs_stats_show(struct seq_file *m, void *v);
 
 static inline void netfs_stat(atomic_t *stat)
 {
@@ -166,6 +167,13 @@ static inline void fscache_see_cookie(struct fscache_cookie *cookie,
  * fscache-main.c
  */
 extern unsigned int fscache_hash(unsigned int salt, const void *data, size_t len);
+#ifdef CONFIG_FSCACHE
+int __init fscache_init(void);
+void __exit fscache_exit(void);
+#else
+static inline int fscache_init(void) { return 0; }
+static inline void fscache_exit(void) {}
+#endif
 
 /*
  * fscache-proc.c
@@ -216,12 +224,14 @@ static inline void fscache_stat_d(atomic_t *stat)
 
 #define __fscache_stat(stat) (stat)
 
-int fscache_stats_show(struct seq_file *m, void *v);
+int fscache_stats_show(struct seq_file *m);
 #else
 
 #define __fscache_stat(stat) (NULL)
 #define fscache_stat(stat) do {} while (0)
 #define fscache_stat_d(stat) do {} while (0)
+
+static inline int fscache_stats_show(struct seq_file *m) { return 0; }
 #endif
 
 /*
diff --git a/fs/netfs/main.c b/fs/netfs/main.c
index 1ba8091fcf3e..c9af6e0896d3 100644
--- a/fs/netfs/main.c
+++ b/fs/netfs/main.c
@@ -7,6 +7,8 @@
 
 #include <linux/module.h>
 #include <linux/export.h>
+#include <linux/proc_fs.h>
+#include <linux/seq_file.h>
 #include "internal.h"
 #define CREATE_TRACE_POINTS
 #include <trace/events/netfs.h>
@@ -19,3 +21,34 @@ unsigned netfs_debug;
 module_param_named(debug, netfs_debug, uint, S_IWUSR | S_IRUGO);
 MODULE_PARM_DESC(netfs_debug, "Netfs support debugging mask");
 
+static int __init netfs_init(void)
+{
+	int ret = -ENOMEM;
+
+	if (!proc_mkdir("fs/netfs", NULL))
+		goto error;
+
+#ifdef CONFIG_FSCACHE_STATS
+	if (!proc_create_single("fs/netfs/stats", S_IFREG | 0444, NULL,
+				netfs_stats_show))
+		goto error_proc;
+#endif
+
+	ret = fscache_init();
+	if (ret < 0)
+		goto error_proc;
+	return 0;
+
+error_proc:
+	remove_proc_entry("fs/netfs", NULL);
+error:
+	return ret;
+}
+fs_initcall(netfs_init);
+
+static void __exit netfs_exit(void)
+{
+	fscache_exit();
+	remove_proc_entry("fs/netfs", NULL);
+}
+module_exit(netfs_exit);
diff --git a/fs/netfs/stats.c b/fs/netfs/stats.c
index 5510a7a14a40..6025dc485f7e 100644
--- a/fs/netfs/stats.c
+++ b/fs/netfs/stats.c
@@ -28,31 +28,32 @@ atomic_t netfs_n_rh_write_done;
 atomic_t netfs_n_rh_write_failed;
 atomic_t netfs_n_rh_write_zskip;
 
-void netfs_stats_show(struct seq_file *m)
+int netfs_stats_show(struct seq_file *m, void *v)
 {
-	seq_printf(m, "RdHelp : RA=%u RP=%u WB=%u WBZ=%u rr=%u sr=%u\n",
+	seq_printf(m, "Netfs  : RA=%u RP=%u WB=%u WBZ=%u rr=%u sr=%u\n",
 		   atomic_read(&netfs_n_rh_readahead),
 		   atomic_read(&netfs_n_rh_readpage),
 		   atomic_read(&netfs_n_rh_write_begin),
 		   atomic_read(&netfs_n_rh_write_zskip),
 		   atomic_read(&netfs_n_rh_rreq),
 		   atomic_read(&netfs_n_rh_sreq));
-	seq_printf(m, "RdHelp : ZR=%u sh=%u sk=%u\n",
+	seq_printf(m, "Netfs  : ZR=%u sh=%u sk=%u\n",
 		   atomic_read(&netfs_n_rh_zero),
 		   atomic_read(&netfs_n_rh_short_read),
 		   atomic_read(&netfs_n_rh_write_zskip));
-	seq_printf(m, "RdHelp : DL=%u ds=%u df=%u di=%u\n",
+	seq_printf(m, "Netfs  : DL=%u ds=%u df=%u di=%u\n",
 		   atomic_read(&netfs_n_rh_download),
 		   atomic_read(&netfs_n_rh_download_done),
 		   atomic_read(&netfs_n_rh_download_failed),
 		   atomic_read(&netfs_n_rh_download_instead));
-	seq_printf(m, "RdHelp : RD=%u rs=%u rf=%u\n",
+	seq_printf(m, "Netfs  : RD=%u rs=%u rf=%u\n",
 		   atomic_read(&netfs_n_rh_read),
 		   atomic_read(&netfs_n_rh_read_done),
 		   atomic_read(&netfs_n_rh_read_failed));
-	seq_printf(m, "RdHelp : WR=%u ws=%u wf=%u\n",
+	seq_printf(m, "Netfs  : WR=%u ws=%u wf=%u\n",
 		   atomic_read(&netfs_n_rh_write),
 		   atomic_read(&netfs_n_rh_write_done),
 		   atomic_read(&netfs_n_rh_write_failed));
+	return fscache_stats_show(m);
 }
 EXPORT_SYMBOL(netfs_stats_show);
diff --git a/include/linux/netfs.h b/include/linux/netfs.h
index d294ff8f9ae4..9bd91cd615d5 100644
--- a/include/linux/netfs.h
+++ b/include/linux/netfs.h
@@ -294,7 +294,6 @@ void netfs_get_subrequest(struct netfs_io_subrequest *subreq,
 			  enum netfs_sreq_ref_trace what);
 void netfs_put_subrequest(struct netfs_io_subrequest *subreq,
 			  bool was_async, enum netfs_sreq_ref_trace what);
-void netfs_stats_show(struct seq_file *);
 ssize_t netfs_extract_user_iter(struct iov_iter *orig, size_t orig_len,
 				struct iov_iter *new,
 				iov_iter_extraction_t extraction_flags);
-- 
cgit v1.2.3


From c9c4ff12df110feb1b91951010f673f4b16e49e8 Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Mon, 27 Nov 2023 13:58:07 +0000
Subject: netfs: Move pinning-for-writeback from fscache to netfs

Move the resource pinning-for-writeback from fscache code to netfslib code.
This is used to keep a cache backing object pinned whilst we have dirty
pages on the netfs inode in the pagecache such that VM writeback will be
able to reach it.

Whilst we're at it, switch the parameters of netfs_unpin_writeback() to
match ->write_inode() so that it can be used for that directly.

Note that this mechanism could be more generically useful than that for
network filesystems.  Quite often they have to keep around other resources
(e.g. authentication tokens or network connections) until the writeback is
complete.

Signed-off-by: David Howells <dhowells@redhat.com>
Reviewed-by: Jeff Layton <jlayton@kernel.org>
cc: linux-cachefs@redhat.com
cc: linux-fsdevel@vger.kernel.org
cc: linux-mm@kvack.org
---
 fs/9p/vfs_addr.c          | 33 +++++-------------
 fs/9p/vfs_inode.c         |  3 +-
 fs/9p/vfs_super.c         | 14 ++------
 fs/afs/file.c             |  8 +----
 fs/afs/inode.c            |  2 +-
 fs/afs/internal.h         |  6 ----
 fs/afs/super.c            |  2 +-
 fs/afs/write.c            |  9 -----
 fs/ceph/cache.h           | 23 ++++---------
 fs/ceph/inode.c           |  2 +-
 fs/fs-writeback.c         | 10 +++---
 fs/netfs/Makefile         |  1 +
 fs/netfs/fscache_io.c     | 40 ----------------------
 fs/netfs/misc.c           | 86 +++++++++++++++++++++++++++++++++++++++++++++++
 fs/smb/client/cifsfs.c    |  5 ++-
 fs/smb/client/file.c      | 18 ++--------
 include/linux/fs.h        |  2 +-
 include/linux/fscache.h   | 42 -----------------------
 include/linux/netfs.h     |  3 ++
 include/linux/writeback.h |  2 +-
 20 files changed, 124 insertions(+), 187 deletions(-)
 create mode 100644 fs/netfs/misc.c

(limited to 'include/linux')

diff --git a/fs/9p/vfs_addr.c b/fs/9p/vfs_addr.c
index 39db7c01e30a..131b83c31f85 100644
--- a/fs/9p/vfs_addr.c
+++ b/fs/9p/vfs_addr.c
@@ -317,30 +317,15 @@ out:
 	return copied;
 }
 
-#ifdef CONFIG_9P_FSCACHE
-/*
- * Mark a page as having been made dirty and thus needing writeback.  We also
- * need to pin the cache object to write back to.
- */
-static bool v9fs_dirty_folio(struct address_space *mapping, struct folio *folio)
-{
-	struct v9fs_inode *v9inode = V9FS_I(mapping->host);
-
-	return fscache_dirty_folio(mapping, folio, v9fs_inode_cookie(v9inode));
-}
-#else
-#define v9fs_dirty_folio filemap_dirty_folio
-#endif
-
 const struct address_space_operations v9fs_addr_operations = {
-	.read_folio = netfs_read_folio,
-	.readahead = netfs_readahead,
-	.dirty_folio = v9fs_dirty_folio,
-	.writepage = v9fs_vfs_writepage,
-	.write_begin = v9fs_write_begin,
-	.write_end = v9fs_write_end,
-	.release_folio = v9fs_release_folio,
+	.read_folio	= netfs_read_folio,
+	.readahead	= netfs_readahead,
+	.dirty_folio	= netfs_dirty_folio,
+	.writepage	= v9fs_vfs_writepage,
+	.write_begin	= v9fs_write_begin,
+	.write_end	= v9fs_write_end,
+	.release_folio	= v9fs_release_folio,
 	.invalidate_folio = v9fs_invalidate_folio,
-	.launder_folio = v9fs_launder_folio,
-	.direct_IO = v9fs_direct_IO,
+	.launder_folio	= v9fs_launder_folio,
+	.direct_IO	= v9fs_direct_IO,
 };
diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c
index b845ee18a80b..74122540e00f 100644
--- a/fs/9p/vfs_inode.c
+++ b/fs/9p/vfs_inode.c
@@ -376,8 +376,7 @@ void v9fs_evict_inode(struct inode *inode)
 
 #ifdef CONFIG_9P_FSCACHE
 	version = cpu_to_le32(v9inode->qid.version);
-	fscache_clear_inode_writeback(v9fs_inode_cookie(v9inode), inode,
-				      &version);
+	netfs_clear_inode_writeback(inode, &version);
 #endif
 
 	clear_inode(inode);
diff --git a/fs/9p/vfs_super.c b/fs/9p/vfs_super.c
index 73db55c050bf..941f7d0e0bfa 100644
--- a/fs/9p/vfs_super.c
+++ b/fs/9p/vfs_super.c
@@ -289,31 +289,21 @@ static int v9fs_drop_inode(struct inode *inode)
 static int v9fs_write_inode(struct inode *inode,
 			    struct writeback_control *wbc)
 {
-	struct v9fs_inode *v9inode;
-
 	/*
 	 * send an fsync request to server irrespective of
 	 * wbc->sync_mode.
 	 */
 	p9_debug(P9_DEBUG_VFS, "%s: inode %p\n", __func__, inode);
-
-	v9inode = V9FS_I(inode);
-	fscache_unpin_writeback(wbc, v9fs_inode_cookie(v9inode));
-
-	return 0;
+	return netfs_unpin_writeback(inode, wbc);
 }
 
 static int v9fs_write_inode_dotl(struct inode *inode,
 				 struct writeback_control *wbc)
 {
-	struct v9fs_inode *v9inode;
 
-	v9inode = V9FS_I(inode);
 	p9_debug(P9_DEBUG_VFS, "%s: inode %p\n", __func__, inode);
 
-	fscache_unpin_writeback(wbc, v9fs_inode_cookie(v9inode));
-
-	return 0;
+	return netfs_unpin_writeback(inode, wbc);
 }
 
 static const struct super_operations v9fs_super_ops = {
diff --git a/fs/afs/file.c b/fs/afs/file.c
index 8c17e37c2e59..9142fda7dbd6 100644
--- a/fs/afs/file.c
+++ b/fs/afs/file.c
@@ -55,7 +55,7 @@ const struct inode_operations afs_file_inode_operations = {
 const struct address_space_operations afs_file_aops = {
 	.read_folio	= netfs_read_folio,
 	.readahead	= netfs_readahead,
-	.dirty_folio	= afs_dirty_folio,
+	.dirty_folio	= netfs_dirty_folio,
 	.launder_folio	= afs_launder_folio,
 	.release_folio	= afs_release_folio,
 	.invalidate_folio = afs_invalidate_folio,
@@ -386,12 +386,6 @@ const struct netfs_request_ops afs_req_ops = {
 	.issue_read		= afs_issue_read,
 };
 
-int afs_write_inode(struct inode *inode, struct writeback_control *wbc)
-{
-	fscache_unpin_writeback(wbc, afs_vnode_cache(AFS_FS_I(inode)));
-	return 0;
-}
-
 /*
  * Adjust the dirty region of the page on truncation or full invalidation,
  * getting rid of the markers altogether if the region is entirely invalidated.
diff --git a/fs/afs/inode.c b/fs/afs/inode.c
index 78efc9719349..6f375f0cf650 100644
--- a/fs/afs/inode.c
+++ b/fs/afs/inode.c
@@ -823,7 +823,7 @@ void afs_evict_inode(struct inode *inode)
 	truncate_inode_pages_final(&inode->i_data);
 
 	afs_set_cache_aux(vnode, &aux);
-	fscache_clear_inode_writeback(afs_vnode_cache(vnode), inode, &aux);
+	netfs_clear_inode_writeback(inode, &aux);
 	clear_inode(inode);
 
 	while (!list_empty(&vnode->wb_keys)) {
diff --git a/fs/afs/internal.h b/fs/afs/internal.h
index 7385d62c8cf5..b77797559e27 100644
--- a/fs/afs/internal.h
+++ b/fs/afs/internal.h
@@ -1073,7 +1073,6 @@ extern int afs_release(struct inode *, struct file *);
 extern int afs_fetch_data(struct afs_vnode *, struct afs_read *);
 extern struct afs_read *afs_alloc_read(gfp_t);
 extern void afs_put_read(struct afs_read *);
-extern int afs_write_inode(struct inode *, struct writeback_control *);
 
 static inline struct afs_read *afs_get_read(struct afs_read *req)
 {
@@ -1522,11 +1521,6 @@ extern int afs_check_volume_status(struct afs_volume *, struct afs_operation *);
 /*
  * write.c
  */
-#ifdef CONFIG_AFS_FSCACHE
-bool afs_dirty_folio(struct address_space *, struct folio *);
-#else
-#define afs_dirty_folio filemap_dirty_folio
-#endif
 extern int afs_write_begin(struct file *file, struct address_space *mapping,
 			loff_t pos, unsigned len,
 			struct page **pagep, void **fsdata);
diff --git a/fs/afs/super.c b/fs/afs/super.c
index a01a0fb2cdbb..4f1b0492f1c5 100644
--- a/fs/afs/super.c
+++ b/fs/afs/super.c
@@ -55,7 +55,7 @@ int afs_net_id;
 static const struct super_operations afs_super_ops = {
 	.statfs		= afs_statfs,
 	.alloc_inode	= afs_alloc_inode,
-	.write_inode	= afs_write_inode,
+	.write_inode	= netfs_unpin_writeback,
 	.drop_inode	= afs_drop_inode,
 	.destroy_inode	= afs_destroy_inode,
 	.free_inode	= afs_free_inode,
diff --git a/fs/afs/write.c b/fs/afs/write.c
index 4a168781936b..e40cf8e7543a 100644
--- a/fs/afs/write.c
+++ b/fs/afs/write.c
@@ -23,15 +23,6 @@ static void afs_write_to_cache(struct afs_vnode *vnode, loff_t start, size_t len
 			       loff_t i_size, bool caching);
 
 #ifdef CONFIG_AFS_FSCACHE
-/*
- * Mark a page as having been made dirty and thus needing writeback.  We also
- * need to pin the cache object to write back to.
- */
-bool afs_dirty_folio(struct address_space *mapping, struct folio *folio)
-{
-	return fscache_dirty_folio(mapping, folio,
-				afs_vnode_cache(AFS_FS_I(mapping->host)));
-}
 static void afs_folio_start_fscache(bool caching, struct folio *folio)
 {
 	if (caching)
diff --git a/fs/ceph/cache.h b/fs/ceph/cache.h
index b804f1094764..8fc7d828d990 100644
--- a/fs/ceph/cache.h
+++ b/fs/ceph/cache.h
@@ -43,19 +43,13 @@ static inline void ceph_fscache_resize(struct inode *inode, loff_t to)
 	}
 }
 
-static inline void ceph_fscache_unpin_writeback(struct inode *inode,
+static inline int ceph_fscache_unpin_writeback(struct inode *inode,
 						struct writeback_control *wbc)
 {
-	fscache_unpin_writeback(wbc, ceph_fscache_cookie(ceph_inode(inode)));
+	return netfs_unpin_writeback(inode, wbc);
 }
 
-static inline int ceph_fscache_dirty_folio(struct address_space *mapping,
-		struct folio *folio)
-{
-	struct ceph_inode_info *ci = ceph_inode(mapping->host);
-
-	return fscache_dirty_folio(mapping, folio, ceph_fscache_cookie(ci));
-}
+#define ceph_fscache_dirty_folio netfs_dirty_folio
 
 static inline bool ceph_is_cache_enabled(struct inode *inode)
 {
@@ -112,16 +106,13 @@ static inline void ceph_fscache_resize(struct inode *inode, loff_t to)
 {
 }
 
-static inline void ceph_fscache_unpin_writeback(struct inode *inode,
-						struct writeback_control *wbc)
+static inline int ceph_fscache_unpin_writeback(struct inode *inode,
+					       struct writeback_control *wbc)
 {
+	return 0;
 }
 
-static inline int ceph_fscache_dirty_folio(struct address_space *mapping,
-		struct folio *folio)
-{
-	return filemap_dirty_folio(mapping, folio);
-}
+#define ceph_fscache_dirty_folio filemap_dirty_folio
 
 static inline bool ceph_is_cache_enabled(struct inode *inode)
 {
diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
index 0679240f06db..3149d79a9dbe 100644
--- a/fs/ceph/inode.c
+++ b/fs/ceph/inode.c
@@ -694,7 +694,7 @@ void ceph_evict_inode(struct inode *inode)
 	percpu_counter_dec(&mdsc->metric.total_inodes);
 
 	truncate_inode_pages_final(&inode->i_data);
-	if (inode->i_state & I_PINNING_FSCACHE_WB)
+	if (inode->i_state & I_PINNING_NETFS_WB)
 		ceph_fscache_unuse_cookie(inode, true);
 	clear_inode(inode);
 
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 1767493dffda..3d84fcc471c6 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -1675,11 +1675,11 @@ __writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
 
 	if (mapping_tagged(mapping, PAGECACHE_TAG_DIRTY))
 		inode->i_state |= I_DIRTY_PAGES;
-	else if (unlikely(inode->i_state & I_PINNING_FSCACHE_WB)) {
+	else if (unlikely(inode->i_state & I_PINNING_NETFS_WB)) {
 		if (!(inode->i_state & I_DIRTY_PAGES)) {
-			inode->i_state &= ~I_PINNING_FSCACHE_WB;
-			wbc->unpinned_fscache_wb = true;
-			dirty |= I_PINNING_FSCACHE_WB; /* Cause write_inode */
+			inode->i_state &= ~I_PINNING_NETFS_WB;
+			wbc->unpinned_netfs_wb = true;
+			dirty |= I_PINNING_NETFS_WB; /* Cause write_inode */
 		}
 	}
 
@@ -1691,7 +1691,7 @@ __writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
 		if (ret == 0)
 			ret = err;
 	}
-	wbc->unpinned_fscache_wb = false;
+	wbc->unpinned_netfs_wb = false;
 	trace_writeback_single_inode(inode, wbc, nr_to_write);
 	return ret;
 }
diff --git a/fs/netfs/Makefile b/fs/netfs/Makefile
index b57162ef9cfb..a84fe9bbd3c4 100644
--- a/fs/netfs/Makefile
+++ b/fs/netfs/Makefile
@@ -5,6 +5,7 @@ netfs-y := \
 	io.o \
 	iterator.o \
 	main.o \
+	misc.o \
 	objects.o
 
 netfs-$(CONFIG_NETFS_STATS) += stats.o
diff --git a/fs/netfs/fscache_io.c b/fs/netfs/fscache_io.c
index 0d2b8dec8f82..79171a687930 100644
--- a/fs/netfs/fscache_io.c
+++ b/fs/netfs/fscache_io.c
@@ -158,46 +158,6 @@ int __fscache_begin_write_operation(struct netfs_cache_resources *cres,
 }
 EXPORT_SYMBOL(__fscache_begin_write_operation);
 
-/**
- * fscache_dirty_folio - Mark folio dirty and pin a cache object for writeback
- * @mapping: The mapping the folio belongs to.
- * @folio: The folio being dirtied.
- * @cookie: The cookie referring to the cache object
- *
- * Set the dirty flag on a folio and pin an in-use cache object in memory
- * so that writeback can later write to it.  This is intended
- * to be called from the filesystem's ->dirty_folio() method.
- *
- * Return: true if the dirty flag was set on the folio, false otherwise.
- */
-bool fscache_dirty_folio(struct address_space *mapping, struct folio *folio,
-				struct fscache_cookie *cookie)
-{
-	struct inode *inode = mapping->host;
-	bool need_use = false;
-
-	_enter("");
-
-	if (!filemap_dirty_folio(mapping, folio))
-		return false;
-	if (!fscache_cookie_valid(cookie))
-		return true;
-
-	if (!(inode->i_state & I_PINNING_FSCACHE_WB)) {
-		spin_lock(&inode->i_lock);
-		if (!(inode->i_state & I_PINNING_FSCACHE_WB)) {
-			inode->i_state |= I_PINNING_FSCACHE_WB;
-			need_use = true;
-		}
-		spin_unlock(&inode->i_lock);
-
-		if (need_use)
-			fscache_use_cookie(cookie, true);
-	}
-	return true;
-}
-EXPORT_SYMBOL(fscache_dirty_folio);
-
 struct fscache_write_request {
 	struct netfs_cache_resources cache_resources;
 	struct address_space	*mapping;
diff --git a/fs/netfs/misc.c b/fs/netfs/misc.c
new file mode 100644
index 000000000000..68baf55c47a4
--- /dev/null
+++ b/fs/netfs/misc.c
@@ -0,0 +1,86 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/* Miscellaneous routines.
+ *
+ * Copyright (C) 2023 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ */
+
+#include <linux/swap.h>
+#include "internal.h"
+
+/**
+ * netfs_dirty_folio - Mark folio dirty and pin a cache object for writeback
+ * @mapping: The mapping the folio belongs to.
+ * @folio: The folio being dirtied.
+ *
+ * Set the dirty flag on a folio and pin an in-use cache object in memory so
+ * that writeback can later write to it.  This is intended to be called from
+ * the filesystem's ->dirty_folio() method.
+ *
+ * Return: true if the dirty flag was set on the folio, false otherwise.
+ */
+bool netfs_dirty_folio(struct address_space *mapping, struct folio *folio)
+{
+	struct inode *inode = mapping->host;
+	struct netfs_inode *ictx = netfs_inode(inode);
+	struct fscache_cookie *cookie = netfs_i_cookie(ictx);
+	bool need_use = false;
+
+	_enter("");
+
+	if (!filemap_dirty_folio(mapping, folio))
+		return false;
+	if (!fscache_cookie_valid(cookie))
+		return true;
+
+	if (!(inode->i_state & I_PINNING_NETFS_WB)) {
+		spin_lock(&inode->i_lock);
+		if (!(inode->i_state & I_PINNING_NETFS_WB)) {
+			inode->i_state |= I_PINNING_NETFS_WB;
+			need_use = true;
+		}
+		spin_unlock(&inode->i_lock);
+
+		if (need_use)
+			fscache_use_cookie(cookie, true);
+	}
+	return true;
+}
+EXPORT_SYMBOL(netfs_dirty_folio);
+
+/**
+ * netfs_unpin_writeback - Unpin writeback resources
+ * @inode: The inode on which the cookie resides
+ * @wbc: The writeback control
+ *
+ * Unpin the writeback resources pinned by netfs_dirty_folio().  This is
+ * intended to be called as/by the netfs's ->write_inode() method.
+ */
+int netfs_unpin_writeback(struct inode *inode, struct writeback_control *wbc)
+{
+	struct fscache_cookie *cookie = netfs_i_cookie(netfs_inode(inode));
+
+	if (wbc->unpinned_netfs_wb)
+		fscache_unuse_cookie(cookie, NULL, NULL);
+	return 0;
+}
+EXPORT_SYMBOL(netfs_unpin_writeback);
+
+/**
+ * netfs_clear_inode_writeback - Clear writeback resources pinned by an inode
+ * @inode: The inode to clean up
+ * @aux: Auxiliary data to apply to the inode
+ *
+ * Clear any writeback resources held by an inode when the inode is evicted.
+ * This must be called before clear_inode() is called.
+ */
+void netfs_clear_inode_writeback(struct inode *inode, const void *aux)
+{
+	struct fscache_cookie *cookie = netfs_i_cookie(netfs_inode(inode));
+
+	if (inode->i_state & I_PINNING_NETFS_WB) {
+		loff_t i_size = i_size_read(inode);
+		fscache_unuse_cookie(cookie, aux, &i_size);
+	}
+}
+EXPORT_SYMBOL(netfs_clear_inode_writeback);
diff --git a/fs/smb/client/cifsfs.c b/fs/smb/client/cifsfs.c
index 2131638f26d0..96a65cf9b5ec 100644
--- a/fs/smb/client/cifsfs.c
+++ b/fs/smb/client/cifsfs.c
@@ -429,7 +429,7 @@ static void
 cifs_evict_inode(struct inode *inode)
 {
 	truncate_inode_pages_final(&inode->i_data);
-	if (inode->i_state & I_PINNING_FSCACHE_WB)
+	if (inode->i_state & I_PINNING_NETFS_WB)
 		cifs_fscache_unuse_inode_cookie(inode, true);
 	cifs_fscache_release_inode_cookie(inode);
 	clear_inode(inode);
@@ -792,8 +792,7 @@ static int cifs_show_stats(struct seq_file *s, struct dentry *root)
 
 static int cifs_write_inode(struct inode *inode, struct writeback_control *wbc)
 {
-	fscache_unpin_writeback(wbc, cifs_inode_cookie(inode));
-	return 0;
+	return netfs_unpin_writeback(inode, wbc);
 }
 
 static int cifs_drop_inode(struct inode *inode)
diff --git a/fs/smb/client/file.c b/fs/smb/client/file.c
index 32a8525415d9..b02b7f0a47dc 100644
--- a/fs/smb/client/file.c
+++ b/fs/smb/client/file.c
@@ -5043,27 +5043,13 @@ static void cifs_swap_deactivate(struct file *file)
 	/* do we need to unpin (or unlock) the file */
 }
 
-/*
- * Mark a page as having been made dirty and thus needing writeback.  We also
- * need to pin the cache object to write back to.
- */
-#ifdef CONFIG_CIFS_FSCACHE
-static bool cifs_dirty_folio(struct address_space *mapping, struct folio *folio)
-{
-	return fscache_dirty_folio(mapping, folio,
-					cifs_inode_cookie(mapping->host));
-}
-#else
-#define cifs_dirty_folio filemap_dirty_folio
-#endif
-
 const struct address_space_operations cifs_addr_ops = {
 	.read_folio = cifs_read_folio,
 	.readahead = cifs_readahead,
 	.writepages = cifs_writepages,
 	.write_begin = cifs_write_begin,
 	.write_end = cifs_write_end,
-	.dirty_folio = cifs_dirty_folio,
+	.dirty_folio = netfs_dirty_folio,
 	.release_folio = cifs_release_folio,
 	.direct_IO = cifs_direct_io,
 	.invalidate_folio = cifs_invalidate_folio,
@@ -5087,7 +5073,7 @@ const struct address_space_operations cifs_addr_ops_smallbuf = {
 	.writepages = cifs_writepages,
 	.write_begin = cifs_write_begin,
 	.write_end = cifs_write_end,
-	.dirty_folio = cifs_dirty_folio,
+	.dirty_folio = netfs_dirty_folio,
 	.release_folio = cifs_release_folio,
 	.invalidate_folio = cifs_invalidate_folio,
 	.launder_folio = cifs_launder_folio,
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 98b7a7a8c42e..68a957261694 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -2294,7 +2294,7 @@ static inline void kiocb_clone(struct kiocb *kiocb, struct kiocb *kiocb_src,
 #define I_CREATING		(1 << 15)
 #define I_DONTCACHE		(1 << 16)
 #define I_SYNC_QUEUED		(1 << 17)
-#define I_PINNING_FSCACHE_WB	(1 << 18)
+#define I_PINNING_NETFS_WB	(1 << 18)
 
 #define I_DIRTY_INODE (I_DIRTY_SYNC | I_DIRTY_DATASYNC)
 #define I_DIRTY (I_DIRTY_INODE | I_DIRTY_PAGES)
diff --git a/include/linux/fscache.h b/include/linux/fscache.h
index 9ed6696aee7a..6e8562cbcc43 100644
--- a/include/linux/fscache.h
+++ b/include/linux/fscache.h
@@ -626,48 +626,6 @@ static inline void fscache_write_to_cache(struct fscache_cookie *cookie,
 
 }
 
-#if __fscache_available
-bool fscache_dirty_folio(struct address_space *mapping, struct folio *folio,
-		struct fscache_cookie *cookie);
-#else
-#define fscache_dirty_folio(MAPPING, FOLIO, COOKIE) \
-		filemap_dirty_folio(MAPPING, FOLIO)
-#endif
-
-/**
- * fscache_unpin_writeback - Unpin writeback resources
- * @wbc: The writeback control
- * @cookie: The cookie referring to the cache object
- *
- * Unpin the writeback resources pinned by fscache_dirty_folio().  This is
- * intended to be called by the netfs's ->write_inode() method.
- */
-static inline void fscache_unpin_writeback(struct writeback_control *wbc,
-					   struct fscache_cookie *cookie)
-{
-	if (wbc->unpinned_fscache_wb)
-		fscache_unuse_cookie(cookie, NULL, NULL);
-}
-
-/**
- * fscache_clear_inode_writeback - Clear writeback resources pinned by an inode
- * @cookie: The cookie referring to the cache object
- * @inode: The inode to clean up
- * @aux: Auxiliary data to apply to the inode
- *
- * Clear any writeback resources held by an inode when the inode is evicted.
- * This must be called before clear_inode() is called.
- */
-static inline void fscache_clear_inode_writeback(struct fscache_cookie *cookie,
-						 struct inode *inode,
-						 const void *aux)
-{
-	if (inode->i_state & I_PINNING_FSCACHE_WB) {
-		loff_t i_size = i_size_read(inode);
-		fscache_unuse_cookie(cookie, aux, &i_size);
-	}
-}
-
 /**
  * fscache_note_page_release - Note that a netfs page got released
  * @cookie: The cookie corresponding to the file
diff --git a/include/linux/netfs.h b/include/linux/netfs.h
index 9bd91cd615d5..32faf6c89702 100644
--- a/include/linux/netfs.h
+++ b/include/linux/netfs.h
@@ -288,6 +288,9 @@ int netfs_read_folio(struct file *, struct folio *);
 int netfs_write_begin(struct netfs_inode *, struct file *,
 		struct address_space *, loff_t pos, unsigned int len,
 		struct folio **, void **fsdata);
+bool netfs_dirty_folio(struct address_space *mapping, struct folio *folio);
+int netfs_unpin_writeback(struct inode *inode, struct writeback_control *wbc);
+void netfs_clear_inode_writeback(struct inode *inode, const void *aux);
 
 void netfs_subreq_terminated(struct netfs_io_subrequest *, ssize_t, bool);
 void netfs_get_subrequest(struct netfs_io_subrequest *subreq,
diff --git a/include/linux/writeback.h b/include/linux/writeback.h
index 083387c00f0c..1e08392fb43e 100644
--- a/include/linux/writeback.h
+++ b/include/linux/writeback.h
@@ -60,7 +60,7 @@ struct writeback_control {
 	unsigned for_reclaim:1;		/* Invoked from the page allocator */
 	unsigned range_cyclic:1;	/* range_start is cyclic */
 	unsigned for_sync:1;		/* sync(2) WB_SYNC_ALL writeback */
-	unsigned unpinned_fscache_wb:1;	/* Cleared I_PINNING_FSCACHE_WB */
+	unsigned unpinned_netfs_wb:1;	/* Cleared I_PINNING_NETFS_WB */
 
 	/*
 	 * When writeback IOs are bounced through async layers, only the
-- 
cgit v1.2.3


From 87b57a048964abfd5f3d8b79bc55687327f5a380 Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Fri, 4 Mar 2022 10:34:27 +0000
Subject: netfs: Add a procfile to list in-progress requests

Add a procfile, /proc/fs/netfs/requests, to list in-progress netfslib I/O
requests.

Signed-off-by: David Howells <dhowells@redhat.com>
cc: Jeff Layton <jlayton@kernel.org>
cc: linux-cachefs@redhat.com
cc: linux-fsdevel@vger.kernel.org
cc: linux-mm@kvack.org
---
 fs/netfs/internal.h   | 22 ++++++++++++++++
 fs/netfs/main.c       | 69 ++++++++++++++++++++++++++++++++++++++++++++++++++-
 fs/netfs/objects.c    |  4 ++-
 include/linux/netfs.h |  6 ++++-
 4 files changed, 98 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/fs/netfs/internal.h b/fs/netfs/internal.h
index 3f6e22229433..4708fb15446b 100644
--- a/fs/netfs/internal.h
+++ b/fs/netfs/internal.h
@@ -33,6 +33,28 @@ int netfs_begin_read(struct netfs_io_request *rreq, bool sync);
  * main.c
  */
 extern unsigned int netfs_debug;
+extern struct list_head netfs_io_requests;
+extern spinlock_t netfs_proc_lock;
+
+#ifdef CONFIG_PROC_FS
+static inline void netfs_proc_add_rreq(struct netfs_io_request *rreq)
+{
+	spin_lock(&netfs_proc_lock);
+	list_add_tail_rcu(&rreq->proc_link, &netfs_io_requests);
+	spin_unlock(&netfs_proc_lock);
+}
+static inline void netfs_proc_del_rreq(struct netfs_io_request *rreq)
+{
+	if (!list_empty(&rreq->proc_link)) {
+		spin_lock(&netfs_proc_lock);
+		list_del_rcu(&rreq->proc_link);
+		spin_unlock(&netfs_proc_lock);
+	}
+}
+#else
+static inline void netfs_proc_add_rreq(struct netfs_io_request *rreq) {}
+static inline void netfs_proc_del_rreq(struct netfs_io_request *rreq) {}
+#endif
 
 /*
  * objects.c
diff --git a/fs/netfs/main.c b/fs/netfs/main.c
index c9af6e0896d3..97ce1436615b 100644
--- a/fs/netfs/main.c
+++ b/fs/netfs/main.c
@@ -21,13 +21,80 @@ unsigned netfs_debug;
 module_param_named(debug, netfs_debug, uint, S_IWUSR | S_IRUGO);
 MODULE_PARM_DESC(netfs_debug, "Netfs support debugging mask");
 
+#ifdef CONFIG_PROC_FS
+LIST_HEAD(netfs_io_requests);
+DEFINE_SPINLOCK(netfs_proc_lock);
+
+static const char *netfs_origins[] = {
+	[NETFS_READAHEAD]	= "RA",
+	[NETFS_READPAGE]	= "RP",
+	[NETFS_READ_FOR_WRITE]	= "RW",
+};
+
+/*
+ * Generate a list of I/O requests in /proc/fs/netfs/requests
+ */
+static int netfs_requests_seq_show(struct seq_file *m, void *v)
+{
+	struct netfs_io_request *rreq;
+
+	if (v == &netfs_io_requests) {
+		seq_puts(m,
+			 "REQUEST  OR REF FL ERR  OPS COVERAGE\n"
+			 "======== == === == ==== === =========\n"
+			 );
+		return 0;
+	}
+
+	rreq = list_entry(v, struct netfs_io_request, proc_link);
+	seq_printf(m,
+		   "%08x %s %3d %2lx %4d %3d @%04llx %zx/%zx",
+		   rreq->debug_id,
+		   netfs_origins[rreq->origin],
+		   refcount_read(&rreq->ref),
+		   rreq->flags,
+		   rreq->error,
+		   atomic_read(&rreq->nr_outstanding),
+		   rreq->start, rreq->submitted, rreq->len);
+	seq_putc(m, '\n');
+	return 0;
+}
+
+static void *netfs_requests_seq_start(struct seq_file *m, loff_t *_pos)
+	__acquires(rcu)
+{
+	rcu_read_lock();
+	return seq_list_start_head(&netfs_io_requests, *_pos);
+}
+
+static void *netfs_requests_seq_next(struct seq_file *m, void *v, loff_t *_pos)
+{
+	return seq_list_next(v, &netfs_io_requests, _pos);
+}
+
+static void netfs_requests_seq_stop(struct seq_file *m, void *v)
+	__releases(rcu)
+{
+	rcu_read_unlock();
+}
+
+static const struct seq_operations netfs_requests_seq_ops = {
+	.start  = netfs_requests_seq_start,
+	.next   = netfs_requests_seq_next,
+	.stop   = netfs_requests_seq_stop,
+	.show   = netfs_requests_seq_show,
+};
+#endif /* CONFIG_PROC_FS */
+
 static int __init netfs_init(void)
 {
 	int ret = -ENOMEM;
 
 	if (!proc_mkdir("fs/netfs", NULL))
 		goto error;
-
+	if (!proc_create_seq("fs/netfs/requests", S_IFREG | 0444, NULL,
+			     &netfs_requests_seq_ops))
+		goto error_proc;
 #ifdef CONFIG_FSCACHE_STATS
 	if (!proc_create_single("fs/netfs/stats", S_IFREG | 0444, NULL,
 				netfs_stats_show))
diff --git a/fs/netfs/objects.c b/fs/netfs/objects.c
index e17cdf53f6a7..85f428fc52e6 100644
--- a/fs/netfs/objects.c
+++ b/fs/netfs/objects.c
@@ -45,6 +45,7 @@ struct netfs_io_request *netfs_alloc_request(struct address_space *mapping,
 		}
 	}
 
+	netfs_proc_add_rreq(rreq);
 	netfs_stat(&netfs_n_rh_rreq);
 	return rreq;
 }
@@ -76,12 +77,13 @@ static void netfs_free_request(struct work_struct *work)
 		container_of(work, struct netfs_io_request, work);
 
 	trace_netfs_rreq(rreq, netfs_rreq_trace_free);
+	netfs_proc_del_rreq(rreq);
 	netfs_clear_subrequests(rreq, false);
 	if (rreq->netfs_ops->free_request)
 		rreq->netfs_ops->free_request(rreq);
 	if (rreq->cache_resources.ops)
 		rreq->cache_resources.ops->end_operation(&rreq->cache_resources);
-	kfree(rreq);
+	kfree_rcu(rreq, rcu);
 	netfs_stat_d(&netfs_n_rh_rreq);
 }
 
diff --git a/include/linux/netfs.h b/include/linux/netfs.h
index 32faf6c89702..7244ddebd974 100644
--- a/include/linux/netfs.h
+++ b/include/linux/netfs.h
@@ -175,10 +175,14 @@ enum netfs_io_origin {
  * operations to a variety of data stores and then stitch the result together.
  */
 struct netfs_io_request {
-	struct work_struct	work;
+	union {
+		struct work_struct work;
+		struct rcu_head rcu;
+	};
 	struct inode		*inode;		/* The file being accessed */
 	struct address_space	*mapping;	/* The mapping being accessed */
 	struct netfs_cache_resources cache_resources;
+	struct list_head	proc_link;	/* Link in netfs_iorequests */
 	struct list_head	subrequests;	/* Contributory I/O operations */
 	void			*netfs_priv;	/* Private data for the netfs */
 	unsigned int		debug_id;
-- 
cgit v1.2.3


From cc3cb0a18da46a51d9fc173155576ba1d068e536 Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Wed, 9 Mar 2022 11:01:12 +0000
Subject: netfs: Allow the netfs to make the io (sub)request alloc larger

Allow the network filesystem to specify extra space to be allocated on the
end of the io (sub)request.  This allows cifs, for example, to use this
space rather than allocating its own cifs_readdata struct.

Signed-off-by: David Howells <dhowells@redhat.com>
Reviewed-by: Jeff Layton <jlayton@kernel.org>
cc: linux-cachefs@redhat.com
cc: linux-fsdevel@vger.kernel.org
cc: linux-mm@kvack.org
---
 fs/netfs/objects.c    | 7 +++++--
 include/linux/netfs.h | 2 ++
 2 files changed, 7 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/fs/netfs/objects.c b/fs/netfs/objects.c
index 85f428fc52e6..c4229c5f3f54 100644
--- a/fs/netfs/objects.c
+++ b/fs/netfs/objects.c
@@ -22,7 +22,8 @@ struct netfs_io_request *netfs_alloc_request(struct address_space *mapping,
 	struct netfs_io_request *rreq;
 	int ret;
 
-	rreq = kzalloc(sizeof(struct netfs_io_request), GFP_KERNEL);
+	rreq = kzalloc(ctx->ops->io_request_size ?: sizeof(struct netfs_io_request),
+		       GFP_KERNEL);
 	if (!rreq)
 		return ERR_PTR(-ENOMEM);
 
@@ -114,7 +115,9 @@ struct netfs_io_subrequest *netfs_alloc_subrequest(struct netfs_io_request *rreq
 {
 	struct netfs_io_subrequest *subreq;
 
-	subreq = kzalloc(sizeof(struct netfs_io_subrequest), GFP_KERNEL);
+	subreq = kzalloc(rreq->netfs_ops->io_subrequest_size ?:
+			 sizeof(struct netfs_io_subrequest),
+			 GFP_KERNEL);
 	if (subreq) {
 		INIT_LIST_HEAD(&subreq->rreq_link);
 		refcount_set(&subreq->ref, 2);
diff --git a/include/linux/netfs.h b/include/linux/netfs.h
index 7244ddebd974..d6f27000eeb0 100644
--- a/include/linux/netfs.h
+++ b/include/linux/netfs.h
@@ -210,6 +210,8 @@ struct netfs_io_request {
  * Operations the network filesystem can/must provide to the helpers.
  */
 struct netfs_request_ops {
+	unsigned int	io_request_size;	/* Alloc size for netfs_io_request struct */
+	unsigned int	io_subrequest_size;	/* Alloc size for netfs_io_subrequest struct */
 	int (*init_request)(struct netfs_io_request *rreq, struct file *file);
 	void (*free_request)(struct netfs_io_request *rreq);
 
-- 
cgit v1.2.3


From 5f5ce7ba15e7e6a6539ac8e1f845757aaebecf0d Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Fri, 25 Feb 2022 11:19:14 +0000
Subject: netfs: Add a ->free_subrequest() op

Add a ->free_subrequest() op so that the netfs can clean up data attached
to a subrequest.

Signed-off-by: David Howells <dhowells@redhat.com>
Reviewed-by: Jeff Layton <jlayton@kernel.org>
cc: linux-cachefs@redhat.com
cc: linux-fsdevel@vger.kernel.org
cc: linux-mm@kvack.org
---
 fs/netfs/objects.c    | 2 ++
 include/linux/netfs.h | 1 +
 2 files changed, 3 insertions(+)

(limited to 'include/linux')

diff --git a/fs/netfs/objects.c b/fs/netfs/objects.c
index c4229c5f3f54..1bd20bdad983 100644
--- a/fs/netfs/objects.c
+++ b/fs/netfs/objects.c
@@ -145,6 +145,8 @@ static void netfs_free_subrequest(struct netfs_io_subrequest *subreq,
 	struct netfs_io_request *rreq = subreq->rreq;
 
 	trace_netfs_sreq(subreq, netfs_sreq_trace_free);
+	if (rreq->netfs_ops->free_subrequest)
+		rreq->netfs_ops->free_subrequest(subreq);
 	kfree(subreq);
 	netfs_stat_d(&netfs_n_rh_sreq);
 	netfs_put_request(rreq, was_async, netfs_rreq_trace_put_subreq);
diff --git a/include/linux/netfs.h b/include/linux/netfs.h
index d6f27000eeb0..06f57d9d09f6 100644
--- a/include/linux/netfs.h
+++ b/include/linux/netfs.h
@@ -214,6 +214,7 @@ struct netfs_request_ops {
 	unsigned int	io_subrequest_size;	/* Alloc size for netfs_io_subrequest struct */
 	int (*init_request)(struct netfs_io_request *rreq, struct file *file);
 	void (*free_request)(struct netfs_io_request *rreq);
+	void (*free_subrequest)(struct netfs_io_subrequest *rreq);
 
 	void (*expand_readahead)(struct netfs_io_request *rreq);
 	bool (*clamp_length)(struct netfs_io_subrequest *subreq);
-- 
cgit v1.2.3


From c1ec4d7c2e13471558cfea302b7583856284f94c Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Fri, 20 Aug 2021 17:08:30 +0100
Subject: netfs: Provide invalidate_folio and release_folio calls

Provide default invalidate_folio and release_folio calls.  These will need
to interact with invalidation correctly at some point.  They will be needed
if netfslib is to make use of folio->private for its own purposes.

Signed-off-by: David Howells <dhowells@redhat.com>
Reviewed-by: Jeff Layton <jlayton@kernel.org>
cc: linux-cachefs@redhat.com
cc: linux-fsdevel@vger.kernel.org
cc: linux-mm@kvack.org
---
 fs/9p/vfs_addr.c      | 33 ++------------------------------
 fs/afs/file.c         | 53 ++++-----------------------------------------------
 fs/ceph/addr.c        | 24 ++---------------------
 fs/ceph/cache.h       | 10 ----------
 fs/netfs/misc.c       | 42 ++++++++++++++++++++++++++++++++++++++++
 include/linux/netfs.h |  6 ++++--
 6 files changed, 54 insertions(+), 114 deletions(-)

(limited to 'include/linux')

diff --git a/fs/9p/vfs_addr.c b/fs/9p/vfs_addr.c
index 131b83c31f85..055b672a247d 100644
--- a/fs/9p/vfs_addr.c
+++ b/fs/9p/vfs_addr.c
@@ -88,35 +88,6 @@ const struct netfs_request_ops v9fs_req_ops = {
 	.issue_read		= v9fs_issue_read,
 };
 
-/**
- * v9fs_release_folio - release the private state associated with a folio
- * @folio: The folio to be released
- * @gfp: The caller's allocation restrictions
- *
- * Returns true if the page can be released, false otherwise.
- */
-
-static bool v9fs_release_folio(struct folio *folio, gfp_t gfp)
-{
-	if (folio_test_private(folio))
-		return false;
-#ifdef CONFIG_9P_FSCACHE
-	if (folio_test_fscache(folio)) {
-		if (current_is_kswapd() || !(gfp & __GFP_FS))
-			return false;
-		folio_wait_fscache(folio);
-	}
-	fscache_note_page_release(v9fs_inode_cookie(V9FS_I(folio_inode(folio))));
-#endif
-	return true;
-}
-
-static void v9fs_invalidate_folio(struct folio *folio, size_t offset,
-				 size_t length)
-{
-	folio_wait_fscache(folio);
-}
-
 #ifdef CONFIG_9P_FSCACHE
 static void v9fs_write_to_cache_done(void *priv, ssize_t transferred_or_error,
 				     bool was_async)
@@ -324,8 +295,8 @@ const struct address_space_operations v9fs_addr_operations = {
 	.writepage	= v9fs_vfs_writepage,
 	.write_begin	= v9fs_write_begin,
 	.write_end	= v9fs_write_end,
-	.release_folio	= v9fs_release_folio,
-	.invalidate_folio = v9fs_invalidate_folio,
+	.release_folio	= netfs_release_folio,
+	.invalidate_folio = netfs_invalidate_folio,
 	.launder_folio	= v9fs_launder_folio,
 	.direct_IO	= v9fs_direct_IO,
 };
diff --git a/fs/afs/file.c b/fs/afs/file.c
index 0d783e5b2147..d152ba451f0e 100644
--- a/fs/afs/file.c
+++ b/fs/afs/file.c
@@ -20,9 +20,6 @@
 
 static int afs_file_mmap(struct file *file, struct vm_area_struct *vma);
 static int afs_symlink_read_folio(struct file *file, struct folio *folio);
-static void afs_invalidate_folio(struct folio *folio, size_t offset,
-			       size_t length);
-static bool afs_release_folio(struct folio *folio, gfp_t gfp_flags);
 
 static ssize_t afs_file_read_iter(struct kiocb *iocb, struct iov_iter *iter);
 static ssize_t afs_file_splice_read(struct file *in, loff_t *ppos,
@@ -57,8 +54,8 @@ const struct address_space_operations afs_file_aops = {
 	.readahead	= netfs_readahead,
 	.dirty_folio	= netfs_dirty_folio,
 	.launder_folio	= afs_launder_folio,
-	.release_folio	= afs_release_folio,
-	.invalidate_folio = afs_invalidate_folio,
+	.release_folio	= netfs_release_folio,
+	.invalidate_folio = netfs_invalidate_folio,
 	.write_begin	= afs_write_begin,
 	.write_end	= afs_write_end,
 	.writepages	= afs_writepages,
@@ -67,8 +64,8 @@ const struct address_space_operations afs_file_aops = {
 
 const struct address_space_operations afs_symlink_aops = {
 	.read_folio	= afs_symlink_read_folio,
-	.release_folio	= afs_release_folio,
-	.invalidate_folio = afs_invalidate_folio,
+	.release_folio	= netfs_release_folio,
+	.invalidate_folio = netfs_invalidate_folio,
 	.migrate_folio	= filemap_migrate_folio,
 };
 
@@ -386,48 +383,6 @@ const struct netfs_request_ops afs_req_ops = {
 	.issue_read		= afs_issue_read,
 };
 
-/*
- * invalidate part or all of a page
- * - release a page and clean up its private data if offset is 0 (indicating
- *   the entire page)
- */
-static void afs_invalidate_folio(struct folio *folio, size_t offset,
-			       size_t length)
-{
-	_enter("{%lu},%zu,%zu", folio->index, offset, length);
-
-	folio_wait_fscache(folio);
-	_leave("");
-}
-
-/*
- * release a page and clean up its private state if it's not busy
- * - return true if the page can now be released, false if not
- */
-static bool afs_release_folio(struct folio *folio, gfp_t gfp)
-{
-	struct afs_vnode *vnode = AFS_FS_I(folio_inode(folio));
-
-	_enter("{{%llx:%llu}[%lu],%lx},%x",
-	       vnode->fid.vid, vnode->fid.vnode, folio_index(folio), folio->flags,
-	       gfp);
-
-	/* deny if folio is being written to the cache and the caller hasn't
-	 * elected to wait */
-#ifdef CONFIG_AFS_FSCACHE
-	if (folio_test_fscache(folio)) {
-		if (current_is_kswapd() || !(gfp & __GFP_FS))
-			return false;
-		folio_wait_fscache(folio);
-	}
-	fscache_note_page_release(afs_vnode_cache(vnode));
-#endif
-
-	/* Indicate that the folio can be released */
-	_leave(" = T");
-	return true;
-}
-
 static void afs_add_open_mmap(struct afs_vnode *vnode)
 {
 	if (atomic_inc_return(&vnode->cb_nr_mmap) == 1) {
diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c
index 3b8641febeac..8eedc62e7ac4 100644
--- a/fs/ceph/addr.c
+++ b/fs/ceph/addr.c
@@ -159,27 +159,7 @@ static void ceph_invalidate_folio(struct folio *folio, size_t offset,
 		ceph_put_snap_context(snapc);
 	}
 
-	folio_wait_fscache(folio);
-}
-
-static bool ceph_release_folio(struct folio *folio, gfp_t gfp)
-{
-	struct inode *inode = folio->mapping->host;
-	struct ceph_client *cl = ceph_inode_to_client(inode);
-
-	doutc(cl, "%llx.%llx idx %lu (%sdirty)\n", ceph_vinop(inode),
-	      folio->index, folio_test_dirty(folio) ? "" : "not ");
-
-	if (folio_test_private(folio))
-		return false;
-
-	if (folio_test_fscache(folio)) {
-		if (current_is_kswapd() || !(gfp & __GFP_FS))
-			return false;
-		folio_wait_fscache(folio);
-	}
-	ceph_fscache_note_page_release(inode);
-	return true;
+	netfs_invalidate_folio(folio, offset, length);
 }
 
 static void ceph_netfs_expand_readahead(struct netfs_io_request *rreq)
@@ -1585,7 +1565,7 @@ const struct address_space_operations ceph_aops = {
 	.write_end = ceph_write_end,
 	.dirty_folio = ceph_dirty_folio,
 	.invalidate_folio = ceph_invalidate_folio,
-	.release_folio = ceph_release_folio,
+	.release_folio = netfs_release_folio,
 	.direct_IO = noop_direct_IO,
 };
 
diff --git a/fs/ceph/cache.h b/fs/ceph/cache.h
index 8fc7d828d990..20efac020394 100644
--- a/fs/ceph/cache.h
+++ b/fs/ceph/cache.h
@@ -56,12 +56,6 @@ static inline bool ceph_is_cache_enabled(struct inode *inode)
 	return fscache_cookie_enabled(ceph_fscache_cookie(ceph_inode(inode)));
 }
 
-static inline void ceph_fscache_note_page_release(struct inode *inode)
-{
-	struct ceph_inode_info *ci = ceph_inode(inode);
-
-	fscache_note_page_release(ceph_fscache_cookie(ci));
-}
 #else /* CONFIG_CEPH_FSCACHE */
 static inline int ceph_fscache_register_fs(struct ceph_fs_client* fsc,
 					   struct fs_context *fc)
@@ -118,10 +112,6 @@ static inline bool ceph_is_cache_enabled(struct inode *inode)
 {
 	return false;
 }
-
-static inline void ceph_fscache_note_page_release(struct inode *inode)
-{
-}
 #endif /* CONFIG_CEPH_FSCACHE */
 
 #endif
diff --git a/fs/netfs/misc.c b/fs/netfs/misc.c
index 68baf55c47a4..45bb19ec9a63 100644
--- a/fs/netfs/misc.c
+++ b/fs/netfs/misc.c
@@ -84,3 +84,45 @@ void netfs_clear_inode_writeback(struct inode *inode, const void *aux)
 	}
 }
 EXPORT_SYMBOL(netfs_clear_inode_writeback);
+
+/**
+ * netfs_invalidate_folio - Invalidate or partially invalidate a folio
+ * @folio: Folio proposed for release
+ * @offset: Offset of the invalidated region
+ * @length: Length of the invalidated region
+ *
+ * Invalidate part or all of a folio for a network filesystem.  The folio will
+ * be removed afterwards if the invalidated region covers the entire folio.
+ */
+void netfs_invalidate_folio(struct folio *folio, size_t offset, size_t length)
+{
+	_enter("{%lx},%zx,%zx", folio_index(folio), offset, length);
+
+	folio_wait_fscache(folio);
+}
+EXPORT_SYMBOL(netfs_invalidate_folio);
+
+/**
+ * netfs_release_folio - Try to release a folio
+ * @folio: Folio proposed for release
+ * @gfp: Flags qualifying the release
+ *
+ * Request release of a folio and clean up its private state if it's not busy.
+ * Returns true if the folio can now be released, false if not
+ */
+bool netfs_release_folio(struct folio *folio, gfp_t gfp)
+{
+	struct netfs_inode *ctx = netfs_inode(folio_inode(folio));
+
+	if (folio_test_private(folio))
+		return false;
+	if (folio_test_fscache(folio)) {
+		if (current_is_kswapd() || !(gfp & __GFP_FS))
+			return false;
+		folio_wait_fscache(folio);
+	}
+
+	fscache_note_page_release(netfs_i_cookie(ctx));
+	return true;
+}
+EXPORT_SYMBOL(netfs_release_folio);
diff --git a/include/linux/netfs.h b/include/linux/netfs.h
index 06f57d9d09f6..8efbfd3b2820 100644
--- a/include/linux/netfs.h
+++ b/include/linux/netfs.h
@@ -293,11 +293,13 @@ struct readahead_control;
 void netfs_readahead(struct readahead_control *);
 int netfs_read_folio(struct file *, struct folio *);
 int netfs_write_begin(struct netfs_inode *, struct file *,
-		struct address_space *, loff_t pos, unsigned int len,
-		struct folio **, void **fsdata);
+		      struct address_space *, loff_t pos, unsigned int len,
+		      struct folio **, void **fsdata);
 bool netfs_dirty_folio(struct address_space *mapping, struct folio *folio);
 int netfs_unpin_writeback(struct inode *inode, struct writeback_control *wbc);
 void netfs_clear_inode_writeback(struct inode *inode, const void *aux);
+void netfs_invalidate_folio(struct folio *folio, size_t offset, size_t length);
+bool netfs_release_folio(struct folio *folio, gfp_t gfp);
 
 void netfs_subreq_terminated(struct netfs_io_subrequest *, ssize_t, bool);
 void netfs_get_subrequest(struct netfs_io_subrequest *subreq,
-- 
cgit v1.2.3


From 46ed60dcd4f2c94d27735743ce55cd8d6b93cc1d Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Wed, 11 Oct 2023 15:34:07 +0100
Subject: netfs: Implement unbuffered/DIO vs buffered I/O locking

Borrow NFS's direct-vs-buffered I/O locking into netfslib.  Similar code is
also used in ceph.

Modify it to have the correct checker annotations for i_rwsem lock
acquisition/release and to return -ERESTARTSYS if waits are interrupted.

Signed-off-by: David Howells <dhowells@redhat.com>
Reviewed-by: Jeff Layton <jlayton@kernel.org>
cc: linux-cachefs@redhat.com
cc: linux-fsdevel@vger.kernel.org
cc: linux-mm@kvack.org
---
 fs/netfs/Makefile     |   1 +
 fs/netfs/locking.c    | 216 ++++++++++++++++++++++++++++++++++++++++++++++++++
 include/linux/netfs.h |  10 +++
 3 files changed, 227 insertions(+)
 create mode 100644 fs/netfs/locking.c

(limited to 'include/linux')

diff --git a/fs/netfs/Makefile b/fs/netfs/Makefile
index a84fe9bbd3c4..cf3fc847b8ac 100644
--- a/fs/netfs/Makefile
+++ b/fs/netfs/Makefile
@@ -4,6 +4,7 @@ netfs-y := \
 	buffered_read.o \
 	io.o \
 	iterator.o \
+	locking.o \
 	main.o \
 	misc.o \
 	objects.o
diff --git a/fs/netfs/locking.c b/fs/netfs/locking.c
new file mode 100644
index 000000000000..75dc52a49b3a
--- /dev/null
+++ b/fs/netfs/locking.c
@@ -0,0 +1,216 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * I/O and data path helper functionality.
+ *
+ * Borrowed from NFS Copyright (c) 2016 Trond Myklebust
+ */
+
+#include <linux/kernel.h>
+#include <linux/netfs.h>
+#include "internal.h"
+
+/*
+ * inode_dio_wait_interruptible - wait for outstanding DIO requests to finish
+ * @inode: inode to wait for
+ *
+ * Waits for all pending direct I/O requests to finish so that we can
+ * proceed with a truncate or equivalent operation.
+ *
+ * Must be called under a lock that serializes taking new references
+ * to i_dio_count, usually by inode->i_mutex.
+ */
+static int inode_dio_wait_interruptible(struct inode *inode)
+{
+	if (!atomic_read(&inode->i_dio_count))
+		return 0;
+
+	wait_queue_head_t *wq = bit_waitqueue(&inode->i_state, __I_DIO_WAKEUP);
+	DEFINE_WAIT_BIT(q, &inode->i_state, __I_DIO_WAKEUP);
+
+	for (;;) {
+		prepare_to_wait(wq, &q.wq_entry, TASK_INTERRUPTIBLE);
+		if (!atomic_read(&inode->i_dio_count))
+			break;
+		if (signal_pending(current))
+			break;
+		schedule();
+	}
+	finish_wait(wq, &q.wq_entry);
+
+	return atomic_read(&inode->i_dio_count) ? -ERESTARTSYS : 0;
+}
+
+/* Call with exclusively locked inode->i_rwsem */
+static int netfs_block_o_direct(struct netfs_inode *ictx)
+{
+	if (!test_bit(NETFS_ICTX_ODIRECT, &ictx->flags))
+		return 0;
+	clear_bit(NETFS_ICTX_ODIRECT, &ictx->flags);
+	return inode_dio_wait_interruptible(&ictx->inode);
+}
+
+/**
+ * netfs_start_io_read - declare the file is being used for buffered reads
+ * @inode: file inode
+ *
+ * Declare that a buffered read operation is about to start, and ensure
+ * that we block all direct I/O.
+ * On exit, the function ensures that the NETFS_ICTX_ODIRECT flag is unset,
+ * and holds a shared lock on inode->i_rwsem to ensure that the flag
+ * cannot be changed.
+ * In practice, this means that buffered read operations are allowed to
+ * execute in parallel, thanks to the shared lock, whereas direct I/O
+ * operations need to wait to grab an exclusive lock in order to set
+ * NETFS_ICTX_ODIRECT.
+ * Note that buffered writes and truncates both take a write lock on
+ * inode->i_rwsem, meaning that those are serialised w.r.t. the reads.
+ */
+int netfs_start_io_read(struct inode *inode)
+	__acquires(inode->i_rwsem)
+{
+	struct netfs_inode *ictx = netfs_inode(inode);
+
+	/* Be an optimist! */
+	if (down_read_interruptible(&inode->i_rwsem) < 0)
+		return -ERESTARTSYS;
+	if (test_bit(NETFS_ICTX_ODIRECT, &ictx->flags) == 0)
+		return 0;
+	up_read(&inode->i_rwsem);
+
+	/* Slow path.... */
+	if (down_write_killable(&inode->i_rwsem) < 0)
+		return -ERESTARTSYS;
+	if (netfs_block_o_direct(ictx) < 0) {
+		up_write(&inode->i_rwsem);
+		return -ERESTARTSYS;
+	}
+	downgrade_write(&inode->i_rwsem);
+	return 0;
+}
+EXPORT_SYMBOL(netfs_start_io_read);
+
+/**
+ * netfs_end_io_read - declare that the buffered read operation is done
+ * @inode: file inode
+ *
+ * Declare that a buffered read operation is done, and release the shared
+ * lock on inode->i_rwsem.
+ */
+void netfs_end_io_read(struct inode *inode)
+	__releases(inode->i_rwsem)
+{
+	up_read(&inode->i_rwsem);
+}
+EXPORT_SYMBOL(netfs_end_io_read);
+
+/**
+ * netfs_start_io_write - declare the file is being used for buffered writes
+ * @inode: file inode
+ *
+ * Declare that a buffered read operation is about to start, and ensure
+ * that we block all direct I/O.
+ */
+int netfs_start_io_write(struct inode *inode)
+	__acquires(inode->i_rwsem)
+{
+	struct netfs_inode *ictx = netfs_inode(inode);
+
+	if (down_write_killable(&inode->i_rwsem) < 0)
+		return -ERESTARTSYS;
+	if (netfs_block_o_direct(ictx) < 0) {
+		up_write(&inode->i_rwsem);
+		return -ERESTARTSYS;
+	}
+	return 0;
+}
+EXPORT_SYMBOL(netfs_start_io_write);
+
+/**
+ * netfs_end_io_write - declare that the buffered write operation is done
+ * @inode: file inode
+ *
+ * Declare that a buffered write operation is done, and release the
+ * lock on inode->i_rwsem.
+ */
+void netfs_end_io_write(struct inode *inode)
+	__releases(inode->i_rwsem)
+{
+	up_write(&inode->i_rwsem);
+}
+EXPORT_SYMBOL(netfs_end_io_write);
+
+/* Call with exclusively locked inode->i_rwsem */
+static int netfs_block_buffered(struct inode *inode)
+{
+	struct netfs_inode *ictx = netfs_inode(inode);
+	int ret;
+
+	if (!test_bit(NETFS_ICTX_ODIRECT, &ictx->flags)) {
+		set_bit(NETFS_ICTX_ODIRECT, &ictx->flags);
+		if (inode->i_mapping->nrpages != 0) {
+			unmap_mapping_range(inode->i_mapping, 0, 0, 0);
+			ret = filemap_fdatawait(inode->i_mapping);
+			if (ret < 0) {
+				clear_bit(NETFS_ICTX_ODIRECT, &ictx->flags);
+				return ret;
+			}
+		}
+	}
+	return 0;
+}
+
+/**
+ * netfs_start_io_direct - declare the file is being used for direct i/o
+ * @inode: file inode
+ *
+ * Declare that a direct I/O operation is about to start, and ensure
+ * that we block all buffered I/O.
+ * On exit, the function ensures that the NETFS_ICTX_ODIRECT flag is set,
+ * and holds a shared lock on inode->i_rwsem to ensure that the flag
+ * cannot be changed.
+ * In practice, this means that direct I/O operations are allowed to
+ * execute in parallel, thanks to the shared lock, whereas buffered I/O
+ * operations need to wait to grab an exclusive lock in order to clear
+ * NETFS_ICTX_ODIRECT.
+ * Note that buffered writes and truncates both take a write lock on
+ * inode->i_rwsem, meaning that those are serialised w.r.t. O_DIRECT.
+ */
+int netfs_start_io_direct(struct inode *inode)
+	__acquires(inode->i_rwsem)
+{
+	struct netfs_inode *ictx = netfs_inode(inode);
+	int ret;
+
+	/* Be an optimist! */
+	if (down_read_interruptible(&inode->i_rwsem) < 0)
+		return -ERESTARTSYS;
+	if (test_bit(NETFS_ICTX_ODIRECT, &ictx->flags) != 0)
+		return 0;
+	up_read(&inode->i_rwsem);
+
+	/* Slow path.... */
+	if (down_write_killable(&inode->i_rwsem) < 0)
+		return -ERESTARTSYS;
+	ret = netfs_block_buffered(inode);
+	if (ret < 0) {
+		up_write(&inode->i_rwsem);
+		return ret;
+	}
+	downgrade_write(&inode->i_rwsem);
+	return 0;
+}
+EXPORT_SYMBOL(netfs_start_io_direct);
+
+/**
+ * netfs_end_io_direct - declare that the direct i/o operation is done
+ * @inode: file inode
+ *
+ * Declare that a direct I/O operation is done, and release the shared
+ * lock on inode->i_rwsem.
+ */
+void netfs_end_io_direct(struct inode *inode)
+	__releases(inode->i_rwsem)
+{
+	up_read(&inode->i_rwsem);
+}
+EXPORT_SYMBOL(netfs_end_io_direct);
diff --git a/include/linux/netfs.h b/include/linux/netfs.h
index 8efbfd3b2820..fc6d9756a029 100644
--- a/include/linux/netfs.h
+++ b/include/linux/netfs.h
@@ -129,6 +129,8 @@ struct netfs_inode {
 	struct fscache_cookie	*cache;
 #endif
 	loff_t			remote_i_size;	/* Size of the remote file */
+	unsigned long		flags;
+#define NETFS_ICTX_ODIRECT	0		/* The file has DIO in progress */
 };
 
 /*
@@ -310,6 +312,13 @@ ssize_t netfs_extract_user_iter(struct iov_iter *orig, size_t orig_len,
 				struct iov_iter *new,
 				iov_iter_extraction_t extraction_flags);
 
+int netfs_start_io_read(struct inode *inode);
+void netfs_end_io_read(struct inode *inode);
+int netfs_start_io_write(struct inode *inode);
+void netfs_end_io_write(struct inode *inode);
+int netfs_start_io_direct(struct inode *inode);
+void netfs_end_io_direct(struct inode *inode);
+
 /**
  * netfs_inode - Get the netfs inode context from the inode
  * @inode: The inode to query
@@ -335,6 +344,7 @@ static inline void netfs_inode_init(struct netfs_inode *ctx,
 {
 	ctx->ops = ops;
 	ctx->remote_i_size = i_size_read(&ctx->inode);
+	ctx->flags = 0;
 #if IS_ENABLED(CONFIG_FSCACHE)
 	ctx->cache = NULL;
 #endif
-- 
cgit v1.2.3


From 92b6cc5d1e7cbe569f00e9c1249ac8214fd5e2d2 Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Tue, 26 Sep 2023 17:42:26 +0100
Subject: netfs: Add iov_iters to (sub)requests to describe various buffers

Add three iov_iter structs:

 (1) Add an iov_iter (->iter) to the I/O request to describe the
     unencrypted-side buffer.

 (2) Add an iov_iter (->io_iter) to the I/O request to describe the
     encrypted-side I/O buffer.  This may be a different size to the buffer
     in (1).

 (3) Add an iov_iter (->io_iter) to the I/O subrequest to describe the part
     of the I/O buffer for that subrequest.

This will allow future patches to point to a bounce buffer instead for
purposes of handling oversize writes, decryption (where we want to save the
encrypted data to the cache) and decompression.

These iov_iters persist for the lifetime of the (sub)request, and so can be
accessed multiple times without worrying about them being deallocated upon
return to the caller.

The network filesystem must appropriately advance the iterator before
terminating the request.

Signed-off-by: David Howells <dhowells@redhat.com>
cc: Jeff Layton <jlayton@kernel.org>
cc: linux-cachefs@redhat.com
cc: linux-fsdevel@vger.kernel.org
cc: linux-mm@kvack.org
---
 fs/afs/file.c            |  6 +----
 fs/netfs/buffered_read.c | 13 +++++++++
 fs/netfs/io.c            | 69 +++++++++++++++++++++++++++++++++++-------------
 include/linux/netfs.h    |  3 +++
 4 files changed, 67 insertions(+), 24 deletions(-)

(limited to 'include/linux')

diff --git a/fs/afs/file.c b/fs/afs/file.c
index d152ba451f0e..3403bb792deb 100644
--- a/fs/afs/file.c
+++ b/fs/afs/file.c
@@ -323,11 +323,7 @@ static void afs_issue_read(struct netfs_io_subrequest *subreq)
 	fsreq->len	= subreq->len   - subreq->transferred;
 	fsreq->key	= key_get(subreq->rreq->netfs_priv);
 	fsreq->vnode	= vnode;
-	fsreq->iter	= &fsreq->def_iter;
-
-	iov_iter_xarray(&fsreq->def_iter, ITER_DEST,
-			&fsreq->vnode->netfs.inode.i_mapping->i_pages,
-			fsreq->pos, fsreq->len);
+	fsreq->iter	= &subreq->io_iter;
 
 	afs_fetch_data(fsreq->vnode, fsreq);
 	afs_put_read(fsreq);
diff --git a/fs/netfs/buffered_read.c b/fs/netfs/buffered_read.c
index d39d0ffe75d2..751556faa70b 100644
--- a/fs/netfs/buffered_read.c
+++ b/fs/netfs/buffered_read.c
@@ -199,6 +199,10 @@ void netfs_readahead(struct readahead_control *ractl)
 
 	netfs_rreq_expand(rreq, ractl);
 
+	/* Set up the output buffer */
+	iov_iter_xarray(&rreq->iter, ITER_DEST, &ractl->mapping->i_pages,
+			rreq->start, rreq->len);
+
 	/* Drop the refs on the folios here rather than in the cache or
 	 * filesystem.  The locks will be dropped in netfs_rreq_unlock().
 	 */
@@ -251,6 +255,11 @@ int netfs_read_folio(struct file *file, struct folio *folio)
 
 	netfs_stat(&netfs_n_rh_readpage);
 	trace_netfs_read(rreq, rreq->start, rreq->len, netfs_read_trace_readpage);
+
+	/* Set up the output buffer */
+	iov_iter_xarray(&rreq->iter, ITER_DEST, &mapping->i_pages,
+			rreq->start, rreq->len);
+
 	return netfs_begin_read(rreq, true);
 
 discard:
@@ -408,6 +417,10 @@ retry:
 	ractl._nr_pages = folio_nr_pages(folio);
 	netfs_rreq_expand(rreq, &ractl);
 
+	/* Set up the output buffer */
+	iov_iter_xarray(&rreq->iter, ITER_DEST, &mapping->i_pages,
+			rreq->start, rreq->len);
+
 	/* We hold the folio locks, so we can drop the references */
 	folio_get(folio);
 	while (readahead_folio(&ractl))
diff --git a/fs/netfs/io.c b/fs/netfs/io.c
index 7f753380e047..e9d408e211b8 100644
--- a/fs/netfs/io.c
+++ b/fs/netfs/io.c
@@ -21,12 +21,7 @@
  */
 static void netfs_clear_unread(struct netfs_io_subrequest *subreq)
 {
-	struct iov_iter iter;
-
-	iov_iter_xarray(&iter, ITER_DEST, &subreq->rreq->mapping->i_pages,
-			subreq->start + subreq->transferred,
-			subreq->len   - subreq->transferred);
-	iov_iter_zero(iov_iter_count(&iter), &iter);
+	iov_iter_zero(iov_iter_count(&subreq->io_iter), &subreq->io_iter);
 }
 
 static void netfs_cache_read_terminated(void *priv, ssize_t transferred_or_error,
@@ -46,14 +41,9 @@ static void netfs_read_from_cache(struct netfs_io_request *rreq,
 				  enum netfs_read_from_hole read_hole)
 {
 	struct netfs_cache_resources *cres = &rreq->cache_resources;
-	struct iov_iter iter;
 
 	netfs_stat(&netfs_n_rh_read);
-	iov_iter_xarray(&iter, ITER_DEST, &rreq->mapping->i_pages,
-			subreq->start + subreq->transferred,
-			subreq->len   - subreq->transferred);
-
-	cres->ops->read(cres, subreq->start, &iter, read_hole,
+	cres->ops->read(cres, subreq->start, &subreq->io_iter, read_hole,
 			netfs_cache_read_terminated, subreq);
 }
 
@@ -88,6 +78,11 @@ static void netfs_read_from_server(struct netfs_io_request *rreq,
 				   struct netfs_io_subrequest *subreq)
 {
 	netfs_stat(&netfs_n_rh_download);
+	if (iov_iter_count(&subreq->io_iter) != subreq->len - subreq->transferred)
+		pr_warn("R=%08x[%u] ITER PRE-MISMATCH %zx != %zx-%zx %lx\n",
+			rreq->debug_id, subreq->debug_index,
+			iov_iter_count(&subreq->io_iter), subreq->len,
+			subreq->transferred, subreq->flags);
 	rreq->netfs_ops->issue_read(subreq);
 }
 
@@ -259,6 +254,30 @@ static void netfs_rreq_short_read(struct netfs_io_request *rreq,
 		netfs_read_from_server(rreq, subreq);
 }
 
+/*
+ * Reset the subrequest iterator prior to resubmission.
+ */
+static void netfs_reset_subreq_iter(struct netfs_io_request *rreq,
+				    struct netfs_io_subrequest *subreq)
+{
+	size_t remaining = subreq->len - subreq->transferred;
+	size_t count = iov_iter_count(&subreq->io_iter);
+
+	if (count == remaining)
+		return;
+
+	_debug("R=%08x[%u] ITER RESUB-MISMATCH %zx != %zx-%zx-%llx %x\n",
+	       rreq->debug_id, subreq->debug_index,
+	       iov_iter_count(&subreq->io_iter), subreq->transferred,
+	       subreq->len, rreq->i_size,
+	       subreq->io_iter.iter_type);
+
+	if (count < remaining)
+		iov_iter_revert(&subreq->io_iter, remaining - count);
+	else
+		iov_iter_advance(&subreq->io_iter, count - remaining);
+}
+
 /*
  * Resubmit any short or failed operations.  Returns true if we got the rreq
  * ref back.
@@ -287,6 +306,7 @@ static bool netfs_rreq_perform_resubmissions(struct netfs_io_request *rreq)
 			trace_netfs_sreq(subreq, netfs_sreq_trace_download_instead);
 			netfs_get_subrequest(subreq, netfs_sreq_trace_get_resubmit);
 			atomic_inc(&rreq->nr_outstanding);
+			netfs_reset_subreq_iter(rreq, subreq);
 			netfs_read_from_server(rreq, subreq);
 		} else if (test_bit(NETFS_SREQ_SHORT_IO, &subreq->flags)) {
 			netfs_rreq_short_read(rreq, subreq);
@@ -399,9 +419,9 @@ void netfs_subreq_terminated(struct netfs_io_subrequest *subreq,
 	struct netfs_io_request *rreq = subreq->rreq;
 	int u;
 
-	_enter("[%u]{%llx,%lx},%zd",
-	       subreq->debug_index, subreq->start, subreq->flags,
-	       transferred_or_error);
+	_enter("R=%x[%x]{%llx,%lx},%zd",
+	       rreq->debug_id, subreq->debug_index,
+	       subreq->start, subreq->flags, transferred_or_error);
 
 	switch (subreq->source) {
 	case NETFS_READ_FROM_CACHE:
@@ -501,7 +521,8 @@ static enum netfs_io_source netfs_cache_prepare_read(struct netfs_io_subrequest
  */
 static enum netfs_io_source
 netfs_rreq_prepare_read(struct netfs_io_request *rreq,
-			struct netfs_io_subrequest *subreq)
+			struct netfs_io_subrequest *subreq,
+			struct iov_iter *io_iter)
 {
 	enum netfs_io_source source;
 
@@ -528,9 +549,14 @@ netfs_rreq_prepare_read(struct netfs_io_request *rreq,
 		}
 	}
 
-	if (WARN_ON(subreq->len == 0))
+	if (WARN_ON(subreq->len == 0)) {
 		source = NETFS_INVALID_READ;
+		goto out;
+	}
 
+	subreq->io_iter = *io_iter;
+	iov_iter_truncate(&subreq->io_iter, subreq->len);
+	iov_iter_advance(io_iter, subreq->len);
 out:
 	subreq->source = source;
 	trace_netfs_sreq(subreq, netfs_sreq_trace_prepare);
@@ -541,6 +567,7 @@ out:
  * Slice off a piece of a read request and submit an I/O request for it.
  */
 static bool netfs_rreq_submit_slice(struct netfs_io_request *rreq,
+				    struct iov_iter *io_iter,
 				    unsigned int *_debug_index)
 {
 	struct netfs_io_subrequest *subreq;
@@ -565,7 +592,7 @@ static bool netfs_rreq_submit_slice(struct netfs_io_request *rreq,
 	 * (the starts must coincide), in which case, we go around the loop
 	 * again and ask it to download the next piece.
 	 */
-	source = netfs_rreq_prepare_read(rreq, subreq);
+	source = netfs_rreq_prepare_read(rreq, subreq, io_iter);
 	if (source == NETFS_INVALID_READ)
 		goto subreq_failed;
 
@@ -603,6 +630,7 @@ subreq_failed:
  */
 int netfs_begin_read(struct netfs_io_request *rreq, bool sync)
 {
+	struct iov_iter io_iter;
 	unsigned int debug_index = 0;
 	int ret;
 
@@ -615,6 +643,8 @@ int netfs_begin_read(struct netfs_io_request *rreq, bool sync)
 		return -EIO;
 	}
 
+	rreq->io_iter = rreq->iter;
+
 	INIT_WORK(&rreq->work, netfs_rreq_work);
 
 	if (sync)
@@ -624,8 +654,9 @@ int netfs_begin_read(struct netfs_io_request *rreq, bool sync)
 	 * want and submit each one.
 	 */
 	atomic_set(&rreq->nr_outstanding, 1);
+	io_iter = rreq->io_iter;
 	do {
-		if (!netfs_rreq_submit_slice(rreq, &debug_index))
+		if (!netfs_rreq_submit_slice(rreq, &io_iter, &debug_index))
 			break;
 
 	} while (rreq->submitted < rreq->len);
diff --git a/include/linux/netfs.h b/include/linux/netfs.h
index fc6d9756a029..3da962e977f5 100644
--- a/include/linux/netfs.h
+++ b/include/linux/netfs.h
@@ -150,6 +150,7 @@ struct netfs_cache_resources {
 struct netfs_io_subrequest {
 	struct netfs_io_request *rreq;		/* Supervising I/O request */
 	struct list_head	rreq_link;	/* Link in rreq->subrequests */
+	struct iov_iter		io_iter;	/* Iterator for this subrequest */
 	loff_t			start;		/* Where to start the I/O */
 	size_t			len;		/* Size of the I/O */
 	size_t			transferred;	/* Amount of data transferred */
@@ -186,6 +187,8 @@ struct netfs_io_request {
 	struct netfs_cache_resources cache_resources;
 	struct list_head	proc_link;	/* Link in netfs_iorequests */
 	struct list_head	subrequests;	/* Contributory I/O operations */
+	struct iov_iter		iter;		/* Unencrypted-side iterator */
+	struct iov_iter		io_iter;	/* I/O (Encrypted-side) iterator */
 	void			*netfs_priv;	/* Private data for the netfs */
 	unsigned int		debug_id;
 	atomic_t		nr_outstanding;	/* Number of ops in progress */
-- 
cgit v1.2.3


From f1bb47a31dff6d4b34fb14e99850860ee74bb003 Mon Sep 17 00:00:00 2001
From: Alfred Piccioni <alpic@google.com>
Date: Tue, 19 Dec 2023 10:09:09 +0100
Subject: lsm: new security_file_ioctl_compat() hook

Some ioctl commands do not require ioctl permission, but are routed to
other permissions such as FILE_GETATTR or FILE_SETATTR. This routing is
done by comparing the ioctl cmd to a set of 64-bit flags (FS_IOC_*).

However, if a 32-bit process is running on a 64-bit kernel, it emits
32-bit flags (FS_IOC32_*) for certain ioctl operations. These flags are
being checked erroneously, which leads to these ioctl operations being
routed to the ioctl permission, rather than the correct file
permissions.

This was also noted in a RED-PEN finding from a while back -
"/* RED-PEN how should LSM module know it's handling 32bit? */".

This patch introduces a new hook, security_file_ioctl_compat(), that is
called from the compat ioctl syscall. All current LSMs have been changed
to support this hook.

Reviewing the three places where we are currently using
security_file_ioctl(), it appears that only SELinux needs a dedicated
compat change; TOMOYO and SMACK appear to be functional without any
change.

Cc: stable@vger.kernel.org
Fixes: 0b24dcb7f2f7 ("Revert "selinux: simplify ioctl checking"")
Signed-off-by: Alfred Piccioni <alpic@google.com>
Reviewed-by: Stephen Smalley <stephen.smalley.work@gmail.com>
[PM: subject tweak, line length fixes, and alignment corrections]
Signed-off-by: Paul Moore <paul@paul-moore.com>
---
 fs/ioctl.c                    |  3 +--
 include/linux/lsm_hook_defs.h |  2 ++
 include/linux/security.h      |  9 +++++++++
 security/security.c           | 18 ++++++++++++++++++
 security/selinux/hooks.c      | 28 ++++++++++++++++++++++++++++
 security/smack/smack_lsm.c    |  1 +
 security/tomoyo/tomoyo.c      |  1 +
 7 files changed, 60 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/fs/ioctl.c b/fs/ioctl.c
index f5fd99d6b0d4..76cf22ac97d7 100644
--- a/fs/ioctl.c
+++ b/fs/ioctl.c
@@ -920,8 +920,7 @@ COMPAT_SYSCALL_DEFINE3(ioctl, unsigned int, fd, unsigned int, cmd,
 	if (!f.file)
 		return -EBADF;
 
-	/* RED-PEN how should LSM module know it's handling 32bit? */
-	error = security_file_ioctl(f.file, cmd, arg);
+	error = security_file_ioctl_compat(f.file, cmd, arg);
 	if (error)
 		goto out;
 
diff --git a/include/linux/lsm_hook_defs.h b/include/linux/lsm_hook_defs.h
index c925a0d26edf..185924c56378 100644
--- a/include/linux/lsm_hook_defs.h
+++ b/include/linux/lsm_hook_defs.h
@@ -171,6 +171,8 @@ LSM_HOOK(int, 0, file_alloc_security, struct file *file)
 LSM_HOOK(void, LSM_RET_VOID, file_free_security, struct file *file)
 LSM_HOOK(int, 0, file_ioctl, struct file *file, unsigned int cmd,
 	 unsigned long arg)
+LSM_HOOK(int, 0, file_ioctl_compat, struct file *file, unsigned int cmd,
+	 unsigned long arg)
 LSM_HOOK(int, 0, mmap_addr, unsigned long addr)
 LSM_HOOK(int, 0, mmap_file, struct file *file, unsigned long reqprot,
 	 unsigned long prot, unsigned long flags)
diff --git a/include/linux/security.h b/include/linux/security.h
index 750130a7b9dd..d0eb20f90b26 100644
--- a/include/linux/security.h
+++ b/include/linux/security.h
@@ -394,6 +394,8 @@ int security_file_permission(struct file *file, int mask);
 int security_file_alloc(struct file *file);
 void security_file_free(struct file *file);
 int security_file_ioctl(struct file *file, unsigned int cmd, unsigned long arg);
+int security_file_ioctl_compat(struct file *file, unsigned int cmd,
+			       unsigned long arg);
 int security_mmap_file(struct file *file, unsigned long prot,
 			unsigned long flags);
 int security_mmap_addr(unsigned long addr);
@@ -1002,6 +1004,13 @@ static inline int security_file_ioctl(struct file *file, unsigned int cmd,
 	return 0;
 }
 
+static inline int security_file_ioctl_compat(struct file *file,
+					     unsigned int cmd,
+					     unsigned long arg)
+{
+	return 0;
+}
+
 static inline int security_mmap_file(struct file *file, unsigned long prot,
 				     unsigned long flags)
 {
diff --git a/security/security.c b/security/security.c
index d7b15ea67c3f..69148dfc90cd 100644
--- a/security/security.c
+++ b/security/security.c
@@ -2732,6 +2732,24 @@ int security_file_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
 }
 EXPORT_SYMBOL_GPL(security_file_ioctl);
 
+/**
+ * security_file_ioctl_compat() - Check if an ioctl is allowed in compat mode
+ * @file: associated file
+ * @cmd: ioctl cmd
+ * @arg: ioctl arguments
+ *
+ * Compat version of security_file_ioctl() that correctly handles 32-bit
+ * processes running on 64-bit kernels.
+ *
+ * Return: Returns 0 if permission is granted.
+ */
+int security_file_ioctl_compat(struct file *file, unsigned int cmd,
+			       unsigned long arg)
+{
+	return call_int_hook(file_ioctl_compat, 0, file, cmd, arg);
+}
+EXPORT_SYMBOL_GPL(security_file_ioctl_compat);
+
 static inline unsigned long mmap_prot(struct file *file, unsigned long prot)
 {
 	/*
diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c
index b340425ccfae..179540441115 100644
--- a/security/selinux/hooks.c
+++ b/security/selinux/hooks.c
@@ -3732,6 +3732,33 @@ static int selinux_file_ioctl(struct file *file, unsigned int cmd,
 	return error;
 }
 
+static int selinux_file_ioctl_compat(struct file *file, unsigned int cmd,
+			      unsigned long arg)
+{
+	/*
+	 * If we are in a 64-bit kernel running 32-bit userspace, we need to
+	 * make sure we don't compare 32-bit flags to 64-bit flags.
+	 */
+	switch (cmd) {
+	case FS_IOC32_GETFLAGS:
+		cmd = FS_IOC_GETFLAGS;
+		break;
+	case FS_IOC32_SETFLAGS:
+		cmd = FS_IOC_SETFLAGS;
+		break;
+	case FS_IOC32_GETVERSION:
+		cmd = FS_IOC_GETVERSION;
+		break;
+	case FS_IOC32_SETVERSION:
+		cmd = FS_IOC_SETVERSION;
+		break;
+	default:
+		break;
+	}
+
+	return selinux_file_ioctl(file, cmd, arg);
+}
+
 static int default_noexec __ro_after_init;
 
 static int file_map_prot_check(struct file *file, unsigned long prot, int shared)
@@ -7122,6 +7149,7 @@ static struct security_hook_list selinux_hooks[] __ro_after_init = {
 	LSM_HOOK_INIT(file_permission, selinux_file_permission),
 	LSM_HOOK_INIT(file_alloc_security, selinux_file_alloc_security),
 	LSM_HOOK_INIT(file_ioctl, selinux_file_ioctl),
+	LSM_HOOK_INIT(file_ioctl_compat, selinux_file_ioctl_compat),
 	LSM_HOOK_INIT(mmap_file, selinux_mmap_file),
 	LSM_HOOK_INIT(mmap_addr, selinux_mmap_addr),
 	LSM_HOOK_INIT(file_mprotect, selinux_file_mprotect),
diff --git a/security/smack/smack_lsm.c b/security/smack/smack_lsm.c
index 53336d7daa93..c126f6a16de4 100644
--- a/security/smack/smack_lsm.c
+++ b/security/smack/smack_lsm.c
@@ -5051,6 +5051,7 @@ static struct security_hook_list smack_hooks[] __ro_after_init = {
 
 	LSM_HOOK_INIT(file_alloc_security, smack_file_alloc_security),
 	LSM_HOOK_INIT(file_ioctl, smack_file_ioctl),
+	LSM_HOOK_INIT(file_ioctl_compat, smack_file_ioctl),
 	LSM_HOOK_INIT(file_lock, smack_file_lock),
 	LSM_HOOK_INIT(file_fcntl, smack_file_fcntl),
 	LSM_HOOK_INIT(mmap_file, smack_mmap_file),
diff --git a/security/tomoyo/tomoyo.c b/security/tomoyo/tomoyo.c
index e10491f155a5..3c3af149bf1c 100644
--- a/security/tomoyo/tomoyo.c
+++ b/security/tomoyo/tomoyo.c
@@ -574,6 +574,7 @@ static struct security_hook_list tomoyo_hooks[] __ro_after_init = {
 	LSM_HOOK_INIT(path_rename, tomoyo_path_rename),
 	LSM_HOOK_INIT(inode_getattr, tomoyo_inode_getattr),
 	LSM_HOOK_INIT(file_ioctl, tomoyo_file_ioctl),
+	LSM_HOOK_INIT(file_ioctl_compat, tomoyo_file_ioctl),
 	LSM_HOOK_INIT(path_chmod, tomoyo_path_chmod),
 	LSM_HOOK_INIT(path_chown, tomoyo_path_chown),
 	LSM_HOOK_INIT(path_chroot, tomoyo_path_chroot),
-- 
cgit v1.2.3


From 8645e659e2d227f6ce8fcea1ac640c324fbbb3e6 Mon Sep 17 00:00:00 2001
From: Randy Dunlap <rdunlap@infradead.org>
Date: Fri, 22 Dec 2023 21:05:56 -0800
Subject: iio: linux/iio.h: fix Excess kernel-doc description warning

Remove the @of_xlate: lines to prevent the kernel-doc warning:

include/linux/iio/iio.h:534: warning: Excess struct member 'of_xlate' description in 'iio_info'

Signed-off-by: Randy Dunlap <rdunlap@infradead.org>
Cc: Jonathan Cameron <jic23@kernel.org>
Cc: Lars-Peter Clausen <lars@metafoo.de>
Cc: linux-iio@vger.kernel.org
Link: https://lore.kernel.org/r/20231223050556.13948-1-rdunlap@infradead.org
Signed-off-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
---
 include/linux/iio/iio.h | 6 ------
 1 file changed, 6 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/iio/iio.h b/include/linux/iio/iio.h
index d0ce3b71106a..c5b36d2c1e73 100644
--- a/include/linux/iio/iio.h
+++ b/include/linux/iio/iio.h
@@ -434,13 +434,7 @@ struct iio_trigger; /* forward declaration */
  * @update_scan_mode:	function to configure device and scan buffer when
  *			channels have changed
  * @debugfs_reg_access:	function to read or write register value of device
- * @of_xlate:		function pointer to obtain channel specifier index.
- *			When #iio-cells is greater than '0', the driver could
- *			provide a custom of_xlate function that reads the
- *			*args* and returns the appropriate index in registered
- *			IIO channels array.
  * @fwnode_xlate:	fwnode based function pointer to obtain channel specifier index.
- *			Functionally the same as @of_xlate.
  * @hwfifo_set_watermark: function pointer to set the current hardware
  *			fifo watermark level; see hwfifo_* entries in
  *			Documentation/ABI/testing/sysfs-bus-iio for details on
-- 
cgit v1.2.3


From 02d374f3418df577c850f0cd45c3da9245ead547 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Tue, 26 Dec 2023 08:15:24 +0000
Subject: block: renumber QUEUE_FLAG_HW_WC

For the QUEUE_FLAG_HW_WC to actually work, it needs to have a separate
number from QUEUE_FLAG_FUA, doh.

Fixes: 43c9835b144c ("block: don't allow enabling a cache on devices that don't support it")
Signed-off-by: Christoph Hellwig <hch@lst.de>
Link: https://lore.kernel.org/r/20231226081524.180289-1-hch@lst.de
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/blkdev.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 51fa7ffdee83..88e9dd4b71fb 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -538,7 +538,7 @@ struct request_queue {
 #define QUEUE_FLAG_ADD_RANDOM	10	/* Contributes to random pool */
 #define QUEUE_FLAG_SYNCHRONOUS	11	/* always completes in submit context */
 #define QUEUE_FLAG_SAME_FORCE	12	/* force complete on same CPU */
-#define QUEUE_FLAG_HW_WC	18	/* Write back caching supported */
+#define QUEUE_FLAG_HW_WC	13	/* Write back caching supported */
 #define QUEUE_FLAG_INIT_DONE	14	/* queue is initialized */
 #define QUEUE_FLAG_STABLE_WRITES 15	/* don't modify blks until WB is done */
 #define QUEUE_FLAG_POLL		16	/* IO polling enabled if set */
-- 
cgit v1.2.3


From 1c042f8d4bc342b7985b1de3d76836f1a1083b65 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Thu, 21 Dec 2023 08:05:38 +0100
Subject: block: reject invalid operation in submit_bio_noacct

submit_bio_noacct allows completely invalid operations, or operations
that are not supported in the bio path.  Extent the existing switch
statement to rejcect all invalid types.

Move the code point for REQ_OP_ZONE_APPEND so that it's not right in the
middle of the zone management operations and the switch statement can
follow the numerical order of the operations.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Link: https://lore.kernel.org/r/20231221070538.1112446-1-hch@lst.de
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/blk-core.c          | 26 +++++++++++++++++++++-----
 include/linux/blk_types.h |  8 ++++----
 2 files changed, 25 insertions(+), 9 deletions(-)

(limited to 'include/linux')

diff --git a/block/blk-core.c b/block/blk-core.c
index fdf25b8d6e78..9520ccab3050 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -764,6 +764,15 @@ void submit_bio_noacct(struct bio *bio)
 		bio_clear_polled(bio);
 
 	switch (bio_op(bio)) {
+	case REQ_OP_READ:
+	case REQ_OP_WRITE:
+		break;
+	case REQ_OP_FLUSH:
+		/*
+		 * REQ_OP_FLUSH can't be submitted through bios, it is only
+		 * synthetized in struct request by the flush state machine.
+		 */
+		goto not_supported;
 	case REQ_OP_DISCARD:
 		if (!bdev_max_discard_sectors(bdev))
 			goto not_supported;
@@ -777,6 +786,10 @@ void submit_bio_noacct(struct bio *bio)
 		if (status != BLK_STS_OK)
 			goto end_io;
 		break;
+	case REQ_OP_WRITE_ZEROES:
+		if (!q->limits.max_write_zeroes_sectors)
+			goto not_supported;
+		break;
 	case REQ_OP_ZONE_RESET:
 	case REQ_OP_ZONE_OPEN:
 	case REQ_OP_ZONE_CLOSE:
@@ -788,12 +801,15 @@ void submit_bio_noacct(struct bio *bio)
 		if (!bdev_is_zoned(bio->bi_bdev) || !blk_queue_zone_resetall(q))
 			goto not_supported;
 		break;
-	case REQ_OP_WRITE_ZEROES:
-		if (!q->limits.max_write_zeroes_sectors)
-			goto not_supported;
-		break;
+	case REQ_OP_DRV_IN:
+	case REQ_OP_DRV_OUT:
+		/*
+		 * Driver private operations are only used with passthrough
+		 * requests.
+		 */
+		fallthrough;
 	default:
-		break;
+		goto not_supported;
 	}
 
 	if (blk_throtl_bio(bio))
diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
index d5c5e59ddbd2..68c9eb2374a4 100644
--- a/include/linux/blk_types.h
+++ b/include/linux/blk_types.h
@@ -378,6 +378,8 @@ enum req_op {
 	REQ_OP_DISCARD		= (__force blk_opf_t)3,
 	/* securely erase sectors */
 	REQ_OP_SECURE_ERASE	= (__force blk_opf_t)5,
+	/* write data at the current zone write pointer */
+	REQ_OP_ZONE_APPEND	= (__force blk_opf_t)7,
 	/* write the zero filled sector many times */
 	REQ_OP_WRITE_ZEROES	= (__force blk_opf_t)9,
 	/* Open a zone */
@@ -386,12 +388,10 @@ enum req_op {
 	REQ_OP_ZONE_CLOSE	= (__force blk_opf_t)11,
 	/* Transition a zone to full */
 	REQ_OP_ZONE_FINISH	= (__force blk_opf_t)12,
-	/* write data at the current zone write pointer */
-	REQ_OP_ZONE_APPEND	= (__force blk_opf_t)13,
 	/* reset a zone write pointer */
-	REQ_OP_ZONE_RESET	= (__force blk_opf_t)15,
+	REQ_OP_ZONE_RESET	= (__force blk_opf_t)13,
 	/* reset all the zone present on the device */
-	REQ_OP_ZONE_RESET_ALL	= (__force blk_opf_t)17,
+	REQ_OP_ZONE_RESET_ALL	= (__force blk_opf_t)15,
 
 	/* Driver private requests */
 	REQ_OP_DRV_IN		= (__force blk_opf_t)34,
-- 
cgit v1.2.3


From b3bf76024f645369e1fc45e0b08a2bd24f200d9b Mon Sep 17 00:00:00 2001
From: Wen Gu <guwen@linux.alibaba.com>
Date: Tue, 19 Dec 2023 22:26:16 +0800
Subject: net/smc: manage system EID in SMC stack instead of ISM driver

The System EID (SEID) is an internal EID that is used by the SMCv2
software stack that has a predefined and constant value representing
the s390 physical machine that the OS is executing on. So it should
be managed by SMC stack instead of ISM driver and be consistent for
all ISMv2 device (including virtual ISM devices) on s390 architecture.

Suggested-by: Alexandra Winter <wintera@linux.ibm.com>
Signed-off-by: Wen Gu <guwen@linux.alibaba.com>
Reviewed-and-tested-by: Wenjia Zhang <wenjia@linux.ibm.com>
Reviewed-by: Alexandra Winter <wintera@linux.ibm.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/s390/net/ism.h     |  7 -------
 drivers/s390/net/ism_drv.c | 38 ++++++--------------------------------
 include/linux/ism.h        |  1 -
 include/net/smc.h          |  1 -
 net/smc/smc_ism.c          | 33 ++++++++++++++++++++++++---------
 net/smc/smc_ism.h          |  7 +++++++
 6 files changed, 37 insertions(+), 50 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/s390/net/ism.h b/drivers/s390/net/ism.h
index 70c5bbda0fea..047fa6101555 100644
--- a/drivers/s390/net/ism.h
+++ b/drivers/s390/net/ism.h
@@ -16,7 +16,6 @@
  */
 #define ISM_DMB_WORD_OFFSET	1
 #define ISM_DMB_BIT_OFFSET	(ISM_DMB_WORD_OFFSET * 32)
-#define ISM_IDENT_MASK		0x00FFFF
 
 #define ISM_REG_SBA	0x1
 #define ISM_REG_IEQ	0x2
@@ -192,12 +191,6 @@ struct ism_sba {
 #define ISM_CREATE_REQ(dmb, idx, sf, offset)		\
 	((dmb) | (idx) << 24 | (sf) << 23 | (offset))
 
-struct ism_systemeid {
-	u8	seid_string[24];
-	u8	serial_number[4];
-	u8	type[4];
-};
-
 static inline void __ism_read_cmd(struct ism_dev *ism, void *data,
 				  unsigned long offset, unsigned long len)
 {
diff --git a/drivers/s390/net/ism_drv.c b/drivers/s390/net/ism_drv.c
index 34dd06324e38..2c8e964425dc 100644
--- a/drivers/s390/net/ism_drv.c
+++ b/drivers/s390/net/ism_drv.c
@@ -36,6 +36,7 @@ static struct ism_client *clients[MAX_CLIENTS];	/* use an array rather than */
 						/* a list for fast mapping  */
 static u8 max_client;
 static DEFINE_MUTEX(clients_lock);
+static bool ism_v2_capable;
 struct ism_dev_list {
 	struct list_head list;
 	struct mutex mutex; /* protects ism device list */
@@ -443,32 +444,6 @@ int ism_move(struct ism_dev *ism, u64 dmb_tok, unsigned int idx, bool sf,
 }
 EXPORT_SYMBOL_GPL(ism_move);
 
-static struct ism_systemeid SYSTEM_EID = {
-	.seid_string = "IBM-SYSZ-ISMSEID00000000",
-	.serial_number = "0000",
-	.type = "0000",
-};
-
-static void ism_create_system_eid(void)
-{
-	struct cpuid id;
-	u16 ident_tail;
-	char tmp[5];
-
-	get_cpu_id(&id);
-	ident_tail = (u16)(id.ident & ISM_IDENT_MASK);
-	snprintf(tmp, 5, "%04X", ident_tail);
-	memcpy(&SYSTEM_EID.serial_number, tmp, 4);
-	snprintf(tmp, 5, "%04X", id.machine);
-	memcpy(&SYSTEM_EID.type, tmp, 4);
-}
-
-u8 *ism_get_seid(void)
-{
-	return SYSTEM_EID.seid_string;
-}
-EXPORT_SYMBOL_GPL(ism_get_seid);
-
 static void ism_handle_event(struct ism_dev *ism)
 {
 	struct ism_event *entry;
@@ -560,7 +535,9 @@ static int ism_dev_init(struct ism_dev *ism)
 
 	if (!ism_add_vlan_id(ism, ISM_RESERVED_VLANID))
 		/* hardware is V2 capable */
-		ism_create_system_eid();
+		ism_v2_capable = true;
+	else
+		ism_v2_capable = false;
 
 	mutex_lock(&ism_dev_list.mutex);
 	mutex_lock(&clients_lock);
@@ -665,8 +642,7 @@ static void ism_dev_exit(struct ism_dev *ism)
 	}
 	mutex_unlock(&clients_lock);
 
-	if (SYSTEM_EID.serial_number[0] != '0' ||
-	    SYSTEM_EID.type[0] != '0')
+	if (ism_v2_capable)
 		ism_del_vlan_id(ism, ISM_RESERVED_VLANID);
 	unregister_ieq(ism);
 	unregister_sba(ism);
@@ -813,8 +789,7 @@ static int smcd_move(struct smcd_dev *smcd, u64 dmb_tok, unsigned int idx,
 
 static int smcd_supports_v2(void)
 {
-	return SYSTEM_EID.serial_number[0] != '0' ||
-		SYSTEM_EID.type[0] != '0';
+	return ism_v2_capable;
 }
 
 static u64 ism_get_local_gid(struct ism_dev *ism)
@@ -860,7 +835,6 @@ static const struct smcd_ops ism_ops = {
 	.signal_event = smcd_signal_ieq,
 	.move_data = smcd_move,
 	.supports_v2 = smcd_supports_v2,
-	.get_system_eid = ism_get_seid,
 	.get_local_gid = smcd_get_local_gid,
 	.get_chid = smcd_get_chid,
 	.get_dev = smcd_get_dev,
diff --git a/include/linux/ism.h b/include/linux/ism.h
index 9a4c204df3da..5428edd90982 100644
--- a/include/linux/ism.h
+++ b/include/linux/ism.h
@@ -86,7 +86,6 @@ int  ism_register_dmb(struct ism_dev *dev, struct ism_dmb *dmb,
 int  ism_unregister_dmb(struct ism_dev *dev, struct ism_dmb *dmb);
 int  ism_move(struct ism_dev *dev, u64 dmb_tok, unsigned int idx, bool sf,
 	      unsigned int offset, void *data, unsigned int size);
-u8  *ism_get_seid(void);
 
 const struct smcd_ops *ism_get_smcd_ops(void);
 
diff --git a/include/net/smc.h b/include/net/smc.h
index a0dc1187e96e..c9dcb30e3fd9 100644
--- a/include/net/smc.h
+++ b/include/net/smc.h
@@ -73,7 +73,6 @@ struct smcd_ops {
 			 bool sf, unsigned int offset, void *data,
 			 unsigned int size);
 	int (*supports_v2)(void);
-	u8* (*get_system_eid)(void);
 	void (*get_local_gid)(struct smcd_dev *dev, struct smcd_gid *gid);
 	u16 (*get_chid)(struct smcd_dev *dev);
 	struct device* (*get_dev)(struct smcd_dev *dev);
diff --git a/net/smc/smc_ism.c b/net/smc/smc_ism.c
index a33f861cf7c1..ac88de2a06a0 100644
--- a/net/smc/smc_ism.c
+++ b/net/smc/smc_ism.c
@@ -43,6 +43,27 @@ static struct ism_client smc_ism_client = {
 };
 #endif
 
+static void smc_ism_create_system_eid(void)
+{
+	struct smc_ism_seid *seid =
+		(struct smc_ism_seid *)smc_ism_v2_system_eid;
+#if IS_ENABLED(CONFIG_S390)
+	struct cpuid id;
+	u16 ident_tail;
+	char tmp[5];
+
+	memcpy(seid->seid_string, "IBM-SYSZ-ISMSEID00000000", 24);
+	get_cpu_id(&id);
+	ident_tail = (u16)(id.ident & SMC_ISM_IDENT_MASK);
+	snprintf(tmp, 5, "%04X", ident_tail);
+	memcpy(seid->serial_number, tmp, 4);
+	snprintf(tmp, 5, "%04X", id.machine);
+	memcpy(seid->type, tmp, 4);
+#else
+	memset(seid, 0, SMC_MAX_EID_LEN);
+#endif
+}
+
 /* Test if an ISM communication is possible - same CPC */
 int smc_ism_cantalk(struct smcd_gid *peer_gid, unsigned short vlan_id,
 		    struct smcd_dev *smcd)
@@ -431,14 +452,8 @@ static void smcd_register_dev(struct ism_dev *ism)
 
 	mutex_lock(&smcd_dev_list.mutex);
 	if (list_empty(&smcd_dev_list.list)) {
-		u8 *system_eid = NULL;
-
-		system_eid = smcd->ops->get_system_eid();
-		if (smcd->ops->supports_v2()) {
+		if (smcd->ops->supports_v2())
 			smc_ism_v2_capable = true;
-			memcpy(smc_ism_v2_system_eid, system_eid,
-			       SMC_MAX_EID_LEN);
-		}
 	}
 	/* sort list: devices without pnetid before devices with pnetid */
 	if (smcd->pnetid[0])
@@ -542,10 +557,10 @@ int smc_ism_init(void)
 {
 	int rc = 0;
 
-#if IS_ENABLED(CONFIG_ISM)
 	smc_ism_v2_capable = false;
-	memset(smc_ism_v2_system_eid, 0, SMC_MAX_EID_LEN);
+	smc_ism_create_system_eid();
 
+#if IS_ENABLED(CONFIG_ISM)
 	rc = ism_register_client(&smc_ism_client);
 #endif
 	return rc;
diff --git a/net/smc/smc_ism.h b/net/smc/smc_ism.h
index 0e5e563099ec..ffff40c30a06 100644
--- a/net/smc/smc_ism.h
+++ b/net/smc/smc_ism.h
@@ -16,6 +16,7 @@
 #include "smc.h"
 
 #define SMC_VIRTUAL_ISM_CHID_MASK	0xFF00
+#define SMC_ISM_IDENT_MASK		0x00FFFF
 
 struct smcd_dev_list {	/* List of SMCD devices */
 	struct list_head list;
@@ -30,6 +31,12 @@ struct smc_ism_vlanid {			/* VLAN id set on ISM device */
 	refcount_t refcnt;		/* Reference count */
 };
 
+struct smc_ism_seid {
+	u8 seid_string[24];
+	u8 serial_number[4];
+	u8 type[4];
+};
+
 struct smcd_dev;
 
 int smc_ism_cantalk(struct smcd_gid *peer_gid, unsigned short vlan_id,
-- 
cgit v1.2.3


From 0942155a48e4cfc2c83e514c86a3de8f78f6af02 Mon Sep 17 00:00:00 2001
From: Mathias Krause <minipli@grsecurity.net>
Date: Wed, 20 Dec 2023 14:35:05 +0100
Subject: PCI: Remove unused 'node' member from struct pci_driver

Remove the unused 'node' member. It got replaced by device_driver chaining
more than 20 years ago in commit 4b4a837f2b57 ("PCI: start to use common
fields of struct device_driver more...") of the history.git tree.

Link: https://lore.kernel.org/r/20231220133505.8798-1-minipli@grsecurity.net
Signed-off-by: Mathias Krause <minipli@grsecurity.net>
Signed-off-by: Bjorn Helgaas <bhelgaas@google.com>
Acked-by: Kalle Valo <kvalo@kernel.org>
---
 drivers/net/wireless/broadcom/brcm80211/brcmfmac/pcie.c | 1 -
 include/linux/pci.h                                     | 2 --
 2 files changed, 3 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/wireless/broadcom/brcm80211/brcmfmac/pcie.c b/drivers/net/wireless/broadcom/brcm80211/brcmfmac/pcie.c
index 80220685f5e4..d7fb88bb6ae1 100644
--- a/drivers/net/wireless/broadcom/brcm80211/brcmfmac/pcie.c
+++ b/drivers/net/wireless/broadcom/brcm80211/brcmfmac/pcie.c
@@ -2707,7 +2707,6 @@ MODULE_DEVICE_TABLE(pci, brcmf_pcie_devid_table);
 
 
 static struct pci_driver brcmf_pciedrvr = {
-	.node = {},
 	.name = KBUILD_MODNAME,
 	.id_table = brcmf_pcie_devid_table,
 	.probe = brcmf_pcie_probe,
diff --git a/include/linux/pci.h b/include/linux/pci.h
index 60ca768bc867..1a89dc66f89a 100644
--- a/include/linux/pci.h
+++ b/include/linux/pci.h
@@ -885,7 +885,6 @@ struct module;
 
 /**
  * struct pci_driver - PCI driver structure
- * @node:	List of driver structures.
  * @name:	Driver name.
  * @id_table:	Pointer to table of device IDs the driver is
  *		interested in.  Most drivers should export this
@@ -940,7 +939,6 @@ struct module;
  *		own I/O address space.
  */
 struct pci_driver {
-	struct list_head	node;
 	const char		*name;
 	const struct pci_device_id *id_table;	/* Must be non-NULL for probe to be called */
 	int  (*probe)(struct pci_dev *dev, const struct pci_device_id *id);	/* New device inserted */
-- 
cgit v1.2.3


From ba367479c7ad0b870461024cd5ae7a1ea6e1e3db Mon Sep 17 00:00:00 2001
From: Viresh Kumar <viresh.kumar@linaro.org>
Date: Tue, 19 Dec 2023 11:32:39 +0530
Subject: OPP: The level field is always of unsigned int type

By mistake, dev_pm_opp_find_level_floor() used the level parameter as
unsigned long instead of unsigned int. Fix it.

Signed-off-by: Viresh Kumar <viresh.kumar@linaro.org>
---
 drivers/opp/core.c     | 9 +++++++--
 include/linux/pm_opp.h | 4 ++--
 2 files changed, 9 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/opp/core.c b/drivers/opp/core.c
index c022d548067d..49b429984bdb 100644
--- a/drivers/opp/core.c
+++ b/drivers/opp/core.c
@@ -842,9 +842,14 @@ EXPORT_SYMBOL_GPL(dev_pm_opp_find_level_ceil);
  * use.
  */
 struct dev_pm_opp *dev_pm_opp_find_level_floor(struct device *dev,
-					       unsigned long *level)
+					       unsigned int *level)
 {
-	return _find_key_floor(dev, level, 0, true, _read_level, NULL);
+	unsigned long temp = *level;
+	struct dev_pm_opp *opp;
+
+	opp = _find_key_floor(dev, &temp, 0, true, _read_level, NULL);
+	*level = temp;
+	return opp;
 }
 EXPORT_SYMBOL_GPL(dev_pm_opp_find_level_floor);
 
diff --git a/include/linux/pm_opp.h b/include/linux/pm_opp.h
index 81dff7facdc9..74768c47d790 100644
--- a/include/linux/pm_opp.h
+++ b/include/linux/pm_opp.h
@@ -163,7 +163,7 @@ struct dev_pm_opp *dev_pm_opp_find_level_ceil(struct device *dev,
 					      unsigned int *level);
 
 struct dev_pm_opp *dev_pm_opp_find_level_floor(struct device *dev,
-					       unsigned long *level);
+					       unsigned int *level);
 
 struct dev_pm_opp *dev_pm_opp_find_bw_ceil(struct device *dev,
 					   unsigned int *bw, int index);
@@ -330,7 +330,7 @@ static inline struct dev_pm_opp *dev_pm_opp_find_level_ceil(struct device *dev,
 }
 
 static inline struct dev_pm_opp *dev_pm_opp_find_level_floor(struct device *dev,
-							     unsigned long *level)
+							     unsigned int *level)
 {
 	return ERR_PTR(-EOPNOTSUPP);
 }
-- 
cgit v1.2.3


From 90abde49ea85a8af9a56bbab8c419aefc77f919a Mon Sep 17 00:00:00 2001
From: "Radu Pirea (NXP OSS)" <radu-nicolae.pirea@oss.nxp.com>
Date: Tue, 19 Dec 2023 16:53:25 +0200
Subject: net: rename dsa_realloc_skb to skb_ensure_writable_head_tail

Rename dsa_realloc_skb to skb_ensure_writable_head_tail and move it to
skbuff.c to use it as helper.

Signed-off-by: Radu Pirea (NXP OSS) <radu-nicolae.pirea@oss.nxp.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/skbuff.h |  1 +
 net/core/skbuff.c      | 25 +++++++++++++++++++++++++
 net/dsa/user.c         | 29 +++--------------------------
 3 files changed, 29 insertions(+), 26 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 50e92c8471dc..a5ae952454c8 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -4007,6 +4007,7 @@ struct sk_buff *skb_segment_list(struct sk_buff *skb, netdev_features_t features
 				 unsigned int offset);
 struct sk_buff *skb_vlan_untag(struct sk_buff *skb);
 int skb_ensure_writable(struct sk_buff *skb, unsigned int write_len);
+int skb_ensure_writable_head_tail(struct sk_buff *skb, struct net_device *dev);
 int __skb_vlan_pop(struct sk_buff *skb, u16 *vlan_tci);
 int skb_vlan_pop(struct sk_buff *skb);
 int skb_vlan_push(struct sk_buff *skb, __be16 vlan_proto, u16 vlan_tci);
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index ce5687ddb768..12d22c0b8551 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -5995,6 +5995,31 @@ int skb_ensure_writable(struct sk_buff *skb, unsigned int write_len)
 }
 EXPORT_SYMBOL(skb_ensure_writable);
 
+int skb_ensure_writable_head_tail(struct sk_buff *skb, struct net_device *dev)
+{
+	int needed_headroom = dev->needed_headroom;
+	int needed_tailroom = dev->needed_tailroom;
+
+	/* For tail taggers, we need to pad short frames ourselves, to ensure
+	 * that the tail tag does not fail at its role of being at the end of
+	 * the packet, once the conduit interface pads the frame. Account for
+	 * that pad length here, and pad later.
+	 */
+	if (unlikely(needed_tailroom && skb->len < ETH_ZLEN))
+		needed_tailroom += ETH_ZLEN - skb->len;
+	/* skb_headroom() returns unsigned int... */
+	needed_headroom = max_t(int, needed_headroom - skb_headroom(skb), 0);
+	needed_tailroom = max_t(int, needed_tailroom - skb_tailroom(skb), 0);
+
+	if (likely(!needed_headroom && !needed_tailroom && !skb_cloned(skb)))
+		/* No reallocation needed, yay! */
+		return 0;
+
+	return pskb_expand_head(skb, needed_headroom, needed_tailroom,
+				GFP_ATOMIC);
+}
+EXPORT_SYMBOL(skb_ensure_writable_head_tail);
+
 /* remove VLAN header from packet and update csum accordingly.
  * expects a non skb_vlan_tag_present skb with a vlan tag payload
  */
diff --git a/net/dsa/user.c b/net/dsa/user.c
index d438884a4eb0..b738a466e2dc 100644
--- a/net/dsa/user.c
+++ b/net/dsa/user.c
@@ -920,30 +920,6 @@ netdev_tx_t dsa_enqueue_skb(struct sk_buff *skb, struct net_device *dev)
 }
 EXPORT_SYMBOL_GPL(dsa_enqueue_skb);
 
-static int dsa_realloc_skb(struct sk_buff *skb, struct net_device *dev)
-{
-	int needed_headroom = dev->needed_headroom;
-	int needed_tailroom = dev->needed_tailroom;
-
-	/* For tail taggers, we need to pad short frames ourselves, to ensure
-	 * that the tail tag does not fail at its role of being at the end of
-	 * the packet, once the conduit interface pads the frame. Account for
-	 * that pad length here, and pad later.
-	 */
-	if (unlikely(needed_tailroom && skb->len < ETH_ZLEN))
-		needed_tailroom += ETH_ZLEN - skb->len;
-	/* skb_headroom() returns unsigned int... */
-	needed_headroom = max_t(int, needed_headroom - skb_headroom(skb), 0);
-	needed_tailroom = max_t(int, needed_tailroom - skb_tailroom(skb), 0);
-
-	if (likely(!needed_headroom && !needed_tailroom && !skb_cloned(skb)))
-		/* No reallocation needed, yay! */
-		return 0;
-
-	return pskb_expand_head(skb, needed_headroom, needed_tailroom,
-				GFP_ATOMIC);
-}
-
 static netdev_tx_t dsa_user_xmit(struct sk_buff *skb, struct net_device *dev)
 {
 	struct dsa_user_priv *p = netdev_priv(dev);
@@ -956,13 +932,14 @@ static netdev_tx_t dsa_user_xmit(struct sk_buff *skb, struct net_device *dev)
 	/* Handle tx timestamp if any */
 	dsa_skb_tx_timestamp(p, skb);
 
-	if (dsa_realloc_skb(skb, dev)) {
+	if (skb_ensure_writable_head_tail(skb, dev)) {
 		dev_kfree_skb_any(skb);
 		return NETDEV_TX_OK;
 	}
 
 	/* needed_tailroom should still be 'warm' in the cache line from
-	 * dsa_realloc_skb(), which has also ensured that padding is safe.
+	 * skb_ensure_writable_head_tail(), which has also ensured that
+	 * padding is safe.
 	 */
 	if (dev->needed_tailroom)
 		eth_skb_pad(skb);
-- 
cgit v1.2.3


From 932562a6045ed613d45bd100db37114273c22077 Mon Sep 17 00:00:00 2001
From: Kent Overstreet <kent.overstreet@linux.dev>
Date: Fri, 15 Dec 2023 15:58:20 -0500
Subject: rseq: Split out rseq.h from sched.h

We're trying to get sched.h down to more or less just types only, not
code - rseq can live in its own header.

This helps us kill the dependency on preempt.h in sched.h.

Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
---
 arch/arm64/kernel/ptrace.c       |   1 +
 arch/powerpc/kernel/interrupt.c  |   1 +
 arch/s390/kernel/signal.c        |   1 +
 arch/x86/kernel/signal.c         |   1 +
 fs/exec.c                        |   1 +
 include/linux/resume_user_mode.h |   1 +
 include/linux/rseq.h             | 131 +++++++++++++++++++++++++++++++++++++++
 include/linux/sched.h            | 125 +------------------------------------
 kernel/fork.c                    |   1 +
 kernel/sched/core.c              |   1 +
 10 files changed, 140 insertions(+), 124 deletions(-)
 create mode 100644 include/linux/rseq.h

(limited to 'include/linux')

diff --git a/arch/arm64/kernel/ptrace.c b/arch/arm64/kernel/ptrace.c
index 20d7ef82de90..09bb7fc7d3c2 100644
--- a/arch/arm64/kernel/ptrace.c
+++ b/arch/arm64/kernel/ptrace.c
@@ -28,6 +28,7 @@
 #include <linux/hw_breakpoint.h>
 #include <linux/regset.h>
 #include <linux/elf.h>
+#include <linux/rseq.h>
 
 #include <asm/compat.h>
 #include <asm/cpufeature.h>
diff --git a/arch/powerpc/kernel/interrupt.c b/arch/powerpc/kernel/interrupt.c
index c4f6d3c69ba9..eca293794a1e 100644
--- a/arch/powerpc/kernel/interrupt.c
+++ b/arch/powerpc/kernel/interrupt.c
@@ -3,6 +3,7 @@
 #include <linux/context_tracking.h>
 #include <linux/err.h>
 #include <linux/compat.h>
+#include <linux/rseq.h>
 #include <linux/sched/debug.h> /* for show_regs */
 
 #include <asm/kup.h>
diff --git a/arch/s390/kernel/signal.c b/arch/s390/kernel/signal.c
index d63557d3868c..48e46809ee27 100644
--- a/arch/s390/kernel/signal.c
+++ b/arch/s390/kernel/signal.c
@@ -12,6 +12,7 @@
 
 #include <linux/sched.h>
 #include <linux/sched/task_stack.h>
+#include <linux/rseq.h>
 #include <linux/mm.h>
 #include <linux/smp.h>
 #include <linux/kernel.h>
diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c
index 65fe2094da59..31b6f5dddfc2 100644
--- a/arch/x86/kernel/signal.c
+++ b/arch/x86/kernel/signal.c
@@ -27,6 +27,7 @@
 #include <linux/context_tracking.h>
 #include <linux/entry-common.h>
 #include <linux/syscalls.h>
+#include <linux/rseq.h>
 
 #include <asm/processor.h>
 #include <asm/ucontext.h>
diff --git a/fs/exec.c b/fs/exec.c
index 4aa19b24f281..41773af7e3dc 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -66,6 +66,7 @@
 #include <linux/coredump.h>
 #include <linux/time_namespace.h>
 #include <linux/user_events.h>
+#include <linux/rseq.h>
 
 #include <linux/uaccess.h>
 #include <asm/mmu_context.h>
diff --git a/include/linux/resume_user_mode.h b/include/linux/resume_user_mode.h
index f8f3e958e9cf..e0135e0adae0 100644
--- a/include/linux/resume_user_mode.h
+++ b/include/linux/resume_user_mode.h
@@ -6,6 +6,7 @@
 #include <linux/sched.h>
 #include <linux/task_work.h>
 #include <linux/memcontrol.h>
+#include <linux/rseq.h>
 #include <linux/blk-cgroup.h>
 
 /**
diff --git a/include/linux/rseq.h b/include/linux/rseq.h
new file mode 100644
index 000000000000..bc8af3eb5598
--- /dev/null
+++ b/include/linux/rseq.h
@@ -0,0 +1,131 @@
+/* SPDX-License-Identifier: GPL-2.0+ WITH Linux-syscall-note */
+#ifndef _LINUX_RSEQ_H
+#define _LINUX_RSEQ_H
+
+#ifdef CONFIG_RSEQ
+
+#include <linux/preempt.h>
+#include <linux/sched.h>
+
+/*
+ * Map the event mask on the user-space ABI enum rseq_cs_flags
+ * for direct mask checks.
+ */
+enum rseq_event_mask_bits {
+	RSEQ_EVENT_PREEMPT_BIT	= RSEQ_CS_FLAG_NO_RESTART_ON_PREEMPT_BIT,
+	RSEQ_EVENT_SIGNAL_BIT	= RSEQ_CS_FLAG_NO_RESTART_ON_SIGNAL_BIT,
+	RSEQ_EVENT_MIGRATE_BIT	= RSEQ_CS_FLAG_NO_RESTART_ON_MIGRATE_BIT,
+};
+
+enum rseq_event_mask {
+	RSEQ_EVENT_PREEMPT	= (1U << RSEQ_EVENT_PREEMPT_BIT),
+	RSEQ_EVENT_SIGNAL	= (1U << RSEQ_EVENT_SIGNAL_BIT),
+	RSEQ_EVENT_MIGRATE	= (1U << RSEQ_EVENT_MIGRATE_BIT),
+};
+
+static inline void rseq_set_notify_resume(struct task_struct *t)
+{
+	if (t->rseq)
+		set_tsk_thread_flag(t, TIF_NOTIFY_RESUME);
+}
+
+void __rseq_handle_notify_resume(struct ksignal *sig, struct pt_regs *regs);
+
+static inline void rseq_handle_notify_resume(struct ksignal *ksig,
+					     struct pt_regs *regs)
+{
+	if (current->rseq)
+		__rseq_handle_notify_resume(ksig, regs);
+}
+
+static inline void rseq_signal_deliver(struct ksignal *ksig,
+				       struct pt_regs *regs)
+{
+	preempt_disable();
+	__set_bit(RSEQ_EVENT_SIGNAL_BIT, &current->rseq_event_mask);
+	preempt_enable();
+	rseq_handle_notify_resume(ksig, regs);
+}
+
+/* rseq_preempt() requires preemption to be disabled. */
+static inline void rseq_preempt(struct task_struct *t)
+{
+	__set_bit(RSEQ_EVENT_PREEMPT_BIT, &t->rseq_event_mask);
+	rseq_set_notify_resume(t);
+}
+
+/* rseq_migrate() requires preemption to be disabled. */
+static inline void rseq_migrate(struct task_struct *t)
+{
+	__set_bit(RSEQ_EVENT_MIGRATE_BIT, &t->rseq_event_mask);
+	rseq_set_notify_resume(t);
+}
+
+/*
+ * If parent process has a registered restartable sequences area, the
+ * child inherits. Unregister rseq for a clone with CLONE_VM set.
+ */
+static inline void rseq_fork(struct task_struct *t, unsigned long clone_flags)
+{
+	if (clone_flags & CLONE_VM) {
+		t->rseq = NULL;
+		t->rseq_len = 0;
+		t->rseq_sig = 0;
+		t->rseq_event_mask = 0;
+	} else {
+		t->rseq = current->rseq;
+		t->rseq_len = current->rseq_len;
+		t->rseq_sig = current->rseq_sig;
+		t->rseq_event_mask = current->rseq_event_mask;
+	}
+}
+
+static inline void rseq_execve(struct task_struct *t)
+{
+	t->rseq = NULL;
+	t->rseq_len = 0;
+	t->rseq_sig = 0;
+	t->rseq_event_mask = 0;
+}
+
+#else
+
+static inline void rseq_set_notify_resume(struct task_struct *t)
+{
+}
+static inline void rseq_handle_notify_resume(struct ksignal *ksig,
+					     struct pt_regs *regs)
+{
+}
+static inline void rseq_signal_deliver(struct ksignal *ksig,
+				       struct pt_regs *regs)
+{
+}
+static inline void rseq_preempt(struct task_struct *t)
+{
+}
+static inline void rseq_migrate(struct task_struct *t)
+{
+}
+static inline void rseq_fork(struct task_struct *t, unsigned long clone_flags)
+{
+}
+static inline void rseq_execve(struct task_struct *t)
+{
+}
+
+#endif
+
+#ifdef CONFIG_DEBUG_RSEQ
+
+void rseq_syscall(struct pt_regs *regs);
+
+#else
+
+static inline void rseq_syscall(struct pt_regs *regs)
+{
+}
+
+#endif
+
+#endif /* _LINUX_RSEQ_H */
diff --git a/include/linux/sched.h b/include/linux/sched.h
index dd002d193726..a588b94988bc 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -35,7 +35,7 @@
 #include <linux/task_io_accounting.h>
 #include <linux/posix-timers_types.h>
 #include <linux/restart_block.h>
-#include <linux/rseq.h>
+#include <uapi/linux/rseq.h>
 #include <linux/seqlock_types.h>
 #include <linux/kcsan.h>
 #include <linux/rv.h>
@@ -2181,129 +2181,6 @@ static inline bool owner_on_cpu(struct task_struct *owner)
 unsigned long sched_cpu_util(int cpu);
 #endif /* CONFIG_SMP */
 
-#ifdef CONFIG_RSEQ
-
-/*
- * Map the event mask on the user-space ABI enum rseq_cs_flags
- * for direct mask checks.
- */
-enum rseq_event_mask_bits {
-	RSEQ_EVENT_PREEMPT_BIT	= RSEQ_CS_FLAG_NO_RESTART_ON_PREEMPT_BIT,
-	RSEQ_EVENT_SIGNAL_BIT	= RSEQ_CS_FLAG_NO_RESTART_ON_SIGNAL_BIT,
-	RSEQ_EVENT_MIGRATE_BIT	= RSEQ_CS_FLAG_NO_RESTART_ON_MIGRATE_BIT,
-};
-
-enum rseq_event_mask {
-	RSEQ_EVENT_PREEMPT	= (1U << RSEQ_EVENT_PREEMPT_BIT),
-	RSEQ_EVENT_SIGNAL	= (1U << RSEQ_EVENT_SIGNAL_BIT),
-	RSEQ_EVENT_MIGRATE	= (1U << RSEQ_EVENT_MIGRATE_BIT),
-};
-
-static inline void rseq_set_notify_resume(struct task_struct *t)
-{
-	if (t->rseq)
-		set_tsk_thread_flag(t, TIF_NOTIFY_RESUME);
-}
-
-void __rseq_handle_notify_resume(struct ksignal *sig, struct pt_regs *regs);
-
-static inline void rseq_handle_notify_resume(struct ksignal *ksig,
-					     struct pt_regs *regs)
-{
-	if (current->rseq)
-		__rseq_handle_notify_resume(ksig, regs);
-}
-
-static inline void rseq_signal_deliver(struct ksignal *ksig,
-				       struct pt_regs *regs)
-{
-	preempt_disable();
-	__set_bit(RSEQ_EVENT_SIGNAL_BIT, &current->rseq_event_mask);
-	preempt_enable();
-	rseq_handle_notify_resume(ksig, regs);
-}
-
-/* rseq_preempt() requires preemption to be disabled. */
-static inline void rseq_preempt(struct task_struct *t)
-{
-	__set_bit(RSEQ_EVENT_PREEMPT_BIT, &t->rseq_event_mask);
-	rseq_set_notify_resume(t);
-}
-
-/* rseq_migrate() requires preemption to be disabled. */
-static inline void rseq_migrate(struct task_struct *t)
-{
-	__set_bit(RSEQ_EVENT_MIGRATE_BIT, &t->rseq_event_mask);
-	rseq_set_notify_resume(t);
-}
-
-/*
- * If parent process has a registered restartable sequences area, the
- * child inherits. Unregister rseq for a clone with CLONE_VM set.
- */
-static inline void rseq_fork(struct task_struct *t, unsigned long clone_flags)
-{
-	if (clone_flags & CLONE_VM) {
-		t->rseq = NULL;
-		t->rseq_len = 0;
-		t->rseq_sig = 0;
-		t->rseq_event_mask = 0;
-	} else {
-		t->rseq = current->rseq;
-		t->rseq_len = current->rseq_len;
-		t->rseq_sig = current->rseq_sig;
-		t->rseq_event_mask = current->rseq_event_mask;
-	}
-}
-
-static inline void rseq_execve(struct task_struct *t)
-{
-	t->rseq = NULL;
-	t->rseq_len = 0;
-	t->rseq_sig = 0;
-	t->rseq_event_mask = 0;
-}
-
-#else
-
-static inline void rseq_set_notify_resume(struct task_struct *t)
-{
-}
-static inline void rseq_handle_notify_resume(struct ksignal *ksig,
-					     struct pt_regs *regs)
-{
-}
-static inline void rseq_signal_deliver(struct ksignal *ksig,
-				       struct pt_regs *regs)
-{
-}
-static inline void rseq_preempt(struct task_struct *t)
-{
-}
-static inline void rseq_migrate(struct task_struct *t)
-{
-}
-static inline void rseq_fork(struct task_struct *t, unsigned long clone_flags)
-{
-}
-static inline void rseq_execve(struct task_struct *t)
-{
-}
-
-#endif
-
-#ifdef CONFIG_DEBUG_RSEQ
-
-void rseq_syscall(struct pt_regs *regs);
-
-#else
-
-static inline void rseq_syscall(struct pt_regs *regs)
-{
-}
-
-#endif
-
 #ifdef CONFIG_SCHED_CORE
 extern void sched_core_free(struct task_struct *tsk);
 extern void sched_core_fork(struct task_struct *p);
diff --git a/kernel/fork.c b/kernel/fork.c
index 319e61297bfb..53816393995b 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -100,6 +100,7 @@
 #include <linux/stackprotector.h>
 #include <linux/user_events.h>
 #include <linux/iommu.h>
+#include <linux/rseq.h>
 
 #include <asm/pgalloc.h>
 #include <linux/uaccess.h>
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index a708d225c28e..d04cf3c47899 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -57,6 +57,7 @@
 #include <linux/profile.h>
 #include <linux/psi.h>
 #include <linux/rcuwait_api.h>
+#include <linux/rseq.h>
 #include <linux/sched/wake_q.h>
 #include <linux/scs.h>
 #include <linux/slab.h>
-- 
cgit v1.2.3


From 2b010a69350f2c995f40585fb801904874c85dd1 Mon Sep 17 00:00:00 2001
From: Kent Overstreet <kent.overstreet@linux.dev>
Date: Fri, 15 Dec 2023 16:04:03 -0500
Subject: preempt.h: Kill dependency on list.h

We really only need types.h, list.h is big.

Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
---
 include/linux/preempt.h | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/preempt.h b/include/linux/preempt.h
index 9aa6358a1a16..7233e9cf1bab 100644
--- a/include/linux/preempt.h
+++ b/include/linux/preempt.h
@@ -9,7 +9,7 @@
 
 #include <linux/linkage.h>
 #include <linux/cleanup.h>
-#include <linux/list.h>
+#include <linux/types.h>
 
 /*
  * We put the hardirq and softirq counter into the preemption
@@ -360,7 +360,9 @@ void preempt_notifier_unregister(struct preempt_notifier *notifier);
 static inline void preempt_notifier_init(struct preempt_notifier *notifier,
 				     struct preempt_ops *ops)
 {
-	INIT_HLIST_NODE(&notifier->link);
+	/* INIT_HLIST_NODE() open coded, to avoid dependency on list.h */
+	notifier->link.next = NULL;
+	notifier->link.pprev = NULL;
 	notifier->ops = ops;
 }
 
-- 
cgit v1.2.3


From 1e2f2d31997a9496f99e2b43255d6a48b06fbcc2 Mon Sep 17 00:00:00 2001
From: Kent Overstreet <kent.overstreet@linux.dev>
Date: Fri, 15 Dec 2023 15:51:54 -0500
Subject: Kill sched.h dependency on rcupdate.h

by moving cond_resched_rcu() to rcupdate_wait.h, we can kill another big
sched.h dependency.

Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
---
 arch/powerpc/kvm/book3s_64_vio.c        |  1 +
 include/linux/rcupdate_wait.h           | 10 ++++++++++
 include/linux/sched.h                   | 15 +++++----------
 include/linux/sched/task.h              |  1 +
 kernel/bpf/hashtab.c                    |  1 +
 lib/test_rhashtable.c                   |  1 +
 mm/filemap.c                            |  1 +
 mm/khugepaged.c                         |  1 +
 mm/shmem.c                              |  1 +
 net/ipv4/fib_trie.c                     |  1 +
 net/netfilter/ipset/ip_set_bitmap_gen.h |  2 ++
 net/netfilter/ipset/ip_set_hash_gen.h   |  1 +
 net/netfilter/ipvs/ip_vs_conn.c         |  1 +
 net/netfilter/ipvs/ip_vs_est.c          |  1 +
 14 files changed, 28 insertions(+), 10 deletions(-)

(limited to 'include/linux')

diff --git a/arch/powerpc/kvm/book3s_64_vio.c b/arch/powerpc/kvm/book3s_64_vio.c
index 14c6d7e318da..b569ebaa590e 100644
--- a/arch/powerpc/kvm/book3s_64_vio.c
+++ b/arch/powerpc/kvm/book3s_64_vio.c
@@ -20,6 +20,7 @@
 #include <linux/iommu.h>
 #include <linux/file.h>
 #include <linux/mm.h>
+#include <linux/rcupdate_wait.h>
 
 #include <asm/kvm_ppc.h>
 #include <asm/kvm_book3s.h>
diff --git a/include/linux/rcupdate_wait.h b/include/linux/rcupdate_wait.h
index 5e0f74f2f8ca..d07f0848802e 100644
--- a/include/linux/rcupdate_wait.h
+++ b/include/linux/rcupdate_wait.h
@@ -8,6 +8,7 @@
 
 #include <linux/rcupdate.h>
 #include <linux/completion.h>
+#include <linux/sched.h>
 
 /*
  * Structure allowing asynchronous waiting on RCU.
@@ -55,4 +56,13 @@ do {									\
 #define synchronize_rcu_mult(...) \
 	_wait_rcu_gp(IS_ENABLED(CONFIG_TINY_RCU), __VA_ARGS__)
 
+static inline void cond_resched_rcu(void)
+{
+#if defined(CONFIG_DEBUG_ATOMIC_SLEEP) || !defined(CONFIG_PREEMPT_RCU)
+	rcu_read_unlock();
+	cond_resched();
+	rcu_read_lock();
+#endif
+}
+
 #endif /* _LINUX_SCHED_RCUPDATE_WAIT_H */
diff --git a/include/linux/sched.h b/include/linux/sched.h
index a588b94988bc..814bfdafbc1c 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -10,9 +10,14 @@
 #include <uapi/linux/sched.h>
 
 #include <asm/current.h>
+#include <asm/processor.h>
+#include <linux/thread_info.h>
+#include <linux/preempt.h>
+#include <linux/cpumask.h>
 
 #include <linux/cache.h>
 #include <linux/irqflags_types.h>
+#include <linux/smp_types.h>
 #include <linux/pid_types.h>
 #include <linux/sem_types.h>
 #include <linux/shm.h>
@@ -23,7 +28,6 @@
 #include <linux/timer_types.h>
 #include <linux/seccomp_types.h>
 #include <linux/nodemask_types.h>
-#include <linux/rcupdate.h>
 #include <linux/refcount_types.h>
 #include <linux/resource.h>
 #include <linux/latencytop.h>
@@ -2059,15 +2063,6 @@ extern int __cond_resched_rwlock_write(rwlock_t *lock);
 	__cond_resched_rwlock_write(lock);					\
 })
 
-static inline void cond_resched_rcu(void)
-{
-#if defined(CONFIG_DEBUG_ATOMIC_SLEEP) || !defined(CONFIG_PREEMPT_RCU)
-	rcu_read_unlock();
-	cond_resched();
-	rcu_read_lock();
-#endif
-}
-
 #ifdef CONFIG_PREEMPT_DYNAMIC
 
 extern bool preempt_model_none(void);
diff --git a/include/linux/sched/task.h b/include/linux/sched/task.h
index 1880ae21a9cb..538cdfbe895f 100644
--- a/include/linux/sched/task.h
+++ b/include/linux/sched/task.h
@@ -7,6 +7,7 @@
  * functionality:
  */
 
+#include <linux/rcupdate.h>
 #include <linux/refcount.h>
 #include <linux/sched.h>
 #include <linux/uaccess.h>
diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c
index fd8d4b0addfc..3ba98ed2b3f9 100644
--- a/kernel/bpf/hashtab.c
+++ b/kernel/bpf/hashtab.c
@@ -7,6 +7,7 @@
 #include <linux/jhash.h>
 #include <linux/filter.h>
 #include <linux/rculist_nulls.h>
+#include <linux/rcupdate_wait.h>
 #include <linux/random.h>
 #include <uapi/linux/btf.h>
 #include <linux/rcupdate_trace.h>
diff --git a/lib/test_rhashtable.c b/lib/test_rhashtable.c
index c20f6cb4bf55..42b585208249 100644
--- a/lib/test_rhashtable.c
+++ b/lib/test_rhashtable.c
@@ -16,6 +16,7 @@
 #include <linux/kthread.h>
 #include <linux/module.h>
 #include <linux/rcupdate.h>
+#include <linux/rcupdate_wait.h>
 #include <linux/rhashtable.h>
 #include <linux/slab.h>
 #include <linux/sched.h>
diff --git a/mm/filemap.c b/mm/filemap.c
index f1c8c278310f..1219ffc04a26 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -45,6 +45,7 @@
 #include <linux/migrate.h>
 #include <linux/pipe_fs_i.h>
 #include <linux/splice.h>
+#include <linux/rcupdate_wait.h>
 #include <asm/pgalloc.h>
 #include <asm/tlbflush.h>
 #include "internal.h"
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index 064654717843..47a20a4ece09 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -17,6 +17,7 @@
 #include <linux/userfaultfd_k.h>
 #include <linux/page_idle.h>
 #include <linux/page_table_check.h>
+#include <linux/rcupdate_wait.h>
 #include <linux/swapops.h>
 #include <linux/shmem_fs.h>
 #include <linux/ksm.h>
diff --git a/mm/shmem.c b/mm/shmem.c
index 91e2620148b2..98f6ca7bdae1 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -79,6 +79,7 @@ static struct vfsmount *shm_mnt __ro_after_init;
 #include <linux/rmap.h>
 #include <linux/uuid.h>
 #include <linux/quotaops.h>
+#include <linux/rcupdate_wait.h>
 
 #include <linux/uaccess.h>
 
diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c
index 9bdfdab906fe..3ff35f811765 100644
--- a/net/ipv4/fib_trie.c
+++ b/net/ipv4/fib_trie.c
@@ -52,6 +52,7 @@
 #include <linux/if_arp.h>
 #include <linux/proc_fs.h>
 #include <linux/rcupdate.h>
+#include <linux/rcupdate_wait.h>
 #include <linux/skbuff.h>
 #include <linux/netlink.h>
 #include <linux/init.h>
diff --git a/net/netfilter/ipset/ip_set_bitmap_gen.h b/net/netfilter/ipset/ip_set_bitmap_gen.h
index 26ab0e9612d8..21f7860e8fa1 100644
--- a/net/netfilter/ipset/ip_set_bitmap_gen.h
+++ b/net/netfilter/ipset/ip_set_bitmap_gen.h
@@ -4,6 +4,8 @@
 #ifndef __IP_SET_BITMAP_IP_GEN_H
 #define __IP_SET_BITMAP_IP_GEN_H
 
+#include <linux/rcupdate_wait.h>
+
 #define mtype_do_test		IPSET_TOKEN(MTYPE, _do_test)
 #define mtype_gc_test		IPSET_TOKEN(MTYPE, _gc_test)
 #define mtype_is_filled		IPSET_TOKEN(MTYPE, _is_filled)
diff --git a/net/netfilter/ipset/ip_set_hash_gen.h b/net/netfilter/ipset/ip_set_hash_gen.h
index 7c2399541771..cbf80da9a01c 100644
--- a/net/netfilter/ipset/ip_set_hash_gen.h
+++ b/net/netfilter/ipset/ip_set_hash_gen.h
@@ -5,6 +5,7 @@
 #define _IP_SET_HASH_GEN_H
 
 #include <linux/rcupdate.h>
+#include <linux/rcupdate_wait.h>
 #include <linux/jhash.h>
 #include <linux/types.h>
 #include <linux/netfilter/nfnetlink.h>
diff --git a/net/netfilter/ipvs/ip_vs_conn.c b/net/netfilter/ipvs/ip_vs_conn.c
index 9065da3cdd12..a743db073887 100644
--- a/net/netfilter/ipvs/ip_vs_conn.c
+++ b/net/netfilter/ipvs/ip_vs_conn.c
@@ -31,6 +31,7 @@
 #include <linux/seq_file.h>
 #include <linux/jhash.h>
 #include <linux/random.h>
+#include <linux/rcupdate_wait.h>
 
 #include <net/net_namespace.h>
 #include <net/ip_vs.h>
diff --git a/net/netfilter/ipvs/ip_vs_est.c b/net/netfilter/ipvs/ip_vs_est.c
index c5970ba416ae..f821ad2e19b3 100644
--- a/net/netfilter/ipvs/ip_vs_est.c
+++ b/net/netfilter/ipvs/ip_vs_est.c
@@ -21,6 +21,7 @@
 #include <linux/interrupt.h>
 #include <linux/sysctl.h>
 #include <linux/list.h>
+#include <linux/rcupdate_wait.h>
 
 #include <net/ip_vs.h>
 
-- 
cgit v1.2.3


From d6b9f4e6f7fb589d8024a31cc4883d15d0c8def4 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Wed, 27 Dec 2023 09:23:05 +0000
Subject: block: rename and document BLK_DEF_MAX_SECTORS

Give BLK_DEF_MAX_SECTORS a _CAP postfix and document what it is used for.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Link: https://lore.kernel.org/r/20231227092305.279567-5-hch@lst.de
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/blk-settings.c   | 2 +-
 block/blk-sysfs.c      | 2 +-
 drivers/scsi/sd.c      | 2 +-
 include/linux/blkdev.h | 9 ++++++++-
 4 files changed, 11 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/block/blk-settings.c b/block/blk-settings.c
index 33b3f767b81e..ba6e0e97118c 100644
--- a/block/blk-settings.c
+++ b/block/blk-settings.c
@@ -139,7 +139,7 @@ void blk_queue_max_hw_sectors(struct request_queue *q, unsigned int max_hw_secto
 	if (limits->max_user_sectors)
 		max_sectors = min(max_sectors, limits->max_user_sectors);
 	else
-		max_sectors = min(max_sectors, BLK_DEF_MAX_SECTORS);
+		max_sectors = min(max_sectors, BLK_DEF_MAX_SECTORS_CAP);
 
 	max_sectors = round_down(max_sectors,
 				 limits->logical_block_size >> SECTOR_SHIFT);
diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
index d5e669a401b0..40bab5975c56 100644
--- a/block/blk-sysfs.c
+++ b/block/blk-sysfs.c
@@ -241,7 +241,7 @@ queue_max_sectors_store(struct request_queue *q, const char *page, size_t count)
 	if (max_sectors_kb == 0) {
 		q->limits.max_user_sectors = 0;
 		max_sectors_kb = min(max_hw_sectors_kb,
-				     BLK_DEF_MAX_SECTORS >> 1);
+				     BLK_DEF_MAX_SECTORS_CAP >> 1);
 	} else {
 		if (max_sectors_kb > max_hw_sectors_kb ||
 		    max_sectors_kb < page_kb)
diff --git a/drivers/scsi/sd.c b/drivers/scsi/sd.c
index 8c8ac5cd1833..6bedd2d5298f 100644
--- a/drivers/scsi/sd.c
+++ b/drivers/scsi/sd.c
@@ -3500,7 +3500,7 @@ static int sd_revalidate_disk(struct gendisk *disk)
 	} else {
 		q->limits.io_opt = 0;
 		rw_max = min_not_zero(logical_to_sectors(sdp, dev_max),
-				      (sector_t)BLK_DEF_MAX_SECTORS);
+				      (sector_t)BLK_DEF_MAX_SECTORS_CAP);
 	}
 
 	/*
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index bc236e77d85e..94701a63ad8a 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -1056,7 +1056,14 @@ enum blk_default_limits {
 	BLK_SEG_BOUNDARY_MASK	= 0xFFFFFFFFUL,
 };
 
-#define BLK_DEF_MAX_SECTORS 2560u
+/*
+ * Default upper limit for the software max_sectors limit used for
+ * regular file system I/O.  This can be increased through sysfs.
+ *
+ * Not to be confused with the max_hw_sector limit that is entirely
+ * controlled by the driver, usually based on hardware limits.
+ */
+#define BLK_DEF_MAX_SECTORS_CAP	2560u
 
 static inline unsigned long queue_segment_boundary(const struct request_queue *q)
 {
-- 
cgit v1.2.3


From 21d706d5cf570917594b21edee81893bdce09ab8 Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Fri, 9 Jul 2021 08:41:17 +0100
Subject: netfs: Add support for DIO buffering

Add a bvec array pointer and an iterator to netfs_io_request for either
holding a copy of a DIO iterator or a list of all the bits of buffer
pointed to by a DIO iterator.

There are two problems:  Firstly, if an iovec-class iov_iter is passed to
->read_iter() or ->write_iter(), this cannot be passed directly to
kernel_sendmsg() or kernel_recvmsg() as that may cause locking recursion if
a fault is generated, so we need to keep track of the pages involved
separately.

Secondly, if the I/O is asynchronous, we must copy the iov_iter describing
the buffer before returning to the caller as it may be immediately
deallocated.

Signed-off-by: David Howells <dhowells@redhat.com>
Reviewed-by: Jeff Layton <jlayton@kernel.org>
cc: linux-cachefs@redhat.com
cc: linux-fsdevel@vger.kernel.org
cc: linux-mm@kvack.org
---
 fs/netfs/objects.c    | 10 ++++++++++
 include/linux/netfs.h |  3 +++
 2 files changed, 13 insertions(+)

(limited to 'include/linux')

diff --git a/fs/netfs/objects.c b/fs/netfs/objects.c
index 1bd20bdad983..4df5e5eeada6 100644
--- a/fs/netfs/objects.c
+++ b/fs/netfs/objects.c
@@ -76,6 +76,7 @@ static void netfs_free_request(struct work_struct *work)
 {
 	struct netfs_io_request *rreq =
 		container_of(work, struct netfs_io_request, work);
+	unsigned int i;
 
 	trace_netfs_rreq(rreq, netfs_rreq_trace_free);
 	netfs_proc_del_rreq(rreq);
@@ -84,6 +85,15 @@ static void netfs_free_request(struct work_struct *work)
 		rreq->netfs_ops->free_request(rreq);
 	if (rreq->cache_resources.ops)
 		rreq->cache_resources.ops->end_operation(&rreq->cache_resources);
+	if (rreq->direct_bv) {
+		for (i = 0; i < rreq->direct_bv_count; i++) {
+			if (rreq->direct_bv[i].bv_page) {
+				if (rreq->direct_bv_unpin)
+					unpin_user_page(rreq->direct_bv[i].bv_page);
+			}
+		}
+		kvfree(rreq->direct_bv);
+	}
 	kfree_rcu(rreq, rcu);
 	netfs_stat_d(&netfs_n_rh_rreq);
 }
diff --git a/include/linux/netfs.h b/include/linux/netfs.h
index 3da962e977f5..2bb1273b38f4 100644
--- a/include/linux/netfs.h
+++ b/include/linux/netfs.h
@@ -190,6 +190,8 @@ struct netfs_io_request {
 	struct iov_iter		iter;		/* Unencrypted-side iterator */
 	struct iov_iter		io_iter;	/* I/O (Encrypted-side) iterator */
 	void			*netfs_priv;	/* Private data for the netfs */
+	struct bio_vec		*direct_bv;	/* DIO buffer list (when handling iovec-iter) */
+	unsigned int		direct_bv_count; /* Number of elements in direct_bv[] */
 	unsigned int		debug_id;
 	atomic_t		nr_outstanding;	/* Number of ops in progress */
 	atomic_t		nr_copy_ops;	/* Number of copy-to-cache ops in progress */
@@ -197,6 +199,7 @@ struct netfs_io_request {
 	size_t			len;		/* Length of the request */
 	short			error;		/* 0 or error that occurred */
 	enum netfs_io_origin	origin;		/* Origin of the request */
+	bool			direct_bv_unpin; /* T if direct_bv[] must be unpinned */
 	loff_t			i_size;		/* Size of the file */
 	loff_t			start;		/* Start position */
 	pgoff_t			no_unlock_folio; /* Don't unlock this folio after read */
-- 
cgit v1.2.3


From 7d828a06634799aba0fa392913c7fe2953eb64a6 Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Fri, 22 Sep 2023 13:25:22 +0100
Subject: netfs: Provide tools to create a buffer in an xarray

Provide tools to create a buffer in an xarray, with a function to add new
folios with a mark.  This will be used to create bounce buffer and can be
used more easily to create a list of folios the span of which would require
more than a page's worth of bio_vec structs.

Signed-off-by: David Howells <dhowells@redhat.com>
Reviewed-by: Jeff Layton <jlayton@kernel.org>
cc: linux-cachefs@redhat.com
cc: linux-fsdevel@vger.kernel.org
cc: linux-mm@kvack.org
---
 fs/netfs/internal.h   | 13 +++++++++
 fs/netfs/misc.c       | 81 +++++++++++++++++++++++++++++++++++++++++++++++++++
 include/linux/netfs.h |  4 +++
 3 files changed, 98 insertions(+)

(limited to 'include/linux')

diff --git a/fs/netfs/internal.h b/fs/netfs/internal.h
index 4708fb15446b..b908c7e0a901 100644
--- a/fs/netfs/internal.h
+++ b/fs/netfs/internal.h
@@ -56,6 +56,19 @@ static inline void netfs_proc_add_rreq(struct netfs_io_request *rreq) {}
 static inline void netfs_proc_del_rreq(struct netfs_io_request *rreq) {}
 #endif
 
+/*
+ * misc.c
+ */
+#define NETFS_FLAG_PUT_MARK		BIT(0)
+#define NETFS_FLAG_PAGECACHE_MARK	BIT(1)
+int netfs_xa_store_and_mark(struct xarray *xa, unsigned long index,
+			    struct folio *folio, unsigned int flags,
+			    gfp_t gfp_mask);
+int netfs_add_folios_to_buffer(struct xarray *buffer,
+			       struct address_space *mapping,
+			       pgoff_t index, pgoff_t to, gfp_t gfp_mask);
+void netfs_clear_buffer(struct xarray *buffer);
+
 /*
  * objects.c
  */
diff --git a/fs/netfs/misc.c b/fs/netfs/misc.c
index 45bb19ec9a63..5d545073fe03 100644
--- a/fs/netfs/misc.c
+++ b/fs/netfs/misc.c
@@ -8,6 +8,87 @@
 #include <linux/swap.h>
 #include "internal.h"
 
+/*
+ * Attach a folio to the buffer and maybe set marks on it to say that we need
+ * to put the folio later and twiddle the pagecache flags.
+ */
+int netfs_xa_store_and_mark(struct xarray *xa, unsigned long index,
+			    struct folio *folio, unsigned int flags,
+			    gfp_t gfp_mask)
+{
+	XA_STATE_ORDER(xas, xa, index, folio_order(folio));
+
+retry:
+	xas_lock(&xas);
+	for (;;) {
+		xas_store(&xas, folio);
+		if (!xas_error(&xas))
+			break;
+		xas_unlock(&xas);
+		if (!xas_nomem(&xas, gfp_mask))
+			return xas_error(&xas);
+		goto retry;
+	}
+
+	if (flags & NETFS_FLAG_PUT_MARK)
+		xas_set_mark(&xas, NETFS_BUF_PUT_MARK);
+	if (flags & NETFS_FLAG_PAGECACHE_MARK)
+		xas_set_mark(&xas, NETFS_BUF_PAGECACHE_MARK);
+	xas_unlock(&xas);
+	return xas_error(&xas);
+}
+
+/*
+ * Create the specified range of folios in the buffer attached to the read
+ * request.  The folios are marked with NETFS_BUF_PUT_MARK so that we know that
+ * these need freeing later.
+ */
+int netfs_add_folios_to_buffer(struct xarray *buffer,
+			       struct address_space *mapping,
+			       pgoff_t index, pgoff_t to, gfp_t gfp_mask)
+{
+	struct folio *folio;
+	int ret;
+
+	if (to + 1 == index) /* Page range is inclusive */
+		return 0;
+
+	do {
+		/* TODO: Figure out what order folio can be allocated here */
+		folio = filemap_alloc_folio(readahead_gfp_mask(mapping), 0);
+		if (!folio)
+			return -ENOMEM;
+		folio->index = index;
+		ret = netfs_xa_store_and_mark(buffer, index, folio,
+					      NETFS_FLAG_PUT_MARK, gfp_mask);
+		if (ret < 0) {
+			folio_put(folio);
+			return ret;
+		}
+
+		index += folio_nr_pages(folio);
+	} while (index <= to && index != 0);
+
+	return 0;
+}
+
+/*
+ * Clear an xarray buffer, putting a ref on the folios that have
+ * NETFS_BUF_PUT_MARK set.
+ */
+void netfs_clear_buffer(struct xarray *buffer)
+{
+	struct folio *folio;
+	XA_STATE(xas, buffer, 0);
+
+	rcu_read_lock();
+	xas_for_each_marked(&xas, folio, ULONG_MAX, NETFS_BUF_PUT_MARK) {
+		folio_put(folio);
+	}
+	rcu_read_unlock();
+	xa_destroy(buffer);
+}
+
 /**
  * netfs_dirty_folio - Mark folio dirty and pin a cache object for writeback
  * @mapping: The mapping the folio belongs to.
diff --git a/include/linux/netfs.h b/include/linux/netfs.h
index 2bb1273b38f4..c05365e3f428 100644
--- a/include/linux/netfs.h
+++ b/include/linux/netfs.h
@@ -109,6 +109,10 @@ static inline int wait_on_page_fscache_killable(struct page *page)
 	return folio_wait_private_2_killable(page_folio(page));
 }
 
+/* Marks used on xarray-based buffers */
+#define NETFS_BUF_PUT_MARK	XA_MARK_0	/* - Page needs putting  */
+#define NETFS_BUF_PAGECACHE_MARK XA_MARK_1	/* - Page needs wb/dirty flag wrangling */
+
 enum netfs_io_source {
 	NETFS_FILL_WITH_ZEROES,
 	NETFS_DOWNLOAD_FROM_SERVER,
-- 
cgit v1.2.3


From cae932d3aee55035a54415dcea8e7ecf2ec469b5 Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Fri, 22 Sep 2023 14:49:47 +0100
Subject: netfs: Add func to calculate pagecount/size-limited span of an
 iterator

Add a function to work out how much of an ITER_BVEC or ITER_XARRAY iterator
we can use in a pagecount-limited and size-limited span.  This will be
used, for example, to limit the number of segments in a subrequest to the
maximum number of elements that an RDMA transfer can handle.

Signed-off-by: David Howells <dhowells@redhat.com>
Reviewed-by: Jeff Layton <jlayton@kernel.org>
cc: linux-cachefs@redhat.com
cc: linux-fsdevel@vger.kernel.org
cc: linux-mm@kvack.org
---
 fs/netfs/iterator.c   | 97 +++++++++++++++++++++++++++++++++++++++++++++++++++
 include/linux/netfs.h |  2 ++
 2 files changed, 99 insertions(+)

(limited to 'include/linux')

diff --git a/fs/netfs/iterator.c b/fs/netfs/iterator.c
index 2ff07ba655a0..b781bbbf1d8d 100644
--- a/fs/netfs/iterator.c
+++ b/fs/netfs/iterator.c
@@ -101,3 +101,100 @@ ssize_t netfs_extract_user_iter(struct iov_iter *orig, size_t orig_len,
 	return npages;
 }
 EXPORT_SYMBOL_GPL(netfs_extract_user_iter);
+
+/*
+ * Select the span of a bvec iterator we're going to use.  Limit it by both maximum
+ * size and maximum number of segments.  Returns the size of the span in bytes.
+ */
+static size_t netfs_limit_bvec(const struct iov_iter *iter, size_t start_offset,
+			       size_t max_size, size_t max_segs)
+{
+	const struct bio_vec *bvecs = iter->bvec;
+	unsigned int nbv = iter->nr_segs, ix = 0, nsegs = 0;
+	size_t len, span = 0, n = iter->count;
+	size_t skip = iter->iov_offset + start_offset;
+
+	if (WARN_ON(!iov_iter_is_bvec(iter)) ||
+	    WARN_ON(start_offset > n) ||
+	    n == 0)
+		return 0;
+
+	while (n && ix < nbv && skip) {
+		len = bvecs[ix].bv_len;
+		if (skip < len)
+			break;
+		skip -= len;
+		n -= len;
+		ix++;
+	}
+
+	while (n && ix < nbv) {
+		len = min3(n, bvecs[ix].bv_len - skip, max_size);
+		span += len;
+		nsegs++;
+		ix++;
+		if (span >= max_size || nsegs >= max_segs)
+			break;
+		skip = 0;
+		n -= len;
+	}
+
+	return min(span, max_size);
+}
+
+/*
+ * Select the span of an xarray iterator we're going to use.  Limit it by both
+ * maximum size and maximum number of segments.  It is assumed that segments
+ * can be larger than a page in size, provided they're physically contiguous.
+ * Returns the size of the span in bytes.
+ */
+static size_t netfs_limit_xarray(const struct iov_iter *iter, size_t start_offset,
+				 size_t max_size, size_t max_segs)
+{
+	struct folio *folio;
+	unsigned int nsegs = 0;
+	loff_t pos = iter->xarray_start + iter->iov_offset;
+	pgoff_t index = pos / PAGE_SIZE;
+	size_t span = 0, n = iter->count;
+
+	XA_STATE(xas, iter->xarray, index);
+
+	if (WARN_ON(!iov_iter_is_xarray(iter)) ||
+	    WARN_ON(start_offset > n) ||
+	    n == 0)
+		return 0;
+	max_size = min(max_size, n - start_offset);
+
+	rcu_read_lock();
+	xas_for_each(&xas, folio, ULONG_MAX) {
+		size_t offset, flen, len;
+		if (xas_retry(&xas, folio))
+			continue;
+		if (WARN_ON(xa_is_value(folio)))
+			break;
+		if (WARN_ON(folio_test_hugetlb(folio)))
+			break;
+
+		flen = folio_size(folio);
+		offset = offset_in_folio(folio, pos);
+		len = min(max_size, flen - offset);
+		span += len;
+		nsegs++;
+		if (span >= max_size || nsegs >= max_segs)
+			break;
+	}
+
+	rcu_read_unlock();
+	return min(span, max_size);
+}
+
+size_t netfs_limit_iter(const struct iov_iter *iter, size_t start_offset,
+			size_t max_size, size_t max_segs)
+{
+	if (iov_iter_is_bvec(iter))
+		return netfs_limit_bvec(iter, start_offset, max_size, max_segs);
+	if (iov_iter_is_xarray(iter))
+		return netfs_limit_xarray(iter, start_offset, max_size, max_segs);
+	BUG();
+}
+EXPORT_SYMBOL(netfs_limit_iter);
diff --git a/include/linux/netfs.h b/include/linux/netfs.h
index c05365e3f428..d673d0785b9d 100644
--- a/include/linux/netfs.h
+++ b/include/linux/netfs.h
@@ -321,6 +321,8 @@ void netfs_put_subrequest(struct netfs_io_subrequest *subreq,
 ssize_t netfs_extract_user_iter(struct iov_iter *orig, size_t orig_len,
 				struct iov_iter *new,
 				iov_iter_extraction_t extraction_flags);
+size_t netfs_limit_iter(const struct iov_iter *iter, size_t start_offset,
+			size_t max_size, size_t max_segs);
 
 int netfs_start_io_read(struct inode *inode);
 void netfs_end_io_read(struct inode *inode);
-- 
cgit v1.2.3


From 768ddb1eacf5dd997ecf393e7bab9796bad047e0 Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Fri, 27 May 2022 13:45:28 +0100
Subject: netfs: Limit subrequest by size or number of segments

Limit a subrequest to a maximum size and/or a maximum number of contiguous
physical regions.  This permits, for instance, an subreq's iterator to be
limited to the number of DMA'able segments that a large RDMA request can
handle.

Signed-off-by: David Howells <dhowells@redhat.com>
Reviewed-by: Jeff Layton <jlayton@kernel.org>
cc: linux-cachefs@redhat.com
cc: linux-fsdevel@vger.kernel.org
cc: linux-mm@kvack.org
---
 fs/netfs/io.c                | 18 ++++++++++++++++++
 include/linux/netfs.h        |  1 +
 include/trace/events/netfs.h |  1 +
 3 files changed, 20 insertions(+)

(limited to 'include/linux')

diff --git a/fs/netfs/io.c b/fs/netfs/io.c
index e9d408e211b8..e228bfb530ea 100644
--- a/fs/netfs/io.c
+++ b/fs/netfs/io.c
@@ -525,6 +525,7 @@ netfs_rreq_prepare_read(struct netfs_io_request *rreq,
 			struct iov_iter *io_iter)
 {
 	enum netfs_io_source source;
+	size_t lsize;
 
 	_enter("%llx-%llx,%llx", subreq->start, subreq->start + subreq->len, rreq->i_size);
 
@@ -547,13 +548,30 @@ netfs_rreq_prepare_read(struct netfs_io_request *rreq,
 			source = NETFS_INVALID_READ;
 			goto out;
 		}
+
+		if (subreq->max_nr_segs) {
+			lsize = netfs_limit_iter(io_iter, 0, subreq->len,
+						 subreq->max_nr_segs);
+			if (subreq->len > lsize) {
+				subreq->len = lsize;
+				trace_netfs_sreq(subreq, netfs_sreq_trace_limited);
+			}
+		}
 	}
 
+	if (subreq->len > rreq->len)
+		pr_warn("R=%08x[%u] SREQ>RREQ %zx > %zx\n",
+			rreq->debug_id, subreq->debug_index,
+			subreq->len, rreq->len);
+
 	if (WARN_ON(subreq->len == 0)) {
 		source = NETFS_INVALID_READ;
 		goto out;
 	}
 
+	subreq->source = source;
+	trace_netfs_sreq(subreq, netfs_sreq_trace_prepare);
+
 	subreq->io_iter = *io_iter;
 	iov_iter_truncate(&subreq->io_iter, subreq->len);
 	iov_iter_advance(io_iter, subreq->len);
diff --git a/include/linux/netfs.h b/include/linux/netfs.h
index d673d0785b9d..44cd13ad695a 100644
--- a/include/linux/netfs.h
+++ b/include/linux/netfs.h
@@ -161,6 +161,7 @@ struct netfs_io_subrequest {
 	refcount_t		ref;
 	short			error;		/* 0 or error that occurred */
 	unsigned short		debug_index;	/* Index in list (for debugging output) */
+	unsigned int		max_nr_segs;	/* 0 or max number of segments in an iterator */
 	enum netfs_io_source	source;		/* Where to read from/write to */
 	unsigned long		flags;
 #define NETFS_SREQ_COPY_TO_CACHE	0	/* Set if should copy the data to the cache */
diff --git a/include/trace/events/netfs.h b/include/trace/events/netfs.h
index beec534cbaab..fce6d0bc78e5 100644
--- a/include/trace/events/netfs.h
+++ b/include/trace/events/netfs.h
@@ -44,6 +44,7 @@
 #define netfs_sreq_traces					\
 	EM(netfs_sreq_trace_download_instead,	"RDOWN")	\
 	EM(netfs_sreq_trace_free,		"FREE ")	\
+	EM(netfs_sreq_trace_limited,		"LIMIT")	\
 	EM(netfs_sreq_trace_prepare,		"PREP ")	\
 	EM(netfs_sreq_trace_resubmit_short,	"SHORT")	\
 	EM(netfs_sreq_trace_submit,		"SUBMT")	\
-- 
cgit v1.2.3


From 16af134ca4b7051b1587108f2066ec90ae029f74 Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Wed, 9 Feb 2022 19:52:13 +0000
Subject: netfs: Extend the netfs_io_*request structs to handle writes

Modify the netfs_io_request struct to act as a point around which writes
can be coordinated.  It represents and pins a range of pages that need
writing and a list of regions of dirty data in that range of pages.

If RMW is required, the original data can be downloaded into the bounce
buffer, decrypted if necessary, the modifications made, then the modified
data can be reencrypted/recompressed and sent back to the server.

Signed-off-by: David Howells <dhowells@redhat.com>
Reviewed-by: Jeff Layton <jlayton@kernel.org>
cc: linux-cachefs@redhat.com
cc: linux-fsdevel@vger.kernel.org
cc: linux-mm@kvack.org
---
 fs/netfs/internal.h          |  6 ++++++
 fs/netfs/main.c              |  3 ++-
 fs/netfs/objects.c           |  6 ++++++
 fs/netfs/stats.c             | 16 +++++++++++++---
 include/linux/netfs.h        | 15 ++++++++++++++-
 include/trace/events/netfs.h |  8 ++++++--
 6 files changed, 47 insertions(+), 7 deletions(-)

(limited to 'include/linux')

diff --git a/fs/netfs/internal.h b/fs/netfs/internal.h
index b908c7e0a901..2bf2e82b2ad7 100644
--- a/fs/netfs/internal.h
+++ b/fs/netfs/internal.h
@@ -110,6 +110,12 @@ extern atomic_t netfs_n_rh_write_begin;
 extern atomic_t netfs_n_rh_write_done;
 extern atomic_t netfs_n_rh_write_failed;
 extern atomic_t netfs_n_rh_write_zskip;
+extern atomic_t netfs_n_wh_upload;
+extern atomic_t netfs_n_wh_upload_done;
+extern atomic_t netfs_n_wh_upload_failed;
+extern atomic_t netfs_n_wh_write;
+extern atomic_t netfs_n_wh_write_done;
+extern atomic_t netfs_n_wh_write_failed;
 
 int netfs_stats_show(struct seq_file *m, void *v);
 
diff --git a/fs/netfs/main.c b/fs/netfs/main.c
index 97ce1436615b..ab6cac110676 100644
--- a/fs/netfs/main.c
+++ b/fs/netfs/main.c
@@ -25,10 +25,11 @@ MODULE_PARM_DESC(netfs_debug, "Netfs support debugging mask");
 LIST_HEAD(netfs_io_requests);
 DEFINE_SPINLOCK(netfs_proc_lock);
 
-static const char *netfs_origins[] = {
+static const char *netfs_origins[nr__netfs_io_origin] = {
 	[NETFS_READAHEAD]	= "RA",
 	[NETFS_READPAGE]	= "RP",
 	[NETFS_READ_FOR_WRITE]	= "RW",
+	[NETFS_WRITEBACK]	= "WB",
 };
 
 /*
diff --git a/fs/netfs/objects.c b/fs/netfs/objects.c
index 4df5e5eeada6..65a17dd4ab49 100644
--- a/fs/netfs/objects.c
+++ b/fs/netfs/objects.c
@@ -20,6 +20,7 @@ struct netfs_io_request *netfs_alloc_request(struct address_space *mapping,
 	struct inode *inode = file ? file_inode(file) : mapping->host;
 	struct netfs_inode *ctx = netfs_inode(inode);
 	struct netfs_io_request *rreq;
+	bool cached = netfs_is_cache_enabled(ctx);
 	int ret;
 
 	rreq = kzalloc(ctx->ops->io_request_size ?: sizeof(struct netfs_io_request),
@@ -37,7 +38,10 @@ struct netfs_io_request *netfs_alloc_request(struct address_space *mapping,
 	rreq->debug_id	= atomic_inc_return(&debug_ids);
 	INIT_LIST_HEAD(&rreq->subrequests);
 	refcount_set(&rreq->ref, 1);
+
 	__set_bit(NETFS_RREQ_IN_PROGRESS, &rreq->flags);
+	if (cached)
+		__set_bit(NETFS_RREQ_WRITE_TO_CACHE, &rreq->flags);
 	if (rreq->netfs_ops->init_request) {
 		ret = rreq->netfs_ops->init_request(rreq, file);
 		if (ret < 0) {
@@ -46,6 +50,7 @@ struct netfs_io_request *netfs_alloc_request(struct address_space *mapping,
 		}
 	}
 
+	trace_netfs_rreq_ref(rreq->debug_id, 1, netfs_rreq_trace_new);
 	netfs_proc_add_rreq(rreq);
 	netfs_stat(&netfs_n_rh_rreq);
 	return rreq;
@@ -129,6 +134,7 @@ struct netfs_io_subrequest *netfs_alloc_subrequest(struct netfs_io_request *rreq
 			 sizeof(struct netfs_io_subrequest),
 			 GFP_KERNEL);
 	if (subreq) {
+		INIT_WORK(&subreq->work, NULL);
 		INIT_LIST_HEAD(&subreq->rreq_link);
 		refcount_set(&subreq->ref, 2);
 		subreq->rreq = rreq;
diff --git a/fs/netfs/stats.c b/fs/netfs/stats.c
index 6025dc485f7e..c1f85cd595a4 100644
--- a/fs/netfs/stats.c
+++ b/fs/netfs/stats.c
@@ -27,6 +27,12 @@ atomic_t netfs_n_rh_write_begin;
 atomic_t netfs_n_rh_write_done;
 atomic_t netfs_n_rh_write_failed;
 atomic_t netfs_n_rh_write_zskip;
+atomic_t netfs_n_wh_upload;
+atomic_t netfs_n_wh_upload_done;
+atomic_t netfs_n_wh_upload_failed;
+atomic_t netfs_n_wh_write;
+atomic_t netfs_n_wh_write_done;
+atomic_t netfs_n_wh_write_failed;
 
 int netfs_stats_show(struct seq_file *m, void *v)
 {
@@ -50,10 +56,14 @@ int netfs_stats_show(struct seq_file *m, void *v)
 		   atomic_read(&netfs_n_rh_read),
 		   atomic_read(&netfs_n_rh_read_done),
 		   atomic_read(&netfs_n_rh_read_failed));
+	seq_printf(m, "Netfs  : UL=%u us=%u uf=%u\n",
+		   atomic_read(&netfs_n_wh_upload),
+		   atomic_read(&netfs_n_wh_upload_done),
+		   atomic_read(&netfs_n_wh_upload_failed));
 	seq_printf(m, "Netfs  : WR=%u ws=%u wf=%u\n",
-		   atomic_read(&netfs_n_rh_write),
-		   atomic_read(&netfs_n_rh_write_done),
-		   atomic_read(&netfs_n_rh_write_failed));
+		   atomic_read(&netfs_n_wh_write),
+		   atomic_read(&netfs_n_wh_write_done),
+		   atomic_read(&netfs_n_wh_write_failed));
 	return fscache_stats_show(m);
 }
 EXPORT_SYMBOL(netfs_stats_show);
diff --git a/include/linux/netfs.h b/include/linux/netfs.h
index 44cd13ad695a..f302123a3e38 100644
--- a/include/linux/netfs.h
+++ b/include/linux/netfs.h
@@ -118,6 +118,9 @@ enum netfs_io_source {
 	NETFS_DOWNLOAD_FROM_SERVER,
 	NETFS_READ_FROM_CACHE,
 	NETFS_INVALID_READ,
+	NETFS_UPLOAD_TO_SERVER,
+	NETFS_WRITE_TO_CACHE,
+	NETFS_INVALID_WRITE,
 } __mode(byte);
 
 typedef void (*netfs_io_terminated_t)(void *priv, ssize_t transferred_or_error,
@@ -149,9 +152,14 @@ struct netfs_cache_resources {
 };
 
 /*
- * Descriptor for a single component subrequest.
+ * Descriptor for a single component subrequest.  Each operation represents an
+ * individual read/write from/to a server, a cache, a journal, etc..
+ *
+ * The buffer iterator is persistent for the life of the subrequest struct and
+ * the pages it points to can be relied on to exist for the duration.
  */
 struct netfs_io_subrequest {
+	struct work_struct	work;
 	struct netfs_io_request *rreq;		/* Supervising I/O request */
 	struct list_head	rreq_link;	/* Link in rreq->subrequests */
 	struct iov_iter		io_iter;	/* Iterator for this subrequest */
@@ -176,6 +184,8 @@ enum netfs_io_origin {
 	NETFS_READAHEAD,		/* This read was triggered by readahead */
 	NETFS_READPAGE,			/* This read is a synchronous read */
 	NETFS_READ_FOR_WRITE,		/* This read is to prepare a write */
+	NETFS_WRITEBACK,		/* This write was triggered by writepages */
+	nr__netfs_io_origin
 } __mode(byte);
 
 /*
@@ -198,6 +208,7 @@ struct netfs_io_request {
 	struct bio_vec		*direct_bv;	/* DIO buffer list (when handling iovec-iter) */
 	unsigned int		direct_bv_count; /* Number of elements in direct_bv[] */
 	unsigned int		debug_id;
+	unsigned int		subreq_counter;	/* Next subreq->debug_index */
 	atomic_t		nr_outstanding;	/* Number of ops in progress */
 	atomic_t		nr_copy_ops;	/* Number of copy-to-cache ops in progress */
 	size_t			submitted;	/* Amount submitted for I/O so far */
@@ -216,6 +227,8 @@ struct netfs_io_request {
 #define NETFS_RREQ_DONT_UNLOCK_FOLIOS	3	/* Don't unlock the folios on completion */
 #define NETFS_RREQ_FAILED		4	/* The request failed */
 #define NETFS_RREQ_IN_PROGRESS		5	/* Unlocked when the request completes */
+#define NETFS_RREQ_WRITE_TO_CACHE	7	/* Need to write to the cache */
+#define NETFS_RREQ_UPLOAD_TO_SERVER	8	/* Need to write to the server */
 	const struct netfs_request_ops *netfs_ops;
 };
 
diff --git a/include/trace/events/netfs.h b/include/trace/events/netfs.h
index fce6d0bc78e5..4ea4e34d279f 100644
--- a/include/trace/events/netfs.h
+++ b/include/trace/events/netfs.h
@@ -24,7 +24,8 @@
 #define netfs_rreq_origins					\
 	EM(NETFS_READAHEAD,			"RA")		\
 	EM(NETFS_READPAGE,			"RP")		\
-	E_(NETFS_READ_FOR_WRITE,		"RW")
+	EM(NETFS_READ_FOR_WRITE,		"RW")		\
+	E_(NETFS_WRITEBACK,			"WB")
 
 #define netfs_rreq_traces					\
 	EM(netfs_rreq_trace_assess,		"ASSESS ")	\
@@ -39,7 +40,10 @@
 	EM(NETFS_FILL_WITH_ZEROES,		"ZERO")		\
 	EM(NETFS_DOWNLOAD_FROM_SERVER,		"DOWN")		\
 	EM(NETFS_READ_FROM_CACHE,		"READ")		\
-	E_(NETFS_INVALID_READ,			"INVL")		\
+	EM(NETFS_INVALID_READ,			"INVL")		\
+	EM(NETFS_UPLOAD_TO_SERVER,		"UPLD")		\
+	EM(NETFS_WRITE_TO_CACHE,		"WRIT")		\
+	E_(NETFS_INVALID_WRITE,			"INVL")
 
 #define netfs_sreq_traces					\
 	EM(netfs_sreq_trace_download_instead,	"RDOWN")	\
-- 
cgit v1.2.3


From c6dc54dd91bbf597942b4975b8adec660a16827d Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Fri, 25 Feb 2022 12:27:53 +0000
Subject: netfs: Add a hook to allow tell the netfs to update its i_size

Add a hook for netfslib's write helpers to call to tell the network
filesystem that it should update its i_size.

Signed-off-by: David Howells <dhowells@redhat.com>
Reviewed-by: Jeff Layton <jlayton@kernel.org>
cc: linux-cachefs@redhat.com
cc: linux-fsdevel@vger.kernel.org
cc: linux-mm@kvack.org
---
 include/linux/netfs.h | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/netfs.h b/include/linux/netfs.h
index f302123a3e38..3fc41f616621 100644
--- a/include/linux/netfs.h
+++ b/include/linux/netfs.h
@@ -242,6 +242,7 @@ struct netfs_request_ops {
 	void (*free_request)(struct netfs_io_request *rreq);
 	void (*free_subrequest)(struct netfs_io_subrequest *rreq);
 
+	/* Read request handling */
 	void (*expand_readahead)(struct netfs_io_request *rreq);
 	bool (*clamp_length)(struct netfs_io_subrequest *subreq);
 	void (*issue_read)(struct netfs_io_subrequest *subreq);
@@ -249,6 +250,9 @@ struct netfs_request_ops {
 	int (*check_write_begin)(struct file *file, loff_t pos, unsigned len,
 				 struct folio **foliop, void **_fsdata);
 	void (*done)(struct netfs_io_request *rreq);
+
+	/* Modification handling */
+	void (*update_i_size)(struct inode *inode, loff_t i_size);
 };
 
 /*
-- 
cgit v1.2.3


From 9ebff83e648148b9ece97d4e4890dd84ca54d6ce Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Fri, 29 Sep 2023 17:28:25 +0100
Subject: netfs: Prep to use folio->private for write grouping and streaming
 write

Prepare to use folio->private to hold information write grouping and
streaming write.  These are implemented in the same commit as they both
make use of folio->private and will be both checked at the same time in
several places.

"Write grouping" involves ordering the writeback of groups of writes, such
as is needed for ceph snaps.  A group is represented by a
filesystem-supplied object which must contain a netfs_group struct.  This
contains just a refcount and a pointer to a destructor.

"Streaming write" is the storage of data in folios that are marked dirty,
but not uptodate, to avoid unnecessary reads of data.  This is represented
by a netfs_folio struct.  This contains the offset and length of the
modified region plus the otherwise displaced write grouping pointer.

The way folio->private is multiplexed is:

 (1) If private is NULL then neither is in operation on a dirty folio.

 (2) If private is set, with bit 0 clear, then this points to a group.

 (3) If private is set, with bit 0 set, then this points to a netfs_folio
     struct (with bit 0 AND'ed out).

Signed-off-by: David Howells <dhowells@redhat.com>
Reviewed-by: Jeff Layton <jlayton@kernel.org>
cc: linux-cachefs@redhat.com
cc: linux-fsdevel@vger.kernel.org
cc: linux-mm@kvack.org
---
 fs/netfs/internal.h   | 28 ++++++++++++++++++++++++++++
 fs/netfs/misc.c       | 46 ++++++++++++++++++++++++++++++++++++++++++++++
 include/linux/netfs.h | 41 +++++++++++++++++++++++++++++++++++++++++
 3 files changed, 115 insertions(+)

(limited to 'include/linux')

diff --git a/fs/netfs/internal.h b/fs/netfs/internal.h
index 2bf2e82b2ad7..d72292e40f9b 100644
--- a/fs/netfs/internal.h
+++ b/fs/netfs/internal.h
@@ -149,6 +149,34 @@ static inline bool netfs_is_cache_enabled(struct netfs_inode *ctx)
 #endif
 }
 
+/*
+ * Get a ref on a netfs group attached to a dirty page (e.g. a ceph snap).
+ */
+static inline struct netfs_group *netfs_get_group(struct netfs_group *netfs_group)
+{
+	if (netfs_group)
+		refcount_inc(&netfs_group->ref);
+	return netfs_group;
+}
+
+/*
+ * Dispose of a netfs group attached to a dirty page (e.g. a ceph snap).
+ */
+static inline void netfs_put_group(struct netfs_group *netfs_group)
+{
+	if (netfs_group && refcount_dec_and_test(&netfs_group->ref))
+		netfs_group->free(netfs_group);
+}
+
+/*
+ * Dispose of a netfs group attached to a dirty page (e.g. a ceph snap).
+ */
+static inline void netfs_put_group_many(struct netfs_group *netfs_group, int nr)
+{
+	if (netfs_group && refcount_sub_and_test(nr, &netfs_group->ref))
+		netfs_group->free(netfs_group);
+}
+
 /*
  * fscache-cache.c
  */
diff --git a/fs/netfs/misc.c b/fs/netfs/misc.c
index 5d545073fe03..eeb44abe59c5 100644
--- a/fs/netfs/misc.c
+++ b/fs/netfs/misc.c
@@ -177,9 +177,55 @@ EXPORT_SYMBOL(netfs_clear_inode_writeback);
  */
 void netfs_invalidate_folio(struct folio *folio, size_t offset, size_t length)
 {
+	struct netfs_folio *finfo = NULL;
+	size_t flen = folio_size(folio);
+
 	_enter("{%lx},%zx,%zx", folio_index(folio), offset, length);
 
 	folio_wait_fscache(folio);
+
+	if (!folio_test_private(folio))
+		return;
+
+	finfo = netfs_folio_info(folio);
+
+	if (offset == 0 && length >= flen)
+		goto erase_completely;
+
+	if (finfo) {
+		/* We have a partially uptodate page from a streaming write. */
+		unsigned int fstart = finfo->dirty_offset;
+		unsigned int fend = fstart + finfo->dirty_len;
+		unsigned int end = offset + length;
+
+		if (offset >= fend)
+			return;
+		if (end <= fstart)
+			return;
+		if (offset <= fstart && end >= fend)
+			goto erase_completely;
+		if (offset <= fstart && end > fstart)
+			goto reduce_len;
+		if (offset > fstart && end >= fend)
+			goto move_start;
+		/* A partial write was split.  The caller has already zeroed
+		 * it, so just absorb the hole.
+		 */
+	}
+	return;
+
+erase_completely:
+	netfs_put_group(netfs_folio_group(folio));
+	folio_detach_private(folio);
+	folio_clear_uptodate(folio);
+	kfree(finfo);
+	return;
+reduce_len:
+	finfo->dirty_len = offset + length - finfo->dirty_offset;
+	return;
+move_start:
+	finfo->dirty_len -= offset - finfo->dirty_offset;
+	finfo->dirty_offset = offset;
 }
 EXPORT_SYMBOL(netfs_invalidate_folio);
 
diff --git a/include/linux/netfs.h b/include/linux/netfs.h
index 3fc41f616621..cfba83e3e3d2 100644
--- a/include/linux/netfs.h
+++ b/include/linux/netfs.h
@@ -140,6 +140,47 @@ struct netfs_inode {
 #define NETFS_ICTX_ODIRECT	0		/* The file has DIO in progress */
 };
 
+/*
+ * A netfs group - for instance a ceph snap.  This is marked on dirty pages and
+ * pages marked with a group must be flushed before they can be written under
+ * the domain of another group.
+ */
+struct netfs_group {
+	refcount_t		ref;
+	void (*free)(struct netfs_group *netfs_group);
+};
+
+/*
+ * Information about a dirty page (attached only if necessary).
+ * folio->private
+ */
+struct netfs_folio {
+	struct netfs_group	*netfs_group;	/* Filesystem's grouping marker (or NULL). */
+	unsigned int		dirty_offset;	/* Write-streaming dirty data offset */
+	unsigned int		dirty_len;	/* Write-streaming dirty data length */
+};
+#define NETFS_FOLIO_INFO	0x1UL	/* OR'd with folio->private. */
+
+static inline struct netfs_folio *netfs_folio_info(struct folio *folio)
+{
+	void *priv = folio_get_private(folio);
+
+	if ((unsigned long)priv & NETFS_FOLIO_INFO)
+		return (struct netfs_folio *)((unsigned long)priv & ~NETFS_FOLIO_INFO);
+	return NULL;
+}
+
+static inline struct netfs_group *netfs_folio_group(struct folio *folio)
+{
+	struct netfs_folio *finfo;
+	void *priv = folio_get_private(folio);
+
+	finfo = netfs_folio_info(folio);
+	if (finfo)
+		return finfo->netfs_group;
+	return priv;
+}
+
 /*
  * Resources required to do operations on a cache.
  */
-- 
cgit v1.2.3


From 0e0f2dfe880fb19e4b15a7ca468623eb0b4ba586 Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Tue, 29 Jun 2021 22:31:48 +0100
Subject: netfs: Dispatch write requests to process a writeback slice

Dispatch one or more write reqeusts to process a writeback slice, where a
slice is tailored more to logical block divisions within the file (such as
crypto blocks, an object layout or cache granules) than the protocol RPC
maximum capacity.

The dispatch doesn't happen until throttling allows, at which point the
entire writeback slice is processed and queued.  A slice may be written to
multiple destinations (one or more servers and the local cache) and the
writes to each destination might be split up along different lines.

The writeback slice holds the required folios pinned.  An iov_iter is
provided in netfs_write_request that describes the buffer to be used.  This
may be part of the pagecache, may have auxiliary padding pages attached or
may be a bounce buffer resulting from crypto or compression.  Consequently,
the filesystem must not twiddle the folio markings directly.

The following API is available to the filesystem:

 (1) The ->create_write_requests() method is called to ask the filesystem
     to create the requests it needs.  This is passed the writeback slice
     to be processed.

 (2) The filesystem should then call netfs_create_write_request() to create
     the requests it needs.

 (3) Once a request is initialised, netfs_queue_write_request() can be
     called to dispatch it asynchronously, if not completed immediately.

 (4) netfs_write_request_completed() should be called to note the
     completion of a request.

 (5) netfs_get_write_request() and netfs_put_write_request() are provided
     to refcount a request.  These take constants from the netfs_wreq_trace
     enum for logging into ftrace.

 (6) The ->free_write_request is method is called to ask the filesystem to
     clean up a request.

Signed-off-by: David Howells <dhowells@redhat.com>
Reviewed-by: Jeff Layton <jlayton@kernel.org>
cc: linux-cachefs@redhat.com
cc: linux-fsdevel@vger.kernel.org
cc: linux-mm@kvack.org
---
 fs/netfs/Makefile            |   3 +-
 fs/netfs/internal.h          |   6 +
 fs/netfs/output.c            | 363 +++++++++++++++++++++++++++++++++++++++++++
 include/linux/netfs.h        |  13 ++
 include/trace/events/netfs.h |  50 +++++-
 5 files changed, 432 insertions(+), 3 deletions(-)
 create mode 100644 fs/netfs/output.c

(limited to 'include/linux')

diff --git a/fs/netfs/Makefile b/fs/netfs/Makefile
index cf3fc847b8ac..c69c6775b8ac 100644
--- a/fs/netfs/Makefile
+++ b/fs/netfs/Makefile
@@ -7,7 +7,8 @@ netfs-y := \
 	locking.o \
 	main.o \
 	misc.o \
-	objects.o
+	objects.o \
+	output.o
 
 netfs-$(CONFIG_NETFS_STATS) += stats.o
 
diff --git a/fs/netfs/internal.h b/fs/netfs/internal.h
index d72292e40f9b..0f20587f5a9b 100644
--- a/fs/netfs/internal.h
+++ b/fs/netfs/internal.h
@@ -88,6 +88,12 @@ static inline void netfs_see_request(struct netfs_io_request *rreq,
 	trace_netfs_rreq_ref(rreq->debug_id, refcount_read(&rreq->ref), what);
 }
 
+/*
+ * output.c
+ */
+int netfs_begin_write(struct netfs_io_request *wreq, bool may_wait,
+		      enum netfs_write_trace what);
+
 /*
  * stats.c
  */
diff --git a/fs/netfs/output.c b/fs/netfs/output.c
new file mode 100644
index 000000000000..2ad0fd8c32be
--- /dev/null
+++ b/fs/netfs/output.c
@@ -0,0 +1,363 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/* Network filesystem high-level write support.
+ *
+ * Copyright (C) 2023 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ */
+
+#include <linux/fs.h>
+#include <linux/mm.h>
+#include <linux/pagemap.h>
+#include <linux/slab.h>
+#include <linux/writeback.h>
+#include <linux/pagevec.h>
+#include "internal.h"
+
+/**
+ * netfs_create_write_request - Create a write operation.
+ * @wreq: The write request this is storing from.
+ * @dest: The destination type
+ * @start: Start of the region this write will modify
+ * @len: Length of the modification
+ * @worker: The worker function to handle the write(s)
+ *
+ * Allocate a write operation, set it up and add it to the list on a write
+ * request.
+ */
+struct netfs_io_subrequest *netfs_create_write_request(struct netfs_io_request *wreq,
+						       enum netfs_io_source dest,
+						       loff_t start, size_t len,
+						       work_func_t worker)
+{
+	struct netfs_io_subrequest *subreq;
+
+	subreq = netfs_alloc_subrequest(wreq);
+	if (subreq) {
+		INIT_WORK(&subreq->work, worker);
+		subreq->source	= dest;
+		subreq->start	= start;
+		subreq->len	= len;
+		subreq->debug_index = wreq->subreq_counter++;
+
+		switch (subreq->source) {
+		case NETFS_UPLOAD_TO_SERVER:
+			netfs_stat(&netfs_n_wh_upload);
+			break;
+		case NETFS_WRITE_TO_CACHE:
+			netfs_stat(&netfs_n_wh_write);
+			break;
+		default:
+			BUG();
+		}
+
+		subreq->io_iter = wreq->io_iter;
+		iov_iter_advance(&subreq->io_iter, subreq->start - wreq->start);
+		iov_iter_truncate(&subreq->io_iter, subreq->len);
+
+		trace_netfs_sreq_ref(wreq->debug_id, subreq->debug_index,
+				     refcount_read(&subreq->ref),
+				     netfs_sreq_trace_new);
+		atomic_inc(&wreq->nr_outstanding);
+		list_add_tail(&subreq->rreq_link, &wreq->subrequests);
+		trace_netfs_sreq(subreq, netfs_sreq_trace_prepare);
+	}
+
+	return subreq;
+}
+EXPORT_SYMBOL(netfs_create_write_request);
+
+/*
+ * Process a completed write request once all the component operations have
+ * been completed.
+ */
+static void netfs_write_terminated(struct netfs_io_request *wreq, bool was_async)
+{
+	struct netfs_io_subrequest *subreq;
+	struct netfs_inode *ctx = netfs_inode(wreq->inode);
+
+	_enter("R=%x[]", wreq->debug_id);
+
+	trace_netfs_rreq(wreq, netfs_rreq_trace_write_done);
+
+	list_for_each_entry(subreq, &wreq->subrequests, rreq_link) {
+		if (!subreq->error)
+			continue;
+		switch (subreq->source) {
+		case NETFS_UPLOAD_TO_SERVER:
+			/* Depending on the type of failure, this may prevent
+			 * writeback completion unless we're in disconnected
+			 * mode.
+			 */
+			if (!wreq->error)
+				wreq->error = subreq->error;
+			break;
+
+		case NETFS_WRITE_TO_CACHE:
+			/* Failure doesn't prevent writeback completion unless
+			 * we're in disconnected mode.
+			 */
+			if (subreq->error != -ENOBUFS)
+				ctx->ops->invalidate_cache(wreq);
+			break;
+
+		default:
+			WARN_ON_ONCE(1);
+			if (!wreq->error)
+				wreq->error = -EIO;
+			return;
+		}
+	}
+
+	wreq->cleanup(wreq);
+
+	_debug("finished");
+	trace_netfs_rreq(wreq, netfs_rreq_trace_wake_ip);
+	clear_bit_unlock(NETFS_RREQ_IN_PROGRESS, &wreq->flags);
+	wake_up_bit(&wreq->flags, NETFS_RREQ_IN_PROGRESS);
+
+	netfs_clear_subrequests(wreq, was_async);
+	netfs_put_request(wreq, was_async, netfs_rreq_trace_put_complete);
+}
+
+/*
+ * Deal with the completion of writing the data to the cache.
+ */
+void netfs_write_subrequest_terminated(void *_op, ssize_t transferred_or_error,
+				       bool was_async)
+{
+	struct netfs_io_subrequest *subreq = _op;
+	struct netfs_io_request *wreq = subreq->rreq;
+	unsigned int u;
+
+	_enter("%x[%x] %zd", wreq->debug_id, subreq->debug_index, transferred_or_error);
+
+	switch (subreq->source) {
+	case NETFS_UPLOAD_TO_SERVER:
+		netfs_stat(&netfs_n_wh_upload_done);
+		break;
+	case NETFS_WRITE_TO_CACHE:
+		netfs_stat(&netfs_n_wh_write_done);
+		break;
+	case NETFS_INVALID_WRITE:
+		break;
+	default:
+		BUG();
+	}
+
+	if (IS_ERR_VALUE(transferred_or_error)) {
+		subreq->error = transferred_or_error;
+		trace_netfs_failure(wreq, subreq, transferred_or_error,
+				    netfs_fail_write);
+		goto failed;
+	}
+
+	if (WARN(transferred_or_error > subreq->len - subreq->transferred,
+		 "Subreq excess write: R%x[%x] %zd > %zu - %zu",
+		 wreq->debug_id, subreq->debug_index,
+		 transferred_or_error, subreq->len, subreq->transferred))
+		transferred_or_error = subreq->len - subreq->transferred;
+
+	subreq->error = 0;
+	subreq->transferred += transferred_or_error;
+
+	if (iov_iter_count(&subreq->io_iter) != subreq->len - subreq->transferred)
+		pr_warn("R=%08x[%u] ITER POST-MISMATCH %zx != %zx-%zx %x\n",
+			wreq->debug_id, subreq->debug_index,
+			iov_iter_count(&subreq->io_iter), subreq->len,
+			subreq->transferred, subreq->io_iter.iter_type);
+
+	if (subreq->transferred < subreq->len)
+		goto incomplete;
+
+	__clear_bit(NETFS_SREQ_NO_PROGRESS, &subreq->flags);
+out:
+	trace_netfs_sreq(subreq, netfs_sreq_trace_terminated);
+
+	/* If we decrement nr_outstanding to 0, the ref belongs to us. */
+	u = atomic_dec_return(&wreq->nr_outstanding);
+	if (u == 0)
+		netfs_write_terminated(wreq, was_async);
+	else if (u == 1)
+		wake_up_var(&wreq->nr_outstanding);
+
+	netfs_put_subrequest(subreq, was_async, netfs_sreq_trace_put_terminated);
+	return;
+
+incomplete:
+	if (transferred_or_error == 0) {
+		if (__test_and_set_bit(NETFS_SREQ_NO_PROGRESS, &subreq->flags)) {
+			subreq->error = -ENODATA;
+			goto failed;
+		}
+	} else {
+		__clear_bit(NETFS_SREQ_NO_PROGRESS, &subreq->flags);
+	}
+
+	__set_bit(NETFS_SREQ_SHORT_IO, &subreq->flags);
+	set_bit(NETFS_RREQ_INCOMPLETE_IO, &wreq->flags);
+	goto out;
+
+failed:
+	switch (subreq->source) {
+	case NETFS_WRITE_TO_CACHE:
+		netfs_stat(&netfs_n_wh_write_failed);
+		set_bit(NETFS_RREQ_INCOMPLETE_IO, &wreq->flags);
+		break;
+	case NETFS_UPLOAD_TO_SERVER:
+		netfs_stat(&netfs_n_wh_upload_failed);
+		set_bit(NETFS_RREQ_FAILED, &wreq->flags);
+		wreq->error = subreq->error;
+		break;
+	default:
+		break;
+	}
+	goto out;
+}
+EXPORT_SYMBOL(netfs_write_subrequest_terminated);
+
+static void netfs_write_to_cache_op(struct netfs_io_subrequest *subreq)
+{
+	struct netfs_io_request *wreq = subreq->rreq;
+	struct netfs_cache_resources *cres = &wreq->cache_resources;
+
+	trace_netfs_sreq(subreq, netfs_sreq_trace_submit);
+
+	cres->ops->write(cres, subreq->start, &subreq->io_iter,
+			 netfs_write_subrequest_terminated, subreq);
+}
+
+static void netfs_write_to_cache_op_worker(struct work_struct *work)
+{
+	struct netfs_io_subrequest *subreq =
+		container_of(work, struct netfs_io_subrequest, work);
+
+	netfs_write_to_cache_op(subreq);
+}
+
+/**
+ * netfs_queue_write_request - Queue a write request for attention
+ * @subreq: The write request to be queued
+ *
+ * Queue the specified write request for processing by a worker thread.  We
+ * pass the caller's ref on the request to the worker thread.
+ */
+void netfs_queue_write_request(struct netfs_io_subrequest *subreq)
+{
+	if (!queue_work(system_unbound_wq, &subreq->work))
+		netfs_put_subrequest(subreq, false, netfs_sreq_trace_put_wip);
+}
+EXPORT_SYMBOL(netfs_queue_write_request);
+
+/*
+ * Set up a op for writing to the cache.
+ */
+static void netfs_set_up_write_to_cache(struct netfs_io_request *wreq)
+{
+	struct netfs_cache_resources *cres;
+	struct netfs_io_subrequest *subreq;
+	struct netfs_inode *ctx = netfs_inode(wreq->inode);
+	struct fscache_cookie *cookie = netfs_i_cookie(ctx);
+	loff_t start = wreq->start;
+	size_t len = wreq->len;
+	int ret;
+
+	if (!fscache_cookie_enabled(cookie)) {
+		clear_bit(NETFS_RREQ_WRITE_TO_CACHE, &wreq->flags);
+		return;
+	}
+
+	_debug("write to cache");
+	subreq = netfs_create_write_request(wreq, NETFS_WRITE_TO_CACHE, start, len,
+					    netfs_write_to_cache_op_worker);
+	if (!subreq)
+		return;
+
+	cres = &wreq->cache_resources;
+	ret = fscache_begin_read_operation(cres, cookie);
+	if (ret < 0) {
+		netfs_write_subrequest_terminated(subreq, ret, false);
+		return;
+	}
+
+	ret = cres->ops->prepare_write(cres, &start, &len, i_size_read(wreq->inode),
+				       true);
+	if (ret < 0) {
+		netfs_write_subrequest_terminated(subreq, ret, false);
+		return;
+	}
+
+	netfs_queue_write_request(subreq);
+}
+
+/*
+ * Begin the process of writing out a chunk of data.
+ *
+ * We are given a write request that holds a series of dirty regions and
+ * (partially) covers a sequence of folios, all of which are present.  The
+ * pages must have been marked as writeback as appropriate.
+ *
+ * We need to perform the following steps:
+ *
+ * (1) If encrypting, create an output buffer and encrypt each block of the
+ *     data into it, otherwise the output buffer will point to the original
+ *     folios.
+ *
+ * (2) If the data is to be cached, set up a write op for the entire output
+ *     buffer to the cache, if the cache wants to accept it.
+ *
+ * (3) If the data is to be uploaded (ie. not merely cached):
+ *
+ *     (a) If the data is to be compressed, create a compression buffer and
+ *         compress the data into it.
+ *
+ *     (b) For each destination we want to upload to, set up write ops to write
+ *         to that destination.  We may need multiple writes if the data is not
+ *         contiguous or the span exceeds wsize for a server.
+ */
+int netfs_begin_write(struct netfs_io_request *wreq, bool may_wait,
+		      enum netfs_write_trace what)
+{
+	struct netfs_inode *ctx = netfs_inode(wreq->inode);
+
+	_enter("R=%x %llx-%llx f=%lx",
+	       wreq->debug_id, wreq->start, wreq->start + wreq->len - 1,
+	       wreq->flags);
+
+	trace_netfs_write(wreq, what);
+	if (wreq->len == 0 || wreq->iter.count == 0) {
+		pr_err("Zero-sized write [R=%x]\n", wreq->debug_id);
+		return -EIO;
+	}
+
+	wreq->io_iter = wreq->iter;
+
+	/* ->outstanding > 0 carries a ref */
+	netfs_get_request(wreq, netfs_rreq_trace_get_for_outstanding);
+	atomic_set(&wreq->nr_outstanding, 1);
+
+	/* Start the encryption/compression going.  We can do that in the
+	 * background whilst we generate a list of write ops that we want to
+	 * perform.
+	 */
+	// TODO: Encrypt or compress the region as appropriate
+
+	/* We need to write all of the region to the cache */
+	if (test_bit(NETFS_RREQ_WRITE_TO_CACHE, &wreq->flags))
+		netfs_set_up_write_to_cache(wreq);
+
+	/* However, we don't necessarily write all of the region to the server.
+	 * Caching of reads is being managed this way also.
+	 */
+	if (test_bit(NETFS_RREQ_UPLOAD_TO_SERVER, &wreq->flags))
+		ctx->ops->create_write_requests(wreq, wreq->start, wreq->len);
+
+	if (atomic_dec_and_test(&wreq->nr_outstanding))
+		netfs_write_terminated(wreq, false);
+
+	if (!may_wait)
+		return -EIOCBQUEUED;
+
+	wait_on_bit(&wreq->flags, NETFS_RREQ_IN_PROGRESS,
+		    TASK_UNINTERRUPTIBLE);
+	return wreq->error;
+}
diff --git a/include/linux/netfs.h b/include/linux/netfs.h
index cfba83e3e3d2..890a5d8b2299 100644
--- a/include/linux/netfs.h
+++ b/include/linux/netfs.h
@@ -249,6 +249,7 @@ struct netfs_io_request {
 	struct bio_vec		*direct_bv;	/* DIO buffer list (when handling iovec-iter) */
 	unsigned int		direct_bv_count; /* Number of elements in direct_bv[] */
 	unsigned int		debug_id;
+	unsigned int		wsize;		/* Maximum write size (0 for none) */
 	unsigned int		subreq_counter;	/* Next subreq->debug_index */
 	atomic_t		nr_outstanding;	/* Number of ops in progress */
 	atomic_t		nr_copy_ops;	/* Number of copy-to-cache ops in progress */
@@ -271,6 +272,7 @@ struct netfs_io_request {
 #define NETFS_RREQ_WRITE_TO_CACHE	7	/* Need to write to the cache */
 #define NETFS_RREQ_UPLOAD_TO_SERVER	8	/* Need to write to the server */
 	const struct netfs_request_ops *netfs_ops;
+	void (*cleanup)(struct netfs_io_request *req);
 };
 
 /*
@@ -294,6 +296,11 @@ struct netfs_request_ops {
 
 	/* Modification handling */
 	void (*update_i_size)(struct inode *inode, loff_t i_size);
+
+	/* Write request handling */
+	void (*create_write_requests)(struct netfs_io_request *wreq,
+				      loff_t start, size_t len);
+	void (*invalidate_cache)(struct netfs_io_request *wreq);
 };
 
 /*
@@ -382,6 +389,12 @@ ssize_t netfs_extract_user_iter(struct iov_iter *orig, size_t orig_len,
 				iov_iter_extraction_t extraction_flags);
 size_t netfs_limit_iter(const struct iov_iter *iter, size_t start_offset,
 			size_t max_size, size_t max_segs);
+struct netfs_io_subrequest *netfs_create_write_request(
+	struct netfs_io_request *wreq, enum netfs_io_source dest,
+	loff_t start, size_t len, work_func_t worker);
+void netfs_write_subrequest_terminated(void *_op, ssize_t transferred_or_error,
+				       bool was_async);
+void netfs_queue_write_request(struct netfs_io_subrequest *subreq);
 
 int netfs_start_io_read(struct inode *inode);
 void netfs_end_io_read(struct inode *inode);
diff --git a/include/trace/events/netfs.h b/include/trace/events/netfs.h
index 6daadf2aac8a..e03635172760 100644
--- a/include/trace/events/netfs.h
+++ b/include/trace/events/netfs.h
@@ -21,6 +21,11 @@
 	EM(netfs_read_trace_readpage,		"READPAGE ")	\
 	E_(netfs_read_trace_write_begin,	"WRITEBEGN")
 
+#define netfs_write_traces					\
+	EM(netfs_write_trace_dio_write,		"DIO-WRITE")	\
+	EM(netfs_write_trace_unbuffered_write,	"UNB-WRITE")	\
+	E_(netfs_write_trace_writeback,		"WRITEBACK")
+
 #define netfs_rreq_origins					\
 	EM(NETFS_READAHEAD,			"RA")		\
 	EM(NETFS_READPAGE,			"RP")		\
@@ -32,11 +37,13 @@
 	EM(netfs_rreq_trace_copy,		"COPY   ")	\
 	EM(netfs_rreq_trace_done,		"DONE   ")	\
 	EM(netfs_rreq_trace_free,		"FREE   ")	\
+	EM(netfs_rreq_trace_redirty,		"REDIRTY")	\
 	EM(netfs_rreq_trace_resubmit,		"RESUBMT")	\
 	EM(netfs_rreq_trace_unlock,		"UNLOCK ")	\
 	EM(netfs_rreq_trace_unmark,		"UNMARK ")	\
 	EM(netfs_rreq_trace_wait_ip,		"WAIT-IP")	\
-	E_(netfs_rreq_trace_wake_ip,		"WAKE-IP")
+	EM(netfs_rreq_trace_wake_ip,		"WAKE-IP")	\
+	E_(netfs_rreq_trace_write_done,		"WR-DONE")
 
 #define netfs_sreq_sources					\
 	EM(NETFS_FILL_WITH_ZEROES,		"ZERO")		\
@@ -64,7 +71,8 @@
 	EM(netfs_fail_copy_to_cache,		"copy-to-cache")	\
 	EM(netfs_fail_read,			"read")			\
 	EM(netfs_fail_short_read,		"short-read")		\
-	E_(netfs_fail_prepare_write,		"prep-write")
+	EM(netfs_fail_prepare_write,		"prep-write")		\
+	E_(netfs_fail_write,			"write")
 
 #define netfs_rreq_ref_traces					\
 	EM(netfs_rreq_trace_get_for_outstanding,"GET OUTSTND")	\
@@ -74,6 +82,8 @@
 	EM(netfs_rreq_trace_put_failed,		"PUT FAILED ")	\
 	EM(netfs_rreq_trace_put_return,		"PUT RETURN ")	\
 	EM(netfs_rreq_trace_put_subreq,		"PUT SUBREQ ")	\
+	EM(netfs_rreq_trace_put_work,		"PUT WORK   ")	\
+	EM(netfs_rreq_trace_see_work,		"SEE WORK   ")	\
 	E_(netfs_rreq_trace_new,		"NEW        ")
 
 #define netfs_sreq_ref_traces					\
@@ -82,9 +92,12 @@
 	EM(netfs_sreq_trace_get_short_read,	"GET SHORTRD")	\
 	EM(netfs_sreq_trace_new,		"NEW        ")	\
 	EM(netfs_sreq_trace_put_clear,		"PUT CLEAR  ")	\
+	EM(netfs_sreq_trace_put_discard,	"PUT DISCARD")	\
 	EM(netfs_sreq_trace_put_failed,		"PUT FAILED ")	\
 	EM(netfs_sreq_trace_put_merged,		"PUT MERGED ")	\
 	EM(netfs_sreq_trace_put_no_copy,	"PUT NO COPY")	\
+	EM(netfs_sreq_trace_put_wip,		"PUT WIP    ")	\
+	EM(netfs_sreq_trace_put_work,		"PUT WORK   ")	\
 	E_(netfs_sreq_trace_put_terminated,	"PUT TERM   ")
 
 #ifndef __NETFS_DECLARE_TRACE_ENUMS_ONCE_ONLY
@@ -96,6 +109,7 @@
 #define E_(a, b) a
 
 enum netfs_read_trace { netfs_read_traces } __mode(byte);
+enum netfs_write_trace { netfs_write_traces } __mode(byte);
 enum netfs_rreq_trace { netfs_rreq_traces } __mode(byte);
 enum netfs_sreq_trace { netfs_sreq_traces } __mode(byte);
 enum netfs_failure { netfs_failures } __mode(byte);
@@ -113,6 +127,7 @@ enum netfs_sreq_ref_trace { netfs_sreq_ref_traces } __mode(byte);
 #define E_(a, b) TRACE_DEFINE_ENUM(a);
 
 netfs_read_traces;
+netfs_write_traces;
 netfs_rreq_origins;
 netfs_rreq_traces;
 netfs_sreq_sources;
@@ -320,6 +335,37 @@ TRACE_EVENT(netfs_sreq_ref,
 		      __entry->ref)
 	    );
 
+TRACE_EVENT(netfs_write,
+	    TP_PROTO(const struct netfs_io_request *wreq,
+		     enum netfs_write_trace what),
+
+	    TP_ARGS(wreq, what),
+
+	    TP_STRUCT__entry(
+		    __field(unsigned int,		wreq		)
+		    __field(unsigned int,		cookie		)
+		    __field(enum netfs_write_trace,	what		)
+		    __field(unsigned long long,		start		)
+		    __field(size_t,			len		)
+			     ),
+
+	    TP_fast_assign(
+		    struct netfs_inode *__ctx = netfs_inode(wreq->inode);
+		    struct fscache_cookie *__cookie = netfs_i_cookie(__ctx);
+		    __entry->wreq	= wreq->debug_id;
+		    __entry->cookie	= __cookie ? __cookie->debug_id : 0;
+		    __entry->what	= what;
+		    __entry->start	= wreq->start;
+		    __entry->len	= wreq->len;
+			   ),
+
+	    TP_printk("R=%08x %s c=%08x by=%llx-%llx",
+		      __entry->wreq,
+		      __print_symbolic(__entry->what, netfs_write_traces),
+		      __entry->cookie,
+		      __entry->start, __entry->start + __entry->len - 1)
+	    );
+
 #undef EM
 #undef E_
 #endif /* _TRACE_NETFS_H */
-- 
cgit v1.2.3


From c38f4e96e605f17990e871214e6ea1496bc4e65f Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Thu, 17 Jun 2021 13:09:21 +0100
Subject: netfs: Provide func to copy data to pagecache for buffered write

Provide a netfs write helper, netfs_perform_write() to buffer data to be
written in the pagecache and mark the modified folios dirty.

It will perform "streaming writes" for folios that aren't currently
resident, if possible, storing data in partially modified folios that are
marked dirty, but not uptodate.  It will also tag pages as belonging to
fs-specific write groups if so directed by the filesystem.

This is derived from generic_perform_write(), but doesn't use
->write_begin() and ->write_end(), having that logic rolled in instead.

Signed-off-by: David Howells <dhowells@redhat.com>
cc: Jeff Layton <jlayton@kernel.org>
cc: linux-cachefs@redhat.com
cc: linux-fsdevel@vger.kernel.org
cc: linux-mm@kvack.org
---
 fs/netfs/Makefile            |   1 +
 fs/netfs/buffered_read.c     |  49 +++++++
 fs/netfs/buffered_write.c    | 330 +++++++++++++++++++++++++++++++++++++++++++
 fs/netfs/internal.h          |   2 +
 fs/netfs/io.c                |   1 +
 include/linux/netfs.h        |   5 +
 include/trace/events/netfs.h |  73 ++++++++++
 7 files changed, 461 insertions(+)
 create mode 100644 fs/netfs/buffered_write.c

(limited to 'include/linux')

diff --git a/fs/netfs/Makefile b/fs/netfs/Makefile
index c69c6775b8ac..85d8333a1ed4 100644
--- a/fs/netfs/Makefile
+++ b/fs/netfs/Makefile
@@ -2,6 +2,7 @@
 
 netfs-y := \
 	buffered_read.o \
+	buffered_write.o \
 	io.o \
 	iterator.o \
 	locking.o \
diff --git a/fs/netfs/buffered_read.c b/fs/netfs/buffered_read.c
index 6b9a44cafbac..73a6e4d61f9d 100644
--- a/fs/netfs/buffered_read.c
+++ b/fs/netfs/buffered_read.c
@@ -63,6 +63,7 @@ void netfs_rreq_unlock_folios(struct netfs_io_request *rreq)
 				break;
 			}
 			if (!folio_started && test_bit(NETFS_SREQ_COPY_TO_CACHE, &subreq->flags)) {
+				trace_netfs_folio(folio, netfs_folio_trace_copy_to_cache);
 				folio_start_fscache(folio);
 				folio_started = true;
 			}
@@ -454,3 +455,51 @@ error:
 	return ret;
 }
 EXPORT_SYMBOL(netfs_write_begin);
+
+/*
+ * Preload the data into a page we're proposing to write into.
+ */
+int netfs_prefetch_for_write(struct file *file, struct folio *folio,
+			     size_t offset, size_t len)
+{
+	struct netfs_io_request *rreq;
+	struct address_space *mapping = folio_file_mapping(folio);
+	struct netfs_inode *ctx = netfs_inode(mapping->host);
+	unsigned long long start = folio_pos(folio);
+	size_t flen = folio_size(folio);
+	int ret;
+
+	_enter("%zx @%llx", flen, start);
+
+	ret = -ENOMEM;
+
+	rreq = netfs_alloc_request(mapping, file, start, flen,
+				   NETFS_READ_FOR_WRITE);
+	if (IS_ERR(rreq)) {
+		ret = PTR_ERR(rreq);
+		goto error;
+	}
+
+	rreq->no_unlock_folio = folio_index(folio);
+	__set_bit(NETFS_RREQ_NO_UNLOCK_FOLIO, &rreq->flags);
+	ret = netfs_begin_cache_read(rreq, ctx);
+	if (ret == -ENOMEM || ret == -EINTR || ret == -ERESTARTSYS)
+		goto error_put;
+
+	netfs_stat(&netfs_n_rh_write_begin);
+	trace_netfs_read(rreq, start, flen, netfs_read_trace_prefetch_for_write);
+
+	/* Set up the output buffer */
+	iov_iter_xarray(&rreq->iter, ITER_DEST, &mapping->i_pages,
+			rreq->start, rreq->len);
+
+	ret = netfs_begin_read(rreq, true);
+	netfs_put_request(rreq, false, netfs_rreq_trace_put_return);
+	return ret;
+
+error_put:
+	netfs_put_request(rreq, false, netfs_rreq_trace_put_discard);
+error:
+	_leave(" = %d", ret);
+	return ret;
+}
diff --git a/fs/netfs/buffered_write.c b/fs/netfs/buffered_write.c
new file mode 100644
index 000000000000..6e7f06d9962d
--- /dev/null
+++ b/fs/netfs/buffered_write.c
@@ -0,0 +1,330 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/* Network filesystem high-level write support.
+ *
+ * Copyright (C) 2023 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ */
+
+#include <linux/export.h>
+#include <linux/fs.h>
+#include <linux/mm.h>
+#include <linux/pagemap.h>
+#include <linux/slab.h>
+#include <linux/pagevec.h>
+#include "internal.h"
+
+/*
+ * Determined write method.  Adjust netfs_folio_traces if this is changed.
+ */
+enum netfs_how_to_modify {
+	NETFS_FOLIO_IS_UPTODATE,	/* Folio is uptodate already */
+	NETFS_JUST_PREFETCH,		/* We have to read the folio anyway */
+	NETFS_WHOLE_FOLIO_MODIFY,	/* We're going to overwrite the whole folio */
+	NETFS_MODIFY_AND_CLEAR,		/* We can assume there is no data to be downloaded. */
+	NETFS_STREAMING_WRITE,		/* Store incomplete data in non-uptodate page. */
+	NETFS_STREAMING_WRITE_CONT,	/* Continue streaming write. */
+	NETFS_FLUSH_CONTENT,		/* Flush incompatible content. */
+};
+
+static void netfs_set_group(struct folio *folio, struct netfs_group *netfs_group)
+{
+	if (netfs_group && !folio_get_private(folio))
+		folio_attach_private(folio, netfs_get_group(netfs_group));
+}
+
+/*
+ * Decide how we should modify a folio.  We might be attempting to do
+ * write-streaming, in which case we don't want to a local RMW cycle if we can
+ * avoid it.  If we're doing local caching or content crypto, we award that
+ * priority over avoiding RMW.  If the file is open readably, then we also
+ * assume that we may want to read what we wrote.
+ */
+static enum netfs_how_to_modify netfs_how_to_modify(struct netfs_inode *ctx,
+						    struct file *file,
+						    struct folio *folio,
+						    void *netfs_group,
+						    size_t flen,
+						    size_t offset,
+						    size_t len,
+						    bool maybe_trouble)
+{
+	struct netfs_folio *finfo = netfs_folio_info(folio);
+	loff_t pos = folio_file_pos(folio);
+
+	_enter("");
+
+	if (netfs_folio_group(folio) != netfs_group)
+		return NETFS_FLUSH_CONTENT;
+
+	if (folio_test_uptodate(folio))
+		return NETFS_FOLIO_IS_UPTODATE;
+
+	if (pos >= ctx->remote_i_size)
+		return NETFS_MODIFY_AND_CLEAR;
+
+	if (!maybe_trouble && offset == 0 && len >= flen)
+		return NETFS_WHOLE_FOLIO_MODIFY;
+
+	if (file->f_mode & FMODE_READ)
+		return NETFS_JUST_PREFETCH;
+
+	if (netfs_is_cache_enabled(ctx))
+		return NETFS_JUST_PREFETCH;
+
+	if (!finfo)
+		return NETFS_STREAMING_WRITE;
+
+	/* We can continue a streaming write only if it continues on from the
+	 * previous.  If it overlaps, we must flush lest we suffer a partial
+	 * copy and disjoint dirty regions.
+	 */
+	if (offset == finfo->dirty_offset + finfo->dirty_len)
+		return NETFS_STREAMING_WRITE_CONT;
+	return NETFS_FLUSH_CONTENT;
+}
+
+/*
+ * Grab a folio for writing and lock it.
+ */
+static struct folio *netfs_grab_folio_for_write(struct address_space *mapping,
+						loff_t pos, size_t part)
+{
+	pgoff_t index = pos / PAGE_SIZE;
+
+	return __filemap_get_folio(mapping, index, FGP_WRITEBEGIN,
+				   mapping_gfp_mask(mapping));
+}
+
+/**
+ * netfs_perform_write - Copy data into the pagecache.
+ * @iocb: The operation parameters
+ * @iter: The source buffer
+ * @netfs_group: Grouping for dirty pages (eg. ceph snaps).
+ *
+ * Copy data into pagecache pages attached to the inode specified by @iocb.
+ * The caller must hold appropriate inode locks.
+ *
+ * Dirty pages are tagged with a netfs_folio struct if they're not up to date
+ * to indicate the range modified.  Dirty pages may also be tagged with a
+ * netfs-specific grouping such that data from an old group gets flushed before
+ * a new one is started.
+ */
+ssize_t netfs_perform_write(struct kiocb *iocb, struct iov_iter *iter,
+			    struct netfs_group *netfs_group)
+{
+	struct file *file = iocb->ki_filp;
+	struct inode *inode = file_inode(file);
+	struct address_space *mapping = inode->i_mapping;
+	struct netfs_inode *ctx = netfs_inode(inode);
+	struct netfs_folio *finfo;
+	struct folio *folio;
+	enum netfs_how_to_modify howto;
+	enum netfs_folio_trace trace;
+	unsigned int bdp_flags = (iocb->ki_flags & IOCB_SYNC) ? 0: BDP_ASYNC;
+	ssize_t written = 0, ret;
+	loff_t i_size, pos = iocb->ki_pos, from, to;
+	size_t max_chunk = PAGE_SIZE << MAX_PAGECACHE_ORDER;
+	bool maybe_trouble = false;
+
+	do {
+		size_t flen;
+		size_t offset;	/* Offset into pagecache folio */
+		size_t part;	/* Bytes to write to folio */
+		size_t copied;	/* Bytes copied from user */
+
+		ret = balance_dirty_pages_ratelimited_flags(mapping, bdp_flags);
+		if (unlikely(ret < 0))
+			break;
+
+		offset = pos & (max_chunk - 1);
+		part = min(max_chunk - offset, iov_iter_count(iter));
+
+		/* Bring in the user pages that we will copy from _first_ lest
+		 * we hit a nasty deadlock on copying from the same page as
+		 * we're writing to, without it being marked uptodate.
+		 *
+		 * Not only is this an optimisation, but it is also required to
+		 * check that the address is actually valid, when atomic
+		 * usercopies are used below.
+		 *
+		 * We rely on the page being held onto long enough by the LRU
+		 * that we can grab it below if this causes it to be read.
+		 */
+		ret = -EFAULT;
+		if (unlikely(fault_in_iov_iter_readable(iter, part) == part))
+			break;
+
+		ret = -ENOMEM;
+		folio = netfs_grab_folio_for_write(mapping, pos, part);
+		if (!folio)
+			break;
+
+		flen = folio_size(folio);
+		offset = pos & (flen - 1);
+		part = min_t(size_t, flen - offset, part);
+
+		if (signal_pending(current)) {
+			ret = written ? -EINTR : -ERESTARTSYS;
+			goto error_folio_unlock;
+		}
+
+		/* See if we need to prefetch the area we're going to modify.
+		 * We need to do this before we get a lock on the folio in case
+		 * there's more than one writer competing for the same cache
+		 * block.
+		 */
+		howto = netfs_how_to_modify(ctx, file, folio, netfs_group,
+					    flen, offset, part, maybe_trouble);
+		_debug("howto %u", howto);
+		switch (howto) {
+		case NETFS_JUST_PREFETCH:
+			ret = netfs_prefetch_for_write(file, folio, offset, part);
+			if (ret < 0) {
+				_debug("prefetch = %zd", ret);
+				goto error_folio_unlock;
+			}
+			break;
+		case NETFS_FOLIO_IS_UPTODATE:
+		case NETFS_WHOLE_FOLIO_MODIFY:
+		case NETFS_STREAMING_WRITE_CONT:
+			break;
+		case NETFS_MODIFY_AND_CLEAR:
+			zero_user_segment(&folio->page, 0, offset);
+			break;
+		case NETFS_STREAMING_WRITE:
+			ret = -EIO;
+			if (WARN_ON(folio_get_private(folio)))
+				goto error_folio_unlock;
+			break;
+		case NETFS_FLUSH_CONTENT:
+			trace_netfs_folio(folio, netfs_flush_content);
+			from = folio_pos(folio);
+			to = from + folio_size(folio) - 1;
+			folio_unlock(folio);
+			folio_put(folio);
+			ret = filemap_write_and_wait_range(mapping, from, to);
+			if (ret < 0)
+				goto error_folio_unlock;
+			continue;
+		}
+
+		if (mapping_writably_mapped(mapping))
+			flush_dcache_folio(folio);
+
+		copied = copy_folio_from_iter_atomic(folio, offset, part, iter);
+
+		flush_dcache_folio(folio);
+
+		/* Deal with a (partially) failed copy */
+		if (copied == 0) {
+			ret = -EFAULT;
+			goto error_folio_unlock;
+		}
+
+		trace = (enum netfs_folio_trace)howto;
+		switch (howto) {
+		case NETFS_FOLIO_IS_UPTODATE:
+		case NETFS_JUST_PREFETCH:
+			netfs_set_group(folio, netfs_group);
+			break;
+		case NETFS_MODIFY_AND_CLEAR:
+			zero_user_segment(&folio->page, offset + copied, flen);
+			netfs_set_group(folio, netfs_group);
+			folio_mark_uptodate(folio);
+			break;
+		case NETFS_WHOLE_FOLIO_MODIFY:
+			if (unlikely(copied < part)) {
+				maybe_trouble = true;
+				iov_iter_revert(iter, copied);
+				copied = 0;
+				goto retry;
+			}
+			netfs_set_group(folio, netfs_group);
+			folio_mark_uptodate(folio);
+			break;
+		case NETFS_STREAMING_WRITE:
+			if (offset == 0 && copied == flen) {
+				netfs_set_group(folio, netfs_group);
+				folio_mark_uptodate(folio);
+				trace = netfs_streaming_filled_page;
+				break;
+			}
+			finfo = kzalloc(sizeof(*finfo), GFP_KERNEL);
+			if (!finfo) {
+				iov_iter_revert(iter, copied);
+				ret = -ENOMEM;
+				goto error_folio_unlock;
+			}
+			finfo->netfs_group = netfs_get_group(netfs_group);
+			finfo->dirty_offset = offset;
+			finfo->dirty_len = copied;
+			folio_attach_private(folio, (void *)((unsigned long)finfo |
+							     NETFS_FOLIO_INFO));
+			break;
+		case NETFS_STREAMING_WRITE_CONT:
+			finfo = netfs_folio_info(folio);
+			finfo->dirty_len += copied;
+			if (finfo->dirty_offset == 0 && finfo->dirty_len == flen) {
+				if (finfo->netfs_group)
+					folio_change_private(folio, finfo->netfs_group);
+				else
+					folio_detach_private(folio);
+				folio_mark_uptodate(folio);
+				kfree(finfo);
+				trace = netfs_streaming_cont_filled_page;
+			}
+			break;
+		default:
+			WARN(true, "Unexpected modify type %u ix=%lx\n",
+			     howto, folio_index(folio));
+			ret = -EIO;
+			goto error_folio_unlock;
+		}
+
+		trace_netfs_folio(folio, trace);
+
+		/* Update the inode size if we moved the EOF marker */
+		i_size = i_size_read(inode);
+		pos += copied;
+		if (pos > i_size) {
+			if (ctx->ops->update_i_size) {
+				ctx->ops->update_i_size(inode, pos);
+			} else {
+				i_size_write(inode, pos);
+#if IS_ENABLED(CONFIG_FSCACHE)
+				fscache_update_cookie(ctx->cache, NULL, &pos);
+#endif
+			}
+		}
+		written += copied;
+
+		folio_mark_dirty(folio);
+	retry:
+		folio_unlock(folio);
+		folio_put(folio);
+		folio = NULL;
+
+		cond_resched();
+	} while (iov_iter_count(iter));
+
+out:
+	if (likely(written)) {
+		/* Flush and wait for a write that requires immediate synchronisation. */
+		if (iocb->ki_flags & (IOCB_DSYNC | IOCB_SYNC)) {
+			_debug("dsync");
+			ret = filemap_fdatawait_range(mapping, iocb->ki_pos,
+						      iocb->ki_pos + written);
+		}
+
+		iocb->ki_pos += written;
+	}
+
+	_leave(" = %zd [%zd]", written, ret);
+	return written ? written : ret;
+
+error_folio_unlock:
+	folio_unlock(folio);
+	folio_put(folio);
+	goto out;
+}
+EXPORT_SYMBOL(netfs_perform_write);
diff --git a/fs/netfs/internal.h b/fs/netfs/internal.h
index 0f20587f5a9b..17e4ea4456c7 100644
--- a/fs/netfs/internal.h
+++ b/fs/netfs/internal.h
@@ -23,6 +23,8 @@
  * buffered_read.c
  */
 void netfs_rreq_unlock_folios(struct netfs_io_request *rreq);
+int netfs_prefetch_for_write(struct file *file, struct folio *folio,
+			     size_t offset, size_t len);
 
 /*
  * io.c
diff --git a/fs/netfs/io.c b/fs/netfs/io.c
index e83ef5835d25..774aef6ea4cb 100644
--- a/fs/netfs/io.c
+++ b/fs/netfs/io.c
@@ -125,6 +125,7 @@ static void netfs_rreq_unmark_after_write(struct netfs_io_request *rreq,
 			if (have_unlocked && folio_index(folio) <= unlocked)
 				continue;
 			unlocked = folio_index(folio);
+			trace_netfs_folio(folio, netfs_folio_trace_end_copy);
 			folio_end_fscache(folio);
 			have_unlocked = true;
 		}
diff --git a/include/linux/netfs.h b/include/linux/netfs.h
index 890a5d8b2299..70f578cf3715 100644
--- a/include/linux/netfs.h
+++ b/include/linux/netfs.h
@@ -367,6 +367,11 @@ struct netfs_cache_ops {
 			       loff_t *_data_start, size_t *_data_len);
 };
 
+/* High-level write API */
+ssize_t netfs_perform_write(struct kiocb *iocb, struct iov_iter *iter,
+			    struct netfs_group *netfs_group);
+
+/* Address operations API */
 struct readahead_control;
 void netfs_readahead(struct readahead_control *);
 int netfs_read_folio(struct file *, struct folio *);
diff --git a/include/trace/events/netfs.h b/include/trace/events/netfs.h
index e03635172760..8308b81f36be 100644
--- a/include/trace/events/netfs.h
+++ b/include/trace/events/netfs.h
@@ -19,6 +19,7 @@
 	EM(netfs_read_trace_expanded,		"EXPANDED ")	\
 	EM(netfs_read_trace_readahead,		"READAHEAD")	\
 	EM(netfs_read_trace_readpage,		"READPAGE ")	\
+	EM(netfs_read_trace_prefetch_for_write,	"PREFETCHW")	\
 	E_(netfs_read_trace_write_begin,	"WRITEBEGN")
 
 #define netfs_write_traces					\
@@ -100,6 +101,31 @@
 	EM(netfs_sreq_trace_put_work,		"PUT WORK   ")	\
 	E_(netfs_sreq_trace_put_terminated,	"PUT TERM   ")
 
+#define netfs_folio_traces					\
+	/* The first few correspond to enum netfs_how_to_modify */	\
+	EM(netfs_folio_is_uptodate,		"mod-uptodate")	\
+	EM(netfs_just_prefetch,			"mod-prefetch")	\
+	EM(netfs_whole_folio_modify,		"mod-whole-f")	\
+	EM(netfs_modify_and_clear,		"mod-n-clear")	\
+	EM(netfs_streaming_write,		"mod-streamw")	\
+	EM(netfs_streaming_write_cont,		"mod-streamw+")	\
+	EM(netfs_flush_content,			"flush")	\
+	EM(netfs_streaming_filled_page,		"mod-streamw-f") \
+	EM(netfs_streaming_cont_filled_page,	"mod-streamw-f+") \
+	/* The rest are for writeback */			\
+	EM(netfs_folio_trace_clear,		"clear")	\
+	EM(netfs_folio_trace_clear_s,		"clear-s")	\
+	EM(netfs_folio_trace_clear_g,		"clear-g")	\
+	EM(netfs_folio_trace_copy_to_cache,	"copy")		\
+	EM(netfs_folio_trace_end_copy,		"end-copy")	\
+	EM(netfs_folio_trace_kill,		"kill")		\
+	EM(netfs_folio_trace_mkwrite,		"mkwrite")	\
+	EM(netfs_folio_trace_mkwrite_plus,	"mkwrite+")	\
+	EM(netfs_folio_trace_redirty,		"redirty")	\
+	EM(netfs_folio_trace_redirtied,		"redirtied")	\
+	EM(netfs_folio_trace_store,		"store")	\
+	E_(netfs_folio_trace_store_plus,	"store+")
+
 #ifndef __NETFS_DECLARE_TRACE_ENUMS_ONCE_ONLY
 #define __NETFS_DECLARE_TRACE_ENUMS_ONCE_ONLY
 
@@ -115,6 +141,7 @@ enum netfs_sreq_trace { netfs_sreq_traces } __mode(byte);
 enum netfs_failure { netfs_failures } __mode(byte);
 enum netfs_rreq_ref_trace { netfs_rreq_ref_traces } __mode(byte);
 enum netfs_sreq_ref_trace { netfs_sreq_ref_traces } __mode(byte);
+enum netfs_folio_trace { netfs_folio_traces } __mode(byte);
 
 #endif
 
@@ -135,6 +162,7 @@ netfs_sreq_traces;
 netfs_failures;
 netfs_rreq_ref_traces;
 netfs_sreq_ref_traces;
+netfs_folio_traces;
 
 /*
  * Now redefine the EM() and E_() macros to map the enums to the strings that
@@ -335,6 +363,51 @@ TRACE_EVENT(netfs_sreq_ref,
 		      __entry->ref)
 	    );
 
+TRACE_EVENT(netfs_folio,
+	    TP_PROTO(struct folio *folio, enum netfs_folio_trace why),
+
+	    TP_ARGS(folio, why),
+
+	    TP_STRUCT__entry(
+		    __field(ino_t,			ino)
+		    __field(pgoff_t,			index)
+		    __field(unsigned int,		nr)
+		    __field(enum netfs_folio_trace,	why)
+			     ),
+
+	    TP_fast_assign(
+		    __entry->ino = folio->mapping->host->i_ino;
+		    __entry->why = why;
+		    __entry->index = folio_index(folio);
+		    __entry->nr = folio_nr_pages(folio);
+			   ),
+
+	    TP_printk("i=%05lx ix=%05lx-%05lx %s",
+		      __entry->ino, __entry->index, __entry->index + __entry->nr - 1,
+		      __print_symbolic(__entry->why, netfs_folio_traces))
+	    );
+
+TRACE_EVENT(netfs_write_iter,
+	    TP_PROTO(const struct kiocb *iocb, const struct iov_iter *from),
+
+	    TP_ARGS(iocb, from),
+
+	    TP_STRUCT__entry(
+		    __field(unsigned long long,		start		)
+		    __field(size_t,			len		)
+		    __field(unsigned int,		flags		)
+			     ),
+
+	    TP_fast_assign(
+		    __entry->start	= iocb->ki_pos;
+		    __entry->len	= iov_iter_count(from);
+		    __entry->flags	= iocb->ki_flags;
+			   ),
+
+	    TP_printk("WRITE-ITER s=%llx l=%zx f=%x",
+		      __entry->start, __entry->len, __entry->flags)
+	    );
+
 TRACE_EVENT(netfs_write,
 	    TP_PROTO(const struct netfs_io_request *wreq,
 		     enum netfs_write_trace what),
-- 
cgit v1.2.3


From 016dc8516aec8719641e7aaaacd78d344759178e Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Fri, 14 Jan 2022 17:39:55 +0000
Subject: netfs: Implement unbuffered/DIO read support

Implement support for unbuffered and DIO reads in the netfs library,
utilising the existing read helper code to do block splitting and
individual queuing.  The code also handles extraction of the destination
buffer from the supplied iterator, allowing async unbuffered reads to take
place.

The read will be split up according to the rsize setting and, if supplied,
the ->clamp_length() method.  Note that the next subrequest will be issued
as soon as issue_op returns, without waiting for previous ones to finish.
The network filesystem needs to pause or handle queuing them if it doesn't
want to fire them all at the server simultaneously.

Once all the subrequests have finished, the state will be assessed and the
amount of data to be indicated as having being obtained will be
determined.  As the subrequests may finish in any order, if an intermediate
subrequest is short, any further subrequests may be copied into the buffer
and then abandoned.

In the future, this will also take care of doing an unbuffered read from
encrypted content, with the decryption being done by the library.

Signed-off-by: David Howells <dhowells@redhat.com>
cc: Jeff Layton <jlayton@kernel.org>
cc: linux-cachefs@redhat.com
cc: linux-fsdevel@vger.kernel.org
cc: linux-mm@kvack.org
---
 fs/netfs/Makefile            |   1 +
 fs/netfs/direct_read.c       | 125 +++++++++++++++++++++++++++++++++++++++++++
 fs/netfs/internal.h          |   1 +
 fs/netfs/io.c                |  83 +++++++++++++++++++++++++---
 fs/netfs/main.c              |   1 +
 fs/netfs/objects.c           |   5 +-
 fs/netfs/stats.c             |   4 +-
 include/linux/netfs.h        |   9 ++++
 include/trace/events/netfs.h |   7 ++-
 mm/filemap.c                 |   1 +
 10 files changed, 226 insertions(+), 11 deletions(-)
 create mode 100644 fs/netfs/direct_read.c

(limited to 'include/linux')

diff --git a/fs/netfs/Makefile b/fs/netfs/Makefile
index 85d8333a1ed4..e968ab1eca40 100644
--- a/fs/netfs/Makefile
+++ b/fs/netfs/Makefile
@@ -3,6 +3,7 @@
 netfs-y := \
 	buffered_read.o \
 	buffered_write.o \
+	direct_read.o \
 	io.o \
 	iterator.o \
 	locking.o \
diff --git a/fs/netfs/direct_read.c b/fs/netfs/direct_read.c
new file mode 100644
index 000000000000..ad4370b3935d
--- /dev/null
+++ b/fs/netfs/direct_read.c
@@ -0,0 +1,125 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/* Direct I/O support.
+ *
+ * Copyright (C) 2023 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ */
+
+#include <linux/export.h>
+#include <linux/fs.h>
+#include <linux/mm.h>
+#include <linux/pagemap.h>
+#include <linux/slab.h>
+#include <linux/uio.h>
+#include <linux/sched/mm.h>
+#include <linux/task_io_accounting_ops.h>
+#include <linux/netfs.h>
+#include "internal.h"
+
+/**
+ * netfs_unbuffered_read_iter_locked - Perform an unbuffered or direct I/O read
+ * @iocb: The I/O control descriptor describing the read
+ * @iter: The output buffer (also specifies read length)
+ *
+ * Perform an unbuffered I/O or direct I/O from the file in @iocb to the
+ * output buffer.  No use is made of the pagecache.
+ *
+ * The caller must hold any appropriate locks.
+ */
+static ssize_t netfs_unbuffered_read_iter_locked(struct kiocb *iocb, struct iov_iter *iter)
+{
+	struct netfs_io_request *rreq;
+	ssize_t ret;
+	size_t orig_count = iov_iter_count(iter);
+	bool async = !is_sync_kiocb(iocb);
+
+	_enter("");
+
+	if (!orig_count)
+		return 0; /* Don't update atime */
+
+	ret = kiocb_write_and_wait(iocb, orig_count);
+	if (ret < 0)
+		return ret;
+	file_accessed(iocb->ki_filp);
+
+	rreq = netfs_alloc_request(iocb->ki_filp->f_mapping, iocb->ki_filp,
+				   iocb->ki_pos, orig_count,
+				   NETFS_DIO_READ);
+	if (IS_ERR(rreq))
+		return PTR_ERR(rreq);
+
+	netfs_stat(&netfs_n_rh_dio_read);
+	trace_netfs_read(rreq, rreq->start, rreq->len, netfs_read_trace_dio_read);
+
+	/* If this is an async op, we have to keep track of the destination
+	 * buffer for ourselves as the caller's iterator will be trashed when
+	 * we return.
+	 *
+	 * In such a case, extract an iterator to represent as much of the the
+	 * output buffer as we can manage.  Note that the extraction might not
+	 * be able to allocate a sufficiently large bvec array and may shorten
+	 * the request.
+	 */
+	if (user_backed_iter(iter)) {
+		ret = netfs_extract_user_iter(iter, rreq->len, &rreq->iter, 0);
+		if (ret < 0)
+			goto out;
+		rreq->direct_bv = (struct bio_vec *)rreq->iter.bvec;
+		rreq->direct_bv_count = ret;
+		rreq->direct_bv_unpin = iov_iter_extract_will_pin(iter);
+		rreq->len = iov_iter_count(&rreq->iter);
+	} else {
+		rreq->iter = *iter;
+		rreq->len = orig_count;
+		rreq->direct_bv_unpin = false;
+		iov_iter_advance(iter, orig_count);
+	}
+
+	// TODO: Set up bounce buffer if needed
+
+	if (async)
+		rreq->iocb = iocb;
+
+	ret = netfs_begin_read(rreq, is_sync_kiocb(iocb));
+	if (ret < 0)
+		goto out; /* May be -EIOCBQUEUED */
+	if (!async) {
+		// TODO: Copy from bounce buffer
+		iocb->ki_pos += rreq->transferred;
+		ret = rreq->transferred;
+	}
+
+out:
+	netfs_put_request(rreq, false, netfs_rreq_trace_put_return);
+	if (ret > 0)
+		orig_count -= ret;
+	if (ret != -EIOCBQUEUED)
+		iov_iter_revert(iter, orig_count - iov_iter_count(iter));
+	return ret;
+}
+
+/**
+ * netfs_unbuffered_read_iter - Perform an unbuffered or direct I/O read
+ * @iocb: The I/O control descriptor describing the read
+ * @iter: The output buffer (also specifies read length)
+ *
+ * Perform an unbuffered I/O or direct I/O from the file in @iocb to the
+ * output buffer.  No use is made of the pagecache.
+ */
+ssize_t netfs_unbuffered_read_iter(struct kiocb *iocb, struct iov_iter *iter)
+{
+	struct inode *inode = file_inode(iocb->ki_filp);
+	ssize_t ret;
+
+	if (!iter->count)
+		return 0; /* Don't update atime */
+
+	ret = netfs_start_io_direct(inode);
+	if (ret == 0) {
+		ret = netfs_unbuffered_read_iter_locked(iocb, iter);
+		netfs_end_io_direct(inode);
+	}
+	return ret;
+}
+EXPORT_SYMBOL(netfs_unbuffered_read_iter);
diff --git a/fs/netfs/internal.h b/fs/netfs/internal.h
index 17e4ea4456c7..886c2e8f841f 100644
--- a/fs/netfs/internal.h
+++ b/fs/netfs/internal.h
@@ -100,6 +100,7 @@ int netfs_begin_write(struct netfs_io_request *wreq, bool may_wait,
  * stats.c
  */
 #ifdef CONFIG_NETFS_STATS
+extern atomic_t netfs_n_rh_dio_read;
 extern atomic_t netfs_n_rh_readahead;
 extern atomic_t netfs_n_rh_readpage;
 extern atomic_t netfs_n_rh_rreq;
diff --git a/fs/netfs/io.c b/fs/netfs/io.c
index 774aef6ea4cb..c972415c8aad 100644
--- a/fs/netfs/io.c
+++ b/fs/netfs/io.c
@@ -78,7 +78,9 @@ static void netfs_read_from_server(struct netfs_io_request *rreq,
 				   struct netfs_io_subrequest *subreq)
 {
 	netfs_stat(&netfs_n_rh_download);
-	if (iov_iter_count(&subreq->io_iter) != subreq->len - subreq->transferred)
+
+	if (rreq->origin != NETFS_DIO_READ &&
+	    iov_iter_count(&subreq->io_iter) != subreq->len - subreq->transferred)
 		pr_warn("R=%08x[%u] ITER PRE-MISMATCH %zx != %zx-%zx %lx\n",
 			rreq->debug_id, subreq->debug_index,
 			iov_iter_count(&subreq->io_iter), subreq->len,
@@ -341,6 +343,43 @@ static void netfs_rreq_is_still_valid(struct netfs_io_request *rreq)
 	}
 }
 
+/*
+ * Determine how much we can admit to having read from a DIO read.
+ */
+static void netfs_rreq_assess_dio(struct netfs_io_request *rreq)
+{
+	struct netfs_io_subrequest *subreq;
+	unsigned int i;
+	size_t transferred = 0;
+
+	for (i = 0; i < rreq->direct_bv_count; i++)
+		flush_dcache_page(rreq->direct_bv[i].bv_page);
+
+	list_for_each_entry(subreq, &rreq->subrequests, rreq_link) {
+		if (subreq->error || subreq->transferred == 0)
+			break;
+		transferred += subreq->transferred;
+		if (subreq->transferred < subreq->len)
+			break;
+	}
+
+	for (i = 0; i < rreq->direct_bv_count; i++)
+		flush_dcache_page(rreq->direct_bv[i].bv_page);
+
+	rreq->transferred = transferred;
+	task_io_account_read(transferred);
+
+	if (rreq->iocb) {
+		rreq->iocb->ki_pos += transferred;
+		if (rreq->iocb->ki_complete)
+			rreq->iocb->ki_complete(
+				rreq->iocb, rreq->error ? rreq->error : transferred);
+	}
+	if (rreq->netfs_ops->done)
+		rreq->netfs_ops->done(rreq);
+	inode_dio_end(rreq->inode);
+}
+
 /*
  * Assess the state of a read request and decide what to do next.
  *
@@ -361,7 +400,10 @@ again:
 		return;
 	}
 
-	netfs_rreq_unlock_folios(rreq);
+	if (rreq->origin != NETFS_DIO_READ)
+		netfs_rreq_unlock_folios(rreq);
+	else
+		netfs_rreq_assess_dio(rreq);
 
 	trace_netfs_rreq(rreq, netfs_rreq_trace_wake_ip);
 	clear_bit_unlock(NETFS_RREQ_IN_PROGRESS, &rreq->flags);
@@ -526,14 +568,16 @@ netfs_rreq_prepare_read(struct netfs_io_request *rreq,
 			struct netfs_io_subrequest *subreq,
 			struct iov_iter *io_iter)
 {
-	enum netfs_io_source source;
+	enum netfs_io_source source = NETFS_DOWNLOAD_FROM_SERVER;
 	size_t lsize;
 
 	_enter("%llx-%llx,%llx", subreq->start, subreq->start + subreq->len, rreq->i_size);
 
-	source = netfs_cache_prepare_read(subreq, rreq->i_size);
-	if (source == NETFS_INVALID_READ)
-		goto out;
+	if (rreq->origin != NETFS_DIO_READ) {
+		source = netfs_cache_prepare_read(subreq, rreq->i_size);
+		if (source == NETFS_INVALID_READ)
+			goto out;
+	}
 
 	if (source == NETFS_DOWNLOAD_FROM_SERVER) {
 		/* Call out to the netfs to let it shrink the request to fit
@@ -544,6 +588,8 @@ netfs_rreq_prepare_read(struct netfs_io_request *rreq,
 		 */
 		if (subreq->len > rreq->i_size - subreq->start)
 			subreq->len = rreq->i_size - subreq->start;
+		if (rreq->rsize && subreq->len > rreq->rsize)
+			subreq->len = rreq->rsize;
 
 		if (rreq->netfs_ops->clamp_length &&
 		    !rreq->netfs_ops->clamp_length(subreq)) {
@@ -662,6 +708,10 @@ int netfs_begin_read(struct netfs_io_request *rreq, bool sync)
 		return -EIO;
 	}
 
+	if (rreq->origin == NETFS_DIO_READ)
+		inode_dio_begin(rreq->inode);
+
+	// TODO: Use bounce buffer if requested
 	rreq->io_iter = rreq->iter;
 
 	INIT_WORK(&rreq->work, netfs_rreq_work);
@@ -673,11 +723,25 @@ int netfs_begin_read(struct netfs_io_request *rreq, bool sync)
 	atomic_set(&rreq->nr_outstanding, 1);
 	io_iter = rreq->io_iter;
 	do {
+		_debug("submit %llx + %zx >= %llx",
+		       rreq->start, rreq->submitted, rreq->i_size);
+		if (rreq->origin == NETFS_DIO_READ &&
+		    rreq->start + rreq->submitted >= rreq->i_size)
+			break;
 		if (!netfs_rreq_submit_slice(rreq, &io_iter, &debug_index))
 			break;
+		if (test_bit(NETFS_RREQ_BLOCKED, &rreq->flags) &&
+		    test_bit(NETFS_RREQ_NONBLOCK, &rreq->flags))
+			break;
 
 	} while (rreq->submitted < rreq->len);
 
+	if (!rreq->submitted) {
+		netfs_put_request(rreq, false, netfs_rreq_trace_put_no_submit);
+		ret = 0;
+		goto out;
+	}
+
 	if (sync) {
 		/* Keep nr_outstanding incremented so that the ref always
 		 * belongs to us, and the service code isn't punted off to a
@@ -694,7 +758,8 @@ int netfs_begin_read(struct netfs_io_request *rreq, bool sync)
 			    TASK_UNINTERRUPTIBLE);
 
 		ret = rreq->error;
-		if (ret == 0 && rreq->submitted < rreq->len) {
+		if (ret == 0 && rreq->submitted < rreq->len &&
+		    rreq->origin != NETFS_DIO_READ) {
 			trace_netfs_failure(rreq, NULL, ret, netfs_fail_short_read);
 			ret = -EIO;
 		}
@@ -702,7 +767,9 @@ int netfs_begin_read(struct netfs_io_request *rreq, bool sync)
 		/* If we decrement nr_outstanding to 0, the ref belongs to us. */
 		if (atomic_dec_and_test(&rreq->nr_outstanding))
 			netfs_rreq_assess(rreq, false);
-		ret = 0;
+		ret = -EIOCBQUEUED;
 	}
+
+out:
 	return ret;
 }
diff --git a/fs/netfs/main.c b/fs/netfs/main.c
index ab6cac110676..abb8857486ee 100644
--- a/fs/netfs/main.c
+++ b/fs/netfs/main.c
@@ -30,6 +30,7 @@ static const char *netfs_origins[nr__netfs_io_origin] = {
 	[NETFS_READPAGE]	= "RP",
 	[NETFS_READ_FOR_WRITE]	= "RW",
 	[NETFS_WRITEBACK]	= "WB",
+	[NETFS_DIO_READ]	= "DR",
 };
 
 /*
diff --git a/fs/netfs/objects.c b/fs/netfs/objects.c
index 3aa0bfbc04ec..7153f24e8034 100644
--- a/fs/netfs/objects.c
+++ b/fs/netfs/objects.c
@@ -20,7 +20,8 @@ struct netfs_io_request *netfs_alloc_request(struct address_space *mapping,
 	struct inode *inode = file ? file_inode(file) : mapping->host;
 	struct netfs_inode *ctx = netfs_inode(inode);
 	struct netfs_io_request *rreq;
-	bool cached = netfs_is_cache_enabled(ctx);
+	bool is_dio = (origin == NETFS_DIO_READ);
+	bool cached = is_dio && netfs_is_cache_enabled(ctx);
 	int ret;
 
 	rreq = kzalloc(ctx->ops->io_request_size ?: sizeof(struct netfs_io_request),
@@ -42,6 +43,8 @@ struct netfs_io_request *netfs_alloc_request(struct address_space *mapping,
 	__set_bit(NETFS_RREQ_IN_PROGRESS, &rreq->flags);
 	if (cached)
 		__set_bit(NETFS_RREQ_WRITE_TO_CACHE, &rreq->flags);
+	if (file && file->f_flags & O_NONBLOCK)
+		__set_bit(NETFS_RREQ_NONBLOCK, &rreq->flags);
 	if (rreq->netfs_ops->init_request) {
 		ret = rreq->netfs_ops->init_request(rreq, file);
 		if (ret < 0) {
diff --git a/fs/netfs/stats.c b/fs/netfs/stats.c
index c1f85cd595a4..15fd5c3f0f39 100644
--- a/fs/netfs/stats.c
+++ b/fs/netfs/stats.c
@@ -9,6 +9,7 @@
 #include <linux/seq_file.h>
 #include "internal.h"
 
+atomic_t netfs_n_rh_dio_read;
 atomic_t netfs_n_rh_readahead;
 atomic_t netfs_n_rh_readpage;
 atomic_t netfs_n_rh_rreq;
@@ -36,7 +37,8 @@ atomic_t netfs_n_wh_write_failed;
 
 int netfs_stats_show(struct seq_file *m, void *v)
 {
-	seq_printf(m, "Netfs  : RA=%u RP=%u WB=%u WBZ=%u rr=%u sr=%u\n",
+	seq_printf(m, "Netfs  : DR=%u RA=%u RP=%u WB=%u WBZ=%u rr=%u sr=%u\n",
+		   atomic_read(&netfs_n_rh_dio_read),
 		   atomic_read(&netfs_n_rh_readahead),
 		   atomic_read(&netfs_n_rh_readpage),
 		   atomic_read(&netfs_n_rh_write_begin),
diff --git a/include/linux/netfs.h b/include/linux/netfs.h
index 70f578cf3715..7c1309568459 100644
--- a/include/linux/netfs.h
+++ b/include/linux/netfs.h
@@ -226,6 +226,7 @@ enum netfs_io_origin {
 	NETFS_READPAGE,			/* This read is a synchronous read */
 	NETFS_READ_FOR_WRITE,		/* This read is to prepare a write */
 	NETFS_WRITEBACK,		/* This write was triggered by writepages */
+	NETFS_DIO_READ,			/* This is a direct I/O read */
 	nr__netfs_io_origin
 } __mode(byte);
 
@@ -240,6 +241,7 @@ struct netfs_io_request {
 	};
 	struct inode		*inode;		/* The file being accessed */
 	struct address_space	*mapping;	/* The mapping being accessed */
+	struct kiocb		*iocb;		/* AIO completion vector */
 	struct netfs_cache_resources cache_resources;
 	struct list_head	proc_link;	/* Link in netfs_iorequests */
 	struct list_head	subrequests;	/* Contributory I/O operations */
@@ -249,12 +251,14 @@ struct netfs_io_request {
 	struct bio_vec		*direct_bv;	/* DIO buffer list (when handling iovec-iter) */
 	unsigned int		direct_bv_count; /* Number of elements in direct_bv[] */
 	unsigned int		debug_id;
+	unsigned int		rsize;		/* Maximum read size (0 for none) */
 	unsigned int		wsize;		/* Maximum write size (0 for none) */
 	unsigned int		subreq_counter;	/* Next subreq->debug_index */
 	atomic_t		nr_outstanding;	/* Number of ops in progress */
 	atomic_t		nr_copy_ops;	/* Number of copy-to-cache ops in progress */
 	size_t			submitted;	/* Amount submitted for I/O so far */
 	size_t			len;		/* Length of the request */
+	size_t			transferred;	/* Amount to be indicated as transferred */
 	short			error;		/* 0 or error that occurred */
 	enum netfs_io_origin	origin;		/* Origin of the request */
 	bool			direct_bv_unpin; /* T if direct_bv[] must be unpinned */
@@ -271,6 +275,8 @@ struct netfs_io_request {
 #define NETFS_RREQ_IN_PROGRESS		5	/* Unlocked when the request completes */
 #define NETFS_RREQ_WRITE_TO_CACHE	7	/* Need to write to the cache */
 #define NETFS_RREQ_UPLOAD_TO_SERVER	8	/* Need to write to the server */
+#define NETFS_RREQ_NONBLOCK		9	/* Don't block if possible (O_NONBLOCK) */
+#define NETFS_RREQ_BLOCKED		10	/* We blocked */
 	const struct netfs_request_ops *netfs_ops;
 	void (*cleanup)(struct netfs_io_request *req);
 };
@@ -367,6 +373,9 @@ struct netfs_cache_ops {
 			       loff_t *_data_start, size_t *_data_len);
 };
 
+/* High-level read API. */
+ssize_t netfs_unbuffered_read_iter(struct kiocb *iocb, struct iov_iter *iter);
+
 /* High-level write API */
 ssize_t netfs_perform_write(struct kiocb *iocb, struct iov_iter *iter,
 			    struct netfs_group *netfs_group);
diff --git a/include/trace/events/netfs.h b/include/trace/events/netfs.h
index 082a5e717b58..5a4edadf0e59 100644
--- a/include/trace/events/netfs.h
+++ b/include/trace/events/netfs.h
@@ -16,6 +16,7 @@
  * Define enums for tracing information.
  */
 #define netfs_read_traces					\
+	EM(netfs_read_trace_dio_read,		"DIO-READ ")	\
 	EM(netfs_read_trace_expanded,		"EXPANDED ")	\
 	EM(netfs_read_trace_readahead,		"READAHEAD")	\
 	EM(netfs_read_trace_readpage,		"READPAGE ")	\
@@ -31,7 +32,8 @@
 	EM(NETFS_READAHEAD,			"RA")		\
 	EM(NETFS_READPAGE,			"RP")		\
 	EM(NETFS_READ_FOR_WRITE,		"RW")		\
-	E_(NETFS_WRITEBACK,			"WB")
+	EM(NETFS_WRITEBACK,			"WB")		\
+	E_(NETFS_DIO_READ,			"DR")
 
 #define netfs_rreq_traces					\
 	EM(netfs_rreq_trace_assess,		"ASSESS ")	\
@@ -70,6 +72,8 @@
 #define netfs_failures							\
 	EM(netfs_fail_check_write_begin,	"check-write-begin")	\
 	EM(netfs_fail_copy_to_cache,		"copy-to-cache")	\
+	EM(netfs_fail_dio_read_short,		"dio-read-short")	\
+	EM(netfs_fail_dio_read_zero,		"dio-read-zero")	\
 	EM(netfs_fail_read,			"read")			\
 	EM(netfs_fail_short_read,		"short-read")		\
 	EM(netfs_fail_prepare_write,		"prep-write")		\
@@ -81,6 +85,7 @@
 	EM(netfs_rreq_trace_put_complete,	"PUT COMPLT ")	\
 	EM(netfs_rreq_trace_put_discard,	"PUT DISCARD")	\
 	EM(netfs_rreq_trace_put_failed,		"PUT FAILED ")	\
+	EM(netfs_rreq_trace_put_no_submit,	"PUT NO-SUBM")	\
 	EM(netfs_rreq_trace_put_return,		"PUT RETURN ")	\
 	EM(netfs_rreq_trace_put_subreq,		"PUT SUBREQ ")	\
 	EM(netfs_rreq_trace_put_work,		"PUT WORK   ")	\
diff --git a/mm/filemap.c b/mm/filemap.c
index f1c8c278310f..1c5271ed0cc0 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -2678,6 +2678,7 @@ int kiocb_write_and_wait(struct kiocb *iocb, size_t count)
 
 	return filemap_write_and_wait_range(mapping, pos, end);
 }
+EXPORT_SYMBOL_GPL(kiocb_write_and_wait);
 
 int kiocb_invalidate_pages(struct kiocb *iocb, size_t count)
 {
-- 
cgit v1.2.3


From 153a9961b551101cd38e94e26cd92fbfd198b19b Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Mon, 21 Feb 2022 11:38:17 +0000
Subject: netfs: Implement unbuffered/DIO write support

Implement support for unbuffered writes and direct I/O writes.  If the
write is misaligned with respect to the fscrypt block size, then RMW cycles
are performed if necessary.  DIO writes are a special case of unbuffered
writes with extra restriction imposed, such as block size alignment
requirements.

Also provide a field that can tell the code to add some extra space onto
the bounce buffer for use by the filesystem in the case of a
content-encrypted file.

Signed-off-by: David Howells <dhowells@redhat.com>
Reviewed-by: Jeff Layton <jlayton@kernel.org>
cc: linux-cachefs@redhat.com
cc: linux-fsdevel@vger.kernel.org
cc: linux-mm@kvack.org
---
 fs/afs/inode.c               |   2 +-
 fs/netfs/Makefile            |   1 +
 fs/netfs/direct_write.c      | 166 +++++++++++++++++++++++++++++++++++++++++++
 fs/netfs/internal.h          |   6 ++
 fs/netfs/io.c                |   2 +-
 fs/netfs/main.c              |  12 ++--
 fs/netfs/objects.c           |   6 +-
 fs/netfs/output.c            |  30 ++++++++
 include/linux/netfs.h        |   4 ++
 include/trace/events/netfs.h |   4 +-
 mm/filemap.c                 |   1 +
 11 files changed, 224 insertions(+), 10 deletions(-)
 create mode 100644 fs/netfs/direct_write.c

(limited to 'include/linux')

diff --git a/fs/afs/inode.c b/fs/afs/inode.c
index 6f375f0cf650..37485ae31471 100644
--- a/fs/afs/inode.c
+++ b/fs/afs/inode.c
@@ -250,7 +250,7 @@ static void afs_apply_status(struct afs_operation *op,
 		 * what's on the server.
 		 */
 		vnode->netfs.remote_i_size = status->size;
-		if (change_size) {
+		if (change_size || status->size > i_size_read(inode)) {
 			afs_set_i_size(vnode, status->size);
 			inode_set_ctime_to_ts(inode, t);
 			inode_set_atime_to_ts(inode, t);
diff --git a/fs/netfs/Makefile b/fs/netfs/Makefile
index e968ab1eca40..d4d1d799819e 100644
--- a/fs/netfs/Makefile
+++ b/fs/netfs/Makefile
@@ -4,6 +4,7 @@ netfs-y := \
 	buffered_read.o \
 	buffered_write.o \
 	direct_read.o \
+	direct_write.o \
 	io.o \
 	iterator.o \
 	locking.o \
diff --git a/fs/netfs/direct_write.c b/fs/netfs/direct_write.c
new file mode 100644
index 000000000000..bb0c2718f57b
--- /dev/null
+++ b/fs/netfs/direct_write.c
@@ -0,0 +1,166 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/* Unbuffered and direct write support.
+ *
+ * Copyright (C) 2023 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ */
+
+#include <linux/export.h>
+#include <linux/uio.h>
+#include "internal.h"
+
+static void netfs_cleanup_dio_write(struct netfs_io_request *wreq)
+{
+	struct inode *inode = wreq->inode;
+	unsigned long long end = wreq->start + wreq->len;
+
+	if (!wreq->error &&
+	    i_size_read(inode) < end) {
+		if (wreq->netfs_ops->update_i_size)
+			wreq->netfs_ops->update_i_size(inode, end);
+		else
+			i_size_write(inode, end);
+	}
+}
+
+/*
+ * Perform an unbuffered write where we may have to do an RMW operation on an
+ * encrypted file.  This can also be used for direct I/O writes.
+ */
+ssize_t netfs_unbuffered_write_iter_locked(struct kiocb *iocb, struct iov_iter *iter,
+					   struct netfs_group *netfs_group)
+{
+	struct netfs_io_request *wreq;
+	unsigned long long start = iocb->ki_pos;
+	unsigned long long end = start + iov_iter_count(iter);
+	ssize_t ret, n;
+	bool async = !is_sync_kiocb(iocb);
+
+	_enter("");
+
+	/* We're going to need a bounce buffer if what we transmit is going to
+	 * be different in some way to the source buffer, e.g. because it gets
+	 * encrypted/compressed or because it needs expanding to a block size.
+	 */
+	// TODO
+
+	_debug("uw %llx-%llx", start, end);
+
+	wreq = netfs_alloc_request(iocb->ki_filp->f_mapping, iocb->ki_filp,
+				   start, end - start,
+				   iocb->ki_flags & IOCB_DIRECT ?
+				   NETFS_DIO_WRITE : NETFS_UNBUFFERED_WRITE);
+	if (IS_ERR(wreq))
+		return PTR_ERR(wreq);
+
+	{
+		/* If this is an async op and we're not using a bounce buffer,
+		 * we have to save the source buffer as the iterator is only
+		 * good until we return.  In such a case, extract an iterator
+		 * to represent as much of the the output buffer as we can
+		 * manage.  Note that the extraction might not be able to
+		 * allocate a sufficiently large bvec array and may shorten the
+		 * request.
+		 */
+		if (async || user_backed_iter(iter)) {
+			n = netfs_extract_user_iter(iter, wreq->len, &wreq->iter, 0);
+			if (n < 0) {
+				ret = n;
+				goto out;
+			}
+			wreq->direct_bv = (struct bio_vec *)wreq->iter.bvec;
+			wreq->direct_bv_count = n;
+			wreq->direct_bv_unpin = iov_iter_extract_will_pin(iter);
+			wreq->len = iov_iter_count(&wreq->iter);
+		} else {
+			wreq->iter = *iter;
+		}
+
+		wreq->io_iter = wreq->iter;
+	}
+
+	/* Copy the data into the bounce buffer and encrypt it. */
+	// TODO
+
+	/* Dispatch the write. */
+	__set_bit(NETFS_RREQ_UPLOAD_TO_SERVER, &wreq->flags);
+	if (async)
+		wreq->iocb = iocb;
+	wreq->cleanup = netfs_cleanup_dio_write;
+	ret = netfs_begin_write(wreq, is_sync_kiocb(iocb),
+				iocb->ki_flags & IOCB_DIRECT ?
+				netfs_write_trace_dio_write :
+				netfs_write_trace_unbuffered_write);
+	if (ret < 0) {
+		_debug("begin = %zd", ret);
+		goto out;
+	}
+
+	if (!async) {
+		trace_netfs_rreq(wreq, netfs_rreq_trace_wait_ip);
+		wait_on_bit(&wreq->flags, NETFS_RREQ_IN_PROGRESS,
+			    TASK_UNINTERRUPTIBLE);
+
+		ret = wreq->error;
+		_debug("waited = %zd", ret);
+		if (ret == 0) {
+			ret = wreq->transferred;
+			iocb->ki_pos += ret;
+		}
+	} else {
+		ret = -EIOCBQUEUED;
+	}
+
+out:
+	netfs_put_request(wreq, false, netfs_rreq_trace_put_return);
+	return ret;
+}
+
+/**
+ * netfs_unbuffered_write_iter - Unbuffered write to a file
+ * @iocb: IO state structure
+ * @from: iov_iter with data to write
+ *
+ * Do an unbuffered write to a file, writing the data directly to the server
+ * and not lodging the data in the pagecache.
+ *
+ * Return:
+ * * Negative error code if no data has been written at all of
+ *   vfs_fsync_range() failed for a synchronous write
+ * * Number of bytes written, even for truncated writes
+ */
+ssize_t netfs_unbuffered_write_iter(struct kiocb *iocb, struct iov_iter *from)
+{
+	struct file *file = iocb->ki_filp;
+	struct inode *inode = file->f_mapping->host;
+	struct netfs_inode *ictx = netfs_inode(inode);
+	ssize_t ret;
+
+	_enter("%llx,%zx,%llx", iocb->ki_pos, iov_iter_count(from), i_size_read(inode));
+
+	trace_netfs_write_iter(iocb, from);
+
+	ret = netfs_start_io_direct(inode);
+	if (ret < 0)
+		return ret;
+	ret = generic_write_checks(iocb, from);
+	if (ret < 0)
+		goto out;
+	ret = file_remove_privs(file);
+	if (ret < 0)
+		goto out;
+	ret = file_update_time(file);
+	if (ret < 0)
+		goto out;
+	ret = kiocb_invalidate_pages(iocb, iov_iter_count(from));
+	if (ret < 0)
+		goto out;
+
+	fscache_invalidate(netfs_i_cookie(ictx), NULL, i_size_read(inode),
+			   FSCACHE_INVAL_DIO_WRITE);
+	ret = netfs_unbuffered_write_iter_locked(iocb, from, NULL);
+out:
+	netfs_end_io_direct(inode);
+	return ret;
+}
+EXPORT_SYMBOL(netfs_unbuffered_write_iter);
diff --git a/fs/netfs/internal.h b/fs/netfs/internal.h
index 886c2e8f841f..2de4f826dbe4 100644
--- a/fs/netfs/internal.h
+++ b/fs/netfs/internal.h
@@ -26,6 +26,12 @@ void netfs_rreq_unlock_folios(struct netfs_io_request *rreq);
 int netfs_prefetch_for_write(struct file *file, struct folio *folio,
 			     size_t offset, size_t len);
 
+/*
+ * direct_write.c
+ */
+ssize_t netfs_unbuffered_write_iter_locked(struct kiocb *iocb, struct iov_iter *iter,
+					   struct netfs_group *netfs_group);
+
 /*
  * io.c
  */
diff --git a/fs/netfs/io.c b/fs/netfs/io.c
index c972415c8aad..01c7ff27228e 100644
--- a/fs/netfs/io.c
+++ b/fs/netfs/io.c
@@ -645,7 +645,7 @@ static bool netfs_rreq_submit_slice(struct netfs_io_request *rreq,
 
 	subreq->debug_index	= (*_debug_index)++;
 	subreq->start		= rreq->start + rreq->submitted;
-	subreq->len		= rreq->len   - rreq->submitted;
+	subreq->len		= io_iter->count;
 
 	_debug("slice %llx,%zx,%zx", subreq->start, subreq->len, rreq->submitted);
 	list_add_tail(&subreq->rreq_link, &rreq->subrequests);
diff --git a/fs/netfs/main.c b/fs/netfs/main.c
index abb8857486ee..8e4db9ff40c4 100644
--- a/fs/netfs/main.c
+++ b/fs/netfs/main.c
@@ -26,11 +26,13 @@ LIST_HEAD(netfs_io_requests);
 DEFINE_SPINLOCK(netfs_proc_lock);
 
 static const char *netfs_origins[nr__netfs_io_origin] = {
-	[NETFS_READAHEAD]	= "RA",
-	[NETFS_READPAGE]	= "RP",
-	[NETFS_READ_FOR_WRITE]	= "RW",
-	[NETFS_WRITEBACK]	= "WB",
-	[NETFS_DIO_READ]	= "DR",
+	[NETFS_READAHEAD]		= "RA",
+	[NETFS_READPAGE]		= "RP",
+	[NETFS_READ_FOR_WRITE]		= "RW",
+	[NETFS_WRITEBACK]		= "WB",
+	[NETFS_UNBUFFERED_WRITE]	= "UW",
+	[NETFS_DIO_READ]		= "DR",
+	[NETFS_DIO_WRITE]		= "DW",
 };
 
 /*
diff --git a/fs/netfs/objects.c b/fs/netfs/objects.c
index 7153f24e8034..93f1d7431199 100644
--- a/fs/netfs/objects.c
+++ b/fs/netfs/objects.c
@@ -20,8 +20,10 @@ struct netfs_io_request *netfs_alloc_request(struct address_space *mapping,
 	struct inode *inode = file ? file_inode(file) : mapping->host;
 	struct netfs_inode *ctx = netfs_inode(inode);
 	struct netfs_io_request *rreq;
-	bool is_dio = (origin == NETFS_DIO_READ);
-	bool cached = is_dio && netfs_is_cache_enabled(ctx);
+	bool is_unbuffered = (origin == NETFS_UNBUFFERED_WRITE ||
+			      origin == NETFS_DIO_READ ||
+			      origin == NETFS_DIO_WRITE);
+	bool cached = !is_unbuffered && netfs_is_cache_enabled(ctx);
 	int ret;
 
 	rreq = kzalloc(ctx->ops->io_request_size ?: sizeof(struct netfs_io_request),
diff --git a/fs/netfs/output.c b/fs/netfs/output.c
index 2ad0fd8c32be..560cbcea0c0a 100644
--- a/fs/netfs/output.c
+++ b/fs/netfs/output.c
@@ -74,11 +74,21 @@ static void netfs_write_terminated(struct netfs_io_request *wreq, bool was_async
 {
 	struct netfs_io_subrequest *subreq;
 	struct netfs_inode *ctx = netfs_inode(wreq->inode);
+	size_t transferred = 0;
 
 	_enter("R=%x[]", wreq->debug_id);
 
 	trace_netfs_rreq(wreq, netfs_rreq_trace_write_done);
 
+	list_for_each_entry(subreq, &wreq->subrequests, rreq_link) {
+		if (subreq->error || subreq->transferred == 0)
+			break;
+		transferred += subreq->transferred;
+		if (subreq->transferred < subreq->len)
+			break;
+	}
+	wreq->transferred = transferred;
+
 	list_for_each_entry(subreq, &wreq->subrequests, rreq_link) {
 		if (!subreq->error)
 			continue;
@@ -110,11 +120,28 @@ static void netfs_write_terminated(struct netfs_io_request *wreq, bool was_async
 
 	wreq->cleanup(wreq);
 
+	if (wreq->origin == NETFS_DIO_WRITE &&
+	    wreq->mapping->nrpages) {
+		pgoff_t first = wreq->start >> PAGE_SHIFT;
+		pgoff_t last = (wreq->start + wreq->transferred - 1) >> PAGE_SHIFT;
+		invalidate_inode_pages2_range(wreq->mapping, first, last);
+	}
+
+	if (wreq->origin == NETFS_DIO_WRITE)
+		inode_dio_end(wreq->inode);
+
 	_debug("finished");
 	trace_netfs_rreq(wreq, netfs_rreq_trace_wake_ip);
 	clear_bit_unlock(NETFS_RREQ_IN_PROGRESS, &wreq->flags);
 	wake_up_bit(&wreq->flags, NETFS_RREQ_IN_PROGRESS);
 
+	if (wreq->iocb) {
+		wreq->iocb->ki_pos += transferred;
+		if (wreq->iocb->ki_complete)
+			wreq->iocb->ki_complete(
+				wreq->iocb, wreq->error ? wreq->error : transferred);
+	}
+
 	netfs_clear_subrequests(wreq, was_async);
 	netfs_put_request(wreq, was_async, netfs_rreq_trace_put_complete);
 }
@@ -329,6 +356,9 @@ int netfs_begin_write(struct netfs_io_request *wreq, bool may_wait,
 		return -EIO;
 	}
 
+	if (wreq->origin == NETFS_DIO_WRITE)
+		inode_dio_begin(wreq->inode);
+
 	wreq->io_iter = wreq->iter;
 
 	/* ->outstanding > 0 carries a ref */
diff --git a/include/linux/netfs.h b/include/linux/netfs.h
index 7c1309568459..e1dfd6775c2c 100644
--- a/include/linux/netfs.h
+++ b/include/linux/netfs.h
@@ -138,6 +138,7 @@ struct netfs_inode {
 	loff_t			remote_i_size;	/* Size of the remote file */
 	unsigned long		flags;
 #define NETFS_ICTX_ODIRECT	0		/* The file has DIO in progress */
+#define NETFS_ICTX_UNBUFFERED	1		/* I/O should not use the pagecache */
 };
 
 /*
@@ -226,7 +227,9 @@ enum netfs_io_origin {
 	NETFS_READPAGE,			/* This read is a synchronous read */
 	NETFS_READ_FOR_WRITE,		/* This read is to prepare a write */
 	NETFS_WRITEBACK,		/* This write was triggered by writepages */
+	NETFS_UNBUFFERED_WRITE,		/* This is an unbuffered write */
 	NETFS_DIO_READ,			/* This is a direct I/O read */
+	NETFS_DIO_WRITE,		/* This is a direct I/O write */
 	nr__netfs_io_origin
 } __mode(byte);
 
@@ -379,6 +382,7 @@ ssize_t netfs_unbuffered_read_iter(struct kiocb *iocb, struct iov_iter *iter);
 /* High-level write API */
 ssize_t netfs_perform_write(struct kiocb *iocb, struct iov_iter *iter,
 			    struct netfs_group *netfs_group);
+ssize_t netfs_unbuffered_write_iter(struct kiocb *iocb, struct iov_iter *from);
 
 /* Address operations API */
 struct readahead_control;
diff --git a/include/trace/events/netfs.h b/include/trace/events/netfs.h
index 5a4edadf0e59..914a24b03d08 100644
--- a/include/trace/events/netfs.h
+++ b/include/trace/events/netfs.h
@@ -33,7 +33,9 @@
 	EM(NETFS_READPAGE,			"RP")		\
 	EM(NETFS_READ_FOR_WRITE,		"RW")		\
 	EM(NETFS_WRITEBACK,			"WB")		\
-	E_(NETFS_DIO_READ,			"DR")
+	EM(NETFS_UNBUFFERED_WRITE,		"UW")		\
+	EM(NETFS_DIO_READ,			"DR")		\
+	E_(NETFS_DIO_WRITE,			"DW")
 
 #define netfs_rreq_traces					\
 	EM(netfs_rreq_trace_assess,		"ASSESS ")	\
diff --git a/mm/filemap.c b/mm/filemap.c
index 1c5271ed0cc0..73626eb323f3 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -2706,6 +2706,7 @@ int kiocb_invalidate_pages(struct kiocb *iocb, size_t count)
 	return invalidate_inode_pages2_range(mapping, pos >> PAGE_SHIFT,
 					     end >> PAGE_SHIFT);
 }
+EXPORT_SYMBOL_GPL(kiocb_invalidate_pages);
 
 /**
  * generic_file_read_iter - generic filesystem read routine
-- 
cgit v1.2.3


From 938e13a73b244278a3777f38fa915bd239b2efd2 Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Thu, 17 Jun 2021 13:09:21 +0100
Subject: netfs: Implement buffered write API

Institute a netfs write helper, netfs_file_write_iter(), to be pointed at
by the network filesystem ->write_iter() call.  Make it handled buffered
writes by calling the previously defined netfs_perform_write() to copy the
source data into the pagecache.

Signed-off-by: David Howells <dhowells@redhat.com>
Reviewed-by: Jeff Layton <jlayton@kernel.org>
cc: linux-cachefs@redhat.com
cc: linux-fsdevel@vger.kernel.org
cc: linux-mm@kvack.org
---
 fs/netfs/buffered_write.c | 83 +++++++++++++++++++++++++++++++++++++++++++++++
 include/linux/netfs.h     |  3 ++
 2 files changed, 86 insertions(+)

(limited to 'include/linux')

diff --git a/fs/netfs/buffered_write.c b/fs/netfs/buffered_write.c
index b76688e98f81..f244123ab568 100644
--- a/fs/netfs/buffered_write.c
+++ b/fs/netfs/buffered_write.c
@@ -333,3 +333,86 @@ error_folio_unlock:
 	goto out;
 }
 EXPORT_SYMBOL(netfs_perform_write);
+
+/**
+ * netfs_buffered_write_iter_locked - write data to a file
+ * @iocb:	IO state structure (file, offset, etc.)
+ * @from:	iov_iter with data to write
+ * @netfs_group: Grouping for dirty pages (eg. ceph snaps).
+ *
+ * This function does all the work needed for actually writing data to a
+ * file. It does all basic checks, removes SUID from the file, updates
+ * modification times and calls proper subroutines depending on whether we
+ * do direct IO or a standard buffered write.
+ *
+ * The caller must hold appropriate locks around this function and have called
+ * generic_write_checks() already.  The caller is also responsible for doing
+ * any necessary syncing afterwards.
+ *
+ * This function does *not* take care of syncing data in case of O_SYNC write.
+ * A caller has to handle it. This is mainly due to the fact that we want to
+ * avoid syncing under i_rwsem.
+ *
+ * Return:
+ * * number of bytes written, even for truncated writes
+ * * negative error code if no data has been written at all
+ */
+ssize_t netfs_buffered_write_iter_locked(struct kiocb *iocb, struct iov_iter *from,
+					 struct netfs_group *netfs_group)
+{
+	struct file *file = iocb->ki_filp;
+	ssize_t ret;
+
+	trace_netfs_write_iter(iocb, from);
+
+	ret = file_remove_privs(file);
+	if (ret)
+		return ret;
+
+	ret = file_update_time(file);
+	if (ret)
+		return ret;
+
+	return netfs_perform_write(iocb, from, netfs_group);
+}
+EXPORT_SYMBOL(netfs_buffered_write_iter_locked);
+
+/**
+ * netfs_file_write_iter - write data to a file
+ * @iocb: IO state structure
+ * @from: iov_iter with data to write
+ *
+ * Perform a write to a file, writing into the pagecache if possible and doing
+ * an unbuffered write instead if not.
+ *
+ * Return:
+ * * Negative error code if no data has been written at all of
+ *   vfs_fsync_range() failed for a synchronous write
+ * * Number of bytes written, even for truncated writes
+ */
+ssize_t netfs_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
+{
+	struct file *file = iocb->ki_filp;
+	struct inode *inode = file->f_mapping->host;
+	struct netfs_inode *ictx = netfs_inode(inode);
+	ssize_t ret;
+
+	_enter("%llx,%zx,%llx", iocb->ki_pos, iov_iter_count(from), i_size_read(inode));
+
+	if ((iocb->ki_flags & IOCB_DIRECT) ||
+	    test_bit(NETFS_ICTX_UNBUFFERED, &ictx->flags))
+		return netfs_unbuffered_write_iter(iocb, from);
+
+	ret = netfs_start_io_write(inode);
+	if (ret < 0)
+		return ret;
+
+	ret = generic_write_checks(iocb, from);
+	if (ret > 0)
+		ret = netfs_buffered_write_iter_locked(iocb, from, NULL);
+	netfs_end_io_write(inode);
+	if (ret > 0)
+		ret = generic_write_sync(iocb, ret);
+	return ret;
+}
+EXPORT_SYMBOL(netfs_file_write_iter);
diff --git a/include/linux/netfs.h b/include/linux/netfs.h
index e1dfd6775c2c..0948ecf69aa5 100644
--- a/include/linux/netfs.h
+++ b/include/linux/netfs.h
@@ -382,7 +382,10 @@ ssize_t netfs_unbuffered_read_iter(struct kiocb *iocb, struct iov_iter *iter);
 /* High-level write API */
 ssize_t netfs_perform_write(struct kiocb *iocb, struct iov_iter *iter,
 			    struct netfs_group *netfs_group);
+ssize_t netfs_buffered_write_iter_locked(struct kiocb *iocb, struct iov_iter *from,
+					 struct netfs_group *netfs_group);
 ssize_t netfs_unbuffered_write_iter(struct kiocb *iocb, struct iov_iter *from);
+ssize_t netfs_file_write_iter(struct kiocb *iocb, struct iov_iter *from);
 
 /* Address operations API */
 struct readahead_control;
-- 
cgit v1.2.3


From 102a7e2c598c22bd2621fa97eb1c93c89d469a12 Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Tue, 15 Feb 2022 23:15:57 +0000
Subject: netfs: Allow buffered shared-writeable mmap through
 netfs_page_mkwrite()

Provide an entry point to delegate a filesystem's ->page_mkwrite() to.
This checks for conflicting writes, then attached any netfs-specific group
marking (e.g. ceph snap) to the page to be considered dirty.

Signed-off-by: David Howells <dhowells@redhat.com>
Reviewed-by: Jeff Layton <jlayton@kernel.org>
cc: linux-cachefs@redhat.com
cc: linux-fsdevel@vger.kernel.org
cc: linux-mm@kvack.org
---
 fs/netfs/buffered_write.c | 59 +++++++++++++++++++++++++++++++++++++++++++++++
 include/linux/netfs.h     |  4 ++++
 2 files changed, 63 insertions(+)

(limited to 'include/linux')

diff --git a/fs/netfs/buffered_write.c b/fs/netfs/buffered_write.c
index f244123ab568..70cb8e98d068 100644
--- a/fs/netfs/buffered_write.c
+++ b/fs/netfs/buffered_write.c
@@ -416,3 +416,62 @@ ssize_t netfs_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
 	return ret;
 }
 EXPORT_SYMBOL(netfs_file_write_iter);
+
+/*
+ * Notification that a previously read-only page is about to become writable.
+ * Note that the caller indicates a single page of a multipage folio.
+ */
+vm_fault_t netfs_page_mkwrite(struct vm_fault *vmf, struct netfs_group *netfs_group)
+{
+	struct folio *folio = page_folio(vmf->page);
+	struct file *file = vmf->vma->vm_file;
+	struct inode *inode = file_inode(file);
+	vm_fault_t ret = VM_FAULT_RETRY;
+	int err;
+
+	_enter("%lx", folio->index);
+
+	sb_start_pagefault(inode->i_sb);
+
+	if (folio_wait_writeback_killable(folio))
+		goto out;
+
+	if (folio_lock_killable(folio) < 0)
+		goto out;
+
+	/* Can we see a streaming write here? */
+	if (WARN_ON(!folio_test_uptodate(folio))) {
+		ret = VM_FAULT_SIGBUS | VM_FAULT_LOCKED;
+		goto out;
+	}
+
+	if (netfs_folio_group(folio) != netfs_group) {
+		folio_unlock(folio);
+		err = filemap_fdatawait_range(inode->i_mapping,
+					      folio_pos(folio),
+					      folio_pos(folio) + folio_size(folio));
+		switch (err) {
+		case 0:
+			ret = VM_FAULT_RETRY;
+			goto out;
+		case -ENOMEM:
+			ret = VM_FAULT_OOM;
+			goto out;
+		default:
+			ret = VM_FAULT_SIGBUS;
+			goto out;
+		}
+	}
+
+	if (folio_test_dirty(folio))
+		trace_netfs_folio(folio, netfs_folio_trace_mkwrite_plus);
+	else
+		trace_netfs_folio(folio, netfs_folio_trace_mkwrite);
+	netfs_set_group(folio, netfs_group);
+	file_update_time(file);
+	ret = VM_FAULT_LOCKED;
+out:
+	sb_end_pagefault(inode->i_sb);
+	return ret;
+}
+EXPORT_SYMBOL(netfs_page_mkwrite);
diff --git a/include/linux/netfs.h b/include/linux/netfs.h
index 0948ecf69aa5..d7f324c7c22a 100644
--- a/include/linux/netfs.h
+++ b/include/linux/netfs.h
@@ -400,6 +400,10 @@ void netfs_clear_inode_writeback(struct inode *inode, const void *aux);
 void netfs_invalidate_folio(struct folio *folio, size_t offset, size_t length);
 bool netfs_release_folio(struct folio *folio, gfp_t gfp);
 
+/* VMA operations API. */
+vm_fault_t netfs_page_mkwrite(struct vm_fault *vmf, struct netfs_group *netfs_group);
+
+/* (Sub)request management API. */
 void netfs_subreq_terminated(struct netfs_io_subrequest *, ssize_t, bool);
 void netfs_get_subrequest(struct netfs_io_subrequest *subreq,
 			  enum netfs_sreq_ref_trace what);
-- 
cgit v1.2.3


From 80645bd4aa33a5c325f11b8dc6b38b38410ad5c0 Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Wed, 11 Oct 2023 09:29:43 +0100
Subject: netfs: Provide netfs_file_read_iter()

Provide a top-level-ish function that can be pointed to directly by
->read_iter file op.

Signed-off-by: David Howells <dhowells@redhat.com>
Reviewed-by: Jeff Layton <jlayton@kernel.org>
cc: linux-cachefs@redhat.com
cc: linux-fsdevel@vger.kernel.org
cc: linux-mm@kvack.org
---
 fs/netfs/buffered_read.c | 73 ++++++++++++++++++++++++++++++++++++++++++++++++
 include/linux/netfs.h    |  2 ++
 2 files changed, 75 insertions(+)

(limited to 'include/linux')

diff --git a/fs/netfs/buffered_read.c b/fs/netfs/buffered_read.c
index 950f63fc156a..a59e7b2edaac 100644
--- a/fs/netfs/buffered_read.c
+++ b/fs/netfs/buffered_read.c
@@ -558,3 +558,76 @@ error:
 	_leave(" = %d", ret);
 	return ret;
 }
+
+/**
+ * netfs_buffered_read_iter - Filesystem buffered I/O read routine
+ * @iocb: kernel I/O control block
+ * @iter: destination for the data read
+ *
+ * This is the ->read_iter() routine for all filesystems that can use the page
+ * cache directly.
+ *
+ * The IOCB_NOWAIT flag in iocb->ki_flags indicates that -EAGAIN shall be
+ * returned when no data can be read without waiting for I/O requests to
+ * complete; it doesn't prevent readahead.
+ *
+ * The IOCB_NOIO flag in iocb->ki_flags indicates that no new I/O requests
+ * shall be made for the read or for readahead.  When no data can be read,
+ * -EAGAIN shall be returned.  When readahead would be triggered, a partial,
+ * possibly empty read shall be returned.
+ *
+ * Return:
+ * * number of bytes copied, even for partial reads
+ * * negative error code (or 0 if IOCB_NOIO) if nothing was read
+ */
+ssize_t netfs_buffered_read_iter(struct kiocb *iocb, struct iov_iter *iter)
+{
+	struct inode *inode = file_inode(iocb->ki_filp);
+	struct netfs_inode *ictx = netfs_inode(inode);
+	ssize_t ret;
+
+	if (WARN_ON_ONCE((iocb->ki_flags & IOCB_DIRECT) ||
+			 test_bit(NETFS_ICTX_UNBUFFERED, &ictx->flags)))
+		return -EINVAL;
+
+	ret = netfs_start_io_read(inode);
+	if (ret == 0) {
+		ret = filemap_read(iocb, iter, 0);
+		netfs_end_io_read(inode);
+	}
+	return ret;
+}
+EXPORT_SYMBOL(netfs_buffered_read_iter);
+
+/**
+ * netfs_file_read_iter - Generic filesystem read routine
+ * @iocb: kernel I/O control block
+ * @iter: destination for the data read
+ *
+ * This is the ->read_iter() routine for all filesystems that can use the page
+ * cache directly.
+ *
+ * The IOCB_NOWAIT flag in iocb->ki_flags indicates that -EAGAIN shall be
+ * returned when no data can be read without waiting for I/O requests to
+ * complete; it doesn't prevent readahead.
+ *
+ * The IOCB_NOIO flag in iocb->ki_flags indicates that no new I/O requests
+ * shall be made for the read or for readahead.  When no data can be read,
+ * -EAGAIN shall be returned.  When readahead would be triggered, a partial,
+ * possibly empty read shall be returned.
+ *
+ * Return:
+ * * number of bytes copied, even for partial reads
+ * * negative error code (or 0 if IOCB_NOIO) if nothing was read
+ */
+ssize_t netfs_file_read_iter(struct kiocb *iocb, struct iov_iter *iter)
+{
+	struct netfs_inode *ictx = netfs_inode(iocb->ki_filp->f_mapping->host);
+
+	if ((iocb->ki_flags & IOCB_DIRECT) ||
+	    test_bit(NETFS_ICTX_UNBUFFERED, &ictx->flags))
+		return netfs_unbuffered_read_iter(iocb, iter);
+
+	return netfs_buffered_read_iter(iocb, iter);
+}
+EXPORT_SYMBOL(netfs_file_read_iter);
diff --git a/include/linux/netfs.h b/include/linux/netfs.h
index d7f324c7c22a..19a41c437af3 100644
--- a/include/linux/netfs.h
+++ b/include/linux/netfs.h
@@ -378,6 +378,8 @@ struct netfs_cache_ops {
 
 /* High-level read API. */
 ssize_t netfs_unbuffered_read_iter(struct kiocb *iocb, struct iov_iter *iter);
+ssize_t netfs_buffered_read_iter(struct kiocb *iocb, struct iov_iter *iter);
+ssize_t netfs_file_read_iter(struct kiocb *iocb, struct iov_iter *iter);
 
 /* High-level write API */
 ssize_t netfs_perform_write(struct kiocb *iocb, struct iov_iter *iter,
-- 
cgit v1.2.3


From e0ace6ca98bef0d8d354040f13ffc0a498813ee9 Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Wed, 22 Nov 2023 17:18:17 +0000
Subject: netfs, cachefiles: Pass upper bound length to allow expansion

Make netfslib pass the maximum length to the ->prepare_write() op to tell
the cache how much it can expand the length of a write to.  This allows a
write to the server at the end of a file to be limited to a few bytes
whilst writing an entire block to the cache (something required by direct
I/O).

Signed-off-by: David Howells <dhowells@redhat.com>
Reviewed-by: Jeff Layton <jlayton@kernel.org>
cc: linux-cachefs@redhat.com
cc: linux-fsdevel@vger.kernel.org
cc: linux-mm@kvack.org
---
 fs/cachefiles/internal.h |  2 +-
 fs/cachefiles/io.c       | 10 ++++++----
 fs/cachefiles/ondemand.c |  2 +-
 fs/netfs/fscache_io.c    |  2 +-
 fs/netfs/io.c            |  2 +-
 fs/netfs/objects.c       |  1 +
 fs/netfs/output.c        | 25 ++++++++++---------------
 fs/smb/client/fscache.c  |  2 +-
 include/linux/netfs.h    |  5 +++--
 9 files changed, 25 insertions(+), 26 deletions(-)

(limited to 'include/linux')

diff --git a/fs/cachefiles/internal.h b/fs/cachefiles/internal.h
index 2ad58c465208..1af48d576a34 100644
--- a/fs/cachefiles/internal.h
+++ b/fs/cachefiles/internal.h
@@ -233,7 +233,7 @@ extern bool cachefiles_begin_operation(struct netfs_cache_resources *cres,
 				       enum fscache_want_state want_state);
 extern int __cachefiles_prepare_write(struct cachefiles_object *object,
 				      struct file *file,
-				      loff_t *_start, size_t *_len,
+				      loff_t *_start, size_t *_len, size_t upper_len,
 				      bool no_space_allocated_yet);
 extern int __cachefiles_write(struct cachefiles_object *object,
 			      struct file *file,
diff --git a/fs/cachefiles/io.c b/fs/cachefiles/io.c
index 009d23cd435b..bffffedce4a9 100644
--- a/fs/cachefiles/io.c
+++ b/fs/cachefiles/io.c
@@ -518,7 +518,7 @@ cachefiles_prepare_ondemand_read(struct netfs_cache_resources *cres,
  */
 int __cachefiles_prepare_write(struct cachefiles_object *object,
 			       struct file *file,
-			       loff_t *_start, size_t *_len,
+			       loff_t *_start, size_t *_len, size_t upper_len,
 			       bool no_space_allocated_yet)
 {
 	struct cachefiles_cache *cache = object->volume->cache;
@@ -530,6 +530,8 @@ int __cachefiles_prepare_write(struct cachefiles_object *object,
 	down = start - round_down(start, PAGE_SIZE);
 	*_start = start - down;
 	*_len = round_up(down + len, PAGE_SIZE);
+	if (down < start || *_len > upper_len)
+		return -ENOBUFS;
 
 	/* We need to work out whether there's sufficient disk space to perform
 	 * the write - but we can skip that check if we have space already
@@ -592,8 +594,8 @@ check_space:
 }
 
 static int cachefiles_prepare_write(struct netfs_cache_resources *cres,
-				    loff_t *_start, size_t *_len, loff_t i_size,
-				    bool no_space_allocated_yet)
+				    loff_t *_start, size_t *_len, size_t upper_len,
+				    loff_t i_size, bool no_space_allocated_yet)
 {
 	struct cachefiles_object *object = cachefiles_cres_object(cres);
 	struct cachefiles_cache *cache = object->volume->cache;
@@ -609,7 +611,7 @@ static int cachefiles_prepare_write(struct netfs_cache_resources *cres,
 
 	cachefiles_begin_secure(cache, &saved_cred);
 	ret = __cachefiles_prepare_write(object, cachefiles_cres_file(cres),
-					 _start, _len,
+					 _start, _len, upper_len,
 					 no_space_allocated_yet);
 	cachefiles_end_secure(cache, saved_cred);
 	return ret;
diff --git a/fs/cachefiles/ondemand.c b/fs/cachefiles/ondemand.c
index 0254ed39f68c..9301d1eb0504 100644
--- a/fs/cachefiles/ondemand.c
+++ b/fs/cachefiles/ondemand.c
@@ -52,7 +52,7 @@ static ssize_t cachefiles_ondemand_fd_write_iter(struct kiocb *kiocb,
 		return -ENOBUFS;
 
 	cachefiles_begin_secure(cache, &saved_cred);
-	ret = __cachefiles_prepare_write(object, file, &pos, &len, true);
+	ret = __cachefiles_prepare_write(object, file, &pos, &len, len, true);
 	cachefiles_end_secure(cache, saved_cred);
 	if (ret < 0)
 		return ret;
diff --git a/fs/netfs/fscache_io.c b/fs/netfs/fscache_io.c
index 79171a687930..ad572f7ee897 100644
--- a/fs/netfs/fscache_io.c
+++ b/fs/netfs/fscache_io.c
@@ -237,7 +237,7 @@ void __fscache_write_to_cache(struct fscache_cookie *cookie,
 				    fscache_access_io_write) < 0)
 		goto abandon_free;
 
-	ret = cres->ops->prepare_write(cres, &start, &len, i_size, false);
+	ret = cres->ops->prepare_write(cres, &start, &len, len, i_size, false);
 	if (ret < 0)
 		goto abandon_end;
 
diff --git a/fs/netfs/io.c b/fs/netfs/io.c
index 01c7ff27228e..14c18be5aca0 100644
--- a/fs/netfs/io.c
+++ b/fs/netfs/io.c
@@ -199,7 +199,7 @@ static void netfs_rreq_do_write_to_cache(struct netfs_io_request *rreq)
 		}
 
 		ret = cres->ops->prepare_write(cres, &subreq->start, &subreq->len,
-					       rreq->i_size, true);
+					       subreq->len, rreq->i_size, true);
 		if (ret < 0) {
 			trace_netfs_failure(rreq, subreq, ret, netfs_fail_prepare_write);
 			trace_netfs_sreq(subreq, netfs_sreq_trace_write_skip);
diff --git a/fs/netfs/objects.c b/fs/netfs/objects.c
index 93f1d7431199..b4e3bd836e5d 100644
--- a/fs/netfs/objects.c
+++ b/fs/netfs/objects.c
@@ -33,6 +33,7 @@ struct netfs_io_request *netfs_alloc_request(struct address_space *mapping,
 
 	rreq->start	= start;
 	rreq->len	= len;
+	rreq->upper_len	= len;
 	rreq->origin	= origin;
 	rreq->netfs_ops	= ctx->ops;
 	rreq->mapping	= mapping;
diff --git a/fs/netfs/output.c b/fs/netfs/output.c
index 560cbcea0c0a..cc9065733b42 100644
--- a/fs/netfs/output.c
+++ b/fs/netfs/output.c
@@ -280,7 +280,7 @@ EXPORT_SYMBOL(netfs_queue_write_request);
  */
 static void netfs_set_up_write_to_cache(struct netfs_io_request *wreq)
 {
-	struct netfs_cache_resources *cres;
+	struct netfs_cache_resources *cres = &wreq->cache_resources;
 	struct netfs_io_subrequest *subreq;
 	struct netfs_inode *ctx = netfs_inode(wreq->inode);
 	struct fscache_cookie *cookie = netfs_i_cookie(ctx);
@@ -294,26 +294,21 @@ static void netfs_set_up_write_to_cache(struct netfs_io_request *wreq)
 	}
 
 	_debug("write to cache");
-	subreq = netfs_create_write_request(wreq, NETFS_WRITE_TO_CACHE, start, len,
-					    netfs_write_to_cache_op_worker);
-	if (!subreq)
+	ret = fscache_begin_write_operation(cres, cookie);
+	if (ret < 0)
 		return;
 
-	cres = &wreq->cache_resources;
-	ret = fscache_begin_read_operation(cres, cookie);
-	if (ret < 0) {
-		netfs_write_subrequest_terminated(subreq, ret, false);
+	ret = cres->ops->prepare_write(cres, &start, &len, wreq->upper_len,
+				       i_size_read(wreq->inode), true);
+	if (ret < 0)
 		return;
-	}
 
-	ret = cres->ops->prepare_write(cres, &start, &len, i_size_read(wreq->inode),
-				       true);
-	if (ret < 0) {
-		netfs_write_subrequest_terminated(subreq, ret, false);
+	subreq = netfs_create_write_request(wreq, NETFS_WRITE_TO_CACHE, start, len,
+					    netfs_write_to_cache_op_worker);
+	if (!subreq)
 		return;
-	}
 
-	netfs_queue_write_request(subreq);
+	netfs_write_to_cache_op(subreq);
 }
 
 /*
diff --git a/fs/smb/client/fscache.c b/fs/smb/client/fscache.c
index e5cad149f5a2..c4a3cb736881 100644
--- a/fs/smb/client/fscache.c
+++ b/fs/smb/client/fscache.c
@@ -180,7 +180,7 @@ static int fscache_fallback_write_pages(struct inode *inode, loff_t start, size_
 	if (ret < 0)
 		return ret;
 
-	ret = cres.ops->prepare_write(&cres, &start, &len, i_size_read(inode),
+	ret = cres.ops->prepare_write(&cres, &start, &len, len, i_size_read(inode),
 				      no_space_allocated_yet);
 	if (ret == 0)
 		ret = fscache_write(&cres, start, &iter, NULL, NULL);
diff --git a/include/linux/netfs.h b/include/linux/netfs.h
index 19a41c437af3..2856389f4694 100644
--- a/include/linux/netfs.h
+++ b/include/linux/netfs.h
@@ -261,6 +261,7 @@ struct netfs_io_request {
 	atomic_t		nr_copy_ops;	/* Number of copy-to-cache ops in progress */
 	size_t			submitted;	/* Amount submitted for I/O so far */
 	size_t			len;		/* Length of the request */
+	size_t			upper_len;	/* Length can be extended to here */
 	size_t			transferred;	/* Amount to be indicated as transferred */
 	short			error;		/* 0 or error that occurred */
 	enum netfs_io_origin	origin;		/* Origin of the request */
@@ -357,8 +358,8 @@ struct netfs_cache_ops {
 	 * actually do.
 	 */
 	int (*prepare_write)(struct netfs_cache_resources *cres,
-			     loff_t *_start, size_t *_len, loff_t i_size,
-			     bool no_space_allocated_yet);
+			     loff_t *_start, size_t *_len, size_t upper_len,
+			     loff_t i_size, bool no_space_allocated_yet);
 
 	/* Prepare an on-demand read operation, shortening it to a cached/uncached
 	 * boundary as appropriate.
-- 
cgit v1.2.3


From 62c3b7481b9a108cb99ef9438dba66bb4738768b Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Thu, 28 Sep 2023 11:46:49 +0100
Subject: netfs: Provide a writepages implementation

Provide an implementation of writepages for network filesystems to delegate
to.

Signed-off-by: David Howells <dhowells@redhat.com>
Reviewed-by: Jeff Layton <jlayton@kernel.org>
cc: linux-cachefs@redhat.com
cc: linux-fsdevel@vger.kernel.org
cc: linux-mm@kvack.org
---
 fs/netfs/buffered_write.c | 636 ++++++++++++++++++++++++++++++++++++++++++++++
 include/linux/netfs.h     |   2 +
 2 files changed, 638 insertions(+)

(limited to 'include/linux')

diff --git a/fs/netfs/buffered_write.c b/fs/netfs/buffered_write.c
index 70cb8e98d068..c078826f7fe6 100644
--- a/fs/netfs/buffered_write.c
+++ b/fs/netfs/buffered_write.c
@@ -32,6 +32,18 @@ static void netfs_set_group(struct folio *folio, struct netfs_group *netfs_group
 		folio_attach_private(folio, netfs_get_group(netfs_group));
 }
 
+#if IS_ENABLED(CONFIG_FSCACHE)
+static void netfs_folio_start_fscache(bool caching, struct folio *folio)
+{
+	if (caching)
+		folio_start_fscache(folio);
+}
+#else
+static void netfs_folio_start_fscache(bool caching, struct folio *folio)
+{
+}
+#endif
+
 /*
  * Decide how we should modify a folio.  We might be attempting to do
  * write-streaming, in which case we don't want to a local RMW cycle if we can
@@ -475,3 +487,627 @@ out:
 	return ret;
 }
 EXPORT_SYMBOL(netfs_page_mkwrite);
+
+/*
+ * Kill all the pages in the given range
+ */
+static void netfs_kill_pages(struct address_space *mapping,
+			     loff_t start, loff_t len)
+{
+	struct folio *folio;
+	pgoff_t index = start / PAGE_SIZE;
+	pgoff_t last = (start + len - 1) / PAGE_SIZE, next;
+
+	_enter("%llx-%llx", start, start + len - 1);
+
+	do {
+		_debug("kill %lx (to %lx)", index, last);
+
+		folio = filemap_get_folio(mapping, index);
+		if (IS_ERR(folio)) {
+			next = index + 1;
+			continue;
+		}
+
+		next = folio_next_index(folio);
+
+		trace_netfs_folio(folio, netfs_folio_trace_kill);
+		folio_clear_uptodate(folio);
+		if (folio_test_fscache(folio))
+			folio_end_fscache(folio);
+		folio_end_writeback(folio);
+		folio_lock(folio);
+		generic_error_remove_page(mapping, &folio->page);
+		folio_unlock(folio);
+		folio_put(folio);
+
+	} while (index = next, index <= last);
+
+	_leave("");
+}
+
+/*
+ * Redirty all the pages in a given range.
+ */
+static void netfs_redirty_pages(struct address_space *mapping,
+				loff_t start, loff_t len)
+{
+	struct folio *folio;
+	pgoff_t index = start / PAGE_SIZE;
+	pgoff_t last = (start + len - 1) / PAGE_SIZE, next;
+
+	_enter("%llx-%llx", start, start + len - 1);
+
+	do {
+		_debug("redirty %llx @%llx", len, start);
+
+		folio = filemap_get_folio(mapping, index);
+		if (IS_ERR(folio)) {
+			next = index + 1;
+			continue;
+		}
+
+		next = folio_next_index(folio);
+		trace_netfs_folio(folio, netfs_folio_trace_redirty);
+		filemap_dirty_folio(mapping, folio);
+		if (folio_test_fscache(folio))
+			folio_end_fscache(folio);
+		folio_end_writeback(folio);
+		folio_put(folio);
+	} while (index = next, index <= last);
+
+	balance_dirty_pages_ratelimited(mapping);
+
+	_leave("");
+}
+
+/*
+ * Completion of write to server
+ */
+static void netfs_pages_written_back(struct netfs_io_request *wreq)
+{
+	struct address_space *mapping = wreq->mapping;
+	struct netfs_folio *finfo;
+	struct netfs_group *group = NULL;
+	struct folio *folio;
+	pgoff_t last;
+	int gcount = 0;
+
+	XA_STATE(xas, &mapping->i_pages, wreq->start / PAGE_SIZE);
+
+	_enter("%llx-%llx", wreq->start, wreq->start + wreq->len);
+
+	rcu_read_lock();
+
+	last = (wreq->start + wreq->len - 1) / PAGE_SIZE;
+	xas_for_each(&xas, folio, last) {
+		WARN(!folio_test_writeback(folio),
+		     "bad %zx @%llx page %lx %lx\n",
+		     wreq->len, wreq->start, folio_index(folio), last);
+
+		if ((finfo = netfs_folio_info(folio))) {
+			/* Streaming writes cannot be redirtied whilst under
+			 * writeback, so discard the streaming record.
+			 */
+			folio_detach_private(folio);
+			group = finfo->netfs_group;
+			gcount++;
+			trace_netfs_folio(folio, netfs_folio_trace_clear_s);
+			kfree(finfo);
+		} else if ((group = netfs_folio_group(folio))) {
+			/* Need to detach the group pointer if the page didn't
+			 * get redirtied.  If it has been redirtied, then it
+			 * must be within the same group.
+			 */
+			if (folio_test_dirty(folio)) {
+				trace_netfs_folio(folio, netfs_folio_trace_redirtied);
+				goto end_wb;
+			}
+			if (folio_trylock(folio)) {
+				if (!folio_test_dirty(folio)) {
+					folio_detach_private(folio);
+					gcount++;
+					trace_netfs_folio(folio, netfs_folio_trace_clear_g);
+				} else {
+					trace_netfs_folio(folio, netfs_folio_trace_redirtied);
+				}
+				folio_unlock(folio);
+				goto end_wb;
+			}
+
+			xas_pause(&xas);
+			rcu_read_unlock();
+			folio_lock(folio);
+			if (!folio_test_dirty(folio)) {
+				folio_detach_private(folio);
+				gcount++;
+				trace_netfs_folio(folio, netfs_folio_trace_clear_g);
+			} else {
+				trace_netfs_folio(folio, netfs_folio_trace_redirtied);
+			}
+			folio_unlock(folio);
+			rcu_read_lock();
+		} else {
+			trace_netfs_folio(folio, netfs_folio_trace_clear);
+		}
+	end_wb:
+		if (folio_test_fscache(folio))
+			folio_end_fscache(folio);
+		folio_end_writeback(folio);
+	}
+
+	rcu_read_unlock();
+	netfs_put_group_many(group, gcount);
+	_leave("");
+}
+
+/*
+ * Deal with the disposition of the folios that are under writeback to close
+ * out the operation.
+ */
+static void netfs_cleanup_buffered_write(struct netfs_io_request *wreq)
+{
+	struct address_space *mapping = wreq->mapping;
+
+	_enter("");
+
+	switch (wreq->error) {
+	case 0:
+		netfs_pages_written_back(wreq);
+		break;
+
+	default:
+		pr_notice("R=%08x Unexpected error %d\n", wreq->debug_id, wreq->error);
+		fallthrough;
+	case -EACCES:
+	case -EPERM:
+	case -ENOKEY:
+	case -EKEYEXPIRED:
+	case -EKEYREJECTED:
+	case -EKEYREVOKED:
+	case -ENETRESET:
+	case -EDQUOT:
+	case -ENOSPC:
+		netfs_redirty_pages(mapping, wreq->start, wreq->len);
+		break;
+
+	case -EROFS:
+	case -EIO:
+	case -EREMOTEIO:
+	case -EFBIG:
+	case -ENOENT:
+	case -ENOMEDIUM:
+	case -ENXIO:
+		netfs_kill_pages(mapping, wreq->start, wreq->len);
+		break;
+	}
+
+	if (wreq->error)
+		mapping_set_error(mapping, wreq->error);
+	if (wreq->netfs_ops->done)
+		wreq->netfs_ops->done(wreq);
+}
+
+/*
+ * Extend the region to be written back to include subsequent contiguously
+ * dirty pages if possible, but don't sleep while doing so.
+ *
+ * If this page holds new content, then we can include filler zeros in the
+ * writeback.
+ */
+static void netfs_extend_writeback(struct address_space *mapping,
+				   struct netfs_group *group,
+				   struct xa_state *xas,
+				   long *_count,
+				   loff_t start,
+				   loff_t max_len,
+				   bool caching,
+				   size_t *_len,
+				   size_t *_top)
+{
+	struct netfs_folio *finfo;
+	struct folio_batch fbatch;
+	struct folio *folio;
+	unsigned int i;
+	pgoff_t index = (start + *_len) / PAGE_SIZE;
+	size_t len;
+	void *priv;
+	bool stop = true;
+
+	folio_batch_init(&fbatch);
+
+	do {
+		/* Firstly, we gather up a batch of contiguous dirty pages
+		 * under the RCU read lock - but we can't clear the dirty flags
+		 * there if any of those pages are mapped.
+		 */
+		rcu_read_lock();
+
+		xas_for_each(xas, folio, ULONG_MAX) {
+			stop = true;
+			if (xas_retry(xas, folio))
+				continue;
+			if (xa_is_value(folio))
+				break;
+			if (folio_index(folio) != index) {
+				xas_reset(xas);
+				break;
+			}
+
+			if (!folio_try_get_rcu(folio)) {
+				xas_reset(xas);
+				continue;
+			}
+
+			/* Has the folio moved or been split? */
+			if (unlikely(folio != xas_reload(xas))) {
+				folio_put(folio);
+				xas_reset(xas);
+				break;
+			}
+
+			if (!folio_trylock(folio)) {
+				folio_put(folio);
+				xas_reset(xas);
+				break;
+			}
+			if (!folio_test_dirty(folio) ||
+			    folio_test_writeback(folio) ||
+			    folio_test_fscache(folio)) {
+				folio_unlock(folio);
+				folio_put(folio);
+				xas_reset(xas);
+				break;
+			}
+
+			stop = false;
+			len = folio_size(folio);
+			priv = folio_get_private(folio);
+			if ((const struct netfs_group *)priv != group) {
+				stop = true;
+				finfo = netfs_folio_info(folio);
+				if (finfo->netfs_group != group ||
+				    finfo->dirty_offset > 0) {
+					folio_unlock(folio);
+					folio_put(folio);
+					xas_reset(xas);
+					break;
+				}
+				len = finfo->dirty_len;
+			}
+
+			*_top += folio_size(folio);
+			index += folio_nr_pages(folio);
+			*_count -= folio_nr_pages(folio);
+			*_len += len;
+			if (*_len >= max_len || *_count <= 0)
+				stop = true;
+
+			if (!folio_batch_add(&fbatch, folio))
+				break;
+			if (stop)
+				break;
+		}
+
+		xas_pause(xas);
+		rcu_read_unlock();
+
+		/* Now, if we obtained any folios, we can shift them to being
+		 * writable and mark them for caching.
+		 */
+		if (!folio_batch_count(&fbatch))
+			break;
+
+		for (i = 0; i < folio_batch_count(&fbatch); i++) {
+			folio = fbatch.folios[i];
+			trace_netfs_folio(folio, netfs_folio_trace_store_plus);
+
+			if (!folio_clear_dirty_for_io(folio))
+				BUG();
+			folio_start_writeback(folio);
+			netfs_folio_start_fscache(caching, folio);
+			folio_unlock(folio);
+		}
+
+		folio_batch_release(&fbatch);
+		cond_resched();
+	} while (!stop);
+}
+
+/*
+ * Synchronously write back the locked page and any subsequent non-locked dirty
+ * pages.
+ */
+static ssize_t netfs_write_back_from_locked_folio(struct address_space *mapping,
+						  struct writeback_control *wbc,
+						  struct netfs_group *group,
+						  struct xa_state *xas,
+						  struct folio *folio,
+						  unsigned long long start,
+						  unsigned long long end)
+{
+	struct netfs_io_request *wreq;
+	struct netfs_folio *finfo;
+	struct netfs_inode *ctx = netfs_inode(mapping->host);
+	unsigned long long i_size = i_size_read(&ctx->inode);
+	size_t len, max_len;
+	bool caching = netfs_is_cache_enabled(ctx);
+	long count = wbc->nr_to_write;
+	int ret;
+
+	_enter(",%lx,%llx-%llx,%u", folio_index(folio), start, end, caching);
+
+	wreq = netfs_alloc_request(mapping, NULL, start, folio_size(folio),
+				   NETFS_WRITEBACK);
+	if (IS_ERR(wreq)) {
+		folio_unlock(folio);
+		return PTR_ERR(wreq);
+	}
+
+	if (!folio_clear_dirty_for_io(folio))
+		BUG();
+	folio_start_writeback(folio);
+	netfs_folio_start_fscache(caching, folio);
+
+	count -= folio_nr_pages(folio);
+
+	/* Find all consecutive lockable dirty pages that have contiguous
+	 * written regions, stopping when we find a page that is not
+	 * immediately lockable, is not dirty or is missing, or we reach the
+	 * end of the range.
+	 */
+	trace_netfs_folio(folio, netfs_folio_trace_store);
+
+	len = wreq->len;
+	finfo = netfs_folio_info(folio);
+	if (finfo) {
+		start += finfo->dirty_offset;
+		if (finfo->dirty_offset + finfo->dirty_len != len) {
+			len = finfo->dirty_len;
+			goto cant_expand;
+		}
+		len = finfo->dirty_len;
+	}
+
+	if (start < i_size) {
+		/* Trim the write to the EOF; the extra data is ignored.  Also
+		 * put an upper limit on the size of a single storedata op.
+		 */
+		max_len = 65536 * 4096;
+		max_len = min_t(unsigned long long, max_len, end - start + 1);
+		max_len = min_t(unsigned long long, max_len, i_size - start);
+
+		if (len < max_len)
+			netfs_extend_writeback(mapping, group, xas, &count, start,
+					       max_len, caching, &len, &wreq->upper_len);
+	}
+
+cant_expand:
+	len = min_t(unsigned long long, len, i_size - start);
+
+	/* We now have a contiguous set of dirty pages, each with writeback
+	 * set; the first page is still locked at this point, but all the rest
+	 * have been unlocked.
+	 */
+	folio_unlock(folio);
+	wreq->start = start;
+	wreq->len = len;
+
+	if (start < i_size) {
+		_debug("write back %zx @%llx [%llx]", len, start, i_size);
+
+		/* Speculatively write to the cache.  We have to fix this up
+		 * later if the store fails.
+		 */
+		wreq->cleanup = netfs_cleanup_buffered_write;
+
+		iov_iter_xarray(&wreq->iter, ITER_SOURCE, &mapping->i_pages, start,
+				wreq->upper_len);
+		__set_bit(NETFS_RREQ_UPLOAD_TO_SERVER, &wreq->flags);
+		ret = netfs_begin_write(wreq, true, netfs_write_trace_writeback);
+		if (ret == 0 || ret == -EIOCBQUEUED)
+			wbc->nr_to_write -= len / PAGE_SIZE;
+	} else {
+		_debug("write discard %zx @%llx [%llx]", len, start, i_size);
+
+		/* The dirty region was entirely beyond the EOF. */
+		fscache_clear_page_bits(mapping, start, len, caching);
+		netfs_pages_written_back(wreq);
+		ret = 0;
+	}
+
+	netfs_put_request(wreq, false, netfs_rreq_trace_put_return);
+	_leave(" = 1");
+	return 1;
+}
+
+/*
+ * Write a region of pages back to the server
+ */
+static ssize_t netfs_writepages_begin(struct address_space *mapping,
+				      struct writeback_control *wbc,
+				      struct netfs_group *group,
+				      struct xa_state *xas,
+				      unsigned long long *_start,
+				      unsigned long long end)
+{
+	const struct netfs_folio *finfo;
+	struct folio *folio;
+	unsigned long long start = *_start;
+	ssize_t ret;
+	void *priv;
+	int skips = 0;
+
+	_enter("%llx,%llx,", start, end);
+
+search_again:
+	/* Find the first dirty page in the group. */
+	rcu_read_lock();
+
+	for (;;) {
+		folio = xas_find_marked(xas, end / PAGE_SIZE, PAGECACHE_TAG_DIRTY);
+		if (xas_retry(xas, folio) || xa_is_value(folio))
+			continue;
+		if (!folio)
+			break;
+
+		if (!folio_try_get_rcu(folio)) {
+			xas_reset(xas);
+			continue;
+		}
+
+		if (unlikely(folio != xas_reload(xas))) {
+			folio_put(folio);
+			xas_reset(xas);
+			continue;
+		}
+
+		/* Skip any dirty folio that's not in the group of interest. */
+		priv = folio_get_private(folio);
+		if ((const struct netfs_group *)priv != group) {
+			finfo = netfs_folio_info(folio);
+			if (finfo->netfs_group != group) {
+				folio_put(folio);
+				continue;
+			}
+		}
+
+		xas_pause(xas);
+		break;
+	}
+	rcu_read_unlock();
+	if (!folio)
+		return 0;
+
+	start = folio_pos(folio); /* May regress with THPs */
+
+	_debug("wback %lx", folio_index(folio));
+
+	/* At this point we hold neither the i_pages lock nor the page lock:
+	 * the page may be truncated or invalidated (changing page->mapping to
+	 * NULL), or even swizzled back from swapper_space to tmpfs file
+	 * mapping
+	 */
+lock_again:
+	if (wbc->sync_mode != WB_SYNC_NONE) {
+		ret = folio_lock_killable(folio);
+		if (ret < 0)
+			return ret;
+	} else {
+		if (!folio_trylock(folio))
+			goto search_again;
+	}
+
+	if (folio->mapping != mapping ||
+	    !folio_test_dirty(folio)) {
+		start += folio_size(folio);
+		folio_unlock(folio);
+		goto search_again;
+	}
+
+	if (folio_test_writeback(folio) ||
+	    folio_test_fscache(folio)) {
+		folio_unlock(folio);
+		if (wbc->sync_mode != WB_SYNC_NONE) {
+			folio_wait_writeback(folio);
+#ifdef CONFIG_NETFS_FSCACHE
+			folio_wait_fscache(folio);
+#endif
+			goto lock_again;
+		}
+
+		start += folio_size(folio);
+		if (wbc->sync_mode == WB_SYNC_NONE) {
+			if (skips >= 5 || need_resched()) {
+				ret = 0;
+				goto out;
+			}
+			skips++;
+		}
+		goto search_again;
+	}
+
+	ret = netfs_write_back_from_locked_folio(mapping, wbc, group, xas,
+						 folio, start, end);
+out:
+	if (ret > 0)
+		*_start = start + ret;
+	_leave(" = %zd [%llx]", ret, *_start);
+	return ret;
+}
+
+/*
+ * Write a region of pages back to the server
+ */
+static int netfs_writepages_region(struct address_space *mapping,
+				   struct writeback_control *wbc,
+				   struct netfs_group *group,
+				   unsigned long long *_start,
+				   unsigned long long end)
+{
+	ssize_t ret;
+
+	XA_STATE(xas, &mapping->i_pages, *_start / PAGE_SIZE);
+
+	do {
+		ret = netfs_writepages_begin(mapping, wbc, group, &xas,
+					     _start, end);
+		if (ret > 0 && wbc->nr_to_write > 0)
+			cond_resched();
+	} while (ret > 0 && wbc->nr_to_write > 0);
+
+	return ret > 0 ? 0 : ret;
+}
+
+/*
+ * write some of the pending data back to the server
+ */
+int netfs_writepages(struct address_space *mapping,
+		     struct writeback_control *wbc)
+{
+	struct netfs_group *group = NULL;
+	loff_t start, end;
+	int ret;
+
+	_enter("");
+
+	/* We have to be careful as we can end up racing with setattr()
+	 * truncating the pagecache since the caller doesn't take a lock here
+	 * to prevent it.
+	 */
+
+	if (wbc->range_cyclic && mapping->writeback_index) {
+		start = mapping->writeback_index * PAGE_SIZE;
+		ret = netfs_writepages_region(mapping, wbc, group,
+					      &start, LLONG_MAX);
+		if (ret < 0)
+			goto out;
+
+		if (wbc->nr_to_write <= 0) {
+			mapping->writeback_index = start / PAGE_SIZE;
+			goto out;
+		}
+
+		start = 0;
+		end = mapping->writeback_index * PAGE_SIZE;
+		mapping->writeback_index = 0;
+		ret = netfs_writepages_region(mapping, wbc, group, &start, end);
+		if (ret == 0)
+			mapping->writeback_index = start / PAGE_SIZE;
+	} else if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX) {
+		start = 0;
+		ret = netfs_writepages_region(mapping, wbc, group,
+					      &start, LLONG_MAX);
+		if (wbc->nr_to_write > 0 && ret == 0)
+			mapping->writeback_index = start / PAGE_SIZE;
+	} else {
+		start = wbc->range_start;
+		ret = netfs_writepages_region(mapping, wbc, group,
+					      &start, wbc->range_end);
+	}
+
+out:
+	_leave(" = %d", ret);
+	return ret;
+}
+EXPORT_SYMBOL(netfs_writepages);
diff --git a/include/linux/netfs.h b/include/linux/netfs.h
index 2856389f4694..86bb8cb7f8d0 100644
--- a/include/linux/netfs.h
+++ b/include/linux/netfs.h
@@ -397,6 +397,8 @@ int netfs_read_folio(struct file *, struct folio *);
 int netfs_write_begin(struct netfs_inode *, struct file *,
 		      struct address_space *, loff_t pos, unsigned int len,
 		      struct folio **, void **fsdata);
+int netfs_writepages(struct address_space *mapping,
+		     struct writeback_control *wbc);
 bool netfs_dirty_folio(struct address_space *mapping, struct folio *folio);
 int netfs_unpin_writeback(struct inode *inode, struct writeback_control *wbc);
 void netfs_clear_inode_writeback(struct inode *inode, const void *aux);
-- 
cgit v1.2.3


From 4a79616cfb27d76947ea37f0336745ef929d56be Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Thu, 5 Oct 2023 16:52:58 +0100
Subject: netfs: Provide a launder_folio implementation

Provide a launder_folio implementation for netfslib.

Signed-off-by: David Howells <dhowells@redhat.com>
Reviewed-by: Jeff Layton <jlayton@kernel.org>
cc: linux-cachefs@redhat.com
cc: linux-fsdevel@vger.kernel.org
cc: linux-mm@kvack.org
---
 fs/netfs/buffered_write.c    | 74 ++++++++++++++++++++++++++++++++++++++++++++
 fs/netfs/main.c              |  1 +
 include/linux/netfs.h        |  2 ++
 include/trace/events/netfs.h |  3 ++
 4 files changed, 80 insertions(+)

(limited to 'include/linux')

diff --git a/fs/netfs/buffered_write.c b/fs/netfs/buffered_write.c
index c078826f7fe6..50be8fe3ca43 100644
--- a/fs/netfs/buffered_write.c
+++ b/fs/netfs/buffered_write.c
@@ -1111,3 +1111,77 @@ out:
 	return ret;
 }
 EXPORT_SYMBOL(netfs_writepages);
+
+/*
+ * Deal with the disposition of a laundered folio.
+ */
+static void netfs_cleanup_launder_folio(struct netfs_io_request *wreq)
+{
+	if (wreq->error) {
+		pr_notice("R=%08x Laundering error %d\n", wreq->debug_id, wreq->error);
+		mapping_set_error(wreq->mapping, wreq->error);
+	}
+}
+
+/**
+ * netfs_launder_folio - Clean up a dirty folio that's being invalidated
+ * @folio: The folio to clean
+ *
+ * This is called to write back a folio that's being invalidated when an inode
+ * is getting torn down.  Ideally, writepages would be used instead.
+ */
+int netfs_launder_folio(struct folio *folio)
+{
+	struct netfs_io_request *wreq;
+	struct address_space *mapping = folio->mapping;
+	struct netfs_folio *finfo = netfs_folio_info(folio);
+	struct netfs_group *group = netfs_folio_group(folio);
+	struct bio_vec bvec;
+	unsigned long long i_size = i_size_read(mapping->host);
+	unsigned long long start = folio_pos(folio);
+	size_t offset = 0, len;
+	int ret = 0;
+
+	if (finfo) {
+		offset = finfo->dirty_offset;
+		start += offset;
+		len = finfo->dirty_len;
+	} else {
+		len = folio_size(folio);
+	}
+	len = min_t(unsigned long long, len, i_size - start);
+
+	wreq = netfs_alloc_request(mapping, NULL, start, len, NETFS_LAUNDER_WRITE);
+	if (IS_ERR(wreq)) {
+		ret = PTR_ERR(wreq);
+		goto out;
+	}
+
+	if (!folio_clear_dirty_for_io(folio))
+		goto out_put;
+
+	trace_netfs_folio(folio, netfs_folio_trace_launder);
+
+	_debug("launder %llx-%llx", start, start + len - 1);
+
+	/* Speculatively write to the cache.  We have to fix this up later if
+	 * the store fails.
+	 */
+	wreq->cleanup = netfs_cleanup_launder_folio;
+
+	bvec_set_folio(&bvec, folio, len, offset);
+	iov_iter_bvec(&wreq->iter, ITER_SOURCE, &bvec, 1, len);
+	__set_bit(NETFS_RREQ_UPLOAD_TO_SERVER, &wreq->flags);
+	ret = netfs_begin_write(wreq, true, netfs_write_trace_launder);
+
+out_put:
+	folio_detach_private(folio);
+	netfs_put_group(group);
+	kfree(finfo);
+	netfs_put_request(wreq, false, netfs_rreq_trace_put_return);
+out:
+	folio_wait_fscache(folio);
+	_leave(" = %d", ret);
+	return ret;
+}
+EXPORT_SYMBOL(netfs_launder_folio);
diff --git a/fs/netfs/main.c b/fs/netfs/main.c
index 8e4db9ff40c4..473f889e1bd1 100644
--- a/fs/netfs/main.c
+++ b/fs/netfs/main.c
@@ -30,6 +30,7 @@ static const char *netfs_origins[nr__netfs_io_origin] = {
 	[NETFS_READPAGE]		= "RP",
 	[NETFS_READ_FOR_WRITE]		= "RW",
 	[NETFS_WRITEBACK]		= "WB",
+	[NETFS_LAUNDER_WRITE]		= "LW",
 	[NETFS_UNBUFFERED_WRITE]	= "UW",
 	[NETFS_DIO_READ]		= "DR",
 	[NETFS_DIO_WRITE]		= "DW",
diff --git a/include/linux/netfs.h b/include/linux/netfs.h
index 86bb8cb7f8d0..29c66acad925 100644
--- a/include/linux/netfs.h
+++ b/include/linux/netfs.h
@@ -227,6 +227,7 @@ enum netfs_io_origin {
 	NETFS_READPAGE,			/* This read is a synchronous read */
 	NETFS_READ_FOR_WRITE,		/* This read is to prepare a write */
 	NETFS_WRITEBACK,		/* This write was triggered by writepages */
+	NETFS_LAUNDER_WRITE,		/* This is triggered by ->launder_folio() */
 	NETFS_UNBUFFERED_WRITE,		/* This is an unbuffered write */
 	NETFS_DIO_READ,			/* This is a direct I/O read */
 	NETFS_DIO_WRITE,		/* This is a direct I/O write */
@@ -404,6 +405,7 @@ int netfs_unpin_writeback(struct inode *inode, struct writeback_control *wbc);
 void netfs_clear_inode_writeback(struct inode *inode, const void *aux);
 void netfs_invalidate_folio(struct folio *folio, size_t offset, size_t length);
 bool netfs_release_folio(struct folio *folio, gfp_t gfp);
+int netfs_launder_folio(struct folio *folio);
 
 /* VMA operations API. */
 vm_fault_t netfs_page_mkwrite(struct vm_fault *vmf, struct netfs_group *netfs_group);
diff --git a/include/trace/events/netfs.h b/include/trace/events/netfs.h
index 914a24b03d08..cc998798e20a 100644
--- a/include/trace/events/netfs.h
+++ b/include/trace/events/netfs.h
@@ -25,6 +25,7 @@
 
 #define netfs_write_traces					\
 	EM(netfs_write_trace_dio_write,		"DIO-WRITE")	\
+	EM(netfs_write_trace_launder,		"LAUNDER  ")	\
 	EM(netfs_write_trace_unbuffered_write,	"UNB-WRITE")	\
 	E_(netfs_write_trace_writeback,		"WRITEBACK")
 
@@ -33,6 +34,7 @@
 	EM(NETFS_READPAGE,			"RP")		\
 	EM(NETFS_READ_FOR_WRITE,		"RW")		\
 	EM(NETFS_WRITEBACK,			"WB")		\
+	EM(NETFS_LAUNDER_WRITE,			"LW")		\
 	EM(NETFS_UNBUFFERED_WRITE,		"UW")		\
 	EM(NETFS_DIO_READ,			"DR")		\
 	E_(NETFS_DIO_WRITE,			"DW")
@@ -127,6 +129,7 @@
 	EM(netfs_folio_trace_end_copy,		"end-copy")	\
 	EM(netfs_folio_trace_filled_gaps,	"filled-gaps")	\
 	EM(netfs_folio_trace_kill,		"kill")		\
+	EM(netfs_folio_trace_launder,		"launder")	\
 	EM(netfs_folio_trace_mkwrite,		"mkwrite")	\
 	EM(netfs_folio_trace_mkwrite_plus,	"mkwrite+")	\
 	EM(netfs_folio_trace_read_gaps,		"read-gaps")	\
-- 
cgit v1.2.3


From 41d8e7673a7726cba57cb8112d81c89cfb6c3e35 Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Thu, 12 Oct 2023 09:06:24 +0100
Subject: netfs: Implement a write-through caching option

Provide a flag whereby a filesystem may request that cifs_perform_write()
perform write-through caching.  This involves putting pages directly into
writeback rather than dirty and attaching them to a write operation as we
go.

Further, the writes being made are limited to the byte range being written
rather than whole folios being written.  This can be used by cifs, for
example, to deal with strict byte-range locking.

This can't be used with content encryption as that may require expansion of
the write RPC beyond the write being made.

This doesn't affect writes via mmap - those are written back in the normal
way; similarly failed writethrough writes are marked dirty and left to
writeback to retry.  Another option would be to simply invalidate them, but
the contents can be simultaneously accessed by read() and through mmap.

Signed-off-by: David Howells <dhowells@redhat.com>
cc: Jeff Layton <jlayton@kernel.org>
cc: linux-cachefs@redhat.com
cc: linux-fsdevel@vger.kernel.org
cc: linux-mm@kvack.org
---
 fs/netfs/buffered_write.c    | 69 ++++++++++++++++++++++++++++-----
 fs/netfs/internal.h          |  3 ++
 fs/netfs/main.c              |  1 +
 fs/netfs/objects.c           |  1 +
 fs/netfs/output.c            | 90 ++++++++++++++++++++++++++++++++++++++++++++
 include/linux/netfs.h        |  2 +
 include/trace/events/netfs.h |  8 +++-
 7 files changed, 162 insertions(+), 12 deletions(-)

(limited to 'include/linux')

diff --git a/fs/netfs/buffered_write.c b/fs/netfs/buffered_write.c
index 50be8fe3ca43..6ca6c4bde5eb 100644
--- a/fs/netfs/buffered_write.c
+++ b/fs/netfs/buffered_write.c
@@ -26,6 +26,8 @@ enum netfs_how_to_modify {
 	NETFS_FLUSH_CONTENT,		/* Flush incompatible content. */
 };
 
+static void netfs_cleanup_buffered_write(struct netfs_io_request *wreq);
+
 static void netfs_set_group(struct folio *folio, struct netfs_group *netfs_group)
 {
 	if (netfs_group && !folio_get_private(folio))
@@ -133,6 +135,14 @@ ssize_t netfs_perform_write(struct kiocb *iocb, struct iov_iter *iter,
 	struct inode *inode = file_inode(file);
 	struct address_space *mapping = inode->i_mapping;
 	struct netfs_inode *ctx = netfs_inode(inode);
+	struct writeback_control wbc = {
+		.sync_mode	= WB_SYNC_NONE,
+		.for_sync	= true,
+		.nr_to_write	= LONG_MAX,
+		.range_start	= iocb->ki_pos,
+		.range_end	= iocb->ki_pos + iter->count,
+	};
+	struct netfs_io_request *wreq = NULL;
 	struct netfs_folio *finfo;
 	struct folio *folio;
 	enum netfs_how_to_modify howto;
@@ -143,6 +153,30 @@ ssize_t netfs_perform_write(struct kiocb *iocb, struct iov_iter *iter,
 	size_t max_chunk = PAGE_SIZE << MAX_PAGECACHE_ORDER;
 	bool maybe_trouble = false;
 
+	if (unlikely(test_bit(NETFS_ICTX_WRITETHROUGH, &ctx->flags) ||
+		     iocb->ki_flags & (IOCB_DSYNC | IOCB_SYNC))
+	    ) {
+		if (pos < i_size_read(inode)) {
+			ret = filemap_write_and_wait_range(mapping, pos, pos + iter->count);
+			if (ret < 0) {
+				goto out;
+			}
+		}
+
+		wbc_attach_fdatawrite_inode(&wbc, mapping->host);
+
+		wreq = netfs_begin_writethrough(iocb, iter->count);
+		if (IS_ERR(wreq)) {
+			wbc_detach_inode(&wbc);
+			ret = PTR_ERR(wreq);
+			wreq = NULL;
+			goto out;
+		}
+		if (!is_sync_kiocb(iocb))
+			wreq->iocb = iocb;
+		wreq->cleanup = netfs_cleanup_buffered_write;
+	}
+
 	do {
 		size_t flen;
 		size_t offset;	/* Offset into pagecache folio */
@@ -315,7 +349,25 @@ ssize_t netfs_perform_write(struct kiocb *iocb, struct iov_iter *iter,
 		}
 		written += copied;
 
-		folio_mark_dirty(folio);
+		if (likely(!wreq)) {
+			folio_mark_dirty(folio);
+		} else {
+			if (folio_test_dirty(folio))
+				/* Sigh.  mmap. */
+				folio_clear_dirty_for_io(folio);
+			/* We make multiple writes to the folio... */
+			if (!folio_test_writeback(folio)) {
+				folio_wait_fscache(folio);
+				folio_start_writeback(folio);
+				folio_start_fscache(folio);
+				if (wreq->iter.count == 0)
+					trace_netfs_folio(folio, netfs_folio_trace_wthru);
+				else
+					trace_netfs_folio(folio, netfs_folio_trace_wthru_plus);
+			}
+			netfs_advance_writethrough(wreq, copied,
+						   offset + copied == flen);
+		}
 	retry:
 		folio_unlock(folio);
 		folio_put(folio);
@@ -325,17 +377,14 @@ ssize_t netfs_perform_write(struct kiocb *iocb, struct iov_iter *iter,
 	} while (iov_iter_count(iter));
 
 out:
-	if (likely(written)) {
-		/* Flush and wait for a write that requires immediate synchronisation. */
-		if (iocb->ki_flags & (IOCB_DSYNC | IOCB_SYNC)) {
-			_debug("dsync");
-			ret = filemap_fdatawait_range(mapping, iocb->ki_pos,
-						      iocb->ki_pos + written);
-		}
-
-		iocb->ki_pos += written;
+	if (unlikely(wreq)) {
+		ret = netfs_end_writethrough(wreq, iocb);
+		wbc_detach_inode(&wbc);
+		if (ret == -EIOCBQUEUED)
+			return ret;
 	}
 
+	iocb->ki_pos += written;
 	_leave(" = %zd [%zd]", written, ret);
 	return written ? written : ret;
 
diff --git a/fs/netfs/internal.h b/fs/netfs/internal.h
index 2de4f826dbe4..d2d63120ac60 100644
--- a/fs/netfs/internal.h
+++ b/fs/netfs/internal.h
@@ -101,6 +101,9 @@ static inline void netfs_see_request(struct netfs_io_request *rreq,
  */
 int netfs_begin_write(struct netfs_io_request *wreq, bool may_wait,
 		      enum netfs_write_trace what);
+struct netfs_io_request *netfs_begin_writethrough(struct kiocb *iocb, size_t len);
+int netfs_advance_writethrough(struct netfs_io_request *wreq, size_t copied, bool to_page_end);
+int netfs_end_writethrough(struct netfs_io_request *wreq, struct kiocb *iocb);
 
 /*
  * stats.c
diff --git a/fs/netfs/main.c b/fs/netfs/main.c
index 473f889e1bd1..81a13071b258 100644
--- a/fs/netfs/main.c
+++ b/fs/netfs/main.c
@@ -30,6 +30,7 @@ static const char *netfs_origins[nr__netfs_io_origin] = {
 	[NETFS_READPAGE]		= "RP",
 	[NETFS_READ_FOR_WRITE]		= "RW",
 	[NETFS_WRITEBACK]		= "WB",
+	[NETFS_WRITETHROUGH]		= "WT",
 	[NETFS_LAUNDER_WRITE]		= "LW",
 	[NETFS_UNBUFFERED_WRITE]	= "UW",
 	[NETFS_DIO_READ]		= "DR",
diff --git a/fs/netfs/objects.c b/fs/netfs/objects.c
index b4e3bd836e5d..610ceb5bd86c 100644
--- a/fs/netfs/objects.c
+++ b/fs/netfs/objects.c
@@ -41,6 +41,7 @@ struct netfs_io_request *netfs_alloc_request(struct address_space *mapping,
 	rreq->i_size	= i_size_read(inode);
 	rreq->debug_id	= atomic_inc_return(&debug_ids);
 	INIT_LIST_HEAD(&rreq->subrequests);
+	INIT_WORK(&rreq->work, NULL);
 	refcount_set(&rreq->ref, 1);
 
 	__set_bit(NETFS_RREQ_IN_PROGRESS, &rreq->flags);
diff --git a/fs/netfs/output.c b/fs/netfs/output.c
index cc9065733b42..625eb68f3e5a 100644
--- a/fs/netfs/output.c
+++ b/fs/netfs/output.c
@@ -386,3 +386,93 @@ int netfs_begin_write(struct netfs_io_request *wreq, bool may_wait,
 		    TASK_UNINTERRUPTIBLE);
 	return wreq->error;
 }
+
+/*
+ * Begin a write operation for writing through the pagecache.
+ */
+struct netfs_io_request *netfs_begin_writethrough(struct kiocb *iocb, size_t len)
+{
+	struct netfs_io_request *wreq;
+	struct file *file = iocb->ki_filp;
+
+	wreq = netfs_alloc_request(file->f_mapping, file, iocb->ki_pos, len,
+				   NETFS_WRITETHROUGH);
+	if (IS_ERR(wreq))
+		return wreq;
+
+	trace_netfs_write(wreq, netfs_write_trace_writethrough);
+
+	__set_bit(NETFS_RREQ_UPLOAD_TO_SERVER, &wreq->flags);
+	iov_iter_xarray(&wreq->iter, ITER_SOURCE, &wreq->mapping->i_pages, wreq->start, 0);
+	wreq->io_iter = wreq->iter;
+
+	/* ->outstanding > 0 carries a ref */
+	netfs_get_request(wreq, netfs_rreq_trace_get_for_outstanding);
+	atomic_set(&wreq->nr_outstanding, 1);
+	return wreq;
+}
+
+static void netfs_submit_writethrough(struct netfs_io_request *wreq, bool final)
+{
+	struct netfs_inode *ictx = netfs_inode(wreq->inode);
+	unsigned long long start;
+	size_t len;
+
+	if (!test_bit(NETFS_RREQ_UPLOAD_TO_SERVER, &wreq->flags))
+		return;
+
+	start = wreq->start + wreq->submitted;
+	len = wreq->iter.count - wreq->submitted;
+	if (!final) {
+		len /= wreq->wsize; /* Round to number of maximum packets */
+		len *= wreq->wsize;
+	}
+
+	ictx->ops->create_write_requests(wreq, start, len);
+	wreq->submitted += len;
+}
+
+/*
+ * Advance the state of the write operation used when writing through the
+ * pagecache.  Data has been copied into the pagecache that we need to append
+ * to the request.  If we've added more than wsize then we need to create a new
+ * subrequest.
+ */
+int netfs_advance_writethrough(struct netfs_io_request *wreq, size_t copied, bool to_page_end)
+{
+	_enter("ic=%zu sb=%zu ws=%u cp=%zu tp=%u",
+	       wreq->iter.count, wreq->submitted, wreq->wsize, copied, to_page_end);
+
+	wreq->iter.count += copied;
+	wreq->io_iter.count += copied;
+	if (to_page_end && wreq->io_iter.count - wreq->submitted >= wreq->wsize)
+		netfs_submit_writethrough(wreq, false);
+
+	return wreq->error;
+}
+
+/*
+ * End a write operation used when writing through the pagecache.
+ */
+int netfs_end_writethrough(struct netfs_io_request *wreq, struct kiocb *iocb)
+{
+	int ret = -EIOCBQUEUED;
+
+	_enter("ic=%zu sb=%zu ws=%u",
+	       wreq->iter.count, wreq->submitted, wreq->wsize);
+
+	if (wreq->submitted < wreq->io_iter.count)
+		netfs_submit_writethrough(wreq, true);
+
+	if (atomic_dec_and_test(&wreq->nr_outstanding))
+		netfs_write_terminated(wreq, false);
+
+	if (is_sync_kiocb(iocb)) {
+		wait_on_bit(&wreq->flags, NETFS_RREQ_IN_PROGRESS,
+			    TASK_UNINTERRUPTIBLE);
+		ret = wreq->error;
+	}
+
+	netfs_put_request(wreq, false, netfs_rreq_trace_put_return);
+	return ret;
+}
diff --git a/include/linux/netfs.h b/include/linux/netfs.h
index 29c66acad925..8a2dd882a781 100644
--- a/include/linux/netfs.h
+++ b/include/linux/netfs.h
@@ -139,6 +139,7 @@ struct netfs_inode {
 	unsigned long		flags;
 #define NETFS_ICTX_ODIRECT	0		/* The file has DIO in progress */
 #define NETFS_ICTX_UNBUFFERED	1		/* I/O should not use the pagecache */
+#define NETFS_ICTX_WRITETHROUGH	2		/* Write-through caching */
 };
 
 /*
@@ -227,6 +228,7 @@ enum netfs_io_origin {
 	NETFS_READPAGE,			/* This read is a synchronous read */
 	NETFS_READ_FOR_WRITE,		/* This read is to prepare a write */
 	NETFS_WRITEBACK,		/* This write was triggered by writepages */
+	NETFS_WRITETHROUGH,		/* This write was made by netfs_perform_write() */
 	NETFS_LAUNDER_WRITE,		/* This is triggered by ->launder_folio() */
 	NETFS_UNBUFFERED_WRITE,		/* This is an unbuffered write */
 	NETFS_DIO_READ,			/* This is a direct I/O read */
diff --git a/include/trace/events/netfs.h b/include/trace/events/netfs.h
index cc998798e20a..447a8c21cf57 100644
--- a/include/trace/events/netfs.h
+++ b/include/trace/events/netfs.h
@@ -27,13 +27,15 @@
 	EM(netfs_write_trace_dio_write,		"DIO-WRITE")	\
 	EM(netfs_write_trace_launder,		"LAUNDER  ")	\
 	EM(netfs_write_trace_unbuffered_write,	"UNB-WRITE")	\
-	E_(netfs_write_trace_writeback,		"WRITEBACK")
+	EM(netfs_write_trace_writeback,		"WRITEBACK")	\
+	E_(netfs_write_trace_writethrough,	"WRITETHRU")
 
 #define netfs_rreq_origins					\
 	EM(NETFS_READAHEAD,			"RA")		\
 	EM(NETFS_READPAGE,			"RP")		\
 	EM(NETFS_READ_FOR_WRITE,		"RW")		\
 	EM(NETFS_WRITEBACK,			"WB")		\
+	EM(NETFS_WRITETHROUGH,			"WT")		\
 	EM(NETFS_LAUNDER_WRITE,			"LW")		\
 	EM(NETFS_UNBUFFERED_WRITE,		"UW")		\
 	EM(NETFS_DIO_READ,			"DR")		\
@@ -136,7 +138,9 @@
 	EM(netfs_folio_trace_redirty,		"redirty")	\
 	EM(netfs_folio_trace_redirtied,		"redirtied")	\
 	EM(netfs_folio_trace_store,		"store")	\
-	E_(netfs_folio_trace_store_plus,	"store+")
+	EM(netfs_folio_trace_store_plus,	"store+")	\
+	EM(netfs_folio_trace_wthru,		"wthru")	\
+	E_(netfs_folio_trace_wthru_plus,	"wthru+")
 
 #ifndef __NETFS_DECLARE_TRACE_ENUMS_ONCE_ONLY
 #define __NETFS_DECLARE_TRACE_ENUMS_ONCE_ONLY
-- 
cgit v1.2.3


From 100ccd18bb41ea7abb4fbb419202c06079559501 Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Fri, 24 Nov 2023 13:39:02 +0000
Subject: netfs: Optimise away reads above the point at which there can be no
 data

Track the file position above which the server is not expected to have any
data (the "zero point") and preemptively assume that we can satisfy
requests by filling them with zeroes locally rather than attempting to
download them if they're over that line - even if we've written data back
to the server.  Assume that any data that was written back above that
position is held in the local cache.  Note that we have to split requests
that straddle the line.

Make use of this to optimise away some reads from the server.  We need to
set the zero point in the following circumstances:

 (1) When we see an extant remote inode and have no cache for it, we set
     the zero_point to i_size.

 (2) On local inode creation, we set zero_point to 0.

 (3) On local truncation down, we reduce zero_point to the new i_size if
     the new i_size is lower.

 (4) On local truncation up, we don't change zero_point.

 (5) On local modification, we don't change zero_point.

 (6) On remote invalidation, we set zero_point to the new i_size.

 (7) If stored data is discarded from the pagecache or culled from fscache,
     we must set zero_point above that if the data also got written to the
     server.

 (8) If dirty data is written back to the server, but not fscache, we must
     set zero_point above that.

 (9) If a direct I/O write is made, set zero_point above that.

Assuming the above, any read from the server at or above the zero_point
position will return all zeroes.

The zero_point value can be stored in the cache, provided the above rules
are applied to it by any code that culls part of the local cache.

Signed-off-by: David Howells <dhowells@redhat.com>
cc: Jeff Layton <jlayton@kernel.org>
cc: linux-cachefs@redhat.com
cc: linux-fsdevel@vger.kernel.org
cc: linux-mm@kvack.org
---
 fs/9p/vfs_inode.c         |  2 +-
 fs/afs/dynroot.c          |  2 +-
 fs/afs/inode.c            | 24 ++++++++++++++----------
 fs/ceph/inode.c           |  2 +-
 fs/netfs/buffered_write.c |  2 +-
 fs/netfs/direct_write.c   |  4 ++++
 fs/netfs/io.c             | 10 ++++++++++
 fs/netfs/misc.c           |  5 +++++
 fs/nfs/fscache.h          |  2 +-
 fs/smb/client/cifsfs.c    |  4 ++--
 include/linux/netfs.h     | 21 ++++++++++++++++++---
 11 files changed, 58 insertions(+), 20 deletions(-)

(limited to 'include/linux')

diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c
index 74122540e00f..df7ae381a708 100644
--- a/fs/9p/vfs_inode.c
+++ b/fs/9p/vfs_inode.c
@@ -249,7 +249,7 @@ void v9fs_free_inode(struct inode *inode)
 static void v9fs_set_netfs_context(struct inode *inode)
 {
 	struct v9fs_inode *v9inode = V9FS_I(inode);
-	netfs_inode_init(&v9inode->netfs, &v9fs_req_ops);
+	netfs_inode_init(&v9inode->netfs, &v9fs_req_ops, true);
 }
 
 int v9fs_init_inode(struct v9fs_session_info *v9ses,
diff --git a/fs/afs/dynroot.c b/fs/afs/dynroot.c
index 1f656005018e..9c517269ff95 100644
--- a/fs/afs/dynroot.c
+++ b/fs/afs/dynroot.c
@@ -76,7 +76,7 @@ struct inode *afs_iget_pseudo_dir(struct super_block *sb, bool root)
 	/* there shouldn't be an existing inode */
 	BUG_ON(!(inode->i_state & I_NEW));
 
-	netfs_inode_init(&vnode->netfs, NULL);
+	netfs_inode_init(&vnode->netfs, NULL, false);
 	inode->i_size		= 0;
 	inode->i_mode		= S_IFDIR | S_IRUGO | S_IXUGO;
 	if (root) {
diff --git a/fs/afs/inode.c b/fs/afs/inode.c
index 37485ae31471..381521e9e118 100644
--- a/fs/afs/inode.c
+++ b/fs/afs/inode.c
@@ -58,7 +58,7 @@ static noinline void dump_vnode(struct afs_vnode *vnode, struct afs_vnode *paren
  */
 static void afs_set_netfs_context(struct afs_vnode *vnode)
 {
-	netfs_inode_init(&vnode->netfs, &afs_req_ops);
+	netfs_inode_init(&vnode->netfs, &afs_req_ops, true);
 }
 
 /*
@@ -168,6 +168,7 @@ static void afs_apply_status(struct afs_operation *op,
 	struct inode *inode = &vnode->netfs.inode;
 	struct timespec64 t;
 	umode_t mode;
+	bool unexpected_jump = false;
 	bool data_changed = false;
 	bool change_size = vp->set_size;
 
@@ -231,6 +232,7 @@ static void afs_apply_status(struct afs_operation *op,
 		}
 		change_size = true;
 		data_changed = true;
+		unexpected_jump = true;
 	} else if (vnode->status.type == AFS_FTYPE_DIR) {
 		/* Expected directory change is handled elsewhere so
 		 * that we can locally edit the directory and save on a
@@ -252,6 +254,8 @@ static void afs_apply_status(struct afs_operation *op,
 		vnode->netfs.remote_i_size = status->size;
 		if (change_size || status->size > i_size_read(inode)) {
 			afs_set_i_size(vnode, status->size);
+			if (unexpected_jump)
+				vnode->netfs.zero_point = status->size;
 			inode_set_ctime_to_ts(inode, t);
 			inode_set_atime_to_ts(inode, t);
 		}
@@ -865,17 +869,17 @@ static void afs_setattr_success(struct afs_operation *op)
 static void afs_setattr_edit_file(struct afs_operation *op)
 {
 	struct afs_vnode_param *vp = &op->file[0];
-	struct inode *inode = &vp->vnode->netfs.inode;
+	struct afs_vnode *vnode = vp->vnode;
 
 	if (op->setattr.attr->ia_valid & ATTR_SIZE) {
 		loff_t size = op->setattr.attr->ia_size;
 		loff_t i_size = op->setattr.old_i_size;
 
-		if (size < i_size)
-			truncate_pagecache(inode, size);
-		if (size != i_size)
-			fscache_resize_cookie(afs_vnode_cache(vp->vnode),
-					      vp->scb.status.size);
+		if (size != i_size) {
+			truncate_setsize(&vnode->netfs.inode, size);
+			netfs_resize_file(&vnode->netfs, size, true);
+			fscache_resize_cookie(afs_vnode_cache(vnode), size);
+		}
 	}
 }
 
@@ -943,11 +947,11 @@ int afs_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
 		 */
 		if (!(attr->ia_valid & (supported & ~ATTR_SIZE & ~ATTR_MTIME)) &&
 		    attr->ia_size < i_size &&
-		    attr->ia_size > vnode->status.size) {
-			truncate_pagecache(inode, attr->ia_size);
+		    attr->ia_size > vnode->netfs.remote_i_size) {
+			truncate_setsize(inode, attr->ia_size);
+			netfs_resize_file(&vnode->netfs, size, false);
 			fscache_resize_cookie(afs_vnode_cache(vnode),
 					      attr->ia_size);
-			i_size_write(inode, attr->ia_size);
 			ret = 0;
 			goto out_unlock;
 		}
diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
index 3149d79a9dbe..0c25d326afc4 100644
--- a/fs/ceph/inode.c
+++ b/fs/ceph/inode.c
@@ -574,7 +574,7 @@ struct inode *ceph_alloc_inode(struct super_block *sb)
 	doutc(fsc->client, "%p\n", &ci->netfs.inode);
 
 	/* Set parameters for the netfs library */
-	netfs_inode_init(&ci->netfs, &ceph_netfs_ops);
+	netfs_inode_init(&ci->netfs, &ceph_netfs_ops, false);
 
 	spin_lock_init(&ci->i_ceph_lock);
 
diff --git a/fs/netfs/buffered_write.c b/fs/netfs/buffered_write.c
index 6ca6c4bde5eb..08f28800232c 100644
--- a/fs/netfs/buffered_write.c
+++ b/fs/netfs/buffered_write.c
@@ -73,7 +73,7 @@ static enum netfs_how_to_modify netfs_how_to_modify(struct netfs_inode *ctx,
 	if (folio_test_uptodate(folio))
 		return NETFS_FOLIO_IS_UPTODATE;
 
-	if (pos >= ctx->remote_i_size)
+	if (pos >= ctx->zero_point)
 		return NETFS_MODIFY_AND_CLEAR;
 
 	if (!maybe_trouble && offset == 0 && len >= flen)
diff --git a/fs/netfs/direct_write.c b/fs/netfs/direct_write.c
index bb0c2718f57b..aad05f2349a4 100644
--- a/fs/netfs/direct_write.c
+++ b/fs/netfs/direct_write.c
@@ -134,6 +134,7 @@ ssize_t netfs_unbuffered_write_iter(struct kiocb *iocb, struct iov_iter *from)
 	struct file *file = iocb->ki_filp;
 	struct inode *inode = file->f_mapping->host;
 	struct netfs_inode *ictx = netfs_inode(inode);
+	unsigned long long end;
 	ssize_t ret;
 
 	_enter("%llx,%zx,%llx", iocb->ki_pos, iov_iter_count(from), i_size_read(inode));
@@ -155,6 +156,9 @@ ssize_t netfs_unbuffered_write_iter(struct kiocb *iocb, struct iov_iter *from)
 	ret = kiocb_invalidate_pages(iocb, iov_iter_count(from));
 	if (ret < 0)
 		goto out;
+	end = iocb->ki_pos + iov_iter_count(from);
+	if (end > ictx->zero_point)
+		ictx->zero_point = end;
 
 	fscache_invalidate(netfs_i_cookie(ictx), NULL, i_size_read(inode),
 			   FSCACHE_INVAL_DIO_WRITE);
diff --git a/fs/netfs/io.c b/fs/netfs/io.c
index 14c18be5aca0..5b5af96cd4b9 100644
--- a/fs/netfs/io.c
+++ b/fs/netfs/io.c
@@ -569,6 +569,7 @@ netfs_rreq_prepare_read(struct netfs_io_request *rreq,
 			struct iov_iter *io_iter)
 {
 	enum netfs_io_source source = NETFS_DOWNLOAD_FROM_SERVER;
+	struct netfs_inode *ictx = netfs_inode(rreq->inode);
 	size_t lsize;
 
 	_enter("%llx-%llx,%llx", subreq->start, subreq->start + subreq->len, rreq->i_size);
@@ -586,6 +587,14 @@ netfs_rreq_prepare_read(struct netfs_io_request *rreq,
 		 * to make serial calls, it can indicate a short read and then
 		 * we will call it again.
 		 */
+		if (rreq->origin != NETFS_DIO_READ) {
+			if (subreq->start >= ictx->zero_point) {
+				source = NETFS_FILL_WITH_ZEROES;
+				goto set;
+			}
+			if (subreq->len > ictx->zero_point - subreq->start)
+				subreq->len = ictx->zero_point - subreq->start;
+		}
 		if (subreq->len > rreq->i_size - subreq->start)
 			subreq->len = rreq->i_size - subreq->start;
 		if (rreq->rsize && subreq->len > rreq->rsize)
@@ -607,6 +616,7 @@ netfs_rreq_prepare_read(struct netfs_io_request *rreq,
 		}
 	}
 
+set:
 	if (subreq->len > rreq->len)
 		pr_warn("R=%08x[%u] SREQ>RREQ %zx > %zx\n",
 			rreq->debug_id, subreq->debug_index,
diff --git a/fs/netfs/misc.c b/fs/netfs/misc.c
index eeb44abe59c5..0e3af37fc924 100644
--- a/fs/netfs/misc.c
+++ b/fs/netfs/misc.c
@@ -240,6 +240,11 @@ EXPORT_SYMBOL(netfs_invalidate_folio);
 bool netfs_release_folio(struct folio *folio, gfp_t gfp)
 {
 	struct netfs_inode *ctx = netfs_inode(folio_inode(folio));
+	unsigned long long end;
+
+	end = folio_pos(folio) + folio_size(folio);
+	if (end > ctx->zero_point)
+		ctx->zero_point = end;
 
 	if (folio_test_private(folio))
 		return false;
diff --git a/fs/nfs/fscache.h b/fs/nfs/fscache.h
index 5407ab8c8783..e3cb4923316b 100644
--- a/fs/nfs/fscache.h
+++ b/fs/nfs/fscache.h
@@ -80,7 +80,7 @@ static inline void nfs_netfs_put(struct nfs_netfs_io_data *netfs)
 }
 static inline void nfs_netfs_inode_init(struct nfs_inode *nfsi)
 {
-	netfs_inode_init(&nfsi->netfs, &nfs_netfs_ops);
+	netfs_inode_init(&nfsi->netfs, &nfs_netfs_ops, false);
 }
 extern void nfs_netfs_initiate_read(struct nfs_pgio_header *hdr);
 extern void nfs_netfs_read_completion(struct nfs_pgio_header *hdr);
diff --git a/fs/smb/client/cifsfs.c b/fs/smb/client/cifsfs.c
index 96a65cf9b5ec..07cd88897c33 100644
--- a/fs/smb/client/cifsfs.c
+++ b/fs/smb/client/cifsfs.c
@@ -1220,7 +1220,7 @@ static int cifs_precopy_set_eof(struct inode *src_inode, struct cifsInodeInfo *s
 	if (rc < 0)
 		goto set_failed;
 
-	netfs_resize_file(&src_cifsi->netfs, src_end);
+	netfs_resize_file(&src_cifsi->netfs, src_end, true);
 	fscache_resize_cookie(cifs_inode_cookie(src_inode), src_end);
 	return 0;
 
@@ -1351,7 +1351,7 @@ static loff_t cifs_remap_file_range(struct file *src_file, loff_t off,
 			smb_file_src, smb_file_target, off, len, destoff);
 		if (rc == 0 && new_size > i_size_read(target_inode)) {
 			truncate_setsize(target_inode, new_size);
-			netfs_resize_file(&target_cifsi->netfs, new_size);
+			netfs_resize_file(&target_cifsi->netfs, new_size, true);
 			fscache_resize_cookie(cifs_inode_cookie(target_inode),
 					      new_size);
 		}
diff --git a/include/linux/netfs.h b/include/linux/netfs.h
index 8a2dd882a781..852956aa3c4b 100644
--- a/include/linux/netfs.h
+++ b/include/linux/netfs.h
@@ -136,6 +136,8 @@ struct netfs_inode {
 	struct fscache_cookie	*cache;
 #endif
 	loff_t			remote_i_size;	/* Size of the remote file */
+	loff_t			zero_point;	/* Size after which we assume there's no data
+						 * on the server */
 	unsigned long		flags;
 #define NETFS_ICTX_ODIRECT	0		/* The file has DIO in progress */
 #define NETFS_ICTX_UNBUFFERED	1		/* I/O should not use the pagecache */
@@ -453,31 +455,44 @@ static inline struct netfs_inode *netfs_inode(struct inode *inode)
  * netfs_inode_init - Initialise a netfslib inode context
  * @ctx: The netfs inode to initialise
  * @ops: The netfs's operations list
+ * @use_zero_point: True to use the zero_point read optimisation
  *
  * Initialise the netfs library context struct.  This is expected to follow on
  * directly from the VFS inode struct.
  */
 static inline void netfs_inode_init(struct netfs_inode *ctx,
-				    const struct netfs_request_ops *ops)
+				    const struct netfs_request_ops *ops,
+				    bool use_zero_point)
 {
 	ctx->ops = ops;
 	ctx->remote_i_size = i_size_read(&ctx->inode);
+	ctx->zero_point = LLONG_MAX;
 	ctx->flags = 0;
 #if IS_ENABLED(CONFIG_FSCACHE)
 	ctx->cache = NULL;
 #endif
+	/* ->releasepage() drives zero_point */
+	if (use_zero_point) {
+		ctx->zero_point = ctx->remote_i_size;
+		mapping_set_release_always(ctx->inode.i_mapping);
+	}
 }
 
 /**
  * netfs_resize_file - Note that a file got resized
  * @ctx: The netfs inode being resized
  * @new_i_size: The new file size
+ * @changed_on_server: The change was applied to the server
  *
  * Inform the netfs lib that a file got resized so that it can adjust its state.
  */
-static inline void netfs_resize_file(struct netfs_inode *ctx, loff_t new_i_size)
+static inline void netfs_resize_file(struct netfs_inode *ctx, loff_t new_i_size,
+				     bool changed_on_server)
 {
-	ctx->remote_i_size = new_i_size;
+	if (changed_on_server)
+		ctx->remote_i_size = new_i_size;
+	if (new_i_size < ctx->zero_point)
+		ctx->zero_point = new_i_size;
 }
 
 /**
-- 
cgit v1.2.3


From c39e2ae3943d4ee278af4e1b1dcfd5946da1089b Mon Sep 17 00:00:00 2001
From: Vegard Nossum <vegard.nossum@oracle.com>
Date: Thu, 28 Dec 2023 11:06:08 +0100
Subject: fs: fix __sb_write_started() kerneldoc formatting

When running 'make htmldocs', I see the following warning:

  Documentation/filesystems/api-summary:14: ./include/linux/fs.h:1659: WARNING: Definition list ends without a blank line; unexpected unindent.

The official guidance [1] seems to be to use lists, which will prevent
both the "unexpected unindent" warning as well as ensure that each line
is formatted on a separate line in the HTML output instead of being
all considered a single paragraph.

[1]: https://docs.kernel.org/doc-guide/kernel-doc.html#return-values

Fixes: 8802e580ee64 ("fs: create __sb_write_started() helper")
Cc: Amir Goldstein <amir73il@gmail.com>
Cc: Josef Bacik <josef@toxicpanda.com>
Cc: Jan Kara <jack@suse.cz>
Signed-off-by: Vegard Nossum <vegard.nossum@oracle.com>
Link: https://lore.kernel.org/r/20231228100608.3123987-1-vegard.nossum@oracle.com
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 include/linux/fs.h | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/fs.h b/include/linux/fs.h
index db5d07e6e02e..473063f385e5 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1650,9 +1650,9 @@ static inline bool __sb_start_write_trylock(struct super_block *sb, int level)
  * @sb: the super we write to
  * @level: the freeze level
  *
- * > 0 sb freeze level is held
- *   0 sb freeze level is not held
- * < 0 !CONFIG_LOCKDEP/LOCK_STATE_UNKNOWN
+ * * > 0 - sb freeze level is held
+ * *   0 - sb freeze level is not held
+ * * < 0 - !CONFIG_LOCKDEP/LOCK_STATE_UNKNOWN
  */
 static inline int __sb_write_started(const struct super_block *sb, int level)
 {
-- 
cgit v1.2.3


From 0b68ab50b8101a35b51fb9ec203cd988e47dbed3 Mon Sep 17 00:00:00 2001
From: Thomas Weißschuh <linux@weissschuh.net>
Date: Sat, 23 Dec 2023 14:53:47 +0100
Subject: sysctl: delete unused define SYSCTL_PERM_EMPTY_DIR
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

It seems it was never used.

Fixes: 2f2665c13af4 ("sysctl: replace child with an enumeration")
Signed-off-by: Thomas Weißschuh <linux@weissschuh.net>
Signed-off-by: Luis Chamberlain <mcgrof@kernel.org>
---
 include/linux/sysctl.h | 2 --
 1 file changed, 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h
index 61b40ea81f4d..26a38161c28f 100644
--- a/include/linux/sysctl.h
+++ b/include/linux/sysctl.h
@@ -255,8 +255,6 @@ extern int unaligned_enabled;
 extern int unaligned_dump_stack;
 extern int no_unaligned_warning;
 
-#define SYSCTL_PERM_EMPTY_DIR	(1 << 0)
-
 #else /* CONFIG_SYSCTL */
 
 static inline void register_sysctl_init(const char *path, struct ctl_table *table)
-- 
cgit v1.2.3


From 561429807d50aad76f1205b0b1d7b4aacf365d4e Mon Sep 17 00:00:00 2001
From: Thomas Weißschuh <linux@weissschuh.net>
Date: Wed, 20 Dec 2023 22:23:35 +0100
Subject: sysctl: remove struct ctl_path
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

All usages of this struct have been removed from the kernel tree.

The struct is still referenced by scripts/check-sysctl-docs but that
script is broken anyways as it only supports the register_sysctl_paths()
API and not the currently used register_sysctl() one.

Fixes: 0199849acd07 ("sysctl: remove register_sysctl_paths()")
Signed-off-by: Thomas Weißschuh <linux@weissschuh.net>
Reviewed-by: Joel Granados <j.granados@samsung.com>
Signed-off-by: Luis Chamberlain <mcgrof@kernel.org>
---
 include/linux/sysctl.h | 5 -----
 1 file changed, 5 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h
index 26a38161c28f..ee7d33b89e9e 100644
--- a/include/linux/sysctl.h
+++ b/include/linux/sysctl.h
@@ -210,11 +210,6 @@ struct ctl_table_root {
 	int (*permissions)(struct ctl_table_header *head, struct ctl_table *table);
 };
 
-/* struct ctl_path describes where in the hierarchy a table is added */
-struct ctl_path {
-	const char *procname;
-};
-
 #define register_sysctl(path, table)	\
 	register_sysctl_sz(path, table, ARRAY_SIZE(table))
 
-- 
cgit v1.2.3


From 4e814173a8c4f432fd068b1c796f0416328c9d99 Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rafael.j.wysocki@intel.com>
Date: Mon, 18 Dec 2023 20:25:02 +0100
Subject: thermal: core: Fix thermal zone suspend-resume synchronization

There are 3 synchronization issues with thermal zone suspend-resume
during system-wide transitions:

 1. The resume code runs in a PM notifier which is invoked after user
    space has been thawed, so it can run concurrently with user space
    which can trigger a thermal zone device removal.  If that happens,
    the thermal zone resume code may use a stale pointer to the next
    list element and crash, because it does not hold thermal_list_lock
    while walking thermal_tz_list.

 2. The thermal zone resume code calls thermal_zone_device_init()
    outside the zone lock, so user space or an update triggered by
    the platform firmware may see an inconsistent state of a
    thermal zone leading to unexpected behavior.

 3. Clearing the in_suspend global variable in thermal_pm_notify()
    allows __thermal_zone_device_update() to continue for all thermal
    zones and it may as well run before the thermal_tz_list walk (or
    at any point during the list walk for that matter) and attempt to
    operate on a thermal zone that has not been resumed yet.  It may
    also race destructively with thermal_zone_device_init().

To address these issues, add thermal_list_lock locking to
thermal_pm_notify(), especially arount the thermal_tz_list,
make it call thermal_zone_device_init() back-to-back with
__thermal_zone_device_update() under the zone lock and replace
in_suspend with per-zone bool "suspend" indicators set and unset
under the given zone's lock.

Link: https://lore.kernel.org/linux-pm/20231218162348.69101-1-bo.ye@mediatek.com/
Reported-by: Bo Ye <bo.ye@mediatek.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/thermal/thermal_core.c | 30 +++++++++++++++++++++++-------
 include/linux/thermal.h        |  2 ++
 2 files changed, 25 insertions(+), 7 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/thermal/thermal_core.c b/drivers/thermal/thermal_core.c
index 5e5fcbd81dda..7456335efaaa 100644
--- a/drivers/thermal/thermal_core.c
+++ b/drivers/thermal/thermal_core.c
@@ -37,8 +37,6 @@ static LIST_HEAD(thermal_governor_list);
 static DEFINE_MUTEX(thermal_list_lock);
 static DEFINE_MUTEX(thermal_governor_lock);
 
-static atomic_t in_suspend;
-
 static struct thermal_governor *def_governor;
 
 /*
@@ -431,7 +429,7 @@ void __thermal_zone_device_update(struct thermal_zone_device *tz,
 {
 	struct thermal_trip *trip;
 
-	if (atomic_read(&in_suspend))
+	if (tz->suspended)
 		return;
 
 	if (!thermal_zone_device_is_enabled(tz))
@@ -1542,17 +1540,35 @@ static int thermal_pm_notify(struct notifier_block *nb,
 	case PM_HIBERNATION_PREPARE:
 	case PM_RESTORE_PREPARE:
 	case PM_SUSPEND_PREPARE:
-		atomic_set(&in_suspend, 1);
+		mutex_lock(&thermal_list_lock);
+
+		list_for_each_entry(tz, &thermal_tz_list, node) {
+			mutex_lock(&tz->lock);
+
+			tz->suspended = true;
+
+			mutex_unlock(&tz->lock);
+		}
+
+		mutex_unlock(&thermal_list_lock);
 		break;
 	case PM_POST_HIBERNATION:
 	case PM_POST_RESTORE:
 	case PM_POST_SUSPEND:
-		atomic_set(&in_suspend, 0);
+		mutex_lock(&thermal_list_lock);
+
 		list_for_each_entry(tz, &thermal_tz_list, node) {
+			mutex_lock(&tz->lock);
+
+			tz->suspended = false;
+
 			thermal_zone_device_init(tz);
-			thermal_zone_device_update(tz,
-						   THERMAL_EVENT_UNSPECIFIED);
+			__thermal_zone_device_update(tz, THERMAL_EVENT_UNSPECIFIED);
+
+			mutex_unlock(&tz->lock);
 		}
+
+		mutex_unlock(&thermal_list_lock);
 		break;
 	default:
 		break;
diff --git a/include/linux/thermal.h b/include/linux/thermal.h
index 09f6eb82c191..d00622b64d50 100644
--- a/include/linux/thermal.h
+++ b/include/linux/thermal.h
@@ -152,6 +152,7 @@ struct thermal_cooling_device {
  * @node:	node in thermal_tz_list (in thermal_core.c)
  * @poll_queue:	delayed work for polling
  * @notify_event: Last notification event
+ * @suspended: thermal zone suspend indicator
  */
 struct thermal_zone_device {
 	int id;
@@ -185,6 +186,7 @@ struct thermal_zone_device {
 	struct list_head node;
 	struct delayed_work poll_queue;
 	enum thermal_notify_event notify_event;
+	bool suspended;
 };
 
 /**
-- 
cgit v1.2.3


From 753547de0daecbdbd1af3618987ddade325d9aaa Mon Sep 17 00:00:00 2001
From: Helge Deller <deller@gmx.de>
Date: Thu, 28 Dec 2023 11:36:03 +0100
Subject: linux/export: Ensure natural alignment of kcrctab array

The ___kcrctab section holds an array of 32-bit CRC values.
Add a .balign 4 to tell the linker the correct memory alignment.

Fixes: f3304ecd7f06 ("linux/export: use inline assembler to populate symbol CRCs")
Signed-off-by: Helge Deller <deller@gmx.de>
Signed-off-by: Masahiro Yamada <masahiroy@kernel.org>
---
 include/linux/export-internal.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/export-internal.h b/include/linux/export-internal.h
index cd253eb51d6c..d445705ac13c 100644
--- a/include/linux/export-internal.h
+++ b/include/linux/export-internal.h
@@ -64,6 +64,7 @@
 
 #define SYMBOL_CRC(sym, crc, sec)   \
 	asm(".section \"___kcrctab" sec "+" #sym "\",\"a\""	"\n" \
+	    ".balign 4"						"\n" \
 	    "__crc_" #sym ":"					"\n" \
 	    ".long " #crc					"\n" \
 	    ".previous"						"\n")
-- 
cgit v1.2.3


From 1271ca00aa7f9bb3fd94cb7ac8f654de71099580 Mon Sep 17 00:00:00 2001
From: Jonathan Corbet <corbet@lwn.net>
Date: Tue, 19 Dec 2023 16:55:31 -0700
Subject: ethtool: reformat kerneldoc for struct ethtool_fec_stats

The kerneldoc comment for struct ethtool_fec_stats attempts to describe the
"total" and "lanes" fields of the ethtool_fec_stat substructure in a way
leading to these warnings:

  ./include/linux/ethtool.h:424: warning: Excess struct member 'lane' description in 'ethtool_fec_stats'
  ./include/linux/ethtool.h:424: warning: Excess struct member 'total' description in 'ethtool_fec_stats'

Reformat the comment to retain the information while eliminating the
warnings.

Signed-off-by: Jonathan Corbet <corbet@lwn.net>
Reviewed-by: Randy Dunlap <rdunlap@infradead.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/ethtool.h | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/ethtool.h b/include/linux/ethtool.h
index cfcd952a1d4f..325e0778e937 100644
--- a/include/linux/ethtool.h
+++ b/include/linux/ethtool.h
@@ -411,8 +411,10 @@ struct ethtool_pause_stats {
  *	not entire FEC data blocks. This is a non-standard statistic.
  *	Reported to user space as %ETHTOOL_A_FEC_STAT_CORR_BITS.
  *
- * @lane: per-lane/PCS-instance counts as defined by the standard
- * @total: error counts for the entire port, for drivers incapable of reporting
+ * For each of the above fields, the two substructure members are:
+ *
+ * - @lanes: per-lane/PCS-instance counts as defined by the standard
+ * - @total: error counts for the entire port, for drivers incapable of reporting
  *	per-lane stats
  *
  * Drivers should fill in either only total or per-lane statistics, core
-- 
cgit v1.2.3


From a8c959402d4dd6823918b33828d79900ae58c700 Mon Sep 17 00:00:00 2001
From: Lukasz Luba <lukasz.luba@arm.com>
Date: Wed, 20 Dec 2023 23:17:45 +0000
Subject: thermal: core: Add governor callback for thermal zone change

Add a new callback to the struct thermal_governor. It can be used for
updating governors when there is a change in the thermal zone internals,
e.g. thermal cooling device is bind to the thermal zone.

That makes possible to move some heavy operations like memory allocations
related to the number of cooling instances out of the throttle() callback.

Both callback code paths (throttle() and update_tz()) are protected with
the same thermal zone lock, which guaranties the consistency.

Signed-off-by: Lukasz Luba <lukasz.luba@arm.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/thermal/thermal_core.c | 14 ++++++++++++++
 drivers/thermal/thermal_core.h |  2 ++
 include/linux/thermal.h        |  6 ++++++
 3 files changed, 22 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/thermal/thermal_core.c b/drivers/thermal/thermal_core.c
index 3ffccd73b19e..58958288b559 100644
--- a/drivers/thermal/thermal_core.c
+++ b/drivers/thermal/thermal_core.c
@@ -309,6 +309,15 @@ static void handle_non_critical_trips(struct thermal_zone_device *tz,
 		       def_governor->throttle(tz, trip);
 }
 
+void thermal_governor_update_tz(struct thermal_zone_device *tz,
+				enum thermal_notify_event reason)
+{
+	if (!tz->governor || !tz->governor->update_tz)
+		return;
+
+	tz->governor->update_tz(tz, reason);
+}
+
 void thermal_zone_device_critical(struct thermal_zone_device *tz)
 {
 	/*
@@ -715,6 +724,8 @@ int thermal_bind_cdev_to_trip(struct thermal_zone_device *tz,
 		list_add_tail(&dev->tz_node, &tz->thermal_instances);
 		list_add_tail(&dev->cdev_node, &cdev->thermal_instances);
 		atomic_set(&tz->need_update, 1);
+
+		thermal_governor_update_tz(tz, THERMAL_TZ_BIND_CDEV);
 	}
 	mutex_unlock(&cdev->lock);
 	mutex_unlock(&tz->lock);
@@ -773,6 +784,9 @@ int thermal_unbind_cdev_from_trip(struct thermal_zone_device *tz,
 		if (pos->tz == tz && pos->trip == trip && pos->cdev == cdev) {
 			list_del(&pos->tz_node);
 			list_del(&pos->cdev_node);
+
+			thermal_governor_update_tz(tz, THERMAL_TZ_UNBIND_CDEV);
+
 			mutex_unlock(&cdev->lock);
 			mutex_unlock(&tz->lock);
 			goto unbind;
diff --git a/drivers/thermal/thermal_core.h b/drivers/thermal/thermal_core.h
index fe2917a74054..479c3b6917e4 100644
--- a/drivers/thermal/thermal_core.h
+++ b/drivers/thermal/thermal_core.h
@@ -114,6 +114,8 @@ int thermal_zone_device_set_policy(struct thermal_zone_device *, char *);
 int thermal_build_list_of_policies(char *buf);
 void __thermal_zone_device_update(struct thermal_zone_device *tz,
 				  enum thermal_notify_event event);
+void thermal_governor_update_tz(struct thermal_zone_device *tz,
+				enum thermal_notify_event reason);
 
 /* Helpers */
 #define for_each_trip(__tz, __trip)	\
diff --git a/include/linux/thermal.h b/include/linux/thermal.h
index d00622b64d50..4d96fefb2767 100644
--- a/include/linux/thermal.h
+++ b/include/linux/thermal.h
@@ -51,6 +51,8 @@ enum thermal_notify_event {
 	THERMAL_DEVICE_POWER_CAPABILITY_CHANGED, /* power capability changed */
 	THERMAL_TABLE_CHANGED, /* Thermal table(s) changed */
 	THERMAL_EVENT_KEEP_ALIVE, /* Request for user space handler to respond */
+	THERMAL_TZ_BIND_CDEV, /* Cooling dev is bind to the thermal zone */
+	THERMAL_TZ_UNBIND_CDEV, /* Cooling dev is unbind from the thermal zone */
 };
 
 /**
@@ -199,6 +201,8 @@ struct thermal_zone_device {
  *			thermal zone.
  * @throttle:	callback called for every trip point even if temperature is
  *		below the trip point temperature
+ * @update_tz:	callback called when thermal zone internals have changed, e.g.
+ *		thermal cooling instance was added/removed
  * @governor_list:	node in thermal_governor_list (in thermal_core.c)
  */
 struct thermal_governor {
@@ -207,6 +211,8 @@ struct thermal_governor {
 	void (*unbind_from_tz)(struct thermal_zone_device *tz);
 	int (*throttle)(struct thermal_zone_device *tz,
 			const struct thermal_trip *trip);
+	void (*update_tz)(struct thermal_zone_device *tz,
+			  enum thermal_notify_event reason);
 	struct list_head	governor_list;
 };
 
-- 
cgit v1.2.3


From bfc57bd1685981730bfe9802d9de7603a0a43bc4 Mon Sep 17 00:00:00 2001
From: Lukasz Luba <lukasz.luba@arm.com>
Date: Wed, 20 Dec 2023 23:17:52 +0000
Subject: thermal/sysfs: Update governors when the 'weight' has changed

Support governors update when the thermal instance's weight has changed.
This allows to adjust internal state for the governor.

Signed-off-by: Lukasz Luba <lukasz.luba@arm.com>
[ rjw: Add two empty code lines aroung the locking ]
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/thermal/thermal_sysfs.c | 5 +++++
 include/linux/thermal.h         | 1 +
 2 files changed, 6 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/thermal/thermal_sysfs.c b/drivers/thermal/thermal_sysfs.c
index 299c0fb16593..f4033865b093 100644
--- a/drivers/thermal/thermal_sysfs.c
+++ b/drivers/thermal/thermal_sysfs.c
@@ -939,7 +939,12 @@ ssize_t weight_store(struct device *dev, struct device_attribute *attr,
 
 	/* Don't race with governors using the 'weight' value */
 	mutex_lock(&instance->tz->lock);
+
 	instance->weight = weight;
+
+	thermal_governor_update_tz(instance->tz,
+				   THERMAL_INSTANCE_WEIGHT_CHANGED);
+
 	mutex_unlock(&instance->tz->lock);
 
 	return count;
diff --git a/include/linux/thermal.h b/include/linux/thermal.h
index 4d96fefb2767..9d0427da32af 100644
--- a/include/linux/thermal.h
+++ b/include/linux/thermal.h
@@ -53,6 +53,7 @@ enum thermal_notify_event {
 	THERMAL_EVENT_KEEP_ALIVE, /* Request for user space handler to respond */
 	THERMAL_TZ_BIND_CDEV, /* Cooling dev is bind to the thermal zone */
 	THERMAL_TZ_UNBIND_CDEV, /* Cooling dev is unbind from the thermal zone */
+	THERMAL_INSTANCE_WEIGHT_CHANGED, /* Thermal instance weight changed */
 };
 
 /**
-- 
cgit v1.2.3


From adef440691bab824e39c1b17382322d195e1fab0 Mon Sep 17 00:00:00 2001
From: Andrea Arcangeli <aarcange@redhat.com>
Date: Wed, 6 Dec 2023 02:36:56 -0800
Subject: userfaultfd: UFFDIO_MOVE uABI

Implement the uABI of UFFDIO_MOVE ioctl.
UFFDIO_COPY performs ~20% better than UFFDIO_MOVE when the application
needs pages to be allocated [1]. However, with UFFDIO_MOVE, if pages are
available (in userspace) for recycling, as is usually the case in heap
compaction algorithms, then we can avoid the page allocation and memcpy
(done by UFFDIO_COPY). Also, since the pages are recycled in the
userspace, we avoid the need to release (via madvise) the pages back to
the kernel [2].

We see over 40% reduction (on a Google pixel 6 device) in the compacting
thread's completion time by using UFFDIO_MOVE vs.  UFFDIO_COPY.  This was
measured using a benchmark that emulates a heap compaction implementation
using userfaultfd (to allow concurrent accesses by application threads).
More details of the usecase are explained in [2].  Furthermore,
UFFDIO_MOVE enables moving swapped-out pages without touching them within
the same vma.  Today, it can only be done by mremap, however it forces
splitting the vma.

[1] https://lore.kernel.org/all/1425575884-2574-1-git-send-email-aarcange@redhat.com/
[2] https://lore.kernel.org/linux-mm/CA+EESO4uO84SSnBhArH4HvLNhaUQ5nZKNKXqxRCyjniNVjp0Aw@mail.gmail.com/

Update for the ioctl_userfaultfd(2)  manpage:

   UFFDIO_MOVE
       (Since Linux xxx)  Move a continuous memory chunk into the
       userfault registered range and optionally wake up the blocked
       thread. The source and destination addresses and the number of
       bytes to move are specified by the src, dst, and len fields of
       the uffdio_move structure pointed to by argp:

           struct uffdio_move {
               __u64 dst;    /* Destination of move */
               __u64 src;    /* Source of move */
               __u64 len;    /* Number of bytes to move */
               __u64 mode;   /* Flags controlling behavior of move */
               __s64 move;   /* Number of bytes moved, or negated error */
           };

       The following value may be bitwise ORed in mode to change the
       behavior of the UFFDIO_MOVE operation:

       UFFDIO_MOVE_MODE_DONTWAKE
              Do not wake up the thread that waits for page-fault
              resolution

       UFFDIO_MOVE_MODE_ALLOW_SRC_HOLES
              Allow holes in the source virtual range that is being moved.
              When not specified, the holes will result in ENOENT error.
              When specified, the holes will be accounted as successfully
              moved memory. This is mostly useful to move hugepage aligned
              virtual regions without knowing if there are transparent
              hugepages in the regions or not, but preventing the risk of
              having to split the hugepage during the operation.

       The move field is used by the kernel to return the number of
       bytes that was actually moved, or an error (a negated errno-
       style value).  If the value returned in move doesn't match the
       value that was specified in len, the operation fails with the
       error EAGAIN.  The move field is output-only; it is not read by
       the UFFDIO_MOVE operation.

       The operation may fail for various reasons. Usually, remapping of
       pages that are not exclusive to the given process fail; once KSM
       might deduplicate pages or fork() COW-shares pages during fork()
       with child processes, they are no longer exclusive. Further, the
       kernel might only perform lightweight checks for detecting whether
       the pages are exclusive, and return -EBUSY in case that check fails.
       To make the operation more likely to succeed, KSM should be
       disabled, fork() should be avoided or MADV_DONTFORK should be
       configured for the source VMA before fork().

       This ioctl(2) operation returns 0 on success.  In this case, the
       entire area was moved.  On error, -1 is returned and errno is
       set to indicate the error.  Possible errors include:

       EAGAIN The number of bytes moved (i.e., the value returned in
              the move field) does not equal the value that was
              specified in the len field.

       EINVAL Either dst or len was not a multiple of the system page
              size, or the range specified by src and len or dst and len
              was invalid.

       EINVAL An invalid bit was specified in the mode field.

       ENOENT
              The source virtual memory range has unmapped holes and
              UFFDIO_MOVE_MODE_ALLOW_SRC_HOLES is not set.

       EEXIST
              The destination virtual memory range is fully or partially
              mapped.

       EBUSY
              The pages in the source virtual memory range are either
              pinned or not exclusive to the process. The kernel might
              only perform lightweight checks for detecting whether the
              pages are exclusive. To make the operation more likely to
              succeed, KSM should be disabled, fork() should be avoided
              or MADV_DONTFORK should be configured for the source virtual
              memory area before fork().

       ENOMEM Allocating memory needed for the operation failed.

       ESRCH
              The target process has exited at the time of a UFFDIO_MOVE
              operation.

Link: https://lkml.kernel.org/r/20231206103702.3873743-3-surenb@google.com
Signed-off-by: Andrea Arcangeli <aarcange@redhat.com>
Signed-off-by: Suren Baghdasaryan <surenb@google.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Axel Rasmussen <axelrasmussen@google.com>
Cc: Brian Geffon <bgeffon@google.com>
Cc: Christian Brauner <brauner@kernel.org>
Cc: David Hildenbrand <david@redhat.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Jann Horn <jannh@google.com>
Cc: Kalesh Singh <kaleshsingh@google.com>
Cc: Liam R. Howlett <Liam.Howlett@oracle.com>
Cc: Lokesh Gidra <lokeshgidra@google.com>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Mike Rapoport (IBM) <rppt@kernel.org>
Cc: Nicolas Geoffray <ngeoffray@google.com>
Cc: Peter Xu <peterx@redhat.com>
Cc: Ryan Roberts <ryan.roberts@arm.com>
Cc: Shuah Khan <shuah@kernel.org>
Cc: ZhangPeng <zhangpeng362@huawei.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 Documentation/admin-guide/mm/userfaultfd.rst |   3 +
 fs/userfaultfd.c                             |  72 ++++
 include/linux/rmap.h                         |   5 +
 include/linux/userfaultfd_k.h                |  11 +
 include/uapi/linux/userfaultfd.h             |  29 +-
 mm/huge_memory.c                             | 122 ++++++
 mm/khugepaged.c                              |   3 +
 mm/rmap.c                                    |   6 +
 mm/userfaultfd.c                             | 614 +++++++++++++++++++++++++++
 9 files changed, 864 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/Documentation/admin-guide/mm/userfaultfd.rst b/Documentation/admin-guide/mm/userfaultfd.rst
index 203e26da5f92..e5cc8848dcb3 100644
--- a/Documentation/admin-guide/mm/userfaultfd.rst
+++ b/Documentation/admin-guide/mm/userfaultfd.rst
@@ -113,6 +113,9 @@ events, except page fault notifications, may be generated:
   areas. ``UFFD_FEATURE_MINOR_SHMEM`` is the analogous feature indicating
   support for shmem virtual memory areas.
 
+- ``UFFD_FEATURE_MOVE`` indicates that the kernel supports moving an
+  existing page contents from userspace.
+
 The userland application should set the feature flags it intends to use
 when invoking the ``UFFDIO_API`` ioctl, to request that those features be
 enabled if supported.
diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c
index e8af40b05549..6e2a4d6a0d8f 100644
--- a/fs/userfaultfd.c
+++ b/fs/userfaultfd.c
@@ -2005,6 +2005,75 @@ static inline unsigned int uffd_ctx_features(__u64 user_features)
 	return (unsigned int)user_features | UFFD_FEATURE_INITIALIZED;
 }
 
+static int userfaultfd_move(struct userfaultfd_ctx *ctx,
+			    unsigned long arg)
+{
+	__s64 ret;
+	struct uffdio_move uffdio_move;
+	struct uffdio_move __user *user_uffdio_move;
+	struct userfaultfd_wake_range range;
+	struct mm_struct *mm = ctx->mm;
+
+	user_uffdio_move = (struct uffdio_move __user *) arg;
+
+	if (atomic_read(&ctx->mmap_changing))
+		return -EAGAIN;
+
+	if (copy_from_user(&uffdio_move, user_uffdio_move,
+			   /* don't copy "move" last field */
+			   sizeof(uffdio_move)-sizeof(__s64)))
+		return -EFAULT;
+
+	/* Do not allow cross-mm moves. */
+	if (mm != current->mm)
+		return -EINVAL;
+
+	ret = validate_range(mm, uffdio_move.dst, uffdio_move.len);
+	if (ret)
+		return ret;
+
+	ret = validate_range(mm, uffdio_move.src, uffdio_move.len);
+	if (ret)
+		return ret;
+
+	if (uffdio_move.mode & ~(UFFDIO_MOVE_MODE_ALLOW_SRC_HOLES|
+				  UFFDIO_MOVE_MODE_DONTWAKE))
+		return -EINVAL;
+
+	if (mmget_not_zero(mm)) {
+		mmap_read_lock(mm);
+
+		/* Re-check after taking mmap_lock */
+		if (likely(!atomic_read(&ctx->mmap_changing)))
+			ret = move_pages(ctx, mm, uffdio_move.dst, uffdio_move.src,
+					 uffdio_move.len, uffdio_move.mode);
+		else
+			ret = -EINVAL;
+
+		mmap_read_unlock(mm);
+		mmput(mm);
+	} else {
+		return -ESRCH;
+	}
+
+	if (unlikely(put_user(ret, &user_uffdio_move->move)))
+		return -EFAULT;
+	if (ret < 0)
+		goto out;
+
+	/* len == 0 would wake all */
+	VM_WARN_ON(!ret);
+	range.len = ret;
+	if (!(uffdio_move.mode & UFFDIO_MOVE_MODE_DONTWAKE)) {
+		range.start = uffdio_move.dst;
+		wake_userfault(ctx, &range);
+	}
+	ret = range.len == uffdio_move.len ? 0 : -EAGAIN;
+
+out:
+	return ret;
+}
+
 /*
  * userland asks for a certain API version and we return which bits
  * and ioctl commands are implemented in this kernel for such API
@@ -2097,6 +2166,9 @@ static long userfaultfd_ioctl(struct file *file, unsigned cmd,
 	case UFFDIO_ZEROPAGE:
 		ret = userfaultfd_zeropage(ctx, arg);
 		break;
+	case UFFDIO_MOVE:
+		ret = userfaultfd_move(ctx, arg);
+		break;
 	case UFFDIO_WRITEPROTECT:
 		ret = userfaultfd_writeprotect(ctx, arg);
 		break;
diff --git a/include/linux/rmap.h b/include/linux/rmap.h
index 3c2fc291b071..af6a32b6f3e7 100644
--- a/include/linux/rmap.h
+++ b/include/linux/rmap.h
@@ -121,6 +121,11 @@ static inline void anon_vma_lock_write(struct anon_vma *anon_vma)
 	down_write(&anon_vma->root->rwsem);
 }
 
+static inline int anon_vma_trylock_write(struct anon_vma *anon_vma)
+{
+	return down_write_trylock(&anon_vma->root->rwsem);
+}
+
 static inline void anon_vma_unlock_write(struct anon_vma *anon_vma)
 {
 	up_write(&anon_vma->root->rwsem);
diff --git a/include/linux/userfaultfd_k.h b/include/linux/userfaultfd_k.h
index f2dc19f40d05..e4056547fbe6 100644
--- a/include/linux/userfaultfd_k.h
+++ b/include/linux/userfaultfd_k.h
@@ -93,6 +93,17 @@ extern int mwriteprotect_range(struct mm_struct *dst_mm,
 extern long uffd_wp_range(struct vm_area_struct *vma,
 			  unsigned long start, unsigned long len, bool enable_wp);
 
+/* move_pages */
+void double_pt_lock(spinlock_t *ptl1, spinlock_t *ptl2);
+void double_pt_unlock(spinlock_t *ptl1, spinlock_t *ptl2);
+ssize_t move_pages(struct userfaultfd_ctx *ctx, struct mm_struct *mm,
+		   unsigned long dst_start, unsigned long src_start,
+		   unsigned long len, __u64 flags);
+int move_pages_huge_pmd(struct mm_struct *mm, pmd_t *dst_pmd, pmd_t *src_pmd, pmd_t dst_pmdval,
+			struct vm_area_struct *dst_vma,
+			struct vm_area_struct *src_vma,
+			unsigned long dst_addr, unsigned long src_addr);
+
 /* mm helpers */
 static inline bool is_mergeable_vm_userfaultfd_ctx(struct vm_area_struct *vma,
 					struct vm_userfaultfd_ctx vm_ctx)
diff --git a/include/uapi/linux/userfaultfd.h b/include/uapi/linux/userfaultfd.h
index 0dbc81015018..2841e4ea8f2c 100644
--- a/include/uapi/linux/userfaultfd.h
+++ b/include/uapi/linux/userfaultfd.h
@@ -41,7 +41,8 @@
 			   UFFD_FEATURE_WP_HUGETLBFS_SHMEM |	\
 			   UFFD_FEATURE_WP_UNPOPULATED |	\
 			   UFFD_FEATURE_POISON |		\
-			   UFFD_FEATURE_WP_ASYNC)
+			   UFFD_FEATURE_WP_ASYNC |		\
+			   UFFD_FEATURE_MOVE)
 #define UFFD_API_IOCTLS				\
 	((__u64)1 << _UFFDIO_REGISTER |		\
 	 (__u64)1 << _UFFDIO_UNREGISTER |	\
@@ -50,6 +51,7 @@
 	((__u64)1 << _UFFDIO_WAKE |		\
 	 (__u64)1 << _UFFDIO_COPY |		\
 	 (__u64)1 << _UFFDIO_ZEROPAGE |		\
+	 (__u64)1 << _UFFDIO_MOVE |		\
 	 (__u64)1 << _UFFDIO_WRITEPROTECT |	\
 	 (__u64)1 << _UFFDIO_CONTINUE |		\
 	 (__u64)1 << _UFFDIO_POISON)
@@ -73,6 +75,7 @@
 #define _UFFDIO_WAKE			(0x02)
 #define _UFFDIO_COPY			(0x03)
 #define _UFFDIO_ZEROPAGE		(0x04)
+#define _UFFDIO_MOVE			(0x05)
 #define _UFFDIO_WRITEPROTECT		(0x06)
 #define _UFFDIO_CONTINUE		(0x07)
 #define _UFFDIO_POISON			(0x08)
@@ -92,6 +95,8 @@
 				      struct uffdio_copy)
 #define UFFDIO_ZEROPAGE		_IOWR(UFFDIO, _UFFDIO_ZEROPAGE,	\
 				      struct uffdio_zeropage)
+#define UFFDIO_MOVE		_IOWR(UFFDIO, _UFFDIO_MOVE,	\
+				      struct uffdio_move)
 #define UFFDIO_WRITEPROTECT	_IOWR(UFFDIO, _UFFDIO_WRITEPROTECT, \
 				      struct uffdio_writeprotect)
 #define UFFDIO_CONTINUE		_IOWR(UFFDIO, _UFFDIO_CONTINUE,	\
@@ -222,6 +227,9 @@ struct uffdio_api {
 	 * asynchronous mode is supported in which the write fault is
 	 * automatically resolved and write-protection is un-set.
 	 * It implies UFFD_FEATURE_WP_UNPOPULATED.
+	 *
+	 * UFFD_FEATURE_MOVE indicates that the kernel supports moving an
+	 * existing page contents from userspace.
 	 */
 #define UFFD_FEATURE_PAGEFAULT_FLAG_WP		(1<<0)
 #define UFFD_FEATURE_EVENT_FORK			(1<<1)
@@ -239,6 +247,7 @@ struct uffdio_api {
 #define UFFD_FEATURE_WP_UNPOPULATED		(1<<13)
 #define UFFD_FEATURE_POISON			(1<<14)
 #define UFFD_FEATURE_WP_ASYNC			(1<<15)
+#define UFFD_FEATURE_MOVE			(1<<16)
 	__u64 features;
 
 	__u64 ioctls;
@@ -347,6 +356,24 @@ struct uffdio_poison {
 	__s64 updated;
 };
 
+struct uffdio_move {
+	__u64 dst;
+	__u64 src;
+	__u64 len;
+	/*
+	 * Especially if used to atomically remove memory from the
+	 * address space the wake on the dst range is not needed.
+	 */
+#define UFFDIO_MOVE_MODE_DONTWAKE		((__u64)1<<0)
+#define UFFDIO_MOVE_MODE_ALLOW_SRC_HOLES	((__u64)1<<1)
+	__u64 mode;
+	/*
+	 * "move" is written by the ioctl and must be at the end: the
+	 * copy_from_user will not read the last 8 bytes.
+	 */
+	__s64 move;
+};
+
 /*
  * Flags for the userfaultfd(2) system call itself.
  */
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 387b030c7f15..6be1a380a298 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -2141,6 +2141,128 @@ unlock:
 	return ret;
 }
 
+#ifdef CONFIG_USERFAULTFD
+/*
+ * The PT lock for src_pmd and the mmap_lock for reading are held by
+ * the caller, but it must return after releasing the page_table_lock.
+ * Just move the page from src_pmd to dst_pmd if possible.
+ * Return zero if succeeded in moving the page, -EAGAIN if it needs to be
+ * repeated by the caller, or other errors in case of failure.
+ */
+int move_pages_huge_pmd(struct mm_struct *mm, pmd_t *dst_pmd, pmd_t *src_pmd, pmd_t dst_pmdval,
+			struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma,
+			unsigned long dst_addr, unsigned long src_addr)
+{
+	pmd_t _dst_pmd, src_pmdval;
+	struct page *src_page;
+	struct folio *src_folio;
+	struct anon_vma *src_anon_vma;
+	spinlock_t *src_ptl, *dst_ptl;
+	pgtable_t src_pgtable;
+	struct mmu_notifier_range range;
+	int err = 0;
+
+	src_pmdval = *src_pmd;
+	src_ptl = pmd_lockptr(mm, src_pmd);
+
+	lockdep_assert_held(src_ptl);
+	mmap_assert_locked(mm);
+
+	/* Sanity checks before the operation */
+	if (WARN_ON_ONCE(!pmd_none(dst_pmdval)) || WARN_ON_ONCE(src_addr & ~HPAGE_PMD_MASK) ||
+	    WARN_ON_ONCE(dst_addr & ~HPAGE_PMD_MASK)) {
+		spin_unlock(src_ptl);
+		return -EINVAL;
+	}
+
+	if (!pmd_trans_huge(src_pmdval)) {
+		spin_unlock(src_ptl);
+		if (is_pmd_migration_entry(src_pmdval)) {
+			pmd_migration_entry_wait(mm, &src_pmdval);
+			return -EAGAIN;
+		}
+		return -ENOENT;
+	}
+
+	src_page = pmd_page(src_pmdval);
+	if (unlikely(!PageAnonExclusive(src_page))) {
+		spin_unlock(src_ptl);
+		return -EBUSY;
+	}
+
+	src_folio = page_folio(src_page);
+	folio_get(src_folio);
+	spin_unlock(src_ptl);
+
+	flush_cache_range(src_vma, src_addr, src_addr + HPAGE_PMD_SIZE);
+	mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm, src_addr,
+				src_addr + HPAGE_PMD_SIZE);
+	mmu_notifier_invalidate_range_start(&range);
+
+	folio_lock(src_folio);
+
+	/*
+	 * split_huge_page walks the anon_vma chain without the page
+	 * lock. Serialize against it with the anon_vma lock, the page
+	 * lock is not enough.
+	 */
+	src_anon_vma = folio_get_anon_vma(src_folio);
+	if (!src_anon_vma) {
+		err = -EAGAIN;
+		goto unlock_folio;
+	}
+	anon_vma_lock_write(src_anon_vma);
+
+	dst_ptl = pmd_lockptr(mm, dst_pmd);
+	double_pt_lock(src_ptl, dst_ptl);
+	if (unlikely(!pmd_same(*src_pmd, src_pmdval) ||
+		     !pmd_same(*dst_pmd, dst_pmdval))) {
+		err = -EAGAIN;
+		goto unlock_ptls;
+	}
+	if (folio_maybe_dma_pinned(src_folio) ||
+	    !PageAnonExclusive(&src_folio->page)) {
+		err = -EBUSY;
+		goto unlock_ptls;
+	}
+
+	if (WARN_ON_ONCE(!folio_test_head(src_folio)) ||
+	    WARN_ON_ONCE(!folio_test_anon(src_folio))) {
+		err = -EBUSY;
+		goto unlock_ptls;
+	}
+
+	folio_move_anon_rmap(src_folio, dst_vma);
+	WRITE_ONCE(src_folio->index, linear_page_index(dst_vma, dst_addr));
+
+	src_pmdval = pmdp_huge_clear_flush(src_vma, src_addr, src_pmd);
+	/* Folio got pinned from under us. Put it back and fail the move. */
+	if (folio_maybe_dma_pinned(src_folio)) {
+		set_pmd_at(mm, src_addr, src_pmd, src_pmdval);
+		err = -EBUSY;
+		goto unlock_ptls;
+	}
+
+	_dst_pmd = mk_huge_pmd(&src_folio->page, dst_vma->vm_page_prot);
+	/* Follow mremap() behavior and treat the entry dirty after the move */
+	_dst_pmd = pmd_mkwrite(pmd_mkdirty(_dst_pmd), dst_vma);
+	set_pmd_at(mm, dst_addr, dst_pmd, _dst_pmd);
+
+	src_pgtable = pgtable_trans_huge_withdraw(mm, src_pmd);
+	pgtable_trans_huge_deposit(mm, dst_pmd, src_pgtable);
+unlock_ptls:
+	double_pt_unlock(src_ptl, dst_ptl);
+	anon_vma_unlock_write(src_anon_vma);
+	put_anon_vma(src_anon_vma);
+unlock_folio:
+	/* unblock rmap walks */
+	folio_unlock(src_folio);
+	mmu_notifier_invalidate_range_end(&range);
+	folio_put(src_folio);
+	return err;
+}
+#endif /* CONFIG_USERFAULTFD */
+
 /*
  * Returns page table lock pointer if a given pmd maps a thp, NULL otherwise.
  *
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index d72aecd3624a..de174d049e71 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -1140,6 +1140,9 @@ static int collapse_huge_page(struct mm_struct *mm, unsigned long address,
 	 * Prevent all access to pagetables with the exception of
 	 * gup_fast later handled by the ptep_clear_flush and the VM
 	 * handled by the anon_vma lock + PG_lock.
+	 *
+	 * UFFDIO_MOVE is prevented to race as well thanks to the
+	 * mmap_lock.
 	 */
 	mmap_write_lock(mm);
 	result = hugepage_vma_revalidate(mm, address, true, &vma, cc);
diff --git a/mm/rmap.c b/mm/rmap.c
index 15a55304aa3b..846fc79f3ca9 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -490,6 +490,12 @@ void __init anon_vma_init(void)
  * page_remove_rmap() that the anon_vma pointer from page->mapping is valid
  * if there is a mapcount, we can dereference the anon_vma after observing
  * those.
+ *
+ * NOTE: the caller should normally hold folio lock when calling this.  If
+ * not, the caller needs to double check the anon_vma didn't change after
+ * taking the anon_vma lock for either read or write (UFFDIO_MOVE can modify it
+ * concurrently without folio lock protection). See folio_lock_anon_vma_read()
+ * which has already covered that, and comment above remap_pages().
  */
 struct anon_vma *folio_get_anon_vma(struct folio *folio)
 {
diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c
index 0b6ca553bebe..9ec814e47e99 100644
--- a/mm/userfaultfd.c
+++ b/mm/userfaultfd.c
@@ -842,3 +842,617 @@ out_unlock:
 	mmap_read_unlock(dst_mm);
 	return err;
 }
+
+
+void double_pt_lock(spinlock_t *ptl1,
+		    spinlock_t *ptl2)
+	__acquires(ptl1)
+	__acquires(ptl2)
+{
+	spinlock_t *ptl_tmp;
+
+	if (ptl1 > ptl2) {
+		/* exchange ptl1 and ptl2 */
+		ptl_tmp = ptl1;
+		ptl1 = ptl2;
+		ptl2 = ptl_tmp;
+	}
+	/* lock in virtual address order to avoid lock inversion */
+	spin_lock(ptl1);
+	if (ptl1 != ptl2)
+		spin_lock_nested(ptl2, SINGLE_DEPTH_NESTING);
+	else
+		__acquire(ptl2);
+}
+
+void double_pt_unlock(spinlock_t *ptl1,
+		      spinlock_t *ptl2)
+	__releases(ptl1)
+	__releases(ptl2)
+{
+	spin_unlock(ptl1);
+	if (ptl1 != ptl2)
+		spin_unlock(ptl2);
+	else
+		__release(ptl2);
+}
+
+
+static int move_present_pte(struct mm_struct *mm,
+			    struct vm_area_struct *dst_vma,
+			    struct vm_area_struct *src_vma,
+			    unsigned long dst_addr, unsigned long src_addr,
+			    pte_t *dst_pte, pte_t *src_pte,
+			    pte_t orig_dst_pte, pte_t orig_src_pte,
+			    spinlock_t *dst_ptl, spinlock_t *src_ptl,
+			    struct folio *src_folio)
+{
+	int err = 0;
+
+	double_pt_lock(dst_ptl, src_ptl);
+
+	if (!pte_same(*src_pte, orig_src_pte) ||
+	    !pte_same(*dst_pte, orig_dst_pte)) {
+		err = -EAGAIN;
+		goto out;
+	}
+	if (folio_test_large(src_folio) ||
+	    folio_maybe_dma_pinned(src_folio) ||
+	    !PageAnonExclusive(&src_folio->page)) {
+		err = -EBUSY;
+		goto out;
+	}
+
+	folio_move_anon_rmap(src_folio, dst_vma);
+	WRITE_ONCE(src_folio->index, linear_page_index(dst_vma, dst_addr));
+
+	orig_src_pte = ptep_clear_flush(src_vma, src_addr, src_pte);
+	/* Folio got pinned from under us. Put it back and fail the move. */
+	if (folio_maybe_dma_pinned(src_folio)) {
+		set_pte_at(mm, src_addr, src_pte, orig_src_pte);
+		err = -EBUSY;
+		goto out;
+	}
+
+	orig_dst_pte = mk_pte(&src_folio->page, dst_vma->vm_page_prot);
+	/* Follow mremap() behavior and treat the entry dirty after the move */
+	orig_dst_pte = pte_mkwrite(pte_mkdirty(orig_dst_pte), dst_vma);
+
+	set_pte_at(mm, dst_addr, dst_pte, orig_dst_pte);
+out:
+	double_pt_unlock(dst_ptl, src_ptl);
+	return err;
+}
+
+static int move_swap_pte(struct mm_struct *mm,
+			 unsigned long dst_addr, unsigned long src_addr,
+			 pte_t *dst_pte, pte_t *src_pte,
+			 pte_t orig_dst_pte, pte_t orig_src_pte,
+			 spinlock_t *dst_ptl, spinlock_t *src_ptl)
+{
+	if (!pte_swp_exclusive(orig_src_pte))
+		return -EBUSY;
+
+	double_pt_lock(dst_ptl, src_ptl);
+
+	if (!pte_same(*src_pte, orig_src_pte) ||
+	    !pte_same(*dst_pte, orig_dst_pte)) {
+		double_pt_unlock(dst_ptl, src_ptl);
+		return -EAGAIN;
+	}
+
+	orig_src_pte = ptep_get_and_clear(mm, src_addr, src_pte);
+	set_pte_at(mm, dst_addr, dst_pte, orig_src_pte);
+	double_pt_unlock(dst_ptl, src_ptl);
+
+	return 0;
+}
+
+/*
+ * The mmap_lock for reading is held by the caller. Just move the page
+ * from src_pmd to dst_pmd if possible, and return true if succeeded
+ * in moving the page.
+ */
+static int move_pages_pte(struct mm_struct *mm, pmd_t *dst_pmd, pmd_t *src_pmd,
+			  struct vm_area_struct *dst_vma,
+			  struct vm_area_struct *src_vma,
+			  unsigned long dst_addr, unsigned long src_addr,
+			  __u64 mode)
+{
+	swp_entry_t entry;
+	pte_t orig_src_pte, orig_dst_pte;
+	pte_t src_folio_pte;
+	spinlock_t *src_ptl, *dst_ptl;
+	pte_t *src_pte = NULL;
+	pte_t *dst_pte = NULL;
+
+	struct folio *src_folio = NULL;
+	struct anon_vma *src_anon_vma = NULL;
+	struct mmu_notifier_range range;
+	int err = 0;
+
+	flush_cache_range(src_vma, src_addr, src_addr + PAGE_SIZE);
+	mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm,
+				src_addr, src_addr + PAGE_SIZE);
+	mmu_notifier_invalidate_range_start(&range);
+retry:
+	dst_pte = pte_offset_map_nolock(mm, dst_pmd, dst_addr, &dst_ptl);
+
+	/* Retry if a huge pmd materialized from under us */
+	if (unlikely(!dst_pte)) {
+		err = -EAGAIN;
+		goto out;
+	}
+
+	src_pte = pte_offset_map_nolock(mm, src_pmd, src_addr, &src_ptl);
+
+	/*
+	 * We held the mmap_lock for reading so MADV_DONTNEED
+	 * can zap transparent huge pages under us, or the
+	 * transparent huge page fault can establish new
+	 * transparent huge pages under us.
+	 */
+	if (unlikely(!src_pte)) {
+		err = -EAGAIN;
+		goto out;
+	}
+
+	/* Sanity checks before the operation */
+	if (WARN_ON_ONCE(pmd_none(*dst_pmd)) ||	WARN_ON_ONCE(pmd_none(*src_pmd)) ||
+	    WARN_ON_ONCE(pmd_trans_huge(*dst_pmd)) || WARN_ON_ONCE(pmd_trans_huge(*src_pmd))) {
+		err = -EINVAL;
+		goto out;
+	}
+
+	spin_lock(dst_ptl);
+	orig_dst_pte = *dst_pte;
+	spin_unlock(dst_ptl);
+	if (!pte_none(orig_dst_pte)) {
+		err = -EEXIST;
+		goto out;
+	}
+
+	spin_lock(src_ptl);
+	orig_src_pte = *src_pte;
+	spin_unlock(src_ptl);
+	if (pte_none(orig_src_pte)) {
+		if (!(mode & UFFDIO_MOVE_MODE_ALLOW_SRC_HOLES))
+			err = -ENOENT;
+		else /* nothing to do to move a hole */
+			err = 0;
+		goto out;
+	}
+
+	/* If PTE changed after we locked the folio them start over */
+	if (src_folio && unlikely(!pte_same(src_folio_pte, orig_src_pte))) {
+		err = -EAGAIN;
+		goto out;
+	}
+
+	if (pte_present(orig_src_pte)) {
+		/*
+		 * Pin and lock both source folio and anon_vma. Since we are in
+		 * RCU read section, we can't block, so on contention have to
+		 * unmap the ptes, obtain the lock and retry.
+		 */
+		if (!src_folio) {
+			struct folio *folio;
+
+			/*
+			 * Pin the page while holding the lock to be sure the
+			 * page isn't freed under us
+			 */
+			spin_lock(src_ptl);
+			if (!pte_same(orig_src_pte, *src_pte)) {
+				spin_unlock(src_ptl);
+				err = -EAGAIN;
+				goto out;
+			}
+
+			folio = vm_normal_folio(src_vma, src_addr, orig_src_pte);
+			if (!folio || !PageAnonExclusive(&folio->page)) {
+				spin_unlock(src_ptl);
+				err = -EBUSY;
+				goto out;
+			}
+
+			folio_get(folio);
+			src_folio = folio;
+			src_folio_pte = orig_src_pte;
+			spin_unlock(src_ptl);
+
+			if (!folio_trylock(src_folio)) {
+				pte_unmap(&orig_src_pte);
+				pte_unmap(&orig_dst_pte);
+				src_pte = dst_pte = NULL;
+				/* now we can block and wait */
+				folio_lock(src_folio);
+				goto retry;
+			}
+
+			if (WARN_ON_ONCE(!folio_test_anon(src_folio))) {
+				err = -EBUSY;
+				goto out;
+			}
+		}
+
+		/* at this point we have src_folio locked */
+		if (folio_test_large(src_folio)) {
+			err = split_folio(src_folio);
+			if (err)
+				goto out;
+		}
+
+		if (!src_anon_vma) {
+			/*
+			 * folio_referenced walks the anon_vma chain
+			 * without the folio lock. Serialize against it with
+			 * the anon_vma lock, the folio lock is not enough.
+			 */
+			src_anon_vma = folio_get_anon_vma(src_folio);
+			if (!src_anon_vma) {
+				/* page was unmapped from under us */
+				err = -EAGAIN;
+				goto out;
+			}
+			if (!anon_vma_trylock_write(src_anon_vma)) {
+				pte_unmap(&orig_src_pte);
+				pte_unmap(&orig_dst_pte);
+				src_pte = dst_pte = NULL;
+				/* now we can block and wait */
+				anon_vma_lock_write(src_anon_vma);
+				goto retry;
+			}
+		}
+
+		err = move_present_pte(mm,  dst_vma, src_vma,
+				       dst_addr, src_addr, dst_pte, src_pte,
+				       orig_dst_pte, orig_src_pte,
+				       dst_ptl, src_ptl, src_folio);
+	} else {
+		entry = pte_to_swp_entry(orig_src_pte);
+		if (non_swap_entry(entry)) {
+			if (is_migration_entry(entry)) {
+				pte_unmap(&orig_src_pte);
+				pte_unmap(&orig_dst_pte);
+				src_pte = dst_pte = NULL;
+				migration_entry_wait(mm, src_pmd, src_addr);
+				err = -EAGAIN;
+			} else
+				err = -EFAULT;
+			goto out;
+		}
+
+		err = move_swap_pte(mm, dst_addr, src_addr,
+				    dst_pte, src_pte,
+				    orig_dst_pte, orig_src_pte,
+				    dst_ptl, src_ptl);
+	}
+
+out:
+	if (src_anon_vma) {
+		anon_vma_unlock_write(src_anon_vma);
+		put_anon_vma(src_anon_vma);
+	}
+	if (src_folio) {
+		folio_unlock(src_folio);
+		folio_put(src_folio);
+	}
+	if (dst_pte)
+		pte_unmap(dst_pte);
+	if (src_pte)
+		pte_unmap(src_pte);
+	mmu_notifier_invalidate_range_end(&range);
+
+	return err;
+}
+
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+static inline bool move_splits_huge_pmd(unsigned long dst_addr,
+					unsigned long src_addr,
+					unsigned long src_end)
+{
+	return (src_addr & ~HPAGE_PMD_MASK) || (dst_addr & ~HPAGE_PMD_MASK) ||
+		src_end - src_addr < HPAGE_PMD_SIZE;
+}
+#else
+static inline bool move_splits_huge_pmd(unsigned long dst_addr,
+					unsigned long src_addr,
+					unsigned long src_end)
+{
+	/* This is unreachable anyway, just to avoid warnings when HPAGE_PMD_SIZE==0 */
+	return false;
+}
+#endif
+
+static inline bool vma_move_compatible(struct vm_area_struct *vma)
+{
+	return !(vma->vm_flags & (VM_PFNMAP | VM_IO |  VM_HUGETLB |
+				  VM_MIXEDMAP | VM_SHADOW_STACK));
+}
+
+static int validate_move_areas(struct userfaultfd_ctx *ctx,
+			       struct vm_area_struct *src_vma,
+			       struct vm_area_struct *dst_vma)
+{
+	/* Only allow moving if both have the same access and protection */
+	if ((src_vma->vm_flags & VM_ACCESS_FLAGS) != (dst_vma->vm_flags & VM_ACCESS_FLAGS) ||
+	    pgprot_val(src_vma->vm_page_prot) != pgprot_val(dst_vma->vm_page_prot))
+		return -EINVAL;
+
+	/* Only allow moving if both are mlocked or both aren't */
+	if ((src_vma->vm_flags & VM_LOCKED) != (dst_vma->vm_flags & VM_LOCKED))
+		return -EINVAL;
+
+	/*
+	 * For now, we keep it simple and only move between writable VMAs.
+	 * Access flags are equal, therefore cheching only the source is enough.
+	 */
+	if (!(src_vma->vm_flags & VM_WRITE))
+		return -EINVAL;
+
+	/* Check if vma flags indicate content which can be moved */
+	if (!vma_move_compatible(src_vma) || !vma_move_compatible(dst_vma))
+		return -EINVAL;
+
+	/* Ensure dst_vma is registered in uffd we are operating on */
+	if (!dst_vma->vm_userfaultfd_ctx.ctx ||
+	    dst_vma->vm_userfaultfd_ctx.ctx != ctx)
+		return -EINVAL;
+
+	/* Only allow moving across anonymous vmas */
+	if (!vma_is_anonymous(src_vma) || !vma_is_anonymous(dst_vma))
+		return -EINVAL;
+
+	/*
+	 * Ensure the dst_vma has a anon_vma or this page
+	 * would get a NULL anon_vma when moved in the
+	 * dst_vma.
+	 */
+	if (unlikely(anon_vma_prepare(dst_vma)))
+		return -ENOMEM;
+
+	return 0;
+}
+
+/**
+ * move_pages - move arbitrary anonymous pages of an existing vma
+ * @ctx: pointer to the userfaultfd context
+ * @mm: the address space to move pages
+ * @dst_start: start of the destination virtual memory range
+ * @src_start: start of the source virtual memory range
+ * @len: length of the virtual memory range
+ * @mode: flags from uffdio_move.mode
+ *
+ * Must be called with mmap_lock held for read.
+ *
+ * move_pages() remaps arbitrary anonymous pages atomically in zero
+ * copy. It only works on non shared anonymous pages because those can
+ * be relocated without generating non linear anon_vmas in the rmap
+ * code.
+ *
+ * It provides a zero copy mechanism to handle userspace page faults.
+ * The source vma pages should have mapcount == 1, which can be
+ * enforced by using madvise(MADV_DONTFORK) on src vma.
+ *
+ * The thread receiving the page during the userland page fault
+ * will receive the faulting page in the source vma through the network,
+ * storage or any other I/O device (MADV_DONTFORK in the source vma
+ * avoids move_pages() to fail with -EBUSY if the process forks before
+ * move_pages() is called), then it will call move_pages() to map the
+ * page in the faulting address in the destination vma.
+ *
+ * This userfaultfd command works purely via pagetables, so it's the
+ * most efficient way to move physical non shared anonymous pages
+ * across different virtual addresses. Unlike mremap()/mmap()/munmap()
+ * it does not create any new vmas. The mapping in the destination
+ * address is atomic.
+ *
+ * It only works if the vma protection bits are identical from the
+ * source and destination vma.
+ *
+ * It can remap non shared anonymous pages within the same vma too.
+ *
+ * If the source virtual memory range has any unmapped holes, or if
+ * the destination virtual memory range is not a whole unmapped hole,
+ * move_pages() will fail respectively with -ENOENT or -EEXIST. This
+ * provides a very strict behavior to avoid any chance of memory
+ * corruption going unnoticed if there are userland race conditions.
+ * Only one thread should resolve the userland page fault at any given
+ * time for any given faulting address. This means that if two threads
+ * try to both call move_pages() on the same destination address at the
+ * same time, the second thread will get an explicit error from this
+ * command.
+ *
+ * The command retval will return "len" is successful. The command
+ * however can be interrupted by fatal signals or errors. If
+ * interrupted it will return the number of bytes successfully
+ * remapped before the interruption if any, or the negative error if
+ * none. It will never return zero. Either it will return an error or
+ * an amount of bytes successfully moved. If the retval reports a
+ * "short" remap, the move_pages() command should be repeated by
+ * userland with src+retval, dst+reval, len-retval if it wants to know
+ * about the error that interrupted it.
+ *
+ * The UFFDIO_MOVE_MODE_ALLOW_SRC_HOLES flag can be specified to
+ * prevent -ENOENT errors to materialize if there are holes in the
+ * source virtual range that is being remapped. The holes will be
+ * accounted as successfully remapped in the retval of the
+ * command. This is mostly useful to remap hugepage naturally aligned
+ * virtual regions without knowing if there are transparent hugepage
+ * in the regions or not, but preventing the risk of having to split
+ * the hugepmd during the remap.
+ *
+ * If there's any rmap walk that is taking the anon_vma locks without
+ * first obtaining the folio lock (the only current instance is
+ * folio_referenced), they will have to verify if the folio->mapping
+ * has changed after taking the anon_vma lock. If it changed they
+ * should release the lock and retry obtaining a new anon_vma, because
+ * it means the anon_vma was changed by move_pages() before the lock
+ * could be obtained. This is the only additional complexity added to
+ * the rmap code to provide this anonymous page remapping functionality.
+ */
+ssize_t move_pages(struct userfaultfd_ctx *ctx, struct mm_struct *mm,
+		   unsigned long dst_start, unsigned long src_start,
+		   unsigned long len, __u64 mode)
+{
+	struct vm_area_struct *src_vma, *dst_vma;
+	unsigned long src_addr, dst_addr;
+	pmd_t *src_pmd, *dst_pmd;
+	long err = -EINVAL;
+	ssize_t moved = 0;
+
+	/* Sanitize the command parameters. */
+	if (WARN_ON_ONCE(src_start & ~PAGE_MASK) ||
+	    WARN_ON_ONCE(dst_start & ~PAGE_MASK) ||
+	    WARN_ON_ONCE(len & ~PAGE_MASK))
+		goto out;
+
+	/* Does the address range wrap, or is the span zero-sized? */
+	if (WARN_ON_ONCE(src_start + len <= src_start) ||
+	    WARN_ON_ONCE(dst_start + len <= dst_start))
+		goto out;
+
+	/*
+	 * Make sure the vma is not shared, that the src and dst remap
+	 * ranges are both valid and fully within a single existing
+	 * vma.
+	 */
+	src_vma = find_vma(mm, src_start);
+	if (!src_vma || (src_vma->vm_flags & VM_SHARED))
+		goto out;
+	if (src_start < src_vma->vm_start ||
+	    src_start + len > src_vma->vm_end)
+		goto out;
+
+	dst_vma = find_vma(mm, dst_start);
+	if (!dst_vma || (dst_vma->vm_flags & VM_SHARED))
+		goto out;
+	if (dst_start < dst_vma->vm_start ||
+	    dst_start + len > dst_vma->vm_end)
+		goto out;
+
+	err = validate_move_areas(ctx, src_vma, dst_vma);
+	if (err)
+		goto out;
+
+	for (src_addr = src_start, dst_addr = dst_start;
+	     src_addr < src_start + len;) {
+		spinlock_t *ptl;
+		pmd_t dst_pmdval;
+		unsigned long step_size;
+
+		/*
+		 * Below works because anonymous area would not have a
+		 * transparent huge PUD. If file-backed support is added,
+		 * that case would need to be handled here.
+		 */
+		src_pmd = mm_find_pmd(mm, src_addr);
+		if (unlikely(!src_pmd)) {
+			if (!(mode & UFFDIO_MOVE_MODE_ALLOW_SRC_HOLES)) {
+				err = -ENOENT;
+				break;
+			}
+			src_pmd = mm_alloc_pmd(mm, src_addr);
+			if (unlikely(!src_pmd)) {
+				err = -ENOMEM;
+				break;
+			}
+		}
+		dst_pmd = mm_alloc_pmd(mm, dst_addr);
+		if (unlikely(!dst_pmd)) {
+			err = -ENOMEM;
+			break;
+		}
+
+		dst_pmdval = pmdp_get_lockless(dst_pmd);
+		/*
+		 * If the dst_pmd is mapped as THP don't override it and just
+		 * be strict. If dst_pmd changes into TPH after this check, the
+		 * move_pages_huge_pmd() will detect the change and retry
+		 * while move_pages_pte() will detect the change and fail.
+		 */
+		if (unlikely(pmd_trans_huge(dst_pmdval))) {
+			err = -EEXIST;
+			break;
+		}
+
+		ptl = pmd_trans_huge_lock(src_pmd, src_vma);
+		if (ptl) {
+			if (pmd_devmap(*src_pmd)) {
+				spin_unlock(ptl);
+				err = -ENOENT;
+				break;
+			}
+
+			/* Check if we can move the pmd without splitting it. */
+			if (move_splits_huge_pmd(dst_addr, src_addr, src_start + len) ||
+			    !pmd_none(dst_pmdval)) {
+				struct folio *folio = pfn_folio(pmd_pfn(*src_pmd));
+
+				if (!folio || !PageAnonExclusive(&folio->page)) {
+					spin_unlock(ptl);
+					err = -EBUSY;
+					break;
+				}
+
+				spin_unlock(ptl);
+				split_huge_pmd(src_vma, src_pmd, src_addr);
+				/* The folio will be split by move_pages_pte() */
+				continue;
+			}
+
+			err = move_pages_huge_pmd(mm, dst_pmd, src_pmd,
+						  dst_pmdval, dst_vma, src_vma,
+						  dst_addr, src_addr);
+			step_size = HPAGE_PMD_SIZE;
+		} else {
+			if (pmd_none(*src_pmd)) {
+				if (!(mode & UFFDIO_MOVE_MODE_ALLOW_SRC_HOLES)) {
+					err = -ENOENT;
+					break;
+				}
+				if (unlikely(__pte_alloc(mm, src_pmd))) {
+					err = -ENOMEM;
+					break;
+				}
+			}
+
+			if (unlikely(pte_alloc(mm, dst_pmd))) {
+				err = -ENOMEM;
+				break;
+			}
+
+			err = move_pages_pte(mm, dst_pmd, src_pmd,
+					     dst_vma, src_vma,
+					     dst_addr, src_addr, mode);
+			step_size = PAGE_SIZE;
+		}
+
+		cond_resched();
+
+		if (fatal_signal_pending(current)) {
+			/* Do not override an error */
+			if (!err || err == -EAGAIN)
+				err = -EINTR;
+			break;
+		}
+
+		if (err) {
+			if (err == -EAGAIN)
+				continue;
+			break;
+		}
+
+		/* Proceed to the next page */
+		dst_addr += step_size;
+		src_addr += step_size;
+		moved += step_size;
+	}
+
+out:
+	VM_WARN_ON(moved < 0);
+	VM_WARN_ON(err > 0);
+	VM_WARN_ON(!moved && !err);
+	return moved ? moved : err;
+}
-- 
cgit v1.2.3


From 96db66d9c8f3c1547325af01b1f328b85d6ee1b9 Mon Sep 17 00:00:00 2001
From: "Matthew Wilcox (Oracle)" <willy@infradead.org>
Date: Mon, 11 Dec 2023 16:22:06 +0000
Subject: mm: convert ksm_might_need_to_copy() to work on folios

Patch series "Finish two folio conversions".

Most callers of page_add_new_anon_rmap() and
lru_cache_add_inactive_or_unevictable() have been converted to their folio
equivalents, but there are still a few stragglers.  There's a bit of
preparatory work in ksm and unuse_pte(), but after that it's pretty
mechanical.


This patch (of 9):

Accept a folio as an argument and return a folio result.  Removes a call
to compound_head() in do_swap_page(), and prevents folio & page from
getting out of sync in unuse_pte().

Reviewed-by: David Hildenbrand <david@redhat.com>
[willy@infradead.org: fix smatch warning]
  Link: https://lkml.kernel.org/r/ZXnPtblC6A1IkyAB@casper.infradead.org
[david@redhat.com: only adjust the page if the folio changed]
  Link: https://lkml.kernel.org/r/6a8f2110-fa91-4c10-9eae-88315309a6e3@redhat.com
Link: https://lkml.kernel.org/r/20231211162214.2146080-1-willy@infradead.org
Link: https://lkml.kernel.org/r/20231211162214.2146080-2-willy@infradead.org
Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Signed-off-by: David Hildenbrand <david@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/ksm.h |  6 +++---
 mm/ksm.c            | 21 +++++++++++----------
 mm/memory.c         | 11 +++++++----
 mm/swapfile.c       |  8 +++++---
 4 files changed, 26 insertions(+), 20 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/ksm.h b/include/linux/ksm.h
index 4643d5244e77..401348e9f92b 100644
--- a/include/linux/ksm.h
+++ b/include/linux/ksm.h
@@ -76,7 +76,7 @@ static inline void ksm_exit(struct mm_struct *mm)
  * We'd like to make this conditional on vma->vm_flags & VM_MERGEABLE,
  * but what if the vma was unmerged while the page was swapped out?
  */
-struct page *ksm_might_need_to_copy(struct page *page,
+struct folio *ksm_might_need_to_copy(struct folio *folio,
 			struct vm_area_struct *vma, unsigned long addr);
 
 void rmap_walk_ksm(struct folio *folio, struct rmap_walk_control *rwc);
@@ -129,10 +129,10 @@ static inline int ksm_madvise(struct vm_area_struct *vma, unsigned long start,
 	return 0;
 }
 
-static inline struct page *ksm_might_need_to_copy(struct page *page,
+static inline struct folio *ksm_might_need_to_copy(struct folio *folio,
 			struct vm_area_struct *vma, unsigned long addr)
 {
-	return page;
+	return folio;
 }
 
 static inline void rmap_walk_ksm(struct folio *folio,
diff --git a/mm/ksm.c b/mm/ksm.c
index c0e1995fb444..e2ce850c2739 100644
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -2873,30 +2873,30 @@ void __ksm_exit(struct mm_struct *mm)
 	trace_ksm_exit(mm);
 }
 
-struct page *ksm_might_need_to_copy(struct page *page,
+struct folio *ksm_might_need_to_copy(struct folio *folio,
 			struct vm_area_struct *vma, unsigned long addr)
 {
-	struct folio *folio = page_folio(page);
+	struct page *page = folio_page(folio, 0);
 	struct anon_vma *anon_vma = folio_anon_vma(folio);
 	struct folio *new_folio;
 
 	if (folio_test_large(folio))
-		return page;
+		return folio;
 
 	if (folio_test_ksm(folio)) {
 		if (folio_stable_node(folio) &&
 		    !(ksm_run & KSM_RUN_UNMERGE))
-			return page;	/* no need to copy it */
+			return folio;	/* no need to copy it */
 	} else if (!anon_vma) {
-		return page;		/* no need to copy it */
+		return folio;		/* no need to copy it */
 	} else if (folio->index == linear_page_index(vma, addr) &&
 			anon_vma->root == vma->anon_vma->root) {
-		return page;		/* still no need to copy it */
+		return folio;		/* still no need to copy it */
 	}
 	if (PageHWPoison(page))
 		return ERR_PTR(-EHWPOISON);
 	if (!folio_test_uptodate(folio))
-		return page;		/* let do_swap_page report the error */
+		return folio;		/* let do_swap_page report the error */
 
 	new_folio = vma_alloc_folio(GFP_HIGHUSER_MOVABLE, 0, vma, addr, false);
 	if (new_folio &&
@@ -2905,9 +2905,10 @@ struct page *ksm_might_need_to_copy(struct page *page,
 		new_folio = NULL;
 	}
 	if (new_folio) {
-		if (copy_mc_user_highpage(&new_folio->page, page, addr, vma)) {
+		if (copy_mc_user_highpage(folio_page(new_folio, 0), page,
+								addr, vma)) {
 			folio_put(new_folio);
-			memory_failure_queue(page_to_pfn(page), 0);
+			memory_failure_queue(folio_pfn(folio), 0);
 			return ERR_PTR(-EHWPOISON);
 		}
 		folio_set_dirty(new_folio);
@@ -2918,7 +2919,7 @@ struct page *ksm_might_need_to_copy(struct page *page,
 #endif
 	}
 
-	return new_folio ? &new_folio->page : NULL;
+	return new_folio;
 }
 
 void rmap_walk_ksm(struct folio *folio, struct rmap_walk_control *rwc)
diff --git a/mm/memory.c b/mm/memory.c
index b9cc56a75f4b..7649cb9eb7f5 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -3942,15 +3942,18 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
 		 * page->index of !PageKSM() pages would be nonlinear inside the
 		 * anon VMA -- PageKSM() is lost on actual swapout.
 		 */
-		page = ksm_might_need_to_copy(page, vma, vmf->address);
-		if (unlikely(!page)) {
+		folio = ksm_might_need_to_copy(folio, vma, vmf->address);
+		if (unlikely(!folio)) {
 			ret = VM_FAULT_OOM;
+			folio = swapcache;
 			goto out_page;
-		} else if (unlikely(PTR_ERR(page) == -EHWPOISON)) {
+		} else if (unlikely(folio == ERR_PTR(-EHWPOISON))) {
 			ret = VM_FAULT_HWPOISON;
+			folio = swapcache;
 			goto out_page;
 		}
-		folio = page_folio(page);
+		if (folio != swapcache)
+			page = folio_page(folio, 0);
 
 		/*
 		 * If we want to map a page that's in the swapcache writable, we
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 8be70912e298..0371b7b3cd27 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -1749,11 +1749,13 @@ static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd,
 	int ret = 1;
 
 	swapcache = page;
-	page = ksm_might_need_to_copy(page, vma, addr);
-	if (unlikely(!page))
+	folio = ksm_might_need_to_copy(folio, vma, addr);
+	if (unlikely(!folio))
 		return -ENOMEM;
-	else if (unlikely(PTR_ERR(page) == -EHWPOISON))
+	else if (unlikely(folio == ERR_PTR(-EHWPOISON)))
 		hwpoisoned = true;
+	else
+		page = folio_file_page(folio, swp_offset(entry));
 
 	pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
 	if (unlikely(!pte || !pte_same_as_swp(ptep_get(pte),
-- 
cgit v1.2.3


From cafa8e37a2ebd344ae0774324c21f46640bbaab3 Mon Sep 17 00:00:00 2001
From: "Matthew Wilcox (Oracle)" <willy@infradead.org>
Date: Mon, 11 Dec 2023 16:22:14 +0000
Subject: mm: remove page_add_new_anon_rmap and
 lru_cache_add_inactive_or_unevictable

All callers have now been converted to folio_add_new_anon_rmap() and
folio_add_lru_vma() so we can remove the wrapper.

Link: https://lkml.kernel.org/r/20231211162214.2146080-10-willy@infradead.org
Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Reviewed-by: David Hildenbrand <david@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/rmap.h |  2 --
 include/linux/swap.h |  3 ---
 mm/folio-compat.c    | 16 ----------------
 3 files changed, 21 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/rmap.h b/include/linux/rmap.h
index af6a32b6f3e7..0ae2bb0e77f5 100644
--- a/include/linux/rmap.h
+++ b/include/linux/rmap.h
@@ -197,8 +197,6 @@ typedef int __bitwise rmap_t;
 void folio_move_anon_rmap(struct folio *, struct vm_area_struct *);
 void page_add_anon_rmap(struct page *, struct vm_area_struct *,
 		unsigned long address, rmap_t flags);
-void page_add_new_anon_rmap(struct page *, struct vm_area_struct *,
-		unsigned long address);
 void folio_add_new_anon_rmap(struct folio *, struct vm_area_struct *,
 		unsigned long address);
 void page_add_file_rmap(struct page *, struct vm_area_struct *,
diff --git a/include/linux/swap.h b/include/linux/swap.h
index f6dd6575b905..3e1909087f6a 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -397,9 +397,6 @@ void folio_deactivate(struct folio *folio);
 void folio_mark_lazyfree(struct folio *folio);
 extern void swap_setup(void);
 
-extern void lru_cache_add_inactive_or_unevictable(struct page *page,
-						struct vm_area_struct *vma);
-
 /* linux/mm/vmscan.c */
 extern unsigned long zone_reclaimable_pages(struct zone *zone);
 extern unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
diff --git a/mm/folio-compat.c b/mm/folio-compat.c
index aee3b9a16828..50412014f16f 100644
--- a/mm/folio-compat.c
+++ b/mm/folio-compat.c
@@ -77,12 +77,6 @@ bool redirty_page_for_writepage(struct writeback_control *wbc,
 }
 EXPORT_SYMBOL(redirty_page_for_writepage);
 
-void lru_cache_add_inactive_or_unevictable(struct page *page,
-		struct vm_area_struct *vma)
-{
-	folio_add_lru_vma(page_folio(page), vma);
-}
-
 int add_to_page_cache_lru(struct page *page, struct address_space *mapping,
 		pgoff_t index, gfp_t gfp)
 {
@@ -122,13 +116,3 @@ void putback_lru_page(struct page *page)
 {
 	folio_putback_lru(page_folio(page));
 }
-
-#ifdef CONFIG_MMU
-void page_add_new_anon_rmap(struct page *page, struct vm_area_struct *vma,
-		unsigned long address)
-{
-	VM_BUG_ON_PAGE(PageTail(page), page);
-
-	return folio_add_new_anon_rmap((struct folio *)page, vma, address);
-}
-#endif
-- 
cgit v1.2.3


From 8ba2f844f050a82624ba3ad5146aa3c116f506f7 Mon Sep 17 00:00:00 2001
From: Chengming Zhou <zhouchengming@bytedance.com>
Date: Thu, 28 Dec 2023 09:45:46 +0000
Subject: mm/zswap: change per-cpu mutex and buffer to per-acomp_ctx

First of all, we need to rename acomp_ctx->dstmem field to buffer, since
we are now using for purposes other than compression.

Then we change per-cpu mutex and buffer to per-acomp_ctx, since them
belong to the acomp_ctx and are necessary parts when used in the
compress/decompress contexts.

So we can remove the old per-cpu mutex and dstmem.

Link: https://lkml.kernel.org/r/20231213-zswap-dstmem-v5-5-9382162bbf05@bytedance.com
Signed-off-by: Chengming Zhou <zhouchengming@bytedance.com>
Acked-by: Chris Li <chrisl@kernel.org> (Google)
Reviewed-by: Nhat Pham <nphamcs@gmail.com>
Cc: Barry Song <21cnbao@gmail.com>
Cc: Dan Streetman <ddstreet@ieee.org>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Seth Jennings <sjenning@redhat.com>
Cc: Vitaly Wool <vitaly.wool@konsulko.com>
Cc: Yosry Ahmed <yosryahmed@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/cpuhotplug.h |   1 -
 mm/zswap.c                 | 104 ++++++++++++++-------------------------------
 2 files changed, 33 insertions(+), 72 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/cpuhotplug.h b/include/linux/cpuhotplug.h
index efc0c0b07efb..c3e06e21766a 100644
--- a/include/linux/cpuhotplug.h
+++ b/include/linux/cpuhotplug.h
@@ -124,7 +124,6 @@ enum cpuhp_state {
 	CPUHP_ARM_BL_PREPARE,
 	CPUHP_TRACE_RB_PREPARE,
 	CPUHP_MM_ZS_PREPARE,
-	CPUHP_MM_ZSWP_MEM_PREPARE,
 	CPUHP_MM_ZSWP_POOL_PREPARE,
 	CPUHP_KVM_PPC_BOOK3S_PREPARE,
 	CPUHP_ZCOMP_PREPARE,
diff --git a/mm/zswap.c b/mm/zswap.c
index 880c33bbe146..12ce5a68da91 100644
--- a/mm/zswap.c
+++ b/mm/zswap.c
@@ -161,8 +161,8 @@ struct crypto_acomp_ctx {
 	struct crypto_acomp *acomp;
 	struct acomp_req *req;
 	struct crypto_wait wait;
-	u8 *dstmem;
-	struct mutex *mutex;
+	u8 *buffer;
+	struct mutex mutex;
 };
 
 /*
@@ -688,63 +688,26 @@ static void zswap_alloc_shrinker(struct zswap_pool *pool)
 /*********************************
 * per-cpu code
 **********************************/
-static DEFINE_PER_CPU(u8 *, zswap_dstmem);
-/*
- * If users dynamically change the zpool type and compressor at runtime, i.e.
- * zswap is running, zswap can have more than one zpool on one cpu, but they
- * are sharing dtsmem. So we need this mutex to be per-cpu.
- */
-static DEFINE_PER_CPU(struct mutex *, zswap_mutex);
-
-static int zswap_dstmem_prepare(unsigned int cpu)
-{
-	struct mutex *mutex;
-	u8 *dst;
-
-	dst = kmalloc_node(PAGE_SIZE * 2, GFP_KERNEL, cpu_to_node(cpu));
-	if (!dst)
-		return -ENOMEM;
-
-	mutex = kmalloc_node(sizeof(*mutex), GFP_KERNEL, cpu_to_node(cpu));
-	if (!mutex) {
-		kfree(dst);
-		return -ENOMEM;
-	}
-
-	mutex_init(mutex);
-	per_cpu(zswap_dstmem, cpu) = dst;
-	per_cpu(zswap_mutex, cpu) = mutex;
-	return 0;
-}
-
-static int zswap_dstmem_dead(unsigned int cpu)
-{
-	struct mutex *mutex;
-	u8 *dst;
-
-	mutex = per_cpu(zswap_mutex, cpu);
-	kfree(mutex);
-	per_cpu(zswap_mutex, cpu) = NULL;
-
-	dst = per_cpu(zswap_dstmem, cpu);
-	kfree(dst);
-	per_cpu(zswap_dstmem, cpu) = NULL;
-
-	return 0;
-}
-
 static int zswap_cpu_comp_prepare(unsigned int cpu, struct hlist_node *node)
 {
 	struct zswap_pool *pool = hlist_entry(node, struct zswap_pool, node);
 	struct crypto_acomp_ctx *acomp_ctx = per_cpu_ptr(pool->acomp_ctx, cpu);
 	struct crypto_acomp *acomp;
 	struct acomp_req *req;
+	int ret;
+
+	mutex_init(&acomp_ctx->mutex);
+
+	acomp_ctx->buffer = kmalloc_node(PAGE_SIZE * 2, GFP_KERNEL, cpu_to_node(cpu));
+	if (!acomp_ctx->buffer)
+		return -ENOMEM;
 
 	acomp = crypto_alloc_acomp_node(pool->tfm_name, 0, 0, cpu_to_node(cpu));
 	if (IS_ERR(acomp)) {
 		pr_err("could not alloc crypto acomp %s : %ld\n",
 				pool->tfm_name, PTR_ERR(acomp));
-		return PTR_ERR(acomp);
+		ret = PTR_ERR(acomp);
+		goto acomp_fail;
 	}
 	acomp_ctx->acomp = acomp;
 
@@ -752,8 +715,8 @@ static int zswap_cpu_comp_prepare(unsigned int cpu, struct hlist_node *node)
 	if (!req) {
 		pr_err("could not alloc crypto acomp_request %s\n",
 		       pool->tfm_name);
-		crypto_free_acomp(acomp_ctx->acomp);
-		return -ENOMEM;
+		ret = -ENOMEM;
+		goto req_fail;
 	}
 	acomp_ctx->req = req;
 
@@ -766,10 +729,13 @@ static int zswap_cpu_comp_prepare(unsigned int cpu, struct hlist_node *node)
 	acomp_request_set_callback(req, CRYPTO_TFM_REQ_MAY_BACKLOG,
 				   crypto_req_done, &acomp_ctx->wait);
 
-	acomp_ctx->mutex = per_cpu(zswap_mutex, cpu);
-	acomp_ctx->dstmem = per_cpu(zswap_dstmem, cpu);
-
 	return 0;
+
+req_fail:
+	crypto_free_acomp(acomp_ctx->acomp);
+acomp_fail:
+	kfree(acomp_ctx->buffer);
+	return ret;
 }
 
 static int zswap_cpu_comp_dead(unsigned int cpu, struct hlist_node *node)
@@ -782,6 +748,7 @@ static int zswap_cpu_comp_dead(unsigned int cpu, struct hlist_node *node)
 			acomp_request_free(acomp_ctx->req);
 		if (!IS_ERR_OR_NULL(acomp_ctx->acomp))
 			crypto_free_acomp(acomp_ctx->acomp);
+		kfree(acomp_ctx->buffer);
 	}
 
 	return 0;
@@ -1391,12 +1358,12 @@ static void __zswap_load(struct zswap_entry *entry, struct page *page)
 	u8 *src;
 
 	acomp_ctx = raw_cpu_ptr(entry->pool->acomp_ctx);
-	mutex_lock(acomp_ctx->mutex);
+	mutex_lock(&acomp_ctx->mutex);
 
 	src = zpool_map_handle(zpool, entry->handle, ZPOOL_MM_RO);
 	if (!zpool_can_sleep_mapped(zpool)) {
-		memcpy(acomp_ctx->dstmem, src, entry->length);
-		src = acomp_ctx->dstmem;
+		memcpy(acomp_ctx->buffer, src, entry->length);
+		src = acomp_ctx->buffer;
 		zpool_unmap_handle(zpool, entry->handle);
 	}
 
@@ -1406,7 +1373,7 @@ static void __zswap_load(struct zswap_entry *entry, struct page *page)
 	acomp_request_set_params(acomp_ctx->req, &input, &output, entry->length, PAGE_SIZE);
 	BUG_ON(crypto_wait_req(crypto_acomp_decompress(acomp_ctx->req), &acomp_ctx->wait));
 	BUG_ON(acomp_ctx->req->dlen != PAGE_SIZE);
-	mutex_unlock(acomp_ctx->mutex);
+	mutex_unlock(&acomp_ctx->mutex);
 
 	if (zpool_can_sleep_mapped(zpool))
 		zpool_unmap_handle(zpool, entry->handle);
@@ -1622,13 +1589,17 @@ bool zswap_store(struct folio *folio)
 	/* compress */
 	acomp_ctx = raw_cpu_ptr(entry->pool->acomp_ctx);
 
-	mutex_lock(acomp_ctx->mutex);
+	mutex_lock(&acomp_ctx->mutex);
 
-	dst = acomp_ctx->dstmem;
+	dst = acomp_ctx->buffer;
 	sg_init_table(&input, 1);
 	sg_set_page(&input, page, PAGE_SIZE, 0);
 
-	/* zswap_dstmem is of size (PAGE_SIZE * 2). Reflect same in sg_list */
+	/*
+	 * We need PAGE_SIZE * 2 here since there maybe over-compression case,
+	 * and hardware-accelerators may won't check the dst buffer size, so
+	 * giving the dst buffer with enough length to avoid buffer overflow.
+	 */
 	sg_init_one(&output, dst, PAGE_SIZE * 2);
 	acomp_request_set_params(acomp_ctx->req, &input, &output, PAGE_SIZE, dlen);
 	/*
@@ -1668,7 +1639,7 @@ bool zswap_store(struct folio *folio)
 	buf = zpool_map_handle(zpool, handle, ZPOOL_MM_WO);
 	memcpy(buf, dst, dlen);
 	zpool_unmap_handle(zpool, handle);
-	mutex_unlock(acomp_ctx->mutex);
+	mutex_unlock(&acomp_ctx->mutex);
 
 	/* populate entry */
 	entry->swpentry = swp_entry(type, offset);
@@ -1711,7 +1682,7 @@ insert_entry:
 	return true;
 
 put_dstmem:
-	mutex_unlock(acomp_ctx->mutex);
+	mutex_unlock(&acomp_ctx->mutex);
 put_pool:
 	zswap_pool_put(entry->pool);
 freepage:
@@ -1886,13 +1857,6 @@ static int zswap_setup(void)
 		goto cache_fail;
 	}
 
-	ret = cpuhp_setup_state(CPUHP_MM_ZSWP_MEM_PREPARE, "mm/zswap:prepare",
-				zswap_dstmem_prepare, zswap_dstmem_dead);
-	if (ret) {
-		pr_err("dstmem alloc failed\n");
-		goto dstmem_fail;
-	}
-
 	ret = cpuhp_setup_state_multi(CPUHP_MM_ZSWP_POOL_PREPARE,
 				      "mm/zswap_pool:prepare",
 				      zswap_cpu_comp_prepare,
@@ -1924,8 +1888,6 @@ fallback_fail:
 	if (pool)
 		zswap_pool_destroy(pool);
 hp_fail:
-	cpuhp_remove_state(CPUHP_MM_ZSWP_MEM_PREPARE);
-dstmem_fail:
 	kmem_cache_destroy(zswap_entry_cache);
 cache_fail:
 	/* if built-in, we aren't unloaded on failure; don't allow use */
-- 
cgit v1.2.3


From 96c7b0b42239e7b8987b2664b458dc74e825f760 Mon Sep 17 00:00:00 2001
From: "Matthew Wilcox (Oracle)" <willy@infradead.org>
Date: Wed, 13 Dec 2023 21:58:30 +0000
Subject: mm: return the folio from __read_swap_cache_async()

Patch series "More swap folio conversions".

These all seem like fairly straightforward conversions to me.  A lot of
compound_head() calls get removed.  And page_swap_info(), which is nice.


This patch (of 13):

Move the folio->page conversion into the callers that actually want that.
Most of the callers are happier with the folio anyway.  If the
page_allocated boolean is set, the folio allocated is of order-0, so it is
safe to pass the page directly to swap_readpage().

Link: https://lkml.kernel.org/r/20231213215842.671461-1-willy@infradead.org
Link: https://lkml.kernel.org/r/20231213215842.671461-2-willy@infradead.org
Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/zswap.h |  4 +--
 mm/swap.h             |  7 +++--
 mm/swap_state.c       | 75 ++++++++++++++++++++++++---------------------------
 mm/zswap.c            | 58 +++++++++++++++++++--------------------
 4 files changed, 69 insertions(+), 75 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/zswap.h b/include/linux/zswap.h
index 08c240e16a01..e88572d4c720 100644
--- a/include/linux/zswap.h
+++ b/include/linux/zswap.h
@@ -34,7 +34,7 @@ void zswap_swapon(int type);
 void zswap_swapoff(int type);
 void zswap_memcg_offline_cleanup(struct mem_cgroup *memcg);
 void zswap_lruvec_state_init(struct lruvec *lruvec);
-void zswap_page_swapin(struct page *page);
+void zswap_folio_swapin(struct folio *folio);
 #else
 
 struct zswap_lruvec_state {};
@@ -54,7 +54,7 @@ static inline void zswap_swapon(int type) {}
 static inline void zswap_swapoff(int type) {}
 static inline void zswap_memcg_offline_cleanup(struct mem_cgroup *memcg) {}
 static inline void zswap_lruvec_state_init(struct lruvec *lruvec) {}
-static inline void zswap_page_swapin(struct page *page) {}
+static inline void zswap_folio_swapin(struct folio *folio) {}
 #endif
 
 #endif /* _LINUX_ZSWAP_H */
diff --git a/mm/swap.h b/mm/swap.h
index c0dc73e10e91..a60ab1cfcaf2 100644
--- a/mm/swap.h
+++ b/mm/swap.h
@@ -49,10 +49,9 @@ struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
 				   struct vm_area_struct *vma,
 				   unsigned long addr,
 				   struct swap_iocb **plug);
-struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
-				     struct mempolicy *mpol, pgoff_t ilx,
-				     bool *new_page_allocated,
-				     bool skip_if_exists);
+struct folio *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_flags,
+		struct mempolicy *mpol, pgoff_t ilx, bool *new_page_allocated,
+		bool skip_if_exists);
 struct page *swap_cluster_readahead(swp_entry_t entry, gfp_t flag,
 				    struct mempolicy *mpol, pgoff_t ilx);
 struct page *swapin_readahead(swp_entry_t entry, gfp_t flag,
diff --git a/mm/swap_state.c b/mm/swap_state.c
index c597cec606e4..874b40a1f502 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -410,14 +410,12 @@ struct folio *filemap_get_incore_folio(struct address_space *mapping,
 	return folio;
 }
 
-struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
-				     struct mempolicy *mpol, pgoff_t ilx,
-				     bool *new_page_allocated,
-				     bool skip_if_exists)
+struct folio *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
+		struct mempolicy *mpol, pgoff_t ilx, bool *new_page_allocated,
+		bool skip_if_exists)
 {
 	struct swap_info_struct *si;
 	struct folio *folio;
-	struct page *page;
 	void *shadow = NULL;
 
 	*new_page_allocated = false;
@@ -434,10 +432,8 @@ struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
 		 */
 		folio = filemap_get_folio(swap_address_space(entry),
 						swp_offset(entry));
-		if (!IS_ERR(folio)) {
-			page = folio_file_page(folio, swp_offset(entry));
-			goto got_page;
-		}
+		if (!IS_ERR(folio))
+			goto got_folio;
 
 		/*
 		 * Just skip read ahead for unused swap slot.
@@ -451,7 +447,7 @@ struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
 			goto fail_put_swap;
 
 		/*
-		 * Get a new page to read into from swap.  Allocate it now,
+		 * Get a new folio to read into from swap.  Allocate it now,
 		 * before marking swap_map SWAP_HAS_CACHE, when -EEXIST will
 		 * cause any racers to loop around until we add it to cache.
 		 */
@@ -487,13 +483,13 @@ struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
 		 * stumble across a swap_map entry whose SWAP_HAS_CACHE
 		 * has not yet been cleared.  Or race against another
 		 * __read_swap_cache_async(), which has set SWAP_HAS_CACHE
-		 * in swap_map, but not yet added its page to swap cache.
+		 * in swap_map, but not yet added its folio to swap cache.
 		 */
 		schedule_timeout_uninterruptible(1);
 	}
 
 	/*
-	 * The swap entry is ours to swap in. Prepare the new page.
+	 * The swap entry is ours to swap in. Prepare the new folio.
 	 */
 
 	__folio_set_locked(folio);
@@ -514,10 +510,9 @@ struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
 	/* Caller will initiate read into locked folio */
 	folio_add_lru(folio);
 	*new_page_allocated = true;
-	page = &folio->page;
-got_page:
+got_folio:
 	put_swap_device(si);
-	return page;
+	return folio;
 
 fail_unlock:
 	put_swap_folio(folio, entry);
@@ -545,16 +540,16 @@ struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
 	bool page_allocated;
 	struct mempolicy *mpol;
 	pgoff_t ilx;
-	struct page *page;
+	struct folio *folio;
 
 	mpol = get_vma_policy(vma, addr, 0, &ilx);
-	page = __read_swap_cache_async(entry, gfp_mask, mpol, ilx,
+	folio = __read_swap_cache_async(entry, gfp_mask, mpol, ilx,
 					&page_allocated, false);
 	mpol_cond_put(mpol);
 
 	if (page_allocated)
-		swap_readpage(page, false, plug);
-	return page;
+		swap_readpage(&folio->page, false, plug);
+	return folio_file_page(folio, swp_offset(entry));
 }
 
 static unsigned int __swapin_nr_pages(unsigned long prev_offset,
@@ -639,7 +634,7 @@ static unsigned long swapin_nr_pages(unsigned long offset)
 struct page *swap_cluster_readahead(swp_entry_t entry, gfp_t gfp_mask,
 				    struct mempolicy *mpol, pgoff_t ilx)
 {
-	struct page *page;
+	struct folio *folio;
 	unsigned long entry_offset = swp_offset(entry);
 	unsigned long offset = entry_offset;
 	unsigned long start_offset, end_offset;
@@ -664,31 +659,31 @@ struct page *swap_cluster_readahead(swp_entry_t entry, gfp_t gfp_mask,
 	blk_start_plug(&plug);
 	for (offset = start_offset; offset <= end_offset ; offset++) {
 		/* Ok, do the async read-ahead now */
-		page = __read_swap_cache_async(
+		folio = __read_swap_cache_async(
 				swp_entry(swp_type(entry), offset),
 				gfp_mask, mpol, ilx, &page_allocated, false);
-		if (!page)
+		if (!folio)
 			continue;
 		if (page_allocated) {
-			swap_readpage(page, false, &splug);
+			swap_readpage(&folio->page, false, &splug);
 			if (offset != entry_offset) {
-				SetPageReadahead(page);
+				folio_set_readahead(folio);
 				count_vm_event(SWAP_RA);
 			}
 		}
-		put_page(page);
+		folio_put(folio);
 	}
 	blk_finish_plug(&plug);
 	swap_read_unplug(splug);
 	lru_add_drain();	/* Push any new pages onto the LRU now */
 skip:
 	/* The page was likely read above, so no need for plugging here */
-	page = __read_swap_cache_async(entry, gfp_mask, mpol, ilx,
+	folio = __read_swap_cache_async(entry, gfp_mask, mpol, ilx,
 					&page_allocated, false);
 	if (unlikely(page_allocated))
-		swap_readpage(page, false, NULL);
-	zswap_page_swapin(page);
-	return page;
+		swap_readpage(&folio->page, false, NULL);
+	zswap_folio_swapin(folio);
+	return folio_file_page(folio, swp_offset(entry));
 }
 
 int init_swap_address_space(unsigned int type, unsigned long nr_pages)
@@ -806,7 +801,7 @@ static struct page *swap_vma_readahead(swp_entry_t targ_entry, gfp_t gfp_mask,
 {
 	struct blk_plug plug;
 	struct swap_iocb *splug = NULL;
-	struct page *page;
+	struct folio *folio;
 	pte_t *pte = NULL, pentry;
 	unsigned long addr;
 	swp_entry_t entry;
@@ -839,18 +834,18 @@ static struct page *swap_vma_readahead(swp_entry_t targ_entry, gfp_t gfp_mask,
 			continue;
 		pte_unmap(pte);
 		pte = NULL;
-		page = __read_swap_cache_async(entry, gfp_mask, mpol, ilx,
+		folio = __read_swap_cache_async(entry, gfp_mask, mpol, ilx,
 						&page_allocated, false);
-		if (!page)
+		if (!folio)
 			continue;
 		if (page_allocated) {
-			swap_readpage(page, false, &splug);
+			swap_readpage(&folio->page, false, &splug);
 			if (i != ra_info.offset) {
-				SetPageReadahead(page);
+				folio_set_readahead(folio);
 				count_vm_event(SWAP_RA);
 			}
 		}
-		put_page(page);
+		folio_put(folio);
 	}
 	if (pte)
 		pte_unmap(pte);
@@ -858,13 +853,13 @@ static struct page *swap_vma_readahead(swp_entry_t targ_entry, gfp_t gfp_mask,
 	swap_read_unplug(splug);
 	lru_add_drain();
 skip:
-	/* The page was likely read above, so no need for plugging here */
-	page = __read_swap_cache_async(targ_entry, gfp_mask, mpol, targ_ilx,
+	/* The folio was likely read above, so no need for plugging here */
+	folio = __read_swap_cache_async(targ_entry, gfp_mask, mpol, targ_ilx,
 					&page_allocated, false);
 	if (unlikely(page_allocated))
-		swap_readpage(page, false, NULL);
-	zswap_page_swapin(page);
-	return page;
+		swap_readpage(&folio->page, false, NULL);
+	zswap_folio_swapin(folio);
+	return folio_file_page(folio, swp_offset(entry));
 }
 
 /**
diff --git a/mm/zswap.c b/mm/zswap.c
index 12ce5a68da91..6d9ee2a5334a 100644
--- a/mm/zswap.c
+++ b/mm/zswap.c
@@ -368,12 +368,12 @@ void zswap_lruvec_state_init(struct lruvec *lruvec)
 	atomic_long_set(&lruvec->zswap_lruvec_state.nr_zswap_protected, 0);
 }
 
-void zswap_page_swapin(struct page *page)
+void zswap_folio_swapin(struct folio *folio)
 {
 	struct lruvec *lruvec;
 
-	if (page) {
-		lruvec = folio_lruvec(page_folio(page));
+	if (folio) {
+		lruvec = folio_lruvec(folio);
 		atomic_long_inc(&lruvec->zswap_lruvec_state.nr_zswap_protected);
 	}
 }
@@ -1383,14 +1383,14 @@ static void __zswap_load(struct zswap_entry *entry, struct page *page)
 * writeback code
 **********************************/
 /*
- * Attempts to free an entry by adding a page to the swap cache,
- * decompressing the entry data into the page, and issuing a
- * bio write to write the page back to the swap device.
+ * Attempts to free an entry by adding a folio to the swap cache,
+ * decompressing the entry data into the folio, and issuing a
+ * bio write to write the folio back to the swap device.
  *
- * This can be thought of as a "resumed writeback" of the page
+ * This can be thought of as a "resumed writeback" of the folio
  * to the swap device.  We are basically resuming the same swap
  * writeback path that was intercepted with the zswap_store()
- * in the first place.  After the page has been decompressed into
+ * in the first place.  After the folio has been decompressed into
  * the swap cache, the compressed version stored by zswap can be
  * freed.
  */
@@ -1398,56 +1398,56 @@ static int zswap_writeback_entry(struct zswap_entry *entry,
 				 struct zswap_tree *tree)
 {
 	swp_entry_t swpentry = entry->swpentry;
-	struct page *page;
+	struct folio *folio;
 	struct mempolicy *mpol;
-	bool page_was_allocated;
+	bool folio_was_allocated;
 	struct writeback_control wbc = {
 		.sync_mode = WB_SYNC_NONE,
 	};
 
-	/* try to allocate swap cache page */
+	/* try to allocate swap cache folio */
 	mpol = get_task_policy(current);
-	page = __read_swap_cache_async(swpentry, GFP_KERNEL, mpol,
-				NO_INTERLEAVE_INDEX, &page_was_allocated, true);
-	if (!page)
+	folio = __read_swap_cache_async(swpentry, GFP_KERNEL, mpol,
+				NO_INTERLEAVE_INDEX, &folio_was_allocated, true);
+	if (!folio)
 		return -ENOMEM;
 
 	/*
-	 * Found an existing page, we raced with load/swapin. We generally
-	 * writeback cold pages from zswap, and swapin means the page just
-	 * became hot. Skip this page and let the caller find another one.
+	 * Found an existing folio, we raced with load/swapin. We generally
+	 * writeback cold folios from zswap, and swapin means the folio just
+	 * became hot. Skip this folio and let the caller find another one.
 	 */
-	if (!page_was_allocated) {
-		put_page(page);
+	if (!folio_was_allocated) {
+		folio_put(folio);
 		return -EEXIST;
 	}
 
 	/*
-	 * Page is locked, and the swapcache is now secured against
+	 * folio is locked, and the swapcache is now secured against
 	 * concurrent swapping to and from the slot. Verify that the
 	 * swap entry hasn't been invalidated and recycled behind our
 	 * backs (our zswap_entry reference doesn't prevent that), to
-	 * avoid overwriting a new swap page with old compressed data.
+	 * avoid overwriting a new swap folio with old compressed data.
 	 */
 	spin_lock(&tree->lock);
 	if (zswap_rb_search(&tree->rbroot, swp_offset(entry->swpentry)) != entry) {
 		spin_unlock(&tree->lock);
-		delete_from_swap_cache(page_folio(page));
+		delete_from_swap_cache(folio);
 		return -ENOMEM;
 	}
 	spin_unlock(&tree->lock);
 
-	__zswap_load(entry, page);
+	__zswap_load(entry, &folio->page);
 
-	/* page is up to date */
-	SetPageUptodate(page);
+	/* folio is up to date */
+	folio_mark_uptodate(folio);
 
 	/* move it to the tail of the inactive list after end_writeback */
-	SetPageReclaim(page);
+	folio_set_reclaim(folio);
 
 	/* start writeback */
-	__swap_writepage(page, &wbc);
-	put_page(page);
+	__swap_writepage(&folio->page, &wbc);
+	folio_put(folio);
 
 	return 0;
 }
@@ -1593,7 +1593,7 @@ bool zswap_store(struct folio *folio)
 
 	dst = acomp_ctx->buffer;
 	sg_init_table(&input, 1);
-	sg_set_page(&input, page, PAGE_SIZE, 0);
+	sg_set_page(&input, &folio->page, PAGE_SIZE, 0);
 
 	/*
 	 * We need PAGE_SIZE * 2 here since there maybe over-compression case,
-- 
cgit v1.2.3


From 3a61e6f668120ee2c7840b91891c858d575d07e2 Mon Sep 17 00:00:00 2001
From: "Matthew Wilcox (Oracle)" <willy@infradead.org>
Date: Wed, 13 Dec 2023 21:58:38 +0000
Subject: mm: convert swap_page_sector() to swap_folio_sector()

All callers have a folio, so pass it in.  Saves a couple of calls to
compound_head().

Link: https://lkml.kernel.org/r/20231213215842.671461-10-willy@infradead.org
Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/swap.h | 2 +-
 mm/page_io.c         | 8 ++++----
 mm/swapfile.c        | 6 +++---
 3 files changed, 8 insertions(+), 8 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/swap.h b/include/linux/swap.h
index 3e1909087f6a..2d09e9b7ee70 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -493,7 +493,7 @@ struct backing_dev_info;
 extern int init_swap_address_space(unsigned int type, unsigned long nr_pages);
 extern void exit_swap_address_space(unsigned int type);
 extern struct swap_info_struct *get_swap_device(swp_entry_t entry);
-sector_t swap_page_sector(struct page *page);
+sector_t swap_folio_sector(struct folio *folio);
 
 static inline void put_swap_device(struct swap_info_struct *si)
 {
diff --git a/mm/page_io.c b/mm/page_io.c
index e18afcd9c19a..6736c56526bf 100644
--- a/mm/page_io.c
+++ b/mm/page_io.c
@@ -334,7 +334,7 @@ static void swap_writepage_bdev_sync(struct folio *folio,
 
 	bio_init(&bio, sis->bdev, &bv, 1,
 		 REQ_OP_WRITE | REQ_SWAP | wbc_to_write_flags(wbc));
-	bio.bi_iter.bi_sector = swap_page_sector(&folio->page);
+	bio.bi_iter.bi_sector = swap_folio_sector(folio);
 	bio_add_folio_nofail(&bio, folio, folio_size(folio), 0);
 
 	bio_associate_blkg_from_page(&bio, folio);
@@ -355,7 +355,7 @@ static void swap_writepage_bdev_async(struct folio *folio,
 	bio = bio_alloc(sis->bdev, 1,
 			REQ_OP_WRITE | REQ_SWAP | wbc_to_write_flags(wbc),
 			GFP_NOIO);
-	bio->bi_iter.bi_sector = swap_page_sector(&folio->page);
+	bio->bi_iter.bi_sector = swap_folio_sector(folio);
 	bio->bi_end_io = end_swap_bio_write;
 	bio_add_folio_nofail(bio, folio, folio_size(folio), 0);
 
@@ -461,7 +461,7 @@ static void swap_readpage_bdev_sync(struct folio *folio,
 	struct bio bio;
 
 	bio_init(&bio, sis->bdev, &bv, 1, REQ_OP_READ);
-	bio.bi_iter.bi_sector = swap_page_sector(&folio->page);
+	bio.bi_iter.bi_sector = swap_folio_sector(folio);
 	bio_add_folio_nofail(&bio, folio, folio_size(folio), 0);
 	/*
 	 * Keep this task valid during swap readpage because the oom killer may
@@ -480,7 +480,7 @@ static void swap_readpage_bdev_async(struct folio *folio,
 	struct bio *bio;
 
 	bio = bio_alloc(sis->bdev, 1, REQ_OP_READ, GFP_KERNEL);
-	bio->bi_iter.bi_sector = swap_page_sector(&folio->page);
+	bio->bi_iter.bi_sector = swap_folio_sector(folio);
 	bio->bi_end_io = end_swap_bio_read;
 	bio_add_folio_nofail(bio, folio, folio_size(folio), 0);
 	count_vm_event(PSWPIN);
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 1501bc956456..b22c47b11d65 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -227,14 +227,14 @@ offset_to_swap_extent(struct swap_info_struct *sis, unsigned long offset)
 	BUG();
 }
 
-sector_t swap_page_sector(struct page *page)
+sector_t swap_folio_sector(struct folio *folio)
 {
-	struct swap_info_struct *sis = page_swap_info(page);
+	struct swap_info_struct *sis = swp_swap_info(folio->swap);
 	struct swap_extent *se;
 	sector_t sector;
 	pgoff_t offset;
 
-	offset = __page_file_index(page);
+	offset = swp_offset(folio->swap);
 	se = offset_to_swap_extent(sis, offset);
 	sector = se->start_block + (offset - se->start_page);
 	return sector << (PAGE_SHIFT - 9);
-- 
cgit v1.2.3


From 69fe7d67cb0c6eeab3d4c9a3bf950f9d12af4719 Mon Sep 17 00:00:00 2001
From: "Matthew Wilcox (Oracle)" <willy@infradead.org>
Date: Wed, 13 Dec 2023 21:58:40 +0000
Subject: mm: remove page_swap_info()

It's more efficient to get the swap_info_struct by calling
swp_swap_info() directly.

Link: https://lkml.kernel.org/r/20231213215842.671461-12-willy@infradead.org
Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/swap.h | 3 +--
 mm/swap.h            | 2 +-
 mm/swapfile.c        | 8 +-------
 3 files changed, 3 insertions(+), 10 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/swap.h b/include/linux/swap.h
index 2d09e9b7ee70..4db00ddad261 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -487,8 +487,7 @@ extern sector_t swapdev_block(int, pgoff_t);
 extern int __swap_count(swp_entry_t entry);
 extern int swap_swapcount(struct swap_info_struct *si, swp_entry_t entry);
 extern int swp_swapcount(swp_entry_t entry);
-extern struct swap_info_struct *page_swap_info(struct page *);
-extern struct swap_info_struct *swp_swap_info(swp_entry_t entry);
+struct swap_info_struct *swp_swap_info(swp_entry_t entry);
 struct backing_dev_info;
 extern int init_swap_address_space(unsigned int type, unsigned long nr_pages);
 extern void exit_swap_address_space(unsigned int type);
diff --git a/mm/swap.h b/mm/swap.h
index 859ae8f0fd2d..6bf25342589f 100644
--- a/mm/swap.h
+++ b/mm/swap.h
@@ -60,7 +60,7 @@ struct page *swapin_readahead(swp_entry_t entry, gfp_t flag,
 
 static inline unsigned int folio_swap_flags(struct folio *folio)
 {
-	return page_swap_info(&folio->page)->flags;
+	return swp_swap_info(folio->swap)->flags;
 }
 #else /* CONFIG_SWAP */
 struct swap_iocb;
diff --git a/mm/swapfile.c b/mm/swapfile.c
index f3e23a3d26ae..2f877ca44513 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -3369,18 +3369,12 @@ struct swap_info_struct *swp_swap_info(swp_entry_t entry)
 	return swap_type_to_swap_info(swp_type(entry));
 }
 
-struct swap_info_struct *page_swap_info(struct page *page)
-{
-	swp_entry_t entry = page_swap_entry(page);
-	return swp_swap_info(entry);
-}
-
 /*
  * out-of-line methods to avoid include hell.
  */
 struct address_space *swapcache_mapping(struct folio *folio)
 {
-	return page_swap_info(&folio->page)->swap_file->f_mapping;
+	return swp_swap_info(folio->swap)->swap_file->f_mapping;
 }
 EXPORT_SYMBOL_GPL(swapcache_mapping);
 
-- 
cgit v1.2.3


From f099c961f4998ad7107b1c6a7d6efb225e9a4614 Mon Sep 17 00:00:00 2001
From: "Matthew Wilcox (Oracle)" <willy@infradead.org>
Date: Fri, 15 Dec 2023 20:02:32 +0000
Subject: fs: remove clean_page_buffers()

Patch series "Clean up the writeback paths".

Most of these patches verge on the trivial, converting filesystems that
just use block_write_full_page() to use mpage_writepages().  But as we saw
with Christoph's earlier patchset, there can be some "interesting"
gotchas, and I clearly haven't tested the majority of filesystems I've
touched here.

Patches 3 & 4 get rid of a lot of stack usage on architectures with larger
page sizes; 1024 bytes on 64-bit systems with 64KiB pages.  It starts to
open the door to larger folio sizes on all architectures, but it's
certainly not enough yet.

Patch 14 is kind of trivial, but it's nice to get that simplification in.


This patch (of 14):

This function has been unused since the removal of bdev_write_page().

Link: https://lkml.kernel.org/r/20231215200245.748418-1-willy@infradead.org
Link: https://lkml.kernel.org/r/20231215200245.748418-2-willy@infradead.org
Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Jens Axboe <axboe@kernel.dk>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 fs/mpage.c                  | 10 ----------
 include/linux/buffer_head.h |  1 -
 2 files changed, 11 deletions(-)

(limited to 'include/linux')

diff --git a/fs/mpage.c b/fs/mpage.c
index ffb064ed9d04..63bf99856024 100644
--- a/fs/mpage.c
+++ b/fs/mpage.c
@@ -455,16 +455,6 @@ static void clean_buffers(struct page *page, unsigned first_unmapped)
 		try_to_free_buffers(page_folio(page));
 }
 
-/*
- * For situations where we want to clean all buffers attached to a page.
- * We don't need to calculate how many buffers are attached to the page,
- * we just need to specify a number larger than the maximum number of buffers.
- */
-void clean_page_buffers(struct page *page)
-{
-	clean_buffers(page, ~0U);
-}
-
 static int __mpage_writepage(struct folio *folio, struct writeback_control *wbc,
 		      void *data)
 {
diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h
index 5f23ee599889..94f6161eb45e 100644
--- a/include/linux/buffer_head.h
+++ b/include/linux/buffer_head.h
@@ -270,7 +270,6 @@ int generic_write_end(struct file *, struct address_space *,
 				loff_t, unsigned, unsigned,
 				struct page *, void *);
 void folio_zero_new_buffers(struct folio *folio, size_t from, size_t to);
-void clean_page_buffers(struct page *page);
 int cont_write_begin(struct file *, struct address_space *, loff_t,
 			unsigned, struct page **, void **,
 			get_block_t *, loff_t *);
-- 
cgit v1.2.3


From 17bf23a981be9c6629198a76940c777eb5c8c521 Mon Sep 17 00:00:00 2001
From: "Matthew Wilcox (Oracle)" <willy@infradead.org>
Date: Fri, 15 Dec 2023 20:02:44 +0000
Subject: fs: convert block_write_full_page to block_write_full_folio

Convert the function to be compatible with writepage_t so that it can be
passed to write_cache_pages() by blkdev.  This removes a call to
compound_head().  We can also remove the function export as both callers
are built-in.

Link: https://lkml.kernel.org/r/20231215200245.748418-14-willy@infradead.org
Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Jens Axboe <axboe@kernel.dk>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 block/fops.c                | 21 ++++++++++++++++++---
 fs/buffer.c                 | 16 +++++++---------
 fs/ext4/page-io.c           |  2 +-
 fs/gfs2/aops.c              |  4 ++--
 fs/mpage.c                  |  2 +-
 fs/ntfs/aops.c              |  4 ++--
 fs/ocfs2/alloc.c            |  2 +-
 fs/ocfs2/file.c             |  2 +-
 include/linux/buffer_head.h |  4 ++--
 9 files changed, 35 insertions(+), 22 deletions(-)

(limited to 'include/linux')

diff --git a/block/fops.c b/block/fops.c
index 0bdad1e8d514..0cf8cf72cdfa 100644
--- a/block/fops.c
+++ b/block/fops.c
@@ -410,9 +410,24 @@ static int blkdev_get_block(struct inode *inode, sector_t iblock,
 	return 0;
 }
 
-static int blkdev_writepage(struct page *page, struct writeback_control *wbc)
+/*
+ * We cannot call mpage_writepages() as it does not take the buffer lock.
+ * We must use block_write_full_folio() directly which holds the buffer
+ * lock.  The buffer lock provides the synchronisation with writeback
+ * that filesystems rely on when they use the blockdev's mapping.
+ */
+static int blkdev_writepages(struct address_space *mapping,
+		struct writeback_control *wbc)
 {
-	return block_write_full_page(page, blkdev_get_block, wbc);
+	struct blk_plug plug;
+	int err;
+
+	blk_start_plug(&plug);
+	err = write_cache_pages(mapping, wbc, block_write_full_folio,
+			blkdev_get_block);
+	blk_finish_plug(&plug);
+
+	return err;
 }
 
 static int blkdev_read_folio(struct file *file, struct folio *folio)
@@ -449,7 +464,7 @@ const struct address_space_operations def_blk_aops = {
 	.invalidate_folio = block_invalidate_folio,
 	.read_folio	= blkdev_read_folio,
 	.readahead	= blkdev_readahead,
-	.writepage	= blkdev_writepage,
+	.writepages	= blkdev_writepages,
 	.write_begin	= blkdev_write_begin,
 	.write_end	= blkdev_write_end,
 	.migrate_folio	= buffer_migrate_folio_norefs,
diff --git a/fs/buffer.c b/fs/buffer.c
index 3a8c8322ed28..c838b4a31009 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -372,7 +372,7 @@ static void end_buffer_async_read_io(struct buffer_head *bh, int uptodate)
 }
 
 /*
- * Completion handler for block_write_full_page() - pages which are unlocked
+ * Completion handler for block_write_full_folio() - pages which are unlocked
  * during I/O, and which have PageWriteback cleared upon I/O completion.
  */
 void end_buffer_async_write(struct buffer_head *bh, int uptodate)
@@ -1771,18 +1771,18 @@ static struct buffer_head *folio_create_buffers(struct folio *folio,
  */
 
 /*
- * While block_write_full_page is writing back the dirty buffers under
+ * While block_write_full_folio is writing back the dirty buffers under
  * the page lock, whoever dirtied the buffers may decide to clean them
  * again at any time.  We handle that by only looking at the buffer
  * state inside lock_buffer().
  *
- * If block_write_full_page() is called for regular writeback
+ * If block_write_full_folio() is called for regular writeback
  * (wbc->sync_mode == WB_SYNC_NONE) then it will redirty a page which has a
  * locked buffer.   This only can happen if someone has written the buffer
  * directly, with submit_bh().  At the address_space level PageWriteback
  * prevents this contention from occurring.
  *
- * If block_write_full_page() is called with wbc->sync_mode ==
+ * If block_write_full_folio() is called with wbc->sync_mode ==
  * WB_SYNC_ALL, the writes are posted using REQ_SYNC; this
  * causes the writes to be flagged as synchronous writes.
  */
@@ -1829,7 +1829,7 @@ int __block_write_full_folio(struct inode *inode, struct folio *folio,
 			 * truncate in progress.
 			 */
 			/*
-			 * The buffer was zeroed by block_write_full_page()
+			 * The buffer was zeroed by block_write_full_folio()
 			 */
 			clear_buffer_dirty(bh);
 			set_buffer_uptodate(bh);
@@ -2696,10 +2696,9 @@ EXPORT_SYMBOL(block_truncate_page);
 /*
  * The generic ->writepage function for buffer-backed address_spaces
  */
-int block_write_full_page(struct page *page, get_block_t *get_block,
-			struct writeback_control *wbc)
+int block_write_full_folio(struct folio *folio, struct writeback_control *wbc,
+		void *get_block)
 {
-	struct folio *folio = page_folio(page);
 	struct inode * const inode = folio->mapping->host;
 	loff_t i_size = i_size_read(inode);
 
@@ -2726,7 +2725,6 @@ int block_write_full_page(struct page *page, get_block_t *get_block,
 	return __block_write_full_folio(inode, folio, get_block, wbc,
 			end_buffer_async_write);
 }
-EXPORT_SYMBOL(block_write_full_page);
 
 sector_t generic_block_bmap(struct address_space *mapping, sector_t block,
 			    get_block_t *get_block)
diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c
index dfdd7e5cf038..312bc6813357 100644
--- a/fs/ext4/page-io.c
+++ b/fs/ext4/page-io.c
@@ -444,7 +444,7 @@ int ext4_bio_write_folio(struct ext4_io_submit *io, struct folio *folio,
 	folio_clear_error(folio);
 
 	/*
-	 * Comments copied from block_write_full_page:
+	 * Comments copied from block_write_full_folio:
 	 *
 	 * The folio straddles i_size.  It must be zeroed out on each and every
 	 * writepage invocation because it may be mmapped.  "A file is mapped
diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c
index 5cffb079b87c..f986cd032b76 100644
--- a/fs/gfs2/aops.c
+++ b/fs/gfs2/aops.c
@@ -82,11 +82,11 @@ static int gfs2_get_block_noalloc(struct inode *inode, sector_t lblock,
 }
 
 /**
- * gfs2_write_jdata_folio - gfs2 jdata-specific version of block_write_full_page
+ * gfs2_write_jdata_folio - gfs2 jdata-specific version of block_write_full_folio
  * @folio: The folio to write
  * @wbc: The writeback control
  *
- * This is the same as calling block_write_full_page, but it also
+ * This is the same as calling block_write_full_folio, but it also
  * writes pages outside of i_size
  */
 static int gfs2_write_jdata_folio(struct folio *folio,
diff --git a/fs/mpage.c b/fs/mpage.c
index d4963f3d8051..738882e0766d 100644
--- a/fs/mpage.c
+++ b/fs/mpage.c
@@ -642,7 +642,7 @@ confused:
 	/*
 	 * The caller has a ref on the inode, so *mapping is stable
 	 */
-	ret = block_write_full_page(&folio->page, mpd->get_block, wbc);
+	ret = block_write_full_folio(folio, wbc, mpd->get_block);
 	mapping_set_error(mapping, ret);
 out:
 	mpd->bio = bio;
diff --git a/fs/ntfs/aops.c b/fs/ntfs/aops.c
index 70479ce915e8..6c414957e2c2 100644
--- a/fs/ntfs/aops.c
+++ b/fs/ntfs/aops.c
@@ -1304,7 +1304,7 @@ done:
  * page cleaned.  The VM has already locked the page and marked it clean.
  *
  * For non-resident attributes, ntfs_writepage() writes the @page by calling
- * the ntfs version of the generic block_write_full_page() function,
+ * the ntfs version of the generic block_write_full_folio() function,
  * ntfs_write_block(), which in turn if necessary creates and writes the
  * buffers associated with the page asynchronously.
  *
@@ -1314,7 +1314,7 @@ done:
  * vfs inode dirty code path for the inode the mft record belongs to or via the
  * vm page dirty code path for the page the mft record is in.
  *
- * Based on ntfs_read_folio() and fs/buffer.c::block_write_full_page().
+ * Based on ntfs_read_folio() and fs/buffer.c::block_write_full_folio().
  *
  * Return 0 on success and -errno on error.
  */
diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
index 91b32b2377ac..ea9127ba3208 100644
--- a/fs/ocfs2/alloc.c
+++ b/fs/ocfs2/alloc.c
@@ -6934,7 +6934,7 @@ static int ocfs2_grab_eof_pages(struct inode *inode, loff_t start, loff_t end,
  * nonzero data on subsequent file extends.
  *
  * We need to call this before i_size is updated on the inode because
- * otherwise block_write_full_page() will skip writeout of pages past
+ * otherwise block_write_full_folio() will skip writeout of pages past
  * i_size.
  */
 int ocfs2_zero_range_for_truncate(struct inode *inode, handle_t *handle,
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index 94e2a1244442..8b6d15010703 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -818,7 +818,7 @@ static int ocfs2_write_zero_page(struct inode *inode, u64 abs_from,
 	/*
 	 * fs-writeback will release the dirty pages without page lock
 	 * whose offset are over inode size, the release happens at
-	 * block_write_full_page().
+	 * block_write_full_folio().
 	 */
 	i_size_write(inode, abs_to);
 	inode->i_blocks = ocfs2_inode_sector_count(inode);
diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h
index 94f6161eb45e..396b2adf24bf 100644
--- a/include/linux/buffer_head.h
+++ b/include/linux/buffer_head.h
@@ -252,8 +252,8 @@ void __bh_read_batch(int nr, struct buffer_head *bhs[],
  * address_spaces.
  */
 void block_invalidate_folio(struct folio *folio, size_t offset, size_t length);
-int block_write_full_page(struct page *page, get_block_t *get_block,
-				struct writeback_control *wbc);
+int block_write_full_folio(struct folio *folio, struct writeback_control *wbc,
+		void *get_block);
 int __block_write_full_folio(struct inode *inode, struct folio *folio,
 			get_block_t *get_block, struct writeback_control *wbc,
 			bh_end_io_t *handler);
-- 
cgit v1.2.3


From 14059f66a959c760467ea2041e165f412845bcb8 Mon Sep 17 00:00:00 2001
From: "Matthew Wilcox (Oracle)" <willy@infradead.org>
Date: Fri, 15 Dec 2023 20:02:45 +0000
Subject: fs: remove the bh_end_io argument from __block_write_full_folio

All callers are passing end_buffer_async_write as this argument, so we can
hardcode references to it within __block_write_full_folio().  That lets us
make end_buffer_async_write() static.

Link: https://lkml.kernel.org/r/20231215200245.748418-15-willy@infradead.org
Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Reviewed-by: Jens Axboe <axboe@kernel.dk>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 fs/buffer.c                 | 22 ++++++++++------------
 fs/gfs2/aops.c              |  2 +-
 include/linux/buffer_head.h |  4 +---
 3 files changed, 12 insertions(+), 16 deletions(-)

(limited to 'include/linux')

diff --git a/fs/buffer.c b/fs/buffer.c
index c838b4a31009..19548369bc6c 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -372,10 +372,10 @@ static void end_buffer_async_read_io(struct buffer_head *bh, int uptodate)
 }
 
 /*
- * Completion handler for block_write_full_folio() - pages which are unlocked
- * during I/O, and which have PageWriteback cleared upon I/O completion.
+ * Completion handler for block_write_full_folio() - folios which are unlocked
+ * during I/O, and which have the writeback flag cleared upon I/O completion.
  */
-void end_buffer_async_write(struct buffer_head *bh, int uptodate)
+static void end_buffer_async_write(struct buffer_head *bh, int uptodate)
 {
 	unsigned long flags;
 	struct buffer_head *first;
@@ -415,7 +415,6 @@ still_busy:
 	spin_unlock_irqrestore(&first->b_uptodate_lock, flags);
 	return;
 }
-EXPORT_SYMBOL(end_buffer_async_write);
 
 /*
  * If a page's buffers are under async readin (end_buffer_async_read
@@ -1787,8 +1786,7 @@ static struct buffer_head *folio_create_buffers(struct folio *folio,
  * causes the writes to be flagged as synchronous writes.
  */
 int __block_write_full_folio(struct inode *inode, struct folio *folio,
-			get_block_t *get_block, struct writeback_control *wbc,
-			bh_end_io_t *handler)
+			get_block_t *get_block, struct writeback_control *wbc)
 {
 	int err;
 	sector_t block;
@@ -1867,7 +1865,8 @@ int __block_write_full_folio(struct inode *inode, struct folio *folio,
 			continue;
 		}
 		if (test_clear_buffer_dirty(bh)) {
-			mark_buffer_async_write_endio(bh, handler);
+			mark_buffer_async_write_endio(bh,
+				end_buffer_async_write);
 		} else {
 			unlock_buffer(bh);
 		}
@@ -1920,7 +1919,8 @@ recover:
 		if (buffer_mapped(bh) && buffer_dirty(bh) &&
 		    !buffer_delay(bh)) {
 			lock_buffer(bh);
-			mark_buffer_async_write_endio(bh, handler);
+			mark_buffer_async_write_endio(bh,
+				end_buffer_async_write);
 		} else {
 			/*
 			 * The buffer may have been set dirty during
@@ -2704,8 +2704,7 @@ int block_write_full_folio(struct folio *folio, struct writeback_control *wbc,
 
 	/* Is the folio fully inside i_size? */
 	if (folio_pos(folio) + folio_size(folio) <= i_size)
-		return __block_write_full_folio(inode, folio, get_block, wbc,
-					       end_buffer_async_write);
+		return __block_write_full_folio(inode, folio, get_block, wbc);
 
 	/* Is the folio fully outside i_size? (truncate in progress) */
 	if (folio_pos(folio) >= i_size) {
@@ -2722,8 +2721,7 @@ int block_write_full_folio(struct folio *folio, struct writeback_control *wbc,
 	 */
 	folio_zero_segment(folio, offset_in_folio(folio, i_size),
 			folio_size(folio));
-	return __block_write_full_folio(inode, folio, get_block, wbc,
-			end_buffer_async_write);
+	return __block_write_full_folio(inode, folio, get_block, wbc);
 }
 
 sector_t generic_block_bmap(struct address_space *mapping, sector_t block,
diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c
index f986cd032b76..9914d7f54f7d 100644
--- a/fs/gfs2/aops.c
+++ b/fs/gfs2/aops.c
@@ -108,7 +108,7 @@ static int gfs2_write_jdata_folio(struct folio *folio,
 				folio_size(folio));
 
 	return __block_write_full_folio(inode, folio, gfs2_get_block_noalloc,
-			wbc, end_buffer_async_write);
+			wbc);
 }
 
 /**
diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h
index 396b2adf24bf..d78454a4dd1f 100644
--- a/include/linux/buffer_head.h
+++ b/include/linux/buffer_head.h
@@ -205,7 +205,6 @@ struct buffer_head *create_empty_buffers(struct folio *folio,
 		unsigned long blocksize, unsigned long b_state);
 void end_buffer_read_sync(struct buffer_head *bh, int uptodate);
 void end_buffer_write_sync(struct buffer_head *bh, int uptodate);
-void end_buffer_async_write(struct buffer_head *bh, int uptodate);
 
 /* Things to do with buffers at mapping->private_list */
 void mark_buffer_dirty_inode(struct buffer_head *bh, struct inode *inode);
@@ -255,8 +254,7 @@ void block_invalidate_folio(struct folio *folio, size_t offset, size_t length);
 int block_write_full_folio(struct folio *folio, struct writeback_control *wbc,
 		void *get_block);
 int __block_write_full_folio(struct inode *inode, struct folio *folio,
-			get_block_t *get_block, struct writeback_control *wbc,
-			bh_end_io_t *handler);
+		get_block_t *get_block, struct writeback_control *wbc);
 int block_read_full_folio(struct folio *, get_block_t *);
 bool block_is_partially_uptodate(struct folio *, size_t from, size_t count);
 int block_write_begin(struct address_space *mapping, loff_t pos, unsigned len,
-- 
cgit v1.2.3


From 280ec6ccb6422aa4a04f9ac4216ddcf055acc95d Mon Sep 17 00:00:00 2001
From: Andrey Konovalov <andreyknvl@google.com>
Date: Tue, 19 Dec 2023 23:28:45 +0100
Subject: kasan: rename kasan_slab_free_mempool to kasan_mempool_poison_object

Patch series "kasan: save mempool stack traces".

This series updates KASAN to save alloc and free stack traces for
secondary-level allocators that cache and reuse allocations internally
instead of giving them back to the underlying allocator (e.g.  mempool).

As a part of this change, introduce and document a set of KASAN hooks:

bool kasan_mempool_poison_pages(struct page *page, unsigned int order);
void kasan_mempool_unpoison_pages(struct page *page, unsigned int order);
bool kasan_mempool_poison_object(void *ptr);
void kasan_mempool_unpoison_object(void *ptr, size_t size);

and use them in the mempool code.

Besides mempool, skbuff and io_uring also cache allocations and already
use KASAN hooks to poison those.  Their code is updated to use the new
mempool hooks.

The new hooks save alloc and free stack traces (for normal kmalloc and
slab objects; stack traces for large kmalloc objects and page_alloc are
not supported by KASAN yet), improve the readability of the users' code,
and also allow the users to prevent double-free and invalid-free bugs; see
the patches for the details.


This patch (of 21):

Rename kasan_slab_free_mempool to kasan_mempool_poison_object.

kasan_slab_free_mempool is a slightly confusing name: it is unclear
whether this function poisons the object when it is freed into mempool or
does something when the object is freed from mempool to the underlying
allocator.

The new name also aligns with other mempool-related KASAN hooks added in
the following patches in this series.

Link: https://lkml.kernel.org/r/cover.1703024586.git.andreyknvl@google.com
Link: https://lkml.kernel.org/r/c5618685abb7cdbf9fb4897f565e7759f601da84.1703024586.git.andreyknvl@google.com
Signed-off-by: Andrey Konovalov <andreyknvl@google.com>
Cc: Alexander Lobakin <alobakin@pm.me>
Cc: Alexander Potapenko <glider@google.com>
Cc: Andrey Ryabinin <ryabinin.a.a@gmail.com>
Cc: Breno Leitao <leitao@debian.org>
Cc: Dmitry Vyukov <dvyukov@google.com>
Cc: Evgenii Stepanov <eugenis@google.com>
Cc: Marco Elver <elver@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/kasan.h  | 8 ++++----
 io_uring/alloc_cache.h | 3 +--
 mm/kasan/common.c      | 4 ++--
 mm/mempool.c           | 2 +-
 4 files changed, 8 insertions(+), 9 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/kasan.h b/include/linux/kasan.h
index 72cb693b075b..6310435f528b 100644
--- a/include/linux/kasan.h
+++ b/include/linux/kasan.h
@@ -172,11 +172,11 @@ static __always_inline void kasan_kfree_large(void *ptr)
 		__kasan_kfree_large(ptr, _RET_IP_);
 }
 
-void __kasan_slab_free_mempool(void *ptr, unsigned long ip);
-static __always_inline void kasan_slab_free_mempool(void *ptr)
+void __kasan_mempool_poison_object(void *ptr, unsigned long ip);
+static __always_inline void kasan_mempool_poison_object(void *ptr)
 {
 	if (kasan_enabled())
-		__kasan_slab_free_mempool(ptr, _RET_IP_);
+		__kasan_mempool_poison_object(ptr, _RET_IP_);
 }
 
 void * __must_check __kasan_slab_alloc(struct kmem_cache *s,
@@ -256,7 +256,7 @@ static inline bool kasan_slab_free(struct kmem_cache *s, void *object, bool init
 	return false;
 }
 static inline void kasan_kfree_large(void *ptr) {}
-static inline void kasan_slab_free_mempool(void *ptr) {}
+static inline void kasan_mempool_poison_object(void *ptr) {}
 static inline void *kasan_slab_alloc(struct kmem_cache *s, void *object,
 				   gfp_t flags, bool init)
 {
diff --git a/io_uring/alloc_cache.h b/io_uring/alloc_cache.h
index 241245cb54a6..8de0414e8efe 100644
--- a/io_uring/alloc_cache.h
+++ b/io_uring/alloc_cache.h
@@ -16,8 +16,7 @@ static inline bool io_alloc_cache_put(struct io_alloc_cache *cache,
 	if (cache->nr_cached < cache->max_cached) {
 		cache->nr_cached++;
 		wq_stack_add_head(&entry->node, &cache->list);
-		/* KASAN poisons object */
-		kasan_slab_free_mempool(entry);
+		kasan_mempool_poison_object(entry);
 		return true;
 	}
 	return false;
diff --git a/mm/kasan/common.c b/mm/kasan/common.c
index fe6c4b43ad9f..e0394d0ee7f1 100644
--- a/mm/kasan/common.c
+++ b/mm/kasan/common.c
@@ -271,7 +271,7 @@ static inline bool ____kasan_kfree_large(void *ptr, unsigned long ip)
 
 	/*
 	 * The object will be poisoned by kasan_poison_pages() or
-	 * kasan_slab_free_mempool().
+	 * kasan_mempool_poison_object().
 	 */
 
 	return false;
@@ -282,7 +282,7 @@ void __kasan_kfree_large(void *ptr, unsigned long ip)
 	____kasan_kfree_large(ptr, ip);
 }
 
-void __kasan_slab_free_mempool(void *ptr, unsigned long ip)
+void __kasan_mempool_poison_object(void *ptr, unsigned long ip)
 {
 	struct folio *folio;
 
diff --git a/mm/mempool.c b/mm/mempool.c
index b3d2084fd989..7e1c729f292b 100644
--- a/mm/mempool.c
+++ b/mm/mempool.c
@@ -107,7 +107,7 @@ static inline void poison_element(mempool_t *pool, void *element)
 static __always_inline void kasan_poison_element(mempool_t *pool, void *element)
 {
 	if (pool->alloc == mempool_alloc_slab || pool->alloc == mempool_kmalloc)
-		kasan_slab_free_mempool(element);
+		kasan_mempool_poison_object(element);
 	else if (pool->alloc == mempool_alloc_pages)
 		kasan_poison_pages(element, (unsigned long)pool->pool_data,
 				   false);
-- 
cgit v1.2.3


From 9b94fe91099cbf05606151ef05bea9632666f5d5 Mon Sep 17 00:00:00 2001
From: Andrey Konovalov <andreyknvl@google.com>
Date: Tue, 19 Dec 2023 23:28:46 +0100
Subject: kasan: move kasan_mempool_poison_object

Move kasan_mempool_poison_object after all slab-related KASAN hooks.

This is a preparatory change for the following patches in this series.

No functional changes.

Link: https://lkml.kernel.org/r/23ea215409f43c13cdf9ecc454501a264c107d67.1703024586.git.andreyknvl@google.com
Signed-off-by: Andrey Konovalov <andreyknvl@google.com>
Cc: Alexander Lobakin <alobakin@pm.me>
Cc: Alexander Potapenko <glider@google.com>
Cc: Andrey Ryabinin <ryabinin.a.a@gmail.com>
Cc: Breno Leitao <leitao@debian.org>
Cc: Dmitry Vyukov <dvyukov@google.com>
Cc: Evgenii Stepanov <eugenis@google.com>
Cc: Marco Elver <elver@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/kasan.h | 16 ++++++++--------
 mm/kasan/common.c     | 46 +++++++++++++++++++++++-----------------------
 2 files changed, 31 insertions(+), 31 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/kasan.h b/include/linux/kasan.h
index 6310435f528b..0d1f925c136d 100644
--- a/include/linux/kasan.h
+++ b/include/linux/kasan.h
@@ -172,13 +172,6 @@ static __always_inline void kasan_kfree_large(void *ptr)
 		__kasan_kfree_large(ptr, _RET_IP_);
 }
 
-void __kasan_mempool_poison_object(void *ptr, unsigned long ip);
-static __always_inline void kasan_mempool_poison_object(void *ptr)
-{
-	if (kasan_enabled())
-		__kasan_mempool_poison_object(ptr, _RET_IP_);
-}
-
 void * __must_check __kasan_slab_alloc(struct kmem_cache *s,
 				       void *object, gfp_t flags, bool init);
 static __always_inline void * __must_check kasan_slab_alloc(
@@ -219,6 +212,13 @@ static __always_inline void * __must_check kasan_krealloc(const void *object,
 	return (void *)object;
 }
 
+void __kasan_mempool_poison_object(void *ptr, unsigned long ip);
+static __always_inline void kasan_mempool_poison_object(void *ptr)
+{
+	if (kasan_enabled())
+		__kasan_mempool_poison_object(ptr, _RET_IP_);
+}
+
 /*
  * Unlike kasan_check_read/write(), kasan_check_byte() is performed even for
  * the hardware tag-based mode that doesn't rely on compiler instrumentation.
@@ -256,7 +256,6 @@ static inline bool kasan_slab_free(struct kmem_cache *s, void *object, bool init
 	return false;
 }
 static inline void kasan_kfree_large(void *ptr) {}
-static inline void kasan_mempool_poison_object(void *ptr) {}
 static inline void *kasan_slab_alloc(struct kmem_cache *s, void *object,
 				   gfp_t flags, bool init)
 {
@@ -276,6 +275,7 @@ static inline void *kasan_krealloc(const void *object, size_t new_size,
 {
 	return (void *)object;
 }
+static inline void kasan_mempool_poison_object(void *ptr) {}
 static inline bool kasan_check_byte(const void *address)
 {
 	return true;
diff --git a/mm/kasan/common.c b/mm/kasan/common.c
index e0394d0ee7f1..fc7f711607e1 100644
--- a/mm/kasan/common.c
+++ b/mm/kasan/common.c
@@ -282,29 +282,6 @@ void __kasan_kfree_large(void *ptr, unsigned long ip)
 	____kasan_kfree_large(ptr, ip);
 }
 
-void __kasan_mempool_poison_object(void *ptr, unsigned long ip)
-{
-	struct folio *folio;
-
-	folio = virt_to_folio(ptr);
-
-	/*
-	 * Even though this function is only called for kmem_cache_alloc and
-	 * kmalloc backed mempool allocations, those allocations can still be
-	 * !PageSlab() when the size provided to kmalloc is larger than
-	 * KMALLOC_MAX_SIZE, and kmalloc falls back onto page_alloc.
-	 */
-	if (unlikely(!folio_test_slab(folio))) {
-		if (____kasan_kfree_large(ptr, ip))
-			return;
-		kasan_poison(ptr, folio_size(folio), KASAN_PAGE_FREE, false);
-	} else {
-		struct slab *slab = folio_slab(folio);
-
-		____kasan_slab_free(slab->slab_cache, ptr, ip, false, false);
-	}
-}
-
 void * __must_check __kasan_slab_alloc(struct kmem_cache *cache,
 					void *object, gfp_t flags, bool init)
 {
@@ -452,6 +429,29 @@ void * __must_check __kasan_krealloc(const void *object, size_t size, gfp_t flag
 		return ____kasan_kmalloc(slab->slab_cache, object, size, flags);
 }
 
+void __kasan_mempool_poison_object(void *ptr, unsigned long ip)
+{
+	struct folio *folio;
+
+	folio = virt_to_folio(ptr);
+
+	/*
+	 * Even though this function is only called for kmem_cache_alloc and
+	 * kmalloc backed mempool allocations, those allocations can still be
+	 * !PageSlab() when the size provided to kmalloc is larger than
+	 * KMALLOC_MAX_SIZE, and kmalloc falls back onto page_alloc.
+	 */
+	if (unlikely(!folio_test_slab(folio))) {
+		if (____kasan_kfree_large(ptr, ip))
+			return;
+		kasan_poison(ptr, folio_size(folio), KASAN_PAGE_FREE, false);
+	} else {
+		struct slab *slab = folio_slab(folio);
+
+		____kasan_slab_free(slab->slab_cache, ptr, ip, false, false);
+	}
+}
+
 bool __kasan_check_byte(const void *address, unsigned long ip)
 {
 	if (!kasan_byte_accessible(address)) {
-- 
cgit v1.2.3


From 1bb843048d00050678c392dab87a15c8b756df6f Mon Sep 17 00:00:00 2001
From: Andrey Konovalov <andreyknvl@google.com>
Date: Tue, 19 Dec 2023 23:28:47 +0100
Subject: kasan: document kasan_mempool_poison_object

Add documentation comment for kasan_mempool_poison_object.

Link: https://lkml.kernel.org/r/af33ba8cabfa1ad731fe23a3f874bfc8d3b7fed4.1703024586.git.andreyknvl@google.com
Signed-off-by: Andrey Konovalov <andreyknvl@google.com>
Cc: Alexander Lobakin <alobakin@pm.me>
Cc: Alexander Potapenko <glider@google.com>
Cc: Andrey Ryabinin <ryabinin.a.a@gmail.com>
Cc: Breno Leitao <leitao@debian.org>
Cc: Dmitry Vyukov <dvyukov@google.com>
Cc: Evgenii Stepanov <eugenis@google.com>
Cc: Marco Elver <elver@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/kasan.h | 18 ++++++++++++++++++
 1 file changed, 18 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/kasan.h b/include/linux/kasan.h
index 0d1f925c136d..bbf6e2fa4ffd 100644
--- a/include/linux/kasan.h
+++ b/include/linux/kasan.h
@@ -213,6 +213,24 @@ static __always_inline void * __must_check kasan_krealloc(const void *object,
 }
 
 void __kasan_mempool_poison_object(void *ptr, unsigned long ip);
+/**
+ * kasan_mempool_poison_object - Check and poison a mempool slab allocation.
+ * @ptr: Pointer to the slab allocation.
+ *
+ * This function is intended for kernel subsystems that cache slab allocations
+ * to reuse them instead of freeing them back to the slab allocator (e.g.
+ * mempool).
+ *
+ * This function poisons a slab allocation without initializing its memory and
+ * without putting it into the quarantine (for the Generic mode).
+ *
+ * This function also performs checks to detect double-free and invalid-free
+ * bugs and reports them.
+ *
+ * This function operates on all slab allocations including large kmalloc
+ * allocations (the ones returned by kmalloc_large() or by kmalloc() with the
+ * size > KMALLOC_MAX_SIZE).
+ */
 static __always_inline void kasan_mempool_poison_object(void *ptr)
 {
 	if (kasan_enabled())
-- 
cgit v1.2.3


From 2e7c954c11af96aa1e0566a706f22152ef91d759 Mon Sep 17 00:00:00 2001
From: Andrey Konovalov <andreyknvl@google.com>
Date: Tue, 19 Dec 2023 23:28:48 +0100
Subject: kasan: add return value for kasan_mempool_poison_object

Add a return value for kasan_mempool_poison_object that lets the caller
know whether the allocation is affected by a double-free or an
invalid-free bug.  The caller can use this return value to stop operating
on the object.

Also introduce a check_page_allocation helper function to improve the code
readability.

Link: https://lkml.kernel.org/r/618af65273875fb9f56954285443279b15f1fcd9.1703024586.git.andreyknvl@google.com
Signed-off-by: Andrey Konovalov <andreyknvl@google.com>
Cc: Alexander Lobakin <alobakin@pm.me>
Cc: Alexander Potapenko <glider@google.com>
Cc: Andrey Ryabinin <ryabinin.a.a@gmail.com>
Cc: Breno Leitao <leitao@debian.org>
Cc: Dmitry Vyukov <dvyukov@google.com>
Cc: Evgenii Stepanov <eugenis@google.com>
Cc: Marco Elver <elver@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/kasan.h | 17 ++++++++++++-----
 mm/kasan/common.c     | 21 ++++++++++-----------
 2 files changed, 22 insertions(+), 16 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/kasan.h b/include/linux/kasan.h
index bbf6e2fa4ffd..33387e254caa 100644
--- a/include/linux/kasan.h
+++ b/include/linux/kasan.h
@@ -212,7 +212,7 @@ static __always_inline void * __must_check kasan_krealloc(const void *object,
 	return (void *)object;
 }
 
-void __kasan_mempool_poison_object(void *ptr, unsigned long ip);
+bool __kasan_mempool_poison_object(void *ptr, unsigned long ip);
 /**
  * kasan_mempool_poison_object - Check and poison a mempool slab allocation.
  * @ptr: Pointer to the slab allocation.
@@ -225,16 +225,20 @@ void __kasan_mempool_poison_object(void *ptr, unsigned long ip);
  * without putting it into the quarantine (for the Generic mode).
  *
  * This function also performs checks to detect double-free and invalid-free
- * bugs and reports them.
+ * bugs and reports them. The caller can use the return value of this function
+ * to find out if the allocation is buggy.
  *
  * This function operates on all slab allocations including large kmalloc
  * allocations (the ones returned by kmalloc_large() or by kmalloc() with the
  * size > KMALLOC_MAX_SIZE).
+ *
+ * Return: true if the allocation can be safely reused; false otherwise.
  */
-static __always_inline void kasan_mempool_poison_object(void *ptr)
+static __always_inline bool kasan_mempool_poison_object(void *ptr)
 {
 	if (kasan_enabled())
-		__kasan_mempool_poison_object(ptr, _RET_IP_);
+		return __kasan_mempool_poison_object(ptr, _RET_IP_);
+	return true;
 }
 
 /*
@@ -293,7 +297,10 @@ static inline void *kasan_krealloc(const void *object, size_t new_size,
 {
 	return (void *)object;
 }
-static inline void kasan_mempool_poison_object(void *ptr) {}
+static inline bool kasan_mempool_poison_object(void *ptr)
+{
+	return true;
+}
 static inline bool kasan_check_byte(const void *address)
 {
 	return true;
diff --git a/mm/kasan/common.c b/mm/kasan/common.c
index fc7f711607e1..2b4869de4985 100644
--- a/mm/kasan/common.c
+++ b/mm/kasan/common.c
@@ -254,7 +254,7 @@ bool __kasan_slab_free(struct kmem_cache *cache, void *object,
 	return ____kasan_slab_free(cache, object, ip, true, init);
 }
 
-static inline bool ____kasan_kfree_large(void *ptr, unsigned long ip)
+static inline bool check_page_allocation(void *ptr, unsigned long ip)
 {
 	if (!kasan_arch_is_ready())
 		return false;
@@ -269,17 +269,14 @@ static inline bool ____kasan_kfree_large(void *ptr, unsigned long ip)
 		return true;
 	}
 
-	/*
-	 * The object will be poisoned by kasan_poison_pages() or
-	 * kasan_mempool_poison_object().
-	 */
-
 	return false;
 }
 
 void __kasan_kfree_large(void *ptr, unsigned long ip)
 {
-	____kasan_kfree_large(ptr, ip);
+	check_page_allocation(ptr, ip);
+
+	/* The object will be poisoned by kasan_poison_pages(). */
 }
 
 void * __must_check __kasan_slab_alloc(struct kmem_cache *cache,
@@ -429,7 +426,7 @@ void * __must_check __kasan_krealloc(const void *object, size_t size, gfp_t flag
 		return ____kasan_kmalloc(slab->slab_cache, object, size, flags);
 }
 
-void __kasan_mempool_poison_object(void *ptr, unsigned long ip)
+bool __kasan_mempool_poison_object(void *ptr, unsigned long ip)
 {
 	struct folio *folio;
 
@@ -442,13 +439,15 @@ void __kasan_mempool_poison_object(void *ptr, unsigned long ip)
 	 * KMALLOC_MAX_SIZE, and kmalloc falls back onto page_alloc.
 	 */
 	if (unlikely(!folio_test_slab(folio))) {
-		if (____kasan_kfree_large(ptr, ip))
-			return;
+		if (check_page_allocation(ptr, ip))
+			return false;
 		kasan_poison(ptr, folio_size(folio), KASAN_PAGE_FREE, false);
+		return true;
 	} else {
 		struct slab *slab = folio_slab(folio);
 
-		____kasan_slab_free(slab->slab_cache, ptr, ip, false, false);
+		return !____kasan_slab_free(slab->slab_cache, ptr, ip,
+						false, false);
 	}
 }
 
-- 
cgit v1.2.3


From 1956832753735b1c399b86b2c66cb7c317dc9f31 Mon Sep 17 00:00:00 2001
From: Andrey Konovalov <andreyknvl@google.com>
Date: Tue, 19 Dec 2023 23:28:49 +0100
Subject: kasan: introduce kasan_mempool_unpoison_object

Introduce and document a kasan_mempool_unpoison_object hook.

This hook serves as a replacement for the generic kasan_unpoison_range
that the mempool code relies on right now.  mempool will be updated to use
the new hook in one of the following patches.

For now, define the new hook to be identical to kasan_unpoison_range.  One
of the following patches will update it to add stack trace collection.

Link: https://lkml.kernel.org/r/dae25f0e18ed8fd50efe509c5b71a0592de5c18d.1703024586.git.andreyknvl@google.com
Signed-off-by: Andrey Konovalov <andreyknvl@google.com>
Cc: Alexander Lobakin <alobakin@pm.me>
Cc: Alexander Potapenko <glider@google.com>
Cc: Andrey Ryabinin <ryabinin.a.a@gmail.com>
Cc: Breno Leitao <leitao@debian.org>
Cc: Dmitry Vyukov <dvyukov@google.com>
Cc: Evgenii Stepanov <eugenis@google.com>
Cc: Marco Elver <elver@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/kasan.h | 31 +++++++++++++++++++++++++++++++
 mm/kasan/common.c     |  5 +++++
 2 files changed, 36 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/kasan.h b/include/linux/kasan.h
index 33387e254caa..c5fe303bc1c2 100644
--- a/include/linux/kasan.h
+++ b/include/linux/kasan.h
@@ -228,6 +228,9 @@ bool __kasan_mempool_poison_object(void *ptr, unsigned long ip);
  * bugs and reports them. The caller can use the return value of this function
  * to find out if the allocation is buggy.
  *
+ * Before the poisoned allocation can be reused, it must be unpoisoned via
+ * kasan_mempool_unpoison_object().
+ *
  * This function operates on all slab allocations including large kmalloc
  * allocations (the ones returned by kmalloc_large() or by kmalloc() with the
  * size > KMALLOC_MAX_SIZE).
@@ -241,6 +244,32 @@ static __always_inline bool kasan_mempool_poison_object(void *ptr)
 	return true;
 }
 
+void __kasan_mempool_unpoison_object(void *ptr, size_t size, unsigned long ip);
+/**
+ * kasan_mempool_unpoison_object - Unpoison a mempool slab allocation.
+ * @ptr: Pointer to the slab allocation.
+ * @size: Size to be unpoisoned.
+ *
+ * This function is intended for kernel subsystems that cache slab allocations
+ * to reuse them instead of freeing them back to the slab allocator (e.g.
+ * mempool).
+ *
+ * This function unpoisons a slab allocation that was previously poisoned via
+ * kasan_mempool_poison_object() without initializing its memory. For the
+ * tag-based modes, this function does not assign a new tag to the allocation
+ * and instead restores the original tags based on the pointer value.
+ *
+ * This function operates on all slab allocations including large kmalloc
+ * allocations (the ones returned by kmalloc_large() or by kmalloc() with the
+ * size > KMALLOC_MAX_SIZE).
+ */
+static __always_inline void kasan_mempool_unpoison_object(void *ptr,
+							  size_t size)
+{
+	if (kasan_enabled())
+		__kasan_mempool_unpoison_object(ptr, size, _RET_IP_);
+}
+
 /*
  * Unlike kasan_check_read/write(), kasan_check_byte() is performed even for
  * the hardware tag-based mode that doesn't rely on compiler instrumentation.
@@ -301,6 +330,8 @@ static inline bool kasan_mempool_poison_object(void *ptr)
 {
 	return true;
 }
+static inline void kasan_mempool_unpoison_object(void *ptr, size_t size) {}
+
 static inline bool kasan_check_byte(const void *address)
 {
 	return true;
diff --git a/mm/kasan/common.c b/mm/kasan/common.c
index 2b4869de4985..4b85d35bb8ab 100644
--- a/mm/kasan/common.c
+++ b/mm/kasan/common.c
@@ -451,6 +451,11 @@ bool __kasan_mempool_poison_object(void *ptr, unsigned long ip)
 	}
 }
 
+void __kasan_mempool_unpoison_object(void *ptr, size_t size, unsigned long ip)
+{
+	kasan_unpoison(ptr, size, false);
+}
+
 bool __kasan_check_byte(const void *address, unsigned long ip)
 {
 	if (!kasan_byte_accessible(address)) {
-- 
cgit v1.2.3


From f129c31039283df884913142b0f3797d64d3a9d6 Mon Sep 17 00:00:00 2001
From: Andrey Konovalov <andreyknvl@google.com>
Date: Tue, 19 Dec 2023 23:28:50 +0100
Subject: kasan: introduce kasan_mempool_poison_pages

Introduce and document a kasan_mempool_poison_pages hook to be used by the
mempool code instead of kasan_poison_pages.

Compated to kasan_poison_pages, the new hook:

1. For the tag-based modes, skips checking and poisoning allocations that
   were not tagged due to sampling.

2. Checks for double-free and invalid-free bugs.

In the future, kasan_poison_pages can also be updated to handle #2, but
this is out-of-scope of this series.

Link: https://lkml.kernel.org/r/88dc7340cce28249abf789f6e0c792c317df9ba5.1703024586.git.andreyknvl@google.com
Signed-off-by: Andrey Konovalov <andreyknvl@google.com>
Cc: Alexander Lobakin <alobakin@pm.me>
Cc: Alexander Potapenko <glider@google.com>
Cc: Andrey Ryabinin <ryabinin.a.a@gmail.com>
Cc: Breno Leitao <leitao@debian.org>
Cc: Dmitry Vyukov <dvyukov@google.com>
Cc: Evgenii Stepanov <eugenis@google.com>
Cc: Marco Elver <elver@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/kasan.h | 27 +++++++++++++++++++++++++++
 mm/kasan/common.c     | 23 +++++++++++++++++++++++
 2 files changed, 50 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/kasan.h b/include/linux/kasan.h
index c5fe303bc1c2..de2a695ad34d 100644
--- a/include/linux/kasan.h
+++ b/include/linux/kasan.h
@@ -212,6 +212,29 @@ static __always_inline void * __must_check kasan_krealloc(const void *object,
 	return (void *)object;
 }
 
+bool __kasan_mempool_poison_pages(struct page *page, unsigned int order,
+				  unsigned long ip);
+/**
+ * kasan_mempool_poison_pages - Check and poison a mempool page allocation.
+ * @page: Pointer to the page allocation.
+ * @order: Order of the allocation.
+ *
+ * This function is intended for kernel subsystems that cache page allocations
+ * to reuse them instead of freeing them back to page_alloc (e.g. mempool).
+ *
+ * This function is similar to kasan_mempool_poison_object() but operates on
+ * page allocations.
+ *
+ * Return: true if the allocation can be safely reused; false otherwise.
+ */
+static __always_inline bool kasan_mempool_poison_pages(struct page *page,
+						       unsigned int order)
+{
+	if (kasan_enabled())
+		return __kasan_mempool_poison_pages(page, order, _RET_IP_);
+	return true;
+}
+
 bool __kasan_mempool_poison_object(void *ptr, unsigned long ip);
 /**
  * kasan_mempool_poison_object - Check and poison a mempool slab allocation.
@@ -326,6 +349,10 @@ static inline void *kasan_krealloc(const void *object, size_t new_size,
 {
 	return (void *)object;
 }
+static inline bool kasan_mempool_poison_pages(struct page *page, unsigned int order)
+{
+	return true;
+}
 static inline bool kasan_mempool_poison_object(void *ptr)
 {
 	return true;
diff --git a/mm/kasan/common.c b/mm/kasan/common.c
index 4b85d35bb8ab..b416f4c265a4 100644
--- a/mm/kasan/common.c
+++ b/mm/kasan/common.c
@@ -426,6 +426,29 @@ void * __must_check __kasan_krealloc(const void *object, size_t size, gfp_t flag
 		return ____kasan_kmalloc(slab->slab_cache, object, size, flags);
 }
 
+bool __kasan_mempool_poison_pages(struct page *page, unsigned int order,
+				  unsigned long ip)
+{
+	unsigned long *ptr;
+
+	if (unlikely(PageHighMem(page)))
+		return true;
+
+	/* Bail out if allocation was excluded due to sampling. */
+	if (!IS_ENABLED(CONFIG_KASAN_GENERIC) &&
+	    page_kasan_tag(page) == KASAN_TAG_KERNEL)
+		return true;
+
+	ptr = page_address(page);
+
+	if (check_page_allocation(ptr, ip))
+		return false;
+
+	kasan_poison(ptr, PAGE_SIZE << order, KASAN_PAGE_FREE, false);
+
+	return true;
+}
+
 bool __kasan_mempool_poison_object(void *ptr, unsigned long ip)
 {
 	struct folio *folio;
-- 
cgit v1.2.3


From 9f41c59ae3163690868a32bd77e9e33c3bab555e Mon Sep 17 00:00:00 2001
From: Andrey Konovalov <andreyknvl@google.com>
Date: Tue, 19 Dec 2023 23:28:51 +0100
Subject: kasan: introduce kasan_mempool_unpoison_pages

Introduce and document a new kasan_mempool_unpoison_pages hook to be used
by the mempool code instead of kasan_unpoison_pages.

This hook is not functionally different from kasan_unpoison_pages, but
using it improves the mempool code readability.

Link: https://lkml.kernel.org/r/239bd9af6176f2cc59f5c25893eb36143184daff.1703024586.git.andreyknvl@google.com
Signed-off-by: Andrey Konovalov <andreyknvl@google.com>
Cc: Alexander Lobakin <alobakin@pm.me>
Cc: Alexander Potapenko <glider@google.com>
Cc: Andrey Ryabinin <ryabinin.a.a@gmail.com>
Cc: Breno Leitao <leitao@debian.org>
Cc: Dmitry Vyukov <dvyukov@google.com>
Cc: Evgenii Stepanov <eugenis@google.com>
Cc: Marco Elver <elver@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/kasan.h | 25 +++++++++++++++++++++++++
 mm/kasan/common.c     |  6 ++++++
 2 files changed, 31 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/kasan.h b/include/linux/kasan.h
index de2a695ad34d..f8ebde384bd7 100644
--- a/include/linux/kasan.h
+++ b/include/linux/kasan.h
@@ -225,6 +225,9 @@ bool __kasan_mempool_poison_pages(struct page *page, unsigned int order,
  * This function is similar to kasan_mempool_poison_object() but operates on
  * page allocations.
  *
+ * Before the poisoned allocation can be reused, it must be unpoisoned via
+ * kasan_mempool_unpoison_pages().
+ *
  * Return: true if the allocation can be safely reused; false otherwise.
  */
 static __always_inline bool kasan_mempool_poison_pages(struct page *page,
@@ -235,6 +238,27 @@ static __always_inline bool kasan_mempool_poison_pages(struct page *page,
 	return true;
 }
 
+void __kasan_mempool_unpoison_pages(struct page *page, unsigned int order,
+				    unsigned long ip);
+/**
+ * kasan_mempool_unpoison_pages - Unpoison a mempool page allocation.
+ * @page: Pointer to the page allocation.
+ * @order: Order of the allocation.
+ *
+ * This function is intended for kernel subsystems that cache page allocations
+ * to reuse them instead of freeing them back to page_alloc (e.g. mempool).
+ *
+ * This function unpoisons a page allocation that was previously poisoned by
+ * kasan_mempool_poison_pages() without zeroing the allocation's memory. For
+ * the tag-based modes, this function assigns a new tag to the allocation.
+ */
+static __always_inline void kasan_mempool_unpoison_pages(struct page *page,
+							 unsigned int order)
+{
+	if (kasan_enabled())
+		__kasan_mempool_unpoison_pages(page, order, _RET_IP_);
+}
+
 bool __kasan_mempool_poison_object(void *ptr, unsigned long ip);
 /**
  * kasan_mempool_poison_object - Check and poison a mempool slab allocation.
@@ -353,6 +377,7 @@ static inline bool kasan_mempool_poison_pages(struct page *page, unsigned int or
 {
 	return true;
 }
+static inline void kasan_mempool_unpoison_pages(struct page *page, unsigned int order) {}
 static inline bool kasan_mempool_poison_object(void *ptr)
 {
 	return true;
diff --git a/mm/kasan/common.c b/mm/kasan/common.c
index b416f4c265a4..7ebc001d0fcd 100644
--- a/mm/kasan/common.c
+++ b/mm/kasan/common.c
@@ -449,6 +449,12 @@ bool __kasan_mempool_poison_pages(struct page *page, unsigned int order,
 	return true;
 }
 
+void __kasan_mempool_unpoison_pages(struct page *page, unsigned int order,
+				    unsigned long ip)
+{
+	__kasan_unpoison_pages(page, order, false);
+}
+
 bool __kasan_mempool_poison_object(void *ptr, unsigned long ip)
 {
 	struct folio *folio;
-- 
cgit v1.2.3


From b556a462eb8df6b6836c318d23f43409c40a7c7e Mon Sep 17 00:00:00 2001
From: Andrey Konovalov <andreyknvl@google.com>
Date: Tue, 19 Dec 2023 23:28:53 +0100
Subject: kasan: save free stack traces for slab mempools

Make kasan_mempool_poison_object save free stack traces for slab and
kmalloc mempools when the object is freed into the mempool.

Also simplify and rename ____kasan_slab_free to poison_slab_object and do
a few other reability changes.

Link: https://lkml.kernel.org/r/413a7c7c3344fb56809853339ffaabc9e4905e94.1703024586.git.andreyknvl@google.com
Signed-off-by: Andrey Konovalov <andreyknvl@google.com>
Cc: Alexander Lobakin <alobakin@pm.me>
Cc: Alexander Potapenko <glider@google.com>
Cc: Andrey Ryabinin <ryabinin.a.a@gmail.com>
Cc: Breno Leitao <leitao@debian.org>
Cc: Dmitry Vyukov <dvyukov@google.com>
Cc: Evgenii Stepanov <eugenis@google.com>
Cc: Marco Elver <elver@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/kasan.h |  5 +++--
 mm/kasan/common.c     | 20 +++++++++-----------
 2 files changed, 12 insertions(+), 13 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/kasan.h b/include/linux/kasan.h
index f8ebde384bd7..e636a00e26ba 100644
--- a/include/linux/kasan.h
+++ b/include/linux/kasan.h
@@ -268,8 +268,9 @@ bool __kasan_mempool_poison_object(void *ptr, unsigned long ip);
  * to reuse them instead of freeing them back to the slab allocator (e.g.
  * mempool).
  *
- * This function poisons a slab allocation without initializing its memory and
- * without putting it into the quarantine (for the Generic mode).
+ * This function poisons a slab allocation and saves a free stack trace for it
+ * without initializing the allocation's memory and without putting it into the
+ * quarantine (for the Generic mode).
  *
  * This function also performs checks to detect double-free and invalid-free
  * bugs and reports them. The caller can use the return value of this function
diff --git a/mm/kasan/common.c b/mm/kasan/common.c
index 3f4a1ed69e03..59146886e57d 100644
--- a/mm/kasan/common.c
+++ b/mm/kasan/common.c
@@ -207,8 +207,8 @@ void * __must_check __kasan_init_slab_obj(struct kmem_cache *cache,
 	return (void *)object;
 }
 
-static inline bool ____kasan_slab_free(struct kmem_cache *cache, void *object,
-				unsigned long ip, bool quarantine, bool init)
+static inline bool poison_slab_object(struct kmem_cache *cache, void *object,
+				      unsigned long ip, bool init)
 {
 	void *tagged_object;
 
@@ -221,13 +221,12 @@ static inline bool ____kasan_slab_free(struct kmem_cache *cache, void *object,
 	if (is_kfence_address(object))
 		return false;
 
-	if (unlikely(nearest_obj(cache, virt_to_slab(object), object) !=
-	    object)) {
+	if (unlikely(nearest_obj(cache, virt_to_slab(object), object) != object)) {
 		kasan_report_invalid_free(tagged_object, ip, KASAN_REPORT_INVALID_FREE);
 		return true;
 	}
 
-	/* RCU slabs could be legally used after free within the RCU period */
+	/* RCU slabs could be legally used after free within the RCU period. */
 	if (unlikely(cache->flags & SLAB_TYPESAFE_BY_RCU))
 		return false;
 
@@ -239,19 +238,18 @@ static inline bool ____kasan_slab_free(struct kmem_cache *cache, void *object,
 	kasan_poison(object, round_up(cache->object_size, KASAN_GRANULE_SIZE),
 			KASAN_SLAB_FREE, init);
 
-	if ((IS_ENABLED(CONFIG_KASAN_GENERIC) && !quarantine))
-		return false;
-
 	if (kasan_stack_collection_enabled())
 		kasan_save_free_info(cache, tagged_object);
 
-	return kasan_quarantine_put(cache, object);
+	return false;
 }
 
 bool __kasan_slab_free(struct kmem_cache *cache, void *object,
 				unsigned long ip, bool init)
 {
-	return ____kasan_slab_free(cache, object, ip, true, init);
+	bool buggy_object = poison_slab_object(cache, object, ip, init);
+
+	return buggy_object ? true : kasan_quarantine_put(cache, object);
 }
 
 static inline bool check_page_allocation(void *ptr, unsigned long ip)
@@ -472,7 +470,7 @@ bool __kasan_mempool_poison_object(void *ptr, unsigned long ip)
 	}
 
 	slab = folio_slab(folio);
-	return !____kasan_slab_free(slab->slab_cache, ptr, ip, false, false);
+	return !poison_slab_object(slab->slab_cache, ptr, ip, false);
 }
 
 void __kasan_mempool_unpoison_object(void *ptr, size_t size, unsigned long ip)
-- 
cgit v1.2.3


From 29d7355a9d05de9a6e38cc4d1146fb96c43853fb Mon Sep 17 00:00:00 2001
From: Andrey Konovalov <andreyknvl@google.com>
Date: Tue, 19 Dec 2023 23:28:56 +0100
Subject: kasan: save alloc stack traces for mempool

Update kasan_mempool_unpoison_object to properly poison the redzone and
save alloc strack traces for kmalloc and slab pools.

As a part of this change, split out and use a unpoison_slab_object helper
function from __kasan_slab_alloc.

[nathan@kernel.org: mark unpoison_slab_object() as static]
  Link: https://lkml.kernel.org/r/20231221180042.104694-1-andrey.konovalov@linux.dev
Link: https://lkml.kernel.org/r/05ad235da8347cfe14d496d01b2aaf074b4f607c.1703024586.git.andreyknvl@google.com
Signed-off-by: Andrey Konovalov <andreyknvl@google.com>
Signed-off-by: Nathan Chancellor <nathan@kernel.org>
Cc: Alexander Lobakin <alobakin@pm.me>
Cc: Alexander Potapenko <glider@google.com>
Cc: Andrey Ryabinin <ryabinin.a.a@gmail.com>
Cc: Breno Leitao <leitao@debian.org>
Cc: Dmitry Vyukov <dvyukov@google.com>
Cc: Evgenii Stepanov <eugenis@google.com>
Cc: Marco Elver <elver@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/kasan.h |  7 ++++---
 mm/kasan/common.c     | 50 ++++++++++++++++++++++++++++++++++++++++----------
 2 files changed, 44 insertions(+), 13 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/kasan.h b/include/linux/kasan.h
index e636a00e26ba..7392c5d89b92 100644
--- a/include/linux/kasan.h
+++ b/include/linux/kasan.h
@@ -303,9 +303,10 @@ void __kasan_mempool_unpoison_object(void *ptr, size_t size, unsigned long ip);
  * mempool).
  *
  * This function unpoisons a slab allocation that was previously poisoned via
- * kasan_mempool_poison_object() without initializing its memory. For the
- * tag-based modes, this function does not assign a new tag to the allocation
- * and instead restores the original tags based on the pointer value.
+ * kasan_mempool_poison_object() and saves an alloc stack trace for it without
+ * initializing the allocation's memory. For the tag-based modes, this function
+ * does not assign a new tag to the allocation and instead restores the
+ * original tags based on the pointer value.
  *
  * This function operates on all slab allocations including large kmalloc
  * allocations (the ones returned by kmalloc_large() or by kmalloc() with the
diff --git a/mm/kasan/common.c b/mm/kasan/common.c
index 962805bf5f62..bf16c2dfa8e7 100644
--- a/mm/kasan/common.c
+++ b/mm/kasan/common.c
@@ -277,6 +277,20 @@ void __kasan_kfree_large(void *ptr, unsigned long ip)
 	/* The object will be poisoned by kasan_poison_pages(). */
 }
 
+static inline void unpoison_slab_object(struct kmem_cache *cache, void *object,
+					gfp_t flags, bool init)
+{
+	/*
+	 * Unpoison the whole object. For kmalloc() allocations,
+	 * poison_kmalloc_redzone() will do precise poisoning.
+	 */
+	kasan_unpoison(object, cache->object_size, init);
+
+	/* Save alloc info (if possible) for non-kmalloc() allocations. */
+	if (kasan_stack_collection_enabled() && !is_kmalloc_cache(cache))
+		kasan_save_alloc_info(cache, object, flags);
+}
+
 void * __must_check __kasan_slab_alloc(struct kmem_cache *cache,
 					void *object, gfp_t flags, bool init)
 {
@@ -299,15 +313,8 @@ void * __must_check __kasan_slab_alloc(struct kmem_cache *cache,
 	tag = assign_tag(cache, object, false);
 	tagged_object = set_tag(object, tag);
 
-	/*
-	 * Unpoison the whole object.
-	 * For kmalloc() allocations, kasan_kmalloc() will do precise poisoning.
-	 */
-	kasan_unpoison(tagged_object, cache->object_size, init);
-
-	/* Save alloc info (if possible) for non-kmalloc() allocations. */
-	if (kasan_stack_collection_enabled() && !is_kmalloc_cache(cache))
-		kasan_save_alloc_info(cache, tagged_object, flags);
+	/* Unpoison the object and save alloc info for non-kmalloc() allocations. */
+	unpoison_slab_object(cache, tagged_object, flags, init);
 
 	return tagged_object;
 }
@@ -482,7 +489,30 @@ bool __kasan_mempool_poison_object(void *ptr, unsigned long ip)
 
 void __kasan_mempool_unpoison_object(void *ptr, size_t size, unsigned long ip)
 {
-	kasan_unpoison(ptr, size, false);
+	struct slab *slab;
+	gfp_t flags = 0; /* Might be executing under a lock. */
+
+	if (is_kfence_address(kasan_reset_tag(ptr)))
+		return;
+
+	slab = virt_to_slab(ptr);
+
+	/*
+	 * This function can be called for large kmalloc allocation that get
+	 * their memory from page_alloc.
+	 */
+	if (unlikely(!slab)) {
+		kasan_unpoison(ptr, size, false);
+		poison_kmalloc_large_redzone(ptr, size, flags);
+		return;
+	}
+
+	/* Unpoison the object and save alloc info for non-kmalloc() allocations. */
+	unpoison_slab_object(slab->slab_cache, ptr, size, flags);
+
+	/* Poison the redzone and save alloc info for kmalloc() allocations. */
+	if (is_kmalloc_cache(slab->slab_cache))
+		poison_kmalloc_redzone(slab->slab_cache, ptr, size, flags);
 }
 
 bool __kasan_check_byte(const void *address, unsigned long ip)
-- 
cgit v1.2.3


From 37dcc69ad17a008d2b720bdc39f070ef2a959430 Mon Sep 17 00:00:00 2001
From: Andrey Konovalov <andreyknvl@google.com>
Date: Tue, 19 Dec 2023 23:28:59 +0100
Subject: mempool: introduce mempool_use_prealloc_only

Introduce a new mempool_alloc_preallocated API that asks the mempool to
only use the elements preallocated during the mempool's creation when
allocating and to not attempt allocating new ones from the underlying
allocator.

This API is required to test the KASAN poisoning/unpoisoning functionality
in KASAN tests, but it might be also useful on its own.

Link: https://lkml.kernel.org/r/a14d809dbdfd04cc33bcacc632fee2abd6b83c00.1703024586.git.andreyknvl@google.com
Signed-off-by: Andrey Konovalov <andreyknvl@google.com>

Cc: Alexander Lobakin <alobakin@pm.me>
Cc: Alexander Potapenko <glider@google.com>
Cc: Andrey Ryabinin <ryabinin.a.a@gmail.com>
Cc: Breno Leitao <leitao@debian.org>
Cc: Dmitry Vyukov <dvyukov@google.com>
Cc: Evgenii Stepanov <eugenis@google.com>
Cc: Marco Elver <elver@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/mempool.h |  1 +
 mm/mempool.c            | 37 +++++++++++++++++++++++++++++++++++++
 2 files changed, 38 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/mempool.h b/include/linux/mempool.h
index 4aae6c06c5f2..7be1e32e6d42 100644
--- a/include/linux/mempool.h
+++ b/include/linux/mempool.h
@@ -51,6 +51,7 @@ extern mempool_t *mempool_create_node(int min_nr, mempool_alloc_t *alloc_fn,
 extern int mempool_resize(mempool_t *pool, int new_min_nr);
 extern void mempool_destroy(mempool_t *pool);
 extern void *mempool_alloc(mempool_t *pool, gfp_t gfp_mask) __malloc;
+extern void *mempool_alloc_preallocated(mempool_t *pool) __malloc;
 extern void mempool_free(void *element, mempool_t *pool);
 
 /*
diff --git a/mm/mempool.c b/mm/mempool.c
index 103dc4770cfb..cb7b4b56cec1 100644
--- a/mm/mempool.c
+++ b/mm/mempool.c
@@ -456,6 +456,43 @@ repeat_alloc:
 }
 EXPORT_SYMBOL(mempool_alloc);
 
+/**
+ * mempool_alloc_preallocated - allocate an element from preallocated elements
+ *                              belonging to a specific memory pool
+ * @pool:      pointer to the memory pool which was allocated via
+ *             mempool_create().
+ *
+ * This function is similar to mempool_alloc, but it only attempts allocating
+ * an element from the preallocated elements. It does not sleep and immediately
+ * returns if no preallocated elements are available.
+ *
+ * Return: pointer to the allocated element or %NULL if no elements are
+ * available.
+ */
+void *mempool_alloc_preallocated(mempool_t *pool)
+{
+	void *element;
+	unsigned long flags;
+
+	spin_lock_irqsave(&pool->lock, flags);
+	if (likely(pool->curr_nr)) {
+		element = remove_element(pool);
+		spin_unlock_irqrestore(&pool->lock, flags);
+		/* paired with rmb in mempool_free(), read comment there */
+		smp_wmb();
+		/*
+		 * Update the allocation stack trace as this is more useful
+		 * for debugging.
+		 */
+		kmemleak_update_trace(element);
+		return element;
+	}
+	spin_unlock_irqrestore(&pool->lock, flags);
+
+	return NULL;
+}
+EXPORT_SYMBOL(mempool_alloc_preallocated);
+
 /**
  * mempool_free - return an element to the pool.
  * @element:   pool element pointer.
-- 
cgit v1.2.3


From 1ce9a0523938f87dd8505233cc3445f8e2d8dcee Mon Sep 17 00:00:00 2001
From: Andrey Konovalov <andreyknvl@google.com>
Date: Tue, 19 Dec 2023 23:29:03 +0100
Subject: kasan: rename and document kasan_(un)poison_object_data

Rename kasan_unpoison_object_data to kasan_unpoison_new_object and add a
documentation comment.  Do the same for kasan_poison_object_data.

The new names and the comments should suggest the users that these hooks
are intended for internal use by the slab allocator.

The following patch will remove non-slab-internal uses of these hooks.

No functional changes.

[andreyknvl@google.com: update references to renamed functions in comments]
  Link: https://lkml.kernel.org/r/20231221180637.105098-1-andrey.konovalov@linux.dev
Link: https://lkml.kernel.org/r/eab156ebbd635f9635ef67d1a4271f716994e628.1703024586.git.andreyknvl@google.com
Signed-off-by: Andrey Konovalov <andreyknvl@google.com>
Reviewed-by: Marco Elver <elver@google.com>
Cc: Alexander Lobakin <alobakin@pm.me>
Cc: Alexander Potapenko <glider@google.com>
Cc: Andrey Ryabinin <ryabinin.a.a@gmail.com>
Cc: Breno Leitao <leitao@debian.org>
Cc: Dmitry Vyukov <dvyukov@google.com>
Cc: Evgenii Stepanov <eugenis@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/kasan.h | 35 +++++++++++++++++++++++++++--------
 mm/kasan/common.c     |  4 ++--
 mm/kasan/shadow.c     |  4 ++--
 mm/slab.c             | 10 ++++------
 mm/slub.c             |  4 ++--
 net/core/skbuff.c     |  8 ++++----
 6 files changed, 41 insertions(+), 24 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/kasan.h b/include/linux/kasan.h
index 7392c5d89b92..d49e3d4c099e 100644
--- a/include/linux/kasan.h
+++ b/include/linux/kasan.h
@@ -129,20 +129,39 @@ static __always_inline void kasan_poison_slab(struct slab *slab)
 		__kasan_poison_slab(slab);
 }
 
-void __kasan_unpoison_object_data(struct kmem_cache *cache, void *object);
-static __always_inline void kasan_unpoison_object_data(struct kmem_cache *cache,
+void __kasan_unpoison_new_object(struct kmem_cache *cache, void *object);
+/**
+ * kasan_unpoison_new_object - Temporarily unpoison a new slab object.
+ * @cache: Cache the object belong to.
+ * @object: Pointer to the object.
+ *
+ * This function is intended for the slab allocator's internal use. It
+ * temporarily unpoisons an object from a newly allocated slab without doing
+ * anything else. The object must later be repoisoned by
+ * kasan_poison_new_object().
+ */
+static __always_inline void kasan_unpoison_new_object(struct kmem_cache *cache,
 							void *object)
 {
 	if (kasan_enabled())
-		__kasan_unpoison_object_data(cache, object);
+		__kasan_unpoison_new_object(cache, object);
 }
 
-void __kasan_poison_object_data(struct kmem_cache *cache, void *object);
-static __always_inline void kasan_poison_object_data(struct kmem_cache *cache,
+void __kasan_poison_new_object(struct kmem_cache *cache, void *object);
+/**
+ * kasan_unpoison_new_object - Repoison a new slab object.
+ * @cache: Cache the object belong to.
+ * @object: Pointer to the object.
+ *
+ * This function is intended for the slab allocator's internal use. It
+ * repoisons an object that was previously unpoisoned by
+ * kasan_unpoison_new_object() without doing anything else.
+ */
+static __always_inline void kasan_poison_new_object(struct kmem_cache *cache,
 							void *object)
 {
 	if (kasan_enabled())
-		__kasan_poison_object_data(cache, object);
+		__kasan_poison_new_object(cache, object);
 }
 
 void * __must_check __kasan_init_slab_obj(struct kmem_cache *cache,
@@ -342,9 +361,9 @@ static inline bool kasan_unpoison_pages(struct page *page, unsigned int order,
 	return false;
 }
 static inline void kasan_poison_slab(struct slab *slab) {}
-static inline void kasan_unpoison_object_data(struct kmem_cache *cache,
+static inline void kasan_unpoison_new_object(struct kmem_cache *cache,
 					void *object) {}
-static inline void kasan_poison_object_data(struct kmem_cache *cache,
+static inline void kasan_poison_new_object(struct kmem_cache *cache,
 					void *object) {}
 static inline void *kasan_init_slab_obj(struct kmem_cache *cache,
 				const void *object)
diff --git a/mm/kasan/common.c b/mm/kasan/common.c
index bf16c2dfa8e7..f4255e807b74 100644
--- a/mm/kasan/common.c
+++ b/mm/kasan/common.c
@@ -143,12 +143,12 @@ void __kasan_poison_slab(struct slab *slab)
 		     KASAN_SLAB_REDZONE, false);
 }
 
-void __kasan_unpoison_object_data(struct kmem_cache *cache, void *object)
+void __kasan_unpoison_new_object(struct kmem_cache *cache, void *object)
 {
 	kasan_unpoison(object, cache->object_size, false);
 }
 
-void __kasan_poison_object_data(struct kmem_cache *cache, void *object)
+void __kasan_poison_new_object(struct kmem_cache *cache, void *object)
 {
 	kasan_poison(object, round_up(cache->object_size, KASAN_GRANULE_SIZE),
 			KASAN_SLAB_REDZONE, false);
diff --git a/mm/kasan/shadow.c b/mm/kasan/shadow.c
index d687f09a7ae3..0154d200be40 100644
--- a/mm/kasan/shadow.c
+++ b/mm/kasan/shadow.c
@@ -130,7 +130,7 @@ void kasan_poison(const void *addr, size_t size, u8 value, bool init)
 
 	/*
 	 * Perform shadow offset calculation based on untagged address, as
-	 * some of the callers (e.g. kasan_poison_object_data) pass tagged
+	 * some of the callers (e.g. kasan_poison_new_object) pass tagged
 	 * addresses to this function.
 	 */
 	addr = kasan_reset_tag(addr);
@@ -170,7 +170,7 @@ void kasan_unpoison(const void *addr, size_t size, bool init)
 
 	/*
 	 * Perform shadow offset calculation based on untagged address, as
-	 * some of the callers (e.g. kasan_unpoison_object_data) pass tagged
+	 * some of the callers (e.g. kasan_unpoison_new_object) pass tagged
 	 * addresses to this function.
 	 */
 	addr = kasan_reset_tag(addr);
diff --git a/mm/slab.c b/mm/slab.c
index 9ad3d0f2d1a5..773c79e153f3 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -2327,11 +2327,9 @@ static void cache_init_objs_debug(struct kmem_cache *cachep, struct slab *slab)
 		 * They must also be threaded.
 		 */
 		if (cachep->ctor && !(cachep->flags & SLAB_POISON)) {
-			kasan_unpoison_object_data(cachep,
-						   objp + obj_offset(cachep));
+			kasan_unpoison_new_object(cachep, objp + obj_offset(cachep));
 			cachep->ctor(objp + obj_offset(cachep));
-			kasan_poison_object_data(
-				cachep, objp + obj_offset(cachep));
+			kasan_poison_new_object(cachep, objp + obj_offset(cachep));
 		}
 
 		if (cachep->flags & SLAB_RED_ZONE) {
@@ -2472,9 +2470,9 @@ static void cache_init_objs(struct kmem_cache *cachep,
 
 		/* constructor could break poison info */
 		if (DEBUG == 0 && cachep->ctor) {
-			kasan_unpoison_object_data(cachep, objp);
+			kasan_unpoison_new_object(cachep, objp);
 			cachep->ctor(objp);
-			kasan_poison_object_data(cachep, objp);
+			kasan_poison_new_object(cachep, objp);
 		}
 
 		if (!shuffled)
diff --git a/mm/slub.c b/mm/slub.c
index 782bd8a6bd34..891742e5932a 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -1860,9 +1860,9 @@ static void *setup_object(struct kmem_cache *s, void *object)
 	setup_object_debug(s, object);
 	object = kasan_init_slab_obj(s, object);
 	if (unlikely(s->ctor)) {
-		kasan_unpoison_object_data(s, object);
+		kasan_unpoison_new_object(s, object);
 		s->ctor(object);
-		kasan_poison_object_data(s, object);
+		kasan_poison_new_object(s, object);
 	}
 	return object;
 }
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index b157efea5dea..63bb6526399d 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -337,7 +337,7 @@ static struct sk_buff *napi_skb_cache_get(void)
 	}
 
 	skb = nc->skb_cache[--nc->skb_count];
-	kasan_unpoison_object_data(skbuff_cache, skb);
+	kasan_unpoison_new_object(skbuff_cache, skb);
 
 	return skb;
 }
@@ -1309,13 +1309,13 @@ static void napi_skb_cache_put(struct sk_buff *skb)
 	struct napi_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache);
 	u32 i;
 
-	kasan_poison_object_data(skbuff_cache, skb);
+	kasan_poison_new_object(skbuff_cache, skb);
 	nc->skb_cache[nc->skb_count++] = skb;
 
 	if (unlikely(nc->skb_count == NAPI_SKB_CACHE_SIZE)) {
 		for (i = NAPI_SKB_CACHE_HALF; i < NAPI_SKB_CACHE_SIZE; i++)
-			kasan_unpoison_object_data(skbuff_cache,
-						   nc->skb_cache[i]);
+			kasan_unpoison_new_object(skbuff_cache,
+						  nc->skb_cache[i]);
 
 		kmem_cache_free_bulk(skbuff_cache, NAPI_SKB_CACHE_HALF,
 				     nc->skb_cache + NAPI_SKB_CACHE_HALF);
-- 
cgit v1.2.3


From 91349f541e7daa6cce15e01e7ffe4fd63731ead9 Mon Sep 17 00:00:00 2001
From: Andrey Konovalov <andreyknvl@google.com>
Date: Tue, 19 Dec 2023 22:19:53 +0100
Subject: lib/stackdepot: fix comment in include/linux/stackdepot.h

As stack traces can now be evicted from the stack depot, remove the
comment saying that they are never removed.

Link: https://lkml.kernel.org/r/0ebe712d91f8d302a8947d3c9e9123bc2b1b8440.1703020707.git.andreyknvl@google.com
Fixes: 108be8def46e ("lib/stackdepot: allow users to evict stack traces")
Signed-off-by: Andrey Konovalov <andreyknvl@google.com>
Reviewed-by: Marco Elver <elver@google.com>
Cc: Alexander Potapenko <glider@google.com>
Cc: Dmitry Vyukov <dvyukov@google.com>
Cc: Evgenii Stepanov <eugenis@google.com>
Cc: Tetsuo Handa <penguin-kernel@i-love.sakura.ne.jp>
Cc: Vlastimil Babka <vbabka@suse.cz>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/stackdepot.h | 2 --
 1 file changed, 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/stackdepot.h b/include/linux/stackdepot.h
index a6796f178913..adcbb8f23600 100644
--- a/include/linux/stackdepot.h
+++ b/include/linux/stackdepot.h
@@ -11,8 +11,6 @@
  * SLUB_DEBUG needs 256 bytes per object for that). Since allocation and free
  * stack traces often repeat, using stack depot allows to save about 100x space.
  *
- * Stack traces are never removed from the stack depot.
- *
  * Author: Alexander Potapenko <glider@google.com>
  * Copyright (C) 2016 Google, Inc.
  *
-- 
cgit v1.2.3


From 7fbb5e188248c50f737720825da1864ce42536d1 Mon Sep 17 00:00:00 2001
From: Fangrui Song <maskray@google.com>
Date: Tue, 19 Dec 2023 21:41:23 -0800
Subject: mm: remove VM_EXEC requirement for THP eligibility

Commit e6be37b2e7bd ("mm/huge_memory.c: add missing read-only THP checking
in transparent_hugepage_enabled()") introduced the VM_EXEC requirement,
which is not strictly needed.

lld's default --rosegment option and GNU ld's -z separate-code option
(default on Linux/x86 since binutils 2.31) create a read-only PT_LOAD
segment without the PF_X flag, which should be eligible for THP.

Certain architectures support medium and large code models, where .lrodata
may be placed in a separate read-only PT_LOAD segment, which should be
eligible for THP as well.

Link: https://lkml.kernel.org/r/20231220054123.1266001-1-maskray@google.com
Signed-off-by: Fangrui Song <maskray@google.com>
Acked-by: Yang Shi <shy828301@gmail.com>
Cc: Miaohe Lin <linmiaohe@huawei.com>
Cc: Song Liu <songliubraving@fb.com>
Cc: Matthew Wilcox <willy@infradead.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/huge_mm.h | 1 -
 1 file changed, 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
index fa7a38a30fc6..5adb86af35fc 100644
--- a/include/linux/huge_mm.h
+++ b/include/linux/huge_mm.h
@@ -206,7 +206,6 @@ static inline bool file_thp_enabled(struct vm_area_struct *vma)
 	inode = vma->vm_file->f_inode;
 
 	return (IS_ENABLED(CONFIG_READ_ONLY_THP_FOR_FS)) &&
-	       (vma->vm_flags & VM_EXEC) &&
 	       !inode_is_open_for_write(inode) && S_ISREG(inode->i_mode);
 }
 
-- 
cgit v1.2.3


From 5ec8e8ea8b7783fab150cf86404fc38cb4db8800 Mon Sep 17 00:00:00 2001
From: Charan Teja Kalla <quic_charante@quicinc.com>
Date: Fri, 13 Oct 2023 18:34:27 +0530
Subject: mm/sparsemem: fix race in accessing memory_section->usage

The below race is observed on a PFN which falls into the device memory
region with the system memory configuration where PFN's are such that
[ZONE_NORMAL ZONE_DEVICE ZONE_NORMAL].  Since normal zone start and end
pfn contains the device memory PFN's as well, the compaction triggered
will try on the device memory PFN's too though they end up in NOP(because
pfn_to_online_page() returns NULL for ZONE_DEVICE memory sections).  When
from other core, the section mappings are being removed for the
ZONE_DEVICE region, that the PFN in question belongs to, on which
compaction is currently being operated is resulting into the kernel crash
with CONFIG_SPASEMEM_VMEMAP enabled.  The crash logs can be seen at [1].

compact_zone()			memunmap_pages
-------------			---------------
__pageblock_pfn_to_page
   ......
 (a)pfn_valid():
     valid_section()//return true
			      (b)__remove_pages()->
				  sparse_remove_section()->
				    section_deactivate():
				    [Free the array ms->usage and set
				     ms->usage = NULL]
     pfn_section_valid()
     [Access ms->usage which
     is NULL]

NOTE: From the above it can be said that the race is reduced to between
the pfn_valid()/pfn_section_valid() and the section deactivate with
SPASEMEM_VMEMAP enabled.

The commit b943f045a9af("mm/sparse: fix kernel crash with
pfn_section_valid check") tried to address the same problem by clearing
the SECTION_HAS_MEM_MAP with the expectation of valid_section() returns
false thus ms->usage is not accessed.

Fix this issue by the below steps:

a) Clear SECTION_HAS_MEM_MAP before freeing the ->usage.

b) RCU protected read side critical section will either return NULL
   when SECTION_HAS_MEM_MAP is cleared or can successfully access ->usage.

c) Free the ->usage with kfree_rcu() and set ms->usage = NULL.  No
   attempt will be made to access ->usage after this as the
   SECTION_HAS_MEM_MAP is cleared thus valid_section() return false.

Thanks to David/Pavan for their inputs on this patch.

[1] https://lore.kernel.org/linux-mm/994410bb-89aa-d987-1f50-f514903c55aa@quicinc.com/

On Snapdragon SoC, with the mentioned memory configuration of PFN's as
[ZONE_NORMAL ZONE_DEVICE ZONE_NORMAL], we are able to see bunch of
issues daily while testing on a device farm.

For this particular issue below is the log.  Though the below log is
not directly pointing to the pfn_section_valid(){ ms->usage;}, when we
loaded this dump on T32 lauterbach tool, it is pointing.

[  540.578056] Unable to handle kernel NULL pointer dereference at
virtual address 0000000000000000
[  540.578068] Mem abort info:
[  540.578070]   ESR = 0x0000000096000005
[  540.578073]   EC = 0x25: DABT (current EL), IL = 32 bits
[  540.578077]   SET = 0, FnV = 0
[  540.578080]   EA = 0, S1PTW = 0
[  540.578082]   FSC = 0x05: level 1 translation fault
[  540.578085] Data abort info:
[  540.578086]   ISV = 0, ISS = 0x00000005
[  540.578088]   CM = 0, WnR = 0
[  540.579431] pstate: 82400005 (Nzcv daif +PAN -UAO +TCO -DIT -SSBSBTYPE=--)
[  540.579436] pc : __pageblock_pfn_to_page+0x6c/0x14c
[  540.579454] lr : compact_zone+0x994/0x1058
[  540.579460] sp : ffffffc03579b510
[  540.579463] x29: ffffffc03579b510 x28: 0000000000235800 x27:000000000000000c
[  540.579470] x26: 0000000000235c00 x25: 0000000000000068 x24:ffffffc03579b640
[  540.579477] x23: 0000000000000001 x22: ffffffc03579b660 x21:0000000000000000
[  540.579483] x20: 0000000000235bff x19: ffffffdebf7e3940 x18:ffffffdebf66d140
[  540.579489] x17: 00000000739ba063 x16: 00000000739ba063 x15:00000000009f4bff
[  540.579495] x14: 0000008000000000 x13: 0000000000000000 x12:0000000000000001
[  540.579501] x11: 0000000000000000 x10: 0000000000000000 x9 :ffffff897d2cd440
[  540.579507] x8 : 0000000000000000 x7 : 0000000000000000 x6 :ffffffc03579b5b4
[  540.579512] x5 : 0000000000027f25 x4 : ffffffc03579b5b8 x3 :0000000000000001
[  540.579518] x2 : ffffffdebf7e3940 x1 : 0000000000235c00 x0 :0000000000235800
[  540.579524] Call trace:
[  540.579527]  __pageblock_pfn_to_page+0x6c/0x14c
[  540.579533]  compact_zone+0x994/0x1058
[  540.579536]  try_to_compact_pages+0x128/0x378
[  540.579540]  __alloc_pages_direct_compact+0x80/0x2b0
[  540.579544]  __alloc_pages_slowpath+0x5c0/0xe10
[  540.579547]  __alloc_pages+0x250/0x2d0
[  540.579550]  __iommu_dma_alloc_noncontiguous+0x13c/0x3fc
[  540.579561]  iommu_dma_alloc+0xa0/0x320
[  540.579565]  dma_alloc_attrs+0xd4/0x108

[quic_charante@quicinc.com: use kfree_rcu() in place of synchronize_rcu(), per David]
  Link: https://lkml.kernel.org/r/1698403778-20938-1-git-send-email-quic_charante@quicinc.com
Link: https://lkml.kernel.org/r/1697202267-23600-1-git-send-email-quic_charante@quicinc.com
Fixes: f46edbd1b151 ("mm/sparsemem: add helpers track active portions of a section at boot")
Signed-off-by: Charan Teja Kalla <quic_charante@quicinc.com>
Cc: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
Cc: Dan Williams <dan.j.williams@intel.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: Mel Gorman <mgorman@techsingularity.net>
Cc: Oscar Salvador <osalvador@suse.de>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/mmzone.h | 14 +++++++++++---
 mm/sparse.c            | 17 +++++++++--------
 2 files changed, 20 insertions(+), 11 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index ec73582e7d27..2efd3be484fd 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -1799,6 +1799,7 @@ static inline unsigned long section_nr_to_pfn(unsigned long sec)
 #define SUBSECTION_ALIGN_DOWN(pfn) ((pfn) & PAGE_SUBSECTION_MASK)
 
 struct mem_section_usage {
+	struct rcu_head rcu;
 #ifdef CONFIG_SPARSEMEM_VMEMMAP
 	DECLARE_BITMAP(subsection_map, SUBSECTIONS_PER_SECTION);
 #endif
@@ -1992,7 +1993,7 @@ static inline int pfn_section_valid(struct mem_section *ms, unsigned long pfn)
 {
 	int idx = subsection_map_index(pfn);
 
-	return test_bit(idx, ms->usage->subsection_map);
+	return test_bit(idx, READ_ONCE(ms->usage)->subsection_map);
 }
 #else
 static inline int pfn_section_valid(struct mem_section *ms, unsigned long pfn)
@@ -2016,6 +2017,7 @@ static inline int pfn_section_valid(struct mem_section *ms, unsigned long pfn)
 static inline int pfn_valid(unsigned long pfn)
 {
 	struct mem_section *ms;
+	int ret;
 
 	/*
 	 * Ensure the upper PAGE_SHIFT bits are clear in the
@@ -2029,13 +2031,19 @@ static inline int pfn_valid(unsigned long pfn)
 	if (pfn_to_section_nr(pfn) >= NR_MEM_SECTIONS)
 		return 0;
 	ms = __pfn_to_section(pfn);
-	if (!valid_section(ms))
+	rcu_read_lock();
+	if (!valid_section(ms)) {
+		rcu_read_unlock();
 		return 0;
+	}
 	/*
 	 * Traditionally early sections always returned pfn_valid() for
 	 * the entire section-sized span.
 	 */
-	return early_section(ms) || pfn_section_valid(ms, pfn);
+	ret = early_section(ms) || pfn_section_valid(ms, pfn);
+	rcu_read_unlock();
+
+	return ret;
 }
 #endif
 
diff --git a/mm/sparse.c b/mm/sparse.c
index 77d91e565045..338cf946dee8 100644
--- a/mm/sparse.c
+++ b/mm/sparse.c
@@ -791,6 +791,13 @@ static void section_deactivate(unsigned long pfn, unsigned long nr_pages,
 	if (empty) {
 		unsigned long section_nr = pfn_to_section_nr(pfn);
 
+		/*
+		 * Mark the section invalid so that valid_section()
+		 * return false. This prevents code from dereferencing
+		 * ms->usage array.
+		 */
+		ms->section_mem_map &= ~SECTION_HAS_MEM_MAP;
+
 		/*
 		 * When removing an early section, the usage map is kept (as the
 		 * usage maps of other sections fall into the same page). It
@@ -799,16 +806,10 @@ static void section_deactivate(unsigned long pfn, unsigned long nr_pages,
 		 * was allocated during boot.
 		 */
 		if (!PageReserved(virt_to_page(ms->usage))) {
-			kfree(ms->usage);
-			ms->usage = NULL;
+			kfree_rcu(ms->usage, rcu);
+			WRITE_ONCE(ms->usage, NULL);
 		}
 		memmap = sparse_decode_mem_map(ms->section_mem_map, section_nr);
-		/*
-		 * Mark the section invalid so that valid_section()
-		 * return false. This prevents code from dereferencing
-		 * ms->usage array.
-		 */
-		ms->section_mem_map &= ~SECTION_HAS_MEM_MAP;
 	}
 
 	/*
-- 
cgit v1.2.3


From 5cb6674b694b84803cbee8bfccaa2bfdfeb6eae4 Mon Sep 17 00:00:00 2001
From: Andrey Konovalov <andreyknvl@google.com>
Date: Thu, 21 Dec 2023 21:04:44 +0100
Subject: mm, kasan: use KASAN_TAG_KERNEL instead of 0xff

Use the KASAN_TAG_KERNEL marco instead of open-coding 0xff in the mm code.
This macro is provided by include/linux/kasan-tags.h, which does not
include any other headers, so it's safe to include it into mm.h without
causing circular include dependencies.

Link: https://lkml.kernel.org/r/71db9087b0aebb6c4dccbc609cc0cd50621533c7.1703188911.git.andreyknvl@google.com
Signed-off-by: Andrey Konovalov <andreyknvl@google.com>
Cc: Alexander Potapenko <glider@google.com>
Cc: Andrey Ryabinin <ryabinin.a.a@gmail.com>
Cc: Dmitry Vyukov <dvyukov@google.com>
Cc: Marco Elver <elver@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/kasan.h | 1 +
 include/linux/mm.h    | 4 ++--
 mm/page_alloc.c       | 2 +-
 3 files changed, 4 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/kasan.h b/include/linux/kasan.h
index d49e3d4c099e..dbb06d789e74 100644
--- a/include/linux/kasan.h
+++ b/include/linux/kasan.h
@@ -4,6 +4,7 @@
 
 #include <linux/bug.h>
 #include <linux/kasan-enabled.h>
+#include <linux/kasan-tags.h>
 #include <linux/kernel.h>
 #include <linux/static_key.h>
 #include <linux/types.h>
diff --git a/include/linux/mm.h b/include/linux/mm.h
index b72bf25a45cf..2563ffdb51bc 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1815,7 +1815,7 @@ static inline void vma_set_access_pid_bit(struct vm_area_struct *vma)
 
 static inline u8 page_kasan_tag(const struct page *page)
 {
-	u8 tag = 0xff;
+	u8 tag = KASAN_TAG_KERNEL;
 
 	if (kasan_enabled()) {
 		tag = (page->flags >> KASAN_TAG_PGSHIFT) & KASAN_TAG_MASK;
@@ -1844,7 +1844,7 @@ static inline void page_kasan_tag_set(struct page *page, u8 tag)
 static inline void page_kasan_tag_reset(struct page *page)
 {
 	if (kasan_enabled())
-		page_kasan_tag_set(page, 0xff);
+		page_kasan_tag_set(page, KASAN_TAG_KERNEL);
 }
 
 #else /* CONFIG_KASAN_SW_TAGS || CONFIG_KASAN_HW_TAGS */
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 692f33c764d2..5526797b7f96 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1059,7 +1059,7 @@ static inline bool should_skip_kasan_poison(struct page *page, fpi_t fpi_flags)
 	if (IS_ENABLED(CONFIG_KASAN_GENERIC))
 		return deferred_pages_enabled();
 
-	return page_kasan_tag(page) == 0xff;
+	return page_kasan_tag(page) == KASAN_TAG_KERNEL;
 }
 
 static void kernel_init_pages(struct page *page, int numpages)
-- 
cgit v1.2.3


From 9d5fafd5d882446999366f673ab06edba453f862 Mon Sep 17 00:00:00 2001
From: David Hildenbrand <david@redhat.com>
Date: Wed, 20 Dec 2023 23:44:25 +0100
Subject: mm/rmap: rename hugepage_add* to hugetlb_add*

Patch series "mm/rmap: interface overhaul", v2.

This series overhauls the rmap interface, to get rid of the "bool
compound" / RMAP_COMPOUND parameter with the goal of making the interface
less error prone, more future proof, and more natural to extend to
"batching".  Also, this converts the interface to always consume
folio+subpage, which speeds up operations on large folios.

Further, this series adds PTE-batching variants for 4 rmap functions,
whereby only folio_add_anon_rmap_ptes() is used for batching in this
series when PTE-remapping a PMD-mapped THP.  folio_remove_rmap_ptes(),
folio_try_dup_anon_rmap_ptes() and folio_dup_file_rmap_ptes() will soon
come in handy[1,2].

This series performs a lot of folio conversion along the way.  Most of the
added LOC in the diff are only due to documentation.

As we're moving to a pte/pmd interface where we clearly express the
mapping granularity we are dealing with, we first get the remainder of
hugetlb out of the way, as it is special and expected to remain special:
it treats everything as a "single logical PTE" and only currently allows
entire mappings.

Even if we'd ever support partial mappings, I strongly assume the
interface and implementation will still differ heavily: hopefull we can
avoid working on subpages/subpage mapcounts completely and only add a
"count" parameter for them to enable batching.

New (extended) hugetlb interface that operates on entire folio:
 * hugetlb_add_new_anon_rmap() -> Already existed
 * hugetlb_add_anon_rmap() -> Already existed
 * hugetlb_try_dup_anon_rmap()
 * hugetlb_try_share_anon_rmap()
 * hugetlb_add_file_rmap()
 * hugetlb_remove_rmap()

New "ordinary" interface for small folios / THP::
 * folio_add_new_anon_rmap() -> Already existed
 * folio_add_anon_rmap_[pte|ptes|pmd]()
 * folio_try_dup_anon_rmap_[pte|ptes|pmd]()
 * folio_try_share_anon_rmap_[pte|pmd]()
 * folio_add_file_rmap_[pte|ptes|pmd]()
 * folio_dup_file_rmap_[pte|ptes|pmd]()
 * folio_remove_rmap_[pte|ptes|pmd]()

folio_add_new_anon_rmap() will always map at the largest granularity
possible (currently, a single PMD to cover a PMD-sized THP).  Could be
extended if ever required.

In the future, we might want "_pud" variants and eventually "_pmds"
variants for batching.

I ran some simple microbenchmarks on an Intel(R) Xeon(R) Silver 4210R:
measuring munmap(), fork(), cow, MADV_DONTNEED on each PTE ...  and PTE
remapping PMD-mapped THPs on 1 GiB of memory.

For small folios, there is barely a change (< 1% improvement for me).

For PTE-mapped THP:
* PTE-remapping a PMD-mapped THP is more than 10% faster.
* fork() is more than 4% faster.
* MADV_DONTNEED is 2% faster
* COW when writing only a single byte on a COW-shared PTE is 1% faster
* munmap() barely changes (< 1%).

[1] https://lkml.kernel.org/r/20230810103332.3062143-1-ryan.roberts@arm.com
[2] https://lkml.kernel.org/r/20231204105440.61448-1-ryan.roberts@arm.com


This patch (of 40):

Let's just call it "hugetlb_".

Yes, it's all already inconsistent and confusing because we have a lot of
"hugepage_" functions for legacy reasons.  But "hugetlb" cannot possibly
be confused with transparent huge pages, and it matches "hugetlb.c" and
"folio_test_hugetlb()".  So let's minimize confusion in rmap code.

Link: https://lkml.kernel.org/r/20231220224504.646757-1-david@redhat.com
Link: https://lkml.kernel.org/r/20231220224504.646757-2-david@redhat.com
Signed-off-by: David Hildenbrand <david@redhat.com>
Reviewed-by: Muchun Song <songmuchun@bytedance.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Muchun Song <muchun.song@linux.dev>
Cc: Peter Xu <peterx@redhat.com>
Cc: Ryan Roberts <ryan.roberts@arm.com>
Cc: Yin Fengwei <fengwei.yin@intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/rmap.h | 4 ++--
 mm/hugetlb.c         | 8 ++++----
 mm/migrate.c         | 4 ++--
 mm/rmap.c            | 8 ++++----
 4 files changed, 12 insertions(+), 12 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/rmap.h b/include/linux/rmap.h
index 0ae2bb0e77f5..36096ba69bdc 100644
--- a/include/linux/rmap.h
+++ b/include/linux/rmap.h
@@ -206,9 +206,9 @@ void folio_add_file_rmap_range(struct folio *, struct page *, unsigned int nr,
 void page_remove_rmap(struct page *, struct vm_area_struct *,
 		bool compound);
 
-void hugepage_add_anon_rmap(struct folio *, struct vm_area_struct *,
+void hugetlb_add_anon_rmap(struct folio *, struct vm_area_struct *,
 		unsigned long address, rmap_t flags);
-void hugepage_add_new_anon_rmap(struct folio *, struct vm_area_struct *,
+void hugetlb_add_new_anon_rmap(struct folio *, struct vm_area_struct *,
 		unsigned long address);
 
 static inline void __page_dup_rmap(struct page *page, bool compound)
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 6feb3e0630d1..305f3ca1dee6 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -5285,7 +5285,7 @@ hugetlb_install_folio(struct vm_area_struct *vma, pte_t *ptep, unsigned long add
 	pte_t newpte = make_huge_pte(vma, &new_folio->page, 1);
 
 	__folio_mark_uptodate(new_folio);
-	hugepage_add_new_anon_rmap(new_folio, vma, addr);
+	hugetlb_add_new_anon_rmap(new_folio, vma, addr);
 	if (userfaultfd_wp(vma) && huge_pte_uffd_wp(old))
 		newpte = huge_pte_mkuffd_wp(newpte);
 	set_huge_pte_at(vma->vm_mm, addr, ptep, newpte, sz);
@@ -5988,7 +5988,7 @@ retry_avoidcopy:
 		/* Break COW or unshare */
 		huge_ptep_clear_flush(vma, haddr, ptep);
 		page_remove_rmap(&old_folio->page, vma, true);
-		hugepage_add_new_anon_rmap(new_folio, vma, haddr);
+		hugetlb_add_new_anon_rmap(new_folio, vma, haddr);
 		if (huge_pte_uffd_wp(pte))
 			newpte = huge_pte_mkuffd_wp(newpte);
 		set_huge_pte_at(mm, haddr, ptep, newpte, huge_page_size(h));
@@ -6277,7 +6277,7 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm,
 		goto backout;
 
 	if (anon_rmap)
-		hugepage_add_new_anon_rmap(folio, vma, haddr);
+		hugetlb_add_new_anon_rmap(folio, vma, haddr);
 	else
 		page_dup_file_rmap(&folio->page, true);
 	new_pte = make_huge_pte(vma, &folio->page, ((vma->vm_flags & VM_WRITE)
@@ -6732,7 +6732,7 @@ int hugetlb_mfill_atomic_pte(pte_t *dst_pte,
 	if (folio_in_pagecache)
 		page_dup_file_rmap(&folio->page, true);
 	else
-		hugepage_add_new_anon_rmap(folio, dst_vma, dst_addr);
+		hugetlb_add_new_anon_rmap(folio, dst_vma, dst_addr);
 
 	/*
 	 * For either: (1) CONTINUE on a non-shared VMA, or (2) UFFDIO_COPY
diff --git a/mm/migrate.c b/mm/migrate.c
index bad3039d165e..7d1c3f292d24 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -249,8 +249,8 @@ static bool remove_migration_pte(struct folio *folio,
 
 			pte = arch_make_huge_pte(pte, shift, vma->vm_flags);
 			if (folio_test_anon(folio))
-				hugepage_add_anon_rmap(folio, vma, pvmw.address,
-						       rmap_flags);
+				hugetlb_add_anon_rmap(folio, vma, pvmw.address,
+						      rmap_flags);
 			else
 				page_dup_file_rmap(new, true);
 			set_huge_pte_at(vma->vm_mm, pvmw.address, pvmw.pte, pte,
diff --git a/mm/rmap.c b/mm/rmap.c
index 23da5b1ac33b..9845499b22f8 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -2625,8 +2625,8 @@ void rmap_walk_locked(struct folio *folio, struct rmap_walk_control *rwc)
  *
  * RMAP_COMPOUND is ignored.
  */
-void hugepage_add_anon_rmap(struct folio *folio, struct vm_area_struct *vma,
-			    unsigned long address, rmap_t flags)
+void hugetlb_add_anon_rmap(struct folio *folio, struct vm_area_struct *vma,
+		unsigned long address, rmap_t flags)
 {
 	VM_WARN_ON_FOLIO(!folio_test_anon(folio), folio);
 
@@ -2637,8 +2637,8 @@ void hugepage_add_anon_rmap(struct folio *folio, struct vm_area_struct *vma,
 			 PageAnonExclusive(&folio->page), folio);
 }
 
-void hugepage_add_new_anon_rmap(struct folio *folio,
-			struct vm_area_struct *vma, unsigned long address)
+void hugetlb_add_new_anon_rmap(struct folio *folio,
+		struct vm_area_struct *vma, unsigned long address)
 {
 	BUG_ON(address < vma->vm_start || address >= vma->vm_end);
 	/* increment count (starts at -1) */
-- 
cgit v1.2.3


From e135826b2da0cf25305086dc9ac1e91718a148e1 Mon Sep 17 00:00:00 2001
From: David Hildenbrand <david@redhat.com>
Date: Wed, 20 Dec 2023 23:44:26 +0100
Subject: mm/rmap: introduce and use hugetlb_remove_rmap()

hugetlb rmap handling differs quite a lot from "ordinary" rmap code.  For
example, hugetlb currently only supports entire mappings, and treats any
mapping as mapped using a single "logical PTE".  Let's move it out of the
way so we can overhaul our "ordinary" rmap.  implementation/interface.

Let's introduce and use hugetlb_remove_rmap() and remove the hugetlb code
from page_remove_rmap().  This effectively removes one check on the
small-folio path as well.

Add sanity checks that we end up with the right folios in the right
functions.

Note: all possible candidates that need care are page_remove_rmap() that
      pass compound=true.

Link: https://lkml.kernel.org/r/20231220224504.646757-3-david@redhat.com
Signed-off-by: David Hildenbrand <david@redhat.com>
Reviewed-by: Yin Fengwei <fengwei.yin@intel.com>
Reviewed-by: Ryan Roberts <ryan.roberts@arm.com>
Reviewed-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Reviewed-by: Muchun Song <songmuchun@bytedance.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Muchun Song <muchun.song@linux.dev>
Cc: Peter Xu <peterx@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/rmap.h |  7 +++++++
 mm/hugetlb.c         |  4 ++--
 mm/rmap.c            | 18 +++++++++---------
 3 files changed, 18 insertions(+), 11 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/rmap.h b/include/linux/rmap.h
index 36096ba69bdc..64ae6c4d7272 100644
--- a/include/linux/rmap.h
+++ b/include/linux/rmap.h
@@ -211,6 +211,13 @@ void hugetlb_add_anon_rmap(struct folio *, struct vm_area_struct *,
 void hugetlb_add_new_anon_rmap(struct folio *, struct vm_area_struct *,
 		unsigned long address);
 
+static inline void hugetlb_remove_rmap(struct folio *folio)
+{
+	VM_WARN_ON_FOLIO(!folio_test_hugetlb(folio), folio);
+
+	atomic_dec(&folio->_entire_mapcount);
+}
+
 static inline void __page_dup_rmap(struct page *page, bool compound)
 {
 	if (compound) {
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 305f3ca1dee6..ef48ae673890 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -5676,7 +5676,7 @@ void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma,
 					make_pte_marker(PTE_MARKER_UFFD_WP),
 					sz);
 		hugetlb_count_sub(pages_per_huge_page(h), mm);
-		page_remove_rmap(page, vma, true);
+		hugetlb_remove_rmap(page_folio(page));
 
 		spin_unlock(ptl);
 		tlb_remove_page_size(tlb, page, huge_page_size(h));
@@ -5987,7 +5987,7 @@ retry_avoidcopy:
 
 		/* Break COW or unshare */
 		huge_ptep_clear_flush(vma, haddr, ptep);
-		page_remove_rmap(&old_folio->page, vma, true);
+		hugetlb_remove_rmap(old_folio);
 		hugetlb_add_new_anon_rmap(new_folio, vma, haddr);
 		if (huge_pte_uffd_wp(pte))
 			newpte = huge_pte_mkuffd_wp(newpte);
diff --git a/mm/rmap.c b/mm/rmap.c
index 9845499b22f8..261e1af0d254 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -1480,15 +1480,9 @@ void page_remove_rmap(struct page *page, struct vm_area_struct *vma,
 	bool last;
 	enum node_stat_item idx;
 
+	VM_WARN_ON_FOLIO(folio_test_hugetlb(folio), folio);
 	VM_BUG_ON_PAGE(compound && !PageHead(page), page);
 
-	/* Hugetlb pages are not counted in NR_*MAPPED */
-	if (unlikely(folio_test_hugetlb(folio))) {
-		/* hugetlb pages are always mapped with pmds */
-		atomic_dec(&folio->_entire_mapcount);
-		return;
-	}
-
 	/* Is page being unmapped by PTE? Is this its last map to be removed? */
 	if (likely(!compound)) {
 		last = atomic_add_negative(-1, &page->_mapcount);
@@ -1846,7 +1840,10 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma,
 			dec_mm_counter(mm, mm_counter_file(&folio->page));
 		}
 discard:
-		page_remove_rmap(subpage, vma, folio_test_hugetlb(folio));
+		if (unlikely(folio_test_hugetlb(folio)))
+			hugetlb_remove_rmap(folio);
+		else
+			page_remove_rmap(subpage, vma, false);
 		if (vma->vm_flags & VM_LOCKED)
 			mlock_drain_local();
 		folio_put(folio);
@@ -2199,7 +2196,10 @@ static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma,
 			 */
 		}
 
-		page_remove_rmap(subpage, vma, folio_test_hugetlb(folio));
+		if (unlikely(folio_test_hugetlb(folio)))
+			hugetlb_remove_rmap(folio);
+		else
+			page_remove_rmap(subpage, vma, false);
 		if (vma->vm_flags & VM_LOCKED)
 			mlock_drain_local();
 		folio_put(folio);
-- 
cgit v1.2.3


From 44887f39945519fa8405133b1acd098fda9c9746 Mon Sep 17 00:00:00 2001
From: David Hildenbrand <david@redhat.com>
Date: Wed, 20 Dec 2023 23:44:27 +0100
Subject: mm/rmap: introduce and use hugetlb_add_file_rmap()

hugetlb rmap handling differs quite a lot from "ordinary" rmap code.  For
example, hugetlb currently only supports entire mappings, and treats any
mapping as mapped using a single "logical PTE".  Let's move it out of the
way so we can overhaul our "ordinary" rmap.  implementation/interface.

Right now we're using page_dup_file_rmap() in some cases where "ordinary"
rmap code would have used page_add_file_rmap().  So let's introduce and
use hugetlb_add_file_rmap() instead.  We won't be adding a
"hugetlb_dup_file_rmap()" functon for the fork() case, as it would be
doing the same: "dup" is just an optimization for "add".

What remains is a single page_dup_file_rmap() call in fork() code.

Add sanity checks that we end up with the right folios in the right
functions.

Link: https://lkml.kernel.org/r/20231220224504.646757-4-david@redhat.com
Signed-off-by: David Hildenbrand <david@redhat.com>
Reviewed-by: Yin Fengwei <fengwei.yin@intel.com>
Reviewed-by: Ryan Roberts <ryan.roberts@arm.com>
Reviewed-by: Muchun Song <songmuchun@bytedance.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Muchun Song <muchun.song@linux.dev>
Cc: Peter Xu <peterx@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/rmap.h | 8 ++++++++
 mm/hugetlb.c         | 6 +++---
 mm/migrate.c         | 2 +-
 mm/rmap.c            | 1 +
 4 files changed, 13 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/rmap.h b/include/linux/rmap.h
index 64ae6c4d7272..56900a16f41a 100644
--- a/include/linux/rmap.h
+++ b/include/linux/rmap.h
@@ -211,6 +211,14 @@ void hugetlb_add_anon_rmap(struct folio *, struct vm_area_struct *,
 void hugetlb_add_new_anon_rmap(struct folio *, struct vm_area_struct *,
 		unsigned long address);
 
+static inline void hugetlb_add_file_rmap(struct folio *folio)
+{
+	VM_WARN_ON_FOLIO(!folio_test_hugetlb(folio), folio);
+	VM_WARN_ON_FOLIO(folio_test_anon(folio), folio);
+
+	atomic_inc(&folio->_entire_mapcount);
+}
+
 static inline void hugetlb_remove_rmap(struct folio *folio)
 {
 	VM_WARN_ON_FOLIO(!folio_test_hugetlb(folio), folio);
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index ef48ae673890..57e898187931 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -5408,7 +5408,7 @@ again:
 			 * sleep during the process.
 			 */
 			if (!folio_test_anon(pte_folio)) {
-				page_dup_file_rmap(&pte_folio->page, true);
+				hugetlb_add_file_rmap(pte_folio);
 			} else if (page_try_dup_anon_rmap(&pte_folio->page,
 							  true, src_vma)) {
 				pte_t src_pte_old = entry;
@@ -6279,7 +6279,7 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm,
 	if (anon_rmap)
 		hugetlb_add_new_anon_rmap(folio, vma, haddr);
 	else
-		page_dup_file_rmap(&folio->page, true);
+		hugetlb_add_file_rmap(folio);
 	new_pte = make_huge_pte(vma, &folio->page, ((vma->vm_flags & VM_WRITE)
 				&& (vma->vm_flags & VM_SHARED)));
 	/*
@@ -6730,7 +6730,7 @@ int hugetlb_mfill_atomic_pte(pte_t *dst_pte,
 		goto out_release_unlock;
 
 	if (folio_in_pagecache)
-		page_dup_file_rmap(&folio->page, true);
+		hugetlb_add_file_rmap(folio);
 	else
 		hugetlb_add_new_anon_rmap(folio, dst_vma, dst_addr);
 
diff --git a/mm/migrate.c b/mm/migrate.c
index 7d1c3f292d24..0e912443a18c 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -252,7 +252,7 @@ static bool remove_migration_pte(struct folio *folio,
 				hugetlb_add_anon_rmap(folio, vma, pvmw.address,
 						      rmap_flags);
 			else
-				page_dup_file_rmap(new, true);
+				hugetlb_add_file_rmap(folio);
 			set_huge_pte_at(vma->vm_mm, pvmw.address, pvmw.pte, pte,
 					psize);
 		} else
diff --git a/mm/rmap.c b/mm/rmap.c
index 261e1af0d254..a57ec926daf0 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -1395,6 +1395,7 @@ void folio_add_file_rmap_range(struct folio *folio, struct page *page,
 	unsigned int nr_pmdmapped = 0, first;
 	int nr = 0;
 
+	VM_WARN_ON_FOLIO(folio_test_hugetlb(folio), folio);
 	VM_WARN_ON_FOLIO(compound && !folio_test_pmd_mappable(folio), folio);
 
 	/* Is page being mapped by PTE? Is this its first map to be added? */
-- 
cgit v1.2.3


From ebe2e35ec0f256372c158a18de459fb60070b313 Mon Sep 17 00:00:00 2001
From: David Hildenbrand <david@redhat.com>
Date: Wed, 20 Dec 2023 23:44:28 +0100
Subject: mm/rmap: introduce and use hugetlb_try_dup_anon_rmap()

hugetlb rmap handling differs quite a lot from "ordinary" rmap code.  For
example, hugetlb currently only supports entire mappings, and treats any
mapping as mapped using a single "logical PTE".  Let's move it out of the
way so we can overhaul our "ordinary" rmap.  implementation/interface.

So let's introduce and use hugetlb_try_dup_anon_rmap() to make all hugetlb
handling use dedicated hugetlb_* rmap functions.

Add sanity checks that we end up with the right folios in the right
functions.

Note that is_device_private_page() does not apply to hugetlb.

Link: https://lkml.kernel.org/r/20231220224504.646757-5-david@redhat.com
Signed-off-by: David Hildenbrand <david@redhat.com>
Reviewed-by: Yin Fengwei <fengwei.yin@intel.com>
Reviewed-by: Ryan Roberts <ryan.roberts@arm.com>
Reviewed-by: Muchun Song <songmuchun@bytedance.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Muchun Song <muchun.song@linux.dev>
Cc: Peter Xu <peterx@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/mm.h   | 12 +++++++++---
 include/linux/rmap.h | 18 ++++++++++++++++++
 mm/hugetlb.c         |  3 +--
 3 files changed, 28 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 2563ffdb51bc..75bba6102825 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1964,15 +1964,21 @@ static inline bool page_maybe_dma_pinned(struct page *page)
  *
  * The caller has to hold the PT lock and the vma->vm_mm->->write_protect_seq.
  */
-static inline bool page_needs_cow_for_dma(struct vm_area_struct *vma,
-					  struct page *page)
+static inline bool folio_needs_cow_for_dma(struct vm_area_struct *vma,
+					  struct folio *folio)
 {
 	VM_BUG_ON(!(raw_read_seqcount(&vma->vm_mm->write_protect_seq) & 1));
 
 	if (!test_bit(MMF_HAS_PINNED, &vma->vm_mm->flags))
 		return false;
 
-	return page_maybe_dma_pinned(page);
+	return folio_maybe_dma_pinned(folio);
+}
+
+static inline bool page_needs_cow_for_dma(struct vm_area_struct *vma,
+					  struct page *page)
+{
+	return folio_needs_cow_for_dma(vma, page_folio(page));
 }
 
 /**
diff --git a/include/linux/rmap.h b/include/linux/rmap.h
index 56900a16f41a..5f26752de945 100644
--- a/include/linux/rmap.h
+++ b/include/linux/rmap.h
@@ -211,6 +211,22 @@ void hugetlb_add_anon_rmap(struct folio *, struct vm_area_struct *,
 void hugetlb_add_new_anon_rmap(struct folio *, struct vm_area_struct *,
 		unsigned long address);
 
+/* See page_try_dup_anon_rmap() */
+static inline int hugetlb_try_dup_anon_rmap(struct folio *folio,
+		struct vm_area_struct *vma)
+{
+	VM_WARN_ON_FOLIO(!folio_test_hugetlb(folio), folio);
+	VM_WARN_ON_FOLIO(!folio_test_anon(folio), folio);
+
+	if (PageAnonExclusive(&folio->page)) {
+		if (unlikely(folio_needs_cow_for_dma(vma, folio)))
+			return -EBUSY;
+		ClearPageAnonExclusive(&folio->page);
+	}
+	atomic_inc(&folio->_entire_mapcount);
+	return 0;
+}
+
 static inline void hugetlb_add_file_rmap(struct folio *folio)
 {
 	VM_WARN_ON_FOLIO(!folio_test_hugetlb(folio), folio);
@@ -228,6 +244,8 @@ static inline void hugetlb_remove_rmap(struct folio *folio)
 
 static inline void __page_dup_rmap(struct page *page, bool compound)
 {
+	VM_WARN_ON(folio_test_hugetlb(page_folio(page)));
+
 	if (compound) {
 		struct folio *folio = (struct folio *)page;
 
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 57e898187931..378e460a6ab4 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -5409,8 +5409,7 @@ again:
 			 */
 			if (!folio_test_anon(pte_folio)) {
 				hugetlb_add_file_rmap(pte_folio);
-			} else if (page_try_dup_anon_rmap(&pte_folio->page,
-							  true, src_vma)) {
+			} else if (hugetlb_try_dup_anon_rmap(pte_folio, src_vma)) {
 				pte_t src_pte_old = entry;
 				struct folio *new_folio;
 
-- 
cgit v1.2.3


From 0c2ec32bf0b2f0d7ccb98c53ee5d255d68e73595 Mon Sep 17 00:00:00 2001
From: David Hildenbrand <david@redhat.com>
Date: Wed, 20 Dec 2023 23:44:29 +0100
Subject: mm/rmap: introduce and use hugetlb_try_share_anon_rmap()

hugetlb rmap handling differs quite a lot from "ordinary" rmap code.  For
example, hugetlb currently only supports entire mappings, and treats any
mapping as mapped using a single "logical PTE".  Let's move it out of the
way so we can overhaul our "ordinary" rmap.  implementation/interface.

So let's introduce and use hugetlb_try_dup_anon_rmap() to make all hugetlb
handling use dedicated hugetlb_* rmap functions.

Add sanity checks that we end up with the right folios in the right
functions.

Note that try_to_unmap_one() does not need care.  Easy to spot because
among all that nasty hugetlb special-casing in that function, we're not
using set_huge_pte_at() on the anon path -- well, and that code assumes
that we would want to swapout.

Link: https://lkml.kernel.org/r/20231220224504.646757-6-david@redhat.com
Signed-off-by: David Hildenbrand <david@redhat.com>
Reviewed-by: Yin Fengwei <fengwei.yin@intel.com>
Reviewed-by: Ryan Roberts <ryan.roberts@arm.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Muchun Song <muchun.song@linux.dev>
Cc: Muchun Song <songmuchun@bytedance.com>
Cc: Peter Xu <peterx@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/rmap.h | 25 +++++++++++++++++++++++++
 mm/rmap.c            | 15 ++++++++++-----
 2 files changed, 35 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/rmap.h b/include/linux/rmap.h
index 5f26752de945..d6fefa0f0410 100644
--- a/include/linux/rmap.h
+++ b/include/linux/rmap.h
@@ -227,6 +227,30 @@ static inline int hugetlb_try_dup_anon_rmap(struct folio *folio,
 	return 0;
 }
 
+/* See page_try_share_anon_rmap() */
+static inline int hugetlb_try_share_anon_rmap(struct folio *folio)
+{
+	VM_WARN_ON_FOLIO(!folio_test_hugetlb(folio), folio);
+	VM_WARN_ON_FOLIO(!folio_test_anon(folio), folio);
+	VM_WARN_ON_FOLIO(!PageAnonExclusive(&folio->page), folio);
+
+	/* Paired with the memory barrier in try_grab_folio(). */
+	if (IS_ENABLED(CONFIG_HAVE_FAST_GUP))
+		smp_mb();
+
+	if (unlikely(folio_maybe_dma_pinned(folio)))
+		return -EBUSY;
+	ClearPageAnonExclusive(&folio->page);
+
+	/*
+	 * This is conceptually a smp_wmb() paired with the smp_rmb() in
+	 * gup_must_unshare().
+	 */
+	if (IS_ENABLED(CONFIG_HAVE_FAST_GUP))
+		smp_mb__after_atomic();
+	return 0;
+}
+
 static inline void hugetlb_add_file_rmap(struct folio *folio)
 {
 	VM_WARN_ON_FOLIO(!folio_test_hugetlb(folio), folio);
@@ -331,6 +355,7 @@ dup:
  */
 static inline int page_try_share_anon_rmap(struct page *page)
 {
+	VM_WARN_ON(folio_test_hugetlb(page_folio(page)));
 	VM_BUG_ON_PAGE(!PageAnon(page) || !PageAnonExclusive(page), page);
 
 	/* device private pages cannot get pinned via GUP. */
diff --git a/mm/rmap.c b/mm/rmap.c
index a57ec926daf0..c229e48cf5a9 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -2149,13 +2149,18 @@ static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma,
 				       !anon_exclusive, subpage);
 
 			/* See page_try_share_anon_rmap(): clear PTE first. */
-			if (anon_exclusive &&
-			    page_try_share_anon_rmap(subpage)) {
-				if (folio_test_hugetlb(folio))
+			if (folio_test_hugetlb(folio)) {
+				if (anon_exclusive &&
+				    hugetlb_try_share_anon_rmap(folio)) {
 					set_huge_pte_at(mm, address, pvmw.pte,
 							pteval, hsz);
-				else
-					set_pte_at(mm, address, pvmw.pte, pteval);
+					ret = false;
+					page_vma_mapped_walk_done(&pvmw);
+					break;
+				}
+			} else if (anon_exclusive &&
+				   page_try_share_anon_rmap(subpage)) {
+				set_pte_at(mm, address, pvmw.pte, pteval);
 				ret = false;
 				page_vma_mapped_walk_done(&pvmw);
 				break;
-- 
cgit v1.2.3


From 68f0320824fa59c5429cbc811e6c46e7a30ea32c Mon Sep 17 00:00:00 2001
From: David Hildenbrand <david@redhat.com>
Date: Wed, 20 Dec 2023 23:44:31 +0100
Subject: mm/rmap: convert folio_add_file_rmap_range() into
 folio_add_file_rmap_[pte|ptes|pmd]()

Let's get rid of the compound parameter and instead define explicitly
which mappings we're adding.  That is more future proof, easier to read
and harder to mess up.

Use an enum to express the granularity internally.  Make the compiler
always special-case on the granularity by using __always_inline.  Replace
the "compound" check by a switch-case that will be removed by the compiler
completely.

Add plenty of sanity checks with CONFIG_DEBUG_VM.  Replace the
folio_test_pmd_mappable() check by a config check in the caller and sanity
checks.  Convert the single user of folio_add_file_rmap_range().

While at it, consistently use "int" instead of "unisgned int" in rmap code
when dealing with mapcounts and the number of pages.

This function design can later easily be extended to PUDs and to batch
PMDs.  Note that for now we don't support anything bigger than PMD-sized
folios (as we cleanly separated hugetlb handling).  Sanity checks will
catch if that ever changes.

Next up is removing page_remove_rmap() along with its "compound" parameter
and smilarly converting all other rmap functions.

Link: https://lkml.kernel.org/r/20231220224504.646757-8-david@redhat.com
Signed-off-by: David Hildenbrand <david@redhat.com>
Reviewed-by: Yin Fengwei <fengwei.yin@intel.com>
Reviewed-by: Ryan Roberts <ryan.roberts@arm.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Muchun Song <muchun.song@linux.dev>
Cc: Muchun Song <songmuchun@bytedance.com>
Cc: Peter Xu <peterx@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/rmap.h | 46 ++++++++++++++++++++++++++++--
 mm/memory.c          |  2 +-
 mm/rmap.c            | 79 +++++++++++++++++++++++++++++++++-------------------
 3 files changed, 95 insertions(+), 32 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/rmap.h b/include/linux/rmap.h
index d6fefa0f0410..3d86a76b2836 100644
--- a/include/linux/rmap.h
+++ b/include/linux/rmap.h
@@ -191,6 +191,44 @@ typedef int __bitwise rmap_t;
  */
 #define RMAP_COMPOUND		((__force rmap_t)BIT(1))
 
+/*
+ * Internally, we're using an enum to specify the granularity. We make the
+ * compiler emit specialized code for each granularity.
+ */
+enum rmap_level {
+	RMAP_LEVEL_PTE = 0,
+	RMAP_LEVEL_PMD,
+};
+
+static inline void __folio_rmap_sanity_checks(struct folio *folio,
+		struct page *page, int nr_pages, enum rmap_level level)
+{
+	/* hugetlb folios are handled separately. */
+	VM_WARN_ON_FOLIO(folio_test_hugetlb(folio), folio);
+	VM_WARN_ON_FOLIO(folio_test_large(folio) &&
+			 !folio_test_large_rmappable(folio), folio);
+
+	VM_WARN_ON_ONCE(nr_pages <= 0);
+	VM_WARN_ON_FOLIO(page_folio(page) != folio, folio);
+	VM_WARN_ON_FOLIO(page_folio(page + nr_pages - 1) != folio, folio);
+
+	switch (level) {
+	case RMAP_LEVEL_PTE:
+		break;
+	case RMAP_LEVEL_PMD:
+		/*
+		 * We don't support folios larger than a single PMD yet. So
+		 * when RMAP_LEVEL_PMD is set, we assume that we are creating
+		 * a single "entire" mapping of the folio.
+		 */
+		VM_WARN_ON_FOLIO(folio_nr_pages(folio) != HPAGE_PMD_NR, folio);
+		VM_WARN_ON_FOLIO(nr_pages != HPAGE_PMD_NR, folio);
+		break;
+	default:
+		VM_WARN_ON_ONCE(true);
+	}
+}
+
 /*
  * rmap interfaces called when adding or removing pte of page
  */
@@ -201,8 +239,12 @@ void folio_add_new_anon_rmap(struct folio *, struct vm_area_struct *,
 		unsigned long address);
 void page_add_file_rmap(struct page *, struct vm_area_struct *,
 		bool compound);
-void folio_add_file_rmap_range(struct folio *, struct page *, unsigned int nr,
-		struct vm_area_struct *, bool compound);
+void folio_add_file_rmap_ptes(struct folio *, struct page *, int nr_pages,
+		struct vm_area_struct *);
+#define folio_add_file_rmap_pte(folio, page, vma) \
+	folio_add_file_rmap_ptes(folio, page, 1, vma)
+void folio_add_file_rmap_pmd(struct folio *, struct page *,
+		struct vm_area_struct *);
 void page_remove_rmap(struct page *, struct vm_area_struct *,
 		bool compound);
 
diff --git a/mm/memory.c b/mm/memory.c
index cfcaf4c0198c..9b977b2cf893 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -4516,7 +4516,7 @@ void set_pte_range(struct vm_fault *vmf, struct folio *folio,
 		folio_add_lru_vma(folio, vma);
 	} else {
 		add_mm_counter(vma->vm_mm, mm_counter_file(page), nr);
-		folio_add_file_rmap_range(folio, page, nr, vma, false);
+		folio_add_file_rmap_ptes(folio, page, nr, vma);
 	}
 	set_ptes(vma->vm_mm, addr, vmf->pte, entry, nr);
 
diff --git a/mm/rmap.c b/mm/rmap.c
index 6a1829324053..cc1fc2d570f0 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -1378,31 +1378,18 @@ void folio_add_new_anon_rmap(struct folio *folio, struct vm_area_struct *vma,
 	__lruvec_stat_mod_folio(folio, NR_ANON_MAPPED, nr);
 }
 
-/**
- * folio_add_file_rmap_range - add pte mapping to page range of a folio
- * @folio:	The folio to add the mapping to
- * @page:	The first page to add
- * @nr_pages:	The number of pages which will be mapped
- * @vma:	the vm area in which the mapping is added
- * @compound:	charge the page as compound or small page
- *
- * The page range of folio is defined by [first_page, first_page + nr_pages)
- *
- * The caller needs to hold the pte lock.
- */
-void folio_add_file_rmap_range(struct folio *folio, struct page *page,
-			unsigned int nr_pages, struct vm_area_struct *vma,
-			bool compound)
+static __always_inline void __folio_add_file_rmap(struct folio *folio,
+		struct page *page, int nr_pages, struct vm_area_struct *vma,
+		enum rmap_level level)
 {
 	atomic_t *mapped = &folio->_nr_pages_mapped;
-	unsigned int nr_pmdmapped = 0, first;
-	int nr = 0;
+	int nr = 0, nr_pmdmapped = 0, first;
 
-	VM_WARN_ON_FOLIO(folio_test_hugetlb(folio), folio);
-	VM_WARN_ON_FOLIO(compound && !folio_test_pmd_mappable(folio), folio);
+	VM_WARN_ON_FOLIO(folio_test_anon(folio), folio);
+	__folio_rmap_sanity_checks(folio, page, nr_pages, level);
 
-	/* Is page being mapped by PTE? Is this its first map to be added? */
-	if (likely(!compound)) {
+	switch (level) {
+	case RMAP_LEVEL_PTE:
 		do {
 			first = atomic_inc_and_test(&page->_mapcount);
 			if (first && folio_test_large(folio)) {
@@ -1413,9 +1400,8 @@ void folio_add_file_rmap_range(struct folio *folio, struct page *page,
 			if (first)
 				nr++;
 		} while (page++, --nr_pages > 0);
-	} else if (folio_test_pmd_mappable(folio)) {
-		/* That test is redundant: it's for safety or to optimize out */
-
+		break;
+	case RMAP_LEVEL_PMD:
 		first = atomic_inc_and_test(&folio->_entire_mapcount);
 		if (first) {
 			nr = atomic_add_return_relaxed(COMPOUND_MAPPED, mapped);
@@ -1430,6 +1416,7 @@ void folio_add_file_rmap_range(struct folio *folio, struct page *page,
 				nr = 0;
 			}
 		}
+		break;
 	}
 
 	if (nr_pmdmapped)
@@ -1443,6 +1430,43 @@ void folio_add_file_rmap_range(struct folio *folio, struct page *page,
 		mlock_vma_folio(folio, vma);
 }
 
+/**
+ * folio_add_file_rmap_ptes - add PTE mappings to a page range of a folio
+ * @folio:	The folio to add the mappings to
+ * @page:	The first page to add
+ * @nr_pages:	The number of pages that will be mapped using PTEs
+ * @vma:	The vm area in which the mappings are added
+ *
+ * The page range of the folio is defined by [page, page + nr_pages)
+ *
+ * The caller needs to hold the page table lock.
+ */
+void folio_add_file_rmap_ptes(struct folio *folio, struct page *page,
+		int nr_pages, struct vm_area_struct *vma)
+{
+	__folio_add_file_rmap(folio, page, nr_pages, vma, RMAP_LEVEL_PTE);
+}
+
+/**
+ * folio_add_file_rmap_pmd - add a PMD mapping to a page range of a folio
+ * @folio:	The folio to add the mapping to
+ * @page:	The first page to add
+ * @vma:	The vm area in which the mapping is added
+ *
+ * The page range of the folio is defined by [page, page + HPAGE_PMD_NR)
+ *
+ * The caller needs to hold the page table lock.
+ */
+void folio_add_file_rmap_pmd(struct folio *folio, struct page *page,
+		struct vm_area_struct *vma)
+{
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+	__folio_add_file_rmap(folio, page, HPAGE_PMD_NR, vma, RMAP_LEVEL_PMD);
+#else
+	WARN_ON_ONCE(true);
+#endif
+}
+
 /**
  * page_add_file_rmap - add pte mapping to a file page
  * @page:	the page to add the mapping to
@@ -1455,16 +1479,13 @@ void page_add_file_rmap(struct page *page, struct vm_area_struct *vma,
 		bool compound)
 {
 	struct folio *folio = page_folio(page);
-	unsigned int nr_pages;
 
 	VM_WARN_ON_ONCE_PAGE(compound && !PageTransHuge(page), page);
 
 	if (likely(!compound))
-		nr_pages = 1;
+		folio_add_file_rmap_pte(folio, page, vma);
 	else
-		nr_pages = folio_nr_pages(folio);
-
-	folio_add_file_rmap_range(folio, page, nr_pages, vma, compound);
+		folio_add_file_rmap_pmd(folio, page, vma);
 }
 
 /**
-- 
cgit v1.2.3


From be6e57cfabe99a5d3b3869103c4ea0ed4a9692d4 Mon Sep 17 00:00:00 2001
From: David Hildenbrand <david@redhat.com>
Date: Wed, 20 Dec 2023 23:44:36 +0100
Subject: mm/rmap: remove page_add_file_rmap()

All users are gone, let's remove it.

Link: https://lkml.kernel.org/r/20231220224504.646757-13-david@redhat.com
Signed-off-by: David Hildenbrand <david@redhat.com>
Reviewed-by: Yin Fengwei <fengwei.yin@intel.com>
Reviewed-by: Ryan Roberts <ryan.roberts@arm.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Muchun Song <muchun.song@linux.dev>
Cc: Muchun Song <songmuchun@bytedance.com>
Cc: Peter Xu <peterx@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/rmap.h |  2 --
 mm/rmap.c            | 21 ---------------------
 2 files changed, 23 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/rmap.h b/include/linux/rmap.h
index 3d86a76b2836..6a4db6933e7d 100644
--- a/include/linux/rmap.h
+++ b/include/linux/rmap.h
@@ -237,8 +237,6 @@ void page_add_anon_rmap(struct page *, struct vm_area_struct *,
 		unsigned long address, rmap_t flags);
 void folio_add_new_anon_rmap(struct folio *, struct vm_area_struct *,
 		unsigned long address);
-void page_add_file_rmap(struct page *, struct vm_area_struct *,
-		bool compound);
 void folio_add_file_rmap_ptes(struct folio *, struct page *, int nr_pages,
 		struct vm_area_struct *);
 #define folio_add_file_rmap_pte(folio, page, vma) \
diff --git a/mm/rmap.c b/mm/rmap.c
index cc1fc2d570f0..5ab5ef10fbf5 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -1467,27 +1467,6 @@ void folio_add_file_rmap_pmd(struct folio *folio, struct page *page,
 #endif
 }
 
-/**
- * page_add_file_rmap - add pte mapping to a file page
- * @page:	the page to add the mapping to
- * @vma:	the vm area in which the mapping is added
- * @compound:	charge the page as compound or small page
- *
- * The caller needs to hold the pte lock.
- */
-void page_add_file_rmap(struct page *page, struct vm_area_struct *vma,
-		bool compound)
-{
-	struct folio *folio = page_folio(page);
-
-	VM_WARN_ON_ONCE_PAGE(compound && !PageTransHuge(page), page);
-
-	if (likely(!compound))
-		folio_add_file_rmap_pte(folio, page, vma);
-	else
-		folio_add_file_rmap_pmd(folio, page, vma);
-}
-
 /**
  * page_remove_rmap - take down pte mapping from a page
  * @page:	page to remove mapping from
-- 
cgit v1.2.3


From 8bd5130070fbf2247a97c5361427a810522ac98a Mon Sep 17 00:00:00 2001
From: David Hildenbrand <david@redhat.com>
Date: Wed, 20 Dec 2023 23:44:38 +0100
Subject: mm/rmap: introduce folio_add_anon_rmap_[pte|ptes|pmd]()

Let's mimic what we did with folio_add_file_rmap_*() so we can similarly
replace page_add_anon_rmap() next.

Make the compiler always special-case on the granularity by using
__always_inline.

For the PageAnonExclusive sanity checks, when adding a PMD mapping, we're
now also checking each individual subpage covered by that PMD, instead of
only the head page.

Note that the new functions ignore the RMAP_COMPOUND flag, which we will
remove as soon as page_add_anon_rmap() is gone.

Link: https://lkml.kernel.org/r/20231220224504.646757-15-david@redhat.com
Signed-off-by: David Hildenbrand <david@redhat.com>
Reviewed-by: Yin Fengwei <fengwei.yin@intel.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Muchun Song <muchun.song@linux.dev>
Cc: Muchun Song <songmuchun@bytedance.com>
Cc: Peter Xu <peterx@redhat.com>
Cc: Ryan Roberts <ryan.roberts@arm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/rmap.h |   6 +++
 mm/rmap.c            | 120 +++++++++++++++++++++++++++++++++++----------------
 2 files changed, 88 insertions(+), 38 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/rmap.h b/include/linux/rmap.h
index 6a4db6933e7d..b5da3d86200e 100644
--- a/include/linux/rmap.h
+++ b/include/linux/rmap.h
@@ -233,6 +233,12 @@ static inline void __folio_rmap_sanity_checks(struct folio *folio,
  * rmap interfaces called when adding or removing pte of page
  */
 void folio_move_anon_rmap(struct folio *, struct vm_area_struct *);
+void folio_add_anon_rmap_ptes(struct folio *, struct page *, int nr_pages,
+		struct vm_area_struct *, unsigned long address, rmap_t flags);
+#define folio_add_anon_rmap_pte(folio, page, vma, address, flags) \
+	folio_add_anon_rmap_ptes(folio, page, 1, vma, address, flags)
+void folio_add_anon_rmap_pmd(struct folio *, struct page *,
+		struct vm_area_struct *, unsigned long address, rmap_t flags);
 void page_add_anon_rmap(struct page *, struct vm_area_struct *,
 		unsigned long address, rmap_t flags);
 void folio_add_new_anon_rmap(struct folio *, struct vm_area_struct *,
diff --git a/mm/rmap.c b/mm/rmap.c
index 895a8534a935..7f380f5a34c9 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -1299,40 +1299,20 @@ void page_add_anon_rmap(struct page *page, struct vm_area_struct *vma,
 		unsigned long address, rmap_t flags)
 {
 	struct folio *folio = page_folio(page);
-	atomic_t *mapped = &folio->_nr_pages_mapped;
-	int nr = 0, nr_pmdmapped = 0;
-	bool compound = flags & RMAP_COMPOUND;
-	bool first;
-
-	VM_WARN_ON_FOLIO(folio_test_hugetlb(folio), folio);
 
-	/* Is page being mapped by PTE? Is this its first map to be added? */
-	if (likely(!compound)) {
-		first = atomic_inc_and_test(&page->_mapcount);
-		nr = first;
-		if (first && folio_test_large(folio)) {
-			nr = atomic_inc_return_relaxed(mapped);
-			nr = (nr < COMPOUND_MAPPED);
-		}
-	} else if (folio_test_pmd_mappable(folio)) {
-		/* That test is redundant: it's for safety or to optimize out */
+	if (likely(!(flags & RMAP_COMPOUND)))
+		folio_add_anon_rmap_pte(folio, page, vma, address, flags);
+	else
+		folio_add_anon_rmap_pmd(folio, page, vma, address, flags);
+}
 
-		first = atomic_inc_and_test(&folio->_entire_mapcount);
-		if (first) {
-			nr = atomic_add_return_relaxed(COMPOUND_MAPPED, mapped);
-			if (likely(nr < COMPOUND_MAPPED + COMPOUND_MAPPED)) {
-				nr_pmdmapped = folio_nr_pages(folio);
-				nr = nr_pmdmapped - (nr & FOLIO_PAGES_MAPPED);
-				/* Raced ahead of a remove and another add? */
-				if (unlikely(nr < 0))
-					nr = 0;
-			} else {
-				/* Raced ahead of a remove of COMPOUND_MAPPED */
-				nr = 0;
-			}
-		}
-	}
+static __always_inline void __folio_add_anon_rmap(struct folio *folio,
+		struct page *page, int nr_pages, struct vm_area_struct *vma,
+		unsigned long address, rmap_t flags, enum rmap_level level)
+{
+	int i, nr, nr_pmdmapped = 0;
 
+	nr = __folio_add_rmap(folio, page, nr_pages, level, &nr_pmdmapped);
 	if (nr_pmdmapped)
 		__lruvec_stat_mod_folio(folio, NR_ANON_THPS, nr_pmdmapped);
 	if (nr)
@@ -1346,18 +1326,34 @@ void page_add_anon_rmap(struct page *page, struct vm_area_struct *vma,
 		 * folio->index right when not given the address of the head
 		 * page.
 		 */
-		VM_WARN_ON_FOLIO(folio_test_large(folio) && !compound, folio);
+		VM_WARN_ON_FOLIO(folio_test_large(folio) &&
+				 level != RMAP_LEVEL_PMD, folio);
 		__folio_set_anon(folio, vma, address,
 				 !!(flags & RMAP_EXCLUSIVE));
 	} else if (likely(!folio_test_ksm(folio))) {
 		__page_check_anon_rmap(folio, page, vma, address);
 	}
-	if (flags & RMAP_EXCLUSIVE)
-		SetPageAnonExclusive(page);
-	/* While PTE-mapping a THP we have a PMD and a PTE mapping. */
-	VM_WARN_ON_FOLIO((atomic_read(&page->_mapcount) > 0 ||
-			  (folio_test_large(folio) && folio_entire_mapcount(folio) > 1)) &&
-			 PageAnonExclusive(page), folio);
+
+	if (flags & RMAP_EXCLUSIVE) {
+		switch (level) {
+		case RMAP_LEVEL_PTE:
+			for (i = 0; i < nr_pages; i++)
+				SetPageAnonExclusive(page + i);
+			break;
+		case RMAP_LEVEL_PMD:
+			SetPageAnonExclusive(page);
+			break;
+		}
+	}
+	for (i = 0; i < nr_pages; i++) {
+		struct page *cur_page = page + i;
+
+		/* While PTE-mapping a THP we have a PMD and a PTE mapping. */
+		VM_WARN_ON_FOLIO((atomic_read(&cur_page->_mapcount) > 0 ||
+				  (folio_test_large(folio) &&
+				   folio_entire_mapcount(folio) > 1)) &&
+				 PageAnonExclusive(cur_page), folio);
+	}
 
 	/*
 	 * For large folio, only mlock it if it's fully mapped to VMA. It's
@@ -1369,6 +1365,54 @@ void page_add_anon_rmap(struct page *page, struct vm_area_struct *vma,
 		mlock_vma_folio(folio, vma);
 }
 
+/**
+ * folio_add_anon_rmap_ptes - add PTE mappings to a page range of an anon folio
+ * @folio:	The folio to add the mappings to
+ * @page:	The first page to add
+ * @nr_pages:	The number of pages which will be mapped
+ * @vma:	The vm area in which the mappings are added
+ * @address:	The user virtual address of the first page to map
+ * @flags:	The rmap flags
+ *
+ * The page range of folio is defined by [first_page, first_page + nr_pages)
+ *
+ * The caller needs to hold the page table lock, and the page must be locked in
+ * the anon_vma case: to serialize mapping,index checking after setting,
+ * and to ensure that an anon folio is not being upgraded racily to a KSM folio
+ * (but KSM folios are never downgraded).
+ */
+void folio_add_anon_rmap_ptes(struct folio *folio, struct page *page,
+		int nr_pages, struct vm_area_struct *vma, unsigned long address,
+		rmap_t flags)
+{
+	__folio_add_anon_rmap(folio, page, nr_pages, vma, address, flags,
+			      RMAP_LEVEL_PTE);
+}
+
+/**
+ * folio_add_anon_rmap_pmd - add a PMD mapping to a page range of an anon folio
+ * @folio:	The folio to add the mapping to
+ * @page:	The first page to add
+ * @vma:	The vm area in which the mapping is added
+ * @address:	The user virtual address of the first page to map
+ * @flags:	The rmap flags
+ *
+ * The page range of folio is defined by [first_page, first_page + HPAGE_PMD_NR)
+ *
+ * The caller needs to hold the page table lock, and the page must be locked in
+ * the anon_vma case: to serialize mapping,index checking after setting.
+ */
+void folio_add_anon_rmap_pmd(struct folio *folio, struct page *page,
+		struct vm_area_struct *vma, unsigned long address, rmap_t flags)
+{
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+	__folio_add_anon_rmap(folio, page, HPAGE_PMD_NR, vma, address, flags,
+			      RMAP_LEVEL_PMD);
+#else
+	WARN_ON_ONCE(true);
+#endif
+}
+
 /**
  * folio_add_new_anon_rmap - Add mapping to a new anonymous folio.
  * @folio:	The folio to add the mapping to.
-- 
cgit v1.2.3


From 84f0169e6c8a613012722e0d63302f9da4a72099 Mon Sep 17 00:00:00 2001
From: David Hildenbrand <david@redhat.com>
Date: Wed, 20 Dec 2023 23:44:45 +0100
Subject: mm/rmap: remove page_add_anon_rmap()

All users are gone, remove it and all traces.

Link: https://lkml.kernel.org/r/20231220224504.646757-22-david@redhat.com
Signed-off-by: David Hildenbrand <david@redhat.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Muchun Song <muchun.song@linux.dev>
Cc: Muchun Song <songmuchun@bytedance.com>
Cc: Peter Xu <peterx@redhat.com>
Cc: Ryan Roberts <ryan.roberts@arm.com>
Cc: Yin Fengwei <fengwei.yin@intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/rmap.h |  2 --
 mm/rmap.c            | 31 ++++---------------------------
 2 files changed, 4 insertions(+), 29 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/rmap.h b/include/linux/rmap.h
index b5da3d86200e..fe7b5a8b0e75 100644
--- a/include/linux/rmap.h
+++ b/include/linux/rmap.h
@@ -239,8 +239,6 @@ void folio_add_anon_rmap_ptes(struct folio *, struct page *, int nr_pages,
 	folio_add_anon_rmap_ptes(folio, page, 1, vma, address, flags)
 void folio_add_anon_rmap_pmd(struct folio *, struct page *,
 		struct vm_area_struct *, unsigned long address, rmap_t flags);
-void page_add_anon_rmap(struct page *, struct vm_area_struct *,
-		unsigned long address, rmap_t flags);
 void folio_add_new_anon_rmap(struct folio *, struct vm_area_struct *,
 		unsigned long address);
 void folio_add_file_rmap_ptes(struct folio *, struct page *, int nr_pages,
diff --git a/mm/rmap.c b/mm/rmap.c
index 7f380f5a34c9..87415bbf2402 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -1270,7 +1270,7 @@ static void __page_check_anon_rmap(struct folio *folio, struct page *page,
 	 * The page's anon-rmap details (mapping and index) are guaranteed to
 	 * be set up correctly at this point.
 	 *
-	 * We have exclusion against page_add_anon_rmap because the caller
+	 * We have exclusion against folio_add_anon_rmap_*() because the caller
 	 * always holds the page locked.
 	 *
 	 * We have exclusion against folio_add_new_anon_rmap because those pages
@@ -1283,29 +1283,6 @@ static void __page_check_anon_rmap(struct folio *folio, struct page *page,
 		       page);
 }
 
-/**
- * page_add_anon_rmap - add pte mapping to an anonymous page
- * @page:	the page to add the mapping to
- * @vma:	the vm area in which the mapping is added
- * @address:	the user virtual address mapped
- * @flags:	the rmap flags
- *
- * The caller needs to hold the pte lock, and the page must be locked in
- * the anon_vma case: to serialize mapping,index checking after setting,
- * and to ensure that PageAnon is not being upgraded racily to PageKsm
- * (but PageKsm is never downgraded to PageAnon).
- */
-void page_add_anon_rmap(struct page *page, struct vm_area_struct *vma,
-		unsigned long address, rmap_t flags)
-{
-	struct folio *folio = page_folio(page);
-
-	if (likely(!(flags & RMAP_COMPOUND)))
-		folio_add_anon_rmap_pte(folio, page, vma, address, flags);
-	else
-		folio_add_anon_rmap_pmd(folio, page, vma, address, flags);
-}
-
 static __always_inline void __folio_add_anon_rmap(struct folio *folio,
 		struct page *page, int nr_pages, struct vm_area_struct *vma,
 		unsigned long address, rmap_t flags, enum rmap_level level)
@@ -1419,7 +1396,7 @@ void folio_add_anon_rmap_pmd(struct folio *folio, struct page *page,
  * @vma:	the vm area in which the mapping is added
  * @address:	the user virtual address mapped
  *
- * Like page_add_anon_rmap() but must only be called on *new* folios.
+ * Like folio_add_anon_rmap_*() but must only be called on *new* folios.
  * This means the inc-and-test can be bypassed.
  * The folio does not have to be locked.
  *
@@ -1479,7 +1456,7 @@ static __always_inline void __folio_add_file_rmap(struct folio *folio,
 	if (nr)
 		__lruvec_stat_mod_folio(folio, NR_FILE_MAPPED, nr);
 
-	/* See comments in page_add_anon_rmap() */
+	/* See comments in folio_add_anon_rmap_*() */
 	if (!folio_test_large(folio))
 		mlock_vma_folio(folio, vma);
 }
@@ -1593,7 +1570,7 @@ void page_remove_rmap(struct page *page, struct vm_area_struct *vma,
 
 	/*
 	 * It would be tidy to reset folio_test_anon mapping when fully
-	 * unmapped, but that might overwrite a racing page_add_anon_rmap
+	 * unmapped, but that might overwrite a racing folio_add_anon_rmap_*()
 	 * which increments mapcount after us but sets mapping before us:
 	 * so leave the reset to free_pages_prepare, and remember that
 	 * it's only reliable while mapped.
-- 
cgit v1.2.3


From 0cae959e3abf19ba62805f6e6a8b42b6cd9ed3e3 Mon Sep 17 00:00:00 2001
From: David Hildenbrand <david@redhat.com>
Date: Wed, 20 Dec 2023 23:44:46 +0100
Subject: mm/rmap: remove RMAP_COMPOUND

No longer used, let's remove it and clarify RMAP_NONE/RMAP_EXCLUSIVE a
bit.

Link: https://lkml.kernel.org/r/20231220224504.646757-23-david@redhat.com
Signed-off-by: David Hildenbrand <david@redhat.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Muchun Song <muchun.song@linux.dev>
Cc: Muchun Song <songmuchun@bytedance.com>
Cc: Peter Xu <peterx@redhat.com>
Cc: Ryan Roberts <ryan.roberts@arm.com>
Cc: Yin Fengwei <fengwei.yin@intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/rmap.h | 12 +++---------
 mm/rmap.c            |  2 --
 2 files changed, 3 insertions(+), 11 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/rmap.h b/include/linux/rmap.h
index fe7b5a8b0e75..bf6cb79aa7a0 100644
--- a/include/linux/rmap.h
+++ b/include/linux/rmap.h
@@ -177,20 +177,14 @@ struct anon_vma *folio_get_anon_vma(struct folio *folio);
 typedef int __bitwise rmap_t;
 
 /*
- * No special request: if the page is a subpage of a compound page, it is
- * mapped via a PTE. The mapped (sub)page is possibly shared between processes.
+ * No special request: A mapped anonymous (sub)page is possibly shared between
+ * processes.
  */
 #define RMAP_NONE		((__force rmap_t)0)
 
-/* The (sub)page is exclusive to a single process. */
+/* The anonymous (sub)page is exclusive to a single process. */
 #define RMAP_EXCLUSIVE		((__force rmap_t)BIT(0))
 
-/*
- * The compound page is not mapped via PTEs, but instead via a single PMD and
- * should be accounted accordingly.
- */
-#define RMAP_COMPOUND		((__force rmap_t)BIT(1))
-
 /*
  * Internally, we're using an enum to specify the granularity. We make the
  * compiler emit specialized code for each granularity.
diff --git a/mm/rmap.c b/mm/rmap.c
index 87415bbf2402..2b386b9f6791 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -2662,8 +2662,6 @@ void rmap_walk_locked(struct folio *folio, struct rmap_walk_control *rwc)
  * The following two functions are for anonymous (private mapped) hugepages.
  * Unlike common anonymous pages, anonymous hugepages have no accounting code
  * and no lru code, because we handle hugepages differently from common pages.
- *
- * RMAP_COMPOUND is ignored.
  */
 void hugetlb_add_anon_rmap(struct folio *folio, struct vm_area_struct *vma,
 		unsigned long address, rmap_t flags)
-- 
cgit v1.2.3


From b06dc281aa9901076898d4d0a7bde588f11bc204 Mon Sep 17 00:00:00 2001
From: David Hildenbrand <david@redhat.com>
Date: Wed, 20 Dec 2023 23:44:47 +0100
Subject: mm/rmap: introduce folio_remove_rmap_[pte|ptes|pmd]()

Let's mimic what we did with folio_add_file_rmap_*() and
folio_add_anon_rmap_*() so we can similarly replace page_remove_rmap()
next.

Make the compiler always special-case on the granularity by using
__always_inline.

We're adding folio_remove_rmap_ptes() handling right away, as we want to
use that soon for batching rmap operations when unmapping PTE-mapped large
folios.

Link: https://lkml.kernel.org/r/20231220224504.646757-24-david@redhat.com
Signed-off-by: David Hildenbrand <david@redhat.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Muchun Song <muchun.song@linux.dev>
Cc: Muchun Song <songmuchun@bytedance.com>
Cc: Peter Xu <peterx@redhat.com>
Cc: Ryan Roberts <ryan.roberts@arm.com>
Cc: Yin Fengwei <fengwei.yin@intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/rmap.h |  6 ++++
 mm/rmap.c            | 82 ++++++++++++++++++++++++++++++++++++++++++----------
 2 files changed, 72 insertions(+), 16 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/rmap.h b/include/linux/rmap.h
index bf6cb79aa7a0..57e045093f04 100644
--- a/include/linux/rmap.h
+++ b/include/linux/rmap.h
@@ -243,6 +243,12 @@ void folio_add_file_rmap_pmd(struct folio *, struct page *,
 		struct vm_area_struct *);
 void page_remove_rmap(struct page *, struct vm_area_struct *,
 		bool compound);
+void folio_remove_rmap_ptes(struct folio *, struct page *, int nr_pages,
+		struct vm_area_struct *);
+#define folio_remove_rmap_pte(folio, page, vma) \
+	folio_remove_rmap_ptes(folio, page, 1, vma)
+void folio_remove_rmap_pmd(struct folio *, struct page *,
+		struct vm_area_struct *);
 
 void hugetlb_add_anon_rmap(struct folio *, struct vm_area_struct *,
 		unsigned long address, rmap_t flags);
diff --git a/mm/rmap.c b/mm/rmap.c
index 2b386b9f6791..127318075395 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -1510,25 +1510,37 @@ void page_remove_rmap(struct page *page, struct vm_area_struct *vma,
 		bool compound)
 {
 	struct folio *folio = page_folio(page);
+
+	if (likely(!compound))
+		folio_remove_rmap_pte(folio, page, vma);
+	else
+		folio_remove_rmap_pmd(folio, page, vma);
+}
+
+static __always_inline void __folio_remove_rmap(struct folio *folio,
+		struct page *page, int nr_pages, struct vm_area_struct *vma,
+		enum rmap_level level)
+{
 	atomic_t *mapped = &folio->_nr_pages_mapped;
-	int nr = 0, nr_pmdmapped = 0;
-	bool last;
+	int last, nr = 0, nr_pmdmapped = 0;
 	enum node_stat_item idx;
 
-	VM_WARN_ON_FOLIO(folio_test_hugetlb(folio), folio);
-	VM_BUG_ON_PAGE(compound && !PageHead(page), page);
-
-	/* Is page being unmapped by PTE? Is this its last map to be removed? */
-	if (likely(!compound)) {
-		last = atomic_add_negative(-1, &page->_mapcount);
-		nr = last;
-		if (last && folio_test_large(folio)) {
-			nr = atomic_dec_return_relaxed(mapped);
-			nr = (nr < COMPOUND_MAPPED);
-		}
-	} else if (folio_test_pmd_mappable(folio)) {
-		/* That test is redundant: it's for safety or to optimize out */
+	__folio_rmap_sanity_checks(folio, page, nr_pages, level);
+
+	switch (level) {
+	case RMAP_LEVEL_PTE:
+		do {
+			last = atomic_add_negative(-1, &page->_mapcount);
+			if (last && folio_test_large(folio)) {
+				last = atomic_dec_return_relaxed(mapped);
+				last = (last < COMPOUND_MAPPED);
+			}
 
+			if (last)
+				nr++;
+		} while (page++, --nr_pages > 0);
+		break;
+	case RMAP_LEVEL_PMD:
 		last = atomic_add_negative(-1, &folio->_entire_mapcount);
 		if (last) {
 			nr = atomic_sub_return_relaxed(COMPOUND_MAPPED, mapped);
@@ -1543,6 +1555,7 @@ void page_remove_rmap(struct page *page, struct vm_area_struct *vma,
 				nr = 0;
 			}
 		}
+		break;
 	}
 
 	if (nr_pmdmapped) {
@@ -1564,7 +1577,7 @@ void page_remove_rmap(struct page *page, struct vm_area_struct *vma,
 		 * is still mapped.
 		 */
 		if (folio_test_large(folio) && folio_test_anon(folio))
-			if (!compound || nr < nr_pmdmapped)
+			if (level == RMAP_LEVEL_PTE || nr < nr_pmdmapped)
 				deferred_split_folio(folio);
 	}
 
@@ -1579,6 +1592,43 @@ void page_remove_rmap(struct page *page, struct vm_area_struct *vma,
 	munlock_vma_folio(folio, vma);
 }
 
+/**
+ * folio_remove_rmap_ptes - remove PTE mappings from a page range of a folio
+ * @folio:	The folio to remove the mappings from
+ * @page:	The first page to remove
+ * @nr_pages:	The number of pages that will be removed from the mapping
+ * @vma:	The vm area from which the mappings are removed
+ *
+ * The page range of the folio is defined by [page, page + nr_pages)
+ *
+ * The caller needs to hold the page table lock.
+ */
+void folio_remove_rmap_ptes(struct folio *folio, struct page *page,
+		int nr_pages, struct vm_area_struct *vma)
+{
+	__folio_remove_rmap(folio, page, nr_pages, vma, RMAP_LEVEL_PTE);
+}
+
+/**
+ * folio_remove_rmap_pmd - remove a PMD mapping from a page range of a folio
+ * @folio:	The folio to remove the mapping from
+ * @page:	The first page to remove
+ * @vma:	The vm area from which the mapping is removed
+ *
+ * The page range of the folio is defined by [page, page + HPAGE_PMD_NR)
+ *
+ * The caller needs to hold the page table lock.
+ */
+void folio_remove_rmap_pmd(struct folio *folio, struct page *page,
+		struct vm_area_struct *vma)
+{
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+	__folio_remove_rmap(folio, page, HPAGE_PMD_NR, vma, RMAP_LEVEL_PMD);
+#else
+	WARN_ON_ONCE(true);
+#endif
+}
+
 /*
  * @arg: enum ttu_flags will be passed to this argument
  */
-- 
cgit v1.2.3


From 4d8f7418e8ba36036c8486d92d9591c368ab9b85 Mon Sep 17 00:00:00 2001
From: David Hildenbrand <david@redhat.com>
Date: Wed, 20 Dec 2023 23:44:56 +0100
Subject: mm/rmap: remove page_remove_rmap()

All callers are gone, let's remove it and some leftover traces.

Link: https://lkml.kernel.org/r/20231220224504.646757-33-david@redhat.com
Signed-off-by: David Hildenbrand <david@redhat.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Muchun Song <muchun.song@linux.dev>
Cc: Muchun Song <songmuchun@bytedance.com>
Cc: Peter Xu <peterx@redhat.com>
Cc: Ryan Roberts <ryan.roberts@arm.com>
Cc: Yin Fengwei <fengwei.yin@intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/rmap.h |  4 +---
 mm/filemap.c         | 10 +++++-----
 mm/internal.h        |  2 +-
 mm/memory-failure.c  |  4 ++--
 mm/rmap.c            | 23 ++---------------------
 5 files changed, 11 insertions(+), 32 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/rmap.h b/include/linux/rmap.h
index 57e045093f04..fef369e37039 100644
--- a/include/linux/rmap.h
+++ b/include/linux/rmap.h
@@ -241,8 +241,6 @@ void folio_add_file_rmap_ptes(struct folio *, struct page *, int nr_pages,
 	folio_add_file_rmap_ptes(folio, page, 1, vma)
 void folio_add_file_rmap_pmd(struct folio *, struct page *,
 		struct vm_area_struct *);
-void page_remove_rmap(struct page *, struct vm_area_struct *,
-		bool compound);
 void folio_remove_rmap_ptes(struct folio *, struct page *, int nr_pages,
 		struct vm_area_struct *);
 #define folio_remove_rmap_pte(folio, page, vma) \
@@ -389,7 +387,7 @@ dup:
  *
  * This is similar to page_try_dup_anon_rmap(), however, not used during fork()
  * to duplicate a mapping, but instead to prepare for KSM or temporarily
- * unmapping a page (swap, migration) via page_remove_rmap().
+ * unmapping a page (swap, migration) via folio_remove_rmap_*().
  *
  * Marking the page shared can only fail if the page may be pinned; device
  * private pages cannot get pinned and consequently this function cannot fail.
diff --git a/mm/filemap.c b/mm/filemap.c
index 67ba56ecdd32..c8dafe70d4cc 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -113,11 +113,11 @@
  *    ->i_pages lock		(try_to_unmap_one)
  *    ->lruvec->lru_lock	(follow_page->mark_page_accessed)
  *    ->lruvec->lru_lock	(check_pte_range->isolate_lru_page)
- *    ->private_lock		(page_remove_rmap->set_page_dirty)
- *    ->i_pages lock		(page_remove_rmap->set_page_dirty)
- *    bdi.wb->list_lock		(page_remove_rmap->set_page_dirty)
- *    ->inode->i_lock		(page_remove_rmap->set_page_dirty)
- *    ->memcg->move_lock	(page_remove_rmap->folio_memcg_lock)
+ *    ->private_lock		(folio_remove_rmap_pte->set_page_dirty)
+ *    ->i_pages lock		(folio_remove_rmap_pte->set_page_dirty)
+ *    bdi.wb->list_lock		(folio_remove_rmap_pte->set_page_dirty)
+ *    ->inode->i_lock		(folio_remove_rmap_pte->set_page_dirty)
+ *    ->memcg->move_lock	(folio_remove_rmap_pte->folio_memcg_lock)
  *    bdi.wb->list_lock		(zap_pte_range->set_page_dirty)
  *    ->inode->i_lock		(zap_pte_range->set_page_dirty)
  *    ->private_lock		(zap_pte_range->block_dirty_folio)
diff --git a/mm/internal.h b/mm/internal.h
index 222e63b2dea4..a94355e70bd7 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -651,7 +651,7 @@ folio_within_vma(struct folio *folio, struct vm_area_struct *vma)
  * under page table lock for the pte/pmd being added or removed.
  *
  * mlock is usually called at the end of page_add_*_rmap(), munlock at
- * the end of page_remove_rmap(); but new anon folios are managed by
+ * the end of folio_remove_rmap_*(); but new anon folios are managed by
  * folio_add_lru_vma() calling mlock_new_folio().
  */
 void mlock_folio(struct folio *folio);
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index 5a23da5eb8c1..a0d9b4ac7d54 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -2315,8 +2315,8 @@ try_again:
 	 * We use page flags to determine what action should be taken, but
 	 * the flags can be modified by the error containment action.  One
 	 * example is an mlocked page, where PG_mlocked is cleared by
-	 * page_remove_rmap() in try_to_unmap_one(). So to determine page status
-	 * correctly, we save a copy of the page flags at this time.
+	 * folio_remove_rmap_*() in try_to_unmap_one(). So to determine page
+	 * status correctly, we save a copy of the page flags at this time.
 	 */
 	page_flags = p->flags;
 
diff --git a/mm/rmap.c b/mm/rmap.c
index a3ec2be484cf..3ee254a99622 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -470,7 +470,7 @@ void __init anon_vma_init(void)
 /*
  * Getting a lock on a stable anon_vma from a page off the LRU is tricky!
  *
- * Since there is no serialization what so ever against page_remove_rmap()
+ * Since there is no serialization what so ever against folio_remove_rmap_*()
  * the best this function can do is return a refcount increased anon_vma
  * that might have been relevant to this page.
  *
@@ -487,7 +487,7 @@ void __init anon_vma_init(void)
  * [ something equivalent to page_mapped_in_vma() ].
  *
  * Since anon_vma's slab is SLAB_TYPESAFE_BY_RCU and we know from
- * page_remove_rmap() that the anon_vma pointer from page->mapping is valid
+ * folio_remove_rmap_*() that the anon_vma pointer from page->mapping is valid
  * if there is a mapcount, we can dereference the anon_vma after observing
  * those.
  *
@@ -1498,25 +1498,6 @@ void folio_add_file_rmap_pmd(struct folio *folio, struct page *page,
 #endif
 }
 
-/**
- * page_remove_rmap - take down pte mapping from a page
- * @page:	page to remove mapping from
- * @vma:	the vm area from which the mapping is removed
- * @compound:	uncharge the page as compound or small page
- *
- * The caller needs to hold the pte lock.
- */
-void page_remove_rmap(struct page *page, struct vm_area_struct *vma,
-		bool compound)
-{
-	struct folio *folio = page_folio(page);
-
-	if (likely(!compound))
-		folio_remove_rmap_pte(folio, page, vma);
-	else
-		folio_remove_rmap_pmd(folio, page, vma);
-}
-
 static __always_inline void __folio_remove_rmap(struct folio *folio,
 		struct page *page, int nr_pages, struct vm_area_struct *vma,
 		enum rmap_level level)
-- 
cgit v1.2.3


From d8ef5e311d7bfde54b60ab45026f206eff31b2d2 Mon Sep 17 00:00:00 2001
From: David Hildenbrand <david@redhat.com>
Date: Wed, 20 Dec 2023 23:44:57 +0100
Subject: mm/rmap: convert page_dup_file_rmap() to
 folio_dup_file_rmap_[pte|ptes|pmd]()

Let's convert page_dup_file_rmap() like the other rmap functions.  As
there is only a single caller, convert that single caller right away and
remove page_dup_file_rmap().

Add folio_dup_file_rmap_ptes() right away, we want to perform rmap baching
during fork() soon.

Link: https://lkml.kernel.org/r/20231220224504.646757-34-david@redhat.com
Signed-off-by: David Hildenbrand <david@redhat.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Muchun Song <muchun.song@linux.dev>
Cc: Muchun Song <songmuchun@bytedance.com>
Cc: Peter Xu <peterx@redhat.com>
Cc: Ryan Roberts <ryan.roberts@arm.com>
Cc: Yin Fengwei <fengwei.yin@intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/rmap.h | 59 +++++++++++++++++++++++++++++++++++++++++++++++-----
 mm/memory.c          |  2 +-
 2 files changed, 55 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/rmap.h b/include/linux/rmap.h
index fef369e37039..7607f862e795 100644
--- a/include/linux/rmap.h
+++ b/include/linux/rmap.h
@@ -308,6 +308,60 @@ static inline void hugetlb_remove_rmap(struct folio *folio)
 	atomic_dec(&folio->_entire_mapcount);
 }
 
+static __always_inline void __folio_dup_file_rmap(struct folio *folio,
+		struct page *page, int nr_pages, enum rmap_level level)
+{
+	__folio_rmap_sanity_checks(folio, page, nr_pages, level);
+
+	switch (level) {
+	case RMAP_LEVEL_PTE:
+		do {
+			atomic_inc(&page->_mapcount);
+		} while (page++, --nr_pages > 0);
+		break;
+	case RMAP_LEVEL_PMD:
+		atomic_inc(&folio->_entire_mapcount);
+		break;
+	}
+}
+
+/**
+ * folio_dup_file_rmap_ptes - duplicate PTE mappings of a page range of a folio
+ * @folio:	The folio to duplicate the mappings of
+ * @page:	The first page to duplicate the mappings of
+ * @nr_pages:	The number of pages of which the mapping will be duplicated
+ *
+ * The page range of the folio is defined by [page, page + nr_pages)
+ *
+ * The caller needs to hold the page table lock.
+ */
+static inline void folio_dup_file_rmap_ptes(struct folio *folio,
+		struct page *page, int nr_pages)
+{
+	__folio_dup_file_rmap(folio, page, nr_pages, RMAP_LEVEL_PTE);
+}
+#define folio_dup_file_rmap_pte(folio, page) \
+	folio_dup_file_rmap_ptes(folio, page, 1)
+
+/**
+ * folio_dup_file_rmap_pmd - duplicate a PMD mapping of a page range of a folio
+ * @folio:	The folio to duplicate the mapping of
+ * @page:	The first page to duplicate the mapping of
+ *
+ * The page range of the folio is defined by [page, page + HPAGE_PMD_NR)
+ *
+ * The caller needs to hold the page table lock.
+ */
+static inline void folio_dup_file_rmap_pmd(struct folio *folio,
+		struct page *page)
+{
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+	__folio_dup_file_rmap(folio, page, HPAGE_PMD_NR, RMAP_LEVEL_PTE);
+#else
+	WARN_ON_ONCE(true);
+#endif
+}
+
 static inline void __page_dup_rmap(struct page *page, bool compound)
 {
 	VM_WARN_ON(folio_test_hugetlb(page_folio(page)));
@@ -322,11 +376,6 @@ static inline void __page_dup_rmap(struct page *page, bool compound)
 	}
 }
 
-static inline void page_dup_file_rmap(struct page *page, bool compound)
-{
-	__page_dup_rmap(page, compound);
-}
-
 /**
  * page_try_dup_anon_rmap - try duplicating a mapping of an already mapped
  *			    anonymous page
diff --git a/mm/memory.c b/mm/memory.c
index e52c6e97444a..fdc87bf15545 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -965,7 +965,7 @@ copy_present_pte(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma,
 		rss[MM_ANONPAGES]++;
 	} else if (page) {
 		folio_get(folio);
-		page_dup_file_rmap(page, false);
+		folio_dup_file_rmap_pte(folio, page);
 		rss[mm_counter_file(page)]++;
 	}
 
-- 
cgit v1.2.3


From 61d90309b7156d54c5d358cb5d8bf55b33d233d2 Mon Sep 17 00:00:00 2001
From: David Hildenbrand <david@redhat.com>
Date: Wed, 20 Dec 2023 23:44:58 +0100
Subject: mm/rmap: introduce folio_try_dup_anon_rmap_[pte|ptes|pmd]()

The last user of page_needs_cow_for_dma() and __page_dup_rmap() are gone,
remove them.

Add folio_try_dup_anon_rmap_ptes() right away, we want to perform rmap
baching during fork() soon.

Link: https://lkml.kernel.org/r/20231220224504.646757-35-david@redhat.com
Signed-off-by: David Hildenbrand <david@redhat.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Muchun Song <muchun.song@linux.dev>
Cc: Muchun Song <songmuchun@bytedance.com>
Cc: Peter Xu <peterx@redhat.com>
Cc: Ryan Roberts <ryan.roberts@arm.com>
Cc: Yin Fengwei <fengwei.yin@intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/mm.h   |   6 ---
 include/linux/rmap.h | 150 ++++++++++++++++++++++++++++++++++++---------------
 2 files changed, 106 insertions(+), 50 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 75bba6102825..896c0079f64f 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1975,12 +1975,6 @@ static inline bool folio_needs_cow_for_dma(struct vm_area_struct *vma,
 	return folio_maybe_dma_pinned(folio);
 }
 
-static inline bool page_needs_cow_for_dma(struct vm_area_struct *vma,
-					  struct page *page)
-{
-	return folio_needs_cow_for_dma(vma, page_folio(page));
-}
-
 /**
  * is_zero_page - Query if a page is a zero page
  * @page: The page to query
diff --git a/include/linux/rmap.h b/include/linux/rmap.h
index 7607f862e795..850aa74b6724 100644
--- a/include/linux/rmap.h
+++ b/include/linux/rmap.h
@@ -362,68 +362,130 @@ static inline void folio_dup_file_rmap_pmd(struct folio *folio,
 #endif
 }
 
-static inline void __page_dup_rmap(struct page *page, bool compound)
+static __always_inline int __folio_try_dup_anon_rmap(struct folio *folio,
+		struct page *page, int nr_pages, struct vm_area_struct *src_vma,
+		enum rmap_level level)
 {
-	VM_WARN_ON(folio_test_hugetlb(page_folio(page)));
+	bool maybe_pinned;
+	int i;
+
+	VM_WARN_ON_FOLIO(!folio_test_anon(folio), folio);
+	__folio_rmap_sanity_checks(folio, page, nr_pages, level);
 
-	if (compound) {
-		struct folio *folio = (struct folio *)page;
+	/*
+	 * If this folio may have been pinned by the parent process,
+	 * don't allow to duplicate the mappings but instead require to e.g.,
+	 * copy the subpage immediately for the child so that we'll always
+	 * guarantee the pinned folio won't be randomly replaced in the
+	 * future on write faults.
+	 */
+	maybe_pinned = likely(!folio_is_device_private(folio)) &&
+		       unlikely(folio_needs_cow_for_dma(src_vma, folio));
 
-		VM_BUG_ON_PAGE(compound && !PageHead(page), page);
+	/*
+	 * No need to check+clear for already shared PTEs/PMDs of the
+	 * folio. But if any page is PageAnonExclusive, we must fallback to
+	 * copying if the folio maybe pinned.
+	 */
+	switch (level) {
+	case RMAP_LEVEL_PTE:
+		if (unlikely(maybe_pinned)) {
+			for (i = 0; i < nr_pages; i++)
+				if (PageAnonExclusive(page + i))
+					return -EBUSY;
+		}
+		do {
+			if (PageAnonExclusive(page))
+				ClearPageAnonExclusive(page);
+			atomic_inc(&page->_mapcount);
+		} while (page++, --nr_pages > 0);
+		break;
+	case RMAP_LEVEL_PMD:
+		if (PageAnonExclusive(page)) {
+			if (unlikely(maybe_pinned))
+				return -EBUSY;
+			ClearPageAnonExclusive(page);
+		}
 		atomic_inc(&folio->_entire_mapcount);
-	} else {
-		atomic_inc(&page->_mapcount);
+		break;
 	}
+	return 0;
 }
 
 /**
- * page_try_dup_anon_rmap - try duplicating a mapping of an already mapped
- *			    anonymous page
- * @page: the page to duplicate the mapping for
- * @compound: the page is mapped as compound or as a small page
- * @vma: the source vma
+ * folio_try_dup_anon_rmap_ptes - try duplicating PTE mappings of a page range
+ *				  of a folio
+ * @folio:	The folio to duplicate the mappings of
+ * @page:	The first page to duplicate the mappings of
+ * @nr_pages:	The number of pages of which the mapping will be duplicated
+ * @src_vma:	The vm area from which the mappings are duplicated
  *
- * The caller needs to hold the PT lock and the vma->vma_mm->write_protect_seq.
+ * The page range of the folio is defined by [page, page + nr_pages)
  *
- * Duplicating the mapping can only fail if the page may be pinned; device
- * private pages cannot get pinned and consequently this function cannot fail.
+ * The caller needs to hold the page table lock and the
+ * vma->vma_mm->write_protect_seq.
+ *
+ * Duplicating the mappings can only fail if the folio may be pinned; device
+ * private folios cannot get pinned and consequently this function cannot fail
+ * for them.
+ *
+ * If duplicating the mappings succeeded, the duplicated PTEs have to be R/O in
+ * the parent and the child. They must *not* be writable after this call
+ * succeeded.
+ *
+ * Returns 0 if duplicating the mappings succeeded. Returns -EBUSY otherwise.
+ */
+static inline int folio_try_dup_anon_rmap_ptes(struct folio *folio,
+		struct page *page, int nr_pages, struct vm_area_struct *src_vma)
+{
+	return __folio_try_dup_anon_rmap(folio, page, nr_pages, src_vma,
+					 RMAP_LEVEL_PTE);
+}
+#define folio_try_dup_anon_rmap_pte(folio, page, vma) \
+	folio_try_dup_anon_rmap_ptes(folio, page, 1, vma)
+
+/**
+ * folio_try_dup_anon_rmap_pmd - try duplicating a PMD mapping of a page range
+ *				 of a folio
+ * @folio:	The folio to duplicate the mapping of
+ * @page:	The first page to duplicate the mapping of
+ * @src_vma:	The vm area from which the mapping is duplicated
+ *
+ * The page range of the folio is defined by [page, page + HPAGE_PMD_NR)
  *
- * If duplicating the mapping succeeds, the page has to be mapped R/O into
- * the parent and the child. It must *not* get mapped writable after this call.
+ * The caller needs to hold the page table lock and the
+ * vma->vma_mm->write_protect_seq.
+ *
+ * Duplicating the mapping can only fail if the folio may be pinned; device
+ * private folios cannot get pinned and consequently this function cannot fail
+ * for them.
+ *
+ * If duplicating the mapping succeeds, the duplicated PMD has to be R/O in
+ * the parent and the child. They must *not* be writable after this call
+ * succeeded.
  *
  * Returns 0 if duplicating the mapping succeeded. Returns -EBUSY otherwise.
  */
+static inline int folio_try_dup_anon_rmap_pmd(struct folio *folio,
+		struct page *page, struct vm_area_struct *src_vma)
+{
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+	return __folio_try_dup_anon_rmap(folio, page, HPAGE_PMD_NR, src_vma,
+					 RMAP_LEVEL_PMD);
+#else
+	WARN_ON_ONCE(true);
+	return -EBUSY;
+#endif
+}
+
 static inline int page_try_dup_anon_rmap(struct page *page, bool compound,
 					 struct vm_area_struct *vma)
 {
-	VM_BUG_ON_PAGE(!PageAnon(page), page);
-
-	/*
-	 * No need to check+clear for already shared pages, including KSM
-	 * pages.
-	 */
-	if (!PageAnonExclusive(page))
-		goto dup;
-
-	/*
-	 * If this page may have been pinned by the parent process,
-	 * don't allow to duplicate the mapping but instead require to e.g.,
-	 * copy the page immediately for the child so that we'll always
-	 * guarantee the pinned page won't be randomly replaced in the
-	 * future on write faults.
-	 */
-	if (likely(!is_device_private_page(page)) &&
-	    unlikely(page_needs_cow_for_dma(vma, page)))
-		return -EBUSY;
+	struct folio *folio = page_folio(page);
 
-	ClearPageAnonExclusive(page);
-	/*
-	 * It's okay to share the anon page between both processes, mapping
-	 * the page R/O into both processes.
-	 */
-dup:
-	__page_dup_rmap(page, compound);
-	return 0;
+	if (likely(!compound))
+		return folio_try_dup_anon_rmap_pte(folio, page, vma);
+	return folio_try_dup_anon_rmap_pmd(folio, page, vma);
 }
 
 /**
-- 
cgit v1.2.3


From a13d096471ec0ac5c6fc90fbcd57e8430024046a Mon Sep 17 00:00:00 2001
From: David Hildenbrand <david@redhat.com>
Date: Wed, 20 Dec 2023 23:45:01 +0100
Subject: mm/rmap: remove page_try_dup_anon_rmap()

All users are gone, remove page_try_dup_anon_rmap() and any remaining
traces.

Link: https://lkml.kernel.org/r/20231220224504.646757-38-david@redhat.com
Signed-off-by: David Hildenbrand <david@redhat.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Muchun Song <muchun.song@linux.dev>
Cc: Muchun Song <songmuchun@bytedance.com>
Cc: Peter Xu <peterx@redhat.com>
Cc: Ryan Roberts <ryan.roberts@arm.com>
Cc: Yin Fengwei <fengwei.yin@intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/rmap.h | 16 +++-------------
 1 file changed, 3 insertions(+), 13 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/rmap.h b/include/linux/rmap.h
index 850aa74b6724..0ad2ea2734e4 100644
--- a/include/linux/rmap.h
+++ b/include/linux/rmap.h
@@ -253,7 +253,7 @@ void hugetlb_add_anon_rmap(struct folio *, struct vm_area_struct *,
 void hugetlb_add_new_anon_rmap(struct folio *, struct vm_area_struct *,
 		unsigned long address);
 
-/* See page_try_dup_anon_rmap() */
+/* See folio_try_dup_anon_rmap_*() */
 static inline int hugetlb_try_dup_anon_rmap(struct folio *folio,
 		struct vm_area_struct *vma)
 {
@@ -478,16 +478,6 @@ static inline int folio_try_dup_anon_rmap_pmd(struct folio *folio,
 #endif
 }
 
-static inline int page_try_dup_anon_rmap(struct page *page, bool compound,
-					 struct vm_area_struct *vma)
-{
-	struct folio *folio = page_folio(page);
-
-	if (likely(!compound))
-		return folio_try_dup_anon_rmap_pte(folio, page, vma);
-	return folio_try_dup_anon_rmap_pmd(folio, page, vma);
-}
-
 /**
  * page_try_share_anon_rmap - try marking an exclusive anonymous page possibly
  *			      shared to prepare for KSM or temporary unmapping
@@ -496,8 +486,8 @@ static inline int page_try_dup_anon_rmap(struct page *page, bool compound,
  * The caller needs to hold the PT lock and has to have the page table entry
  * cleared/invalidated.
  *
- * This is similar to page_try_dup_anon_rmap(), however, not used during fork()
- * to duplicate a mapping, but instead to prepare for KSM or temporarily
+ * This is similar to folio_try_dup_anon_rmap_*(), however, not used during
+ * fork() to duplicate a mapping, but instead to prepare for KSM or temporarily
  * unmapping a page (swap, migration) via folio_remove_rmap_*().
  *
  * Marking the page shared can only fail if the page may be pinned; device
-- 
cgit v1.2.3


From e3b4b1374f87c71e9309efc6149f113cdd17af72 Mon Sep 17 00:00:00 2001
From: David Hildenbrand <david@redhat.com>
Date: Wed, 20 Dec 2023 23:45:02 +0100
Subject: mm: convert page_try_share_anon_rmap() to
 folio_try_share_anon_rmap_[pte|pmd]()

Let's convert it like we converted all the other rmap functions.  Don't
introduce folio_try_share_anon_rmap_ptes() for now, as we don't have a
user that wants rmap batching in sight.  Pretty easy to add later.

All users are easy to convert -- only ksm.c doesn't use folios yet but
that is left for future work -- so let's just do it in a single shot.

While at it, turn the BUG_ON into a WARN_ON_ONCE.

Note that page_try_share_anon_rmap() so far didn't care about pte/pmd
mappings (no compound parameter).  We're changing that so we can perform
better sanity checks and make the code actually more readable/consistent.
For example, __folio_rmap_sanity_checks() will make sure that a PMD range
actually falls completely into the folio.

Link: https://lkml.kernel.org/r/20231220224504.646757-39-david@redhat.com
Signed-off-by: David Hildenbrand <david@redhat.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Muchun Song <muchun.song@linux.dev>
Cc: Muchun Song <songmuchun@bytedance.com>
Cc: Peter Xu <peterx@redhat.com>
Cc: Ryan Roberts <ryan.roberts@arm.com>
Cc: Yin Fengwei <fengwei.yin@intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/rmap.h | 96 ++++++++++++++++++++++++++++++++++++++--------------
 mm/gup.c             |  2 +-
 mm/huge_memory.c     |  9 ++---
 mm/internal.h        |  4 +--
 mm/ksm.c             |  5 +--
 mm/migrate_device.c  |  2 +-
 mm/rmap.c            | 11 +++---
 7 files changed, 89 insertions(+), 40 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/rmap.h b/include/linux/rmap.h
index 0ad2ea2734e4..fd6fe16fa358 100644
--- a/include/linux/rmap.h
+++ b/include/linux/rmap.h
@@ -269,7 +269,7 @@ static inline int hugetlb_try_dup_anon_rmap(struct folio *folio,
 	return 0;
 }
 
-/* See page_try_share_anon_rmap() */
+/* See folio_try_share_anon_rmap_*() */
 static inline int hugetlb_try_share_anon_rmap(struct folio *folio)
 {
 	VM_WARN_ON_FOLIO(!folio_test_hugetlb(folio), folio);
@@ -478,31 +478,15 @@ static inline int folio_try_dup_anon_rmap_pmd(struct folio *folio,
 #endif
 }
 
-/**
- * page_try_share_anon_rmap - try marking an exclusive anonymous page possibly
- *			      shared to prepare for KSM or temporary unmapping
- * @page: the exclusive anonymous page to try marking possibly shared
- *
- * The caller needs to hold the PT lock and has to have the page table entry
- * cleared/invalidated.
- *
- * This is similar to folio_try_dup_anon_rmap_*(), however, not used during
- * fork() to duplicate a mapping, but instead to prepare for KSM or temporarily
- * unmapping a page (swap, migration) via folio_remove_rmap_*().
- *
- * Marking the page shared can only fail if the page may be pinned; device
- * private pages cannot get pinned and consequently this function cannot fail.
- *
- * Returns 0 if marking the page possibly shared succeeded. Returns -EBUSY
- * otherwise.
- */
-static inline int page_try_share_anon_rmap(struct page *page)
+static __always_inline int __folio_try_share_anon_rmap(struct folio *folio,
+		struct page *page, int nr_pages, enum rmap_level level)
 {
-	VM_WARN_ON(folio_test_hugetlb(page_folio(page)));
-	VM_BUG_ON_PAGE(!PageAnon(page) || !PageAnonExclusive(page), page);
+	VM_WARN_ON_FOLIO(!folio_test_anon(folio), folio);
+	VM_WARN_ON_FOLIO(!PageAnonExclusive(page), folio);
+	__folio_rmap_sanity_checks(folio, page, nr_pages, level);
 
-	/* device private pages cannot get pinned via GUP. */
-	if (unlikely(is_device_private_page(page))) {
+	/* device private folios cannot get pinned via GUP. */
+	if (unlikely(folio_is_device_private(folio))) {
 		ClearPageAnonExclusive(page);
 		return 0;
 	}
@@ -553,7 +537,7 @@ static inline int page_try_share_anon_rmap(struct page *page)
 	if (IS_ENABLED(CONFIG_HAVE_FAST_GUP))
 		smp_mb();
 
-	if (unlikely(page_maybe_dma_pinned(page)))
+	if (unlikely(folio_maybe_dma_pinned(folio)))
 		return -EBUSY;
 	ClearPageAnonExclusive(page);
 
@@ -566,6 +550,68 @@ static inline int page_try_share_anon_rmap(struct page *page)
 	return 0;
 }
 
+/**
+ * folio_try_share_anon_rmap_pte - try marking an exclusive anonymous page
+ *				   mapped by a PTE possibly shared to prepare
+ *				   for KSM or temporary unmapping
+ * @folio:	The folio to share a mapping of
+ * @page:	The mapped exclusive page
+ *
+ * The caller needs to hold the page table lock and has to have the page table
+ * entries cleared/invalidated.
+ *
+ * This is similar to folio_try_dup_anon_rmap_pte(), however, not used during
+ * fork() to duplicate mappings, but instead to prepare for KSM or temporarily
+ * unmapping parts of a folio (swap, migration) via folio_remove_rmap_pte().
+ *
+ * Marking the mapped page shared can only fail if the folio maybe pinned;
+ * device private folios cannot get pinned and consequently this function cannot
+ * fail.
+ *
+ * Returns 0 if marking the mapped page possibly shared succeeded. Returns
+ * -EBUSY otherwise.
+ */
+static inline int folio_try_share_anon_rmap_pte(struct folio *folio,
+		struct page *page)
+{
+	return __folio_try_share_anon_rmap(folio, page, 1, RMAP_LEVEL_PTE);
+}
+
+/**
+ * folio_try_share_anon_rmap_pmd - try marking an exclusive anonymous page
+ *				   range mapped by a PMD possibly shared to
+ *				   prepare for temporary unmapping
+ * @folio:	The folio to share the mapping of
+ * @page:	The first page to share the mapping of
+ *
+ * The page range of the folio is defined by [page, page + HPAGE_PMD_NR)
+ *
+ * The caller needs to hold the page table lock and has to have the page table
+ * entries cleared/invalidated.
+ *
+ * This is similar to folio_try_dup_anon_rmap_pmd(), however, not used during
+ * fork() to duplicate a mapping, but instead to prepare for temporarily
+ * unmapping parts of a folio (swap, migration) via folio_remove_rmap_pmd().
+ *
+ * Marking the mapped pages shared can only fail if the folio maybe pinned;
+ * device private folios cannot get pinned and consequently this function cannot
+ * fail.
+ *
+ * Returns 0 if marking the mapped pages possibly shared succeeded. Returns
+ * -EBUSY otherwise.
+ */
+static inline int folio_try_share_anon_rmap_pmd(struct folio *folio,
+		struct page *page)
+{
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+	return __folio_try_share_anon_rmap(folio, page, HPAGE_PMD_NR,
+					   RMAP_LEVEL_PMD);
+#else
+	WARN_ON_ONCE(true);
+	return -EBUSY;
+#endif
+}
+
 /*
  * Called from mm/vmscan.c to handle paging out
  */
diff --git a/mm/gup.c b/mm/gup.c
index 0a5f0e91bfec..df83182ec72d 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -177,7 +177,7 @@ struct folio *try_grab_folio(struct page *page, int refs, unsigned int flags)
 	/*
 	 * Adjust the pincount before re-checking the PTE for changes.
 	 * This is essentially a smp_mb() and is paired with a memory
-	 * barrier in page_try_share_anon_rmap().
+	 * barrier in folio_try_share_anon_rmap_*().
 	 */
 	smp_mb__after_atomic();
 
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index de623b942b6e..1a588e29d287 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -2523,10 +2523,11 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
 		 * In case we cannot clear PageAnonExclusive(), split the PMD
 		 * only and let try_to_migrate_one() fail later.
 		 *
-		 * See page_try_share_anon_rmap(): invalidate PMD first.
+		 * See folio_try_share_anon_rmap_pmd(): invalidate PMD first.
 		 */
 		anon_exclusive = PageAnonExclusive(page);
-		if (freeze && anon_exclusive && page_try_share_anon_rmap(page))
+		if (freeze && anon_exclusive &&
+		    folio_try_share_anon_rmap_pmd(folio, page))
 			freeze = false;
 		if (!freeze) {
 			rmap_t rmap_flags = RMAP_NONE;
@@ -3554,9 +3555,9 @@ int set_pmd_migration_entry(struct page_vma_mapped_walk *pvmw,
 	flush_cache_range(vma, address, address + HPAGE_PMD_SIZE);
 	pmdval = pmdp_invalidate(vma, address, pvmw->pmd);
 
-	/* See page_try_share_anon_rmap(): invalidate PMD first. */
+	/* See folio_try_share_anon_rmap_pmd(): invalidate PMD first. */
 	anon_exclusive = folio_test_anon(folio) && PageAnonExclusive(page);
-	if (anon_exclusive && page_try_share_anon_rmap(page)) {
+	if (anon_exclusive && folio_try_share_anon_rmap_pmd(folio, page)) {
 		set_pmd_at(mm, address, pvmw->pmd, pmdval);
 		return -EBUSY;
 	}
diff --git a/mm/internal.h b/mm/internal.h
index a94355e70bd7..29589bc3f046 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -1047,7 +1047,7 @@ enum {
  * * Ordinary GUP: Using the PT lock
  * * GUP-fast and fork(): mm->write_protect_seq
  * * GUP-fast and KSM or temporary unmapping (swap, migration): see
- *    page_try_share_anon_rmap()
+ *    folio_try_share_anon_rmap_*()
  *
  * Must be called with the (sub)page that's actually referenced via the
  * page table entry, which might not necessarily be the head page for a
@@ -1090,7 +1090,7 @@ static inline bool gup_must_unshare(struct vm_area_struct *vma,
 		return is_cow_mapping(vma->vm_flags);
 	}
 
-	/* Paired with a memory barrier in page_try_share_anon_rmap(). */
+	/* Paired with a memory barrier in folio_try_share_anon_rmap_*(). */
 	if (IS_ENABLED(CONFIG_HAVE_FAST_GUP))
 		smp_rmb();
 
diff --git a/mm/ksm.c b/mm/ksm.c
index 716e2f87dd79..8c001819cf10 100644
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -1331,8 +1331,9 @@ static int write_protect_page(struct vm_area_struct *vma, struct page *page,
 			goto out_unlock;
 		}
 
-		/* See page_try_share_anon_rmap(): clear PTE first. */
-		if (anon_exclusive && page_try_share_anon_rmap(page)) {
+		/* See folio_try_share_anon_rmap_pte(): clear PTE first. */
+		if (anon_exclusive &&
+		    folio_try_share_anon_rmap_pte(page_folio(page), page)) {
 			set_pte_at(mm, pvmw.address, pvmw.pte, entry);
 			goto out_unlock;
 		}
diff --git a/mm/migrate_device.c b/mm/migrate_device.c
index 39b7754480c6..b6c27c76e1a0 100644
--- a/mm/migrate_device.c
+++ b/mm/migrate_device.c
@@ -202,7 +202,7 @@ again:
 			if (anon_exclusive) {
 				pte = ptep_clear_flush(vma, addr, ptep);
 
-				if (page_try_share_anon_rmap(page)) {
+				if (folio_try_share_anon_rmap_pte(folio, page)) {
 					set_pte_at(mm, addr, ptep, pte);
 					folio_unlock(folio);
 					folio_put(folio);
diff --git a/mm/rmap.c b/mm/rmap.c
index 3ee254a99622..6209e65985a2 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -1866,9 +1866,9 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma,
 				break;
 			}
 
-			/* See page_try_share_anon_rmap(): clear PTE first. */
+			/* See folio_try_share_anon_rmap(): clear PTE first. */
 			if (anon_exclusive &&
-			    page_try_share_anon_rmap(subpage)) {
+			    folio_try_share_anon_rmap_pte(folio, subpage)) {
 				swap_free(entry);
 				set_pte_at(mm, address, pvmw.pte, pteval);
 				ret = false;
@@ -2142,7 +2142,8 @@ static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma,
 			pte_t swp_pte;
 
 			if (anon_exclusive)
-				BUG_ON(page_try_share_anon_rmap(subpage));
+				WARN_ON_ONCE(folio_try_share_anon_rmap_pte(folio,
+									   subpage));
 
 			/*
 			 * Store the pfn of the page in a special migration
@@ -2213,7 +2214,7 @@ static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma,
 			VM_BUG_ON_PAGE(pte_write(pteval) && folio_test_anon(folio) &&
 				       !anon_exclusive, subpage);
 
-			/* See page_try_share_anon_rmap(): clear PTE first. */
+			/* See folio_try_share_anon_rmap_pte(): clear PTE first. */
 			if (folio_test_hugetlb(folio)) {
 				if (anon_exclusive &&
 				    hugetlb_try_share_anon_rmap(folio)) {
@@ -2224,7 +2225,7 @@ static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma,
 					break;
 				}
 			} else if (anon_exclusive &&
-				   page_try_share_anon_rmap(subpage)) {
+				   folio_try_share_anon_rmap_pte(folio, subpage)) {
 				set_pte_at(mm, address, pvmw.pte, pteval);
 				ret = false;
 				page_vma_mapped_walk_done(&pvmw);
-- 
cgit v1.2.3


From 90ca22513ed5d7cf546c7c8d35a03ec2a2f5c87e Mon Sep 17 00:00:00 2001
From: Mathis Marion <mathis.marion@silabs.com>
Date: Tue, 19 Dec 2023 14:11:54 +0100
Subject: lib: crc_ccitt_false() is identical to crc_itu_t()
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

crc_ccitt_false() was introduced in commit 0d85adb5fbd33 ("lib/crc-ccitt:
Add CCITT-FALSE CRC16 variant"), but it is redundant with crc_itu_t().
Since the latter is more used, it is the one being kept.

Link: https://lkml.kernel.org/r/20231219131154.748577-1-Mathis.Marion@silabs.com
Signed-off-by: Mathis Marion <mathis.marion@silabs.com>
Cc: Andrey Smirnov <andrew.smirnov@gmail.com>
Cc: Andrey Vostrikov <andrey.vostrikov@cogentembedded.com>
Cc: Jérôme Pouiller <jerome.pouiller@silabs.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 .../driver-api/surface_aggregator/ssh.rst          |  2 +-
 drivers/mfd/Kconfig                                |  2 +-
 drivers/mfd/rave-sp.c                              |  4 +-
 drivers/platform/surface/aggregator/Kconfig        |  2 +-
 include/linux/crc-ccitt.h                          |  7 ---
 include/linux/surface_aggregator/serial_hub.h      |  4 +-
 lib/crc-ccitt.c                                    | 55 ----------------------
 7 files changed, 7 insertions(+), 69 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/driver-api/surface_aggregator/ssh.rst b/Documentation/driver-api/surface_aggregator/ssh.rst
index b955b673838b..58a757319931 100644
--- a/Documentation/driver-api/surface_aggregator/ssh.rst
+++ b/Documentation/driver-api/surface_aggregator/ssh.rst
@@ -39,7 +39,7 @@ Note that the standard disclaimer for this subsystem also applies to this
 document: All of this has been reverse-engineered and may thus be erroneous
 and/or incomplete.
 
-All CRCs used in the following are two-byte ``crc_ccitt_false(0xffff, ...)``.
+All CRCs used in the following are two-byte ``crc_itu_t(0xffff, ...)``.
 All multi-byte values are little-endian, there is no implicit padding between
 values.
 
diff --git a/drivers/mfd/Kconfig b/drivers/mfd/Kconfig
index 90ce58fd629e..925c19ee513b 100644
--- a/drivers/mfd/Kconfig
+++ b/drivers/mfd/Kconfig
@@ -2255,7 +2255,7 @@ config MFD_VEXPRESS_SYSREG
 config RAVE_SP_CORE
 	tristate "RAVE SP MCU core driver"
 	depends on SERIAL_DEV_BUS
-	select CRC_CCITT
+	select CRC_ITU_T
 	help
 	  Select this to get support for the Supervisory Processor
 	  device found on several devices in RAVE line of hardware.
diff --git a/drivers/mfd/rave-sp.c b/drivers/mfd/rave-sp.c
index da50eba10014..f62422740de2 100644
--- a/drivers/mfd/rave-sp.c
+++ b/drivers/mfd/rave-sp.c
@@ -9,7 +9,7 @@
  */
 
 #include <linux/atomic.h>
-#include <linux/crc-ccitt.h>
+#include <linux/crc-itu-t.h>
 #include <linux/delay.h>
 #include <linux/export.h>
 #include <linux/init.h>
@@ -251,7 +251,7 @@ static void csum_8b2c(const u8 *buf, size_t size, u8 *crc)
 
 static void csum_ccitt(const u8 *buf, size_t size, u8 *crc)
 {
-	const u16 calculated = crc_ccitt_false(0xffff, buf, size);
+	const u16 calculated = crc_itu_t(0xffff, buf, size);
 
 	/*
 	 * While the rest of the wire protocol is little-endian,
diff --git a/drivers/platform/surface/aggregator/Kconfig b/drivers/platform/surface/aggregator/Kconfig
index 88afc38ffdc5..957c216c180c 100644
--- a/drivers/platform/surface/aggregator/Kconfig
+++ b/drivers/platform/surface/aggregator/Kconfig
@@ -5,7 +5,7 @@ menuconfig SURFACE_AGGREGATOR
 	tristate "Microsoft Surface System Aggregator Module Subsystem and Drivers"
 	depends on SERIAL_DEV_BUS
 	depends on ACPI && !RISCV
-	select CRC_CCITT
+	select CRC_ITU_T
 	help
 	  The Surface System Aggregator Module (Surface SAM or SSAM) is an
 	  embedded controller (EC) found on 5th- and later-generation Microsoft
diff --git a/include/linux/crc-ccitt.h b/include/linux/crc-ccitt.h
index 72c92c396bb8..cd4f420231ba 100644
--- a/include/linux/crc-ccitt.h
+++ b/include/linux/crc-ccitt.h
@@ -5,19 +5,12 @@
 #include <linux/types.h>
 
 extern u16 const crc_ccitt_table[256];
-extern u16 const crc_ccitt_false_table[256];
 
 extern u16 crc_ccitt(u16 crc, const u8 *buffer, size_t len);
-extern u16 crc_ccitt_false(u16 crc, const u8 *buffer, size_t len);
 
 static inline u16 crc_ccitt_byte(u16 crc, const u8 c)
 {
 	return (crc >> 8) ^ crc_ccitt_table[(crc ^ c) & 0xff];
 }
 
-static inline u16 crc_ccitt_false_byte(u16 crc, const u8 c)
-{
-    return (crc << 8) ^ crc_ccitt_false_table[(crc >> 8) ^ c];
-}
-
 #endif /* _LINUX_CRC_CCITT_H */
diff --git a/include/linux/surface_aggregator/serial_hub.h b/include/linux/surface_aggregator/serial_hub.h
index 5c4ae1a26183..d8dbef6b7fc2 100644
--- a/include/linux/surface_aggregator/serial_hub.h
+++ b/include/linux/surface_aggregator/serial_hub.h
@@ -12,7 +12,7 @@
 #ifndef _LINUX_SURFACE_AGGREGATOR_SERIAL_HUB_H
 #define _LINUX_SURFACE_AGGREGATOR_SERIAL_HUB_H
 
-#include <linux/crc-ccitt.h>
+#include <linux/crc-itu-t.h>
 #include <linux/kref.h>
 #include <linux/ktime.h>
 #include <linux/list.h>
@@ -188,7 +188,7 @@ static_assert(sizeof(struct ssh_command) == 8);
  */
 static inline u16 ssh_crc(const u8 *buf, size_t len)
 {
-	return crc_ccitt_false(0xffff, buf, len);
+	return crc_itu_t(0xffff, buf, len);
 }
 
 /*
diff --git a/lib/crc-ccitt.c b/lib/crc-ccitt.c
index d1a7d29d2ac9..9cddf35d3b66 100644
--- a/lib/crc-ccitt.c
+++ b/lib/crc-ccitt.c
@@ -49,46 +49,6 @@ u16 const crc_ccitt_table[256] = {
 };
 EXPORT_SYMBOL(crc_ccitt_table);
 
-/*
- * Similar table to calculate CRC16 variant known as CRC-CCITT-FALSE
- * Reflected bits order, does not augment final value.
- */
-u16 const crc_ccitt_false_table[256] = {
-    0x0000, 0x1021, 0x2042, 0x3063, 0x4084, 0x50A5, 0x60C6, 0x70E7,
-    0x8108, 0x9129, 0xA14A, 0xB16B, 0xC18C, 0xD1AD, 0xE1CE, 0xF1EF,
-    0x1231, 0x0210, 0x3273, 0x2252, 0x52B5, 0x4294, 0x72F7, 0x62D6,
-    0x9339, 0x8318, 0xB37B, 0xA35A, 0xD3BD, 0xC39C, 0xF3FF, 0xE3DE,
-    0x2462, 0x3443, 0x0420, 0x1401, 0x64E6, 0x74C7, 0x44A4, 0x5485,
-    0xA56A, 0xB54B, 0x8528, 0x9509, 0xE5EE, 0xF5CF, 0xC5AC, 0xD58D,
-    0x3653, 0x2672, 0x1611, 0x0630, 0x76D7, 0x66F6, 0x5695, 0x46B4,
-    0xB75B, 0xA77A, 0x9719, 0x8738, 0xF7DF, 0xE7FE, 0xD79D, 0xC7BC,
-    0x48C4, 0x58E5, 0x6886, 0x78A7, 0x0840, 0x1861, 0x2802, 0x3823,
-    0xC9CC, 0xD9ED, 0xE98E, 0xF9AF, 0x8948, 0x9969, 0xA90A, 0xB92B,
-    0x5AF5, 0x4AD4, 0x7AB7, 0x6A96, 0x1A71, 0x0A50, 0x3A33, 0x2A12,
-    0xDBFD, 0xCBDC, 0xFBBF, 0xEB9E, 0x9B79, 0x8B58, 0xBB3B, 0xAB1A,
-    0x6CA6, 0x7C87, 0x4CE4, 0x5CC5, 0x2C22, 0x3C03, 0x0C60, 0x1C41,
-    0xEDAE, 0xFD8F, 0xCDEC, 0xDDCD, 0xAD2A, 0xBD0B, 0x8D68, 0x9D49,
-    0x7E97, 0x6EB6, 0x5ED5, 0x4EF4, 0x3E13, 0x2E32, 0x1E51, 0x0E70,
-    0xFF9F, 0xEFBE, 0xDFDD, 0xCFFC, 0xBF1B, 0xAF3A, 0x9F59, 0x8F78,
-    0x9188, 0x81A9, 0xB1CA, 0xA1EB, 0xD10C, 0xC12D, 0xF14E, 0xE16F,
-    0x1080, 0x00A1, 0x30C2, 0x20E3, 0x5004, 0x4025, 0x7046, 0x6067,
-    0x83B9, 0x9398, 0xA3FB, 0xB3DA, 0xC33D, 0xD31C, 0xE37F, 0xF35E,
-    0x02B1, 0x1290, 0x22F3, 0x32D2, 0x4235, 0x5214, 0x6277, 0x7256,
-    0xB5EA, 0xA5CB, 0x95A8, 0x8589, 0xF56E, 0xE54F, 0xD52C, 0xC50D,
-    0x34E2, 0x24C3, 0x14A0, 0x0481, 0x7466, 0x6447, 0x5424, 0x4405,
-    0xA7DB, 0xB7FA, 0x8799, 0x97B8, 0xE75F, 0xF77E, 0xC71D, 0xD73C,
-    0x26D3, 0x36F2, 0x0691, 0x16B0, 0x6657, 0x7676, 0x4615, 0x5634,
-    0xD94C, 0xC96D, 0xF90E, 0xE92F, 0x99C8, 0x89E9, 0xB98A, 0xA9AB,
-    0x5844, 0x4865, 0x7806, 0x6827, 0x18C0, 0x08E1, 0x3882, 0x28A3,
-    0xCB7D, 0xDB5C, 0xEB3F, 0xFB1E, 0x8BF9, 0x9BD8, 0xABBB, 0xBB9A,
-    0x4A75, 0x5A54, 0x6A37, 0x7A16, 0x0AF1, 0x1AD0, 0x2AB3, 0x3A92,
-    0xFD2E, 0xED0F, 0xDD6C, 0xCD4D, 0xBDAA, 0xAD8B, 0x9DE8, 0x8DC9,
-    0x7C26, 0x6C07, 0x5C64, 0x4C45, 0x3CA2, 0x2C83, 0x1CE0, 0x0CC1,
-    0xEF1F, 0xFF3E, 0xCF5D, 0xDF7C, 0xAF9B, 0xBFBA, 0x8FD9, 0x9FF8,
-    0x6E17, 0x7E36, 0x4E55, 0x5E74, 0x2E93, 0x3EB2, 0x0ED1, 0x1EF0
-};
-EXPORT_SYMBOL(crc_ccitt_false_table);
-
 /**
  *	crc_ccitt - recompute the CRC (CRC-CCITT variant) for the data
  *	buffer
@@ -104,20 +64,5 @@ u16 crc_ccitt(u16 crc, u8 const *buffer, size_t len)
 }
 EXPORT_SYMBOL(crc_ccitt);
 
-/**
- *	crc_ccitt_false - recompute the CRC (CRC-CCITT-FALSE variant)
- *	for the data buffer
- *	@crc: previous CRC value
- *	@buffer: data pointer
- *	@len: number of bytes in the buffer
- */
-u16 crc_ccitt_false(u16 crc, u8 const *buffer, size_t len)
-{
-	while (len--)
-		crc = crc_ccitt_false_byte(crc, *buffer++);
-	return crc;
-}
-EXPORT_SYMBOL(crc_ccitt_false);
-
 MODULE_DESCRIPTION("CRC-CCITT calculations");
 MODULE_LICENSE("GPL");
-- 
cgit v1.2.3


From 29166371ef6780429e4cb84f1827fafbdd4005ab Mon Sep 17 00:00:00 2001
From: Youling Tang <tangyouling@kylinos.cn>
Date: Wed, 27 Dec 2023 07:46:25 +0800
Subject: kdump: remove redundant DEFAULT_CRASH_KERNEL_LOW_SIZE

Remove duplicate definitions, no functional changes.

Link: https://lkml.kernel.org/r/MW4PR84MB3145459ADC7EB38BBB36955B8198A@MW4PR84MB3145.NAMPRD84.PROD.OUTLOOK.COM
Signed-off-by: Youling Tang <tangyouling@kylinos.cn>
Reported-by: Huacai Chen <chenhuacai@loongson.cn>
Acked-by: Baoquan He <bhe@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/crash_core.h | 6 ------
 1 file changed, 6 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/crash_core.h b/include/linux/crash_core.h
index 5126a4fecb44..9eaeaafe0cad 100644
--- a/include/linux/crash_core.h
+++ b/include/linux/crash_core.h
@@ -87,12 +87,6 @@ Elf_Word *append_elf_note(Elf_Word *buf, char *name, unsigned int type,
 			  void *data, size_t data_len);
 void final_note(Elf_Word *buf);
 
-#ifdef CONFIG_ARCH_HAS_GENERIC_CRASHKERNEL_RESERVATION
-#ifndef DEFAULT_CRASH_KERNEL_LOW_SIZE
-#define DEFAULT_CRASH_KERNEL_LOW_SIZE  (128UL << 20)
-#endif
-#endif
-
 int __init parse_crashkernel(char *cmdline, unsigned long long system_ram,
 		unsigned long long *crash_size, unsigned long long *crash_base,
 		unsigned long long *low_size, bool *high);
-- 
cgit v1.2.3


From 501a06fe8e4c185bbda371b8cedbdf1b23a633d8 Mon Sep 17 00:00:00 2001
From: Nhat Pham <nphamcs@gmail.com>
Date: Thu, 7 Dec 2023 11:24:06 -0800
Subject: zswap: memcontrol: implement zswap writeback disabling

During our experiment with zswap, we sometimes observe swap IOs due to
occasional zswap store failures and writebacks-to-swap.  These swapping
IOs prevent many users who cannot tolerate swapping from adopting zswap to
save memory and improve performance where possible.

This patch adds the option to disable this behavior entirely: do not
writeback to backing swapping device when a zswap store attempt fail, and
do not write pages in the zswap pool back to the backing swap device (both
when the pool is full, and when the new zswap shrinker is called).

This new behavior can be opted-in/out on a per-cgroup basis via a new
cgroup file.  By default, writebacks to swap device is enabled, which is
the previous behavior.  Initially, writeback is enabled for the root
cgroup, and a newly created cgroup will inherit the current setting of its
parent.

Note that this is subtly different from setting memory.swap.max to 0, as
it still allows for pages to be stored in the zswap pool (which itself
consumes swap space in its current form).

This patch should be applied on top of the zswap shrinker series:

https://lore.kernel.org/linux-mm/20231130194023.4102148-1-nphamcs@gmail.com/

as it also disables the zswap shrinker, a major source of zswap
writebacks.

For the most part, this feature is motivated by internal parties who
have already established their opinions regarding swapping - the
workloads that are highly sensitive to IO, and especially those who are
using servers with really slow disk performance (for instance, massive
but slow HDDs).  For these folks, it's impossible to convince them to
even entertain zswap if swapping also comes as a packaged deal.
Writeback disabling is quite a useful feature in these situations - on
a mixed workloads deployment, they can disable writeback for the more
IO-sensitive workloads, and enable writeback for other background
workloads.

For instance, on a server with HDD, I allocate memories and populate
them with random values (so that zswap store will always fail), and
specify memory.high low enough to trigger reclaim.  The time it takes
to allocate the memories and just read through it a couple of times
(doing silly things like computing the values' average etc.):

zswap.writeback disabled:
real 0m30.537s
user 0m23.687s
sys 0m6.637s
0 pages swapped in
0 pages swapped out

zswap.writeback enabled:
real 0m45.061s
user 0m24.310s
sys 0m8.892s
712686 pages swapped in
461093 pages swapped out

(the last two lines are from vmstat -s).

[nphamcs@gmail.com: add a comment about recurring zswap store failures leading to reclaim inefficiency]
  Link: https://lkml.kernel.org/r/20231221005725.3446672-1-nphamcs@gmail.com
Link: https://lkml.kernel.org/r/20231207192406.3809579-1-nphamcs@gmail.com
Signed-off-by: Nhat Pham <nphamcs@gmail.com>
Suggested-by: Johannes Weiner <hannes@cmpxchg.org>
Reviewed-by: Yosry Ahmed <yosryahmed@google.com>
Acked-by: Chris Li <chrisl@kernel.org>
Cc: Dan Streetman <ddstreet@ieee.org>
Cc: David Heidelberg <david@ixit.cz>
Cc: Domenico Cerasuolo <cerasuolodomenico@gmail.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Mike Rapoport (IBM) <rppt@kernel.org>
Cc: Muchun Song <muchun.song@linux.dev>
Cc: Roman Gushchin <roman.gushchin@linux.dev>
Cc: Sergey Senozhatsky <senozhatsky@chromium.org>
Cc: Seth Jennings <sjenning@redhat.com>
Cc: Shakeel Butt <shakeelb@google.com>
Cc: Tejun Heo <tj@kernel.org>
Cc: Vitaly Wool <vitaly.wool@konsulko.com>
Cc: Zefan Li <lizefan.x@bytedance.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 Documentation/admin-guide/cgroup-v2.rst | 15 +++++++++++++
 Documentation/admin-guide/mm/zswap.rst  | 10 +++++++++
 include/linux/memcontrol.h              | 12 +++++++++++
 include/linux/zswap.h                   |  7 ++++++
 mm/memcontrol.c                         | 38 +++++++++++++++++++++++++++++++++
 mm/page_io.c                            |  5 +++++
 mm/shmem.c                              |  3 +--
 mm/zswap.c                              | 13 +++++++++--
 8 files changed, 99 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/admin-guide/cgroup-v2.rst b/Documentation/admin-guide/cgroup-v2.rst
index 3f85254f3cef..5ec7dd753cd1 100644
--- a/Documentation/admin-guide/cgroup-v2.rst
+++ b/Documentation/admin-guide/cgroup-v2.rst
@@ -1679,6 +1679,21 @@ PAGE_SIZE multiple when read back.
 	limit, it will refuse to take any more stores before existing
 	entries fault back in or are written out to disk.
 
+  memory.zswap.writeback
+	A read-write single value file. The default value is "1". The
+	initial value of the root cgroup is 1, and when a new cgroup is
+	created, it inherits the current value of its parent.
+
+	When this is set to 0, all swapping attempts to swapping devices
+	are disabled. This included both zswap writebacks, and swapping due
+	to zswap store failures. If the zswap store failures are recurring
+	(for e.g if the pages are incompressible), users can observe
+	reclaim inefficiency after disabling writeback (because the same
+	pages might be rejected again and again).
+
+	Note that this is subtly different from setting memory.swap.max to
+	0, as it still allows for pages to be written to the zswap pool.
+
   memory.pressure
 	A read-only nested-keyed file.
 
diff --git a/Documentation/admin-guide/mm/zswap.rst b/Documentation/admin-guide/mm/zswap.rst
index 62fc244ec702..b42132969e31 100644
--- a/Documentation/admin-guide/mm/zswap.rst
+++ b/Documentation/admin-guide/mm/zswap.rst
@@ -153,6 +153,16 @@ attribute, e. g.::
 
 Setting this parameter to 100 will disable the hysteresis.
 
+Some users cannot tolerate the swapping that comes with zswap store failures
+and zswap writebacks. Swapping can be disabled entirely (without disabling
+zswap itself) on a cgroup-basis as follows:
+
+	echo 0 > /sys/fs/cgroup/<cgroup-name>/memory.zswap.writeback
+
+Note that if the store failures are recurring (for e.g if the pages are
+incompressible), users can observe reclaim inefficiency after disabling
+writeback (because the same pages might be rejected again and again).
+
 When there is a sizable amount of cold memory residing in the zswap pool, it
 can be advantageous to proactively write these cold pages to swap and reclaim
 the memory for other use cases. By default, the zswap shrinker is disabled.
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 43b77363ab8e..5de775e6cdd9 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -219,6 +219,12 @@ struct mem_cgroup {
 
 #if defined(CONFIG_MEMCG_KMEM) && defined(CONFIG_ZSWAP)
 	unsigned long zswap_max;
+
+	/*
+	 * Prevent pages from this memcg from being written back from zswap to
+	 * swap, and from being swapped out on zswap store failures.
+	 */
+	bool zswap_writeback;
 #endif
 
 	unsigned long soft_limit;
@@ -1941,6 +1947,7 @@ static inline void count_objcg_event(struct obj_cgroup *objcg,
 bool obj_cgroup_may_zswap(struct obj_cgroup *objcg);
 void obj_cgroup_charge_zswap(struct obj_cgroup *objcg, size_t size);
 void obj_cgroup_uncharge_zswap(struct obj_cgroup *objcg, size_t size);
+bool mem_cgroup_zswap_writeback_enabled(struct mem_cgroup *memcg);
 #else
 static inline bool obj_cgroup_may_zswap(struct obj_cgroup *objcg)
 {
@@ -1954,6 +1961,11 @@ static inline void obj_cgroup_uncharge_zswap(struct obj_cgroup *objcg,
 					     size_t size)
 {
 }
+static inline bool mem_cgroup_zswap_writeback_enabled(struct mem_cgroup *memcg)
+{
+	/* if zswap is disabled, do not block pages going to the swapping device */
+	return true;
+}
 #endif
 
 #endif /* _LINUX_MEMCONTROL_H */
diff --git a/include/linux/zswap.h b/include/linux/zswap.h
index e88572d4c720..0b709f5bc65f 100644
--- a/include/linux/zswap.h
+++ b/include/linux/zswap.h
@@ -35,6 +35,7 @@ void zswap_swapoff(int type);
 void zswap_memcg_offline_cleanup(struct mem_cgroup *memcg);
 void zswap_lruvec_state_init(struct lruvec *lruvec);
 void zswap_folio_swapin(struct folio *folio);
+bool is_zswap_enabled(void);
 #else
 
 struct zswap_lruvec_state {};
@@ -55,6 +56,12 @@ static inline void zswap_swapoff(int type) {}
 static inline void zswap_memcg_offline_cleanup(struct mem_cgroup *memcg) {}
 static inline void zswap_lruvec_state_init(struct lruvec *lruvec) {}
 static inline void zswap_folio_swapin(struct folio *folio) {}
+
+static inline bool is_zswap_enabled(void)
+{
+	return false;
+}
+
 #endif
 
 #endif /* _LINUX_ZSWAP_H */
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index b08b9cd4a3a8..3ca691fb5b49 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -5538,6 +5538,8 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
 	WRITE_ONCE(memcg->soft_limit, PAGE_COUNTER_MAX);
 #if defined(CONFIG_MEMCG_KMEM) && defined(CONFIG_ZSWAP)
 	memcg->zswap_max = PAGE_COUNTER_MAX;
+	WRITE_ONCE(memcg->zswap_writeback,
+		!parent || READ_ONCE(parent->zswap_writeback));
 #endif
 	page_counter_set_high(&memcg->swap, PAGE_COUNTER_MAX);
 	if (parent) {
@@ -8166,6 +8168,12 @@ void obj_cgroup_uncharge_zswap(struct obj_cgroup *objcg, size_t size)
 	rcu_read_unlock();
 }
 
+bool mem_cgroup_zswap_writeback_enabled(struct mem_cgroup *memcg)
+{
+	/* if zswap is disabled, do not block pages going to the swapping device */
+	return !is_zswap_enabled() || !memcg || READ_ONCE(memcg->zswap_writeback);
+}
+
 static u64 zswap_current_read(struct cgroup_subsys_state *css,
 			      struct cftype *cft)
 {
@@ -8198,6 +8206,31 @@ static ssize_t zswap_max_write(struct kernfs_open_file *of,
 	return nbytes;
 }
 
+static int zswap_writeback_show(struct seq_file *m, void *v)
+{
+	struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
+
+	seq_printf(m, "%d\n", READ_ONCE(memcg->zswap_writeback));
+	return 0;
+}
+
+static ssize_t zswap_writeback_write(struct kernfs_open_file *of,
+				char *buf, size_t nbytes, loff_t off)
+{
+	struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
+	int zswap_writeback;
+	ssize_t parse_ret = kstrtoint(strstrip(buf), 0, &zswap_writeback);
+
+	if (parse_ret)
+		return parse_ret;
+
+	if (zswap_writeback != 0 && zswap_writeback != 1)
+		return -EINVAL;
+
+	WRITE_ONCE(memcg->zswap_writeback, zswap_writeback);
+	return nbytes;
+}
+
 static struct cftype zswap_files[] = {
 	{
 		.name = "zswap.current",
@@ -8210,6 +8243,11 @@ static struct cftype zswap_files[] = {
 		.seq_show = zswap_max_show,
 		.write = zswap_max_write,
 	},
+	{
+		.name = "zswap.writeback",
+		.seq_show = zswap_writeback_show,
+		.write = zswap_writeback_write,
+	},
 	{ }	/* terminate */
 };
 #endif /* CONFIG_MEMCG_KMEM && CONFIG_ZSWAP */
diff --git a/mm/page_io.c b/mm/page_io.c
index 09c6a4f316f3..ae2b49055e43 100644
--- a/mm/page_io.c
+++ b/mm/page_io.c
@@ -201,6 +201,11 @@ int swap_writepage(struct page *page, struct writeback_control *wbc)
 		folio_end_writeback(folio);
 		return 0;
 	}
+	if (!mem_cgroup_zswap_writeback_enabled(folio_memcg(folio))) {
+		folio_mark_dirty(folio);
+		return AOP_WRITEPAGE_ACTIVATE;
+	}
+
 	__swap_writepage(folio, wbc);
 	return 0;
 }
diff --git a/mm/shmem.c b/mm/shmem.c
index a4d388973021..928aa2304932 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -1514,8 +1514,7 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc)
 
 		mutex_unlock(&shmem_swaplist_mutex);
 		BUG_ON(folio_mapped(folio));
-		swap_writepage(&folio->page, wbc);
-		return 0;
+		return swap_writepage(&folio->page, wbc);
 	}
 
 	mutex_unlock(&shmem_swaplist_mutex);
diff --git a/mm/zswap.c b/mm/zswap.c
index f760801a3ea8..ca25b676048e 100644
--- a/mm/zswap.c
+++ b/mm/zswap.c
@@ -153,6 +153,11 @@ static bool zswap_shrinker_enabled = IS_ENABLED(
 		CONFIG_ZSWAP_SHRINKER_DEFAULT_ON);
 module_param_named(shrinker_enabled, zswap_shrinker_enabled, bool, 0644);
 
+bool is_zswap_enabled(void)
+{
+	return zswap_enabled;
+}
+
 /*********************************
 * data structures
 **********************************/
@@ -596,7 +601,8 @@ static unsigned long zswap_shrinker_scan(struct shrinker *shrinker,
 	struct zswap_pool *pool = shrinker->private_data;
 	bool encountered_page_in_swapcache = false;
 
-	if (!zswap_shrinker_enabled) {
+	if (!zswap_shrinker_enabled ||
+			!mem_cgroup_zswap_writeback_enabled(sc->memcg)) {
 		sc->nr_scanned = 0;
 		return SHRINK_STOP;
 	}
@@ -637,7 +643,7 @@ static unsigned long zswap_shrinker_count(struct shrinker *shrinker,
 	struct lruvec *lruvec = mem_cgroup_lruvec(memcg, NODE_DATA(sc->nid));
 	unsigned long nr_backing, nr_stored, nr_freeable, nr_protected;
 
-	if (!zswap_shrinker_enabled)
+	if (!zswap_shrinker_enabled || !mem_cgroup_zswap_writeback_enabled(memcg))
 		return 0;
 
 #ifdef CONFIG_MEMCG_KMEM
@@ -923,6 +929,9 @@ static int shrink_memcg(struct mem_cgroup *memcg)
 	struct zswap_pool *pool;
 	int nid, shrunk = 0;
 
+	if (!mem_cgroup_zswap_writeback_enabled(memcg))
+		return -EINVAL;
+
 	/*
 	 * Skip zombies because their LRUs are reparented and we would be
 	 * reclaiming from the parent instead of the dead memcg.
-- 
cgit v1.2.3


From 7c223098212957a1ecd8768e8e747ae2cf88e880 Mon Sep 17 00:00:00 2001
From: David Laight <David.Laight@ACULAB.COM>
Date: Fri, 29 Dec 2023 20:53:49 +0000
Subject: locking/osq_lock: Move the definition of optimistic_spin_node into
 osq_lock.c

struct optimistic_spin_node is private to the implementation.
Move it into the C file to ensure nothing is accessing it.

Signed-off-by: David Laight <david.laight@aculab.com>
Acked-by: Waiman Long <longman@redhat.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/osq_lock.h  | 5 -----
 kernel/locking/osq_lock.c | 7 +++++++
 2 files changed, 7 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/osq_lock.h b/include/linux/osq_lock.h
index 5581dbd3bd34..ea8fb31379e3 100644
--- a/include/linux/osq_lock.h
+++ b/include/linux/osq_lock.h
@@ -6,11 +6,6 @@
  * An MCS like lock especially tailored for optimistic spinning for sleeping
  * lock implementations (mutex, rwsem, etc).
  */
-struct optimistic_spin_node {
-	struct optimistic_spin_node *next, *prev;
-	int locked; /* 1 if lock acquired */
-	int cpu; /* encoded CPU # + 1 value */
-};
 
 struct optimistic_spin_queue {
 	/*
diff --git a/kernel/locking/osq_lock.c b/kernel/locking/osq_lock.c
index d5610ad52b92..d414eef4bec6 100644
--- a/kernel/locking/osq_lock.c
+++ b/kernel/locking/osq_lock.c
@@ -11,6 +11,13 @@
  * called from interrupt context and we have preemption disabled while
  * spinning.
  */
+
+struct optimistic_spin_node {
+	struct optimistic_spin_node *next, *prev;
+	int locked; /* 1 if lock acquired */
+	int cpu; /* encoded CPU # + 1 value */
+};
+
 static DEFINE_PER_CPU_SHARED_ALIGNED(struct optimistic_spin_node, osq_node);
 
 /*
-- 
cgit v1.2.3


From cff9c565e65f3622e8dc1dcc21c1520a083dff35 Mon Sep 17 00:00:00 2001
From: Luiz Angelo Daros de Luca <luizluca@gmail.com>
Date: Wed, 20 Dec 2023 01:52:29 -0300
Subject: net: mdio: get/put device node during (un)registration

The __of_mdiobus_register() function was storing the device node in
dev.of_node without increasing its reference count. It implicitly relied
on the caller to maintain the allocated node until the mdiobus was
unregistered.

Now, __of_mdiobus_register() will acquire the node before assigning it,
and of_mdiobus_unregister_callback() will be called at the end of
mdio_unregister().

Drivers can now release the node immediately after MDIO registration.
Some of them are already doing that even before this patch.

Signed-off-by: Luiz Angelo Daros de Luca <luizluca@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/mdio/of_mdio.c | 12 +++++++++++-
 drivers/net/phy/mdio_bus.c |  3 +++
 include/linux/phy.h        |  3 +++
 3 files changed, 17 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/drivers/net/mdio/of_mdio.c b/drivers/net/mdio/of_mdio.c
index 64ebcb6d235c..9b6cab6154e0 100644
--- a/drivers/net/mdio/of_mdio.c
+++ b/drivers/net/mdio/of_mdio.c
@@ -139,6 +139,11 @@ bool of_mdiobus_child_is_phy(struct device_node *child)
 }
 EXPORT_SYMBOL(of_mdiobus_child_is_phy);
 
+static void __of_mdiobus_unregister_callback(struct mii_bus *mdio)
+{
+	of_node_put(mdio->dev.of_node);
+}
+
 /**
  * __of_mdiobus_register - Register mii_bus and create PHYs from the device tree
  * @mdio: pointer to mii_bus structure
@@ -166,6 +171,8 @@ int __of_mdiobus_register(struct mii_bus *mdio, struct device_node *np,
 	 * the device tree are populated after the bus has been registered */
 	mdio->phy_mask = ~0;
 
+	mdio->__unregister_callback = __of_mdiobus_unregister_callback;
+	of_node_get(np);
 	device_set_node(&mdio->dev, of_fwnode_handle(np));
 
 	/* Get bus level PHY reset GPIO details */
@@ -177,7 +184,7 @@ int __of_mdiobus_register(struct mii_bus *mdio, struct device_node *np,
 	/* Register the MDIO bus */
 	rc = __mdiobus_register(mdio, owner);
 	if (rc)
-		return rc;
+		goto put_node;
 
 	/* Loop over the child nodes and register a phy_device for each phy */
 	for_each_available_child_of_node(np, child) {
@@ -237,6 +244,9 @@ int __of_mdiobus_register(struct mii_bus *mdio, struct device_node *np,
 unregister:
 	of_node_put(child);
 	mdiobus_unregister(mdio);
+
+put_node:
+	of_node_put(np);
 	return rc;
 }
 EXPORT_SYMBOL(__of_mdiobus_register);
diff --git a/drivers/net/phy/mdio_bus.c b/drivers/net/phy/mdio_bus.c
index 6cf73c15635b..4a30757c4ff8 100644
--- a/drivers/net/phy/mdio_bus.c
+++ b/drivers/net/phy/mdio_bus.c
@@ -787,6 +787,9 @@ void mdiobus_unregister(struct mii_bus *bus)
 		gpiod_set_value_cansleep(bus->reset_gpiod, 1);
 
 	device_del(&bus->dev);
+
+	if (bus->__unregister_callback)
+		bus->__unregister_callback(bus);
 }
 EXPORT_SYMBOL(mdiobus_unregister);
 
diff --git a/include/linux/phy.h b/include/linux/phy.h
index e9e85d347587..ede891776d8b 100644
--- a/include/linux/phy.h
+++ b/include/linux/phy.h
@@ -434,6 +434,9 @@ struct mii_bus {
 
 	/** @shared: shared state across different PHYs */
 	struct phy_package_shared *shared[PHY_MAX_ADDR];
+
+	/** @__unregister_callback: called at the last step of unregistration */
+	void (*__unregister_callback)(struct mii_bus *bus);
 };
 #define to_mii_bus(d) container_of(d, struct mii_bus, dev)
 
-- 
cgit v1.2.3


From 02018c544ef113e980a2349eba89003d6f399d22 Mon Sep 17 00:00:00 2001
From: Maxime Chevallier <maxime.chevallier@bootlin.com>
Date: Thu, 21 Dec 2023 19:00:34 +0100
Subject: net: phy: Introduce ethernet link topology representation

Link topologies containing multiple network PHYs attached to the same
net_device can be found when using a PHY as a media converter for use
with an SFP connector, on which an SFP transceiver containing a PHY can
be used.

With the current model, the transceiver's PHY can't be used for
operations such as cable testing, timestamping, macsec offload, etc.

The reason being that most of the logic for these configuration, coming
from either ethtool netlink or ioctls tend to use netdev->phydev, which
in multi-phy systems will reference the PHY closest to the MAC.

Introduce a numbering scheme allowing to enumerate PHY devices that
belong to any netdev, which can in turn allow userspace to take more
precise decisions with regard to each PHY's configuration.

The numbering is maintained per-netdev, in a phy_device_list.
The numbering works similarly to a netdevice's ifindex, with
identifiers that are only recycled once INT_MAX has been reached.

This prevents races that could occur between PHY listing and SFP
transceiver removal/insertion.

The identifiers are assigned at phy_attach time, as the numbering
depends on the netdevice the phy is attached to.

Signed-off-by: Maxime Chevallier <maxime.chevallier@bootlin.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 MAINTAINERS                            |  2 +
 drivers/net/phy/Makefile               |  2 +-
 drivers/net/phy/phy_device.c           |  7 ++++
 drivers/net/phy/phy_link_topology.c    | 66 +++++++++++++++++++++++++++++++++
 include/linux/netdevice.h              |  4 +-
 include/linux/phy.h                    |  4 ++
 include/linux/phy_link_topology.h      | 67 ++++++++++++++++++++++++++++++++++
 include/linux/phy_link_topology_core.h | 19 ++++++++++
 include/uapi/linux/ethtool.h           | 16 ++++++++
 net/core/dev.c                         |  3 ++
 10 files changed, 188 insertions(+), 2 deletions(-)
 create mode 100644 drivers/net/phy/phy_link_topology.c
 create mode 100644 include/linux/phy_link_topology.h
 create mode 100644 include/linux/phy_link_topology_core.h

(limited to 'include/linux')

diff --git a/MAINTAINERS b/MAINTAINERS
index 2b916990d7f0..79ac49b113dc 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -7871,6 +7871,8 @@ F:	include/linux/mii.h
 F:	include/linux/of_net.h
 F:	include/linux/phy.h
 F:	include/linux/phy_fixed.h
+F:	include/linux/phy_link_topology.h
+F:	include/linux/phy_link_topology_core.h
 F:	include/linux/phylib_stubs.h
 F:	include/linux/platform_data/mdio-bcm-unimac.h
 F:	include/linux/platform_data/mdio-gpio.h
diff --git a/drivers/net/phy/Makefile b/drivers/net/phy/Makefile
index 6097afd44392..f218954fd7a8 100644
--- a/drivers/net/phy/Makefile
+++ b/drivers/net/phy/Makefile
@@ -2,7 +2,7 @@
 # Makefile for Linux PHY drivers
 
 libphy-y			:= phy.o phy-c45.o phy-core.o phy_device.o \
-				   linkmode.o
+				   linkmode.o phy_link_topology.o
 mdio-bus-y			+= mdio_bus.o mdio_device.o
 
 ifdef CONFIG_MDIO_DEVICE
diff --git a/drivers/net/phy/phy_device.c b/drivers/net/phy/phy_device.c
index 3611ea64875e..ab8ae976a2f8 100644
--- a/drivers/net/phy/phy_device.c
+++ b/drivers/net/phy/phy_device.c
@@ -29,6 +29,7 @@
 #include <linux/phy.h>
 #include <linux/phylib_stubs.h>
 #include <linux/phy_led_triggers.h>
+#include <linux/phy_link_topology.h>
 #include <linux/pse-pd/pse.h>
 #include <linux/property.h>
 #include <linux/rtnetlink.h>
@@ -1491,6 +1492,11 @@ int phy_attach_direct(struct net_device *dev, struct phy_device *phydev,
 
 		if (phydev->sfp_bus_attached)
 			dev->sfp_bus = phydev->sfp_bus;
+
+		err = phy_link_topo_add_phy(&dev->link_topo, phydev,
+					    PHY_UPSTREAM_MAC, dev);
+		if (err)
+			goto error;
 	}
 
 	/* Some Ethernet drivers try to connect to a PHY device before
@@ -1820,6 +1826,7 @@ void phy_detach(struct phy_device *phydev)
 	if (dev) {
 		phydev->attached_dev->phydev = NULL;
 		phydev->attached_dev = NULL;
+		phy_link_topo_del_phy(&dev->link_topo, phydev);
 	}
 	phydev->phylink = NULL;
 
diff --git a/drivers/net/phy/phy_link_topology.c b/drivers/net/phy/phy_link_topology.c
new file mode 100644
index 000000000000..34e7e08fbfc3
--- /dev/null
+++ b/drivers/net/phy/phy_link_topology.c
@@ -0,0 +1,66 @@
+// SPDX-License-Identifier: GPL-2.0+
+/*
+ * Infrastructure to handle all PHY devices connected to a given netdev,
+ * either directly or indirectly attached.
+ *
+ * Copyright (c) 2023 Maxime Chevallier<maxime.chevallier@bootlin.com>
+ */
+
+#include <linux/phy_link_topology.h>
+#include <linux/netdevice.h>
+#include <linux/phy.h>
+#include <linux/rtnetlink.h>
+#include <linux/xarray.h>
+
+int phy_link_topo_add_phy(struct phy_link_topology *topo,
+			  struct phy_device *phy,
+			  enum phy_upstream upt, void *upstream)
+{
+	struct phy_device_node *pdn;
+	int ret;
+
+	pdn = kzalloc(sizeof(*pdn), GFP_KERNEL);
+	if (!pdn)
+		return -ENOMEM;
+
+	pdn->phy = phy;
+	switch (upt) {
+	case PHY_UPSTREAM_MAC:
+		pdn->upstream.netdev = (struct net_device *)upstream;
+		if (phy_on_sfp(phy))
+			pdn->parent_sfp_bus = pdn->upstream.netdev->sfp_bus;
+		break;
+	case PHY_UPSTREAM_PHY:
+		pdn->upstream.phydev = (struct phy_device *)upstream;
+		if (phy_on_sfp(phy))
+			pdn->parent_sfp_bus = pdn->upstream.phydev->sfp_bus;
+		break;
+	default:
+		ret = -EINVAL;
+		goto err;
+	}
+	pdn->upstream_type = upt;
+
+	ret = xa_alloc_cyclic(&topo->phys, &phy->phyindex, pdn, xa_limit_32b,
+			      &topo->next_phy_index, GFP_KERNEL);
+	if (ret)
+		goto err;
+
+	return 0;
+
+err:
+	kfree(pdn);
+	return ret;
+}
+EXPORT_SYMBOL_GPL(phy_link_topo_add_phy);
+
+void phy_link_topo_del_phy(struct phy_link_topology *topo,
+			   struct phy_device *phy)
+{
+	struct phy_device_node *pdn = xa_erase(&topo->phys, phy->phyindex);
+
+	phy->phyindex = 0;
+
+	kfree(pdn);
+}
+EXPORT_SYMBOL_GPL(phy_link_topo_del_phy);
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 75c7725e5e4f..5baa5517f533 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -40,7 +40,6 @@
 #include <net/dcbnl.h>
 #endif
 #include <net/netprio_cgroup.h>
-
 #include <linux/netdev_features.h>
 #include <linux/neighbour.h>
 #include <uapi/linux/netdevice.h>
@@ -52,6 +51,7 @@
 #include <net/net_trackers.h>
 #include <net/net_debug.h>
 #include <net/dropreason-core.h>
+#include <linux/phy_link_topology_core.h>
 
 struct netpoll_info;
 struct device;
@@ -2047,6 +2047,7 @@ enum netdev_stat_type {
  *	@fcoe_ddp_xid:	Max exchange id for FCoE LRO by ddp
  *
  *	@priomap:	XXX: need comments on this one
+ *	@link_topo:	Physical link topology tracking attached PHYs
  *	@phydev:	Physical device may attach itself
  *			for hardware timestamping
  *	@sfp_bus:	attached &struct sfp_bus structure.
@@ -2441,6 +2442,7 @@ struct net_device {
 #if IS_ENABLED(CONFIG_CGROUP_NET_PRIO)
 	struct netprio_map __rcu *priomap;
 #endif
+	struct phy_link_topology	link_topo;
 	struct phy_device	*phydev;
 	struct sfp_bus		*sfp_bus;
 	struct lock_class_key	*qdisc_tx_busylock;
diff --git a/include/linux/phy.h b/include/linux/phy.h
index ede891776d8b..ea9416797b89 100644
--- a/include/linux/phy.h
+++ b/include/linux/phy.h
@@ -547,6 +547,9 @@ struct macsec_ops;
  * @drv: Pointer to the driver for this PHY instance
  * @devlink: Create a link between phy dev and mac dev, if the external phy
  *           used by current mac interface is managed by another mac interface.
+ * @phyindex: Unique id across the phy's parent tree of phys to address the PHY
+ *	      from userspace, similar to ifindex. A zero index means the PHY
+ *	      wasn't assigned an id yet.
  * @phy_id: UID for this device found during discovery
  * @c45_ids: 802.3-c45 Device Identifiers if is_c45.
  * @is_c45:  Set to true if this PHY uses clause 45 addressing.
@@ -646,6 +649,7 @@ struct phy_device {
 
 	struct device_link *devlink;
 
+	u32 phyindex;
 	u32 phy_id;
 
 	struct phy_c45_device_ids c45_ids;
diff --git a/include/linux/phy_link_topology.h b/include/linux/phy_link_topology.h
new file mode 100644
index 000000000000..91902263ec0e
--- /dev/null
+++ b/include/linux/phy_link_topology.h
@@ -0,0 +1,67 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * PHY device list allow maintaining a list of PHY devices that are
+ * part of a netdevice's link topology. PHYs can for example be chained,
+ * as is the case when using a PHY that exposes an SFP module, on which an
+ * SFP transceiver that embeds a PHY is connected.
+ *
+ * This list can then be used by userspace to leverage individual PHY
+ * capabilities.
+ */
+#ifndef __PHY_LINK_TOPOLOGY_H
+#define __PHY_LINK_TOPOLOGY_H
+
+#include <linux/ethtool.h>
+#include <linux/phy_link_topology_core.h>
+
+struct xarray;
+struct phy_device;
+struct net_device;
+struct sfp_bus;
+
+struct phy_device_node {
+	enum phy_upstream upstream_type;
+
+	union {
+		struct net_device	*netdev;
+		struct phy_device	*phydev;
+	} upstream;
+
+	struct sfp_bus *parent_sfp_bus;
+
+	struct phy_device *phy;
+};
+
+static inline struct phy_device *
+phy_link_topo_get_phy(struct phy_link_topology *topo, u32 phyindex)
+{
+	struct phy_device_node *pdn = xa_load(&topo->phys, phyindex);
+
+	if (pdn)
+		return pdn->phy;
+
+	return NULL;
+}
+
+#if IS_ENABLED(CONFIG_PHYLIB)
+int phy_link_topo_add_phy(struct phy_link_topology *topo,
+			  struct phy_device *phy,
+			  enum phy_upstream upt, void *upstream);
+
+void phy_link_topo_del_phy(struct phy_link_topology *lt, struct phy_device *phy);
+
+#else
+static inline int phy_link_topo_add_phy(struct phy_link_topology *topo,
+					struct phy_device *phy,
+					enum phy_upstream upt, void *upstream)
+{
+	return 0;
+}
+
+static inline void phy_link_topo_del_phy(struct phy_link_topology *topo,
+					 struct phy_device *phy)
+{
+}
+#endif
+
+#endif /* __PHY_LINK_TOPOLOGY_H */
diff --git a/include/linux/phy_link_topology_core.h b/include/linux/phy_link_topology_core.h
new file mode 100644
index 000000000000..78c75f909489
--- /dev/null
+++ b/include/linux/phy_link_topology_core.h
@@ -0,0 +1,19 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef __PHY_LINK_TOPOLOGY_CORE_H
+#define __PHY_LINK_TOPOLOGY_CORE_H
+
+struct xarray;
+
+struct phy_link_topology {
+	struct xarray phys;
+
+	u32 next_phy_index;
+};
+
+static inline void phy_link_topo_init(struct phy_link_topology *topo)
+{
+	xa_init_flags(&topo->phys, XA_FLAGS_ALLOC1);
+	topo->next_phy_index = 1;
+}
+
+#endif /* __PHY_LINK_TOPOLOGY_CORE_H */
diff --git a/include/uapi/linux/ethtool.h b/include/uapi/linux/ethtool.h
index 85c412c23ab5..60801df9d8c0 100644
--- a/include/uapi/linux/ethtool.h
+++ b/include/uapi/linux/ethtool.h
@@ -2219,4 +2219,20 @@ struct ethtool_link_settings {
 	 * __u32 map_lp_advertising[link_mode_masks_nwords];
 	 */
 };
+
+/**
+ * enum phy_upstream - Represents the upstream component a given PHY device
+ * is connected to, as in what is on the other end of the MII bus. Most PHYs
+ * will be attached to an Ethernet MAC controller, but in some cases, there's
+ * an intermediate PHY used as a media-converter, which will driver another
+ * MII interface as its output.
+ * @PHY_UPSTREAM_MAC: Upstream component is a MAC (a switch port,
+ *		      or ethernet controller)
+ * @PHY_UPSTREAM_PHY: Upstream component is a PHY (likely a media converter)
+ */
+enum phy_upstream {
+	PHY_UPSTREAM_MAC,
+	PHY_UPSTREAM_PHY,
+};
+
 #endif /* _UAPI_LINUX_ETHTOOL_H */
diff --git a/net/core/dev.c b/net/core/dev.c
index f9d4b550ef4b..df04cbf77551 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -153,6 +153,7 @@
 #include <linux/prandom.h>
 #include <linux/once_lite.h>
 #include <net/netdev_rx_queue.h>
+#include <linux/phy_link_topology_core.h>
 
 #include "dev.h"
 #include "net-sysfs.h"
@@ -10875,6 +10876,8 @@ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
 #ifdef CONFIG_NET_SCHED
 	hash_init(dev->qdisc_hash);
 #endif
+	phy_link_topo_init(&dev->link_topo);
+
 	dev->priv_flags = IFF_XMIT_DST_RELEASE | IFF_XMIT_DST_RELEASE_PERM;
 	setup(dev);
 
-- 
cgit v1.2.3


From 9c5625f559ad6fe9f6f733c11475bf470e637d34 Mon Sep 17 00:00:00 2001
From: Maxime Chevallier <maxime.chevallier@bootlin.com>
Date: Thu, 21 Dec 2023 19:00:35 +0100
Subject: net: sfp: pass the phy_device when disconnecting an sfp module's PHY

Pass the phy_device as a parameter to the sfp upstream .disconnect_phy
operation. This is preparatory work to help track phy devices across
a net_device's link.

Signed-off-by: Maxime Chevallier <maxime.chevallier@bootlin.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/phy/phy_device.c | 8 ++++++++
 drivers/net/phy/phylink.c    | 3 ++-
 drivers/net/phy/sfp-bus.c    | 4 ++--
 include/linux/sfp.h          | 2 +-
 4 files changed, 13 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/phy/phy_device.c b/drivers/net/phy/phy_device.c
index ab8ae976a2f8..711629c49c11 100644
--- a/drivers/net/phy/phy_device.c
+++ b/drivers/net/phy/phy_device.c
@@ -266,6 +266,14 @@ static void phy_mdio_device_remove(struct mdio_device *mdiodev)
 
 static struct phy_driver genphy_driver;
 
+static struct phy_link_topology *phy_get_link_topology(struct phy_device *phydev)
+{
+	if (phydev->attached_dev)
+		return &phydev->attached_dev->link_topo;
+
+	return NULL;
+}
+
 static LIST_HEAD(phy_fixup_list);
 static DEFINE_MUTEX(phy_fixup_lock);
 
diff --git a/drivers/net/phy/phylink.c b/drivers/net/phy/phylink.c
index 298dfd6982a5..3d25a4a6212b 100644
--- a/drivers/net/phy/phylink.c
+++ b/drivers/net/phy/phylink.c
@@ -3319,7 +3319,8 @@ static int phylink_sfp_connect_phy(void *upstream, struct phy_device *phy)
 	return ret;
 }
 
-static void phylink_sfp_disconnect_phy(void *upstream)
+static void phylink_sfp_disconnect_phy(void *upstream,
+				       struct phy_device *phydev)
 {
 	phylink_disconnect_phy(upstream);
 }
diff --git a/drivers/net/phy/sfp-bus.c b/drivers/net/phy/sfp-bus.c
index 6fa679b36290..3a86c41e1235 100644
--- a/drivers/net/phy/sfp-bus.c
+++ b/drivers/net/phy/sfp-bus.c
@@ -486,7 +486,7 @@ static void sfp_unregister_bus(struct sfp_bus *bus)
 			bus->socket_ops->stop(bus->sfp);
 		bus->socket_ops->detach(bus->sfp);
 		if (bus->phydev && ops && ops->disconnect_phy)
-			ops->disconnect_phy(bus->upstream);
+			ops->disconnect_phy(bus->upstream, bus->phydev);
 	}
 	bus->registered = false;
 }
@@ -742,7 +742,7 @@ void sfp_remove_phy(struct sfp_bus *bus)
 	const struct sfp_upstream_ops *ops = sfp_get_upstream_ops(bus);
 
 	if (ops && ops->disconnect_phy)
-		ops->disconnect_phy(bus->upstream);
+		ops->disconnect_phy(bus->upstream, bus->phydev);
 	bus->phydev = NULL;
 }
 EXPORT_SYMBOL_GPL(sfp_remove_phy);
diff --git a/include/linux/sfp.h b/include/linux/sfp.h
index 9346cd44814d..0573e53b0c11 100644
--- a/include/linux/sfp.h
+++ b/include/linux/sfp.h
@@ -544,7 +544,7 @@ struct sfp_upstream_ops {
 	void (*link_down)(void *priv);
 	void (*link_up)(void *priv);
 	int (*connect_phy)(void *priv, struct phy_device *);
-	void (*disconnect_phy)(void *priv);
+	void (*disconnect_phy)(void *priv, struct phy_device *);
 };
 
 #if IS_ENABLED(CONFIG_SFP)
-- 
cgit v1.2.3


From 034fcc210349b873ece7356905be5c6ca11eef2a Mon Sep 17 00:00:00 2001
From: Maxime Chevallier <maxime.chevallier@bootlin.com>
Date: Thu, 21 Dec 2023 19:00:36 +0100
Subject: net: phy: add helpers to handle sfp phy connect/disconnect

There are a few PHY drivers that can handle SFP modules through their
sfp_upstream_ops. Introduce Phylib helpers to keep track of connected
SFP PHYs in a netdevice's namespace, by adding the SFP PHY to the
upstream PHY's netdev's namespace.

By doing so, these SFP PHYs can be enumerated and exposed to users,
which will be able to use their capabilities.

Signed-off-by: Maxime Chevallier <maxime.chevallier@bootlin.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/phy/at803x.c          |  2 ++
 drivers/net/phy/marvell-88x2222.c |  2 ++
 drivers/net/phy/marvell.c         |  2 ++
 drivers/net/phy/marvell10g.c      |  2 ++
 drivers/net/phy/phy_device.c      | 40 +++++++++++++++++++++++++++++++++++++++
 include/linux/phy.h               |  2 ++
 6 files changed, 50 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/net/phy/at803x.c b/drivers/net/phy/at803x.c
index 19cfbf36fe80..aaf6c654aaed 100644
--- a/drivers/net/phy/at803x.c
+++ b/drivers/net/phy/at803x.c
@@ -1452,6 +1452,8 @@ static const struct sfp_upstream_ops at8031_sfp_ops = {
 	.attach = phy_sfp_attach,
 	.detach = phy_sfp_detach,
 	.module_insert = at8031_sfp_insert,
+	.connect_phy = phy_sfp_connect_phy,
+	.disconnect_phy = phy_sfp_disconnect_phy,
 };
 
 static int at8031_parse_dt(struct phy_device *phydev)
diff --git a/drivers/net/phy/marvell-88x2222.c b/drivers/net/phy/marvell-88x2222.c
index e3aa30dad2e6..3f77bbc7e04f 100644
--- a/drivers/net/phy/marvell-88x2222.c
+++ b/drivers/net/phy/marvell-88x2222.c
@@ -555,6 +555,8 @@ static const struct sfp_upstream_ops sfp_phy_ops = {
 	.link_down = mv2222_sfp_link_down,
 	.attach = phy_sfp_attach,
 	.detach = phy_sfp_detach,
+	.connect_phy = phy_sfp_connect_phy,
+	.disconnect_phy = phy_sfp_disconnect_phy,
 };
 
 static int mv2222_probe(struct phy_device *phydev)
diff --git a/drivers/net/phy/marvell.c b/drivers/net/phy/marvell.c
index eba652a4c1d8..674e29bce2cc 100644
--- a/drivers/net/phy/marvell.c
+++ b/drivers/net/phy/marvell.c
@@ -3254,6 +3254,8 @@ static const struct sfp_upstream_ops m88e1510_sfp_ops = {
 	.module_remove = m88e1510_sfp_remove,
 	.attach = phy_sfp_attach,
 	.detach = phy_sfp_detach,
+	.connect_phy = phy_sfp_connect_phy,
+	.disconnect_phy = phy_sfp_disconnect_phy,
 };
 
 static int m88e1510_probe(struct phy_device *phydev)
diff --git a/drivers/net/phy/marvell10g.c b/drivers/net/phy/marvell10g.c
index ad43e280930c..6642eb642d4b 100644
--- a/drivers/net/phy/marvell10g.c
+++ b/drivers/net/phy/marvell10g.c
@@ -503,6 +503,8 @@ static int mv3310_sfp_insert(void *upstream, const struct sfp_eeprom_id *id)
 static const struct sfp_upstream_ops mv3310_sfp_ops = {
 	.attach = phy_sfp_attach,
 	.detach = phy_sfp_detach,
+	.connect_phy = phy_sfp_connect_phy,
+	.disconnect_phy = phy_sfp_disconnect_phy,
 	.module_insert = mv3310_sfp_insert,
 };
 
diff --git a/drivers/net/phy/phy_device.c b/drivers/net/phy/phy_device.c
index 711629c49c11..1e595762afea 100644
--- a/drivers/net/phy/phy_device.c
+++ b/drivers/net/phy/phy_device.c
@@ -1363,6 +1363,46 @@ phy_standalone_show(struct device *dev, struct device_attribute *attr,
 }
 static DEVICE_ATTR_RO(phy_standalone);
 
+/**
+ * phy_sfp_connect_phy - Connect the SFP module's PHY to the upstream PHY
+ * @upstream: pointer to the upstream phy device
+ * @phy: pointer to the SFP module's phy device
+ *
+ * This helper allows keeping track of PHY devices on the link. It adds the
+ * SFP module's phy to the phy namespace of the upstream phy
+ */
+int phy_sfp_connect_phy(void *upstream, struct phy_device *phy)
+{
+	struct phy_device *phydev = upstream;
+	struct phy_link_topology *topo = phy_get_link_topology(phydev);
+
+	if (topo)
+		return phy_link_topo_add_phy(topo, phy, PHY_UPSTREAM_PHY, phydev);
+
+	return 0;
+}
+EXPORT_SYMBOL(phy_sfp_connect_phy);
+
+/**
+ * phy_sfp_disconnect_phy - Disconnect the SFP module's PHY from the upstream PHY
+ * @upstream: pointer to the upstream phy device
+ * @phy: pointer to the SFP module's phy device
+ *
+ * This helper allows keeping track of PHY devices on the link. It removes the
+ * SFP module's phy to the phy namespace of the upstream phy. As the module phy
+ * will be destroyed, re-inserting the same module will add a new phy with a
+ * new index.
+ */
+void phy_sfp_disconnect_phy(void *upstream, struct phy_device *phy)
+{
+	struct phy_device *phydev = upstream;
+	struct phy_link_topology *topo = phy_get_link_topology(phydev);
+
+	if (topo)
+		phy_link_topo_del_phy(topo, phy);
+}
+EXPORT_SYMBOL(phy_sfp_disconnect_phy);
+
 /**
  * phy_sfp_attach - attach the SFP bus to the PHY upstream network device
  * @upstream: pointer to the phy device
diff --git a/include/linux/phy.h b/include/linux/phy.h
index ea9416797b89..ac22b8e28a85 100644
--- a/include/linux/phy.h
+++ b/include/linux/phy.h
@@ -1729,6 +1729,8 @@ int phy_suspend(struct phy_device *phydev);
 int phy_resume(struct phy_device *phydev);
 int __phy_resume(struct phy_device *phydev);
 int phy_loopback(struct phy_device *phydev, bool enable);
+int phy_sfp_connect_phy(void *upstream, struct phy_device *phy);
+void phy_sfp_disconnect_phy(void *upstream, struct phy_device *phy);
 void phy_sfp_attach(void *upstream, struct sfp_bus *bus);
 void phy_sfp_detach(void *upstream, struct sfp_bus *bus);
 int phy_sfp_probe(struct phy_device *phydev,
-- 
cgit v1.2.3


From dedd702a35793ab462fce4c737eeba0badf9718e Mon Sep 17 00:00:00 2001
From: Maxime Chevallier <maxime.chevallier@bootlin.com>
Date: Thu, 21 Dec 2023 19:00:37 +0100
Subject: net: sfp: Add helper to return the SFP bus name

Knowing the bus name is helpful when we want to expose the link topology
to userspace, add a helper to return the SFP bus name.

Signed-off-by: Maxime Chevallier <maxime.chevallier@bootlin.com>
Reviewed-by: Andrew Lunn <andrew@lunn.ch>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/phy/sfp-bus.c | 11 +++++++++++
 include/linux/sfp.h       |  6 ++++++
 2 files changed, 17 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/net/phy/sfp-bus.c b/drivers/net/phy/sfp-bus.c
index 3a86c41e1235..fb1c102714b5 100644
--- a/drivers/net/phy/sfp-bus.c
+++ b/drivers/net/phy/sfp-bus.c
@@ -859,3 +859,14 @@ void sfp_unregister_socket(struct sfp_bus *bus)
 	sfp_bus_put(bus);
 }
 EXPORT_SYMBOL_GPL(sfp_unregister_socket);
+
+const char *sfp_get_name(struct sfp_bus *bus)
+{
+	ASSERT_RTNL();
+
+	if (bus->sfp_dev)
+		return dev_name(bus->sfp_dev);
+
+	return NULL;
+}
+EXPORT_SYMBOL_GPL(sfp_get_name);
diff --git a/include/linux/sfp.h b/include/linux/sfp.h
index 0573e53b0c11..55c0ab17c9e2 100644
--- a/include/linux/sfp.h
+++ b/include/linux/sfp.h
@@ -570,6 +570,7 @@ struct sfp_bus *sfp_bus_find_fwnode(const struct fwnode_handle *fwnode);
 int sfp_bus_add_upstream(struct sfp_bus *bus, void *upstream,
 			 const struct sfp_upstream_ops *ops);
 void sfp_bus_del_upstream(struct sfp_bus *bus);
+const char *sfp_get_name(struct sfp_bus *bus);
 #else
 static inline int sfp_parse_port(struct sfp_bus *bus,
 				 const struct sfp_eeprom_id *id,
@@ -648,6 +649,11 @@ static inline int sfp_bus_add_upstream(struct sfp_bus *bus, void *upstream,
 static inline void sfp_bus_del_upstream(struct sfp_bus *bus)
 {
 }
+
+static inline const char *sfp_get_name(struct sfp_bus *bus)
+{
+	return NULL;
+}
 #endif
 
 #endif
-- 
cgit v1.2.3


From 5a0e241003b80247de59727c945bc94c848f893d Mon Sep 17 00:00:00 2001
From: Fabio Estevam <festevam@denx.de>
Date: Wed, 29 Nov 2023 09:43:28 -0300
Subject: thermal/core: Prepare for introduction of thermal reboot

Add some helper functions to make it easier introducing the support
for thermal reboot.

No functional change.

Signed-off-by: Fabio Estevam <festevam@denx.de>
Signed-off-by: Daniel Lezcano <daniel.lezcano@linaro.org>
Link: https://lore.kernel.org/r/20231129124330.519423-2-festevam@gmail.com
---
 drivers/thermal/thermal_core.c | 14 ++++++++++----
 include/linux/reboot.h         |  7 ++++++-
 kernel/reboot.c                |  8 ++++----
 3 files changed, 20 insertions(+), 9 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/thermal/thermal_core.c b/drivers/thermal/thermal_core.c
index 5e5fcbd81dda..859f62e9d779 100644
--- a/drivers/thermal/thermal_core.c
+++ b/drivers/thermal/thermal_core.c
@@ -311,18 +311,24 @@ static void handle_non_critical_trips(struct thermal_zone_device *tz,
 		       def_governor->throttle(tz, trip);
 }
 
-void thermal_zone_device_critical(struct thermal_zone_device *tz)
+static void thermal_zone_device_halt(struct thermal_zone_device *tz, bool shutdown)
 {
 	/*
 	 * poweroff_delay_ms must be a carefully profiled positive value.
 	 * Its a must for forced_emergency_poweroff_work to be scheduled.
 	 */
 	int poweroff_delay_ms = CONFIG_THERMAL_EMERGENCY_POWEROFF_DELAY_MS;
+	const char *msg = "Temperature too high";
+
+	dev_emerg(&tz->device, "%s: critical temperature reached\n", tz->type);
 
-	dev_emerg(&tz->device, "%s: critical temperature reached, "
-		  "shutting down\n", tz->type);
+	if (shutdown)
+		hw_protection_shutdown(msg, poweroff_delay_ms);
+}
 
-	hw_protection_shutdown("Temperature too high", poweroff_delay_ms);
+void thermal_zone_device_critical(struct thermal_zone_device *tz)
+{
+	thermal_zone_device_halt(tz, true);
 }
 EXPORT_SYMBOL(thermal_zone_device_critical);
 
diff --git a/include/linux/reboot.h b/include/linux/reboot.h
index c4cc3b89ced1..4586c663884e 100644
--- a/include/linux/reboot.h
+++ b/include/linux/reboot.h
@@ -177,7 +177,12 @@ void ctrl_alt_del(void);
 
 extern void orderly_poweroff(bool force);
 extern void orderly_reboot(void);
-void hw_protection_shutdown(const char *reason, int ms_until_forced);
+void __hw_protection_shutdown(const char *reason, int ms_until_forced, bool shutdown);
+
+static inline void hw_protection_shutdown(const char *reason, int ms_until_forced)
+{
+	__hw_protection_shutdown(reason, ms_until_forced, true);
+}
 
 /*
  * Emergency restart, callable from an interrupt handler.
diff --git a/kernel/reboot.c b/kernel/reboot.c
index 395a0ea3c7a8..b236c4c06bb3 100644
--- a/kernel/reboot.c
+++ b/kernel/reboot.c
@@ -957,7 +957,7 @@ static void hw_failure_emergency_poweroff(int poweroff_delay_ms)
 }
 
 /**
- * hw_protection_shutdown - Trigger an emergency system poweroff
+ * __hw_protection_shutdown - Trigger an emergency system poweroff
  *
  * @reason:		Reason of emergency shutdown to be printed.
  * @ms_until_forced:	Time to wait for orderly shutdown before tiggering a
@@ -971,7 +971,7 @@ static void hw_failure_emergency_poweroff(int poweroff_delay_ms)
  * if the previous request has given a large timeout for forced shutdown.
  * Can be called from any context.
  */
-void hw_protection_shutdown(const char *reason, int ms_until_forced)
+void __hw_protection_shutdown(const char *reason, int ms_until_forced, bool shutdown)
 {
 	static atomic_t allow_proceed = ATOMIC_INIT(1);
 
@@ -986,9 +986,9 @@ void hw_protection_shutdown(const char *reason, int ms_until_forced)
 	 * orderly_poweroff failure
 	 */
 	hw_failure_emergency_poweroff(ms_until_forced);
-	orderly_poweroff(true);
+	if (shutdown)
+		orderly_poweroff(true);
 }
-EXPORT_SYMBOL_GPL(hw_protection_shutdown);
 
 static int __init reboot_setup(char *str)
 {
-- 
cgit v1.2.3


From 79fa723ba84c2b1b3124c72df8a3b07b851a5477 Mon Sep 17 00:00:00 2001
From: Fabio Estevam <festevam@denx.de>
Date: Wed, 29 Nov 2023 09:43:29 -0300
Subject: reboot: Introduce thermal_zone_device_critical_reboot()

Introduce thermal_zone_device_critical_reboot() to trigger an
emergency reboot.

It is a counterpart of thermal_zone_device_critical() with the
difference that it will force a reboot instead of shutdown.

The motivation for doing this is to allow the thermal subystem
to trigger a reboot when the temperature reaches the critical
temperature.

Signed-off-by: Fabio Estevam <festevam@denx.de>
Signed-off-by: Daniel Lezcano <daniel.lezcano@linaro.org>
Link: https://lore.kernel.org/r/20231129124330.519423-3-festevam@gmail.com
---
 drivers/thermal/thermal_core.c |  7 +++++++
 drivers/thermal/thermal_core.h |  1 +
 include/linux/reboot.h         |  5 +++++
 kernel/reboot.c                | 28 +++++++++++++++++-----------
 4 files changed, 30 insertions(+), 11 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/thermal/thermal_core.c b/drivers/thermal/thermal_core.c
index 859f62e9d779..0d761afb7cbc 100644
--- a/drivers/thermal/thermal_core.c
+++ b/drivers/thermal/thermal_core.c
@@ -324,6 +324,8 @@ static void thermal_zone_device_halt(struct thermal_zone_device *tz, bool shutdo
 
 	if (shutdown)
 		hw_protection_shutdown(msg, poweroff_delay_ms);
+	else
+		hw_protection_reboot(msg, poweroff_delay_ms);
 }
 
 void thermal_zone_device_critical(struct thermal_zone_device *tz)
@@ -332,6 +334,11 @@ void thermal_zone_device_critical(struct thermal_zone_device *tz)
 }
 EXPORT_SYMBOL(thermal_zone_device_critical);
 
+void thermal_zone_device_critical_reboot(struct thermal_zone_device *tz)
+{
+	thermal_zone_device_halt(tz, false);
+}
+
 static void handle_critical_trips(struct thermal_zone_device *tz,
 				  const struct thermal_trip *trip)
 {
diff --git a/drivers/thermal/thermal_core.h b/drivers/thermal/thermal_core.h
index fe2917a74054..b5e6743bd157 100644
--- a/drivers/thermal/thermal_core.h
+++ b/drivers/thermal/thermal_core.h
@@ -114,6 +114,7 @@ int thermal_zone_device_set_policy(struct thermal_zone_device *, char *);
 int thermal_build_list_of_policies(char *buf);
 void __thermal_zone_device_update(struct thermal_zone_device *tz,
 				  enum thermal_notify_event event);
+void thermal_zone_device_critical_reboot(struct thermal_zone_device *tz);
 
 /* Helpers */
 #define for_each_trip(__tz, __trip)	\
diff --git a/include/linux/reboot.h b/include/linux/reboot.h
index 4586c663884e..abcdde4df697 100644
--- a/include/linux/reboot.h
+++ b/include/linux/reboot.h
@@ -179,6 +179,11 @@ extern void orderly_poweroff(bool force);
 extern void orderly_reboot(void);
 void __hw_protection_shutdown(const char *reason, int ms_until_forced, bool shutdown);
 
+static inline void hw_protection_reboot(const char *reason, int ms_until_forced)
+{
+	__hw_protection_shutdown(reason, ms_until_forced, false);
+}
+
 static inline void hw_protection_shutdown(const char *reason, int ms_until_forced)
 {
 	__hw_protection_shutdown(reason, ms_until_forced, true);
diff --git a/kernel/reboot.c b/kernel/reboot.c
index b236c4c06bb3..35d5e0b67993 100644
--- a/kernel/reboot.c
+++ b/kernel/reboot.c
@@ -957,19 +957,22 @@ static void hw_failure_emergency_poweroff(int poweroff_delay_ms)
 }
 
 /**
- * __hw_protection_shutdown - Trigger an emergency system poweroff
+ * __hw_protection_shutdown - Trigger an emergency system shutdown or reboot
  *
- * @reason:		Reason of emergency shutdown to be printed.
- * @ms_until_forced:	Time to wait for orderly shutdown before tiggering a
- *			forced shudown. Negative value disables the forced
- *			shutdown.
+ * @reason:		Reason of emergency shutdown or reboot to be printed.
+ * @ms_until_forced:	Time to wait for orderly shutdown or reboot before
+ *			triggering it. Negative value disables the forced
+ *			shutdown or reboot.
+ * @shutdown:		If true, indicates that a shutdown will happen
+ *			after the critical tempeature is reached.
+ *			If false, indicates that a reboot will happen
+ *			after the critical tempeature is reached.
  *
- * Initiate an emergency system shutdown in order to protect hardware from
- * further damage. Usage examples include a thermal protection or a voltage or
- * current regulator failures.
- * NOTE: The request is ignored if protection shutdown is already pending even
- * if the previous request has given a large timeout for forced shutdown.
- * Can be called from any context.
+ * Initiate an emergency system shutdown or reboot in order to protect
+ * hardware from further damage. Usage examples include a thermal protection.
+ * NOTE: The request is ignored if protection shutdown or reboot is already
+ * pending even if the previous request has given a large timeout for forced
+ * shutdown/reboot.
  */
 void __hw_protection_shutdown(const char *reason, int ms_until_forced, bool shutdown)
 {
@@ -988,7 +991,10 @@ void __hw_protection_shutdown(const char *reason, int ms_until_forced, bool shut
 	hw_failure_emergency_poweroff(ms_until_forced);
 	if (shutdown)
 		orderly_poweroff(true);
+	else
+		orderly_reboot();
 }
+EXPORT_SYMBOL_GPL(__hw_protection_shutdown);
 
 static int __init reboot_setup(char *str)
 {
-- 
cgit v1.2.3


From 37d158d0b05144f696323ae5bbfe1e137f7c06d3 Mon Sep 17 00:00:00 2001
From: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Date: Wed, 20 Dec 2023 08:38:46 +0100
Subject: HID: make hid_bus_type const

Now that the driver core can properly handle constant struct bus_type,
move the hid_bus_type variable to be a constant structure as well,
placing it into read-only memory which can not be modified at runtime.

Cc: Jiri Kosina <jikos@kernel.org>
Cc: Benjamin Tissoires <benjamin.tissoires@redhat.com>
Cc: linux-input@vger.kernel.org
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Signed-off-by: Jiri Kosina <jkosina@suse.com>
---
 drivers/hid/hid-core.c | 2 +-
 include/linux/hid.h    | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/hid/hid-core.c b/drivers/hid/hid-core.c
index e0181218ad85..de7a477d6665 100644
--- a/drivers/hid/hid-core.c
+++ b/drivers/hid/hid-core.c
@@ -2749,7 +2749,7 @@ static int hid_uevent(const struct device *dev, struct kobj_uevent_env *env)
 	return 0;
 }
 
-struct bus_type hid_bus_type = {
+const struct bus_type hid_bus_type = {
 	.name		= "hid",
 	.dev_groups	= hid_dev_groups,
 	.drv_groups	= hid_drv_groups,
diff --git a/include/linux/hid.h b/include/linux/hid.h
index bf43f3ff6664..7c26db874ff0 100644
--- a/include/linux/hid.h
+++ b/include/linux/hid.h
@@ -912,7 +912,7 @@ extern bool hid_ignore(struct hid_device *);
 extern int hid_add_device(struct hid_device *);
 extern void hid_destroy_device(struct hid_device *);
 
-extern struct bus_type hid_bus_type;
+extern const struct bus_type hid_bus_type;
 
 extern int __must_check __hid_register_driver(struct hid_driver *,
 		struct module *, const char *mod_name);
-- 
cgit v1.2.3


From 9b0a3839e8d29663cd9ee2c43d38b06c3b91619e Mon Sep 17 00:00:00 2001
From: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Date: Wed, 20 Dec 2023 08:38:48 +0100
Subject: HID: bpf: make bus_type const in struct hid_bpf_ops

The struct bus_type pointer in hid_bpf_ops just passes the pointer to
the driver core, and the driver core can handle, and expects, a constant
pointer, so also make the pointer constant in hid_bpf_ops.

Part of the process of moving all usages of struct bus_type to be
constant to move them all to read-only memory.

Cc: Jiri Kosina <jikos@kernel.org>
Cc: Benjamin Tissoires <benjamin.tissoires@redhat.com>
Cc: linux-input@vger.kernel.org
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Signed-off-by: Jiri Kosina <jkosina@suse.com>
---
 include/linux/hid_bpf.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/hid_bpf.h b/include/linux/hid_bpf.h
index e9afb61e6ee0..840cd254172d 100644
--- a/include/linux/hid_bpf.h
+++ b/include/linux/hid_bpf.h
@@ -115,7 +115,7 @@ struct hid_bpf_ops {
 				  size_t len, enum hid_report_type rtype,
 				  enum hid_class_request reqtype);
 	struct module *owner;
-	struct bus_type *bus_type;
+	const struct bus_type *bus_type;
 };
 
 extern struct hid_bpf_ops *hid_bpf_ops;
-- 
cgit v1.2.3


From 87d8f1ee1d4070571095859d310f6951440f288c Mon Sep 17 00:00:00 2001
From: Randy Dunlap <rdunlap@infradead.org>
Date: Sat, 23 Dec 2023 11:43:19 -0800
Subject: platform/x86: wmi: linux/wmi.h: fix Excess kernel-doc description
 warning
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Remove the "private:" comment to prevent the kernel-doc warning:

include/linux/wmi.h:27: warning: Excess struct member 'setable' description in 'wmi_device'

Either a struct member is documented (via kernel-doc) or it's private,
but not both.

Fixes: b4cc979588ee ("platform/x86: wmi: Add kernel doc comments")
Signed-off-by: Randy Dunlap <rdunlap@infradead.org>
Cc: Armin Wolf <W_Armin@gmx.de>
Cc: Hans de Goede <hdegoede@redhat.com>
Cc: Ilpo Järvinen <ilpo.jarvinen@linux.intel.com>
Cc: platform-driver-x86@vger.kernel.org
Reviewed-by: Armin Wolf <W_Armin@gmx.de>
Link: https://lore.kernel.org/r/20231223194321.23084-1-rdunlap@infradead.org
Signed-off-by: Hans de Goede <hdegoede@redhat.com>
---
 include/linux/wmi.h | 2 --
 1 file changed, 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/wmi.h b/include/linux/wmi.h
index 50f7f1e4fd4f..686291b87852 100644
--- a/include/linux/wmi.h
+++ b/include/linux/wmi.h
@@ -21,8 +21,6 @@
  */
 struct wmi_device {
 	struct device dev;
-
-	/* private: used by the WMI driver core */
 	bool setable;
 };
 
-- 
cgit v1.2.3


From 993498e537af9260e697219ce41b41b22b6199cc Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Thu, 21 Dec 2023 14:07:47 +0000
Subject: net-device: move gso_partial_features to net_device_read_tx

dev->gso_partial_features is read from tx fast path for GSO packets.

Move it to appropriate section to avoid a cache line miss.

Fixes: 43a71cd66b9c ("net-device: reorganize net_device fast path variables")
Signed-off-by: Eric Dumazet <edumazet@google.com>
Cc: Coco Li <lixiaoyan@google.com>
Cc: David Ahern <dsahern@kernel.org>
Reviewed-by: Simon Horman <horms@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 Documentation/networking/net_cachelines/net_device.rst | 2 +-
 include/linux/netdevice.h                              | 2 +-
 net/core/dev.c                                         | 3 ++-
 3 files changed, 4 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/networking/net_cachelines/net_device.rst b/Documentation/networking/net_cachelines/net_device.rst
index 6cab1b797739..2dd8d8f20da2 100644
--- a/Documentation/networking/net_cachelines/net_device.rst
+++ b/Documentation/networking/net_cachelines/net_device.rst
@@ -38,7 +38,7 @@ netdev_features_t                   wanted_features
 netdev_features_t                   vlan_features                                                   
 netdev_features_t                   hw_enc_features         -                   -                   netif_skb_features
 netdev_features_t                   mpls_features                                                   
-netdev_features_t                   gso_partial_features                                            
+netdev_features_t                   gso_partial_features    read_mostly                             gso_features_check
 unsigned_int                        min_mtu                                                         
 unsigned_int                        max_mtu                                                         
 unsigned_short                      type                                                            
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 5baa5517f533..d59db9adcc96 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -2115,6 +2115,7 @@ struct net_device {
 	const struct net_device_ops *netdev_ops;
 	const struct header_ops *header_ops;
 	struct netdev_queue	*_tx;
+	netdev_features_t	gso_partial_features;
 	unsigned int		real_num_tx_queues;
 	unsigned int		gso_max_size;
 	unsigned int		gso_ipv4_max_size;
@@ -2211,7 +2212,6 @@ struct net_device {
 	netdev_features_t	vlan_features;
 	netdev_features_t	hw_enc_features;
 	netdev_features_t	mpls_features;
-	netdev_features_t	gso_partial_features;
 
 	unsigned int		min_mtu;
 	unsigned int		max_mtu;
diff --git a/net/core/dev.c b/net/core/dev.c
index df04cbf77551..31588a50b757 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -11629,6 +11629,7 @@ static void __init net_dev_struct_check(void)
 	CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, gso_max_size);
 	CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, gso_ipv4_max_size);
 	CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, gso_max_segs);
+	CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, gso_partial_features);
 	CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, num_tc);
 	CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, mtu);
 	CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, needed_headroom);
@@ -11642,7 +11643,7 @@ static void __init net_dev_struct_check(void)
 #ifdef CONFIG_NET_XGRESS
 	CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, tcx_egress);
 #endif
-	CACHELINE_ASSERT_GROUP_SIZE(struct net_device, net_device_read_tx, 152);
+	CACHELINE_ASSERT_GROUP_SIZE(struct net_device, net_device_read_tx, 160);
 
 	/* TXRX read-mostly hotpath */
 	CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_txrx, flags);
-- 
cgit v1.2.3


From 059d37b718d38d26087121c754691df77acfc66b Mon Sep 17 00:00:00 2001
From: Randy Dunlap <rdunlap@infradead.org>
Date: Fri, 22 Dec 2023 21:06:13 -0800
Subject: net: phy: linux/phy.h: fix Excess kernel-doc description warning

Remove the @phy_timer: line to prevent the kernel-doc warning:

include/linux/phy.h:768: warning: Excess struct member 'phy_timer' description in 'phy_device'

Signed-off-by: Randy Dunlap <rdunlap@infradead.org>
Cc: Andrew Lunn <andrew@lunn.ch>
Cc: Heiner Kallweit <hkallweit1@gmail.com>
Cc: Russell King <linux@armlinux.org.uk>
Cc: netdev@vger.kernel.org
Reviewed-by: Russell King (Oracle) <rmk+kernel@armlinux.org.uk>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/phy.h | 1 -
 1 file changed, 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/phy.h b/include/linux/phy.h
index 3cc52826f18e..bd285950972c 100644
--- a/include/linux/phy.h
+++ b/include/linux/phy.h
@@ -568,7 +568,6 @@ struct macsec_ops;
  *      - Bits [31:24] are reserved for defining generic
  *        PHY driver behavior.
  * @irq: IRQ number of the PHY's interrupt (-1 if none)
- * @phy_timer: The timer for handling the state machine
  * @phylink: Pointer to phylink instance for this PHY
  * @sfp_bus_attached: Flag indicating whether the SFP bus has been attached
  * @sfp_bus: SFP bus attached to this PHY's fiber port
-- 
cgit v1.2.3


From e4df56ad0bf3506c5189abb9be83f3bea05a4c4f Mon Sep 17 00:00:00 2001
From: Lin Gui <lin.gui@mediatek.com>
Date: Tue, 19 Dec 2023 07:05:32 +0800
Subject: mmc: core: Add wp_grp_size sysfs node

The eMMC card can be set into write-protected mode to prevent data from
being accidentally modified or deleted. Wp_grp_size (Write Protect Group
Size) refers to an attribute of the eMMC card, used to manage write
protection and is the CSD register [36:32] of the eMMC device. Wp_grp_size
(Write Protect Group Size) indicates how many eMMC blocks are contained in
each write protection group on the eMMC card.

To allow userspace easy access of the CSD register bits, let's add sysfs
node "wp_grp_size".

Signed-off-by: Lin Gui <lin.gui@mediatek.com>
Signed-off-by: Bo Ye <bo.ye@mediatek.com>
Reviewed-by: AngeloGioacchino Del Regno <angelogioacchino.delregno@collabora.com>
Link: https://lore.kernel.org/r/20231218230532.82427-1-bo.ye@mediatek.com
Signed-off-by: Ulf Hansson <ulf.hansson@linaro.org>
---
 drivers/mmc/core/mmc.c   | 16 +++++++++++++++-
 include/linux/mmc/card.h |  2 ++
 2 files changed, 17 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/drivers/mmc/core/mmc.c b/drivers/mmc/core/mmc.c
index 7b996db570c9..adf7b70cf1b6 100644
--- a/drivers/mmc/core/mmc.c
+++ b/drivers/mmc/core/mmc.c
@@ -136,6 +136,17 @@ static void mmc_set_erase_size(struct mmc_card *card)
 	mmc_init_erase(card);
 }
 
+
+static void mmc_set_wp_grp_size(struct mmc_card *card)
+{
+	if (card->ext_csd.erase_group_def & 1)
+		card->wp_grp_size = card->ext_csd.hc_erase_size *
+			card->ext_csd.raw_hc_erase_gap_size;
+	else
+		card->wp_grp_size = card->csd.erase_size *
+			(card->csd.wp_grp_size + 1);
+}
+
 /*
  * Given a 128-bit response, decode to our card CSD structure.
  */
@@ -186,6 +197,7 @@ static int mmc_decode_csd(struct mmc_card *card)
 		b = UNSTUFF_BITS(resp, 37, 5);
 		csd->erase_size = (a + 1) * (b + 1);
 		csd->erase_size <<= csd->write_blkbits - 9;
+		csd->wp_grp_size = UNSTUFF_BITS(resp, 32, 5);
 	}
 
 	return 0;
@@ -785,6 +797,7 @@ MMC_DEV_ATTR(csd, "%08x%08x%08x%08x\n", card->raw_csd[0], card->raw_csd[1],
 MMC_DEV_ATTR(date, "%02d/%04d\n", card->cid.month, card->cid.year);
 MMC_DEV_ATTR(erase_size, "%u\n", card->erase_size << 9);
 MMC_DEV_ATTR(preferred_erase_size, "%u\n", card->pref_erase << 9);
+MMC_DEV_ATTR(wp_grp_size, "%u\n", card->wp_grp_size << 9);
 MMC_DEV_ATTR(ffu_capable, "%d\n", card->ext_csd.ffu_capable);
 MMC_DEV_ATTR(hwrev, "0x%x\n", card->cid.hwrev);
 MMC_DEV_ATTR(manfid, "0x%06x\n", card->cid.manfid);
@@ -845,6 +858,7 @@ static struct attribute *mmc_std_attrs[] = {
 	&dev_attr_date.attr,
 	&dev_attr_erase_size.attr,
 	&dev_attr_preferred_erase_size.attr,
+	&dev_attr_wp_grp_size.attr,
 	&dev_attr_fwrev.attr,
 	&dev_attr_ffu_capable.attr,
 	&dev_attr_hwrev.attr,
@@ -1759,7 +1773,7 @@ static int mmc_init_card(struct mmc_host *host, u32 ocr,
 			mmc_set_erase_size(card);
 		}
 	}
-
+	mmc_set_wp_grp_size(card);
 	/*
 	 * Ensure eMMC user default partition is enabled
 	 */
diff --git a/include/linux/mmc/card.h b/include/linux/mmc/card.h
index 47eeb122524c..f34407cc2788 100644
--- a/include/linux/mmc/card.h
+++ b/include/linux/mmc/card.h
@@ -32,6 +32,7 @@ struct mmc_csd {
 	unsigned int		r2w_factor;
 	unsigned int		max_dtr;
 	unsigned int		erase_size;		/* In sectors */
+	unsigned int		wp_grp_size;
 	unsigned int		read_blkbits;
 	unsigned int		write_blkbits;
 	unsigned int		capacity;
@@ -303,6 +304,7 @@ struct mmc_card {
 	unsigned int		eg_boundary;	/* don't cross erase-group boundaries */
 	unsigned int		erase_arg;	/* erase / trim / discard */
  	u8			erased_byte;	/* value of erased bytes */
+	unsigned int		wp_grp_size;	/* write group size in sectors */
 
 	u32			raw_cid[4];	/* raw card CID */
 	u32			raw_csd[4];	/* raw card CSD */
-- 
cgit v1.2.3


From 3e64db35bc37edbe9e37aaa987df92cde12ddb6c Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <kuba@kernel.org>
Date: Tue, 2 Jan 2024 14:23:34 -0800
Subject: Revert "net: mdio: get/put device node during (un)registration"

This reverts commit cff9c565e65f3622e8dc1dcc21c1520a083dff35.

Revert based on feedback from Russell.

Link: https://lore.kernel.org/all/ZZPtUIRerqTI2%2Fyh@shell.armlinux.org.uk/
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 drivers/net/mdio/of_mdio.c | 12 +-----------
 drivers/net/phy/mdio_bus.c |  3 ---
 include/linux/phy.h        |  3 ---
 3 files changed, 1 insertion(+), 17 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/mdio/of_mdio.c b/drivers/net/mdio/of_mdio.c
index 9b6cab6154e0..64ebcb6d235c 100644
--- a/drivers/net/mdio/of_mdio.c
+++ b/drivers/net/mdio/of_mdio.c
@@ -139,11 +139,6 @@ bool of_mdiobus_child_is_phy(struct device_node *child)
 }
 EXPORT_SYMBOL(of_mdiobus_child_is_phy);
 
-static void __of_mdiobus_unregister_callback(struct mii_bus *mdio)
-{
-	of_node_put(mdio->dev.of_node);
-}
-
 /**
  * __of_mdiobus_register - Register mii_bus and create PHYs from the device tree
  * @mdio: pointer to mii_bus structure
@@ -171,8 +166,6 @@ int __of_mdiobus_register(struct mii_bus *mdio, struct device_node *np,
 	 * the device tree are populated after the bus has been registered */
 	mdio->phy_mask = ~0;
 
-	mdio->__unregister_callback = __of_mdiobus_unregister_callback;
-	of_node_get(np);
 	device_set_node(&mdio->dev, of_fwnode_handle(np));
 
 	/* Get bus level PHY reset GPIO details */
@@ -184,7 +177,7 @@ int __of_mdiobus_register(struct mii_bus *mdio, struct device_node *np,
 	/* Register the MDIO bus */
 	rc = __mdiobus_register(mdio, owner);
 	if (rc)
-		goto put_node;
+		return rc;
 
 	/* Loop over the child nodes and register a phy_device for each phy */
 	for_each_available_child_of_node(np, child) {
@@ -244,9 +237,6 @@ int __of_mdiobus_register(struct mii_bus *mdio, struct device_node *np,
 unregister:
 	of_node_put(child);
 	mdiobus_unregister(mdio);
-
-put_node:
-	of_node_put(np);
 	return rc;
 }
 EXPORT_SYMBOL(__of_mdiobus_register);
diff --git a/drivers/net/phy/mdio_bus.c b/drivers/net/phy/mdio_bus.c
index 4a30757c4ff8..6cf73c15635b 100644
--- a/drivers/net/phy/mdio_bus.c
+++ b/drivers/net/phy/mdio_bus.c
@@ -787,9 +787,6 @@ void mdiobus_unregister(struct mii_bus *bus)
 		gpiod_set_value_cansleep(bus->reset_gpiod, 1);
 
 	device_del(&bus->dev);
-
-	if (bus->__unregister_callback)
-		bus->__unregister_callback(bus);
 }
 EXPORT_SYMBOL(mdiobus_unregister);
 
diff --git a/include/linux/phy.h b/include/linux/phy.h
index ac22b8e28a85..6cb9d843aee9 100644
--- a/include/linux/phy.h
+++ b/include/linux/phy.h
@@ -434,9 +434,6 @@ struct mii_bus {
 
 	/** @shared: shared state across different PHYs */
 	struct phy_package_shared *shared[PHY_MAX_ADDR];
-
-	/** @__unregister_callback: called at the last step of unregistration */
-	void (*__unregister_callback)(struct mii_bus *bus);
 };
 #define to_mii_bus(d) container_of(d, struct mii_bus, dev)
 
-- 
cgit v1.2.3


From db02e176f597a14eb696141ffa008c2429453a15 Mon Sep 17 00:00:00 2001
From: Bjorn Helgaas <bhelgaas@google.com>
Date: Wed, 6 Dec 2023 16:42:31 -0600
Subject: PCI/AER: Use explicit register sizes for struct members

aer_irq() reads the AER Root Error Status and Error Source Identification
(PCI_ERR_ROOT_STATUS and PCI_ERR_ROOT_ERR_SRC) registers directly into
struct aer_err_source.  Both registers are 32 bits, so declare the members
explicitly as "u32" instead of "unsigned int".

Similarly, aer_get_device_error_info() reads the AER Header Log
(PCI_ERR_HEADER_LOG) registers, which are also 32 bits, into struct
aer_header_log_regs.  Declare those members as "u32" as well.

No functional changes intended.

Link: https://lore.kernel.org/r/20231206224231.732765-4-helgaas@kernel.org
Signed-off-by: Bjorn Helgaas <bhelgaas@google.com>
Reviewed-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
---
 drivers/pci/pcie/aer.c | 4 ++--
 include/linux/aer.h    | 8 ++++----
 2 files changed, 6 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/pci/pcie/aer.c b/drivers/pci/pcie/aer.c
index 412b00bdf918..05fc30bb5134 100644
--- a/drivers/pci/pcie/aer.c
+++ b/drivers/pci/pcie/aer.c
@@ -41,8 +41,8 @@
 #define AER_MAX_TYPEOF_UNCOR_ERRS	27	/* as per PCI_ERR_UNCOR_STATUS*/
 
 struct aer_err_source {
-	unsigned int status;
-	unsigned int id;
+	u32 status;			/* PCI_ERR_ROOT_STATUS */
+	u32 id;				/* PCI_ERR_ROOT_ERR_SRC */
 };
 
 struct aer_rpc {
diff --git a/include/linux/aer.h b/include/linux/aer.h
index f6ea2f57d808..ae0fae70d4bd 100644
--- a/include/linux/aer.h
+++ b/include/linux/aer.h
@@ -19,10 +19,10 @@
 struct pci_dev;
 
 struct aer_header_log_regs {
-	unsigned int dw0;
-	unsigned int dw1;
-	unsigned int dw2;
-	unsigned int dw3;
+	u32 dw0;
+	u32 dw1;
+	u32 dw2;
+	u32 dw3;
 };
 
 struct aer_capability_regs {
-- 
cgit v1.2.3


From 7d4b5d7a37bdd63a5a3371b988744b060d5bb86f Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rafael.j.wysocki@intel.com>
Date: Wed, 27 Dec 2023 21:38:23 +0100
Subject: async: Introduce async_schedule_dev_nocall()

In preparation for subsequent changes, introduce a specialized variant
of async_schedule_dev() that will not invoke the argument function
synchronously when it cannot be scheduled for asynchronous execution.

The new function, async_schedule_dev_nocall(), will be used for fixing
possible deadlocks in the system-wide power management core code.

Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Reviewed-by: Stanislaw Gruszka <stanislaw.gruszka@linux.intel.com> for the series.
Tested-by: Youngmin Nam <youngmin.nam@samsung.com>
Reviewed-by: Ulf Hansson <ulf.hansson@linaro.org>
---
 include/linux/async.h |  2 ++
 kernel/async.c        | 29 +++++++++++++++++++++++++++++
 2 files changed, 31 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/async.h b/include/linux/async.h
index cce4ad31e8fc..33c9ff4afb49 100644
--- a/include/linux/async.h
+++ b/include/linux/async.h
@@ -90,6 +90,8 @@ async_schedule_dev(async_func_t func, struct device *dev)
 	return async_schedule_node(func, dev, dev_to_node(dev));
 }
 
+bool async_schedule_dev_nocall(async_func_t func, struct device *dev);
+
 /**
  * async_schedule_dev_domain - A device specific version of async_schedule_domain
  * @func: function to execute asynchronously
diff --git a/kernel/async.c b/kernel/async.c
index cffe6b4cff9f..673bba6bdf3a 100644
--- a/kernel/async.c
+++ b/kernel/async.c
@@ -243,6 +243,35 @@ async_cookie_t async_schedule_node(async_func_t func, void *data, int node)
 }
 EXPORT_SYMBOL_GPL(async_schedule_node);
 
+/**
+ * async_schedule_dev_nocall - A simplified variant of async_schedule_dev()
+ * @func: function to execute asynchronously
+ * @dev: device argument to be passed to function
+ *
+ * @dev is used as both the argument for the function and to provide NUMA
+ * context for where to run the function.
+ *
+ * If the asynchronous execution of @func is scheduled successfully, return
+ * true. Otherwise, do nothing and return false, unlike async_schedule_dev()
+ * that will run the function synchronously then.
+ */
+bool async_schedule_dev_nocall(async_func_t func, struct device *dev)
+{
+	struct async_entry *entry;
+
+	entry = kzalloc(sizeof(struct async_entry), GFP_KERNEL);
+
+	/* Give up if there is no memory or too much work. */
+	if (!entry || atomic_read(&entry_count) > MAX_WORK) {
+		kfree(entry);
+		return false;
+	}
+
+	__async_schedule_node_domain(func, dev, dev_to_node(dev),
+				     &async_dfl_domain, entry);
+	return true;
+}
+
 /**
  * async_synchronize_full - synchronize all asynchronous function calls
  *
-- 
cgit v1.2.3


From 3b82024c5ba93e7a0db2d0b9635ca6b28338efd7 Mon Sep 17 00:00:00 2001
From: Viresh Kumar <viresh.kumar@linaro.org>
Date: Thu, 28 Dec 2023 13:04:41 +0530
Subject: OPP: Move dev_pm_opp_icc_bw to internal opp.h

It isn't used by any driver or API, privatize it.

Signed-off-by: Viresh Kumar <viresh.kumar@linaro.org>
---
 drivers/opp/opp.h      | 12 ++++++++++++
 include/linux/pm_opp.h | 12 ------------
 2 files changed, 12 insertions(+), 12 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/opp/opp.h b/drivers/opp/opp.h
index 23dcb2fbf8c3..558c9ac6a6fa 100644
--- a/drivers/opp/opp.h
+++ b/drivers/opp/opp.h
@@ -50,6 +50,18 @@ struct opp_config_data {
 	unsigned int flags;
 };
 
+/**
+ * struct dev_pm_opp_icc_bw - Interconnect bandwidth values
+ * @avg:	Average bandwidth corresponding to this OPP (in icc units)
+ * @peak:	Peak bandwidth corresponding to this OPP (in icc units)
+ *
+ * This structure stores the bandwidth values for a single interconnect path.
+ */
+struct dev_pm_opp_icc_bw {
+	u32 avg;
+	u32 peak;
+};
+
 /*
  * Internal data structure organization with the OPP layer library is as
  * follows:
diff --git a/include/linux/pm_opp.h b/include/linux/pm_opp.h
index 74768c47d790..76dcb7f37bcd 100644
--- a/include/linux/pm_opp.h
+++ b/include/linux/pm_opp.h
@@ -45,18 +45,6 @@ struct dev_pm_opp_supply {
 	unsigned long u_watt;
 };
 
-/**
- * struct dev_pm_opp_icc_bw - Interconnect bandwidth values
- * @avg:	Average bandwidth corresponding to this OPP (in icc units)
- * @peak:	Peak bandwidth corresponding to this OPP (in icc units)
- *
- * This structure stores the bandwidth values for a single interconnect path.
- */
-struct dev_pm_opp_icc_bw {
-	u32 avg;
-	u32 peak;
-};
-
 typedef int (*config_regulators_t)(struct device *dev,
 			struct dev_pm_opp *old_opp, struct dev_pm_opp *new_opp,
 			struct regulator **regulators, unsigned int count);
-- 
cgit v1.2.3


From a15f2d48c6f84ae0dd2000288592c79d5d1acd0e Mon Sep 17 00:00:00 2001
From: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Date: Tue, 19 Dec 2023 16:47:41 +0100
Subject: nubus: Make nubus_bus_type static and constant

Now that the driver core can properly handle constant struct bus_type,
move the nubus_bus_type variable to be a constant structure as well,
placing it into read-only memory which can not be modified at runtime.

It's also never used outside of drivers/nubus/bus.c so make it static
and don't export it as no one is using it.

Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Acked-by: Finn Thain <fthain@linux-m68k.org>
Reviewed-by: Geert Uytterhoeven <geert@linux-m68k.org>
Link: https://lore.kernel.org/r/2023121940-enlarged-editor-c9a8@gregkh
Signed-off-by: Geert Uytterhoeven <geert@linux-m68k.org>
---
 drivers/nubus/bus.c   | 3 +--
 include/linux/nubus.h | 2 --
 2 files changed, 1 insertion(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/nubus/bus.c b/drivers/nubus/bus.c
index 72921e4f35f6..12df4d88970c 100644
--- a/drivers/nubus/bus.c
+++ b/drivers/nubus/bus.c
@@ -32,12 +32,11 @@ static void nubus_device_remove(struct device *dev)
 		ndrv->remove(to_nubus_board(dev));
 }
 
-struct bus_type nubus_bus_type = {
+static const struct bus_type nubus_bus_type = {
 	.name		= "nubus",
 	.probe		= nubus_device_probe,
 	.remove		= nubus_device_remove,
 };
-EXPORT_SYMBOL(nubus_bus_type);
 
 int nubus_driver_register(struct nubus_driver *ndrv)
 {
diff --git a/include/linux/nubus.h b/include/linux/nubus.h
index bdcd85e622d8..4d103ac8f5c7 100644
--- a/include/linux/nubus.h
+++ b/include/linux/nubus.h
@@ -89,8 +89,6 @@ struct nubus_driver {
 	void (*remove)(struct nubus_board *board);
 };
 
-extern struct bus_type nubus_bus_type;
-
 /* Generic NuBus interface functions, modelled after the PCI interface */
 #ifdef CONFIG_PROC_FS
 extern bool nubus_populate_procfs;
-- 
cgit v1.2.3


From 2ad28ce9b98f8b22feaecc0966c706a8ef59cbf0 Mon Sep 17 00:00:00 2001
From: Max Gurtovoy <mgurtovoy@nvidia.com>
Date: Mon, 1 Jan 2024 12:35:27 +0200
Subject: nvme: remove unused definition

There is no users for NVMF_AUTH_HASH_LEN macro.

Reviewed-by: Israel Rukshin <israelr@nvidia.com>
Reviewed-by: Sagi Grimberg <sagi@grimberg.me>
Signed-off-by: Max Gurtovoy <mgurtovoy@nvidia.com>
Signed-off-by: Keith Busch <kbusch@kernel.org>
---
 include/linux/nvme.h | 1 -
 1 file changed, 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/nvme.h b/include/linux/nvme.h
index 44325c068b6a..462c21e0e417 100644
--- a/include/linux/nvme.h
+++ b/include/linux/nvme.h
@@ -20,7 +20,6 @@
 #define NVMF_TRSVCID_SIZE	32
 #define NVMF_TRADDR_SIZE	256
 #define NVMF_TSAS_SIZE		256
-#define NVMF_AUTH_HASH_LEN	64
 
 #define NVME_DISC_SUBSYS_NAME	"nqn.2014-08.org.nvmexpress.discovery"
 
-- 
cgit v1.2.3


From 7865dfb1eb941ddd25802a9e13b6ff5f3f4dc02f Mon Sep 17 00:00:00 2001
From: John Fastabend <john.fastabend@gmail.com>
Date: Thu, 21 Dec 2023 15:23:24 -0800
Subject: bpf: sockmap, added comments describing update proto rules

Add a comment describing that the psock update proto callbback can be
called multiple times and this must be safe.

Signed-off-by: John Fastabend <john.fastabend@gmail.com>
Signed-off-by: Martin KaFai Lau <martin.lau@kernel.org>
Reviewed-by: Jakub Sitnicki <jakub@cloudflare.com>
Link: https://lore.kernel.org/r/20231221232327.43678-3-john.fastabend@gmail.com
---
 include/linux/skmsg.h | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/skmsg.h b/include/linux/skmsg.h
index c953b8c0d2f4..888a4b217829 100644
--- a/include/linux/skmsg.h
+++ b/include/linux/skmsg.h
@@ -100,6 +100,11 @@ struct sk_psock {
 	void (*saved_close)(struct sock *sk, long timeout);
 	void (*saved_write_space)(struct sock *sk);
 	void (*saved_data_ready)(struct sock *sk);
+	/* psock_update_sk_prot may be called with restore=false many times
+	 * so the handler must be safe for this case. It will be called
+	 * exactly once with restore=true when the psock is being destroyed
+	 * and psock refcnt is zero, but before an RCU grace period.
+	 */
 	int  (*psock_update_sk_prot)(struct sock *sk, struct sk_psock *psock,
 				     bool restore);
 	struct proto			*sk_proto;
-- 
cgit v1.2.3


From d3d344a1ca69d8fb2413e29e6400f3ad58a05c06 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Tue, 2 Jan 2024 16:22:20 +0000
Subject: net-device: move xdp_prog to net_device_read_rx

xdp_prog is used in receive path, both from XDP enabled drivers
and from netif_elide_gro().

This patch also removes two 4-bytes holes.

Fixes: 43a71cd66b9c ("net-device: reorganize net_device fast path variables")
Signed-off-by: Eric Dumazet <edumazet@google.com>
Cc: Coco Li <lixiaoyan@google.com>
Cc: Simon Horman <horms@kernel.org>
Link: https://lore.kernel.org/r/20240102162220.750823-1-edumazet@google.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 Documentation/networking/net_cachelines/net_device.rst | 2 +-
 include/linux/netdevice.h                              | 2 +-
 net/core/dev.c                                         | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/networking/net_cachelines/net_device.rst b/Documentation/networking/net_cachelines/net_device.rst
index 2dd8d8f20da2..e75a53593bb9 100644
--- a/Documentation/networking/net_cachelines/net_device.rst
+++ b/Documentation/networking/net_cachelines/net_device.rst
@@ -96,7 +96,7 @@ unsigned_char*                      dev_addr
 struct_netdev_queue*                _rx                     read_mostly         -                   netdev_get_rx_queue(rx)
 unsigned_int                        num_rx_queues                                                   
 unsigned_int                        real_num_rx_queues      -                   read_mostly         get_rps_cpu
-struct_bpf_prog*                    xdp_prog                                                        
+struct_bpf_prog*                    xdp_prog                -                   read_mostly         netif_elide_gro()
 unsigned_long                       gro_flush_timeout       -                   read_mostly         napi_complete_done
 int                                 napi_defer_hard_irqs    -                   read_mostly         napi_complete_done
 unsigned_int                        gro_max_size            -                   read_mostly         skb_gro_receive
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index d59db9adcc96..e265aa1f2169 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -2150,6 +2150,7 @@ struct net_device {
 
 	/* RX read-mostly hotpath */
 	__cacheline_group_begin(net_device_read_rx);
+	struct bpf_prog __rcu	*xdp_prog;
 	struct list_head	ptype_specific;
 	int			ifindex;
 	unsigned int		real_num_rx_queues;
@@ -2325,7 +2326,6 @@ struct net_device {
 	const unsigned char	*dev_addr;
 
 	unsigned int		num_rx_queues;
-	struct bpf_prog __rcu	*xdp_prog;
 #define GRO_LEGACY_MAX_SIZE	65536u
 /* TCP minimal MSS is 8 (TCP_MIN_GSO_SIZE),
  * and shinfo->gso_segs is a 16bit field.
diff --git a/net/core/dev.c b/net/core/dev.c
index 31588a50b757..bc4ac49d4643 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -11670,7 +11670,7 @@ static void __init net_dev_struct_check(void)
 #ifdef CONFIG_NET_XGRESS
 	CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, tcx_ingress);
 #endif
-	CACHELINE_ASSERT_GROUP_SIZE(struct net_device, net_device_read_rx, 96);
+	CACHELINE_ASSERT_GROUP_SIZE(struct net_device, net_device_read_rx, 104);
 }
 
 /*
-- 
cgit v1.2.3


From 9fc8e802048ad150e8032c4f3dbf40112160cfe9 Mon Sep 17 00:00:00 2001
From: Yonghong Song <yonghong.song@linux.dev>
Date: Thu, 21 Dec 2023 19:17:39 -0800
Subject: bpf: Add objcg to bpf_mem_alloc

The objcg is a bpf_mem_alloc level property since all bpf_mem_cache's
are with the same objcg. This patch made such a property explicit.
The next patch will use this property to save and restore objcg
for percpu unit allocator.

Acked-by: Hou Tao <houtao1@huawei.com>
Signed-off-by: Yonghong Song <yonghong.song@linux.dev>
Link: https://lore.kernel.org/r/20231222031739.1288590-1-yonghong.song@linux.dev
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/bpf_mem_alloc.h |  1 +
 kernel/bpf/memalloc.c         | 11 ++++++-----
 2 files changed, 7 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/bpf_mem_alloc.h b/include/linux/bpf_mem_alloc.h
index bb1223b21308..acef8c808599 100644
--- a/include/linux/bpf_mem_alloc.h
+++ b/include/linux/bpf_mem_alloc.h
@@ -11,6 +11,7 @@ struct bpf_mem_caches;
 struct bpf_mem_alloc {
 	struct bpf_mem_caches __percpu *caches;
 	struct bpf_mem_cache __percpu *cache;
+	struct obj_cgroup *objcg;
 	bool percpu;
 	struct work_struct work;
 };
diff --git a/kernel/bpf/memalloc.c b/kernel/bpf/memalloc.c
index 288ec4a967d0..4a21050f0359 100644
--- a/kernel/bpf/memalloc.c
+++ b/kernel/bpf/memalloc.c
@@ -523,6 +523,7 @@ int bpf_mem_alloc_init(struct bpf_mem_alloc *ma, int size, bool percpu)
 		if (memcg_bpf_enabled())
 			objcg = get_obj_cgroup_from_current();
 #endif
+		ma->objcg = objcg;
 		for_each_possible_cpu(cpu) {
 			c = per_cpu_ptr(pc, cpu);
 			c->unit_size = unit_size;
@@ -542,6 +543,7 @@ int bpf_mem_alloc_init(struct bpf_mem_alloc *ma, int size, bool percpu)
 #ifdef CONFIG_MEMCG_KMEM
 	objcg = get_obj_cgroup_from_current();
 #endif
+	ma->objcg = objcg;
 	for_each_possible_cpu(cpu) {
 		cc = per_cpu_ptr(pcc, cpu);
 		for (i = 0; i < NUM_CACHES; i++) {
@@ -691,9 +693,8 @@ void bpf_mem_alloc_destroy(struct bpf_mem_alloc *ma)
 			rcu_in_progress += atomic_read(&c->call_rcu_ttrace_in_progress);
 			rcu_in_progress += atomic_read(&c->call_rcu_in_progress);
 		}
-		/* objcg is the same across cpus */
-		if (c->objcg)
-			obj_cgroup_put(c->objcg);
+		if (ma->objcg)
+			obj_cgroup_put(ma->objcg);
 		destroy_mem_alloc(ma, rcu_in_progress);
 	}
 	if (ma->caches) {
@@ -709,8 +710,8 @@ void bpf_mem_alloc_destroy(struct bpf_mem_alloc *ma)
 				rcu_in_progress += atomic_read(&c->call_rcu_in_progress);
 			}
 		}
-		if (c->objcg)
-			obj_cgroup_put(c->objcg);
+		if (ma->objcg)
+			obj_cgroup_put(ma->objcg);
 		destroy_mem_alloc(ma, rcu_in_progress);
 	}
 }
-- 
cgit v1.2.3


From c39aa3b289e9c10d0d246cd919b06809f13b72b8 Mon Sep 17 00:00:00 2001
From: Yonghong Song <yonghong.song@linux.dev>
Date: Thu, 21 Dec 2023 19:17:45 -0800
Subject: bpf: Allow per unit prefill for non-fix-size percpu memory allocator

Commit 41a5db8d8161 ("Add support for non-fix-size percpu mem allocation")
added support for non-fix-size percpu memory allocation.
Such allocation will allocate percpu memory for all buckets on all
cpus and the memory consumption is in the order to quadratic.
For example, let us say, 4 cpus, unit size 16 bytes, so each
cpu has 16 * 4 = 64 bytes, with 4 cpus, total will be 64 * 4 = 256 bytes.
Then let us say, 8 cpus with the same unit size, each cpu
has 16 * 8 = 128 bytes, with 8 cpus, total will be 128 * 8 = 1024 bytes.
So if the number of cpus doubles, the number of memory consumption
will be 4 times. So for a system with large number of cpus, the
memory consumption goes up quickly with quadratic order.
For example, for 4KB percpu allocation, 128 cpus. The total memory
consumption will 4KB * 128 * 128 = 64MB. Things will become
worse if the number of cpus is bigger (e.g., 512, 1024, etc.)

In Commit 41a5db8d8161, the non-fix-size percpu memory allocation is
done in boot time, so for system with large number of cpus, the initial
percpu memory consumption is very visible. For example, for 128 cpu
system, the total percpu memory allocation will be at least
(16 + 32 + 64 + 96 + 128 + 196 + 256 + 512 + 1024 + 2048 + 4096)
  * 128 * 128 = ~138MB.
which is pretty big. It will be even bigger for larger number of cpus.

Note that the current prefill also allocates 4 entries if the unit size
is less than 256. So on top of 138MB memory consumption, this will
add more consumption with
3 * (16 + 32 + 64 + 96 + 128 + 196 + 256) * 128 * 128 = ~38MB.
Next patch will try to reduce this memory consumption.

Later on, Commit 1fda5bb66ad8 ("bpf: Do not allocate percpu memory
at init stage") moved the non-fix-size percpu memory allocation
to bpf verificaiton stage. Once a particular bpf_percpu_obj_new()
is called by bpf program, the memory allocator will try to fill in
the cache with all sizes, causing the same amount of percpu memory
consumption as in the boot stage.

To reduce the initial percpu memory consumption for non-fix-size
percpu memory allocation, instead of filling the cache with all
supported allocation sizes, this patch intends to fill the cache
only for the requested size. As typically users will not use large
percpu data structure, this can save memory significantly.
For example, the allocation size is 64 bytes with 128 cpus.
Then total percpu memory amount will be 64 * 128 * 128 = 1MB,
much less than previous 138MB.

Signed-off-by: Yonghong Song <yonghong.song@linux.dev>
Acked-by: Hou Tao <houtao1@huawei.com>
Link: https://lore.kernel.org/r/20231222031745.1289082-1-yonghong.song@linux.dev
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/bpf_mem_alloc.h |  7 ++++++
 kernel/bpf/memalloc.c         | 57 ++++++++++++++++++++++++++++++++++++++++++-
 kernel/bpf/verifier.c         | 37 +++++++++++++++++-----------
 3 files changed, 86 insertions(+), 15 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/bpf_mem_alloc.h b/include/linux/bpf_mem_alloc.h
index acef8c808599..aaf004d94322 100644
--- a/include/linux/bpf_mem_alloc.h
+++ b/include/linux/bpf_mem_alloc.h
@@ -22,8 +22,15 @@ struct bpf_mem_alloc {
  * 'size = 0' is for bpf_mem_alloc which manages many fixed-size objects.
  * Alloc and free are done with bpf_mem_{alloc,free}() and the size of
  * the returned object is given by the size argument of bpf_mem_alloc().
+ * If percpu equals true, error will be returned in order to avoid
+ * large memory consumption and the below bpf_mem_alloc_percpu_unit_init()
+ * should be used to do on-demand per-cpu allocation for each size.
  */
 int bpf_mem_alloc_init(struct bpf_mem_alloc *ma, int size, bool percpu);
+/* Initialize a non-fix-size percpu memory allocator */
+int bpf_mem_alloc_percpu_init(struct bpf_mem_alloc *ma, struct obj_cgroup *objcg);
+/* The percpu allocation with a specific unit size. */
+int bpf_mem_alloc_percpu_unit_init(struct bpf_mem_alloc *ma, int size);
 void bpf_mem_alloc_destroy(struct bpf_mem_alloc *ma);
 
 /* kmalloc/kfree equivalent: */
diff --git a/kernel/bpf/memalloc.c b/kernel/bpf/memalloc.c
index 4a21050f0359..f71da07eb8a0 100644
--- a/kernel/bpf/memalloc.c
+++ b/kernel/bpf/memalloc.c
@@ -121,6 +121,8 @@ struct bpf_mem_caches {
 	struct bpf_mem_cache cache[NUM_CACHES];
 };
 
+static const u16 sizes[NUM_CACHES] = {96, 192, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096};
+
 static struct llist_node notrace *__llist_del_first(struct llist_head *head)
 {
 	struct llist_node *entry, *next;
@@ -499,12 +501,14 @@ static void prefill_mem_cache(struct bpf_mem_cache *c, int cpu)
  */
 int bpf_mem_alloc_init(struct bpf_mem_alloc *ma, int size, bool percpu)
 {
-	static u16 sizes[NUM_CACHES] = {96, 192, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096};
 	struct bpf_mem_caches *cc, __percpu *pcc;
 	struct bpf_mem_cache *c, __percpu *pc;
 	struct obj_cgroup *objcg = NULL;
 	int cpu, i, unit_size, percpu_size = 0;
 
+	if (percpu && size == 0)
+		return -EINVAL;
+
 	/* room for llist_node and per-cpu pointer */
 	if (percpu)
 		percpu_size = LLIST_NODE_SZ + sizeof(void *);
@@ -524,6 +528,7 @@ int bpf_mem_alloc_init(struct bpf_mem_alloc *ma, int size, bool percpu)
 			objcg = get_obj_cgroup_from_current();
 #endif
 		ma->objcg = objcg;
+
 		for_each_possible_cpu(cpu) {
 			c = per_cpu_ptr(pc, cpu);
 			c->unit_size = unit_size;
@@ -562,6 +567,56 @@ int bpf_mem_alloc_init(struct bpf_mem_alloc *ma, int size, bool percpu)
 	return 0;
 }
 
+int bpf_mem_alloc_percpu_init(struct bpf_mem_alloc *ma, struct obj_cgroup *objcg)
+{
+	struct bpf_mem_caches __percpu *pcc;
+
+	pcc = __alloc_percpu_gfp(sizeof(struct bpf_mem_caches), 8, GFP_KERNEL);
+	if (!pcc)
+		return -ENOMEM;
+
+	ma->caches = pcc;
+	ma->objcg = objcg;
+	ma->percpu = true;
+	return 0;
+}
+
+int bpf_mem_alloc_percpu_unit_init(struct bpf_mem_alloc *ma, int size)
+{
+	struct bpf_mem_caches *cc, __percpu *pcc;
+	int cpu, i, unit_size, percpu_size;
+	struct obj_cgroup *objcg;
+	struct bpf_mem_cache *c;
+
+	i = bpf_mem_cache_idx(size);
+	if (i < 0)
+		return -EINVAL;
+
+	/* room for llist_node and per-cpu pointer */
+	percpu_size = LLIST_NODE_SZ + sizeof(void *);
+
+	unit_size = sizes[i];
+	objcg = ma->objcg;
+	pcc = ma->caches;
+
+	for_each_possible_cpu(cpu) {
+		cc = per_cpu_ptr(pcc, cpu);
+		c = &cc->cache[i];
+		if (cpu == 0 && c->unit_size)
+			break;
+
+		c->unit_size = unit_size;
+		c->objcg = objcg;
+		c->percpu_size = percpu_size;
+		c->tgt = c;
+
+		init_refill_work(c);
+		prefill_mem_cache(c, cpu);
+	}
+
+	return 0;
+}
+
 static void drain_mem_cache(struct bpf_mem_cache *c)
 {
 	bool percpu = !!c->percpu_size;
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index d4e31f61de0e..e9699a2cfe4f 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -12139,20 +12139,6 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
 				if (meta.func_id == special_kfunc_list[KF_bpf_obj_new_impl] && !bpf_global_ma_set)
 					return -ENOMEM;
 
-				if (meta.func_id == special_kfunc_list[KF_bpf_percpu_obj_new_impl]) {
-					if (!bpf_global_percpu_ma_set) {
-						mutex_lock(&bpf_percpu_ma_lock);
-						if (!bpf_global_percpu_ma_set) {
-							err = bpf_mem_alloc_init(&bpf_global_percpu_ma, 0, true);
-							if (!err)
-								bpf_global_percpu_ma_set = true;
-						}
-						mutex_unlock(&bpf_percpu_ma_lock);
-						if (err)
-							return err;
-					}
-				}
-
 				if (((u64)(u32)meta.arg_constant.value) != meta.arg_constant.value) {
 					verbose(env, "local type ID argument must be in range [0, U32_MAX]\n");
 					return -EINVAL;
@@ -12173,6 +12159,29 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
 					return -EINVAL;
 				}
 
+				if (meta.func_id == special_kfunc_list[KF_bpf_percpu_obj_new_impl]) {
+					if (!bpf_global_percpu_ma_set) {
+						mutex_lock(&bpf_percpu_ma_lock);
+						if (!bpf_global_percpu_ma_set) {
+							/* Charge memory allocated with bpf_global_percpu_ma to
+							 * root memcg. The obj_cgroup for root memcg is NULL.
+							 */
+							err = bpf_mem_alloc_percpu_init(&bpf_global_percpu_ma, NULL);
+							if (!err)
+								bpf_global_percpu_ma_set = true;
+						}
+						mutex_unlock(&bpf_percpu_ma_lock);
+						if (err)
+							return err;
+					}
+
+					mutex_lock(&bpf_percpu_ma_lock);
+					err = bpf_mem_alloc_percpu_unit_init(&bpf_global_percpu_ma, ret_t->size);
+					mutex_unlock(&bpf_percpu_ma_lock);
+					if (err)
+						return err;
+				}
+
 				struct_meta = btf_find_struct_meta(ret_btf, ret_btf_id);
 				if (meta.func_id == special_kfunc_list[KF_bpf_percpu_obj_new_impl]) {
 					if (!__btf_type_is_scalar_struct(env, ret_btf, ret_t, 0)) {
-- 
cgit v1.2.3


From 5e5401d6612ef599ad45785b941eebda7effc90f Mon Sep 17 00:00:00 2001
From: "Russell King (Oracle)" <rmk+kernel@armlinux.org.uk>
Date: Thu, 4 Jan 2024 09:47:36 +0000
Subject: net: phylink: move phylink_pcs_neg_mode() into phylink.c

Move phylink_pcs_neg_mode() from the header file into the .c file since
nothing should be using it.

Signed-off-by: Russell King (Oracle) <rmk+kernel@armlinux.org.uk>
Reviewed-by: Andrew Lunn <andrew@lunn.ch>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/phy/phylink.c | 66 +++++++++++++++++++++++++++++++++++++++++++++++
 include/linux/phylink.h   | 66 -----------------------------------------------
 2 files changed, 66 insertions(+), 66 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/phy/phylink.c b/drivers/net/phy/phylink.c
index 3d25a4a6212b..a816391add12 100644
--- a/drivers/net/phy/phylink.c
+++ b/drivers/net/phy/phylink.c
@@ -1074,6 +1074,72 @@ static void phylink_pcs_an_restart(struct phylink *pl)
 		pl->pcs->ops->pcs_an_restart(pl->pcs);
 }
 
+/**
+ * phylink_pcs_neg_mode() - helper to determine PCS inband mode
+ * @mode: one of %MLO_AN_FIXED, %MLO_AN_PHY, %MLO_AN_INBAND.
+ * @interface: interface mode to be used
+ * @advertising: adertisement ethtool link mode mask
+ *
+ * Determines the negotiation mode to be used by the PCS, and returns
+ * one of:
+ *
+ * - %PHYLINK_PCS_NEG_NONE: interface mode does not support inband
+ * - %PHYLINK_PCS_NEG_OUTBAND: an out of band mode (e.g. reading the PHY)
+ *   will be used.
+ * - %PHYLINK_PCS_NEG_INBAND_DISABLED: inband mode selected but autoneg
+ *   disabled
+ * - %PHYLINK_PCS_NEG_INBAND_ENABLED: inband mode selected and autoneg enabled
+ *
+ * Note: this is for cases where the PCS itself is involved in negotiation
+ * (e.g. Clause 37, SGMII and similar) not Clause 73.
+ */
+static unsigned int phylink_pcs_neg_mode(unsigned int mode,
+					 phy_interface_t interface,
+					 const unsigned long *advertising)
+{
+	unsigned int neg_mode;
+
+	switch (interface) {
+	case PHY_INTERFACE_MODE_SGMII:
+	case PHY_INTERFACE_MODE_QSGMII:
+	case PHY_INTERFACE_MODE_QUSGMII:
+	case PHY_INTERFACE_MODE_USXGMII:
+		/* These protocols are designed for use with a PHY which
+		 * communicates its negotiation result back to the MAC via
+		 * inband communication. Note: there exist PHYs that run
+		 * with SGMII but do not send the inband data.
+		 */
+		if (!phylink_autoneg_inband(mode))
+			neg_mode = PHYLINK_PCS_NEG_OUTBAND;
+		else
+			neg_mode = PHYLINK_PCS_NEG_INBAND_ENABLED;
+		break;
+
+	case PHY_INTERFACE_MODE_1000BASEX:
+	case PHY_INTERFACE_MODE_2500BASEX:
+		/* 1000base-X is designed for use media-side for Fibre
+		 * connections, and thus the Autoneg bit needs to be
+		 * taken into account. We also do this for 2500base-X
+		 * as well, but drivers may not support this, so may
+		 * need to override this.
+		 */
+		if (!phylink_autoneg_inband(mode))
+			neg_mode = PHYLINK_PCS_NEG_OUTBAND;
+		else if (linkmode_test_bit(ETHTOOL_LINK_MODE_Autoneg_BIT,
+					   advertising))
+			neg_mode = PHYLINK_PCS_NEG_INBAND_ENABLED;
+		else
+			neg_mode = PHYLINK_PCS_NEG_INBAND_DISABLED;
+		break;
+
+	default:
+		neg_mode = PHYLINK_PCS_NEG_NONE;
+		break;
+	}
+
+	return neg_mode;
+}
+
 static void phylink_major_config(struct phylink *pl, bool restart,
 				  const struct phylink_link_state *state)
 {
diff --git a/include/linux/phylink.h b/include/linux/phylink.h
index 875439ab45de..d589f89c612c 100644
--- a/include/linux/phylink.h
+++ b/include/linux/phylink.h
@@ -98,72 +98,6 @@ static inline bool phylink_autoneg_inband(unsigned int mode)
 	return mode == MLO_AN_INBAND;
 }
 
-/**
- * phylink_pcs_neg_mode() - helper to determine PCS inband mode
- * @mode: one of %MLO_AN_FIXED, %MLO_AN_PHY, %MLO_AN_INBAND.
- * @interface: interface mode to be used
- * @advertising: adertisement ethtool link mode mask
- *
- * Determines the negotiation mode to be used by the PCS, and returns
- * one of:
- *
- * - %PHYLINK_PCS_NEG_NONE: interface mode does not support inband
- * - %PHYLINK_PCS_NEG_OUTBAND: an out of band mode (e.g. reading the PHY)
- *   will be used.
- * - %PHYLINK_PCS_NEG_INBAND_DISABLED: inband mode selected but autoneg
- *   disabled
- * - %PHYLINK_PCS_NEG_INBAND_ENABLED: inband mode selected and autoneg enabled
- *
- * Note: this is for cases where the PCS itself is involved in negotiation
- * (e.g. Clause 37, SGMII and similar) not Clause 73.
- */
-static inline unsigned int phylink_pcs_neg_mode(unsigned int mode,
-						phy_interface_t interface,
-						const unsigned long *advertising)
-{
-	unsigned int neg_mode;
-
-	switch (interface) {
-	case PHY_INTERFACE_MODE_SGMII:
-	case PHY_INTERFACE_MODE_QSGMII:
-	case PHY_INTERFACE_MODE_QUSGMII:
-	case PHY_INTERFACE_MODE_USXGMII:
-		/* These protocols are designed for use with a PHY which
-		 * communicates its negotiation result back to the MAC via
-		 * inband communication. Note: there exist PHYs that run
-		 * with SGMII but do not send the inband data.
-		 */
-		if (!phylink_autoneg_inband(mode))
-			neg_mode = PHYLINK_PCS_NEG_OUTBAND;
-		else
-			neg_mode = PHYLINK_PCS_NEG_INBAND_ENABLED;
-		break;
-
-	case PHY_INTERFACE_MODE_1000BASEX:
-	case PHY_INTERFACE_MODE_2500BASEX:
-		/* 1000base-X is designed for use media-side for Fibre
-		 * connections, and thus the Autoneg bit needs to be
-		 * taken into account. We also do this for 2500base-X
-		 * as well, but drivers may not support this, so may
-		 * need to override this.
-		 */
-		if (!phylink_autoneg_inband(mode))
-			neg_mode = PHYLINK_PCS_NEG_OUTBAND;
-		else if (linkmode_test_bit(ETHTOOL_LINK_MODE_Autoneg_BIT,
-					   advertising))
-			neg_mode = PHYLINK_PCS_NEG_INBAND_ENABLED;
-		else
-			neg_mode = PHYLINK_PCS_NEG_INBAND_DISABLED;
-		break;
-
-	default:
-		neg_mode = PHYLINK_PCS_NEG_NONE;
-		break;
-	}
-
-	return neg_mode;
-}
-
 /**
  * struct phylink_link_state - link state structure
  * @advertising: ethtool bitmask containing advertised link modes
-- 
cgit v1.2.3


From 040a82be54c09a72162a3db2f5cd2ba289c0f224 Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Fri, 13 Oct 2023 12:26:31 +0100
Subject: netfs: Rearrange netfs_io_subrequest to put request pointer first

Rearrange the netfs_io_subrequest struct to put the netfs_io_request
pointer (rreq) first.  This then allows netfs_io_subrequest to be put in a
union with a pointer to a wrapper around netfs_io_request.  This will be
useful in the future for cifs and maybe ceph.

Signed-off-by: David Howells <dhowells@redhat.com>
cc: Steve French <sfrench@samba.org>
cc: Shyam Prasad N <nspmangalore@gmail.com>
cc: Rohith Surabattula <rohiths.msft@gmail.com>
cc: Jeff Layton <jlayton@kernel.org>
cc: linux-cifs@vger.kernel.org
cc: linux-cachefs@redhat.com
cc: linux-fsdevel@vger.kernel.org
cc: linux-mm@kvack.org
---
 include/linux/netfs.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/netfs.h b/include/linux/netfs.h
index 852956aa3c4b..d3bac60fcd6f 100644
--- a/include/linux/netfs.h
+++ b/include/linux/netfs.h
@@ -204,8 +204,8 @@ struct netfs_cache_resources {
  * the pages it points to can be relied on to exist for the duration.
  */
 struct netfs_io_subrequest {
-	struct work_struct	work;
 	struct netfs_io_request *rreq;		/* Supervising I/O request */
+	struct work_struct	work;
 	struct list_head	rreq_link;	/* Link in rreq->subrequests */
 	struct iov_iter		io_iter;	/* Iterator for this subrequest */
 	loff_t			start;		/* Where to start the I/O */
-- 
cgit v1.2.3


From f36be9ce8146faabdbbf74ee0499edb2039c53a5 Mon Sep 17 00:00:00 2001
From: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Date: Tue, 19 Dec 2023 14:13:10 +0100
Subject: EDAC: constantify the struct bus_type usage

In many places in the edac code, struct bus_type pointers are passed
around and then eventually sent to the driver core, which can handle a
constant pointer.  So constantify all of the edac usage of these as well
because the data in them is never modified by the edac code either.

Cc: Borislav Petkov <bp@alien8.de>
Cc: Tony Luck <tony.luck@intel.com>
Cc: James Morse <james.morse@arm.com>
Cc: Mauro Carvalho Chehab <mchehab@kernel.org>
Cc: Robert Richter <rric@kernel.org>
Cc:  <linux-edac@vger.kernel.org>
Link: https://lore.kernel.org/r/2023121909-tribute-punctuate-4b22@gregkh
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/edac/edac_device.h       | 2 +-
 drivers/edac/edac_device_sysfs.c | 2 +-
 drivers/edac/edac_module.c       | 4 ++--
 drivers/edac/edac_pci_sysfs.c    | 2 +-
 include/linux/edac.h             | 4 ++--
 5 files changed, 7 insertions(+), 7 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/edac/edac_device.h b/drivers/edac/edac_device.h
index 3f44e6b9d387..7db22a4c83ef 100644
--- a/drivers/edac/edac_device.h
+++ b/drivers/edac/edac_device.h
@@ -176,7 +176,7 @@ struct edac_device_ctl_info {
 	struct edac_dev_sysfs_attribute *sysfs_attributes;
 
 	/* pointer to main 'edac' subsys in sysfs */
-	struct bus_type *edac_subsys;
+	const struct bus_type *edac_subsys;
 
 	/* the internal state of this controller instance */
 	int op_state;
diff --git a/drivers/edac/edac_device_sysfs.c b/drivers/edac/edac_device_sysfs.c
index 010c26be5846..237a542e045a 100644
--- a/drivers/edac/edac_device_sysfs.c
+++ b/drivers/edac/edac_device_sysfs.c
@@ -229,7 +229,7 @@ static struct kobj_type ktype_device_ctrl = {
 int edac_device_register_sysfs_main_kobj(struct edac_device_ctl_info *edac_dev)
 {
 	struct device *dev_root;
-	struct bus_type *edac_subsys;
+	const struct bus_type *edac_subsys;
 	int err = -ENODEV;
 
 	edac_dbg(1, "\n");
diff --git a/drivers/edac/edac_module.c b/drivers/edac/edac_module.c
index 32a931d0cb71..1c9f62382666 100644
--- a/drivers/edac/edac_module.c
+++ b/drivers/edac/edac_module.c
@@ -67,7 +67,7 @@ char *edac_op_state_to_string(int opstate)
  * sysfs object: /sys/devices/system/edac
  *	need to export to other files
  */
-static struct bus_type edac_subsys = {
+static const struct bus_type edac_subsys = {
 	.name = "edac",
 	.dev_name = "edac",
 };
@@ -90,7 +90,7 @@ static void edac_subsys_exit(void)
 }
 
 /* return pointer to the 'edac' node in sysfs */
-struct bus_type *edac_get_sysfs_subsys(void)
+const struct bus_type *edac_get_sysfs_subsys(void)
 {
 	return &edac_subsys;
 }
diff --git a/drivers/edac/edac_pci_sysfs.c b/drivers/edac/edac_pci_sysfs.c
index 287cc51dbc86..e823e4da086a 100644
--- a/drivers/edac/edac_pci_sysfs.c
+++ b/drivers/edac/edac_pci_sysfs.c
@@ -338,7 +338,7 @@ static struct kobj_type ktype_edac_pci_main_kobj = {
 static int edac_pci_main_kobj_setup(void)
 {
 	int err = -ENODEV;
-	struct bus_type *edac_subsys;
+	const struct bus_type *edac_subsys;
 	struct device *dev_root;
 
 	edac_dbg(0, "\n");
diff --git a/include/linux/edac.h b/include/linux/edac.h
index fa4bda2a70f6..ccaf2ae0801d 100644
--- a/include/linux/edac.h
+++ b/include/linux/edac.h
@@ -30,7 +30,7 @@ struct device;
 
 extern int edac_op_state;
 
-struct bus_type *edac_get_sysfs_subsys(void);
+const struct bus_type *edac_get_sysfs_subsys(void);
 
 static inline void opstate_init(void)
 {
@@ -492,7 +492,7 @@ struct edac_raw_error_desc {
  */
 struct mem_ctl_info {
 	struct device			dev;
-	struct bus_type			*bus;
+	const struct bus_type		*bus;
 
 	struct list_head link;	/* for global list of mem_ctl_info structs */
 
-- 
cgit v1.2.3


From db2292b01b799e926abfdbd6fafa1f27f0d0e457 Mon Sep 17 00:00:00 2001
From: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Date: Tue, 19 Dec 2023 15:07:23 +0100
Subject: PM: clk: make pm_clk_add_notifier() take a const pointer

The driver core wants to work with const struct bus_type, so there's no
reason that pm_clk_add_notifier() should not also do the same thing,
considering that it just passes the pointer off to the driver core which
is expecting a const *.

Cc: Rafael J. Wysocki <rafael@kernel.org>
Link: https://lore.kernel.org/r/2023121922-triumph-exploit-f545@gregkh
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/base/power/clock_ops.c | 2 +-
 include/linux/pm_clock.h       | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/base/power/clock_ops.c b/drivers/base/power/clock_ops.c
index 4110c19c08dc..e18ba676cdf6 100644
--- a/drivers/base/power/clock_ops.c
+++ b/drivers/base/power/clock_ops.c
@@ -793,7 +793,7 @@ static int pm_clk_notify(struct notifier_block *nb,
  * the remaining members of @clknb should be populated prior to calling this
  * routine.
  */
-void pm_clk_add_notifier(struct bus_type *bus,
+void pm_clk_add_notifier(const struct bus_type *bus,
 				 struct pm_clk_notifier_block *clknb)
 {
 	if (!bus || !clknb)
diff --git a/include/linux/pm_clock.h b/include/linux/pm_clock.h
index ada3a0ab10bf..68669ce18720 100644
--- a/include/linux/pm_clock.h
+++ b/include/linux/pm_clock.h
@@ -91,10 +91,10 @@ static inline int devm_pm_clk_create(struct device *dev)
 #endif
 
 #ifdef CONFIG_HAVE_CLK
-extern void pm_clk_add_notifier(struct bus_type *bus,
+extern void pm_clk_add_notifier(const struct bus_type *bus,
 					struct pm_clk_notifier_block *clknb);
 #else
-static inline void pm_clk_add_notifier(struct bus_type *bus,
+static inline void pm_clk_add_notifier(const struct bus_type *bus,
 					struct pm_clk_notifier_block *clknb)
 {
 }
-- 
cgit v1.2.3


From e76933a9bfa9b7f28a387f2e13cb3e689adc200d Mon Sep 17 00:00:00 2001
From: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Date: Tue, 19 Dec 2023 15:06:19 +0100
Subject: maple: make maple_bus_type static and const

There is no need to export maple_bus_type as no one uses it outside of
maple.c, so make it static, AND make it const as it can be read-only as
no one modifies it.

Cc: Yoshinori Sato <ysato@users.sourceforge.jp>
Cc: Rich Felker <dalias@libc.org>
Cc: John Paul Adrian Glaubitz <glaubitz@physik.fu-berlin.de>
Cc: <linux-sh@vger.kernel.org>
Link: https://lore.kernel.org/r/2023121918-rejoicing-frostlike-d976@gregkh
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/sh/maple/maple.c | 4 ++--
 include/linux/maple.h    | 1 -
 2 files changed, 2 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/sh/maple/maple.c b/drivers/sh/maple/maple.c
index e05473c5c267..16018009a5a6 100644
--- a/drivers/sh/maple/maple.c
+++ b/drivers/sh/maple/maple.c
@@ -59,6 +59,7 @@ struct maple_device_specify {
 static bool checked[MAPLE_PORTS];
 static bool empty[MAPLE_PORTS];
 static struct maple_device *baseunits[MAPLE_PORTS];
+static const struct bus_type maple_bus_type;
 
 /**
  * maple_driver_register - register a maple driver
@@ -773,11 +774,10 @@ static struct maple_driver maple_unsupported_device = {
 /*
  * maple_bus_type - core maple bus structure
  */
-struct bus_type maple_bus_type = {
+static const struct bus_type maple_bus_type = {
 	.name = "maple",
 	.match = maple_match_bus_driver,
 };
-EXPORT_SYMBOL_GPL(maple_bus_type);
 
 static struct device maple_bus = {
 	.init_name = "maple",
diff --git a/include/linux/maple.h b/include/linux/maple.h
index 9b140272ee16..9aae44efcfd4 100644
--- a/include/linux/maple.h
+++ b/include/linux/maple.h
@@ -5,7 +5,6 @@
 #include <mach/maple.h>
 
 struct device;
-extern struct bus_type maple_bus_type;
 
 /* Maple Bus command and response codes */
 enum maple_code {
-- 
cgit v1.2.3


From 86438841e48f6361f0a6a04805b7d7813738761f Mon Sep 17 00:00:00 2001
From: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Date: Tue, 19 Dec 2023 14:41:42 +0100
Subject: dma-debug: make dma_debug_add_bus take a const pointer

The driver core now can handle a const struct bus_type pointer, and the
dma_debug_add_bus() call just passes on the pointer give to it to the
driver core, so make this pointer const as well to allow everyone to use
read-only struct bus_type pointers going forward.

Cc: Christoph Hellwig <hch@lst.de>
Cc: Marek Szyprowski <m.szyprowski@samsung.com>
Cc: Robin Murphy <robin.murphy@arm.com>
Cc:  <iommu@lists.linux.dev>
Reviewed-by: Robin Murphy <robin.murphy@arm.com>
Link: https://lore.kernel.org/r/2023121941-dejected-nugget-681e@gregkh
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 include/linux/dma-map-ops.h | 4 ++--
 kernel/dma/debug.c          | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/dma-map-ops.h b/include/linux/dma-map-ops.h
index f2fc203fb8a1..e401f824a007 100644
--- a/include/linux/dma-map-ops.h
+++ b/include/linux/dma-map-ops.h
@@ -443,10 +443,10 @@ static inline void arch_teardown_dma_ops(struct device *dev)
 #endif /* CONFIG_ARCH_HAS_TEARDOWN_DMA_OPS */
 
 #ifdef CONFIG_DMA_API_DEBUG
-void dma_debug_add_bus(struct bus_type *bus);
+void dma_debug_add_bus(const struct bus_type *bus);
 void debug_dma_dump_mappings(struct device *dev);
 #else
-static inline void dma_debug_add_bus(struct bus_type *bus)
+static inline void dma_debug_add_bus(const struct bus_type *bus)
 {
 }
 static inline void debug_dma_dump_mappings(struct device *dev)
diff --git a/kernel/dma/debug.c b/kernel/dma/debug.c
index 3de494375b7b..1a5c86dd87d5 100644
--- a/kernel/dma/debug.c
+++ b/kernel/dma/debug.c
@@ -876,7 +876,7 @@ static int dma_debug_device_change(struct notifier_block *nb, unsigned long acti
 	return 0;
 }
 
-void dma_debug_add_bus(struct bus_type *bus)
+void dma_debug_add_bus(const struct bus_type *bus)
 {
 	struct notifier_block *nb;
 
-- 
cgit v1.2.3


From b5b0774d53bb81bddbf8c609b3f183d4af6e91da Mon Sep 17 00:00:00 2001
From: Basavaraj Natikar <Basavaraj.Natikar@amd.com>
Date: Wed, 20 Dec 2023 12:30:41 +0530
Subject: HID: amd_sfh: Add a new interface for exporting HPD data

AMDSFH has information about the User presence information via the Human
Presence Detection (HPD) sensor which is part of the AMD sensor fusion hub.

Add a new interface to export this information, where other drivers like
PMF can use this information to enhance user experiences.

Link: https://lore.kernel.org/all/ad064333-48a4-4cfa-9428-69e8a7c44667@redhat.com/
Co-developed-by: Shyam Sundar S K <Shyam-sundar.S-k@amd.com>
Signed-off-by: Shyam Sundar S K <Shyam-sundar.S-k@amd.com>
Signed-off-by: Basavaraj Natikar <Basavaraj.Natikar@amd.com>
Signed-off-by: Jiri Kosina <jkosina@suse.com>
---
 drivers/hid/amd-sfh-hid/amd_sfh_common.h           |  5 +++
 drivers/hid/amd-sfh-hid/sfh1_1/amd_sfh_init.c      | 14 +++++++
 drivers/hid/amd-sfh-hid/sfh1_1/amd_sfh_interface.c | 37 +++++++++++++++++
 drivers/hid/amd-sfh-hid/sfh1_1/amd_sfh_interface.h |  1 +
 include/linux/amd-pmf-io.h                         | 46 ++++++++++++++++++++++
 5 files changed, 103 insertions(+)
 create mode 100644 include/linux/amd-pmf-io.h

(limited to 'include/linux')

diff --git a/drivers/hid/amd-sfh-hid/amd_sfh_common.h b/drivers/hid/amd-sfh-hid/amd_sfh_common.h
index 2643bb14fee2..cd57037bf217 100644
--- a/drivers/hid/amd-sfh-hid/amd_sfh_common.h
+++ b/drivers/hid/amd-sfh-hid/amd_sfh_common.h
@@ -37,6 +37,10 @@ struct amd_mp2_sensor_info {
 	dma_addr_t dma_address;
 };
 
+struct sfh_dev_status {
+	bool is_hpd_present;
+};
+
 struct amd_mp2_dev {
 	struct pci_dev *pdev;
 	struct amdtp_cl_data *cl_data;
@@ -47,6 +51,7 @@ struct amd_mp2_dev {
 	struct amd_input_data in_data;
 	/* mp2 active control status */
 	u32 mp2_acs;
+	struct sfh_dev_status dev_en;
 };
 
 struct amd_mp2_ops {
diff --git a/drivers/hid/amd-sfh-hid/sfh1_1/amd_sfh_init.c b/drivers/hid/amd-sfh-hid/sfh1_1/amd_sfh_init.c
index e9c6413af24a..0351b0fd394a 100644
--- a/drivers/hid/amd-sfh-hid/sfh1_1/amd_sfh_init.c
+++ b/drivers/hid/amd-sfh-hid/sfh1_1/amd_sfh_init.c
@@ -73,6 +73,12 @@ static int amd_sfh_hid_client_deinit(struct amd_mp2_dev *privdata)
 	int i, status;
 
 	for (i = 0; i < cl_data->num_hid_devices; i++) {
+		switch (cl_data->sensor_idx[i]) {
+		case HPD_IDX:
+			privdata->dev_en.is_hpd_present = false;
+			break;
+		}
+
 		if (cl_data->sensor_sts[i] == SENSOR_ENABLED) {
 			privdata->mp2_ops->stop(privdata, cl_data->sensor_idx[i]);
 			status = amd_sfh_wait_for_response
@@ -178,6 +184,11 @@ static int amd_sfh1_1_hid_client_init(struct amd_mp2_dev *privdata)
 			rc = amdtp_hid_probe(i, cl_data);
 			if (rc)
 				goto cleanup;
+			switch (cl_data->sensor_idx[i]) {
+			case HPD_IDX:
+				privdata->dev_en.is_hpd_present = true;
+				break;
+			}
 		}
 		dev_dbg(dev, "sid 0x%x (%s) status 0x%x\n",
 			cl_data->sensor_idx[i], get_sensor_name(cl_data->sensor_idx[i]),
@@ -259,6 +270,7 @@ static void amd_mp2_pci_remove(void *privdata)
 {
 	struct amd_mp2_dev *mp2 = privdata;
 
+	sfh_deinit_emp2();
 	amd_sfh_hid_client_deinit(privdata);
 	mp2->mp2_ops->stop_all(mp2);
 	pci_intx(mp2->pdev, false);
@@ -311,12 +323,14 @@ int amd_sfh1_1_init(struct amd_mp2_dev *mp2)
 
 	rc = amd_sfh_irq_init(mp2);
 	if (rc) {
+		sfh_deinit_emp2();
 		dev_err(dev, "amd_sfh_irq_init failed\n");
 		return rc;
 	}
 
 	rc = amd_sfh1_1_hid_client_init(mp2);
 	if (rc) {
+		sfh_deinit_emp2();
 		dev_err(dev, "amd_sfh1_1_hid_client_init failed\n");
 		return rc;
 	}
diff --git a/drivers/hid/amd-sfh-hid/sfh1_1/amd_sfh_interface.c b/drivers/hid/amd-sfh-hid/sfh1_1/amd_sfh_interface.c
index 4f81ef2d4f56..197b828fc6a0 100644
--- a/drivers/hid/amd-sfh-hid/sfh1_1/amd_sfh_interface.c
+++ b/drivers/hid/amd-sfh-hid/sfh1_1/amd_sfh_interface.c
@@ -7,11 +7,14 @@
  *
  * Author: Basavaraj Natikar <Basavaraj.Natikar@amd.com>
  */
+#include <linux/amd-pmf-io.h>
 #include <linux/io-64-nonatomic-lo-hi.h>
 #include <linux/iopoll.h>
 
 #include "amd_sfh_interface.h"
 
+static struct amd_mp2_dev *emp2;
+
 static int amd_sfh_wait_response(struct amd_mp2_dev *mp2, u8 sid, u32 cmd_id)
 {
 	struct sfh_cmd_response cmd_resp;
@@ -73,7 +76,41 @@ static struct amd_mp2_ops amd_sfh_ops = {
 	.response = amd_sfh_wait_response,
 };
 
+void sfh_deinit_emp2(void)
+{
+	emp2 = NULL;
+}
+
 void sfh_interface_init(struct amd_mp2_dev *mp2)
 {
 	mp2->mp2_ops = &amd_sfh_ops;
+	emp2 = mp2;
+}
+
+static int amd_sfh_hpd_info(u8 *user_present)
+{
+	struct hpd_status hpdstatus;
+
+	if (!user_present)
+		return -EINVAL;
+
+	if (!emp2 || !emp2->dev_en.is_hpd_present)
+		return -ENODEV;
+
+	hpdstatus.val = readl(emp2->mmio + AMD_C2P_MSG(4));
+	*user_present = hpdstatus.shpd.presence;
+
+	return 0;
+}
+
+int amd_get_sfh_info(struct amd_sfh_info *sfh_info, enum sfh_message_type op)
+{
+	if (sfh_info) {
+		switch (op) {
+		case MT_HPD:
+			return amd_sfh_hpd_info(&sfh_info->user_present);
+		}
+	}
+	return -EINVAL;
 }
+EXPORT_SYMBOL_GPL(amd_get_sfh_info);
diff --git a/drivers/hid/amd-sfh-hid/sfh1_1/amd_sfh_interface.h b/drivers/hid/amd-sfh-hid/sfh1_1/amd_sfh_interface.h
index 75267b0fec70..2c211d28764d 100644
--- a/drivers/hid/amd-sfh-hid/sfh1_1/amd_sfh_interface.h
+++ b/drivers/hid/amd-sfh-hid/sfh1_1/amd_sfh_interface.h
@@ -165,6 +165,7 @@ struct hpd_status {
 };
 
 void sfh_interface_init(struct amd_mp2_dev *mp2);
+void sfh_deinit_emp2(void);
 void amd_sfh1_1_set_desc_ops(struct amd_mp2_ops *mp2_ops);
 int amd_sfh_float_to_int(u32 flt32_val);
 #endif
diff --git a/include/linux/amd-pmf-io.h b/include/linux/amd-pmf-io.h
new file mode 100644
index 000000000000..5b6d29d36922
--- /dev/null
+++ b/include/linux/amd-pmf-io.h
@@ -0,0 +1,46 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * AMD Platform Management Framework Interface
+ *
+ * Copyright (c) 2023, Advanced Micro Devices, Inc.
+ * All Rights Reserved.
+ *
+ * Authors: Shyam Sundar S K <Shyam-sundar.S-k@amd.com>
+ *          Basavaraj Natikar <Basavaraj.Natikar@amd.com>
+ */
+
+#ifndef AMD_PMF_IO_H
+#define AMD_PMF_IO_H
+
+#include <linux/types.h>
+
+/**
+ * enum sfh_message_type - Query the SFH message type
+ * @MT_HPD: Message ID to know the Human presence info from MP2 FW
+ */
+enum sfh_message_type {
+	MT_HPD,
+};
+
+/**
+ * enum sfh_hpd_info - Query the Human presence information
+ * @SFH_NOT_DETECTED: Check the HPD connection information from MP2 FW
+ * @SFH_USER_PRESENT: Check if the user is present from HPD sensor
+ * @SFH_USER_AWAY: Check if the user is away from HPD sensor
+ */
+enum sfh_hpd_info {
+	SFH_NOT_DETECTED,
+	SFH_USER_PRESENT,
+	SFH_USER_AWAY,
+};
+
+/**
+ * struct amd_sfh_info - get HPD sensor info from MP2 FW
+ * @user_present: Populates the user presence information
+ */
+struct amd_sfh_info {
+	u8 user_present;
+};
+
+int amd_get_sfh_info(struct amd_sfh_info *sfh_info, enum sfh_message_type op);
+#endif
-- 
cgit v1.2.3


From 584f35a3647d42980af495fc0bc5c51eb174aa35 Mon Sep 17 00:00:00 2001
From: Basavaraj Natikar <Basavaraj.Natikar@amd.com>
Date: Wed, 20 Dec 2023 12:30:42 +0530
Subject: HID: amd_sfh: Add a new interface for exporting ALS data

AMDSFH has information about the Ambient light via the Ambient
Light Sensor (ALS) which is part of the AMD sensor fusion hub.

Add a new interface to export this information, where other drivers like
PMF can use this information to enhance user experiences.

Link: https://lore.kernel.org/all/ad064333-48a4-4cfa-9428-69e8a7c44667@redhat.com/
Reviewed-by: Mario Limonciello <mario.limonciello@amd.com>
Co-developed-by: Shyam Sundar S K <Shyam-sundar.S-k@amd.com>
Signed-off-by: Shyam Sundar S K <Shyam-sundar.S-k@amd.com>
Signed-off-by: Basavaraj Natikar <Basavaraj.Natikar@amd.com>
Signed-off-by: Jiri Kosina <jkosina@suse.com>
---
 drivers/hid/amd-sfh-hid/amd_sfh_common.h           |  1 +
 drivers/hid/amd-sfh-hid/sfh1_1/amd_sfh_init.c      |  6 ++++++
 drivers/hid/amd-sfh-hid/sfh1_1/amd_sfh_interface.c | 22 ++++++++++++++++++++++
 include/linux/amd-pmf-io.h                         |  4 ++++
 4 files changed, 33 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/hid/amd-sfh-hid/amd_sfh_common.h b/drivers/hid/amd-sfh-hid/amd_sfh_common.h
index cd57037bf217..a1950bc6e6ce 100644
--- a/drivers/hid/amd-sfh-hid/amd_sfh_common.h
+++ b/drivers/hid/amd-sfh-hid/amd_sfh_common.h
@@ -39,6 +39,7 @@ struct amd_mp2_sensor_info {
 
 struct sfh_dev_status {
 	bool is_hpd_present;
+	bool is_als_present;
 };
 
 struct amd_mp2_dev {
diff --git a/drivers/hid/amd-sfh-hid/sfh1_1/amd_sfh_init.c b/drivers/hid/amd-sfh-hid/sfh1_1/amd_sfh_init.c
index 0351b0fd394a..9dbe6f4cb294 100644
--- a/drivers/hid/amd-sfh-hid/sfh1_1/amd_sfh_init.c
+++ b/drivers/hid/amd-sfh-hid/sfh1_1/amd_sfh_init.c
@@ -77,6 +77,9 @@ static int amd_sfh_hid_client_deinit(struct amd_mp2_dev *privdata)
 		case HPD_IDX:
 			privdata->dev_en.is_hpd_present = false;
 			break;
+		case ALS_IDX:
+			privdata->dev_en.is_als_present = false;
+			break;
 		}
 
 		if (cl_data->sensor_sts[i] == SENSOR_ENABLED) {
@@ -188,6 +191,9 @@ static int amd_sfh1_1_hid_client_init(struct amd_mp2_dev *privdata)
 			case HPD_IDX:
 				privdata->dev_en.is_hpd_present = true;
 				break;
+			case ALS_IDX:
+				privdata->dev_en.is_als_present = true;
+				break;
 			}
 		}
 		dev_dbg(dev, "sid 0x%x (%s) status 0x%x\n",
diff --git a/drivers/hid/amd-sfh-hid/sfh1_1/amd_sfh_interface.c b/drivers/hid/amd-sfh-hid/sfh1_1/amd_sfh_interface.c
index 197b828fc6a0..ae36312bc236 100644
--- a/drivers/hid/amd-sfh-hid/sfh1_1/amd_sfh_interface.c
+++ b/drivers/hid/amd-sfh-hid/sfh1_1/amd_sfh_interface.c
@@ -103,12 +103,34 @@ static int amd_sfh_hpd_info(u8 *user_present)
 	return 0;
 }
 
+static int amd_sfh_als_info(u32 *ambient_light)
+{
+	struct sfh_als_data als_data;
+	void __iomem *sensoraddr;
+
+	if (!ambient_light)
+		return -EINVAL;
+
+	if (!emp2 || !emp2->dev_en.is_als_present)
+		return -ENODEV;
+
+	sensoraddr = emp2->vsbase +
+		(ALS_IDX * SENSOR_DATA_MEM_SIZE_DEFAULT) +
+		OFFSET_SENSOR_DATA_DEFAULT;
+	memcpy_fromio(&als_data, sensoraddr, sizeof(struct sfh_als_data));
+	*ambient_light = amd_sfh_float_to_int(als_data.lux);
+
+	return 0;
+}
+
 int amd_get_sfh_info(struct amd_sfh_info *sfh_info, enum sfh_message_type op)
 {
 	if (sfh_info) {
 		switch (op) {
 		case MT_HPD:
 			return amd_sfh_hpd_info(&sfh_info->user_present);
+		case MT_ALS:
+			return amd_sfh_als_info(&sfh_info->ambient_light);
 		}
 	}
 	return -EINVAL;
diff --git a/include/linux/amd-pmf-io.h b/include/linux/amd-pmf-io.h
index 5b6d29d36922..b4f818205216 100644
--- a/include/linux/amd-pmf-io.h
+++ b/include/linux/amd-pmf-io.h
@@ -17,9 +17,11 @@
 /**
  * enum sfh_message_type - Query the SFH message type
  * @MT_HPD: Message ID to know the Human presence info from MP2 FW
+ * @MT_ALS: Message ID to know the Ambient light info from MP2 FW
  */
 enum sfh_message_type {
 	MT_HPD,
+	MT_ALS,
 };
 
 /**
@@ -36,9 +38,11 @@ enum sfh_hpd_info {
 
 /**
  * struct amd_sfh_info - get HPD sensor info from MP2 FW
+ * @ambient_light: Populates the ambient light information
  * @user_present: Populates the user presence information
  */
 struct amd_sfh_info {
+	u32 ambient_light;
 	u8 user_present;
 };
 
-- 
cgit v1.2.3


From 398aa9a7e77cf23c2a6f882ddd3dcd96f21771dc Mon Sep 17 00:00:00 2001
From: Manan Aurora <maurora@google.com>
Date: Tue, 31 Oct 2023 03:46:41 +0000
Subject: usb: dwc3: Support EBC feature of DWC_usb31

Support configuration and use of bulk endpoints in the so-called EBC
mode described in the DBC_usb31 databook (appendix E)

Added a bit fifo_mode to usb_ep to indicate to the UDC driver that a
specific endpoint is to operate in the EBC (or equivalent) mode when
enabled

Added macros for bits 15 and 14 of DEPCFG parameter 1 to indicate EBC
mode and write back behaviour. These bits will be set to 1 when
configuring an EBC endpoint as described in the programming guide

Signed-off-by: Manan Aurora <maurora@google.com>
Link: https://lore.kernel.org/r/20231031034641.660606-1-maurora@google.com
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/usb/dwc3/core.h    | 1 +
 drivers/usb/dwc3/gadget.c  | 6 ++++++
 drivers/usb/dwc3/gadget.h  | 2 ++
 include/linux/usb/gadget.h | 1 +
 4 files changed, 10 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/usb/dwc3/core.h b/drivers/usb/dwc3/core.h
index e120611a5174..e3eea965e57b 100644
--- a/drivers/usb/dwc3/core.h
+++ b/drivers/usb/dwc3/core.h
@@ -376,6 +376,7 @@
 /* Global HWPARAMS4 Register */
 #define DWC3_GHWPARAMS4_HIBER_SCRATCHBUFS(n)	(((n) & (0x0f << 13)) >> 13)
 #define DWC3_MAX_HIBER_SCRATCHBUFS		15
+#define DWC3_EXT_BUFF_CONTROL		BIT(21)
 
 /* Global HWPARAMS6 Register */
 #define DWC3_GHWPARAMS6_BCSUPPORT		BIT(14)
diff --git a/drivers/usb/dwc3/gadget.c b/drivers/usb/dwc3/gadget.c
index 88d8d589f014..c15e965ea95a 100644
--- a/drivers/usb/dwc3/gadget.c
+++ b/drivers/usb/dwc3/gadget.c
@@ -673,6 +673,12 @@ static int dwc3_gadget_set_ep_config(struct dwc3_ep *dep, unsigned int action)
 		params.param1 |= DWC3_DEPCFG_BINTERVAL_M1(bInterval_m1);
 	}
 
+	if (dep->endpoint.fifo_mode) {
+		if (!(dwc->hwparams.hwparams4 & DWC3_EXT_BUFF_CONTROL))
+			return -EINVAL;
+		params.param1 |= DWC3_DEPCFG_EBC_HWO_NOWB | DWC3_DEPCFG_USE_EBC;
+	}
+
 	return dwc3_send_gadget_ep_cmd(dep, DWC3_DEPCMD_SETEPCONFIG, &params);
 }
 
diff --git a/drivers/usb/dwc3/gadget.h b/drivers/usb/dwc3/gadget.h
index 55a56cf67d73..fd7a4e94397e 100644
--- a/drivers/usb/dwc3/gadget.h
+++ b/drivers/usb/dwc3/gadget.h
@@ -26,6 +26,8 @@ struct dwc3;
 #define DWC3_DEPCFG_XFER_NOT_READY_EN	BIT(10)
 #define DWC3_DEPCFG_FIFO_ERROR_EN	BIT(11)
 #define DWC3_DEPCFG_STREAM_EVENT_EN	BIT(13)
+#define DWC3_DEPCFG_EBC_HWO_NOWB	BIT(14)
+#define DWC3_DEPCFG_USE_EBC		BIT(15)
 #define DWC3_DEPCFG_BINTERVAL_M1(n)	(((n) & 0xff) << 16)
 #define DWC3_DEPCFG_STREAM_CAPABLE	BIT(24)
 #define DWC3_DEPCFG_EP_NUMBER(n)	(((n) & 0x1f) << 25)
diff --git a/include/linux/usb/gadget.h b/include/linux/usb/gadget.h
index 6532beb587b1..a771ccc038ac 100644
--- a/include/linux/usb/gadget.h
+++ b/include/linux/usb/gadget.h
@@ -236,6 +236,7 @@ struct usb_ep {
 	unsigned		max_streams:16;
 	unsigned		mult:2;
 	unsigned		maxburst:5;
+	unsigned		fifo_mode:1;
 	u8			address;
 	const struct usb_endpoint_descriptor	*desc;
 	const struct usb_ss_ep_comp_descriptor	*comp_desc;
-- 
cgit v1.2.3


From 49a78b05d5ca1e23fd737747a8757b8bdc319b30 Mon Sep 17 00:00:00 2001
From: Yajun Deng <yajun.deng@linux.dev>
Date: Thu, 4 Jan 2024 11:28:22 +0800
Subject: USB: core: Use device_driver directly in struct usb_driver and
 usb_device_driver

There is usbdrv_wrap in struct usb_driver and usb_device_driver, it
contains device_driver and for_devices. for_devices is used to
distinguish between device drivers and interface drivers.

Like the is_usb_device(), it tests the type of the device. We can test
that if the probe of device_driver is equal to usb_probe_device in
is_usb_device_driver(), and then the struct usbdrv_wrap is no longer
needed.

Clean up struct usbdrv_wrap, use device_driver directly in struct
usb_driver and usb_device_driver. This makes the code cleaner.

Signed-off-by: Yajun Deng <yajun.deng@linux.dev>
Acked-by: Alan Stern <stern@rowland.harvard.edu>
Link: https://lore.kernel.org/r/20240104032822.1896596-1-yajun.deng@linux.dev
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/bluetooth/btusb.c                          |  6 +--
 drivers/net/can/usb/peak_usb/pcan_usb_core.c       |  2 +-
 .../net/wireless/broadcom/brcm80211/brcmfmac/usb.c |  2 +-
 drivers/net/wireless/marvell/mwifiex/usb.c         |  2 +-
 drivers/usb/core/driver.c                          | 59 ++++++++++++----------
 drivers/usb/core/usb.c                             |  2 +-
 drivers/usb/core/usb.h                             |  8 +--
 drivers/usb/misc/onboard_usb_hub.c                 |  2 +-
 drivers/usb/serial/bus.c                           |  2 +-
 drivers/usb/serial/usb-serial.c                    |  2 +-
 drivers/usb/storage/uas.c                          |  2 +-
 drivers/usb/usbip/stub_main.c                      |  8 +--
 include/linux/usb.h                                | 24 +++------
 13 files changed, 53 insertions(+), 68 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/bluetooth/btusb.c b/drivers/bluetooth/btusb.c
index b8e9de887b5d..02b3b1825403 100644
--- a/drivers/bluetooth/btusb.c
+++ b/drivers/bluetooth/btusb.c
@@ -4790,10 +4790,8 @@ static struct usb_driver btusb_driver = {
 	.disable_hub_initiated_lpm = 1,
 
 #ifdef CONFIG_DEV_COREDUMP
-	.drvwrap = {
-		.driver = {
-			.coredump = btusb_coredump,
-		},
+	.driver = {
+		.coredump = btusb_coredump,
 	},
 #endif
 };
diff --git a/drivers/net/can/usb/peak_usb/pcan_usb_core.c b/drivers/net/can/usb/peak_usb/pcan_usb_core.c
index 24ad9f593a77..1efa39e134f4 100644
--- a/drivers/net/can/usb/peak_usb/pcan_usb_core.c
+++ b/drivers/net/can/usb/peak_usb/pcan_usb_core.c
@@ -1143,7 +1143,7 @@ static void __exit peak_usb_exit(void)
 	int err;
 
 	/* last chance do send any synchronous commands here */
-	err = driver_for_each_device(&peak_usb_driver.drvwrap.driver, NULL,
+	err = driver_for_each_device(&peak_usb_driver.driver, NULL,
 				     NULL, peak_usb_do_device_exit);
 	if (err)
 		pr_err("%s: failed to stop all can devices (err %d)\n",
diff --git a/drivers/net/wireless/broadcom/brcm80211/brcmfmac/usb.c b/drivers/net/wireless/broadcom/brcm80211/brcmfmac/usb.c
index 2178675ae1a4..0ccf735316c2 100644
--- a/drivers/net/wireless/broadcom/brcm80211/brcmfmac/usb.c
+++ b/drivers/net/wireless/broadcom/brcm80211/brcmfmac/usb.c
@@ -1581,7 +1581,7 @@ static int brcmf_usb_reset_device(struct device *dev, void *notused)
 
 void brcmf_usb_exit(void)
 {
-	struct device_driver *drv = &brcmf_usbdrvr.drvwrap.driver;
+	struct device_driver *drv = &brcmf_usbdrvr.driver;
 	int ret;
 
 	brcmf_dbg(USB, "Enter\n");
diff --git a/drivers/net/wireless/marvell/mwifiex/usb.c b/drivers/net/wireless/marvell/mwifiex/usb.c
index d3ab9572e711..515e6db410f2 100644
--- a/drivers/net/wireless/marvell/mwifiex/usb.c
+++ b/drivers/net/wireless/marvell/mwifiex/usb.c
@@ -687,7 +687,7 @@ static struct usb_driver mwifiex_usb_driver = {
 	.suspend = mwifiex_usb_suspend,
 	.resume = mwifiex_usb_resume,
 	.soft_unbind = 1,
-	.drvwrap.driver = {
+	.driver = {
 		.coredump = mwifiex_usb_coredump,
 	},
 };
diff --git a/drivers/usb/core/driver.c b/drivers/usb/core/driver.c
index 1dc0c0413043..e01b1913d02b 100644
--- a/drivers/usb/core/driver.c
+++ b/drivers/usb/core/driver.c
@@ -189,13 +189,13 @@ static int usb_create_newid_files(struct usb_driver *usb_drv)
 		goto exit;
 
 	if (usb_drv->probe != NULL) {
-		error = driver_create_file(&usb_drv->drvwrap.driver,
+		error = driver_create_file(&usb_drv->driver,
 					   &driver_attr_new_id);
 		if (error == 0) {
-			error = driver_create_file(&usb_drv->drvwrap.driver,
+			error = driver_create_file(&usb_drv->driver,
 					&driver_attr_remove_id);
 			if (error)
-				driver_remove_file(&usb_drv->drvwrap.driver,
+				driver_remove_file(&usb_drv->driver,
 						&driver_attr_new_id);
 		}
 	}
@@ -209,9 +209,9 @@ static void usb_remove_newid_files(struct usb_driver *usb_drv)
 		return;
 
 	if (usb_drv->probe != NULL) {
-		driver_remove_file(&usb_drv->drvwrap.driver,
+		driver_remove_file(&usb_drv->driver,
 				&driver_attr_remove_id);
-		driver_remove_file(&usb_drv->drvwrap.driver,
+		driver_remove_file(&usb_drv->driver,
 				   &driver_attr_new_id);
 	}
 }
@@ -552,7 +552,7 @@ int usb_driver_claim_interface(struct usb_driver *driver,
 	if (!iface->authorized)
 		return -ENODEV;
 
-	dev->driver = &driver->drvwrap.driver;
+	dev->driver = &driver->driver;
 	usb_set_intfdata(iface, data);
 	iface->needs_binding = 0;
 
@@ -615,7 +615,7 @@ void usb_driver_release_interface(struct usb_driver *driver,
 	struct device *dev = &iface->dev;
 
 	/* this should never happen, don't release something that's not ours */
-	if (!dev->driver || dev->driver != &driver->drvwrap.driver)
+	if (!dev->driver || dev->driver != &driver->driver)
 		return;
 
 	/* don't release from within disconnect() */
@@ -950,7 +950,7 @@ static int __usb_bus_reprobe_drivers(struct device *dev, void *data)
 	int ret;
 
 	/* Don't reprobe if current driver isn't usb_generic_driver */
-	if (dev->driver != &usb_generic_driver.drvwrap.driver)
+	if (dev->driver != &usb_generic_driver.driver)
 		return 0;
 
 	udev = to_usb_device(dev);
@@ -964,6 +964,11 @@ static int __usb_bus_reprobe_drivers(struct device *dev, void *data)
 	return 0;
 }
 
+bool is_usb_device_driver(const struct device_driver *drv)
+{
+	return drv->probe == usb_probe_device;
+}
+
 /**
  * usb_register_device_driver - register a USB device (not interface) driver
  * @new_udriver: USB operations for the device driver
@@ -983,15 +988,14 @@ int usb_register_device_driver(struct usb_device_driver *new_udriver,
 	if (usb_disabled())
 		return -ENODEV;
 
-	new_udriver->drvwrap.for_devices = 1;
-	new_udriver->drvwrap.driver.name = new_udriver->name;
-	new_udriver->drvwrap.driver.bus = &usb_bus_type;
-	new_udriver->drvwrap.driver.probe = usb_probe_device;
-	new_udriver->drvwrap.driver.remove = usb_unbind_device;
-	new_udriver->drvwrap.driver.owner = owner;
-	new_udriver->drvwrap.driver.dev_groups = new_udriver->dev_groups;
+	new_udriver->driver.name = new_udriver->name;
+	new_udriver->driver.bus = &usb_bus_type;
+	new_udriver->driver.probe = usb_probe_device;
+	new_udriver->driver.remove = usb_unbind_device;
+	new_udriver->driver.owner = owner;
+	new_udriver->driver.dev_groups = new_udriver->dev_groups;
 
-	retval = driver_register(&new_udriver->drvwrap.driver);
+	retval = driver_register(&new_udriver->driver);
 
 	if (!retval) {
 		pr_info("%s: registered new device driver %s\n",
@@ -1023,7 +1027,7 @@ void usb_deregister_device_driver(struct usb_device_driver *udriver)
 	pr_info("%s: deregistering device driver %s\n",
 			usbcore_name, udriver->name);
 
-	driver_unregister(&udriver->drvwrap.driver);
+	driver_unregister(&udriver->driver);
 }
 EXPORT_SYMBOL_GPL(usb_deregister_device_driver);
 
@@ -1051,18 +1055,17 @@ int usb_register_driver(struct usb_driver *new_driver, struct module *owner,
 	if (usb_disabled())
 		return -ENODEV;
 
-	new_driver->drvwrap.for_devices = 0;
-	new_driver->drvwrap.driver.name = new_driver->name;
-	new_driver->drvwrap.driver.bus = &usb_bus_type;
-	new_driver->drvwrap.driver.probe = usb_probe_interface;
-	new_driver->drvwrap.driver.remove = usb_unbind_interface;
-	new_driver->drvwrap.driver.owner = owner;
-	new_driver->drvwrap.driver.mod_name = mod_name;
-	new_driver->drvwrap.driver.dev_groups = new_driver->dev_groups;
+	new_driver->driver.name = new_driver->name;
+	new_driver->driver.bus = &usb_bus_type;
+	new_driver->driver.probe = usb_probe_interface;
+	new_driver->driver.remove = usb_unbind_interface;
+	new_driver->driver.owner = owner;
+	new_driver->driver.mod_name = mod_name;
+	new_driver->driver.dev_groups = new_driver->dev_groups;
 	spin_lock_init(&new_driver->dynids.lock);
 	INIT_LIST_HEAD(&new_driver->dynids.list);
 
-	retval = driver_register(&new_driver->drvwrap.driver);
+	retval = driver_register(&new_driver->driver);
 	if (retval)
 		goto out;
 
@@ -1077,7 +1080,7 @@ out:
 	return retval;
 
 out_newid:
-	driver_unregister(&new_driver->drvwrap.driver);
+	driver_unregister(&new_driver->driver);
 
 	pr_err("%s: error %d registering interface driver %s\n",
 		usbcore_name, retval, new_driver->name);
@@ -1102,7 +1105,7 @@ void usb_deregister(struct usb_driver *driver)
 			usbcore_name, driver->name);
 
 	usb_remove_newid_files(driver);
-	driver_unregister(&driver->drvwrap.driver);
+	driver_unregister(&driver->driver);
 	usb_free_dynids(driver);
 }
 EXPORT_SYMBOL_GPL(usb_deregister);
diff --git a/drivers/usb/core/usb.c b/drivers/usb/core/usb.c
index 2a938cf47ccd..dc8d9228a5e7 100644
--- a/drivers/usb/core/usb.c
+++ b/drivers/usb/core/usb.c
@@ -431,7 +431,7 @@ struct usb_interface *usb_find_interface(struct usb_driver *drv, int minor)
 	struct device *dev;
 
 	argb.minor = minor;
-	argb.drv = &drv->drvwrap.driver;
+	argb.drv = &drv->driver;
 
 	dev = bus_find_device(&usb_bus_type, NULL, &argb, __find_interface);
 
diff --git a/drivers/usb/core/usb.h b/drivers/usb/core/usb.h
index 60363153fc3f..bfecb50773b6 100644
--- a/drivers/usb/core/usb.h
+++ b/drivers/usb/core/usb.h
@@ -175,13 +175,7 @@ static inline int is_root_hub(struct usb_device *udev)
 	return (udev->parent == NULL);
 }
 
-/* Do the same for device drivers and interface drivers. */
-
-static inline int is_usb_device_driver(struct device_driver *drv)
-{
-	return container_of(drv, struct usbdrv_wrap, driver)->
-			for_devices;
-}
+extern bool is_usb_device_driver(const struct device_driver *drv);
 
 /* for labeling diagnostics */
 extern const char *usbcore_name;
diff --git a/drivers/usb/misc/onboard_usb_hub.c b/drivers/usb/misc/onboard_usb_hub.c
index c09fb6bd7d7d..0dd2b032c90b 100644
--- a/drivers/usb/misc/onboard_usb_hub.c
+++ b/drivers/usb/misc/onboard_usb_hub.c
@@ -244,7 +244,7 @@ static void onboard_hub_attach_usb_driver(struct work_struct *work)
 {
 	int err;
 
-	err = driver_attach(&onboard_hub_usbdev_driver.drvwrap.driver);
+	err = driver_attach(&onboard_hub_usbdev_driver.driver);
 	if (err)
 		pr_err("Failed to attach USB driver: %pe\n", ERR_PTR(err));
 }
diff --git a/drivers/usb/serial/bus.c b/drivers/usb/serial/bus.c
index 3eb8dc3a1a8f..6c812d01b37d 100644
--- a/drivers/usb/serial/bus.c
+++ b/drivers/usb/serial/bus.c
@@ -113,7 +113,7 @@ static ssize_t new_id_store(struct device_driver *driver,
 	if (retval >= 0 && usb_drv->usb_driver != NULL)
 		retval = usb_store_new_id(&usb_drv->usb_driver->dynids,
 					  usb_drv->usb_driver->id_table,
-					  &usb_drv->usb_driver->drvwrap.driver,
+					  &usb_drv->usb_driver->driver,
 					  buf, count);
 	return retval;
 }
diff --git a/drivers/usb/serial/usb-serial.c b/drivers/usb/serial/usb-serial.c
index 17b09f03ef84..f1e91eb7f8a4 100644
--- a/drivers/usb/serial/usb-serial.c
+++ b/drivers/usb/serial/usb-serial.c
@@ -1521,7 +1521,7 @@ int usb_serial_register_drivers(struct usb_serial_driver *const serial_drivers[]
 
 	/* Now set udriver's id_table and look for matches */
 	udriver->id_table = id_table;
-	rc = driver_attach(&udriver->drvwrap.driver);
+	rc = driver_attach(&udriver->driver);
 	return 0;
 
 err_deregister_drivers:
diff --git a/drivers/usb/storage/uas.c b/drivers/usb/storage/uas.c
index 696bb0b23599..9707f53cfda9 100644
--- a/drivers/usb/storage/uas.c
+++ b/drivers/usb/storage/uas.c
@@ -1246,7 +1246,7 @@ static struct usb_driver uas_driver = {
 	.suspend = uas_suspend,
 	.resume = uas_resume,
 	.reset_resume = uas_reset_resume,
-	.drvwrap.driver.shutdown = uas_shutdown,
+	.driver.shutdown = uas_shutdown,
 	.id_table = uas_usb_ids,
 };
 
diff --git a/drivers/usb/usbip/stub_main.c b/drivers/usb/usbip/stub_main.c
index 0a6624d37929..79110a69d697 100644
--- a/drivers/usb/usbip/stub_main.c
+++ b/drivers/usb/usbip/stub_main.c
@@ -377,14 +377,14 @@ static int __init usbip_host_init(void)
 		goto err_usb_register;
 	}
 
-	ret = driver_create_file(&stub_driver.drvwrap.driver,
+	ret = driver_create_file(&stub_driver.driver,
 				 &driver_attr_match_busid);
 	if (ret) {
 		pr_err("driver_create_file failed\n");
 		goto err_create_file;
 	}
 
-	ret = driver_create_file(&stub_driver.drvwrap.driver,
+	ret = driver_create_file(&stub_driver.driver,
 				 &driver_attr_rebind);
 	if (ret) {
 		pr_err("driver_create_file failed\n");
@@ -402,10 +402,10 @@ err_usb_register:
 
 static void __exit usbip_host_exit(void)
 {
-	driver_remove_file(&stub_driver.drvwrap.driver,
+	driver_remove_file(&stub_driver.driver,
 			   &driver_attr_match_busid);
 
-	driver_remove_file(&stub_driver.drvwrap.driver,
+	driver_remove_file(&stub_driver.driver,
 			   &driver_attr_rebind);
 
 	/*
diff --git a/include/linux/usb.h b/include/linux/usb.h
index 07556341ba2b..9e52179872a5 100644
--- a/include/linux/usb.h
+++ b/include/linux/usb.h
@@ -1143,16 +1143,6 @@ extern ssize_t usb_store_new_id(struct usb_dynids *dynids,
 
 extern ssize_t usb_show_dynids(struct usb_dynids *dynids, char *buf);
 
-/**
- * struct usbdrv_wrap - wrapper for driver-model structure
- * @driver: The driver-model core driver structure.
- * @for_devices: Non-zero for device drivers, 0 for interface drivers.
- */
-struct usbdrv_wrap {
-	struct device_driver driver;
-	int for_devices;
-};
-
 /**
  * struct usb_driver - identifies USB interface driver to usbcore
  * @name: The driver name should be unique among USB drivers,
@@ -1193,7 +1183,7 @@ struct usbdrv_wrap {
  *	is bound to the driver.
  * @dynids: used internally to hold the list of dynamically added device
  *	ids for this driver.
- * @drvwrap: Driver-model core structure wrapper.
+ * @driver: The driver-model core driver structure.
  * @no_dynamic_id: if set to 1, the USB core will not allow dynamic ids to be
  *	added to this driver by preventing the sysfs file from being created.
  * @supports_autosuspend: if set to 0, the USB core will not allow autosuspend
@@ -1241,13 +1231,13 @@ struct usb_driver {
 	const struct attribute_group **dev_groups;
 
 	struct usb_dynids dynids;
-	struct usbdrv_wrap drvwrap;
+	struct device_driver driver;
 	unsigned int no_dynamic_id:1;
 	unsigned int supports_autosuspend:1;
 	unsigned int disable_hub_initiated_lpm:1;
 	unsigned int soft_unbind:1;
 };
-#define	to_usb_driver(d) container_of(d, struct usb_driver, drvwrap.driver)
+#define	to_usb_driver(d) container_of(d, struct usb_driver, driver)
 
 /**
  * struct usb_device_driver - identifies USB device driver to usbcore
@@ -1268,7 +1258,7 @@ struct usb_driver {
  *	on to call the normal usb_choose_configuration().
  * @dev_groups: Attributes attached to the device that will be created once it
  *	is bound to the driver.
- * @drvwrap: Driver-model core structure wrapper.
+ * @driver: The driver-model core driver structure.
  * @id_table: used with @match() to select better matching driver at
  * 	probe() time.
  * @supports_autosuspend: if set to 0, the USB core will not allow autosuspend
@@ -1277,7 +1267,7 @@ struct usb_driver {
  *	resume and suspend functions will be called in addition to the driver's
  *	own, so this part of the setup does not need to be replicated.
  *
- * USB drivers must provide all the fields listed above except drvwrap,
+ * USB drivers must provide all the fields listed above except driver,
  * match, and id_table.
  */
 struct usb_device_driver {
@@ -1293,13 +1283,13 @@ struct usb_device_driver {
 	int (*choose_configuration) (struct usb_device *udev);
 
 	const struct attribute_group **dev_groups;
-	struct usbdrv_wrap drvwrap;
+	struct device_driver driver;
 	const struct usb_device_id *id_table;
 	unsigned int supports_autosuspend:1;
 	unsigned int generic_subclass:1;
 };
 #define	to_usb_device_driver(d) container_of(d, struct usb_device_driver, \
-		drvwrap.driver)
+		driver)
 
 /**
  * struct usb_class_driver - identifies a USB driver that wants to use the USB major number
-- 
cgit v1.2.3


From cb86a3383aa7b9bb891daca691e596f6bfe52d82 Mon Sep 17 00:00:00 2001
From: Vamshi Gajjela <vamshigajjela@google.com>
Date: Thu, 9 Nov 2023 12:04:16 +0530
Subject: serial: core: Update uart_poll_timeout() function to return unsigned
 long

The function uart_fifo_timeout() returns an unsigned long value, which
is the number of jiffies. Therefore, change the variable timeout in the
function uart_poll_timeout() from int to unsigned long.
Change the return type of the function uart_poll_timeout() from int to
unsigned long to be consistent with the type of timeout values.

Signed-off-by: Vamshi Gajjela <vamshigajjela@google.com>
Link: https://lore.kernel.org/r/20231109063417.3971005-2-vamshigajjela@google.com
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 include/linux/serial_core.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/serial_core.h b/include/linux/serial_core.h
index 89f7b6c63598..536b2581d3e2 100644
--- a/include/linux/serial_core.h
+++ b/include/linux/serial_core.h
@@ -852,9 +852,9 @@ static inline unsigned long uart_fifo_timeout(struct uart_port *port)
 }
 
 /* Base timer interval for polling */
-static inline int uart_poll_timeout(struct uart_port *port)
+static inline unsigned long uart_poll_timeout(struct uart_port *port)
 {
-	int timeout = uart_fifo_timeout(port);
+	unsigned long timeout = uart_fifo_timeout(port);
 
 	return timeout > 6 ? (timeout / 2 - 2) : 1;
 }
-- 
cgit v1.2.3


From 5f1e77b2285b47c216b44e513071549cf006a309 Mon Sep 17 00:00:00 2001
From: Anna Schumaker <Anna.Schumaker@Netapp.com>
Date: Thu, 30 Nov 2023 13:30:15 -0500
Subject: SUNRPC: Remove unused function rpc_clnt_xprt_switch_put()

Reviewed-by: Jeff Layton <jlayton@kernel.org>
Signed-off-by: Anna Schumaker <Anna.Schumaker@Netapp.com>
---
 include/linux/sunrpc/clnt.h | 1 -
 net/sunrpc/clnt.c           | 8 --------
 2 files changed, 9 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sunrpc/clnt.h b/include/linux/sunrpc/clnt.h
index e9d4377d03c6..5e9d1469c6fa 100644
--- a/include/linux/sunrpc/clnt.h
+++ b/include/linux/sunrpc/clnt.h
@@ -252,7 +252,6 @@ void		rpc_clnt_probe_trunked_xprts(struct rpc_clnt *,
 
 const char *rpc_proc_name(const struct rpc_task *task);
 
-void rpc_clnt_xprt_switch_put(struct rpc_clnt *);
 void rpc_clnt_xprt_switch_add_xprt(struct rpc_clnt *, struct rpc_xprt *);
 void rpc_clnt_xprt_switch_remove_xprt(struct rpc_clnt *, struct rpc_xprt *);
 bool rpc_clnt_xprt_switch_has_addr(struct rpc_clnt *clnt,
diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c
index 4aa838543f79..8df944444e9b 100644
--- a/net/sunrpc/clnt.c
+++ b/net/sunrpc/clnt.c
@@ -3247,14 +3247,6 @@ rpc_set_connect_timeout(struct rpc_clnt *clnt,
 }
 EXPORT_SYMBOL_GPL(rpc_set_connect_timeout);
 
-void rpc_clnt_xprt_switch_put(struct rpc_clnt *clnt)
-{
-	rcu_read_lock();
-	xprt_switch_put(rcu_dereference(clnt->cl_xpi.xpi_xpswitch));
-	rcu_read_unlock();
-}
-EXPORT_SYMBOL_GPL(rpc_clnt_xprt_switch_put);
-
 void rpc_clnt_xprt_set_online(struct rpc_clnt *clnt, struct rpc_xprt *xprt)
 {
 	struct rpc_xprt_switch *xps;
-- 
cgit v1.2.3


From 12fc0a963128b54b82e98b9909f463e784b90b07 Mon Sep 17 00:00:00 2001
From: "Matthew Wilcox (Oracle)" <willy@infradead.org>
Date: Fri, 15 Dec 2023 20:47:07 +0000
Subject: nfs: Remove writepage

NFS already has writepages and migrate_folio, so it does not need to
implement writepage.  The writepage operation is deprecated as it leads
to worse performance under high memory pressure due to folios being
written out in LRU order rather than sequentially within a file.

Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Signed-off-by: Anna Schumaker <Anna.Schumaker@Netapp.com>
---
 fs/nfs/file.c          |  1 -
 fs/nfs/write.c         | 11 -----------
 include/linux/nfs_fs.h |  1 -
 3 files changed, 13 deletions(-)

(limited to 'include/linux')

diff --git a/fs/nfs/file.c b/fs/nfs/file.c
index 3f9768810427..e437de2a5ad4 100644
--- a/fs/nfs/file.c
+++ b/fs/nfs/file.c
@@ -558,7 +558,6 @@ const struct address_space_operations nfs_file_aops = {
 	.read_folio = nfs_read_folio,
 	.readahead = nfs_readahead,
 	.dirty_folio = filemap_dirty_folio,
-	.writepage = nfs_writepage,
 	.writepages = nfs_writepages,
 	.write_begin = nfs_write_begin,
 	.write_end = nfs_write_end,
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index b664caea8b4e..0b4fdad511d9 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -680,17 +680,6 @@ static int nfs_writepage_locked(struct folio *folio,
 	return err;
 }
 
-int nfs_writepage(struct page *page, struct writeback_control *wbc)
-{
-	struct folio *folio = page_folio(page);
-	int ret;
-
-	ret = nfs_writepage_locked(folio, wbc);
-	if (ret != AOP_WRITEPAGE_ACTIVATE)
-		unlock_page(page);
-	return ret;
-}
-
 static int nfs_writepages_callback(struct folio *folio,
 				   struct writeback_control *wbc, void *data)
 {
diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h
index 279262057a92..f5ce7b101146 100644
--- a/include/linux/nfs_fs.h
+++ b/include/linux/nfs_fs.h
@@ -595,7 +595,6 @@ extern void nfs_complete_unlink(struct dentry *dentry, struct inode *);
  * linux/fs/nfs/write.c
  */
 extern int  nfs_congestion_kb;
-extern int  nfs_writepage(struct page *page, struct writeback_control *wbc);
 extern int  nfs_writepages(struct address_space *, struct writeback_control *);
 extern int  nfs_flush_incompatible(struct file *file, struct folio *folio);
 extern int  nfs_update_folio(struct file *file, struct folio *folio,
-- 
cgit v1.2.3


From 401df0d4f4098ecc9c5278da2f50756d62e5b37d Mon Sep 17 00:00:00 2001
From: Rafał Miłecki <rafal@milecki.pl>
Date: Tue, 19 Dec 2023 13:01:03 +0100
Subject: nvmem: layouts: refactor .add_cells() callback arguments
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Simply pass whole "struct nvmem_layout" instead of single variables.
There is nothing in "struct nvmem_layout" that we have to hide from
layout drivers. They also access it during .probe() and .remove().

Thanks to this change:

1. API gets more consistent
   All layouts drivers callbacks get the same argument

2. Layouts get correct device
   Before this change NVMEM core code was passing NVMEM device instead
   of layout device. That resulted in:
   * Confusing prints
   * Calling devm_*() helpers on wrong device
   * Helpers like of_device_get_match_data() dereferencing NULLs

3. It gets possible to get match data
   First of all nvmem_layout_get_match_data() requires passing "struct
   nvmem_layout" which .add_cells() callback didn't have before this. It
   doesn't matter much as it's rather useless now anyway (and will be
   dropped).
   What's more important however is that of_device_get_match_data() can
   be used now thanks to owning a proper device pointer.

Signed-off-by: Rafał Miłecki <rafal@milecki.pl>
Reviewed-by: Miquel Raynal <miquel.raynal@bootlin.com>
Reviewed-by: Michael Walle <michael@walle.cc>
Link: https://lore.kernel.org/r/20231219120104.3422-1-zajec5@gmail.com
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/nvmem/core.c             | 2 +-
 drivers/nvmem/layouts/onie-tlv.c | 4 +++-
 drivers/nvmem/layouts/sl28vpd.c  | 4 +++-
 include/linux/nvmem-provider.h   | 2 +-
 4 files changed, 8 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/nvmem/core.c b/drivers/nvmem/core.c
index ba559e81f77f..441d132ebb61 100644
--- a/drivers/nvmem/core.c
+++ b/drivers/nvmem/core.c
@@ -854,7 +854,7 @@ int nvmem_layout_register(struct nvmem_layout *layout)
 		return -EINVAL;
 
 	/* Populate the cells */
-	ret = layout->add_cells(&layout->nvmem->dev, layout->nvmem);
+	ret = layout->add_cells(layout);
 	if (ret)
 		return ret;
 
diff --git a/drivers/nvmem/layouts/onie-tlv.c b/drivers/nvmem/layouts/onie-tlv.c
index b24cc5dcc6ee..9d2ad5f2dc10 100644
--- a/drivers/nvmem/layouts/onie-tlv.c
+++ b/drivers/nvmem/layouts/onie-tlv.c
@@ -182,8 +182,10 @@ static bool onie_tlv_crc_is_valid(struct device *dev, size_t table_len, u8 *tabl
 	return true;
 }
 
-static int onie_tlv_parse_table(struct device *dev, struct nvmem_device *nvmem)
+static int onie_tlv_parse_table(struct nvmem_layout *layout)
 {
+	struct nvmem_device *nvmem = layout->nvmem;
+	struct device *dev = &layout->dev;
 	struct onie_tlv_hdr hdr;
 	size_t table_len, data_len, hdr_len;
 	u8 *table, *data;
diff --git a/drivers/nvmem/layouts/sl28vpd.c b/drivers/nvmem/layouts/sl28vpd.c
index b8ffae646cc2..53fa50f17dca 100644
--- a/drivers/nvmem/layouts/sl28vpd.c
+++ b/drivers/nvmem/layouts/sl28vpd.c
@@ -80,8 +80,10 @@ static int sl28vpd_v1_check_crc(struct device *dev, struct nvmem_device *nvmem)
 	return 0;
 }
 
-static int sl28vpd_add_cells(struct device *dev, struct nvmem_device *nvmem)
+static int sl28vpd_add_cells(struct nvmem_layout *layout)
 {
+	struct nvmem_device *nvmem = layout->nvmem;
+	struct device *dev = &layout->dev;
 	const struct nvmem_cell_info *pinfo;
 	struct nvmem_cell_info info = {0};
 	struct device_node *layout_np;
diff --git a/include/linux/nvmem-provider.h b/include/linux/nvmem-provider.h
index 6fe65b35ea97..81a67642ac55 100644
--- a/include/linux/nvmem-provider.h
+++ b/include/linux/nvmem-provider.h
@@ -173,7 +173,7 @@ struct nvmem_cell_table {
 struct nvmem_layout {
 	struct device dev;
 	struct nvmem_device *nvmem;
-	int (*add_cells)(struct device *dev, struct nvmem_device *nvmem);
+	int (*add_cells)(struct nvmem_layout *layout);
 };
 
 struct nvmem_layout_driver {
-- 
cgit v1.2.3


From 43f60e3fb62edc7bd8891de8779fb422f4ae23ae Mon Sep 17 00:00:00 2001
From: Rafał Miłecki <rafal@milecki.pl>
Date: Tue, 19 Dec 2023 13:01:04 +0100
Subject: nvmem: drop nvmem_layout_get_match_data()
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Thanks for layouts refactoring we now have "struct device" associated
with layout. Also its OF pointer points directly to the "nvmem-layout"
DT node.

All it takes to get match data is a generic of_device_get_match_data().

Signed-off-by: Rafał Miłecki <rafal@milecki.pl>
Reviewed-by: Miquel Raynal <miquel.raynal@bootlin.com>
Reviewed-by: Michael Walle <michael@walle.cc>
Link: https://lore.kernel.org/r/20231219120104.3422-2-zajec5@gmail.com
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/nvmem/core.c           | 13 -------------
 include/linux/nvmem-provider.h | 10 ----------
 2 files changed, 23 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/nvmem/core.c b/drivers/nvmem/core.c
index 441d132ebb61..4ed54076346d 100644
--- a/drivers/nvmem/core.c
+++ b/drivers/nvmem/core.c
@@ -876,19 +876,6 @@ void nvmem_layout_unregister(struct nvmem_layout *layout)
 }
 EXPORT_SYMBOL_GPL(nvmem_layout_unregister);
 
-const void *nvmem_layout_get_match_data(struct nvmem_device *nvmem,
-					struct nvmem_layout *layout)
-{
-	struct device_node __maybe_unused *layout_np;
-	const struct of_device_id *match;
-
-	layout_np = of_nvmem_layout_get_container(nvmem);
-	match = of_match_node(layout->dev.driver->of_match_table, layout_np);
-
-	return match ? match->data : NULL;
-}
-EXPORT_SYMBOL_GPL(nvmem_layout_get_match_data);
-
 /**
  * nvmem_register() - Register a nvmem device for given nvmem_config.
  * Also creates a binary entry in /sys/bus/nvmem/devices/dev-name/nvmem
diff --git a/include/linux/nvmem-provider.h b/include/linux/nvmem-provider.h
index 81a67642ac55..f0ba0e03218f 100644
--- a/include/linux/nvmem-provider.h
+++ b/include/linux/nvmem-provider.h
@@ -205,9 +205,6 @@ void nvmem_layout_driver_unregister(struct nvmem_layout_driver *drv);
 	module_driver(__nvmem_layout_driver, nvmem_layout_driver_register, \
 		      nvmem_layout_driver_unregister)
 
-const void *nvmem_layout_get_match_data(struct nvmem_device *nvmem,
-					struct nvmem_layout *layout);
-
 #else
 
 static inline struct nvmem_device *nvmem_register(const struct nvmem_config *c)
@@ -238,13 +235,6 @@ static inline int nvmem_layout_register(struct nvmem_layout *layout)
 
 static inline void nvmem_layout_unregister(struct nvmem_layout *layout) {}
 
-static inline const void *
-nvmem_layout_get_match_data(struct nvmem_device *nvmem,
-			    struct nvmem_layout *layout)
-{
-	return NULL;
-}
-
 #endif /* CONFIG_NVMEM */
 
 #if IS_ENABLED(CONFIG_NVMEM) && IS_ENABLED(CONFIG_OF)
-- 
cgit v1.2.3


From 33cf42e68efc8ff529a7eee08a4f0ba8c8d0a207 Mon Sep 17 00:00:00 2001
From: Rafał Miłecki <rafal@milecki.pl>
Date: Thu, 21 Dec 2023 18:34:17 +0100
Subject: nvmem: core: add nvmem_dev_size() helper
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This is required by layouts that need to read whole NVMEM content. It's
especially useful for NVMEM devices without hardcoded layout (like
U-Boot environment data block).

Signed-off-by: Rafał Miłecki <rafal@milecki.pl>
Reviewed-by: Miquel Raynal <miquel.raynal@bootlin.com>
Link: https://lore.kernel.org/r/20231221173421.13737-2-zajec5@gmail.com
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/nvmem/core.c           | 13 +++++++++++++
 include/linux/nvmem-consumer.h |  1 +
 2 files changed, 14 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/nvmem/core.c b/drivers/nvmem/core.c
index 4ed54076346d..980123fb4dde 100644
--- a/drivers/nvmem/core.c
+++ b/drivers/nvmem/core.c
@@ -2163,6 +2163,19 @@ const char *nvmem_dev_name(struct nvmem_device *nvmem)
 }
 EXPORT_SYMBOL_GPL(nvmem_dev_name);
 
+/**
+ * nvmem_dev_size() - Get the size of a given nvmem device.
+ *
+ * @nvmem: nvmem device.
+ *
+ * Return: size of the nvmem device.
+ */
+size_t nvmem_dev_size(struct nvmem_device *nvmem)
+{
+	return nvmem->size;
+}
+EXPORT_SYMBOL_GPL(nvmem_dev_size);
+
 static int __init nvmem_init(void)
 {
 	int ret;
diff --git a/include/linux/nvmem-consumer.h b/include/linux/nvmem-consumer.h
index 2d306fa13b1a..34c0e58dfa26 100644
--- a/include/linux/nvmem-consumer.h
+++ b/include/linux/nvmem-consumer.h
@@ -81,6 +81,7 @@ int nvmem_device_cell_write(struct nvmem_device *nvmem,
 			    struct nvmem_cell_info *info, void *buf);
 
 const char *nvmem_dev_name(struct nvmem_device *nvmem);
+size_t nvmem_dev_size(struct nvmem_device *nvmem);
 
 void nvmem_add_cell_lookups(struct nvmem_cell_lookup *entries,
 			    size_t nentries);
-- 
cgit v1.2.3


From aeda33ab8160c7a2e24ba4f44492ad1e974ddc7d Mon Sep 17 00:00:00 2001
From: Abhijit Gangurde <abhijit.gangurde@amd.com>
Date: Fri, 22 Dec 2023 12:16:26 +0530
Subject: cdx: create sysfs bin files for cdx resources

Resource binary file contains the content of the memory regions.
These resources<x> devices can be used to mmap the MMIO regions in
the user-space.

Co-developed-by: Puneet Gupta <puneet.gupta@amd.com>
Signed-off-by: Puneet Gupta <puneet.gupta@amd.com>
Signed-off-by: Abhijit Gangurde <abhijit.gangurde@amd.com>
Link: https://lore.kernel.org/r/20231222064627.2828960-1-abhijit.gangurde@amd.com
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 Documentation/ABI/testing/sysfs-bus-cdx |   7 ++
 drivers/cdx/cdx.c                       | 118 +++++++++++++++++++++++++++++++-
 include/linux/cdx/cdx_bus.h             |  10 +++
 3 files changed, 134 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/Documentation/ABI/testing/sysfs-bus-cdx b/Documentation/ABI/testing/sysfs-bus-cdx
index 8c067ff99e54..e84277531414 100644
--- a/Documentation/ABI/testing/sysfs-bus-cdx
+++ b/Documentation/ABI/testing/sysfs-bus-cdx
@@ -98,6 +98,13 @@ Description:
 
 		  # echo 1 > /sys/bus/cdx/devices/.../remove
 
+What:		/sys/bus/cdx/devices/.../resource<N>
+Date:		July 2023
+Contact:	puneet.gupta@amd.com
+Description:
+		The resource binary file contains the content of the memory
+		regions. These files can be m'maped from userspace.
+
 What:		/sys/bus/cdx/devices/.../modalias
 Date:		July 2023
 Contact:	nipun.gupta@amd.com
diff --git a/drivers/cdx/cdx.c b/drivers/cdx/cdx.c
index d84d153078d7..2ec1846ff063 100644
--- a/drivers/cdx/cdx.c
+++ b/drivers/cdx/cdx.c
@@ -80,6 +80,8 @@ static DEFINE_MUTEX(cdx_controller_lock);
 
 static char *compat_node_name = "xlnx,versal-net-cdx";
 
+static void cdx_destroy_res_attr(struct cdx_device *cdx_dev, int num);
+
 /**
  * cdx_dev_reset - Reset a CDX device
  * @dev: CDX device
@@ -148,6 +150,7 @@ static int cdx_unregister_device(struct device *dev,
 		if (cdx_dev->enabled && cdx->ops->bus_disable)
 			cdx->ops->bus_disable(cdx, cdx_dev->bus_num);
 	} else {
+		cdx_destroy_res_attr(cdx_dev, MAX_CDX_DEV_RESOURCES);
 		kfree(cdx_dev->driver_override);
 		cdx_dev->driver_override = NULL;
 	}
@@ -643,11 +646,105 @@ static void cdx_device_release(struct device *dev)
 	kfree(cdx_dev);
 }
 
+static const struct vm_operations_struct cdx_phys_vm_ops = {
+#ifdef CONFIG_HAVE_IOREMAP_PROT
+	.access = generic_access_phys,
+#endif
+};
+
+/**
+ * cdx_mmap_resource - map a CDX resource into user memory space
+ * @fp: File pointer. Not used in this function, but required where
+ *      this API is registered as a callback.
+ * @kobj: kobject for mapping
+ * @attr: struct bin_attribute for the file being mapped
+ * @vma: struct vm_area_struct passed into the mmap
+ *
+ * Use the regular CDX mapping routines to map a CDX resource into userspace.
+ *
+ * Return: true on success, false otherwise.
+ */
+static int cdx_mmap_resource(struct file *fp, struct kobject *kobj,
+			     struct bin_attribute *attr,
+			     struct vm_area_struct *vma)
+{
+	struct cdx_device *cdx_dev = to_cdx_device(kobj_to_dev(kobj));
+	int num = (unsigned long)attr->private;
+	struct resource *res;
+	unsigned long size;
+
+	res = &cdx_dev->res[num];
+	if (iomem_is_exclusive(res->start))
+		return -EINVAL;
+
+	/* Make sure the caller is mapping a valid resource for this device */
+	size = ((cdx_resource_len(cdx_dev, num) - 1) >> PAGE_SHIFT) + 1;
+	if (vma->vm_pgoff + vma_pages(vma) > size)
+		return -EINVAL;
+
+	/*
+	 * Map memory region and vm->vm_pgoff is expected to be an
+	 * offset within that region.
+	 */
+	vma->vm_page_prot = pgprot_device(vma->vm_page_prot);
+	vma->vm_pgoff += (cdx_resource_start(cdx_dev, num) >> PAGE_SHIFT);
+	vma->vm_ops = &cdx_phys_vm_ops;
+	return io_remap_pfn_range(vma, vma->vm_start, vma->vm_pgoff,
+				  vma->vm_end - vma->vm_start,
+				  vma->vm_page_prot);
+}
+
+static void cdx_destroy_res_attr(struct cdx_device *cdx_dev, int num)
+{
+	int i;
+
+	/* removing the bin attributes */
+	for (i = 0; i < num; i++) {
+		struct bin_attribute *res_attr;
+
+		res_attr = cdx_dev->res_attr[i];
+		if (res_attr) {
+			sysfs_remove_bin_file(&cdx_dev->dev.kobj, res_attr);
+			kfree(res_attr);
+		}
+	}
+}
+
+#define CDX_RES_ATTR_NAME_LEN	10
+static int cdx_create_res_attr(struct cdx_device *cdx_dev, int num)
+{
+	struct bin_attribute *res_attr;
+	char *res_attr_name;
+	int ret;
+
+	res_attr = kzalloc(sizeof(*res_attr) + CDX_RES_ATTR_NAME_LEN, GFP_ATOMIC);
+	if (!res_attr)
+		return -ENOMEM;
+
+	res_attr_name = (char *)(res_attr + 1);
+
+	sysfs_bin_attr_init(res_attr);
+
+	cdx_dev->res_attr[num] = res_attr;
+	sprintf(res_attr_name, "resource%d", num);
+
+	res_attr->mmap = cdx_mmap_resource;
+	res_attr->attr.name = res_attr_name;
+	res_attr->attr.mode = 0600;
+	res_attr->size = cdx_resource_len(cdx_dev, num);
+	res_attr->private = (void *)(unsigned long)num;
+	ret = sysfs_create_bin_file(&cdx_dev->dev.kobj, res_attr);
+	if (ret)
+		kfree(res_attr);
+
+	return ret;
+}
+
 int cdx_device_add(struct cdx_dev_params *dev_params)
 {
 	struct cdx_controller *cdx = dev_params->cdx;
 	struct cdx_device *cdx_dev;
-	int ret;
+	int ret, i;
 
 	cdx_dev = kzalloc(sizeof(*cdx_dev), GFP_KERNEL);
 	if (!cdx_dev)
@@ -690,7 +787,26 @@ int cdx_device_add(struct cdx_dev_params *dev_params)
 		goto fail;
 	}
 
+	/* Create resource<N> attributes */
+	for (i = 0; i < MAX_CDX_DEV_RESOURCES; i++) {
+		if (cdx_resource_flags(cdx_dev, i) & IORESOURCE_MEM) {
+			/* skip empty resources */
+			if (!cdx_resource_len(cdx_dev, i))
+				continue;
+
+			ret = cdx_create_res_attr(cdx_dev, i);
+			if (ret != 0) {
+				dev_err(&cdx_dev->dev,
+					"cdx device resource<%d> file creation failed: %d", i, ret);
+				goto resource_create_fail;
+			}
+		}
+	}
+
 	return 0;
+resource_create_fail:
+	cdx_destroy_res_attr(cdx_dev, i);
+	device_del(&cdx_dev->dev);
 fail:
 	/*
 	 * Do not free cdx_dev here as it would be freed in
diff --git a/include/linux/cdx/cdx_bus.h b/include/linux/cdx/cdx_bus.h
index 94ad2c9017c9..fab9e62c7e7d 100644
--- a/include/linux/cdx/cdx_bus.h
+++ b/include/linux/cdx/cdx_bus.h
@@ -135,6 +135,7 @@ struct cdx_device {
 	u8 bus_num;
 	u8 dev_num;
 	struct resource res[MAX_CDX_DEV_RESOURCES];
+	struct bin_attribute *res_attr[MAX_CDX_DEV_RESOURCES];
 	u8 res_count;
 	u64 dma_mask;
 	u16 flags;
@@ -147,6 +148,15 @@ struct cdx_device {
 #define to_cdx_device(_dev) \
 	container_of(_dev, struct cdx_device, dev)
 
+#define cdx_resource_start(dev, num)	((dev)->res[(num)].start)
+#define cdx_resource_end(dev, num)	((dev)->res[(num)].end)
+#define cdx_resource_flags(dev, num)	((dev)->res[(num)].flags)
+#define cdx_resource_len(dev, num) \
+	((cdx_resource_start((dev), (num)) == 0 &&	\
+	  cdx_resource_end((dev), (num)) ==		\
+	  cdx_resource_start((dev), (num))) ? 0 :	\
+	 (cdx_resource_end((dev), (num)) -		\
+	  cdx_resource_start((dev), (num)) + 1))
 /**
  * struct cdx_driver - CDX device driver
  * @driver: Generic device driver
-- 
cgit v1.2.3


From cf60af04edfe51fca488246c9959904adb2750fa Mon Sep 17 00:00:00 2001
From: Abhijit Gangurde <abhijit.gangurde@amd.com>
Date: Fri, 22 Dec 2023 12:16:27 +0530
Subject: cdx: Create resource debugfs file for cdx device

resource debugfs file contains host addresses of CDX device resources.
Each line of the resource file describe type of resource, a region
with start-end and flag fields.

Signed-off-by: Abhijit Gangurde <abhijit.gangurde@amd.com>
Link: https://lore.kernel.org/r/20231222064627.2828960-2-abhijit.gangurde@amd.com
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/cdx/cdx.c           | 39 ++++++++++++++++++++++++++++++++++++++-
 include/linux/cdx/cdx_bus.h |  2 ++
 2 files changed, 40 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/drivers/cdx/cdx.c b/drivers/cdx/cdx.c
index 2ec1846ff063..c0fca63c7434 100644
--- a/drivers/cdx/cdx.c
+++ b/drivers/cdx/cdx.c
@@ -67,6 +67,7 @@
 #include <linux/cdx/cdx_bus.h>
 #include <linux/iommu.h>
 #include <linux/dma-map-ops.h>
+#include <linux/debugfs.h>
 #include "cdx.h"
 
 /* Default DMA mask for devices on a CDX bus */
@@ -77,6 +78,8 @@
 static DEFINE_IDA(cdx_controller_ida);
 /* Lock to protect controller ops */
 static DEFINE_MUTEX(cdx_controller_lock);
+/* Debugfs dir for cdx bus */
+static struct dentry *cdx_debugfs_dir;
 
 static char *compat_node_name = "xlnx,versal-net-cdx";
 
@@ -151,6 +154,7 @@ static int cdx_unregister_device(struct device *dev,
 			cdx->ops->bus_disable(cdx, cdx_dev->bus_num);
 	} else {
 		cdx_destroy_res_attr(cdx_dev, MAX_CDX_DEV_RESOURCES);
+		debugfs_remove_recursive(cdx_dev->debugfs_dir);
 		kfree(cdx_dev->driver_override);
 		cdx_dev->driver_override = NULL;
 	}
@@ -554,6 +558,31 @@ static const struct attribute_group *cdx_dev_groups[] = {
 	NULL,
 };
 
+static int cdx_debug_resource_show(struct seq_file *s, void *data)
+{
+	struct cdx_device *cdx_dev = s->private;
+	int i;
+
+	for (i = 0; i < MAX_CDX_DEV_RESOURCES; i++) {
+		struct resource *res =  &cdx_dev->res[i];
+
+		seq_printf(s, "%pr\n", res);
+	}
+
+	return 0;
+}
+DEFINE_SHOW_ATTRIBUTE(cdx_debug_resource);
+
+static void cdx_device_debugfs_init(struct cdx_device *cdx_dev)
+{
+	cdx_dev->debugfs_dir = debugfs_create_dir(dev_name(&cdx_dev->dev), cdx_debugfs_dir);
+	if (IS_ERR(cdx_dev->debugfs_dir))
+		return;
+
+	debugfs_create_file("resource", 0444, cdx_dev->debugfs_dir, cdx_dev,
+			    &cdx_debug_resource_fops);
+}
+
 static ssize_t rescan_store(const struct bus_type *bus,
 			    const char *buf, size_t count)
 {
@@ -803,6 +832,8 @@ int cdx_device_add(struct cdx_dev_params *dev_params)
 		}
 	}
 
+	cdx_device_debugfs_init(cdx_dev);
+
 	return 0;
 resource_create_fail:
 	cdx_destroy_res_attr(cdx_dev, i);
@@ -907,6 +938,12 @@ EXPORT_SYMBOL_NS_GPL(cdx_unregister_controller, CDX_BUS_CONTROLLER);
 
 static int __init cdx_bus_init(void)
 {
-	return bus_register(&cdx_bus_type);
+	int ret;
+
+	ret = bus_register(&cdx_bus_type);
+	if (!ret)
+		cdx_debugfs_dir = debugfs_create_dir(cdx_bus_type.name, NULL);
+
+	return ret;
 }
 postcore_initcall(cdx_bus_init);
diff --git a/include/linux/cdx/cdx_bus.h b/include/linux/cdx/cdx_bus.h
index fab9e62c7e7d..6355a36a3f81 100644
--- a/include/linux/cdx/cdx_bus.h
+++ b/include/linux/cdx/cdx_bus.h
@@ -113,6 +113,7 @@ struct cdx_controller {
  * @dev_num: Device number for this device
  * @res: array of MMIO region entries
  * @res_attr: resource binary attribute
+ * @debugfs_dir: debugfs directory for this device
  * @res_count: number of valid MMIO regions
  * @dma_mask: Default DMA mask
  * @flags: CDX device flags
@@ -136,6 +137,7 @@ struct cdx_device {
 	u8 dev_num;
 	struct resource res[MAX_CDX_DEV_RESOURCES];
 	struct bin_attribute *res_attr[MAX_CDX_DEV_RESOURCES];
+	struct dentry *debugfs_dir;
 	u8 res_count;
 	u64 dma_mask;
 	u16 flags;
-- 
cgit v1.2.3


From a87e55bfa25c195b3aaa25369175905ba9527fff Mon Sep 17 00:00:00 2001
From: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Date: Tue, 19 Dec 2023 15:26:38 +0100
Subject: moxtet: remove unused moxtet_type declaration
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

For some reason, moxtet_type was defined in moxtet.h, but never actually
used.  Looks like a left-over from the original commit that was
exporting the moxtet bus type, but that wasn't needed, and it was a
different variable name, so no one noticed this one dangling around.

Cc: Marek Behún <kabel@kernel.org>
Link: https://lore.kernel.org/r/2023121937-pants-heroics-17c1@gregkh
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 include/linux/moxtet.h | 2 --
 1 file changed, 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/moxtet.h b/include/linux/moxtet.h
index 79184948fab4..ac577699edfd 100644
--- a/include/linux/moxtet.h
+++ b/include/linux/moxtet.h
@@ -35,8 +35,6 @@ enum turris_mox_module_id {
 
 #define MOXTET_NIRQS	16
 
-extern struct bus_type moxtet_type;
-
 struct moxtet {
 	struct device			*dev;
 	struct mutex			lock;
-- 
cgit v1.2.3


From e1be24b2e1190a7662462e8e398189ac795339cd Mon Sep 17 00:00:00 2001
From: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Date: Tue, 19 Dec 2023 18:18:58 +0100
Subject: platform/surface: aggregator: make ssam_bus_type constant and static
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Now that the driver core can properly handle constant struct bus_type,
move the ssam_bus_type variable to be a constant structure as well,
placing it into read-only memory which can not be modified at runtime.

It's also never used outside of
drivers/platform/surface/aggregator/bus.c so make it static and don't
export it as no one is using it.

Cc: Maximilian Luz <luzmaximilian@gmail.com>
Cc: Hans de Goede <hdegoede@redhat.com>
Cc: Ilpo Järvinen <ilpo.jarvinen@linux.intel.com>
Cc:  <platform-driver-x86@vger.kernel.org>
Reviewed-by: Maximilian Luz <luzmaximilian@gmail.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Link: https://lore.kernel.org/r/2023121957-tapered-upswing-8326@gregkh
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/platform/surface/aggregator/bus.c | 5 +++--
 include/linux/surface_aggregator/device.h | 1 -
 2 files changed, 3 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/platform/surface/aggregator/bus.c b/drivers/platform/surface/aggregator/bus.c
index 42ccd7f1c9b9..118caa651bec 100644
--- a/drivers/platform/surface/aggregator/bus.c
+++ b/drivers/platform/surface/aggregator/bus.c
@@ -35,6 +35,8 @@ static struct attribute *ssam_device_attrs[] = {
 };
 ATTRIBUTE_GROUPS(ssam_device);
 
+static const struct bus_type ssam_bus_type;
+
 static int ssam_device_uevent(const struct device *dev, struct kobj_uevent_env *env)
 {
 	const struct ssam_device *sdev = to_ssam_device(dev);
@@ -329,13 +331,12 @@ static void ssam_bus_remove(struct device *dev)
 		sdrv->remove(to_ssam_device(dev));
 }
 
-struct bus_type ssam_bus_type = {
+static const struct bus_type ssam_bus_type = {
 	.name   = "surface_aggregator",
 	.match  = ssam_bus_match,
 	.probe  = ssam_bus_probe,
 	.remove = ssam_bus_remove,
 };
-EXPORT_SYMBOL_GPL(ssam_bus_type);
 
 /**
  * __ssam_device_driver_register() - Register a SSAM client device driver.
diff --git a/include/linux/surface_aggregator/device.h b/include/linux/surface_aggregator/device.h
index 42b249b4c24b..8cd8c38cf3f3 100644
--- a/include/linux/surface_aggregator/device.h
+++ b/include/linux/surface_aggregator/device.h
@@ -193,7 +193,6 @@ struct ssam_device_driver {
 
 #ifdef CONFIG_SURFACE_AGGREGATOR_BUS
 
-extern struct bus_type ssam_bus_type;
 extern const struct device_type ssam_device_type;
 
 /**
-- 
cgit v1.2.3


From 0c4b2255b7afbcc80f4efcc8f67425162f49c263 Mon Sep 17 00:00:00 2001
From: Jay Buddhabhatti <jay.buddhabhatti@amd.com>
Date: Mon, 18 Dec 2023 21:50:24 -0800
Subject: firmware: xilinx: Export function to use in other module

Export zynqmp_pm_get_family_info() to access and find family information
in other module.

Signed-off-by: Jay Buddhabhatti <jay.buddhabhatti@amd.com>
Link: https://lore.kernel.org/r/20231219055025.27570-2-jay.buddhabhatti@amd.com
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/firmware/xilinx/zynqmp.c     | 3 ++-
 include/linux/firmware/xlnx-zynqmp.h | 6 ++++++
 2 files changed, 8 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/drivers/firmware/xilinx/zynqmp.c b/drivers/firmware/xilinx/zynqmp.c
index 73cae6ef83ec..79789f0563f6 100644
--- a/drivers/firmware/xilinx/zynqmp.c
+++ b/drivers/firmware/xilinx/zynqmp.c
@@ -463,7 +463,7 @@ EXPORT_SYMBOL_GPL(zynqmp_pm_get_chipid);
  *
  * Return: Returns status, either success or error+reason
  */
-static int zynqmp_pm_get_family_info(u32 *family, u32 *subfamily)
+int zynqmp_pm_get_family_info(u32 *family, u32 *subfamily)
 {
 	u32 ret_payload[PAYLOAD_ARG_CNT];
 	u32 idcode;
@@ -488,6 +488,7 @@ static int zynqmp_pm_get_family_info(u32 *family, u32 *subfamily)
 
 	return 0;
 }
+EXPORT_SYMBOL_GPL(zynqmp_pm_get_family_info);
 
 /**
  * zynqmp_pm_get_trustzone_version() - Get secure trustzone firmware version
diff --git a/include/linux/firmware/xlnx-zynqmp.h b/include/linux/firmware/xlnx-zynqmp.h
index 1478f691cc10..06f4e6eaf13e 100644
--- a/include/linux/firmware/xlnx-zynqmp.h
+++ b/include/linux/firmware/xlnx-zynqmp.h
@@ -524,6 +524,7 @@ int zynqmp_pm_invoke_fn(u32 pm_api_id, u32 *ret_payload, u32 num_args, ...);
 #if IS_REACHABLE(CONFIG_ZYNQMP_FIRMWARE)
 int zynqmp_pm_get_api_version(u32 *version);
 int zynqmp_pm_get_chipid(u32 *idcode, u32 *version);
+int zynqmp_pm_get_family_info(u32 *family, u32 *subfamily);
 int zynqmp_pm_query_data(struct zynqmp_pm_query_data qdata, u32 *out);
 int zynqmp_pm_clock_enable(u32 clock_id);
 int zynqmp_pm_clock_disable(u32 clock_id);
@@ -602,6 +603,11 @@ static inline int zynqmp_pm_get_chipid(u32 *idcode, u32 *version)
 	return -ENODEV;
 }
 
+static inline int zynqmp_pm_get_family_info(u32 *family, u32 *subfamily)
+{
+	return -ENODEV;
+}
+
 static inline int zynqmp_pm_query_data(struct zynqmp_pm_query_data qdata,
 				       u32 *out)
 {
-- 
cgit v1.2.3


From 97d62760e441af9ed393e127a46172f9534b5808 Mon Sep 17 00:00:00 2001
From: Jay Buddhabhatti <jay.buddhabhatti@amd.com>
Date: Mon, 18 Dec 2023 21:50:25 -0800
Subject: drivers: soc: xilinx: add check for platform

Some error event IDs for Versal and Versal NET are different.
Both the platforms should access their respective error event
IDs so use sub_family_code to check for platform and check
error IDs for respective platforms. The family code is passed
via platform data to avoid platform detection again.
Platform data is setup when even driver is registered.

Signed-off-by: Jay Buddhabhatti <jay.buddhabhatti@amd.com>
Link: https://lore.kernel.org/r/20231219055025.27570-3-jay.buddhabhatti@amd.com
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/edac/versal_edac.c              |  4 ++--
 drivers/soc/xilinx/xlnx_event_manager.c | 25 ++++++++++++++++++++-----
 include/linux/firmware/xlnx-zynqmp.h    | 16 ++++++++++++----
 3 files changed, 34 insertions(+), 11 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/edac/versal_edac.c b/drivers/edac/versal_edac.c
index 87e730dfefa0..116ed9b375de 100644
--- a/drivers/edac/versal_edac.c
+++ b/drivers/edac/versal_edac.c
@@ -1005,7 +1005,7 @@ static int mc_probe(struct platform_device *pdev)
 		goto free_edac_mc;
 	}
 
-	rc = xlnx_register_event(PM_NOTIFY_CB, EVENT_ERROR_PMC_ERR1,
+	rc = xlnx_register_event(PM_NOTIFY_CB, VERSAL_EVENT_ERROR_PMC_ERR1,
 				 XPM_EVENT_ERROR_MASK_DDRMC_CR | XPM_EVENT_ERROR_MASK_DDRMC_NCR |
 				 XPM_EVENT_ERROR_MASK_NOC_CR | XPM_EVENT_ERROR_MASK_NOC_NCR,
 				 false, err_callback, mci);
@@ -1042,7 +1042,7 @@ static int mc_remove(struct platform_device *pdev)
 	debugfs_remove_recursive(priv->debugfs);
 #endif
 
-	xlnx_unregister_event(PM_NOTIFY_CB, EVENT_ERROR_PMC_ERR1,
+	xlnx_unregister_event(PM_NOTIFY_CB, VERSAL_EVENT_ERROR_PMC_ERR1,
 			      XPM_EVENT_ERROR_MASK_DDRMC_CR |
 			      XPM_EVENT_ERROR_MASK_NOC_CR |
 			      XPM_EVENT_ERROR_MASK_NOC_NCR |
diff --git a/drivers/soc/xilinx/xlnx_event_manager.c b/drivers/soc/xilinx/xlnx_event_manager.c
index 27a8d89a0a09..2ce4c8e01f57 100644
--- a/drivers/soc/xilinx/xlnx_event_manager.c
+++ b/drivers/soc/xilinx/xlnx_event_manager.c
@@ -77,11 +77,26 @@ struct registered_event_data {
 
 static bool xlnx_is_error_event(const u32 node_id)
 {
-	if (node_id == EVENT_ERROR_PMC_ERR1 ||
-	    node_id == EVENT_ERROR_PMC_ERR2 ||
-	    node_id == EVENT_ERROR_PSM_ERR1 ||
-	    node_id == EVENT_ERROR_PSM_ERR2)
-		return true;
+	u32 pm_family_code, pm_sub_family_code;
+
+	zynqmp_pm_get_family_info(&pm_family_code, &pm_sub_family_code);
+
+	if (pm_sub_family_code == VERSAL_SUB_FAMILY_CODE) {
+		if (node_id == VERSAL_EVENT_ERROR_PMC_ERR1 ||
+		    node_id == VERSAL_EVENT_ERROR_PMC_ERR2 ||
+		    node_id == VERSAL_EVENT_ERROR_PSM_ERR1 ||
+		    node_id == VERSAL_EVENT_ERROR_PSM_ERR2)
+			return true;
+	} else {
+		if (node_id == VERSAL_NET_EVENT_ERROR_PMC_ERR1 ||
+		    node_id == VERSAL_NET_EVENT_ERROR_PMC_ERR2 ||
+		    node_id == VERSAL_NET_EVENT_ERROR_PMC_ERR3 ||
+		    node_id == VERSAL_NET_EVENT_ERROR_PSM_ERR1 ||
+		    node_id == VERSAL_NET_EVENT_ERROR_PSM_ERR2 ||
+		    node_id == VERSAL_NET_EVENT_ERROR_PSM_ERR3 ||
+		    node_id == VERSAL_NET_EVENT_ERROR_PSM_ERR4)
+			return true;
+	}
 
 	return false;
 }
diff --git a/include/linux/firmware/xlnx-zynqmp.h b/include/linux/firmware/xlnx-zynqmp.h
index 06f4e6eaf13e..9a7e52739251 100644
--- a/include/linux/firmware/xlnx-zynqmp.h
+++ b/include/linux/firmware/xlnx-zynqmp.h
@@ -95,10 +95,18 @@
 /*
  * Node IDs for the Error Events.
  */
-#define EVENT_ERROR_PMC_ERR1	(0x28100000U)
-#define EVENT_ERROR_PMC_ERR2	(0x28104000U)
-#define EVENT_ERROR_PSM_ERR1	(0x28108000U)
-#define EVENT_ERROR_PSM_ERR2	(0x2810C000U)
+#define VERSAL_EVENT_ERROR_PMC_ERR1	(0x28100000U)
+#define VERSAL_EVENT_ERROR_PMC_ERR2	(0x28104000U)
+#define VERSAL_EVENT_ERROR_PSM_ERR1	(0x28108000U)
+#define VERSAL_EVENT_ERROR_PSM_ERR2	(0x2810C000U)
+
+#define VERSAL_NET_EVENT_ERROR_PMC_ERR1	(0x28100000U)
+#define VERSAL_NET_EVENT_ERROR_PMC_ERR2	(0x28104000U)
+#define VERSAL_NET_EVENT_ERROR_PMC_ERR3	(0x28108000U)
+#define VERSAL_NET_EVENT_ERROR_PSM_ERR1	(0x2810C000U)
+#define VERSAL_NET_EVENT_ERROR_PSM_ERR2	(0x28110000U)
+#define VERSAL_NET_EVENT_ERROR_PSM_ERR3	(0x28114000U)
+#define VERSAL_NET_EVENT_ERROR_PSM_ERR4	(0x28118000U)
 
 /* ZynqMP SD tap delay tuning */
 #define SD_ITAPDLY	0xFF180314
-- 
cgit v1.2.3


From 98e20e5e13d2811898921f999288be7151a11954 Mon Sep 17 00:00:00 2001
From: Quentin Deslandes <qde@naccy.de>
Date: Tue, 26 Dec 2023 14:07:42 +0100
Subject: bpfilter: remove bpfilter

bpfilter was supposed to convert iptables filtering rules into
BPF programs on the fly, from the kernel, through a usermode
helper. The base code for the UMH was introduced in 2018, and
couple of attempts (2, 3) tried to introduce the BPF program
generate features but were abandoned.

bpfilter now sits in a kernel tree unused and unusable, occasionally
causing confusion amongst Linux users (4, 5).

As bpfilter is now developed in a dedicated repository on GitHub (6),
it was suggested a couple of times this year (LSFMM/BPF 2023,
LPC 2023) to remove the deprecated kernel part of the project. This
is the purpose of this patch.

[1]: https://lore.kernel.org/lkml/20180522022230.2492505-1-ast@kernel.org/
[2]: https://lore.kernel.org/bpf/20210829183608.2297877-1-me@ubique.spb.ru/#t
[3]: https://lore.kernel.org/lkml/20221224000402.476079-1-qde@naccy.de/
[4]: https://dxuuu.xyz/bpfilter.html
[5]: https://github.com/linuxkit/linuxkit/pull/3904
[6]: https://github.com/facebook/bpfilter

Signed-off-by: Quentin Deslandes <qde@naccy.de>
Link: https://lore.kernel.org/r/20231226130745.465988-1-qde@naccy.de
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 arch/loongarch/configs/loongson3_defconfig |   1 -
 include/linux/bpfilter.h                   |  24 -----
 include/uapi/linux/bpfilter.h              |  21 -----
 net/Kconfig                                |   2 -
 net/Makefile                               |   1 -
 net/bpfilter/.gitignore                    |   2 -
 net/bpfilter/Kconfig                       |  23 -----
 net/bpfilter/Makefile                      |  20 -----
 net/bpfilter/bpfilter_kern.c               | 136 -----------------------------
 net/bpfilter/bpfilter_umh_blob.S           |   7 --
 net/bpfilter/main.c                        |  64 --------------
 net/bpfilter/msgfmt.h                      |  17 ----
 net/ipv4/Makefile                          |   2 -
 net/ipv4/bpfilter/Makefile                 |   2 -
 net/ipv4/bpfilter/sockopt.c                |  71 ---------------
 net/ipv4/ip_sockglue.c                     |  12 ---
 tools/bpf/bpftool/feature.c                |   4 -
 tools/testing/selftests/bpf/config.aarch64 |   1 -
 tools/testing/selftests/bpf/config.s390x   |   1 -
 tools/testing/selftests/bpf/config.x86_64  |   1 -
 tools/testing/selftests/hid/config         |   1 -
 21 files changed, 413 deletions(-)
 delete mode 100644 include/linux/bpfilter.h
 delete mode 100644 include/uapi/linux/bpfilter.h
 delete mode 100644 net/bpfilter/.gitignore
 delete mode 100644 net/bpfilter/Kconfig
 delete mode 100644 net/bpfilter/Makefile
 delete mode 100644 net/bpfilter/bpfilter_kern.c
 delete mode 100644 net/bpfilter/bpfilter_umh_blob.S
 delete mode 100644 net/bpfilter/main.c
 delete mode 100644 net/bpfilter/msgfmt.h
 delete mode 100644 net/ipv4/bpfilter/Makefile
 delete mode 100644 net/ipv4/bpfilter/sockopt.c

(limited to 'include/linux')

diff --git a/arch/loongarch/configs/loongson3_defconfig b/arch/loongarch/configs/loongson3_defconfig
index 9c333d133c30..60e331af9839 100644
--- a/arch/loongarch/configs/loongson3_defconfig
+++ b/arch/loongarch/configs/loongson3_defconfig
@@ -276,7 +276,6 @@ CONFIG_BRIDGE_EBT_T_NAT=m
 CONFIG_BRIDGE_EBT_ARP=m
 CONFIG_BRIDGE_EBT_IP=m
 CONFIG_BRIDGE_EBT_IP6=m
-CONFIG_BPFILTER=y
 CONFIG_IP_SCTP=m
 CONFIG_RDS=y
 CONFIG_L2TP=m
diff --git a/include/linux/bpfilter.h b/include/linux/bpfilter.h
deleted file mode 100644
index 736ded4905e0..000000000000
--- a/include/linux/bpfilter.h
+++ /dev/null
@@ -1,24 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _LINUX_BPFILTER_H
-#define _LINUX_BPFILTER_H
-
-#include <uapi/linux/bpfilter.h>
-#include <linux/usermode_driver.h>
-#include <linux/sockptr.h>
-
-struct sock;
-int bpfilter_ip_set_sockopt(struct sock *sk, int optname, sockptr_t optval,
-			    unsigned int optlen);
-int bpfilter_ip_get_sockopt(struct sock *sk, int optname, char __user *optval,
-			    int __user *optlen);
-
-struct bpfilter_umh_ops {
-	struct umd_info info;
-	/* since ip_getsockopt() can run in parallel, serialize access to umh */
-	struct mutex lock;
-	int (*sockopt)(struct sock *sk, int optname, sockptr_t optval,
-		       unsigned int optlen, bool is_set);
-	int (*start)(void);
-};
-extern struct bpfilter_umh_ops bpfilter_ops;
-#endif
diff --git a/include/uapi/linux/bpfilter.h b/include/uapi/linux/bpfilter.h
deleted file mode 100644
index cbc1f5813f50..000000000000
--- a/include/uapi/linux/bpfilter.h
+++ /dev/null
@@ -1,21 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
-#ifndef _UAPI_LINUX_BPFILTER_H
-#define _UAPI_LINUX_BPFILTER_H
-
-#include <linux/if.h>
-
-enum {
-	BPFILTER_IPT_SO_SET_REPLACE = 64,
-	BPFILTER_IPT_SO_SET_ADD_COUNTERS = 65,
-	BPFILTER_IPT_SET_MAX,
-};
-
-enum {
-	BPFILTER_IPT_SO_GET_INFO = 64,
-	BPFILTER_IPT_SO_GET_ENTRIES = 65,
-	BPFILTER_IPT_SO_GET_REVISION_MATCH = 66,
-	BPFILTER_IPT_SO_GET_REVISION_TARGET = 67,
-	BPFILTER_IPT_GET_MAX,
-};
-
-#endif /* _UAPI_LINUX_BPFILTER_H */
diff --git a/net/Kconfig b/net/Kconfig
index 3ec6bc98fa05..4adc47d0c9c2 100644
--- a/net/Kconfig
+++ b/net/Kconfig
@@ -233,8 +233,6 @@ source "net/bridge/netfilter/Kconfig"
 
 endif
 
-source "net/bpfilter/Kconfig"
-
 source "net/dccp/Kconfig"
 source "net/sctp/Kconfig"
 source "net/rds/Kconfig"
diff --git a/net/Makefile b/net/Makefile
index 4c4dc535453d..b06b5539e7a6 100644
--- a/net/Makefile
+++ b/net/Makefile
@@ -19,7 +19,6 @@ obj-$(CONFIG_TLS)		+= tls/
 obj-$(CONFIG_XFRM)		+= xfrm/
 obj-$(CONFIG_UNIX_SCM)		+= unix/
 obj-y				+= ipv6/
-obj-$(CONFIG_BPFILTER)		+= bpfilter/
 obj-$(CONFIG_PACKET)		+= packet/
 obj-$(CONFIG_NET_KEY)		+= key/
 obj-$(CONFIG_BRIDGE)		+= bridge/
diff --git a/net/bpfilter/.gitignore b/net/bpfilter/.gitignore
deleted file mode 100644
index f34e85ee8204..000000000000
--- a/net/bpfilter/.gitignore
+++ /dev/null
@@ -1,2 +0,0 @@
-# SPDX-License-Identifier: GPL-2.0-only
-bpfilter_umh
diff --git a/net/bpfilter/Kconfig b/net/bpfilter/Kconfig
deleted file mode 100644
index 3d4a21462458..000000000000
--- a/net/bpfilter/Kconfig
+++ /dev/null
@@ -1,23 +0,0 @@
-# SPDX-License-Identifier: GPL-2.0-only
-menuconfig BPFILTER
-	bool "BPF based packet filtering framework (BPFILTER)"
-	depends on BPF && INET
-	select USERMODE_DRIVER
-	help
-	  This builds experimental bpfilter framework that is aiming to
-	  provide netfilter compatible functionality via BPF
-
-if BPFILTER
-config BPFILTER_UMH
-	tristate "bpfilter kernel module with user mode helper"
-	depends on CC_CAN_LINK
-	depends on m || CC_CAN_LINK_STATIC
-	default m
-	help
-	  This builds bpfilter kernel module with embedded user mode helper
-
-	  Note: To compile this as built-in, your toolchain must support
-	  building static binaries, since rootfs isn't mounted at the time
-	  when __init functions are called and do_execv won't be able to find
-	  the elf interpreter.
-endif
diff --git a/net/bpfilter/Makefile b/net/bpfilter/Makefile
deleted file mode 100644
index cdac82b8c53a..000000000000
--- a/net/bpfilter/Makefile
+++ /dev/null
@@ -1,20 +0,0 @@
-# SPDX-License-Identifier: GPL-2.0
-#
-# Makefile for the Linux BPFILTER layer.
-#
-
-userprogs := bpfilter_umh
-bpfilter_umh-objs := main.o
-userccflags += -I $(srctree)/tools/include/ -I $(srctree)/tools/include/uapi
-
-ifeq ($(CONFIG_BPFILTER_UMH), y)
-# builtin bpfilter_umh should be linked with -static
-# since rootfs isn't mounted at the time of __init
-# function is called and do_execv won't find elf interpreter
-userldflags += -static
-endif
-
-$(obj)/bpfilter_umh_blob.o: $(obj)/bpfilter_umh
-
-obj-$(CONFIG_BPFILTER_UMH) += bpfilter.o
-bpfilter-objs += bpfilter_kern.o bpfilter_umh_blob.o
diff --git a/net/bpfilter/bpfilter_kern.c b/net/bpfilter/bpfilter_kern.c
deleted file mode 100644
index 97e129e3f31c..000000000000
--- a/net/bpfilter/bpfilter_kern.c
+++ /dev/null
@@ -1,136 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
-#include <linux/init.h>
-#include <linux/module.h>
-#include <linux/umh.h>
-#include <linux/bpfilter.h>
-#include <linux/sched.h>
-#include <linux/sched/signal.h>
-#include <linux/fs.h>
-#include <linux/file.h>
-#include "msgfmt.h"
-
-extern char bpfilter_umh_start;
-extern char bpfilter_umh_end;
-
-static void shutdown_umh(void)
-{
-	struct umd_info *info = &bpfilter_ops.info;
-	struct pid *tgid = info->tgid;
-
-	if (tgid) {
-		kill_pid(tgid, SIGKILL, 1);
-		wait_event(tgid->wait_pidfd, thread_group_exited(tgid));
-		umd_cleanup_helper(info);
-	}
-}
-
-static void __stop_umh(void)
-{
-	if (IS_ENABLED(CONFIG_INET))
-		shutdown_umh();
-}
-
-static int bpfilter_send_req(struct mbox_request *req)
-{
-	struct mbox_reply reply;
-	loff_t pos = 0;
-	ssize_t n;
-
-	if (!bpfilter_ops.info.tgid)
-		return -EFAULT;
-	pos = 0;
-	n = kernel_write(bpfilter_ops.info.pipe_to_umh, req, sizeof(*req),
-			   &pos);
-	if (n != sizeof(*req)) {
-		pr_err("write fail %zd\n", n);
-		goto stop;
-	}
-	pos = 0;
-	n = kernel_read(bpfilter_ops.info.pipe_from_umh, &reply, sizeof(reply),
-			&pos);
-	if (n != sizeof(reply)) {
-		pr_err("read fail %zd\n", n);
-		goto stop;
-	}
-	return reply.status;
-stop:
-	__stop_umh();
-	return -EFAULT;
-}
-
-static int bpfilter_process_sockopt(struct sock *sk, int optname,
-				    sockptr_t optval, unsigned int optlen,
-				    bool is_set)
-{
-	struct mbox_request req = {
-		.is_set		= is_set,
-		.pid		= current->pid,
-		.cmd		= optname,
-		.addr		= (uintptr_t)optval.user,
-		.len		= optlen,
-	};
-	if (sockptr_is_kernel(optval)) {
-		pr_err("kernel access not supported\n");
-		return -EFAULT;
-	}
-	return bpfilter_send_req(&req);
-}
-
-static int start_umh(void)
-{
-	struct mbox_request req = { .pid = current->pid };
-	int err;
-
-	/* fork usermode process */
-	err = fork_usermode_driver(&bpfilter_ops.info);
-	if (err)
-		return err;
-	pr_info("Loaded bpfilter_umh pid %d\n", pid_nr(bpfilter_ops.info.tgid));
-
-	/* health check that usermode process started correctly */
-	if (bpfilter_send_req(&req) != 0) {
-		shutdown_umh();
-		return -EFAULT;
-	}
-
-	return 0;
-}
-
-static int __init load_umh(void)
-{
-	int err;
-
-	err = umd_load_blob(&bpfilter_ops.info,
-			    &bpfilter_umh_start,
-			    &bpfilter_umh_end - &bpfilter_umh_start);
-	if (err)
-		return err;
-
-	mutex_lock(&bpfilter_ops.lock);
-	err = start_umh();
-	if (!err && IS_ENABLED(CONFIG_INET)) {
-		bpfilter_ops.sockopt = &bpfilter_process_sockopt;
-		bpfilter_ops.start = &start_umh;
-	}
-	mutex_unlock(&bpfilter_ops.lock);
-	if (err)
-		umd_unload_blob(&bpfilter_ops.info);
-	return err;
-}
-
-static void __exit fini_umh(void)
-{
-	mutex_lock(&bpfilter_ops.lock);
-	if (IS_ENABLED(CONFIG_INET)) {
-		shutdown_umh();
-		bpfilter_ops.start = NULL;
-		bpfilter_ops.sockopt = NULL;
-	}
-	mutex_unlock(&bpfilter_ops.lock);
-
-	umd_unload_blob(&bpfilter_ops.info);
-}
-module_init(load_umh);
-module_exit(fini_umh);
-MODULE_LICENSE("GPL");
diff --git a/net/bpfilter/bpfilter_umh_blob.S b/net/bpfilter/bpfilter_umh_blob.S
deleted file mode 100644
index 40311d10d2f2..000000000000
--- a/net/bpfilter/bpfilter_umh_blob.S
+++ /dev/null
@@ -1,7 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-	.section .init.rodata, "a"
-	.global bpfilter_umh_start
-bpfilter_umh_start:
-	.incbin "net/bpfilter/bpfilter_umh"
-	.global bpfilter_umh_end
-bpfilter_umh_end:
diff --git a/net/bpfilter/main.c b/net/bpfilter/main.c
deleted file mode 100644
index 291a92546246..000000000000
--- a/net/bpfilter/main.c
+++ /dev/null
@@ -1,64 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-#define _GNU_SOURCE
-#include <sys/uio.h>
-#include <errno.h>
-#include <stdio.h>
-#include <sys/socket.h>
-#include <fcntl.h>
-#include <unistd.h>
-#include "../../include/uapi/linux/bpf.h"
-#include <asm/unistd.h>
-#include "msgfmt.h"
-
-FILE *debug_f;
-
-static int handle_get_cmd(struct mbox_request *cmd)
-{
-	switch (cmd->cmd) {
-	case 0:
-		return 0;
-	default:
-		break;
-	}
-	return -ENOPROTOOPT;
-}
-
-static int handle_set_cmd(struct mbox_request *cmd)
-{
-	return -ENOPROTOOPT;
-}
-
-static void loop(void)
-{
-	while (1) {
-		struct mbox_request req;
-		struct mbox_reply reply;
-		int n;
-
-		n = read(0, &req, sizeof(req));
-		if (n != sizeof(req)) {
-			fprintf(debug_f, "invalid request %d\n", n);
-			return;
-		}
-
-		reply.status = req.is_set ?
-			handle_set_cmd(&req) :
-			handle_get_cmd(&req);
-
-		n = write(1, &reply, sizeof(reply));
-		if (n != sizeof(reply)) {
-			fprintf(debug_f, "reply failed %d\n", n);
-			return;
-		}
-	}
-}
-
-int main(void)
-{
-	debug_f = fopen("/dev/kmsg", "w");
-	setvbuf(debug_f, 0, _IOLBF, 0);
-	fprintf(debug_f, "<5>Started bpfilter\n");
-	loop();
-	fclose(debug_f);
-	return 0;
-}
diff --git a/net/bpfilter/msgfmt.h b/net/bpfilter/msgfmt.h
deleted file mode 100644
index 98d121c62945..000000000000
--- a/net/bpfilter/msgfmt.h
+++ /dev/null
@@ -1,17 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _NET_BPFILTER_MSGFMT_H
-#define _NET_BPFILTER_MSGFMT_H
-
-struct mbox_request {
-	__u64 addr;
-	__u32 len;
-	__u32 is_set;
-	__u32 cmd;
-	__u32 pid;
-};
-
-struct mbox_reply {
-	__u32 status;
-};
-
-#endif
diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile
index e144a02a6a61..ec36d2ec059e 100644
--- a/net/ipv4/Makefile
+++ b/net/ipv4/Makefile
@@ -16,8 +16,6 @@ obj-y     := route.o inetpeer.o protocol.o \
 	     inet_fragment.o ping.o ip_tunnel_core.o gre_offload.o \
 	     metrics.o netlink.o nexthop.o udp_tunnel_stub.o
 
-obj-$(CONFIG_BPFILTER) += bpfilter/
-
 obj-$(CONFIG_NET_IP_TUNNEL) += ip_tunnel.o
 obj-$(CONFIG_SYSCTL) += sysctl_net_ipv4.o
 obj-$(CONFIG_PROC_FS) += proc.o
diff --git a/net/ipv4/bpfilter/Makefile b/net/ipv4/bpfilter/Makefile
deleted file mode 100644
index 00af5305e05a..000000000000
--- a/net/ipv4/bpfilter/Makefile
+++ /dev/null
@@ -1,2 +0,0 @@
-# SPDX-License-Identifier: GPL-2.0-only
-obj-$(CONFIG_BPFILTER) += sockopt.o
diff --git a/net/ipv4/bpfilter/sockopt.c b/net/ipv4/bpfilter/sockopt.c
deleted file mode 100644
index 193bcc2acccc..000000000000
--- a/net/ipv4/bpfilter/sockopt.c
+++ /dev/null
@@ -1,71 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-#include <linux/init.h>
-#include <linux/module.h>
-#include <linux/uaccess.h>
-#include <linux/bpfilter.h>
-#include <uapi/linux/bpf.h>
-#include <linux/wait.h>
-#include <linux/kmod.h>
-#include <linux/fs.h>
-#include <linux/file.h>
-
-struct bpfilter_umh_ops bpfilter_ops;
-EXPORT_SYMBOL_GPL(bpfilter_ops);
-
-static int bpfilter_mbox_request(struct sock *sk, int optname, sockptr_t optval,
-				 unsigned int optlen, bool is_set)
-{
-	int err;
-	mutex_lock(&bpfilter_ops.lock);
-	if (!bpfilter_ops.sockopt) {
-		mutex_unlock(&bpfilter_ops.lock);
-		request_module("bpfilter");
-		mutex_lock(&bpfilter_ops.lock);
-
-		if (!bpfilter_ops.sockopt) {
-			err = -ENOPROTOOPT;
-			goto out;
-		}
-	}
-	if (bpfilter_ops.info.tgid &&
-	    thread_group_exited(bpfilter_ops.info.tgid))
-		umd_cleanup_helper(&bpfilter_ops.info);
-
-	if (!bpfilter_ops.info.tgid) {
-		err = bpfilter_ops.start();
-		if (err)
-			goto out;
-	}
-	err = bpfilter_ops.sockopt(sk, optname, optval, optlen, is_set);
-out:
-	mutex_unlock(&bpfilter_ops.lock);
-	return err;
-}
-
-int bpfilter_ip_set_sockopt(struct sock *sk, int optname, sockptr_t optval,
-			    unsigned int optlen)
-{
-	return bpfilter_mbox_request(sk, optname, optval, optlen, true);
-}
-
-int bpfilter_ip_get_sockopt(struct sock *sk, int optname, char __user *optval,
-			    int __user *optlen)
-{
-	int len;
-
-	if (get_user(len, optlen))
-		return -EFAULT;
-
-	return bpfilter_mbox_request(sk, optname, USER_SOCKPTR(optval), len,
-				     false);
-}
-
-static int __init bpfilter_sockopt_init(void)
-{
-	mutex_init(&bpfilter_ops.lock);
-	bpfilter_ops.info.tgid = NULL;
-	bpfilter_ops.info.driver_name = "bpfilter_umh";
-
-	return 0;
-}
-device_initcall(bpfilter_sockopt_init);
diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c
index 66247e8b429e..7aa9dc0e6760 100644
--- a/net/ipv4/ip_sockglue.c
+++ b/net/ipv4/ip_sockglue.c
@@ -47,8 +47,6 @@
 #include <linux/errqueue.h>
 #include <linux/uaccess.h>
 
-#include <linux/bpfilter.h>
-
 /*
  *	SOL_IP control messages.
  */
@@ -1411,11 +1409,6 @@ int ip_setsockopt(struct sock *sk, int level, int optname, sockptr_t optval,
 		return -ENOPROTOOPT;
 
 	err = do_ip_setsockopt(sk, level, optname, optval, optlen);
-#if IS_ENABLED(CONFIG_BPFILTER_UMH)
-	if (optname >= BPFILTER_IPT_SO_SET_REPLACE &&
-	    optname < BPFILTER_IPT_SET_MAX)
-		err = bpfilter_ip_set_sockopt(sk, optname, optval, optlen);
-#endif
 #ifdef CONFIG_NETFILTER
 	/* we need to exclude all possible ENOPROTOOPTs except default case */
 	if (err == -ENOPROTOOPT && optname != IP_HDRINCL &&
@@ -1763,11 +1756,6 @@ int ip_getsockopt(struct sock *sk, int level,
 	err = do_ip_getsockopt(sk, level, optname,
 			       USER_SOCKPTR(optval), USER_SOCKPTR(optlen));
 
-#if IS_ENABLED(CONFIG_BPFILTER_UMH)
-	if (optname >= BPFILTER_IPT_SO_GET_INFO &&
-	    optname < BPFILTER_IPT_GET_MAX)
-		err = bpfilter_ip_get_sockopt(sk, optname, optval, optlen);
-#endif
 #ifdef CONFIG_NETFILTER
 	/* we need to exclude all possible ENOPROTOOPTs except default case */
 	if (err == -ENOPROTOOPT && optname != IP_PKTOPTIONS &&
diff --git a/tools/bpf/bpftool/feature.c b/tools/bpf/bpftool/feature.c
index edda4fc2c4d0..708733b0ea06 100644
--- a/tools/bpf/bpftool/feature.c
+++ b/tools/bpf/bpftool/feature.c
@@ -426,10 +426,6 @@ static void probe_kernel_image_config(const char *define_prefix)
 		{ "CONFIG_BPF_STREAM_PARSER", },
 		/* xt_bpf module for passing BPF programs to netfilter  */
 		{ "CONFIG_NETFILTER_XT_MATCH_BPF", },
-		/* bpfilter back-end for iptables */
-		{ "CONFIG_BPFILTER", },
-		/* bpftilter module with "user mode helper" */
-		{ "CONFIG_BPFILTER_UMH", },
 
 		/* test_bpf module for BPF tests */
 		{ "CONFIG_TEST_BPF", },
diff --git a/tools/testing/selftests/bpf/config.aarch64 b/tools/testing/selftests/bpf/config.aarch64
index 29c8635c5722..3720b7611523 100644
--- a/tools/testing/selftests/bpf/config.aarch64
+++ b/tools/testing/selftests/bpf/config.aarch64
@@ -11,7 +11,6 @@ CONFIG_BLK_DEV_IO_TRACE=y
 CONFIG_BLK_DEV_RAM=y
 CONFIG_BLK_DEV_SD=y
 CONFIG_BONDING=y
-CONFIG_BPFILTER=y
 CONFIG_BPF_JIT_ALWAYS_ON=y
 CONFIG_BPF_JIT_DEFAULT_ON=y
 CONFIG_BPF_PRELOAD_UMD=y
diff --git a/tools/testing/selftests/bpf/config.s390x b/tools/testing/selftests/bpf/config.s390x
index e93330382849..706931a8c2c6 100644
--- a/tools/testing/selftests/bpf/config.s390x
+++ b/tools/testing/selftests/bpf/config.s390x
@@ -9,7 +9,6 @@ CONFIG_BPF_JIT_ALWAYS_ON=y
 CONFIG_BPF_JIT_DEFAULT_ON=y
 CONFIG_BPF_PRELOAD=y
 CONFIG_BPF_PRELOAD_UMD=y
-CONFIG_BPFILTER=y
 CONFIG_CGROUP_CPUACCT=y
 CONFIG_CGROUP_DEVICE=y
 CONFIG_CGROUP_FREEZER=y
diff --git a/tools/testing/selftests/bpf/config.x86_64 b/tools/testing/selftests/bpf/config.x86_64
index b946088017f1..5680befae8c6 100644
--- a/tools/testing/selftests/bpf/config.x86_64
+++ b/tools/testing/selftests/bpf/config.x86_64
@@ -19,7 +19,6 @@ CONFIG_BOOTTIME_TRACING=y
 CONFIG_BPF_JIT_ALWAYS_ON=y
 CONFIG_BPF_PRELOAD=y
 CONFIG_BPF_PRELOAD_UMD=y
-CONFIG_BPFILTER=y
 CONFIG_BSD_DISKLABEL=y
 CONFIG_BSD_PROCESS_ACCT=y
 CONFIG_CFS_BANDWIDTH=y
diff --git a/tools/testing/selftests/hid/config b/tools/testing/selftests/hid/config
index 4f425178b56f..1758b055f295 100644
--- a/tools/testing/selftests/hid/config
+++ b/tools/testing/selftests/hid/config
@@ -1,5 +1,4 @@
 CONFIG_BPF_EVENTS=y
-CONFIG_BPFILTER=y
 CONFIG_BPF_JIT_ALWAYS_ON=y
 CONFIG_BPF_JIT=y
 CONFIG_BPF_KPROBE_OVERRIDE=y
-- 
cgit v1.2.3


From 57331a59ac0d680f606403eb24edd3c35aecba31 Mon Sep 17 00:00:00 2001
From: Benjamin Coddington <bcodding@redhat.com>
Date: Thu, 4 Jan 2024 09:58:46 -0500
Subject: NFSv4.1: Use the nfs_client's rpc timeouts for backchannel

For backchannel requests that lookup the appropriate nfs_client, use the
state-management rpc_clnt's rpc_timeout parameters for the backchannel's
response.  When the nfs_client cannot be found, fall back to using the
xprt's default timeout parameters.

Signed-off-by: Benjamin Coddington <bcodding@redhat.com>
Tested-by: Chuck Lever <chuck.lever@oracle.com>
Tested-by: Jeff Layton <jlayton@kernel.org>
Signed-off-by: Anna Schumaker <Anna.Schumaker@Netapp.com>
---
 fs/nfs/callback_xdr.c          |  5 +++++
 include/linux/sunrpc/bc_xprt.h |  3 ++-
 include/linux/sunrpc/sched.h   | 14 +++++++++++++-
 include/linux/sunrpc/svc.h     |  2 ++
 include/linux/sunrpc/xprt.h    | 11 -----------
 net/sunrpc/clnt.c              |  6 ++++--
 net/sunrpc/svc.c               | 11 ++++++++++-
 net/sunrpc/xprt.c              | 12 +++++++++---
 8 files changed, 45 insertions(+), 19 deletions(-)

(limited to 'include/linux')

diff --git a/fs/nfs/callback_xdr.c b/fs/nfs/callback_xdr.c
index 321af81c456e..9369488f2ed4 100644
--- a/fs/nfs/callback_xdr.c
+++ b/fs/nfs/callback_xdr.c
@@ -967,6 +967,11 @@ static __be32 nfs4_callback_compound(struct svc_rqst *rqstp)
 		nops--;
 	}
 
+	if (svc_is_backchannel(rqstp) && cps.clp) {
+		rqstp->bc_to_initval = cps.clp->cl_rpcclient->cl_timeout->to_initval;
+		rqstp->bc_to_retries = cps.clp->cl_rpcclient->cl_timeout->to_retries;
+	}
+
 	*hdr_res.status = status;
 	*hdr_res.nops = htonl(nops);
 	nfs4_cb_free_slot(&cps);
diff --git a/include/linux/sunrpc/bc_xprt.h b/include/linux/sunrpc/bc_xprt.h
index db30a159f9d5..f22bf915dcf6 100644
--- a/include/linux/sunrpc/bc_xprt.h
+++ b/include/linux/sunrpc/bc_xprt.h
@@ -20,7 +20,8 @@
 #ifdef CONFIG_SUNRPC_BACKCHANNEL
 struct rpc_rqst *xprt_lookup_bc_request(struct rpc_xprt *xprt, __be32 xid);
 void xprt_complete_bc_request(struct rpc_rqst *req, uint32_t copied);
-void xprt_init_bc_request(struct rpc_rqst *req, struct rpc_task *task);
+void xprt_init_bc_request(struct rpc_rqst *req, struct rpc_task *task,
+		const struct rpc_timeout *to);
 void xprt_free_bc_request(struct rpc_rqst *req);
 int xprt_setup_backchannel(struct rpc_xprt *, unsigned int min_reqs);
 void xprt_destroy_backchannel(struct rpc_xprt *, unsigned int max_reqs);
diff --git a/include/linux/sunrpc/sched.h b/include/linux/sunrpc/sched.h
index 8ada7dc802d3..2d61987b3545 100644
--- a/include/linux/sunrpc/sched.h
+++ b/include/linux/sunrpc/sched.h
@@ -37,6 +37,17 @@ struct rpc_wait {
 	struct list_head	timer_list;	/* Timer list */
 };
 
+/*
+ * This describes a timeout strategy
+ */
+struct rpc_timeout {
+	unsigned long		to_initval,		/* initial timeout */
+				to_maxval,		/* max timeout */
+				to_increment;		/* if !exponential */
+	unsigned int		to_retries;		/* max # of retries */
+	unsigned char		to_exponential;
+};
+
 /*
  * This is the RPC task struct
  */
@@ -205,7 +216,8 @@ struct rpc_wait_queue {
  */
 struct rpc_task *rpc_new_task(const struct rpc_task_setup *);
 struct rpc_task *rpc_run_task(const struct rpc_task_setup *);
-struct rpc_task *rpc_run_bc_task(struct rpc_rqst *req);
+struct rpc_task *rpc_run_bc_task(struct rpc_rqst *req,
+		struct rpc_timeout *timeout);
 void		rpc_put_task(struct rpc_task *);
 void		rpc_put_task_async(struct rpc_task *);
 bool		rpc_task_set_rpc_status(struct rpc_task *task, int rpc_status);
diff --git a/include/linux/sunrpc/svc.h b/include/linux/sunrpc/svc.h
index b10f987509cc..3331a1c2b47e 100644
--- a/include/linux/sunrpc/svc.h
+++ b/include/linux/sunrpc/svc.h
@@ -250,6 +250,8 @@ struct svc_rqst {
 	struct net		*rq_bc_net;	/* pointer to backchannel's
 						 * net namespace
 						 */
+	unsigned long	bc_to_initval;
+	unsigned int	bc_to_retries;
 	void **			rq_lease_breaker; /* The v4 client breaking a lease */
 	unsigned int		rq_status_counter; /* RPC processing counter */
 };
diff --git a/include/linux/sunrpc/xprt.h b/include/linux/sunrpc/xprt.h
index f85d3a0daca2..464f6a9492ab 100644
--- a/include/linux/sunrpc/xprt.h
+++ b/include/linux/sunrpc/xprt.h
@@ -30,17 +30,6 @@
 #define RPC_MAXCWND(xprt)	((xprt)->max_reqs << RPC_CWNDSHIFT)
 #define RPCXPRT_CONGESTED(xprt) ((xprt)->cong >= (xprt)->cwnd)
 
-/*
- * This describes a timeout strategy
- */
-struct rpc_timeout {
-	unsigned long		to_initval,		/* initial timeout */
-				to_maxval,		/* max timeout */
-				to_increment;		/* if !exponential */
-	unsigned int		to_retries;		/* max # of retries */
-	unsigned char		to_exponential;
-};
-
 enum rpc_display_format_t {
 	RPC_DISPLAY_ADDR = 0,
 	RPC_DISPLAY_PORT,
diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c
index 0b2c4b5484f5..19437b15ffc7 100644
--- a/net/sunrpc/clnt.c
+++ b/net/sunrpc/clnt.c
@@ -1311,8 +1311,10 @@ static void call_bc_encode(struct rpc_task *task);
  * rpc_run_bc_task - Allocate a new RPC task for backchannel use, then run
  * rpc_execute against it
  * @req: RPC request
+ * @timeout: timeout values to use for this task
  */
-struct rpc_task *rpc_run_bc_task(struct rpc_rqst *req)
+struct rpc_task *rpc_run_bc_task(struct rpc_rqst *req,
+		struct rpc_timeout *timeout)
 {
 	struct rpc_task *task;
 	struct rpc_task_setup task_setup_data = {
@@ -1331,7 +1333,7 @@ struct rpc_task *rpc_run_bc_task(struct rpc_rqst *req)
 		return task;
 	}
 
-	xprt_init_bc_request(req, task);
+	xprt_init_bc_request(req, task, timeout);
 
 	task->tk_action = call_bc_encode;
 	atomic_inc(&task->tk_count);
diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c
index 3f2ea7a0496f..3f714d33624b 100644
--- a/net/sunrpc/svc.c
+++ b/net/sunrpc/svc.c
@@ -1557,6 +1557,7 @@ void svc_process_bc(struct rpc_rqst *req, struct svc_rqst *rqstp)
 {
 	struct rpc_task *task;
 	int proc_error;
+	struct rpc_timeout timeout;
 
 	/* Build the svc_rqst used by the common processing routine */
 	rqstp->rq_xid = req->rq_xid;
@@ -1602,8 +1603,16 @@ void svc_process_bc(struct rpc_rqst *req, struct svc_rqst *rqstp)
 		return;
 	}
 	/* Finally, send the reply synchronously */
+	if (rqstp->bc_to_initval > 0) {
+		timeout.to_initval = rqstp->bc_to_initval;
+		timeout.to_retries = rqstp->bc_to_initval;
+	} else {
+		timeout.to_initval = req->rq_xprt->timeout->to_initval;
+		timeout.to_initval = req->rq_xprt->timeout->to_retries;
+	}
 	memcpy(&req->rq_snd_buf, &rqstp->rq_res, sizeof(req->rq_snd_buf));
-	task = rpc_run_bc_task(req);
+	task = rpc_run_bc_task(req, &timeout);
+
 	if (IS_ERR(task))
 		return;
 
diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c
index 6cc9ffac962d..af13fdfa6672 100644
--- a/net/sunrpc/xprt.c
+++ b/net/sunrpc/xprt.c
@@ -1986,7 +1986,8 @@ void xprt_release(struct rpc_task *task)
 
 #ifdef CONFIG_SUNRPC_BACKCHANNEL
 void
-xprt_init_bc_request(struct rpc_rqst *req, struct rpc_task *task)
+xprt_init_bc_request(struct rpc_rqst *req, struct rpc_task *task,
+		const struct rpc_timeout *to)
 {
 	struct xdr_buf *xbufp = &req->rq_snd_buf;
 
@@ -1999,8 +2000,13 @@ xprt_init_bc_request(struct rpc_rqst *req, struct rpc_task *task)
 	 */
 	xbufp->len = xbufp->head[0].iov_len + xbufp->page_len +
 		xbufp->tail[0].iov_len;
-
-	xprt_init_majortimeo(task, req, req->rq_xprt->timeout);
+	/*
+	 * Backchannel Replies are sent with !RPC_TASK_SOFT and
+	 * RPC_TASK_NO_RETRANS_TIMEOUT. The major timeout setting
+	 * affects only how long each Reply waits to be sent when
+	 * a transport connection cannot be established.
+	 */
+	xprt_init_majortimeo(task, req, to);
 }
 #endif
 
-- 
cgit v1.2.3


From fe1eb24bd5ade085914248c527044e942f75e06a Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <kuba@kernel.org>
Date: Thu, 4 Jan 2024 16:04:35 -0800
Subject: Revert "Introduce PHY listing and link_topology tracking"

This reverts commit 32bb4515e34469975abc936deb0a116c4a445817.
This reverts commit d078d480639a4f3b5fc2d56247afa38e0956483a.
This reverts commit fcc4b105caa4b844bf043375bf799c20a9c99db1.
This reverts commit 345237dbc1bdbb274c9fb9ec38976261ff4a40b8.
This reverts commit 7db69ec9cfb8b4ab50420262631fb2d1908b25bf.
This reverts commit 95132a018f00f5dad38bdcfd4180d1af955d46f6.
This reverts commit 63d5eaf35ac36cad00cfb3809d794ef0078c822b.
This reverts commit c29451aefcb42359905d18678de38e52eccb3bb5.
This reverts commit 2ab0edb505faa9ac90dee1732571390f074e8113.
This reverts commit dedd702a35793ab462fce4c737eeba0badf9718e.
This reverts commit 034fcc210349b873ece7356905be5c6ca11eef2a.
This reverts commit 9c5625f559ad6fe9f6f733c11475bf470e637d34.
This reverts commit 02018c544ef113e980a2349eba89003d6f399d22.

Looks like we need more time for reviews, and incremental
changes will be hard to make sense of. So revert.

Link: https://lore.kernel.org/all/ZZP6FV5sXEf+xd58@shell.armlinux.org.uk/
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 Documentation/netlink/specs/ethtool.yaml       |  68 ------
 Documentation/networking/ethtool-netlink.rst   |  51 -----
 Documentation/networking/index.rst             |   1 -
 Documentation/networking/phy-link-topology.rst | 121 ----------
 MAINTAINERS                                    |   2 -
 drivers/net/phy/Makefile                       |   2 +-
 drivers/net/phy/at803x.c                       |   2 -
 drivers/net/phy/marvell-88x2222.c              |   2 -
 drivers/net/phy/marvell.c                      |   2 -
 drivers/net/phy/marvell10g.c                   |   2 -
 drivers/net/phy/phy_device.c                   |  55 -----
 drivers/net/phy/phy_link_topology.c            |  66 ------
 drivers/net/phy/phylink.c                      |   3 +-
 drivers/net/phy/sfp-bus.c                      |  15 +-
 include/linux/netdevice.h                      |   4 +-
 include/linux/phy.h                            |   6 -
 include/linux/phy_link_topology.h              |  67 ------
 include/linux/phy_link_topology_core.h         |  19 --
 include/linux/sfp.h                            |   8 +-
 include/uapi/linux/ethtool.h                   |  16 --
 include/uapi/linux/ethtool_netlink.h           |  30 ---
 net/core/dev.c                                 |   3 -
 net/ethtool/Makefile                           |   2 +-
 net/ethtool/cabletest.c                        |  12 +-
 net/ethtool/netlink.c                          |  33 ---
 net/ethtool/netlink.h                          |  12 +-
 net/ethtool/phy.c                              | 306 -------------------------
 net/ethtool/plca.c                             |  13 +-
 net/ethtool/pse-pd.c                           |   9 +-
 net/ethtool/strset.c                           |  15 +-
 30 files changed, 35 insertions(+), 912 deletions(-)
 delete mode 100644 Documentation/networking/phy-link-topology.rst
 delete mode 100644 drivers/net/phy/phy_link_topology.c
 delete mode 100644 include/linux/phy_link_topology.h
 delete mode 100644 include/linux/phy_link_topology_core.h
 delete mode 100644 net/ethtool/phy.c

(limited to 'include/linux')

diff --git a/Documentation/netlink/specs/ethtool.yaml b/Documentation/netlink/specs/ethtool.yaml
index 7f6fb1f61dd4..197208f419dc 100644
--- a/Documentation/netlink/specs/ethtool.yaml
+++ b/Documentation/netlink/specs/ethtool.yaml
@@ -16,11 +16,6 @@ definitions:
     name: stringset
     type: enum
     entries: []
-  -
-    name: phy-upstream-type
-    enum-name:
-    type: enum
-    entries: [ mac, phy ]
 
 attribute-sets:
   -
@@ -35,9 +30,6 @@ attribute-sets:
       -
         name: flags
         type: u32
-      -
-        name: phy-index
-        type: u32
 
   -
     name: bitset-bit
@@ -950,45 +942,6 @@ attribute-sets:
       -
         name: burst-tmr
         type: u32
-  -
-    name: phy-upstream
-    attributes:
-      -
-        name: index
-        type: u32
-      -
-        name: sfp-name
-        type: string
-  -
-    name: phy
-    attributes:
-      -
-        name: header
-        type: nest
-        nested-attributes: header
-      -
-        name: index
-        type: u32
-      -
-        name: drvname
-        type: string
-      -
-        name: name
-        type: string
-      -
-        name: upstream-type
-        type: u8
-        enum: phy-upstream-type
-      -
-        name: upstream
-        type: nest
-        nested-attributes: phy-upstream
-      -
-        name: downstream-sfp-name
-        type: string
-      -
-        name: id
-        type: u32
 
 operations:
   enum-model: directional
@@ -1740,24 +1693,3 @@ operations:
       name: mm-ntf
       doc: Notification for change in MAC Merge configuration.
       notify: mm-get
-    -
-      name: phy-get
-      doc: Get PHY devices attached to an interface
-
-      attribute-set: phy
-
-      do: &phy-get-op
-        request:
-          attributes:
-            - header
-        reply:
-          attributes:
-            - header
-            - index
-            - drvname
-            - name
-            - upstream-type
-            - upstream
-            - downstream-sfp-name
-            - id
-      dump: *phy-get-op
diff --git a/Documentation/networking/ethtool-netlink.rst b/Documentation/networking/ethtool-netlink.rst
index 97ff787a7dd8..d583d9abf2f8 100644
--- a/Documentation/networking/ethtool-netlink.rst
+++ b/Documentation/networking/ethtool-netlink.rst
@@ -57,7 +57,6 @@ Structure of this header is
   ``ETHTOOL_A_HEADER_DEV_INDEX``  u32     device ifindex
   ``ETHTOOL_A_HEADER_DEV_NAME``   string  device name
   ``ETHTOOL_A_HEADER_FLAGS``      u32     flags common for all requests
-  ``ETHTOOL_A_HEADER_PHY_INDEX``  u32     phy device index
   ==============================  ======  =============================
 
 ``ETHTOOL_A_HEADER_DEV_INDEX`` and ``ETHTOOL_A_HEADER_DEV_NAME`` identify the
@@ -82,12 +81,6 @@ the behaviour is backward compatible, i.e. requests from old clients not aware
 of the flag should be interpreted the way the client expects. A client must
 not set flags it does not understand.
 
-``ETHTOOL_A_HEADER_PHY_INDEX`` identify the ethernet PHY the message relates to.
-As there are numerous commands that are related to PHY configuration, and because
-we can have more than one PHY on the link, the PHY index can be passed in the
-request for the commands that needs it. It is however not mandatory, and if it
-is not passed for commands that target a PHY, the net_device.phydev pointer
-is used, as a fallback that keeps the legacy behaviour.
 
 Bit sets
 ========
@@ -2011,49 +2004,6 @@ The attributes are propagated to the driver through the following structure:
 .. kernel-doc:: include/linux/ethtool.h
     :identifiers: ethtool_mm_cfg
 
-PHY_GET
-=======
-
-Retrieve information about a given Ethernet PHY sitting on the link. As there
-can be more than one PHY, the DUMP operation can be used to list the PHYs
-present on a given interface, by passing an interface index or name in
-the dump request
-
-Request contents:
-
-  ====================================  ======  ==========================
-  ``ETHTOOL_A_PHY_HEADER``              nested  request header
-  ====================================  ======  ==========================
-
-Kernel response contents:
-
-  ===================================== ======  ==========================
-  ``ETHTOOL_A_PHY_HEADER``              nested  request header
-  ``ETHTOOL_A_PHY_INDEX``               u32     the phy's unique index, that can
-                                                be used for phy-specific requests
-  ``ETHTOOL_A_PHY_DRVNAME``             string  the phy driver name
-  ``ETHTOOL_A_PHY_NAME``                string  the phy device name
-  ``ETHTOOL_A_PHY_UPSTREAM_TYPE``       u32     the type of device this phy is
-                                                connected to
-  ``ETHTOOL_A_PHY_UPSTREAM_PHY``        nested  if the phy is connected to another
-                                                phy, this nest contains info on
-                                                that connection
-  ``ETHTOOL_A_PHY_DOWNSTREAM_SFP_NAME`` string  if the phy controls an sfp bus,
-                                                the name of the sfp bus
-  ``ETHTOOL_A_PHY_ID``                  u32     the phy id if the phy is C22
-  ===================================== ======  ==========================
-
-When ``ETHTOOL_A_PHY_UPSTREAM_TYPE`` is PHY_UPSTREAM_PHY, the PHY's parent is
-another PHY. Information on the parent PHY will be set in the
-``ETHTOOL_A_PHY_UPSTREAM_PHY`` nest, which has the following structure :
-
-  =================================== ======  ==========================
-  ``ETHTOOL_A_PHY_UPSTREAM_INDEX``    u32     the PHY index of the upstream PHY
-  ``ETHTOOL_A_PHY_UPSTREAM_SFP_NAME`` string  if this PHY is connected to it's
-                                                parent PHY through an SFP bus, the
-                                                name of this sfp bus
-  =================================== ======  ==========================
-
 Request translation
 ===================
 
@@ -2160,5 +2110,4 @@ are netlink only.
   n/a                                 ``ETHTOOL_MSG_PLCA_GET_STATUS``
   n/a                                 ``ETHTOOL_MSG_MM_GET``
   n/a                                 ``ETHTOOL_MSG_MM_SET``
-  n/a                                 ``ETHTOOL_MSG_PHY_GET``
   =================================== =====================================
diff --git a/Documentation/networking/index.rst b/Documentation/networking/index.rst
index a2c45a75a4a6..69f3d6dcd9fd 100644
--- a/Documentation/networking/index.rst
+++ b/Documentation/networking/index.rst
@@ -88,7 +88,6 @@ Contents:
    operstates
    packet_mmap
    phonet
-   phy-link-topology
    pktgen
    plip
    ppp_generic
diff --git a/Documentation/networking/phy-link-topology.rst b/Documentation/networking/phy-link-topology.rst
deleted file mode 100644
index 1fd8e904ef4b..000000000000
--- a/Documentation/networking/phy-link-topology.rst
+++ /dev/null
@@ -1,121 +0,0 @@
-.. SPDX-License-Identifier: GPL-2.0
-
-=================
-PHY link topology
-=================
-
-Overview
-========
-
-The PHY link topology representation in the networking stack aims at representing
-the hardware layout for any given Ethernet link.
-
-An Ethernet Interface from userspace's point of view is nothing but a
-:c:type:`struct net_device <net_device>`, which exposes configuration options
-through the legacy ioctls and the ethool netlink commands. The base assumption
-when designing these configuration channels were that the link looked
-something like this ::
-
-  +-----------------------+        +----------+      +--------------+
-  | Ethernet Controller / |        | Ethernet |      | Connector /  |
-  |       MAC             | ------ |   PHY    | ---- |    Port      | ---... to LP
-  +-----------------------+        +----------+      +--------------+
-  struct net_device               struct phy_device
-
-Commands that needs to configure the PHY will go through the net_device.phydev
-field to reach the PHY and perform the relevant configuration.
-
-This assumption falls apart in more complex topologies that can arise when,
-for example, using SFP transceivers (although that's not the only specific case).
-
-Here, we have 2 basic scenarios. Either the MAC is able to output a serialized
-interface, that can directly be fed to an SFP cage, such as SGMII, 1000BaseX,
-10GBaseR, etc.
-
-The link topology then looks like this (when an SFP module is inserted) ::
-
-  +-----+  SGMII  +------------+
-  | MAC | ------- | SFP Module |
-  +-----+         +------------+
-
-Knowing that some modules embed a PHY, the actual link is more like ::
-
-  +-----+  SGMII   +--------------+
-  | MAC | -------- | PHY (on SFP) |
-  +-----+          +--------------+
-
-In this case, the SFP PHY is handled by phylib, and registered by phylink through
-its SFP upstream ops.
-
-Now some Ethernet controllers aren't able to output a serialized interface, so
-we can't directly connect them to an SFP cage. However, some PHYs can be used
-as media-converters, to translate the non-serialized MAC MII interface to a
-serialized MII interface fed to the SFP ::
-
-  +-----+  RGMII  +-----------------------+  SGMII  +--------------+
-  | MAC | ------- | PHY (media converter) | ------- | PHY (on SFP) |
-  +-----+         +-----------------------+         +--------------+
-
-This is where the model of having a single net_device.phydev pointer shows its
-limitations, as we now have 2 PHYs on the link.
-
-The phy_link topology framework aims at providing a way to keep track of every
-PHY on the link, for use by both kernel drivers and subsystems, but also to
-report the topology to userspace, allowing to target individual PHYs in configuration
-commands.
-
-API
-===
-
-The :c:type:`struct phy_link_topology <phy_link_topology>` is a per-netdevice
-resource, that gets initialized at netdevice creation. Once it's initialized,
-it is then possible to register PHYs to the topology through :
-
-:c:func:`phy_link_topo_add_phy`
-
-Besides registering the PHY to the topology, this call will also assign a unique
-index to the PHY, which can then be reported to userspace to refer to this PHY
-(akin to the ifindex). This index is a u32, ranging from 1 to U32_MAX. The value
-0 is reserved to indicate the PHY doesn't belong to any topology yet.
-
-The PHY can then be removed from the topology through
-
-:c:func:`phy_link_topo_del_phy`
-
-These function are already hooked into the phylib subsystem, so all PHYs that
-are linked to a net_device through :c:func:`phy_attach_direct` will automatically
-join the netdev's topology.
-
-PHYs that are on a SFP module will also be automatically registered IF the SFP
-upstream is phylink (so, no media-converter).
-
-PHY drivers that can be used as SFP upstream need to call :c:func:`phy_sfp_attach_phy`
-and :c:func:`phy_sfp_detach_phy`, which can be used as a
-.attach_phy / .detach_phy implementation for the
-:c:type:`struct sfp_upstream_ops <sfp_upstream_ops>`.
-
-UAPI
-====
-
-There exist a set of netlink commands to query the link topology from userspace,
-see ``Documentation/networking/ethtool-netlink.rst``.
-
-The whole point of having a topology representation is to assign the phyindex
-field in :c:type:`struct phy_device <phy_device>`. This index is reported to
-userspace using the ``ETHTOOL_MSG_PHY_GET`` ethtnl command. Performing a DUMP operation
-will result in all PHYs from all net_device being listed. The DUMP command
-accepts either a ``ETHTOOL_A_HEADER_DEV_INDEX`` or ``ETHTOOL_A_HEADER_DEV_NAME``
-to be passed in the request to filter the DUMP to a single net_device.
-
-The retrieved index can then be passed as a request parameter using the
-``ETHTOOL_A_HEADER_PHY_INDEX`` field in the following ethnl commands :
-
-* ``ETHTOOL_MSG_STRSET_GET`` to get the stats string set from a given PHY
-* ``ETHTOOL_MSG_CABLE_TEST_ACT`` and ``ETHTOOL_MSG_CABLE_TEST_ACT``, to perform
-  cable testing on a given PHY on the link (most likely the outermost PHY)
-* ``ETHTOOL_MSG_PSE_SET`` and ``ETHTOOL_MSG_PSE_GET`` for PHY-controlled PoE and PSE settings
-* ``ETHTOOL_MSG_PLCA_GET_CFG``, ``ETHTOOL_MSG_PLCA_SET_CFG`` and ``ETHTOOL_MSG_PLCA_GET_STATUS``
-  to set the PLCA (Physical Layer Collision Avoidance) parameters
-
-Note that the PHY index can be passed to other requests, which will silently
-ignore it if present and irrelevant.
diff --git a/MAINTAINERS b/MAINTAINERS
index 79ac49b113dc..2b916990d7f0 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -7871,8 +7871,6 @@ F:	include/linux/mii.h
 F:	include/linux/of_net.h
 F:	include/linux/phy.h
 F:	include/linux/phy_fixed.h
-F:	include/linux/phy_link_topology.h
-F:	include/linux/phy_link_topology_core.h
 F:	include/linux/phylib_stubs.h
 F:	include/linux/platform_data/mdio-bcm-unimac.h
 F:	include/linux/platform_data/mdio-gpio.h
diff --git a/drivers/net/phy/Makefile b/drivers/net/phy/Makefile
index f218954fd7a8..6097afd44392 100644
--- a/drivers/net/phy/Makefile
+++ b/drivers/net/phy/Makefile
@@ -2,7 +2,7 @@
 # Makefile for Linux PHY drivers
 
 libphy-y			:= phy.o phy-c45.o phy-core.o phy_device.o \
-				   linkmode.o phy_link_topology.o
+				   linkmode.o
 mdio-bus-y			+= mdio_bus.o mdio_device.o
 
 ifdef CONFIG_MDIO_DEVICE
diff --git a/drivers/net/phy/at803x.c b/drivers/net/phy/at803x.c
index aaf6c654aaed..19cfbf36fe80 100644
--- a/drivers/net/phy/at803x.c
+++ b/drivers/net/phy/at803x.c
@@ -1452,8 +1452,6 @@ static const struct sfp_upstream_ops at8031_sfp_ops = {
 	.attach = phy_sfp_attach,
 	.detach = phy_sfp_detach,
 	.module_insert = at8031_sfp_insert,
-	.connect_phy = phy_sfp_connect_phy,
-	.disconnect_phy = phy_sfp_disconnect_phy,
 };
 
 static int at8031_parse_dt(struct phy_device *phydev)
diff --git a/drivers/net/phy/marvell-88x2222.c b/drivers/net/phy/marvell-88x2222.c
index 3f77bbc7e04f..e3aa30dad2e6 100644
--- a/drivers/net/phy/marvell-88x2222.c
+++ b/drivers/net/phy/marvell-88x2222.c
@@ -555,8 +555,6 @@ static const struct sfp_upstream_ops sfp_phy_ops = {
 	.link_down = mv2222_sfp_link_down,
 	.attach = phy_sfp_attach,
 	.detach = phy_sfp_detach,
-	.connect_phy = phy_sfp_connect_phy,
-	.disconnect_phy = phy_sfp_disconnect_phy,
 };
 
 static int mv2222_probe(struct phy_device *phydev)
diff --git a/drivers/net/phy/marvell.c b/drivers/net/phy/marvell.c
index 674e29bce2cc..eba652a4c1d8 100644
--- a/drivers/net/phy/marvell.c
+++ b/drivers/net/phy/marvell.c
@@ -3254,8 +3254,6 @@ static const struct sfp_upstream_ops m88e1510_sfp_ops = {
 	.module_remove = m88e1510_sfp_remove,
 	.attach = phy_sfp_attach,
 	.detach = phy_sfp_detach,
-	.connect_phy = phy_sfp_connect_phy,
-	.disconnect_phy = phy_sfp_disconnect_phy,
 };
 
 static int m88e1510_probe(struct phy_device *phydev)
diff --git a/drivers/net/phy/marvell10g.c b/drivers/net/phy/marvell10g.c
index 6642eb642d4b..ad43e280930c 100644
--- a/drivers/net/phy/marvell10g.c
+++ b/drivers/net/phy/marvell10g.c
@@ -503,8 +503,6 @@ static int mv3310_sfp_insert(void *upstream, const struct sfp_eeprom_id *id)
 static const struct sfp_upstream_ops mv3310_sfp_ops = {
 	.attach = phy_sfp_attach,
 	.detach = phy_sfp_detach,
-	.connect_phy = phy_sfp_connect_phy,
-	.disconnect_phy = phy_sfp_disconnect_phy,
 	.module_insert = mv3310_sfp_insert,
 };
 
diff --git a/drivers/net/phy/phy_device.c b/drivers/net/phy/phy_device.c
index 1e595762afea..3611ea64875e 100644
--- a/drivers/net/phy/phy_device.c
+++ b/drivers/net/phy/phy_device.c
@@ -29,7 +29,6 @@
 #include <linux/phy.h>
 #include <linux/phylib_stubs.h>
 #include <linux/phy_led_triggers.h>
-#include <linux/phy_link_topology.h>
 #include <linux/pse-pd/pse.h>
 #include <linux/property.h>
 #include <linux/rtnetlink.h>
@@ -266,14 +265,6 @@ static void phy_mdio_device_remove(struct mdio_device *mdiodev)
 
 static struct phy_driver genphy_driver;
 
-static struct phy_link_topology *phy_get_link_topology(struct phy_device *phydev)
-{
-	if (phydev->attached_dev)
-		return &phydev->attached_dev->link_topo;
-
-	return NULL;
-}
-
 static LIST_HEAD(phy_fixup_list);
 static DEFINE_MUTEX(phy_fixup_lock);
 
@@ -1363,46 +1354,6 @@ phy_standalone_show(struct device *dev, struct device_attribute *attr,
 }
 static DEVICE_ATTR_RO(phy_standalone);
 
-/**
- * phy_sfp_connect_phy - Connect the SFP module's PHY to the upstream PHY
- * @upstream: pointer to the upstream phy device
- * @phy: pointer to the SFP module's phy device
- *
- * This helper allows keeping track of PHY devices on the link. It adds the
- * SFP module's phy to the phy namespace of the upstream phy
- */
-int phy_sfp_connect_phy(void *upstream, struct phy_device *phy)
-{
-	struct phy_device *phydev = upstream;
-	struct phy_link_topology *topo = phy_get_link_topology(phydev);
-
-	if (topo)
-		return phy_link_topo_add_phy(topo, phy, PHY_UPSTREAM_PHY, phydev);
-
-	return 0;
-}
-EXPORT_SYMBOL(phy_sfp_connect_phy);
-
-/**
- * phy_sfp_disconnect_phy - Disconnect the SFP module's PHY from the upstream PHY
- * @upstream: pointer to the upstream phy device
- * @phy: pointer to the SFP module's phy device
- *
- * This helper allows keeping track of PHY devices on the link. It removes the
- * SFP module's phy to the phy namespace of the upstream phy. As the module phy
- * will be destroyed, re-inserting the same module will add a new phy with a
- * new index.
- */
-void phy_sfp_disconnect_phy(void *upstream, struct phy_device *phy)
-{
-	struct phy_device *phydev = upstream;
-	struct phy_link_topology *topo = phy_get_link_topology(phydev);
-
-	if (topo)
-		phy_link_topo_del_phy(topo, phy);
-}
-EXPORT_SYMBOL(phy_sfp_disconnect_phy);
-
 /**
  * phy_sfp_attach - attach the SFP bus to the PHY upstream network device
  * @upstream: pointer to the phy device
@@ -1540,11 +1491,6 @@ int phy_attach_direct(struct net_device *dev, struct phy_device *phydev,
 
 		if (phydev->sfp_bus_attached)
 			dev->sfp_bus = phydev->sfp_bus;
-
-		err = phy_link_topo_add_phy(&dev->link_topo, phydev,
-					    PHY_UPSTREAM_MAC, dev);
-		if (err)
-			goto error;
 	}
 
 	/* Some Ethernet drivers try to connect to a PHY device before
@@ -1874,7 +1820,6 @@ void phy_detach(struct phy_device *phydev)
 	if (dev) {
 		phydev->attached_dev->phydev = NULL;
 		phydev->attached_dev = NULL;
-		phy_link_topo_del_phy(&dev->link_topo, phydev);
 	}
 	phydev->phylink = NULL;
 
diff --git a/drivers/net/phy/phy_link_topology.c b/drivers/net/phy/phy_link_topology.c
deleted file mode 100644
index 34e7e08fbfc3..000000000000
--- a/drivers/net/phy/phy_link_topology.c
+++ /dev/null
@@ -1,66 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0+
-/*
- * Infrastructure to handle all PHY devices connected to a given netdev,
- * either directly or indirectly attached.
- *
- * Copyright (c) 2023 Maxime Chevallier<maxime.chevallier@bootlin.com>
- */
-
-#include <linux/phy_link_topology.h>
-#include <linux/netdevice.h>
-#include <linux/phy.h>
-#include <linux/rtnetlink.h>
-#include <linux/xarray.h>
-
-int phy_link_topo_add_phy(struct phy_link_topology *topo,
-			  struct phy_device *phy,
-			  enum phy_upstream upt, void *upstream)
-{
-	struct phy_device_node *pdn;
-	int ret;
-
-	pdn = kzalloc(sizeof(*pdn), GFP_KERNEL);
-	if (!pdn)
-		return -ENOMEM;
-
-	pdn->phy = phy;
-	switch (upt) {
-	case PHY_UPSTREAM_MAC:
-		pdn->upstream.netdev = (struct net_device *)upstream;
-		if (phy_on_sfp(phy))
-			pdn->parent_sfp_bus = pdn->upstream.netdev->sfp_bus;
-		break;
-	case PHY_UPSTREAM_PHY:
-		pdn->upstream.phydev = (struct phy_device *)upstream;
-		if (phy_on_sfp(phy))
-			pdn->parent_sfp_bus = pdn->upstream.phydev->sfp_bus;
-		break;
-	default:
-		ret = -EINVAL;
-		goto err;
-	}
-	pdn->upstream_type = upt;
-
-	ret = xa_alloc_cyclic(&topo->phys, &phy->phyindex, pdn, xa_limit_32b,
-			      &topo->next_phy_index, GFP_KERNEL);
-	if (ret)
-		goto err;
-
-	return 0;
-
-err:
-	kfree(pdn);
-	return ret;
-}
-EXPORT_SYMBOL_GPL(phy_link_topo_add_phy);
-
-void phy_link_topo_del_phy(struct phy_link_topology *topo,
-			   struct phy_device *phy)
-{
-	struct phy_device_node *pdn = xa_erase(&topo->phys, phy->phyindex);
-
-	phy->phyindex = 0;
-
-	kfree(pdn);
-}
-EXPORT_SYMBOL_GPL(phy_link_topo_del_phy);
diff --git a/drivers/net/phy/phylink.c b/drivers/net/phy/phylink.c
index a816391add12..ed0b4ccaa6a6 100644
--- a/drivers/net/phy/phylink.c
+++ b/drivers/net/phy/phylink.c
@@ -3385,8 +3385,7 @@ static int phylink_sfp_connect_phy(void *upstream, struct phy_device *phy)
 	return ret;
 }
 
-static void phylink_sfp_disconnect_phy(void *upstream,
-				       struct phy_device *phydev)
+static void phylink_sfp_disconnect_phy(void *upstream)
 {
 	phylink_disconnect_phy(upstream);
 }
diff --git a/drivers/net/phy/sfp-bus.c b/drivers/net/phy/sfp-bus.c
index fb1c102714b5..6fa679b36290 100644
--- a/drivers/net/phy/sfp-bus.c
+++ b/drivers/net/phy/sfp-bus.c
@@ -486,7 +486,7 @@ static void sfp_unregister_bus(struct sfp_bus *bus)
 			bus->socket_ops->stop(bus->sfp);
 		bus->socket_ops->detach(bus->sfp);
 		if (bus->phydev && ops && ops->disconnect_phy)
-			ops->disconnect_phy(bus->upstream, bus->phydev);
+			ops->disconnect_phy(bus->upstream);
 	}
 	bus->registered = false;
 }
@@ -742,7 +742,7 @@ void sfp_remove_phy(struct sfp_bus *bus)
 	const struct sfp_upstream_ops *ops = sfp_get_upstream_ops(bus);
 
 	if (ops && ops->disconnect_phy)
-		ops->disconnect_phy(bus->upstream, bus->phydev);
+		ops->disconnect_phy(bus->upstream);
 	bus->phydev = NULL;
 }
 EXPORT_SYMBOL_GPL(sfp_remove_phy);
@@ -859,14 +859,3 @@ void sfp_unregister_socket(struct sfp_bus *bus)
 	sfp_bus_put(bus);
 }
 EXPORT_SYMBOL_GPL(sfp_unregister_socket);
-
-const char *sfp_get_name(struct sfp_bus *bus)
-{
-	ASSERT_RTNL();
-
-	if (bus->sfp_dev)
-		return dev_name(bus->sfp_dev);
-
-	return NULL;
-}
-EXPORT_SYMBOL_GPL(sfp_get_name);
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index e265aa1f2169..118c40258d07 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -40,6 +40,7 @@
 #include <net/dcbnl.h>
 #endif
 #include <net/netprio_cgroup.h>
+
 #include <linux/netdev_features.h>
 #include <linux/neighbour.h>
 #include <uapi/linux/netdevice.h>
@@ -51,7 +52,6 @@
 #include <net/net_trackers.h>
 #include <net/net_debug.h>
 #include <net/dropreason-core.h>
-#include <linux/phy_link_topology_core.h>
 
 struct netpoll_info;
 struct device;
@@ -2047,7 +2047,6 @@ enum netdev_stat_type {
  *	@fcoe_ddp_xid:	Max exchange id for FCoE LRO by ddp
  *
  *	@priomap:	XXX: need comments on this one
- *	@link_topo:	Physical link topology tracking attached PHYs
  *	@phydev:	Physical device may attach itself
  *			for hardware timestamping
  *	@sfp_bus:	attached &struct sfp_bus structure.
@@ -2442,7 +2441,6 @@ struct net_device {
 #if IS_ENABLED(CONFIG_CGROUP_NET_PRIO)
 	struct netprio_map __rcu *priomap;
 #endif
-	struct phy_link_topology	link_topo;
 	struct phy_device	*phydev;
 	struct sfp_bus		*sfp_bus;
 	struct lock_class_key	*qdisc_tx_busylock;
diff --git a/include/linux/phy.h b/include/linux/phy.h
index 6cb9d843aee9..e9e85d347587 100644
--- a/include/linux/phy.h
+++ b/include/linux/phy.h
@@ -544,9 +544,6 @@ struct macsec_ops;
  * @drv: Pointer to the driver for this PHY instance
  * @devlink: Create a link between phy dev and mac dev, if the external phy
  *           used by current mac interface is managed by another mac interface.
- * @phyindex: Unique id across the phy's parent tree of phys to address the PHY
- *	      from userspace, similar to ifindex. A zero index means the PHY
- *	      wasn't assigned an id yet.
  * @phy_id: UID for this device found during discovery
  * @c45_ids: 802.3-c45 Device Identifiers if is_c45.
  * @is_c45:  Set to true if this PHY uses clause 45 addressing.
@@ -646,7 +643,6 @@ struct phy_device {
 
 	struct device_link *devlink;
 
-	u32 phyindex;
 	u32 phy_id;
 
 	struct phy_c45_device_ids c45_ids;
@@ -1726,8 +1722,6 @@ int phy_suspend(struct phy_device *phydev);
 int phy_resume(struct phy_device *phydev);
 int __phy_resume(struct phy_device *phydev);
 int phy_loopback(struct phy_device *phydev, bool enable);
-int phy_sfp_connect_phy(void *upstream, struct phy_device *phy);
-void phy_sfp_disconnect_phy(void *upstream, struct phy_device *phy);
 void phy_sfp_attach(void *upstream, struct sfp_bus *bus);
 void phy_sfp_detach(void *upstream, struct sfp_bus *bus);
 int phy_sfp_probe(struct phy_device *phydev,
diff --git a/include/linux/phy_link_topology.h b/include/linux/phy_link_topology.h
deleted file mode 100644
index 91902263ec0e..000000000000
--- a/include/linux/phy_link_topology.h
+++ /dev/null
@@ -1,67 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-/*
- * PHY device list allow maintaining a list of PHY devices that are
- * part of a netdevice's link topology. PHYs can for example be chained,
- * as is the case when using a PHY that exposes an SFP module, on which an
- * SFP transceiver that embeds a PHY is connected.
- *
- * This list can then be used by userspace to leverage individual PHY
- * capabilities.
- */
-#ifndef __PHY_LINK_TOPOLOGY_H
-#define __PHY_LINK_TOPOLOGY_H
-
-#include <linux/ethtool.h>
-#include <linux/phy_link_topology_core.h>
-
-struct xarray;
-struct phy_device;
-struct net_device;
-struct sfp_bus;
-
-struct phy_device_node {
-	enum phy_upstream upstream_type;
-
-	union {
-		struct net_device	*netdev;
-		struct phy_device	*phydev;
-	} upstream;
-
-	struct sfp_bus *parent_sfp_bus;
-
-	struct phy_device *phy;
-};
-
-static inline struct phy_device *
-phy_link_topo_get_phy(struct phy_link_topology *topo, u32 phyindex)
-{
-	struct phy_device_node *pdn = xa_load(&topo->phys, phyindex);
-
-	if (pdn)
-		return pdn->phy;
-
-	return NULL;
-}
-
-#if IS_ENABLED(CONFIG_PHYLIB)
-int phy_link_topo_add_phy(struct phy_link_topology *topo,
-			  struct phy_device *phy,
-			  enum phy_upstream upt, void *upstream);
-
-void phy_link_topo_del_phy(struct phy_link_topology *lt, struct phy_device *phy);
-
-#else
-static inline int phy_link_topo_add_phy(struct phy_link_topology *topo,
-					struct phy_device *phy,
-					enum phy_upstream upt, void *upstream)
-{
-	return 0;
-}
-
-static inline void phy_link_topo_del_phy(struct phy_link_topology *topo,
-					 struct phy_device *phy)
-{
-}
-#endif
-
-#endif /* __PHY_LINK_TOPOLOGY_H */
diff --git a/include/linux/phy_link_topology_core.h b/include/linux/phy_link_topology_core.h
deleted file mode 100644
index 78c75f909489..000000000000
--- a/include/linux/phy_link_topology_core.h
+++ /dev/null
@@ -1,19 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef __PHY_LINK_TOPOLOGY_CORE_H
-#define __PHY_LINK_TOPOLOGY_CORE_H
-
-struct xarray;
-
-struct phy_link_topology {
-	struct xarray phys;
-
-	u32 next_phy_index;
-};
-
-static inline void phy_link_topo_init(struct phy_link_topology *topo)
-{
-	xa_init_flags(&topo->phys, XA_FLAGS_ALLOC1);
-	topo->next_phy_index = 1;
-}
-
-#endif /* __PHY_LINK_TOPOLOGY_CORE_H */
diff --git a/include/linux/sfp.h b/include/linux/sfp.h
index 55c0ab17c9e2..9346cd44814d 100644
--- a/include/linux/sfp.h
+++ b/include/linux/sfp.h
@@ -544,7 +544,7 @@ struct sfp_upstream_ops {
 	void (*link_down)(void *priv);
 	void (*link_up)(void *priv);
 	int (*connect_phy)(void *priv, struct phy_device *);
-	void (*disconnect_phy)(void *priv, struct phy_device *);
+	void (*disconnect_phy)(void *priv);
 };
 
 #if IS_ENABLED(CONFIG_SFP)
@@ -570,7 +570,6 @@ struct sfp_bus *sfp_bus_find_fwnode(const struct fwnode_handle *fwnode);
 int sfp_bus_add_upstream(struct sfp_bus *bus, void *upstream,
 			 const struct sfp_upstream_ops *ops);
 void sfp_bus_del_upstream(struct sfp_bus *bus);
-const char *sfp_get_name(struct sfp_bus *bus);
 #else
 static inline int sfp_parse_port(struct sfp_bus *bus,
 				 const struct sfp_eeprom_id *id,
@@ -649,11 +648,6 @@ static inline int sfp_bus_add_upstream(struct sfp_bus *bus, void *upstream,
 static inline void sfp_bus_del_upstream(struct sfp_bus *bus)
 {
 }
-
-static inline const char *sfp_get_name(struct sfp_bus *bus)
-{
-	return NULL;
-}
 #endif
 
 #endif
diff --git a/include/uapi/linux/ethtool.h b/include/uapi/linux/ethtool.h
index 01ba529dbb6d..06ef6b78b7de 100644
--- a/include/uapi/linux/ethtool.h
+++ b/include/uapi/linux/ethtool.h
@@ -2220,20 +2220,4 @@ struct ethtool_link_settings {
 	 * __u32 map_lp_advertising[link_mode_masks_nwords];
 	 */
 };
-
-/**
- * enum phy_upstream - Represents the upstream component a given PHY device
- * is connected to, as in what is on the other end of the MII bus. Most PHYs
- * will be attached to an Ethernet MAC controller, but in some cases, there's
- * an intermediate PHY used as a media-converter, which will driver another
- * MII interface as its output.
- * @PHY_UPSTREAM_MAC: Upstream component is a MAC (a switch port,
- *		      or ethernet controller)
- * @PHY_UPSTREAM_PHY: Upstream component is a PHY (likely a media converter)
- */
-enum phy_upstream {
-	PHY_UPSTREAM_MAC,
-	PHY_UPSTREAM_PHY,
-};
-
 #endif /* _UAPI_LINUX_ETHTOOL_H */
diff --git a/include/uapi/linux/ethtool_netlink.h b/include/uapi/linux/ethtool_netlink.h
index 00cd7ad16709..3f89074aa06c 100644
--- a/include/uapi/linux/ethtool_netlink.h
+++ b/include/uapi/linux/ethtool_netlink.h
@@ -57,7 +57,6 @@ enum {
 	ETHTOOL_MSG_PLCA_GET_STATUS,
 	ETHTOOL_MSG_MM_GET,
 	ETHTOOL_MSG_MM_SET,
-	ETHTOOL_MSG_PHY_GET,
 
 	/* add new constants above here */
 	__ETHTOOL_MSG_USER_CNT,
@@ -110,8 +109,6 @@ enum {
 	ETHTOOL_MSG_PLCA_NTF,
 	ETHTOOL_MSG_MM_GET_REPLY,
 	ETHTOOL_MSG_MM_NTF,
-	ETHTOOL_MSG_PHY_GET_REPLY,
-	ETHTOOL_MSG_PHY_NTF,
 
 	/* add new constants above here */
 	__ETHTOOL_MSG_KERNEL_CNT,
@@ -136,7 +133,6 @@ enum {
 	ETHTOOL_A_HEADER_DEV_INDEX,		/* u32 */
 	ETHTOOL_A_HEADER_DEV_NAME,		/* string */
 	ETHTOOL_A_HEADER_FLAGS,			/* u32 - ETHTOOL_FLAG_* */
-	ETHTOOL_A_HEADER_PHY_INDEX,		/* u32 */
 
 	/* add new constants above here */
 	__ETHTOOL_A_HEADER_CNT,
@@ -980,32 +976,6 @@ enum {
 	ETHTOOL_A_MM_MAX = (__ETHTOOL_A_MM_CNT - 1)
 };
 
-enum {
-	ETHTOOL_A_PHY_UPSTREAM_UNSPEC,
-	ETHTOOL_A_PHY_UPSTREAM_INDEX,			/* u32 */
-	ETHTOOL_A_PHY_UPSTREAM_SFP_NAME,		/* string */
-
-	/* add new constants above here */
-	__ETHTOOL_A_PHY_UPSTREAM_CNT,
-	ETHTOOL_A_PHY_UPSTREAM_MAX = (__ETHTOOL_A_PHY_UPSTREAM_CNT - 1)
-};
-
-enum {
-	ETHTOOL_A_PHY_UNSPEC,
-	ETHTOOL_A_PHY_HEADER,			/* nest - _A_HEADER_* */
-	ETHTOOL_A_PHY_INDEX,			/* u32 */
-	ETHTOOL_A_PHY_DRVNAME,			/* string */
-	ETHTOOL_A_PHY_NAME,			/* string */
-	ETHTOOL_A_PHY_UPSTREAM_TYPE,		/* u8 */
-	ETHTOOL_A_PHY_UPSTREAM,			/* nest - _A_PHY_UPSTREAM_* */
-	ETHTOOL_A_PHY_DOWNSTREAM_SFP_NAME,	/* string */
-	ETHTOOL_A_PHY_ID,			/* u32 */
-
-	/* add new constants above here */
-	__ETHTOOL_A_PHY_CNT,
-	ETHTOOL_A_PHY_MAX = (__ETHTOOL_A_PHY_CNT - 1)
-};
-
 /* generic netlink info */
 #define ETHTOOL_GENL_NAME "ethtool"
 #define ETHTOOL_GENL_VERSION 1
diff --git a/net/core/dev.c b/net/core/dev.c
index bc4ac49d4643..f01a9b858347 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -153,7 +153,6 @@
 #include <linux/prandom.h>
 #include <linux/once_lite.h>
 #include <net/netdev_rx_queue.h>
-#include <linux/phy_link_topology_core.h>
 
 #include "dev.h"
 #include "net-sysfs.h"
@@ -10876,8 +10875,6 @@ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
 #ifdef CONFIG_NET_SCHED
 	hash_init(dev->qdisc_hash);
 #endif
-	phy_link_topo_init(&dev->link_topo);
-
 	dev->priv_flags = IFF_XMIT_DST_RELEASE | IFF_XMIT_DST_RELEASE_PERM;
 	setup(dev);
 
diff --git a/net/ethtool/Makefile b/net/ethtool/Makefile
index 0ccd0e9afd3f..504f954a1b28 100644
--- a/net/ethtool/Makefile
+++ b/net/ethtool/Makefile
@@ -8,4 +8,4 @@ ethtool_nl-y	:= netlink.o bitset.o strset.o linkinfo.o linkmodes.o rss.o \
 		   linkstate.o debug.o wol.o features.o privflags.o rings.o \
 		   channels.o coalesce.o pause.o eee.o tsinfo.o cabletest.o \
 		   tunnels.o fec.o eeprom.o stats.o phc_vclocks.o mm.o \
-		   module.o pse-pd.o plca.o mm.o phy.o
+		   module.o pse-pd.o plca.o mm.o
diff --git a/net/ethtool/cabletest.c b/net/ethtool/cabletest.c
index 6b00d0800f23..06a151165c31 100644
--- a/net/ethtool/cabletest.c
+++ b/net/ethtool/cabletest.c
@@ -69,7 +69,7 @@ int ethnl_act_cable_test(struct sk_buff *skb, struct genl_info *info)
 		return ret;
 
 	dev = req_info.dev;
-	if (!req_info.phydev) {
+	if (!dev->phydev) {
 		ret = -EOPNOTSUPP;
 		goto out_dev_put;
 	}
@@ -85,12 +85,12 @@ int ethnl_act_cable_test(struct sk_buff *skb, struct genl_info *info)
 	if (ret < 0)
 		goto out_rtnl;
 
-	ret = ops->start_cable_test(req_info.phydev, info->extack);
+	ret = ops->start_cable_test(dev->phydev, info->extack);
 
 	ethnl_ops_complete(dev);
 
 	if (!ret)
-		ethnl_cable_test_started(req_info.phydev,
+		ethnl_cable_test_started(dev->phydev,
 					 ETHTOOL_MSG_CABLE_TEST_NTF);
 
 out_rtnl:
@@ -321,7 +321,7 @@ int ethnl_act_cable_test_tdr(struct sk_buff *skb, struct genl_info *info)
 		return ret;
 
 	dev = req_info.dev;
-	if (!req_info.phydev) {
+	if (!dev->phydev) {
 		ret = -EOPNOTSUPP;
 		goto out_dev_put;
 	}
@@ -342,12 +342,12 @@ int ethnl_act_cable_test_tdr(struct sk_buff *skb, struct genl_info *info)
 	if (ret < 0)
 		goto out_rtnl;
 
-	ret = ops->start_cable_test_tdr(req_info.phydev, info->extack, &cfg);
+	ret = ops->start_cable_test_tdr(dev->phydev, info->extack, &cfg);
 
 	ethnl_ops_complete(dev);
 
 	if (!ret)
-		ethnl_cable_test_started(req_info.phydev,
+		ethnl_cable_test_started(dev->phydev,
 					 ETHTOOL_MSG_CABLE_TEST_TDR_NTF);
 
 out_rtnl:
diff --git a/net/ethtool/netlink.c b/net/ethtool/netlink.c
index 92b0dd8ca046..fe3553f60bf3 100644
--- a/net/ethtool/netlink.c
+++ b/net/ethtool/netlink.c
@@ -4,7 +4,6 @@
 #include <linux/ethtool_netlink.h>
 #include <linux/pm_runtime.h>
 #include "netlink.h"
-#include <linux/phy_link_topology.h>
 
 static struct genl_family ethtool_genl_family;
 
@@ -21,7 +20,6 @@ const struct nla_policy ethnl_header_policy[] = {
 					    .len = ALTIFNAMSIZ - 1 },
 	[ETHTOOL_A_HEADER_FLAGS]	= NLA_POLICY_MASK(NLA_U32,
 							  ETHTOOL_FLAGS_BASIC),
-	[ETHTOOL_A_HEADER_PHY_INDEX]		= NLA_POLICY_MIN(NLA_U32, 1),
 };
 
 const struct nla_policy ethnl_header_policy_stats[] = {
@@ -30,7 +28,6 @@ const struct nla_policy ethnl_header_policy_stats[] = {
 					    .len = ALTIFNAMSIZ - 1 },
 	[ETHTOOL_A_HEADER_FLAGS]	= NLA_POLICY_MASK(NLA_U32,
 							  ETHTOOL_FLAGS_STATS),
-	[ETHTOOL_A_HEADER_PHY_INDEX]		= NLA_POLICY_MIN(NLA_U32, 1),
 };
 
 int ethnl_ops_begin(struct net_device *dev)
@@ -94,7 +91,6 @@ int ethnl_parse_header_dev_get(struct ethnl_req_info *req_info,
 {
 	struct nlattr *tb[ARRAY_SIZE(ethnl_header_policy)];
 	const struct nlattr *devname_attr;
-	struct phy_device *phydev = NULL;
 	struct net_device *dev = NULL;
 	u32 flags = 0;
 	int ret;
@@ -149,26 +145,6 @@ int ethnl_parse_header_dev_get(struct ethnl_req_info *req_info,
 		return -EINVAL;
 	}
 
-	if (dev) {
-		if (tb[ETHTOOL_A_HEADER_PHY_INDEX]) {
-			u32 phy_index = nla_get_u32(tb[ETHTOOL_A_HEADER_PHY_INDEX]);
-
-			phydev = phy_link_topo_get_phy(&dev->link_topo,
-						       phy_index);
-			if (!phydev) {
-				NL_SET_ERR_MSG_ATTR(extack, header,
-						    "no phy matches phy index");
-				return -EINVAL;
-			}
-		} else {
-			/* If we need a PHY but no phy index is specified, fallback
-			 * to dev->phydev
-			 */
-			phydev = dev->phydev;
-		}
-	}
-
-	req_info->phydev = phydev;
 	req_info->dev = dev;
 	req_info->flags = flags;
 	return 0;
@@ -1153,15 +1129,6 @@ static const struct genl_ops ethtool_genl_ops[] = {
 		.policy = ethnl_mm_set_policy,
 		.maxattr = ARRAY_SIZE(ethnl_mm_set_policy) - 1,
 	},
-	{
-		.cmd	= ETHTOOL_MSG_PHY_GET,
-		.doit	= ethnl_phy_doit,
-		.start	= ethnl_phy_start,
-		.dumpit	= ethnl_phy_dumpit,
-		.done	= ethnl_phy_done,
-		.policy = ethnl_phy_get_policy,
-		.maxattr = ARRAY_SIZE(ethnl_phy_get_policy) - 1,
-	},
 };
 
 static const struct genl_multicast_group ethtool_nl_mcgrps[] = {
diff --git a/net/ethtool/netlink.h b/net/ethtool/netlink.h
index 5e6a43e35a09..9a333a8d04c1 100644
--- a/net/ethtool/netlink.h
+++ b/net/ethtool/netlink.h
@@ -250,7 +250,6 @@ static inline unsigned int ethnl_reply_header_size(void)
  * @dev:   network device the request is for (may be null)
  * @dev_tracker: refcount tracker for @dev reference
  * @flags: request flags common for all request types
- * @phydev: phy_device connected to @dev this request is for (may be null)
  *
  * This is a common base for request specific structures holding data from
  * parsed userspace request. These always embed struct ethnl_req_info at
@@ -260,7 +259,6 @@ struct ethnl_req_info {
 	struct net_device	*dev;
 	netdevice_tracker	dev_tracker;
 	u32			flags;
-	struct phy_device	*phydev;
 };
 
 static inline void ethnl_parse_header_dev_put(struct ethnl_req_info *req_info)
@@ -397,10 +395,9 @@ extern const struct ethnl_request_ops ethnl_rss_request_ops;
 extern const struct ethnl_request_ops ethnl_plca_cfg_request_ops;
 extern const struct ethnl_request_ops ethnl_plca_status_request_ops;
 extern const struct ethnl_request_ops ethnl_mm_request_ops;
-extern const struct ethnl_request_ops ethnl_phy_request_ops;
 
-extern const struct nla_policy ethnl_header_policy[ETHTOOL_A_HEADER_PHY_INDEX + 1];
-extern const struct nla_policy ethnl_header_policy_stats[ETHTOOL_A_HEADER_PHY_INDEX + 1];
+extern const struct nla_policy ethnl_header_policy[ETHTOOL_A_HEADER_FLAGS + 1];
+extern const struct nla_policy ethnl_header_policy_stats[ETHTOOL_A_HEADER_FLAGS + 1];
 extern const struct nla_policy ethnl_strset_get_policy[ETHTOOL_A_STRSET_COUNTS_ONLY + 1];
 extern const struct nla_policy ethnl_linkinfo_get_policy[ETHTOOL_A_LINKINFO_HEADER + 1];
 extern const struct nla_policy ethnl_linkinfo_set_policy[ETHTOOL_A_LINKINFO_TP_MDIX_CTRL + 1];
@@ -444,7 +441,6 @@ extern const struct nla_policy ethnl_plca_set_cfg_policy[ETHTOOL_A_PLCA_MAX + 1]
 extern const struct nla_policy ethnl_plca_get_status_policy[ETHTOOL_A_PLCA_HEADER + 1];
 extern const struct nla_policy ethnl_mm_get_policy[ETHTOOL_A_MM_HEADER + 1];
 extern const struct nla_policy ethnl_mm_set_policy[ETHTOOL_A_MM_MAX + 1];
-extern const struct nla_policy ethnl_phy_get_policy[ETHTOOL_A_PHY_HEADER + 1];
 
 int ethnl_set_features(struct sk_buff *skb, struct genl_info *info);
 int ethnl_act_cable_test(struct sk_buff *skb, struct genl_info *info);
@@ -452,10 +448,6 @@ int ethnl_act_cable_test_tdr(struct sk_buff *skb, struct genl_info *info);
 int ethnl_tunnel_info_doit(struct sk_buff *skb, struct genl_info *info);
 int ethnl_tunnel_info_start(struct netlink_callback *cb);
 int ethnl_tunnel_info_dumpit(struct sk_buff *skb, struct netlink_callback *cb);
-int ethnl_phy_start(struct netlink_callback *cb);
-int ethnl_phy_doit(struct sk_buff *skb, struct genl_info *info);
-int ethnl_phy_dumpit(struct sk_buff *skb, struct netlink_callback *cb);
-int ethnl_phy_done(struct netlink_callback *cb);
 
 extern const char stats_std_names[__ETHTOOL_STATS_CNT][ETH_GSTRING_LEN];
 extern const char stats_eth_phy_names[__ETHTOOL_A_STATS_ETH_PHY_CNT][ETH_GSTRING_LEN];
diff --git a/net/ethtool/phy.c b/net/ethtool/phy.c
deleted file mode 100644
index 5add2840aaeb..000000000000
--- a/net/ethtool/phy.c
+++ /dev/null
@@ -1,306 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * Copyright 2023 Bootlin
- *
- */
-#include "common.h"
-#include "netlink.h"
-
-#include <linux/phy.h>
-#include <linux/phy_link_topology.h>
-#include <linux/sfp.h>
-
-struct phy_req_info {
-	struct ethnl_req_info		base;
-	struct phy_device_node		pdn;
-};
-
-#define PHY_REQINFO(__req_base) \
-	container_of(__req_base, struct phy_req_info, base)
-
-const struct nla_policy ethnl_phy_get_policy[ETHTOOL_A_PHY_HEADER + 1] = {
-	[ETHTOOL_A_PHY_HEADER] = NLA_POLICY_NESTED(ethnl_header_policy),
-};
-
-/* Caller holds rtnl */
-static ssize_t
-ethnl_phy_reply_size(const struct ethnl_req_info *req_base,
-		     struct netlink_ext_ack *extack)
-{
-	struct phy_link_topology *topo;
-	struct phy_device_node *pdn;
-	struct phy_device *phydev;
-	unsigned long index;
-	size_t size;
-
-	ASSERT_RTNL();
-
-	topo = &req_base->dev->link_topo;
-
-	size = nla_total_size(0);
-
-	xa_for_each(&topo->phys, index, pdn) {
-		phydev = pdn->phy;
-
-		/* ETHTOOL_A_PHY_INDEX */
-		size += nla_total_size(sizeof(u32));
-
-		/* ETHTOOL_A_DRVNAME */
-		size += nla_total_size(strlen(phydev->drv->name) + 1);
-
-		/* ETHTOOL_A_NAME */
-		size += nla_total_size(strlen(dev_name(&phydev->mdio.dev)) + 1);
-
-		/* ETHTOOL_A_PHY_UPSTREAM_TYPE */
-		size += nla_total_size(sizeof(u8));
-
-		/* ETHTOOL_A_PHY_ID */
-		size += nla_total_size(sizeof(u32));
-
-		if (phy_on_sfp(phydev)) {
-			const char *upstream_sfp_name = sfp_get_name(pdn->parent_sfp_bus);
-
-			/* ETHTOOL_A_PHY_UPSTREAM_SFP_NAME */
-			if (upstream_sfp_name)
-				size += nla_total_size(strlen(upstream_sfp_name) + 1);
-
-			/* ETHTOOL_A_PHY_UPSTREAM_INDEX */
-			size += nla_total_size(sizeof(u32));
-		}
-
-		/* ETHTOOL_A_PHY_DOWNSTREAM_SFP_NAME */
-		if (phydev->sfp_bus) {
-			const char *sfp_name = sfp_get_name(phydev->sfp_bus);
-
-			if (sfp_name)
-				size += nla_total_size(strlen(sfp_name) + 1);
-		}
-	}
-
-	return size;
-}
-
-static int
-ethnl_phy_fill_reply(const struct ethnl_req_info *req_base, struct sk_buff *skb)
-{
-	struct phy_req_info *req_info = PHY_REQINFO(req_base);
-	struct phy_device_node *pdn = &req_info->pdn;
-	struct phy_device *phydev = pdn->phy;
-	enum phy_upstream ptype;
-	struct nlattr *nest;
-
-	ptype = pdn->upstream_type;
-
-	if (nla_put_u32(skb, ETHTOOL_A_PHY_INDEX, phydev->phyindex) ||
-	    nla_put_string(skb, ETHTOOL_A_PHY_DRVNAME, phydev->drv->name) ||
-	    nla_put_string(skb, ETHTOOL_A_PHY_NAME, dev_name(&phydev->mdio.dev)) ||
-	    nla_put_u8(skb, ETHTOOL_A_PHY_UPSTREAM_TYPE, ptype) ||
-	    nla_put_u32(skb, ETHTOOL_A_PHY_ID, phydev->phy_id))
-		return -EMSGSIZE;
-
-	if (ptype == PHY_UPSTREAM_PHY) {
-		struct phy_device *upstream = pdn->upstream.phydev;
-		const char *sfp_upstream_name;
-
-		nest = nla_nest_start(skb, ETHTOOL_A_PHY_UPSTREAM);
-		if (!nest)
-			return -EMSGSIZE;
-
-		/* Parent index */
-		if (nla_put_u32(skb, ETHTOOL_A_PHY_UPSTREAM_INDEX, upstream->phyindex))
-			return -EMSGSIZE;
-
-		if (pdn->parent_sfp_bus) {
-			sfp_upstream_name = sfp_get_name(pdn->parent_sfp_bus);
-			if (sfp_upstream_name && nla_put_string(skb,
-								ETHTOOL_A_PHY_UPSTREAM_SFP_NAME,
-								sfp_upstream_name))
-				return -EMSGSIZE;
-		}
-
-		nla_nest_end(skb, nest);
-	}
-
-	if (phydev->sfp_bus) {
-		const char *sfp_name = sfp_get_name(phydev->sfp_bus);
-
-		if (sfp_name &&
-		    nla_put_string(skb, ETHTOOL_A_PHY_DOWNSTREAM_SFP_NAME,
-				   sfp_name))
-			return -EMSGSIZE;
-	}
-
-	return 0;
-}
-
-static int ethnl_phy_parse_request(struct ethnl_req_info *req_base,
-				   struct nlattr **tb)
-{
-	struct phy_link_topology *topo = &req_base->dev->link_topo;
-	struct phy_req_info *req_info = PHY_REQINFO(req_base);
-	struct phy_device_node *pdn;
-
-	if (!req_base->phydev)
-		return 0;
-
-	pdn = xa_load(&topo->phys, req_base->phydev->phyindex);
-	memcpy(&req_info->pdn, pdn, sizeof(*pdn));
-
-	return 0;
-}
-
-int ethnl_phy_doit(struct sk_buff *skb, struct genl_info *info)
-{
-	struct phy_req_info req_info = {};
-	struct nlattr **tb = info->attrs;
-	struct sk_buff *rskb;
-	void *reply_payload;
-	int reply_len;
-	int ret;
-
-	ret = ethnl_parse_header_dev_get(&req_info.base,
-					 tb[ETHTOOL_A_PHY_HEADER],
-					 genl_info_net(info), info->extack,
-					 true);
-	if (ret < 0)
-		return ret;
-
-	rtnl_lock();
-
-	ret = ethnl_phy_parse_request(&req_info.base, tb);
-	if (ret < 0)
-		goto err_unlock_rtnl;
-
-	/* No PHY, return early */
-	if (!req_info.pdn.phy)
-		goto err_unlock_rtnl;
-
-	ret = ethnl_phy_reply_size(&req_info.base, info->extack);
-	if (ret < 0)
-		goto err_unlock_rtnl;
-	reply_len = ret + ethnl_reply_header_size();
-
-	rskb = ethnl_reply_init(reply_len, req_info.base.dev,
-				ETHTOOL_MSG_PHY_GET_REPLY,
-				ETHTOOL_A_PHY_HEADER,
-				info, &reply_payload);
-	if (!rskb) {
-		ret = -ENOMEM;
-		goto err_unlock_rtnl;
-	}
-
-	ret = ethnl_phy_fill_reply(&req_info.base, rskb);
-	if (ret)
-		goto err_free_msg;
-
-	rtnl_unlock();
-	ethnl_parse_header_dev_put(&req_info.base);
-	genlmsg_end(rskb, reply_payload);
-
-	return genlmsg_reply(rskb, info);
-
-err_free_msg:
-	nlmsg_free(rskb);
-err_unlock_rtnl:
-	rtnl_unlock();
-	ethnl_parse_header_dev_put(&req_info.base);
-	return ret;
-}
-
-struct ethnl_phy_dump_ctx {
-	struct phy_req_info	*phy_req_info;
-};
-
-int ethnl_phy_start(struct netlink_callback *cb)
-{
-	const struct genl_dumpit_info *info = genl_dumpit_info(cb);
-	struct ethnl_phy_dump_ctx *ctx = (void *)cb->ctx;
-	struct nlattr **tb = info->info.attrs;
-	int ret;
-
-	BUILD_BUG_ON(sizeof(*ctx) > sizeof(cb->ctx));
-
-	ctx->phy_req_info = kzalloc(sizeof(*ctx->phy_req_info), GFP_KERNEL);
-	if (!ctx->phy_req_info)
-		return -ENOMEM;
-
-	ret = ethnl_parse_header_dev_get(&ctx->phy_req_info->base,
-					 tb[ETHTOOL_A_PHY_HEADER],
-					 sock_net(cb->skb->sk), cb->extack,
-					 false);
-	return ret;
-}
-
-int ethnl_phy_done(struct netlink_callback *cb)
-{
-	struct ethnl_phy_dump_ctx *ctx = (void *)cb->ctx;
-
-	kfree(ctx->phy_req_info);
-
-	return 0;
-}
-
-static int ethnl_phy_dump_one_dev(struct sk_buff *skb, struct net_device *dev,
-				  struct netlink_callback *cb)
-{
-	struct ethnl_phy_dump_ctx *ctx = (void *)cb->ctx;
-	struct phy_req_info *pri = ctx->phy_req_info;
-	struct phy_device_node *pdn;
-	unsigned long index = 1;
-	int ret = 0;
-	void *ehdr;
-
-	pri->base.dev = dev;
-
-	xa_for_each(&dev->link_topo.phys, index, pdn) {
-		ehdr = ethnl_dump_put(skb, cb,
-				      ETHTOOL_MSG_PHY_GET_REPLY);
-		if (!ehdr) {
-			ret = -EMSGSIZE;
-			break;
-		}
-
-		ret = ethnl_fill_reply_header(skb, dev,
-					      ETHTOOL_A_PHY_HEADER);
-		if (ret < 0) {
-			genlmsg_cancel(skb, ehdr);
-			break;
-		}
-
-		memcpy(&pri->pdn, pdn, sizeof(*pdn));
-		ret = ethnl_phy_fill_reply(&pri->base, skb);
-
-		genlmsg_end(skb, ehdr);
-	}
-
-	return ret;
-}
-
-int ethnl_phy_dumpit(struct sk_buff *skb, struct netlink_callback *cb)
-{
-	struct ethnl_phy_dump_ctx *ctx = (void *)cb->ctx;
-	struct net *net = sock_net(skb->sk);
-	unsigned long ifindex = 1;
-	struct net_device *dev;
-	int ret = 0;
-
-	rtnl_lock();
-
-	if (ctx->phy_req_info->base.dev) {
-		ret = ethnl_phy_dump_one_dev(skb, ctx->phy_req_info->base.dev, cb);
-		ethnl_parse_header_dev_put(&ctx->phy_req_info->base);
-		ctx->phy_req_info->base.dev = NULL;
-	} else {
-		for_each_netdev_dump(net, dev, ifindex) {
-			ret = ethnl_phy_dump_one_dev(skb, dev, cb);
-			if (ret)
-				break;
-		}
-	}
-	rtnl_unlock();
-
-	if (ret == -EMSGSIZE && skb->len)
-		return skb->len;
-	return ret;
-}
-
diff --git a/net/ethtool/plca.c b/net/ethtool/plca.c
index 2b3e419f4dc2..b1e2e3b5027f 100644
--- a/net/ethtool/plca.c
+++ b/net/ethtool/plca.c
@@ -61,7 +61,7 @@ static int plca_get_cfg_prepare_data(const struct ethnl_req_info *req_base,
 	int ret;
 
 	// check that the PHY device is available and connected
-	if (!req_base->phydev) {
+	if (!dev->phydev) {
 		ret = -EOPNOTSUPP;
 		goto out;
 	}
@@ -80,7 +80,7 @@ static int plca_get_cfg_prepare_data(const struct ethnl_req_info *req_base,
 	memset(&data->plca_cfg, 0xff,
 	       sizeof_field(struct plca_reply_data, plca_cfg));
 
-	ret = ops->get_plca_cfg(req_base->phydev, &data->plca_cfg);
+	ret = ops->get_plca_cfg(dev->phydev, &data->plca_cfg);
 	ethnl_ops_complete(dev);
 
 out:
@@ -141,6 +141,7 @@ const struct nla_policy ethnl_plca_set_cfg_policy[] = {
 static int
 ethnl_set_plca(struct ethnl_req_info *req_info, struct genl_info *info)
 {
+	struct net_device *dev = req_info->dev;
 	const struct ethtool_phy_ops *ops;
 	struct nlattr **tb = info->attrs;
 	struct phy_plca_cfg plca_cfg;
@@ -148,7 +149,7 @@ ethnl_set_plca(struct ethnl_req_info *req_info, struct genl_info *info)
 	int ret;
 
 	// check that the PHY device is available and connected
-	if (!req_info->phydev)
+	if (!dev->phydev)
 		return -EOPNOTSUPP;
 
 	ops = ethtool_phy_ops;
@@ -167,7 +168,7 @@ ethnl_set_plca(struct ethnl_req_info *req_info, struct genl_info *info)
 	if (!mod)
 		return 0;
 
-	ret = ops->set_plca_cfg(req_info->phydev, &plca_cfg, info->extack);
+	ret = ops->set_plca_cfg(dev->phydev, &plca_cfg, info->extack);
 	return ret < 0 ? ret : 1;
 }
 
@@ -203,7 +204,7 @@ static int plca_get_status_prepare_data(const struct ethnl_req_info *req_base,
 	int ret;
 
 	// check that the PHY device is available and connected
-	if (!req_base->phydev) {
+	if (!dev->phydev) {
 		ret = -EOPNOTSUPP;
 		goto out;
 	}
@@ -222,7 +223,7 @@ static int plca_get_status_prepare_data(const struct ethnl_req_info *req_base,
 	memset(&data->plca_st, 0xff,
 	       sizeof_field(struct plca_reply_data, plca_st));
 
-	ret = ops->get_plca_status(req_base->phydev, &data->plca_st);
+	ret = ops->get_plca_status(dev->phydev, &data->plca_st);
 	ethnl_ops_complete(dev);
 out:
 	return ret;
diff --git a/net/ethtool/pse-pd.c b/net/ethtool/pse-pd.c
index 4a1c8d37bd3d..cc478af77111 100644
--- a/net/ethtool/pse-pd.c
+++ b/net/ethtool/pse-pd.c
@@ -31,10 +31,12 @@ const struct nla_policy ethnl_pse_get_policy[ETHTOOL_A_PSE_HEADER + 1] = {
 	[ETHTOOL_A_PSE_HEADER] = NLA_POLICY_NESTED(ethnl_header_policy),
 };
 
-static int pse_get_pse_attributes(struct phy_device *phydev,
+static int pse_get_pse_attributes(struct net_device *dev,
 				  struct netlink_ext_ack *extack,
 				  struct pse_reply_data *data)
 {
+	struct phy_device *phydev = dev->phydev;
+
 	if (!phydev) {
 		NL_SET_ERR_MSG(extack, "No PHY is attached");
 		return -EOPNOTSUPP;
@@ -62,7 +64,7 @@ static int pse_prepare_data(const struct ethnl_req_info *req_base,
 	if (ret < 0)
 		return ret;
 
-	ret = pse_get_pse_attributes(req_base->phydev, info->extack, data);
+	ret = pse_get_pse_attributes(dev, info->extack, data);
 
 	ethnl_ops_complete(dev);
 
@@ -122,6 +124,7 @@ ethnl_set_pse_validate(struct ethnl_req_info *req_info, struct genl_info *info)
 static int
 ethnl_set_pse(struct ethnl_req_info *req_info, struct genl_info *info)
 {
+	struct net_device *dev = req_info->dev;
 	struct pse_control_config config = {};
 	struct nlattr **tb = info->attrs;
 	struct phy_device *phydev;
@@ -129,7 +132,7 @@ ethnl_set_pse(struct ethnl_req_info *req_info, struct genl_info *info)
 	/* this values are already validated by the ethnl_pse_set_policy */
 	config.admin_cotrol = nla_get_u32(tb[ETHTOOL_A_PODL_PSE_ADMIN_CONTROL]);
 
-	phydev = req_info->phydev;
+	phydev = dev->phydev;
 	if (!phydev) {
 		NL_SET_ERR_MSG(info->extack, "No PHY is attached");
 		return -EOPNOTSUPP;
diff --git a/net/ethtool/strset.c b/net/ethtool/strset.c
index 70c00631c51f..c678b484a079 100644
--- a/net/ethtool/strset.c
+++ b/net/ethtool/strset.c
@@ -233,18 +233,17 @@ static void strset_cleanup_data(struct ethnl_reply_data *reply_base)
 }
 
 static int strset_prepare_set(struct strset_info *info, struct net_device *dev,
-			      struct phy_device *phydev, unsigned int id,
-			      bool counts_only)
+			      unsigned int id, bool counts_only)
 {
 	const struct ethtool_phy_ops *phy_ops = ethtool_phy_ops;
 	const struct ethtool_ops *ops = dev->ethtool_ops;
 	void *strings;
 	int count, ret;
 
-	if (id == ETH_SS_PHY_STATS && phydev &&
+	if (id == ETH_SS_PHY_STATS && dev->phydev &&
 	    !ops->get_ethtool_phy_stats && phy_ops &&
 	    phy_ops->get_sset_count)
-		ret = phy_ops->get_sset_count(phydev);
+		ret = phy_ops->get_sset_count(dev->phydev);
 	else if (ops->get_sset_count && ops->get_strings)
 		ret = ops->get_sset_count(dev, id);
 	else
@@ -259,10 +258,10 @@ static int strset_prepare_set(struct strset_info *info, struct net_device *dev,
 		strings = kcalloc(count, ETH_GSTRING_LEN, GFP_KERNEL);
 		if (!strings)
 			return -ENOMEM;
-		if (id == ETH_SS_PHY_STATS && phydev &&
+		if (id == ETH_SS_PHY_STATS && dev->phydev &&
 		    !ops->get_ethtool_phy_stats && phy_ops &&
 		    phy_ops->get_strings)
-			phy_ops->get_strings(phydev, strings);
+			phy_ops->get_strings(dev->phydev, strings);
 		else
 			ops->get_strings(dev, id, strings);
 		info->strings = strings;
@@ -306,8 +305,8 @@ static int strset_prepare_data(const struct ethnl_req_info *req_base,
 		    !data->sets[i].per_dev)
 			continue;
 
-		ret = strset_prepare_set(&data->sets[i], dev, req_base->phydev,
-					 i, req_info->counts_only);
+		ret = strset_prepare_set(&data->sets[i], dev, i,
+					 req_info->counts_only);
 		if (ret < 0)
 			goto err_ops;
 	}
-- 
cgit v1.2.3


From 19bfcdf9498aa968ea293417fbbc39e523527ca8 Mon Sep 17 00:00:00 2001
From: Dmitrii Dolgov <9erthalion6@gmail.com>
Date: Wed, 3 Jan 2024 20:05:44 +0100
Subject: bpf: Relax tracing prog recursive attach rules

Currently, it's not allowed to attach an fentry/fexit prog to another
one fentry/fexit. At the same time it's not uncommon to see a tracing
program with lots of logic in use, and the attachment limitation
prevents usage of fentry/fexit for performance analysis (e.g. with
"bpftool prog profile" command) in this case. An example could be
falcosecurity libs project that uses tp_btf tracing programs.

Following the corresponding discussion [1], the reason for that is to
avoid tracing progs call cycles without introducing more complex
solutions. But currently it seems impossible to load and attach tracing
programs in a way that will form such a cycle. The limitation is coming
from the fact that attach_prog_fd is specified at the prog load (thus
making it impossible to attach to a program loaded after it in this
way), as well as tracing progs not implementing link_detach.

Replace "no same type" requirement with verification that no more than
one level of attachment nesting is allowed. In this way only one
fentry/fexit program could be attached to another fentry/fexit to cover
profiling use case, and still no cycle could be formed. To implement,
add a new field into bpf_prog_aux to track nested attachment for tracing
programs.

[1]: https://lore.kernel.org/bpf/20191108064039.2041889-16-ast@kernel.org/

Acked-by: Jiri Olsa <olsajiri@gmail.com>
Acked-by: Song Liu <song@kernel.org>
Signed-off-by: Dmitrii Dolgov <9erthalion6@gmail.com>
Link: https://lore.kernel.org/r/20240103190559.14750-2-9erthalion6@gmail.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/bpf.h   |  1 +
 kernel/bpf/syscall.c  | 23 ++++++++++++++++++++++-
 kernel/bpf/verifier.c | 39 +++++++++++++++++++++++++--------------
 3 files changed, 48 insertions(+), 15 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 7671530d6e4e..e30100597d0a 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -1449,6 +1449,7 @@ struct bpf_prog_aux {
 	bool dev_bound; /* Program is bound to the netdev. */
 	bool offload_requested; /* Program is bound and offloaded to the netdev. */
 	bool attach_btf_trace; /* true if attaching to BTF-enabled raw tp */
+	bool attach_tracing_prog; /* true if tracing another tracing program */
 	bool func_proto_unreliable;
 	bool sleepable;
 	bool tail_call_reachable;
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 1bf9805ee185..d15f9998bc20 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -2738,6 +2738,22 @@ static int bpf_prog_load(union bpf_attr *attr, bpfptr_t uattr, u32 uattr_size)
 			goto free_prog_sec;
 	}
 
+	/*
+	 * Bookkeeping for managing the program attachment chain.
+	 *
+	 * It might be tempting to set attach_tracing_prog flag at the attachment
+	 * time, but this will not prevent from loading bunch of tracing prog
+	 * first, then attach them one to another.
+	 *
+	 * The flag attach_tracing_prog is set for the whole program lifecycle, and
+	 * doesn't have to be cleared in bpf_tracing_link_release, since tracing
+	 * programs cannot change attachment target.
+	 */
+	if (type == BPF_PROG_TYPE_TRACING && dst_prog &&
+	    dst_prog->type == BPF_PROG_TYPE_TRACING) {
+		prog->aux->attach_tracing_prog = true;
+	}
+
 	/* find program type: socket_filter vs tracing_filter */
 	err = find_prog_type(type, prog);
 	if (err < 0)
@@ -3171,7 +3187,12 @@ static int bpf_tracing_prog_attach(struct bpf_prog *prog,
 	}
 
 	if (tgt_prog_fd) {
-		/* For now we only allow new targets for BPF_PROG_TYPE_EXT */
+		/*
+		 * For now we only allow new targets for BPF_PROG_TYPE_EXT. If this
+		 * part would be changed to implement the same for
+		 * BPF_PROG_TYPE_TRACING, do not forget to update the way how
+		 * attach_tracing_prog flag is set.
+		 */
 		if (prog->type != BPF_PROG_TYPE_EXT) {
 			err = -EINVAL;
 			goto out_put_prog;
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index d5f4ff1eb235..adbf330d364b 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -20317,6 +20317,7 @@ int bpf_check_attach_target(struct bpf_verifier_log *log,
 			    struct bpf_attach_target_info *tgt_info)
 {
 	bool prog_extension = prog->type == BPF_PROG_TYPE_EXT;
+	bool prog_tracing = prog->type == BPF_PROG_TYPE_TRACING;
 	const char prefix[] = "btf_trace_";
 	int ret = 0, subprog = -1, i;
 	const struct btf_type *t;
@@ -20387,10 +20388,21 @@ int bpf_check_attach_target(struct bpf_verifier_log *log,
 			bpf_log(log, "Can attach to only JITed progs\n");
 			return -EINVAL;
 		}
-		if (tgt_prog->type == prog->type) {
-			/* Cannot fentry/fexit another fentry/fexit program.
-			 * Cannot attach program extension to another extension.
-			 * It's ok to attach fentry/fexit to extension program.
+		if (prog_tracing) {
+			if (aux->attach_tracing_prog) {
+				/*
+				 * Target program is an fentry/fexit which is already attached
+				 * to another tracing program. More levels of nesting
+				 * attachment are not allowed.
+				 */
+				bpf_log(log, "Cannot nest tracing program attach more than once\n");
+				return -EINVAL;
+			}
+		} else if (tgt_prog->type == prog->type) {
+			/*
+			 * To avoid potential call chain cycles, prevent attaching of a
+			 * program extension to another extension. It's ok to attach
+			 * fentry/fexit to extension program.
 			 */
 			bpf_log(log, "Cannot recursively attach\n");
 			return -EINVAL;
@@ -20403,16 +20415,15 @@ int bpf_check_attach_target(struct bpf_verifier_log *log,
 			 * except fentry/fexit. The reason is the following.
 			 * The fentry/fexit programs are used for performance
 			 * analysis, stats and can be attached to any program
-			 * type except themselves. When extension program is
-			 * replacing XDP function it is necessary to allow
-			 * performance analysis of all functions. Both original
-			 * XDP program and its program extension. Hence
-			 * attaching fentry/fexit to BPF_PROG_TYPE_EXT is
-			 * allowed. If extending of fentry/fexit was allowed it
-			 * would be possible to create long call chain
-			 * fentry->extension->fentry->extension beyond
-			 * reasonable stack size. Hence extending fentry is not
-			 * allowed.
+			 * type. When extension program is replacing XDP function
+			 * it is necessary to allow performance analysis of all
+			 * functions. Both original XDP program and its program
+			 * extension. Hence attaching fentry/fexit to
+			 * BPF_PROG_TYPE_EXT is allowed. If extending of
+			 * fentry/fexit was allowed it would be possible to create
+			 * long call chain fentry->extension->fentry->extension
+			 * beyond reasonable stack size. Hence extending fentry
+			 * is not allowed.
 			 */
 			bpf_log(log, "Cannot extend fentry/fexit\n");
 			return -EINVAL;
-- 
cgit v1.2.3


From 990b6b5b13b7993b7f44740c0add3119d407ccbf Mon Sep 17 00:00:00 2001
From: Zhihao Cheng <chengzhihao1@huawei.com>
Date: Wed, 13 Dec 2023 09:32:20 +0800
Subject: jbd2: add errseq to detect client fs's bdev writeback error

Add errseq in journal, so that JBD2 can detect whether metadata is
successfully written to fs bdev. This patch adds detection in recovery
process to replace original solution(using local variable wb_err).

Signed-off-by: Zhihao Cheng <chengzhihao1@huawei.com>
Suggested-by: Jan Kara <jack@suse.cz>
Reviewed-by: Jan Kara <jack@suse.cz>
Link: https://lore.kernel.org/r/20231213013224.2100050-2-chengzhihao1@huawei.com
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
---
 fs/jbd2/journal.c    |  1 +
 fs/jbd2/recovery.c   |  7 +------
 include/linux/jbd2.h | 26 ++++++++++++++++++++++++++
 3 files changed, 28 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
index 206cb53ef2b0..559938a82379 100644
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -1534,6 +1534,7 @@ static journal_t *journal_init_common(struct block_device *bdev,
 	journal->j_fs_dev = fs_dev;
 	journal->j_blk_offset = start;
 	journal->j_total_len = len;
+	jbd2_init_fs_dev_write_error(journal);
 
 	err = journal_load_superblock(journal);
 	if (err)
diff --git a/fs/jbd2/recovery.c b/fs/jbd2/recovery.c
index 01f744cb97a4..1f7664984d6e 100644
--- a/fs/jbd2/recovery.c
+++ b/fs/jbd2/recovery.c
@@ -289,8 +289,6 @@ int jbd2_journal_recover(journal_t *journal)
 	journal_superblock_t *	sb;
 
 	struct recovery_info	info;
-	errseq_t		wb_err;
-	struct address_space	*mapping;
 
 	memset(&info, 0, sizeof(info));
 	sb = journal->j_superblock;
@@ -308,9 +306,6 @@ int jbd2_journal_recover(journal_t *journal)
 		return 0;
 	}
 
-	wb_err = 0;
-	mapping = journal->j_fs_dev->bd_inode->i_mapping;
-	errseq_check_and_advance(&mapping->wb_err, &wb_err);
 	err = do_one_pass(journal, &info, PASS_SCAN);
 	if (!err)
 		err = do_one_pass(journal, &info, PASS_REVOKE);
@@ -334,7 +329,7 @@ int jbd2_journal_recover(journal_t *journal)
 	err2 = sync_blockdev(journal->j_fs_dev);
 	if (!err)
 		err = err2;
-	err2 = errseq_check_and_advance(&mapping->wb_err, &wb_err);
+	err2 = jbd2_check_fs_dev_write_error(journal);
 	if (!err)
 		err = err2;
 	/* Make sure all replayed data is on permanent storage */
diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h
index beb30719ee16..cea1aa70ae36 100644
--- a/include/linux/jbd2.h
+++ b/include/linux/jbd2.h
@@ -998,6 +998,13 @@ struct journal_s
 	 */
 	struct block_device	*j_fs_dev;
 
+	/**
+	 * @j_fs_dev_wb_err:
+	 *
+	 * Records the errseq of the client fs's backing block device.
+	 */
+	errseq_t		j_fs_dev_wb_err;
+
 	/**
 	 * @j_total_len: Total maximum capacity of the journal region on disk.
 	 */
@@ -1698,6 +1705,25 @@ static inline void jbd2_journal_abort_handle(handle_t *handle)
 	handle->h_aborted = 1;
 }
 
+static inline void jbd2_init_fs_dev_write_error(journal_t *journal)
+{
+	struct address_space *mapping = journal->j_fs_dev->bd_inode->i_mapping;
+
+	/*
+	 * Save the original wb_err value of client fs's bdev mapping which
+	 * could be used to detect the client fs's metadata async write error.
+	 */
+	errseq_check_and_advance(&mapping->wb_err, &journal->j_fs_dev_wb_err);
+}
+
+static inline int jbd2_check_fs_dev_write_error(journal_t *journal)
+{
+	struct address_space *mapping = journal->j_fs_dev->bd_inode->i_mapping;
+
+	return errseq_check(&mapping->wb_err,
+			    READ_ONCE(journal->j_fs_dev_wb_err));
+}
+
 #endif /* __KERNEL__   */
 
 /* Comparison functions for transaction IDs: perform comparisons using
-- 
cgit v1.2.3


From 8a4fd33d879fb303b207f06ea6340d73f698c4ed Mon Sep 17 00:00:00 2001
From: Zhihao Cheng <chengzhihao1@huawei.com>
Date: Wed, 13 Dec 2023 09:32:22 +0800
Subject: jbd2: remove unused 'JBD2_CHECKPOINT_IO_ERROR' and 'j_atomic_flags'

Since 'JBD2_CHECKPOINT_IO_ERROR' and j_atomic_flags' are not useful
anymore after fs dev's errseq is imported into jbd2, just remove them.

Signed-off-by: Zhihao Cheng <chengzhihao1@huawei.com>
Reviewed-by: Jan Kara <jack@suse.cz>
Link: https://lore.kernel.org/r/20231213013224.2100050-4-chengzhihao1@huawei.com
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
---
 fs/jbd2/checkpoint.c | 11 -----------
 include/linux/jbd2.h | 11 -----------
 2 files changed, 22 deletions(-)

(limited to 'include/linux')

diff --git a/fs/jbd2/checkpoint.c b/fs/jbd2/checkpoint.c
index 118699fff2f9..1c97e64c4784 100644
--- a/fs/jbd2/checkpoint.c
+++ b/fs/jbd2/checkpoint.c
@@ -556,7 +556,6 @@ int __jbd2_journal_remove_checkpoint(struct journal_head *jh)
 	struct transaction_chp_stats_s *stats;
 	transaction_t *transaction;
 	journal_t *journal;
-	struct buffer_head *bh = jh2bh(jh);
 
 	JBUFFER_TRACE(jh, "entry");
 
@@ -569,16 +568,6 @@ int __jbd2_journal_remove_checkpoint(struct journal_head *jh)
 
 	JBUFFER_TRACE(jh, "removing from transaction");
 
-	/*
-	 * If we have failed to write the buffer out to disk, the filesystem
-	 * may become inconsistent. We cannot abort the journal here since
-	 * we hold j_list_lock and we have to be careful about races with
-	 * jbd2_journal_destroy(). So mark the writeback IO error in the
-	 * journal here and we abort the journal later from a better context.
-	 */
-	if (buffer_write_io_error(bh))
-		set_bit(JBD2_CHECKPOINT_IO_ERROR, &journal->j_atomic_flags);
-
 	__buffer_unlink(jh);
 	jh->b_cp_transaction = NULL;
 	percpu_counter_dec(&journal->j_checkpoint_jh_count);
diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h
index cea1aa70ae36..971f3e826e15 100644
--- a/include/linux/jbd2.h
+++ b/include/linux/jbd2.h
@@ -755,11 +755,6 @@ struct journal_s
 	 */
 	unsigned long		j_flags;
 
-	/**
-	 * @j_atomic_flags: Atomic journaling state flags.
-	 */
-	unsigned long		j_atomic_flags;
-
 	/**
 	 * @j_errno:
 	 *
@@ -1406,12 +1401,6 @@ JBD2_FEATURE_INCOMPAT_FUNCS(fast_commit,	FAST_COMMIT)
 #define JBD2_JOURNAL_FLUSH_VALID	(JBD2_JOURNAL_FLUSH_DISCARD | \
 					JBD2_JOURNAL_FLUSH_ZEROOUT)
 
-/*
- * Journal atomic flag definitions
- */
-#define JBD2_CHECKPOINT_IO_ERROR	0x001	/* Detect io error while writing
-						 * buffer back to disk */
-
 /*
  * Function declarations for the journaling transaction and buffer
  * management
-- 
cgit v1.2.3


From c2e64baac4f36f7e0365218c7523bb9ba4639250 Mon Sep 17 00:00:00 2001
From: Thierry Reding <thierry.reding@gmail.com>
Date: Thu, 21 Dec 2023 11:08:02 +0100
Subject: pwm: Add pwm_apply_state() compatibility stub
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

In order to make the transition to the new pwm_apply_might_sleep() a bit
smoother, add a compatibility stub. This will prevent new calls to the
old function introduced via other subsystems from breaking builds. Once
the next merge window has closed we can take another stab at removing
the stub.

Reviewed-by: Uwe Kleine-König <u.kleine-koenig@pengutronix.de>
Signed-off-by: Thierry Reding <thierry.reding@gmail.com>
---
 include/linux/pwm.h | 7 +++++++
 1 file changed, 7 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/pwm.h b/include/linux/pwm.h
index 5dd665d8c909..2a5e1154652e 100644
--- a/include/linux/pwm.h
+++ b/include/linux/pwm.h
@@ -537,6 +537,13 @@ static inline void pwm_apply_args(struct pwm_device *pwm)
 	pwm_apply_might_sleep(pwm, &state);
 }
 
+/* only for backwards-compatibility, new code should not use this */
+static inline int pwm_apply_state(struct pwm_device *pwm,
+				  const struct pwm_state *state)
+{
+	return pwm_apply_might_sleep(pwm, state);
+}
+
 struct pwm_lookup {
 	struct list_head list;
 	const char *provider;
-- 
cgit v1.2.3


From d73f444d06fb8a42a5c0623453f3ea1fe9880229 Mon Sep 17 00:00:00 2001
From: Randy Dunlap <rdunlap@infradead.org>
Date: Fri, 22 Dec 2023 21:06:20 -0800
Subject: pwm: linux/pwm.h: fix Excess kernel-doc description warning
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Remove the @pwm: line to prevent the kernel-doc warning:

include/linux/pwm.h:87: warning: Excess struct member 'pwm' description in 'pwm_device'

Signed-off-by: Randy Dunlap <rdunlap@infradead.org>
Cc: Thierry Reding <thierry.reding@gmail.com>
Cc: Uwe Kleine-König <u.kleine-koenig@pengutronix.de>
Cc:  <linux-pwm@vger.kernel.org>
Fixes: f3e25e68ceb2 ("pwm: Drop unused member "pwm" from struct pwm_device")
Reviewed-by: Uwe Kleine-König <u.kleine-koenig@pengutronix.de>
Signed-off-by: Thierry Reding <thierry.reding@gmail.com>
---
 include/linux/pwm.h | 1 -
 1 file changed, 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/pwm.h b/include/linux/pwm.h
index 2a5e1154652e..fcc2c4496f73 100644
--- a/include/linux/pwm.h
+++ b/include/linux/pwm.h
@@ -69,7 +69,6 @@ struct pwm_state {
  * @label: name of the PWM device
  * @flags: flags associated with the PWM device
  * @hwpwm: per-chip relative index of the PWM device
- * @pwm: global index of the PWM device
  * @chip: PWM chip providing this PWM device
  * @args: PWM arguments
  * @state: last applied state
-- 
cgit v1.2.3


From 92a714d727ec9e7ccfcc7432d348aba730145914 Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Thu, 4 Jan 2024 15:52:11 +0000
Subject: netfs: Fix interaction between write-streaming and cachefiles culling

An issue can occur between write-streaming (storing dirty data in partial
non-uptodate pages) and a cachefiles object being culled to make space.
The problem occurs because the cache object is only marked in use while
there are files open using it.  Once it has been released, it can be culled
and the cookie marked disabled.

At this point, a streaming write is permitted to occur (if the cache is
active, we require pages to be prefetched and cached), but the cache can
become active again before this gets flushed out - and then two effects can
occur:

 (1) The cache may be asked to write out a region that's less than its DIO
     block size (assumed by cachefiles to be PAGE_SIZE) - and this causes
     one of two debugging statements to be emitted.

 (2) netfs_how_to_modify() gets confused because it sees a page that isn't
     allowed to be non-uptodate being uptodate and tries to prefetch it -
     leading to a warning that PG_fscache is set twice.

Fix this by the following means:

 (1) Add a netfs_inode flag to disallow write-streaming to an inode and set
     it if we ever do local caching of that inode.  It remains set for the
     lifetime of that inode - even if the cookie becomes disabled.

 (2) If the no-write-streaming flag is set, then make netfs_how_to_modify()
     always want to prefetch instead.

 (3) If netfs_how_to_modify() decides it wants to prefetch a folio, but
     that folio has write-streamed data in it, then it requires the folio
     be flushed first.

 (4) Export a counter of the number of times we wanted to prefetch a
     non-uptodate page, but found it had write-streamed data in it.

 (5) Export a counter of the number of times we cancelled a write to the
     cache because it didn't DIO align and remove the debug statements.

Reported-by: Marc Dionne <marc.dionne@auristor.com>
Signed-off-by: David Howells <dhowells@redhat.com>
cc: Jeff Layton <jlayton@kernel.org>
cc: linux-cachefs@redhat.com
cc: linux-erofs@lists.ozlabs.org
cc: linux-fsdevel@vger.kernel.org
cc: linux-mm@kvack.org
---
 fs/cachefiles/io.c            | 12 ++++++------
 fs/netfs/buffered_write.c     | 24 ++++++++++++++++++++----
 fs/netfs/fscache_stats.c      |  9 ++++++---
 fs/netfs/internal.h           |  1 +
 fs/netfs/stats.c              |  6 ++++--
 include/linux/fscache-cache.h |  3 +++
 include/linux/netfs.h         |  1 +
 7 files changed, 41 insertions(+), 15 deletions(-)

(limited to 'include/linux')

diff --git a/fs/cachefiles/io.c b/fs/cachefiles/io.c
index 7529b40bc95a..3eec26967437 100644
--- a/fs/cachefiles/io.c
+++ b/fs/cachefiles/io.c
@@ -528,12 +528,12 @@ int __cachefiles_prepare_write(struct cachefiles_object *object,
 
 	/* Round to DIO size */
 	start = round_down(*_start, PAGE_SIZE);
-	if (start != *_start) {
-		kleave(" = -ENOBUFS [down]");
-		return -ENOBUFS;
-	}
-	if (*_len > upper_len) {
-		kleave(" = -ENOBUFS [up]");
+	if (start != *_start || *_len > upper_len) {
+		/* Probably asked to cache a streaming write written into the
+		 * pagecache when the cookie was temporarily out of service to
+		 * culling.
+		 */
+		fscache_count_dio_misfit();
 		return -ENOBUFS;
 	}
 
diff --git a/fs/netfs/buffered_write.c b/fs/netfs/buffered_write.c
index 08f28800232c..6cd8f7422e9a 100644
--- a/fs/netfs/buffered_write.c
+++ b/fs/netfs/buffered_write.c
@@ -80,10 +80,19 @@ static enum netfs_how_to_modify netfs_how_to_modify(struct netfs_inode *ctx,
 		return NETFS_WHOLE_FOLIO_MODIFY;
 
 	if (file->f_mode & FMODE_READ)
-		return NETFS_JUST_PREFETCH;
-
-	if (netfs_is_cache_enabled(ctx))
-		return NETFS_JUST_PREFETCH;
+		goto no_write_streaming;
+	if (test_bit(NETFS_ICTX_NO_WRITE_STREAMING, &ctx->flags))
+		goto no_write_streaming;
+
+	if (netfs_is_cache_enabled(ctx)) {
+		/* We don't want to get a streaming write on a file that loses
+		 * caching service temporarily because the backing store got
+		 * culled.
+		 */
+		if (!test_bit(NETFS_ICTX_NO_WRITE_STREAMING, &ctx->flags))
+			set_bit(NETFS_ICTX_NO_WRITE_STREAMING, &ctx->flags);
+		goto no_write_streaming;
+	}
 
 	if (!finfo)
 		return NETFS_STREAMING_WRITE;
@@ -95,6 +104,13 @@ static enum netfs_how_to_modify netfs_how_to_modify(struct netfs_inode *ctx,
 	if (offset == finfo->dirty_offset + finfo->dirty_len)
 		return NETFS_STREAMING_WRITE_CONT;
 	return NETFS_FLUSH_CONTENT;
+
+no_write_streaming:
+	if (finfo) {
+		netfs_stat(&netfs_n_wh_wstream_conflict);
+		return NETFS_FLUSH_CONTENT;
+	}
+	return NETFS_JUST_PREFETCH;
 }
 
 /*
diff --git a/fs/netfs/fscache_stats.c b/fs/netfs/fscache_stats.c
index aad812ead398..add21abdf713 100644
--- a/fs/netfs/fscache_stats.c
+++ b/fs/netfs/fscache_stats.c
@@ -48,13 +48,15 @@ atomic_t fscache_n_no_create_space;
 EXPORT_SYMBOL(fscache_n_no_create_space);
 atomic_t fscache_n_culled;
 EXPORT_SYMBOL(fscache_n_culled);
+atomic_t fscache_n_dio_misfit;
+EXPORT_SYMBOL(fscache_n_dio_misfit);
 
 /*
  * display the general statistics
  */
 int fscache_stats_show(struct seq_file *m)
 {
-	seq_puts(m, "FS-Cache statistics\n");
+	seq_puts(m, "-- FS-Cache statistics --\n");
 	seq_printf(m, "Cookies: n=%d v=%d vcol=%u voom=%u\n",
 		   atomic_read(&fscache_n_cookies),
 		   atomic_read(&fscache_n_volumes),
@@ -93,8 +95,9 @@ int fscache_stats_show(struct seq_file *m)
 		   atomic_read(&fscache_n_no_create_space),
 		   atomic_read(&fscache_n_culled));
 
-	seq_printf(m, "IO     : rd=%u wr=%u\n",
+	seq_printf(m, "IO     : rd=%u wr=%u mis=%u\n",
 		   atomic_read(&fscache_n_read),
-		   atomic_read(&fscache_n_write));
+		   atomic_read(&fscache_n_write),
+		   atomic_read(&fscache_n_dio_misfit));
 	return 0;
 }
diff --git a/fs/netfs/internal.h b/fs/netfs/internal.h
index 3f9620d0fa63..ec7045d24400 100644
--- a/fs/netfs/internal.h
+++ b/fs/netfs/internal.h
@@ -123,6 +123,7 @@ extern atomic_t netfs_n_rh_write_begin;
 extern atomic_t netfs_n_rh_write_done;
 extern atomic_t netfs_n_rh_write_failed;
 extern atomic_t netfs_n_rh_write_zskip;
+extern atomic_t netfs_n_wh_wstream_conflict;
 extern atomic_t netfs_n_wh_upload;
 extern atomic_t netfs_n_wh_upload_done;
 extern atomic_t netfs_n_wh_upload_failed;
diff --git a/fs/netfs/stats.c b/fs/netfs/stats.c
index 42db36528d92..deeba9f9dcf5 100644
--- a/fs/netfs/stats.c
+++ b/fs/netfs/stats.c
@@ -29,6 +29,7 @@ atomic_t netfs_n_rh_write_begin;
 atomic_t netfs_n_rh_write_done;
 atomic_t netfs_n_rh_write_failed;
 atomic_t netfs_n_rh_write_zskip;
+atomic_t netfs_n_wh_wstream_conflict;
 atomic_t netfs_n_wh_upload;
 atomic_t netfs_n_wh_upload_done;
 atomic_t netfs_n_wh_upload_failed;
@@ -66,9 +67,10 @@ int netfs_stats_show(struct seq_file *m, void *v)
 		   atomic_read(&netfs_n_wh_write),
 		   atomic_read(&netfs_n_wh_write_done),
 		   atomic_read(&netfs_n_wh_write_failed));
-	seq_printf(m, "Netfs  : rr=%u sr=%u\n",
+	seq_printf(m, "Netfs  : rr=%u sr=%u wsc=%u\n",
 		   atomic_read(&netfs_n_rh_rreq),
-		   atomic_read(&netfs_n_rh_sreq));
+		   atomic_read(&netfs_n_rh_sreq),
+		   atomic_read(&netfs_n_wh_wstream_conflict));
 	return fscache_stats_show(m);
 }
 EXPORT_SYMBOL(netfs_stats_show);
diff --git a/include/linux/fscache-cache.h b/include/linux/fscache-cache.h
index a174cedf4d90..bdf7f3eddf0a 100644
--- a/include/linux/fscache-cache.h
+++ b/include/linux/fscache-cache.h
@@ -189,17 +189,20 @@ extern atomic_t fscache_n_write;
 extern atomic_t fscache_n_no_write_space;
 extern atomic_t fscache_n_no_create_space;
 extern atomic_t fscache_n_culled;
+extern atomic_t fscache_n_dio_misfit;
 #define fscache_count_read() atomic_inc(&fscache_n_read)
 #define fscache_count_write() atomic_inc(&fscache_n_write)
 #define fscache_count_no_write_space() atomic_inc(&fscache_n_no_write_space)
 #define fscache_count_no_create_space() atomic_inc(&fscache_n_no_create_space)
 #define fscache_count_culled() atomic_inc(&fscache_n_culled)
+#define fscache_count_dio_misfit() atomic_inc(&fscache_n_dio_misfit)
 #else
 #define fscache_count_read() do {} while(0)
 #define fscache_count_write() do {} while(0)
 #define fscache_count_no_write_space() do {} while(0)
 #define fscache_count_no_create_space() do {} while(0)
 #define fscache_count_culled() do {} while(0)
+#define fscache_count_dio_misfit() do {} while(0)
 #endif
 
 #endif /* _LINUX_FSCACHE_CACHE_H */
diff --git a/include/linux/netfs.h b/include/linux/netfs.h
index d3bac60fcd6f..100cbb261269 100644
--- a/include/linux/netfs.h
+++ b/include/linux/netfs.h
@@ -142,6 +142,7 @@ struct netfs_inode {
 #define NETFS_ICTX_ODIRECT	0		/* The file has DIO in progress */
 #define NETFS_ICTX_UNBUFFERED	1		/* I/O should not use the pagecache */
 #define NETFS_ICTX_WRITETHROUGH	2		/* Write-through caching */
+#define NETFS_ICTX_NO_WRITE_STREAMING	3	/* Don't engage in write-streaming */
 };
 
 /*
-- 
cgit v1.2.3


From 8a6286c1804e2c7144aef3154a0357c4b496e10b Mon Sep 17 00:00:00 2001
From: Jiri Pirko <jiri@nvidia.com>
Date: Wed, 3 Jan 2024 14:28:36 +0100
Subject: dpll: expose fractional frequency offset value to user

Add a new netlink attribute to expose fractional frequency offset value
for a pin. Add an op to get the value from the driver.

Signed-off-by: Jiri Pirko <jiri@nvidia.com>
Acked-by: Vadim Fedorenko <vadim.fedorenko@linux.dev>
Acked-by: Arkadiusz Kubalewski <arkadiusz.kubalewski@intel.com>
Link: https://lore.kernel.org/r/20240103132838.1501801-2-jiri@resnulli.us
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 Documentation/netlink/specs/dpll.yaml | 11 +++++++++++
 drivers/dpll/dpll_netlink.c           | 24 ++++++++++++++++++++++++
 include/linux/dpll.h                  |  3 +++
 include/uapi/linux/dpll.h             |  1 +
 4 files changed, 39 insertions(+)

(limited to 'include/linux')

diff --git a/Documentation/netlink/specs/dpll.yaml b/Documentation/netlink/specs/dpll.yaml
index cf8abe1c0550..b14aed18065f 100644
--- a/Documentation/netlink/specs/dpll.yaml
+++ b/Documentation/netlink/specs/dpll.yaml
@@ -296,6 +296,16 @@ attribute-sets:
       -
         name: phase-offset
         type: s64
+      -
+        name: fractional-frequency-offset
+        type: sint
+        doc: |
+          The FFO (Fractional Frequency Offset) between the RX and TX
+          symbol rate on the media associated with the pin:
+          (rx_frequency-tx_frequency)/rx_frequency
+          Value is in PPM (parts per million).
+          This may be implemented for example for pin of type
+          PIN_TYPE_SYNCE_ETH_PORT.
   -
     name: pin-parent-device
     subset-of: pin
@@ -460,6 +470,7 @@ operations:
             - phase-adjust-min
             - phase-adjust-max
             - phase-adjust
+            - fractional-frequency-offset
 
       dump:
         pre: dpll-lock-dumpit
diff --git a/drivers/dpll/dpll_netlink.c b/drivers/dpll/dpll_netlink.c
index 21c627e9401a..3370dbddb86b 100644
--- a/drivers/dpll/dpll_netlink.c
+++ b/drivers/dpll/dpll_netlink.c
@@ -263,6 +263,27 @@ dpll_msg_add_phase_offset(struct sk_buff *msg, struct dpll_pin *pin,
 	return 0;
 }
 
+static int dpll_msg_add_ffo(struct sk_buff *msg, struct dpll_pin *pin,
+			    struct dpll_pin_ref *ref,
+			    struct netlink_ext_ack *extack)
+{
+	const struct dpll_pin_ops *ops = dpll_pin_ops(ref);
+	struct dpll_device *dpll = ref->dpll;
+	s64 ffo;
+	int ret;
+
+	if (!ops->ffo_get)
+		return 0;
+	ret = ops->ffo_get(pin, dpll_pin_on_dpll_priv(dpll, pin),
+			   dpll, dpll_priv(dpll), &ffo, extack);
+	if (ret) {
+		if (ret == -ENODATA)
+			return 0;
+		return ret;
+	}
+	return nla_put_sint(msg, DPLL_A_PIN_FRACTIONAL_FREQUENCY_OFFSET, ffo);
+}
+
 static int
 dpll_msg_add_pin_freq(struct sk_buff *msg, struct dpll_pin *pin,
 		      struct dpll_pin_ref *ref, struct netlink_ext_ack *extack)
@@ -440,6 +461,9 @@ dpll_cmd_pin_get_one(struct sk_buff *msg, struct dpll_pin *pin,
 			prop->phase_range.max))
 		return -EMSGSIZE;
 	ret = dpll_msg_add_pin_phase_adjust(msg, pin, ref, extack);
+	if (ret)
+		return ret;
+	ret = dpll_msg_add_ffo(msg, pin, ref, extack);
 	if (ret)
 		return ret;
 	if (xa_empty(&pin->parent_refs))
diff --git a/include/linux/dpll.h b/include/linux/dpll.h
index b1a5f9ca8ee5..9cf896ea1d41 100644
--- a/include/linux/dpll.h
+++ b/include/linux/dpll.h
@@ -77,6 +77,9 @@ struct dpll_pin_ops {
 				const struct dpll_device *dpll, void *dpll_priv,
 				const s32 phase_adjust,
 				struct netlink_ext_ack *extack);
+	int (*ffo_get)(const struct dpll_pin *pin, void *pin_priv,
+		       const struct dpll_device *dpll, void *dpll_priv,
+		       s64 *ffo, struct netlink_ext_ack *extack);
 };
 
 struct dpll_pin_frequency {
diff --git a/include/uapi/linux/dpll.h b/include/uapi/linux/dpll.h
index 715a491d2727..b4e947f9bfbc 100644
--- a/include/uapi/linux/dpll.h
+++ b/include/uapi/linux/dpll.h
@@ -179,6 +179,7 @@ enum dpll_a_pin {
 	DPLL_A_PIN_PHASE_ADJUST_MAX,
 	DPLL_A_PIN_PHASE_ADJUST,
 	DPLL_A_PIN_PHASE_OFFSET,
+	DPLL_A_PIN_FRACTIONAL_FREQUENCY_OFFSET,
 
 	__DPLL_A_PIN_MAX,
 	DPLL_A_PIN_MAX = (__DPLL_A_PIN_MAX - 1)
-- 
cgit v1.2.3


From 9c5938694cd0e9e00bdfb7e60900673263daf4d5 Mon Sep 17 00:00:00 2001
From: David Hildenbrand <david@redhat.com>
Date: Fri, 5 Jan 2024 16:57:29 +0100
Subject: mm/rmap: silence VM_WARN_ON_FOLIO() in __folio_rmap_sanity_checks()

Unfortunately, vm_insert_page() and friends and up passing
driver-allocated folios into folio_add_file_rmap_pte() using
insert_page_into_pte_locked().

While these driver-allocated folios can be compound pages (large folios),
they are not proper "rmappable" folios.

In these VM_MIXEDMAP VMAs, there isn't really the concept of a reverse
mapping, so long-term, we should clean that up and not call into rmap
code.

For the time being, document how we can end up in rmap code with large
folios that are not marked rmappable.

Link: https://lkml.kernel.org/r/793c5cee-d5fc-4eb1-86a2-39e05686233d@redhat.com
Fixes: 68f0320824fa ("mm/rmap: convert folio_add_file_rmap_range() into folio_add_file_rmap_[pte|ptes|pmd]()")
Reported-by: syzbot+50ef73537bbc393a25bb@syzkaller.appspotmail.com
Closes: https://lkml.kernel.org/r/000000000000014174060e09316e@google.com
Signed-off-by: David Hildenbrand <david@redhat.com>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Ryan Roberts <ryan.roberts@arm.com>
Cc: Yin Fengwei <fengwei.yin@intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/rmap.h | 11 +++++++++--
 1 file changed, 9 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/rmap.h b/include/linux/rmap.h
index fd6fe16fa358..b7944a833668 100644
--- a/include/linux/rmap.h
+++ b/include/linux/rmap.h
@@ -199,8 +199,15 @@ static inline void __folio_rmap_sanity_checks(struct folio *folio,
 {
 	/* hugetlb folios are handled separately. */
 	VM_WARN_ON_FOLIO(folio_test_hugetlb(folio), folio);
-	VM_WARN_ON_FOLIO(folio_test_large(folio) &&
-			 !folio_test_large_rmappable(folio), folio);
+
+	/*
+	 * TODO: we get driver-allocated folios that have nothing to do with
+	 * the rmap using vm_insert_page(); therefore, we cannot assume that
+	 * folio_test_large_rmappable() holds for large folios. We should
+	 * handle any desired mapcount+stats accounting for these folios in
+	 * VM_MIXEDMAP VMAs separately, and then sanity-check here that
+	 * we really only get rmappable folios.
+	 */
 
 	VM_WARN_ON_ONCE(nr_pages <= 0);
 	VM_WARN_ON_FOLIO(page_folio(page) != folio, folio);
-- 
cgit v1.2.3


From 71ce1ab54a505736786d9c5921e6c2718c7ec535 Mon Sep 17 00:00:00 2001
From: Kinsey Ho <kinseyho@google.com>
Date: Wed, 27 Dec 2023 14:12:01 +0000
Subject: mm/mglru: add CONFIG_ARCH_HAS_HW_PTE_YOUNG

Patch series "mm/mglru: Kconfig cleanup", v4.

This series is the result of the following discussion:
https://lore.kernel.org/47066176-bd93-55dd-c2fa-002299d9e034@linux.ibm.com/

It mainly avoids building the code that walks page tables on CPUs that
use it, i.e., those don't support hardware accessed bit. Specifically,
it introduces a new Kconfig to guard some of functions added by
commit bd74fdaea146 ("mm: multi-gen LRU: support page table walks")
on CPUs like POWER9, on which the series was tested.


This patch (of 5):

Some architectures are able to set the accessed bit in PTEs when PTEs
are used as part of linear address translations.

Add CONFIG_ARCH_HAS_HW_PTE_YOUNG for such architectures to be able to
override arch_has_hw_pte_young().

Link: https://lkml.kernel.org/r/20231227141205.2200125-1-kinseyho@google.com
Link: https://lkml.kernel.org/r/20231227141205.2200125-2-kinseyho@google.com
Signed-off-by: Kinsey Ho <kinseyho@google.com>
Co-developed-by: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
Tested-by: Donet Tom <donettom@linux.vnet.ibm.com>
Acked-by: Yu Zhao <yuzhao@google.com>
Cc: kernel test robot <lkp@intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 arch/Kconfig                   | 8 ++++++++
 arch/arm64/Kconfig             | 1 +
 arch/x86/Kconfig               | 1 +
 arch/x86/include/asm/pgtable.h | 6 ------
 include/linux/pgtable.h        | 2 +-
 5 files changed, 11 insertions(+), 7 deletions(-)

(limited to 'include/linux')

diff --git a/arch/Kconfig b/arch/Kconfig
index f4b210ab0612..8c8901f80586 100644
--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -1470,6 +1470,14 @@ config DYNAMIC_SIGFRAME
 config HAVE_ARCH_NODE_DEV_GROUP
 	bool
 
+config ARCH_HAS_HW_PTE_YOUNG
+	bool
+	help
+	  Architectures that select this option are capable of setting the
+	  accessed bit in PTE entries when using them as part of linear address
+	  translations. Architectures that require runtime check should select
+	  this option and override arch_has_hw_pte_young().
+
 config ARCH_HAS_NONLEAF_PMD_YOUNG
 	bool
 	help
diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
index 7b071a00425d..12d611f3da5d 100644
--- a/arch/arm64/Kconfig
+++ b/arch/arm64/Kconfig
@@ -36,6 +36,7 @@ config ARM64
 	select ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE
 	select ARCH_HAS_PTE_DEVMAP
 	select ARCH_HAS_PTE_SPECIAL
+	select ARCH_HAS_HW_PTE_YOUNG
 	select ARCH_HAS_SETUP_DMA_OPS
 	select ARCH_HAS_SET_DIRECT_MAP
 	select ARCH_HAS_SET_MEMORY
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 1566748f16c4..04941a1ffc0a 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -88,6 +88,7 @@ config X86
 	select ARCH_HAS_PMEM_API		if X86_64
 	select ARCH_HAS_PTE_DEVMAP		if X86_64
 	select ARCH_HAS_PTE_SPECIAL
+	select ARCH_HAS_HW_PTE_YOUNG
 	select ARCH_HAS_NONLEAF_PMD_YOUNG	if PGTABLE_LEVELS > 2
 	select ARCH_HAS_UACCESS_FLUSHCACHE	if X86_64
 	select ARCH_HAS_COPY_MC			if X86_64
diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h
index 57bab91bbf50..08b5cb22d9a6 100644
--- a/arch/x86/include/asm/pgtable.h
+++ b/arch/x86/include/asm/pgtable.h
@@ -1679,12 +1679,6 @@ static inline bool arch_has_pfn_modify_check(void)
 	return boot_cpu_has_bug(X86_BUG_L1TF);
 }
 
-#define arch_has_hw_pte_young arch_has_hw_pte_young
-static inline bool arch_has_hw_pte_young(void)
-{
-	return true;
-}
-
 #define arch_check_zapped_pte arch_check_zapped_pte
 void arch_check_zapped_pte(struct vm_area_struct *vma, pte_t pte);
 
diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h
index af7639c3b0a3..9ecc20fa6269 100644
--- a/include/linux/pgtable.h
+++ b/include/linux/pgtable.h
@@ -375,7 +375,7 @@ static inline bool arch_has_hw_nonleaf_pmd_young(void)
  */
 static inline bool arch_has_hw_pte_young(void)
 {
-	return false;
+	return IS_ENABLED(CONFIG_ARCH_HAS_HW_PTE_YOUNG);
 }
 #endif
 
-- 
cgit v1.2.3


From 61dd3f246b3adaabff3241c586f2210ac91b05a4 Mon Sep 17 00:00:00 2001
From: Kinsey Ho <kinseyho@google.com>
Date: Wed, 27 Dec 2023 14:12:02 +0000
Subject: mm/mglru: add CONFIG_LRU_GEN_WALKS_MMU

Add CONFIG_LRU_GEN_WALKS_MMU such that if disabled, the code that
walks page tables to promote pages into the youngest generation will
not be built.

Also improves code readability by adding two helper functions
get_mm_state() and get_next_mm().

Link: https://lkml.kernel.org/r/20231227141205.2200125-3-kinseyho@google.com
Signed-off-by: Kinsey Ho <kinseyho@google.com>
Co-developed-by: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
Tested-by: Donet Tom <donettom@linux.vnet.ibm.com>
Acked-by: Yu Zhao <yuzhao@google.com>
Cc: kernel test robot <lkp@intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/memcontrol.h |   2 +-
 include/linux/mm_types.h   |  12 ++-
 include/linux/mmzone.h     |   2 +
 kernel/fork.c              |   2 +-
 mm/Kconfig                 |   4 +
 mm/vmscan.c                | 192 +++++++++++++++++++++++++++++----------------
 6 files changed, 139 insertions(+), 75 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 5de775e6cdd9..20ff87f8e001 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -330,7 +330,7 @@ struct mem_cgroup {
 	struct deferred_split deferred_split_queue;
 #endif
 
-#ifdef CONFIG_LRU_GEN
+#ifdef CONFIG_LRU_GEN_WALKS_MMU
 	/* per-memcg mm_struct list */
 	struct lru_gen_mm_list mm_list;
 #endif
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index a66534c78c4d..552fa2d11c57 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -958,7 +958,7 @@ struct mm_struct {
 		 */
 		unsigned long ksm_zero_pages;
 #endif /* CONFIG_KSM */
-#ifdef CONFIG_LRU_GEN
+#ifdef CONFIG_LRU_GEN_WALKS_MMU
 		struct {
 			/* this mm_struct is on lru_gen_mm_list */
 			struct list_head list;
@@ -973,7 +973,7 @@ struct mm_struct {
 			struct mem_cgroup *memcg;
 #endif
 		} lru_gen;
-#endif /* CONFIG_LRU_GEN */
+#endif /* CONFIG_LRU_GEN_WALKS_MMU */
 	} __randomize_layout;
 
 	/*
@@ -1011,6 +1011,10 @@ struct lru_gen_mm_list {
 	spinlock_t lock;
 };
 
+#endif /* CONFIG_LRU_GEN */
+
+#ifdef CONFIG_LRU_GEN_WALKS_MMU
+
 void lru_gen_add_mm(struct mm_struct *mm);
 void lru_gen_del_mm(struct mm_struct *mm);
 #ifdef CONFIG_MEMCG
@@ -1036,7 +1040,7 @@ static inline void lru_gen_use_mm(struct mm_struct *mm)
 	WRITE_ONCE(mm->lru_gen.bitmap, -1);
 }
 
-#else /* !CONFIG_LRU_GEN */
+#else /* !CONFIG_LRU_GEN_WALKS_MMU */
 
 static inline void lru_gen_add_mm(struct mm_struct *mm)
 {
@@ -1060,7 +1064,7 @@ static inline void lru_gen_use_mm(struct mm_struct *mm)
 {
 }
 
-#endif /* CONFIG_LRU_GEN */
+#endif /* CONFIG_LRU_GEN_WALKS_MMU */
 
 struct vma_iterator {
 	struct ma_state mas;
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 2efd3be484fd..bc3f63ec4291 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -640,9 +640,11 @@ struct lruvec {
 #ifdef CONFIG_LRU_GEN
 	/* evictable pages divided into generations */
 	struct lru_gen_folio		lrugen;
+#ifdef CONFIG_LRU_GEN_WALKS_MMU
 	/* to concurrently iterate lru_gen_mm_list */
 	struct lru_gen_mm_state		mm_state;
 #endif
+#endif /* CONFIG_LRU_GEN */
 #ifdef CONFIG_MEMCG
 	struct pglist_data *pgdat;
 #endif
diff --git a/kernel/fork.c b/kernel/fork.c
index 93924392a5c3..56cf276432c8 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -2946,7 +2946,7 @@ pid_t kernel_clone(struct kernel_clone_args *args)
 		get_task_struct(p);
 	}
 
-	if (IS_ENABLED(CONFIG_LRU_GEN) && !(clone_flags & CLONE_VM)) {
+	if (IS_ENABLED(CONFIG_LRU_GEN_WALKS_MMU) && !(clone_flags & CLONE_VM)) {
 		/* lock the task to synchronize with memcg migration */
 		task_lock(p);
 		lru_gen_add_mm(p->mm);
diff --git a/mm/Kconfig b/mm/Kconfig
index b072664b889a..79d563d8f9e0 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -1274,6 +1274,10 @@ config LRU_GEN_STATS
 	  from evicted generations for debugging purpose.
 
 	  This option has a per-memcg and per-node memory overhead.
+
+config LRU_GEN_WALKS_MMU
+	def_bool y
+	depends on LRU_GEN && ARCH_HAS_HW_PTE_YOUNG
 # }
 
 config ARCH_SUPPORTS_PER_VMA_LOCK
diff --git a/mm/vmscan.c b/mm/vmscan.c
index b4ca3563bcf4..aa7ea09ffb4c 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -2671,13 +2671,14 @@ static void get_item_key(void *item, int *key)
 	key[1] = hash >> BLOOM_FILTER_SHIFT;
 }
 
-static bool test_bloom_filter(struct lruvec *lruvec, unsigned long seq, void *item)
+static bool test_bloom_filter(struct lru_gen_mm_state *mm_state, unsigned long seq,
+			      void *item)
 {
 	int key[2];
 	unsigned long *filter;
 	int gen = filter_gen_from_seq(seq);
 
-	filter = READ_ONCE(lruvec->mm_state.filters[gen]);
+	filter = READ_ONCE(mm_state->filters[gen]);
 	if (!filter)
 		return true;
 
@@ -2686,13 +2687,14 @@ static bool test_bloom_filter(struct lruvec *lruvec, unsigned long seq, void *it
 	return test_bit(key[0], filter) && test_bit(key[1], filter);
 }
 
-static void update_bloom_filter(struct lruvec *lruvec, unsigned long seq, void *item)
+static void update_bloom_filter(struct lru_gen_mm_state *mm_state, unsigned long seq,
+				void *item)
 {
 	int key[2];
 	unsigned long *filter;
 	int gen = filter_gen_from_seq(seq);
 
-	filter = READ_ONCE(lruvec->mm_state.filters[gen]);
+	filter = READ_ONCE(mm_state->filters[gen]);
 	if (!filter)
 		return;
 
@@ -2704,12 +2706,12 @@ static void update_bloom_filter(struct lruvec *lruvec, unsigned long seq, void *
 		set_bit(key[1], filter);
 }
 
-static void reset_bloom_filter(struct lruvec *lruvec, unsigned long seq)
+static void reset_bloom_filter(struct lru_gen_mm_state *mm_state, unsigned long seq)
 {
 	unsigned long *filter;
 	int gen = filter_gen_from_seq(seq);
 
-	filter = lruvec->mm_state.filters[gen];
+	filter = mm_state->filters[gen];
 	if (filter) {
 		bitmap_clear(filter, 0, BIT(BLOOM_FILTER_SHIFT));
 		return;
@@ -2717,13 +2719,15 @@ static void reset_bloom_filter(struct lruvec *lruvec, unsigned long seq)
 
 	filter = bitmap_zalloc(BIT(BLOOM_FILTER_SHIFT),
 			       __GFP_HIGH | __GFP_NOMEMALLOC | __GFP_NOWARN);
-	WRITE_ONCE(lruvec->mm_state.filters[gen], filter);
+	WRITE_ONCE(mm_state->filters[gen], filter);
 }
 
 /******************************************************************************
  *                          mm_struct list
  ******************************************************************************/
 
+#ifdef CONFIG_LRU_GEN_WALKS_MMU
+
 static struct lru_gen_mm_list *get_mm_list(struct mem_cgroup *memcg)
 {
 	static struct lru_gen_mm_list mm_list = {
@@ -2740,6 +2744,29 @@ static struct lru_gen_mm_list *get_mm_list(struct mem_cgroup *memcg)
 	return &mm_list;
 }
 
+static struct lru_gen_mm_state *get_mm_state(struct lruvec *lruvec)
+{
+	return &lruvec->mm_state;
+}
+
+static struct mm_struct *get_next_mm(struct lru_gen_mm_walk *walk)
+{
+	int key;
+	struct mm_struct *mm;
+	struct pglist_data *pgdat = lruvec_pgdat(walk->lruvec);
+	struct lru_gen_mm_state *mm_state = get_mm_state(walk->lruvec);
+
+	mm = list_entry(mm_state->head, struct mm_struct, lru_gen.list);
+	key = pgdat->node_id % BITS_PER_TYPE(mm->lru_gen.bitmap);
+
+	if (!walk->force_scan && !test_bit(key, &mm->lru_gen.bitmap))
+		return NULL;
+
+	clear_bit(key, &mm->lru_gen.bitmap);
+
+	return mmget_not_zero(mm) ? mm : NULL;
+}
+
 void lru_gen_add_mm(struct mm_struct *mm)
 {
 	int nid;
@@ -2755,10 +2782,11 @@ void lru_gen_add_mm(struct mm_struct *mm)
 
 	for_each_node_state(nid, N_MEMORY) {
 		struct lruvec *lruvec = get_lruvec(memcg, nid);
+		struct lru_gen_mm_state *mm_state = get_mm_state(lruvec);
 
 		/* the first addition since the last iteration */
-		if (lruvec->mm_state.tail == &mm_list->fifo)
-			lruvec->mm_state.tail = &mm->lru_gen.list;
+		if (mm_state->tail == &mm_list->fifo)
+			mm_state->tail = &mm->lru_gen.list;
 	}
 
 	list_add_tail(&mm->lru_gen.list, &mm_list->fifo);
@@ -2784,14 +2812,15 @@ void lru_gen_del_mm(struct mm_struct *mm)
 
 	for_each_node(nid) {
 		struct lruvec *lruvec = get_lruvec(memcg, nid);
+		struct lru_gen_mm_state *mm_state = get_mm_state(lruvec);
 
 		/* where the current iteration continues after */
-		if (lruvec->mm_state.head == &mm->lru_gen.list)
-			lruvec->mm_state.head = lruvec->mm_state.head->prev;
+		if (mm_state->head == &mm->lru_gen.list)
+			mm_state->head = mm_state->head->prev;
 
 		/* where the last iteration ended before */
-		if (lruvec->mm_state.tail == &mm->lru_gen.list)
-			lruvec->mm_state.tail = lruvec->mm_state.tail->next;
+		if (mm_state->tail == &mm->lru_gen.list)
+			mm_state->tail = mm_state->tail->next;
 	}
 
 	list_del_init(&mm->lru_gen.list);
@@ -2834,10 +2863,30 @@ void lru_gen_migrate_mm(struct mm_struct *mm)
 }
 #endif
 
+#else /* !CONFIG_LRU_GEN_WALKS_MMU */
+
+static struct lru_gen_mm_list *get_mm_list(struct mem_cgroup *memcg)
+{
+	return NULL;
+}
+
+static struct lru_gen_mm_state *get_mm_state(struct lruvec *lruvec)
+{
+	return NULL;
+}
+
+static struct mm_struct *get_next_mm(struct lru_gen_mm_walk *walk)
+{
+	return NULL;
+}
+
+#endif
+
 static void reset_mm_stats(struct lruvec *lruvec, struct lru_gen_mm_walk *walk, bool last)
 {
 	int i;
 	int hist;
+	struct lru_gen_mm_state *mm_state = get_mm_state(lruvec);
 
 	lockdep_assert_held(&get_mm_list(lruvec_memcg(lruvec))->lock);
 
@@ -2845,44 +2894,20 @@ static void reset_mm_stats(struct lruvec *lruvec, struct lru_gen_mm_walk *walk,
 		hist = lru_hist_from_seq(walk->max_seq);
 
 		for (i = 0; i < NR_MM_STATS; i++) {
-			WRITE_ONCE(lruvec->mm_state.stats[hist][i],
-				   lruvec->mm_state.stats[hist][i] + walk->mm_stats[i]);
+			WRITE_ONCE(mm_state->stats[hist][i],
+				   mm_state->stats[hist][i] + walk->mm_stats[i]);
 			walk->mm_stats[i] = 0;
 		}
 	}
 
 	if (NR_HIST_GENS > 1 && last) {
-		hist = lru_hist_from_seq(lruvec->mm_state.seq + 1);
+		hist = lru_hist_from_seq(mm_state->seq + 1);
 
 		for (i = 0; i < NR_MM_STATS; i++)
-			WRITE_ONCE(lruvec->mm_state.stats[hist][i], 0);
+			WRITE_ONCE(mm_state->stats[hist][i], 0);
 	}
 }
 
-static bool should_skip_mm(struct mm_struct *mm, struct lru_gen_mm_walk *walk)
-{
-	int type;
-	unsigned long size = 0;
-	struct pglist_data *pgdat = lruvec_pgdat(walk->lruvec);
-	int key = pgdat->node_id % BITS_PER_TYPE(mm->lru_gen.bitmap);
-
-	if (!walk->force_scan && !test_bit(key, &mm->lru_gen.bitmap))
-		return true;
-
-	clear_bit(key, &mm->lru_gen.bitmap);
-
-	for (type = !walk->can_swap; type < ANON_AND_FILE; type++) {
-		size += type ? get_mm_counter(mm, MM_FILEPAGES) :
-			       get_mm_counter(mm, MM_ANONPAGES) +
-			       get_mm_counter(mm, MM_SHMEMPAGES);
-	}
-
-	if (size < MIN_LRU_BATCH)
-		return true;
-
-	return !mmget_not_zero(mm);
-}
-
 static bool iterate_mm_list(struct lruvec *lruvec, struct lru_gen_mm_walk *walk,
 			    struct mm_struct **iter)
 {
@@ -2891,7 +2916,7 @@ static bool iterate_mm_list(struct lruvec *lruvec, struct lru_gen_mm_walk *walk,
 	struct mm_struct *mm = NULL;
 	struct mem_cgroup *memcg = lruvec_memcg(lruvec);
 	struct lru_gen_mm_list *mm_list = get_mm_list(memcg);
-	struct lru_gen_mm_state *mm_state = &lruvec->mm_state;
+	struct lru_gen_mm_state *mm_state = get_mm_state(lruvec);
 
 	/*
 	 * mm_state->seq is incremented after each iteration of mm_list. There
@@ -2929,11 +2954,7 @@ static bool iterate_mm_list(struct lruvec *lruvec, struct lru_gen_mm_walk *walk,
 			mm_state->tail = mm_state->head->next;
 			walk->force_scan = true;
 		}
-
-		mm = list_entry(mm_state->head, struct mm_struct, lru_gen.list);
-		if (should_skip_mm(mm, walk))
-			mm = NULL;
-	} while (!mm);
+	} while (!(mm = get_next_mm(walk)));
 done:
 	if (*iter || last)
 		reset_mm_stats(lruvec, walk, last);
@@ -2941,7 +2962,7 @@ done:
 	spin_unlock(&mm_list->lock);
 
 	if (mm && first)
-		reset_bloom_filter(lruvec, walk->max_seq + 1);
+		reset_bloom_filter(mm_state, walk->max_seq + 1);
 
 	if (*iter)
 		mmput_async(*iter);
@@ -2956,7 +2977,7 @@ static bool iterate_mm_list_nowalk(struct lruvec *lruvec, unsigned long max_seq)
 	bool success = false;
 	struct mem_cgroup *memcg = lruvec_memcg(lruvec);
 	struct lru_gen_mm_list *mm_list = get_mm_list(memcg);
-	struct lru_gen_mm_state *mm_state = &lruvec->mm_state;
+	struct lru_gen_mm_state *mm_state = get_mm_state(lruvec);
 
 	spin_lock(&mm_list->lock);
 
@@ -3469,6 +3490,7 @@ static void walk_pmd_range(pud_t *pud, unsigned long start, unsigned long end,
 	DECLARE_BITMAP(bitmap, MIN_LRU_BATCH);
 	unsigned long first = -1;
 	struct lru_gen_mm_walk *walk = args->private;
+	struct lru_gen_mm_state *mm_state = get_mm_state(walk->lruvec);
 
 	VM_WARN_ON_ONCE(pud_leaf(*pud));
 
@@ -3520,7 +3542,7 @@ restart:
 			walk_pmd_range_locked(pud, addr, vma, args, bitmap, &first);
 		}
 
-		if (!walk->force_scan && !test_bloom_filter(walk->lruvec, walk->max_seq, pmd + i))
+		if (!walk->force_scan && !test_bloom_filter(mm_state, walk->max_seq, pmd + i))
 			continue;
 
 		walk->mm_stats[MM_NONLEAF_FOUND]++;
@@ -3531,7 +3553,7 @@ restart:
 		walk->mm_stats[MM_NONLEAF_ADDED]++;
 
 		/* carry over to the next generation */
-		update_bloom_filter(walk->lruvec, walk->max_seq + 1, pmd + i);
+		update_bloom_filter(mm_state, walk->max_seq + 1, pmd + i);
 	}
 
 	walk_pmd_range_locked(pud, -1, vma, args, bitmap, &first);
@@ -3738,16 +3760,25 @@ next:
 	return success;
 }
 
-static void inc_max_seq(struct lruvec *lruvec, bool can_swap, bool force_scan)
+static bool inc_max_seq(struct lruvec *lruvec, unsigned long max_seq,
+			bool can_swap, bool force_scan)
 {
+	bool success;
 	int prev, next;
 	int type, zone;
 	struct lru_gen_folio *lrugen = &lruvec->lrugen;
 restart:
+	if (max_seq < READ_ONCE(lrugen->max_seq))
+		return false;
+
 	spin_lock_irq(&lruvec->lru_lock);
 
 	VM_WARN_ON_ONCE(!seq_is_valid(lruvec));
 
+	success = max_seq == lrugen->max_seq;
+	if (!success)
+		goto unlock;
+
 	for (type = ANON_AND_FILE - 1; type >= 0; type--) {
 		if (get_nr_gens(lruvec, type) != MAX_NR_GENS)
 			continue;
@@ -3791,8 +3822,10 @@ restart:
 	WRITE_ONCE(lrugen->timestamps[next], jiffies);
 	/* make sure preceding modifications appear */
 	smp_store_release(&lrugen->max_seq, lrugen->max_seq + 1);
-
+unlock:
 	spin_unlock_irq(&lruvec->lru_lock);
+
+	return success;
 }
 
 static bool try_to_inc_max_seq(struct lruvec *lruvec, unsigned long max_seq,
@@ -3802,14 +3835,16 @@ static bool try_to_inc_max_seq(struct lruvec *lruvec, unsigned long max_seq,
 	struct lru_gen_mm_walk *walk;
 	struct mm_struct *mm = NULL;
 	struct lru_gen_folio *lrugen = &lruvec->lrugen;
+	struct lru_gen_mm_state *mm_state = get_mm_state(lruvec);
 
 	VM_WARN_ON_ONCE(max_seq > READ_ONCE(lrugen->max_seq));
 
+	if (!mm_state)
+		return inc_max_seq(lruvec, max_seq, can_swap, force_scan);
+
 	/* see the comment in iterate_mm_list() */
-	if (max_seq <= READ_ONCE(lruvec->mm_state.seq)) {
-		success = false;
-		goto done;
-	}
+	if (max_seq <= READ_ONCE(mm_state->seq))
+		return false;
 
 	/*
 	 * If the hardware doesn't automatically set the accessed bit, fallback
@@ -3839,8 +3874,10 @@ static bool try_to_inc_max_seq(struct lruvec *lruvec, unsigned long max_seq,
 			walk_mm(lruvec, mm, walk);
 	} while (mm);
 done:
-	if (success)
-		inc_max_seq(lruvec, can_swap, force_scan);
+	if (success) {
+		success = inc_max_seq(lruvec, max_seq, can_swap, force_scan);
+		WARN_ON_ONCE(!success);
+	}
 
 	return success;
 }
@@ -3964,6 +4001,7 @@ void lru_gen_look_around(struct page_vma_mapped_walk *pvmw)
 	struct mem_cgroup *memcg = folio_memcg(folio);
 	struct pglist_data *pgdat = folio_pgdat(folio);
 	struct lruvec *lruvec = mem_cgroup_lruvec(memcg, pgdat);
+	struct lru_gen_mm_state *mm_state = get_mm_state(lruvec);
 	DEFINE_MAX_SEQ(lruvec);
 	int old_gen, new_gen = lru_gen_from_seq(max_seq);
 
@@ -4042,8 +4080,8 @@ void lru_gen_look_around(struct page_vma_mapped_walk *pvmw)
 	mem_cgroup_unlock_pages();
 
 	/* feedback from rmap walkers to page table walkers */
-	if (suitable_to_scan(i, young))
-		update_bloom_filter(lruvec, max_seq, pvmw->pmd);
+	if (mm_state && suitable_to_scan(i, young))
+		update_bloom_filter(mm_state, max_seq, pvmw->pmd);
 }
 
 /******************************************************************************
@@ -5219,6 +5257,7 @@ static void lru_gen_seq_show_full(struct seq_file *m, struct lruvec *lruvec,
 	int type, tier;
 	int hist = lru_hist_from_seq(seq);
 	struct lru_gen_folio *lrugen = &lruvec->lrugen;
+	struct lru_gen_mm_state *mm_state = get_mm_state(lruvec);
 
 	for (tier = 0; tier < MAX_NR_TIERS; tier++) {
 		seq_printf(m, "            %10d", tier);
@@ -5244,6 +5283,9 @@ static void lru_gen_seq_show_full(struct seq_file *m, struct lruvec *lruvec,
 		seq_putc(m, '\n');
 	}
 
+	if (!mm_state)
+		return;
+
 	seq_puts(m, "                      ");
 	for (i = 0; i < NR_MM_STATS; i++) {
 		const char *s = "      ";
@@ -5251,10 +5293,10 @@ static void lru_gen_seq_show_full(struct seq_file *m, struct lruvec *lruvec,
 
 		if (seq == max_seq && NR_HIST_GENS == 1) {
 			s = "LOYNFA";
-			n = READ_ONCE(lruvec->mm_state.stats[hist][i]);
+			n = READ_ONCE(mm_state->stats[hist][i]);
 		} else if (seq != max_seq && NR_HIST_GENS > 1) {
 			s = "loynfa";
-			n = READ_ONCE(lruvec->mm_state.stats[hist][i]);
+			n = READ_ONCE(mm_state->stats[hist][i]);
 		}
 
 		seq_printf(m, " %10lu%c", n, s[i]);
@@ -5523,6 +5565,7 @@ void lru_gen_init_lruvec(struct lruvec *lruvec)
 	int i;
 	int gen, type, zone;
 	struct lru_gen_folio *lrugen = &lruvec->lrugen;
+	struct lru_gen_mm_state *mm_state = get_mm_state(lruvec);
 
 	lrugen->max_seq = MIN_NR_GENS + 1;
 	lrugen->enabled = lru_gen_enabled();
@@ -5533,7 +5576,8 @@ void lru_gen_init_lruvec(struct lruvec *lruvec)
 	for_each_gen_type_zone(gen, type, zone)
 		INIT_LIST_HEAD(&lrugen->folios[gen][type][zone]);
 
-	lruvec->mm_state.seq = MIN_NR_GENS;
+	if (mm_state)
+		mm_state->seq = MIN_NR_GENS;
 }
 
 #ifdef CONFIG_MEMCG
@@ -5552,28 +5596,38 @@ void lru_gen_init_pgdat(struct pglist_data *pgdat)
 
 void lru_gen_init_memcg(struct mem_cgroup *memcg)
 {
-	INIT_LIST_HEAD(&memcg->mm_list.fifo);
-	spin_lock_init(&memcg->mm_list.lock);
+	struct lru_gen_mm_list *mm_list = get_mm_list(memcg);
+
+	if (!mm_list)
+		return;
+
+	INIT_LIST_HEAD(&mm_list->fifo);
+	spin_lock_init(&mm_list->lock);
 }
 
 void lru_gen_exit_memcg(struct mem_cgroup *memcg)
 {
 	int i;
 	int nid;
+	struct lru_gen_mm_list *mm_list = get_mm_list(memcg);
 
-	VM_WARN_ON_ONCE(!list_empty(&memcg->mm_list.fifo));
+	VM_WARN_ON_ONCE(mm_list && !list_empty(&mm_list->fifo));
 
 	for_each_node(nid) {
 		struct lruvec *lruvec = get_lruvec(memcg, nid);
+		struct lru_gen_mm_state *mm_state = get_mm_state(lruvec);
 
 		VM_WARN_ON_ONCE(memchr_inv(lruvec->lrugen.nr_pages, 0,
 					   sizeof(lruvec->lrugen.nr_pages)));
 
 		lruvec->lrugen.list.next = LIST_POISON1;
 
+		if (!mm_state)
+			continue;
+
 		for (i = 0; i < NR_BLOOM_FILTERS; i++) {
-			bitmap_free(lruvec->mm_state.filters[i]);
-			lruvec->mm_state.filters[i] = NULL;
+			bitmap_free(mm_state->filters[i]);
+			mm_state->filters[i] = NULL;
 		}
 	}
 }
-- 
cgit v1.2.3


From 745b13e647cd119e70d16b57698e12b7c86ca264 Mon Sep 17 00:00:00 2001
From: Kinsey Ho <kinseyho@google.com>
Date: Wed, 27 Dec 2023 14:12:03 +0000
Subject: mm/mglru: remove CONFIG_MEMCG

Remove CONFIG_MEMCG in a refactoring to improve code readability at
the cost of a few bytes in struct lru_gen_folio per node when
CONFIG_MEMCG=n.

Link: https://lkml.kernel.org/r/20231227141205.2200125-4-kinseyho@google.com
Signed-off-by: Kinsey Ho <kinseyho@google.com>
Co-developed-by: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
Tested-by: Donet Tom <donettom@linux.vnet.ibm.com>
Acked-by: Yu Zhao <yuzhao@google.com>
Cc: kernel test robot <lkp@intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/mm_types.h |  4 ---
 include/linux/mmzone.h   | 26 ++-----------------
 mm/vmscan.c              | 67 +++++++++++++++---------------------------------
 3 files changed, 23 insertions(+), 74 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 552fa2d11c57..55b7121809ff 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -1017,9 +1017,7 @@ struct lru_gen_mm_list {
 
 void lru_gen_add_mm(struct mm_struct *mm);
 void lru_gen_del_mm(struct mm_struct *mm);
-#ifdef CONFIG_MEMCG
 void lru_gen_migrate_mm(struct mm_struct *mm);
-#endif
 
 static inline void lru_gen_init_mm(struct mm_struct *mm)
 {
@@ -1050,11 +1048,9 @@ static inline void lru_gen_del_mm(struct mm_struct *mm)
 {
 }
 
-#ifdef CONFIG_MEMCG
 static inline void lru_gen_migrate_mm(struct mm_struct *mm)
 {
 }
-#endif
 
 static inline void lru_gen_init_mm(struct mm_struct *mm)
 {
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index bc3f63ec4291..28665e1b8475 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -440,14 +440,12 @@ struct lru_gen_folio {
 	atomic_long_t refaulted[NR_HIST_GENS][ANON_AND_FILE][MAX_NR_TIERS];
 	/* whether the multi-gen LRU is enabled */
 	bool enabled;
-#ifdef CONFIG_MEMCG
 	/* the memcg generation this lru_gen_folio belongs to */
 	u8 gen;
 	/* the list segment this lru_gen_folio belongs to */
 	u8 seg;
 	/* per-node lru_gen_folio list for global reclaim */
 	struct hlist_nulls_node list;
-#endif
 };
 
 enum {
@@ -493,11 +491,6 @@ struct lru_gen_mm_walk {
 	bool force_scan;
 };
 
-void lru_gen_init_lruvec(struct lruvec *lruvec);
-void lru_gen_look_around(struct page_vma_mapped_walk *pvmw);
-
-#ifdef CONFIG_MEMCG
-
 /*
  * For each node, memcgs are divided into two generations: the old and the
  * young. For each generation, memcgs are randomly sharded into multiple bins
@@ -555,6 +548,8 @@ struct lru_gen_memcg {
 };
 
 void lru_gen_init_pgdat(struct pglist_data *pgdat);
+void lru_gen_init_lruvec(struct lruvec *lruvec);
+void lru_gen_look_around(struct page_vma_mapped_walk *pvmw);
 
 void lru_gen_init_memcg(struct mem_cgroup *memcg);
 void lru_gen_exit_memcg(struct mem_cgroup *memcg);
@@ -563,19 +558,6 @@ void lru_gen_offline_memcg(struct mem_cgroup *memcg);
 void lru_gen_release_memcg(struct mem_cgroup *memcg);
 void lru_gen_soft_reclaim(struct mem_cgroup *memcg, int nid);
 
-#else /* !CONFIG_MEMCG */
-
-#define MEMCG_NR_GENS	1
-
-struct lru_gen_memcg {
-};
-
-static inline void lru_gen_init_pgdat(struct pglist_data *pgdat)
-{
-}
-
-#endif /* CONFIG_MEMCG */
-
 #else /* !CONFIG_LRU_GEN */
 
 static inline void lru_gen_init_pgdat(struct pglist_data *pgdat)
@@ -590,8 +572,6 @@ static inline void lru_gen_look_around(struct page_vma_mapped_walk *pvmw)
 {
 }
 
-#ifdef CONFIG_MEMCG
-
 static inline void lru_gen_init_memcg(struct mem_cgroup *memcg)
 {
 }
@@ -616,8 +596,6 @@ static inline void lru_gen_soft_reclaim(struct mem_cgroup *memcg, int nid)
 {
 }
 
-#endif /* CONFIG_MEMCG */
-
 #endif /* CONFIG_LRU_GEN */
 
 struct lruvec {
diff --git a/mm/vmscan.c b/mm/vmscan.c
index aa7ea09ffb4c..351a0b5043c0 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -4097,13 +4097,6 @@ enum {
 	MEMCG_LRU_YOUNG,
 };
 
-#ifdef CONFIG_MEMCG
-
-static int lru_gen_memcg_seg(struct lruvec *lruvec)
-{
-	return READ_ONCE(lruvec->lrugen.seg);
-}
-
 static void lru_gen_rotate_memcg(struct lruvec *lruvec, int op)
 {
 	int seg;
@@ -4150,6 +4143,8 @@ static void lru_gen_rotate_memcg(struct lruvec *lruvec, int op)
 	spin_unlock_irqrestore(&pgdat->memcg_lru.lock, flags);
 }
 
+#ifdef CONFIG_MEMCG
+
 void lru_gen_online_memcg(struct mem_cgroup *memcg)
 {
 	int gen;
@@ -4217,18 +4212,11 @@ void lru_gen_soft_reclaim(struct mem_cgroup *memcg, int nid)
 	struct lruvec *lruvec = get_lruvec(memcg, nid);
 
 	/* see the comment on MEMCG_NR_GENS */
-	if (lru_gen_memcg_seg(lruvec) != MEMCG_LRU_HEAD)
+	if (READ_ONCE(lruvec->lrugen.seg) != MEMCG_LRU_HEAD)
 		lru_gen_rotate_memcg(lruvec, MEMCG_LRU_HEAD);
 }
 
-#else /* !CONFIG_MEMCG */
-
-static int lru_gen_memcg_seg(struct lruvec *lruvec)
-{
-	return 0;
-}
-
-#endif
+#endif /* CONFIG_MEMCG */
 
 /******************************************************************************
  *                          the eviction
@@ -4776,7 +4764,7 @@ static int shrink_one(struct lruvec *lruvec, struct scan_control *sc)
 
 	if (mem_cgroup_below_low(NULL, memcg)) {
 		/* see the comment on MEMCG_NR_GENS */
-		if (lru_gen_memcg_seg(lruvec) != MEMCG_LRU_TAIL)
+		if (READ_ONCE(lruvec->lrugen.seg) != MEMCG_LRU_TAIL)
 			return MEMCG_LRU_TAIL;
 
 		memcg_memory_event(memcg, MEMCG_LOW);
@@ -4799,12 +4787,10 @@ static int shrink_one(struct lruvec *lruvec, struct scan_control *sc)
 		return 0;
 
 	/* one retry if offlined or too small */
-	return lru_gen_memcg_seg(lruvec) != MEMCG_LRU_TAIL ?
+	return READ_ONCE(lruvec->lrugen.seg) != MEMCG_LRU_TAIL ?
 	       MEMCG_LRU_TAIL : MEMCG_LRU_YOUNG;
 }
 
-#ifdef CONFIG_MEMCG
-
 static void shrink_many(struct pglist_data *pgdat, struct scan_control *sc)
 {
 	int op;
@@ -4896,20 +4882,6 @@ static void lru_gen_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc
 	blk_finish_plug(&plug);
 }
 
-#else /* !CONFIG_MEMCG */
-
-static void shrink_many(struct pglist_data *pgdat, struct scan_control *sc)
-{
-	BUILD_BUG();
-}
-
-static void lru_gen_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
-{
-	BUILD_BUG();
-}
-
-#endif
-
 static void set_initial_priority(struct pglist_data *pgdat, struct scan_control *sc)
 {
 	int priority;
@@ -5560,6 +5532,18 @@ static const struct file_operations lru_gen_ro_fops = {
  *                          initialization
  ******************************************************************************/
 
+void lru_gen_init_pgdat(struct pglist_data *pgdat)
+{
+	int i, j;
+
+	spin_lock_init(&pgdat->memcg_lru.lock);
+
+	for (i = 0; i < MEMCG_NR_GENS; i++) {
+		for (j = 0; j < MEMCG_NR_BINS; j++)
+			INIT_HLIST_NULLS_HEAD(&pgdat->memcg_lru.fifo[i][j], i);
+	}
+}
+
 void lru_gen_init_lruvec(struct lruvec *lruvec)
 {
 	int i;
@@ -5582,18 +5566,6 @@ void lru_gen_init_lruvec(struct lruvec *lruvec)
 
 #ifdef CONFIG_MEMCG
 
-void lru_gen_init_pgdat(struct pglist_data *pgdat)
-{
-	int i, j;
-
-	spin_lock_init(&pgdat->memcg_lru.lock);
-
-	for (i = 0; i < MEMCG_NR_GENS; i++) {
-		for (j = 0; j < MEMCG_NR_BINS; j++)
-			INIT_HLIST_NULLS_HEAD(&pgdat->memcg_lru.fifo[i][j], i);
-	}
-}
-
 void lru_gen_init_memcg(struct mem_cgroup *memcg)
 {
 	struct lru_gen_mm_list *mm_list = get_mm_list(memcg);
@@ -5653,14 +5625,17 @@ late_initcall(init_lru_gen);
 
 static void lru_gen_age_node(struct pglist_data *pgdat, struct scan_control *sc)
 {
+	BUILD_BUG();
 }
 
 static void lru_gen_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
 {
+	BUILD_BUG();
 }
 
 static void lru_gen_shrink_node(struct pglist_data *pgdat, struct scan_control *sc)
 {
+	BUILD_BUG();
 }
 
 #endif /* CONFIG_LRU_GEN */
-- 
cgit v1.2.3


From 533c67e6358406727145efae32882c4dc355d6c5 Mon Sep 17 00:00:00 2001
From: Kinsey Ho <kinseyho@google.com>
Date: Wed, 27 Dec 2023 14:12:04 +0000
Subject: mm/mglru: add dummy pmd_dirty()

Add dummy pmd_dirty() for architectures that don't provide it.
This is similar to commit 6617da8fb565 ("mm: add dummy pmd_young()
for architectures not having it").

Link: https://lkml.kernel.org/r/20231227141205.2200125-5-kinseyho@google.com
Reported-by: kernel test robot <lkp@intel.com>
Closes: https://lore.kernel.org/oe-kbuild-all/202312210606.1Etqz3M4-lkp@intel.com/
Closes: https://lore.kernel.org/oe-kbuild-all/202312210042.xQEiqlEh-lkp@intel.com/
Signed-off-by: Kinsey Ho <kinseyho@google.com>
Suggested-by: Yu Zhao <yuzhao@google.com>
Cc: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
Cc: Donet Tom <donettom@linux.vnet.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 arch/loongarch/include/asm/pgtable.h | 1 +
 arch/mips/include/asm/pgtable.h      | 1 +
 arch/riscv/include/asm/pgtable.h     | 1 +
 arch/s390/include/asm/pgtable.h      | 1 +
 arch/sparc/include/asm/pgtable_64.h  | 1 +
 arch/x86/include/asm/pgtable.h       | 1 +
 include/linux/pgtable.h              | 7 +++++++
 7 files changed, 13 insertions(+)

(limited to 'include/linux')

diff --git a/arch/loongarch/include/asm/pgtable.h b/arch/loongarch/include/asm/pgtable.h
index 29d9b12298bc..8b5df1bbf9e9 100644
--- a/arch/loongarch/include/asm/pgtable.h
+++ b/arch/loongarch/include/asm/pgtable.h
@@ -523,6 +523,7 @@ static inline pmd_t pmd_wrprotect(pmd_t pmd)
 	return pmd;
 }
 
+#define pmd_dirty pmd_dirty
 static inline int pmd_dirty(pmd_t pmd)
 {
 	return !!(pmd_val(pmd) & (_PAGE_DIRTY | _PAGE_MODIFIED));
diff --git a/arch/mips/include/asm/pgtable.h b/arch/mips/include/asm/pgtable.h
index 430b208c0130..e27a4c83c548 100644
--- a/arch/mips/include/asm/pgtable.h
+++ b/arch/mips/include/asm/pgtable.h
@@ -655,6 +655,7 @@ static inline pmd_t pmd_mkwrite_novma(pmd_t pmd)
 	return pmd;
 }
 
+#define pmd_dirty pmd_dirty
 static inline int pmd_dirty(pmd_t pmd)
 {
 	return !!(pmd_val(pmd) & _PAGE_MODIFIED);
diff --git a/arch/riscv/include/asm/pgtable.h b/arch/riscv/include/asm/pgtable.h
index ab00235b018f..7b4287f36054 100644
--- a/arch/riscv/include/asm/pgtable.h
+++ b/arch/riscv/include/asm/pgtable.h
@@ -673,6 +673,7 @@ static inline int pmd_write(pmd_t pmd)
 	return pte_write(pmd_pte(pmd));
 }
 
+#define pmd_dirty pmd_dirty
 static inline int pmd_dirty(pmd_t pmd)
 {
 	return pte_dirty(pmd_pte(pmd));
diff --git a/arch/s390/include/asm/pgtable.h b/arch/s390/include/asm/pgtable.h
index 601e87fa8a9a..1299b56e43f6 100644
--- a/arch/s390/include/asm/pgtable.h
+++ b/arch/s390/include/asm/pgtable.h
@@ -770,6 +770,7 @@ static inline int pud_write(pud_t pud)
 	return (pud_val(pud) & _REGION3_ENTRY_WRITE) != 0;
 }
 
+#define pmd_dirty pmd_dirty
 static inline int pmd_dirty(pmd_t pmd)
 {
 	return (pmd_val(pmd) & _SEGMENT_ENTRY_DIRTY) != 0;
diff --git a/arch/sparc/include/asm/pgtable_64.h b/arch/sparc/include/asm/pgtable_64.h
index 5e41033bf4ca..a8c871b7d786 100644
--- a/arch/sparc/include/asm/pgtable_64.h
+++ b/arch/sparc/include/asm/pgtable_64.h
@@ -706,6 +706,7 @@ static inline unsigned long pmd_write(pmd_t pmd)
 #define pud_write(pud)	pte_write(__pte(pud_val(pud)))
 
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
+#define pmd_dirty pmd_dirty
 static inline unsigned long pmd_dirty(pmd_t pmd)
 {
 	pte_t pte = __pte(pmd_val(pmd));
diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h
index 08b5cb22d9a6..9d077bca6a10 100644
--- a/arch/x86/include/asm/pgtable.h
+++ b/arch/x86/include/asm/pgtable.h
@@ -141,6 +141,7 @@ static inline int pte_young(pte_t pte)
 	return pte_flags(pte) & _PAGE_ACCESSED;
 }
 
+#define pmd_dirty pmd_dirty
 static inline bool pmd_dirty(pmd_t pmd)
 {
 	return pmd_flags(pmd) & _PAGE_DIRTY_BITS;
diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h
index 9ecc20fa6269..466cf477551a 100644
--- a/include/linux/pgtable.h
+++ b/include/linux/pgtable.h
@@ -184,6 +184,13 @@ static inline int pmd_young(pmd_t pmd)
 }
 #endif
 
+#ifndef pmd_dirty
+static inline int pmd_dirty(pmd_t pmd)
+{
+	return 0;
+}
+#endif
+
 /*
  * A facility to provide lazy MMU batching.  This allows PTE updates and
  * page invalidations to be delayed until a call to leave lazy MMU mode
-- 
cgit v1.2.3


From e435ca87882167dda78776ce4bd6eb2094eb864b Mon Sep 17 00:00:00 2001
From: "Matthew Wilcox (Oracle)" <willy@infradead.org>
Date: Thu, 28 Dec 2023 08:57:43 +0000
Subject: mm: remove inc/dec lruvec page state functions

Patch series "Remove some lruvec page accounting functions", v2.

Some functions are now unused; remove them.  Make
__mod_lruvec_page_state() unused and then remove it.


This patch (of 6):

All callers of these have been converted to their folio equivalents.

Link: https://lkml.kernel.org/r/20231228085748.1083901-1-willy@infradead.org
Link: https://lkml.kernel.org/r/20231228085748.1083901-2-willy@infradead.org
Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Hyeonggon Yoo <42.hyeyoo@gmail.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Vlastimil Babka <vbabka@suse.cz>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/vmstat.h | 24 ------------------------
 1 file changed, 24 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h
index fed855bae6d8..147ae73e0ee7 100644
--- a/include/linux/vmstat.h
+++ b/include/linux/vmstat.h
@@ -597,18 +597,6 @@ static inline void mod_lruvec_page_state(struct page *page,
 
 #endif /* CONFIG_MEMCG */
 
-static inline void __inc_lruvec_page_state(struct page *page,
-					   enum node_stat_item idx)
-{
-	__mod_lruvec_page_state(page, idx, 1);
-}
-
-static inline void __dec_lruvec_page_state(struct page *page,
-					   enum node_stat_item idx)
-{
-	__mod_lruvec_page_state(page, idx, -1);
-}
-
 static inline void __lruvec_stat_mod_folio(struct folio *folio,
 					   enum node_stat_item idx, int val)
 {
@@ -627,18 +615,6 @@ static inline void __lruvec_stat_sub_folio(struct folio *folio,
 	__lruvec_stat_mod_folio(folio, idx, -folio_nr_pages(folio));
 }
 
-static inline void inc_lruvec_page_state(struct page *page,
-					 enum node_stat_item idx)
-{
-	mod_lruvec_page_state(page, idx, 1);
-}
-
-static inline void dec_lruvec_page_state(struct page *page,
-					 enum node_stat_item idx)
-{
-	mod_lruvec_page_state(page, idx, -1);
-}
-
 static inline void lruvec_stat_mod_folio(struct folio *folio,
 					 enum node_stat_item idx, int val)
 {
-- 
cgit v1.2.3


From c701123bd68bf1cc3bc167b4f597cb1f4995c39c Mon Sep 17 00:00:00 2001
From: "Matthew Wilcox (Oracle)" <willy@infradead.org>
Date: Thu, 28 Dec 2023 08:57:48 +0000
Subject: mm/memcontrol: remove __mod_lruvec_page_state()

There are no more callers of __mod_lruvec_page_state(), so convert the
implementation to __lruvec_stat_mod_folio(), removing two calls to
compound_head() (one explicit, one hidden inside page_memcg()).

Link: https://lkml.kernel.org/r/20231228085748.1083901-7-willy@infradead.org
Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Reviewed-by: Zi Yan <ziy@nvidia.com>
Acked-by: Shakeel Butt <shakeelb@google.com>
Reviewed-by: Vlastimil Babka <vbabka@suse.cz>
Cc: Hyeonggon Yoo <42.hyeyoo@gmail.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/vmstat.h | 36 ++++++++++++++++++------------------
 mm/memcontrol.c        |  9 ++++-----
 2 files changed, 22 insertions(+), 23 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h
index 147ae73e0ee7..343906a98d6e 100644
--- a/include/linux/vmstat.h
+++ b/include/linux/vmstat.h
@@ -556,19 +556,25 @@ static inline void mod_lruvec_state(struct lruvec *lruvec,
 	local_irq_restore(flags);
 }
 
-void __mod_lruvec_page_state(struct page *page,
+void __lruvec_stat_mod_folio(struct folio *folio,
 			     enum node_stat_item idx, int val);
 
-static inline void mod_lruvec_page_state(struct page *page,
+static inline void lruvec_stat_mod_folio(struct folio *folio,
 					 enum node_stat_item idx, int val)
 {
 	unsigned long flags;
 
 	local_irq_save(flags);
-	__mod_lruvec_page_state(page, idx, val);
+	__lruvec_stat_mod_folio(folio, idx, val);
 	local_irq_restore(flags);
 }
 
+static inline void mod_lruvec_page_state(struct page *page,
+					 enum node_stat_item idx, int val)
+{
+	lruvec_stat_mod_folio(page_folio(page), idx, val);
+}
+
 #else
 
 static inline void __mod_lruvec_state(struct lruvec *lruvec,
@@ -583,10 +589,16 @@ static inline void mod_lruvec_state(struct lruvec *lruvec,
 	mod_node_page_state(lruvec_pgdat(lruvec), idx, val);
 }
 
-static inline void __mod_lruvec_page_state(struct page *page,
-					   enum node_stat_item idx, int val)
+static inline void __lruvec_stat_mod_folio(struct folio *folio,
+					 enum node_stat_item idx, int val)
 {
-	__mod_node_page_state(page_pgdat(page), idx, val);
+	__mod_node_page_state(folio_pgdat(folio), idx, val);
+}
+
+static inline void lruvec_stat_mod_folio(struct folio *folio,
+					 enum node_stat_item idx, int val)
+{
+	mod_node_page_state(folio_pgdat(folio), idx, val);
 }
 
 static inline void mod_lruvec_page_state(struct page *page,
@@ -597,12 +609,6 @@ static inline void mod_lruvec_page_state(struct page *page,
 
 #endif /* CONFIG_MEMCG */
 
-static inline void __lruvec_stat_mod_folio(struct folio *folio,
-					   enum node_stat_item idx, int val)
-{
-	__mod_lruvec_page_state(&folio->page, idx, val);
-}
-
 static inline void __lruvec_stat_add_folio(struct folio *folio,
 					   enum node_stat_item idx)
 {
@@ -615,12 +621,6 @@ static inline void __lruvec_stat_sub_folio(struct folio *folio,
 	__lruvec_stat_mod_folio(folio, idx, -folio_nr_pages(folio));
 }
 
-static inline void lruvec_stat_mod_folio(struct folio *folio,
-					 enum node_stat_item idx, int val)
-{
-	mod_lruvec_page_state(&folio->page, idx, val);
-}
-
 static inline void lruvec_stat_add_folio(struct folio *folio,
 					 enum node_stat_item idx)
 {
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 3ca691fb5b49..0082cef6e1fd 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -890,16 +890,15 @@ void __mod_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx,
 		__mod_memcg_lruvec_state(lruvec, idx, val);
 }
 
-void __mod_lruvec_page_state(struct page *page, enum node_stat_item idx,
+void __lruvec_stat_mod_folio(struct folio *folio, enum node_stat_item idx,
 			     int val)
 {
-	struct page *head = compound_head(page); /* rmap on tail pages */
 	struct mem_cgroup *memcg;
-	pg_data_t *pgdat = page_pgdat(page);
+	pg_data_t *pgdat = folio_pgdat(folio);
 	struct lruvec *lruvec;
 
 	rcu_read_lock();
-	memcg = page_memcg(head);
+	memcg = folio_memcg(folio);
 	/* Untracked pages have no memcg, no lruvec. Update only the node */
 	if (!memcg) {
 		rcu_read_unlock();
@@ -911,7 +910,7 @@ void __mod_lruvec_page_state(struct page *page, enum node_stat_item idx,
 	__mod_lruvec_state(lruvec, idx, val);
 	rcu_read_unlock();
 }
-EXPORT_SYMBOL(__mod_lruvec_page_state);
+EXPORT_SYMBOL(__lruvec_stat_mod_folio);
 
 void __mod_lruvec_kmem_state(void *p, enum node_stat_item idx, int val)
 {
-- 
cgit v1.2.3


From b805ab3c6935d14654ccc28f16ffce7a13c2c528 Mon Sep 17 00:00:00 2001
From: Li Zhijian <lizhijian@fujitsu.com>
Date: Fri, 29 Dec 2023 10:26:51 +0800
Subject: mm/vmstat: move pgdemote_* out of CONFIG_NUMA_BALANCING

Demotion can work well without CONFIG_NUMA_BALANCING.  But the commit
23e9f0138963 ("mm/vmstat: move pgdemote_* to per-node stats") wrongly hid
it behind CONFIG_NUMA_BALANCING.

Fix it by moving them out of CONFIG_NUMA_BALANCING.

Link: https://lkml.kernel.org/r/20231229022651.3229174-1-lizhijian@fujitsu.com
Fixes: 23e9f0138963 ("mm/vmstat: move pgdemote_* to per-node stats")
Signed-off-by: Li Zhijian <lizhijian@fujitsu.com>
Cc: "Huang, Ying" <ying.huang@intel.com>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: "Rafael J. Wysocki" <rafael@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/mmzone.h | 2 +-
 mm/vmscan.c            | 5 +----
 mm/vmstat.c            | 2 +-
 3 files changed, 3 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 28665e1b8475..c18c53353b50 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -207,11 +207,11 @@ enum node_stat_item {
 #ifdef CONFIG_NUMA_BALANCING
 	PGPROMOTE_SUCCESS,	/* promote successfully */
 	PGPROMOTE_CANDIDATE,	/* candidate pages to promote */
+#endif
 	/* PGDEMOTE_*: pages demoted */
 	PGDEMOTE_KSWAPD,
 	PGDEMOTE_DIRECT,
 	PGDEMOTE_KHUGEPAGED,
-#endif
 	NR_VM_NODE_STAT_ITEMS
 };
 
diff --git a/mm/vmscan.c b/mm/vmscan.c
index ceba905e5630..600ed3cbf7cb 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -409,12 +409,10 @@ void drop_slab(void)
 
 static int reclaimer_offset(void)
 {
-#ifdef CONFIG_NUMA_BALANCING
 	BUILD_BUG_ON(PGSTEAL_DIRECT - PGSTEAL_KSWAPD !=
 			PGDEMOTE_DIRECT - PGDEMOTE_KSWAPD);
 	BUILD_BUG_ON(PGSTEAL_KHUGEPAGED - PGSTEAL_KSWAPD !=
 			PGDEMOTE_KHUGEPAGED - PGDEMOTE_KSWAPD);
-#endif
 	BUILD_BUG_ON(PGSTEAL_DIRECT - PGSTEAL_KSWAPD !=
 			PGSCAN_DIRECT - PGSCAN_KSWAPD);
 	BUILD_BUG_ON(PGSTEAL_KHUGEPAGED - PGSTEAL_KSWAPD !=
@@ -978,10 +976,9 @@ static unsigned int demote_folio_list(struct list_head *demote_folios,
 	migrate_pages(demote_folios, alloc_demote_folio, NULL,
 		      (unsigned long)&mtc, MIGRATE_ASYNC, MR_DEMOTION,
 		      &nr_succeeded);
-#ifdef CONFIG_NUMA_BALANCING
+
 	mod_node_page_state(pgdat, PGDEMOTE_KSWAPD + reclaimer_offset(),
 			    nr_succeeded);
-#endif
 
 	return nr_succeeded;
 }
diff --git a/mm/vmstat.c b/mm/vmstat.c
index cfd8d8256f8e..1437ca2f28c5 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -1248,10 +1248,10 @@ const char * const vmstat_text[] = {
 #ifdef CONFIG_NUMA_BALANCING
 	"pgpromote_success",
 	"pgpromote_candidate",
+#endif
 	"pgdemote_kswapd",
 	"pgdemote_direct",
 	"pgdemote_khugepaged",
-#endif
 
 	/* enum writeback_stat_item counters */
 	"nr_dirty_threshold",
-- 
cgit v1.2.3


From 26a1a86dd093a10d0653429bf013dae6e95dccbf Mon Sep 17 00:00:00 2001
From: Ira Weiny <ira.weiny@intel.com>
Date: Wed, 20 Dec 2023 16:17:29 -0800
Subject: cxl/events: Promote CXL event structures to a core header

UEFI code can process CXL events through CPER records.  Those records
use almost the same format as the CXL events.

Lift the CXL event structures to a core header to be shared in later
patches.

[jic123: drop "CXL rev 3.0" mention]

Signed-off-by: Ira Weiny <ira.weiny@intel.com>
Link: https://lore.kernel.org/r/20231220-cxl-cper-v5-2-1bb8a4ca2c7a@intel.com
[djbw: add F: entry to maintainers for include/linux/cxl-event.h]
Reviewed-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
Acked-by: Ard Biesheuvel <ardb@kernel.org>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 MAINTAINERS               |  1 +
 drivers/cxl/cxlmem.h      | 90 +-------------------------------------------
 include/linux/cxl-event.h | 95 +++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 97 insertions(+), 89 deletions(-)
 create mode 100644 include/linux/cxl-event.h

(limited to 'include/linux')

diff --git a/MAINTAINERS b/MAINTAINERS
index 7cef2d2ef8d7..04f6c52d0a8f 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -5245,6 +5245,7 @@ M:	Dan Williams <dan.j.williams@intel.com>
 L:	linux-cxl@vger.kernel.org
 S:	Maintained
 F:	drivers/cxl/
+F:	include/linux/cxl-event.h
 F:	include/uapi/linux/cxl_mem.h
 F:	tools/testing/cxl/
 
diff --git a/drivers/cxl/cxlmem.h b/drivers/cxl/cxlmem.h
index a2fcbca253f3..f0e7ebb84f02 100644
--- a/drivers/cxl/cxlmem.h
+++ b/drivers/cxl/cxlmem.h
@@ -6,6 +6,7 @@
 #include <linux/cdev.h>
 #include <linux/uuid.h>
 #include <linux/rcuwait.h>
+#include <linux/cxl-event.h>
 #include "cxl.h"
 
 /* CXL 2.0 8.2.8.5.1.1 Memory Device Status Register */
@@ -579,27 +580,6 @@ struct cxl_mbox_identify {
 	u8 qos_telemetry_caps;
 } __packed;
 
-/*
- * Common Event Record Format
- * CXL rev 3.0 section 8.2.9.2.1; Table 8-42
- */
-struct cxl_event_record_hdr {
-	uuid_t id;
-	u8 length;
-	u8 flags[3];
-	__le16 handle;
-	__le16 related_handle;
-	__le64 timestamp;
-	u8 maint_op_class;
-	u8 reserved[15];
-} __packed;
-
-#define CXL_EVENT_RECORD_DATA_LENGTH 0x50
-struct cxl_event_record_raw {
-	struct cxl_event_record_hdr hdr;
-	u8 data[CXL_EVENT_RECORD_DATA_LENGTH];
-} __packed;
-
 /*
  * Get Event Records output payload
  * CXL rev 3.0 section 8.2.9.2.2; Table 8-50
@@ -641,74 +621,6 @@ struct cxl_mbox_clear_event_payload {
 } __packed;
 #define CXL_CLEAR_EVENT_MAX_HANDLES U8_MAX
 
-/*
- * General Media Event Record
- * CXL rev 3.0 Section 8.2.9.2.1.1; Table 8-43
- */
-#define CXL_EVENT_GEN_MED_COMP_ID_SIZE	0x10
-struct cxl_event_gen_media {
-	struct cxl_event_record_hdr hdr;
-	__le64 phys_addr;
-	u8 descriptor;
-	u8 type;
-	u8 transaction_type;
-	u8 validity_flags[2];
-	u8 channel;
-	u8 rank;
-	u8 device[3];
-	u8 component_id[CXL_EVENT_GEN_MED_COMP_ID_SIZE];
-	u8 reserved[46];
-} __packed;
-
-/*
- * DRAM Event Record - DER
- * CXL rev 3.0 section 8.2.9.2.1.2; Table 3-44
- */
-#define CXL_EVENT_DER_CORRECTION_MASK_SIZE	0x20
-struct cxl_event_dram {
-	struct cxl_event_record_hdr hdr;
-	__le64 phys_addr;
-	u8 descriptor;
-	u8 type;
-	u8 transaction_type;
-	u8 validity_flags[2];
-	u8 channel;
-	u8 rank;
-	u8 nibble_mask[3];
-	u8 bank_group;
-	u8 bank;
-	u8 row[3];
-	u8 column[2];
-	u8 correction_mask[CXL_EVENT_DER_CORRECTION_MASK_SIZE];
-	u8 reserved[0x17];
-} __packed;
-
-/*
- * Get Health Info Record
- * CXL rev 3.0 section 8.2.9.8.3.1; Table 8-100
- */
-struct cxl_get_health_info {
-	u8 health_status;
-	u8 media_status;
-	u8 add_status;
-	u8 life_used;
-	u8 device_temp[2];
-	u8 dirty_shutdown_cnt[4];
-	u8 cor_vol_err_cnt[4];
-	u8 cor_per_err_cnt[4];
-} __packed;
-
-/*
- * Memory Module Event Record
- * CXL rev 3.0 section 8.2.9.2.1.3; Table 8-45
- */
-struct cxl_event_mem_module {
-	struct cxl_event_record_hdr hdr;
-	u8 event_type;
-	struct cxl_get_health_info info;
-	u8 reserved[0x3d];
-} __packed;
-
 struct cxl_mbox_get_partition_info {
 	__le64 active_volatile_cap;
 	__le64 active_persistent_cap;
diff --git a/include/linux/cxl-event.h b/include/linux/cxl-event.h
new file mode 100644
index 000000000000..0fc068123f8e
--- /dev/null
+++ b/include/linux/cxl-event.h
@@ -0,0 +1,95 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/* Copyright(c) 2023 Intel Corporation. */
+#ifndef _LINUX_CXL_EVENT_H
+#define _LINUX_CXL_EVENT_H
+
+/*
+ * Common Event Record Format
+ * CXL rev 3.0 section 8.2.9.2.1; Table 8-42
+ */
+struct cxl_event_record_hdr {
+	uuid_t id;
+	u8 length;
+	u8 flags[3];
+	__le16 handle;
+	__le16 related_handle;
+	__le64 timestamp;
+	u8 maint_op_class;
+	u8 reserved[15];
+} __packed;
+
+#define CXL_EVENT_RECORD_DATA_LENGTH 0x50
+struct cxl_event_record_raw {
+	struct cxl_event_record_hdr hdr;
+	u8 data[CXL_EVENT_RECORD_DATA_LENGTH];
+} __packed;
+
+/*
+ * General Media Event Record
+ * CXL rev 3.0 Section 8.2.9.2.1.1; Table 8-43
+ */
+#define CXL_EVENT_GEN_MED_COMP_ID_SIZE	0x10
+struct cxl_event_gen_media {
+	struct cxl_event_record_hdr hdr;
+	__le64 phys_addr;
+	u8 descriptor;
+	u8 type;
+	u8 transaction_type;
+	u8 validity_flags[2];
+	u8 channel;
+	u8 rank;
+	u8 device[3];
+	u8 component_id[CXL_EVENT_GEN_MED_COMP_ID_SIZE];
+	u8 reserved[46];
+} __packed;
+
+/*
+ * DRAM Event Record - DER
+ * CXL rev 3.0 section 8.2.9.2.1.2; Table 3-44
+ */
+#define CXL_EVENT_DER_CORRECTION_MASK_SIZE	0x20
+struct cxl_event_dram {
+	struct cxl_event_record_hdr hdr;
+	__le64 phys_addr;
+	u8 descriptor;
+	u8 type;
+	u8 transaction_type;
+	u8 validity_flags[2];
+	u8 channel;
+	u8 rank;
+	u8 nibble_mask[3];
+	u8 bank_group;
+	u8 bank;
+	u8 row[3];
+	u8 column[2];
+	u8 correction_mask[CXL_EVENT_DER_CORRECTION_MASK_SIZE];
+	u8 reserved[0x17];
+} __packed;
+
+/*
+ * Get Health Info Record
+ * CXL rev 3.0 section 8.2.9.8.3.1; Table 8-100
+ */
+struct cxl_get_health_info {
+	u8 health_status;
+	u8 media_status;
+	u8 add_status;
+	u8 life_used;
+	u8 device_temp[2];
+	u8 dirty_shutdown_cnt[4];
+	u8 cor_vol_err_cnt[4];
+	u8 cor_per_err_cnt[4];
+} __packed;
+
+/*
+ * Memory Module Event Record
+ * CXL rev 3.0 section 8.2.9.2.1.3; Table 8-45
+ */
+struct cxl_event_mem_module {
+	struct cxl_event_record_hdr hdr;
+	u8 event_type;
+	struct cxl_get_health_info info;
+	u8 reserved[0x3d];
+} __packed;
+
+#endif /* _LINUX_CXL_EVENT_H */
-- 
cgit v1.2.3


From 477bd4beb93bf9ace9bda71f1437b191befa9cf4 Mon Sep 17 00:00:00 2001
From: Swee Leong Ching <leong.ching.swee@intel.com>
Date: Fri, 5 Jan 2024 15:09:23 +0800
Subject: net: stmmac: Make MSI interrupt routine generic

There is no support for per DMA channel interrupt for non-MSI platform,
where the MAC's per channel interrupt hooks up to interrupt controller(GIC)
through shared peripheral interrupt(SPI) to handle interrupt from TX/RX
transmit channel.

This patch generalize the existing MSI ISR to also support non-MSI
platform.

Signed-off-by: Teoh Ji Sheng <ji.sheng.teoh@intel.com>
Signed-off-by: Swee Leong Ching <leong.ching.swee@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/stmicro/stmmac/dwmac-intel.c  |  4 +--
 .../net/ethernet/stmicro/stmmac/dwmac-socfpga.c    |  3 +++
 drivers/net/ethernet/stmicro/stmmac/dwmac4_dma.c   |  2 +-
 drivers/net/ethernet/stmicro/stmmac/stmmac_main.c  | 30 +++++++++++-----------
 include/linux/stmmac.h                             |  4 +--
 5 files changed, 23 insertions(+), 20 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac-intel.c b/drivers/net/ethernet/stmicro/stmmac/dwmac-intel.c
index 60283543ffc8..f0ec69af96c9 100644
--- a/drivers/net/ethernet/stmicro/stmmac/dwmac-intel.c
+++ b/drivers/net/ethernet/stmicro/stmmac/dwmac-intel.c
@@ -952,7 +952,7 @@ static int stmmac_config_single_msi(struct pci_dev *pdev,
 
 	res->irq = pci_irq_vector(pdev, 0);
 	res->wol_irq = res->irq;
-	plat->flags &= ~STMMAC_FLAG_MULTI_MSI_EN;
+	plat->flags &= ~STMMAC_FLAG_MULTI_IRQ_EN;
 	dev_info(&pdev->dev, "%s: Single IRQ enablement successful\n",
 		 __func__);
 
@@ -1004,7 +1004,7 @@ static int stmmac_config_multi_msi(struct pci_dev *pdev,
 	if (plat->msi_sfty_ue_vec < STMMAC_MSI_VEC_MAX)
 		res->sfty_ue_irq = pci_irq_vector(pdev, plat->msi_sfty_ue_vec);
 
-	plat->flags |= STMMAC_FLAG_MULTI_MSI_EN;
+	plat->flags |= STMMAC_FLAG_MULTI_IRQ_EN;
 	dev_info(&pdev->dev, "%s: multi MSI enablement successful\n", __func__);
 
 	return 0;
diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac-socfpga.c b/drivers/net/ethernet/stmicro/stmmac/dwmac-socfpga.c
index ba2ce776bd4d..cf43fb3c6cc5 100644
--- a/drivers/net/ethernet/stmicro/stmmac/dwmac-socfpga.c
+++ b/drivers/net/ethernet/stmicro/stmmac/dwmac-socfpga.c
@@ -427,6 +427,9 @@ static int socfpga_dwmac_probe(struct platform_device *pdev)
 	plat_dat->bsp_priv = dwmac;
 	plat_dat->fix_mac_speed = socfpga_dwmac_fix_mac_speed;
 
+	if (stmmac_res.rx_irq[0] > 0 && stmmac_res.tx_irq[0] > 0)
+		plat_dat->flags |= STMMAC_FLAG_MULTI_IRQ_EN;
+
 	ret = stmmac_dvr_probe(&pdev->dev, plat_dat, &stmmac_res);
 	if (ret)
 		return ret;
diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac4_dma.c b/drivers/net/ethernet/stmicro/stmmac/dwmac4_dma.c
index 84d3a8551b03..5f649106ffcd 100644
--- a/drivers/net/ethernet/stmicro/stmmac/dwmac4_dma.c
+++ b/drivers/net/ethernet/stmicro/stmmac/dwmac4_dma.c
@@ -175,7 +175,7 @@ static void dwmac4_dma_init(void __iomem *ioaddr,
 
 	value = readl(ioaddr + DMA_BUS_MODE);
 
-	if (dma_cfg->multi_msi_en) {
+	if (dma_cfg->multi_irq_en) {
 		value &= ~DMA_BUS_MODE_INTM_MASK;
 		value |= (DMA_BUS_MODE_INTM_MODE1 << DMA_BUS_MODE_INTM_SHIFT);
 	}
diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
index 47de466e432c..57873b879b33 100644
--- a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
+++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
@@ -129,8 +129,8 @@ static irqreturn_t stmmac_interrupt(int irq, void *dev_id);
 /* For MSI interrupts handling */
 static irqreturn_t stmmac_mac_interrupt(int irq, void *dev_id);
 static irqreturn_t stmmac_safety_interrupt(int irq, void *dev_id);
-static irqreturn_t stmmac_msi_intr_tx(int irq, void *data);
-static irqreturn_t stmmac_msi_intr_rx(int irq, void *data);
+static irqreturn_t stmmac_dma_tx_interrupt(int irq, void *data);
+static irqreturn_t stmmac_dma_rx_interrupt(int irq, void *data);
 static void stmmac_reset_rx_queue(struct stmmac_priv *priv, u32 queue);
 static void stmmac_reset_tx_queue(struct stmmac_priv *priv, u32 queue);
 static void stmmac_reset_queues_param(struct stmmac_priv *priv);
@@ -3602,7 +3602,7 @@ static void stmmac_free_irq(struct net_device *dev,
 	}
 }
 
-static int stmmac_request_irq_multi_msi(struct net_device *dev)
+static int stmmac_request_irq_multi(struct net_device *dev)
 {
 	struct stmmac_priv *priv = netdev_priv(dev);
 	enum request_irq_err irq_err;
@@ -3697,7 +3697,7 @@ static int stmmac_request_irq_multi_msi(struct net_device *dev)
 		}
 	}
 
-	/* Request Rx MSI irq */
+	/* Request Rx irq */
 	for (i = 0; i < priv->plat->rx_queues_to_use; i++) {
 		if (i >= MTL_MAX_RX_QUEUES)
 			break;
@@ -3707,11 +3707,11 @@ static int stmmac_request_irq_multi_msi(struct net_device *dev)
 		int_name = priv->int_name_rx_irq[i];
 		sprintf(int_name, "%s:%s-%d", dev->name, "rx", i);
 		ret = request_irq(priv->rx_irq[i],
-				  stmmac_msi_intr_rx,
+				  stmmac_dma_rx_interrupt,
 				  0, int_name, &priv->dma_conf.rx_queue[i]);
 		if (unlikely(ret < 0)) {
 			netdev_err(priv->dev,
-				   "%s: alloc rx-%d  MSI %d (error: %d)\n",
+				   "%s: alloc rx-%d  dma rx_irq %d (error: %d)\n",
 				   __func__, i, priv->rx_irq[i], ret);
 			irq_err = REQ_IRQ_ERR_RX;
 			irq_idx = i;
@@ -3722,7 +3722,7 @@ static int stmmac_request_irq_multi_msi(struct net_device *dev)
 		irq_set_affinity_hint(priv->rx_irq[i], &cpu_mask);
 	}
 
-	/* Request Tx MSI irq */
+	/* Request Tx irq */
 	for (i = 0; i < priv->plat->tx_queues_to_use; i++) {
 		if (i >= MTL_MAX_TX_QUEUES)
 			break;
@@ -3732,11 +3732,11 @@ static int stmmac_request_irq_multi_msi(struct net_device *dev)
 		int_name = priv->int_name_tx_irq[i];
 		sprintf(int_name, "%s:%s-%d", dev->name, "tx", i);
 		ret = request_irq(priv->tx_irq[i],
-				  stmmac_msi_intr_tx,
+				  stmmac_dma_tx_interrupt,
 				  0, int_name, &priv->dma_conf.tx_queue[i]);
 		if (unlikely(ret < 0)) {
 			netdev_err(priv->dev,
-				   "%s: alloc tx-%d  MSI %d (error: %d)\n",
+				   "%s: alloc tx-%d  dma tx_irq %d (error: %d)\n",
 				   __func__, i, priv->tx_irq[i], ret);
 			irq_err = REQ_IRQ_ERR_TX;
 			irq_idx = i;
@@ -3811,8 +3811,8 @@ static int stmmac_request_irq(struct net_device *dev)
 	int ret;
 
 	/* Request the IRQ lines */
-	if (priv->plat->flags & STMMAC_FLAG_MULTI_MSI_EN)
-		ret = stmmac_request_irq_multi_msi(dev);
+	if (priv->plat->flags & STMMAC_FLAG_MULTI_IRQ_EN)
+		ret = stmmac_request_irq_multi(dev);
 	else
 		ret = stmmac_request_irq_single(dev);
 
@@ -6075,7 +6075,7 @@ static irqreturn_t stmmac_safety_interrupt(int irq, void *dev_id)
 	return IRQ_HANDLED;
 }
 
-static irqreturn_t stmmac_msi_intr_tx(int irq, void *data)
+static irqreturn_t stmmac_dma_tx_interrupt(int irq, void *data)
 {
 	struct stmmac_tx_queue *tx_q = (struct stmmac_tx_queue *)data;
 	struct stmmac_dma_conf *dma_conf;
@@ -6107,7 +6107,7 @@ static irqreturn_t stmmac_msi_intr_tx(int irq, void *data)
 	return IRQ_HANDLED;
 }
 
-static irqreturn_t stmmac_msi_intr_rx(int irq, void *data)
+static irqreturn_t stmmac_dma_rx_interrupt(int irq, void *data)
 {
 	struct stmmac_rx_queue *rx_q = (struct stmmac_rx_queue *)data;
 	struct stmmac_dma_conf *dma_conf;
@@ -7456,8 +7456,8 @@ int stmmac_dvr_probe(struct device *device,
 	priv->plat = plat_dat;
 	priv->ioaddr = res->addr;
 	priv->dev->base_addr = (unsigned long)res->addr;
-	priv->plat->dma_cfg->multi_msi_en =
-		(priv->plat->flags & STMMAC_FLAG_MULTI_MSI_EN);
+	priv->plat->dma_cfg->multi_irq_en =
+		(priv->plat->flags & STMMAC_FLAG_MULTI_IRQ_EN);
 
 	priv->dev->irq = res->irq;
 	priv->wol_irq = res->wol_irq;
diff --git a/include/linux/stmmac.h b/include/linux/stmmac.h
index dee5ad6e48c5..b950e6f9761d 100644
--- a/include/linux/stmmac.h
+++ b/include/linux/stmmac.h
@@ -98,7 +98,7 @@ struct stmmac_dma_cfg {
 	int mixed_burst;
 	bool aal;
 	bool eame;
-	bool multi_msi_en;
+	bool multi_irq_en;
 	bool dche;
 };
 
@@ -215,7 +215,7 @@ struct dwmac4_addrs {
 #define STMMAC_FLAG_TSO_EN			BIT(4)
 #define STMMAC_FLAG_SERDES_UP_AFTER_PHY_LINKUP	BIT(5)
 #define STMMAC_FLAG_VLAN_FAIL_Q_EN		BIT(6)
-#define STMMAC_FLAG_MULTI_MSI_EN		BIT(7)
+#define STMMAC_FLAG_MULTI_IRQ_EN		BIT(7)
 #define STMMAC_FLAG_EXT_SNAPSHOT_EN		BIT(8)
 #define STMMAC_FLAG_INT_SNAPSHOT_EN		BIT(9)
 #define STMMAC_FLAG_RX_CLK_RUNS_IN_LPI		BIT(10)
-- 
cgit v1.2.3


From deb704281f076097b0347116a82edeba96697db1 Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Fri, 17 Nov 2023 17:14:27 -0500
Subject: SUNRPC: Add a server-side API for retrieving an RPC's pseudoflavor

NFSD will use this new API to determine whether nfsd_splice_read is
safe to use. This avoids the need to add a dependency to NFSD for
CONFIG_SUNRPC_GSS.

Reviewed-by: Jeff Layton <jlayton@kernel.org>
Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
---
 include/linux/sunrpc/svcauth.h    |  7 ++++++-
 net/sunrpc/auth_gss/svcauth_gss.c |  6 ++++++
 net/sunrpc/svcauth.c              | 16 ++++++++++++++++
 3 files changed, 28 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/sunrpc/svcauth.h b/include/linux/sunrpc/svcauth.h
index 6f90203edbf8..61c455f1e1f5 100644
--- a/include/linux/sunrpc/svcauth.h
+++ b/include/linux/sunrpc/svcauth.h
@@ -131,8 +131,11 @@ enum svc_auth_status {
  *   This call releases a domain.
  *
  * set_client()
- *   Givens a pending request (struct svc_rqst), finds and assigns
+ *   Given a pending request (struct svc_rqst), finds and assigns
  *   an appropriate 'auth_domain' as the client.
+ *
+ * pseudoflavor()
+ *   Returns RPC_AUTH pseudoflavor in use by @rqstp.
  */
 struct auth_ops {
 	char *	name;
@@ -143,11 +146,13 @@ struct auth_ops {
 	int			(*release)(struct svc_rqst *rqstp);
 	void			(*domain_release)(struct auth_domain *dom);
 	enum svc_auth_status	(*set_client)(struct svc_rqst *rqstp);
+	rpc_authflavor_t	(*pseudoflavor)(struct svc_rqst *rqstp);
 };
 
 struct svc_xprt;
 
 extern enum svc_auth_status svc_authenticate(struct svc_rqst *rqstp);
+extern rpc_authflavor_t svc_auth_flavor(struct svc_rqst *rqstp);
 extern int	svc_authorise(struct svc_rqst *rqstp);
 extern enum svc_auth_status svc_set_client(struct svc_rqst *rqstp);
 extern int	svc_auth_register(rpc_authflavor_t flavor, struct auth_ops *aops);
diff --git a/net/sunrpc/auth_gss/svcauth_gss.c b/net/sunrpc/auth_gss/svcauth_gss.c
index 18734e70c5dd..104d9a320142 100644
--- a/net/sunrpc/auth_gss/svcauth_gss.c
+++ b/net/sunrpc/auth_gss/svcauth_gss.c
@@ -2014,6 +2014,11 @@ svcauth_gss_domain_release(struct auth_domain *dom)
 	call_rcu(&dom->rcu_head, svcauth_gss_domain_release_rcu);
 }
 
+static rpc_authflavor_t svcauth_gss_pseudoflavor(struct svc_rqst *rqstp)
+{
+	return svcauth_gss_flavor(rqstp->rq_gssclient);
+}
+
 static struct auth_ops svcauthops_gss = {
 	.name		= "rpcsec_gss",
 	.owner		= THIS_MODULE,
@@ -2022,6 +2027,7 @@ static struct auth_ops svcauthops_gss = {
 	.release	= svcauth_gss_release,
 	.domain_release = svcauth_gss_domain_release,
 	.set_client	= svcauth_gss_set_client,
+	.pseudoflavor	= svcauth_gss_pseudoflavor,
 };
 
 static int rsi_cache_create_net(struct net *net)
diff --git a/net/sunrpc/svcauth.c b/net/sunrpc/svcauth.c
index aa4429d0b810..1619211f0960 100644
--- a/net/sunrpc/svcauth.c
+++ b/net/sunrpc/svcauth.c
@@ -160,6 +160,22 @@ svc_auth_unregister(rpc_authflavor_t flavor)
 }
 EXPORT_SYMBOL_GPL(svc_auth_unregister);
 
+/**
+ * svc_auth_flavor - return RPC transaction's RPC_AUTH flavor
+ * @rqstp: RPC transaction context
+ *
+ * Returns an RPC flavor or GSS pseudoflavor.
+ */
+rpc_authflavor_t svc_auth_flavor(struct svc_rqst *rqstp)
+{
+	struct auth_ops *aops = rqstp->rq_authop;
+
+	if (!aops->pseudoflavor)
+		return aops->flavour;
+	return aops->pseudoflavor(rqstp);
+}
+EXPORT_SYMBOL_GPL(svc_auth_flavor);
+
 /**************************************************
  * 'auth_domains' are stored in a hash table indexed by name.
  * When the last reference to an 'auth_domain' is dropped,
-- 
cgit v1.2.3


From 3587b5c75376fd0b6ca8c4a8de54954e410f4e0e Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Fri, 17 Nov 2023 17:14:46 -0500
Subject: SUNRPC: Remove RQ_SPLICE_OK

This flag is no longer used.

Reviewed-by: Jeff Layton <jlayton@kernel.org>
Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
---
 include/linux/sunrpc/svc.h        |  2 --
 include/trace/events/sunrpc.h     |  1 -
 net/sunrpc/auth_gss/svcauth_gss.c | 10 ----------
 net/sunrpc/svc.c                  |  2 --
 4 files changed, 15 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sunrpc/svc.h b/include/linux/sunrpc/svc.h
index b10f987509cc..544fcfe07479 100644
--- a/include/linux/sunrpc/svc.h
+++ b/include/linux/sunrpc/svc.h
@@ -260,8 +260,6 @@ enum {
 	RQ_LOCAL,		/* local request */
 	RQ_USEDEFERRAL,		/* use deferral */
 	RQ_DROPME,		/* drop current reply */
-	RQ_SPLICE_OK,		/* turned off in gss privacy to prevent
-				 * encrypting page cache pages */
 	RQ_VICTIM,		/* Have agreed to shut down */
 	RQ_DATA,		/* request has data */
 };
diff --git a/include/trace/events/sunrpc.h b/include/trace/events/sunrpc.h
index 337c90787fb1..cdd3a45e6003 100644
--- a/include/trace/events/sunrpc.h
+++ b/include/trace/events/sunrpc.h
@@ -1675,7 +1675,6 @@ DEFINE_SVCXDRBUF_EVENT(sendto);
 	svc_rqst_flag(LOCAL)						\
 	svc_rqst_flag(USEDEFERRAL)					\
 	svc_rqst_flag(DROPME)						\
-	svc_rqst_flag(SPLICE_OK)					\
 	svc_rqst_flag(VICTIM)						\
 	svc_rqst_flag_end(DATA)
 
diff --git a/net/sunrpc/auth_gss/svcauth_gss.c b/net/sunrpc/auth_gss/svcauth_gss.c
index 104d9a320142..24de94184700 100644
--- a/net/sunrpc/auth_gss/svcauth_gss.c
+++ b/net/sunrpc/auth_gss/svcauth_gss.c
@@ -866,14 +866,6 @@ svcauth_gss_unwrap_integ(struct svc_rqst *rqstp, u32 seq, struct gss_ctx *ctx)
 	struct xdr_buf databody_integ;
 	struct xdr_netobj checksum;
 
-	/* NFS READ normally uses splice to send data in-place. However
-	 * the data in cache can change after the reply's MIC is computed
-	 * but before the RPC reply is sent. To prevent the client from
-	 * rejecting the server-computed MIC in this somewhat rare case,
-	 * do not use splice with the GSS integrity service.
-	 */
-	clear_bit(RQ_SPLICE_OK, &rqstp->rq_flags);
-
 	/* Did we already verify the signature on the original pass through? */
 	if (rqstp->rq_deferred)
 		return 0;
@@ -948,8 +940,6 @@ svcauth_gss_unwrap_priv(struct svc_rqst *rqstp, u32 seq, struct gss_ctx *ctx)
 	struct xdr_buf *buf = xdr->buf;
 	unsigned int saved_len;
 
-	clear_bit(RQ_SPLICE_OK, &rqstp->rq_flags);
-
 	if (xdr_stream_decode_u32(xdr, &len) < 0)
 		goto unwrap_failed;
 	if (rqstp->rq_deferred) {
diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c
index 3f2ea7a0496f..fa4e23fa0e09 100644
--- a/net/sunrpc/svc.c
+++ b/net/sunrpc/svc.c
@@ -1305,8 +1305,6 @@ svc_process_common(struct svc_rqst *rqstp)
 	int			rc;
 	__be32			*p;
 
-	/* Will be turned off by GSS integrity and privacy services */
-	set_bit(RQ_SPLICE_OK, &rqstp->rq_flags);
 	/* Will be turned off only when NFSv4 Sessions are used */
 	set_bit(RQ_USEDEFERRAL, &rqstp->rq_flags);
 	clear_bit(RQ_DROPME, &rqstp->rq_flags);
-- 
cgit v1.2.3


From 52e89100754b2e888cb63bf2d19e65d809497cd6 Mon Sep 17 00:00:00 2001
From: ChenXiaoSong <chenxiaosong@kylinos.cn>
Date: Sat, 2 Dec 2023 21:07:25 +0000
Subject: NFSv4, NFSD: move enum nfs_cb_opnum4 to include/linux/nfs4.h

Callback operations enum is defined in client and server, move it to
common header file.

Signed-off-by: ChenXiaoSong <chenxiaosong@kylinos.cn>
Acked-by: Anna Schumaker <Anna.Schumaker@netapp.com>
Reviewed-by: Jeff Layton <jlayton@kernel.org>
Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
---
 fs/nfs/callback.h      | 19 -------------------
 fs/nfsd/nfs4callback.c | 26 +-------------------------
 include/linux/nfs4.h   | 22 ++++++++++++++++++++++
 3 files changed, 23 insertions(+), 44 deletions(-)

(limited to 'include/linux')

diff --git a/fs/nfs/callback.h b/fs/nfs/callback.h
index ccd4f245cae2..0279b78b5fc9 100644
--- a/fs/nfs/callback.h
+++ b/fs/nfs/callback.h
@@ -19,25 +19,6 @@ enum nfs4_callback_procnum {
 	CB_COMPOUND = 1,
 };
 
-enum nfs4_callback_opnum {
-	OP_CB_GETATTR = 3,
-	OP_CB_RECALL  = 4,
-/* Callback operations new to NFSv4.1 */
-	OP_CB_LAYOUTRECALL  = 5,
-	OP_CB_NOTIFY        = 6,
-	OP_CB_PUSH_DELEG    = 7,
-	OP_CB_RECALL_ANY    = 8,
-	OP_CB_RECALLABLE_OBJ_AVAIL = 9,
-	OP_CB_RECALL_SLOT   = 10,
-	OP_CB_SEQUENCE      = 11,
-	OP_CB_WANTS_CANCELLED = 12,
-	OP_CB_NOTIFY_LOCK   = 13,
-	OP_CB_NOTIFY_DEVICEID = 14,
-/* Callback operations new to NFSv4.2 */
-	OP_CB_OFFLOAD = 15,
-	OP_CB_ILLEGAL = 10044,
-};
-
 struct nfs4_slot;
 struct cb_process_state {
 	__be32			drc_status;
diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c
index 4039ffcf90ba..926c29879c6a 100644
--- a/fs/nfsd/nfs4callback.c
+++ b/fs/nfsd/nfs4callback.c
@@ -31,6 +31,7 @@
  *  SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  */
 
+#include <linux/nfs4.h>
 #include <linux/sunrpc/clnt.h>
 #include <linux/sunrpc/xprt.h>
 #include <linux/sunrpc/svc_xprt.h>
@@ -87,31 +88,6 @@ static void encode_bitmap4(struct xdr_stream *xdr, const __u32 *bitmap,
 	WARN_ON_ONCE(xdr_stream_encode_uint32_array(xdr, bitmap, len) < 0);
 }
 
-/*
- *	nfs_cb_opnum4
- *
- *	enum nfs_cb_opnum4 {
- *		OP_CB_GETATTR		= 3,
- *		  ...
- *	};
- */
-enum nfs_cb_opnum4 {
-	OP_CB_GETATTR			= 3,
-	OP_CB_RECALL			= 4,
-	OP_CB_LAYOUTRECALL		= 5,
-	OP_CB_NOTIFY			= 6,
-	OP_CB_PUSH_DELEG		= 7,
-	OP_CB_RECALL_ANY		= 8,
-	OP_CB_RECALLABLE_OBJ_AVAIL	= 9,
-	OP_CB_RECALL_SLOT		= 10,
-	OP_CB_SEQUENCE			= 11,
-	OP_CB_WANTS_CANCELLED		= 12,
-	OP_CB_NOTIFY_LOCK		= 13,
-	OP_CB_NOTIFY_DEVICEID		= 14,
-	OP_CB_OFFLOAD			= 15,
-	OP_CB_ILLEGAL			= 10044
-};
-
 static void encode_nfs_cb_opnum4(struct xdr_stream *xdr, enum nfs_cb_opnum4 op)
 {
 	__be32 *p;
diff --git a/include/linux/nfs4.h b/include/linux/nfs4.h
index c11c4db34639..ef8d2d618d5b 100644
--- a/include/linux/nfs4.h
+++ b/include/linux/nfs4.h
@@ -869,4 +869,26 @@ enum {
 	RCA4_TYPE_MASK_OTHER_LAYOUT_MAX	= 15,
 };
 
+enum nfs_cb_opnum4 {
+	OP_CB_GETATTR = 3,
+	OP_CB_RECALL  = 4,
+
+	/* Callback operations new to NFSv4.1 */
+	OP_CB_LAYOUTRECALL  = 5,
+	OP_CB_NOTIFY        = 6,
+	OP_CB_PUSH_DELEG    = 7,
+	OP_CB_RECALL_ANY    = 8,
+	OP_CB_RECALLABLE_OBJ_AVAIL = 9,
+	OP_CB_RECALL_SLOT   = 10,
+	OP_CB_SEQUENCE      = 11,
+	OP_CB_WANTS_CANCELLED = 12,
+	OP_CB_NOTIFY_LOCK   = 13,
+	OP_CB_NOTIFY_DEVICEID = 14,
+
+	/* Callback operations new to NFSv4.2 */
+	OP_CB_OFFLOAD = 15,
+
+	OP_CB_ILLEGAL = 10044,
+};
+
 #endif
-- 
cgit v1.2.3


From b541dd554bc0442f7ff8c6cab6c5460c044913c8 Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Tue, 21 Nov 2023 11:40:13 -0500
Subject: svcrdma: Eliminate allocation of recv_ctxt objects in backchannel

The svc_rdma_recv_ctxt free list uses a lockless list to avoid the
need for a spin lock in the fast path. llist_del_first(), which is
used by svc_rdma_recv_ctxt_get(), requires serialization, however,
when there are multiple list producers that are unserialized.

I mistakenly thought there was only one caller of
svc_rdma_recv_ctxt_get() (svc_rdma_refresh_recvs()), thus explicit
serialization would not be necessary. But there is another caller:
svc_rdma_bc_sendto(), and these two are not serialized against each
other. I haven't seen ill effects that I could directly ascribe to
a lack of serialization. It's just an observation based on code
audit.

When DMA-mapping before sending a Reply, the passed-in struct
svc_rdma_recv_ctxt is used only for its write and reply PCLs. These
are currently always empty in the backchannel case. So, instead of
passing a full svc_rdma_recv_ctxt object to
svc_rdma_map_reply_msg(), let's pass in just the Write and Reply
PCLs.

This change makes it unnecessary for the backchannel to acquire a
dummy svc_rdma_recv_ctxt object when sending an RPC Call. The need
for svc_rdma_recv_ctxt free list serialization is now completely
avoided.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
---
 include/linux/sunrpc/svc_rdma.h            |  3 ++-
 net/sunrpc/xprtrdma/svc_rdma_backchannel.c | 11 ++++-------
 net/sunrpc/xprtrdma/svc_rdma_sendto.c      | 31 ++++++++++++++++--------------
 3 files changed, 23 insertions(+), 22 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sunrpc/svc_rdma.h b/include/linux/sunrpc/svc_rdma.h
index a5ee0af2a310..4ac32895a058 100644
--- a/include/linux/sunrpc/svc_rdma.h
+++ b/include/linux/sunrpc/svc_rdma.h
@@ -200,7 +200,8 @@ extern int svc_rdma_send(struct svcxprt_rdma *rdma,
 			 struct svc_rdma_send_ctxt *ctxt);
 extern int svc_rdma_map_reply_msg(struct svcxprt_rdma *rdma,
 				  struct svc_rdma_send_ctxt *sctxt,
-				  const struct svc_rdma_recv_ctxt *rctxt,
+				  const struct svc_rdma_pcl *write_pcl,
+				  const struct svc_rdma_pcl *reply_pcl,
 				  const struct xdr_buf *xdr);
 extern void svc_rdma_send_error_msg(struct svcxprt_rdma *rdma,
 				    struct svc_rdma_send_ctxt *sctxt,
diff --git a/net/sunrpc/xprtrdma/svc_rdma_backchannel.c b/net/sunrpc/xprtrdma/svc_rdma_backchannel.c
index 7420a2c990c7..c9be6778643b 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_backchannel.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_backchannel.c
@@ -76,15 +76,12 @@ static int svc_rdma_bc_sendto(struct svcxprt_rdma *rdma,
 			      struct rpc_rqst *rqst,
 			      struct svc_rdma_send_ctxt *sctxt)
 {
-	struct svc_rdma_recv_ctxt *rctxt;
+	struct svc_rdma_pcl empty_pcl;
 	int ret;
 
-	rctxt = svc_rdma_recv_ctxt_get(rdma);
-	if (!rctxt)
-		return -EIO;
-
-	ret = svc_rdma_map_reply_msg(rdma, sctxt, rctxt, &rqst->rq_snd_buf);
-	svc_rdma_recv_ctxt_put(rdma, rctxt);
+	pcl_init(&empty_pcl);
+	ret = svc_rdma_map_reply_msg(rdma, sctxt, &empty_pcl, &empty_pcl,
+				     &rqst->rq_snd_buf);
 	if (ret < 0)
 		return -EIO;
 
diff --git a/net/sunrpc/xprtrdma/svc_rdma_sendto.c b/net/sunrpc/xprtrdma/svc_rdma_sendto.c
index c6644cca52c5..45735f74eb86 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_sendto.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_sendto.c
@@ -653,7 +653,7 @@ static int svc_rdma_xb_count_sges(const struct xdr_buf *xdr,
  * svc_rdma_pull_up_needed - Determine whether to use pull-up
  * @rdma: controlling transport
  * @sctxt: send_ctxt for the Send WR
- * @rctxt: Write and Reply chunks provided by client
+ * @write_pcl: Write chunk list provided by client
  * @xdr: xdr_buf containing RPC message to transmit
  *
  * Returns:
@@ -662,7 +662,7 @@ static int svc_rdma_xb_count_sges(const struct xdr_buf *xdr,
  */
 static bool svc_rdma_pull_up_needed(const struct svcxprt_rdma *rdma,
 				    const struct svc_rdma_send_ctxt *sctxt,
-				    const struct svc_rdma_recv_ctxt *rctxt,
+				    const struct svc_rdma_pcl *write_pcl,
 				    const struct xdr_buf *xdr)
 {
 	/* Resources needed for the transport header */
@@ -672,7 +672,7 @@ static bool svc_rdma_pull_up_needed(const struct svcxprt_rdma *rdma,
 	};
 	int ret;
 
-	ret = pcl_process_nonpayloads(&rctxt->rc_write_pcl, xdr,
+	ret = pcl_process_nonpayloads(write_pcl, xdr,
 				      svc_rdma_xb_count_sges, &args);
 	if (ret < 0)
 		return false;
@@ -728,7 +728,7 @@ static int svc_rdma_xb_linearize(const struct xdr_buf *xdr,
  * svc_rdma_pull_up_reply_msg - Copy Reply into a single buffer
  * @rdma: controlling transport
  * @sctxt: send_ctxt for the Send WR; xprt hdr is already prepared
- * @rctxt: Write and Reply chunks provided by client
+ * @write_pcl: Write chunk list provided by client
  * @xdr: prepared xdr_buf containing RPC message
  *
  * The device is not capable of sending the reply directly.
@@ -743,7 +743,7 @@ static int svc_rdma_xb_linearize(const struct xdr_buf *xdr,
  */
 static int svc_rdma_pull_up_reply_msg(const struct svcxprt_rdma *rdma,
 				      struct svc_rdma_send_ctxt *sctxt,
-				      const struct svc_rdma_recv_ctxt *rctxt,
+				      const struct svc_rdma_pcl *write_pcl,
 				      const struct xdr_buf *xdr)
 {
 	struct svc_rdma_pullup_data args = {
@@ -751,7 +751,7 @@ static int svc_rdma_pull_up_reply_msg(const struct svcxprt_rdma *rdma,
 	};
 	int ret;
 
-	ret = pcl_process_nonpayloads(&rctxt->rc_write_pcl, xdr,
+	ret = pcl_process_nonpayloads(write_pcl, xdr,
 				      svc_rdma_xb_linearize, &args);
 	if (ret < 0)
 		return ret;
@@ -764,7 +764,8 @@ static int svc_rdma_pull_up_reply_msg(const struct svcxprt_rdma *rdma,
 /* svc_rdma_map_reply_msg - DMA map the buffer holding RPC message
  * @rdma: controlling transport
  * @sctxt: send_ctxt for the Send WR
- * @rctxt: Write and Reply chunks provided by client
+ * @write_pcl: Write chunk list provided by client
+ * @reply_pcl: Reply chunk provided by client
  * @xdr: prepared xdr_buf containing RPC message
  *
  * Returns:
@@ -776,7 +777,8 @@ static int svc_rdma_pull_up_reply_msg(const struct svcxprt_rdma *rdma,
  */
 int svc_rdma_map_reply_msg(struct svcxprt_rdma *rdma,
 			   struct svc_rdma_send_ctxt *sctxt,
-			   const struct svc_rdma_recv_ctxt *rctxt,
+			   const struct svc_rdma_pcl *write_pcl,
+			   const struct svc_rdma_pcl *reply_pcl,
 			   const struct xdr_buf *xdr)
 {
 	struct svc_rdma_map_data args = {
@@ -789,18 +791,18 @@ int svc_rdma_map_reply_msg(struct svcxprt_rdma *rdma,
 	sctxt->sc_sges[0].length = sctxt->sc_hdrbuf.len;
 
 	/* If there is a Reply chunk, nothing follows the transport
-	 * header, and we're done here.
+	 * header, so there is nothing to map.
 	 */
-	if (!pcl_is_empty(&rctxt->rc_reply_pcl))
+	if (!pcl_is_empty(reply_pcl))
 		return 0;
 
 	/* For pull-up, svc_rdma_send() will sync the transport header.
 	 * No additional DMA mapping is necessary.
 	 */
-	if (svc_rdma_pull_up_needed(rdma, sctxt, rctxt, xdr))
-		return svc_rdma_pull_up_reply_msg(rdma, sctxt, rctxt, xdr);
+	if (svc_rdma_pull_up_needed(rdma, sctxt, write_pcl, xdr))
+		return svc_rdma_pull_up_reply_msg(rdma, sctxt, write_pcl, xdr);
 
-	return pcl_process_nonpayloads(&rctxt->rc_write_pcl, xdr,
+	return pcl_process_nonpayloads(write_pcl, xdr,
 				       svc_rdma_xb_dma_map, &args);
 }
 
@@ -848,7 +850,8 @@ static int svc_rdma_send_reply_msg(struct svcxprt_rdma *rdma,
 {
 	int ret;
 
-	ret = svc_rdma_map_reply_msg(rdma, sctxt, rctxt, &rqstp->rq_res);
+	ret = svc_rdma_map_reply_msg(rdma, sctxt, &rctxt->rc_write_pcl,
+				     &rctxt->rc_reply_pcl, &rqstp->rq_res);
 	if (ret < 0)
 		return ret;
 
-- 
cgit v1.2.3


From 9c7e1a06588ee6962afe0dfe5a398e1d23212005 Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Tue, 21 Nov 2023 11:40:26 -0500
Subject: svcrdma: Add a utility workqueue to svcrdma

To handle work in the background, set up an UNBOUND workqueue for
svcrdma. Subsequent patches will make use of it.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
---
 include/linux/sunrpc/svc_rdma.h          |  1 +
 net/sunrpc/xprtrdma/svc_rdma.c           | 32 ++++++++++++++++++++++++--------
 net/sunrpc/xprtrdma/svc_rdma_transport.c |  1 +
 3 files changed, 26 insertions(+), 8 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sunrpc/svc_rdma.h b/include/linux/sunrpc/svc_rdma.h
index 4ac32895a058..e18c94e816b3 100644
--- a/include/linux/sunrpc/svc_rdma.h
+++ b/include/linux/sunrpc/svc_rdma.h
@@ -65,6 +65,7 @@ extern unsigned int svcrdma_ord;
 extern unsigned int svcrdma_max_requests;
 extern unsigned int svcrdma_max_bc_requests;
 extern unsigned int svcrdma_max_req_size;
+extern struct workqueue_struct *svcrdma_wq;
 
 extern struct percpu_counter svcrdma_stat_read;
 extern struct percpu_counter svcrdma_stat_recv;
diff --git a/net/sunrpc/xprtrdma/svc_rdma.c b/net/sunrpc/xprtrdma/svc_rdma.c
index f0d5eeed4c88..f86970733eb0 100644
--- a/net/sunrpc/xprtrdma/svc_rdma.c
+++ b/net/sunrpc/xprtrdma/svc_rdma.c
@@ -256,28 +256,44 @@ out_err:
 	return rc;
 }
 
+struct workqueue_struct *svcrdma_wq;
+
 void svc_rdma_cleanup(void)
 {
-	dprintk("SVCRDMA Module Removed, deregister RPC RDMA transport\n");
 	svc_unreg_xprt_class(&svc_rdma_class);
 	svc_rdma_proc_cleanup();
+	if (svcrdma_wq) {
+		struct workqueue_struct *wq = svcrdma_wq;
+
+		svcrdma_wq = NULL;
+		destroy_workqueue(wq);
+	}
+
+	dprintk("SVCRDMA Module Removed, deregister RPC RDMA transport\n");
 }
 
 int svc_rdma_init(void)
 {
+	struct workqueue_struct *wq;
 	int rc;
 
-	dprintk("SVCRDMA Module Init, register RPC RDMA transport\n");
-	dprintk("\tsvcrdma_ord      : %d\n", svcrdma_ord);
-	dprintk("\tmax_requests     : %u\n", svcrdma_max_requests);
-	dprintk("\tmax_bc_requests  : %u\n", svcrdma_max_bc_requests);
-	dprintk("\tmax_inline       : %d\n", svcrdma_max_req_size);
+	wq = alloc_workqueue("svcrdma", WQ_UNBOUND, 0);
+	if (!wq)
+		return -ENOMEM;
 
 	rc = svc_rdma_proc_init();
-	if (rc)
+	if (rc) {
+		destroy_workqueue(wq);
 		return rc;
+	}
 
-	/* Register RDMA with the SVC transport switch */
+	svcrdma_wq = wq;
 	svc_reg_xprt_class(&svc_rdma_class);
+
+	dprintk("SVCRDMA Module Init, register RPC RDMA transport\n");
+	dprintk("\tsvcrdma_ord      : %d\n", svcrdma_ord);
+	dprintk("\tmax_requests     : %u\n", svcrdma_max_requests);
+	dprintk("\tmax_bc_requests  : %u\n", svcrdma_max_bc_requests);
+	dprintk("\tmax_inline       : %d\n", svcrdma_max_req_size);
 	return 0;
 }
diff --git a/net/sunrpc/xprtrdma/svc_rdma_transport.c b/net/sunrpc/xprtrdma/svc_rdma_transport.c
index 2abd895046ee..c046916df007 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_transport.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_transport.c
@@ -547,6 +547,7 @@ static void __svc_rdma_free(struct work_struct *work)
 	/* This blocks until the Completion Queues are empty */
 	if (rdma->sc_qp && !IS_ERR(rdma->sc_qp))
 		ib_drain_qp(rdma->sc_qp);
+	flush_workqueue(svcrdma_wq);
 
 	svc_rdma_flush_recv_queues(rdma);
 
-- 
cgit v1.2.3


From ae225fe27b931de89b6b1e1bbe6de4de23000850 Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Tue, 21 Nov 2023 11:40:33 -0500
Subject: svcrdma: Add an async version of svc_rdma_send_ctxt_put()

DMA unmapping can take quite some time, so it should not be handled
in a single-threaded completion handler. Defer releasing send_ctxts
to the recently-added workqueue.

With this patch, DMA unmapping can be handled in parallel, and it
does not cause head-of-queue blocking of Send completions.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
---
 include/linux/sunrpc/svc_rdma.h       |  2 ++
 net/sunrpc/xprtrdma/svc_rdma_sendto.c | 34 +++++++++++++++++++++++++---------
 2 files changed, 27 insertions(+), 9 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sunrpc/svc_rdma.h b/include/linux/sunrpc/svc_rdma.h
index e18c94e816b3..ab250017b99f 100644
--- a/include/linux/sunrpc/svc_rdma.h
+++ b/include/linux/sunrpc/svc_rdma.h
@@ -152,7 +152,9 @@ struct svc_rdma_recv_ctxt {
 struct svc_rdma_send_ctxt {
 	struct llist_node	sc_node;
 	struct rpc_rdma_cid	sc_cid;
+	struct work_struct	sc_work;
 
+	struct svcxprt_rdma	*sc_rdma;
 	struct ib_send_wr	sc_send_wr;
 	struct ib_cqe		sc_cqe;
 	struct xdr_buf		sc_hdrbuf;
diff --git a/net/sunrpc/xprtrdma/svc_rdma_sendto.c b/net/sunrpc/xprtrdma/svc_rdma_sendto.c
index 45735f74eb86..22c39ba923d2 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_sendto.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_sendto.c
@@ -143,6 +143,7 @@ svc_rdma_send_ctxt_alloc(struct svcxprt_rdma *rdma)
 
 	svc_rdma_send_cid_init(rdma, &ctxt->sc_cid);
 
+	ctxt->sc_rdma = rdma;
 	ctxt->sc_send_wr.next = NULL;
 	ctxt->sc_send_wr.wr_cqe = &ctxt->sc_cqe;
 	ctxt->sc_send_wr.sg_list = ctxt->sc_sges;
@@ -223,15 +224,8 @@ out_empty:
 	goto out;
 }
 
-/**
- * svc_rdma_send_ctxt_put - Return send_ctxt to free list
- * @rdma: controlling svcxprt_rdma
- * @ctxt: object to return to the free list
- *
- * Pages left in sc_pages are DMA unmapped and released.
- */
-void svc_rdma_send_ctxt_put(struct svcxprt_rdma *rdma,
-			    struct svc_rdma_send_ctxt *ctxt)
+static void svc_rdma_send_ctxt_release(struct svcxprt_rdma *rdma,
+				       struct svc_rdma_send_ctxt *ctxt)
 {
 	struct ib_device *device = rdma->sc_cm_id->device;
 	unsigned int i;
@@ -255,6 +249,28 @@ void svc_rdma_send_ctxt_put(struct svcxprt_rdma *rdma,
 	llist_add(&ctxt->sc_node, &rdma->sc_send_ctxts);
 }
 
+static void svc_rdma_send_ctxt_put_async(struct work_struct *work)
+{
+	struct svc_rdma_send_ctxt *ctxt;
+
+	ctxt = container_of(work, struct svc_rdma_send_ctxt, sc_work);
+	svc_rdma_send_ctxt_release(ctxt->sc_rdma, ctxt);
+}
+
+/**
+ * svc_rdma_send_ctxt_put - Return send_ctxt to free list
+ * @rdma: controlling svcxprt_rdma
+ * @ctxt: object to return to the free list
+ *
+ * Pages left in sc_pages are DMA unmapped and released.
+ */
+void svc_rdma_send_ctxt_put(struct svcxprt_rdma *rdma,
+			    struct svc_rdma_send_ctxt *ctxt)
+{
+	INIT_WORK(&ctxt->sc_work, svc_rdma_send_ctxt_put_async);
+	queue_work(svcrdma_wq, &ctxt->sc_work);
+}
+
 /**
  * svc_rdma_wake_send_waiters - manage Send Queue accounting
  * @rdma: controlling transport
-- 
cgit v1.2.3


From 6a04a4349330c5476adf465159a7f49411091bbe Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Mon, 4 Dec 2023 09:57:09 -0500
Subject: svcrdma: Move struct svc_rdma_chunk_ctxt to svc_rdma.h

Prepare for nestling these into the send and recv ctxts so they
no longer have to be allocated dynamically.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
---
 include/linux/sunrpc/svc_rdma.h   | 15 +++++++++++++++
 net/sunrpc/xprtrdma/svc_rdma_rw.c | 18 ------------------
 2 files changed, 15 insertions(+), 18 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sunrpc/svc_rdma.h b/include/linux/sunrpc/svc_rdma.h
index ab250017b99f..50c4f18a9b7f 100644
--- a/include/linux/sunrpc/svc_rdma.h
+++ b/include/linux/sunrpc/svc_rdma.h
@@ -127,6 +127,21 @@ enum {
 
 #define RPCSVC_MAXPAYLOAD_RDMA	RPCSVC_MAXPAYLOAD
 
+/*
+ * A chunk context tracks all I/O for moving one Read or Write
+ * chunk. This is a set of rdma_rw's that handle data movement
+ * for all segments of one chunk.
+ */
+struct svc_rdma_chunk_ctxt {
+	struct rpc_rdma_cid	cc_cid;
+	struct ib_cqe		cc_cqe;
+	struct list_head	cc_rwctxts;
+	ktime_t			cc_posttime;
+	int			cc_sqecount;
+	enum ib_wc_status	cc_status;
+	struct completion	cc_done;
+};
+
 struct svc_rdma_recv_ctxt {
 	struct llist_node	rc_node;
 	struct list_head	rc_list;
diff --git a/net/sunrpc/xprtrdma/svc_rdma_rw.c b/net/sunrpc/xprtrdma/svc_rdma_rw.c
index cfa5973c9277..1de56e9fea91 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_rw.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_rw.c
@@ -146,24 +146,6 @@ static int svc_rdma_rw_ctx_init(struct svcxprt_rdma *rdma,
 	return ret;
 }
 
-/* A chunk context tracks all I/O for moving one Read or Write
- * chunk. This is a set of rdma_rw's that handle data movement
- * for all segments of one chunk.
- *
- * These are small, acquired with a single allocator call, and
- * no more than one is needed per chunk. They are allocated on
- * demand, and not cached.
- */
-struct svc_rdma_chunk_ctxt {
-	struct rpc_rdma_cid	cc_cid;
-	struct ib_cqe		cc_cqe;
-	struct list_head	cc_rwctxts;
-	ktime_t			cc_posttime;
-	int			cc_sqecount;
-	enum ib_wc_status	cc_status;
-	struct completion	cc_done;
-};
-
 static void svc_rdma_cc_cid_init(struct svcxprt_rdma *rdma,
 				 struct rpc_rdma_cid *cid)
 {
-- 
cgit v1.2.3


From b1818412d06fc03605d02dbdd4a7c53dc9e2d5ba Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Mon, 4 Dec 2023 09:57:16 -0500
Subject: svcrdma: Start moving fields out of struct svc_rdma_read_info

Since the request's svc_rdma_recv_ctxt will stay around for the
duration of the RDMA Read operation, the contents of struct
svc_rdma_read_info can reside in the request's svc_rdma_recv_ctxt
rather than being allocated separately. This will eventually save a
call to kmalloc() in a hot path.

Start this clean-up by moving the Read chunk's svc_rdma_chunk_ctxt.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
---
 include/linux/sunrpc/svc_rdma.h   |  4 +++
 net/sunrpc/xprtrdma/svc_rdma_rw.c | 57 ++++++++++++++++++---------------------
 2 files changed, 30 insertions(+), 31 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sunrpc/svc_rdma.h b/include/linux/sunrpc/svc_rdma.h
index 50c4f18a9b7f..6c7501ae4e29 100644
--- a/include/linux/sunrpc/svc_rdma.h
+++ b/include/linux/sunrpc/svc_rdma.h
@@ -156,6 +156,10 @@ struct svc_rdma_recv_ctxt {
 	u32			rc_inv_rkey;
 	__be32			rc_msgtype;
 
+	/* State for pulling a Read chunk */
+	unsigned int		rc_readbytes;
+	struct svc_rdma_chunk_ctxt	rc_cc;
+
 	struct svc_rdma_pcl	rc_call_pcl;
 
 	struct svc_rdma_pcl	rc_read_pcl;
diff --git a/net/sunrpc/xprtrdma/svc_rdma_rw.c b/net/sunrpc/xprtrdma/svc_rdma_rw.c
index 1de56e9fea91..a27b8f338ae5 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_rw.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_rw.c
@@ -294,9 +294,6 @@ struct svc_rdma_read_info {
 	struct svc_rdma_recv_ctxt	*ri_readctxt;
 	unsigned int			ri_pageno;
 	unsigned int			ri_pageoff;
-	unsigned int			ri_totalbytes;
-
-	struct svc_rdma_chunk_ctxt	ri_cc;
 };
 
 static struct svc_rdma_read_info *
@@ -304,20 +301,13 @@ svc_rdma_read_info_alloc(struct svcxprt_rdma *rdma)
 {
 	struct svc_rdma_read_info *info;
 
-	info = kmalloc_node(sizeof(*info), GFP_KERNEL,
+	return kmalloc_node(sizeof(*info), GFP_KERNEL,
 			    ibdev_to_node(rdma->sc_cm_id->device));
-	if (!info)
-		return info;
-
-	svc_rdma_cc_init(rdma, &info->ri_cc);
-	info->ri_cc.cc_cqe.done = svc_rdma_wc_read_done;
-	return info;
 }
 
 static void svc_rdma_read_info_free(struct svcxprt_rdma *rdma,
 				    struct svc_rdma_read_info *info)
 {
-	svc_rdma_cc_release(rdma, &info->ri_cc, DMA_FROM_DEVICE);
 	kfree(info);
 }
 
@@ -333,12 +323,12 @@ static void svc_rdma_wc_read_done(struct ib_cq *cq, struct ib_wc *wc)
 	struct ib_cqe *cqe = wc->wr_cqe;
 	struct svc_rdma_chunk_ctxt *cc =
 			container_of(cqe, struct svc_rdma_chunk_ctxt, cc_cqe);
-	struct svc_rdma_read_info *info;
+	struct svc_rdma_recv_ctxt *ctxt;
 
 	switch (wc->status) {
 	case IB_WC_SUCCESS:
-		info = container_of(cc, struct svc_rdma_read_info, ri_cc);
-		trace_svcrdma_wc_read(wc, &cc->cc_cid, info->ri_totalbytes,
+		ctxt = container_of(cc, struct svc_rdma_recv_ctxt, rc_cc);
+		trace_svcrdma_wc_read(wc, &cc->cc_cid, ctxt->rc_readbytes,
 				      cc->cc_posttime);
 		break;
 	case IB_WC_WR_FLUSH_ERR:
@@ -708,7 +698,7 @@ static int svc_rdma_build_read_segment(struct svcxprt_rdma *rdma,
 				       const struct svc_rdma_segment *segment)
 {
 	struct svc_rdma_recv_ctxt *head = info->ri_readctxt;
-	struct svc_rdma_chunk_ctxt *cc = &info->ri_cc;
+	struct svc_rdma_chunk_ctxt *cc = &head->rc_cc;
 	struct svc_rqst *rqstp = info->ri_rqst;
 	unsigned int sge_no, seg_len, len;
 	struct svc_rdma_rw_ctxt *ctxt;
@@ -778,6 +768,7 @@ static int svc_rdma_build_read_chunk(struct svcxprt_rdma *rdma,
 				     struct svc_rdma_read_info *info,
 				     const struct svc_rdma_chunk *chunk)
 {
+	struct svc_rdma_recv_ctxt *head = info->ri_readctxt;
 	const struct svc_rdma_segment *segment;
 	int ret;
 
@@ -786,7 +777,7 @@ static int svc_rdma_build_read_chunk(struct svcxprt_rdma *rdma,
 		ret = svc_rdma_build_read_segment(rdma, info, segment);
 		if (ret < 0)
 			break;
-		info->ri_totalbytes += segment->rs_length;
+		head->rc_readbytes += segment->rs_length;
 	}
 	return ret;
 }
@@ -828,7 +819,7 @@ static int svc_rdma_copy_inline_range(struct svc_rdma_read_info *info,
 		dst = page_address(rqstp->rq_pages[info->ri_pageno]);
 		memcpy(dst + info->ri_pageno, src + offset, page_len);
 
-		info->ri_totalbytes += page_len;
+		head->rc_readbytes += page_len;
 		info->ri_pageoff += page_len;
 		if (info->ri_pageoff == PAGE_SIZE) {
 			info->ri_pageno++;
@@ -883,7 +874,7 @@ static noinline int svc_rdma_read_multiple_chunks(struct svcxprt_rdma *rdma,
 			break;
 
 		start += length;
-		length = next->ch_position - info->ri_totalbytes;
+		length = next->ch_position - head->rc_readbytes;
 		ret = svc_rdma_copy_inline_range(info, start, length);
 		if (ret < 0)
 			return ret;
@@ -895,13 +886,13 @@ static noinline int svc_rdma_read_multiple_chunks(struct svcxprt_rdma *rdma,
 	if (ret < 0)
 		return ret;
 
-	buf->len += info->ri_totalbytes;
-	buf->buflen += info->ri_totalbytes;
+	buf->len += head->rc_readbytes;
+	buf->buflen += head->rc_readbytes;
 
 	buf->head[0].iov_base = page_address(info->ri_rqst->rq_pages[0]);
-	buf->head[0].iov_len = min_t(size_t, PAGE_SIZE, info->ri_totalbytes);
+	buf->head[0].iov_len = min_t(size_t, PAGE_SIZE, head->rc_readbytes);
 	buf->pages = &info->ri_rqst->rq_pages[1];
-	buf->page_len = info->ri_totalbytes - buf->head[0].iov_len;
+	buf->page_len = head->rc_readbytes - buf->head[0].iov_len;
 	return 0;
 }
 
@@ -985,6 +976,7 @@ static int svc_rdma_read_chunk_range(struct svcxprt_rdma *rdma,
 				     const struct svc_rdma_chunk *chunk,
 				     unsigned int offset, unsigned int length)
 {
+	struct svc_rdma_recv_ctxt *head = info->ri_readctxt;
 	const struct svc_rdma_segment *segment;
 	int ret;
 
@@ -1005,7 +997,7 @@ static int svc_rdma_read_chunk_range(struct svcxprt_rdma *rdma,
 		if (ret < 0)
 			break;
 
-		info->ri_totalbytes += dummy.rs_length;
+		head->rc_readbytes += dummy.rs_length;
 		length -= dummy.rs_length;
 		offset = 0;
 	}
@@ -1055,7 +1047,7 @@ static int svc_rdma_read_call_chunk(struct svcxprt_rdma *rdma,
 			break;
 
 		start += length;
-		length = next->ch_position - info->ri_totalbytes;
+		length = next->ch_position - head->rc_readbytes;
 		ret = svc_rdma_read_chunk_range(rdma, info, call_chunk,
 						start, length);
 		if (ret < 0)
@@ -1089,6 +1081,7 @@ static int svc_rdma_read_call_chunk(struct svcxprt_rdma *rdma,
 static noinline int svc_rdma_read_special(struct svcxprt_rdma *rdma,
 					  struct svc_rdma_read_info *info)
 {
+	struct svc_rdma_recv_ctxt *head = info->ri_readctxt;
 	struct xdr_buf *buf = &info->ri_rqst->rq_arg;
 	int ret;
 
@@ -1096,13 +1089,13 @@ static noinline int svc_rdma_read_special(struct svcxprt_rdma *rdma,
 	if (ret < 0)
 		goto out;
 
-	buf->len += info->ri_totalbytes;
-	buf->buflen += info->ri_totalbytes;
+	buf->len += head->rc_readbytes;
+	buf->buflen += head->rc_readbytes;
 
 	buf->head[0].iov_base = page_address(info->ri_rqst->rq_pages[0]);
-	buf->head[0].iov_len = min_t(size_t, PAGE_SIZE, info->ri_totalbytes);
+	buf->head[0].iov_len = min_t(size_t, PAGE_SIZE, head->rc_readbytes);
 	buf->pages = &info->ri_rqst->rq_pages[1];
-	buf->page_len = info->ri_totalbytes - buf->head[0].iov_len;
+	buf->page_len = head->rc_readbytes - buf->head[0].iov_len;
 
 out:
 	return ret;
@@ -1135,19 +1128,20 @@ int svc_rdma_process_read_list(struct svcxprt_rdma *rdma,
 			       struct svc_rqst *rqstp,
 			       struct svc_rdma_recv_ctxt *head)
 {
+	struct svc_rdma_chunk_ctxt *cc = &head->rc_cc;
 	struct svc_rdma_read_info *info;
-	struct svc_rdma_chunk_ctxt *cc;
 	int ret;
 
 	info = svc_rdma_read_info_alloc(rdma);
 	if (!info)
 		return -ENOMEM;
-	cc = &info->ri_cc;
 	info->ri_rqst = rqstp;
 	info->ri_readctxt = head;
 	info->ri_pageno = 0;
 	info->ri_pageoff = 0;
-	info->ri_totalbytes = 0;
+	svc_rdma_cc_init(rdma, cc);
+	cc->cc_cqe.done = svc_rdma_wc_read_done;
+	head->rc_readbytes = 0;
 
 	if (pcl_is_empty(&head->rc_call_pcl)) {
 		if (head->rc_read_pcl.cl_count == 1)
@@ -1178,6 +1172,7 @@ int svc_rdma_process_read_list(struct svcxprt_rdma *rdma,
 	head->rc_page_count = 0;
 
 out_err:
+	svc_rdma_cc_release(rdma, cc, DMA_FROM_DEVICE);
 	svc_rdma_read_info_free(rdma, info);
 	return ret;
 }
-- 
cgit v1.2.3


From 8e122582680c6f8acd686a5a2af9c0e46fe90f2d Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Mon, 4 Dec 2023 09:57:22 -0500
Subject: svcrdma: Move svc_rdma_read_info::ri_pageno to struct
 svc_rdma_recv_ctxt

Further clean up: move the page index field into svc_rdma_recv_ctxt.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
---
 include/linux/sunrpc/svc_rdma.h   |  1 +
 net/sunrpc/xprtrdma/svc_rdma_rw.c | 21 +++++++++------------
 2 files changed, 10 insertions(+), 12 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sunrpc/svc_rdma.h b/include/linux/sunrpc/svc_rdma.h
index 6c7501ae4e29..0ea66f73bec2 100644
--- a/include/linux/sunrpc/svc_rdma.h
+++ b/include/linux/sunrpc/svc_rdma.h
@@ -157,6 +157,7 @@ struct svc_rdma_recv_ctxt {
 	__be32			rc_msgtype;
 
 	/* State for pulling a Read chunk */
+	unsigned int		rc_curpage;
 	unsigned int		rc_readbytes;
 	struct svc_rdma_chunk_ctxt	rc_cc;
 
diff --git a/net/sunrpc/xprtrdma/svc_rdma_rw.c b/net/sunrpc/xprtrdma/svc_rdma_rw.c
index a27b8f338ae5..487acb192558 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_rw.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_rw.c
@@ -292,7 +292,6 @@ static void svc_rdma_write_done(struct ib_cq *cq, struct ib_wc *wc)
 struct svc_rdma_read_info {
 	struct svc_rqst			*ri_rqst;
 	struct svc_rdma_recv_ctxt	*ri_readctxt;
-	unsigned int			ri_pageno;
 	unsigned int			ri_pageoff;
 };
 
@@ -720,20 +719,18 @@ static int svc_rdma_build_read_segment(struct svcxprt_rdma *rdma,
 		if (!info->ri_pageoff)
 			head->rc_page_count++;
 
-		sg_set_page(sg, rqstp->rq_pages[info->ri_pageno],
+		sg_set_page(sg, rqstp->rq_pages[head->rc_curpage],
 			    seg_len, info->ri_pageoff);
 		sg = sg_next(sg);
 
 		info->ri_pageoff += seg_len;
 		if (info->ri_pageoff == PAGE_SIZE) {
-			info->ri_pageno++;
+			head->rc_curpage++;
 			info->ri_pageoff = 0;
 		}
 		len -= seg_len;
 
-		/* Safety check */
-		if (len &&
-		    &rqstp->rq_pages[info->ri_pageno + 1] > rqstp->rq_page_end)
+		if (len && ((head->rc_curpage + 1) > ARRAY_SIZE(rqstp->rq_pages)))
 			goto out_overrun;
 	}
 
@@ -748,7 +745,7 @@ static int svc_rdma_build_read_segment(struct svcxprt_rdma *rdma,
 	return 0;
 
 out_overrun:
-	trace_svcrdma_page_overrun_err(&cc->cc_cid, info->ri_pageno);
+	trace_svcrdma_page_overrun_err(&cc->cc_cid, head->rc_curpage);
 	return -EINVAL;
 }
 
@@ -790,7 +787,7 @@ static int svc_rdma_build_read_chunk(struct svcxprt_rdma *rdma,
  *
  * Take a page at a time from rqstp->rq_pages and copy the inline
  * content from the Receive buffer into that page. Update
- * info->ri_pageno and info->ri_pageoff so that the next RDMA Read
+ * head->rc_curpage and info->ri_pageoff so that the next RDMA Read
  * result will land contiguously with the copied content.
  *
  * Return values:
@@ -816,13 +813,13 @@ static int svc_rdma_copy_inline_range(struct svc_rdma_read_info *info,
 		if (!info->ri_pageoff)
 			head->rc_page_count++;
 
-		dst = page_address(rqstp->rq_pages[info->ri_pageno]);
-		memcpy(dst + info->ri_pageno, src + offset, page_len);
+		dst = page_address(rqstp->rq_pages[head->rc_curpage]);
+		memcpy(dst + head->rc_curpage, src + offset, page_len);
 
 		head->rc_readbytes += page_len;
 		info->ri_pageoff += page_len;
 		if (info->ri_pageoff == PAGE_SIZE) {
-			info->ri_pageno++;
+			head->rc_curpage++;
 			info->ri_pageoff = 0;
 		}
 		remaining -= page_len;
@@ -1137,10 +1134,10 @@ int svc_rdma_process_read_list(struct svcxprt_rdma *rdma,
 		return -ENOMEM;
 	info->ri_rqst = rqstp;
 	info->ri_readctxt = head;
-	info->ri_pageno = 0;
 	info->ri_pageoff = 0;
 	svc_rdma_cc_init(rdma, cc);
 	cc->cc_cqe.done = svc_rdma_wc_read_done;
+	head->rc_curpage = 0;
 	head->rc_readbytes = 0;
 
 	if (pcl_is_empty(&head->rc_call_pcl)) {
-- 
cgit v1.2.3


From 919f6e790ab6cca772fa60c6006162c0a7ebbfc5 Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Mon, 4 Dec 2023 09:57:28 -0500
Subject: svcrdma: Move read_info::ri_pageoff into struct svc_rdma_recv_ctxt

Further clean up: move the starting byte offset field into
svc_rdma_recv_ctxt.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
---
 include/linux/sunrpc/svc_rdma.h   |  1 +
 net/sunrpc/xprtrdma/svc_rdma_rw.c | 31 +++++++++++++++----------------
 2 files changed, 16 insertions(+), 16 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sunrpc/svc_rdma.h b/include/linux/sunrpc/svc_rdma.h
index 0ea66f73bec2..44a14eaf8c40 100644
--- a/include/linux/sunrpc/svc_rdma.h
+++ b/include/linux/sunrpc/svc_rdma.h
@@ -157,6 +157,7 @@ struct svc_rdma_recv_ctxt {
 	__be32			rc_msgtype;
 
 	/* State for pulling a Read chunk */
+	unsigned int		rc_pageoff;
 	unsigned int		rc_curpage;
 	unsigned int		rc_readbytes;
 	struct svc_rdma_chunk_ctxt	rc_cc;
diff --git a/net/sunrpc/xprtrdma/svc_rdma_rw.c b/net/sunrpc/xprtrdma/svc_rdma_rw.c
index 487acb192558..dbced8970779 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_rw.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_rw.c
@@ -292,7 +292,6 @@ static void svc_rdma_write_done(struct ib_cq *cq, struct ib_wc *wc)
 struct svc_rdma_read_info {
 	struct svc_rqst			*ri_rqst;
 	struct svc_rdma_recv_ctxt	*ri_readctxt;
-	unsigned int			ri_pageoff;
 };
 
 static struct svc_rdma_read_info *
@@ -705,7 +704,7 @@ static int svc_rdma_build_read_segment(struct svcxprt_rdma *rdma,
 	int ret;
 
 	len = segment->rs_length;
-	sge_no = PAGE_ALIGN(info->ri_pageoff + len) >> PAGE_SHIFT;
+	sge_no = PAGE_ALIGN(head->rc_pageoff + len) >> PAGE_SHIFT;
 	ctxt = svc_rdma_get_rw_ctxt(rdma, sge_no);
 	if (!ctxt)
 		return -ENOMEM;
@@ -714,19 +713,19 @@ static int svc_rdma_build_read_segment(struct svcxprt_rdma *rdma,
 	sg = ctxt->rw_sg_table.sgl;
 	for (sge_no = 0; sge_no < ctxt->rw_nents; sge_no++) {
 		seg_len = min_t(unsigned int, len,
-				PAGE_SIZE - info->ri_pageoff);
+				PAGE_SIZE - head->rc_pageoff);
 
-		if (!info->ri_pageoff)
+		if (!head->rc_pageoff)
 			head->rc_page_count++;
 
 		sg_set_page(sg, rqstp->rq_pages[head->rc_curpage],
-			    seg_len, info->ri_pageoff);
+			    seg_len, head->rc_pageoff);
 		sg = sg_next(sg);
 
-		info->ri_pageoff += seg_len;
-		if (info->ri_pageoff == PAGE_SIZE) {
+		head->rc_pageoff += seg_len;
+		if (head->rc_pageoff == PAGE_SIZE) {
 			head->rc_curpage++;
-			info->ri_pageoff = 0;
+			head->rc_pageoff = 0;
 		}
 		len -= seg_len;
 
@@ -787,7 +786,7 @@ static int svc_rdma_build_read_chunk(struct svcxprt_rdma *rdma,
  *
  * Take a page at a time from rqstp->rq_pages and copy the inline
  * content from the Receive buffer into that page. Update
- * head->rc_curpage and info->ri_pageoff so that the next RDMA Read
+ * head->rc_curpage and head->rc_pageoff so that the next RDMA Read
  * result will land contiguously with the copied content.
  *
  * Return values:
@@ -803,24 +802,24 @@ static int svc_rdma_copy_inline_range(struct svc_rdma_read_info *info,
 	struct svc_rqst *rqstp = info->ri_rqst;
 	unsigned int page_no, numpages;
 
-	numpages = PAGE_ALIGN(info->ri_pageoff + remaining) >> PAGE_SHIFT;
+	numpages = PAGE_ALIGN(head->rc_pageoff + remaining) >> PAGE_SHIFT;
 	for (page_no = 0; page_no < numpages; page_no++) {
 		unsigned int page_len;
 
 		page_len = min_t(unsigned int, remaining,
-				 PAGE_SIZE - info->ri_pageoff);
+				 PAGE_SIZE - head->rc_pageoff);
 
-		if (!info->ri_pageoff)
+		if (!head->rc_pageoff)
 			head->rc_page_count++;
 
 		dst = page_address(rqstp->rq_pages[head->rc_curpage]);
 		memcpy(dst + head->rc_curpage, src + offset, page_len);
 
 		head->rc_readbytes += page_len;
-		info->ri_pageoff += page_len;
-		if (info->ri_pageoff == PAGE_SIZE) {
+		head->rc_pageoff += page_len;
+		if (head->rc_pageoff == PAGE_SIZE) {
 			head->rc_curpage++;
-			info->ri_pageoff = 0;
+			head->rc_pageoff = 0;
 		}
 		remaining -= page_len;
 		offset += page_len;
@@ -1134,9 +1133,9 @@ int svc_rdma_process_read_list(struct svcxprt_rdma *rdma,
 		return -ENOMEM;
 	info->ri_rqst = rqstp;
 	info->ri_readctxt = head;
-	info->ri_pageoff = 0;
 	svc_rdma_cc_init(rdma, cc);
 	cc->cc_cqe.done = svc_rdma_wc_read_done;
+	head->rc_pageoff = 0;
 	head->rc_curpage = 0;
 	head->rc_readbytes = 0;
 
-- 
cgit v1.2.3


From fc20f19b4df4a46d1003d15d84148a117e8bdf5d Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Mon, 4 Dec 2023 09:57:35 -0500
Subject: svcrdma: Update synopsis of svc_rdma_build_read_segment()

Since the RDMA Read I/O state is now contained in the recv_ctxt,
svc_rdma_build_read_segment() can use the recv_ctxt to derive that
information rather than the other way around. This removes one usage
of the ri_readctxt field, enabling its removal in a subsequent
patch.

At the same time, the use of ri_rqst can similarly be replaced with
a passed-in function parameter.

Start with build_read_segment() because it is a common utility
function at the bottom of the Read chunk path.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
---
 include/linux/sunrpc/svc_rdma.h   |  7 +++++++
 net/sunrpc/xprtrdma/svc_rdma_rw.c | 17 +++++++++--------
 2 files changed, 16 insertions(+), 8 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sunrpc/svc_rdma.h b/include/linux/sunrpc/svc_rdma.h
index 44a14eaf8c40..f03f9909fb97 100644
--- a/include/linux/sunrpc/svc_rdma.h
+++ b/include/linux/sunrpc/svc_rdma.h
@@ -116,6 +116,13 @@ struct svcxprt_rdma {
 /* sc_flags */
 #define RDMAXPRT_CONN_PENDING	3
 
+static inline struct svcxprt_rdma *svc_rdma_rqst_rdma(struct svc_rqst *rqstp)
+{
+	struct svc_xprt *xprt = rqstp->rq_xprt;
+
+	return container_of(xprt, struct svcxprt_rdma, sc_xprt);
+}
+
 /*
  * Default connection parameters
  */
diff --git a/net/sunrpc/xprtrdma/svc_rdma_rw.c b/net/sunrpc/xprtrdma/svc_rdma_rw.c
index dbced8970779..c2d0e4bb454e 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_rw.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_rw.c
@@ -681,8 +681,8 @@ out_err:
 
 /**
  * svc_rdma_build_read_segment - Build RDMA Read WQEs to pull one RDMA segment
- * @rdma: controlling transport
- * @info: context for ongoing I/O
+ * @rqstp: RPC transaction context
+ * @head: context for ongoing I/O
  * @segment: co-ordinates of remote memory to be read
  *
  * Returns:
@@ -691,13 +691,12 @@ out_err:
  *   %-ENOMEM: allocating a local resources failed
  *   %-EIO: a DMA mapping error occurred
  */
-static int svc_rdma_build_read_segment(struct svcxprt_rdma *rdma,
-				       struct svc_rdma_read_info *info,
+static int svc_rdma_build_read_segment(struct svc_rqst *rqstp,
+				       struct svc_rdma_recv_ctxt *head,
 				       const struct svc_rdma_segment *segment)
 {
-	struct svc_rdma_recv_ctxt *head = info->ri_readctxt;
+	struct svcxprt_rdma *rdma = svc_rdma_rqst_rdma(rqstp);
 	struct svc_rdma_chunk_ctxt *cc = &head->rc_cc;
-	struct svc_rqst *rqstp = info->ri_rqst;
 	unsigned int sge_no, seg_len, len;
 	struct svc_rdma_rw_ctxt *ctxt;
 	struct scatterlist *sg;
@@ -770,7 +769,8 @@ static int svc_rdma_build_read_chunk(struct svcxprt_rdma *rdma,
 
 	ret = -EINVAL;
 	pcl_for_each_segment(segment, chunk) {
-		ret = svc_rdma_build_read_segment(rdma, info, segment);
+		ret = svc_rdma_build_read_segment(info->ri_rqst,
+						  info->ri_readctxt, segment);
 		if (ret < 0)
 			break;
 		head->rc_readbytes += segment->rs_length;
@@ -989,7 +989,8 @@ static int svc_rdma_read_chunk_range(struct svcxprt_rdma *rdma,
 		dummy.rs_length = min_t(u32, length, segment->rs_length) - offset;
 		dummy.rs_offset = segment->rs_offset + offset;
 
-		ret = svc_rdma_build_read_segment(rdma, info, &dummy);
+		ret = svc_rdma_build_read_segment(info->ri_rqst,
+						  info->ri_readctxt, &dummy);
 		if (ret < 0)
 			break;
 
-- 
cgit v1.2.3


From 018f34051bc9f4908336b3fe9e52931bb8410ced Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Mon, 4 Dec 2023 09:58:33 -0500
Subject: svcrdma: Move the svc_rdma_cc_init() call

Now that the chunk_ctxt for Reads is no longer dynamically allocated
it can be initialized once for the life of the object that contains
it (struct svc_rdma_recv_ctxt).

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
---
 include/linux/sunrpc/svc_rdma.h         |  2 ++
 net/sunrpc/xprtrdma/svc_rdma_recvfrom.c |  1 +
 net/sunrpc/xprtrdma/svc_rdma_rw.c       | 11 ++++++++---
 3 files changed, 11 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sunrpc/svc_rdma.h b/include/linux/sunrpc/svc_rdma.h
index f03f9909fb97..051fefde8d51 100644
--- a/include/linux/sunrpc/svc_rdma.h
+++ b/include/linux/sunrpc/svc_rdma.h
@@ -211,6 +211,8 @@ extern int svc_rdma_recvfrom(struct svc_rqst *);
 
 /* svc_rdma_rw.c */
 extern void svc_rdma_destroy_rw_ctxts(struct svcxprt_rdma *rdma);
+extern void svc_rdma_cc_init(struct svcxprt_rdma *rdma,
+			     struct svc_rdma_chunk_ctxt *cc);
 extern int svc_rdma_send_write_chunk(struct svcxprt_rdma *rdma,
 				     const struct svc_rdma_chunk *chunk,
 				     const struct xdr_buf *xdr);
diff --git a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
index 72374033bb2b..392a91dc8a99 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
@@ -156,6 +156,7 @@ svc_rdma_recv_ctxt_alloc(struct svcxprt_rdma *rdma)
 	ctxt->rc_recv_sge.length = rdma->sc_max_req_size;
 	ctxt->rc_recv_sge.lkey = rdma->sc_pd->local_dma_lkey;
 	ctxt->rc_recv_buf = buffer;
+	svc_rdma_cc_init(rdma, &ctxt->rc_cc);
 	return ctxt;
 
 fail2:
diff --git a/net/sunrpc/xprtrdma/svc_rdma_rw.c b/net/sunrpc/xprtrdma/svc_rdma_rw.c
index 0ccb21f1089e..4d2db06ccfd2 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_rw.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_rw.c
@@ -153,8 +153,13 @@ static void svc_rdma_cc_cid_init(struct svcxprt_rdma *rdma,
 	cid->ci_completion_id = atomic_inc_return(&rdma->sc_completion_ids);
 }
 
-static void svc_rdma_cc_init(struct svcxprt_rdma *rdma,
-			     struct svc_rdma_chunk_ctxt *cc)
+/**
+ * svc_rdma_cc_init - Initialize an svc_rdma_chunk_ctxt
+ * @rdma: controlling transport instance
+ * @cc: svc_rdma_chunk_ctxt to be initialized
+ */
+void svc_rdma_cc_init(struct svcxprt_rdma *rdma,
+		      struct svc_rdma_chunk_ctxt *cc)
 {
 	svc_rdma_cc_cid_init(rdma, &cc->cc_cid);
 
@@ -1101,8 +1106,8 @@ int svc_rdma_process_read_list(struct svcxprt_rdma *rdma,
 	struct svc_rdma_chunk_ctxt *cc = &head->rc_cc;
 	int ret;
 
-	svc_rdma_cc_init(rdma, cc);
 	cc->cc_cqe.done = svc_rdma_wc_read_done;
+	cc->cc_sqecount = 0;
 	head->rc_pageoff = 0;
 	head->rc_curpage = 0;
 	head->rc_readbytes = 0;
-- 
cgit v1.2.3


From 28ee0ec8948ac235327a1f5472fc032b308284a3 Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Mon, 11 Dec 2023 10:24:08 -0500
Subject: svcrdma: De-duplicate completion ID initialization helpers

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
---
 include/linux/sunrpc/svc_rdma.h         | 24 ++++++++++++++++++++++++
 net/sunrpc/xprtrdma/svc_rdma_recvfrom.c |  7 -------
 net/sunrpc/xprtrdma/svc_rdma_rw.c       |  9 +--------
 net/sunrpc/xprtrdma/svc_rdma_sendto.c   |  7 -------
 4 files changed, 25 insertions(+), 22 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sunrpc/svc_rdma.h b/include/linux/sunrpc/svc_rdma.h
index 051fefde8d51..46f2ce9f810b 100644
--- a/include/linux/sunrpc/svc_rdma.h
+++ b/include/linux/sunrpc/svc_rdma.h
@@ -134,6 +134,30 @@ enum {
 
 #define RPCSVC_MAXPAYLOAD_RDMA	RPCSVC_MAXPAYLOAD
 
+/**
+ * svc_rdma_send_cid_init - Initialize a Receive Queue completion ID
+ * @rdma: controlling transport
+ * @cid: completion ID to initialize
+ */
+static inline void svc_rdma_recv_cid_init(struct svcxprt_rdma *rdma,
+					  struct rpc_rdma_cid *cid)
+{
+	cid->ci_queue_id = rdma->sc_rq_cq->res.id;
+	cid->ci_completion_id = atomic_inc_return(&rdma->sc_completion_ids);
+}
+
+/**
+ * svc_rdma_send_cid_init - Initialize a Send Queue completion ID
+ * @rdma: controlling transport
+ * @cid: completion ID to initialize
+ */
+static inline void svc_rdma_send_cid_init(struct svcxprt_rdma *rdma,
+					  struct rpc_rdma_cid *cid)
+{
+	cid->ci_queue_id = rdma->sc_sq_cq->res.id;
+	cid->ci_completion_id = atomic_inc_return(&rdma->sc_completion_ids);
+}
+
 /*
  * A chunk context tracks all I/O for moving one Read or Write
  * chunk. This is a set of rdma_rw's that handle data movement
diff --git a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
index 392a91dc8a99..ac6351e292c5 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
@@ -115,13 +115,6 @@ svc_rdma_next_recv_ctxt(struct list_head *list)
 					rc_list);
 }
 
-static void svc_rdma_recv_cid_init(struct svcxprt_rdma *rdma,
-				   struct rpc_rdma_cid *cid)
-{
-	cid->ci_queue_id = rdma->sc_rq_cq->res.id;
-	cid->ci_completion_id = atomic_inc_return(&rdma->sc_completion_ids);
-}
-
 static struct svc_rdma_recv_ctxt *
 svc_rdma_recv_ctxt_alloc(struct svcxprt_rdma *rdma)
 {
diff --git a/net/sunrpc/xprtrdma/svc_rdma_rw.c b/net/sunrpc/xprtrdma/svc_rdma_rw.c
index 4d2db06ccfd2..eab71f3867fa 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_rw.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_rw.c
@@ -146,13 +146,6 @@ static int svc_rdma_rw_ctx_init(struct svcxprt_rdma *rdma,
 	return ret;
 }
 
-static void svc_rdma_cc_cid_init(struct svcxprt_rdma *rdma,
-				 struct rpc_rdma_cid *cid)
-{
-	cid->ci_queue_id = rdma->sc_sq_cq->res.id;
-	cid->ci_completion_id = atomic_inc_return(&rdma->sc_completion_ids);
-}
-
 /**
  * svc_rdma_cc_init - Initialize an svc_rdma_chunk_ctxt
  * @rdma: controlling transport instance
@@ -161,7 +154,7 @@ static void svc_rdma_cc_cid_init(struct svcxprt_rdma *rdma,
 void svc_rdma_cc_init(struct svcxprt_rdma *rdma,
 		      struct svc_rdma_chunk_ctxt *cc)
 {
-	svc_rdma_cc_cid_init(rdma, &cc->cc_cid);
+	svc_rdma_send_cid_init(rdma, &cc->cc_cid);
 
 	INIT_LIST_HEAD(&cc->cc_rwctxts);
 	cc->cc_sqecount = 0;
diff --git a/net/sunrpc/xprtrdma/svc_rdma_sendto.c b/net/sunrpc/xprtrdma/svc_rdma_sendto.c
index 9571ed4a74d4..c9585e469ca8 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_sendto.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_sendto.c
@@ -113,13 +113,6 @@
 
 static void svc_rdma_wc_send(struct ib_cq *cq, struct ib_wc *wc);
 
-static void svc_rdma_send_cid_init(struct svcxprt_rdma *rdma,
-				   struct rpc_rdma_cid *cid)
-{
-	cid->ci_queue_id = rdma->sc_sq_cq->res.id;
-	cid->ci_completion_id = atomic_inc_return(&rdma->sc_completion_ids);
-}
-
 static struct svc_rdma_send_ctxt *
 svc_rdma_send_ctxt_alloc(struct svcxprt_rdma *rdma)
 {
-- 
cgit v1.2.3


From 4d9d69db898d05bd063548eee65d16a020676fec Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Mon, 18 Dec 2023 17:31:48 -0500
Subject: svcrdma: Add back svc_rdma_recv_ctxt::rc_pages

Having an nfsd thread waiting for an RDMA Read completion is
problematic if the Read responder (the client) stops responding. We
need to go back to handling RDMA Reads by allowing the nfsd thread
to return to the svc scheduler, then waking a second thread finish
the RPC message once the Read completion fires.

To start with, restore the rc_pages field so that RDMA Read pages
can be managed across calls to svc_rdma_recvfrom().

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
---
 include/linux/sunrpc/svc_rdma.h         | 4 +++-
 net/sunrpc/xprtrdma/svc_rdma_recvfrom.c | 5 +++++
 net/sunrpc/xprtrdma/svc_rdma_rw.c       | 4 +++-
 3 files changed, 11 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sunrpc/svc_rdma.h b/include/linux/sunrpc/svc_rdma.h
index 46f2ce9f810b..0f2d7f68ef5d 100644
--- a/include/linux/sunrpc/svc_rdma.h
+++ b/include/linux/sunrpc/svc_rdma.h
@@ -183,7 +183,6 @@ struct svc_rdma_recv_ctxt {
 	void			*rc_recv_buf;
 	struct xdr_stream	rc_stream;
 	u32			rc_byte_len;
-	unsigned int		rc_page_count;
 	u32			rc_inv_rkey;
 	__be32			rc_msgtype;
 
@@ -199,6 +198,9 @@ struct svc_rdma_recv_ctxt {
 	struct svc_rdma_chunk	*rc_cur_result_payload;
 	struct svc_rdma_pcl	rc_write_pcl;
 	struct svc_rdma_pcl	rc_reply_pcl;
+
+	unsigned int		rc_page_count;
+	struct page		*rc_pages[RPCSVC_MAXPAGES];
 };
 
 struct svc_rdma_send_ctxt {
diff --git a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
index 38f01652dc6d..e363cb1bdbc4 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
@@ -214,6 +214,11 @@ struct svc_rdma_recv_ctxt *svc_rdma_recv_ctxt_get(struct svcxprt_rdma *rdma)
 void svc_rdma_recv_ctxt_put(struct svcxprt_rdma *rdma,
 			    struct svc_rdma_recv_ctxt *ctxt)
 {
+	/* @rc_page_count is normally zero here, but error flows
+	 * can leave pages in @rc_pages.
+	 */
+	release_pages(ctxt->rc_pages, ctxt->rc_page_count);
+
 	pcl_free(&ctxt->rc_call_pcl);
 	pcl_free(&ctxt->rc_read_pcl);
 	pcl_free(&ctxt->rc_write_pcl);
diff --git a/net/sunrpc/xprtrdma/svc_rdma_rw.c b/net/sunrpc/xprtrdma/svc_rdma_rw.c
index ff54bb268b7d..28a34718dee5 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_rw.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_rw.c
@@ -1131,7 +1131,9 @@ int svc_rdma_process_read_list(struct svcxprt_rdma *rdma,
 	rqstp->rq_respages = &rqstp->rq_pages[head->rc_page_count];
 	rqstp->rq_next_page = rqstp->rq_respages + 1;
 
-	/* Ensure svc_rdma_recv_ctxt_put() does not try to release pages */
+	/* Ensure svc_rdma_recv_ctxt_put() does not release pages
+	 * left in @rc_pages while I/O proceeds.
+	 */
 	head->rc_page_count = 0;
 
 out_err:
-- 
cgit v1.2.3


From a937693a82fd2211c5e52b638959d1486a77d16a Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Mon, 18 Dec 2023 17:31:54 -0500
Subject: svcrdma: Add back svcxprt_rdma::sc_read_complete_q

Having an nfsd thread waiting for an RDMA Read completion is
problematic if the Read responder (ie, the client) stops responding.
We need to go back to handling RDMA Reads by allowing the nfsd
thread to return to the svc scheduler, then waking a second thread
finish the RPC message once the Read completion fires.

As a next step, add a list_head upon which completed Reads are queued.
A subsequent patch will make use of this queue.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
---
 include/linux/sunrpc/svc_rdma.h          |  1 +
 net/sunrpc/xprtrdma/svc_rdma_recvfrom.c  | 37 +++++++++++++++++++++++++++++++-
 net/sunrpc/xprtrdma/svc_rdma_transport.c |  1 +
 3 files changed, 38 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/sunrpc/svc_rdma.h b/include/linux/sunrpc/svc_rdma.h
index 0f2d7f68ef5d..c98d29e51b9c 100644
--- a/include/linux/sunrpc/svc_rdma.h
+++ b/include/linux/sunrpc/svc_rdma.h
@@ -98,6 +98,7 @@ struct svcxprt_rdma {
 	u32		     sc_pending_recvs;
 	u32		     sc_recv_batch;
 	struct list_head     sc_rq_dto_q;
+	struct list_head     sc_read_complete_q;
 	spinlock_t	     sc_rq_dto_lock;
 	struct ib_qp         *sc_qp;
 	struct ib_cq         *sc_rq_cq;
diff --git a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
index e363cb1bdbc4..2de947183a7a 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
@@ -382,6 +382,10 @@ void svc_rdma_flush_recv_queues(struct svcxprt_rdma *rdma)
 {
 	struct svc_rdma_recv_ctxt *ctxt;
 
+	while ((ctxt = svc_rdma_next_recv_ctxt(&rdma->sc_read_complete_q))) {
+		list_del(&ctxt->rc_list);
+		svc_rdma_recv_ctxt_put(rdma, ctxt);
+	}
 	while ((ctxt = svc_rdma_next_recv_ctxt(&rdma->sc_rq_dto_q))) {
 		list_del(&ctxt->rc_list);
 		svc_rdma_recv_ctxt_put(rdma, ctxt);
@@ -763,6 +767,30 @@ static bool svc_rdma_is_reverse_direction_reply(struct svc_xprt *xprt,
 	return true;
 }
 
+static noinline void svc_rdma_read_complete(struct svc_rqst *rqstp,
+					    struct svc_rdma_recv_ctxt *ctxt)
+{
+	int i;
+
+	/* Transfer the Read chunk pages into @rqstp.rq_pages, replacing
+	 * the rq_pages that were already allocated for this rqstp.
+	 */
+	release_pages(rqstp->rq_respages, ctxt->rc_page_count);
+	for (i = 0; i < ctxt->rc_page_count; i++)
+		rqstp->rq_pages[i] = ctxt->rc_pages[i];
+
+	/* Update @rqstp's result send buffer to start after the
+	 * last page in the RDMA Read payload.
+	 */
+	rqstp->rq_respages = &rqstp->rq_pages[ctxt->rc_page_count];
+	rqstp->rq_next_page = rqstp->rq_respages + 1;
+
+	/* Prevent svc_rdma_recv_ctxt_put() from releasing the
+	 * pages in ctxt::rc_pages a second time.
+	 */
+	ctxt->rc_page_count = 0;
+}
+
 /**
  * svc_rdma_recvfrom - Receive an RPC call
  * @rqstp: request structure into which to receive an RPC Call
@@ -807,8 +835,14 @@ int svc_rdma_recvfrom(struct svc_rqst *rqstp)
 
 	rqstp->rq_xprt_ctxt = NULL;
 
-	ctxt = NULL;
 	spin_lock(&rdma_xprt->sc_rq_dto_lock);
+	ctxt = svc_rdma_next_recv_ctxt(&rdma_xprt->sc_read_complete_q);
+	if (ctxt) {
+		list_del(&ctxt->rc_list);
+		spin_unlock_bh(&rdma_xprt->sc_rq_dto_lock);
+		svc_rdma_read_complete(rqstp, ctxt);
+		goto complete;
+	}
 	ctxt = svc_rdma_next_recv_ctxt(&rdma_xprt->sc_rq_dto_q);
 	if (ctxt)
 		list_del(&ctxt->rc_list);
@@ -846,6 +880,7 @@ int svc_rdma_recvfrom(struct svc_rqst *rqstp)
 			goto out_readfail;
 	}
 
+complete:
 	rqstp->rq_xprt_ctxt = ctxt;
 	rqstp->rq_prot = IPPROTO_MAX;
 	svc_xprt_copy_addrs(rqstp, xprt);
diff --git a/net/sunrpc/xprtrdma/svc_rdma_transport.c b/net/sunrpc/xprtrdma/svc_rdma_transport.c
index 8127c711fa3b..4f27325ace4a 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_transport.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_transport.c
@@ -137,6 +137,7 @@ static struct svcxprt_rdma *svc_rdma_create_xprt(struct svc_serv *serv,
 	svc_xprt_init(net, &svc_rdma_class, &cma_xprt->sc_xprt, serv);
 	INIT_LIST_HEAD(&cma_xprt->sc_accept_q);
 	INIT_LIST_HEAD(&cma_xprt->sc_rq_dto_q);
+	INIT_LIST_HEAD(&cma_xprt->sc_read_complete_q);
 	init_llist_head(&cma_xprt->sc_send_ctxts);
 	init_llist_head(&cma_xprt->sc_recv_ctxts);
 	init_llist_head(&cma_xprt->sc_rw_ctxts);
-- 
cgit v1.2.3


From d3dba534100d4e9eb7a5204be97cd6f9ada2066e Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Mon, 18 Dec 2023 17:32:07 -0500
Subject: svcrdma: Implement multi-stage Read completion again

Having an nfsd thread waiting for an RDMA Read completion is
problematic if the Read responder (ie, the client) stops responding.
We need to go back to handling RDMA Reads by getting the svc scheduler
to call svc_rdma_recvfrom() a second time to finish building an RPC
message after a Read completion.

This is the final patch, and makes several changes that have to
happen concurrently:

1. svc_rdma_process_read_list no longer waits for a completion, but
   simply builds and posts the Read WRs.

2. svc_rdma_read_done() now queues a completed Read on
   sc_read_complete_q for later processing rather than calling
   complete().

3. The completed RPC message is no longer built in the
   svc_rdma_process_read_list() path. Finishing the message is now
   done in svc_rdma_recvfrom() when it notices work on the
   sc_read_complete_q. The "finish building this RPC message" code
   is removed from the svc_rdma_process_read_list() path.

This arrangement avoids the need for an nfsd thread to wait for an
RDMA Read non-interruptibly without a timeout. It's basically the
same code structure that Tom Tucker used for Read chunks along with
some clean-up and modernization.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
---
 include/linux/sunrpc/svc_rdma.h         |   6 +-
 net/sunrpc/xprtrdma/svc_rdma_recvfrom.c |  36 +++++---
 net/sunrpc/xprtrdma/svc_rdma_rw.c       | 151 +++++++++++---------------------
 3 files changed, 80 insertions(+), 113 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sunrpc/svc_rdma.h b/include/linux/sunrpc/svc_rdma.h
index c98d29e51b9c..e7595ae62fe2 100644
--- a/include/linux/sunrpc/svc_rdma.h
+++ b/include/linux/sunrpc/svc_rdma.h
@@ -170,8 +170,6 @@ struct svc_rdma_chunk_ctxt {
 	struct list_head	cc_rwctxts;
 	ktime_t			cc_posttime;
 	int			cc_sqecount;
-	enum ib_wc_status	cc_status;
-	struct completion	cc_done;
 };
 
 struct svc_rdma_recv_ctxt {
@@ -191,6 +189,7 @@ struct svc_rdma_recv_ctxt {
 	unsigned int		rc_pageoff;
 	unsigned int		rc_curpage;
 	unsigned int		rc_readbytes;
+	struct xdr_buf		rc_saved_arg;
 	struct svc_rdma_chunk_ctxt	rc_cc;
 
 	struct svc_rdma_pcl	rc_call_pcl;
@@ -240,6 +239,9 @@ extern int svc_rdma_recvfrom(struct svc_rqst *);
 extern void svc_rdma_destroy_rw_ctxts(struct svcxprt_rdma *rdma);
 extern void svc_rdma_cc_init(struct svcxprt_rdma *rdma,
 			     struct svc_rdma_chunk_ctxt *cc);
+extern void svc_rdma_cc_release(struct svcxprt_rdma *rdma,
+				struct svc_rdma_chunk_ctxt *cc,
+				enum dma_data_direction dir);
 extern int svc_rdma_send_write_chunk(struct svcxprt_rdma *rdma,
 				     const struct svc_rdma_chunk *chunk,
 				     const struct xdr_buf *xdr);
diff --git a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
index 034bdd02f925..d72953f29258 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
@@ -214,6 +214,8 @@ struct svc_rdma_recv_ctxt *svc_rdma_recv_ctxt_get(struct svcxprt_rdma *rdma)
 void svc_rdma_recv_ctxt_put(struct svcxprt_rdma *rdma,
 			    struct svc_rdma_recv_ctxt *ctxt)
 {
+	svc_rdma_cc_release(rdma, &ctxt->rc_cc, DMA_FROM_DEVICE);
+
 	/* @rc_page_count is normally zero here, but error flows
 	 * can leave pages in @rc_pages.
 	 */
@@ -870,6 +872,7 @@ static noinline void svc_rdma_read_complete(struct svc_rqst *rqstp,
 	 * procedure for that depends on what kind of RPC/RDMA
 	 * chunks were provided by the client.
 	 */
+	rqstp->rq_arg = ctxt->rc_saved_arg;
 	if (pcl_is_empty(&ctxt->rc_call_pcl)) {
 		if (ctxt->rc_read_pcl.cl_count == 1)
 			svc_rdma_read_complete_one(rqstp, ctxt);
@@ -930,7 +933,8 @@ int svc_rdma_recvfrom(struct svc_rqst *rqstp)
 	ctxt = svc_rdma_next_recv_ctxt(&rdma_xprt->sc_read_complete_q);
 	if (ctxt) {
 		list_del(&ctxt->rc_list);
-		spin_unlock_bh(&rdma_xprt->sc_rq_dto_lock);
+		spin_unlock(&rdma_xprt->sc_rq_dto_lock);
+		svc_xprt_received(xprt);
 		svc_rdma_read_complete(rqstp, ctxt);
 		goto complete;
 	}
@@ -965,11 +969,8 @@ int svc_rdma_recvfrom(struct svc_rqst *rqstp)
 	svc_rdma_get_inv_rkey(rdma_xprt, ctxt);
 
 	if (!pcl_is_empty(&ctxt->rc_read_pcl) ||
-	    !pcl_is_empty(&ctxt->rc_call_pcl)) {
-		ret = svc_rdma_process_read_list(rdma_xprt, rqstp, ctxt);
-		if (ret < 0)
-			goto out_readfail;
-	}
+	    !pcl_is_empty(&ctxt->rc_call_pcl))
+		goto out_readlist;
 
 complete:
 	rqstp->rq_xprt_ctxt = ctxt;
@@ -983,12 +984,23 @@ out_err:
 	svc_rdma_recv_ctxt_put(rdma_xprt, ctxt);
 	return 0;
 
-out_readfail:
-	if (ret == -EINVAL)
-		svc_rdma_send_error(rdma_xprt, ctxt, ret);
-	svc_rdma_recv_ctxt_put(rdma_xprt, ctxt);
-	svc_xprt_deferred_close(xprt);
-	return -ENOTCONN;
+out_readlist:
+	/* This @rqstp is about to be recycled. Save the work
+	 * already done constructing the Call message in rq_arg
+	 * so it can be restored when the RDMA Reads have
+	 * completed.
+	 */
+	ctxt->rc_saved_arg = rqstp->rq_arg;
+
+	ret = svc_rdma_process_read_list(rdma_xprt, rqstp, ctxt);
+	if (ret < 0) {
+		if (ret == -EINVAL)
+			svc_rdma_send_error(rdma_xprt, ctxt, ret);
+		svc_rdma_recv_ctxt_put(rdma_xprt, ctxt);
+		svc_xprt_deferred_close(xprt);
+		return ret;
+	}
+	return 0;
 
 out_backchannel:
 	svc_rdma_handle_bc_reply(rqstp, ctxt);
diff --git a/net/sunrpc/xprtrdma/svc_rdma_rw.c b/net/sunrpc/xprtrdma/svc_rdma_rw.c
index 28a34718dee5..c00fcce61d1e 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_rw.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_rw.c
@@ -163,14 +163,15 @@ void svc_rdma_cc_init(struct svcxprt_rdma *rdma,
 	cc->cc_sqecount = 0;
 }
 
-/*
- * The consumed rw_ctx's are cleaned and placed on a local llist so
- * that only one atomic llist operation is needed to put them all
- * back on the free list.
+/**
+ * svc_rdma_cc_release - Release resources held by a svc_rdma_chunk_ctxt
+ * @rdma: controlling transport instance
+ * @cc: svc_rdma_chunk_ctxt to be released
+ * @dir: DMA direction
  */
-static void svc_rdma_cc_release(struct svcxprt_rdma *rdma,
-				struct svc_rdma_chunk_ctxt *cc,
-				enum dma_data_direction dir)
+void svc_rdma_cc_release(struct svcxprt_rdma *rdma,
+			 struct svc_rdma_chunk_ctxt *cc,
+			 enum dma_data_direction dir)
 {
 	struct llist_node *first, *last;
 	struct svc_rdma_rw_ctxt *ctxt;
@@ -300,12 +301,21 @@ static void svc_rdma_wc_read_done(struct ib_cq *cq, struct ib_wc *wc)
 			container_of(cqe, struct svc_rdma_chunk_ctxt, cc_cqe);
 	struct svc_rdma_recv_ctxt *ctxt;
 
+	svc_rdma_wake_send_waiters(rdma, cc->cc_sqecount);
+
+	ctxt = container_of(cc, struct svc_rdma_recv_ctxt, rc_cc);
 	switch (wc->status) {
 	case IB_WC_SUCCESS:
-		ctxt = container_of(cc, struct svc_rdma_recv_ctxt, rc_cc);
 		trace_svcrdma_wc_read(wc, &cc->cc_cid, ctxt->rc_readbytes,
 				      cc->cc_posttime);
-		break;
+
+		spin_lock(&rdma->sc_rq_dto_lock);
+		list_add_tail(&ctxt->rc_list, &rdma->sc_read_complete_q);
+		/* the unlock pairs with the smp_rmb in svc_xprt_ready */
+		set_bit(XPT_DATA, &rdma->sc_xprt.xpt_flags);
+		spin_unlock(&rdma->sc_rq_dto_lock);
+		svc_xprt_enqueue(&rdma->sc_xprt);
+		return;
 	case IB_WC_WR_FLUSH_ERR:
 		trace_svcrdma_wc_read_flush(wc, &cc->cc_cid);
 		break;
@@ -313,10 +323,13 @@ static void svc_rdma_wc_read_done(struct ib_cq *cq, struct ib_wc *wc)
 		trace_svcrdma_wc_read_err(wc, &cc->cc_cid);
 	}
 
-	svc_rdma_wake_send_waiters(rdma, cc->cc_sqecount);
-	cc->cc_status = wc->status;
-	complete(&cc->cc_done);
-	return;
+	/* The RDMA Read has flushed, so the incoming RPC message
+	 * cannot be constructed and must be dropped. Signal the
+	 * loss to the client by closing the connection.
+	 */
+	svc_rdma_cc_release(rdma, cc, DMA_FROM_DEVICE);
+	svc_rdma_recv_ctxt_put(rdma, ctxt);
+	svc_xprt_deferred_close(&rdma->sc_xprt);
 }
 
 /*
@@ -823,7 +836,6 @@ svc_rdma_read_multiple_chunks(struct svc_rqst *rqstp,
 			      struct svc_rdma_recv_ctxt *head)
 {
 	const struct svc_rdma_pcl *pcl = &head->rc_read_pcl;
-	struct xdr_buf *buf = &rqstp->rq_arg;
 	struct svc_rdma_chunk *chunk, *next;
 	unsigned int start, length;
 	int ret;
@@ -853,18 +865,7 @@ svc_rdma_read_multiple_chunks(struct svc_rqst *rqstp,
 
 	start += length;
 	length = head->rc_byte_len - start;
-	ret = svc_rdma_copy_inline_range(rqstp, head, start, length);
-	if (ret < 0)
-		return ret;
-
-	buf->len += head->rc_readbytes;
-	buf->buflen += head->rc_readbytes;
-
-	buf->head[0].iov_base = page_address(rqstp->rq_pages[0]);
-	buf->head[0].iov_len = min_t(size_t, PAGE_SIZE, head->rc_readbytes);
-	buf->pages = &rqstp->rq_pages[1];
-	buf->page_len = head->rc_readbytes - buf->head[0].iov_len;
-	return 0;
+	return svc_rdma_copy_inline_range(rqstp, head, start, length);
 }
 
 /**
@@ -888,42 +889,8 @@ svc_rdma_read_multiple_chunks(struct svc_rqst *rqstp,
 static int svc_rdma_read_data_item(struct svc_rqst *rqstp,
 				   struct svc_rdma_recv_ctxt *head)
 {
-	struct xdr_buf *buf = &rqstp->rq_arg;
-	struct svc_rdma_chunk *chunk;
-	unsigned int length;
-	int ret;
-
-	chunk = pcl_first_chunk(&head->rc_read_pcl);
-	ret = svc_rdma_build_read_chunk(rqstp, head, chunk);
-	if (ret < 0)
-		goto out;
-
-	/* Split the Receive buffer between the head and tail
-	 * buffers at Read chunk's position. XDR roundup of the
-	 * chunk is not included in either the pagelist or in
-	 * the tail.
-	 */
-	buf->tail[0].iov_base = buf->head[0].iov_base + chunk->ch_position;
-	buf->tail[0].iov_len = buf->head[0].iov_len - chunk->ch_position;
-	buf->head[0].iov_len = chunk->ch_position;
-
-	/* Read chunk may need XDR roundup (see RFC 8166, s. 3.4.5.2).
-	 *
-	 * If the client already rounded up the chunk length, the
-	 * length does not change. Otherwise, the length of the page
-	 * list is increased to include XDR round-up.
-	 *
-	 * Currently these chunks always start at page offset 0,
-	 * thus the rounded-up length never crosses a page boundary.
-	 */
-	buf->pages = &rqstp->rq_pages[0];
-	length = xdr_align_size(chunk->ch_length);
-	buf->page_len = length;
-	buf->len += length;
-	buf->buflen += length;
-
-out:
-	return ret;
+	return svc_rdma_build_read_chunk(rqstp, head,
+					 pcl_first_chunk(&head->rc_read_pcl));
 }
 
 /**
@@ -1051,23 +1018,28 @@ static int svc_rdma_read_call_chunk(struct svc_rqst *rqstp,
 static noinline int svc_rdma_read_special(struct svc_rqst *rqstp,
 					  struct svc_rdma_recv_ctxt *head)
 {
-	struct xdr_buf *buf = &rqstp->rq_arg;
-	int ret;
-
-	ret = svc_rdma_read_call_chunk(rqstp, head);
-	if (ret < 0)
-		goto out;
-
-	buf->len += head->rc_readbytes;
-	buf->buflen += head->rc_readbytes;
+	return svc_rdma_read_call_chunk(rqstp, head);
+}
 
-	buf->head[0].iov_base = page_address(rqstp->rq_pages[0]);
-	buf->head[0].iov_len = min_t(size_t, PAGE_SIZE, head->rc_readbytes);
-	buf->pages = &rqstp->rq_pages[1];
-	buf->page_len = head->rc_readbytes - buf->head[0].iov_len;
+/* Pages under I/O have been copied to head->rc_pages. Ensure that
+ * svc_xprt_release() does not put them when svc_rdma_recvfrom()
+ * returns. This has to be done after all Read WRs are constructed
+ * to properly handle a page that happens to be part of I/O on behalf
+ * of two different RDMA segments.
+ *
+ * Note: if the subsequent post_send fails, these pages have already
+ * been moved to head->rc_pages and thus will be cleaned up by
+ * svc_rdma_recv_ctxt_put().
+ */
+static void svc_rdma_clear_rqst_pages(struct svc_rqst *rqstp,
+				      struct svc_rdma_recv_ctxt *head)
+{
+	unsigned int i;
 
-out:
-	return ret;
+	for (i = 0; i < head->rc_page_count; i++) {
+		head->rc_pages[i] = rqstp->rq_pages[i];
+		rqstp->rq_pages[i] = NULL;
+	}
 }
 
 /**
@@ -1113,30 +1085,11 @@ int svc_rdma_process_read_list(struct svcxprt_rdma *rdma,
 			ret = svc_rdma_read_multiple_chunks(rqstp, head);
 	} else
 		ret = svc_rdma_read_special(rqstp, head);
+	svc_rdma_clear_rqst_pages(rqstp, head);
 	if (ret < 0)
-		goto out_err;
+		return ret;
 
 	trace_svcrdma_post_read_chunk(&cc->cc_cid, cc->cc_sqecount);
-	init_completion(&cc->cc_done);
 	ret = svc_rdma_post_chunk_ctxt(rdma, cc);
-	if (ret < 0)
-		goto out_err;
-
-	ret = 1;
-	wait_for_completion(&cc->cc_done);
-	if (cc->cc_status != IB_WC_SUCCESS)
-		ret = -EIO;
-
-	/* rq_respages starts after the last arg page */
-	rqstp->rq_respages = &rqstp->rq_pages[head->rc_page_count];
-	rqstp->rq_next_page = rqstp->rq_respages + 1;
-
-	/* Ensure svc_rdma_recv_ctxt_put() does not release pages
-	 * left in @rc_pages while I/O proceeds.
-	 */
-	head->rc_page_count = 0;
-
-out_err:
-	svc_rdma_cc_release(rdma, cc, DMA_FROM_DEVICE);
-	return ret;
+	return ret < 0 ? ret : 1;
 }
-- 
cgit v1.2.3


From 7b207ccd983350a5dedd132b57c666186dd02a7c Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Fri, 15 Dec 2023 11:56:32 +1100
Subject: svc: don't hold reference for poolstats, only mutex.

A future patch will remove refcounting on svc_serv as it is of little
use.
It is currently used to keep the svc around while the pool_stats file is
open.
Change this to get the pointer, protected by the mutex, only in
seq_start, and the release the mutex in seq_stop.
This means that if the nfsd server is stopped and restarted while the
pool_stats file it open, then some pool stats info could be from the
first instance and some from the second.  This might appear odd, but is
unlikely to be a problem in practice.

Signed-off-by: NeilBrown <neilb@suse.de>
Reviewed-by: Jeff Layton <jlayton@kernel.org>
Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
---
 fs/nfsd/netns.h            |  4 +++-
 fs/nfsd/nfsctl.c           |  2 +-
 fs/nfsd/nfssvc.c           | 24 ++----------------------
 include/linux/sunrpc/svc.h |  8 +++++++-
 net/sunrpc/svc_xprt.c      | 32 +++++++++++++++++++++++---------
 5 files changed, 36 insertions(+), 34 deletions(-)

(limited to 'include/linux')

diff --git a/fs/nfsd/netns.h b/fs/nfsd/netns.h
index ab303a8b77d5..16dbef245dbb 100644
--- a/fs/nfsd/netns.h
+++ b/fs/nfsd/netns.h
@@ -123,7 +123,9 @@ struct nfsd_net {
 	u32 clientid_counter;
 	u32 clverifier_counter;
 
-	struct svc_serv *nfsd_serv;
+	struct svc_info nfsd_info;
+#define nfsd_serv nfsd_info.serv
+
 	/* When a listening socket is added to nfsd, keep_active is set
 	 * and this justifies a reference on nfsd_serv.  This stops
 	 * nfsd_serv from being freed.  When the number of threads is
diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c
index 6a3b385703cc..46a001e81b55 100644
--- a/fs/nfsd/nfsctl.c
+++ b/fs/nfsd/nfsctl.c
@@ -183,7 +183,7 @@ static const struct file_operations pool_stats_operations = {
 	.open		= nfsd_pool_stats_open,
 	.read		= seq_read,
 	.llseek		= seq_lseek,
-	.release	= nfsd_pool_stats_release,
+	.release	= seq_release,
 };
 
 DEFINE_SHOW_ATTRIBUTE(nfsd_reply_cache_stats);
diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c
index ee835bf9ee42..365968737923 100644
--- a/fs/nfsd/nfssvc.c
+++ b/fs/nfsd/nfssvc.c
@@ -684,6 +684,7 @@ int nfsd_create_serv(struct net *net)
 		return error;
 	}
 	spin_lock(&nfsd_notifier_lock);
+	nn->nfsd_info.mutex = &nfsd_mutex;
 	nn->nfsd_serv = serv;
 	spin_unlock(&nfsd_notifier_lock);
 
@@ -1082,28 +1083,7 @@ bool nfssvc_encode_voidres(struct svc_rqst *rqstp, struct xdr_stream *xdr)
 
 int nfsd_pool_stats_open(struct inode *inode, struct file *file)
 {
-	int ret;
 	struct nfsd_net *nn = net_generic(inode->i_sb->s_fs_info, nfsd_net_id);
 
-	mutex_lock(&nfsd_mutex);
-	if (nn->nfsd_serv == NULL) {
-		mutex_unlock(&nfsd_mutex);
-		return -ENODEV;
-	}
-	svc_get(nn->nfsd_serv);
-	ret = svc_pool_stats_open(nn->nfsd_serv, file);
-	mutex_unlock(&nfsd_mutex);
-	return ret;
-}
-
-int nfsd_pool_stats_release(struct inode *inode, struct file *file)
-{
-	struct seq_file *seq = file->private_data;
-	struct svc_serv *serv = seq->private;
-	int ret = seq_release(inode, file);
-
-	mutex_lock(&nfsd_mutex);
-	svc_put(serv);
-	mutex_unlock(&nfsd_mutex);
-	return ret;
+	return svc_pool_stats_open(&nn->nfsd_info, file);
 }
diff --git a/include/linux/sunrpc/svc.h b/include/linux/sunrpc/svc.h
index 544fcfe07479..3bea2840272d 100644
--- a/include/linux/sunrpc/svc.h
+++ b/include/linux/sunrpc/svc.h
@@ -97,6 +97,12 @@ struct svc_serv {
 #endif /* CONFIG_SUNRPC_BACKCHANNEL */
 };
 
+/* This is used by pool_stats to find and lock an svc */
+struct svc_info {
+	struct svc_serv		*serv;
+	struct mutex		*mutex;
+};
+
 /**
  * svc_get() - increment reference count on a SUNRPC serv
  * @serv:  the svc_serv to have count incremented
@@ -431,7 +437,7 @@ void		   svc_exit_thread(struct svc_rqst *);
 struct svc_serv *  svc_create_pooled(struct svc_program *, unsigned int,
 				     int (*threadfn)(void *data));
 int		   svc_set_num_threads(struct svc_serv *, struct svc_pool *, int);
-int		   svc_pool_stats_open(struct svc_serv *serv, struct file *file);
+int		   svc_pool_stats_open(struct svc_info *si, struct file *file);
 void		   svc_process(struct svc_rqst *rqstp);
 void		   svc_process_bc(struct rpc_rqst *req, struct svc_rqst *rqstp);
 int		   svc_register(const struct svc_serv *, struct net *, const int,
diff --git a/net/sunrpc/svc_xprt.c b/net/sunrpc/svc_xprt.c
index 1b71055fc391..b4a85a227bd7 100644
--- a/net/sunrpc/svc_xprt.c
+++ b/net/sunrpc/svc_xprt.c
@@ -1362,29 +1362,36 @@ int svc_xprt_names(struct svc_serv *serv, char *buf, const int buflen)
 }
 EXPORT_SYMBOL_GPL(svc_xprt_names);
 
-
 /*----------------------------------------------------------------------------*/
 
 static void *svc_pool_stats_start(struct seq_file *m, loff_t *pos)
 {
 	unsigned int pidx = (unsigned int)*pos;
-	struct svc_serv *serv = m->private;
+	struct svc_info *si = m->private;
 
 	dprintk("svc_pool_stats_start, *pidx=%u\n", pidx);
 
+	mutex_lock(si->mutex);
+
 	if (!pidx)
 		return SEQ_START_TOKEN;
-	return (pidx > serv->sv_nrpools ? NULL : &serv->sv_pools[pidx-1]);
+	if (!si->serv)
+		return NULL;
+	return pidx > si->serv->sv_nrpools ? NULL
+		: &si->serv->sv_pools[pidx - 1];
 }
 
 static void *svc_pool_stats_next(struct seq_file *m, void *p, loff_t *pos)
 {
 	struct svc_pool *pool = p;
-	struct svc_serv *serv = m->private;
+	struct svc_info *si = m->private;
+	struct svc_serv *serv = si->serv;
 
 	dprintk("svc_pool_stats_next, *pos=%llu\n", *pos);
 
-	if (p == SEQ_START_TOKEN) {
+	if (!serv) {
+		pool = NULL;
+	} else if (p == SEQ_START_TOKEN) {
 		pool = &serv->sv_pools[0];
 	} else {
 		unsigned int pidx = (pool - &serv->sv_pools[0]);
@@ -1399,6 +1406,9 @@ static void *svc_pool_stats_next(struct seq_file *m, void *p, loff_t *pos)
 
 static void svc_pool_stats_stop(struct seq_file *m, void *p)
 {
+	struct svc_info *si = m->private;
+
+	mutex_unlock(si->mutex);
 }
 
 static int svc_pool_stats_show(struct seq_file *m, void *p)
@@ -1426,14 +1436,18 @@ static const struct seq_operations svc_pool_stats_seq_ops = {
 	.show	= svc_pool_stats_show,
 };
 
-int svc_pool_stats_open(struct svc_serv *serv, struct file *file)
+int svc_pool_stats_open(struct svc_info *info, struct file *file)
 {
+	struct seq_file *seq;
 	int err;
 
 	err = seq_open(file, &svc_pool_stats_seq_ops);
-	if (!err)
-		((struct seq_file *) file->private_data)->private = serv;
-	return err;
+	if (err)
+		return err;
+	seq = file->private_data;
+	seq->private = info;
+
+	return 0;
 }
 EXPORT_SYMBOL(svc_pool_stats_open);
 
-- 
cgit v1.2.3


From 1e3577a4521ef33199eea05ce7b9099825848c49 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Fri, 15 Dec 2023 11:56:34 +1100
Subject: SUNRPC: discard sv_refcnt, and svc_get/svc_put

sv_refcnt is no longer useful.
lockd and nfs-cb only ever have the svc active when there are a non-zero
number of threads, so sv_refcnt mirrors sv_nrthreads.

nfsd also keeps the svc active between when a socket is added and when
the first thread is started, but we don't really need a refcount for
that.  We can simply not destroy the svc while there are any permanent
sockets attached.

So remove sv_refcnt and the get/put functions.
Instead of a final call to svc_put(), call svc_destroy() instead.
This is changed to also store NULL in the passed-in pointer to make it
easier to avoid use-after-free situations.

Signed-off-by: NeilBrown <neilb@suse.de>
Reviewed-by: Jeff Layton <jlayton@kernel.org>
Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
---
 fs/lockd/svc.c             | 10 ++++------
 fs/nfs/callback.c          | 13 ++++++-------
 fs/nfsd/netns.h            |  7 -------
 fs/nfsd/nfsctl.c           | 13 ++-----------
 fs/nfsd/nfssvc.c           | 26 ++++----------------------
 include/linux/sunrpc/svc.h | 27 +--------------------------
 net/sunrpc/svc.c           | 13 ++++---------
 7 files changed, 21 insertions(+), 88 deletions(-)

(limited to 'include/linux')

diff --git a/fs/lockd/svc.c b/fs/lockd/svc.c
index 81be07c1d3d1..0d6cb3fdc0e1 100644
--- a/fs/lockd/svc.c
+++ b/fs/lockd/svc.c
@@ -345,10 +345,10 @@ static int lockd_get(void)
 
 	serv->sv_maxconn = nlm_max_connections;
 	error = svc_set_num_threads(serv, NULL, 1);
-	/* The thread now holds the only reference */
-	svc_put(serv);
-	if (error < 0)
+	if (error < 0) {
+		svc_destroy(&serv);
 		return error;
+	}
 
 	nlmsvc_serv = serv;
 	register_inetaddr_notifier(&lockd_inetaddr_notifier);
@@ -372,11 +372,9 @@ static void lockd_put(void)
 	unregister_inet6addr_notifier(&lockd_inet6addr_notifier);
 #endif
 
-	svc_get(nlmsvc_serv);
 	svc_set_num_threads(nlmsvc_serv, NULL, 0);
-	svc_put(nlmsvc_serv);
 	timer_delete_sync(&nlmsvc_retry);
-	nlmsvc_serv = NULL;
+	svc_destroy(&nlmsvc_serv);
 	dprintk("lockd_down: service destroyed\n");
 }
 
diff --git a/fs/nfs/callback.c b/fs/nfs/callback.c
index 4ffa1f469e90..760d27dd7225 100644
--- a/fs/nfs/callback.c
+++ b/fs/nfs/callback.c
@@ -187,7 +187,7 @@ static struct svc_serv *nfs_callback_create_svc(int minorversion)
 	 * Check whether we're already up and running.
 	 */
 	if (cb_info->serv)
-		return svc_get(cb_info->serv);
+		return cb_info->serv;
 
 	/*
 	 * Sanity check: if there's no task,
@@ -245,9 +245,10 @@ int nfs_callback_up(u32 minorversion, struct rpc_xprt *xprt)
 
 	cb_info->users++;
 err_net:
-	if (!cb_info->users)
-		cb_info->serv = NULL;
-	svc_put(serv);
+	if (!cb_info->users) {
+		svc_set_num_threads(cb_info->serv, NULL, 0);
+		svc_destroy(&cb_info->serv);
+	}
 err_create:
 	mutex_unlock(&nfs_callback_mutex);
 	return ret;
@@ -271,11 +272,9 @@ void nfs_callback_down(int minorversion, struct net *net)
 	nfs_callback_down_net(minorversion, serv, net);
 	cb_info->users--;
 	if (cb_info->users == 0) {
-		svc_get(serv);
 		svc_set_num_threads(serv, NULL, 0);
-		svc_put(serv);
 		dprintk("nfs_callback_down: service destroyed\n");
-		cb_info->serv = NULL;
+		svc_destroy(&cb_info->serv);
 	}
 	mutex_unlock(&nfs_callback_mutex);
 }
diff --git a/fs/nfsd/netns.h b/fs/nfsd/netns.h
index 16dbef245dbb..74b4360779a1 100644
--- a/fs/nfsd/netns.h
+++ b/fs/nfsd/netns.h
@@ -126,13 +126,6 @@ struct nfsd_net {
 	struct svc_info nfsd_info;
 #define nfsd_serv nfsd_info.serv
 
-	/* When a listening socket is added to nfsd, keep_active is set
-	 * and this justifies a reference on nfsd_serv.  This stops
-	 * nfsd_serv from being freed.  When the number of threads is
-	 * set, keep_active is cleared and the reference is dropped.  So
-	 * when the last thread exits, the service will be destroyed.
-	 */
-	int keep_active;
 
 	/*
 	 * clientid and stateid data for construction of net unique COPY
diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c
index 46a001e81b55..5c2fae98d98a 100644
--- a/fs/nfsd/nfsctl.c
+++ b/fs/nfsd/nfsctl.c
@@ -711,12 +711,8 @@ static ssize_t __write_ports_addfd(char *buf, struct net *net, const struct cred
 	serv = nn->nfsd_serv;
 	err = svc_addsock(serv, net, fd, buf, SIMPLE_TRANSACTION_LIMIT, cred);
 
-	if (err < 0 && !serv->sv_nrthreads && !nn->keep_active)
+	if (!serv->sv_nrthreads && list_empty(&nn->nfsd_serv->sv_permsocks))
 		nfsd_last_thread(net);
-	else if (err >= 0 && !serv->sv_nrthreads && !xchg(&nn->keep_active, 1))
-		svc_get(serv);
-
-	svc_put(serv);
 	return err;
 }
 
@@ -754,10 +750,6 @@ static ssize_t __write_ports_addxprt(char *buf, struct net *net, const struct cr
 	if (err < 0 && err != -EAFNOSUPPORT)
 		goto out_close;
 
-	if (!serv->sv_nrthreads && !xchg(&nn->keep_active, 1))
-		svc_get(serv);
-
-	svc_put(serv);
 	return 0;
 out_close:
 	xprt = svc_find_xprt(serv, transport, net, PF_INET, port);
@@ -766,10 +758,9 @@ out_close:
 		svc_xprt_put(xprt);
 	}
 out_err:
-	if (!serv->sv_nrthreads && !nn->keep_active)
+	if (!serv->sv_nrthreads && list_empty(&nn->nfsd_serv->sv_permsocks))
 		nfsd_last_thread(net);
 
-	svc_put(serv);
 	return err;
 }
 
diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c
index 365968737923..1a295c409343 100644
--- a/fs/nfsd/nfssvc.c
+++ b/fs/nfsd/nfssvc.c
@@ -59,15 +59,6 @@ static __be32			nfsd_init_request(struct svc_rqst *,
  * nfsd_mutex protects nn->nfsd_serv -- both the pointer itself and some members
  * of the svc_serv struct such as ->sv_temp_socks and ->sv_permsocks.
  *
- * If (out side the lock) nn->nfsd_serv is non-NULL, then it must point to a
- * properly initialised 'struct svc_serv' with ->sv_nrthreads > 0 (unless
- * nn->keep_active is set).  That number of nfsd threads must
- * exist and each must be listed in ->sp_all_threads in some entry of
- * ->sv_pools[].
- *
- * Each active thread holds a counted reference on nn->nfsd_serv, as does
- * the nn->keep_active flag and various transient calls to svc_get().
- *
  * Finally, the nfsd_mutex also protects some of the global variables that are
  * accessed when nfsd starts and that are settable via the write_* routines in
  * nfsctl.c. In particular:
@@ -572,6 +563,7 @@ void nfsd_last_thread(struct net *net)
 
 	nfsd_shutdown_net(net);
 	nfsd_export_flush(net);
+	svc_destroy(&serv);
 }
 
 void nfsd_reset_versions(struct nfsd_net *nn)
@@ -646,11 +638,9 @@ void nfsd_shutdown_threads(struct net *net)
 		return;
 	}
 
-	svc_get(serv);
 	/* Kill outstanding nfsd threads */
 	svc_set_num_threads(serv, NULL, 0);
 	nfsd_last_thread(net);
-	svc_put(serv);
 	mutex_unlock(&nfsd_mutex);
 }
 
@@ -666,10 +656,9 @@ int nfsd_create_serv(struct net *net)
 	struct svc_serv *serv;
 
 	WARN_ON(!mutex_is_locked(&nfsd_mutex));
-	if (nn->nfsd_serv) {
-		svc_get(nn->nfsd_serv);
+	if (nn->nfsd_serv)
 		return 0;
-	}
+
 	if (nfsd_max_blksize == 0)
 		nfsd_max_blksize = nfsd_get_default_max_blksize();
 	nfsd_reset_versions(nn);
@@ -680,7 +669,7 @@ int nfsd_create_serv(struct net *net)
 	serv->sv_maxconn = nn->max_connections;
 	error = svc_bind(serv, net);
 	if (error < 0) {
-		svc_put(serv);
+		svc_destroy(&serv);
 		return error;
 	}
 	spin_lock(&nfsd_notifier_lock);
@@ -764,7 +753,6 @@ int nfsd_set_nrthreads(int n, int *nthreads, struct net *net)
 		nthreads[0] = 1;
 
 	/* apply the new numbers */
-	svc_get(nn->nfsd_serv);
 	for (i = 0; i < n; i++) {
 		err = svc_set_num_threads(nn->nfsd_serv,
 					  &nn->nfsd_serv->sv_pools[i],
@@ -772,7 +760,6 @@ int nfsd_set_nrthreads(int n, int *nthreads, struct net *net)
 		if (err)
 			break;
 	}
-	svc_put(nn->nfsd_serv);
 	return err;
 }
 
@@ -814,13 +801,8 @@ nfsd_svc(int nrservs, struct net *net, const struct cred *cred)
 		goto out_put;
 	error = serv->sv_nrthreads;
 out_put:
-	/* Threads now hold service active */
-	if (xchg(&nn->keep_active, 0))
-		svc_put(serv);
-
 	if (serv->sv_nrthreads == 0)
 		nfsd_last_thread(net);
-	svc_put(serv);
 out:
 	mutex_unlock(&nfsd_mutex);
 	return error;
diff --git a/include/linux/sunrpc/svc.h b/include/linux/sunrpc/svc.h
index 3bea2840272d..8d7888234e9e 100644
--- a/include/linux/sunrpc/svc.h
+++ b/include/linux/sunrpc/svc.h
@@ -69,7 +69,6 @@ struct svc_serv {
 	struct svc_program *	sv_program;	/* RPC program */
 	struct svc_stat *	sv_stats;	/* RPC statistics */
 	spinlock_t		sv_lock;
-	struct kref		sv_refcnt;
 	unsigned int		sv_nrthreads;	/* # of server threads */
 	unsigned int		sv_maxconn;	/* max connections allowed or
 						 * '0' causing max to be based
@@ -103,31 +102,7 @@ struct svc_info {
 	struct mutex		*mutex;
 };
 
-/**
- * svc_get() - increment reference count on a SUNRPC serv
- * @serv:  the svc_serv to have count incremented
- *
- * Returns: the svc_serv that was passed in.
- */
-static inline struct svc_serv *svc_get(struct svc_serv *serv)
-{
-	kref_get(&serv->sv_refcnt);
-	return serv;
-}
-
-void svc_destroy(struct kref *);
-
-/**
- * svc_put - decrement reference count on a SUNRPC serv
- * @serv:  the svc_serv to have count decremented
- *
- * When the reference count reaches zero, svc_destroy()
- * is called to clean up and free the serv.
- */
-static inline void svc_put(struct svc_serv *serv)
-{
-	kref_put(&serv->sv_refcnt, svc_destroy);
-}
+void svc_destroy(struct svc_serv **svcp);
 
 /*
  * Maximum payload size supported by a kernel RPC server.
diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c
index fa4e23fa0e09..eb5856e1351d 100644
--- a/net/sunrpc/svc.c
+++ b/net/sunrpc/svc.c
@@ -463,7 +463,6 @@ __svc_create(struct svc_program *prog, unsigned int bufsize, int npools,
 		return NULL;
 	serv->sv_name      = prog->pg_name;
 	serv->sv_program   = prog;
-	kref_init(&serv->sv_refcnt);
 	serv->sv_stats     = prog->pg_stats;
 	if (bufsize > RPCSVC_MAXPAYLOAD)
 		bufsize = RPCSVC_MAXPAYLOAD;
@@ -564,11 +563,13 @@ EXPORT_SYMBOL_GPL(svc_create_pooled);
  * protect sv_permsocks and sv_tempsocks.
  */
 void
-svc_destroy(struct kref *ref)
+svc_destroy(struct svc_serv **servp)
 {
-	struct svc_serv *serv = container_of(ref, struct svc_serv, sv_refcnt);
+	struct svc_serv *serv = *servp;
 	unsigned int i;
 
+	*servp = NULL;
+
 	dprintk("svc: svc_destroy(%s)\n", serv->sv_program->pg_name);
 	timer_shutdown_sync(&serv->sv_temptimer);
 
@@ -675,7 +676,6 @@ svc_prepare_thread(struct svc_serv *serv, struct svc_pool *pool, int node)
 	if (!rqstp)
 		return ERR_PTR(-ENOMEM);
 
-	svc_get(serv);
 	spin_lock_bh(&serv->sv_lock);
 	serv->sv_nrthreads += 1;
 	spin_unlock_bh(&serv->sv_lock);
@@ -935,11 +935,6 @@ svc_exit_thread(struct svc_rqst *rqstp)
 
 	svc_rqst_free(rqstp);
 
-	svc_put(serv);
-	/* That svc_put() cannot be the last, because the thread
-	 * waiting for SP_VICTIM_REMAINS to clear must hold
-	 * a reference. So it is still safe to access pool.
-	 */
 	clear_and_wake_up_bit(SP_VICTIM_REMAINS, &pool->sp_flags);
 }
 EXPORT_SYMBOL_GPL(svc_exit_thread);
-- 
cgit v1.2.3


From 317bacf960a4879af22d12175f47d284930b3273 Mon Sep 17 00:00:00 2001
From: Frank Li <Frank.Li@nxp.com>
Date: Fri, 1 Dec 2023 17:25:27 -0500
Subject: i3c: master: add enable(disable) hot join in sys entry

Add hotjoin entry in sys file system allow user enable/disable hotjoin
feature.

Add (*enable(disable)_hotjoin)() to i3c_master_controller_ops.
Add api i3c_master_enable(disable)_hotjoin();

Signed-off-by: Frank Li <Frank.Li@nxp.com>
Link: https://lore.kernel.org/r/20231201222532.2431484-2-Frank.Li@nxp.com
Signed-off-by: Alexandre Belloni <alexandre.belloni@bootlin.com>
---
 drivers/i3c/master.c       | 83 ++++++++++++++++++++++++++++++++++++++++++++++
 include/linux/i3c/master.h |  5 +++
 2 files changed, 88 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/i3c/master.c b/drivers/i3c/master.c
index 8b729ebae2a6..3afa530c5e32 100644
--- a/drivers/i3c/master.c
+++ b/drivers/i3c/master.c
@@ -557,6 +557,88 @@ static ssize_t i2c_scl_frequency_show(struct device *dev,
 }
 static DEVICE_ATTR_RO(i2c_scl_frequency);
 
+static int i3c_set_hotjoin(struct i3c_master_controller *master, bool enable)
+{
+	int ret;
+
+	if (!master || !master->ops)
+		return -EINVAL;
+
+	if (!master->ops->enable_hotjoin || !master->ops->disable_hotjoin)
+		return -EINVAL;
+
+	i3c_bus_normaluse_lock(&master->bus);
+
+	if (enable)
+		ret = master->ops->enable_hotjoin(master);
+	else
+		ret = master->ops->disable_hotjoin(master);
+
+	master->hotjoin = enable;
+
+	i3c_bus_normaluse_unlock(&master->bus);
+
+	return ret;
+}
+
+static ssize_t hotjoin_store(struct device *dev, struct device_attribute *attr,
+			     const char *buf, size_t count)
+{
+	struct i3c_bus *i3cbus = dev_to_i3cbus(dev);
+	int ret;
+	bool res;
+
+	if (!i3cbus->cur_master)
+		return -EINVAL;
+
+	if (kstrtobool(buf, &res))
+		return -EINVAL;
+
+	ret = i3c_set_hotjoin(i3cbus->cur_master->common.master, res);
+	if (ret)
+		return ret;
+
+	return count;
+}
+
+/*
+ * i3c_master_enable_hotjoin - Enable hotjoin
+ * @master: I3C master object
+ *
+ * Return: a 0 in case of success, an negative error code otherwise.
+ */
+int i3c_master_enable_hotjoin(struct i3c_master_controller *master)
+{
+	return i3c_set_hotjoin(master, true);
+}
+EXPORT_SYMBOL_GPL(i3c_master_enable_hotjoin);
+
+/*
+ * i3c_master_disable_hotjoin - Disable hotjoin
+ * @master: I3C master object
+ *
+ * Return: a 0 in case of success, an negative error code otherwise.
+ */
+int i3c_master_disable_hotjoin(struct i3c_master_controller *master)
+{
+	return i3c_set_hotjoin(master, false);
+}
+EXPORT_SYMBOL_GPL(i3c_master_disable_hotjoin);
+
+static ssize_t hotjoin_show(struct device *dev, struct device_attribute *da, char *buf)
+{
+	struct i3c_bus *i3cbus = dev_to_i3cbus(dev);
+	ssize_t ret;
+
+	i3c_bus_normaluse_lock(i3cbus);
+	ret = sysfs_emit(buf, "%d\n", i3cbus->cur_master->common.master->hotjoin);
+	i3c_bus_normaluse_unlock(i3cbus);
+
+	return ret;
+}
+
+static DEVICE_ATTR_RW(hotjoin);
+
 static struct attribute *i3c_masterdev_attrs[] = {
 	&dev_attr_mode.attr,
 	&dev_attr_current_master.attr,
@@ -567,6 +649,7 @@ static struct attribute *i3c_masterdev_attrs[] = {
 	&dev_attr_pid.attr,
 	&dev_attr_dynamic_address.attr,
 	&dev_attr_hdrcap.attr,
+	&dev_attr_hotjoin.attr,
 	NULL,
 };
 ATTRIBUTE_GROUPS(i3c_masterdev);
diff --git a/include/linux/i3c/master.h b/include/linux/i3c/master.h
index 24c1863b86e2..3b5bd8e3257c 100644
--- a/include/linux/i3c/master.h
+++ b/include/linux/i3c/master.h
@@ -460,6 +460,8 @@ struct i3c_master_controller_ops {
 	int (*disable_ibi)(struct i3c_dev_desc *dev);
 	void (*recycle_ibi_slot)(struct i3c_dev_desc *dev,
 				 struct i3c_ibi_slot *slot);
+	int (*enable_hotjoin)(struct i3c_master_controller *master);
+	int (*disable_hotjoin)(struct i3c_master_controller *master);
 };
 
 /**
@@ -495,6 +497,7 @@ struct i3c_master_controller {
 	const struct i3c_master_controller_ops *ops;
 	unsigned int secondary : 1;
 	unsigned int init_done : 1;
+	unsigned int hotjoin: 1;
 	struct {
 		struct list_head i3c;
 		struct list_head i2c;
@@ -551,6 +554,8 @@ int i3c_master_register(struct i3c_master_controller *master,
 			const struct i3c_master_controller_ops *ops,
 			bool secondary);
 void i3c_master_unregister(struct i3c_master_controller *master);
+int i3c_master_enable_hotjoin(struct i3c_master_controller *master);
+int i3c_master_disable_hotjoin(struct i3c_master_controller *master);
 
 /**
  * i3c_dev_get_master_data() - get master private data attached to an I3C
-- 
cgit v1.2.3


From e5e3df06ac98d15cfb10bb5c12356709365e91b2 Mon Sep 17 00:00:00 2001
From: Frank Li <Frank.Li@nxp.com>
Date: Fri, 1 Dec 2023 17:25:29 -0500
Subject: i3c: add actual_len in i3c_priv_xfer
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

In MIPI I3C Specification:

"Ninth Bit of SDR Target Returned (Read) Data as End-of-Data: In I2C, the
ninth Data bit from Target to Controller is an ACK by the Controller. By
contrast, in I3C this bit allows the Target to end a Read, and allows the
Controller to Abort a Read. In SDR terms, the ninth bit of Read data is
referred to as the T-Bit (for ‘Transition’)"

I3C allow devices early terminate data transfer. So need "actual_len" field
to indicate how much get by i3c_priv_xfer.

Reviewed-by: Miquel Raynal <miquel.raynal@bootlin.com>
Signed-off-by: Frank Li <Frank.Li@nxp.com>
Link: https://lore.kernel.org/r/20231201222532.2431484-4-Frank.Li@nxp.com
Signed-off-by: Alexandre Belloni <alexandre.belloni@bootlin.com>
---
 include/linux/i3c/device.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/i3c/device.h b/include/linux/i3c/device.h
index 84ed77c04940..e119f11948ef 100644
--- a/include/linux/i3c/device.h
+++ b/include/linux/i3c/device.h
@@ -54,6 +54,7 @@ enum i3c_hdr_mode {
  * struct i3c_priv_xfer - I3C SDR private transfer
  * @rnw: encodes the transfer direction. true for a read, false for a write
  * @len: transfer length in bytes of the transfer
+ * @actual_len: actual length in bytes are transferred by the controller
  * @data: input/output buffer
  * @data.in: input buffer. Must point to a DMA-able buffer
  * @data.out: output buffer. Must point to a DMA-able buffer
@@ -62,6 +63,7 @@ enum i3c_hdr_mode {
 struct i3c_priv_xfer {
 	u8 rnw;
 	u16 len;
+	u16 actual_len;
 	union {
 		void *in;
 		const void *out;
-- 
cgit v1.2.3


From 18e5794879905a788e06fb2bc40b6f5b58eae5c2 Mon Sep 17 00:00:00 2001
From: Randy Dunlap <rdunlap@infradead.org>
Date: Fri, 22 Dec 2023 21:05:42 -0800
Subject: i3c: master: fix Excess kernel-doc description warning

Remove the @boardinfo: line to prevent the kernel-doc warning:

include/linux/i3c/master.h:98: warning: Excess struct member 'boardinfo' description in 'i2c_dev_desc'

Signed-off-by: Randy Dunlap <rdunlap@infradead.org>
Cc: Alexandre Belloni <alexandre.belloni@bootlin.com>
Cc:  <linux-i3c@lists.infradead.org>
Link: https://lore.kernel.org/r/20231223050542.13930-1-rdunlap@infradead.org
Signed-off-by: Alexandre Belloni <alexandre.belloni@bootlin.com>
---
 include/linux/i3c/master.h | 1 -
 1 file changed, 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/i3c/master.h b/include/linux/i3c/master.h
index 3b5bd8e3257c..1ecd73b17ff5 100644
--- a/include/linux/i3c/master.h
+++ b/include/linux/i3c/master.h
@@ -76,7 +76,6 @@ struct i2c_dev_boardinfo {
 /**
  * struct i2c_dev_desc - I2C device descriptor
  * @common: common part of the I2C device descriptor
- * @boardinfo: pointer to the boardinfo attached to this I2C device
  * @dev: I2C device object registered to the I2C framework
  * @addr: I2C device address
  * @lvr: LVR (Legacy Virtual Register) needed by the I3C core to know about
-- 
cgit v1.2.3


From 10416a3578ba5f76d0b161d2d36a1d8a4c46a69d Mon Sep 17 00:00:00 2001
From: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Date: Tue, 19 Dec 2023 15:59:32 +0100
Subject: firewire: make fw_bus_type const

Now that the driver core can properly handle constant struct bus_type,
move the fw_bus_type variable to be a constant structure as well,
placing it into read-only memory which can not be modified at runtime.

Cc: Takashi Sakamoto <o-takashi@sakamocchi.jp>
Cc: linux1394-devel@lists.sourceforge.net
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Link: https://lore.kernel.org/r/2023121931-skydiver-dodgy-d1bd@gregkh
Signed-off-by: Takashi Sakamoto <o-takashi@sakamocchi.jp>
---
 drivers/firewire/core-device.c | 2 +-
 include/linux/firewire.h       | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/firewire/core-device.c b/drivers/firewire/core-device.c
index 2828e9573e90..7ca8e31dadb5 100644
--- a/drivers/firewire/core-device.c
+++ b/drivers/firewire/core-device.c
@@ -219,7 +219,7 @@ static int fw_unit_uevent(const struct device *dev, struct kobj_uevent_env *env)
 	return 0;
 }
 
-struct bus_type fw_bus_type = {
+const struct bus_type fw_bus_type = {
 	.name = "firewire",
 	.match = fw_unit_match,
 	.probe = fw_unit_probe,
diff --git a/include/linux/firewire.h b/include/linux/firewire.h
index bd3fc75d4f14..dd9f2d765e68 100644
--- a/include/linux/firewire.h
+++ b/include/linux/firewire.h
@@ -75,7 +75,7 @@ void fw_csr_iterator_init(struct fw_csr_iterator *ci, const u32 *p);
 int fw_csr_iterator_next(struct fw_csr_iterator *ci, int *key, int *value);
 int fw_csr_string(const u32 *directory, int key, char *buf, size_t size);
 
-extern struct bus_type fw_bus_type;
+extern const struct bus_type fw_bus_type;
 
 struct fw_card_driver;
 struct fw_node;
-- 
cgit v1.2.3


From e9ee910218ffd420454b52a052d6f1087354905b Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <kuba@kernel.org>
Date: Sun, 7 Jan 2024 17:11:38 -0800
Subject: Revert "net: stmmac: Enable Per DMA Channel interrupt"

Revert "net: stmmac: Use interrupt mode INTM=1 for per channel irq"
This reverts commit 36af9f25ddfd311da82628f194c794786467cb12.
Revert "net: stmmac: Add support for TX/RX channel interrupt"
This reverts commit 9072e03d32088137a435ddf3aa95fd6e038d69d8.
Revert "net: stmmac: Make MSI interrupt routine generic"
This reverts commit 477bd4beb93bf9ace9bda71f1437b191befa9cf4.
Revert "dt-bindings: net: snps,dwmac: per channel irq"
This reverts commit 67d47c8ada0f8795bfcdb85cc8f2ad3ce556674b.

Device tree bindings need to be reviewed.

Link: https://lore.kernel.org/all/2df9fe3e-7971-4aa2-89a9-0e085b3b00d7@linaro.org/
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 .../devicetree/bindings/net/snps,dwmac.yaml        | 24 +++++-----------
 drivers/net/ethernet/stmicro/stmmac/dwmac-intel.c  |  4 +--
 .../net/ethernet/stmicro/stmmac/dwmac-socfpga.c    |  3 --
 drivers/net/ethernet/stmicro/stmmac/dwmac4_dma.c   |  2 +-
 drivers/net/ethernet/stmicro/stmmac/dwxgmac2.h     |  3 --
 drivers/net/ethernet/stmicro/stmmac/dwxgmac2_dma.c | 32 +++++++++-------------
 drivers/net/ethernet/stmicro/stmmac/stmmac_main.c  | 30 ++++++++++----------
 .../net/ethernet/stmicro/stmmac/stmmac_platform.c  | 28 -------------------
 include/linux/stmmac.h                             |  4 +--
 9 files changed, 40 insertions(+), 90 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/devicetree/bindings/net/snps,dwmac.yaml b/Documentation/devicetree/bindings/net/snps,dwmac.yaml
index e72dded824f4..5c2769dc689a 100644
--- a/Documentation/devicetree/bindings/net/snps,dwmac.yaml
+++ b/Documentation/devicetree/bindings/net/snps,dwmac.yaml
@@ -103,27 +103,17 @@ properties:
 
   interrupts:
     minItems: 1
-    maxItems: 19
+    items:
+      - description: Combined signal for various interrupt events
+      - description: The interrupt to manage the remote wake-up packet detection
+      - description: The interrupt that occurs when Rx exits the LPI state
 
   interrupt-names:
     minItems: 1
-    maxItems: 19
     items:
-      oneOf:
-        - description: Combined signal for various interrupt events
-          const: macirq
-        - description: The interrupt to manage the remote wake-up packet detection
-          const: eth_wake_irq
-        - description: The interrupt that occurs when Rx exits the LPI state
-          const: eth_lpi
-        - description: DMA Tx per-channel interrupt
-          pattern: '^dma_tx[0-7]?$'
-        - description: DMA Rx per-channel interrupt
-          pattern: '^dma_rx[0-7]?$'
-
-    allOf:
-      - contains:
-          const: macirq
+      - const: macirq
+      - enum: [eth_wake_irq, eth_lpi]
+      - const: eth_lpi
 
   clocks:
     minItems: 1
diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac-intel.c b/drivers/net/ethernet/stmicro/stmmac/dwmac-intel.c
index f0ec69af96c9..60283543ffc8 100644
--- a/drivers/net/ethernet/stmicro/stmmac/dwmac-intel.c
+++ b/drivers/net/ethernet/stmicro/stmmac/dwmac-intel.c
@@ -952,7 +952,7 @@ static int stmmac_config_single_msi(struct pci_dev *pdev,
 
 	res->irq = pci_irq_vector(pdev, 0);
 	res->wol_irq = res->irq;
-	plat->flags &= ~STMMAC_FLAG_MULTI_IRQ_EN;
+	plat->flags &= ~STMMAC_FLAG_MULTI_MSI_EN;
 	dev_info(&pdev->dev, "%s: Single IRQ enablement successful\n",
 		 __func__);
 
@@ -1004,7 +1004,7 @@ static int stmmac_config_multi_msi(struct pci_dev *pdev,
 	if (plat->msi_sfty_ue_vec < STMMAC_MSI_VEC_MAX)
 		res->sfty_ue_irq = pci_irq_vector(pdev, plat->msi_sfty_ue_vec);
 
-	plat->flags |= STMMAC_FLAG_MULTI_IRQ_EN;
+	plat->flags |= STMMAC_FLAG_MULTI_MSI_EN;
 	dev_info(&pdev->dev, "%s: multi MSI enablement successful\n", __func__);
 
 	return 0;
diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac-socfpga.c b/drivers/net/ethernet/stmicro/stmmac/dwmac-socfpga.c
index cf43fb3c6cc5..ba2ce776bd4d 100644
--- a/drivers/net/ethernet/stmicro/stmmac/dwmac-socfpga.c
+++ b/drivers/net/ethernet/stmicro/stmmac/dwmac-socfpga.c
@@ -427,9 +427,6 @@ static int socfpga_dwmac_probe(struct platform_device *pdev)
 	plat_dat->bsp_priv = dwmac;
 	plat_dat->fix_mac_speed = socfpga_dwmac_fix_mac_speed;
 
-	if (stmmac_res.rx_irq[0] > 0 && stmmac_res.tx_irq[0] > 0)
-		plat_dat->flags |= STMMAC_FLAG_MULTI_IRQ_EN;
-
 	ret = stmmac_dvr_probe(&pdev->dev, plat_dat, &stmmac_res);
 	if (ret)
 		return ret;
diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac4_dma.c b/drivers/net/ethernet/stmicro/stmmac/dwmac4_dma.c
index 5f649106ffcd..84d3a8551b03 100644
--- a/drivers/net/ethernet/stmicro/stmmac/dwmac4_dma.c
+++ b/drivers/net/ethernet/stmicro/stmmac/dwmac4_dma.c
@@ -175,7 +175,7 @@ static void dwmac4_dma_init(void __iomem *ioaddr,
 
 	value = readl(ioaddr + DMA_BUS_MODE);
 
-	if (dma_cfg->multi_irq_en) {
+	if (dma_cfg->multi_msi_en) {
 		value &= ~DMA_BUS_MODE_INTM_MASK;
 		value |= (DMA_BUS_MODE_INTM_MODE1 << DMA_BUS_MODE_INTM_SHIFT);
 	}
diff --git a/drivers/net/ethernet/stmicro/stmmac/dwxgmac2.h b/drivers/net/ethernet/stmicro/stmmac/dwxgmac2.h
index 04bf731cb7ea..207ff1799f2c 100644
--- a/drivers/net/ethernet/stmicro/stmmac/dwxgmac2.h
+++ b/drivers/net/ethernet/stmicro/stmmac/dwxgmac2.h
@@ -346,9 +346,6 @@
 /* DMA Registers */
 #define XGMAC_DMA_MODE			0x00003000
 #define XGMAC_SWR			BIT(0)
-#define XGMAC_DMA_MODE_INTM_MASK	GENMASK(13, 12)
-#define XGMAC_DMA_MODE_INTM_SHIFT	12
-#define XGMAC_DMA_MODE_INTM_MODE1	0x1
 #define XGMAC_DMA_SYSBUS_MODE		0x00003004
 #define XGMAC_WR_OSR_LMT		GENMASK(29, 24)
 #define XGMAC_WR_OSR_LMT_SHIFT		24
diff --git a/drivers/net/ethernet/stmicro/stmmac/dwxgmac2_dma.c b/drivers/net/ethernet/stmicro/stmmac/dwxgmac2_dma.c
index dcb9f094415d..3cde695fec91 100644
--- a/drivers/net/ethernet/stmicro/stmmac/dwxgmac2_dma.c
+++ b/drivers/net/ethernet/stmicro/stmmac/dwxgmac2_dma.c
@@ -31,13 +31,6 @@ static void dwxgmac2_dma_init(void __iomem *ioaddr,
 		value |= XGMAC_EAME;
 
 	writel(value, ioaddr + XGMAC_DMA_SYSBUS_MODE);
-
-	if (dma_cfg->multi_irq_en) {
-		value = readl(ioaddr + XGMAC_DMA_MODE);
-		value &= ~XGMAC_DMA_MODE_INTM_MASK;
-		value |= (XGMAC_DMA_MODE_INTM_MODE1 << XGMAC_DMA_MODE_INTM_SHIFT);
-		writel(value, ioaddr + XGMAC_DMA_MODE);
-	}
 }
 
 static void dwxgmac2_dma_init_chan(struct stmmac_priv *priv,
@@ -372,18 +365,19 @@ static int dwxgmac2_dma_interrupt(struct stmmac_priv *priv,
 	}
 
 	/* TX/RX NORMAL interrupts */
-	if (likely(intr_status & XGMAC_RI)) {
-		u64_stats_update_begin(&rxq_stats->syncp);
-		rxq_stats->rx_normal_irq_n++;
-		u64_stats_update_end(&rxq_stats->syncp);
-		ret |= handle_rx;
-	}
-
-	if (likely(intr_status & (XGMAC_TI | XGMAC_TBU))) {
-		u64_stats_update_begin(&txq_stats->syncp);
-		txq_stats->tx_normal_irq_n++;
-		u64_stats_update_end(&txq_stats->syncp);
-		ret |= handle_tx;
+	if (likely(intr_status & XGMAC_NIS)) {
+		if (likely(intr_status & XGMAC_RI)) {
+			u64_stats_update_begin(&rxq_stats->syncp);
+			rxq_stats->rx_normal_irq_n++;
+			u64_stats_update_end(&rxq_stats->syncp);
+			ret |= handle_rx;
+		}
+		if (likely(intr_status & (XGMAC_TI | XGMAC_TBU))) {
+			u64_stats_update_begin(&txq_stats->syncp);
+			txq_stats->tx_normal_irq_n++;
+			u64_stats_update_end(&txq_stats->syncp);
+			ret |= handle_tx;
+		}
 	}
 
 	/* Clear interrupts */
diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
index 57873b879b33..47de466e432c 100644
--- a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
+++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
@@ -129,8 +129,8 @@ static irqreturn_t stmmac_interrupt(int irq, void *dev_id);
 /* For MSI interrupts handling */
 static irqreturn_t stmmac_mac_interrupt(int irq, void *dev_id);
 static irqreturn_t stmmac_safety_interrupt(int irq, void *dev_id);
-static irqreturn_t stmmac_dma_tx_interrupt(int irq, void *data);
-static irqreturn_t stmmac_dma_rx_interrupt(int irq, void *data);
+static irqreturn_t stmmac_msi_intr_tx(int irq, void *data);
+static irqreturn_t stmmac_msi_intr_rx(int irq, void *data);
 static void stmmac_reset_rx_queue(struct stmmac_priv *priv, u32 queue);
 static void stmmac_reset_tx_queue(struct stmmac_priv *priv, u32 queue);
 static void stmmac_reset_queues_param(struct stmmac_priv *priv);
@@ -3602,7 +3602,7 @@ static void stmmac_free_irq(struct net_device *dev,
 	}
 }
 
-static int stmmac_request_irq_multi(struct net_device *dev)
+static int stmmac_request_irq_multi_msi(struct net_device *dev)
 {
 	struct stmmac_priv *priv = netdev_priv(dev);
 	enum request_irq_err irq_err;
@@ -3697,7 +3697,7 @@ static int stmmac_request_irq_multi(struct net_device *dev)
 		}
 	}
 
-	/* Request Rx irq */
+	/* Request Rx MSI irq */
 	for (i = 0; i < priv->plat->rx_queues_to_use; i++) {
 		if (i >= MTL_MAX_RX_QUEUES)
 			break;
@@ -3707,11 +3707,11 @@ static int stmmac_request_irq_multi(struct net_device *dev)
 		int_name = priv->int_name_rx_irq[i];
 		sprintf(int_name, "%s:%s-%d", dev->name, "rx", i);
 		ret = request_irq(priv->rx_irq[i],
-				  stmmac_dma_rx_interrupt,
+				  stmmac_msi_intr_rx,
 				  0, int_name, &priv->dma_conf.rx_queue[i]);
 		if (unlikely(ret < 0)) {
 			netdev_err(priv->dev,
-				   "%s: alloc rx-%d  dma rx_irq %d (error: %d)\n",
+				   "%s: alloc rx-%d  MSI %d (error: %d)\n",
 				   __func__, i, priv->rx_irq[i], ret);
 			irq_err = REQ_IRQ_ERR_RX;
 			irq_idx = i;
@@ -3722,7 +3722,7 @@ static int stmmac_request_irq_multi(struct net_device *dev)
 		irq_set_affinity_hint(priv->rx_irq[i], &cpu_mask);
 	}
 
-	/* Request Tx irq */
+	/* Request Tx MSI irq */
 	for (i = 0; i < priv->plat->tx_queues_to_use; i++) {
 		if (i >= MTL_MAX_TX_QUEUES)
 			break;
@@ -3732,11 +3732,11 @@ static int stmmac_request_irq_multi(struct net_device *dev)
 		int_name = priv->int_name_tx_irq[i];
 		sprintf(int_name, "%s:%s-%d", dev->name, "tx", i);
 		ret = request_irq(priv->tx_irq[i],
-				  stmmac_dma_tx_interrupt,
+				  stmmac_msi_intr_tx,
 				  0, int_name, &priv->dma_conf.tx_queue[i]);
 		if (unlikely(ret < 0)) {
 			netdev_err(priv->dev,
-				   "%s: alloc tx-%d  dma tx_irq %d (error: %d)\n",
+				   "%s: alloc tx-%d  MSI %d (error: %d)\n",
 				   __func__, i, priv->tx_irq[i], ret);
 			irq_err = REQ_IRQ_ERR_TX;
 			irq_idx = i;
@@ -3811,8 +3811,8 @@ static int stmmac_request_irq(struct net_device *dev)
 	int ret;
 
 	/* Request the IRQ lines */
-	if (priv->plat->flags & STMMAC_FLAG_MULTI_IRQ_EN)
-		ret = stmmac_request_irq_multi(dev);
+	if (priv->plat->flags & STMMAC_FLAG_MULTI_MSI_EN)
+		ret = stmmac_request_irq_multi_msi(dev);
 	else
 		ret = stmmac_request_irq_single(dev);
 
@@ -6075,7 +6075,7 @@ static irqreturn_t stmmac_safety_interrupt(int irq, void *dev_id)
 	return IRQ_HANDLED;
 }
 
-static irqreturn_t stmmac_dma_tx_interrupt(int irq, void *data)
+static irqreturn_t stmmac_msi_intr_tx(int irq, void *data)
 {
 	struct stmmac_tx_queue *tx_q = (struct stmmac_tx_queue *)data;
 	struct stmmac_dma_conf *dma_conf;
@@ -6107,7 +6107,7 @@ static irqreturn_t stmmac_dma_tx_interrupt(int irq, void *data)
 	return IRQ_HANDLED;
 }
 
-static irqreturn_t stmmac_dma_rx_interrupt(int irq, void *data)
+static irqreturn_t stmmac_msi_intr_rx(int irq, void *data)
 {
 	struct stmmac_rx_queue *rx_q = (struct stmmac_rx_queue *)data;
 	struct stmmac_dma_conf *dma_conf;
@@ -7456,8 +7456,8 @@ int stmmac_dvr_probe(struct device *device,
 	priv->plat = plat_dat;
 	priv->ioaddr = res->addr;
 	priv->dev->base_addr = (unsigned long)res->addr;
-	priv->plat->dma_cfg->multi_irq_en =
-		(priv->plat->flags & STMMAC_FLAG_MULTI_IRQ_EN);
+	priv->plat->dma_cfg->multi_msi_en =
+		(priv->plat->flags & STMMAC_FLAG_MULTI_MSI_EN);
 
 	priv->dev->irq = res->irq;
 	priv->wol_irq = res->wol_irq;
diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_platform.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_platform.c
index ae6859153e98..70eadc83ca68 100644
--- a/drivers/net/ethernet/stmicro/stmmac/stmmac_platform.c
+++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_platform.c
@@ -710,10 +710,6 @@ EXPORT_SYMBOL_GPL(devm_stmmac_probe_config_dt);
 int stmmac_get_platform_resources(struct platform_device *pdev,
 				  struct stmmac_resources *stmmac_res)
 {
-	char irq_name[9];
-	int i;
-	int irq;
-
 	memset(stmmac_res, 0, sizeof(*stmmac_res));
 
 	/* Get IRQ information early to have an ability to ask for deferred
@@ -747,30 +743,6 @@ int stmmac_get_platform_resources(struct platform_device *pdev,
 		dev_info(&pdev->dev, "IRQ eth_lpi not found\n");
 	}
 
-	/* For RX Channel */
-	for (i = 0; i < MTL_MAX_RX_QUEUES; i++) {
-		snprintf(irq_name, sizeof(irq_name), "dma_rx%i", i);
-		irq = platform_get_irq_byname_optional(pdev, irq_name);
-		if (irq == -EPROBE_DEFER)
-			return irq;
-		else if (irq < 0)
-			break;
-
-		stmmac_res->rx_irq[i] = irq;
-	}
-
-	/* For TX Channel */
-	for (i = 0; i < MTL_MAX_TX_QUEUES; i++) {
-		snprintf(irq_name, sizeof(irq_name), "dma_tx%i", i);
-		irq = platform_get_irq_byname_optional(pdev, irq_name);
-		if (irq == -EPROBE_DEFER)
-			return irq;
-		else if (irq < 0)
-			break;
-
-		stmmac_res->tx_irq[i] = irq;
-	}
-
 	stmmac_res->addr = devm_platform_ioremap_resource(pdev, 0);
 
 	return PTR_ERR_OR_ZERO(stmmac_res->addr);
diff --git a/include/linux/stmmac.h b/include/linux/stmmac.h
index b950e6f9761d..dee5ad6e48c5 100644
--- a/include/linux/stmmac.h
+++ b/include/linux/stmmac.h
@@ -98,7 +98,7 @@ struct stmmac_dma_cfg {
 	int mixed_burst;
 	bool aal;
 	bool eame;
-	bool multi_irq_en;
+	bool multi_msi_en;
 	bool dche;
 };
 
@@ -215,7 +215,7 @@ struct dwmac4_addrs {
 #define STMMAC_FLAG_TSO_EN			BIT(4)
 #define STMMAC_FLAG_SERDES_UP_AFTER_PHY_LINKUP	BIT(5)
 #define STMMAC_FLAG_VLAN_FAIL_Q_EN		BIT(6)
-#define STMMAC_FLAG_MULTI_IRQ_EN		BIT(7)
+#define STMMAC_FLAG_MULTI_MSI_EN		BIT(7)
 #define STMMAC_FLAG_EXT_SNAPSHOT_EN		BIT(8)
 #define STMMAC_FLAG_INT_SNAPSHOT_EN		BIT(9)
 #define STMMAC_FLAG_RX_CLK_RUNS_IN_LPI		BIT(10)
-- 
cgit v1.2.3


From 3fbf61207c66ff7ac9b60ab76d4bfd239f97e973 Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <kuba@kernel.org>
Date: Sun, 7 Jan 2024 17:14:51 -0800
Subject: Revert "mlx5 updates 2023-12-20"

Revert "net/mlx5: Implement management PF Ethernet profile"
This reverts commit 22c4640698a1d47606b5a4264a584e8046641784.
Revert "net/mlx5: Enable SD feature"
This reverts commit c88c49ac9c18fb7c3fa431126de1d8f8f555e912.
Revert "net/mlx5e: Block TLS device offload on combined SD netdev"
This reverts commit 83a59ce0057b7753d7fbece194b89622c663b2a6.
Revert "net/mlx5e: Support per-mdev queue counter"
This reverts commit d72baceb92539a178d2610b0e9ceb75706a75b55.
Revert "net/mlx5e: Support cross-vhca RSS"
This reverts commit c73a3ab8fa6e93a783bd563938d7cf00d62d5d34.
Revert "net/mlx5e: Let channels be SD-aware"
This reverts commit e4f9686bdee7b4dd89e0ed63cd03606e4bda4ced.
Revert "net/mlx5e: Create EN core HW resources for all secondary devices"
This reverts commit c4fb94aa822d6c9d05fc3c5aee35c7e339061dc1.
Revert "net/mlx5e: Create single netdev per SD group"
This reverts commit e2578b4f983cfcd47837bbe3bcdbf5920e50b2ad.
Revert "net/mlx5: SD, Add informative prints in kernel log"
This reverts commit c82d360325112ccc512fc11a3b68cdcdf04a1478.
Revert "net/mlx5: SD, Implement steering for primary and secondaries"
This reverts commit 605fcce33b2d1beb0139b6e5913fa0b2062116b2.
Revert "net/mlx5: SD, Implement devcom communication and primary election"
This reverts commit a45af9a96740873db9a4b5bb493ce2ad81ccb4d5.
Revert "net/mlx5: SD, Implement basic query and instantiation"
This reverts commit 63b9ce944c0e26c44c42cdd5095c2e9851c1a8ff.
Revert "net/mlx5: SD, Introduce SD lib"
This reverts commit 4a04a31f49320d078b8078e1da4b0e2faca5dfa3.
Revert "net/mlx5: Fix query of sd_group field"
This reverts commit e04984a37398b3f4f5a79c993b94c6b1224184cc.
Revert "net/mlx5e: Use the correct lag ports number when creating TISes"
This reverts commit a7e7b40c4bc115dbf2a2bb453d7bbb2e0ea99703.

There are some unanswered questions on the list, and we don't
have any docs. Given the lack of replies so far and the fact
that v6.8 merge window has started - let's revert this and
revisit for v6.9.

Link: https://lore.kernel.org/all/20231221005721.186607-1-saeed@kernel.org/
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 drivers/net/ethernet/mellanox/mlx5/core/Makefile   |   2 +-
 drivers/net/ethernet/mellanox/mlx5/core/dev.c      |   3 -
 drivers/net/ethernet/mellanox/mlx5/core/ecpf.c     |   6 -
 drivers/net/ethernet/mellanox/mlx5/core/en.h       |  15 +-
 .../net/ethernet/mellanox/mlx5/core/en/channels.c  |  10 +-
 .../net/ethernet/mellanox/mlx5/core/en/channels.h  |   6 +-
 .../net/ethernet/mellanox/mlx5/core/en/mgmt_pf.c   | 268 ------------
 .../ethernet/mellanox/mlx5/core/en/monitor_stats.c |  48 +-
 .../net/ethernet/mellanox/mlx5/core/en/params.c    |   9 +-
 .../net/ethernet/mellanox/mlx5/core/en/params.h    |   3 +
 drivers/net/ethernet/mellanox/mlx5/core/en/ptp.c   |  12 +-
 drivers/net/ethernet/mellanox/mlx5/core/en/qos.c   |   8 +-
 .../ethernet/mellanox/mlx5/core/en/reporter_rx.c   |   4 +-
 .../ethernet/mellanox/mlx5/core/en/reporter_tx.c   |   3 +-
 drivers/net/ethernet/mellanox/mlx5/core/en/rqt.c   | 123 +-----
 drivers/net/ethernet/mellanox/mlx5/core/en/rqt.h   |   9 +-
 drivers/net/ethernet/mellanox/mlx5/core/en/rss.c   |  17 +-
 drivers/net/ethernet/mellanox/mlx5/core/en/rss.h   |   4 +-
 .../net/ethernet/mellanox/mlx5/core/en/rx_res.c    |  62 +--
 .../net/ethernet/mellanox/mlx5/core/en/rx_res.h    |   1 -
 drivers/net/ethernet/mellanox/mlx5/core/en/trap.c  |  11 +-
 .../net/ethernet/mellanox/mlx5/core/en/xsk/pool.c  |   6 +-
 .../net/ethernet/mellanox/mlx5/core/en/xsk/setup.c |   8 +-
 .../ethernet/mellanox/mlx5/core/en_accel/ktls.c    |   2 +-
 .../ethernet/mellanox/mlx5/core/en_accel/ktls.h    |   4 +-
 .../ethernet/mellanox/mlx5/core/en_accel/ktls_rx.c |   6 +-
 .../net/ethernet/mellanox/mlx5/core/en_common.c    |  21 +-
 drivers/net/ethernet/mellanox/mlx5/core/en_main.c  | 200 ++-------
 drivers/net/ethernet/mellanox/mlx5/core/en_stats.c |  39 +-
 drivers/net/ethernet/mellanox/mlx5/core/en_tc.c    |   4 +-
 drivers/net/ethernet/mellanox/mlx5/core/eswitch.c  |   2 +-
 .../net/ethernet/mellanox/mlx5/core/ipoib/ipoib.c  |   2 +-
 .../net/ethernet/mellanox/mlx5/core/lib/devcom.h   |   1 -
 drivers/net/ethernet/mellanox/mlx5/core/lib/mlx5.h |  12 -
 drivers/net/ethernet/mellanox/mlx5/core/lib/sd.c   | 487 ---------------------
 drivers/net/ethernet/mellanox/mlx5/core/lib/sd.h   |  38 --
 drivers/net/ethernet/mellanox/mlx5/core/vport.c    |  21 -
 include/linux/mlx5/driver.h                        |  10 -
 include/linux/mlx5/mlx5_ifc.h                      |  24 +-
 include/linux/mlx5/vport.h                         |   1 -
 40 files changed, 192 insertions(+), 1320 deletions(-)
 delete mode 100644 drivers/net/ethernet/mellanox/mlx5/core/en/mgmt_pf.c
 delete mode 100644 drivers/net/ethernet/mellanox/mlx5/core/lib/sd.c
 delete mode 100644 drivers/net/ethernet/mellanox/mlx5/core/lib/sd.h

(limited to 'include/linux')

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/Makefile b/drivers/net/ethernet/mellanox/mlx5/core/Makefile
index f36232dead1a..c44870b175f9 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/Makefile
+++ b/drivers/net/ethernet/mellanox/mlx5/core/Makefile
@@ -29,7 +29,7 @@ mlx5_core-$(CONFIG_MLX5_CORE_EN) += en/rqt.o en/tir.o en/rss.o en/rx_res.o \
 		en/reporter_tx.o en/reporter_rx.o en/params.o en/xsk/pool.o \
 		en/xsk/setup.o en/xsk/rx.o en/xsk/tx.o en/devlink.o en/ptp.o \
 		en/qos.o en/htb.o en/trap.o en/fs_tt_redirect.o en/selq.o \
-		en/mgmt_pf.o lib/crypto.o lib/sd.o
+		lib/crypto.o
 
 #
 # Netdev extra
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/dev.c b/drivers/net/ethernet/mellanox/mlx5/core/dev.c
index aa1b471e13fa..cf0477f53dc4 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/dev.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/dev.c
@@ -190,9 +190,6 @@ bool mlx5_rdma_supported(struct mlx5_core_dev *dev)
 	if (is_mp_supported(dev))
 		return false;
 
-	if (mlx5_core_is_mgmt_pf(dev))
-		return false;
-
 	return true;
 }
 
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/ecpf.c b/drivers/net/ethernet/mellanox/mlx5/core/ecpf.c
index aa397e3ebe6d..d000236ddbac 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/ecpf.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/ecpf.c
@@ -75,9 +75,6 @@ int mlx5_ec_init(struct mlx5_core_dev *dev)
 	if (!mlx5_core_is_ecpf(dev))
 		return 0;
 
-	if (mlx5_core_is_mgmt_pf(dev))
-		return 0;
-
 	return mlx5_host_pf_init(dev);
 }
 
@@ -88,9 +85,6 @@ void mlx5_ec_cleanup(struct mlx5_core_dev *dev)
 	if (!mlx5_core_is_ecpf(dev))
 		return;
 
-	if (mlx5_core_is_mgmt_pf(dev))
-		return;
-
 	mlx5_host_pf_cleanup(dev);
 
 	err = mlx5_wait_for_pages(dev, &dev->priv.page_counters[MLX5_HOST_PF]);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en.h b/drivers/net/ethernet/mellanox/mlx5/core/en.h
index 922b63c25154..0bfe1ca8a364 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en.h
@@ -60,10 +60,8 @@
 #include "lib/clock.h"
 #include "en/rx_res.h"
 #include "en/selq.h"
-#include "lib/sd.h"
 
 extern const struct net_device_ops mlx5e_netdev_ops;
-extern const struct net_device_ops mlx5e_mgmt_netdev_ops;
 struct page_pool;
 
 #define MLX5E_METADATA_ETHER_TYPE (0x8CE4)
@@ -793,8 +791,6 @@ struct mlx5e_channel {
 	struct hwtstamp_config    *tstamp;
 	DECLARE_BITMAP(state, MLX5E_CHANNEL_NUM_STATES);
 	int                        ix;
-	int                        vec_ix;
-	int                        sd_ix;
 	int                        cpu;
 	/* Sync between icosq recovery and XSK enable/disable. */
 	struct mutex               icosq_recovery_lock;
@@ -918,7 +914,7 @@ struct mlx5e_priv {
 	bool                       tx_ptp_opened;
 	bool                       rx_ptp_opened;
 	struct hwtstamp_config     tstamp;
-	u16                        q_counter[MLX5_SD_MAX_GROUP_SZ];
+	u16                        q_counter;
 	u16                        drop_rq_q_counter;
 	struct notifier_block      events_nb;
 	struct notifier_block      blocking_events_nb;
@@ -1033,12 +1029,12 @@ struct mlx5e_xsk_param;
 
 struct mlx5e_rq_param;
 int mlx5e_open_rq(struct mlx5e_params *params, struct mlx5e_rq_param *param,
-		  struct mlx5e_xsk_param *xsk, int node, u16 q_counter,
+		  struct mlx5e_xsk_param *xsk, int node,
 		  struct mlx5e_rq *rq);
 #define MLX5E_RQ_WQES_TIMEOUT 20000 /* msecs */
 int mlx5e_wait_for_min_rx_wqes(struct mlx5e_rq *rq, int wait_time);
 void mlx5e_close_rq(struct mlx5e_rq *rq);
-int mlx5e_create_rq(struct mlx5e_rq *rq, struct mlx5e_rq_param *param, u16 q_counter);
+int mlx5e_create_rq(struct mlx5e_rq *rq, struct mlx5e_rq_param *param);
 void mlx5e_destroy_rq(struct mlx5e_rq *rq);
 
 struct mlx5e_sq_param;
@@ -1126,10 +1122,9 @@ static inline bool mlx5_tx_swp_supported(struct mlx5_core_dev *mdev)
 }
 
 extern const struct ethtool_ops mlx5e_ethtool_ops;
-extern const struct mlx5e_profile mlx5e_mgmt_pf_nic_profile;
 
 int mlx5e_create_mkey(struct mlx5_core_dev *mdev, u32 pdn, u32 *mkey);
-int mlx5e_create_mdev_resources(struct mlx5_core_dev *mdev, bool create_tises);
+int mlx5e_create_mdev_resources(struct mlx5_core_dev *mdev);
 void mlx5e_destroy_mdev_resources(struct mlx5_core_dev *mdev);
 int mlx5e_refresh_tirs(struct mlx5e_priv *priv, bool enable_uc_lb,
 		       bool enable_mc_lb);
@@ -1232,8 +1227,6 @@ netdev_features_t mlx5e_features_check(struct sk_buff *skb,
 				       struct net_device *netdev,
 				       netdev_features_t features);
 int mlx5e_set_features(struct net_device *netdev, netdev_features_t features);
-void mlx5e_nic_set_rx_mode(struct mlx5e_priv *priv);
-
 #ifdef CONFIG_MLX5_ESWITCH
 int mlx5e_set_vf_mac(struct net_device *dev, int vf, u8 *mac);
 int mlx5e_set_vf_rate(struct net_device *dev, int vf, int min_tx_rate, int max_tx_rate);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/channels.c b/drivers/net/ethernet/mellanox/mlx5/core/en/channels.c
index 874a1016623c..48581ea3adcb 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en/channels.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en/channels.c
@@ -23,26 +23,20 @@ bool mlx5e_channels_is_xsk(struct mlx5e_channels *chs, unsigned int ix)
 	return test_bit(MLX5E_CHANNEL_STATE_XSK, c->state);
 }
 
-void mlx5e_channels_get_regular_rqn(struct mlx5e_channels *chs, unsigned int ix, u32 *rqn,
-				    u32 *vhca_id)
+void mlx5e_channels_get_regular_rqn(struct mlx5e_channels *chs, unsigned int ix, u32 *rqn)
 {
 	struct mlx5e_channel *c = mlx5e_channels_get(chs, ix);
 
 	*rqn = c->rq.rqn;
-	if (vhca_id)
-		*vhca_id = MLX5_CAP_GEN(c->mdev, vhca_id);
 }
 
-void mlx5e_channels_get_xsk_rqn(struct mlx5e_channels *chs, unsigned int ix, u32 *rqn,
-				u32 *vhca_id)
+void mlx5e_channels_get_xsk_rqn(struct mlx5e_channels *chs, unsigned int ix, u32 *rqn)
 {
 	struct mlx5e_channel *c = mlx5e_channels_get(chs, ix);
 
 	WARN_ON_ONCE(!test_bit(MLX5E_CHANNEL_STATE_XSK, c->state));
 
 	*rqn = c->xskrq.rqn;
-	if (vhca_id)
-		*vhca_id = MLX5_CAP_GEN(c->mdev, vhca_id);
 }
 
 bool mlx5e_channels_get_ptp_rqn(struct mlx5e_channels *chs, u32 *rqn)
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/channels.h b/drivers/net/ethernet/mellanox/mlx5/core/en/channels.h
index 6715aa9383b9..637ca90daaa8 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en/channels.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en/channels.h
@@ -10,10 +10,8 @@ struct mlx5e_channels;
 
 unsigned int mlx5e_channels_get_num(struct mlx5e_channels *chs);
 bool mlx5e_channels_is_xsk(struct mlx5e_channels *chs, unsigned int ix);
-void mlx5e_channels_get_regular_rqn(struct mlx5e_channels *chs, unsigned int ix, u32 *rqn,
-				    u32 *vhca_id);
-void mlx5e_channels_get_xsk_rqn(struct mlx5e_channels *chs, unsigned int ix, u32 *rqn,
-				u32 *vhca_id);
+void mlx5e_channels_get_regular_rqn(struct mlx5e_channels *chs, unsigned int ix, u32 *rqn);
+void mlx5e_channels_get_xsk_rqn(struct mlx5e_channels *chs, unsigned int ix, u32 *rqn);
 bool mlx5e_channels_get_ptp_rqn(struct mlx5e_channels *chs, u32 *rqn);
 
 #endif /* __MLX5_EN_CHANNELS_H__ */
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/mgmt_pf.c b/drivers/net/ethernet/mellanox/mlx5/core/en/mgmt_pf.c
deleted file mode 100644
index 77b5805895b9..000000000000
--- a/drivers/net/ethernet/mellanox/mlx5/core/en/mgmt_pf.c
+++ /dev/null
@@ -1,268 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB
-// Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-
-#include <linux/kernel.h>
-#include "en/params.h"
-#include "en/health.h"
-#include "lib/eq.h"
-#include "en/dcbnl.h"
-#include "en_accel/ipsec.h"
-#include "en_accel/en_accel.h"
-#include "en/trap.h"
-#include "en/monitor_stats.h"
-#include "en/hv_vhca_stats.h"
-#include "en_rep.h"
-#include "en.h"
-
-static int mgmt_pf_async_event(struct notifier_block *nb, unsigned long event, void *data)
-{
-	struct mlx5e_priv *priv = container_of(nb, struct mlx5e_priv, events_nb);
-	struct mlx5_eqe   *eqe = data;
-
-	if (event != MLX5_EVENT_TYPE_PORT_CHANGE)
-		return NOTIFY_DONE;
-
-	switch (eqe->sub_type) {
-	case MLX5_PORT_CHANGE_SUBTYPE_DOWN:
-	case MLX5_PORT_CHANGE_SUBTYPE_ACTIVE:
-		queue_work(priv->wq, &priv->update_carrier_work);
-		break;
-	default:
-		return NOTIFY_DONE;
-	}
-
-	return NOTIFY_OK;
-}
-
-static void mlx5e_mgmt_pf_enable_async_events(struct mlx5e_priv *priv)
-{
-	priv->events_nb.notifier_call = mgmt_pf_async_event;
-	mlx5_notifier_register(priv->mdev, &priv->events_nb);
-}
-
-static void mlx5e_disable_mgmt_pf_async_events(struct mlx5e_priv *priv)
-{
-	mlx5_notifier_unregister(priv->mdev, &priv->events_nb);
-}
-
-static void mlx5e_modify_mgmt_pf_admin_state(struct mlx5_core_dev *mdev,
-					     enum mlx5_port_status state)
-{
-	struct mlx5_eswitch *esw = mdev->priv.eswitch;
-	int vport_admin_state;
-
-	mlx5_set_port_admin_status(mdev, state);
-
-	if (state == MLX5_PORT_UP)
-		vport_admin_state = MLX5_VPORT_ADMIN_STATE_AUTO;
-	else
-		vport_admin_state = MLX5_VPORT_ADMIN_STATE_DOWN;
-
-	mlx5_eswitch_set_vport_state(esw, MLX5_VPORT_UPLINK, vport_admin_state);
-}
-
-static void mlx5e_build_mgmt_pf_nic_params(struct mlx5e_priv *priv, u16 mtu)
-{
-	struct mlx5e_params *params = &priv->channels.params;
-	struct mlx5_core_dev *mdev = priv->mdev;
-	u8 rx_cq_period_mode;
-
-	params->sw_mtu = mtu;
-	params->hard_mtu = MLX5E_ETH_HARD_MTU;
-	params->num_channels = 1;
-
-	/* SQ */
-	params->log_sq_size = is_kdump_kernel() ?
-		MLX5E_PARAMS_MINIMUM_LOG_SQ_SIZE :
-		MLX5E_PARAMS_DEFAULT_LOG_SQ_SIZE;
-	MLX5E_SET_PFLAG(params, MLX5E_PFLAG_SKB_TX_MPWQE, mlx5e_tx_mpwqe_supported(mdev));
-
-	MLX5E_SET_PFLAG(params, MLX5E_PFLAG_RX_NO_CSUM_COMPLETE, false);
-
-	/* RQ */
-	mlx5e_build_rq_params(mdev, params);
-
-	/* CQ moderation params */
-	rx_cq_period_mode = MLX5_CAP_GEN(mdev, cq_period_start_from_cqe) ?
-			MLX5_CQ_PERIOD_MODE_START_FROM_CQE :
-			MLX5_CQ_PERIOD_MODE_START_FROM_EQE;
-	params->rx_dim_enabled = MLX5_CAP_GEN(mdev, cq_moderation);
-	params->tx_dim_enabled = MLX5_CAP_GEN(mdev, cq_moderation);
-	mlx5e_set_rx_cq_mode_params(params, rx_cq_period_mode);
-	mlx5e_set_tx_cq_mode_params(params, MLX5_CQ_PERIOD_MODE_START_FROM_EQE);
-
-	/* TX inline */
-	mlx5_query_min_inline(mdev, &params->tx_min_inline_mode);
-}
-
-static int mlx5e_mgmt_pf_init(struct mlx5_core_dev *mdev,
-			      struct net_device *netdev)
-{
-	struct mlx5e_priv *priv = netdev_priv(netdev);
-	struct mlx5e_flow_steering *fs;
-	int err;
-
-	mlx5e_build_mgmt_pf_nic_params(priv, netdev->mtu);
-
-	mlx5e_timestamp_init(priv);
-
-	fs = mlx5e_fs_init(priv->profile, mdev,
-			   !test_bit(MLX5E_STATE_DESTROYING, &priv->state),
-			   priv->dfs_root);
-	if (!fs) {
-		err = -ENOMEM;
-		mlx5_core_err(mdev, "FS initialization failed, %d\n", err);
-		return err;
-	}
-	priv->fs = fs;
-
-	mlx5e_health_create_reporters(priv);
-
-	return 0;
-}
-
-static void mlx5e_mgmt_pf_cleanup(struct mlx5e_priv *priv)
-{
-	mlx5e_health_destroy_reporters(priv);
-	mlx5e_fs_cleanup(priv->fs);
-	priv->fs = NULL;
-}
-
-static int mlx5e_mgmt_pf_init_rx(struct mlx5e_priv *priv)
-{
-	struct mlx5_core_dev *mdev = priv->mdev;
-	int err;
-
-	priv->rx_res = mlx5e_rx_res_create(mdev, 0, priv->max_nch, priv->drop_rq.rqn,
-					   &priv->channels.params.packet_merge,
-					   priv->channels.params.num_channels);
-	if (!priv->rx_res)
-		return -ENOMEM;
-
-	mlx5e_create_q_counters(priv);
-
-	err = mlx5e_open_drop_rq(priv, &priv->drop_rq);
-	if (err) {
-		mlx5_core_err(mdev, "open drop rq failed, %d\n", err);
-		goto err_destroy_q_counters;
-	}
-
-	err = mlx5e_create_flow_steering(priv->fs, priv->rx_res, priv->profile,
-					 priv->netdev);
-	if (err) {
-		mlx5_core_warn(mdev, "create flow steering failed, %d\n", err);
-		goto err_destroy_rx_res;
-	}
-
-	return 0;
-
-err_destroy_rx_res:
-	mlx5e_rx_res_destroy(priv->rx_res);
-	priv->rx_res = NULL;
-	mlx5e_close_drop_rq(&priv->drop_rq);
-err_destroy_q_counters:
-	mlx5e_destroy_q_counters(priv);
-	return err;
-}
-
-static void mlx5e_mgmt_pf_cleanup_rx(struct mlx5e_priv *priv)
-{
-	mlx5e_destroy_flow_steering(priv->fs, !!(priv->netdev->hw_features & NETIF_F_NTUPLE),
-				    priv->profile);
-	mlx5e_rx_res_destroy(priv->rx_res);
-	priv->rx_res = NULL;
-	mlx5e_close_drop_rq(&priv->drop_rq);
-	mlx5e_destroy_q_counters(priv);
-}
-
-static int mlx5e_mgmt_pf_init_tx(struct mlx5e_priv *priv)
-{
-	return 0;
-}
-
-static void mlx5e_mgmt_pf_cleanup_tx(struct mlx5e_priv *priv)
-{
-}
-
-static void mlx5e_mgmt_pf_enable(struct mlx5e_priv *priv)
-{
-	struct net_device *netdev = priv->netdev;
-	struct mlx5_core_dev *mdev = priv->mdev;
-
-	mlx5e_fs_init_l2_addr(priv->fs, netdev);
-
-	/* Marking the link as currently not needed by the Driver */
-	if (!netif_running(netdev))
-		mlx5e_modify_mgmt_pf_admin_state(mdev, MLX5_PORT_DOWN);
-
-	mlx5e_set_netdev_mtu_boundaries(priv);
-	mlx5e_set_dev_port_mtu(priv);
-
-	mlx5e_mgmt_pf_enable_async_events(priv);
-	if (mlx5e_monitor_counter_supported(priv))
-		mlx5e_monitor_counter_init(priv);
-
-	mlx5e_hv_vhca_stats_create(priv);
-	if (netdev->reg_state != NETREG_REGISTERED)
-		return;
-	mlx5e_dcbnl_init_app(priv);
-
-	mlx5e_nic_set_rx_mode(priv);
-
-	rtnl_lock();
-	if (netif_running(netdev))
-		mlx5e_open(netdev);
-	udp_tunnel_nic_reset_ntf(priv->netdev);
-	netif_device_attach(netdev);
-	rtnl_unlock();
-}
-
-static void mlx5e_mgmt_pf_disable(struct mlx5e_priv *priv)
-{
-	if (priv->netdev->reg_state == NETREG_REGISTERED)
-		mlx5e_dcbnl_delete_app(priv);
-
-	rtnl_lock();
-	if (netif_running(priv->netdev))
-		mlx5e_close(priv->netdev);
-	netif_device_detach(priv->netdev);
-	rtnl_unlock();
-
-	mlx5e_nic_set_rx_mode(priv);
-
-	mlx5e_hv_vhca_stats_destroy(priv);
-	if (mlx5e_monitor_counter_supported(priv))
-		mlx5e_monitor_counter_cleanup(priv);
-
-	mlx5e_disable_mgmt_pf_async_events(priv);
-	mlx5e_ipsec_cleanup(priv);
-}
-
-static int mlx5e_mgmt_pf_update_rx(struct mlx5e_priv *priv)
-{
-	return mlx5e_refresh_tirs(priv, false, false);
-}
-
-static int mlx5e_mgmt_pf_max_nch_limit(struct mlx5_core_dev *mdev)
-{
-	return 1;
-}
-
-const struct mlx5e_profile mlx5e_mgmt_pf_nic_profile = {
-	.init		   = mlx5e_mgmt_pf_init,
-	.cleanup	   = mlx5e_mgmt_pf_cleanup,
-	.init_rx	   = mlx5e_mgmt_pf_init_rx,
-	.cleanup_rx	   = mlx5e_mgmt_pf_cleanup_rx,
-	.init_tx	   = mlx5e_mgmt_pf_init_tx,
-	.cleanup_tx	   = mlx5e_mgmt_pf_cleanup_tx,
-	.enable		   = mlx5e_mgmt_pf_enable,
-	.disable	   = mlx5e_mgmt_pf_disable,
-	.update_rx	   = mlx5e_mgmt_pf_update_rx,
-	.update_stats	   = mlx5e_stats_update_ndo_stats,
-	.update_carrier	   = mlx5e_update_carrier,
-	.rx_handlers       = &mlx5e_rx_handlers_nic,
-	.max_tc		   = 1,
-	.max_nch_limit	   = mlx5e_mgmt_pf_max_nch_limit,
-	.stats_grps	   = mlx5e_nic_stats_grps,
-	.stats_grps_num	   = mlx5e_nic_stats_grps_num
-};
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/monitor_stats.c b/drivers/net/ethernet/mellanox/mlx5/core/en/monitor_stats.c
index e2d8d2754be0..40c8df111754 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en/monitor_stats.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en/monitor_stats.c
@@ -20,8 +20,10 @@
 #define NUM_REQ_PPCNT_COUNTER_S1 MLX5_CMD_SET_MONITOR_NUM_PPCNT_COUNTER_SET1
 #define NUM_REQ_Q_COUNTERS_S1    MLX5_CMD_SET_MONITOR_NUM_Q_COUNTERS_SET1
 
-static int mlx5e_monitor_counter_cap(struct mlx5_core_dev *mdev)
+int mlx5e_monitor_counter_supported(struct mlx5e_priv *priv)
 {
+	struct mlx5_core_dev *mdev = priv->mdev;
+
 	if (!MLX5_CAP_GEN(mdev, max_num_of_monitor_counters))
 		return false;
 	if (MLX5_CAP_PCAM_REG(mdev, ppcnt) &&
@@ -34,38 +36,24 @@ static int mlx5e_monitor_counter_cap(struct mlx5_core_dev *mdev)
 	return true;
 }
 
-int mlx5e_monitor_counter_supported(struct mlx5e_priv *priv)
-{
-	struct mlx5_core_dev *pos;
-	int i;
-
-	mlx5_sd_for_each_dev(i, priv->mdev, pos)
-		if (!mlx5e_monitor_counter_cap(pos))
-			return false;
-	return true;
-}
-
-static void mlx5e_monitor_counter_arm(struct mlx5_core_dev *mdev)
+static void mlx5e_monitor_counter_arm(struct mlx5e_priv *priv)
 {
 	u32 in[MLX5_ST_SZ_DW(arm_monitor_counter_in)] = {};
 
 	MLX5_SET(arm_monitor_counter_in, in, opcode,
 		 MLX5_CMD_OP_ARM_MONITOR_COUNTER);
-	mlx5_cmd_exec_in(mdev, arm_monitor_counter, in);
+	mlx5_cmd_exec_in(priv->mdev, arm_monitor_counter, in);
 }
 
 static void mlx5e_monitor_counters_work(struct work_struct *work)
 {
 	struct mlx5e_priv *priv = container_of(work, struct mlx5e_priv,
 					       monitor_counters_work);
-	struct mlx5_core_dev *pos;
-	int i;
 
 	mutex_lock(&priv->state_lock);
 	mlx5e_stats_update_ndo_stats(priv);
 	mutex_unlock(&priv->state_lock);
-	mlx5_sd_for_each_dev(i, priv->mdev, pos)
-		mlx5e_monitor_counter_arm(pos);
+	mlx5e_monitor_counter_arm(priv);
 }
 
 static int mlx5e_monitor_event_handler(struct notifier_block *nb,
@@ -109,13 +97,15 @@ static int fill_monitor_counter_q_counter_set1(int cnt, int q_counter, u32 *in)
 }
 
 /* check if mlx5e_monitor_counter_supported before calling this function*/
-static void mlx5e_set_monitor_counter(struct mlx5_core_dev *mdev, int q_counter)
+static void mlx5e_set_monitor_counter(struct mlx5e_priv *priv)
 {
+	struct mlx5_core_dev *mdev = priv->mdev;
 	int max_num_of_counters = MLX5_CAP_GEN(mdev, max_num_of_monitor_counters);
 	int num_q_counters      = MLX5_CAP_GEN(mdev, num_q_monitor_counters);
 	int num_ppcnt_counters  = !MLX5_CAP_PCAM_REG(mdev, ppcnt) ? 0 :
 				  MLX5_CAP_GEN(mdev, num_ppcnt_monitor_counters);
 	u32 in[MLX5_ST_SZ_DW(set_monitor_counter_in)] = {};
+	int q_counter = priv->q_counter;
 	int cnt	= 0;
 
 	if (num_ppcnt_counters  >=  NUM_REQ_PPCNT_COUNTER_S1 &&
@@ -137,17 +127,13 @@ static void mlx5e_set_monitor_counter(struct mlx5_core_dev *mdev, int q_counter)
 /* check if mlx5e_monitor_counter_supported before calling this function*/
 void mlx5e_monitor_counter_init(struct mlx5e_priv *priv)
 {
-	struct mlx5_core_dev *pos;
-	int i;
-
 	INIT_WORK(&priv->monitor_counters_work, mlx5e_monitor_counters_work);
 	MLX5_NB_INIT(&priv->monitor_counters_nb, mlx5e_monitor_event_handler,
 		     MONITOR_COUNTER);
-	mlx5_sd_for_each_dev(i, priv->mdev, pos) {
-		mlx5_eq_notifier_register(pos, &priv->monitor_counters_nb);
-		mlx5e_set_monitor_counter(pos, priv->q_counter[i]);
-		mlx5e_monitor_counter_arm(pos);
-	}
+	mlx5_eq_notifier_register(priv->mdev, &priv->monitor_counters_nb);
+
+	mlx5e_set_monitor_counter(priv);
+	mlx5e_monitor_counter_arm(priv);
 	queue_work(priv->wq, &priv->update_stats_work);
 }
 
@@ -155,15 +141,11 @@ void mlx5e_monitor_counter_init(struct mlx5e_priv *priv)
 void mlx5e_monitor_counter_cleanup(struct mlx5e_priv *priv)
 {
 	u32 in[MLX5_ST_SZ_DW(set_monitor_counter_in)] = {};
-	struct mlx5_core_dev *pos;
-	int i;
 
 	MLX5_SET(set_monitor_counter_in, in, opcode,
 		 MLX5_CMD_OP_SET_MONITOR_COUNTER);
 
-	mlx5_sd_for_each_dev(i, priv->mdev, pos) {
-		mlx5_cmd_exec_in(pos, set_monitor_counter, in);
-		mlx5_eq_notifier_unregister(pos, &priv->monitor_counters_nb);
-	}
+	mlx5_cmd_exec_in(priv->mdev, set_monitor_counter, in);
+	mlx5_eq_notifier_unregister(priv->mdev, &priv->monitor_counters_nb);
 	cancel_work_sync(&priv->monitor_counters_work);
 }
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/params.c b/drivers/net/ethernet/mellanox/mlx5/core/en/params.c
index fb10bb166fbb..284253b79266 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en/params.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en/params.c
@@ -674,7 +674,7 @@ void mlx5e_build_create_cq_param(struct mlx5e_create_cq_param *ccp, struct mlx5e
 		.napi = &c->napi,
 		.ch_stats = c->stats,
 		.node = cpu_to_node(c->cpu),
-		.ix = c->vec_ix,
+		.ix = c->ix,
 	};
 }
 
@@ -945,6 +945,7 @@ static u8 rq_end_pad_mode(struct mlx5_core_dev *mdev, struct mlx5e_params *param
 int mlx5e_build_rq_param(struct mlx5_core_dev *mdev,
 			 struct mlx5e_params *params,
 			 struct mlx5e_xsk_param *xsk,
+			 u16 q_counter,
 			 struct mlx5e_rq_param *param)
 {
 	void *rqc = param->rqc;
@@ -1006,6 +1007,7 @@ int mlx5e_build_rq_param(struct mlx5_core_dev *mdev,
 	MLX5_SET(wq, wq, log_wq_stride,
 		 mlx5e_get_rqwq_log_stride(params->rq_wq_type, ndsegs));
 	MLX5_SET(wq, wq, pd,               mdev->mlx5e_res.hw_objs.pdn);
+	MLX5_SET(rqc, rqc, counter_set_id, q_counter);
 	MLX5_SET(rqc, rqc, vsd,            params->vlan_strip_disable);
 	MLX5_SET(rqc, rqc, scatter_fcs,    params->scatter_fcs_en);
 
@@ -1016,6 +1018,7 @@ int mlx5e_build_rq_param(struct mlx5_core_dev *mdev,
 }
 
 void mlx5e_build_drop_rq_param(struct mlx5_core_dev *mdev,
+			       u16 q_counter,
 			       struct mlx5e_rq_param *param)
 {
 	void *rqc = param->rqc;
@@ -1024,6 +1027,7 @@ void mlx5e_build_drop_rq_param(struct mlx5_core_dev *mdev,
 	MLX5_SET(wq, wq, wq_type, MLX5_WQ_TYPE_CYCLIC);
 	MLX5_SET(wq, wq, log_wq_stride,
 		 mlx5e_get_rqwq_log_stride(MLX5_WQ_TYPE_CYCLIC, 1));
+	MLX5_SET(rqc, rqc, counter_set_id, q_counter);
 
 	param->wq.buf_numa_node = dev_to_node(mlx5_core_dma_dev(mdev));
 }
@@ -1288,12 +1292,13 @@ void mlx5e_build_xdpsq_param(struct mlx5_core_dev *mdev,
 
 int mlx5e_build_channel_param(struct mlx5_core_dev *mdev,
 			      struct mlx5e_params *params,
+			      u16 q_counter,
 			      struct mlx5e_channel_param *cparam)
 {
 	u8 icosq_log_wq_sz, async_icosq_log_wq_sz;
 	int err;
 
-	err = mlx5e_build_rq_param(mdev, params, NULL, &cparam->rq);
+	err = mlx5e_build_rq_param(mdev, params, NULL, q_counter, &cparam->rq);
 	if (err)
 		return err;
 
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/params.h b/drivers/net/ethernet/mellanox/mlx5/core/en/params.h
index 9a781f18b57f..6800949dafbc 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en/params.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en/params.h
@@ -130,8 +130,10 @@ void mlx5e_build_create_cq_param(struct mlx5e_create_cq_param *ccp, struct mlx5e
 int mlx5e_build_rq_param(struct mlx5_core_dev *mdev,
 			 struct mlx5e_params *params,
 			 struct mlx5e_xsk_param *xsk,
+			 u16 q_counter,
 			 struct mlx5e_rq_param *param);
 void mlx5e_build_drop_rq_param(struct mlx5_core_dev *mdev,
+			       u16 q_counter,
 			       struct mlx5e_rq_param *param);
 void mlx5e_build_sq_param_common(struct mlx5_core_dev *mdev,
 				 struct mlx5e_sq_param *param);
@@ -147,6 +149,7 @@ void mlx5e_build_xdpsq_param(struct mlx5_core_dev *mdev,
 			     struct mlx5e_sq_param *param);
 int mlx5e_build_channel_param(struct mlx5_core_dev *mdev,
 			      struct mlx5e_params *params,
+			      u16 q_counter,
 			      struct mlx5e_channel_param *cparam);
 
 u16 mlx5e_calc_sq_stop_room(struct mlx5_core_dev *mdev, struct mlx5e_params *params);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/ptp.c b/drivers/net/ethernet/mellanox/mlx5/core/en/ptp.c
index cafb41895f94..c206cc0a8483 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en/ptp.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en/ptp.c
@@ -646,6 +646,7 @@ static void mlx5e_ptp_build_sq_param(struct mlx5_core_dev *mdev,
 
 static void mlx5e_ptp_build_rq_param(struct mlx5_core_dev *mdev,
 				     struct net_device *netdev,
+				     u16 q_counter,
 				     struct mlx5e_ptp_params *ptp_params)
 {
 	struct mlx5e_rq_param *rq_params = &ptp_params->rq_param;
@@ -654,7 +655,7 @@ static void mlx5e_ptp_build_rq_param(struct mlx5_core_dev *mdev,
 	params->rq_wq_type = MLX5_WQ_TYPE_CYCLIC;
 	mlx5e_init_rq_type_params(mdev, params);
 	params->sw_mtu = netdev->max_mtu;
-	mlx5e_build_rq_param(mdev, params, NULL, rq_params);
+	mlx5e_build_rq_param(mdev, params, NULL, q_counter, rq_params);
 }
 
 static void mlx5e_ptp_build_params(struct mlx5e_ptp *c,
@@ -680,7 +681,7 @@ static void mlx5e_ptp_build_params(struct mlx5e_ptp *c,
 	/* RQ */
 	if (test_bit(MLX5E_PTP_STATE_RX, c->state)) {
 		params->vlan_strip_disable = orig->vlan_strip_disable;
-		mlx5e_ptp_build_rq_param(c->mdev, c->netdev, cparams);
+		mlx5e_ptp_build_rq_param(c->mdev, c->netdev, c->priv->q_counter, cparams);
 	}
 }
 
@@ -713,16 +714,13 @@ static int mlx5e_ptp_open_rq(struct mlx5e_ptp *c, struct mlx5e_params *params,
 			     struct mlx5e_rq_param *rq_param)
 {
 	int node = dev_to_node(c->mdev->device);
-	int err, sd_ix;
-	u16 q_counter;
+	int err;
 
 	err = mlx5e_init_ptp_rq(c, params, &c->rq);
 	if (err)
 		return err;
 
-	sd_ix = mlx5_sd_ch_ix_get_dev_ix(c->mdev, MLX5E_PTP_CHANNEL_IX);
-	q_counter = c->priv->q_counter[sd_ix];
-	return mlx5e_open_rq(params, rq_param, NULL, node, q_counter, &c->rq);
+	return mlx5e_open_rq(params, rq_param, NULL, node, &c->rq);
 }
 
 static int mlx5e_ptp_open_queues(struct mlx5e_ptp *c,
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/qos.c b/drivers/net/ethernet/mellanox/mlx5/core/en/qos.c
index e87e26f2c669..34adf8c3f81a 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en/qos.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en/qos.c
@@ -122,8 +122,8 @@ int mlx5e_open_qos_sq(struct mlx5e_priv *priv, struct mlx5e_channels *chs,
 
 	memset(&param_sq, 0, sizeof(param_sq));
 	memset(&param_cq, 0, sizeof(param_cq));
-	mlx5e_build_sq_param(c->mdev, params, &param_sq);
-	mlx5e_build_tx_cq_param(c->mdev, params, &param_cq);
+	mlx5e_build_sq_param(priv->mdev, params, &param_sq);
+	mlx5e_build_tx_cq_param(priv->mdev, params, &param_cq);
 	err = mlx5e_open_cq(c->mdev, params->tx_cq_moderation, &param_cq, &ccp, &sq->cq);
 	if (err)
 		goto err_free_sq;
@@ -176,7 +176,7 @@ int mlx5e_activate_qos_sq(void *data, u16 node_qid, u32 hw_id)
 	 */
 	smp_wmb();
 
-	qos_dbg(sq->mdev, "Activate QoS SQ qid %u\n", node_qid);
+	qos_dbg(priv->mdev, "Activate QoS SQ qid %u\n", node_qid);
 	mlx5e_activate_txqsq(sq);
 
 	return 0;
@@ -190,7 +190,7 @@ void mlx5e_deactivate_qos_sq(struct mlx5e_priv *priv, u16 qid)
 	if (!sq) /* Handle the case when the SQ failed to open. */
 		return;
 
-	qos_dbg(sq->mdev, "Deactivate QoS SQ qid %u\n", qid);
+	qos_dbg(priv->mdev, "Deactivate QoS SQ qid %u\n", qid);
 	mlx5e_deactivate_txqsq(sq);
 
 	priv->txq2sq[mlx5e_qid_from_qos(&priv->channels, qid)] = NULL;
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/reporter_rx.c b/drivers/net/ethernet/mellanox/mlx5/core/en/reporter_rx.c
index 25d751eba99b..4358798d6ce1 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en/reporter_rx.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en/reporter_rx.c
@@ -294,8 +294,8 @@ static void mlx5e_rx_reporter_diagnose_generic_rq(struct mlx5e_rq *rq,
 
 	params = &priv->channels.params;
 	rq_sz = mlx5e_rqwq_get_size(rq);
-	real_time =  mlx5_is_real_time_rq(rq->mdev);
-	rq_stride = BIT(mlx5e_mpwqe_get_log_stride_size(rq->mdev, params, NULL));
+	real_time =  mlx5_is_real_time_rq(priv->mdev);
+	rq_stride = BIT(mlx5e_mpwqe_get_log_stride_size(priv->mdev, params, NULL));
 
 	mlx5e_health_fmsg_named_obj_nest_start(fmsg, "RQ");
 	devlink_fmsg_u8_pair_put(fmsg, "type", params->rq_wq_type);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/reporter_tx.c b/drivers/net/ethernet/mellanox/mlx5/core/en/reporter_tx.c
index 0ab9db319530..6b44ddce14e9 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en/reporter_tx.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en/reporter_tx.c
@@ -219,6 +219,7 @@ mlx5e_tx_reporter_build_diagnose_output_sq_common(struct devlink_fmsg *fmsg,
 						  struct mlx5e_txqsq *sq, int tc)
 {
 	bool stopped = netif_xmit_stopped(sq->txq);
+	struct mlx5e_priv *priv = sq->priv;
 	u8 state;
 	int err;
 
@@ -226,7 +227,7 @@ mlx5e_tx_reporter_build_diagnose_output_sq_common(struct devlink_fmsg *fmsg,
 	devlink_fmsg_u32_pair_put(fmsg, "txq ix", sq->txq_ix);
 	devlink_fmsg_u32_pair_put(fmsg, "sqn", sq->sqn);
 
-	err = mlx5_core_query_sq_state(sq->mdev, sq->sqn, &state);
+	err = mlx5_core_query_sq_state(priv->mdev, sq->sqn, &state);
 	if (!err)
 		devlink_fmsg_u8_pair_put(fmsg, "HW state", state);
 
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/rqt.c b/drivers/net/ethernet/mellanox/mlx5/core/en/rqt.c
index bcafb4bf9415..7b8ff7a71003 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en/rqt.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en/rqt.c
@@ -4,33 +4,6 @@
 #include "rqt.h"
 #include <linux/mlx5/transobj.h>
 
-static bool verify_num_vhca_ids(struct mlx5_core_dev *mdev, u32 *vhca_ids,
-				unsigned int size)
-{
-	unsigned int max_num_vhca_id = MLX5_CAP_GEN_2(mdev, max_rqt_vhca_id);
-	int i;
-
-	/* Verify that all vhca_ids are in range [0, max_num_vhca_ids - 1] */
-	for (i = 0; i < size; i++)
-		if (vhca_ids[i] >= max_num_vhca_id)
-			return false;
-	return true;
-}
-
-static bool rqt_verify_vhca_ids(struct mlx5_core_dev *mdev, u32 *vhca_ids,
-				unsigned int size)
-{
-	if (!vhca_ids)
-		return true;
-
-	if (!MLX5_CAP_GEN(mdev, cross_vhca_rqt))
-		return false;
-	if (!verify_num_vhca_ids(mdev, vhca_ids, size))
-		return false;
-
-	return true;
-}
-
 void mlx5e_rss_params_indir_init_uniform(struct mlx5e_rss_params_indir *indir,
 					 unsigned int num_channels)
 {
@@ -40,38 +13,19 @@ void mlx5e_rss_params_indir_init_uniform(struct mlx5e_rss_params_indir *indir,
 		indir->table[i] = i % num_channels;
 }
 
-static void fill_rqn_list(void *rqtc, u32 *rqns, u32 *vhca_ids, unsigned int size)
-{
-	unsigned int i;
-
-	if (vhca_ids) {
-		MLX5_SET(rqtc, rqtc, rq_vhca_id_format, 1);
-		for (i = 0; i < size; i++) {
-			MLX5_SET(rqtc, rqtc, rq_vhca[i].rq_num, rqns[i]);
-			MLX5_SET(rqtc, rqtc, rq_vhca[i].rq_vhca_id, vhca_ids[i]);
-		}
-	} else {
-		for (i = 0; i < size; i++)
-			MLX5_SET(rqtc, rqtc, rq_num[i], rqns[i]);
-	}
-}
 static int mlx5e_rqt_init(struct mlx5e_rqt *rqt, struct mlx5_core_dev *mdev,
-			  u16 max_size, u32 *init_rqns, u32 *init_vhca_ids, u16 init_size)
+			  u16 max_size, u32 *init_rqns, u16 init_size)
 {
-	int entry_sz;
 	void *rqtc;
 	int inlen;
 	int err;
 	u32 *in;
-
-	if (!rqt_verify_vhca_ids(mdev, init_vhca_ids, init_size))
-		return -EOPNOTSUPP;
+	int i;
 
 	rqt->mdev = mdev;
 	rqt->size = max_size;
 
-	entry_sz = init_vhca_ids ? MLX5_ST_SZ_BYTES(rq_vhca) : MLX5_ST_SZ_BYTES(rq_num);
-	inlen = MLX5_ST_SZ_BYTES(create_rqt_in) + entry_sz * init_size;
+	inlen = MLX5_ST_SZ_BYTES(create_rqt_in) + sizeof(u32) * init_size;
 	in = kvzalloc(inlen, GFP_KERNEL);
 	if (!in)
 		return -ENOMEM;
@@ -79,9 +33,10 @@ static int mlx5e_rqt_init(struct mlx5e_rqt *rqt, struct mlx5_core_dev *mdev,
 	rqtc = MLX5_ADDR_OF(create_rqt_in, in, rqt_context);
 
 	MLX5_SET(rqtc, rqtc, rqt_max_size, rqt->size);
-	MLX5_SET(rqtc, rqtc, rqt_actual_size, init_size);
 
-	fill_rqn_list(rqtc, init_rqns, init_vhca_ids, init_size);
+	MLX5_SET(rqtc, rqtc, rqt_actual_size, init_size);
+	for (i = 0; i < init_size; i++)
+		MLX5_SET(rqtc, rqtc, rq_num[i], init_rqns[i]);
 
 	err = mlx5_core_create_rqt(rqt->mdev, in, inlen, &rqt->rqtn);
 
@@ -94,7 +49,7 @@ int mlx5e_rqt_init_direct(struct mlx5e_rqt *rqt, struct mlx5_core_dev *mdev,
 {
 	u16 max_size = indir_enabled ? indir_table_size : 1;
 
-	return mlx5e_rqt_init(rqt, mdev, max_size, &init_rqn, NULL, 1);
+	return mlx5e_rqt_init(rqt, mdev, max_size, &init_rqn, 1);
 }
 
 static int mlx5e_bits_invert(unsigned long a, int size)
@@ -108,8 +63,7 @@ static int mlx5e_bits_invert(unsigned long a, int size)
 	return inv;
 }
 
-static int mlx5e_calc_indir_rqns(u32 *rss_rqns, u32 *rqns, u32 *rss_vhca_ids, u32 *vhca_ids,
-				 unsigned int num_rqns,
+static int mlx5e_calc_indir_rqns(u32 *rss_rqns, u32 *rqns, unsigned int num_rqns,
 				 u8 hfunc, struct mlx5e_rss_params_indir *indir)
 {
 	unsigned int i;
@@ -128,42 +82,30 @@ static int mlx5e_calc_indir_rqns(u32 *rss_rqns, u32 *rqns, u32 *rss_vhca_ids, u3
 			 */
 			return -EINVAL;
 		rss_rqns[i] = rqns[ix];
-		if (vhca_ids)
-			rss_vhca_ids[i] = vhca_ids[ix];
 	}
 
 	return 0;
 }
 
 int mlx5e_rqt_init_indir(struct mlx5e_rqt *rqt, struct mlx5_core_dev *mdev,
-			 u32 *rqns, u32 *vhca_ids, unsigned int num_rqns,
+			 u32 *rqns, unsigned int num_rqns,
 			 u8 hfunc, struct mlx5e_rss_params_indir *indir)
 {
-	u32 *rss_rqns, *rss_vhca_ids = NULL;
+	u32 *rss_rqns;
 	int err;
 
 	rss_rqns = kvmalloc_array(indir->actual_table_size, sizeof(*rss_rqns), GFP_KERNEL);
 	if (!rss_rqns)
 		return -ENOMEM;
 
-	if (vhca_ids) {
-		rss_vhca_ids = kvmalloc_array(indir->actual_table_size, sizeof(*rss_vhca_ids),
-					      GFP_KERNEL);
-		if (!rss_vhca_ids) {
-			kvfree(rss_rqns);
-			return -ENOMEM;
-		}
-	}
-
-	err = mlx5e_calc_indir_rqns(rss_rqns, rqns, rss_vhca_ids, vhca_ids, num_rqns, hfunc, indir);
+	err = mlx5e_calc_indir_rqns(rss_rqns, rqns, num_rqns, hfunc, indir);
 	if (err)
 		goto out;
 
-	err = mlx5e_rqt_init(rqt, mdev, indir->max_table_size, rss_rqns, rss_vhca_ids,
+	err = mlx5e_rqt_init(rqt, mdev, indir->max_table_size, rss_rqns,
 			     indir->actual_table_size);
 
 out:
-	kvfree(rss_vhca_ids);
 	kvfree(rss_rqns);
 	return err;
 }
@@ -184,20 +126,15 @@ void mlx5e_rqt_destroy(struct mlx5e_rqt *rqt)
 	mlx5_core_destroy_rqt(rqt->mdev, rqt->rqtn);
 }
 
-static int mlx5e_rqt_redirect(struct mlx5e_rqt *rqt, u32 *rqns, u32 *vhca_ids,
-			      unsigned int size)
+static int mlx5e_rqt_redirect(struct mlx5e_rqt *rqt, u32 *rqns, unsigned int size)
 {
-	int entry_sz;
+	unsigned int i;
 	void *rqtc;
 	int inlen;
 	u32 *in;
 	int err;
 
-	if (!rqt_verify_vhca_ids(rqt->mdev, vhca_ids, size))
-		return -EINVAL;
-
-	entry_sz = vhca_ids ? MLX5_ST_SZ_BYTES(rq_vhca) : MLX5_ST_SZ_BYTES(rq_num);
-	inlen = MLX5_ST_SZ_BYTES(modify_rqt_in) + entry_sz * size;
+	inlen = MLX5_ST_SZ_BYTES(modify_rqt_in) + sizeof(u32) * size;
 	in = kvzalloc(inlen, GFP_KERNEL);
 	if (!in)
 		return -ENOMEM;
@@ -206,8 +143,8 @@ static int mlx5e_rqt_redirect(struct mlx5e_rqt *rqt, u32 *rqns, u32 *vhca_ids,
 
 	MLX5_SET(modify_rqt_in, in, bitmask.rqn_list, 1);
 	MLX5_SET(rqtc, rqtc, rqt_actual_size, size);
-
-	fill_rqn_list(rqtc, rqns, vhca_ids, size);
+	for (i = 0; i < size; i++)
+		MLX5_SET(rqtc, rqtc, rq_num[i], rqns[i]);
 
 	err = mlx5_core_modify_rqt(rqt->mdev, rqt->rqtn, in, inlen);
 
@@ -215,21 +152,17 @@ static int mlx5e_rqt_redirect(struct mlx5e_rqt *rqt, u32 *rqns, u32 *vhca_ids,
 	return err;
 }
 
-int mlx5e_rqt_redirect_direct(struct mlx5e_rqt *rqt, u32 rqn, u32 *vhca_id)
+int mlx5e_rqt_redirect_direct(struct mlx5e_rqt *rqt, u32 rqn)
 {
-	return mlx5e_rqt_redirect(rqt, &rqn, vhca_id, 1);
+	return mlx5e_rqt_redirect(rqt, &rqn, 1);
 }
 
-int mlx5e_rqt_redirect_indir(struct mlx5e_rqt *rqt, u32 *rqns, u32 *vhca_ids,
-			     unsigned int num_rqns,
+int mlx5e_rqt_redirect_indir(struct mlx5e_rqt *rqt, u32 *rqns, unsigned int num_rqns,
 			     u8 hfunc, struct mlx5e_rss_params_indir *indir)
 {
-	u32 *rss_rqns, *rss_vhca_ids = NULL;
+	u32 *rss_rqns;
 	int err;
 
-	if (!rqt_verify_vhca_ids(rqt->mdev, vhca_ids, num_rqns))
-		return -EINVAL;
-
 	if (WARN_ON(rqt->size != indir->max_table_size))
 		return -EINVAL;
 
@@ -237,23 +170,13 @@ int mlx5e_rqt_redirect_indir(struct mlx5e_rqt *rqt, u32 *rqns, u32 *vhca_ids,
 	if (!rss_rqns)
 		return -ENOMEM;
 
-	if (vhca_ids) {
-		rss_vhca_ids = kvmalloc_array(indir->actual_table_size, sizeof(*rss_vhca_ids),
-					      GFP_KERNEL);
-		if (!rss_vhca_ids) {
-			kvfree(rss_rqns);
-			return -ENOMEM;
-		}
-	}
-
-	err = mlx5e_calc_indir_rqns(rss_rqns, rqns, rss_vhca_ids, vhca_ids, num_rqns, hfunc, indir);
+	err = mlx5e_calc_indir_rqns(rss_rqns, rqns, num_rqns, hfunc, indir);
 	if (err)
 		goto out;
 
-	err = mlx5e_rqt_redirect(rqt, rss_rqns, rss_vhca_ids, indir->actual_table_size);
+	err = mlx5e_rqt_redirect(rqt, rss_rqns, indir->actual_table_size);
 
 out:
-	kvfree(rss_vhca_ids);
 	kvfree(rss_rqns);
 	return err;
 }
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/rqt.h b/drivers/net/ethernet/mellanox/mlx5/core/en/rqt.h
index e0bc30308c77..77fba3ebd18d 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en/rqt.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en/rqt.h
@@ -20,7 +20,7 @@ void mlx5e_rss_params_indir_init_uniform(struct mlx5e_rss_params_indir *indir,
 					 unsigned int num_channels);
 
 struct mlx5e_rqt {
-	struct mlx5_core_dev *mdev; /* primary */
+	struct mlx5_core_dev *mdev;
 	u32 rqtn;
 	u16 size;
 };
@@ -28,7 +28,7 @@ struct mlx5e_rqt {
 int mlx5e_rqt_init_direct(struct mlx5e_rqt *rqt, struct mlx5_core_dev *mdev,
 			  bool indir_enabled, u32 init_rqn, u32 indir_table_size);
 int mlx5e_rqt_init_indir(struct mlx5e_rqt *rqt, struct mlx5_core_dev *mdev,
-			 u32 *rqns, u32 *vhca_ids, unsigned int num_rqns,
+			 u32 *rqns, unsigned int num_rqns,
 			 u8 hfunc, struct mlx5e_rss_params_indir *indir);
 void mlx5e_rqt_destroy(struct mlx5e_rqt *rqt);
 
@@ -38,9 +38,8 @@ static inline u32 mlx5e_rqt_get_rqtn(struct mlx5e_rqt *rqt)
 }
 
 u32 mlx5e_rqt_size(struct mlx5_core_dev *mdev, unsigned int num_channels);
-int mlx5e_rqt_redirect_direct(struct mlx5e_rqt *rqt, u32 rqn, u32 *vhca_id);
-int mlx5e_rqt_redirect_indir(struct mlx5e_rqt *rqt, u32 *rqns, u32 *vhca_ids,
-			     unsigned int num_rqns,
+int mlx5e_rqt_redirect_direct(struct mlx5e_rqt *rqt, u32 rqn);
+int mlx5e_rqt_redirect_indir(struct mlx5e_rqt *rqt, u32 *rqns, unsigned int num_rqns,
 			     u8 hfunc, struct mlx5e_rss_params_indir *indir);
 
 #endif /* __MLX5_EN_RQT_H__ */
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/rss.c b/drivers/net/ethernet/mellanox/mlx5/core/en/rss.c
index 5f742f896600..c1545a2e8d6d 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en/rss.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en/rss.c
@@ -74,7 +74,7 @@ struct mlx5e_rss {
 	struct mlx5e_tir *tir[MLX5E_NUM_INDIR_TIRS];
 	struct mlx5e_tir *inner_tir[MLX5E_NUM_INDIR_TIRS];
 	struct mlx5e_rqt rqt;
-	struct mlx5_core_dev *mdev; /* primary */
+	struct mlx5_core_dev *mdev;
 	u32 drop_rqn;
 	bool inner_ft_support;
 	bool enabled;
@@ -473,22 +473,21 @@ int mlx5e_rss_obtain_tirn(struct mlx5e_rss *rss,
 	return 0;
 }
 
-static int mlx5e_rss_apply(struct mlx5e_rss *rss, u32 *rqns, u32 *vhca_ids, unsigned int num_rqns)
+static int mlx5e_rss_apply(struct mlx5e_rss *rss, u32 *rqns, unsigned int num_rqns)
 {
 	int err;
 
-	err = mlx5e_rqt_redirect_indir(&rss->rqt, rqns, vhca_ids, num_rqns, rss->hash.hfunc,
-				       &rss->indir);
+	err = mlx5e_rqt_redirect_indir(&rss->rqt, rqns, num_rqns, rss->hash.hfunc, &rss->indir);
 	if (err)
 		mlx5e_rss_warn(rss->mdev, "Failed to redirect RQT %#x to channels: err = %d\n",
 			       mlx5e_rqt_get_rqtn(&rss->rqt), err);
 	return err;
 }
 
-void mlx5e_rss_enable(struct mlx5e_rss *rss, u32 *rqns, u32 *vhca_ids, unsigned int num_rqns)
+void mlx5e_rss_enable(struct mlx5e_rss *rss, u32 *rqns, unsigned int num_rqns)
 {
 	rss->enabled = true;
-	mlx5e_rss_apply(rss, rqns, vhca_ids, num_rqns);
+	mlx5e_rss_apply(rss, rqns, num_rqns);
 }
 
 void mlx5e_rss_disable(struct mlx5e_rss *rss)
@@ -496,7 +495,7 @@ void mlx5e_rss_disable(struct mlx5e_rss *rss)
 	int err;
 
 	rss->enabled = false;
-	err = mlx5e_rqt_redirect_direct(&rss->rqt, rss->drop_rqn, NULL);
+	err = mlx5e_rqt_redirect_direct(&rss->rqt, rss->drop_rqn);
 	if (err)
 		mlx5e_rss_warn(rss->mdev, "Failed to redirect RQT %#x to drop RQ %#x: err = %d\n",
 			       mlx5e_rqt_get_rqtn(&rss->rqt), rss->drop_rqn, err);
@@ -569,7 +568,7 @@ int mlx5e_rss_get_rxfh(struct mlx5e_rss *rss, u32 *indir, u8 *key, u8 *hfunc)
 
 int mlx5e_rss_set_rxfh(struct mlx5e_rss *rss, const u32 *indir,
 		       const u8 *key, const u8 *hfunc,
-		       u32 *rqns, u32 *vhca_ids, unsigned int num_rqns)
+		       u32 *rqns, unsigned int num_rqns)
 {
 	bool changed_indir = false;
 	bool changed_hash = false;
@@ -609,7 +608,7 @@ int mlx5e_rss_set_rxfh(struct mlx5e_rss *rss, const u32 *indir,
 	}
 
 	if (changed_indir && rss->enabled) {
-		err = mlx5e_rss_apply(rss, rqns, vhca_ids, num_rqns);
+		err = mlx5e_rss_apply(rss, rqns, num_rqns);
 		if (err) {
 			mlx5e_rss_copy(rss, old_rss);
 			goto out;
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/rss.h b/drivers/net/ethernet/mellanox/mlx5/core/en/rss.h
index d0df98963c8d..d1d0bc350e92 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en/rss.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en/rss.h
@@ -39,7 +39,7 @@ int mlx5e_rss_obtain_tirn(struct mlx5e_rss *rss,
 			  const struct mlx5e_packet_merge_param *init_pkt_merge_param,
 			  bool inner, u32 *tirn);
 
-void mlx5e_rss_enable(struct mlx5e_rss *rss, u32 *rqns, u32 *vhca_ids, unsigned int num_rqns);
+void mlx5e_rss_enable(struct mlx5e_rss *rss, u32 *rqns, unsigned int num_rqns);
 void mlx5e_rss_disable(struct mlx5e_rss *rss);
 
 int mlx5e_rss_packet_merge_set_param(struct mlx5e_rss *rss,
@@ -47,7 +47,7 @@ int mlx5e_rss_packet_merge_set_param(struct mlx5e_rss *rss,
 int mlx5e_rss_get_rxfh(struct mlx5e_rss *rss, u32 *indir, u8 *key, u8 *hfunc);
 int mlx5e_rss_set_rxfh(struct mlx5e_rss *rss, const u32 *indir,
 		       const u8 *key, const u8 *hfunc,
-		       u32 *rqns, u32 *vhca_ids, unsigned int num_rqns);
+		       u32 *rqns, unsigned int num_rqns);
 struct mlx5e_rss_params_hash mlx5e_rss_get_hash(struct mlx5e_rss *rss);
 u8 mlx5e_rss_get_hash_fields(struct mlx5e_rss *rss, enum mlx5_traffic_types tt);
 int mlx5e_rss_set_hash_fields(struct mlx5e_rss *rss, enum mlx5_traffic_types tt,
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/rx_res.c b/drivers/net/ethernet/mellanox/mlx5/core/en/rx_res.c
index a86eade9a9e0..b23e224e3763 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en/rx_res.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en/rx_res.c
@@ -8,7 +8,7 @@
 #define MLX5E_MAX_NUM_RSS 16
 
 struct mlx5e_rx_res {
-	struct mlx5_core_dev *mdev; /* primary */
+	struct mlx5_core_dev *mdev;
 	enum mlx5e_rx_res_features features;
 	unsigned int max_nch;
 	u32 drop_rqn;
@@ -19,7 +19,6 @@ struct mlx5e_rx_res {
 	struct mlx5e_rss *rss[MLX5E_MAX_NUM_RSS];
 	bool rss_active;
 	u32 *rss_rqns;
-	u32 *rss_vhca_ids;
 	unsigned int rss_nch;
 
 	struct {
@@ -35,13 +34,6 @@ struct mlx5e_rx_res {
 
 /* API for rx_res_rss_* */
 
-static u32 *get_vhca_ids(struct mlx5e_rx_res *res, int offset)
-{
-	bool multi_vhca = res->features & MLX5E_RX_RES_FEATURE_MULTI_VHCA;
-
-	return multi_vhca ? res->rss_vhca_ids + offset : NULL;
-}
-
 void mlx5e_rx_res_rss_update_num_channels(struct mlx5e_rx_res *res, u32 nch)
 {
 	int i;
@@ -93,11 +85,8 @@ int mlx5e_rx_res_rss_init(struct mlx5e_rx_res *res, u32 *rss_idx, unsigned int i
 		return PTR_ERR(rss);
 
 	mlx5e_rss_set_indir_uniform(rss, init_nch);
-	if (res->rss_active) {
-		u32 *vhca_ids = get_vhca_ids(res, 0);
-
-		mlx5e_rss_enable(rss, res->rss_rqns, vhca_ids, res->rss_nch);
-	}
+	if (res->rss_active)
+		mlx5e_rss_enable(rss, res->rss_rqns, res->rss_nch);
 
 	res->rss[i] = rss;
 	*rss_idx = i;
@@ -164,12 +153,10 @@ static void mlx5e_rx_res_rss_enable(struct mlx5e_rx_res *res)
 
 	for (i = 0; i < MLX5E_MAX_NUM_RSS; i++) {
 		struct mlx5e_rss *rss = res->rss[i];
-		u32 *vhca_ids;
 
 		if (!rss)
 			continue;
-		vhca_ids = get_vhca_ids(res, 0);
-		mlx5e_rss_enable(rss, res->rss_rqns, vhca_ids, res->rss_nch);
+		mlx5e_rss_enable(rss, res->rss_rqns, res->rss_nch);
 	}
 }
 
@@ -213,7 +200,6 @@ int mlx5e_rx_res_rss_get_rxfh(struct mlx5e_rx_res *res, u32 rss_idx,
 int mlx5e_rx_res_rss_set_rxfh(struct mlx5e_rx_res *res, u32 rss_idx,
 			      const u32 *indir, const u8 *key, const u8 *hfunc)
 {
-	u32 *vhca_ids = get_vhca_ids(res, 0);
 	struct mlx5e_rss *rss;
 
 	if (rss_idx >= MLX5E_MAX_NUM_RSS)
@@ -223,8 +209,7 @@ int mlx5e_rx_res_rss_set_rxfh(struct mlx5e_rx_res *res, u32 rss_idx,
 	if (!rss)
 		return -ENOENT;
 
-	return mlx5e_rss_set_rxfh(rss, indir, key, hfunc, res->rss_rqns, vhca_ids,
-				  res->rss_nch);
+	return mlx5e_rss_set_rxfh(rss, indir, key, hfunc, res->rss_rqns, res->rss_nch);
 }
 
 int mlx5e_rx_res_rss_get_hash_fields(struct mlx5e_rx_res *res, u32 rss_idx,
@@ -295,13 +280,11 @@ struct mlx5e_rss *mlx5e_rx_res_rss_get(struct mlx5e_rx_res *res, u32 rss_idx)
 
 static void mlx5e_rx_res_free(struct mlx5e_rx_res *res)
 {
-	kvfree(res->rss_vhca_ids);
 	kvfree(res->rss_rqns);
 	kvfree(res);
 }
 
-static struct mlx5e_rx_res *mlx5e_rx_res_alloc(struct mlx5_core_dev *mdev, unsigned int max_nch,
-					       bool multi_vhca)
+static struct mlx5e_rx_res *mlx5e_rx_res_alloc(struct mlx5_core_dev *mdev, unsigned int max_nch)
 {
 	struct mlx5e_rx_res *rx_res;
 
@@ -315,15 +298,6 @@ static struct mlx5e_rx_res *mlx5e_rx_res_alloc(struct mlx5_core_dev *mdev, unsig
 		return NULL;
 	}
 
-	if (multi_vhca) {
-		rx_res->rss_vhca_ids = kvcalloc(max_nch, sizeof(*rx_res->rss_vhca_ids), GFP_KERNEL);
-		if (!rx_res->rss_vhca_ids) {
-			kvfree(rx_res->rss_rqns);
-			kvfree(rx_res);
-			return NULL;
-		}
-	}
-
 	return rx_res;
 }
 
@@ -450,11 +424,10 @@ mlx5e_rx_res_create(struct mlx5_core_dev *mdev, enum mlx5e_rx_res_features featu
 		    const struct mlx5e_packet_merge_param *init_pkt_merge_param,
 		    unsigned int init_nch)
 {
-	bool multi_vhca = features & MLX5E_RX_RES_FEATURE_MULTI_VHCA;
 	struct mlx5e_rx_res *res;
 	int err;
 
-	res = mlx5e_rx_res_alloc(mdev, max_nch, multi_vhca);
+	res = mlx5e_rx_res_alloc(mdev, max_nch);
 	if (!res)
 		return ERR_PTR(-ENOMEM);
 
@@ -531,11 +504,10 @@ static void mlx5e_rx_res_channel_activate_direct(struct mlx5e_rx_res *res,
 						 struct mlx5e_channels *chs,
 						 unsigned int ix)
 {
-	u32 *vhca_id = get_vhca_ids(res, ix);
 	u32 rqn = res->rss_rqns[ix];
 	int err;
 
-	err = mlx5e_rqt_redirect_direct(&res->channels[ix].direct_rqt, rqn, vhca_id);
+	err = mlx5e_rqt_redirect_direct(&res->channels[ix].direct_rqt, rqn);
 	if (err)
 		mlx5_core_warn(res->mdev, "Failed to redirect direct RQT %#x to RQ %#x (channel %u): err = %d\n",
 			       mlx5e_rqt_get_rqtn(&res->channels[ix].direct_rqt),
@@ -547,7 +519,7 @@ static void mlx5e_rx_res_channel_deactivate_direct(struct mlx5e_rx_res *res,
 {
 	int err;
 
-	err = mlx5e_rqt_redirect_direct(&res->channels[ix].direct_rqt, res->drop_rqn, NULL);
+	err = mlx5e_rqt_redirect_direct(&res->channels[ix].direct_rqt, res->drop_rqn);
 	if (err)
 		mlx5_core_warn(res->mdev, "Failed to redirect direct RQT %#x to drop RQ %#x (channel %u): err = %d\n",
 			       mlx5e_rqt_get_rqtn(&res->channels[ix].direct_rqt),
@@ -562,12 +534,10 @@ void mlx5e_rx_res_channels_activate(struct mlx5e_rx_res *res, struct mlx5e_chann
 	nch = mlx5e_channels_get_num(chs);
 
 	for (ix = 0; ix < chs->num; ix++) {
-		u32 *vhca_id = get_vhca_ids(res, ix);
-
 		if (mlx5e_channels_is_xsk(chs, ix))
-			mlx5e_channels_get_xsk_rqn(chs, ix, &res->rss_rqns[ix], vhca_id);
+			mlx5e_channels_get_xsk_rqn(chs, ix, &res->rss_rqns[ix]);
 		else
-			mlx5e_channels_get_regular_rqn(chs, ix, &res->rss_rqns[ix], vhca_id);
+			mlx5e_channels_get_regular_rqn(chs, ix, &res->rss_rqns[ix]);
 	}
 	res->rss_nch = chs->num;
 
@@ -584,7 +554,7 @@ void mlx5e_rx_res_channels_activate(struct mlx5e_rx_res *res, struct mlx5e_chann
 		if (!mlx5e_channels_get_ptp_rqn(chs, &rqn))
 			rqn = res->drop_rqn;
 
-		err = mlx5e_rqt_redirect_direct(&res->ptp.rqt, rqn, NULL);
+		err = mlx5e_rqt_redirect_direct(&res->ptp.rqt, rqn);
 		if (err)
 			mlx5_core_warn(res->mdev, "Failed to redirect direct RQT %#x to RQ %#x (PTP): err = %d\n",
 				       mlx5e_rqt_get_rqtn(&res->ptp.rqt),
@@ -603,7 +573,7 @@ void mlx5e_rx_res_channels_deactivate(struct mlx5e_rx_res *res)
 		mlx5e_rx_res_channel_deactivate_direct(res, ix);
 
 	if (res->features & MLX5E_RX_RES_FEATURE_PTP) {
-		err = mlx5e_rqt_redirect_direct(&res->ptp.rqt, res->drop_rqn, NULL);
+		err = mlx5e_rqt_redirect_direct(&res->ptp.rqt, res->drop_rqn);
 		if (err)
 			mlx5_core_warn(res->mdev, "Failed to redirect direct RQT %#x to drop RQ %#x (PTP): err = %d\n",
 				       mlx5e_rqt_get_rqtn(&res->ptp.rqt),
@@ -614,12 +584,10 @@ void mlx5e_rx_res_channels_deactivate(struct mlx5e_rx_res *res)
 void mlx5e_rx_res_xsk_update(struct mlx5e_rx_res *res, struct mlx5e_channels *chs,
 			     unsigned int ix, bool xsk)
 {
-	u32 *vhca_id = get_vhca_ids(res, ix);
-
 	if (xsk)
-		mlx5e_channels_get_xsk_rqn(chs, ix, &res->rss_rqns[ix], vhca_id);
+		mlx5e_channels_get_xsk_rqn(chs, ix, &res->rss_rqns[ix]);
 	else
-		mlx5e_channels_get_regular_rqn(chs, ix, &res->rss_rqns[ix], vhca_id);
+		mlx5e_channels_get_regular_rqn(chs, ix, &res->rss_rqns[ix]);
 
 	mlx5e_rx_res_rss_enable(res);
 
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/rx_res.h b/drivers/net/ethernet/mellanox/mlx5/core/en/rx_res.h
index 7b1a9f0f1874..82aaba8a82b3 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en/rx_res.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en/rx_res.h
@@ -18,7 +18,6 @@ struct mlx5e_rss_params_hash;
 enum mlx5e_rx_res_features {
 	MLX5E_RX_RES_FEATURE_INNER_FT = BIT(0),
 	MLX5E_RX_RES_FEATURE_PTP = BIT(1),
-	MLX5E_RX_RES_FEATURE_MULTI_VHCA = BIT(2),
 };
 
 /* Setup */
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/trap.c b/drivers/net/ethernet/mellanox/mlx5/core/en/trap.c
index 53ca16cb9c41..ac458a8d10e0 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en/trap.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en/trap.c
@@ -63,12 +63,10 @@ static int mlx5e_open_trap_rq(struct mlx5e_priv *priv, struct mlx5e_trap *t)
 	struct mlx5e_create_cq_param ccp = {};
 	struct dim_cq_moder trap_moder = {};
 	struct mlx5e_rq *rq = &t->rq;
-	u16 q_counter;
 	int node;
 	int err;
 
 	node = dev_to_node(mdev->device);
-	q_counter = priv->q_counter[0];
 
 	ccp.netdev   = priv->netdev;
 	ccp.wq       = priv->wq;
@@ -81,7 +79,7 @@ static int mlx5e_open_trap_rq(struct mlx5e_priv *priv, struct mlx5e_trap *t)
 		return err;
 
 	mlx5e_init_trap_rq(t, &t->params, rq);
-	err = mlx5e_open_rq(&t->params, rq_param, NULL, node, q_counter, rq);
+	err = mlx5e_open_rq(&t->params, rq_param, NULL, node, rq);
 	if (err)
 		goto err_destroy_cq;
 
@@ -118,14 +116,15 @@ static int mlx5e_create_trap_direct_rq_tir(struct mlx5_core_dev *mdev, struct ml
 }
 
 static void mlx5e_build_trap_params(struct mlx5_core_dev *mdev,
-				    int max_mtu, struct mlx5e_trap *t)
+				    int max_mtu, u16 q_counter,
+				    struct mlx5e_trap *t)
 {
 	struct mlx5e_params *params = &t->params;
 
 	params->rq_wq_type = MLX5_WQ_TYPE_CYCLIC;
 	mlx5e_init_rq_type_params(mdev, params);
 	params->sw_mtu = max_mtu;
-	mlx5e_build_rq_param(mdev, params, NULL, &t->rq_param);
+	mlx5e_build_rq_param(mdev, params, NULL, q_counter, &t->rq_param);
 }
 
 static struct mlx5e_trap *mlx5e_open_trap(struct mlx5e_priv *priv)
@@ -139,7 +138,7 @@ static struct mlx5e_trap *mlx5e_open_trap(struct mlx5e_priv *priv)
 	if (!t)
 		return ERR_PTR(-ENOMEM);
 
-	mlx5e_build_trap_params(priv->mdev, netdev->max_mtu, t);
+	mlx5e_build_trap_params(priv->mdev, netdev->max_mtu, priv->q_counter, t);
 
 	t->priv     = priv;
 	t->mdev     = priv->mdev;
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/pool.c b/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/pool.c
index db776e515b6a..ebada0c5af3c 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/pool.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/pool.c
@@ -6,10 +6,10 @@
 #include "setup.h"
 #include "en/params.h"
 
-static int mlx5e_xsk_map_pool(struct mlx5_core_dev *mdev,
+static int mlx5e_xsk_map_pool(struct mlx5e_priv *priv,
 			      struct xsk_buff_pool *pool)
 {
-	struct device *dev = mlx5_core_dma_dev(mdev);
+	struct device *dev = mlx5_core_dma_dev(priv->mdev);
 
 	return xsk_pool_dma_map(pool, dev, DMA_ATTR_SKIP_CPU_SYNC);
 }
@@ -89,7 +89,7 @@ static int mlx5e_xsk_enable_locked(struct mlx5e_priv *priv,
 	if (unlikely(!mlx5e_xsk_is_pool_sane(pool)))
 		return -EINVAL;
 
-	err = mlx5e_xsk_map_pool(mlx5_sd_ch_ix_get_dev(priv->mdev, ix), pool);
+	err = mlx5e_xsk_map_pool(priv, pool);
 	if (unlikely(err))
 		return err;
 
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/setup.c b/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/setup.c
index 06592b9f0424..82e6abbc1734 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/setup.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/setup.c
@@ -49,9 +49,10 @@ bool mlx5e_validate_xsk_param(struct mlx5e_params *params,
 static void mlx5e_build_xsk_cparam(struct mlx5_core_dev *mdev,
 				   struct mlx5e_params *params,
 				   struct mlx5e_xsk_param *xsk,
+				   u16 q_counter,
 				   struct mlx5e_channel_param *cparam)
 {
-	mlx5e_build_rq_param(mdev, params, xsk, &cparam->rq);
+	mlx5e_build_rq_param(mdev, params, xsk, q_counter, &cparam->rq);
 	mlx5e_build_xdpsq_param(mdev, params, xsk, &cparam->xdp_sq);
 }
 
@@ -92,7 +93,6 @@ static int mlx5e_open_xsk_rq(struct mlx5e_channel *c, struct mlx5e_params *param
 			     struct mlx5e_rq_param *rq_params, struct xsk_buff_pool *pool,
 			     struct mlx5e_xsk_param *xsk)
 {
-	u16 q_counter = c->priv->q_counter[c->sd_ix];
 	struct mlx5e_rq *xskrq = &c->xskrq;
 	int err;
 
@@ -100,7 +100,7 @@ static int mlx5e_open_xsk_rq(struct mlx5e_channel *c, struct mlx5e_params *param
 	if (err)
 		return err;
 
-	err = mlx5e_open_rq(params, rq_params, xsk, cpu_to_node(c->cpu), q_counter, xskrq);
+	err = mlx5e_open_rq(params, rq_params, xsk, cpu_to_node(c->cpu), xskrq);
 	if (err)
 		return err;
 
@@ -125,7 +125,7 @@ int mlx5e_open_xsk(struct mlx5e_priv *priv, struct mlx5e_params *params,
 	if (!cparam)
 		return -ENOMEM;
 
-	mlx5e_build_xsk_cparam(priv->mdev, params, xsk, cparam);
+	mlx5e_build_xsk_cparam(priv->mdev, params, xsk, priv->q_counter, cparam);
 
 	err = mlx5e_open_cq(c->mdev, params->rx_cq_moderation, &cparam->rq.cqp, &ccp,
 			    &c->xskrq.cq);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ktls.c b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ktls.c
index e3e57c849436..984fa04bd331 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ktls.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ktls.c
@@ -96,7 +96,7 @@ bool mlx5e_is_ktls_rx(struct mlx5_core_dev *mdev)
 {
 	u8 max_sq_wqebbs = mlx5e_get_max_sq_wqebbs(mdev);
 
-	if (is_kdump_kernel() || !MLX5_CAP_GEN(mdev, tls_rx) || mlx5_get_sd(mdev))
+	if (is_kdump_kernel() || !MLX5_CAP_GEN(mdev, tls_rx))
 		return false;
 
 	/* Check the possibility to post the required ICOSQ WQEs. */
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ktls.h b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ktls.h
index adc6d8ea0960..f11075e67658 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ktls.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ktls.h
@@ -11,7 +11,6 @@
 
 #ifdef CONFIG_MLX5_EN_TLS
 #include "lib/crypto.h"
-#include "lib/mlx5.h"
 
 struct mlx5_crypto_dek *mlx5_ktls_create_key(struct mlx5_crypto_dek_pool *dek_pool,
 					     struct tls_crypto_info *crypto_info);
@@ -62,8 +61,7 @@ void mlx5e_ktls_rx_resync_destroy_resp_list(struct mlx5e_ktls_resync_resp *resp_
 
 static inline bool mlx5e_is_ktls_tx(struct mlx5_core_dev *mdev)
 {
-	return !is_kdump_kernel() && MLX5_CAP_GEN(mdev, tls_tx) &&
-		!mlx5_get_sd(mdev);
+	return !is_kdump_kernel() && MLX5_CAP_GEN(mdev, tls_tx);
 }
 
 bool mlx5e_is_ktls_rx(struct mlx5_core_dev *mdev);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ktls_rx.c b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ktls_rx.c
index 65ccb33edafb..9b597cb24598 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ktls_rx.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ktls_rx.c
@@ -267,7 +267,7 @@ resync_post_get_progress_params(struct mlx5e_icosq *sq,
 		goto err_out;
 	}
 
-	pdev = mlx5_core_dma_dev(sq->channel->mdev);
+	pdev = mlx5_core_dma_dev(sq->channel->priv->mdev);
 	buf->dma_addr = dma_map_single(pdev, &buf->progress,
 				       PROGRESS_PARAMS_PADDED_SIZE, DMA_FROM_DEVICE);
 	if (unlikely(dma_mapping_error(pdev, buf->dma_addr))) {
@@ -425,12 +425,14 @@ void mlx5e_ktls_handle_get_psv_completion(struct mlx5e_icosq_wqe_info *wi,
 {
 	struct mlx5e_ktls_rx_resync_buf *buf = wi->tls_get_params.buf;
 	struct mlx5e_ktls_offload_context_rx *priv_rx;
+	struct mlx5e_ktls_rx_resync_ctx *resync;
 	u8 tracker_state, auth_state, *ctx;
 	struct device *dev;
 	u32 hw_seq;
 
 	priv_rx = buf->priv_rx;
-	dev = mlx5_core_dma_dev(sq->channel->mdev);
+	resync = &priv_rx->resync;
+	dev = mlx5_core_dma_dev(resync->priv->mdev);
 	if (unlikely(test_bit(MLX5E_PRIV_RX_FLAG_DELETING, priv_rx->flags)))
 		goto out;
 
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_common.c b/drivers/net/ethernet/mellanox/mlx5/core/en_common.c
index 6ed3a32b7e22..67f546683e85 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_common.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_common.c
@@ -95,7 +95,7 @@ static void mlx5e_destroy_tises(struct mlx5_core_dev *mdev, u32 tisn[MLX5_MAX_PO
 {
 	int tc, i;
 
-	for (i = 0; i < mlx5e_get_num_lag_ports(mdev); i++)
+	for (i = 0; i < MLX5_MAX_PORTS; i++)
 		for (tc = 0; tc < MLX5_MAX_NUM_TC; tc++)
 			mlx5e_destroy_tis(mdev, tisn[i][tc]);
 }
@@ -110,7 +110,7 @@ static int mlx5e_create_tises(struct mlx5_core_dev *mdev, u32 tisn[MLX5_MAX_PORT
 	int tc, i;
 	int err;
 
-	for (i = 0; i < mlx5e_get_num_lag_ports(mdev); i++) {
+	for (i = 0; i < MLX5_MAX_PORTS; i++) {
 		for (tc = 0; tc < MLX5_MAX_NUM_TC; tc++) {
 			u32 in[MLX5_ST_SZ_DW(create_tis_in)] = {};
 			void *tisc;
@@ -140,7 +140,7 @@ err_close_tises:
 	return err;
 }
 
-int mlx5e_create_mdev_resources(struct mlx5_core_dev *mdev, bool create_tises)
+int mlx5e_create_mdev_resources(struct mlx5_core_dev *mdev)
 {
 	struct mlx5e_hw_objs *res = &mdev->mlx5e_res.hw_objs;
 	int err;
@@ -169,15 +169,11 @@ int mlx5e_create_mdev_resources(struct mlx5_core_dev *mdev, bool create_tises)
 		goto err_destroy_mkey;
 	}
 
-	if (create_tises) {
-		err = mlx5e_create_tises(mdev, res->tisn);
-		if (err) {
-			mlx5_core_err(mdev, "alloc tises failed, %d\n", err);
-			goto err_destroy_bfreg;
-		}
-		res->tisn_valid = true;
+	err = mlx5e_create_tises(mdev, res->tisn);
+	if (err) {
+		mlx5_core_err(mdev, "alloc tises failed, %d\n", err);
+		goto err_destroy_bfreg;
 	}
-
 	INIT_LIST_HEAD(&res->td.tirs_list);
 	mutex_init(&res->td.list_lock);
 
@@ -207,8 +203,7 @@ void mlx5e_destroy_mdev_resources(struct mlx5_core_dev *mdev)
 
 	mlx5_crypto_dek_cleanup(mdev->mlx5e_res.dek_priv);
 	mdev->mlx5e_res.dek_priv = NULL;
-	if (res->tisn_valid)
-		mlx5e_destroy_tises(mdev, res->tisn);
+	mlx5e_destroy_tises(mdev, res->tisn);
 	mlx5_free_bfreg(mdev, &res->bfreg);
 	mlx5_core_destroy_mkey(mdev, res->mkey);
 	mlx5_core_dealloc_transport_domain(mdev, res->td.tdn);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
index 40626b6108fb..b5f1c4ca38ba 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
@@ -70,7 +70,6 @@
 #include "qos.h"
 #include "en/trap.h"
 #include "lib/devcom.h"
-#include "lib/sd.h"
 
 bool mlx5e_check_fragmented_striding_rq_cap(struct mlx5_core_dev *mdev, u8 page_shift,
 					    enum mlx5e_mpwrq_umr_mode umr_mode)
@@ -1025,7 +1024,7 @@ static void mlx5e_free_rq(struct mlx5e_rq *rq)
 	mlx5_wq_destroy(&rq->wq_ctrl);
 }
 
-int mlx5e_create_rq(struct mlx5e_rq *rq, struct mlx5e_rq_param *param, u16 q_counter)
+int mlx5e_create_rq(struct mlx5e_rq *rq, struct mlx5e_rq_param *param)
 {
 	struct mlx5_core_dev *mdev = rq->mdev;
 	u8 ts_format;
@@ -1052,7 +1051,6 @@ int mlx5e_create_rq(struct mlx5e_rq *rq, struct mlx5e_rq_param *param, u16 q_cou
 	MLX5_SET(rqc,  rqc, cqn,		rq->cq.mcq.cqn);
 	MLX5_SET(rqc,  rqc, state,		MLX5_RQC_STATE_RST);
 	MLX5_SET(rqc,  rqc, ts_format,		ts_format);
-	MLX5_SET(rqc,  rqc, counter_set_id,     q_counter);
 	MLX5_SET(wq,   wq,  log_wq_pg_sz,	rq->wq_ctrl.buf.page_shift -
 						MLX5_ADAPTER_PAGE_SHIFT);
 	MLX5_SET64(wq, wq,  dbr_addr,		rq->wq_ctrl.db.dma);
@@ -1276,7 +1274,7 @@ void mlx5e_free_rx_descs(struct mlx5e_rq *rq)
 }
 
 int mlx5e_open_rq(struct mlx5e_params *params, struct mlx5e_rq_param *param,
-		  struct mlx5e_xsk_param *xsk, int node, u16 q_counter,
+		  struct mlx5e_xsk_param *xsk, int node,
 		  struct mlx5e_rq *rq)
 {
 	struct mlx5_core_dev *mdev = rq->mdev;
@@ -1289,7 +1287,7 @@ int mlx5e_open_rq(struct mlx5e_params *params, struct mlx5e_rq_param *param,
 	if (err)
 		return err;
 
-	err = mlx5e_create_rq(rq, param, q_counter);
+	err = mlx5e_create_rq(rq, param);
 	if (err)
 		goto err_free_rq;
 
@@ -2335,14 +2333,13 @@ static int mlx5e_set_tx_maxrate(struct net_device *dev, int index, u32 rate)
 static int mlx5e_open_rxq_rq(struct mlx5e_channel *c, struct mlx5e_params *params,
 			     struct mlx5e_rq_param *rq_params)
 {
-	u16 q_counter = c->priv->q_counter[c->sd_ix];
 	int err;
 
 	err = mlx5e_init_rxq_rq(c, params, rq_params->xdp_frag_size, &c->rq);
 	if (err)
 		return err;
 
-	return mlx5e_open_rq(params, rq_params, NULL, cpu_to_node(c->cpu), q_counter, &c->rq);
+	return mlx5e_open_rq(params, rq_params, NULL, cpu_to_node(c->cpu), &c->rq);
 }
 
 static int mlx5e_open_queues(struct mlx5e_channel *c,
@@ -2529,20 +2526,14 @@ static int mlx5e_open_channel(struct mlx5e_priv *priv, int ix,
 			      struct xsk_buff_pool *xsk_pool,
 			      struct mlx5e_channel **cp)
 {
+	int cpu = mlx5_comp_vector_get_cpu(priv->mdev, ix);
 	struct net_device *netdev = priv->netdev;
-	struct mlx5_core_dev *mdev;
 	struct mlx5e_xsk_param xsk;
 	struct mlx5e_channel *c;
 	unsigned int irq;
-	int vec_ix;
-	int cpu;
 	int err;
 
-	mdev = mlx5_sd_ch_ix_get_dev(priv->mdev, ix);
-	vec_ix = mlx5_sd_ch_ix_get_vec_ix(mdev, ix);
-	cpu = mlx5_comp_vector_get_cpu(mdev, vec_ix);
-
-	err = mlx5_comp_irqn_get(mdev, vec_ix, &irq);
+	err = mlx5_comp_irqn_get(priv->mdev, ix, &irq);
 	if (err)
 		return err;
 
@@ -2555,20 +2546,18 @@ static int mlx5e_open_channel(struct mlx5e_priv *priv, int ix,
 		return -ENOMEM;
 
 	c->priv     = priv;
-	c->mdev     = mdev;
+	c->mdev     = priv->mdev;
 	c->tstamp   = &priv->tstamp;
 	c->ix       = ix;
-	c->vec_ix   = vec_ix;
-	c->sd_ix    = mlx5_sd_ch_ix_get_dev_ix(mdev, ix);
 	c->cpu      = cpu;
-	c->pdev     = mlx5_core_dma_dev(mdev);
+	c->pdev     = mlx5_core_dma_dev(priv->mdev);
 	c->netdev   = priv->netdev;
-	c->mkey_be  = cpu_to_be32(mdev->mlx5e_res.hw_objs.mkey);
+	c->mkey_be  = cpu_to_be32(priv->mdev->mlx5e_res.hw_objs.mkey);
 	c->num_tc   = mlx5e_get_dcb_num_tc(params);
 	c->xdp      = !!params->xdp_prog;
 	c->stats    = &priv->channel_stats[ix]->ch;
 	c->aff_mask = irq_get_effective_affinity_mask(irq);
-	c->lag_port = mlx5e_enumerate_lag_port(mdev, ix);
+	c->lag_port = mlx5e_enumerate_lag_port(priv->mdev, ix);
 
 	netif_napi_add(netdev, &c->napi, mlx5e_napi_poll);
 
@@ -2658,7 +2647,7 @@ int mlx5e_open_channels(struct mlx5e_priv *priv,
 	if (!chs->c || !cparam)
 		goto err_free;
 
-	err = mlx5e_build_channel_param(priv->mdev, &chs->params, cparam);
+	err = mlx5e_build_channel_param(priv->mdev, &chs->params, priv->q_counter, cparam);
 	if (err)
 		goto err_free;
 
@@ -2946,18 +2935,15 @@ static MLX5E_DEFINE_PREACTIVATE_WRAPPER_CTX(mlx5e_update_netdev_queues);
 static void mlx5e_set_default_xps_cpumasks(struct mlx5e_priv *priv,
 					   struct mlx5e_params *params)
 {
-	int ix;
+	struct mlx5_core_dev *mdev = priv->mdev;
+	int num_comp_vectors, ix, irq;
 
-	for (ix = 0; ix < params->num_channels; ix++) {
-		int num_comp_vectors, irq, vec_ix;
-		struct mlx5_core_dev *mdev;
+	num_comp_vectors = mlx5_comp_vectors_max(mdev);
 
-		mdev = mlx5_sd_ch_ix_get_dev(priv->mdev, ix);
-		num_comp_vectors = mlx5_comp_vectors_max(mdev);
+	for (ix = 0; ix < params->num_channels; ix++) {
 		cpumask_clear(priv->scratchpad.cpumask);
-		vec_ix = mlx5_sd_ch_ix_get_vec_ix(mdev, ix);
 
-		for (irq = vec_ix; irq < num_comp_vectors; irq += params->num_channels) {
+		for (irq = ix; irq < num_comp_vectors; irq += params->num_channels) {
 			int cpu = mlx5_comp_vector_get_cpu(mdev, irq);
 
 			cpumask_set_cpu(cpu, priv->scratchpad.cpumask);
@@ -3349,7 +3335,7 @@ int mlx5e_open_drop_rq(struct mlx5e_priv *priv,
 	struct mlx5e_cq *cq = &drop_rq->cq;
 	int err;
 
-	mlx5e_build_drop_rq_param(mdev, &rq_param);
+	mlx5e_build_drop_rq_param(mdev, priv->drop_rq_q_counter, &rq_param);
 
 	err = mlx5e_alloc_drop_cq(priv, cq, &cq_param);
 	if (err)
@@ -3363,7 +3349,7 @@ int mlx5e_open_drop_rq(struct mlx5e_priv *priv,
 	if (err)
 		goto err_destroy_cq;
 
-	err = mlx5e_create_rq(drop_rq, &rq_param, priv->drop_rq_q_counter);
+	err = mlx5e_create_rq(drop_rq, &rq_param);
 	if (err)
 		goto err_free_rq;
 
@@ -3799,7 +3785,7 @@ mlx5e_get_stats(struct net_device *dev, struct rtnl_link_stats64 *stats)
 	stats->tx_errors = stats->tx_aborted_errors + stats->tx_carrier_errors;
 }
 
-void mlx5e_nic_set_rx_mode(struct mlx5e_priv *priv)
+static void mlx5e_nic_set_rx_mode(struct mlx5e_priv *priv)
 {
 	if (mlx5e_is_uplink_rep(priv))
 		return; /* no rx mode for uplink rep */
@@ -5004,15 +4990,6 @@ const struct net_device_ops mlx5e_netdev_ops = {
 #endif
 };
 
-const struct net_device_ops mlx5e_mgmt_netdev_ops = {
-	.ndo_open		= mlx5e_open,
-	.ndo_stop		= mlx5e_close,
-	.ndo_start_xmit		= mlx5e_xmit,
-	.ndo_get_stats64	= mlx5e_get_stats,
-	.ndo_change_mtu		= mlx5e_change_nic_mtu,
-	.ndo_set_rx_mode	= mlx5e_set_rx_mode,
-};
-
 static u32 mlx5e_choose_lro_timeout(struct mlx5_core_dev *mdev, u32 wanted_timeout)
 {
 	int i;
@@ -5152,11 +5129,7 @@ static void mlx5e_build_nic_netdev(struct net_device *netdev)
 
 	SET_NETDEV_DEV(netdev, mdev->device);
 
-	if (mlx5_core_is_mgmt_pf(mdev))
-		netdev->netdev_ops = &mlx5e_mgmt_netdev_ops;
-	else
-		netdev->netdev_ops = &mlx5e_netdev_ops;
-
+	netdev->netdev_ops = &mlx5e_netdev_ops;
 	netdev->xdp_metadata_ops = &mlx5e_xdp_metadata_ops;
 	netdev->xsk_tx_metadata_ops = &mlx5e_xsk_tx_metadata_ops;
 
@@ -5291,17 +5264,13 @@ void mlx5e_create_q_counters(struct mlx5e_priv *priv)
 	u32 out[MLX5_ST_SZ_DW(alloc_q_counter_out)] = {};
 	u32 in[MLX5_ST_SZ_DW(alloc_q_counter_in)] = {};
 	struct mlx5_core_dev *mdev = priv->mdev;
-	struct mlx5_core_dev *pos;
-	int err, i;
+	int err;
 
 	MLX5_SET(alloc_q_counter_in, in, opcode, MLX5_CMD_OP_ALLOC_Q_COUNTER);
-
-	mlx5_sd_for_each_dev(i, mdev, pos) {
-		err = mlx5_cmd_exec_inout(pos, alloc_q_counter, in, out);
-		if (!err)
-			priv->q_counter[i] =
-				MLX5_GET(alloc_q_counter_out, out, counter_set_id);
-	}
+	err = mlx5_cmd_exec_inout(mdev, alloc_q_counter, in, out);
+	if (!err)
+		priv->q_counter =
+			MLX5_GET(alloc_q_counter_out, out, counter_set_id);
 
 	err = mlx5_cmd_exec_inout(mdev, alloc_q_counter, in, out);
 	if (!err)
@@ -5312,17 +5281,13 @@ void mlx5e_create_q_counters(struct mlx5e_priv *priv)
 void mlx5e_destroy_q_counters(struct mlx5e_priv *priv)
 {
 	u32 in[MLX5_ST_SZ_DW(dealloc_q_counter_in)] = {};
-	struct mlx5_core_dev *pos;
-	int i;
 
 	MLX5_SET(dealloc_q_counter_in, in, opcode,
 		 MLX5_CMD_OP_DEALLOC_Q_COUNTER);
-	mlx5_sd_for_each_dev(i, priv->mdev, pos) {
-		if (priv->q_counter[i]) {
-			MLX5_SET(dealloc_q_counter_in, in, counter_set_id,
-				 priv->q_counter[i]);
-			mlx5_cmd_exec_in(pos, dealloc_q_counter, in);
-		}
+	if (priv->q_counter) {
+		MLX5_SET(dealloc_q_counter_in, in, counter_set_id,
+			 priv->q_counter);
+		mlx5_cmd_exec_in(priv->mdev, dealloc_q_counter, in);
 	}
 
 	if (priv->drop_rq_q_counter) {
@@ -5406,8 +5371,6 @@ static int mlx5e_init_nic_rx(struct mlx5e_priv *priv)
 	features = MLX5E_RX_RES_FEATURE_PTP;
 	if (mlx5_tunnel_inner_ft_supported(mdev))
 		features |= MLX5E_RX_RES_FEATURE_INNER_FT;
-	if (mlx5_get_sd(priv->mdev))
-		features |= MLX5E_RX_RES_FEATURE_MULTI_VHCA;
 
 	priv->rx_res = mlx5e_rx_res_create(priv->mdev, features, priv->max_nch, priv->drop_rq.rqn,
 					   &priv->channels.params.packet_merge,
@@ -6017,52 +5980,28 @@ void mlx5e_destroy_netdev(struct mlx5e_priv *priv)
 	free_netdev(netdev);
 }
 
-static int _mlx5e_resume(struct auxiliary_device *adev)
+static int mlx5e_resume(struct auxiliary_device *adev)
 {
 	struct mlx5_adev *edev = container_of(adev, struct mlx5_adev, adev);
 	struct mlx5e_dev *mlx5e_dev = auxiliary_get_drvdata(adev);
 	struct mlx5e_priv *priv = mlx5e_dev->priv;
 	struct net_device *netdev = priv->netdev;
 	struct mlx5_core_dev *mdev = edev->mdev;
-	struct mlx5_core_dev *pos, *to;
-	int err, i;
+	int err;
 
 	if (netif_device_present(netdev))
 		return 0;
 
-	mlx5_sd_for_each_dev(i, mdev, pos) {
-		err = mlx5e_create_mdev_resources(pos, true);
-		if (err)
-			goto err_destroy_mdev_res;
-	}
-
-	err = mlx5e_attach_netdev(priv);
+	err = mlx5e_create_mdev_resources(mdev);
 	if (err)
-		goto err_destroy_mdev_res;
-
-	return 0;
-
-err_destroy_mdev_res:
-	to = pos;
-	mlx5_sd_for_each_dev_to(i, mdev, to, pos)
-		mlx5e_destroy_mdev_resources(pos);
-	return err;
-}
-
-static int mlx5e_resume(struct auxiliary_device *adev)
-{
-	struct mlx5_adev *edev = container_of(adev, struct mlx5_adev, adev);
-	struct mlx5_core_dev *mdev = edev->mdev;
-	struct auxiliary_device *actual_adev;
-	int err;
+		return err;
 
-	err = mlx5_sd_init(mdev);
-	if (err)
+	err = mlx5e_attach_netdev(priv);
+	if (err) {
+		mlx5e_destroy_mdev_resources(mdev);
 		return err;
+	}
 
-	actual_adev = mlx5_sd_get_adev(mdev, adev, edev->idx);
-	if (actual_adev)
-		return _mlx5e_resume(actual_adev);
 	return 0;
 }
 
@@ -6072,53 +6011,33 @@ static int _mlx5e_suspend(struct auxiliary_device *adev)
 	struct mlx5e_priv *priv = mlx5e_dev->priv;
 	struct net_device *netdev = priv->netdev;
 	struct mlx5_core_dev *mdev = priv->mdev;
-	struct mlx5_core_dev *pos;
-	int i;
 
 	if (!netif_device_present(netdev)) {
 		if (test_bit(MLX5E_STATE_DESTROYING, &priv->state))
-			mlx5_sd_for_each_dev(i, mdev, pos)
-				mlx5e_destroy_mdev_resources(pos);
+			mlx5e_destroy_mdev_resources(mdev);
 		return -ENODEV;
 	}
 
 	mlx5e_detach_netdev(priv);
-	mlx5_sd_for_each_dev(i, mdev, pos)
-		mlx5e_destroy_mdev_resources(pos);
-
+	mlx5e_destroy_mdev_resources(mdev);
 	return 0;
 }
 
 static int mlx5e_suspend(struct auxiliary_device *adev, pm_message_t state)
 {
-	struct mlx5_adev *edev = container_of(adev, struct mlx5_adev, adev);
-	struct mlx5_core_dev *mdev = edev->mdev;
-	struct auxiliary_device *actual_adev;
-	int err = 0;
-
-	actual_adev = mlx5_sd_get_adev(mdev, adev, edev->idx);
-	if (actual_adev)
-		err = _mlx5e_suspend(actual_adev);
-
-	mlx5_sd_cleanup(mdev);
-	return err;
+	return _mlx5e_suspend(adev);
 }
 
 static int _mlx5e_probe(struct auxiliary_device *adev)
 {
 	struct mlx5_adev *edev = container_of(adev, struct mlx5_adev, adev);
+	const struct mlx5e_profile *profile = &mlx5e_nic_profile;
 	struct mlx5_core_dev *mdev = edev->mdev;
-	const struct mlx5e_profile *profile;
 	struct mlx5e_dev *mlx5e_dev;
 	struct net_device *netdev;
 	struct mlx5e_priv *priv;
 	int err;
 
-	if (mlx5_core_is_mgmt_pf(mdev))
-		profile = &mlx5e_mgmt_pf_nic_profile;
-	else
-		profile = &mlx5e_nic_profile;
-
 	mlx5e_dev = mlx5e_create_devlink(&adev->dev, mdev);
 	if (IS_ERR(mlx5e_dev))
 		return PTR_ERR(mlx5e_dev);
@@ -6152,9 +6071,9 @@ static int _mlx5e_probe(struct auxiliary_device *adev)
 		goto err_destroy_netdev;
 	}
 
-	err = _mlx5e_resume(adev);
+	err = mlx5e_resume(adev);
 	if (err) {
-		mlx5_core_err(mdev, "_mlx5e_resume failed, %d\n", err);
+		mlx5_core_err(mdev, "mlx5e_resume failed, %d\n", err);
 		goto err_profile_cleanup;
 	}
 
@@ -6185,29 +6104,15 @@ err_devlink_unregister:
 static int mlx5e_probe(struct auxiliary_device *adev,
 		       const struct auxiliary_device_id *id)
 {
-	struct mlx5_adev *edev = container_of(adev, struct mlx5_adev, adev);
-	struct mlx5_core_dev *mdev = edev->mdev;
-	struct auxiliary_device *actual_adev;
-	int err;
-
-	err = mlx5_sd_init(mdev);
-	if (err)
-		return err;
-
-	actual_adev = mlx5_sd_get_adev(mdev, adev, edev->idx);
-	if (actual_adev)
-		return _mlx5e_probe(actual_adev);
-	return 0;
+	return _mlx5e_probe(adev);
 }
 
-static void _mlx5e_remove(struct auxiliary_device *adev)
+static void mlx5e_remove(struct auxiliary_device *adev)
 {
-	struct mlx5_adev *edev = container_of(adev, struct mlx5_adev, adev);
 	struct mlx5e_dev *mlx5e_dev = auxiliary_get_drvdata(adev);
 	struct mlx5e_priv *priv = mlx5e_dev->priv;
-	struct mlx5_core_dev *mdev = edev->mdev;
 
-	mlx5_core_uplink_netdev_set(mdev, NULL);
+	mlx5_core_uplink_netdev_set(priv->mdev, NULL);
 	mlx5e_dcbnl_delete_app(priv);
 	unregister_netdev(priv->netdev);
 	_mlx5e_suspend(adev);
@@ -6217,19 +6122,6 @@ static void _mlx5e_remove(struct auxiliary_device *adev)
 	mlx5e_destroy_devlink(mlx5e_dev);
 }
 
-static void mlx5e_remove(struct auxiliary_device *adev)
-{
-	struct mlx5_adev *edev = container_of(adev, struct mlx5_adev, adev);
-	struct mlx5_core_dev *mdev = edev->mdev;
-	struct auxiliary_device *actual_adev;
-
-	actual_adev = mlx5_sd_get_adev(mdev, adev, edev->idx);
-	if (actual_adev)
-		_mlx5e_remove(actual_adev);
-
-	mlx5_sd_cleanup(mdev);
-}
-
 static const struct auxiliary_device_id mlx5e_id_table[] = {
 	{ .name = MLX5_ADEV_NAME ".eth", },
 	{},
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_stats.c b/drivers/net/ethernet/mellanox/mlx5/core/en_stats.c
index f3d0898bdbc6..4b96ad657145 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_stats.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_stats.c
@@ -561,23 +561,11 @@ static const struct counter_desc drop_rq_stats_desc[] = {
 #define NUM_Q_COUNTERS			ARRAY_SIZE(q_stats_desc)
 #define NUM_DROP_RQ_COUNTERS		ARRAY_SIZE(drop_rq_stats_desc)
 
-static bool q_counter_any(struct mlx5e_priv *priv)
-{
-	struct mlx5_core_dev *pos;
-	int i;
-
-	mlx5_sd_for_each_dev(i, priv->mdev, pos)
-		if (priv->q_counter[i++])
-			return true;
-
-	return false;
-}
-
 static MLX5E_DECLARE_STATS_GRP_OP_NUM_STATS(qcnt)
 {
 	int num_stats = 0;
 
-	if (q_counter_any(priv))
+	if (priv->q_counter)
 		num_stats += NUM_Q_COUNTERS;
 
 	if (priv->drop_rq_q_counter)
@@ -590,7 +578,7 @@ static MLX5E_DECLARE_STATS_GRP_OP_FILL_STRS(qcnt)
 {
 	int i;
 
-	for (i = 0; i < NUM_Q_COUNTERS && q_counter_any(priv); i++)
+	for (i = 0; i < NUM_Q_COUNTERS && priv->q_counter; i++)
 		strcpy(data + (idx++) * ETH_GSTRING_LEN,
 		       q_stats_desc[i].format);
 
@@ -605,7 +593,7 @@ static MLX5E_DECLARE_STATS_GRP_OP_FILL_STATS(qcnt)
 {
 	int i;
 
-	for (i = 0; i < NUM_Q_COUNTERS && q_counter_any(priv); i++)
+	for (i = 0; i < NUM_Q_COUNTERS && priv->q_counter; i++)
 		data[idx++] = MLX5E_READ_CTR32_CPU(&priv->stats.qcnt,
 						   q_stats_desc, i);
 	for (i = 0; i < NUM_DROP_RQ_COUNTERS && priv->drop_rq_q_counter; i++)
@@ -619,23 +607,18 @@ static MLX5E_DECLARE_STATS_GRP_OP_UPDATE_STATS(qcnt)
 	struct mlx5e_qcounter_stats *qcnt = &priv->stats.qcnt;
 	u32 out[MLX5_ST_SZ_DW(query_q_counter_out)] = {};
 	u32 in[MLX5_ST_SZ_DW(query_q_counter_in)] = {};
-	struct mlx5_core_dev *pos;
-	u32 rx_out_of_buffer = 0;
-	int ret, i;
+	int ret;
 
 	MLX5_SET(query_q_counter_in, in, opcode, MLX5_CMD_OP_QUERY_Q_COUNTER);
 
-	mlx5_sd_for_each_dev(i, priv->mdev, pos) {
-		if (priv->q_counter[i]) {
-			MLX5_SET(query_q_counter_in, in, counter_set_id,
-				 priv->q_counter[i]);
-			ret = mlx5_cmd_exec_inout(pos, query_q_counter, in, out);
-			if (!ret)
-				rx_out_of_buffer += MLX5_GET(query_q_counter_out,
-							     out, out_of_buffer);
-		}
+	if (priv->q_counter) {
+		MLX5_SET(query_q_counter_in, in, counter_set_id,
+			 priv->q_counter);
+		ret = mlx5_cmd_exec_inout(priv->mdev, query_q_counter, in, out);
+		if (!ret)
+			qcnt->rx_out_of_buffer = MLX5_GET(query_q_counter_out,
+							  out, out_of_buffer);
 	}
-	qcnt->rx_out_of_buffer = rx_out_of_buffer;
 
 	if (priv->drop_rq_q_counter) {
 		MLX5_SET(query_q_counter_in, in, counter_set_id,
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
index b8ceb972df9e..30932c9c9a8f 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
@@ -766,7 +766,7 @@ static int mlx5e_hairpin_create_indirect_rqt(struct mlx5e_hairpin *hp)
 		return err;
 
 	mlx5e_rss_params_indir_init_uniform(&indir, hp->num_channels);
-	err = mlx5e_rqt_init_indir(&hp->indir_rqt, mdev, hp->pair->rqn, NULL, hp->num_channels,
+	err = mlx5e_rqt_init_indir(&hp->indir_rqt, mdev, hp->pair->rqn, hp->num_channels,
 				   mlx5e_rx_res_get_current_hash(priv->rx_res).hfunc,
 				   &indir);
 
@@ -1169,7 +1169,7 @@ static int mlx5e_hairpin_flow_add(struct mlx5e_priv *priv,
 			MLX5_CAP_GEN(priv->mdev, log_min_hairpin_wq_data_sz),
 			MLX5_CAP_GEN(priv->mdev, log_max_hairpin_wq_data_sz));
 
-	params.q_counter = priv->q_counter[0];
+	params.q_counter = priv->q_counter;
 	err = devl_param_driverinit_value_get(
 		devlink, MLX5_DEVLINK_PARAM_ID_HAIRPIN_NUM_QUEUES, &val);
 	if (err) {
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c
index 3bf419d06d53..3047d7015c52 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c
@@ -1665,7 +1665,7 @@ int mlx5_esw_sf_max_hpf_functions(struct mlx5_core_dev *dev, u16 *max_sfs, u16 *
 	void *hca_caps;
 	int err;
 
-	if (!mlx5_core_is_ecpf(dev) || mlx5_core_is_mgmt_pf(dev)) {
+	if (!mlx5_core_is_ecpf(dev)) {
 		*max_sfs = 0;
 		return 0;
 	}
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/ipoib/ipoib.c b/drivers/net/ethernet/mellanox/mlx5/core/ipoib/ipoib.c
index d77be1b4dd9c..58845121954c 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/ipoib/ipoib.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/ipoib/ipoib.c
@@ -783,7 +783,7 @@ static int mlx5_rdma_setup_rn(struct ib_device *ibdev, u32 port_num,
 		}
 
 		/* This should only be called once per mdev */
-		err = mlx5e_create_mdev_resources(mdev, false);
+		err = mlx5e_create_mdev_resources(mdev);
 		if (err)
 			goto destroy_ht;
 	}
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lib/devcom.h b/drivers/net/ethernet/mellanox/mlx5/core/lib/devcom.h
index d58032dd0df7..ec32b686f586 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/lib/devcom.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/lib/devcom.h
@@ -10,7 +10,6 @@ enum mlx5_devcom_component {
 	MLX5_DEVCOM_ESW_OFFLOADS,
 	MLX5_DEVCOM_MPV,
 	MLX5_DEVCOM_HCA_PORTS,
-	MLX5_DEVCOM_SD_GROUP,
 	MLX5_DEVCOM_NUM_COMPONENTS,
 };
 
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lib/mlx5.h b/drivers/net/ethernet/mellanox/mlx5/core/lib/mlx5.h
index 37d5f445598c..2b5826a785c4 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/lib/mlx5.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/lib/mlx5.h
@@ -54,16 +54,4 @@ static inline struct net_device *mlx5_uplink_netdev_get(struct mlx5_core_dev *md
 {
 	return mdev->mlx5e_res.uplink_netdev;
 }
-
-struct mlx5_sd;
-
-static inline struct mlx5_sd *mlx5_get_sd(struct mlx5_core_dev *dev)
-{
-	return dev->sd;
-}
-
-static inline void mlx5_set_sd(struct mlx5_core_dev *dev, struct mlx5_sd *sd)
-{
-	dev->sd = sd;
-}
 #endif
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lib/sd.c b/drivers/net/ethernet/mellanox/mlx5/core/lib/sd.c
deleted file mode 100644
index f68942277c62..000000000000
--- a/drivers/net/ethernet/mellanox/mlx5/core/lib/sd.c
+++ /dev/null
@@ -1,487 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB
-/* Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. */
-
-#include "lib/sd.h"
-#include "mlx5_core.h"
-#include "lib/mlx5.h"
-#include "fs_cmd.h"
-#include <linux/mlx5/vport.h>
-
-#define sd_info(__dev, format, ...) \
-	dev_info((__dev)->device, "Socket-Direct: " format, ##__VA_ARGS__)
-#define sd_warn(__dev, format, ...) \
-	dev_warn((__dev)->device, "Socket-Direct: " format, ##__VA_ARGS__)
-
-struct mlx5_sd {
-	u32 group_id;
-	u8 host_buses;
-	struct mlx5_devcom_comp_dev *devcom;
-	bool primary;
-	union {
-		struct { /* primary */
-			struct mlx5_core_dev *secondaries[MLX5_SD_MAX_GROUP_SZ - 1];
-			struct mlx5_flow_table *tx_ft;
-		};
-		struct { /* secondary */
-			struct mlx5_core_dev *primary_dev;
-			u32 alias_obj_id;
-		};
-	};
-};
-
-static int mlx5_sd_get_host_buses(struct mlx5_core_dev *dev)
-{
-	struct mlx5_sd *sd = mlx5_get_sd(dev);
-
-	if (!sd)
-		return 1;
-
-	return sd->host_buses;
-}
-
-static struct mlx5_core_dev *mlx5_sd_get_primary(struct mlx5_core_dev *dev)
-{
-	struct mlx5_sd *sd = mlx5_get_sd(dev);
-
-	if (!sd)
-		return dev;
-
-	return sd->primary ? dev : sd->primary_dev;
-}
-
-struct mlx5_core_dev *
-mlx5_sd_primary_get_peer(struct mlx5_core_dev *primary, int idx)
-{
-	struct mlx5_sd *sd;
-
-	if (idx == 0)
-		return primary;
-
-	if (idx >= mlx5_sd_get_host_buses(primary))
-		return NULL;
-
-	sd = mlx5_get_sd(primary);
-	return sd->secondaries[idx - 1];
-}
-
-int mlx5_sd_ch_ix_get_dev_ix(struct mlx5_core_dev *dev, int ch_ix)
-{
-	return ch_ix % mlx5_sd_get_host_buses(dev);
-}
-
-int mlx5_sd_ch_ix_get_vec_ix(struct mlx5_core_dev *dev, int ch_ix)
-{
-	return ch_ix / mlx5_sd_get_host_buses(dev);
-}
-
-struct mlx5_core_dev *mlx5_sd_ch_ix_get_dev(struct mlx5_core_dev *primary, int ch_ix)
-{
-	int mdev_idx = mlx5_sd_ch_ix_get_dev_ix(primary, ch_ix);
-
-	return mlx5_sd_primary_get_peer(primary, mdev_idx);
-}
-
-static bool ft_create_alias_supported(struct mlx5_core_dev *dev)
-{
-	u64 obj_allowed = MLX5_CAP_GEN_2_64(dev, allowed_object_for_other_vhca_access);
-	u32 obj_supp = MLX5_CAP_GEN_2(dev, cross_vhca_object_to_object_supported);
-
-	if (!(obj_supp &
-	    MLX5_CROSS_VHCA_OBJ_TO_OBJ_SUPPORTED_LOCAL_FLOW_TABLE_ROOT_TO_REMOTE_FLOW_TABLE))
-		return false;
-
-	if (!(obj_allowed & MLX5_ALLOWED_OBJ_FOR_OTHER_VHCA_ACCESS_FLOW_TABLE))
-		return false;
-
-	return true;
-}
-
-static bool mlx5_sd_is_supported(struct mlx5_core_dev *dev, u8 host_buses)
-{
-	/* Feature is currently implemented for PFs only */
-	if (!mlx5_core_is_pf(dev))
-		return false;
-
-	/* Honor the SW implementation limit */
-	if (host_buses > MLX5_SD_MAX_GROUP_SZ)
-		return false;
-
-	/* Disconnect secondaries from the network */
-	if (!MLX5_CAP_GEN(dev, eswitch_manager))
-		return false;
-	if (!MLX5_CAP_GEN(dev, silent_mode))
-		return false;
-
-	/* RX steering from primary to secondaries */
-	if (!MLX5_CAP_GEN(dev, cross_vhca_rqt))
-		return false;
-	if (host_buses > MLX5_CAP_GEN_2(dev, max_rqt_vhca_id))
-		return false;
-
-	/* TX steering from secondaries to primary */
-	if (!ft_create_alias_supported(dev))
-		return false;
-	if (!MLX5_CAP_FLOWTABLE_NIC_TX(dev, reset_root_to_default))
-		return false;
-
-	return true;
-}
-
-static int mlx5_query_sd(struct mlx5_core_dev *dev, bool *sdm,
-			 u8 *host_buses, u8 *sd_group)
-{
-	u32 out[MLX5_ST_SZ_DW(mpir_reg)];
-	int err;
-
-	err = mlx5_query_mpir_reg(dev, out);
-	if (err)
-		return err;
-
-	err = mlx5_query_nic_vport_sd_group(dev, sd_group);
-	if (err)
-		return err;
-
-	*sdm = MLX5_GET(mpir_reg, out, sdm);
-	*host_buses = MLX5_GET(mpir_reg, out, host_buses);
-
-	return 0;
-}
-
-static u32 mlx5_sd_group_id(struct mlx5_core_dev *dev, u8 sd_group)
-{
-	return (u32)((MLX5_CAP_GEN(dev, native_port_num) << 8) | sd_group);
-}
-
-static int sd_init(struct mlx5_core_dev *dev)
-{
-	u8 host_buses, sd_group;
-	struct mlx5_sd *sd;
-	u32 group_id;
-	bool sdm;
-	int err;
-
-	err = mlx5_query_sd(dev, &sdm, &host_buses, &sd_group);
-	if (err)
-		return err;
-
-	if (!sdm)
-		return 0;
-
-	if (!sd_group)
-		return 0;
-
-	group_id = mlx5_sd_group_id(dev, sd_group);
-
-	if (!mlx5_sd_is_supported(dev, host_buses)) {
-		sd_warn(dev, "can't support requested netdev combining for group id 0x%x), skipping\n",
-			group_id);
-		return 0;
-	}
-
-	sd = kzalloc(sizeof(*sd), GFP_KERNEL);
-	if (!sd)
-		return -ENOMEM;
-
-	sd->host_buses = host_buses;
-	sd->group_id = group_id;
-
-	mlx5_set_sd(dev, sd);
-
-	return 0;
-}
-
-static void sd_cleanup(struct mlx5_core_dev *dev)
-{
-	struct mlx5_sd *sd = mlx5_get_sd(dev);
-
-	mlx5_set_sd(dev, NULL);
-	kfree(sd);
-}
-
-static int sd_register(struct mlx5_core_dev *dev)
-{
-	struct mlx5_devcom_comp_dev *devcom, *pos;
-	struct mlx5_core_dev *peer, *primary;
-	struct mlx5_sd *sd, *primary_sd;
-	int err, i;
-
-	sd = mlx5_get_sd(dev);
-	devcom = mlx5_devcom_register_component(dev->priv.devc, MLX5_DEVCOM_SD_GROUP,
-						sd->group_id, NULL, dev);
-	if (!devcom)
-		return -ENOMEM;
-
-	sd->devcom = devcom;
-
-	if (mlx5_devcom_comp_get_size(devcom) != sd->host_buses)
-		return 0;
-
-	mlx5_devcom_comp_lock(devcom);
-	mlx5_devcom_comp_set_ready(devcom, true);
-	mlx5_devcom_comp_unlock(devcom);
-
-	if (!mlx5_devcom_for_each_peer_begin(devcom)) {
-		err = -ENODEV;
-		goto err_devcom_unreg;
-	}
-
-	primary = dev;
-	mlx5_devcom_for_each_peer_entry(devcom, peer, pos)
-		if (peer->pdev->bus->number < primary->pdev->bus->number)
-			primary = peer;
-
-	primary_sd = mlx5_get_sd(primary);
-	primary_sd->primary = true;
-	i = 0;
-	/* loop the secondaries */
-	mlx5_devcom_for_each_peer_entry(primary_sd->devcom, peer, pos) {
-		struct mlx5_sd *peer_sd = mlx5_get_sd(peer);
-
-		primary_sd->secondaries[i++] = peer;
-		peer_sd->primary = false;
-		peer_sd->primary_dev = primary;
-	}
-
-	mlx5_devcom_for_each_peer_end(devcom);
-	return 0;
-
-err_devcom_unreg:
-	mlx5_devcom_comp_lock(sd->devcom);
-	mlx5_devcom_comp_set_ready(sd->devcom, false);
-	mlx5_devcom_comp_unlock(sd->devcom);
-	mlx5_devcom_unregister_component(sd->devcom);
-	return err;
-}
-
-static void sd_unregister(struct mlx5_core_dev *dev)
-{
-	struct mlx5_sd *sd = mlx5_get_sd(dev);
-
-	mlx5_devcom_comp_lock(sd->devcom);
-	mlx5_devcom_comp_set_ready(sd->devcom, false);
-	mlx5_devcom_comp_unlock(sd->devcom);
-	mlx5_devcom_unregister_component(sd->devcom);
-}
-
-static int sd_cmd_set_primary(struct mlx5_core_dev *primary, u8 *alias_key)
-{
-	struct mlx5_cmd_allow_other_vhca_access_attr allow_attr = {};
-	struct mlx5_sd *sd = mlx5_get_sd(primary);
-	struct mlx5_flow_table_attr ft_attr = {};
-	struct mlx5_flow_namespace *nic_ns;
-	struct mlx5_flow_table *ft;
-	int err;
-
-	nic_ns = mlx5_get_flow_namespace(primary, MLX5_FLOW_NAMESPACE_EGRESS);
-	if (!nic_ns)
-		return -EOPNOTSUPP;
-
-	ft = mlx5_create_flow_table(nic_ns, &ft_attr);
-	if (IS_ERR(ft)) {
-		err = PTR_ERR(ft);
-		return err;
-	}
-	sd->tx_ft = ft;
-	memcpy(allow_attr.access_key, alias_key, ACCESS_KEY_LEN);
-	allow_attr.obj_type = MLX5_GENERAL_OBJECT_TYPES_FLOW_TABLE_ALIAS;
-	allow_attr.obj_id = (ft->type << FT_ID_FT_TYPE_OFFSET) | ft->id;
-
-	err = mlx5_cmd_allow_other_vhca_access(primary, &allow_attr);
-	if (err) {
-		mlx5_core_err(primary, "Failed to allow other vhca access err=%d\n",
-			      err);
-		mlx5_destroy_flow_table(ft);
-		return err;
-	}
-
-	return 0;
-}
-
-static void sd_cmd_unset_primary(struct mlx5_core_dev *primary)
-{
-	struct mlx5_sd *sd = mlx5_get_sd(primary);
-
-	mlx5_destroy_flow_table(sd->tx_ft);
-}
-
-static int sd_secondary_create_alias_ft(struct mlx5_core_dev *secondary,
-					struct mlx5_core_dev *primary,
-					struct mlx5_flow_table *ft,
-					u32 *obj_id, u8 *alias_key)
-{
-	u32 aliased_object_id = (ft->type << FT_ID_FT_TYPE_OFFSET) | ft->id;
-	u16 vhca_id_to_be_accessed = MLX5_CAP_GEN(primary, vhca_id);
-	struct mlx5_cmd_alias_obj_create_attr alias_attr = {};
-	int ret;
-
-	memcpy(alias_attr.access_key, alias_key, ACCESS_KEY_LEN);
-	alias_attr.obj_id = aliased_object_id;
-	alias_attr.obj_type = MLX5_GENERAL_OBJECT_TYPES_FLOW_TABLE_ALIAS;
-	alias_attr.vhca_id = vhca_id_to_be_accessed;
-	ret = mlx5_cmd_alias_obj_create(secondary, &alias_attr, obj_id);
-	if (ret) {
-		mlx5_core_err(secondary, "Failed to create alias object err=%d\n",
-			      ret);
-		return ret;
-	}
-
-	return 0;
-}
-
-static void sd_secondary_destroy_alias_ft(struct mlx5_core_dev *secondary)
-{
-	struct mlx5_sd *sd = mlx5_get_sd(secondary);
-
-	mlx5_cmd_alias_obj_destroy(secondary, sd->alias_obj_id,
-				   MLX5_GENERAL_OBJECT_TYPES_FLOW_TABLE_ALIAS);
-}
-
-static int sd_cmd_set_secondary(struct mlx5_core_dev *secondary,
-				struct mlx5_core_dev *primary,
-				u8 *alias_key)
-{
-	struct mlx5_sd *primary_sd = mlx5_get_sd(primary);
-	struct mlx5_sd *sd = mlx5_get_sd(secondary);
-	int err;
-
-	err = mlx5_fs_cmd_set_l2table_entry_silent(secondary, 1);
-	if (err)
-		return err;
-
-	err = sd_secondary_create_alias_ft(secondary, primary, primary_sd->tx_ft,
-					   &sd->alias_obj_id, alias_key);
-	if (err)
-		goto err_unset_silent;
-
-	err = mlx5_fs_cmd_set_tx_flow_table_root(secondary, sd->alias_obj_id, false);
-	if (err)
-		goto err_destroy_alias_ft;
-
-	return 0;
-
-err_destroy_alias_ft:
-	sd_secondary_destroy_alias_ft(secondary);
-err_unset_silent:
-	mlx5_fs_cmd_set_l2table_entry_silent(secondary, 0);
-	return err;
-}
-
-static void sd_cmd_unset_secondary(struct mlx5_core_dev *secondary)
-{
-	mlx5_fs_cmd_set_tx_flow_table_root(secondary, 0, true);
-	sd_secondary_destroy_alias_ft(secondary);
-	mlx5_fs_cmd_set_l2table_entry_silent(secondary, 0);
-}
-
-static void sd_print_group(struct mlx5_core_dev *primary)
-{
-	struct mlx5_sd *sd = mlx5_get_sd(primary);
-	struct mlx5_core_dev *pos;
-	int i;
-
-	sd_info(primary, "group id %#x, primary %s, vhca %u\n",
-		sd->group_id, pci_name(primary->pdev),
-		MLX5_CAP_GEN(primary, vhca_id));
-	mlx5_sd_for_each_secondary(i, primary, pos)
-		sd_info(primary, "group id %#x, secondary#%d %s, vhca %u\n",
-			sd->group_id, i - 1, pci_name(pos->pdev),
-			MLX5_CAP_GEN(pos, vhca_id));
-}
-
-int mlx5_sd_init(struct mlx5_core_dev *dev)
-{
-	struct mlx5_core_dev *primary, *pos, *to;
-	struct mlx5_sd *sd = mlx5_get_sd(dev);
-	u8 alias_key[ACCESS_KEY_LEN];
-	int err, i;
-
-	err = sd_init(dev);
-	if (err)
-		return err;
-
-	sd = mlx5_get_sd(dev);
-	if (!sd)
-		return 0;
-
-	err = sd_register(dev);
-	if (err)
-		goto err_sd_cleanup;
-
-	if (!mlx5_devcom_comp_is_ready(sd->devcom))
-		return 0;
-
-	primary = mlx5_sd_get_primary(dev);
-
-	for (i = 0; i < ACCESS_KEY_LEN; i++)
-		alias_key[i] = get_random_u8();
-
-	err = sd_cmd_set_primary(primary, alias_key);
-	if (err)
-		goto err_sd_unregister;
-
-	mlx5_sd_for_each_secondary(i, primary, pos) {
-		err = sd_cmd_set_secondary(pos, primary, alias_key);
-		if (err)
-			goto err_unset_secondaries;
-	}
-
-	sd_info(primary, "group id %#x, size %d, combined\n",
-		sd->group_id, mlx5_devcom_comp_get_size(sd->devcom));
-	sd_print_group(primary);
-
-	return 0;
-
-err_unset_secondaries:
-	to = pos;
-	mlx5_sd_for_each_secondary_to(i, primary, to, pos)
-		sd_cmd_unset_secondary(pos);
-	sd_cmd_unset_primary(primary);
-err_sd_unregister:
-	sd_unregister(dev);
-err_sd_cleanup:
-	sd_cleanup(dev);
-	return err;
-}
-
-void mlx5_sd_cleanup(struct mlx5_core_dev *dev)
-{
-	struct mlx5_sd *sd = mlx5_get_sd(dev);
-	struct mlx5_core_dev *primary, *pos;
-	int i;
-
-	if (!sd)
-		return;
-
-	if (!mlx5_devcom_comp_is_ready(sd->devcom))
-		goto out;
-
-	primary = mlx5_sd_get_primary(dev);
-	mlx5_sd_for_each_secondary(i, primary, pos)
-		sd_cmd_unset_secondary(pos);
-	sd_cmd_unset_primary(primary);
-
-	sd_info(primary, "group id %#x, uncombined\n", sd->group_id);
-out:
-	sd_unregister(dev);
-	sd_cleanup(dev);
-}
-
-struct auxiliary_device *mlx5_sd_get_adev(struct mlx5_core_dev *dev,
-					  struct auxiliary_device *adev,
-					  int idx)
-{
-	struct mlx5_sd *sd = mlx5_get_sd(dev);
-	struct mlx5_core_dev *primary;
-
-	if (!sd)
-		return adev;
-
-	if (!mlx5_devcom_comp_is_ready(sd->devcom))
-		return NULL;
-
-	primary = mlx5_sd_get_primary(dev);
-	if (dev == primary)
-		return adev;
-
-	return &primary->priv.adev[idx]->adev;
-}
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lib/sd.h b/drivers/net/ethernet/mellanox/mlx5/core/lib/sd.h
deleted file mode 100644
index 137efaf9aabc..000000000000
--- a/drivers/net/ethernet/mellanox/mlx5/core/lib/sd.h
+++ /dev/null
@@ -1,38 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */
-/* Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. */
-
-#ifndef __MLX5_LIB_SD_H__
-#define __MLX5_LIB_SD_H__
-
-#define MLX5_SD_MAX_GROUP_SZ 2
-
-struct mlx5_sd;
-
-struct mlx5_core_dev *mlx5_sd_primary_get_peer(struct mlx5_core_dev *primary, int idx);
-int mlx5_sd_ch_ix_get_dev_ix(struct mlx5_core_dev *dev, int ch_ix);
-int mlx5_sd_ch_ix_get_vec_ix(struct mlx5_core_dev *dev, int ch_ix);
-struct mlx5_core_dev *mlx5_sd_ch_ix_get_dev(struct mlx5_core_dev *primary, int ch_ix);
-struct auxiliary_device *mlx5_sd_get_adev(struct mlx5_core_dev *dev,
-					  struct auxiliary_device *adev,
-					  int idx);
-
-int mlx5_sd_init(struct mlx5_core_dev *dev);
-void mlx5_sd_cleanup(struct mlx5_core_dev *dev);
-
-#define mlx5_sd_for_each_dev_from_to(i, primary, ix_from, to, pos)	\
-	for (i = ix_from;							\
-	     (pos = mlx5_sd_primary_get_peer(primary, i)) && pos != (to); i++)
-
-#define mlx5_sd_for_each_dev(i, primary, pos)				\
-	mlx5_sd_for_each_dev_from_to(i, primary, 0, NULL, pos)
-
-#define mlx5_sd_for_each_dev_to(i, primary, to, pos)			\
-	mlx5_sd_for_each_dev_from_to(i, primary, 0, to, pos)
-
-#define mlx5_sd_for_each_secondary(i, primary, pos)			\
-	mlx5_sd_for_each_dev_from_to(i, primary, 1, NULL, pos)
-
-#define mlx5_sd_for_each_secondary_to(i, primary, to, pos)		\
-	mlx5_sd_for_each_dev_from_to(i, primary, 1, to, pos)
-
-#endif /* __MLX5_LIB_SD_H__ */
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/vport.c b/drivers/net/ethernet/mellanox/mlx5/core/vport.c
index 1005bb6935b6..21753f327868 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/vport.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/vport.c
@@ -440,27 +440,6 @@ out:
 }
 EXPORT_SYMBOL_GPL(mlx5_query_nic_vport_system_image_guid);
 
-int mlx5_query_nic_vport_sd_group(struct mlx5_core_dev *mdev, u8 *sd_group)
-{
-	int outlen = MLX5_ST_SZ_BYTES(query_nic_vport_context_out);
-	u32 *out;
-	int err;
-
-	out = kvzalloc(outlen, GFP_KERNEL);
-	if (!out)
-		return -ENOMEM;
-
-	err = mlx5_query_nic_vport_context(mdev, 0, out);
-	if (err)
-		goto out;
-
-	*sd_group = MLX5_GET(query_nic_vport_context_out, out,
-			     nic_vport_context.sd_group);
-out:
-	kvfree(out);
-	return err;
-}
-
 int mlx5_query_nic_vport_node_guid(struct mlx5_core_dev *mdev, u64 *node_guid)
 {
 	u32 *out;
diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h
index 2bba88c67f58..7ee5b79ff3d6 100644
--- a/include/linux/mlx5/driver.h
+++ b/include/linux/mlx5/driver.h
@@ -681,7 +681,6 @@ struct mlx5e_resources {
 		struct mlx5_sq_bfreg       bfreg;
 #define MLX5_MAX_NUM_TC 8
 		u32                        tisn[MLX5_MAX_PORTS][MLX5_MAX_NUM_TC];
-		bool			   tisn_valid;
 	} hw_objs;
 	struct net_device *uplink_netdev;
 	struct mutex uplink_netdev_lock;
@@ -822,7 +821,6 @@ struct mlx5_core_dev {
 	struct blocking_notifier_head macsec_nh;
 #endif
 	u64 num_ipsec_offloads;
-	struct mlx5_sd          *sd;
 };
 
 struct mlx5_db {
@@ -1224,14 +1222,6 @@ static inline bool mlx5_core_is_ecpf(const struct mlx5_core_dev *dev)
 	return dev->caps.embedded_cpu;
 }
 
-static inline bool mlx5_core_is_mgmt_pf(const struct mlx5_core_dev *dev)
-{
-	if (!MLX5_CAP_GEN_2(dev, local_mng_port_valid))
-		return false;
-
-	return MLX5_CAP_GEN_2(dev, local_mng_port);
-}
-
 static inline bool
 mlx5_core_is_ecpf_esw_manager(const struct mlx5_core_dev *dev)
 {
diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h
index 586569209254..fee20fc010c2 100644
--- a/include/linux/mlx5/mlx5_ifc.h
+++ b/include/linux/mlx5/mlx5_ifc.h
@@ -1954,10 +1954,8 @@ enum {
 struct mlx5_ifc_cmd_hca_cap_2_bits {
 	u8	   reserved_at_0[0x80];
 
-	u8	   migratable[0x1];
-	u8	   reserved_at_81[0x19];
-	u8	   local_mng_port[0x1];
-	u8	   reserved_at_9b[0x5];
+	u8         migratable[0x1];
+	u8         reserved_at_81[0x1f];
 
 	u8	   max_reformat_insert_size[0x8];
 	u8	   max_reformat_insert_offset[0x8];
@@ -1975,13 +1973,7 @@ struct mlx5_ifc_cmd_hca_cap_2_bits {
 
 	u8	   allowed_object_for_other_vhca_access[0x40];
 
-	u8	   reserved_at_140[0x20];
-
-	u8	   reserved_at_160[0xa];
-	u8	   local_mng_port_valid[0x1];
-	u8	   reserved_at_16b[0x15];
-
-	u8	   reserved_at_180[0x20];
+	u8	   reserved_at_140[0x60];
 
 	u8	   flow_table_type_2_type[0x8];
 	u8	   reserved_at_1a8[0x3];
@@ -4038,13 +4030,8 @@ struct mlx5_ifc_nic_vport_context_bits {
 	u8	   affiliation_criteria[0x4];
 	u8	   affiliated_vhca_id[0x10];
 
-	u8	   reserved_at_60[0xa0];
+	u8	   reserved_at_60[0xd0];
 
-	u8	   reserved_at_100[0x1];
-	u8         sd_group[0x3];
-	u8	   reserved_at_104[0x1c];
-
-	u8	   reserved_at_120[0x10];
 	u8         mtu[0x10];
 
 	u8         system_image_guid[0x40];
@@ -10129,7 +10116,8 @@ struct mlx5_ifc_mpir_reg_bits {
 	u8         reserved_at_20[0x20];
 
 	u8         local_port[0x8];
-	u8         reserved_at_28[0x18];
+	u8         reserved_at_28[0x15];
+	u8         sd_group[0x3];
 
 	u8         reserved_at_60[0x20];
 };
diff --git a/include/linux/mlx5/vport.h b/include/linux/mlx5/vport.h
index c36cc6d82926..fbb9bf447889 100644
--- a/include/linux/mlx5/vport.h
+++ b/include/linux/mlx5/vport.h
@@ -72,7 +72,6 @@ int mlx5_query_nic_vport_mtu(struct mlx5_core_dev *mdev, u16 *mtu);
 int mlx5_modify_nic_vport_mtu(struct mlx5_core_dev *mdev, u16 mtu);
 int mlx5_query_nic_vport_system_image_guid(struct mlx5_core_dev *mdev,
 					   u64 *system_image_guid);
-int mlx5_query_nic_vport_sd_group(struct mlx5_core_dev *mdev, u8 *sd_group);
 int mlx5_query_nic_vport_node_guid(struct mlx5_core_dev *mdev, u64 *node_guid);
 int mlx5_modify_nic_vport_node_guid(struct mlx5_core_dev *mdev,
 				    u16 vport, u64 node_guid);
-- 
cgit v1.2.3


From 4e33b071bb8e8415fb9847249ffcf300fa7d8cac Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Thu, 28 Dec 2023 07:51:41 +0000
Subject: block: remove disk_clear_zoned

disk_clear_zoned is unused now that the last warts of the host-aware
model support in sd are gone.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Hannes Reinecke <hare@suse.de>
Reviewed-by: Damien Le Moal <dlemoal@kernel.org>
Link: https://lore.kernel.org/r/20231228075141.362560-3-hch@lst.de
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/blk-zoned.c      | 21 ---------------------
 include/linux/blkdev.h |  1 -
 2 files changed, 22 deletions(-)

(limited to 'include/linux')

diff --git a/block/blk-zoned.c b/block/blk-zoned.c
index c59d44ee6b23..623879d875a4 100644
--- a/block/blk-zoned.c
+++ b/block/blk-zoned.c
@@ -615,24 +615,3 @@ int blk_revalidate_disk_zones(struct gendisk *disk,
 	return ret;
 }
 EXPORT_SYMBOL_GPL(blk_revalidate_disk_zones);
-
-void disk_clear_zoned(struct gendisk *disk)
-{
-	struct request_queue *q = disk->queue;
-
-	blk_mq_freeze_queue(q);
-
-	q->limits.zoned = false;
-	disk_free_zone_bitmaps(disk);
-	blk_queue_flag_clear(QUEUE_FLAG_ZONE_RESETALL, q);
-	q->required_elevator_features &= ~ELEVATOR_F_ZBD_SEQ_WRITE;
-	disk->nr_zones = 0;
-	disk->max_open_zones = 0;
-	disk->max_active_zones = 0;
-	q->limits.chunk_sectors = 0;
-	q->limits.zone_write_granularity = 0;
-	q->limits.max_zone_append_sectors = 0;
-
-	blk_mq_unfreeze_queue(q);
-}
-EXPORT_SYMBOL_GPL(disk_clear_zoned);
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 94701a63ad8a..e1e705aef51e 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -318,7 +318,6 @@ typedef int (*report_zones_cb)(struct blk_zone *zone, unsigned int idx,
 			       void *data);
 
 void disk_set_zoned(struct gendisk *disk);
-void disk_clear_zoned(struct gendisk *disk);
 
 #define BLK_ALL_ZONES  ((unsigned int)-1)
 int blkdev_report_zones(struct block_device *bdev, sector_t sector,
-- 
cgit v1.2.3


From 3b7cb745473aec7255d66e3854abaa9c3f46f952 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Mon, 8 Jan 2024 11:50:16 -0700
Subject: block: move __get_task_ioprio() into header file

We call this once per IO, which can be millions of times per second.
Since nobody really uses io priorities, or at least it isn't very
common, this is all wasted time and can amount to as much as 3% of
the total kernel time.

Reviewed-by: Bart Van Assche <bvanassche@acm.org>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/ioprio.c         | 26 --------------------------
 include/linux/ioprio.h | 25 ++++++++++++++++++++++++-
 2 files changed, 24 insertions(+), 27 deletions(-)

(limited to 'include/linux')

diff --git a/block/ioprio.c b/block/ioprio.c
index b5a942519a79..73301a261429 100644
--- a/block/ioprio.c
+++ b/block/ioprio.c
@@ -139,32 +139,6 @@ out:
 	return ret;
 }
 
-/*
- * If the task has set an I/O priority, use that. Otherwise, return
- * the default I/O priority.
- *
- * Expected to be called for current task or with task_lock() held to keep
- * io_context stable.
- */
-int __get_task_ioprio(struct task_struct *p)
-{
-	struct io_context *ioc = p->io_context;
-	int prio;
-
-	if (p != current)
-		lockdep_assert_held(&p->alloc_lock);
-	if (ioc)
-		prio = ioc->ioprio;
-	else
-		prio = IOPRIO_DEFAULT;
-
-	if (IOPRIO_PRIO_CLASS(prio) == IOPRIO_CLASS_NONE)
-		prio = IOPRIO_PRIO_VALUE(task_nice_ioclass(p),
-					 task_nice_ioprio(p));
-	return prio;
-}
-EXPORT_SYMBOL_GPL(__get_task_ioprio);
-
 static int get_task_ioprio(struct task_struct *p)
 {
 	int ret;
diff --git a/include/linux/ioprio.h b/include/linux/ioprio.h
index 7578d4f6a969..d6a9b5b7ed16 100644
--- a/include/linux/ioprio.h
+++ b/include/linux/ioprio.h
@@ -47,7 +47,30 @@ static inline int task_nice_ioclass(struct task_struct *task)
 }
 
 #ifdef CONFIG_BLOCK
-int __get_task_ioprio(struct task_struct *p);
+/*
+ * If the task has set an I/O priority, use that. Otherwise, return
+ * the default I/O priority.
+ *
+ * Expected to be called for current task or with task_lock() held to keep
+ * io_context stable.
+ */
+static inline int __get_task_ioprio(struct task_struct *p)
+{
+	struct io_context *ioc = p->io_context;
+	int prio;
+
+	if (p != current)
+		lockdep_assert_held(&p->alloc_lock);
+	if (ioc)
+		prio = ioc->ioprio;
+	else
+		prio = IOPRIO_DEFAULT;
+
+	if (IOPRIO_PRIO_CLASS(prio) == IOPRIO_CLASS_NONE)
+		prio = IOPRIO_PRIO_VALUE(task_nice_ioclass(p),
+					 task_nice_ioprio(p));
+	return prio;
+}
 #else
 static inline int __get_task_ioprio(struct task_struct *p)
 {
-- 
cgit v1.2.3


From 53889bcaf536b3abedeaf104019877cee37dd08b Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Mon, 8 Jan 2024 11:51:57 -0700
Subject: block: make __get_task_ioprio() easier to read

We don't need to do any gymnastics if we don't have an io_context
assigned at all, so just return early with our default priority.

Reviewed-by: Bart Van Assche <bvanassche@acm.org>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/ioprio.h | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/ioprio.h b/include/linux/ioprio.h
index d6a9b5b7ed16..db1249cd9692 100644
--- a/include/linux/ioprio.h
+++ b/include/linux/ioprio.h
@@ -59,13 +59,13 @@ static inline int __get_task_ioprio(struct task_struct *p)
 	struct io_context *ioc = p->io_context;
 	int prio;
 
+	if (!ioc)
+		return IOPRIO_DEFAULT;
+
 	if (p != current)
 		lockdep_assert_held(&p->alloc_lock);
-	if (ioc)
-		prio = ioc->ioprio;
-	else
-		prio = IOPRIO_DEFAULT;
 
+	prio = ioc->ioprio;
 	if (IOPRIO_PRIO_CLASS(prio) == IOPRIO_CLASS_NONE)
 		prio = IOPRIO_PRIO_VALUE(task_nice_ioclass(p),
 					 task_nice_ioprio(p));
-- 
cgit v1.2.3


From fd37721803c6e73619108f76ad2e12a9aa5fafaf Mon Sep 17 00:00:00 2001
From: "Kirill A. Shutemov" <kirill.shutemov@linux.intel.com>
Date: Thu, 28 Dec 2023 17:47:03 +0300
Subject: mm, treewide: introduce NR_PAGE_ORDERS

NR_PAGE_ORDERS defines the number of page orders supported by the page
allocator, ranging from 0 to MAX_ORDER, MAX_ORDER + 1 in total.

NR_PAGE_ORDERS assists in defining arrays of page orders and allows for
more natural iteration over them.

[kirill.shutemov@linux.intel.com: fixup for kerneldoc warning]
  Link: https://lkml.kernel.org/r/20240101111512.7empzyifq7kxtzk3@box
Link: https://lkml.kernel.org/r/20231228144704.14033-1-kirill.shutemov@linux.intel.com
Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Reviewed-by: Zi Yan <ziy@nvidia.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 Documentation/admin-guide/kdump/vmcoreinfo.rst |  6 +++---
 arch/arm64/kvm/hyp/include/nvhe/gfp.h          |  2 +-
 arch/sparc/kernel/traps_64.c                   |  2 +-
 drivers/gpu/drm/ttm/tests/ttm_device_test.c    |  2 +-
 drivers/gpu/drm/ttm/ttm_pool.c                 | 20 ++++++++++----------
 include/drm/ttm/ttm_pool.h                     |  2 +-
 include/linux/mmzone.h                         |  6 ++++--
 kernel/crash_core.c                            |  2 +-
 lib/test_meminit.c                             |  2 +-
 mm/compaction.c                                |  2 +-
 mm/kmsan/init.c                                |  2 +-
 mm/page_alloc.c                                | 13 ++++++-------
 mm/page_reporting.c                            |  2 +-
 mm/show_mem.c                                  |  8 ++++----
 mm/vmstat.c                                    | 12 ++++++------
 15 files changed, 42 insertions(+), 41 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/admin-guide/kdump/vmcoreinfo.rst b/Documentation/admin-guide/kdump/vmcoreinfo.rst
index 78e4d2e7ba14..3f8769e46b07 100644
--- a/Documentation/admin-guide/kdump/vmcoreinfo.rst
+++ b/Documentation/admin-guide/kdump/vmcoreinfo.rst
@@ -172,7 +172,7 @@ variables.
 Offset of the free_list's member. This value is used to compute the number
 of free pages.
 
-Each zone has a free_area structure array called free_area[MAX_ORDER + 1].
+Each zone has a free_area structure array called free_area[NR_PAGE_ORDERS].
 The free_list represents a linked list of free page blocks.
 
 (list_head, next|prev)
@@ -189,8 +189,8 @@ Offsets of the vmap_area's members. They carry vmalloc-specific
 information. Makedumpfile gets the start address of the vmalloc region
 from this.
 
-(zone.free_area, MAX_ORDER + 1)
--------------------------------
+(zone.free_area, NR_PAGE_ORDERS)
+--------------------------------
 
 Free areas descriptor. User-space tools use this value to iterate the
 free_area ranges. MAX_ORDER is used by the zone buddy allocator.
diff --git a/arch/arm64/kvm/hyp/include/nvhe/gfp.h b/arch/arm64/kvm/hyp/include/nvhe/gfp.h
index fe5472a184a3..97c527ef53c2 100644
--- a/arch/arm64/kvm/hyp/include/nvhe/gfp.h
+++ b/arch/arm64/kvm/hyp/include/nvhe/gfp.h
@@ -16,7 +16,7 @@ struct hyp_pool {
 	 * API at EL2.
 	 */
 	hyp_spinlock_t lock;
-	struct list_head free_area[MAX_ORDER + 1];
+	struct list_head free_area[NR_PAGE_ORDERS];
 	phys_addr_t range_start;
 	phys_addr_t range_end;
 	unsigned short max_order;
diff --git a/arch/sparc/kernel/traps_64.c b/arch/sparc/kernel/traps_64.c
index 08ffd17d5ec3..523a6e5ee925 100644
--- a/arch/sparc/kernel/traps_64.c
+++ b/arch/sparc/kernel/traps_64.c
@@ -897,7 +897,7 @@ void __init cheetah_ecache_flush_init(void)
 
 	/* Now allocate error trap reporting scoreboard. */
 	sz = NR_CPUS * (2 * sizeof(struct cheetah_err_info));
-	for (order = 0; order <= MAX_ORDER; order++) {
+	for (order = 0; order < NR_PAGE_ORDERS; order++) {
 		if ((PAGE_SIZE << order) >= sz)
 			break;
 	}
diff --git a/drivers/gpu/drm/ttm/tests/ttm_device_test.c b/drivers/gpu/drm/ttm/tests/ttm_device_test.c
index b1b423b68cdf..19eaff22e6ae 100644
--- a/drivers/gpu/drm/ttm/tests/ttm_device_test.c
+++ b/drivers/gpu/drm/ttm/tests/ttm_device_test.c
@@ -175,7 +175,7 @@ static void ttm_device_init_pools(struct kunit *test)
 
 	if (params->pools_init_expected) {
 		for (int i = 0; i < TTM_NUM_CACHING_TYPES; ++i) {
-			for (int j = 0; j <= MAX_ORDER; ++j) {
+			for (int j = 0; j < NR_PAGE_ORDERS; ++j) {
 				pt = pool->caching[i].orders[j];
 				KUNIT_EXPECT_PTR_EQ(test, pt.pool, pool);
 				KUNIT_EXPECT_EQ(test, pt.caching, i);
diff --git a/drivers/gpu/drm/ttm/ttm_pool.c b/drivers/gpu/drm/ttm/ttm_pool.c
index fe610a3cace0..d183bb97c526 100644
--- a/drivers/gpu/drm/ttm/ttm_pool.c
+++ b/drivers/gpu/drm/ttm/ttm_pool.c
@@ -65,11 +65,11 @@ module_param(page_pool_size, ulong, 0644);
 
 static atomic_long_t allocated_pages;
 
-static struct ttm_pool_type global_write_combined[MAX_ORDER + 1];
-static struct ttm_pool_type global_uncached[MAX_ORDER + 1];
+static struct ttm_pool_type global_write_combined[NR_PAGE_ORDERS];
+static struct ttm_pool_type global_uncached[NR_PAGE_ORDERS];
 
-static struct ttm_pool_type global_dma32_write_combined[MAX_ORDER + 1];
-static struct ttm_pool_type global_dma32_uncached[MAX_ORDER + 1];
+static struct ttm_pool_type global_dma32_write_combined[NR_PAGE_ORDERS];
+static struct ttm_pool_type global_dma32_uncached[NR_PAGE_ORDERS];
 
 static spinlock_t shrinker_lock;
 static struct list_head shrinker_list;
@@ -568,7 +568,7 @@ void ttm_pool_init(struct ttm_pool *pool, struct device *dev,
 
 	if (use_dma_alloc || nid != NUMA_NO_NODE) {
 		for (i = 0; i < TTM_NUM_CACHING_TYPES; ++i)
-			for (j = 0; j <= MAX_ORDER; ++j)
+			for (j = 0; j < NR_PAGE_ORDERS; ++j)
 				ttm_pool_type_init(&pool->caching[i].orders[j],
 						   pool, i, j);
 	}
@@ -601,7 +601,7 @@ void ttm_pool_fini(struct ttm_pool *pool)
 
 	if (pool->use_dma_alloc || pool->nid != NUMA_NO_NODE) {
 		for (i = 0; i < TTM_NUM_CACHING_TYPES; ++i)
-			for (j = 0; j <= MAX_ORDER; ++j)
+			for (j = 0; j < NR_PAGE_ORDERS; ++j)
 				ttm_pool_type_fini(&pool->caching[i].orders[j]);
 	}
 
@@ -656,7 +656,7 @@ static void ttm_pool_debugfs_header(struct seq_file *m)
 	unsigned int i;
 
 	seq_puts(m, "\t ");
-	for (i = 0; i <= MAX_ORDER; ++i)
+	for (i = 0; i < NR_PAGE_ORDERS; ++i)
 		seq_printf(m, " ---%2u---", i);
 	seq_puts(m, "\n");
 }
@@ -667,7 +667,7 @@ static void ttm_pool_debugfs_orders(struct ttm_pool_type *pt,
 {
 	unsigned int i;
 
-	for (i = 0; i <= MAX_ORDER; ++i)
+	for (i = 0; i < NR_PAGE_ORDERS; ++i)
 		seq_printf(m, " %8u", ttm_pool_type_count(&pt[i]));
 	seq_puts(m, "\n");
 }
@@ -776,7 +776,7 @@ int ttm_pool_mgr_init(unsigned long num_pages)
 	spin_lock_init(&shrinker_lock);
 	INIT_LIST_HEAD(&shrinker_list);
 
-	for (i = 0; i <= MAX_ORDER; ++i) {
+	for (i = 0; i < NR_PAGE_ORDERS; ++i) {
 		ttm_pool_type_init(&global_write_combined[i], NULL,
 				   ttm_write_combined, i);
 		ttm_pool_type_init(&global_uncached[i], NULL, ttm_uncached, i);
@@ -816,7 +816,7 @@ void ttm_pool_mgr_fini(void)
 {
 	unsigned int i;
 
-	for (i = 0; i <= MAX_ORDER; ++i) {
+	for (i = 0; i < NR_PAGE_ORDERS; ++i) {
 		ttm_pool_type_fini(&global_write_combined[i]);
 		ttm_pool_type_fini(&global_uncached[i]);
 
diff --git a/include/drm/ttm/ttm_pool.h b/include/drm/ttm/ttm_pool.h
index 30a347e5aa11..4490d43c63e3 100644
--- a/include/drm/ttm/ttm_pool.h
+++ b/include/drm/ttm/ttm_pool.h
@@ -74,7 +74,7 @@ struct ttm_pool {
 	bool use_dma32;
 
 	struct {
-		struct ttm_pool_type orders[MAX_ORDER + 1];
+		struct ttm_pool_type orders[NR_PAGE_ORDERS];
 	} caching[TTM_NUM_CACHING_TYPES];
 };
 
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index c18c53353b50..1ea7636dfb76 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -35,6 +35,8 @@
 
 #define IS_MAX_ORDER_ALIGNED(pfn) IS_ALIGNED(pfn, MAX_ORDER_NR_PAGES)
 
+#define NR_PAGE_ORDERS (MAX_ORDER + 1)
+
 /*
  * PAGE_ALLOC_COSTLY_ORDER is the order at which allocations are deemed
  * costly to service.  That is between allocation orders which should
@@ -96,7 +98,7 @@ static inline bool migratetype_is_mergeable(int mt)
 }
 
 #define for_each_migratetype_order(order, type) \
-	for (order = 0; order <= MAX_ORDER; order++) \
+	for (order = 0; order < NR_PAGE_ORDERS; order++) \
 		for (type = 0; type < MIGRATE_TYPES; type++)
 
 extern int page_group_by_mobility_disabled;
@@ -933,7 +935,7 @@ struct zone {
 	CACHELINE_PADDING(_pad1_);
 
 	/* free areas of different sizes */
-	struct free_area	free_area[MAX_ORDER + 1];
+	struct free_area	free_area[NR_PAGE_ORDERS];
 
 #ifdef CONFIG_UNACCEPTED_MEMORY
 	/* Pages to be accepted. All pages on the list are MAX_ORDER */
diff --git a/kernel/crash_core.c b/kernel/crash_core.c
index d4313b53837e..56cf4ad7abbb 100644
--- a/kernel/crash_core.c
+++ b/kernel/crash_core.c
@@ -802,7 +802,7 @@ static int __init crash_save_vmcoreinfo_init(void)
 	VMCOREINFO_OFFSET(list_head, prev);
 	VMCOREINFO_OFFSET(vmap_area, va_start);
 	VMCOREINFO_OFFSET(vmap_area, list);
-	VMCOREINFO_LENGTH(zone.free_area, MAX_ORDER + 1);
+	VMCOREINFO_LENGTH(zone.free_area, NR_PAGE_ORDERS);
 	log_buf_vmcoreinfo_setup();
 	VMCOREINFO_LENGTH(free_area.free_list, MIGRATE_TYPES);
 	VMCOREINFO_NUMBER(NR_FREE_PAGES);
diff --git a/lib/test_meminit.c b/lib/test_meminit.c
index 0ae35223d773..0dc173849a54 100644
--- a/lib/test_meminit.c
+++ b/lib/test_meminit.c
@@ -93,7 +93,7 @@ static int __init test_pages(int *total_failures)
 	int failures = 0, num_tests = 0;
 	int i;
 
-	for (i = 0; i <= MAX_ORDER; i++)
+	for (i = 0; i < NR_PAGE_ORDERS; i++)
 		num_tests += do_alloc_pages_order(i, &failures);
 
 	REPORT_FAILURES_IN_FN();
diff --git a/mm/compaction.c b/mm/compaction.c
index de15a2ef0af5..24f8eb4d6260 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -2229,7 +2229,7 @@ static enum compact_result __compact_finished(struct compact_control *cc)
 
 	/* Direct compactor: Is a suitable page free? */
 	ret = COMPACT_NO_SUITABLE_PAGE;
-	for (order = cc->order; order <= MAX_ORDER; order++) {
+	for (order = cc->order; order < NR_PAGE_ORDERS; order++) {
 		struct free_area *area = &cc->zone->free_area[order];
 		bool can_steal;
 
diff --git a/mm/kmsan/init.c b/mm/kmsan/init.c
index ffedf4dbc49d..103e2e88ea03 100644
--- a/mm/kmsan/init.c
+++ b/mm/kmsan/init.c
@@ -96,7 +96,7 @@ void __init kmsan_init_shadow(void)
 struct metadata_page_pair {
 	struct page *shadow, *origin;
 };
-static struct metadata_page_pair held_back[MAX_ORDER + 1] __initdata;
+static struct metadata_page_pair held_back[NR_PAGE_ORDERS] __initdata;
 
 /*
  * Eager metadata allocation. When the memblock allocator is freeing pages to
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 5526797b7f96..ccecf6158ae4 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1560,7 +1560,7 @@ struct page *__rmqueue_smallest(struct zone *zone, unsigned int order,
 	struct page *page;
 
 	/* Find a page of the appropriate size in the preferred list */
-	for (current_order = order; current_order <= MAX_ORDER; ++current_order) {
+	for (current_order = order; current_order < NR_PAGE_ORDERS; ++current_order) {
 		area = &(zone->free_area[current_order]);
 		page = get_page_from_free_area(area, migratetype);
 		if (!page)
@@ -1934,7 +1934,7 @@ static bool unreserve_highatomic_pageblock(const struct alloc_context *ac,
 			continue;
 
 		spin_lock_irqsave(&zone->lock, flags);
-		for (order = 0; order <= MAX_ORDER; order++) {
+		for (order = 0; order < NR_PAGE_ORDERS; order++) {
 			struct free_area *area = &(zone->free_area[order]);
 
 			page = get_page_from_free_area(area, MIGRATE_HIGHATOMIC);
@@ -2044,8 +2044,7 @@ __rmqueue_fallback(struct zone *zone, int order, int start_migratetype,
 	return false;
 
 find_smallest:
-	for (current_order = order; current_order <= MAX_ORDER;
-							current_order++) {
+	for (current_order = order; current_order < NR_PAGE_ORDERS; current_order++) {
 		area = &(zone->free_area[current_order]);
 		fallback_mt = find_suitable_fallback(area, current_order,
 				start_migratetype, false, &can_steal);
@@ -3000,7 +2999,7 @@ bool __zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark,
 		return true;
 
 	/* For a high-order request, check at least one suitable page is free */
-	for (o = order; o <= MAX_ORDER; o++) {
+	for (o = order; o < NR_PAGE_ORDERS; o++) {
 		struct free_area *area = &z->free_area[o];
 		int mt;
 
@@ -6628,7 +6627,7 @@ bool is_free_buddy_page(struct page *page)
 	unsigned long pfn = page_to_pfn(page);
 	unsigned int order;
 
-	for (order = 0; order <= MAX_ORDER; order++) {
+	for (order = 0; order < NR_PAGE_ORDERS; order++) {
 		struct page *page_head = page - (pfn & ((1 << order) - 1));
 
 		if (PageBuddy(page_head) &&
@@ -6683,7 +6682,7 @@ bool take_page_off_buddy(struct page *page)
 	bool ret = false;
 
 	spin_lock_irqsave(&zone->lock, flags);
-	for (order = 0; order <= MAX_ORDER; order++) {
+	for (order = 0; order < NR_PAGE_ORDERS; order++) {
 		struct page *page_head = page - (pfn & ((1 << order) - 1));
 		int page_order = buddy_order(page_head);
 
diff --git a/mm/page_reporting.c b/mm/page_reporting.c
index b021f482a4cb..66369cc5279b 100644
--- a/mm/page_reporting.c
+++ b/mm/page_reporting.c
@@ -276,7 +276,7 @@ page_reporting_process_zone(struct page_reporting_dev_info *prdev,
 		return err;
 
 	/* Process each free list starting from lowest order/mt */
-	for (order = page_reporting_order; order <= MAX_ORDER; order++) {
+	for (order = page_reporting_order; order < NR_PAGE_ORDERS; order++) {
 		for (mt = 0; mt < MIGRATE_TYPES; mt++) {
 			/* We do not pull pages from the isolate free list */
 			if (is_migrate_isolate(mt))
diff --git a/mm/show_mem.c b/mm/show_mem.c
index ba0808d6917f..8dcfafbd283c 100644
--- a/mm/show_mem.c
+++ b/mm/show_mem.c
@@ -352,8 +352,8 @@ static void show_free_areas(unsigned int filter, nodemask_t *nodemask, int max_z
 
 	for_each_populated_zone(zone) {
 		unsigned int order;
-		unsigned long nr[MAX_ORDER + 1], flags, total = 0;
-		unsigned char types[MAX_ORDER + 1];
+		unsigned long nr[NR_PAGE_ORDERS], flags, total = 0;
+		unsigned char types[NR_PAGE_ORDERS];
 
 		if (zone_idx(zone) > max_zone_idx)
 			continue;
@@ -363,7 +363,7 @@ static void show_free_areas(unsigned int filter, nodemask_t *nodemask, int max_z
 		printk(KERN_CONT "%s: ", zone->name);
 
 		spin_lock_irqsave(&zone->lock, flags);
-		for (order = 0; order <= MAX_ORDER; order++) {
+		for (order = 0; order < NR_PAGE_ORDERS; order++) {
 			struct free_area *area = &zone->free_area[order];
 			int type;
 
@@ -377,7 +377,7 @@ static void show_free_areas(unsigned int filter, nodemask_t *nodemask, int max_z
 			}
 		}
 		spin_unlock_irqrestore(&zone->lock, flags);
-		for (order = 0; order <= MAX_ORDER; order++) {
+		for (order = 0; order < NR_PAGE_ORDERS; order++) {
 			printk(KERN_CONT "%lu*%lukB ",
 			       nr[order], K(1UL) << order);
 			if (nr[order])
diff --git a/mm/vmstat.c b/mm/vmstat.c
index 1437ca2f28c5..03ead31c46a0 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -1059,7 +1059,7 @@ static void fill_contig_page_info(struct zone *zone,
 	info->free_blocks_total = 0;
 	info->free_blocks_suitable = 0;
 
-	for (order = 0; order <= MAX_ORDER; order++) {
+	for (order = 0; order < NR_PAGE_ORDERS; order++) {
 		unsigned long blocks;
 
 		/*
@@ -1476,7 +1476,7 @@ static void frag_show_print(struct seq_file *m, pg_data_t *pgdat,
 	int order;
 
 	seq_printf(m, "Node %d, zone %8s ", pgdat->node_id, zone->name);
-	for (order = 0; order <= MAX_ORDER; ++order)
+	for (order = 0; order < NR_PAGE_ORDERS; ++order)
 		/*
 		 * Access to nr_free is lockless as nr_free is used only for
 		 * printing purposes. Use data_race to avoid KCSAN warning.
@@ -1505,7 +1505,7 @@ static void pagetypeinfo_showfree_print(struct seq_file *m,
 					pgdat->node_id,
 					zone->name,
 					migratetype_names[mtype]);
-		for (order = 0; order <= MAX_ORDER; ++order) {
+		for (order = 0; order < NR_PAGE_ORDERS; ++order) {
 			unsigned long freecount = 0;
 			struct free_area *area;
 			struct list_head *curr;
@@ -1545,7 +1545,7 @@ static void pagetypeinfo_showfree(struct seq_file *m, void *arg)
 
 	/* Print header */
 	seq_printf(m, "%-43s ", "Free pages count per migrate type at order");
-	for (order = 0; order <= MAX_ORDER; ++order)
+	for (order = 0; order < NR_PAGE_ORDERS; ++order)
 		seq_printf(m, "%6d ", order);
 	seq_putc(m, '\n');
 
@@ -2181,7 +2181,7 @@ static void unusable_show_print(struct seq_file *m,
 	seq_printf(m, "Node %d, zone %8s ",
 				pgdat->node_id,
 				zone->name);
-	for (order = 0; order <= MAX_ORDER; ++order) {
+	for (order = 0; order < NR_PAGE_ORDERS; ++order) {
 		fill_contig_page_info(zone, order, &info);
 		index = unusable_free_index(order, &info);
 		seq_printf(m, "%d.%03d ", index / 1000, index % 1000);
@@ -2233,7 +2233,7 @@ static void extfrag_show_print(struct seq_file *m,
 	seq_printf(m, "Node %d, zone %8s ",
 				pgdat->node_id,
 				zone->name);
-	for (order = 0; order <= MAX_ORDER; ++order) {
+	for (order = 0; order < NR_PAGE_ORDERS; ++order) {
 		fill_contig_page_info(zone, order, &info);
 		index = __fragmentation_index(order, &info);
 		seq_printf(m, "%2d.%03d ", index / 1000, index % 1000);
-- 
cgit v1.2.3


From 5e0a760b44417f7cadd79de2204d6247109558a0 Mon Sep 17 00:00:00 2001
From: "Kirill A. Shutemov" <kirill.shutemov@linux.intel.com>
Date: Thu, 28 Dec 2023 17:47:04 +0300
Subject: mm, treewide: rename MAX_ORDER to MAX_PAGE_ORDER

commit 23baf831a32c ("mm, treewide: redefine MAX_ORDER sanely") has
changed the definition of MAX_ORDER to be inclusive.  This has caused
issues with code that was not yet upstream and depended on the previous
definition.

To draw attention to the altered meaning of the define, rename MAX_ORDER
to MAX_PAGE_ORDER.

Link: https://lkml.kernel.org/r/20231228144704.14033-2-kirill.shutemov@linux.intel.com
Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 Documentation/admin-guide/kdump/vmcoreinfo.rst  |  2 +-
 Documentation/admin-guide/kernel-parameters.txt | 24 ++++++++++++------------
 Documentation/networking/packet_mmap.rst        | 14 +++++++-------
 arch/arm/Kconfig                                |  2 +-
 arch/arm64/Kconfig                              | 20 ++++++++++----------
 arch/arm64/include/asm/sparsemem.h              |  2 +-
 arch/arm64/kvm/hyp/nvhe/page_alloc.c            |  3 ++-
 arch/arm64/mm/hugetlbpage.c                     |  2 +-
 arch/m68k/Kconfig.cpu                           |  2 +-
 arch/nios2/Kconfig                              |  2 +-
 arch/powerpc/Kconfig                            |  2 +-
 arch/powerpc/mm/book3s64/iommu_api.c            |  2 +-
 arch/powerpc/mm/hugetlbpage.c                   |  2 +-
 arch/powerpc/platforms/powernv/pci-ioda.c       |  2 +-
 arch/sh/mm/Kconfig                              |  2 +-
 arch/sparc/Kconfig                              |  2 +-
 arch/sparc/kernel/pci_sun4v.c                   |  2 +-
 arch/sparc/mm/tsb.c                             |  4 ++--
 arch/um/kernel/um_arch.c                        |  4 ++--
 arch/xtensa/Kconfig                             |  2 +-
 drivers/accel/qaic/qaic_data.c                  |  2 +-
 drivers/base/regmap/regmap-debugfs.c            |  8 ++++----
 drivers/block/floppy.c                          |  2 +-
 drivers/crypto/ccp/sev-dev.c                    |  2 +-
 drivers/crypto/hisilicon/sgl.c                  |  6 +++---
 drivers/gpu/drm/i915/gem/i915_gem_internal.c    |  2 +-
 drivers/gpu/drm/i915/gem/selftests/huge_pages.c |  2 +-
 drivers/gpu/drm/ttm/tests/ttm_pool_test.c       |  8 ++++----
 drivers/gpu/drm/ttm/ttm_pool.c                  |  2 +-
 drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h     |  2 +-
 drivers/iommu/dma-iommu.c                       |  2 +-
 drivers/irqchip/irq-gic-v3-its.c                |  4 ++--
 drivers/md/dm-bufio.c                           |  2 +-
 drivers/md/dm-crypt.c                           |  2 +-
 drivers/md/dm-flakey.c                          |  2 +-
 drivers/misc/genwqe/card_dev.c                  |  2 +-
 drivers/misc/genwqe/card_utils.c                |  4 ++--
 drivers/net/ethernet/hisilicon/hns3/hns3_enet.c |  2 +-
 drivers/net/ethernet/ibm/ibmvnic.h              |  4 ++--
 drivers/video/fbdev/hyperv_fb.c                 |  6 +++---
 drivers/video/fbdev/vermilion/vermilion.c       |  2 +-
 drivers/virtio/virtio_balloon.c                 |  2 +-
 drivers/virtio/virtio_mem.c                     |  8 ++++----
 fs/ramfs/file-nommu.c                           |  2 +-
 include/linux/hugetlb.h                         |  2 +-
 include/linux/mmzone.h                          | 14 +++++++-------
 include/linux/pageblock-flags.h                 |  4 ++--
 include/linux/slab.h                            |  4 ++--
 kernel/dma/pool.c                               |  6 +++---
 kernel/dma/swiotlb.c                            |  4 ++--
 kernel/events/ring_buffer.c                     | 10 +++++-----
 mm/Kconfig                                      |  6 +++---
 mm/compaction.c                                 |  4 ++--
 mm/debug_page_alloc.c                           |  2 +-
 mm/debug_vm_pgtable.c                           |  4 ++--
 mm/huge_memory.c                                |  2 +-
 mm/hugetlb.c                                    |  4 ++--
 mm/internal.h                                   |  2 +-
 mm/kmsan/init.c                                 |  6 +++---
 mm/memblock.c                                   |  7 ++++---
 mm/memory_hotplug.c                             |  9 +++++----
 mm/mm_init.c                                    | 22 +++++++++++-----------
 mm/page_alloc.c                                 | 24 ++++++++++++------------
 mm/page_isolation.c                             | 17 +++++++++--------
 mm/page_owner.c                                 |  6 +++---
 mm/page_reporting.c                             |  4 ++--
 mm/shuffle.h                                    |  2 +-
 mm/slab.c                                       |  2 +-
 mm/slub.c                                       |  4 ++--
 mm/vmscan.c                                     |  2 +-
 mm/vmstat.c                                     |  2 +-
 net/smc/smc_ib.c                                |  2 +-
 security/integrity/ima/ima_crypto.c             |  2 +-
 tools/perf/Documentation/perf-intel-pt.txt      |  2 +-
 tools/testing/memblock/linux/mmzone.h           |  6 +++---
 tools/testing/selftests/mm/thuge-gen.c          |  3 ++-
 76 files changed, 186 insertions(+), 181 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/admin-guide/kdump/vmcoreinfo.rst b/Documentation/admin-guide/kdump/vmcoreinfo.rst
index 3f8769e46b07..bced9e4b6e08 100644
--- a/Documentation/admin-guide/kdump/vmcoreinfo.rst
+++ b/Documentation/admin-guide/kdump/vmcoreinfo.rst
@@ -193,7 +193,7 @@ from this.
 --------------------------------
 
 Free areas descriptor. User-space tools use this value to iterate the
-free_area ranges. MAX_ORDER is used by the zone buddy allocator.
+free_area ranges. NR_PAGE_ORDERS is used by the zone buddy allocator.
 
 prb
 ---
diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
index 65731b060e3f..8a01b8112f0b 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -970,17 +970,17 @@
 			buddy allocator. Bigger value increase the probability
 			of catching random memory corruption, but reduce the
 			amount of memory for normal system use. The maximum
-			possible value is MAX_ORDER/2.  Setting this parameter
-			to 1 or 2 should be enough to identify most random
-			memory corruption problems caused by bugs in kernel or
-			driver code when a CPU writes to (or reads from) a
-			random memory location. Note that there exists a class
-			of memory corruptions problems caused by buggy H/W or
-			F/W or by drivers badly programming DMA (basically when
-			memory is written at bus level and the CPU MMU is
-			bypassed) which are not detectable by
-			CONFIG_DEBUG_PAGEALLOC, hence this option will not help
-			tracking down these problems.
+			possible value is MAX_PAGE_ORDER/2.  Setting this
+			parameter to 1 or 2 should be enough to identify most
+			random memory corruption problems caused by bugs in
+			kernel or driver code when a CPU writes to (or reads
+			from) a random memory location. Note that there exists
+			a class of memory corruptions problems caused by buggy
+			H/W or F/W or by drivers badly programming DMA
+			(basically when memory is written at bus level and the
+			CPU MMU is bypassed) which are not detectable by
+			CONFIG_DEBUG_PAGEALLOC, hence this option will not
+			help tracking down these problems.
 
 	debug_pagealloc=
 			[KNL] When CONFIG_DEBUG_PAGEALLOC is set, this parameter
@@ -4136,7 +4136,7 @@
 			[KNL] Minimal page reporting order
 			Format: <integer>
 			Adjust the minimal page reporting order. The page
-			reporting is disabled when it exceeds MAX_ORDER.
+			reporting is disabled when it exceeds MAX_PAGE_ORDER.
 
 	panic=		[KNL] Kernel behaviour on panic: delay <timeout>
 			timeout > 0: seconds before rebooting
diff --git a/Documentation/networking/packet_mmap.rst b/Documentation/networking/packet_mmap.rst
index 30a3be3c48f3..dca15d15feaf 100644
--- a/Documentation/networking/packet_mmap.rst
+++ b/Documentation/networking/packet_mmap.rst
@@ -263,20 +263,20 @@ the name indicates, this function allocates pages of memory, and the second
 argument is "order" or a power of two number of pages, that is
 (for PAGE_SIZE == 4096) order=0 ==> 4096 bytes, order=1 ==> 8192 bytes,
 order=2 ==> 16384 bytes, etc. The maximum size of a
-region allocated by __get_free_pages is determined by the MAX_ORDER macro. More
-precisely the limit can be calculated as::
+region allocated by __get_free_pages is determined by the MAX_PAGE_ORDER macro.
+More precisely the limit can be calculated as::
 
-   PAGE_SIZE << MAX_ORDER
+   PAGE_SIZE << MAX_PAGE_ORDER
 
    In a i386 architecture PAGE_SIZE is 4096 bytes
-   In a 2.4/i386 kernel MAX_ORDER is 10
-   In a 2.6/i386 kernel MAX_ORDER is 11
+   In a 2.4/i386 kernel MAX_PAGE_ORDER is 10
+   In a 2.6/i386 kernel MAX_PAGE_ORDER is 11
 
 So get_free_pages can allocate as much as 4MB or 8MB in a 2.4/2.6 kernel
 respectively, with an i386 architecture.
 
 User space programs can include /usr/include/sys/user.h and
-/usr/include/linux/mmzone.h to get PAGE_SIZE MAX_ORDER declarations.
+/usr/include/linux/mmzone.h to get PAGE_SIZE MAX_PAGE_ORDER declarations.
 
 The pagesize can also be determined dynamically with the getpagesize (2)
 system call.
@@ -324,7 +324,7 @@ Definitions:
 		(see /proc/slabinfo)
 <pointer size>  depends on the architecture -- ``sizeof(void *)``
 <page size>     depends on the architecture -- PAGE_SIZE or getpagesize (2)
-<max-order>     is the value defined with MAX_ORDER
+<max-order>     is the value defined with MAX_PAGE_ORDER
 <frame size>    it's an upper bound of frame's capture size (more on this later)
 ==============  ================================================================
 
diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig
index f8567e95f98b..b2ab8db63c4b 100644
--- a/arch/arm/Kconfig
+++ b/arch/arm/Kconfig
@@ -1362,7 +1362,7 @@ config ARCH_FORCE_MAX_ORDER
 	default "10"
 	help
 	  The kernel page allocator limits the size of maximal physically
-	  contiguous allocations. The limit is called MAX_ORDER and it
+	  contiguous allocations. The limit is called MAX_PAGE_ORDER and it
 	  defines the maximal power of two of number of pages that can be
 	  allocated as a single contiguous block. This option allows
 	  overriding the default setting when ability to allocate very
diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
index 12d611f3da5d..442539fd06fe 100644
--- a/arch/arm64/Kconfig
+++ b/arch/arm64/Kconfig
@@ -1520,15 +1520,15 @@ config XEN
 
 # include/linux/mmzone.h requires the following to be true:
 #
-#   MAX_ORDER + PAGE_SHIFT <= SECTION_SIZE_BITS
+#   MAX_PAGE_ORDER + PAGE_SHIFT <= SECTION_SIZE_BITS
 #
-# so the maximum value of MAX_ORDER is SECTION_SIZE_BITS - PAGE_SHIFT:
+# so the maximum value of MAX_PAGE_ORDER is SECTION_SIZE_BITS - PAGE_SHIFT:
 #
-#     | SECTION_SIZE_BITS |  PAGE_SHIFT  |  max MAX_ORDER  |  default MAX_ORDER |
-# ----+-------------------+--------------+-----------------+--------------------+
-# 4K  |       27          |      12      |       15        |         10         |
-# 16K |       27          |      14      |       13        |         11         |
-# 64K |       29          |      16      |       13        |         13         |
+#     | SECTION_SIZE_BITS |  PAGE_SHIFT  |  max MAX_PAGE_ORDER  |  default MAX_PAGE_ORDER |
+# ----+-------------------+--------------+----------------------+-------------------------+
+# 4K  |       27          |      12      |       15             |         10              |
+# 16K |       27          |      14      |       13             |         11              |
+# 64K |       29          |      16      |       13             |         13              |
 config ARCH_FORCE_MAX_ORDER
 	int
 	default "13" if ARM64_64K_PAGES
@@ -1536,16 +1536,16 @@ config ARCH_FORCE_MAX_ORDER
 	default "10"
 	help
 	  The kernel page allocator limits the size of maximal physically
-	  contiguous allocations. The limit is called MAX_ORDER and it
+	  contiguous allocations. The limit is called MAX_PAGE_ORDER and it
 	  defines the maximal power of two of number of pages that can be
 	  allocated as a single contiguous block. This option allows
 	  overriding the default setting when ability to allocate very
 	  large blocks of physically contiguous memory is required.
 
 	  The maximal size of allocation cannot exceed the size of the
-	  section, so the value of MAX_ORDER should satisfy
+	  section, so the value of MAX_PAGE_ORDER should satisfy
 
-	    MAX_ORDER + PAGE_SHIFT <= SECTION_SIZE_BITS
+	    MAX_PAGE_ORDER + PAGE_SHIFT <= SECTION_SIZE_BITS
 
 	  Don't change if unsure.
 
diff --git a/arch/arm64/include/asm/sparsemem.h b/arch/arm64/include/asm/sparsemem.h
index 5f5437621029..8a8acc220371 100644
--- a/arch/arm64/include/asm/sparsemem.h
+++ b/arch/arm64/include/asm/sparsemem.h
@@ -10,7 +10,7 @@
 /*
  * Section size must be at least 512MB for 64K base
  * page size config. Otherwise it will be less than
- * MAX_ORDER and the build process will fail.
+ * MAX_PAGE_ORDER and the build process will fail.
  */
 #ifdef CONFIG_ARM64_64K_PAGES
 #define SECTION_SIZE_BITS 29
diff --git a/arch/arm64/kvm/hyp/nvhe/page_alloc.c b/arch/arm64/kvm/hyp/nvhe/page_alloc.c
index b1e392186a0f..e691290d3765 100644
--- a/arch/arm64/kvm/hyp/nvhe/page_alloc.c
+++ b/arch/arm64/kvm/hyp/nvhe/page_alloc.c
@@ -228,7 +228,8 @@ int hyp_pool_init(struct hyp_pool *pool, u64 pfn, unsigned int nr_pages,
 	int i;
 
 	hyp_spin_lock_init(&pool->lock);
-	pool->max_order = min(MAX_ORDER, get_order(nr_pages << PAGE_SHIFT));
+	pool->max_order = min(MAX_PAGE_ORDER,
+			      get_order(nr_pages << PAGE_SHIFT));
 	for (i = 0; i <= pool->max_order; i++)
 		INIT_LIST_HEAD(&pool->free_area[i]);
 	pool->range_start = phys;
diff --git a/arch/arm64/mm/hugetlbpage.c b/arch/arm64/mm/hugetlbpage.c
index f5aae342632c..8116ac599f80 100644
--- a/arch/arm64/mm/hugetlbpage.c
+++ b/arch/arm64/mm/hugetlbpage.c
@@ -51,7 +51,7 @@ void __init arm64_hugetlb_cma_reserve(void)
 	 * page allocator. Just warn if there is any change
 	 * breaking this assumption.
 	 */
-	WARN_ON(order <= MAX_ORDER);
+	WARN_ON(order <= MAX_PAGE_ORDER);
 	hugetlb_cma_reserve(order);
 }
 #endif /* CONFIG_CMA */
diff --git a/arch/m68k/Kconfig.cpu b/arch/m68k/Kconfig.cpu
index ad69b466a08b..9dcf245c9cbf 100644
--- a/arch/m68k/Kconfig.cpu
+++ b/arch/m68k/Kconfig.cpu
@@ -402,7 +402,7 @@ config ARCH_FORCE_MAX_ORDER
 	default "10"
 	help
 	  The kernel page allocator limits the size of maximal physically
-	  contiguous allocations. The limit is called MAX_ORDER and it
+	  contiguous allocations. The limit is called MAX_PAGE_ORDER and it
 	  defines the maximal power of two of number of pages that can be
 	  allocated as a single contiguous block. This option allows
 	  overriding the default setting when ability to allocate very
diff --git a/arch/nios2/Kconfig b/arch/nios2/Kconfig
index d54464021a61..58d9565dc2c7 100644
--- a/arch/nios2/Kconfig
+++ b/arch/nios2/Kconfig
@@ -50,7 +50,7 @@ config ARCH_FORCE_MAX_ORDER
 	default "10"
 	help
 	  The kernel page allocator limits the size of maximal physically
-	  contiguous allocations. The limit is called MAX_ORDER and it
+	  contiguous allocations. The limit is called MAX_PAGE_ORDER and it
 	  defines the maximal power of two of number of pages that can be
 	  allocated as a single contiguous block. This option allows
 	  overriding the default setting when ability to allocate very
diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index 1f11a62809f2..52d7e3fad553 100644
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -915,7 +915,7 @@ config ARCH_FORCE_MAX_ORDER
 	default "10"
 	help
 	  The kernel page allocator limits the size of maximal physically
-	  contiguous allocations. The limit is called MAX_ORDER and it
+	  contiguous allocations. The limit is called MAX_PAGE_ORDER and it
 	  defines the maximal power of two of number of pages that can be
 	  allocated as a single contiguous block. This option allows
 	  overriding the default setting when ability to allocate very
diff --git a/arch/powerpc/mm/book3s64/iommu_api.c b/arch/powerpc/mm/book3s64/iommu_api.c
index d19fb1f3007d..c0e8d597e4cb 100644
--- a/arch/powerpc/mm/book3s64/iommu_api.c
+++ b/arch/powerpc/mm/book3s64/iommu_api.c
@@ -97,7 +97,7 @@ static long mm_iommu_do_alloc(struct mm_struct *mm, unsigned long ua,
 	}
 
 	mmap_read_lock(mm);
-	chunk = (1UL << (PAGE_SHIFT + MAX_ORDER)) /
+	chunk = (1UL << (PAGE_SHIFT + MAX_PAGE_ORDER)) /
 			sizeof(struct vm_area_struct *);
 	chunk = min(chunk, entries);
 	for (entry = 0; entry < entries; entry += chunk) {
diff --git a/arch/powerpc/mm/hugetlbpage.c b/arch/powerpc/mm/hugetlbpage.c
index f7c683b672c1..0a540b37aab6 100644
--- a/arch/powerpc/mm/hugetlbpage.c
+++ b/arch/powerpc/mm/hugetlbpage.c
@@ -615,7 +615,7 @@ void __init gigantic_hugetlb_cma_reserve(void)
 		order = mmu_psize_to_shift(MMU_PAGE_16G) - PAGE_SHIFT;
 
 	if (order) {
-		VM_WARN_ON(order <= MAX_ORDER);
+		VM_WARN_ON(order <= MAX_PAGE_ORDER);
 		hugetlb_cma_reserve(order);
 	}
 }
diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c
index 28fac4770073..23f5b5093ec1 100644
--- a/arch/powerpc/platforms/powernv/pci-ioda.c
+++ b/arch/powerpc/platforms/powernv/pci-ioda.c
@@ -1389,7 +1389,7 @@ static long pnv_pci_ioda2_setup_default_config(struct pnv_ioda_pe *pe)
 	 * DMA window can be larger than available memory, which will
 	 * cause errors later.
 	 */
-	const u64 maxblock = 1UL << (PAGE_SHIFT + MAX_ORDER);
+	const u64 maxblock = 1UL << (PAGE_SHIFT + MAX_PAGE_ORDER);
 
 	/*
 	 * We create the default window as big as we can. The constraint is
diff --git a/arch/sh/mm/Kconfig b/arch/sh/mm/Kconfig
index 511c17aede4a..455311d9a5e9 100644
--- a/arch/sh/mm/Kconfig
+++ b/arch/sh/mm/Kconfig
@@ -26,7 +26,7 @@ config ARCH_FORCE_MAX_ORDER
 	default "10"
 	help
 	  The kernel page allocator limits the size of maximal physically
-	  contiguous allocations. The limit is called MAX_ORDER and it
+	  contiguous allocations. The limit is called MAX_PAGE:_ORDER and it
 	  defines the maximal power of two of number of pages that can be
 	  allocated as a single contiguous block. This option allows
 	  overriding the default setting when ability to allocate very
diff --git a/arch/sparc/Kconfig b/arch/sparc/Kconfig
index 49849790e66d..204c43cb3d43 100644
--- a/arch/sparc/Kconfig
+++ b/arch/sparc/Kconfig
@@ -277,7 +277,7 @@ config ARCH_FORCE_MAX_ORDER
 	default "12"
 	help
 	  The kernel page allocator limits the size of maximal physically
-	  contiguous allocations. The limit is called MAX_ORDER and it
+	  contiguous allocations. The limit is called MAX_PAGE_ORDER and it
 	  defines the maximal power of two of number of pages that can be
 	  allocated as a single contiguous block. This option allows
 	  overriding the default setting when ability to allocate very
diff --git a/arch/sparc/kernel/pci_sun4v.c b/arch/sparc/kernel/pci_sun4v.c
index c80b0a21d709..083e5f05a7f0 100644
--- a/arch/sparc/kernel/pci_sun4v.c
+++ b/arch/sparc/kernel/pci_sun4v.c
@@ -194,7 +194,7 @@ static void *dma_4v_alloc_coherent(struct device *dev, size_t size,
 
 	size = IO_PAGE_ALIGN(size);
 	order = get_order(size);
-	if (unlikely(order > MAX_ORDER))
+	if (unlikely(order > MAX_PAGE_ORDER))
 		return NULL;
 
 	npages = size >> IO_PAGE_SHIFT;
diff --git a/arch/sparc/mm/tsb.c b/arch/sparc/mm/tsb.c
index 5e2931a18409..6acd8a4c1e2a 100644
--- a/arch/sparc/mm/tsb.c
+++ b/arch/sparc/mm/tsb.c
@@ -402,8 +402,8 @@ void tsb_grow(struct mm_struct *mm, unsigned long tsb_index, unsigned long rss)
 	unsigned long new_rss_limit;
 	gfp_t gfp_flags;
 
-	if (max_tsb_size > PAGE_SIZE << MAX_ORDER)
-		max_tsb_size = PAGE_SIZE << MAX_ORDER;
+	if (max_tsb_size > PAGE_SIZE << MAX_PAGE_ORDER)
+		max_tsb_size = PAGE_SIZE << MAX_PAGE_ORDER;
 
 	new_cache_index = 0;
 	for (new_size = 8192; new_size < max_tsb_size; new_size <<= 1UL) {
diff --git a/arch/um/kernel/um_arch.c b/arch/um/kernel/um_arch.c
index b1bfed0c8528..7a9820797eae 100644
--- a/arch/um/kernel/um_arch.c
+++ b/arch/um/kernel/um_arch.c
@@ -373,10 +373,10 @@ int __init linux_main(int argc, char **argv)
 	max_physmem = TASK_SIZE - uml_physmem - iomem_size - MIN_VMALLOC;
 
 	/*
-	 * Zones have to begin on a 1 << MAX_ORDER page boundary,
+	 * Zones have to begin on a 1 << MAX_PAGE_ORDER page boundary,
 	 * so this makes sure that's true for highmem
 	 */
-	max_physmem &= ~((1 << (PAGE_SHIFT + MAX_ORDER)) - 1);
+	max_physmem &= ~((1 << (PAGE_SHIFT + MAX_PAGE_ORDER)) - 1);
 	if (physmem_size + iomem_size > max_physmem) {
 		highmem = physmem_size + iomem_size - max_physmem;
 		physmem_size -= highmem;
diff --git a/arch/xtensa/Kconfig b/arch/xtensa/Kconfig
index 7d792077e5fd..e031eaf36c99 100644
--- a/arch/xtensa/Kconfig
+++ b/arch/xtensa/Kconfig
@@ -793,7 +793,7 @@ config ARCH_FORCE_MAX_ORDER
 	default "10"
 	help
 	  The kernel page allocator limits the size of maximal physically
-	  contiguous allocations. The limit is called MAX_ORDER and it
+	  contiguous allocations. The limit is called MAX_PAGE_ORDER and it
 	  defines the maximal power of two of number of pages that can be
 	  allocated as a single contiguous block. This option allows
 	  overriding the default setting when ability to allocate very
diff --git a/drivers/accel/qaic/qaic_data.c b/drivers/accel/qaic/qaic_data.c
index 4a8e43a7a6a4..aaeb2c9c071a 100644
--- a/drivers/accel/qaic/qaic_data.c
+++ b/drivers/accel/qaic/qaic_data.c
@@ -451,7 +451,7 @@ static int create_sgt(struct qaic_device *qdev, struct sg_table **sgt_out, u64 s
 		 * later
 		 */
 		buf_extra = (PAGE_SIZE - size % PAGE_SIZE) % PAGE_SIZE;
-		max_order = min(MAX_ORDER - 1, get_order(size));
+		max_order = min(MAX_PAGE_ORDER - 1, get_order(size));
 	} else {
 		/* allocate a single page for book keeping */
 		nr_pages = 1;
diff --git a/drivers/base/regmap/regmap-debugfs.c b/drivers/base/regmap/regmap-debugfs.c
index bdd80b73c3e6..fb84cda92a75 100644
--- a/drivers/base/regmap/regmap-debugfs.c
+++ b/drivers/base/regmap/regmap-debugfs.c
@@ -226,8 +226,8 @@ static ssize_t regmap_read_debugfs(struct regmap *map, unsigned int from,
 	if (*ppos < 0 || !count)
 		return -EINVAL;
 
-	if (count > (PAGE_SIZE << MAX_ORDER))
-		count = PAGE_SIZE << MAX_ORDER;
+	if (count > (PAGE_SIZE << MAX_PAGE_ORDER))
+		count = PAGE_SIZE << MAX_PAGE_ORDER;
 
 	buf = kmalloc(count, GFP_KERNEL);
 	if (!buf)
@@ -373,8 +373,8 @@ static ssize_t regmap_reg_ranges_read_file(struct file *file,
 	if (*ppos < 0 || !count)
 		return -EINVAL;
 
-	if (count > (PAGE_SIZE << MAX_ORDER))
-		count = PAGE_SIZE << MAX_ORDER;
+	if (count > (PAGE_SIZE << MAX_PAGE_ORDER))
+		count = PAGE_SIZE << MAX_PAGE_ORDER;
 
 	buf = kmalloc(count, GFP_KERNEL);
 	if (!buf)
diff --git a/drivers/block/floppy.c b/drivers/block/floppy.c
index 11114a5d9e5c..d0e41d52d6a9 100644
--- a/drivers/block/floppy.c
+++ b/drivers/block/floppy.c
@@ -3079,7 +3079,7 @@ static void raw_cmd_free(struct floppy_raw_cmd **ptr)
 	}
 }
 
-#define MAX_LEN (1UL << MAX_ORDER << PAGE_SHIFT)
+#define MAX_LEN (1UL << MAX_PAGE_ORDER << PAGE_SHIFT)
 
 static int raw_cmd_copyin(int cmd, void __user *param,
 				 struct floppy_raw_cmd **rcmd)
diff --git a/drivers/crypto/ccp/sev-dev.c b/drivers/crypto/ccp/sev-dev.c
index fcaccd0b5a65..e4d3f45242f6 100644
--- a/drivers/crypto/ccp/sev-dev.c
+++ b/drivers/crypto/ccp/sev-dev.c
@@ -906,7 +906,7 @@ static int sev_ioctl_do_get_id2(struct sev_issue_cmd *argp)
 		/*
 		 * The length of the ID shouldn't be assumed by software since
 		 * it may change in the future.  The allocation size is limited
-		 * to 1 << (PAGE_SHIFT + MAX_ORDER) by the page allocator.
+		 * to 1 << (PAGE_SHIFT + MAX_PAGE_ORDER) by the page allocator.
 		 * If the allocation fails, simply return ENOMEM rather than
 		 * warning in the kernel log.
 		 */
diff --git a/drivers/crypto/hisilicon/sgl.c b/drivers/crypto/hisilicon/sgl.c
index 3df7a256e919..5c1012d7ffa9 100644
--- a/drivers/crypto/hisilicon/sgl.c
+++ b/drivers/crypto/hisilicon/sgl.c
@@ -70,11 +70,11 @@ struct hisi_acc_sgl_pool *hisi_acc_create_sgl_pool(struct device *dev,
 			 HISI_ACC_SGL_ALIGN_SIZE);
 
 	/*
-	 * the pool may allocate a block of memory of size PAGE_SIZE * 2^MAX_ORDER,
+	 * the pool may allocate a block of memory of size PAGE_SIZE * 2^MAX_PAGE_ORDER,
 	 * block size may exceed 2^31 on ia64, so the max of block size is 2^31
 	 */
-	block_size = 1 << (PAGE_SHIFT + MAX_ORDER < 32 ?
-			   PAGE_SHIFT + MAX_ORDER : 31);
+	block_size = 1 << (PAGE_SHIFT + MAX_PAGE_ORDER < 32 ?
+			   PAGE_SHIFT + MAX_PAGE_ORDER : 31);
 	sgl_num_per_block = block_size / sgl_size;
 	block_num = count / sgl_num_per_block;
 	remain_sgl = count % sgl_num_per_block;
diff --git a/drivers/gpu/drm/i915/gem/i915_gem_internal.c b/drivers/gpu/drm/i915/gem/i915_gem_internal.c
index 6bc26b4b06b8..ea7561ae6e13 100644
--- a/drivers/gpu/drm/i915/gem/i915_gem_internal.c
+++ b/drivers/gpu/drm/i915/gem/i915_gem_internal.c
@@ -36,7 +36,7 @@ static int i915_gem_object_get_pages_internal(struct drm_i915_gem_object *obj)
 	struct sg_table *st;
 	struct scatterlist *sg;
 	unsigned int npages; /* restricted by sg_alloc_table */
-	int max_order = MAX_ORDER;
+	int max_order = MAX_PAGE_ORDER;
 	unsigned int max_segment;
 	gfp_t gfp;
 
diff --git a/drivers/gpu/drm/i915/gem/selftests/huge_pages.c b/drivers/gpu/drm/i915/gem/selftests/huge_pages.c
index 6b9f6cf50bf6..84c50c4c4af7 100644
--- a/drivers/gpu/drm/i915/gem/selftests/huge_pages.c
+++ b/drivers/gpu/drm/i915/gem/selftests/huge_pages.c
@@ -115,7 +115,7 @@ static int get_huge_pages(struct drm_i915_gem_object *obj)
 		do {
 			struct page *page;
 
-			GEM_BUG_ON(order > MAX_ORDER);
+			GEM_BUG_ON(order > MAX_PAGE_ORDER);
 			page = alloc_pages(GFP | __GFP_ZERO, order);
 			if (!page)
 				goto err;
diff --git a/drivers/gpu/drm/ttm/tests/ttm_pool_test.c b/drivers/gpu/drm/ttm/tests/ttm_pool_test.c
index 2d9cae8cd984..cceaa18d4e46 100644
--- a/drivers/gpu/drm/ttm/tests/ttm_pool_test.c
+++ b/drivers/gpu/drm/ttm/tests/ttm_pool_test.c
@@ -109,7 +109,7 @@ static const struct ttm_pool_test_case ttm_pool_basic_cases[] = {
 	},
 	{
 		.description = "Above the allocation limit",
-		.order = MAX_ORDER + 1,
+		.order = MAX_PAGE_ORDER + 1,
 	},
 	{
 		.description = "One page, with coherent DMA mappings enabled",
@@ -118,7 +118,7 @@ static const struct ttm_pool_test_case ttm_pool_basic_cases[] = {
 	},
 	{
 		.description = "Above the allocation limit, with coherent DMA mappings enabled",
-		.order = MAX_ORDER + 1,
+		.order = MAX_PAGE_ORDER + 1,
 		.use_dma_alloc = true,
 	},
 };
@@ -165,7 +165,7 @@ static void ttm_pool_alloc_basic(struct kunit *test)
 	fst_page = tt->pages[0];
 	last_page = tt->pages[tt->num_pages - 1];
 
-	if (params->order <= MAX_ORDER) {
+	if (params->order <= MAX_PAGE_ORDER) {
 		if (params->use_dma_alloc) {
 			KUNIT_ASSERT_NOT_NULL(test, (void *)fst_page->private);
 			KUNIT_ASSERT_NOT_NULL(test, (void *)last_page->private);
@@ -182,7 +182,7 @@ static void ttm_pool_alloc_basic(struct kunit *test)
 			 * order 0 blocks
 			 */
 			KUNIT_ASSERT_EQ(test, fst_page->private,
-					min_t(unsigned int, MAX_ORDER,
+					min_t(unsigned int, MAX_PAGE_ORDER,
 					      params->order));
 			KUNIT_ASSERT_EQ(test, last_page->private, 0);
 		}
diff --git a/drivers/gpu/drm/ttm/ttm_pool.c b/drivers/gpu/drm/ttm/ttm_pool.c
index d183bb97c526..b62f420a9f96 100644
--- a/drivers/gpu/drm/ttm/ttm_pool.c
+++ b/drivers/gpu/drm/ttm/ttm_pool.c
@@ -447,7 +447,7 @@ int ttm_pool_alloc(struct ttm_pool *pool, struct ttm_tt *tt,
 	else
 		gfp_flags |= GFP_HIGHUSER;
 
-	for (order = min_t(unsigned int, MAX_ORDER, __fls(num_pages));
+	for (order = min_t(unsigned int, MAX_PAGE_ORDER, __fls(num_pages));
 	     num_pages;
 	     order = min_t(unsigned int, order, __fls(num_pages))) {
 		struct ttm_pool_type *pt;
diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h
index 961205ba86d2..925ac6a47bce 100644
--- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h
+++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h
@@ -188,7 +188,7 @@
 #ifdef CONFIG_CMA_ALIGNMENT
 #define Q_MAX_SZ_SHIFT			(PAGE_SHIFT + CONFIG_CMA_ALIGNMENT)
 #else
-#define Q_MAX_SZ_SHIFT			(PAGE_SHIFT + MAX_ORDER)
+#define Q_MAX_SZ_SHIFT			(PAGE_SHIFT + MAX_PAGE_ORDER)
 #endif
 
 /*
diff --git a/drivers/iommu/dma-iommu.c b/drivers/iommu/dma-iommu.c
index 85163a83df2f..e59f50e11ea8 100644
--- a/drivers/iommu/dma-iommu.c
+++ b/drivers/iommu/dma-iommu.c
@@ -884,7 +884,7 @@ static struct page **__iommu_dma_alloc_pages(struct device *dev,
 	struct page **pages;
 	unsigned int i = 0, nid = dev_to_node(dev);
 
-	order_mask &= GENMASK(MAX_ORDER, 0);
+	order_mask &= GENMASK(MAX_PAGE_ORDER, 0);
 	if (!order_mask)
 		return NULL;
 
diff --git a/drivers/irqchip/irq-gic-v3-its.c b/drivers/irqchip/irq-gic-v3-its.c
index 9a7a74239eab..d097001c1e3e 100644
--- a/drivers/irqchip/irq-gic-v3-its.c
+++ b/drivers/irqchip/irq-gic-v3-its.c
@@ -2465,8 +2465,8 @@ static bool its_parse_indirect_baser(struct its_node *its,
 	 * feature is not supported by hardware.
 	 */
 	new_order = max_t(u32, get_order(esz << ids), new_order);
-	if (new_order > MAX_ORDER) {
-		new_order = MAX_ORDER;
+	if (new_order > MAX_PAGE_ORDER) {
+		new_order = MAX_PAGE_ORDER;
 		ids = ilog2(PAGE_ORDER_TO_SIZE(new_order) / (int)esz);
 		pr_warn("ITS@%pa: %s Table too large, reduce ids %llu->%u\n",
 			&its->phys_base, its_base_type_string[type],
diff --git a/drivers/md/dm-bufio.c b/drivers/md/dm-bufio.c
index f03d7dba270c..13c65b7e1ed6 100644
--- a/drivers/md/dm-bufio.c
+++ b/drivers/md/dm-bufio.c
@@ -1170,7 +1170,7 @@ static void __cache_size_refresh(void)
  * If the allocation may fail we use __get_free_pages. Memory fragmentation
  * won't have a fatal effect here, but it just causes flushes of some other
  * buffers and more I/O will be performed. Don't use __get_free_pages if it
- * always fails (i.e. order > MAX_ORDER).
+ * always fails (i.e. order > MAX_PAGE_ORDER).
  *
  * If the allocation shouldn't fail we use __vmalloc. This is only for the
  * initial reserve allocation, so there's no risk of wasting all vmalloc
diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c
index 2ae8560b6a14..855b482cbff1 100644
--- a/drivers/md/dm-crypt.c
+++ b/drivers/md/dm-crypt.c
@@ -1673,7 +1673,7 @@ static struct bio *crypt_alloc_buffer(struct dm_crypt_io *io, unsigned int size)
 	unsigned int nr_iovecs = (size + PAGE_SIZE - 1) >> PAGE_SHIFT;
 	gfp_t gfp_mask = GFP_NOWAIT | __GFP_HIGHMEM;
 	unsigned int remaining_size;
-	unsigned int order = MAX_ORDER;
+	unsigned int order = MAX_PAGE_ORDER;
 
 retry:
 	if (unlikely(gfp_mask & __GFP_DIRECT_RECLAIM))
diff --git a/drivers/md/dm-flakey.c b/drivers/md/dm-flakey.c
index f57fb821528d..7916ed9f10e8 100644
--- a/drivers/md/dm-flakey.c
+++ b/drivers/md/dm-flakey.c
@@ -434,7 +434,7 @@ static struct bio *clone_bio(struct dm_target *ti, struct flakey_c *fc, struct b
 
 	remaining_size = size;
 
-	order = MAX_ORDER;
+	order = MAX_PAGE_ORDER;
 	while (remaining_size) {
 		struct page *pages;
 		unsigned size_to_add, to_copy;
diff --git a/drivers/misc/genwqe/card_dev.c b/drivers/misc/genwqe/card_dev.c
index 55fc5b80e649..4441aca2280a 100644
--- a/drivers/misc/genwqe/card_dev.c
+++ b/drivers/misc/genwqe/card_dev.c
@@ -443,7 +443,7 @@ static int genwqe_mmap(struct file *filp, struct vm_area_struct *vma)
 	if (vsize == 0)
 		return -EINVAL;
 
-	if (get_order(vsize) > MAX_ORDER)
+	if (get_order(vsize) > MAX_PAGE_ORDER)
 		return -ENOMEM;
 
 	dma_map = kzalloc(sizeof(struct dma_mapping), GFP_KERNEL);
diff --git a/drivers/misc/genwqe/card_utils.c b/drivers/misc/genwqe/card_utils.c
index 1c798d6b2dfb..a2c4a9b4f871 100644
--- a/drivers/misc/genwqe/card_utils.c
+++ b/drivers/misc/genwqe/card_utils.c
@@ -210,7 +210,7 @@ u32 genwqe_crc32(u8 *buff, size_t len, u32 init)
 void *__genwqe_alloc_consistent(struct genwqe_dev *cd, size_t size,
 			       dma_addr_t *dma_handle)
 {
-	if (get_order(size) > MAX_ORDER)
+	if (get_order(size) > MAX_PAGE_ORDER)
 		return NULL;
 
 	return dma_alloc_coherent(&cd->pci_dev->dev, size, dma_handle,
@@ -308,7 +308,7 @@ int genwqe_alloc_sync_sgl(struct genwqe_dev *cd, struct genwqe_sgl *sgl,
 	sgl->write = write;
 	sgl->sgl_size = genwqe_sgl_size(sgl->nr_pages);
 
-	if (get_order(sgl->sgl_size) > MAX_ORDER) {
+	if (get_order(sgl->sgl_size) > MAX_PAGE_ORDER) {
 		dev_err(&pci_dev->dev,
 			"[%s] err: too much memory requested!\n", __func__);
 		return ret;
diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c
index b618797a7e8d..f1695c889d3a 100644
--- a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c
+++ b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c
@@ -1041,7 +1041,7 @@ static void hns3_init_tx_spare_buffer(struct hns3_enet_ring *ring)
 		return;
 
 	order = get_order(alloc_size);
-	if (order > MAX_ORDER) {
+	if (order > MAX_PAGE_ORDER) {
 		if (net_ratelimit())
 			dev_warn(ring_to_dev(ring), "failed to allocate tx spare buffer, exceed to max order\n");
 		return;
diff --git a/drivers/net/ethernet/ibm/ibmvnic.h b/drivers/net/ethernet/ibm/ibmvnic.h
index 4e18b4cefa97..94ac36b1408b 100644
--- a/drivers/net/ethernet/ibm/ibmvnic.h
+++ b/drivers/net/ethernet/ibm/ibmvnic.h
@@ -48,7 +48,7 @@
  * of 4096 jumbo frames (MTU=9000) we will need about 9K*4K = 36MB plus
  * some padding.
  *
- * But the size of a single DMA region is limited by MAX_ORDER in the
+ * But the size of a single DMA region is limited by MAX_PAGE_ORDER in the
  * kernel (about 16MB currently).  To support say 4K Jumbo frames, we
  * use a set of LTBs (struct ltb_set) per pool.
  *
@@ -75,7 +75,7 @@
  * pool for the 4MB. Thus the 16 Rx and Tx queues require 32 * 5 = 160
  * plus 16 for the TSO pools for a total of 176 LTB mappings per VNIC.
  */
-#define IBMVNIC_ONE_LTB_MAX	((u32)((1 << MAX_ORDER) * PAGE_SIZE))
+#define IBMVNIC_ONE_LTB_MAX	((u32)((1 << MAX_PAGE_ORDER) * PAGE_SIZE))
 #define IBMVNIC_ONE_LTB_SIZE	min((u32)(8 << 20), IBMVNIC_ONE_LTB_MAX)
 #define IBMVNIC_LTB_SET_SIZE	(38 << 20)
 
diff --git a/drivers/video/fbdev/hyperv_fb.c b/drivers/video/fbdev/hyperv_fb.c
index a80939fe2ee6..6a29d2594b91 100644
--- a/drivers/video/fbdev/hyperv_fb.c
+++ b/drivers/video/fbdev/hyperv_fb.c
@@ -927,8 +927,8 @@ static phys_addr_t hvfb_get_phymem(struct hv_device *hdev,
 	if (request_size == 0)
 		return -1;
 
-	if (order <= MAX_ORDER) {
-		/* Call alloc_pages if the size is less than 2^MAX_ORDER */
+	if (order <= MAX_PAGE_ORDER) {
+		/* Call alloc_pages if the size is less than 2^MAX_PAGE_ORDER */
 		page = alloc_pages(GFP_KERNEL | __GFP_ZERO, order);
 		if (!page)
 			return -1;
@@ -958,7 +958,7 @@ static void hvfb_release_phymem(struct hv_device *hdev,
 {
 	unsigned int order = get_order(size);
 
-	if (order <= MAX_ORDER)
+	if (order <= MAX_PAGE_ORDER)
 		__free_pages(pfn_to_page(paddr >> PAGE_SHIFT), order);
 	else
 		dma_free_coherent(&hdev->device,
diff --git a/drivers/video/fbdev/vermilion/vermilion.c b/drivers/video/fbdev/vermilion/vermilion.c
index 840ead69654b..a32e5b2924c9 100644
--- a/drivers/video/fbdev/vermilion/vermilion.c
+++ b/drivers/video/fbdev/vermilion/vermilion.c
@@ -197,7 +197,7 @@ static int vmlfb_alloc_vram(struct vml_info *vinfo,
 		va = &vinfo->vram[i];
 		order = 0;
 
-		while (requested > (PAGE_SIZE << order) && order <= MAX_ORDER)
+		while (requested > (PAGE_SIZE << order) && order <= MAX_PAGE_ORDER)
 			order++;
 
 		err = vmlfb_alloc_vram_area(va, order, 0);
diff --git a/drivers/virtio/virtio_balloon.c b/drivers/virtio/virtio_balloon.c
index 1fe93e93f5bc..59cdc0292dce 100644
--- a/drivers/virtio/virtio_balloon.c
+++ b/drivers/virtio/virtio_balloon.c
@@ -33,7 +33,7 @@
 #define VIRTIO_BALLOON_FREE_PAGE_ALLOC_FLAG (__GFP_NORETRY | __GFP_NOWARN | \
 					     __GFP_NOMEMALLOC)
 /* The order of free page blocks to report to host */
-#define VIRTIO_BALLOON_HINT_BLOCK_ORDER MAX_ORDER
+#define VIRTIO_BALLOON_HINT_BLOCK_ORDER MAX_PAGE_ORDER
 /* The size of a free page block in bytes */
 #define VIRTIO_BALLOON_HINT_BLOCK_BYTES \
 	(1 << (VIRTIO_BALLOON_HINT_BLOCK_ORDER + PAGE_SHIFT))
diff --git a/drivers/virtio/virtio_mem.c b/drivers/virtio/virtio_mem.c
index fa5226c198cc..8e3223294442 100644
--- a/drivers/virtio/virtio_mem.c
+++ b/drivers/virtio/virtio_mem.c
@@ -1154,13 +1154,13 @@ static void virtio_mem_clear_fake_offline(unsigned long pfn,
  */
 static void virtio_mem_fake_online(unsigned long pfn, unsigned long nr_pages)
 {
-	unsigned long order = MAX_ORDER;
+	unsigned long order = MAX_PAGE_ORDER;
 	unsigned long i;
 
 	/*
 	 * We might get called for ranges that don't cover properly aligned
-	 * MAX_ORDER pages; however, we can only online properly aligned
-	 * pages with an order of MAX_ORDER at maximum.
+	 * MAX_PAGE_ORDER pages; however, we can only online properly aligned
+	 * pages with an order of MAX_PAGE_ORDER at maximum.
 	 */
 	while (!IS_ALIGNED(pfn | nr_pages, 1 << order))
 		order--;
@@ -1280,7 +1280,7 @@ static void virtio_mem_online_page(struct virtio_mem *vm,
 	bool do_online;
 
 	/*
-	 * We can get called with any order up to MAX_ORDER. If our subblock
+	 * We can get called with any order up to MAX_PAGE_ORDER. If our subblock
 	 * size is smaller than that and we have a mixture of plugged and
 	 * unplugged subblocks within such a page, we have to process in
 	 * smaller granularity. In that case we'll adjust the order exactly once
diff --git a/fs/ramfs/file-nommu.c b/fs/ramfs/file-nommu.c
index efb1b4c1a0a4..7a6d980e614d 100644
--- a/fs/ramfs/file-nommu.c
+++ b/fs/ramfs/file-nommu.c
@@ -70,7 +70,7 @@ int ramfs_nommu_expand_for_mapping(struct inode *inode, size_t newsize)
 
 	/* make various checks */
 	order = get_order(newsize);
-	if (unlikely(order > MAX_ORDER))
+	if (unlikely(order > MAX_PAGE_ORDER))
 		return -EFBIG;
 
 	ret = inode_newsize_ok(inode, newsize);
diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index 236ec7b63c54..c1ee640d87b1 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -829,7 +829,7 @@ static inline unsigned huge_page_shift(struct hstate *h)
 
 static inline bool hstate_is_gigantic(struct hstate *h)
 {
-	return huge_page_order(h) > MAX_ORDER;
+	return huge_page_order(h) > MAX_PAGE_ORDER;
 }
 
 static inline unsigned int pages_per_huge_page(const struct hstate *h)
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 1ea7636dfb76..4ed33b127821 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -27,15 +27,15 @@
 
 /* Free memory management - zoned buddy allocator.  */
 #ifndef CONFIG_ARCH_FORCE_MAX_ORDER
-#define MAX_ORDER 10
+#define MAX_PAGE_ORDER 10
 #else
-#define MAX_ORDER CONFIG_ARCH_FORCE_MAX_ORDER
+#define MAX_PAGE_ORDER CONFIG_ARCH_FORCE_MAX_ORDER
 #endif
-#define MAX_ORDER_NR_PAGES (1 << MAX_ORDER)
+#define MAX_ORDER_NR_PAGES (1 << MAX_PAGE_ORDER)
 
 #define IS_MAX_ORDER_ALIGNED(pfn) IS_ALIGNED(pfn, MAX_ORDER_NR_PAGES)
 
-#define NR_PAGE_ORDERS (MAX_ORDER + 1)
+#define NR_PAGE_ORDERS (MAX_PAGE_ORDER + 1)
 
 /*
  * PAGE_ALLOC_COSTLY_ORDER is the order at which allocations are deemed
@@ -938,7 +938,7 @@ struct zone {
 	struct free_area	free_area[NR_PAGE_ORDERS];
 
 #ifdef CONFIG_UNACCEPTED_MEMORY
-	/* Pages to be accepted. All pages on the list are MAX_ORDER */
+	/* Pages to be accepted. All pages on the list are MAX_PAGE_ORDER */
 	struct list_head	unaccepted_pages;
 #endif
 
@@ -1748,8 +1748,8 @@ static inline bool movable_only_nodes(nodemask_t *nodes)
 #define SECTION_BLOCKFLAGS_BITS \
 	((1UL << (PFN_SECTION_SHIFT - pageblock_order)) * NR_PAGEBLOCK_BITS)
 
-#if (MAX_ORDER + PAGE_SHIFT) > SECTION_SIZE_BITS
-#error Allocator MAX_ORDER exceeds SECTION_SIZE
+#if (MAX_PAGE_ORDER + PAGE_SHIFT) > SECTION_SIZE_BITS
+#error Allocator MAX_PAGE_ORDER exceeds SECTION_SIZE
 #endif
 
 static inline unsigned long pfn_to_section_nr(unsigned long pfn)
diff --git a/include/linux/pageblock-flags.h b/include/linux/pageblock-flags.h
index e83c4c095041..3f2409b968ec 100644
--- a/include/linux/pageblock-flags.h
+++ b/include/linux/pageblock-flags.h
@@ -41,14 +41,14 @@ extern unsigned int pageblock_order;
  * Huge pages are a constant size, but don't exceed the maximum allocation
  * granularity.
  */
-#define pageblock_order		min_t(unsigned int, HUGETLB_PAGE_ORDER, MAX_ORDER)
+#define pageblock_order		min_t(unsigned int, HUGETLB_PAGE_ORDER, MAX_PAGE_ORDER)
 
 #endif /* CONFIG_HUGETLB_PAGE_SIZE_VARIABLE */
 
 #else /* CONFIG_HUGETLB_PAGE */
 
 /* If huge pages are not used, group by MAX_ORDER_NR_PAGES */
-#define pageblock_order		MAX_ORDER
+#define pageblock_order		MAX_PAGE_ORDER
 
 #endif /* CONFIG_HUGETLB_PAGE */
 
diff --git a/include/linux/slab.h b/include/linux/slab.h
index d6d6ffeeb9a2..d63823e518c0 100644
--- a/include/linux/slab.h
+++ b/include/linux/slab.h
@@ -308,7 +308,7 @@ static inline unsigned int arch_slab_minalign(void)
  * (PAGE_SIZE*2).  Larger requests are passed to the page allocator.
  */
 #define KMALLOC_SHIFT_HIGH	(PAGE_SHIFT + 1)
-#define KMALLOC_SHIFT_MAX	(MAX_ORDER + PAGE_SHIFT)
+#define KMALLOC_SHIFT_MAX	(MAX_PAGE_ORDER + PAGE_SHIFT)
 #ifndef KMALLOC_SHIFT_LOW
 #define KMALLOC_SHIFT_LOW	5
 #endif
@@ -316,7 +316,7 @@ static inline unsigned int arch_slab_minalign(void)
 
 #ifdef CONFIG_SLUB
 #define KMALLOC_SHIFT_HIGH	(PAGE_SHIFT + 1)
-#define KMALLOC_SHIFT_MAX	(MAX_ORDER + PAGE_SHIFT)
+#define KMALLOC_SHIFT_MAX	(MAX_PAGE_ORDER + PAGE_SHIFT)
 #ifndef KMALLOC_SHIFT_LOW
 #define KMALLOC_SHIFT_LOW	3
 #endif
diff --git a/kernel/dma/pool.c b/kernel/dma/pool.c
index b481c48a31a6..d10613eb0f63 100644
--- a/kernel/dma/pool.c
+++ b/kernel/dma/pool.c
@@ -84,8 +84,8 @@ static int atomic_pool_expand(struct gen_pool *pool, size_t pool_size,
 	void *addr;
 	int ret = -ENOMEM;
 
-	/* Cannot allocate larger than MAX_ORDER */
-	order = min(get_order(pool_size), MAX_ORDER);
+	/* Cannot allocate larger than MAX_PAGE_ORDER */
+	order = min(get_order(pool_size), MAX_PAGE_ORDER);
 
 	do {
 		pool_size = 1 << (PAGE_SHIFT + order);
@@ -190,7 +190,7 @@ static int __init dma_atomic_pool_init(void)
 
 	/*
 	 * If coherent_pool was not used on the command line, default the pool
-	 * sizes to 128KB per 1GB of memory, min 128KB, max MAX_ORDER.
+	 * sizes to 128KB per 1GB of memory, min 128KB, max MAX_PAGE_ORDER.
 	 */
 	if (!atomic_pool_size) {
 		unsigned long pages = totalram_pages() / (SZ_1G / SZ_128K);
diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c
index 33d942615be5..176078bf2215 100644
--- a/kernel/dma/swiotlb.c
+++ b/kernel/dma/swiotlb.c
@@ -686,8 +686,8 @@ static struct io_tlb_pool *swiotlb_alloc_pool(struct device *dev,
 	size_t pool_size;
 	size_t tlb_size;
 
-	if (nslabs > SLABS_PER_PAGE << MAX_ORDER) {
-		nslabs = SLABS_PER_PAGE << MAX_ORDER;
+	if (nslabs > SLABS_PER_PAGE << MAX_PAGE_ORDER) {
+		nslabs = SLABS_PER_PAGE << MAX_PAGE_ORDER;
 		nareas = limit_nareas(nareas, nslabs);
 	}
 
diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c
index e8d82c2f07d0..60ed43d1c29e 100644
--- a/kernel/events/ring_buffer.c
+++ b/kernel/events/ring_buffer.c
@@ -610,8 +610,8 @@ static struct page *rb_alloc_aux_page(int node, int order)
 {
 	struct page *page;
 
-	if (order > MAX_ORDER)
-		order = MAX_ORDER;
+	if (order > MAX_PAGE_ORDER)
+		order = MAX_PAGE_ORDER;
 
 	do {
 		page = alloc_pages_node(node, PERF_AUX_GFP, order);
@@ -702,9 +702,9 @@ int rb_alloc_aux(struct perf_buffer *rb, struct perf_event *event,
 
 	/*
 	 * kcalloc_node() is unable to allocate buffer if the size is larger
-	 * than: PAGE_SIZE << MAX_ORDER; directly bail out in this case.
+	 * than: PAGE_SIZE << MAX_PAGE_ORDER; directly bail out in this case.
 	 */
-	if (get_order((unsigned long)nr_pages * sizeof(void *)) > MAX_ORDER)
+	if (get_order((unsigned long)nr_pages * sizeof(void *)) > MAX_PAGE_ORDER)
 		return -ENOMEM;
 	rb->aux_pages = kcalloc_node(nr_pages, sizeof(void *), GFP_KERNEL,
 				     node);
@@ -821,7 +821,7 @@ struct perf_buffer *rb_alloc(int nr_pages, long watermark, int cpu, int flags)
 	size = sizeof(struct perf_buffer);
 	size += nr_pages * sizeof(void *);
 
-	if (order_base_2(size) > PAGE_SHIFT+MAX_ORDER)
+	if (order_base_2(size) > PAGE_SHIFT+MAX_PAGE_ORDER)
 		goto fail;
 
 	node = (cpu == -1) ? cpu : cpu_to_node(cpu);
diff --git a/mm/Kconfig b/mm/Kconfig
index 79d563d8f9e0..cb9d470f0bf7 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -381,7 +381,7 @@ config SHUFFLE_PAGE_ALLOCATOR
 	  the presence of a memory-side-cache. There are also incidental
 	  security benefits as it reduces the predictability of page
 	  allocations to compliment SLAB_FREELIST_RANDOM, but the
-	  default granularity of shuffling on the MAX_ORDER i.e, 10th
+	  default granularity of shuffling on the MAX_PAGE_ORDER i.e, 10th
 	  order of pages is selected based on cache utilization benefits
 	  on x86.
 
@@ -713,8 +713,8 @@ config HUGETLB_PAGE_SIZE_VARIABLE
 	  HUGETLB_PAGE_ORDER when there are multiple HugeTLB page sizes available
 	  on a platform.
 
-	  Note that the pageblock_order cannot exceed MAX_ORDER and will be
-	  clamped down to MAX_ORDER.
+	  Note that the pageblock_order cannot exceed MAX_PAGE_ORDER and will be
+	  clamped down to MAX_PAGE_ORDER.
 
 config CONTIG_ALLOC
 	def_bool (MEMORY_ISOLATION && COMPACTION) || CMA
diff --git a/mm/compaction.c b/mm/compaction.c
index 24f8eb4d6260..27ada42924d5 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -999,7 +999,7 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
 			 * a valid page order. Consider only values in the
 			 * valid order range to prevent low_pfn overflow.
 			 */
-			if (freepage_order > 0 && freepage_order <= MAX_ORDER) {
+			if (freepage_order > 0 && freepage_order <= MAX_PAGE_ORDER) {
 				low_pfn += (1UL << freepage_order) - 1;
 				nr_scanned += (1UL << freepage_order) - 1;
 			}
@@ -1017,7 +1017,7 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
 		if (PageCompound(page) && !cc->alloc_contig) {
 			const unsigned int order = compound_order(page);
 
-			if (likely(order <= MAX_ORDER)) {
+			if (likely(order <= MAX_PAGE_ORDER)) {
 				low_pfn += (1UL << order) - 1;
 				nr_scanned += (1UL << order) - 1;
 			}
diff --git a/mm/debug_page_alloc.c b/mm/debug_page_alloc.c
index f9d145730fd1..6755f0c9d4a3 100644
--- a/mm/debug_page_alloc.c
+++ b/mm/debug_page_alloc.c
@@ -22,7 +22,7 @@ static int __init debug_guardpage_minorder_setup(char *buf)
 {
 	unsigned long res;
 
-	if (kstrtoul(buf, 10, &res) < 0 ||  res > MAX_ORDER / 2) {
+	if (kstrtoul(buf, 10, &res) < 0 ||  res > MAX_PAGE_ORDER / 2) {
 		pr_err("Bad debug_guardpage_minorder value\n");
 		return 0;
 	}
diff --git a/mm/debug_vm_pgtable.c b/mm/debug_vm_pgtable.c
index e651500e597a..5662e29fe253 100644
--- a/mm/debug_vm_pgtable.c
+++ b/mm/debug_vm_pgtable.c
@@ -1091,7 +1091,7 @@ debug_vm_pgtable_alloc_huge_page(struct pgtable_debug_args *args, int order)
 	struct page *page = NULL;
 
 #ifdef CONFIG_CONTIG_ALLOC
-	if (order > MAX_ORDER) {
+	if (order > MAX_PAGE_ORDER) {
 		page = alloc_contig_pages((1 << order), GFP_KERNEL,
 					  first_online_node, NULL);
 		if (page) {
@@ -1101,7 +1101,7 @@ debug_vm_pgtable_alloc_huge_page(struct pgtable_debug_args *args, int order)
 	}
 #endif
 
-	if (order <= MAX_ORDER)
+	if (order <= MAX_PAGE_ORDER)
 		page = alloc_pages(GFP_KERNEL, order);
 
 	return page;
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 1a588e29d287..b9a7a57691d7 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -682,7 +682,7 @@ static int __init hugepage_init(void)
 	/*
 	 * hugepages can't be allocated by the buddy allocator
 	 */
-	MAYBE_BUILD_BUG_ON(HPAGE_PMD_ORDER > MAX_ORDER);
+	MAYBE_BUILD_BUG_ON(HPAGE_PMD_ORDER > MAX_PAGE_ORDER);
 	/*
 	 * we use page->mapping and page->index in second tail page
 	 * as list_head: assuming THP order >= 2
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 378e460a6ab4..0d262784ce60 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -3410,7 +3410,7 @@ static void __init prep_and_add_bootmem_folios(struct hstate *h,
 
 /*
  * Put bootmem huge pages into the standard lists after mem_map is up.
- * Note: This only applies to gigantic (order > MAX_ORDER) pages.
+ * Note: This only applies to gigantic (order > MAX_PAGE_ORDER) pages.
  */
 static void __init gather_bootmem_prealloc(void)
 {
@@ -4790,7 +4790,7 @@ static int __init default_hugepagesz_setup(char *s)
 	 * The number of default huge pages (for this size) could have been
 	 * specified as the first hugetlb parameter: hugepages=X.  If so,
 	 * then default_hstate_max_huge_pages is set.  If the default huge
-	 * page size is gigantic (> MAX_ORDER), then the pages must be
+	 * page size is gigantic (> MAX_PAGE_ORDER), then the pages must be
 	 * allocated here from bootmem allocator.
 	 */
 	if (default_hstate_max_huge_pages) {
diff --git a/mm/internal.h b/mm/internal.h
index ac40c3d00336..f309a010d50f 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -335,7 +335,7 @@ static inline bool page_is_buddy(struct page *page, struct page *buddy,
  * satisfies the following equation:
  *     P = B & ~(1 << O)
  *
- * Assumption: *_mem_map is contiguous at least up to MAX_ORDER
+ * Assumption: *_mem_map is contiguous at least up to MAX_PAGE_ORDER
  */
 static inline unsigned long
 __find_buddy_pfn(unsigned long page_pfn, unsigned int order)
diff --git a/mm/kmsan/init.c b/mm/kmsan/init.c
index 103e2e88ea03..3ac3b8921d36 100644
--- a/mm/kmsan/init.c
+++ b/mm/kmsan/init.c
@@ -141,7 +141,7 @@ struct smallstack {
 
 static struct smallstack collect = {
 	.index = 0,
-	.order = MAX_ORDER,
+	.order = MAX_PAGE_ORDER,
 };
 
 static void smallstack_push(struct smallstack *stack, struct page *pages)
@@ -211,8 +211,8 @@ static void kmsan_memblock_discard(void)
 	 *    order=N-1,
 	 *  - repeat.
 	 */
-	collect.order = MAX_ORDER;
-	for (int i = MAX_ORDER; i >= 0; i--) {
+	collect.order = MAX_PAGE_ORDER;
+	for (int i = MAX_PAGE_ORDER; i >= 0; i--) {
 		if (held_back[i].shadow)
 			smallstack_push(&collect, held_back[i].shadow);
 		if (held_back[i].origin)
diff --git a/mm/memblock.c b/mm/memblock.c
index 4a62f7774b65..8c194d8afeec 100644
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -2113,12 +2113,13 @@ static void __init __free_pages_memory(unsigned long start, unsigned long end)
 		 * Free the pages in the largest chunks alignment allows.
 		 *
 		 * __ffs() behaviour is undefined for 0. start == 0 is
-		 * MAX_ORDER-aligned, set order to MAX_ORDER for the case.
+		 * MAX_PAGE_ORDER-aligned, set order to MAX_PAGE_ORDER for
+		 * the case.
 		 */
 		if (start)
-			order = min_t(int, MAX_ORDER, __ffs(start));
+			order = min_t(int, MAX_PAGE_ORDER, __ffs(start));
 		else
-			order = MAX_ORDER;
+			order = MAX_PAGE_ORDER;
 
 		while (start + (1UL << order) > end)
 			order--;
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 926e1cfb10e9..b3c0ff52bb72 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -645,7 +645,7 @@ static void online_pages_range(unsigned long start_pfn, unsigned long nr_pages)
 	unsigned long pfn;
 
 	/*
-	 * Online the pages in MAX_ORDER aligned chunks. The callback might
+	 * Online the pages in MAX_PAGE_ORDER aligned chunks. The callback might
 	 * decide to not expose all pages to the buddy (e.g., expose them
 	 * later). We account all pages as being online and belonging to this
 	 * zone ("present").
@@ -660,12 +660,13 @@ static void online_pages_range(unsigned long start_pfn, unsigned long nr_pages)
 		 * Free to online pages in the largest chunks alignment allows.
 		 *
 		 * __ffs() behaviour is undefined for 0. start == 0 is
-		 * MAX_ORDER-aligned, Set order to MAX_ORDER for the case.
+		 * MAX_PAGE_ORDER-aligned, Set order to MAX_PAGE_ORDER for
+		 * the case.
 		 */
 		if (pfn)
-			order = min_t(int, MAX_ORDER, __ffs(pfn));
+			order = min_t(int, MAX_PAGE_ORDER, __ffs(pfn));
 		else
-			order = MAX_ORDER;
+			order = MAX_PAGE_ORDER;
 
 		(*online_page_callback)(pfn_to_page(pfn), order);
 		pfn += (1UL << order);
diff --git a/mm/mm_init.c b/mm/mm_init.c
index 2830eef2b16c..89dc29f1e6c6 100644
--- a/mm/mm_init.c
+++ b/mm/mm_init.c
@@ -1455,7 +1455,7 @@ static inline void setup_usemap(struct zone *zone) {}
 /* Initialise the number of pages represented by NR_PAGEBLOCK_BITS */
 void __init set_pageblock_order(void)
 {
-	unsigned int order = MAX_ORDER;
+	unsigned int order = MAX_PAGE_ORDER;
 
 	/* Check that pageblock_nr_pages has not already been setup */
 	if (pageblock_order)
@@ -1638,7 +1638,7 @@ static void __init alloc_node_mem_map(struct pglist_data *pgdat)
 	start = pgdat->node_start_pfn & ~(MAX_ORDER_NR_PAGES - 1);
 	offset = pgdat->node_start_pfn - start;
 	/*
-	 * The zone's endpoints aren't required to be MAX_ORDER
+		 * The zone's endpoints aren't required to be MAX_PAGE_ORDER
 	 * aligned but the node_mem_map endpoints must be in order
 	 * for the buddy allocator to function correctly.
 	 */
@@ -1964,11 +1964,11 @@ static void __init deferred_free_range(unsigned long pfn,
 	if (nr_pages == MAX_ORDER_NR_PAGES && IS_MAX_ORDER_ALIGNED(pfn)) {
 		for (i = 0; i < nr_pages; i += pageblock_nr_pages)
 			set_pageblock_migratetype(page + i, MIGRATE_MOVABLE);
-		__free_pages_core(page, MAX_ORDER);
+		__free_pages_core(page, MAX_PAGE_ORDER);
 		return;
 	}
 
-	/* Accept chunks smaller than MAX_ORDER upfront */
+	/* Accept chunks smaller than MAX_PAGE_ORDER upfront */
 	accept_memory(PFN_PHYS(pfn), PFN_PHYS(pfn + nr_pages));
 
 	for (i = 0; i < nr_pages; i++, page++, pfn++) {
@@ -1991,8 +1991,8 @@ static inline void __init pgdat_init_report_one_done(void)
 /*
  * Returns true if page needs to be initialized or freed to buddy allocator.
  *
- * We check if a current MAX_ORDER block is valid by only checking the validity
- * of the head pfn.
+ * We check if a current MAX_PAGE_ORDER block is valid by only checking the
+ * validity of the head pfn.
  */
 static inline bool __init deferred_pfn_valid(unsigned long pfn)
 {
@@ -2149,8 +2149,8 @@ deferred_init_memmap_chunk(unsigned long start_pfn, unsigned long end_pfn,
 	deferred_init_mem_pfn_range_in_zone(&i, zone, &spfn, &epfn, start_pfn);
 
 	/*
-	 * Initialize and free pages in MAX_ORDER sized increments so that we
-	 * can avoid introducing any issues with the buddy allocator.
+	 * Initialize and free pages in MAX_PAGE_ORDER sized increments so that
+	 * we can avoid introducing any issues with the buddy allocator.
 	 */
 	while (spfn < end_pfn) {
 		deferred_init_maxorder(&i, zone, &spfn, &epfn);
@@ -2291,7 +2291,7 @@ bool __init deferred_grow_zone(struct zone *zone, unsigned int order)
 	}
 
 	/*
-	 * Initialize and free pages in MAX_ORDER sized increments so
+	 * Initialize and free pages in MAX_PAGE_ORDER sized increments so
 	 * that we can avoid introducing any issues with the buddy
 	 * allocator.
 	 */
@@ -2509,7 +2509,7 @@ void *__init alloc_large_system_hash(const char *tablename,
 			else
 				table = memblock_alloc_raw(size,
 							   SMP_CACHE_BYTES);
-		} else if (get_order(size) > MAX_ORDER || hashdist) {
+		} else if (get_order(size) > MAX_PAGE_ORDER || hashdist) {
 			table = vmalloc_huge(size, gfp_flags);
 			virt = true;
 			if (table)
@@ -2756,7 +2756,7 @@ void __init mm_core_init(void)
 
 	/*
 	 * page_ext requires contiguous pages,
-	 * bigger than MAX_ORDER unless SPARSEMEM.
+	 * bigger than MAX_PAGE_ORDER unless SPARSEMEM.
 	 */
 	page_ext_init_flatmem();
 	mem_debugging_and_hardening_init();
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index ccecf6158ae4..a01baf0454f8 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -727,7 +727,7 @@ buddy_merge_likely(unsigned long pfn, unsigned long buddy_pfn,
 	unsigned long higher_page_pfn;
 	struct page *higher_page;
 
-	if (order >= MAX_ORDER - 1)
+	if (order >= MAX_PAGE_ORDER - 1)
 		return false;
 
 	higher_page_pfn = buddy_pfn & pfn;
@@ -782,7 +782,7 @@ static inline void __free_one_page(struct page *page,
 	VM_BUG_ON_PAGE(pfn & ((1 << order) - 1), page);
 	VM_BUG_ON_PAGE(bad_range(zone, page), page);
 
-	while (order < MAX_ORDER) {
+	while (order < MAX_PAGE_ORDER) {
 		if (compaction_capture(capc, page, order, migratetype)) {
 			__mod_zone_freepage_state(zone, -(1 << order),
 								migratetype);
@@ -1297,7 +1297,7 @@ void __free_pages_core(struct page *page, unsigned int order)
 	atomic_long_add(nr_pages, &page_zone(page)->managed_pages);
 
 	if (page_contains_unaccepted(page, order)) {
-		if (order == MAX_ORDER && __free_unaccepted(page))
+		if (order == MAX_PAGE_ORDER && __free_unaccepted(page))
 			return;
 
 		accept_page(page, order);
@@ -1327,7 +1327,7 @@ void __free_pages_core(struct page *page, unsigned int order)
  *
  * Note: the function may return non-NULL struct page even for a page block
  * which contains a memory hole (i.e. there is no physical memory for a subset
- * of the pfn range). For example, if the pageblock order is MAX_ORDER, which
+ * of the pfn range). For example, if the pageblock order is MAX_PAGE_ORDER, which
  * will fall into 2 sub-sections, and the end pfn of the pageblock may be hole
  * even though the start pfn is online and valid. This should be safe most of
  * the time because struct pages are still initialized via init_unavailable_range()
@@ -2018,7 +2018,7 @@ __rmqueue_fallback(struct zone *zone, int order, int start_migratetype,
 	 * approximates finding the pageblock with the most free pages, which
 	 * would be too costly to do exactly.
 	 */
-	for (current_order = MAX_ORDER; current_order >= min_order;
+	for (current_order = MAX_PAGE_ORDER; current_order >= min_order;
 				--current_order) {
 		area = &(zone->free_area[current_order]);
 		fallback_mt = find_suitable_fallback(area, current_order,
@@ -2056,7 +2056,7 @@ find_smallest:
 	 * This should not happen - we already found a suitable fallback
 	 * when looking for the largest page.
 	 */
-	VM_BUG_ON(current_order > MAX_ORDER);
+	VM_BUG_ON(current_order > MAX_PAGE_ORDER);
 
 do_steal:
 	page = get_page_from_free_area(area, fallback_mt);
@@ -4533,7 +4533,7 @@ struct page *__alloc_pages(gfp_t gfp, unsigned int order, int preferred_nid,
 	 * There are several places where we assume that the order value is sane
 	 * so bail out early if the request is out of bound.
 	 */
-	if (WARN_ON_ONCE_GFP(order > MAX_ORDER, gfp))
+	if (WARN_ON_ONCE_GFP(order > MAX_PAGE_ORDER, gfp))
 		return NULL;
 
 	gfp &= gfp_allowed_mask;
@@ -4815,7 +4815,7 @@ static void *make_alloc_exact(unsigned long addr, unsigned int order,
  * minimum number of pages to satisfy the request.  alloc_pages() can only
  * allocate memory in power-of-two pages.
  *
- * This function is also limited by MAX_ORDER.
+ * This function is also limited by MAX_PAGE_ORDER.
  *
  * Memory allocated by this function must be released by free_pages_exact().
  *
@@ -6373,7 +6373,7 @@ int alloc_contig_range(unsigned long start, unsigned long end,
 	order = 0;
 	outer_start = start;
 	while (!PageBuddy(pfn_to_page(outer_start))) {
-		if (++order > MAX_ORDER) {
+		if (++order > MAX_PAGE_ORDER) {
 			outer_start = start;
 			break;
 		}
@@ -6635,7 +6635,7 @@ bool is_free_buddy_page(struct page *page)
 			break;
 	}
 
-	return order <= MAX_ORDER;
+	return order <= MAX_PAGE_ORDER;
 }
 EXPORT_SYMBOL(is_free_buddy_page);
 
@@ -6807,9 +6807,9 @@ static bool try_to_accept_memory_one(struct zone *zone)
 	__mod_zone_page_state(zone, NR_UNACCEPTED, -MAX_ORDER_NR_PAGES);
 	spin_unlock_irqrestore(&zone->lock, flags);
 
-	accept_page(page, MAX_ORDER);
+	accept_page(page, MAX_PAGE_ORDER);
 
-	__free_pages_ok(page, MAX_ORDER, FPI_TO_TAIL);
+	__free_pages_ok(page, MAX_PAGE_ORDER, FPI_TO_TAIL);
 
 	if (last)
 		static_branch_dec(&zones_with_unaccepted_pages);
diff --git a/mm/page_isolation.c b/mm/page_isolation.c
index bcf99ba747a0..cd0ea3668253 100644
--- a/mm/page_isolation.c
+++ b/mm/page_isolation.c
@@ -226,7 +226,7 @@ static void unset_migratetype_isolate(struct page *page, int migratetype)
 	 */
 	if (PageBuddy(page)) {
 		order = buddy_order(page);
-		if (order >= pageblock_order && order < MAX_ORDER) {
+		if (order >= pageblock_order && order < MAX_PAGE_ORDER) {
 			buddy = find_buddy_page_pfn(page, page_to_pfn(page),
 						    order, NULL);
 			if (buddy && !is_migrate_isolate_page(buddy)) {
@@ -290,11 +290,12 @@ __first_valid_page(unsigned long pfn, unsigned long nr_pages)
  *			isolate_single_pageblock()
  * @migratetype:	migrate type to set in error recovery.
  *
- * Free and in-use pages can be as big as MAX_ORDER and contain more than one
+ * Free and in-use pages can be as big as MAX_PAGE_ORDER and contain more than one
  * pageblock. When not all pageblocks within a page are isolated at the same
  * time, free page accounting can go wrong. For example, in the case of
- * MAX_ORDER = pageblock_order + 1, a MAX_ORDER page has two pagelbocks.
- * [         MAX_ORDER           ]
+ * MAX_PAGE_ORDER = pageblock_order + 1, a MAX_PAGE_ORDER page has two
+ * pagelbocks.
+ * [      MAX_PAGE_ORDER         ]
  * [  pageblock0  |  pageblock1  ]
  * When either pageblock is isolated, if it is a free page, the page is not
  * split into separate migratetype lists, which is supposed to; if it is an
@@ -451,7 +452,7 @@ static int isolate_single_pageblock(unsigned long boundary_pfn, int flags,
 				 * the free page to the right migratetype list.
 				 *
 				 * head_pfn is not used here as a hugetlb page order
-				 * can be bigger than MAX_ORDER, but after it is
+				 * can be bigger than MAX_PAGE_ORDER, but after it is
 				 * freed, the free page order is not. Use pfn within
 				 * the range to find the head of the free page.
 				 */
@@ -459,7 +460,7 @@ static int isolate_single_pageblock(unsigned long boundary_pfn, int flags,
 				outer_pfn = pfn;
 				while (!PageBuddy(pfn_to_page(outer_pfn))) {
 					/* stop if we cannot find the free page */
-					if (++order > MAX_ORDER)
+					if (++order > MAX_PAGE_ORDER)
 						goto failed;
 					outer_pfn &= ~0UL << order;
 				}
@@ -660,8 +661,8 @@ int test_pages_isolated(unsigned long start_pfn, unsigned long end_pfn,
 	int ret;
 
 	/*
-	 * Note: pageblock_nr_pages != MAX_ORDER. Then, chunks of free pages
-	 * are not aligned to pageblock_nr_pages.
+	 * Note: pageblock_nr_pages != MAX_PAGE_ORDER. Then, chunks of free
+	 * pages are not aligned to pageblock_nr_pages.
 	 * Then we just check migratetype first.
 	 */
 	for (pfn = start_pfn; pfn < end_pfn; pfn += pageblock_nr_pages) {
diff --git a/mm/page_owner.c b/mm/page_owner.c
index 040dbf26a986..5634e5d890f8 100644
--- a/mm/page_owner.c
+++ b/mm/page_owner.c
@@ -320,7 +320,7 @@ void pagetypeinfo_showmixedcount_print(struct seq_file *m,
 				unsigned long freepage_order;
 
 				freepage_order = buddy_order_unsafe(page);
-				if (freepage_order <= MAX_ORDER)
+				if (freepage_order <= MAX_PAGE_ORDER)
 					pfn += (1UL << freepage_order) - 1;
 				continue;
 			}
@@ -555,7 +555,7 @@ read_page_owner(struct file *file, char __user *buf, size_t count, loff_t *ppos)
 		if (PageBuddy(page)) {
 			unsigned long freepage_order = buddy_order_unsafe(page);
 
-			if (freepage_order <= MAX_ORDER)
+			if (freepage_order <= MAX_PAGE_ORDER)
 				pfn += (1UL << freepage_order) - 1;
 			continue;
 		}
@@ -663,7 +663,7 @@ static void init_pages_in_zone(pg_data_t *pgdat, struct zone *zone)
 			if (PageBuddy(page)) {
 				unsigned long order = buddy_order_unsafe(page);
 
-				if (order > 0 && order <= MAX_ORDER)
+				if (order > 0 && order <= MAX_PAGE_ORDER)
 					pfn += (1UL << order) - 1;
 				continue;
 			}
diff --git a/mm/page_reporting.c b/mm/page_reporting.c
index 66369cc5279b..e4c428e61d8c 100644
--- a/mm/page_reporting.c
+++ b/mm/page_reporting.c
@@ -20,7 +20,7 @@ static int page_order_update_notify(const char *val, const struct kernel_param *
 	 * If param is set beyond this limit, order is set to default
 	 * pageblock_order value
 	 */
-	return  param_set_uint_minmax(val, kp, 0, MAX_ORDER);
+	return  param_set_uint_minmax(val, kp, 0, MAX_PAGE_ORDER);
 }
 
 static const struct kernel_param_ops page_reporting_param_ops = {
@@ -370,7 +370,7 @@ int page_reporting_register(struct page_reporting_dev_info *prdev)
 	 */
 
 	if (page_reporting_order == -1) {
-		if (prdev->order > 0 && prdev->order <= MAX_ORDER)
+		if (prdev->order > 0 && prdev->order <= MAX_PAGE_ORDER)
 			page_reporting_order = prdev->order;
 		else
 			page_reporting_order = pageblock_order;
diff --git a/mm/shuffle.h b/mm/shuffle.h
index a6bdf54f96f1..61bbcddeeee6 100644
--- a/mm/shuffle.h
+++ b/mm/shuffle.h
@@ -4,7 +4,7 @@
 #define _MM_SHUFFLE_H
 #include <linux/jump_label.h>
 
-#define SHUFFLE_ORDER MAX_ORDER
+#define SHUFFLE_ORDER MAX_PAGE_ORDER
 
 #ifdef CONFIG_SHUFFLE_PAGE_ALLOCATOR
 DECLARE_STATIC_KEY_FALSE(page_alloc_shuffle_key);
diff --git a/mm/slab.c b/mm/slab.c
index 773c79e153f3..073cae923d56 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -465,7 +465,7 @@ static int __init slab_max_order_setup(char *str)
 {
 	get_option(&str, &slab_max_order);
 	slab_max_order = slab_max_order < 0 ? 0 :
-				min(slab_max_order, MAX_ORDER);
+				min(slab_max_order, MAX_PAGE_ORDER);
 	slab_max_order_set = true;
 
 	return 1;
diff --git a/mm/slub.c b/mm/slub.c
index a5420be89c8c..ba162e661e2e 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -4194,7 +4194,7 @@ static inline int calculate_order(unsigned int size)
 	 * Doh this slab cannot be placed using slub_max_order.
 	 */
 	order = get_order(size);
-	if (order <= MAX_ORDER)
+	if (order <= MAX_PAGE_ORDER)
 		return order;
 	return -ENOSYS;
 }
@@ -4722,7 +4722,7 @@ __setup("slub_min_order=", setup_slub_min_order);
 static int __init setup_slub_max_order(char *str)
 {
 	get_option(&str, (int *)&slub_max_order);
-	slub_max_order = min_t(unsigned int, slub_max_order, MAX_ORDER);
+	slub_max_order = min_t(unsigned int, slub_max_order, MAX_PAGE_ORDER);
 
 	if (slub_min_order > slub_max_order)
 		slub_min_order = slub_max_order;
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 600ed3cbf7cb..68f0abbb8e59 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -6415,7 +6415,7 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
 	 * scan_control uses s8 fields for order, priority, and reclaim_idx.
 	 * Confirm they are large enough for max values.
 	 */
-	BUILD_BUG_ON(MAX_ORDER >= S8_MAX);
+	BUILD_BUG_ON(MAX_PAGE_ORDER >= S8_MAX);
 	BUILD_BUG_ON(DEF_PRIORITY > S8_MAX);
 	BUILD_BUG_ON(MAX_NR_ZONES > S8_MAX);
 
diff --git a/mm/vmstat.c b/mm/vmstat.c
index 03ead31c46a0..db79935e4a54 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -1092,7 +1092,7 @@ static int __fragmentation_index(unsigned int order, struct contig_page_info *in
 {
 	unsigned long requested = 1UL << order;
 
-	if (WARN_ON_ONCE(order > MAX_ORDER))
+	if (WARN_ON_ONCE(order > MAX_PAGE_ORDER))
 		return 0;
 
 	if (!info->free_blocks_total)
diff --git a/net/smc/smc_ib.c b/net/smc/smc_ib.c
index 89981dbe46c9..97704a9e84c7 100644
--- a/net/smc/smc_ib.c
+++ b/net/smc/smc_ib.c
@@ -844,7 +844,7 @@ long smc_ib_setup_per_ibdev(struct smc_ib_device *smcibdev)
 		goto out;
 	/* the calculated number of cq entries fits to mlx5 cq allocation */
 	cqe_size_order = cache_line_size() == 128 ? 7 : 6;
-	smc_order = MAX_ORDER - cqe_size_order;
+	smc_order = MAX_PAGE_ORDER - cqe_size_order;
 	if (SMC_MAX_CQE + 2 > (0x00000001 << smc_order) * PAGE_SIZE)
 		cqattr.cqe = (0x00000001 << smc_order) * PAGE_SIZE - 2;
 	smcibdev->roce_cq_send = ib_create_cq(smcibdev->ibdev,
diff --git a/security/integrity/ima/ima_crypto.c b/security/integrity/ima/ima_crypto.c
index 51ad29940f05..f3738b2c8bcd 100644
--- a/security/integrity/ima/ima_crypto.c
+++ b/security/integrity/ima/ima_crypto.c
@@ -38,7 +38,7 @@ static int param_set_bufsize(const char *val, const struct kernel_param *kp)
 
 	size = memparse(val, NULL);
 	order = get_order(size);
-	if (order > MAX_ORDER)
+	if (order > MAX_PAGE_ORDER)
 		return -EINVAL;
 	ima_maxorder = order;
 	ima_bufsize = PAGE_SIZE << order;
diff --git a/tools/perf/Documentation/perf-intel-pt.txt b/tools/perf/Documentation/perf-intel-pt.txt
index 4c90cc176f81..2109690b0d5f 100644
--- a/tools/perf/Documentation/perf-intel-pt.txt
+++ b/tools/perf/Documentation/perf-intel-pt.txt
@@ -683,7 +683,7 @@ Buffer handling
 ~~~~~~~~~~~~~~~
 
 There may be buffer limitations (i.e. single ToPa entry) which means that actual
-buffer sizes are limited to powers of 2 up to 4MiB (MAX_ORDER).  In order to
+buffer sizes are limited to powers of 2 up to 4MiB (MAX_PAGE_ORDER).  In order to
 provide other sizes, and in particular an arbitrarily large size, multiple
 buffers are logically concatenated.  However an interrupt must be used to switch
 between buffers.  That has two potential problems:
diff --git a/tools/testing/memblock/linux/mmzone.h b/tools/testing/memblock/linux/mmzone.h
index 134f8eab0768..71546e15bdd3 100644
--- a/tools/testing/memblock/linux/mmzone.h
+++ b/tools/testing/memblock/linux/mmzone.h
@@ -17,10 +17,10 @@ enum zone_type {
 };
 
 #define MAX_NR_ZONES __MAX_NR_ZONES
-#define MAX_ORDER 10
-#define MAX_ORDER_NR_PAGES (1 << MAX_ORDER)
+#define MAX_PAGE_ORDER 10
+#define MAX_ORDER_NR_PAGES (1 << MAX_PAGE_ORDER)
 
-#define pageblock_order		MAX_ORDER
+#define pageblock_order		MAX_PAGE_ORDER
 #define pageblock_nr_pages	BIT(pageblock_order)
 #define pageblock_align(pfn)	ALIGN((pfn), pageblock_nr_pages)
 #define pageblock_start_pfn(pfn)	ALIGN_DOWN((pfn), pageblock_nr_pages)
diff --git a/tools/testing/selftests/mm/thuge-gen.c b/tools/testing/selftests/mm/thuge-gen.c
index 16ed4dfa7359..622987f12c89 100644
--- a/tools/testing/selftests/mm/thuge-gen.c
+++ b/tools/testing/selftests/mm/thuge-gen.c
@@ -3,7 +3,8 @@
 
    Before running this huge pages for each huge page size must have been
    reserved.
-   For large pages beyond MAX_ORDER (like 1GB on x86) boot options must be used.
+   For large pages beyond MAX_PAGE_ORDER (like 1GB on x86) boot options must
+   be used.
    Also shmmax must be increased.
    And you need to run as root to work around some weird permissions in shm.
    And nothing using huge pages should run in parallel.
-- 
cgit v1.2.3


From 2fb7e4dd35c52933b18ff127bf92d703c8e2e897 Mon Sep 17 00:00:00 2001
From: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Date: Fri, 5 Jan 2024 13:51:21 +0100
Subject: PNP: make pnp_bus_type const

Now that the driver core can properly handle constant struct bus_type,
move the pnp_bus_type variable to be a constant structure as well,
placing it into read-only memory which can not be modified at runtime.

Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/pnp/driver.c | 2 +-
 include/linux/pnp.h  | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/pnp/driver.c b/drivers/pnp/driver.c
index 46c534f6b1c9..0a5d0d8befa8 100644
--- a/drivers/pnp/driver.c
+++ b/drivers/pnp/driver.c
@@ -256,7 +256,7 @@ static const struct dev_pm_ops pnp_bus_dev_pm_ops = {
 	.restore = pnp_bus_resume,
 };
 
-struct bus_type pnp_bus_type = {
+const struct bus_type pnp_bus_type = {
 	.name    = "pnp",
 	.match   = pnp_bus_match,
 	.probe   = pnp_device_probe,
diff --git a/include/linux/pnp.h b/include/linux/pnp.h
index 267fb8a4fb6e..ddbe7c3ca4ce 100644
--- a/include/linux/pnp.h
+++ b/include/linux/pnp.h
@@ -435,7 +435,7 @@ struct pnp_protocol {
 #define protocol_for_each_dev(protocol, dev)	\
 	list_for_each_entry(dev, &(protocol)->devices, protocol_list)
 
-extern struct bus_type pnp_bus_type;
+extern const struct bus_type pnp_bus_type;
 
 #if defined(CONFIG_PNP)
 
-- 
cgit v1.2.3


From 6eade110754c085cee9e46f4d87d2c3ea4e59e8c Mon Sep 17 00:00:00 2001
From: Ira Weiny <ira.weiny@intel.com>
Date: Wed, 20 Dec 2023 16:17:32 -0800
Subject: cxl/events: Separate UUID from event structures

The UEFI CXL CPER structure does not include the UUID.  Now that the
UUID is passed separately to the trace event there is no need to have
the UUID in those structures.

Move UUID from the event record header to the raw structures.  Adjust
cxl-test to Create dummy structures for creating test records.

Signed-off-by: Ira Weiny <ira.weiny@intel.com>
Link: https://lore.kernel.org/r/20231220-cxl-cper-v5-5-1bb8a4ca2c7a@intel.com
Reviewed-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
Acked-by: Ard Biesheuvel <ardb@kernel.org>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 drivers/cxl/core/mbox.c      |   2 +-
 include/linux/cxl-event.h    |   6 +-
 tools/testing/cxl/test/mem.c | 129 +++++++++++++++++++++++++------------------
 3 files changed, 81 insertions(+), 56 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/cxl/core/mbox.c b/drivers/cxl/core/mbox.c
index 5f3681de10de..4c5161896826 100644
--- a/drivers/cxl/core/mbox.c
+++ b/drivers/cxl/core/mbox.c
@@ -840,7 +840,7 @@ static void cxl_event_trace_record(const struct cxl_memdev *cxlmd,
 				   enum cxl_event_log_type type,
 				   struct cxl_event_record_raw *record)
 {
-	uuid_t *id = &record->hdr.id;
+	uuid_t *id = &record->id;
 
 	if (uuid_equal(id, &CXL_EVENT_GEN_MEDIA_UUID)) {
 		struct cxl_event_gen_media *rec =
diff --git a/include/linux/cxl-event.h b/include/linux/cxl-event.h
index 0fc068123f8e..3d9b5954d0c1 100644
--- a/include/linux/cxl-event.h
+++ b/include/linux/cxl-event.h
@@ -8,7 +8,6 @@
  * CXL rev 3.0 section 8.2.9.2.1; Table 8-42
  */
 struct cxl_event_record_hdr {
-	uuid_t id;
 	u8 length;
 	u8 flags[3];
 	__le16 handle;
@@ -18,8 +17,13 @@ struct cxl_event_record_hdr {
 	u8 reserved[15];
 } __packed;
 
+/*
+ * Common Event Record Format
+ * CXL rev 3.0 section 8.2.9.2.1; Table 8-42
+ */
 #define CXL_EVENT_RECORD_DATA_LENGTH 0x50
 struct cxl_event_record_raw {
+	uuid_t id;
 	struct cxl_event_record_hdr hdr;
 	u8 data[CXL_EVENT_RECORD_DATA_LENGTH];
 } __packed;
diff --git a/tools/testing/cxl/test/mem.c b/tools/testing/cxl/test/mem.c
index 5a95b04b329a..9cc2b8ce1efd 100644
--- a/tools/testing/cxl/test/mem.c
+++ b/tools/testing/cxl/test/mem.c
@@ -337,9 +337,9 @@ static void cxl_mock_event_trigger(struct device *dev)
 }
 
 struct cxl_event_record_raw maint_needed = {
+	.id = UUID_INIT(0xBA5EBA11, 0xABCD, 0xEFEB,
+			0xa5, 0x5a, 0xa5, 0x5a, 0xa5, 0xa5, 0x5a, 0xa5),
 	.hdr = {
-		.id = UUID_INIT(0xBA5EBA11, 0xABCD, 0xEFEB,
-				0xa5, 0x5a, 0xa5, 0x5a, 0xa5, 0xa5, 0x5a, 0xa5),
 		.length = sizeof(struct cxl_event_record_raw),
 		.flags[0] = CXL_EVENT_RECORD_FLAG_MAINT_NEEDED,
 		/* .handle = Set dynamically */
@@ -349,9 +349,9 @@ struct cxl_event_record_raw maint_needed = {
 };
 
 struct cxl_event_record_raw hardware_replace = {
+	.id = UUID_INIT(0xABCDEFEB, 0xBA11, 0xBA5E,
+			0xa5, 0x5a, 0xa5, 0x5a, 0xa5, 0xa5, 0x5a, 0xa5),
 	.hdr = {
-		.id = UUID_INIT(0xABCDEFEB, 0xBA11, 0xBA5E,
-				0xa5, 0x5a, 0xa5, 0x5a, 0xa5, 0xa5, 0x5a, 0xa5),
 		.length = sizeof(struct cxl_event_record_raw),
 		.flags[0] = CXL_EVENT_RECORD_FLAG_HW_REPLACE,
 		/* .handle = Set dynamically */
@@ -360,61 +360,82 @@ struct cxl_event_record_raw hardware_replace = {
 	.data = { 0xDE, 0xAD, 0xBE, 0xEF },
 };
 
-struct cxl_event_gen_media gen_media = {
-	.hdr = {
-		.id = CXL_EVENT_GEN_MEDIA_UUID,
-		.length = sizeof(struct cxl_event_gen_media),
-		.flags[0] = CXL_EVENT_RECORD_FLAG_PERMANENT,
-		/* .handle = Set dynamically */
-		.related_handle = cpu_to_le16(0),
+struct cxl_test_gen_media {
+	uuid_t id;
+	struct cxl_event_gen_media rec;
+} __packed;
+
+struct cxl_test_gen_media gen_media = {
+	.id = CXL_EVENT_GEN_MEDIA_UUID,
+	.rec = {
+		.hdr = {
+			.length = sizeof(struct cxl_test_gen_media),
+			.flags[0] = CXL_EVENT_RECORD_FLAG_PERMANENT,
+			/* .handle = Set dynamically */
+			.related_handle = cpu_to_le16(0),
+		},
+		.phys_addr = cpu_to_le64(0x2000),
+		.descriptor = CXL_GMER_EVT_DESC_UNCORECTABLE_EVENT,
+		.type = CXL_GMER_MEM_EVT_TYPE_DATA_PATH_ERROR,
+		.transaction_type = CXL_GMER_TRANS_HOST_WRITE,
+		/* .validity_flags = <set below> */
+		.channel = 1,
+		.rank = 30
 	},
-	.phys_addr = cpu_to_le64(0x2000),
-	.descriptor = CXL_GMER_EVT_DESC_UNCORECTABLE_EVENT,
-	.type = CXL_GMER_MEM_EVT_TYPE_DATA_PATH_ERROR,
-	.transaction_type = CXL_GMER_TRANS_HOST_WRITE,
-	/* .validity_flags = <set below> */
-	.channel = 1,
-	.rank = 30
 };
 
-struct cxl_event_dram dram = {
-	.hdr = {
-		.id = CXL_EVENT_DRAM_UUID,
-		.length = sizeof(struct cxl_event_dram),
-		.flags[0] = CXL_EVENT_RECORD_FLAG_PERF_DEGRADED,
-		/* .handle = Set dynamically */
-		.related_handle = cpu_to_le16(0),
+struct cxl_test_dram {
+	uuid_t id;
+	struct cxl_event_dram rec;
+} __packed;
+
+struct cxl_test_dram dram = {
+	.id = CXL_EVENT_DRAM_UUID,
+	.rec = {
+		.hdr = {
+			.length = sizeof(struct cxl_test_dram),
+			.flags[0] = CXL_EVENT_RECORD_FLAG_PERF_DEGRADED,
+			/* .handle = Set dynamically */
+			.related_handle = cpu_to_le16(0),
+		},
+		.phys_addr = cpu_to_le64(0x8000),
+		.descriptor = CXL_GMER_EVT_DESC_THRESHOLD_EVENT,
+		.type = CXL_GMER_MEM_EVT_TYPE_INV_ADDR,
+		.transaction_type = CXL_GMER_TRANS_INTERNAL_MEDIA_SCRUB,
+		/* .validity_flags = <set below> */
+		.channel = 1,
+		.bank_group = 5,
+		.bank = 2,
+		.column = {0xDE, 0xAD},
 	},
-	.phys_addr = cpu_to_le64(0x8000),
-	.descriptor = CXL_GMER_EVT_DESC_THRESHOLD_EVENT,
-	.type = CXL_GMER_MEM_EVT_TYPE_INV_ADDR,
-	.transaction_type = CXL_GMER_TRANS_INTERNAL_MEDIA_SCRUB,
-	/* .validity_flags = <set below> */
-	.channel = 1,
-	.bank_group = 5,
-	.bank = 2,
-	.column = {0xDE, 0xAD},
 };
 
-struct cxl_event_mem_module mem_module = {
-	.hdr = {
-		.id = CXL_EVENT_MEM_MODULE_UUID,
-		.length = sizeof(struct cxl_event_mem_module),
-		/* .handle = Set dynamically */
-		.related_handle = cpu_to_le16(0),
+struct cxl_test_mem_module {
+	uuid_t id;
+	struct cxl_event_mem_module rec;
+} __packed;
+
+struct cxl_test_mem_module mem_module = {
+	.id = CXL_EVENT_MEM_MODULE_UUID,
+	.rec = {
+		.hdr = {
+			.length = sizeof(struct cxl_test_mem_module),
+			/* .handle = Set dynamically */
+			.related_handle = cpu_to_le16(0),
+		},
+		.event_type = CXL_MMER_TEMP_CHANGE,
+		.info = {
+			.health_status = CXL_DHI_HS_PERFORMANCE_DEGRADED,
+			.media_status = CXL_DHI_MS_ALL_DATA_LOST,
+			.add_status = (CXL_DHI_AS_CRITICAL << 2) |
+				      (CXL_DHI_AS_WARNING << 4) |
+				      (CXL_DHI_AS_WARNING << 5),
+			.device_temp = { 0xDE, 0xAD},
+			.dirty_shutdown_cnt = { 0xde, 0xad, 0xbe, 0xef },
+			.cor_vol_err_cnt = { 0xde, 0xad, 0xbe, 0xef },
+			.cor_per_err_cnt = { 0xde, 0xad, 0xbe, 0xef },
+		}
 	},
-	.event_type = CXL_MMER_TEMP_CHANGE,
-	.info = {
-		.health_status = CXL_DHI_HS_PERFORMANCE_DEGRADED,
-		.media_status = CXL_DHI_MS_ALL_DATA_LOST,
-		.add_status = (CXL_DHI_AS_CRITICAL << 2) |
-			      (CXL_DHI_AS_WARNING << 4) |
-			      (CXL_DHI_AS_WARNING << 5),
-		.device_temp = { 0xDE, 0xAD},
-		.dirty_shutdown_cnt = { 0xde, 0xad, 0xbe, 0xef },
-		.cor_vol_err_cnt = { 0xde, 0xad, 0xbe, 0xef },
-		.cor_per_err_cnt = { 0xde, 0xad, 0xbe, 0xef },
-	}
 };
 
 static int mock_set_timestamp(struct cxl_dev_state *cxlds,
@@ -436,11 +457,11 @@ static int mock_set_timestamp(struct cxl_dev_state *cxlds,
 static void cxl_mock_add_event_logs(struct mock_event_store *mes)
 {
 	put_unaligned_le16(CXL_GMER_VALID_CHANNEL | CXL_GMER_VALID_RANK,
-			   &gen_media.validity_flags);
+			   &gen_media.rec.validity_flags);
 
 	put_unaligned_le16(CXL_DER_VALID_CHANNEL | CXL_DER_VALID_BANK_GROUP |
 			   CXL_DER_VALID_BANK | CXL_DER_VALID_COLUMN,
-			   &dram.validity_flags);
+			   &dram.rec.validity_flags);
 
 	mes_add_event(mes, CXL_EVENT_TYPE_INFO, &maint_needed);
 	mes_add_event(mes, CXL_EVENT_TYPE_INFO,
-- 
cgit v1.2.3


From f9c683386f5bc0364615138ce2b14be50848dbcf Mon Sep 17 00:00:00 2001
From: Ira Weiny <ira.weiny@intel.com>
Date: Wed, 20 Dec 2023 16:17:33 -0800
Subject: cxl/events: Create a CXL event union

The CXL CPER and event log records share everything but a UUID/GUID in
their structures.

Define a cxl_event union without the UUID/GUID to be shared between the
CPER and event log record formats.  Adjust the code to use this union.

Signed-off-by: Ira Weiny <ira.weiny@intel.com>
Link: https://lore.kernel.org/r/20231220-cxl-cper-v5-6-1bb8a4ca2c7a@intel.com
Reviewed-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
Acked-by: Ard Biesheuvel <ardb@kernel.org>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 drivers/cxl/core/mbox.c      | 32 +++++++++++++-------------------
 drivers/cxl/core/trace.h     |  8 ++++----
 include/linux/cxl-event.h    | 23 +++++++++++++++++------
 tools/testing/cxl/test/mem.c | 31 ++++++++++++++++++-------------
 4 files changed, 52 insertions(+), 42 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/cxl/core/mbox.c b/drivers/cxl/core/mbox.c
index 4c5161896826..06957696247b 100644
--- a/drivers/cxl/core/mbox.c
+++ b/drivers/cxl/core/mbox.c
@@ -840,26 +840,17 @@ static void cxl_event_trace_record(const struct cxl_memdev *cxlmd,
 				   enum cxl_event_log_type type,
 				   struct cxl_event_record_raw *record)
 {
+	union cxl_event *evt = &record->event;
 	uuid_t *id = &record->id;
 
-	if (uuid_equal(id, &CXL_EVENT_GEN_MEDIA_UUID)) {
-		struct cxl_event_gen_media *rec =
-				(struct cxl_event_gen_media *)record;
-
-		trace_cxl_general_media(cxlmd, type, rec);
-	} else if (uuid_equal(id, &CXL_EVENT_DRAM_UUID)) {
-		struct cxl_event_dram *rec = (struct cxl_event_dram *)record;
-
-		trace_cxl_dram(cxlmd, type, rec);
-	} else if (uuid_equal(id, &CXL_EVENT_MEM_MODULE_UUID)) {
-		struct cxl_event_mem_module *rec =
-				(struct cxl_event_mem_module *)record;
-
-		trace_cxl_memory_module(cxlmd, type, rec);
-	} else {
-		/* For unknown record types print just the header */
-		trace_cxl_generic_event(cxlmd, type, id, record);
-	}
+	if (uuid_equal(id, &CXL_EVENT_GEN_MEDIA_UUID))
+		trace_cxl_general_media(cxlmd, type, &evt->gen_media);
+	else if (uuid_equal(id, &CXL_EVENT_DRAM_UUID))
+		trace_cxl_dram(cxlmd, type, &evt->dram);
+	else if (uuid_equal(id, &CXL_EVENT_MEM_MODULE_UUID))
+		trace_cxl_memory_module(cxlmd, type, &evt->mem_module);
+	else
+		trace_cxl_generic_event(cxlmd, type, id, &evt->generic);
 }
 
 static int cxl_clear_event_record(struct cxl_memdev_state *mds,
@@ -902,7 +893,10 @@ static int cxl_clear_event_record(struct cxl_memdev_state *mds,
 	 */
 	i = 0;
 	for (cnt = 0; cnt < total; cnt++) {
-		payload->handles[i++] = get_pl->records[cnt].hdr.handle;
+		struct cxl_event_record_raw *raw = &get_pl->records[cnt];
+		struct cxl_event_generic *gen = &raw->event.generic;
+
+		payload->handles[i++] = gen->hdr.handle;
 		dev_dbg(mds->cxlds.dev, "Event log '%d': Clearing %u\n", log,
 			le16_to_cpu(payload->handles[i]));
 
diff --git a/drivers/cxl/core/trace.h b/drivers/cxl/core/trace.h
index 312cfa9e0004..89445435303a 100644
--- a/drivers/cxl/core/trace.h
+++ b/drivers/cxl/core/trace.h
@@ -225,9 +225,9 @@ TRACE_EVENT(cxl_overflow,
 TRACE_EVENT(cxl_generic_event,
 
 	TP_PROTO(const struct cxl_memdev *cxlmd, enum cxl_event_log_type log,
-		 const uuid_t *uuid, struct cxl_event_record_raw *rec),
+		 const uuid_t *uuid, struct cxl_event_generic *gen_rec),
 
-	TP_ARGS(cxlmd, log, uuid, rec),
+	TP_ARGS(cxlmd, log, uuid, gen_rec),
 
 	TP_STRUCT__entry(
 		CXL_EVT_TP_entry
@@ -235,9 +235,9 @@ TRACE_EVENT(cxl_generic_event,
 	),
 
 	TP_fast_assign(
-		CXL_EVT_TP_fast_assign(cxlmd, log, rec->hdr);
+		CXL_EVT_TP_fast_assign(cxlmd, log, gen_rec->hdr);
 		memcpy(&__entry->hdr_uuid, uuid, sizeof(uuid_t));
-		memcpy(__entry->data, &rec->data, CXL_EVENT_RECORD_DATA_LENGTH);
+		memcpy(__entry->data, gen_rec->data, CXL_EVENT_RECORD_DATA_LENGTH);
 	),
 
 	CXL_EVT_TP_printk("%s",
diff --git a/include/linux/cxl-event.h b/include/linux/cxl-event.h
index 3d9b5954d0c1..4d6c05f535f8 100644
--- a/include/linux/cxl-event.h
+++ b/include/linux/cxl-event.h
@@ -17,13 +17,8 @@ struct cxl_event_record_hdr {
 	u8 reserved[15];
 } __packed;
 
-/*
- * Common Event Record Format
- * CXL rev 3.0 section 8.2.9.2.1; Table 8-42
- */
 #define CXL_EVENT_RECORD_DATA_LENGTH 0x50
-struct cxl_event_record_raw {
-	uuid_t id;
+struct cxl_event_generic {
 	struct cxl_event_record_hdr hdr;
 	u8 data[CXL_EVENT_RECORD_DATA_LENGTH];
 } __packed;
@@ -96,4 +91,20 @@ struct cxl_event_mem_module {
 	u8 reserved[0x3d];
 } __packed;
 
+union cxl_event {
+	struct cxl_event_generic generic;
+	struct cxl_event_gen_media gen_media;
+	struct cxl_event_dram dram;
+	struct cxl_event_mem_module mem_module;
+} __packed;
+
+/*
+ * Common Event Record Format; in event logs
+ * CXL rev 3.0 section 8.2.9.2.1; Table 8-42
+ */
+struct cxl_event_record_raw {
+	uuid_t id;
+	union cxl_event event;
+} __packed;
+
 #endif /* _LINUX_CXL_EVENT_H */
diff --git a/tools/testing/cxl/test/mem.c b/tools/testing/cxl/test/mem.c
index 9cc2b8ce1efd..35ee41e435ab 100644
--- a/tools/testing/cxl/test/mem.c
+++ b/tools/testing/cxl/test/mem.c
@@ -251,7 +251,8 @@ static int mock_get_event(struct device *dev, struct cxl_mbox_cmd *cmd)
 	for (i = 0; i < CXL_TEST_EVENT_CNT && !event_log_empty(log); i++) {
 		memcpy(&pl->records[i], event_get_current(log),
 		       sizeof(pl->records[i]));
-		pl->records[i].hdr.handle = event_get_cur_event_handle(log);
+		pl->records[i].event.generic.hdr.handle =
+				event_get_cur_event_handle(log);
 		log->cur_idx++;
 	}
 
@@ -339,25 +340,29 @@ static void cxl_mock_event_trigger(struct device *dev)
 struct cxl_event_record_raw maint_needed = {
 	.id = UUID_INIT(0xBA5EBA11, 0xABCD, 0xEFEB,
 			0xa5, 0x5a, 0xa5, 0x5a, 0xa5, 0xa5, 0x5a, 0xa5),
-	.hdr = {
-		.length = sizeof(struct cxl_event_record_raw),
-		.flags[0] = CXL_EVENT_RECORD_FLAG_MAINT_NEEDED,
-		/* .handle = Set dynamically */
-		.related_handle = cpu_to_le16(0xa5b6),
+	.event.generic = {
+		.hdr = {
+			.length = sizeof(struct cxl_event_record_raw),
+			.flags[0] = CXL_EVENT_RECORD_FLAG_MAINT_NEEDED,
+			/* .handle = Set dynamically */
+			.related_handle = cpu_to_le16(0xa5b6),
+		},
+		.data = { 0xDE, 0xAD, 0xBE, 0xEF },
 	},
-	.data = { 0xDE, 0xAD, 0xBE, 0xEF },
 };
 
 struct cxl_event_record_raw hardware_replace = {
 	.id = UUID_INIT(0xABCDEFEB, 0xBA11, 0xBA5E,
 			0xa5, 0x5a, 0xa5, 0x5a, 0xa5, 0xa5, 0x5a, 0xa5),
-	.hdr = {
-		.length = sizeof(struct cxl_event_record_raw),
-		.flags[0] = CXL_EVENT_RECORD_FLAG_HW_REPLACE,
-		/* .handle = Set dynamically */
-		.related_handle = cpu_to_le16(0xb6a5),
+	.event.generic = {
+		.hdr = {
+			.length = sizeof(struct cxl_event_record_raw),
+			.flags[0] = CXL_EVENT_RECORD_FLAG_HW_REPLACE,
+			/* .handle = Set dynamically */
+			.related_handle = cpu_to_le16(0xb6a5),
+		},
+		.data = { 0xDE, 0xAD, 0xBE, 0xEF },
 	},
-	.data = { 0xDE, 0xAD, 0xBE, 0xEF },
 };
 
 struct cxl_test_gen_media {
-- 
cgit v1.2.3


From 671a794c33c6e048ca5cedd5ad6af44d52d5d7e5 Mon Sep 17 00:00:00 2001
From: Ira Weiny <ira.weiny@intel.com>
Date: Wed, 20 Dec 2023 16:17:34 -0800
Subject: acpi/ghes: Process CXL Component Events

BIOS can configure memory devices as firmware first.  This will send CXL
events to the firmware instead of the OS.  The firmware can then send
these events to the OS via UEFI.

UEFI v2.10 section N.2.14 defines a Common Platform Error Record (CPER)
format for CXL Component Events.  The format is mostly the same as the
CXL Common Event Record Format.  The difference is the use of a GUID in
the Section Type rather than a UUID as part of the event itself.

Add GHES support to detect CXL CPER records and call a registered
callback with the event.

A notifier chain was considered for the callback but the complexity did
not justify the use case as only the CXL subsystem requires this event.
Enforce that only one callback can be registered at any time.

Cc: Ard Biesheuvel <ardb@kernel.org>
Cc: Rafael J. Wysocki <rafael@kernel.org>
Signed-off-by: Ira Weiny <ira.weiny@intel.com>
Link: https://lore.kernel.org/r/20231220-cxl-cper-v5-7-1bb8a4ca2c7a@intel.com
[djbw: fixup checkpatch errors]
Reviewed-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
Acked-by: Ard Biesheuvel <ardb@kernel.org>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 drivers/acpi/apei/ghes.c  | 89 +++++++++++++++++++++++++++++++++++++++++++++++
 include/linux/cxl-event.h | 50 ++++++++++++++++++++++++++
 2 files changed, 139 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/acpi/apei/ghes.c b/drivers/acpi/apei/ghes.c
index 63ad0541db38..56a5d2ef9e0a 100644
--- a/drivers/acpi/apei/ghes.c
+++ b/drivers/acpi/apei/ghes.c
@@ -26,6 +26,7 @@
 #include <linux/interrupt.h>
 #include <linux/timer.h>
 #include <linux/cper.h>
+#include <linux/cxl-event.h>
 #include <linux/platform_device.h>
 #include <linux/mutex.h>
 #include <linux/ratelimit.h>
@@ -657,6 +658,78 @@ static void ghes_defer_non_standard_event(struct acpi_hest_generic_data *gdata,
 	schedule_work(&entry->work);
 }
 
+/*
+ * Only a single callback can be registered for CXL CPER events.
+ */
+static DECLARE_RWSEM(cxl_cper_rw_sem);
+static cxl_cper_callback cper_callback;
+
+/* CXL Event record UUIDs are formatted as GUIDs and reported in section type */
+
+/*
+ * General Media Event Record
+ * CXL rev 3.0 Section 8.2.9.2.1.1; Table 8-43
+ */
+#define CPER_SEC_CXL_GEN_MEDIA_GUID					\
+	GUID_INIT(0xfbcd0a77, 0xc260, 0x417f,				\
+		  0x85, 0xa9, 0x08, 0x8b, 0x16, 0x21, 0xeb, 0xa6)
+
+/*
+ * DRAM Event Record
+ * CXL rev 3.0 section 8.2.9.2.1.2; Table 8-44
+ */
+#define CPER_SEC_CXL_DRAM_GUID						\
+	GUID_INIT(0x601dcbb3, 0x9c06, 0x4eab,				\
+		  0xb8, 0xaf, 0x4e, 0x9b, 0xfb, 0x5c, 0x96, 0x24)
+
+/*
+ * Memory Module Event Record
+ * CXL rev 3.0 section 8.2.9.2.1.3; Table 8-45
+ */
+#define CPER_SEC_CXL_MEM_MODULE_GUID					\
+	GUID_INIT(0xfe927475, 0xdd59, 0x4339,				\
+		  0xa5, 0x86, 0x79, 0xba, 0xb1, 0x13, 0xb7, 0x74)
+
+static void cxl_cper_post_event(enum cxl_event_type event_type,
+				struct cxl_cper_event_rec *rec)
+{
+	if (rec->hdr.length <= sizeof(rec->hdr) ||
+	    rec->hdr.length > sizeof(*rec)) {
+		pr_err(FW_WARN "CXL CPER Invalid section length (%u)\n",
+		       rec->hdr.length);
+		return;
+	}
+
+	if (!(rec->hdr.validation_bits & CPER_CXL_COMP_EVENT_LOG_VALID)) {
+		pr_err(FW_WARN "CXL CPER invalid event\n");
+		return;
+	}
+
+	guard(rwsem_read)(&cxl_cper_rw_sem);
+	if (cper_callback)
+		cper_callback(event_type, rec);
+}
+
+int cxl_cper_register_callback(cxl_cper_callback callback)
+{
+	guard(rwsem_write)(&cxl_cper_rw_sem);
+	if (cper_callback)
+		return -EINVAL;
+	cper_callback = callback;
+	return 0;
+}
+EXPORT_SYMBOL_NS_GPL(cxl_cper_register_callback, CXL);
+
+int cxl_cper_unregister_callback(cxl_cper_callback callback)
+{
+	guard(rwsem_write)(&cxl_cper_rw_sem);
+	if (callback != cper_callback)
+		return -EINVAL;
+	cper_callback = NULL;
+	return 0;
+}
+EXPORT_SYMBOL_NS_GPL(cxl_cper_unregister_callback, CXL);
+
 static bool ghes_do_proc(struct ghes *ghes,
 			 const struct acpi_hest_generic_status *estatus)
 {
@@ -690,6 +763,22 @@ static bool ghes_do_proc(struct ghes *ghes,
 		}
 		else if (guid_equal(sec_type, &CPER_SEC_PROC_ARM)) {
 			queued = ghes_handle_arm_hw_error(gdata, sev);
+		} else if (guid_equal(sec_type, &CPER_SEC_CXL_GEN_MEDIA_GUID)) {
+			struct cxl_cper_event_rec *rec =
+				acpi_hest_get_payload(gdata);
+
+			cxl_cper_post_event(CXL_CPER_EVENT_GEN_MEDIA, rec);
+		} else if (guid_equal(sec_type, &CPER_SEC_CXL_DRAM_GUID)) {
+			struct cxl_cper_event_rec *rec =
+				acpi_hest_get_payload(gdata);
+
+			cxl_cper_post_event(CXL_CPER_EVENT_DRAM, rec);
+		} else if (guid_equal(sec_type,
+				      &CPER_SEC_CXL_MEM_MODULE_GUID)) {
+			struct cxl_cper_event_rec *rec =
+				acpi_hest_get_payload(gdata);
+
+			cxl_cper_post_event(CXL_CPER_EVENT_MEM_MODULE, rec);
 		} else {
 			void *err = acpi_hest_get_payload(gdata);
 
diff --git a/include/linux/cxl-event.h b/include/linux/cxl-event.h
index 4d6c05f535f8..95841750a383 100644
--- a/include/linux/cxl-event.h
+++ b/include/linux/cxl-event.h
@@ -107,4 +107,54 @@ struct cxl_event_record_raw {
 	union cxl_event event;
 } __packed;
 
+enum cxl_event_type {
+	CXL_CPER_EVENT_GEN_MEDIA,
+	CXL_CPER_EVENT_DRAM,
+	CXL_CPER_EVENT_MEM_MODULE,
+};
+
+#define CPER_CXL_DEVICE_ID_VALID		BIT(0)
+#define CPER_CXL_DEVICE_SN_VALID		BIT(1)
+#define CPER_CXL_COMP_EVENT_LOG_VALID		BIT(2)
+struct cxl_cper_event_rec {
+	struct {
+		u32 length;
+		u64 validation_bits;
+		struct cper_cxl_event_devid {
+			u16 vendor_id;
+			u16 device_id;
+			u8 func_num;
+			u8 device_num;
+			u8 bus_num;
+			u16 segment_num;
+			u16 slot_num; /* bits 2:0 reserved */
+			u8 reserved;
+		} __packed device_id;
+		struct cper_cxl_event_sn {
+			u32 lower_dw;
+			u32 upper_dw;
+		} __packed dev_serial_num;
+	} __packed hdr;
+
+	union cxl_event event;
+} __packed;
+
+typedef void (*cxl_cper_callback)(enum cxl_event_type type,
+				  struct cxl_cper_event_rec *rec);
+
+#ifdef CONFIG_ACPI_APEI_GHES
+int cxl_cper_register_callback(cxl_cper_callback callback);
+int cxl_cper_unregister_callback(cxl_cper_callback callback);
+#else
+static inline int cxl_cper_register_callback(cxl_cper_callback callback)
+{
+	return 0;
+}
+
+static inline int cxl_cper_unregister_callback(cxl_cper_callback callback)
+{
+	return 0;
+}
+#endif
+
 #endif /* _LINUX_CXL_EVENT_H */
-- 
cgit v1.2.3


From ced085ef369af7a2b6da962ec2fbd01339f60693 Mon Sep 17 00:00:00 2001
From: Ira Weiny <ira.weiny@intel.com>
Date: Wed, 20 Dec 2023 16:17:35 -0800
Subject: PCI: Introduce cleanup helpers for device reference counts and locks

The "goto error" pattern is notorious for introducing subtle resource
leaks. Use the new cleanup.h helpers for PCI device reference counts and
locks.

Similar to the new put_device() and device_lock() cleanup helpers,
__free(put_device) and guard(device), define the same for PCI devices,
__free(pci_dev_put) and guard(pci_dev).  These helpers eliminate the
need for "goto free;" and "goto unlock;" patterns. For example, A
'struct pci_dev *' instance declared as:

    struct pci_dev *pdev __free(pci_dev_put) = NULL;

...will automatically call pci_dev_put() if @pdev is non-NULL when @pdev
goes out of scope (automatic variable scope). If a function wants to
invoke pci_dev_put() on error, but return @pdev on success, it can do:

    return no_free_ptr(pdev);

...or:

    return_ptr(pdev);

For potential cleanup opportunity there are 587 open-coded calls to
pci_dev_put() in the kernel with 65 instances within 10 lines of a goto
statement with the CXL driver threatening to add another one.

The guard() helper holds the associated lock for the remainder of the
current scope in which it was invoked. So, for example:

    func(...)
    {
        if (...) {
            ...
            guard(pci_dev); /* pci_dev_lock() invoked here */
            ...
        } /* <- implied pci_dev_unlock() triggered here */
    }

There are 15 invocations of pci_dev_unlock() in the kernel with 5
instances within 10 lines of a goto statement. Again, the CXL driver is
threatening to add another.

Introduce these helpers to preclude the addition of new more error prone
goto put; / goto unlock; sequences. For now, these helpers are used in
drivers/cxl/pci.c to allow ACPI error reports to be fed back into the
CXL driver associated with the PCI device identified in the report.

Cc: Bjorn Helgaas <bhelgaas@google.com>
Signed-off-by: Ira Weiny <ira.weiny@intel.com>
Link: https://lore.kernel.org/r/20231220-cxl-cper-v5-8-1bb8a4ca2c7a@intel.com
[djbw: rewrite changelog]
Acked-by: Bjorn Helgaas <bhelgaas@google.com>
Reviewed-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
Acked-by: Ard Biesheuvel <ardb@kernel.org>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 include/linux/pci.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/pci.h b/include/linux/pci.h
index dea043bc1e38..0d23d2e0eb1a 100644
--- a/include/linux/pci.h
+++ b/include/linux/pci.h
@@ -1170,6 +1170,7 @@ int pci_get_interrupt_pin(struct pci_dev *dev, struct pci_dev **bridge);
 u8 pci_common_swizzle(struct pci_dev *dev, u8 *pinp);
 struct pci_dev *pci_dev_get(struct pci_dev *dev);
 void pci_dev_put(struct pci_dev *dev);
+DEFINE_FREE(pci_dev_put, struct pci_dev *, if (_T) pci_dev_put(_T))
 void pci_remove_bus(struct pci_bus *b);
 void pci_stop_and_remove_bus_device(struct pci_dev *dev);
 void pci_stop_and_remove_bus_device_locked(struct pci_dev *dev);
@@ -1874,6 +1875,7 @@ void pci_cfg_access_unlock(struct pci_dev *dev);
 void pci_dev_lock(struct pci_dev *dev);
 int pci_dev_trylock(struct pci_dev *dev);
 void pci_dev_unlock(struct pci_dev *dev);
+DEFINE_GUARD(pci_dev, struct pci_dev *, pci_dev_lock(_T), pci_dev_unlock(_T))
 
 /*
  * PCI domain support.  Sometimes called PCI segment (eg by ACPI),
-- 
cgit v1.2.3


From dc97f6344f205b0dfa144e1b3e16d6dc05383d57 Mon Sep 17 00:00:00 2001
From: Ira Weiny <ira.weiny@intel.com>
Date: Wed, 20 Dec 2023 16:17:36 -0800
Subject: cxl/pci: Register for and process CPER events

If the firmware has configured CXL event support to be firmware first
the OS can process those events through CPER records.  The CXL layer has
unique DPA to HPA knowledge and standard event trace parsing in place.

CPER records contain Bus, Device, Function information which can be used
to identify the PCI device which is sending the event.

Change the PCI driver registration to include registration of a CXL
CPER callback to process events through the trace subsystem.

Use new scoped based management to simplify the handling of the PCI
device object.

Tested-by: Smita-Koralahalli <Smita.KoralahalliChannabasappa@amd.com>
Reviewed-by: Smita-Koralahalli <Smita.KoralahalliChannabasappa@amd.com>
Link: https://lore.kernel.org/r/20231220-cxl-cper-v5-9-1bb8a4ca2c7a@intel.com
Signed-off-by: Ira Weiny <ira.weiny@intel.com>
[djbw: use new pci_dev guard, flip init order]
Reviewed-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
Acked-by: Ard Biesheuvel <ardb@kernel.org>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 drivers/cxl/core/mbox.c   | 40 ++++++++++++++++++++++----------
 drivers/cxl/cxlmem.h      |  4 ++++
 drivers/cxl/pci.c         | 58 ++++++++++++++++++++++++++++++++++++++++++++++-
 include/linux/cxl-event.h |  1 +
 4 files changed, 90 insertions(+), 13 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/cxl/core/mbox.c b/drivers/cxl/core/mbox.c
index 06957696247b..23021920aace 100644
--- a/drivers/cxl/core/mbox.c
+++ b/drivers/cxl/core/mbox.c
@@ -836,21 +836,37 @@ out:
 }
 EXPORT_SYMBOL_NS_GPL(cxl_enumerate_cmds, CXL);
 
-static void cxl_event_trace_record(const struct cxl_memdev *cxlmd,
-				   enum cxl_event_log_type type,
-				   struct cxl_event_record_raw *record)
+void cxl_event_trace_record(const struct cxl_memdev *cxlmd,
+			    enum cxl_event_log_type type,
+			    enum cxl_event_type event_type,
+			    const uuid_t *uuid, union cxl_event *evt)
 {
-	union cxl_event *evt = &record->event;
-	uuid_t *id = &record->id;
-
-	if (uuid_equal(id, &CXL_EVENT_GEN_MEDIA_UUID))
+	if (event_type == CXL_CPER_EVENT_GEN_MEDIA)
 		trace_cxl_general_media(cxlmd, type, &evt->gen_media);
-	else if (uuid_equal(id, &CXL_EVENT_DRAM_UUID))
+	else if (event_type == CXL_CPER_EVENT_DRAM)
 		trace_cxl_dram(cxlmd, type, &evt->dram);
-	else if (uuid_equal(id, &CXL_EVENT_MEM_MODULE_UUID))
+	else if (event_type == CXL_CPER_EVENT_MEM_MODULE)
 		trace_cxl_memory_module(cxlmd, type, &evt->mem_module);
 	else
-		trace_cxl_generic_event(cxlmd, type, id, &evt->generic);
+		trace_cxl_generic_event(cxlmd, type, uuid, &evt->generic);
+}
+EXPORT_SYMBOL_NS_GPL(cxl_event_trace_record, CXL);
+
+static void __cxl_event_trace_record(const struct cxl_memdev *cxlmd,
+				     enum cxl_event_log_type type,
+				     struct cxl_event_record_raw *record)
+{
+	enum cxl_event_type ev_type = CXL_CPER_EVENT_GENERIC;
+	const uuid_t *uuid = &record->id;
+
+	if (uuid_equal(uuid, &CXL_EVENT_GEN_MEDIA_UUID))
+		ev_type = CXL_CPER_EVENT_GEN_MEDIA;
+	else if (uuid_equal(uuid, &CXL_EVENT_DRAM_UUID))
+		ev_type = CXL_CPER_EVENT_DRAM;
+	else if (uuid_equal(uuid, &CXL_EVENT_MEM_MODULE_UUID))
+		ev_type = CXL_CPER_EVENT_MEM_MODULE;
+
+	cxl_event_trace_record(cxlmd, type, ev_type, uuid, &record->event);
 }
 
 static int cxl_clear_event_record(struct cxl_memdev_state *mds,
@@ -961,8 +977,8 @@ static void cxl_mem_get_records_log(struct cxl_memdev_state *mds,
 			break;
 
 		for (i = 0; i < nr_rec; i++)
-			cxl_event_trace_record(cxlmd, type,
-					       &payload->records[i]);
+			__cxl_event_trace_record(cxlmd, type,
+						 &payload->records[i]);
 
 		if (payload->flags & CXL_GET_EVENT_FLAG_OVERFLOW)
 			trace_cxl_overflow(cxlmd, type, payload);
diff --git a/drivers/cxl/cxlmem.h b/drivers/cxl/cxlmem.h
index 27575513ec68..3c201324a3b3 100644
--- a/drivers/cxl/cxlmem.h
+++ b/drivers/cxl/cxlmem.h
@@ -802,6 +802,10 @@ void set_exclusive_cxl_commands(struct cxl_memdev_state *mds,
 void clear_exclusive_cxl_commands(struct cxl_memdev_state *mds,
 				  unsigned long *cmds);
 void cxl_mem_get_event_records(struct cxl_memdev_state *mds, u32 status);
+void cxl_event_trace_record(const struct cxl_memdev *cxlmd,
+			    enum cxl_event_log_type type,
+			    enum cxl_event_type event_type,
+			    const uuid_t *uuid, union cxl_event *evt);
 int cxl_set_timestamp(struct cxl_memdev_state *mds);
 int cxl_poison_state_init(struct cxl_memdev_state *mds);
 int cxl_mem_get_poison(struct cxl_memdev *cxlmd, u64 offset, u64 len,
diff --git a/drivers/cxl/pci.c b/drivers/cxl/pci.c
index 0155fb66b580..4fd1f207c84e 100644
--- a/drivers/cxl/pci.c
+++ b/drivers/cxl/pci.c
@@ -1,5 +1,6 @@
 // SPDX-License-Identifier: GPL-2.0-only
 /* Copyright(c) 2020 Intel Corporation. All rights reserved. */
+#include <asm-generic/unaligned.h>
 #include <linux/io-64-nonatomic-lo-hi.h>
 #include <linux/moduleparam.h>
 #include <linux/module.h>
@@ -969,6 +970,61 @@ static struct pci_driver cxl_pci_driver = {
 	},
 };
 
+#define CXL_EVENT_HDR_FLAGS_REC_SEVERITY GENMASK(1, 0)
+static void cxl_cper_event_call(enum cxl_event_type ev_type,
+				struct cxl_cper_event_rec *rec)
+{
+	struct cper_cxl_event_devid *device_id = &rec->hdr.device_id;
+	struct pci_dev *pdev __free(pci_dev_put) = NULL;
+	enum cxl_event_log_type log_type;
+	struct cxl_dev_state *cxlds;
+	unsigned int devfn;
+	u32 hdr_flags;
+
+	devfn = PCI_DEVFN(device_id->device_num, device_id->func_num);
+	pdev = pci_get_domain_bus_and_slot(device_id->segment_num,
+					   device_id->bus_num, devfn);
+	if (!pdev)
+		return;
+
+	guard(pci_dev)(pdev);
+	if (pdev->driver != &cxl_pci_driver)
+		return;
+
+	cxlds = pci_get_drvdata(pdev);
+	if (!cxlds)
+		return;
+
+	/* Fabricate a log type */
+	hdr_flags = get_unaligned_le24(rec->event.generic.hdr.flags);
+	log_type = FIELD_GET(CXL_EVENT_HDR_FLAGS_REC_SEVERITY, hdr_flags);
+
+	cxl_event_trace_record(cxlds->cxlmd, log_type, ev_type,
+			       &uuid_null, &rec->event);
+}
+
+static int __init cxl_pci_driver_init(void)
+{
+	int rc;
+
+	rc = cxl_cper_register_callback(cxl_cper_event_call);
+	if (rc)
+		return rc;
+
+	rc = pci_register_driver(&cxl_pci_driver);
+	if (rc)
+		cxl_cper_unregister_callback(cxl_cper_event_call);
+
+	return rc;
+}
+
+static void __exit cxl_pci_driver_exit(void)
+{
+	pci_unregister_driver(&cxl_pci_driver);
+	cxl_cper_unregister_callback(cxl_cper_event_call);
+}
+
+module_init(cxl_pci_driver_init);
+module_exit(cxl_pci_driver_exit);
 MODULE_LICENSE("GPL v2");
-module_pci_driver(cxl_pci_driver);
 MODULE_IMPORT_NS(CXL);
diff --git a/include/linux/cxl-event.h b/include/linux/cxl-event.h
index 95841750a383..91125eca4c8a 100644
--- a/include/linux/cxl-event.h
+++ b/include/linux/cxl-event.h
@@ -108,6 +108,7 @@ struct cxl_event_record_raw {
 } __packed;
 
 enum cxl_event_type {
+	CXL_CPER_EVENT_GENERIC,
 	CXL_CPER_EVENT_GEN_MEDIA,
 	CXL_CPER_EVENT_DRAM,
 	CXL_CPER_EVENT_MEM_MODULE,
-- 
cgit v1.2.3


From ef067191f73cce3ee192e991ce486d95524655d5 Mon Sep 17 00:00:00 2001
From: Dragos Tatulea <dtatulea@nvidia.com>
Date: Mon, 25 Dec 2023 17:11:56 +0200
Subject: vdpa/mlx5: Expose resumable vq capability
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Necessary for checking if resumable vqs are supported by the hardware.
Actual support will be added in a downstream patch.

Reviewed-by: Gal Pressman <gal@nvidia.com>
Acked-by: Eugenio Pérez <eperezma@redhat.com>
Signed-off-by: Dragos Tatulea <dtatulea@nvidia.com>
Message-Id: <20231225151203.152687-2-dtatulea@nvidia.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
---
 include/linux/mlx5/mlx5_ifc.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h
index 6f3631425f38..9eaceaf6bcb0 100644
--- a/include/linux/mlx5/mlx5_ifc.h
+++ b/include/linux/mlx5/mlx5_ifc.h
@@ -1236,7 +1236,8 @@ struct mlx5_ifc_virtio_emulation_cap_bits {
 
 	u8	   reserved_at_c0[0x13];
 	u8         desc_group_mkey_supported[0x1];
-	u8         reserved_at_d4[0xc];
+	u8         freeze_to_rdy_supported[0x1];
+	u8         reserved_at_d5[0xb];
 
 	u8         reserved_at_e0[0x20];
 
-- 
cgit v1.2.3


From 9b23417825df470e4c9e98e7ed4b2c37465bfa1e Mon Sep 17 00:00:00 2001
From: Dragos Tatulea <dtatulea@nvidia.com>
Date: Mon, 25 Dec 2023 17:11:59 +0200
Subject: vdpa/mlx5: Mark vq addrs for modification in hw vq

Addresses get set by .set_vq_address. hw vq addresses will be updated on
next modify_virtqueue.

Reviewed-by: Gal Pressman <gal@nvidia.com>
Signed-off-by: Dragos Tatulea <dtatulea@nvidia.com>
Message-Id: <20231225151203.152687-5-dtatulea@nvidia.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
---
 drivers/vdpa/mlx5/net/mlx5_vnet.c  | 9 +++++++++
 include/linux/mlx5/mlx5_ifc_vdpa.h | 1 +
 2 files changed, 10 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/vdpa/mlx5/net/mlx5_vnet.c b/drivers/vdpa/mlx5/net/mlx5_vnet.c
index f8f088cced50..80e066de0866 100644
--- a/drivers/vdpa/mlx5/net/mlx5_vnet.c
+++ b/drivers/vdpa/mlx5/net/mlx5_vnet.c
@@ -1209,6 +1209,7 @@ static int modify_virtqueue(struct mlx5_vdpa_net *ndev,
 	bool state_change = false;
 	void *obj_context;
 	void *cmd_hdr;
+	void *vq_ctx;
 	void *in;
 	int err;
 
@@ -1230,6 +1231,7 @@ static int modify_virtqueue(struct mlx5_vdpa_net *ndev,
 	MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, uid, ndev->mvdev.res.uid);
 
 	obj_context = MLX5_ADDR_OF(modify_virtio_net_q_in, in, obj_context);
+	vq_ctx = MLX5_ADDR_OF(virtio_net_q_object, obj_context, virtio_q_context);
 
 	if (mvq->modified_fields & MLX5_VIRTQ_MODIFY_MASK_STATE) {
 		if (!is_valid_state_change(mvq->fw_state, state, is_resumable(ndev))) {
@@ -1241,6 +1243,12 @@ static int modify_virtqueue(struct mlx5_vdpa_net *ndev,
 		state_change = true;
 	}
 
+	if (mvq->modified_fields & MLX5_VIRTQ_MODIFY_MASK_VIRTIO_Q_ADDRS) {
+		MLX5_SET64(virtio_q, vq_ctx, desc_addr, mvq->desc_addr);
+		MLX5_SET64(virtio_q, vq_ctx, used_addr, mvq->device_addr);
+		MLX5_SET64(virtio_q, vq_ctx, available_addr, mvq->driver_addr);
+	}
+
 	MLX5_SET64(virtio_net_q_object, obj_context, modify_field_select, mvq->modified_fields);
 	err = mlx5_cmd_exec(ndev->mvdev.mdev, in, inlen, out, sizeof(out));
 	if (err)
@@ -2202,6 +2210,7 @@ static int mlx5_vdpa_set_vq_address(struct vdpa_device *vdev, u16 idx, u64 desc_
 	mvq->desc_addr = desc_area;
 	mvq->device_addr = device_area;
 	mvq->driver_addr = driver_area;
+	mvq->modified_fields |= MLX5_VIRTQ_MODIFY_MASK_VIRTIO_Q_ADDRS;
 	return 0;
 }
 
diff --git a/include/linux/mlx5/mlx5_ifc_vdpa.h b/include/linux/mlx5/mlx5_ifc_vdpa.h
index b86d51a855f6..9594ac405740 100644
--- a/include/linux/mlx5/mlx5_ifc_vdpa.h
+++ b/include/linux/mlx5/mlx5_ifc_vdpa.h
@@ -145,6 +145,7 @@ enum {
 	MLX5_VIRTQ_MODIFY_MASK_STATE                    = (u64)1 << 0,
 	MLX5_VIRTQ_MODIFY_MASK_DIRTY_BITMAP_PARAMS      = (u64)1 << 3,
 	MLX5_VIRTQ_MODIFY_MASK_DIRTY_BITMAP_DUMP_ENABLE = (u64)1 << 4,
+	MLX5_VIRTQ_MODIFY_MASK_VIRTIO_Q_ADDRS           = (u64)1 << 6,
 	MLX5_VIRTQ_MODIFY_MASK_DESC_GROUP_MKEY          = (u64)1 << 14,
 };
 
-- 
cgit v1.2.3


From 60c43b3f6b4eb5a3d672952a0d65991f414ea258 Mon Sep 17 00:00:00 2001
From: Dragos Tatulea <dtatulea@nvidia.com>
Date: Mon, 25 Dec 2023 17:12:00 +0200
Subject: vdpa/mlx5: Mark vq state for modification in hw vq

.set_vq_state will set the indices and mark the fields to be modified in
the hw vq.

Advertise that the device supports changing the vq state when the device
is in DRIVER_OK state and suspended.

Reviewed-by: Gal Pressman <gal@nvidia.com>
Signed-off-by: Dragos Tatulea <dtatulea@nvidia.com>
Acked-by: Jason Wang <jasowang@redhat.com>
Message-Id: <20231225151203.152687-6-dtatulea@nvidia.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
---
 drivers/vdpa/mlx5/net/mlx5_vnet.c  | 8 ++++++++
 include/linux/mlx5/mlx5_ifc_vdpa.h | 2 ++
 2 files changed, 10 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/vdpa/mlx5/net/mlx5_vnet.c b/drivers/vdpa/mlx5/net/mlx5_vnet.c
index 80e066de0866..d6c8506cec8f 100644
--- a/drivers/vdpa/mlx5/net/mlx5_vnet.c
+++ b/drivers/vdpa/mlx5/net/mlx5_vnet.c
@@ -1249,6 +1249,12 @@ static int modify_virtqueue(struct mlx5_vdpa_net *ndev,
 		MLX5_SET64(virtio_q, vq_ctx, available_addr, mvq->driver_addr);
 	}
 
+	if (mvq->modified_fields & MLX5_VIRTQ_MODIFY_MASK_VIRTIO_Q_AVAIL_IDX)
+		MLX5_SET(virtio_net_q_object, obj_context, hw_available_index, mvq->avail_idx);
+
+	if (mvq->modified_fields & MLX5_VIRTQ_MODIFY_MASK_VIRTIO_Q_USED_IDX)
+		MLX5_SET(virtio_net_q_object, obj_context, hw_used_index, mvq->used_idx);
+
 	MLX5_SET64(virtio_net_q_object, obj_context, modify_field_select, mvq->modified_fields);
 	err = mlx5_cmd_exec(ndev->mvdev.mdev, in, inlen, out, sizeof(out));
 	if (err)
@@ -2328,6 +2334,8 @@ static int mlx5_vdpa_set_vq_state(struct vdpa_device *vdev, u16 idx,
 
 	mvq->used_idx = state->split.avail_index;
 	mvq->avail_idx = state->split.avail_index;
+	mvq->modified_fields |= MLX5_VIRTQ_MODIFY_MASK_VIRTIO_Q_AVAIL_IDX |
+				MLX5_VIRTQ_MODIFY_MASK_VIRTIO_Q_USED_IDX;
 	return 0;
 }
 
diff --git a/include/linux/mlx5/mlx5_ifc_vdpa.h b/include/linux/mlx5/mlx5_ifc_vdpa.h
index 9594ac405740..32e712106e68 100644
--- a/include/linux/mlx5/mlx5_ifc_vdpa.h
+++ b/include/linux/mlx5/mlx5_ifc_vdpa.h
@@ -146,6 +146,8 @@ enum {
 	MLX5_VIRTQ_MODIFY_MASK_DIRTY_BITMAP_PARAMS      = (u64)1 << 3,
 	MLX5_VIRTQ_MODIFY_MASK_DIRTY_BITMAP_DUMP_ENABLE = (u64)1 << 4,
 	MLX5_VIRTQ_MODIFY_MASK_VIRTIO_Q_ADDRS           = (u64)1 << 6,
+	MLX5_VIRTQ_MODIFY_MASK_VIRTIO_Q_AVAIL_IDX       = (u64)1 << 7,
+	MLX5_VIRTQ_MODIFY_MASK_VIRTIO_Q_USED_IDX        = (u64)1 << 8,
 	MLX5_VIRTQ_MODIFY_MASK_DESC_GROUP_MKEY          = (u64)1 << 14,
 };
 
-- 
cgit v1.2.3


From f756dd3e2a4c704c0ab5ecb143ab71f1249af497 Mon Sep 17 00:00:00 2001
From: Dragos Tatulea <dtatulea@nvidia.com>
Date: Mon, 25 Dec 2023 17:12:01 +0200
Subject: vdpa/mlx5: Use vq suspend/resume during .set_map
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Instead of tearing down and setting up vq resources, use vq
suspend/resume during .set_map to speed things up a bit.

The vq mr is updated with the new mapping while the vqs are suspended.

If the device doesn't support resumable vqs, do the old teardown and
setup dance.

Reviewed-by: Gal Pressman <gal@nvidia.com>
Acked-by: Eugenio Pérez <eperezma@redhat.com>
Acked-by: Jason Wang <jasowang@redhat.com>
Signed-off-by: Dragos Tatulea <dtatulea@nvidia.com>
Message-Id: <20231225151203.152687-7-dtatulea@nvidia.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
---
 drivers/vdpa/mlx5/net/mlx5_vnet.c  | 46 +++++++++++++++++++++++++++++++-------
 include/linux/mlx5/mlx5_ifc_vdpa.h |  1 +
 2 files changed, 39 insertions(+), 8 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/vdpa/mlx5/net/mlx5_vnet.c b/drivers/vdpa/mlx5/net/mlx5_vnet.c
index d6c8506cec8f..6a21223d97a8 100644
--- a/drivers/vdpa/mlx5/net/mlx5_vnet.c
+++ b/drivers/vdpa/mlx5/net/mlx5_vnet.c
@@ -1206,6 +1206,7 @@ static int modify_virtqueue(struct mlx5_vdpa_net *ndev,
 {
 	int inlen = MLX5_ST_SZ_BYTES(modify_virtio_net_q_in);
 	u32 out[MLX5_ST_SZ_DW(modify_virtio_net_q_out)] = {};
+	struct mlx5_vdpa_dev *mvdev = &ndev->mvdev;
 	bool state_change = false;
 	void *obj_context;
 	void *cmd_hdr;
@@ -1255,6 +1256,24 @@ static int modify_virtqueue(struct mlx5_vdpa_net *ndev,
 	if (mvq->modified_fields & MLX5_VIRTQ_MODIFY_MASK_VIRTIO_Q_USED_IDX)
 		MLX5_SET(virtio_net_q_object, obj_context, hw_used_index, mvq->used_idx);
 
+	if (mvq->modified_fields & MLX5_VIRTQ_MODIFY_MASK_VIRTIO_Q_MKEY) {
+		struct mlx5_vdpa_mr *mr = mvdev->mr[mvdev->group2asid[MLX5_VDPA_DATAVQ_GROUP]];
+
+		if (mr)
+			MLX5_SET(virtio_q, vq_ctx, virtio_q_mkey, mr->mkey);
+		else
+			mvq->modified_fields &= ~MLX5_VIRTQ_MODIFY_MASK_VIRTIO_Q_MKEY;
+	}
+
+	if (mvq->modified_fields & MLX5_VIRTQ_MODIFY_MASK_DESC_GROUP_MKEY) {
+		struct mlx5_vdpa_mr *mr = mvdev->mr[mvdev->group2asid[MLX5_VDPA_DATAVQ_DESC_GROUP]];
+
+		if (mr && MLX5_CAP_DEV_VDPA_EMULATION(mvdev->mdev, desc_group_mkey_supported))
+			MLX5_SET(virtio_q, vq_ctx, desc_group_mkey, mr->mkey);
+		else
+			mvq->modified_fields &= ~MLX5_VIRTQ_MODIFY_MASK_DESC_GROUP_MKEY;
+	}
+
 	MLX5_SET64(virtio_net_q_object, obj_context, modify_field_select, mvq->modified_fields);
 	err = mlx5_cmd_exec(ndev->mvdev.mdev, in, inlen, out, sizeof(out));
 	if (err)
@@ -2784,24 +2803,35 @@ static int mlx5_vdpa_change_map(struct mlx5_vdpa_dev *mvdev,
 				unsigned int asid)
 {
 	struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev);
+	bool teardown = !is_resumable(ndev);
 	int err;
 
 	suspend_vqs(ndev);
-	err = save_channels_info(ndev);
-	if (err)
-		return err;
+	if (teardown) {
+		err = save_channels_info(ndev);
+		if (err)
+			return err;
 
-	teardown_driver(ndev);
+		teardown_driver(ndev);
+	}
 
 	mlx5_vdpa_update_mr(mvdev, new_mr, asid);
 
+	for (int i = 0; i < ndev->cur_num_vqs; i++)
+		ndev->vqs[i].modified_fields |= MLX5_VIRTQ_MODIFY_MASK_VIRTIO_Q_MKEY |
+						MLX5_VIRTQ_MODIFY_MASK_DESC_GROUP_MKEY;
+
 	if (!(mvdev->status & VIRTIO_CONFIG_S_DRIVER_OK) || mvdev->suspended)
 		return 0;
 
-	restore_channels_info(ndev);
-	err = setup_driver(mvdev);
-	if (err)
-		return err;
+	if (teardown) {
+		restore_channels_info(ndev);
+		err = setup_driver(mvdev);
+		if (err)
+			return err;
+	}
+
+	resume_vqs(ndev);
 
 	return 0;
 }
diff --git a/include/linux/mlx5/mlx5_ifc_vdpa.h b/include/linux/mlx5/mlx5_ifc_vdpa.h
index 32e712106e68..40371c916cf9 100644
--- a/include/linux/mlx5/mlx5_ifc_vdpa.h
+++ b/include/linux/mlx5/mlx5_ifc_vdpa.h
@@ -148,6 +148,7 @@ enum {
 	MLX5_VIRTQ_MODIFY_MASK_VIRTIO_Q_ADDRS           = (u64)1 << 6,
 	MLX5_VIRTQ_MODIFY_MASK_VIRTIO_Q_AVAIL_IDX       = (u64)1 << 7,
 	MLX5_VIRTQ_MODIFY_MASK_VIRTIO_Q_USED_IDX        = (u64)1 << 8,
+	MLX5_VIRTQ_MODIFY_MASK_VIRTIO_Q_MKEY            = (u64)1 << 11,
 	MLX5_VIRTQ_MODIFY_MASK_DESC_GROUP_MKEY          = (u64)1 << 14,
 };
 
-- 
cgit v1.2.3


From 3e999770ac1c7c31a70685dd5b88e89473509e9c Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rafael.j.wysocki@intel.com>
Date: Tue, 9 Jan 2024 17:59:22 +0100
Subject: PM: sleep: Restore asynchronous device resume optimization

Before commit 7839d0078e0d ("PM: sleep: Fix possible deadlocks in core
system-wide PM code"), the resume of devices that were allowed to resume
asynchronously was scheduled before starting the resume of the other
devices, so the former did not have to wait for the latter unless
functional dependencies were present.

Commit 7839d0078e0d removed that optimization in order to address a
correctness issue, but it can be restored with the help of a new device
power management flag, so do that now.

Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Reviewed-by: Stanislaw Gruszka <stanislaw.gruszka@linux.intel.com>
---
 drivers/base/power/main.c | 117 +++++++++++++++++++++++++---------------------
 include/linux/pm.h        |   1 +
 2 files changed, 65 insertions(+), 53 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/base/power/main.c b/drivers/base/power/main.c
index 9c5a5f4dba5a..fadcd0379dc2 100644
--- a/drivers/base/power/main.c
+++ b/drivers/base/power/main.c
@@ -579,7 +579,7 @@ bool dev_pm_skip_resume(struct device *dev)
 }
 
 /**
- * __device_resume_noirq - Execute a "noirq resume" callback for given device.
+ * device_resume_noirq - Execute a "noirq resume" callback for given device.
  * @dev: Device to handle.
  * @state: PM transition of the system being carried out.
  * @async: If true, the device is being resumed asynchronously.
@@ -587,7 +587,7 @@ bool dev_pm_skip_resume(struct device *dev)
  * The driver of @dev will not receive interrupts while this function is being
  * executed.
  */
-static void __device_resume_noirq(struct device *dev, pm_message_t state, bool async)
+static void device_resume_noirq(struct device *dev, pm_message_t state, bool async)
 {
 	pm_callback_t callback = NULL;
 	const char *info = NULL;
@@ -674,16 +674,22 @@ static bool dpm_async_fn(struct device *dev, async_func_t func)
 {
 	reinit_completion(&dev->power.completion);
 
-	if (!is_async(dev))
-		return false;
-
-	get_device(dev);
+	if (is_async(dev)) {
+		dev->power.async_in_progress = true;
 
-	if (async_schedule_dev_nocall(func, dev))
-		return true;
+		get_device(dev);
 
-	put_device(dev);
+		if (async_schedule_dev_nocall(func, dev))
+			return true;
 
+		put_device(dev);
+	}
+	/*
+	 * Because async_schedule_dev_nocall() above has returned false or it
+	 * has not been called at all, func() is not running and it is safe to
+	 * update the async_in_progress flag without extra synchronization.
+	 */
+	dev->power.async_in_progress = false;
 	return false;
 }
 
@@ -691,18 +697,10 @@ static void async_resume_noirq(void *data, async_cookie_t cookie)
 {
 	struct device *dev = data;
 
-	__device_resume_noirq(dev, pm_transition, true);
+	device_resume_noirq(dev, pm_transition, true);
 	put_device(dev);
 }
 
-static void device_resume_noirq(struct device *dev)
-{
-	if (dpm_async_fn(dev, async_resume_noirq))
-		return;
-
-	__device_resume_noirq(dev, pm_transition, false);
-}
-
 static void dpm_noirq_resume_devices(pm_message_t state)
 {
 	struct device *dev;
@@ -712,18 +710,28 @@ static void dpm_noirq_resume_devices(pm_message_t state)
 	mutex_lock(&dpm_list_mtx);
 	pm_transition = state;
 
+	/*
+	 * Trigger the resume of "async" devices upfront so they don't have to
+	 * wait for the "non-async" ones they don't depend on.
+	 */
+	list_for_each_entry(dev, &dpm_noirq_list, power.entry)
+		dpm_async_fn(dev, async_resume_noirq);
+
 	while (!list_empty(&dpm_noirq_list)) {
 		dev = to_device(dpm_noirq_list.next);
-		get_device(dev);
 		list_move_tail(&dev->power.entry, &dpm_late_early_list);
 
-		mutex_unlock(&dpm_list_mtx);
+		if (!dev->power.async_in_progress) {
+			get_device(dev);
 
-		device_resume_noirq(dev);
+			mutex_unlock(&dpm_list_mtx);
 
-		put_device(dev);
+			device_resume_noirq(dev, state, false);
 
-		mutex_lock(&dpm_list_mtx);
+			put_device(dev);
+
+			mutex_lock(&dpm_list_mtx);
+		}
 	}
 	mutex_unlock(&dpm_list_mtx);
 	async_synchronize_full();
@@ -747,14 +755,14 @@ void dpm_resume_noirq(pm_message_t state)
 }
 
 /**
- * __device_resume_early - Execute an "early resume" callback for given device.
+ * device_resume_early - Execute an "early resume" callback for given device.
  * @dev: Device to handle.
  * @state: PM transition of the system being carried out.
  * @async: If true, the device is being resumed asynchronously.
  *
  * Runtime PM is disabled for @dev while this function is being executed.
  */
-static void __device_resume_early(struct device *dev, pm_message_t state, bool async)
+static void device_resume_early(struct device *dev, pm_message_t state, bool async)
 {
 	pm_callback_t callback = NULL;
 	const char *info = NULL;
@@ -820,18 +828,10 @@ static void async_resume_early(void *data, async_cookie_t cookie)
 {
 	struct device *dev = data;
 
-	__device_resume_early(dev, pm_transition, true);
+	device_resume_early(dev, pm_transition, true);
 	put_device(dev);
 }
 
-static void device_resume_early(struct device *dev)
-{
-	if (dpm_async_fn(dev, async_resume_early))
-		return;
-
-	__device_resume_early(dev, pm_transition, false);
-}
-
 /**
  * dpm_resume_early - Execute "early resume" callbacks for all devices.
  * @state: PM transition of the system being carried out.
@@ -845,18 +845,28 @@ void dpm_resume_early(pm_message_t state)
 	mutex_lock(&dpm_list_mtx);
 	pm_transition = state;
 
+	/*
+	 * Trigger the resume of "async" devices upfront so they don't have to
+	 * wait for the "non-async" ones they don't depend on.
+	 */
+	list_for_each_entry(dev, &dpm_late_early_list, power.entry)
+		dpm_async_fn(dev, async_resume_early);
+
 	while (!list_empty(&dpm_late_early_list)) {
 		dev = to_device(dpm_late_early_list.next);
-		get_device(dev);
 		list_move_tail(&dev->power.entry, &dpm_suspended_list);
 
-		mutex_unlock(&dpm_list_mtx);
+		if (!dev->power.async_in_progress) {
+			get_device(dev);
 
-		device_resume_early(dev);
+			mutex_unlock(&dpm_list_mtx);
 
-		put_device(dev);
+			device_resume_early(dev, state, false);
 
-		mutex_lock(&dpm_list_mtx);
+			put_device(dev);
+
+			mutex_lock(&dpm_list_mtx);
+		}
 	}
 	mutex_unlock(&dpm_list_mtx);
 	async_synchronize_full();
@@ -876,12 +886,12 @@ void dpm_resume_start(pm_message_t state)
 EXPORT_SYMBOL_GPL(dpm_resume_start);
 
 /**
- * __device_resume - Execute "resume" callbacks for given device.
+ * device_resume - Execute "resume" callbacks for given device.
  * @dev: Device to handle.
  * @state: PM transition of the system being carried out.
  * @async: If true, the device is being resumed asynchronously.
  */
-static void __device_resume(struct device *dev, pm_message_t state, bool async)
+static void device_resume(struct device *dev, pm_message_t state, bool async)
 {
 	pm_callback_t callback = NULL;
 	const char *info = NULL;
@@ -975,18 +985,10 @@ static void async_resume(void *data, async_cookie_t cookie)
 {
 	struct device *dev = data;
 
-	__device_resume(dev, pm_transition, true);
+	device_resume(dev, pm_transition, true);
 	put_device(dev);
 }
 
-static void device_resume(struct device *dev)
-{
-	if (dpm_async_fn(dev, async_resume))
-		return;
-
-	__device_resume(dev, pm_transition, false);
-}
-
 /**
  * dpm_resume - Execute "resume" callbacks for non-sysdev devices.
  * @state: PM transition of the system being carried out.
@@ -1006,16 +1008,25 @@ void dpm_resume(pm_message_t state)
 	pm_transition = state;
 	async_error = 0;
 
+	/*
+	 * Trigger the resume of "async" devices upfront so they don't have to
+	 * wait for the "non-async" ones they don't depend on.
+	 */
+	list_for_each_entry(dev, &dpm_suspended_list, power.entry)
+		dpm_async_fn(dev, async_resume);
+
 	while (!list_empty(&dpm_suspended_list)) {
 		dev = to_device(dpm_suspended_list.next);
 
 		get_device(dev);
 
-		mutex_unlock(&dpm_list_mtx);
+		if (!dev->power.async_in_progress) {
+			mutex_unlock(&dpm_list_mtx);
 
-		device_resume(dev);
+			device_resume(dev, state, false);
 
-		mutex_lock(&dpm_list_mtx);
+			mutex_lock(&dpm_list_mtx);
+		}
 
 		if (!list_empty(&dev->power.entry))
 			list_move_tail(&dev->power.entry, &dpm_prepared_list);
diff --git a/include/linux/pm.h b/include/linux/pm.h
index 92a4f69de0e8..a2f3e53a8196 100644
--- a/include/linux/pm.h
+++ b/include/linux/pm.h
@@ -681,6 +681,7 @@ struct dev_pm_info {
 	bool			wakeup_path:1;
 	bool			syscore:1;
 	bool			no_pm_callbacks:1;	/* Owned by the PM core */
+	bool			async_in_progress:1;	/* Owned by the PM core */
 	unsigned int		must_resume:1;	/* Owned by the PM core */
 	unsigned int		may_skip_resume:1;	/* Set by subsystems */
 #else
-- 
cgit v1.2.3


From f35b88b66fbb5c90298ce3aa483b8a2cf1f39ad0 Mon Sep 17 00:00:00 2001
From: Lu Baolu <baolu.lu@linux.intel.com>
Date: Wed, 10 Jan 2024 20:10:08 -0800
Subject: iommu: Add cache_invalidate_user op

The updates of the PTEs in the nested page table will be propagated to the
hardware caches.

Add a new domain op cache_invalidate_user() for the userspace to flush the
hardware caches for a nested domain through iommufd. No wrapper for it, as
it's only supposed to be used by iommufd. Then, pass in invalidation
requests in form of a user data array containing a number of invalidation
data entries.

Link: https://lore.kernel.org/r/20240111041015.47920-2-yi.l.liu@intel.com
Reviewed-by: Kevin Tian <kevin.tian@intel.com>
Signed-off-by: Lu Baolu <baolu.lu@linux.intel.com>
Signed-off-by: Nicolin Chen <nicolinc@nvidia.com>
Signed-off-by: Yi Liu <yi.l.liu@intel.com>
Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
---
 include/linux/iommu.h | 26 ++++++++++++++++++++++++++
 1 file changed, 26 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/iommu.h b/include/linux/iommu.h
index 6291aa7b079b..93c0d12dd047 100644
--- a/include/linux/iommu.h
+++ b/include/linux/iommu.h
@@ -284,6 +284,23 @@ struct iommu_user_data {
 	size_t len;
 };
 
+/**
+ * struct iommu_user_data_array - iommu driver specific user space data array
+ * @type: The data type of all the entries in the user buffer array
+ * @uptr: Pointer to the user buffer array
+ * @entry_len: The fixed-width length of an entry in the array, in bytes
+ * @entry_num: The number of total entries in the array
+ *
+ * The user buffer includes an array of requests with format defined in
+ * include/uapi/linux/iommufd.h
+ */
+struct iommu_user_data_array {
+	unsigned int type;
+	void __user *uptr;
+	size_t entry_len;
+	u32 entry_num;
+};
+
 /**
  * __iommu_copy_struct_from_user - Copy iommu driver specific user space data
  * @dst_data: Pointer to an iommu driver specific user data that is defined in
@@ -440,6 +457,13 @@ struct iommu_ops {
  * @iotlb_sync_map: Sync mappings created recently using @map to the hardware
  * @iotlb_sync: Flush all queued ranges from the hardware TLBs and empty flush
  *            queue
+ * @cache_invalidate_user: Flush hardware cache for user space IO page table.
+ *                         The @domain must be IOMMU_DOMAIN_NESTED. The @array
+ *                         passes in the cache invalidation requests, in form
+ *                         of a driver data structure. The driver must update
+ *                         array->entry_num to report the number of handled
+ *                         invalidation requests. The driver data structure
+ *                         must be defined in include/uapi/linux/iommufd.h
  * @iova_to_phys: translate iova to physical address
  * @enforce_cache_coherency: Prevent any kind of DMA from bypassing IOMMU_CACHE,
  *                           including no-snoop TLPs on PCIe or other platform
@@ -465,6 +489,8 @@ struct iommu_domain_ops {
 			      size_t size);
 	void (*iotlb_sync)(struct iommu_domain *domain,
 			   struct iommu_iotlb_gather *iotlb_gather);
+	int (*cache_invalidate_user)(struct iommu_domain *domain,
+				     struct iommu_user_data_array *array);
 
 	phys_addr_t (*iova_to_phys)(struct iommu_domain *domain,
 				    dma_addr_t iova);
-- 
cgit v1.2.3


From 77785117f9c73fd71a440a5ac86dd80752967adc Mon Sep 17 00:00:00 2001
From: Nicolin Chen <nicolinc@nvidia.com>
Date: Wed, 10 Jan 2024 20:10:10 -0800
Subject: iommu: Add iommu_copy_struct_from_user_array helper

Wrap up the data pointer/num sanity and __iommu_copy_struct_from_user()
call for iommu drivers to copy driver specific data at a specific location
in the struct iommu_user_data_array.

And expect it to be used in cache_invalidate_user ops for example.

Link: https://lore.kernel.org/r/20240111041015.47920-4-yi.l.liu@intel.com
Reviewed-by: Kevin Tian <kevin.tian@intel.com>
Signed-off-by: Nicolin Chen <nicolinc@nvidia.com>
Co-developed-by: Yi Liu <yi.l.liu@intel.com>
Signed-off-by: Yi Liu <yi.l.liu@intel.com>
Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
---
 include/linux/iommu.h | 51 +++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 51 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/iommu.h b/include/linux/iommu.h
index 93c0d12dd047..9dbadf74a3a1 100644
--- a/include/linux/iommu.h
+++ b/include/linux/iommu.h
@@ -341,6 +341,57 @@ static inline int __iommu_copy_struct_from_user(
 				      sizeof(*kdst),                      \
 				      offsetofend(typeof(*kdst), min_last))
 
+/**
+ * __iommu_copy_struct_from_user_array - Copy iommu driver specific user space
+ *                                       data from an iommu_user_data_array
+ * @dst_data: Pointer to an iommu driver specific user data that is defined in
+ *            include/uapi/linux/iommufd.h
+ * @src_array: Pointer to a struct iommu_user_data_array for a user space array
+ * @data_type: The data type of the @dst_data. Must match with @src_array.type
+ * @index: Index to the location in the array to copy user data from
+ * @data_len: Length of current user data structure, i.e. sizeof(struct _dst)
+ * @min_len: Initial length of user data structure for backward compatibility.
+ *           This should be offsetofend using the last member in the user data
+ *           struct that was initially added to include/uapi/linux/iommufd.h
+ */
+static inline int __iommu_copy_struct_from_user_array(
+	void *dst_data, const struct iommu_user_data_array *src_array,
+	unsigned int data_type, unsigned int index, size_t data_len,
+	size_t min_len)
+{
+	struct iommu_user_data src_data;
+
+	if (WARN_ON(!src_array || index >= src_array->entry_num))
+		return -EINVAL;
+	if (!src_array->entry_num)
+		return -EINVAL;
+	src_data.uptr = src_array->uptr + src_array->entry_len * index;
+	src_data.len = src_array->entry_len;
+	src_data.type = src_array->type;
+
+	return __iommu_copy_struct_from_user(dst_data, &src_data, data_type,
+					     data_len, min_len);
+}
+
+/**
+ * iommu_copy_struct_from_user_array - Copy iommu driver specific user space
+ *                                     data from an iommu_user_data_array
+ * @kdst: Pointer to an iommu driver specific user data that is defined in
+ *        include/uapi/linux/iommufd.h
+ * @user_array: Pointer to a struct iommu_user_data_array for a user space
+ *              array
+ * @data_type: The data type of the @kdst. Must match with @user_array->type
+ * @index: Index to the location in the array to copy user data from
+ * @min_last: The last member of the data structure @kdst points in the
+ *            initial version.
+ * Return 0 for success, otherwise -error.
+ */
+#define iommu_copy_struct_from_user_array(kdst, user_array, data_type, index, \
+					  min_last)                           \
+	__iommu_copy_struct_from_user_array(                                  \
+		kdst, user_array, data_type, index, sizeof(*(kdst)),          \
+		offsetofend(typeof(*(kdst)), min_last))
+
 /**
  * struct iommu_ops - iommu ops and capabilities
  * @capable: check capability
-- 
cgit v1.2.3


From dee56ccb468a832074397fdbf22bbd9bf6d710aa Mon Sep 17 00:00:00 2001
From: Linus Walleij <linus.walleij@linaro.org>
Date: Tue, 12 Dec 2023 15:18:02 +0100
Subject: fbdev: amba-clcd: Delete the old CLCD driver

We have managed to ascertain that all users of the old FBDEV
code that are out of tree are now gone.

The new DRM driver can be found in drivers/gpu/drm/pl111/.

The remaining out of tree user was the ARM FVP emulation
platform, running Android. Thanks to changes in Android
versions 13 and 14, Android can now use the DRM driver when
being emulated under FVP. Some final patches are being put
in place to make it fully featured.

This is essentially a revert of the partial revert in
commit 112c35237c72 ("Partially revert "video: fbdev: amba-clcd: Retire elder CLCD driver"")

Signed-off-by: Linus Walleij <linus.walleij@linaro.org>
Signed-off-by: Helge Deller <deller@gmx.de>
---
 MAINTAINERS                     |   5 -
 drivers/video/fbdev/Kconfig     |  18 -
 drivers/video/fbdev/Makefile    |   1 -
 drivers/video/fbdev/amba-clcd.c | 984 ----------------------------------------
 include/linux/amba/clcd-regs.h  |  87 ----
 include/linux/amba/clcd.h       | 290 ------------
 6 files changed, 1385 deletions(-)
 delete mode 100644 drivers/video/fbdev/amba-clcd.c
 delete mode 100644 include/linux/amba/clcd-regs.h
 delete mode 100644 include/linux/amba/clcd.h

(limited to 'include/linux')

diff --git a/MAINTAINERS b/MAINTAINERS
index cc92b10a4cad..d3054a00c8e3 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -1684,11 +1684,6 @@ S:	Odd Fixes
 F:	drivers/amba/
 F:	include/linux/amba/bus.h
 
-ARM PRIMECELL CLCD PL110 DRIVER
-M:	Russell King <linux@armlinux.org.uk>
-S:	Odd Fixes
-F:	drivers/video/fbdev/amba-clcd.*
-
 ARM PRIMECELL KMI PL050 DRIVER
 M:	Russell King <linux@armlinux.org.uk>
 S:	Odd Fixes
diff --git a/drivers/video/fbdev/Kconfig b/drivers/video/fbdev/Kconfig
index b65f20006bd7..3cdd2d50f59b 100644
--- a/drivers/video/fbdev/Kconfig
+++ b/drivers/video/fbdev/Kconfig
@@ -116,24 +116,6 @@ config FB_PM2_FIFO_DISCONNECT
 	help
 	  Support the Permedia2 FIFO disconnect feature.
 
-config FB_ARMCLCD
-	tristate "ARM PrimeCell PL110 support"
-	depends on ARM || ARM64 || COMPILE_TEST
-	depends on FB && ARM_AMBA && HAS_IOMEM
-	select FB_IOMEM_HELPERS
-	select FB_MODE_HELPERS if OF
-	select VIDEOMODE_HELPERS if OF
-	select BACKLIGHT_CLASS_DEVICE if OF
-	help
-	  This framebuffer device driver is for the ARM PrimeCell PL110
-	  Colour LCD controller.  ARM PrimeCells provide the building
-	  blocks for System on a Chip devices.
-
-	  If you want to compile this as a module (=code which can be
-	  inserted into and removed from the running kernel), say M
-	  here and read <file:Documentation/kbuild/modules.rst>.  The module
-	  will be called amba-clcd.
-
 config FB_ACORN
 	bool "Acorn VIDC support"
 	depends on (FB = y) && ARM && ARCH_ACORN
diff --git a/drivers/video/fbdev/Makefile b/drivers/video/fbdev/Makefile
index 7cf7b0af1465..3eecd51267fa 100644
--- a/drivers/video/fbdev/Makefile
+++ b/drivers/video/fbdev/Makefile
@@ -74,7 +74,6 @@ obj-$(CONFIG_FB_HIT)              += hitfb.o
 obj-$(CONFIG_FB_ATMEL)		  += atmel_lcdfb.o
 obj-$(CONFIG_FB_PVR2)             += pvr2fb.o
 obj-$(CONFIG_FB_VOODOO1)          += sstfb.o
-obj-$(CONFIG_FB_ARMCLCD)	  += amba-clcd.o
 obj-$(CONFIG_FB_GOLDFISH)         += goldfishfb.o
 obj-$(CONFIG_FB_68328)            += 68328fb.o
 obj-$(CONFIG_FB_GBE)              += gbefb.o
diff --git a/drivers/video/fbdev/amba-clcd.c b/drivers/video/fbdev/amba-clcd.c
deleted file mode 100644
index 0399db369e70..000000000000
--- a/drivers/video/fbdev/amba-clcd.c
+++ /dev/null
@@ -1,984 +0,0 @@
-/*
- *  linux/drivers/video/amba-clcd.c
- *
- * Copyright (C) 2001 ARM Limited, by David A Rusling
- * Updated to 2.5, Deep Blue Solutions Ltd.
- *
- * This file is subject to the terms and conditions of the GNU General Public
- * License.  See the file COPYING in the main directory of this archive
- * for more details.
- *
- *  ARM PrimeCell PL110 Color LCD Controller
- */
-#include <linux/amba/bus.h>
-#include <linux/amba/clcd.h>
-#include <linux/backlight.h>
-#include <linux/clk.h>
-#include <linux/delay.h>
-#include <linux/dma-mapping.h>
-#include <linux/fb.h>
-#include <linux/init.h>
-#include <linux/ioport.h>
-#include <linux/list.h>
-#include <linux/mm.h>
-#include <linux/module.h>
-#include <linux/of_address.h>
-#include <linux/of_graph.h>
-#include <linux/slab.h>
-#include <linux/string.h>
-#include <video/display_timing.h>
-#include <video/of_display_timing.h>
-#include <video/videomode.h>
-
-#define to_clcd(info)	container_of(info, struct clcd_fb, fb)
-
-/* This is limited to 16 characters when displayed by X startup */
-static const char *clcd_name = "CLCD FB";
-
-static inline void clcdfb_set_start(struct clcd_fb *fb)
-{
-	unsigned long ustart = fb->fb.fix.smem_start;
-	unsigned long lstart;
-
-	ustart += fb->fb.var.yoffset * fb->fb.fix.line_length;
-	lstart = ustart + fb->fb.var.yres * fb->fb.fix.line_length / 2;
-
-	writel(ustart, fb->regs + CLCD_UBAS);
-	writel(lstart, fb->regs + CLCD_LBAS);
-}
-
-static void clcdfb_disable(struct clcd_fb *fb)
-{
-	u32 val;
-
-	if (fb->board->disable)
-		fb->board->disable(fb);
-
-	if (fb->panel->backlight) {
-		fb->panel->backlight->props.power = FB_BLANK_POWERDOWN;
-		backlight_update_status(fb->panel->backlight);
-	}
-
-	val = readl(fb->regs + fb->off_cntl);
-	if (val & CNTL_LCDPWR) {
-		val &= ~CNTL_LCDPWR;
-		writel(val, fb->regs + fb->off_cntl);
-
-		msleep(20);
-	}
-	if (val & CNTL_LCDEN) {
-		val &= ~CNTL_LCDEN;
-		writel(val, fb->regs + fb->off_cntl);
-	}
-
-	/*
-	 * Disable CLCD clock source.
-	 */
-	if (fb->clk_enabled) {
-		fb->clk_enabled = false;
-		clk_disable(fb->clk);
-	}
-}
-
-static void clcdfb_enable(struct clcd_fb *fb, u32 cntl)
-{
-	/*
-	 * Enable the CLCD clock source.
-	 */
-	if (!fb->clk_enabled) {
-		fb->clk_enabled = true;
-		clk_enable(fb->clk);
-	}
-
-	/*
-	 * Bring up by first enabling..
-	 */
-	cntl |= CNTL_LCDEN;
-	writel(cntl, fb->regs + fb->off_cntl);
-
-	msleep(20);
-
-	/*
-	 * and now apply power.
-	 */
-	cntl |= CNTL_LCDPWR;
-	writel(cntl, fb->regs + fb->off_cntl);
-
-	/*
-	 * Turn on backlight
-	 */
-	if (fb->panel->backlight) {
-		fb->panel->backlight->props.power = FB_BLANK_UNBLANK;
-		backlight_update_status(fb->panel->backlight);
-	}
-
-	/*
-	 * finally, enable the interface.
-	 */
-	if (fb->board->enable)
-		fb->board->enable(fb);
-}
-
-static int
-clcdfb_set_bitfields(struct clcd_fb *fb, struct fb_var_screeninfo *var)
-{
-	u32 caps;
-	int ret = 0;
-
-	if (fb->panel->caps && fb->board->caps)
-		caps = fb->panel->caps & fb->board->caps;
-	else {
-		/* Old way of specifying what can be used */
-		caps = fb->panel->cntl & CNTL_BGR ?
-			CLCD_CAP_BGR : CLCD_CAP_RGB;
-		/* But mask out 444 modes as they weren't supported */
-		caps &= ~CLCD_CAP_444;
-	}
-
-	/* Only TFT panels can do RGB888/BGR888 */
-	if (!(fb->panel->cntl & CNTL_LCDTFT))
-		caps &= ~CLCD_CAP_888;
-
-	memset(&var->transp, 0, sizeof(var->transp));
-
-	var->red.msb_right = 0;
-	var->green.msb_right = 0;
-	var->blue.msb_right = 0;
-
-	switch (var->bits_per_pixel) {
-	case 1:
-	case 2:
-	case 4:
-	case 8:
-		/* If we can't do 5551, reject */
-		caps &= CLCD_CAP_5551;
-		if (!caps) {
-			ret = -EINVAL;
-			break;
-		}
-
-		var->red.length		= var->bits_per_pixel;
-		var->red.offset		= 0;
-		var->green.length	= var->bits_per_pixel;
-		var->green.offset	= 0;
-		var->blue.length	= var->bits_per_pixel;
-		var->blue.offset	= 0;
-		break;
-
-	case 16:
-		/* If we can't do 444, 5551 or 565, reject */
-		if (!(caps & (CLCD_CAP_444 | CLCD_CAP_5551 | CLCD_CAP_565))) {
-			ret = -EINVAL;
-			break;
-		}
-
-		/*
-		 * Green length can be 4, 5 or 6 depending whether
-		 * we're operating in 444, 5551 or 565 mode.
-		 */
-		if (var->green.length == 4 && caps & CLCD_CAP_444)
-			caps &= CLCD_CAP_444;
-		if (var->green.length == 5 && caps & CLCD_CAP_5551)
-			caps &= CLCD_CAP_5551;
-		else if (var->green.length == 6 && caps & CLCD_CAP_565)
-			caps &= CLCD_CAP_565;
-		else {
-			/*
-			 * PL110 officially only supports RGB555,
-			 * but may be wired up to allow RGB565.
-			 */
-			if (caps & CLCD_CAP_565) {
-				var->green.length = 6;
-				caps &= CLCD_CAP_565;
-			} else if (caps & CLCD_CAP_5551) {
-				var->green.length = 5;
-				caps &= CLCD_CAP_5551;
-			} else {
-				var->green.length = 4;
-				caps &= CLCD_CAP_444;
-			}
-		}
-
-		if (var->green.length >= 5) {
-			var->red.length = 5;
-			var->blue.length = 5;
-		} else {
-			var->red.length = 4;
-			var->blue.length = 4;
-		}
-		break;
-	case 32:
-		/* If we can't do 888, reject */
-		caps &= CLCD_CAP_888;
-		if (!caps) {
-			ret = -EINVAL;
-			break;
-		}
-
-		var->red.length = 8;
-		var->green.length = 8;
-		var->blue.length = 8;
-		break;
-	default:
-		ret = -EINVAL;
-		break;
-	}
-
-	/*
-	 * >= 16bpp displays have separate colour component bitfields
-	 * encoded in the pixel data.  Calculate their position from
-	 * the bitfield length defined above.
-	 */
-	if (ret == 0 && var->bits_per_pixel >= 16) {
-		bool bgr, rgb;
-
-		bgr = caps & CLCD_CAP_BGR && var->blue.offset == 0;
-		rgb = caps & CLCD_CAP_RGB && var->red.offset == 0;
-
-		if (!bgr && !rgb)
-			/*
-			 * The requested format was not possible, try just
-			 * our capabilities.  One of BGR or RGB must be
-			 * supported.
-			 */
-			bgr = caps & CLCD_CAP_BGR;
-
-		if (bgr) {
-			var->blue.offset = 0;
-			var->green.offset = var->blue.offset + var->blue.length;
-			var->red.offset = var->green.offset + var->green.length;
-		} else {
-			var->red.offset = 0;
-			var->green.offset = var->red.offset + var->red.length;
-			var->blue.offset = var->green.offset + var->green.length;
-		}
-	}
-
-	return ret;
-}
-
-static int clcdfb_check_var(struct fb_var_screeninfo *var, struct fb_info *info)
-{
-	struct clcd_fb *fb = to_clcd(info);
-	int ret = -EINVAL;
-
-	if (fb->board->check)
-		ret = fb->board->check(fb, var);
-
-	if (ret == 0 &&
-	    var->xres_virtual * var->bits_per_pixel / 8 *
-	    var->yres_virtual > fb->fb.fix.smem_len)
-		ret = -EINVAL;
-
-	if (ret == 0)
-		ret = clcdfb_set_bitfields(fb, var);
-
-	return ret;
-}
-
-static int clcdfb_set_par(struct fb_info *info)
-{
-	struct clcd_fb *fb = to_clcd(info);
-	struct clcd_regs regs;
-
-	fb->fb.fix.line_length = fb->fb.var.xres_virtual *
-				 fb->fb.var.bits_per_pixel / 8;
-
-	if (fb->fb.var.bits_per_pixel <= 8)
-		fb->fb.fix.visual = FB_VISUAL_PSEUDOCOLOR;
-	else
-		fb->fb.fix.visual = FB_VISUAL_TRUECOLOR;
-
-	fb->board->decode(fb, &regs);
-
-	clcdfb_disable(fb);
-
-	writel(regs.tim0, fb->regs + CLCD_TIM0);
-	writel(regs.tim1, fb->regs + CLCD_TIM1);
-	writel(regs.tim2, fb->regs + CLCD_TIM2);
-	writel(regs.tim3, fb->regs + CLCD_TIM3);
-
-	clcdfb_set_start(fb);
-
-	clk_set_rate(fb->clk, (1000000000 / regs.pixclock) * 1000);
-
-	fb->clcd_cntl = regs.cntl;
-
-	clcdfb_enable(fb, regs.cntl);
-
-#ifdef DEBUG
-	printk(KERN_INFO
-	       "CLCD: Registers set to\n"
-	       "  %08x %08x %08x %08x\n"
-	       "  %08x %08x %08x %08x\n",
-		readl(fb->regs + CLCD_TIM0), readl(fb->regs + CLCD_TIM1),
-		readl(fb->regs + CLCD_TIM2), readl(fb->regs + CLCD_TIM3),
-		readl(fb->regs + CLCD_UBAS), readl(fb->regs + CLCD_LBAS),
-		readl(fb->regs + fb->off_ienb), readl(fb->regs + fb->off_cntl));
-#endif
-
-	return 0;
-}
-
-static inline u32 convert_bitfield(int val, struct fb_bitfield *bf)
-{
-	unsigned int mask = (1 << bf->length) - 1;
-
-	return (val >> (16 - bf->length) & mask) << bf->offset;
-}
-
-/*
- *  Set a single color register. The values supplied have a 16 bit
- *  magnitude.  Return != 0 for invalid regno.
- */
-static int
-clcdfb_setcolreg(unsigned int regno, unsigned int red, unsigned int green,
-		 unsigned int blue, unsigned int transp, struct fb_info *info)
-{
-	struct clcd_fb *fb = to_clcd(info);
-
-	if (regno < 16)
-		fb->cmap[regno] = convert_bitfield(transp, &fb->fb.var.transp) |
-				  convert_bitfield(blue, &fb->fb.var.blue) |
-				  convert_bitfield(green, &fb->fb.var.green) |
-				  convert_bitfield(red, &fb->fb.var.red);
-
-	if (fb->fb.fix.visual == FB_VISUAL_PSEUDOCOLOR && regno < 256) {
-		int hw_reg = CLCD_PALETTE + ((regno * 2) & ~3);
-		u32 val, mask, newval;
-
-		newval  = (red >> 11)  & 0x001f;
-		newval |= (green >> 6) & 0x03e0;
-		newval |= (blue >> 1)  & 0x7c00;
-
-		/*
-		 * 3.2.11: if we're configured for big endian
-		 * byte order, the palette entries are swapped.
-		 */
-		if (fb->clcd_cntl & CNTL_BEBO)
-			regno ^= 1;
-
-		if (regno & 1) {
-			newval <<= 16;
-			mask = 0x0000ffff;
-		} else {
-			mask = 0xffff0000;
-		}
-
-		val = readl(fb->regs + hw_reg) & mask;
-		writel(val | newval, fb->regs + hw_reg);
-	}
-
-	return regno > 255;
-}
-
-/*
- *  Blank the screen if blank_mode != 0, else unblank. If blank == NULL
- *  then the caller blanks by setting the CLUT (Color Look Up Table) to all
- *  black. Return 0 if blanking succeeded, != 0 if un-/blanking failed due
- *  to e.g. a video mode which doesn't support it. Implements VESA suspend
- *  and powerdown modes on hardware that supports disabling hsync/vsync:
- *    blank_mode == 2: suspend vsync
- *    blank_mode == 3: suspend hsync
- *    blank_mode == 4: powerdown
- */
-static int clcdfb_blank(int blank_mode, struct fb_info *info)
-{
-	struct clcd_fb *fb = to_clcd(info);
-
-	if (blank_mode != 0) {
-		clcdfb_disable(fb);
-	} else {
-		clcdfb_enable(fb, fb->clcd_cntl);
-	}
-	return 0;
-}
-
-static int clcdfb_mmap(struct fb_info *info,
-		       struct vm_area_struct *vma)
-{
-	struct clcd_fb *fb = to_clcd(info);
-	unsigned long len, off = vma->vm_pgoff << PAGE_SHIFT;
-	int ret = -EINVAL;
-
-	len = info->fix.smem_len;
-
-	if (off <= len && vma->vm_end - vma->vm_start <= len - off &&
-	    fb->board->mmap)
-		ret = fb->board->mmap(fb, vma);
-
-	return ret;
-}
-
-static const struct fb_ops clcdfb_ops = {
-	.owner		= THIS_MODULE,
-	__FB_DEFAULT_IOMEM_OPS_RDWR,
-	.fb_check_var	= clcdfb_check_var,
-	.fb_set_par	= clcdfb_set_par,
-	.fb_setcolreg	= clcdfb_setcolreg,
-	.fb_blank	= clcdfb_blank,
-	__FB_DEFAULT_IOMEM_OPS_DRAW,
-	.fb_mmap	= clcdfb_mmap,
-};
-
-static int clcdfb_register(struct clcd_fb *fb)
-{
-	int ret;
-
-	/*
-	 * ARM PL111 always has IENB at 0x1c; it's only PL110
-	 * which is reversed on some platforms.
-	 */
-	if (amba_manf(fb->dev) == 0x41 && amba_part(fb->dev) == 0x111) {
-		fb->off_ienb = CLCD_PL111_IENB;
-		fb->off_cntl = CLCD_PL111_CNTL;
-	} else {
-		fb->off_ienb = CLCD_PL110_IENB;
-		fb->off_cntl = CLCD_PL110_CNTL;
-	}
-
-	fb->clk = clk_get(&fb->dev->dev, NULL);
-	if (IS_ERR(fb->clk)) {
-		ret = PTR_ERR(fb->clk);
-		goto out;
-	}
-
-	ret = clk_prepare(fb->clk);
-	if (ret)
-		goto free_clk;
-
-	fb->fb.device		= &fb->dev->dev;
-
-	fb->fb.fix.mmio_start	= fb->dev->res.start;
-	fb->fb.fix.mmio_len	= resource_size(&fb->dev->res);
-
-	fb->regs = ioremap(fb->fb.fix.mmio_start, fb->fb.fix.mmio_len);
-	if (!fb->regs) {
-		printk(KERN_ERR "CLCD: unable to remap registers\n");
-		ret = -ENOMEM;
-		goto clk_unprep;
-	}
-
-	fb->fb.fbops		= &clcdfb_ops;
-	fb->fb.pseudo_palette	= fb->cmap;
-
-	strncpy(fb->fb.fix.id, clcd_name, sizeof(fb->fb.fix.id));
-	fb->fb.fix.type		= FB_TYPE_PACKED_PIXELS;
-	fb->fb.fix.type_aux	= 0;
-	fb->fb.fix.xpanstep	= 0;
-	fb->fb.fix.ypanstep	= 0;
-	fb->fb.fix.ywrapstep	= 0;
-	fb->fb.fix.accel	= FB_ACCEL_NONE;
-
-	fb->fb.var.xres		= fb->panel->mode.xres;
-	fb->fb.var.yres		= fb->panel->mode.yres;
-	fb->fb.var.xres_virtual	= fb->panel->mode.xres;
-	fb->fb.var.yres_virtual	= fb->panel->mode.yres;
-	fb->fb.var.bits_per_pixel = fb->panel->bpp;
-	fb->fb.var.grayscale	= fb->panel->grayscale;
-	fb->fb.var.pixclock	= fb->panel->mode.pixclock;
-	fb->fb.var.left_margin	= fb->panel->mode.left_margin;
-	fb->fb.var.right_margin	= fb->panel->mode.right_margin;
-	fb->fb.var.upper_margin	= fb->panel->mode.upper_margin;
-	fb->fb.var.lower_margin	= fb->panel->mode.lower_margin;
-	fb->fb.var.hsync_len	= fb->panel->mode.hsync_len;
-	fb->fb.var.vsync_len	= fb->panel->mode.vsync_len;
-	fb->fb.var.sync		= fb->panel->mode.sync;
-	fb->fb.var.vmode	= fb->panel->mode.vmode;
-	fb->fb.var.activate	= FB_ACTIVATE_NOW;
-	fb->fb.var.nonstd	= 0;
-	fb->fb.var.height	= fb->panel->height;
-	fb->fb.var.width	= fb->panel->width;
-	fb->fb.var.accel_flags	= 0;
-
-	fb->fb.monspecs.hfmin	= 0;
-	fb->fb.monspecs.hfmax   = 100000;
-	fb->fb.monspecs.vfmin	= 0;
-	fb->fb.monspecs.vfmax	= 400;
-	fb->fb.monspecs.dclkmin = 1000000;
-	fb->fb.monspecs.dclkmax	= 100000000;
-
-	/*
-	 * Make sure that the bitfields are set appropriately.
-	 */
-	clcdfb_set_bitfields(fb, &fb->fb.var);
-
-	/*
-	 * Allocate colourmap.
-	 */
-	ret = fb_alloc_cmap(&fb->fb.cmap, 256, 0);
-	if (ret)
-		goto unmap;
-
-	/*
-	 * Ensure interrupts are disabled.
-	 */
-	writel(0, fb->regs + fb->off_ienb);
-
-	fb_set_var(&fb->fb, &fb->fb.var);
-
-	dev_info(&fb->dev->dev, "%s hardware, %s display\n",
-	         fb->board->name, fb->panel->mode.name);
-
-	ret = register_framebuffer(&fb->fb);
-	if (ret == 0)
-		goto out;
-
-	printk(KERN_ERR "CLCD: cannot register framebuffer (%d)\n", ret);
-
-	fb_dealloc_cmap(&fb->fb.cmap);
- unmap:
-	iounmap(fb->regs);
- clk_unprep:
-	clk_unprepare(fb->clk);
- free_clk:
-	clk_put(fb->clk);
- out:
-	return ret;
-}
-
-#ifdef CONFIG_OF
-static int clcdfb_of_get_dpi_panel_mode(struct device_node *node,
-		struct clcd_panel *clcd_panel)
-{
-	int err;
-	struct display_timing timing;
-	struct videomode video;
-
-	err = of_get_display_timing(node, "panel-timing", &timing);
-	if (err) {
-		pr_err("%pOF: problems parsing panel-timing (%d)\n", node, err);
-		return err;
-	}
-
-	videomode_from_timing(&timing, &video);
-
-	err = fb_videomode_from_videomode(&video, &clcd_panel->mode);
-	if (err)
-		return err;
-
-	/* Set up some inversion flags */
-	if (timing.flags & DISPLAY_FLAGS_PIXDATA_NEGEDGE)
-		clcd_panel->tim2 |= TIM2_IPC;
-	else if (!(timing.flags & DISPLAY_FLAGS_PIXDATA_POSEDGE))
-		/*
-		 * To preserve backwards compatibility, the IPC (inverted
-		 * pixel clock) flag needs to be set on any display that
-		 * doesn't explicitly specify that the pixel clock is
-		 * active on the negative or positive edge.
-		 */
-		clcd_panel->tim2 |= TIM2_IPC;
-
-	if (timing.flags & DISPLAY_FLAGS_HSYNC_LOW)
-		clcd_panel->tim2 |= TIM2_IHS;
-
-	if (timing.flags & DISPLAY_FLAGS_VSYNC_LOW)
-		clcd_panel->tim2 |= TIM2_IVS;
-
-	if (timing.flags & DISPLAY_FLAGS_DE_LOW)
-		clcd_panel->tim2 |= TIM2_IOE;
-
-	return 0;
-}
-
-static int clcdfb_snprintf_mode(char *buf, int size, struct fb_videomode *mode)
-{
-	return snprintf(buf, size, "%ux%u@%u", mode->xres, mode->yres,
-			mode->refresh);
-}
-
-static int clcdfb_of_get_backlight(struct device *dev,
-				   struct clcd_panel *clcd_panel)
-{
-	struct backlight_device *backlight;
-
-	/* Look up the optional backlight device */
-	backlight = devm_of_find_backlight(dev);
-	if (IS_ERR(backlight))
-		return PTR_ERR(backlight);
-
-	clcd_panel->backlight = backlight;
-	return 0;
-}
-
-static int clcdfb_of_get_mode(struct device *dev, struct device_node *panel,
-			      struct clcd_panel *clcd_panel)
-{
-	int err;
-	struct fb_videomode *mode;
-	char *name;
-	int len;
-
-	/* Only directly connected DPI panels supported for now */
-	if (of_device_is_compatible(panel, "panel-dpi"))
-		err = clcdfb_of_get_dpi_panel_mode(panel, clcd_panel);
-	else
-		err = -ENOENT;
-	if (err)
-		return err;
-	mode = &clcd_panel->mode;
-
-	len = clcdfb_snprintf_mode(NULL, 0, mode);
-	name = devm_kzalloc(dev, len + 1, GFP_KERNEL);
-	if (!name)
-		return -ENOMEM;
-
-	clcdfb_snprintf_mode(name, len + 1, mode);
-	mode->name = name;
-
-	return 0;
-}
-
-static int clcdfb_of_init_tft_panel(struct clcd_fb *fb, u32 r0, u32 g0, u32 b0)
-{
-	static struct {
-		unsigned int part;
-		u32 r0, g0, b0;
-		u32 caps;
-	} panels[] = {
-		{ 0x110, 1,  7, 13, CLCD_CAP_5551 },
-		{ 0x110, 0,  8, 16, CLCD_CAP_888 },
-		{ 0x110, 16, 8, 0,  CLCD_CAP_888 },
-		{ 0x111, 4, 14, 20, CLCD_CAP_444 },
-		{ 0x111, 3, 11, 19, CLCD_CAP_444 | CLCD_CAP_5551 },
-		{ 0x111, 3, 10, 19, CLCD_CAP_444 | CLCD_CAP_5551 |
-				    CLCD_CAP_565 },
-		{ 0x111, 0,  8, 16, CLCD_CAP_444 | CLCD_CAP_5551 |
-				    CLCD_CAP_565 | CLCD_CAP_888 },
-	};
-	int i;
-
-	/* Bypass pixel clock divider */
-	fb->panel->tim2 |= TIM2_BCD;
-
-	/* TFT display, vert. comp. interrupt at the start of the back porch */
-	fb->panel->cntl |= CNTL_LCDTFT | CNTL_LCDVCOMP(1);
-
-	fb->panel->caps = 0;
-
-	/* Match the setup with known variants */
-	for (i = 0; i < ARRAY_SIZE(panels) && !fb->panel->caps; i++) {
-		if (amba_part(fb->dev) != panels[i].part)
-			continue;
-		if (g0 != panels[i].g0)
-			continue;
-		if (r0 == panels[i].r0 && b0 == panels[i].b0)
-			fb->panel->caps = panels[i].caps;
-	}
-
-	/*
-	 * If we actually physically connected the R lines to B and
-	 * vice versa
-	 */
-	if (r0 != 0 && b0 == 0)
-		fb->panel->bgr_connection = true;
-
-	return fb->panel->caps ? 0 : -EINVAL;
-}
-
-static int clcdfb_of_init_display(struct clcd_fb *fb)
-{
-	struct device_node *endpoint, *panel;
-	int err;
-	unsigned int bpp;
-	u32 max_bandwidth;
-	u32 tft_r0b0g0[3];
-
-	fb->panel = devm_kzalloc(&fb->dev->dev, sizeof(*fb->panel), GFP_KERNEL);
-	if (!fb->panel)
-		return -ENOMEM;
-
-	/*
-	 * Fetch the panel endpoint.
-	 */
-	endpoint = of_graph_get_next_endpoint(fb->dev->dev.of_node, NULL);
-	if (!endpoint)
-		return -ENODEV;
-
-	panel = of_graph_get_remote_port_parent(endpoint);
-	if (!panel) {
-		err = -ENODEV;
-		goto out_endpoint_put;
-	}
-
-	err = clcdfb_of_get_backlight(&fb->dev->dev, fb->panel);
-	if (err)
-		goto out_panel_put;
-
-	err = clcdfb_of_get_mode(&fb->dev->dev, panel, fb->panel);
-	if (err)
-		goto out_panel_put;
-
-	err = of_property_read_u32(fb->dev->dev.of_node, "max-memory-bandwidth",
-			&max_bandwidth);
-	if (!err) {
-		/*
-		 * max_bandwidth is in bytes per second and pixclock in
-		 * pico-seconds, so the maximum allowed bits per pixel is
-		 *   8 * max_bandwidth / (PICOS2KHZ(pixclock) * 1000)
-		 * Rearrange this calculation to avoid overflow and then ensure
-		 * result is a valid format.
-		 */
-		bpp = max_bandwidth / (1000 / 8)
-			/ PICOS2KHZ(fb->panel->mode.pixclock);
-		bpp = rounddown_pow_of_two(bpp);
-		if (bpp > 32)
-			bpp = 32;
-	} else
-		bpp = 32;
-	fb->panel->bpp = bpp;
-
-#ifdef CONFIG_CPU_BIG_ENDIAN
-	fb->panel->cntl |= CNTL_BEBO;
-#endif
-	fb->panel->width = -1;
-	fb->panel->height = -1;
-
-	if (of_property_read_u32_array(endpoint,
-			"arm,pl11x,tft-r0g0b0-pads",
-			tft_r0b0g0, ARRAY_SIZE(tft_r0b0g0)) != 0) {
-		err = -ENOENT;
-		goto out_panel_put;
-	}
-
-	of_node_put(panel);
-	of_node_put(endpoint);
-
-	return clcdfb_of_init_tft_panel(fb, tft_r0b0g0[0],
-					tft_r0b0g0[1],  tft_r0b0g0[2]);
-out_panel_put:
-	of_node_put(panel);
-out_endpoint_put:
-	of_node_put(endpoint);
-	return err;
-}
-
-static int clcdfb_of_vram_setup(struct clcd_fb *fb)
-{
-	int err;
-	struct device_node *memory;
-	u64 size;
-
-	err = clcdfb_of_init_display(fb);
-	if (err)
-		return err;
-
-	memory = of_parse_phandle(fb->dev->dev.of_node, "memory-region", 0);
-	if (!memory)
-		return -ENODEV;
-
-	fb->fb.screen_base = of_iomap(memory, 0);
-	if (!fb->fb.screen_base) {
-		of_node_put(memory);
-		return -ENOMEM;
-	}
-
-	fb->fb.fix.smem_start = of_translate_address(memory,
-			of_get_address(memory, 0, &size, NULL));
-	fb->fb.fix.smem_len = size;
-	of_node_put(memory);
-
-	return 0;
-}
-
-static int clcdfb_of_vram_mmap(struct clcd_fb *fb, struct vm_area_struct *vma)
-{
-	unsigned long off, user_size, kernel_size;
-
-
-	off = vma->vm_pgoff << PAGE_SHIFT;
-	user_size = vma->vm_end - vma->vm_start;
-	kernel_size = fb->fb.fix.smem_len;
-
-	if (off >= kernel_size || user_size > (kernel_size - off))
-		return -ENXIO;
-
-	return remap_pfn_range(vma, vma->vm_start,
-			__phys_to_pfn(fb->fb.fix.smem_start) + vma->vm_pgoff,
-			user_size,
-			pgprot_writecombine(vma->vm_page_prot));
-}
-
-static void clcdfb_of_vram_remove(struct clcd_fb *fb)
-{
-	iounmap(fb->fb.screen_base);
-}
-
-static int clcdfb_of_dma_setup(struct clcd_fb *fb)
-{
-	unsigned long framesize;
-	dma_addr_t dma;
-	int err;
-
-	err = clcdfb_of_init_display(fb);
-	if (err)
-		return err;
-
-	framesize = PAGE_ALIGN(fb->panel->mode.xres * fb->panel->mode.yres *
-			fb->panel->bpp / 8);
-	fb->fb.screen_base = dma_alloc_coherent(&fb->dev->dev, framesize,
-			&dma, GFP_KERNEL);
-	if (!fb->fb.screen_base)
-		return -ENOMEM;
-
-	fb->fb.fix.smem_start = dma;
-	fb->fb.fix.smem_len = framesize;
-
-	return 0;
-}
-
-static int clcdfb_of_dma_mmap(struct clcd_fb *fb, struct vm_area_struct *vma)
-{
-	return dma_mmap_wc(&fb->dev->dev, vma, fb->fb.screen_base,
-			   fb->fb.fix.smem_start, fb->fb.fix.smem_len);
-}
-
-static void clcdfb_of_dma_remove(struct clcd_fb *fb)
-{
-	dma_free_coherent(&fb->dev->dev, fb->fb.fix.smem_len,
-			fb->fb.screen_base, fb->fb.fix.smem_start);
-}
-
-static struct clcd_board *clcdfb_of_get_board(struct amba_device *dev)
-{
-	struct clcd_board *board = devm_kzalloc(&dev->dev, sizeof(*board),
-			GFP_KERNEL);
-	struct device_node *node = dev->dev.of_node;
-
-	if (!board)
-		return NULL;
-
-	board->name = of_node_full_name(node);
-	board->caps = CLCD_CAP_ALL;
-	board->check = clcdfb_check;
-	board->decode = clcdfb_decode;
-	if (of_property_present(node, "memory-region")) {
-		board->setup = clcdfb_of_vram_setup;
-		board->mmap = clcdfb_of_vram_mmap;
-		board->remove = clcdfb_of_vram_remove;
-	} else {
-		board->setup = clcdfb_of_dma_setup;
-		board->mmap = clcdfb_of_dma_mmap;
-		board->remove = clcdfb_of_dma_remove;
-	}
-
-	return board;
-}
-#else
-static struct clcd_board *clcdfb_of_get_board(struct amba_device *dev)
-{
-	return NULL;
-}
-#endif
-
-static int clcdfb_probe(struct amba_device *dev, const struct amba_id *id)
-{
-	struct clcd_board *board = dev_get_platdata(&dev->dev);
-	struct clcd_fb *fb;
-	int ret;
-
-	if (!board)
-		board = clcdfb_of_get_board(dev);
-
-	if (!board)
-		return -EINVAL;
-
-	ret = dma_set_mask_and_coherent(&dev->dev, DMA_BIT_MASK(32));
-	if (ret)
-		goto out;
-
-	ret = amba_request_regions(dev, NULL);
-	if (ret) {
-		printk(KERN_ERR "CLCD: unable to reserve regs region\n");
-		goto out;
-	}
-
-	fb = kzalloc(sizeof(*fb), GFP_KERNEL);
-	if (!fb) {
-		ret = -ENOMEM;
-		goto free_region;
-	}
-
-	fb->dev = dev;
-	fb->board = board;
-
-	dev_info(&fb->dev->dev, "PL%03x designer %02x rev%u at 0x%08llx\n",
-		amba_part(dev), amba_manf(dev), amba_rev(dev),
-		(unsigned long long)dev->res.start);
-
-	ret = fb->board->setup(fb);
-	if (ret)
-		goto free_fb;
-
-	ret = clcdfb_register(fb);
-	if (ret == 0) {
-		amba_set_drvdata(dev, fb);
-		goto out;
-	}
-
-	fb->board->remove(fb);
- free_fb:
-	kfree(fb);
- free_region:
-	amba_release_regions(dev);
- out:
-	return ret;
-}
-
-static void clcdfb_remove(struct amba_device *dev)
-{
-	struct clcd_fb *fb = amba_get_drvdata(dev);
-
-	clcdfb_disable(fb);
-	unregister_framebuffer(&fb->fb);
-	if (fb->fb.cmap.len)
-		fb_dealloc_cmap(&fb->fb.cmap);
-	iounmap(fb->regs);
-	clk_unprepare(fb->clk);
-	clk_put(fb->clk);
-
-	fb->board->remove(fb);
-
-	kfree(fb);
-
-	amba_release_regions(dev);
-}
-
-static const struct amba_id clcdfb_id_table[] = {
-	{
-		.id	= 0x00041110,
-		.mask	= 0x000ffffe,
-	},
-	{ 0, 0 },
-};
-
-MODULE_DEVICE_TABLE(amba, clcdfb_id_table);
-
-static struct amba_driver clcd_driver = {
-	.drv 		= {
-		.name	= "clcd-pl11x",
-	},
-	.probe		= clcdfb_probe,
-	.remove		= clcdfb_remove,
-	.id_table	= clcdfb_id_table,
-};
-
-static int __init amba_clcdfb_init(void)
-{
-	if (fb_get_options("ambafb", NULL))
-		return -ENODEV;
-
-	return amba_driver_register(&clcd_driver);
-}
-
-module_init(amba_clcdfb_init);
-
-static void __exit amba_clcdfb_exit(void)
-{
-	amba_driver_unregister(&clcd_driver);
-}
-
-module_exit(amba_clcdfb_exit);
-
-MODULE_DESCRIPTION("ARM PrimeCell PL110 CLCD core driver");
-MODULE_LICENSE("GPL");
diff --git a/include/linux/amba/clcd-regs.h b/include/linux/amba/clcd-regs.h
deleted file mode 100644
index 421b0fa90d6a..000000000000
--- a/include/linux/amba/clcd-regs.h
+++ /dev/null
@@ -1,87 +0,0 @@
-/*
- * David A Rusling
- *
- * Copyright (C) 2001 ARM Limited
- *
- * This file is subject to the terms and conditions of the GNU General Public
- * License.  See the file COPYING in the main directory of this archive
- * for more details.
- */
-
-#ifndef AMBA_CLCD_REGS_H
-#define AMBA_CLCD_REGS_H
-
-/*
- * CLCD Controller Internal Register addresses
- */
-#define CLCD_TIM0		0x00000000
-#define CLCD_TIM1 		0x00000004
-#define CLCD_TIM2 		0x00000008
-#define CLCD_TIM3 		0x0000000c
-#define CLCD_UBAS 		0x00000010
-#define CLCD_LBAS 		0x00000014
-
-#define CLCD_PL110_IENB		0x00000018
-#define CLCD_PL110_CNTL		0x0000001c
-#define CLCD_PL110_STAT		0x00000020
-#define CLCD_PL110_INTR 	0x00000024
-#define CLCD_PL110_UCUR		0x00000028
-#define CLCD_PL110_LCUR		0x0000002C
-
-#define CLCD_PL111_CNTL		0x00000018
-#define CLCD_PL111_IENB		0x0000001c
-#define CLCD_PL111_RIS		0x00000020
-#define CLCD_PL111_MIS		0x00000024
-#define CLCD_PL111_ICR		0x00000028
-#define CLCD_PL111_UCUR		0x0000002c
-#define CLCD_PL111_LCUR		0x00000030
-
-#define CLCD_PALL 		0x00000200
-#define CLCD_PALETTE		0x00000200
-
-#define TIM2_PCD_LO_MASK	GENMASK(4, 0)
-#define TIM2_PCD_LO_BITS	5
-#define TIM2_CLKSEL		(1 << 5)
-#define TIM2_ACB_MASK		GENMASK(10, 6)
-#define TIM2_IVS		(1 << 11)
-#define TIM2_IHS		(1 << 12)
-#define TIM2_IPC		(1 << 13)
-#define TIM2_IOE		(1 << 14)
-#define TIM2_BCD		(1 << 26)
-#define TIM2_PCD_HI_MASK	GENMASK(31, 27)
-#define TIM2_PCD_HI_BITS	5
-#define TIM2_PCD_HI_SHIFT	27
-
-#define CNTL_LCDEN		(1 << 0)
-#define CNTL_LCDBPP1		(0 << 1)
-#define CNTL_LCDBPP2		(1 << 1)
-#define CNTL_LCDBPP4		(2 << 1)
-#define CNTL_LCDBPP8		(3 << 1)
-#define CNTL_LCDBPP16		(4 << 1)
-#define CNTL_LCDBPP16_565	(6 << 1)
-#define CNTL_LCDBPP16_444	(7 << 1)
-#define CNTL_LCDBPP24		(5 << 1)
-#define CNTL_LCDBW		(1 << 4)
-#define CNTL_LCDTFT		(1 << 5)
-#define CNTL_LCDMONO8		(1 << 6)
-#define CNTL_LCDDUAL		(1 << 7)
-#define CNTL_BGR		(1 << 8)
-#define CNTL_BEBO		(1 << 9)
-#define CNTL_BEPO		(1 << 10)
-#define CNTL_LCDPWR		(1 << 11)
-#define CNTL_LCDVCOMP(x)	((x) << 12)
-#define CNTL_LDMAFIFOTIME	(1 << 15)
-#define CNTL_WATERMARK		(1 << 16)
-
-/* ST Microelectronics variant bits */
-#define CNTL_ST_1XBPP_444	0x0
-#define CNTL_ST_1XBPP_5551	(1 << 17)
-#define CNTL_ST_1XBPP_565	(1 << 18)
-#define CNTL_ST_CDWID_12	0x0
-#define CNTL_ST_CDWID_16	(1 << 19)
-#define CNTL_ST_CDWID_18	(1 << 20)
-#define CNTL_ST_CDWID_24	((1 << 19)|(1 << 20))
-#define CNTL_ST_CEAEN		(1 << 21)
-#define CNTL_ST_LCDBPP24_PACKED	(6 << 1)
-
-#endif /* AMBA_CLCD_REGS_H */
diff --git a/include/linux/amba/clcd.h b/include/linux/amba/clcd.h
deleted file mode 100644
index b6e0cbeaf533..000000000000
--- a/include/linux/amba/clcd.h
+++ /dev/null
@@ -1,290 +0,0 @@
-/*
- * linux/include/asm-arm/hardware/amba_clcd.h -- Integrator LCD panel.
- *
- * David A Rusling
- *
- * Copyright (C) 2001 ARM Limited
- *
- * This file is subject to the terms and conditions of the GNU General Public
- * License.  See the file COPYING in the main directory of this archive
- * for more details.
- */
-#include <linux/fb.h>
-#include <linux/amba/clcd-regs.h>
-
-enum {
-	/* individual formats */
-	CLCD_CAP_RGB444		= (1 << 0),
-	CLCD_CAP_RGB5551	= (1 << 1),
-	CLCD_CAP_RGB565		= (1 << 2),
-	CLCD_CAP_RGB888		= (1 << 3),
-	CLCD_CAP_BGR444		= (1 << 4),
-	CLCD_CAP_BGR5551	= (1 << 5),
-	CLCD_CAP_BGR565		= (1 << 6),
-	CLCD_CAP_BGR888		= (1 << 7),
-
-	/* connection layouts */
-	CLCD_CAP_444		= CLCD_CAP_RGB444 | CLCD_CAP_BGR444,
-	CLCD_CAP_5551		= CLCD_CAP_RGB5551 | CLCD_CAP_BGR5551,
-	CLCD_CAP_565		= CLCD_CAP_RGB565 | CLCD_CAP_BGR565,
-	CLCD_CAP_888		= CLCD_CAP_RGB888 | CLCD_CAP_BGR888,
-
-	/* red/blue ordering */
-	CLCD_CAP_RGB		= CLCD_CAP_RGB444 | CLCD_CAP_RGB5551 |
-				  CLCD_CAP_RGB565 | CLCD_CAP_RGB888,
-	CLCD_CAP_BGR		= CLCD_CAP_BGR444 | CLCD_CAP_BGR5551 |
-				  CLCD_CAP_BGR565 | CLCD_CAP_BGR888,
-
-	CLCD_CAP_ALL		= CLCD_CAP_BGR | CLCD_CAP_RGB,
-};
-
-struct backlight_device;
-
-struct clcd_panel {
-	struct fb_videomode	mode;
-	signed short		width;	/* width in mm */
-	signed short		height;	/* height in mm */
-	u32			tim2;
-	u32			tim3;
-	u32			cntl;
-	u32			caps;
-	unsigned int		bpp:8,
-				fixedtimings:1,
-				grayscale:1;
-	unsigned int		connector;
-	struct backlight_device	*backlight;
-	/*
-	 * If the B/R lines are switched between the CLCD
-	 * and the panel we need to know this and not try to
-	 * compensate with the BGR bit in the control register.
-	 */
-	bool			bgr_connection;
-};
-
-struct clcd_regs {
-	u32			tim0;
-	u32			tim1;
-	u32			tim2;
-	u32			tim3;
-	u32			cntl;
-	unsigned long		pixclock;
-};
-
-struct clcd_fb;
-
-/*
- * the board-type specific routines
- */
-struct clcd_board {
-	const char *name;
-
-	/*
-	 * Optional.  Hardware capability flags.
-	 */
-	u32	caps;
-
-	/*
-	 * Optional.  Check whether the var structure is acceptable
-	 * for this display.
-	 */
-	int	(*check)(struct clcd_fb *fb, struct fb_var_screeninfo *var);
-
-	/*
-	 * Compulsory.  Decode fb->fb.var into regs->*.  In the case of
-	 * fixed timing, set regs->* to the register values required.
-	 */
-	void	(*decode)(struct clcd_fb *fb, struct clcd_regs *regs);
-
-	/*
-	 * Optional.  Disable any extra display hardware.
-	 */
-	void	(*disable)(struct clcd_fb *);
-
-	/*
-	 * Optional.  Enable any extra display hardware.
-	 */
-	void	(*enable)(struct clcd_fb *);
-
-	/*
-	 * Setup platform specific parts of CLCD driver
-	 */
-	int	(*setup)(struct clcd_fb *);
-
-	/*
-	 * mmap the framebuffer memory
-	 */
-	int	(*mmap)(struct clcd_fb *, struct vm_area_struct *);
-
-	/*
-	 * Remove platform specific parts of CLCD driver
-	 */
-	void	(*remove)(struct clcd_fb *);
-};
-
-struct amba_device;
-struct clk;
-
-/* this data structure describes each frame buffer device we find */
-struct clcd_fb {
-	struct fb_info		fb;
-	struct amba_device	*dev;
-	struct clk		*clk;
-	struct clcd_panel	*panel;
-	struct clcd_board	*board;
-	void			*board_data;
-	void __iomem		*regs;
-	u16			off_ienb;
-	u16			off_cntl;
-	u32			clcd_cntl;
-	u32			cmap[16];
-	bool			clk_enabled;
-};
-
-static inline void clcdfb_decode(struct clcd_fb *fb, struct clcd_regs *regs)
-{
-	struct fb_var_screeninfo *var = &fb->fb.var;
-	u32 val, cpl;
-
-	/*
-	 * Program the CLCD controller registers and start the CLCD
-	 */
-	val = ((var->xres / 16) - 1) << 2;
-	val |= (var->hsync_len - 1) << 8;
-	val |= (var->right_margin - 1) << 16;
-	val |= (var->left_margin - 1) << 24;
-	regs->tim0 = val;
-
-	val = var->yres;
-	if (fb->panel->cntl & CNTL_LCDDUAL)
-		val /= 2;
-	val -= 1;
-	val |= (var->vsync_len - 1) << 10;
-	val |= var->lower_margin << 16;
-	val |= var->upper_margin << 24;
-	regs->tim1 = val;
-
-	val = fb->panel->tim2;
-	val |= var->sync & FB_SYNC_HOR_HIGH_ACT  ? 0 : TIM2_IHS;
-	val |= var->sync & FB_SYNC_VERT_HIGH_ACT ? 0 : TIM2_IVS;
-
-	cpl = var->xres_virtual;
-	if (fb->panel->cntl & CNTL_LCDTFT)	  /* TFT */
-		/* / 1 */;
-	else if (!var->grayscale)		  /* STN color */
-		cpl = cpl * 8 / 3;
-	else if (fb->panel->cntl & CNTL_LCDMONO8) /* STN monochrome, 8bit */
-		cpl /= 8;
-	else					  /* STN monochrome, 4bit */
-		cpl /= 4;
-
-	regs->tim2 = val | ((cpl - 1) << 16);
-
-	regs->tim3 = fb->panel->tim3;
-
-	val = fb->panel->cntl;
-	if (var->grayscale)
-		val |= CNTL_LCDBW;
-
-	if (fb->panel->caps && fb->board->caps && var->bits_per_pixel >= 16) {
-		/*
-		 * if board and panel supply capabilities, we can support
-		 * changing BGR/RGB depending on supplied parameters. Here
-		 * we switch to what the framebuffer is providing if need
-		 * be, so if the framebuffer is BGR but the display connection
-		 * is RGB (first case) we switch it around. Vice versa mutatis
-		 * mutandis if the framebuffer is RGB but the display connection
-		 * is BGR, we flip it around.
-		 */
-		if (var->red.offset == 0)
-			val &= ~CNTL_BGR;
-		else
-			val |= CNTL_BGR;
-		if (fb->panel->bgr_connection)
-			val ^= CNTL_BGR;
-	}
-
-	switch (var->bits_per_pixel) {
-	case 1:
-		val |= CNTL_LCDBPP1;
-		break;
-	case 2:
-		val |= CNTL_LCDBPP2;
-		break;
-	case 4:
-		val |= CNTL_LCDBPP4;
-		break;
-	case 8:
-		val |= CNTL_LCDBPP8;
-		break;
-	case 16:
-		/*
-		 * PL110 cannot choose between 5551 and 565 modes in its
-		 * control register.  It is possible to use 565 with
-		 * custom external wiring.
-		 */
-		if (amba_part(fb->dev) == 0x110 ||
-		    var->green.length == 5)
-			val |= CNTL_LCDBPP16;
-		else if (var->green.length == 6)
-			val |= CNTL_LCDBPP16_565;
-		else
-			val |= CNTL_LCDBPP16_444;
-		break;
-	case 32:
-		val |= CNTL_LCDBPP24;
-		break;
-	}
-
-	regs->cntl = val;
-	regs->pixclock = var->pixclock;
-}
-
-static inline int clcdfb_check(struct clcd_fb *fb, struct fb_var_screeninfo *var)
-{
-	var->xres_virtual = var->xres = (var->xres + 15) & ~15;
-	var->yres_virtual = var->yres = (var->yres + 1) & ~1;
-
-#define CHECK(e,l,h) (var->e < l || var->e > h)
-	if (CHECK(right_margin, (5+1), 256) ||	/* back porch */
-	    CHECK(left_margin, (5+1), 256) ||	/* front porch */
-	    CHECK(hsync_len, (5+1), 256) ||
-	    var->xres > 4096 ||
-	    var->lower_margin > 255 ||		/* back porch */
-	    var->upper_margin > 255 ||		/* front porch */
-	    var->vsync_len > 32 ||
-	    var->yres > 1024)
-		return -EINVAL;
-#undef CHECK
-
-	/* single panel mode: PCD = max(PCD, 1) */
-	/* dual panel mode: PCD = max(PCD, 5) */
-
-	/*
-	 * You can't change the grayscale setting, and
-	 * we can only do non-interlaced video.
-	 */
-	if (var->grayscale != fb->fb.var.grayscale ||
-	    (var->vmode & FB_VMODE_MASK) != FB_VMODE_NONINTERLACED)
-		return -EINVAL;
-
-#define CHECK(e) (var->e != fb->fb.var.e)
-	if (fb->panel->fixedtimings &&
-	    (CHECK(xres)		||
-	     CHECK(yres)		||
-	     CHECK(bits_per_pixel)	||
-	     CHECK(pixclock)		||
-	     CHECK(left_margin)		||
-	     CHECK(right_margin)	||
-	     CHECK(upper_margin)	||
-	     CHECK(lower_margin)	||
-	     CHECK(hsync_len)		||
-	     CHECK(vsync_len)		||
-	     CHECK(sync)))
-		return -EINVAL;
-#undef CHECK
-
-	var->nonstd = 0;
-	var->accel_flags = 0;
-
-	return 0;
-}
-- 
cgit v1.2.3


From 7ea26f9460c6c76b1d6e36f39fce34b16cb88300 Mon Sep 17 00:00:00 2001
From: Amir Goldstein <amir73il@gmail.com>
Date: Tue, 9 Jan 2024 20:22:45 +0200
Subject: fsnotify: compile out fsnotify permission hooks if
 !FANOTIFY_ACCESS_PERMISSIONS

The depency of FANOTIFY_ACCESS_PERMISSIONS on SECURITY made sure that
the fsnotify permission hooks were never called when SECURITY was
disabled.

Moving the fsnotify permission hook out of the secutiy hook broke that
optimisation.

Reported-and-tested-by: Jens Axboe <axboe@kernel.dk>
Closes: https://lore.kernel.org/linux-fsdevel/53682ece-f0e7-48de-9a1c-879ee34b0449@kernel.dk/
Fixes: d9e5d31084b0 ("fsnotify: optionally pass access range in file permission hooks")
Signed-off-by: Amir Goldstein <amir73il@gmail.com>
Link: https://lore.kernel.org/r/20240109182245.38884-1-amir73il@gmail.com
Reviewed-by: Jan Kara <jack@suse.cz>
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 include/linux/fsnotify.h | 19 +++++++++++++++++++
 1 file changed, 19 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/fsnotify.h b/include/linux/fsnotify.h
index 11e6434b8e71..8300a5286988 100644
--- a/include/linux/fsnotify.h
+++ b/include/linux/fsnotify.h
@@ -100,6 +100,7 @@ static inline int fsnotify_file(struct file *file, __u32 mask)
 	return fsnotify_parent(path->dentry, mask, path, FSNOTIFY_EVENT_PATH);
 }
 
+#ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS
 /*
  * fsnotify_file_area_perm - permission hook before access to file range
  */
@@ -145,6 +146,24 @@ static inline int fsnotify_open_perm(struct file *file)
 	return fsnotify_file(file, FS_OPEN_PERM);
 }
 
+#else
+static inline int fsnotify_file_area_perm(struct file *file, int perm_mask,
+					  const loff_t *ppos, size_t count)
+{
+	return 0;
+}
+
+static inline int fsnotify_file_perm(struct file *file, int perm_mask)
+{
+	return 0;
+}
+
+static inline int fsnotify_open_perm(struct file *file)
+{
+	return 0;
+}
+#endif
+
 /*
  * fsnotify_link_count - inode's link count changed
  */
-- 
cgit v1.2.3


From 755113d7678681a137c330f7997ceb680adb644e Mon Sep 17 00:00:00 2001
From: Daniel Lezcano <daniel.lezcano@linaro.org>
Date: Tue, 9 Jan 2024 10:41:11 +0100
Subject: thermal/debugfs: Add thermal cooling device debugfs information

The thermal framework does not have any debug information except a
sysfs stat which is a bit controversial. This one allocates big chunks
of memory for every cooling devices with a high number of states and
could represent on some systems in production several megabytes of
memory for just a portion of it. As the sysfs is limited to a page
size, the output is not exploitable with large data array and gets
truncated.

The patch provides the same information than sysfs except the
transitions are dynamically allocated, thus they won't show more
events than the ones which actually occurred. There is no longer a
size limitation and it opens the field for more debugging information
where the debugfs is designed for, not sysfs.

The thermal debugfs directory structure tries to stay consistent with
the sysfs one but in a very simplified way:

thermal/
 -- cooling_devices
    |-- 0
    |   |-- clear
    |   |-- time_in_state_ms
    |   |-- total_trans
    |   `-- trans_table
    |-- 1
    |   |-- clear
    |   |-- time_in_state_ms
    |   |-- total_trans
    |   `-- trans_table
    |-- 2
    |   |-- clear
    |   |-- time_in_state_ms
    |   |-- total_trans
    |   `-- trans_table
    |-- 3
    |   |-- clear
    |   |-- time_in_state_ms
    |   |-- total_trans
    |   `-- trans_table
    `-- 4
        |-- clear
        |-- time_in_state_ms
        |-- total_trans
        `-- trans_table

The content of the files in the cooling devices directory is the same
as the sysfs one except for the trans_table which has the following
format:

Transition	Hits
1->0      	246
0->1      	246
2->1      	632
1->2      	632
3->2      	98
2->3      	98

Signed-off-by: Daniel Lezcano <daniel.lezcano@linaro.org>
[ rjw: White space fixups, rebase ]
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/thermal/Kconfig           |   7 +
 drivers/thermal/Makefile          |   2 +
 drivers/thermal/thermal_core.c    |   6 +
 drivers/thermal/thermal_core.h    |   1 +
 drivers/thermal/thermal_debugfs.c | 446 ++++++++++++++++++++++++++++++++++++++
 drivers/thermal/thermal_debugfs.h |  14 ++
 drivers/thermal/thermal_helpers.c |  20 +-
 include/linux/thermal.h           |   7 +
 8 files changed, 497 insertions(+), 6 deletions(-)
 create mode 100644 drivers/thermal/thermal_debugfs.c
 create mode 100644 drivers/thermal/thermal_debugfs.h

(limited to 'include/linux')

diff --git a/drivers/thermal/Kconfig b/drivers/thermal/Kconfig
index c81a00fbca7d..3ff7add3fb7c 100644
--- a/drivers/thermal/Kconfig
+++ b/drivers/thermal/Kconfig
@@ -33,6 +33,13 @@ config THERMAL_STATISTICS
 
 	  If in doubt, say N.
 
+config THERMAL_DEBUGFS
+	bool "Thermal subsystem debug support"
+	depends on DEBUG_FS
+	help
+	  Say Y to allow the thermal subsystem to collect diagnostic
+	  information that can be accessed via debugfs.
+
 config THERMAL_EMERGENCY_POWEROFF_DELAY_MS
 	int "Emergency poweroff delay in milli-seconds"
 	default 0
diff --git a/drivers/thermal/Makefile b/drivers/thermal/Makefile
index c934cab309ae..0f65ae86a9c6 100644
--- a/drivers/thermal/Makefile
+++ b/drivers/thermal/Makefile
@@ -10,6 +10,8 @@ thermal_sys-y			+= thermal_trip.o thermal_helpers.o
 # netlink interface to manage the thermal framework
 thermal_sys-$(CONFIG_THERMAL_NETLINK)		+= thermal_netlink.o
 
+thermal_sys-$(CONFIG_THERMAL_DEBUGFS)	+= thermal_debugfs.o
+
 # interface to/from other layers providing sensors
 thermal_sys-$(CONFIG_THERMAL_HWMON)		+= thermal_hwmon.o
 thermal_sys-$(CONFIG_THERMAL_OF)		+= thermal_of.o
diff --git a/drivers/thermal/thermal_core.c b/drivers/thermal/thermal_core.c
index c0b012e075f3..5d9cc422d7ba 100644
--- a/drivers/thermal/thermal_core.c
+++ b/drivers/thermal/thermal_core.c
@@ -960,6 +960,8 @@ __thermal_cooling_device_register(struct device_node *np,
 
 	mutex_unlock(&thermal_list_lock);
 
+	thermal_debug_cdev_add(cdev);
+
 	return cdev;
 
 out_cooling_dev:
@@ -1166,6 +1168,8 @@ void thermal_cooling_device_unregister(struct thermal_cooling_device *cdev)
 	if (!cdev)
 		return;
 
+	thermal_debug_cdev_remove(cdev);
+
 	mutex_lock(&thermal_list_lock);
 
 	if (!thermal_cooling_device_present(cdev)) {
@@ -1629,6 +1633,8 @@ static int __init thermal_init(void)
 {
 	int result;
 
+	thermal_debug_init();
+
 	result = thermal_netlink_init();
 	if (result)
 		goto error;
diff --git a/drivers/thermal/thermal_core.h b/drivers/thermal/thermal_core.h
index 4e023d54fd27..e9c099ecdd0f 100644
--- a/drivers/thermal/thermal_core.h
+++ b/drivers/thermal/thermal_core.h
@@ -13,6 +13,7 @@
 #include <linux/thermal.h>
 
 #include "thermal_netlink.h"
+#include "thermal_debugfs.h"
 
 /* Default Thermal Governor */
 #if defined(CONFIG_THERMAL_DEFAULT_GOV_STEP_WISE)
diff --git a/drivers/thermal/thermal_debugfs.c b/drivers/thermal/thermal_debugfs.c
new file mode 100644
index 000000000000..b18bdde47847
--- /dev/null
+++ b/drivers/thermal/thermal_debugfs.c
@@ -0,0 +1,446 @@
+
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright 2023 Linaro Limited
+ *
+ * Author: Daniel Lezcano <daniel.lezcano@linaro.org>
+ *
+ * Thermal subsystem debug support
+ */
+#include <linux/debugfs.h>
+#include <linux/ktime.h>
+#include <linux/list.h>
+#include <linux/minmax.h>
+#include <linux/mutex.h>
+#include <linux/thermal.h>
+
+static struct dentry *d_root;
+static struct dentry *d_cdev;
+
+/*
+ * Length of the string containing the thermal zone id or the cooling
+ * device id, including the ending nul character. We can reasonably
+ * assume there won't be more than 256 thermal zones as the maximum
+ * observed today is around 32.
+ */
+#define IDSLENGTH 4
+
+/*
+ * The cooling device transition list is stored in a hash table where
+ * the size is CDEVSTATS_HASH_SIZE. The majority of cooling devices
+ * have dozen of states but some can have much more, so a hash table
+ * is more adequate in this case, because the cost of browsing the entire
+ * list when storing the transitions may not be negligible.
+ */
+#define CDEVSTATS_HASH_SIZE 16
+
+/**
+ * struct cdev_debugfs - per cooling device statistics structure
+ * A cooling device can have a high number of states. Showing the
+ * transitions on a matrix based representation can be overkill given
+ * most of the transitions won't happen and we end up with a matrix
+ * filled with zero. Instead, we show the transitions which actually
+ * happened.
+ *
+ * Every transition updates the current_state and the timestamp. The
+ * transitions and the durations are stored in lists.
+ *
+ * @total: the number of transitions for this cooling device
+ * @current_state: the current cooling device state
+ * @timestamp: the state change timestamp
+ * @transitions: an array of lists containing the state transitions
+ * @durations: an array of lists containing the residencies of each state
+ */
+struct cdev_debugfs {
+	u32 total;
+	int current_state;
+	ktime_t timestamp;
+	struct list_head transitions[CDEVSTATS_HASH_SIZE];
+	struct list_head durations[CDEVSTATS_HASH_SIZE];
+};
+
+/**
+ * struct cdev_value - Common structure for cooling device entry
+ *
+ * The following common structure allows to store the information
+ * related to the transitions and to the state residencies. They are
+ * identified with a id which is associated to a value. It is used as
+ * nodes for the "transitions" and "durations" above.
+ *
+ * @node: node to insert the structure in a list
+ * @id: identifier of the value which can be a state or a transition
+ * @residency: a ktime_t representing a state residency duration
+ * @count: a number of occurrences
+ */
+struct cdev_record {
+	struct list_head node;
+	int id;
+	union {
+                ktime_t residency;
+                u64 count;
+        };
+};
+
+/**
+ * struct thermal_debugfs - High level structure for a thermal object in debugfs
+ *
+ * The thermal_debugfs structure is the common structure used by the
+ * cooling device to compute the statistics.
+ *
+ * @d_top: top directory of the thermal object directory
+ * @lock: per object lock to protect the internals
+ *
+ * @cdev: a cooling device debug structure
+ */
+struct thermal_debugfs {
+	struct dentry *d_top;
+	struct mutex lock;
+	union {
+		struct cdev_debugfs cdev_dbg;
+	};
+};
+
+void thermal_debug_init(void)
+{
+	d_root = debugfs_create_dir("thermal", NULL);
+	if (!d_root)
+		return;
+
+	d_cdev = debugfs_create_dir("cooling_devices", d_root);
+}
+
+static struct thermal_debugfs *thermal_debugfs_add_id(struct dentry *d, int id)
+{
+	struct thermal_debugfs *thermal_dbg;
+	char ids[IDSLENGTH];
+
+	thermal_dbg = kzalloc(sizeof(*thermal_dbg), GFP_KERNEL);
+	if (!thermal_dbg)
+		return NULL;
+
+	mutex_init(&thermal_dbg->lock);
+
+	snprintf(ids, IDSLENGTH, "%d", id);
+
+	thermal_dbg->d_top = debugfs_create_dir(ids, d);
+	if (!thermal_dbg->d_top) {
+		kfree(thermal_dbg);
+		return NULL;
+	}
+
+	return thermal_dbg;
+}
+
+static void thermal_debugfs_remove_id(struct thermal_debugfs *thermal_dbg)
+{
+	if (!thermal_dbg)
+		return;
+
+	debugfs_remove(thermal_dbg->d_top);
+
+	kfree(thermal_dbg);
+}
+
+static struct cdev_record *
+thermal_debugfs_cdev_record_alloc(struct thermal_debugfs *thermal_dbg,
+				  struct list_head *lists, int id)
+{
+	struct cdev_record *cdev_record;
+
+	cdev_record = kzalloc(sizeof(*cdev_record), GFP_KERNEL);
+	if (!cdev_record)
+		return NULL;
+
+	cdev_record->id = id;
+	INIT_LIST_HEAD(&cdev_record->node);
+	list_add_tail(&cdev_record->node,
+		      &lists[cdev_record->id % CDEVSTATS_HASH_SIZE]);
+
+	return cdev_record;
+}
+
+static struct cdev_record *
+thermal_debugfs_cdev_record_find(struct thermal_debugfs *thermal_dbg,
+				 struct list_head *lists, int id)
+{
+	struct cdev_record *entry;
+
+	list_for_each_entry(entry, &lists[id % CDEVSTATS_HASH_SIZE], node)
+		if (entry->id == id)
+			return entry;
+
+	return NULL;
+}
+
+static struct cdev_record *
+thermal_debugfs_cdev_record_get(struct thermal_debugfs *thermal_dbg,
+				struct list_head *lists, int id)
+{
+	struct cdev_record *cdev_record;
+
+	cdev_record = thermal_debugfs_cdev_record_find(thermal_dbg, lists, id);
+	if (cdev_record)
+		return cdev_record;
+
+	return thermal_debugfs_cdev_record_alloc(thermal_dbg, lists, id);
+}
+
+static void thermal_debugfs_cdev_clear(struct cdev_debugfs *cdev_dbg)
+{
+	int i;
+	struct cdev_record *entry, *tmp;
+
+	for (i = 0; i < CDEVSTATS_HASH_SIZE; i++) {
+
+		list_for_each_entry_safe(entry, tmp,
+					 &cdev_dbg->transitions[i], node) {
+			list_del(&entry->node);
+			kfree(entry);
+		}
+
+		list_for_each_entry_safe(entry, tmp,
+					 &cdev_dbg->durations[i], node) {
+			list_del(&entry->node);
+			kfree(entry);
+		}
+	}
+
+	cdev_dbg->total = 0;
+}
+
+static void *cdev_seq_start(struct seq_file *s, loff_t *pos)
+{
+	struct thermal_debugfs *thermal_dbg = s->private;
+
+	mutex_lock(&thermal_dbg->lock);
+
+	return (*pos < CDEVSTATS_HASH_SIZE) ? pos : NULL;
+}
+
+static void *cdev_seq_next(struct seq_file *s, void *v, loff_t *pos)
+{
+	(*pos)++;
+
+	return (*pos < CDEVSTATS_HASH_SIZE) ? pos : NULL;
+}
+
+static void cdev_seq_stop(struct seq_file *s, void *v)
+{
+	struct thermal_debugfs *thermal_dbg = s->private;
+
+	mutex_unlock(&thermal_dbg->lock);
+}
+
+static int cdev_tt_seq_show(struct seq_file *s, void *v)
+{
+	struct thermal_debugfs *thermal_dbg = s->private;
+	struct cdev_debugfs *cdev_dbg = &thermal_dbg->cdev_dbg;
+	struct list_head *transitions = cdev_dbg->transitions;
+	struct cdev_record *entry;
+	int i = *(loff_t *)v;
+
+	if (!i)
+		seq_puts(s, "Transition\tOccurences\n");
+
+	list_for_each_entry(entry, &transitions[i], node) {
+		/*
+		 * Assuming maximum cdev states is 1024, the longer
+		 * string for a transition would be "1024->1024\0"
+		 */
+		char buffer[11];
+
+		snprintf(buffer, ARRAY_SIZE(buffer), "%d->%d",
+			 entry->id >> 16, entry->id & 0xFFFF);
+
+		seq_printf(s, "%-10s\t%-10llu\n", buffer, entry->count);
+	}
+
+	return 0;
+}
+
+static const struct seq_operations tt_sops = {
+	.start = cdev_seq_start,
+	.next = cdev_seq_next,
+	.stop = cdev_seq_stop,
+	.show = cdev_tt_seq_show,
+};
+
+DEFINE_SEQ_ATTRIBUTE(tt);
+
+static int cdev_dt_seq_show(struct seq_file *s, void *v)
+{
+	struct thermal_debugfs *thermal_dbg = s->private;
+	struct cdev_debugfs *cdev_dbg = &thermal_dbg->cdev_dbg;
+	struct list_head *durations = cdev_dbg->durations;
+	struct cdev_record *entry;
+	int i = *(loff_t *)v;
+
+	if (!i)
+		seq_puts(s, "State\tResidency\n");
+
+	list_for_each_entry(entry, &durations[i], node) {
+		s64 duration = ktime_to_ms(entry->residency);
+
+		if (entry->id == cdev_dbg->current_state)
+			duration += ktime_ms_delta(ktime_get(),
+						   cdev_dbg->timestamp);
+
+		seq_printf(s, "%-5d\t%-10llu\n", entry->id, duration);
+	}
+
+	return 0;
+}
+
+static const struct seq_operations dt_sops = {
+	.start = cdev_seq_start,
+	.next = cdev_seq_next,
+	.stop = cdev_seq_stop,
+	.show = cdev_dt_seq_show,
+};
+
+DEFINE_SEQ_ATTRIBUTE(dt);
+
+static int cdev_clear_set(void *data, u64 val)
+{
+	struct thermal_debugfs *thermal_dbg = data;
+
+	if (!val)
+		return -EINVAL;
+
+	mutex_lock(&thermal_dbg->lock);
+
+	thermal_debugfs_cdev_clear(&thermal_dbg->cdev_dbg);
+
+	mutex_unlock(&thermal_dbg->lock);
+
+	return 0;
+}
+
+DEFINE_DEBUGFS_ATTRIBUTE(cdev_clear_fops, NULL, cdev_clear_set, "%llu\n");
+
+/**
+ * thermal_debug_cdev_state_update - Update a cooling device state change
+ *
+ * Computes a transition and the duration of the previous state residency.
+ *
+ * @cdev : a pointer to a cooling device
+ * @new_state: an integer corresponding to the new cooling device state
+ */
+void thermal_debug_cdev_state_update(const struct thermal_cooling_device *cdev,
+				     int new_state)
+{
+	struct thermal_debugfs *thermal_dbg = cdev->debugfs;
+	struct cdev_debugfs *cdev_dbg;
+	struct cdev_record *cdev_record;
+	int transition, old_state;
+
+	if (!thermal_dbg || (thermal_dbg->cdev_dbg.current_state == new_state))
+		return;
+
+	mutex_lock(&thermal_dbg->lock);
+
+	cdev_dbg = &thermal_dbg->cdev_dbg;
+
+	old_state = cdev_dbg->current_state;
+
+	/*
+	 * Get the old state information in the durations list. If
+	 * this one does not exist, a new allocated one will be
+	 * returned. Recompute the total duration in the old state and
+	 * get a new timestamp for the new state.
+	 */
+	cdev_record = thermal_debugfs_cdev_record_get(thermal_dbg,
+						      cdev_dbg->durations,
+						      old_state);
+	if (cdev_record) {
+		ktime_t now = ktime_get();
+		ktime_t delta = ktime_sub(now, cdev_dbg->timestamp);
+		cdev_record->residency = ktime_add(cdev_record->residency, delta);
+		cdev_dbg->timestamp = now;
+	}
+
+	cdev_dbg->current_state = new_state;
+	transition = (old_state << 16) | new_state;
+
+	/*
+	 * Get the transition in the transitions list. If this one
+	 * does not exist, a new allocated one will be returned.
+	 * Increment the occurrence of this transition which is stored
+	 * in the value field.
+	 */
+	cdev_record = thermal_debugfs_cdev_record_get(thermal_dbg,
+						      cdev_dbg->transitions,
+						      transition);
+	if (cdev_record)
+		cdev_record->count++;
+
+	cdev_dbg->total++;
+
+	mutex_unlock(&thermal_dbg->lock);
+}
+
+/**
+ * thermal_debug_cdev_add - Add a cooling device debugfs entry
+ *
+ * Allocates a cooling device object for debug, initializes the
+ * statistics and create the entries in sysfs.
+ * @cdev: a pointer to a cooling device
+ */
+void thermal_debug_cdev_add(struct thermal_cooling_device *cdev)
+{
+	struct thermal_debugfs *thermal_dbg;
+	struct cdev_debugfs *cdev_dbg;
+	int i;
+
+	thermal_dbg = thermal_debugfs_add_id(d_cdev, cdev->id);
+	if (!thermal_dbg)
+		return;
+
+	cdev_dbg = &thermal_dbg->cdev_dbg;
+
+	for (i = 0; i < CDEVSTATS_HASH_SIZE; i++) {
+		INIT_LIST_HEAD(&cdev_dbg->transitions[i]);
+		INIT_LIST_HEAD(&cdev_dbg->durations[i]);
+	}
+
+	cdev_dbg->current_state = 0;
+	cdev_dbg->timestamp = ktime_get();
+
+	debugfs_create_file("trans_table", 0400, thermal_dbg->d_top,
+			    thermal_dbg, &tt_fops);
+
+	debugfs_create_file("time_in_state_ms", 0400, thermal_dbg->d_top,
+			    thermal_dbg, &dt_fops);
+
+	debugfs_create_file("clear", 0200, thermal_dbg->d_top,
+			    thermal_dbg, &cdev_clear_fops);
+
+	debugfs_create_u32("total_trans", 0400, thermal_dbg->d_top,
+			   &cdev_dbg->total);
+
+	cdev->debugfs = thermal_dbg;
+}
+
+/**
+ * thermal_debug_cdev_remove - Remove a cooling device debugfs entry
+ *
+ * Frees the statistics memory data and remove the debugfs entry
+ *
+ * @cdev: a pointer to a cooling device
+ */
+void thermal_debug_cdev_remove(struct thermal_cooling_device *cdev)
+{
+	struct thermal_debugfs *thermal_dbg = cdev->debugfs;
+
+	if (!thermal_dbg)
+		return;
+
+	mutex_lock(&thermal_dbg->lock);
+
+	thermal_debugfs_cdev_clear(&thermal_dbg->cdev_dbg);
+	cdev->debugfs = NULL;
+
+	mutex_unlock(&thermal_dbg->lock);
+
+	thermal_debugfs_remove_id(thermal_dbg);
+}
diff --git a/drivers/thermal/thermal_debugfs.h b/drivers/thermal/thermal_debugfs.h
new file mode 100644
index 000000000000..341499388448
--- /dev/null
+++ b/drivers/thermal/thermal_debugfs.h
@@ -0,0 +1,14 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#ifdef CONFIG_THERMAL_DEBUGFS
+void thermal_debug_init(void);
+void thermal_debug_cdev_add(struct thermal_cooling_device *cdev);
+void thermal_debug_cdev_remove(struct thermal_cooling_device *cdev);
+void thermal_debug_cdev_state_update(const struct thermal_cooling_device *cdev, int state);
+#else
+static inline void thermal_debug_init(void) {}
+static inline void thermal_debug_cdev_add(struct thermal_cooling_device *cdev) {}
+static inline void thermal_debug_cdev_remove(struct thermal_cooling_device *cdev) {}
+static inline void thermal_debug_cdev_state_update(const struct thermal_cooling_device *cdev,
+						   int state) {}
+#endif /* CONFIG_THERMAL_DEBUGFS */
diff --git a/drivers/thermal/thermal_helpers.c b/drivers/thermal/thermal_helpers.c
index c3982e0f0075..8f85581693e7 100644
--- a/drivers/thermal/thermal_helpers.c
+++ b/drivers/thermal/thermal_helpers.c
@@ -146,14 +146,22 @@ unlock:
 }
 EXPORT_SYMBOL_GPL(thermal_zone_get_temp);
 
-static void thermal_cdev_set_cur_state(struct thermal_cooling_device *cdev,
-				       int target)
+static int thermal_cdev_set_cur_state(struct thermal_cooling_device *cdev, int state)
 {
-	if (cdev->ops->set_cur_state(cdev, target))
-		return;
+	int ret;
+
+	/*
+	 * No check is needed for the ops->set_cur_state as the
+	 * registering function checked the ops are correctly set
+	 */
+	ret = cdev->ops->set_cur_state(cdev, state);
+	if (!ret) {
+		thermal_notify_cdev_state_update(cdev->id, state);
+		thermal_cooling_device_stats_update(cdev, state);
+		thermal_debug_cdev_state_update(cdev, state);
+	}
 
-	thermal_notify_cdev_state_update(cdev->id, target);
-	thermal_cooling_device_stats_update(cdev, target);
+	return ret;
 }
 
 void __thermal_cdev_update(struct thermal_cooling_device *cdev)
diff --git a/include/linux/thermal.h b/include/linux/thermal.h
index 9d0427da32af..7defea8fa223 100644
--- a/include/linux/thermal.h
+++ b/include/linux/thermal.h
@@ -32,6 +32,7 @@
 struct thermal_zone_device;
 struct thermal_cooling_device;
 struct thermal_instance;
+struct thermal_debugfs;
 struct thermal_attr;
 
 enum thermal_trend {
@@ -113,6 +114,9 @@ struct thermal_cooling_device {
 	struct mutex lock; /* protect thermal_instances list */
 	struct list_head thermal_instances;
 	struct list_head node;
+#ifdef CONFIG_THERMAL_DEBUGFS
+	struct thermal_debugfs *debugfs;
+#endif
 };
 
 /**
@@ -189,6 +193,9 @@ struct thermal_zone_device {
 	struct list_head node;
 	struct delayed_work poll_queue;
 	enum thermal_notify_event notify_event;
+#ifdef CONFIG_THERMAL_DEBUGFS
+	struct thermal_debugfs *debugfs;
+#endif
 	bool suspended;
 };
 
-- 
cgit v1.2.3


From 57a427c81c322c5f0cdfe7c46cdee553d18b1ec6 Mon Sep 17 00:00:00 2001
From: Christophe JAILLET <christophe.jaillet@wanadoo.fr>
Date: Fri, 5 Jan 2024 14:45:11 +0100
Subject: thermal: core: Use kstrdup_const() during cooling device registration

Some *thermal_cooling_device_register() calls pass a string literal as
the 'type' parameter, so kstrdup_const() can be used instead of
kstrdup() to avoid a memory allocation in such cases.

Signed-off-by: Christophe JAILLET <christophe.jaillet@wanadoo.fr>
[ rjw: Subject and changelog edits ]
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/thermal/thermal_core.c | 6 +++---
 include/linux/thermal.h        | 2 +-
 2 files changed, 4 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/thermal/thermal_core.c b/drivers/thermal/thermal_core.c
index abf4c2390b19..dfaa6341694a 100644
--- a/drivers/thermal/thermal_core.c
+++ b/drivers/thermal/thermal_core.c
@@ -845,7 +845,7 @@ static void thermal_release(struct device *dev)
 			    sizeof("cooling_device") - 1)) {
 		cdev = to_cooling_device(dev);
 		thermal_cooling_device_destroy_sysfs(cdev);
-		kfree(cdev->type);
+		kfree_const(cdev->type);
 		ida_free(&thermal_cdev_ida, cdev->id);
 		kfree(cdev);
 	}
@@ -917,7 +917,7 @@ __thermal_cooling_device_register(struct device_node *np,
 	cdev->id = ret;
 	id = ret;
 
-	cdev->type = kstrdup(type ? type : "", GFP_KERNEL);
+	cdev->type = kstrdup_const(type ? type : "", GFP_KERNEL);
 	if (!cdev->type) {
 		ret = -ENOMEM;
 		goto out_ida_remove;
@@ -970,7 +970,7 @@ __thermal_cooling_device_register(struct device_node *np,
 out_cooling_dev:
 	thermal_cooling_device_destroy_sysfs(cdev);
 out_cdev_type:
-	kfree(cdev->type);
+	kfree_const(cdev->type);
 out_ida_remove:
 	ida_free(&thermal_cdev_ida, id);
 out_kfree_cdev:
diff --git a/include/linux/thermal.h b/include/linux/thermal.h
index 7defea8fa223..3227335fb447 100644
--- a/include/linux/thermal.h
+++ b/include/linux/thermal.h
@@ -103,7 +103,7 @@ struct thermal_cooling_device_ops {
 
 struct thermal_cooling_device {
 	int id;
-	char *type;
+	const char *type;
 	unsigned long max_state;
 	struct device device;
 	struct device_node *np;
-- 
cgit v1.2.3


From ba5afb9a84df2e6b26a1b6389b98849cd16ea757 Mon Sep 17 00:00:00 2001
From: Christian Brauner <brauner@kernel.org>
Date: Fri, 12 Jan 2024 09:09:14 +0100
Subject: fs: rework listmount() implementation

Linus pointed out that there's error handling and naming issues in the
that we should rewrite:

* Perform the access checks for the buffer before actually doing any
  work instead of doing it during the iteration.
* Rename the arguments to listmount() and do_listmount() to clarify what
  the arguments are used for.
* Get rid of the pointless ctr variable and overflow checking.
* Get rid of the pointless speculation check.

Link: https://lore.kernel.org/r/CAHk-=wjh6Cypo8WC-McXgSzCaou3UXccxB+7PVeSuGR8AjCphg@mail.gmail.com
Suggested-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 fs/namespace.c           | 50 +++++++++++++++++++++++++++---------------------
 include/linux/syscalls.h |  2 +-
 2 files changed, 29 insertions(+), 23 deletions(-)

(limited to 'include/linux')

diff --git a/fs/namespace.c b/fs/namespace.c
index ef1fd6829814..437f60e96d40 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -5042,13 +5042,12 @@ static struct mount *listmnt_next(struct mount *curr)
 	return node_to_mount(rb_next(&curr->mnt_node));
 }
 
-static ssize_t do_listmount(struct mount *first, struct path *orig, u64 mnt_id,
-			    u64 __user *buf, size_t bufsize,
-			    const struct path *root)
+static ssize_t do_listmount(struct mount *first, struct path *orig,
+			    u64 mnt_parent_id, u64 __user *mnt_ids,
+			    size_t nr_mnt_ids, const struct path *root)
 {
 	struct mount *r;
-	ssize_t ctr;
-	int err;
+	ssize_t ret;
 
 	/*
 	 * Don't trigger audit denials. We just want to determine what
@@ -5058,50 +5057,57 @@ static ssize_t do_listmount(struct mount *first, struct path *orig, u64 mnt_id,
 	    !ns_capable_noaudit(&init_user_ns, CAP_SYS_ADMIN))
 		return -EPERM;
 
-	err = security_sb_statfs(orig->dentry);
-	if (err)
-		return err;
+	ret = security_sb_statfs(orig->dentry);
+	if (ret)
+		return ret;
 
-	for (ctr = 0, r = first; r && ctr < bufsize; r = listmnt_next(r)) {
-		if (r->mnt_id_unique == mnt_id)
+	for (ret = 0, r = first; r && nr_mnt_ids; r = listmnt_next(r)) {
+		if (r->mnt_id_unique == mnt_parent_id)
 			continue;
 		if (!is_path_reachable(r, r->mnt.mnt_root, orig))
 			continue;
-		ctr = array_index_nospec(ctr, bufsize);
-		if (put_user(r->mnt_id_unique, buf + ctr))
+		if (put_user(r->mnt_id_unique, mnt_ids))
 			return -EFAULT;
-		if (check_add_overflow(ctr, 1, &ctr))
-			return -ERANGE;
+		mnt_ids++;
+		nr_mnt_ids--;
+		ret++;
 	}
-	return ctr;
+	return ret;
 }
 
-SYSCALL_DEFINE4(listmount, const struct mnt_id_req __user *, req,
-		u64 __user *, buf, size_t, bufsize, unsigned int, flags)
+SYSCALL_DEFINE4(listmount, const struct mnt_id_req __user *, req, u64 __user *,
+		mnt_ids, size_t, nr_mnt_ids, unsigned int, flags)
 {
 	struct mnt_namespace *ns = current->nsproxy->mnt_ns;
 	struct mnt_id_req kreq;
 	struct mount *first;
 	struct path root, orig;
-	u64 mnt_id, last_mnt_id;
+	u64 mnt_parent_id, last_mnt_id;
+	const size_t maxcount = (size_t)-1 >> 3;
 	ssize_t ret;
 
 	if (flags)
 		return -EINVAL;
 
+	if (unlikely(nr_mnt_ids > maxcount))
+		return -EFAULT;
+
+	if (!access_ok(mnt_ids, nr_mnt_ids * sizeof(*mnt_ids)))
+		return -EFAULT;
+
 	ret = copy_mnt_id_req(req, &kreq);
 	if (ret)
 		return ret;
-	mnt_id = kreq.mnt_id;
+	mnt_parent_id = kreq.mnt_id;
 	last_mnt_id = kreq.param;
 
 	down_read(&namespace_sem);
 	get_fs_root(current->fs, &root);
-	if (mnt_id == LSMT_ROOT) {
+	if (mnt_parent_id == LSMT_ROOT) {
 		orig = root;
 	} else {
 		ret = -ENOENT;
-		orig.mnt  = lookup_mnt_in_ns(mnt_id, ns);
+		orig.mnt = lookup_mnt_in_ns(mnt_parent_id, ns);
 		if (!orig.mnt)
 			goto err;
 		orig.dentry = orig.mnt->mnt_root;
@@ -5111,7 +5117,7 @@ SYSCALL_DEFINE4(listmount, const struct mnt_id_req __user *, req,
 	else
 		first = mnt_find_id_at(ns, last_mnt_id + 1);
 
-	ret = do_listmount(first, &orig, mnt_id, buf, bufsize, &root);
+	ret = do_listmount(first, &orig, mnt_parent_id, mnt_ids, nr_mnt_ids, &root);
 err:
 	path_put(&root);
 	up_read(&namespace_sem);
diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index 5c0dbef55792..cdba4d0c6d4a 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -414,7 +414,7 @@ asmlinkage long sys_statmount(const struct mnt_id_req __user *req,
 			      struct statmount __user *buf, size_t bufsize,
 			      unsigned int flags);
 asmlinkage long sys_listmount(const struct mnt_id_req __user *req,
-			      u64 __user *buf, size_t bufsize,
+			      u64 __user *mnt_ids, size_t nr_mnt_ids,
 			      unsigned int flags);
 asmlinkage long sys_truncate(const char __user *path, long length);
 asmlinkage long sys_ftruncate(unsigned int fd, unsigned long length);
-- 
cgit v1.2.3


From 9181d6f8a2bb32d158de66a84164fac05e3ddd18 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Fri, 12 Jan 2024 12:28:16 +0000
Subject: net: add more sanity check in virtio_net_hdr_to_skb()

syzbot/KMSAN reports access to uninitialized data from gso_features_check() [1]

The repro use af_packet, injecting a gso packet and hdrlen == 0.

We could fix the issue making gso_features_check() more careful
while dealing with NETIF_F_TSO_MANGLEID in fast path.

Or we can make sure virtio_net_hdr_to_skb() pulls minimal network and
transport headers as intended.

Note that for GSO packets coming from untrusted sources, SKB_GSO_DODGY
bit forces a proper header validation (and pull) before the packet can
hit any device ndo_start_xmit(), thus we do not need a precise disection
at virtio_net_hdr_to_skb() stage.

[1]
BUG: KMSAN: uninit-value in skb_gso_segment include/net/gso.h:83 [inline]
BUG: KMSAN: uninit-value in validate_xmit_skb+0x10f2/0x1930 net/core/dev.c:3629
 skb_gso_segment include/net/gso.h:83 [inline]
 validate_xmit_skb+0x10f2/0x1930 net/core/dev.c:3629
 __dev_queue_xmit+0x1eac/0x5130 net/core/dev.c:4341
 dev_queue_xmit include/linux/netdevice.h:3134 [inline]
 packet_xmit+0x9c/0x6b0 net/packet/af_packet.c:276
 packet_snd net/packet/af_packet.c:3087 [inline]
 packet_sendmsg+0x8b1d/0x9f30 net/packet/af_packet.c:3119
 sock_sendmsg_nosec net/socket.c:730 [inline]
 __sock_sendmsg net/socket.c:745 [inline]
 ____sys_sendmsg+0x9c2/0xd60 net/socket.c:2584
 ___sys_sendmsg+0x28d/0x3c0 net/socket.c:2638
 __sys_sendmsg net/socket.c:2667 [inline]
 __do_sys_sendmsg net/socket.c:2676 [inline]
 __se_sys_sendmsg net/socket.c:2674 [inline]
 __x64_sys_sendmsg+0x307/0x490 net/socket.c:2674
 do_syscall_x64 arch/x86/entry/common.c:52 [inline]
 do_syscall_64+0x44/0x110 arch/x86/entry/common.c:83
 entry_SYSCALL_64_after_hwframe+0x63/0x6b

Uninit was created at:
 slab_post_alloc_hook+0x129/0xa70 mm/slab.h:768
 slab_alloc_node mm/slub.c:3478 [inline]
 kmem_cache_alloc_node+0x5e9/0xb10 mm/slub.c:3523
 kmalloc_reserve+0x13d/0x4a0 net/core/skbuff.c:560
 __alloc_skb+0x318/0x740 net/core/skbuff.c:651
 alloc_skb include/linux/skbuff.h:1286 [inline]
 alloc_skb_with_frags+0xc8/0xbd0 net/core/skbuff.c:6334
 sock_alloc_send_pskb+0xa80/0xbf0 net/core/sock.c:2780
 packet_alloc_skb net/packet/af_packet.c:2936 [inline]
 packet_snd net/packet/af_packet.c:3030 [inline]
 packet_sendmsg+0x70e8/0x9f30 net/packet/af_packet.c:3119
 sock_sendmsg_nosec net/socket.c:730 [inline]
 __sock_sendmsg net/socket.c:745 [inline]
 ____sys_sendmsg+0x9c2/0xd60 net/socket.c:2584
 ___sys_sendmsg+0x28d/0x3c0 net/socket.c:2638
 __sys_sendmsg net/socket.c:2667 [inline]
 __do_sys_sendmsg net/socket.c:2676 [inline]
 __se_sys_sendmsg net/socket.c:2674 [inline]
 __x64_sys_sendmsg+0x307/0x490 net/socket.c:2674
 do_syscall_x64 arch/x86/entry/common.c:52 [inline]
 do_syscall_64+0x44/0x110 arch/x86/entry/common.c:83
 entry_SYSCALL_64_after_hwframe+0x63/0x6b

CPU: 0 PID: 5025 Comm: syz-executor279 Not tainted 6.7.0-rc7-syzkaller-00003-gfbafc3e621c3 #0
Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 11/17/2023

Reported-by: syzbot+7f4d0ea3df4d4fa9a65f@syzkaller.appspotmail.com
Link: https://lore.kernel.org/netdev/0000000000005abd7b060eb160cd@google.com/
Fixes: 9274124f023b ("net: stricter validation of untrusted gso packets")
Signed-off-by: Eric Dumazet <edumazet@google.com>
Cc: Willem de Bruijn <willemb@google.com>
Reviewed-by: Willem de Bruijn <willemb@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/virtio_net.h | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/virtio_net.h b/include/linux/virtio_net.h
index 27cc1d464321..4dfa9b69ca8d 100644
--- a/include/linux/virtio_net.h
+++ b/include/linux/virtio_net.h
@@ -3,6 +3,8 @@
 #define _LINUX_VIRTIO_NET_H
 
 #include <linux/if_vlan.h>
+#include <linux/ip.h>
+#include <linux/ipv6.h>
 #include <linux/udp.h>
 #include <uapi/linux/tcp.h>
 #include <uapi/linux/virtio_net.h>
@@ -49,6 +51,7 @@ static inline int virtio_net_hdr_to_skb(struct sk_buff *skb,
 					const struct virtio_net_hdr *hdr,
 					bool little_endian)
 {
+	unsigned int nh_min_len = sizeof(struct iphdr);
 	unsigned int gso_type = 0;
 	unsigned int thlen = 0;
 	unsigned int p_off = 0;
@@ -65,6 +68,7 @@ static inline int virtio_net_hdr_to_skb(struct sk_buff *skb,
 			gso_type = SKB_GSO_TCPV6;
 			ip_proto = IPPROTO_TCP;
 			thlen = sizeof(struct tcphdr);
+			nh_min_len = sizeof(struct ipv6hdr);
 			break;
 		case VIRTIO_NET_HDR_GSO_UDP:
 			gso_type = SKB_GSO_UDP;
@@ -100,7 +104,8 @@ static inline int virtio_net_hdr_to_skb(struct sk_buff *skb,
 		if (!skb_partial_csum_set(skb, start, off))
 			return -EINVAL;
 
-		p_off = skb_transport_offset(skb) + thlen;
+		nh_min_len = max_t(u32, nh_min_len, skb_transport_offset(skb));
+		p_off = nh_min_len + thlen;
 		if (!pskb_may_pull(skb, p_off))
 			return -EINVAL;
 	} else {
@@ -140,7 +145,7 @@ retry:
 
 			skb_set_transport_header(skb, keys.control.thoff);
 		} else if (gso_type) {
-			p_off = thlen;
+			p_off = nh_min_len + thlen;
 			if (!pskb_may_pull(skb, p_off))
 				return -EINVAL;
 		}
-- 
cgit v1.2.3


From 34d946b723b53488ab39d8ac540ddf9db255317a Mon Sep 17 00:00:00 2001
From: Frank Li <Frank.Li@nxp.com>
Date: Tue, 9 Jan 2024 00:25:48 -0500
Subject: i3c: master: fix kernel-doc check warning

Fix warning found by
	'scripts/kernel-doc -v -none include/linux/i3c/master.h'

include/linux/i3c/master.h:457: warning: Function parameter or member 'enable_hotjoin' not described in 'i3c_master_controller_ops'
include/linux/i3c/master.h:457: warning: Function parameter or member 'disable_hotjoin' not described in 'i3c_master_controller_ops'
include/linux/i3c/master.h:499: warning: Function parameter or member 'hotjoin' not described in 'i3c_master_controller'

Signed-off-by: Frank Li <Frank.Li@nxp.com>
Reviewed-by: Miquel Raynal <miquel.raynal@bootlin.com>
Link: https://lore.kernel.org/r/20240109052548.2128133-1-Frank.Li@nxp.com
Signed-off-by: Alexandre Belloni <alexandre.belloni@bootlin.com>
---
 include/linux/i3c/master.h | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/i3c/master.h b/include/linux/i3c/master.h
index 1ecd73b17ff5..0ca27dd86956 100644
--- a/include/linux/i3c/master.h
+++ b/include/linux/i3c/master.h
@@ -433,6 +433,8 @@ struct i3c_bus {
  *		      for a future IBI
  *		      This method is mandatory only if ->request_ibi is not
  *		      NULL.
+ * @enable_hotjoin: enable hot join event detect.
+ * @disable_hotjoin: disable hot join event detect.
  */
 struct i3c_master_controller_ops {
 	int (*bus_init)(struct i3c_master_controller *master);
@@ -474,6 +476,7 @@ struct i3c_master_controller_ops {
  * @ops: master operations. See &struct i3c_master_controller_ops
  * @secondary: true if the master is a secondary master
  * @init_done: true when the bus initialization is done
+ * @hotjoin: true if the master support hotjoin
  * @boardinfo.i3c: list of I3C  boardinfo objects
  * @boardinfo.i2c: list of I2C boardinfo objects
  * @boardinfo: board-level information attached to devices connected on the bus
-- 
cgit v1.2.3


From aaefabc4a5f7ae48682c4d2d5d10faaf95c08eb9 Mon Sep 17 00:00:00 2001
From: Xiubo Li <xiubli@redhat.com>
Date: Tue, 7 Nov 2023 10:44:41 +0800
Subject: ceph: try to allocate a smaller extent map for sparse read

In fscrypt case and for a smaller read length we can predict the
max count of the extent map. And for small read length use cases
this could save some memories.

[ idryomov: squash into a single patch to avoid build break, drop
  redundant variable in ceph_alloc_sparse_ext_map() ]

Signed-off-by: Xiubo Li <xiubli@redhat.com>
Reviewed-by: Ilya Dryomov <idryomov@gmail.com>
Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
---
 fs/ceph/addr.c                  |  4 +++-
 fs/ceph/file.c                  |  8 ++++++--
 fs/ceph/super.h                 | 14 ++++++++++++++
 include/linux/ceph/osd_client.h |  7 +++++--
 4 files changed, 28 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c
index 85be3bf18cdf..a5caafc1859e 100644
--- a/fs/ceph/addr.c
+++ b/fs/ceph/addr.c
@@ -357,6 +357,7 @@ static void ceph_netfs_issue_read(struct netfs_io_subrequest *subreq)
 	u64 len = subreq->len;
 	bool sparse = IS_ENCRYPTED(inode) || ceph_test_mount_opt(fsc, SPARSEREAD);
 	u64 off = subreq->start;
+	int extent_cnt;
 
 	if (ceph_inode_is_shutdown(inode)) {
 		err = -EIO;
@@ -379,7 +380,8 @@ static void ceph_netfs_issue_read(struct netfs_io_subrequest *subreq)
 	}
 
 	if (sparse) {
-		err = ceph_alloc_sparse_ext_map(&req->r_ops[0]);
+		extent_cnt = __ceph_sparse_read_ext_count(inode, len);
+		err = ceph_alloc_sparse_ext_map(&req->r_ops[0], extent_cnt);
 		if (err)
 			goto out;
 	}
diff --git a/fs/ceph/file.c b/fs/ceph/file.c
index 3b5aae29e944..4dde0da10079 100644
--- a/fs/ceph/file.c
+++ b/fs/ceph/file.c
@@ -1028,6 +1028,7 @@ ssize_t __ceph_sync_read(struct inode *inode, loff_t *ki_pos,
 		struct ceph_osd_req_op *op;
 		u64 read_off = off;
 		u64 read_len = len;
+		int extent_cnt;
 
 		/* determine new offset/length if encrypted */
 		ceph_fscrypt_adjust_off_and_len(inode, &read_off, &read_len);
@@ -1067,7 +1068,8 @@ ssize_t __ceph_sync_read(struct inode *inode, loff_t *ki_pos,
 
 		op = &req->r_ops[0];
 		if (sparse) {
-			ret = ceph_alloc_sparse_ext_map(op);
+			extent_cnt = __ceph_sparse_read_ext_count(inode, read_len);
+			ret = ceph_alloc_sparse_ext_map(op, extent_cnt);
 			if (ret) {
 				ceph_osdc_put_request(req);
 				break;
@@ -1464,6 +1466,7 @@ ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter,
 		ssize_t len;
 		struct ceph_osd_req_op *op;
 		int readop = sparse ? CEPH_OSD_OP_SPARSE_READ : CEPH_OSD_OP_READ;
+		int extent_cnt;
 
 		if (write)
 			size = min_t(u64, size, fsc->mount_options->wsize);
@@ -1527,7 +1530,8 @@ ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter,
 		osd_req_op_extent_osd_data_bvecs(req, 0, bvecs, num_pages, len);
 		op = &req->r_ops[0];
 		if (sparse) {
-			ret = ceph_alloc_sparse_ext_map(op);
+			extent_cnt = __ceph_sparse_read_ext_count(inode, size);
+			ret = ceph_alloc_sparse_ext_map(op, extent_cnt);
 			if (ret) {
 				ceph_osdc_put_request(req);
 				break;
diff --git a/fs/ceph/super.h b/fs/ceph/super.h
index fe0f64a0acb2..b06e2bc86221 100644
--- a/fs/ceph/super.h
+++ b/fs/ceph/super.h
@@ -3,6 +3,7 @@
 #define _FS_CEPH_SUPER_H
 
 #include <linux/ceph/ceph_debug.h>
+#include <linux/ceph/osd_client.h>
 
 #include <asm/unaligned.h>
 #include <linux/backing-dev.h>
@@ -1407,6 +1408,19 @@ static inline void __ceph_update_quota(struct ceph_inode_info *ci,
 		ceph_adjust_quota_realms_count(&ci->netfs.inode, has_quota);
 }
 
+static inline int __ceph_sparse_read_ext_count(struct inode *inode, u64 len)
+{
+	int cnt = 0;
+
+	if (IS_ENCRYPTED(inode)) {
+		cnt = len >> CEPH_FSCRYPT_BLOCK_SHIFT;
+		if (cnt > CEPH_SPARSE_EXT_ARRAY_INITIAL)
+			cnt = 0;
+	}
+
+	return cnt;
+}
+
 extern void ceph_handle_quota(struct ceph_mds_client *mdsc,
 			      struct ceph_mds_session *session,
 			      struct ceph_msg *msg);
diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h
index b8610e9d2471..fa018d5864e7 100644
--- a/include/linux/ceph/osd_client.h
+++ b/include/linux/ceph/osd_client.h
@@ -572,9 +572,12 @@ int __ceph_alloc_sparse_ext_map(struct ceph_osd_req_op *op, int cnt);
  */
 #define CEPH_SPARSE_EXT_ARRAY_INITIAL  16
 
-static inline int ceph_alloc_sparse_ext_map(struct ceph_osd_req_op *op)
+static inline int ceph_alloc_sparse_ext_map(struct ceph_osd_req_op *op, int cnt)
 {
-	return __ceph_alloc_sparse_ext_map(op, CEPH_SPARSE_EXT_ARRAY_INITIAL);
+	if (!cnt)
+		cnt = CEPH_SPARSE_EXT_ARRAY_INITIAL;
+
+	return __ceph_alloc_sparse_ext_map(op, cnt);
 }
 
 extern void ceph_osdc_get_request(struct ceph_osd_request *req);
-- 
cgit v1.2.3


From 832b371097eb928d077c827b8f117bf5b99d35c0 Mon Sep 17 00:00:00 2001
From: Lukas Wunner <lukas@wunner.de>
Date: Mon, 15 Jan 2024 16:05:26 +0100
Subject: gpiolib: Fix scope-based gpio_device refcounting

Commit 9e4555d1e54a ("gpiolib: add support for scope-based management to
gpio_device") sought to add scope-based gpio_device refcounting, but
erroneously forgot a negation of IS_ERR_OR_NULL().

As a result, gpio_device_put() is not called if the gpio_device pointer
is valid (meaning the ref is leaked), but only called if the pointer is
NULL or an ERR_PTR().

While at it drop a superfluous trailing semicolon.

Fixes: 9e4555d1e54a ("gpiolib: add support for scope-based management to gpio_device")
Signed-off-by: Lukas Wunner <lukas@wunner.de>
Signed-off-by: Bartosz Golaszewski <bartosz.golaszewski@linaro.org>
---
 include/linux/gpio/driver.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/gpio/driver.h b/include/linux/gpio/driver.h
index e846bd4e7559..9a5c6c76e653 100644
--- a/include/linux/gpio/driver.h
+++ b/include/linux/gpio/driver.h
@@ -635,7 +635,7 @@ struct gpio_device *gpio_device_get(struct gpio_device *gdev);
 void gpio_device_put(struct gpio_device *gdev);
 
 DEFINE_FREE(gpio_device_put, struct gpio_device *,
-	    if (IS_ERR_OR_NULL(_T)) gpio_device_put(_T));
+	    if (!IS_ERR_OR_NULL(_T)) gpio_device_put(_T))
 
 struct device *gpio_device_to_device(struct gpio_device *gdev);
 
-- 
cgit v1.2.3


From 2db6b72c989763e30fab83b186e9263fece26bc6 Mon Sep 17 00:00:00 2001
From: Bjorn Helgaas <bhelgaas@google.com>
Date: Wed, 6 Dec 2023 12:17:51 -0600
Subject: PCI: Fix kernel-doc issues

Fix kernel-doc issues reported by
"find include -name \*pci\* | xargs scripts/kernel-doc -none":

  include/linux/pci.h:731: warning: Function parameter or member 'pdev' not described in 'pci_is_vga'
  include/linux/pci-epc.h:154: warning: Function parameter or member 'list_lock' not described in 'pci_epc'
  include/linux/pci-epf.h:83: warning: expecting prototype for struct pci_epf_event_ops. Prototype was for struct pci_epc_event_ops instead

Link: https://lore.kernel.org/r/20240111162850.2177655-1-helgaas@kernel.org
Tested-by: Randy Dunlap <rdunlap@infradead.org>
Signed-off-by: Bjorn Helgaas <bhelgaas@google.com>
Reviewed-by: Manivannan Sadhasivam <manivannan.sadhasivam@linaro.org>
Acked-by: Randy Dunlap <rdunlap@infradead.org>
Acked-by: Sui Jingfeng <suijingfeng@loongson.cn>
---
 include/linux/pci-epc.h | 2 +-
 include/linux/pci-epf.h | 2 +-
 include/linux/pci.h     | 1 +
 3 files changed, 3 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/pci-epc.h b/include/linux/pci-epc.h
index 5cb694031072..bfe41b03b70c 100644
--- a/include/linux/pci-epc.h
+++ b/include/linux/pci-epc.h
@@ -122,7 +122,7 @@ struct pci_epc_mem {
  * struct pci_epc - represents the PCI EPC device
  * @dev: PCI EPC device
  * @pci_epf: list of endpoint functions present in this EPC device
- * list_lock: Mutex for protecting pci_epf list
+ * @list_lock: Mutex for protecting pci_epf list
  * @ops: function pointers for performing endpoint operations
  * @windows: array of address space of the endpoint controller
  * @mem: first window of the endpoint controller, which corresponds to
diff --git a/include/linux/pci-epf.h b/include/linux/pci-epf.h
index 3f44b6aec477..92d0b71d33d7 100644
--- a/include/linux/pci-epf.h
+++ b/include/linux/pci-epf.h
@@ -68,7 +68,7 @@ struct pci_epf_ops {
 };
 
 /**
- * struct pci_epf_event_ops - Callbacks for capturing the EPC events
+ * struct pci_epc_event_ops - Callbacks for capturing the EPC events
  * @core_init: Callback for the EPC initialization complete event
  * @link_up: Callback for the EPC link up event
  * @link_down: Callback for the EPC link down event
diff --git a/include/linux/pci.h b/include/linux/pci.h
index 1a89dc66f89a..eb45087d7e00 100644
--- a/include/linux/pci.h
+++ b/include/linux/pci.h
@@ -715,6 +715,7 @@ static inline bool pci_is_bridge(struct pci_dev *dev)
 
 /**
  * pci_is_vga - check if the PCI device is a VGA device
+ * @pdev: PCI device
  *
  * The PCI Code and ID Assignment spec, r1.15, secs 1.4 and 1.1, define
  * VGA Base Class and Sub-Classes:
-- 
cgit v1.2.3


From 5f4c01f1e3c7b0c8d1e5dd6f080531de7aa5e47b Mon Sep 17 00:00:00 2001
From: Leonardo Bras <leobras@redhat.com>
Date: Mon, 15 Jan 2024 17:19:34 -0300
Subject: spinlock: Fix failing build for PREEMPT_RT

Since 1d71b30e1f85 ("sched.h: Move (spin|rwlock)_needbreak() to
spinlock.h") build fails for PREEMPT_RT, since there is no definition
available of either spin_needbreak() and rwlock_needbreak().

Since it was moved on the mentioned commit, it was placed inside a
!PREEMPT_RT part of the code, making it out of reach for an RT kernel.

Fix this by moving code it a few lines down so it can be reached by an
RT build, where it can also make use of the *_is_contended() definition
added by the spinlock_rt.h.

Fixes: d1d71b30e1f85 ("sched.h: Move (spin|rwlock)_needbreak() to
spinlock.h")
Signed-off-by: Leonardo Bras <leobras@redhat.com>
Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
Acked-by: Waiman Long <longman@redhat.com>
---
 include/linux/spinlock.h | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/spinlock.h b/include/linux/spinlock.h
index 0c71f06454d9..b5c59fdad160 100644
--- a/include/linux/spinlock.h
+++ b/include/linux/spinlock.h
@@ -449,6 +449,12 @@ static __always_inline int spin_is_contended(spinlock_t *lock)
 	return raw_spin_is_contended(&lock->rlock);
 }
 
+#define assert_spin_locked(lock)	assert_raw_spin_locked(&(lock)->rlock)
+
+#else  /* !CONFIG_PREEMPT_RT */
+# include <linux/spinlock_rt.h>
+#endif /* CONFIG_PREEMPT_RT */
+
 /*
  * Does a critical section need to be broken due to another
  * task waiting?: (technically does not depend on CONFIG_PREEMPTION,
@@ -480,12 +486,6 @@ static inline int rwlock_needbreak(rwlock_t *lock)
 #endif
 }
 
-#define assert_spin_locked(lock)	assert_raw_spin_locked(&(lock)->rlock)
-
-#else  /* !CONFIG_PREEMPT_RT */
-# include <linux/spinlock_rt.h>
-#endif /* CONFIG_PREEMPT_RT */
-
 /*
  * Pull the atomic_t declaration:
  * (asm-mips/atomic.h needs above definitions)
-- 
cgit v1.2.3


From 7bed6f3d08b7af27b7015da8dc3acf2b9c1f21d7 Mon Sep 17 00:00:00 2001
From: "Matthew Wilcox (Oracle)" <willy@infradead.org>
Date: Tue, 16 Jan 2024 21:29:59 +0000
Subject: block: Fix iterating over an empty bio with bio_for_each_folio_all

If the bio contains no data, bio_first_folio() calls page_folio() on a
NULL pointer and oopses.  Move the test that we've reached the end of
the bio from bio_next_folio() to bio_first_folio().

Reported-by: syzbot+8b23309d5788a79d3eea@syzkaller.appspotmail.com
Reported-by: syzbot+004c1e0fced2b4bc3dcc@syzkaller.appspotmail.com
Fixes: 640d1930bef4 ("block: Add bio_for_each_folio_all()")
Cc: stable@vger.kernel.org
Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Link: https://lore.kernel.org/r/20240116212959.3413014-1-willy@infradead.org
[axboe: add unlikely() to error case]
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/bio.h | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/bio.h b/include/linux/bio.h
index ec4db73e5f4e..875d792bffff 100644
--- a/include/linux/bio.h
+++ b/include/linux/bio.h
@@ -286,6 +286,11 @@ static inline void bio_first_folio(struct folio_iter *fi, struct bio *bio,
 {
 	struct bio_vec *bvec = bio_first_bvec_all(bio) + i;
 
+	if (unlikely(i >= bio->bi_vcnt)) {
+		fi->folio = NULL;
+		return;
+	}
+
 	fi->folio = page_folio(bvec->bv_page);
 	fi->offset = bvec->bv_offset +
 			PAGE_SIZE * (bvec->bv_page - &fi->folio->page);
@@ -303,10 +308,8 @@ static inline void bio_next_folio(struct folio_iter *fi, struct bio *bio)
 		fi->offset = 0;
 		fi->length = min(folio_size(fi->folio), fi->_seg_count);
 		fi->_next = folio_next(fi->folio);
-	} else if (fi->_i + 1 < bio->bi_vcnt) {
-		bio_first_folio(fi, bio, fi->_i + 1);
 	} else {
-		fi->folio = NULL;
+		bio_first_folio(fi, bio, fi->_i + 1);
 	}
 }
 
-- 
cgit v1.2.3


From a54e72197037d2c9bfcd70dddaac8c8ccb5b41ba Mon Sep 17 00:00:00 2001
From: Pavel Tikhomirov <ptikhomirov@virtuozzo.com>
Date: Thu, 11 Jan 2024 23:06:39 +0800
Subject: netfilter: propagate net to nf_bridge_get_physindev

This is a preparation patch for replacing physindev with physinif on
nf_bridge_info structure. We will use dev_get_by_index_rcu to resolve
device, when needed, and it requires net to be available.

Signed-off-by: Pavel Tikhomirov <ptikhomirov@virtuozzo.com>
Reviewed-by: Simon Horman <horms@kernel.org>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/linux/netfilter_bridge.h           |  2 +-
 net/ipv4/netfilter/nf_reject_ipv4.c        |  2 +-
 net/ipv6/netfilter/nf_reject_ipv6.c        |  2 +-
 net/netfilter/ipset/ip_set_hash_netiface.c |  8 ++++----
 net/netfilter/nf_log_syslog.c              | 13 +++++++------
 net/netfilter/nf_queue.c                   |  2 +-
 net/netfilter/xt_physdev.c                 |  2 +-
 7 files changed, 16 insertions(+), 15 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/netfilter_bridge.h b/include/linux/netfilter_bridge.h
index f980edfdd278..e927b9a15a55 100644
--- a/include/linux/netfilter_bridge.h
+++ b/include/linux/netfilter_bridge.h
@@ -56,7 +56,7 @@ static inline int nf_bridge_get_physoutif(const struct sk_buff *skb)
 }
 
 static inline struct net_device *
-nf_bridge_get_physindev(const struct sk_buff *skb)
+nf_bridge_get_physindev(const struct sk_buff *skb, struct net *net)
 {
 	const struct nf_bridge_info *nf_bridge = nf_bridge_info_get(skb);
 
diff --git a/net/ipv4/netfilter/nf_reject_ipv4.c b/net/ipv4/netfilter/nf_reject_ipv4.c
index f01b038fc1cd..86e7d390671a 100644
--- a/net/ipv4/netfilter/nf_reject_ipv4.c
+++ b/net/ipv4/netfilter/nf_reject_ipv4.c
@@ -289,7 +289,7 @@ void nf_send_reset(struct net *net, struct sock *sk, struct sk_buff *oldskb,
 	 * build the eth header using the original destination's MAC as the
 	 * source, and send the RST packet directly.
 	 */
-	br_indev = nf_bridge_get_physindev(oldskb);
+	br_indev = nf_bridge_get_physindev(oldskb, net);
 	if (br_indev) {
 		struct ethhdr *oeth = eth_hdr(oldskb);
 
diff --git a/net/ipv6/netfilter/nf_reject_ipv6.c b/net/ipv6/netfilter/nf_reject_ipv6.c
index d45bc54b7ea5..27b2164f4c43 100644
--- a/net/ipv6/netfilter/nf_reject_ipv6.c
+++ b/net/ipv6/netfilter/nf_reject_ipv6.c
@@ -354,7 +354,7 @@ void nf_send_reset6(struct net *net, struct sock *sk, struct sk_buff *oldskb,
 	 * build the eth header using the original destination's MAC as the
 	 * source, and send the RST packet directly.
 	 */
-	br_indev = nf_bridge_get_physindev(oldskb);
+	br_indev = nf_bridge_get_physindev(oldskb, net);
 	if (br_indev) {
 		struct ethhdr *oeth = eth_hdr(oldskb);
 
diff --git a/net/netfilter/ipset/ip_set_hash_netiface.c b/net/netfilter/ipset/ip_set_hash_netiface.c
index 95aeb31c60e0..30a655e5c4fd 100644
--- a/net/netfilter/ipset/ip_set_hash_netiface.c
+++ b/net/netfilter/ipset/ip_set_hash_netiface.c
@@ -138,9 +138,9 @@ hash_netiface4_data_next(struct hash_netiface4_elem *next,
 #include "ip_set_hash_gen.h"
 
 #if IS_ENABLED(CONFIG_BRIDGE_NETFILTER)
-static const char *get_physindev_name(const struct sk_buff *skb)
+static const char *get_physindev_name(const struct sk_buff *skb, struct net *net)
 {
-	struct net_device *dev = nf_bridge_get_physindev(skb);
+	struct net_device *dev = nf_bridge_get_physindev(skb, net);
 
 	return dev ? dev->name : NULL;
 }
@@ -177,7 +177,7 @@ hash_netiface4_kadt(struct ip_set *set, const struct sk_buff *skb,
 
 	if (opt->cmdflags & IPSET_FLAG_PHYSDEV) {
 #if IS_ENABLED(CONFIG_BRIDGE_NETFILTER)
-		const char *eiface = SRCDIR ? get_physindev_name(skb) :
+		const char *eiface = SRCDIR ? get_physindev_name(skb, xt_net(par)) :
 					      get_physoutdev_name(skb);
 
 		if (!eiface)
@@ -395,7 +395,7 @@ hash_netiface6_kadt(struct ip_set *set, const struct sk_buff *skb,
 
 	if (opt->cmdflags & IPSET_FLAG_PHYSDEV) {
 #if IS_ENABLED(CONFIG_BRIDGE_NETFILTER)
-		const char *eiface = SRCDIR ? get_physindev_name(skb) :
+		const char *eiface = SRCDIR ? get_physindev_name(skb, xt_net(par)) :
 					      get_physoutdev_name(skb);
 
 		if (!eiface)
diff --git a/net/netfilter/nf_log_syslog.c b/net/netfilter/nf_log_syslog.c
index c66689ad2b49..58402226045e 100644
--- a/net/netfilter/nf_log_syslog.c
+++ b/net/netfilter/nf_log_syslog.c
@@ -111,7 +111,8 @@ nf_log_dump_packet_common(struct nf_log_buf *m, u8 pf,
 			  unsigned int hooknum, const struct sk_buff *skb,
 			  const struct net_device *in,
 			  const struct net_device *out,
-			  const struct nf_loginfo *loginfo, const char *prefix)
+			  const struct nf_loginfo *loginfo, const char *prefix,
+			  struct net *net)
 {
 	const struct net_device *physoutdev __maybe_unused;
 	const struct net_device *physindev __maybe_unused;
@@ -121,7 +122,7 @@ nf_log_dump_packet_common(struct nf_log_buf *m, u8 pf,
 			in ? in->name : "",
 			out ? out->name : "");
 #if IS_ENABLED(CONFIG_BRIDGE_NETFILTER)
-	physindev = nf_bridge_get_physindev(skb);
+	physindev = nf_bridge_get_physindev(skb, net);
 	if (physindev && in != physindev)
 		nf_log_buf_add(m, "PHYSIN=%s ", physindev->name);
 	physoutdev = nf_bridge_get_physoutdev(skb);
@@ -148,7 +149,7 @@ static void nf_log_arp_packet(struct net *net, u_int8_t pf,
 		loginfo = &default_loginfo;
 
 	nf_log_dump_packet_common(m, pf, hooknum, skb, in, out, loginfo,
-				  prefix);
+				  prefix, net);
 	dump_arp_packet(m, loginfo, skb, skb_network_offset(skb));
 
 	nf_log_buf_close(m);
@@ -845,7 +846,7 @@ static void nf_log_ip_packet(struct net *net, u_int8_t pf,
 		loginfo = &default_loginfo;
 
 	nf_log_dump_packet_common(m, pf, hooknum, skb, in,
-				  out, loginfo, prefix);
+				  out, loginfo, prefix, net);
 
 	if (in)
 		dump_mac_header(m, loginfo, skb);
@@ -880,7 +881,7 @@ static void nf_log_ip6_packet(struct net *net, u_int8_t pf,
 		loginfo = &default_loginfo;
 
 	nf_log_dump_packet_common(m, pf, hooknum, skb, in, out,
-				  loginfo, prefix);
+				  loginfo, prefix, net);
 
 	if (in)
 		dump_mac_header(m, loginfo, skb);
@@ -916,7 +917,7 @@ static void nf_log_unknown_packet(struct net *net, u_int8_t pf,
 		loginfo = &default_loginfo;
 
 	nf_log_dump_packet_common(m, pf, hooknum, skb, in, out, loginfo,
-				  prefix);
+				  prefix, net);
 
 	dump_mac_header(m, loginfo, skb);
 
diff --git a/net/netfilter/nf_queue.c b/net/netfilter/nf_queue.c
index 3dfcb3ac5cb4..e2f334f70281 100644
--- a/net/netfilter/nf_queue.c
+++ b/net/netfilter/nf_queue.c
@@ -84,7 +84,7 @@ static void __nf_queue_entry_init_physdevs(struct nf_queue_entry *entry)
 	const struct sk_buff *skb = entry->skb;
 
 	if (nf_bridge_info_exists(skb)) {
-		entry->physin = nf_bridge_get_physindev(skb);
+		entry->physin = nf_bridge_get_physindev(skb, entry->state.net);
 		entry->physout = nf_bridge_get_physoutdev(skb);
 	} else {
 		entry->physin = NULL;
diff --git a/net/netfilter/xt_physdev.c b/net/netfilter/xt_physdev.c
index ec6ed6fda96c..343e65f377d4 100644
--- a/net/netfilter/xt_physdev.c
+++ b/net/netfilter/xt_physdev.c
@@ -59,7 +59,7 @@ physdev_mt(const struct sk_buff *skb, struct xt_action_param *par)
 	    (!!outdev ^ !(info->invert & XT_PHYSDEV_OP_BRIDGED)))
 		return false;
 
-	physdev = nf_bridge_get_physindev(skb);
+	physdev = nf_bridge_get_physindev(skb, xt_net(par));
 	indev = physdev ? physdev->name : NULL;
 
 	if ((info->bitmask & XT_PHYSDEV_OP_ISIN &&
-- 
cgit v1.2.3


From 9874808878d9eed407e3977fd11fee49de1e1d86 Mon Sep 17 00:00:00 2001
From: Pavel Tikhomirov <ptikhomirov@virtuozzo.com>
Date: Thu, 11 Jan 2024 23:06:40 +0800
Subject: netfilter: bridge: replace physindev with physinif in nf_bridge_info

An skb can be added to a neigh->arp_queue while waiting for an arp
reply. Where original skb's skb->dev can be different to neigh's
neigh->dev. For instance in case of bridging dnated skb from one veth to
another, the skb would be added to a neigh->arp_queue of the bridge.

As skb->dev can be reset back to nf_bridge->physindev and used, and as
there is no explicit mechanism that prevents this physindev from been
freed under us (for instance neigh_flush_dev doesn't cleanup skbs from
different device's neigh queue) we can crash on e.g. this stack:

arp_process
  neigh_update
    skb = __skb_dequeue(&neigh->arp_queue)
      neigh_resolve_output(..., skb)
        ...
          br_nf_dev_xmit
            br_nf_pre_routing_finish_bridge_slow
              skb->dev = nf_bridge->physindev
              br_handle_frame_finish

Let's use plain ifindex instead of net_device link. To peek into the
original net_device we will use dev_get_by_index_rcu(). Thus either we
get device and are safe to use it or we don't get it and drop skb.

Fixes: c4e70a87d975 ("netfilter: bridge: rename br_netfilter.c to br_netfilter_hooks.c")
Suggested-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pavel Tikhomirov <ptikhomirov@virtuozzo.com>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/linux/netfilter_bridge.h    |  4 ++--
 include/linux/skbuff.h              |  2 +-
 net/bridge/br_netfilter_hooks.c     | 42 ++++++++++++++++++++++++++++++-------
 net/bridge/br_netfilter_ipv6.c      | 14 +++++++++----
 net/ipv4/netfilter/nf_reject_ipv4.c |  9 +++++---
 net/ipv6/netfilter/nf_reject_ipv6.c | 11 +++++++---
 6 files changed, 61 insertions(+), 21 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/netfilter_bridge.h b/include/linux/netfilter_bridge.h
index e927b9a15a55..743475ca7e9d 100644
--- a/include/linux/netfilter_bridge.h
+++ b/include/linux/netfilter_bridge.h
@@ -42,7 +42,7 @@ static inline int nf_bridge_get_physinif(const struct sk_buff *skb)
 	if (!nf_bridge)
 		return 0;
 
-	return nf_bridge->physindev ? nf_bridge->physindev->ifindex : 0;
+	return nf_bridge->physinif;
 }
 
 static inline int nf_bridge_get_physoutif(const struct sk_buff *skb)
@@ -60,7 +60,7 @@ nf_bridge_get_physindev(const struct sk_buff *skb, struct net *net)
 {
 	const struct nf_bridge_info *nf_bridge = nf_bridge_info_get(skb);
 
-	return nf_bridge ? nf_bridge->physindev : NULL;
+	return nf_bridge ? dev_get_by_index_rcu(net, nf_bridge->physinif) : NULL;
 }
 
 static inline struct net_device *
diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index a5ae952454c8..2dde34c29203 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -295,7 +295,7 @@ struct nf_bridge_info {
 	u8			bridged_dnat:1;
 	u8			sabotage_in_done:1;
 	__u16			frag_max_size;
-	struct net_device	*physindev;
+	int			physinif;
 
 	/* always valid & non-NULL from FORWARD on, for physdev match */
 	struct net_device	*physoutdev;
diff --git a/net/bridge/br_netfilter_hooks.c b/net/bridge/br_netfilter_hooks.c
index 6adcb45bca75..ed1720890757 100644
--- a/net/bridge/br_netfilter_hooks.c
+++ b/net/bridge/br_netfilter_hooks.c
@@ -279,8 +279,17 @@ int br_nf_pre_routing_finish_bridge(struct net *net, struct sock *sk, struct sk_
 
 		if ((READ_ONCE(neigh->nud_state) & NUD_CONNECTED) &&
 		    READ_ONCE(neigh->hh.hh_len)) {
+			struct net_device *br_indev;
+
+			br_indev = nf_bridge_get_physindev(skb, net);
+			if (!br_indev) {
+				neigh_release(neigh);
+				goto free_skb;
+			}
+
 			neigh_hh_bridge(&neigh->hh, skb);
-			skb->dev = nf_bridge->physindev;
+			skb->dev = br_indev;
+
 			ret = br_handle_frame_finish(net, sk, skb);
 		} else {
 			/* the neighbour function below overwrites the complete
@@ -352,12 +361,18 @@ br_nf_ipv4_daddr_was_changed(const struct sk_buff *skb,
  */
 static int br_nf_pre_routing_finish(struct net *net, struct sock *sk, struct sk_buff *skb)
 {
-	struct net_device *dev = skb->dev;
+	struct net_device *dev = skb->dev, *br_indev;
 	struct iphdr *iph = ip_hdr(skb);
 	struct nf_bridge_info *nf_bridge = nf_bridge_info_get(skb);
 	struct rtable *rt;
 	int err;
 
+	br_indev = nf_bridge_get_physindev(skb, net);
+	if (!br_indev) {
+		kfree_skb(skb);
+		return 0;
+	}
+
 	nf_bridge->frag_max_size = IPCB(skb)->frag_max_size;
 
 	if (nf_bridge->pkt_otherhost) {
@@ -397,7 +412,7 @@ free_skb:
 		} else {
 			if (skb_dst(skb)->dev == dev) {
 bridged_dnat:
-				skb->dev = nf_bridge->physindev;
+				skb->dev = br_indev;
 				nf_bridge_update_protocol(skb);
 				nf_bridge_push_encap_header(skb);
 				br_nf_hook_thresh(NF_BR_PRE_ROUTING,
@@ -410,7 +425,7 @@ bridged_dnat:
 			skb->pkt_type = PACKET_HOST;
 		}
 	} else {
-		rt = bridge_parent_rtable(nf_bridge->physindev);
+		rt = bridge_parent_rtable(br_indev);
 		if (!rt) {
 			kfree_skb(skb);
 			return 0;
@@ -419,7 +434,7 @@ bridged_dnat:
 		skb_dst_set_noref(skb, &rt->dst);
 	}
 
-	skb->dev = nf_bridge->physindev;
+	skb->dev = br_indev;
 	nf_bridge_update_protocol(skb);
 	nf_bridge_push_encap_header(skb);
 	br_nf_hook_thresh(NF_BR_PRE_ROUTING, net, sk, skb, skb->dev, NULL,
@@ -456,7 +471,7 @@ struct net_device *setup_pre_routing(struct sk_buff *skb, const struct net *net)
 	}
 
 	nf_bridge->in_prerouting = 1;
-	nf_bridge->physindev = skb->dev;
+	nf_bridge->physinif = skb->dev->ifindex;
 	skb->dev = brnf_get_logical_dev(skb, skb->dev, net);
 
 	if (skb->protocol == htons(ETH_P_8021Q))
@@ -553,7 +568,11 @@ static int br_nf_forward_finish(struct net *net, struct sock *sk, struct sk_buff
 		if (skb->protocol == htons(ETH_P_IPV6))
 			nf_bridge->frag_max_size = IP6CB(skb)->frag_max_size;
 
-		in = nf_bridge->physindev;
+		in = nf_bridge_get_physindev(skb, net);
+		if (!in) {
+			kfree_skb(skb);
+			return 0;
+		}
 		if (nf_bridge->pkt_otherhost) {
 			skb->pkt_type = PACKET_OTHERHOST;
 			nf_bridge->pkt_otherhost = false;
@@ -899,6 +918,13 @@ static unsigned int ip_sabotage_in(void *priv,
 static void br_nf_pre_routing_finish_bridge_slow(struct sk_buff *skb)
 {
 	struct nf_bridge_info *nf_bridge = nf_bridge_info_get(skb);
+	struct net_device *br_indev;
+
+	br_indev = nf_bridge_get_physindev(skb, dev_net(skb->dev));
+	if (!br_indev) {
+		kfree_skb(skb);
+		return;
+	}
 
 	skb_pull(skb, ETH_HLEN);
 	nf_bridge->bridged_dnat = 0;
@@ -908,7 +934,7 @@ static void br_nf_pre_routing_finish_bridge_slow(struct sk_buff *skb)
 	skb_copy_to_linear_data_offset(skb, -(ETH_HLEN - ETH_ALEN),
 				       nf_bridge->neigh_header,
 				       ETH_HLEN - ETH_ALEN);
-	skb->dev = nf_bridge->physindev;
+	skb->dev = br_indev;
 
 	nf_bridge->physoutdev = NULL;
 	br_handle_frame_finish(dev_net(skb->dev), NULL, skb);
diff --git a/net/bridge/br_netfilter_ipv6.c b/net/bridge/br_netfilter_ipv6.c
index 2e24a743f917..e0421eaa3abc 100644
--- a/net/bridge/br_netfilter_ipv6.c
+++ b/net/bridge/br_netfilter_ipv6.c
@@ -102,9 +102,15 @@ static int br_nf_pre_routing_finish_ipv6(struct net *net, struct sock *sk, struc
 {
 	struct nf_bridge_info *nf_bridge = nf_bridge_info_get(skb);
 	struct rtable *rt;
-	struct net_device *dev = skb->dev;
+	struct net_device *dev = skb->dev, *br_indev;
 	const struct nf_ipv6_ops *v6ops = nf_get_ipv6_ops();
 
+	br_indev = nf_bridge_get_physindev(skb, net);
+	if (!br_indev) {
+		kfree_skb(skb);
+		return 0;
+	}
+
 	nf_bridge->frag_max_size = IP6CB(skb)->frag_max_size;
 
 	if (nf_bridge->pkt_otherhost) {
@@ -122,7 +128,7 @@ static int br_nf_pre_routing_finish_ipv6(struct net *net, struct sock *sk, struc
 		}
 
 		if (skb_dst(skb)->dev == dev) {
-			skb->dev = nf_bridge->physindev;
+			skb->dev = br_indev;
 			nf_bridge_update_protocol(skb);
 			nf_bridge_push_encap_header(skb);
 			br_nf_hook_thresh(NF_BR_PRE_ROUTING,
@@ -133,7 +139,7 @@ static int br_nf_pre_routing_finish_ipv6(struct net *net, struct sock *sk, struc
 		ether_addr_copy(eth_hdr(skb)->h_dest, dev->dev_addr);
 		skb->pkt_type = PACKET_HOST;
 	} else {
-		rt = bridge_parent_rtable(nf_bridge->physindev);
+		rt = bridge_parent_rtable(br_indev);
 		if (!rt) {
 			kfree_skb(skb);
 			return 0;
@@ -142,7 +148,7 @@ static int br_nf_pre_routing_finish_ipv6(struct net *net, struct sock *sk, struc
 		skb_dst_set_noref(skb, &rt->dst);
 	}
 
-	skb->dev = nf_bridge->physindev;
+	skb->dev = br_indev;
 	nf_bridge_update_protocol(skb);
 	nf_bridge_push_encap_header(skb);
 	br_nf_hook_thresh(NF_BR_PRE_ROUTING, net, sk, skb,
diff --git a/net/ipv4/netfilter/nf_reject_ipv4.c b/net/ipv4/netfilter/nf_reject_ipv4.c
index 86e7d390671a..04504b2b51df 100644
--- a/net/ipv4/netfilter/nf_reject_ipv4.c
+++ b/net/ipv4/netfilter/nf_reject_ipv4.c
@@ -239,7 +239,6 @@ static int nf_reject_fill_skb_dst(struct sk_buff *skb_in)
 void nf_send_reset(struct net *net, struct sock *sk, struct sk_buff *oldskb,
 		   int hook)
 {
-	struct net_device *br_indev __maybe_unused;
 	struct sk_buff *nskb;
 	struct iphdr *niph;
 	const struct tcphdr *oth;
@@ -289,9 +288,13 @@ void nf_send_reset(struct net *net, struct sock *sk, struct sk_buff *oldskb,
 	 * build the eth header using the original destination's MAC as the
 	 * source, and send the RST packet directly.
 	 */
-	br_indev = nf_bridge_get_physindev(oldskb, net);
-	if (br_indev) {
+	if (nf_bridge_info_exists(oldskb)) {
 		struct ethhdr *oeth = eth_hdr(oldskb);
+		struct net_device *br_indev;
+
+		br_indev = nf_bridge_get_physindev(oldskb, net);
+		if (!br_indev)
+			goto free_nskb;
 
 		nskb->dev = br_indev;
 		niph->tot_len = htons(nskb->len);
diff --git a/net/ipv6/netfilter/nf_reject_ipv6.c b/net/ipv6/netfilter/nf_reject_ipv6.c
index 27b2164f4c43..196dd4ecb5e2 100644
--- a/net/ipv6/netfilter/nf_reject_ipv6.c
+++ b/net/ipv6/netfilter/nf_reject_ipv6.c
@@ -278,7 +278,6 @@ static int nf_reject6_fill_skb_dst(struct sk_buff *skb_in)
 void nf_send_reset6(struct net *net, struct sock *sk, struct sk_buff *oldskb,
 		    int hook)
 {
-	struct net_device *br_indev __maybe_unused;
 	struct sk_buff *nskb;
 	struct tcphdr _otcph;
 	const struct tcphdr *otcph;
@@ -354,9 +353,15 @@ void nf_send_reset6(struct net *net, struct sock *sk, struct sk_buff *oldskb,
 	 * build the eth header using the original destination's MAC as the
 	 * source, and send the RST packet directly.
 	 */
-	br_indev = nf_bridge_get_physindev(oldskb, net);
-	if (br_indev) {
+	if (nf_bridge_info_exists(oldskb)) {
 		struct ethhdr *oeth = eth_hdr(oldskb);
+		struct net_device *br_indev;
+
+		br_indev = nf_bridge_get_physindev(oldskb, net);
+		if (!br_indev) {
+			kfree_skb(nskb);
+			return;
+		}
 
 		nskb->dev = br_indev;
 		nskb->protocol = htons(ETH_P_IPV6);
-- 
cgit v1.2.3


From 49e60333d743ae32db3bdde2f93bc818482dd741 Mon Sep 17 00:00:00 2001
From: Bart Van Assche <bvanassche@acm.org>
Date: Wed, 17 Jan 2024 12:36:09 -0800
Subject: blk-mq: Remove the hctx 'run' debugfs attribute

Nobody uses the debugfs hctx 'run' attribute. Hence remove this
attribute and also the code that updates the corresponding member
variable.

Suggested-by: Jens Axboe <axboe@kernel.dk>
Cc: Gabriel Ryan <gabe@cs.columbia.edu>
Signed-off-by: Bart Van Assche <bvanassche@acm.org>
Link: https://lore.kernel.org/r/20240117203609.4122520-1-bvanassche@acm.org
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/blk-mq-debugfs.c | 18 ------------------
 block/blk-mq-sched.c   |  2 --
 include/linux/blk-mq.h |  3 ---
 3 files changed, 23 deletions(-)

(limited to 'include/linux')

diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c
index 5cbeb9344f2f..94668e72ab09 100644
--- a/block/blk-mq-debugfs.c
+++ b/block/blk-mq-debugfs.c
@@ -479,23 +479,6 @@ out:
 	return res;
 }
 
-static int hctx_run_show(void *data, struct seq_file *m)
-{
-	struct blk_mq_hw_ctx *hctx = data;
-
-	seq_printf(m, "%lu\n", hctx->run);
-	return 0;
-}
-
-static ssize_t hctx_run_write(void *data, const char __user *buf, size_t count,
-			      loff_t *ppos)
-{
-	struct blk_mq_hw_ctx *hctx = data;
-
-	hctx->run = 0;
-	return count;
-}
-
 static int hctx_active_show(void *data, struct seq_file *m)
 {
 	struct blk_mq_hw_ctx *hctx = data;
@@ -624,7 +607,6 @@ static const struct blk_mq_debugfs_attr blk_mq_debugfs_hctx_attrs[] = {
 	{"tags_bitmap", 0400, hctx_tags_bitmap_show},
 	{"sched_tags", 0400, hctx_sched_tags_show},
 	{"sched_tags_bitmap", 0400, hctx_sched_tags_bitmap_show},
-	{"run", 0600, hctx_run_show, hctx_run_write},
 	{"active", 0400, hctx_active_show},
 	{"dispatch_busy", 0400, hctx_dispatch_busy_show},
 	{"type", 0400, hctx_type_show},
diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c
index 67c95f31b15b..451a2c1f1f32 100644
--- a/block/blk-mq-sched.c
+++ b/block/blk-mq-sched.c
@@ -324,8 +324,6 @@ void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx)
 	if (unlikely(blk_mq_hctx_stopped(hctx) || blk_queue_quiesced(q)))
 		return;
 
-	hctx->run++;
-
 	/*
 	 * A return of -EAGAIN is an indication that hctx->dispatch is not
 	 * empty and we must run again in order to avoid starving flushes.
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index a676e116085f..7a8150a5f051 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -391,9 +391,6 @@ struct blk_mq_hw_ctx {
 	 */
 	struct blk_mq_tags	*sched_tags;
 
-	/** @run: Number of dispatched requests. */
-	unsigned long		run;
-
 	/** @numa_node: NUMA node the storage adapter has been connected to. */
 	unsigned int		numa_node;
 	/** @queue_num: Index of this hardware queue. */
-- 
cgit v1.2.3


From 66967a32d3b16ed447e76fed4d946bab52e43d86 Mon Sep 17 00:00:00 2001
From: Andrii Nakryiko <andrii@kernel.org>
Date: Wed, 17 Jan 2024 19:31:40 -0800
Subject: bpf: extract bpf_ctx_convert_map logic and make it more reusable

Refactor btf_get_prog_ctx_type() a bit to allow reuse of
bpf_ctx_convert_map logic in more than one places. Simplify interface by
returning btf_type instead of btf_member (field reference in BTF).

To do the above we need to touch and start untangling
btf_translate_to_vmlinux() implementation. We do the bare minimum to
not regress anything for btf_translate_to_vmlinux(), but its
implementation is very questionable for what it claims to be doing.
Mapping kfunc argument types to kernel corresponding types conceptually
is quite different from recognizing program context types. Fixing this
is out of scope for this change though.

Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/r/20240118033143.3384355-3-andrii@kernel.org
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/btf.h |  2 +-
 kernel/bpf/btf.c    | 71 +++++++++++++++++++++++++++++++++--------------------
 2 files changed, 46 insertions(+), 27 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/btf.h b/include/linux/btf.h
index 59d404e22814..cf5c6ff48981 100644
--- a/include/linux/btf.h
+++ b/include/linux/btf.h
@@ -512,7 +512,7 @@ s32 btf_find_dtor_kfunc(struct btf *btf, u32 btf_id);
 int register_btf_id_dtor_kfuncs(const struct btf_id_dtor_kfunc *dtors, u32 add_cnt,
 				struct module *owner);
 struct btf_struct_meta *btf_find_struct_meta(const struct btf *btf, u32 btf_id);
-const struct btf_member *
+const struct btf_type *
 btf_get_prog_ctx_type(struct bpf_verifier_log *log, const struct btf *btf,
 		      const struct btf_type *t, enum bpf_prog_type prog_type,
 		      int arg);
diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
index 51e8b4bee0c8..10ac9efc662d 100644
--- a/kernel/bpf/btf.c
+++ b/kernel/bpf/btf.c
@@ -5615,21 +5615,46 @@ static u8 bpf_ctx_convert_map[] = {
 #undef BPF_MAP_TYPE
 #undef BPF_LINK_TYPE
 
-const struct btf_member *
-btf_get_prog_ctx_type(struct bpf_verifier_log *log, const struct btf *btf,
-		      const struct btf_type *t, enum bpf_prog_type prog_type,
-		      int arg)
+static const struct btf_type *find_canonical_prog_ctx_type(enum bpf_prog_type prog_type)
 {
 	const struct btf_type *conv_struct;
-	const struct btf_type *ctx_struct;
 	const struct btf_member *ctx_type;
-	const char *tname, *ctx_tname;
 
 	conv_struct = bpf_ctx_convert.t;
-	if (!conv_struct) {
-		bpf_log(log, "btf_vmlinux is malformed\n");
+	if (!conv_struct)
 		return NULL;
-	}
+	/* prog_type is valid bpf program type. No need for bounds check. */
+	ctx_type = btf_type_member(conv_struct) + bpf_ctx_convert_map[prog_type] * 2;
+	/* ctx_type is a pointer to prog_ctx_type in vmlinux.
+	 * Like 'struct __sk_buff'
+	 */
+	return btf_type_by_id(btf_vmlinux, ctx_type->type);
+}
+
+static int find_kern_ctx_type_id(enum bpf_prog_type prog_type)
+{
+	const struct btf_type *conv_struct;
+	const struct btf_member *ctx_type;
+
+	conv_struct = bpf_ctx_convert.t;
+	if (!conv_struct)
+		return -EFAULT;
+	/* prog_type is valid bpf program type. No need for bounds check. */
+	ctx_type = btf_type_member(conv_struct) + bpf_ctx_convert_map[prog_type] * 2 + 1;
+	/* ctx_type is a pointer to prog_ctx_type in vmlinux.
+	 * Like 'struct sk_buff'
+	 */
+	return ctx_type->type;
+}
+
+const struct btf_type *
+btf_get_prog_ctx_type(struct bpf_verifier_log *log, const struct btf *btf,
+		      const struct btf_type *t, enum bpf_prog_type prog_type,
+		      int arg)
+{
+	const struct btf_type *ctx_type;
+	const char *tname, *ctx_tname;
+
 	t = btf_type_by_id(btf, t->type);
 	while (btf_type_is_modifier(t))
 		t = btf_type_by_id(btf, t->type);
@@ -5646,17 +5671,15 @@ btf_get_prog_ctx_type(struct bpf_verifier_log *log, const struct btf *btf,
 		bpf_log(log, "arg#%d struct doesn't have a name\n", arg);
 		return NULL;
 	}
-	/* prog_type is valid bpf program type. No need for bounds check. */
-	ctx_type = btf_type_member(conv_struct) + bpf_ctx_convert_map[prog_type] * 2;
-	/* ctx_struct is a pointer to prog_ctx_type in vmlinux.
-	 * Like 'struct __sk_buff'
-	 */
-	ctx_struct = btf_type_by_id(btf_vmlinux, ctx_type->type);
-	if (!ctx_struct)
+
+	ctx_type = find_canonical_prog_ctx_type(prog_type);
+	if (!ctx_type) {
+		bpf_log(log, "btf_vmlinux is malformed\n");
 		/* should not happen */
 		return NULL;
+	}
 again:
-	ctx_tname = btf_name_by_offset(btf_vmlinux, ctx_struct->name_off);
+	ctx_tname = btf_name_by_offset(btf_vmlinux, ctx_type->name_off);
 	if (!ctx_tname) {
 		/* should not happen */
 		bpf_log(log, "Please fix kernel include/linux/bpf_types.h\n");
@@ -5677,10 +5700,10 @@ again:
 		/* bpf_user_pt_regs_t is a typedef, so resolve it to
 		 * underlying struct and check name again
 		 */
-		if (!btf_type_is_modifier(ctx_struct))
+		if (!btf_type_is_modifier(ctx_type))
 			return NULL;
-		while (btf_type_is_modifier(ctx_struct))
-			ctx_struct = btf_type_by_id(btf_vmlinux, ctx_struct->type);
+		while (btf_type_is_modifier(ctx_type))
+			ctx_type = btf_type_by_id(btf_vmlinux, ctx_type->type);
 		goto again;
 	}
 	return ctx_type;
@@ -5692,13 +5715,9 @@ static int btf_translate_to_vmlinux(struct bpf_verifier_log *log,
 				     enum bpf_prog_type prog_type,
 				     int arg)
 {
-	const struct btf_member *prog_ctx_type, *kern_ctx_type;
-
-	prog_ctx_type = btf_get_prog_ctx_type(log, btf, t, prog_type, arg);
-	if (!prog_ctx_type)
+	if (!btf_get_prog_ctx_type(log, btf, t, prog_type, arg))
 		return -ENOENT;
-	kern_ctx_type = prog_ctx_type + 1;
-	return kern_ctx_type->type;
+	return find_kern_ctx_type_id(prog_type);
 }
 
 int get_kern_ctx_btf_id(struct bpf_verifier_log *log, enum bpf_prog_type prog_type)
-- 
cgit v1.2.3


From 7a8e9cdf9405819105ae7405cd91e482bf574b01 Mon Sep 17 00:00:00 2001
From: Nathan Lynch <nathanl@linux.ibm.com>
Date: Tue, 16 Jan 2024 08:09:25 -0600
Subject: seq_buf: Make DECLARE_SEQ_BUF() usable
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Using the address operator on the array doesn't work:

./include/linux/seq_buf.h:27:27: error: initialization of ‘char *’
  from incompatible pointer type ‘char (*)[128]’
  [-Werror=incompatible-pointer-types]
   27 |                 .buffer = &__ ## NAME ## _buffer,       \
      |                           ^

Apart from fixing that, we can improve DECLARE_SEQ_BUF() by using a
compound literal to define the buffer array without attaching a name
to it. This makes the macro a single statement, allowing constructs
such as:

  static DECLARE_SEQ_BUF(my_seq_buf, MYSB_SIZE);

to work as intended.

Link: https://lkml.kernel.org/r/20240116-declare-seq-buf-fix-v1-1-915db4692f32@linux.ibm.com

Cc: stable@vger.kernel.org
Acked-by: Kees Cook <keescook@chromium.org>
Fixes: dcc4e5728eea ("seq_buf: Introduce DECLARE_SEQ_BUF and seq_buf_str()")
Signed-off-by: Nathan Lynch <nathanl@linux.ibm.com>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
 include/linux/seq_buf.h | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/seq_buf.h b/include/linux/seq_buf.h
index 5fb1f12c33f9..c44f4b47b945 100644
--- a/include/linux/seq_buf.h
+++ b/include/linux/seq_buf.h
@@ -22,9 +22,8 @@ struct seq_buf {
 };
 
 #define DECLARE_SEQ_BUF(NAME, SIZE)			\
-	char __ ## NAME ## _buffer[SIZE] = "";		\
 	struct seq_buf NAME = {				\
-		.buffer = &__ ## NAME ## _buffer,	\
+		.buffer = (char[SIZE]) { 0 },		\
 		.size = SIZE,				\
 	}
 
-- 
cgit v1.2.3


From b60db383e2ba64a18e49b6bef3be1ab18aa159f1 Mon Sep 17 00:00:00 2001
From: Heiner Kallweit <hkallweit1@gmail.com>
Date: Thu, 23 Nov 2023 10:40:40 +0100
Subject: include/linux/i2c.h: remove I2C_CLASS_DDC support

After removal of the legacy EEPROM driver and I2C_CLASS_DDC support in
olpc_dcon there's no i2c client driver left supporting I2C_CLASS_DDC.
Class-based device auto-detection is a legacy mechanism and shouldn't
be used in new code. So we can remove this class completely now.

Signed-off-by: Heiner Kallweit <hkallweit1@gmail.com>
Signed-off-by: Wolfram Sang <wsa@kernel.org>
---
 include/linux/i2c.h | 1 -
 1 file changed, 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/i2c.h b/include/linux/i2c.h
index 0dae9db27538..d029aade338f 100644
--- a/include/linux/i2c.h
+++ b/include/linux/i2c.h
@@ -850,7 +850,6 @@ static inline void i2c_mark_adapter_resumed(struct i2c_adapter *adap)
 
 /* i2c adapter classes (bitmask) */
 #define I2C_CLASS_HWMON		(1<<0)	/* lm_sensors, ... */
-#define I2C_CLASS_DDC		(1<<3)	/* DDC bus on graphics adapters */
 #define I2C_CLASS_SPD		(1<<7)	/* Memory modules */
 /* Warn users that the adapter doesn't support classes anymore */
 #define I2C_CLASS_DEPRECATED	(1<<8)
-- 
cgit v1.2.3


From 73febd775bdbdb98c81255ff85773ac410ded5c4 Mon Sep 17 00:00:00 2001
From: Wolfram Sang <wsa+renesas@sang-engineering.com>
Date: Sun, 12 Nov 2023 17:54:41 -0500
Subject: i2c: create debugfs entry per adapter

Two drivers already implement custom debugfs handling for their
i2c_adapter and more will come. So, let the core create a debugfs
directory per adapter and pass that to drivers for their debugfs files.

Signed-off-by: Wolfram Sang <wsa+renesas@sang-engineering.com>
Signed-off-by: Wolfram Sang <wsa@kernel.org>
---
 drivers/i2c/i2c-core-base.c | 11 +++++++++++
 include/linux/i2c.h         |  2 ++
 2 files changed, 13 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/i2c/i2c-core-base.c b/drivers/i2c/i2c-core-base.c
index eac90a3cf61a..f6c828bbd166 100644
--- a/drivers/i2c/i2c-core-base.c
+++ b/drivers/i2c/i2c-core-base.c
@@ -16,6 +16,7 @@
 #include <linux/acpi.h>
 #include <linux/clk/clk-conf.h>
 #include <linux/completion.h>
+#include <linux/debugfs.h>
 #include <linux/delay.h>
 #include <linux/err.h>
 #include <linux/errno.h>
@@ -67,6 +68,8 @@ static int i2c_detect(struct i2c_adapter *adapter, struct i2c_driver *driver);
 static DEFINE_STATIC_KEY_FALSE(i2c_trace_msg_key);
 static bool is_registered;
 
+static struct dentry *i2c_debugfs_root;
+
 int i2c_transfer_trace_reg(void)
 {
 	static_branch_inc(&i2c_trace_msg_key);
@@ -1524,6 +1527,8 @@ static int i2c_register_adapter(struct i2c_adapter *adap)
 		goto out_list;
 	}
 
+	adap->debugfs = debugfs_create_dir(dev_name(&adap->dev), i2c_debugfs_root);
+
 	res = i2c_setup_smbus_alert(adap);
 	if (res)
 		goto out_reg;
@@ -1563,6 +1568,7 @@ static int i2c_register_adapter(struct i2c_adapter *adap)
 	return 0;
 
 out_reg:
+	debugfs_remove_recursive(adap->debugfs);
 	init_completion(&adap->dev_released);
 	device_unregister(&adap->dev);
 	wait_for_completion(&adap->dev_released);
@@ -1764,6 +1770,8 @@ void i2c_del_adapter(struct i2c_adapter *adap)
 
 	i2c_host_notify_irq_teardown(adap);
 
+	debugfs_remove_recursive(adap->debugfs);
+
 	/* wait until all references to the device are gone
 	 *
 	 * FIXME: This is old code and should ideally be replaced by an
@@ -2061,6 +2069,8 @@ static int __init i2c_init(void)
 
 	is_registered = true;
 
+	i2c_debugfs_root = debugfs_create_dir("i2c", NULL);
+
 #ifdef CONFIG_I2C_COMPAT
 	i2c_adapter_compat_class = class_compat_register("i2c-adapter");
 	if (!i2c_adapter_compat_class) {
@@ -2099,6 +2109,7 @@ static void __exit i2c_exit(void)
 #ifdef CONFIG_I2C_COMPAT
 	class_compat_unregister(i2c_adapter_compat_class);
 #endif
+	debugfs_remove_recursive(i2c_debugfs_root);
 	bus_unregister(&i2c_bus_type);
 	tracepoint_synchronize_unregister();
 }
diff --git a/include/linux/i2c.h b/include/linux/i2c.h
index d029aade338f..e01fb1097868 100644
--- a/include/linux/i2c.h
+++ b/include/linux/i2c.h
@@ -746,6 +746,8 @@ struct i2c_adapter {
 
 	struct irq_domain *host_notify_domain;
 	struct regulator *bus_regulator;
+
+	struct dentry *debugfs;
 };
 #define to_i2c_adapter(d) container_of(d, struct i2c_adapter, dev)
 
-- 
cgit v1.2.3


From 94959c0e796e41128483588d133b9a7003b409f9 Mon Sep 17 00:00:00 2001
From: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Date: Tue, 19 Dec 2023 16:22:43 +0100
Subject: i2c: make i2c_bus_type const

Now that the driver core can properly handle constant struct bus_type,
move the i2c_bus_type variable to be a constant structure as well, placing
it into read-only memory which can not be modified at runtime.

Note, the sound/soc/rockchip/rk3399_gru_sound.c also needed tweaking as
it decided to save off a pointer to a bus type for internal stuff, and
it was using the i2c_bus_type as well.

Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Acked-by: Mark Brown <broonie@kernel.org>
Signed-off-by: Wolfram Sang <wsa@kernel.org>
---
 drivers/i2c/i2c-core-base.c           | 2 +-
 include/linux/i2c.h                   | 2 +-
 sound/soc/rockchip/rk3399_gru_sound.c | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/i2c/i2c-core-base.c b/drivers/i2c/i2c-core-base.c
index f6c828bbd166..3bd48d4b6318 100644
--- a/drivers/i2c/i2c-core-base.c
+++ b/drivers/i2c/i2c-core-base.c
@@ -692,7 +692,7 @@ static struct attribute *i2c_dev_attrs[] = {
 };
 ATTRIBUTE_GROUPS(i2c_dev);
 
-struct bus_type i2c_bus_type = {
+const struct bus_type i2c_bus_type = {
 	.name		= "i2c",
 	.match		= i2c_device_match,
 	.probe		= i2c_device_probe,
diff --git a/include/linux/i2c.h b/include/linux/i2c.h
index e01fb1097868..652ecb7abeda 100644
--- a/include/linux/i2c.h
+++ b/include/linux/i2c.h
@@ -23,7 +23,7 @@
 #include <linux/swab.h>		/* for swab16 */
 #include <uapi/linux/i2c.h>
 
-extern struct bus_type i2c_bus_type;
+extern const struct bus_type i2c_bus_type;
 extern struct device_type i2c_adapter_type;
 extern struct device_type i2c_client_type;
 
diff --git a/sound/soc/rockchip/rk3399_gru_sound.c b/sound/soc/rockchip/rk3399_gru_sound.c
index 1a504ebd3a0e..6c89c7331229 100644
--- a/sound/soc/rockchip/rk3399_gru_sound.c
+++ b/sound/soc/rockchip/rk3399_gru_sound.c
@@ -446,7 +446,7 @@ static const struct rockchip_sound_route rockchip_routes[] = {
 
 struct dailink_match_data {
 	const char *compatible;
-	struct bus_type *bus_type;
+	const struct bus_type *bus_type;
 };
 
 static const struct dailink_match_data dailink_match[] = {
-- 
cgit v1.2.3


From a8355235dbd571b32c750ee756dd6dac216d18f2 Mon Sep 17 00:00:00 2001
From: Heiner Kallweit <hkallweit1@gmail.com>
Date: Wed, 8 Nov 2023 07:38:07 +0100
Subject: i2c: mux: reg: Remove class-based device auto-detection support

Legacy class-based device auto-detection shouldn't be used in new code.
Therefore remove support in i2c-mux-reg as long as we don't have a
user of this feature yet.

Link: https://lore.kernel.org/linux-i2c/a22978a4-88e4-46f4-b71c-032b22321599@gmail.com/
Signed-off-by: Heiner Kallweit <hkallweit1@gmail.com>
Signed-off-by: Wolfram Sang <wsa@kernel.org>
---
 drivers/i2c/muxes/i2c-mux-reg.c           | 4 +---
 include/linux/platform_data/i2c-mux-reg.h | 2 --
 2 files changed, 1 insertion(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/i2c/muxes/i2c-mux-reg.c b/drivers/i2c/muxes/i2c-mux-reg.c
index 9efc1ed01577..8489971babd3 100644
--- a/drivers/i2c/muxes/i2c-mux-reg.c
+++ b/drivers/i2c/muxes/i2c-mux-reg.c
@@ -159,7 +159,6 @@ static int i2c_mux_reg_probe(struct platform_device *pdev)
 	struct regmux *mux;
 	struct i2c_adapter *parent;
 	struct resource *res;
-	unsigned int class;
 	int i, ret, nr;
 
 	mux = devm_kzalloc(&pdev->dev, sizeof(*mux), GFP_KERNEL);
@@ -213,9 +212,8 @@ static int i2c_mux_reg_probe(struct platform_device *pdev)
 
 	for (i = 0; i < mux->data.n_values; i++) {
 		nr = mux->data.base_nr ? (mux->data.base_nr + i) : 0;
-		class = mux->data.classes ? mux->data.classes[i] : 0;
 
-		ret = i2c_mux_add_adapter(muxc, nr, mux->data.values[i], class);
+		ret = i2c_mux_add_adapter(muxc, nr, mux->data.values[i], 0);
 		if (ret)
 			goto err_del_mux_adapters;
 	}
diff --git a/include/linux/platform_data/i2c-mux-reg.h b/include/linux/platform_data/i2c-mux-reg.h
index 2543c2a1c9ae..e2e895768311 100644
--- a/include/linux/platform_data/i2c-mux-reg.h
+++ b/include/linux/platform_data/i2c-mux-reg.h
@@ -17,7 +17,6 @@
  * @n_values: Number of multiplexer channels
  * @little_endian: Indicating if the register is in little endian
  * @write_only: Reading the register is not allowed by hardware
- * @classes: Optional I2C auto-detection classes
  * @idle: Value to write to mux when idle
  * @idle_in_use: indicate if idle value is in use
  * @reg: Virtual address of the register to switch channel
@@ -30,7 +29,6 @@ struct i2c_mux_reg_platform_data {
 	int n_values;
 	bool little_endian;
 	bool write_only;
-	const unsigned int *classes;
 	u32 idle;
 	bool idle_in_use;
 	void __iomem *reg;
-- 
cgit v1.2.3


From ef175b29a242fea98f467f008237484b03c94834 Mon Sep 17 00:00:00 2001
From: Rob Herring <robh@kernel.org>
Date: Fri, 15 Jan 2021 15:24:59 -0600
Subject: of: Stop circularly including of_device.h and of_platform.h

The DT of_device.h and of_platform.h headers date back to the separate
of_platform_bus_type before it was merged into the regular platform bus.
As part of that merge prepping Arm DT support 13 years ago, they
"temporarily" include each other. The headers also include
platform_device.h and of.h. The result was lots of drivers relied on
these implicit includes.

Now the entire tree has been fixed over the last couple of cycles to
explicitly include the necessary headers instead of relying on
of_device.h and/or of_platform.h implicit includes, so the implicit and
circular includes can finally be removed.

Signed-off-by: Rob Herring <robh@kernel.org>
---
 include/linux/of_device.h   | 5 +----
 include/linux/of_platform.h | 4 ++--
 2 files changed, 3 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/of_device.h b/include/linux/of_device.h
index a72661e47faa..9042bca5bb84 100644
--- a/include/linux/of_device.h
+++ b/include/linux/of_device.h
@@ -2,10 +2,7 @@
 #ifndef _LINUX_OF_DEVICE_H
 #define _LINUX_OF_DEVICE_H
 
-#include <linux/platform_device.h>
-#include <linux/of_platform.h> /* temporary until merge */
-
-#include <linux/of.h>
+#include <linux/device/driver.h>
 
 struct device;
 struct of_device_id;
diff --git a/include/linux/of_platform.h b/include/linux/of_platform.h
index fadfea575485..a2ff1ad48f7f 100644
--- a/include/linux/of_platform.h
+++ b/include/linux/of_platform.h
@@ -7,11 +7,11 @@
  */
 
 #include <linux/mod_devicetable.h>
-#include <linux/of_device.h>
-#include <linux/platform_device.h>
 
 struct device;
+struct device_node;
 struct of_device_id;
+struct platform_device;
 
 /**
  * struct of_dev_auxdata - lookup table entry for device names & platform_data
-- 
cgit v1.2.3


From d26270061ae66b915138af7cd73ca6f8b85e6b44 Mon Sep 17 00:00:00 2001
From: Kees Cook <keescook@chromium.org>
Date: Thu, 18 Jan 2024 12:31:55 -0800
Subject: string: Remove strlcpy()

With all the users of strlcpy() removed[1] from the kernel, remove the
API, self-tests, and other references. Leave mentions in Documentation
(about its deprecation), and in checkpatch.pl (to help migrate host-only
tools/ usage). Long live strscpy().

Link: https://github.com/KSPP/linux/issues/89 [1]
Cc: Azeem Shaikh <azeemshaikh38@gmail.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Andy Whitcroft <apw@canonical.com>
Cc: Joe Perches <joe@perches.com>
Cc: Dwaipayan Ray <dwaipayanray1@gmail.com>
Cc: Lukas Bulwahn <lukas.bulwahn@gmail.com>
Cc: linux-hardening@vger.kernel.org
Reviewed-by: Andy Shevchenko <andy.shevchenko@gmail.com>
Signed-off-by: Kees Cook <keescook@chromium.org>
---
 include/linux/fortify-string.h                | 51 ---------------------------
 include/linux/string.h                        |  3 --
 lib/nlattr.c                                  |  2 +-
 lib/string.c                                  | 15 --------
 lib/test_fortify/write_overflow-strlcpy-src.c |  5 ---
 lib/test_fortify/write_overflow-strlcpy.c     |  5 ---
 6 files changed, 1 insertion(+), 80 deletions(-)
 delete mode 100644 lib/test_fortify/write_overflow-strlcpy-src.c
 delete mode 100644 lib/test_fortify/write_overflow-strlcpy.c

(limited to 'include/linux')

diff --git a/include/linux/fortify-string.h b/include/linux/fortify-string.h
index 79ef6ac4c021..89a6888f2f9e 100644
--- a/include/linux/fortify-string.h
+++ b/include/linux/fortify-string.h
@@ -214,51 +214,6 @@ __kernel_size_t __fortify_strlen(const char * const POS p)
 	return ret;
 }
 
-/* Defined after fortified strlen() to reuse it. */
-extern size_t __real_strlcpy(char *, const char *, size_t) __RENAME(strlcpy);
-/**
- * strlcpy - Copy a string into another string buffer
- *
- * @p: pointer to destination of copy
- * @q: pointer to NUL-terminated source string to copy
- * @size: maximum number of bytes to write at @p
- *
- * If strlen(@q) >= @size, the copy of @q will be truncated at
- * @size - 1 bytes. @p will always be NUL-terminated.
- *
- * Do not use this function. While FORTIFY_SOURCE tries to avoid
- * over-reads when calculating strlen(@q), it is still possible.
- * Prefer strscpy(), though note its different return values for
- * detecting truncation.
- *
- * Returns total number of bytes written to @p, including terminating NUL.
- *
- */
-__FORTIFY_INLINE size_t strlcpy(char * const POS p, const char * const POS q, size_t size)
-{
-	const size_t p_size = __member_size(p);
-	const size_t q_size = __member_size(q);
-	size_t q_len;	/* Full count of source string length. */
-	size_t len;	/* Count of characters going into destination. */
-
-	if (p_size == SIZE_MAX && q_size == SIZE_MAX)
-		return __real_strlcpy(p, q, size);
-	q_len = strlen(q);
-	len = (q_len >= size) ? size - 1 : q_len;
-	if (__builtin_constant_p(size) && __builtin_constant_p(q_len) && size) {
-		/* Write size is always larger than destination. */
-		if (len >= p_size)
-			__write_overflow();
-	}
-	if (size) {
-		if (len >= p_size)
-			fortify_panic(__func__);
-		__underlying_memcpy(p, q, len);
-		p[len] = '\0';
-	}
-	return q_len;
-}
-
 /* Defined after fortified strnlen() to reuse it. */
 extern ssize_t __real_strscpy(char *, const char *, size_t) __RENAME(strscpy);
 /**
@@ -272,12 +227,6 @@ extern ssize_t __real_strscpy(char *, const char *, size_t) __RENAME(strscpy);
  * @p buffer. The behavior is undefined if the string buffers overlap. The
  * destination @p buffer is always NUL terminated, unless it's zero-sized.
  *
- * Preferred to strlcpy() since the API doesn't require reading memory
- * from the source @q string beyond the specified @size bytes, and since
- * the return value is easier to error-check than strlcpy()'s.
- * In addition, the implementation is robust to the string changing out
- * from underneath it, unlike the current strlcpy() implementation.
- *
  * Preferred to strncpy() since it always returns a valid string, and
  * doesn't unnecessarily force the tail of the destination buffer to be
  * zero padded. If padding is desired please use strscpy_pad().
diff --git a/include/linux/string.h b/include/linux/string.h
index ce137830a0b9..ab148d8dbfc1 100644
--- a/include/linux/string.h
+++ b/include/linux/string.h
@@ -66,9 +66,6 @@ extern char * strcpy(char *,const char *);
 #ifndef __HAVE_ARCH_STRNCPY
 extern char * strncpy(char *,const char *, __kernel_size_t);
 #endif
-#ifndef __HAVE_ARCH_STRLCPY
-size_t strlcpy(char *, const char *, size_t);
-#endif
 #ifndef __HAVE_ARCH_STRSCPY
 ssize_t strscpy(char *, const char *, size_t);
 #endif
diff --git a/lib/nlattr.c b/lib/nlattr.c
index dc15e7888fc1..ed2ab43e1b22 100644
--- a/lib/nlattr.c
+++ b/lib/nlattr.c
@@ -758,7 +758,7 @@ EXPORT_SYMBOL(nla_find);
  * @dstsize: Size of destination buffer.
  *
  * Copies at most dstsize - 1 bytes into the destination buffer.
- * Unlike strlcpy the destination buffer is always padded out.
+ * Unlike strscpy() the destination buffer is always padded out.
  *
  * Return:
  * * srclen - Returns @nla length (not including the trailing %NUL).
diff --git a/lib/string.c b/lib/string.c
index be26623953d2..6891d15ce991 100644
--- a/lib/string.c
+++ b/lib/string.c
@@ -103,21 +103,6 @@ char *strncpy(char *dest, const char *src, size_t count)
 EXPORT_SYMBOL(strncpy);
 #endif
 
-#ifndef __HAVE_ARCH_STRLCPY
-size_t strlcpy(char *dest, const char *src, size_t size)
-{
-	size_t ret = strlen(src);
-
-	if (size) {
-		size_t len = (ret >= size) ? size - 1 : ret;
-		__builtin_memcpy(dest, src, len);
-		dest[len] = '\0';
-	}
-	return ret;
-}
-EXPORT_SYMBOL(strlcpy);
-#endif
-
 #ifndef __HAVE_ARCH_STRSCPY
 ssize_t strscpy(char *dest, const char *src, size_t count)
 {
diff --git a/lib/test_fortify/write_overflow-strlcpy-src.c b/lib/test_fortify/write_overflow-strlcpy-src.c
deleted file mode 100644
index 91bf83ebd34a..000000000000
--- a/lib/test_fortify/write_overflow-strlcpy-src.c
+++ /dev/null
@@ -1,5 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-#define TEST	\
-	strlcpy(small, large_src, sizeof(small) + 1)
-
-#include "test_fortify.h"
diff --git a/lib/test_fortify/write_overflow-strlcpy.c b/lib/test_fortify/write_overflow-strlcpy.c
deleted file mode 100644
index 1883db7c0cd6..000000000000
--- a/lib/test_fortify/write_overflow-strlcpy.c
+++ /dev/null
@@ -1,5 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-#define TEST	\
-	strlcpy(instance.buf, large_src, sizeof(instance.buf) + 1)
-
-#include "test_fortify.h"
-- 
cgit v1.2.3


From a54d51fb2dfb846aedf3751af501e9688db447f5 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Thu, 18 Jan 2024 20:17:49 +0000
Subject: udp: fix busy polling

Generic sk_busy_loop_end() only looks at sk->sk_receive_queue
for presence of packets.

Problem is that for UDP sockets after blamed commit, some packets
could be present in another queue: udp_sk(sk)->reader_queue

In some cases, a busy poller could spin until timeout expiration,
even if some packets are available in udp_sk(sk)->reader_queue.

v3: - make sk_busy_loop_end() nicer (Willem)

v2: - add a READ_ONCE(sk->sk_family) in sk_is_inet() to avoid KCSAN splats.
    - add a sk_is_inet() check in sk_is_udp() (Willem feedback)
    - add a sk_is_inet() check in sk_is_tcp().

Fixes: 2276f58ac589 ("udp: use a separate rx queue for packet reception")
Signed-off-by: Eric Dumazet <edumazet@google.com>
Reviewed-by: Paolo Abeni <pabeni@redhat.com>
Reviewed-by: Willem de Bruijn <willemb@google.com>
Reviewed-by: Kuniyuki Iwashima <kuniyu@amazon.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/skmsg.h   |  6 ------
 include/net/inet_sock.h |  5 -----
 include/net/sock.h      | 18 +++++++++++++++++-
 net/core/sock.c         | 11 +++++++++--
 4 files changed, 26 insertions(+), 14 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/skmsg.h b/include/linux/skmsg.h
index 888a4b217829..e65ec3fd2799 100644
--- a/include/linux/skmsg.h
+++ b/include/linux/skmsg.h
@@ -505,12 +505,6 @@ static inline bool sk_psock_strp_enabled(struct sk_psock *psock)
 	return !!psock->saved_data_ready;
 }
 
-static inline bool sk_is_udp(const struct sock *sk)
-{
-	return sk->sk_type == SOCK_DGRAM &&
-	       sk->sk_protocol == IPPROTO_UDP;
-}
-
 #if IS_ENABLED(CONFIG_NET_SOCK_MSG)
 
 #define BPF_F_STRPARSER	(1UL << 1)
diff --git a/include/net/inet_sock.h b/include/net/inet_sock.h
index aa86453f6b9b..d94c242eb3ed 100644
--- a/include/net/inet_sock.h
+++ b/include/net/inet_sock.h
@@ -307,11 +307,6 @@ static inline unsigned long inet_cmsg_flags(const struct inet_sock *inet)
 #define inet_assign_bit(nr, sk, val)		\
 	assign_bit(INET_FLAGS_##nr, &inet_sk(sk)->inet_flags, val)
 
-static inline bool sk_is_inet(struct sock *sk)
-{
-	return sk->sk_family == AF_INET || sk->sk_family == AF_INET6;
-}
-
 /**
  * sk_to_full_sk - Access to a full socket
  * @sk: pointer to a socket
diff --git a/include/net/sock.h b/include/net/sock.h
index a7f815c7cfdf..54ca8dcbfb43 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -2765,9 +2765,25 @@ static inline void skb_setup_tx_timestamp(struct sk_buff *skb, __u16 tsflags)
 			   &skb_shinfo(skb)->tskey);
 }
 
+static inline bool sk_is_inet(const struct sock *sk)
+{
+	int family = READ_ONCE(sk->sk_family);
+
+	return family == AF_INET || family == AF_INET6;
+}
+
 static inline bool sk_is_tcp(const struct sock *sk)
 {
-	return sk->sk_type == SOCK_STREAM && sk->sk_protocol == IPPROTO_TCP;
+	return sk_is_inet(sk) &&
+	       sk->sk_type == SOCK_STREAM &&
+	       sk->sk_protocol == IPPROTO_TCP;
+}
+
+static inline bool sk_is_udp(const struct sock *sk)
+{
+	return sk_is_inet(sk) &&
+	       sk->sk_type == SOCK_DGRAM &&
+	       sk->sk_protocol == IPPROTO_UDP;
 }
 
 static inline bool sk_is_stream_unix(const struct sock *sk)
diff --git a/net/core/sock.c b/net/core/sock.c
index 158dbdebce6a..0a7f46c37f0c 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -107,6 +107,7 @@
 #include <linux/interrupt.h>
 #include <linux/poll.h>
 #include <linux/tcp.h>
+#include <linux/udp.h>
 #include <linux/init.h>
 #include <linux/highmem.h>
 #include <linux/user_namespace.h>
@@ -4144,8 +4145,14 @@ bool sk_busy_loop_end(void *p, unsigned long start_time)
 {
 	struct sock *sk = p;
 
-	return !skb_queue_empty_lockless(&sk->sk_receive_queue) ||
-	       sk_busy_loop_timeout(sk, start_time);
+	if (!skb_queue_empty_lockless(&sk->sk_receive_queue))
+		return true;
+
+	if (sk_is_udp(sk) &&
+	    !skb_queue_empty_lockless(&udp_sk(sk)->reader_queue))
+		return true;
+
+	return sk_busy_loop_timeout(sk, start_time);
 }
 EXPORT_SYMBOL(sk_busy_loop_end);
 #endif /* CONFIG_NET_RX_BUSY_POLL */
-- 
cgit v1.2.3


From 73ae7e1c7644a8c33ba526302a10267cdbc249f8 Mon Sep 17 00:00:00 2001
From: Niklas Cassel <cassel@kernel.org>
Date: Thu, 11 Jan 2024 17:57:44 +0100
Subject: ata: libata-sata: improve sysfs description for ATA_LPM_UNKNOWN

Currently, both ATA_LPM_UNKNOWN (0) and ATA_LPM_MAX_POWER (1) displays
as "max_performance" in sysfs.

This is quite misleading as they are not the same.

For ATA_LPM_UNKNOWN, ata_eh_set_lpm() will not be called at all,
leaving the configuration in unknown state.
For ATA_LPM_MAX_POWER, ata_eh_set_lpm() is called, and setting the
policy to ATA_LPM_MAX_POWER.

This also matches the description of the SATA_MOBILE_LPM_POLICY Kconfig:
0 => Keep firmware settings
1 => Maximum performance

Thus, update the sysfs description for ATA_LPM_UNKNOWN to match reality.

While at it, update libata.h to mention that the ascii descriptions
are in libata-sata.c and not in libata-scsi.c.

Reviewed-by: Damien Le Moal <dlemoal@kernel.org>
Signed-off-by: Niklas Cassel <cassel@kernel.org>
---
 drivers/ata/libata-sata.c | 2 +-
 include/linux/libata.h    | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/ata/libata-sata.c b/drivers/ata/libata-sata.c
index b6656c287175..0fb1934875f2 100644
--- a/drivers/ata/libata-sata.c
+++ b/drivers/ata/libata-sata.c
@@ -784,7 +784,7 @@ bool sata_lpm_ignore_phy_events(struct ata_link *link)
 EXPORT_SYMBOL_GPL(sata_lpm_ignore_phy_events);
 
 static const char *ata_lpm_policy_names[] = {
-	[ATA_LPM_UNKNOWN]		= "max_performance",
+	[ATA_LPM_UNKNOWN]		= "keep_firmware_settings",
 	[ATA_LPM_MAX_POWER]		= "max_performance",
 	[ATA_LPM_MED_POWER]		= "medium_power",
 	[ATA_LPM_MED_POWER_WITH_DIPM]	= "med_power_with_dipm",
diff --git a/include/linux/libata.h b/include/linux/libata.h
index 1dbb14daccfa..26d68115afb8 100644
--- a/include/linux/libata.h
+++ b/include/linux/libata.h
@@ -471,7 +471,7 @@ enum ata_completion_errors {
 
 /*
  * Link power management policy: If you alter this, you also need to
- * alter libata-scsi.c (for the ascii descriptions)
+ * alter libata-sata.c (for the ascii descriptions)
  */
 enum ata_lpm_policy {
 	ATA_LPM_UNKNOWN,
-- 
cgit v1.2.3


From 41353fbad4f551e82c2792f7e82ac225c79cc710 Mon Sep 17 00:00:00 2001
From: Guixin Liu <kanie@linux.alibaba.com>
Date: Thu, 18 Jan 2024 20:51:45 +0800
Subject: nvmet: unify aer type enum

The host and target use two definition of aer type, unify
them into a single one.

Signed-off-by: Guixin Liu <kanie@linux.alibaba.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Sagi Grimberg <sagi@grimberg.me>
Signed-off-by: Keith Busch <kbusch@kernel.org>
---
 drivers/nvme/target/core.c      | 4 ++--
 drivers/nvme/target/discovery.c | 2 +-
 include/linux/nvme.h            | 6 ------
 3 files changed, 3 insertions(+), 9 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/nvme/target/core.c b/drivers/nvme/target/core.c
index d26aa30f8702..fa35e65915f0 100644
--- a/drivers/nvme/target/core.c
+++ b/drivers/nvme/target/core.c
@@ -248,7 +248,7 @@ void nvmet_ns_changed(struct nvmet_subsys *subsys, u32 nsid)
 		nvmet_add_to_changed_ns_log(ctrl, cpu_to_le32(nsid));
 		if (nvmet_aen_bit_disabled(ctrl, NVME_AEN_BIT_NS_ATTR))
 			continue;
-		nvmet_add_async_event(ctrl, NVME_AER_TYPE_NOTICE,
+		nvmet_add_async_event(ctrl, NVME_AER_NOTICE,
 				NVME_AER_NOTICE_NS_CHANGED,
 				NVME_LOG_CHANGED_NS);
 	}
@@ -265,7 +265,7 @@ void nvmet_send_ana_event(struct nvmet_subsys *subsys,
 			continue;
 		if (nvmet_aen_bit_disabled(ctrl, NVME_AEN_BIT_ANA_CHANGE))
 			continue;
-		nvmet_add_async_event(ctrl, NVME_AER_TYPE_NOTICE,
+		nvmet_add_async_event(ctrl, NVME_AER_NOTICE,
 				NVME_AER_NOTICE_ANA, NVME_LOG_ANA);
 	}
 	mutex_unlock(&subsys->lock);
diff --git a/drivers/nvme/target/discovery.c b/drivers/nvme/target/discovery.c
index 668d257fa986..68e82ccc0e4e 100644
--- a/drivers/nvme/target/discovery.c
+++ b/drivers/nvme/target/discovery.c
@@ -21,7 +21,7 @@ static void __nvmet_disc_changed(struct nvmet_port *port,
 	if (nvmet_aen_bit_disabled(ctrl, NVME_AEN_BIT_DISC_CHANGE))
 		return;
 
-	nvmet_add_async_event(ctrl, NVME_AER_TYPE_NOTICE,
+	nvmet_add_async_event(ctrl, NVME_AER_NOTICE,
 			      NVME_AER_NOTICE_DISC_CHANGED, NVME_LOG_DISC);
 }
 
diff --git a/include/linux/nvme.h b/include/linux/nvme.h
index 462c21e0e417..68eff8c86ce3 100644
--- a/include/linux/nvme.h
+++ b/include/linux/nvme.h
@@ -816,12 +816,6 @@ struct nvme_reservation_status_ext {
 	struct nvme_registered_ctrl_ext regctl_eds[];
 };
 
-enum nvme_async_event_type {
-	NVME_AER_TYPE_ERROR	= 0,
-	NVME_AER_TYPE_SMART	= 1,
-	NVME_AER_TYPE_NOTICE	= 2,
-};
-
 /* I/O commands */
 
 enum nvme_opcode {
-- 
cgit v1.2.3


From 25461ce8b3d28528f2c55f5e737e99d2906eda83 Mon Sep 17 00:00:00 2001
From: Saeed Mahameed <saeedm@nvidia.com>
Date: Fri, 15 Dec 2023 19:31:14 -0800
Subject: net/mlx5e: Use the correct lag ports number when creating TISes

The cited commit moved the code of mlx5e_create_tises() and changed the
loop to create TISes over MLX5_MAX_PORTS constant value, instead of
getting the correct lag ports supported by the device, which can cause
FW errors on devices with less than MLX5_MAX_PORTS ports.

Change that back to mlx5e_get_num_lag_ports(mdev).

Also IPoIB interfaces create there own TISes, they don't use the eth
TISes, pass a flag to indicate that.

This fixes the following errors that might appear in kernel log:
mlx5_cmd_out_err:808:(pid 650): CREATE_TIS(0x912) op_mod(0x0) failed, status bad parameter(0x3), syndrome (0x595b5d), err(-22)
mlx5e_create_mdev_resources:174:(pid 650): alloc tises failed, -22

Fixes: b25bd37c859f ("net/mlx5: Move TISes from priv to mdev HW resources")
Signed-off-by: Saeed Mahameed <saeedm@nvidia.com>
---
 drivers/net/ethernet/mellanox/mlx5/core/en.h        |  2 +-
 drivers/net/ethernet/mellanox/mlx5/core/en_common.c | 21 +++++++++++++--------
 drivers/net/ethernet/mellanox/mlx5/core/en_main.c   |  2 +-
 .../net/ethernet/mellanox/mlx5/core/ipoib/ipoib.c   |  2 +-
 include/linux/mlx5/driver.h                         |  1 +
 5 files changed, 17 insertions(+), 11 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en.h b/drivers/net/ethernet/mellanox/mlx5/core/en.h
index 0bfe1ca8a364..55c6ace0acd5 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en.h
@@ -1124,7 +1124,7 @@ static inline bool mlx5_tx_swp_supported(struct mlx5_core_dev *mdev)
 extern const struct ethtool_ops mlx5e_ethtool_ops;
 
 int mlx5e_create_mkey(struct mlx5_core_dev *mdev, u32 pdn, u32 *mkey);
-int mlx5e_create_mdev_resources(struct mlx5_core_dev *mdev);
+int mlx5e_create_mdev_resources(struct mlx5_core_dev *mdev, bool create_tises);
 void mlx5e_destroy_mdev_resources(struct mlx5_core_dev *mdev);
 int mlx5e_refresh_tirs(struct mlx5e_priv *priv, bool enable_uc_lb,
 		       bool enable_mc_lb);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_common.c b/drivers/net/ethernet/mellanox/mlx5/core/en_common.c
index 67f546683e85..6ed3a32b7e22 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_common.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_common.c
@@ -95,7 +95,7 @@ static void mlx5e_destroy_tises(struct mlx5_core_dev *mdev, u32 tisn[MLX5_MAX_PO
 {
 	int tc, i;
 
-	for (i = 0; i < MLX5_MAX_PORTS; i++)
+	for (i = 0; i < mlx5e_get_num_lag_ports(mdev); i++)
 		for (tc = 0; tc < MLX5_MAX_NUM_TC; tc++)
 			mlx5e_destroy_tis(mdev, tisn[i][tc]);
 }
@@ -110,7 +110,7 @@ static int mlx5e_create_tises(struct mlx5_core_dev *mdev, u32 tisn[MLX5_MAX_PORT
 	int tc, i;
 	int err;
 
-	for (i = 0; i < MLX5_MAX_PORTS; i++) {
+	for (i = 0; i < mlx5e_get_num_lag_ports(mdev); i++) {
 		for (tc = 0; tc < MLX5_MAX_NUM_TC; tc++) {
 			u32 in[MLX5_ST_SZ_DW(create_tis_in)] = {};
 			void *tisc;
@@ -140,7 +140,7 @@ err_close_tises:
 	return err;
 }
 
-int mlx5e_create_mdev_resources(struct mlx5_core_dev *mdev)
+int mlx5e_create_mdev_resources(struct mlx5_core_dev *mdev, bool create_tises)
 {
 	struct mlx5e_hw_objs *res = &mdev->mlx5e_res.hw_objs;
 	int err;
@@ -169,11 +169,15 @@ int mlx5e_create_mdev_resources(struct mlx5_core_dev *mdev)
 		goto err_destroy_mkey;
 	}
 
-	err = mlx5e_create_tises(mdev, res->tisn);
-	if (err) {
-		mlx5_core_err(mdev, "alloc tises failed, %d\n", err);
-		goto err_destroy_bfreg;
+	if (create_tises) {
+		err = mlx5e_create_tises(mdev, res->tisn);
+		if (err) {
+			mlx5_core_err(mdev, "alloc tises failed, %d\n", err);
+			goto err_destroy_bfreg;
+		}
+		res->tisn_valid = true;
 	}
+
 	INIT_LIST_HEAD(&res->td.tirs_list);
 	mutex_init(&res->td.list_lock);
 
@@ -203,7 +207,8 @@ void mlx5e_destroy_mdev_resources(struct mlx5_core_dev *mdev)
 
 	mlx5_crypto_dek_cleanup(mdev->mlx5e_res.dek_priv);
 	mdev->mlx5e_res.dek_priv = NULL;
-	mlx5e_destroy_tises(mdev, res->tisn);
+	if (res->tisn_valid)
+		mlx5e_destroy_tises(mdev, res->tisn);
 	mlx5_free_bfreg(mdev, &res->bfreg);
 	mlx5_core_destroy_mkey(mdev, res->mkey);
 	mlx5_core_dealloc_transport_domain(mdev, res->td.tdn);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
index b5f1c4ca38ba..c8e8f512803e 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
@@ -5992,7 +5992,7 @@ static int mlx5e_resume(struct auxiliary_device *adev)
 	if (netif_device_present(netdev))
 		return 0;
 
-	err = mlx5e_create_mdev_resources(mdev);
+	err = mlx5e_create_mdev_resources(mdev, true);
 	if (err)
 		return err;
 
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/ipoib/ipoib.c b/drivers/net/ethernet/mellanox/mlx5/core/ipoib/ipoib.c
index 58845121954c..d77be1b4dd9c 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/ipoib/ipoib.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/ipoib/ipoib.c
@@ -783,7 +783,7 @@ static int mlx5_rdma_setup_rn(struct ib_device *ibdev, u32 port_num,
 		}
 
 		/* This should only be called once per mdev */
-		err = mlx5e_create_mdev_resources(mdev);
+		err = mlx5e_create_mdev_resources(mdev, false);
 		if (err)
 			goto destroy_ht;
 	}
diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h
index 8c55ff351e5f..41f03b352401 100644
--- a/include/linux/mlx5/driver.h
+++ b/include/linux/mlx5/driver.h
@@ -681,6 +681,7 @@ struct mlx5e_resources {
 		struct mlx5_sq_bfreg       bfreg;
 #define MLX5_MAX_NUM_TC 8
 		u32                        tisn[MLX5_MAX_PORTS][MLX5_MAX_NUM_TC];
+		bool			   tisn_valid;
 	} hw_objs;
 	struct net_device *uplink_netdev;
 	struct mutex uplink_netdev_lock;
-- 
cgit v1.2.3


From cfbc3608a8c69b48bf238bd68f768192f0238e0d Mon Sep 17 00:00:00 2001
From: Tariq Toukan <tariqt@nvidia.com>
Date: Tue, 19 Dec 2023 14:46:20 +0200
Subject: net/mlx5: Fix query of sd_group field

The sd_group field moved in the HW spec from the MPIR register
to the vport context.
Align the query accordingly.

Fixes: f5e956329960 ("net/mlx5: Expose Management PCIe Index Register (MPIR)")
Signed-off-by: Tariq Toukan <tariqt@nvidia.com>
Signed-off-by: Saeed Mahameed <saeedm@nvidia.com>
---
 drivers/net/ethernet/mellanox/mlx5/core/vport.c | 21 +++++++++++++++++++++
 include/linux/mlx5/mlx5_ifc.h                   | 10 +++++++---
 include/linux/mlx5/vport.h                      |  1 +
 3 files changed, 29 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/vport.c b/drivers/net/ethernet/mellanox/mlx5/core/vport.c
index 21753f327868..1005bb6935b6 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/vport.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/vport.c
@@ -440,6 +440,27 @@ out:
 }
 EXPORT_SYMBOL_GPL(mlx5_query_nic_vport_system_image_guid);
 
+int mlx5_query_nic_vport_sd_group(struct mlx5_core_dev *mdev, u8 *sd_group)
+{
+	int outlen = MLX5_ST_SZ_BYTES(query_nic_vport_context_out);
+	u32 *out;
+	int err;
+
+	out = kvzalloc(outlen, GFP_KERNEL);
+	if (!out)
+		return -ENOMEM;
+
+	err = mlx5_query_nic_vport_context(mdev, 0, out);
+	if (err)
+		goto out;
+
+	*sd_group = MLX5_GET(query_nic_vport_context_out, out,
+			     nic_vport_context.sd_group);
+out:
+	kvfree(out);
+	return err;
+}
+
 int mlx5_query_nic_vport_node_guid(struct mlx5_core_dev *mdev, u64 *node_guid)
 {
 	u32 *out;
diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h
index bf5320b28b8b..37230253f9f1 100644
--- a/include/linux/mlx5/mlx5_ifc.h
+++ b/include/linux/mlx5/mlx5_ifc.h
@@ -4036,8 +4036,13 @@ struct mlx5_ifc_nic_vport_context_bits {
 	u8	   affiliation_criteria[0x4];
 	u8	   affiliated_vhca_id[0x10];
 
-	u8	   reserved_at_60[0xd0];
+	u8	   reserved_at_60[0xa0];
 
+	u8	   reserved_at_100[0x1];
+	u8         sd_group[0x3];
+	u8	   reserved_at_104[0x1c];
+
+	u8	   reserved_at_120[0x10];
 	u8         mtu[0x10];
 
 	u8         system_image_guid[0x40];
@@ -10122,8 +10127,7 @@ struct mlx5_ifc_mpir_reg_bits {
 	u8         reserved_at_20[0x20];
 
 	u8         local_port[0x8];
-	u8         reserved_at_28[0x15];
-	u8         sd_group[0x3];
+	u8         reserved_at_28[0x18];
 
 	u8         reserved_at_60[0x20];
 };
diff --git a/include/linux/mlx5/vport.h b/include/linux/mlx5/vport.h
index fbb9bf447889..c36cc6d82926 100644
--- a/include/linux/mlx5/vport.h
+++ b/include/linux/mlx5/vport.h
@@ -72,6 +72,7 @@ int mlx5_query_nic_vport_mtu(struct mlx5_core_dev *mdev, u16 *mtu);
 int mlx5_modify_nic_vport_mtu(struct mlx5_core_dev *mdev, u16 mtu);
 int mlx5_query_nic_vport_system_image_guid(struct mlx5_core_dev *mdev,
 					   u64 *system_image_guid);
+int mlx5_query_nic_vport_sd_group(struct mlx5_core_dev *mdev, u8 *sd_group);
 int mlx5_query_nic_vport_node_guid(struct mlx5_core_dev *mdev, u64 *node_guid);
 int mlx5_modify_nic_vport_node_guid(struct mlx5_core_dev *mdev,
 				    u16 vport, u64 node_guid);
-- 
cgit v1.2.3


From ec7cc38ef9f83553102e84c82536971a81630739 Mon Sep 17 00:00:00 2001
From: Moshe Shemesh <moshe@nvidia.com>
Date: Sat, 30 Dec 2023 22:40:37 +0200
Subject: net/mlx5: Bridge, fix multicast packets sent to uplink

To enable multicast packets which are offloaded in bridge multicast
offload mode to be sent also to uplink, FTE bit uplink_hairpin_en should
be set. Add this bit to FTE for the bridge multicast offload rules.

Fixes: 18c2916cee12 ("net/mlx5: Bridge, snoop igmp/mld packets")
Signed-off-by: Moshe Shemesh <moshe@nvidia.com>
Reviewed-by: Gal Pressman <gal@nvidia.com>
Signed-off-by: Saeed Mahameed <saeedm@nvidia.com>
---
 drivers/net/ethernet/mellanox/mlx5/core/esw/bridge_mcast.c | 3 +++
 drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.c           | 2 ++
 include/linux/mlx5/fs.h                                    | 1 +
 include/linux/mlx5/mlx5_ifc.h                              | 2 +-
 4 files changed, 7 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/esw/bridge_mcast.c b/drivers/net/ethernet/mellanox/mlx5/core/esw/bridge_mcast.c
index a7ed87e9d842..22dd30cf8033 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/esw/bridge_mcast.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/esw/bridge_mcast.c
@@ -83,6 +83,7 @@ mlx5_esw_bridge_mdb_flow_create(u16 esw_owner_vhca_id, struct mlx5_esw_bridge_md
 		i++;
 	}
 
+	rule_spec->flow_context.flags |= FLOW_CONTEXT_UPLINK_HAIRPIN_EN;
 	rule_spec->match_criteria_enable = MLX5_MATCH_OUTER_HEADERS;
 	dmac_v = MLX5_ADDR_OF(fte_match_param, rule_spec->match_value, outer_headers.dmac_47_16);
 	ether_addr_copy(dmac_v, entry->key.addr);
@@ -587,6 +588,7 @@ mlx5_esw_bridge_mcast_vlan_flow_create(u16 vlan_proto, struct mlx5_esw_bridge_po
 	if (!rule_spec)
 		return ERR_PTR(-ENOMEM);
 
+	rule_spec->flow_context.flags |= FLOW_CONTEXT_UPLINK_HAIRPIN_EN;
 	rule_spec->match_criteria_enable = MLX5_MATCH_OUTER_HEADERS;
 
 	flow_act.action |= MLX5_FLOW_CONTEXT_ACTION_PACKET_REFORMAT;
@@ -662,6 +664,7 @@ mlx5_esw_bridge_mcast_fwd_flow_create(struct mlx5_esw_bridge_port *port)
 		dest.vport.flags = MLX5_FLOW_DEST_VPORT_VHCA_ID;
 		dest.vport.vhca_id = port->esw_owner_vhca_id;
 	}
+	rule_spec->flow_context.flags |= FLOW_CONTEXT_UPLINK_HAIRPIN_EN;
 	handle = mlx5_add_flow_rules(port->mcast.ft, rule_spec, &flow_act, &dest, 1);
 
 	kvfree(rule_spec);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.c b/drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.c
index 1616a6144f7b..9b8599c200e2 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.c
@@ -566,6 +566,8 @@ static int mlx5_cmd_set_fte(struct mlx5_core_dev *dev,
 		 fte->flow_context.flow_tag);
 	MLX5_SET(flow_context, in_flow_context, flow_source,
 		 fte->flow_context.flow_source);
+	MLX5_SET(flow_context, in_flow_context, uplink_hairpin_en,
+		 !!(fte->flow_context.flags & FLOW_CONTEXT_UPLINK_HAIRPIN_EN));
 
 	MLX5_SET(flow_context, in_flow_context, extended_destination,
 		 extended_dest);
diff --git a/include/linux/mlx5/fs.h b/include/linux/mlx5/fs.h
index 6f7725238abc..3fb428ce7d1c 100644
--- a/include/linux/mlx5/fs.h
+++ b/include/linux/mlx5/fs.h
@@ -132,6 +132,7 @@ struct mlx5_flow_handle;
 
 enum {
 	FLOW_CONTEXT_HAS_TAG = BIT(0),
+	FLOW_CONTEXT_UPLINK_HAIRPIN_EN = BIT(1),
 };
 
 struct mlx5_flow_context {
diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h
index 37230253f9f1..c726f90ab752 100644
--- a/include/linux/mlx5/mlx5_ifc.h
+++ b/include/linux/mlx5/mlx5_ifc.h
@@ -3576,7 +3576,7 @@ struct mlx5_ifc_flow_context_bits {
 	u8         action[0x10];
 
 	u8         extended_destination[0x1];
-	u8         reserved_at_81[0x1];
+	u8         uplink_hairpin_en[0x1];
 	u8         flow_source[0x2];
 	u8         encrypt_decrypt_type[0x4];
 	u8         destination_list_size[0x18];
-- 
cgit v1.2.3


From 56062d60f117dccfb5281869e0ab61e090baf864 Mon Sep 17 00:00:00 2001
From: Richard Palethorpe <rpalethorpe@suse.com>
Date: Wed, 10 Jan 2024 15:01:22 +0200
Subject: x86/entry/ia32: Ensure s32 is sign extended to s64

Presently ia32 registers stored in ptregs are unconditionally cast to
unsigned int by the ia32 stub. They are then cast to long when passed to
__se_sys*, but will not be sign extended.

This takes the sign of the syscall argument into account in the ia32
stub. It still casts to unsigned int to avoid implementation specific
behavior. However then casts to int or unsigned int as necessary. So that
the following cast to long sign extends the value.

This fixes the io_pgetevents02 LTP test when compiled with -m32. Presently
the systemcall io_pgetevents_time64() unexpectedly accepts -1 for the
maximum number of events.

It doesn't appear other systemcalls with signed arguments are effected
because they all have compat variants defined and wired up.

Fixes: ebeb8c82ffaf ("syscalls/x86: Use 'struct pt_regs' based syscall calling for IA32_EMULATION and x32")
Suggested-by: Arnd Bergmann <arnd@arndb.de>
Signed-off-by: Richard Palethorpe <rpalethorpe@suse.com>
Signed-off-by: Nikolay Borisov <nik.borisov@suse.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Arnd Bergmann <arnd@arndb.de>
Cc: stable@vger.kernel.org
Link: https://lore.kernel.org/r/20240110130122.3836513-1-nik.borisov@suse.com
Link: https://lore.kernel.org/ltp/20210921130127.24131-1-rpalethorpe@suse.com/
---
 arch/x86/include/asm/syscall_wrapper.h | 25 +++++++++++++++++++++----
 include/linux/syscalls.h               |  1 +
 2 files changed, 22 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/arch/x86/include/asm/syscall_wrapper.h b/arch/x86/include/asm/syscall_wrapper.h
index 21f9407be5d3..7e88705e907f 100644
--- a/arch/x86/include/asm/syscall_wrapper.h
+++ b/arch/x86/include/asm/syscall_wrapper.h
@@ -58,12 +58,29 @@ extern long __ia32_sys_ni_syscall(const struct pt_regs *regs);
 		,,regs->di,,regs->si,,regs->dx				\
 		,,regs->r10,,regs->r8,,regs->r9)			\
 
+
+/* SYSCALL_PT_ARGS is Adapted from s390x */
+#define SYSCALL_PT_ARG6(m, t1, t2, t3, t4, t5, t6)			\
+	SYSCALL_PT_ARG5(m, t1, t2, t3, t4, t5), m(t6, (regs->bp))
+#define SYSCALL_PT_ARG5(m, t1, t2, t3, t4, t5)				\
+	SYSCALL_PT_ARG4(m, t1, t2, t3, t4),  m(t5, (regs->di))
+#define SYSCALL_PT_ARG4(m, t1, t2, t3, t4)				\
+	SYSCALL_PT_ARG3(m, t1, t2, t3),  m(t4, (regs->si))
+#define SYSCALL_PT_ARG3(m, t1, t2, t3)					\
+	SYSCALL_PT_ARG2(m, t1, t2), m(t3, (regs->dx))
+#define SYSCALL_PT_ARG2(m, t1, t2)					\
+	SYSCALL_PT_ARG1(m, t1), m(t2, (regs->cx))
+#define SYSCALL_PT_ARG1(m, t1) m(t1, (regs->bx))
+#define SYSCALL_PT_ARGS(x, ...) SYSCALL_PT_ARG##x(__VA_ARGS__)
+
+#define __SC_COMPAT_CAST(t, a)						\
+	(__typeof(__builtin_choose_expr(__TYPE_IS_L(t), 0, 0U)))	\
+	(unsigned int)a
+
 /* Mapping of registers to parameters for syscalls on i386 */
 #define SC_IA32_REGS_TO_ARGS(x, ...)					\
-	__MAP(x,__SC_ARGS						\
-	      ,,(unsigned int)regs->bx,,(unsigned int)regs->cx		\
-	      ,,(unsigned int)regs->dx,,(unsigned int)regs->si		\
-	      ,,(unsigned int)regs->di,,(unsigned int)regs->bp)
+	SYSCALL_PT_ARGS(x, __SC_COMPAT_CAST,				\
+			__MAP(x, __SC_TYPE, __VA_ARGS__))		\
 
 #define __SYS_STUB0(abi, name)						\
 	long __##abi##_##name(const struct pt_regs *regs);		\
diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index cdba4d0c6d4a..77eb9b0e7685 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -128,6 +128,7 @@ struct mnt_id_req;
 #define __TYPE_IS_LL(t) (__TYPE_AS(t, 0LL) || __TYPE_AS(t, 0ULL))
 #define __SC_LONG(t, a) __typeof(__builtin_choose_expr(__TYPE_IS_LL(t), 0LL, 0L)) a
 #define __SC_CAST(t, a)	(__force t) a
+#define __SC_TYPE(t, a)	t
 #define __SC_ARGS(t, a)	a
 #define __SC_TEST(t, a) (void)BUILD_BUG_ON_ZERO(!__TYPE_IS_LL(t) && sizeof(t) > sizeof(long))
 
-- 
cgit v1.2.3


From 2f8c7c3715f2c6fb51a4ecc0905c04dd78a3da29 Mon Sep 17 00:00:00 2001
From: Mark Brown <broonie@kernel.org>
Date: Wed, 24 Jan 2024 13:24:24 +0000
Subject: spi: Raise limit on number of chip selects

As reported by Guenter the limit we've got on the number of chip selects is
set too low for some systems, raise the limit. We should really remove the
hard coded limit but this is needed as a fix so let's do the simple thing
and raise the limit for now.

Fixes: 4d8ff6b0991d ("spi: Add multi-cs memories support in SPI core")
Reported-by: Guenter Roeck <linux@roeck-us.net>
Suggested-by: Guenter Roeck <linux@roeck-us.net>
Signed-off-by: Mark Brown <broonie@kernel.org>
Link: https://msgid.link/r/20240124-spi-multi-cs-max-v2-1-df6fc5ab1abc@kernel.org
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 include/linux/spi/spi.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/spi/spi.h b/include/linux/spi/spi.h
index 471fe2ff9066..600fbd5daf68 100644
--- a/include/linux/spi/spi.h
+++ b/include/linux/spi/spi.h
@@ -21,7 +21,7 @@
 #include <uapi/linux/spi/spi.h>
 
 /* Max no. of CS supported per spi device */
-#define SPI_CS_CNT_MAX 4
+#define SPI_CS_CNT_MAX 16
 
 struct dma_chan;
 struct software_node;
-- 
cgit v1.2.3


From 90383cc07895183c75a0db2460301c2ffd912359 Mon Sep 17 00:00:00 2001
From: Kees Cook <keescook@chromium.org>
Date: Wed, 24 Jan 2024 11:15:33 -0800
Subject: exec: Distinguish in_execve from in_exec

Just to help distinguish the fs->in_exec flag from the current->in_execve
flag, add comments in check_unsafe_exec() and copy_fs() for more
context. Also note that in_execve is only used by TOMOYO now.

Cc: Kentaro Takeda <takedakn@nttdata.co.jp>
Cc: Tetsuo Handa <penguin-kernel@i-love.sakura.ne.jp>
Cc: Alexander Viro <viro@zeniv.linux.org.uk>
Cc: Christian Brauner <brauner@kernel.org>
Cc: Jan Kara <jack@suse.cz>
Cc: Eric Biederman <ebiederm@xmission.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Cc: linux-fsdevel@vger.kernel.org
Cc: linux-mm@kvack.org
Signed-off-by: Kees Cook <keescook@chromium.org>
---
 fs/exec.c             | 1 +
 include/linux/sched.h | 2 +-
 kernel/fork.c         | 1 +
 3 files changed, 3 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/fs/exec.c b/fs/exec.c
index 39d773021fff..d179abb78a1c 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -1633,6 +1633,7 @@ static void check_unsafe_exec(struct linux_binprm *bprm)
 	}
 	rcu_read_unlock();
 
+	/* "users" and "in_exec" locked for copy_fs() */
 	if (p->fs->users > n_fs)
 		bprm->unsafe |= LSM_UNSAFE_SHARE;
 	else
diff --git a/include/linux/sched.h b/include/linux/sched.h
index cdb8ea53c365..ffe8f618ab86 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -920,7 +920,7 @@ struct task_struct {
 	unsigned			sched_rt_mutex:1;
 #endif
 
-	/* Bit to tell LSMs we're in execve(): */
+	/* Bit to tell TOMOYO we're in execve(): */
 	unsigned			in_execve:1;
 	unsigned			in_iowait:1;
 #ifndef TIF_RESTORE_SIGMASK
diff --git a/kernel/fork.c b/kernel/fork.c
index 47ff3b35352e..0d944e92a43f 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1748,6 +1748,7 @@ static int copy_fs(unsigned long clone_flags, struct task_struct *tsk)
 	if (clone_flags & CLONE_FS) {
 		/* tsk->fs is already what we want */
 		spin_lock(&fs->lock);
+		/* "users" and "in_exec" locked for check_unsafe_exec() */
 		if (fs->in_exec) {
 			spin_unlock(&fs->lock);
 			return -EAGAIN;
-- 
cgit v1.2.3


From c4608d1bf7c6536d1a3d233eb21e50678681564e Mon Sep 17 00:00:00 2001
From: Yang Shi <yang@os.amperecomputing.com>
Date: Wed, 20 Dec 2023 22:59:43 -0800
Subject: mm: mmap: map MAP_STACK to VM_NOHUGEPAGE

commit efa7df3e3bb5 ("mm: align larger anonymous mappings on THP
boundaries") incured regression for stress-ng pthread benchmark [1].  It
is because THP get allocated to pthread's stack area much more possible
than before.  Pthread's stack area is allocated by mmap without
VM_GROWSDOWN or VM_GROWSUP flag, so kernel can't tell whether it is a
stack area or not.

The MAP_STACK flag is used to mark the stack area, but it is a no-op on
Linux.  Mapping MAP_STACK to VM_NOHUGEPAGE to prevent from allocating THP
for such stack area.

With this change the stack area looks like:

fffd18e10000-fffd19610000 rw-p 00000000 00:00 0
Size:               8192 kB
KernelPageSize:        4 kB
MMUPageSize:           4 kB
Rss:                  12 kB
Pss:                  12 kB
Pss_Dirty:            12 kB
Shared_Clean:          0 kB
Shared_Dirty:          0 kB
Private_Clean:         0 kB
Private_Dirty:        12 kB
Referenced:           12 kB
Anonymous:            12 kB
KSM:                   0 kB
LazyFree:              0 kB
AnonHugePages:         0 kB
ShmemPmdMapped:        0 kB
FilePmdMapped:         0 kB
Shared_Hugetlb:        0 kB
Private_Hugetlb:       0 kB
Swap:                  0 kB
SwapPss:               0 kB
Locked:                0 kB
THPeligible:           0
VmFlags: rd wr mr mw me ac nh

The "nh" flag is set.

[1] https://lore.kernel.org/linux-mm/202312192310.56367035-oliver.sang@intel.com/

Link: https://lkml.kernel.org/r/20231221065943.2803551-2-shy828301@gmail.com
Fixes: efa7df3e3bb5 ("mm: align larger anonymous mappings on THP boundaries")
Signed-off-by: Yang Shi <yang@os.amperecomputing.com>
Reported-by: kernel test robot <oliver.sang@intel.com>
Tested-by: Oliver Sang <oliver.sang@intel.com>
Reviewed-by: Yin Fengwei <fengwei.yin@intel.com>
Cc: Rik van Riel <riel@surriel.com>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: Christopher Lameter <cl@linux.com>
Cc: Huang, Ying <ying.huang@intel.com>
Cc: <stable@vger.kerenl.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/mman.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/mman.h b/include/linux/mman.h
index 40d94411d492..dc7048824be8 100644
--- a/include/linux/mman.h
+++ b/include/linux/mman.h
@@ -156,6 +156,7 @@ calc_vm_flag_bits(unsigned long flags)
 	return _calc_vm_trans(flags, MAP_GROWSDOWN,  VM_GROWSDOWN ) |
 	       _calc_vm_trans(flags, MAP_LOCKED,     VM_LOCKED    ) |
 	       _calc_vm_trans(flags, MAP_SYNC,	     VM_SYNC      ) |
+	       _calc_vm_trans(flags, MAP_STACK,	     VM_NOHUGEPAGE) |
 	       arch_calc_vm_flag_bits(flags);
 }
 
-- 
cgit v1.2.3


From f6564fce256a3944aa1bc76cb3c40e792d97c1eb Mon Sep 17 00:00:00 2001
From: Marco Elver <elver@google.com>
Date: Thu, 18 Jan 2024 11:59:14 +0100
Subject: mm, kmsan: fix infinite recursion due to RCU critical section

Alexander Potapenko writes in [1]: "For every memory access in the code
instrumented by KMSAN we call kmsan_get_metadata() to obtain the metadata
for the memory being accessed.  For virtual memory the metadata pointers
are stored in the corresponding `struct page`, therefore we need to call
virt_to_page() to get them.

According to the comment in arch/x86/include/asm/page.h,
virt_to_page(kaddr) returns a valid pointer iff virt_addr_valid(kaddr) is
true, so KMSAN needs to call virt_addr_valid() as well.

To avoid recursion, kmsan_get_metadata() must not call instrumented code,
therefore ./arch/x86/include/asm/kmsan.h forks parts of
arch/x86/mm/physaddr.c to check whether a virtual address is valid or not.

But the introduction of rcu_read_lock() to pfn_valid() added instrumented
RCU API calls to virt_to_page_or_null(), which is called by
kmsan_get_metadata(), so there is an infinite recursion now.  I do not
think it is correct to stop that recursion by doing
kmsan_enter_runtime()/kmsan_exit_runtime() in kmsan_get_metadata(): that
would prevent instrumented functions called from within the runtime from
tracking the shadow values, which might introduce false positives."

Fix the issue by switching pfn_valid() to the _sched() variant of
rcu_read_lock/unlock(), which does not require calling into RCU.  Given
the critical section in pfn_valid() is very small, this is a reasonable
trade-off (with preemptible RCU).

KMSAN further needs to be careful to suppress calls into the scheduler,
which would be another source of recursion.  This can be done by wrapping
the call to pfn_valid() into preempt_disable/enable_no_resched().  The
downside is that this sacrifices breaking scheduling guarantees; however,
a kernel compiled with KMSAN has already given up any performance
guarantees due to being heavily instrumented.

Note, KMSAN code already disables tracing via Makefile, and since mmzone.h
is included, it is not necessary to use the notrace variant, which is
generally preferred in all other cases.

Link: https://lkml.kernel.org/r/20240115184430.2710652-1-glider@google.com [1]
Link: https://lkml.kernel.org/r/20240118110022.2538350-1-elver@google.com
Fixes: 5ec8e8ea8b77 ("mm/sparsemem: fix race in accessing memory_section->usage")
Signed-off-by: Marco Elver <elver@google.com>
Reported-by: Alexander Potapenko <glider@google.com>
Reported-by: syzbot+93a9e8a3dea8d6085e12@syzkaller.appspotmail.com
Reviewed-by: Alexander Potapenko <glider@google.com>
Tested-by: Alexander Potapenko <glider@google.com>
Cc: Charan Teja Kalla <quic_charante@quicinc.com>
Cc: Borislav Petkov (AMD) <bp@alien8.de>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: Dmitry Vyukov <dvyukov@google.com>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 arch/x86/include/asm/kmsan.h | 17 ++++++++++++++++-
 include/linux/mmzone.h       |  6 +++---
 2 files changed, 19 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/arch/x86/include/asm/kmsan.h b/arch/x86/include/asm/kmsan.h
index 8fa6ac0e2d76..d91b37f5b4bb 100644
--- a/arch/x86/include/asm/kmsan.h
+++ b/arch/x86/include/asm/kmsan.h
@@ -64,6 +64,7 @@ static inline bool kmsan_virt_addr_valid(void *addr)
 {
 	unsigned long x = (unsigned long)addr;
 	unsigned long y = x - __START_KERNEL_map;
+	bool ret;
 
 	/* use the carry flag to determine if x was < __START_KERNEL_map */
 	if (unlikely(x > y)) {
@@ -79,7 +80,21 @@ static inline bool kmsan_virt_addr_valid(void *addr)
 			return false;
 	}
 
-	return pfn_valid(x >> PAGE_SHIFT);
+	/*
+	 * pfn_valid() relies on RCU, and may call into the scheduler on exiting
+	 * the critical section. However, this would result in recursion with
+	 * KMSAN. Therefore, disable preemption here, and re-enable preemption
+	 * below while suppressing reschedules to avoid recursion.
+	 *
+	 * Note, this sacrifices occasionally breaking scheduling guarantees.
+	 * Although, a kernel compiled with KMSAN has already given up on any
+	 * performance guarantees due to being heavily instrumented.
+	 */
+	preempt_disable();
+	ret = pfn_valid(x >> PAGE_SHIFT);
+	preempt_enable_no_resched();
+
+	return ret;
 }
 
 #endif /* !MODULE */
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 4ed33b127821..a497f189d988 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -2013,9 +2013,9 @@ static inline int pfn_valid(unsigned long pfn)
 	if (pfn_to_section_nr(pfn) >= NR_MEM_SECTIONS)
 		return 0;
 	ms = __pfn_to_section(pfn);
-	rcu_read_lock();
+	rcu_read_lock_sched();
 	if (!valid_section(ms)) {
-		rcu_read_unlock();
+		rcu_read_unlock_sched();
 		return 0;
 	}
 	/*
@@ -2023,7 +2023,7 @@ static inline int pfn_valid(unsigned long pfn)
 	 * the entire section-sized span.
 	 */
 	ret = early_section(ms) || pfn_section_valid(ms, pfn);
-	rcu_read_unlock();
+	rcu_read_unlock_sched();
 
 	return ret;
 }
-- 
cgit v1.2.3


From a22fe1d6dec7e98535b97249fdc95c2be79120bb Mon Sep 17 00:00:00 2001
From: Frank Li <Frank.Li@nxp.com>
Date: Tue, 23 Jan 2024 12:28:41 -0500
Subject: dmaengine: fix is_slave_direction() return false when DMA_DEV_TO_DEV

is_slave_direction() should return true when direction is DMA_DEV_TO_DEV.

Fixes: 49920bc66984 ("dmaengine: add new enum dma_transfer_direction")
Signed-off-by: Frank Li <Frank.Li@nxp.com>
Link: https://lore.kernel.org/r/20240123172842.3764529-1-Frank.Li@nxp.com
Signed-off-by: Vinod Koul <vkoul@kernel.org>
---
 include/linux/dmaengine.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/dmaengine.h b/include/linux/dmaengine.h
index 3df70d6131c8..752dbde4cec1 100644
--- a/include/linux/dmaengine.h
+++ b/include/linux/dmaengine.h
@@ -953,7 +953,8 @@ static inline int dmaengine_slave_config(struct dma_chan *chan,
 
 static inline bool is_slave_direction(enum dma_transfer_direction direction)
 {
-	return (direction == DMA_MEM_TO_DEV) || (direction == DMA_DEV_TO_MEM);
+	return (direction == DMA_MEM_TO_DEV) || (direction == DMA_DEV_TO_MEM) ||
+	       (direction == DMA_DEV_TO_DEV);
 }
 
 static inline struct dma_async_tx_descriptor *dmaengine_prep_slave_single(
-- 
cgit v1.2.3


From 5a287d3d2b9de2b3e747132c615599907ba5c3c1 Mon Sep 17 00:00:00 2001
From: Ondrej Mosnacek <omosnace@redhat.com>
Date: Fri, 26 Jan 2024 19:45:31 +0100
Subject: lsm: fix default return value of the socket_getpeersec_*() hooks

For these hooks the true "neutral" value is -EOPNOTSUPP, which is
currently what is returned when no LSM provides this hook and what LSMs
return when there is no security context set on the socket. Correct the
value in <linux/lsm_hooks.h> and adjust the dispatch functions in
security/security.c to avoid issues when the BPF LSM is enabled.

Cc: stable@vger.kernel.org
Fixes: 98e828a0650f ("security: Refactor declaration of LSM hooks")
Signed-off-by: Ondrej Mosnacek <omosnace@redhat.com>
[PM: subject line tweak]
Signed-off-by: Paul Moore <paul@paul-moore.com>
---
 include/linux/lsm_hook_defs.h |  4 ++--
 security/security.c           | 31 +++++++++++++++++++++++++++----
 2 files changed, 29 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/lsm_hook_defs.h b/include/linux/lsm_hook_defs.h
index 185924c56378..76458b6d53da 100644
--- a/include/linux/lsm_hook_defs.h
+++ b/include/linux/lsm_hook_defs.h
@@ -315,9 +315,9 @@ LSM_HOOK(int, 0, socket_getsockopt, struct socket *sock, int level, int optname)
 LSM_HOOK(int, 0, socket_setsockopt, struct socket *sock, int level, int optname)
 LSM_HOOK(int, 0, socket_shutdown, struct socket *sock, int how)
 LSM_HOOK(int, 0, socket_sock_rcv_skb, struct sock *sk, struct sk_buff *skb)
-LSM_HOOK(int, 0, socket_getpeersec_stream, struct socket *sock,
+LSM_HOOK(int, -ENOPROTOOPT, socket_getpeersec_stream, struct socket *sock,
 	 sockptr_t optval, sockptr_t optlen, unsigned int len)
-LSM_HOOK(int, 0, socket_getpeersec_dgram, struct socket *sock,
+LSM_HOOK(int, -ENOPROTOOPT, socket_getpeersec_dgram, struct socket *sock,
 	 struct sk_buff *skb, u32 *secid)
 LSM_HOOK(int, 0, sk_alloc_security, struct sock *sk, int family, gfp_t priority)
 LSM_HOOK(void, LSM_RET_VOID, sk_free_security, struct sock *sk)
diff --git a/security/security.c b/security/security.c
index 6196ccaba433..3aaad75c9ce8 100644
--- a/security/security.c
+++ b/security/security.c
@@ -4624,8 +4624,20 @@ EXPORT_SYMBOL(security_sock_rcv_skb);
 int security_socket_getpeersec_stream(struct socket *sock, sockptr_t optval,
 				      sockptr_t optlen, unsigned int len)
 {
-	return call_int_hook(socket_getpeersec_stream, -ENOPROTOOPT, sock,
-			     optval, optlen, len);
+	struct security_hook_list *hp;
+	int rc;
+
+	/*
+	 * Only one module will provide a security context.
+	 */
+	hlist_for_each_entry(hp, &security_hook_heads.socket_getpeersec_stream,
+			     list) {
+		rc = hp->hook.socket_getpeersec_stream(sock, optval, optlen,
+						       len);
+		if (rc != LSM_RET_DEFAULT(socket_getpeersec_stream))
+			return rc;
+	}
+	return LSM_RET_DEFAULT(socket_getpeersec_stream);
 }
 
 /**
@@ -4645,8 +4657,19 @@ int security_socket_getpeersec_stream(struct socket *sock, sockptr_t optval,
 int security_socket_getpeersec_dgram(struct socket *sock,
 				     struct sk_buff *skb, u32 *secid)
 {
-	return call_int_hook(socket_getpeersec_dgram, -ENOPROTOOPT, sock,
-			     skb, secid);
+	struct security_hook_list *hp;
+	int rc;
+
+	/*
+	 * Only one module will provide a security context.
+	 */
+	hlist_for_each_entry(hp, &security_hook_heads.socket_getpeersec_dgram,
+			     list) {
+		rc = hp->hook.socket_getpeersec_dgram(sock, skb, secid);
+		if (rc != LSM_RET_DEFAULT(socket_getpeersec_dgram))
+			return rc;
+	}
+	return LSM_RET_DEFAULT(socket_getpeersec_dgram);
 }
 EXPORT_SYMBOL(security_socket_getpeersec_dgram);
 
-- 
cgit v1.2.3


From 764ad6b02777d77dca3659ca490f0898aa593670 Mon Sep 17 00:00:00 2001
From: Benjamin Tissoires <bentiss@kernel.org>
Date: Wed, 24 Jan 2024 12:26:59 +0100
Subject: HID: bpf: use __bpf_kfunc instead of noinline

Follow the docs at Documentation/bpf/kfuncs.rst:
- declare the function with `__bpf_kfunc`
- disables missing prototype warnings, which allows to remove them from
  include/linux/hid-bpf.h

Removing the prototypes is not an issue because we currently have to
redeclare them when writing the BPF program. They will eventually be
generated by bpftool directly AFAIU.

Link: https://lore.kernel.org/r/20240124-b4-hid-bpf-fixes-v2-3-052520b1e5e6@kernel.org
Signed-off-by: Benjamin Tissoires <bentiss@kernel.org>
---
 drivers/hid/bpf/hid_bpf_dispatch.c | 18 +++++++++++++-----
 include/linux/hid_bpf.h            | 11 -----------
 2 files changed, 13 insertions(+), 16 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/hid/bpf/hid_bpf_dispatch.c b/drivers/hid/bpf/hid_bpf_dispatch.c
index 7903c8638e81..470ae2c29c94 100644
--- a/drivers/hid/bpf/hid_bpf_dispatch.c
+++ b/drivers/hid/bpf/hid_bpf_dispatch.c
@@ -143,6 +143,9 @@ u8 *call_hid_bpf_rdesc_fixup(struct hid_device *hdev, u8 *rdesc, unsigned int *s
 }
 EXPORT_SYMBOL_GPL(call_hid_bpf_rdesc_fixup);
 
+/* Disables missing prototype warnings */
+__bpf_kfunc_start_defs();
+
 /**
  * hid_bpf_get_data - Get the kernel memory pointer associated with the context @ctx
  *
@@ -152,7 +155,7 @@ EXPORT_SYMBOL_GPL(call_hid_bpf_rdesc_fixup);
  *
  * @returns %NULL on error, an %__u8 memory pointer on success
  */
-noinline __u8 *
+__bpf_kfunc __u8 *
 hid_bpf_get_data(struct hid_bpf_ctx *ctx, unsigned int offset, const size_t rdwr_buf_size)
 {
 	struct hid_bpf_ctx_kern *ctx_kern;
@@ -167,6 +170,7 @@ hid_bpf_get_data(struct hid_bpf_ctx *ctx, unsigned int offset, const size_t rdwr
 
 	return ctx_kern->data + offset;
 }
+__bpf_kfunc_end_defs();
 
 /*
  * The following set contains all functions we agree BPF programs
@@ -274,6 +278,9 @@ static int do_hid_bpf_attach_prog(struct hid_device *hdev, int prog_fd, struct b
 	return fd;
 }
 
+/* Disables missing prototype warnings */
+__bpf_kfunc_start_defs();
+
 /**
  * hid_bpf_attach_prog - Attach the given @prog_fd to the given HID device
  *
@@ -286,7 +293,7 @@ static int do_hid_bpf_attach_prog(struct hid_device *hdev, int prog_fd, struct b
  * is pinned to the BPF file system).
  */
 /* called from syscall */
-noinline int
+__bpf_kfunc int
 hid_bpf_attach_prog(unsigned int hid_id, int prog_fd, __u32 flags)
 {
 	struct hid_device *hdev;
@@ -338,7 +345,7 @@ hid_bpf_attach_prog(unsigned int hid_id, int prog_fd, __u32 flags)
  *
  * @returns A pointer to &struct hid_bpf_ctx on success, %NULL on error.
  */
-noinline struct hid_bpf_ctx *
+__bpf_kfunc struct hid_bpf_ctx *
 hid_bpf_allocate_context(unsigned int hid_id)
 {
 	struct hid_device *hdev;
@@ -371,7 +378,7 @@ hid_bpf_allocate_context(unsigned int hid_id)
  * @ctx: the HID-BPF context to release
  *
  */
-noinline void
+__bpf_kfunc void
 hid_bpf_release_context(struct hid_bpf_ctx *ctx)
 {
 	struct hid_bpf_ctx_kern *ctx_kern;
@@ -397,7 +404,7 @@ hid_bpf_release_context(struct hid_bpf_ctx *ctx)
  *
  * @returns %0 on success, a negative error code otherwise.
  */
-noinline int
+__bpf_kfunc int
 hid_bpf_hw_request(struct hid_bpf_ctx *ctx, __u8 *buf, size_t buf__sz,
 		   enum hid_report_type rtype, enum hid_class_request reqtype)
 {
@@ -465,6 +472,7 @@ hid_bpf_hw_request(struct hid_bpf_ctx *ctx, __u8 *buf, size_t buf__sz,
 	kfree(dma_data);
 	return ret;
 }
+__bpf_kfunc_end_defs();
 
 /* our HID-BPF entrypoints */
 BTF_SET8_START(hid_bpf_fmodret_ids)
diff --git a/include/linux/hid_bpf.h b/include/linux/hid_bpf.h
index 840cd254172d..7118ac28d468 100644
--- a/include/linux/hid_bpf.h
+++ b/include/linux/hid_bpf.h
@@ -77,17 +77,6 @@ enum hid_bpf_attach_flags {
 int hid_bpf_device_event(struct hid_bpf_ctx *ctx);
 int hid_bpf_rdesc_fixup(struct hid_bpf_ctx *ctx);
 
-/* Following functions are kfunc that we export to BPF programs */
-/* available everywhere in HID-BPF */
-__u8 *hid_bpf_get_data(struct hid_bpf_ctx *ctx, unsigned int offset, const size_t __sz);
-
-/* only available in syscall */
-int hid_bpf_attach_prog(unsigned int hid_id, int prog_fd, __u32 flags);
-int hid_bpf_hw_request(struct hid_bpf_ctx *ctx, __u8 *buf, size_t buf__sz,
-		       enum hid_report_type rtype, enum hid_class_request reqtype);
-struct hid_bpf_ctx *hid_bpf_allocate_context(unsigned int hid_id);
-void hid_bpf_release_context(struct hid_bpf_ctx *ctx);
-
 /*
  * Below is HID internal
  */
-- 
cgit v1.2.3


From 1e560864159d002b453da42bd2c13a1805515a20 Mon Sep 17 00:00:00 2001
From: Johan Hovold <johan+linaro@kernel.org>
Date: Tue, 30 Jan 2024 11:02:43 +0100
Subject: PCI/ASPM: Fix deadlock when enabling ASPM

A last minute revert in 6.7-final introduced a potential deadlock when
enabling ASPM during probe of Qualcomm PCIe controllers as reported by
lockdep:

  ============================================
  WARNING: possible recursive locking detected
  6.7.0 #40 Not tainted
  --------------------------------------------
  kworker/u16:5/90 is trying to acquire lock:
  ffffacfa78ced000 (pci_bus_sem){++++}-{3:3}, at: pcie_aspm_pm_state_change+0x58/0xdc

              but task is already holding lock:
  ffffacfa78ced000 (pci_bus_sem){++++}-{3:3}, at: pci_walk_bus+0x34/0xbc

              other info that might help us debug this:
   Possible unsafe locking scenario:

         CPU0
         ----
    lock(pci_bus_sem);
    lock(pci_bus_sem);

               *** DEADLOCK ***

  Call trace:
   print_deadlock_bug+0x25c/0x348
   __lock_acquire+0x10a4/0x2064
   lock_acquire+0x1e8/0x318
   down_read+0x60/0x184
   pcie_aspm_pm_state_change+0x58/0xdc
   pci_set_full_power_state+0xa8/0x114
   pci_set_power_state+0xc4/0x120
   qcom_pcie_enable_aspm+0x1c/0x3c [pcie_qcom]
   pci_walk_bus+0x64/0xbc
   qcom_pcie_host_post_init_2_7_0+0x28/0x34 [pcie_qcom]

The deadlock can easily be reproduced on machines like the Lenovo ThinkPad
X13s by adding a delay to increase the race window during asynchronous
probe where another thread can take a write lock.

Add a new pci_set_power_state_locked() and associated helper functions that
can be called with the PCI bus semaphore held to avoid taking the read lock
twice.

Link: https://lore.kernel.org/r/ZZu0qx2cmn7IwTyQ@hovoldconsulting.com
Link: https://lore.kernel.org/r/20240130100243.11011-1-johan+linaro@kernel.org
Fixes: f93e71aea6c6 ("Revert "PCI/ASPM: Remove pcie_aspm_pm_state_change()"")
Signed-off-by: Johan Hovold <johan+linaro@kernel.org>
Signed-off-by: Bjorn Helgaas <bhelgaas@google.com>
Cc: <stable@vger.kernel.org>	# 6.7
---
 drivers/pci/bus.c                      | 49 +++++++++++++--------
 drivers/pci/controller/dwc/pcie-qcom.c |  2 +-
 drivers/pci/pci.c                      | 78 ++++++++++++++++++++++------------
 drivers/pci/pci.h                      |  4 +-
 drivers/pci/pcie/aspm.c                | 13 ++++--
 include/linux/pci.h                    |  5 +++
 6 files changed, 101 insertions(+), 50 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/pci/bus.c b/drivers/pci/bus.c
index 9c2137dae429..826b5016a101 100644
--- a/drivers/pci/bus.c
+++ b/drivers/pci/bus.c
@@ -386,21 +386,8 @@ void pci_bus_add_devices(const struct pci_bus *bus)
 }
 EXPORT_SYMBOL(pci_bus_add_devices);
 
-/** pci_walk_bus - walk devices on/under bus, calling callback.
- *  @top      bus whose devices should be walked
- *  @cb       callback to be called for each device found
- *  @userdata arbitrary pointer to be passed to callback.
- *
- *  Walk the given bus, including any bridged devices
- *  on buses under this bus.  Call the provided callback
- *  on each device found.
- *
- *  We check the return of @cb each time. If it returns anything
- *  other than 0, we break out.
- *
- */
-void pci_walk_bus(struct pci_bus *top, int (*cb)(struct pci_dev *, void *),
-		  void *userdata)
+static void __pci_walk_bus(struct pci_bus *top, int (*cb)(struct pci_dev *, void *),
+			   void *userdata, bool locked)
 {
 	struct pci_dev *dev;
 	struct pci_bus *bus;
@@ -408,7 +395,8 @@ void pci_walk_bus(struct pci_bus *top, int (*cb)(struct pci_dev *, void *),
 	int retval;
 
 	bus = top;
-	down_read(&pci_bus_sem);
+	if (!locked)
+		down_read(&pci_bus_sem);
 	next = top->devices.next;
 	for (;;) {
 		if (next == &bus->devices) {
@@ -431,10 +419,37 @@ void pci_walk_bus(struct pci_bus *top, int (*cb)(struct pci_dev *, void *),
 		if (retval)
 			break;
 	}
-	up_read(&pci_bus_sem);
+	if (!locked)
+		up_read(&pci_bus_sem);
+}
+
+/**
+ *  pci_walk_bus - walk devices on/under bus, calling callback.
+ *  @top: bus whose devices should be walked
+ *  @cb: callback to be called for each device found
+ *  @userdata: arbitrary pointer to be passed to callback
+ *
+ *  Walk the given bus, including any bridged devices
+ *  on buses under this bus.  Call the provided callback
+ *  on each device found.
+ *
+ *  We check the return of @cb each time. If it returns anything
+ *  other than 0, we break out.
+ */
+void pci_walk_bus(struct pci_bus *top, int (*cb)(struct pci_dev *, void *), void *userdata)
+{
+	__pci_walk_bus(top, cb, userdata, false);
 }
 EXPORT_SYMBOL_GPL(pci_walk_bus);
 
+void pci_walk_bus_locked(struct pci_bus *top, int (*cb)(struct pci_dev *, void *), void *userdata)
+{
+	lockdep_assert_held(&pci_bus_sem);
+
+	__pci_walk_bus(top, cb, userdata, true);
+}
+EXPORT_SYMBOL_GPL(pci_walk_bus_locked);
+
 struct pci_bus *pci_bus_get(struct pci_bus *bus)
 {
 	if (bus)
diff --git a/drivers/pci/controller/dwc/pcie-qcom.c b/drivers/pci/controller/dwc/pcie-qcom.c
index 10f2d0bb86be..2ce2a3bd932b 100644
--- a/drivers/pci/controller/dwc/pcie-qcom.c
+++ b/drivers/pci/controller/dwc/pcie-qcom.c
@@ -972,7 +972,7 @@ static int qcom_pcie_enable_aspm(struct pci_dev *pdev, void *userdata)
 	 * Downstream devices need to be in D0 state before enabling PCI PM
 	 * substates.
 	 */
-	pci_set_power_state(pdev, PCI_D0);
+	pci_set_power_state_locked(pdev, PCI_D0);
 	pci_enable_link_state_locked(pdev, PCIE_LINK_STATE_ALL);
 
 	return 0;
diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c
index d8f11a078924..9ab9b1008d8b 100644
--- a/drivers/pci/pci.c
+++ b/drivers/pci/pci.c
@@ -1354,6 +1354,7 @@ end:
 /**
  * pci_set_full_power_state - Put a PCI device into D0 and update its state
  * @dev: PCI device to power up
+ * @locked: whether pci_bus_sem is held
  *
  * Call pci_power_up() to put @dev into D0, read from its PCI_PM_CTRL register
  * to confirm the state change, restore its BARs if they might be lost and
@@ -1363,7 +1364,7 @@ end:
  * to D0, it is more efficient to use pci_power_up() directly instead of this
  * function.
  */
-static int pci_set_full_power_state(struct pci_dev *dev)
+static int pci_set_full_power_state(struct pci_dev *dev, bool locked)
 {
 	u16 pmcsr;
 	int ret;
@@ -1399,7 +1400,7 @@ static int pci_set_full_power_state(struct pci_dev *dev)
 	}
 
 	if (dev->bus->self)
-		pcie_aspm_pm_state_change(dev->bus->self);
+		pcie_aspm_pm_state_change(dev->bus->self, locked);
 
 	return 0;
 }
@@ -1428,10 +1429,22 @@ void pci_bus_set_current_state(struct pci_bus *bus, pci_power_t state)
 		pci_walk_bus(bus, __pci_dev_set_current_state, &state);
 }
 
+static void __pci_bus_set_current_state(struct pci_bus *bus, pci_power_t state, bool locked)
+{
+	if (!bus)
+		return;
+
+	if (locked)
+		pci_walk_bus_locked(bus, __pci_dev_set_current_state, &state);
+	else
+		pci_walk_bus(bus, __pci_dev_set_current_state, &state);
+}
+
 /**
  * pci_set_low_power_state - Put a PCI device into a low-power state.
  * @dev: PCI device to handle.
  * @state: PCI power state (D1, D2, D3hot) to put the device into.
+ * @locked: whether pci_bus_sem is held
  *
  * Use the device's PCI_PM_CTRL register to put it into a low-power state.
  *
@@ -1442,7 +1455,7 @@ void pci_bus_set_current_state(struct pci_bus *bus, pci_power_t state)
  * 0 if device already is in the requested state.
  * 0 if device's power state has been successfully changed.
  */
-static int pci_set_low_power_state(struct pci_dev *dev, pci_power_t state)
+static int pci_set_low_power_state(struct pci_dev *dev, pci_power_t state, bool locked)
 {
 	u16 pmcsr;
 
@@ -1496,29 +1509,12 @@ static int pci_set_low_power_state(struct pci_dev *dev, pci_power_t state)
 				     pci_power_name(state));
 
 	if (dev->bus->self)
-		pcie_aspm_pm_state_change(dev->bus->self);
+		pcie_aspm_pm_state_change(dev->bus->self, locked);
 
 	return 0;
 }
 
-/**
- * pci_set_power_state - Set the power state of a PCI device
- * @dev: PCI device to handle.
- * @state: PCI power state (D0, D1, D2, D3hot) to put the device into.
- *
- * Transition a device to a new power state, using the platform firmware and/or
- * the device's PCI PM registers.
- *
- * RETURN VALUE:
- * -EINVAL if the requested state is invalid.
- * -EIO if device does not support PCI PM or its PM capabilities register has a
- * wrong version, or device doesn't support the requested state.
- * 0 if the transition is to D1 or D2 but D1 and D2 are not supported.
- * 0 if device already is in the requested state.
- * 0 if the transition is to D3 but D3 is not supported.
- * 0 if device's power state has been successfully changed.
- */
-int pci_set_power_state(struct pci_dev *dev, pci_power_t state)
+static int __pci_set_power_state(struct pci_dev *dev, pci_power_t state, bool locked)
 {
 	int error;
 
@@ -1542,7 +1538,7 @@ int pci_set_power_state(struct pci_dev *dev, pci_power_t state)
 		return 0;
 
 	if (state == PCI_D0)
-		return pci_set_full_power_state(dev);
+		return pci_set_full_power_state(dev, locked);
 
 	/*
 	 * This device is quirked not to be put into D3, so don't put it in
@@ -1556,16 +1552,16 @@ int pci_set_power_state(struct pci_dev *dev, pci_power_t state)
 		 * To put the device in D3cold, put it into D3hot in the native
 		 * way, then put it into D3cold using platform ops.
 		 */
-		error = pci_set_low_power_state(dev, PCI_D3hot);
+		error = pci_set_low_power_state(dev, PCI_D3hot, locked);
 
 		if (pci_platform_power_transition(dev, PCI_D3cold))
 			return error;
 
 		/* Powering off a bridge may power off the whole hierarchy */
 		if (dev->current_state == PCI_D3cold)
-			pci_bus_set_current_state(dev->subordinate, PCI_D3cold);
+			__pci_bus_set_current_state(dev->subordinate, PCI_D3cold, locked);
 	} else {
-		error = pci_set_low_power_state(dev, state);
+		error = pci_set_low_power_state(dev, state, locked);
 
 		if (pci_platform_power_transition(dev, state))
 			return error;
@@ -1573,8 +1569,38 @@ int pci_set_power_state(struct pci_dev *dev, pci_power_t state)
 
 	return 0;
 }
+
+/**
+ * pci_set_power_state - Set the power state of a PCI device
+ * @dev: PCI device to handle.
+ * @state: PCI power state (D0, D1, D2, D3hot) to put the device into.
+ *
+ * Transition a device to a new power state, using the platform firmware and/or
+ * the device's PCI PM registers.
+ *
+ * RETURN VALUE:
+ * -EINVAL if the requested state is invalid.
+ * -EIO if device does not support PCI PM or its PM capabilities register has a
+ * wrong version, or device doesn't support the requested state.
+ * 0 if the transition is to D1 or D2 but D1 and D2 are not supported.
+ * 0 if device already is in the requested state.
+ * 0 if the transition is to D3 but D3 is not supported.
+ * 0 if device's power state has been successfully changed.
+ */
+int pci_set_power_state(struct pci_dev *dev, pci_power_t state)
+{
+	return __pci_set_power_state(dev, state, false);
+}
 EXPORT_SYMBOL(pci_set_power_state);
 
+int pci_set_power_state_locked(struct pci_dev *dev, pci_power_t state)
+{
+	lockdep_assert_held(&pci_bus_sem);
+
+	return __pci_set_power_state(dev, state, true);
+}
+EXPORT_SYMBOL(pci_set_power_state_locked);
+
 #define PCI_EXP_SAVE_REGS	7
 
 static struct pci_cap_saved_state *_pci_find_saved_cap(struct pci_dev *pci_dev,
diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h
index 2336a8d1edab..e9750b1b19ba 100644
--- a/drivers/pci/pci.h
+++ b/drivers/pci/pci.h
@@ -571,12 +571,12 @@ int pcie_retrain_link(struct pci_dev *pdev, bool use_lt);
 #ifdef CONFIG_PCIEASPM
 void pcie_aspm_init_link_state(struct pci_dev *pdev);
 void pcie_aspm_exit_link_state(struct pci_dev *pdev);
-void pcie_aspm_pm_state_change(struct pci_dev *pdev);
+void pcie_aspm_pm_state_change(struct pci_dev *pdev, bool locked);
 void pcie_aspm_powersave_config_link(struct pci_dev *pdev);
 #else
 static inline void pcie_aspm_init_link_state(struct pci_dev *pdev) { }
 static inline void pcie_aspm_exit_link_state(struct pci_dev *pdev) { }
-static inline void pcie_aspm_pm_state_change(struct pci_dev *pdev) { }
+static inline void pcie_aspm_pm_state_change(struct pci_dev *pdev, bool locked) { }
 static inline void pcie_aspm_powersave_config_link(struct pci_dev *pdev) { }
 #endif
 
diff --git a/drivers/pci/pcie/aspm.c b/drivers/pci/pcie/aspm.c
index 5a0066ecc3c5..bc0bd86695ec 100644
--- a/drivers/pci/pcie/aspm.c
+++ b/drivers/pci/pcie/aspm.c
@@ -1003,8 +1003,11 @@ void pcie_aspm_exit_link_state(struct pci_dev *pdev)
 	up_read(&pci_bus_sem);
 }
 
-/* @pdev: the root port or switch downstream port */
-void pcie_aspm_pm_state_change(struct pci_dev *pdev)
+/*
+ * @pdev: the root port or switch downstream port
+ * @locked: whether pci_bus_sem is held
+ */
+void pcie_aspm_pm_state_change(struct pci_dev *pdev, bool locked)
 {
 	struct pcie_link_state *link = pdev->link_state;
 
@@ -1014,12 +1017,14 @@ void pcie_aspm_pm_state_change(struct pci_dev *pdev)
 	 * Devices changed PM state, we should recheck if latency
 	 * meets all functions' requirement
 	 */
-	down_read(&pci_bus_sem);
+	if (!locked)
+		down_read(&pci_bus_sem);
 	mutex_lock(&aspm_lock);
 	pcie_update_aspm_capable(link->root);
 	pcie_config_aspm_path(link);
 	mutex_unlock(&aspm_lock);
-	up_read(&pci_bus_sem);
+	if (!locked)
+		up_read(&pci_bus_sem);
 }
 
 void pcie_aspm_powersave_config_link(struct pci_dev *pdev)
diff --git a/include/linux/pci.h b/include/linux/pci.h
index add9368e6314..7ab0d13672da 100644
--- a/include/linux/pci.h
+++ b/include/linux/pci.h
@@ -1422,6 +1422,7 @@ int pci_load_and_free_saved_state(struct pci_dev *dev,
 				  struct pci_saved_state **state);
 int pci_platform_power_transition(struct pci_dev *dev, pci_power_t state);
 int pci_set_power_state(struct pci_dev *dev, pci_power_t state);
+int pci_set_power_state_locked(struct pci_dev *dev, pci_power_t state);
 pci_power_t pci_choose_state(struct pci_dev *dev, pm_message_t state);
 bool pci_pme_capable(struct pci_dev *dev, pci_power_t state);
 void pci_pme_active(struct pci_dev *dev, bool enable);
@@ -1625,6 +1626,8 @@ int pci_scan_bridge(struct pci_bus *bus, struct pci_dev *dev, int max,
 
 void pci_walk_bus(struct pci_bus *top, int (*cb)(struct pci_dev *, void *),
 		  void *userdata);
+void pci_walk_bus_locked(struct pci_bus *top, int (*cb)(struct pci_dev *, void *),
+			 void *userdata);
 int pci_cfg_space_size(struct pci_dev *dev);
 unsigned char pci_bus_max_busnr(struct pci_bus *bus);
 void pci_setup_bridge(struct pci_bus *bus);
@@ -2025,6 +2028,8 @@ static inline int pci_save_state(struct pci_dev *dev) { return 0; }
 static inline void pci_restore_state(struct pci_dev *dev) { }
 static inline int pci_set_power_state(struct pci_dev *dev, pci_power_t state)
 { return 0; }
+static inline int pci_set_power_state_locked(struct pci_dev *dev, pci_power_t state)
+{ return 0; }
 static inline int pci_wake_from_d3(struct pci_dev *dev, bool enable)
 { return 0; }
 static inline pci_power_t pci_choose_state(struct pci_dev *dev,
-- 
cgit v1.2.3


From 97f7cf1cd80eeed3b7c808b7c12463295c751001 Mon Sep 17 00:00:00 2001
From: Jozsef Kadlecsik <kadlec@netfilter.org>
Date: Mon, 29 Jan 2024 10:57:01 +0100
Subject: netfilter: ipset: fix performance regression in swap operation

The patch "netfilter: ipset: fix race condition between swap/destroy
and kernel side add/del/test", commit 28628fa9 fixes a race condition.
But the synchronize_rcu() added to the swap function unnecessarily slows
it down: it can safely be moved to destroy and use call_rcu() instead.

Eric Dumazet pointed out that simply calling the destroy functions as
rcu callback does not work: sets with timeout use garbage collectors
which need cancelling at destroy which can wait. Therefore the destroy
functions are split into two: cancelling garbage collectors safely at
executing the command received by netlink and moving the remaining
part only into the rcu callback.

Link: https://lore.kernel.org/lkml/C0829B10-EAA6-4809-874E-E1E9C05A8D84@automattic.com/
Fixes: 28628fa952fe ("netfilter: ipset: fix race condition between swap/destroy and kernel side add/del/test")
Reported-by: Ale Crismani <ale.crismani@automattic.com>
Reported-by: David Wang <00107082@163.com>
Tested-by: David Wang <00107082@163.com>
Signed-off-by: Jozsef Kadlecsik <kadlec@netfilter.org>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/linux/netfilter/ipset/ip_set.h  |  4 ++++
 net/netfilter/ipset/ip_set_bitmap_gen.h | 14 ++++++++++---
 net/netfilter/ipset/ip_set_core.c       | 37 +++++++++++++++++++++++++--------
 net/netfilter/ipset/ip_set_hash_gen.h   | 15 ++++++++++---
 net/netfilter/ipset/ip_set_list_set.c   | 13 +++++++++---
 5 files changed, 65 insertions(+), 18 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/netfilter/ipset/ip_set.h b/include/linux/netfilter/ipset/ip_set.h
index e8c350a3ade1..e9f4f845d760 100644
--- a/include/linux/netfilter/ipset/ip_set.h
+++ b/include/linux/netfilter/ipset/ip_set.h
@@ -186,6 +186,8 @@ struct ip_set_type_variant {
 	/* Return true if "b" set is the same as "a"
 	 * according to the create set parameters */
 	bool (*same_set)(const struct ip_set *a, const struct ip_set *b);
+	/* Cancel ongoing garbage collectors before destroying the set*/
+	void (*cancel_gc)(struct ip_set *set);
 	/* Region-locking is used */
 	bool region_lock;
 };
@@ -242,6 +244,8 @@ extern void ip_set_type_unregister(struct ip_set_type *set_type);
 
 /* A generic IP set */
 struct ip_set {
+	/* For call_cru in destroy */
+	struct rcu_head rcu;
 	/* The name of the set */
 	char name[IPSET_MAXNAMELEN];
 	/* Lock protecting the set data */
diff --git a/net/netfilter/ipset/ip_set_bitmap_gen.h b/net/netfilter/ipset/ip_set_bitmap_gen.h
index 21f7860e8fa1..cb48a2b9cb9f 100644
--- a/net/netfilter/ipset/ip_set_bitmap_gen.h
+++ b/net/netfilter/ipset/ip_set_bitmap_gen.h
@@ -30,6 +30,7 @@
 #define mtype_del		IPSET_TOKEN(MTYPE, _del)
 #define mtype_list		IPSET_TOKEN(MTYPE, _list)
 #define mtype_gc		IPSET_TOKEN(MTYPE, _gc)
+#define mtype_cancel_gc		IPSET_TOKEN(MTYPE, _cancel_gc)
 #define mtype			MTYPE
 
 #define get_ext(set, map, id)	((map)->extensions + ((set)->dsize * (id)))
@@ -59,9 +60,6 @@ mtype_destroy(struct ip_set *set)
 {
 	struct mtype *map = set->data;
 
-	if (SET_WITH_TIMEOUT(set))
-		del_timer_sync(&map->gc);
-
 	if (set->dsize && set->extensions & IPSET_EXT_DESTROY)
 		mtype_ext_cleanup(set);
 	ip_set_free(map->members);
@@ -290,6 +288,15 @@ mtype_gc(struct timer_list *t)
 	add_timer(&map->gc);
 }
 
+static void
+mtype_cancel_gc(struct ip_set *set)
+{
+	struct mtype *map = set->data;
+
+	if (SET_WITH_TIMEOUT(set))
+		del_timer_sync(&map->gc);
+}
+
 static const struct ip_set_type_variant mtype = {
 	.kadt	= mtype_kadt,
 	.uadt	= mtype_uadt,
@@ -303,6 +310,7 @@ static const struct ip_set_type_variant mtype = {
 	.head	= mtype_head,
 	.list	= mtype_list,
 	.same_set = mtype_same_set,
+	.cancel_gc = mtype_cancel_gc,
 };
 
 #endif /* __IP_SET_BITMAP_IP_GEN_H */
diff --git a/net/netfilter/ipset/ip_set_core.c b/net/netfilter/ipset/ip_set_core.c
index 4c133e06be1d..bcaad9c009fe 100644
--- a/net/netfilter/ipset/ip_set_core.c
+++ b/net/netfilter/ipset/ip_set_core.c
@@ -1182,6 +1182,14 @@ ip_set_destroy_set(struct ip_set *set)
 	kfree(set);
 }
 
+static void
+ip_set_destroy_set_rcu(struct rcu_head *head)
+{
+	struct ip_set *set = container_of(head, struct ip_set, rcu);
+
+	ip_set_destroy_set(set);
+}
+
 static int ip_set_destroy(struct sk_buff *skb, const struct nfnl_info *info,
 			  const struct nlattr * const attr[])
 {
@@ -1193,8 +1201,6 @@ static int ip_set_destroy(struct sk_buff *skb, const struct nfnl_info *info,
 	if (unlikely(protocol_min_failed(attr)))
 		return -IPSET_ERR_PROTOCOL;
 
-	/* Must wait for flush to be really finished in list:set */
-	rcu_barrier();
 
 	/* Commands are serialized and references are
 	 * protected by the ip_set_ref_lock.
@@ -1206,8 +1212,10 @@ static int ip_set_destroy(struct sk_buff *skb, const struct nfnl_info *info,
 	 * counter, so if it's already zero, we can proceed
 	 * without holding the lock.
 	 */
-	read_lock_bh(&ip_set_ref_lock);
 	if (!attr[IPSET_ATTR_SETNAME]) {
+		/* Must wait for flush to be really finished in list:set */
+		rcu_barrier();
+		read_lock_bh(&ip_set_ref_lock);
 		for (i = 0; i < inst->ip_set_max; i++) {
 			s = ip_set(inst, i);
 			if (s && (s->ref || s->ref_netlink)) {
@@ -1221,6 +1229,8 @@ static int ip_set_destroy(struct sk_buff *skb, const struct nfnl_info *info,
 			s = ip_set(inst, i);
 			if (s) {
 				ip_set(inst, i) = NULL;
+				/* Must cancel garbage collectors */
+				s->variant->cancel_gc(s);
 				ip_set_destroy_set(s);
 			}
 		}
@@ -1228,6 +1238,9 @@ static int ip_set_destroy(struct sk_buff *skb, const struct nfnl_info *info,
 		inst->is_destroyed = false;
 	} else {
 		u32 flags = flag_exist(info->nlh);
+		u16 features = 0;
+
+		read_lock_bh(&ip_set_ref_lock);
 		s = find_set_and_id(inst, nla_data(attr[IPSET_ATTR_SETNAME]),
 				    &i);
 		if (!s) {
@@ -1238,10 +1251,16 @@ static int ip_set_destroy(struct sk_buff *skb, const struct nfnl_info *info,
 			ret = -IPSET_ERR_BUSY;
 			goto out;
 		}
+		features = s->type->features;
 		ip_set(inst, i) = NULL;
 		read_unlock_bh(&ip_set_ref_lock);
-
-		ip_set_destroy_set(s);
+		if (features & IPSET_TYPE_NAME) {
+			/* Must wait for flush to be really finished  */
+			rcu_barrier();
+		}
+		/* Must cancel garbage collectors */
+		s->variant->cancel_gc(s);
+		call_rcu(&s->rcu, ip_set_destroy_set_rcu);
 	}
 	return 0;
 out:
@@ -1394,9 +1413,6 @@ static int ip_set_swap(struct sk_buff *skb, const struct nfnl_info *info,
 	ip_set(inst, to_id) = from;
 	write_unlock_bh(&ip_set_ref_lock);
 
-	/* Make sure all readers of the old set pointers are completed. */
-	synchronize_rcu();
-
 	return 0;
 }
 
@@ -2409,8 +2425,11 @@ ip_set_fini(void)
 {
 	nf_unregister_sockopt(&so_set);
 	nfnetlink_subsys_unregister(&ip_set_netlink_subsys);
-
 	unregister_pernet_subsys(&ip_set_net_ops);
+
+	/* Wait for call_rcu() in destroy */
+	rcu_barrier();
+
 	pr_debug("these are the famous last words\n");
 }
 
diff --git a/net/netfilter/ipset/ip_set_hash_gen.h b/net/netfilter/ipset/ip_set_hash_gen.h
index cbf80da9a01c..1136510521a8 100644
--- a/net/netfilter/ipset/ip_set_hash_gen.h
+++ b/net/netfilter/ipset/ip_set_hash_gen.h
@@ -222,6 +222,7 @@ static const union nf_inet_addr zeromask = {};
 #undef mtype_gc_do
 #undef mtype_gc
 #undef mtype_gc_init
+#undef mtype_cancel_gc
 #undef mtype_variant
 #undef mtype_data_match
 
@@ -266,6 +267,7 @@ static const union nf_inet_addr zeromask = {};
 #define mtype_gc_do		IPSET_TOKEN(MTYPE, _gc_do)
 #define mtype_gc		IPSET_TOKEN(MTYPE, _gc)
 #define mtype_gc_init		IPSET_TOKEN(MTYPE, _gc_init)
+#define mtype_cancel_gc		IPSET_TOKEN(MTYPE, _cancel_gc)
 #define mtype_variant		IPSET_TOKEN(MTYPE, _variant)
 #define mtype_data_match	IPSET_TOKEN(MTYPE, _data_match)
 
@@ -450,9 +452,6 @@ mtype_destroy(struct ip_set *set)
 	struct htype *h = set->data;
 	struct list_head *l, *lt;
 
-	if (SET_WITH_TIMEOUT(set))
-		cancel_delayed_work_sync(&h->gc.dwork);
-
 	mtype_ahash_destroy(set, ipset_dereference_nfnl(h->table), true);
 	list_for_each_safe(l, lt, &h->ad) {
 		list_del(l);
@@ -599,6 +598,15 @@ mtype_gc_init(struct htable_gc *gc)
 	queue_delayed_work(system_power_efficient_wq, &gc->dwork, HZ);
 }
 
+static void
+mtype_cancel_gc(struct ip_set *set)
+{
+	struct htype *h = set->data;
+
+	if (SET_WITH_TIMEOUT(set))
+		cancel_delayed_work_sync(&h->gc.dwork);
+}
+
 static int
 mtype_add(struct ip_set *set, void *value, const struct ip_set_ext *ext,
 	  struct ip_set_ext *mext, u32 flags);
@@ -1441,6 +1449,7 @@ static const struct ip_set_type_variant mtype_variant = {
 	.uref	= mtype_uref,
 	.resize	= mtype_resize,
 	.same_set = mtype_same_set,
+	.cancel_gc = mtype_cancel_gc,
 	.region_lock = true,
 };
 
diff --git a/net/netfilter/ipset/ip_set_list_set.c b/net/netfilter/ipset/ip_set_list_set.c
index e162636525cf..6c3f28bc59b3 100644
--- a/net/netfilter/ipset/ip_set_list_set.c
+++ b/net/netfilter/ipset/ip_set_list_set.c
@@ -426,9 +426,6 @@ list_set_destroy(struct ip_set *set)
 	struct list_set *map = set->data;
 	struct set_elem *e, *n;
 
-	if (SET_WITH_TIMEOUT(set))
-		timer_shutdown_sync(&map->gc);
-
 	list_for_each_entry_safe(e, n, &map->members, list) {
 		list_del(&e->list);
 		ip_set_put_byindex(map->net, e->id);
@@ -545,6 +542,15 @@ list_set_same_set(const struct ip_set *a, const struct ip_set *b)
 	       a->extensions == b->extensions;
 }
 
+static void
+list_set_cancel_gc(struct ip_set *set)
+{
+	struct list_set *map = set->data;
+
+	if (SET_WITH_TIMEOUT(set))
+		timer_shutdown_sync(&map->gc);
+}
+
 static const struct ip_set_type_variant set_variant = {
 	.kadt	= list_set_kadt,
 	.uadt	= list_set_uadt,
@@ -558,6 +564,7 @@ static const struct ip_set_type_variant set_variant = {
 	.head	= list_set_head,
 	.list	= list_set_list,
 	.same_set = list_set_same_set,
+	.cancel_gc = list_set_cancel_gc,
 };
 
 static void
-- 
cgit v1.2.3


From f9e9115d0c014dec3278d68823eaff159f98f4d6 Mon Sep 17 00:00:00 2001
From: Caleb Sander <csander@purestorage.com>
Date: Wed, 31 Jan 2024 09:43:13 -0700
Subject: nvme: take const cmd pointer in read-only helpers

nvme_is_fabrics() and nvme_is_write() only read struct nvme_command,
so take it by const pointer. This allows callers to pass a const pointer
and communicates that these functions don't modify the command.

Signed-off-by: Caleb Sander <csander@purestorage.com>
Reviewed-by: Chaitanya Kulkarni <kch@nvidia.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Sagi Grimberg <sagi@grimberg.me>
Signed-off-by: Keith Busch <kbusch@kernel.org>
---
 include/linux/nvme.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/nvme.h b/include/linux/nvme.h
index 68eff8c86ce3..bc605ec4a3fd 100644
--- a/include/linux/nvme.h
+++ b/include/linux/nvme.h
@@ -1812,7 +1812,7 @@ struct nvme_command {
 	};
 };
 
-static inline bool nvme_is_fabrics(struct nvme_command *cmd)
+static inline bool nvme_is_fabrics(const struct nvme_command *cmd)
 {
 	return cmd->common.opcode == nvme_fabrics_command;
 }
@@ -1831,7 +1831,7 @@ struct nvme_error_slot {
 	__u8		resv2[24];
 };
 
-static inline bool nvme_is_write(struct nvme_command *cmd)
+static inline bool nvme_is_write(const struct nvme_command *cmd)
 {
 	/*
 	 * What a mess...
-- 
cgit v1.2.3


From 54ce1927eb787f7bbb7ee664841c8f5932703f39 Mon Sep 17 00:00:00 2001
From: Ira Weiny <ira.weiny@intel.com>
Date: Wed, 31 Jan 2024 15:55:38 -0800
Subject: cxl/cper: Fix errant CPER prints for CXL events

Jonathan reports that CXL CPER events dump an extra generic error
message.

	{1}[Hardware Error]: Hardware error from APEI Generic Hardware Error Source: 1
	{1}[Hardware Error]: event severity: recoverable
	{1}[Hardware Error]:  Error 0, type: recoverable
	{1}[Hardware Error]:   section type: unknown, fbcd0a77-c260-417f-85a9-088b1621eba6
	{1}[Hardware Error]:   section length: 0x90
	{1}[Hardware Error]:   00000000: 00000090 00000007 00000000 0d938086 ................
	{1}[Hardware Error]:   00000010: 00100000 00000000 00040000 00000000 ................
	...

CXL events were rerouted though the CXL subsystem for additional
processing.  However, when that work was done it was missed that
cper_estatus_print_section() continued with a generic error message
which is confusing.

Teach CPER print code to ignore printing details of some section types.
Assign the CXL event GUIDs to this set to prevent confusing unknown
prints.

Reported-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
Suggested-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
Signed-off-by: Ira Weiny <ira.weiny@intel.com>
Reviewed-by: Dave Jiang <dave.jiang@intel.com>
Reviewed-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
Reviewed-by: Alison Schofield <alison.schofield@intel.com>
Signed-off-by: Ard Biesheuvel <ardb@kernel.org>
---
 drivers/acpi/apei/ghes.c    | 26 --------------------------
 drivers/firmware/efi/cper.c | 19 +++++++++++++++++++
 include/linux/cper.h        | 23 +++++++++++++++++++++++
 3 files changed, 42 insertions(+), 26 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/acpi/apei/ghes.c b/drivers/acpi/apei/ghes.c
index 7b7c605166e0..fe825a432c5b 100644
--- a/drivers/acpi/apei/ghes.c
+++ b/drivers/acpi/apei/ghes.c
@@ -680,32 +680,6 @@ static void ghes_defer_non_standard_event(struct acpi_hest_generic_data *gdata,
 static DECLARE_RWSEM(cxl_cper_rw_sem);
 static cxl_cper_callback cper_callback;
 
-/* CXL Event record UUIDs are formatted as GUIDs and reported in section type */
-
-/*
- * General Media Event Record
- * CXL rev 3.0 Section 8.2.9.2.1.1; Table 8-43
- */
-#define CPER_SEC_CXL_GEN_MEDIA_GUID					\
-	GUID_INIT(0xfbcd0a77, 0xc260, 0x417f,				\
-		  0x85, 0xa9, 0x08, 0x8b, 0x16, 0x21, 0xeb, 0xa6)
-
-/*
- * DRAM Event Record
- * CXL rev 3.0 section 8.2.9.2.1.2; Table 8-44
- */
-#define CPER_SEC_CXL_DRAM_GUID						\
-	GUID_INIT(0x601dcbb3, 0x9c06, 0x4eab,				\
-		  0xb8, 0xaf, 0x4e, 0x9b, 0xfb, 0x5c, 0x96, 0x24)
-
-/*
- * Memory Module Event Record
- * CXL rev 3.0 section 8.2.9.2.1.3; Table 8-45
- */
-#define CPER_SEC_CXL_MEM_MODULE_GUID					\
-	GUID_INIT(0xfe927475, 0xdd59, 0x4339,				\
-		  0xa5, 0x86, 0x79, 0xba, 0xb1, 0x13, 0xb7, 0x74)
-
 static void cxl_cper_post_event(enum cxl_event_type event_type,
 				struct cxl_cper_event_rec *rec)
 {
diff --git a/drivers/firmware/efi/cper.c b/drivers/firmware/efi/cper.c
index 35c37f667781..9b3884ff81e6 100644
--- a/drivers/firmware/efi/cper.c
+++ b/drivers/firmware/efi/cper.c
@@ -523,6 +523,17 @@ static void cper_print_tstamp(const char *pfx,
 	}
 }
 
+struct ignore_section {
+	guid_t guid;
+	const char *name;
+};
+
+static const struct ignore_section ignore_sections[] = {
+	{ .guid = CPER_SEC_CXL_GEN_MEDIA_GUID, .name = "CXL General Media Event" },
+	{ .guid = CPER_SEC_CXL_DRAM_GUID, .name = "CXL DRAM Event" },
+	{ .guid = CPER_SEC_CXL_MEM_MODULE_GUID, .name = "CXL Memory Module Event" },
+};
+
 static void
 cper_estatus_print_section(const char *pfx, struct acpi_hest_generic_data *gdata,
 			   int sec_no)
@@ -543,6 +554,14 @@ cper_estatus_print_section(const char *pfx, struct acpi_hest_generic_data *gdata
 		printk("%s""fru_text: %.20s\n", pfx, gdata->fru_text);
 
 	snprintf(newpfx, sizeof(newpfx), "%s ", pfx);
+
+	for (int i = 0; i < ARRAY_SIZE(ignore_sections); i++) {
+		if (guid_equal(sec_type, &ignore_sections[i].guid)) {
+			printk("%ssection_type: %s\n", newpfx, ignore_sections[i].name);
+			return;
+		}
+	}
+
 	if (guid_equal(sec_type, &CPER_SEC_PROC_GENERIC)) {
 		struct cper_sec_proc_generic *proc_err = acpi_hest_get_payload(gdata);
 
diff --git a/include/linux/cper.h b/include/linux/cper.h
index c1a7dc325121..265b0f8fc0b3 100644
--- a/include/linux/cper.h
+++ b/include/linux/cper.h
@@ -90,6 +90,29 @@ enum {
 	GUID_INIT(0x667DD791, 0xC6B3, 0x4c27, 0x8A, 0x6B, 0x0F, 0x8E,	\
 		  0x72, 0x2D, 0xEB, 0x41)
 
+/* CXL Event record UUIDs are formatted as GUIDs and reported in section type */
+/*
+ * General Media Event Record
+ * CXL rev 3.0 Section 8.2.9.2.1.1; Table 8-43
+ */
+#define CPER_SEC_CXL_GEN_MEDIA_GUID					\
+	GUID_INIT(0xfbcd0a77, 0xc260, 0x417f,				\
+		  0x85, 0xa9, 0x08, 0x8b, 0x16, 0x21, 0xeb, 0xa6)
+/*
+ * DRAM Event Record
+ * CXL rev 3.0 section 8.2.9.2.1.2; Table 8-44
+ */
+#define CPER_SEC_CXL_DRAM_GUID						\
+	GUID_INIT(0x601dcbb3, 0x9c06, 0x4eab,				\
+		  0xb8, 0xaf, 0x4e, 0x9b, 0xfb, 0x5c, 0x96, 0x24)
+/*
+ * Memory Module Event Record
+ * CXL rev 3.0 section 8.2.9.2.1.3; Table 8-45
+ */
+#define CPER_SEC_CXL_MEM_MODULE_GUID					\
+	GUID_INIT(0xfe927475, 0xdd59, 0x4339,				\
+		  0xa5, 0x86, 0x79, 0xba, 0xb1, 0x13, 0xb7, 0x74)
+
 /*
  * Flags bits definitions for flags in struct cper_record_header
  * If set, the error has been recovered
-- 
cgit v1.2.3


From dad6a09f3148257ac1773cd90934d721d68ab595 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <frederic@kernel.org>
Date: Mon, 29 Jan 2024 15:56:36 -0800
Subject: hrtimer: Report offline hrtimer enqueue

The hrtimers migration on CPU-down hotplug process has been moved
earlier, before the CPU actually goes to die. This leaves a small window
of opportunity to queue an hrtimer in a blind spot, leaving it ignored.

For example a practical case has been reported with RCU waking up a
SCHED_FIFO task right before the CPUHP_AP_IDLE_DEAD stage, queuing that
way a sched/rt timer to the local offline CPU.

Make sure such situations never go unnoticed and warn when that happens.

Fixes: 5c0930ccaad5 ("hrtimers: Push pending hrtimers away from outgoing CPU earlier")
Reported-by: Paul E. McKenney <paulmck@kernel.org>
Signed-off-by: Frederic Weisbecker <frederic@kernel.org>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: stable@vger.kernel.org
Link: https://lore.kernel.org/r/20240129235646.3171983-4-boqun.feng@gmail.com
---
 include/linux/hrtimer.h | 4 +++-
 kernel/time/hrtimer.c   | 3 +++
 2 files changed, 6 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h
index 87e3bedf8eb0..641c4567cfa7 100644
--- a/include/linux/hrtimer.h
+++ b/include/linux/hrtimer.h
@@ -157,6 +157,7 @@ enum  hrtimer_base_type {
  * @max_hang_time:	Maximum time spent in hrtimer_interrupt
  * @softirq_expiry_lock: Lock which is taken while softirq based hrtimer are
  *			 expired
+ * @online:		CPU is online from an hrtimers point of view
  * @timer_waiters:	A hrtimer_cancel() invocation waits for the timer
  *			callback to finish.
  * @expires_next:	absolute time of the next event, is required for remote
@@ -179,7 +180,8 @@ struct hrtimer_cpu_base {
 	unsigned int			hres_active		: 1,
 					in_hrtirq		: 1,
 					hang_detected		: 1,
-					softirq_activated       : 1;
+					softirq_activated       : 1,
+					online			: 1;
 #ifdef CONFIG_HIGH_RES_TIMERS
 	unsigned int			nr_events;
 	unsigned short			nr_retries;
diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c
index 760793998cdd..edb0f821dcea 100644
--- a/kernel/time/hrtimer.c
+++ b/kernel/time/hrtimer.c
@@ -1085,6 +1085,7 @@ static int enqueue_hrtimer(struct hrtimer *timer,
 			   enum hrtimer_mode mode)
 {
 	debug_activate(timer, mode);
+	WARN_ON_ONCE(!base->cpu_base->online);
 
 	base->cpu_base->active_bases |= 1 << base->index;
 
@@ -2183,6 +2184,7 @@ int hrtimers_prepare_cpu(unsigned int cpu)
 	cpu_base->softirq_next_timer = NULL;
 	cpu_base->expires_next = KTIME_MAX;
 	cpu_base->softirq_expires_next = KTIME_MAX;
+	cpu_base->online = 1;
 	hrtimer_cpu_base_init_expiry_lock(cpu_base);
 	return 0;
 }
@@ -2250,6 +2252,7 @@ int hrtimers_cpu_dying(unsigned int dying_cpu)
 	smp_call_function_single(ncpu, retrigger_next_event, NULL, 0);
 
 	raw_spin_unlock(&new_base->lock);
+	old_base->online = 0;
 	raw_spin_unlock(&old_base->lock);
 
 	return 0;
-- 
cgit v1.2.3


From f814bdda774c183b0cc15ec8f3b6e7c6f4527ba5 Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Tue, 23 Jan 2024 18:58:26 +0100
Subject: blk-wbt: Fix detection of dirty-throttled tasks

The detection of dirty-throttled tasks in blk-wbt has been subtly broken
since its beginning in 2016. Namely if we are doing cgroup writeback and
the throttled task is not in the root cgroup, balance_dirty_pages() will
set dirty_sleep for the non-root bdi_writeback structure. However
blk-wbt checks dirty_sleep only in the root cgroup bdi_writeback
structure. Thus detection of recently throttled tasks is not working in
this case (we noticed this when we switched to cgroup v2 and suddently
writeback was slow).

Since blk-wbt has no easy way to get to proper bdi_writeback and
furthermore its intention has always been to work on the whole device
rather than on individual cgroups, just move the dirty_sleep timestamp
from bdi_writeback to backing_dev_info. That fixes the checking for
recently throttled task and saves memory for everybody as a bonus.

CC: stable@vger.kernel.org
Fixes: b57d74aff9ab ("writeback: track if we're sleeping on progress in balance_dirty_pages()")
Signed-off-by: Jan Kara <jack@suse.cz>
Link: https://lore.kernel.org/r/20240123175826.21452-1-jack@suse.cz
[axboe: fixup indentation errors]
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/blk-wbt.c                  | 4 ++--
 include/linux/backing-dev-defs.h | 7 +++++--
 mm/backing-dev.c                 | 2 +-
 mm/page-writeback.c              | 2 +-
 4 files changed, 9 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/block/blk-wbt.c b/block/blk-wbt.c
index 5ba3cd574eac..0c0e270a8265 100644
--- a/block/blk-wbt.c
+++ b/block/blk-wbt.c
@@ -163,9 +163,9 @@ static void wb_timestamp(struct rq_wb *rwb, unsigned long *var)
  */
 static bool wb_recent_wait(struct rq_wb *rwb)
 {
-	struct bdi_writeback *wb = &rwb->rqos.disk->bdi->wb;
+	struct backing_dev_info *bdi = rwb->rqos.disk->bdi;
 
-	return time_before(jiffies, wb->dirty_sleep + HZ);
+	return time_before(jiffies, bdi->last_bdp_sleep + HZ);
 }
 
 static inline struct rq_wait *get_rq_wait(struct rq_wb *rwb,
diff --git a/include/linux/backing-dev-defs.h b/include/linux/backing-dev-defs.h
index ae12696ec492..2ad261082bba 100644
--- a/include/linux/backing-dev-defs.h
+++ b/include/linux/backing-dev-defs.h
@@ -141,8 +141,6 @@ struct bdi_writeback {
 	struct delayed_work dwork;	/* work item used for writeback */
 	struct delayed_work bw_dwork;	/* work item used for bandwidth estimate */
 
-	unsigned long dirty_sleep;	/* last wait */
-
 	struct list_head bdi_node;	/* anchored at bdi->wb_list */
 
 #ifdef CONFIG_CGROUP_WRITEBACK
@@ -179,6 +177,11 @@ struct backing_dev_info {
 	 * any dirty wbs, which is depended upon by bdi_has_dirty().
 	 */
 	atomic_long_t tot_write_bandwidth;
+	/*
+	 * Jiffies when last process was dirty throttled on this bdi. Used by
+	 * blk-wbt.
+	 */
+	unsigned long last_bdp_sleep;
 
 	struct bdi_writeback wb;  /* the root writeback info for this bdi */
 	struct list_head wb_list; /* list of all wbs */
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index 1e3447bccdb1..e039d05304dd 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -436,7 +436,6 @@ static int wb_init(struct bdi_writeback *wb, struct backing_dev_info *bdi,
 	INIT_LIST_HEAD(&wb->work_list);
 	INIT_DELAYED_WORK(&wb->dwork, wb_workfn);
 	INIT_DELAYED_WORK(&wb->bw_dwork, wb_update_bandwidth_workfn);
-	wb->dirty_sleep = jiffies;
 
 	err = fprop_local_init_percpu(&wb->completions, gfp);
 	if (err)
@@ -921,6 +920,7 @@ int bdi_init(struct backing_dev_info *bdi)
 	INIT_LIST_HEAD(&bdi->bdi_list);
 	INIT_LIST_HEAD(&bdi->wb_list);
 	init_waitqueue_head(&bdi->wb_waitq);
+	bdi->last_bdp_sleep = jiffies;
 
 	return cgwb_bdi_init(bdi);
 }
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index cd4e4ae77c40..cc37fa7f3364 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -1921,7 +1921,7 @@ pause:
 			break;
 		}
 		__set_current_state(TASK_KILLABLE);
-		wb->dirty_sleep = now;
+		bdi->last_bdp_sleep = jiffies;
 		io_schedule_timeout(pause);
 
 		current->dirty_paused_when = now + pause;
-- 
cgit v1.2.3


From cd7d469c25704d414d71bf3644f163fb74e7996b Mon Sep 17 00:00:00 2001
From: Xiubo Li <xiubli@redhat.com>
Date: Fri, 13 Oct 2023 13:55:44 +0800
Subject: libceph: fail sparse-read if the data length doesn't match

Once this happens that means there have bugs.

Signed-off-by: Xiubo Li <xiubli@redhat.com>
Reviewed-by: Jeff Layton <jlayton@kernel.org>
Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
---
 include/linux/ceph/osd_client.h |  3 ++-
 net/ceph/osd_client.c           | 18 +++++++++++++++---
 2 files changed, 17 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h
index fa018d5864e7..f66f6aac74f6 100644
--- a/include/linux/ceph/osd_client.h
+++ b/include/linux/ceph/osd_client.h
@@ -45,6 +45,7 @@ enum ceph_sparse_read_state {
 	CEPH_SPARSE_READ_HDR	= 0,
 	CEPH_SPARSE_READ_EXTENTS,
 	CEPH_SPARSE_READ_DATA_LEN,
+	CEPH_SPARSE_READ_DATA_PRE,
 	CEPH_SPARSE_READ_DATA,
 };
 
@@ -64,7 +65,7 @@ struct ceph_sparse_read {
 	u64				sr_req_len;  /* orig request length */
 	u64				sr_pos;      /* current pos in buffer */
 	int				sr_index;    /* current extent index */
-	__le32				sr_datalen;  /* length of actual data */
+	u32				sr_datalen;  /* length of actual data */
 	u32				sr_count;    /* extent count in reply */
 	int				sr_ext_len;  /* length of extent array */
 	struct ceph_sparse_extent	*sr_extent;  /* extent array */
diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c
index 625622016f57..2cea35e4ff8e 100644
--- a/net/ceph/osd_client.c
+++ b/net/ceph/osd_client.c
@@ -5857,8 +5857,8 @@ static int osd_sparse_read(struct ceph_connection *con,
 	struct ceph_osd *o = con->private;
 	struct ceph_sparse_read *sr = &o->o_sparse_read;
 	u32 count = sr->sr_count;
-	u64 eoff, elen;
-	int ret;
+	u64 eoff, elen, len = 0;
+	int i, ret;
 
 	switch (sr->sr_state) {
 	case CEPH_SPARSE_READ_HDR:
@@ -5903,8 +5903,20 @@ next_op:
 		convert_extent_map(sr);
 		ret = sizeof(sr->sr_datalen);
 		*pbuf = (char *)&sr->sr_datalen;
-		sr->sr_state = CEPH_SPARSE_READ_DATA;
+		sr->sr_state = CEPH_SPARSE_READ_DATA_PRE;
 		break;
+	case CEPH_SPARSE_READ_DATA_PRE:
+		/* Convert sr_datalen to host-endian */
+		sr->sr_datalen = le32_to_cpu((__force __le32)sr->sr_datalen);
+		for (i = 0; i < count; i++)
+			len += sr->sr_extent[i].len;
+		if (sr->sr_datalen != len) {
+			pr_warn_ratelimited("data len %u != extent len %llu\n",
+					    sr->sr_datalen, len);
+			return -EREMOTEIO;
+		}
+		sr->sr_state = CEPH_SPARSE_READ_DATA;
+		fallthrough;
 	case CEPH_SPARSE_READ_DATA:
 		if (sr->sr_index >= count) {
 			sr->sr_state = CEPH_SPARSE_READ_HDR;
-- 
cgit v1.2.3


From 8e46a2d068c92a905d01cbb018b00d66991585ab Mon Sep 17 00:00:00 2001
From: Xiubo Li <xiubli@redhat.com>
Date: Thu, 14 Dec 2023 16:01:03 +0800
Subject: libceph: just wait for more data to be available on the socket

A short read may occur while reading the message footer from the
socket.  Later, when the socket is ready for another read, the
messenger invokes all read_partial_*() handlers, including
read_partial_sparse_msg_data().  The expectation is that
read_partial_sparse_msg_data() would bail, allowing the messenger to
invoke read_partial() for the footer and pick up where it left off.

However read_partial_sparse_msg_data() violates that and ends up
calling into the state machine in the OSD client.  The sparse-read
state machine assumes that it's a new op and interprets some piece of
the footer as the sparse-read header and returns bogus extents/data
length, etc.

To determine whether read_partial_sparse_msg_data() should bail, let's
reuse cursor->total_resid.  Because once it reaches to zero that means
all the extents and data have been successfully received in last read,
else it could break out when partially reading any of the extents and
data.  And then osd_sparse_read() could continue where it left off.

[ idryomov: changelog ]

Link: https://tracker.ceph.com/issues/63586
Fixes: d396f89db39a ("libceph: add sparse read support to msgr1")
Signed-off-by: Xiubo Li <xiubli@redhat.com>
Reviewed-by: Jeff Layton <jlayton@kernel.org>
Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
---
 include/linux/ceph/messenger.h |  2 +-
 net/ceph/messenger_v1.c        | 25 +++++++++++++------------
 net/ceph/messenger_v2.c        |  4 ++--
 net/ceph/osd_client.c          |  9 +++------
 4 files changed, 19 insertions(+), 21 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/ceph/messenger.h b/include/linux/ceph/messenger.h
index 2eaaabbe98cb..1717cc57cdac 100644
--- a/include/linux/ceph/messenger.h
+++ b/include/linux/ceph/messenger.h
@@ -283,7 +283,7 @@ struct ceph_msg {
 	struct kref kref;
 	bool more_to_follow;
 	bool needs_out_seq;
-	bool sparse_read;
+	u64 sparse_read_total;
 	int front_alloc_len;
 
 	struct ceph_msgpool *pool;
diff --git a/net/ceph/messenger_v1.c b/net/ceph/messenger_v1.c
index 4cb60bacf5f5..0cb61c76b9b8 100644
--- a/net/ceph/messenger_v1.c
+++ b/net/ceph/messenger_v1.c
@@ -160,8 +160,9 @@ static size_t sizeof_footer(struct ceph_connection *con)
 static void prepare_message_data(struct ceph_msg *msg, u32 data_len)
 {
 	/* Initialize data cursor if it's not a sparse read */
-	if (!msg->sparse_read)
-		ceph_msg_data_cursor_init(&msg->cursor, msg, data_len);
+	u64 len = msg->sparse_read_total ? : data_len;
+
+	ceph_msg_data_cursor_init(&msg->cursor, msg, len);
 }
 
 /*
@@ -1036,7 +1037,7 @@ static int read_partial_sparse_msg_data(struct ceph_connection *con)
 	if (do_datacrc)
 		crc = con->in_data_crc;
 
-	do {
+	while (cursor->total_resid) {
 		if (con->v1.in_sr_kvec.iov_base)
 			ret = read_partial_message_chunk(con,
 							 &con->v1.in_sr_kvec,
@@ -1044,23 +1045,23 @@ static int read_partial_sparse_msg_data(struct ceph_connection *con)
 							 &crc);
 		else if (cursor->sr_resid > 0)
 			ret = read_partial_sparse_msg_extent(con, &crc);
-
-		if (ret <= 0) {
-			if (do_datacrc)
-				con->in_data_crc = crc;
-			return ret;
-		}
+		if (ret <= 0)
+			break;
 
 		memset(&con->v1.in_sr_kvec, 0, sizeof(con->v1.in_sr_kvec));
 		ret = con->ops->sparse_read(con, cursor,
 				(char **)&con->v1.in_sr_kvec.iov_base);
+		if (ret <= 0) {
+			ret = ret ? ret : 1;  /* must return > 0 to indicate success */
+			break;
+		}
 		con->v1.in_sr_len = ret;
-	} while (ret > 0);
+	}
 
 	if (do_datacrc)
 		con->in_data_crc = crc;
 
-	return ret < 0 ? ret : 1;  /* must return > 0 to indicate success */
+	return ret;
 }
 
 static int read_partial_msg_data(struct ceph_connection *con)
@@ -1253,7 +1254,7 @@ static int read_partial_message(struct ceph_connection *con)
 		if (!m->num_data_items)
 			return -EIO;
 
-		if (m->sparse_read)
+		if (m->sparse_read_total)
 			ret = read_partial_sparse_msg_data(con);
 		else if (ceph_test_opt(from_msgr(con->msgr), RXBOUNCE))
 			ret = read_partial_msg_data_bounce(con);
diff --git a/net/ceph/messenger_v2.c b/net/ceph/messenger_v2.c
index f8ec60e1aba3..a0ca5414b333 100644
--- a/net/ceph/messenger_v2.c
+++ b/net/ceph/messenger_v2.c
@@ -1128,7 +1128,7 @@ static int decrypt_tail(struct ceph_connection *con)
 	struct sg_table enc_sgt = {};
 	struct sg_table sgt = {};
 	struct page **pages = NULL;
-	bool sparse = con->in_msg->sparse_read;
+	bool sparse = !!con->in_msg->sparse_read_total;
 	int dpos = 0;
 	int tail_len;
 	int ret;
@@ -2060,7 +2060,7 @@ static int prepare_read_tail_plain(struct ceph_connection *con)
 	}
 
 	if (data_len(msg)) {
-		if (msg->sparse_read)
+		if (msg->sparse_read_total)
 			con->v2.in_state = IN_S_PREPARE_SPARSE_DATA;
 		else
 			con->v2.in_state = IN_S_PREPARE_READ_DATA;
diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c
index 2cea35e4ff8e..9d078b37fe0b 100644
--- a/net/ceph/osd_client.c
+++ b/net/ceph/osd_client.c
@@ -5510,7 +5510,7 @@ static struct ceph_msg *get_reply(struct ceph_connection *con,
 	}
 
 	m = ceph_msg_get(req->r_reply);
-	m->sparse_read = (bool)srlen;
+	m->sparse_read_total = srlen;
 
 	dout("get_reply tid %lld %p\n", tid, m);
 
@@ -5777,11 +5777,8 @@ static int prep_next_sparse_read(struct ceph_connection *con,
 	}
 
 	if (o->o_sparse_op_idx < 0) {
-		u64 srlen = sparse_data_requested(req);
-
-		dout("%s: [%d] starting new sparse read req. srlen=0x%llx\n",
-		     __func__, o->o_osd, srlen);
-		ceph_msg_data_cursor_init(cursor, con->in_msg, srlen);
+		dout("%s: [%d] starting new sparse read req\n",
+		     __func__, o->o_osd);
 	} else {
 		u64 end;
 
-- 
cgit v1.2.3


From 4356e9f841f7fbb945521cef3577ba394c65f3fc Mon Sep 17 00:00:00 2001
From: Linus Torvalds <torvalds@linux-foundation.org>
Date: Fri, 9 Feb 2024 12:39:31 -0800
Subject: work around gcc bugs with 'asm goto' with outputs

We've had issues with gcc and 'asm goto' before, and we created a
'asm_volatile_goto()' macro for that in the past: see commits
3f0116c3238a ("compiler/gcc4: Add quirk for 'asm goto' miscompilation
bug") and a9f180345f53 ("compiler/gcc4: Make quirk for
asm_volatile_goto() unconditional").

Then, much later, we ended up removing the workaround in commit
43c249ea0b1e ("compiler-gcc.h: remove ancient workaround for gcc PR
58670") because we no longer supported building the kernel with the
affected gcc versions, but we left the macro uses around.

Now, Sean Christopherson reports a new version of a very similar
problem, which is fixed by re-applying that ancient workaround.  But the
problem in question is limited to only the 'asm goto with outputs'
cases, so instead of re-introducing the old workaround as-is, let's
rename and limit the workaround to just that much less common case.

It looks like there are at least two separate issues that all hit in
this area:

 (a) some versions of gcc don't mark the asm goto as 'volatile' when it
     has outputs:

        https://gcc.gnu.org/bugzilla/show_bug.cgi?id=98619
        https://gcc.gnu.org/bugzilla/show_bug.cgi?id=110420

     which is easy to work around by just adding the 'volatile' by hand.

 (b) Internal compiler errors:

        https://gcc.gnu.org/bugzilla/show_bug.cgi?id=110422

     which are worked around by adding the extra empty 'asm' as a
     barrier, as in the original workaround.

but the problem Sean sees may be a third thing since it involves bad
code generation (not an ICE) even with the manually added 'volatile'.

but the same old workaround works for this case, even if this feels a
bit like voodoo programming and may only be hiding the issue.

Reported-and-tested-by: Sean Christopherson <seanjc@google.com>
Link: https://lore.kernel.org/all/20240208220604.140859-1-seanjc@google.com/
Cc: Nick Desaulniers <ndesaulniers@google.com>
Cc: Uros Bizjak <ubizjak@gmail.com>
Cc: Jakub Jelinek <jakub@redhat.com>
Cc: Andrew Pinski <quic_apinski@quicinc.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/arc/include/asm/jump_label.h           |  4 ++--
 arch/arm/include/asm/jump_label.h           |  4 ++--
 arch/arm64/include/asm/alternative-macros.h |  4 ++--
 arch/arm64/include/asm/jump_label.h         |  4 ++--
 arch/csky/include/asm/jump_label.h          |  4 ++--
 arch/loongarch/include/asm/jump_label.h     |  4 ++--
 arch/mips/include/asm/jump_label.h          |  4 ++--
 arch/parisc/include/asm/jump_label.h        |  4 ++--
 arch/powerpc/include/asm/jump_label.h       |  4 ++--
 arch/powerpc/include/asm/uaccess.h          | 12 ++++++------
 arch/powerpc/kernel/irq_64.c                |  2 +-
 arch/riscv/include/asm/arch_hweight.h       |  4 ++--
 arch/riscv/include/asm/bitops.h             |  8 ++++----
 arch/riscv/include/asm/checksum.h           |  2 +-
 arch/riscv/include/asm/cpufeature.h         |  4 ++--
 arch/riscv/include/asm/jump_label.h         |  4 ++--
 arch/riscv/lib/csum.c                       | 10 +++++-----
 arch/s390/include/asm/jump_label.h          |  4 ++--
 arch/sparc/include/asm/jump_label.h         |  4 ++--
 arch/um/include/asm/cpufeature.h            |  2 +-
 arch/x86/include/asm/cpufeature.h           |  2 +-
 arch/x86/include/asm/jump_label.h           |  6 +++---
 arch/x86/include/asm/rmwcc.h                |  2 +-
 arch/x86/include/asm/special_insns.h        |  2 +-
 arch/x86/include/asm/uaccess.h              | 10 +++++-----
 arch/x86/kvm/svm/svm_ops.h                  |  6 +++---
 arch/x86/kvm/vmx/vmx.c                      |  4 ++--
 arch/x86/kvm/vmx/vmx_ops.h                  |  6 +++---
 arch/xtensa/include/asm/jump_label.h        |  4 ++--
 include/linux/compiler-gcc.h                | 19 +++++++++++++++++++
 include/linux/compiler_types.h              |  4 ++--
 net/netfilter/nft_set_pipapo_avx2.c         |  2 +-
 samples/bpf/asm_goto_workaround.h           |  8 ++++----
 tools/arch/x86/include/asm/rmwcc.h          |  2 +-
 tools/include/linux/compiler_types.h        |  4 ++--
 35 files changed, 96 insertions(+), 77 deletions(-)

(limited to 'include/linux')

diff --git a/arch/arc/include/asm/jump_label.h b/arch/arc/include/asm/jump_label.h
index 9d9618079739..a339223d9e05 100644
--- a/arch/arc/include/asm/jump_label.h
+++ b/arch/arc/include/asm/jump_label.h
@@ -31,7 +31,7 @@
 static __always_inline bool arch_static_branch(struct static_key *key,
 					       bool branch)
 {
-	asm_volatile_goto(".balign "__stringify(JUMP_LABEL_NOP_SIZE)"	\n"
+	asm goto(".balign "__stringify(JUMP_LABEL_NOP_SIZE)"		\n"
 		 "1:							\n"
 		 "nop							\n"
 		 ".pushsection __jump_table, \"aw\"			\n"
@@ -47,7 +47,7 @@ l_yes:
 static __always_inline bool arch_static_branch_jump(struct static_key *key,
 						    bool branch)
 {
-	asm_volatile_goto(".balign "__stringify(JUMP_LABEL_NOP_SIZE)"	\n"
+	asm goto(".balign "__stringify(JUMP_LABEL_NOP_SIZE)"		\n"
 		 "1:							\n"
 		 "b %l[l_yes]						\n"
 		 ".pushsection __jump_table, \"aw\"			\n"
diff --git a/arch/arm/include/asm/jump_label.h b/arch/arm/include/asm/jump_label.h
index e12d7d096fc0..e4eb54f6cd9f 100644
--- a/arch/arm/include/asm/jump_label.h
+++ b/arch/arm/include/asm/jump_label.h
@@ -11,7 +11,7 @@
 
 static __always_inline bool arch_static_branch(struct static_key *key, bool branch)
 {
-	asm_volatile_goto("1:\n\t"
+	asm goto("1:\n\t"
 		 WASM(nop) "\n\t"
 		 ".pushsection __jump_table,  \"aw\"\n\t"
 		 ".word 1b, %l[l_yes], %c0\n\t"
@@ -25,7 +25,7 @@ l_yes:
 
 static __always_inline bool arch_static_branch_jump(struct static_key *key, bool branch)
 {
-	asm_volatile_goto("1:\n\t"
+	asm goto("1:\n\t"
 		 WASM(b) " %l[l_yes]\n\t"
 		 ".pushsection __jump_table,  \"aw\"\n\t"
 		 ".word 1b, %l[l_yes], %c0\n\t"
diff --git a/arch/arm64/include/asm/alternative-macros.h b/arch/arm64/include/asm/alternative-macros.h
index 210bb43cff2c..d328f549b1a6 100644
--- a/arch/arm64/include/asm/alternative-macros.h
+++ b/arch/arm64/include/asm/alternative-macros.h
@@ -229,7 +229,7 @@ alternative_has_cap_likely(const unsigned long cpucap)
 	if (!cpucap_is_possible(cpucap))
 		return false;
 
-	asm_volatile_goto(
+	asm goto(
 	ALTERNATIVE_CB("b	%l[l_no]", %[cpucap], alt_cb_patch_nops)
 	:
 	: [cpucap] "i" (cpucap)
@@ -247,7 +247,7 @@ alternative_has_cap_unlikely(const unsigned long cpucap)
 	if (!cpucap_is_possible(cpucap))
 		return false;
 
-	asm_volatile_goto(
+	asm goto(
 	ALTERNATIVE("nop", "b	%l[l_yes]", %[cpucap])
 	:
 	: [cpucap] "i" (cpucap)
diff --git a/arch/arm64/include/asm/jump_label.h b/arch/arm64/include/asm/jump_label.h
index 48ddc0f45d22..6aafbb789991 100644
--- a/arch/arm64/include/asm/jump_label.h
+++ b/arch/arm64/include/asm/jump_label.h
@@ -18,7 +18,7 @@
 static __always_inline bool arch_static_branch(struct static_key * const key,
 					       const bool branch)
 {
-	asm_volatile_goto(
+	asm goto(
 		"1:	nop					\n\t"
 		 "	.pushsection	__jump_table, \"aw\"	\n\t"
 		 "	.align		3			\n\t"
@@ -35,7 +35,7 @@ l_yes:
 static __always_inline bool arch_static_branch_jump(struct static_key * const key,
 						    const bool branch)
 {
-	asm_volatile_goto(
+	asm goto(
 		"1:	b		%l[l_yes]		\n\t"
 		 "	.pushsection	__jump_table, \"aw\"	\n\t"
 		 "	.align		3			\n\t"
diff --git a/arch/csky/include/asm/jump_label.h b/arch/csky/include/asm/jump_label.h
index 98a3f4b168bd..ef2e37a10a0f 100644
--- a/arch/csky/include/asm/jump_label.h
+++ b/arch/csky/include/asm/jump_label.h
@@ -12,7 +12,7 @@
 static __always_inline bool arch_static_branch(struct static_key *key,
 					       bool branch)
 {
-	asm_volatile_goto(
+	asm goto(
 		"1:	nop32					\n"
 		"	.pushsection	__jump_table, \"aw\"	\n"
 		"	.align		2			\n"
@@ -29,7 +29,7 @@ label:
 static __always_inline bool arch_static_branch_jump(struct static_key *key,
 						    bool branch)
 {
-	asm_volatile_goto(
+	asm goto(
 		"1:	bsr32		%l[label]		\n"
 		"	.pushsection	__jump_table, \"aw\"	\n"
 		"	.align		2			\n"
diff --git a/arch/loongarch/include/asm/jump_label.h b/arch/loongarch/include/asm/jump_label.h
index 3cea299a5ef5..29acfe3de3fa 100644
--- a/arch/loongarch/include/asm/jump_label.h
+++ b/arch/loongarch/include/asm/jump_label.h
@@ -22,7 +22,7 @@
 
 static __always_inline bool arch_static_branch(struct static_key * const key, const bool branch)
 {
-	asm_volatile_goto(
+	asm goto(
 		"1:	nop			\n\t"
 		JUMP_TABLE_ENTRY
 		:  :  "i"(&((char *)key)[branch]) :  : l_yes);
@@ -35,7 +35,7 @@ l_yes:
 
 static __always_inline bool arch_static_branch_jump(struct static_key * const key, const bool branch)
 {
-	asm_volatile_goto(
+	asm goto(
 		"1:	b	%l[l_yes]	\n\t"
 		JUMP_TABLE_ENTRY
 		:  :  "i"(&((char *)key)[branch]) :  : l_yes);
diff --git a/arch/mips/include/asm/jump_label.h b/arch/mips/include/asm/jump_label.h
index 081be98c71ef..ff5d388502d4 100644
--- a/arch/mips/include/asm/jump_label.h
+++ b/arch/mips/include/asm/jump_label.h
@@ -39,7 +39,7 @@ extern void jump_label_apply_nops(struct module *mod);
 
 static __always_inline bool arch_static_branch(struct static_key *key, bool branch)
 {
-	asm_volatile_goto("1:\t" B_INSN " 2f\n\t"
+	asm goto("1:\t" B_INSN " 2f\n\t"
 		"2:\t.insn\n\t"
 		".pushsection __jump_table,  \"aw\"\n\t"
 		WORD_INSN " 1b, %l[l_yes], %0\n\t"
@@ -53,7 +53,7 @@ l_yes:
 
 static __always_inline bool arch_static_branch_jump(struct static_key *key, bool branch)
 {
-	asm_volatile_goto("1:\t" J_INSN " %l[l_yes]\n\t"
+	asm goto("1:\t" J_INSN " %l[l_yes]\n\t"
 		".pushsection __jump_table,  \"aw\"\n\t"
 		WORD_INSN " 1b, %l[l_yes], %0\n\t"
 		".popsection\n\t"
diff --git a/arch/parisc/include/asm/jump_label.h b/arch/parisc/include/asm/jump_label.h
index 94428798b6aa..317ebc5edc9f 100644
--- a/arch/parisc/include/asm/jump_label.h
+++ b/arch/parisc/include/asm/jump_label.h
@@ -12,7 +12,7 @@
 
 static __always_inline bool arch_static_branch(struct static_key *key, bool branch)
 {
-	asm_volatile_goto("1:\n\t"
+	asm goto("1:\n\t"
 		 "nop\n\t"
 		 ".pushsection __jump_table,  \"aw\"\n\t"
 		 ".align %1\n\t"
@@ -29,7 +29,7 @@ l_yes:
 
 static __always_inline bool arch_static_branch_jump(struct static_key *key, bool branch)
 {
-	asm_volatile_goto("1:\n\t"
+	asm goto("1:\n\t"
 		 "b,n %l[l_yes]\n\t"
 		 ".pushsection __jump_table,  \"aw\"\n\t"
 		 ".align %1\n\t"
diff --git a/arch/powerpc/include/asm/jump_label.h b/arch/powerpc/include/asm/jump_label.h
index 93ce3ec25387..2f2a86ed2280 100644
--- a/arch/powerpc/include/asm/jump_label.h
+++ b/arch/powerpc/include/asm/jump_label.h
@@ -17,7 +17,7 @@
 
 static __always_inline bool arch_static_branch(struct static_key *key, bool branch)
 {
-	asm_volatile_goto("1:\n\t"
+	asm goto("1:\n\t"
 		 "nop # arch_static_branch\n\t"
 		 ".pushsection __jump_table,  \"aw\"\n\t"
 		 ".long 1b - ., %l[l_yes] - .\n\t"
@@ -32,7 +32,7 @@ l_yes:
 
 static __always_inline bool arch_static_branch_jump(struct static_key *key, bool branch)
 {
-	asm_volatile_goto("1:\n\t"
+	asm goto("1:\n\t"
 		 "b %l[l_yes] # arch_static_branch_jump\n\t"
 		 ".pushsection __jump_table,  \"aw\"\n\t"
 		 ".long 1b - ., %l[l_yes] - .\n\t"
diff --git a/arch/powerpc/include/asm/uaccess.h b/arch/powerpc/include/asm/uaccess.h
index f1f9890f50d3..de10437fd206 100644
--- a/arch/powerpc/include/asm/uaccess.h
+++ b/arch/powerpc/include/asm/uaccess.h
@@ -74,7 +74,7 @@ __pu_failed:							\
 /* -mprefixed can generate offsets beyond range, fall back hack */
 #ifdef CONFIG_PPC_KERNEL_PREFIXED
 #define __put_user_asm_goto(x, addr, label, op)			\
-	asm_volatile_goto(					\
+	asm goto(					\
 		"1:	" op " %0,0(%1)	# put_user\n"		\
 		EX_TABLE(1b, %l2)				\
 		:						\
@@ -83,7 +83,7 @@ __pu_failed:							\
 		: label)
 #else
 #define __put_user_asm_goto(x, addr, label, op)			\
-	asm_volatile_goto(					\
+	asm goto(					\
 		"1:	" op "%U1%X1 %0,%1	# put_user\n"	\
 		EX_TABLE(1b, %l2)				\
 		:						\
@@ -97,7 +97,7 @@ __pu_failed:							\
 	__put_user_asm_goto(x, ptr, label, "std")
 #else /* __powerpc64__ */
 #define __put_user_asm2_goto(x, addr, label)			\
-	asm_volatile_goto(					\
+	asm goto(					\
 		"1:	stw%X1 %0, %1\n"			\
 		"2:	stw%X1 %L0, %L1\n"			\
 		EX_TABLE(1b, %l2)				\
@@ -146,7 +146,7 @@ do {								\
 /* -mprefixed can generate offsets beyond range, fall back hack */
 #ifdef CONFIG_PPC_KERNEL_PREFIXED
 #define __get_user_asm_goto(x, addr, label, op)			\
-	asm_volatile_goto(					\
+	asm_goto_output(					\
 		"1:	"op" %0,0(%1)	# get_user\n"		\
 		EX_TABLE(1b, %l2)				\
 		: "=r" (x)					\
@@ -155,7 +155,7 @@ do {								\
 		: label)
 #else
 #define __get_user_asm_goto(x, addr, label, op)			\
-	asm_volatile_goto(					\
+	asm_goto_output(					\
 		"1:	"op"%U1%X1 %0, %1	# get_user\n"	\
 		EX_TABLE(1b, %l2)				\
 		: "=r" (x)					\
@@ -169,7 +169,7 @@ do {								\
 	__get_user_asm_goto(x, addr, label, "ld")
 #else /* __powerpc64__ */
 #define __get_user_asm2_goto(x, addr, label)			\
-	asm_volatile_goto(					\
+	asm_goto_output(					\
 		"1:	lwz%X1 %0, %1\n"			\
 		"2:	lwz%X1 %L0, %L1\n"			\
 		EX_TABLE(1b, %l2)				\
diff --git a/arch/powerpc/kernel/irq_64.c b/arch/powerpc/kernel/irq_64.c
index 938e66829eae..d5c48d1b0a31 100644
--- a/arch/powerpc/kernel/irq_64.c
+++ b/arch/powerpc/kernel/irq_64.c
@@ -230,7 +230,7 @@ again:
 	 * This allows interrupts to be unmasked without hard disabling, and
 	 * also without new hard interrupts coming in ahead of pending ones.
 	 */
-	asm_volatile_goto(
+	asm goto(
 "1:					\n"
 "		lbz	9,%0(13)	\n"
 "		cmpwi	9,0		\n"
diff --git a/arch/riscv/include/asm/arch_hweight.h b/arch/riscv/include/asm/arch_hweight.h
index c20236a0725b..85b2c443823e 100644
--- a/arch/riscv/include/asm/arch_hweight.h
+++ b/arch/riscv/include/asm/arch_hweight.h
@@ -20,7 +20,7 @@
 static __always_inline unsigned int __arch_hweight32(unsigned int w)
 {
 #ifdef CONFIG_RISCV_ISA_ZBB
-	asm_volatile_goto(ALTERNATIVE("j %l[legacy]", "nop", 0,
+	asm goto(ALTERNATIVE("j %l[legacy]", "nop", 0,
 				      RISCV_ISA_EXT_ZBB, 1)
 			  : : : : legacy);
 
@@ -51,7 +51,7 @@ static inline unsigned int __arch_hweight8(unsigned int w)
 static __always_inline unsigned long __arch_hweight64(__u64 w)
 {
 # ifdef CONFIG_RISCV_ISA_ZBB
-	asm_volatile_goto(ALTERNATIVE("j %l[legacy]", "nop", 0,
+	asm goto(ALTERNATIVE("j %l[legacy]", "nop", 0,
 				      RISCV_ISA_EXT_ZBB, 1)
 			  : : : : legacy);
 
diff --git a/arch/riscv/include/asm/bitops.h b/arch/riscv/include/asm/bitops.h
index 9ffc35537024..329d8244a9b3 100644
--- a/arch/riscv/include/asm/bitops.h
+++ b/arch/riscv/include/asm/bitops.h
@@ -39,7 +39,7 @@ static __always_inline unsigned long variable__ffs(unsigned long word)
 {
 	int num;
 
-	asm_volatile_goto(ALTERNATIVE("j %l[legacy]", "nop", 0,
+	asm goto(ALTERNATIVE("j %l[legacy]", "nop", 0,
 				      RISCV_ISA_EXT_ZBB, 1)
 			  : : : : legacy);
 
@@ -95,7 +95,7 @@ static __always_inline unsigned long variable__fls(unsigned long word)
 {
 	int num;
 
-	asm_volatile_goto(ALTERNATIVE("j %l[legacy]", "nop", 0,
+	asm goto(ALTERNATIVE("j %l[legacy]", "nop", 0,
 				      RISCV_ISA_EXT_ZBB, 1)
 			  : : : : legacy);
 
@@ -154,7 +154,7 @@ static __always_inline int variable_ffs(int x)
 	if (!x)
 		return 0;
 
-	asm_volatile_goto(ALTERNATIVE("j %l[legacy]", "nop", 0,
+	asm goto(ALTERNATIVE("j %l[legacy]", "nop", 0,
 				      RISCV_ISA_EXT_ZBB, 1)
 			  : : : : legacy);
 
@@ -209,7 +209,7 @@ static __always_inline int variable_fls(unsigned int x)
 	if (!x)
 		return 0;
 
-	asm_volatile_goto(ALTERNATIVE("j %l[legacy]", "nop", 0,
+	asm goto(ALTERNATIVE("j %l[legacy]", "nop", 0,
 				      RISCV_ISA_EXT_ZBB, 1)
 			  : : : : legacy);
 
diff --git a/arch/riscv/include/asm/checksum.h b/arch/riscv/include/asm/checksum.h
index a5b60b54b101..88e6f1499e88 100644
--- a/arch/riscv/include/asm/checksum.h
+++ b/arch/riscv/include/asm/checksum.h
@@ -53,7 +53,7 @@ static inline __sum16 ip_fast_csum(const void *iph, unsigned int ihl)
 	    IS_ENABLED(CONFIG_RISCV_ALTERNATIVE)) {
 		unsigned long fold_temp;
 
-		asm_volatile_goto(ALTERNATIVE("j %l[no_zbb]", "nop", 0,
+		asm goto(ALTERNATIVE("j %l[no_zbb]", "nop", 0,
 					      RISCV_ISA_EXT_ZBB, 1)
 		    :
 		    :
diff --git a/arch/riscv/include/asm/cpufeature.h b/arch/riscv/include/asm/cpufeature.h
index 5a626ed2c47a..0bd11862b760 100644
--- a/arch/riscv/include/asm/cpufeature.h
+++ b/arch/riscv/include/asm/cpufeature.h
@@ -80,7 +80,7 @@ riscv_has_extension_likely(const unsigned long ext)
 			   "ext must be < RISCV_ISA_EXT_MAX");
 
 	if (IS_ENABLED(CONFIG_RISCV_ALTERNATIVE)) {
-		asm_volatile_goto(
+		asm goto(
 		ALTERNATIVE("j	%l[l_no]", "nop", 0, %[ext], 1)
 		:
 		: [ext] "i" (ext)
@@ -103,7 +103,7 @@ riscv_has_extension_unlikely(const unsigned long ext)
 			   "ext must be < RISCV_ISA_EXT_MAX");
 
 	if (IS_ENABLED(CONFIG_RISCV_ALTERNATIVE)) {
-		asm_volatile_goto(
+		asm goto(
 		ALTERNATIVE("nop", "j	%l[l_yes]", 0, %[ext], 1)
 		:
 		: [ext] "i" (ext)
diff --git a/arch/riscv/include/asm/jump_label.h b/arch/riscv/include/asm/jump_label.h
index 14a5ea8d8ef0..4a35d787c019 100644
--- a/arch/riscv/include/asm/jump_label.h
+++ b/arch/riscv/include/asm/jump_label.h
@@ -17,7 +17,7 @@
 static __always_inline bool arch_static_branch(struct static_key * const key,
 					       const bool branch)
 {
-	asm_volatile_goto(
+	asm goto(
 		"	.align		2			\n\t"
 		"	.option push				\n\t"
 		"	.option norelax				\n\t"
@@ -39,7 +39,7 @@ label:
 static __always_inline bool arch_static_branch_jump(struct static_key * const key,
 						    const bool branch)
 {
-	asm_volatile_goto(
+	asm goto(
 		"	.align		2			\n\t"
 		"	.option push				\n\t"
 		"	.option norelax				\n\t"
diff --git a/arch/riscv/lib/csum.c b/arch/riscv/lib/csum.c
index af3df5274ccb..74af3ab520b6 100644
--- a/arch/riscv/lib/csum.c
+++ b/arch/riscv/lib/csum.c
@@ -53,7 +53,7 @@ __sum16 csum_ipv6_magic(const struct in6_addr *saddr,
 		 * support, so nop when Zbb is available and jump when Zbb is
 		 * not available.
 		 */
-		asm_volatile_goto(ALTERNATIVE("j %l[no_zbb]", "nop", 0,
+		asm goto(ALTERNATIVE("j %l[no_zbb]", "nop", 0,
 					      RISCV_ISA_EXT_ZBB, 1)
 				  :
 				  :
@@ -170,7 +170,7 @@ do_csum_with_alignment(const unsigned char *buff, int len)
 		 * support, so nop when Zbb is available and jump when Zbb is
 		 * not available.
 		 */
-		asm_volatile_goto(ALTERNATIVE("j %l[no_zbb]", "nop", 0,
+		asm goto(ALTERNATIVE("j %l[no_zbb]", "nop", 0,
 					      RISCV_ISA_EXT_ZBB, 1)
 				  :
 				  :
@@ -178,7 +178,7 @@ do_csum_with_alignment(const unsigned char *buff, int len)
 				  : no_zbb);
 
 #ifdef CONFIG_32BIT
-		asm_volatile_goto(".option push			\n\
+		asm_goto_output(".option push			\n\
 		.option arch,+zbb				\n\
 			rori	%[fold_temp], %[csum], 16	\n\
 			andi	%[offset], %[offset], 1		\n\
@@ -193,7 +193,7 @@ do_csum_with_alignment(const unsigned char *buff, int len)
 
 		return (unsigned short)csum;
 #else /* !CONFIG_32BIT */
-		asm_volatile_goto(".option push			\n\
+		asm_goto_output(".option push			\n\
 		.option arch,+zbb				\n\
 			rori	%[fold_temp], %[csum], 32	\n\
 			add	%[csum], %[fold_temp], %[csum]	\n\
@@ -257,7 +257,7 @@ do_csum_no_alignment(const unsigned char *buff, int len)
 		 * support, so nop when Zbb is available and jump when Zbb is
 		 * not available.
 		 */
-		asm_volatile_goto(ALTERNATIVE("j %l[no_zbb]", "nop", 0,
+		asm goto(ALTERNATIVE("j %l[no_zbb]", "nop", 0,
 					      RISCV_ISA_EXT_ZBB, 1)
 				  :
 				  :
diff --git a/arch/s390/include/asm/jump_label.h b/arch/s390/include/asm/jump_label.h
index 895f774bbcc5..bf78cf381dfc 100644
--- a/arch/s390/include/asm/jump_label.h
+++ b/arch/s390/include/asm/jump_label.h
@@ -25,7 +25,7 @@
  */
 static __always_inline bool arch_static_branch(struct static_key *key, bool branch)
 {
-	asm_volatile_goto("0:	brcl 0,%l[label]\n"
+	asm goto("0:	brcl 0,%l[label]\n"
 			  ".pushsection __jump_table,\"aw\"\n"
 			  ".balign	8\n"
 			  ".long	0b-.,%l[label]-.\n"
@@ -39,7 +39,7 @@ label:
 
 static __always_inline bool arch_static_branch_jump(struct static_key *key, bool branch)
 {
-	asm_volatile_goto("0:	brcl 15,%l[label]\n"
+	asm goto("0:	brcl 15,%l[label]\n"
 			  ".pushsection __jump_table,\"aw\"\n"
 			  ".balign	8\n"
 			  ".long	0b-.,%l[label]-.\n"
diff --git a/arch/sparc/include/asm/jump_label.h b/arch/sparc/include/asm/jump_label.h
index 94eb529dcb77..2718cbea826a 100644
--- a/arch/sparc/include/asm/jump_label.h
+++ b/arch/sparc/include/asm/jump_label.h
@@ -10,7 +10,7 @@
 
 static __always_inline bool arch_static_branch(struct static_key *key, bool branch)
 {
-	asm_volatile_goto("1:\n\t"
+	asm goto("1:\n\t"
 		 "nop\n\t"
 		 "nop\n\t"
 		 ".pushsection __jump_table,  \"aw\"\n\t"
@@ -26,7 +26,7 @@ l_yes:
 
 static __always_inline bool arch_static_branch_jump(struct static_key *key, bool branch)
 {
-	asm_volatile_goto("1:\n\t"
+	asm goto("1:\n\t"
 		 "b %l[l_yes]\n\t"
 		 "nop\n\t"
 		 ".pushsection __jump_table,  \"aw\"\n\t"
diff --git a/arch/um/include/asm/cpufeature.h b/arch/um/include/asm/cpufeature.h
index 4b6d1b526bc1..66fe06db872f 100644
--- a/arch/um/include/asm/cpufeature.h
+++ b/arch/um/include/asm/cpufeature.h
@@ -75,7 +75,7 @@ extern void setup_clear_cpu_cap(unsigned int bit);
  */
 static __always_inline bool _static_cpu_has(u16 bit)
 {
-	asm_volatile_goto("1: jmp 6f\n"
+	asm goto("1: jmp 6f\n"
 		 "2:\n"
 		 ".skip -(((5f-4f) - (2b-1b)) > 0) * "
 			 "((5f-4f) - (2b-1b)),0x90\n"
diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h
index a26bebbdff87..a1273698fc43 100644
--- a/arch/x86/include/asm/cpufeature.h
+++ b/arch/x86/include/asm/cpufeature.h
@@ -168,7 +168,7 @@ extern void clear_cpu_cap(struct cpuinfo_x86 *c, unsigned int bit);
  */
 static __always_inline bool _static_cpu_has(u16 bit)
 {
-	asm_volatile_goto(
+	asm goto(
 		ALTERNATIVE_TERNARY("jmp 6f", %P[feature], "", "jmp %l[t_no]")
 		".pushsection .altinstr_aux,\"ax\"\n"
 		"6:\n"
diff --git a/arch/x86/include/asm/jump_label.h b/arch/x86/include/asm/jump_label.h
index 071572e23d3a..cbbef32517f0 100644
--- a/arch/x86/include/asm/jump_label.h
+++ b/arch/x86/include/asm/jump_label.h
@@ -24,7 +24,7 @@
 
 static __always_inline bool arch_static_branch(struct static_key *key, bool branch)
 {
-	asm_volatile_goto("1:"
+	asm goto("1:"
 		"jmp %l[l_yes] # objtool NOPs this \n\t"
 		JUMP_TABLE_ENTRY
 		: :  "i" (key), "i" (2 | branch) : : l_yes);
@@ -38,7 +38,7 @@ l_yes:
 
 static __always_inline bool arch_static_branch(struct static_key * const key, const bool branch)
 {
-	asm_volatile_goto("1:"
+	asm goto("1:"
 		".byte " __stringify(BYTES_NOP5) "\n\t"
 		JUMP_TABLE_ENTRY
 		: :  "i" (key), "i" (branch) : : l_yes);
@@ -52,7 +52,7 @@ l_yes:
 
 static __always_inline bool arch_static_branch_jump(struct static_key * const key, const bool branch)
 {
-	asm_volatile_goto("1:"
+	asm goto("1:"
 		"jmp %l[l_yes]\n\t"
 		JUMP_TABLE_ENTRY
 		: :  "i" (key), "i" (branch) : : l_yes);
diff --git a/arch/x86/include/asm/rmwcc.h b/arch/x86/include/asm/rmwcc.h
index 4b081e0d3306..363266cbcada 100644
--- a/arch/x86/include/asm/rmwcc.h
+++ b/arch/x86/include/asm/rmwcc.h
@@ -13,7 +13,7 @@
 #define __GEN_RMWcc(fullop, _var, cc, clobbers, ...)			\
 ({									\
 	bool c = false;							\
-	asm_volatile_goto (fullop "; j" #cc " %l[cc_label]"		\
+	asm goto (fullop "; j" #cc " %l[cc_label]"		\
 			: : [var] "m" (_var), ## __VA_ARGS__		\
 			: clobbers : cc_label);				\
 	if (0) {							\
diff --git a/arch/x86/include/asm/special_insns.h b/arch/x86/include/asm/special_insns.h
index d6cd9344f6c7..48f8dd47cf68 100644
--- a/arch/x86/include/asm/special_insns.h
+++ b/arch/x86/include/asm/special_insns.h
@@ -205,7 +205,7 @@ static inline void clwb(volatile void *__p)
 #ifdef CONFIG_X86_USER_SHADOW_STACK
 static inline int write_user_shstk_64(u64 __user *addr, u64 val)
 {
-	asm_volatile_goto("1: wrussq %[val], (%[addr])\n"
+	asm goto("1: wrussq %[val], (%[addr])\n"
 			  _ASM_EXTABLE(1b, %l[fail])
 			  :: [addr] "r" (addr), [val] "r" (val)
 			  :: fail);
diff --git a/arch/x86/include/asm/uaccess.h b/arch/x86/include/asm/uaccess.h
index 5c367c1290c3..237dc8cdd12b 100644
--- a/arch/x86/include/asm/uaccess.h
+++ b/arch/x86/include/asm/uaccess.h
@@ -133,7 +133,7 @@ extern int __get_user_bad(void);
 
 #ifdef CONFIG_X86_32
 #define __put_user_goto_u64(x, addr, label)			\
-	asm_volatile_goto("\n"					\
+	asm goto("\n"					\
 		     "1:	movl %%eax,0(%1)\n"		\
 		     "2:	movl %%edx,4(%1)\n"		\
 		     _ASM_EXTABLE_UA(1b, %l2)			\
@@ -295,7 +295,7 @@ do {									\
 } while (0)
 
 #define __get_user_asm(x, addr, itype, ltype, label)			\
-	asm_volatile_goto("\n"						\
+	asm_goto_output("\n"						\
 		     "1:	mov"itype" %[umem],%[output]\n"		\
 		     _ASM_EXTABLE_UA(1b, %l2)				\
 		     : [output] ltype(x)				\
@@ -375,7 +375,7 @@ do {									\
 	__typeof__(_ptr) _old = (__typeof__(_ptr))(_pold);		\
 	__typeof__(*(_ptr)) __old = *_old;				\
 	__typeof__(*(_ptr)) __new = (_new);				\
-	asm_volatile_goto("\n"						\
+	asm_goto_output("\n"						\
 		     "1: " LOCK_PREFIX "cmpxchg"itype" %[new], %[ptr]\n"\
 		     _ASM_EXTABLE_UA(1b, %l[label])			\
 		     : CC_OUT(z) (success),				\
@@ -394,7 +394,7 @@ do {									\
 	__typeof__(_ptr) _old = (__typeof__(_ptr))(_pold);		\
 	__typeof__(*(_ptr)) __old = *_old;				\
 	__typeof__(*(_ptr)) __new = (_new);				\
-	asm_volatile_goto("\n"						\
+	asm_goto_output("\n"						\
 		     "1: " LOCK_PREFIX "cmpxchg8b %[ptr]\n"		\
 		     _ASM_EXTABLE_UA(1b, %l[label])			\
 		     : CC_OUT(z) (success),				\
@@ -477,7 +477,7 @@ struct __large_struct { unsigned long buf[100]; };
  * aliasing issues.
  */
 #define __put_user_goto(x, addr, itype, ltype, label)			\
-	asm_volatile_goto("\n"						\
+	asm goto("\n"							\
 		"1:	mov"itype" %0,%1\n"				\
 		_ASM_EXTABLE_UA(1b, %l2)				\
 		: : ltype(x), "m" (__m(addr))				\
diff --git a/arch/x86/kvm/svm/svm_ops.h b/arch/x86/kvm/svm/svm_ops.h
index 36c8af87a707..4e725854c63a 100644
--- a/arch/x86/kvm/svm/svm_ops.h
+++ b/arch/x86/kvm/svm/svm_ops.h
@@ -8,7 +8,7 @@
 
 #define svm_asm(insn, clobber...)				\
 do {								\
-	asm_volatile_goto("1: " __stringify(insn) "\n\t"	\
+	asm goto("1: " __stringify(insn) "\n\t"	\
 			  _ASM_EXTABLE(1b, %l[fault])		\
 			  ::: clobber : fault);			\
 	return;							\
@@ -18,7 +18,7 @@ fault:								\
 
 #define svm_asm1(insn, op1, clobber...)				\
 do {								\
-	asm_volatile_goto("1: "  __stringify(insn) " %0\n\t"	\
+	asm goto("1: "  __stringify(insn) " %0\n\t"	\
 			  _ASM_EXTABLE(1b, %l[fault])		\
 			  :: op1 : clobber : fault);		\
 	return;							\
@@ -28,7 +28,7 @@ fault:								\
 
 #define svm_asm2(insn, op1, op2, clobber...)				\
 do {									\
-	asm_volatile_goto("1: "  __stringify(insn) " %1, %0\n\t"	\
+	asm goto("1: "  __stringify(insn) " %1, %0\n\t"	\
 			  _ASM_EXTABLE(1b, %l[fault])			\
 			  :: op1, op2 : clobber : fault);		\
 	return;								\
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index e262bc2ba4e5..1111d9d08903 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -738,7 +738,7 @@ static int vmx_set_guest_uret_msr(struct vcpu_vmx *vmx,
  */
 static int kvm_cpu_vmxoff(void)
 {
-	asm_volatile_goto("1: vmxoff\n\t"
+	asm goto("1: vmxoff\n\t"
 			  _ASM_EXTABLE(1b, %l[fault])
 			  ::: "cc", "memory" : fault);
 
@@ -2784,7 +2784,7 @@ static int kvm_cpu_vmxon(u64 vmxon_pointer)
 
 	cr4_set_bits(X86_CR4_VMXE);
 
-	asm_volatile_goto("1: vmxon %[vmxon_pointer]\n\t"
+	asm goto("1: vmxon %[vmxon_pointer]\n\t"
 			  _ASM_EXTABLE(1b, %l[fault])
 			  : : [vmxon_pointer] "m"(vmxon_pointer)
 			  : : fault);
diff --git a/arch/x86/kvm/vmx/vmx_ops.h b/arch/x86/kvm/vmx/vmx_ops.h
index f41ce3c24123..8060e5fc6dbd 100644
--- a/arch/x86/kvm/vmx/vmx_ops.h
+++ b/arch/x86/kvm/vmx/vmx_ops.h
@@ -94,7 +94,7 @@ static __always_inline unsigned long __vmcs_readl(unsigned long field)
 
 #ifdef CONFIG_CC_HAS_ASM_GOTO_OUTPUT
 
-	asm_volatile_goto("1: vmread %[field], %[output]\n\t"
+	asm_goto_output("1: vmread %[field], %[output]\n\t"
 			  "jna %l[do_fail]\n\t"
 
 			  _ASM_EXTABLE(1b, %l[do_exception])
@@ -188,7 +188,7 @@ static __always_inline unsigned long vmcs_readl(unsigned long field)
 
 #define vmx_asm1(insn, op1, error_args...)				\
 do {									\
-	asm_volatile_goto("1: " __stringify(insn) " %0\n\t"		\
+	asm goto("1: " __stringify(insn) " %0\n\t"			\
 			  ".byte 0x2e\n\t" /* branch not taken hint */	\
 			  "jna %l[error]\n\t"				\
 			  _ASM_EXTABLE(1b, %l[fault])			\
@@ -205,7 +205,7 @@ fault:									\
 
 #define vmx_asm2(insn, op1, op2, error_args...)				\
 do {									\
-	asm_volatile_goto("1: "  __stringify(insn) " %1, %0\n\t"	\
+	asm goto("1: "  __stringify(insn) " %1, %0\n\t"			\
 			  ".byte 0x2e\n\t" /* branch not taken hint */	\
 			  "jna %l[error]\n\t"				\
 			  _ASM_EXTABLE(1b, %l[fault])			\
diff --git a/arch/xtensa/include/asm/jump_label.h b/arch/xtensa/include/asm/jump_label.h
index c812bf85021c..46c8596259d2 100644
--- a/arch/xtensa/include/asm/jump_label.h
+++ b/arch/xtensa/include/asm/jump_label.h
@@ -13,7 +13,7 @@
 static __always_inline bool arch_static_branch(struct static_key *key,
 					       bool branch)
 {
-	asm_volatile_goto("1:\n\t"
+	asm goto("1:\n\t"
 			  "_nop\n\t"
 			  ".pushsection __jump_table,  \"aw\"\n\t"
 			  ".word 1b, %l[l_yes], %c0\n\t"
@@ -38,7 +38,7 @@ static __always_inline bool arch_static_branch_jump(struct static_key *key,
 	 * make it reachable and wrap both into a no-transform block
 	 * to avoid any assembler interference with this.
 	 */
-	asm_volatile_goto("1:\n\t"
+	asm goto("1:\n\t"
 			  ".begin no-transform\n\t"
 			  "_j %l[l_yes]\n\t"
 			  "2:\n\t"
diff --git a/include/linux/compiler-gcc.h b/include/linux/compiler-gcc.h
index aebb65bf95a7..c1a963be7d28 100644
--- a/include/linux/compiler-gcc.h
+++ b/include/linux/compiler-gcc.h
@@ -64,6 +64,25 @@
 		__builtin_unreachable();	\
 	} while (0)
 
+/*
+ * GCC 'asm goto' with outputs miscompiles certain code sequences:
+ *
+ *   https://gcc.gnu.org/bugzilla/show_bug.cgi?id=110420
+ *   https://gcc.gnu.org/bugzilla/show_bug.cgi?id=110422
+ *
+ * Work it around via the same compiler barrier quirk that we used
+ * to use for the old 'asm goto' workaround.
+ *
+ * Also, always mark such 'asm goto' statements as volatile: all
+ * asm goto statements are supposed to be volatile as per the
+ * documentation, but some versions of gcc didn't actually do
+ * that for asms with outputs:
+ *
+ *    https://gcc.gnu.org/bugzilla/show_bug.cgi?id=98619
+ */
+#define asm_goto_output(x...) \
+	do { asm volatile goto(x); asm (""); } while (0)
+
 #if defined(CONFIG_ARCH_USE_BUILTIN_BSWAP)
 #define __HAVE_BUILTIN_BSWAP32__
 #define __HAVE_BUILTIN_BSWAP64__
diff --git a/include/linux/compiler_types.h b/include/linux/compiler_types.h
index 6f1ca49306d2..663d8791c871 100644
--- a/include/linux/compiler_types.h
+++ b/include/linux/compiler_types.h
@@ -362,8 +362,8 @@ struct ftrace_likely_data {
 #define __member_size(p)	__builtin_object_size(p, 1)
 #endif
 
-#ifndef asm_volatile_goto
-#define asm_volatile_goto(x...) asm goto(x)
+#ifndef asm_goto_output
+#define asm_goto_output(x...) asm goto(x)
 #endif
 
 #ifdef CONFIG_CC_HAS_ASM_INLINE
diff --git a/net/netfilter/nft_set_pipapo_avx2.c b/net/netfilter/nft_set_pipapo_avx2.c
index 90e275bb3e5d..a3a8ddca9918 100644
--- a/net/netfilter/nft_set_pipapo_avx2.c
+++ b/net/netfilter/nft_set_pipapo_avx2.c
@@ -57,7 +57,7 @@
 
 /* Jump to label if @reg is zero */
 #define NFT_PIPAPO_AVX2_NOMATCH_GOTO(reg, label)			\
-	asm_volatile_goto("vptest %%ymm" #reg ", %%ymm" #reg ";"	\
+	asm goto("vptest %%ymm" #reg ", %%ymm" #reg ";"	\
 			  "je %l[" #label "]" : : : : label)
 
 /* Store 256 bits from YMM register into memory. Contrary to bucket load
diff --git a/samples/bpf/asm_goto_workaround.h b/samples/bpf/asm_goto_workaround.h
index 7048bb3594d6..634e81d83efd 100644
--- a/samples/bpf/asm_goto_workaround.h
+++ b/samples/bpf/asm_goto_workaround.h
@@ -4,14 +4,14 @@
 #define __ASM_GOTO_WORKAROUND_H
 
 /*
- * This will bring in asm_volatile_goto and asm_inline macro definitions
+ * This will bring in asm_goto_output and asm_inline macro definitions
  * if enabled by compiler and config options.
  */
 #include <linux/types.h>
 
-#ifdef asm_volatile_goto
-#undef asm_volatile_goto
-#define asm_volatile_goto(x...) asm volatile("invalid use of asm_volatile_goto")
+#ifdef asm_goto_output
+#undef asm_goto_output
+#define asm_goto_output(x...) asm volatile("invalid use of asm_goto_output")
 #endif
 
 /*
diff --git a/tools/arch/x86/include/asm/rmwcc.h b/tools/arch/x86/include/asm/rmwcc.h
index 11ff975242ca..e2ff22b379a4 100644
--- a/tools/arch/x86/include/asm/rmwcc.h
+++ b/tools/arch/x86/include/asm/rmwcc.h
@@ -4,7 +4,7 @@
 
 #define __GEN_RMWcc(fullop, var, cc, ...)				\
 do {									\
-	asm_volatile_goto (fullop "; j" cc " %l[cc_label]"		\
+	asm goto (fullop "; j" cc " %l[cc_label]"		\
 			: : "m" (var), ## __VA_ARGS__ 			\
 			: "memory" : cc_label);				\
 	return 0;							\
diff --git a/tools/include/linux/compiler_types.h b/tools/include/linux/compiler_types.h
index 1bdd834bdd57..d09f9dc172a4 100644
--- a/tools/include/linux/compiler_types.h
+++ b/tools/include/linux/compiler_types.h
@@ -36,8 +36,8 @@
 #include <linux/compiler-gcc.h>
 #endif
 
-#ifndef asm_volatile_goto
-#define asm_volatile_goto(x...) asm goto(x)
+#ifndef asm_goto_output
+#define asm_goto_output(x...) asm goto(x)
 #endif
 
 #endif /* __LINUX_COMPILER_TYPES_H */
-- 
cgit v1.2.3